diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..179755ffe38609edb0e1b4aab9f80a0a5d773e73 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,974 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 314, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006369426751592357, + "learning_rate": 2.5834789435204156e-06, + "loss": 0.0505, + "step": 2 + }, + { + "epoch": 0.012738853503184714, + "learning_rate": 2.73476360561837e-06, + "loss": 0.0862, + "step": 4 + }, + { + "epoch": 0.01910828025477707, + "learning_rate": 2.889654828892393e-06, + "loss": 0.2743, + "step": 6 + }, + { + "epoch": 0.025477707006369428, + "learning_rate": 3.0480757232535773e-06, + "loss": 0.2937, + "step": 8 + }, + { + "epoch": 0.03184713375796178, + "learning_rate": 3.2099476464367486e-06, + "loss": 0.3973, + "step": 10 + }, + { + "epoch": 0.03821656050955414, + "learning_rate": 3.3751902430395558e-06, + "loss": 0.4688, + "step": 12 + }, + { + "epoch": 0.044585987261146494, + "learning_rate": 3.5437214844119727e-06, + "loss": 0.5201, + "step": 14 + }, + { + "epoch": 0.050955414012738856, + "learning_rate": 3.7154577093764287e-06, + "loss": 0.448, + "step": 16 + }, + { + "epoch": 0.05732484076433121, + "learning_rate": 3.890313665758341e-06, + "loss": 0.1361, + "step": 18 + }, + { + "epoch": 0.06369426751592357, + "learning_rate": 4.068202552706455e-06, + "loss": 0.4843, + "step": 20 + }, + { + "epoch": 0.07006369426751592, + "learning_rate": 4.249036063781902e-06, + "loss": 0.8539, + "step": 22 + }, + { + "epoch": 0.07643312101910828, + "learning_rate": 4.432724430794775e-06, + "loss": 0.0686, + "step": 24 + }, + { + "epoch": 0.08280254777070063, + "learning_rate": 4.6191764683662625e-06, + "loss": 0.3056, + "step": 26 + }, + { + "epoch": 0.08917197452229299, + "learning_rate": 4.8082996191942354e-06, + "loss": 0.1533, + "step": 28 + }, + { + "epoch": 0.09554140127388536, + "learning_rate": 5.000000000000003e-06, + "loss": 0.1882, + "step": 30 + }, + { + "epoch": 0.10191082802547771, + "learning_rate": 5.194182448133163e-06, + "loss": 0.4956, + "step": 32 + }, + { + "epoch": 0.10828025477707007, + "learning_rate": 5.39075056881172e-06, + "loss": 0.0621, + "step": 34 + }, + { + "epoch": 0.11464968152866242, + "learning_rate": 5.589606782973682e-06, + "loss": 0.2904, + "step": 36 + }, + { + "epoch": 0.12101910828025478, + "learning_rate": 5.7906523757166475e-06, + "loss": 0.2351, + "step": 38 + }, + { + "epoch": 0.12738853503184713, + "learning_rate": 5.9937875453012e-06, + "loss": 0.3256, + "step": 40 + }, + { + "epoch": 0.1337579617834395, + "learning_rate": 6.198911452693847e-06, + "loss": 0.4068, + "step": 42 + }, + { + "epoch": 0.14012738853503184, + "learning_rate": 6.405922271624865e-06, + "loss": 0.2709, + "step": 44 + }, + { + "epoch": 0.1464968152866242, + "learning_rate": 6.614717239136237e-06, + "loss": 0.0155, + "step": 46 + }, + { + "epoch": 0.15286624203821655, + "learning_rate": 6.8251927065945815e-06, + "loss": 0.5768, + "step": 48 + }, + { + "epoch": 0.1592356687898089, + "learning_rate": 7.037244191143648e-06, + "loss": 0.2028, + "step": 50 + }, + { + "epoch": 0.16560509554140126, + "learning_rate": 7.250766427571185e-06, + "loss": 0.4226, + "step": 52 + }, + { + "epoch": 0.17197452229299362, + "learning_rate": 7.465653420563828e-06, + "loss": 0.1874, + "step": 54 + }, + { + "epoch": 0.17834394904458598, + "learning_rate": 7.68179849732472e-06, + "loss": 0.0486, + "step": 56 + }, + { + "epoch": 0.18471337579617833, + "learning_rate": 7.899094360527221e-06, + "loss": 0.3334, + "step": 58 + }, + { + "epoch": 0.1910828025477707, + "learning_rate": 8.117433141578865e-06, + "loss": 0.0569, + "step": 60 + }, + { + "epoch": 0.19745222929936307, + "learning_rate": 8.336706454168698e-06, + "loss": 0.0103, + "step": 62 + }, + { + "epoch": 0.20382165605095542, + "learning_rate": 8.55680544807173e-06, + "loss": 0.4509, + "step": 64 + }, + { + "epoch": 0.21019108280254778, + "learning_rate": 8.777620863183652e-06, + "loss": 0.0635, + "step": 66 + }, + { + "epoch": 0.21656050955414013, + "learning_rate": 8.99904308375901e-06, + "loss": 0.3395, + "step": 68 + }, + { + "epoch": 0.2229299363057325, + "learning_rate": 9.220962192825959e-06, + "loss": 0.0262, + "step": 70 + }, + { + "epoch": 0.22929936305732485, + "learning_rate": 9.443268026750509e-06, + "loss": 0.2095, + "step": 72 + }, + { + "epoch": 0.2356687898089172, + "learning_rate": 9.665850229923262e-06, + "loss": 0.1242, + "step": 74 + }, + { + "epoch": 0.24203821656050956, + "learning_rate": 9.88859830954135e-06, + "loss": 0.0032, + "step": 76 + }, + { + "epoch": 0.2484076433121019, + "learning_rate": 1.0111401690458642e-05, + "loss": 0.3737, + "step": 78 + }, + { + "epoch": 0.25477707006369427, + "learning_rate": 1.0334149770076732e-05, + "loss": 1.1583, + "step": 80 + }, + { + "epoch": 0.2611464968152866, + "learning_rate": 1.0556731973249482e-05, + "loss": 0.0546, + "step": 82 + }, + { + "epoch": 0.267515923566879, + "learning_rate": 1.0779037807174032e-05, + "loss": 0.3756, + "step": 84 + }, + { + "epoch": 0.27388535031847133, + "learning_rate": 1.1000956916240984e-05, + "loss": 0.0395, + "step": 86 + }, + { + "epoch": 0.2802547770700637, + "learning_rate": 1.1222379136816342e-05, + "loss": 0.1488, + "step": 88 + }, + { + "epoch": 0.28662420382165604, + "learning_rate": 1.1443194551928264e-05, + "loss": 0.2701, + "step": 90 + }, + { + "epoch": 0.2929936305732484, + "learning_rate": 1.1663293545831295e-05, + "loss": 0.7403, + "step": 92 + }, + { + "epoch": 0.29936305732484075, + "learning_rate": 1.188256685842113e-05, + "loss": 0.1445, + "step": 94 + }, + { + "epoch": 0.3057324840764331, + "learning_rate": 1.210090563947277e-05, + "loss": 0.3097, + "step": 96 + }, + { + "epoch": 0.31210191082802546, + "learning_rate": 1.2318201502675273e-05, + "loss": 0.4982, + "step": 98 + }, + { + "epoch": 0.3184713375796178, + "learning_rate": 1.2534346579436164e-05, + "loss": 0.2402, + "step": 100 + }, + { + "epoch": 0.3248407643312102, + "learning_rate": 1.274923357242881e-05, + "loss": 0.1077, + "step": 102 + }, + { + "epoch": 0.33121019108280253, + "learning_rate": 1.2962755808856345e-05, + "loss": 0.6843, + "step": 104 + }, + { + "epoch": 0.3375796178343949, + "learning_rate": 1.3174807293405412e-05, + "loss": 0.1502, + "step": 106 + }, + { + "epoch": 0.34394904458598724, + "learning_rate": 1.3385282760863758e-05, + "loss": 0.5122, + "step": 108 + }, + { + "epoch": 0.3503184713375796, + "learning_rate": 1.3594077728375129e-05, + "loss": 0.4533, + "step": 110 + }, + { + "epoch": 0.35668789808917195, + "learning_rate": 1.3801088547306147e-05, + "loss": 0.2561, + "step": 112 + }, + { + "epoch": 0.3630573248407643, + "learning_rate": 1.4006212454698793e-05, + "loss": 0.3514, + "step": 114 + }, + { + "epoch": 0.36942675159235666, + "learning_rate": 1.4209347624283347e-05, + "loss": 0.2827, + "step": 116 + }, + { + "epoch": 0.37579617834394907, + "learning_rate": 1.441039321702631e-05, + "loss": 0.0804, + "step": 118 + }, + { + "epoch": 0.3821656050955414, + "learning_rate": 1.4609249431188274e-05, + "loss": 0.9344, + "step": 120 + }, + { + "epoch": 0.3885350318471338, + "learning_rate": 1.480581755186683e-05, + "loss": 0.2892, + "step": 122 + }, + { + "epoch": 0.39490445859872614, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.2521, + "step": 124 + }, + { + "epoch": 0.4012738853503185, + "learning_rate": 1.5191700380805761e-05, + "loss": 0.3105, + "step": 126 + }, + { + "epoch": 0.40764331210191085, + "learning_rate": 1.538082353163373e-05, + "loss": 0.3418, + "step": 128 + }, + { + "epoch": 0.4140127388535032, + "learning_rate": 1.556727556920522e-05, + "loss": 0.0588, + "step": 130 + }, + { + "epoch": 0.42038216560509556, + "learning_rate": 1.5750963936218094e-05, + "loss": 0.3193, + "step": 132 + }, + { + "epoch": 0.4267515923566879, + "learning_rate": 1.593179744729354e-05, + "loss": 0.1802, + "step": 134 + }, + { + "epoch": 0.43312101910828027, + "learning_rate": 1.6109686334241655e-05, + "loss": 0.2122, + "step": 136 + }, + { + "epoch": 0.4394904458598726, + "learning_rate": 1.6284542290623565e-05, + "loss": 0.8726, + "step": 138 + }, + { + "epoch": 0.445859872611465, + "learning_rate": 1.6456278515588023e-05, + "loss": 0.1363, + "step": 140 + }, + { + "epoch": 0.45222929936305734, + "learning_rate": 1.662480975696044e-05, + "loss": 0.0766, + "step": 142 + }, + { + "epoch": 0.4585987261146497, + "learning_rate": 1.6790052353563247e-05, + "loss": 0.7178, + "step": 144 + }, + { + "epoch": 0.46496815286624205, + "learning_rate": 1.6951924276746418e-05, + "loss": 0.7837, + "step": 146 + }, + { + "epoch": 0.4713375796178344, + "learning_rate": 1.7110345171107602e-05, + "loss": 0.5237, + "step": 148 + }, + { + "epoch": 0.47770700636942676, + "learning_rate": 1.7265236394381627e-05, + "loss": 0.4444, + "step": 150 + }, + { + "epoch": 0.4840764331210191, + "learning_rate": 1.741652105647958e-05, + "loss": 0.0142, + "step": 152 + }, + { + "epoch": 0.49044585987261147, + "learning_rate": 1.7564124057658057e-05, + "loss": 0.0969, + "step": 154 + }, + { + "epoch": 0.4968152866242038, + "learning_rate": 1.7707972125799738e-05, + "loss": 0.3022, + "step": 156 + }, + { + "epoch": 0.5031847133757962, + "learning_rate": 1.7847993852786612e-05, + "loss": 0.1648, + "step": 158 + }, + { + "epoch": 0.5095541401273885, + "learning_rate": 1.7984119729947937e-05, + "loss": 0.0168, + "step": 160 + }, + { + "epoch": 0.5159235668789809, + "learning_rate": 1.811628218256531e-05, + "loss": 0.0575, + "step": 162 + }, + { + "epoch": 0.5222929936305732, + "learning_rate": 1.8244415603417603e-05, + "loss": 0.6108, + "step": 164 + }, + { + "epoch": 0.5286624203821656, + "learning_rate": 1.836845638534933e-05, + "loss": 0.2658, + "step": 166 + }, + { + "epoch": 0.535031847133758, + "learning_rate": 1.8488342952846074e-05, + "loss": 0.3892, + "step": 168 + }, + { + "epoch": 0.5414012738853503, + "learning_rate": 1.860401579260139e-05, + "loss": 0.4322, + "step": 170 + }, + { + "epoch": 0.5477707006369427, + "learning_rate": 1.8715417483060044e-05, + "loss": 0.2036, + "step": 172 + }, + { + "epoch": 0.554140127388535, + "learning_rate": 1.8822492722922816e-05, + "loss": 0.1441, + "step": 174 + }, + { + "epoch": 0.5605095541401274, + "learning_rate": 1.8925188358598808e-05, + "loss": 0.3341, + "step": 176 + }, + { + "epoch": 0.5668789808917197, + "learning_rate": 1.902345341059163e-05, + "loss": 0.4069, + "step": 178 + }, + { + "epoch": 0.5732484076433121, + "learning_rate": 1.9117239098806296e-05, + "loss": 0.0086, + "step": 180 + }, + { + "epoch": 0.5796178343949044, + "learning_rate": 1.920649886676429e-05, + "loss": 0.497, + "step": 182 + }, + { + "epoch": 0.5859872611464968, + "learning_rate": 1.9291188404714876e-05, + "loss": 0.1694, + "step": 184 + }, + { + "epoch": 0.5923566878980892, + "learning_rate": 1.937126567163103e-05, + "loss": 0.4397, + "step": 186 + }, + { + "epoch": 0.5987261146496815, + "learning_rate": 1.944669091607919e-05, + "loss": 0.5807, + "step": 188 + }, + { + "epoch": 0.6050955414012739, + "learning_rate": 1.9517426695952354e-05, + "loss": 0.128, + "step": 190 + }, + { + "epoch": 0.6114649681528662, + "learning_rate": 1.9583437897056915e-05, + "loss": 0.7845, + "step": 192 + }, + { + "epoch": 0.6178343949044586, + "learning_rate": 1.964469175054377e-05, + "loss": 0.2964, + "step": 194 + }, + { + "epoch": 0.6242038216560509, + "learning_rate": 1.970115784917523e-05, + "loss": 0.0337, + "step": 196 + }, + { + "epoch": 0.6305732484076433, + "learning_rate": 1.975280816241959e-05, + "loss": 0.3463, + "step": 198 + }, + { + "epoch": 0.6369426751592356, + "learning_rate": 1.979961705036587e-05, + "loss": 0.7921, + "step": 200 + }, + { + "epoch": 0.643312101910828, + "learning_rate": 1.9841561276451777e-05, + "loss": 0.129, + "step": 202 + }, + { + "epoch": 0.6496815286624203, + "learning_rate": 1.9878620018998696e-05, + "loss": 0.028, + "step": 204 + }, + { + "epoch": 0.6560509554140127, + "learning_rate": 1.9910774881547803e-05, + "loss": 0.2591, + "step": 206 + }, + { + "epoch": 0.6624203821656051, + "learning_rate": 1.993800990199235e-05, + "loss": 0.4982, + "step": 208 + }, + { + "epoch": 0.6687898089171974, + "learning_rate": 1.9960311560501457e-05, + "loss": 0.2502, + "step": 210 + }, + { + "epoch": 0.6751592356687898, + "learning_rate": 1.9977668786231536e-05, + "loss": 0.9019, + "step": 212 + }, + { + "epoch": 0.6815286624203821, + "learning_rate": 1.999007296282201e-05, + "loss": 0.452, + "step": 214 + }, + { + "epoch": 0.6878980891719745, + "learning_rate": 1.9997517932672592e-05, + "loss": 0.283, + "step": 216 + }, + { + "epoch": 0.6942675159235668, + "learning_rate": 2e-05, + "loss": 0.2723, + "step": 218 + }, + { + "epoch": 0.7006369426751592, + "learning_rate": 1.9997517932672592e-05, + "loss": 0.1013, + "step": 220 + }, + { + "epoch": 0.7070063694267515, + "learning_rate": 1.999007296282201e-05, + "loss": 0.5748, + "step": 222 + }, + { + "epoch": 0.7133757961783439, + "learning_rate": 1.9977668786231536e-05, + "loss": 0.2457, + "step": 224 + }, + { + "epoch": 0.7197452229299363, + "learning_rate": 1.9960311560501457e-05, + "loss": 0.4159, + "step": 226 + }, + { + "epoch": 0.7261146496815286, + "learning_rate": 1.993800990199235e-05, + "loss": 0.2193, + "step": 228 + }, + { + "epoch": 0.732484076433121, + "learning_rate": 1.99107748815478e-05, + "loss": 0.4867, + "step": 230 + }, + { + "epoch": 0.7388535031847133, + "learning_rate": 1.9878620018998696e-05, + "loss": 0.2393, + "step": 232 + }, + { + "epoch": 0.7452229299363057, + "learning_rate": 1.984156127645178e-05, + "loss": 0.228, + "step": 234 + }, + { + "epoch": 0.7515923566878981, + "learning_rate": 1.979961705036587e-05, + "loss": 0.0731, + "step": 236 + }, + { + "epoch": 0.7579617834394905, + "learning_rate": 1.975280816241959e-05, + "loss": 0.0534, + "step": 238 + }, + { + "epoch": 0.7643312101910829, + "learning_rate": 1.9701157849175232e-05, + "loss": 0.1819, + "step": 240 + }, + { + "epoch": 0.7707006369426752, + "learning_rate": 1.9644691750543772e-05, + "loss": 0.0205, + "step": 242 + }, + { + "epoch": 0.7770700636942676, + "learning_rate": 1.958343789705692e-05, + "loss": 0.3531, + "step": 244 + }, + { + "epoch": 0.7834394904458599, + "learning_rate": 1.9517426695952354e-05, + "loss": 0.3391, + "step": 246 + }, + { + "epoch": 0.7898089171974523, + "learning_rate": 1.9446690916079184e-05, + "loss": 0.1271, + "step": 248 + }, + { + "epoch": 0.7961783439490446, + "learning_rate": 1.9371265671631034e-05, + "loss": 0.1924, + "step": 250 + }, + { + "epoch": 0.802547770700637, + "learning_rate": 1.929118840471488e-05, + "loss": 0.0193, + "step": 252 + }, + { + "epoch": 0.8089171974522293, + "learning_rate": 1.9206498866764293e-05, + "loss": 0.8572, + "step": 254 + }, + { + "epoch": 0.8152866242038217, + "learning_rate": 1.9117239098806302e-05, + "loss": 0.0557, + "step": 256 + }, + { + "epoch": 0.821656050955414, + "learning_rate": 1.9023453410591645e-05, + "loss": 0.2022, + "step": 258 + }, + { + "epoch": 0.8280254777070064, + "learning_rate": 1.8925188358598822e-05, + "loss": 0.5536, + "step": 260 + }, + { + "epoch": 0.8343949044585988, + "learning_rate": 1.882249272292283e-05, + "loss": 0.4296, + "step": 262 + }, + { + "epoch": 0.8407643312101911, + "learning_rate": 1.871541748306005e-05, + "loss": 0.1649, + "step": 264 + }, + { + "epoch": 0.8471337579617835, + "learning_rate": 1.8604015792601395e-05, + "loss": 0.3044, + "step": 266 + }, + { + "epoch": 0.8535031847133758, + "learning_rate": 1.8488342952846077e-05, + "loss": 0.3655, + "step": 268 + }, + { + "epoch": 0.8598726114649682, + "learning_rate": 1.8368456385349333e-05, + "loss": 0.569, + "step": 270 + }, + { + "epoch": 0.8662420382165605, + "learning_rate": 1.824441560341761e-05, + "loss": 0.3474, + "step": 272 + }, + { + "epoch": 0.8726114649681529, + "learning_rate": 1.811628218256532e-05, + "loss": 0.366, + "step": 274 + }, + { + "epoch": 0.8789808917197452, + "learning_rate": 1.798411972994795e-05, + "loss": 2.5077, + "step": 276 + }, + { + "epoch": 0.8853503184713376, + "learning_rate": 1.784799385278662e-05, + "loss": 0.2863, + "step": 278 + }, + { + "epoch": 0.89171974522293, + "learning_rate": 1.770797212579973e-05, + "loss": 0.3239, + "step": 280 + }, + { + "epoch": 0.8980891719745223, + "learning_rate": 1.756412405765805e-05, + "loss": 0.1448, + "step": 282 + }, + { + "epoch": 0.9044585987261147, + "learning_rate": 1.7416521056479573e-05, + "loss": 0.2393, + "step": 284 + }, + { + "epoch": 0.910828025477707, + "learning_rate": 1.7265236394381634e-05, + "loss": 0.3568, + "step": 286 + }, + { + "epoch": 0.9171974522292994, + "learning_rate": 1.711034517110761e-05, + "loss": 0.0124, + "step": 288 + }, + { + "epoch": 0.9235668789808917, + "learning_rate": 1.6951924276746425e-05, + "loss": 0.4765, + "step": 290 + }, + { + "epoch": 0.9299363057324841, + "learning_rate": 1.6790052353563254e-05, + "loss": 0.1502, + "step": 292 + }, + { + "epoch": 0.9363057324840764, + "learning_rate": 1.662480975696046e-05, + "loss": 0.1455, + "step": 294 + }, + { + "epoch": 0.9426751592356688, + "learning_rate": 1.6456278515588044e-05, + "loss": 0.1501, + "step": 296 + }, + { + "epoch": 0.9490445859872612, + "learning_rate": 1.6284542290623558e-05, + "loss": 0.4569, + "step": 298 + }, + { + "epoch": 0.9554140127388535, + "learning_rate": 1.6109686334241648e-05, + "loss": 0.2271, + "step": 300 + }, + { + "epoch": 0.9617834394904459, + "learning_rate": 1.593179744729355e-05, + "loss": 0.2835, + "step": 302 + }, + { + "epoch": 0.9681528662420382, + "learning_rate": 1.57509639362181e-05, + "loss": 0.2865, + "step": 304 + }, + { + "epoch": 0.9745222929936306, + "learning_rate": 1.5567275569205227e-05, + "loss": 0.0294, + "step": 306 + }, + { + "epoch": 0.9808917197452229, + "learning_rate": 1.538082353163374e-05, + "loss": 0.8354, + "step": 308 + }, + { + "epoch": 0.9872611464968153, + "learning_rate": 1.5191700380805768e-05, + "loss": 0.9003, + "step": 310 + }, + { + "epoch": 0.9936305732484076, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.0171, + "step": 312 + }, + { + "epoch": 1.0, + "learning_rate": 1.4805817551866854e-05, + "loss": 0.0555, + "step": 314 + }, + { + "epoch": 1.0, + "step": 314, + "total_flos": 1802881866924032.0, + "train_loss": 0.3193168936469325, + "train_runtime": 1687.0863, + "train_samples_per_second": 2.978, + "train_steps_per_second": 0.186 + } + ], + "logging_steps": 2, + "max_steps": 314, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1802881866924032.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..60aed153130c9645832a325db6b868a596068b61 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a92c1614a8bad7370023fec4431bb5e371fe7e3e508dc6e2af6fc6c14cc6f7a8 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a530452ce684bab27c783667368bff8d365608c --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:139666b08b69317031fc1d6dd8bcfe1ed0e09d11b2ef71b517bd03b515a7ae4e +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..325f76df661853d6faed2fee99eae40917ff3f0a --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcb1daf238aa41b7da5beec79cbf6d306a03c5f033165540036e6280b4c9be5 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..e8635ae24f66b0b1212c70717940c69ed5c131e2 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cade9800299ef877cbab4fdcccd1273c83f97e8a6696aee6fc4fc332bcaea98f +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..925df3f2937f0fc4f90c05dff14064407fe2cc6d --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,974 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 314, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006369426751592357, + "learning_rate": 2.5834789435204156e-06, + "loss": 0.5361, + "step": 2 + }, + { + "epoch": 0.012738853503184714, + "learning_rate": 2.73476360561837e-06, + "loss": 0.2018, + "step": 4 + }, + { + "epoch": 0.01910828025477707, + "learning_rate": 2.889654828892393e-06, + "loss": 0.3738, + "step": 6 + }, + { + "epoch": 0.025477707006369428, + "learning_rate": 3.0480757232535773e-06, + "loss": 1.0056, + "step": 8 + }, + { + "epoch": 0.03184713375796178, + "learning_rate": 3.2099476464367486e-06, + "loss": 0.6308, + "step": 10 + }, + { + "epoch": 0.03821656050955414, + "learning_rate": 3.3751902430395558e-06, + "loss": 0.6796, + "step": 12 + }, + { + "epoch": 0.044585987261146494, + "learning_rate": 3.5437214844119727e-06, + "loss": 0.43, + "step": 14 + }, + { + "epoch": 0.050955414012738856, + "learning_rate": 3.7154577093764287e-06, + "loss": 0.5045, + "step": 16 + }, + { + "epoch": 0.05732484076433121, + "learning_rate": 3.890313665758341e-06, + "loss": 0.2084, + "step": 18 + }, + { + "epoch": 0.06369426751592357, + "learning_rate": 4.068202552706455e-06, + "loss": 0.3108, + "step": 20 + }, + { + "epoch": 0.07006369426751592, + "learning_rate": 4.249036063781902e-06, + "loss": 1.0176, + "step": 22 + }, + { + "epoch": 0.07643312101910828, + "learning_rate": 4.432724430794775e-06, + "loss": 0.6321, + "step": 24 + }, + { + "epoch": 0.08280254777070063, + "learning_rate": 4.6191764683662625e-06, + "loss": 0.548, + "step": 26 + }, + { + "epoch": 0.08917197452229299, + "learning_rate": 4.8082996191942354e-06, + "loss": 0.3522, + "step": 28 + }, + { + "epoch": 0.09554140127388536, + "learning_rate": 5.000000000000003e-06, + "loss": 0.5196, + "step": 30 + }, + { + "epoch": 0.10191082802547771, + "learning_rate": 5.194182448133163e-06, + "loss": 0.4787, + "step": 32 + }, + { + "epoch": 0.10828025477707007, + "learning_rate": 5.39075056881172e-06, + "loss": 0.568, + "step": 34 + }, + { + "epoch": 0.11464968152866242, + "learning_rate": 5.589606782973682e-06, + "loss": 0.5401, + "step": 36 + }, + { + "epoch": 0.12101910828025478, + "learning_rate": 5.7906523757166475e-06, + "loss": 0.4241, + "step": 38 + }, + { + "epoch": 0.12738853503184713, + "learning_rate": 5.9937875453012e-06, + "loss": 0.1682, + "step": 40 + }, + { + "epoch": 0.1337579617834395, + "learning_rate": 6.198911452693847e-06, + "loss": 0.6789, + "step": 42 + }, + { + "epoch": 0.14012738853503184, + "learning_rate": 6.405922271624865e-06, + "loss": 0.7686, + "step": 44 + }, + { + "epoch": 0.1464968152866242, + "learning_rate": 6.614717239136237e-06, + "loss": 0.5242, + "step": 46 + }, + { + "epoch": 0.15286624203821655, + "learning_rate": 6.8251927065945815e-06, + "loss": 0.5326, + "step": 48 + }, + { + "epoch": 0.1592356687898089, + "learning_rate": 7.037244191143648e-06, + "loss": 0.2171, + "step": 50 + }, + { + "epoch": 0.16560509554140126, + "learning_rate": 7.250766427571185e-06, + "loss": 0.3231, + "step": 52 + }, + { + "epoch": 0.17197452229299362, + "learning_rate": 7.465653420563828e-06, + "loss": 0.6905, + "step": 54 + }, + { + "epoch": 0.17834394904458598, + "learning_rate": 7.68179849732472e-06, + "loss": 0.7773, + "step": 56 + }, + { + "epoch": 0.18471337579617833, + "learning_rate": 7.899094360527221e-06, + "loss": 0.8143, + "step": 58 + }, + { + "epoch": 0.1910828025477707, + "learning_rate": 8.117433141578865e-06, + "loss": 0.1776, + "step": 60 + }, + { + "epoch": 0.19745222929936307, + "learning_rate": 8.336706454168698e-06, + "loss": 0.6091, + "step": 62 + }, + { + "epoch": 0.20382165605095542, + "learning_rate": 8.55680544807173e-06, + "loss": 0.6649, + "step": 64 + }, + { + "epoch": 0.21019108280254778, + "learning_rate": 8.777620863183652e-06, + "loss": 0.4914, + "step": 66 + }, + { + "epoch": 0.21656050955414013, + "learning_rate": 8.99904308375901e-06, + "loss": 0.3618, + "step": 68 + }, + { + "epoch": 0.2229299363057325, + "learning_rate": 9.220962192825959e-06, + "loss": 0.5064, + "step": 70 + }, + { + "epoch": 0.22929936305732485, + "learning_rate": 9.443268026750509e-06, + "loss": 0.6748, + "step": 72 + }, + { + "epoch": 0.2356687898089172, + "learning_rate": 9.665850229923262e-06, + "loss": 0.7058, + "step": 74 + }, + { + "epoch": 0.24203821656050956, + "learning_rate": 9.88859830954135e-06, + "loss": 0.7704, + "step": 76 + }, + { + "epoch": 0.2484076433121019, + "learning_rate": 1.0111401690458642e-05, + "loss": 0.9882, + "step": 78 + }, + { + "epoch": 0.25477707006369427, + "learning_rate": 1.0334149770076732e-05, + "loss": 1.1594, + "step": 80 + }, + { + "epoch": 0.2611464968152866, + "learning_rate": 1.0556731973249482e-05, + "loss": 0.6744, + "step": 82 + }, + { + "epoch": 0.267515923566879, + "learning_rate": 1.0779037807174032e-05, + "loss": 0.4816, + "step": 84 + }, + { + "epoch": 0.27388535031847133, + "learning_rate": 1.1000956916240984e-05, + "loss": 0.2887, + "step": 86 + }, + { + "epoch": 0.2802547770700637, + "learning_rate": 1.1222379136816342e-05, + "loss": 0.348, + "step": 88 + }, + { + "epoch": 0.28662420382165604, + "learning_rate": 1.1443194551928264e-05, + "loss": 0.7011, + "step": 90 + }, + { + "epoch": 0.2929936305732484, + "learning_rate": 1.1663293545831295e-05, + "loss": 0.2513, + "step": 92 + }, + { + "epoch": 0.29936305732484075, + "learning_rate": 1.188256685842113e-05, + "loss": 0.7644, + "step": 94 + }, + { + "epoch": 0.3057324840764331, + "learning_rate": 1.210090563947277e-05, + "loss": 0.3947, + "step": 96 + }, + { + "epoch": 0.31210191082802546, + "learning_rate": 1.2318201502675273e-05, + "loss": 0.3527, + "step": 98 + }, + { + "epoch": 0.3184713375796178, + "learning_rate": 1.2534346579436164e-05, + "loss": 0.2598, + "step": 100 + }, + { + "epoch": 0.3248407643312102, + "learning_rate": 1.274923357242881e-05, + "loss": 0.4266, + "step": 102 + }, + { + "epoch": 0.33121019108280253, + "learning_rate": 1.2962755808856345e-05, + "loss": 0.3997, + "step": 104 + }, + { + "epoch": 0.3375796178343949, + "learning_rate": 1.3174807293405412e-05, + "loss": 0.3758, + "step": 106 + }, + { + "epoch": 0.34394904458598724, + "learning_rate": 1.3385282760863758e-05, + "loss": 0.4305, + "step": 108 + }, + { + "epoch": 0.3503184713375796, + "learning_rate": 1.3594077728375129e-05, + "loss": 0.0364, + "step": 110 + }, + { + "epoch": 0.35668789808917195, + "learning_rate": 1.3801088547306147e-05, + "loss": 0.6869, + "step": 112 + }, + { + "epoch": 0.3630573248407643, + "learning_rate": 1.4006212454698793e-05, + "loss": 0.5869, + "step": 114 + }, + { + "epoch": 0.36942675159235666, + "learning_rate": 1.4209347624283347e-05, + "loss": 0.3358, + "step": 116 + }, + { + "epoch": 0.37579617834394907, + "learning_rate": 1.441039321702631e-05, + "loss": 0.4655, + "step": 118 + }, + { + "epoch": 0.3821656050955414, + "learning_rate": 1.4609249431188274e-05, + "loss": 0.4452, + "step": 120 + }, + { + "epoch": 0.3885350318471338, + "learning_rate": 1.480581755186683e-05, + "loss": 0.7202, + "step": 122 + }, + { + "epoch": 0.39490445859872614, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.618, + "step": 124 + }, + { + "epoch": 0.4012738853503185, + "learning_rate": 1.5191700380805761e-05, + "loss": 0.8044, + "step": 126 + }, + { + "epoch": 0.40764331210191085, + "learning_rate": 1.538082353163373e-05, + "loss": 0.4794, + "step": 128 + }, + { + "epoch": 0.4140127388535032, + "learning_rate": 1.556727556920522e-05, + "loss": 0.3746, + "step": 130 + }, + { + "epoch": 0.42038216560509556, + "learning_rate": 1.5750963936218094e-05, + "loss": 0.3161, + "step": 132 + }, + { + "epoch": 0.4267515923566879, + "learning_rate": 1.593179744729354e-05, + "loss": 0.5384, + "step": 134 + }, + { + "epoch": 0.43312101910828027, + "learning_rate": 1.6109686334241655e-05, + "loss": 0.8099, + "step": 136 + }, + { + "epoch": 0.4394904458598726, + "learning_rate": 1.6284542290623565e-05, + "loss": 0.7089, + "step": 138 + }, + { + "epoch": 0.445859872611465, + "learning_rate": 1.6456278515588023e-05, + "loss": 0.4458, + "step": 140 + }, + { + "epoch": 0.45222929936305734, + "learning_rate": 1.662480975696044e-05, + "loss": 0.76, + "step": 142 + }, + { + "epoch": 0.4585987261146497, + "learning_rate": 1.6790052353563247e-05, + "loss": 0.5168, + "step": 144 + }, + { + "epoch": 0.46496815286624205, + "learning_rate": 1.6951924276746418e-05, + "loss": 0.6594, + "step": 146 + }, + { + "epoch": 0.4713375796178344, + "learning_rate": 1.7110345171107602e-05, + "loss": 0.5252, + "step": 148 + }, + { + "epoch": 0.47770700636942676, + "learning_rate": 1.7265236394381627e-05, + "loss": 0.5808, + "step": 150 + }, + { + "epoch": 0.4840764331210191, + "learning_rate": 1.741652105647958e-05, + "loss": 0.3341, + "step": 152 + }, + { + "epoch": 0.49044585987261147, + "learning_rate": 1.7564124057658057e-05, + "loss": 0.3744, + "step": 154 + }, + { + "epoch": 0.4968152866242038, + "learning_rate": 1.7707972125799738e-05, + "loss": 0.4799, + "step": 156 + }, + { + "epoch": 0.5031847133757962, + "learning_rate": 1.7847993852786612e-05, + "loss": 0.5027, + "step": 158 + }, + { + "epoch": 0.5095541401273885, + "learning_rate": 1.7984119729947937e-05, + "loss": 0.3931, + "step": 160 + }, + { + "epoch": 0.5159235668789809, + "learning_rate": 1.811628218256531e-05, + "loss": 0.3879, + "step": 162 + }, + { + "epoch": 0.5222929936305732, + "learning_rate": 1.8244415603417603e-05, + "loss": 0.4793, + "step": 164 + }, + { + "epoch": 0.5286624203821656, + "learning_rate": 1.836845638534933e-05, + "loss": 0.5282, + "step": 166 + }, + { + "epoch": 0.535031847133758, + "learning_rate": 1.8488342952846074e-05, + "loss": 0.5434, + "step": 168 + }, + { + "epoch": 0.5414012738853503, + "learning_rate": 1.860401579260139e-05, + "loss": 0.5263, + "step": 170 + }, + { + "epoch": 0.5477707006369427, + "learning_rate": 1.8715417483060044e-05, + "loss": 0.7306, + "step": 172 + }, + { + "epoch": 0.554140127388535, + "learning_rate": 1.8822492722922816e-05, + "loss": 0.5666, + "step": 174 + }, + { + "epoch": 0.5605095541401274, + "learning_rate": 1.8925188358598808e-05, + "loss": 0.4222, + "step": 176 + }, + { + "epoch": 0.5668789808917197, + "learning_rate": 1.902345341059163e-05, + "loss": 0.9838, + "step": 178 + }, + { + "epoch": 0.5732484076433121, + "learning_rate": 1.9117239098806296e-05, + "loss": 0.6729, + "step": 180 + }, + { + "epoch": 0.5796178343949044, + "learning_rate": 1.920649886676429e-05, + "loss": 0.419, + "step": 182 + }, + { + "epoch": 0.5859872611464968, + "learning_rate": 1.9291188404714876e-05, + "loss": 0.3516, + "step": 184 + }, + { + "epoch": 0.5923566878980892, + "learning_rate": 1.937126567163103e-05, + "loss": 0.4074, + "step": 186 + }, + { + "epoch": 0.5987261146496815, + "learning_rate": 1.944669091607919e-05, + "loss": 0.4973, + "step": 188 + }, + { + "epoch": 0.6050955414012739, + "learning_rate": 1.9517426695952354e-05, + "loss": 0.3044, + "step": 190 + }, + { + "epoch": 0.6114649681528662, + "learning_rate": 1.9583437897056915e-05, + "loss": 0.7326, + "step": 192 + }, + { + "epoch": 0.6178343949044586, + "learning_rate": 1.964469175054377e-05, + "loss": 0.309, + "step": 194 + }, + { + "epoch": 0.6242038216560509, + "learning_rate": 1.970115784917523e-05, + "loss": 1.0457, + "step": 196 + }, + { + "epoch": 0.6305732484076433, + "learning_rate": 1.975280816241959e-05, + "loss": 0.4953, + "step": 198 + }, + { + "epoch": 0.6369426751592356, + "learning_rate": 1.979961705036587e-05, + "loss": 0.1983, + "step": 200 + }, + { + "epoch": 0.643312101910828, + "learning_rate": 1.9841561276451777e-05, + "loss": 0.5022, + "step": 202 + }, + { + "epoch": 0.6496815286624203, + "learning_rate": 1.9878620018998696e-05, + "loss": 0.3723, + "step": 204 + }, + { + "epoch": 0.6560509554140127, + "learning_rate": 1.9910774881547803e-05, + "loss": 0.3967, + "step": 206 + }, + { + "epoch": 0.6624203821656051, + "learning_rate": 1.993800990199235e-05, + "loss": 0.9567, + "step": 208 + }, + { + "epoch": 0.6687898089171974, + "learning_rate": 1.9960311560501457e-05, + "loss": 0.4802, + "step": 210 + }, + { + "epoch": 0.6751592356687898, + "learning_rate": 1.9977668786231536e-05, + "loss": 0.5766, + "step": 212 + }, + { + "epoch": 0.6815286624203821, + "learning_rate": 1.999007296282201e-05, + "loss": 0.754, + "step": 214 + }, + { + "epoch": 0.6878980891719745, + "learning_rate": 1.9997517932672592e-05, + "loss": 0.617, + "step": 216 + }, + { + "epoch": 0.6942675159235668, + "learning_rate": 2e-05, + "loss": 0.299, + "step": 218 + }, + { + "epoch": 0.7006369426751592, + "learning_rate": 1.9997517932672592e-05, + "loss": 0.3849, + "step": 220 + }, + { + "epoch": 0.7070063694267515, + "learning_rate": 1.999007296282201e-05, + "loss": 0.49, + "step": 222 + }, + { + "epoch": 0.7133757961783439, + "learning_rate": 1.9977668786231536e-05, + "loss": 0.279, + "step": 224 + }, + { + "epoch": 0.7197452229299363, + "learning_rate": 1.9960311560501457e-05, + "loss": 0.3816, + "step": 226 + }, + { + "epoch": 0.7261146496815286, + "learning_rate": 1.993800990199235e-05, + "loss": 0.1561, + "step": 228 + }, + { + "epoch": 0.732484076433121, + "learning_rate": 1.99107748815478e-05, + "loss": 0.1531, + "step": 230 + }, + { + "epoch": 0.7388535031847133, + "learning_rate": 1.9878620018998696e-05, + "loss": 0.5737, + "step": 232 + }, + { + "epoch": 0.7452229299363057, + "learning_rate": 1.984156127645178e-05, + "loss": 0.3855, + "step": 234 + }, + { + "epoch": 0.7515923566878981, + "learning_rate": 1.979961705036587e-05, + "loss": 0.4611, + "step": 236 + }, + { + "epoch": 0.7579617834394905, + "learning_rate": 1.975280816241959e-05, + "loss": 0.5788, + "step": 238 + }, + { + "epoch": 0.7643312101910829, + "learning_rate": 1.9701157849175232e-05, + "loss": 0.5932, + "step": 240 + }, + { + "epoch": 0.7707006369426752, + "learning_rate": 1.9644691750543772e-05, + "loss": 0.3169, + "step": 242 + }, + { + "epoch": 0.7770700636942676, + "learning_rate": 1.958343789705692e-05, + "loss": 0.4777, + "step": 244 + }, + { + "epoch": 0.7834394904458599, + "learning_rate": 1.9517426695952354e-05, + "loss": 0.5903, + "step": 246 + }, + { + "epoch": 0.7898089171974523, + "learning_rate": 1.9446690916079184e-05, + "loss": 0.5089, + "step": 248 + }, + { + "epoch": 0.7961783439490446, + "learning_rate": 1.9371265671631034e-05, + "loss": 0.3621, + "step": 250 + }, + { + "epoch": 0.802547770700637, + "learning_rate": 1.929118840471488e-05, + "loss": 0.5418, + "step": 252 + }, + { + "epoch": 0.8089171974522293, + "learning_rate": 1.9206498866764293e-05, + "loss": 0.4291, + "step": 254 + }, + { + "epoch": 0.8152866242038217, + "learning_rate": 1.9117239098806302e-05, + "loss": 0.1913, + "step": 256 + }, + { + "epoch": 0.821656050955414, + "learning_rate": 1.9023453410591645e-05, + "loss": 0.7233, + "step": 258 + }, + { + "epoch": 0.8280254777070064, + "learning_rate": 1.8925188358598822e-05, + "loss": 0.4016, + "step": 260 + }, + { + "epoch": 0.8343949044585988, + "learning_rate": 1.882249272292283e-05, + "loss": 0.5974, + "step": 262 + }, + { + "epoch": 0.8407643312101911, + "learning_rate": 1.871541748306005e-05, + "loss": 0.7111, + "step": 264 + }, + { + "epoch": 0.8471337579617835, + "learning_rate": 1.8604015792601395e-05, + "loss": 0.3797, + "step": 266 + }, + { + "epoch": 0.8535031847133758, + "learning_rate": 1.8488342952846077e-05, + "loss": 1.1778, + "step": 268 + }, + { + "epoch": 0.8598726114649682, + "learning_rate": 1.8368456385349333e-05, + "loss": 0.2904, + "step": 270 + }, + { + "epoch": 0.8662420382165605, + "learning_rate": 1.824441560341761e-05, + "loss": 0.3213, + "step": 272 + }, + { + "epoch": 0.8726114649681529, + "learning_rate": 1.811628218256532e-05, + "loss": 0.18, + "step": 274 + }, + { + "epoch": 0.8789808917197452, + "learning_rate": 1.798411972994795e-05, + "loss": 0.6601, + "step": 276 + }, + { + "epoch": 0.8853503184713376, + "learning_rate": 1.784799385278662e-05, + "loss": 0.4559, + "step": 278 + }, + { + "epoch": 0.89171974522293, + "learning_rate": 1.770797212579973e-05, + "loss": 0.4516, + "step": 280 + }, + { + "epoch": 0.8980891719745223, + "learning_rate": 1.756412405765805e-05, + "loss": 0.5078, + "step": 282 + }, + { + "epoch": 0.9044585987261147, + "learning_rate": 1.7416521056479573e-05, + "loss": 0.6255, + "step": 284 + }, + { + "epoch": 0.910828025477707, + "learning_rate": 1.7265236394381634e-05, + "loss": 0.3578, + "step": 286 + }, + { + "epoch": 0.9171974522292994, + "learning_rate": 1.711034517110761e-05, + "loss": 0.685, + "step": 288 + }, + { + "epoch": 0.9235668789808917, + "learning_rate": 1.6951924276746425e-05, + "loss": 0.3528, + "step": 290 + }, + { + "epoch": 0.9299363057324841, + "learning_rate": 1.6790052353563254e-05, + "loss": 0.5669, + "step": 292 + }, + { + "epoch": 0.9363057324840764, + "learning_rate": 1.662480975696046e-05, + "loss": 0.6841, + "step": 294 + }, + { + "epoch": 0.9426751592356688, + "learning_rate": 1.6456278515588044e-05, + "loss": 0.4325, + "step": 296 + }, + { + "epoch": 0.9490445859872612, + "learning_rate": 1.6284542290623558e-05, + "loss": 0.4169, + "step": 298 + }, + { + "epoch": 0.9554140127388535, + "learning_rate": 1.6109686334241648e-05, + "loss": 0.4317, + "step": 300 + }, + { + "epoch": 0.9617834394904459, + "learning_rate": 1.593179744729355e-05, + "loss": 0.4362, + "step": 302 + }, + { + "epoch": 0.9681528662420382, + "learning_rate": 1.57509639362181e-05, + "loss": 0.325, + "step": 304 + }, + { + "epoch": 0.9745222929936306, + "learning_rate": 1.5567275569205227e-05, + "loss": 0.54, + "step": 306 + }, + { + "epoch": 0.9808917197452229, + "learning_rate": 1.538082353163374e-05, + "loss": 0.5871, + "step": 308 + }, + { + "epoch": 0.9872611464968153, + "learning_rate": 1.5191700380805768e-05, + "loss": 0.4206, + "step": 310 + }, + { + "epoch": 0.9936305732484076, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.2635, + "step": 312 + }, + { + "epoch": 1.0, + "learning_rate": 1.4805817551866854e-05, + "loss": 0.3167, + "step": 314 + }, + { + "epoch": 1.0, + "step": 314, + "total_flos": 0, + "train_loss": 0.509083763903873, + "train_runtime": 1444.3817, + "train_samples_per_second": 3.478, + "train_steps_per_second": 0.217 + } + ], + "logging_steps": 2, + "max_steps": 314, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..bac90c320bdc65ed3228ba15743e46f7cc95f2cb --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcb1da0d865900a86247a27953a7a0dc5458426437874d62407c4314902d9cf3 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..c82fa57eabbb1325ab160e569626d9cf2739ee1d --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b245521f6ecaab00312ce79cc07193da1fd88f560aabd1fd1ebf5fbc1b629acd +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..898aac76208ed731151bc469ab1e9316fc52cc19 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e48f4f06fb3434c6a8ddf19bb2458cadd666f5af656ea0aa0c3769237d7711b8 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..222e0fabe83ac3406a60faf3408d70a96da18247 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16b762a908c575dd87147925241bacb742eb964233f03206a1aabbc0e55fffa5 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af37ace7a4ae35d60754d149604a6ec89a9d6921 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,1131 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 314, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006369426751592357, + "grad_norm": 10.967390060424805, + "learning_rate": 2.5834789435204156e-06, + "loss": 0.8083, + "step": 2 + }, + { + "epoch": 0.012738853503184714, + "grad_norm": 4.827531814575195, + "learning_rate": 2.73476360561837e-06, + "loss": 0.3808, + "step": 4 + }, + { + "epoch": 0.01910828025477707, + "grad_norm": 10.00290584564209, + "learning_rate": 2.889654828892393e-06, + "loss": 0.5577, + "step": 6 + }, + { + "epoch": 0.025477707006369428, + "grad_norm": 7.657961368560791, + "learning_rate": 3.0480757232535773e-06, + "loss": 0.7111, + "step": 8 + }, + { + "epoch": 0.03184713375796178, + "grad_norm": 14.12212085723877, + "learning_rate": 3.2099476464367486e-06, + "loss": 0.8264, + "step": 10 + }, + { + "epoch": 0.03821656050955414, + "grad_norm": 5.643587112426758, + "learning_rate": 3.3751902430395558e-06, + "loss": 0.8057, + "step": 12 + }, + { + "epoch": 0.044585987261146494, + "grad_norm": 13.343512535095215, + "learning_rate": 3.5437214844119727e-06, + "loss": 0.6541, + "step": 14 + }, + { + "epoch": 0.050955414012738856, + "grad_norm": 8.84065055847168, + "learning_rate": 3.7154577093764287e-06, + "loss": 0.6246, + "step": 16 + }, + { + "epoch": 0.05732484076433121, + "grad_norm": 11.651762962341309, + "learning_rate": 3.890313665758341e-06, + "loss": 0.719, + "step": 18 + }, + { + "epoch": 0.06369426751592357, + "grad_norm": 7.249644756317139, + "learning_rate": 4.068202552706455e-06, + "loss": 0.7934, + "step": 20 + }, + { + "epoch": 0.07006369426751592, + "grad_norm": 16.207679748535156, + "learning_rate": 4.249036063781902e-06, + "loss": 1.0335, + "step": 22 + }, + { + "epoch": 0.07643312101910828, + "grad_norm": 7.816985607147217, + "learning_rate": 4.432724430794775e-06, + "loss": 0.383, + "step": 24 + }, + { + "epoch": 0.08280254777070063, + "grad_norm": 3.924746036529541, + "learning_rate": 4.6191764683662625e-06, + "loss": 0.2819, + "step": 26 + }, + { + "epoch": 0.08917197452229299, + "grad_norm": 3.802884578704834, + "learning_rate": 4.8082996191942354e-06, + "loss": 0.2926, + "step": 28 + }, + { + "epoch": 0.09554140127388536, + "grad_norm": 7.976809978485107, + "learning_rate": 5.000000000000003e-06, + "loss": 0.5368, + "step": 30 + }, + { + "epoch": 0.10191082802547771, + "grad_norm": 27.513673782348633, + "learning_rate": 5.194182448133163e-06, + "loss": 0.7441, + "step": 32 + }, + { + "epoch": 0.10828025477707007, + "grad_norm": 9.224363327026367, + "learning_rate": 5.39075056881172e-06, + "loss": 0.7198, + "step": 34 + }, + { + "epoch": 0.11464968152866242, + "grad_norm": 4.422102928161621, + "learning_rate": 5.589606782973682e-06, + "loss": 0.4043, + "step": 36 + }, + { + "epoch": 0.12101910828025478, + "grad_norm": 4.438572406768799, + "learning_rate": 5.7906523757166475e-06, + "loss": 0.3523, + "step": 38 + }, + { + "epoch": 0.12738853503184713, + "grad_norm": 7.6988115310668945, + "learning_rate": 5.9937875453012e-06, + "loss": 0.442, + "step": 40 + }, + { + "epoch": 0.1337579617834395, + "grad_norm": 11.33991527557373, + "learning_rate": 6.198911452693847e-06, + "loss": 0.5794, + "step": 42 + }, + { + "epoch": 0.14012738853503184, + "grad_norm": 11.485501289367676, + "learning_rate": 6.405922271624865e-06, + "loss": 0.9043, + "step": 44 + }, + { + "epoch": 0.1464968152866242, + "grad_norm": 4.232797145843506, + "learning_rate": 6.614717239136237e-06, + "loss": 0.2157, + "step": 46 + }, + { + "epoch": 0.15286624203821655, + "grad_norm": 9.58480453491211, + "learning_rate": 6.8251927065945815e-06, + "loss": 0.6158, + "step": 48 + }, + { + "epoch": 0.1592356687898089, + "grad_norm": 4.837071895599365, + "learning_rate": 7.037244191143648e-06, + "loss": 0.3622, + "step": 50 + }, + { + "epoch": 0.16560509554140126, + "grad_norm": 25.650348663330078, + "learning_rate": 7.250766427571185e-06, + "loss": 0.8697, + "step": 52 + }, + { + "epoch": 0.17197452229299362, + "grad_norm": 14.62186050415039, + "learning_rate": 7.465653420563828e-06, + "loss": 0.8044, + "step": 54 + }, + { + "epoch": 0.17834394904458598, + "grad_norm": 8.525863647460938, + "learning_rate": 7.68179849732472e-06, + "loss": 1.0154, + "step": 56 + }, + { + "epoch": 0.18471337579617833, + "grad_norm": 4.199936389923096, + "learning_rate": 7.899094360527221e-06, + "loss": 0.6157, + "step": 58 + }, + { + "epoch": 0.1910828025477707, + "grad_norm": 4.920378684997559, + "learning_rate": 8.117433141578865e-06, + "loss": 0.4933, + "step": 60 + }, + { + "epoch": 0.19745222929936307, + "grad_norm": 3.483499765396118, + "learning_rate": 8.336706454168698e-06, + "loss": 0.3853, + "step": 62 + }, + { + "epoch": 0.20382165605095542, + "grad_norm": 10.102392196655273, + "learning_rate": 8.55680544807173e-06, + "loss": 0.7389, + "step": 64 + }, + { + "epoch": 0.21019108280254778, + "grad_norm": 4.874934673309326, + "learning_rate": 8.777620863183652e-06, + "loss": 0.7997, + "step": 66 + }, + { + "epoch": 0.21656050955414013, + "grad_norm": 10.369234085083008, + "learning_rate": 8.99904308375901e-06, + "loss": 0.4694, + "step": 68 + }, + { + "epoch": 0.2229299363057325, + "grad_norm": 6.755327224731445, + "learning_rate": 9.220962192825959e-06, + "loss": 0.643, + "step": 70 + }, + { + "epoch": 0.22929936305732485, + "grad_norm": 9.252140045166016, + "learning_rate": 9.443268026750509e-06, + "loss": 0.7274, + "step": 72 + }, + { + "epoch": 0.2356687898089172, + "grad_norm": 0.9168484807014465, + "learning_rate": 9.665850229923262e-06, + "loss": 0.2262, + "step": 74 + }, + { + "epoch": 0.24203821656050956, + "grad_norm": 11.39022159576416, + "learning_rate": 9.88859830954135e-06, + "loss": 0.8035, + "step": 76 + }, + { + "epoch": 0.2484076433121019, + "grad_norm": 3.0904781818389893, + "learning_rate": 1.0111401690458642e-05, + "loss": 0.3005, + "step": 78 + }, + { + "epoch": 0.25477707006369427, + "grad_norm": 13.052651405334473, + "learning_rate": 1.0334149770076732e-05, + "loss": 0.8611, + "step": 80 + }, + { + "epoch": 0.2611464968152866, + "grad_norm": 9.630870819091797, + "learning_rate": 1.0556731973249482e-05, + "loss": 0.4201, + "step": 82 + }, + { + "epoch": 0.267515923566879, + "grad_norm": 9.032388687133789, + "learning_rate": 1.0779037807174032e-05, + "loss": 0.3791, + "step": 84 + }, + { + "epoch": 0.27388535031847133, + "grad_norm": 7.560876369476318, + "learning_rate": 1.1000956916240984e-05, + "loss": 0.3656, + "step": 86 + }, + { + "epoch": 0.2802547770700637, + "grad_norm": 7.056580543518066, + "learning_rate": 1.1222379136816342e-05, + "loss": 0.5659, + "step": 88 + }, + { + "epoch": 0.28662420382165604, + "grad_norm": 15.845664024353027, + "learning_rate": 1.1443194551928264e-05, + "loss": 0.7557, + "step": 90 + }, + { + "epoch": 0.2929936305732484, + "grad_norm": 10.240681648254395, + "learning_rate": 1.1663293545831295e-05, + "loss": 0.954, + "step": 92 + }, + { + "epoch": 0.29936305732484075, + "grad_norm": 27.095378875732422, + "learning_rate": 1.188256685842113e-05, + "loss": 0.5918, + "step": 94 + }, + { + "epoch": 0.3057324840764331, + "grad_norm": 10.295605659484863, + "learning_rate": 1.210090563947277e-05, + "loss": 0.8789, + "step": 96 + }, + { + "epoch": 0.31210191082802546, + "grad_norm": 8.310393333435059, + "learning_rate": 1.2318201502675273e-05, + "loss": 0.8859, + "step": 98 + }, + { + "epoch": 0.3184713375796178, + "grad_norm": 13.601262092590332, + "learning_rate": 1.2534346579436164e-05, + "loss": 0.5679, + "step": 100 + }, + { + "epoch": 0.3248407643312102, + "grad_norm": 11.069892883300781, + "learning_rate": 1.274923357242881e-05, + "loss": 0.9464, + "step": 102 + }, + { + "epoch": 0.33121019108280253, + "grad_norm": 11.5009183883667, + "learning_rate": 1.2962755808856345e-05, + "loss": 0.8625, + "step": 104 + }, + { + "epoch": 0.3375796178343949, + "grad_norm": 10.298592567443848, + "learning_rate": 1.3174807293405412e-05, + "loss": 0.7258, + "step": 106 + }, + { + "epoch": 0.34394904458598724, + "grad_norm": 6.594852924346924, + "learning_rate": 1.3385282760863758e-05, + "loss": 0.708, + "step": 108 + }, + { + "epoch": 0.3503184713375796, + "grad_norm": 3.0060510635375977, + "learning_rate": 1.3594077728375129e-05, + "loss": 0.3931, + "step": 110 + }, + { + "epoch": 0.35668789808917195, + "grad_norm": 8.377557754516602, + "learning_rate": 1.3801088547306147e-05, + "loss": 0.765, + "step": 112 + }, + { + "epoch": 0.3630573248407643, + "grad_norm": 9.043540000915527, + "learning_rate": 1.4006212454698793e-05, + "loss": 0.5778, + "step": 114 + }, + { + "epoch": 0.36942675159235666, + "grad_norm": 9.072078704833984, + "learning_rate": 1.4209347624283347e-05, + "loss": 0.5493, + "step": 116 + }, + { + "epoch": 0.37579617834394907, + "grad_norm": 9.102341651916504, + "learning_rate": 1.441039321702631e-05, + "loss": 1.1755, + "step": 118 + }, + { + "epoch": 0.3821656050955414, + "grad_norm": 7.582345008850098, + "learning_rate": 1.4609249431188274e-05, + "loss": 0.7363, + "step": 120 + }, + { + "epoch": 0.3885350318471338, + "grad_norm": 5.271243095397949, + "learning_rate": 1.480581755186683e-05, + "loss": 0.5457, + "step": 122 + }, + { + "epoch": 0.39490445859872614, + "grad_norm": 6.142684459686279, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.6939, + "step": 124 + }, + { + "epoch": 0.4012738853503185, + "grad_norm": 8.789036750793457, + "learning_rate": 1.5191700380805761e-05, + "loss": 0.7891, + "step": 126 + }, + { + "epoch": 0.40764331210191085, + "grad_norm": 5.271059989929199, + "learning_rate": 1.538082353163373e-05, + "loss": 0.3996, + "step": 128 + }, + { + "epoch": 0.4140127388535032, + "grad_norm": 4.013172626495361, + "learning_rate": 1.556727556920522e-05, + "loss": 0.5014, + "step": 130 + }, + { + "epoch": 0.42038216560509556, + "grad_norm": 3.9519894123077393, + "learning_rate": 1.5750963936218094e-05, + "loss": 0.8364, + "step": 132 + }, + { + "epoch": 0.4267515923566879, + "grad_norm": 7.547327041625977, + "learning_rate": 1.593179744729354e-05, + "loss": 0.7579, + "step": 134 + }, + { + "epoch": 0.43312101910828027, + "grad_norm": 4.156999588012695, + "learning_rate": 1.6109686334241655e-05, + "loss": 0.4703, + "step": 136 + }, + { + "epoch": 0.4394904458598726, + "grad_norm": 4.95545768737793, + "learning_rate": 1.6284542290623565e-05, + "loss": 0.3763, + "step": 138 + }, + { + "epoch": 0.445859872611465, + "grad_norm": 142.92599487304688, + "learning_rate": 1.6456278515588023e-05, + "loss": 3.0181, + "step": 140 + }, + { + "epoch": 0.45222929936305734, + "grad_norm": 6.379714488983154, + "learning_rate": 1.662480975696044e-05, + "loss": 0.6882, + "step": 142 + }, + { + "epoch": 0.4585987261146497, + "grad_norm": 7.686674118041992, + "learning_rate": 1.6790052353563247e-05, + "loss": 0.5241, + "step": 144 + }, + { + "epoch": 0.46496815286624205, + "grad_norm": 4.402493953704834, + "learning_rate": 1.6951924276746418e-05, + "loss": 0.5151, + "step": 146 + }, + { + "epoch": 0.4713375796178344, + "grad_norm": 4.853139400482178, + "learning_rate": 1.7110345171107602e-05, + "loss": 0.6634, + "step": 148 + }, + { + "epoch": 0.47770700636942676, + "grad_norm": 6.9735260009765625, + "learning_rate": 1.7265236394381627e-05, + "loss": 0.8028, + "step": 150 + }, + { + "epoch": 0.4840764331210191, + "grad_norm": 6.749086856842041, + "learning_rate": 1.741652105647958e-05, + "loss": 0.6335, + "step": 152 + }, + { + "epoch": 0.49044585987261147, + "grad_norm": 34.68336486816406, + "learning_rate": 1.7564124057658057e-05, + "loss": 0.7938, + "step": 154 + }, + { + "epoch": 0.4968152866242038, + "grad_norm": 22.7413387298584, + "learning_rate": 1.7707972125799738e-05, + "loss": 0.6283, + "step": 156 + }, + { + "epoch": 0.5031847133757962, + "grad_norm": 9.641555786132812, + "learning_rate": 1.7847993852786612e-05, + "loss": 0.9091, + "step": 158 + }, + { + "epoch": 0.5095541401273885, + "grad_norm": 7.344991683959961, + "learning_rate": 1.7984119729947937e-05, + "loss": 0.4567, + "step": 160 + }, + { + "epoch": 0.5159235668789809, + "grad_norm": 4.73659610748291, + "learning_rate": 1.811628218256531e-05, + "loss": 0.778, + "step": 162 + }, + { + "epoch": 0.5222929936305732, + "grad_norm": 5.34453010559082, + "learning_rate": 1.8244415603417603e-05, + "loss": 0.3745, + "step": 164 + }, + { + "epoch": 0.5286624203821656, + "grad_norm": 7.497495651245117, + "learning_rate": 1.836845638534933e-05, + "loss": 0.6032, + "step": 166 + }, + { + "epoch": 0.535031847133758, + "grad_norm": 4.8603925704956055, + "learning_rate": 1.8488342952846074e-05, + "loss": 0.4958, + "step": 168 + }, + { + "epoch": 0.5414012738853503, + "grad_norm": 10.660771369934082, + "learning_rate": 1.860401579260139e-05, + "loss": 0.512, + "step": 170 + }, + { + "epoch": 0.5477707006369427, + "grad_norm": 8.473343849182129, + "learning_rate": 1.8715417483060044e-05, + "loss": 0.7816, + "step": 172 + }, + { + "epoch": 0.554140127388535, + "grad_norm": 7.611669540405273, + "learning_rate": 1.8822492722922816e-05, + "loss": 0.6086, + "step": 174 + }, + { + "epoch": 0.5605095541401274, + "grad_norm": 14.955540657043457, + "learning_rate": 1.8925188358598808e-05, + "loss": 0.6231, + "step": 176 + }, + { + "epoch": 0.5668789808917197, + "grad_norm": 6.111566066741943, + "learning_rate": 1.902345341059163e-05, + "loss": 0.3272, + "step": 178 + }, + { + "epoch": 0.5732484076433121, + "grad_norm": 6.98579740524292, + "learning_rate": 1.9117239098806296e-05, + "loss": 0.6097, + "step": 180 + }, + { + "epoch": 0.5796178343949044, + "grad_norm": 12.00461196899414, + "learning_rate": 1.920649886676429e-05, + "loss": 0.8231, + "step": 182 + }, + { + "epoch": 0.5859872611464968, + "grad_norm": 13.3690767288208, + "learning_rate": 1.9291188404714876e-05, + "loss": 0.579, + "step": 184 + }, + { + "epoch": 0.5923566878980892, + "grad_norm": 8.468439102172852, + "learning_rate": 1.937126567163103e-05, + "loss": 0.5412, + "step": 186 + }, + { + "epoch": 0.5987261146496815, + "grad_norm": 8.921483039855957, + "learning_rate": 1.944669091607919e-05, + "loss": 0.6761, + "step": 188 + }, + { + "epoch": 0.6050955414012739, + "grad_norm": 5.962874412536621, + "learning_rate": 1.9517426695952354e-05, + "loss": 0.4777, + "step": 190 + }, + { + "epoch": 0.6114649681528662, + "grad_norm": 6.013513565063477, + "learning_rate": 1.9583437897056915e-05, + "loss": 0.3634, + "step": 192 + }, + { + "epoch": 0.6178343949044586, + "grad_norm": 7.779994010925293, + "learning_rate": 1.964469175054377e-05, + "loss": 0.7477, + "step": 194 + }, + { + "epoch": 0.6242038216560509, + "grad_norm": 4.3088788986206055, + "learning_rate": 1.970115784917523e-05, + "loss": 0.5438, + "step": 196 + }, + { + "epoch": 0.6305732484076433, + "grad_norm": 14.164244651794434, + "learning_rate": 1.975280816241959e-05, + "loss": 0.9717, + "step": 198 + }, + { + "epoch": 0.6369426751592356, + "grad_norm": 3.357556104660034, + "learning_rate": 1.979961705036587e-05, + "loss": 0.4938, + "step": 200 + }, + { + "epoch": 0.643312101910828, + "grad_norm": 4.60561990737915, + "learning_rate": 1.9841561276451777e-05, + "loss": 0.3408, + "step": 202 + }, + { + "epoch": 0.6496815286624203, + "grad_norm": 5.312678337097168, + "learning_rate": 1.9878620018998696e-05, + "loss": 0.4405, + "step": 204 + }, + { + "epoch": 0.6560509554140127, + "grad_norm": 5.059690475463867, + "learning_rate": 1.9910774881547803e-05, + "loss": 0.4156, + "step": 206 + }, + { + "epoch": 0.6624203821656051, + "grad_norm": 6.9592742919921875, + "learning_rate": 1.993800990199235e-05, + "loss": 1.0661, + "step": 208 + }, + { + "epoch": 0.6687898089171974, + "grad_norm": 10.315267562866211, + "learning_rate": 1.9960311560501457e-05, + "loss": 0.6111, + "step": 210 + }, + { + "epoch": 0.6751592356687898, + "grad_norm": 6.091007709503174, + "learning_rate": 1.9977668786231536e-05, + "loss": 0.6001, + "step": 212 + }, + { + "epoch": 0.6815286624203821, + "grad_norm": 8.963953018188477, + "learning_rate": 1.999007296282201e-05, + "loss": 0.8111, + "step": 214 + }, + { + "epoch": 0.6878980891719745, + "grad_norm": 15.050474166870117, + "learning_rate": 1.9997517932672592e-05, + "loss": 0.6209, + "step": 216 + }, + { + "epoch": 0.6942675159235668, + "grad_norm": 4.384186744689941, + "learning_rate": 2e-05, + "loss": 0.4493, + "step": 218 + }, + { + "epoch": 0.7006369426751592, + "grad_norm": 2.46761417388916, + "learning_rate": 1.9997517932672592e-05, + "loss": 0.4532, + "step": 220 + }, + { + "epoch": 0.7070063694267515, + "grad_norm": 7.234741687774658, + "learning_rate": 1.999007296282201e-05, + "loss": 0.6705, + "step": 222 + }, + { + "epoch": 0.7133757961783439, + "grad_norm": 11.147261619567871, + "learning_rate": 1.9977668786231536e-05, + "loss": 0.7441, + "step": 224 + }, + { + "epoch": 0.7197452229299363, + "grad_norm": 4.694805145263672, + "learning_rate": 1.9960311560501457e-05, + "loss": 0.9027, + "step": 226 + }, + { + "epoch": 0.7261146496815286, + "grad_norm": 6.381280899047852, + "learning_rate": 1.993800990199235e-05, + "loss": 0.6166, + "step": 228 + }, + { + "epoch": 0.732484076433121, + "grad_norm": 10.952537536621094, + "learning_rate": 1.99107748815478e-05, + "loss": 0.8709, + "step": 230 + }, + { + "epoch": 0.7388535031847133, + "grad_norm": 8.05765151977539, + "learning_rate": 1.9878620018998696e-05, + "loss": 0.619, + "step": 232 + }, + { + "epoch": 0.7452229299363057, + "grad_norm": 3.9685816764831543, + "learning_rate": 1.984156127645178e-05, + "loss": 1.0332, + "step": 234 + }, + { + "epoch": 0.7515923566878981, + "grad_norm": 8.497997283935547, + "learning_rate": 1.979961705036587e-05, + "loss": 0.9633, + "step": 236 + }, + { + "epoch": 0.7579617834394905, + "grad_norm": 7.018019676208496, + "learning_rate": 1.975280816241959e-05, + "loss": 0.4901, + "step": 238 + }, + { + "epoch": 0.7643312101910829, + "grad_norm": 5.196238994598389, + "learning_rate": 1.9701157849175232e-05, + "loss": 0.4994, + "step": 240 + }, + { + "epoch": 0.7707006369426752, + "grad_norm": 5.84513521194458, + "learning_rate": 1.9644691750543772e-05, + "loss": 0.5337, + "step": 242 + }, + { + "epoch": 0.7770700636942676, + "grad_norm": 3.9592578411102295, + "learning_rate": 1.958343789705692e-05, + "loss": 0.5261, + "step": 244 + }, + { + "epoch": 0.7834394904458599, + "grad_norm": 7.668013572692871, + "learning_rate": 1.9517426695952354e-05, + "loss": 0.4313, + "step": 246 + }, + { + "epoch": 0.7898089171974523, + "grad_norm": 6.829677104949951, + "learning_rate": 1.9446690916079184e-05, + "loss": 0.9799, + "step": 248 + }, + { + "epoch": 0.7961783439490446, + "grad_norm": 3.195507764816284, + "learning_rate": 1.9371265671631034e-05, + "loss": 0.425, + "step": 250 + }, + { + "epoch": 0.802547770700637, + "grad_norm": 2.563486099243164, + "learning_rate": 1.929118840471488e-05, + "loss": 0.362, + "step": 252 + }, + { + "epoch": 0.8089171974522293, + "grad_norm": 4.692176818847656, + "learning_rate": 1.9206498866764293e-05, + "loss": 0.3505, + "step": 254 + }, + { + "epoch": 0.8152866242038217, + "grad_norm": 3.836381673812866, + "learning_rate": 1.9117239098806302e-05, + "loss": 0.4359, + "step": 256 + }, + { + "epoch": 0.821656050955414, + "grad_norm": 5.667571067810059, + "learning_rate": 1.9023453410591645e-05, + "loss": 0.7398, + "step": 258 + }, + { + "epoch": 0.8280254777070064, + "grad_norm": 7.975452899932861, + "learning_rate": 1.8925188358598822e-05, + "loss": 0.6783, + "step": 260 + }, + { + "epoch": 0.8343949044585988, + "grad_norm": 5.269340991973877, + "learning_rate": 1.882249272292283e-05, + "loss": 0.6184, + "step": 262 + }, + { + "epoch": 0.8407643312101911, + "grad_norm": 10.280611038208008, + "learning_rate": 1.871541748306005e-05, + "loss": 0.3653, + "step": 264 + }, + { + "epoch": 0.8471337579617835, + "grad_norm": 2.831383466720581, + "learning_rate": 1.8604015792601395e-05, + "loss": 0.4349, + "step": 266 + }, + { + "epoch": 0.8535031847133758, + "grad_norm": 9.044788360595703, + "learning_rate": 1.8488342952846077e-05, + "loss": 0.4842, + "step": 268 + }, + { + "epoch": 0.8598726114649682, + "grad_norm": 5.392291069030762, + "learning_rate": 1.8368456385349333e-05, + "loss": 0.4772, + "step": 270 + }, + { + "epoch": 0.8662420382165605, + "grad_norm": 2.797391176223755, + "learning_rate": 1.824441560341761e-05, + "loss": 0.3883, + "step": 272 + }, + { + "epoch": 0.8726114649681529, + "grad_norm": 3.6665701866149902, + "learning_rate": 1.811628218256532e-05, + "loss": 0.504, + "step": 274 + }, + { + "epoch": 0.8789808917197452, + "grad_norm": 8.1128511428833, + "learning_rate": 1.798411972994795e-05, + "loss": 0.4383, + "step": 276 + }, + { + "epoch": 0.8853503184713376, + "grad_norm": 6.009725093841553, + "learning_rate": 1.784799385278662e-05, + "loss": 0.4464, + "step": 278 + }, + { + "epoch": 0.89171974522293, + "grad_norm": 2.803722858428955, + "learning_rate": 1.770797212579973e-05, + "loss": 0.6519, + "step": 280 + }, + { + "epoch": 0.8980891719745223, + "grad_norm": 3.778076410293579, + "learning_rate": 1.756412405765805e-05, + "loss": 0.6337, + "step": 282 + }, + { + "epoch": 0.9044585987261147, + "grad_norm": 5.328536510467529, + "learning_rate": 1.7416521056479573e-05, + "loss": 0.7336, + "step": 284 + }, + { + "epoch": 0.910828025477707, + "grad_norm": 5.951910495758057, + "learning_rate": 1.7265236394381634e-05, + "loss": 0.4997, + "step": 286 + }, + { + "epoch": 0.9171974522292994, + "grad_norm": 6.259306907653809, + "learning_rate": 1.711034517110761e-05, + "loss": 0.749, + "step": 288 + }, + { + "epoch": 0.9235668789808917, + "grad_norm": 2.7894022464752197, + "learning_rate": 1.6951924276746425e-05, + "loss": 0.3758, + "step": 290 + }, + { + "epoch": 0.9299363057324841, + "grad_norm": 5.453607082366943, + "learning_rate": 1.6790052353563254e-05, + "loss": 0.5871, + "step": 292 + }, + { + "epoch": 0.9363057324840764, + "grad_norm": 6.539675235748291, + "learning_rate": 1.662480975696046e-05, + "loss": 0.4869, + "step": 294 + }, + { + "epoch": 0.9426751592356688, + "grad_norm": 4.624678134918213, + "learning_rate": 1.6456278515588044e-05, + "loss": 0.3576, + "step": 296 + }, + { + "epoch": 0.9490445859872612, + "grad_norm": 5.726230621337891, + "learning_rate": 1.6284542290623558e-05, + "loss": 2.1993, + "step": 298 + }, + { + "epoch": 0.9554140127388535, + "grad_norm": 6.0250396728515625, + "learning_rate": 1.6109686334241648e-05, + "loss": 0.4139, + "step": 300 + }, + { + "epoch": 0.9617834394904459, + "grad_norm": 5.527227401733398, + "learning_rate": 1.593179744729355e-05, + "loss": 0.4961, + "step": 302 + }, + { + "epoch": 0.9681528662420382, + "grad_norm": 11.022268295288086, + "learning_rate": 1.57509639362181e-05, + "loss": 0.5184, + "step": 304 + }, + { + "epoch": 0.9745222929936306, + "grad_norm": 9.068928718566895, + "learning_rate": 1.5567275569205227e-05, + "loss": 0.6846, + "step": 306 + }, + { + "epoch": 0.9808917197452229, + "grad_norm": 10.308109283447266, + "learning_rate": 1.538082353163374e-05, + "loss": 0.5754, + "step": 308 + }, + { + "epoch": 0.9872611464968153, + "grad_norm": 7.5352935791015625, + "learning_rate": 1.5191700380805768e-05, + "loss": 0.5878, + "step": 310 + }, + { + "epoch": 0.9936305732484076, + "grad_norm": 5.239936828613281, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.479, + "step": 312 + }, + { + "epoch": 1.0, + "grad_norm": 4.027707576751709, + "learning_rate": 1.4805817551866854e-05, + "loss": 0.7548, + "step": 314 + }, + { + "epoch": 1.0, + "step": 314, + "total_flos": 1239016874704896.0, + "train_loss": 0.6360755648202957, + "train_runtime": 2238.6022, + "train_samples_per_second": 2.244, + "train_steps_per_second": 0.14 + } + ], + "logging_steps": 2, + "max_steps": 314, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1239016874704896.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..85f41621f161cdacd25c61ccda3e720858835b87 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb2e81344a610bbb4729cc3bc280f7c12225d1885a5277ec3e35d2e6fc7dec9 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..279fe204b36fa0d0108e7444c00414a560741602 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7c7fbfc175c8049e53d2818ec913b868a0babc466f31266623cd3504a6410bf +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d061524a228369fd97c73588a6389c257c09268 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0e725a6a06be969403b94ee3a866f44b60a87b4c4ad06d3145ffd071f2d4ea6 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3addc4dd1ad854e6e5cb7563afcf070ee97969ff --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:038e898b0d517ae99c3296ba7932ae5f7707c16d3caa882e4bdea4a3cf8e73c8 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3125acef47c907d54c62c6e8f04fa2fdeb5eabde --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,974 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 314, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006369426751592357, + "learning_rate": 2.5834789435204156e-06, + "loss": 1.3838, + "step": 2 + }, + { + "epoch": 0.012738853503184714, + "learning_rate": 2.73476360561837e-06, + "loss": 0.4167, + "step": 4 + }, + { + "epoch": 0.01910828025477707, + "learning_rate": 2.889654828892393e-06, + "loss": 0.7214, + "step": 6 + }, + { + "epoch": 0.025477707006369428, + "learning_rate": 3.0480757232535773e-06, + "loss": 0.5926, + "step": 8 + }, + { + "epoch": 0.03184713375796178, + "learning_rate": 3.2099476464367486e-06, + "loss": 0.8231, + "step": 10 + }, + { + "epoch": 0.03821656050955414, + "learning_rate": 3.3751902430395558e-06, + "loss": 0.6351, + "step": 12 + }, + { + "epoch": 0.044585987261146494, + "learning_rate": 3.5437214844119727e-06, + "loss": 0.8684, + "step": 14 + }, + { + "epoch": 0.050955414012738856, + "learning_rate": 3.7154577093764287e-06, + "loss": 0.4996, + "step": 16 + }, + { + "epoch": 0.05732484076433121, + "learning_rate": 3.890313665758341e-06, + "loss": 0.4307, + "step": 18 + }, + { + "epoch": 0.06369426751592357, + "learning_rate": 4.068202552706455e-06, + "loss": 0.4412, + "step": 20 + }, + { + "epoch": 0.07006369426751592, + "learning_rate": 4.249036063781902e-06, + "loss": 0.5067, + "step": 22 + }, + { + "epoch": 0.07643312101910828, + "learning_rate": 4.432724430794775e-06, + "loss": 0.3978, + "step": 24 + }, + { + "epoch": 0.08280254777070063, + "learning_rate": 4.6191764683662625e-06, + "loss": 0.4606, + "step": 26 + }, + { + "epoch": 0.08917197452229299, + "learning_rate": 4.8082996191942354e-06, + "loss": 0.4328, + "step": 28 + }, + { + "epoch": 0.09554140127388536, + "learning_rate": 5.000000000000003e-06, + "loss": 0.6153, + "step": 30 + }, + { + "epoch": 0.10191082802547771, + "learning_rate": 5.194182448133163e-06, + "loss": 0.894, + "step": 32 + }, + { + "epoch": 0.10828025477707007, + "learning_rate": 5.39075056881172e-06, + "loss": 0.512, + "step": 34 + }, + { + "epoch": 0.11464968152866242, + "learning_rate": 5.589606782973682e-06, + "loss": 0.4806, + "step": 36 + }, + { + "epoch": 0.12101910828025478, + "learning_rate": 5.7906523757166475e-06, + "loss": 0.3867, + "step": 38 + }, + { + "epoch": 0.12738853503184713, + "learning_rate": 5.9937875453012e-06, + "loss": 0.515, + "step": 40 + }, + { + "epoch": 0.1337579617834395, + "learning_rate": 6.198911452693847e-06, + "loss": 0.472, + "step": 42 + }, + { + "epoch": 0.14012738853503184, + "learning_rate": 6.405922271624865e-06, + "loss": 0.7776, + "step": 44 + }, + { + "epoch": 0.1464968152866242, + "learning_rate": 6.614717239136237e-06, + "loss": 0.2591, + "step": 46 + }, + { + "epoch": 0.15286624203821655, + "learning_rate": 6.8251927065945815e-06, + "loss": 0.4697, + "step": 48 + }, + { + "epoch": 0.1592356687898089, + "learning_rate": 7.037244191143648e-06, + "loss": 0.504, + "step": 50 + }, + { + "epoch": 0.16560509554140126, + "learning_rate": 7.250766427571185e-06, + "loss": 0.9246, + "step": 52 + }, + { + "epoch": 0.17197452229299362, + "learning_rate": 7.465653420563828e-06, + "loss": 0.3775, + "step": 54 + }, + { + "epoch": 0.17834394904458598, + "learning_rate": 7.68179849732472e-06, + "loss": 0.7175, + "step": 56 + }, + { + "epoch": 0.18471337579617833, + "learning_rate": 7.899094360527221e-06, + "loss": 0.4747, + "step": 58 + }, + { + "epoch": 0.1910828025477707, + "learning_rate": 8.117433141578865e-06, + "loss": 0.5985, + "step": 60 + }, + { + "epoch": 0.19745222929936307, + "learning_rate": 8.336706454168698e-06, + "loss": 0.3404, + "step": 62 + }, + { + "epoch": 0.20382165605095542, + "learning_rate": 8.55680544807173e-06, + "loss": 0.35, + "step": 64 + }, + { + "epoch": 0.21019108280254778, + "learning_rate": 8.777620863183652e-06, + "loss": 0.7105, + "step": 66 + }, + { + "epoch": 0.21656050955414013, + "learning_rate": 8.99904308375901e-06, + "loss": 0.3712, + "step": 68 + }, + { + "epoch": 0.2229299363057325, + "learning_rate": 9.220962192825959e-06, + "loss": 0.4918, + "step": 70 + }, + { + "epoch": 0.22929936305732485, + "learning_rate": 9.443268026750509e-06, + "loss": 0.6386, + "step": 72 + }, + { + "epoch": 0.2356687898089172, + "learning_rate": 9.665850229923262e-06, + "loss": 0.5113, + "step": 74 + }, + { + "epoch": 0.24203821656050956, + "learning_rate": 9.88859830954135e-06, + "loss": 0.5075, + "step": 76 + }, + { + "epoch": 0.2484076433121019, + "learning_rate": 1.0111401690458642e-05, + "loss": 0.5798, + "step": 78 + }, + { + "epoch": 0.25477707006369427, + "learning_rate": 1.0334149770076732e-05, + "loss": 0.6072, + "step": 80 + }, + { + "epoch": 0.2611464968152866, + "learning_rate": 1.0556731973249482e-05, + "loss": 0.4954, + "step": 82 + }, + { + "epoch": 0.267515923566879, + "learning_rate": 1.0779037807174032e-05, + "loss": 0.653, + "step": 84 + }, + { + "epoch": 0.27388535031847133, + "learning_rate": 1.1000956916240984e-05, + "loss": 0.5771, + "step": 86 + }, + { + "epoch": 0.2802547770700637, + "learning_rate": 1.1222379136816342e-05, + "loss": 0.4746, + "step": 88 + }, + { + "epoch": 0.28662420382165604, + "learning_rate": 1.1443194551928264e-05, + "loss": 0.6626, + "step": 90 + }, + { + "epoch": 0.2929936305732484, + "learning_rate": 1.1663293545831295e-05, + "loss": 0.8747, + "step": 92 + }, + { + "epoch": 0.29936305732484075, + "learning_rate": 1.188256685842113e-05, + "loss": 0.3416, + "step": 94 + }, + { + "epoch": 0.3057324840764331, + "learning_rate": 1.210090563947277e-05, + "loss": 0.4761, + "step": 96 + }, + { + "epoch": 0.31210191082802546, + "learning_rate": 1.2318201502675273e-05, + "loss": 0.5684, + "step": 98 + }, + { + "epoch": 0.3184713375796178, + "learning_rate": 1.2534346579436164e-05, + "loss": 0.7771, + "step": 100 + }, + { + "epoch": 0.3248407643312102, + "learning_rate": 1.274923357242881e-05, + "loss": 0.6231, + "step": 102 + }, + { + "epoch": 0.33121019108280253, + "learning_rate": 1.2962755808856345e-05, + "loss": 0.3676, + "step": 104 + }, + { + "epoch": 0.3375796178343949, + "learning_rate": 1.3174807293405412e-05, + "loss": 0.5425, + "step": 106 + }, + { + "epoch": 0.34394904458598724, + "learning_rate": 1.3385282760863758e-05, + "loss": 0.5188, + "step": 108 + }, + { + "epoch": 0.3503184713375796, + "learning_rate": 1.3594077728375129e-05, + "loss": 0.4089, + "step": 110 + }, + { + "epoch": 0.35668789808917195, + "learning_rate": 1.3801088547306147e-05, + "loss": 0.3555, + "step": 112 + }, + { + "epoch": 0.3630573248407643, + "learning_rate": 1.4006212454698793e-05, + "loss": 0.5001, + "step": 114 + }, + { + "epoch": 0.36942675159235666, + "learning_rate": 1.4209347624283347e-05, + "loss": 0.336, + "step": 116 + }, + { + "epoch": 0.37579617834394907, + "learning_rate": 1.441039321702631e-05, + "loss": 0.8143, + "step": 118 + }, + { + "epoch": 0.3821656050955414, + "learning_rate": 1.4609249431188274e-05, + "loss": 0.346, + "step": 120 + }, + { + "epoch": 0.3885350318471338, + "learning_rate": 1.480581755186683e-05, + "loss": 0.56, + "step": 122 + }, + { + "epoch": 0.39490445859872614, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.5652, + "step": 124 + }, + { + "epoch": 0.4012738853503185, + "learning_rate": 1.5191700380805761e-05, + "loss": 0.32, + "step": 126 + }, + { + "epoch": 0.40764331210191085, + "learning_rate": 1.538082353163373e-05, + "loss": 0.5876, + "step": 128 + }, + { + "epoch": 0.4140127388535032, + "learning_rate": 1.556727556920522e-05, + "loss": 0.3075, + "step": 130 + }, + { + "epoch": 0.42038216560509556, + "learning_rate": 1.5750963936218094e-05, + "loss": 0.5996, + "step": 132 + }, + { + "epoch": 0.4267515923566879, + "learning_rate": 1.593179744729354e-05, + "loss": 0.4651, + "step": 134 + }, + { + "epoch": 0.43312101910828027, + "learning_rate": 1.6109686334241655e-05, + "loss": 0.5205, + "step": 136 + }, + { + "epoch": 0.4394904458598726, + "learning_rate": 1.6284542290623565e-05, + "loss": 0.3501, + "step": 138 + }, + { + "epoch": 0.445859872611465, + "learning_rate": 1.6456278515588023e-05, + "loss": 0.6194, + "step": 140 + }, + { + "epoch": 0.45222929936305734, + "learning_rate": 1.662480975696044e-05, + "loss": 0.4967, + "step": 142 + }, + { + "epoch": 0.4585987261146497, + "learning_rate": 1.6790052353563247e-05, + "loss": 0.3984, + "step": 144 + }, + { + "epoch": 0.46496815286624205, + "learning_rate": 1.6951924276746418e-05, + "loss": 0.6918, + "step": 146 + }, + { + "epoch": 0.4713375796178344, + "learning_rate": 1.7110345171107602e-05, + "loss": 0.9902, + "step": 148 + }, + { + "epoch": 0.47770700636942676, + "learning_rate": 1.7265236394381627e-05, + "loss": 0.4518, + "step": 150 + }, + { + "epoch": 0.4840764331210191, + "learning_rate": 1.741652105647958e-05, + "loss": 0.4963, + "step": 152 + }, + { + "epoch": 0.49044585987261147, + "learning_rate": 1.7564124057658057e-05, + "loss": 0.3386, + "step": 154 + }, + { + "epoch": 0.4968152866242038, + "learning_rate": 1.7707972125799738e-05, + "loss": 0.4369, + "step": 156 + }, + { + "epoch": 0.5031847133757962, + "learning_rate": 1.7847993852786612e-05, + "loss": 0.55, + "step": 158 + }, + { + "epoch": 0.5095541401273885, + "learning_rate": 1.7984119729947937e-05, + "loss": 0.5363, + "step": 160 + }, + { + "epoch": 0.5159235668789809, + "learning_rate": 1.811628218256531e-05, + "loss": 0.3066, + "step": 162 + }, + { + "epoch": 0.5222929936305732, + "learning_rate": 1.8244415603417603e-05, + "loss": 0.3565, + "step": 164 + }, + { + "epoch": 0.5286624203821656, + "learning_rate": 1.836845638534933e-05, + "loss": 0.6966, + "step": 166 + }, + { + "epoch": 0.535031847133758, + "learning_rate": 1.8488342952846074e-05, + "loss": 0.3571, + "step": 168 + }, + { + "epoch": 0.5414012738853503, + "learning_rate": 1.860401579260139e-05, + "loss": 0.7104, + "step": 170 + }, + { + "epoch": 0.5477707006369427, + "learning_rate": 1.8715417483060044e-05, + "loss": 0.4933, + "step": 172 + }, + { + "epoch": 0.554140127388535, + "learning_rate": 1.8822492722922816e-05, + "loss": 0.4154, + "step": 174 + }, + { + "epoch": 0.5605095541401274, + "learning_rate": 1.8925188358598808e-05, + "loss": 0.4303, + "step": 176 + }, + { + "epoch": 0.5668789808917197, + "learning_rate": 1.902345341059163e-05, + "loss": 0.4708, + "step": 178 + }, + { + "epoch": 0.5732484076433121, + "learning_rate": 1.9117239098806296e-05, + "loss": 0.5172, + "step": 180 + }, + { + "epoch": 0.5796178343949044, + "learning_rate": 1.920649886676429e-05, + "loss": 0.6281, + "step": 182 + }, + { + "epoch": 0.5859872611464968, + "learning_rate": 1.9291188404714876e-05, + "loss": 0.4933, + "step": 184 + }, + { + "epoch": 0.5923566878980892, + "learning_rate": 1.937126567163103e-05, + "loss": 0.4513, + "step": 186 + }, + { + "epoch": 0.5987261146496815, + "learning_rate": 1.944669091607919e-05, + "loss": 0.4949, + "step": 188 + }, + { + "epoch": 0.6050955414012739, + "learning_rate": 1.9517426695952354e-05, + "loss": 0.6284, + "step": 190 + }, + { + "epoch": 0.6114649681528662, + "learning_rate": 1.9583437897056915e-05, + "loss": 0.4047, + "step": 192 + }, + { + "epoch": 0.6178343949044586, + "learning_rate": 1.964469175054377e-05, + "loss": 0.6866, + "step": 194 + }, + { + "epoch": 0.6242038216560509, + "learning_rate": 1.970115784917523e-05, + "loss": 0.6026, + "step": 196 + }, + { + "epoch": 0.6305732484076433, + "learning_rate": 1.975280816241959e-05, + "loss": 0.4448, + "step": 198 + }, + { + "epoch": 0.6369426751592356, + "learning_rate": 1.979961705036587e-05, + "loss": 0.4261, + "step": 200 + }, + { + "epoch": 0.643312101910828, + "learning_rate": 1.9841561276451777e-05, + "loss": 0.4357, + "step": 202 + }, + { + "epoch": 0.6496815286624203, + "learning_rate": 1.9878620018998696e-05, + "loss": 0.3806, + "step": 204 + }, + { + "epoch": 0.6560509554140127, + "learning_rate": 1.9910774881547803e-05, + "loss": 0.3888, + "step": 206 + }, + { + "epoch": 0.6624203821656051, + "learning_rate": 1.993800990199235e-05, + "loss": 0.9534, + "step": 208 + }, + { + "epoch": 0.6687898089171974, + "learning_rate": 1.9960311560501457e-05, + "loss": 0.4221, + "step": 210 + }, + { + "epoch": 0.6751592356687898, + "learning_rate": 1.9977668786231536e-05, + "loss": 0.6196, + "step": 212 + }, + { + "epoch": 0.6815286624203821, + "learning_rate": 1.999007296282201e-05, + "loss": 0.4501, + "step": 214 + }, + { + "epoch": 0.6878980891719745, + "learning_rate": 1.9997517932672592e-05, + "loss": 0.3531, + "step": 216 + }, + { + "epoch": 0.6942675159235668, + "learning_rate": 2e-05, + "loss": 0.5846, + "step": 218 + }, + { + "epoch": 0.7006369426751592, + "learning_rate": 1.9997517932672592e-05, + "loss": 0.4323, + "step": 220 + }, + { + "epoch": 0.7070063694267515, + "learning_rate": 1.999007296282201e-05, + "loss": 0.4081, + "step": 222 + }, + { + "epoch": 0.7133757961783439, + "learning_rate": 1.9977668786231536e-05, + "loss": 0.7469, + "step": 224 + }, + { + "epoch": 0.7197452229299363, + "learning_rate": 1.9960311560501457e-05, + "loss": 1.0746, + "step": 226 + }, + { + "epoch": 0.7261146496815286, + "learning_rate": 1.993800990199235e-05, + "loss": 0.6235, + "step": 228 + }, + { + "epoch": 0.732484076433121, + "learning_rate": 1.99107748815478e-05, + "loss": 0.5052, + "step": 230 + }, + { + "epoch": 0.7388535031847133, + "learning_rate": 1.9878620018998696e-05, + "loss": 0.4844, + "step": 232 + }, + { + "epoch": 0.7452229299363057, + "learning_rate": 1.984156127645178e-05, + "loss": 0.4121, + "step": 234 + }, + { + "epoch": 0.7515923566878981, + "learning_rate": 1.979961705036587e-05, + "loss": 0.3291, + "step": 236 + }, + { + "epoch": 0.7579617834394905, + "learning_rate": 1.975280816241959e-05, + "loss": 0.4238, + "step": 238 + }, + { + "epoch": 0.7643312101910829, + "learning_rate": 1.9701157849175232e-05, + "loss": 0.5666, + "step": 240 + }, + { + "epoch": 0.7707006369426752, + "learning_rate": 1.9644691750543772e-05, + "loss": 0.5248, + "step": 242 + }, + { + "epoch": 0.7770700636942676, + "learning_rate": 1.958343789705692e-05, + "loss": 0.4315, + "step": 244 + }, + { + "epoch": 0.7834394904458599, + "learning_rate": 1.9517426695952354e-05, + "loss": 0.6065, + "step": 246 + }, + { + "epoch": 0.7898089171974523, + "learning_rate": 1.9446690916079184e-05, + "loss": 0.391, + "step": 248 + }, + { + "epoch": 0.7961783439490446, + "learning_rate": 1.9371265671631034e-05, + "loss": 0.6801, + "step": 250 + }, + { + "epoch": 0.802547770700637, + "learning_rate": 1.929118840471488e-05, + "loss": 0.4783, + "step": 252 + }, + { + "epoch": 0.8089171974522293, + "learning_rate": 1.9206498866764293e-05, + "loss": 0.4129, + "step": 254 + }, + { + "epoch": 0.8152866242038217, + "learning_rate": 1.9117239098806302e-05, + "loss": 0.3749, + "step": 256 + }, + { + "epoch": 0.821656050955414, + "learning_rate": 1.9023453410591645e-05, + "loss": 0.543, + "step": 258 + }, + { + "epoch": 0.8280254777070064, + "learning_rate": 1.8925188358598822e-05, + "loss": 0.4735, + "step": 260 + }, + { + "epoch": 0.8343949044585988, + "learning_rate": 1.882249272292283e-05, + "loss": 0.488, + "step": 262 + }, + { + "epoch": 0.8407643312101911, + "learning_rate": 1.871541748306005e-05, + "loss": 0.3531, + "step": 264 + }, + { + "epoch": 0.8471337579617835, + "learning_rate": 1.8604015792601395e-05, + "loss": 0.3552, + "step": 266 + }, + { + "epoch": 0.8535031847133758, + "learning_rate": 1.8488342952846077e-05, + "loss": 1.0282, + "step": 268 + }, + { + "epoch": 0.8598726114649682, + "learning_rate": 1.8368456385349333e-05, + "loss": 0.5032, + "step": 270 + }, + { + "epoch": 0.8662420382165605, + "learning_rate": 1.824441560341761e-05, + "loss": 0.4555, + "step": 272 + }, + { + "epoch": 0.8726114649681529, + "learning_rate": 1.811628218256532e-05, + "loss": 0.7456, + "step": 274 + }, + { + "epoch": 0.8789808917197452, + "learning_rate": 1.798411972994795e-05, + "loss": 0.5424, + "step": 276 + }, + { + "epoch": 0.8853503184713376, + "learning_rate": 1.784799385278662e-05, + "loss": 0.7407, + "step": 278 + }, + { + "epoch": 0.89171974522293, + "learning_rate": 1.770797212579973e-05, + "loss": 0.3791, + "step": 280 + }, + { + "epoch": 0.8980891719745223, + "learning_rate": 1.756412405765805e-05, + "loss": 0.3953, + "step": 282 + }, + { + "epoch": 0.9044585987261147, + "learning_rate": 1.7416521056479573e-05, + "loss": 0.5932, + "step": 284 + }, + { + "epoch": 0.910828025477707, + "learning_rate": 1.7265236394381634e-05, + "loss": 0.4036, + "step": 286 + }, + { + "epoch": 0.9171974522292994, + "learning_rate": 1.711034517110761e-05, + "loss": 0.3685, + "step": 288 + }, + { + "epoch": 0.9235668789808917, + "learning_rate": 1.6951924276746425e-05, + "loss": 0.3526, + "step": 290 + }, + { + "epoch": 0.9299363057324841, + "learning_rate": 1.6790052353563254e-05, + "loss": 0.5022, + "step": 292 + }, + { + "epoch": 0.9363057324840764, + "learning_rate": 1.662480975696046e-05, + "loss": 0.3848, + "step": 294 + }, + { + "epoch": 0.9426751592356688, + "learning_rate": 1.6456278515588044e-05, + "loss": 0.3712, + "step": 296 + }, + { + "epoch": 0.9490445859872612, + "learning_rate": 1.6284542290623558e-05, + "loss": 2.371, + "step": 298 + }, + { + "epoch": 0.9554140127388535, + "learning_rate": 1.6109686334241648e-05, + "loss": 0.4082, + "step": 300 + }, + { + "epoch": 0.9617834394904459, + "learning_rate": 1.593179744729355e-05, + "loss": 0.3691, + "step": 302 + }, + { + "epoch": 0.9681528662420382, + "learning_rate": 1.57509639362181e-05, + "loss": 0.4227, + "step": 304 + }, + { + "epoch": 0.9745222929936306, + "learning_rate": 1.5567275569205227e-05, + "loss": 0.3981, + "step": 306 + }, + { + "epoch": 0.9808917197452229, + "learning_rate": 1.538082353163374e-05, + "loss": 0.507, + "step": 308 + }, + { + "epoch": 0.9872611464968153, + "learning_rate": 1.5191700380805768e-05, + "loss": 0.4625, + "step": 310 + }, + { + "epoch": 0.9936305732484076, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.509, + "step": 312 + }, + { + "epoch": 1.0, + "learning_rate": 1.4805817551866854e-05, + "loss": 0.4223, + "step": 314 + }, + { + "epoch": 1.0, + "step": 314, + "total_flos": 1322375011368960.0, + "train_loss": 0.5353610082796425, + "train_runtime": 1276.7534, + "train_samples_per_second": 3.935, + "train_steps_per_second": 0.246 + } + ], + "logging_steps": 2, + "max_steps": 314, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1322375011368960.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a36e814295519ce0460a45ec452ed1e2509dc0dc --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10bb6d107a165c0bac8ee6b44d62bb23c9d77c43e50dce7ea33d1156f7c60ff +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..04f1070c218607117b8ef7cdfdc329d53916c434 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957890eac44a229447322393018abccdc4c4d0a1d2e7c06a74d6bf3f3269c754 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..d01d29f75f459fcee98280b6a183b4cd40e9f0b0 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c728d92eee5b09c526d58b8394a8dce470256e69f80304e4be88e24d6c90d941 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ebf1926e983d5cde6f3e0fbe2573c0925794de6f --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:482fb07cd37f8cce548388c0afd1b8ddef08fb7a8a6b5bf29655d3c58de6d165 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..72d24b14e4e9925d6e21caeadac106654d994377 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,974 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 314, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006369426751592357, + "learning_rate": 2.5834789435204156e-06, + "loss": 0.0437, + "step": 2 + }, + { + "epoch": 0.012738853503184714, + "learning_rate": 2.73476360561837e-06, + "loss": 0.0635, + "step": 4 + }, + { + "epoch": 0.01910828025477707, + "learning_rate": 2.889654828892393e-06, + "loss": 0.3517, + "step": 6 + }, + { + "epoch": 0.025477707006369428, + "learning_rate": 3.0480757232535773e-06, + "loss": 0.8403, + "step": 8 + }, + { + "epoch": 0.03184713375796178, + "learning_rate": 3.2099476464367486e-06, + "loss": 0.0856, + "step": 10 + }, + { + "epoch": 0.03821656050955414, + "learning_rate": 3.3751902430395558e-06, + "loss": 0.2464, + "step": 12 + }, + { + "epoch": 0.044585987261146494, + "learning_rate": 3.5437214844119727e-06, + "loss": 0.0283, + "step": 14 + }, + { + "epoch": 0.050955414012738856, + "learning_rate": 3.7154577093764287e-06, + "loss": 0.4218, + "step": 16 + }, + { + "epoch": 0.05732484076433121, + "learning_rate": 3.890313665758341e-06, + "loss": 0.0523, + "step": 18 + }, + { + "epoch": 0.06369426751592357, + "learning_rate": 4.068202552706455e-06, + "loss": 0.1286, + "step": 20 + }, + { + "epoch": 0.07006369426751592, + "learning_rate": 4.249036063781902e-06, + "loss": 0.2258, + "step": 22 + }, + { + "epoch": 0.07643312101910828, + "learning_rate": 4.432724430794775e-06, + "loss": 0.2625, + "step": 24 + }, + { + "epoch": 0.08280254777070063, + "learning_rate": 4.6191764683662625e-06, + "loss": 0.0699, + "step": 26 + }, + { + "epoch": 0.08917197452229299, + "learning_rate": 4.8082996191942354e-06, + "loss": 1.0868, + "step": 28 + }, + { + "epoch": 0.09554140127388536, + "learning_rate": 5.000000000000003e-06, + "loss": 0.0467, + "step": 30 + }, + { + "epoch": 0.10191082802547771, + "learning_rate": 5.194182448133163e-06, + "loss": 0.0898, + "step": 32 + }, + { + "epoch": 0.10828025477707007, + "learning_rate": 5.39075056881172e-06, + "loss": 0.0576, + "step": 34 + }, + { + "epoch": 0.11464968152866242, + "learning_rate": 5.589606782973682e-06, + "loss": 0.2819, + "step": 36 + }, + { + "epoch": 0.12101910828025478, + "learning_rate": 5.7906523757166475e-06, + "loss": 0.0205, + "step": 38 + }, + { + "epoch": 0.12738853503184713, + "learning_rate": 5.9937875453012e-06, + "loss": 0.2175, + "step": 40 + }, + { + "epoch": 0.1337579617834395, + "learning_rate": 6.198911452693847e-06, + "loss": 0.0992, + "step": 42 + }, + { + "epoch": 0.14012738853503184, + "learning_rate": 6.405922271624865e-06, + "loss": 0.0549, + "step": 44 + }, + { + "epoch": 0.1464968152866242, + "learning_rate": 6.614717239136237e-06, + "loss": 0.1491, + "step": 46 + }, + { + "epoch": 0.15286624203821655, + "learning_rate": 6.8251927065945815e-06, + "loss": 0.8473, + "step": 48 + }, + { + "epoch": 0.1592356687898089, + "learning_rate": 7.037244191143648e-06, + "loss": 0.0447, + "step": 50 + }, + { + "epoch": 0.16560509554140126, + "learning_rate": 7.250766427571185e-06, + "loss": 0.0597, + "step": 52 + }, + { + "epoch": 0.17197452229299362, + "learning_rate": 7.465653420563828e-06, + "loss": 0.0587, + "step": 54 + }, + { + "epoch": 0.17834394904458598, + "learning_rate": 7.68179849732472e-06, + "loss": 0.167, + "step": 56 + }, + { + "epoch": 0.18471337579617833, + "learning_rate": 7.899094360527221e-06, + "loss": 0.0063, + "step": 58 + }, + { + "epoch": 0.1910828025477707, + "learning_rate": 8.117433141578865e-06, + "loss": 0.0427, + "step": 60 + }, + { + "epoch": 0.19745222929936307, + "learning_rate": 8.336706454168698e-06, + "loss": 0.006, + "step": 62 + }, + { + "epoch": 0.20382165605095542, + "learning_rate": 8.55680544807173e-06, + "loss": 0.1898, + "step": 64 + }, + { + "epoch": 0.21019108280254778, + "learning_rate": 8.777620863183652e-06, + "loss": 0.1642, + "step": 66 + }, + { + "epoch": 0.21656050955414013, + "learning_rate": 8.99904308375901e-06, + "loss": 0.0916, + "step": 68 + }, + { + "epoch": 0.2229299363057325, + "learning_rate": 9.220962192825959e-06, + "loss": 0.1389, + "step": 70 + }, + { + "epoch": 0.22929936305732485, + "learning_rate": 9.443268026750509e-06, + "loss": 0.0734, + "step": 72 + }, + { + "epoch": 0.2356687898089172, + "learning_rate": 9.665850229923262e-06, + "loss": 0.0275, + "step": 74 + }, + { + "epoch": 0.24203821656050956, + "learning_rate": 9.88859830954135e-06, + "loss": 0.01, + "step": 76 + }, + { + "epoch": 0.2484076433121019, + "learning_rate": 1.0111401690458642e-05, + "loss": 0.0131, + "step": 78 + }, + { + "epoch": 0.25477707006369427, + "learning_rate": 1.0334149770076732e-05, + "loss": 0.0065, + "step": 80 + }, + { + "epoch": 0.2611464968152866, + "learning_rate": 1.0556731973249482e-05, + "loss": 0.0109, + "step": 82 + }, + { + "epoch": 0.267515923566879, + "learning_rate": 1.0779037807174032e-05, + "loss": 0.4836, + "step": 84 + }, + { + "epoch": 0.27388535031847133, + "learning_rate": 1.1000956916240984e-05, + "loss": 0.2034, + "step": 86 + }, + { + "epoch": 0.2802547770700637, + "learning_rate": 1.1222379136816342e-05, + "loss": 0.0553, + "step": 88 + }, + { + "epoch": 0.28662420382165604, + "learning_rate": 1.1443194551928264e-05, + "loss": 0.5734, + "step": 90 + }, + { + "epoch": 0.2929936305732484, + "learning_rate": 1.1663293545831295e-05, + "loss": 0.2468, + "step": 92 + }, + { + "epoch": 0.29936305732484075, + "learning_rate": 1.188256685842113e-05, + "loss": 0.4644, + "step": 94 + }, + { + "epoch": 0.3057324840764331, + "learning_rate": 1.210090563947277e-05, + "loss": 0.0546, + "step": 96 + }, + { + "epoch": 0.31210191082802546, + "learning_rate": 1.2318201502675273e-05, + "loss": 0.0156, + "step": 98 + }, + { + "epoch": 0.3184713375796178, + "learning_rate": 1.2534346579436164e-05, + "loss": 0.0615, + "step": 100 + }, + { + "epoch": 0.3248407643312102, + "learning_rate": 1.274923357242881e-05, + "loss": 0.0354, + "step": 102 + }, + { + "epoch": 0.33121019108280253, + "learning_rate": 1.2962755808856345e-05, + "loss": 0.342, + "step": 104 + }, + { + "epoch": 0.3375796178343949, + "learning_rate": 1.3174807293405412e-05, + "loss": 0.405, + "step": 106 + }, + { + "epoch": 0.34394904458598724, + "learning_rate": 1.3385282760863758e-05, + "loss": 0.0077, + "step": 108 + }, + { + "epoch": 0.3503184713375796, + "learning_rate": 1.3594077728375129e-05, + "loss": 0.3975, + "step": 110 + }, + { + "epoch": 0.35668789808917195, + "learning_rate": 1.3801088547306147e-05, + "loss": 0.3575, + "step": 112 + }, + { + "epoch": 0.3630573248407643, + "learning_rate": 1.4006212454698793e-05, + "loss": 0.6353, + "step": 114 + }, + { + "epoch": 0.36942675159235666, + "learning_rate": 1.4209347624283347e-05, + "loss": 0.4914, + "step": 116 + }, + { + "epoch": 0.37579617834394907, + "learning_rate": 1.441039321702631e-05, + "loss": 0.3155, + "step": 118 + }, + { + "epoch": 0.3821656050955414, + "learning_rate": 1.4609249431188274e-05, + "loss": 0.5629, + "step": 120 + }, + { + "epoch": 0.3885350318471338, + "learning_rate": 1.480581755186683e-05, + "loss": 0.022, + "step": 122 + }, + { + "epoch": 0.39490445859872614, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.0041, + "step": 124 + }, + { + "epoch": 0.4012738853503185, + "learning_rate": 1.5191700380805761e-05, + "loss": 0.6424, + "step": 126 + }, + { + "epoch": 0.40764331210191085, + "learning_rate": 1.538082353163373e-05, + "loss": 0.1021, + "step": 128 + }, + { + "epoch": 0.4140127388535032, + "learning_rate": 1.556727556920522e-05, + "loss": 0.9117, + "step": 130 + }, + { + "epoch": 0.42038216560509556, + "learning_rate": 1.5750963936218094e-05, + "loss": 0.2738, + "step": 132 + }, + { + "epoch": 0.4267515923566879, + "learning_rate": 1.593179744729354e-05, + "loss": 0.7301, + "step": 134 + }, + { + "epoch": 0.43312101910828027, + "learning_rate": 1.6109686334241655e-05, + "loss": 0.3389, + "step": 136 + }, + { + "epoch": 0.4394904458598726, + "learning_rate": 1.6284542290623565e-05, + "loss": 0.1688, + "step": 138 + }, + { + "epoch": 0.445859872611465, + "learning_rate": 1.6456278515588023e-05, + "loss": 0.2313, + "step": 140 + }, + { + "epoch": 0.45222929936305734, + "learning_rate": 1.662480975696044e-05, + "loss": 0.7155, + "step": 142 + }, + { + "epoch": 0.4585987261146497, + "learning_rate": 1.6790052353563247e-05, + "loss": 0.0314, + "step": 144 + }, + { + "epoch": 0.46496815286624205, + "learning_rate": 1.6951924276746418e-05, + "loss": 0.0323, + "step": 146 + }, + { + "epoch": 0.4713375796178344, + "learning_rate": 1.7110345171107602e-05, + "loss": 0.0467, + "step": 148 + }, + { + "epoch": 0.47770700636942676, + "learning_rate": 1.7265236394381627e-05, + "loss": 0.0883, + "step": 150 + }, + { + "epoch": 0.4840764331210191, + "learning_rate": 1.741652105647958e-05, + "loss": 0.1867, + "step": 152 + }, + { + "epoch": 0.49044585987261147, + "learning_rate": 1.7564124057658057e-05, + "loss": 0.2516, + "step": 154 + }, + { + "epoch": 0.4968152866242038, + "learning_rate": 1.7707972125799738e-05, + "loss": 0.0261, + "step": 156 + }, + { + "epoch": 0.5031847133757962, + "learning_rate": 1.7847993852786612e-05, + "loss": 0.1774, + "step": 158 + }, + { + "epoch": 0.5095541401273885, + "learning_rate": 1.7984119729947937e-05, + "loss": 0.1415, + "step": 160 + }, + { + "epoch": 0.5159235668789809, + "learning_rate": 1.811628218256531e-05, + "loss": 0.4364, + "step": 162 + }, + { + "epoch": 0.5222929936305732, + "learning_rate": 1.8244415603417603e-05, + "loss": 0.15, + "step": 164 + }, + { + "epoch": 0.5286624203821656, + "learning_rate": 1.836845638534933e-05, + "loss": 0.1189, + "step": 166 + }, + { + "epoch": 0.535031847133758, + "learning_rate": 1.8488342952846074e-05, + "loss": 0.0061, + "step": 168 + }, + { + "epoch": 0.5414012738853503, + "learning_rate": 1.860401579260139e-05, + "loss": 0.3801, + "step": 170 + }, + { + "epoch": 0.5477707006369427, + "learning_rate": 1.8715417483060044e-05, + "loss": 0.5063, + "step": 172 + }, + { + "epoch": 0.554140127388535, + "learning_rate": 1.8822492722922816e-05, + "loss": 0.1346, + "step": 174 + }, + { + "epoch": 0.5605095541401274, + "learning_rate": 1.8925188358598808e-05, + "loss": 0.0054, + "step": 176 + }, + { + "epoch": 0.5668789808917197, + "learning_rate": 1.902345341059163e-05, + "loss": 0.2782, + "step": 178 + }, + { + "epoch": 0.5732484076433121, + "learning_rate": 1.9117239098806296e-05, + "loss": 1.3079, + "step": 180 + }, + { + "epoch": 0.5796178343949044, + "learning_rate": 1.920649886676429e-05, + "loss": 0.1065, + "step": 182 + }, + { + "epoch": 0.5859872611464968, + "learning_rate": 1.9291188404714876e-05, + "loss": 0.1146, + "step": 184 + }, + { + "epoch": 0.5923566878980892, + "learning_rate": 1.937126567163103e-05, + "loss": 0.1226, + "step": 186 + }, + { + "epoch": 0.5987261146496815, + "learning_rate": 1.944669091607919e-05, + "loss": 0.032, + "step": 188 + }, + { + "epoch": 0.6050955414012739, + "learning_rate": 1.9517426695952354e-05, + "loss": 0.185, + "step": 190 + }, + { + "epoch": 0.6114649681528662, + "learning_rate": 1.9583437897056915e-05, + "loss": 1.2277, + "step": 192 + }, + { + "epoch": 0.6178343949044586, + "learning_rate": 1.964469175054377e-05, + "loss": 0.66, + "step": 194 + }, + { + "epoch": 0.6242038216560509, + "learning_rate": 1.970115784917523e-05, + "loss": 0.4214, + "step": 196 + }, + { + "epoch": 0.6305732484076433, + "learning_rate": 1.975280816241959e-05, + "loss": 0.3423, + "step": 198 + }, + { + "epoch": 0.6369426751592356, + "learning_rate": 1.979961705036587e-05, + "loss": 0.4321, + "step": 200 + }, + { + "epoch": 0.643312101910828, + "learning_rate": 1.9841561276451777e-05, + "loss": 0.0109, + "step": 202 + }, + { + "epoch": 0.6496815286624203, + "learning_rate": 1.9878620018998696e-05, + "loss": 0.2352, + "step": 204 + }, + { + "epoch": 0.6560509554140127, + "learning_rate": 1.9910774881547803e-05, + "loss": 0.112, + "step": 206 + }, + { + "epoch": 0.6624203821656051, + "learning_rate": 1.993800990199235e-05, + "loss": 0.3111, + "step": 208 + }, + { + "epoch": 0.6687898089171974, + "learning_rate": 1.9960311560501457e-05, + "loss": 0.5812, + "step": 210 + }, + { + "epoch": 0.6751592356687898, + "learning_rate": 1.9977668786231536e-05, + "loss": 0.2126, + "step": 212 + }, + { + "epoch": 0.6815286624203821, + "learning_rate": 1.999007296282201e-05, + "loss": 0.0831, + "step": 214 + }, + { + "epoch": 0.6878980891719745, + "learning_rate": 1.9997517932672592e-05, + "loss": 0.1763, + "step": 216 + }, + { + "epoch": 0.6942675159235668, + "learning_rate": 2e-05, + "loss": 0.3726, + "step": 218 + }, + { + "epoch": 0.7006369426751592, + "learning_rate": 1.9997517932672592e-05, + "loss": 0.141, + "step": 220 + }, + { + "epoch": 0.7070063694267515, + "learning_rate": 1.999007296282201e-05, + "loss": 0.1182, + "step": 222 + }, + { + "epoch": 0.7133757961783439, + "learning_rate": 1.9977668786231536e-05, + "loss": 0.1174, + "step": 224 + }, + { + "epoch": 0.7197452229299363, + "learning_rate": 1.9960311560501457e-05, + "loss": 0.1507, + "step": 226 + }, + { + "epoch": 0.7261146496815286, + "learning_rate": 1.993800990199235e-05, + "loss": 0.0347, + "step": 228 + }, + { + "epoch": 0.732484076433121, + "learning_rate": 1.99107748815478e-05, + "loss": 0.2476, + "step": 230 + }, + { + "epoch": 0.7388535031847133, + "learning_rate": 1.9878620018998696e-05, + "loss": 0.095, + "step": 232 + }, + { + "epoch": 0.7452229299363057, + "learning_rate": 1.984156127645178e-05, + "loss": 0.1634, + "step": 234 + }, + { + "epoch": 0.7515923566878981, + "learning_rate": 1.979961705036587e-05, + "loss": 0.0125, + "step": 236 + }, + { + "epoch": 0.7579617834394905, + "learning_rate": 1.975280816241959e-05, + "loss": 0.3666, + "step": 238 + }, + { + "epoch": 0.7643312101910829, + "learning_rate": 1.9701157849175232e-05, + "loss": 0.1501, + "step": 240 + }, + { + "epoch": 0.7707006369426752, + "learning_rate": 1.9644691750543772e-05, + "loss": 0.2974, + "step": 242 + }, + { + "epoch": 0.7770700636942676, + "learning_rate": 1.958343789705692e-05, + "loss": 0.1029, + "step": 244 + }, + { + "epoch": 0.7834394904458599, + "learning_rate": 1.9517426695952354e-05, + "loss": 0.0726, + "step": 246 + }, + { + "epoch": 0.7898089171974523, + "learning_rate": 1.9446690916079184e-05, + "loss": 0.0602, + "step": 248 + }, + { + "epoch": 0.7961783439490446, + "learning_rate": 1.9371265671631034e-05, + "loss": 2.234, + "step": 250 + }, + { + "epoch": 0.802547770700637, + "learning_rate": 1.929118840471488e-05, + "loss": 1.4173, + "step": 252 + }, + { + "epoch": 0.8089171974522293, + "learning_rate": 1.9206498866764293e-05, + "loss": 1.0029, + "step": 254 + }, + { + "epoch": 0.8152866242038217, + "learning_rate": 1.9117239098806302e-05, + "loss": 0.5414, + "step": 256 + }, + { + "epoch": 0.821656050955414, + "learning_rate": 1.9023453410591645e-05, + "loss": 0.0349, + "step": 258 + }, + { + "epoch": 0.8280254777070064, + "learning_rate": 1.8925188358598822e-05, + "loss": 0.3559, + "step": 260 + }, + { + "epoch": 0.8343949044585988, + "learning_rate": 1.882249272292283e-05, + "loss": 0.4071, + "step": 262 + }, + { + "epoch": 0.8407643312101911, + "learning_rate": 1.871541748306005e-05, + "loss": 0.0814, + "step": 264 + }, + { + "epoch": 0.8471337579617835, + "learning_rate": 1.8604015792601395e-05, + "loss": 0.8049, + "step": 266 + }, + { + "epoch": 0.8535031847133758, + "learning_rate": 1.8488342952846077e-05, + "loss": 0.5639, + "step": 268 + }, + { + "epoch": 0.8598726114649682, + "learning_rate": 1.8368456385349333e-05, + "loss": 0.5041, + "step": 270 + }, + { + "epoch": 0.8662420382165605, + "learning_rate": 1.824441560341761e-05, + "loss": 0.546, + "step": 272 + }, + { + "epoch": 0.8726114649681529, + "learning_rate": 1.811628218256532e-05, + "loss": 0.0711, + "step": 274 + }, + { + "epoch": 0.8789808917197452, + "learning_rate": 1.798411972994795e-05, + "loss": 1.1661, + "step": 276 + }, + { + "epoch": 0.8853503184713376, + "learning_rate": 1.784799385278662e-05, + "loss": 0.7112, + "step": 278 + }, + { + "epoch": 0.89171974522293, + "learning_rate": 1.770797212579973e-05, + "loss": 0.207, + "step": 280 + }, + { + "epoch": 0.8980891719745223, + "learning_rate": 1.756412405765805e-05, + "loss": 0.5363, + "step": 282 + }, + { + "epoch": 0.9044585987261147, + "learning_rate": 1.7416521056479573e-05, + "loss": 0.8558, + "step": 284 + }, + { + "epoch": 0.910828025477707, + "learning_rate": 1.7265236394381634e-05, + "loss": 0.2549, + "step": 286 + }, + { + "epoch": 0.9171974522292994, + "learning_rate": 1.711034517110761e-05, + "loss": 0.0656, + "step": 288 + }, + { + "epoch": 0.9235668789808917, + "learning_rate": 1.6951924276746425e-05, + "loss": 0.1805, + "step": 290 + }, + { + "epoch": 0.9299363057324841, + "learning_rate": 1.6790052353563254e-05, + "loss": 0.234, + "step": 292 + }, + { + "epoch": 0.9363057324840764, + "learning_rate": 1.662480975696046e-05, + "loss": 0.0778, + "step": 294 + }, + { + "epoch": 0.9426751592356688, + "learning_rate": 1.6456278515588044e-05, + "loss": 0.0258, + "step": 296 + }, + { + "epoch": 0.9490445859872612, + "learning_rate": 1.6284542290623558e-05, + "loss": 0.1791, + "step": 298 + }, + { + "epoch": 0.9554140127388535, + "learning_rate": 1.6109686334241648e-05, + "loss": 0.3059, + "step": 300 + }, + { + "epoch": 0.9617834394904459, + "learning_rate": 1.593179744729355e-05, + "loss": 0.4586, + "step": 302 + }, + { + "epoch": 0.9681528662420382, + "learning_rate": 1.57509639362181e-05, + "loss": 0.1173, + "step": 304 + }, + { + "epoch": 0.9745222929936306, + "learning_rate": 1.5567275569205227e-05, + "loss": 0.0107, + "step": 306 + }, + { + "epoch": 0.9808917197452229, + "learning_rate": 1.538082353163374e-05, + "loss": 1.0589, + "step": 308 + }, + { + "epoch": 0.9872611464968153, + "learning_rate": 1.5191700380805768e-05, + "loss": 0.5083, + "step": 310 + }, + { + "epoch": 0.9936305732484076, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.3787, + "step": 312 + }, + { + "epoch": 1.0, + "learning_rate": 1.4805817551866854e-05, + "loss": 0.1243, + "step": 314 + }, + { + "epoch": 1.0, + "step": 314, + "total_flos": 1384288393625600.0, + "train_loss": 0.28617319338332126, + "train_runtime": 1575.6528, + "train_samples_per_second": 3.189, + "train_steps_per_second": 0.199 + } + ], + "logging_steps": 2, + "max_steps": 314, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1384288393625600.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..91a458b55dbe1d59e6bba3080622d48b6107427c --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3b6809f72af3459a8bfb00453fa0fd9ca5d1cde65ac1977013093efde04fb94 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3fecc81c2e91ec492443bf26d0fccc31283b70aa --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831935745a767aff6567bd96b61c8084b4afc2327500e40a130fca7e024012c +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0a93e355ff0d194777e6cac213af7d948e16d7f5 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84cc3307cc3cba1b296b0da6d7a76813d8100d33676a8752931ef762e37c90ce +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6dd5eb4a6988c9a2f3d27bb1443d8ecd096d1b1 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_03125_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a07d3ce27b76176ebf16341ed50dd4062f55c99df2d42d2db6bd9474018113e +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..69f952f6a9ed0c2e9586db52a130e3ae18b1d265 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,1904 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.1856, + "step": 2 + }, + { + "epoch": 0.0064, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.2075, + "step": 4 + }, + { + "epoch": 0.0096, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.2234, + "step": 6 + }, + { + "epoch": 0.0128, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.2668, + "step": 8 + }, + { + "epoch": 0.016, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.1002, + "step": 10 + }, + { + "epoch": 0.0192, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.0876, + "step": 12 + }, + { + "epoch": 0.0224, + "learning_rate": 2.909196119613218e-06, + "loss": 0.0139, + "step": 14 + }, + { + "epoch": 0.0256, + "learning_rate": 2.988471213428035e-06, + "loss": 0.0248, + "step": 16 + }, + { + "epoch": 0.0288, + "learning_rate": 3.068622692984767e-06, + "loss": 0.0242, + "step": 18 + }, + { + "epoch": 0.032, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.3101, + "step": 20 + }, + { + "epoch": 0.0352, + "learning_rate": 3.231514627826302e-06, + "loss": 0.1165, + "step": 22 + }, + { + "epoch": 0.0384, + "learning_rate": 3.314234722905302e-06, + "loss": 0.5796, + "step": 24 + }, + { + "epoch": 0.0416, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.0059, + "step": 26 + }, + { + "epoch": 0.0448, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.1573, + "step": 28 + }, + { + "epoch": 0.048, + "learning_rate": 3.567367137003953e-06, + "loss": 0.2623, + "step": 30 + }, + { + "epoch": 0.0512, + "learning_rate": 3.653366829451711e-06, + "loss": 0.3131, + "step": 32 + }, + { + "epoch": 0.0544, + "learning_rate": 3.740159800938784e-06, + "loss": 0.8242, + "step": 34 + }, + { + "epoch": 0.0576, + "learning_rate": 3.827735203028956e-06, + "loss": 0.0827, + "step": 36 + }, + { + "epoch": 0.0608, + "learning_rate": 3.916082089488379e-06, + "loss": 0.2051, + "step": 38 + }, + { + "epoch": 0.064, + "learning_rate": 4.005189417653737e-06, + "loss": 0.1269, + "step": 40 + }, + { + "epoch": 0.0672, + "learning_rate": 4.095046049812541e-06, + "loss": 0.2132, + "step": 42 + }, + { + "epoch": 0.0704, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.333, + "step": 44 + }, + { + "epoch": 0.0736, + "learning_rate": 4.276962208378814e-06, + "loss": 0.0089, + "step": 46 + }, + { + "epoch": 0.0768, + "learning_rate": 4.368998996702686e-06, + "loss": 0.4542, + "step": 48 + }, + { + "epoch": 0.08, + "learning_rate": 4.461739615694921e-06, + "loss": 0.5164, + "step": 50 + }, + { + "epoch": 0.0832, + "learning_rate": 4.555172473510324e-06, + "loss": 0.3665, + "step": 52 + }, + { + "epoch": 0.0864, + "learning_rate": 4.649285891779326e-06, + "loss": 0.1527, + "step": 54 + }, + { + "epoch": 0.0896, + "learning_rate": 4.744068107067673e-06, + "loss": 0.1667, + "step": 56 + }, + { + "epoch": 0.0928, + "learning_rate": 4.839507272346751e-06, + "loss": 0.1369, + "step": 58 + }, + { + "epoch": 0.096, + "learning_rate": 4.935591458474425e-06, + "loss": 0.2281, + "step": 60 + }, + { + "epoch": 0.0992, + "learning_rate": 5.032308655686007e-06, + "loss": 0.1405, + "step": 62 + }, + { + "epoch": 0.1024, + "learning_rate": 5.129646775095432e-06, + "loss": 0.0045, + "step": 64 + }, + { + "epoch": 0.1056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.7821, + "step": 66 + }, + { + "epoch": 0.1088, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.4551, + "step": 68 + }, + { + "epoch": 0.112, + "learning_rate": 5.425264622628326e-06, + "loss": 0.0533, + "step": 70 + }, + { + "epoch": 0.1152, + "learning_rate": 5.524964012628644e-06, + "loss": 0.0006, + "step": 72 + }, + { + "epoch": 0.1184, + "learning_rate": 5.62522274679673e-06, + "loss": 0.0159, + "step": 74 + }, + { + "epoch": 0.1216, + "learning_rate": 5.726028293582342e-06, + "loss": 0.1656, + "step": 76 + }, + { + "epoch": 0.1248, + "learning_rate": 5.827368053088032e-06, + "loss": 0.7154, + "step": 78 + }, + { + "epoch": 0.128, + "learning_rate": 5.929229358643925e-06, + "loss": 0.1261, + "step": 80 + }, + { + "epoch": 0.1312, + "learning_rate": 6.03159947839103e-06, + "loss": 0.5616, + "step": 82 + }, + { + "epoch": 0.1344, + "learning_rate": 6.13446561687258e-06, + "loss": 0.3386, + "step": 84 + }, + { + "epoch": 0.1376, + "learning_rate": 6.237814916633431e-06, + "loss": 0.4369, + "step": 86 + }, + { + "epoch": 0.1408, + "learning_rate": 6.341634459827044e-06, + "loss": 0.3947, + "step": 88 + }, + { + "epoch": 0.144, + "learning_rate": 6.445911269830183e-06, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.1472, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.3838, + "step": 92 + }, + { + "epoch": 0.1504, + "learning_rate": 6.655784499627476e-06, + "loss": 0.9755, + "step": 94 + }, + { + "epoch": 0.1536, + "learning_rate": 6.761354686924883e-06, + "loss": 0.0549, + "step": 96 + }, + { + "epoch": 0.1568, + "learning_rate": 6.867329679317144e-06, + "loss": 0.4365, + "step": 98 + }, + { + "epoch": 0.16, + "learning_rate": 6.973696230766884e-06, + "loss": 0.4253, + "step": 100 + }, + { + "epoch": 0.1632, + "learning_rate": 7.080441046294945e-06, + "loss": 0.06, + "step": 102 + }, + { + "epoch": 0.1664, + "learning_rate": 7.18755078364214e-06, + "loss": 0.4592, + "step": 104 + }, + { + "epoch": 0.1696, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.2125, + "step": 106 + }, + { + "epoch": 0.1728, + "learning_rate": 7.402811428368824e-06, + "loss": 0.3792, + "step": 108 + }, + { + "epoch": 0.176, + "learning_rate": 7.510935429867233e-06, + "loss": 0.1535, + "step": 110 + }, + { + "epoch": 0.1792, + "learning_rate": 7.619370544785608e-06, + "loss": 0.1375, + "step": 112 + }, + { + "epoch": 0.1824, + "learning_rate": 7.728103219590684e-06, + "loss": 0.1129, + "step": 114 + }, + { + "epoch": 0.1856, + "learning_rate": 7.83711986355656e-06, + "loss": 0.1558, + "step": 116 + }, + { + "epoch": 0.1888, + "learning_rate": 7.946406850463435e-06, + "loss": 0.4794, + "step": 118 + }, + { + "epoch": 0.192, + "learning_rate": 8.055950520300756e-06, + "loss": 0.0791, + "step": 120 + }, + { + "epoch": 0.1952, + "learning_rate": 8.165737180974676e-06, + "loss": 0.0991, + "step": 122 + }, + { + "epoch": 0.1984, + "learning_rate": 8.275753110019367e-06, + "loss": 0.1581, + "step": 124 + }, + { + "epoch": 0.2016, + "learning_rate": 8.385984556312285e-06, + "loss": 0.4554, + "step": 126 + }, + { + "epoch": 0.2048, + "learning_rate": 8.496417741792922e-06, + "loss": 0.0035, + "step": 128 + }, + { + "epoch": 0.208, + "learning_rate": 8.607038863184952e-06, + "loss": 0.1598, + "step": 130 + }, + { + "epoch": 0.2112, + "learning_rate": 8.717834093721598e-06, + "loss": 0.0816, + "step": 132 + }, + { + "epoch": 0.2144, + "learning_rate": 8.828789584873757e-06, + "loss": 0.2173, + "step": 134 + }, + { + "epoch": 0.2176, + "learning_rate": 8.939891468081036e-06, + "loss": 0.5008, + "step": 136 + }, + { + "epoch": 0.2208, + "learning_rate": 9.051125856485175e-06, + "loss": 0.2191, + "step": 138 + }, + { + "epoch": 0.224, + "learning_rate": 9.162478846665854e-06, + "loss": 0.0324, + "step": 140 + }, + { + "epoch": 0.2272, + "learning_rate": 9.273936520378426e-06, + "loss": 0.1816, + "step": 142 + }, + { + "epoch": 0.2304, + "learning_rate": 9.38548494629364e-06, + "loss": 0.3324, + "step": 144 + }, + { + "epoch": 0.2336, + "learning_rate": 9.497110181738935e-06, + "loss": 0.0182, + "step": 146 + }, + { + "epoch": 0.2368, + "learning_rate": 9.608798274441153e-06, + "loss": 0.0734, + "step": 148 + }, + { + "epoch": 0.24, + "learning_rate": 9.720535264270526e-06, + "loss": 0.0685, + "step": 150 + }, + { + "epoch": 0.2432, + "learning_rate": 9.832307184985473e-06, + "loss": 0.0739, + "step": 152 + }, + { + "epoch": 0.2464, + "learning_rate": 9.944100065978354e-06, + "loss": 0.1421, + "step": 154 + }, + { + "epoch": 0.2496, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.8392, + "step": 156 + }, + { + "epoch": 0.2528, + "learning_rate": 1.016769281501452e-05, + "loss": 0.043, + "step": 158 + }, + { + "epoch": 0.256, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.0865, + "step": 160 + }, + { + "epoch": 0.2592, + "learning_rate": 1.039120172555884e-05, + "loss": 0.0151, + "step": 162 + }, + { + "epoch": 0.2624, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.1952, + "step": 164 + }, + { + "epoch": 0.2656, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.4652, + "step": 166 + }, + { + "epoch": 0.2688, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.3148, + "step": 168 + }, + { + "epoch": 0.272, + "learning_rate": 1.083752115333414e-05, + "loss": 0.2418, + "step": 170 + }, + { + "epoch": 0.2752, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.1, + "step": 172 + }, + { + "epoch": 0.2784, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.431, + "step": 174 + }, + { + "epoch": 0.2816, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.3789, + "step": 176 + }, + { + "epoch": 0.2848, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.2438, + "step": 178 + }, + { + "epoch": 0.288, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.087, + "step": 180 + }, + { + "epoch": 0.2912, + "learning_rate": 1.150358225820707e-05, + "loss": 0.7962, + "step": 182 + }, + { + "epoch": 0.2944, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.3676, + "step": 184 + }, + { + "epoch": 0.2976, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.0204, + "step": 186 + }, + { + "epoch": 0.3008, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.0862, + "step": 188 + }, + { + "epoch": 0.304, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.1435, + "step": 190 + }, + { + "epoch": 0.3072, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.4801, + "step": 192 + }, + { + "epoch": 0.3104, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.0117, + "step": 194 + }, + { + "epoch": 0.3136, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.1483, + "step": 196 + }, + { + "epoch": 0.3168, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.5002, + "step": 198 + }, + { + "epoch": 0.32, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.2566, + "step": 200 + }, + { + "epoch": 0.3232, + "learning_rate": 1.259718857163117e-05, + "loss": 0.0124, + "step": 202 + }, + { + "epoch": 0.3264, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.4494, + "step": 204 + }, + { + "epoch": 0.3296, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.2693, + "step": 206 + }, + { + "epoch": 0.3328, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.2243, + "step": 208 + }, + { + "epoch": 0.336, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.0728, + "step": 210 + }, + { + "epoch": 0.3392, + "learning_rate": 1.313267032068285e-05, + "loss": 0.0464, + "step": 212 + }, + { + "epoch": 0.3424, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.5131, + "step": 214 + }, + { + "epoch": 0.3456, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.109, + "step": 216 + }, + { + "epoch": 0.3488, + "learning_rate": 1.344936768713513e-05, + "loss": 0.3551, + "step": 218 + }, + { + "epoch": 0.352, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.2274, + "step": 220 + }, + { + "epoch": 0.3552, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.3262, + "step": 222 + }, + { + "epoch": 0.3584, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.0089, + "step": 224 + }, + { + "epoch": 0.3616, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.2882, + "step": 226 + }, + { + "epoch": 0.3648, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.1762, + "step": 228 + }, + { + "epoch": 0.368, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.0952, + "step": 230 + }, + { + "epoch": 0.3712, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.519, + "step": 232 + }, + { + "epoch": 0.3744, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.0641, + "step": 234 + }, + { + "epoch": 0.3776, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.0274, + "step": 236 + }, + { + "epoch": 0.3808, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.5089, + "step": 238 + }, + { + "epoch": 0.384, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.3623, + "step": 240 + }, + { + "epoch": 0.3872, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.2898, + "step": 242 + }, + { + "epoch": 0.3904, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.9582, + "step": 244 + }, + { + "epoch": 0.3936, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.1007, + "step": 246 + }, + { + "epoch": 0.3968, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.39, + "step": 248 + }, + { + "epoch": 0.4, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.0082, + "step": 250 + }, + { + "epoch": 0.4032, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.2143, + "step": 252 + }, + { + "epoch": 0.4064, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.6342, + "step": 254 + }, + { + "epoch": 0.4096, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.5303, + "step": 256 + }, + { + "epoch": 0.4128, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.459, + "step": 258 + }, + { + "epoch": 0.416, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.434, + "step": 260 + }, + { + "epoch": 0.4192, + "learning_rate": 1.563100100329731e-05, + "loss": 0.0634, + "step": 262 + }, + { + "epoch": 0.4224, + "learning_rate": 1.572303779162118e-05, + "loss": 0.0668, + "step": 264 + }, + { + "epoch": 0.4256, + "learning_rate": 1.581435924540481e-05, + "loss": 0.2634, + "step": 266 + }, + { + "epoch": 0.4288, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.4033, + "step": 268 + }, + { + "epoch": 0.432, + "learning_rate": 1.599481058234626e-05, + "loss": 0.0395, + "step": 270 + }, + { + "epoch": 0.4352, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.2896, + "step": 272 + }, + { + "epoch": 0.4384, + "learning_rate": 1.617226479697104e-05, + "loss": 0.3215, + "step": 274 + }, + { + "epoch": 0.4416, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.1937, + "step": 276 + }, + { + "epoch": 0.4448, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.4305, + "step": 278 + }, + { + "epoch": 0.448, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.019, + "step": 280 + }, + { + "epoch": 0.4512, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.548, + "step": 282 + }, + { + "epoch": 0.4544, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.4799, + "step": 284 + }, + { + "epoch": 0.4576, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.3917, + "step": 286 + }, + { + "epoch": 0.4608, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.6199, + "step": 288 + }, + { + "epoch": 0.464, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.8202, + "step": 290 + }, + { + "epoch": 0.4672, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.0064, + "step": 292 + }, + { + "epoch": 0.4704, + "learning_rate": 1.701152878657196e-05, + "loss": 0.13, + "step": 294 + }, + { + "epoch": 0.4736, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.9176, + "step": 296 + }, + { + "epoch": 0.4768, + "learning_rate": 1.716919267969883e-05, + "loss": 0.6146, + "step": 298 + }, + { + "epoch": 0.48, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.142, + "step": 300 + }, + { + "epoch": 0.4832, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.613, + "step": 302 + }, + { + "epoch": 0.4864, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.1759, + "step": 304 + }, + { + "epoch": 0.4896, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.3598, + "step": 306 + }, + { + "epoch": 0.4928, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.7282, + "step": 308 + }, + { + "epoch": 0.496, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.0885, + "step": 310 + }, + { + "epoch": 0.4992, + "learning_rate": 1.7692289262315e-05, + "loss": 0.3294, + "step": 312 + }, + { + "epoch": 0.5024, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.0262, + "step": 314 + }, + { + "epoch": 0.5056, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.1026, + "step": 316 + }, + { + "epoch": 0.5088, + "learning_rate": 1.790223530721933e-05, + "loss": 0.1589, + "step": 318 + }, + { + "epoch": 0.512, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.7074, + "step": 320 + }, + { + "epoch": 0.5152, + "learning_rate": 1.803727533238257e-05, + "loss": 0.4914, + "step": 322 + }, + { + "epoch": 0.5184, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.1835, + "step": 324 + }, + { + "epoch": 0.5216, + "learning_rate": 1.816829709926509e-05, + "loss": 0.0233, + "step": 326 + }, + { + "epoch": 0.5248, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.0566, + "step": 328 + }, + { + "epoch": 0.528, + "learning_rate": 1.829523510316813e-05, + "loss": 0.6869, + "step": 330 + }, + { + "epoch": 0.5312, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.1445, + "step": 332 + }, + { + "epoch": 0.5344, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.7789, + "step": 334 + }, + { + "epoch": 0.5376, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.5204, + "step": 336 + }, + { + "epoch": 0.5408, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.1039, + "step": 338 + }, + { + "epoch": 0.544, + "learning_rate": 1.85943022840117e-05, + "loss": 0.4561, + "step": 340 + }, + { + "epoch": 0.5472, + "learning_rate": 1.865092230467769e-05, + "loss": 0.0233, + "step": 342 + }, + { + "epoch": 0.5504, + "learning_rate": 1.87064610283551e-05, + "loss": 0.1351, + "step": 344 + }, + { + "epoch": 0.5536, + "learning_rate": 1.876091151314196e-05, + "loss": 0.0252, + "step": 346 + }, + { + "epoch": 0.5568, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.3327, + "step": 348 + }, + { + "epoch": 0.56, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.3069, + "step": 350 + }, + { + "epoch": 0.5632, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.4257, + "step": 352 + }, + { + "epoch": 0.5664, + "learning_rate": 1.896769700383315e-05, + "loss": 0.5447, + "step": 354 + }, + { + "epoch": 0.5696, + "learning_rate": 1.901660695579585e-05, + "loss": 0.024, + "step": 356 + }, + { + "epoch": 0.5728, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.5923, + "step": 358 + }, + { + "epoch": 0.576, + "learning_rate": 1.911103987318148e-05, + "loss": 0.8732, + "step": 360 + }, + { + "epoch": 0.5792, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.4092, + "step": 362 + }, + { + "epoch": 0.5824, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.4093, + "step": 364 + }, + { + "epoch": 0.5856, + "learning_rate": 1.924413432409622e-05, + "loss": 0.0291, + "step": 366 + }, + { + "epoch": 0.5888, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.3259, + "step": 368 + }, + { + "epoch": 0.592, + "learning_rate": 1.932709598214825e-05, + "loss": 0.3957, + "step": 370 + }, + { + "epoch": 0.5952, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.5752, + "step": 372 + }, + { + "epoch": 0.5984, + "learning_rate": 1.940539453247842e-05, + "loss": 0.4475, + "step": 374 + }, + { + "epoch": 0.6016, + "learning_rate": 1.944278281764342e-05, + "loss": 0.004, + "step": 376 + }, + { + "epoch": 0.6048, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.1485, + "step": 378 + }, + { + "epoch": 0.608, + "learning_rate": 1.951401404235505e-05, + "loss": 0.035, + "step": 380 + }, + { + "epoch": 0.6112, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.2546, + "step": 382 + }, + { + "epoch": 0.6144, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.0937, + "step": 384 + }, + { + "epoch": 0.6176, + "learning_rate": 1.961193185426459e-05, + "loss": 0.2251, + "step": 386 + }, + { + "epoch": 0.6208, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.6789, + "step": 388 + }, + { + "epoch": 0.624, + "learning_rate": 1.967121011775546e-05, + "loss": 0.06, + "step": 390 + }, + { + "epoch": 0.6272, + "learning_rate": 1.969903782680467e-05, + "loss": 0.0109, + "step": 392 + }, + { + "epoch": 0.6304, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.1111, + "step": 394 + }, + { + "epoch": 0.6336, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.1314, + "step": 396 + }, + { + "epoch": 0.6368, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.57, + "step": 398 + }, + { + "epoch": 0.64, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.5711, + "step": 400 + }, + { + "epoch": 0.6432, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.1568, + "step": 402 + }, + { + "epoch": 0.6464, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.5302, + "step": 404 + }, + { + "epoch": 0.6496, + "learning_rate": 1.985971166354357e-05, + "loss": 0.0343, + "step": 406 + }, + { + "epoch": 0.6528, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.0959, + "step": 408 + }, + { + "epoch": 0.656, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0006, + "step": 410 + }, + { + "epoch": 0.6592, + "learning_rate": 1.99101396518405e-05, + "loss": 0.3317, + "step": 412 + }, + { + "epoch": 0.6624, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.1517, + "step": 414 + }, + { + "epoch": 0.6656, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.2894, + "step": 416 + }, + { + "epoch": 0.6688, + "learning_rate": 1.994942036613787e-05, + "loss": 0.479, + "step": 418 + }, + { + "epoch": 0.672, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.8059, + "step": 420 + }, + { + "epoch": 0.6752, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.6762, + "step": 422 + }, + { + "epoch": 0.6784, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.0211, + "step": 424 + }, + { + "epoch": 0.6816, + "learning_rate": 1.998437989229673e-05, + "loss": 0.1284, + "step": 426 + }, + { + "epoch": 0.6848, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.4959, + "step": 428 + }, + { + "epoch": 0.688, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.9201, + "step": 430 + }, + { + "epoch": 0.6912, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.4045, + "step": 432 + }, + { + "epoch": 0.6944, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.6011, + "step": 434 + }, + { + "epoch": 0.6976, + "learning_rate": 2e-05, + "loss": 0.0079, + "step": 436 + }, + { + "epoch": 0.7008, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.2923, + "step": 438 + }, + { + "epoch": 0.704, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.3311, + "step": 440 + }, + { + "epoch": 0.7072, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.475, + "step": 442 + }, + { + "epoch": 0.7104, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.5985, + "step": 444 + }, + { + "epoch": 0.7136, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.2905, + "step": 446 + }, + { + "epoch": 0.7168, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.41, + "step": 448 + }, + { + "epoch": 0.72, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.0997, + "step": 450 + }, + { + "epoch": 0.7232, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.4197, + "step": 452 + }, + { + "epoch": 0.7264, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.531, + "step": 454 + }, + { + "epoch": 0.7296, + "learning_rate": 1.993756836673986e-05, + "loss": 0.1413, + "step": 456 + }, + { + "epoch": 0.7328, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.3965, + "step": 458 + }, + { + "epoch": 0.736, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.6252, + "step": 460 + }, + { + "epoch": 0.7392, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0171, + "step": 462 + }, + { + "epoch": 0.7424, + "learning_rate": 1.987775633490599e-05, + "loss": 0.0143, + "step": 464 + }, + { + "epoch": 0.7456, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.0024, + "step": 466 + }, + { + "epoch": 0.7488, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.0078, + "step": 468 + }, + { + "epoch": 0.752, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.3243, + "step": 470 + }, + { + "epoch": 0.7552, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.172, + "step": 472 + }, + { + "epoch": 0.7584, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.2799, + "step": 474 + }, + { + "epoch": 0.7616, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.5695, + "step": 476 + }, + { + "epoch": 0.7648, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.0014, + "step": 478 + }, + { + "epoch": 0.768, + "learning_rate": 1.969903782680467e-05, + "loss": 0.339, + "step": 480 + }, + { + "epoch": 0.7712, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.1612, + "step": 482 + }, + { + "epoch": 0.7744, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.5288, + "step": 484 + }, + { + "epoch": 0.7776, + "learning_rate": 1.961193185426459e-05, + "loss": 0.0344, + "step": 486 + }, + { + "epoch": 0.7808, + "learning_rate": 1.958048870913786e-05, + "loss": 0.2335, + "step": 488 + }, + { + "epoch": 0.784, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.1126, + "step": 490 + }, + { + "epoch": 0.7872, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.3188, + "step": 492 + }, + { + "epoch": 0.7904, + "learning_rate": 1.947899082950751e-05, + "loss": 0.1649, + "step": 494 + }, + { + "epoch": 0.7936, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.113, + "step": 496 + }, + { + "epoch": 0.7968, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.0208, + "step": 498 + }, + { + "epoch": 0.8, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.9054, + "step": 500 + }, + { + "epoch": 0.8032, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.8002, + "step": 502 + }, + { + "epoch": 0.8064, + "learning_rate": 1.928619550368371e-05, + "loss": 0.5833, + "step": 504 + }, + { + "epoch": 0.8096, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.3013, + "step": 506 + }, + { + "epoch": 0.8128, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.1436, + "step": 508 + }, + { + "epoch": 0.816, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.0074, + "step": 510 + }, + { + "epoch": 0.8192, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.3746, + "step": 512 + }, + { + "epoch": 0.8224, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.333, + "step": 514 + }, + { + "epoch": 0.8256, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.3691, + "step": 516 + }, + { + "epoch": 0.8288, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.0542, + "step": 518 + }, + { + "epoch": 0.832, + "learning_rate": 1.891766616054545e-05, + "loss": 0.1936, + "step": 520 + }, + { + "epoch": 0.8352, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.409, + "step": 522 + }, + { + "epoch": 0.8384, + "learning_rate": 1.881426695315756e-05, + "loss": 0.4607, + "step": 524 + }, + { + "epoch": 0.8416, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.2343, + "step": 526 + }, + { + "epoch": 0.8448, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.0826, + "step": 528 + }, + { + "epoch": 0.848, + "learning_rate": 1.86509223046777e-05, + "loss": 0.2732, + "step": 530 + }, + { + "epoch": 0.8512, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.1587, + "step": 532 + }, + { + "epoch": 0.8544, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.1673, + "step": 534 + }, + { + "epoch": 0.8576, + "learning_rate": 1.847784679420527e-05, + "loss": 0.418, + "step": 536 + }, + { + "epoch": 0.8608, + "learning_rate": 1.841802588108161e-05, + "loss": 0.0498, + "step": 538 + }, + { + "epoch": 0.864, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.1067, + "step": 540 + }, + { + "epoch": 0.8672, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.4733, + "step": 542 + }, + { + "epoch": 0.8704, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.3421, + "step": 544 + }, + { + "epoch": 0.8736, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.0142, + "step": 546 + }, + { + "epoch": 0.8768, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.4063, + "step": 548 + }, + { + "epoch": 0.88, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.1884, + "step": 550 + }, + { + "epoch": 0.8832, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.1479, + "step": 552 + }, + { + "epoch": 0.8864, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.8131, + "step": 554 + }, + { + "epoch": 0.8896, + "learning_rate": 1.783322946823638e-05, + "loss": 1.4739, + "step": 556 + }, + { + "epoch": 0.8928, + "learning_rate": 1.776324453741365e-05, + "loss": 0.0167, + "step": 558 + }, + { + "epoch": 0.896, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.1035, + "step": 560 + }, + { + "epoch": 0.8992, + "learning_rate": 1.762037251178961e-05, + "loss": 0.0091, + "step": 562 + }, + { + "epoch": 0.9024, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.5114, + "step": 564 + }, + { + "epoch": 0.9056, + "learning_rate": 1.7473690659616e-05, + "loss": 0.1832, + "step": 566 + }, + { + "epoch": 0.9088, + "learning_rate": 1.739894389204122e-05, + "loss": 0.1871, + "step": 568 + }, + { + "epoch": 0.912, + "learning_rate": 1.732327231489503e-05, + "loss": 0.4769, + "step": 570 + }, + { + "epoch": 0.9152, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.0114, + "step": 572 + }, + { + "epoch": 0.9184, + "learning_rate": 1.716919267969884e-05, + "loss": 0.1715, + "step": 574 + }, + { + "epoch": 0.9216, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.0107, + "step": 576 + }, + { + "epoch": 0.9248, + "learning_rate": 1.701152878657197e-05, + "loss": 0.3016, + "step": 578 + }, + { + "epoch": 0.928, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.0272, + "step": 580 + }, + { + "epoch": 0.9312, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.3739, + "step": 582 + }, + { + "epoch": 0.9344, + "learning_rate": 1.67684853721737e-05, + "loss": 0.9064, + "step": 584 + }, + { + "epoch": 0.9376, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.2249, + "step": 586 + }, + { + "epoch": 0.9408, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.6704, + "step": 588 + }, + { + "epoch": 0.944, + "learning_rate": 1.651782852712194e-05, + "loss": 0.1726, + "step": 590 + }, + { + "epoch": 0.9472, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.2114, + "step": 592 + }, + { + "epoch": 0.9504, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.0606, + "step": 594 + }, + { + "epoch": 0.9536, + "learning_rate": 1.625984019906122e-05, + "loss": 0.184, + "step": 596 + }, + { + "epoch": 0.9568, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.516, + "step": 598 + }, + { + "epoch": 0.96, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.0286, + "step": 600 + }, + { + "epoch": 0.9632, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.2475, + "step": 602 + }, + { + "epoch": 0.9664, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.2101, + "step": 604 + }, + { + "epoch": 0.9696, + "learning_rate": 1.581435924540482e-05, + "loss": 0.2451, + "step": 606 + }, + { + "epoch": 0.9728, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.3071, + "step": 608 + }, + { + "epoch": 0.976, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.223, + "step": 610 + }, + { + "epoch": 0.9792, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.1875, + "step": 612 + }, + { + "epoch": 0.9824, + "learning_rate": 1.544482752648966e-05, + "loss": 0.1216, + "step": 614 + }, + { + "epoch": 0.9856, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.0796, + "step": 616 + }, + { + "epoch": 0.9888, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.1365, + "step": 618 + }, + { + "epoch": 0.992, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.5363, + "step": 620 + }, + { + "epoch": 0.9952, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.3338, + "step": 622 + }, + { + "epoch": 0.9984, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.3644, + "step": 624 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 3523135233327104.0, + "train_loss": 0.28257334279920904, + "train_runtime": 3413.6411, + "train_samples_per_second": 2.929, + "train_steps_per_second": 0.183 + } + ], + "logging_steps": 2, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 3523135233327104.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..08cccaa3a02b73b12c9ce642343252ea2485ba8c --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8edd93d657e7e9d35486e01f5d3766e956ff7856de7b1bac69a0a2a33707ab8b +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..61cbf09ca000f8a623e105cb6d5978ace168cf92 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c34a96a6e506cef0d9f29efce3eaec9754d717ad62ad5333cd8ea1994f502701 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..82f215f8d3f7e0236c3faee1201e918dbb1da3b7 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8298b28044f650bdff76a13efded749a58dc9a6552887937bbde7a04397441e8 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a85f204b32d267e8ae2a4ea578374cd2251b3ae --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9347c433b5ebd3b5020b1f157bc71e4067e52fba6ab26b446af171ab1d6060a1 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..01304b367372e220978a2fc19943b2bacd78359b --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,1904 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.6146, + "step": 2 + }, + { + "epoch": 0.0064, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.2054, + "step": 4 + }, + { + "epoch": 0.0096, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.4025, + "step": 6 + }, + { + "epoch": 0.0128, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.5981, + "step": 8 + }, + { + "epoch": 0.016, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.6929, + "step": 10 + }, + { + "epoch": 0.0192, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.7516, + "step": 12 + }, + { + "epoch": 0.0224, + "learning_rate": 2.909196119613218e-06, + "loss": 0.4648, + "step": 14 + }, + { + "epoch": 0.0256, + "learning_rate": 2.988471213428035e-06, + "loss": 0.3468, + "step": 16 + }, + { + "epoch": 0.0288, + "learning_rate": 3.068622692984767e-06, + "loss": 0.379, + "step": 18 + }, + { + "epoch": 0.032, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.3141, + "step": 20 + }, + { + "epoch": 0.0352, + "learning_rate": 3.231514627826302e-06, + "loss": 0.3576, + "step": 22 + }, + { + "epoch": 0.0384, + "learning_rate": 3.314234722905302e-06, + "loss": 0.4298, + "step": 24 + }, + { + "epoch": 0.0416, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.2964, + "step": 26 + }, + { + "epoch": 0.0448, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.0854, + "step": 28 + }, + { + "epoch": 0.048, + "learning_rate": 3.567367137003953e-06, + "loss": 0.9886, + "step": 30 + }, + { + "epoch": 0.0512, + "learning_rate": 3.653366829451711e-06, + "loss": 0.2087, + "step": 32 + }, + { + "epoch": 0.0544, + "learning_rate": 3.740159800938784e-06, + "loss": 0.1179, + "step": 34 + }, + { + "epoch": 0.0576, + "learning_rate": 3.827735203028956e-06, + "loss": 0.5781, + "step": 36 + }, + { + "epoch": 0.0608, + "learning_rate": 3.916082089488379e-06, + "loss": 0.3181, + "step": 38 + }, + { + "epoch": 0.064, + "learning_rate": 4.005189417653737e-06, + "loss": 0.7001, + "step": 40 + }, + { + "epoch": 0.0672, + "learning_rate": 4.095046049812541e-06, + "loss": 0.534, + "step": 42 + }, + { + "epoch": 0.0704, + "learning_rate": 4.1856407545951825e-06, + "loss": 1.0717, + "step": 44 + }, + { + "epoch": 0.0736, + "learning_rate": 4.276962208378814e-06, + "loss": 0.5274, + "step": 46 + }, + { + "epoch": 0.0768, + "learning_rate": 4.368998996702686e-06, + "loss": 0.5314, + "step": 48 + }, + { + "epoch": 0.08, + "learning_rate": 4.461739615694921e-06, + "loss": 0.4112, + "step": 50 + }, + { + "epoch": 0.0832, + "learning_rate": 4.555172473510324e-06, + "loss": 0.8289, + "step": 52 + }, + { + "epoch": 0.0864, + "learning_rate": 4.649285891779326e-06, + "loss": 0.516, + "step": 54 + }, + { + "epoch": 0.0896, + "learning_rate": 4.744068107067673e-06, + "loss": 0.2971, + "step": 56 + }, + { + "epoch": 0.0928, + "learning_rate": 4.839507272346751e-06, + "loss": 0.2742, + "step": 58 + }, + { + "epoch": 0.096, + "learning_rate": 4.935591458474425e-06, + "loss": 0.491, + "step": 60 + }, + { + "epoch": 0.0992, + "learning_rate": 5.032308655686007e-06, + "loss": 0.237, + "step": 62 + }, + { + "epoch": 0.1024, + "learning_rate": 5.129646775095432e-06, + "loss": 0.7654, + "step": 64 + }, + { + "epoch": 0.1056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.1849, + "step": 66 + }, + { + "epoch": 0.1088, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.1932, + "step": 68 + }, + { + "epoch": 0.112, + "learning_rate": 5.425264622628326e-06, + "loss": 0.8049, + "step": 70 + }, + { + "epoch": 0.1152, + "learning_rate": 5.524964012628644e-06, + "loss": 0.3282, + "step": 72 + }, + { + "epoch": 0.1184, + "learning_rate": 5.62522274679673e-06, + "loss": 0.243, + "step": 74 + }, + { + "epoch": 0.1216, + "learning_rate": 5.726028293582342e-06, + "loss": 0.7267, + "step": 76 + }, + { + "epoch": 0.1248, + "learning_rate": 5.827368053088032e-06, + "loss": 0.3372, + "step": 78 + }, + { + "epoch": 0.128, + "learning_rate": 5.929229358643925e-06, + "loss": 0.5171, + "step": 80 + }, + { + "epoch": 0.1312, + "learning_rate": 6.03159947839103e-06, + "loss": 0.4375, + "step": 82 + }, + { + "epoch": 0.1344, + "learning_rate": 6.13446561687258e-06, + "loss": 0.4506, + "step": 84 + }, + { + "epoch": 0.1376, + "learning_rate": 6.237814916633431e-06, + "loss": 0.651, + "step": 86 + }, + { + "epoch": 0.1408, + "learning_rate": 6.341634459827044e-06, + "loss": 0.8014, + "step": 88 + }, + { + "epoch": 0.144, + "learning_rate": 6.445911269830183e-06, + "loss": 0.2773, + "step": 90 + }, + { + "epoch": 0.1472, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.5722, + "step": 92 + }, + { + "epoch": 0.1504, + "learning_rate": 6.655784499627476e-06, + "loss": 0.5714, + "step": 94 + }, + { + "epoch": 0.1536, + "learning_rate": 6.761354686924883e-06, + "loss": 1.4643, + "step": 96 + }, + { + "epoch": 0.1568, + "learning_rate": 6.867329679317144e-06, + "loss": 0.6916, + "step": 98 + }, + { + "epoch": 0.16, + "learning_rate": 6.973696230766884e-06, + "loss": 0.5161, + "step": 100 + }, + { + "epoch": 0.1632, + "learning_rate": 7.080441046294945e-06, + "loss": 0.4103, + "step": 102 + }, + { + "epoch": 0.1664, + "learning_rate": 7.18755078364214e-06, + "loss": 0.1365, + "step": 104 + }, + { + "epoch": 0.1696, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.6711, + "step": 106 + }, + { + "epoch": 0.1728, + "learning_rate": 7.402811428368824e-06, + "loss": 0.8708, + "step": 108 + }, + { + "epoch": 0.176, + "learning_rate": 7.510935429867233e-06, + "loss": 0.4039, + "step": 110 + }, + { + "epoch": 0.1792, + "learning_rate": 7.619370544785608e-06, + "loss": 0.6915, + "step": 112 + }, + { + "epoch": 0.1824, + "learning_rate": 7.728103219590684e-06, + "loss": 0.7916, + "step": 114 + }, + { + "epoch": 0.1856, + "learning_rate": 7.83711986355656e-06, + "loss": 0.6757, + "step": 116 + }, + { + "epoch": 0.1888, + "learning_rate": 7.946406850463435e-06, + "loss": 0.4944, + "step": 118 + }, + { + "epoch": 0.192, + "learning_rate": 8.055950520300756e-06, + "loss": 0.3472, + "step": 120 + }, + { + "epoch": 0.1952, + "learning_rate": 8.165737180974676e-06, + "loss": 0.6433, + "step": 122 + }, + { + "epoch": 0.1984, + "learning_rate": 8.275753110019367e-06, + "loss": 0.9815, + "step": 124 + }, + { + "epoch": 0.2016, + "learning_rate": 8.385984556312285e-06, + "loss": 0.6509, + "step": 126 + }, + { + "epoch": 0.2048, + "learning_rate": 8.496417741792922e-06, + "loss": 0.5941, + "step": 128 + }, + { + "epoch": 0.208, + "learning_rate": 8.607038863184952e-06, + "loss": 0.5394, + "step": 130 + }, + { + "epoch": 0.2112, + "learning_rate": 8.717834093721598e-06, + "loss": 0.3444, + "step": 132 + }, + { + "epoch": 0.2144, + "learning_rate": 8.828789584873757e-06, + "loss": 0.4633, + "step": 134 + }, + { + "epoch": 0.2176, + "learning_rate": 8.939891468081036e-06, + "loss": 0.6362, + "step": 136 + }, + { + "epoch": 0.2208, + "learning_rate": 9.051125856485175e-06, + "loss": 0.2793, + "step": 138 + }, + { + "epoch": 0.224, + "learning_rate": 9.162478846665854e-06, + "loss": 1.0965, + "step": 140 + }, + { + "epoch": 0.2272, + "learning_rate": 9.273936520378426e-06, + "loss": 0.5318, + "step": 142 + }, + { + "epoch": 0.2304, + "learning_rate": 9.38548494629364e-06, + "loss": 0.6123, + "step": 144 + }, + { + "epoch": 0.2336, + "learning_rate": 9.497110181738935e-06, + "loss": 0.2697, + "step": 146 + }, + { + "epoch": 0.2368, + "learning_rate": 9.608798274441153e-06, + "loss": 0.4086, + "step": 148 + }, + { + "epoch": 0.24, + "learning_rate": 9.720535264270526e-06, + "loss": 0.4048, + "step": 150 + }, + { + "epoch": 0.2432, + "learning_rate": 9.832307184985473e-06, + "loss": 0.1204, + "step": 152 + }, + { + "epoch": 0.2464, + "learning_rate": 9.944100065978354e-06, + "loss": 0.4174, + "step": 154 + }, + { + "epoch": 0.2496, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.374, + "step": 156 + }, + { + "epoch": 0.2528, + "learning_rate": 1.016769281501452e-05, + "loss": 0.4033, + "step": 158 + }, + { + "epoch": 0.256, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.4257, + "step": 160 + }, + { + "epoch": 0.2592, + "learning_rate": 1.039120172555884e-05, + "loss": 0.3176, + "step": 162 + }, + { + "epoch": 0.2624, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.489, + "step": 164 + }, + { + "epoch": 0.2656, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.6562, + "step": 166 + }, + { + "epoch": 0.2688, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.6438, + "step": 168 + }, + { + "epoch": 0.272, + "learning_rate": 1.083752115333414e-05, + "loss": 0.6896, + "step": 170 + }, + { + "epoch": 0.2752, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.3064, + "step": 172 + }, + { + "epoch": 0.2784, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.3117, + "step": 174 + }, + { + "epoch": 0.2816, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.4046, + "step": 176 + }, + { + "epoch": 0.2848, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.479, + "step": 178 + }, + { + "epoch": 0.288, + "learning_rate": 1.1392961136815041e-05, + "loss": 3.3903, + "step": 180 + }, + { + "epoch": 0.2912, + "learning_rate": 1.150358225820707e-05, + "loss": 0.475, + "step": 182 + }, + { + "epoch": 0.2944, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.6139, + "step": 184 + }, + { + "epoch": 0.2976, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.3805, + "step": 186 + }, + { + "epoch": 0.3008, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.8042, + "step": 188 + }, + { + "epoch": 0.304, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.7214, + "step": 190 + }, + { + "epoch": 0.3072, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.1696, + "step": 192 + }, + { + "epoch": 0.3104, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.2186, + "step": 194 + }, + { + "epoch": 0.3136, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.7518, + "step": 196 + }, + { + "epoch": 0.3168, + "learning_rate": 1.2380629455214385e-05, + "loss": 1.0384, + "step": 198 + }, + { + "epoch": 0.32, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.2304, + "step": 200 + }, + { + "epoch": 0.3232, + "learning_rate": 1.259718857163117e-05, + "loss": 0.7747, + "step": 202 + }, + { + "epoch": 0.3264, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.5036, + "step": 204 + }, + { + "epoch": 0.3296, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.2968, + "step": 206 + }, + { + "epoch": 0.3328, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.4474, + "step": 208 + }, + { + "epoch": 0.336, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.4951, + "step": 210 + }, + { + "epoch": 0.3392, + "learning_rate": 1.313267032068285e-05, + "loss": 0.3272, + "step": 212 + }, + { + "epoch": 0.3424, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.7141, + "step": 214 + }, + { + "epoch": 0.3456, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.4329, + "step": 216 + }, + { + "epoch": 0.3488, + "learning_rate": 1.344936768713513e-05, + "loss": 0.3164, + "step": 218 + }, + { + "epoch": 0.352, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.2702, + "step": 220 + }, + { + "epoch": 0.3552, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.4299, + "step": 222 + }, + { + "epoch": 0.3584, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.4197, + "step": 224 + }, + { + "epoch": 0.3616, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.4456, + "step": 226 + }, + { + "epoch": 0.3648, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.3374, + "step": 228 + }, + { + "epoch": 0.368, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.3931, + "step": 230 + }, + { + "epoch": 0.3712, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.4724, + "step": 232 + }, + { + "epoch": 0.3744, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.5238, + "step": 234 + }, + { + "epoch": 0.3776, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.4868, + "step": 236 + }, + { + "epoch": 0.3808, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.4074, + "step": 238 + }, + { + "epoch": 0.384, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.6099, + "step": 240 + }, + { + "epoch": 0.3872, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.2957, + "step": 242 + }, + { + "epoch": 0.3904, + "learning_rate": 1.4772406349793749e-05, + "loss": 1.0299, + "step": 244 + }, + { + "epoch": 0.3936, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.6435, + "step": 246 + }, + { + "epoch": 0.3968, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.4243, + "step": 248 + }, + { + "epoch": 0.4, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.6385, + "step": 250 + }, + { + "epoch": 0.4032, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.3106, + "step": 252 + }, + { + "epoch": 0.4064, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.1259, + "step": 254 + }, + { + "epoch": 0.4096, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.7569, + "step": 256 + }, + { + "epoch": 0.4128, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.6572, + "step": 258 + }, + { + "epoch": 0.416, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.2119, + "step": 260 + }, + { + "epoch": 0.4192, + "learning_rate": 1.563100100329731e-05, + "loss": 0.6382, + "step": 262 + }, + { + "epoch": 0.4224, + "learning_rate": 1.572303779162118e-05, + "loss": 0.5905, + "step": 264 + }, + { + "epoch": 0.4256, + "learning_rate": 1.581435924540481e-05, + "loss": 0.3332, + "step": 266 + }, + { + "epoch": 0.4288, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.2212, + "step": 268 + }, + { + "epoch": 0.432, + "learning_rate": 1.599481058234626e-05, + "loss": 0.5301, + "step": 270 + }, + { + "epoch": 0.4352, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.4882, + "step": 272 + }, + { + "epoch": 0.4384, + "learning_rate": 1.617226479697104e-05, + "loss": 0.3687, + "step": 274 + }, + { + "epoch": 0.4416, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.7489, + "step": 276 + }, + { + "epoch": 0.4448, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.3595, + "step": 278 + }, + { + "epoch": 0.448, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.3554, + "step": 280 + }, + { + "epoch": 0.4512, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.3562, + "step": 282 + }, + { + "epoch": 0.4544, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.7977, + "step": 284 + }, + { + "epoch": 0.4576, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.2954, + "step": 286 + }, + { + "epoch": 0.4608, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.3499, + "step": 288 + }, + { + "epoch": 0.464, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.2234, + "step": 290 + }, + { + "epoch": 0.4672, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.34, + "step": 292 + }, + { + "epoch": 0.4704, + "learning_rate": 1.701152878657196e-05, + "loss": 0.395, + "step": 294 + }, + { + "epoch": 0.4736, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.3482, + "step": 296 + }, + { + "epoch": 0.4768, + "learning_rate": 1.716919267969883e-05, + "loss": 0.1296, + "step": 298 + }, + { + "epoch": 0.48, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.425, + "step": 300 + }, + { + "epoch": 0.4832, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.4906, + "step": 302 + }, + { + "epoch": 0.4864, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.377, + "step": 304 + }, + { + "epoch": 0.4896, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.7725, + "step": 306 + }, + { + "epoch": 0.4928, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.8413, + "step": 308 + }, + { + "epoch": 0.496, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.2823, + "step": 310 + }, + { + "epoch": 0.4992, + "learning_rate": 1.7692289262315e-05, + "loss": 0.1789, + "step": 312 + }, + { + "epoch": 0.5024, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.3746, + "step": 314 + }, + { + "epoch": 0.5056, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.4935, + "step": 316 + }, + { + "epoch": 0.5088, + "learning_rate": 1.790223530721933e-05, + "loss": 0.7973, + "step": 318 + }, + { + "epoch": 0.512, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.6278, + "step": 320 + }, + { + "epoch": 0.5152, + "learning_rate": 1.803727533238257e-05, + "loss": 0.969, + "step": 322 + }, + { + "epoch": 0.5184, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.4981, + "step": 324 + }, + { + "epoch": 0.5216, + "learning_rate": 1.816829709926509e-05, + "loss": 1.3557, + "step": 326 + }, + { + "epoch": 0.5248, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.267, + "step": 328 + }, + { + "epoch": 0.528, + "learning_rate": 1.829523510316813e-05, + "loss": 0.4215, + "step": 330 + }, + { + "epoch": 0.5312, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.4732, + "step": 332 + }, + { + "epoch": 0.5344, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.6734, + "step": 334 + }, + { + "epoch": 0.5376, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.2478, + "step": 336 + }, + { + "epoch": 0.5408, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.6252, + "step": 338 + }, + { + "epoch": 0.544, + "learning_rate": 1.85943022840117e-05, + "loss": 0.9087, + "step": 340 + }, + { + "epoch": 0.5472, + "learning_rate": 1.865092230467769e-05, + "loss": 0.6885, + "step": 342 + }, + { + "epoch": 0.5504, + "learning_rate": 1.87064610283551e-05, + "loss": 0.447, + "step": 344 + }, + { + "epoch": 0.5536, + "learning_rate": 1.876091151314196e-05, + "loss": 0.2639, + "step": 346 + }, + { + "epoch": 0.5568, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.3608, + "step": 348 + }, + { + "epoch": 0.56, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.1993, + "step": 350 + }, + { + "epoch": 0.5632, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.432, + "step": 352 + }, + { + "epoch": 0.5664, + "learning_rate": 1.896769700383315e-05, + "loss": 0.2783, + "step": 354 + }, + { + "epoch": 0.5696, + "learning_rate": 1.901660695579585e-05, + "loss": 0.4735, + "step": 356 + }, + { + "epoch": 0.5728, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.2125, + "step": 358 + }, + { + "epoch": 0.576, + "learning_rate": 1.911103987318148e-05, + "loss": 0.2939, + "step": 360 + }, + { + "epoch": 0.5792, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.7765, + "step": 362 + }, + { + "epoch": 0.5824, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.2491, + "step": 364 + }, + { + "epoch": 0.5856, + "learning_rate": 1.924413432409622e-05, + "loss": 0.7522, + "step": 366 + }, + { + "epoch": 0.5888, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.4949, + "step": 368 + }, + { + "epoch": 0.592, + "learning_rate": 1.932709598214825e-05, + "loss": 0.2942, + "step": 370 + }, + { + "epoch": 0.5952, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.4678, + "step": 372 + }, + { + "epoch": 0.5984, + "learning_rate": 1.940539453247842e-05, + "loss": 0.3907, + "step": 374 + }, + { + "epoch": 0.6016, + "learning_rate": 1.944278281764342e-05, + "loss": 0.3525, + "step": 376 + }, + { + "epoch": 0.6048, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.3866, + "step": 378 + }, + { + "epoch": 0.608, + "learning_rate": 1.951401404235505e-05, + "loss": 0.4149, + "step": 380 + }, + { + "epoch": 0.6112, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.216, + "step": 382 + }, + { + "epoch": 0.6144, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.8168, + "step": 384 + }, + { + "epoch": 0.6176, + "learning_rate": 1.961193185426459e-05, + "loss": 0.5152, + "step": 386 + }, + { + "epoch": 0.6208, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.4164, + "step": 388 + }, + { + "epoch": 0.624, + "learning_rate": 1.967121011775546e-05, + "loss": 0.3975, + "step": 390 + }, + { + "epoch": 0.6272, + "learning_rate": 1.969903782680467e-05, + "loss": 0.722, + "step": 392 + }, + { + "epoch": 0.6304, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.581, + "step": 394 + }, + { + "epoch": 0.6336, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.4199, + "step": 396 + }, + { + "epoch": 0.6368, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.2482, + "step": 398 + }, + { + "epoch": 0.64, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.5844, + "step": 400 + }, + { + "epoch": 0.6432, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.6136, + "step": 402 + }, + { + "epoch": 0.6464, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.3273, + "step": 404 + }, + { + "epoch": 0.6496, + "learning_rate": 1.985971166354357e-05, + "loss": 0.38, + "step": 406 + }, + { + "epoch": 0.6528, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.4104, + "step": 408 + }, + { + "epoch": 0.656, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.553, + "step": 410 + }, + { + "epoch": 0.6592, + "learning_rate": 1.99101396518405e-05, + "loss": 0.2728, + "step": 412 + }, + { + "epoch": 0.6624, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.3134, + "step": 414 + }, + { + "epoch": 0.6656, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.376, + "step": 416 + }, + { + "epoch": 0.6688, + "learning_rate": 1.994942036613787e-05, + "loss": 0.4006, + "step": 418 + }, + { + "epoch": 0.672, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.3528, + "step": 420 + }, + { + "epoch": 0.6752, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.358, + "step": 422 + }, + { + "epoch": 0.6784, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.5557, + "step": 424 + }, + { + "epoch": 0.6816, + "learning_rate": 1.998437989229673e-05, + "loss": 0.3374, + "step": 426 + }, + { + "epoch": 0.6848, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.5765, + "step": 428 + }, + { + "epoch": 0.688, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.3994, + "step": 430 + }, + { + "epoch": 0.6912, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.5955, + "step": 432 + }, + { + "epoch": 0.6944, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.491, + "step": 434 + }, + { + "epoch": 0.6976, + "learning_rate": 2e-05, + "loss": 0.4032, + "step": 436 + }, + { + "epoch": 0.7008, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.1566, + "step": 438 + }, + { + "epoch": 0.704, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.3797, + "step": 440 + }, + { + "epoch": 0.7072, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.3815, + "step": 442 + }, + { + "epoch": 0.7104, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.3353, + "step": 444 + }, + { + "epoch": 0.7136, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.2807, + "step": 446 + }, + { + "epoch": 0.7168, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.2739, + "step": 448 + }, + { + "epoch": 0.72, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.313, + "step": 450 + }, + { + "epoch": 0.7232, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.6094, + "step": 452 + }, + { + "epoch": 0.7264, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.4315, + "step": 454 + }, + { + "epoch": 0.7296, + "learning_rate": 1.993756836673986e-05, + "loss": 0.418, + "step": 456 + }, + { + "epoch": 0.7328, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.1987, + "step": 458 + }, + { + "epoch": 0.736, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.1957, + "step": 460 + }, + { + "epoch": 0.7392, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.3727, + "step": 462 + }, + { + "epoch": 0.7424, + "learning_rate": 1.987775633490599e-05, + "loss": 0.3998, + "step": 464 + }, + { + "epoch": 0.7456, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.2162, + "step": 466 + }, + { + "epoch": 0.7488, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.9561, + "step": 468 + }, + { + "epoch": 0.752, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.2485, + "step": 470 + }, + { + "epoch": 0.7552, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.464, + "step": 472 + }, + { + "epoch": 0.7584, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.5935, + "step": 474 + }, + { + "epoch": 0.7616, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.4563, + "step": 476 + }, + { + "epoch": 0.7648, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.6052, + "step": 478 + }, + { + "epoch": 0.768, + "learning_rate": 1.969903782680467e-05, + "loss": 0.6623, + "step": 480 + }, + { + "epoch": 0.7712, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.5662, + "step": 482 + }, + { + "epoch": 0.7744, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.5735, + "step": 484 + }, + { + "epoch": 0.7776, + "learning_rate": 1.961193185426459e-05, + "loss": 0.4004, + "step": 486 + }, + { + "epoch": 0.7808, + "learning_rate": 1.958048870913786e-05, + "loss": 0.2139, + "step": 488 + }, + { + "epoch": 0.784, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.8248, + "step": 490 + }, + { + "epoch": 0.7872, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.3159, + "step": 492 + }, + { + "epoch": 0.7904, + "learning_rate": 1.947899082950751e-05, + "loss": 0.2233, + "step": 494 + }, + { + "epoch": 0.7936, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.7077, + "step": 496 + }, + { + "epoch": 0.7968, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.3517, + "step": 498 + }, + { + "epoch": 0.8, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.4569, + "step": 500 + }, + { + "epoch": 0.8032, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.3031, + "step": 502 + }, + { + "epoch": 0.8064, + "learning_rate": 1.928619550368371e-05, + "loss": 0.4575, + "step": 504 + }, + { + "epoch": 0.8096, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.8606, + "step": 506 + }, + { + "epoch": 0.8128, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.1693, + "step": 508 + }, + { + "epoch": 0.816, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.8113, + "step": 510 + }, + { + "epoch": 0.8192, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.926, + "step": 512 + }, + { + "epoch": 0.8224, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.6971, + "step": 514 + }, + { + "epoch": 0.8256, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.3265, + "step": 516 + }, + { + "epoch": 0.8288, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.4668, + "step": 518 + }, + { + "epoch": 0.832, + "learning_rate": 1.891766616054545e-05, + "loss": 0.5266, + "step": 520 + }, + { + "epoch": 0.8352, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.3557, + "step": 522 + }, + { + "epoch": 0.8384, + "learning_rate": 1.881426695315756e-05, + "loss": 0.619, + "step": 524 + }, + { + "epoch": 0.8416, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.3365, + "step": 526 + }, + { + "epoch": 0.8448, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.5723, + "step": 528 + }, + { + "epoch": 0.848, + "learning_rate": 1.86509223046777e-05, + "loss": 0.438, + "step": 530 + }, + { + "epoch": 0.8512, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.4045, + "step": 532 + }, + { + "epoch": 0.8544, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.2484, + "step": 534 + }, + { + "epoch": 0.8576, + "learning_rate": 1.847784679420527e-05, + "loss": 0.2137, + "step": 536 + }, + { + "epoch": 0.8608, + "learning_rate": 1.841802588108161e-05, + "loss": 0.3481, + "step": 538 + }, + { + "epoch": 0.864, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.7572, + "step": 540 + }, + { + "epoch": 0.8672, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.2932, + "step": 542 + }, + { + "epoch": 0.8704, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.4697, + "step": 544 + }, + { + "epoch": 0.8736, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.5688, + "step": 546 + }, + { + "epoch": 0.8768, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.136, + "step": 548 + }, + { + "epoch": 0.88, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.4427, + "step": 550 + }, + { + "epoch": 0.8832, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.3046, + "step": 552 + }, + { + "epoch": 0.8864, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.4759, + "step": 554 + }, + { + "epoch": 0.8896, + "learning_rate": 1.783322946823638e-05, + "loss": 0.4999, + "step": 556 + }, + { + "epoch": 0.8928, + "learning_rate": 1.776324453741365e-05, + "loss": 0.2137, + "step": 558 + }, + { + "epoch": 0.896, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.4347, + "step": 560 + }, + { + "epoch": 0.8992, + "learning_rate": 1.762037251178961e-05, + "loss": 0.5361, + "step": 562 + }, + { + "epoch": 0.9024, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.482, + "step": 564 + }, + { + "epoch": 0.9056, + "learning_rate": 1.7473690659616e-05, + "loss": 0.2961, + "step": 566 + }, + { + "epoch": 0.9088, + "learning_rate": 1.739894389204122e-05, + "loss": 0.3957, + "step": 568 + }, + { + "epoch": 0.912, + "learning_rate": 1.732327231489503e-05, + "loss": 0.6299, + "step": 570 + }, + { + "epoch": 0.9152, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.4689, + "step": 572 + }, + { + "epoch": 0.9184, + "learning_rate": 1.716919267969884e-05, + "loss": 0.2789, + "step": 574 + }, + { + "epoch": 0.9216, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.4365, + "step": 576 + }, + { + "epoch": 0.9248, + "learning_rate": 1.701152878657197e-05, + "loss": 0.605, + "step": 578 + }, + { + "epoch": 0.928, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.343, + "step": 580 + }, + { + "epoch": 0.9312, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.2095, + "step": 582 + }, + { + "epoch": 0.9344, + "learning_rate": 1.67684853721737e-05, + "loss": 0.2724, + "step": 584 + }, + { + "epoch": 0.9376, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.1679, + "step": 586 + }, + { + "epoch": 0.9408, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.5185, + "step": 588 + }, + { + "epoch": 0.944, + "learning_rate": 1.651782852712194e-05, + "loss": 0.2153, + "step": 590 + }, + { + "epoch": 0.9472, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.139, + "step": 592 + }, + { + "epoch": 0.9504, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.4417, + "step": 594 + }, + { + "epoch": 0.9536, + "learning_rate": 1.625984019906122e-05, + "loss": 0.8611, + "step": 596 + }, + { + "epoch": 0.9568, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.1911, + "step": 598 + }, + { + "epoch": 0.96, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.2645, + "step": 600 + }, + { + "epoch": 0.9632, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.5107, + "step": 602 + }, + { + "epoch": 0.9664, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.4218, + "step": 604 + }, + { + "epoch": 0.9696, + "learning_rate": 1.581435924540482e-05, + "loss": 0.5054, + "step": 606 + }, + { + "epoch": 0.9728, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.5874, + "step": 608 + }, + { + "epoch": 0.976, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.4475, + "step": 610 + }, + { + "epoch": 0.9792, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.5059, + "step": 612 + }, + { + "epoch": 0.9824, + "learning_rate": 1.544482752648966e-05, + "loss": 0.17, + "step": 614 + }, + { + "epoch": 0.9856, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.0608, + "step": 616 + }, + { + "epoch": 0.9888, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.1834, + "step": 618 + }, + { + "epoch": 0.992, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.2795, + "step": 620 + }, + { + "epoch": 0.9952, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.3034, + "step": 622 + }, + { + "epoch": 0.9984, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.472, + "step": 624 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 0, + "train_loss": 0.4759340593457222, + "train_runtime": 2952.9029, + "train_samples_per_second": 3.386, + "train_steps_per_second": 0.212 + } + ], + "logging_steps": 2, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b3a5282b4f949f8fb63d2be714f9b5632416fc2 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dffd9bd42f34d1827469635d24f43b03b31536ad164f381a1f65164e5470321a +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..48b64189d8b67207ada7b039bb6d8f510d532537 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce6d5dae7ddf8fca1ca15ffb674f343835b36809501b52e2e635b5d7eda28814 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..886bdc34b0590fa5cb24c8dd4098e3309dbd26fc --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa08bd11c216502fd952ec771c17b90bb86dee458c09727fe50a43e4d15c1a3d +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef20fe397543df2be8e5de5071ddcffc4940d309 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9db542c5a6813efb07a01d9432b8801770383acf26b603c808864eebeaa90a44 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..259a3b60a4dfab8bde31c9b5d9594d5c3ae681d2 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,2216 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 11.924293518066406, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.935, + "step": 2 + }, + { + "epoch": 0.0064, + "grad_norm": 3.9490787982940674, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.3235, + "step": 4 + }, + { + "epoch": 0.0096, + "grad_norm": 5.28393030166626, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.4202, + "step": 6 + }, + { + "epoch": 0.0128, + "grad_norm": 5.9949727058410645, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.5825, + "step": 8 + }, + { + "epoch": 0.016, + "grad_norm": 11.239922523498535, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.553, + "step": 10 + }, + { + "epoch": 0.0192, + "grad_norm": 5.190208435058594, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.7123, + "step": 12 + }, + { + "epoch": 0.0224, + "grad_norm": 11.484992027282715, + "learning_rate": 2.909196119613218e-06, + "loss": 0.9147, + "step": 14 + }, + { + "epoch": 0.0256, + "grad_norm": 5.685008525848389, + "learning_rate": 2.988471213428035e-06, + "loss": 0.4397, + "step": 16 + }, + { + "epoch": 0.0288, + "grad_norm": 9.455109596252441, + "learning_rate": 3.068622692984767e-06, + "loss": 0.6378, + "step": 18 + }, + { + "epoch": 0.032, + "grad_norm": 4.522568702697754, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.7933, + "step": 20 + }, + { + "epoch": 0.0352, + "grad_norm": 10.944385528564453, + "learning_rate": 3.231514627826302e-06, + "loss": 1.13, + "step": 22 + }, + { + "epoch": 0.0384, + "grad_norm": 10.723428726196289, + "learning_rate": 3.314234722905302e-06, + "loss": 0.3353, + "step": 24 + }, + { + "epoch": 0.0416, + "grad_norm": 3.2999091148376465, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.3621, + "step": 26 + }, + { + "epoch": 0.0448, + "grad_norm": 3.1308634281158447, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.7325, + "step": 28 + }, + { + "epoch": 0.048, + "grad_norm": 8.053144454956055, + "learning_rate": 3.567367137003953e-06, + "loss": 0.4884, + "step": 30 + }, + { + "epoch": 0.0512, + "grad_norm": 25.066747665405273, + "learning_rate": 3.653366829451711e-06, + "loss": 0.8357, + "step": 32 + }, + { + "epoch": 0.0544, + "grad_norm": 8.294330596923828, + "learning_rate": 3.740159800938784e-06, + "loss": 0.557, + "step": 34 + }, + { + "epoch": 0.0576, + "grad_norm": 10.253148078918457, + "learning_rate": 3.827735203028956e-06, + "loss": 0.53, + "step": 36 + }, + { + "epoch": 0.0608, + "grad_norm": 3.1425323486328125, + "learning_rate": 3.916082089488379e-06, + "loss": 0.6137, + "step": 38 + }, + { + "epoch": 0.064, + "grad_norm": 7.860047340393066, + "learning_rate": 4.005189417653737e-06, + "loss": 0.5169, + "step": 40 + }, + { + "epoch": 0.0672, + "grad_norm": 10.487458229064941, + "learning_rate": 4.095046049812541e-06, + "loss": 0.5802, + "step": 42 + }, + { + "epoch": 0.0704, + "grad_norm": 9.957070350646973, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.9286, + "step": 44 + }, + { + "epoch": 0.0736, + "grad_norm": 4.670755863189697, + "learning_rate": 4.276962208378814e-06, + "loss": 0.4913, + "step": 46 + }, + { + "epoch": 0.0768, + "grad_norm": 10.305075645446777, + "learning_rate": 4.368998996702686e-06, + "loss": 0.765, + "step": 48 + }, + { + "epoch": 0.08, + "grad_norm": 9.407020568847656, + "learning_rate": 4.461739615694921e-06, + "loss": 0.4923, + "step": 50 + }, + { + "epoch": 0.0832, + "grad_norm": 23.41385841369629, + "learning_rate": 4.555172473510324e-06, + "loss": 0.6566, + "step": 52 + }, + { + "epoch": 0.0864, + "grad_norm": 2.507441997528076, + "learning_rate": 4.649285891779326e-06, + "loss": 0.8811, + "step": 54 + }, + { + "epoch": 0.0896, + "grad_norm": 15.887990951538086, + "learning_rate": 4.744068107067673e-06, + "loss": 0.7528, + "step": 56 + }, + { + "epoch": 0.0928, + "grad_norm": 7.44704532623291, + "learning_rate": 4.839507272346751e-06, + "loss": 0.4551, + "step": 58 + }, + { + "epoch": 0.096, + "grad_norm": 4.294933795928955, + "learning_rate": 4.935591458474425e-06, + "loss": 0.4674, + "step": 60 + }, + { + "epoch": 0.0992, + "grad_norm": 7.989336013793945, + "learning_rate": 5.032308655686007e-06, + "loss": 0.4042, + "step": 62 + }, + { + "epoch": 0.1024, + "grad_norm": 10.031631469726562, + "learning_rate": 5.129646775095432e-06, + "loss": 0.7977, + "step": 64 + }, + { + "epoch": 0.1056, + "grad_norm": 3.135248899459839, + "learning_rate": 5.227593650206246e-06, + "loss": 0.2882, + "step": 66 + }, + { + "epoch": 0.1088, + "grad_norm": 8.17870807647705, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.4969, + "step": 68 + }, + { + "epoch": 0.112, + "grad_norm": 7.233709812164307, + "learning_rate": 5.425264622628326e-06, + "loss": 0.3908, + "step": 70 + }, + { + "epoch": 0.1152, + "grad_norm": 6.015388488769531, + "learning_rate": 5.524964012628644e-06, + "loss": 0.6212, + "step": 72 + }, + { + "epoch": 0.1184, + "grad_norm": 19.091726303100586, + "learning_rate": 5.62522274679673e-06, + "loss": 0.7046, + "step": 74 + }, + { + "epoch": 0.1216, + "grad_norm": 3.5434837341308594, + "learning_rate": 5.726028293582342e-06, + "loss": 0.6002, + "step": 76 + }, + { + "epoch": 0.1248, + "grad_norm": 8.320732116699219, + "learning_rate": 5.827368053088032e-06, + "loss": 0.5269, + "step": 78 + }, + { + "epoch": 0.128, + "grad_norm": 10.583352088928223, + "learning_rate": 5.929229358643925e-06, + "loss": 0.9489, + "step": 80 + }, + { + "epoch": 0.1312, + "grad_norm": 4.805012226104736, + "learning_rate": 6.03159947839103e-06, + "loss": 0.4776, + "step": 82 + }, + { + "epoch": 0.1344, + "grad_norm": 4.485952854156494, + "learning_rate": 6.13446561687258e-06, + "loss": 0.4304, + "step": 84 + }, + { + "epoch": 0.1376, + "grad_norm": 5.58962345123291, + "learning_rate": 6.237814916633431e-06, + "loss": 0.3162, + "step": 86 + }, + { + "epoch": 0.1408, + "grad_norm": 9.707934379577637, + "learning_rate": 6.341634459827044e-06, + "loss": 0.5723, + "step": 88 + }, + { + "epoch": 0.144, + "grad_norm": 6.585778713226318, + "learning_rate": 6.445911269830183e-06, + "loss": 0.467, + "step": 90 + }, + { + "epoch": 0.1472, + "grad_norm": 3.7008655071258545, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.4938, + "step": 92 + }, + { + "epoch": 0.1504, + "grad_norm": 12.401470184326172, + "learning_rate": 6.655784499627476e-06, + "loss": 0.4442, + "step": 94 + }, + { + "epoch": 0.1536, + "grad_norm": 8.586926460266113, + "learning_rate": 6.761354686924883e-06, + "loss": 0.3653, + "step": 96 + }, + { + "epoch": 0.1568, + "grad_norm": 18.978261947631836, + "learning_rate": 6.867329679317144e-06, + "loss": 0.4901, + "step": 98 + }, + { + "epoch": 0.16, + "grad_norm": 8.923133850097656, + "learning_rate": 6.973696230766884e-06, + "loss": 0.5532, + "step": 100 + }, + { + "epoch": 0.1632, + "grad_norm": 4.800722599029541, + "learning_rate": 7.080441046294945e-06, + "loss": 0.7183, + "step": 102 + }, + { + "epoch": 0.1664, + "grad_norm": 9.728776931762695, + "learning_rate": 7.18755078364214e-06, + "loss": 0.5334, + "step": 104 + }, + { + "epoch": 0.1696, + "grad_norm": 8.5445556640625, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.3432, + "step": 106 + }, + { + "epoch": 0.1728, + "grad_norm": 4.521656036376953, + "learning_rate": 7.402811428368824e-06, + "loss": 0.7348, + "step": 108 + }, + { + "epoch": 0.176, + "grad_norm": 6.669830322265625, + "learning_rate": 7.510935429867233e-06, + "loss": 0.762, + "step": 110 + }, + { + "epoch": 0.1792, + "grad_norm": 4.748926162719727, + "learning_rate": 7.619370544785608e-06, + "loss": 0.4783, + "step": 112 + }, + { + "epoch": 0.1824, + "grad_norm": 7.183364391326904, + "learning_rate": 7.728103219590684e-06, + "loss": 0.4328, + "step": 114 + }, + { + "epoch": 0.1856, + "grad_norm": 14.943339347839355, + "learning_rate": 7.83711986355656e-06, + "loss": 0.8939, + "step": 116 + }, + { + "epoch": 0.1888, + "grad_norm": 4.003686904907227, + "learning_rate": 7.946406850463435e-06, + "loss": 0.705, + "step": 118 + }, + { + "epoch": 0.192, + "grad_norm": 8.128644943237305, + "learning_rate": 8.055950520300756e-06, + "loss": 0.4062, + "step": 120 + }, + { + "epoch": 0.1952, + "grad_norm": 2.2518608570098877, + "learning_rate": 8.165737180974676e-06, + "loss": 0.6835, + "step": 122 + }, + { + "epoch": 0.1984, + "grad_norm": 11.007171630859375, + "learning_rate": 8.275753110019367e-06, + "loss": 0.7353, + "step": 124 + }, + { + "epoch": 0.2016, + "grad_norm": 4.774467468261719, + "learning_rate": 8.385984556312285e-06, + "loss": 0.5589, + "step": 126 + }, + { + "epoch": 0.2048, + "grad_norm": 7.269313335418701, + "learning_rate": 8.496417741792922e-06, + "loss": 0.3103, + "step": 128 + }, + { + "epoch": 0.208, + "grad_norm": 7.521888732910156, + "learning_rate": 8.607038863184952e-06, + "loss": 0.4026, + "step": 130 + }, + { + "epoch": 0.2112, + "grad_norm": 10.690435409545898, + "learning_rate": 8.717834093721598e-06, + "loss": 0.5056, + "step": 132 + }, + { + "epoch": 0.2144, + "grad_norm": 6.21688175201416, + "learning_rate": 8.828789584873757e-06, + "loss": 0.4383, + "step": 134 + }, + { + "epoch": 0.2176, + "grad_norm": 15.672515869140625, + "learning_rate": 8.939891468081036e-06, + "loss": 0.4669, + "step": 136 + }, + { + "epoch": 0.2208, + "grad_norm": 10.475600242614746, + "learning_rate": 9.051125856485175e-06, + "loss": 1.1409, + "step": 138 + }, + { + "epoch": 0.224, + "grad_norm": 11.176009178161621, + "learning_rate": 9.162478846665854e-06, + "loss": 0.7849, + "step": 140 + }, + { + "epoch": 0.2272, + "grad_norm": 20.8123722076416, + "learning_rate": 9.273936520378426e-06, + "loss": 0.5393, + "step": 142 + }, + { + "epoch": 0.2304, + "grad_norm": 5.819454669952393, + "learning_rate": 9.38548494629364e-06, + "loss": 0.5705, + "step": 144 + }, + { + "epoch": 0.2336, + "grad_norm": 3.6171233654022217, + "learning_rate": 9.497110181738935e-06, + "loss": 0.3567, + "step": 146 + }, + { + "epoch": 0.2368, + "grad_norm": 7.882349014282227, + "learning_rate": 9.608798274441153e-06, + "loss": 0.6364, + "step": 148 + }, + { + "epoch": 0.24, + "grad_norm": 2.2136592864990234, + "learning_rate": 9.720535264270526e-06, + "loss": 0.2534, + "step": 150 + }, + { + "epoch": 0.2432, + "grad_norm": 7.360774993896484, + "learning_rate": 9.832307184985473e-06, + "loss": 0.4552, + "step": 152 + }, + { + "epoch": 0.2464, + "grad_norm": 6.251705646514893, + "learning_rate": 9.944100065978354e-06, + "loss": 0.8051, + "step": 154 + }, + { + "epoch": 0.2496, + "grad_norm": 4.254730224609375, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.3781, + "step": 156 + }, + { + "epoch": 0.2528, + "grad_norm": 10.385128021240234, + "learning_rate": 1.016769281501452e-05, + "loss": 0.6839, + "step": 158 + }, + { + "epoch": 0.256, + "grad_norm": 8.325345993041992, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.6583, + "step": 160 + }, + { + "epoch": 0.2592, + "grad_norm": 6.293595790863037, + "learning_rate": 1.039120172555884e-05, + "loss": 0.4719, + "step": 162 + }, + { + "epoch": 0.2624, + "grad_norm": 4.342235565185547, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.4733, + "step": 164 + }, + { + "epoch": 0.2656, + "grad_norm": 7.431089878082275, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.4616, + "step": 166 + }, + { + "epoch": 0.2688, + "grad_norm": 11.834856033325195, + "learning_rate": 1.0726063479621567e-05, + "loss": 2.1594, + "step": 168 + }, + { + "epoch": 0.272, + "grad_norm": 7.422325611114502, + "learning_rate": 1.083752115333414e-05, + "loss": 0.4545, + "step": 170 + }, + { + "epoch": 0.2752, + "grad_norm": 5.249634742736816, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.5212, + "step": 172 + }, + { + "epoch": 0.2784, + "grad_norm": 7.874746799468994, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.29, + "step": 174 + }, + { + "epoch": 0.2816, + "grad_norm": 4.046351432800293, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.5505, + "step": 176 + }, + { + "epoch": 0.2848, + "grad_norm": 4.0093464851379395, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.7834, + "step": 178 + }, + { + "epoch": 0.288, + "grad_norm": 80.90779113769531, + "learning_rate": 1.1392961136815041e-05, + "loss": 2.8767, + "step": 180 + }, + { + "epoch": 0.2912, + "grad_norm": 7.990501880645752, + "learning_rate": 1.150358225820707e-05, + "loss": 0.3704, + "step": 182 + }, + { + "epoch": 0.2944, + "grad_norm": 6.763923645019531, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.3771, + "step": 184 + }, + { + "epoch": 0.2976, + "grad_norm": 7.046037673950195, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.4318, + "step": 186 + }, + { + "epoch": 0.3008, + "grad_norm": 5.150176048278809, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.647, + "step": 188 + }, + { + "epoch": 0.304, + "grad_norm": 5.73397970199585, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.6609, + "step": 190 + }, + { + "epoch": 0.3072, + "grad_norm": 7.286134719848633, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.52, + "step": 192 + }, + { + "epoch": 0.3104, + "grad_norm": 4.6973981857299805, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.5437, + "step": 194 + }, + { + "epoch": 0.3136, + "grad_norm": 10.70791244506836, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.8167, + "step": 196 + }, + { + "epoch": 0.3168, + "grad_norm": 19.98440933227539, + "learning_rate": 1.2380629455214385e-05, + "loss": 1.2987, + "step": 198 + }, + { + "epoch": 0.32, + "grad_norm": 9.09523868560791, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.653, + "step": 200 + }, + { + "epoch": 0.3232, + "grad_norm": 4.038908004760742, + "learning_rate": 1.259718857163117e-05, + "loss": 0.3572, + "step": 202 + }, + { + "epoch": 0.3264, + "grad_norm": 7.391031265258789, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.5846, + "step": 204 + }, + { + "epoch": 0.3296, + "grad_norm": 12.767005920410156, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.5314, + "step": 206 + }, + { + "epoch": 0.3328, + "grad_norm": 7.9224395751953125, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.4724, + "step": 208 + }, + { + "epoch": 0.336, + "grad_norm": 8.15713119506836, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.6462, + "step": 210 + }, + { + "epoch": 0.3392, + "grad_norm": 10.181936264038086, + "learning_rate": 1.313267032068285e-05, + "loss": 0.6951, + "step": 212 + }, + { + "epoch": 0.3424, + "grad_norm": 14.465421676635742, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.8577, + "step": 214 + }, + { + "epoch": 0.3456, + "grad_norm": 4.582299709320068, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.3597, + "step": 216 + }, + { + "epoch": 0.3488, + "grad_norm": 5.042653560638428, + "learning_rate": 1.344936768713513e-05, + "loss": 0.5193, + "step": 218 + }, + { + "epoch": 0.352, + "grad_norm": 9.032974243164062, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.627, + "step": 220 + }, + { + "epoch": 0.3552, + "grad_norm": 9.70946979522705, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.4374, + "step": 222 + }, + { + "epoch": 0.3584, + "grad_norm": 6.085891246795654, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.3727, + "step": 224 + }, + { + "epoch": 0.3616, + "grad_norm": 6.504804611206055, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.6068, + "step": 226 + }, + { + "epoch": 0.3648, + "grad_norm": 3.5541486740112305, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.7332, + "step": 228 + }, + { + "epoch": 0.368, + "grad_norm": 11.440251350402832, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.7768, + "step": 230 + }, + { + "epoch": 0.3712, + "grad_norm": 11.891138076782227, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.8255, + "step": 232 + }, + { + "epoch": 0.3744, + "grad_norm": 7.3828840255737305, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.3546, + "step": 234 + }, + { + "epoch": 0.3776, + "grad_norm": 4.123514652252197, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.4818, + "step": 236 + }, + { + "epoch": 0.3808, + "grad_norm": 4.9134907722473145, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.3604, + "step": 238 + }, + { + "epoch": 0.384, + "grad_norm": 5.931992053985596, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.6014, + "step": 240 + }, + { + "epoch": 0.3872, + "grad_norm": 8.341025352478027, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.4926, + "step": 242 + }, + { + "epoch": 0.3904, + "grad_norm": 6.980299949645996, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.7693, + "step": 244 + }, + { + "epoch": 0.3936, + "grad_norm": 7.777401447296143, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.4093, + "step": 246 + }, + { + "epoch": 0.3968, + "grad_norm": 10.939884185791016, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.7075, + "step": 248 + }, + { + "epoch": 0.4, + "grad_norm": 10.896389961242676, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.6076, + "step": 250 + }, + { + "epoch": 0.4032, + "grad_norm": 12.854388236999512, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.9032, + "step": 252 + }, + { + "epoch": 0.4064, + "grad_norm": 4.0022382736206055, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.3992, + "step": 254 + }, + { + "epoch": 0.4096, + "grad_norm": 12.87756061553955, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.5944, + "step": 256 + }, + { + "epoch": 0.4128, + "grad_norm": 12.931772232055664, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.8289, + "step": 258 + }, + { + "epoch": 0.416, + "grad_norm": 5.9502153396606445, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.8897, + "step": 260 + }, + { + "epoch": 0.4192, + "grad_norm": 6.152744293212891, + "learning_rate": 1.563100100329731e-05, + "loss": 0.6036, + "step": 262 + }, + { + "epoch": 0.4224, + "grad_norm": 7.169362545013428, + "learning_rate": 1.572303779162118e-05, + "loss": 0.4665, + "step": 264 + }, + { + "epoch": 0.4256, + "grad_norm": 25.105201721191406, + "learning_rate": 1.581435924540481e-05, + "loss": 1.1965, + "step": 266 + }, + { + "epoch": 0.4288, + "grad_norm": 5.533500671386719, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.7597, + "step": 268 + }, + { + "epoch": 0.432, + "grad_norm": 8.768479347229004, + "learning_rate": 1.599481058234626e-05, + "loss": 0.6425, + "step": 270 + }, + { + "epoch": 0.4352, + "grad_norm": 10.155613899230957, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.5022, + "step": 272 + }, + { + "epoch": 0.4384, + "grad_norm": 6.755585193634033, + "learning_rate": 1.617226479697104e-05, + "loss": 0.4028, + "step": 274 + }, + { + "epoch": 0.4416, + "grad_norm": 10.291326522827148, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.6177, + "step": 276 + }, + { + "epoch": 0.4448, + "grad_norm": 9.281944274902344, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.6795, + "step": 278 + }, + { + "epoch": 0.448, + "grad_norm": 8.107259750366211, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.5509, + "step": 280 + }, + { + "epoch": 0.4512, + "grad_norm": 15.570850372314453, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.7286, + "step": 282 + }, + { + "epoch": 0.4544, + "grad_norm": 8.614456176757812, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.8644, + "step": 284 + }, + { + "epoch": 0.4576, + "grad_norm": 2.1356348991394043, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.3399, + "step": 286 + }, + { + "epoch": 0.4608, + "grad_norm": 4.229339599609375, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.4909, + "step": 288 + }, + { + "epoch": 0.464, + "grad_norm": 4.009982585906982, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.8696, + "step": 290 + }, + { + "epoch": 0.4672, + "grad_norm": 4.7873101234436035, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.4585, + "step": 292 + }, + { + "epoch": 0.4704, + "grad_norm": 7.444589138031006, + "learning_rate": 1.701152878657196e-05, + "loss": 0.858, + "step": 294 + }, + { + "epoch": 0.4736, + "grad_norm": 9.475300788879395, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.591, + "step": 296 + }, + { + "epoch": 0.4768, + "grad_norm": 7.943042278289795, + "learning_rate": 1.716919267969883e-05, + "loss": 0.7363, + "step": 298 + }, + { + "epoch": 0.48, + "grad_norm": 7.825207233428955, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.6927, + "step": 300 + }, + { + "epoch": 0.4832, + "grad_norm": 3.181072473526001, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.5384, + "step": 302 + }, + { + "epoch": 0.4864, + "grad_norm": 8.772839546203613, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.6545, + "step": 304 + }, + { + "epoch": 0.4896, + "grad_norm": 9.947606086730957, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.5997, + "step": 306 + }, + { + "epoch": 0.4928, + "grad_norm": 7.244208335876465, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.4672, + "step": 308 + }, + { + "epoch": 0.496, + "grad_norm": 5.831227779388428, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.6727, + "step": 310 + }, + { + "epoch": 0.4992, + "grad_norm": 11.163945198059082, + "learning_rate": 1.7692289262315e-05, + "loss": 0.8782, + "step": 312 + }, + { + "epoch": 0.5024, + "grad_norm": 5.730398654937744, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.4557, + "step": 314 + }, + { + "epoch": 0.5056, + "grad_norm": 8.259847640991211, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.463, + "step": 316 + }, + { + "epoch": 0.5088, + "grad_norm": 13.202224731445312, + "learning_rate": 1.790223530721933e-05, + "loss": 0.5084, + "step": 318 + }, + { + "epoch": 0.512, + "grad_norm": 5.565632343292236, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.4924, + "step": 320 + }, + { + "epoch": 0.5152, + "grad_norm": 9.58140754699707, + "learning_rate": 1.803727533238257e-05, + "loss": 0.387, + "step": 322 + }, + { + "epoch": 0.5184, + "grad_norm": 6.084507465362549, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.4273, + "step": 324 + }, + { + "epoch": 0.5216, + "grad_norm": 35.10951232910156, + "learning_rate": 1.816829709926509e-05, + "loss": 2.1971, + "step": 326 + }, + { + "epoch": 0.5248, + "grad_norm": 2.700080394744873, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.3914, + "step": 328 + }, + { + "epoch": 0.528, + "grad_norm": 6.533969879150391, + "learning_rate": 1.829523510316813e-05, + "loss": 0.4316, + "step": 330 + }, + { + "epoch": 0.5312, + "grad_norm": 6.787297248840332, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.6538, + "step": 332 + }, + { + "epoch": 0.5344, + "grad_norm": 6.87823486328125, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.6642, + "step": 334 + }, + { + "epoch": 0.5376, + "grad_norm": 4.521157741546631, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.5047, + "step": 336 + }, + { + "epoch": 0.5408, + "grad_norm": 2.835745334625244, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.5598, + "step": 338 + }, + { + "epoch": 0.544, + "grad_norm": 7.604702472686768, + "learning_rate": 1.85943022840117e-05, + "loss": 0.9549, + "step": 340 + }, + { + "epoch": 0.5472, + "grad_norm": 5.502391815185547, + "learning_rate": 1.865092230467769e-05, + "loss": 0.5589, + "step": 342 + }, + { + "epoch": 0.5504, + "grad_norm": 5.125470161437988, + "learning_rate": 1.87064610283551e-05, + "loss": 0.3865, + "step": 344 + }, + { + "epoch": 0.5536, + "grad_norm": 15.728462219238281, + "learning_rate": 1.876091151314196e-05, + "loss": 0.6609, + "step": 346 + }, + { + "epoch": 0.5568, + "grad_norm": 5.035003185272217, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.3558, + "step": 348 + }, + { + "epoch": 0.56, + "grad_norm": 2.466336965560913, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.3271, + "step": 350 + }, + { + "epoch": 0.5632, + "grad_norm": 4.039994239807129, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.3596, + "step": 352 + }, + { + "epoch": 0.5664, + "grad_norm": 5.38749361038208, + "learning_rate": 1.896769700383315e-05, + "loss": 0.6196, + "step": 354 + }, + { + "epoch": 0.5696, + "grad_norm": 4.640627384185791, + "learning_rate": 1.901660695579585e-05, + "loss": 0.4562, + "step": 356 + }, + { + "epoch": 0.5728, + "grad_norm": 6.774022102355957, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.6438, + "step": 358 + }, + { + "epoch": 0.576, + "grad_norm": 2.7814035415649414, + "learning_rate": 1.911103987318148e-05, + "loss": 0.3665, + "step": 360 + }, + { + "epoch": 0.5792, + "grad_norm": 2.988558053970337, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.3287, + "step": 362 + }, + { + "epoch": 0.5824, + "grad_norm": 11.730210304260254, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.8063, + "step": 364 + }, + { + "epoch": 0.5856, + "grad_norm": 3.510537624359131, + "learning_rate": 1.924413432409622e-05, + "loss": 1.8139, + "step": 366 + }, + { + "epoch": 0.5888, + "grad_norm": 3.427419662475586, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.6188, + "step": 368 + }, + { + "epoch": 0.592, + "grad_norm": 2.8159828186035156, + "learning_rate": 1.932709598214825e-05, + "loss": 0.3593, + "step": 370 + }, + { + "epoch": 0.5952, + "grad_norm": 12.421769142150879, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.6811, + "step": 372 + }, + { + "epoch": 0.5984, + "grad_norm": 7.060184001922607, + "learning_rate": 1.940539453247842e-05, + "loss": 0.6729, + "step": 374 + }, + { + "epoch": 0.6016, + "grad_norm": 3.2482755184173584, + "learning_rate": 1.944278281764342e-05, + "loss": 0.598, + "step": 376 + }, + { + "epoch": 0.6048, + "grad_norm": 12.445656776428223, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.9253, + "step": 378 + }, + { + "epoch": 0.608, + "grad_norm": 7.116509437561035, + "learning_rate": 1.951401404235505e-05, + "loss": 1.1703, + "step": 380 + }, + { + "epoch": 0.6112, + "grad_norm": 4.388617992401123, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.3192, + "step": 382 + }, + { + "epoch": 0.6144, + "grad_norm": 8.191654205322266, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.7508, + "step": 384 + }, + { + "epoch": 0.6176, + "grad_norm": 6.881166934967041, + "learning_rate": 1.961193185426459e-05, + "loss": 0.8673, + "step": 386 + }, + { + "epoch": 0.6208, + "grad_norm": 16.268980026245117, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.7658, + "step": 388 + }, + { + "epoch": 0.624, + "grad_norm": 5.663888931274414, + "learning_rate": 1.967121011775546e-05, + "loss": 0.471, + "step": 390 + }, + { + "epoch": 0.6272, + "grad_norm": 7.135434627532959, + "learning_rate": 1.969903782680467e-05, + "loss": 0.7198, + "step": 392 + }, + { + "epoch": 0.6304, + "grad_norm": 6.507286548614502, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.4459, + "step": 394 + }, + { + "epoch": 0.6336, + "grad_norm": 4.37092924118042, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.4201, + "step": 396 + }, + { + "epoch": 0.6368, + "grad_norm": 10.315455436706543, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.8372, + "step": 398 + }, + { + "epoch": 0.64, + "grad_norm": 5.6190385818481445, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.4728, + "step": 400 + }, + { + "epoch": 0.6432, + "grad_norm": 12.238266944885254, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.9893, + "step": 402 + }, + { + "epoch": 0.6464, + "grad_norm": 6.412506103515625, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.5909, + "step": 404 + }, + { + "epoch": 0.6496, + "grad_norm": 6.781770706176758, + "learning_rate": 1.985971166354357e-05, + "loss": 0.5031, + "step": 406 + }, + { + "epoch": 0.6528, + "grad_norm": 11.029583930969238, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.8005, + "step": 408 + }, + { + "epoch": 0.656, + "grad_norm": 5.955713272094727, + "learning_rate": 1.9894566364711965e-05, + "loss": 2.9778, + "step": 410 + }, + { + "epoch": 0.6592, + "grad_norm": 5.804037094116211, + "learning_rate": 1.99101396518405e-05, + "loss": 0.4442, + "step": 412 + }, + { + "epoch": 0.6624, + "grad_norm": 5.956245422363281, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.4676, + "step": 414 + }, + { + "epoch": 0.6656, + "grad_norm": 4.380008697509766, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.5896, + "step": 416 + }, + { + "epoch": 0.6688, + "grad_norm": 3.2874741554260254, + "learning_rate": 1.994942036613787e-05, + "loss": 0.3899, + "step": 418 + }, + { + "epoch": 0.672, + "grad_norm": 4.871149063110352, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.3007, + "step": 420 + }, + { + "epoch": 0.6752, + "grad_norm": 5.472265720367432, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.5545, + "step": 422 + }, + { + "epoch": 0.6784, + "grad_norm": 4.547043800354004, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.5253, + "step": 424 + }, + { + "epoch": 0.6816, + "grad_norm": 2.6497678756713867, + "learning_rate": 1.998437989229673e-05, + "loss": 0.5096, + "step": 426 + }, + { + "epoch": 0.6848, + "grad_norm": 8.168057441711426, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.8695, + "step": 428 + }, + { + "epoch": 0.688, + "grad_norm": 6.835053443908691, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.6131, + "step": 430 + }, + { + "epoch": 0.6912, + "grad_norm": 4.14987325668335, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.7508, + "step": 432 + }, + { + "epoch": 0.6944, + "grad_norm": 2.14447021484375, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.3071, + "step": 434 + }, + { + "epoch": 0.6976, + "grad_norm": 7.206055641174316, + "learning_rate": 2e-05, + "loss": 0.3932, + "step": 436 + }, + { + "epoch": 0.7008, + "grad_norm": 5.099971771240234, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.338, + "step": 438 + }, + { + "epoch": 0.704, + "grad_norm": 5.983880996704102, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.55, + "step": 440 + }, + { + "epoch": 0.7072, + "grad_norm": 1.6342906951904297, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.3481, + "step": 442 + }, + { + "epoch": 0.7104, + "grad_norm": 3.592808485031128, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.4214, + "step": 444 + }, + { + "epoch": 0.7136, + "grad_norm": 5.754815578460693, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.4716, + "step": 446 + }, + { + "epoch": 0.7168, + "grad_norm": 3.611063003540039, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.3382, + "step": 448 + }, + { + "epoch": 0.72, + "grad_norm": 6.442031383514404, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.4637, + "step": 450 + }, + { + "epoch": 0.7232, + "grad_norm": 4.785464763641357, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.4822, + "step": 452 + }, + { + "epoch": 0.7264, + "grad_norm": 4.596600532531738, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.4387, + "step": 454 + }, + { + "epoch": 0.7296, + "grad_norm": 3.8151254653930664, + "learning_rate": 1.993756836673986e-05, + "loss": 0.3749, + "step": 456 + }, + { + "epoch": 0.7328, + "grad_norm": 4.351461887359619, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.4763, + "step": 458 + }, + { + "epoch": 0.736, + "grad_norm": 7.7334418296813965, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.4282, + "step": 460 + }, + { + "epoch": 0.7392, + "grad_norm": 5.742446422576904, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.417, + "step": 462 + }, + { + "epoch": 0.7424, + "grad_norm": 19.029937744140625, + "learning_rate": 1.987775633490599e-05, + "loss": 1.3878, + "step": 464 + }, + { + "epoch": 0.7456, + "grad_norm": 9.348970413208008, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.736, + "step": 466 + }, + { + "epoch": 0.7488, + "grad_norm": 5.024008750915527, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.328, + "step": 468 + }, + { + "epoch": 0.752, + "grad_norm": 9.727663040161133, + "learning_rate": 1.9819927571953804e-05, + "loss": 1.1213, + "step": 470 + }, + { + "epoch": 0.7552, + "grad_norm": 9.793577194213867, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.4617, + "step": 472 + }, + { + "epoch": 0.7584, + "grad_norm": 3.6658005714416504, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.3419, + "step": 474 + }, + { + "epoch": 0.7616, + "grad_norm": 7.4308085441589355, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.545, + "step": 476 + }, + { + "epoch": 0.7648, + "grad_norm": 2.164323329925537, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.47, + "step": 478 + }, + { + "epoch": 0.768, + "grad_norm": 6.013500690460205, + "learning_rate": 1.969903782680467e-05, + "loss": 0.4348, + "step": 480 + }, + { + "epoch": 0.7712, + "grad_norm": 6.188719749450684, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.7735, + "step": 482 + }, + { + "epoch": 0.7744, + "grad_norm": 7.753485679626465, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.5738, + "step": 484 + }, + { + "epoch": 0.7776, + "grad_norm": 6.78783655166626, + "learning_rate": 1.961193185426459e-05, + "loss": 0.7013, + "step": 486 + }, + { + "epoch": 0.7808, + "grad_norm": 2.9399425983428955, + "learning_rate": 1.958048870913786e-05, + "loss": 0.5571, + "step": 488 + }, + { + "epoch": 0.784, + "grad_norm": 9.357385635375977, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.7407, + "step": 490 + }, + { + "epoch": 0.7872, + "grad_norm": 7.049064636230469, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.5239, + "step": 492 + }, + { + "epoch": 0.7904, + "grad_norm": 6.75682258605957, + "learning_rate": 1.947899082950751e-05, + "loss": 0.4966, + "step": 494 + }, + { + "epoch": 0.7936, + "grad_norm": 5.613454818725586, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.445, + "step": 496 + }, + { + "epoch": 0.7968, + "grad_norm": 3.580491065979004, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.2899, + "step": 498 + }, + { + "epoch": 0.8, + "grad_norm": 4.453496932983398, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.7, + "step": 500 + }, + { + "epoch": 0.8032, + "grad_norm": 5.264734268188477, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.3101, + "step": 502 + }, + { + "epoch": 0.8064, + "grad_norm": 10.876425743103027, + "learning_rate": 1.928619550368371e-05, + "loss": 0.7457, + "step": 504 + }, + { + "epoch": 0.8096, + "grad_norm": 7.00797700881958, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.6821, + "step": 506 + }, + { + "epoch": 0.8128, + "grad_norm": 7.558223247528076, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.5951, + "step": 508 + }, + { + "epoch": 0.816, + "grad_norm": 6.367091178894043, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.5086, + "step": 510 + }, + { + "epoch": 0.8192, + "grad_norm": 7.159903526306152, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.3697, + "step": 512 + }, + { + "epoch": 0.8224, + "grad_norm": 8.145002365112305, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.496, + "step": 514 + }, + { + "epoch": 0.8256, + "grad_norm": 3.874540090560913, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.1227, + "step": 516 + }, + { + "epoch": 0.8288, + "grad_norm": 4.757205963134766, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.6644, + "step": 518 + }, + { + "epoch": 0.832, + "grad_norm": 6.409359455108643, + "learning_rate": 1.891766616054545e-05, + "loss": 0.257, + "step": 520 + }, + { + "epoch": 0.8352, + "grad_norm": 4.814531326293945, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.3543, + "step": 522 + }, + { + "epoch": 0.8384, + "grad_norm": 15.44034481048584, + "learning_rate": 1.881426695315756e-05, + "loss": 1.3733, + "step": 524 + }, + { + "epoch": 0.8416, + "grad_norm": 2.6535580158233643, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.4062, + "step": 526 + }, + { + "epoch": 0.8448, + "grad_norm": 3.2737133502960205, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.45, + "step": 528 + }, + { + "epoch": 0.848, + "grad_norm": 6.56961727142334, + "learning_rate": 1.86509223046777e-05, + "loss": 0.5543, + "step": 530 + }, + { + "epoch": 0.8512, + "grad_norm": 2.931555986404419, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.3695, + "step": 532 + }, + { + "epoch": 0.8544, + "grad_norm": 2.997446060180664, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.6689, + "step": 534 + }, + { + "epoch": 0.8576, + "grad_norm": 5.753312587738037, + "learning_rate": 1.847784679420527e-05, + "loss": 0.4381, + "step": 536 + }, + { + "epoch": 0.8608, + "grad_norm": 2.823542594909668, + "learning_rate": 1.841802588108161e-05, + "loss": 0.4003, + "step": 538 + }, + { + "epoch": 0.864, + "grad_norm": 14.417923927307129, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.934, + "step": 540 + }, + { + "epoch": 0.8672, + "grad_norm": 10.664857864379883, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.7329, + "step": 542 + }, + { + "epoch": 0.8704, + "grad_norm": 5.107484817504883, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.5008, + "step": 544 + }, + { + "epoch": 0.8736, + "grad_norm": 3.2516307830810547, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.3427, + "step": 546 + }, + { + "epoch": 0.8768, + "grad_norm": 2.803086042404175, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.3771, + "step": 548 + }, + { + "epoch": 0.88, + "grad_norm": 4.849076271057129, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.6314, + "step": 550 + }, + { + "epoch": 0.8832, + "grad_norm": 4.658120155334473, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.5067, + "step": 552 + }, + { + "epoch": 0.8864, + "grad_norm": 2.672494888305664, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.394, + "step": 554 + }, + { + "epoch": 0.8896, + "grad_norm": 7.755502223968506, + "learning_rate": 1.783322946823638e-05, + "loss": 0.6454, + "step": 556 + }, + { + "epoch": 0.8928, + "grad_norm": 2.106848955154419, + "learning_rate": 1.776324453741365e-05, + "loss": 0.2759, + "step": 558 + }, + { + "epoch": 0.896, + "grad_norm": 3.1528866291046143, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.4362, + "step": 560 + }, + { + "epoch": 0.8992, + "grad_norm": 15.362929344177246, + "learning_rate": 1.762037251178961e-05, + "loss": 0.8955, + "step": 562 + }, + { + "epoch": 0.9024, + "grad_norm": 1.7654240131378174, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.2897, + "step": 564 + }, + { + "epoch": 0.9056, + "grad_norm": 3.3194448947906494, + "learning_rate": 1.7473690659616e-05, + "loss": 0.2994, + "step": 566 + }, + { + "epoch": 0.9088, + "grad_norm": 10.149787902832031, + "learning_rate": 1.739894389204122e-05, + "loss": 0.6527, + "step": 568 + }, + { + "epoch": 0.912, + "grad_norm": 10.171009063720703, + "learning_rate": 1.732327231489503e-05, + "loss": 0.6712, + "step": 570 + }, + { + "epoch": 0.9152, + "grad_norm": 2.9059488773345947, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.4296, + "step": 572 + }, + { + "epoch": 0.9184, + "grad_norm": 8.409585952758789, + "learning_rate": 1.716919267969884e-05, + "loss": 0.784, + "step": 574 + }, + { + "epoch": 0.9216, + "grad_norm": 12.59908390045166, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.5448, + "step": 576 + }, + { + "epoch": 0.9248, + "grad_norm": 6.19038724899292, + "learning_rate": 1.701152878657197e-05, + "loss": 0.5914, + "step": 578 + }, + { + "epoch": 0.928, + "grad_norm": 3.0019984245300293, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.4509, + "step": 580 + }, + { + "epoch": 0.9312, + "grad_norm": 3.466404914855957, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.4273, + "step": 582 + }, + { + "epoch": 0.9344, + "grad_norm": 2.9927778244018555, + "learning_rate": 1.67684853721737e-05, + "loss": 0.5488, + "step": 584 + }, + { + "epoch": 0.9376, + "grad_norm": 2.9774136543273926, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.4706, + "step": 586 + }, + { + "epoch": 0.9408, + "grad_norm": 13.504951477050781, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.7795, + "step": 588 + }, + { + "epoch": 0.944, + "grad_norm": 2.0812973976135254, + "learning_rate": 1.651782852712194e-05, + "loss": 0.6015, + "step": 590 + }, + { + "epoch": 0.9472, + "grad_norm": 8.495684623718262, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.5996, + "step": 592 + }, + { + "epoch": 0.9504, + "grad_norm": 9.915968894958496, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.4866, + "step": 594 + }, + { + "epoch": 0.9536, + "grad_norm": 2.894171953201294, + "learning_rate": 1.625984019906122e-05, + "loss": 0.7214, + "step": 596 + }, + { + "epoch": 0.9568, + "grad_norm": 6.13911771774292, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.3976, + "step": 598 + }, + { + "epoch": 0.96, + "grad_norm": 2.5138468742370605, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.4245, + "step": 600 + }, + { + "epoch": 0.9632, + "grad_norm": 5.338921070098877, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.4325, + "step": 602 + }, + { + "epoch": 0.9664, + "grad_norm": 10.50223445892334, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.8188, + "step": 604 + }, + { + "epoch": 0.9696, + "grad_norm": 7.8410325050354, + "learning_rate": 1.581435924540482e-05, + "loss": 0.8112, + "step": 606 + }, + { + "epoch": 0.9728, + "grad_norm": 7.011837005615234, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.5045, + "step": 608 + }, + { + "epoch": 0.976, + "grad_norm": 4.2811431884765625, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.5087, + "step": 610 + }, + { + "epoch": 0.9792, + "grad_norm": 8.028616905212402, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.9064, + "step": 612 + }, + { + "epoch": 0.9824, + "grad_norm": 3.3687844276428223, + "learning_rate": 1.544482752648966e-05, + "loss": 0.4295, + "step": 614 + }, + { + "epoch": 0.9856, + "grad_norm": 5.316037654876709, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.5034, + "step": 616 + }, + { + "epoch": 0.9888, + "grad_norm": 2.103130340576172, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.4537, + "step": 618 + }, + { + "epoch": 0.992, + "grad_norm": 2.0409934520721436, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.3681, + "step": 620 + }, + { + "epoch": 0.9952, + "grad_norm": 5.530198097229004, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.4799, + "step": 622 + }, + { + "epoch": 0.9984, + "grad_norm": 6.037550926208496, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.644, + "step": 624 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 2443576741724160.0, + "train_loss": 0.6015953216791153, + "train_runtime": 4485.8512, + "train_samples_per_second": 2.229, + "train_steps_per_second": 0.139 + } + ], + "logging_steps": 2, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 2443576741724160.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b14638d8f4a521c182b564165589dc11683a311 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f54bcb68d58bda2894c4538a3272cfd1b6327a6a7cd53546159b4758de7258bb +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..83445fdc12b5d8d817bcf5882cec7fe1567a0978 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cc4f69df59205eb6ff4a861186508183f12b211c704f7cdf2e2481e8d91f47c +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..564f6690ed20711feb305dc15268cbd34d4d7f1b --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12f2995c92e69406d3532e061b731c618d3120117dd7b86080ce7474e8b3de4f +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..adc361f60423e0fbca3628f9c7c9289627254f53 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79eea20bd503d8d82054dd83cfea59d210dee3e00a15b50bb3ebfad51f1d820a +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..177f98de98cb2c430d1b44a78148c36ee667bb29 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,1904 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.7036, + "step": 2 + }, + { + "epoch": 0.0064, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.327, + "step": 4 + }, + { + "epoch": 0.0096, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.5459, + "step": 6 + }, + { + "epoch": 0.0128, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.3284, + "step": 8 + }, + { + "epoch": 0.016, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.4194, + "step": 10 + }, + { + "epoch": 0.0192, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.6064, + "step": 12 + }, + { + "epoch": 0.0224, + "learning_rate": 2.909196119613218e-06, + "loss": 0.9268, + "step": 14 + }, + { + "epoch": 0.0256, + "learning_rate": 2.988471213428035e-06, + "loss": 0.3195, + "step": 16 + }, + { + "epoch": 0.0288, + "learning_rate": 3.068622692984767e-06, + "loss": 0.5438, + "step": 18 + }, + { + "epoch": 0.032, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.5279, + "step": 20 + }, + { + "epoch": 0.0352, + "learning_rate": 3.231514627826302e-06, + "loss": 0.4847, + "step": 22 + }, + { + "epoch": 0.0384, + "learning_rate": 3.314234722905302e-06, + "loss": 0.4063, + "step": 24 + }, + { + "epoch": 0.0416, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.612, + "step": 26 + }, + { + "epoch": 0.0448, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.331, + "step": 28 + }, + { + "epoch": 0.048, + "learning_rate": 3.567367137003953e-06, + "loss": 0.3023, + "step": 30 + }, + { + "epoch": 0.0512, + "learning_rate": 3.653366829451711e-06, + "loss": 0.5405, + "step": 32 + }, + { + "epoch": 0.0544, + "learning_rate": 3.740159800938784e-06, + "loss": 0.4103, + "step": 34 + }, + { + "epoch": 0.0576, + "learning_rate": 3.827735203028956e-06, + "loss": 0.466, + "step": 36 + }, + { + "epoch": 0.0608, + "learning_rate": 3.916082089488379e-06, + "loss": 0.5685, + "step": 38 + }, + { + "epoch": 0.064, + "learning_rate": 4.005189417653737e-06, + "loss": 0.8703, + "step": 40 + }, + { + "epoch": 0.0672, + "learning_rate": 4.095046049812541e-06, + "loss": 0.7011, + "step": 42 + }, + { + "epoch": 0.0704, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.3975, + "step": 44 + }, + { + "epoch": 0.0736, + "learning_rate": 4.276962208378814e-06, + "loss": 0.8369, + "step": 46 + }, + { + "epoch": 0.0768, + "learning_rate": 4.368998996702686e-06, + "loss": 0.5687, + "step": 48 + }, + { + "epoch": 0.08, + "learning_rate": 4.461739615694921e-06, + "loss": 0.6999, + "step": 50 + }, + { + "epoch": 0.0832, + "learning_rate": 4.555172473510324e-06, + "loss": 0.5198, + "step": 52 + }, + { + "epoch": 0.0864, + "learning_rate": 4.649285891779326e-06, + "loss": 0.3555, + "step": 54 + }, + { + "epoch": 0.0896, + "learning_rate": 4.744068107067673e-06, + "loss": 0.5996, + "step": 56 + }, + { + "epoch": 0.0928, + "learning_rate": 4.839507272346751e-06, + "loss": 0.4225, + "step": 58 + }, + { + "epoch": 0.096, + "learning_rate": 4.935591458474425e-06, + "loss": 0.4745, + "step": 60 + }, + { + "epoch": 0.0992, + "learning_rate": 5.032308655686007e-06, + "loss": 0.4378, + "step": 62 + }, + { + "epoch": 0.1024, + "learning_rate": 5.129646775095432e-06, + "loss": 0.3912, + "step": 64 + }, + { + "epoch": 0.1056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.3867, + "step": 66 + }, + { + "epoch": 0.1088, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.3087, + "step": 68 + }, + { + "epoch": 0.112, + "learning_rate": 5.425264622628326e-06, + "loss": 0.4686, + "step": 70 + }, + { + "epoch": 0.1152, + "learning_rate": 5.524964012628644e-06, + "loss": 0.3876, + "step": 72 + }, + { + "epoch": 0.1184, + "learning_rate": 5.62522274679673e-06, + "loss": 0.3213, + "step": 74 + }, + { + "epoch": 0.1216, + "learning_rate": 5.726028293582342e-06, + "loss": 0.5433, + "step": 76 + }, + { + "epoch": 0.1248, + "learning_rate": 5.827368053088032e-06, + "loss": 0.3393, + "step": 78 + }, + { + "epoch": 0.128, + "learning_rate": 5.929229358643925e-06, + "loss": 0.9177, + "step": 80 + }, + { + "epoch": 0.1312, + "learning_rate": 6.03159947839103e-06, + "loss": 0.4592, + "step": 82 + }, + { + "epoch": 0.1344, + "learning_rate": 6.13446561687258e-06, + "loss": 0.3325, + "step": 84 + }, + { + "epoch": 0.1376, + "learning_rate": 6.237814916633431e-06, + "loss": 0.4437, + "step": 86 + }, + { + "epoch": 0.1408, + "learning_rate": 6.341634459827044e-06, + "loss": 0.4454, + "step": 88 + }, + { + "epoch": 0.144, + "learning_rate": 6.445911269830183e-06, + "loss": 0.4098, + "step": 90 + }, + { + "epoch": 0.1472, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.5805, + "step": 92 + }, + { + "epoch": 0.1504, + "learning_rate": 6.655784499627476e-06, + "loss": 0.3599, + "step": 94 + }, + { + "epoch": 0.1536, + "learning_rate": 6.761354686924883e-06, + "loss": 0.4033, + "step": 96 + }, + { + "epoch": 0.1568, + "learning_rate": 6.867329679317144e-06, + "loss": 0.4196, + "step": 98 + }, + { + "epoch": 0.16, + "learning_rate": 6.973696230766884e-06, + "loss": 0.3178, + "step": 100 + }, + { + "epoch": 0.1632, + "learning_rate": 7.080441046294945e-06, + "loss": 0.4176, + "step": 102 + }, + { + "epoch": 0.1664, + "learning_rate": 7.18755078364214e-06, + "loss": 0.4496, + "step": 104 + }, + { + "epoch": 0.1696, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.3414, + "step": 106 + }, + { + "epoch": 0.1728, + "learning_rate": 7.402811428368824e-06, + "loss": 0.692, + "step": 108 + }, + { + "epoch": 0.176, + "learning_rate": 7.510935429867233e-06, + "loss": 0.5126, + "step": 110 + }, + { + "epoch": 0.1792, + "learning_rate": 7.619370544785608e-06, + "loss": 0.3592, + "step": 112 + }, + { + "epoch": 0.1824, + "learning_rate": 7.728103219590684e-06, + "loss": 0.515, + "step": 114 + }, + { + "epoch": 0.1856, + "learning_rate": 7.83711986355656e-06, + "loss": 0.5299, + "step": 116 + }, + { + "epoch": 0.1888, + "learning_rate": 7.946406850463435e-06, + "loss": 0.7093, + "step": 118 + }, + { + "epoch": 0.192, + "learning_rate": 8.055950520300756e-06, + "loss": 0.3534, + "step": 120 + }, + { + "epoch": 0.1952, + "learning_rate": 8.165737180974676e-06, + "loss": 0.5856, + "step": 122 + }, + { + "epoch": 0.1984, + "learning_rate": 8.275753110019367e-06, + "loss": 0.4656, + "step": 124 + }, + { + "epoch": 0.2016, + "learning_rate": 8.385984556312285e-06, + "loss": 0.849, + "step": 126 + }, + { + "epoch": 0.2048, + "learning_rate": 8.496417741792922e-06, + "loss": 0.6804, + "step": 128 + }, + { + "epoch": 0.208, + "learning_rate": 8.607038863184952e-06, + "loss": 0.7425, + "step": 130 + }, + { + "epoch": 0.2112, + "learning_rate": 8.717834093721598e-06, + "loss": 0.5759, + "step": 132 + }, + { + "epoch": 0.2144, + "learning_rate": 8.828789584873757e-06, + "loss": 0.2995, + "step": 134 + }, + { + "epoch": 0.2176, + "learning_rate": 8.939891468081036e-06, + "loss": 0.4656, + "step": 136 + }, + { + "epoch": 0.2208, + "learning_rate": 9.051125856485175e-06, + "loss": 0.912, + "step": 138 + }, + { + "epoch": 0.224, + "learning_rate": 9.162478846665854e-06, + "loss": 0.3673, + "step": 140 + }, + { + "epoch": 0.2272, + "learning_rate": 9.273936520378426e-06, + "loss": 0.5677, + "step": 142 + }, + { + "epoch": 0.2304, + "learning_rate": 9.38548494629364e-06, + "loss": 0.4492, + "step": 144 + }, + { + "epoch": 0.2336, + "learning_rate": 9.497110181738935e-06, + "loss": 0.8513, + "step": 146 + }, + { + "epoch": 0.2368, + "learning_rate": 9.608798274441153e-06, + "loss": 0.6077, + "step": 148 + }, + { + "epoch": 0.24, + "learning_rate": 9.720535264270526e-06, + "loss": 0.5087, + "step": 150 + }, + { + "epoch": 0.2432, + "learning_rate": 9.832307184985473e-06, + "loss": 0.3399, + "step": 152 + }, + { + "epoch": 0.2464, + "learning_rate": 9.944100065978354e-06, + "loss": 0.325, + "step": 154 + }, + { + "epoch": 0.2496, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.6666, + "step": 156 + }, + { + "epoch": 0.2528, + "learning_rate": 1.016769281501452e-05, + "loss": 0.4477, + "step": 158 + }, + { + "epoch": 0.256, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.3785, + "step": 160 + }, + { + "epoch": 0.2592, + "learning_rate": 1.039120172555884e-05, + "loss": 0.4558, + "step": 162 + }, + { + "epoch": 0.2624, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.3491, + "step": 164 + }, + { + "epoch": 0.2656, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.4749, + "step": 166 + }, + { + "epoch": 0.2688, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.3133, + "step": 168 + }, + { + "epoch": 0.272, + "learning_rate": 1.083752115333414e-05, + "loss": 0.5629, + "step": 170 + }, + { + "epoch": 0.2752, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.4181, + "step": 172 + }, + { + "epoch": 0.2784, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.4558, + "step": 174 + }, + { + "epoch": 0.2816, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.4645, + "step": 176 + }, + { + "epoch": 0.2848, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.3057, + "step": 178 + }, + { + "epoch": 0.288, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.34, + "step": 180 + }, + { + "epoch": 0.2912, + "learning_rate": 1.150358225820707e-05, + "loss": 0.7983, + "step": 182 + }, + { + "epoch": 0.2944, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.8228, + "step": 184 + }, + { + "epoch": 0.2976, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.5895, + "step": 186 + }, + { + "epoch": 0.3008, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.3662, + "step": 188 + }, + { + "epoch": 0.304, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.849, + "step": 190 + }, + { + "epoch": 0.3072, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.4468, + "step": 192 + }, + { + "epoch": 0.3104, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.5257, + "step": 194 + }, + { + "epoch": 0.3136, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.3903, + "step": 196 + }, + { + "epoch": 0.3168, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.3894, + "step": 198 + }, + { + "epoch": 0.32, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.4684, + "step": 200 + }, + { + "epoch": 0.3232, + "learning_rate": 1.259718857163117e-05, + "loss": 0.3398, + "step": 202 + }, + { + "epoch": 0.3264, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.429, + "step": 204 + }, + { + "epoch": 0.3296, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.3714, + "step": 206 + }, + { + "epoch": 0.3328, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.47, + "step": 208 + }, + { + "epoch": 0.336, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.7464, + "step": 210 + }, + { + "epoch": 0.3392, + "learning_rate": 1.313267032068285e-05, + "loss": 0.3296, + "step": 212 + }, + { + "epoch": 0.3424, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.3549, + "step": 214 + }, + { + "epoch": 0.3456, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.5048, + "step": 216 + }, + { + "epoch": 0.3488, + "learning_rate": 1.344936768713513e-05, + "loss": 0.4158, + "step": 218 + }, + { + "epoch": 0.352, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.4307, + "step": 220 + }, + { + "epoch": 0.3552, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.3973, + "step": 222 + }, + { + "epoch": 0.3584, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.4211, + "step": 224 + }, + { + "epoch": 0.3616, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.6055, + "step": 226 + }, + { + "epoch": 0.3648, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.682, + "step": 228 + }, + { + "epoch": 0.368, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.3319, + "step": 230 + }, + { + "epoch": 0.3712, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.3834, + "step": 232 + }, + { + "epoch": 0.3744, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.5636, + "step": 234 + }, + { + "epoch": 0.3776, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.4051, + "step": 236 + }, + { + "epoch": 0.3808, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.268, + "step": 238 + }, + { + "epoch": 0.384, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.3986, + "step": 240 + }, + { + "epoch": 0.3872, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.5529, + "step": 242 + }, + { + "epoch": 0.3904, + "learning_rate": 1.4772406349793749e-05, + "loss": 1.1209, + "step": 244 + }, + { + "epoch": 0.3936, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.5836, + "step": 246 + }, + { + "epoch": 0.3968, + "learning_rate": 1.4967691344313988e-05, + "loss": 1.0781, + "step": 248 + }, + { + "epoch": 0.4, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.4529, + "step": 250 + }, + { + "epoch": 0.4032, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.5813, + "step": 252 + }, + { + "epoch": 0.4064, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.4578, + "step": 254 + }, + { + "epoch": 0.4096, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.3759, + "step": 256 + }, + { + "epoch": 0.4128, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.6413, + "step": 258 + }, + { + "epoch": 0.416, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.3349, + "step": 260 + }, + { + "epoch": 0.4192, + "learning_rate": 1.563100100329731e-05, + "loss": 0.5868, + "step": 262 + }, + { + "epoch": 0.4224, + "learning_rate": 1.572303779162118e-05, + "loss": 0.35, + "step": 264 + }, + { + "epoch": 0.4256, + "learning_rate": 1.581435924540481e-05, + "loss": 0.8259, + "step": 266 + }, + { + "epoch": 0.4288, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.4011, + "step": 268 + }, + { + "epoch": 0.432, + "learning_rate": 1.599481058234626e-05, + "loss": 0.8308, + "step": 270 + }, + { + "epoch": 0.4352, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.6538, + "step": 272 + }, + { + "epoch": 0.4384, + "learning_rate": 1.617226479697104e-05, + "loss": 0.5024, + "step": 274 + }, + { + "epoch": 0.4416, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.3737, + "step": 276 + }, + { + "epoch": 0.4448, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.371, + "step": 278 + }, + { + "epoch": 0.448, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.3252, + "step": 280 + }, + { + "epoch": 0.4512, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.2844, + "step": 282 + }, + { + "epoch": 0.4544, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.4009, + "step": 284 + }, + { + "epoch": 0.4576, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.4064, + "step": 286 + }, + { + "epoch": 0.4608, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.2609, + "step": 288 + }, + { + "epoch": 0.464, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.3502, + "step": 290 + }, + { + "epoch": 0.4672, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.348, + "step": 292 + }, + { + "epoch": 0.4704, + "learning_rate": 1.701152878657196e-05, + "loss": 0.644, + "step": 294 + }, + { + "epoch": 0.4736, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.4308, + "step": 296 + }, + { + "epoch": 0.4768, + "learning_rate": 1.716919267969883e-05, + "loss": 0.568, + "step": 298 + }, + { + "epoch": 0.48, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.6655, + "step": 300 + }, + { + "epoch": 0.4832, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.4522, + "step": 302 + }, + { + "epoch": 0.4864, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.3227, + "step": 304 + }, + { + "epoch": 0.4896, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.6808, + "step": 306 + }, + { + "epoch": 0.4928, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.7103, + "step": 308 + }, + { + "epoch": 0.496, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.4166, + "step": 310 + }, + { + "epoch": 0.4992, + "learning_rate": 1.7692289262315e-05, + "loss": 0.3018, + "step": 312 + }, + { + "epoch": 0.5024, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.5556, + "step": 314 + }, + { + "epoch": 0.5056, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.8933, + "step": 316 + }, + { + "epoch": 0.5088, + "learning_rate": 1.790223530721933e-05, + "loss": 0.6276, + "step": 318 + }, + { + "epoch": 0.512, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.7065, + "step": 320 + }, + { + "epoch": 0.5152, + "learning_rate": 1.803727533238257e-05, + "loss": 0.3694, + "step": 322 + }, + { + "epoch": 0.5184, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.4256, + "step": 324 + }, + { + "epoch": 0.5216, + "learning_rate": 1.816829709926509e-05, + "loss": 0.3572, + "step": 326 + }, + { + "epoch": 0.5248, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.4809, + "step": 328 + }, + { + "epoch": 0.528, + "learning_rate": 1.829523510316813e-05, + "loss": 0.3423, + "step": 330 + }, + { + "epoch": 0.5312, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.3685, + "step": 332 + }, + { + "epoch": 0.5344, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.3765, + "step": 334 + }, + { + "epoch": 0.5376, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.9131, + "step": 336 + }, + { + "epoch": 0.5408, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.2763, + "step": 338 + }, + { + "epoch": 0.544, + "learning_rate": 1.85943022840117e-05, + "loss": 0.4316, + "step": 340 + }, + { + "epoch": 0.5472, + "learning_rate": 1.865092230467769e-05, + "loss": 0.501, + "step": 342 + }, + { + "epoch": 0.5504, + "learning_rate": 1.87064610283551e-05, + "loss": 0.5479, + "step": 344 + }, + { + "epoch": 0.5536, + "learning_rate": 1.876091151314196e-05, + "loss": 0.4017, + "step": 346 + }, + { + "epoch": 0.5568, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.3401, + "step": 348 + }, + { + "epoch": 0.56, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.5058, + "step": 350 + }, + { + "epoch": 0.5632, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.3441, + "step": 352 + }, + { + "epoch": 0.5664, + "learning_rate": 1.896769700383315e-05, + "loss": 0.2944, + "step": 354 + }, + { + "epoch": 0.5696, + "learning_rate": 1.901660695579585e-05, + "loss": 0.4928, + "step": 356 + }, + { + "epoch": 0.5728, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.4822, + "step": 358 + }, + { + "epoch": 0.576, + "learning_rate": 1.911103987318148e-05, + "loss": 0.4891, + "step": 360 + }, + { + "epoch": 0.5792, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.6295, + "step": 362 + }, + { + "epoch": 0.5824, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.3944, + "step": 364 + }, + { + "epoch": 0.5856, + "learning_rate": 1.924413432409622e-05, + "loss": 0.4413, + "step": 366 + }, + { + "epoch": 0.5888, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.3578, + "step": 368 + }, + { + "epoch": 0.592, + "learning_rate": 1.932709598214825e-05, + "loss": 0.3674, + "step": 370 + }, + { + "epoch": 0.5952, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.3141, + "step": 372 + }, + { + "epoch": 0.5984, + "learning_rate": 1.940539453247842e-05, + "loss": 0.3549, + "step": 374 + }, + { + "epoch": 0.6016, + "learning_rate": 1.944278281764342e-05, + "loss": 0.2786, + "step": 376 + }, + { + "epoch": 0.6048, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.5024, + "step": 378 + }, + { + "epoch": 0.608, + "learning_rate": 1.951401404235505e-05, + "loss": 0.317, + "step": 380 + }, + { + "epoch": 0.6112, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.549, + "step": 382 + }, + { + "epoch": 0.6144, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.5291, + "step": 384 + }, + { + "epoch": 0.6176, + "learning_rate": 1.961193185426459e-05, + "loss": 0.5376, + "step": 386 + }, + { + "epoch": 0.6208, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.4267, + "step": 388 + }, + { + "epoch": 0.624, + "learning_rate": 1.967121011775546e-05, + "loss": 0.2718, + "step": 390 + }, + { + "epoch": 0.6272, + "learning_rate": 1.969903782680467e-05, + "loss": 0.3591, + "step": 392 + }, + { + "epoch": 0.6304, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.8761, + "step": 394 + }, + { + "epoch": 0.6336, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.3512, + "step": 396 + }, + { + "epoch": 0.6368, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.3622, + "step": 398 + }, + { + "epoch": 0.64, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.526, + "step": 400 + }, + { + "epoch": 0.6432, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.5224, + "step": 402 + }, + { + "epoch": 0.6464, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.4473, + "step": 404 + }, + { + "epoch": 0.6496, + "learning_rate": 1.985971166354357e-05, + "loss": 0.4711, + "step": 406 + }, + { + "epoch": 0.6528, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.4104, + "step": 408 + }, + { + "epoch": 0.656, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.3354, + "step": 410 + }, + { + "epoch": 0.6592, + "learning_rate": 1.99101396518405e-05, + "loss": 0.5118, + "step": 412 + }, + { + "epoch": 0.6624, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.4001, + "step": 414 + }, + { + "epoch": 0.6656, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.432, + "step": 416 + }, + { + "epoch": 0.6688, + "learning_rate": 1.994942036613787e-05, + "loss": 0.4111, + "step": 418 + }, + { + "epoch": 0.672, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.4681, + "step": 420 + }, + { + "epoch": 0.6752, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.3202, + "step": 422 + }, + { + "epoch": 0.6784, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.3838, + "step": 424 + }, + { + "epoch": 0.6816, + "learning_rate": 1.998437989229673e-05, + "loss": 0.3591, + "step": 426 + }, + { + "epoch": 0.6848, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.3838, + "step": 428 + }, + { + "epoch": 0.688, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.3466, + "step": 430 + }, + { + "epoch": 0.6912, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.7046, + "step": 432 + }, + { + "epoch": 0.6944, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.3992, + "step": 434 + }, + { + "epoch": 0.6976, + "learning_rate": 2e-05, + "loss": 0.3986, + "step": 436 + }, + { + "epoch": 0.7008, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.5007, + "step": 438 + }, + { + "epoch": 0.704, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.8362, + "step": 440 + }, + { + "epoch": 0.7072, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.3802, + "step": 442 + }, + { + "epoch": 0.7104, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.7143, + "step": 444 + }, + { + "epoch": 0.7136, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.7413, + "step": 446 + }, + { + "epoch": 0.7168, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.8962, + "step": 448 + }, + { + "epoch": 0.72, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.3923, + "step": 450 + }, + { + "epoch": 0.7232, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.5831, + "step": 452 + }, + { + "epoch": 0.7264, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.4438, + "step": 454 + }, + { + "epoch": 0.7296, + "learning_rate": 1.993756836673986e-05, + "loss": 0.3784, + "step": 456 + }, + { + "epoch": 0.7328, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.3393, + "step": 458 + }, + { + "epoch": 0.736, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.2945, + "step": 460 + }, + { + "epoch": 0.7392, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.3334, + "step": 462 + }, + { + "epoch": 0.7424, + "learning_rate": 1.987775633490599e-05, + "loss": 0.5353, + "step": 464 + }, + { + "epoch": 0.7456, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.4004, + "step": 466 + }, + { + "epoch": 0.7488, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.4969, + "step": 468 + }, + { + "epoch": 0.752, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.2853, + "step": 470 + }, + { + "epoch": 0.7552, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.5008, + "step": 472 + }, + { + "epoch": 0.7584, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.2996, + "step": 474 + }, + { + "epoch": 0.7616, + "learning_rate": 1.9751053008725736e-05, + "loss": 1.05, + "step": 476 + }, + { + "epoch": 0.7648, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.3922, + "step": 478 + }, + { + "epoch": 0.768, + "learning_rate": 1.969903782680467e-05, + "loss": 0.4267, + "step": 480 + }, + { + "epoch": 0.7712, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.4111, + "step": 482 + }, + { + "epoch": 0.7744, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.4971, + "step": 484 + }, + { + "epoch": 0.7776, + "learning_rate": 1.961193185426459e-05, + "loss": 0.776, + "step": 486 + }, + { + "epoch": 0.7808, + "learning_rate": 1.958048870913786e-05, + "loss": 0.4126, + "step": 488 + }, + { + "epoch": 0.784, + "learning_rate": 1.9547848078560982e-05, + "loss": 1.2728, + "step": 490 + }, + { + "epoch": 0.7872, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.4098, + "step": 492 + }, + { + "epoch": 0.7904, + "learning_rate": 1.947899082950751e-05, + "loss": 0.3246, + "step": 494 + }, + { + "epoch": 0.7936, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.4038, + "step": 496 + }, + { + "epoch": 0.7968, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.5928, + "step": 498 + }, + { + "epoch": 0.8, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.3213, + "step": 500 + }, + { + "epoch": 0.8032, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.4125, + "step": 502 + }, + { + "epoch": 0.8064, + "learning_rate": 1.928619550368371e-05, + "loss": 0.3865, + "step": 504 + }, + { + "epoch": 0.8096, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.8616, + "step": 506 + }, + { + "epoch": 0.8128, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.4038, + "step": 508 + }, + { + "epoch": 0.816, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.4095, + "step": 510 + }, + { + "epoch": 0.8192, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.2725, + "step": 512 + }, + { + "epoch": 0.8224, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.3414, + "step": 514 + }, + { + "epoch": 0.8256, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.369, + "step": 516 + }, + { + "epoch": 0.8288, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.3659, + "step": 518 + }, + { + "epoch": 0.832, + "learning_rate": 1.891766616054545e-05, + "loss": 0.3186, + "step": 520 + }, + { + "epoch": 0.8352, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.2871, + "step": 522 + }, + { + "epoch": 0.8384, + "learning_rate": 1.881426695315756e-05, + "loss": 0.398, + "step": 524 + }, + { + "epoch": 0.8416, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.3132, + "step": 526 + }, + { + "epoch": 0.8448, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.5052, + "step": 528 + }, + { + "epoch": 0.848, + "learning_rate": 1.86509223046777e-05, + "loss": 0.4231, + "step": 530 + }, + { + "epoch": 0.8512, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.3517, + "step": 532 + }, + { + "epoch": 0.8544, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.3629, + "step": 534 + }, + { + "epoch": 0.8576, + "learning_rate": 1.847784679420527e-05, + "loss": 0.3954, + "step": 536 + }, + { + "epoch": 0.8608, + "learning_rate": 1.841802588108161e-05, + "loss": 0.5935, + "step": 538 + }, + { + "epoch": 0.864, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.4527, + "step": 540 + }, + { + "epoch": 0.8672, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.4106, + "step": 542 + }, + { + "epoch": 0.8704, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.6128, + "step": 544 + }, + { + "epoch": 0.8736, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.557, + "step": 546 + }, + { + "epoch": 0.8768, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.3223, + "step": 548 + }, + { + "epoch": 0.88, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.7693, + "step": 550 + }, + { + "epoch": 0.8832, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.6488, + "step": 552 + }, + { + "epoch": 0.8864, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.4684, + "step": 554 + }, + { + "epoch": 0.8896, + "learning_rate": 1.783322946823638e-05, + "loss": 0.7015, + "step": 556 + }, + { + "epoch": 0.8928, + "learning_rate": 1.776324453741365e-05, + "loss": 0.3606, + "step": 558 + }, + { + "epoch": 0.896, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.5728, + "step": 560 + }, + { + "epoch": 0.8992, + "learning_rate": 1.762037251178961e-05, + "loss": 0.6078, + "step": 562 + }, + { + "epoch": 0.9024, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.9526, + "step": 564 + }, + { + "epoch": 0.9056, + "learning_rate": 1.7473690659616e-05, + "loss": 0.3625, + "step": 566 + }, + { + "epoch": 0.9088, + "learning_rate": 1.739894389204122e-05, + "loss": 0.3565, + "step": 568 + }, + { + "epoch": 0.912, + "learning_rate": 1.732327231489503e-05, + "loss": 0.6755, + "step": 570 + }, + { + "epoch": 0.9152, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.4678, + "step": 572 + }, + { + "epoch": 0.9184, + "learning_rate": 1.716919267969884e-05, + "loss": 0.5578, + "step": 574 + }, + { + "epoch": 0.9216, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.3742, + "step": 576 + }, + { + "epoch": 0.9248, + "learning_rate": 1.701152878657197e-05, + "loss": 0.4484, + "step": 578 + }, + { + "epoch": 0.928, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.4051, + "step": 580 + }, + { + "epoch": 0.9312, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.6345, + "step": 582 + }, + { + "epoch": 0.9344, + "learning_rate": 1.67684853721737e-05, + "loss": 0.595, + "step": 584 + }, + { + "epoch": 0.9376, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.2562, + "step": 586 + }, + { + "epoch": 0.9408, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.6204, + "step": 588 + }, + { + "epoch": 0.944, + "learning_rate": 1.651782852712194e-05, + "loss": 0.2957, + "step": 590 + }, + { + "epoch": 0.9472, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.4435, + "step": 592 + }, + { + "epoch": 0.9504, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.3927, + "step": 594 + }, + { + "epoch": 0.9536, + "learning_rate": 1.625984019906122e-05, + "loss": 0.9383, + "step": 596 + }, + { + "epoch": 0.9568, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.3296, + "step": 598 + }, + { + "epoch": 0.96, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.2457, + "step": 600 + }, + { + "epoch": 0.9632, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.2783, + "step": 602 + }, + { + "epoch": 0.9664, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.5341, + "step": 604 + }, + { + "epoch": 0.9696, + "learning_rate": 1.581435924540482e-05, + "loss": 0.5131, + "step": 606 + }, + { + "epoch": 0.9728, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.6233, + "step": 608 + }, + { + "epoch": 0.976, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.4588, + "step": 610 + }, + { + "epoch": 0.9792, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.2961, + "step": 612 + }, + { + "epoch": 0.9824, + "learning_rate": 1.544482752648966e-05, + "loss": 0.3157, + "step": 614 + }, + { + "epoch": 0.9856, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.3386, + "step": 616 + }, + { + "epoch": 0.9888, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.4643, + "step": 618 + }, + { + "epoch": 0.992, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.3212, + "step": 620 + }, + { + "epoch": 0.9952, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.3708, + "step": 622 + }, + { + "epoch": 0.9984, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.2716, + "step": 624 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 2661804507398144.0, + "train_loss": 0.48295580658912657, + "train_runtime": 2484.1441, + "train_samples_per_second": 4.026, + "train_steps_per_second": 0.252 + } + ], + "logging_steps": 2, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 2661804507398144.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..031df656ffaa742ddbfd83873ed88c63127a1343 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06800ee3fb800f76dbce23952c4f3dd12debeccaafda8c7788c6b1b4408bc674 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..64112746e2d1ffb67d4fdab461f3e50dcc1a8cfb --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64bd280b15d88baf053fcba5a17e25b5b1d088f6f50c9d0ef7c225acd0af35b2 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f79859470d4c21f92fd01fa0fdc5defc53b8c412 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b8b6efaaa6c0edfe31c7c349a95fe5e62461a4b89aa48eee3b4baeaaf05849b +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b2ffa609ca60e00e59cd106532fe583854c0e95f --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:369d3c8936e1a2c22d007bd6f3766bf82f85e5ddba7be9eb91a60579a81687e3 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a562132c64174ee1417e0dd0f635e580e8c01c82 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,1904 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.0124, + "step": 2 + }, + { + "epoch": 0.0064, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.1119, + "step": 4 + }, + { + "epoch": 0.0096, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.3245, + "step": 6 + }, + { + "epoch": 0.0128, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.7283, + "step": 8 + }, + { + "epoch": 0.016, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.0138, + "step": 10 + }, + { + "epoch": 0.0192, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.3943, + "step": 12 + }, + { + "epoch": 0.0224, + "learning_rate": 2.909196119613218e-06, + "loss": 0.0171, + "step": 14 + }, + { + "epoch": 0.0256, + "learning_rate": 2.988471213428035e-06, + "loss": 0.5327, + "step": 16 + }, + { + "epoch": 0.0288, + "learning_rate": 3.068622692984767e-06, + "loss": 0.0363, + "step": 18 + }, + { + "epoch": 0.032, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.1419, + "step": 20 + }, + { + "epoch": 0.0352, + "learning_rate": 3.231514627826302e-06, + "loss": 0.0843, + "step": 22 + }, + { + "epoch": 0.0384, + "learning_rate": 3.314234722905302e-06, + "loss": 0.2904, + "step": 24 + }, + { + "epoch": 0.0416, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.0209, + "step": 26 + }, + { + "epoch": 0.0448, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.9589, + "step": 28 + }, + { + "epoch": 0.048, + "learning_rate": 3.567367137003953e-06, + "loss": 0.02, + "step": 30 + }, + { + "epoch": 0.0512, + "learning_rate": 3.653366829451711e-06, + "loss": 0.0317, + "step": 32 + }, + { + "epoch": 0.0544, + "learning_rate": 3.740159800938784e-06, + "loss": 0.044, + "step": 34 + }, + { + "epoch": 0.0576, + "learning_rate": 3.827735203028956e-06, + "loss": 0.5648, + "step": 36 + }, + { + "epoch": 0.0608, + "learning_rate": 3.916082089488379e-06, + "loss": 0.1185, + "step": 38 + }, + { + "epoch": 0.064, + "learning_rate": 4.005189417653737e-06, + "loss": 0.2974, + "step": 40 + }, + { + "epoch": 0.0672, + "learning_rate": 4.095046049812541e-06, + "loss": 0.0897, + "step": 42 + }, + { + "epoch": 0.0704, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.0195, + "step": 44 + }, + { + "epoch": 0.0736, + "learning_rate": 4.276962208378814e-06, + "loss": 0.4563, + "step": 46 + }, + { + "epoch": 0.0768, + "learning_rate": 4.368998996702686e-06, + "loss": 0.0352, + "step": 48 + }, + { + "epoch": 0.08, + "learning_rate": 4.461739615694921e-06, + "loss": 0.0451, + "step": 50 + }, + { + "epoch": 0.0832, + "learning_rate": 4.555172473510324e-06, + "loss": 0.1638, + "step": 52 + }, + { + "epoch": 0.0864, + "learning_rate": 4.649285891779326e-06, + "loss": 0.0027, + "step": 54 + }, + { + "epoch": 0.0896, + "learning_rate": 4.744068107067673e-06, + "loss": 0.0295, + "step": 56 + }, + { + "epoch": 0.0928, + "learning_rate": 4.839507272346751e-06, + "loss": 0.0075, + "step": 58 + }, + { + "epoch": 0.096, + "learning_rate": 4.935591458474425e-06, + "loss": 0.0477, + "step": 60 + }, + { + "epoch": 0.0992, + "learning_rate": 5.032308655686007e-06, + "loss": 0.2527, + "step": 62 + }, + { + "epoch": 0.1024, + "learning_rate": 5.129646775095432e-06, + "loss": 0.2065, + "step": 64 + }, + { + "epoch": 0.1056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.008, + "step": 66 + }, + { + "epoch": 0.1088, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.024, + "step": 68 + }, + { + "epoch": 0.112, + "learning_rate": 5.425264622628326e-06, + "loss": 0.1038, + "step": 70 + }, + { + "epoch": 0.1152, + "learning_rate": 5.524964012628644e-06, + "loss": 0.0882, + "step": 72 + }, + { + "epoch": 0.1184, + "learning_rate": 5.62522274679673e-06, + "loss": 0.0133, + "step": 74 + }, + { + "epoch": 0.1216, + "learning_rate": 5.726028293582342e-06, + "loss": 0.0983, + "step": 76 + }, + { + "epoch": 0.1248, + "learning_rate": 5.827368053088032e-06, + "loss": 0.0247, + "step": 78 + }, + { + "epoch": 0.128, + "learning_rate": 5.929229358643925e-06, + "loss": 0.0476, + "step": 80 + }, + { + "epoch": 0.1312, + "learning_rate": 6.03159947839103e-06, + "loss": 0.0985, + "step": 82 + }, + { + "epoch": 0.1344, + "learning_rate": 6.13446561687258e-06, + "loss": 0.1289, + "step": 84 + }, + { + "epoch": 0.1376, + "learning_rate": 6.237814916633431e-06, + "loss": 0.0906, + "step": 86 + }, + { + "epoch": 0.1408, + "learning_rate": 6.341634459827044e-06, + "loss": 0.2643, + "step": 88 + }, + { + "epoch": 0.144, + "learning_rate": 6.445911269830183e-06, + "loss": 0.0301, + "step": 90 + }, + { + "epoch": 0.1472, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.0209, + "step": 92 + }, + { + "epoch": 0.1504, + "learning_rate": 6.655784499627476e-06, + "loss": 0.0335, + "step": 94 + }, + { + "epoch": 0.1536, + "learning_rate": 6.761354686924883e-06, + "loss": 0.0118, + "step": 96 + }, + { + "epoch": 0.1568, + "learning_rate": 6.867329679317144e-06, + "loss": 0.0038, + "step": 98 + }, + { + "epoch": 0.16, + "learning_rate": 6.973696230766884e-06, + "loss": 0.2888, + "step": 100 + }, + { + "epoch": 0.1632, + "learning_rate": 7.080441046294945e-06, + "loss": 0.5279, + "step": 102 + }, + { + "epoch": 0.1664, + "learning_rate": 7.18755078364214e-06, + "loss": 0.0023, + "step": 104 + }, + { + "epoch": 0.1696, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.7006, + "step": 106 + }, + { + "epoch": 0.1728, + "learning_rate": 7.402811428368824e-06, + "loss": 0.0933, + "step": 108 + }, + { + "epoch": 0.176, + "learning_rate": 7.510935429867233e-06, + "loss": 0.6504, + "step": 110 + }, + { + "epoch": 0.1792, + "learning_rate": 7.619370544785608e-06, + "loss": 0.0698, + "step": 112 + }, + { + "epoch": 0.1824, + "learning_rate": 7.728103219590684e-06, + "loss": 0.1815, + "step": 114 + }, + { + "epoch": 0.1856, + "learning_rate": 7.83711986355656e-06, + "loss": 0.0406, + "step": 116 + }, + { + "epoch": 0.1888, + "learning_rate": 7.946406850463435e-06, + "loss": 0.0528, + "step": 118 + }, + { + "epoch": 0.192, + "learning_rate": 8.055950520300756e-06, + "loss": 0.0072, + "step": 120 + }, + { + "epoch": 0.1952, + "learning_rate": 8.165737180974676e-06, + "loss": 0.01, + "step": 122 + }, + { + "epoch": 0.1984, + "learning_rate": 8.275753110019367e-06, + "loss": 0.0128, + "step": 124 + }, + { + "epoch": 0.2016, + "learning_rate": 8.385984556312285e-06, + "loss": 0.0469, + "step": 126 + }, + { + "epoch": 0.2048, + "learning_rate": 8.496417741792922e-06, + "loss": 0.5632, + "step": 128 + }, + { + "epoch": 0.208, + "learning_rate": 8.607038863184952e-06, + "loss": 0.1772, + "step": 130 + }, + { + "epoch": 0.2112, + "learning_rate": 8.717834093721598e-06, + "loss": 0.0307, + "step": 132 + }, + { + "epoch": 0.2144, + "learning_rate": 8.828789584873757e-06, + "loss": 0.9967, + "step": 134 + }, + { + "epoch": 0.2176, + "learning_rate": 8.939891468081036e-06, + "loss": 0.5046, + "step": 136 + }, + { + "epoch": 0.2208, + "learning_rate": 9.051125856485175e-06, + "loss": 0.2298, + "step": 138 + }, + { + "epoch": 0.224, + "learning_rate": 9.162478846665854e-06, + "loss": 0.0459, + "step": 140 + }, + { + "epoch": 0.2272, + "learning_rate": 9.273936520378426e-06, + "loss": 0.1685, + "step": 142 + }, + { + "epoch": 0.2304, + "learning_rate": 9.38548494629364e-06, + "loss": 0.0063, + "step": 144 + }, + { + "epoch": 0.2336, + "learning_rate": 9.497110181738935e-06, + "loss": 0.3046, + "step": 146 + }, + { + "epoch": 0.2368, + "learning_rate": 9.608798274441153e-06, + "loss": 0.5494, + "step": 148 + }, + { + "epoch": 0.24, + "learning_rate": 9.720535264270526e-06, + "loss": 0.141, + "step": 150 + }, + { + "epoch": 0.2432, + "learning_rate": 9.832307184985473e-06, + "loss": 0.0304, + "step": 152 + }, + { + "epoch": 0.2464, + "learning_rate": 9.944100065978354e-06, + "loss": 0.0139, + "step": 154 + }, + { + "epoch": 0.2496, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.0148, + "step": 156 + }, + { + "epoch": 0.2528, + "learning_rate": 1.016769281501452e-05, + "loss": 0.2346, + "step": 158 + }, + { + "epoch": 0.256, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.0764, + "step": 160 + }, + { + "epoch": 0.2592, + "learning_rate": 1.039120172555884e-05, + "loss": 0.0553, + "step": 162 + }, + { + "epoch": 0.2624, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.0073, + "step": 164 + }, + { + "epoch": 0.2656, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.5272, + "step": 166 + }, + { + "epoch": 0.2688, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.0259, + "step": 168 + }, + { + "epoch": 0.272, + "learning_rate": 1.083752115333414e-05, + "loss": 0.1772, + "step": 170 + }, + { + "epoch": 0.2752, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.0383, + "step": 172 + }, + { + "epoch": 0.2784, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.685, + "step": 174 + }, + { + "epoch": 0.2816, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.4268, + "step": 176 + }, + { + "epoch": 0.2848, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.1043, + "step": 178 + }, + { + "epoch": 0.288, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.7231, + "step": 180 + }, + { + "epoch": 0.2912, + "learning_rate": 1.150358225820707e-05, + "loss": 0.0691, + "step": 182 + }, + { + "epoch": 0.2944, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.9235, + "step": 184 + }, + { + "epoch": 0.2976, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.1661, + "step": 186 + }, + { + "epoch": 0.3008, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.0621, + "step": 188 + }, + { + "epoch": 0.304, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.0336, + "step": 190 + }, + { + "epoch": 0.3072, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.1993, + "step": 192 + }, + { + "epoch": 0.3104, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.0189, + "step": 194 + }, + { + "epoch": 0.3136, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.0297, + "step": 196 + }, + { + "epoch": 0.3168, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.8886, + "step": 198 + }, + { + "epoch": 0.32, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.1489, + "step": 200 + }, + { + "epoch": 0.3232, + "learning_rate": 1.259718857163117e-05, + "loss": 0.4239, + "step": 202 + }, + { + "epoch": 0.3264, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.0831, + "step": 204 + }, + { + "epoch": 0.3296, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.0315, + "step": 206 + }, + { + "epoch": 0.3328, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.0255, + "step": 208 + }, + { + "epoch": 0.336, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.0152, + "step": 210 + }, + { + "epoch": 0.3392, + "learning_rate": 1.313267032068285e-05, + "loss": 0.4755, + "step": 212 + }, + { + "epoch": 0.3424, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.2811, + "step": 214 + }, + { + "epoch": 0.3456, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.3031, + "step": 216 + }, + { + "epoch": 0.3488, + "learning_rate": 1.344936768713513e-05, + "loss": 0.179, + "step": 218 + }, + { + "epoch": 0.352, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.0079, + "step": 220 + }, + { + "epoch": 0.3552, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.032, + "step": 222 + }, + { + "epoch": 0.3584, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.0984, + "step": 224 + }, + { + "epoch": 0.3616, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.0538, + "step": 226 + }, + { + "epoch": 0.3648, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.2697, + "step": 228 + }, + { + "epoch": 0.368, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.5622, + "step": 230 + }, + { + "epoch": 0.3712, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.007, + "step": 232 + }, + { + "epoch": 0.3744, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.1558, + "step": 234 + }, + { + "epoch": 0.3776, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.0037, + "step": 236 + }, + { + "epoch": 0.3808, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.3933, + "step": 238 + }, + { + "epoch": 0.384, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.0029, + "step": 240 + }, + { + "epoch": 0.3872, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.0716, + "step": 242 + }, + { + "epoch": 0.3904, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.3482, + "step": 244 + }, + { + "epoch": 0.3936, + "learning_rate": 1.4870353224904563e-05, + "loss": 1.1097, + "step": 246 + }, + { + "epoch": 0.3968, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.0125, + "step": 248 + }, + { + "epoch": 0.4, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.1143, + "step": 250 + }, + { + "epoch": 0.4032, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.0952, + "step": 252 + }, + { + "epoch": 0.4064, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.2256, + "step": 254 + }, + { + "epoch": 0.4096, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.1509, + "step": 256 + }, + { + "epoch": 0.4128, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.5427, + "step": 258 + }, + { + "epoch": 0.416, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.2522, + "step": 260 + }, + { + "epoch": 0.4192, + "learning_rate": 1.563100100329731e-05, + "loss": 0.3037, + "step": 262 + }, + { + "epoch": 0.4224, + "learning_rate": 1.572303779162118e-05, + "loss": 0.2966, + "step": 264 + }, + { + "epoch": 0.4256, + "learning_rate": 1.581435924540481e-05, + "loss": 0.2745, + "step": 266 + }, + { + "epoch": 0.4288, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.0866, + "step": 268 + }, + { + "epoch": 0.432, + "learning_rate": 1.599481058234626e-05, + "loss": 0.5184, + "step": 270 + }, + { + "epoch": 0.4352, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.0244, + "step": 272 + }, + { + "epoch": 0.4384, + "learning_rate": 1.617226479697104e-05, + "loss": 0.0457, + "step": 274 + }, + { + "epoch": 0.4416, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.3009, + "step": 276 + }, + { + "epoch": 0.4448, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.0203, + "step": 278 + }, + { + "epoch": 0.448, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.4014, + "step": 280 + }, + { + "epoch": 0.4512, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.0453, + "step": 282 + }, + { + "epoch": 0.4544, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.9482, + "step": 284 + }, + { + "epoch": 0.4576, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.7503, + "step": 286 + }, + { + "epoch": 0.4608, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.5349, + "step": 288 + }, + { + "epoch": 0.464, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.0047, + "step": 290 + }, + { + "epoch": 0.4672, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.3031, + "step": 292 + }, + { + "epoch": 0.4704, + "learning_rate": 1.701152878657196e-05, + "loss": 0.2769, + "step": 294 + }, + { + "epoch": 0.4736, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.0565, + "step": 296 + }, + { + "epoch": 0.4768, + "learning_rate": 1.716919267969883e-05, + "loss": 0.5363, + "step": 298 + }, + { + "epoch": 0.48, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.0496, + "step": 300 + }, + { + "epoch": 0.4832, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.1653, + "step": 302 + }, + { + "epoch": 0.4864, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.0299, + "step": 304 + }, + { + "epoch": 0.4896, + "learning_rate": 1.7473690659615992e-05, + "loss": 1.0825, + "step": 306 + }, + { + "epoch": 0.4928, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.5881, + "step": 308 + }, + { + "epoch": 0.496, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.0622, + "step": 310 + }, + { + "epoch": 0.4992, + "learning_rate": 1.7692289262315e-05, + "loss": 0.363, + "step": 312 + }, + { + "epoch": 0.5024, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.3924, + "step": 314 + }, + { + "epoch": 0.5056, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.0844, + "step": 316 + }, + { + "epoch": 0.5088, + "learning_rate": 1.790223530721933e-05, + "loss": 0.0594, + "step": 318 + }, + { + "epoch": 0.512, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.0755, + "step": 320 + }, + { + "epoch": 0.5152, + "learning_rate": 1.803727533238257e-05, + "loss": 0.0391, + "step": 322 + }, + { + "epoch": 0.5184, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.0105, + "step": 324 + }, + { + "epoch": 0.5216, + "learning_rate": 1.816829709926509e-05, + "loss": 0.3878, + "step": 326 + }, + { + "epoch": 0.5248, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.7927, + "step": 328 + }, + { + "epoch": 0.528, + "learning_rate": 1.829523510316813e-05, + "loss": 0.3557, + "step": 330 + }, + { + "epoch": 0.5312, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.1976, + "step": 332 + }, + { + "epoch": 0.5344, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.0702, + "step": 334 + }, + { + "epoch": 0.5376, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.326, + "step": 336 + }, + { + "epoch": 0.5408, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.2874, + "step": 338 + }, + { + "epoch": 0.544, + "learning_rate": 1.85943022840117e-05, + "loss": 0.3472, + "step": 340 + }, + { + "epoch": 0.5472, + "learning_rate": 1.865092230467769e-05, + "loss": 0.0466, + "step": 342 + }, + { + "epoch": 0.5504, + "learning_rate": 1.87064610283551e-05, + "loss": 0.1335, + "step": 344 + }, + { + "epoch": 0.5536, + "learning_rate": 1.876091151314196e-05, + "loss": 0.0237, + "step": 346 + }, + { + "epoch": 0.5568, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.0153, + "step": 348 + }, + { + "epoch": 0.56, + "learning_rate": 1.8866520679393127e-05, + "loss": 1.0523, + "step": 350 + }, + { + "epoch": 0.5632, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.4291, + "step": 352 + }, + { + "epoch": 0.5664, + "learning_rate": 1.896769700383315e-05, + "loss": 0.2238, + "step": 354 + }, + { + "epoch": 0.5696, + "learning_rate": 1.901660695579585e-05, + "loss": 0.2438, + "step": 356 + }, + { + "epoch": 0.5728, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.772, + "step": 358 + }, + { + "epoch": 0.576, + "learning_rate": 1.911103987318148e-05, + "loss": 0.4986, + "step": 360 + }, + { + "epoch": 0.5792, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.9014, + "step": 362 + }, + { + "epoch": 0.5824, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.0132, + "step": 364 + }, + { + "epoch": 0.5856, + "learning_rate": 1.924413432409622e-05, + "loss": 0.0687, + "step": 366 + }, + { + "epoch": 0.5888, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.0116, + "step": 368 + }, + { + "epoch": 0.592, + "learning_rate": 1.932709598214825e-05, + "loss": 0.2541, + "step": 370 + }, + { + "epoch": 0.5952, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.3812, + "step": 372 + }, + { + "epoch": 0.5984, + "learning_rate": 1.940539453247842e-05, + "loss": 0.5181, + "step": 374 + }, + { + "epoch": 0.6016, + "learning_rate": 1.944278281764342e-05, + "loss": 0.19, + "step": 376 + }, + { + "epoch": 0.6048, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.1293, + "step": 378 + }, + { + "epoch": 0.608, + "learning_rate": 1.951401404235505e-05, + "loss": 0.0496, + "step": 380 + }, + { + "epoch": 0.6112, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.2873, + "step": 382 + }, + { + "epoch": 0.6144, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.5678, + "step": 384 + }, + { + "epoch": 0.6176, + "learning_rate": 1.961193185426459e-05, + "loss": 0.0317, + "step": 386 + }, + { + "epoch": 0.6208, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.1365, + "step": 388 + }, + { + "epoch": 0.624, + "learning_rate": 1.967121011775546e-05, + "loss": 0.1463, + "step": 390 + }, + { + "epoch": 0.6272, + "learning_rate": 1.969903782680467e-05, + "loss": 0.05, + "step": 392 + }, + { + "epoch": 0.6304, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.019, + "step": 394 + }, + { + "epoch": 0.6336, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.5648, + "step": 396 + }, + { + "epoch": 0.6368, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.0464, + "step": 398 + }, + { + "epoch": 0.64, + "learning_rate": 1.9798193124423804e-05, + "loss": 1.4197, + "step": 400 + }, + { + "epoch": 0.6432, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.0523, + "step": 402 + }, + { + "epoch": 0.6464, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.3419, + "step": 404 + }, + { + "epoch": 0.6496, + "learning_rate": 1.985971166354357e-05, + "loss": 0.7454, + "step": 406 + }, + { + "epoch": 0.6528, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.1362, + "step": 408 + }, + { + "epoch": 0.656, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0834, + "step": 410 + }, + { + "epoch": 0.6592, + "learning_rate": 1.99101396518405e-05, + "loss": 0.2647, + "step": 412 + }, + { + "epoch": 0.6624, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.2932, + "step": 414 + }, + { + "epoch": 0.6656, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.091, + "step": 416 + }, + { + "epoch": 0.6688, + "learning_rate": 1.994942036613787e-05, + "loss": 0.0372, + "step": 418 + }, + { + "epoch": 0.672, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.4564, + "step": 420 + }, + { + "epoch": 0.6752, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.2194, + "step": 422 + }, + { + "epoch": 0.6784, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.0529, + "step": 424 + }, + { + "epoch": 0.6816, + "learning_rate": 1.998437989229673e-05, + "loss": 0.4882, + "step": 426 + }, + { + "epoch": 0.6848, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.3815, + "step": 428 + }, + { + "epoch": 0.688, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.1259, + "step": 430 + }, + { + "epoch": 0.6912, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.5569, + "step": 432 + }, + { + "epoch": 0.6944, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.0036, + "step": 434 + }, + { + "epoch": 0.6976, + "learning_rate": 2e-05, + "loss": 0.4201, + "step": 436 + }, + { + "epoch": 0.7008, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.2861, + "step": 438 + }, + { + "epoch": 0.704, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.2318, + "step": 440 + }, + { + "epoch": 0.7072, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.4632, + "step": 442 + }, + { + "epoch": 0.7104, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.2099, + "step": 444 + }, + { + "epoch": 0.7136, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.0819, + "step": 446 + }, + { + "epoch": 0.7168, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.538, + "step": 448 + }, + { + "epoch": 0.72, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.5216, + "step": 450 + }, + { + "epoch": 0.7232, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.0739, + "step": 452 + }, + { + "epoch": 0.7264, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.0176, + "step": 454 + }, + { + "epoch": 0.7296, + "learning_rate": 1.993756836673986e-05, + "loss": 0.141, + "step": 456 + }, + { + "epoch": 0.7328, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.0097, + "step": 458 + }, + { + "epoch": 0.736, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.0071, + "step": 460 + }, + { + "epoch": 0.7392, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0406, + "step": 462 + }, + { + "epoch": 0.7424, + "learning_rate": 1.987775633490599e-05, + "loss": 0.0902, + "step": 464 + }, + { + "epoch": 0.7456, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.1248, + "step": 466 + }, + { + "epoch": 0.7488, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.2472, + "step": 468 + }, + { + "epoch": 0.752, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.1011, + "step": 470 + }, + { + "epoch": 0.7552, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.0349, + "step": 472 + }, + { + "epoch": 0.7584, + "learning_rate": 1.9775233980110524e-05, + "loss": 1.139, + "step": 474 + }, + { + "epoch": 0.7616, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.0181, + "step": 476 + }, + { + "epoch": 0.7648, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.5289, + "step": 478 + }, + { + "epoch": 0.768, + "learning_rate": 1.969903782680467e-05, + "loss": 0.4342, + "step": 480 + }, + { + "epoch": 0.7712, + "learning_rate": 1.9671210117755462e-05, + "loss": 1.4602, + "step": 482 + }, + { + "epoch": 0.7744, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.0076, + "step": 484 + }, + { + "epoch": 0.7776, + "learning_rate": 1.961193185426459e-05, + "loss": 0.2398, + "step": 486 + }, + { + "epoch": 0.7808, + "learning_rate": 1.958048870913786e-05, + "loss": 0.0098, + "step": 488 + }, + { + "epoch": 0.784, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.63, + "step": 490 + }, + { + "epoch": 0.7872, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.0606, + "step": 492 + }, + { + "epoch": 0.7904, + "learning_rate": 1.947899082950751e-05, + "loss": 0.0218, + "step": 494 + }, + { + "epoch": 0.7936, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.5006, + "step": 496 + }, + { + "epoch": 0.7968, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.0382, + "step": 498 + }, + { + "epoch": 0.8, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.102, + "step": 500 + }, + { + "epoch": 0.8032, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.1899, + "step": 502 + }, + { + "epoch": 0.8064, + "learning_rate": 1.928619550368371e-05, + "loss": 0.505, + "step": 504 + }, + { + "epoch": 0.8096, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.3562, + "step": 506 + }, + { + "epoch": 0.8128, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.5208, + "step": 508 + }, + { + "epoch": 0.816, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.3292, + "step": 510 + }, + { + "epoch": 0.8192, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.1156, + "step": 512 + }, + { + "epoch": 0.8224, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.2045, + "step": 514 + }, + { + "epoch": 0.8256, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.0543, + "step": 516 + }, + { + "epoch": 0.8288, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.0878, + "step": 518 + }, + { + "epoch": 0.832, + "learning_rate": 1.891766616054545e-05, + "loss": 0.1163, + "step": 520 + }, + { + "epoch": 0.8352, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.0034, + "step": 522 + }, + { + "epoch": 0.8384, + "learning_rate": 1.881426695315756e-05, + "loss": 0.4083, + "step": 524 + }, + { + "epoch": 0.8416, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.1199, + "step": 526 + }, + { + "epoch": 0.8448, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.2598, + "step": 528 + }, + { + "epoch": 0.848, + "learning_rate": 1.86509223046777e-05, + "loss": 0.4885, + "step": 530 + }, + { + "epoch": 0.8512, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.1455, + "step": 532 + }, + { + "epoch": 0.8544, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.0441, + "step": 534 + }, + { + "epoch": 0.8576, + "learning_rate": 1.847784679420527e-05, + "loss": 0.2049, + "step": 536 + }, + { + "epoch": 0.8608, + "learning_rate": 1.841802588108161e-05, + "loss": 0.0302, + "step": 538 + }, + { + "epoch": 0.864, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.0036, + "step": 540 + }, + { + "epoch": 0.8672, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.0575, + "step": 542 + }, + { + "epoch": 0.8704, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.0007, + "step": 544 + }, + { + "epoch": 0.8736, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.02, + "step": 546 + }, + { + "epoch": 0.8768, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.0007, + "step": 548 + }, + { + "epoch": 0.88, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.1148, + "step": 550 + }, + { + "epoch": 0.8832, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.0016, + "step": 552 + }, + { + "epoch": 0.8864, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.0004, + "step": 554 + }, + { + "epoch": 0.8896, + "learning_rate": 1.783322946823638e-05, + "loss": 0.004, + "step": 556 + }, + { + "epoch": 0.8928, + "learning_rate": 1.776324453741365e-05, + "loss": 0.1155, + "step": 558 + }, + { + "epoch": 0.896, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.0402, + "step": 560 + }, + { + "epoch": 0.8992, + "learning_rate": 1.762037251178961e-05, + "loss": 0.0801, + "step": 562 + }, + { + "epoch": 0.9024, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.0028, + "step": 564 + }, + { + "epoch": 0.9056, + "learning_rate": 1.7473690659616e-05, + "loss": 0.0006, + "step": 566 + }, + { + "epoch": 0.9088, + "learning_rate": 1.739894389204122e-05, + "loss": 0.8144, + "step": 568 + }, + { + "epoch": 0.912, + "learning_rate": 1.732327231489503e-05, + "loss": 0.0961, + "step": 570 + }, + { + "epoch": 0.9152, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.1557, + "step": 572 + }, + { + "epoch": 0.9184, + "learning_rate": 1.716919267969884e-05, + "loss": 0.723, + "step": 574 + }, + { + "epoch": 0.9216, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.19, + "step": 576 + }, + { + "epoch": 0.9248, + "learning_rate": 1.701152878657197e-05, + "loss": 0.0013, + "step": 578 + }, + { + "epoch": 0.928, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.7352, + "step": 580 + }, + { + "epoch": 0.9312, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.0095, + "step": 582 + }, + { + "epoch": 0.9344, + "learning_rate": 1.67684853721737e-05, + "loss": 0.0136, + "step": 584 + }, + { + "epoch": 0.9376, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.0344, + "step": 586 + }, + { + "epoch": 0.9408, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.0038, + "step": 588 + }, + { + "epoch": 0.944, + "learning_rate": 1.651782852712194e-05, + "loss": 0.1917, + "step": 590 + }, + { + "epoch": 0.9472, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.0179, + "step": 592 + }, + { + "epoch": 0.9504, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.0439, + "step": 594 + }, + { + "epoch": 0.9536, + "learning_rate": 1.625984019906122e-05, + "loss": 0.0745, + "step": 596 + }, + { + "epoch": 0.9568, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.2384, + "step": 598 + }, + { + "epoch": 0.96, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.0346, + "step": 600 + }, + { + "epoch": 0.9632, + "learning_rate": 1.5994810582346266e-05, + "loss": 1.3399, + "step": 602 + }, + { + "epoch": 0.9664, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.0596, + "step": 604 + }, + { + "epoch": 0.9696, + "learning_rate": 1.581435924540482e-05, + "loss": 0.034, + "step": 606 + }, + { + "epoch": 0.9728, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.1371, + "step": 608 + }, + { + "epoch": 0.976, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.5681, + "step": 610 + }, + { + "epoch": 0.9792, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.2622, + "step": 612 + }, + { + "epoch": 0.9824, + "learning_rate": 1.544482752648966e-05, + "loss": 0.0031, + "step": 614 + }, + { + "epoch": 0.9856, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.0025, + "step": 616 + }, + { + "epoch": 0.9888, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.1535, + "step": 618 + }, + { + "epoch": 0.992, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.0149, + "step": 620 + }, + { + "epoch": 0.9952, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.0155, + "step": 622 + }, + { + "epoch": 0.9984, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.0022, + "step": 624 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 2760627376095232.0, + "train_loss": 0.22509195377584545, + "train_runtime": 3314.8358, + "train_samples_per_second": 3.017, + "train_steps_per_second": 0.189 + } + ], + "logging_steps": 2, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 2760627376095232.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9642ab4b94e0976b6e32188db2267e284432872 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1966630b157259af240e2c25b18fd6a885ee24fb9e84084af2f01e0c2fd77076 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..368a3dd9fabbc0e4e693d9610f7bfac0ba275e09 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:691c7e216c5ddbc2e717eb31ab2fdcb1295705461b48f57f703090492822d81d +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..c1cd31701b2742d6ce4764accc945e7a57c2c811 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd8e4eb86c55039a290bf62b18afe66752a2da44f2e8b6a626172b84871616e +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9e398c0414e7b118dfa88e90352bc29dc5bffcc --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_0625_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5bb37bd3858aa50ed7d03838d14928a9cce1bdec346699a8ee288e8950e6f38 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7b3ea5403ac0217fa053043b45490745caf9427a --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,3776 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1249, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.2275, + "step": 2 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.1199, + "step": 4 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.1487, + "step": 6 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.0618, + "step": 8 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.6438, + "step": 10 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.2055, + "step": 12 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.2081, + "step": 14 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.0915, + "step": 16 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.2862, + "step": 18 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.1029, + "step": 20 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.2512, + "step": 22 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.1519, + "step": 24 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.122, + "step": 26 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.1606, + "step": 28 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.389, + "step": 30 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.2451, + "step": 32 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.1286, + "step": 34 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.5395, + "step": 36 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.1047, + "step": 38 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.1851, + "step": 40 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.1677, + "step": 42 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.2029, + "step": 44 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.1351, + "step": 46 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.2309, + "step": 48 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.205, + "step": 50 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.2565, + "step": 52 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.2198, + "step": 54 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.1523, + "step": 56 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.2195, + "step": 58 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.1501, + "step": 60 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.1788, + "step": 62 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.2798, + "step": 64 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.5088, + "step": 66 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.1379, + "step": 68 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.3523, + "step": 70 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.2113, + "step": 72 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.132, + "step": 74 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.6913, + "step": 76 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.3813, + "step": 78 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.2328, + "step": 80 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.0686, + "step": 82 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.3447, + "step": 84 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.2218, + "step": 86 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.3388, + "step": 88 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.1261, + "step": 90 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.1319, + "step": 92 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.2051, + "step": 94 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.3228, + "step": 96 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.3367, + "step": 98 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.2677, + "step": 100 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.1195, + "step": 102 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.2681, + "step": 104 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.0985, + "step": 106 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.1299, + "step": 108 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.0364, + "step": 110 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.1485, + "step": 112 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.4099, + "step": 114 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.1951, + "step": 116 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.577, + "step": 118 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.231, + "step": 120 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.0801, + "step": 122 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.0869, + "step": 124 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.2088, + "step": 126 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.1984, + "step": 128 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.6054, + "step": 130 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.2394, + "step": 132 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.1604, + "step": 134 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.205, + "step": 136 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.2446, + "step": 138 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.2101, + "step": 140 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.3636, + "step": 142 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.1035, + "step": 144 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.2524, + "step": 146 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.089, + "step": 148 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.104, + "step": 150 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.068, + "step": 152 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.1207, + "step": 154 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.3008, + "step": 156 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.2284, + "step": 158 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.2461, + "step": 160 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.2537, + "step": 162 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.1707, + "step": 164 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.2555, + "step": 166 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.1851, + "step": 168 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.4875, + "step": 170 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.0809, + "step": 172 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.2014, + "step": 174 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.3094, + "step": 176 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.2186, + "step": 178 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.0854, + "step": 180 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.1137, + "step": 182 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.2272, + "step": 184 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.5397, + "step": 186 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.124, + "step": 188 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.3223, + "step": 190 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.0249, + "step": 192 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.305, + "step": 194 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.3134, + "step": 196 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.091, + "step": 198 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.1588, + "step": 200 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.5336, + "step": 202 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.2141, + "step": 204 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.1052, + "step": 206 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.1579, + "step": 208 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.728, + "step": 210 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.1542, + "step": 212 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.022, + "step": 214 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.1505, + "step": 216 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.2115, + "step": 218 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.2275, + "step": 220 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.324, + "step": 222 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.7104, + "step": 224 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.2685, + "step": 226 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.1496, + "step": 228 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.1531, + "step": 230 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.2104, + "step": 232 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.0287, + "step": 234 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.5513, + "step": 236 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.1674, + "step": 238 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.3903, + "step": 240 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.6787, + "step": 242 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.2906, + "step": 244 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.1635, + "step": 246 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.1572, + "step": 248 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.2199, + "step": 250 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.0977, + "step": 252 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.4269, + "step": 254 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.235, + "step": 256 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.0837, + "step": 258 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.1225, + "step": 260 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.1595, + "step": 262 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.2561, + "step": 264 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.1401, + "step": 266 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.1043, + "step": 268 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.2467, + "step": 270 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.1485, + "step": 272 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.1308, + "step": 274 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.4821, + "step": 276 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.2221, + "step": 278 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.1771, + "step": 280 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.0618, + "step": 282 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.3763, + "step": 284 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.3482, + "step": 286 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.7765, + "step": 288 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.1581, + "step": 290 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.3701, + "step": 292 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.7667, + "step": 294 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.6489, + "step": 296 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.3247, + "step": 298 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.3378, + "step": 300 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.4929, + "step": 302 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.0797, + "step": 304 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.1624, + "step": 306 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.2551, + "step": 308 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.3709, + "step": 310 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.5852, + "step": 312 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.2163, + "step": 314 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.2825, + "step": 316 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.115, + "step": 318 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.1574, + "step": 320 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.1616, + "step": 322 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.3463, + "step": 324 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.1717, + "step": 326 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.328, + "step": 328 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.2043, + "step": 330 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.2251, + "step": 332 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.2867, + "step": 334 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.3653, + "step": 336 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.367, + "step": 338 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.1778, + "step": 340 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.2229, + "step": 342 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.1056, + "step": 344 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.2468, + "step": 346 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.2882, + "step": 348 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.3341, + "step": 350 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.0985, + "step": 352 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.1268, + "step": 354 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.2957, + "step": 356 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.2625, + "step": 358 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.3047, + "step": 360 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.2075, + "step": 362 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.5513, + "step": 364 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.1282, + "step": 366 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.1663, + "step": 368 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.2099, + "step": 370 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.0947, + "step": 372 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.1601, + "step": 374 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.1655, + "step": 376 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.2512, + "step": 378 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.1029, + "step": 380 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.2616, + "step": 382 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.2542, + "step": 384 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.1152, + "step": 386 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.1199, + "step": 388 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.0089, + "step": 390 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 1.3146, + "step": 392 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.055, + "step": 394 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.0855, + "step": 396 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.9325, + "step": 398 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.1847, + "step": 400 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.2146, + "step": 402 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.5036, + "step": 404 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.2038, + "step": 406 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.3784, + "step": 408 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 1.1029, + "step": 410 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.1178, + "step": 412 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.3587, + "step": 414 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.2161, + "step": 416 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.2562, + "step": 418 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.2467, + "step": 420 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.1709, + "step": 422 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.6455, + "step": 424 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.1511, + "step": 426 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.4041, + "step": 428 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.4531, + "step": 430 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.3937, + "step": 432 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.2514, + "step": 434 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.3825, + "step": 436 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.1508, + "step": 438 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.4094, + "step": 440 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.2715, + "step": 442 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.3497, + "step": 444 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.1601, + "step": 446 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.2486, + "step": 448 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.1003, + "step": 450 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.1897, + "step": 452 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.3397, + "step": 454 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.4189, + "step": 456 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.1395, + "step": 458 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.1541, + "step": 460 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.1644, + "step": 462 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.092, + "step": 464 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.4036, + "step": 466 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.3707, + "step": 468 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.3981, + "step": 470 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.3621, + "step": 472 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.1781, + "step": 474 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.2227, + "step": 476 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.2198, + "step": 478 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.5968, + "step": 480 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.0642, + "step": 482 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.2467, + "step": 484 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.229, + "step": 486 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.304, + "step": 488 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.2787, + "step": 490 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.1312, + "step": 492 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.3362, + "step": 494 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.2804, + "step": 496 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.4927, + "step": 498 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.3509, + "step": 500 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.1726, + "step": 502 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.4971, + "step": 504 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.1932, + "step": 506 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.3081, + "step": 508 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.1631, + "step": 510 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.3645, + "step": 512 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.1444, + "step": 514 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.1674, + "step": 516 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.3177, + "step": 518 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.3221, + "step": 520 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.1761, + "step": 522 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.126, + "step": 524 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.2189, + "step": 526 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.1773, + "step": 528 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.7247, + "step": 530 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.4036, + "step": 532 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.164, + "step": 534 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.6201, + "step": 536 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.6024, + "step": 538 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.2382, + "step": 540 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.2392, + "step": 542 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.4329, + "step": 544 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.1451, + "step": 546 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.4886, + "step": 548 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.2426, + "step": 550 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.2319, + "step": 552 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.2193, + "step": 554 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.3436, + "step": 556 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.357, + "step": 558 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.4302, + "step": 560 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.3648, + "step": 562 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.2658, + "step": 564 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.0865, + "step": 566 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.2537, + "step": 568 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.3813, + "step": 570 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.0438, + "step": 572 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.2827, + "step": 574 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.0801, + "step": 576 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.3019, + "step": 578 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.3157, + "step": 580 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.0425, + "step": 582 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.8731, + "step": 584 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.4254, + "step": 586 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.4263, + "step": 588 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.3125, + "step": 590 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.0739, + "step": 592 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.5144, + "step": 594 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.0012, + "step": 596 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.3337, + "step": 598 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.2475, + "step": 600 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.3555, + "step": 602 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.4333, + "step": 604 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.2514, + "step": 606 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.3297, + "step": 608 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.0692, + "step": 610 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.0326, + "step": 612 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.1388, + "step": 614 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.424, + "step": 616 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.5701, + "step": 618 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.0739, + "step": 620 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.5175, + "step": 622 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.0642, + "step": 624 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.139, + "step": 626 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.1603, + "step": 628 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.2986, + "step": 630 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.1698, + "step": 632 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.8349, + "step": 634 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.3426, + "step": 636 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.0642, + "step": 638 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.1074, + "step": 640 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.1284, + "step": 642 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.2241, + "step": 644 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.2262, + "step": 646 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.2786, + "step": 648 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.3075, + "step": 650 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.1899, + "step": 652 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.1767, + "step": 654 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.2476, + "step": 656 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.4328, + "step": 658 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.0012, + "step": 660 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.1375, + "step": 662 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.1468, + "step": 664 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.8087, + "step": 666 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.4017, + "step": 668 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.2393, + "step": 670 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.4134, + "step": 672 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.3732, + "step": 674 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.4379, + "step": 676 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.1595, + "step": 678 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.2944, + "step": 680 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.2257, + "step": 682 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.2477, + "step": 684 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.1501, + "step": 686 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.4446, + "step": 688 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.0004, + "step": 690 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.2269, + "step": 692 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.2099, + "step": 694 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.3402, + "step": 696 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.3654, + "step": 698 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.2487, + "step": 700 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.212, + "step": 702 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.0979, + "step": 704 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.0413, + "step": 706 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.3222, + "step": 708 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.2231, + "step": 710 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.2221, + "step": 712 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.0846, + "step": 714 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.0515, + "step": 716 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.1184, + "step": 718 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.4528, + "step": 720 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.2771, + "step": 722 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.1909, + "step": 724 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.0977, + "step": 726 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.2653, + "step": 728 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.1125, + "step": 730 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.0001, + "step": 732 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.0855, + "step": 734 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.0126, + "step": 736 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.0266, + "step": 738 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.4013, + "step": 740 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.7972, + "step": 742 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.1536, + "step": 744 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.0105, + "step": 746 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.3673, + "step": 748 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.0113, + "step": 750 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.9379, + "step": 752 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.3657, + "step": 754 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.0001, + "step": 756 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.1986, + "step": 758 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.0882, + "step": 760 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.0113, + "step": 762 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.5459, + "step": 764 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.2271, + "step": 766 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.3052, + "step": 768 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.3504, + "step": 770 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.6651, + "step": 772 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.1277, + "step": 774 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.4338, + "step": 776 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.2064, + "step": 778 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.1587, + "step": 780 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.001, + "step": 782 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.1858, + "step": 784 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.1895, + "step": 786 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.4305, + "step": 788 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.3815, + "step": 790 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.1603, + "step": 792 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.2476, + "step": 794 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.4591, + "step": 796 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.2479, + "step": 798 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.2048, + "step": 800 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.2994, + "step": 802 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.4695, + "step": 804 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.2209, + "step": 806 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.252, + "step": 808 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.3088, + "step": 810 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.2844, + "step": 812 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.0504, + "step": 814 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.3226, + "step": 816 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.1373, + "step": 818 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.2287, + "step": 820 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.0494, + "step": 822 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.2851, + "step": 824 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.1611, + "step": 826 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.2606, + "step": 828 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.2624, + "step": 830 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.2501, + "step": 832 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.175, + "step": 834 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.2887, + "step": 836 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.0588, + "step": 838 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.2875, + "step": 840 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.3495, + "step": 842 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.059, + "step": 844 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.2796, + "step": 846 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.296, + "step": 848 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.1668, + "step": 850 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.1504, + "step": 852 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.5176, + "step": 854 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.1284, + "step": 856 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.0473, + "step": 858 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.3336, + "step": 860 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.5706, + "step": 862 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.0103, + "step": 864 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.4279, + "step": 866 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.3531, + "step": 868 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.1025, + "step": 870 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.2337, + "step": 872 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.1763, + "step": 874 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.1986, + "step": 876 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.1486, + "step": 878 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.044, + "step": 880 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.1858, + "step": 882 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.2234, + "step": 884 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.2177, + "step": 886 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0387, + "step": 888 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.0463, + "step": 890 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.0867, + "step": 892 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.068, + "step": 894 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0802, + "step": 896 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.248, + "step": 898 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.0043, + "step": 900 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.1692, + "step": 902 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.5508, + "step": 904 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.4798, + "step": 906 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.4269, + "step": 908 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.2057, + "step": 910 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.4704, + "step": 912 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.123, + "step": 914 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.2802, + "step": 916 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.288, + "step": 918 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.0914, + "step": 920 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.15, + "step": 922 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.4179, + "step": 924 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.8721, + "step": 926 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.0436, + "step": 928 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.5673, + "step": 930 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.3196, + "step": 932 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.1367, + "step": 934 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.4896, + "step": 936 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.1374, + "step": 938 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.1938, + "step": 940 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.358, + "step": 942 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.1398, + "step": 944 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.3543, + "step": 946 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.2053, + "step": 948 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.2176, + "step": 950 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.5327, + "step": 952 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.2778, + "step": 954 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.1663, + "step": 956 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.1944, + "step": 958 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.7054, + "step": 960 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.3426, + "step": 962 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.4666, + "step": 964 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.2431, + "step": 966 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.4727, + "step": 968 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.254, + "step": 970 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.3761, + "step": 972 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.2267, + "step": 974 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.0889, + "step": 976 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.1505, + "step": 978 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.3065, + "step": 980 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.2077, + "step": 982 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.1435, + "step": 984 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.1399, + "step": 986 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.4127, + "step": 988 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.1297, + "step": 990 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.018, + "step": 992 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.3102, + "step": 994 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.5714, + "step": 996 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.3497, + "step": 998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.1735, + "step": 1000 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.1322, + "step": 1002 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.6451, + "step": 1004 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.1871, + "step": 1006 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.3626, + "step": 1008 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.2647, + "step": 1010 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.1087, + "step": 1012 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.4519, + "step": 1014 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.1906, + "step": 1016 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.2119, + "step": 1018 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.3139, + "step": 1020 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.0881, + "step": 1022 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.1322, + "step": 1024 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.1264, + "step": 1026 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.1859, + "step": 1028 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.1903, + "step": 1030 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.3949, + "step": 1032 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.2332, + "step": 1034 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.1986, + "step": 1036 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.7903, + "step": 1038 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.1138, + "step": 1040 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.5907, + "step": 1042 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.2652, + "step": 1044 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.1244, + "step": 1046 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.0383, + "step": 1048 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.2471, + "step": 1050 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.2511, + "step": 1052 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.3851, + "step": 1054 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.0323, + "step": 1056 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.3342, + "step": 1058 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.2435, + "step": 1060 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.1679, + "step": 1062 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.1493, + "step": 1064 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.1588, + "step": 1066 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.2018, + "step": 1068 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.3249, + "step": 1070 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.1739, + "step": 1072 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.0123, + "step": 1074 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.6587, + "step": 1076 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.1449, + "step": 1078 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.1528, + "step": 1080 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.0512, + "step": 1082 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.543, + "step": 1084 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.1761, + "step": 1086 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.1229, + "step": 1088 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.424, + "step": 1090 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.2345, + "step": 1092 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.0774, + "step": 1094 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.3901, + "step": 1096 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.7828, + "step": 1098 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.1246, + "step": 1100 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.0069, + "step": 1102 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.3291, + "step": 1104 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.3867, + "step": 1106 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.1728, + "step": 1108 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 2.9063, + "step": 1110 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.2028, + "step": 1112 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.1987, + "step": 1114 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.7816, + "step": 1116 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.216, + "step": 1118 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.2811, + "step": 1120 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.3053, + "step": 1122 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.3653, + "step": 1124 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.4174, + "step": 1126 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.3969, + "step": 1128 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.1293, + "step": 1130 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.3445, + "step": 1132 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.3499, + "step": 1134 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.2625, + "step": 1136 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.1867, + "step": 1138 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.4101, + "step": 1140 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.2058, + "step": 1142 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.0902, + "step": 1144 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.4781, + "step": 1146 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.4411, + "step": 1148 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.3121, + "step": 1150 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.2882, + "step": 1152 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.3984, + "step": 1154 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.23, + "step": 1156 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.1825, + "step": 1158 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.4847, + "step": 1160 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.2523, + "step": 1162 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.1801, + "step": 1164 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.4355, + "step": 1166 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.3461, + "step": 1168 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.2676, + "step": 1170 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.2655, + "step": 1172 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.1778, + "step": 1174 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.044, + "step": 1176 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.3363, + "step": 1178 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.3089, + "step": 1180 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.0828, + "step": 1182 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.436, + "step": 1184 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.8128, + "step": 1186 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.2068, + "step": 1188 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.3791, + "step": 1190 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.1358, + "step": 1192 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.3246, + "step": 1194 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.0612, + "step": 1196 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.3398, + "step": 1198 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.3827, + "step": 1200 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.2419, + "step": 1202 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.289, + "step": 1204 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.2798, + "step": 1206 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.2394, + "step": 1208 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.1968, + "step": 1210 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.1817, + "step": 1212 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.2889, + "step": 1214 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.3723, + "step": 1216 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.2388, + "step": 1218 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.2165, + "step": 1220 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.1101, + "step": 1222 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.338, + "step": 1224 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.1525, + "step": 1226 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.0621, + "step": 1228 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.1348, + "step": 1230 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.4167, + "step": 1232 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.2386, + "step": 1234 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.1762, + "step": 1236 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.2175, + "step": 1238 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.342, + "step": 1240 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.2702, + "step": 1242 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.3579, + "step": 1244 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.191, + "step": 1246 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 1.2573, + "step": 1248 + }, + { + "epoch": 1.0, + "step": 1249, + "total_flos": 8056572981805056.0, + "train_loss": 0.2685424877216762, + "train_runtime": 6889.6253, + "train_samples_per_second": 2.901, + "train_steps_per_second": 0.181 + } + ], + "logging_steps": 2, + "max_steps": 1249, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 8056572981805056.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..95063545b6eb2b24d07d18e70c1a940995495e02 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:757cce386107dcc9815d57ce40216b929e324aaf0305ab8783aae6681d9c5c82 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e01b261692d7d503551d567ad807536e3bd65ebf --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36c09bb6e94729d6254c9933807fe4bb2b9d0a1ee7f2bc27312c99f8b956fabf +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b4f7912409c2bb348f9e6dcbfe3744cb0b461639 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0db9d11db570cdeb15e802d8495340fc6bfb726dd405e8d64e8d0c46e87c4001 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..caf98a4abebc185e4155e1b642f6e5fa54c8e08b --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82bd0f53a3db63337a576d32884e459d5384976233843fbb8abeec5c00816da2 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5ef738ab44d8c9bff077673c275f4a80bd166c41 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/0_trainer_state.json @@ -0,0 +1,7526 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2498, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.415943612351265e-06, + "loss": 0.0226, + "step": 2 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.0291, + "step": 4 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.1859, + "step": 6 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.0363, + "step": 8 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4892856843445236e-06, + "loss": 0.0445, + "step": 10 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.1634, + "step": 12 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.1289, + "step": 14 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.0069, + "step": 16 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5635665363297356e-06, + "loss": 0.2584, + "step": 18 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.8501, + "step": 20 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.2298, + "step": 22 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.2147, + "step": 24 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.6387768837868565e-06, + "loss": 0.0999, + "step": 26 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.0061, + "step": 28 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.077, + "step": 30 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.0008, + "step": 32 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.0372, + "step": 34 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.1547, + "step": 36 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.0652, + "step": 38 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.0455, + "step": 40 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.7919483473136555e-06, + "loss": 0.5831, + "step": 42 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.2058, + "step": 44 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.0709, + "step": 46 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.0498, + "step": 48 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.8698903181597026e-06, + "loss": 0.0774, + "step": 50 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.0885, + "step": 52 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.909196119613218e-06, + "loss": 0.3702, + "step": 54 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.247, + "step": 56 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9487234964233724e-06, + "loss": 0.1199, + "step": 58 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.143, + "step": 60 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.988471213428035e-06, + "loss": 0.0909, + "step": 62 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.0304, + "step": 64 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0284380285797733e-06, + "loss": 0.5234, + "step": 66 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.123, + "step": 68 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.068622692984767e-06, + "loss": 0.0605, + "step": 70 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.1615, + "step": 72 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.0721, + "step": 74 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.3223, + "step": 76 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.0614, + "step": 78 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.0806, + "step": 80 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1904711909051967e-06, + "loss": 0.0429, + "step": 82 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.0102, + "step": 84 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.231514627826302e-06, + "loss": 0.3327, + "step": 86 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.008, + "step": 88 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2727695682081897e-06, + "loss": 0.014, + "step": 90 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.0504, + "step": 92 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.314234722905302e-06, + "loss": 0.0178, + "step": 94 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.1608, + "step": 96 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.355908796203301e-06, + "loss": 0.1094, + "step": 98 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.0211, + "step": 100 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.1315, + "step": 102 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.1589, + "step": 104 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4398784831434097e-06, + "loss": 0.0286, + "step": 106 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.7295, + "step": 108 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.2056, + "step": 110 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.0407, + "step": 112 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.2238, + "step": 114 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.3645, + "step": 116 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.567367137003953e-06, + "loss": 0.0454, + "step": 118 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.0877, + "step": 120 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.6102671491780393e-06, + "loss": 0.0479, + "step": 122 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.0257, + "step": 124 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.653366829451711e-06, + "loss": 0.0082, + "step": 126 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.1184, + "step": 128 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.696664831034521e-06, + "loss": 0.1467, + "step": 130 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.36, + "step": 132 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.740159800938784e-06, + "loss": 0.8282, + "step": 134 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.0102, + "step": 136 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.783850380021933e-06, + "loss": 0.4521, + "step": 138 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.0595, + "step": 140 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.827735203028956e-06, + "loss": 0.1578, + "step": 142 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.0039, + "step": 144 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.871812898635011e-06, + "loss": 0.0847, + "step": 146 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.0571, + "step": 148 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.916082089488379e-06, + "loss": 0.0481, + "step": 150 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.1287, + "step": 152 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.960541392253387e-06, + "loss": 0.0173, + "step": 154 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.1392, + "step": 156 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 4.005189417653737e-06, + "loss": 0.3516, + "step": 158 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.0643, + "step": 160 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.050024770515873e-06, + "loss": 0.0008, + "step": 162 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.0299, + "step": 164 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.095046049812541e-06, + "loss": 0.0977, + "step": 166 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.2039, + "step": 168 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.1402518487066624e-06, + "loss": 0.6473, + "step": 170 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.1447, + "step": 172 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.0007, + "step": 174 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.1387, + "step": 176 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.2312113491533145e-06, + "loss": 0.0122, + "step": 178 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.2456, + "step": 180 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.276962208378814e-06, + "loss": 0.1934, + "step": 182 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.0068, + "step": 184 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3228919026364345e-06, + "loss": 0.0389, + "step": 186 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.2351, + "step": 188 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.368998996702686e-06, + "loss": 0.0314, + "step": 190 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.0015, + "step": 192 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.415282049810643e-06, + "loss": 0.0997, + "step": 194 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.6164, + "step": 196 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.461739615694921e-06, + "loss": 0.2563, + "step": 198 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.1629, + "step": 200 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.5083702426369715e-06, + "loss": 0.132, + "step": 202 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.0069, + "step": 204 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.555172473510324e-06, + "loss": 0.6589, + "step": 206 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.0235, + "step": 208 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.602144845826234e-06, + "loss": 0.0233, + "step": 210 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.0186, + "step": 212 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.649285891779326e-06, + "loss": 0.0443, + "step": 214 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.0218, + "step": 216 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.696594138293421e-06, + "loss": 0.0382, + "step": 218 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.2584, + "step": 220 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.744068107067673e-06, + "loss": 0.6449, + "step": 222 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.0809, + "step": 224 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.79170631462264e-06, + "loss": 0.0974, + "step": 226 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.2181, + "step": 228 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.839507272346751e-06, + "loss": 0.0489, + "step": 230 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.1512, + "step": 232 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.2301, + "step": 234 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.0158, + "step": 236 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.935591458474425e-06, + "loss": 0.1014, + "step": 238 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.127, + "step": 240 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9838716844133665e-06, + "loss": 0.1418, + "step": 242 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.3214, + "step": 244 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.032308655686007e-06, + "loss": 0.1381, + "step": 246 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.2522, + "step": 248 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.080900858720789e-06, + "loss": 0.1938, + "step": 250 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 1.0167, + "step": 252 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.129646775095432e-06, + "loss": 0.0005, + "step": 254 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 1.0353, + "step": 256 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.178544881584328e-06, + "loss": 0.1064, + "step": 258 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.2778, + "step": 260 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.06, + "step": 262 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.1582, + "step": 264 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.2767915482720164e-06, + "loss": 0.0583, + "step": 266 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.0254, + "step": 268 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.0185, + "step": 270 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.0272, + "step": 272 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.375628578726181e-06, + "loss": 0.1763, + "step": 274 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.5226, + "step": 276 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.425264622628326e-06, + "loss": 0.2944, + "step": 278 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.1261, + "step": 280 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.475043619098321e-06, + "loss": 0.1872, + "step": 282 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.4713, + "step": 284 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.524964012628644e-06, + "loss": 0.337, + "step": 286 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.403, + "step": 288 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.57502424329331e-06, + "loss": 0.2097, + "step": 290 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.0403, + "step": 292 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.62522274679673e-06, + "loss": 0.5853, + "step": 294 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.014, + "step": 296 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.675557954522462e-06, + "loss": 0.0281, + "step": 298 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.2453, + "step": 300 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.726028293582342e-06, + "loss": 0.2841, + "step": 302 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.0618, + "step": 304 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.776632186865589e-06, + "loss": 0.0709, + "step": 306 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.1957, + "step": 308 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.827368053088032e-06, + "loss": 0.0276, + "step": 310 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.1094, + "step": 312 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.878234306841637e-06, + "loss": 0.0794, + "step": 314 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.2306, + "step": 316 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.929229358643925e-06, + "loss": 0.3337, + "step": 318 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.59, + "step": 320 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9803516149877475e-06, + "loss": 0.0474, + "step": 322 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.0087, + "step": 324 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.03159947839103e-06, + "loss": 0.1776, + "step": 326 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.0663, + "step": 328 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.082971347446654e-06, + "loss": 0.2639, + "step": 330 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.0431, + "step": 332 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.13446561687258e-06, + "loss": 0.0004, + "step": 334 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.461, + "step": 336 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.186080677561974e-06, + "loss": 0.0539, + "step": 338 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.0529, + "step": 340 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.237814916633431e-06, + "loss": 0.0299, + "step": 342 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.016, + "step": 344 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.289666717481496e-06, + "loss": 0.1544, + "step": 346 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.2151, + "step": 348 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.341634459827044e-06, + "loss": 0.1595, + "step": 350 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.0564, + "step": 352 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.393716519768032e-06, + "loss": 0.106, + "step": 354 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.0776, + "step": 356 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.445911269830183e-06, + "loss": 0.0288, + "step": 358 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.4517, + "step": 360 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.498217079017806e-06, + "loss": 0.1365, + "step": 362 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.0463, + "step": 364 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.4079, + "step": 366 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.5232, + "step": 368 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.603155333485934e-06, + "loss": 0.0355, + "step": 370 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.1916, + "step": 372 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.655784499627476e-06, + "loss": 0.1699, + "step": 374 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.0235, + "step": 376 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.7085181667191e-06, + "loss": 0.1748, + "step": 378 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.1084, + "step": 380 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.761354686924883e-06, + "loss": 0.1521, + "step": 382 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.2527, + "step": 384 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.8142924091949955e-06, + "loss": 0.0976, + "step": 386 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.2646, + "step": 388 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.867329679317144e-06, + "loss": 0.1899, + "step": 390 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.0227, + "step": 392 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.920464839968391e-06, + "loss": 0.0312, + "step": 394 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.1219, + "step": 396 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.973696230766884e-06, + "loss": 0.1102, + "step": 398 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.2329, + "step": 400 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.027022188323704e-06, + "loss": 0.6628, + "step": 402 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.1585, + "step": 404 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.080441046294945e-06, + "loss": 0.0565, + "step": 406 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.049, + "step": 408 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.133951135433656e-06, + "loss": 0.2073, + "step": 410 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.245, + "step": 412 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.18755078364214e-06, + "loss": 0.332, + "step": 414 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.8122, + "step": 416 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.241238316024064e-06, + "loss": 0.0817, + "step": 418 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.2892, + "step": 420 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.4415, + "step": 422 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.163, + "step": 424 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.348870320044395e-06, + "loss": 0.3424, + "step": 426 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.0964, + "step": 428 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.402811428368824e-06, + "loss": 0.2665, + "step": 430 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.0912, + "step": 432 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.2371, + "step": 434 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.3368, + "step": 436 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.510935429867233e-06, + "loss": 0.0825, + "step": 438 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.0706, + "step": 440 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.5651149443531846e-06, + "loss": 0.7486, + "step": 442 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.0478, + "step": 444 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.619370544785608e-06, + "loss": 0.215, + "step": 446 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.3429, + "step": 448 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.67370053577085e-06, + "loss": 0.5098, + "step": 450 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.2349, + "step": 452 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.728103219590684e-06, + "loss": 0.0307, + "step": 454 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.384, + "step": 456 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.7825768962553e-06, + "loss": 0.1837, + "step": 458 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.2904, + "step": 460 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.83711986355656e-06, + "loss": 0.0989, + "step": 462 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.2105, + "step": 464 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.891730417121043e-06, + "loss": 0.0245, + "step": 466 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.741, + "step": 468 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.946406850463435e-06, + "loss": 0.5618, + "step": 470 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.2254, + "step": 472 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 8.001147455039737e-06, + "loss": 0.0756, + "step": 474 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.1649, + "step": 476 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.055950520300756e-06, + "loss": 0.0093, + "step": 478 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.0911, + "step": 480 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.110814333745503e-06, + "loss": 0.031, + "step": 482 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.1181, + "step": 484 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.165737180974676e-06, + "loss": 0.0812, + "step": 486 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.091, + "step": 488 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.220717345744326e-06, + "loss": 0.2897, + "step": 490 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.1432, + "step": 492 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.275753110019367e-06, + "loss": 0.2504, + "step": 494 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.1457, + "step": 496 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.330842754027378e-06, + "loss": 0.233, + "step": 498 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.3126, + "step": 500 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.385984556312285e-06, + "loss": 0.2782, + "step": 502 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.1857, + "step": 504 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.441176793788106e-06, + "loss": 0.0237, + "step": 506 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.1021, + "step": 508 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.496417741792922e-06, + "loss": 0.1118, + "step": 510 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.3842, + "step": 512 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.551705674142616e-06, + "loss": 0.2092, + "step": 514 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.1207, + "step": 516 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.607038863184952e-06, + "loss": 0.2501, + "step": 518 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.2311, + "step": 520 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.662415579853495e-06, + "loss": 1.2167, + "step": 522 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.0135, + "step": 524 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.717834093721598e-06, + "loss": 0.4346, + "step": 526 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.2709, + "step": 528 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.773292673056572e-06, + "loss": 0.373, + "step": 530 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.0564, + "step": 532 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.828789584873757e-06, + "loss": 0.1507, + "step": 534 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.0683, + "step": 536 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.884323094990613e-06, + "loss": 0.4396, + "step": 538 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.0857, + "step": 540 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.939891468081036e-06, + "loss": 0.2528, + "step": 542 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.099, + "step": 544 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.995492967729449e-06, + "loss": 0.2178, + "step": 546 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.235, + "step": 548 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.051125856485175e-06, + "loss": 0.2419, + "step": 550 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.2024, + "step": 552 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.106788395916682e-06, + "loss": 0.0784, + "step": 554 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.3213, + "step": 556 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.162478846665854e-06, + "loss": 0.4772, + "step": 558 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.2333, + "step": 560 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.218195468502469e-06, + "loss": 0.511, + "step": 562 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.0982, + "step": 564 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.273936520378426e-06, + "loss": 0.0407, + "step": 566 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.1601, + "step": 568 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.329700260482286e-06, + "loss": 0.192, + "step": 570 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.0876, + "step": 572 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.38548494629364e-06, + "loss": 0.2764, + "step": 574 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.0539, + "step": 576 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.441288834637507e-06, + "loss": 0.0457, + "step": 578 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.1517, + "step": 580 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.497110181738935e-06, + "loss": 0.2357, + "step": 582 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.1095, + "step": 584 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.552947243277342e-06, + "loss": 0.201, + "step": 586 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.4036, + "step": 588 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.608798274441153e-06, + "loss": 0.0651, + "step": 590 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.0689, + "step": 592 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.664661529982263e-06, + "loss": 0.0084, + "step": 594 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.0955, + "step": 596 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.720535264270526e-06, + "loss": 0.3214, + "step": 598 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.4949, + "step": 600 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.776417731348403e-06, + "loss": 0.3055, + "step": 602 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.1078, + "step": 604 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.832307184985473e-06, + "loss": 0.106, + "step": 606 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.0494, + "step": 608 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.888201878732946e-06, + "loss": 0.1881, + "step": 610 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.0654, + "step": 612 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.944100065978354e-06, + "loss": 0.2779, + "step": 614 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.0196, + "step": 616 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.999999999999996e-06, + "loss": 0.1879, + "step": 618 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.2126, + "step": 620 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.6166, + "step": 622 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.5064, + "step": 624 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0111798121267047e-05, + "loss": 0.3389, + "step": 626 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.1661, + "step": 628 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.016769281501452e-05, + "loss": 0.1403, + "step": 630 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.3316, + "step": 632 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.022358226865159e-05, + "loss": 0.1353, + "step": 634 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.2763, + "step": 636 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.2401, + "step": 638 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.1673, + "step": 640 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.033533847001773e-05, + "loss": 0.1134, + "step": 642 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.0667, + "step": 644 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.039120172555884e-05, + "loss": 0.1316, + "step": 646 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.1531, + "step": 648 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0447052756722651e-05, + "loss": 0.7528, + "step": 650 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.0556, + "step": 652 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.3212, + "step": 654 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.6016, + "step": 656 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.0558711165362488e-05, + "loss": 0.0168, + "step": 658 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.0205, + "step": 660 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.0486, + "step": 662 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.1587, + "step": 664 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.0670299739517706e-05, + "loss": 0.145, + "step": 666 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.7095, + "step": 668 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.2606, + "step": 670 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.2429, + "step": 672 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0781804531497525e-05, + "loss": 0.1221, + "step": 674 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.1832, + "step": 676 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.083752115333414e-05, + "loss": 0.8059, + "step": 678 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.12, + "step": 680 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.0893211604083311e-05, + "loss": 0.0301, + "step": 682 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.1914, + "step": 684 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.0902, + "step": 686 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.4047, + "step": 688 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.1004507032270544e-05, + "loss": 0.0176, + "step": 690 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.3045, + "step": 692 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.1351, + "step": 694 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.3471, + "step": 696 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.111567690500938e-05, + "loss": 0.1783, + "step": 698 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.2826, + "step": 700 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.0893, + "step": 702 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.1358, + "step": 704 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.122670732694342e-05, + "loss": 0.1606, + "step": 706 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.0972, + "step": 708 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.2603, + "step": 710 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.3278, + "step": 712 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1337584420146496e-05, + "loss": 0.1851, + "step": 714 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.3659, + "step": 716 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.228, + "step": 718 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.0096, + "step": 720 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1448294325857377e-05, + "loss": 0.0655, + "step": 722 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.0874, + "step": 724 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.150358225820707e-05, + "loss": 0.2292, + "step": 726 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.1038, + "step": 728 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1558823206211887e-05, + "loss": 0.2691, + "step": 730 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.2696, + "step": 732 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.1105, + "step": 734 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.1385, + "step": 736 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1669157245972616e-05, + "loss": 0.4054, + "step": 738 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.0733, + "step": 740 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.0268, + "step": 742 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.6019, + "step": 744 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.1779282654255668e-05, + "loss": 0.2991, + "step": 746 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.2375, + "step": 748 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.1475, + "step": 750 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.3506, + "step": 752 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.188918566625449e-05, + "loss": 0.977, + "step": 754 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.1954, + "step": 756 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.659, + "step": 758 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.2362, + "step": 760 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1998852544960256e-05, + "loss": 0.1278, + "step": 762 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.0822, + "step": 764 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.2293, + "step": 766 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.1954, + "step": 768 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.210826958287895e-05, + "loss": 0.2282, + "step": 770 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.0539, + "step": 772 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.3826, + "step": 774 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.3805, + "step": 776 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.2217423103744692e-05, + "loss": 0.0147, + "step": 778 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.2874, + "step": 780 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.1617, + "step": 782 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.128, + "step": 784 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2326299464229143e-05, + "loss": 0.127, + "step": 786 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.1367, + "step": 788 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.3964, + "step": 790 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.1828, + "step": 792 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2434885055646808e-05, + "loss": 0.6291, + "step": 794 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.1637, + "step": 796 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.2669, + "step": 798 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.0166, + "step": 800 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2543166305656089e-05, + "loss": 0.2783, + "step": 802 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.0185, + "step": 804 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.259718857163117e-05, + "loss": 0.1306, + "step": 806 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.3331, + "step": 808 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2651129679955598e-05, + "loss": 0.3639, + "step": 810 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.2169, + "step": 812 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.3378, + "step": 814 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.2601, + "step": 816 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2758761683975929e-05, + "loss": 0.0328, + "step": 818 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.1559, + "step": 820 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.0124, + "step": 822 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.1785, + "step": 824 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2866048864566336e-05, + "loss": 0.0993, + "step": 826 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.1131, + "step": 828 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.3409, + "step": 830 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.3488, + "step": 832 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2972977811676289e-05, + "loss": 0.6295, + "step": 834 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.0462, + "step": 836 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.2266, + "step": 838 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.1447, + "step": 840 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3079535160031601e-05, + "loss": 0.0222, + "step": 842 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.2396, + "step": 844 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.313267032068285e-05, + "loss": 0.2275, + "step": 846 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.0583, + "step": 848 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3185707590804997e-05, + "loss": 0.2187, + "step": 850 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.6811, + "step": 852 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.1948, + "step": 854 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.245, + "step": 856 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3291481833280894e-05, + "loss": 0.239, + "step": 858 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.0178, + "step": 860 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.243, + "step": 862 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.2263, + "step": 864 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3396844666514062e-05, + "loss": 0.2272, + "step": 866 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.0791, + "step": 868 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.344936768713513e-05, + "loss": 0.1708, + "step": 870 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.0574, + "step": 872 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.3501782920982189e-05, + "loss": 0.5931, + "step": 874 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.1019, + "step": 876 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.4335, + "step": 878 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.1171, + "step": 880 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3606283480231962e-05, + "loss": 0.3145, + "step": 882 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.0384, + "step": 884 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.5349, + "step": 886 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.2957, + "step": 888 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3710333282518497e-05, + "loss": 0.2578, + "step": 890 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.2949, + "step": 892 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.2862, + "step": 894 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.1721, + "step": 896 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3813919322438018e-05, + "loss": 0.2932, + "step": 898 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.1905, + "step": 900 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.5687, + "step": 902 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.1368, + "step": 904 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.391702865255334e-05, + "loss": 0.2241, + "step": 906 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.045, + "step": 908 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.0139, + "step": 910 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.0354, + "step": 912 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4019648385012245e-05, + "loss": 0.1449, + "step": 914 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.409, + "step": 916 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.26, + "step": 918 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.2872, + "step": 920 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4121765693158355e-05, + "loss": 0.2284, + "step": 922 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.4298, + "step": 924 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.2075, + "step": 926 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.204, + "step": 928 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4223367813134406e-05, + "loss": 0.0654, + "step": 930 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.7338, + "step": 932 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.1829, + "step": 934 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.2395, + "step": 936 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.4324442045477534e-05, + "loss": 0.2976, + "step": 938 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.0508, + "step": 940 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.1513, + "step": 942 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.0523, + "step": 944 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4424975756706684e-05, + "loss": 0.0725, + "step": 946 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.2612, + "step": 948 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.3659, + "step": 950 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.006, + "step": 952 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4524956380901674e-05, + "loss": 0.1447, + "step": 954 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.1132, + "step": 956 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.014, + "step": 958 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.129, + "step": 960 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4624371421273812e-05, + "loss": 0.1057, + "step": 962 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.0129, + "step": 964 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.0078, + "step": 966 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.2277, + "step": 968 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4723208451727977e-05, + "loss": 0.2779, + "step": 970 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.0426, + "step": 972 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.0443, + "step": 974 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.0909, + "step": 976 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4821455118415666e-05, + "loss": 0.0431, + "step": 978 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.0885, + "step": 980 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.1609, + "step": 982 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.7022, + "step": 984 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4919099141279205e-05, + "loss": 0.1839, + "step": 986 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.335, + "step": 988 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.0963, + "step": 990 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.191, + "step": 992 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5016128315586626e-05, + "loss": 0.1439, + "step": 994 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.0008, + "step": 996 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.5921, + "step": 998 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.0945, + "step": 1000 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.5112530513457229e-05, + "loss": 0.4335, + "step": 1002 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.4796, + "step": 1004 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.428, + "step": 1006 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.0034, + "step": 1008 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5208293685377354e-05, + "loss": 0.1827, + "step": 1010 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.1666, + "step": 1012 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.2419, + "step": 1014 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.001, + "step": 1016 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.5303405861706574e-05, + "loss": 0.4105, + "step": 1018 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.2481, + "step": 1020 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.0914, + "step": 1022 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.2246, + "step": 1024 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.539785515417376e-05, + "loss": 0.0425, + "step": 1026 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.3758, + "step": 1028 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.0473, + "step": 1030 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.1599, + "step": 1032 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5491629757363026e-05, + "loss": 0.0047, + "step": 1034 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.6549, + "step": 1036 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.0192, + "step": 1038 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.1766, + "step": 1040 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.5584717950189353e-05, + "loss": 0.0445, + "step": 1042 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.102, + "step": 1044 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.563100100329731e-05, + "loss": 0.3522, + "step": 1046 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.007, + "step": 1048 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.567710809736356e-05, + "loss": 0.0164, + "step": 1050 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.0002, + "step": 1052 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.572303779162118e-05, + "loss": 1.2911, + "step": 1054 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.3321, + "step": 1056 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5768788650846677e-05, + "loss": 0.2918, + "step": 1058 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.5569, + "step": 1060 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.581435924540481e-05, + "loss": 0.0934, + "step": 1062 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.0792, + "step": 1064 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5859748151293333e-05, + "loss": 0.0252, + "step": 1066 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.4118, + "step": 1068 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.0859, + "step": 1070 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.4272, + "step": 1072 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.594997522948412e-05, + "loss": 0.1109, + "step": 1074 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.1082, + "step": 1076 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.599481058234626e-05, + "loss": 0.3222, + "step": 1078 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.0858, + "step": 1080 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.6039458607746607e-05, + "loss": 0.2849, + "step": 1082 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.3763, + "step": 1084 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.1571, + "step": 1086 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.1033, + "step": 1088 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.6128187101364982e-05, + "loss": 0.2356, + "step": 1090 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.089, + "step": 1092 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.617226479697104e-05, + "loss": 0.1959, + "step": 1094 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.021, + "step": 1096 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.621614961997806e-05, + "loss": 0.1743, + "step": 1098 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.2954, + "step": 1100 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.2335, + "step": 1102 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.2436, + "step": 1104 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6303335168965474e-05, + "loss": 0.0711, + "step": 1106 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.2716, + "step": 1108 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.2081, + "step": 1110 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.6768, + "step": 1112 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6389732850821957e-05, + "loss": 0.2903, + "step": 1114 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.1891, + "step": 1116 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.1891, + "step": 1118 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.142, + "step": 1120 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.6475331866519377e-05, + "loss": 0.1016, + "step": 1122 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.0638, + "step": 1124 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.107, + "step": 1126 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.1569, + "step": 1128 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6560121516856586e-05, + "loss": 0.2443, + "step": 1130 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.0489, + "step": 1132 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.088, + "step": 1134 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.2475, + "step": 1136 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6644091203796694e-05, + "loss": 0.1215, + "step": 1138 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.0376, + "step": 1140 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.4575, + "step": 1142 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.12, + "step": 1144 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6727230431791806e-05, + "loss": 0.1484, + "step": 1146 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.1277, + "step": 1148 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.3824, + "step": 1150 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.6403, + "step": 1152 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6809528809094798e-05, + "loss": 0.3211, + "step": 1154 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.9452, + "step": 1156 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.0254, + "step": 1158 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.2212, + "step": 1160 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.689097604905826e-05, + "loss": 0.3745, + "step": 1162 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.2637, + "step": 1164 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.3764, + "step": 1166 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.3774, + "step": 1168 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6971561971420222e-05, + "loss": 0.1398, + "step": 1170 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.0927, + "step": 1172 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.701152878657196e-05, + "loss": 0.2306, + "step": 1174 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.4956, + "step": 1176 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.705127650357662e-05, + "loss": 0.1058, + "step": 1178 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.1209, + "step": 1180 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.0265, + "step": 1182 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.0471, + "step": 1184 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.713010968184029e-05, + "loss": 0.8562, + "step": 1186 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.7528, + "step": 1188 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.716919267969883e-05, + "loss": 0.615, + "step": 1190 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.186, + "step": 1192 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7208051652686338e-05, + "loss": 0.0445, + "step": 1194 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.1748, + "step": 1196 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.1152, + "step": 1198 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.6113, + "step": 1200 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.7285092673983753e-05, + "loss": 0.596, + "step": 1202 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.1785, + "step": 1204 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.2789, + "step": 1206 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.1685, + "step": 1208 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.736122311621314e-05, + "loss": 0.9623, + "step": 1210 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.21, + "step": 1212 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.0975, + "step": 1214 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.0701, + "step": 1216 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.743643346367026e-05, + "loss": 0.1197, + "step": 1218 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.4166, + "step": 1220 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.0367, + "step": 1222 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.4163, + "step": 1224 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7510714315655474e-05, + "loss": 0.1617, + "step": 1226 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.3654, + "step": 1228 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.3098, + "step": 1230 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.4164, + "step": 1232 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.758405638764873e-05, + "loss": 0.2076, + "step": 1234 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.0981, + "step": 1236 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.1101, + "step": 1238 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.1748, + "step": 1240 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7656450512470077e-05, + "loss": 0.1094, + "step": 1242 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.1017, + "step": 1244 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.7692289262315e-05, + "loss": 0.601, + "step": 1246 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.2196, + "step": 1248 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.7727887641425448e-05, + "loss": 0.12, + "step": 1250 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.4591, + "step": 1252 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.1194, + "step": 1254 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.1038, + "step": 1256 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7798358845437754e-05, + "loss": 0.1654, + "step": 1258 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.2997, + "step": 1260 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.4124, + "step": 1262 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.1765, + "step": 1264 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.786785531616285e-05, + "loss": 0.2714, + "step": 1266 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.2869, + "step": 1268 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.790223530721933e-05, + "loss": 0.1046, + "step": 1270 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.3665, + "step": 1272 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7936368367090577e-05, + "loss": 0.0687, + "step": 1274 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.0635, + "step": 1276 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.7197, + "step": 1278 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.4308, + "step": 1280 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.800388943463047e-05, + "loss": 0.5856, + "step": 1282 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.4038, + "step": 1284 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.803727533238257e-05, + "loss": 0.132, + "step": 1286 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.2195, + "step": 1288 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8070410079182195e-05, + "loss": 0.0802, + "step": 1290 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.0871, + "step": 1292 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.0379, + "step": 1294 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.3399, + "step": 1296 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.813592198619035e-05, + "loss": 0.0499, + "step": 1298 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.0429, + "step": 1300 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.816829709926509e-05, + "loss": 0.4723, + "step": 1302 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.3526, + "step": 1304 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.8200416967183785e-05, + "loss": 0.3145, + "step": 1306 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.6276, + "step": 1308 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.0866, + "step": 1310 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.3023, + "step": 1312 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.8263886960799055e-05, + "loss": 0.0715, + "step": 1314 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.0608, + "step": 1316 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.829523510316813e-05, + "loss": 0.4829, + "step": 1318 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.0659, + "step": 1320 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.832632403378808e-05, + "loss": 0.0217, + "step": 1322 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.0536, + "step": 1324 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.189, + "step": 1326 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.409, + "step": 1328 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8387720382009665e-05, + "loss": 0.1041, + "step": 1330 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.6866, + "step": 1332 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.1732, + "step": 1334 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.1487, + "step": 1336 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.844806833140501e-05, + "loss": 0.1756, + "step": 1338 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.5604, + "step": 1340 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.0096, + "step": 1342 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.0382, + "step": 1344 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.85073603389569e-05, + "loss": 0.2741, + "step": 1346 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.3163, + "step": 1348 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.3021, + "step": 1350 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.4251, + "step": 1352 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.856558899363248e-05, + "loss": 0.0997, + "step": 1354 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.1004, + "step": 1356 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.85943022840117e-05, + "loss": 0.0655, + "step": 1358 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.0623, + "step": 1360 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.862274701730967e-05, + "loss": 0.1219, + "step": 1362 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.0735, + "step": 1364 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.865092230467769e-05, + "loss": 0.2473, + "step": 1366 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.1604, + "step": 1368 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.1081, + "step": 1370 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.1951, + "step": 1372 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.87064610283551e-05, + "loss": 0.2252, + "step": 1374 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.2134, + "step": 1376 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.873382272917545e-05, + "loss": 0.0877, + "step": 1378 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.0463, + "step": 1380 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.876091151314196e-05, + "loss": 0.3963, + "step": 1382 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.0617, + "step": 1384 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.8787726533776996e-05, + "loss": 0.2893, + "step": 1386 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.2076, + "step": 1388 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.1323, + "step": 1390 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.2729, + "step": 1392 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.8840531941941415e-05, + "loss": 0.03, + "step": 1394 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.0054, + "step": 1396 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.0177, + "step": 1398 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.0062, + "step": 1400 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.889223235340958e-05, + "loss": 0.2164, + "step": 1402 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.3191, + "step": 1404 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.9136, + "step": 1406 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.155, + "step": 1408 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.894282130603823e-05, + "loss": 0.4085, + "step": 1410 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.2183, + "step": 1412 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.896769700383315e-05, + "loss": 0.507, + "step": 1414 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.0674, + "step": 1416 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.899229247660769e-05, + "loss": 0.9905, + "step": 1418 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.2266, + "step": 1420 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.901660695579585e-05, + "loss": 0.3194, + "step": 1422 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.0865, + "step": 1424 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9040639681612212e-05, + "loss": 0.3588, + "step": 1426 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.2454, + "step": 1428 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.1313, + "step": 1430 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.448, + "step": 1432 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9087856878032886e-05, + "loss": 0.1761, + "step": 1434 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.2844, + "step": 1436 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.911103987318148e-05, + "loss": 0.0798, + "step": 1438 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.2795, + "step": 1440 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.913393816409294e-05, + "loss": 0.8892, + "step": 1442 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.1686, + "step": 1444 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.2361, + "step": 1446 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.1208, + "step": 1448 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9178877779995423e-05, + "loss": 0.3075, + "step": 1450 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.0164, + "step": 1452 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.4135, + "step": 1454 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.1756, + "step": 1456 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.9222670108643146e-05, + "loss": 0.2858, + "step": 1458 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.1881, + "step": 1460 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.924413432409622e-05, + "loss": 0.162, + "step": 1462 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.0571, + "step": 1464 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.926530967634078e-05, + "loss": 0.1216, + "step": 1466 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.1461, + "step": 1468 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.1541, + "step": 1470 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.289, + "step": 1472 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9306791153479004e-05, + "loss": 0.0999, + "step": 1474 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.2162, + "step": 1476 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.932709598214825e-05, + "loss": 0.2189, + "step": 1478 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.1039, + "step": 1480 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.9347109355200672e-05, + "loss": 0.6263, + "step": 1482 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.334, + "step": 1484 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.1187, + "step": 1486 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.4885, + "step": 1488 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9386259242048883e-05, + "loss": 0.314, + "step": 1490 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.1925, + "step": 1492 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.940539453247842e-05, + "loss": 0.2312, + "step": 1494 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.2604, + "step": 1496 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9424235920596863e-05, + "loss": 0.0802, + "step": 1498 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.2428, + "step": 1500 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.944278281764342e-05, + "loss": 0.017, + "step": 1502 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.1088, + "step": 1504 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.1602, + "step": 1506 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.1739, + "step": 1508 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.1652, + "step": 1510 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.0309, + "step": 1512 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9496650812887286e-05, + "loss": 0.2052, + "step": 1514 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.1477, + "step": 1516 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.951401404235505e-05, + "loss": 0.1293, + "step": 1518 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.1746, + "step": 1520 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.9531079975339912e-05, + "loss": 0.0316, + "step": 1522 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.0734, + "step": 1524 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.4149, + "step": 1526 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.025, + "step": 1528 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.956431782804402e-05, + "loss": 0.213, + "step": 1530 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.4586, + "step": 1532 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.0353, + "step": 1534 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.1638, + "step": 1536 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9596360216530436e-05, + "loss": 0.0185, + "step": 1538 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.0889, + "step": 1540 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.961193185426459e-05, + "loss": 1.1072, + "step": 1542 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.0079, + "step": 1544 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.9627203135753573e-05, + "loss": 0.3263, + "step": 1546 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.2854, + "step": 1548 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.0435, + "step": 1550 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.1053, + "step": 1552 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.1358, + "step": 1554 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.1588, + "step": 1556 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.967121011775546e-05, + "loss": 0.1046, + "step": 1558 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.2978, + "step": 1560 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9685275296330497e-05, + "loss": 0.1096, + "step": 1562 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.0477, + "step": 1564 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969903782680467e-05, + "loss": 0.521, + "step": 1566 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.0368, + "step": 1568 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.1041, + "step": 1570 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.3526, + "step": 1572 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.0235, + "step": 1574 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.479, + "step": 1576 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9738505276435692e-05, + "loss": 0.0526, + "step": 1578 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.2944, + "step": 1580 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.2095, + "step": 1582 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.0163, + "step": 1584 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9763296037475174e-05, + "loss": 0.049, + "step": 1586 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.0415, + "step": 1588 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.1525, + "step": 1590 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.3124, + "step": 1592 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9786866463591732e-05, + "loss": 0.0404, + "step": 1594 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.0273, + "step": 1596 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.0324, + "step": 1598 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.0786, + "step": 1600 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.9809213608668185e-05, + "loss": 1.149, + "step": 1602 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.5488, + "step": 1604 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.0822, + "step": 1606 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.0372, + "step": 1608 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.983033467948784e-05, + "loss": 0.3773, + "step": 1610 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.2427, + "step": 1612 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.4978, + "step": 1614 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.4515, + "step": 1616 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.4324, + "step": 1618 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.0515, + "step": 1620 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.985971166354357e-05, + "loss": 0.0983, + "step": 1622 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.2475, + "step": 1624 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986888819206792e-05, + "loss": 0.4888, + "step": 1626 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.1747, + "step": 1628 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.7543, + "step": 1630 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.1391, + "step": 1632 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.988631581494365e-05, + "loss": 0.1323, + "step": 1634 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.3322, + "step": 1636 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.1865, + "step": 1638 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.2062, + "step": 1640 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.990250772639552e-05, + "loss": 0.2262, + "step": 1642 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.2199, + "step": 1644 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.99101396518405e-05, + "loss": 0.1645, + "step": 1646 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.41, + "step": 1648 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.4565, + "step": 1650 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.5428, + "step": 1652 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.3886, + "step": 1654 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.0523, + "step": 1656 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.1448, + "step": 1658 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.1001, + "step": 1660 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.1113, + "step": 1662 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.2833, + "step": 1664 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9943649727366335e-05, + "loss": 0.373, + "step": 1666 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.4961, + "step": 1668 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.994942036613787e-05, + "loss": 0.1421, + "step": 1670 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.4958, + "step": 1672 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.995488010273198e-05, + "loss": 0.0513, + "step": 1674 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.453, + "step": 1676 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.121, + "step": 1678 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.41, + "step": 1680 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9964866196679105e-05, + "loss": 0.4555, + "step": 1682 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.0009, + "step": 1684 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.1632, + "step": 1686 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.3076, + "step": 1688 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.2685, + "step": 1690 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0119, + "step": 1692 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.1099, + "step": 1694 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.5376, + "step": 1696 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.2513, + "step": 1698 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.1207, + "step": 1700 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.998437989229673e-05, + "loss": 0.0021, + "step": 1702 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.0574, + "step": 1704 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.998734708672375e-05, + "loss": 0.0211, + "step": 1706 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0251, + "step": 1708 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.0357, + "step": 1710 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.0536, + "step": 1712 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.9992345130644747e-05, + "loss": 0.5929, + "step": 1714 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.0381, + "step": 1716 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.0156, + "step": 1718 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.0113, + "step": 1720 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999609421031453e-05, + "loss": 0.0128, + "step": 1722 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.0111, + "step": 1724 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.9997500236016233e-05, + "loss": 1.1377, + "step": 1726 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.4164, + "step": 1728 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.452, + "step": 1730 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.4905, + "step": 1732 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.133, + "step": 1734 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.1203, + "step": 1736 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.2363, + "step": 1738 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.3615, + "step": 1740 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 2e-05, + "loss": 0.2648, + "step": 1742 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.1095, + "step": 1744 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.1207, + "step": 1746 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.046, + "step": 1748 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.4335, + "step": 1750 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.221, + "step": 1752 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.1279, + "step": 1754 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.2982, + "step": 1756 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.2308, + "step": 1758 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.1553, + "step": 1760 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.999609421031453e-05, + "loss": 0.2084, + "step": 1762 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.2654, + "step": 1764 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.772, + "step": 1766 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.018, + "step": 1768 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999234513064475e-05, + "loss": 0.2221, + "step": 1770 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.3132, + "step": 1772 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.4091, + "step": 1774 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.29, + "step": 1776 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998734708672375e-05, + "loss": 0.014, + "step": 1778 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.161, + "step": 1780 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.2388, + "step": 1782 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.3763, + "step": 1784 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.1976, + "step": 1786 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.2113, + "step": 1788 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.2535, + "step": 1790 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.1308, + "step": 1792 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.0759, + "step": 1794 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.3354, + "step": 1796 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.395, + "step": 1798 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.1821, + "step": 1800 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.996486619667911e-05, + "loss": 0.0141, + "step": 1802 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.1891, + "step": 1804 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.052, + "step": 1806 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.7159, + "step": 1808 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.995488010273198e-05, + "loss": 0.029, + "step": 1810 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 3.6529, + "step": 1812 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9949420366137873e-05, + "loss": 1.2343, + "step": 1814 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.3649, + "step": 1816 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.994364972736634e-05, + "loss": 0.167, + "step": 1818 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.1323, + "step": 1820 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.993756836673986e-05, + "loss": 0.2534, + "step": 1822 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.2778, + "step": 1824 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.481, + "step": 1826 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.0951, + "step": 1828 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.0075, + "step": 1830 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.4145, + "step": 1832 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.1973, + "step": 1834 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.0915, + "step": 1836 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.355, + "step": 1838 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.1737, + "step": 1840 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9902507726395524e-05, + "loss": 0.1477, + "step": 1842 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.3447, + "step": 1844 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.1635, + "step": 1846 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.4022, + "step": 1848 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.988631581494365e-05, + "loss": 0.3658, + "step": 1850 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.1296, + "step": 1852 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.987775633490599e-05, + "loss": 0.3454, + "step": 1854 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.276, + "step": 1856 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.986888819206792e-05, + "loss": 0.2723, + "step": 1858 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.1763, + "step": 1860 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.066, + "step": 1862 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.1013, + "step": 1864 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.0511, + "step": 1866 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.1198, + "step": 1868 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.0198, + "step": 1870 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.5328, + "step": 1872 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.983033467948784e-05, + "loss": 0.0303, + "step": 1874 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.0295, + "step": 1876 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.4481, + "step": 1878 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.0214, + "step": 1880 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.980921360866819e-05, + "loss": 0.0291, + "step": 1882 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.0416, + "step": 1884 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.0581, + "step": 1886 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.0344, + "step": 1888 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.978686646359173e-05, + "loss": 0.0692, + "step": 1890 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.3307, + "step": 1892 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.2209, + "step": 1894 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.4142, + "step": 1896 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.9763296037475177e-05, + "loss": 0.2652, + "step": 1898 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.1194, + "step": 1900 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.0339, + "step": 1902 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.097, + "step": 1904 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9738505276435695e-05, + "loss": 0.3762, + "step": 1906 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.2329, + "step": 1908 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.4573, + "step": 1910 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.4161, + "step": 1912 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.0913, + "step": 1914 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.0727, + "step": 1916 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.969903782680467e-05, + "loss": 0.3521, + "step": 1918 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.0516, + "step": 1920 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.96852752963305e-05, + "loss": 0.1951, + "step": 1922 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.6519, + "step": 1924 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.6369, + "step": 1926 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.3762, + "step": 1928 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.2234, + "step": 1930 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.339, + "step": 1932 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.3728, + "step": 1934 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.3248, + "step": 1936 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9627203135753576e-05, + "loss": 0.3527, + "step": 1938 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.1121, + "step": 1940 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.961193185426459e-05, + "loss": 0.3294, + "step": 1942 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.1784, + "step": 1944 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.959636021653044e-05, + "loss": 0.0978, + "step": 1946 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.0761, + "step": 1948 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958048870913786e-05, + "loss": 0.2092, + "step": 1950 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.2472, + "step": 1952 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9564317828044022e-05, + "loss": 0.1597, + "step": 1954 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.2047, + "step": 1956 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.073, + "step": 1958 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.5518, + "step": 1960 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9531079975339915e-05, + "loss": 0.0273, + "step": 1962 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.0623, + "step": 1964 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.2235, + "step": 1966 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.4253, + "step": 1968 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.7043, + "step": 1970 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.2618, + "step": 1972 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.947899082950751e-05, + "loss": 0.4975, + "step": 1974 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.3429, + "step": 1976 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.1838, + "step": 1978 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.1091, + "step": 1980 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.2386, + "step": 1982 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.4163, + "step": 1984 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.942423592059687e-05, + "loss": 0.1696, + "step": 1986 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.032, + "step": 1988 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.0657, + "step": 1990 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.1788, + "step": 1992 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.938625924204888e-05, + "loss": 0.2899, + "step": 1994 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.1433, + "step": 1996 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.0022, + "step": 1998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.5107, + "step": 2000 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9347109355200676e-05, + "loss": 0.2226, + "step": 2002 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.3163, + "step": 2004 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.3626, + "step": 2006 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.3958, + "step": 2008 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9306791153479017e-05, + "loss": 0.0209, + "step": 2010 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.238, + "step": 2012 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.928619550368371e-05, + "loss": 0.0897, + "step": 2014 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.0801, + "step": 2016 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9265309676340783e-05, + "loss": 0.0812, + "step": 2018 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.5959, + "step": 2020 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.1141, + "step": 2022 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.0461, + "step": 2024 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9222670108643156e-05, + "loss": 0.0043, + "step": 2026 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.2787, + "step": 2028 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.1432, + "step": 2030 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.0147, + "step": 2032 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9178877779995416e-05, + "loss": 0.0795, + "step": 2034 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.2862, + "step": 2036 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.0008, + "step": 2038 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.0637, + "step": 2040 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9133938164092942e-05, + "loss": 0.1453, + "step": 2042 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.6018, + "step": 2044 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.3994, + "step": 2046 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.0711, + "step": 2048 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.90878568780329e-05, + "loss": 0.2555, + "step": 2050 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.4899, + "step": 2052 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.0233, + "step": 2054 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.1658, + "step": 2056 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9040639681612216e-05, + "loss": 0.0904, + "step": 2058 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.0428, + "step": 2060 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.1422, + "step": 2062 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.1907, + "step": 2064 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8992292476607695e-05, + "loss": 0.0752, + "step": 2066 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 2.4576, + "step": 2068 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.4417, + "step": 2070 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.1413, + "step": 2072 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8942821306038227e-05, + "loss": 0.1288, + "step": 2074 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.1518, + "step": 2076 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.891766616054545e-05, + "loss": 0.1541, + "step": 2078 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.4405, + "step": 2080 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8892232353409582e-05, + "loss": 0.0676, + "step": 2082 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.3345, + "step": 2084 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.4724, + "step": 2086 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.3531, + "step": 2088 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.884053194194143e-05, + "loss": 0.6153, + "step": 2090 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.0868, + "step": 2092 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.881426695315756e-05, + "loss": 0.2522, + "step": 2094 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.4213, + "step": 2096 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8787726533777003e-05, + "loss": 0.4343, + "step": 2098 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.2863, + "step": 2100 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.2492, + "step": 2102 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.2537, + "step": 2104 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8733822729175455e-05, + "loss": 0.2477, + "step": 2106 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.2032, + "step": 2108 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.5611, + "step": 2110 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.3081, + "step": 2112 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.515, + "step": 2114 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.3419, + "step": 2116 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.86509223046777e-05, + "loss": 0.138, + "step": 2118 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.2623, + "step": 2120 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8622747017309676e-05, + "loss": 0.2178, + "step": 2122 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.1096, + "step": 2124 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.1311, + "step": 2126 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.1895, + "step": 2128 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8565588993632498e-05, + "loss": 0.0473, + "step": 2130 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.1503, + "step": 2132 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.1487, + "step": 2134 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.094, + "step": 2136 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8507360338956896e-05, + "loss": 0.2206, + "step": 2138 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.0799, + "step": 2140 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.847784679420527e-05, + "loss": 0.5064, + "step": 2142 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.0929, + "step": 2144 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.8448068331405018e-05, + "loss": 0.0323, + "step": 2146 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.2162, + "step": 2148 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.841802588108161e-05, + "loss": 0.4716, + "step": 2150 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.121, + "step": 2152 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.838772038200968e-05, + "loss": 0.098, + "step": 2154 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.0737, + "step": 2156 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.1776, + "step": 2158 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.3021, + "step": 2160 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8326324033788087e-05, + "loss": 0.1531, + "step": 2162 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.4485, + "step": 2164 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.0659, + "step": 2166 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.0678, + "step": 2168 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.8263886960799072e-05, + "loss": 0.1868, + "step": 2170 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.084, + "step": 2172 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.3139, + "step": 2174 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.3554, + "step": 2176 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820041696718378e-05, + "loss": 0.018, + "step": 2178 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.2365, + "step": 2180 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.0446, + "step": 2182 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.0159, + "step": 2184 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8135921986190358e-05, + "loss": 0.1313, + "step": 2186 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.4027, + "step": 2188 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.19, + "step": 2190 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.1179, + "step": 2192 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807041007918221e-05, + "loss": 0.2544, + "step": 2194 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.0669, + "step": 2196 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.017, + "step": 2198 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.3813, + "step": 2200 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.8003889434630476e-05, + "loss": 0.1286, + "step": 2202 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.6528, + "step": 2204 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.3014, + "step": 2206 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.1269, + "step": 2208 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7936368367090583e-05, + "loss": 0.1175, + "step": 2210 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.2806, + "step": 2212 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.0817, + "step": 2214 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.1527, + "step": 2216 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7867855316162846e-05, + "loss": 0.5265, + "step": 2218 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.145, + "step": 2220 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.783322946823638e-05, + "loss": 0.3989, + "step": 2222 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.4344, + "step": 2224 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.779835884543776e-05, + "loss": 0.1454, + "step": 2226 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.2299, + "step": 2228 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.776324453741365e-05, + "loss": 0.1319, + "step": 2230 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.0101, + "step": 2232 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7727887641425465e-05, + "loss": 0.0591, + "step": 2234 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.3235, + "step": 2236 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.0968, + "step": 2238 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.3116, + "step": 2240 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.765645051247007e-05, + "loss": 0.1482, + "step": 2242 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.0327, + "step": 2244 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.762037251178961e-05, + "loss": 0.0662, + "step": 2246 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.0139, + "step": 2248 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7584056387648738e-05, + "loss": 0.3933, + "step": 2250 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.1404, + "step": 2252 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.1901, + "step": 2254 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.0964, + "step": 2256 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7510714315655467e-05, + "loss": 0.1875, + "step": 2258 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.0709, + "step": 2260 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7473690659616e-05, + "loss": 0.1437, + "step": 2262 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 1.2692, + "step": 2264 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.743643346367027e-05, + "loss": 0.0119, + "step": 2266 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.3426, + "step": 2268 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.739894389204122e-05, + "loss": 0.1571, + "step": 2270 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.642, + "step": 2272 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7361223116213146e-05, + "loss": 0.4643, + "step": 2274 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.2441, + "step": 2276 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.732327231489503e-05, + "loss": 0.1001, + "step": 2278 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.1119, + "step": 2280 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.728509267398376e-05, + "loss": 0.2523, + "step": 2282 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.3757, + "step": 2284 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.1005, + "step": 2286 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.0359, + "step": 2288 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7208051652686348e-05, + "loss": 0.1997, + "step": 2290 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.1035, + "step": 2292 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.716919267969884e-05, + "loss": 0.1036, + "step": 2294 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.1555, + "step": 2296 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.7130109681840298e-05, + "loss": 0.062, + "step": 2298 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.0461, + "step": 2300 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.0088, + "step": 2302 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.0457, + "step": 2304 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.705127650357663e-05, + "loss": 0.0093, + "step": 2306 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.733, + "step": 2308 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.701152878657197e-05, + "loss": 0.3537, + "step": 2310 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.0749, + "step": 2312 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.697156197142023e-05, + "loss": 0.1302, + "step": 2314 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.1955, + "step": 2316 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.1208, + "step": 2318 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.0668, + "step": 2320 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.6890976049058267e-05, + "loss": 0.1657, + "step": 2322 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.1007, + "step": 2324 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.5009, + "step": 2326 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.3312, + "step": 2328 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.6809528809094805e-05, + "loss": 0.137, + "step": 2330 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.0991, + "step": 2332 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.67684853721737e-05, + "loss": 0.0337, + "step": 2334 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.0503, + "step": 2336 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.6727230431791826e-05, + "loss": 0.4371, + "step": 2338 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.089, + "step": 2340 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.0053, + "step": 2342 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.0938, + "step": 2344 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.66440912037967e-05, + "loss": 0.4106, + "step": 2346 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.0628, + "step": 2348 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.1182, + "step": 2350 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.259, + "step": 2352 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6560121516856592e-05, + "loss": 0.0064, + "step": 2354 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.1436, + "step": 2356 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.651782852712194e-05, + "loss": 0.4925, + "step": 2358 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.0951, + "step": 2360 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.6475331866519387e-05, + "loss": 0.4093, + "step": 2362 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.494, + "step": 2364 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.0571, + "step": 2366 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.0394, + "step": 2368 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6389732850821964e-05, + "loss": 0.0463, + "step": 2370 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.3666, + "step": 2372 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.0794, + "step": 2374 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.0197, + "step": 2376 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6303335168965495e-05, + "loss": 0.5861, + "step": 2378 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.1982, + "step": 2380 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.625984019906122e-05, + "loss": 0.0247, + "step": 2382 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.1643, + "step": 2384 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6216149619978057e-05, + "loss": 0.086, + "step": 2386 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.1059, + "step": 2388 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.1945, + "step": 2390 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.3127, + "step": 2392 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.612818710136499e-05, + "loss": 0.407, + "step": 2394 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.0486, + "step": 2396 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.024, + "step": 2398 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.1137, + "step": 2400 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.60394586077466e-05, + "loss": 0.7253, + "step": 2402 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.1423, + "step": 2404 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.1384, + "step": 2406 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.2109, + "step": 2408 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.594997522948413e-05, + "loss": 0.2799, + "step": 2410 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.2256, + "step": 2412 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.406, + "step": 2414 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.1234, + "step": 2416 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5859748151293354e-05, + "loss": 0.0483, + "step": 2418 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.2138, + "step": 2420 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.581435924540482e-05, + "loss": 0.0894, + "step": 2422 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.2603, + "step": 2424 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5768788650846674e-05, + "loss": 0.1214, + "step": 2426 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.6243, + "step": 2428 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.0428, + "step": 2430 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.2953, + "step": 2432 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.5677108097363565e-05, + "loss": 0.2483, + "step": 2434 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.1445, + "step": 2436 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.4608, + "step": 2438 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.2921, + "step": 2440 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5584717950189373e-05, + "loss": 0.1577, + "step": 2442 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.3222, + "step": 2444 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.1024, + "step": 2446 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.2159, + "step": 2448 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.3305, + "step": 2450 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.2881, + "step": 2452 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.544482752648966e-05, + "loss": 0.0582, + "step": 2454 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.3563, + "step": 2456 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.539785515417377e-05, + "loss": 0.1255, + "step": 2458 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.0168, + "step": 2460 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.0264, + "step": 2462 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.3097, + "step": 2464 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.5303405861706567e-05, + "loss": 0.1371, + "step": 2466 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.1368, + "step": 2468 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.042, + "step": 2470 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.1272, + "step": 2472 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.5208293685377362e-05, + "loss": 0.4552, + "step": 2474 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.9693, + "step": 2476 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.2317, + "step": 2478 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.5706, + "step": 2480 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.5112530513457251e-05, + "loss": 0.0445, + "step": 2482 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.0056, + "step": 2484 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.3846, + "step": 2486 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.0589, + "step": 2488 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5016128315586636e-05, + "loss": 0.0046, + "step": 2490 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.1353, + "step": 2492 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.1051, + "step": 2494 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.259, + "step": 2496 + }, + { + "epoch": 1.0, + "learning_rate": 1.4919099141279214e-05, + "loss": 1.1826, + "step": 2498 + }, + { + "epoch": 1.0, + "step": 2498, + "total_flos": 1.6279311437791232e+16, + "train_loss": 0.22241720583638128, + "train_runtime": 8224.532, + "train_samples_per_second": 2.43, + "train_steps_per_second": 0.304 + } + ], + "logging_steps": 2, + "max_steps": 2498, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1.6279311437791232e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a4a4d053c4ae063478604fb6b884774ef0dd4d96 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9a2d1e010296ecbcc75c2b1dd6ce26638ac5abdf233a739bb3c22d2393e220e +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e3fbe3cecb516501f0adc106ab060d99dcc7db8c --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:007f5aaa9246c69eecff112edb96ee22fd2eecfb96826b02532072d9c14bb584 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf73cf6d5129e4bdd3287ed3af2471e2001df50f --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d92ec310b7a697caa7dbe1ff3e37b8443968f8cc306fadf3747c91471fcb561 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..1941546b2ecf7292708fbbf8b498175d1016f384 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_125_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bd15313eb7eeec53aba598c639ec650507aaa10fac0c221221b33afa6879117 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0157dc94c03ea6c4e1292aa4574216c66c36139b --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/0_trainer_state.json @@ -0,0 +1,15020 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4996, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.406842319175051e-06, + "loss": 0.2519, + "step": 2 + }, + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.415943612351265e-06, + "loss": 0.0044, + "step": 4 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4250597173539104e-06, + "loss": 0.0343, + "step": 6 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.3114, + "step": 8 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.443336291593801e-06, + "loss": 0.1326, + "step": 10 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.0705, + "step": 12 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.461671899116598e-06, + "loss": 0.2532, + "step": 14 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.0326, + "step": 16 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4800663966830417e-06, + "loss": 0.0141, + "step": 18 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4892856843445236e-06, + "loss": 0.0245, + "step": 20 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.4985196405937807e-06, + "loss": 0.3464, + "step": 22 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.0756, + "step": 24 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5170314866905443e-06, + "loss": 0.1596, + "step": 26 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.044, + "step": 28 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.535601790357246e-06, + "loss": 0.0665, + "step": 30 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.0006, + "step": 32 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5542304065211578e-06, + "loss": 0.0057, + "step": 34 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5635665363297356e-06, + "loss": 0.2457, + "step": 36 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5729171896539763e-06, + "loss": 0.1338, + "step": 38 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.0902, + "step": 40 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.5916619937729915e-06, + "loss": 0.0018, + "step": 42 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.0002, + "step": 44 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6104646724422643e-06, + "loss": 0.0601, + "step": 46 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.0171, + "step": 48 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.629325078773699e-06, + "loss": 0.019, + "step": 50 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.6387768837868565e-06, + "loss": 0.1217, + "step": 52 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.648243065428239e-06, + "loss": 0.8397, + "step": 54 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 1.0356, + "step": 56 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6672184846169934e-06, + "loss": 0.0757, + "step": 58 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.0128, + "step": 60 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.686251188102439e-06, + "loss": 0.001, + "step": 62 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.1846, + "step": 64 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7053410271995085e-06, + "loss": 0.2249, + "step": 66 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.0404, + "step": 68 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.724487852776785e-06, + "loss": 0.2846, + "step": 70 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.0004, + "step": 72 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7436915152577038e-06, + "loss": 0.0171, + "step": 74 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.0151, + "step": 76 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.7629518646216522e-06, + "loss": 0.1361, + "step": 78 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.0006, + "step": 80 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.782268750405185e-06, + "loss": 0.0756, + "step": 82 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.7919483473136555e-06, + "loss": 0.0452, + "step": 84 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.801642021703177e-06, + "loss": 0.0426, + "step": 86 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.0342, + "step": 88 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.821071527170053e-06, + "loss": 0.0114, + "step": 90 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.0449, + "step": 92 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.8405571150208945e-06, + "loss": 0.5246, + "step": 94 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.0202, + "step": 96 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.860098633032663e-06, + "loss": 0.035, + "step": 98 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.8698903181597026e-06, + "loss": 0.3206, + "step": 100 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.879695928545424e-06, + "loss": 0.0895, + "step": 102 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.1564, + "step": 104 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.899348848463471e-06, + "loss": 0.0291, + "step": 106 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.909196119613218e-06, + "loss": 0.2459, + "step": 108 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.9190572392565643e-06, + "loss": 0.0886, + "step": 110 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.3368, + "step": 112 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9388209469611093e-06, + "loss": 0.1311, + "step": 114 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9487234964233724e-06, + "loss": 0.0897, + "step": 116 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9586398171814114e-06, + "loss": 0.1098, + "step": 118 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.0241, + "step": 120 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.9785136950907987e-06, + "loss": 0.0498, + "step": 122 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.988471213428035e-06, + "loss": 0.0184, + "step": 124 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 2.9984424254328936e-06, + "loss": 0.3426, + "step": 126 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.0238, + "step": 128 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0184258525227895e-06, + "loss": 0.0959, + "step": 130 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0284380285797733e-06, + "loss": 0.001, + "step": 132 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.038463820248324e-06, + "loss": 0.0243, + "step": 134 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.0324, + "step": 136 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.0585561720712207e-06, + "loss": 0.3695, + "step": 138 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.068622692984767e-06, + "loss": 0.0063, + "step": 140 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0787027510283495e-06, + "loss": 0.3794, + "step": 142 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 1.2915, + "step": 144 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.098903399732992e-06, + "loss": 0.0519, + "step": 146 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.0815, + "step": 148 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.1191579603759946e-06, + "loss": 0.0028, + "step": 150 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.0143, + "step": 152 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.139466274727052e-06, + "loss": 0.0705, + "step": 154 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.1418, + "step": 156 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.159828184135917e-06, + "loss": 0.0102, + "step": 158 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.5672, + "step": 160 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1802435295336908e-06, + "loss": 0.5021, + "step": 162 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1904711909051967e-06, + "loss": 0.0203, + "step": 164 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2007121514339924e-06, + "loss": 0.1172, + "step": 166 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.0511, + "step": 168 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.221233889934239e-06, + "loss": 0.0333, + "step": 170 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.231514627826302e-06, + "loss": 0.0604, + "step": 172 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2418085847169344e-06, + "loss": 0.4689, + "step": 174 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.0882, + "step": 176 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2624360750508457e-06, + "loss": 0.0788, + "step": 178 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2727695682081897e-06, + "loss": 0.0404, + "step": 180 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.28311619979231e-06, + "loss": 0.0282, + "step": 182 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.1247, + "step": 184 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.303848797386465e-06, + "loss": 0.0111, + "step": 186 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.314234722905302e-06, + "loss": 0.1128, + "step": 188 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.3246337058685697e-06, + "loss": 0.03, + "step": 190 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.273, + "step": 192 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.3454707628651806e-06, + "loss": 0.2466, + "step": 194 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.355908796203301e-06, + "loss": 0.1428, + "step": 196 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3663598055954716e-06, + "loss": 0.0119, + "step": 198 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.1348, + "step": 200 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3873006708725365e-06, + "loss": 0.3758, + "step": 202 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.052, + "step": 204 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.408293195104586e-06, + "loss": 0.3071, + "step": 206 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.1068, + "step": 208 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4293372142962845e-06, + "loss": 0.0073, + "step": 210 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4398784831434097e-06, + "loss": 0.0329, + "step": 212 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.4504325640499936e-06, + "loss": 0.0364, + "step": 214 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.0416, + "step": 216 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4715790795671232e-06, + "loss": 0.7662, + "step": 218 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.0144, + "step": 220 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.4927765956493276e-06, + "loss": 0.0009, + "step": 222 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.1467, + "step": 224 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.514024946699842e-06, + "loss": 0.0336, + "step": 226 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.0208, + "step": 228 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.535323966724814e-06, + "loss": 0.0059, + "step": 230 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.0677, + "step": 232 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.556673489334522e-06, + "loss": 0.0872, + "step": 234 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.567367137003953e-06, + "loss": 0.1536, + "step": 236 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.5780733477447127e-06, + "loss": 0.0045, + "step": 238 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.105, + "step": 240 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.5995233747779467e-06, + "loss": 0.1688, + "step": 242 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.6102671491780393e-06, + "loss": 0.2149, + "step": 244 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6210234028648216e-06, + "loss": 0.0029, + "step": 246 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.0119, + "step": 248 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.6425732640453235e-06, + "loss": 0.0028, + "step": 250 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.653366829451711e-06, + "loss": 0.0818, + "step": 252 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.6641727899701795e-06, + "loss": 0.0105, + "step": 254 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.2018, + "step": 256 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.6858218119020884e-06, + "loss": 0.4371, + "step": 258 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.696664831034521e-06, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7075201607170997e-06, + "loss": 0.0375, + "step": 262 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.0271, + "step": 264 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.729267666905899e-06, + "loss": 0.3141, + "step": 266 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.740159800938784e-06, + "loss": 0.0182, + "step": 268 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.751064160575195e-06, + "loss": 0.0219, + "step": 270 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.0221, + "step": 272 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.772909471448959e-06, + "loss": 0.0083, + "step": 274 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.783850380021933e-06, + "loss": 0.01, + "step": 276 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.794803428869799e-06, + "loss": 0.0402, + "step": 278 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.3223, + "step": 280 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.816745861800334e-06, + "loss": 0.0028, + "step": 282 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.827735203028956e-06, + "loss": 0.0101, + "step": 284 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.838736598824446e-06, + "loss": 0.1844, + "step": 286 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 1.299, + "step": 288 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.860775468148662e-06, + "loss": 0.0407, + "step": 290 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.871812898635011e-06, + "loss": 0.0005, + "step": 292 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.882862297603536e-06, + "loss": 0.3631, + "step": 294 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.0459, + "step": 296 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.904996914644913e-06, + "loss": 0.0354, + "step": 298 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.916082089488379e-06, + "loss": 0.2152, + "step": 300 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.927179146355317e-06, + "loss": 0.1937, + "step": 302 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.0141, + "step": 304 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.949408819445345e-06, + "loss": 0.0422, + "step": 306 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.960541392253387e-06, + "loss": 0.0013, + "step": 308 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.971685760254933e-06, + "loss": 0.0203, + "step": 310 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.1088, + "step": 312 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 3.994009794754777e-06, + "loss": 0.1502, + "step": 314 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 4.005189417653737e-06, + "loss": 0.0118, + "step": 316 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.016380748547654e-06, + "loss": 0.0343, + "step": 318 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.3875, + "step": 320 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.038798446869847e-06, + "loss": 0.009, + "step": 322 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.050024770515873e-06, + "loss": 0.3068, + "step": 324 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.061262714592426e-06, + "loss": 0.0024, + "step": 326 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.2223, + "step": 328 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.0837733762226584e-06, + "loss": 0.0367, + "step": 330 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.095046049812541e-06, + "loss": 0.1421, + "step": 332 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.106330255905417e-06, + "loss": 0.0149, + "step": 334 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.0418, + "step": 336 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.128933177424475e-06, + "loss": 0.1029, + "step": 338 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.1402518487066624e-06, + "loss": 0.5059, + "step": 340 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.151581964203924e-06, + "loss": 0.1905, + "step": 342 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.0657, + "step": 344 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.174276439309593e-06, + "loss": 0.0831, + "step": 346 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.0063, + "step": 348 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.197016425450347e-06, + "loss": 0.024, + "step": 350 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.5677, + "step": 352 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.219801744979517e-06, + "loss": 0.135, + "step": 354 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.2312113491533145e-06, + "loss": 0.0984, + "step": 356 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.242632219896328e-06, + "loss": 0.0227, + "step": 358 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.1908, + "step": 360 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.2655076718472045e-06, + "loss": 0.0083, + "step": 362 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.276962208378814e-06, + "loss": 0.1739, + "step": 364 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.28842792212722e-06, + "loss": 0.8132, + "step": 366 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.1011, + "step": 368 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3113927916814665e-06, + "loss": 0.0094, + "step": 370 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3228919026364345e-06, + "loss": 0.0707, + "step": 372 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.33440210110651e-06, + "loss": 0.3988, + "step": 374 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.0175, + "step": 376 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.3574556706517035e-06, + "loss": 0.015, + "step": 378 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.368998996702686e-06, + "loss": 0.0369, + "step": 380 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.380553320220638e-06, + "loss": 0.4302, + "step": 382 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.0038, + "step": 384 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.403694869372589e-06, + "loss": 0.0319, + "step": 386 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.415282049810643e-06, + "loss": 0.1324, + "step": 388 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4268801373238454e-06, + "loss": 0.2953, + "step": 390 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.4595, + "step": 392 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.450108942949158e-06, + "loss": 0.0508, + "step": 394 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.461739615694921e-06, + "loss": 0.1593, + "step": 396 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.473381104783201e-06, + "loss": 0.0033, + "step": 398 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.0351, + "step": 400 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.496696441021904e-06, + "loss": 0.0104, + "step": 402 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.5083702426369715e-06, + "loss": 0.093, + "step": 404 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.520054769523929e-06, + "loss": 0.2448, + "step": 406 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.0426, + "step": 408 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.543455907812063e-06, + "loss": 0.0386, + "step": 410 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.555172473510324e-06, + "loss": 0.2554, + "step": 412 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.566899673074706e-06, + "loss": 0.0275, + "step": 414 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.0158, + "step": 416 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.590385882167206e-06, + "loss": 0.1314, + "step": 418 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.602144845826234e-06, + "loss": 0.0988, + "step": 420 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.613914351613337e-06, + "loss": 0.0459, + "step": 422 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.2044, + "step": 424 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.637484897606777e-06, + "loss": 0.0798, + "step": 426 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.649285891779326e-06, + "loss": 0.0051, + "step": 428 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.661097336012451e-06, + "loss": 0.2954, + "step": 430 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.002, + "step": 432 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.684751482368022e-06, + "loss": 0.348, + "step": 434 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.696594138293421e-06, + "loss": 0.0126, + "step": 436 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.7084471518853656e-06, + "loss": 0.6767, + "step": 438 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.0477, + "step": 440 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.732184159451937e-06, + "loss": 0.3168, + "step": 442 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.744068107067673e-06, + "loss": 0.0699, + "step": 444 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.755962319632249e-06, + "loss": 0.0014, + "step": 446 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.4021, + "step": 448 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.779781446669376e-06, + "loss": 0.0127, + "step": 450 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.79170631462264e-06, + "loss": 0.0511, + "step": 452 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.8036413544863095e-06, + "loss": 0.1386, + "step": 454 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.4022, + "step": 456 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.827541856687471e-06, + "loss": 0.0018, + "step": 458 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.839507272346751e-06, + "loss": 0.0189, + "step": 460 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.8514827665601425e-06, + "loss": 0.1333, + "step": 462 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.2648, + "step": 464 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.875463897075985e-06, + "loss": 0.0499, + "step": 466 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.1387, + "step": 468 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.899485060892404e-06, + "loss": 0.0088, + "step": 470 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.164, + "step": 472 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.9235460703540615e-06, + "loss": 0.0177, + "step": 474 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.935591458474425e-06, + "loss": 0.3301, + "step": 476 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.947646737494389e-06, + "loss": 0.0099, + "step": 478 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.0084, + "step": 480 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9717868740369645e-06, + "loss": 0.0971, + "step": 482 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9838716844133665e-06, + "loss": 0.0226, + "step": 484 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 4.9959662913970254e-06, + "loss": 0.0037, + "step": 486 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.1343, + "step": 488 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.02018480068299e-06, + "loss": 0.8829, + "step": 490 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.032308655686007e-06, + "loss": 0.0498, + "step": 492 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.044442212697842e-06, + "loss": 0.1901, + "step": 494 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.0026, + "step": 496 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.068738337940655e-06, + "loss": 0.0202, + "step": 498 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.080900858720789e-06, + "loss": 0.0788, + "step": 500 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.093072986608116e-06, + "loss": 0.212, + "step": 502 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.1079, + "step": 504 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.1174459685959175e-06, + "loss": 0.0125, + "step": 506 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.129646775095432e-06, + "loss": 0.2207, + "step": 508 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.141857093500307e-06, + "loss": 0.7551, + "step": 510 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.1081, + "step": 512 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.166306170619537e-06, + "loss": 0.2252, + "step": 514 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.178544881584328e-06, + "loss": 0.0203, + "step": 516 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.190793008955421e-06, + "loss": 0.3512, + "step": 518 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.0403, + "step": 520 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.215317417214739e-06, + "loss": 0.1196, + "step": 522 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.1047, + "step": 524 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.239879203810763e-06, + "loss": 0.0878, + "step": 526 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.5903, + "step": 528 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.264478176864811e-06, + "loss": 0.2098, + "step": 530 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.2767915482720164e-06, + "loss": 0.3297, + "step": 532 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.289114144207656e-06, + "loss": 0.0015, + "step": 534 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.2861, + "step": 536 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.313786913381061e-06, + "loss": 0.1299, + "step": 538 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.0859, + "step": 540 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.338496291639341e-06, + "loss": 0.0443, + "step": 542 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.1017, + "step": 544 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.363242085950773e-06, + "loss": 0.0319, + "step": 546 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.375628578726181e-06, + "loss": 0.0388, + "step": 548 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.3880241029991434e-06, + "loss": 0.2387, + "step": 550 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.1643, + "step": 552 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.412842149185316e-06, + "loss": 0.0511, + "step": 554 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.425264622628326e-06, + "loss": 0.0085, + "step": 556 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.437696030628639e-06, + "loss": 0.169, + "step": 558 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.0071, + "step": 560 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.462585553168532e-06, + "loss": 0.0072, + "step": 562 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.475043619098321e-06, + "loss": 0.045, + "step": 564 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.487510522365969e-06, + "loss": 0.1839, + "step": 566 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.1744, + "step": 568 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.512470743505057e-06, + "loss": 0.0111, + "step": 570 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.524964012628644e-06, + "loss": 0.0877, + "step": 572 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.537466021594464e-06, + "loss": 0.0096, + "step": 574 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.2757, + "step": 576 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.5624961613689934e-06, + "loss": 0.1623, + "step": 578 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.57502424329331e-06, + "loss": 0.0124, + "step": 580 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.5875609672911465e-06, + "loss": 0.014, + "step": 582 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.2655, + "step": 584 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.6126602435525725e-06, + "loss": 0.0098, + "step": 586 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.62522274679673e-06, + "loss": 0.0114, + "step": 588 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.637793794075625e-06, + "loss": 0.0724, + "step": 590 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.0009, + "step": 592 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.662961422514961e-06, + "loss": 0.4498, + "step": 594 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.675557954522462e-06, + "loss": 0.2697, + "step": 596 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.688162932258965e-06, + "loss": 0.0014, + "step": 598 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.0477, + "step": 600 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.713398126431353e-06, + "loss": 0.0139, + "step": 602 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.726028293582342e-06, + "loss": 0.1022, + "step": 604 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.738666807892684e-06, + "loss": 0.2162, + "step": 606 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.0051, + "step": 608 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.763968779241957e-06, + "loss": 0.1715, + "step": 610 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.776632186865589e-06, + "loss": 0.0047, + "step": 612 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.7893038428180584e-06, + "loss": 0.1592, + "step": 614 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.2256, + "step": 616 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.814671800701357e-06, + "loss": 0.8229, + "step": 618 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.827368053088032e-06, + "loss": 0.2531, + "step": 620 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.840072454715297e-06, + "loss": 0.0902, + "step": 622 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.0162, + "step": 624 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.865505606427848e-06, + "loss": 0.0206, + "step": 626 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.878234306841637e-06, + "loss": 0.5957, + "step": 628 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.890971057153105e-06, + "loss": 0.011, + "step": 630 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.0731, + "step": 632 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.916468607952892e-06, + "loss": 0.0028, + "step": 634 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.929229358643925e-06, + "loss": 0.0069, + "step": 636 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.941998059638212e-06, + "loss": 0.3759, + "step": 638 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.6452, + "step": 640 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9675592127708585e-06, + "loss": 0.3782, + "step": 642 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9803516149877475e-06, + "loss": 0.0018, + "step": 644 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 5.993151867665015e-06, + "loss": 0.0112, + "step": 646 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.0023, + "step": 648 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.01877582438873e-06, + "loss": 0.0068, + "step": 650 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.03159947839103e-06, + "loss": 0.007, + "step": 652 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.0444308827655265e-06, + "loss": 0.0086, + "step": 654 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.0056, + "step": 656 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.070116842375947e-06, + "loss": 0.0196, + "step": 658 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.082971347446654e-06, + "loss": 0.0581, + "step": 660 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.095833502559182e-06, + "loss": 0.3065, + "step": 662 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.3438, + "step": 664 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.121580662414533e-06, + "loss": 0.0088, + "step": 666 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.13446561687258e-06, + "loss": 0.047, + "step": 668 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.147358120803041e-06, + "loss": 0.0405, + "step": 670 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.0877, + "step": 672 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.173165676349095e-06, + "loss": 0.0328, + "step": 674 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.186080677561974e-06, + "loss": 0.1791, + "step": 676 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.1990031274419186e-06, + "loss": 0.2537, + "step": 678 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.0833, + "step": 680 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.224870272237185e-06, + "loss": 0.0138, + "step": 682 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.237814916633431e-06, + "loss": 0.9689, + "step": 684 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.250766908658652e-06, + "loss": 0.1951, + "step": 686 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.1915, + "step": 688 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.2766928343996314e-06, + "loss": 0.2126, + "step": 690 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.289666717481496e-06, + "loss": 1.0311, + "step": 692 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.3026478469246285e-06, + "loss": 0.0014, + "step": 694 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.0095, + "step": 696 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.328631743470968e-06, + "loss": 0.0254, + "step": 698 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.341634459827044e-06, + "loss": 0.0013, + "step": 700 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.354644321050279e-06, + "loss": 0.0335, + "step": 702 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.0071, + "step": 704 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.380685376450153e-06, + "loss": 0.0215, + "step": 706 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.393716519768032e-06, + "loss": 0.0084, + "step": 708 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.406754706235692e-06, + "loss": 0.0888, + "step": 710 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.0143, + "step": 712 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.432852106751162e-06, + "loss": 0.02, + "step": 714 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.445911269830183e-06, + "loss": 0.0184, + "step": 716 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.458977374121492e-06, + "loss": 0.068, + "step": 718 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.2643, + "step": 720 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.485130304253915e-06, + "loss": 0.251, + "step": 722 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.498217079017806e-06, + "loss": 0.182, + "step": 724 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.511310692839605e-06, + "loss": 0.0044, + "step": 726 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.0083, + "step": 728 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.537518335355182e-06, + "loss": 0.4395, + "step": 730 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.6011, + "step": 732 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.563753027064355e-06, + "loss": 0.3771, + "step": 734 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.0884, + "step": 736 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.590014563019571e-06, + "loss": 0.0462, + "step": 738 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.603155333485934e-06, + "loss": 0.1391, + "step": 740 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.61630273806352e-06, + "loss": 0.0128, + "step": 742 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.0621, + "step": 744 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.642617346830784e-06, + "loss": 0.0076, + "step": 746 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.655784499627476e-06, + "loss": 0.0911, + "step": 748 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.6689581837494925e-06, + "loss": 0.1317, + "step": 750 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.0065, + "step": 752 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.695325043042827e-06, + "loss": 0.1945, + "step": 754 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.7085181667191e-06, + "loss": 0.1226, + "step": 756 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.7217177187307e-06, + "loss": 0.0735, + "step": 758 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.4032, + "step": 760 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.748136004631327e-06, + "loss": 0.2668, + "step": 762 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.761354686924883e-06, + "loss": 0.0087, + "step": 764 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.774579694362902e-06, + "loss": 0.2691, + "step": 766 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.0024, + "step": 768 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.801048581345113e-06, + "loss": 0.1079, + "step": 770 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.8142924091949955e-06, + "loss": 0.0305, + "step": 772 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.827542458800804e-06, + "loss": 0.0197, + "step": 774 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.0069, + "step": 776 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.854061119757647e-06, + "loss": 0.0584, + "step": 778 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.867329679317144e-06, + "loss": 0.0084, + "step": 780 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.880604357049646e-06, + "loss": 1.2221, + "step": 782 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.0294, + "step": 784 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.907171963318815e-06, + "loss": 0.6667, + "step": 786 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.920464839968391e-06, + "loss": 0.6846, + "step": 788 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.9337637310168494e-06, + "loss": 0.072, + "step": 790 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.018, + "step": 792 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.960379452406636e-06, + "loss": 0.4259, + "step": 794 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.973696230766884e-06, + "loss": 0.0259, + "step": 796 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 6.9870189195639595e-06, + "loss": 0.0229, + "step": 798 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.1863, + "step": 800 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.013681924379073e-06, + "loss": 0.1147, + "step": 802 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.027022188323704e-06, + "loss": 0.1258, + "step": 804 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.040368258558412e-06, + "loss": 0.0262, + "step": 806 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.0569, + "step": 808 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.0670777136261035e-06, + "loss": 0.0833, + "step": 810 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.080441046294945e-06, + "loss": 0.0001, + "step": 812 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.093810080925657e-06, + "loss": 0.0801, + "step": 814 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.0648, + "step": 816 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.120565151621638e-06, + "loss": 0.0006, + "step": 818 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.133951135433656e-06, + "loss": 0.0049, + "step": 820 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.1473427167012e-06, + "loss": 0.5044, + "step": 822 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.1898, + "step": 824 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.1741425669757854e-06, + "loss": 0.0537, + "step": 826 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.18755078364214e-06, + "loss": 0.0287, + "step": 828 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.200964493082727e-06, + "loss": 0.0814, + "step": 830 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.5248, + "step": 832 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.227808285486952e-06, + "loss": 0.0287, + "step": 834 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.241238316024064e-06, + "loss": 0.0573, + "step": 836 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.254673734482513e-06, + "loss": 0.225, + "step": 838 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.0544, + "step": 840 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2815606301942945e-06, + "loss": 0.0897, + "step": 842 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.0948, + "step": 844 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.308468762579623e-06, + "loss": 0.0286, + "step": 846 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.0322, + "step": 848 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.3353979214299765e-06, + "loss": 0.0164, + "step": 850 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.348870320044395e-06, + "loss": 0.0932, + "step": 852 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.362347896372515e-06, + "loss": 0.0362, + "step": 854 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.274, + "step": 856 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.389318476871784e-06, + "loss": 0.0044, + "step": 858 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.402811428368824e-06, + "loss": 0.3679, + "step": 860 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.416309452231411e-06, + "loss": 0.0142, + "step": 862 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.008, + "step": 864 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.443320611595641e-06, + "loss": 0.5453, + "step": 866 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.052, + "step": 868 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.470351743951061e-06, + "loss": 0.0012, + "step": 870 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.0061, + "step": 872 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.497402638128209e-06, + "loss": 0.1165, + "step": 874 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.510935429867233e-06, + "loss": 0.0711, + "step": 876 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.52447308280329e-06, + "loss": 0.0069, + "step": 878 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.0364, + "step": 880 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.551562866499732e-06, + "loss": 0.0262, + "step": 882 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.5651149443531846e-06, + "loss": 0.5189, + "step": 884 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.578671777589884e-06, + "loss": 0.0354, + "step": 886 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.0402, + "step": 888 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.605799604296721e-06, + "loss": 0.6197, + "step": 890 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.619370544785608e-06, + "loss": 0.0876, + "step": 892 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.632946134695396e-06, + "loss": 0.0478, + "step": 894 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.0371, + "step": 896 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.660111156714964e-06, + "loss": 0.4334, + "step": 898 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.67370053577085e-06, + "loss": 0.0114, + "step": 900 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.687294458140006e-06, + "loss": 0.3174, + "step": 902 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.9112, + "step": 904 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.714495826612353e-06, + "loss": 0.0613, + "step": 906 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.728103219590684e-06, + "loss": 0.0887, + "step": 908 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.741715049632646e-06, + "loss": 0.3514, + "step": 910 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.3151, + "step": 912 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.76895191456204e-06, + "loss": 0.2247, + "step": 914 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.7825768962553e-06, + "loss": 0.0977, + "step": 916 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.796206208623925e-06, + "loss": 0.2504, + "step": 918 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.3692, + "step": 920 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.82347771890548e-06, + "loss": 0.03, + "step": 922 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.83711986355656e-06, + "loss": 0.0008, + "step": 924 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.850766232359408e-06, + "loss": 0.3938, + "step": 926 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.0721, + "step": 928 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.878071535805564e-06, + "loss": 0.0117, + "step": 930 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.891730417121043e-06, + "loss": 0.2125, + "step": 932 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.90539341593269e-06, + "loss": 0.0544, + "step": 934 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.0271, + "step": 936 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.932731659299978e-06, + "loss": 0.0084, + "step": 938 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.946406850463435e-06, + "loss": 0.2374, + "step": 940 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.960086052338788e-06, + "loss": 0.0645, + "step": 942 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.0248, + "step": 944 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 7.987456381354371e-06, + "loss": 0.0282, + "step": 946 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 8.001147455039737e-06, + "loss": 0.0204, + "step": 948 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.01484243252743e-06, + "loss": 0.0152, + "step": 950 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.1446, + "step": 952 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.042243991915866e-06, + "loss": 0.0126, + "step": 954 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.055950520300756e-06, + "loss": 0.0679, + "step": 956 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.069660845456411e-06, + "loss": 0.1161, + "step": 958 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.1911, + "step": 960 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.097092778966364e-06, + "loss": 0.1766, + "step": 962 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.110814333745503e-06, + "loss": 0.0051, + "step": 964 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.124539578145176e-06, + "loss": 0.4481, + "step": 966 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.0408, + "step": 968 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.152001028576158e-06, + "loss": 0.1114, + "step": 970 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.165737180974676e-06, + "loss": 0.0001, + "step": 972 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.179476915728217e-06, + "loss": 0.0037, + "step": 974 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.0141, + "step": 976 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.206967024957432e-06, + "loss": 0.0468, + "step": 978 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.220717345744326e-06, + "loss": 0.0774, + "step": 980 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.234471141508773e-06, + "loss": 0.5023, + "step": 982 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.0347, + "step": 984 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.261989050517841e-06, + "loss": 0.5099, + "step": 986 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.275753110019367e-06, + "loss": 0.0788, + "step": 988 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.289520537012428e-06, + "loss": 0.071, + "step": 990 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.0008, + "step": 992 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.317065385914285e-06, + "loss": 0.0691, + "step": 994 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.330842754027378e-06, + "loss": 0.1388, + "step": 996 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.344623382040752e-06, + "loss": 0.1382, + "step": 998 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.1387, + "step": 1000 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.372194310106515e-06, + "loss": 0.0097, + "step": 1002 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.385984556312285e-06, + "loss": 0.1741, + "step": 1004 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.399777954725183e-06, + "loss": 0.0568, + "step": 1006 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.0276, + "step": 1008 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.427374100411022e-06, + "loss": 0.1255, + "step": 1010 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.441176793788106e-06, + "loss": 0.0025, + "step": 1012 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.454982531580687e-06, + "loss": 0.0007, + "step": 1014 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.0061, + "step": 1016 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.482603032554812e-06, + "loss": 0.0184, + "step": 1018 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.496417741792922e-06, + "loss": 0.0854, + "step": 1020 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.510235387559738e-06, + "loss": 0.0077, + "step": 1022 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.0506, + "step": 1024 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.537879380729254e-06, + "loss": 0.0331, + "step": 1026 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.551705674142616e-06, + "loss": 0.4746, + "step": 1028 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.565534796106175e-06, + "loss": 0.1821, + "step": 1030 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.0217, + "step": 1032 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.593201417644091e-06, + "loss": 0.215, + "step": 1034 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.607038863184952e-06, + "loss": 0.0084, + "step": 1036 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.620879029209093e-06, + "loss": 0.0453, + "step": 1038 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.0284, + "step": 1040 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.648567414581372e-06, + "loss": 0.5075, + "step": 1042 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.662415579853495e-06, + "loss": 0.4004, + "step": 1044 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.676266357456968e-06, + "loss": 0.0159, + "step": 1046 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.4366, + "step": 1048 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.703975641449426e-06, + "loss": 0.2845, + "step": 1050 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.717834093721598e-06, + "loss": 0.0161, + "step": 1052 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.731695050091561e-06, + "loss": 0.0682, + "step": 1054 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.0159, + "step": 1056 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.759424366837035e-06, + "loss": 0.0363, + "step": 1058 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.773292673056572e-06, + "loss": 0.0627, + "step": 1060 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.787163375062113e-06, + "loss": 0.0178, + "step": 1062 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.0103, + "step": 1064 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.8149118580674e-06, + "loss": 0.0569, + "step": 1066 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.828789584873757e-06, + "loss": 0.0182, + "step": 1068 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.84266959907943e-06, + "loss": 0.0468, + "step": 1070 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.0177, + "step": 1072 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.870436381252412e-06, + "loss": 0.3257, + "step": 1074 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.884323094990613e-06, + "loss": 0.1085, + "step": 1076 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.89821198766998e-06, + "loss": 0.0818, + "step": 1078 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.0261, + "step": 1080 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.925996201346779e-06, + "loss": 0.0265, + "step": 1082 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.939891468081036e-06, + "loss": 0.0097, + "step": 1084 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.953788805230209e-06, + "loss": 0.0259, + "step": 1086 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.0428, + "step": 1088 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.981589582202184e-06, + "loss": 0.0488, + "step": 1090 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.995492967729449e-06, + "loss": 0.0148, + "step": 1092 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.009398315080712e-06, + "loss": 0.0022, + "step": 1094 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.0133, + "step": 1096 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.037214786621669e-06, + "loss": 0.1161, + "step": 1098 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.051125856485175e-06, + "loss": 0.0094, + "step": 1100 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.065038779520457e-06, + "loss": 0.1028, + "step": 1102 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.0015, + "step": 1104 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.092870076413771e-06, + "loss": 0.0905, + "step": 1106 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.106788395916682e-06, + "loss": 0.0169, + "step": 1108 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.120708459881203e-06, + "loss": 0.3218, + "step": 1110 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 1.3112, + "step": 1112 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.148553712446971e-06, + "loss": 0.1763, + "step": 1114 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.162478846665854e-06, + "loss": 0.1442, + "step": 1116 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.176405616581694e-06, + "loss": 0.0031, + "step": 1118 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.0001, + "step": 1120 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.20426395470397e-06, + "loss": 0.0036, + "step": 1122 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.218195468502469e-06, + "loss": 0.0002, + "step": 1124 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.232128509182136e-06, + "loss": 0.0282, + "step": 1126 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.0327, + "step": 1128 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.259999062336021e-06, + "loss": 0.1109, + "step": 1130 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.273936520378426e-06, + "loss": 0.0391, + "step": 1132 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.287875396438536e-06, + "loss": 0.0123, + "step": 1134 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.2954, + "step": 1136 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.315757293717432e-06, + "loss": 0.0125, + "step": 1138 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.329700260482286e-06, + "loss": 0.0204, + "step": 1140 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.343644536357053e-06, + "loss": 0.4526, + "step": 1142 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.0642, + "step": 1144 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.37153690649993e-06, + "loss": 0.0485, + "step": 1146 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.38548494629364e-06, + "loss": 0.0047, + "step": 1148 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.39943418624856e-06, + "loss": 0.3019, + "step": 1150 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.0253, + "step": 1152 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.427336157667062e-06, + "loss": 0.0568, + "step": 1154 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.441288834637507e-06, + "loss": 0.3286, + "step": 1156 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.45524260278296e-06, + "loss": 0.0297, + "step": 1158 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.0799, + "step": 1160 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.483153303588777e-06, + "loss": 0.1738, + "step": 1162 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.497110181738935e-06, + "loss": 0.0047, + "step": 1164 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.511068042043785e-06, + "loss": 0.0002, + "step": 1166 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.1736, + "step": 1168 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.538986600075773e-06, + "loss": 0.1376, + "step": 1170 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.552947243277342e-06, + "loss": 0.0013, + "step": 1172 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.566908759582633e-06, + "loss": 0.0214, + "step": 1174 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.1284, + "step": 1176 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.594834302434123e-06, + "loss": 0.1076, + "step": 1178 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.608798274441153e-06, + "loss": 0.218, + "step": 1180 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.622763010473628e-06, + "loss": 0.097, + "step": 1182 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.0156, + "step": 1184 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.650694665519747e-06, + "loss": 1.0371, + "step": 1186 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.664661529982263e-06, + "loss": 0.0027, + "step": 1188 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.678629049368077e-06, + "loss": 0.0163, + "step": 1190 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.0235, + "step": 1192 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.706565943792879e-06, + "loss": 0.2846, + "step": 1194 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.720535264270526e-06, + "loss": 0.0786, + "step": 1196 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.734505130548855e-06, + "loss": 0.0024, + "step": 1198 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.6213, + "step": 1200 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.762446391372746e-06, + "loss": 0.0745, + "step": 1202 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.776417731348403e-06, + "loss": 0.0375, + "step": 1204 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.790389507985091e-06, + "loss": 0.2813, + "step": 1206 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.3645, + "step": 1208 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.81833426209198e-06, + "loss": 0.8222, + "step": 1210 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.832307184985473e-06, + "loss": 0.0527, + "step": 1212 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.846280435386668e-06, + "loss": 0.0377, + "step": 1214 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.5596, + "step": 1216 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.874227809551307e-06, + "loss": 0.0086, + "step": 1218 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.888201878732946e-06, + "loss": 0.0001, + "step": 1220 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.902176166258738e-06, + "loss": 0.0883, + "step": 1222 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.0367, + "step": 1224 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.930125287174061e-06, + "loss": 0.0148, + "step": 1226 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.944100065978354e-06, + "loss": 0.2471, + "step": 1228 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.958074953956413e-06, + "loss": 0.1737, + "step": 1230 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.1927, + "step": 1232 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.986024948260714e-06, + "loss": 0.2251, + "step": 1234 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.999999999999996e-06, + "loss": 0.2254, + "step": 1236 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0013975051739277e-05, + "loss": 0.202, + "step": 1238 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.0178, + "step": 1240 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.004192504604358e-05, + "loss": 0.1312, + "step": 1242 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.4733, + "step": 1244 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.006987471282593e-05, + "loss": 0.1255, + "step": 1246 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.2217, + "step": 1248 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0097823833741255e-05, + "loss": 0.1235, + "step": 1250 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0111798121267047e-05, + "loss": 0.1946, + "step": 1252 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0125772190448686e-05, + "loss": 0.0972, + "step": 1254 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.0448, + "step": 1256 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.0153719564613327e-05, + "loss": 0.0348, + "step": 1258 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.016769281501452e-05, + "loss": 0.0211, + "step": 1260 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.018166573790801e-05, + "loss": 0.0976, + "step": 1262 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.1337, + "step": 1264 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.0209610492014904e-05, + "loss": 0.02, + "step": 1266 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.022358226865159e-05, + "loss": 0.0144, + "step": 1268 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0237553608627247e-05, + "loss": 0.0679, + "step": 1270 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.0857, + "step": 1272 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0265494869451138e-05, + "loss": 0.0804, + "step": 1274 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.0638, + "step": 1276 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0293434056207114e-05, + "loss": 0.0114, + "step": 1278 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.079, + "step": 1280 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.0321370950631918e-05, + "loss": 0.0507, + "step": 1282 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.033533847001773e-05, + "loss": 0.0953, + "step": 1284 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0349305334480246e-05, + "loss": 0.0011, + "step": 1286 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.0299, + "step": 1288 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.0377236989526366e-05, + "loss": 0.3146, + "step": 1290 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.039120172555884e-05, + "loss": 0.1468, + "step": 1292 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0405165697565868e-05, + "loss": 0.0155, + "step": 1294 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.0135, + "step": 1296 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0433091240417362e-05, + "loss": 0.4514, + "step": 1298 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0447052756722651e-05, + "loss": 0.4029, + "step": 1300 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.046101339992422e-05, + "loss": 0.3065, + "step": 1302 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.0033, + "step": 1304 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0488931957956208e-05, + "loss": 0.4515, + "step": 1306 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.0069, + "step": 1308 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.0516846696411216e-05, + "loss": 0.0203, + "step": 1310 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.2644, + "step": 1312 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.054475739721703e-05, + "loss": 0.0009, + "step": 1314 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.0558711165362488e-05, + "loss": 0.2009, + "step": 1316 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0572663842332931e-05, + "loss": 0.0366, + "step": 1318 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.4282, + "step": 1320 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0600565813751433e-05, + "loss": 0.0995, + "step": 1322 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.1734, + "step": 1324 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0628463093500063e-05, + "loss": 0.0852, + "step": 1326 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.2269, + "step": 1328 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.065635546364294e-05, + "loss": 0.1231, + "step": 1330 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.0670299739517706e-05, + "loss": 0.0077, + "step": 1332 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0684242706282562e-05, + "loss": 0.0299, + "step": 1334 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.0284, + "step": 1336 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0712124603561457e-05, + "loss": 0.0447, + "step": 1338 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.1651, + "step": 1340 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0740000937663972e-05, + "loss": 0.0032, + "step": 1342 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.0085, + "step": 1344 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0767871490817856e-05, + "loss": 0.6799, + "step": 1346 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0781804531497525e-05, + "loss": 0.0, + "step": 1348 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0795736045296023e-05, + "loss": 0.0229, + "step": 1350 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.1696, + "step": 1352 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.08235943834183e-05, + "loss": 0.0568, + "step": 1354 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.083752115333414e-05, + "loss": 0.0237, + "step": 1356 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0851446287553022e-05, + "loss": 0.0086, + "step": 1358 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.0159, + "step": 1360 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.087929154011879e-05, + "loss": 0.1379, + "step": 1362 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.0893211604083311e-05, + "loss": 0.0743, + "step": 1364 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.090712992358622e-05, + "loss": 0.0428, + "step": 1366 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.2061, + "step": 1368 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0934961220479537e-05, + "loss": 0.0133, + "step": 1370 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.3065, + "step": 1372 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0962785213378325e-05, + "loss": 0.1321, + "step": 1374 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.0024, + "step": 1376 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.0990601684919282e-05, + "loss": 0.0076, + "step": 1378 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.1004507032270544e-05, + "loss": 0.0625, + "step": 1380 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1018410417797809e-05, + "loss": 0.6806, + "step": 1382 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.0053, + "step": 1384 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1046211194769784e-05, + "loss": 0.5405, + "step": 1386 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.0408, + "step": 1388 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1074003798653215e-05, + "loss": 0.0295, + "step": 1390 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.1385, + "step": 1392 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.1101788012330013e-05, + "loss": 0.3896, + "step": 1394 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.111567690500938e-05, + "loss": 0.0327, + "step": 1396 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1129563618747581e-05, + "loss": 0.1067, + "step": 1398 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.0099, + "step": 1400 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1157330400920563e-05, + "loss": 0.0985, + "step": 1402 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.3519, + "step": 1404 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1185088141932594e-05, + "loss": 0.0387, + "step": 1406 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.1586, + "step": 1408 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.121283662493788e-05, + "loss": 0.0718, + "step": 1410 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.122670732694342e-05, + "loss": 0.1644, + "step": 1412 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1240575633162958e-05, + "loss": 0.0021, + "step": 1414 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.1081, + "step": 1416 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1268304949908434e-05, + "loss": 0.0654, + "step": 1418 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.3353, + "step": 1420 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1296024358550565e-05, + "loss": 0.1529, + "step": 1422 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.1378, + "step": 1424 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1323733642543024e-05, + "loss": 0.0109, + "step": 1426 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1337584420146496e-05, + "loss": 0.3522, + "step": 1428 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.135143258541862e-05, + "loss": 0.3519, + "step": 1430 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.1899, + "step": 1432 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.13791209707909e-05, + "loss": 0.0443, + "step": 1434 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.0176, + "step": 1436 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.1406798582355902e-05, + "loss": 0.0162, + "step": 1438 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.375, + "step": 1440 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1434465203893818e-05, + "loss": 0.0172, + "step": 1442 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1448294325857377e-05, + "loss": 0.2859, + "step": 1444 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.146212061927074e-05, + "loss": 0.1033, + "step": 1446 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.0001, + "step": 1448 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.1489764612440255e-05, + "loss": 0.0172, + "step": 1450 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.150358225820707e-05, + "loss": 0.0002, + "step": 1452 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.151739696744518e-05, + "loss": 0.1222, + "step": 1454 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.5667, + "step": 1456 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1545017468419307e-05, + "loss": 0.3068, + "step": 1458 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1558823206211887e-05, + "loss": 0.0363, + "step": 1460 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1572625899588972e-05, + "loss": 0.0006, + "step": 1462 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.0543, + "step": 1464 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1600222045274809e-05, + "loss": 0.0042, + "step": 1466 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.2639, + "step": 1468 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1627805689893478e-05, + "loss": 0.0226, + "step": 1470 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.0926, + "step": 1472 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1655376617959239e-05, + "loss": 0.1277, + "step": 1474 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1669157245972616e-05, + "loss": 0.0971, + "step": 1476 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1682934614085708e-05, + "loss": 0.0073, + "step": 1478 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.038, + "step": 1480 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1710479462987565e-05, + "loss": 0.0627, + "step": 1482 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.1811, + "step": 1484 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1738010949482152e-05, + "loss": 0.0115, + "step": 1486 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.0709, + "step": 1488 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.176552885849122e-05, + "loss": 0.0677, + "step": 1490 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.1779282654255668e-05, + "loss": 0.1108, + "step": 1492 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.1793032975042563e-05, + "loss": 0.0709, + "step": 1494 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.7068, + "step": 1496 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1820523084271775e-05, + "loss": 0.0223, + "step": 1498 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.0259, + "step": 1500 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1847998971423835e-05, + "loss": 0.0188, + "step": 1502 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.0196, + "step": 1504 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.1875460421854816e-05, + "loss": 0.0047, + "step": 1506 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.188918566625449e-05, + "loss": 0.2957, + "step": 1508 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1902907221033629e-05, + "loss": 0.0958, + "step": 1510 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.0485, + "step": 1512 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1930339154543582e-05, + "loss": 0.0089, + "step": 1514 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.6677, + "step": 1516 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1957756008084127e-05, + "loss": 0.0001, + "step": 1518 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.1564, + "step": 1520 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1985157567472563e-05, + "loss": 0.3061, + "step": 1522 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1998852544960256e-05, + "loss": 0.0003, + "step": 1524 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2012543618645622e-05, + "loss": 0.0076, + "step": 1526 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.0786, + "step": 1528 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2039913947661205e-05, + "loss": 0.0005, + "step": 1530 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.0924, + "step": 1532 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2067268340700016e-05, + "loss": 0.7579, + "step": 1534 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.0124, + "step": 1536 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.2094606584067304e-05, + "loss": 0.0419, + "step": 1538 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.210826958287895e-05, + "loss": 0.27, + "step": 1540 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.212192846419443e-05, + "loss": 0.7328, + "step": 1542 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.0012, + "step": 1544 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2149233767640587e-05, + "loss": 0.0772, + "step": 1546 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.0087, + "step": 1548 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2176522281094514e-05, + "loss": 0.0162, + "step": 1550 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.0201, + "step": 1552 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.220379379137607e-05, + "loss": 0.0114, + "step": 1554 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.2217423103744692e-05, + "loss": 0.2302, + "step": 1556 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2231048085437953e-05, + "loss": 0.0928, + "step": 1558 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.0793, + "step": 1560 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2258284950367347e-05, + "loss": 0.0209, + "step": 1562 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.1783, + "step": 1564 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.228550417338764e-05, + "loss": 0.3526, + "step": 1566 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.0725, + "step": 1568 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2312705541859985e-05, + "loss": 0.4456, + "step": 1570 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2326299464229143e-05, + "loss": 0.0093, + "step": 1572 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2339888843285029e-05, + "loss": 0.401, + "step": 1574 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.0227, + "step": 1576 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2367053865304597e-05, + "loss": 0.0092, + "step": 1578 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.1621, + "step": 1580 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2394200395703273e-05, + "loss": 0.0731, + "step": 1582 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.073, + "step": 1584 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2421328222410109e-05, + "loss": 0.2418, + "step": 1586 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2434885055646808e-05, + "loss": 0.1361, + "step": 1588 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2448437133500262e-05, + "loss": 0.182, + "step": 1590 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.017, + "step": 1592 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2475526917196703e-05, + "loss": 0.0634, + "step": 1594 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.0284, + "step": 1596 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2502597361871787e-05, + "loss": 0.5452, + "step": 1598 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.0007, + "step": 1600 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2529648256048931e-05, + "loss": 0.038, + "step": 1602 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2543166305656089e-05, + "loss": 0.1361, + "step": 1604 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.2556679388404351e-05, + "loss": 0.449, + "step": 1606 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.4011, + "step": 1608 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.2583690547768584e-05, + "loss": 0.1383, + "step": 1610 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.259718857163117e-05, + "loss": 0.1093, + "step": 1612 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.261068152312821e-05, + "loss": 0.0512, + "step": 1614 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.0713, + "step": 1616 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2637652103627481e-05, + "loss": 0.0253, + "step": 1618 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2651129679955598e-05, + "loss": 0.2472, + "step": 1620 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2664602078570017e-05, + "loss": 0.0071, + "step": 1622 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.0206, + "step": 1624 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2691531237420369e-05, + "loss": 0.0041, + "step": 1626 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.0792, + "step": 1628 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.27184393698057e-05, + "loss": 0.0099, + "step": 1630 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.7459, + "step": 1632 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2745326265517481e-05, + "loss": 0.0352, + "step": 1634 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2758761683975929e-05, + "loss": 0.2075, + "step": 1636 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.277219171451304e-05, + "loss": 0.1902, + "step": 1638 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.1563, + "step": 1640 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2799035506917265e-05, + "loss": 0.7621, + "step": 1642 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.1907, + "step": 1644 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2825857433024208e-05, + "loss": 0.1192, + "step": 1646 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.0165, + "step": 1648 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2852657283298794e-05, + "loss": 0.2252, + "step": 1650 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2866048864566336e-05, + "loss": 0.3499, + "step": 1652 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2879434848378356e-05, + "loss": 0.1238, + "step": 1654 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.1586, + "step": 1656 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2906189919074336e-05, + "loss": 0.0737, + "step": 1658 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.0002, + "step": 1660 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.293292228637389e-05, + "loss": 0.0098, + "step": 1662 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.1109, + "step": 1664 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2959631741441583e-05, + "loss": 0.1065, + "step": 1666 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2972977811676289e-05, + "loss": 0.0544, + "step": 1668 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.298631807562092e-05, + "loss": 0.1201, + "step": 1670 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.0611, + "step": 1672 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3012981080436036e-05, + "loss": 0.0062, + "step": 1674 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.0018, + "step": 1676 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3039620547593357e-05, + "loss": 0.0014, + "step": 1678 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.1112, + "step": 1680 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3066236268983143e-05, + "loss": 1.1674, + "step": 1682 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3079535160031601e-05, + "loss": 0.0552, + "step": 1684 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3092828036681178e-05, + "loss": 0.5078, + "step": 1686 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.4399, + "step": 1688 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.3119395642950348e-05, + "loss": 0.0199, + "step": 1690 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.313267032068285e-05, + "loss": 0.1788, + "step": 1692 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3145938880242346e-05, + "loss": 0.0047, + "step": 1694 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.0052, + "step": 1696 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3172457541199188e-05, + "loss": 0.0013, + "step": 1698 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3185707590804997e-05, + "loss": 0.3553, + "step": 1700 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.3198951418654882e-05, + "loss": 0.0002, + "step": 1702 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.1041, + "step": 1704 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.322542030563709e-05, + "loss": 0.0873, + "step": 1706 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.0052, + "step": 1708 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.3251863995368665e-05, + "loss": 0.2856, + "step": 1710 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.0982, + "step": 1712 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3278282281269293e-05, + "loss": 0.0113, + "step": 1714 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3291481833280894e-05, + "loss": 0.1332, + "step": 1716 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3304674956957167e-05, + "loss": 0.0089, + "step": 1718 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.0211, + "step": 1720 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3331041816250503e-05, + "loss": 0.0081, + "step": 1722 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.0178, + "step": 1724 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.335738265316921e-05, + "loss": 0.5472, + "step": 1726 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.2749, + "step": 1728 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3383697261936472e-05, + "loss": 0.0019, + "step": 1730 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3396844666514062e-05, + "loss": 0.0025, + "step": 1732 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3409985436980422e-05, + "loss": 0.0158, + "step": 1734 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.0113, + "step": 1736 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.3436246972935638e-05, + "loss": 0.0287, + "step": 1738 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.344936768713513e-05, + "loss": 0.0227, + "step": 1740 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.346248166464481e-05, + "loss": 0.048, + "step": 1742 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.2022, + "step": 1744 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.348868930716039e-05, + "loss": 0.0577, + "step": 1746 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.3501782920982189e-05, + "loss": 0.2272, + "step": 1748 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3514869695746078e-05, + "loss": 0.0979, + "step": 1750 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.8477, + "step": 1752 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3541022625878501e-05, + "loss": 0.0125, + "step": 1754 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.5926, + "step": 1756 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3567147893248833e-05, + "loss": 0.5768, + "step": 1758 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.0208, + "step": 1760 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3593245293764303e-05, + "loss": 0.0211, + "step": 1762 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3606283480231962e-05, + "loss": 0.0798, + "step": 1764 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.361931462354984e-05, + "loss": 0.2642, + "step": 1766 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.4483, + "step": 1768 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3645355678949715e-05, + "loss": 0.0456, + "step": 1770 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.332, + "step": 1772 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.3671368256529026e-05, + "loss": 0.1535, + "step": 1774 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.2509, + "step": 1776 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3697352153075365e-05, + "loss": 0.1444, + "step": 1778 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3710333282518497e-05, + "loss": 0.3762, + "step": 1780 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3723307165600361e-05, + "loss": 0.753, + "step": 1782 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.2411, + "step": 1784 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3749233091341344e-05, + "loss": 0.1482, + "step": 1786 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.4209, + "step": 1788 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3775129727762808e-05, + "loss": 0.1017, + "step": 1790 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.0795, + "step": 1792 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3800996872558075e-05, + "loss": 0.0416, + "step": 1794 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3813919322438018e-05, + "loss": 0.1014, + "step": 1796 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3826834323650899e-05, + "loss": 0.3639, + "step": 1798 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.075, + "step": 1800 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3852641879196952e-05, + "loss": 0.0462, + "step": 1802 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.0759, + "step": 1804 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.387841933758546e-05, + "loss": 0.0586, + "step": 1806 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.013, + "step": 1808 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.3904166497440812e-05, + "loss": 0.3006, + "step": 1810 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.391702865255334e-05, + "loss": 0.0123, + "step": 1812 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3929883157624046e-05, + "loss": 0.0383, + "step": 1814 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.3063, + "step": 1816 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3955569117234468e-05, + "loss": 0.0001, + "step": 1818 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.0077, + "step": 1820 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3981224175611265e-05, + "loss": 0.0787, + "step": 1822 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.2579, + "step": 1824 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4006848132334979e-05, + "loss": 0.0016, + "step": 1826 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4019648385012245e-05, + "loss": 0.1764, + "step": 1828 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4032440787229135e-05, + "loss": 0.0178, + "step": 1830 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.313, + "step": 1832 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4058001940361781e-05, + "loss": 0.704, + "step": 1834 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.1353, + "step": 1836 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.40835313920471e-05, + "loss": 0.0537, + "step": 1838 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.119, + "step": 1840 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4109028942846888e-05, + "loss": 0.0091, + "step": 1842 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4121765693158355e-05, + "loss": 0.2878, + "step": 1844 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4134494393572146e-05, + "loss": 0.0048, + "step": 1846 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.0178, + "step": 1848 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4159927545284697e-05, + "loss": 0.1082, + "step": 1850 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.2641, + "step": 1852 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4185328199298636e-05, + "loss": 0.0003, + "step": 1854 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.0971, + "step": 1856 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4210696157181936e-05, + "loss": 0.307, + "step": 1858 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4223367813134406e-05, + "loss": 0.0112, + "step": 1860 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4236031220758037e-05, + "loss": 0.0644, + "step": 1862 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.0509, + "step": 1864 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.426133319210731e-05, + "loss": 0.1905, + "step": 1866 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.0003, + "step": 1868 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.4286601873568642e-05, + "loss": 0.5313, + "step": 1870 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.0208, + "step": 1872 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.431183706774103e-05, + "loss": 0.0846, + "step": 1874 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.4324442045477534e-05, + "loss": 0.426, + "step": 1876 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4337038577485035e-05, + "loss": 0.0853, + "step": 1878 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.0576, + "step": 1880 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.436220620592437e-05, + "loss": 0.0242, + "step": 1882 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.1074, + "step": 1884 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4387339756447422e-05, + "loss": 0.0319, + "step": 1886 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.006, + "step": 1888 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4412439032708848e-05, + "loss": 0.0072, + "step": 1890 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4424975756706684e-05, + "loss": 0.0112, + "step": 1892 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4437503838631002e-05, + "loss": 0.1043, + "step": 1894 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.3608, + "step": 1896 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4462533978405529e-05, + "loss": 0.3062, + "step": 1898 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.0303, + "step": 1900 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4487529256494937e-05, + "loss": 0.4396, + "step": 1902 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.1742, + "step": 1904 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4512489477634024e-05, + "loss": 0.0266, + "step": 1906 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4524956380901674e-05, + "loss": 0.2346, + "step": 1908 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.4537414446831461e-05, + "loss": 0.3579, + "step": 1910 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.0192, + "step": 1912 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4562303969371357e-05, + "loss": 0.0793, + "step": 1914 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.0393, + "step": 1916 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4587157850814679e-05, + "loss": 0.8558, + "step": 1918 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.2762, + "step": 1920 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4611975897000849e-05, + "loss": 0.1076, + "step": 1922 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4624371421273812e-05, + "loss": 0.0033, + "step": 1924 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.463675791404922e-05, + "loss": 0.1745, + "step": 1926 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.0073, + "step": 1928 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4661503708360652e-05, + "loss": 0.023, + "step": 1930 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.2686, + "step": 1932 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4686213086618932e-05, + "loss": 0.1671, + "step": 1934 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.1591, + "step": 1936 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4710885855792338e-05, + "loss": 0.0202, + "step": 1938 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4723208451727977e-05, + "loss": 0.0404, + "step": 1940 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4735521823135184e-05, + "loss": 0.2897, + "step": 1942 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.0286, + "step": 1944 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4760120796189233e-05, + "loss": 0.0035, + "step": 1946 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.0511, + "step": 1948 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4784682582785254e-05, + "loss": 0.0396, + "step": 1950 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.2095, + "step": 1952 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4809206991044571e-05, + "loss": 0.0011, + "step": 1954 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4821455118415666e-05, + "loss": 0.1327, + "step": 1956 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4833693829380458e-05, + "loss": 0.1325, + "step": 1958 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.0092, + "step": 1960 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4858142906499686e-05, + "loss": 0.0204, + "step": 1962 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.0405, + "step": 1964 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4882554031404075e-05, + "loss": 0.2306, + "step": 1966 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.017, + "step": 1968 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4906927013391879e-05, + "loss": 0.1708, + "step": 1970 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4919099141279205e-05, + "loss": 0.0055, + "step": 1972 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4931261662059338e-05, + "loss": 0.0158, + "step": 1974 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.2075, + "step": 1976 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4955557787302151e-05, + "loss": 0.0173, + "step": 1978 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.7033, + "step": 1980 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.4979815199317005e-05, + "loss": 0.1084, + "step": 1982 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.2574, + "step": 1984 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5004033708602967e-05, + "loss": 0.3759, + "step": 1986 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5016128315586626e-05, + "loss": 0.0349, + "step": 1988 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5028213125963029e-05, + "loss": 0.0128, + "step": 1990 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.082, + "step": 1992 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5052353262505603e-05, + "loss": 0.3515, + "step": 1994 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.0098, + "step": 1996 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5076453929645933e-05, + "loss": 0.015, + "step": 1998 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.3411, + "step": 2000 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.510051493910759e-05, + "loss": 0.3925, + "step": 2002 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.5112530513457229e-05, + "loss": 0.376, + "step": 2004 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.512453610292401e-05, + "loss": 0.0255, + "step": 2006 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.0254, + "step": 2008 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.514851723343985e-05, + "loss": 0.0322, + "step": 2010 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.0725, + "step": 2012 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5172458143312522e-05, + "loss": 0.5022, + "step": 2014 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.1864, + "step": 2016 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5196358645513685e-05, + "loss": 0.0871, + "step": 2018 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5208293685377354e-05, + "loss": 0.0126, + "step": 2020 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5220218553330618e-05, + "loss": 0.9182, + "step": 2022 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.0128, + "step": 2024 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5244037680367744e-05, + "loss": 0.7652, + "step": 2026 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.1148, + "step": 2028 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.5267815840548057e-05, + "loss": 0.376, + "step": 2030 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.0529, + "step": 2032 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.529155284811463e-05, + "loss": 0.0534, + "step": 2034 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.5303405861706574e-05, + "loss": 0.0843, + "step": 2036 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.5315248517631975e-05, + "loss": 0.0413, + "step": 2038 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.0487, + "step": 2040 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5338902663987544e-05, + "loss": 0.0434, + "step": 2042 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.0172, + "step": 2044 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5362515102393217e-05, + "loss": 0.1813, + "step": 2046 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.2477, + "step": 2048 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.5386085648386656e-05, + "loss": 0.0221, + "step": 2050 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.539785515417376e-05, + "loss": 0.8005, + "step": 2052 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.540961411783279e-05, + "loss": 0.0089, + "step": 2054 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.0495, + "step": 2056 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5433100326925288e-05, + "loss": 0.2361, + "step": 2058 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.4373, + "step": 2060 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.545654409218793e-05, + "loss": 0.7688, + "step": 2062 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.0789, + "step": 2064 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5479945230476066e-05, + "loss": 0.0225, + "step": 2066 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5491629757363026e-05, + "loss": 0.2234, + "step": 2068 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.550330355897809e-05, + "loss": 0.0076, + "step": 2070 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.0, + "step": 2072 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5526618895216793e-05, + "loss": 0.1215, + "step": 2074 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.1669, + "step": 2076 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5549891057050837e-05, + "loss": 0.0229, + "step": 2078 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.8524, + "step": 2080 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.557311986267615e-05, + "loss": 0.2305, + "step": 2082 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.5584717950189353e-05, + "loss": 0.0207, + "step": 2084 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5596305130627404e-05, + "loss": 0.037, + "step": 2086 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.3288, + "step": 2088 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.5619446679779357e-05, + "loss": 0.068, + "step": 2090 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.563100100329731e-05, + "loss": 0.034, + "step": 2092 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.564254432934829e-05, + "loss": 0.0406, + "step": 2094 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.0152, + "step": 2096 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.5665597898893484e-05, + "loss": 0.1609, + "step": 2098 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.567710809736356e-05, + "loss": 0.0404, + "step": 2100 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.568860720831853e-05, + "loss": 0.0053, + "step": 2102 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.0544, + "step": 2104 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.5711572077872774e-05, + "loss": 0.3045, + "step": 2106 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.572303779162118e-05, + "loss": 0.0292, + "step": 2108 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.573449232815279e-05, + "loss": 0.1737, + "step": 2110 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.0499, + "step": 2112 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5757367780103666e-05, + "loss": 0.0322, + "step": 2114 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5768788650846677e-05, + "loss": 0.2819, + "step": 2116 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5780198255020478e-05, + "loss": 0.0313, + "step": 2118 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.0816, + "step": 2120 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.580298357454965e-05, + "loss": 0.0935, + "step": 2122 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.581435924540481e-05, + "loss": 0.0074, + "step": 2124 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5825723560690403e-05, + "loss": 0.0657, + "step": 2126 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.2391, + "step": 2128 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5848418035796068e-05, + "loss": 0.563, + "step": 2130 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5859748151293333e-05, + "loss": 0.099, + "step": 2132 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.587106682257552e-05, + "loss": 0.13, + "step": 2134 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.1899, + "step": 2136 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5893669744094577e-05, + "loss": 0.1267, + "step": 2138 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.0223, + "step": 2140 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.591622662377734e-05, + "loss": 0.1586, + "step": 2142 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.0031, + "step": 2144 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.5938737285407567e-05, + "loss": 0.1374, + "step": 2146 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.594997522948412e-05, + "loss": 0.0509, + "step": 2148 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5961201553130148e-05, + "loss": 0.0371, + "step": 2150 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.0287, + "step": 2152 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.598361925145234e-05, + "loss": 0.0179, + "step": 2154 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.599481058234626e-05, + "loss": 0.1267, + "step": 2156 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.6005990205245216e-05, + "loss": 0.0959, + "step": 2158 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.6807, + "step": 2160 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.602831423974506e-05, + "loss": 0.0773, + "step": 2162 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.6039458607746607e-05, + "loss": 0.0394, + "step": 2164 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.6050591180554648e-05, + "loss": 0.8808, + "step": 2166 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.0076, + "step": 2168 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6072820853644677e-05, + "loss": 0.0715, + "step": 2170 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.6604, + "step": 2172 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6095003085355082e-05, + "loss": 0.0471, + "step": 2174 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.0405, + "step": 2176 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.611713770239646e-05, + "loss": 0.3565, + "step": 2178 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.6128187101364982e-05, + "loss": 0.0986, + "step": 2180 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6139224531851332e-05, + "loss": 0.2505, + "step": 2182 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.0508, + "step": 2184 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.616126340117555e-05, + "loss": 0.0227, + "step": 2186 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.617226479697104e-05, + "loss": 0.7222, + "step": 2188 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.618325413819966e-05, + "loss": 0.0464, + "step": 2190 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.1743, + "step": 2192 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.6205196571130194e-05, + "loss": 0.0255, + "step": 2194 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.621614961997806e-05, + "loss": 0.0087, + "step": 2196 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6227090528551034e-05, + "loss": 0.0372, + "step": 2198 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.1587, + "step": 2200 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.62489358394248e-05, + "loss": 0.0001, + "step": 2202 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.0002, + "step": 2204 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6270732333094095e-05, + "loss": 0.0149, + "step": 2206 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.0033, + "step": 2208 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6292479839282897e-05, + "loss": 0.011, + "step": 2210 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6303335168965474e-05, + "loss": 0.0469, + "step": 2212 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6314178188097907e-05, + "loss": 0.8533, + "step": 2214 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.0111, + "step": 2216 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6335827210029816e-05, + "loss": 0.4627, + "step": 2218 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.1509, + "step": 2220 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.635742673595467e-05, + "loss": 0.0619, + "step": 2222 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.2355, + "step": 2224 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6378976597135173e-05, + "loss": 0.1696, + "step": 2226 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6389732850821957e-05, + "loss": 0.0414, + "step": 2228 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.640047662522205e-05, + "loss": 0.1448, + "step": 2230 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.0393, + "step": 2232 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6421926652255282e-05, + "loss": 0.0215, + "step": 2234 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.0649, + "step": 2236 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6443326510665474e-05, + "loss": 0.0048, + "step": 2238 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.1349, + "step": 2240 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.646467603327518e-05, + "loss": 0.0508, + "step": 2242 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.6475331866519377e-05, + "loss": 0.0374, + "step": 2244 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6485975053300154e-05, + "loss": 0.0911, + "step": 2246 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.0823, + "step": 2248 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.650722340435067e-05, + "loss": 0.007, + "step": 2250 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.0105, + "step": 2252 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.652842092043287e-05, + "loss": 0.0307, + "step": 2254 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.0462, + "step": 2256 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6549567435950004e-05, + "loss": 0.0042, + "step": 2258 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6560121516856586e-05, + "loss": 0.3596, + "step": 2260 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6570662785703713e-05, + "loss": 0.1873, + "step": 2262 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.1086, + "step": 2264 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6591706804895408e-05, + "loss": 0.0256, + "step": 2266 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.0792, + "step": 2268 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6612699329127457e-05, + "loss": 0.2814, + "step": 2270 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.0726, + "step": 2272 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6633640194404523e-05, + "loss": 0.1189, + "step": 2274 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6644091203796694e-05, + "loss": 0.0176, + "step": 2276 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6654529237134816e-05, + "loss": 0.0, + "step": 2278 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.0279, + "step": 2280 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.667536629413143e-05, + "loss": 0.0345, + "step": 2282 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.1605, + "step": 2284 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6696151202613527e-05, + "loss": 1.1588, + "step": 2286 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.0321, + "step": 2288 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6716883800207685e-05, + "loss": 0.0055, + "step": 2290 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6727230431791806e-05, + "loss": 0.0229, + "step": 2292 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.673756392494915e-05, + "loss": 0.0285, + "step": 2294 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.037, + "step": 2296 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6758191415283063e-05, + "loss": 0.0124, + "step": 2298 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.0456, + "step": 2300 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6778766110065755e-05, + "loss": 0.009, + "step": 2302 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.0407, + "step": 2304 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6799287848566e-05, + "loss": 0.0274, + "step": 2306 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6809528809094798e-05, + "loss": 0.0178, + "step": 2308 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6819756470466305e-05, + "loss": 0.0825, + "step": 2310 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.0425, + "step": 2312 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.684017181586408e-05, + "loss": 1.181, + "step": 2314 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.0232, + "step": 2316 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6860533725272943e-05, + "loss": 0.6878, + "step": 2318 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.5161, + "step": 2320 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.6880842039624e-05, + "loss": 0.0775, + "step": 2322 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.689097604905826e-05, + "loss": 0.7654, + "step": 2324 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6901096600267e-05, + "loss": 0.0003, + "step": 2326 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 1.2219, + "step": 2328 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6921297248971645e-05, + "loss": 0.0822, + "step": 2330 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.3522, + "step": 2332 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.6941443827928778e-05, + "loss": 0.315, + "step": 2334 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.8148, + "step": 2336 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6961536179751672e-05, + "loss": 0.1129, + "step": 2338 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6971561971420222e-05, + "loss": 0.2475, + "step": 2340 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6981574147477204e-05, + "loss": 0.0066, + "step": 2342 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.3134, + "step": 2344 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.70015575745671e-05, + "loss": 0.2156, + "step": 2346 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.701152878657196e-05, + "loss": 0.018, + "step": 2348 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7021486304909196e-05, + "loss": 0.2065, + "step": 2350 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.2374, + "step": 2352 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.7041360182818583e-05, + "loss": 0.1077, + "step": 2354 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.705127650357662e-05, + "loss": 0.0773, + "step": 2356 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7061179053038887e-05, + "loss": 0.0739, + "step": 2358 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.5686, + "step": 2360 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.708094276074343e-05, + "loss": 0.0331, + "step": 2362 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.7307, + "step": 2364 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7100651151536525e-05, + "loss": 0.9878, + "step": 2366 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.1265, + "step": 2368 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.712030407145457e-05, + "loss": 0.2579, + "step": 2370 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.713010968184029e-05, + "loss": 0.0519, + "step": 2372 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7139901366967332e-05, + "loss": 0.2155, + "step": 2374 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.0591, + "step": 2376 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.71594428849791e-05, + "loss": 0.532, + "step": 2378 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.716919267969883e-05, + "loss": 0.0326, + "step": 2380 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.717892847282994e-05, + "loss": 0.1499, + "step": 2382 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.0671, + "step": 2384 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7198357978296817e-05, + "loss": 0.1884, + "step": 2386 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7208051652686338e-05, + "loss": 0.1494, + "step": 2388 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.721773124959481e-05, + "loss": 0.0799, + "step": 2390 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.0146, + "step": 2392 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.723704813537834e-05, + "loss": 0.3149, + "step": 2394 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.0255, + "step": 2396 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.725630848474229e-05, + "loss": 0.035, + "step": 2398 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.0267, + "step": 2400 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.727551214722321e-05, + "loss": 0.258, + "step": 2402 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.7285092673983753e-05, + "loss": 0.0134, + "step": 2404 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.7294658972800488e-05, + "loss": 0.0055, + "step": 2406 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.1337, + "step": 2408 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7313748811897558e-05, + "loss": 0.1067, + "step": 2410 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.1671, + "step": 2412 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.7332781515383003e-05, + "loss": 0.001, + "step": 2414 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.0542, + "step": 2416 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.7351756934571758e-05, + "loss": 0.0088, + "step": 2418 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.736122311621314e-05, + "loss": 0.7743, + "step": 2420 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.7370674921226296e-05, + "loss": 0.0347, + "step": 2422 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.0074, + "step": 2424 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7389535327557733e-05, + "loss": 0.2156, + "step": 2426 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.262, + "step": 2428 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7408338006227005e-05, + "loss": 0.0713, + "step": 2430 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.3025, + "step": 2432 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.7427082810346018e-05, + "loss": 0.0829, + "step": 2434 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.743643346367026e-05, + "loss": 0.2072, + "step": 2436 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.744576959347884e-05, + "loss": 0.1665, + "step": 2438 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.2431, + "step": 2440 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.746439820964275e-05, + "loss": 0.0572, + "step": 2442 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.0465, + "step": 2444 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.748296851330945e-05, + "loss": 0.3216, + "step": 2446 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.0379, + "step": 2448 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7501480359406217e-05, + "loss": 0.4012, + "step": 2450 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7510714315655474e-05, + "loss": 0.0423, + "step": 2452 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.7519933603316955e-05, + "loss": 0.1227, + "step": 2454 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.0445, + "step": 2456 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7538328100883397e-05, + "loss": 0.1742, + "step": 2458 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.0249, + "step": 2460 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.7556663708406193e-05, + "loss": 0.2926, + "step": 2462 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.0278, + "step": 2464 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.7574940282646085e-05, + "loss": 0.0118, + "step": 2466 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.758405638764873e-05, + "loss": 0.1087, + "step": 2468 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7593157680824946e-05, + "loss": 0.011, + "step": 2470 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.011, + "step": 2472 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.761131576062694e-05, + "loss": 0.0397, + "step": 2474 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.0694, + "step": 2476 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7629414380199662e-05, + "loss": 0.0144, + "step": 2478 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.0225, + "step": 2480 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7647453398155194e-05, + "loss": 0.4899, + "step": 2482 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7656450512470077e-05, + "loss": 0.0877, + "step": 2484 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7665432673571218e-05, + "loss": 0.0933, + "step": 2486 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.0636, + "step": 2488 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.768335206599217e-05, + "loss": 0.0615, + "step": 2490 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.7692289262315e-05, + "loss": 0.0174, + "step": 2492 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.7701211435430256e-05, + "loss": 0.0299, + "step": 2494 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.0735, + "step": 2496 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.771901064236659e-05, + "loss": 0.0458, + "step": 2498 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.7727887641425448e-05, + "loss": 0.0076, + "step": 2500 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.773674954775232e-05, + "loss": 1.2352, + "step": 2502 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.3141, + "step": 2504 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7754428013009637e-05, + "loss": 0.0515, + "step": 2506 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.0608, + "step": 2508 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.77720459000329e-05, + "loss": 0.0058, + "step": 2510 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.5101, + "step": 2512 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7789603071189712e-05, + "loss": 0.2926, + "step": 2514 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7798358845437754e-05, + "loss": 0.0888, + "step": 2516 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.780709938932202e-05, + "loss": 0.0141, + "step": 2518 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.3146, + "step": 2520 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7824534717747115e-05, + "loss": 0.1866, + "step": 2522 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.2681, + "step": 2524 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7841908920258767e-05, + "loss": 0.02, + "step": 2526 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.0358, + "step": 2528 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.7859221861128284e-05, + "loss": 0.7081, + "step": 2530 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.786785531616285e-05, + "loss": 0.1328, + "step": 2532 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7876473405105528e-05, + "loss": 0.1503, + "step": 2534 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.0797, + "step": 2536 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.7893663417419995e-05, + "loss": 0.9898, + "step": 2538 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.790223530721933e-05, + "loss": 0.0964, + "step": 2540 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791079176378191e-05, + "loss": 0.0835, + "step": 2542 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.2102, + "step": 2544 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7927858310383202e-05, + "loss": 0.1048, + "step": 2546 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7936368367090577e-05, + "loss": 0.4835, + "step": 2548 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.794486292389858e-05, + "loss": 0.1793, + "step": 2550 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.4255, + "step": 2552 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7961805471486618e-05, + "loss": 0.172, + "step": 2554 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.7454, + "step": 2556 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.797868582079072e-05, + "loss": 0.0704, + "step": 2558 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.5617, + "step": 2560 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.7995503839940197e-05, + "loss": 0.0282, + "step": 2562 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.800388943463047e-05, + "loss": 0.1281, + "step": 2564 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8012259397551283e-05, + "loss": 0.1239, + "step": 2566 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.1996, + "step": 2568 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.8028952362728197e-05, + "loss": 0.101, + "step": 2570 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.803727533238257e-05, + "loss": 0.1851, + "step": 2572 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.804558260506409e-05, + "loss": 0.1223, + "step": 2574 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.4868, + "step": 2576 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8062149994642135e-05, + "loss": 0.3604, + "step": 2578 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8070410079182195e-05, + "loss": 0.4328, + "step": 2580 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8078654402036526e-05, + "loss": 0.1593, + "step": 2582 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.3842, + "step": 2584 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8095095698313452e-05, + "loss": 0.1867, + "step": 2586 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.2476, + "step": 2588 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811147375503214e-05, + "loss": 0.0735, + "step": 2590 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.0911, + "step": 2592 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.812778844424587e-05, + "loss": 0.4232, + "step": 2594 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.813592198619035e-05, + "loss": 0.051, + "step": 2596 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.814403963850293e-05, + "loss": 0.1655, + "step": 2598 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.3857, + "step": 2600 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.8160227210847636e-05, + "loss": 0.0127, + "step": 2602 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.816829709926509e-05, + "loss": 0.0623, + "step": 2604 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8176351034821345e-05, + "loss": 0.0373, + "step": 2606 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.0338, + "step": 2608 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.819241098446341e-05, + "loss": 0.1043, + "step": 2610 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.8200416967183785e-05, + "loss": 0.0127, + "step": 2612 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.8208406934312167e-05, + "loss": 0.19, + "step": 2614 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.1875, + "step": 2616 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8224338759405917e-05, + "loss": 0.0708, + "step": 2618 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.0396, + "step": 2620 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8240206335283947e-05, + "loss": 0.1284, + "step": 2622 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.2847, + "step": 2624 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.825600953798743e-05, + "loss": 0.0676, + "step": 2626 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.8263886960799055e-05, + "loss": 0.3971, + "step": 2628 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8271748244060426e-05, + "loss": 0.0287, + "step": 2630 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.0145, + "step": 2632 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.8287422330550878e-05, + "loss": 0.018, + "step": 2634 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.829523510316813e-05, + "loss": 0.0196, + "step": 2636 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8303031675011515e-05, + "loss": 0.7601, + "step": 2638 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.0066, + "step": 2640 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.8318576155500838e-05, + "loss": 0.1848, + "step": 2642 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.832632403378808e-05, + "loss": 0.0973, + "step": 2644 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.8334055650584094e-05, + "loss": 0.0035, + "step": 2646 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.21, + "step": 2648 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8349470039334173e-05, + "loss": 0.1423, + "step": 2650 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.0758, + "step": 2652 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.83648192013326e-05, + "loss": 0.089, + "step": 2654 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.7896, + "step": 2656 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8380103016670437e-05, + "loss": 0.0826, + "step": 2658 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8387720382009665e-05, + "loss": 0.0497, + "step": 2660 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.839532136594927e-05, + "loss": 0.7377, + "step": 2662 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.1807, + "step": 2664 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8410474130282085e-05, + "loss": 0.3913, + "step": 2666 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.0464, + "step": 2668 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8425561191294217e-05, + "loss": 0.1641, + "step": 2670 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.187, + "step": 2672 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.8440582431124325e-05, + "loss": 0.0163, + "step": 2674 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.844806833140501e-05, + "loss": 0.0026, + "step": 2676 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8455537732425223e-05, + "loss": 0.3318, + "step": 2678 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.7395, + "step": 2680 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.847042697836485e-05, + "loss": 0.9306, + "step": 2682 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.258, + "step": 2684 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.84852500526272e-05, + "loss": 0.1334, + "step": 2686 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.0462, + "step": 2688 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.8500006839413183e-05, + "loss": 0.0606, + "step": 2690 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.85073603389569e-05, + "loss": 0.0308, + "step": 2692 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.851469722344155e-05, + "loss": 0.2573, + "step": 2694 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.1572, + "step": 2696 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8529321089949817e-05, + "loss": 0.2022, + "step": 2698 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.194, + "step": 2700 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.8543878324695122e-05, + "loss": 0.475, + "step": 2702 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.1744, + "step": 2704 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.8558368813955143e-05, + "loss": 0.1668, + "step": 2706 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.856558899363248e-05, + "loss": 0.0189, + "step": 2708 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.857279244452896e-05, + "loss": 0.2642, + "step": 2710 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.0569, + "step": 2712 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.8587149103738e-05, + "loss": 0.2193, + "step": 2714 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.85943022840117e-05, + "loss": 0.5457, + "step": 2716 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8601438679426847e-05, + "loss": 0.1581, + "step": 2718 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.3804, + "step": 2720 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.8615661059964134e-05, + "loss": 0.0187, + "step": 2722 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.862274701730967e-05, + "loss": 0.0464, + "step": 2724 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.862981613424347e-05, + "loss": 0.0747, + "step": 2726 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.0502, + "step": 2728 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.864390379168423e-05, + "loss": 0.2717, + "step": 2730 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.865092230467769e-05, + "loss": 0.3331, + "step": 2732 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.8657923922232464e-05, + "loss": 0.0152, + "step": 2734 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.401, + "step": 2736 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8671876416361763e-05, + "loss": 0.0083, + "step": 2738 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.0975, + "step": 2740 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8685761165074073e-05, + "loss": 0.1924, + "step": 2742 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.0366, + "step": 2744 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.869957805990059e-05, + "loss": 0.0624, + "step": 2746 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.87064610283551e-05, + "loss": 0.0136, + "step": 2748 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.87133269929026e-05, + "loss": 0.2998, + "step": 2750 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.2102, + "step": 2752 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.8727007856672285e-05, + "loss": 0.4385, + "step": 2754 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.873382272917545e-05, + "loss": 0.3589, + "step": 2756 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8740620544333607e-05, + "loss": 0.2484, + "step": 2758 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.6678, + "step": 2760 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.875416494954312e-05, + "loss": 0.0265, + "step": 2762 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.876091151314196e-05, + "loss": 0.1493, + "step": 2764 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.8767640966490813e-05, + "loss": 0.0007, + "step": 2766 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.0735, + "step": 2768 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.878104848990093e-05, + "loss": 0.2428, + "step": 2770 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.8787726533776996e-05, + "loss": 0.1282, + "step": 2772 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.879438741503277e-05, + "loss": 0.0125, + "step": 2774 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.0359, + "step": 2776 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8807657637681563e-05, + "loss": 0.6463, + "step": 2778 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.0759, + "step": 2780 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8820859054179225e-05, + "loss": 0.053, + "step": 2782 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.0775, + "step": 2784 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.883399156139519e-05, + "loss": 0.0607, + "step": 2786 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.8840531941941415e-05, + "loss": 0.0883, + "step": 2788 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8847055056737233e-05, + "loss": 0.2617, + "step": 2790 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.0397, + "step": 2792 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8860049438152244e-05, + "loss": 0.1746, + "step": 2794 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.0384, + "step": 2796 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8872974604127025e-05, + "loss": 0.0563, + "step": 2798 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.0186, + "step": 2800 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.8885830453689132e-05, + "loss": 0.1743, + "step": 2802 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.889223235340958e-05, + "loss": 0.1027, + "step": 2804 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.889861688640759e-05, + "loss": 0.0204, + "step": 2806 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.7633, + "step": 2808 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.891133380239373e-05, + "loss": 0.4549, + "step": 2810 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.1316, + "step": 2812 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.892398110230194e-05, + "loss": 0.2188, + "step": 2814 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.3679, + "step": 2816 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.8936558687330485e-05, + "loss": 0.029, + "step": 2818 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.894282130603823e-05, + "loss": 0.1532, + "step": 2820 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8949066459222217e-05, + "loss": 0.0004, + "step": 2822 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.0107, + "step": 2824 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.8961504320265382e-05, + "loss": 0.0502, + "step": 2826 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.896769700383315e-05, + "loss": 0.0432, + "step": 2828 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.897387217329439e-05, + "loss": 0.0877, + "step": 2830 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.0279, + "step": 2832 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.8986169921690543e-05, + "loss": 0.503, + "step": 2834 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.899229247660769e-05, + "loss": 0.0652, + "step": 2836 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.899839746938281e-05, + "loss": 0.0885, + "step": 2838 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.0057, + "step": 2840 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.9010554720848577e-05, + "loss": 0.08, + "step": 2842 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.901660695579585e-05, + "loss": 0.1319, + "step": 2844 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9022641581114392e-05, + "loss": 0.2047, + "step": 2846 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.2139, + "step": 2848 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9034657955756695e-05, + "loss": 0.0255, + "step": 2850 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9040639681612212e-05, + "loss": 0.0789, + "step": 2852 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.904660375090257e-05, + "loss": 0.0114, + "step": 2854 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.0273, + "step": 2856 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.905847887323049e-05, + "loss": 0.0891, + "step": 2858 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.026, + "step": 2860 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9070283229971007e-05, + "loss": 0.0328, + "step": 2862 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.0519, + "step": 2864 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9082016728907496e-05, + "loss": 0.4009, + "step": 2866 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9087856878032886e-05, + "loss": 0.0011, + "step": 2868 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909367927837691e-05, + "loss": 0.2279, + "step": 2870 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.753, + "step": 2872 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.910527078727044e-05, + "loss": 0.1, + "step": 2874 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.911103987318148e-05, + "loss": 1.1071, + "step": 2876 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.911679116503425e-05, + "loss": 0.0113, + "step": 2878 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.197, + "step": 2880 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.9128240321670208e-05, + "loss": 0.1335, + "step": 2882 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.913393816409294e-05, + "loss": 0.0666, + "step": 2884 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.913961816773655e-05, + "loss": 0.3412, + "step": 2886 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.2249, + "step": 2888 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9150924614348588e-05, + "loss": 0.0016, + "step": 2890 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.0658, + "step": 2892 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.916215957317944e-05, + "loss": 0.4264, + "step": 2894 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.048, + "step": 2896 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9173322956460675e-05, + "loss": 0.313, + "step": 2898 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9178877779995423e-05, + "loss": 0.0925, + "step": 2900 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9184414676983006e-05, + "loss": 0.1905, + "step": 2902 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.0008, + "step": 2904 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9195434648097003e-05, + "loss": 0.0969, + "step": 2906 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.0759, + "step": 2908 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9206382783713738e-05, + "loss": 0.0689, + "step": 2910 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.0544, + "step": 2912 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.921725899830547e-05, + "loss": 0.2829, + "step": 2914 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.9222670108643146e-05, + "loss": 0.2253, + "step": 2916 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.92280632069063e-05, + "loss": 0.2933, + "step": 2918 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.0244, + "step": 2920 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.304, + "step": 2922 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.924413432409622e-05, + "loss": 0.0877, + "step": 2924 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.924945526908497e-05, + "loss": 0.0013, + "step": 2926 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.013, + "step": 2928 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.9260042955546237e-05, + "loss": 0.3928, + "step": 2930 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.926530967634078e-05, + "loss": 0.0423, + "step": 2932 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9270558301784795e-05, + "loss": 0.5612, + "step": 2934 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.3141, + "step": 2936 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9281001225653887e-05, + "loss": 0.051, + "step": 2938 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.1254, + "step": 2940 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9291371645572517e-05, + "loss": 0.1406, + "step": 2942 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.0324, + "step": 2944 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9301669480526115e-05, + "loss": 0.0367, + "step": 2946 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9306791153479004e-05, + "loss": 0.0067, + "step": 2948 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.931189465006714e-05, + "loss": 0.3079, + "step": 2950 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.043, + "step": 2952 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.9322047074315717e-05, + "loss": 0.3387, + "step": 2954 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.932709598214825e-05, + "loss": 0.0264, + "step": 2956 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9332126673960262e-05, + "loss": 0.5051, + "step": 2958 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.1903, + "step": 2960 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.934213337025812e-05, + "loss": 0.0196, + "step": 2962 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.9347109355200672e-05, + "loss": 0.6344, + "step": 2964 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.9352067085036145e-05, + "loss": 0.0228, + "step": 2966 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.2708, + "step": 2968 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9361927740691327e-05, + "loss": 0.0204, + "step": 2970 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.1295, + "step": 2972 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.937171526019142e-05, + "loss": 0.4028, + "step": 2974 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.6196, + "step": 2976 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9381429567075504e-05, + "loss": 0.206, + "step": 2978 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9386259242048883e-05, + "loss": 0.1636, + "step": 2980 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.93910705854546e-05, + "loss": 0.0254, + "step": 2982 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.0513, + "step": 2984 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.9400638240012294e-05, + "loss": 0.2472, + "step": 2986 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.940539453247842e-05, + "loss": 0.0633, + "step": 2988 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9410132456005262e-05, + "loss": 0.1365, + "step": 2990 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.0713, + "step": 2992 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9419553159263896e-05, + "loss": 0.0337, + "step": 2994 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9424235920596863e-05, + "loss": 0.1765, + "step": 2996 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.94289002761929e-05, + "loss": 0.0763, + "step": 2998 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.1236, + "step": 3000 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.943817373377181e-05, + "loss": 0.0208, + "step": 3002 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.944278281764342e-05, + "loss": 0.0534, + "step": 3004 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.944737345955561e-05, + "loss": 0.3149, + "step": 3006 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.1051, + "step": 3008 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.945649938167528e-05, + "loss": 0.2579, + "step": 3010 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.0009, + "step": 3012 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.946555142883836e-05, + "loss": 0.0328, + "step": 3014 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.4594, + "step": 3016 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9474529530329507e-05, + "loss": 0.226, + "step": 3018 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.0774, + "step": 3020 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.9483433616011047e-05, + "loss": 0.2098, + "step": 3022 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.1908, + "step": 3024 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9492263616323533e-05, + "loss": 0.02, + "step": 3026 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9496650812887286e-05, + "loss": 0.0975, + "step": 3028 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9501019462286263e-05, + "loss": 0.044, + "step": 3030 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.6919, + "step": 3032 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.9509701085497842e-05, + "loss": 0.5616, + "step": 3034 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.951401404235505e-05, + "loss": 0.0814, + "step": 3036 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.9518308418136718e-05, + "loss": 0.0647, + "step": 3038 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.0022, + "step": 3040 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.952684139296169e-05, + "loss": 0.4671, + "step": 3042 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.9531079975339912e-05, + "loss": 0.2754, + "step": 3044 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.9535299943312455e-05, + "loss": 0.003, + "step": 3046 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.0414, + "step": 3048 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9543684003110105e-05, + "loss": 0.3835, + "step": 3050 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.5707, + "step": 3052 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9551993506857688e-05, + "loss": 0.0032, + "step": 3054 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.0004, + "step": 3056 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.9560228389640664e-05, + "loss": 0.0181, + "step": 3058 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.956431782804402e-05, + "loss": 0.2729, + "step": 3060 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.956838858712744e-05, + "loss": 0.0456, + "step": 3062 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.2669, + "step": 3064 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9576474035569892e-05, + "loss": 0.0426, + "step": 3066 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.1911, + "step": 3068 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9584484671803818e-05, + "loss": 0.1603, + "step": 3070 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.4472, + "step": 3072 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9592420433249462e-05, + "loss": 0.0796, + "step": 3074 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9596360216530436e-05, + "loss": 0.0239, + "step": 3076 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9600281257912e-05, + "loss": 0.2301, + "step": 3078 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.0151, + "step": 3080 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.960806708438202e-05, + "loss": 0.0399, + "step": 3082 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.961193185426459e-05, + "loss": 0.0157, + "step": 3084 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9615777851836003e-05, + "loss": 0.5777, + "step": 3086 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.1007, + "step": 3088 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.962341350003679e-05, + "loss": 0.3055, + "step": 3090 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.9627203135753573e-05, + "loss": 0.1015, + "step": 3092 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9630973969334068e-05, + "loss": 0.0797, + "step": 3094 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.0085, + "step": 3096 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9638459200664822e-05, + "loss": 0.0902, + "step": 3098 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.3003, + "step": 3100 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.9645869135553806e-05, + "loss": 0.0885, + "step": 3102 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.0303, + "step": 3104 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.965320371611399e-05, + "loss": 0.0008, + "step": 3106 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.7543, + "step": 3108 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.9660462885047032e-05, + "loss": 0.4338, + "step": 3110 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.4545, + "step": 3112 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.9667646585643703e-05, + "loss": 0.0027, + "step": 3114 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.967121011775546e-05, + "loss": 0.0461, + "step": 3116 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967475476178433e-05, + "loss": 0.4258, + "step": 3118 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.0258, + "step": 3120 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9681787357939254e-05, + "loss": 0.2104, + "step": 3122 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9685275296330497e-05, + "loss": 0.3567, + "step": 3124 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.968874431916926e-05, + "loss": 0.6952, + "step": 3126 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.0573, + "step": 3128 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969562559112598e-05, + "loss": 0.0086, + "step": 3130 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969903782680467e-05, + "loss": 0.1163, + "step": 3132 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.970243112005235e-05, + "loss": 0.0185, + "step": 3134 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.0286, + "step": 3136 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.970916085278302e-05, + "loss": 0.575, + "step": 3138 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.5137, + "step": 3140 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.9715814736744755e-05, + "loss": 0.0539, + "step": 3142 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.4387, + "step": 3144 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9722392719956864e-05, + "loss": 0.0602, + "step": 3146 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.2079, + "step": 3148 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9728894751031595e-05, + "loss": 0.0084, + "step": 3150 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 1.2046, + "step": 3152 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9735320779174545e-05, + "loss": 0.4877, + "step": 3154 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9738505276435692e-05, + "loss": 0.3391, + "step": 3156 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.974167075418505e-05, + "loss": 0.0799, + "step": 3158 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.1959, + "step": 3160 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9747944626456577e-05, + "loss": 0.061, + "step": 3162 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.1458, + "step": 3164 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.975414234697712e-05, + "loss": 0.046, + "step": 3166 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.1543, + "step": 3168 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9760263867329568e-05, + "loss": 0.2984, + "step": 3170 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9763296037475174e-05, + "loss": 0.0931, + "step": 3172 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.97663091396921e-05, + "loss": 0.0653, + "step": 3174 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.4278, + "step": 3176 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9772278116838543e-05, + "loss": 0.0031, + "step": 3178 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.1083, + "step": 3180 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.977817075213876e-05, + "loss": 0.046, + "step": 3182 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.3315, + "step": 3184 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9783986999558994e-05, + "loss": 0.1094, + "step": 3186 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9786866463591732e-05, + "loss": 0.2643, + "step": 3188 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9789726813662233e-05, + "loss": 0.1071, + "step": 3190 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.2529, + "step": 3192 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.979539014960858e-05, + "loss": 0.0163, + "step": 3194 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.1942, + "step": 3196 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9800976963155584e-05, + "loss": 0.3772, + "step": 3198 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.2323, + "step": 3200 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.980648721065859e-05, + "loss": 0.0012, + "step": 3202 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.9809213608668185e-05, + "loss": 0.2128, + "step": 3204 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9811920849071092e-05, + "loss": 1.3179, + "step": 3206 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.2101, + "step": 3208 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9817277835945057e-05, + "loss": 0.0426, + "step": 3210 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.4232, + "step": 3212 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9822558129431263e-05, + "loss": 0.259, + "step": 3214 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.2671, + "step": 3216 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.9827761688279606e-05, + "loss": 0.1441, + "step": 3218 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.983033467948784e-05, + "loss": 0.102, + "step": 3220 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.983288847183947e-05, + "loss": 0.0541, + "step": 3222 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.1411, + "step": 3224 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9837938440059992e-05, + "loss": 0.6173, + "step": 3226 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.2087, + "step": 3228 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9842911553490392e-05, + "loss": 0.9188, + "step": 3230 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.1792, + "step": 3232 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.984780777328031e-05, + "loss": 0.1453, + "step": 3234 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.0494, + "step": 3236 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985262706118007e-05, + "loss": 0.3524, + "step": 3238 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.0027, + "step": 3240 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.9857369379540982e-05, + "loss": 0.4123, + "step": 3242 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.985971166354357e-05, + "loss": 0.0321, + "step": 3244 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.986203469131567e-05, + "loss": 0.0128, + "step": 3246 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.2863, + "step": 3248 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986662296005834e-05, + "loss": 0.1284, + "step": 3250 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986888819206792e-05, + "loss": 0.0381, + "step": 3252 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.987113414992505e-05, + "loss": 0.8372, + "step": 3254 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.3556, + "step": 3256 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9875568225674e-05, + "loss": 0.3524, + "step": 3258 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.0542, + "step": 3260 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9879925152665845e-05, + "loss": 0.1777, + "step": 3262 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.4505, + "step": 3264 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.9884204896863895e-05, + "loss": 0.0367, + "step": 3266 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.988631581494365e-05, + "loss": 0.1779, + "step": 3268 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.9888407424834433e-05, + "loss": 0.3763, + "step": 3270 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.0357, + "step": 3272 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.989253270374697e-05, + "loss": 0.009, + "step": 3274 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.3273, + "step": 3276 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.9896580701374482e-05, + "loss": 0.458, + "step": 3278 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.009, + "step": 3280 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.9900551386093677e-05, + "loss": 0.0142, + "step": 3282 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.990250772639552e-05, + "loss": 0.1096, + "step": 3284 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9904444726885236e-05, + "loss": 0.2111, + "step": 3286 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.067, + "step": 3288 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.990826069333406e-05, + "loss": 0.6391, + "step": 3290 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.99101396518405e-05, + "loss": 0.033, + "step": 3292 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.99119992556295e-05, + "loss": 0.3959, + "step": 3294 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.1904, + "step": 3296 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.99156603845656e-05, + "loss": 0.1339, + "step": 3298 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.0432, + "step": 3300 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9919244051541315e-05, + "loss": 0.1147, + "step": 3302 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.0038, + "step": 3304 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9922750228560746e-05, + "loss": 0.2075, + "step": 3306 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.2383, + "step": 3308 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9926178888233344e-05, + "loss": 0.2646, + "step": 3310 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.1998, + "step": 3312 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9929530003774133e-05, + "loss": 0.384, + "step": 3314 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.1201, + "step": 3316 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.993280354900393e-05, + "loss": 0.0805, + "step": 3318 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.0277, + "step": 3320 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9935999498349518e-05, + "loss": 0.0125, + "step": 3322 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.0784, + "step": 3324 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9939117826843883e-05, + "loss": 0.0919, + "step": 3326 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.0151, + "step": 3328 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9942158510126384e-05, + "loss": 0.016, + "step": 3330 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9943649727366335e-05, + "loss": 0.1609, + "step": 3332 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.9945121524442944e-05, + "loss": 0.0683, + "step": 3334 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.0965, + "step": 3336 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.9948006846646262e-05, + "loss": 0.1623, + "step": 3338 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.994942036613787e-05, + "loss": 1.4073, + "step": 3340 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9950814454195953e-05, + "loss": 0.0522, + "step": 3342 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.116, + "step": 3344 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.9953544325158755e-05, + "loss": 0.1799, + "step": 3346 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.995488010273198e-05, + "loss": 0.0502, + "step": 3348 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9956196438208693e-05, + "loss": 0.1913, + "step": 3350 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.15, + "step": 3352 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9958770772627236e-05, + "loss": 0.0231, + "step": 3354 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.006, + "step": 3356 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.9961267308303473e-05, + "loss": 0.3775, + "step": 3358 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.018, + "step": 3360 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9963686025734262e-05, + "loss": 0.0518, + "step": 3362 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9964866196679105e-05, + "loss": 0.1289, + "step": 3364 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9966026906024377e-05, + "loss": 0.2643, + "step": 3366 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.001, + "step": 3368 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9968289930886675e-05, + "loss": 0.0263, + "step": 3370 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.4972, + "step": 3372 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997047508264221e-05, + "loss": 0.2254, + "step": 3374 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.461, + "step": 3376 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.99725823442204e-05, + "loss": 0.0834, + "step": 3378 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.0907, + "step": 3380 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.9974611699159142e-05, + "loss": 0.2199, + "step": 3382 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0822, + "step": 3384 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9976563131604945e-05, + "loss": 0.1336, + "step": 3386 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.0338, + "step": 3388 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9978436626313065e-05, + "loss": 0.143, + "step": 3390 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.0251, + "step": 3392 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.99802321686476e-05, + "loss": 0.0006, + "step": 3394 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.1199, + "step": 3396 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.9981949744581622e-05, + "loss": 0.0142, + "step": 3398 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.0015, + "step": 3400 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.9983589340697288e-05, + "loss": 0.3574, + "step": 3402 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.998437989229673e-05, + "loss": 0.1, + "step": 3404 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.998515094418594e-05, + "loss": 0.0003, + "step": 3406 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.2868, + "step": 3408 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.99866345428482e-05, + "loss": 0.1281, + "step": 3410 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.998734708672375e-05, + "loss": 0.3306, + "step": 3412 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.998804012509407e-05, + "loss": 0.0462, + "step": 3414 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0204, + "step": 3416 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9989367679943025e-05, + "loss": 0.0001, + "step": 3418 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9990002193828923e-05, + "loss": 1.0227, + "step": 3420 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9990617197024103e-05, + "loss": 0.0296, + "step": 3422 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.1025, + "step": 3424 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.999178866657597e-05, + "loss": 0.1038, + "step": 3426 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.9992345130644747e-05, + "loss": 0.1705, + "step": 3428 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999288207944701e-05, + "loss": 0.3318, + "step": 3430 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.0481, + "step": 3432 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.999389742709538e-05, + "loss": 0.1194, + "step": 3434 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.1316, + "step": 3436 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9994834701589113e-05, + "loss": 0.1078, + "step": 3438 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.0712, + "step": 3440 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999569389560614e-05, + "loss": 0.0146, + "step": 3442 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999609421031453e-05, + "loss": 0.0108, + "step": 3444 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.9996475002434365e-05, + "loss": 0.3523, + "step": 3446 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.0196, + "step": 3448 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.999717801597172e-05, + "loss": 0.0043, + "step": 3450 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.145, + "step": 3452 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9997802930726195e-05, + "loss": 0.4146, + "step": 3454 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.0994, + "step": 3456 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998349741815916e-05, + "loss": 0.1452, + "step": 3458 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.3334, + "step": 3460 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.999881844496914e-05, + "loss": 0.051, + "step": 3462 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.2324, + "step": 3464 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999209036524326e-05, + "loss": 0.0881, + "step": 3466 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.5747, + "step": 3468 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999952151343014e-05, + "loss": 0.0897, + "step": 3470 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.0713, + "step": 3472 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999755873245484e-05, + "loss": 0.3227, + "step": 3474 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.4008, + "step": 3476 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999991211413952e-05, + "loss": 0.0023, + "step": 3478 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.0872, + "step": 3480 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 1.9999990234891677e-05, + "loss": 0.0102, + "step": 3482 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 2e-05, + "loss": 0.2099, + "step": 3484 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999999023489168e-05, + "loss": 0.0129, + "step": 3486 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.2256, + "step": 3488 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.999991211413952e-05, + "loss": 0.066, + "step": 3490 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.6092, + "step": 3492 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.9999755873245484e-05, + "loss": 0.1594, + "step": 3494 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.0323, + "step": 3496 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.999952151343014e-05, + "loss": 0.0354, + "step": 3498 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.0366, + "step": 3500 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.9999209036524326e-05, + "loss": 0.1743, + "step": 3502 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.9231, + "step": 3504 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.999881844496914e-05, + "loss": 0.004, + "step": 3506 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.0271, + "step": 3508 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998349741815916e-05, + "loss": 0.0322, + "step": 3510 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.2394, + "step": 3512 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997802930726195e-05, + "loss": 0.251, + "step": 3514 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.0338, + "step": 3516 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999717801597172e-05, + "loss": 0.4135, + "step": 3518 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.6819, + "step": 3520 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.9996475002434365e-05, + "loss": 0.0145, + "step": 3522 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.999609421031453e-05, + "loss": 0.1079, + "step": 3524 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999569389560614e-05, + "loss": 1.4426, + "step": 3526 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.0265, + "step": 3528 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994834701589113e-05, + "loss": 0.0481, + "step": 3530 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.7708, + "step": 3532 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.9993897427095378e-05, + "loss": 0.5394, + "step": 3534 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.0881, + "step": 3536 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999288207944701e-05, + "loss": 0.0317, + "step": 3538 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999234513064475e-05, + "loss": 0.1413, + "step": 3540 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.999178866657597e-05, + "loss": 0.1078, + "step": 3542 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.1937, + "step": 3544 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990617197024103e-05, + "loss": 0.0796, + "step": 3546 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.0104, + "step": 3548 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.998936767994303e-05, + "loss": 0.0433, + "step": 3550 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0061, + "step": 3552 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998804012509407e-05, + "loss": 0.1058, + "step": 3554 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998734708672375e-05, + "loss": 0.8481, + "step": 3556 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.99866345428482e-05, + "loss": 0.6389, + "step": 3558 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.1155, + "step": 3560 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.998515094418594e-05, + "loss": 0.135, + "step": 3562 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.2909, + "step": 3564 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9983589340697288e-05, + "loss": 0.6494, + "step": 3566 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.7335, + "step": 3568 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981949744581622e-05, + "loss": 0.633, + "step": 3570 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.2325, + "step": 3572 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.99802321686476e-05, + "loss": 0.0268, + "step": 3574 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.0579, + "step": 3576 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9978436626313068e-05, + "loss": 0.0102, + "step": 3578 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.2505, + "step": 3580 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997656313160495e-05, + "loss": 0.0323, + "step": 3582 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0287, + "step": 3584 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9974611699159142e-05, + "loss": 0.4973, + "step": 3586 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.2891, + "step": 3588 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.99725823442204e-05, + "loss": 0.0311, + "step": 3590 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.0102, + "step": 3592 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9970475082642212e-05, + "loss": 0.2567, + "step": 3594 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.165, + "step": 3596 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9968289930886675e-05, + "loss": 0.4882, + "step": 3598 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.087, + "step": 3600 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.9966026906024377e-05, + "loss": 0.3895, + "step": 3602 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.996486619667911e-05, + "loss": 0.2159, + "step": 3604 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9963686025734262e-05, + "loss": 0.1777, + "step": 3606 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.0293, + "step": 3608 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9961267308303473e-05, + "loss": 0.038, + "step": 3610 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.3849, + "step": 3612 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9958770772627236e-05, + "loss": 0.1034, + "step": 3614 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.0436, + "step": 3616 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.9956196438208693e-05, + "loss": 0.5114, + "step": 3618 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.995488010273198e-05, + "loss": 0.002, + "step": 3620 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9953544325158755e-05, + "loss": 0.6114, + "step": 3622 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.5294, + "step": 3624 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9950814454195953e-05, + "loss": 0.0329, + "step": 3626 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.1905, + "step": 3628 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.9948006846646262e-05, + "loss": 0.2819, + "step": 3630 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.0043, + "step": 3632 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.9945121524442947e-05, + "loss": 0.1731, + "step": 3634 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.994364972736634e-05, + "loss": 0.2963, + "step": 3636 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9942158510126384e-05, + "loss": 0.0688, + "step": 3638 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.0581, + "step": 3640 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.9939117826843887e-05, + "loss": 0.2762, + "step": 3642 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.993756836673986e-05, + "loss": 0.191, + "step": 3644 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.9935999498349525e-05, + "loss": 0.0944, + "step": 3646 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.2159, + "step": 3648 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9932803549003932e-05, + "loss": 0.3857, + "step": 3650 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.0979, + "step": 3652 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9929530003774136e-05, + "loss": 0.0148, + "step": 3654 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.3837, + "step": 3656 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9926178888233344e-05, + "loss": 0.172, + "step": 3658 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.1651, + "step": 3660 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9922750228560746e-05, + "loss": 0.1082, + "step": 3662 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.3767, + "step": 3664 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9919244051541315e-05, + "loss": 0.1763, + "step": 3666 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.0143, + "step": 3668 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9915660384565603e-05, + "loss": 0.2575, + "step": 3670 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.1761, + "step": 3672 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9911999255629504e-05, + "loss": 0.0154, + "step": 3674 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.0137, + "step": 3676 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.990826069333406e-05, + "loss": 0.006, + "step": 3678 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.2907, + "step": 3680 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9904444726885236e-05, + "loss": 0.2472, + "step": 3682 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9902507726395524e-05, + "loss": 0.148, + "step": 3684 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.9900551386093677e-05, + "loss": 0.4785, + "step": 3686 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.2555, + "step": 3688 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9896580701374482e-05, + "loss": 0.2707, + "step": 3690 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.1172, + "step": 3692 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9892532703746977e-05, + "loss": 0.1287, + "step": 3694 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.0476, + "step": 3696 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.9888407424834437e-05, + "loss": 0.2895, + "step": 3698 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.988631581494365e-05, + "loss": 0.0579, + "step": 3700 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9884204896863895e-05, + "loss": 0.0681, + "step": 3702 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.0656, + "step": 3704 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.9879925152665845e-05, + "loss": 0.0801, + "step": 3706 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.987775633490599e-05, + "loss": 0.3903, + "step": 3708 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.9875568225674005e-05, + "loss": 0.3768, + "step": 3710 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.0064, + "step": 3712 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.987113414992505e-05, + "loss": 0.0182, + "step": 3714 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.986888819206792e-05, + "loss": 0.5451, + "step": 3716 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986662296005834e-05, + "loss": 0.0228, + "step": 3718 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.0812, + "step": 3720 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9862034691315678e-05, + "loss": 0.4831, + "step": 3722 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.0904, + "step": 3724 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9857369379540985e-05, + "loss": 0.161, + "step": 3726 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.0101, + "step": 3728 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.985262706118007e-05, + "loss": 0.1595, + "step": 3730 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.2779, + "step": 3732 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9847807773280314e-05, + "loss": 0.007, + "step": 3734 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.1109, + "step": 3736 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9842911553490396e-05, + "loss": 0.0595, + "step": 3738 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.3591, + "step": 3740 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.983793844005999e-05, + "loss": 0.0165, + "step": 3742 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.2296, + "step": 3744 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.9832888471839475e-05, + "loss": 0.0061, + "step": 3746 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.983033467948784e-05, + "loss": 0.2255, + "step": 3748 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9827761688279613e-05, + "loss": 0.0857, + "step": 3750 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.2889, + "step": 3752 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9822558129431263e-05, + "loss": 0.0203, + "step": 3754 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.1044, + "step": 3756 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.981727783594506e-05, + "loss": 0.4166, + "step": 3758 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.0287, + "step": 3760 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.9811920849071092e-05, + "loss": 0.1189, + "step": 3762 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.980921360866819e-05, + "loss": 0.0446, + "step": 3764 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.980648721065859e-05, + "loss": 0.2288, + "step": 3766 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.1543, + "step": 3768 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.980097696315558e-05, + "loss": 0.0376, + "step": 3770 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.594, + "step": 3772 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979539014960858e-05, + "loss": 0.0433, + "step": 3774 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.0208, + "step": 3776 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.9789726813662233e-05, + "loss": 0.0292, + "step": 3778 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.978686646359173e-05, + "loss": 0.0309, + "step": 3780 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9783986999558994e-05, + "loss": 0.0512, + "step": 3782 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.0187, + "step": 3784 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9778170752138763e-05, + "loss": 0.0027, + "step": 3786 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.3521, + "step": 3788 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.9772278116838546e-05, + "loss": 0.0109, + "step": 3790 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.0464, + "step": 3792 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.97663091396921e-05, + "loss": 0.1143, + "step": 3794 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.9763296037475177e-05, + "loss": 0.1737, + "step": 3796 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.976026386732957e-05, + "loss": 0.1467, + "step": 3798 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.0114, + "step": 3800 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9754142346977122e-05, + "loss": 0.0542, + "step": 3802 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.0102, + "step": 3804 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9747944626456577e-05, + "loss": 0.088, + "step": 3806 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.1358, + "step": 3808 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9741670754185054e-05, + "loss": 0.0878, + "step": 3810 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9738505276435695e-05, + "loss": 0.4587, + "step": 3812 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9735320779174548e-05, + "loss": 0.0697, + "step": 3814 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.3523, + "step": 3816 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9728894751031595e-05, + "loss": 0.0919, + "step": 3818 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.071, + "step": 3820 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.972239271995686e-05, + "loss": 0.3305, + "step": 3822 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.0228, + "step": 3824 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9715814736744758e-05, + "loss": 0.5598, + "step": 3826 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.0668, + "step": 3828 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.9709160852783022e-05, + "loss": 0.0651, + "step": 3830 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.0148, + "step": 3832 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.9702431120052352e-05, + "loss": 0.5351, + "step": 3834 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.969903782680467e-05, + "loss": 2.0663, + "step": 3836 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9695625591125984e-05, + "loss": 0.1082, + "step": 3838 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.0965, + "step": 3840 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.968874431916926e-05, + "loss": 0.6789, + "step": 3842 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.96852752963305e-05, + "loss": 0.0027, + "step": 3844 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9681787357939257e-05, + "loss": 0.0312, + "step": 3846 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.093, + "step": 3848 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9674754761784334e-05, + "loss": 0.0396, + "step": 3850 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.0336, + "step": 3852 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.9667646585643706e-05, + "loss": 0.5045, + "step": 3854 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.3149, + "step": 3856 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.966046288504704e-05, + "loss": 0.0833, + "step": 3858 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.329, + "step": 3860 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.965320371611399e-05, + "loss": 0.0326, + "step": 3862 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.0142, + "step": 3864 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.964586913555381e-05, + "loss": 0.1032, + "step": 3866 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.0393, + "step": 3868 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9638459200664822e-05, + "loss": 0.1461, + "step": 3870 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.0228, + "step": 3872 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9630973969334068e-05, + "loss": 0.2061, + "step": 3874 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9627203135753576e-05, + "loss": 0.1568, + "step": 3876 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9623413500036795e-05, + "loss": 0.0163, + "step": 3878 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.0913, + "step": 3880 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.9615777851836007e-05, + "loss": 0.0384, + "step": 3882 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.961193185426459e-05, + "loss": 0.0209, + "step": 3884 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9608067084382025e-05, + "loss": 0.2946, + "step": 3886 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.1034, + "step": 3888 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.9600281257912002e-05, + "loss": 0.0472, + "step": 3890 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.959636021653044e-05, + "loss": 0.4786, + "step": 3892 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.9592420433249465e-05, + "loss": 0.3189, + "step": 3894 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.04, + "step": 3896 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958448467180382e-05, + "loss": 0.0189, + "step": 3898 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958048870913786e-05, + "loss": 0.0783, + "step": 3900 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9576474035569895e-05, + "loss": 0.3841, + "step": 3902 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.0599, + "step": 3904 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9568388587127448e-05, + "loss": 0.4115, + "step": 3906 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9564317828044022e-05, + "loss": 0.0146, + "step": 3908 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.9560228389640668e-05, + "loss": 0.1738, + "step": 3910 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.0093, + "step": 3912 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.955199350685769e-05, + "loss": 0.5101, + "step": 3914 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.1417, + "step": 3916 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.954368400311011e-05, + "loss": 0.2318, + "step": 3918 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.0721, + "step": 3920 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9535299943312455e-05, + "loss": 0.0186, + "step": 3922 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9531079975339915e-05, + "loss": 0.0141, + "step": 3924 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9526841392961694e-05, + "loss": 0.0007, + "step": 3926 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.0187, + "step": 3928 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9518308418136728e-05, + "loss": 0.2523, + "step": 3930 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.0511, + "step": 3932 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9509701085497852e-05, + "loss": 0.0227, + "step": 3934 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.2945, + "step": 3936 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9501019462286266e-05, + "loss": 0.2252, + "step": 3938 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.0056, + "step": 3940 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9492263616323536e-05, + "loss": 0.0122, + "step": 3942 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.0068, + "step": 3944 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.948343361601105e-05, + "loss": 1.0613, + "step": 3946 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.947899082950751e-05, + "loss": 0.0115, + "step": 3948 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947452953032951e-05, + "loss": 0.4586, + "step": 3950 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.4793, + "step": 3952 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9465551428838363e-05, + "loss": 0.1768, + "step": 3954 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.1236, + "step": 3956 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.9456499381675285e-05, + "loss": 0.2535, + "step": 3958 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.0406, + "step": 3960 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9447373459555617e-05, + "loss": 0.112, + "step": 3962 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.0105, + "step": 3964 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9438173733771814e-05, + "loss": 0.2641, + "step": 3966 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.2171, + "step": 3968 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.9428900276192903e-05, + "loss": 0.0231, + "step": 3970 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.942423592059687e-05, + "loss": 0.0177, + "step": 3972 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.94195531592639e-05, + "loss": 0.0752, + "step": 3974 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.007, + "step": 3976 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9410132456005262e-05, + "loss": 0.3616, + "step": 3978 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.078, + "step": 3980 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.94006382400123e-05, + "loss": 0.0413, + "step": 3982 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.0416, + "step": 3984 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.939107058545461e-05, + "loss": 0.2539, + "step": 3986 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.938625924204888e-05, + "loss": 0.0149, + "step": 3988 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.9381429567075507e-05, + "loss": 0.0841, + "step": 3990 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.005, + "step": 3992 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9371715260191425e-05, + "loss": 0.04, + "step": 3994 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.2964, + "step": 3996 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.936192774069133e-05, + "loss": 0.0767, + "step": 3998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.041, + "step": 4000 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9352067085036145e-05, + "loss": 0.0163, + "step": 4002 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9347109355200676e-05, + "loss": 0.0058, + "step": 4004 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.9342133370258124e-05, + "loss": 0.351, + "step": 4006 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.0768, + "step": 4008 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9332126673960276e-05, + "loss": 0.0296, + "step": 4010 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.0778, + "step": 4012 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.932204707431572e-05, + "loss": 0.0347, + "step": 4014 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.2699, + "step": 4016 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9311894650067146e-05, + "loss": 0.0048, + "step": 4018 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9306791153479017e-05, + "loss": 0.0601, + "step": 4020 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9301669480526118e-05, + "loss": 0.0039, + "step": 4022 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.0011, + "step": 4024 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.929137164557252e-05, + "loss": 0.1325, + "step": 4026 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.928619550368371e-05, + "loss": 0.0115, + "step": 4028 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9281001225653883e-05, + "loss": 0.1262, + "step": 4030 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.5932, + "step": 4032 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9270558301784808e-05, + "loss": 0.1201, + "step": 4034 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9265309676340783e-05, + "loss": 0.0366, + "step": 4036 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9260042955546247e-05, + "loss": 0.0044, + "step": 4038 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 1.1213, + "step": 4040 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9249455269084972e-05, + "loss": 0.0262, + "step": 4042 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.079, + "step": 4044 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.2666, + "step": 4046 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.017, + "step": 4048 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9228063206906302e-05, + "loss": 0.0292, + "step": 4050 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9222670108643156e-05, + "loss": 0.0176, + "step": 4052 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9217258998305464e-05, + "loss": 0.0254, + "step": 4054 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.1631, + "step": 4056 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9206382783713735e-05, + "loss": 0.1854, + "step": 4058 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.4553, + "step": 4060 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.9195434648097013e-05, + "loss": 0.0791, + "step": 4062 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.433, + "step": 4064 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9184414676983013e-05, + "loss": 0.0367, + "step": 4066 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9178877779995416e-05, + "loss": 0.0288, + "step": 4068 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.9173322956460678e-05, + "loss": 0.2709, + "step": 4070 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.0996, + "step": 4072 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9162159573179446e-05, + "loss": 0.1254, + "step": 4074 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.0412, + "step": 4076 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.915092461434859e-05, + "loss": 0.0792, + "step": 4078 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.1446, + "step": 4080 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9139618167736547e-05, + "loss": 0.1445, + "step": 4082 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9133938164092942e-05, + "loss": 0.0745, + "step": 4084 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.912824032167022e-05, + "loss": 0.0148, + "step": 4086 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.4154, + "step": 4088 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.911679116503426e-05, + "loss": 0.0026, + "step": 4090 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.5693, + "step": 4092 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9105270787270446e-05, + "loss": 0.5454, + "step": 4094 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.0934, + "step": 4096 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.9093679278376913e-05, + "loss": 1.0358, + "step": 4098 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.90878568780329e-05, + "loss": 0.0237, + "step": 4100 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.90820167289075e-05, + "loss": 0.007, + "step": 4102 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.0481, + "step": 4104 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9070283229971003e-05, + "loss": 0.0298, + "step": 4106 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.4559, + "step": 4108 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.9058478873230487e-05, + "loss": 0.2526, + "step": 4110 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.376, + "step": 4112 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9046603750902585e-05, + "loss": 0.0285, + "step": 4114 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9040639681612216e-05, + "loss": 0.0715, + "step": 4116 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.9034657955756702e-05, + "loss": 0.0647, + "step": 4118 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.0574, + "step": 4120 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9022641581114396e-05, + "loss": 0.1847, + "step": 4122 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.1294, + "step": 4124 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.901055472084858e-05, + "loss": 0.2551, + "step": 4126 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.2457, + "step": 4128 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8998397469382812e-05, + "loss": 0.55, + "step": 4130 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8992292476607695e-05, + "loss": 0.0805, + "step": 4132 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.898616992169054e-05, + "loss": 0.1443, + "step": 4134 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.1947, + "step": 4136 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.89738721732944e-05, + "loss": 0.3637, + "step": 4138 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.0112, + "step": 4140 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8961504320265392e-05, + "loss": 0.2468, + "step": 4142 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.0746, + "step": 4144 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8949066459222224e-05, + "loss": 0.093, + "step": 4146 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8942821306038227e-05, + "loss": 0.2445, + "step": 4148 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.8936558687330492e-05, + "loss": 0.0368, + "step": 4150 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.4575, + "step": 4152 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.8923981102301944e-05, + "loss": 0.2185, + "step": 4154 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.891766616054545e-05, + "loss": 0.1499, + "step": 4156 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8911333802393725e-05, + "loss": 0.0034, + "step": 4158 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.1607, + "step": 4160 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8898616886407588e-05, + "loss": 0.3574, + "step": 4162 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8892232353409582e-05, + "loss": 0.1112, + "step": 4164 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8885830453689146e-05, + "loss": 0.0255, + "step": 4166 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.0232, + "step": 4168 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8872974604127038e-05, + "loss": 0.1576, + "step": 4170 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.0409, + "step": 4172 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.8860049438152247e-05, + "loss": 0.2601, + "step": 4174 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.001, + "step": 4176 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.8847055056737236e-05, + "loss": 0.0408, + "step": 4178 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.884053194194143e-05, + "loss": 0.0889, + "step": 4180 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.8833991561395194e-05, + "loss": 0.1759, + "step": 4182 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.1201, + "step": 4184 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.8820859054179225e-05, + "loss": 0.475, + "step": 4186 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.881426695315756e-05, + "loss": 0.0792, + "step": 4188 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8807657637681577e-05, + "loss": 0.1563, + "step": 4190 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.2101, + "step": 4192 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8794387415032783e-05, + "loss": 0.0124, + "step": 4194 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8787726533777003e-05, + "loss": 0.071, + "step": 4196 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.8781048489900936e-05, + "loss": 0.3962, + "step": 4198 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.1056, + "step": 4200 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.876764096649082e-05, + "loss": 0.0013, + "step": 4202 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.0729, + "step": 4204 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8754164949543123e-05, + "loss": 0.1445, + "step": 4206 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.0878, + "step": 4208 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8740620544333604e-05, + "loss": 0.0091, + "step": 4210 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8733822729175455e-05, + "loss": 0.0596, + "step": 4212 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.872700785667228e-05, + "loss": 0.3071, + "step": 4214 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.0093, + "step": 4216 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8713326992902612e-05, + "loss": 0.0119, + "step": 4218 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.2974, + "step": 4220 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8699578059900604e-05, + "loss": 0.1076, + "step": 4222 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.0216, + "step": 4224 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.868576116507408e-05, + "loss": 0.2355, + "step": 4226 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.005, + "step": 4228 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8671876416361767e-05, + "loss": 0.0187, + "step": 4230 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.0051, + "step": 4232 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.8657923922232467e-05, + "loss": 0.01, + "step": 4234 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.86509223046777e-05, + "loss": 0.6653, + "step": 4236 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8643903791684228e-05, + "loss": 0.0449, + "step": 4238 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.0577, + "step": 4240 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8629816134243466e-05, + "loss": 0.0974, + "step": 4242 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8622747017309676e-05, + "loss": 0.0454, + "step": 4244 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8615661059964148e-05, + "loss": 0.3469, + "step": 4246 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.0374, + "step": 4248 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.860143867942685e-05, + "loss": 0.023, + "step": 4250 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.154, + "step": 4252 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8587149103738006e-05, + "loss": 0.0011, + "step": 4254 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.0055, + "step": 4256 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8572792444528963e-05, + "loss": 0.0104, + "step": 4258 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8565588993632498e-05, + "loss": 0.1089, + "step": 4260 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8558368813955136e-05, + "loss": 0.426, + "step": 4262 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.769, + "step": 4264 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.854387832469512e-05, + "loss": 0.5599, + "step": 4266 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.0367, + "step": 4268 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.8529321089949833e-05, + "loss": 0.1718, + "step": 4270 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.247, + "step": 4272 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8514697223441565e-05, + "loss": 0.6096, + "step": 4274 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8507360338956896e-05, + "loss": 0.1603, + "step": 4276 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.850000683941319e-05, + "loss": 0.0325, + "step": 4278 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.0001, + "step": 4280 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.8485250052627205e-05, + "loss": 0.014, + "step": 4282 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.847784679420527e-05, + "loss": 0.0002, + "step": 4284 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.8470426978364857e-05, + "loss": 0.4896, + "step": 4286 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.489, + "step": 4288 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.845553773242522e-05, + "loss": 0.2362, + "step": 4290 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.8448068331405018e-05, + "loss": 0.0572, + "step": 4292 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8440582431124322e-05, + "loss": 0.5116, + "step": 4294 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.0736, + "step": 4296 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.842556119129423e-05, + "loss": 0.0001, + "step": 4298 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.841802588108161e-05, + "loss": 0.3615, + "step": 4300 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.841047413028209e-05, + "loss": 0.1996, + "step": 4302 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.0177, + "step": 4304 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.8395321365949273e-05, + "loss": 0.0285, + "step": 4306 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.838772038200968e-05, + "loss": 0.046, + "step": 4308 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.838010301667044e-05, + "loss": 0.0904, + "step": 4310 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.0115, + "step": 4312 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8364819201332596e-05, + "loss": 0.0573, + "step": 4314 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.0821, + "step": 4316 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.834947003933417e-05, + "loss": 0.0774, + "step": 4318 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.0014, + "step": 4320 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8334055650584107e-05, + "loss": 0.1081, + "step": 4322 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8326324033788087e-05, + "loss": 0.0447, + "step": 4324 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.8318576155500855e-05, + "loss": 0.0763, + "step": 4326 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.0992, + "step": 4328 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.830303167501152e-05, + "loss": 0.6514, + "step": 4330 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.0364, + "step": 4332 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8287422330550885e-05, + "loss": 0.0751, + "step": 4334 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.023, + "step": 4336 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.827174824406043e-05, + "loss": 0.1194, + "step": 4338 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.8263886960799072e-05, + "loss": 0.2256, + "step": 4340 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8256009537987424e-05, + "loss": 0.0737, + "step": 4342 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.0043, + "step": 4344 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8240206335283943e-05, + "loss": 0.1299, + "step": 4346 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.0895, + "step": 4348 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8224338759405934e-05, + "loss": 0.0823, + "step": 4350 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 1.2117, + "step": 4352 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820840693431217e-05, + "loss": 0.0343, + "step": 4354 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820041696718378e-05, + "loss": 0.0406, + "step": 4356 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8192410984463416e-05, + "loss": 0.0259, + "step": 4358 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.0112, + "step": 4360 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8176351034821352e-05, + "loss": 0.9494, + "step": 4362 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.0128, + "step": 4364 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.8160227210847642e-05, + "loss": 0.0368, + "step": 4366 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.0, + "step": 4368 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8144039638502927e-05, + "loss": 0.1192, + "step": 4370 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8135921986190358e-05, + "loss": 0.0408, + "step": 4372 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8127788444245884e-05, + "loss": 0.0278, + "step": 4374 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.0377, + "step": 4376 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8111473755032152e-05, + "loss": 0.1588, + "step": 4378 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.1902, + "step": 4380 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.8095095698313456e-05, + "loss": 0.3099, + "step": 4382 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.0254, + "step": 4384 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807865440203653e-05, + "loss": 0.2285, + "step": 4386 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807041007918221e-05, + "loss": 0.1363, + "step": 4388 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.806214999464214e-05, + "loss": 0.1094, + "step": 4390 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.4849, + "step": 4392 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8045582605064087e-05, + "loss": 0.2641, + "step": 4394 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.1164, + "step": 4396 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802895236272819e-05, + "loss": 0.002, + "step": 4398 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.0463, + "step": 4400 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.80122593975513e-05, + "loss": 0.0076, + "step": 4402 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.8003889434630476e-05, + "loss": 0.0509, + "step": 4404 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7995503839940204e-05, + "loss": 0.045, + "step": 4406 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.5135, + "step": 4408 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7978685820790725e-05, + "loss": 0.08, + "step": 4410 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.1446, + "step": 4412 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.796180547148662e-05, + "loss": 0.0043, + "step": 4414 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.0552, + "step": 4416 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7944862923898586e-05, + "loss": 0.0141, + "step": 4418 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7936368367090583e-05, + "loss": 0.0456, + "step": 4420 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7927858310383196e-05, + "loss": 0.1623, + "step": 4422 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.0151, + "step": 4424 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7910791763781928e-05, + "loss": 0.0231, + "step": 4426 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.0008, + "step": 4428 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.789366341742001e-05, + "loss": 0.0134, + "step": 4430 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.0308, + "step": 4432 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7876473405105535e-05, + "loss": 0.1159, + "step": 4434 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7867855316162846e-05, + "loss": 0.0522, + "step": 4436 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.785922186112829e-05, + "loss": 0.264, + "step": 4438 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.6233, + "step": 4440 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.7841908920258774e-05, + "loss": 0.7353, + "step": 4442 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.783322946823638e-05, + "loss": 0.3071, + "step": 4444 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.782453471774711e-05, + "loss": 0.0024, + "step": 4446 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.6517, + "step": 4448 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.7807099389322013e-05, + "loss": 0.063, + "step": 4450 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.779835884543776e-05, + "loss": 0.0738, + "step": 4452 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7789603071189733e-05, + "loss": 0.0363, + "step": 4454 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.0072, + "step": 4456 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.7772045900032912e-05, + "loss": 0.0789, + "step": 4458 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.776324453741365e-05, + "loss": 0.0286, + "step": 4460 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.7754428013009644e-05, + "loss": 0.0114, + "step": 4462 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.1418, + "step": 4464 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7736749547752327e-05, + "loss": 0.6417, + "step": 4466 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7727887641425465e-05, + "loss": 0.411, + "step": 4468 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7719010642366597e-05, + "loss": 0.0563, + "step": 4470 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.0059, + "step": 4472 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.770121143543025e-05, + "loss": 0.003, + "step": 4474 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.428, + "step": 4476 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7683352065992174e-05, + "loss": 0.1806, + "step": 4478 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.3717, + "step": 4480 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.7665432673571238e-05, + "loss": 0.2776, + "step": 4482 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.765645051247007e-05, + "loss": 0.0682, + "step": 4484 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7647453398155204e-05, + "loss": 0.0404, + "step": 4486 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.1813, + "step": 4488 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.7629414380199672e-05, + "loss": 0.0003, + "step": 4490 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.762037251178961e-05, + "loss": 0.2154, + "step": 4492 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7611315760626943e-05, + "loss": 0.0037, + "step": 4494 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.0596, + "step": 4496 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7593157680824943e-05, + "loss": 0.0634, + "step": 4498 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7584056387648738e-05, + "loss": 0.2049, + "step": 4500 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.757494028264608e-05, + "loss": 0.2119, + "step": 4502 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.3046, + "step": 4504 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7556663708406203e-05, + "loss": 0.0463, + "step": 4506 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.0054, + "step": 4508 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7538328100883404e-05, + "loss": 0.0003, + "step": 4510 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.0589, + "step": 4512 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7519933603316962e-05, + "loss": 0.0509, + "step": 4514 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7510714315655467e-05, + "loss": 0.0, + "step": 4516 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.750148035940622e-05, + "loss": 0.6103, + "step": 4518 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.0047, + "step": 4520 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7482968513309458e-05, + "loss": 0.0003, + "step": 4522 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7473690659616e-05, + "loss": 0.0362, + "step": 4524 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7464398209642744e-05, + "loss": 0.0712, + "step": 4526 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.004, + "step": 4528 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.7445769593478842e-05, + "loss": 0.0406, + "step": 4530 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.743643346367027e-05, + "loss": 0.016, + "step": 4532 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.7427082810346024e-05, + "loss": 0.0254, + "step": 4534 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.0323, + "step": 4536 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.740833800622701e-05, + "loss": 0.0788, + "step": 4538 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.739894389204122e-05, + "loss": 0.2928, + "step": 4540 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.738953532755774e-05, + "loss": 0.1507, + "step": 4542 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.0033, + "step": 4544 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7370674921226306e-05, + "loss": 0.1068, + "step": 4546 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7361223116213146e-05, + "loss": 0.0361, + "step": 4548 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7351756934571764e-05, + "loss": 0.2407, + "step": 4550 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.3522, + "step": 4552 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.7332781515382996e-05, + "loss": 0.0575, + "step": 4554 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.732327231489503e-05, + "loss": 0.0004, + "step": 4556 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7313748811897564e-05, + "loss": 0.058, + "step": 4558 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.0416, + "step": 4560 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.7294658972800495e-05, + "loss": 0.001, + "step": 4562 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.728509267398376e-05, + "loss": 0.0234, + "step": 4564 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.727551214722322e-05, + "loss": 0.108, + "step": 4566 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.1588, + "step": 4568 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.72563084847423e-05, + "loss": 0.0203, + "step": 4570 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.0479, + "step": 4572 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.723704813537835e-05, + "loss": 0.0141, + "step": 4574 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.0188, + "step": 4576 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7217731249594817e-05, + "loss": 0.0974, + "step": 4578 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7208051652686348e-05, + "loss": 0.0532, + "step": 4580 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.7198357978296827e-05, + "loss": 0.0793, + "step": 4582 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.0047, + "step": 4584 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.717892847282995e-05, + "loss": 0.0157, + "step": 4586 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.716919267969884e-05, + "loss": 0.0047, + "step": 4588 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.715944288497911e-05, + "loss": 0.012, + "step": 4590 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.1495, + "step": 4592 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.713990136696734e-05, + "loss": 0.3541, + "step": 4594 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.7130109681840298e-05, + "loss": 0.0064, + "step": 4596 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.7120304071454578e-05, + "loss": 0.0053, + "step": 4598 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.0331, + "step": 4600 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7100651151536532e-05, + "loss": 0.2639, + "step": 4602 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.0801, + "step": 4604 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.708094276074344e-05, + "loss": 0.0002, + "step": 4606 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.4805, + "step": 4608 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.7061179053038894e-05, + "loss": 0.0634, + "step": 4610 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.705127650357663e-05, + "loss": 0.0121, + "step": 4612 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.704136018281859e-05, + "loss": 0.1587, + "step": 4614 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.0181, + "step": 4616 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.7021486304909202e-05, + "loss": 0.0197, + "step": 4618 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.701152878657197e-05, + "loss": 0.0055, + "step": 4620 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.700155757456711e-05, + "loss": 1.1688, + "step": 4622 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.0328, + "step": 4624 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.6981574147477214e-05, + "loss": 0.0001, + "step": 4626 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.697156197142023e-05, + "loss": 0.0946, + "step": 4628 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.696153617975168e-05, + "loss": 0.1089, + "step": 4630 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.3983, + "step": 4632 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.694144382792878e-05, + "loss": 0.2347, + "step": 4634 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.064, + "step": 4636 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6921297248971652e-05, + "loss": 0.0592, + "step": 4638 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.1446, + "step": 4640 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.690109660026701e-05, + "loss": 0.2534, + "step": 4642 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.6890976049058267e-05, + "loss": 0.0052, + "step": 4644 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.688084203962401e-05, + "loss": 0.0049, + "step": 4646 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.0178, + "step": 4648 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6860533725272953e-05, + "loss": 0.0015, + "step": 4650 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.0009, + "step": 4652 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.6840171815864085e-05, + "loss": 0.0771, + "step": 4654 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.0375, + "step": 4656 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.681975647046631e-05, + "loss": 0.0635, + "step": 4658 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.6809528809094805e-05, + "loss": 0.1313, + "step": 4660 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6799287848566024e-05, + "loss": 0.0158, + "step": 4662 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.0023, + "step": 4664 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.6778766110065765e-05, + "loss": 0.0213, + "step": 4666 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.67684853721737e-05, + "loss": 0.0211, + "step": 4668 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6758191415283066e-05, + "loss": 0.0114, + "step": 4670 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.0034, + "step": 4672 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.673756392494916e-05, + "loss": 0.2457, + "step": 4674 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.6727230431791826e-05, + "loss": 0.2208, + "step": 4676 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.671688380020769e-05, + "loss": 0.0082, + "step": 4678 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.0104, + "step": 4680 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6696151202613537e-05, + "loss": 0.0657, + "step": 4682 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.1299, + "step": 4684 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6675366294131432e-05, + "loss": 0.0083, + "step": 4686 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.0045, + "step": 4688 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.6654529237134833e-05, + "loss": 0.3237, + "step": 4690 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.66440912037967e-05, + "loss": 0.4264, + "step": 4692 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.663364019440453e-05, + "loss": 0.0931, + "step": 4694 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.0, + "step": 4696 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6612699329127467e-05, + "loss": 0.004, + "step": 4698 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.6903, + "step": 4700 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6591706804895415e-05, + "loss": 0.0166, + "step": 4702 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.3837, + "step": 4704 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6570662785703716e-05, + "loss": 0.6708, + "step": 4706 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6560121516856592e-05, + "loss": 0.072, + "step": 4708 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.654956743595001e-05, + "loss": 0.0302, + "step": 4710 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.014, + "step": 4712 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.6528420920432893e-05, + "loss": 0.0111, + "step": 4714 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.651782852712194e-05, + "loss": 0.0069, + "step": 4716 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6507223404350686e-05, + "loss": 0.5286, + "step": 4718 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.0048, + "step": 4720 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.648597505330016e-05, + "loss": 0.0089, + "step": 4722 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.6475331866519387e-05, + "loss": 0.0043, + "step": 4724 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6464676033275187e-05, + "loss": 0.0033, + "step": 4726 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.0385, + "step": 4728 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.644332651066548e-05, + "loss": 0.426, + "step": 4730 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.0419, + "step": 4732 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6421926652255275e-05, + "loss": 0.0467, + "step": 4734 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.0229, + "step": 4736 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6400476625222057e-05, + "loss": 0.0037, + "step": 4738 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6389732850821964e-05, + "loss": 0.5068, + "step": 4740 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6378976597135193e-05, + "loss": 0.2856, + "step": 4742 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.8362, + "step": 4744 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.635742673595468e-05, + "loss": 0.0543, + "step": 4746 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.1737, + "step": 4748 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6335827210029823e-05, + "loss": 0.0162, + "step": 4750 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.2757, + "step": 4752 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6314178188097917e-05, + "loss": 0.9768, + "step": 4754 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6303335168965495e-05, + "loss": 0.4329, + "step": 4756 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.6292479839282904e-05, + "loss": 0.0528, + "step": 4758 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.1169, + "step": 4760 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.627073233309409e-05, + "loss": 0.159, + "step": 4762 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.625984019906122e-05, + "loss": 0.0001, + "step": 4764 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.624893583942482e-05, + "loss": 0.0002, + "step": 4766 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.167, + "step": 4768 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6227090528551058e-05, + "loss": 0.089, + "step": 4770 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6216149619978057e-05, + "loss": 0.0431, + "step": 4772 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6205196571130204e-05, + "loss": 0.009, + "step": 4774 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.4781, + "step": 4776 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.618325413819967e-05, + "loss": 0.0085, + "step": 4778 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.0167, + "step": 4780 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6161263401175555e-05, + "loss": 0.6417, + "step": 4782 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.1041, + "step": 4784 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.613922453185133e-05, + "loss": 0.0194, + "step": 4786 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.612818710136499e-05, + "loss": 0.0726, + "step": 4788 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6117137702396454e-05, + "loss": 0.7044, + "step": 4790 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.2246, + "step": 4792 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6095003085355103e-05, + "loss": 0.0477, + "step": 4794 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.0462, + "step": 4796 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6072820853644688e-05, + "loss": 0.0431, + "step": 4798 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.1948, + "step": 4800 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.6050591180554658e-05, + "loss": 0.1897, + "step": 4802 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.60394586077466e-05, + "loss": 0.0005, + "step": 4804 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6028314239745068e-05, + "loss": 0.0033, + "step": 4806 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.1555, + "step": 4808 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.6005990205245226e-05, + "loss": 0.1044, + "step": 4810 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.1461, + "step": 4812 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5983619251452334e-05, + "loss": 0.3041, + "step": 4814 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.4083, + "step": 4816 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.596120155313017e-05, + "loss": 0.0573, + "step": 4818 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.594997522948413e-05, + "loss": 0.3077, + "step": 4820 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.593873728540759e-05, + "loss": 0.0709, + "step": 4822 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.0349, + "step": 4824 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5916226623777346e-05, + "loss": 0.3513, + "step": 4826 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.0549, + "step": 4828 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.5893669744094587e-05, + "loss": 0.0569, + "step": 4830 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.0179, + "step": 4832 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5871066822575526e-05, + "loss": 0.0106, + "step": 4834 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5859748151293354e-05, + "loss": 0.1422, + "step": 4836 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5848418035796064e-05, + "loss": 0.0975, + "step": 4838 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.0267, + "step": 4840 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.5825723560690396e-05, + "loss": 0.0037, + "step": 4842 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.581435924540482e-05, + "loss": 0.1899, + "step": 4844 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.580298357454967e-05, + "loss": 0.0834, + "step": 4846 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.0013, + "step": 4848 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5780198255020485e-05, + "loss": 0.0181, + "step": 4850 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5768788650846674e-05, + "loss": 0.1624, + "step": 4852 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.5757367780103672e-05, + "loss": 0.0649, + "step": 4854 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.5367, + "step": 4856 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5734492328152796e-05, + "loss": 0.0002, + "step": 4858 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.1711, + "step": 4860 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5711572077872784e-05, + "loss": 0.0298, + "step": 4862 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.1438, + "step": 4864 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.568860720831852e-05, + "loss": 0.0877, + "step": 4866 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.5677108097363565e-05, + "loss": 0.236, + "step": 4868 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5665597898893508e-05, + "loss": 0.0428, + "step": 4870 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.0694, + "step": 4872 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5642544329348316e-05, + "loss": 0.133, + "step": 4874 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.0262, + "step": 4876 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.5619446679779367e-05, + "loss": 0.1406, + "step": 4878 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.7344, + "step": 4880 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5596305130627414e-05, + "loss": 0.6747, + "step": 4882 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5584717950189373e-05, + "loss": 0.0887, + "step": 4884 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5573119862676155e-05, + "loss": 0.02, + "step": 4886 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.4013, + "step": 4888 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.554989105705083e-05, + "loss": 0.7627, + "step": 4890 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.0254, + "step": 4892 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5526618895216786e-05, + "loss": 0.0183, + "step": 4894 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.0455, + "step": 4896 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5503303558978112e-05, + "loss": 0.0529, + "step": 4898 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.1761, + "step": 4900 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.547994523047609e-05, + "loss": 0.0535, + "step": 4902 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.1445, + "step": 4904 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.545654409218794e-05, + "loss": 0.0026, + "step": 4906 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.544482752648966e-05, + "loss": 0.0048, + "step": 4908 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5433100326925298e-05, + "loss": 0.0062, + "step": 4910 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.2465, + "step": 4912 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.5409614117832797e-05, + "loss": 0.0974, + "step": 4914 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.539785515417377e-05, + "loss": 0.0977, + "step": 4916 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.538608564838665e-05, + "loss": 0.0335, + "step": 4918 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.0092, + "step": 4920 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5362515102393244e-05, + "loss": 0.0216, + "step": 4922 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.0098, + "step": 4924 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.5338902663987564e-05, + "loss": 0.1448, + "step": 4926 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.098, + "step": 4928 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.531524851763198e-05, + "loss": 0.0596, + "step": 4930 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.5303405861706567e-05, + "loss": 0.0437, + "step": 4932 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.529155284811464e-05, + "loss": 0.5414, + "step": 4934 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.133, + "step": 4936 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5267815840548067e-05, + "loss": 0.0359, + "step": 4938 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.3837, + "step": 4940 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5244037680367739e-05, + "loss": 0.0035, + "step": 4942 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.0843, + "step": 4944 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.522021855333061e-05, + "loss": 0.6196, + "step": 4946 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.5208293685377362e-05, + "loss": 0.0255, + "step": 4948 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.519635864551371e-05, + "loss": 0.0007, + "step": 4950 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.1788, + "step": 4952 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5172458143312548e-05, + "loss": 0.0568, + "step": 4954 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.1111, + "step": 4956 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5148517233439858e-05, + "loss": 0.133, + "step": 4958 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.119, + "step": 4960 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.512453610292402e-05, + "loss": 0.0975, + "step": 4962 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.5112530513457251e-05, + "loss": 0.0018, + "step": 4964 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.5100514939107598e-05, + "loss": 0.0984, + "step": 4966 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.0364, + "step": 4968 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5076453929645927e-05, + "loss": 0.0005, + "step": 4970 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.0912, + "step": 4972 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.505235326250563e-05, + "loss": 0.0017, + "step": 4974 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.0185, + "step": 4976 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5028213125963054e-05, + "loss": 0.002, + "step": 4978 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5016128315586636e-05, + "loss": 0.0157, + "step": 4980 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.5004033708602977e-05, + "loss": 0.0186, + "step": 4982 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.001, + "step": 4984 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4979815199317011e-05, + "loss": 0.1908, + "step": 4986 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.023, + "step": 4988 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.495555778730216e-05, + "loss": 0.2409, + "step": 4990 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.0812, + "step": 4992 + }, + { + "epoch": 1.0, + "learning_rate": 1.4931261662059333e-05, + "loss": 0.0033, + "step": 4994 + }, + { + "epoch": 1.0, + "learning_rate": 1.4919099141279214e-05, + "loss": 0.0039, + "step": 4996 + }, + { + "epoch": 1.0, + "step": 4996, + "total_flos": 3.2354709008285696e+16, + "train_loss": 0.15874619513870436, + "train_runtime": 10376.8862, + "train_samples_per_second": 1.926, + "train_steps_per_second": 0.481 + } + ], + "logging_steps": 2, + "max_steps": 4996, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 3.2354709008285696e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5fa4b85a03a5e3b77ccbf46ba1a5e1f341209d51 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8a104dd8875994d99946e448926a36839e91f01795076021e5b43279507564f +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..86dd4974df11908e109e75c2e66dcca185bcee61 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0c6f27f6f132a7c62b63254e39aea65996994f1fb5b4dc72767866eb82a2789 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..86b0d9f61962aaa64c6330e8e6e5ebd2db9eee5d --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80ac8786a4a477c740ffc6c3d86f50eeae8dd9a2dee081c488c10f1721ba25e4 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..86eeda3d84cf6d15f00b9697820256e86be04a3b --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_coincide_scenario12_new_10000_random0_25_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:077edcaf7500081fd82bad877900afa2f60359d29866243aaefc8831da32758d +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9220d233550fc9edfe9efb54ad5276ea15abe4d2 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,3776 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1249, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.2903, + "step": 2 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.1414, + "step": 4 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.3564, + "step": 6 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.2415, + "step": 8 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.342, + "step": 10 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.2286, + "step": 12 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.2281, + "step": 14 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.4142, + "step": 16 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.5099, + "step": 18 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.1902, + "step": 20 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.1767, + "step": 22 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.2834, + "step": 24 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.2944, + "step": 26 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.2856, + "step": 28 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.2376, + "step": 30 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.3753, + "step": 32 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.2576, + "step": 34 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.3993, + "step": 36 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.2434, + "step": 38 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.1838, + "step": 40 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.1866, + "step": 42 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.17, + "step": 44 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.2443, + "step": 46 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.3971, + "step": 48 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.3417, + "step": 50 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.2133, + "step": 52 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.2903, + "step": 54 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.2477, + "step": 56 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.2156, + "step": 58 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.2449, + "step": 60 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.1624, + "step": 62 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.3443, + "step": 64 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.2865, + "step": 66 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.526, + "step": 68 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.1375, + "step": 70 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.3607, + "step": 72 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.2887, + "step": 74 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.2947, + "step": 76 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.4337, + "step": 78 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.1906, + "step": 80 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.4154, + "step": 82 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.2858, + "step": 84 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.2044, + "step": 86 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.0576, + "step": 88 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.1437, + "step": 90 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.3809, + "step": 92 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.4291, + "step": 94 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.364, + "step": 96 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.3476, + "step": 98 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.2717, + "step": 100 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.4651, + "step": 102 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.2861, + "step": 104 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.2752, + "step": 106 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.2478, + "step": 108 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.3811, + "step": 110 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.2803, + "step": 112 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.2036, + "step": 114 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.2354, + "step": 116 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.429, + "step": 118 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.3091, + "step": 120 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.3097, + "step": 122 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.2233, + "step": 124 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.2093, + "step": 126 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.1945, + "step": 128 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.2446, + "step": 130 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.2351, + "step": 132 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.363, + "step": 134 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.2161, + "step": 136 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.2356, + "step": 138 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.2657, + "step": 140 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.4139, + "step": 142 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.301, + "step": 144 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.4299, + "step": 146 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.3969, + "step": 148 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.2901, + "step": 150 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.1541, + "step": 152 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.4732, + "step": 154 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.4834, + "step": 156 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.1607, + "step": 158 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.4084, + "step": 160 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.1804, + "step": 162 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.4073, + "step": 164 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.2155, + "step": 166 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.2354, + "step": 168 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.301, + "step": 170 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.3257, + "step": 172 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.3191, + "step": 174 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.3262, + "step": 176 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.2321, + "step": 178 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.2706, + "step": 180 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.1911, + "step": 182 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.6786, + "step": 184 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.4033, + "step": 186 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.5998, + "step": 188 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.27, + "step": 190 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.2624, + "step": 192 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.261, + "step": 194 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.3798, + "step": 196 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.2464, + "step": 198 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.229, + "step": 200 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.2082, + "step": 202 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.0794, + "step": 204 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.204, + "step": 206 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.2751, + "step": 208 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.3567, + "step": 210 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.194, + "step": 212 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.2544, + "step": 214 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.3186, + "step": 216 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.1946, + "step": 218 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.1584, + "step": 220 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.2897, + "step": 222 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.3518, + "step": 224 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.399, + "step": 226 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.4025, + "step": 228 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.1331, + "step": 230 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.1915, + "step": 232 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.0811, + "step": 234 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.3639, + "step": 236 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.3095, + "step": 238 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.2954, + "step": 240 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.2803, + "step": 242 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.3606, + "step": 244 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.4747, + "step": 246 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.3564, + "step": 248 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.6027, + "step": 250 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.3639, + "step": 252 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.2614, + "step": 254 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.2291, + "step": 256 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.4086, + "step": 258 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.1942, + "step": 260 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.3955, + "step": 262 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.2556, + "step": 264 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.3186, + "step": 266 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.3044, + "step": 268 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.5136, + "step": 270 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.151, + "step": 272 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.1981, + "step": 274 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.2479, + "step": 276 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.1826, + "step": 278 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.1712, + "step": 280 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.3182, + "step": 282 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.3801, + "step": 284 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.7529, + "step": 286 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.3857, + "step": 288 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.265, + "step": 290 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.6295, + "step": 292 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.2772, + "step": 294 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.2187, + "step": 296 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.2114, + "step": 298 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.9093, + "step": 300 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.6105, + "step": 302 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.3887, + "step": 304 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.1576, + "step": 306 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.4897, + "step": 308 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.2143, + "step": 310 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.2139, + "step": 312 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.2611, + "step": 314 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.2608, + "step": 316 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.6007, + "step": 318 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.4326, + "step": 320 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.2655, + "step": 322 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.153, + "step": 324 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.3059, + "step": 326 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.2758, + "step": 328 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.1941, + "step": 330 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.2526, + "step": 332 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.188, + "step": 334 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.2161, + "step": 336 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.4512, + "step": 338 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.1424, + "step": 340 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.242, + "step": 342 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.5488, + "step": 344 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.5284, + "step": 346 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.3832, + "step": 348 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.3525, + "step": 350 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.3334, + "step": 352 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.2933, + "step": 354 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.4692, + "step": 356 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.295, + "step": 358 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.2235, + "step": 360 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.4145, + "step": 362 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.4087, + "step": 364 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.3093, + "step": 366 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.253, + "step": 368 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.2507, + "step": 370 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.1077, + "step": 372 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.2479, + "step": 374 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.3662, + "step": 376 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.2715, + "step": 378 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.1901, + "step": 380 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.427, + "step": 382 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.1728, + "step": 384 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.3839, + "step": 386 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.4501, + "step": 388 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.2978, + "step": 390 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.3775, + "step": 392 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.3263, + "step": 394 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.2357, + "step": 396 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.2358, + "step": 398 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.3523, + "step": 400 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.5192, + "step": 402 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.2523, + "step": 404 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.2411, + "step": 406 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.358, + "step": 408 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.2523, + "step": 410 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.1498, + "step": 412 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.371, + "step": 414 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.4179, + "step": 416 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.3325, + "step": 418 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.3577, + "step": 420 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.2958, + "step": 422 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.1022, + "step": 424 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.2703, + "step": 426 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.1981, + "step": 428 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.3417, + "step": 430 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.1279, + "step": 432 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.3149, + "step": 434 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.3445, + "step": 436 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.5742, + "step": 438 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.3941, + "step": 440 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.1528, + "step": 442 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.2585, + "step": 444 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.5539, + "step": 446 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.4153, + "step": 448 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.4326, + "step": 450 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.2197, + "step": 452 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.3346, + "step": 454 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.3057, + "step": 456 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.2272, + "step": 458 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.3636, + "step": 460 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.4178, + "step": 462 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.4713, + "step": 464 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.1643, + "step": 466 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.4181, + "step": 468 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.5728, + "step": 470 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.295, + "step": 472 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.3491, + "step": 474 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.2327, + "step": 476 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.6588, + "step": 478 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.2952, + "step": 480 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.4028, + "step": 482 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.2079, + "step": 484 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.1204, + "step": 486 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.2889, + "step": 488 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.2378, + "step": 490 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.5849, + "step": 492 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.1786, + "step": 494 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.3802, + "step": 496 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.3521, + "step": 498 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.2197, + "step": 500 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.4146, + "step": 502 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.3185, + "step": 504 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.1355, + "step": 506 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.2762, + "step": 508 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.4326, + "step": 510 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.2247, + "step": 512 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.6592, + "step": 514 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.3659, + "step": 516 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.2576, + "step": 518 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.3176, + "step": 520 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.3187, + "step": 522 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.4297, + "step": 524 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.383, + "step": 526 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.3178, + "step": 528 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.1942, + "step": 530 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.349, + "step": 532 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.2974, + "step": 534 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.528, + "step": 536 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.2817, + "step": 538 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.4088, + "step": 540 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.2257, + "step": 542 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.1423, + "step": 544 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.4715, + "step": 546 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.2137, + "step": 548 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.3578, + "step": 550 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.381, + "step": 552 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.2471, + "step": 554 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.3642, + "step": 556 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.3177, + "step": 558 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.234, + "step": 560 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.2379, + "step": 562 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.4404, + "step": 564 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.6368, + "step": 566 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.3803, + "step": 568 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.3813, + "step": 570 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.2612, + "step": 572 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.2377, + "step": 574 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.2904, + "step": 576 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.1953, + "step": 578 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.3054, + "step": 580 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.3179, + "step": 582 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.3263, + "step": 584 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.2193, + "step": 586 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.1603, + "step": 588 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.3056, + "step": 590 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.3012, + "step": 592 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.5593, + "step": 594 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.3178, + "step": 596 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.4248, + "step": 598 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.2739, + "step": 600 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.2863, + "step": 602 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.2465, + "step": 604 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.2094, + "step": 606 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.3215, + "step": 608 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.301, + "step": 610 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.2581, + "step": 612 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.2955, + "step": 614 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.6412, + "step": 616 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.2727, + "step": 618 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.5029, + "step": 620 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.5423, + "step": 622 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.573, + "step": 624 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.5288, + "step": 626 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.5491, + "step": 628 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.4164, + "step": 630 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.3098, + "step": 632 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.3816, + "step": 634 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.2982, + "step": 636 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.3891, + "step": 638 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.3531, + "step": 640 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.4591, + "step": 642 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.2072, + "step": 644 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.2968, + "step": 646 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.3706, + "step": 648 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.3769, + "step": 650 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.2334, + "step": 652 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.5088, + "step": 654 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.2041, + "step": 656 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.3667, + "step": 658 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.3333, + "step": 660 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.2821, + "step": 662 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.2258, + "step": 664 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.3455, + "step": 666 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.2789, + "step": 668 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.1532, + "step": 670 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.4843, + "step": 672 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.1098, + "step": 674 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.6409, + "step": 676 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.2029, + "step": 678 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.2365, + "step": 680 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.2559, + "step": 682 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.3269, + "step": 684 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.3538, + "step": 686 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.2044, + "step": 688 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.2178, + "step": 690 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.4298, + "step": 692 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.2489, + "step": 694 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.3354, + "step": 696 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.3572, + "step": 698 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.2201, + "step": 700 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.6081, + "step": 702 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.2074, + "step": 704 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.272, + "step": 706 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.3499, + "step": 708 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.2571, + "step": 710 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.4096, + "step": 712 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.2561, + "step": 714 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.4921, + "step": 716 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.2173, + "step": 718 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.31, + "step": 720 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.205, + "step": 722 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.2309, + "step": 724 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.2162, + "step": 726 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.3833, + "step": 728 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.3615, + "step": 730 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.3381, + "step": 732 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.3071, + "step": 734 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.2172, + "step": 736 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.6295, + "step": 738 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.2147, + "step": 740 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.3329, + "step": 742 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.2848, + "step": 744 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.1857, + "step": 746 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.2687, + "step": 748 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.1438, + "step": 750 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.8083, + "step": 752 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.3965, + "step": 754 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.2767, + "step": 756 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.196, + "step": 758 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.2514, + "step": 760 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.384, + "step": 762 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.3368, + "step": 764 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.178, + "step": 766 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.4237, + "step": 768 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.3731, + "step": 770 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.2067, + "step": 772 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.9718, + "step": 774 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.8331, + "step": 776 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.6774, + "step": 778 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.1875, + "step": 780 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.1729, + "step": 782 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.4116, + "step": 784 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.3399, + "step": 786 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.5712, + "step": 788 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.3246, + "step": 790 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.5946, + "step": 792 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.3456, + "step": 794 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.3054, + "step": 796 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.3628, + "step": 798 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.4554, + "step": 800 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.1687, + "step": 802 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.3817, + "step": 804 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.498, + "step": 806 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.3245, + "step": 808 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.3523, + "step": 810 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.254, + "step": 812 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.2298, + "step": 814 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.3504, + "step": 816 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.2615, + "step": 818 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.2027, + "step": 820 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.3503, + "step": 822 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.3276, + "step": 824 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.2316, + "step": 826 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.4523, + "step": 828 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.2241, + "step": 830 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.1858, + "step": 832 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.5501, + "step": 834 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.1244, + "step": 836 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.3512, + "step": 838 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.353, + "step": 840 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.4388, + "step": 842 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.4521, + "step": 844 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.3101, + "step": 846 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.1684, + "step": 848 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.3267, + "step": 850 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.4093, + "step": 852 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.3143, + "step": 854 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.217, + "step": 856 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.2512, + "step": 858 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.3446, + "step": 860 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.2876, + "step": 862 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.5495, + "step": 864 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.4898, + "step": 866 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.4127, + "step": 868 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.2745, + "step": 870 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.3736, + "step": 872 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.9237, + "step": 874 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.615, + "step": 876 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.358, + "step": 878 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.2029, + "step": 880 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.298, + "step": 882 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.2875, + "step": 884 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.2484, + "step": 886 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.3408, + "step": 888 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.3462, + "step": 890 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.1581, + "step": 892 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.254, + "step": 894 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.2163, + "step": 896 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.3428, + "step": 898 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.3503, + "step": 900 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.1992, + "step": 902 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.2944, + "step": 904 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.2908, + "step": 906 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.3483, + "step": 908 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.4511, + "step": 910 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.3143, + "step": 912 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.2714, + "step": 914 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.3365, + "step": 916 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.2572, + "step": 918 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.1858, + "step": 920 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.1394, + "step": 922 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.2484, + "step": 924 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.556, + "step": 926 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.6835, + "step": 928 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.8031, + "step": 930 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.3514, + "step": 932 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.2651, + "step": 934 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.2983, + "step": 936 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.4556, + "step": 938 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.395, + "step": 940 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.1893, + "step": 942 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.2868, + "step": 944 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.2541, + "step": 946 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.2164, + "step": 948 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.4704, + "step": 950 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.4721, + "step": 952 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.2064, + "step": 954 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.2255, + "step": 956 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.2942, + "step": 958 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.1781, + "step": 960 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.2677, + "step": 962 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.4658, + "step": 964 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.3044, + "step": 966 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.4647, + "step": 968 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.7646, + "step": 970 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.262, + "step": 972 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.3763, + "step": 974 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.313, + "step": 976 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.5548, + "step": 978 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.5132, + "step": 980 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.3252, + "step": 982 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.2486, + "step": 984 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.4192, + "step": 986 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.1878, + "step": 988 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.4041, + "step": 990 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.2525, + "step": 992 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.3329, + "step": 994 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.3363, + "step": 996 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.26, + "step": 998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.2571, + "step": 1000 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.3111, + "step": 1002 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.5297, + "step": 1004 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.3201, + "step": 1006 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.2183, + "step": 1008 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.2665, + "step": 1010 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.3053, + "step": 1012 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.1892, + "step": 1014 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.2647, + "step": 1016 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.3649, + "step": 1018 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.4101, + "step": 1020 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.3342, + "step": 1022 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.3063, + "step": 1024 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.4045, + "step": 1026 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.4896, + "step": 1028 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.2883, + "step": 1030 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.3194, + "step": 1032 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.2316, + "step": 1034 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.3102, + "step": 1036 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.3676, + "step": 1038 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.3159, + "step": 1040 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.3417, + "step": 1042 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.365, + "step": 1044 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.1952, + "step": 1046 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.4328, + "step": 1048 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.226, + "step": 1050 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.3357, + "step": 1052 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.2814, + "step": 1054 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.3818, + "step": 1056 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.1846, + "step": 1058 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.2306, + "step": 1060 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.3648, + "step": 1062 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.3184, + "step": 1064 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.1611, + "step": 1066 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.4646, + "step": 1068 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.3392, + "step": 1070 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.3136, + "step": 1072 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.3193, + "step": 1074 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.2825, + "step": 1076 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.2269, + "step": 1078 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.2833, + "step": 1080 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.2813, + "step": 1082 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.1831, + "step": 1084 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.599, + "step": 1086 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.4159, + "step": 1088 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.14, + "step": 1090 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.2961, + "step": 1092 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.4508, + "step": 1094 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.3221, + "step": 1096 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.3192, + "step": 1098 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.364, + "step": 1100 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.8246, + "step": 1102 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.398, + "step": 1104 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.3578, + "step": 1106 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.4609, + "step": 1108 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.2512, + "step": 1110 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.6265, + "step": 1112 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.2277, + "step": 1114 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.2362, + "step": 1116 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.6171, + "step": 1118 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.5211, + "step": 1120 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.1601, + "step": 1122 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.5551, + "step": 1124 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.2709, + "step": 1126 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.3238, + "step": 1128 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.228, + "step": 1130 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.342, + "step": 1132 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.2541, + "step": 1134 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.2665, + "step": 1136 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.3273, + "step": 1138 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.3045, + "step": 1140 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.3356, + "step": 1142 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.381, + "step": 1144 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.4305, + "step": 1146 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.1989, + "step": 1148 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.2626, + "step": 1150 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.2201, + "step": 1152 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.3052, + "step": 1154 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.3722, + "step": 1156 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.2728, + "step": 1158 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.5517, + "step": 1160 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.3048, + "step": 1162 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.3998, + "step": 1164 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.3864, + "step": 1166 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.4548, + "step": 1168 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.3354, + "step": 1170 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.3056, + "step": 1172 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.2787, + "step": 1174 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.2166, + "step": 1176 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.4149, + "step": 1178 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.3334, + "step": 1180 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.3432, + "step": 1182 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.3184, + "step": 1184 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.2172, + "step": 1186 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.2767, + "step": 1188 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.3526, + "step": 1190 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.3494, + "step": 1192 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.1632, + "step": 1194 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.4487, + "step": 1196 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.3098, + "step": 1198 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.3178, + "step": 1200 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.2195, + "step": 1202 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.4592, + "step": 1204 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.2383, + "step": 1206 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.2514, + "step": 1208 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.6167, + "step": 1210 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.3192, + "step": 1212 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.3845, + "step": 1214 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.247, + "step": 1216 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.12, + "step": 1218 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.7037, + "step": 1220 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.2335, + "step": 1222 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.3816, + "step": 1224 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.183, + "step": 1226 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.2516, + "step": 1228 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.242, + "step": 1230 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.2454, + "step": 1232 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.3157, + "step": 1234 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.2644, + "step": 1236 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.2562, + "step": 1238 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.2634, + "step": 1240 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.4035, + "step": 1242 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.143, + "step": 1244 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.3527, + "step": 1246 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.2562, + "step": 1248 + }, + { + "epoch": 1.0, + "step": 1249, + "total_flos": 0, + "train_loss": 0.3264200721887515, + "train_runtime": 6533.1383, + "train_samples_per_second": 3.059, + "train_steps_per_second": 0.191 + } + ], + "logging_steps": 2, + "max_steps": 1249, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..84a6838318b59ee5db54a6a2c2a6c436db73b804 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f83722f1895f7a7c43f37e5d77f16a1590a1f2fd76e60a5c422f004434b172f9 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..ce278b3b76e6fd1ce90d06ed14334f75f8fdfa1a --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c18786aa6f514ed1b86febd0ff3a4b48b69ee7f6cdacb8a6a17c75febcbb5aa2 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a3fb43737e9270615edf415c1f1f79a57878800 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e6416a7140be394be7808aef1c78c14314e7b67607c01c968295fc3579483d3 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..4030290b14a5269705e39ab5d262d1cdea5f774d --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee6083259f6b062cf29ffb08cb7f65749a0f1408712ae1a1b5995be334150f0 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ea68bda54201e02ec3181a242fd5f8ed3896c1fa --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/0_trainer_state.json @@ -0,0 +1,7526 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2498, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.415943612351265e-06, + "loss": 0.1757, + "step": 2 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.3146, + "step": 4 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.4748, + "step": 6 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.2931, + "step": 8 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4892856843445236e-06, + "loss": 0.0775, + "step": 10 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.3062, + "step": 12 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.0782, + "step": 14 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.1719, + "step": 16 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5635665363297356e-06, + "loss": 0.454, + "step": 18 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.3048, + "step": 20 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.3501, + "step": 22 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.3041, + "step": 24 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.6387768837868565e-06, + "loss": 0.1196, + "step": 26 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.3309, + "step": 28 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.1507, + "step": 30 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.2604, + "step": 32 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.1507, + "step": 34 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.1501, + "step": 36 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.122, + "step": 38 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.2025, + "step": 40 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.7919483473136555e-06, + "loss": 0.3054, + "step": 42 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.3263, + "step": 44 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.0352, + "step": 46 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.2409, + "step": 48 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.8698903181597026e-06, + "loss": 0.2572, + "step": 50 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.1388, + "step": 52 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.909196119613218e-06, + "loss": 0.1215, + "step": 54 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.3169, + "step": 56 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9487234964233724e-06, + "loss": 0.2346, + "step": 58 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.0996, + "step": 60 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.988471213428035e-06, + "loss": 0.0508, + "step": 62 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.2968, + "step": 64 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0284380285797733e-06, + "loss": 0.1775, + "step": 66 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.1957, + "step": 68 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.068622692984767e-06, + "loss": 0.1101, + "step": 70 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.1761, + "step": 72 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.2377, + "step": 74 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.1837, + "step": 76 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.1677, + "step": 78 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.3029, + "step": 80 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1904711909051967e-06, + "loss": 0.1383, + "step": 82 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.2464, + "step": 84 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.231514627826302e-06, + "loss": 0.5593, + "step": 86 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.0603, + "step": 88 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2727695682081897e-06, + "loss": 0.3926, + "step": 90 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.3755, + "step": 92 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.314234722905302e-06, + "loss": 0.4975, + "step": 94 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.2012, + "step": 96 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.355908796203301e-06, + "loss": 0.2656, + "step": 98 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.3698, + "step": 100 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.2154, + "step": 102 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.5302, + "step": 104 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4398784831434097e-06, + "loss": 0.2053, + "step": 106 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.1236, + "step": 108 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.0929, + "step": 110 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.1226, + "step": 112 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.3226, + "step": 114 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.2305, + "step": 116 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.567367137003953e-06, + "loss": 0.1142, + "step": 118 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.1294, + "step": 120 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.6102671491780393e-06, + "loss": 0.4166, + "step": 122 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.1951, + "step": 124 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.653366829451711e-06, + "loss": 0.1275, + "step": 126 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.3484, + "step": 128 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.696664831034521e-06, + "loss": 0.2096, + "step": 130 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.2704, + "step": 132 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.740159800938784e-06, + "loss": 0.1905, + "step": 134 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.0868, + "step": 136 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.783850380021933e-06, + "loss": 0.4635, + "step": 138 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.0412, + "step": 140 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.827735203028956e-06, + "loss": 0.0696, + "step": 142 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.021, + "step": 144 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.871812898635011e-06, + "loss": 0.3174, + "step": 146 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.6806, + "step": 148 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.916082089488379e-06, + "loss": 0.2286, + "step": 150 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.498, + "step": 152 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.960541392253387e-06, + "loss": 0.3167, + "step": 154 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.0648, + "step": 156 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 4.005189417653737e-06, + "loss": 0.148, + "step": 158 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.1649, + "step": 160 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.050024770515873e-06, + "loss": 0.1655, + "step": 162 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.2286, + "step": 164 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.095046049812541e-06, + "loss": 0.3697, + "step": 166 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.1542, + "step": 168 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.1402518487066624e-06, + "loss": 0.2376, + "step": 170 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.3886, + "step": 172 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.3383, + "step": 174 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.1403, + "step": 176 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.2312113491533145e-06, + "loss": 0.2166, + "step": 178 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.1296, + "step": 180 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.276962208378814e-06, + "loss": 0.3165, + "step": 182 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.169, + "step": 184 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3228919026364345e-06, + "loss": 0.2691, + "step": 186 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.2838, + "step": 188 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.368998996702686e-06, + "loss": 0.2759, + "step": 190 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.0848, + "step": 192 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.415282049810643e-06, + "loss": 0.1768, + "step": 194 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.2069, + "step": 196 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.461739615694921e-06, + "loss": 0.6011, + "step": 198 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.1303, + "step": 200 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.5083702426369715e-06, + "loss": 0.2551, + "step": 202 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.2931, + "step": 204 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.555172473510324e-06, + "loss": 0.0391, + "step": 206 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.4543, + "step": 208 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.602144845826234e-06, + "loss": 0.3755, + "step": 210 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.1805, + "step": 212 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.649285891779326e-06, + "loss": 0.3121, + "step": 214 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.1513, + "step": 216 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.696594138293421e-06, + "loss": 0.3711, + "step": 218 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.3265, + "step": 220 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.744068107067673e-06, + "loss": 0.2321, + "step": 222 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.0677, + "step": 224 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.79170631462264e-06, + "loss": 0.403, + "step": 226 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.1842, + "step": 228 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.839507272346751e-06, + "loss": 0.3128, + "step": 230 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.3936, + "step": 232 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.5071, + "step": 234 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.0701, + "step": 236 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.935591458474425e-06, + "loss": 0.1957, + "step": 238 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.4694, + "step": 240 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9838716844133665e-06, + "loss": 0.0934, + "step": 242 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.3034, + "step": 244 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.032308655686007e-06, + "loss": 0.1455, + "step": 246 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.1127, + "step": 248 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.080900858720789e-06, + "loss": 0.3517, + "step": 250 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.2188, + "step": 252 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.129646775095432e-06, + "loss": 0.2783, + "step": 254 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.293, + "step": 256 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.178544881584328e-06, + "loss": 0.199, + "step": 258 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.3231, + "step": 260 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.2011, + "step": 262 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.1632, + "step": 264 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.2767915482720164e-06, + "loss": 0.8641, + "step": 266 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.1923, + "step": 268 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.2988, + "step": 270 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.4573, + "step": 272 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.375628578726181e-06, + "loss": 0.4961, + "step": 274 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.2237, + "step": 276 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.425264622628326e-06, + "loss": 0.1193, + "step": 278 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.2466, + "step": 280 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.475043619098321e-06, + "loss": 0.4177, + "step": 282 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.2412, + "step": 284 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.524964012628644e-06, + "loss": 0.1934, + "step": 286 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.1383, + "step": 288 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.57502424329331e-06, + "loss": 0.2469, + "step": 290 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.2778, + "step": 292 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.62522274679673e-06, + "loss": 0.1568, + "step": 294 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.5591, + "step": 296 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.675557954522462e-06, + "loss": 0.2041, + "step": 298 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.2381, + "step": 300 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.726028293582342e-06, + "loss": 0.3562, + "step": 302 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.2817, + "step": 304 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.776632186865589e-06, + "loss": 0.4518, + "step": 306 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.5542, + "step": 308 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.827368053088032e-06, + "loss": 0.2039, + "step": 310 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.1957, + "step": 312 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.878234306841637e-06, + "loss": 0.3327, + "step": 314 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.3211, + "step": 316 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.929229358643925e-06, + "loss": 0.4742, + "step": 318 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.4028, + "step": 320 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9803516149877475e-06, + "loss": 0.1953, + "step": 322 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.2952, + "step": 324 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.03159947839103e-06, + "loss": 0.3798, + "step": 326 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.8319, + "step": 328 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.082971347446654e-06, + "loss": 0.1467, + "step": 330 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.17, + "step": 332 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.13446561687258e-06, + "loss": 0.1881, + "step": 334 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.3319, + "step": 336 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.186080677561974e-06, + "loss": 0.2161, + "step": 338 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.5048, + "step": 340 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.237814916633431e-06, + "loss": 0.2691, + "step": 342 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.1782, + "step": 344 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.289666717481496e-06, + "loss": 0.2307, + "step": 346 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.3117, + "step": 348 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.341634459827044e-06, + "loss": 0.36, + "step": 350 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.2222, + "step": 352 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.393716519768032e-06, + "loss": 0.1159, + "step": 354 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.3075, + "step": 356 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.445911269830183e-06, + "loss": 0.7767, + "step": 358 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.0887, + "step": 360 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.498217079017806e-06, + "loss": 0.5453, + "step": 362 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.0914, + "step": 364 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.2192, + "step": 366 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.4225, + "step": 368 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.603155333485934e-06, + "loss": 0.2285, + "step": 370 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.2864, + "step": 372 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.655784499627476e-06, + "loss": 0.1165, + "step": 374 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.3206, + "step": 376 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.7085181667191e-06, + "loss": 0.1438, + "step": 378 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.1251, + "step": 380 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.761354686924883e-06, + "loss": 0.2778, + "step": 382 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.2944, + "step": 384 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.8142924091949955e-06, + "loss": 0.3024, + "step": 386 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.1695, + "step": 388 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.867329679317144e-06, + "loss": 0.3764, + "step": 390 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.2609, + "step": 392 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.920464839968391e-06, + "loss": 0.0808, + "step": 394 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.2689, + "step": 396 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.973696230766884e-06, + "loss": 0.3236, + "step": 398 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.3469, + "step": 400 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.027022188323704e-06, + "loss": 0.1666, + "step": 402 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.2508, + "step": 404 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.080441046294945e-06, + "loss": 0.1196, + "step": 406 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.3571, + "step": 408 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.133951135433656e-06, + "loss": 0.1194, + "step": 410 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.0726, + "step": 412 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.18755078364214e-06, + "loss": 0.4341, + "step": 414 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.0686, + "step": 416 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.241238316024064e-06, + "loss": 0.6515, + "step": 418 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.1301, + "step": 420 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.4152, + "step": 422 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.2178, + "step": 424 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.348870320044395e-06, + "loss": 0.2538, + "step": 426 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.8807, + "step": 428 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.402811428368824e-06, + "loss": 0.0945, + "step": 430 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.1143, + "step": 432 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.2428, + "step": 434 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.2581, + "step": 436 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.510935429867233e-06, + "loss": 0.2846, + "step": 438 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.2507, + "step": 440 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.5651149443531846e-06, + "loss": 0.0886, + "step": 442 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.2461, + "step": 444 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.619370544785608e-06, + "loss": 0.3631, + "step": 446 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.2862, + "step": 448 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.67370053577085e-06, + "loss": 0.4121, + "step": 450 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.1209, + "step": 452 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.728103219590684e-06, + "loss": 0.2457, + "step": 454 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.2927, + "step": 456 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.7825768962553e-06, + "loss": 0.2778, + "step": 458 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.2691, + "step": 460 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.83711986355656e-06, + "loss": 0.1952, + "step": 462 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.1762, + "step": 464 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.891730417121043e-06, + "loss": 0.2695, + "step": 466 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.1501, + "step": 468 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.946406850463435e-06, + "loss": 0.2235, + "step": 470 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.2527, + "step": 472 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 8.001147455039737e-06, + "loss": 0.2393, + "step": 474 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.1437, + "step": 476 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.055950520300756e-06, + "loss": 0.2818, + "step": 478 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.2376, + "step": 480 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.110814333745503e-06, + "loss": 0.3043, + "step": 482 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.1607, + "step": 484 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.165737180974676e-06, + "loss": 0.4739, + "step": 486 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.2351, + "step": 488 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.220717345744326e-06, + "loss": 0.2772, + "step": 490 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.3184, + "step": 492 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.275753110019367e-06, + "loss": 0.264, + "step": 494 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.1673, + "step": 496 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.330842754027378e-06, + "loss": 0.081, + "step": 498 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.3094, + "step": 500 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.385984556312285e-06, + "loss": 0.5189, + "step": 502 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.1672, + "step": 504 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.441176793788106e-06, + "loss": 0.1464, + "step": 506 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.272, + "step": 508 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.496417741792922e-06, + "loss": 0.2757, + "step": 510 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.2393, + "step": 512 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.551705674142616e-06, + "loss": 0.1534, + "step": 514 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.2465, + "step": 516 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.607038863184952e-06, + "loss": 0.1102, + "step": 518 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.4567, + "step": 520 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.662415579853495e-06, + "loss": 0.052, + "step": 522 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.2537, + "step": 524 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.717834093721598e-06, + "loss": 0.0955, + "step": 526 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.0945, + "step": 528 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.773292673056572e-06, + "loss": 0.0979, + "step": 530 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.1587, + "step": 532 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.828789584873757e-06, + "loss": 0.1522, + "step": 534 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.3214, + "step": 536 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.884323094990613e-06, + "loss": 0.7849, + "step": 538 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.2705, + "step": 540 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.939891468081036e-06, + "loss": 0.3347, + "step": 542 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.312, + "step": 544 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.995492967729449e-06, + "loss": 0.2077, + "step": 546 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.5325, + "step": 548 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.051125856485175e-06, + "loss": 0.3148, + "step": 550 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.2705, + "step": 552 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.106788395916682e-06, + "loss": 0.3641, + "step": 554 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.3094, + "step": 556 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.162478846665854e-06, + "loss": 0.1135, + "step": 558 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.2579, + "step": 560 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.218195468502469e-06, + "loss": 0.492, + "step": 562 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.3837, + "step": 564 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.273936520378426e-06, + "loss": 0.2131, + "step": 566 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.5955, + "step": 568 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.329700260482286e-06, + "loss": 0.298, + "step": 570 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.1058, + "step": 572 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.38548494629364e-06, + "loss": 0.3346, + "step": 574 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.1455, + "step": 576 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.441288834637507e-06, + "loss": 0.3713, + "step": 578 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.2567, + "step": 580 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.497110181738935e-06, + "loss": 0.4603, + "step": 582 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.2698, + "step": 584 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.552947243277342e-06, + "loss": 0.4166, + "step": 586 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.4154, + "step": 588 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.608798274441153e-06, + "loss": 0.3659, + "step": 590 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.288, + "step": 592 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.664661529982263e-06, + "loss": 0.6109, + "step": 594 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.3806, + "step": 596 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.720535264270526e-06, + "loss": 0.2535, + "step": 598 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.1511, + "step": 600 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.776417731348403e-06, + "loss": 0.3331, + "step": 602 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.8799, + "step": 604 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.832307184985473e-06, + "loss": 0.4972, + "step": 606 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.1153, + "step": 608 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.888201878732946e-06, + "loss": 0.4513, + "step": 610 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.0866, + "step": 612 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.944100065978354e-06, + "loss": 0.1639, + "step": 614 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.2583, + "step": 616 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.999999999999996e-06, + "loss": 0.1511, + "step": 618 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.101, + "step": 620 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.088, + "step": 622 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.343, + "step": 624 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0111798121267047e-05, + "loss": 0.3348, + "step": 626 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.3043, + "step": 628 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.016769281501452e-05, + "loss": 0.0686, + "step": 630 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.4924, + "step": 632 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.022358226865159e-05, + "loss": 0.179, + "step": 634 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.4161, + "step": 636 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.1533, + "step": 638 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.1463, + "step": 640 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.033533847001773e-05, + "loss": 0.1664, + "step": 642 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.2287, + "step": 644 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.039120172555884e-05, + "loss": 0.352, + "step": 646 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.2459, + "step": 648 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0447052756722651e-05, + "loss": 0.147, + "step": 650 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.111, + "step": 652 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.8034, + "step": 654 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.0999, + "step": 656 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.0558711165362488e-05, + "loss": 0.5107, + "step": 658 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.1508, + "step": 660 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.46, + "step": 662 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.1847, + "step": 664 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.0670299739517706e-05, + "loss": 0.2874, + "step": 666 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.2952, + "step": 668 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.3167, + "step": 670 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.2192, + "step": 672 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0781804531497525e-05, + "loss": 0.2049, + "step": 674 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.142, + "step": 676 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.083752115333414e-05, + "loss": 0.3636, + "step": 678 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.1554, + "step": 680 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.0893211604083311e-05, + "loss": 0.2077, + "step": 682 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.4589, + "step": 684 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.3858, + "step": 686 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.1693, + "step": 688 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.1004507032270544e-05, + "loss": 0.4269, + "step": 690 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.1767, + "step": 692 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.1304, + "step": 694 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.4525, + "step": 696 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.111567690500938e-05, + "loss": 0.0835, + "step": 698 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.2658, + "step": 700 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.3637, + "step": 702 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.0766, + "step": 704 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.122670732694342e-05, + "loss": 0.0878, + "step": 706 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.1512, + "step": 708 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.6163, + "step": 710 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.3853, + "step": 712 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1337584420146496e-05, + "loss": 0.3328, + "step": 714 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.3501, + "step": 716 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.1953, + "step": 718 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.2465, + "step": 720 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1448294325857377e-05, + "loss": 0.2026, + "step": 722 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.1587, + "step": 724 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.150358225820707e-05, + "loss": 0.4342, + "step": 726 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.1901, + "step": 728 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1558823206211887e-05, + "loss": 0.097, + "step": 730 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.4326, + "step": 732 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.5731, + "step": 734 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.3812, + "step": 736 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1669157245972616e-05, + "loss": 0.2712, + "step": 738 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.1854, + "step": 740 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.6397, + "step": 742 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.2468, + "step": 744 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.1779282654255668e-05, + "loss": 0.1908, + "step": 746 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.4075, + "step": 748 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.1726, + "step": 750 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.2609, + "step": 752 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.188918566625449e-05, + "loss": 0.2262, + "step": 754 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.2958, + "step": 756 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.3629, + "step": 758 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.2706, + "step": 760 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1998852544960256e-05, + "loss": 0.1885, + "step": 762 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.3559, + "step": 764 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.3798, + "step": 766 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.3634, + "step": 768 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.210826958287895e-05, + "loss": 0.0566, + "step": 770 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.5112, + "step": 772 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.2579, + "step": 774 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.1835, + "step": 776 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.2217423103744692e-05, + "loss": 0.3909, + "step": 778 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.0585, + "step": 780 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.5295, + "step": 782 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.4067, + "step": 784 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2326299464229143e-05, + "loss": 0.5494, + "step": 786 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.2105, + "step": 788 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.2494, + "step": 790 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.1844, + "step": 792 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2434885055646808e-05, + "loss": 0.0701, + "step": 794 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.1013, + "step": 796 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.1912, + "step": 798 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.0976, + "step": 800 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2543166305656089e-05, + "loss": 0.376, + "step": 802 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.2782, + "step": 804 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.259718857163117e-05, + "loss": 0.21, + "step": 806 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.5085, + "step": 808 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2651129679955598e-05, + "loss": 0.3488, + "step": 810 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.3642, + "step": 812 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.2467, + "step": 814 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.2426, + "step": 816 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2758761683975929e-05, + "loss": 0.1364, + "step": 818 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.4922, + "step": 820 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.1384, + "step": 822 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.3823, + "step": 824 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2866048864566336e-05, + "loss": 0.2426, + "step": 826 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.3239, + "step": 828 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.6351, + "step": 830 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.2564, + "step": 832 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2972977811676289e-05, + "loss": 0.1204, + "step": 834 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.4836, + "step": 836 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.6292, + "step": 838 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.1024, + "step": 840 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3079535160031601e-05, + "loss": 0.2525, + "step": 842 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.3958, + "step": 844 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.313267032068285e-05, + "loss": 0.2527, + "step": 846 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.3217, + "step": 848 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3185707590804997e-05, + "loss": 0.3802, + "step": 850 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.4561, + "step": 852 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.5112, + "step": 854 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.261, + "step": 856 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3291481833280894e-05, + "loss": 0.4738, + "step": 858 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.1244, + "step": 860 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.5855, + "step": 862 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.2148, + "step": 864 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3396844666514062e-05, + "loss": 0.2262, + "step": 866 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.2616, + "step": 868 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.344936768713513e-05, + "loss": 0.2214, + "step": 870 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.3332, + "step": 872 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.3501782920982189e-05, + "loss": 0.6014, + "step": 874 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.3634, + "step": 876 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.1967, + "step": 878 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.3127, + "step": 880 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3606283480231962e-05, + "loss": 0.2422, + "step": 882 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.4698, + "step": 884 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.278, + "step": 886 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.3511, + "step": 888 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3710333282518497e-05, + "loss": 0.4351, + "step": 890 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.0777, + "step": 892 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3762185083366562e-05, + "loss": 1.0026, + "step": 894 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.2428, + "step": 896 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3813919322438018e-05, + "loss": 0.1825, + "step": 898 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.1073, + "step": 900 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.2377, + "step": 902 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.2862, + "step": 904 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.391702865255334e-05, + "loss": 0.3219, + "step": 906 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.0618, + "step": 908 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.2822, + "step": 910 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.1798, + "step": 912 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4019648385012245e-05, + "loss": 0.19, + "step": 914 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.2119, + "step": 916 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.2781, + "step": 918 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.2785, + "step": 920 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4121765693158355e-05, + "loss": 0.2166, + "step": 922 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.3819, + "step": 924 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.3234, + "step": 926 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.2964, + "step": 928 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4223367813134406e-05, + "loss": 0.2478, + "step": 930 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.1258, + "step": 932 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.1522, + "step": 934 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.2411, + "step": 936 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.4324442045477534e-05, + "loss": 0.3463, + "step": 938 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.2645, + "step": 940 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.3761, + "step": 942 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.1042, + "step": 944 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4424975756706684e-05, + "loss": 0.5171, + "step": 946 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.1163, + "step": 948 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.0624, + "step": 950 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.4574, + "step": 952 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4524956380901674e-05, + "loss": 0.1466, + "step": 954 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.133, + "step": 956 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.0988, + "step": 958 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.2128, + "step": 960 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4624371421273812e-05, + "loss": 0.3484, + "step": 962 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.1111, + "step": 964 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.6983, + "step": 966 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.7469, + "step": 968 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4723208451727977e-05, + "loss": 0.2583, + "step": 970 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.778, + "step": 972 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.2623, + "step": 974 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.1956, + "step": 976 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4821455118415666e-05, + "loss": 0.3502, + "step": 978 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.2353, + "step": 980 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.15, + "step": 982 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.3047, + "step": 984 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4919099141279205e-05, + "loss": 0.2879, + "step": 986 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.2037, + "step": 988 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.1555, + "step": 990 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.2984, + "step": 992 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5016128315586626e-05, + "loss": 0.3494, + "step": 994 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.2962, + "step": 996 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.3216, + "step": 998 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.1683, + "step": 1000 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.5112530513457229e-05, + "loss": 0.2896, + "step": 1002 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.2631, + "step": 1004 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.4151, + "step": 1006 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.3125, + "step": 1008 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5208293685377354e-05, + "loss": 0.3836, + "step": 1010 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.277, + "step": 1012 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.3321, + "step": 1014 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.3132, + "step": 1016 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.5303405861706574e-05, + "loss": 0.1702, + "step": 1018 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.403, + "step": 1020 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.4544, + "step": 1022 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.148, + "step": 1024 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.539785515417376e-05, + "loss": 0.2234, + "step": 1026 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.3132, + "step": 1028 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.1908, + "step": 1030 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.2949, + "step": 1032 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5491629757363026e-05, + "loss": 0.1784, + "step": 1034 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.2707, + "step": 1036 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.5597, + "step": 1038 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.2622, + "step": 1040 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.5584717950189353e-05, + "loss": 0.5114, + "step": 1042 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.2025, + "step": 1044 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.563100100329731e-05, + "loss": 0.3266, + "step": 1046 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.7817, + "step": 1048 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.567710809736356e-05, + "loss": 0.2389, + "step": 1050 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.1799, + "step": 1052 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.572303779162118e-05, + "loss": 0.1905, + "step": 1054 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.2662, + "step": 1056 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5768788650846677e-05, + "loss": 0.2414, + "step": 1058 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.4353, + "step": 1060 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.581435924540481e-05, + "loss": 0.3293, + "step": 1062 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.2534, + "step": 1064 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5859748151293333e-05, + "loss": 0.2068, + "step": 1066 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.373, + "step": 1068 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.1462, + "step": 1070 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.3699, + "step": 1072 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.594997522948412e-05, + "loss": 0.2029, + "step": 1074 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.255, + "step": 1076 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.599481058234626e-05, + "loss": 0.409, + "step": 1078 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.2614, + "step": 1080 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.6039458607746607e-05, + "loss": 0.4808, + "step": 1082 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.3349, + "step": 1084 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.5853, + "step": 1086 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.4155, + "step": 1088 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.6128187101364982e-05, + "loss": 0.5683, + "step": 1090 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.3072, + "step": 1092 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.617226479697104e-05, + "loss": 0.3223, + "step": 1094 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.271, + "step": 1096 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.621614961997806e-05, + "loss": 0.2387, + "step": 1098 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.1096, + "step": 1100 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.1591, + "step": 1102 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.3224, + "step": 1104 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6303335168965474e-05, + "loss": 0.4509, + "step": 1106 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.2859, + "step": 1108 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.4602, + "step": 1110 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.1955, + "step": 1112 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6389732850821957e-05, + "loss": 0.1756, + "step": 1114 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.2381, + "step": 1116 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.2783, + "step": 1118 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.229, + "step": 1120 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.6475331866519377e-05, + "loss": 0.2867, + "step": 1122 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.3959, + "step": 1124 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.3648, + "step": 1126 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.175, + "step": 1128 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6560121516856586e-05, + "loss": 0.2987, + "step": 1130 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.3171, + "step": 1132 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.3947, + "step": 1134 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.1341, + "step": 1136 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6644091203796694e-05, + "loss": 0.23, + "step": 1138 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.3066, + "step": 1140 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.4326, + "step": 1142 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.1671, + "step": 1144 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6727230431791806e-05, + "loss": 0.2534, + "step": 1146 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.1792, + "step": 1148 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.1166, + "step": 1150 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.3658, + "step": 1152 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6809528809094798e-05, + "loss": 0.7121, + "step": 1154 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.1084, + "step": 1156 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.1658, + "step": 1158 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.2063, + "step": 1160 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.689097604905826e-05, + "loss": 0.4037, + "step": 1162 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.352, + "step": 1164 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.2822, + "step": 1166 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.3036, + "step": 1168 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6971561971420222e-05, + "loss": 0.4972, + "step": 1170 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.2863, + "step": 1172 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.701152878657196e-05, + "loss": 0.4158, + "step": 1174 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.1295, + "step": 1176 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.705127650357662e-05, + "loss": 0.2042, + "step": 1178 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.3643, + "step": 1180 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.1327, + "step": 1182 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.2102, + "step": 1184 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.713010968184029e-05, + "loss": 0.1542, + "step": 1186 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.2286, + "step": 1188 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.716919267969883e-05, + "loss": 0.4714, + "step": 1190 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.3188, + "step": 1192 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7208051652686338e-05, + "loss": 0.2869, + "step": 1194 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.5337, + "step": 1196 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.2299, + "step": 1198 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.2303, + "step": 1200 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.7285092673983753e-05, + "loss": 0.459, + "step": 1202 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.2412, + "step": 1204 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.5511, + "step": 1206 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.0907, + "step": 1208 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.736122311621314e-05, + "loss": 0.2052, + "step": 1210 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.6105, + "step": 1212 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.2901, + "step": 1214 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.3529, + "step": 1216 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.743643346367026e-05, + "loss": 0.241, + "step": 1218 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.2894, + "step": 1220 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.5997, + "step": 1222 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.2602, + "step": 1224 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7510714315655474e-05, + "loss": 0.085, + "step": 1226 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.1323, + "step": 1228 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7547503274863495e-05, + "loss": 1.0105, + "step": 1230 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.5574, + "step": 1232 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.758405638764873e-05, + "loss": 0.3977, + "step": 1234 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.3226, + "step": 1236 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.2825, + "step": 1238 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.6854, + "step": 1240 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7656450512470077e-05, + "loss": 0.5194, + "step": 1242 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.2823, + "step": 1244 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.7692289262315e-05, + "loss": 0.3846, + "step": 1246 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.2196, + "step": 1248 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.7727887641425448e-05, + "loss": 0.1496, + "step": 1250 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.3823, + "step": 1252 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.2783, + "step": 1254 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.4102, + "step": 1256 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7798358845437754e-05, + "loss": 0.1999, + "step": 1258 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.1274, + "step": 1260 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.6111, + "step": 1262 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.4522, + "step": 1264 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.786785531616285e-05, + "loss": 0.4445, + "step": 1266 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.141, + "step": 1268 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.790223530721933e-05, + "loss": 0.3333, + "step": 1270 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.3523, + "step": 1272 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7936368367090577e-05, + "loss": 0.1969, + "step": 1274 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.3716, + "step": 1276 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.3874, + "step": 1278 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.2075, + "step": 1280 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.800388943463047e-05, + "loss": 0.3219, + "step": 1282 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.2784, + "step": 1284 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.803727533238257e-05, + "loss": 0.3331, + "step": 1286 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.3887, + "step": 1288 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8070410079182195e-05, + "loss": 0.2324, + "step": 1290 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.2299, + "step": 1292 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.2724, + "step": 1294 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.2967, + "step": 1296 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.813592198619035e-05, + "loss": 0.191, + "step": 1298 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.1803, + "step": 1300 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.816829709926509e-05, + "loss": 0.2182, + "step": 1302 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.2768, + "step": 1304 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.8200416967183785e-05, + "loss": 0.2397, + "step": 1306 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.2709, + "step": 1308 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.3392, + "step": 1310 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.4718, + "step": 1312 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.8263886960799055e-05, + "loss": 0.3522, + "step": 1314 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.2285, + "step": 1316 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.829523510316813e-05, + "loss": 0.6591, + "step": 1318 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.152, + "step": 1320 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.832632403378808e-05, + "loss": 0.6184, + "step": 1322 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.1225, + "step": 1324 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.3651, + "step": 1326 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.3845, + "step": 1328 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8387720382009665e-05, + "loss": 0.1674, + "step": 1330 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.3743, + "step": 1332 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.2416, + "step": 1334 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.2383, + "step": 1336 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.844806833140501e-05, + "loss": 0.4386, + "step": 1338 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.0852, + "step": 1340 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.2264, + "step": 1342 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.0777, + "step": 1344 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.85073603389569e-05, + "loss": 0.2439, + "step": 1346 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.2781, + "step": 1348 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.2703, + "step": 1350 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.2228, + "step": 1352 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.856558899363248e-05, + "loss": 0.145, + "step": 1354 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.1954, + "step": 1356 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.85943022840117e-05, + "loss": 0.3267, + "step": 1358 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.4871, + "step": 1360 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.862274701730967e-05, + "loss": 0.3663, + "step": 1362 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.2158, + "step": 1364 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.865092230467769e-05, + "loss": 0.3769, + "step": 1366 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.4372, + "step": 1368 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.328, + "step": 1370 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.2919, + "step": 1372 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.87064610283551e-05, + "loss": 0.79, + "step": 1374 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.5456, + "step": 1376 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.873382272917545e-05, + "loss": 0.088, + "step": 1378 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.2778, + "step": 1380 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.876091151314196e-05, + "loss": 0.2479, + "step": 1382 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.1669, + "step": 1384 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.8787726533776996e-05, + "loss": 0.1516, + "step": 1386 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.4655, + "step": 1388 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.224, + "step": 1390 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.2074, + "step": 1392 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.8840531941941415e-05, + "loss": 0.1431, + "step": 1394 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.5086, + "step": 1396 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.1433, + "step": 1398 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.214, + "step": 1400 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.889223235340958e-05, + "loss": 0.2713, + "step": 1402 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.4895, + "step": 1404 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.2415, + "step": 1406 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.3049, + "step": 1408 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.894282130603823e-05, + "loss": 0.2472, + "step": 1410 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.1859, + "step": 1412 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.896769700383315e-05, + "loss": 0.287, + "step": 1414 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.3063, + "step": 1416 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.899229247660769e-05, + "loss": 0.2148, + "step": 1418 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.3833, + "step": 1420 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.901660695579585e-05, + "loss": 0.1353, + "step": 1422 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.1216, + "step": 1424 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9040639681612212e-05, + "loss": 0.5595, + "step": 1426 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.1231, + "step": 1428 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.5917, + "step": 1430 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.1137, + "step": 1432 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9087856878032886e-05, + "loss": 0.4544, + "step": 1434 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.3797, + "step": 1436 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.911103987318148e-05, + "loss": 0.2901, + "step": 1438 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.0969, + "step": 1440 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.913393816409294e-05, + "loss": 0.3098, + "step": 1442 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.5909, + "step": 1444 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.7086, + "step": 1446 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.6336, + "step": 1448 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9178877779995423e-05, + "loss": 0.1792, + "step": 1450 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.4454, + "step": 1452 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.1538, + "step": 1454 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.1758, + "step": 1456 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.9222670108643146e-05, + "loss": 0.3067, + "step": 1458 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.262, + "step": 1460 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.924413432409622e-05, + "loss": 0.2535, + "step": 1462 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.3805, + "step": 1464 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.926530967634078e-05, + "loss": 0.2101, + "step": 1466 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.3355, + "step": 1468 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.1648, + "step": 1470 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.2384, + "step": 1472 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9306791153479004e-05, + "loss": 1.1956, + "step": 1474 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.1724, + "step": 1476 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.932709598214825e-05, + "loss": 0.3612, + "step": 1478 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.2329, + "step": 1480 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.9347109355200672e-05, + "loss": 0.104, + "step": 1482 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.5496, + "step": 1484 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.1671, + "step": 1486 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.4091, + "step": 1488 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9386259242048883e-05, + "loss": 0.3041, + "step": 1490 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.3336, + "step": 1492 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.940539453247842e-05, + "loss": 0.1726, + "step": 1494 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.1478, + "step": 1496 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9424235920596863e-05, + "loss": 0.1972, + "step": 1498 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.1201, + "step": 1500 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.944278281764342e-05, + "loss": 0.2278, + "step": 1502 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.1384, + "step": 1504 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.1672, + "step": 1506 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.3705, + "step": 1508 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.2118, + "step": 1510 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.0961, + "step": 1512 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9496650812887286e-05, + "loss": 0.3409, + "step": 1514 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.659, + "step": 1516 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.951401404235505e-05, + "loss": 0.0599, + "step": 1518 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.4508, + "step": 1520 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.9531079975339912e-05, + "loss": 0.1953, + "step": 1522 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.3343, + "step": 1524 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.4001, + "step": 1526 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.2584, + "step": 1528 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.956431782804402e-05, + "loss": 0.3392, + "step": 1530 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.2988, + "step": 1532 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.5967, + "step": 1534 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.5861, + "step": 1536 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9596360216530436e-05, + "loss": 0.2728, + "step": 1538 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.2395, + "step": 1540 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.961193185426459e-05, + "loss": 0.4044, + "step": 1542 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.2054, + "step": 1544 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.9627203135753573e-05, + "loss": 0.5833, + "step": 1546 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.5507, + "step": 1548 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.4328, + "step": 1550 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.367, + "step": 1552 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.2623, + "step": 1554 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.1994, + "step": 1556 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.967121011775546e-05, + "loss": 0.1523, + "step": 1558 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.4406, + "step": 1560 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9685275296330497e-05, + "loss": 0.1155, + "step": 1562 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.3867, + "step": 1564 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969903782680467e-05, + "loss": 0.1554, + "step": 1566 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.5517, + "step": 1568 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.2908, + "step": 1570 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.1327, + "step": 1572 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.5923, + "step": 1574 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.4901, + "step": 1576 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9738505276435692e-05, + "loss": 0.2243, + "step": 1578 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.5561, + "step": 1580 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.1974, + "step": 1582 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.4617, + "step": 1584 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9763296037475174e-05, + "loss": 0.6367, + "step": 1586 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.475, + "step": 1588 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.2678, + "step": 1590 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.5828, + "step": 1592 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9786866463591732e-05, + "loss": 0.2228, + "step": 1594 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.1968, + "step": 1596 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.3652, + "step": 1598 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.3047, + "step": 1600 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.9809213608668185e-05, + "loss": 0.3399, + "step": 1602 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.0923, + "step": 1604 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.385, + "step": 1606 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.184, + "step": 1608 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.983033467948784e-05, + "loss": 0.2034, + "step": 1610 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.2416, + "step": 1612 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.2729, + "step": 1614 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.2545, + "step": 1616 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.3338, + "step": 1618 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.2247, + "step": 1620 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.985971166354357e-05, + "loss": 0.5331, + "step": 1622 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.2314, + "step": 1624 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986888819206792e-05, + "loss": 0.8902, + "step": 1626 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.6089, + "step": 1628 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.3353, + "step": 1630 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.1792, + "step": 1632 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.988631581494365e-05, + "loss": 0.1738, + "step": 1634 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.4287, + "step": 1636 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.2662, + "step": 1638 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.0661, + "step": 1640 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.990250772639552e-05, + "loss": 0.3524, + "step": 1642 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.5632, + "step": 1644 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.99101396518405e-05, + "loss": 0.2315, + "step": 1646 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.2635, + "step": 1648 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.2547, + "step": 1650 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.347, + "step": 1652 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.185, + "step": 1654 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.3952, + "step": 1656 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.2481, + "step": 1658 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.2682, + "step": 1660 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.2193, + "step": 1662 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.1358, + "step": 1664 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9943649727366335e-05, + "loss": 0.2401, + "step": 1666 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.2018, + "step": 1668 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.994942036613787e-05, + "loss": 0.2172, + "step": 1670 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.1031, + "step": 1672 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.995488010273198e-05, + "loss": 0.5666, + "step": 1674 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.4108, + "step": 1676 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.1424, + "step": 1678 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.3526, + "step": 1680 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9964866196679105e-05, + "loss": 0.279, + "step": 1682 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.1957, + "step": 1684 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.0805, + "step": 1686 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.3964, + "step": 1688 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.2805, + "step": 1690 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.1975, + "step": 1692 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.2046, + "step": 1694 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.1535, + "step": 1696 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.1439, + "step": 1698 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.2703, + "step": 1700 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.998437989229673e-05, + "loss": 0.3527, + "step": 1702 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.2159, + "step": 1704 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.998734708672375e-05, + "loss": 0.3154, + "step": 1706 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.3982, + "step": 1708 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.3223, + "step": 1710 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.3223, + "step": 1712 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.9992345130644747e-05, + "loss": 0.2514, + "step": 1714 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.1153, + "step": 1716 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.1759, + "step": 1718 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.6616, + "step": 1720 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999609421031453e-05, + "loss": 0.1758, + "step": 1722 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.3946, + "step": 1724 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.5087, + "step": 1726 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.1902, + "step": 1728 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.5089, + "step": 1730 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.2558, + "step": 1732 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.2839, + "step": 1734 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.2647, + "step": 1736 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.353, + "step": 1738 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.162, + "step": 1740 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 2e-05, + "loss": 0.217, + "step": 1742 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.2022, + "step": 1744 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.1672, + "step": 1746 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.4882, + "step": 1748 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.2292, + "step": 1750 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.2291, + "step": 1752 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.3039, + "step": 1754 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.2212, + "step": 1756 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.3666, + "step": 1758 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.1941, + "step": 1760 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.999609421031453e-05, + "loss": 0.1243, + "step": 1762 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.7226, + "step": 1764 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.2627, + "step": 1766 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.125, + "step": 1768 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999234513064475e-05, + "loss": 0.313, + "step": 1770 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.1699, + "step": 1772 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.3893, + "step": 1774 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.2232, + "step": 1776 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998734708672375e-05, + "loss": 0.2315, + "step": 1778 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.1683, + "step": 1780 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.1875, + "step": 1782 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.1916, + "step": 1784 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.2191, + "step": 1786 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.2531, + "step": 1788 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.2317, + "step": 1790 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.2311, + "step": 1792 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.3787, + "step": 1794 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.0832, + "step": 1796 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.2474, + "step": 1798 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.1431, + "step": 1800 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.996486619667911e-05, + "loss": 0.4809, + "step": 1802 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.1831, + "step": 1804 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.3728, + "step": 1806 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.4604, + "step": 1808 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.995488010273198e-05, + "loss": 0.2252, + "step": 1810 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.3333, + "step": 1812 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.1656, + "step": 1814 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.1976, + "step": 1816 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.994364972736634e-05, + "loss": 0.4776, + "step": 1818 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.7805, + "step": 1820 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.993756836673986e-05, + "loss": 0.5114, + "step": 1822 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.2193, + "step": 1824 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.3807, + "step": 1826 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.342, + "step": 1828 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.6012, + "step": 1830 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.1519, + "step": 1832 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.288, + "step": 1834 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.0701, + "step": 1836 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.3051, + "step": 1838 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.2196, + "step": 1840 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9902507726395524e-05, + "loss": 0.2191, + "step": 1842 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.2339, + "step": 1844 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.5476, + "step": 1846 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.2075, + "step": 1848 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.988631581494365e-05, + "loss": 0.6791, + "step": 1850 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.4627, + "step": 1852 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.987775633490599e-05, + "loss": 0.3095, + "step": 1854 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.1411, + "step": 1856 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.986888819206792e-05, + "loss": 0.3564, + "step": 1858 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.2786, + "step": 1860 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.1428, + "step": 1862 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.4825, + "step": 1864 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.5399, + "step": 1866 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.4094, + "step": 1868 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.2501, + "step": 1870 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.196, + "step": 1872 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.983033467948784e-05, + "loss": 0.4893, + "step": 1874 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.4091, + "step": 1876 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.2951, + "step": 1878 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.342, + "step": 1880 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.980921360866819e-05, + "loss": 0.2619, + "step": 1882 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.2985, + "step": 1884 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.2801, + "step": 1886 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.091, + "step": 1888 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.978686646359173e-05, + "loss": 0.24, + "step": 1890 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.2009, + "step": 1892 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.3835, + "step": 1894 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.2562, + "step": 1896 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.9763296037475177e-05, + "loss": 0.2474, + "step": 1898 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.4461, + "step": 1900 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.1045, + "step": 1902 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.4829, + "step": 1904 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9738505276435695e-05, + "loss": 0.2623, + "step": 1906 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.511, + "step": 1908 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.4978, + "step": 1910 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.1652, + "step": 1912 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.365, + "step": 1914 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.2513, + "step": 1916 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.969903782680467e-05, + "loss": 0.2578, + "step": 1918 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.2782, + "step": 1920 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.96852752963305e-05, + "loss": 0.2587, + "step": 1922 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.2579, + "step": 1924 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.0978, + "step": 1926 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.344, + "step": 1928 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.2952, + "step": 1930 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.1135, + "step": 1932 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.642, + "step": 1934 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.4704, + "step": 1936 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9627203135753576e-05, + "loss": 0.4414, + "step": 1938 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.2152, + "step": 1940 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.961193185426459e-05, + "loss": 0.2204, + "step": 1942 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.2381, + "step": 1944 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.959636021653044e-05, + "loss": 0.4098, + "step": 1946 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.2063, + "step": 1948 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958048870913786e-05, + "loss": 0.3984, + "step": 1950 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.1169, + "step": 1952 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9564317828044022e-05, + "loss": 0.2808, + "step": 1954 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.415, + "step": 1956 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.4124, + "step": 1958 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.1847, + "step": 1960 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9531079975339915e-05, + "loss": 0.3619, + "step": 1962 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.412, + "step": 1964 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.2333, + "step": 1966 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.317, + "step": 1968 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.2386, + "step": 1970 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.8487, + "step": 1972 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.947899082950751e-05, + "loss": 0.3337, + "step": 1974 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.3232, + "step": 1976 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.1913, + "step": 1978 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.2048, + "step": 1980 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.2702, + "step": 1982 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.216, + "step": 1984 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.942423592059687e-05, + "loss": 0.4702, + "step": 1986 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.1356, + "step": 1988 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.273, + "step": 1990 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.3819, + "step": 1992 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.938625924204888e-05, + "loss": 0.205, + "step": 1994 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.3643, + "step": 1996 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.1691, + "step": 1998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.2077, + "step": 2000 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9347109355200676e-05, + "loss": 0.4977, + "step": 2002 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.1701, + "step": 2004 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.6776, + "step": 2006 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.0996, + "step": 2008 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9306791153479017e-05, + "loss": 0.3996, + "step": 2010 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.398, + "step": 2012 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.928619550368371e-05, + "loss": 0.1775, + "step": 2014 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.2615, + "step": 2016 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9265309676340783e-05, + "loss": 0.4162, + "step": 2018 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.2084, + "step": 2020 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.135, + "step": 2022 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.3496, + "step": 2024 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9222670108643156e-05, + "loss": 0.151, + "step": 2026 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.3528, + "step": 2028 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.4159, + "step": 2030 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.2199, + "step": 2032 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9178877779995416e-05, + "loss": 0.2053, + "step": 2034 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.1266, + "step": 2036 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.6173, + "step": 2038 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.6758, + "step": 2040 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9133938164092942e-05, + "loss": 0.6381, + "step": 2042 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.3434, + "step": 2044 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.3288, + "step": 2046 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.3194, + "step": 2048 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.90878568780329e-05, + "loss": 0.1987, + "step": 2050 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.3393, + "step": 2052 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.4161, + "step": 2054 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.5198, + "step": 2056 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9040639681612216e-05, + "loss": 0.2978, + "step": 2058 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.1439, + "step": 2060 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.195, + "step": 2062 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.3996, + "step": 2064 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8992292476607695e-05, + "loss": 0.4275, + "step": 2066 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.2615, + "step": 2068 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.2176, + "step": 2070 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.2163, + "step": 2072 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8942821306038227e-05, + "loss": 0.2361, + "step": 2074 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.1405, + "step": 2076 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.891766616054545e-05, + "loss": 0.2168, + "step": 2078 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.2475, + "step": 2080 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8892232353409582e-05, + "loss": 0.2019, + "step": 2082 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.3334, + "step": 2084 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.3915, + "step": 2086 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.2303, + "step": 2088 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.884053194194143e-05, + "loss": 0.2542, + "step": 2090 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.261, + "step": 2092 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.881426695315756e-05, + "loss": 0.6262, + "step": 2094 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.2098, + "step": 2096 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8787726533777003e-05, + "loss": 0.3639, + "step": 2098 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.2418, + "step": 2100 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.3352, + "step": 2102 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.4615, + "step": 2104 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8733822729175455e-05, + "loss": 0.2267, + "step": 2106 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.3413, + "step": 2108 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.0643, + "step": 2110 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.1938, + "step": 2112 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.4721, + "step": 2114 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.2381, + "step": 2116 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.86509223046777e-05, + "loss": 0.4356, + "step": 2118 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.5825, + "step": 2120 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8622747017309676e-05, + "loss": 0.567, + "step": 2122 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.196, + "step": 2124 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.4434, + "step": 2126 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.5357, + "step": 2128 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8565588993632498e-05, + "loss": 0.1399, + "step": 2130 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.4342, + "step": 2132 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.3069, + "step": 2134 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.2446, + "step": 2136 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8507360338956896e-05, + "loss": 0.2623, + "step": 2138 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.1786, + "step": 2140 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.847784679420527e-05, + "loss": 0.2083, + "step": 2142 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.294, + "step": 2144 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.8448068331405018e-05, + "loss": 0.1737, + "step": 2146 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.1908, + "step": 2148 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.841802588108161e-05, + "loss": 0.2544, + "step": 2150 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.2568, + "step": 2152 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.838772038200968e-05, + "loss": 0.2534, + "step": 2154 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.212, + "step": 2156 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.2497, + "step": 2158 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.5035, + "step": 2160 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8326324033788087e-05, + "loss": 0.1756, + "step": 2162 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.2215, + "step": 2164 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.2623, + "step": 2166 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.3575, + "step": 2168 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.8263886960799072e-05, + "loss": 0.1685, + "step": 2170 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.1717, + "step": 2172 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.1403, + "step": 2174 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.2512, + "step": 2176 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820041696718378e-05, + "loss": 0.3041, + "step": 2178 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.4482, + "step": 2180 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.253, + "step": 2182 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.2359, + "step": 2184 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8135921986190358e-05, + "loss": 0.4484, + "step": 2186 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.3117, + "step": 2188 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.1578, + "step": 2190 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.1441, + "step": 2192 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807041007918221e-05, + "loss": 0.4258, + "step": 2194 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.339, + "step": 2196 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.2432, + "step": 2198 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.3494, + "step": 2200 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.8003889434630476e-05, + "loss": 0.2748, + "step": 2202 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.2381, + "step": 2204 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.2686, + "step": 2206 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.2953, + "step": 2208 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7936368367090583e-05, + "loss": 0.1939, + "step": 2210 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.3219, + "step": 2212 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.3994, + "step": 2214 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.1768, + "step": 2216 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7867855316162846e-05, + "loss": 0.3977, + "step": 2218 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.1781, + "step": 2220 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.783322946823638e-05, + "loss": 0.1901, + "step": 2222 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.0933, + "step": 2224 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.779835884543776e-05, + "loss": 0.3637, + "step": 2226 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.1943, + "step": 2228 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.776324453741365e-05, + "loss": 0.4262, + "step": 2230 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.7793, + "step": 2232 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7727887641425465e-05, + "loss": 0.4142, + "step": 2234 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.2556, + "step": 2236 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.2956, + "step": 2238 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.3522, + "step": 2240 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.765645051247007e-05, + "loss": 0.1241, + "step": 2242 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.254, + "step": 2244 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.762037251178961e-05, + "loss": 0.2784, + "step": 2246 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.3355, + "step": 2248 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7584056387648738e-05, + "loss": 0.1411, + "step": 2250 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.1382, + "step": 2252 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.4424, + "step": 2254 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.4228, + "step": 2256 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7510714315655467e-05, + "loss": 0.2401, + "step": 2258 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.4546, + "step": 2260 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7473690659616e-05, + "loss": 0.6144, + "step": 2262 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.3137, + "step": 2264 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.743643346367027e-05, + "loss": 0.1778, + "step": 2266 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.7857, + "step": 2268 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.739894389204122e-05, + "loss": 0.2436, + "step": 2270 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.4935, + "step": 2272 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7361223116213146e-05, + "loss": 0.1534, + "step": 2274 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.2024, + "step": 2276 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.732327231489503e-05, + "loss": 0.3393, + "step": 2278 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.1925, + "step": 2280 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.728509267398376e-05, + "loss": 0.1077, + "step": 2282 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.3574, + "step": 2284 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.2332, + "step": 2286 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.0817, + "step": 2288 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7208051652686348e-05, + "loss": 0.7245, + "step": 2290 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.243, + "step": 2292 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.716919267969884e-05, + "loss": 0.3335, + "step": 2294 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.4264, + "step": 2296 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.7130109681840298e-05, + "loss": 0.3961, + "step": 2298 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.0711, + "step": 2300 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.1906, + "step": 2302 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.1804, + "step": 2304 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.705127650357663e-05, + "loss": 0.339, + "step": 2306 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.4034, + "step": 2308 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.701152878657197e-05, + "loss": 0.398, + "step": 2310 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.1696, + "step": 2312 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.697156197142023e-05, + "loss": 0.2539, + "step": 2314 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.1636, + "step": 2316 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.1932, + "step": 2318 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.213, + "step": 2320 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.6890976049058267e-05, + "loss": 0.1382, + "step": 2322 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.2644, + "step": 2324 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.1321, + "step": 2326 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.1907, + "step": 2328 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.6809528809094805e-05, + "loss": 0.3268, + "step": 2330 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.6376, + "step": 2332 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.67684853721737e-05, + "loss": 0.2561, + "step": 2334 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.3149, + "step": 2336 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.6727230431791826e-05, + "loss": 0.3334, + "step": 2338 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.3644, + "step": 2340 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.2017, + "step": 2342 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.1883, + "step": 2344 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.66440912037967e-05, + "loss": 0.2544, + "step": 2346 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.3652, + "step": 2348 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.3362, + "step": 2350 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.2418, + "step": 2352 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6560121516856592e-05, + "loss": 0.4332, + "step": 2354 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.4542, + "step": 2356 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.651782852712194e-05, + "loss": 0.436, + "step": 2358 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.2826, + "step": 2360 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.6475331866519387e-05, + "loss": 0.4329, + "step": 2362 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.0567, + "step": 2364 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.4374, + "step": 2366 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.3355, + "step": 2368 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6389732850821964e-05, + "loss": 0.1582, + "step": 2370 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.2732, + "step": 2372 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.3249, + "step": 2374 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.2827, + "step": 2376 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6303335168965495e-05, + "loss": 0.1964, + "step": 2378 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.2353, + "step": 2380 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.625984019906122e-05, + "loss": 0.1889, + "step": 2382 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.3837, + "step": 2384 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6216149619978057e-05, + "loss": 0.3073, + "step": 2386 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.0769, + "step": 2388 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.413, + "step": 2390 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.4299, + "step": 2392 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.612818710136499e-05, + "loss": 0.1766, + "step": 2394 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.3647, + "step": 2396 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.1286, + "step": 2398 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.3527, + "step": 2400 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.60394586077466e-05, + "loss": 0.4262, + "step": 2402 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.1572, + "step": 2404 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.3394, + "step": 2406 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.3192, + "step": 2408 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.594997522948413e-05, + "loss": 0.2058, + "step": 2410 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.2061, + "step": 2412 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.153, + "step": 2414 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.2784, + "step": 2416 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5859748151293354e-05, + "loss": 0.4319, + "step": 2418 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.4336, + "step": 2420 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.581435924540482e-05, + "loss": 0.48, + "step": 2422 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.2646, + "step": 2424 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5768788650846674e-05, + "loss": 0.0719, + "step": 2426 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.2512, + "step": 2428 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.584, + "step": 2430 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.4238, + "step": 2432 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.5677108097363565e-05, + "loss": 0.2876, + "step": 2434 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.131, + "step": 2436 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.3334, + "step": 2438 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.5389, + "step": 2440 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5584717950189373e-05, + "loss": 0.0843, + "step": 2442 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.2986, + "step": 2444 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.2054, + "step": 2446 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.2955, + "step": 2448 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.16, + "step": 2450 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.2974, + "step": 2452 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.544482752648966e-05, + "loss": 0.267, + "step": 2454 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.4148, + "step": 2456 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.539785515417377e-05, + "loss": 0.1287, + "step": 2458 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.2933, + "step": 2460 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.1032, + "step": 2462 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.6815, + "step": 2464 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.5303405861706567e-05, + "loss": 0.4237, + "step": 2466 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.0938, + "step": 2468 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.1729, + "step": 2470 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.2148, + "step": 2472 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.5208293685377362e-05, + "loss": 0.0816, + "step": 2474 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.1236, + "step": 2476 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.2645, + "step": 2478 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.3352, + "step": 2480 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.5112530513457251e-05, + "loss": 0.2512, + "step": 2482 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.2389, + "step": 2484 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.5457, + "step": 2486 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.1824, + "step": 2488 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5016128315586636e-05, + "loss": 0.1725, + "step": 2490 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.1159, + "step": 2492 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.2984, + "step": 2494 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.3733, + "step": 2496 + }, + { + "epoch": 1.0, + "learning_rate": 1.4919099141279214e-05, + "loss": 0.5975, + "step": 2498 + }, + { + "epoch": 1.0, + "step": 2498, + "total_flos": 0, + "train_loss": 0.2935618901069017, + "train_runtime": 7481.7992, + "train_samples_per_second": 2.671, + "train_steps_per_second": 0.334 + } + ], + "logging_steps": 2, + "max_steps": 2498, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5f96997d16f7f095eacd2a489e1d34b18084e00e --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fadbc0b5bbe3c18449de96ce8e72c68e333fc28f4036f7f051b79a60ced7f15 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7c7e66f431484de7647f4462bb63c8b312993ba9 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca2539497316868b9abc4b4bee5ccd56463805ca3153c736e826da0086d6acc8 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0c51426cb4c0742b9769cb52420d85302d9ccbb0 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d515d4a815e7ebfd8cfff00effd12f35e7b8be2b82eb652eadb98465a3fc6e6b +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..5389282c73297d9b4afe67770d5c876829c2ae90 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_125_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52511999e3fbe087ef6bbd8e58f74ac60b79537682ec1d7c440ab720e1b7e430 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7f39ace3b10806b19daf5f8c7bfc8ac2e8685438 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/0_trainer_state.json @@ -0,0 +1,15020 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4996, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.406842319175051e-06, + "loss": 0.1145, + "step": 2 + }, + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.415943612351265e-06, + "loss": 0.1126, + "step": 4 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4250597173539104e-06, + "loss": 0.4033, + "step": 6 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.2529, + "step": 8 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.443336291593801e-06, + "loss": 0.1026, + "step": 10 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.2035, + "step": 12 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.461671899116598e-06, + "loss": 0.0477, + "step": 14 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.0843, + "step": 16 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4800663966830417e-06, + "loss": 0.3298, + "step": 18 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4892856843445236e-06, + "loss": 0.1764, + "step": 20 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.4985196405937807e-06, + "loss": 0.3429, + "step": 22 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.0559, + "step": 24 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5170314866905443e-06, + "loss": 0.0538, + "step": 26 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.2758, + "step": 28 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.535601790357246e-06, + "loss": 0.0855, + "step": 30 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.1245, + "step": 32 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5542304065211578e-06, + "loss": 0.1435, + "step": 34 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5635665363297356e-06, + "loss": 0.1374, + "step": 36 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5729171896539763e-06, + "loss": 0.4145, + "step": 38 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.3471, + "step": 40 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.5916619937729915e-06, + "loss": 0.8295, + "step": 42 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.1424, + "step": 44 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6104646724422643e-06, + "loss": 0.3854, + "step": 46 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.0588, + "step": 48 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.629325078773699e-06, + "loss": 0.1492, + "step": 50 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.6387768837868565e-06, + "loss": 0.0359, + "step": 52 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.648243065428239e-06, + "loss": 0.0979, + "step": 54 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.2293, + "step": 56 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6672184846169934e-06, + "loss": 0.0519, + "step": 58 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.0348, + "step": 60 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.686251188102439e-06, + "loss": 0.1904, + "step": 62 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.1763, + "step": 64 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7053410271995085e-06, + "loss": 0.1286, + "step": 66 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.1509, + "step": 68 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.724487852776785e-06, + "loss": 0.3204, + "step": 70 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.0467, + "step": 72 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7436915152577038e-06, + "loss": 0.0243, + "step": 74 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.3378, + "step": 76 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.7629518646216522e-06, + "loss": 0.6603, + "step": 78 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.1466, + "step": 80 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.782268750405185e-06, + "loss": 0.1948, + "step": 82 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.7919483473136555e-06, + "loss": 0.1146, + "step": 84 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.801642021703177e-06, + "loss": 0.1801, + "step": 86 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.2754, + "step": 88 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.821071527170053e-06, + "loss": 0.2521, + "step": 90 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.0705, + "step": 92 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.8405571150208945e-06, + "loss": 0.2281, + "step": 94 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.0931, + "step": 96 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.860098633032663e-06, + "loss": 0.307, + "step": 98 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.8698903181597026e-06, + "loss": 0.1265, + "step": 100 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.879695928545424e-06, + "loss": 0.2442, + "step": 102 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.0397, + "step": 104 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.899348848463471e-06, + "loss": 0.3484, + "step": 106 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.909196119613218e-06, + "loss": 0.2706, + "step": 108 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.9190572392565643e-06, + "loss": 0.3204, + "step": 110 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.3403, + "step": 112 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9388209469611093e-06, + "loss": 0.2397, + "step": 114 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9487234964233724e-06, + "loss": 0.0305, + "step": 116 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9586398171814114e-06, + "loss": 0.1126, + "step": 118 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.1169, + "step": 120 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.9785136950907987e-06, + "loss": 0.1138, + "step": 122 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.988471213428035e-06, + "loss": 0.0806, + "step": 124 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 2.9984424254328936e-06, + "loss": 0.1154, + "step": 126 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.3838, + "step": 128 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0184258525227895e-06, + "loss": 0.6069, + "step": 130 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0284380285797733e-06, + "loss": 0.4487, + "step": 132 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.038463820248324e-06, + "loss": 0.1094, + "step": 134 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.1641, + "step": 136 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.0585561720712207e-06, + "loss": 0.3794, + "step": 138 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.068622692984767e-06, + "loss": 0.2369, + "step": 140 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0787027510283495e-06, + "loss": 0.2244, + "step": 142 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.1374, + "step": 144 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.098903399732992e-06, + "loss": 0.3995, + "step": 146 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.059, + "step": 148 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.1191579603759946e-06, + "loss": 0.0369, + "step": 150 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.2755, + "step": 152 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.139466274727052e-06, + "loss": 0.3748, + "step": 154 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.3558, + "step": 156 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.159828184135917e-06, + "loss": 0.3693, + "step": 158 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.0669, + "step": 160 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1802435295336908e-06, + "loss": 0.2016, + "step": 162 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1904711909051967e-06, + "loss": 0.0814, + "step": 164 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2007121514339924e-06, + "loss": 0.3834, + "step": 166 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.4145, + "step": 168 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.221233889934239e-06, + "loss": 0.0762, + "step": 170 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.231514627826302e-06, + "loss": 0.5082, + "step": 172 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2418085847169344e-06, + "loss": 0.116, + "step": 174 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.1845, + "step": 176 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2624360750508457e-06, + "loss": 0.2089, + "step": 178 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2727695682081897e-06, + "loss": 0.4744, + "step": 180 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.28311619979231e-06, + "loss": 0.2204, + "step": 182 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.2529, + "step": 184 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.303848797386465e-06, + "loss": 0.2754, + "step": 186 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.314234722905302e-06, + "loss": 0.0976, + "step": 188 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.3246337058685697e-06, + "loss": 0.4025, + "step": 190 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.3878, + "step": 192 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.3454707628651806e-06, + "loss": 0.1009, + "step": 194 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.355908796203301e-06, + "loss": 0.2473, + "step": 196 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3663598055954716e-06, + "loss": 0.2241, + "step": 198 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.2895, + "step": 200 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3873006708725365e-06, + "loss": 0.4173, + "step": 202 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.0873, + "step": 204 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.408293195104586e-06, + "loss": 0.2055, + "step": 206 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.1639, + "step": 208 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4293372142962845e-06, + "loss": 0.2403, + "step": 210 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4398784831434097e-06, + "loss": 0.1808, + "step": 212 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.4504325640499936e-06, + "loss": 0.2256, + "step": 214 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.1603, + "step": 216 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4715790795671232e-06, + "loss": 0.1025, + "step": 218 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.3128, + "step": 220 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.4927765956493276e-06, + "loss": 0.2404, + "step": 222 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.1011, + "step": 224 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.514024946699842e-06, + "loss": 0.3141, + "step": 226 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.3713, + "step": 228 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.535323966724814e-06, + "loss": 0.0536, + "step": 230 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.0649, + "step": 232 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.556673489334522e-06, + "loss": 0.1049, + "step": 234 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.567367137003953e-06, + "loss": 0.3311, + "step": 236 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.5780733477447127e-06, + "loss": 0.4651, + "step": 238 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.5284, + "step": 240 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.5995233747779467e-06, + "loss": 0.6784, + "step": 242 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.6102671491780393e-06, + "loss": 0.7217, + "step": 244 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6210234028648216e-06, + "loss": 0.1948, + "step": 246 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.0482, + "step": 248 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.6425732640453235e-06, + "loss": 0.2941, + "step": 250 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.653366829451711e-06, + "loss": 0.3695, + "step": 252 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.6641727899701795e-06, + "loss": 0.2756, + "step": 254 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.3963, + "step": 256 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.6858218119020884e-06, + "loss": 0.2055, + "step": 258 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.696664831034521e-06, + "loss": 0.1745, + "step": 260 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7075201607170997e-06, + "loss": 0.2035, + "step": 262 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.3805, + "step": 264 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.729267666905899e-06, + "loss": 0.2302, + "step": 266 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.740159800938784e-06, + "loss": 0.1681, + "step": 268 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.751064160575195e-06, + "loss": 0.1904, + "step": 270 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.2377, + "step": 272 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.772909471448959e-06, + "loss": 0.5623, + "step": 274 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.783850380021933e-06, + "loss": 0.0557, + "step": 276 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.794803428869799e-06, + "loss": 0.4323, + "step": 278 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.2573, + "step": 280 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.816745861800334e-06, + "loss": 0.1948, + "step": 282 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.827735203028956e-06, + "loss": 0.3512, + "step": 284 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.838736598824446e-06, + "loss": 0.1012, + "step": 286 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.1526, + "step": 288 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.860775468148662e-06, + "loss": 0.1183, + "step": 290 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.871812898635011e-06, + "loss": 0.0213, + "step": 292 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.882862297603536e-06, + "loss": 0.1894, + "step": 294 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.7541, + "step": 296 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.904996914644913e-06, + "loss": 0.0608, + "step": 298 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.916082089488379e-06, + "loss": 0.1952, + "step": 300 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.927179146355317e-06, + "loss": 0.5169, + "step": 302 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.5388, + "step": 304 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.949408819445345e-06, + "loss": 0.3511, + "step": 306 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.960541392253387e-06, + "loss": 0.2533, + "step": 308 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.971685760254933e-06, + "loss": 0.0704, + "step": 310 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.2366, + "step": 312 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 3.994009794754777e-06, + "loss": 0.2181, + "step": 314 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 4.005189417653737e-06, + "loss": 0.0416, + "step": 316 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.016380748547654e-06, + "loss": 0.4031, + "step": 318 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.1098, + "step": 320 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.038798446869847e-06, + "loss": 0.1472, + "step": 322 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.050024770515873e-06, + "loss": 0.1207, + "step": 324 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.061262714592426e-06, + "loss": 0.5692, + "step": 326 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.1325, + "step": 328 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.0837733762226584e-06, + "loss": 0.2428, + "step": 330 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.095046049812541e-06, + "loss": 0.0885, + "step": 332 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.106330255905417e-06, + "loss": 0.3047, + "step": 334 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.1264, + "step": 336 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.128933177424475e-06, + "loss": 0.178, + "step": 338 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.1402518487066624e-06, + "loss": 0.0543, + "step": 340 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.151581964203924e-06, + "loss": 0.2616, + "step": 342 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.0895, + "step": 344 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.174276439309593e-06, + "loss": 0.0265, + "step": 346 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.3407, + "step": 348 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.197016425450347e-06, + "loss": 0.2292, + "step": 350 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.2301, + "step": 352 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.219801744979517e-06, + "loss": 0.2012, + "step": 354 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.2312113491533145e-06, + "loss": 0.2096, + "step": 356 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.242632219896328e-06, + "loss": 0.1144, + "step": 358 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.0832, + "step": 360 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.2655076718472045e-06, + "loss": 0.0433, + "step": 362 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.276962208378814e-06, + "loss": 0.2386, + "step": 364 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.28842792212722e-06, + "loss": 0.3483, + "step": 366 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.1677, + "step": 368 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3113927916814665e-06, + "loss": 0.1944, + "step": 370 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3228919026364345e-06, + "loss": 0.5094, + "step": 372 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.33440210110651e-06, + "loss": 0.2254, + "step": 374 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.4023, + "step": 376 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.3574556706517035e-06, + "loss": 0.4146, + "step": 378 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.368998996702686e-06, + "loss": 0.2608, + "step": 380 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.380553320220638e-06, + "loss": 0.425, + "step": 382 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.1459, + "step": 384 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.403694869372589e-06, + "loss": 0.1641, + "step": 386 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.415282049810643e-06, + "loss": 0.0295, + "step": 388 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4268801373238454e-06, + "loss": 0.079, + "step": 390 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.0951, + "step": 392 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.450108942949158e-06, + "loss": 0.3231, + "step": 394 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.461739615694921e-06, + "loss": 0.6489, + "step": 396 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.473381104783201e-06, + "loss": 0.141, + "step": 398 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.0369, + "step": 400 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.496696441021904e-06, + "loss": 0.7016, + "step": 402 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.5083702426369715e-06, + "loss": 0.8059, + "step": 404 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.520054769523929e-06, + "loss": 0.2633, + "step": 406 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.2632, + "step": 408 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.543455907812063e-06, + "loss": 0.1339, + "step": 410 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.555172473510324e-06, + "loss": 0.1536, + "step": 412 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.566899673074706e-06, + "loss": 0.0866, + "step": 414 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.3411, + "step": 416 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.590385882167206e-06, + "loss": 0.3512, + "step": 418 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.602144845826234e-06, + "loss": 0.1631, + "step": 420 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.613914351613337e-06, + "loss": 0.0762, + "step": 422 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.0445, + "step": 424 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.637484897606777e-06, + "loss": 0.4498, + "step": 426 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.649285891779326e-06, + "loss": 0.4569, + "step": 428 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.661097336012451e-06, + "loss": 0.5406, + "step": 430 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.1167, + "step": 432 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.684751482368022e-06, + "loss": 0.0208, + "step": 434 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.696594138293421e-06, + "loss": 0.3037, + "step": 436 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.7084471518853656e-06, + "loss": 0.6739, + "step": 438 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.0517, + "step": 440 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.732184159451937e-06, + "loss": 0.288, + "step": 442 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.744068107067673e-06, + "loss": 0.0861, + "step": 444 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.755962319632249e-06, + "loss": 0.1975, + "step": 446 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.1164, + "step": 448 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.779781446669376e-06, + "loss": 0.2547, + "step": 450 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.79170631462264e-06, + "loss": 0.1272, + "step": 452 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.8036413544863095e-06, + "loss": 0.2427, + "step": 454 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.0971, + "step": 456 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.827541856687471e-06, + "loss": 0.0826, + "step": 458 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.839507272346751e-06, + "loss": 0.158, + "step": 460 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.8514827665601425e-06, + "loss": 0.5747, + "step": 462 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.1288, + "step": 464 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.875463897075985e-06, + "loss": 0.09, + "step": 466 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.3039, + "step": 468 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.899485060892404e-06, + "loss": 0.1982, + "step": 470 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.1274, + "step": 472 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.9235460703540615e-06, + "loss": 0.0958, + "step": 474 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.935591458474425e-06, + "loss": 0.1346, + "step": 476 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.947646737494389e-06, + "loss": 0.5726, + "step": 478 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.334, + "step": 480 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9717868740369645e-06, + "loss": 0.0676, + "step": 482 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9838716844133665e-06, + "loss": 0.215, + "step": 484 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 4.9959662913970254e-06, + "loss": 0.1572, + "step": 486 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.1659, + "step": 488 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.02018480068299e-06, + "loss": 0.2028, + "step": 490 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.032308655686007e-06, + "loss": 0.0487, + "step": 492 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.044442212697842e-06, + "loss": 0.0991, + "step": 494 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.4251, + "step": 496 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.068738337940655e-06, + "loss": 0.3921, + "step": 498 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.080900858720789e-06, + "loss": 0.1423, + "step": 500 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.093072986608116e-06, + "loss": 0.3415, + "step": 502 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.2554, + "step": 504 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.1174459685959175e-06, + "loss": 0.4332, + "step": 506 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.129646775095432e-06, + "loss": 0.4879, + "step": 508 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.141857093500307e-06, + "loss": 0.6631, + "step": 510 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.4588, + "step": 512 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.166306170619537e-06, + "loss": 0.1652, + "step": 514 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.178544881584328e-06, + "loss": 0.1848, + "step": 516 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.190793008955421e-06, + "loss": 0.0782, + "step": 518 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.085, + "step": 520 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.215317417214739e-06, + "loss": 0.199, + "step": 522 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.1873, + "step": 524 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.239879203810763e-06, + "loss": 0.2958, + "step": 526 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.1939, + "step": 528 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.264478176864811e-06, + "loss": 0.2289, + "step": 530 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.2767915482720164e-06, + "loss": 0.0573, + "step": 532 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.289114144207656e-06, + "loss": 0.3005, + "step": 534 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.3791, + "step": 536 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.313786913381061e-06, + "loss": 0.1098, + "step": 538 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.1196, + "step": 540 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.338496291639341e-06, + "loss": 0.4871, + "step": 542 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.2009, + "step": 544 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.363242085950773e-06, + "loss": 0.7372, + "step": 546 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.375628578726181e-06, + "loss": 0.7332, + "step": 548 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.3880241029991434e-06, + "loss": 0.3179, + "step": 550 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.2063, + "step": 552 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.412842149185316e-06, + "loss": 0.3364, + "step": 554 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.425264622628326e-06, + "loss": 0.0903, + "step": 556 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.437696030628639e-06, + "loss": 0.1791, + "step": 558 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.0392, + "step": 560 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.462585553168532e-06, + "loss": 0.3504, + "step": 562 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.475043619098321e-06, + "loss": 0.2573, + "step": 564 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.487510522365969e-06, + "loss": 0.2554, + "step": 566 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.3795, + "step": 568 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.512470743505057e-06, + "loss": 0.1761, + "step": 570 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.524964012628644e-06, + "loss": 0.3046, + "step": 572 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.537466021594464e-06, + "loss": 0.1707, + "step": 574 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.1092, + "step": 576 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.5624961613689934e-06, + "loss": 0.4, + "step": 578 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.57502424329331e-06, + "loss": 0.1143, + "step": 580 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.5875609672911465e-06, + "loss": 0.2034, + "step": 582 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.2377, + "step": 584 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.6126602435525725e-06, + "loss": 0.0466, + "step": 586 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.62522274679673e-06, + "loss": 0.3462, + "step": 588 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.637793794075625e-06, + "loss": 0.1706, + "step": 590 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.2304, + "step": 592 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.662961422514961e-06, + "loss": 0.378, + "step": 594 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.675557954522462e-06, + "loss": 0.2403, + "step": 596 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.688162932258965e-06, + "loss": 0.1508, + "step": 598 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.2944, + "step": 600 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.713398126431353e-06, + "loss": 0.3424, + "step": 602 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.726028293582342e-06, + "loss": 0.3514, + "step": 604 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.738666807892684e-06, + "loss": 0.2351, + "step": 606 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.0801, + "step": 608 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.763968779241957e-06, + "loss": 0.0744, + "step": 610 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.776632186865589e-06, + "loss": 0.5384, + "step": 612 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.7893038428180584e-06, + "loss": 0.4505, + "step": 614 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.4034, + "step": 616 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.814671800701357e-06, + "loss": 0.2632, + "step": 618 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.827368053088032e-06, + "loss": 0.2008, + "step": 620 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.840072454715297e-06, + "loss": 0.0213, + "step": 622 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.5131, + "step": 624 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.865505606427848e-06, + "loss": 0.1659, + "step": 626 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.878234306841637e-06, + "loss": 0.1233, + "step": 628 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.890971057153105e-06, + "loss": 0.1098, + "step": 630 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.2881, + "step": 632 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.916468607952892e-06, + "loss": 0.3983, + "step": 634 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.929229358643925e-06, + "loss": 0.4497, + "step": 636 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.941998059638212e-06, + "loss": 0.8392, + "step": 638 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.1682, + "step": 640 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9675592127708585e-06, + "loss": 0.0285, + "step": 642 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9803516149877475e-06, + "loss": 0.0988, + "step": 644 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 5.993151867665015e-06, + "loss": 0.6035, + "step": 646 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.3628, + "step": 648 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.01877582438873e-06, + "loss": 0.1713, + "step": 650 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.03159947839103e-06, + "loss": 0.3033, + "step": 652 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.0444308827655265e-06, + "loss": 2.3413, + "step": 654 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 1.7396, + "step": 656 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.070116842375947e-06, + "loss": 0.1008, + "step": 658 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.082971347446654e-06, + "loss": 0.479, + "step": 660 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.095833502559182e-06, + "loss": 0.135, + "step": 662 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.0887, + "step": 664 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.121580662414533e-06, + "loss": 0.232, + "step": 666 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.13446561687258e-06, + "loss": 0.0289, + "step": 668 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.147358120803041e-06, + "loss": 0.0921, + "step": 670 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.243, + "step": 672 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.173165676349095e-06, + "loss": 0.1964, + "step": 674 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.186080677561974e-06, + "loss": 0.0913, + "step": 676 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.1990031274419186e-06, + "loss": 0.6157, + "step": 678 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.4794, + "step": 680 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.224870272237185e-06, + "loss": 0.1148, + "step": 682 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.237814916633431e-06, + "loss": 0.2775, + "step": 684 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.250766908658652e-06, + "loss": 0.2523, + "step": 686 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.0722, + "step": 688 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.2766928343996314e-06, + "loss": 0.2115, + "step": 690 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.289666717481496e-06, + "loss": 0.2321, + "step": 692 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.3026478469246285e-06, + "loss": 0.1146, + "step": 694 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.1945, + "step": 696 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.328631743470968e-06, + "loss": 0.2376, + "step": 698 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.341634459827044e-06, + "loss": 0.3088, + "step": 700 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.354644321050279e-06, + "loss": 0.1803, + "step": 702 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.0814, + "step": 704 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.380685376450153e-06, + "loss": 0.1644, + "step": 706 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.393716519768032e-06, + "loss": 0.0515, + "step": 708 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.406754706235692e-06, + "loss": 0.1193, + "step": 710 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.1614, + "step": 712 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.432852106751162e-06, + "loss": 0.0478, + "step": 714 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.445911269830183e-06, + "loss": 0.2306, + "step": 716 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.458977374121492e-06, + "loss": 0.199, + "step": 718 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.1937, + "step": 720 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.485130304253915e-06, + "loss": 0.3484, + "step": 722 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.498217079017806e-06, + "loss": 0.0309, + "step": 724 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.511310692839605e-06, + "loss": 0.2075, + "step": 726 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.3484, + "step": 728 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.537518335355182e-06, + "loss": 0.3836, + "step": 730 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.304, + "step": 732 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.563753027064355e-06, + "loss": 0.1088, + "step": 734 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.2051, + "step": 736 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.590014563019571e-06, + "loss": 0.0931, + "step": 738 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.603155333485934e-06, + "loss": 0.0242, + "step": 740 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.61630273806352e-06, + "loss": 0.1331, + "step": 742 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.0407, + "step": 744 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.642617346830784e-06, + "loss": 0.3484, + "step": 746 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.655784499627476e-06, + "loss": 0.2671, + "step": 748 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.6689581837494925e-06, + "loss": 0.3514, + "step": 750 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.1989, + "step": 752 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.695325043042827e-06, + "loss": 0.1213, + "step": 754 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.7085181667191e-06, + "loss": 0.0429, + "step": 756 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.7217177187307e-06, + "loss": 0.1189, + "step": 758 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.0833, + "step": 760 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.748136004631327e-06, + "loss": 0.0591, + "step": 762 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.761354686924883e-06, + "loss": 0.1145, + "step": 764 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.774579694362902e-06, + "loss": 0.2219, + "step": 766 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.3313, + "step": 768 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.801048581345113e-06, + "loss": 0.4712, + "step": 770 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.8142924091949955e-06, + "loss": 0.1821, + "step": 772 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.827542458800804e-06, + "loss": 0.057, + "step": 774 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.0922, + "step": 776 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.854061119757647e-06, + "loss": 0.2308, + "step": 778 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.867329679317144e-06, + "loss": 0.0325, + "step": 780 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.880604357049646e-06, + "loss": 0.6701, + "step": 782 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.0458, + "step": 784 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.907171963318815e-06, + "loss": 0.6007, + "step": 786 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.920464839968391e-06, + "loss": 0.334, + "step": 788 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.9337637310168494e-06, + "loss": 0.1817, + "step": 790 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.202, + "step": 792 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.960379452406636e-06, + "loss": 0.1285, + "step": 794 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.973696230766884e-06, + "loss": 0.0334, + "step": 796 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 6.9870189195639595e-06, + "loss": 0.4901, + "step": 798 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.1319, + "step": 800 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.013681924379073e-06, + "loss": 0.1974, + "step": 802 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.027022188323704e-06, + "loss": 0.1473, + "step": 804 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.040368258558412e-06, + "loss": 0.2318, + "step": 806 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.2389, + "step": 808 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.0670777136261035e-06, + "loss": 0.2774, + "step": 810 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.080441046294945e-06, + "loss": 0.0749, + "step": 812 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.093810080925657e-06, + "loss": 0.2632, + "step": 814 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.0814, + "step": 816 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.120565151621638e-06, + "loss": 0.3145, + "step": 818 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.133951135433656e-06, + "loss": 0.083, + "step": 820 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.1473427167012e-06, + "loss": 0.4223, + "step": 822 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.5719, + "step": 824 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.1741425669757854e-06, + "loss": 0.2632, + "step": 826 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.18755078364214e-06, + "loss": 0.1596, + "step": 828 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.200964493082727e-06, + "loss": 0.416, + "step": 830 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.2089, + "step": 832 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.227808285486952e-06, + "loss": 0.4637, + "step": 834 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.241238316024064e-06, + "loss": 0.5417, + "step": 836 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.254673734482513e-06, + "loss": 0.2923, + "step": 838 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.2321, + "step": 840 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2815606301942945e-06, + "loss": 0.1216, + "step": 842 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.0603, + "step": 844 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.308468762579623e-06, + "loss": 0.1148, + "step": 846 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.3627, + "step": 848 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.3353979214299765e-06, + "loss": 0.0484, + "step": 850 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.348870320044395e-06, + "loss": 0.2815, + "step": 852 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.362347896372515e-06, + "loss": 0.2997, + "step": 854 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.3046, + "step": 856 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.389318476871784e-06, + "loss": 0.0953, + "step": 858 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.402811428368824e-06, + "loss": 0.0723, + "step": 860 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.416309452231411e-06, + "loss": 0.3166, + "step": 862 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.2555, + "step": 864 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.443320611595641e-06, + "loss": 0.0331, + "step": 866 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.1328, + "step": 868 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.470351743951061e-06, + "loss": 0.1861, + "step": 870 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.0987, + "step": 872 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.497402638128209e-06, + "loss": 0.0484, + "step": 874 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.510935429867233e-06, + "loss": 0.182, + "step": 876 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.52447308280329e-06, + "loss": 0.1869, + "step": 878 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.1574, + "step": 880 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.551562866499732e-06, + "loss": 0.3934, + "step": 882 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.5651149443531846e-06, + "loss": 0.167, + "step": 884 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.578671777589884e-06, + "loss": 0.2535, + "step": 886 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.1872, + "step": 888 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.605799604296721e-06, + "loss": 0.2282, + "step": 890 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.619370544785608e-06, + "loss": 0.1935, + "step": 892 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.632946134695396e-06, + "loss": 0.0857, + "step": 894 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.7365, + "step": 896 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.660111156714964e-06, + "loss": 0.0516, + "step": 898 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.67370053577085e-06, + "loss": 0.2793, + "step": 900 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.687294458140006e-06, + "loss": 0.1512, + "step": 902 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.252, + "step": 904 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.714495826612353e-06, + "loss": 0.4318, + "step": 906 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.728103219590684e-06, + "loss": 0.1781, + "step": 908 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.741715049632646e-06, + "loss": 0.1025, + "step": 910 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.0864, + "step": 912 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.76895191456204e-06, + "loss": 0.4176, + "step": 914 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.7825768962553e-06, + "loss": 0.0369, + "step": 916 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.796206208623925e-06, + "loss": 0.266, + "step": 918 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.1479, + "step": 920 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.82347771890548e-06, + "loss": 0.1326, + "step": 922 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.83711986355656e-06, + "loss": 0.1757, + "step": 924 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.850766232359408e-06, + "loss": 0.1639, + "step": 926 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.0451, + "step": 928 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.878071535805564e-06, + "loss": 0.3636, + "step": 930 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.891730417121043e-06, + "loss": 0.1085, + "step": 932 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.90539341593269e-06, + "loss": 0.0676, + "step": 934 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.0572, + "step": 936 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.932731659299978e-06, + "loss": 0.0623, + "step": 938 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.946406850463435e-06, + "loss": 0.0559, + "step": 940 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.960086052338788e-06, + "loss": 0.3344, + "step": 942 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.1265, + "step": 944 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 7.987456381354371e-06, + "loss": 0.0984, + "step": 946 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 8.001147455039737e-06, + "loss": 0.1843, + "step": 948 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.01484243252743e-06, + "loss": 0.234, + "step": 950 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.4966, + "step": 952 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.042243991915866e-06, + "loss": 0.2971, + "step": 954 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.055950520300756e-06, + "loss": 0.0296, + "step": 956 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.069660845456411e-06, + "loss": 0.5079, + "step": 958 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.7318, + "step": 960 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.097092778966364e-06, + "loss": 0.3087, + "step": 962 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.110814333745503e-06, + "loss": 0.1412, + "step": 964 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.124539578145176e-06, + "loss": 0.0831, + "step": 966 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.1253, + "step": 968 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.152001028576158e-06, + "loss": 0.0954, + "step": 970 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.165737180974676e-06, + "loss": 0.116, + "step": 972 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.179476915728217e-06, + "loss": 0.1264, + "step": 974 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.0885, + "step": 976 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.206967024957432e-06, + "loss": 0.3342, + "step": 978 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.220717345744326e-06, + "loss": 0.2388, + "step": 980 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.234471141508773e-06, + "loss": 0.3179, + "step": 982 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.1489, + "step": 984 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.261989050517841e-06, + "loss": 0.5499, + "step": 986 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.275753110019367e-06, + "loss": 0.1861, + "step": 988 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.289520537012428e-06, + "loss": 0.2606, + "step": 990 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.2852, + "step": 992 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.317065385914285e-06, + "loss": 0.4076, + "step": 994 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.330842754027378e-06, + "loss": 0.0471, + "step": 996 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.344623382040752e-06, + "loss": 0.4035, + "step": 998 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.1677, + "step": 1000 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.372194310106515e-06, + "loss": 0.1602, + "step": 1002 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.385984556312285e-06, + "loss": 0.3203, + "step": 1004 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.399777954725183e-06, + "loss": 0.277, + "step": 1006 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.1591, + "step": 1008 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.427374100411022e-06, + "loss": 0.0755, + "step": 1010 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.441176793788106e-06, + "loss": 0.1893, + "step": 1012 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.454982531580687e-06, + "loss": 0.4014, + "step": 1014 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.168, + "step": 1016 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.482603032554812e-06, + "loss": 0.1817, + "step": 1018 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.496417741792922e-06, + "loss": 0.1098, + "step": 1020 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.510235387559738e-06, + "loss": 0.1761, + "step": 1022 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.2678, + "step": 1024 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.537879380729254e-06, + "loss": 0.0895, + "step": 1026 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.551705674142616e-06, + "loss": 0.0742, + "step": 1028 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.565534796106175e-06, + "loss": 0.1355, + "step": 1030 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.3409, + "step": 1032 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.593201417644091e-06, + "loss": 0.1214, + "step": 1034 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.607038863184952e-06, + "loss": 0.1194, + "step": 1036 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.620879029209093e-06, + "loss": 0.05, + "step": 1038 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.3483, + "step": 1040 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.648567414581372e-06, + "loss": 0.209, + "step": 1042 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.662415579853495e-06, + "loss": 0.1941, + "step": 1044 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.676266357456968e-06, + "loss": 0.304, + "step": 1046 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.2352, + "step": 1048 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.703975641449426e-06, + "loss": 0.166, + "step": 1050 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.717834093721598e-06, + "loss": 0.061, + "step": 1052 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.731695050091561e-06, + "loss": 0.2008, + "step": 1054 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.1784, + "step": 1056 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.759424366837035e-06, + "loss": 0.0261, + "step": 1058 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.773292673056572e-06, + "loss": 0.0692, + "step": 1060 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.787163375062113e-06, + "loss": 1.0916, + "step": 1062 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.1158, + "step": 1064 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.8149118580674e-06, + "loss": 0.0417, + "step": 1066 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.828789584873757e-06, + "loss": 0.0902, + "step": 1068 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.84266959907943e-06, + "loss": 0.2691, + "step": 1070 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.178, + "step": 1072 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.870436381252412e-06, + "loss": 0.7575, + "step": 1074 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.884323094990613e-06, + "loss": 0.1877, + "step": 1076 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.89821198766998e-06, + "loss": 0.0808, + "step": 1078 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.5612, + "step": 1080 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.925996201346779e-06, + "loss": 0.3931, + "step": 1082 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.939891468081036e-06, + "loss": 0.149, + "step": 1084 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.953788805230209e-06, + "loss": 0.3122, + "step": 1086 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.1115, + "step": 1088 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.981589582202184e-06, + "loss": 0.2383, + "step": 1090 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.995492967729449e-06, + "loss": 0.1381, + "step": 1092 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.009398315080712e-06, + "loss": 0.2413, + "step": 1094 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.0651, + "step": 1096 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.037214786621669e-06, + "loss": 0.0602, + "step": 1098 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.051125856485175e-06, + "loss": 0.1695, + "step": 1100 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.065038779520457e-06, + "loss": 0.212, + "step": 1102 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.0372, + "step": 1104 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.092870076413771e-06, + "loss": 0.1329, + "step": 1106 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.106788395916682e-06, + "loss": 0.0171, + "step": 1108 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.120708459881203e-06, + "loss": 0.3314, + "step": 1110 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.3001, + "step": 1112 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.148553712446971e-06, + "loss": 0.5388, + "step": 1114 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.162478846665854e-06, + "loss": 0.3171, + "step": 1116 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.176405616581694e-06, + "loss": 0.7605, + "step": 1118 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.0997, + "step": 1120 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.20426395470397e-06, + "loss": 0.1803, + "step": 1122 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.218195468502469e-06, + "loss": 0.0717, + "step": 1124 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.232128509182136e-06, + "loss": 0.4041, + "step": 1126 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.1251, + "step": 1128 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.259999062336021e-06, + "loss": 0.5991, + "step": 1130 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.273936520378426e-06, + "loss": 0.1819, + "step": 1132 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.287875396438536e-06, + "loss": 0.7128, + "step": 1134 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.8231, + "step": 1136 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.315757293717432e-06, + "loss": 0.1622, + "step": 1138 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.329700260482286e-06, + "loss": 0.1428, + "step": 1140 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.343644536357053e-06, + "loss": 0.1043, + "step": 1142 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.0208, + "step": 1144 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.37153690649993e-06, + "loss": 0.1506, + "step": 1146 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.38548494629364e-06, + "loss": 0.1675, + "step": 1148 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.39943418624856e-06, + "loss": 0.416, + "step": 1150 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.0318, + "step": 1152 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.427336157667062e-06, + "loss": 1.176, + "step": 1154 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.441288834637507e-06, + "loss": 0.1194, + "step": 1156 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.45524260278296e-06, + "loss": 0.2501, + "step": 1158 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.0974, + "step": 1160 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.483153303588777e-06, + "loss": 0.2725, + "step": 1162 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.497110181738935e-06, + "loss": 0.4687, + "step": 1164 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.511068042043785e-06, + "loss": 0.1213, + "step": 1166 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.7522, + "step": 1168 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.538986600075773e-06, + "loss": 0.129, + "step": 1170 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.552947243277342e-06, + "loss": 0.2184, + "step": 1172 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.566908759582633e-06, + "loss": 0.2461, + "step": 1174 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.0451, + "step": 1176 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.594834302434123e-06, + "loss": 0.3539, + "step": 1178 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.608798274441153e-06, + "loss": 0.361, + "step": 1180 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.622763010473628e-06, + "loss": 0.3958, + "step": 1182 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.229, + "step": 1184 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.650694665519747e-06, + "loss": 0.2575, + "step": 1186 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.664661529982263e-06, + "loss": 0.1759, + "step": 1188 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.678629049368077e-06, + "loss": 0.4037, + "step": 1190 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.2369, + "step": 1192 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.706565943792879e-06, + "loss": 0.0231, + "step": 1194 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.720535264270526e-06, + "loss": 0.0851, + "step": 1196 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.734505130548855e-06, + "loss": 0.3627, + "step": 1198 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.2856, + "step": 1200 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.762446391372746e-06, + "loss": 0.361, + "step": 1202 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.776417731348403e-06, + "loss": 0.2192, + "step": 1204 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.790389507985091e-06, + "loss": 0.9324, + "step": 1206 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.3331, + "step": 1208 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.81833426209198e-06, + "loss": 0.2959, + "step": 1210 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.832307184985473e-06, + "loss": 0.1123, + "step": 1212 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.846280435386668e-06, + "loss": 0.1632, + "step": 1214 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.11, + "step": 1216 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.874227809551307e-06, + "loss": 0.4289, + "step": 1218 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.888201878732946e-06, + "loss": 0.1154, + "step": 1220 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.902176166258738e-06, + "loss": 0.0341, + "step": 1222 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.2244, + "step": 1224 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.930125287174061e-06, + "loss": 0.2928, + "step": 1226 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.944100065978354e-06, + "loss": 0.2944, + "step": 1228 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.958074953956413e-06, + "loss": 0.0694, + "step": 1230 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.1869, + "step": 1232 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.986024948260714e-06, + "loss": 0.1041, + "step": 1234 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.999999999999996e-06, + "loss": 0.1353, + "step": 1236 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0013975051739277e-05, + "loss": 0.1762, + "step": 1238 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.2115, + "step": 1240 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.004192504604358e-05, + "loss": 0.318, + "step": 1242 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.1641, + "step": 1244 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.006987471282593e-05, + "loss": 0.2502, + "step": 1246 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.0409, + "step": 1248 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0097823833741255e-05, + "loss": 0.2217, + "step": 1250 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0111798121267047e-05, + "loss": 0.0459, + "step": 1252 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0125772190448686e-05, + "loss": 0.3258, + "step": 1254 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.2276, + "step": 1256 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.0153719564613327e-05, + "loss": 0.1085, + "step": 1258 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.016769281501452e-05, + "loss": 0.0675, + "step": 1260 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.018166573790801e-05, + "loss": 0.5405, + "step": 1262 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.0541, + "step": 1264 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.0209610492014904e-05, + "loss": 0.3768, + "step": 1266 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.022358226865159e-05, + "loss": 0.5553, + "step": 1268 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0237553608627247e-05, + "loss": 0.3036, + "step": 1270 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.1334, + "step": 1272 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0265494869451138e-05, + "loss": 0.3836, + "step": 1274 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.1764, + "step": 1276 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0293434056207114e-05, + "loss": 0.238, + "step": 1278 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.0429, + "step": 1280 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.0321370950631918e-05, + "loss": 0.425, + "step": 1282 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.033533847001773e-05, + "loss": 0.3712, + "step": 1284 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0349305334480246e-05, + "loss": 0.1473, + "step": 1286 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.1729, + "step": 1288 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.0377236989526366e-05, + "loss": 0.0395, + "step": 1290 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.039120172555884e-05, + "loss": 0.2499, + "step": 1292 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0405165697565868e-05, + "loss": 0.5762, + "step": 1294 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.0809, + "step": 1296 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0433091240417362e-05, + "loss": 0.2047, + "step": 1298 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0447052756722651e-05, + "loss": 0.1011, + "step": 1300 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.046101339992422e-05, + "loss": 0.5043, + "step": 1302 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.0539, + "step": 1304 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0488931957956208e-05, + "loss": 1.0963, + "step": 1306 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.0828, + "step": 1308 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.0516846696411216e-05, + "loss": 0.1845, + "step": 1310 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.4335, + "step": 1312 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.054475739721703e-05, + "loss": 0.5021, + "step": 1314 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.0558711165362488e-05, + "loss": 0.1267, + "step": 1316 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0572663842332931e-05, + "loss": 0.3218, + "step": 1318 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.1263, + "step": 1320 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0600565813751433e-05, + "loss": 0.3412, + "step": 1322 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.2836, + "step": 1324 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0628463093500063e-05, + "loss": 0.4117, + "step": 1326 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.416, + "step": 1328 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.065635546364294e-05, + "loss": 0.2044, + "step": 1330 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.0670299739517706e-05, + "loss": 0.5967, + "step": 1332 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0684242706282562e-05, + "loss": 0.2413, + "step": 1334 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.0481, + "step": 1336 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0712124603561457e-05, + "loss": 0.6643, + "step": 1338 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.1214, + "step": 1340 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0740000937663972e-05, + "loss": 0.3952, + "step": 1342 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.0603, + "step": 1344 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0767871490817856e-05, + "loss": 0.187, + "step": 1346 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0781804531497525e-05, + "loss": 0.1594, + "step": 1348 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0795736045296023e-05, + "loss": 0.4928, + "step": 1350 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.0831, + "step": 1352 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.08235943834183e-05, + "loss": 0.0918, + "step": 1354 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.083752115333414e-05, + "loss": 0.4139, + "step": 1356 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0851446287553022e-05, + "loss": 0.395, + "step": 1358 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.1287, + "step": 1360 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.087929154011879e-05, + "loss": 0.1047, + "step": 1362 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.0893211604083311e-05, + "loss": 0.1507, + "step": 1364 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.090712992358622e-05, + "loss": 0.2143, + "step": 1366 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.3028, + "step": 1368 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0934961220479537e-05, + "loss": 0.3345, + "step": 1370 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.8261, + "step": 1372 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0962785213378325e-05, + "loss": 0.5255, + "step": 1374 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.0732, + "step": 1376 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.0990601684919282e-05, + "loss": 0.3126, + "step": 1378 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.1004507032270544e-05, + "loss": 0.0944, + "step": 1380 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1018410417797809e-05, + "loss": 0.1062, + "step": 1382 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.2354, + "step": 1384 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1046211194769784e-05, + "loss": 0.2778, + "step": 1386 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.3117, + "step": 1388 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1074003798653215e-05, + "loss": 0.2232, + "step": 1390 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.0642, + "step": 1392 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.1101788012330013e-05, + "loss": 0.1443, + "step": 1394 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.111567690500938e-05, + "loss": 0.0678, + "step": 1396 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1129563618747581e-05, + "loss": 0.0879, + "step": 1398 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.0826, + "step": 1400 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1157330400920563e-05, + "loss": 0.1277, + "step": 1402 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.1479, + "step": 1404 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1185088141932594e-05, + "loss": 0.414, + "step": 1406 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.0224, + "step": 1408 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.121283662493788e-05, + "loss": 0.3222, + "step": 1410 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.122670732694342e-05, + "loss": 0.4996, + "step": 1412 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1240575633162958e-05, + "loss": 0.0146, + "step": 1414 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.9534, + "step": 1416 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1268304949908434e-05, + "loss": 0.1435, + "step": 1418 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.3, + "step": 1420 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1296024358550565e-05, + "loss": 0.3513, + "step": 1422 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.496, + "step": 1424 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1323733642543024e-05, + "loss": 0.2293, + "step": 1426 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1337584420146496e-05, + "loss": 0.2017, + "step": 1428 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.135143258541862e-05, + "loss": 0.3182, + "step": 1430 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.2151, + "step": 1432 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.13791209707909e-05, + "loss": 0.3514, + "step": 1434 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.0514, + "step": 1436 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.1406798582355902e-05, + "loss": 0.092, + "step": 1438 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.2046, + "step": 1440 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1434465203893818e-05, + "loss": 0.4689, + "step": 1442 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1448294325857377e-05, + "loss": 0.5958, + "step": 1444 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.146212061927074e-05, + "loss": 0.1821, + "step": 1446 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.1167, + "step": 1448 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.1489764612440255e-05, + "loss": 0.6106, + "step": 1450 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.150358225820707e-05, + "loss": 0.1661, + "step": 1452 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.151739696744518e-05, + "loss": 0.3145, + "step": 1454 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.1435, + "step": 1456 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1545017468419307e-05, + "loss": 0.1068, + "step": 1458 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1558823206211887e-05, + "loss": 0.1946, + "step": 1460 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1572625899588972e-05, + "loss": 0.3063, + "step": 1462 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.116, + "step": 1464 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1600222045274809e-05, + "loss": 0.1479, + "step": 1466 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.3315, + "step": 1468 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1627805689893478e-05, + "loss": 0.7527, + "step": 1470 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.2604, + "step": 1472 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1655376617959239e-05, + "loss": 0.0453, + "step": 1474 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1669157245972616e-05, + "loss": 0.5021, + "step": 1476 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1682934614085708e-05, + "loss": 0.2799, + "step": 1478 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.3967, + "step": 1480 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1710479462987565e-05, + "loss": 0.3936, + "step": 1482 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.1394, + "step": 1484 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1738010949482152e-05, + "loss": 0.1422, + "step": 1486 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.3321, + "step": 1488 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.176552885849122e-05, + "loss": 0.8769, + "step": 1490 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.1779282654255668e-05, + "loss": 0.2379, + "step": 1492 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.1793032975042563e-05, + "loss": 0.4829, + "step": 1494 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.3164, + "step": 1496 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1820523084271775e-05, + "loss": 0.1213, + "step": 1498 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.1749, + "step": 1500 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1847998971423835e-05, + "loss": 0.3351, + "step": 1502 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.0968, + "step": 1504 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.1875460421854816e-05, + "loss": 0.2624, + "step": 1506 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.188918566625449e-05, + "loss": 0.5706, + "step": 1508 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1902907221033629e-05, + "loss": 0.2771, + "step": 1510 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.2077, + "step": 1512 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1930339154543582e-05, + "loss": 0.2773, + "step": 1514 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.2898, + "step": 1516 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1957756008084127e-05, + "loss": 0.2375, + "step": 1518 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.1824, + "step": 1520 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1985157567472563e-05, + "loss": 0.2268, + "step": 1522 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1998852544960256e-05, + "loss": 0.2841, + "step": 1524 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2012543618645622e-05, + "loss": 0.8211, + "step": 1526 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.9402, + "step": 1528 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2039913947661205e-05, + "loss": 0.2532, + "step": 1530 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.2143, + "step": 1532 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2067268340700016e-05, + "loss": 0.0974, + "step": 1534 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.0677, + "step": 1536 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.2094606584067304e-05, + "loss": 0.1463, + "step": 1538 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.210826958287895e-05, + "loss": 0.0394, + "step": 1540 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.212192846419443e-05, + "loss": 0.3969, + "step": 1542 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.1278, + "step": 1544 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2149233767640587e-05, + "loss": 0.1759, + "step": 1546 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.035, + "step": 1548 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2176522281094514e-05, + "loss": 0.1877, + "step": 1550 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.2079, + "step": 1552 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.220379379137607e-05, + "loss": 0.1168, + "step": 1554 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.2217423103744692e-05, + "loss": 1.0754, + "step": 1556 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2231048085437953e-05, + "loss": 0.1347, + "step": 1558 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.1597, + "step": 1560 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2258284950367347e-05, + "loss": 0.6158, + "step": 1562 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.3008, + "step": 1564 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.228550417338764e-05, + "loss": 0.0264, + "step": 1566 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.2265, + "step": 1568 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2312705541859985e-05, + "loss": 0.322, + "step": 1570 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2326299464229143e-05, + "loss": 0.1481, + "step": 1572 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2339888843285029e-05, + "loss": 0.0395, + "step": 1574 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.3722, + "step": 1576 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2367053865304597e-05, + "loss": 0.9226, + "step": 1578 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.2643, + "step": 1580 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2394200395703273e-05, + "loss": 0.1948, + "step": 1582 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.2839, + "step": 1584 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2421328222410109e-05, + "loss": 0.7845, + "step": 1586 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2434885055646808e-05, + "loss": 0.043, + "step": 1588 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2448437133500262e-05, + "loss": 0.1697, + "step": 1590 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.1528, + "step": 1592 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2475526917196703e-05, + "loss": 0.0764, + "step": 1594 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.2374, + "step": 1596 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2502597361871787e-05, + "loss": 0.2354, + "step": 1598 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.1944, + "step": 1600 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2529648256048931e-05, + "loss": 0.5594, + "step": 1602 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2543166305656089e-05, + "loss": 0.1195, + "step": 1604 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.2556679388404351e-05, + "loss": 0.0707, + "step": 1606 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.1946, + "step": 1608 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.2583690547768584e-05, + "loss": 0.488, + "step": 1610 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.259718857163117e-05, + "loss": 0.3125, + "step": 1612 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.261068152312821e-05, + "loss": 0.1585, + "step": 1614 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.2891, + "step": 1616 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2637652103627481e-05, + "loss": 0.1897, + "step": 1618 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2651129679955598e-05, + "loss": 0.2431, + "step": 1620 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2664602078570017e-05, + "loss": 0.4466, + "step": 1622 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.394, + "step": 1624 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2691531237420369e-05, + "loss": 0.1644, + "step": 1626 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.3542, + "step": 1628 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.27184393698057e-05, + "loss": 0.1454, + "step": 1630 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.0898, + "step": 1632 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2745326265517481e-05, + "loss": 0.6367, + "step": 1634 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2758761683975929e-05, + "loss": 0.3162, + "step": 1636 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.277219171451304e-05, + "loss": 0.3405, + "step": 1638 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.2094, + "step": 1640 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2799035506917265e-05, + "loss": 0.2461, + "step": 1642 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.1547, + "step": 1644 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2825857433024208e-05, + "loss": 0.3694, + "step": 1646 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.429, + "step": 1648 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2852657283298794e-05, + "loss": 0.2466, + "step": 1650 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2866048864566336e-05, + "loss": 0.0369, + "step": 1652 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2879434848378356e-05, + "loss": 0.1354, + "step": 1654 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.1711, + "step": 1656 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2906189919074336e-05, + "loss": 0.2693, + "step": 1658 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.4711, + "step": 1660 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.293292228637389e-05, + "loss": 0.2221, + "step": 1662 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.1384, + "step": 1664 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2959631741441583e-05, + "loss": 0.3301, + "step": 1666 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2972977811676289e-05, + "loss": 0.116, + "step": 1668 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.298631807562092e-05, + "loss": 0.0607, + "step": 1670 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.1672, + "step": 1672 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3012981080436036e-05, + "loss": 0.0686, + "step": 1674 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.9691, + "step": 1676 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3039620547593357e-05, + "loss": 0.2879, + "step": 1678 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.2663, + "step": 1680 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3066236268983143e-05, + "loss": 0.4962, + "step": 1682 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3079535160031601e-05, + "loss": 0.0791, + "step": 1684 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3092828036681178e-05, + "loss": 0.186, + "step": 1686 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.1241, + "step": 1688 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.3119395642950348e-05, + "loss": 0.6674, + "step": 1690 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.313267032068285e-05, + "loss": 0.1813, + "step": 1692 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3145938880242346e-05, + "loss": 0.1538, + "step": 1694 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.2058, + "step": 1696 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3172457541199188e-05, + "loss": 0.1129, + "step": 1698 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3185707590804997e-05, + "loss": 0.1088, + "step": 1700 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.3198951418654882e-05, + "loss": 0.0726, + "step": 1702 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.5356, + "step": 1704 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.322542030563709e-05, + "loss": 0.0571, + "step": 1706 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.1424, + "step": 1708 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.3251863995368665e-05, + "loss": 0.1215, + "step": 1710 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.4204, + "step": 1712 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3278282281269293e-05, + "loss": 0.0371, + "step": 1714 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3291481833280894e-05, + "loss": 0.15, + "step": 1716 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3304674956957167e-05, + "loss": 0.1695, + "step": 1718 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.7978, + "step": 1720 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3331041816250503e-05, + "loss": 0.3833, + "step": 1722 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.0912, + "step": 1724 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.335738265316921e-05, + "loss": 0.1595, + "step": 1726 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.2221, + "step": 1728 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3383697261936472e-05, + "loss": 0.0707, + "step": 1730 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3396844666514062e-05, + "loss": 0.1374, + "step": 1732 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3409985436980422e-05, + "loss": 0.6474, + "step": 1734 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.4162, + "step": 1736 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.3436246972935638e-05, + "loss": 0.2305, + "step": 1738 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.344936768713513e-05, + "loss": 0.0302, + "step": 1740 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.346248166464481e-05, + "loss": 0.2663, + "step": 1742 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.7525, + "step": 1744 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.348868930716039e-05, + "loss": 0.3146, + "step": 1746 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.3501782920982189e-05, + "loss": 0.1897, + "step": 1748 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3514869695746078e-05, + "loss": 0.1415, + "step": 1750 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.3808, + "step": 1752 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3541022625878501e-05, + "loss": 0.081, + "step": 1754 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.3631, + "step": 1756 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3567147893248833e-05, + "loss": 0.4178, + "step": 1758 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.3713, + "step": 1760 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3593245293764303e-05, + "loss": 0.3952, + "step": 1762 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3606283480231962e-05, + "loss": 0.3147, + "step": 1764 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.361931462354984e-05, + "loss": 0.5828, + "step": 1766 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.2503, + "step": 1768 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3645355678949715e-05, + "loss": 0.3487, + "step": 1770 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.097, + "step": 1772 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.3671368256529026e-05, + "loss": 0.2245, + "step": 1774 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.16, + "step": 1776 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3697352153075365e-05, + "loss": 0.3214, + "step": 1778 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3710333282518497e-05, + "loss": 0.3808, + "step": 1780 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3723307165600361e-05, + "loss": 0.1821, + "step": 1782 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.0827, + "step": 1784 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3749233091341344e-05, + "loss": 0.1945, + "step": 1786 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.2323, + "step": 1788 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3775129727762808e-05, + "loss": 0.2976, + "step": 1790 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.1952, + "step": 1792 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3800996872558075e-05, + "loss": 0.2948, + "step": 1794 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3813919322438018e-05, + "loss": 0.3343, + "step": 1796 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3826834323650899e-05, + "loss": 0.1979, + "step": 1798 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.2704, + "step": 1800 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3852641879196952e-05, + "loss": 0.109, + "step": 1802 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.2722, + "step": 1804 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.387841933758546e-05, + "loss": 0.0941, + "step": 1806 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.5743, + "step": 1808 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.3904166497440812e-05, + "loss": 0.239, + "step": 1810 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.391702865255334e-05, + "loss": 0.1878, + "step": 1812 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3929883157624046e-05, + "loss": 0.1695, + "step": 1814 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.1499, + "step": 1816 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3955569117234468e-05, + "loss": 0.6509, + "step": 1818 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.2154, + "step": 1820 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3981224175611265e-05, + "loss": 0.2239, + "step": 1822 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.195, + "step": 1824 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4006848132334979e-05, + "loss": 0.0763, + "step": 1826 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4019648385012245e-05, + "loss": 0.2023, + "step": 1828 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4032440787229135e-05, + "loss": 0.2013, + "step": 1830 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.1757, + "step": 1832 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4058001940361781e-05, + "loss": 0.3511, + "step": 1834 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.1936, + "step": 1836 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.40835313920471e-05, + "loss": 0.1758, + "step": 1838 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.5221, + "step": 1840 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4109028942846888e-05, + "loss": 0.2099, + "step": 1842 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4121765693158355e-05, + "loss": 0.1561, + "step": 1844 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4134494393572146e-05, + "loss": 0.0772, + "step": 1846 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.2589, + "step": 1848 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4159927545284697e-05, + "loss": 0.4158, + "step": 1850 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.2405, + "step": 1852 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4185328199298636e-05, + "loss": 0.2282, + "step": 1854 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.3647, + "step": 1856 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4210696157181936e-05, + "loss": 0.2975, + "step": 1858 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4223367813134406e-05, + "loss": 0.1583, + "step": 1860 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4236031220758037e-05, + "loss": 0.31, + "step": 1862 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.3733, + "step": 1864 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.426133319210731e-05, + "loss": 0.5984, + "step": 1866 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.2247, + "step": 1868 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.4286601873568642e-05, + "loss": 0.2321, + "step": 1870 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.0765, + "step": 1872 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.431183706774103e-05, + "loss": 0.1443, + "step": 1874 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.4324442045477534e-05, + "loss": 0.1193, + "step": 1876 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4337038577485035e-05, + "loss": 0.5958, + "step": 1878 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.0906, + "step": 1880 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.436220620592437e-05, + "loss": 0.0944, + "step": 1882 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.1197, + "step": 1884 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4387339756447422e-05, + "loss": 0.3659, + "step": 1886 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.0653, + "step": 1888 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4412439032708848e-05, + "loss": 0.7545, + "step": 1890 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4424975756706684e-05, + "loss": 0.2608, + "step": 1892 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4437503838631002e-05, + "loss": 0.4889, + "step": 1894 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.2038, + "step": 1896 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4462533978405529e-05, + "loss": 0.6511, + "step": 1898 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.1018, + "step": 1900 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4487529256494937e-05, + "loss": 0.4916, + "step": 1902 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.0667, + "step": 1904 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4512489477634024e-05, + "loss": 0.2891, + "step": 1906 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4524956380901674e-05, + "loss": 0.135, + "step": 1908 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.4537414446831461e-05, + "loss": 0.0749, + "step": 1910 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.1818, + "step": 1912 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4562303969371357e-05, + "loss": 0.4657, + "step": 1914 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.235, + "step": 1916 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4587157850814679e-05, + "loss": 0.2986, + "step": 1918 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.7949, + "step": 1920 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4611975897000849e-05, + "loss": 0.2227, + "step": 1922 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4624371421273812e-05, + "loss": 0.1674, + "step": 1924 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.463675791404922e-05, + "loss": 0.1511, + "step": 1926 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.2014, + "step": 1928 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4661503708360652e-05, + "loss": 0.3052, + "step": 1930 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.1899, + "step": 1932 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4686213086618932e-05, + "loss": 0.6388, + "step": 1934 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.2089, + "step": 1936 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4710885855792338e-05, + "loss": 0.3237, + "step": 1938 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4723208451727977e-05, + "loss": 0.2388, + "step": 1940 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4735521823135184e-05, + "loss": 0.3124, + "step": 1942 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.5046, + "step": 1944 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4760120796189233e-05, + "loss": 0.053, + "step": 1946 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.1472, + "step": 1948 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4784682582785254e-05, + "loss": 0.168, + "step": 1950 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.0156, + "step": 1952 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4809206991044571e-05, + "loss": 0.3514, + "step": 1954 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4821455118415666e-05, + "loss": 0.6698, + "step": 1956 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4833693829380458e-05, + "loss": 0.2655, + "step": 1958 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.4723, + "step": 1960 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4858142906499686e-05, + "loss": 0.8415, + "step": 1962 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.3317, + "step": 1964 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4882554031404075e-05, + "loss": 0.4388, + "step": 1966 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.185, + "step": 1968 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4906927013391879e-05, + "loss": 0.2467, + "step": 1970 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4919099141279205e-05, + "loss": 0.2766, + "step": 1972 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4931261662059338e-05, + "loss": 0.1239, + "step": 1974 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.4461, + "step": 1976 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4955557787302151e-05, + "loss": 0.4748, + "step": 1978 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.1206, + "step": 1980 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.4979815199317005e-05, + "loss": 0.37, + "step": 1982 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.1359, + "step": 1984 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5004033708602967e-05, + "loss": 0.2084, + "step": 1986 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5016128315586626e-05, + "loss": 0.1363, + "step": 1988 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5028213125963029e-05, + "loss": 0.1769, + "step": 1990 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.3743, + "step": 1992 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5052353262505603e-05, + "loss": 0.2893, + "step": 1994 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.4122, + "step": 1996 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5076453929645933e-05, + "loss": 0.207, + "step": 1998 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.1173, + "step": 2000 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.510051493910759e-05, + "loss": 0.705, + "step": 2002 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.5112530513457229e-05, + "loss": 0.1415, + "step": 2004 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.512453610292401e-05, + "loss": 0.1679, + "step": 2006 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.1274, + "step": 2008 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.514851723343985e-05, + "loss": 0.0143, + "step": 2010 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.4324, + "step": 2012 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5172458143312522e-05, + "loss": 0.328, + "step": 2014 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.1768, + "step": 2016 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5196358645513685e-05, + "loss": 0.4041, + "step": 2018 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5208293685377354e-05, + "loss": 0.7619, + "step": 2020 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5220218553330618e-05, + "loss": 0.1985, + "step": 2022 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.1426, + "step": 2024 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5244037680367744e-05, + "loss": 0.3148, + "step": 2026 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.2295, + "step": 2028 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.5267815840548057e-05, + "loss": 0.1907, + "step": 2030 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.4836, + "step": 2032 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.529155284811463e-05, + "loss": 0.635, + "step": 2034 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.5303405861706574e-05, + "loss": 0.2523, + "step": 2036 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.5315248517631975e-05, + "loss": 0.3366, + "step": 2038 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.2931, + "step": 2040 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5338902663987544e-05, + "loss": 0.1039, + "step": 2042 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.124, + "step": 2044 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5362515102393217e-05, + "loss": 0.7778, + "step": 2046 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.4029, + "step": 2048 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.5386085648386656e-05, + "loss": 0.1991, + "step": 2050 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.539785515417376e-05, + "loss": 0.1197, + "step": 2052 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.540961411783279e-05, + "loss": 0.3148, + "step": 2054 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.6593, + "step": 2056 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5433100326925288e-05, + "loss": 0.8371, + "step": 2058 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.2865, + "step": 2060 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.545654409218793e-05, + "loss": 0.0836, + "step": 2062 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.1281, + "step": 2064 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5479945230476066e-05, + "loss": 0.441, + "step": 2066 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5491629757363026e-05, + "loss": 0.1379, + "step": 2068 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.550330355897809e-05, + "loss": 0.0615, + "step": 2070 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.1092, + "step": 2072 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5526618895216793e-05, + "loss": 0.0648, + "step": 2074 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.1105, + "step": 2076 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5549891057050837e-05, + "loss": 0.0824, + "step": 2078 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.0863, + "step": 2080 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.557311986267615e-05, + "loss": 0.2408, + "step": 2082 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.5584717950189353e-05, + "loss": 0.05, + "step": 2084 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5596305130627404e-05, + "loss": 0.133, + "step": 2086 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.3431, + "step": 2088 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.5619446679779357e-05, + "loss": 0.3186, + "step": 2090 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.563100100329731e-05, + "loss": 0.5189, + "step": 2092 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.564254432934829e-05, + "loss": 0.8141, + "step": 2094 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.6202, + "step": 2096 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.5665597898893484e-05, + "loss": 0.1282, + "step": 2098 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.567710809736356e-05, + "loss": 0.1454, + "step": 2100 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.568860720831853e-05, + "loss": 0.1461, + "step": 2102 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.2707, + "step": 2104 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.5711572077872774e-05, + "loss": 0.3645, + "step": 2106 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.572303779162118e-05, + "loss": 0.1039, + "step": 2108 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.573449232815279e-05, + "loss": 0.3355, + "step": 2110 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.0727, + "step": 2112 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5757367780103666e-05, + "loss": 0.3095, + "step": 2114 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5768788650846677e-05, + "loss": 0.2663, + "step": 2116 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5780198255020478e-05, + "loss": 0.4324, + "step": 2118 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.2381, + "step": 2120 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.580298357454965e-05, + "loss": 0.241, + "step": 2122 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.581435924540481e-05, + "loss": 0.4908, + "step": 2124 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5825723560690403e-05, + "loss": 0.1954, + "step": 2126 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.2759, + "step": 2128 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5848418035796068e-05, + "loss": 0.7154, + "step": 2130 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5859748151293333e-05, + "loss": 0.5505, + "step": 2132 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.587106682257552e-05, + "loss": 0.1504, + "step": 2134 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.0464, + "step": 2136 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5893669744094577e-05, + "loss": 0.2095, + "step": 2138 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.1332, + "step": 2140 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.591622662377734e-05, + "loss": 0.1645, + "step": 2142 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.0246, + "step": 2144 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.5938737285407567e-05, + "loss": 0.7714, + "step": 2146 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.594997522948412e-05, + "loss": 0.1869, + "step": 2148 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5961201553130148e-05, + "loss": 0.3543, + "step": 2150 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.0868, + "step": 2152 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.598361925145234e-05, + "loss": 0.2378, + "step": 2154 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.599481058234626e-05, + "loss": 0.3559, + "step": 2156 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.6005990205245216e-05, + "loss": 0.4346, + "step": 2158 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.3386, + "step": 2160 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.602831423974506e-05, + "loss": 0.1343, + "step": 2162 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.6039458607746607e-05, + "loss": 0.4791, + "step": 2164 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.6050591180554648e-05, + "loss": 0.2522, + "step": 2166 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.0404, + "step": 2168 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6072820853644677e-05, + "loss": 0.1587, + "step": 2170 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.8356, + "step": 2172 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6095003085355082e-05, + "loss": 0.332, + "step": 2174 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.0972, + "step": 2176 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.611713770239646e-05, + "loss": 0.4573, + "step": 2178 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.6128187101364982e-05, + "loss": 0.1238, + "step": 2180 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6139224531851332e-05, + "loss": 0.1954, + "step": 2182 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.249, + "step": 2184 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.616126340117555e-05, + "loss": 0.349, + "step": 2186 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.617226479697104e-05, + "loss": 0.1145, + "step": 2188 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.618325413819966e-05, + "loss": 0.093, + "step": 2190 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.1425, + "step": 2192 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.6205196571130194e-05, + "loss": 0.0805, + "step": 2194 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.621614961997806e-05, + "loss": 0.1518, + "step": 2196 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6227090528551034e-05, + "loss": 0.2051, + "step": 2198 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.5948, + "step": 2200 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.62489358394248e-05, + "loss": 0.247, + "step": 2202 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.3703, + "step": 2204 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6270732333094095e-05, + "loss": 0.1524, + "step": 2206 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.1728, + "step": 2208 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6292479839282897e-05, + "loss": 0.2477, + "step": 2210 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6303335168965474e-05, + "loss": 0.5225, + "step": 2212 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6314178188097907e-05, + "loss": 0.1221, + "step": 2214 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.3724, + "step": 2216 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6335827210029816e-05, + "loss": 0.4743, + "step": 2218 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.1058, + "step": 2220 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.635742673595467e-05, + "loss": 0.0703, + "step": 2222 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.0767, + "step": 2224 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6378976597135173e-05, + "loss": 0.1666, + "step": 2226 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6389732850821957e-05, + "loss": 0.2295, + "step": 2228 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.640047662522205e-05, + "loss": 0.1457, + "step": 2230 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.1584, + "step": 2232 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6421926652255282e-05, + "loss": 0.2196, + "step": 2234 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.0553, + "step": 2236 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6443326510665474e-05, + "loss": 0.0776, + "step": 2238 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.092, + "step": 2240 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.646467603327518e-05, + "loss": 0.0807, + "step": 2242 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.6475331866519377e-05, + "loss": 0.2193, + "step": 2244 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6485975053300154e-05, + "loss": 0.2321, + "step": 2246 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.3186, + "step": 2248 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.650722340435067e-05, + "loss": 0.2763, + "step": 2250 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.0634, + "step": 2252 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.652842092043287e-05, + "loss": 0.3669, + "step": 2254 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.0735, + "step": 2256 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6549567435950004e-05, + "loss": 0.4802, + "step": 2258 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6560121516856586e-05, + "loss": 0.8038, + "step": 2260 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6570662785703713e-05, + "loss": 0.5926, + "step": 2262 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.0294, + "step": 2264 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6591706804895408e-05, + "loss": 0.2732, + "step": 2266 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.1218, + "step": 2268 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6612699329127457e-05, + "loss": 0.1182, + "step": 2270 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.0913, + "step": 2272 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6633640194404523e-05, + "loss": 0.1566, + "step": 2274 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6644091203796694e-05, + "loss": 0.0792, + "step": 2276 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6654529237134816e-05, + "loss": 0.3037, + "step": 2278 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.0965, + "step": 2280 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.667536629413143e-05, + "loss": 0.583, + "step": 2282 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.227, + "step": 2284 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6696151202613527e-05, + "loss": 0.4164, + "step": 2286 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.5687, + "step": 2288 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6716883800207685e-05, + "loss": 0.4588, + "step": 2290 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6727230431791806e-05, + "loss": 0.5289, + "step": 2292 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.673756392494915e-05, + "loss": 0.1849, + "step": 2294 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.065, + "step": 2296 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6758191415283063e-05, + "loss": 0.1345, + "step": 2298 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.0995, + "step": 2300 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6778766110065755e-05, + "loss": 0.098, + "step": 2302 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.145, + "step": 2304 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6799287848566e-05, + "loss": 0.6193, + "step": 2306 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6809528809094798e-05, + "loss": 0.336, + "step": 2308 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6819756470466305e-05, + "loss": 0.2611, + "step": 2310 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.5102, + "step": 2312 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.684017181586408e-05, + "loss": 0.1306, + "step": 2314 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.1643, + "step": 2316 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6860533725272943e-05, + "loss": 0.113, + "step": 2318 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.3482, + "step": 2320 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.6880842039624e-05, + "loss": 0.0796, + "step": 2322 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.689097604905826e-05, + "loss": 0.5447, + "step": 2324 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6901096600267e-05, + "loss": 0.4793, + "step": 2326 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.2417, + "step": 2328 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6921297248971645e-05, + "loss": 0.4162, + "step": 2330 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.1772, + "step": 2332 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.6941443827928778e-05, + "loss": 0.0683, + "step": 2334 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.3699, + "step": 2336 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6961536179751672e-05, + "loss": 0.4691, + "step": 2338 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6971561971420222e-05, + "loss": 0.1159, + "step": 2340 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6981574147477204e-05, + "loss": 0.2931, + "step": 2342 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.3345, + "step": 2344 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.70015575745671e-05, + "loss": 0.3756, + "step": 2346 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.701152878657196e-05, + "loss": 0.1016, + "step": 2348 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7021486304909196e-05, + "loss": 0.486, + "step": 2350 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.2282, + "step": 2352 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.7041360182818583e-05, + "loss": 1.0106, + "step": 2354 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.705127650357662e-05, + "loss": 0.6699, + "step": 2356 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7061179053038887e-05, + "loss": 0.092, + "step": 2358 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.4504, + "step": 2360 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.708094276074343e-05, + "loss": 0.176, + "step": 2362 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.0602, + "step": 2364 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7100651151536525e-05, + "loss": 0.1543, + "step": 2366 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.1851, + "step": 2368 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.712030407145457e-05, + "loss": 0.2849, + "step": 2370 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.713010968184029e-05, + "loss": 0.129, + "step": 2372 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7139901366967332e-05, + "loss": 0.2826, + "step": 2374 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.1602, + "step": 2376 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.71594428849791e-05, + "loss": 0.5596, + "step": 2378 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.716919267969883e-05, + "loss": 0.0886, + "step": 2380 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.717892847282994e-05, + "loss": 0.4925, + "step": 2382 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.4966, + "step": 2384 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7198357978296817e-05, + "loss": 0.4932, + "step": 2386 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7208051652686338e-05, + "loss": 0.0158, + "step": 2388 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.721773124959481e-05, + "loss": 0.7964, + "step": 2390 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.5353, + "step": 2392 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.723704813537834e-05, + "loss": 0.5635, + "step": 2394 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.2196, + "step": 2396 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.725630848474229e-05, + "loss": 0.3388, + "step": 2398 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.2197, + "step": 2400 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.727551214722321e-05, + "loss": 0.4927, + "step": 2402 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.7285092673983753e-05, + "loss": 0.1078, + "step": 2404 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.7294658972800488e-05, + "loss": 0.2471, + "step": 2406 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.177, + "step": 2408 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7313748811897558e-05, + "loss": 0.0755, + "step": 2410 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.0767, + "step": 2412 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.7332781515383003e-05, + "loss": 0.1511, + "step": 2414 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.0369, + "step": 2416 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.7351756934571758e-05, + "loss": 0.3943, + "step": 2418 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.736122311621314e-05, + "loss": 0.2949, + "step": 2420 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.7370674921226296e-05, + "loss": 0.8281, + "step": 2422 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.271, + "step": 2424 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7389535327557733e-05, + "loss": 0.2197, + "step": 2426 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.1042, + "step": 2428 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7408338006227005e-05, + "loss": 0.3099, + "step": 2430 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.0501, + "step": 2432 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.7427082810346018e-05, + "loss": 0.2199, + "step": 2434 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.743643346367026e-05, + "loss": 0.266, + "step": 2436 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.744576959347884e-05, + "loss": 0.3448, + "step": 2438 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.1843, + "step": 2440 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.746439820964275e-05, + "loss": 0.0577, + "step": 2442 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.2669, + "step": 2444 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.748296851330945e-05, + "loss": 0.1144, + "step": 2446 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.1636, + "step": 2448 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7501480359406217e-05, + "loss": 0.2328, + "step": 2450 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7510714315655474e-05, + "loss": 0.0822, + "step": 2452 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.7519933603316955e-05, + "loss": 0.0801, + "step": 2454 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.427, + "step": 2456 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7538328100883397e-05, + "loss": 0.1938, + "step": 2458 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.4694, + "step": 2460 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.7556663708406193e-05, + "loss": 0.2724, + "step": 2462 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.038, + "step": 2464 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.7574940282646085e-05, + "loss": 0.5558, + "step": 2466 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.758405638764873e-05, + "loss": 0.2502, + "step": 2468 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7593157680824946e-05, + "loss": 0.438, + "step": 2470 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.2478, + "step": 2472 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.761131576062694e-05, + "loss": 0.6608, + "step": 2474 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.3307, + "step": 2476 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7629414380199662e-05, + "loss": 0.0244, + "step": 2478 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.4588, + "step": 2480 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7647453398155194e-05, + "loss": 0.3747, + "step": 2482 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7656450512470077e-05, + "loss": 0.0267, + "step": 2484 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7665432673571218e-05, + "loss": 0.9057, + "step": 2486 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.8022, + "step": 2488 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.768335206599217e-05, + "loss": 1.1991, + "step": 2490 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.7692289262315e-05, + "loss": 0.4233, + "step": 2492 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.7701211435430256e-05, + "loss": 0.091, + "step": 2494 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.4274, + "step": 2496 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.771901064236659e-05, + "loss": 0.134, + "step": 2498 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.7727887641425448e-05, + "loss": 0.196, + "step": 2500 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.773674954775232e-05, + "loss": 0.1765, + "step": 2502 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.1173, + "step": 2504 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7754428013009637e-05, + "loss": 0.3651, + "step": 2506 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.1023, + "step": 2508 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.77720459000329e-05, + "loss": 0.2911, + "step": 2510 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.2428, + "step": 2512 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7789603071189712e-05, + "loss": 0.1146, + "step": 2514 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7798358845437754e-05, + "loss": 0.0726, + "step": 2516 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.780709938932202e-05, + "loss": 0.1591, + "step": 2518 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.1033, + "step": 2520 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7824534717747115e-05, + "loss": 0.2721, + "step": 2522 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.169, + "step": 2524 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7841908920258767e-05, + "loss": 0.333, + "step": 2526 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.4545, + "step": 2528 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.7859221861128284e-05, + "loss": 0.3054, + "step": 2530 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.786785531616285e-05, + "loss": 0.3003, + "step": 2532 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7876473405105528e-05, + "loss": 0.1912, + "step": 2534 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.1411, + "step": 2536 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.7893663417419995e-05, + "loss": 0.3095, + "step": 2538 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.790223530721933e-05, + "loss": 0.0393, + "step": 2540 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791079176378191e-05, + "loss": 0.7548, + "step": 2542 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.4713, + "step": 2544 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7927858310383202e-05, + "loss": 0.2709, + "step": 2546 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7936368367090577e-05, + "loss": 0.2234, + "step": 2548 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.794486292389858e-05, + "loss": 0.0413, + "step": 2550 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.278, + "step": 2552 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7961805471486618e-05, + "loss": 0.2372, + "step": 2554 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.196, + "step": 2556 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.797868582079072e-05, + "loss": 0.0696, + "step": 2558 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.0217, + "step": 2560 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.7995503839940197e-05, + "loss": 0.4133, + "step": 2562 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.800388943463047e-05, + "loss": 0.1076, + "step": 2564 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8012259397551283e-05, + "loss": 0.1056, + "step": 2566 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.0415, + "step": 2568 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.8028952362728197e-05, + "loss": 0.0895, + "step": 2570 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.803727533238257e-05, + "loss": 0.1284, + "step": 2572 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.804558260506409e-05, + "loss": 0.1365, + "step": 2574 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.138, + "step": 2576 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8062149994642135e-05, + "loss": 0.2839, + "step": 2578 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8070410079182195e-05, + "loss": 0.3699, + "step": 2580 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8078654402036526e-05, + "loss": 0.0625, + "step": 2582 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.0338, + "step": 2584 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8095095698313452e-05, + "loss": 0.2025, + "step": 2586 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.055, + "step": 2588 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811147375503214e-05, + "loss": 0.0417, + "step": 2590 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.1765, + "step": 2592 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.812778844424587e-05, + "loss": 0.445, + "step": 2594 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.813592198619035e-05, + "loss": 0.5657, + "step": 2596 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.814403963850293e-05, + "loss": 0.762, + "step": 2598 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.3167, + "step": 2600 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.8160227210847636e-05, + "loss": 0.1979, + "step": 2602 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.816829709926509e-05, + "loss": 0.0931, + "step": 2604 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8176351034821345e-05, + "loss": 0.165, + "step": 2606 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.1626, + "step": 2608 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.819241098446341e-05, + "loss": 0.4377, + "step": 2610 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.8200416967183785e-05, + "loss": 0.0409, + "step": 2612 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.8208406934312167e-05, + "loss": 0.2079, + "step": 2614 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.2513, + "step": 2616 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8224338759405917e-05, + "loss": 0.2965, + "step": 2618 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.0866, + "step": 2620 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8240206335283947e-05, + "loss": 0.0766, + "step": 2622 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.266, + "step": 2624 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.825600953798743e-05, + "loss": 0.6517, + "step": 2626 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.8263886960799055e-05, + "loss": 0.5598, + "step": 2628 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8271748244060426e-05, + "loss": 0.054, + "step": 2630 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.475, + "step": 2632 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.8287422330550878e-05, + "loss": 0.3155, + "step": 2634 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.829523510316813e-05, + "loss": 0.1332, + "step": 2636 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8303031675011515e-05, + "loss": 0.1668, + "step": 2638 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.2638, + "step": 2640 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.8318576155500838e-05, + "loss": 0.0757, + "step": 2642 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.832632403378808e-05, + "loss": 0.2376, + "step": 2644 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.8334055650584094e-05, + "loss": 0.1537, + "step": 2646 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.4716, + "step": 2648 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8349470039334173e-05, + "loss": 0.4034, + "step": 2650 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.1113, + "step": 2652 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.83648192013326e-05, + "loss": 0.1771, + "step": 2654 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.828, + "step": 2656 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8380103016670437e-05, + "loss": 0.0862, + "step": 2658 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8387720382009665e-05, + "loss": 0.5145, + "step": 2660 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.839532136594927e-05, + "loss": 0.5603, + "step": 2662 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.5613, + "step": 2664 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8410474130282085e-05, + "loss": 0.5969, + "step": 2666 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.2315, + "step": 2668 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8425561191294217e-05, + "loss": 0.4169, + "step": 2670 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.0715, + "step": 2672 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.8440582431124325e-05, + "loss": 0.2696, + "step": 2674 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.844806833140501e-05, + "loss": 0.2104, + "step": 2676 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8455537732425223e-05, + "loss": 0.1562, + "step": 2678 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.1901, + "step": 2680 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.847042697836485e-05, + "loss": 0.3962, + "step": 2682 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.1166, + "step": 2684 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.84852500526272e-05, + "loss": 0.1645, + "step": 2686 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.5977, + "step": 2688 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.8500006839413183e-05, + "loss": 0.1766, + "step": 2690 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.85073603389569e-05, + "loss": 0.0523, + "step": 2692 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.851469722344155e-05, + "loss": 0.1875, + "step": 2694 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.0452, + "step": 2696 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8529321089949817e-05, + "loss": 0.7137, + "step": 2698 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.8239, + "step": 2700 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.8543878324695122e-05, + "loss": 0.0438, + "step": 2702 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.1101, + "step": 2704 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.8558368813955143e-05, + "loss": 0.2014, + "step": 2706 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.856558899363248e-05, + "loss": 0.2017, + "step": 2708 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.857279244452896e-05, + "loss": 0.198, + "step": 2710 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.227, + "step": 2712 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.8587149103738e-05, + "loss": 0.1225, + "step": 2714 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.85943022840117e-05, + "loss": 0.2396, + "step": 2716 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8601438679426847e-05, + "loss": 0.5975, + "step": 2718 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.2234, + "step": 2720 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.8615661059964134e-05, + "loss": 0.1239, + "step": 2722 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.862274701730967e-05, + "loss": 0.096, + "step": 2724 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.862981613424347e-05, + "loss": 0.5037, + "step": 2726 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.3029, + "step": 2728 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.864390379168423e-05, + "loss": 0.523, + "step": 2730 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.865092230467769e-05, + "loss": 0.1358, + "step": 2732 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.8657923922232464e-05, + "loss": 0.103, + "step": 2734 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.1104, + "step": 2736 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8671876416361763e-05, + "loss": 0.3209, + "step": 2738 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.0796, + "step": 2740 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8685761165074073e-05, + "loss": 0.5779, + "step": 2742 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.1091, + "step": 2744 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.869957805990059e-05, + "loss": 0.1336, + "step": 2746 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.87064610283551e-05, + "loss": 0.2532, + "step": 2748 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.87133269929026e-05, + "loss": 0.087, + "step": 2750 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.3213, + "step": 2752 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.8727007856672285e-05, + "loss": 0.0195, + "step": 2754 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.873382272917545e-05, + "loss": 0.2782, + "step": 2756 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8740620544333607e-05, + "loss": 0.8599, + "step": 2758 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.702, + "step": 2760 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.875416494954312e-05, + "loss": 0.1321, + "step": 2762 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.876091151314196e-05, + "loss": 0.1736, + "step": 2764 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.8767640966490813e-05, + "loss": 0.1385, + "step": 2766 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.3638, + "step": 2768 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.878104848990093e-05, + "loss": 0.2943, + "step": 2770 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.8787726533776996e-05, + "loss": 0.1112, + "step": 2772 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.879438741503277e-05, + "loss": 0.3989, + "step": 2774 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.1905, + "step": 2776 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8807657637681563e-05, + "loss": 0.4695, + "step": 2778 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.165, + "step": 2780 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8820859054179225e-05, + "loss": 0.0812, + "step": 2782 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.2156, + "step": 2784 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.883399156139519e-05, + "loss": 0.5123, + "step": 2786 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.8840531941941415e-05, + "loss": 0.0374, + "step": 2788 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8847055056737233e-05, + "loss": 0.0857, + "step": 2790 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.1639, + "step": 2792 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8860049438152244e-05, + "loss": 0.3234, + "step": 2794 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.1203, + "step": 2796 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8872974604127025e-05, + "loss": 0.2188, + "step": 2798 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.1078, + "step": 2800 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.8885830453689132e-05, + "loss": 0.7967, + "step": 2802 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.889223235340958e-05, + "loss": 0.2765, + "step": 2804 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.889861688640759e-05, + "loss": 0.4266, + "step": 2806 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.256, + "step": 2808 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.891133380239373e-05, + "loss": 0.154, + "step": 2810 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.2148, + "step": 2812 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.892398110230194e-05, + "loss": 0.2613, + "step": 2814 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.2616, + "step": 2816 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.8936558687330485e-05, + "loss": 0.1438, + "step": 2818 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.894282130603823e-05, + "loss": 0.207, + "step": 2820 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8949066459222217e-05, + "loss": 0.2287, + "step": 2822 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.0849, + "step": 2824 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.8961504320265382e-05, + "loss": 0.3227, + "step": 2826 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.896769700383315e-05, + "loss": 0.1222, + "step": 2828 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.897387217329439e-05, + "loss": 0.6299, + "step": 2830 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.1318, + "step": 2832 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.8986169921690543e-05, + "loss": 0.7736, + "step": 2834 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.899229247660769e-05, + "loss": 0.1114, + "step": 2836 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.899839746938281e-05, + "loss": 0.6883, + "step": 2838 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.5893, + "step": 2840 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.9010554720848577e-05, + "loss": 0.0496, + "step": 2842 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.901660695579585e-05, + "loss": 0.1606, + "step": 2844 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9022641581114392e-05, + "loss": 0.2904, + "step": 2846 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.4134, + "step": 2848 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9034657955756695e-05, + "loss": 0.7613, + "step": 2850 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9040639681612212e-05, + "loss": 0.2263, + "step": 2852 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.904660375090257e-05, + "loss": 0.3098, + "step": 2854 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.3493, + "step": 2856 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.905847887323049e-05, + "loss": 0.1828, + "step": 2858 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.7169, + "step": 2860 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9070283229971007e-05, + "loss": 0.2434, + "step": 2862 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.165, + "step": 2864 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9082016728907496e-05, + "loss": 0.4803, + "step": 2866 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9087856878032886e-05, + "loss": 0.1682, + "step": 2868 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909367927837691e-05, + "loss": 0.3861, + "step": 2870 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.0578, + "step": 2872 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.910527078727044e-05, + "loss": 0.1889, + "step": 2874 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.911103987318148e-05, + "loss": 0.083, + "step": 2876 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.911679116503425e-05, + "loss": 0.2106, + "step": 2878 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.2265, + "step": 2880 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.9128240321670208e-05, + "loss": 0.2223, + "step": 2882 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.913393816409294e-05, + "loss": 0.2516, + "step": 2884 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.913961816773655e-05, + "loss": 0.1994, + "step": 2886 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.3141, + "step": 2888 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9150924614348588e-05, + "loss": 0.4519, + "step": 2890 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.1089, + "step": 2892 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.916215957317944e-05, + "loss": 0.6129, + "step": 2894 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.0712, + "step": 2896 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9173322956460675e-05, + "loss": 0.2041, + "step": 2898 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9178877779995423e-05, + "loss": 0.0614, + "step": 2900 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9184414676983006e-05, + "loss": 0.1856, + "step": 2902 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.4771, + "step": 2904 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9195434648097003e-05, + "loss": 0.3892, + "step": 2906 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.1307, + "step": 2908 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9206382783713738e-05, + "loss": 0.2222, + "step": 2910 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.205, + "step": 2912 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.921725899830547e-05, + "loss": 0.1249, + "step": 2914 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.9222670108643146e-05, + "loss": 0.2107, + "step": 2916 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.92280632069063e-05, + "loss": 0.2044, + "step": 2918 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.3389, + "step": 2920 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.2531, + "step": 2922 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.924413432409622e-05, + "loss": 0.088, + "step": 2924 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.924945526908497e-05, + "loss": 0.3052, + "step": 2926 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.1701, + "step": 2928 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.9260042955546237e-05, + "loss": 0.1572, + "step": 2930 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.926530967634078e-05, + "loss": 0.1152, + "step": 2932 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9270558301784795e-05, + "loss": 0.2622, + "step": 2934 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.0997, + "step": 2936 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9281001225653887e-05, + "loss": 0.0684, + "step": 2938 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.5234, + "step": 2940 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9291371645572517e-05, + "loss": 0.37, + "step": 2942 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.0704, + "step": 2944 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9301669480526115e-05, + "loss": 1.3807, + "step": 2946 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9306791153479004e-05, + "loss": 0.7164, + "step": 2948 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.931189465006714e-05, + "loss": 0.2306, + "step": 2950 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.2397, + "step": 2952 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.9322047074315717e-05, + "loss": 0.3193, + "step": 2954 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.932709598214825e-05, + "loss": 0.1363, + "step": 2956 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9332126673960262e-05, + "loss": 0.1721, + "step": 2958 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.1248, + "step": 2960 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.934213337025812e-05, + "loss": 0.0625, + "step": 2962 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.9347109355200672e-05, + "loss": 0.2721, + "step": 2964 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.9352067085036145e-05, + "loss": 0.1558, + "step": 2966 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.084, + "step": 2968 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9361927740691327e-05, + "loss": 0.4458, + "step": 2970 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.1429, + "step": 2972 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.937171526019142e-05, + "loss": 0.3706, + "step": 2974 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.2047, + "step": 2976 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9381429567075504e-05, + "loss": 0.5028, + "step": 2978 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9386259242048883e-05, + "loss": 0.1852, + "step": 2980 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.93910705854546e-05, + "loss": 0.6163, + "step": 2982 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.1197, + "step": 2984 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.9400638240012294e-05, + "loss": 0.0999, + "step": 2986 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.940539453247842e-05, + "loss": 0.1723, + "step": 2988 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9410132456005262e-05, + "loss": 0.5901, + "step": 2990 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.2908, + "step": 2992 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9419553159263896e-05, + "loss": 0.297, + "step": 2994 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9424235920596863e-05, + "loss": 0.1112, + "step": 2996 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.94289002761929e-05, + "loss": 0.4471, + "step": 2998 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.0835, + "step": 3000 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.943817373377181e-05, + "loss": 0.282, + "step": 3002 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.944278281764342e-05, + "loss": 0.3279, + "step": 3004 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.944737345955561e-05, + "loss": 0.1833, + "step": 3006 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.0792, + "step": 3008 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.945649938167528e-05, + "loss": 0.0202, + "step": 3010 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.1952, + "step": 3012 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.946555142883836e-05, + "loss": 0.1349, + "step": 3014 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.1063, + "step": 3016 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9474529530329507e-05, + "loss": 0.1955, + "step": 3018 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.1261, + "step": 3020 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.9483433616011047e-05, + "loss": 0.1224, + "step": 3022 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.1731, + "step": 3024 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9492263616323533e-05, + "loss": 0.0562, + "step": 3026 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9496650812887286e-05, + "loss": 0.3717, + "step": 3028 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9501019462286263e-05, + "loss": 0.2447, + "step": 3030 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.1077, + "step": 3032 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.9509701085497842e-05, + "loss": 0.0735, + "step": 3034 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.951401404235505e-05, + "loss": 0.3788, + "step": 3036 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.9518308418136718e-05, + "loss": 1.1337, + "step": 3038 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.1586, + "step": 3040 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.952684139296169e-05, + "loss": 0.1764, + "step": 3042 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.9531079975339912e-05, + "loss": 0.1253, + "step": 3044 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.9535299943312455e-05, + "loss": 0.1202, + "step": 3046 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.807, + "step": 3048 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9543684003110105e-05, + "loss": 0.0819, + "step": 3050 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.0416, + "step": 3052 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9551993506857688e-05, + "loss": 0.1725, + "step": 3054 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.0796, + "step": 3056 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.9560228389640664e-05, + "loss": 0.2081, + "step": 3058 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.956431782804402e-05, + "loss": 0.7223, + "step": 3060 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.956838858712744e-05, + "loss": 0.4442, + "step": 3062 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.2895, + "step": 3064 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9576474035569892e-05, + "loss": 0.1384, + "step": 3066 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.2912, + "step": 3068 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9584484671803818e-05, + "loss": 0.2768, + "step": 3070 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.0136, + "step": 3072 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9592420433249462e-05, + "loss": 0.4639, + "step": 3074 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9596360216530436e-05, + "loss": 0.1668, + "step": 3076 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9600281257912e-05, + "loss": 0.53, + "step": 3078 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.4044, + "step": 3080 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.960806708438202e-05, + "loss": 0.1147, + "step": 3082 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.961193185426459e-05, + "loss": 0.4281, + "step": 3084 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9615777851836003e-05, + "loss": 0.1689, + "step": 3086 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.282, + "step": 3088 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.962341350003679e-05, + "loss": 0.1738, + "step": 3090 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.9627203135753573e-05, + "loss": 0.6132, + "step": 3092 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9630973969334068e-05, + "loss": 0.4272, + "step": 3094 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.3025, + "step": 3096 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9638459200664822e-05, + "loss": 0.1092, + "step": 3098 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.2163, + "step": 3100 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.9645869135553806e-05, + "loss": 0.4154, + "step": 3102 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.3096, + "step": 3104 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.965320371611399e-05, + "loss": 0.0933, + "step": 3106 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.0512, + "step": 3108 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.9660462885047032e-05, + "loss": 0.2394, + "step": 3110 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.349, + "step": 3112 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.9667646585643703e-05, + "loss": 0.1493, + "step": 3114 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.967121011775546e-05, + "loss": 0.5136, + "step": 3116 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967475476178433e-05, + "loss": 0.958, + "step": 3118 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.1342, + "step": 3120 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9681787357939254e-05, + "loss": 0.2646, + "step": 3122 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9685275296330497e-05, + "loss": 0.3354, + "step": 3124 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.968874431916926e-05, + "loss": 0.5902, + "step": 3126 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.112, + "step": 3128 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969562559112598e-05, + "loss": 0.1588, + "step": 3130 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969903782680467e-05, + "loss": 0.266, + "step": 3132 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.970243112005235e-05, + "loss": 0.2755, + "step": 3134 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.2974, + "step": 3136 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.970916085278302e-05, + "loss": 0.1084, + "step": 3138 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.1599, + "step": 3140 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.9715814736744755e-05, + "loss": 0.0884, + "step": 3142 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.0617, + "step": 3144 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9722392719956864e-05, + "loss": 0.313, + "step": 3146 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.1809, + "step": 3148 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9728894751031595e-05, + "loss": 0.593, + "step": 3150 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.3232, + "step": 3152 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9735320779174545e-05, + "loss": 0.2127, + "step": 3154 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9738505276435692e-05, + "loss": 0.1953, + "step": 3156 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.974167075418505e-05, + "loss": 0.1736, + "step": 3158 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.05, + "step": 3160 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9747944626456577e-05, + "loss": 0.3003, + "step": 3162 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.0649, + "step": 3164 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.975414234697712e-05, + "loss": 0.102, + "step": 3166 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.0587, + "step": 3168 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9760263867329568e-05, + "loss": 0.4973, + "step": 3170 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9763296037475174e-05, + "loss": 0.7157, + "step": 3172 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.97663091396921e-05, + "loss": 0.2039, + "step": 3174 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.1205, + "step": 3176 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9772278116838543e-05, + "loss": 0.389, + "step": 3178 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.1719, + "step": 3180 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.977817075213876e-05, + "loss": 0.1035, + "step": 3182 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.1396, + "step": 3184 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9783986999558994e-05, + "loss": 0.1691, + "step": 3186 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9786866463591732e-05, + "loss": 0.323, + "step": 3188 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9789726813662233e-05, + "loss": 0.2694, + "step": 3190 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.4369, + "step": 3192 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.979539014960858e-05, + "loss": 0.1019, + "step": 3194 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.1458, + "step": 3196 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9800976963155584e-05, + "loss": 0.8464, + "step": 3198 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.065, + "step": 3200 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.980648721065859e-05, + "loss": 0.642, + "step": 3202 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.9809213608668185e-05, + "loss": 0.0868, + "step": 3204 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9811920849071092e-05, + "loss": 0.1503, + "step": 3206 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.069, + "step": 3208 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9817277835945057e-05, + "loss": 0.2533, + "step": 3210 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.371, + "step": 3212 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9822558129431263e-05, + "loss": 0.1831, + "step": 3214 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 1.2335, + "step": 3216 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.9827761688279606e-05, + "loss": 0.2566, + "step": 3218 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.983033467948784e-05, + "loss": 0.1205, + "step": 3220 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.983288847183947e-05, + "loss": 0.7638, + "step": 3222 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.135, + "step": 3224 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9837938440059992e-05, + "loss": 0.1306, + "step": 3226 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.1835, + "step": 3228 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9842911553490392e-05, + "loss": 0.1213, + "step": 3230 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.3228, + "step": 3232 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.984780777328031e-05, + "loss": 0.1642, + "step": 3234 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.3724, + "step": 3236 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985262706118007e-05, + "loss": 0.185, + "step": 3238 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.2764, + "step": 3240 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.9857369379540982e-05, + "loss": 0.1753, + "step": 3242 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.985971166354357e-05, + "loss": 0.0387, + "step": 3244 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.986203469131567e-05, + "loss": 0.1374, + "step": 3246 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.2203, + "step": 3248 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986662296005834e-05, + "loss": 0.9006, + "step": 3250 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986888819206792e-05, + "loss": 0.1561, + "step": 3252 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.987113414992505e-05, + "loss": 0.4232, + "step": 3254 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.2862, + "step": 3256 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9875568225674e-05, + "loss": 0.1382, + "step": 3258 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.1459, + "step": 3260 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9879925152665845e-05, + "loss": 0.3614, + "step": 3262 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.6575, + "step": 3264 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.9884204896863895e-05, + "loss": 0.0836, + "step": 3266 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.988631581494365e-05, + "loss": 0.0697, + "step": 3268 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.9888407424834433e-05, + "loss": 0.4591, + "step": 3270 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.352, + "step": 3272 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.989253270374697e-05, + "loss": 0.1953, + "step": 3274 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0285, + "step": 3276 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.9896580701374482e-05, + "loss": 0.0742, + "step": 3278 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.0813, + "step": 3280 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.9900551386093677e-05, + "loss": 0.2594, + "step": 3282 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.990250772639552e-05, + "loss": 0.0955, + "step": 3284 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9904444726885236e-05, + "loss": 0.5752, + "step": 3286 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.2534, + "step": 3288 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.990826069333406e-05, + "loss": 0.4589, + "step": 3290 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.99101396518405e-05, + "loss": 0.2807, + "step": 3292 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.99119992556295e-05, + "loss": 0.6914, + "step": 3294 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.2982, + "step": 3296 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.99156603845656e-05, + "loss": 0.6058, + "step": 3298 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.8572, + "step": 3300 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9919244051541315e-05, + "loss": 0.5944, + "step": 3302 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.3357, + "step": 3304 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9922750228560746e-05, + "loss": 0.2696, + "step": 3306 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.5087, + "step": 3308 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9926178888233344e-05, + "loss": 0.8608, + "step": 3310 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 1.0463, + "step": 3312 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9929530003774133e-05, + "loss": 0.4897, + "step": 3314 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.1121, + "step": 3316 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.993280354900393e-05, + "loss": 0.1188, + "step": 3318 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.254, + "step": 3320 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9935999498349518e-05, + "loss": 0.2429, + "step": 3322 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.2629, + "step": 3324 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9939117826843883e-05, + "loss": 0.5847, + "step": 3326 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.2966, + "step": 3328 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9942158510126384e-05, + "loss": 0.2967, + "step": 3330 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9943649727366335e-05, + "loss": 0.3056, + "step": 3332 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.9945121524442944e-05, + "loss": 0.1313, + "step": 3334 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.1563, + "step": 3336 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.9948006846646262e-05, + "loss": 0.2795, + "step": 3338 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.994942036613787e-05, + "loss": 0.1359, + "step": 3340 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9950814454195953e-05, + "loss": 0.4768, + "step": 3342 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.1164, + "step": 3344 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.9953544325158755e-05, + "loss": 0.2071, + "step": 3346 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.995488010273198e-05, + "loss": 0.8203, + "step": 3348 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9956196438208693e-05, + "loss": 0.7193, + "step": 3350 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.0822, + "step": 3352 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9958770772627236e-05, + "loss": 0.1685, + "step": 3354 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.0433, + "step": 3356 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.9961267308303473e-05, + "loss": 0.5088, + "step": 3358 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.2155, + "step": 3360 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9963686025734262e-05, + "loss": 0.2014, + "step": 3362 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9964866196679105e-05, + "loss": 0.3261, + "step": 3364 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9966026906024377e-05, + "loss": 0.4743, + "step": 3366 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.0305, + "step": 3368 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9968289930886675e-05, + "loss": 0.2961, + "step": 3370 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.2373, + "step": 3372 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997047508264221e-05, + "loss": 0.5667, + "step": 3374 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.1772, + "step": 3376 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.99725823442204e-05, + "loss": 0.3661, + "step": 3378 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.2649, + "step": 3380 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.9974611699159142e-05, + "loss": 0.2479, + "step": 3382 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0278, + "step": 3384 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9976563131604945e-05, + "loss": 0.4502, + "step": 3386 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.1313, + "step": 3388 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9978436626313065e-05, + "loss": 0.2147, + "step": 3390 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.6919, + "step": 3392 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.99802321686476e-05, + "loss": 0.1723, + "step": 3394 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.1237, + "step": 3396 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.9981949744581622e-05, + "loss": 0.2997, + "step": 3398 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.1079, + "step": 3400 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.9983589340697288e-05, + "loss": 0.4721, + "step": 3402 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.998437989229673e-05, + "loss": 0.0397, + "step": 3404 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.998515094418594e-05, + "loss": 0.1445, + "step": 3406 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.2697, + "step": 3408 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.99866345428482e-05, + "loss": 0.0272, + "step": 3410 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.998734708672375e-05, + "loss": 0.4402, + "step": 3412 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.998804012509407e-05, + "loss": 0.5754, + "step": 3414 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0805, + "step": 3416 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9989367679943025e-05, + "loss": 0.162, + "step": 3418 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.0903, + "step": 3420 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9990617197024103e-05, + "loss": 0.5562, + "step": 3422 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.2205, + "step": 3424 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.999178866657597e-05, + "loss": 0.4582, + "step": 3426 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.9992345130644747e-05, + "loss": 0.0294, + "step": 3428 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999288207944701e-05, + "loss": 0.2484, + "step": 3430 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.0363, + "step": 3432 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.999389742709538e-05, + "loss": 0.1131, + "step": 3434 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.2087, + "step": 3436 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9994834701589113e-05, + "loss": 0.0308, + "step": 3438 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.8039, + "step": 3440 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999569389560614e-05, + "loss": 0.4411, + "step": 3442 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999609421031453e-05, + "loss": 0.6082, + "step": 3444 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.9996475002434365e-05, + "loss": 0.1998, + "step": 3446 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.5496, + "step": 3448 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.999717801597172e-05, + "loss": 0.2958, + "step": 3450 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.2252, + "step": 3452 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9997802930726195e-05, + "loss": 0.2824, + "step": 3454 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.206, + "step": 3456 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998349741815916e-05, + "loss": 0.1651, + "step": 3458 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.0816, + "step": 3460 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.999881844496914e-05, + "loss": 0.2051, + "step": 3462 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.3326, + "step": 3464 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999209036524326e-05, + "loss": 0.5267, + "step": 3466 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.2015, + "step": 3468 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999952151343014e-05, + "loss": 0.3913, + "step": 3470 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.1681, + "step": 3472 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999755873245484e-05, + "loss": 0.2226, + "step": 3474 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.159, + "step": 3476 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999991211413952e-05, + "loss": 0.3178, + "step": 3478 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.4337, + "step": 3480 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 1.9999990234891677e-05, + "loss": 0.1383, + "step": 3482 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 2e-05, + "loss": 0.0695, + "step": 3484 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999999023489168e-05, + "loss": 0.2032, + "step": 3486 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.1228, + "step": 3488 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.999991211413952e-05, + "loss": 0.1037, + "step": 3490 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.0917, + "step": 3492 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.9999755873245484e-05, + "loss": 0.9843, + "step": 3494 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.5761, + "step": 3496 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.999952151343014e-05, + "loss": 0.2473, + "step": 3498 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.1406, + "step": 3500 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.9999209036524326e-05, + "loss": 0.2723, + "step": 3502 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.1169, + "step": 3504 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.999881844496914e-05, + "loss": 0.465, + "step": 3506 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.1355, + "step": 3508 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998349741815916e-05, + "loss": 0.0734, + "step": 3510 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.2099, + "step": 3512 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997802930726195e-05, + "loss": 0.5843, + "step": 3514 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.0194, + "step": 3516 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999717801597172e-05, + "loss": 0.0917, + "step": 3518 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.4022, + "step": 3520 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.9996475002434365e-05, + "loss": 0.5186, + "step": 3522 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.999609421031453e-05, + "loss": 0.5172, + "step": 3524 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999569389560614e-05, + "loss": 0.1705, + "step": 3526 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.548, + "step": 3528 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994834701589113e-05, + "loss": 0.0821, + "step": 3530 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.0684, + "step": 3532 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.9993897427095378e-05, + "loss": 0.7767, + "step": 3534 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.3145, + "step": 3536 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999288207944701e-05, + "loss": 0.2592, + "step": 3538 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999234513064475e-05, + "loss": 0.4402, + "step": 3540 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.999178866657597e-05, + "loss": 0.4165, + "step": 3542 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.419, + "step": 3544 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990617197024103e-05, + "loss": 0.2623, + "step": 3546 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.2105, + "step": 3548 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.998936767994303e-05, + "loss": 0.1584, + "step": 3550 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.1488, + "step": 3552 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998804012509407e-05, + "loss": 0.1286, + "step": 3554 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998734708672375e-05, + "loss": 0.1725, + "step": 3556 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.99866345428482e-05, + "loss": 0.2787, + "step": 3558 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.1562, + "step": 3560 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.998515094418594e-05, + "loss": 0.3827, + "step": 3562 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.1295, + "step": 3564 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9983589340697288e-05, + "loss": 0.2312, + "step": 3566 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.4087, + "step": 3568 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981949744581622e-05, + "loss": 0.0306, + "step": 3570 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.0705, + "step": 3572 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.99802321686476e-05, + "loss": 0.1914, + "step": 3574 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.3778, + "step": 3576 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9978436626313068e-05, + "loss": 0.1545, + "step": 3578 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.1262, + "step": 3580 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997656313160495e-05, + "loss": 0.0612, + "step": 3582 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.1499, + "step": 3584 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9974611699159142e-05, + "loss": 0.2527, + "step": 3586 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.3957, + "step": 3588 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.99725823442204e-05, + "loss": 0.1842, + "step": 3590 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.1609, + "step": 3592 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9970475082642212e-05, + "loss": 0.2379, + "step": 3594 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.1071, + "step": 3596 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9968289930886675e-05, + "loss": 0.4719, + "step": 3598 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.0682, + "step": 3600 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.9966026906024377e-05, + "loss": 0.3471, + "step": 3602 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.996486619667911e-05, + "loss": 0.075, + "step": 3604 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9963686025734262e-05, + "loss": 0.2529, + "step": 3606 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.1759, + "step": 3608 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9961267308303473e-05, + "loss": 0.372, + "step": 3610 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.4715, + "step": 3612 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9958770772627236e-05, + "loss": 0.1702, + "step": 3614 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.3567, + "step": 3616 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.9956196438208693e-05, + "loss": 0.1767, + "step": 3618 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.995488010273198e-05, + "loss": 0.0979, + "step": 3620 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9953544325158755e-05, + "loss": 0.0873, + "step": 3622 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.1575, + "step": 3624 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9950814454195953e-05, + "loss": 0.423, + "step": 3626 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.2148, + "step": 3628 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.9948006846646262e-05, + "loss": 0.2293, + "step": 3630 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.1088, + "step": 3632 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.9945121524442947e-05, + "loss": 0.5542, + "step": 3634 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.994364972736634e-05, + "loss": 0.0738, + "step": 3636 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9942158510126384e-05, + "loss": 1.0139, + "step": 3638 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.6827, + "step": 3640 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.9939117826843887e-05, + "loss": 0.0637, + "step": 3642 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.993756836673986e-05, + "loss": 0.195, + "step": 3644 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.9935999498349525e-05, + "loss": 0.3844, + "step": 3646 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.4385, + "step": 3648 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9932803549003932e-05, + "loss": 0.1484, + "step": 3650 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.2821, + "step": 3652 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9929530003774136e-05, + "loss": 0.1425, + "step": 3654 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.1791, + "step": 3656 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9926178888233344e-05, + "loss": 0.165, + "step": 3658 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.1448, + "step": 3660 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9922750228560746e-05, + "loss": 0.3977, + "step": 3662 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.3334, + "step": 3664 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9919244051541315e-05, + "loss": 0.2298, + "step": 3666 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.1918, + "step": 3668 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9915660384565603e-05, + "loss": 0.1165, + "step": 3670 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.1688, + "step": 3672 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9911999255629504e-05, + "loss": 0.1613, + "step": 3674 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.0782, + "step": 3676 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.990826069333406e-05, + "loss": 0.0668, + "step": 3678 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.5131, + "step": 3680 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9904444726885236e-05, + "loss": 0.1027, + "step": 3682 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9902507726395524e-05, + "loss": 0.0724, + "step": 3684 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.9900551386093677e-05, + "loss": 0.1958, + "step": 3686 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.3156, + "step": 3688 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9896580701374482e-05, + "loss": 0.0729, + "step": 3690 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0864, + "step": 3692 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9892532703746977e-05, + "loss": 0.1823, + "step": 3694 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.1051, + "step": 3696 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.9888407424834437e-05, + "loss": 0.5638, + "step": 3698 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.988631581494365e-05, + "loss": 0.6701, + "step": 3700 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9884204896863895e-05, + "loss": 1.0381, + "step": 3702 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.1873, + "step": 3704 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.9879925152665845e-05, + "loss": 0.2481, + "step": 3706 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.987775633490599e-05, + "loss": 0.2766, + "step": 3708 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.9875568225674005e-05, + "loss": 0.0439, + "step": 3710 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.4032, + "step": 3712 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.987113414992505e-05, + "loss": 0.5101, + "step": 3714 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.986888819206792e-05, + "loss": 0.1166, + "step": 3716 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986662296005834e-05, + "loss": 0.0635, + "step": 3718 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.1279, + "step": 3720 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9862034691315678e-05, + "loss": 0.0884, + "step": 3722 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.3638, + "step": 3724 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9857369379540985e-05, + "loss": 0.0997, + "step": 3726 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.0408, + "step": 3728 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.985262706118007e-05, + "loss": 0.089, + "step": 3730 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.2894, + "step": 3732 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9847807773280314e-05, + "loss": 0.1601, + "step": 3734 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.1334, + "step": 3736 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9842911553490396e-05, + "loss": 0.3956, + "step": 3738 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.1502, + "step": 3740 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.983793844005999e-05, + "loss": 0.1572, + "step": 3742 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.1198, + "step": 3744 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.9832888471839475e-05, + "loss": 0.3976, + "step": 3746 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.983033467948784e-05, + "loss": 0.5033, + "step": 3748 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9827761688279613e-05, + "loss": 0.3151, + "step": 3750 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.1284, + "step": 3752 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9822558129431263e-05, + "loss": 0.7225, + "step": 3754 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.3715, + "step": 3756 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.981727783594506e-05, + "loss": 0.04, + "step": 3758 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.1206, + "step": 3760 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.9811920849071092e-05, + "loss": 0.4089, + "step": 3762 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.980921360866819e-05, + "loss": 0.2933, + "step": 3764 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.980648721065859e-05, + "loss": 0.3881, + "step": 3766 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.4265, + "step": 3768 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.980097696315558e-05, + "loss": 0.3214, + "step": 3770 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.0948, + "step": 3772 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979539014960858e-05, + "loss": 0.1034, + "step": 3774 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.0632, + "step": 3776 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.9789726813662233e-05, + "loss": 0.4583, + "step": 3778 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.978686646359173e-05, + "loss": 0.0521, + "step": 3780 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9783986999558994e-05, + "loss": 0.0631, + "step": 3782 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.3361, + "step": 3784 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9778170752138763e-05, + "loss": 0.3639, + "step": 3786 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.3058, + "step": 3788 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.9772278116838546e-05, + "loss": 0.3806, + "step": 3790 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.1293, + "step": 3792 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.97663091396921e-05, + "loss": 0.6871, + "step": 3794 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.9763296037475177e-05, + "loss": 0.2709, + "step": 3796 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.976026386732957e-05, + "loss": 0.4163, + "step": 3798 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.216, + "step": 3800 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9754142346977122e-05, + "loss": 0.0475, + "step": 3802 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.0482, + "step": 3804 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9747944626456577e-05, + "loss": 0.0379, + "step": 3806 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.2659, + "step": 3808 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9741670754185054e-05, + "loss": 0.4142, + "step": 3810 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9738505276435695e-05, + "loss": 0.1768, + "step": 3812 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9735320779174548e-05, + "loss": 0.4399, + "step": 3814 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.1909, + "step": 3816 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9728894751031595e-05, + "loss": 0.4264, + "step": 3818 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.1995, + "step": 3820 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.972239271995686e-05, + "loss": 0.3101, + "step": 3822 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.2051, + "step": 3824 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9715814736744758e-05, + "loss": 0.315, + "step": 3826 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.3641, + "step": 3828 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.9709160852783022e-05, + "loss": 0.1296, + "step": 3830 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.0977, + "step": 3832 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.9702431120052352e-05, + "loss": 0.0429, + "step": 3834 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.969903782680467e-05, + "loss": 0.2098, + "step": 3836 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9695625591125984e-05, + "loss": 0.3348, + "step": 3838 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.1006, + "step": 3840 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.968874431916926e-05, + "loss": 0.2537, + "step": 3842 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.96852752963305e-05, + "loss": 0.2702, + "step": 3844 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9681787357939257e-05, + "loss": 0.2561, + "step": 3846 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.0297, + "step": 3848 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9674754761784334e-05, + "loss": 0.0975, + "step": 3850 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.0518, + "step": 3852 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.9667646585643706e-05, + "loss": 0.3801, + "step": 3854 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.184, + "step": 3856 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.966046288504704e-05, + "loss": 0.3041, + "step": 3858 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.0737, + "step": 3860 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.965320371611399e-05, + "loss": 0.2961, + "step": 3862 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.3276, + "step": 3864 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.964586913555381e-05, + "loss": 0.093, + "step": 3866 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.1847, + "step": 3868 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9638459200664822e-05, + "loss": 1.5551, + "step": 3870 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.4971, + "step": 3872 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9630973969334068e-05, + "loss": 0.1452, + "step": 3874 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9627203135753576e-05, + "loss": 0.3211, + "step": 3876 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9623413500036795e-05, + "loss": 0.1499, + "step": 3878 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.0457, + "step": 3880 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.9615777851836007e-05, + "loss": 0.0965, + "step": 3882 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.961193185426459e-05, + "loss": 0.5337, + "step": 3884 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9608067084382025e-05, + "loss": 0.4258, + "step": 3886 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.2759, + "step": 3888 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.9600281257912002e-05, + "loss": 0.3521, + "step": 3890 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.959636021653044e-05, + "loss": 0.2053, + "step": 3892 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.9592420433249465e-05, + "loss": 0.3264, + "step": 3894 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.3302, + "step": 3896 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958448467180382e-05, + "loss": 0.1118, + "step": 3898 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958048870913786e-05, + "loss": 0.3856, + "step": 3900 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9576474035569895e-05, + "loss": 0.0231, + "step": 3902 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.1528, + "step": 3904 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9568388587127448e-05, + "loss": 0.102, + "step": 3906 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9564317828044022e-05, + "loss": 0.4118, + "step": 3908 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.9560228389640668e-05, + "loss": 0.0435, + "step": 3910 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.2863, + "step": 3912 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.955199350685769e-05, + "loss": 0.6138, + "step": 3914 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.1772, + "step": 3916 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.954368400311011e-05, + "loss": 0.13, + "step": 3918 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.0874, + "step": 3920 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9535299943312455e-05, + "loss": 0.333, + "step": 3922 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9531079975339915e-05, + "loss": 0.0964, + "step": 3924 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9526841392961694e-05, + "loss": 0.1868, + "step": 3926 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.2265, + "step": 3928 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9518308418136728e-05, + "loss": 0.3069, + "step": 3930 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.0891, + "step": 3932 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9509701085497852e-05, + "loss": 0.6903, + "step": 3934 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.6734, + "step": 3936 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9501019462286266e-05, + "loss": 0.0502, + "step": 3938 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.236, + "step": 3940 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9492263616323536e-05, + "loss": 0.3278, + "step": 3942 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.4183, + "step": 3944 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.948343361601105e-05, + "loss": 0.0856, + "step": 3946 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.947899082950751e-05, + "loss": 0.2194, + "step": 3948 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947452953032951e-05, + "loss": 0.6366, + "step": 3950 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.4751, + "step": 3952 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9465551428838363e-05, + "loss": 0.2253, + "step": 3954 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.4032, + "step": 3956 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.9456499381675285e-05, + "loss": 0.1949, + "step": 3958 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.091, + "step": 3960 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9447373459555617e-05, + "loss": 0.0897, + "step": 3962 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.1539, + "step": 3964 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9438173733771814e-05, + "loss": 0.1241, + "step": 3966 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.4647, + "step": 3968 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.9428900276192903e-05, + "loss": 0.497, + "step": 3970 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.942423592059687e-05, + "loss": 0.3702, + "step": 3972 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.94195531592639e-05, + "loss": 0.4405, + "step": 3974 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.1456, + "step": 3976 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9410132456005262e-05, + "loss": 0.5228, + "step": 3978 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.1101, + "step": 3980 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.94006382400123e-05, + "loss": 0.5661, + "step": 3982 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.7427, + "step": 3984 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.939107058545461e-05, + "loss": 0.1306, + "step": 3986 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.938625924204888e-05, + "loss": 0.0894, + "step": 3988 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.9381429567075507e-05, + "loss": 0.4499, + "step": 3990 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.3079, + "step": 3992 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9371715260191425e-05, + "loss": 0.1551, + "step": 3994 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.1771, + "step": 3996 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.936192774069133e-05, + "loss": 0.1781, + "step": 3998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.5268, + "step": 4000 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9352067085036145e-05, + "loss": 0.2984, + "step": 4002 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9347109355200676e-05, + "loss": 0.4718, + "step": 4004 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.9342133370258124e-05, + "loss": 0.0789, + "step": 4006 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.1489, + "step": 4008 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9332126673960276e-05, + "loss": 0.2452, + "step": 4010 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.2166, + "step": 4012 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.932204707431572e-05, + "loss": 1.1368, + "step": 4014 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.9567, + "step": 4016 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9311894650067146e-05, + "loss": 0.5507, + "step": 4018 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9306791153479017e-05, + "loss": 0.7231, + "step": 4020 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9301669480526118e-05, + "loss": 0.402, + "step": 4022 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.5114, + "step": 4024 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.929137164557252e-05, + "loss": 0.4273, + "step": 4026 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.928619550368371e-05, + "loss": 0.0734, + "step": 4028 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9281001225653883e-05, + "loss": 0.1227, + "step": 4030 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.6313, + "step": 4032 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9270558301784808e-05, + "loss": 0.1663, + "step": 4034 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9265309676340783e-05, + "loss": 0.1152, + "step": 4036 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9260042955546247e-05, + "loss": 0.3344, + "step": 4038 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.2905, + "step": 4040 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9249455269084972e-05, + "loss": 0.393, + "step": 4042 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.4155, + "step": 4044 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.7045, + "step": 4046 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.2533, + "step": 4048 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9228063206906302e-05, + "loss": 0.3705, + "step": 4050 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9222670108643156e-05, + "loss": 0.3493, + "step": 4052 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9217258998305464e-05, + "loss": 0.3158, + "step": 4054 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.7624, + "step": 4056 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9206382783713735e-05, + "loss": 0.5377, + "step": 4058 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.1774, + "step": 4060 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.9195434648097013e-05, + "loss": 0.2588, + "step": 4062 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.3508, + "step": 4064 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9184414676983013e-05, + "loss": 0.0628, + "step": 4066 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9178877779995416e-05, + "loss": 0.177, + "step": 4068 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.9173322956460678e-05, + "loss": 0.3999, + "step": 4070 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.1469, + "step": 4072 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9162159573179446e-05, + "loss": 0.1995, + "step": 4074 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.2017, + "step": 4076 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.915092461434859e-05, + "loss": 0.3223, + "step": 4078 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.2158, + "step": 4080 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9139618167736547e-05, + "loss": 0.2226, + "step": 4082 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9133938164092942e-05, + "loss": 0.279, + "step": 4084 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.912824032167022e-05, + "loss": 0.2046, + "step": 4086 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.1504, + "step": 4088 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.911679116503426e-05, + "loss": 0.4899, + "step": 4090 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.4517, + "step": 4092 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9105270787270446e-05, + "loss": 0.3235, + "step": 4094 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.2614, + "step": 4096 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.9093679278376913e-05, + "loss": 0.2099, + "step": 4098 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.90878568780329e-05, + "loss": 0.1343, + "step": 4100 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.90820167289075e-05, + "loss": 0.2331, + "step": 4102 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.0424, + "step": 4104 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9070283229971003e-05, + "loss": 0.393, + "step": 4106 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.126, + "step": 4108 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.9058478873230487e-05, + "loss": 0.1048, + "step": 4110 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.0693, + "step": 4112 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9046603750902585e-05, + "loss": 0.3096, + "step": 4114 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9040639681612216e-05, + "loss": 0.2963, + "step": 4116 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.9034657955756702e-05, + "loss": 0.1052, + "step": 4118 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.2453, + "step": 4120 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9022641581114396e-05, + "loss": 0.0729, + "step": 4122 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.2668, + "step": 4124 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.901055472084858e-05, + "loss": 0.3976, + "step": 4126 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.4267, + "step": 4128 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8998397469382812e-05, + "loss": 0.2162, + "step": 4130 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8992292476607695e-05, + "loss": 0.0524, + "step": 4132 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.898616992169054e-05, + "loss": 0.1485, + "step": 4134 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.0866, + "step": 4136 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.89738721732944e-05, + "loss": 0.2168, + "step": 4138 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.6759, + "step": 4140 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8961504320265392e-05, + "loss": 0.0998, + "step": 4142 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.1735, + "step": 4144 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8949066459222224e-05, + "loss": 0.0743, + "step": 4146 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8942821306038227e-05, + "loss": 0.1149, + "step": 4148 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.8936558687330492e-05, + "loss": 0.2024, + "step": 4150 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.0641, + "step": 4152 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.8923981102301944e-05, + "loss": 0.2379, + "step": 4154 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.891766616054545e-05, + "loss": 0.0923, + "step": 4156 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8911333802393725e-05, + "loss": 0.1586, + "step": 4158 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.7612, + "step": 4160 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8898616886407588e-05, + "loss": 0.332, + "step": 4162 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8892232353409582e-05, + "loss": 0.1042, + "step": 4164 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8885830453689146e-05, + "loss": 0.1941, + "step": 4166 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.1379, + "step": 4168 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8872974604127038e-05, + "loss": 0.4546, + "step": 4170 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.5964, + "step": 4172 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.8860049438152247e-05, + "loss": 0.2896, + "step": 4174 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.219, + "step": 4176 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.8847055056737236e-05, + "loss": 0.7804, + "step": 4178 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.884053194194143e-05, + "loss": 0.3533, + "step": 4180 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.8833991561395194e-05, + "loss": 0.2777, + "step": 4182 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.2771, + "step": 4184 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.8820859054179225e-05, + "loss": 0.3483, + "step": 4186 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.881426695315756e-05, + "loss": 0.2961, + "step": 4188 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8807657637681577e-05, + "loss": 0.1032, + "step": 4190 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.0654, + "step": 4192 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8794387415032783e-05, + "loss": 0.1436, + "step": 4194 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8787726533777003e-05, + "loss": 0.29, + "step": 4196 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.8781048489900936e-05, + "loss": 0.5595, + "step": 4198 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.503, + "step": 4200 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.876764096649082e-05, + "loss": 0.0853, + "step": 4202 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.3093, + "step": 4204 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8754164949543123e-05, + "loss": 0.5763, + "step": 4206 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.1334, + "step": 4208 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8740620544333604e-05, + "loss": 0.2274, + "step": 4210 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8733822729175455e-05, + "loss": 0.1392, + "step": 4212 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.872700785667228e-05, + "loss": 0.2891, + "step": 4214 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.0647, + "step": 4216 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8713326992902612e-05, + "loss": 0.1764, + "step": 4218 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.0713, + "step": 4220 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8699578059900604e-05, + "loss": 0.3322, + "step": 4222 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.0381, + "step": 4224 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.868576116507408e-05, + "loss": 0.3873, + "step": 4226 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.0987, + "step": 4228 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8671876416361767e-05, + "loss": 0.6828, + "step": 4230 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.8818, + "step": 4232 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.8657923922232467e-05, + "loss": 0.108, + "step": 4234 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.86509223046777e-05, + "loss": 0.3958, + "step": 4236 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8643903791684228e-05, + "loss": 0.724, + "step": 4238 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.7338, + "step": 4240 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8629816134243466e-05, + "loss": 0.8025, + "step": 4242 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8622747017309676e-05, + "loss": 0.1481, + "step": 4244 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8615661059964148e-05, + "loss": 0.1442, + "step": 4246 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.0912, + "step": 4248 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.860143867942685e-05, + "loss": 0.1765, + "step": 4250 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.4218, + "step": 4252 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8587149103738006e-05, + "loss": 0.2778, + "step": 4254 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.1689, + "step": 4256 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8572792444528963e-05, + "loss": 0.1851, + "step": 4258 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8565588993632498e-05, + "loss": 0.4656, + "step": 4260 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8558368813955136e-05, + "loss": 0.1888, + "step": 4262 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.3703, + "step": 4264 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.854387832469512e-05, + "loss": 0.0515, + "step": 4266 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.1036, + "step": 4268 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.8529321089949833e-05, + "loss": 0.4333, + "step": 4270 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.2671, + "step": 4272 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8514697223441565e-05, + "loss": 0.118, + "step": 4274 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8507360338956896e-05, + "loss": 0.3216, + "step": 4276 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.850000683941319e-05, + "loss": 0.2166, + "step": 4278 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.5776, + "step": 4280 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.8485250052627205e-05, + "loss": 0.0781, + "step": 4282 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.847784679420527e-05, + "loss": 0.305, + "step": 4284 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.8470426978364857e-05, + "loss": 0.5779, + "step": 4286 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.6686, + "step": 4288 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.845553773242522e-05, + "loss": 0.507, + "step": 4290 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.8448068331405018e-05, + "loss": 0.2231, + "step": 4292 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8440582431124322e-05, + "loss": 0.2765, + "step": 4294 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.1445, + "step": 4296 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.842556119129423e-05, + "loss": 0.4974, + "step": 4298 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.841802588108161e-05, + "loss": 0.6648, + "step": 4300 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.841047413028209e-05, + "loss": 0.5033, + "step": 4302 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.4975, + "step": 4304 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.8395321365949273e-05, + "loss": 0.1244, + "step": 4306 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.838772038200968e-05, + "loss": 0.2322, + "step": 4308 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.838010301667044e-05, + "loss": 0.0771, + "step": 4310 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.0972, + "step": 4312 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8364819201332596e-05, + "loss": 0.1539, + "step": 4314 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.5113, + "step": 4316 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.834947003933417e-05, + "loss": 0.546, + "step": 4318 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.1107, + "step": 4320 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8334055650584107e-05, + "loss": 0.108, + "step": 4322 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8326324033788087e-05, + "loss": 0.1001, + "step": 4324 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.8318576155500855e-05, + "loss": 0.2862, + "step": 4326 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.131, + "step": 4328 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.830303167501152e-05, + "loss": 0.1174, + "step": 4330 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.3296, + "step": 4332 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8287422330550885e-05, + "loss": 0.5157, + "step": 4334 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.3532, + "step": 4336 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.827174824406043e-05, + "loss": 0.1097, + "step": 4338 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.8263886960799072e-05, + "loss": 0.1397, + "step": 4340 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8256009537987424e-05, + "loss": 0.4354, + "step": 4342 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.2334, + "step": 4344 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8240206335283943e-05, + "loss": 0.3099, + "step": 4346 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.0771, + "step": 4348 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8224338759405934e-05, + "loss": 0.1452, + "step": 4350 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.6286, + "step": 4352 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820840693431217e-05, + "loss": 0.5467, + "step": 4354 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820041696718378e-05, + "loss": 0.4171, + "step": 4356 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8192410984463416e-05, + "loss": 0.4261, + "step": 4358 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.3644, + "step": 4360 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8176351034821352e-05, + "loss": 0.1552, + "step": 4362 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.126, + "step": 4364 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.8160227210847642e-05, + "loss": 0.4744, + "step": 4366 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.0994, + "step": 4368 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8144039638502927e-05, + "loss": 0.1222, + "step": 4370 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8135921986190358e-05, + "loss": 0.0591, + "step": 4372 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8127788444245884e-05, + "loss": 0.5198, + "step": 4374 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.1093, + "step": 4376 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8111473755032152e-05, + "loss": 0.452, + "step": 4378 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.1435, + "step": 4380 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.8095095698313456e-05, + "loss": 0.1266, + "step": 4382 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.0513, + "step": 4384 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807865440203653e-05, + "loss": 0.6135, + "step": 4386 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807041007918221e-05, + "loss": 0.6593, + "step": 4388 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.806214999464214e-05, + "loss": 0.0918, + "step": 4390 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.3891, + "step": 4392 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8045582605064087e-05, + "loss": 0.4934, + "step": 4394 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.3227, + "step": 4396 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802895236272819e-05, + "loss": 0.3136, + "step": 4398 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.2513, + "step": 4400 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.80122593975513e-05, + "loss": 0.1406, + "step": 4402 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.8003889434630476e-05, + "loss": 0.3522, + "step": 4404 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7995503839940204e-05, + "loss": 0.7165, + "step": 4406 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.1403, + "step": 4408 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7978685820790725e-05, + "loss": 0.2953, + "step": 4410 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.2616, + "step": 4412 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.796180547148662e-05, + "loss": 0.358, + "step": 4414 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.3216, + "step": 4416 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7944862923898586e-05, + "loss": 0.621, + "step": 4418 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7936368367090583e-05, + "loss": 0.5701, + "step": 4420 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7927858310383196e-05, + "loss": 0.2216, + "step": 4422 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.1549, + "step": 4424 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7910791763781928e-05, + "loss": 0.1067, + "step": 4426 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.1155, + "step": 4428 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.789366341742001e-05, + "loss": 0.0929, + "step": 4430 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.1389, + "step": 4432 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7876473405105535e-05, + "loss": 0.4899, + "step": 4434 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7867855316162846e-05, + "loss": 0.1377, + "step": 4436 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.785922186112829e-05, + "loss": 0.1828, + "step": 4438 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.3933, + "step": 4440 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.7841908920258774e-05, + "loss": 0.3098, + "step": 4442 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.783322946823638e-05, + "loss": 0.3267, + "step": 4444 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.782453471774711e-05, + "loss": 0.4218, + "step": 4446 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.2325, + "step": 4448 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.7807099389322013e-05, + "loss": 0.1349, + "step": 4450 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.779835884543776e-05, + "loss": 0.2381, + "step": 4452 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7789603071189733e-05, + "loss": 0.1431, + "step": 4454 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.0645, + "step": 4456 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.7772045900032912e-05, + "loss": 0.2431, + "step": 4458 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.776324453741365e-05, + "loss": 0.3641, + "step": 4460 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.7754428013009644e-05, + "loss": 0.567, + "step": 4462 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.1084, + "step": 4464 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7736749547752327e-05, + "loss": 0.4807, + "step": 4466 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7727887641425465e-05, + "loss": 0.1047, + "step": 4468 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7719010642366597e-05, + "loss": 0.259, + "step": 4470 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.4516, + "step": 4472 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.770121143543025e-05, + "loss": 0.5321, + "step": 4474 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.2549, + "step": 4476 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7683352065992174e-05, + "loss": 0.2016, + "step": 4478 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.138, + "step": 4480 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.7665432673571238e-05, + "loss": 0.0754, + "step": 4482 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.765645051247007e-05, + "loss": 0.1784, + "step": 4484 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7647453398155204e-05, + "loss": 0.1729, + "step": 4486 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.0484, + "step": 4488 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.7629414380199672e-05, + "loss": 0.3674, + "step": 4490 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.762037251178961e-05, + "loss": 0.4089, + "step": 4492 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7611315760626943e-05, + "loss": 0.0702, + "step": 4494 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.2382, + "step": 4496 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7593157680824943e-05, + "loss": 0.6587, + "step": 4498 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7584056387648738e-05, + "loss": 0.0438, + "step": 4500 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.757494028264608e-05, + "loss": 0.187, + "step": 4502 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.1091, + "step": 4504 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7556663708406203e-05, + "loss": 0.0307, + "step": 4506 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.2157, + "step": 4508 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7538328100883404e-05, + "loss": 0.0916, + "step": 4510 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.1373, + "step": 4512 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7519933603316962e-05, + "loss": 0.2692, + "step": 4514 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7510714315655467e-05, + "loss": 0.3329, + "step": 4516 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.750148035940622e-05, + "loss": 0.4232, + "step": 4518 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.4771, + "step": 4520 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7482968513309458e-05, + "loss": 0.3312, + "step": 4522 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7473690659616e-05, + "loss": 0.3696, + "step": 4524 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7464398209642744e-05, + "loss": 0.3151, + "step": 4526 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.1387, + "step": 4528 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.7445769593478842e-05, + "loss": 0.0811, + "step": 4530 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.743643346367027e-05, + "loss": 0.3729, + "step": 4532 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.7427082810346024e-05, + "loss": 0.2452, + "step": 4534 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.4155, + "step": 4536 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.740833800622701e-05, + "loss": 0.0589, + "step": 4538 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.739894389204122e-05, + "loss": 0.161, + "step": 4540 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.738953532755774e-05, + "loss": 0.3235, + "step": 4542 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.1058, + "step": 4544 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7370674921226306e-05, + "loss": 0.0767, + "step": 4546 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7361223116213146e-05, + "loss": 0.3632, + "step": 4548 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7351756934571764e-05, + "loss": 0.2029, + "step": 4550 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.1649, + "step": 4552 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.7332781515382996e-05, + "loss": 0.7108, + "step": 4554 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.732327231489503e-05, + "loss": 0.4483, + "step": 4556 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7313748811897564e-05, + "loss": 0.0679, + "step": 4558 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.065, + "step": 4560 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.7294658972800495e-05, + "loss": 0.1077, + "step": 4562 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.728509267398376e-05, + "loss": 0.0835, + "step": 4564 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.727551214722322e-05, + "loss": 0.3568, + "step": 4566 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.1346, + "step": 4568 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.72563084847423e-05, + "loss": 0.1204, + "step": 4570 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.1365, + "step": 4572 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.723704813537835e-05, + "loss": 0.3515, + "step": 4574 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.1117, + "step": 4576 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7217731249594817e-05, + "loss": 0.1188, + "step": 4578 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7208051652686348e-05, + "loss": 0.7845, + "step": 4580 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.7198357978296827e-05, + "loss": 0.5706, + "step": 4582 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.3825, + "step": 4584 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.717892847282995e-05, + "loss": 0.3446, + "step": 4586 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.716919267969884e-05, + "loss": 0.2618, + "step": 4588 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.715944288497911e-05, + "loss": 0.4582, + "step": 4590 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.0656, + "step": 4592 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.713990136696734e-05, + "loss": 0.1702, + "step": 4594 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.7130109681840298e-05, + "loss": 0.2508, + "step": 4596 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.7120304071454578e-05, + "loss": 0.0568, + "step": 4598 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.2079, + "step": 4600 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7100651151536532e-05, + "loss": 0.3215, + "step": 4602 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.4313, + "step": 4604 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.708094276074344e-05, + "loss": 0.1398, + "step": 4606 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.1093, + "step": 4608 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.7061179053038894e-05, + "loss": 0.1843, + "step": 4610 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.705127650357663e-05, + "loss": 0.1042, + "step": 4612 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.704136018281859e-05, + "loss": 0.3579, + "step": 4614 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.3594, + "step": 4616 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.7021486304909202e-05, + "loss": 0.0764, + "step": 4618 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.701152878657197e-05, + "loss": 0.4147, + "step": 4620 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.700155757456711e-05, + "loss": 0.4324, + "step": 4622 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.1996, + "step": 4624 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.6981574147477214e-05, + "loss": 0.2539, + "step": 4626 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.697156197142023e-05, + "loss": 0.2052, + "step": 4628 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.696153617975168e-05, + "loss": 0.2816, + "step": 4630 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.1359, + "step": 4632 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.694144382792878e-05, + "loss": 0.0434, + "step": 4634 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.2373, + "step": 4636 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6921297248971652e-05, + "loss": 0.4543, + "step": 4638 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.142, + "step": 4640 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.690109660026701e-05, + "loss": 0.4002, + "step": 4642 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.6890976049058267e-05, + "loss": 0.1115, + "step": 4644 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.688084203962401e-05, + "loss": 0.6009, + "step": 4646 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.1483, + "step": 4648 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6860533725272953e-05, + "loss": 0.1037, + "step": 4650 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.2482, + "step": 4652 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.6840171815864085e-05, + "loss": 0.4031, + "step": 4654 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.3184, + "step": 4656 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.681975647046631e-05, + "loss": 0.3946, + "step": 4658 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.6809528809094805e-05, + "loss": 0.2265, + "step": 4660 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6799287848566024e-05, + "loss": 0.1853, + "step": 4662 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.1386, + "step": 4664 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.6778766110065765e-05, + "loss": 0.5597, + "step": 4666 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.67684853721737e-05, + "loss": 0.0993, + "step": 4668 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6758191415283066e-05, + "loss": 0.1845, + "step": 4670 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.0245, + "step": 4672 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.673756392494916e-05, + "loss": 0.2342, + "step": 4674 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.6727230431791826e-05, + "loss": 0.6331, + "step": 4676 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.671688380020769e-05, + "loss": 0.1829, + "step": 4678 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.0797, + "step": 4680 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6696151202613537e-05, + "loss": 0.5491, + "step": 4682 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.2762, + "step": 4684 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6675366294131432e-05, + "loss": 0.1078, + "step": 4686 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.1612, + "step": 4688 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.6654529237134833e-05, + "loss": 0.3081, + "step": 4690 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.66440912037967e-05, + "loss": 0.0229, + "step": 4692 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.663364019440453e-05, + "loss": 0.0959, + "step": 4694 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.393, + "step": 4696 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6612699329127467e-05, + "loss": 0.014, + "step": 4698 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.2582, + "step": 4700 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6591706804895415e-05, + "loss": 0.1493, + "step": 4702 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.068, + "step": 4704 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6570662785703716e-05, + "loss": 0.5085, + "step": 4706 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6560121516856592e-05, + "loss": 0.384, + "step": 4708 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.654956743595001e-05, + "loss": 0.3168, + "step": 4710 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.3041, + "step": 4712 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.6528420920432893e-05, + "loss": 0.41, + "step": 4714 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.651782852712194e-05, + "loss": 0.1143, + "step": 4716 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6507223404350686e-05, + "loss": 0.3913, + "step": 4718 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.4166, + "step": 4720 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.648597505330016e-05, + "loss": 0.2901, + "step": 4722 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.6475331866519387e-05, + "loss": 0.1202, + "step": 4724 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6464676033275187e-05, + "loss": 0.2577, + "step": 4726 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.1167, + "step": 4728 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.644332651066548e-05, + "loss": 0.5596, + "step": 4730 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.1153, + "step": 4732 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6421926652255275e-05, + "loss": 0.1629, + "step": 4734 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.2613, + "step": 4736 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6400476625222057e-05, + "loss": 0.1109, + "step": 4738 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6389732850821964e-05, + "loss": 0.3575, + "step": 4740 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6378976597135193e-05, + "loss": 0.4839, + "step": 4742 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.1556, + "step": 4744 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.635742673595468e-05, + "loss": 0.4532, + "step": 4746 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.1261, + "step": 4748 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6335827210029823e-05, + "loss": 0.104, + "step": 4750 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.2965, + "step": 4752 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6314178188097917e-05, + "loss": 0.1134, + "step": 4754 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6303335168965495e-05, + "loss": 0.0842, + "step": 4756 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.6292479839282904e-05, + "loss": 0.1359, + "step": 4758 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.266, + "step": 4760 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.627073233309409e-05, + "loss": 0.4089, + "step": 4762 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.625984019906122e-05, + "loss": 0.0622, + "step": 4764 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.624893583942482e-05, + "loss": 0.1207, + "step": 4766 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.1486, + "step": 4768 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6227090528551058e-05, + "loss": 0.2325, + "step": 4770 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6216149619978057e-05, + "loss": 0.2052, + "step": 4772 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6205196571130204e-05, + "loss": 0.499, + "step": 4774 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.0207, + "step": 4776 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.618325413819967e-05, + "loss": 0.5748, + "step": 4778 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.3719, + "step": 4780 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6161263401175555e-05, + "loss": 0.2764, + "step": 4782 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.7086, + "step": 4784 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.613922453185133e-05, + "loss": 0.2161, + "step": 4786 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.612818710136499e-05, + "loss": 0.2781, + "step": 4788 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6117137702396454e-05, + "loss": 0.497, + "step": 4790 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.7224, + "step": 4792 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6095003085355103e-05, + "loss": 0.2395, + "step": 4794 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.1079, + "step": 4796 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6072820853644688e-05, + "loss": 0.0841, + "step": 4798 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.1574, + "step": 4800 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.6050591180554658e-05, + "loss": 0.0889, + "step": 4802 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.60394586077466e-05, + "loss": 0.5938, + "step": 4804 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6028314239745068e-05, + "loss": 0.0891, + "step": 4806 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.1675, + "step": 4808 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.6005990205245226e-05, + "loss": 0.5502, + "step": 4810 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.594, + "step": 4812 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5983619251452334e-05, + "loss": 0.0512, + "step": 4814 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.043, + "step": 4816 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.596120155313017e-05, + "loss": 0.1129, + "step": 4818 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.594997522948413e-05, + "loss": 0.1032, + "step": 4820 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.593873728540759e-05, + "loss": 0.1518, + "step": 4822 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.0859, + "step": 4824 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5916226623777346e-05, + "loss": 0.3548, + "step": 4826 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.4498, + "step": 4828 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.5893669744094587e-05, + "loss": 0.1713, + "step": 4830 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.1078, + "step": 4832 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5871066822575526e-05, + "loss": 0.3512, + "step": 4834 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5859748151293354e-05, + "loss": 0.2171, + "step": 4836 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5848418035796064e-05, + "loss": 0.303, + "step": 4838 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.2168, + "step": 4840 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.5825723560690396e-05, + "loss": 0.1648, + "step": 4842 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.581435924540482e-05, + "loss": 0.2823, + "step": 4844 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.580298357454967e-05, + "loss": 0.263, + "step": 4846 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.0468, + "step": 4848 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5780198255020485e-05, + "loss": 0.2299, + "step": 4850 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5768788650846674e-05, + "loss": 0.3577, + "step": 4852 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.5757367780103672e-05, + "loss": 0.1159, + "step": 4854 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.179, + "step": 4856 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5734492328152796e-05, + "loss": 0.2558, + "step": 4858 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.0993, + "step": 4860 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5711572077872784e-05, + "loss": 0.3576, + "step": 4862 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.2658, + "step": 4864 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.568860720831852e-05, + "loss": 0.2964, + "step": 4866 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.5677108097363565e-05, + "loss": 0.0606, + "step": 4868 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5665597898893508e-05, + "loss": 0.1666, + "step": 4870 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.0388, + "step": 4872 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5642544329348316e-05, + "loss": 0.2268, + "step": 4874 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.1168, + "step": 4876 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.5619446679779367e-05, + "loss": 0.6538, + "step": 4878 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.1981, + "step": 4880 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5596305130627414e-05, + "loss": 0.0833, + "step": 4882 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5584717950189373e-05, + "loss": 0.2107, + "step": 4884 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5573119862676155e-05, + "loss": 0.0353, + "step": 4886 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.1639, + "step": 4888 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.554989105705083e-05, + "loss": 0.0787, + "step": 4890 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.0579, + "step": 4892 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5526618895216786e-05, + "loss": 0.0171, + "step": 4894 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.2899, + "step": 4896 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5503303558978112e-05, + "loss": 0.3731, + "step": 4898 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.4404, + "step": 4900 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.547994523047609e-05, + "loss": 0.1878, + "step": 4902 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.0658, + "step": 4904 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.545654409218794e-05, + "loss": 0.2615, + "step": 4906 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.544482752648966e-05, + "loss": 0.1827, + "step": 4908 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5433100326925298e-05, + "loss": 0.1778, + "step": 4910 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.1057, + "step": 4912 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.5409614117832797e-05, + "loss": 0.497, + "step": 4914 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.539785515417377e-05, + "loss": 0.2978, + "step": 4916 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.538608564838665e-05, + "loss": 0.1014, + "step": 4918 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.199, + "step": 4920 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5362515102393244e-05, + "loss": 0.5205, + "step": 4922 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.3225, + "step": 4924 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.5338902663987564e-05, + "loss": 0.8268, + "step": 4926 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.3845, + "step": 4928 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.531524851763198e-05, + "loss": 0.0849, + "step": 4930 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.5303405861706567e-05, + "loss": 0.1802, + "step": 4932 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.529155284811464e-05, + "loss": 0.0855, + "step": 4934 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.2302, + "step": 4936 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5267815840548067e-05, + "loss": 0.7477, + "step": 4938 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.5083, + "step": 4940 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5244037680367739e-05, + "loss": 0.1347, + "step": 4942 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.0542, + "step": 4944 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.522021855333061e-05, + "loss": 0.2233, + "step": 4946 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.5208293685377362e-05, + "loss": 0.2936, + "step": 4948 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.519635864551371e-05, + "loss": 0.0506, + "step": 4950 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.0735, + "step": 4952 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5172458143312548e-05, + "loss": 0.1928, + "step": 4954 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.0394, + "step": 4956 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5148517233439858e-05, + "loss": 0.3703, + "step": 4958 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.7397, + "step": 4960 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.512453610292402e-05, + "loss": 0.0734, + "step": 4962 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.5112530513457251e-05, + "loss": 0.102, + "step": 4964 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.5100514939107598e-05, + "loss": 0.0409, + "step": 4966 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.2058, + "step": 4968 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5076453929645927e-05, + "loss": 0.7539, + "step": 4970 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.0294, + "step": 4972 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.505235326250563e-05, + "loss": 0.2225, + "step": 4974 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.0261, + "step": 4976 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5028213125963054e-05, + "loss": 0.1582, + "step": 4978 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5016128315586636e-05, + "loss": 0.0178, + "step": 4980 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.5004033708602977e-05, + "loss": 0.1243, + "step": 4982 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.0384, + "step": 4984 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4979815199317011e-05, + "loss": 0.3846, + "step": 4986 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.2784, + "step": 4988 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.495555778730216e-05, + "loss": 0.3257, + "step": 4990 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.2533, + "step": 4992 + }, + { + "epoch": 1.0, + "learning_rate": 1.4931261662059333e-05, + "loss": 0.7029, + "step": 4994 + }, + { + "epoch": 1.0, + "learning_rate": 1.4919099141279214e-05, + "loss": 0.2787, + "step": 4996 + }, + { + "epoch": 1.0, + "step": 4996, + "total_flos": 0, + "train_loss": 0.26570447263351865, + "train_runtime": 9135.8399, + "train_samples_per_second": 2.187, + "train_steps_per_second": 0.547 + } + ], + "logging_steps": 2, + "max_steps": 4996, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3759180b74a5f4a23672b5aa02099e71f0ac7bd --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83c8391d8b14f495a8bba5a73f00d6ac8bfe2fa3ca9856b89a092d2e236b7f23 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..44f57111ddc792b754d8efefe5cefcd5c4489523 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:433ba68121862279e099151423febd108bd5276731d68d933d40f39740f4df64 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4faa042fd9d0a148501cdbedc7872cb05aeb1115 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd2c3dc58d0cfbd72caf72a5df214e80e5a0e0b618ecb6be45bd7abdb9ce08bc +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..4154b58511a5fc0beac2cce19b8724948f89651b --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_divbs_scenario12_new_10000_random0_25_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6e9469c131643f0861d8bb16e509aef82112b9f92d633adea1d3d37390f4b74 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..523298405a702fef07220a9a00559e4e976e13c8 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,4400 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1249, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016012810248198558, + "grad_norm": 2.8969898223876953, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.3516, + "step": 2 + }, + { + "epoch": 0.0032025620496397116, + "grad_norm": 2.0521013736724854, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.3031, + "step": 4 + }, + { + "epoch": 0.004803843074459567, + "grad_norm": 2.765810251235962, + "learning_rate": 2.507768247396697e-06, + "loss": 0.3797, + "step": 6 + }, + { + "epoch": 0.006405124099279423, + "grad_norm": 1.8172677755355835, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.4087, + "step": 8 + }, + { + "epoch": 0.008006405124099279, + "grad_norm": 3.2965493202209473, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.397, + "step": 10 + }, + { + "epoch": 0.009607686148919135, + "grad_norm": 1.1102368831634521, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.2051, + "step": 12 + }, + { + "epoch": 0.01120896717373899, + "grad_norm": 2.980532646179199, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.3807, + "step": 14 + }, + { + "epoch": 0.012810248198558846, + "grad_norm": 1.839591383934021, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.2761, + "step": 16 + }, + { + "epoch": 0.014411529223378704, + "grad_norm": 2.8217523097991943, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.2927, + "step": 18 + }, + { + "epoch": 0.016012810248198558, + "grad_norm": 1.8294413089752197, + "learning_rate": 2.772603249882202e-06, + "loss": 0.3187, + "step": 20 + }, + { + "epoch": 0.017614091273018415, + "grad_norm": 2.123487949371338, + "learning_rate": 2.81134975464178e-06, + "loss": 0.3324, + "step": 22 + }, + { + "epoch": 0.01921537229783827, + "grad_norm": 2.4420013427734375, + "learning_rate": 2.850320892287688e-06, + "loss": 0.3381, + "step": 24 + }, + { + "epoch": 0.020816653322658127, + "grad_norm": 1.3755011558532715, + "learning_rate": 2.889515445039256e-06, + "loss": 0.3179, + "step": 26 + }, + { + "epoch": 0.02241793434747798, + "grad_norm": 3.2971866130828857, + "learning_rate": 2.928932188134529e-06, + "loss": 0.2932, + "step": 28 + }, + { + "epoch": 0.02401921537229784, + "grad_norm": 1.7407487630844116, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.3054, + "step": 30 + }, + { + "epoch": 0.025620496397117692, + "grad_norm": 2.178921937942505, + "learning_rate": 3.00842731163137e-06, + "loss": 0.3329, + "step": 32 + }, + { + "epoch": 0.02722177742193755, + "grad_norm": 2.3187286853790283, + "learning_rate": 3.048503207947854e-06, + "loss": 0.2635, + "step": 34 + }, + { + "epoch": 0.028823058446757407, + "grad_norm": 2.3793349266052246, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.4325, + "step": 36 + }, + { + "epoch": 0.03042433947157726, + "grad_norm": 1.7196837663650513, + "learning_rate": 3.129305408243829e-06, + "loss": 0.2894, + "step": 38 + }, + { + "epoch": 0.032025620496397116, + "grad_norm": 2.46417236328125, + "learning_rate": 3.17002918729432e-06, + "loss": 0.2012, + "step": 40 + }, + { + "epoch": 0.03362690152121697, + "grad_norm": 2.162390947341919, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.2266, + "step": 42 + }, + { + "epoch": 0.03522818254603683, + "grad_norm": 2.2925500869750977, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.3443, + "step": 44 + }, + { + "epoch": 0.03682946357085669, + "grad_norm": 3.0166170597076416, + "learning_rate": 3.293475949595998e-06, + "loss": 0.4322, + "step": 46 + }, + { + "epoch": 0.03843074459567654, + "grad_norm": 2.007530689239502, + "learning_rate": 3.335045725966829e-06, + "loss": 0.397, + "step": 48 + }, + { + "epoch": 0.040032025620496396, + "grad_norm": 2.707064628601074, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.397, + "step": 50 + }, + { + "epoch": 0.041633306645316254, + "grad_norm": 2.079900026321411, + "learning_rate": 3.418808778095917e-06, + "loss": 0.293, + "step": 52 + }, + { + "epoch": 0.04323458767013611, + "grad_norm": 1.890999674797058, + "learning_rate": 3.460999436403676e-06, + "loss": 0.289, + "step": 54 + }, + { + "epoch": 0.04483586869495596, + "grad_norm": 1.7686502933502197, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.1894, + "step": 56 + }, + { + "epoch": 0.04643714971977582, + "grad_norm": 2.2891736030578613, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.364, + "step": 58 + }, + { + "epoch": 0.04803843074459568, + "grad_norm": 1.1851152181625366, + "learning_rate": 3.588792100647368e-06, + "loss": 0.3321, + "step": 60 + }, + { + "epoch": 0.049639711769415534, + "grad_norm": 2.7429542541503906, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.3483, + "step": 62 + }, + { + "epoch": 0.051240992794235385, + "grad_norm": 2.454606056213379, + "learning_rate": 3.674991124496452e-06, + "loss": 0.3636, + "step": 64 + }, + { + "epoch": 0.05284227381905524, + "grad_norm": 2.0178585052490234, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.2902, + "step": 66 + }, + { + "epoch": 0.0544435548438751, + "grad_norm": 3.26253342628479, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.399, + "step": 68 + }, + { + "epoch": 0.05604483586869496, + "grad_norm": 2.606025218963623, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.3986, + "step": 70 + }, + { + "epoch": 0.057646116893514815, + "grad_norm": 1.118543028831482, + "learning_rate": 3.849750027700842e-06, + "loss": 0.2563, + "step": 72 + }, + { + "epoch": 0.059247397918334666, + "grad_norm": 2.235337734222412, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.2928, + "step": 74 + }, + { + "epoch": 0.06084867894315452, + "grad_norm": 2.2577459812164307, + "learning_rate": 3.938288063572962e-06, + "loss": 0.2371, + "step": 76 + }, + { + "epoch": 0.06244995996797438, + "grad_norm": 2.8654427528381348, + "learning_rate": 3.982841901684792e-06, + "loss": 0.4027, + "step": 78 + }, + { + "epoch": 0.06405124099279423, + "grad_norm": 2.775254964828491, + "learning_rate": 4.027583765579601e-06, + "loss": 0.3476, + "step": 80 + }, + { + "epoch": 0.0656525220176141, + "grad_norm": 2.773869514465332, + "learning_rate": 4.072512257151546e-06, + "loss": 0.397, + "step": 82 + }, + { + "epoch": 0.06725380304243395, + "grad_norm": 3.711998224258423, + "learning_rate": 4.117625972462988e-06, + "loss": 0.3315, + "step": 84 + }, + { + "epoch": 0.0688550840672538, + "grad_norm": 2.848271608352661, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.3329, + "step": 86 + }, + { + "epoch": 0.07045636509207366, + "grad_norm": 1.7330138683319092, + "learning_rate": 4.208403429658151e-06, + "loss": 0.2291, + "step": 88 + }, + { + "epoch": 0.07205764611689351, + "grad_norm": 1.784081220626831, + "learning_rate": 4.254064334903347e-06, + "loss": 0.2893, + "step": 90 + }, + { + "epoch": 0.07365892714171338, + "grad_norm": 1.6720924377441406, + "learning_rate": 4.299904790699619e-06, + "loss": 0.3658, + "step": 92 + }, + { + "epoch": 0.07526020816653323, + "grad_norm": 3.6100242137908936, + "learning_rate": 4.345923364612024e-06, + "loss": 0.3967, + "step": 94 + }, + { + "epoch": 0.07686148919135308, + "grad_norm": 1.1456620693206787, + "learning_rate": 4.392118618639698e-06, + "loss": 0.2563, + "step": 96 + }, + { + "epoch": 0.07846277021617294, + "grad_norm": 1.9568294286727905, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.2612, + "step": 98 + }, + { + "epoch": 0.08006405124099279, + "grad_norm": 1.7511234283447266, + "learning_rate": 4.485033387477915e-06, + "loss": 0.2151, + "step": 100 + }, + { + "epoch": 0.08166533226581266, + "grad_norm": 3.3538620471954346, + "learning_rate": 4.531749998862628e-06, + "loss": 0.5289, + "step": 102 + }, + { + "epoch": 0.08326661329063251, + "grad_norm": 1.8283573389053345, + "learning_rate": 4.578637483601732e-06, + "loss": 0.3839, + "step": 104 + }, + { + "epoch": 0.08486789431545236, + "grad_norm": 2.0266289710998535, + "learning_rate": 4.625694376542399e-06, + "loss": 0.3658, + "step": 106 + }, + { + "epoch": 0.08646917534027222, + "grad_norm": 3.6079843044281006, + "learning_rate": 4.672919207238145e-06, + "loss": 0.3057, + "step": 108 + }, + { + "epoch": 0.08807045636509207, + "grad_norm": 2.0364773273468018, + "learning_rate": 4.720310499994664e-06, + "loss": 0.2463, + "step": 110 + }, + { + "epoch": 0.08967173738991192, + "grad_norm": 3.3218822479248047, + "learning_rate": 4.767866773916041e-06, + "loss": 0.38, + "step": 112 + }, + { + "epoch": 0.09127301841473179, + "grad_norm": 2.6150918006896973, + "learning_rate": 4.81558654295099e-06, + "loss": 0.3184, + "step": 114 + }, + { + "epoch": 0.09287429943955164, + "grad_norm": 1.749948263168335, + "learning_rate": 4.863468315939234e-06, + "loss": 0.3347, + "step": 116 + }, + { + "epoch": 0.0944755804643715, + "grad_norm": 1.8724067211151123, + "learning_rate": 4.911510596658202e-06, + "loss": 0.2774, + "step": 118 + }, + { + "epoch": 0.09607686148919135, + "grad_norm": 2.2535030841827393, + "learning_rate": 4.959711883869734e-06, + "loss": 0.3259, + "step": 120 + }, + { + "epoch": 0.0976781425140112, + "grad_norm": 2.728620767593384, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.2417, + "step": 122 + }, + { + "epoch": 0.09927942353883107, + "grad_norm": 2.3641433715820312, + "learning_rate": 5.056585448021398e-06, + "loss": 0.2776, + "step": 124 + }, + { + "epoch": 0.10088070456365092, + "grad_norm": 5.685856819152832, + "learning_rate": 5.105254697830208e-06, + "loss": 0.4155, + "step": 126 + }, + { + "epoch": 0.10248198558847077, + "grad_norm": 1.6316652297973633, + "learning_rate": 5.154076899963514e-06, + "loss": 0.2744, + "step": 128 + }, + { + "epoch": 0.10408326661329063, + "grad_norm": 2.2459123134613037, + "learning_rate": 5.203050528811959e-06, + "loss": 0.1872, + "step": 130 + }, + { + "epoch": 0.10568454763811048, + "grad_norm": 2.274412155151367, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.2781, + "step": 132 + }, + { + "epoch": 0.10728582866293035, + "grad_norm": 1.981119155883789, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.489, + "step": 134 + }, + { + "epoch": 0.1088871096877502, + "grad_norm": 3.0431439876556396, + "learning_rate": 5.350864648864026e-06, + "loss": 0.3511, + "step": 136 + }, + { + "epoch": 0.11048839071257005, + "grad_norm": 4.302409648895264, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.531, + "step": 138 + }, + { + "epoch": 0.11208967173738991, + "grad_norm": 2.0921714305877686, + "learning_rate": 5.450136348907444e-06, + "loss": 0.0909, + "step": 140 + }, + { + "epoch": 0.11369095276220977, + "grad_norm": 1.7997318506240845, + "learning_rate": 5.499986238623329e-06, + "loss": 0.346, + "step": 142 + }, + { + "epoch": 0.11529223378702963, + "grad_norm": 1.6628451347351074, + "learning_rate": 5.549976745985809e-06, + "loss": 0.2533, + "step": 144 + }, + { + "epoch": 0.11689351481184948, + "grad_norm": 2.0659525394439697, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.3175, + "step": 146 + }, + { + "epoch": 0.11849479583666933, + "grad_norm": 3.562079668045044, + "learning_rate": 5.650373360837763e-06, + "loss": 0.3087, + "step": 148 + }, + { + "epoch": 0.1200960768614892, + "grad_norm": 5.586178779602051, + "learning_rate": 5.700776331106674e-06, + "loss": 0.4687, + "step": 150 + }, + { + "epoch": 0.12169735788630905, + "grad_norm": 2.687237024307251, + "learning_rate": 5.751313644679071e-06, + "loss": 0.3186, + "step": 152 + }, + { + "epoch": 0.1232986389111289, + "grad_norm": 3.164597511291504, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.4026, + "step": 154 + }, + { + "epoch": 0.12489991993594876, + "grad_norm": 1.6679465770721436, + "learning_rate": 5.852784980771182e-06, + "loss": 0.3488, + "step": 156 + }, + { + "epoch": 0.1265012009607686, + "grad_norm": 1.535750389099121, + "learning_rate": 5.903715832487138e-06, + "loss": 0.3031, + "step": 158 + }, + { + "epoch": 0.12810248198558846, + "grad_norm": 7.858137607574463, + "learning_rate": 5.954774685998206e-06, + "loss": 0.5452, + "step": 160 + }, + { + "epoch": 0.1297037630104083, + "grad_norm": 2.096879243850708, + "learning_rate": 6.005959945803494e-06, + "loss": 0.4502, + "step": 162 + }, + { + "epoch": 0.1313050440352282, + "grad_norm": 12.8297119140625, + "learning_rate": 6.057270012452186e-06, + "loss": 0.5923, + "step": 164 + }, + { + "epoch": 0.13290632506004804, + "grad_norm": 1.9221842288970947, + "learning_rate": 6.108703282593461e-06, + "loss": 0.215, + "step": 166 + }, + { + "epoch": 0.1345076060848679, + "grad_norm": 1.6607649326324463, + "learning_rate": 6.160258149026557e-06, + "loss": 0.29, + "step": 168 + }, + { + "epoch": 0.13610888710968774, + "grad_norm": 2.36198353767395, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.3049, + "step": 170 + }, + { + "epoch": 0.1377101681345076, + "grad_norm": 1.2904274463653564, + "learning_rate": 6.263726223017326e-06, + "loss": 0.2245, + "step": 172 + }, + { + "epoch": 0.13931144915932747, + "grad_norm": 1.6111624240875244, + "learning_rate": 6.315636197376634e-06, + "loss": 0.3324, + "step": 174 + }, + { + "epoch": 0.14091273018414732, + "grad_norm": 1.8647018671035767, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.4151, + "step": 176 + }, + { + "epoch": 0.14251401120896717, + "grad_norm": 4.412709712982178, + "learning_rate": 6.419799910389257e-06, + "loss": 0.471, + "step": 178 + }, + { + "epoch": 0.14411529223378702, + "grad_norm": 1.8745092153549194, + "learning_rate": 6.472050394106689e-06, + "loss": 0.3659, + "step": 180 + }, + { + "epoch": 0.14571657325860687, + "grad_norm": 2.351091146469116, + "learning_rate": 6.524411120147204e-06, + "loss": 0.4143, + "step": 182 + }, + { + "epoch": 0.14731785428342675, + "grad_norm": 1.391424536705017, + "learning_rate": 6.576880452328645e-06, + "loss": 0.2723, + "step": 184 + }, + { + "epoch": 0.1489191353082466, + "grad_norm": 1.687239408493042, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.3327, + "step": 186 + }, + { + "epoch": 0.15052041633306645, + "grad_norm": 3.272613286972046, + "learning_rate": 6.682138373468341e-06, + "loss": 0.4291, + "step": 188 + }, + { + "epoch": 0.1521216973578863, + "grad_norm": 1.3705826997756958, + "learning_rate": 6.734923673298605e-06, + "loss": 0.2553, + "step": 190 + }, + { + "epoch": 0.15372297838270615, + "grad_norm": 1.722609043121338, + "learning_rate": 6.787811001116654e-06, + "loss": 0.3052, + "step": 192 + }, + { + "epoch": 0.15532425940752603, + "grad_norm": 2.0848464965820312, + "learning_rate": 6.840798704284939e-06, + "loss": 0.3346, + "step": 194 + }, + { + "epoch": 0.15692554043234588, + "grad_norm": 1.8060904741287231, + "learning_rate": 6.893885127029419e-06, + "loss": 0.3089, + "step": 196 + }, + { + "epoch": 0.15852682145716573, + "grad_norm": 2.2263824939727783, + "learning_rate": 6.94706861049117e-06, + "loss": 0.3812, + "step": 198 + }, + { + "epoch": 0.16012810248198558, + "grad_norm": 1.9936295747756958, + "learning_rate": 7.000347492778341e-06, + "loss": 0.3641, + "step": 200 + }, + { + "epoch": 0.16172938350680544, + "grad_norm": 2.6321988105773926, + "learning_rate": 7.05372010901803e-06, + "loss": 0.3801, + "step": 202 + }, + { + "epoch": 0.1633306645316253, + "grad_norm": 1.8345807790756226, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.3479, + "step": 204 + }, + { + "epoch": 0.16493194555644516, + "grad_norm": 2.9115114212036133, + "learning_rate": 7.160739869270219e-06, + "loss": 0.3517, + "step": 206 + }, + { + "epoch": 0.16653322658126501, + "grad_norm": 1.208371639251709, + "learning_rate": 7.214383669100317e-06, + "loss": 0.297, + "step": 208 + }, + { + "epoch": 0.16813450760608487, + "grad_norm": 1.6053305864334106, + "learning_rate": 7.268114514622635e-06, + "loss": 0.3037, + "step": 210 + }, + { + "epoch": 0.16973578863090472, + "grad_norm": 1.8439911603927612, + "learning_rate": 7.321930726841144e-06, + "loss": 0.278, + "step": 212 + }, + { + "epoch": 0.17133706965572457, + "grad_norm": 2.527151584625244, + "learning_rate": 7.375830624092336e-06, + "loss": 0.3214, + "step": 214 + }, + { + "epoch": 0.17293835068054444, + "grad_norm": 2.096677780151367, + "learning_rate": 7.429812522097613e-06, + "loss": 0.3638, + "step": 216 + }, + { + "epoch": 0.1745396317053643, + "grad_norm": 1.3505491018295288, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.2269, + "step": 218 + }, + { + "epoch": 0.17614091273018415, + "grad_norm": 1.1443514823913574, + "learning_rate": 7.538015570497046e-06, + "loss": 0.2874, + "step": 220 + }, + { + "epoch": 0.177742193755004, + "grad_norm": 2.5437207221984863, + "learning_rate": 7.592233339733077e-06, + "loss": 0.2776, + "step": 222 + }, + { + "epoch": 0.17934347477982385, + "grad_norm": 2.688483238220215, + "learning_rate": 7.646526347512665e-06, + "loss": 0.415, + "step": 224 + }, + { + "epoch": 0.18094475580464373, + "grad_norm": 1.078393816947937, + "learning_rate": 7.70089289727319e-06, + "loss": 0.2786, + "step": 226 + }, + { + "epoch": 0.18254603682946358, + "grad_norm": 2.1621243953704834, + "learning_rate": 7.755331290154041e-06, + "loss": 0.3658, + "step": 228 + }, + { + "epoch": 0.18414731785428343, + "grad_norm": 2.128343343734741, + "learning_rate": 7.809839825049565e-06, + "loss": 0.2286, + "step": 230 + }, + { + "epoch": 0.18574859887910328, + "grad_norm": 1.7669260501861572, + "learning_rate": 7.864416798662347e-06, + "loss": 0.3507, + "step": 232 + }, + { + "epoch": 0.18734987990392313, + "grad_norm": 1.7092615365982056, + "learning_rate": 7.919060505556376e-06, + "loss": 0.3537, + "step": 234 + }, + { + "epoch": 0.188951160928743, + "grad_norm": 1.8732891082763672, + "learning_rate": 7.973769238210291e-06, + "loss": 0.3478, + "step": 236 + }, + { + "epoch": 0.19055244195356286, + "grad_norm": 1.9242267608642578, + "learning_rate": 8.028541287070858e-06, + "loss": 0.381, + "step": 238 + }, + { + "epoch": 0.1921537229783827, + "grad_norm": 2.7227590084075928, + "learning_rate": 8.083374940606256e-06, + "loss": 0.3516, + "step": 240 + }, + { + "epoch": 0.19375500400320256, + "grad_norm": 1.876050353050232, + "learning_rate": 8.138268485359684e-06, + "loss": 0.2956, + "step": 242 + }, + { + "epoch": 0.1953562850280224, + "grad_norm": 1.9604945182800293, + "learning_rate": 8.193220206002785e-06, + "loss": 0.3478, + "step": 244 + }, + { + "epoch": 0.1969575660528423, + "grad_norm": 1.8093197345733643, + "learning_rate": 8.248228385389349e-06, + "loss": 0.4025, + "step": 246 + }, + { + "epoch": 0.19855884707766214, + "grad_norm": 2.2794013023376465, + "learning_rate": 8.303291304608936e-06, + "loss": 0.4339, + "step": 248 + }, + { + "epoch": 0.200160128102482, + "grad_norm": 1.7428088188171387, + "learning_rate": 8.358407243040524e-06, + "loss": 0.3971, + "step": 250 + }, + { + "epoch": 0.20176140912730184, + "grad_norm": 1.3561124801635742, + "learning_rate": 8.413574478406386e-06, + "loss": 0.3015, + "step": 252 + }, + { + "epoch": 0.2033626901521217, + "grad_norm": 2.6708834171295166, + "learning_rate": 8.468791286825856e-06, + "loss": 0.3608, + "step": 254 + }, + { + "epoch": 0.20496397117694154, + "grad_norm": 1.637882113456726, + "learning_rate": 8.524055942869135e-06, + "loss": 0.3991, + "step": 256 + }, + { + "epoch": 0.20656525220176142, + "grad_norm": 2.983764410018921, + "learning_rate": 8.579366719611353e-06, + "loss": 0.2466, + "step": 258 + }, + { + "epoch": 0.20816653322658127, + "grad_norm": 1.524172067642212, + "learning_rate": 8.634721888686368e-06, + "loss": 0.2188, + "step": 260 + }, + { + "epoch": 0.20976781425140112, + "grad_norm": 1.7539112567901611, + "learning_rate": 8.690119720340907e-06, + "loss": 0.2628, + "step": 262 + }, + { + "epoch": 0.21136909527622097, + "grad_norm": 2.086392402648926, + "learning_rate": 8.74555848348857e-06, + "loss": 0.3347, + "step": 264 + }, + { + "epoch": 0.21297037630104082, + "grad_norm": 4.761439800262451, + "learning_rate": 8.801036445763858e-06, + "loss": 0.3989, + "step": 266 + }, + { + "epoch": 0.2145716573258607, + "grad_norm": 2.470111131668091, + "learning_rate": 8.856551873576448e-06, + "loss": 0.3017, + "step": 268 + }, + { + "epoch": 0.21617293835068055, + "grad_norm": 3.2955055236816406, + "learning_rate": 8.912103032165206e-06, + "loss": 0.6583, + "step": 270 + }, + { + "epoch": 0.2177742193755004, + "grad_norm": 2.539903163909912, + "learning_rate": 8.967688185652527e-06, + "loss": 0.2374, + "step": 272 + }, + { + "epoch": 0.21937550040032025, + "grad_norm": 4.212851524353027, + "learning_rate": 9.023305597098526e-06, + "loss": 0.3754, + "step": 274 + }, + { + "epoch": 0.2209767814251401, + "grad_norm": 1.3846100568771362, + "learning_rate": 9.078953528555258e-06, + "loss": 0.3015, + "step": 276 + }, + { + "epoch": 0.22257806244995998, + "grad_norm": 1.1559629440307617, + "learning_rate": 9.134630241121135e-06, + "loss": 0.2261, + "step": 278 + }, + { + "epoch": 0.22417934347477983, + "grad_norm": 1.824791669845581, + "learning_rate": 9.190333994995208e-06, + "loss": 0.3328, + "step": 280 + }, + { + "epoch": 0.22578062449959968, + "grad_norm": 1.8389813899993896, + "learning_rate": 9.24606304953148e-06, + "loss": 0.3185, + "step": 282 + }, + { + "epoch": 0.22738190552441953, + "grad_norm": 1.5443767309188843, + "learning_rate": 9.301815663293426e-06, + "loss": 0.1946, + "step": 284 + }, + { + "epoch": 0.22898318654923938, + "grad_norm": 1.332728624343872, + "learning_rate": 9.35759009410826e-06, + "loss": 0.3348, + "step": 286 + }, + { + "epoch": 0.23058446757405926, + "grad_norm": 1.4865375757217407, + "learning_rate": 9.41338459912151e-06, + "loss": 0.2382, + "step": 288 + }, + { + "epoch": 0.2321857485988791, + "grad_norm": 1.639641523361206, + "learning_rate": 9.469197434851414e-06, + "loss": 0.4181, + "step": 290 + }, + { + "epoch": 0.23378702962369896, + "grad_norm": 1.754004955291748, + "learning_rate": 9.52502685724336e-06, + "loss": 0.1955, + "step": 292 + }, + { + "epoch": 0.2353883106485188, + "grad_norm": 1.878404974937439, + "learning_rate": 9.580871121724498e-06, + "loss": 0.2628, + "step": 294 + }, + { + "epoch": 0.23698959167333866, + "grad_norm": 2.0904641151428223, + "learning_rate": 9.636728483258116e-06, + "loss": 0.2504, + "step": 296 + }, + { + "epoch": 0.23859087269815854, + "grad_norm": 2.481448173522949, + "learning_rate": 9.692597196398302e-06, + "loss": 0.3345, + "step": 298 + }, + { + "epoch": 0.2401921537229784, + "grad_norm": 1.5625683069229126, + "learning_rate": 9.748475515344416e-06, + "loss": 0.3215, + "step": 300 + }, + { + "epoch": 0.24179343474779824, + "grad_norm": 3.5054006576538086, + "learning_rate": 9.80436169399561e-06, + "loss": 0.4917, + "step": 302 + }, + { + "epoch": 0.2433947157726181, + "grad_norm": 2.459205389022827, + "learning_rate": 9.8602539860055e-06, + "loss": 0.3187, + "step": 304 + }, + { + "epoch": 0.24499599679743794, + "grad_norm": 3.288682699203491, + "learning_rate": 9.916150644836596e-06, + "loss": 0.3489, + "step": 306 + }, + { + "epoch": 0.2465972778222578, + "grad_norm": 1.197472333908081, + "learning_rate": 9.972049923815011e-06, + "loss": 0.3139, + "step": 308 + }, + { + "epoch": 0.24819855884707767, + "grad_norm": 4.154095649719238, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.3046, + "step": 310 + }, + { + "epoch": 0.24979983987189752, + "grad_norm": 2.2775681018829346, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.3641, + "step": 312 + }, + { + "epoch": 0.2514011208967174, + "grad_norm": 1.5846518278121948, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.3347, + "step": 314 + }, + { + "epoch": 0.2530024019215372, + "grad_norm": 2.296945571899414, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.2021, + "step": 316 + }, + { + "epoch": 0.2546036829463571, + "grad_norm": 2.9808125495910645, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.3408, + "step": 318 + }, + { + "epoch": 0.2562049639711769, + "grad_norm": 2.628178596496582, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.384, + "step": 320 + }, + { + "epoch": 0.2578062449959968, + "grad_norm": 1.5159741640090942, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.2625, + "step": 322 + }, + { + "epoch": 0.2594075260208166, + "grad_norm": 1.4917831420898438, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.3291, + "step": 324 + }, + { + "epoch": 0.2610088070456365, + "grad_norm": 1.811463713645935, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.2711, + "step": 326 + }, + { + "epoch": 0.2626100880704564, + "grad_norm": 5.573718547821045, + "learning_rate": 1.053080256514858e-05, + "loss": 0.3443, + "step": 328 + }, + { + "epoch": 0.2642113690952762, + "grad_norm": 2.441801071166992, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.4087, + "step": 330 + }, + { + "epoch": 0.2658126501200961, + "grad_norm": 2.6247429847717285, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.532, + "step": 332 + }, + { + "epoch": 0.2674139311449159, + "grad_norm": 4.646184921264648, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.6012, + "step": 334 + }, + { + "epoch": 0.2690152121697358, + "grad_norm": 2.1405415534973145, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.3067, + "step": 336 + }, + { + "epoch": 0.27061649319455566, + "grad_norm": 0.41845548152923584, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.1388, + "step": 338 + }, + { + "epoch": 0.2722177742193755, + "grad_norm": 1.854414463043213, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.415, + "step": 340 + }, + { + "epoch": 0.27381905524419536, + "grad_norm": 2.5001614093780518, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.4508, + "step": 342 + }, + { + "epoch": 0.2754203362690152, + "grad_norm": 1.6296179294586182, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.3249, + "step": 344 + }, + { + "epoch": 0.27702161729383507, + "grad_norm": 1.378982663154602, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.3347, + "step": 346 + }, + { + "epoch": 0.27862289831865494, + "grad_norm": 1.6568927764892578, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.3642, + "step": 348 + }, + { + "epoch": 0.28022417934347477, + "grad_norm": 2.3074300289154053, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.2778, + "step": 350 + }, + { + "epoch": 0.28182546036829464, + "grad_norm": 2.07737135887146, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.2969, + "step": 352 + }, + { + "epoch": 0.28342674139311447, + "grad_norm": 1.7472927570343018, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.2224, + "step": 354 + }, + { + "epoch": 0.28502802241793435, + "grad_norm": 3.047524929046631, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.3759, + "step": 356 + }, + { + "epoch": 0.2866293034427542, + "grad_norm": 2.331300973892212, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.3265, + "step": 358 + }, + { + "epoch": 0.28823058446757405, + "grad_norm": 1.4776637554168701, + "learning_rate": 1.142063328038864e-05, + "loss": 0.3185, + "step": 360 + }, + { + "epoch": 0.2898318654923939, + "grad_norm": 2.6258203983306885, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.4027, + "step": 362 + }, + { + "epoch": 0.29143314651721375, + "grad_norm": 1.3408443927764893, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.186, + "step": 364 + }, + { + "epoch": 0.2930344275420336, + "grad_norm": 1.940138578414917, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.2779, + "step": 366 + }, + { + "epoch": 0.2946357085668535, + "grad_norm": 3.022674322128296, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.3951, + "step": 368 + }, + { + "epoch": 0.2962369895916733, + "grad_norm": 1.685876727104187, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.5109, + "step": 370 + }, + { + "epoch": 0.2978382706164932, + "grad_norm": 1.6997098922729492, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.4209, + "step": 372 + }, + { + "epoch": 0.29943955164131303, + "grad_norm": 1.7384082078933716, + "learning_rate": 1.180677979399721e-05, + "loss": 0.3054, + "step": 374 + }, + { + "epoch": 0.3010408326661329, + "grad_norm": 1.675770878791809, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.3972, + "step": 376 + }, + { + "epoch": 0.3026421136909528, + "grad_norm": 1.5920178890228271, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.3658, + "step": 378 + }, + { + "epoch": 0.3042433947157726, + "grad_norm": 1.48033607006073, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.348, + "step": 380 + }, + { + "epoch": 0.3058446757405925, + "grad_norm": 2.6713180541992188, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.3658, + "step": 382 + }, + { + "epoch": 0.3074459567654123, + "grad_norm": 2.8610692024230957, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.3087, + "step": 384 + }, + { + "epoch": 0.3090472377902322, + "grad_norm": 1.8404741287231445, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.3348, + "step": 386 + }, + { + "epoch": 0.31064851881505207, + "grad_norm": 1.4327110052108765, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.2581, + "step": 388 + }, + { + "epoch": 0.3122497998398719, + "grad_norm": 1.7647172212600708, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.3489, + "step": 390 + }, + { + "epoch": 0.31385108086469177, + "grad_norm": 2.0167479515075684, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.3972, + "step": 392 + }, + { + "epoch": 0.3154523618895116, + "grad_norm": 2.173624038696289, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.323, + "step": 394 + }, + { + "epoch": 0.31705364291433147, + "grad_norm": 2.08650803565979, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.3972, + "step": 396 + }, + { + "epoch": 0.31865492393915135, + "grad_norm": 1.8034814596176147, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.286, + "step": 398 + }, + { + "epoch": 0.32025620496397117, + "grad_norm": 2.0325725078582764, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.3662, + "step": 400 + }, + { + "epoch": 0.32185748598879105, + "grad_norm": 1.9155844449996948, + "learning_rate": 1.257018747790238e-05, + "loss": 0.3032, + "step": 402 + }, + { + "epoch": 0.32345876701361087, + "grad_norm": 2.3540198802948, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.3843, + "step": 404 + }, + { + "epoch": 0.32506004803843075, + "grad_norm": 1.9836889505386353, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.3641, + "step": 406 + }, + { + "epoch": 0.3266613290632506, + "grad_norm": 2.676269769668579, + "learning_rate": 1.273188548537736e-05, + "loss": 0.3994, + "step": 408 + }, + { + "epoch": 0.32826261008807045, + "grad_norm": 1.7180349826812744, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.3641, + "step": 410 + }, + { + "epoch": 0.32986389111289033, + "grad_norm": 1.5088156461715698, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.3057, + "step": 412 + }, + { + "epoch": 0.33146517213771015, + "grad_norm": 2.0771749019622803, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.3347, + "step": 414 + }, + { + "epoch": 0.33306645316253003, + "grad_norm": 1.9146101474761963, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.3325, + "step": 416 + }, + { + "epoch": 0.33466773418734985, + "grad_norm": 1.9532804489135742, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.3639, + "step": 418 + }, + { + "epoch": 0.33626901521216973, + "grad_norm": 1.751731038093567, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.3185, + "step": 420 + }, + { + "epoch": 0.3378702962369896, + "grad_norm": 2.0606656074523926, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.3517, + "step": 422 + }, + { + "epoch": 0.33947157726180943, + "grad_norm": 2.1029534339904785, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.3662, + "step": 424 + }, + { + "epoch": 0.3410728582866293, + "grad_norm": 1.616235613822937, + "learning_rate": 1.321218899888334e-05, + "loss": 0.3518, + "step": 426 + }, + { + "epoch": 0.34267413931144913, + "grad_norm": 2.0449812412261963, + "learning_rate": 1.326507632670139e-05, + "loss": 0.3462, + "step": 428 + }, + { + "epoch": 0.344275420336269, + "grad_norm": 2.8831207752227783, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.3462, + "step": 430 + }, + { + "epoch": 0.3458767013610889, + "grad_norm": 2.1896278858184814, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.3801, + "step": 432 + }, + { + "epoch": 0.3474779823859087, + "grad_norm": 2.166494131088257, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.3802, + "step": 434 + }, + { + "epoch": 0.3490792634107286, + "grad_norm": 1.8489634990692139, + "learning_rate": 1.347558887985279e-05, + "loss": 0.3478, + "step": 436 + }, + { + "epoch": 0.3506805444355484, + "grad_norm": 1.8106091022491455, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.3055, + "step": 438 + }, + { + "epoch": 0.3522818254603683, + "grad_norm": 1.8300484418869019, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.349, + "step": 440 + }, + { + "epoch": 0.35388310648518817, + "grad_norm": 2.0066466331481934, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.3479, + "step": 442 + }, + { + "epoch": 0.355484387510008, + "grad_norm": 2.238502025604248, + "learning_rate": 1.368436380262336e-05, + "loss": 0.3975, + "step": 444 + }, + { + "epoch": 0.35708566853482787, + "grad_norm": 2.0213420391082764, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.3481, + "step": 446 + }, + { + "epoch": 0.3586869495596477, + "grad_norm": 2.09098482131958, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.5591, + "step": 448 + }, + { + "epoch": 0.3602882305844676, + "grad_norm": 2.1597900390625, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.3802, + "step": 450 + }, + { + "epoch": 0.36188951160928745, + "grad_norm": 2.41070294380188, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.3488, + "step": 452 + }, + { + "epoch": 0.3634907926341073, + "grad_norm": 1.8809187412261963, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.2781, + "step": 454 + }, + { + "epoch": 0.36509207365892715, + "grad_norm": 2.5464258193969727, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.3699, + "step": 456 + }, + { + "epoch": 0.366693354683747, + "grad_norm": 3.266629934310913, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.3956, + "step": 458 + }, + { + "epoch": 0.36829463570856685, + "grad_norm": 1.642899513244629, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.259, + "step": 460 + }, + { + "epoch": 0.36989591673338673, + "grad_norm": 6.459237098693848, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.4653, + "step": 462 + }, + { + "epoch": 0.37149719775820655, + "grad_norm": 1.8249849081039429, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.3348, + "step": 464 + }, + { + "epoch": 0.37309847878302643, + "grad_norm": 2.1837408542633057, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.3217, + "step": 466 + }, + { + "epoch": 0.37469975980784626, + "grad_norm": 1.8836848735809326, + "learning_rate": 1.429922366889332e-05, + "loss": 0.333, + "step": 468 + }, + { + "epoch": 0.37630104083266613, + "grad_norm": 1.833238124847412, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.3481, + "step": 470 + }, + { + "epoch": 0.377902321857486, + "grad_norm": 1.6237941980361938, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.3056, + "step": 472 + }, + { + "epoch": 0.37950360288230583, + "grad_norm": 2.040926933288574, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.4713, + "step": 474 + }, + { + "epoch": 0.3811048839071257, + "grad_norm": 1.970820665359497, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.4153, + "step": 476 + }, + { + "epoch": 0.38270616493194554, + "grad_norm": 1.77826726436615, + "learning_rate": 1.454986365109255e-05, + "loss": 0.3482, + "step": 478 + }, + { + "epoch": 0.3843074459567654, + "grad_norm": 1.71432626247406, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.3328, + "step": 480 + }, + { + "epoch": 0.3859087269815853, + "grad_norm": 1.879286766052246, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.3482, + "step": 482 + }, + { + "epoch": 0.3875100080064051, + "grad_norm": 2.670933961868286, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.4088, + "step": 484 + }, + { + "epoch": 0.389111289031225, + "grad_norm": 2.2173593044281006, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.4325, + "step": 486 + }, + { + "epoch": 0.3907125700560448, + "grad_norm": 2.0346882343292236, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.3387, + "step": 488 + }, + { + "epoch": 0.3923138510808647, + "grad_norm": 1.1058213710784912, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.2241, + "step": 490 + }, + { + "epoch": 0.3939151321056846, + "grad_norm": 1.937864899635315, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.3189, + "step": 492 + }, + { + "epoch": 0.3955164131305044, + "grad_norm": 1.9153870344161987, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.3178, + "step": 494 + }, + { + "epoch": 0.3971176941553243, + "grad_norm": 1.8156814575195312, + "learning_rate": 1.499192932863305e-05, + "loss": 0.3802, + "step": 496 + }, + { + "epoch": 0.3987189751801441, + "grad_norm": 1.5624991655349731, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.3842, + "step": 498 + }, + { + "epoch": 0.400320256204964, + "grad_norm": 1.6995452642440796, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.3643, + "step": 500 + }, + { + "epoch": 0.40192153722978385, + "grad_norm": 2.1517858505249023, + "learning_rate": 1.513653168406076e-05, + "loss": 0.352, + "step": 502 + }, + { + "epoch": 0.4035228182546037, + "grad_norm": 1.6353377103805542, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.3643, + "step": 504 + }, + { + "epoch": 0.40512409927942356, + "grad_norm": 1.63137686252594, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.3036, + "step": 506 + }, + { + "epoch": 0.4067253803042434, + "grad_norm": 1.6981267929077148, + "learning_rate": 1.527968950000533e-05, + "loss": 0.3331, + "step": 508 + }, + { + "epoch": 0.40832666132906326, + "grad_norm": 1.8865103721618652, + "learning_rate": 1.532708079276185e-05, + "loss": 0.348, + "step": 510 + }, + { + "epoch": 0.4099279423538831, + "grad_norm": 1.747130036354065, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.3804, + "step": 512 + }, + { + "epoch": 0.41152922337870296, + "grad_norm": 1.7363591194152832, + "learning_rate": 1.542136251639826e-05, + "loss": 0.348, + "step": 514 + }, + { + "epoch": 0.41313050440352284, + "grad_norm": 1.7197147607803345, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.3177, + "step": 516 + }, + { + "epoch": 0.41473178542834266, + "grad_norm": 1.5683311223983765, + "learning_rate": 1.551496661252208e-05, + "loss": 0.3386, + "step": 518 + }, + { + "epoch": 0.41633306645316254, + "grad_norm": 1.5975334644317627, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.3093, + "step": 520 + }, + { + "epoch": 0.41793434747798236, + "grad_norm": 1.6663936376571655, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.333, + "step": 522 + }, + { + "epoch": 0.41953562850280224, + "grad_norm": 1.9398167133331299, + "learning_rate": 1.565407663538797e-05, + "loss": 0.3662, + "step": 524 + }, + { + "epoch": 0.4211369095276221, + "grad_norm": 1.7309986352920532, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.225, + "step": 526 + }, + { + "epoch": 0.42273819055244194, + "grad_norm": 1.6755869388580322, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.3481, + "step": 528 + }, + { + "epoch": 0.4243394715772618, + "grad_norm": 1.6270941495895386, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.3361, + "step": 530 + }, + { + "epoch": 0.42594075260208164, + "grad_norm": 1.9306557178497314, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.3489, + "step": 532 + }, + { + "epoch": 0.4275420336269015, + "grad_norm": 1.2436866760253906, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.2426, + "step": 534 + }, + { + "epoch": 0.4291433146517214, + "grad_norm": 1.7200788259506226, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.3217, + "step": 536 + }, + { + "epoch": 0.4307445956765412, + "grad_norm": 2.5357277393341064, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.4749, + "step": 538 + }, + { + "epoch": 0.4323458767013611, + "grad_norm": 1.9820090532302856, + "learning_rate": 1.60171580983152e-05, + "loss": 0.3974, + "step": 540 + }, + { + "epoch": 0.4339471577261809, + "grad_norm": 2.0953848361968994, + "learning_rate": 1.606171193642703e-05, + "loss": 0.3641, + "step": 542 + }, + { + "epoch": 0.4355484387510008, + "grad_norm": 2.016631841659546, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.3346, + "step": 544 + }, + { + "epoch": 0.4371497197758207, + "grad_norm": 1.38259756565094, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.2723, + "step": 546 + }, + { + "epoch": 0.4387510008006405, + "grad_norm": 1.9949777126312256, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.3973, + "step": 548 + }, + { + "epoch": 0.4403522818254604, + "grad_norm": 2.0576248168945312, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.3329, + "step": 550 + }, + { + "epoch": 0.4419535628502802, + "grad_norm": 1.6442257165908813, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.3813, + "step": 552 + }, + { + "epoch": 0.4435548438751001, + "grad_norm": 1.9549113512039185, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.3973, + "step": 554 + }, + { + "epoch": 0.44515612489991996, + "grad_norm": 1.6894091367721558, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.3489, + "step": 556 + }, + { + "epoch": 0.4467574059247398, + "grad_norm": 3.2086870670318604, + "learning_rate": 1.641120789935263e-05, + "loss": 0.3756, + "step": 558 + }, + { + "epoch": 0.44835868694955966, + "grad_norm": 2.638944625854492, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.3349, + "step": 560 + }, + { + "epoch": 0.4499599679743795, + "grad_norm": 1.3562051057815552, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.2867, + "step": 562 + }, + { + "epoch": 0.45156124899919936, + "grad_norm": 1.2741633653640747, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.3073, + "step": 564 + }, + { + "epoch": 0.45316253002401924, + "grad_norm": 2.7496259212493896, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.3518, + "step": 566 + }, + { + "epoch": 0.45476381104883906, + "grad_norm": 1.6119768619537354, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.3643, + "step": 568 + }, + { + "epoch": 0.45636509207365894, + "grad_norm": 3.486994743347168, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.3974, + "step": 570 + }, + { + "epoch": 0.45796637309847876, + "grad_norm": 1.9129976034164429, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.3347, + "step": 572 + }, + { + "epoch": 0.45956765412329864, + "grad_norm": 2.300387144088745, + "learning_rate": 1.674788425949818e-05, + "loss": 0.3351, + "step": 574 + }, + { + "epoch": 0.4611689351481185, + "grad_norm": 1.5266647338867188, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.2817, + "step": 576 + }, + { + "epoch": 0.46277021617293834, + "grad_norm": 1.4700289964675903, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.4145, + "step": 578 + }, + { + "epoch": 0.4643714971977582, + "grad_norm": 1.954932689666748, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.2528, + "step": 580 + }, + { + "epoch": 0.46597277822257804, + "grad_norm": 2.3609018325805664, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.3814, + "step": 582 + }, + { + "epoch": 0.4675740592473979, + "grad_norm": 3.7101941108703613, + "learning_rate": 1.695149679205214e-05, + "loss": 0.3635, + "step": 584 + }, + { + "epoch": 0.4691753402722178, + "grad_norm": 1.639758825302124, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.403, + "step": 586 + }, + { + "epoch": 0.4707766212970376, + "grad_norm": 2.006375312805176, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.4144, + "step": 588 + }, + { + "epoch": 0.4723779023218575, + "grad_norm": 1.8549580574035645, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.4154, + "step": 590 + }, + { + "epoch": 0.4739791833466773, + "grad_norm": 1.7956687211990356, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.333, + "step": 592 + }, + { + "epoch": 0.4755804643714972, + "grad_norm": 1.839286208152771, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.3804, + "step": 594 + }, + { + "epoch": 0.4771817453963171, + "grad_norm": 2.663442611694336, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.4696, + "step": 596 + }, + { + "epoch": 0.4787830264211369, + "grad_norm": 3.3117480278015137, + "learning_rate": 1.722739675011779e-05, + "loss": 0.3941, + "step": 598 + }, + { + "epoch": 0.4803843074459568, + "grad_norm": 2.152716875076294, + "learning_rate": 1.726591741122981e-05, + "loss": 0.3974, + "step": 600 + }, + { + "epoch": 0.4819855884707766, + "grad_norm": 1.8596621751785278, + "learning_rate": 1.730421102499021e-05, + "loss": 0.2875, + "step": 602 + }, + { + "epoch": 0.4835868694955965, + "grad_norm": 1.9682468175888062, + "learning_rate": 1.734227639478982e-05, + "loss": 0.3177, + "step": 604 + }, + { + "epoch": 0.4851881505204163, + "grad_norm": 2.0866036415100098, + "learning_rate": 1.738011233115165e-05, + "loss": 0.3802, + "step": 606 + }, + { + "epoch": 0.4867894315452362, + "grad_norm": 1.8462809324264526, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.4154, + "step": 608 + }, + { + "epoch": 0.48839071257005606, + "grad_norm": 2.0835726261138916, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.4697, + "step": 610 + }, + { + "epoch": 0.4899919935948759, + "grad_norm": 1.5900888442993164, + "learning_rate": 1.74922317526033e-05, + "loss": 0.2894, + "step": 612 + }, + { + "epoch": 0.49159327461969576, + "grad_norm": 1.7028789520263672, + "learning_rate": 1.752913820438519e-05, + "loss": 0.3482, + "step": 614 + }, + { + "epoch": 0.4931945556445156, + "grad_norm": 1.3874263763427734, + "learning_rate": 1.756580938362096e-05, + "loss": 0.2728, + "step": 616 + }, + { + "epoch": 0.49479583666933546, + "grad_norm": 2.1336591243743896, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.4156, + "step": 618 + }, + { + "epoch": 0.49639711769415534, + "grad_norm": 1.832646369934082, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.1536, + "step": 620 + }, + { + "epoch": 0.49799839871897517, + "grad_norm": 2.6830151081085205, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.3723, + "step": 622 + }, + { + "epoch": 0.49959967974379504, + "grad_norm": 2.5655150413513184, + "learning_rate": 1.771011856791273e-05, + "loss": 0.4157, + "step": 624 + }, + { + "epoch": 0.5012009607686149, + "grad_norm": 1.8397849798202515, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.3224, + "step": 626 + }, + { + "epoch": 0.5028022417934348, + "grad_norm": 2.302499771118164, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.3662, + "step": 628 + }, + { + "epoch": 0.5044035228182546, + "grad_norm": 1.7506077289581299, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.3382, + "step": 630 + }, + { + "epoch": 0.5060048038430744, + "grad_norm": 2.0748207569122314, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.4148, + "step": 632 + }, + { + "epoch": 0.5076060848678943, + "grad_norm": 1.938187599182129, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.3483, + "step": 634 + }, + { + "epoch": 0.5092073658927142, + "grad_norm": 2.0107624530792236, + "learning_rate": 1.791933277039679e-05, + "loss": 0.3974, + "step": 636 + }, + { + "epoch": 0.510808646917534, + "grad_norm": 1.8011302947998047, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.282, + "step": 638 + }, + { + "epoch": 0.5124099279423538, + "grad_norm": 1.9910082817077637, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.3975, + "step": 640 + }, + { + "epoch": 0.5140112089671738, + "grad_norm": 2.447939157485962, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.3057, + "step": 642 + }, + { + "epoch": 0.5156124899919936, + "grad_norm": 2.5724034309387207, + "learning_rate": 1.805387416454847e-05, + "loss": 0.3331, + "step": 644 + }, + { + "epoch": 0.5172137710168134, + "grad_norm": 2.0866804122924805, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.3152, + "step": 646 + }, + { + "epoch": 0.5188150520416333, + "grad_norm": 2.240696668624878, + "learning_rate": 1.811963902855447e-05, + "loss": 0.3522, + "step": 648 + }, + { + "epoch": 0.5204163330664532, + "grad_norm": 3.908458709716797, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.459, + "step": 650 + }, + { + "epoch": 0.522017614091273, + "grad_norm": 2.5555665493011475, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.4517, + "step": 652 + }, + { + "epoch": 0.5236188951160928, + "grad_norm": 2.213336706161499, + "learning_rate": 1.821638087024396e-05, + "loss": 0.3183, + "step": 654 + }, + { + "epoch": 0.5252201761409128, + "grad_norm": 1.903831124305725, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.2905, + "step": 656 + }, + { + "epoch": 0.5268214571657326, + "grad_norm": 2.098426103591919, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.3813, + "step": 658 + }, + { + "epoch": 0.5284227381905524, + "grad_norm": 2.0248429775238037, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.3974, + "step": 660 + }, + { + "epoch": 0.5300240192153723, + "grad_norm": 2.2284696102142334, + "learning_rate": 1.834177099078887e-05, + "loss": 0.3815, + "step": 662 + }, + { + "epoch": 0.5316253002401922, + "grad_norm": 2.517500162124634, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.3815, + "step": 664 + }, + { + "epoch": 0.533226581265012, + "grad_norm": 2.9114553928375244, + "learning_rate": 1.840290595364436e-05, + "loss": 0.4183, + "step": 666 + }, + { + "epoch": 0.5348278622898318, + "grad_norm": 2.6809699535369873, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.304, + "step": 668 + }, + { + "epoch": 0.5364291433146517, + "grad_norm": 1.4788970947265625, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.333, + "step": 670 + }, + { + "epoch": 0.5380304243394716, + "grad_norm": 1.604383111000061, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.3133, + "step": 672 + }, + { + "epoch": 0.5396317053642914, + "grad_norm": 1.5856913328170776, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.3097, + "step": 674 + }, + { + "epoch": 0.5412329863891113, + "grad_norm": 1.7136253118515015, + "learning_rate": 1.855113191959808e-05, + "loss": 0.3803, + "step": 676 + }, + { + "epoch": 0.5428342674139311, + "grad_norm": 1.638028860092163, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.2629, + "step": 678 + }, + { + "epoch": 0.544435548438751, + "grad_norm": 1.5598770380020142, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.2783, + "step": 680 + }, + { + "epoch": 0.5460368294635709, + "grad_norm": 2.4856526851654053, + "learning_rate": 1.86368683969594e-05, + "loss": 0.5138, + "step": 682 + }, + { + "epoch": 0.5476381104883907, + "grad_norm": 4.384547710418701, + "learning_rate": 1.866490863067425e-05, + "loss": 0.2759, + "step": 684 + }, + { + "epoch": 0.5492393915132106, + "grad_norm": 1.7379145622253418, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.352, + "step": 686 + }, + { + "epoch": 0.5508406725380304, + "grad_norm": 1.5868635177612305, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.3759, + "step": 688 + }, + { + "epoch": 0.5524419535628503, + "grad_norm": 2.866621971130371, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.3953, + "step": 690 + }, + { + "epoch": 0.5540432345876701, + "grad_norm": 1.9210234880447388, + "learning_rate": 1.877435329644691e-05, + "loss": 0.4152, + "step": 692 + }, + { + "epoch": 0.55564451561249, + "grad_norm": 1.825788974761963, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.3847, + "step": 694 + }, + { + "epoch": 0.5572457966373099, + "grad_norm": 1.6110035181045532, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.3521, + "step": 696 + }, + { + "epoch": 0.5588470776621297, + "grad_norm": 2.214165210723877, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.2875, + "step": 698 + }, + { + "epoch": 0.5604483586869495, + "grad_norm": 2.398297071456909, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.3347, + "step": 700 + }, + { + "epoch": 0.5620496397117695, + "grad_norm": 2.625048875808716, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.3974, + "step": 702 + }, + { + "epoch": 0.5636509207365893, + "grad_norm": 2.24446702003479, + "learning_rate": 1.893027861533002e-05, + "loss": 0.4585, + "step": 704 + }, + { + "epoch": 0.5652522017614091, + "grad_norm": 2.4870846271514893, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.492, + "step": 706 + }, + { + "epoch": 0.5668534827862289, + "grad_norm": 2.0068306922912598, + "learning_rate": 1.898002981658886e-05, + "loss": 0.2931, + "step": 708 + }, + { + "epoch": 0.5684547638110489, + "grad_norm": 3.6644887924194336, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.5487, + "step": 710 + }, + { + "epoch": 0.5700560448358687, + "grad_norm": 2.7486441135406494, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.4921, + "step": 712 + }, + { + "epoch": 0.5716573258606885, + "grad_norm": 1.82902193069458, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.3351, + "step": 714 + }, + { + "epoch": 0.5732586068855084, + "grad_norm": 1.3474596738815308, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.3218, + "step": 716 + }, + { + "epoch": 0.5748598879103283, + "grad_norm": 1.4313018321990967, + "learning_rate": 1.909948391856829e-05, + "loss": 0.2906, + "step": 718 + }, + { + "epoch": 0.5764611689351481, + "grad_norm": 1.4171478748321533, + "learning_rate": 1.912252465159637e-05, + "loss": 0.322, + "step": 720 + }, + { + "epoch": 0.578062449959968, + "grad_norm": 1.4715297222137451, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.2641, + "step": 722 + }, + { + "epoch": 0.5796637309847879, + "grad_norm": 2.312265157699585, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.4548, + "step": 724 + }, + { + "epoch": 0.5812650120096077, + "grad_norm": 2.1883792877197266, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.4346, + "step": 726 + }, + { + "epoch": 0.5828662930344275, + "grad_norm": 1.583168625831604, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.3667, + "step": 728 + }, + { + "epoch": 0.5844675740592474, + "grad_norm": 1.949686050415039, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.3333, + "step": 730 + }, + { + "epoch": 0.5860688550840673, + "grad_norm": 1.8988748788833618, + "learning_rate": 1.925475814968719e-05, + "loss": 0.3978, + "step": 732 + }, + { + "epoch": 0.5876701361088871, + "grad_norm": 1.5637749433517456, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.2982, + "step": 734 + }, + { + "epoch": 0.589271417133707, + "grad_norm": 1.7998675107955933, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.3815, + "step": 736 + }, + { + "epoch": 0.5908726981585268, + "grad_norm": 1.5816408395767212, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.459, + "step": 738 + }, + { + "epoch": 0.5924739791833467, + "grad_norm": 1.5297447443008423, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.3493, + "step": 740 + }, + { + "epoch": 0.5940752602081665, + "grad_norm": 1.4742164611816406, + "learning_rate": 1.935700655008199e-05, + "loss": 0.2785, + "step": 742 + }, + { + "epoch": 0.5956765412329864, + "grad_norm": 1.2854877710342407, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.3188, + "step": 744 + }, + { + "epoch": 0.5972778222578062, + "grad_norm": 1.828426718711853, + "learning_rate": 1.939586358789602e-05, + "loss": 0.3391, + "step": 746 + }, + { + "epoch": 0.5988791032826261, + "grad_norm": 1.4127424955368042, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.2766, + "step": 748 + }, + { + "epoch": 0.600480384307446, + "grad_norm": 1.3020000457763672, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.2643, + "step": 750 + }, + { + "epoch": 0.6020816653322658, + "grad_norm": 2.2265963554382324, + "learning_rate": 1.945194565054276e-05, + "loss": 0.4516, + "step": 752 + }, + { + "epoch": 0.6036829463570856, + "grad_norm": 2.0862138271331787, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.4329, + "step": 754 + }, + { + "epoch": 0.6052842273819056, + "grad_norm": 1.4935728311538696, + "learning_rate": 1.948785788116329e-05, + "loss": 0.4233, + "step": 756 + }, + { + "epoch": 0.6068855084067254, + "grad_norm": 1.6998820304870605, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.4147, + "step": 758 + }, + { + "epoch": 0.6084867894315452, + "grad_norm": 1.4720284938812256, + "learning_rate": 1.952258420445583e-05, + "loss": 0.3102, + "step": 760 + }, + { + "epoch": 0.610088070456365, + "grad_norm": 1.6229115724563599, + "learning_rate": 1.953950128863762e-05, + "loss": 0.3483, + "step": 762 + }, + { + "epoch": 0.611689351481185, + "grad_norm": 2.1361851692199707, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.3702, + "step": 764 + }, + { + "epoch": 0.6132906325060048, + "grad_norm": 1.380447506904602, + "learning_rate": 1.957244065894066e-05, + "loss": 0.2728, + "step": 766 + }, + { + "epoch": 0.6148919135308246, + "grad_norm": 1.9240666627883911, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.3976, + "step": 768 + }, + { + "epoch": 0.6164931945556446, + "grad_norm": 1.5732346773147583, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.37, + "step": 770 + }, + { + "epoch": 0.6180944755804644, + "grad_norm": 1.779310703277588, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.3521, + "step": 772 + }, + { + "epoch": 0.6196957566052842, + "grad_norm": 1.9436887502670288, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.3664, + "step": 774 + }, + { + "epoch": 0.6212970376301041, + "grad_norm": 2.265984535217285, + "learning_rate": 1.964954584871995e-05, + "loss": 0.4149, + "step": 776 + }, + { + "epoch": 0.622898318654924, + "grad_norm": 1.8760462999343872, + "learning_rate": 1.966406417240872e-05, + "loss": 0.3332, + "step": 778 + }, + { + "epoch": 0.6244995996797438, + "grad_norm": 1.3171790838241577, + "learning_rate": 1.967828051080755e-05, + "loss": 0.3524, + "step": 780 + }, + { + "epoch": 0.6261008807045636, + "grad_norm": 1.8025842905044556, + "learning_rate": 1.969219441968046e-05, + "loss": 0.3805, + "step": 782 + }, + { + "epoch": 0.6277021617293835, + "grad_norm": 1.5282752513885498, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.3189, + "step": 784 + }, + { + "epoch": 0.6293034427542034, + "grad_norm": 0.933535635471344, + "learning_rate": 1.971911321917015e-05, + "loss": 0.2583, + "step": 786 + }, + { + "epoch": 0.6309047237790232, + "grad_norm": 2.7523796558380127, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.4591, + "step": 788 + }, + { + "epoch": 0.6325060048038431, + "grad_norm": 1.683789849281311, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.3189, + "step": 790 + }, + { + "epoch": 0.6341072858286629, + "grad_norm": 1.4451898336410522, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.3483, + "step": 792 + }, + { + "epoch": 0.6357085668534828, + "grad_norm": 1.322089672088623, + "learning_rate": 1.976930316809569e-05, + "loss": 0.4145, + "step": 794 + }, + { + "epoch": 0.6373098478783027, + "grad_norm": 2.0772879123687744, + "learning_rate": 1.978108842718768e-05, + "loss": 0.3714, + "step": 796 + }, + { + "epoch": 0.6389111289031225, + "grad_norm": 1.3334157466888428, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.304, + "step": 798 + }, + { + "epoch": 0.6405124099279423, + "grad_norm": 1.7107504606246948, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.253, + "step": 800 + }, + { + "epoch": 0.6421136909527622, + "grad_norm": 2.3833019733428955, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.4508, + "step": 802 + }, + { + "epoch": 0.6437149719775821, + "grad_norm": 1.5616796016693115, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.4345, + "step": 804 + }, + { + "epoch": 0.6453162530024019, + "grad_norm": 1.7232791185379028, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.2764, + "step": 806 + }, + { + "epoch": 0.6469175340272217, + "grad_norm": 2.776564598083496, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.3845, + "step": 808 + }, + { + "epoch": 0.6485188150520417, + "grad_norm": 2.731919050216675, + "learning_rate": 1.985500784388244e-05, + "loss": 0.4158, + "step": 810 + }, + { + "epoch": 0.6501200960768615, + "grad_norm": 1.7926528453826904, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.3483, + "step": 812 + }, + { + "epoch": 0.6517213771016813, + "grad_norm": 1.4568711519241333, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.3666, + "step": 814 + }, + { + "epoch": 0.6533226581265013, + "grad_norm": 1.5340912342071533, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.3333, + "step": 816 + }, + { + "epoch": 0.6549239391513211, + "grad_norm": 1.7231264114379883, + "learning_rate": 1.989047972245129e-05, + "loss": 0.3525, + "step": 818 + }, + { + "epoch": 0.6565252201761409, + "grad_norm": 2.0017316341400146, + "learning_rate": 1.989857570980049e-05, + "loss": 0.2953, + "step": 820 + }, + { + "epoch": 0.6581265012009607, + "grad_norm": 1.9696844816207886, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.3148, + "step": 822 + }, + { + "epoch": 0.6597277822257807, + "grad_norm": 1.5798091888427734, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.3666, + "step": 824 + }, + { + "epoch": 0.6613290632506005, + "grad_norm": 2.298478126525879, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.3807, + "step": 826 + }, + { + "epoch": 0.6629303442754203, + "grad_norm": 1.048483967781067, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.2381, + "step": 828 + }, + { + "epoch": 0.6645316253002402, + "grad_norm": 2.1363868713378906, + "learning_rate": 1.99344112247369e-05, + "loss": 0.4216, + "step": 830 + }, + { + "epoch": 0.6661329063250601, + "grad_norm": 2.262768507003784, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.4517, + "step": 832 + }, + { + "epoch": 0.6677341873498799, + "grad_norm": 1.1426008939743042, + "learning_rate": 1.994657389848176e-05, + "loss": 0.2718, + "step": 834 + }, + { + "epoch": 0.6693354683746997, + "grad_norm": 1.3310153484344482, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.3038, + "step": 836 + }, + { + "epoch": 0.6709367493995196, + "grad_norm": 1.5684987306594849, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.3663, + "step": 838 + }, + { + "epoch": 0.6725380304243395, + "grad_norm": 1.5685181617736816, + "learning_rate": 1.996248639549475e-05, + "loss": 0.2959, + "step": 840 + }, + { + "epoch": 0.6741393114491593, + "grad_norm": 1.9755942821502686, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.4326, + "step": 842 + }, + { + "epoch": 0.6757405924739792, + "grad_norm": 1.5409562587738037, + "learning_rate": 1.997153845074662e-05, + "loss": 0.2934, + "step": 844 + }, + { + "epoch": 0.677341873498799, + "grad_norm": 1.837429165840149, + "learning_rate": 1.997559715666073e-05, + "loss": 0.3814, + "step": 846 + }, + { + "epoch": 0.6789431545236189, + "grad_norm": 1.7441240549087524, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.2562, + "step": 848 + }, + { + "epoch": 0.6805444355484388, + "grad_norm": 1.6502047777175903, + "learning_rate": 1.998277929093157e-05, + "loss": 0.3492, + "step": 850 + }, + { + "epoch": 0.6821457165732586, + "grad_norm": 1.5168346166610718, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.339, + "step": 852 + }, + { + "epoch": 0.6837469975980784, + "grad_norm": 2.801290273666382, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.4888, + "step": 854 + }, + { + "epoch": 0.6853482786228983, + "grad_norm": 1.5909329652786255, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.4034, + "step": 856 + }, + { + "epoch": 0.6869495596477182, + "grad_norm": 1.4478493928909302, + "learning_rate": 1.999339951193407e-05, + "loss": 0.2871, + "step": 858 + }, + { + "epoch": 0.688550840672538, + "grad_norm": 1.8693878650665283, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.4146, + "step": 860 + }, + { + "epoch": 0.6901521216973578, + "grad_norm": 3.367631673812866, + "learning_rate": 1.999683627122195e-05, + "loss": 0.613, + "step": 862 + }, + { + "epoch": 0.6917534027221778, + "grad_norm": 2.220822811126709, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.4156, + "step": 864 + }, + { + "epoch": 0.6933546837469976, + "grad_norm": 2.7891247272491455, + "learning_rate": 1.99990235049015e-05, + "loss": 0.3493, + "step": 866 + }, + { + "epoch": 0.6949559647718174, + "grad_norm": 1.5803334712982178, + "learning_rate": 1.999964845810285e-05, + "loss": 0.318, + "step": 868 + }, + { + "epoch": 0.6965572457966374, + "grad_norm": 1.7391237020492554, + "learning_rate": 1.999996093958578e-05, + "loss": 0.3351, + "step": 870 + }, + { + "epoch": 0.6981585268214572, + "grad_norm": 1.6775270700454712, + "learning_rate": 1.999996093958578e-05, + "loss": 0.3189, + "step": 872 + }, + { + "epoch": 0.699759807846277, + "grad_norm": 2.3344333171844482, + "learning_rate": 1.999964845810285e-05, + "loss": 0.3997, + "step": 874 + }, + { + "epoch": 0.7013610888710968, + "grad_norm": 1.4875624179840088, + "learning_rate": 1.99990235049015e-05, + "loss": 0.3059, + "step": 876 + }, + { + "epoch": 0.7029623698959168, + "grad_norm": 1.5839314460754395, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.318, + "step": 878 + }, + { + "epoch": 0.7045636509207366, + "grad_norm": 1.9073481559753418, + "learning_rate": 1.999683627122195e-05, + "loss": 0.3295, + "step": 880 + }, + { + "epoch": 0.7061649319455564, + "grad_norm": 2.5535762310028076, + "learning_rate": 1.999527405909102e-05, + "loss": 0.4184, + "step": 882 + }, + { + "epoch": 0.7077662129703763, + "grad_norm": 1.6383652687072754, + "learning_rate": 1.999339951193407e-05, + "loss": 0.2621, + "step": 884 + }, + { + "epoch": 0.7093674939951962, + "grad_norm": 2.295819044113159, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.5087, + "step": 886 + }, + { + "epoch": 0.710968775020016, + "grad_norm": 1.5624088048934937, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.3057, + "step": 888 + }, + { + "epoch": 0.7125700560448359, + "grad_norm": 1.7825961112976074, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.3493, + "step": 890 + }, + { + "epoch": 0.7141713370696557, + "grad_norm": 2.017639398574829, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.3664, + "step": 892 + }, + { + "epoch": 0.7157726180944756, + "grad_norm": 2.172041177749634, + "learning_rate": 1.997934414241799e-05, + "loss": 0.3997, + "step": 894 + }, + { + "epoch": 0.7173738991192954, + "grad_norm": 2.1997971534729004, + "learning_rate": 1.997559715666073e-05, + "loss": 0.4147, + "step": 896 + }, + { + "epoch": 0.7189751801441153, + "grad_norm": 1.6276147365570068, + "learning_rate": 1.997153845074662e-05, + "loss": 0.3039, + "step": 898 + }, + { + "epoch": 0.7205764611689351, + "grad_norm": 1.693739891052246, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.3039, + "step": 900 + }, + { + "epoch": 0.722177742193755, + "grad_norm": 1.9625521898269653, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.3644, + "step": 902 + }, + { + "epoch": 0.7237790232185749, + "grad_norm": 1.9005026817321777, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.3334, + "step": 904 + }, + { + "epoch": 0.7253803042433947, + "grad_norm": 2.004976272583008, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.3331, + "step": 906 + }, + { + "epoch": 0.7269815852682145, + "grad_norm": 1.966732144355774, + "learning_rate": 1.994657389848176e-05, + "loss": 0.3842, + "step": 908 + }, + { + "epoch": 0.7285828662930345, + "grad_norm": 1.8985217809677124, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.3036, + "step": 910 + }, + { + "epoch": 0.7301841473178543, + "grad_norm": 1.49187171459198, + "learning_rate": 1.99344112247369e-05, + "loss": 0.198, + "step": 912 + }, + { + "epoch": 0.7317854283426741, + "grad_norm": 1.7424341440200806, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.3834, + "step": 914 + }, + { + "epoch": 0.733386709367494, + "grad_norm": 1.7833532094955444, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.3188, + "step": 916 + }, + { + "epoch": 0.7349879903923139, + "grad_norm": 1.587798833847046, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.352, + "step": 918 + }, + { + "epoch": 0.7365892714171337, + "grad_norm": 2.072819471359253, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.3805, + "step": 920 + }, + { + "epoch": 0.7381905524419535, + "grad_norm": 1.9230507612228394, + "learning_rate": 1.989857570980049e-05, + "loss": 0.3296, + "step": 922 + }, + { + "epoch": 0.7397918334667735, + "grad_norm": 1.6681948900222778, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.3482, + "step": 924 + }, + { + "epoch": 0.7413931144915933, + "grad_norm": 2.014960527420044, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.3332, + "step": 926 + }, + { + "epoch": 0.7429943955164131, + "grad_norm": 1.9558994770050049, + "learning_rate": 1.987336082924333e-05, + "loss": 0.4345, + "step": 928 + }, + { + "epoch": 0.7445956765412329, + "grad_norm": 1.8306491374969482, + "learning_rate": 1.986433845832037e-05, + "loss": 0.3994, + "step": 930 + }, + { + "epoch": 0.7461969575660529, + "grad_norm": 1.7769701480865479, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.3643, + "step": 932 + }, + { + "epoch": 0.7477982385908727, + "grad_norm": 1.9312670230865479, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.333, + "step": 934 + }, + { + "epoch": 0.7493995196156925, + "grad_norm": 1.8680124282836914, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.3642, + "step": 936 + }, + { + "epoch": 0.7510008006405124, + "grad_norm": 2.0867233276367188, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.3662, + "step": 938 + }, + { + "epoch": 0.7526020816653323, + "grad_norm": 2.058663845062256, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.3185, + "step": 940 + }, + { + "epoch": 0.7542033626901521, + "grad_norm": 1.7908549308776855, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.3643, + "step": 942 + }, + { + "epoch": 0.755804643714972, + "grad_norm": 1.8367648124694824, + "learning_rate": 1.979256804418418e-05, + "loss": 0.3994, + "step": 944 + }, + { + "epoch": 0.7574059247397918, + "grad_norm": 1.7602410316467285, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.3501, + "step": 946 + }, + { + "epoch": 0.7590072057646117, + "grad_norm": 1.5048695802688599, + "learning_rate": 1.976930316809569e-05, + "loss": 0.3222, + "step": 948 + }, + { + "epoch": 0.7606084867894315, + "grad_norm": 1.5154637098312378, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.2632, + "step": 950 + }, + { + "epoch": 0.7622097678142514, + "grad_norm": 2.6485586166381836, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.4183, + "step": 952 + }, + { + "epoch": 0.7638110488390712, + "grad_norm": 2.3263063430786133, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.4887, + "step": 954 + }, + { + "epoch": 0.7654123298638911, + "grad_norm": 2.012033224105835, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.4327, + "step": 956 + }, + { + "epoch": 0.767013610888711, + "grad_norm": 1.6866446733474731, + "learning_rate": 1.970580546424186e-05, + "loss": 0.3388, + "step": 958 + }, + { + "epoch": 0.7686148919135308, + "grad_norm": 1.8290393352508545, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.3643, + "step": 960 + }, + { + "epoch": 0.7702161729383507, + "grad_norm": 1.7240321636199951, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.3349, + "step": 962 + }, + { + "epoch": 0.7718174539631706, + "grad_norm": 2.314815044403076, + "learning_rate": 1.966406417240872e-05, + "loss": 0.3332, + "step": 964 + }, + { + "epoch": 0.7734187349879904, + "grad_norm": 1.809171199798584, + "learning_rate": 1.964954584871995e-05, + "loss": 0.3977, + "step": 966 + }, + { + "epoch": 0.7750200160128102, + "grad_norm": 1.5390005111694336, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.2907, + "step": 968 + }, + { + "epoch": 0.77662129703763, + "grad_norm": 3.9524943828582764, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.349, + "step": 970 + }, + { + "epoch": 0.77822257806245, + "grad_norm": 1.6044213771820068, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.3483, + "step": 972 + }, + { + "epoch": 0.7798238590872698, + "grad_norm": 1.6362640857696533, + "learning_rate": 1.958846191576357e-05, + "loss": 0.3482, + "step": 974 + }, + { + "epoch": 0.7814251401120896, + "grad_norm": 2.248365640640259, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.3331, + "step": 976 + }, + { + "epoch": 0.7830264211369096, + "grad_norm": 1.749508261680603, + "learning_rate": 1.955612027990415e-05, + "loss": 0.3218, + "step": 978 + }, + { + "epoch": 0.7846277021617294, + "grad_norm": 1.6974258422851562, + "learning_rate": 1.953950128863763e-05, + "loss": 0.4029, + "step": 980 + }, + { + "epoch": 0.7862289831865492, + "grad_norm": 1.647813320159912, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.3036, + "step": 982 + }, + { + "epoch": 0.7878302642113691, + "grad_norm": 2.603489637374878, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.4183, + "step": 984 + }, + { + "epoch": 0.789431545236189, + "grad_norm": 2.957503080368042, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.4344, + "step": 986 + }, + { + "epoch": 0.7910328262610088, + "grad_norm": 1.9224066734313965, + "learning_rate": 1.947004972719008e-05, + "loss": 0.3803, + "step": 988 + }, + { + "epoch": 0.7926341072858286, + "grad_norm": 1.1030638217926025, + "learning_rate": 1.945194565054276e-05, + "loss": 0.2663, + "step": 990 + }, + { + "epoch": 0.7942353883106485, + "grad_norm": 2.2811684608459473, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.4155, + "step": 992 + }, + { + "epoch": 0.7958366693354684, + "grad_norm": 1.937842607498169, + "learning_rate": 1.941485200133955e-05, + "loss": 0.349, + "step": 994 + }, + { + "epoch": 0.7974379503602882, + "grad_norm": 1.824407935142517, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.3329, + "step": 996 + }, + { + "epoch": 0.7990392313851081, + "grad_norm": 5.219600200653076, + "learning_rate": 1.937658156996694e-05, + "loss": 0.5102, + "step": 998 + }, + { + "epoch": 0.800640512409928, + "grad_norm": 2.0986828804016113, + "learning_rate": 1.9357006550082e-05, + "loss": 0.4327, + "step": 1000 + }, + { + "epoch": 0.8022417934347478, + "grad_norm": 1.6727732419967651, + "learning_rate": 1.933713913992671e-05, + "loss": 0.4658, + "step": 1002 + }, + { + "epoch": 0.8038430744595677, + "grad_norm": 1.655251383781433, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.333, + "step": 1004 + }, + { + "epoch": 0.8054443554843875, + "grad_norm": 1.6332967281341553, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.4154, + "step": 1006 + }, + { + "epoch": 0.8070456365092074, + "grad_norm": 2.1250832080841064, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.3891, + "step": 1008 + }, + { + "epoch": 0.8086469175340272, + "grad_norm": 1.5177032947540283, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.3057, + "step": 1010 + }, + { + "epoch": 0.8102481985588471, + "grad_norm": 1.4395006895065308, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.3492, + "step": 1012 + }, + { + "epoch": 0.8118494795836669, + "grad_norm": 1.721771240234375, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.3643, + "step": 1014 + }, + { + "epoch": 0.8134507606084868, + "grad_norm": 1.980399250984192, + "learning_rate": 1.918993363660975e-05, + "loss": 0.2525, + "step": 1016 + }, + { + "epoch": 0.8150520416333067, + "grad_norm": 1.5820918083190918, + "learning_rate": 1.916775021722745e-05, + "loss": 0.3493, + "step": 1018 + }, + { + "epoch": 0.8166533226581265, + "grad_norm": 1.6989330053329468, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.3039, + "step": 1020 + }, + { + "epoch": 0.8182546036829463, + "grad_norm": 1.8151925802230835, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.2934, + "step": 1022 + }, + { + "epoch": 0.8198558847077662, + "grad_norm": 1.679717779159546, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.318, + "step": 1024 + }, + { + "epoch": 0.8214571657325861, + "grad_norm": 1.733376145362854, + "learning_rate": 1.907615884240668e-05, + "loss": 0.3492, + "step": 1026 + }, + { + "epoch": 0.8230584467574059, + "grad_norm": 2.1388115882873535, + "learning_rate": 1.905255015197982e-05, + "loss": 0.3191, + "step": 1028 + }, + { + "epoch": 0.8246597277822257, + "grad_norm": 1.3508343696594238, + "learning_rate": 1.902865858501845e-05, + "loss": 0.3258, + "step": 1030 + }, + { + "epoch": 0.8262610088070457, + "grad_norm": 2.0171124935150146, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.3332, + "step": 1032 + }, + { + "epoch": 0.8278622898318655, + "grad_norm": 1.708388328552246, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.3188, + "step": 1034 + }, + { + "epoch": 0.8294635708566853, + "grad_norm": 1.896478295326233, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.3804, + "step": 1036 + }, + { + "epoch": 0.8310648518815053, + "grad_norm": 1.9103481769561768, + "learning_rate": 1.893027861533003e-05, + "loss": 0.3493, + "step": 1038 + }, + { + "epoch": 0.8326661329063251, + "grad_norm": 2.155748128890991, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.3661, + "step": 1040 + }, + { + "epoch": 0.8342674139311449, + "grad_norm": 1.6716892719268799, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.3055, + "step": 1042 + }, + { + "epoch": 0.8358686949559647, + "grad_norm": 2.0004866123199463, + "learning_rate": 1.885356089304285e-05, + "loss": 0.3491, + "step": 1044 + }, + { + "epoch": 0.8374699759807847, + "grad_norm": 1.790610432624817, + "learning_rate": 1.882743392787207e-05, + "loss": 0.348, + "step": 1046 + }, + { + "epoch": 0.8390712570056045, + "grad_norm": 2.304841995239258, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.3389, + "step": 1048 + }, + { + "epoch": 0.8406725380304243, + "grad_norm": 1.2002736330032349, + "learning_rate": 1.877435329644691e-05, + "loss": 0.3356, + "step": 1050 + }, + { + "epoch": 0.8422738190552442, + "grad_norm": 2.0355052947998047, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.4144, + "step": 1052 + }, + { + "epoch": 0.8438751000800641, + "grad_norm": 2.118791103363037, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.3992, + "step": 1054 + }, + { + "epoch": 0.8454763811048839, + "grad_norm": 1.8823494911193848, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.349, + "step": 1056 + }, + { + "epoch": 0.8470776621297038, + "grad_norm": 1.911580204963684, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.2524, + "step": 1058 + }, + { + "epoch": 0.8486789431545236, + "grad_norm": 2.9307801723480225, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.4513, + "step": 1060 + }, + { + "epoch": 0.8502802241793435, + "grad_norm": 2.292638063430786, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.4693, + "step": 1062 + }, + { + "epoch": 0.8518815052041633, + "grad_norm": 3.8273518085479736, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.339, + "step": 1064 + }, + { + "epoch": 0.8534827862289832, + "grad_norm": 1.7459690570831299, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.3266, + "step": 1066 + }, + { + "epoch": 0.855084067253803, + "grad_norm": 1.3376331329345703, + "learning_rate": 1.852201747853807e-05, + "loss": 0.2396, + "step": 1068 + }, + { + "epoch": 0.8566853482786229, + "grad_norm": 1.6592035293579102, + "learning_rate": 1.849263673917196e-05, + "loss": 0.3975, + "step": 1070 + }, + { + "epoch": 0.8582866293034428, + "grad_norm": 1.4328246116638184, + "learning_rate": 1.846299061959706e-05, + "loss": 0.349, + "step": 1072 + }, + { + "epoch": 0.8598879103282626, + "grad_norm": 1.5319830179214478, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.3481, + "step": 1074 + }, + { + "epoch": 0.8614891913530824, + "grad_norm": 1.7530953884124756, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.3641, + "step": 1076 + }, + { + "epoch": 0.8630904723779024, + "grad_norm": 1.5799342393875122, + "learning_rate": 1.837246928480848e-05, + "loss": 0.3492, + "step": 1078 + }, + { + "epoch": 0.8646917534027222, + "grad_norm": 1.7069640159606934, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.366, + "step": 1080 + }, + { + "epoch": 0.866293034427542, + "grad_norm": 1.5264437198638916, + "learning_rate": 1.831081203085415e-05, + "loss": 0.264, + "step": 1082 + }, + { + "epoch": 0.8678943154523618, + "grad_norm": 2.048262119293213, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.2817, + "step": 1084 + }, + { + "epoch": 0.8694955964771818, + "grad_norm": 1.705351710319519, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.2612, + "step": 1086 + }, + { + "epoch": 0.8710968775020016, + "grad_norm": 1.9370341300964355, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.335, + "step": 1088 + }, + { + "epoch": 0.8726981585268214, + "grad_norm": 1.4643553495407104, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.335, + "step": 1090 + }, + { + "epoch": 0.8742994395516414, + "grad_norm": 1.817090392112732, + "learning_rate": 1.815214138532966e-05, + "loss": 0.3803, + "step": 1092 + }, + { + "epoch": 0.8759007205764612, + "grad_norm": 3.00193190574646, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.4517, + "step": 1094 + }, + { + "epoch": 0.877502001601281, + "grad_norm": 1.694606900215149, + "learning_rate": 1.808688294710378e-05, + "loss": 0.348, + "step": 1096 + }, + { + "epoch": 0.8791032826261009, + "grad_norm": 2.1206088066101074, + "learning_rate": 1.805387416454849e-05, + "loss": 0.3188, + "step": 1098 + }, + { + "epoch": 0.8807045636509208, + "grad_norm": 1.6188896894454956, + "learning_rate": 1.802061371235592e-05, + "loss": 0.2763, + "step": 1100 + }, + { + "epoch": 0.8823058446757406, + "grad_norm": 1.88460111618042, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.3348, + "step": 1102 + }, + { + "epoch": 0.8839071257005604, + "grad_norm": 2.0864145755767822, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.2298, + "step": 1104 + }, + { + "epoch": 0.8855084067253803, + "grad_norm": 1.5535658597946167, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.3348, + "step": 1106 + }, + { + "epoch": 0.8871096877502002, + "grad_norm": 1.7750366926193237, + "learning_rate": 1.7885076111125e-05, + "loss": 0.3479, + "step": 1108 + }, + { + "epoch": 0.88871096877502, + "grad_norm": 2.5089354515075684, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.3814, + "step": 1110 + }, + { + "epoch": 0.8903122497998399, + "grad_norm": 2.3630449771881104, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.3566, + "step": 1112 + }, + { + "epoch": 0.8919135308246597, + "grad_norm": 1.9498798847198486, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.4154, + "step": 1114 + }, + { + "epoch": 0.8935148118494796, + "grad_norm": 2.889556884765625, + "learning_rate": 1.774559634403971e-05, + "loss": 0.549, + "step": 1116 + }, + { + "epoch": 0.8951160928742994, + "grad_norm": 5.886616230010986, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.536, + "step": 1118 + }, + { + "epoch": 0.8967173738991193, + "grad_norm": 2.425182819366455, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.2905, + "step": 1120 + }, + { + "epoch": 0.8983186549239391, + "grad_norm": 2.4988224506378174, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.3093, + "step": 1122 + }, + { + "epoch": 0.899919935948759, + "grad_norm": 2.0586156845092773, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.3643, + "step": 1124 + }, + { + "epoch": 0.9015212169735789, + "grad_norm": 1.6856615543365479, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.4232, + "step": 1126 + }, + { + "epoch": 0.9031224979983987, + "grad_norm": 1.953451156616211, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.4155, + "step": 1128 + }, + { + "epoch": 0.9047237790232185, + "grad_norm": 2.7082619667053223, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.3519, + "step": 1130 + }, + { + "epoch": 0.9063250600480385, + "grad_norm": 2.2742538452148438, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.3977, + "step": 1132 + }, + { + "epoch": 0.9079263410728583, + "grad_norm": 3.9067795276641846, + "learning_rate": 1.741771765176815e-05, + "loss": 0.4381, + "step": 1134 + }, + { + "epoch": 0.9095276220976781, + "grad_norm": 3.890784978866577, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.4184, + "step": 1136 + }, + { + "epoch": 0.911128903122498, + "grad_norm": 2.0616989135742188, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.3332, + "step": 1138 + }, + { + "epoch": 0.9127301841473179, + "grad_norm": 2.661250591278076, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.4347, + "step": 1140 + }, + { + "epoch": 0.9143314651721377, + "grad_norm": 3.337064504623413, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.3722, + "step": 1142 + }, + { + "epoch": 0.9159327461969575, + "grad_norm": 1.9536464214324951, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.2907, + "step": 1144 + }, + { + "epoch": 0.9175340272217775, + "grad_norm": 2.7560789585113525, + "learning_rate": 1.718865024535822e-05, + "loss": 0.4698, + "step": 1146 + }, + { + "epoch": 0.9191353082465973, + "grad_norm": 2.3424930572509766, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.4157, + "step": 1148 + }, + { + "epoch": 0.9207365892714171, + "grad_norm": 2.655748128890991, + "learning_rate": 1.711048455496075e-05, + "loss": 0.3645, + "step": 1150 + }, + { + "epoch": 0.922337870296237, + "grad_norm": 2.0748190879821777, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.3333, + "step": 1152 + }, + { + "epoch": 0.9239391513210569, + "grad_norm": 2.285050868988037, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.3978, + "step": 1154 + }, + { + "epoch": 0.9255404323458767, + "grad_norm": 2.049224853515625, + "learning_rate": 1.699157268836863e-05, + "loss": 0.2906, + "step": 1156 + }, + { + "epoch": 0.9271417133706965, + "grad_norm": 2.03804874420166, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.3485, + "step": 1158 + }, + { + "epoch": 0.9287429943955164, + "grad_norm": 1.9715267419815063, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.3333, + "step": 1160 + }, + { + "epoch": 0.9303442754203363, + "grad_norm": 1.946840524673462, + "learning_rate": 1.687069459175619e-05, + "loss": 0.3332, + "step": 1162 + }, + { + "epoch": 0.9319455564451561, + "grad_norm": 2.0048491954803467, + "learning_rate": 1.682997081270568e-05, + "loss": 0.3484, + "step": 1164 + }, + { + "epoch": 0.933546837469976, + "grad_norm": 1.3172447681427002, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.2732, + "step": 1166 + }, + { + "epoch": 0.9351481184947958, + "grad_norm": 2.157862663269043, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.3491, + "step": 1168 + }, + { + "epoch": 0.9367493995196157, + "grad_norm": 2.119666814804077, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.3482, + "step": 1170 + }, + { + "epoch": 0.9383506805444356, + "grad_norm": 1.9860771894454956, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.3644, + "step": 1172 + }, + { + "epoch": 0.9399519615692554, + "grad_norm": 2.100893020629883, + "learning_rate": 1.662317622936933e-05, + "loss": 0.3805, + "step": 1174 + }, + { + "epoch": 0.9415532425940752, + "grad_norm": 2.021897077560425, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.3332, + "step": 1176 + }, + { + "epoch": 0.9431545236188951, + "grad_norm": 1.5471954345703125, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.2669, + "step": 1178 + }, + { + "epoch": 0.944755804643715, + "grad_norm": 2.602501392364502, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.4506, + "step": 1180 + }, + { + "epoch": 0.9463570856685348, + "grad_norm": 2.3999478816986084, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.4506, + "step": 1182 + }, + { + "epoch": 0.9479583666933546, + "grad_norm": 2.1515636444091797, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.3641, + "step": 1184 + }, + { + "epoch": 0.9495596477181746, + "grad_norm": 1.9450993537902832, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.3645, + "step": 1186 + }, + { + "epoch": 0.9511609287429944, + "grad_norm": 1.6930755376815796, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.2904, + "step": 1188 + }, + { + "epoch": 0.9527622097678142, + "grad_norm": 1.5802665948867798, + "learning_rate": 1.628161222025089e-05, + "loss": 0.3645, + "step": 1190 + }, + { + "epoch": 0.9543634907926342, + "grad_norm": 1.3155769109725952, + "learning_rate": 1.623801927548132e-05, + "loss": 0.3024, + "step": 1192 + }, + { + "epoch": 0.955964771817454, + "grad_norm": 1.4544779062271118, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.3494, + "step": 1194 + }, + { + "epoch": 0.9575660528422738, + "grad_norm": 1.5009181499481201, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.3492, + "step": 1196 + }, + { + "epoch": 0.9591673338670936, + "grad_norm": 1.5786657333374023, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.3038, + "step": 1198 + }, + { + "epoch": 0.9607686148919136, + "grad_norm": 1.7375128269195557, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.3643, + "step": 1200 + }, + { + "epoch": 0.9623698959167334, + "grad_norm": 1.6674367189407349, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.3217, + "step": 1202 + }, + { + "epoch": 0.9639711769415532, + "grad_norm": 0.9107713103294373, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.225, + "step": 1204 + }, + { + "epoch": 0.9655724579663731, + "grad_norm": 1.756014108657837, + "learning_rate": 1.592748774284844e-05, + "loss": 0.3492, + "step": 1206 + }, + { + "epoch": 0.967173738991193, + "grad_norm": 2.06087064743042, + "learning_rate": 1.588237402753703e-05, + "loss": 0.4516, + "step": 1208 + }, + { + "epoch": 0.9687750200160128, + "grad_norm": 1.324047565460205, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.3334, + "step": 1210 + }, + { + "epoch": 0.9703763010408326, + "grad_norm": 2.768547534942627, + "learning_rate": 1.579159657034185e-05, + "loss": 0.4895, + "step": 1212 + }, + { + "epoch": 0.9719775820656525, + "grad_norm": 1.5352240800857544, + "learning_rate": 1.574593566509664e-05, + "loss": 0.3494, + "step": 1214 + }, + { + "epoch": 0.9735788630904724, + "grad_norm": 1.765828013420105, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.3179, + "step": 1216 + }, + { + "epoch": 0.9751801441152922, + "grad_norm": 2.486772060394287, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.3757, + "step": 1218 + }, + { + "epoch": 0.9767814251401121, + "grad_norm": 1.7985016107559204, + "learning_rate": 1.560788138136029e-05, + "loss": 0.3974, + "step": 1220 + }, + { + "epoch": 0.978382706164932, + "grad_norm": 1.679533839225769, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.4156, + "step": 1222 + }, + { + "epoch": 0.9799839871897518, + "grad_norm": 2.177866220474243, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.3702, + "step": 1224 + }, + { + "epoch": 0.9815852682145717, + "grad_norm": 1.8794207572937012, + "learning_rate": 1.546825000113736e-05, + "loss": 0.3388, + "step": 1226 + }, + { + "epoch": 0.9831865492393915, + "grad_norm": 1.630739450454712, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.4344, + "step": 1228 + }, + { + "epoch": 0.9847878302642114, + "grad_norm": 1.1475162506103516, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.2266, + "step": 1230 + }, + { + "epoch": 0.9863891112890312, + "grad_norm": 1.3003648519515991, + "learning_rate": 1.532708079276186e-05, + "loss": 0.2598, + "step": 1232 + }, + { + "epoch": 0.9879903923138511, + "grad_norm": 1.179754376411438, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.252, + "step": 1234 + }, + { + "epoch": 0.9895916733386709, + "grad_norm": 1.3521227836608887, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.3349, + "step": 1236 + }, + { + "epoch": 0.9911929543634908, + "grad_norm": 1.4656428098678589, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.3036, + "step": 1238 + }, + { + "epoch": 0.9927942353883107, + "grad_norm": 1.362708330154419, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.3037, + "step": 1240 + }, + { + "epoch": 0.9943955164131305, + "grad_norm": 1.49736750125885, + "learning_rate": 1.50884894033418e-05, + "loss": 0.3217, + "step": 1242 + }, + { + "epoch": 0.9959967974379503, + "grad_norm": 1.7868070602416992, + "learning_rate": 1.504028811613027e-05, + "loss": 0.3388, + "step": 1244 + }, + { + "epoch": 0.9975980784627703, + "grad_norm": 1.823192834854126, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.4146, + "step": 1246 + }, + { + "epoch": 0.9991993594875901, + "grad_norm": 1.8062609434127808, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.3814, + "step": 1248 + }, + { + "epoch": 1.0, + "step": 1249, + "total_flos": 7139041339244544.0, + "train_loss": 0.34942736782676415, + "train_runtime": 10240.1907, + "train_samples_per_second": 1.952, + "train_steps_per_second": 0.122 + } + ], + "logging_steps": 2, + "max_steps": 1249, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 7139041339244544.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..dca1b582cac318bc4df2f8d1bf72d9e5f381d31c --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54d133bde9b8847aa67a3c7405b613d272bf978df6945a4d74f5f89ea91ca7f8 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..fe56df59442a3f9a1d720e64392f79a384797523 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e9feea85ad654ef0259b5490c653b34de85825a1dc9d5979f997efe59af5e68 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a37a23be0f6a730bf802fea1dcf774fc851068b9 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15e912b30c885392d13c5c1e4c169154b3ea5ab95a72cf179c57576cdb7617e2 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac9cbf995a4da8438015fcdd9d51ce7a34d1aa73 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9fa4ab5189fb170274a5a727b9cc57ac3900d40b26a9b966fd11d3ba46a5efc +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..66802be3a8156bf80f15399d5aba4727131db620 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/0_trainer_state.json @@ -0,0 +1,8775 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2498, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008006405124099279, + "grad_norm": 1.95060133934021, + "learning_rate": 2.415943612351265e-06, + "loss": 0.2387, + "step": 2 + }, + { + "epoch": 0.0016012810248198558, + "grad_norm": 2.3849074840545654, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.2962, + "step": 4 + }, + { + "epoch": 0.0024019215372297837, + "grad_norm": 1.8322919607162476, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.195, + "step": 6 + }, + { + "epoch": 0.0032025620496397116, + "grad_norm": 1.6364036798477173, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.2151, + "step": 8 + }, + { + "epoch": 0.0040032025620496394, + "grad_norm": 2.109316110610962, + "learning_rate": 2.4892856843445236e-06, + "loss": 0.3483, + "step": 10 + }, + { + "epoch": 0.004803843074459567, + "grad_norm": 2.0380055904388428, + "learning_rate": 2.507768247396697e-06, + "loss": 0.3437, + "step": 12 + }, + { + "epoch": 0.005604483586869495, + "grad_norm": 3.775493621826172, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.3381, + "step": 14 + }, + { + "epoch": 0.006405124099279423, + "grad_norm": 2.6269102096557617, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.2318, + "step": 16 + }, + { + "epoch": 0.007205764611689352, + "grad_norm": 1.738417625427246, + "learning_rate": 2.5635665363297356e-06, + "loss": 0.2886, + "step": 18 + }, + { + "epoch": 0.008006405124099279, + "grad_norm": 2.2099556922912598, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.3796, + "step": 20 + }, + { + "epoch": 0.008807045636509208, + "grad_norm": 1.1535362005233765, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.2582, + "step": 22 + }, + { + "epoch": 0.009607686148919135, + "grad_norm": 1.152381420135498, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.334, + "step": 24 + }, + { + "epoch": 0.010408326661329063, + "grad_norm": 2.149780035018921, + "learning_rate": 2.6387768837868565e-06, + "loss": 0.3653, + "step": 26 + }, + { + "epoch": 0.01120896717373899, + "grad_norm": 3.3133909702301025, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.375, + "step": 28 + }, + { + "epoch": 0.01200960768614892, + "grad_norm": 2.2218825817108154, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.3342, + "step": 30 + }, + { + "epoch": 0.012810248198558846, + "grad_norm": 3.0526983737945557, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.3342, + "step": 32 + }, + { + "epoch": 0.013610888710968775, + "grad_norm": 2.462351083755493, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.3182, + "step": 34 + }, + { + "epoch": 0.014411529223378704, + "grad_norm": 2.1866707801818848, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.5067, + "step": 36 + }, + { + "epoch": 0.01521216973578863, + "grad_norm": 2.726121664047241, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.3211, + "step": 38 + }, + { + "epoch": 0.016012810248198558, + "grad_norm": 3.0204548835754395, + "learning_rate": 2.772603249882202e-06, + "loss": 0.2889, + "step": 40 + }, + { + "epoch": 0.016813450760608487, + "grad_norm": 2.189263105392456, + "learning_rate": 2.7919483473136555e-06, + "loss": 0.2413, + "step": 42 + }, + { + "epoch": 0.017614091273018415, + "grad_norm": 3.1925265789031982, + "learning_rate": 2.81134975464178e-06, + "loss": 0.3654, + "step": 44 + }, + { + "epoch": 0.018414731785428344, + "grad_norm": 1.8031692504882812, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.2886, + "step": 46 + }, + { + "epoch": 0.01921537229783827, + "grad_norm": 3.6225697994232178, + "learning_rate": 2.850320892287688e-06, + "loss": 0.3048, + "step": 48 + }, + { + "epoch": 0.020016012810248198, + "grad_norm": 1.9345873594284058, + "learning_rate": 2.8698903181597026e-06, + "loss": 0.2901, + "step": 50 + }, + { + "epoch": 0.020816653322658127, + "grad_norm": 2.42756724357605, + "learning_rate": 2.889515445039256e-06, + "loss": 0.3048, + "step": 52 + }, + { + "epoch": 0.021617293835068056, + "grad_norm": 1.8817776441574097, + "learning_rate": 2.909196119613218e-06, + "loss": 0.3181, + "step": 54 + }, + { + "epoch": 0.02241793434747798, + "grad_norm": 2.8325541019439697, + "learning_rate": 2.928932188134529e-06, + "loss": 0.2941, + "step": 56 + }, + { + "epoch": 0.02321857485988791, + "grad_norm": 2.0121748447418213, + "learning_rate": 2.9487234964233724e-06, + "loss": 0.2518, + "step": 58 + }, + { + "epoch": 0.02401921537229784, + "grad_norm": 2.1821517944335938, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.3438, + "step": 60 + }, + { + "epoch": 0.024819855884707767, + "grad_norm": 1.1516562700271606, + "learning_rate": 2.988471213428035e-06, + "loss": 0.2607, + "step": 62 + }, + { + "epoch": 0.025620496397117692, + "grad_norm": 1.972014307975769, + "learning_rate": 3.00842731163137e-06, + "loss": 0.3437, + "step": 64 + }, + { + "epoch": 0.02642113690952762, + "grad_norm": 2.0212337970733643, + "learning_rate": 3.0284380285797733e-06, + "loss": 0.3749, + "step": 66 + }, + { + "epoch": 0.02722177742193755, + "grad_norm": 2.074852705001831, + "learning_rate": 3.048503207947854e-06, + "loss": 0.2159, + "step": 68 + }, + { + "epoch": 0.02802241793434748, + "grad_norm": 1.0391790866851807, + "learning_rate": 3.068622692984767e-06, + "loss": 0.28, + "step": 70 + }, + { + "epoch": 0.028823058446757407, + "grad_norm": 2.14133882522583, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.4223, + "step": 72 + }, + { + "epoch": 0.029623698959167333, + "grad_norm": 3.538231372833252, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.4374, + "step": 74 + }, + { + "epoch": 0.03042433947157726, + "grad_norm": 3.205475091934204, + "learning_rate": 3.129305408243829e-06, + "loss": 0.3341, + "step": 76 + }, + { + "epoch": 0.03122497998398719, + "grad_norm": 2.8722221851348877, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.3482, + "step": 78 + }, + { + "epoch": 0.032025620496397116, + "grad_norm": 1.8400166034698486, + "learning_rate": 3.17002918729432e-06, + "loss": 0.3834, + "step": 80 + }, + { + "epoch": 0.03282626100880705, + "grad_norm": 2.0396242141723633, + "learning_rate": 3.1904711909051967e-06, + "loss": 0.2622, + "step": 82 + }, + { + "epoch": 0.03362690152121697, + "grad_norm": 1.68440580368042, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.2263, + "step": 84 + }, + { + "epoch": 0.0344275420336269, + "grad_norm": 1.8064314126968384, + "learning_rate": 3.231514627826302e-06, + "loss": 0.2376, + "step": 86 + }, + { + "epoch": 0.03522818254603683, + "grad_norm": 1.965749740600586, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.3181, + "step": 88 + }, + { + "epoch": 0.036028823058446756, + "grad_norm": 2.273778200149536, + "learning_rate": 3.2727695682081897e-06, + "loss": 0.3964, + "step": 90 + }, + { + "epoch": 0.03682946357085669, + "grad_norm": 3.0435099601745605, + "learning_rate": 3.293475949595998e-06, + "loss": 0.3436, + "step": 92 + }, + { + "epoch": 0.03763010408326661, + "grad_norm": 2.8785464763641357, + "learning_rate": 3.314234722905302e-06, + "loss": 0.4336, + "step": 94 + }, + { + "epoch": 0.03843074459567654, + "grad_norm": 2.28155255317688, + "learning_rate": 3.335045725966829e-06, + "loss": 0.4507, + "step": 96 + }, + { + "epoch": 0.03923138510808647, + "grad_norm": 1.0141669511795044, + "learning_rate": 3.355908796203301e-06, + "loss": 0.2253, + "step": 98 + }, + { + "epoch": 0.040032025620496396, + "grad_norm": 1.1522538661956787, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.3339, + "step": 100 + }, + { + "epoch": 0.04083266613290633, + "grad_norm": 2.4604341983795166, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.317, + "step": 102 + }, + { + "epoch": 0.041633306645316254, + "grad_norm": 2.140285015106201, + "learning_rate": 3.418808778095917e-06, + "loss": 0.2755, + "step": 104 + }, + { + "epoch": 0.04243394715772618, + "grad_norm": 1.9649993181228638, + "learning_rate": 3.4398784831434097e-06, + "loss": 0.338, + "step": 106 + }, + { + "epoch": 0.04323458767013611, + "grad_norm": 2.1364569664001465, + "learning_rate": 3.460999436403676e-06, + "loss": 0.2499, + "step": 108 + }, + { + "epoch": 0.044035228182546036, + "grad_norm": 1.116102933883667, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.3413, + "step": 110 + }, + { + "epoch": 0.04483586869495596, + "grad_norm": 1.1271781921386719, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.1601, + "step": 112 + }, + { + "epoch": 0.045636509207365894, + "grad_norm": 3.2062900066375732, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.305, + "step": 114 + }, + { + "epoch": 0.04643714971977582, + "grad_norm": 1.19222092628479, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.209, + "step": 116 + }, + { + "epoch": 0.04723779023218575, + "grad_norm": 1.9911789894104004, + "learning_rate": 3.567367137003953e-06, + "loss": 0.338, + "step": 118 + }, + { + "epoch": 0.04803843074459568, + "grad_norm": 0.8726086020469666, + "learning_rate": 3.588792100647368e-06, + "loss": 0.2405, + "step": 120 + }, + { + "epoch": 0.0488390712570056, + "grad_norm": 2.3061420917510986, + "learning_rate": 3.6102671491780393e-06, + "loss": 0.4597, + "step": 122 + }, + { + "epoch": 0.049639711769415534, + "grad_norm": 2.1375999450683594, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.3047, + "step": 124 + }, + { + "epoch": 0.05044035228182546, + "grad_norm": 2.572079658508301, + "learning_rate": 3.653366829451711e-06, + "loss": 0.3633, + "step": 126 + }, + { + "epoch": 0.051240992794235385, + "grad_norm": 2.711524248123169, + "learning_rate": 3.674991124496452e-06, + "loss": 0.3805, + "step": 128 + }, + { + "epoch": 0.05204163330664532, + "grad_norm": 0.9553995728492737, + "learning_rate": 3.696664831034521e-06, + "loss": 0.2241, + "step": 130 + }, + { + "epoch": 0.05284227381905524, + "grad_norm": 1.8356773853302002, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.3312, + "step": 132 + }, + { + "epoch": 0.053642914331465175, + "grad_norm": 2.676448345184326, + "learning_rate": 3.740159800938784e-06, + "loss": 0.3519, + "step": 134 + }, + { + "epoch": 0.0544435548438751, + "grad_norm": 1.3976871967315674, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.261, + "step": 136 + }, + { + "epoch": 0.055244195356285025, + "grad_norm": 1.9826667308807373, + "learning_rate": 3.783850380021933e-06, + "loss": 0.5227, + "step": 138 + }, + { + "epoch": 0.05604483586869496, + "grad_norm": 1.9318493604660034, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.408, + "step": 140 + }, + { + "epoch": 0.05684547638110488, + "grad_norm": 2.0600426197052, + "learning_rate": 3.827735203028956e-06, + "loss": 0.3692, + "step": 142 + }, + { + "epoch": 0.057646116893514815, + "grad_norm": 3.1062395572662354, + "learning_rate": 3.849750027700842e-06, + "loss": 0.3948, + "step": 144 + }, + { + "epoch": 0.05844675740592474, + "grad_norm": 0.9624987244606018, + "learning_rate": 3.871812898635011e-06, + "loss": 0.2156, + "step": 146 + }, + { + "epoch": 0.059247397918334666, + "grad_norm": 2.1867053508758545, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.4032, + "step": 148 + }, + { + "epoch": 0.0600480384307446, + "grad_norm": 2.833893060684204, + "learning_rate": 3.916082089488379e-06, + "loss": 0.3511, + "step": 150 + }, + { + "epoch": 0.06084867894315452, + "grad_norm": 3.8301355838775635, + "learning_rate": 3.938288063572962e-06, + "loss": 0.4507, + "step": 152 + }, + { + "epoch": 0.06164931945556445, + "grad_norm": 2.59808087348938, + "learning_rate": 3.960541392253387e-06, + "loss": 0.3143, + "step": 154 + }, + { + "epoch": 0.06244995996797438, + "grad_norm": 4.245327472686768, + "learning_rate": 3.982841901684792e-06, + "loss": 0.3805, + "step": 156 + }, + { + "epoch": 0.0632506004803843, + "grad_norm": 1.2837531566619873, + "learning_rate": 4.005189417653737e-06, + "loss": 0.3424, + "step": 158 + }, + { + "epoch": 0.06405124099279423, + "grad_norm": 2.0677602291107178, + "learning_rate": 4.027583765579601e-06, + "loss": 0.2632, + "step": 160 + }, + { + "epoch": 0.06485188150520416, + "grad_norm": 1.8339593410491943, + "learning_rate": 4.050024770515873e-06, + "loss": 0.4432, + "step": 162 + }, + { + "epoch": 0.0656525220176141, + "grad_norm": 1.721583604812622, + "learning_rate": 4.072512257151546e-06, + "loss": 0.246, + "step": 164 + }, + { + "epoch": 0.06645316253002402, + "grad_norm": 3.4039413928985596, + "learning_rate": 4.095046049812541e-06, + "loss": 0.2632, + "step": 166 + }, + { + "epoch": 0.06725380304243395, + "grad_norm": 1.8817945718765259, + "learning_rate": 4.117625972462988e-06, + "loss": 0.3324, + "step": 168 + }, + { + "epoch": 0.06805444355484387, + "grad_norm": 1.905517578125, + "learning_rate": 4.1402518487066624e-06, + "loss": 0.2895, + "step": 170 + }, + { + "epoch": 0.0688550840672538, + "grad_norm": 2.4500277042388916, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.2755, + "step": 172 + }, + { + "epoch": 0.06965572457966374, + "grad_norm": 3.1148903369903564, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.3635, + "step": 174 + }, + { + "epoch": 0.07045636509207366, + "grad_norm": 1.2598246335983276, + "learning_rate": 4.208403429658151e-06, + "loss": 0.2971, + "step": 176 + }, + { + "epoch": 0.07125700560448359, + "grad_norm": 1.79021418094635, + "learning_rate": 4.2312113491533145e-06, + "loss": 0.2887, + "step": 178 + }, + { + "epoch": 0.07205764611689351, + "grad_norm": 1.9274691343307495, + "learning_rate": 4.254064334903347e-06, + "loss": 0.3028, + "step": 180 + }, + { + "epoch": 0.07285828662930344, + "grad_norm": 2.8469083309173584, + "learning_rate": 4.276962208378814e-06, + "loss": 0.4137, + "step": 182 + }, + { + "epoch": 0.07365892714171338, + "grad_norm": 1.1544628143310547, + "learning_rate": 4.299904790699619e-06, + "loss": 0.3908, + "step": 184 + }, + { + "epoch": 0.0744595676541233, + "grad_norm": 1.839211106300354, + "learning_rate": 4.3228919026364345e-06, + "loss": 0.3751, + "step": 186 + }, + { + "epoch": 0.07526020816653323, + "grad_norm": 2.9891157150268555, + "learning_rate": 4.345923364612024e-06, + "loss": 0.3312, + "step": 188 + }, + { + "epoch": 0.07606084867894315, + "grad_norm": 2.053901433944702, + "learning_rate": 4.368998996702686e-06, + "loss": 0.2263, + "step": 190 + }, + { + "epoch": 0.07686148919135308, + "grad_norm": 2.85687255859375, + "learning_rate": 4.392118618639698e-06, + "loss": 0.3511, + "step": 192 + }, + { + "epoch": 0.07766212970376302, + "grad_norm": 1.6295607089996338, + "learning_rate": 4.415282049810643e-06, + "loss": 0.2755, + "step": 194 + }, + { + "epoch": 0.07846277021617294, + "grad_norm": 1.2351876497268677, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.2856, + "step": 196 + }, + { + "epoch": 0.07926341072858287, + "grad_norm": 2.2440104484558105, + "learning_rate": 4.461739615694921e-06, + "loss": 0.2811, + "step": 198 + }, + { + "epoch": 0.08006405124099279, + "grad_norm": 1.0663623809814453, + "learning_rate": 4.485033387477915e-06, + "loss": 0.2125, + "step": 200 + }, + { + "epoch": 0.08086469175340272, + "grad_norm": 2.93976092338562, + "learning_rate": 4.5083702426369715e-06, + "loss": 0.3965, + "step": 202 + }, + { + "epoch": 0.08166533226581266, + "grad_norm": 1.7852249145507812, + "learning_rate": 4.531749998862628e-06, + "loss": 0.3483, + "step": 204 + }, + { + "epoch": 0.08246597277822258, + "grad_norm": 2.172910451889038, + "learning_rate": 4.555172473510324e-06, + "loss": 0.2754, + "step": 206 + }, + { + "epoch": 0.08326661329063251, + "grad_norm": 1.0781487226486206, + "learning_rate": 4.578637483601732e-06, + "loss": 0.2506, + "step": 208 + }, + { + "epoch": 0.08406725380304243, + "grad_norm": 2.670433521270752, + "learning_rate": 4.602144845826234e-06, + "loss": 0.4507, + "step": 210 + }, + { + "epoch": 0.08486789431545236, + "grad_norm": 1.9758044481277466, + "learning_rate": 4.625694376542399e-06, + "loss": 0.507, + "step": 212 + }, + { + "epoch": 0.08566853482786228, + "grad_norm": 1.7940325736999512, + "learning_rate": 4.649285891779326e-06, + "loss": 0.3343, + "step": 214 + }, + { + "epoch": 0.08646917534027222, + "grad_norm": 1.281074047088623, + "learning_rate": 4.672919207238145e-06, + "loss": 0.3455, + "step": 216 + }, + { + "epoch": 0.08726981585268215, + "grad_norm": 1.7813434600830078, + "learning_rate": 4.696594138293421e-06, + "loss": 0.3179, + "step": 218 + }, + { + "epoch": 0.08807045636509207, + "grad_norm": 1.0340434312820435, + "learning_rate": 4.720310499994664e-06, + "loss": 0.2799, + "step": 220 + }, + { + "epoch": 0.088871096877502, + "grad_norm": 1.6256141662597656, + "learning_rate": 4.744068107067673e-06, + "loss": 0.2281, + "step": 222 + }, + { + "epoch": 0.08967173738991192, + "grad_norm": 1.152833342552185, + "learning_rate": 4.767866773916041e-06, + "loss": 0.2348, + "step": 224 + }, + { + "epoch": 0.09047237790232186, + "grad_norm": 0.9562775492668152, + "learning_rate": 4.79170631462264e-06, + "loss": 0.3037, + "step": 226 + }, + { + "epoch": 0.09127301841473179, + "grad_norm": 3.1727426052093506, + "learning_rate": 4.81558654295099e-06, + "loss": 0.3269, + "step": 228 + }, + { + "epoch": 0.09207365892714171, + "grad_norm": 1.7656917572021484, + "learning_rate": 4.839507272346751e-06, + "loss": 0.2403, + "step": 230 + }, + { + "epoch": 0.09287429943955164, + "grad_norm": 1.920957088470459, + "learning_rate": 4.863468315939234e-06, + "loss": 0.2889, + "step": 232 + }, + { + "epoch": 0.09367493995196156, + "grad_norm": 1.6229881048202515, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.4033, + "step": 234 + }, + { + "epoch": 0.0944755804643715, + "grad_norm": 0.8364437818527222, + "learning_rate": 4.911510596658202e-06, + "loss": 0.1846, + "step": 236 + }, + { + "epoch": 0.09527622097678143, + "grad_norm": 3.1142466068267822, + "learning_rate": 4.935591458474425e-06, + "loss": 0.4222, + "step": 238 + }, + { + "epoch": 0.09607686148919135, + "grad_norm": 1.8459943532943726, + "learning_rate": 4.959711883869734e-06, + "loss": 0.4872, + "step": 240 + }, + { + "epoch": 0.09687750200160128, + "grad_norm": 2.291538953781128, + "learning_rate": 4.9838716844133665e-06, + "loss": 0.4536, + "step": 242 + }, + { + "epoch": 0.0976781425140112, + "grad_norm": 2.047617197036743, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.2706, + "step": 244 + }, + { + "epoch": 0.09847878302642114, + "grad_norm": 1.7988871335983276, + "learning_rate": 5.032308655686007e-06, + "loss": 0.3323, + "step": 246 + }, + { + "epoch": 0.09927942353883107, + "grad_norm": 1.1432017087936401, + "learning_rate": 5.056585448021398e-06, + "loss": 0.401, + "step": 248 + }, + { + "epoch": 0.100080064051241, + "grad_norm": 1.5853811502456665, + "learning_rate": 5.080900858720789e-06, + "loss": 0.2897, + "step": 250 + }, + { + "epoch": 0.10088070456365092, + "grad_norm": 1.2969950437545776, + "learning_rate": 5.105254697830208e-06, + "loss": 0.2064, + "step": 252 + }, + { + "epoch": 0.10168134507606084, + "grad_norm": 1.6325477361679077, + "learning_rate": 5.129646775095432e-06, + "loss": 0.3087, + "step": 254 + }, + { + "epoch": 0.10248198558847077, + "grad_norm": 1.77692449092865, + "learning_rate": 5.154076899963514e-06, + "loss": 0.2611, + "step": 256 + }, + { + "epoch": 0.10328262610088071, + "grad_norm": 1.3447515964508057, + "learning_rate": 5.178544881584328e-06, + "loss": 0.1419, + "step": 258 + }, + { + "epoch": 0.10408326661329063, + "grad_norm": 1.8046120405197144, + "learning_rate": 5.203050528811959e-06, + "loss": 0.3032, + "step": 260 + }, + { + "epoch": 0.10488390712570056, + "grad_norm": 1.0634700059890747, + "learning_rate": 5.227593650206246e-06, + "loss": 0.2962, + "step": 262 + }, + { + "epoch": 0.10568454763811048, + "grad_norm": 1.1181424856185913, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.4011, + "step": 264 + }, + { + "epoch": 0.10648518815052041, + "grad_norm": 1.4445732831954956, + "learning_rate": 5.2767915482720164e-06, + "loss": 0.3173, + "step": 266 + }, + { + "epoch": 0.10728582866293035, + "grad_norm": 1.1590688228607178, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.3906, + "step": 268 + }, + { + "epoch": 0.10808646917534027, + "grad_norm": 1.2471072673797607, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.2354, + "step": 270 + }, + { + "epoch": 0.1088871096877502, + "grad_norm": 2.356154203414917, + "learning_rate": 5.350864648864026e-06, + "loss": 0.4743, + "step": 272 + }, + { + "epoch": 0.10968775020016013, + "grad_norm": 3.4677579402923584, + "learning_rate": 5.375628578726181e-06, + "loss": 0.4743, + "step": 274 + }, + { + "epoch": 0.11048839071257005, + "grad_norm": 1.5109831094741821, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.2524, + "step": 276 + }, + { + "epoch": 0.11128903122497999, + "grad_norm": 1.9253437519073486, + "learning_rate": 5.425264622628326e-06, + "loss": 0.2386, + "step": 278 + }, + { + "epoch": 0.11208967173738991, + "grad_norm": 1.1591182947158813, + "learning_rate": 5.450136348907444e-06, + "loss": 0.2334, + "step": 280 + }, + { + "epoch": 0.11289031224979984, + "grad_norm": 1.1802468299865723, + "learning_rate": 5.475043619098321e-06, + "loss": 0.1977, + "step": 282 + }, + { + "epoch": 0.11369095276220977, + "grad_norm": 4.672449111938477, + "learning_rate": 5.499986238623329e-06, + "loss": 0.3343, + "step": 284 + }, + { + "epoch": 0.11449159327461969, + "grad_norm": 2.4708926677703857, + "learning_rate": 5.524964012628644e-06, + "loss": 0.3636, + "step": 286 + }, + { + "epoch": 0.11529223378702963, + "grad_norm": 1.6256619691848755, + "learning_rate": 5.549976745985809e-06, + "loss": 0.3049, + "step": 288 + }, + { + "epoch": 0.11609287429943956, + "grad_norm": 2.0721547603607178, + "learning_rate": 5.57502424329331e-06, + "loss": 0.2264, + "step": 290 + }, + { + "epoch": 0.11689351481184948, + "grad_norm": 1.923458456993103, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.2633, + "step": 292 + }, + { + "epoch": 0.1176941553242594, + "grad_norm": 2.0029776096343994, + "learning_rate": 5.62522274679673e-06, + "loss": 0.3324, + "step": 294 + }, + { + "epoch": 0.11849479583666933, + "grad_norm": 3.395490884780884, + "learning_rate": 5.650373360837763e-06, + "loss": 0.4318, + "step": 296 + }, + { + "epoch": 0.11929543634907927, + "grad_norm": 2.725689649581909, + "learning_rate": 5.675557954522462e-06, + "loss": 0.3635, + "step": 298 + }, + { + "epoch": 0.1200960768614892, + "grad_norm": 2.713259696960449, + "learning_rate": 5.700776331106674e-06, + "loss": 0.2813, + "step": 300 + }, + { + "epoch": 0.12089671737389912, + "grad_norm": 1.1104077100753784, + "learning_rate": 5.726028293582342e-06, + "loss": 0.2902, + "step": 302 + }, + { + "epoch": 0.12169735788630905, + "grad_norm": 2.4573733806610107, + "learning_rate": 5.751313644679071e-06, + "loss": 0.29, + "step": 304 + }, + { + "epoch": 0.12249799839871897, + "grad_norm": 4.677824020385742, + "learning_rate": 5.776632186865589e-06, + "loss": 0.3802, + "step": 306 + }, + { + "epoch": 0.1232986389111289, + "grad_norm": 1.7891209125518799, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.3381, + "step": 308 + }, + { + "epoch": 0.12409927942353884, + "grad_norm": 2.698471784591675, + "learning_rate": 5.827368053088032e-06, + "loss": 0.4509, + "step": 310 + }, + { + "epoch": 0.12489991993594876, + "grad_norm": 3.470750331878662, + "learning_rate": 5.852784980771182e-06, + "loss": 0.3702, + "step": 312 + }, + { + "epoch": 0.1257005604483587, + "grad_norm": 2.1616930961608887, + "learning_rate": 5.878234306841637e-06, + "loss": 0.2661, + "step": 314 + }, + { + "epoch": 0.1265012009607686, + "grad_norm": 4.727813720703125, + "learning_rate": 5.903715832487138e-06, + "loss": 0.3796, + "step": 316 + }, + { + "epoch": 0.12730184147317855, + "grad_norm": 2.959223985671997, + "learning_rate": 5.929229358643925e-06, + "loss": 0.3968, + "step": 318 + }, + { + "epoch": 0.12810248198558846, + "grad_norm": 1.024084210395813, + "learning_rate": 5.954774685998206e-06, + "loss": 0.4581, + "step": 320 + }, + { + "epoch": 0.1289031224979984, + "grad_norm": 2.142812728881836, + "learning_rate": 5.9803516149877475e-06, + "loss": 0.3343, + "step": 322 + }, + { + "epoch": 0.1297037630104083, + "grad_norm": 1.6881309747695923, + "learning_rate": 6.005959945803494e-06, + "loss": 0.3032, + "step": 324 + }, + { + "epoch": 0.13050440352281825, + "grad_norm": 2.475003242492676, + "learning_rate": 6.03159947839103e-06, + "loss": 0.3325, + "step": 326 + }, + { + "epoch": 0.1313050440352282, + "grad_norm": 2.611116409301758, + "learning_rate": 6.057270012452186e-06, + "loss": 0.8183, + "step": 328 + }, + { + "epoch": 0.1321056845476381, + "grad_norm": 1.6926289796829224, + "learning_rate": 6.082971347446654e-06, + "loss": 0.29, + "step": 330 + }, + { + "epoch": 0.13290632506004804, + "grad_norm": 1.4876524209976196, + "learning_rate": 6.108703282593461e-06, + "loss": 0.2613, + "step": 332 + }, + { + "epoch": 0.13370696557245795, + "grad_norm": 1.7157933712005615, + "learning_rate": 6.13446561687258e-06, + "loss": 0.1862, + "step": 334 + }, + { + "epoch": 0.1345076060848679, + "grad_norm": 1.793121099472046, + "learning_rate": 6.160258149026557e-06, + "loss": 0.2521, + "step": 336 + }, + { + "epoch": 0.13530824659727783, + "grad_norm": 2.314870834350586, + "learning_rate": 6.186080677561974e-06, + "loss": 0.2479, + "step": 338 + }, + { + "epoch": 0.13610888710968774, + "grad_norm": 1.6335136890411377, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.3969, + "step": 340 + }, + { + "epoch": 0.13690952762209768, + "grad_norm": 3.1724295616149902, + "learning_rate": 6.237814916633431e-06, + "loss": 0.4026, + "step": 342 + }, + { + "epoch": 0.1377101681345076, + "grad_norm": 3.4926207065582275, + "learning_rate": 6.263726223017326e-06, + "loss": 0.4322, + "step": 344 + }, + { + "epoch": 0.13851080864691753, + "grad_norm": 1.0829813480377197, + "learning_rate": 6.289666717481496e-06, + "loss": 0.3132, + "step": 346 + }, + { + "epoch": 0.13931144915932747, + "grad_norm": 1.4402697086334229, + "learning_rate": 6.315636197376634e-06, + "loss": 0.3212, + "step": 348 + }, + { + "epoch": 0.14011208967173738, + "grad_norm": 2.972395181655884, + "learning_rate": 6.341634459827044e-06, + "loss": 0.3609, + "step": 350 + }, + { + "epoch": 0.14091273018414732, + "grad_norm": 1.590192198753357, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.3183, + "step": 352 + }, + { + "epoch": 0.14171337069655723, + "grad_norm": 2.191331624984741, + "learning_rate": 6.393716519768032e-06, + "loss": 0.2818, + "step": 354 + }, + { + "epoch": 0.14251401120896717, + "grad_norm": 1.3126225471496582, + "learning_rate": 6.419799910389257e-06, + "loss": 0.2463, + "step": 356 + }, + { + "epoch": 0.1433146517213771, + "grad_norm": 1.4824910163879395, + "learning_rate": 6.445911269830183e-06, + "loss": 0.3011, + "step": 358 + }, + { + "epoch": 0.14411529223378702, + "grad_norm": 1.5778580904006958, + "learning_rate": 6.472050394106689e-06, + "loss": 0.3808, + "step": 360 + }, + { + "epoch": 0.14491593274619696, + "grad_norm": 1.792614459991455, + "learning_rate": 6.498217079017806e-06, + "loss": 0.3475, + "step": 362 + }, + { + "epoch": 0.14571657325860687, + "grad_norm": 1.937895655632019, + "learning_rate": 6.524411120147204e-06, + "loss": 0.2266, + "step": 364 + }, + { + "epoch": 0.1465172137710168, + "grad_norm": 1.0928398370742798, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.275, + "step": 366 + }, + { + "epoch": 0.14731785428342675, + "grad_norm": 2.519529342651367, + "learning_rate": 6.576880452328645e-06, + "loss": 0.2872, + "step": 368 + }, + { + "epoch": 0.14811849479583666, + "grad_norm": 1.841261386871338, + "learning_rate": 6.603155333485934e-06, + "loss": 0.3182, + "step": 370 + }, + { + "epoch": 0.1489191353082466, + "grad_norm": 1.6670360565185547, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.3439, + "step": 372 + }, + { + "epoch": 0.14971977582065651, + "grad_norm": 1.5819344520568848, + "learning_rate": 6.655784499627476e-06, + "loss": 0.3324, + "step": 374 + }, + { + "epoch": 0.15052041633306645, + "grad_norm": 6.221319198608398, + "learning_rate": 6.682138373468341e-06, + "loss": 0.5313, + "step": 376 + }, + { + "epoch": 0.1513210568454764, + "grad_norm": 1.9108086824417114, + "learning_rate": 6.7085181667191e-06, + "loss": 0.2557, + "step": 378 + }, + { + "epoch": 0.1521216973578863, + "grad_norm": 3.0079498291015625, + "learning_rate": 6.734923673298605e-06, + "loss": 0.2526, + "step": 380 + }, + { + "epoch": 0.15292233787029624, + "grad_norm": 1.6423708200454712, + "learning_rate": 6.761354686924883e-06, + "loss": 0.2036, + "step": 382 + }, + { + "epoch": 0.15372297838270615, + "grad_norm": 0.8900743126869202, + "learning_rate": 6.787811001116654e-06, + "loss": 0.1849, + "step": 384 + }, + { + "epoch": 0.1545236188951161, + "grad_norm": 3.0917351245880127, + "learning_rate": 6.8142924091949955e-06, + "loss": 0.3968, + "step": 386 + }, + { + "epoch": 0.15532425940752603, + "grad_norm": 1.001408576965332, + "learning_rate": 6.840798704284939e-06, + "loss": 0.2353, + "step": 388 + }, + { + "epoch": 0.15612489991993594, + "grad_norm": 1.7336856126785278, + "learning_rate": 6.867329679317144e-06, + "loss": 0.2829, + "step": 390 + }, + { + "epoch": 0.15692554043234588, + "grad_norm": 1.4903119802474976, + "learning_rate": 6.893885127029419e-06, + "loss": 0.2371, + "step": 392 + }, + { + "epoch": 0.1577261809447558, + "grad_norm": 1.8008278608322144, + "learning_rate": 6.920464839968391e-06, + "loss": 0.2899, + "step": 394 + }, + { + "epoch": 0.15852682145716573, + "grad_norm": 1.419907569885254, + "learning_rate": 6.94706861049117e-06, + "loss": 0.2525, + "step": 396 + }, + { + "epoch": 0.15932746196957567, + "grad_norm": 1.348390817642212, + "learning_rate": 6.973696230766884e-06, + "loss": 0.3089, + "step": 398 + }, + { + "epoch": 0.16012810248198558, + "grad_norm": 1.5187983512878418, + "learning_rate": 7.000347492778341e-06, + "loss": 0.3087, + "step": 400 + }, + { + "epoch": 0.16092874299439552, + "grad_norm": 1.4567490816116333, + "learning_rate": 7.027022188323704e-06, + "loss": 0.3827, + "step": 402 + }, + { + "epoch": 0.16172938350680544, + "grad_norm": 2.4638514518737793, + "learning_rate": 7.05372010901803e-06, + "loss": 0.3834, + "step": 404 + }, + { + "epoch": 0.16253002401921537, + "grad_norm": 1.4170327186584473, + "learning_rate": 7.080441046294945e-06, + "loss": 0.2524, + "step": 406 + }, + { + "epoch": 0.1633306645316253, + "grad_norm": 1.8990858793258667, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.252, + "step": 408 + }, + { + "epoch": 0.16413130504403523, + "grad_norm": 1.566562533378601, + "learning_rate": 7.133951135433656e-06, + "loss": 0.3439, + "step": 410 + }, + { + "epoch": 0.16493194555644516, + "grad_norm": 1.8480348587036133, + "learning_rate": 7.160739869270219e-06, + "loss": 0.5183, + "step": 412 + }, + { + "epoch": 0.16573258606885508, + "grad_norm": 1.4662901163101196, + "learning_rate": 7.18755078364214e-06, + "loss": 0.3693, + "step": 414 + }, + { + "epoch": 0.16653322658126501, + "grad_norm": 2.4548027515411377, + "learning_rate": 7.214383669100317e-06, + "loss": 0.5184, + "step": 416 + }, + { + "epoch": 0.16733386709367493, + "grad_norm": 1.8245604038238525, + "learning_rate": 7.241238316024064e-06, + "loss": 0.3628, + "step": 418 + }, + { + "epoch": 0.16813450760608487, + "grad_norm": 2.5252621173858643, + "learning_rate": 7.268114514622635e-06, + "loss": 0.3322, + "step": 420 + }, + { + "epoch": 0.1689351481184948, + "grad_norm": 1.8267076015472412, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.3634, + "step": 422 + }, + { + "epoch": 0.16973578863090472, + "grad_norm": 1.223475456237793, + "learning_rate": 7.321930726841144e-06, + "loss": 0.3283, + "step": 424 + }, + { + "epoch": 0.17053642914331466, + "grad_norm": 2.4018161296844482, + "learning_rate": 7.348870320044395e-06, + "loss": 0.3211, + "step": 426 + }, + { + "epoch": 0.17133706965572457, + "grad_norm": 1.878448724746704, + "learning_rate": 7.375830624092336e-06, + "loss": 0.266, + "step": 428 + }, + { + "epoch": 0.1721377101681345, + "grad_norm": 2.1052086353302, + "learning_rate": 7.402811428368824e-06, + "loss": 0.239, + "step": 430 + }, + { + "epoch": 0.17293835068054444, + "grad_norm": 1.233818531036377, + "learning_rate": 7.429812522097613e-06, + "loss": 0.3123, + "step": 432 + }, + { + "epoch": 0.17373899119295436, + "grad_norm": 1.3044219017028809, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.2956, + "step": 434 + }, + { + "epoch": 0.1745396317053643, + "grad_norm": 2.3567352294921875, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.2389, + "step": 436 + }, + { + "epoch": 0.1753402722177742, + "grad_norm": 2.2242367267608643, + "learning_rate": 7.510935429867233e-06, + "loss": 0.275, + "step": 438 + }, + { + "epoch": 0.17614091273018415, + "grad_norm": 2.4698288440704346, + "learning_rate": 7.538015570497046e-06, + "loss": 0.3967, + "step": 440 + }, + { + "epoch": 0.17694155324259409, + "grad_norm": 2.2687125205993652, + "learning_rate": 7.5651149443531846e-06, + "loss": 0.3474, + "step": 442 + }, + { + "epoch": 0.177742193755004, + "grad_norm": 1.696685791015625, + "learning_rate": 7.592233339733077e-06, + "loss": 0.2973, + "step": 444 + }, + { + "epoch": 0.17854283426741394, + "grad_norm": 2.946443796157837, + "learning_rate": 7.619370544785608e-06, + "loss": 0.2475, + "step": 446 + }, + { + "epoch": 0.17934347477982385, + "grad_norm": 1.0961471796035767, + "learning_rate": 7.646526347512665e-06, + "loss": 0.2608, + "step": 448 + }, + { + "epoch": 0.1801441152922338, + "grad_norm": 2.7312629222869873, + "learning_rate": 7.67370053577085e-06, + "loss": 0.3086, + "step": 450 + }, + { + "epoch": 0.18094475580464373, + "grad_norm": 2.624192714691162, + "learning_rate": 7.70089289727319e-06, + "loss": 0.2371, + "step": 452 + }, + { + "epoch": 0.18174539631705364, + "grad_norm": 1.819016695022583, + "learning_rate": 7.728103219590684e-06, + "loss": 0.3806, + "step": 454 + }, + { + "epoch": 0.18254603682946358, + "grad_norm": 1.9570043087005615, + "learning_rate": 7.755331290154041e-06, + "loss": 0.2377, + "step": 456 + }, + { + "epoch": 0.1833466773418735, + "grad_norm": 2.6655914783477783, + "learning_rate": 7.7825768962553e-06, + "loss": 0.2813, + "step": 458 + }, + { + "epoch": 0.18414731785428343, + "grad_norm": 1.8741893768310547, + "learning_rate": 7.809839825049565e-06, + "loss": 0.3049, + "step": 460 + }, + { + "epoch": 0.18494795836669337, + "grad_norm": 1.7667878866195679, + "learning_rate": 7.83711986355656e-06, + "loss": 0.3258, + "step": 462 + }, + { + "epoch": 0.18574859887910328, + "grad_norm": 1.0771892070770264, + "learning_rate": 7.864416798662347e-06, + "loss": 0.3061, + "step": 464 + }, + { + "epoch": 0.18654923939151322, + "grad_norm": 1.700187087059021, + "learning_rate": 7.891730417121043e-06, + "loss": 0.2888, + "step": 466 + }, + { + "epoch": 0.18734987990392313, + "grad_norm": 2.3407528400421143, + "learning_rate": 7.919060505556376e-06, + "loss": 0.2385, + "step": 468 + }, + { + "epoch": 0.18815052041633307, + "grad_norm": 3.151266098022461, + "learning_rate": 7.946406850463435e-06, + "loss": 0.2624, + "step": 470 + }, + { + "epoch": 0.188951160928743, + "grad_norm": 2.6381173133850098, + "learning_rate": 7.973769238210291e-06, + "loss": 0.215, + "step": 472 + }, + { + "epoch": 0.18975180144115292, + "grad_norm": 2.433835744857788, + "learning_rate": 8.001147455039737e-06, + "loss": 0.4221, + "step": 474 + }, + { + "epoch": 0.19055244195356286, + "grad_norm": 2.4531333446502686, + "learning_rate": 8.028541287070858e-06, + "loss": 0.3412, + "step": 476 + }, + { + "epoch": 0.19135308246597277, + "grad_norm": 1.1494615077972412, + "learning_rate": 8.055950520300756e-06, + "loss": 0.3006, + "step": 478 + }, + { + "epoch": 0.1921537229783827, + "grad_norm": 3.976663827896118, + "learning_rate": 8.083374940606256e-06, + "loss": 0.5076, + "step": 480 + }, + { + "epoch": 0.19295436349079265, + "grad_norm": 3.141592502593994, + "learning_rate": 8.110814333745503e-06, + "loss": 0.3966, + "step": 482 + }, + { + "epoch": 0.19375500400320256, + "grad_norm": 1.9734450578689575, + "learning_rate": 8.138268485359684e-06, + "loss": 0.1843, + "step": 484 + }, + { + "epoch": 0.1945556445156125, + "grad_norm": 1.2226758003234863, + "learning_rate": 8.165737180974676e-06, + "loss": 0.2435, + "step": 486 + }, + { + "epoch": 0.1953562850280224, + "grad_norm": 2.8335437774658203, + "learning_rate": 8.193220206002785e-06, + "loss": 0.3171, + "step": 488 + }, + { + "epoch": 0.19615692554043235, + "grad_norm": 3.284465789794922, + "learning_rate": 8.220717345744326e-06, + "loss": 0.3439, + "step": 490 + }, + { + "epoch": 0.1969575660528423, + "grad_norm": 2.028076648712158, + "learning_rate": 8.248228385389349e-06, + "loss": 0.338, + "step": 492 + }, + { + "epoch": 0.1977582065652522, + "grad_norm": 1.744632601737976, + "learning_rate": 8.275753110019367e-06, + "loss": 0.3512, + "step": 494 + }, + { + "epoch": 0.19855884707766214, + "grad_norm": 1.9435656070709229, + "learning_rate": 8.303291304608936e-06, + "loss": 0.2608, + "step": 496 + }, + { + "epoch": 0.19935948759007205, + "grad_norm": 3.164795398712158, + "learning_rate": 8.330842754027378e-06, + "loss": 0.5131, + "step": 498 + }, + { + "epoch": 0.200160128102482, + "grad_norm": 3.272547483444214, + "learning_rate": 8.358407243040524e-06, + "loss": 0.3654, + "step": 500 + }, + { + "epoch": 0.20096076861489193, + "grad_norm": 0.8381208181381226, + "learning_rate": 8.385984556312285e-06, + "loss": 0.2702, + "step": 502 + }, + { + "epoch": 0.20176140912730184, + "grad_norm": 7.656544208526611, + "learning_rate": 8.413574478406386e-06, + "loss": 0.3967, + "step": 504 + }, + { + "epoch": 0.20256204963971178, + "grad_norm": 3.6612207889556885, + "learning_rate": 8.441176793788106e-06, + "loss": 0.321, + "step": 506 + }, + { + "epoch": 0.2033626901521217, + "grad_norm": 5.426351547241211, + "learning_rate": 8.468791286825856e-06, + "loss": 0.4915, + "step": 508 + }, + { + "epoch": 0.20416333066453163, + "grad_norm": 3.072937250137329, + "learning_rate": 8.496417741792922e-06, + "loss": 0.356, + "step": 510 + }, + { + "epoch": 0.20496397117694154, + "grad_norm": 2.155280351638794, + "learning_rate": 8.524055942869135e-06, + "loss": 0.3257, + "step": 512 + }, + { + "epoch": 0.20576461168935148, + "grad_norm": 0.995248019695282, + "learning_rate": 8.551705674142616e-06, + "loss": 0.2554, + "step": 514 + }, + { + "epoch": 0.20656525220176142, + "grad_norm": 1.5766077041625977, + "learning_rate": 8.579366719611353e-06, + "loss": 0.3316, + "step": 516 + }, + { + "epoch": 0.20736589271417133, + "grad_norm": 1.819577932357788, + "learning_rate": 8.607038863184952e-06, + "loss": 0.3753, + "step": 518 + }, + { + "epoch": 0.20816653322658127, + "grad_norm": 4.197436809539795, + "learning_rate": 8.634721888686368e-06, + "loss": 0.4507, + "step": 520 + }, + { + "epoch": 0.20896717373899118, + "grad_norm": 3.562283754348755, + "learning_rate": 8.662415579853495e-06, + "loss": 0.4176, + "step": 522 + }, + { + "epoch": 0.20976781425140112, + "grad_norm": 2.6193058490753174, + "learning_rate": 8.690119720340907e-06, + "loss": 0.2813, + "step": 524 + }, + { + "epoch": 0.21056845476381106, + "grad_norm": 2.1119489669799805, + "learning_rate": 8.717834093721598e-06, + "loss": 0.4289, + "step": 526 + }, + { + "epoch": 0.21136909527622097, + "grad_norm": 1.20167076587677, + "learning_rate": 8.74555848348857e-06, + "loss": 0.3009, + "step": 528 + }, + { + "epoch": 0.2121697357886309, + "grad_norm": 3.0801377296447754, + "learning_rate": 8.773292673056572e-06, + "loss": 0.3656, + "step": 530 + }, + { + "epoch": 0.21297037630104082, + "grad_norm": 2.8127048015594482, + "learning_rate": 8.801036445763858e-06, + "loss": 0.4511, + "step": 532 + }, + { + "epoch": 0.21377101681345076, + "grad_norm": 2.548767328262329, + "learning_rate": 8.828789584873757e-06, + "loss": 0.3382, + "step": 534 + }, + { + "epoch": 0.2145716573258607, + "grad_norm": 2.0544488430023193, + "learning_rate": 8.856551873576448e-06, + "loss": 0.3562, + "step": 536 + }, + { + "epoch": 0.2153722978382706, + "grad_norm": 1.2972933053970337, + "learning_rate": 8.884323094990613e-06, + "loss": 0.3457, + "step": 538 + }, + { + "epoch": 0.21617293835068055, + "grad_norm": 1.8546746969223022, + "learning_rate": 8.912103032165206e-06, + "loss": 0.224, + "step": 540 + }, + { + "epoch": 0.21697357886309046, + "grad_norm": 3.0155301094055176, + "learning_rate": 8.939891468081036e-06, + "loss": 0.2801, + "step": 542 + }, + { + "epoch": 0.2177742193755004, + "grad_norm": 1.1038973331451416, + "learning_rate": 8.967688185652527e-06, + "loss": 0.2239, + "step": 544 + }, + { + "epoch": 0.21857485988791034, + "grad_norm": 1.77118980884552, + "learning_rate": 8.995492967729449e-06, + "loss": 0.3041, + "step": 546 + }, + { + "epoch": 0.21937550040032025, + "grad_norm": 1.21462082862854, + "learning_rate": 9.023305597098526e-06, + "loss": 0.2858, + "step": 548 + }, + { + "epoch": 0.2201761409127302, + "grad_norm": 1.8217580318450928, + "learning_rate": 9.051125856485175e-06, + "loss": 0.2634, + "step": 550 + }, + { + "epoch": 0.2209767814251401, + "grad_norm": 1.7191119194030762, + "learning_rate": 9.078953528555258e-06, + "loss": 0.2609, + "step": 552 + }, + { + "epoch": 0.22177742193755004, + "grad_norm": 1.7745189666748047, + "learning_rate": 9.106788395916682e-06, + "loss": 0.2521, + "step": 554 + }, + { + "epoch": 0.22257806244995998, + "grad_norm": 2.7059080600738525, + "learning_rate": 9.134630241121135e-06, + "loss": 0.4538, + "step": 556 + }, + { + "epoch": 0.2233787029623699, + "grad_norm": 1.8019850254058838, + "learning_rate": 9.162478846665854e-06, + "loss": 0.3799, + "step": 558 + }, + { + "epoch": 0.22417934347477983, + "grad_norm": 2.8111212253570557, + "learning_rate": 9.190333994995208e-06, + "loss": 0.3635, + "step": 560 + }, + { + "epoch": 0.22497998398718974, + "grad_norm": 1.6625094413757324, + "learning_rate": 9.218195468502469e-06, + "loss": 0.2607, + "step": 562 + }, + { + "epoch": 0.22578062449959968, + "grad_norm": 2.562446117401123, + "learning_rate": 9.24606304953148e-06, + "loss": 0.3808, + "step": 564 + }, + { + "epoch": 0.22658126501200962, + "grad_norm": 1.1912777423858643, + "learning_rate": 9.273936520378426e-06, + "loss": 0.1952, + "step": 566 + }, + { + "epoch": 0.22738190552441953, + "grad_norm": 1.2281590700149536, + "learning_rate": 9.301815663293426e-06, + "loss": 0.3521, + "step": 568 + }, + { + "epoch": 0.22818254603682947, + "grad_norm": 1.418253779411316, + "learning_rate": 9.329700260482286e-06, + "loss": 0.1908, + "step": 570 + }, + { + "epoch": 0.22898318654923938, + "grad_norm": 2.3358800411224365, + "learning_rate": 9.35759009410826e-06, + "loss": 0.4138, + "step": 572 + }, + { + "epoch": 0.22978382706164932, + "grad_norm": 2.0067007541656494, + "learning_rate": 9.38548494629364e-06, + "loss": 0.363, + "step": 574 + }, + { + "epoch": 0.23058446757405926, + "grad_norm": 1.0957093238830566, + "learning_rate": 9.41338459912151e-06, + "loss": 0.2153, + "step": 576 + }, + { + "epoch": 0.23138510808646917, + "grad_norm": 2.2307331562042236, + "learning_rate": 9.441288834637507e-06, + "loss": 0.4801, + "step": 578 + }, + { + "epoch": 0.2321857485988791, + "grad_norm": 1.0070244073867798, + "learning_rate": 9.469197434851414e-06, + "loss": 0.2799, + "step": 580 + }, + { + "epoch": 0.23298638911128902, + "grad_norm": 3.055666923522949, + "learning_rate": 9.497110181738935e-06, + "loss": 0.3323, + "step": 582 + }, + { + "epoch": 0.23378702962369896, + "grad_norm": 1.9424164295196533, + "learning_rate": 9.52502685724336e-06, + "loss": 0.2757, + "step": 584 + }, + { + "epoch": 0.2345876701361089, + "grad_norm": 2.179699659347534, + "learning_rate": 9.552947243277342e-06, + "loss": 0.3837, + "step": 586 + }, + { + "epoch": 0.2353883106485188, + "grad_norm": 2.0080482959747314, + "learning_rate": 9.580871121724498e-06, + "loss": 0.3515, + "step": 588 + }, + { + "epoch": 0.23618895116092875, + "grad_norm": 1.5254676342010498, + "learning_rate": 9.608798274441153e-06, + "loss": 0.176, + "step": 590 + }, + { + "epoch": 0.23698959167333866, + "grad_norm": 1.1396440267562866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.2584, + "step": 592 + }, + { + "epoch": 0.2377902321857486, + "grad_norm": 1.7112116813659668, + "learning_rate": 9.664661529982263e-06, + "loss": 0.2163, + "step": 594 + }, + { + "epoch": 0.23859087269815854, + "grad_norm": 1.6850990056991577, + "learning_rate": 9.692597196398302e-06, + "loss": 0.3924, + "step": 596 + }, + { + "epoch": 0.23939151321056845, + "grad_norm": 4.149386405944824, + "learning_rate": 9.720535264270526e-06, + "loss": 0.4339, + "step": 598 + }, + { + "epoch": 0.2401921537229784, + "grad_norm": 2.7910373210906982, + "learning_rate": 9.748475515344416e-06, + "loss": 0.5497, + "step": 600 + }, + { + "epoch": 0.2409927942353883, + "grad_norm": 2.4378538131713867, + "learning_rate": 9.776417731348403e-06, + "loss": 0.3383, + "step": 602 + }, + { + "epoch": 0.24179343474779824, + "grad_norm": 3.921812057495117, + "learning_rate": 9.80436169399561e-06, + "loss": 0.4873, + "step": 604 + }, + { + "epoch": 0.24259407526020815, + "grad_norm": 1.454581379890442, + "learning_rate": 9.832307184985473e-06, + "loss": 0.4227, + "step": 606 + }, + { + "epoch": 0.2433947157726181, + "grad_norm": 1.7393807172775269, + "learning_rate": 9.8602539860055e-06, + "loss": 0.135, + "step": 608 + }, + { + "epoch": 0.24419535628502803, + "grad_norm": 0.7894531488418579, + "learning_rate": 9.888201878732946e-06, + "loss": 0.1991, + "step": 610 + }, + { + "epoch": 0.24499599679743794, + "grad_norm": 1.031651258468628, + "learning_rate": 9.916150644836596e-06, + "loss": 0.2133, + "step": 612 + }, + { + "epoch": 0.24579663730984788, + "grad_norm": 1.7962702512741089, + "learning_rate": 9.944100065978354e-06, + "loss": 0.4693, + "step": 614 + }, + { + "epoch": 0.2465972778222578, + "grad_norm": 1.8060624599456787, + "learning_rate": 9.972049923815011e-06, + "loss": 0.3261, + "step": 616 + }, + { + "epoch": 0.24739791833466773, + "grad_norm": 1.7656223773956299, + "learning_rate": 9.999999999999996e-06, + "loss": 0.344, + "step": 618 + }, + { + "epoch": 0.24819855884707767, + "grad_norm": 1.661628246307373, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.4144, + "step": 620 + }, + { + "epoch": 0.24899919935948758, + "grad_norm": 1.790093183517456, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.2973, + "step": 622 + }, + { + "epoch": 0.24979983987189752, + "grad_norm": 1.855914831161499, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.293, + "step": 624 + }, + { + "epoch": 0.25060048038430743, + "grad_norm": 1.553846836090088, + "learning_rate": 1.0111798121267047e-05, + "loss": 0.2702, + "step": 626 + }, + { + "epoch": 0.2514011208967174, + "grad_norm": 1.494340181350708, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.3803, + "step": 628 + }, + { + "epoch": 0.2522017614091273, + "grad_norm": 2.2561280727386475, + "learning_rate": 1.016769281501452e-05, + "loss": 0.3179, + "step": 630 + }, + { + "epoch": 0.2530024019215372, + "grad_norm": 1.0040173530578613, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.2871, + "step": 632 + }, + { + "epoch": 0.25380304243394713, + "grad_norm": 2.5311622619628906, + "learning_rate": 1.022358226865159e-05, + "loss": 0.6135, + "step": 634 + }, + { + "epoch": 0.2546036829463571, + "grad_norm": 1.055013656616211, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.2377, + "step": 636 + }, + { + "epoch": 0.255404323458767, + "grad_norm": 1.288938045501709, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.2466, + "step": 638 + }, + { + "epoch": 0.2562049639711769, + "grad_norm": 1.3394522666931152, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.2451, + "step": 640 + }, + { + "epoch": 0.2570056044835869, + "grad_norm": 2.467089891433716, + "learning_rate": 1.033533847001773e-05, + "loss": 0.3329, + "step": 642 + }, + { + "epoch": 0.2578062449959968, + "grad_norm": 1.3933576345443726, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.2713, + "step": 644 + }, + { + "epoch": 0.2586068855084067, + "grad_norm": 2.216547966003418, + "learning_rate": 1.039120172555884e-05, + "loss": 0.2705, + "step": 646 + }, + { + "epoch": 0.2594075260208166, + "grad_norm": 1.2875484228134155, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.2267, + "step": 648 + }, + { + "epoch": 0.2602081665332266, + "grad_norm": 1.5260372161865234, + "learning_rate": 1.0447052756722651e-05, + "loss": 0.2933, + "step": 650 + }, + { + "epoch": 0.2610088070456365, + "grad_norm": 1.4702479839324951, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.2874, + "step": 652 + }, + { + "epoch": 0.2618094475580464, + "grad_norm": 3.1260263919830322, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.4748, + "step": 654 + }, + { + "epoch": 0.2626100880704564, + "grad_norm": 2.142977476119995, + "learning_rate": 1.053080256514858e-05, + "loss": 0.3326, + "step": 656 + }, + { + "epoch": 0.2634107285828663, + "grad_norm": 1.3213237524032593, + "learning_rate": 1.0558711165362488e-05, + "loss": 0.261, + "step": 658 + }, + { + "epoch": 0.2642113690952762, + "grad_norm": 1.358491063117981, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.2763, + "step": 660 + }, + { + "epoch": 0.26501200960768617, + "grad_norm": 1.2568645477294922, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.2124, + "step": 662 + }, + { + "epoch": 0.2658126501200961, + "grad_norm": 2.675794839859009, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.4695, + "step": 664 + }, + { + "epoch": 0.266613290632506, + "grad_norm": 6.392685413360596, + "learning_rate": 1.0670299739517706e-05, + "loss": 0.5315, + "step": 666 + }, + { + "epoch": 0.2674139311449159, + "grad_norm": 1.9113832712173462, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.2462, + "step": 668 + }, + { + "epoch": 0.2682145716573259, + "grad_norm": 1.9614572525024414, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.4076, + "step": 670 + }, + { + "epoch": 0.2690152121697358, + "grad_norm": 2.027254104614258, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.3488, + "step": 672 + }, + { + "epoch": 0.2698158526821457, + "grad_norm": 1.2301055192947388, + "learning_rate": 1.0781804531497525e-05, + "loss": 0.2761, + "step": 674 + }, + { + "epoch": 0.27061649319455566, + "grad_norm": 3.244642972946167, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.2312, + "step": 676 + }, + { + "epoch": 0.2714171337069656, + "grad_norm": 0.8700428605079651, + "learning_rate": 1.083752115333414e-05, + "loss": 0.1956, + "step": 678 + }, + { + "epoch": 0.2722177742193755, + "grad_norm": 1.2658177614212036, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.2526, + "step": 680 + }, + { + "epoch": 0.27301841473178545, + "grad_norm": 1.095572829246521, + "learning_rate": 1.0893211604083311e-05, + "loss": 0.2507, + "step": 682 + }, + { + "epoch": 0.27381905524419536, + "grad_norm": 1.0172237157821655, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.2132, + "step": 684 + }, + { + "epoch": 0.2746196957566053, + "grad_norm": 2.4742257595062256, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.5289, + "step": 686 + }, + { + "epoch": 0.2754203362690152, + "grad_norm": 1.4196285009384155, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.2779, + "step": 688 + }, + { + "epoch": 0.27622097678142515, + "grad_norm": 3.7288296222686768, + "learning_rate": 1.1004507032270544e-05, + "loss": 0.3819, + "step": 690 + }, + { + "epoch": 0.27702161729383507, + "grad_norm": 4.177191257476807, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.3152, + "step": 692 + }, + { + "epoch": 0.277822257806245, + "grad_norm": 1.30329430103302, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.352, + "step": 694 + }, + { + "epoch": 0.27862289831865494, + "grad_norm": 1.771214246749878, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.3639, + "step": 696 + }, + { + "epoch": 0.27942353883106485, + "grad_norm": 1.3823013305664062, + "learning_rate": 1.111567690500938e-05, + "loss": 0.3035, + "step": 698 + }, + { + "epoch": 0.28022417934347477, + "grad_norm": 1.3697372674942017, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.3638, + "step": 700 + }, + { + "epoch": 0.28102481985588473, + "grad_norm": 1.0516256093978882, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.2353, + "step": 702 + }, + { + "epoch": 0.28182546036829464, + "grad_norm": 2.3455872535705566, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.3152, + "step": 704 + }, + { + "epoch": 0.28262610088070456, + "grad_norm": 2.326951265335083, + "learning_rate": 1.122670732694342e-05, + "loss": 0.3813, + "step": 706 + }, + { + "epoch": 0.28342674139311447, + "grad_norm": 1.2246034145355225, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.2961, + "step": 708 + }, + { + "epoch": 0.28422738190552443, + "grad_norm": 1.2205806970596313, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.4119, + "step": 710 + }, + { + "epoch": 0.28502802241793435, + "grad_norm": 1.5300731658935547, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.4599, + "step": 712 + }, + { + "epoch": 0.28582866293034426, + "grad_norm": 1.5241029262542725, + "learning_rate": 1.1337584420146496e-05, + "loss": 0.2874, + "step": 714 + }, + { + "epoch": 0.2866293034427542, + "grad_norm": 2.6119472980499268, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.4343, + "step": 716 + }, + { + "epoch": 0.28742994395516414, + "grad_norm": 1.1366642713546753, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.265, + "step": 718 + }, + { + "epoch": 0.28823058446757405, + "grad_norm": 2.9049196243286133, + "learning_rate": 1.142063328038864e-05, + "loss": 0.282, + "step": 720 + }, + { + "epoch": 0.289031224979984, + "grad_norm": 3.6186904907226562, + "learning_rate": 1.1448294325857377e-05, + "loss": 0.4514, + "step": 722 + }, + { + "epoch": 0.2898318654923939, + "grad_norm": 0.9780982136726379, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.2071, + "step": 724 + }, + { + "epoch": 0.29063250600480384, + "grad_norm": 1.8270292282104492, + "learning_rate": 1.150358225820707e-05, + "loss": 0.2829, + "step": 726 + }, + { + "epoch": 0.29143314651721375, + "grad_norm": 1.9693379402160645, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.2243, + "step": 728 + }, + { + "epoch": 0.2922337870296237, + "grad_norm": 1.9137483835220337, + "learning_rate": 1.1558823206211887e-05, + "loss": 0.3136, + "step": 730 + }, + { + "epoch": 0.2930344275420336, + "grad_norm": 1.3502650260925293, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.215, + "step": 732 + }, + { + "epoch": 0.29383506805444354, + "grad_norm": 1.7239874601364136, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.2761, + "step": 734 + }, + { + "epoch": 0.2946357085668535, + "grad_norm": 1.6140371561050415, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.3812, + "step": 736 + }, + { + "epoch": 0.2954363490792634, + "grad_norm": 3.825488328933716, + "learning_rate": 1.1669157245972616e-05, + "loss": 0.4039, + "step": 738 + }, + { + "epoch": 0.2962369895916733, + "grad_norm": 2.2808945178985596, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.3329, + "step": 740 + }, + { + "epoch": 0.29703763010408324, + "grad_norm": 1.7055107355117798, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.3217, + "step": 742 + }, + { + "epoch": 0.2978382706164932, + "grad_norm": 2.248680353164673, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.3813, + "step": 744 + }, + { + "epoch": 0.2986389111289031, + "grad_norm": 2.4696545600891113, + "learning_rate": 1.1779282654255668e-05, + "loss": 0.3524, + "step": 746 + }, + { + "epoch": 0.29943955164131303, + "grad_norm": 2.311148166656494, + "learning_rate": 1.180677979399721e-05, + "loss": 0.2469, + "step": 748 + }, + { + "epoch": 0.300240192153723, + "grad_norm": 1.2554863691329956, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.2965, + "step": 750 + }, + { + "epoch": 0.3010408326661329, + "grad_norm": 1.4923019409179688, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.2936, + "step": 752 + }, + { + "epoch": 0.3018414731785428, + "grad_norm": 1.8577159643173218, + "learning_rate": 1.188918566625449e-05, + "loss": 0.3887, + "step": 754 + }, + { + "epoch": 0.3026421136909528, + "grad_norm": 1.5135411024093628, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.2876, + "step": 756 + }, + { + "epoch": 0.3034427542033627, + "grad_norm": 2.320923089981079, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.366, + "step": 758 + }, + { + "epoch": 0.3042433947157726, + "grad_norm": 1.7143603563308716, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.2873, + "step": 760 + }, + { + "epoch": 0.3050440352281825, + "grad_norm": 1.8522568941116333, + "learning_rate": 1.1998852544960256e-05, + "loss": 0.3035, + "step": 762 + }, + { + "epoch": 0.3058446757405925, + "grad_norm": 2.6824395656585693, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.4541, + "step": 764 + }, + { + "epoch": 0.3066453162530024, + "grad_norm": 1.804224967956543, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.239, + "step": 766 + }, + { + "epoch": 0.3074459567654123, + "grad_norm": 1.7440787553787231, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.3666, + "step": 768 + }, + { + "epoch": 0.3082465972778223, + "grad_norm": 3.09769606590271, + "learning_rate": 1.210826958287895e-05, + "loss": 0.3954, + "step": 770 + }, + { + "epoch": 0.3090472377902322, + "grad_norm": 3.009582042694092, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.3992, + "step": 772 + }, + { + "epoch": 0.3098478783026421, + "grad_norm": 1.6240441799163818, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.2974, + "step": 774 + }, + { + "epoch": 0.31064851881505207, + "grad_norm": 1.6686104536056519, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.3484, + "step": 776 + }, + { + "epoch": 0.311449159327462, + "grad_norm": 2.510789394378662, + "learning_rate": 1.2217423103744692e-05, + "loss": 0.8132, + "step": 778 + }, + { + "epoch": 0.3122497998398719, + "grad_norm": 2.057119369506836, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.271, + "step": 780 + }, + { + "epoch": 0.3130504403522818, + "grad_norm": 1.4748404026031494, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.3482, + "step": 782 + }, + { + "epoch": 0.31385108086469177, + "grad_norm": 1.4520041942596436, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.2781, + "step": 784 + }, + { + "epoch": 0.3146517213771017, + "grad_norm": 2.6499245166778564, + "learning_rate": 1.2326299464229143e-05, + "loss": 0.5029, + "step": 786 + }, + { + "epoch": 0.3154523618895116, + "grad_norm": 1.6387618780136108, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.2054, + "step": 788 + }, + { + "epoch": 0.31625300240192156, + "grad_norm": 2.31320858001709, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.3972, + "step": 790 + }, + { + "epoch": 0.31705364291433147, + "grad_norm": 1.6731876134872437, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.3187, + "step": 792 + }, + { + "epoch": 0.3178542834267414, + "grad_norm": 1.6435164213180542, + "learning_rate": 1.2434885055646808e-05, + "loss": 0.3056, + "step": 794 + }, + { + "epoch": 0.31865492393915135, + "grad_norm": 1.4753636121749878, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.2165, + "step": 796 + }, + { + "epoch": 0.31945556445156126, + "grad_norm": 2.4585118293762207, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.3842, + "step": 798 + }, + { + "epoch": 0.32025620496397117, + "grad_norm": 1.9108567237854004, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.364, + "step": 800 + }, + { + "epoch": 0.3210568454763811, + "grad_norm": 3.215984344482422, + "learning_rate": 1.2543166305656089e-05, + "loss": 0.3829, + "step": 802 + }, + { + "epoch": 0.32185748598879105, + "grad_norm": 2.732006072998047, + "learning_rate": 1.257018747790238e-05, + "loss": 0.3348, + "step": 804 + }, + { + "epoch": 0.32265812650120096, + "grad_norm": 2.8160431385040283, + "learning_rate": 1.259718857163117e-05, + "loss": 0.2507, + "step": 806 + }, + { + "epoch": 0.32345876701361087, + "grad_norm": 1.473008632659912, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.3225, + "step": 808 + }, + { + "epoch": 0.32425940752602084, + "grad_norm": 2.44392466545105, + "learning_rate": 1.2651129679955598e-05, + "loss": 0.4699, + "step": 810 + }, + { + "epoch": 0.32506004803843075, + "grad_norm": 2.2038912773132324, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.3812, + "step": 812 + }, + { + "epoch": 0.32586068855084066, + "grad_norm": 1.930592656135559, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.4506, + "step": 814 + }, + { + "epoch": 0.3266613290632506, + "grad_norm": 2.3977086544036865, + "learning_rate": 1.273188548537736e-05, + "loss": 0.3267, + "step": 816 + }, + { + "epoch": 0.32746196957566054, + "grad_norm": 1.6223602294921875, + "learning_rate": 1.2758761683975929e-05, + "loss": 0.3056, + "step": 818 + }, + { + "epoch": 0.32826261008807045, + "grad_norm": 2.035266160964966, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.3643, + "step": 820 + }, + { + "epoch": 0.32906325060048036, + "grad_norm": 1.5685473680496216, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.2761, + "step": 822 + }, + { + "epoch": 0.32986389111289033, + "grad_norm": 1.7522467374801636, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.2466, + "step": 824 + }, + { + "epoch": 0.33066453162530024, + "grad_norm": 1.1452152729034424, + "learning_rate": 1.2866048864566336e-05, + "loss": 0.1983, + "step": 826 + }, + { + "epoch": 0.33146517213771015, + "grad_norm": 1.7196146249771118, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.3519, + "step": 828 + }, + { + "epoch": 0.3322658126501201, + "grad_norm": 1.786125898361206, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.3517, + "step": 830 + }, + { + "epoch": 0.33306645316253003, + "grad_norm": 1.4323543310165405, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.3035, + "step": 832 + }, + { + "epoch": 0.33386709367493994, + "grad_norm": 1.2566041946411133, + "learning_rate": 1.2972977811676289e-05, + "loss": 0.3011, + "step": 834 + }, + { + "epoch": 0.33466773418734985, + "grad_norm": 1.6687662601470947, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.2816, + "step": 836 + }, + { + "epoch": 0.3354683746997598, + "grad_norm": 1.9730874300003052, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.4586, + "step": 838 + }, + { + "epoch": 0.33626901521216973, + "grad_norm": 1.1179251670837402, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.2888, + "step": 840 + }, + { + "epoch": 0.33706965572457964, + "grad_norm": 1.9313640594482422, + "learning_rate": 1.3079535160031601e-05, + "loss": 0.3214, + "step": 842 + }, + { + "epoch": 0.3378702962369896, + "grad_norm": 1.5428621768951416, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.2779, + "step": 844 + }, + { + "epoch": 0.3386709367493995, + "grad_norm": 3.006662607192993, + "learning_rate": 1.313267032068285e-05, + "loss": 0.5281, + "step": 846 + }, + { + "epoch": 0.33947157726180943, + "grad_norm": 2.130033016204834, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.3093, + "step": 848 + }, + { + "epoch": 0.3402722177742194, + "grad_norm": 2.883986473083496, + "learning_rate": 1.3185707590804997e-05, + "loss": 0.3187, + "step": 850 + }, + { + "epoch": 0.3410728582866293, + "grad_norm": 2.7177200317382812, + "learning_rate": 1.321218899888334e-05, + "loss": 0.4165, + "step": 852 + }, + { + "epoch": 0.3418734987990392, + "grad_norm": 2.270155191421509, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.2447, + "step": 854 + }, + { + "epoch": 0.34267413931144913, + "grad_norm": 1.8765709400177002, + "learning_rate": 1.326507632670139e-05, + "loss": 0.4694, + "step": 856 + }, + { + "epoch": 0.3434747798238591, + "grad_norm": 3.7348527908325195, + "learning_rate": 1.3291481833280894e-05, + "loss": 0.2895, + "step": 858 + }, + { + "epoch": 0.344275420336269, + "grad_norm": 1.2450296878814697, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.3136, + "step": 860 + }, + { + "epoch": 0.3450760608486789, + "grad_norm": 1.9517041444778442, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.3659, + "step": 862 + }, + { + "epoch": 0.3458767013610889, + "grad_norm": 1.2345126867294312, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.3065, + "step": 864 + }, + { + "epoch": 0.3466773418734988, + "grad_norm": 1.5329704284667969, + "learning_rate": 1.3396844666514062e-05, + "loss": 0.3641, + "step": 866 + }, + { + "epoch": 0.3474779823859087, + "grad_norm": 1.1509764194488525, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.2903, + "step": 868 + }, + { + "epoch": 0.3482786228983187, + "grad_norm": 4.564413070678711, + "learning_rate": 1.344936768713513e-05, + "loss": 0.4602, + "step": 870 + }, + { + "epoch": 0.3490792634107286, + "grad_norm": 1.4297897815704346, + "learning_rate": 1.347558887985279e-05, + "loss": 0.409, + "step": 872 + }, + { + "epoch": 0.3498799039231385, + "grad_norm": 1.9911104440689087, + "learning_rate": 1.3501782920982189e-05, + "loss": 0.4542, + "step": 874 + }, + { + "epoch": 0.3506805444355484, + "grad_norm": 1.1582509279251099, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.3078, + "step": 876 + }, + { + "epoch": 0.3514811849479584, + "grad_norm": 2.0739545822143555, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.3615, + "step": 878 + }, + { + "epoch": 0.3522818254603683, + "grad_norm": 2.549520969390869, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.4921, + "step": 880 + }, + { + "epoch": 0.3530824659727782, + "grad_norm": 1.398646593093872, + "learning_rate": 1.3606283480231962e-05, + "loss": 0.3662, + "step": 882 + }, + { + "epoch": 0.35388310648518817, + "grad_norm": 1.827439785003662, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.3177, + "step": 884 + }, + { + "epoch": 0.3546837469975981, + "grad_norm": 2.0493557453155518, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.2935, + "step": 886 + }, + { + "epoch": 0.355484387510008, + "grad_norm": 1.8041282892227173, + "learning_rate": 1.368436380262336e-05, + "loss": 0.3349, + "step": 888 + }, + { + "epoch": 0.35628502802241796, + "grad_norm": 2.4190833568573, + "learning_rate": 1.3710333282518497e-05, + "loss": 0.3287, + "step": 890 + }, + { + "epoch": 0.35708566853482787, + "grad_norm": 1.8165340423583984, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.3184, + "step": 892 + }, + { + "epoch": 0.3578863090472378, + "grad_norm": 1.7285417318344116, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.4733, + "step": 894 + }, + { + "epoch": 0.3586869495596477, + "grad_norm": 2.340744972229004, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.3519, + "step": 896 + }, + { + "epoch": 0.35948759007205766, + "grad_norm": 1.829300045967102, + "learning_rate": 1.3813919322438018e-05, + "loss": 0.348, + "step": 898 + }, + { + "epoch": 0.3602882305844676, + "grad_norm": 1.778882622718811, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.4154, + "step": 900 + }, + { + "epoch": 0.3610888710968775, + "grad_norm": 2.7192635536193848, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.3809, + "step": 902 + }, + { + "epoch": 0.36188951160928745, + "grad_norm": 1.776031494140625, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.264, + "step": 904 + }, + { + "epoch": 0.36269015212169736, + "grad_norm": 2.0688414573669434, + "learning_rate": 1.391702865255334e-05, + "loss": 0.4506, + "step": 906 + }, + { + "epoch": 0.3634907926341073, + "grad_norm": 1.3032102584838867, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.2194, + "step": 908 + }, + { + "epoch": 0.36429143314651724, + "grad_norm": 3.259617328643799, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.5706, + "step": 910 + }, + { + "epoch": 0.36509207365892715, + "grad_norm": 1.405910611152649, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.364, + "step": 912 + }, + { + "epoch": 0.36589271417133706, + "grad_norm": 3.7014219760894775, + "learning_rate": 1.4019648385012245e-05, + "loss": 0.5707, + "step": 914 + }, + { + "epoch": 0.366693354683747, + "grad_norm": 2.2222471237182617, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.3994, + "step": 916 + }, + { + "epoch": 0.36749399519615694, + "grad_norm": 1.2945737838745117, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.2138, + "step": 918 + }, + { + "epoch": 0.36829463570856685, + "grad_norm": 1.3794894218444824, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.2896, + "step": 920 + }, + { + "epoch": 0.36909527622097676, + "grad_norm": 2.2994024753570557, + "learning_rate": 1.4121765693158355e-05, + "loss": 0.2901, + "step": 922 + }, + { + "epoch": 0.36989591673338673, + "grad_norm": 0.5453316569328308, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.2645, + "step": 924 + }, + { + "epoch": 0.37069655724579664, + "grad_norm": 1.5285919904708862, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.4572, + "step": 926 + }, + { + "epoch": 0.37149719775820655, + "grad_norm": 2.6378567218780518, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.5491, + "step": 928 + }, + { + "epoch": 0.37229783827061647, + "grad_norm": 2.0437612533569336, + "learning_rate": 1.4223367813134406e-05, + "loss": 0.4924, + "step": 930 + }, + { + "epoch": 0.37309847878302643, + "grad_norm": 1.179230809211731, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.198, + "step": 932 + }, + { + "epoch": 0.37389911929543634, + "grad_norm": 2.303351402282715, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.3704, + "step": 934 + }, + { + "epoch": 0.37469975980784626, + "grad_norm": 2.3958756923675537, + "learning_rate": 1.429922366889332e-05, + "loss": 0.2671, + "step": 936 + }, + { + "epoch": 0.3755004003202562, + "grad_norm": 3.1879501342773438, + "learning_rate": 1.4324442045477534e-05, + "loss": 0.3645, + "step": 938 + }, + { + "epoch": 0.37630104083266613, + "grad_norm": 2.177776336669922, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.3804, + "step": 940 + }, + { + "epoch": 0.37710168134507605, + "grad_norm": 2.790613889694214, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.3413, + "step": 942 + }, + { + "epoch": 0.377902321857486, + "grad_norm": 2.547140121459961, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.3496, + "step": 944 + }, + { + "epoch": 0.3787029623698959, + "grad_norm": 3.849318504333496, + "learning_rate": 1.4424975756706684e-05, + "loss": 0.4353, + "step": 946 + }, + { + "epoch": 0.37950360288230583, + "grad_norm": 1.3976528644561768, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.2965, + "step": 948 + }, + { + "epoch": 0.38030424339471575, + "grad_norm": 1.9619717597961426, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.2806, + "step": 950 + }, + { + "epoch": 0.3811048839071257, + "grad_norm": 1.4712979793548584, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.3222, + "step": 952 + }, + { + "epoch": 0.3819055244195356, + "grad_norm": 2.613535165786743, + "learning_rate": 1.4524956380901674e-05, + "loss": 0.3975, + "step": 954 + }, + { + "epoch": 0.38270616493194554, + "grad_norm": 1.823974847793579, + "learning_rate": 1.454986365109255e-05, + "loss": 0.3054, + "step": 956 + }, + { + "epoch": 0.3835068054443555, + "grad_norm": 2.3367133140563965, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.4325, + "step": 958 + }, + { + "epoch": 0.3843074459567654, + "grad_norm": 1.8418333530426025, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.2953, + "step": 960 + }, + { + "epoch": 0.3851080864691753, + "grad_norm": 1.1152279376983643, + "learning_rate": 1.4624371421273812e-05, + "loss": 0.19, + "step": 962 + }, + { + "epoch": 0.3859087269815853, + "grad_norm": 4.43094539642334, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.3265, + "step": 964 + }, + { + "epoch": 0.3867093674939952, + "grad_norm": 2.02075457572937, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.3246, + "step": 966 + }, + { + "epoch": 0.3875100080064051, + "grad_norm": 1.197021484375, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.4842, + "step": 968 + }, + { + "epoch": 0.388310648518815, + "grad_norm": 2.510496139526367, + "learning_rate": 1.4723208451727977e-05, + "loss": 0.4895, + "step": 970 + }, + { + "epoch": 0.389111289031225, + "grad_norm": 3.230113983154297, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.6357, + "step": 972 + }, + { + "epoch": 0.3899119295436349, + "grad_norm": 1.3785008192062378, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.2976, + "step": 974 + }, + { + "epoch": 0.3907125700560448, + "grad_norm": 1.3037809133529663, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.3345, + "step": 976 + }, + { + "epoch": 0.3915132105684548, + "grad_norm": 1.1565457582473755, + "learning_rate": 1.4821455118415666e-05, + "loss": 0.2904, + "step": 978 + }, + { + "epoch": 0.3923138510808647, + "grad_norm": 1.9340049028396606, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.3569, + "step": 980 + }, + { + "epoch": 0.3931144915932746, + "grad_norm": 2.5167102813720703, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.3637, + "step": 982 + }, + { + "epoch": 0.3939151321056846, + "grad_norm": 1.7179125547409058, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.2902, + "step": 984 + }, + { + "epoch": 0.3947157726180945, + "grad_norm": 2.171660900115967, + "learning_rate": 1.4919099141279205e-05, + "loss": 0.2671, + "step": 986 + }, + { + "epoch": 0.3955164131305044, + "grad_norm": 2.3815298080444336, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.1958, + "step": 988 + }, + { + "epoch": 0.3963170536429143, + "grad_norm": 1.6362024545669556, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.3645, + "step": 990 + }, + { + "epoch": 0.3971176941553243, + "grad_norm": 1.7021138668060303, + "learning_rate": 1.499192932863305e-05, + "loss": 0.333, + "step": 992 + }, + { + "epoch": 0.3979183346677342, + "grad_norm": 1.8788517713546753, + "learning_rate": 1.5016128315586626e-05, + "loss": 0.3815, + "step": 994 + }, + { + "epoch": 0.3987189751801441, + "grad_norm": 1.7066657543182373, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.2904, + "step": 996 + }, + { + "epoch": 0.39951961569255406, + "grad_norm": 1.4280292987823486, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.3187, + "step": 998 + }, + { + "epoch": 0.400320256204964, + "grad_norm": 2.9533514976501465, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.2641, + "step": 1000 + }, + { + "epoch": 0.4011208967173739, + "grad_norm": 1.7368699312210083, + "learning_rate": 1.5112530513457229e-05, + "loss": 0.384, + "step": 1002 + }, + { + "epoch": 0.40192153722978385, + "grad_norm": 1.1102229356765747, + "learning_rate": 1.513653168406076e-05, + "loss": 0.3136, + "step": 1004 + }, + { + "epoch": 0.40272217774219377, + "grad_norm": 2.8318326473236084, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.352, + "step": 1006 + }, + { + "epoch": 0.4035228182546037, + "grad_norm": 1.132244348526001, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.2965, + "step": 1008 + }, + { + "epoch": 0.4043234587670136, + "grad_norm": 1.7977246046066284, + "learning_rate": 1.5208293685377354e-05, + "loss": 0.3993, + "step": 1010 + }, + { + "epoch": 0.40512409927942356, + "grad_norm": 1.4883641004562378, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.3248, + "step": 1012 + }, + { + "epoch": 0.40592473979183347, + "grad_norm": 1.5216751098632812, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.4148, + "step": 1014 + }, + { + "epoch": 0.4067253803042434, + "grad_norm": 1.3317413330078125, + "learning_rate": 1.527968950000533e-05, + "loss": 0.2908, + "step": 1016 + }, + { + "epoch": 0.40752602081665334, + "grad_norm": 1.7333203554153442, + "learning_rate": 1.5303405861706574e-05, + "loss": 0.2288, + "step": 1018 + }, + { + "epoch": 0.40832666132906326, + "grad_norm": 1.2713737487792969, + "learning_rate": 1.532708079276185e-05, + "loss": 0.3016, + "step": 1020 + }, + { + "epoch": 0.40912730184147317, + "grad_norm": 1.227898120880127, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.3013, + "step": 1022 + }, + { + "epoch": 0.4099279423538831, + "grad_norm": 1.4102058410644531, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.3484, + "step": 1024 + }, + { + "epoch": 0.41072858286629305, + "grad_norm": 1.573798418045044, + "learning_rate": 1.539785515417376e-05, + "loss": 0.2562, + "step": 1026 + }, + { + "epoch": 0.41152922337870296, + "grad_norm": 1.5501513481140137, + "learning_rate": 1.542136251639826e-05, + "loss": 0.3814, + "step": 1028 + }, + { + "epoch": 0.41232986389111287, + "grad_norm": 3.8987221717834473, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.3418, + "step": 1030 + }, + { + "epoch": 0.41313050440352284, + "grad_norm": 1.4468276500701904, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.2289, + "step": 1032 + }, + { + "epoch": 0.41393114491593275, + "grad_norm": 2.665318727493286, + "learning_rate": 1.5491629757363026e-05, + "loss": 0.4326, + "step": 1034 + }, + { + "epoch": 0.41473178542834266, + "grad_norm": 1.0114474296569824, + "learning_rate": 1.551496661252208e-05, + "loss": 0.1943, + "step": 1036 + }, + { + "epoch": 0.4155324259407526, + "grad_norm": 1.4317598342895508, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.4806, + "step": 1038 + }, + { + "epoch": 0.41633306645316254, + "grad_norm": 1.3969917297363281, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.3353, + "step": 1040 + }, + { + "epoch": 0.41713370696557245, + "grad_norm": 1.7000137567520142, + "learning_rate": 1.5584717950189353e-05, + "loss": 0.4404, + "step": 1042 + }, + { + "epoch": 0.41793434747798236, + "grad_norm": 1.5187366008758545, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.3055, + "step": 1044 + }, + { + "epoch": 0.4187349879903923, + "grad_norm": 1.3523516654968262, + "learning_rate": 1.563100100329731e-05, + "loss": 0.2781, + "step": 1046 + }, + { + "epoch": 0.41953562850280224, + "grad_norm": 1.4860470294952393, + "learning_rate": 1.565407663538797e-05, + "loss": 0.2385, + "step": 1048 + }, + { + "epoch": 0.42033626901521215, + "grad_norm": 1.341766357421875, + "learning_rate": 1.567710809736356e-05, + "loss": 0.3038, + "step": 1050 + }, + { + "epoch": 0.4211369095276221, + "grad_norm": 2.5805540084838867, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.3329, + "step": 1052 + }, + { + "epoch": 0.42193755004003203, + "grad_norm": 1.5754356384277344, + "learning_rate": 1.572303779162118e-05, + "loss": 0.2969, + "step": 1054 + }, + { + "epoch": 0.42273819055244194, + "grad_norm": 1.869661569595337, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.3481, + "step": 1056 + }, + { + "epoch": 0.4235388310648519, + "grad_norm": 4.04746150970459, + "learning_rate": 1.5768788650846677e-05, + "loss": 0.4086, + "step": 1058 + }, + { + "epoch": 0.4243394715772618, + "grad_norm": 1.66744863986969, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.3348, + "step": 1060 + }, + { + "epoch": 0.42514011208967173, + "grad_norm": 1.70791494846344, + "learning_rate": 1.581435924540481e-05, + "loss": 0.2452, + "step": 1062 + }, + { + "epoch": 0.42594075260208164, + "grad_norm": 1.8463305234909058, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.3188, + "step": 1064 + }, + { + "epoch": 0.4267413931144916, + "grad_norm": 1.4474669694900513, + "learning_rate": 1.5859748151293333e-05, + "loss": 0.3125, + "step": 1066 + }, + { + "epoch": 0.4275420336269015, + "grad_norm": 2.311518430709839, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.2225, + "step": 1068 + }, + { + "epoch": 0.42834267413931143, + "grad_norm": 1.1879162788391113, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.2031, + "step": 1070 + }, + { + "epoch": 0.4291433146517214, + "grad_norm": 1.5468870401382446, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.3823, + "step": 1072 + }, + { + "epoch": 0.4299439551641313, + "grad_norm": 2.032813310623169, + "learning_rate": 1.594997522948412e-05, + "loss": 0.4229, + "step": 1074 + }, + { + "epoch": 0.4307445956765412, + "grad_norm": 1.8879151344299316, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.2763, + "step": 1076 + }, + { + "epoch": 0.4315452361889512, + "grad_norm": 1.6454302072525024, + "learning_rate": 1.599481058234626e-05, + "loss": 0.3246, + "step": 1078 + }, + { + "epoch": 0.4323458767013611, + "grad_norm": 2.1233456134796143, + "learning_rate": 1.60171580983152e-05, + "loss": 0.3661, + "step": 1080 + }, + { + "epoch": 0.433146517213771, + "grad_norm": 2.297830820083618, + "learning_rate": 1.6039458607746607e-05, + "loss": 0.3971, + "step": 1082 + }, + { + "epoch": 0.4339471577261809, + "grad_norm": 1.6209495067596436, + "learning_rate": 1.606171193642703e-05, + "loss": 0.2508, + "step": 1084 + }, + { + "epoch": 0.4347477982385909, + "grad_norm": 3.041398286819458, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.4655, + "step": 1086 + }, + { + "epoch": 0.4355484387510008, + "grad_norm": 4.118158340454102, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.637, + "step": 1088 + }, + { + "epoch": 0.4363490792634107, + "grad_norm": 2.458219051361084, + "learning_rate": 1.6128187101364982e-05, + "loss": 0.4028, + "step": 1090 + }, + { + "epoch": 0.4371497197758207, + "grad_norm": 1.8116475343704224, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.3236, + "step": 1092 + }, + { + "epoch": 0.4379503602882306, + "grad_norm": 1.7430410385131836, + "learning_rate": 1.617226479697104e-05, + "loss": 0.3491, + "step": 1094 + }, + { + "epoch": 0.4387510008006405, + "grad_norm": 1.8438650369644165, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.3036, + "step": 1096 + }, + { + "epoch": 0.43955164131305047, + "grad_norm": 1.499780535697937, + "learning_rate": 1.621614961997806e-05, + "loss": 0.3179, + "step": 1098 + }, + { + "epoch": 0.4403522818254604, + "grad_norm": 1.6525261402130127, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.2631, + "step": 1100 + }, + { + "epoch": 0.4411529223378703, + "grad_norm": 2.6605048179626465, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.3041, + "step": 1102 + }, + { + "epoch": 0.4419535628502802, + "grad_norm": 2.301974296569824, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.5493, + "step": 1104 + }, + { + "epoch": 0.44275420336269017, + "grad_norm": 1.1462171077728271, + "learning_rate": 1.6303335168965474e-05, + "loss": 0.248, + "step": 1106 + }, + { + "epoch": 0.4435548438751001, + "grad_norm": 1.9763410091400146, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.2212, + "step": 1108 + }, + { + "epoch": 0.44435548438751, + "grad_norm": 2.3119008541107178, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.5295, + "step": 1110 + }, + { + "epoch": 0.44515612489991996, + "grad_norm": 4.5283403396606445, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.2965, + "step": 1112 + }, + { + "epoch": 0.44595676541232987, + "grad_norm": 1.5044605731964111, + "learning_rate": 1.6389732850821957e-05, + "loss": 0.3386, + "step": 1114 + }, + { + "epoch": 0.4467574059247398, + "grad_norm": 1.6019591093063354, + "learning_rate": 1.641120789935263e-05, + "loss": 0.3191, + "step": 1116 + }, + { + "epoch": 0.4475580464371497, + "grad_norm": 1.6397215127944946, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.3332, + "step": 1118 + }, + { + "epoch": 0.44835868694955966, + "grad_norm": 1.5365222692489624, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.2586, + "step": 1120 + }, + { + "epoch": 0.44915932746196957, + "grad_norm": 2.565610885620117, + "learning_rate": 1.6475331866519377e-05, + "loss": 0.3315, + "step": 1122 + }, + { + "epoch": 0.4499599679743795, + "grad_norm": 2.131470203399658, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.3994, + "step": 1124 + }, + { + "epoch": 0.45076060848678945, + "grad_norm": 2.0177133083343506, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.2143, + "step": 1126 + }, + { + "epoch": 0.45156124899919936, + "grad_norm": 1.864417552947998, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.2974, + "step": 1128 + }, + { + "epoch": 0.45236188951160927, + "grad_norm": 2.230363130569458, + "learning_rate": 1.6560121516856586e-05, + "loss": 0.3974, + "step": 1130 + }, + { + "epoch": 0.45316253002401924, + "grad_norm": 3.372126817703247, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.3886, + "step": 1132 + }, + { + "epoch": 0.45396317053642915, + "grad_norm": 2.50730299949646, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.2433, + "step": 1134 + }, + { + "epoch": 0.45476381104883906, + "grad_norm": 1.4936344623565674, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.2328, + "step": 1136 + }, + { + "epoch": 0.455564451561249, + "grad_norm": 1.5799744129180908, + "learning_rate": 1.6644091203796694e-05, + "loss": 0.3332, + "step": 1138 + }, + { + "epoch": 0.45636509207365894, + "grad_norm": 2.1539037227630615, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.3993, + "step": 1140 + }, + { + "epoch": 0.45716573258606885, + "grad_norm": 1.847986102104187, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.3637, + "step": 1142 + }, + { + "epoch": 0.45796637309847876, + "grad_norm": 1.6346838474273682, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.3352, + "step": 1144 + }, + { + "epoch": 0.45876701361088873, + "grad_norm": 1.6563067436218262, + "learning_rate": 1.6727230431791806e-05, + "loss": 0.3535, + "step": 1146 + }, + { + "epoch": 0.45956765412329864, + "grad_norm": 1.5965502262115479, + "learning_rate": 1.674788425949818e-05, + "loss": 0.2976, + "step": 1148 + }, + { + "epoch": 0.46036829463570855, + "grad_norm": 2.609780788421631, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.2879, + "step": 1150 + }, + { + "epoch": 0.4611689351481185, + "grad_norm": 1.5971169471740723, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.2805, + "step": 1152 + }, + { + "epoch": 0.46196957566052843, + "grad_norm": 3.0670669078826904, + "learning_rate": 1.6809528809094798e-05, + "loss": 0.4228, + "step": 1154 + }, + { + "epoch": 0.46277021617293834, + "grad_norm": 4.2267255783081055, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.4516, + "step": 1156 + }, + { + "epoch": 0.46357085668534825, + "grad_norm": 3.423551082611084, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.5932, + "step": 1158 + }, + { + "epoch": 0.4643714971977582, + "grad_norm": 1.961235761642456, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.3349, + "step": 1160 + }, + { + "epoch": 0.46517213771016813, + "grad_norm": 1.1842010021209717, + "learning_rate": 1.689097604905826e-05, + "loss": 0.2444, + "step": 1162 + }, + { + "epoch": 0.46597277822257804, + "grad_norm": 1.3347169160842896, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.3095, + "step": 1164 + }, + { + "epoch": 0.466773418734988, + "grad_norm": 0.8394862413406372, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.2555, + "step": 1166 + }, + { + "epoch": 0.4675740592473979, + "grad_norm": 2.916417121887207, + "learning_rate": 1.695149679205214e-05, + "loss": 0.3571, + "step": 1168 + }, + { + "epoch": 0.46837469975980783, + "grad_norm": 2.5485095977783203, + "learning_rate": 1.6971561971420222e-05, + "loss": 0.5511, + "step": 1170 + }, + { + "epoch": 0.4691753402722178, + "grad_norm": 0.8793407082557678, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.2727, + "step": 1172 + }, + { + "epoch": 0.4699759807846277, + "grad_norm": 1.8218306303024292, + "learning_rate": 1.701152878657196e-05, + "loss": 0.2877, + "step": 1174 + }, + { + "epoch": 0.4707766212970376, + "grad_norm": 1.897231936454773, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.367, + "step": 1176 + }, + { + "epoch": 0.47157726180944753, + "grad_norm": 2.1378321647644043, + "learning_rate": 1.705127650357662e-05, + "loss": 0.3645, + "step": 1178 + }, + { + "epoch": 0.4723779023218575, + "grad_norm": 1.332521677017212, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.3493, + "step": 1180 + }, + { + "epoch": 0.4731785428342674, + "grad_norm": 2.105557441711426, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.4896, + "step": 1182 + }, + { + "epoch": 0.4739791833466773, + "grad_norm": 1.24476957321167, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.2532, + "step": 1184 + }, + { + "epoch": 0.4747798238590873, + "grad_norm": 1.2456012964248657, + "learning_rate": 1.713010968184029e-05, + "loss": 0.2765, + "step": 1186 + }, + { + "epoch": 0.4755804643714972, + "grad_norm": 1.7597260475158691, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.3538, + "step": 1188 + }, + { + "epoch": 0.4763811048839071, + "grad_norm": 2.4455912113189697, + "learning_rate": 1.716919267969883e-05, + "loss": 0.4327, + "step": 1190 + }, + { + "epoch": 0.4771817453963171, + "grad_norm": 1.390151858329773, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.3041, + "step": 1192 + }, + { + "epoch": 0.477982385908727, + "grad_norm": 1.42485511302948, + "learning_rate": 1.7208051652686338e-05, + "loss": 0.2909, + "step": 1194 + }, + { + "epoch": 0.4787830264211369, + "grad_norm": 1.2891227006912231, + "learning_rate": 1.722739675011779e-05, + "loss": 0.3608, + "step": 1196 + }, + { + "epoch": 0.4795836669335468, + "grad_norm": 1.2989217042922974, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.3057, + "step": 1198 + }, + { + "epoch": 0.4803843074459568, + "grad_norm": 1.6624996662139893, + "learning_rate": 1.726591741122981e-05, + "loss": 0.3645, + "step": 1200 + }, + { + "epoch": 0.4811849479583667, + "grad_norm": 2.3487319946289062, + "learning_rate": 1.7285092673983753e-05, + "loss": 0.47, + "step": 1202 + }, + { + "epoch": 0.4819855884707766, + "grad_norm": 1.679592490196228, + "learning_rate": 1.730421102499021e-05, + "loss": 0.247, + "step": 1204 + }, + { + "epoch": 0.48278622898318657, + "grad_norm": 1.645397663116455, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.3037, + "step": 1206 + }, + { + "epoch": 0.4835868694955965, + "grad_norm": 1.534003734588623, + "learning_rate": 1.734227639478982e-05, + "loss": 0.2764, + "step": 1208 + }, + { + "epoch": 0.4843875100080064, + "grad_norm": 3.20851731300354, + "learning_rate": 1.736122311621314e-05, + "loss": 0.4382, + "step": 1210 + }, + { + "epoch": 0.4851881505204163, + "grad_norm": 3.1535825729370117, + "learning_rate": 1.738011233115165e-05, + "loss": 0.845, + "step": 1212 + }, + { + "epoch": 0.4859887910328263, + "grad_norm": 2.150028705596924, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.5208, + "step": 1214 + }, + { + "epoch": 0.4867894315452362, + "grad_norm": 2.560722827911377, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.3389, + "step": 1216 + }, + { + "epoch": 0.4875900720576461, + "grad_norm": 1.9925283193588257, + "learning_rate": 1.743643346367026e-05, + "loss": 0.357, + "step": 1218 + }, + { + "epoch": 0.48839071257005606, + "grad_norm": 1.5963295698165894, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.3669, + "step": 1220 + }, + { + "epoch": 0.489191353082466, + "grad_norm": 0.8267001509666443, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.3098, + "step": 1222 + }, + { + "epoch": 0.4899919935948759, + "grad_norm": 1.5689071416854858, + "learning_rate": 1.74922317526033e-05, + "loss": 0.2541, + "step": 1224 + }, + { + "epoch": 0.49079263410728585, + "grad_norm": 1.662617564201355, + "learning_rate": 1.7510714315655474e-05, + "loss": 0.3533, + "step": 1226 + }, + { + "epoch": 0.49159327461969576, + "grad_norm": 1.5118064880371094, + "learning_rate": 1.752913820438519e-05, + "loss": 0.3581, + "step": 1228 + }, + { + "epoch": 0.4923939151321057, + "grad_norm": 1.5861945152282715, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.4037, + "step": 1230 + }, + { + "epoch": 0.4931945556445156, + "grad_norm": 2.628358840942383, + "learning_rate": 1.756580938362096e-05, + "loss": 0.3387, + "step": 1232 + }, + { + "epoch": 0.49399519615692555, + "grad_norm": 2.7314889430999756, + "learning_rate": 1.758405638764873e-05, + "loss": 0.4167, + "step": 1234 + }, + { + "epoch": 0.49479583666933546, + "grad_norm": 2.4931931495666504, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.3993, + "step": 1236 + }, + { + "epoch": 0.4955964771817454, + "grad_norm": 2.8124592304229736, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.4893, + "step": 1238 + }, + { + "epoch": 0.49639711769415534, + "grad_norm": 1.703611135482788, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.3893, + "step": 1240 + }, + { + "epoch": 0.49719775820656525, + "grad_norm": 2.312281847000122, + "learning_rate": 1.7656450512470077e-05, + "loss": 0.3176, + "step": 1242 + }, + { + "epoch": 0.49799839871897517, + "grad_norm": 2.107011079788208, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.4972, + "step": 1244 + }, + { + "epoch": 0.49879903923138513, + "grad_norm": 2.6281487941741943, + "learning_rate": 1.7692289262315e-05, + "loss": 0.591, + "step": 1246 + }, + { + "epoch": 0.49959967974379504, + "grad_norm": 2.069932222366333, + "learning_rate": 1.771011856791273e-05, + "loss": 0.4235, + "step": 1248 + }, + { + "epoch": 0.500400320256205, + "grad_norm": 1.230250358581543, + "learning_rate": 1.7727887641425448e-05, + "loss": 0.2646, + "step": 1250 + }, + { + "epoch": 0.5012009607686149, + "grad_norm": 1.3627375364303589, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.2594, + "step": 1252 + }, + { + "epoch": 0.5020016012810248, + "grad_norm": 1.3321560621261597, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.3062, + "step": 1254 + }, + { + "epoch": 0.5028022417934348, + "grad_norm": 1.5635974407196045, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.2908, + "step": 1256 + }, + { + "epoch": 0.5036028823058447, + "grad_norm": 1.5955641269683838, + "learning_rate": 1.7798358845437754e-05, + "loss": 0.3061, + "step": 1258 + }, + { + "epoch": 0.5044035228182546, + "grad_norm": 1.5333644151687622, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.2826, + "step": 1260 + }, + { + "epoch": 0.5052041633306645, + "grad_norm": 1.8643745183944702, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.2786, + "step": 1262 + }, + { + "epoch": 0.5060048038430744, + "grad_norm": 1.484750509262085, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.3194, + "step": 1264 + }, + { + "epoch": 0.5068054443554844, + "grad_norm": 1.2632009983062744, + "learning_rate": 1.786785531616285e-05, + "loss": 0.3182, + "step": 1266 + }, + { + "epoch": 0.5076060848678943, + "grad_norm": 1.2482757568359375, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.2645, + "step": 1268 + }, + { + "epoch": 0.5084067253803043, + "grad_norm": 1.5875747203826904, + "learning_rate": 1.790223530721933e-05, + "loss": 0.3495, + "step": 1270 + }, + { + "epoch": 0.5092073658927142, + "grad_norm": 2.4716286659240723, + "learning_rate": 1.791933277039679e-05, + "loss": 0.2934, + "step": 1272 + }, + { + "epoch": 0.5100080064051241, + "grad_norm": 1.6958657503128052, + "learning_rate": 1.7936368367090577e-05, + "loss": 0.2542, + "step": 1274 + }, + { + "epoch": 0.510808646917534, + "grad_norm": 1.7094265222549438, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.305, + "step": 1276 + }, + { + "epoch": 0.5116092874299439, + "grad_norm": 1.1859160661697388, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.2911, + "step": 1278 + }, + { + "epoch": 0.5124099279423538, + "grad_norm": 1.8984309434890747, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.3645, + "step": 1280 + }, + { + "epoch": 0.5132105684547638, + "grad_norm": 1.6031272411346436, + "learning_rate": 1.800388943463047e-05, + "loss": 0.3818, + "step": 1282 + }, + { + "epoch": 0.5140112089671738, + "grad_norm": 3.7991015911102295, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.3179, + "step": 1284 + }, + { + "epoch": 0.5148118494795837, + "grad_norm": 2.31718373298645, + "learning_rate": 1.803727533238257e-05, + "loss": 0.339, + "step": 1286 + }, + { + "epoch": 0.5156124899919936, + "grad_norm": 1.5102046728134155, + "learning_rate": 1.805387416454847e-05, + "loss": 0.3523, + "step": 1288 + }, + { + "epoch": 0.5164131305044035, + "grad_norm": 1.8630553483963013, + "learning_rate": 1.8070410079182195e-05, + "loss": 0.2865, + "step": 1290 + }, + { + "epoch": 0.5172137710168134, + "grad_norm": 3.54534912109375, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.3522, + "step": 1292 + }, + { + "epoch": 0.5180144115292233, + "grad_norm": 1.676632285118103, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.3332, + "step": 1294 + }, + { + "epoch": 0.5188150520416333, + "grad_norm": 1.13118314743042, + "learning_rate": 1.811963902855447e-05, + "loss": 0.2477, + "step": 1296 + }, + { + "epoch": 0.5196156925540433, + "grad_norm": 1.0857309103012085, + "learning_rate": 1.813592198619035e-05, + "loss": 0.4036, + "step": 1298 + }, + { + "epoch": 0.5204163330664532, + "grad_norm": 1.4757311344146729, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.3041, + "step": 1300 + }, + { + "epoch": 0.5212169735788631, + "grad_norm": 1.7229307889938354, + "learning_rate": 1.816829709926509e-05, + "loss": 0.3051, + "step": 1302 + }, + { + "epoch": 0.522017614091273, + "grad_norm": 2.0348527431488037, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.4349, + "step": 1304 + }, + { + "epoch": 0.5228182546036829, + "grad_norm": 1.6300259828567505, + "learning_rate": 1.8200416967183785e-05, + "loss": 0.3885, + "step": 1306 + }, + { + "epoch": 0.5236188951160928, + "grad_norm": 1.4763151407241821, + "learning_rate": 1.821638087024396e-05, + "loss": 0.267, + "step": 1308 + }, + { + "epoch": 0.5244195356285029, + "grad_norm": 1.491414189338684, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.3098, + "step": 1310 + }, + { + "epoch": 0.5252201761409128, + "grad_norm": 3.098536968231201, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.3192, + "step": 1312 + }, + { + "epoch": 0.5260208166533227, + "grad_norm": 1.6139681339263916, + "learning_rate": 1.8263886960799055e-05, + "loss": 0.416, + "step": 1314 + }, + { + "epoch": 0.5268214571657326, + "grad_norm": 1.4375265836715698, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.3181, + "step": 1316 + }, + { + "epoch": 0.5276220976781425, + "grad_norm": 2.2026777267456055, + "learning_rate": 1.829523510316813e-05, + "loss": 0.5265, + "step": 1318 + }, + { + "epoch": 0.5284227381905524, + "grad_norm": 1.1767995357513428, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.3017, + "step": 1320 + }, + { + "epoch": 0.5292233787029623, + "grad_norm": 1.4497796297073364, + "learning_rate": 1.832632403378808e-05, + "loss": 0.5854, + "step": 1322 + }, + { + "epoch": 0.5300240192153723, + "grad_norm": 3.98708176612854, + "learning_rate": 1.834177099078887e-05, + "loss": 0.3613, + "step": 1324 + }, + { + "epoch": 0.5308246597277823, + "grad_norm": 1.6663038730621338, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.3976, + "step": 1326 + }, + { + "epoch": 0.5316253002401922, + "grad_norm": 2.8677451610565186, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.3194, + "step": 1328 + }, + { + "epoch": 0.5324259407526021, + "grad_norm": 3.1205546855926514, + "learning_rate": 1.8387720382009665e-05, + "loss": 0.314, + "step": 1330 + }, + { + "epoch": 0.533226581265012, + "grad_norm": 2.561814308166504, + "learning_rate": 1.840290595364436e-05, + "loss": 0.3039, + "step": 1332 + }, + { + "epoch": 0.5340272217774219, + "grad_norm": 1.3719024658203125, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.3041, + "step": 1334 + }, + { + "epoch": 0.5348278622898318, + "grad_norm": 2.2830185890197754, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.2719, + "step": 1336 + }, + { + "epoch": 0.5356285028022418, + "grad_norm": 1.7668317556381226, + "learning_rate": 1.844806833140501e-05, + "loss": 0.382, + "step": 1338 + }, + { + "epoch": 0.5364291433146517, + "grad_norm": 1.6818101406097412, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.3497, + "step": 1340 + }, + { + "epoch": 0.5372297838270617, + "grad_norm": 1.5738331079483032, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.322, + "step": 1342 + }, + { + "epoch": 0.5380304243394716, + "grad_norm": 2.4746949672698975, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.3817, + "step": 1344 + }, + { + "epoch": 0.5388310648518815, + "grad_norm": 1.6559386253356934, + "learning_rate": 1.85073603389569e-05, + "loss": 0.3487, + "step": 1346 + }, + { + "epoch": 0.5396317053642914, + "grad_norm": 1.8410348892211914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.3646, + "step": 1348 + }, + { + "epoch": 0.5404323458767014, + "grad_norm": 1.6128240823745728, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.2935, + "step": 1350 + }, + { + "epoch": 0.5412329863891113, + "grad_norm": 1.5008306503295898, + "learning_rate": 1.855113191959808e-05, + "loss": 0.2788, + "step": 1352 + }, + { + "epoch": 0.5420336269015212, + "grad_norm": 3.505697011947632, + "learning_rate": 1.856558899363248e-05, + "loss": 0.3815, + "step": 1354 + }, + { + "epoch": 0.5428342674139311, + "grad_norm": 1.9124630689620972, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.2985, + "step": 1356 + }, + { + "epoch": 0.5436349079263411, + "grad_norm": 1.3040895462036133, + "learning_rate": 1.85943022840117e-05, + "loss": 0.2866, + "step": 1358 + }, + { + "epoch": 0.544435548438751, + "grad_norm": 2.16788387298584, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.3643, + "step": 1360 + }, + { + "epoch": 0.5452361889511609, + "grad_norm": 3.0807197093963623, + "learning_rate": 1.862274701730967e-05, + "loss": 0.4147, + "step": 1362 + }, + { + "epoch": 0.5460368294635709, + "grad_norm": 1.1215083599090576, + "learning_rate": 1.86368683969594e-05, + "loss": 0.2691, + "step": 1364 + }, + { + "epoch": 0.5468374699759808, + "grad_norm": 2.8542909622192383, + "learning_rate": 1.865092230467769e-05, + "loss": 0.2716, + "step": 1366 + }, + { + "epoch": 0.5476381104883907, + "grad_norm": 2.451612949371338, + "learning_rate": 1.866490863067425e-05, + "loss": 0.4381, + "step": 1368 + }, + { + "epoch": 0.5484387510008006, + "grad_norm": 2.144955635070801, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.3994, + "step": 1370 + }, + { + "epoch": 0.5492393915132106, + "grad_norm": 1.9501135349273682, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.319, + "step": 1372 + }, + { + "epoch": 0.5500400320256205, + "grad_norm": 2.8958382606506348, + "learning_rate": 1.87064610283551e-05, + "loss": 0.3567, + "step": 1374 + }, + { + "epoch": 0.5508406725380304, + "grad_norm": 2.7687907218933105, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.5136, + "step": 1376 + }, + { + "epoch": 0.5516413130504404, + "grad_norm": 2.504481315612793, + "learning_rate": 1.873382272917545e-05, + "loss": 0.4716, + "step": 1378 + }, + { + "epoch": 0.5524419535628503, + "grad_norm": 2.945852279663086, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.2582, + "step": 1380 + }, + { + "epoch": 0.5532425940752602, + "grad_norm": 1.53285551071167, + "learning_rate": 1.876091151314196e-05, + "loss": 0.3038, + "step": 1382 + }, + { + "epoch": 0.5540432345876701, + "grad_norm": 0.8849778771400452, + "learning_rate": 1.877435329644691e-05, + "loss": 0.2309, + "step": 1384 + }, + { + "epoch": 0.55484387510008, + "grad_norm": 1.3683156967163086, + "learning_rate": 1.8787726533776996e-05, + "loss": 0.3463, + "step": 1386 + }, + { + "epoch": 0.55564451561249, + "grad_norm": 2.246551513671875, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.3661, + "step": 1388 + }, + { + "epoch": 0.5564451561248999, + "grad_norm": 2.4067773818969727, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.282, + "step": 1390 + }, + { + "epoch": 0.5572457966373099, + "grad_norm": 2.164475679397583, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.2905, + "step": 1392 + }, + { + "epoch": 0.5580464371497198, + "grad_norm": 1.106062412261963, + "learning_rate": 1.8840531941941415e-05, + "loss": 0.2194, + "step": 1394 + }, + { + "epoch": 0.5588470776621297, + "grad_norm": 3.0217344760894775, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.3491, + "step": 1396 + }, + { + "epoch": 0.5596477181745396, + "grad_norm": 1.7989710569381714, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.2669, + "step": 1398 + }, + { + "epoch": 0.5604483586869495, + "grad_norm": 2.619840621948242, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.4147, + "step": 1400 + }, + { + "epoch": 0.5612489991993594, + "grad_norm": 2.3668651580810547, + "learning_rate": 1.889223235340958e-05, + "loss": 0.2783, + "step": 1402 + }, + { + "epoch": 0.5620496397117695, + "grad_norm": 1.7369557619094849, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.3645, + "step": 1404 + }, + { + "epoch": 0.5628502802241794, + "grad_norm": 2.8007590770721436, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.3844, + "step": 1406 + }, + { + "epoch": 0.5636509207365893, + "grad_norm": 2.4085254669189453, + "learning_rate": 1.893027861533002e-05, + "loss": 0.3178, + "step": 1408 + }, + { + "epoch": 0.5644515612489992, + "grad_norm": 2.566793918609619, + "learning_rate": 1.894282130603823e-05, + "loss": 0.4696, + "step": 1410 + }, + { + "epoch": 0.5652522017614091, + "grad_norm": 1.9040905237197876, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.4439, + "step": 1412 + }, + { + "epoch": 0.566052842273819, + "grad_norm": 2.290382146835327, + "learning_rate": 1.896769700383315e-05, + "loss": 0.5491, + "step": 1414 + }, + { + "epoch": 0.5668534827862289, + "grad_norm": 2.11851167678833, + "learning_rate": 1.898002981658886e-05, + "loss": 0.4592, + "step": 1416 + }, + { + "epoch": 0.567654123298639, + "grad_norm": 2.0083091259002686, + "learning_rate": 1.899229247660769e-05, + "loss": 0.2861, + "step": 1418 + }, + { + "epoch": 0.5684547638110489, + "grad_norm": 3.0315170288085938, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.5702, + "step": 1420 + }, + { + "epoch": 0.5692554043234588, + "grad_norm": 1.246562123298645, + "learning_rate": 1.901660695579585e-05, + "loss": 0.2821, + "step": 1422 + }, + { + "epoch": 0.5700560448358687, + "grad_norm": 1.6468160152435303, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.4326, + "step": 1424 + }, + { + "epoch": 0.5708566853482786, + "grad_norm": 1.3408123254776, + "learning_rate": 1.9040639681612212e-05, + "loss": 0.2044, + "step": 1426 + }, + { + "epoch": 0.5716573258606885, + "grad_norm": 4.640432834625244, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.298, + "step": 1428 + }, + { + "epoch": 0.5724579663730984, + "grad_norm": 1.6885050535202026, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.304, + "step": 1430 + }, + { + "epoch": 0.5732586068855084, + "grad_norm": 1.2635599374771118, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.3293, + "step": 1432 + }, + { + "epoch": 0.5740592473979184, + "grad_norm": 2.5152063369750977, + "learning_rate": 1.9087856878032886e-05, + "loss": 0.5085, + "step": 1434 + }, + { + "epoch": 0.5748598879103283, + "grad_norm": 1.6995970010757446, + "learning_rate": 1.909948391856829e-05, + "loss": 0.4547, + "step": 1436 + }, + { + "epoch": 0.5756605284227382, + "grad_norm": 2.7735228538513184, + "learning_rate": 1.911103987318148e-05, + "loss": 0.3892, + "step": 1438 + }, + { + "epoch": 0.5764611689351481, + "grad_norm": 1.547253966331482, + "learning_rate": 1.912252465159637e-05, + "loss": 0.3228, + "step": 1440 + }, + { + "epoch": 0.577261809447558, + "grad_norm": 2.876004934310913, + "learning_rate": 1.913393816409294e-05, + "loss": 0.3644, + "step": 1442 + }, + { + "epoch": 0.578062449959968, + "grad_norm": 1.5494165420532227, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.3267, + "step": 1444 + }, + { + "epoch": 0.5788630904723779, + "grad_norm": 2.8303210735321045, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.3831, + "step": 1446 + }, + { + "epoch": 0.5796637309847879, + "grad_norm": 2.5205883979797363, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.366, + "step": 1448 + }, + { + "epoch": 0.5804643714971978, + "grad_norm": 1.7761430740356445, + "learning_rate": 1.9178877779995423e-05, + "loss": 0.3977, + "step": 1450 + }, + { + "epoch": 0.5812650120096077, + "grad_norm": 0.5572300553321838, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.1669, + "step": 1452 + }, + { + "epoch": 0.5820656525220176, + "grad_norm": 1.758844017982483, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.3484, + "step": 1454 + }, + { + "epoch": 0.5828662930344275, + "grad_norm": 2.3627676963806152, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.3524, + "step": 1456 + }, + { + "epoch": 0.5836669335468375, + "grad_norm": 2.3309192657470703, + "learning_rate": 1.9222670108643146e-05, + "loss": 0.2909, + "step": 1458 + }, + { + "epoch": 0.5844675740592474, + "grad_norm": 3.0152571201324463, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.3524, + "step": 1460 + }, + { + "epoch": 0.5852682145716573, + "grad_norm": 1.821384072303772, + "learning_rate": 1.924413432409622e-05, + "loss": 0.3061, + "step": 1462 + }, + { + "epoch": 0.5860688550840673, + "grad_norm": 3.454529047012329, + "learning_rate": 1.925475814968719e-05, + "loss": 0.3705, + "step": 1464 + }, + { + "epoch": 0.5868694955964772, + "grad_norm": 1.0908498764038086, + "learning_rate": 1.926530967634078e-05, + "loss": 0.2421, + "step": 1466 + }, + { + "epoch": 0.5876701361088871, + "grad_norm": 1.1369857788085938, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.3142, + "step": 1468 + }, + { + "epoch": 0.588470776621297, + "grad_norm": 1.4316868782043457, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.3041, + "step": 1470 + }, + { + "epoch": 0.589271417133707, + "grad_norm": 1.646378755569458, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.2398, + "step": 1472 + }, + { + "epoch": 0.5900720576461169, + "grad_norm": 3.3945441246032715, + "learning_rate": 1.9306791153479004e-05, + "loss": 0.7624, + "step": 1474 + }, + { + "epoch": 0.5908726981585268, + "grad_norm": 1.8712395429611206, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.2027, + "step": 1476 + }, + { + "epoch": 0.5916733386709367, + "grad_norm": 2.3308944702148438, + "learning_rate": 1.932709598214825e-05, + "loss": 0.3268, + "step": 1478 + }, + { + "epoch": 0.5924739791833467, + "grad_norm": 1.8116668462753296, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.3645, + "step": 1480 + }, + { + "epoch": 0.5932746196957566, + "grad_norm": 1.4934889078140259, + "learning_rate": 1.9347109355200672e-05, + "loss": 0.2953, + "step": 1482 + }, + { + "epoch": 0.5940752602081665, + "grad_norm": 1.6928720474243164, + "learning_rate": 1.935700655008199e-05, + "loss": 0.2783, + "step": 1484 + }, + { + "epoch": 0.5948759007205765, + "grad_norm": 3.628389596939087, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.5103, + "step": 1486 + }, + { + "epoch": 0.5956765412329864, + "grad_norm": 3.1892073154449463, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.4546, + "step": 1488 + }, + { + "epoch": 0.5964771817453963, + "grad_norm": 4.481762409210205, + "learning_rate": 1.9386259242048883e-05, + "loss": 0.3663, + "step": 1490 + }, + { + "epoch": 0.5972778222578062, + "grad_norm": 3.9146902561187744, + "learning_rate": 1.939586358789602e-05, + "loss": 0.3978, + "step": 1492 + }, + { + "epoch": 0.5980784627702161, + "grad_norm": 1.2030060291290283, + "learning_rate": 1.940539453247842e-05, + "loss": 0.3349, + "step": 1494 + }, + { + "epoch": 0.5988791032826261, + "grad_norm": 3.288029432296753, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.4896, + "step": 1496 + }, + { + "epoch": 0.5996797437950361, + "grad_norm": 1.210679292678833, + "learning_rate": 1.9424235920596863e-05, + "loss": 0.3148, + "step": 1498 + }, + { + "epoch": 0.600480384307446, + "grad_norm": 2.87322998046875, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.3419, + "step": 1500 + }, + { + "epoch": 0.6012810248198559, + "grad_norm": 1.8442732095718384, + "learning_rate": 1.944278281764342e-05, + "loss": 0.5117, + "step": 1502 + }, + { + "epoch": 0.6020816653322658, + "grad_norm": 2.1268885135650635, + "learning_rate": 1.945194565054276e-05, + "loss": 0.4148, + "step": 1504 + }, + { + "epoch": 0.6028823058446757, + "grad_norm": 2.246314764022827, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.451, + "step": 1506 + }, + { + "epoch": 0.6036829463570856, + "grad_norm": 3.9688937664031982, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.3892, + "step": 1508 + }, + { + "epoch": 0.6044835868694955, + "grad_norm": 1.7225316762924194, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.3468, + "step": 1510 + }, + { + "epoch": 0.6052842273819056, + "grad_norm": 1.6679717302322388, + "learning_rate": 1.948785788116329e-05, + "loss": 0.2983, + "step": 1512 + }, + { + "epoch": 0.6060848678943155, + "grad_norm": 2.3782589435577393, + "learning_rate": 1.9496650812887286e-05, + "loss": 0.415, + "step": 1514 + }, + { + "epoch": 0.6068855084067254, + "grad_norm": 1.731931209564209, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.2388, + "step": 1516 + }, + { + "epoch": 0.6076861489191353, + "grad_norm": 1.1310231685638428, + "learning_rate": 1.951401404235505e-05, + "loss": 0.2699, + "step": 1518 + }, + { + "epoch": 0.6084867894315452, + "grad_norm": 1.3082243204116821, + "learning_rate": 1.952258420445583e-05, + "loss": 0.3357, + "step": 1520 + }, + { + "epoch": 0.6092874299439551, + "grad_norm": 1.5922343730926514, + "learning_rate": 1.9531079975339912e-05, + "loss": 0.3041, + "step": 1522 + }, + { + "epoch": 0.610088070456365, + "grad_norm": 0.9816897511482239, + "learning_rate": 1.953950128863762e-05, + "loss": 0.2356, + "step": 1524 + }, + { + "epoch": 0.6108887109687751, + "grad_norm": 1.314747929573059, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.2469, + "step": 1526 + }, + { + "epoch": 0.611689351481185, + "grad_norm": 1.8525187969207764, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.3039, + "step": 1528 + }, + { + "epoch": 0.6124899919935949, + "grad_norm": 1.2565035820007324, + "learning_rate": 1.956431782804402e-05, + "loss": 0.3018, + "step": 1530 + }, + { + "epoch": 0.6132906325060048, + "grad_norm": 1.740918755531311, + "learning_rate": 1.957244065894066e-05, + "loss": 0.3488, + "step": 1532 + }, + { + "epoch": 0.6140912730184147, + "grad_norm": 2.3988301753997803, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.21, + "step": 1534 + }, + { + "epoch": 0.6148919135308246, + "grad_norm": 2.031578540802002, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.3486, + "step": 1536 + }, + { + "epoch": 0.6156925540432346, + "grad_norm": 1.3977347612380981, + "learning_rate": 1.9596360216530436e-05, + "loss": 0.3763, + "step": 1538 + }, + { + "epoch": 0.6164931945556446, + "grad_norm": 2.0349316596984863, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.433, + "step": 1540 + }, + { + "epoch": 0.6172938350680545, + "grad_norm": 2.2839879989624023, + "learning_rate": 1.961193185426459e-05, + "loss": 0.4329, + "step": 1542 + }, + { + "epoch": 0.6180944755804644, + "grad_norm": 3.0397114753723145, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.4896, + "step": 1544 + }, + { + "epoch": 0.6188951160928743, + "grad_norm": 2.039973497390747, + "learning_rate": 1.9627203135753573e-05, + "loss": 0.3835, + "step": 1546 + }, + { + "epoch": 0.6196957566052842, + "grad_norm": 1.585484504699707, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.2878, + "step": 1548 + }, + { + "epoch": 0.6204963971176941, + "grad_norm": 1.4757667779922485, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.2905, + "step": 1550 + }, + { + "epoch": 0.6212970376301041, + "grad_norm": 2.5283360481262207, + "learning_rate": 1.964954584871995e-05, + "loss": 0.4327, + "step": 1552 + }, + { + "epoch": 0.622097678142514, + "grad_norm": 1.9776372909545898, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.2786, + "step": 1554 + }, + { + "epoch": 0.622898318654924, + "grad_norm": 1.1000696420669556, + "learning_rate": 1.966406417240872e-05, + "loss": 0.2655, + "step": 1556 + }, + { + "epoch": 0.6236989591673339, + "grad_norm": 1.7607388496398926, + "learning_rate": 1.967121011775546e-05, + "loss": 0.3492, + "step": 1558 + }, + { + "epoch": 0.6244995996797438, + "grad_norm": 1.4707963466644287, + "learning_rate": 1.967828051080755e-05, + "loss": 0.4444, + "step": 1560 + }, + { + "epoch": 0.6253002401921537, + "grad_norm": 1.6824913024902344, + "learning_rate": 1.9685275296330497e-05, + "loss": 0.3807, + "step": 1562 + }, + { + "epoch": 0.6261008807045636, + "grad_norm": 1.9426971673965454, + "learning_rate": 1.969219441968046e-05, + "loss": 0.3421, + "step": 1564 + }, + { + "epoch": 0.6269015212169736, + "grad_norm": 2.099825620651245, + "learning_rate": 1.969903782680467e-05, + "loss": 0.2644, + "step": 1566 + }, + { + "epoch": 0.6277021617293835, + "grad_norm": 2.2114386558532715, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.3422, + "step": 1568 + }, + { + "epoch": 0.6285028022417934, + "grad_norm": 2.1358606815338135, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.5091, + "step": 1570 + }, + { + "epoch": 0.6293034427542034, + "grad_norm": 1.69170081615448, + "learning_rate": 1.971911321917015e-05, + "loss": 0.2875, + "step": 1572 + }, + { + "epoch": 0.6301040832666133, + "grad_norm": 2.767228126525879, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.3193, + "step": 1574 + }, + { + "epoch": 0.6309047237790232, + "grad_norm": 1.5974769592285156, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.2825, + "step": 1576 + }, + { + "epoch": 0.6317053642914331, + "grad_norm": 0.9872749447822571, + "learning_rate": 1.9738505276435692e-05, + "loss": 0.2785, + "step": 1578 + }, + { + "epoch": 0.6325060048038431, + "grad_norm": 1.7384178638458252, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.2835, + "step": 1580 + }, + { + "epoch": 0.633306645316253, + "grad_norm": 1.4108997583389282, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.1606, + "step": 1582 + }, + { + "epoch": 0.6341072858286629, + "grad_norm": 1.8386383056640625, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.2632, + "step": 1584 + }, + { + "epoch": 0.6349079263410728, + "grad_norm": 8.29961109161377, + "learning_rate": 1.9763296037475174e-05, + "loss": 0.7034, + "step": 1586 + }, + { + "epoch": 0.6357085668534828, + "grad_norm": 3.7611887454986572, + "learning_rate": 1.976930316809569e-05, + "loss": 0.5085, + "step": 1588 + }, + { + "epoch": 0.6365092073658927, + "grad_norm": 3.145343065261841, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.4504, + "step": 1590 + }, + { + "epoch": 0.6373098478783027, + "grad_norm": 3.132668972015381, + "learning_rate": 1.978108842718768e-05, + "loss": 0.4717, + "step": 1592 + }, + { + "epoch": 0.6381104883907126, + "grad_norm": 1.791672706604004, + "learning_rate": 1.9786866463591732e-05, + "loss": 0.2809, + "step": 1594 + }, + { + "epoch": 0.6389111289031225, + "grad_norm": 1.5206650495529175, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.3393, + "step": 1596 + }, + { + "epoch": 0.6397117694155324, + "grad_norm": 4.317498207092285, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.3352, + "step": 1598 + }, + { + "epoch": 0.6405124099279423, + "grad_norm": 3.9570133686065674, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.6817, + "step": 1600 + }, + { + "epoch": 0.6413130504403523, + "grad_norm": 2.829583168029785, + "learning_rate": 1.9809213608668185e-05, + "loss": 0.4976, + "step": 1602 + }, + { + "epoch": 0.6421136909527622, + "grad_norm": 2.2616565227508545, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.4154, + "step": 1604 + }, + { + "epoch": 0.6429143314651722, + "grad_norm": 2.1117727756500244, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.3812, + "step": 1606 + }, + { + "epoch": 0.6437149719775821, + "grad_norm": 2.2392687797546387, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.56, + "step": 1608 + }, + { + "epoch": 0.644515612489992, + "grad_norm": 2.5339088439941406, + "learning_rate": 1.983033467948784e-05, + "loss": 0.433, + "step": 1610 + }, + { + "epoch": 0.6453162530024019, + "grad_norm": 1.598150610923767, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.2401, + "step": 1612 + }, + { + "epoch": 0.6461168935148118, + "grad_norm": 2.28891921043396, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.4514, + "step": 1614 + }, + { + "epoch": 0.6469175340272217, + "grad_norm": 2.0882880687713623, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.3333, + "step": 1616 + }, + { + "epoch": 0.6477181745396317, + "grad_norm": 2.1075315475463867, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.4153, + "step": 1618 + }, + { + "epoch": 0.6485188150520417, + "grad_norm": 1.6866825819015503, + "learning_rate": 1.985500784388244e-05, + "loss": 0.2897, + "step": 1620 + }, + { + "epoch": 0.6493194555644516, + "grad_norm": 2.0826141834259033, + "learning_rate": 1.985971166354357e-05, + "loss": 0.3812, + "step": 1622 + }, + { + "epoch": 0.6501200960768615, + "grad_norm": 1.6899255514144897, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.404, + "step": 1624 + }, + { + "epoch": 0.6509207365892714, + "grad_norm": 2.319334030151367, + "learning_rate": 1.986888819206792e-05, + "loss": 0.3648, + "step": 1626 + }, + { + "epoch": 0.6517213771016813, + "grad_norm": 2.0903737545013428, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.416, + "step": 1628 + }, + { + "epoch": 0.6525220176140912, + "grad_norm": 1.9995465278625488, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.3647, + "step": 1630 + }, + { + "epoch": 0.6533226581265013, + "grad_norm": 2.022003412246704, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.3977, + "step": 1632 + }, + { + "epoch": 0.6541232986389112, + "grad_norm": 1.7696533203125, + "learning_rate": 1.988631581494365e-05, + "loss": 0.3336, + "step": 1634 + }, + { + "epoch": 0.6549239391513211, + "grad_norm": 2.0292069911956787, + "learning_rate": 1.989047972245129e-05, + "loss": 0.3813, + "step": 1636 + }, + { + "epoch": 0.655724579663731, + "grad_norm": 1.8103466033935547, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.3669, + "step": 1638 + }, + { + "epoch": 0.6565252201761409, + "grad_norm": 2.5568008422851562, + "learning_rate": 1.989857570980049e-05, + "loss": 0.339, + "step": 1640 + }, + { + "epoch": 0.6573258606885508, + "grad_norm": 1.730578064918518, + "learning_rate": 1.990250772639552e-05, + "loss": 0.3356, + "step": 1642 + }, + { + "epoch": 0.6581265012009607, + "grad_norm": 1.9249799251556396, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.3075, + "step": 1644 + }, + { + "epoch": 0.6589271417133707, + "grad_norm": 1.9832234382629395, + "learning_rate": 1.99101396518405e-05, + "loss": 0.3098, + "step": 1646 + }, + { + "epoch": 0.6597277822257807, + "grad_norm": 1.9964865446090698, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.3496, + "step": 1648 + }, + { + "epoch": 0.6605284227381906, + "grad_norm": 1.5802888870239258, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.2985, + "step": 1650 + }, + { + "epoch": 0.6613290632506005, + "grad_norm": 2.861604690551758, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.3647, + "step": 1652 + }, + { + "epoch": 0.6621297037630104, + "grad_norm": 2.0382912158966064, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.3485, + "step": 1654 + }, + { + "epoch": 0.6629303442754203, + "grad_norm": 2.7539308071136475, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.4386, + "step": 1656 + }, + { + "epoch": 0.6637309847878302, + "grad_norm": 1.9771264791488647, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.3251, + "step": 1658 + }, + { + "epoch": 0.6645316253002402, + "grad_norm": 1.2857084274291992, + "learning_rate": 1.99344112247369e-05, + "loss": 0.2196, + "step": 1660 + }, + { + "epoch": 0.6653322658126501, + "grad_norm": 2.098388433456421, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.3847, + "step": 1662 + }, + { + "epoch": 0.6661329063250601, + "grad_norm": 2.4926531314849854, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.4698, + "step": 1664 + }, + { + "epoch": 0.66693354683747, + "grad_norm": 2.1841230392456055, + "learning_rate": 1.9943649727366335e-05, + "loss": 0.3645, + "step": 1666 + }, + { + "epoch": 0.6677341873498799, + "grad_norm": 2.2118539810180664, + "learning_rate": 1.994657389848176e-05, + "loss": 0.4329, + "step": 1668 + }, + { + "epoch": 0.6685348278622898, + "grad_norm": 1.6676889657974243, + "learning_rate": 1.994942036613787e-05, + "loss": 0.2907, + "step": 1670 + }, + { + "epoch": 0.6693354683746997, + "grad_norm": 1.8910391330718994, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.3352, + "step": 1672 + }, + { + "epoch": 0.6701361088871097, + "grad_norm": 1.9776356220245361, + "learning_rate": 1.995488010273198e-05, + "loss": 0.5144, + "step": 1674 + }, + { + "epoch": 0.6709367493995196, + "grad_norm": 2.8098082542419434, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.5736, + "step": 1676 + }, + { + "epoch": 0.6717373899119295, + "grad_norm": 1.6069252490997314, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.3982, + "step": 1678 + }, + { + "epoch": 0.6725380304243395, + "grad_norm": 1.4546754360198975, + "learning_rate": 1.996248639549475e-05, + "loss": 0.3762, + "step": 1680 + }, + { + "epoch": 0.6733386709367494, + "grad_norm": 1.2577720880508423, + "learning_rate": 1.9964866196679105e-05, + "loss": 0.2659, + "step": 1682 + }, + { + "epoch": 0.6741393114491593, + "grad_norm": 1.3435182571411133, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.3059, + "step": 1684 + }, + { + "epoch": 0.6749399519615693, + "grad_norm": 1.304619550704956, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.3071, + "step": 1686 + }, + { + "epoch": 0.6757405924739792, + "grad_norm": 1.0225098133087158, + "learning_rate": 1.997153845074662e-05, + "loss": 0.232, + "step": 1688 + }, + { + "epoch": 0.6765412329863891, + "grad_norm": 1.476054072380066, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.3463, + "step": 1690 + }, + { + "epoch": 0.677341873498799, + "grad_norm": 1.9530041217803955, + "learning_rate": 1.997559715666073e-05, + "loss": 0.2726, + "step": 1692 + }, + { + "epoch": 0.678142514011209, + "grad_norm": 2.173236608505249, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.4151, + "step": 1694 + }, + { + "epoch": 0.6789431545236189, + "grad_norm": 2.147254705429077, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.4519, + "step": 1696 + }, + { + "epoch": 0.6797437950360288, + "grad_norm": 1.4378623962402344, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.3895, + "step": 1698 + }, + { + "epoch": 0.6805444355484388, + "grad_norm": 1.7947319746017456, + "learning_rate": 1.998277929093157e-05, + "loss": 0.4152, + "step": 1700 + }, + { + "epoch": 0.6813450760608487, + "grad_norm": 1.439346432685852, + "learning_rate": 1.998437989229673e-05, + "loss": 0.265, + "step": 1702 + }, + { + "epoch": 0.6821457165732586, + "grad_norm": 1.5022159814834595, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.2171, + "step": 1704 + }, + { + "epoch": 0.6829463570856685, + "grad_norm": 1.8143515586853027, + "learning_rate": 1.998734708672375e-05, + "loss": 0.365, + "step": 1706 + }, + { + "epoch": 0.6837469975980784, + "grad_norm": 1.6142022609710693, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.365, + "step": 1708 + }, + { + "epoch": 0.6845476381104884, + "grad_norm": 1.3724932670593262, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.2343, + "step": 1710 + }, + { + "epoch": 0.6853482786228983, + "grad_norm": 2.4668290615081787, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.3981, + "step": 1712 + }, + { + "epoch": 0.6861489191353083, + "grad_norm": 2.3768036365509033, + "learning_rate": 1.9992345130644747e-05, + "loss": 0.3046, + "step": 1714 + }, + { + "epoch": 0.6869495596477182, + "grad_norm": 1.8890491724014282, + "learning_rate": 1.999339951193407e-05, + "loss": 0.1828, + "step": 1716 + }, + { + "epoch": 0.6877502001601281, + "grad_norm": 2.3439788818359375, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.3812, + "step": 1718 + }, + { + "epoch": 0.688550840672538, + "grad_norm": 1.8840681314468384, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.3101, + "step": 1720 + }, + { + "epoch": 0.6893514811849479, + "grad_norm": 1.5289530754089355, + "learning_rate": 1.999609421031453e-05, + "loss": 0.3355, + "step": 1722 + }, + { + "epoch": 0.6901521216973578, + "grad_norm": 1.9118807315826416, + "learning_rate": 1.999683627122195e-05, + "loss": 0.3041, + "step": 1724 + }, + { + "epoch": 0.6909527622097679, + "grad_norm": 3.7248287200927734, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.3483, + "step": 1726 + }, + { + "epoch": 0.6917534027221778, + "grad_norm": 1.829479455947876, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.3041, + "step": 1728 + }, + { + "epoch": 0.6925540432345877, + "grad_norm": 1.9075309038162231, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.4161, + "step": 1730 + }, + { + "epoch": 0.6933546837469976, + "grad_norm": 1.5816259384155273, + "learning_rate": 1.99990235049015e-05, + "loss": 0.3039, + "step": 1732 + }, + { + "epoch": 0.6941553242594075, + "grad_norm": 1.5950456857681274, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.3088, + "step": 1734 + }, + { + "epoch": 0.6949559647718174, + "grad_norm": 1.812955617904663, + "learning_rate": 1.999964845810285e-05, + "loss": 0.3379, + "step": 1736 + }, + { + "epoch": 0.6957566052842273, + "grad_norm": 2.8463058471679688, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.3665, + "step": 1738 + }, + { + "epoch": 0.6965572457966374, + "grad_norm": 2.398483991622925, + "learning_rate": 1.999996093958578e-05, + "loss": 0.5323, + "step": 1740 + }, + { + "epoch": 0.6973578863090473, + "grad_norm": 2.0739543437957764, + "learning_rate": 2e-05, + "loss": 0.3979, + "step": 1742 + }, + { + "epoch": 0.6981585268214572, + "grad_norm": 1.9141533374786377, + "learning_rate": 1.999996093958578e-05, + "loss": 0.2767, + "step": 1744 + }, + { + "epoch": 0.6989591673338671, + "grad_norm": 1.9969048500061035, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.3667, + "step": 1746 + }, + { + "epoch": 0.699759807846277, + "grad_norm": 2.642072916030884, + "learning_rate": 1.999964845810285e-05, + "loss": 0.4516, + "step": 1748 + }, + { + "epoch": 0.7005604483586869, + "grad_norm": 2.105377435684204, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.3493, + "step": 1750 + }, + { + "epoch": 0.7013610888710968, + "grad_norm": 1.9239312410354614, + "learning_rate": 1.99990235049015e-05, + "loss": 0.3649, + "step": 1752 + }, + { + "epoch": 0.7021617293835068, + "grad_norm": 1.8371707201004028, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.3099, + "step": 1754 + }, + { + "epoch": 0.7029623698959168, + "grad_norm": 1.7146347761154175, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.3354, + "step": 1756 + }, + { + "epoch": 0.7037630104083267, + "grad_norm": 2.043933629989624, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.3266, + "step": 1758 + }, + { + "epoch": 0.7045636509207366, + "grad_norm": 2.1082658767700195, + "learning_rate": 1.999683627122195e-05, + "loss": 0.3354, + "step": 1760 + }, + { + "epoch": 0.7053642914331465, + "grad_norm": 2.024374008178711, + "learning_rate": 1.999609421031453e-05, + "loss": 0.3485, + "step": 1762 + }, + { + "epoch": 0.7061649319455564, + "grad_norm": 3.5911264419555664, + "learning_rate": 1.999527405909102e-05, + "loss": 0.4969, + "step": 1764 + }, + { + "epoch": 0.7069655724579663, + "grad_norm": 1.9529521465301514, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.3193, + "step": 1766 + }, + { + "epoch": 0.7077662129703763, + "grad_norm": 2.932257890701294, + "learning_rate": 1.999339951193407e-05, + "loss": 0.4698, + "step": 1768 + }, + { + "epoch": 0.7085668534827863, + "grad_norm": 1.4555257558822632, + "learning_rate": 1.999234513064475e-05, + "loss": 0.3098, + "step": 1770 + }, + { + "epoch": 0.7093674939951962, + "grad_norm": 2.2114884853363037, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.444, + "step": 1772 + }, + { + "epoch": 0.7101681345076061, + "grad_norm": 2.2350263595581055, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.4331, + "step": 1774 + }, + { + "epoch": 0.710968775020016, + "grad_norm": 2.3046798706054688, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.4512, + "step": 1776 + }, + { + "epoch": 0.7117694155324259, + "grad_norm": 1.868099570274353, + "learning_rate": 1.998734708672375e-05, + "loss": 0.2895, + "step": 1778 + }, + { + "epoch": 0.7125700560448359, + "grad_norm": 1.5812509059906006, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.3191, + "step": 1780 + }, + { + "epoch": 0.7133706965572458, + "grad_norm": 2.1828789710998535, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.3221, + "step": 1782 + }, + { + "epoch": 0.7141713370696557, + "grad_norm": 1.060969352722168, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.3214, + "step": 1784 + }, + { + "epoch": 0.7149719775820657, + "grad_norm": 2.0192513465881348, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.3357, + "step": 1786 + }, + { + "epoch": 0.7157726180944756, + "grad_norm": 2.0573973655700684, + "learning_rate": 1.997934414241799e-05, + "loss": 0.3818, + "step": 1788 + }, + { + "epoch": 0.7165732586068855, + "grad_norm": 2.1431705951690674, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.2809, + "step": 1790 + }, + { + "epoch": 0.7173738991192954, + "grad_norm": 1.3017196655273438, + "learning_rate": 1.997559715666073e-05, + "loss": 0.3041, + "step": 1792 + }, + { + "epoch": 0.7181745396317054, + "grad_norm": 1.2303800582885742, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.2517, + "step": 1794 + }, + { + "epoch": 0.7189751801441153, + "grad_norm": 1.7496037483215332, + "learning_rate": 1.997153845074662e-05, + "loss": 0.3072, + "step": 1796 + }, + { + "epoch": 0.7197758206565252, + "grad_norm": 2.213139057159424, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.3963, + "step": 1798 + }, + { + "epoch": 0.7205764611689351, + "grad_norm": 2.0531721115112305, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.3047, + "step": 1800 + }, + { + "epoch": 0.7213771016813451, + "grad_norm": 2.0746915340423584, + "learning_rate": 1.996486619667911e-05, + "loss": 0.4544, + "step": 1802 + }, + { + "epoch": 0.722177742193755, + "grad_norm": 1.6495476961135864, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.2591, + "step": 1804 + }, + { + "epoch": 0.7229783827061649, + "grad_norm": 1.4454381465911865, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.3819, + "step": 1806 + }, + { + "epoch": 0.7237790232185749, + "grad_norm": 1.4011479616165161, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.2869, + "step": 1808 + }, + { + "epoch": 0.7245796637309848, + "grad_norm": 1.1484624147415161, + "learning_rate": 1.995488010273198e-05, + "loss": 0.3336, + "step": 1810 + }, + { + "epoch": 0.7253803042433947, + "grad_norm": 1.4461569786071777, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.2591, + "step": 1812 + }, + { + "epoch": 0.7261809447558046, + "grad_norm": 2.097295045852661, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.4346, + "step": 1814 + }, + { + "epoch": 0.7269815852682145, + "grad_norm": 2.2927258014678955, + "learning_rate": 1.994657389848176e-05, + "loss": 0.4332, + "step": 1816 + }, + { + "epoch": 0.7277822257806245, + "grad_norm": 2.1687190532684326, + "learning_rate": 1.994364972736634e-05, + "loss": 0.4439, + "step": 1818 + }, + { + "epoch": 0.7285828662930345, + "grad_norm": 1.6976609230041504, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.3413, + "step": 1820 + }, + { + "epoch": 0.7293835068054444, + "grad_norm": 1.5597078800201416, + "learning_rate": 1.993756836673986e-05, + "loss": 0.329, + "step": 1822 + }, + { + "epoch": 0.7301841473178543, + "grad_norm": 3.9231553077697754, + "learning_rate": 1.99344112247369e-05, + "loss": 0.4717, + "step": 1824 + }, + { + "epoch": 0.7309847878302642, + "grad_norm": 1.3618313074111938, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.3042, + "step": 1826 + }, + { + "epoch": 0.7317854283426741, + "grad_norm": 1.3680285215377808, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.2868, + "step": 1828 + }, + { + "epoch": 0.732586068855084, + "grad_norm": 1.1294482946395874, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.2274, + "step": 1830 + }, + { + "epoch": 0.733386709367494, + "grad_norm": 2.4449658393859863, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.3237, + "step": 1832 + }, + { + "epoch": 0.734187349879904, + "grad_norm": 2.174004077911377, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.3978, + "step": 1834 + }, + { + "epoch": 0.7349879903923139, + "grad_norm": 1.5214084386825562, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.2529, + "step": 1836 + }, + { + "epoch": 0.7357886309047238, + "grad_norm": 1.9714912176132202, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.3269, + "step": 1838 + }, + { + "epoch": 0.7365892714171337, + "grad_norm": 1.6271953582763672, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.3839, + "step": 1840 + }, + { + "epoch": 0.7373899119295436, + "grad_norm": 1.8272771835327148, + "learning_rate": 1.9902507726395524e-05, + "loss": 0.291, + "step": 1842 + }, + { + "epoch": 0.7381905524419535, + "grad_norm": 2.568488597869873, + "learning_rate": 1.989857570980049e-05, + "loss": 0.3647, + "step": 1844 + }, + { + "epoch": 0.7389911929543634, + "grad_norm": 1.1207910776138306, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.2875, + "step": 1846 + }, + { + "epoch": 0.7397918334667735, + "grad_norm": 1.437292456626892, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.3354, + "step": 1848 + }, + { + "epoch": 0.7405924739791834, + "grad_norm": 3.117565870285034, + "learning_rate": 1.988631581494365e-05, + "loss": 0.3977, + "step": 1850 + }, + { + "epoch": 0.7413931144915933, + "grad_norm": 2.394374132156372, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.3704, + "step": 1852 + }, + { + "epoch": 0.7421937550040032, + "grad_norm": 1.0983376502990723, + "learning_rate": 1.987775633490599e-05, + "loss": 0.239, + "step": 1854 + }, + { + "epoch": 0.7429943955164131, + "grad_norm": 1.9386751651763916, + "learning_rate": 1.987336082924333e-05, + "loss": 0.159, + "step": 1856 + }, + { + "epoch": 0.743795036028823, + "grad_norm": 1.570999264717102, + "learning_rate": 1.986888819206792e-05, + "loss": 0.3409, + "step": 1858 + }, + { + "epoch": 0.7445956765412329, + "grad_norm": 1.2688533067703247, + "learning_rate": 1.986433845832037e-05, + "loss": 0.2468, + "step": 1860 + }, + { + "epoch": 0.745396317053643, + "grad_norm": 1.31288743019104, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.3348, + "step": 1862 + }, + { + "epoch": 0.7461969575660529, + "grad_norm": 2.3842742443084717, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.4697, + "step": 1864 + }, + { + "epoch": 0.7469975980784628, + "grad_norm": 2.031827449798584, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.3643, + "step": 1866 + }, + { + "epoch": 0.7477982385908727, + "grad_norm": 2.281920909881592, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.3495, + "step": 1868 + }, + { + "epoch": 0.7485988791032826, + "grad_norm": 2.4656429290771484, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.5286, + "step": 1870 + }, + { + "epoch": 0.7493995196156925, + "grad_norm": 1.8807258605957031, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.2894, + "step": 1872 + }, + { + "epoch": 0.7502001601281025, + "grad_norm": 2.5805678367614746, + "learning_rate": 1.983033467948784e-05, + "loss": 0.419, + "step": 1874 + }, + { + "epoch": 0.7510008006405124, + "grad_norm": 1.9327245950698853, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.4159, + "step": 1876 + }, + { + "epoch": 0.7518014411529224, + "grad_norm": 3.7604217529296875, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.4923, + "step": 1878 + }, + { + "epoch": 0.7526020816653323, + "grad_norm": 1.5705304145812988, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.2935, + "step": 1880 + }, + { + "epoch": 0.7534027221777422, + "grad_norm": 1.741519570350647, + "learning_rate": 1.980921360866819e-05, + "loss": 0.3039, + "step": 1882 + }, + { + "epoch": 0.7542033626901521, + "grad_norm": 1.8408738374710083, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.3651, + "step": 1884 + }, + { + "epoch": 0.755004003202562, + "grad_norm": 1.636572003364563, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.2527, + "step": 1886 + }, + { + "epoch": 0.755804643714972, + "grad_norm": 1.52336585521698, + "learning_rate": 1.979256804418418e-05, + "loss": 0.3087, + "step": 1888 + }, + { + "epoch": 0.7566052842273819, + "grad_norm": 1.4984246492385864, + "learning_rate": 1.978686646359173e-05, + "loss": 0.3647, + "step": 1890 + }, + { + "epoch": 0.7574059247397918, + "grad_norm": 1.474303960800171, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.3892, + "step": 1892 + }, + { + "epoch": 0.7582065652522018, + "grad_norm": 1.9434893131256104, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.3525, + "step": 1894 + }, + { + "epoch": 0.7590072057646117, + "grad_norm": 1.5981813669204712, + "learning_rate": 1.976930316809569e-05, + "loss": 0.2054, + "step": 1896 + }, + { + "epoch": 0.7598078462770216, + "grad_norm": 1.7283824682235718, + "learning_rate": 1.9763296037475177e-05, + "loss": 0.497, + "step": 1898 + }, + { + "epoch": 0.7606084867894315, + "grad_norm": 3.127582311630249, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.3665, + "step": 1900 + }, + { + "epoch": 0.7614091273018415, + "grad_norm": 1.5416374206542969, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.2765, + "step": 1902 + }, + { + "epoch": 0.7622097678142514, + "grad_norm": 1.8044326305389404, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.335, + "step": 1904 + }, + { + "epoch": 0.7630104083266613, + "grad_norm": 1.295658826828003, + "learning_rate": 1.9738505276435695e-05, + "loss": 0.3568, + "step": 1906 + }, + { + "epoch": 0.7638110488390712, + "grad_norm": 3.4765822887420654, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.4717, + "step": 1908 + }, + { + "epoch": 0.7646116893514812, + "grad_norm": 1.971982717514038, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.3189, + "step": 1910 + }, + { + "epoch": 0.7654123298638911, + "grad_norm": 1.3843297958374023, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.2906, + "step": 1912 + }, + { + "epoch": 0.7662129703763011, + "grad_norm": 1.1560331583023071, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.2861, + "step": 1914 + }, + { + "epoch": 0.767013610888711, + "grad_norm": 2.285705804824829, + "learning_rate": 1.970580546424186e-05, + "loss": 0.3191, + "step": 1916 + }, + { + "epoch": 0.7678142514011209, + "grad_norm": 1.7245991230010986, + "learning_rate": 1.969903782680467e-05, + "loss": 0.3644, + "step": 1918 + }, + { + "epoch": 0.7686148919135308, + "grad_norm": 1.9701316356658936, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.3644, + "step": 1920 + }, + { + "epoch": 0.7694155324259407, + "grad_norm": 2.021693468093872, + "learning_rate": 1.96852752963305e-05, + "loss": 0.3189, + "step": 1922 + }, + { + "epoch": 0.7702161729383507, + "grad_norm": 2.016028881072998, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.3221, + "step": 1924 + }, + { + "epoch": 0.7710168134507606, + "grad_norm": 1.0357534885406494, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.2617, + "step": 1926 + }, + { + "epoch": 0.7718174539631706, + "grad_norm": 2.87754225730896, + "learning_rate": 1.966406417240872e-05, + "loss": 0.3494, + "step": 1928 + }, + { + "epoch": 0.7726180944755805, + "grad_norm": 1.6966907978057861, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.3056, + "step": 1930 + }, + { + "epoch": 0.7734187349879904, + "grad_norm": 3.053945779800415, + "learning_rate": 1.964954584871995e-05, + "loss": 0.335, + "step": 1932 + }, + { + "epoch": 0.7742193755004003, + "grad_norm": 1.5729306936264038, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.2273, + "step": 1934 + }, + { + "epoch": 0.7750200160128102, + "grad_norm": 1.5992624759674072, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.3039, + "step": 1936 + }, + { + "epoch": 0.7758206565252201, + "grad_norm": 1.8731753826141357, + "learning_rate": 1.9627203135753576e-05, + "loss": 0.4546, + "step": 1938 + }, + { + "epoch": 0.77662129703763, + "grad_norm": 2.0451152324676514, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.4328, + "step": 1940 + }, + { + "epoch": 0.7774219375500401, + "grad_norm": 2.9926600456237793, + "learning_rate": 1.961193185426459e-05, + "loss": 0.4439, + "step": 1942 + }, + { + "epoch": 0.77822257806245, + "grad_norm": 1.3710049390792847, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.3228, + "step": 1944 + }, + { + "epoch": 0.7790232185748599, + "grad_norm": 1.8223562240600586, + "learning_rate": 1.959636021653044e-05, + "loss": 0.2196, + "step": 1946 + }, + { + "epoch": 0.7798238590872698, + "grad_norm": 12.421886444091797, + "learning_rate": 1.958846191576357e-05, + "loss": 0.3992, + "step": 1948 + }, + { + "epoch": 0.7806244995996797, + "grad_norm": 3.162137508392334, + "learning_rate": 1.958048870913786e-05, + "loss": 0.4887, + "step": 1950 + }, + { + "epoch": 0.7814251401120896, + "grad_norm": 1.5685700178146362, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.3999, + "step": 1952 + }, + { + "epoch": 0.7822257806244995, + "grad_norm": 1.047526478767395, + "learning_rate": 1.9564317828044022e-05, + "loss": 0.2964, + "step": 1954 + }, + { + "epoch": 0.7830264211369096, + "grad_norm": 1.5971962213516235, + "learning_rate": 1.955612027990415e-05, + "loss": 0.3818, + "step": 1956 + }, + { + "epoch": 0.7838270616493195, + "grad_norm": 1.7540650367736816, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.2614, + "step": 1958 + }, + { + "epoch": 0.7846277021617294, + "grad_norm": 1.6170439720153809, + "learning_rate": 1.953950128863763e-05, + "loss": 0.4159, + "step": 1960 + }, + { + "epoch": 0.7854283426741393, + "grad_norm": 1.774768590927124, + "learning_rate": 1.9531079975339915e-05, + "loss": 0.2985, + "step": 1962 + }, + { + "epoch": 0.7862289831865492, + "grad_norm": 2.8288071155548096, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.3193, + "step": 1964 + }, + { + "epoch": 0.7870296236989591, + "grad_norm": 1.0530158281326294, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.2721, + "step": 1966 + }, + { + "epoch": 0.7878302642113691, + "grad_norm": 2.178260087966919, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.3338, + "step": 1968 + }, + { + "epoch": 0.7886309047237791, + "grad_norm": 1.8271315097808838, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.2653, + "step": 1970 + }, + { + "epoch": 0.789431545236189, + "grad_norm": 2.5157248973846436, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.4282, + "step": 1972 + }, + { + "epoch": 0.7902321857485989, + "grad_norm": 2.0321598052978516, + "learning_rate": 1.947899082950751e-05, + "loss": 0.3999, + "step": 1974 + }, + { + "epoch": 0.7910328262610088, + "grad_norm": 1.4095289707183838, + "learning_rate": 1.947004972719008e-05, + "loss": 0.272, + "step": 1976 + }, + { + "epoch": 0.7918334667734187, + "grad_norm": 2.15885329246521, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.3703, + "step": 1978 + }, + { + "epoch": 0.7926341072858286, + "grad_norm": 1.0161467790603638, + "learning_rate": 1.945194565054276e-05, + "loss": 0.2716, + "step": 1980 + }, + { + "epoch": 0.7934347477982386, + "grad_norm": 1.5447043180465698, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.183, + "step": 1982 + }, + { + "epoch": 0.7942353883106485, + "grad_norm": 2.7783126831054688, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.4738, + "step": 1984 + }, + { + "epoch": 0.7950360288230585, + "grad_norm": 2.5871963500976562, + "learning_rate": 1.942423592059687e-05, + "loss": 0.5289, + "step": 1986 + }, + { + "epoch": 0.7958366693354684, + "grad_norm": 1.9891127347946167, + "learning_rate": 1.941485200133955e-05, + "loss": 0.3097, + "step": 1988 + }, + { + "epoch": 0.7966373098478783, + "grad_norm": 1.6599457263946533, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.3525, + "step": 1990 + }, + { + "epoch": 0.7974379503602882, + "grad_norm": 2.059659719467163, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.4157, + "step": 1992 + }, + { + "epoch": 0.7982385908726981, + "grad_norm": 2.074141502380371, + "learning_rate": 1.938625924204888e-05, + "loss": 0.3664, + "step": 1994 + }, + { + "epoch": 0.7990392313851081, + "grad_norm": 3.233135223388672, + "learning_rate": 1.937658156996694e-05, + "loss": 0.315, + "step": 1996 + }, + { + "epoch": 0.799839871897518, + "grad_norm": 1.3279279470443726, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.2708, + "step": 1998 + }, + { + "epoch": 0.800640512409928, + "grad_norm": 2.44195556640625, + "learning_rate": 1.9357006550082e-05, + "loss": 0.4517, + "step": 2000 + }, + { + "epoch": 0.8014411529223379, + "grad_norm": 1.5708134174346924, + "learning_rate": 1.9347109355200676e-05, + "loss": 0.2531, + "step": 2002 + }, + { + "epoch": 0.8022417934347478, + "grad_norm": 1.996256947517395, + "learning_rate": 1.933713913992671e-05, + "loss": 0.3665, + "step": 2004 + }, + { + "epoch": 0.8030424339471577, + "grad_norm": 4.001356601715088, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.6356, + "step": 2006 + }, + { + "epoch": 0.8038430744595677, + "grad_norm": 1.9797258377075195, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.4543, + "step": 2008 + }, + { + "epoch": 0.8046437149719776, + "grad_norm": 2.3345234394073486, + "learning_rate": 1.9306791153479017e-05, + "loss": 0.3973, + "step": 2010 + }, + { + "epoch": 0.8054443554843875, + "grad_norm": 1.7019286155700684, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.4593, + "step": 2012 + }, + { + "epoch": 0.8062449959967974, + "grad_norm": 2.0184085369110107, + "learning_rate": 1.928619550368371e-05, + "loss": 0.3218, + "step": 2014 + }, + { + "epoch": 0.8070456365092074, + "grad_norm": 1.8129746913909912, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.3335, + "step": 2016 + }, + { + "epoch": 0.8078462770216173, + "grad_norm": 1.7098060846328735, + "learning_rate": 1.9265309676340783e-05, + "loss": 0.3333, + "step": 2018 + }, + { + "epoch": 0.8086469175340272, + "grad_norm": 0.9989462494850159, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.1609, + "step": 2020 + }, + { + "epoch": 0.8094475580464372, + "grad_norm": 2.2205958366394043, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.3761, + "step": 2022 + }, + { + "epoch": 0.8102481985588471, + "grad_norm": 1.7769612073898315, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.3819, + "step": 2024 + }, + { + "epoch": 0.811048839071257, + "grad_norm": 1.78672456741333, + "learning_rate": 1.9222670108643156e-05, + "loss": 0.3334, + "step": 2026 + }, + { + "epoch": 0.8118494795836669, + "grad_norm": 1.6391185522079468, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.3494, + "step": 2028 + }, + { + "epoch": 0.8126501200960768, + "grad_norm": 2.059591293334961, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.3569, + "step": 2030 + }, + { + "epoch": 0.8134507606084868, + "grad_norm": 1.0531617403030396, + "learning_rate": 1.918993363660975e-05, + "loss": 0.2272, + "step": 2032 + }, + { + "epoch": 0.8142514011208967, + "grad_norm": 1.6785855293273926, + "learning_rate": 1.9178877779995416e-05, + "loss": 0.3888, + "step": 2034 + }, + { + "epoch": 0.8150520416333067, + "grad_norm": 1.6212927103042603, + "learning_rate": 1.916775021722745e-05, + "loss": 0.2469, + "step": 2036 + }, + { + "epoch": 0.8158526821457166, + "grad_norm": 1.666360855102539, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.3057, + "step": 2038 + }, + { + "epoch": 0.8166533226581265, + "grad_norm": 1.119423747062683, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.2483, + "step": 2040 + }, + { + "epoch": 0.8174539631705364, + "grad_norm": 1.4689459800720215, + "learning_rate": 1.9133938164092942e-05, + "loss": 0.3193, + "step": 2042 + }, + { + "epoch": 0.8182546036829463, + "grad_norm": 2.374619722366333, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.392, + "step": 2044 + }, + { + "epoch": 0.8190552441953562, + "grad_norm": 0.9161267280578613, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.3654, + "step": 2046 + }, + { + "epoch": 0.8198558847077662, + "grad_norm": 2.5642735958099365, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.3722, + "step": 2048 + }, + { + "epoch": 0.8206565252201762, + "grad_norm": 2.255756139755249, + "learning_rate": 1.90878568780329e-05, + "loss": 0.4158, + "step": 2050 + }, + { + "epoch": 0.8214571657325861, + "grad_norm": 2.712294816970825, + "learning_rate": 1.907615884240668e-05, + "loss": 0.4348, + "step": 2052 + }, + { + "epoch": 0.822257806244996, + "grad_norm": 2.815875291824341, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.3334, + "step": 2054 + }, + { + "epoch": 0.8230584467574059, + "grad_norm": 3.5685105323791504, + "learning_rate": 1.905255015197982e-05, + "loss": 0.3867, + "step": 2056 + }, + { + "epoch": 0.8238590872698158, + "grad_norm": 2.0109736919403076, + "learning_rate": 1.9040639681612216e-05, + "loss": 0.3102, + "step": 2058 + }, + { + "epoch": 0.8246597277822257, + "grad_norm": 1.801324725151062, + "learning_rate": 1.902865858501845e-05, + "loss": 0.367, + "step": 2060 + }, + { + "epoch": 0.8254603682946358, + "grad_norm": 2.429637908935547, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.4506, + "step": 2062 + }, + { + "epoch": 0.8262610088070457, + "grad_norm": 4.220076084136963, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.4889, + "step": 2064 + }, + { + "epoch": 0.8270616493194556, + "grad_norm": 1.7491154670715332, + "learning_rate": 1.8992292476607695e-05, + "loss": 0.3978, + "step": 2066 + }, + { + "epoch": 0.8278622898318655, + "grad_norm": 1.3625667095184326, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.262, + "step": 2068 + }, + { + "epoch": 0.8286629303442754, + "grad_norm": 1.9692350625991821, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.3666, + "step": 2070 + }, + { + "epoch": 0.8294635708566853, + "grad_norm": 1.4705450534820557, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.3392, + "step": 2072 + }, + { + "epoch": 0.8302642113690952, + "grad_norm": 2.2307722568511963, + "learning_rate": 1.8942821306038227e-05, + "loss": 0.4189, + "step": 2074 + }, + { + "epoch": 0.8310648518815053, + "grad_norm": 1.3099098205566406, + "learning_rate": 1.893027861533003e-05, + "loss": 0.1155, + "step": 2076 + }, + { + "epoch": 0.8318654923939152, + "grad_norm": 2.007342576980591, + "learning_rate": 1.891766616054545e-05, + "loss": 0.3979, + "step": 2078 + }, + { + "epoch": 0.8326661329063251, + "grad_norm": 2.180225372314453, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.5035, + "step": 2080 + }, + { + "epoch": 0.833466773418735, + "grad_norm": 1.4589061737060547, + "learning_rate": 1.8892232353409582e-05, + "loss": 0.2365, + "step": 2082 + }, + { + "epoch": 0.8342674139311449, + "grad_norm": 1.668036699295044, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.306, + "step": 2084 + }, + { + "epoch": 0.8350680544435548, + "grad_norm": 4.015446186065674, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.3815, + "step": 2086 + }, + { + "epoch": 0.8358686949559647, + "grad_norm": 1.9432164430618286, + "learning_rate": 1.885356089304285e-05, + "loss": 0.3037, + "step": 2088 + }, + { + "epoch": 0.8366693354683747, + "grad_norm": 1.6915879249572754, + "learning_rate": 1.884053194194143e-05, + "loss": 0.3645, + "step": 2090 + }, + { + "epoch": 0.8374699759807847, + "grad_norm": 2.150740385055542, + "learning_rate": 1.882743392787207e-05, + "loss": 0.4156, + "step": 2092 + }, + { + "epoch": 0.8382706164931946, + "grad_norm": 1.346297264099121, + "learning_rate": 1.881426695315756e-05, + "loss": 0.4019, + "step": 2094 + }, + { + "epoch": 0.8390712570056045, + "grad_norm": 2.627264976501465, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.4343, + "step": 2096 + }, + { + "epoch": 0.8398718975180144, + "grad_norm": 2.673058271408081, + "learning_rate": 1.8787726533777003e-05, + "loss": 0.2909, + "step": 2098 + }, + { + "epoch": 0.8406725380304243, + "grad_norm": 1.092536211013794, + "learning_rate": 1.877435329644691e-05, + "loss": 0.2628, + "step": 2100 + }, + { + "epoch": 0.8414731785428343, + "grad_norm": 1.7632068395614624, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.398, + "step": 2102 + }, + { + "epoch": 0.8422738190552442, + "grad_norm": 1.4025750160217285, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.3355, + "step": 2104 + }, + { + "epoch": 0.8430744595676541, + "grad_norm": 1.5885928869247437, + "learning_rate": 1.8733822729175455e-05, + "loss": 0.3024, + "step": 2106 + }, + { + "epoch": 0.8438751000800641, + "grad_norm": 2.2402327060699463, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.3101, + "step": 2108 + }, + { + "epoch": 0.844675740592474, + "grad_norm": 1.8983564376831055, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.306, + "step": 2110 + }, + { + "epoch": 0.8454763811048839, + "grad_norm": 1.9304133653640747, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.3359, + "step": 2112 + }, + { + "epoch": 0.8462770216172938, + "grad_norm": 2.3422863483428955, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.3998, + "step": 2114 + }, + { + "epoch": 0.8470776621297038, + "grad_norm": 1.9561142921447754, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.4347, + "step": 2116 + }, + { + "epoch": 0.8478783026421137, + "grad_norm": 1.385205864906311, + "learning_rate": 1.86509223046777e-05, + "loss": 0.3135, + "step": 2118 + }, + { + "epoch": 0.8486789431545236, + "grad_norm": 2.467895746231079, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.3529, + "step": 2120 + }, + { + "epoch": 0.8494795836669335, + "grad_norm": 1.082444429397583, + "learning_rate": 1.8622747017309676e-05, + "loss": 0.3351, + "step": 2122 + }, + { + "epoch": 0.8502802241793435, + "grad_norm": 1.7314245700836182, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.3486, + "step": 2124 + }, + { + "epoch": 0.8510808646917534, + "grad_norm": 3.74178147315979, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.2909, + "step": 2126 + }, + { + "epoch": 0.8518815052041633, + "grad_norm": 1.416300892829895, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.3041, + "step": 2128 + }, + { + "epoch": 0.8526821457165733, + "grad_norm": 1.9148643016815186, + "learning_rate": 1.8565588993632498e-05, + "loss": 0.2852, + "step": 2130 + }, + { + "epoch": 0.8534827862289832, + "grad_norm": 2.8833649158477783, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.4148, + "step": 2132 + }, + { + "epoch": 0.8542834267413931, + "grad_norm": 1.8299134969711304, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.3817, + "step": 2134 + }, + { + "epoch": 0.855084067253803, + "grad_norm": 7.417710781097412, + "learning_rate": 1.852201747853807e-05, + "loss": 0.3564, + "step": 2136 + }, + { + "epoch": 0.855884707766213, + "grad_norm": 0.9801858067512512, + "learning_rate": 1.8507360338956896e-05, + "loss": 0.3422, + "step": 2138 + }, + { + "epoch": 0.8566853482786229, + "grad_norm": 2.2677383422851562, + "learning_rate": 1.849263673917196e-05, + "loss": 0.342, + "step": 2140 + }, + { + "epoch": 0.8574859887910328, + "grad_norm": 1.9444514513015747, + "learning_rate": 1.847784679420527e-05, + "loss": 0.2724, + "step": 2142 + }, + { + "epoch": 0.8582866293034428, + "grad_norm": 1.122397541999817, + "learning_rate": 1.846299061959706e-05, + "loss": 0.3828, + "step": 2144 + }, + { + "epoch": 0.8590872698158527, + "grad_norm": 1.7408475875854492, + "learning_rate": 1.8448068331405018e-05, + "loss": 0.4158, + "step": 2146 + }, + { + "epoch": 0.8598879103282626, + "grad_norm": 2.814633369445801, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.3921, + "step": 2148 + }, + { + "epoch": 0.8606885508406725, + "grad_norm": 2.0172040462493896, + "learning_rate": 1.841802588108161e-05, + "loss": 0.3486, + "step": 2150 + }, + { + "epoch": 0.8614891913530824, + "grad_norm": 1.2121919393539429, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.2157, + "step": 2152 + }, + { + "epoch": 0.8622898318654924, + "grad_norm": 1.157463788986206, + "learning_rate": 1.838772038200968e-05, + "loss": 0.3221, + "step": 2154 + }, + { + "epoch": 0.8630904723779024, + "grad_norm": 1.3618022203445435, + "learning_rate": 1.837246928480848e-05, + "loss": 0.2363, + "step": 2156 + }, + { + "epoch": 0.8638911128903123, + "grad_norm": 1.2547794580459595, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.3719, + "step": 2158 + }, + { + "epoch": 0.8646917534027222, + "grad_norm": 1.5543447732925415, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.3848, + "step": 2160 + }, + { + "epoch": 0.8654923939151321, + "grad_norm": 1.102698564529419, + "learning_rate": 1.8326324033788087e-05, + "loss": 0.2659, + "step": 2162 + }, + { + "epoch": 0.866293034427542, + "grad_norm": 1.3055088520050049, + "learning_rate": 1.831081203085415e-05, + "loss": 0.3131, + "step": 2164 + }, + { + "epoch": 0.8670936749399519, + "grad_norm": 1.503210425376892, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.3728, + "step": 2166 + }, + { + "epoch": 0.8678943154523618, + "grad_norm": 2.6354424953460693, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.3492, + "step": 2168 + }, + { + "epoch": 0.8686949559647719, + "grad_norm": 1.0803451538085938, + "learning_rate": 1.8263886960799072e-05, + "loss": 0.1852, + "step": 2170 + }, + { + "epoch": 0.8694955964771818, + "grad_norm": 1.9417184591293335, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.6958, + "step": 2172 + }, + { + "epoch": 0.8702962369895917, + "grad_norm": 1.3278688192367554, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.277, + "step": 2174 + }, + { + "epoch": 0.8710968775020016, + "grad_norm": 1.8817968368530273, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.371, + "step": 2176 + }, + { + "epoch": 0.8718975180144115, + "grad_norm": 1.768061637878418, + "learning_rate": 1.820041696718378e-05, + "loss": 0.3336, + "step": 2178 + }, + { + "epoch": 0.8726981585268214, + "grad_norm": 1.7466208934783936, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.3488, + "step": 2180 + }, + { + "epoch": 0.8734987990392313, + "grad_norm": 1.656299352645874, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.3194, + "step": 2182 + }, + { + "epoch": 0.8742994395516414, + "grad_norm": 1.2894476652145386, + "learning_rate": 1.815214138532966e-05, + "loss": 0.2171, + "step": 2184 + }, + { + "epoch": 0.8751000800640513, + "grad_norm": 2.72453236579895, + "learning_rate": 1.8135921986190358e-05, + "loss": 0.3977, + "step": 2186 + }, + { + "epoch": 0.8759007205764612, + "grad_norm": 1.4677865505218506, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.4038, + "step": 2188 + }, + { + "epoch": 0.8767013610888711, + "grad_norm": 1.6058014631271362, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.2911, + "step": 2190 + }, + { + "epoch": 0.877502001601281, + "grad_norm": 1.2809691429138184, + "learning_rate": 1.808688294710378e-05, + "loss": 0.2486, + "step": 2192 + }, + { + "epoch": 0.8783026421136909, + "grad_norm": 4.73563289642334, + "learning_rate": 1.807041007918221e-05, + "loss": 0.5509, + "step": 2194 + }, + { + "epoch": 0.8791032826261009, + "grad_norm": 1.631977915763855, + "learning_rate": 1.805387416454849e-05, + "loss": 0.2226, + "step": 2196 + }, + { + "epoch": 0.8799039231385108, + "grad_norm": 1.765251874923706, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.2969, + "step": 2198 + }, + { + "epoch": 0.8807045636509208, + "grad_norm": 1.2714011669158936, + "learning_rate": 1.802061371235592e-05, + "loss": 0.2913, + "step": 2200 + }, + { + "epoch": 0.8815052041633307, + "grad_norm": 2.4311609268188477, + "learning_rate": 1.8003889434630476e-05, + "loss": 0.2984, + "step": 2202 + }, + { + "epoch": 0.8823058446757406, + "grad_norm": 1.2438703775405884, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.2275, + "step": 2204 + }, + { + "epoch": 0.8831064851881505, + "grad_norm": 2.042208433151245, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.3336, + "step": 2206 + }, + { + "epoch": 0.8839071257005604, + "grad_norm": 2.147754669189453, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.4519, + "step": 2208 + }, + { + "epoch": 0.8847077662129704, + "grad_norm": 1.3914358615875244, + "learning_rate": 1.7936368367090583e-05, + "loss": 0.2644, + "step": 2210 + }, + { + "epoch": 0.8855084067253803, + "grad_norm": 0.9618812203407288, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.3154, + "step": 2212 + }, + { + "epoch": 0.8863090472377902, + "grad_norm": 1.919188141822815, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.3267, + "step": 2214 + }, + { + "epoch": 0.8871096877502002, + "grad_norm": 0.9547030329704285, + "learning_rate": 1.7885076111125e-05, + "loss": 0.2377, + "step": 2216 + }, + { + "epoch": 0.8879103282626101, + "grad_norm": 2.0331990718841553, + "learning_rate": 1.7867855316162846e-05, + "loss": 0.3098, + "step": 2218 + }, + { + "epoch": 0.88871096877502, + "grad_norm": 2.1551802158355713, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.3055, + "step": 2220 + }, + { + "epoch": 0.8895116092874299, + "grad_norm": 2.6411960124969482, + "learning_rate": 1.783322946823638e-05, + "loss": 0.3391, + "step": 2222 + }, + { + "epoch": 0.8903122497998399, + "grad_norm": 3.4217631816864014, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.446, + "step": 2224 + }, + { + "epoch": 0.8911128903122498, + "grad_norm": 1.6346440315246582, + "learning_rate": 1.779835884543776e-05, + "loss": 0.3331, + "step": 2226 + }, + { + "epoch": 0.8919135308246597, + "grad_norm": 1.2116870880126953, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.2513, + "step": 2228 + }, + { + "epoch": 0.8927141713370697, + "grad_norm": 3.3853113651275635, + "learning_rate": 1.776324453741365e-05, + "loss": 0.332, + "step": 2230 + }, + { + "epoch": 0.8935148118494796, + "grad_norm": 1.1895833015441895, + "learning_rate": 1.774559634403971e-05, + "loss": 0.4146, + "step": 2232 + }, + { + "epoch": 0.8943154523618895, + "grad_norm": 1.5353069305419922, + "learning_rate": 1.7727887641425465e-05, + "loss": 0.1944, + "step": 2234 + }, + { + "epoch": 0.8951160928742994, + "grad_norm": 1.3402515649795532, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.3449, + "step": 2236 + }, + { + "epoch": 0.8959167333867094, + "grad_norm": 2.3798153400421143, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.2952, + "step": 2238 + }, + { + "epoch": 0.8967173738991193, + "grad_norm": 2.175363063812256, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.3036, + "step": 2240 + }, + { + "epoch": 0.8975180144115292, + "grad_norm": 2.1126277446746826, + "learning_rate": 1.765645051247007e-05, + "loss": 0.3537, + "step": 2242 + }, + { + "epoch": 0.8983186549239391, + "grad_norm": 2.2855224609375, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.2823, + "step": 2244 + }, + { + "epoch": 0.899119295436349, + "grad_norm": 4.008136749267578, + "learning_rate": 1.762037251178961e-05, + "loss": 0.3813, + "step": 2246 + }, + { + "epoch": 0.899919935948759, + "grad_norm": 2.064683675765991, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.3844, + "step": 2248 + }, + { + "epoch": 0.900720576461169, + "grad_norm": 1.8862433433532715, + "learning_rate": 1.7584056387648738e-05, + "loss": 0.2905, + "step": 2250 + }, + { + "epoch": 0.9015212169735789, + "grad_norm": 1.1513227224349976, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.2755, + "step": 2252 + }, + { + "epoch": 0.9023218574859888, + "grad_norm": 2.250540256500244, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.3493, + "step": 2254 + }, + { + "epoch": 0.9031224979983987, + "grad_norm": 2.252394914627075, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.4156, + "step": 2256 + }, + { + "epoch": 0.9039231385108086, + "grad_norm": 3.321204900741577, + "learning_rate": 1.7510714315655467e-05, + "loss": 0.2359, + "step": 2258 + }, + { + "epoch": 0.9047237790232185, + "grad_norm": 1.8961786031723022, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.3763, + "step": 2260 + }, + { + "epoch": 0.9055244195356285, + "grad_norm": 5.603198051452637, + "learning_rate": 1.7473690659616e-05, + "loss": 0.4147, + "step": 2262 + }, + { + "epoch": 0.9063250600480385, + "grad_norm": 1.967324137687683, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.3803, + "step": 2264 + }, + { + "epoch": 0.9071257005604484, + "grad_norm": 1.7079949378967285, + "learning_rate": 1.743643346367027e-05, + "loss": 0.2422, + "step": 2266 + }, + { + "epoch": 0.9079263410728583, + "grad_norm": 2.956300735473633, + "learning_rate": 1.741771765176815e-05, + "loss": 0.4325, + "step": 2268 + }, + { + "epoch": 0.9087269815852682, + "grad_norm": 4.0792717933654785, + "learning_rate": 1.739894389204122e-05, + "loss": 0.2889, + "step": 2270 + }, + { + "epoch": 0.9095276220976781, + "grad_norm": 3.0052907466888428, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.3492, + "step": 2272 + }, + { + "epoch": 0.910328262610088, + "grad_norm": 1.9260516166687012, + "learning_rate": 1.7361223116213146e-05, + "loss": 0.2157, + "step": 2274 + }, + { + "epoch": 0.911128903122498, + "grad_norm": 2.2401070594787598, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.4381, + "step": 2276 + }, + { + "epoch": 0.911929543634908, + "grad_norm": 3.1091361045837402, + "learning_rate": 1.732327231489503e-05, + "loss": 0.5509, + "step": 2278 + }, + { + "epoch": 0.9127301841473179, + "grad_norm": 3.085422992706299, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.3034, + "step": 2280 + }, + { + "epoch": 0.9135308246597278, + "grad_norm": 1.362424612045288, + "learning_rate": 1.728509267398376e-05, + "loss": 0.2819, + "step": 2282 + }, + { + "epoch": 0.9143314651721377, + "grad_norm": 1.938071846961975, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.4157, + "step": 2284 + }, + { + "epoch": 0.9151321056845476, + "grad_norm": 2.6250786781311035, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.2395, + "step": 2286 + }, + { + "epoch": 0.9159327461969575, + "grad_norm": 2.193608283996582, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.4752, + "step": 2288 + }, + { + "epoch": 0.9167333867093675, + "grad_norm": 3.8884313106536865, + "learning_rate": 1.7208051652686348e-05, + "loss": 0.4807, + "step": 2290 + }, + { + "epoch": 0.9175340272217775, + "grad_norm": 2.270993947982788, + "learning_rate": 1.718865024535822e-05, + "loss": 0.5029, + "step": 2292 + }, + { + "epoch": 0.9183346677341874, + "grad_norm": 3.053530216217041, + "learning_rate": 1.716919267969884e-05, + "loss": 0.4387, + "step": 2294 + }, + { + "epoch": 0.9191353082465973, + "grad_norm": 3.09564208984375, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.4897, + "step": 2296 + }, + { + "epoch": 0.9199359487590072, + "grad_norm": 2.8178999423980713, + "learning_rate": 1.7130109681840298e-05, + "loss": 0.4343, + "step": 2298 + }, + { + "epoch": 0.9207365892714171, + "grad_norm": 1.1022289991378784, + "learning_rate": 1.711048455496075e-05, + "loss": 0.2158, + "step": 2300 + }, + { + "epoch": 0.921537229783827, + "grad_norm": 2.5116710662841797, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.3824, + "step": 2302 + }, + { + "epoch": 0.922337870296237, + "grad_norm": 2.1685965061187744, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.3641, + "step": 2304 + }, + { + "epoch": 0.923138510808647, + "grad_norm": 2.2170488834381104, + "learning_rate": 1.705127650357663e-05, + "loss": 0.5286, + "step": 2306 + }, + { + "epoch": 0.9239391513210569, + "grad_norm": 1.856063723564148, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.3665, + "step": 2308 + }, + { + "epoch": 0.9247397918334668, + "grad_norm": 1.4921120405197144, + "learning_rate": 1.701152878657197e-05, + "loss": 0.304, + "step": 2310 + }, + { + "epoch": 0.9255404323458767, + "grad_norm": 1.0860601663589478, + "learning_rate": 1.699157268836863e-05, + "loss": 0.3511, + "step": 2312 + }, + { + "epoch": 0.9263410728582866, + "grad_norm": 1.22074294090271, + "learning_rate": 1.697156197142023e-05, + "loss": 0.2247, + "step": 2314 + }, + { + "epoch": 0.9271417133706965, + "grad_norm": 1.3671035766601562, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.322, + "step": 2316 + }, + { + "epoch": 0.9279423538831065, + "grad_norm": 1.022951364517212, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.2716, + "step": 2318 + }, + { + "epoch": 0.9287429943955164, + "grad_norm": 1.383567452430725, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.3353, + "step": 2320 + }, + { + "epoch": 0.9295436349079264, + "grad_norm": 1.357797384262085, + "learning_rate": 1.6890976049058267e-05, + "loss": 0.3041, + "step": 2322 + }, + { + "epoch": 0.9303442754203363, + "grad_norm": 1.723847508430481, + "learning_rate": 1.687069459175619e-05, + "loss": 0.2878, + "step": 2324 + }, + { + "epoch": 0.9311449159327462, + "grad_norm": 1.6303527355194092, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.3664, + "step": 2326 + }, + { + "epoch": 0.9319455564451561, + "grad_norm": 1.5210304260253906, + "learning_rate": 1.682997081270568e-05, + "loss": 0.3484, + "step": 2328 + }, + { + "epoch": 0.932746196957566, + "grad_norm": 1.7018555402755737, + "learning_rate": 1.6809528809094805e-05, + "loss": 0.4159, + "step": 2330 + }, + { + "epoch": 0.933546837469976, + "grad_norm": 1.6109943389892578, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.3816, + "step": 2332 + }, + { + "epoch": 0.9343474779823859, + "grad_norm": 1.4665824174880981, + "learning_rate": 1.67684853721737e-05, + "loss": 0.304, + "step": 2334 + }, + { + "epoch": 0.9351481184947958, + "grad_norm": 1.3999348878860474, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.2471, + "step": 2336 + }, + { + "epoch": 0.9359487590072058, + "grad_norm": 0.7288603782653809, + "learning_rate": 1.6727230431791826e-05, + "loss": 0.183, + "step": 2338 + }, + { + "epoch": 0.9367493995196157, + "grad_norm": 1.4613627195358276, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.2936, + "step": 2340 + }, + { + "epoch": 0.9375500400320256, + "grad_norm": 1.5691227912902832, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.3096, + "step": 2342 + }, + { + "epoch": 0.9383506805444356, + "grad_norm": 1.2199252843856812, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.2644, + "step": 2344 + }, + { + "epoch": 0.9391513210568455, + "grad_norm": 1.3933489322662354, + "learning_rate": 1.66440912037967e-05, + "loss": 0.2167, + "step": 2346 + }, + { + "epoch": 0.9399519615692554, + "grad_norm": 1.2608258724212646, + "learning_rate": 1.662317622936933e-05, + "loss": 0.2389, + "step": 2348 + }, + { + "epoch": 0.9407526020816653, + "grad_norm": 2.2554931640625, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.4923, + "step": 2350 + }, + { + "epoch": 0.9415532425940752, + "grad_norm": 1.5096319913864136, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.3702, + "step": 2352 + }, + { + "epoch": 0.9423538831064852, + "grad_norm": 1.6621919870376587, + "learning_rate": 1.6560121516856592e-05, + "loss": 0.3056, + "step": 2354 + }, + { + "epoch": 0.9431545236188951, + "grad_norm": 1.8173503875732422, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.4329, + "step": 2356 + }, + { + "epoch": 0.9439551641313051, + "grad_norm": 2.180309772491455, + "learning_rate": 1.651782852712194e-05, + "loss": 0.3645, + "step": 2358 + }, + { + "epoch": 0.944755804643715, + "grad_norm": 1.9194144010543823, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.3647, + "step": 2360 + }, + { + "epoch": 0.9455564451561249, + "grad_norm": 1.9975744485855103, + "learning_rate": 1.6475331866519387e-05, + "loss": 0.2964, + "step": 2362 + }, + { + "epoch": 0.9463570856685348, + "grad_norm": 1.280642032623291, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.2246, + "step": 2364 + }, + { + "epoch": 0.9471577261809447, + "grad_norm": 2.4046499729156494, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.4896, + "step": 2366 + }, + { + "epoch": 0.9479583666933546, + "grad_norm": 1.7787344455718994, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.4233, + "step": 2368 + }, + { + "epoch": 0.9487590072057646, + "grad_norm": 1.7199516296386719, + "learning_rate": 1.6389732850821964e-05, + "loss": 0.2399, + "step": 2370 + }, + { + "epoch": 0.9495596477181746, + "grad_norm": 1.9237161874771118, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.3977, + "step": 2372 + }, + { + "epoch": 0.9503602882305845, + "grad_norm": 1.4867990016937256, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.3039, + "step": 2374 + }, + { + "epoch": 0.9511609287429944, + "grad_norm": 0.9809843301773071, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.2332, + "step": 2376 + }, + { + "epoch": 0.9519615692554043, + "grad_norm": 1.433406949043274, + "learning_rate": 1.6303335168965495e-05, + "loss": 0.3187, + "step": 2378 + }, + { + "epoch": 0.9527622097678142, + "grad_norm": 2.198542356491089, + "learning_rate": 1.628161222025089e-05, + "loss": 0.4516, + "step": 2380 + }, + { + "epoch": 0.9535628502802241, + "grad_norm": 1.5030566453933716, + "learning_rate": 1.625984019906122e-05, + "loss": 0.229, + "step": 2382 + }, + { + "epoch": 0.9543634907926342, + "grad_norm": 2.46911883354187, + "learning_rate": 1.623801927548132e-05, + "loss": 0.4546, + "step": 2384 + }, + { + "epoch": 0.9551641313050441, + "grad_norm": 2.0465636253356934, + "learning_rate": 1.6216149619978057e-05, + "loss": 0.3582, + "step": 2386 + }, + { + "epoch": 0.955964771817454, + "grad_norm": 2.4855711460113525, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.4147, + "step": 2388 + }, + { + "epoch": 0.9567654123298639, + "grad_norm": 1.0863906145095825, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.2137, + "step": 2390 + }, + { + "epoch": 0.9575660528422738, + "grad_norm": 1.2886790037155151, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.2723, + "step": 2392 + }, + { + "epoch": 0.9583666933546837, + "grad_norm": 1.9359744787216187, + "learning_rate": 1.612818710136499e-05, + "loss": 0.4382, + "step": 2394 + }, + { + "epoch": 0.9591673338670936, + "grad_norm": 2.349468946456909, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.3176, + "step": 2396 + }, + { + "epoch": 0.9599679743795037, + "grad_norm": 1.4763431549072266, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.2908, + "step": 2398 + }, + { + "epoch": 0.9607686148919136, + "grad_norm": 1.4675970077514648, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.264, + "step": 2400 + }, + { + "epoch": 0.9615692554043235, + "grad_norm": 1.6704723834991455, + "learning_rate": 1.60394586077466e-05, + "loss": 0.2905, + "step": 2402 + }, + { + "epoch": 0.9623698959167334, + "grad_norm": 2.001084089279175, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.2908, + "step": 2404 + }, + { + "epoch": 0.9631705364291433, + "grad_norm": 1.6939998865127563, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.3815, + "step": 2406 + }, + { + "epoch": 0.9639711769415532, + "grad_norm": 1.3423815965652466, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.2753, + "step": 2408 + }, + { + "epoch": 0.9647718174539631, + "grad_norm": 2.180957078933716, + "learning_rate": 1.594997522948413e-05, + "loss": 0.3702, + "step": 2410 + }, + { + "epoch": 0.9655724579663731, + "grad_norm": 1.8678675889968872, + "learning_rate": 1.592748774284844e-05, + "loss": 0.3803, + "step": 2412 + }, + { + "epoch": 0.966373098478783, + "grad_norm": 1.3624097108840942, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.2163, + "step": 2414 + }, + { + "epoch": 0.967173738991193, + "grad_norm": 1.7047420740127563, + "learning_rate": 1.588237402753703e-05, + "loss": 0.3037, + "step": 2416 + }, + { + "epoch": 0.9679743795036029, + "grad_norm": 2.1652908325195312, + "learning_rate": 1.5859748151293354e-05, + "loss": 0.4348, + "step": 2418 + }, + { + "epoch": 0.9687750200160128, + "grad_norm": 2.1502034664154053, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.5367, + "step": 2420 + }, + { + "epoch": 0.9695756605284227, + "grad_norm": 2.214573621749878, + "learning_rate": 1.581435924540482e-05, + "loss": 0.4345, + "step": 2422 + }, + { + "epoch": 0.9703763010408326, + "grad_norm": 2.386291980743408, + "learning_rate": 1.579159657034185e-05, + "loss": 0.3995, + "step": 2424 + }, + { + "epoch": 0.9711769415532426, + "grad_norm": 1.7125344276428223, + "learning_rate": 1.5768788650846674e-05, + "loss": 0.1954, + "step": 2426 + }, + { + "epoch": 0.9719775820656525, + "grad_norm": 1.9502114057540894, + "learning_rate": 1.574593566509664e-05, + "loss": 0.3097, + "step": 2428 + }, + { + "epoch": 0.9727782225780625, + "grad_norm": 1.636108636856079, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.3057, + "step": 2430 + }, + { + "epoch": 0.9735788630904724, + "grad_norm": 1.849358081817627, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.2466, + "step": 2432 + }, + { + "epoch": 0.9743795036028823, + "grad_norm": 2.233956813812256, + "learning_rate": 1.5677108097363565e-05, + "loss": 0.2029, + "step": 2434 + }, + { + "epoch": 0.9751801441152922, + "grad_norm": 1.4080708026885986, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.2466, + "step": 2436 + }, + { + "epoch": 0.9759807846277022, + "grad_norm": 7.048268795013428, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.3756, + "step": 2438 + }, + { + "epoch": 0.9767814251401121, + "grad_norm": 1.6451681852340698, + "learning_rate": 1.560788138136029e-05, + "loss": 0.4074, + "step": 2440 + }, + { + "epoch": 0.977582065652522, + "grad_norm": 2.132462501525879, + "learning_rate": 1.5584717950189373e-05, + "loss": 0.3494, + "step": 2442 + }, + { + "epoch": 0.978382706164932, + "grad_norm": 1.5286540985107422, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.3569, + "step": 2444 + }, + { + "epoch": 0.9791833466773419, + "grad_norm": 2.292171001434326, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.2952, + "step": 2446 + }, + { + "epoch": 0.9799839871897518, + "grad_norm": 1.7551361322402954, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.319, + "step": 2448 + }, + { + "epoch": 0.9807846277021617, + "grad_norm": 2.480187177658081, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.4924, + "step": 2450 + }, + { + "epoch": 0.9815852682145717, + "grad_norm": 1.2223252058029175, + "learning_rate": 1.546825000113736e-05, + "loss": 0.238, + "step": 2452 + }, + { + "epoch": 0.9823859087269816, + "grad_norm": 2.2826361656188965, + "learning_rate": 1.544482752648966e-05, + "loss": 0.4694, + "step": 2454 + }, + { + "epoch": 0.9831865492393915, + "grad_norm": 1.833378791809082, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.2588, + "step": 2456 + }, + { + "epoch": 0.9839871897518014, + "grad_norm": 2.4878454208374023, + "learning_rate": 1.539785515417377e-05, + "loss": 0.2934, + "step": 2458 + }, + { + "epoch": 0.9847878302642114, + "grad_norm": 1.8636246919631958, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.4155, + "step": 2460 + }, + { + "epoch": 0.9855884707766213, + "grad_norm": 2.147399663925171, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.349, + "step": 2462 + }, + { + "epoch": 0.9863891112890312, + "grad_norm": 1.7579898834228516, + "learning_rate": 1.532708079276186e-05, + "loss": 0.4157, + "step": 2464 + }, + { + "epoch": 0.9871897518014412, + "grad_norm": 1.7056758403778076, + "learning_rate": 1.5303405861706567e-05, + "loss": 0.333, + "step": 2466 + }, + { + "epoch": 0.9879903923138511, + "grad_norm": 1.3905423879623413, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.2723, + "step": 2468 + }, + { + "epoch": 0.988791032826261, + "grad_norm": 1.97187340259552, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.3804, + "step": 2470 + }, + { + "epoch": 0.9895916733386709, + "grad_norm": 1.5064928531646729, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.3523, + "step": 2472 + }, + { + "epoch": 0.9903923138510808, + "grad_norm": 2.6863813400268555, + "learning_rate": 1.5208293685377362e-05, + "loss": 0.3994, + "step": 2474 + }, + { + "epoch": 0.9911929543634908, + "grad_norm": 1.5400047302246094, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.2381, + "step": 2476 + }, + { + "epoch": 0.9919935948759008, + "grad_norm": 1.2665342092514038, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.2615, + "step": 2478 + }, + { + "epoch": 0.9927942353883107, + "grad_norm": 1.7095102071762085, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.4349, + "step": 2480 + }, + { + "epoch": 0.9935948759007206, + "grad_norm": 1.7871954441070557, + "learning_rate": 1.5112530513457251e-05, + "loss": 0.3351, + "step": 2482 + }, + { + "epoch": 0.9943955164131305, + "grad_norm": 1.3601773977279663, + "learning_rate": 1.50884894033418e-05, + "loss": 0.3057, + "step": 2484 + }, + { + "epoch": 0.9951961569255404, + "grad_norm": 1.7827646732330322, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.384, + "step": 2486 + }, + { + "epoch": 0.9959967974379503, + "grad_norm": 1.3904117345809937, + "learning_rate": 1.504028811613027e-05, + "loss": 0.2399, + "step": 2488 + }, + { + "epoch": 0.9967974379503602, + "grad_norm": 1.5541892051696777, + "learning_rate": 1.5016128315586636e-05, + "loss": 0.2894, + "step": 2490 + }, + { + "epoch": 0.9975980784627703, + "grad_norm": 1.7922195196151733, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.3505, + "step": 2492 + }, + { + "epoch": 0.9983987189751802, + "grad_norm": 2.5908703804016113, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.4183, + "step": 2494 + }, + { + "epoch": 0.9991993594875901, + "grad_norm": 1.553139567375183, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.3666, + "step": 2496 + }, + { + "epoch": 1.0, + "grad_norm": 2.2227511405944824, + "learning_rate": 1.4919099141279214e-05, + "loss": 0.4382, + "step": 2498 + }, + { + "epoch": 1.0, + "step": 2498, + "total_flos": 1.4480561108156416e+16, + "train_loss": 0.34094005745587297, + "train_runtime": 11043.5453, + "train_samples_per_second": 1.81, + "train_steps_per_second": 0.226 + } + ], + "logging_steps": 2, + "max_steps": 2498, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1.4480561108156416e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc1c10bf2d19460df4396e663dbb7752a4a282c8 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09e7f2a97d2774c06024a3ff3d23a5d8157074095f3cb427e70870d1046523f0 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..ce50d08b1f4a1186d7ec3e0398cbf3f3c5ab9c46 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96a19f6b3f3fb4693d1bb5d1f5cb752d32c43904318d5d3214269d71bc2cec11 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..04930e2a6095f14199a887e53d63e346496f07f6 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d235e39c92b59620f8a2d9f8868055db79296cc1e4109af71586bfab684ace9a +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a1f7a20ae2a69f6d6d8e50fb52adf47ef9d642d9 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_125_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b59e1f08c4890187ca41838447402b2681e7aaee99b0f1296ca68db4fa6fe645 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..508476298c49525a2f56f4db805cbc93996e4d91 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/0_trainer_state.json @@ -0,0 +1,17518 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4996, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008006405124099279, + "grad_norm": 7.80233097076416, + "learning_rate": 2.406842319175051e-06, + "loss": 0.2896, + "step": 2 + }, + { + "epoch": 0.0008006405124099279, + "grad_norm": 2.737185001373291, + "learning_rate": 2.415943612351265e-06, + "loss": 0.1428, + "step": 4 + }, + { + "epoch": 0.0016012810248198558, + "grad_norm": 4.819119453430176, + "learning_rate": 2.4250597173539104e-06, + "loss": 0.5131, + "step": 6 + }, + { + "epoch": 0.0016012810248198558, + "grad_norm": 7.364744663238525, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.2244, + "step": 8 + }, + { + "epoch": 0.0024019215372297837, + "grad_norm": 1.7665841579437256, + "learning_rate": 2.443336291593801e-06, + "loss": 0.1753, + "step": 10 + }, + { + "epoch": 0.0024019215372297837, + "grad_norm": 0.6727489233016968, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.0986, + "step": 12 + }, + { + "epoch": 0.0032025620496397116, + "grad_norm": 6.884824752807617, + "learning_rate": 2.461671899116598e-06, + "loss": 0.1976, + "step": 14 + }, + { + "epoch": 0.0032025620496397116, + "grad_norm": 0.8878536224365234, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.0441, + "step": 16 + }, + { + "epoch": 0.0040032025620496394, + "grad_norm": 7.774904251098633, + "learning_rate": 2.4800663966830417e-06, + "loss": 0.5132, + "step": 18 + }, + { + "epoch": 0.0040032025620496394, + "grad_norm": 3.9651920795440674, + "learning_rate": 2.4892856843445236e-06, + "loss": 0.2349, + "step": 20 + }, + { + "epoch": 0.004803843074459567, + "grad_norm": 3.5629847049713135, + "learning_rate": 2.4985196405937807e-06, + "loss": 0.2455, + "step": 22 + }, + { + "epoch": 0.004803843074459567, + "grad_norm": 3.0273921489715576, + "learning_rate": 2.507768247396697e-06, + "loss": 0.2231, + "step": 24 + }, + { + "epoch": 0.005604483586869495, + "grad_norm": 4.464300632476807, + "learning_rate": 2.5170314866905443e-06, + "loss": 0.4329, + "step": 26 + }, + { + "epoch": 0.005604483586869495, + "grad_norm": 0.27717480063438416, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.2421, + "step": 28 + }, + { + "epoch": 0.006405124099279423, + "grad_norm": 3.4895153045654297, + "learning_rate": 2.535601790357246e-06, + "loss": 0.4537, + "step": 30 + }, + { + "epoch": 0.006405124099279423, + "grad_norm": 2.2487993240356445, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.216, + "step": 32 + }, + { + "epoch": 0.007205764611689352, + "grad_norm": 1.3813062906265259, + "learning_rate": 2.5542304065211578e-06, + "loss": 0.0775, + "step": 34 + }, + { + "epoch": 0.007205764611689352, + "grad_norm": 1.5240283012390137, + "learning_rate": 2.5635665363297356e-06, + "loss": 0.0701, + "step": 36 + }, + { + "epoch": 0.008006405124099279, + "grad_norm": 6.977014541625977, + "learning_rate": 2.5729171896539763e-06, + "loss": 0.4372, + "step": 38 + }, + { + "epoch": 0.008006405124099279, + "grad_norm": 13.339950561523438, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.252, + "step": 40 + }, + { + "epoch": 0.008807045636509208, + "grad_norm": 2.152944803237915, + "learning_rate": 2.5916619937729915e-06, + "loss": 0.3822, + "step": 42 + }, + { + "epoch": 0.008807045636509208, + "grad_norm": 1.0419551134109497, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.038, + "step": 44 + }, + { + "epoch": 0.009607686148919135, + "grad_norm": 1.4586684703826904, + "learning_rate": 2.6104646724422643e-06, + "loss": 0.3316, + "step": 46 + }, + { + "epoch": 0.009607686148919135, + "grad_norm": 0.9437859654426575, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.0756, + "step": 48 + }, + { + "epoch": 0.010408326661329063, + "grad_norm": 1.872849941253662, + "learning_rate": 2.629325078773699e-06, + "loss": 0.4551, + "step": 50 + }, + { + "epoch": 0.010408326661329063, + "grad_norm": 7.864923000335693, + "learning_rate": 2.6387768837868565e-06, + "loss": 0.166, + "step": 52 + }, + { + "epoch": 0.01120896717373899, + "grad_norm": 3.748821258544922, + "learning_rate": 2.648243065428239e-06, + "loss": 0.4581, + "step": 54 + }, + { + "epoch": 0.01120896717373899, + "grad_norm": 1.4499348402023315, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.1506, + "step": 56 + }, + { + "epoch": 0.01200960768614892, + "grad_norm": 1.3827787637710571, + "learning_rate": 2.6672184846169934e-06, + "loss": 0.1052, + "step": 58 + }, + { + "epoch": 0.01200960768614892, + "grad_norm": 0.7270983457565308, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.0705, + "step": 60 + }, + { + "epoch": 0.012810248198558846, + "grad_norm": 1.9675313234329224, + "learning_rate": 2.686251188102439e-06, + "loss": 0.1947, + "step": 62 + }, + { + "epoch": 0.012810248198558846, + "grad_norm": 1.6819177865982056, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.1189, + "step": 64 + }, + { + "epoch": 0.013610888710968775, + "grad_norm": 8.813508987426758, + "learning_rate": 2.7053410271995085e-06, + "loss": 0.2324, + "step": 66 + }, + { + "epoch": 0.013610888710968775, + "grad_norm": 4.427382469177246, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.1755, + "step": 68 + }, + { + "epoch": 0.014411529223378704, + "grad_norm": 6.045440673828125, + "learning_rate": 2.724487852776785e-06, + "loss": 0.2501, + "step": 70 + }, + { + "epoch": 0.014411529223378704, + "grad_norm": 3.800697088241577, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.1433, + "step": 72 + }, + { + "epoch": 0.01521216973578863, + "grad_norm": 4.230377674102783, + "learning_rate": 2.7436915152577038e-06, + "loss": 0.2456, + "step": 74 + }, + { + "epoch": 0.01521216973578863, + "grad_norm": 13.001970291137695, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.3257, + "step": 76 + }, + { + "epoch": 0.016012810248198558, + "grad_norm": 6.945789813995361, + "learning_rate": 2.7629518646216522e-06, + "loss": 0.5076, + "step": 78 + }, + { + "epoch": 0.016012810248198558, + "grad_norm": 9.0550537109375, + "learning_rate": 2.772603249882202e-06, + "loss": 0.3656, + "step": 80 + }, + { + "epoch": 0.016813450760608487, + "grad_norm": 8.364053726196289, + "learning_rate": 2.782268750405185e-06, + "loss": 0.1951, + "step": 82 + }, + { + "epoch": 0.016813450760608487, + "grad_norm": 2.4625084400177, + "learning_rate": 2.7919483473136555e-06, + "loss": 0.304, + "step": 84 + }, + { + "epoch": 0.017614091273018415, + "grad_norm": 2.313206672668457, + "learning_rate": 2.801642021703177e-06, + "loss": 0.105, + "step": 86 + }, + { + "epoch": 0.017614091273018415, + "grad_norm": 1.3618698120117188, + "learning_rate": 2.81134975464178e-06, + "loss": 0.1546, + "step": 88 + }, + { + "epoch": 0.018414731785428344, + "grad_norm": 1.3984222412109375, + "learning_rate": 2.821071527170053e-06, + "loss": 0.1359, + "step": 90 + }, + { + "epoch": 0.018414731785428344, + "grad_norm": 2.8718864917755127, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.1434, + "step": 92 + }, + { + "epoch": 0.01921537229783827, + "grad_norm": 10.02181625366211, + "learning_rate": 2.8405571150208945e-06, + "loss": 0.4913, + "step": 94 + }, + { + "epoch": 0.01921537229783827, + "grad_norm": 3.6782588958740234, + "learning_rate": 2.850320892287688e-06, + "loss": 0.2264, + "step": 96 + }, + { + "epoch": 0.020016012810248198, + "grad_norm": 0.961725652217865, + "learning_rate": 2.860098633032663e-06, + "loss": 0.3605, + "step": 98 + }, + { + "epoch": 0.020016012810248198, + "grad_norm": 4.157869338989258, + "learning_rate": 2.8698903181597026e-06, + "loss": 0.166, + "step": 100 + }, + { + "epoch": 0.020816653322658127, + "grad_norm": 0.14601141214370728, + "learning_rate": 2.879695928545424e-06, + "loss": 0.1363, + "step": 102 + }, + { + "epoch": 0.020816653322658127, + "grad_norm": 1.916751742362976, + "learning_rate": 2.889515445039256e-06, + "loss": 0.1328, + "step": 104 + }, + { + "epoch": 0.021617293835068056, + "grad_norm": 0.386229932308197, + "learning_rate": 2.899348848463471e-06, + "loss": 0.1541, + "step": 106 + }, + { + "epoch": 0.021617293835068056, + "grad_norm": 0.38670891523361206, + "learning_rate": 2.909196119613218e-06, + "loss": 0.0455, + "step": 108 + }, + { + "epoch": 0.02241793434747798, + "grad_norm": 2.9763340950012207, + "learning_rate": 2.9190572392565643e-06, + "loss": 0.085, + "step": 110 + }, + { + "epoch": 0.02241793434747798, + "grad_norm": 3.0374221801757812, + "learning_rate": 2.928932188134529e-06, + "loss": 0.3058, + "step": 112 + }, + { + "epoch": 0.02321857485988791, + "grad_norm": 2.5094945430755615, + "learning_rate": 2.9388209469611093e-06, + "loss": 0.1579, + "step": 114 + }, + { + "epoch": 0.02321857485988791, + "grad_norm": 2.194896936416626, + "learning_rate": 2.9487234964233724e-06, + "loss": 0.1776, + "step": 116 + }, + { + "epoch": 0.02401921537229784, + "grad_norm": 0.38413166999816895, + "learning_rate": 2.9586398171814114e-06, + "loss": 0.1012, + "step": 118 + }, + { + "epoch": 0.02401921537229784, + "grad_norm": 4.909361839294434, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.0722, + "step": 120 + }, + { + "epoch": 0.024819855884707767, + "grad_norm": 0.09988866001367569, + "learning_rate": 2.9785136950907987e-06, + "loss": 0.0902, + "step": 122 + }, + { + "epoch": 0.024819855884707767, + "grad_norm": 0.6805497407913208, + "learning_rate": 2.988471213428035e-06, + "loss": 0.0291, + "step": 124 + }, + { + "epoch": 0.025620496397117692, + "grad_norm": 0.35814595222473145, + "learning_rate": 2.9984424254328936e-06, + "loss": 0.2921, + "step": 126 + }, + { + "epoch": 0.025620496397117692, + "grad_norm": 3.0216400623321533, + "learning_rate": 3.00842731163137e-06, + "loss": 0.3345, + "step": 128 + }, + { + "epoch": 0.02642113690952762, + "grad_norm": 1.2282987833023071, + "learning_rate": 3.0184258525227895e-06, + "loss": 0.6752, + "step": 130 + }, + { + "epoch": 0.02642113690952762, + "grad_norm": 2.491196393966675, + "learning_rate": 3.0284380285797733e-06, + "loss": 0.1257, + "step": 132 + }, + { + "epoch": 0.02722177742193755, + "grad_norm": 4.013164520263672, + "learning_rate": 3.038463820248324e-06, + "loss": 0.3806, + "step": 134 + }, + { + "epoch": 0.02722177742193755, + "grad_norm": 2.25406551361084, + "learning_rate": 3.048503207947854e-06, + "loss": 0.1458, + "step": 136 + }, + { + "epoch": 0.02802241793434748, + "grad_norm": 3.8059892654418945, + "learning_rate": 3.0585561720712207e-06, + "loss": 0.3713, + "step": 138 + }, + { + "epoch": 0.02802241793434748, + "grad_norm": 0.3889507055282593, + "learning_rate": 3.068622692984767e-06, + "loss": 0.0377, + "step": 140 + }, + { + "epoch": 0.028823058446757407, + "grad_norm": 5.243758201599121, + "learning_rate": 3.0787027510283495e-06, + "loss": 0.4913, + "step": 142 + }, + { + "epoch": 0.028823058446757407, + "grad_norm": 3.231865644454956, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.4584, + "step": 144 + }, + { + "epoch": 0.029623698959167333, + "grad_norm": 4.858358860015869, + "learning_rate": 3.098903399732992e-06, + "loss": 0.5655, + "step": 146 + }, + { + "epoch": 0.029623698959167333, + "grad_norm": 0.6040743589401245, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.1925, + "step": 148 + }, + { + "epoch": 0.03042433947157726, + "grad_norm": 2.1857290267944336, + "learning_rate": 3.1191579603759946e-06, + "loss": 0.1479, + "step": 150 + }, + { + "epoch": 0.03042433947157726, + "grad_norm": 2.9908463954925537, + "learning_rate": 3.129305408243829e-06, + "loss": 0.1248, + "step": 152 + }, + { + "epoch": 0.03122497998398719, + "grad_norm": 2.4863839149475098, + "learning_rate": 3.139466274727052e-06, + "loss": 0.235, + "step": 154 + }, + { + "epoch": 0.03122497998398719, + "grad_norm": 1.3782933950424194, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.2402, + "step": 156 + }, + { + "epoch": 0.032025620496397116, + "grad_norm": 2.621680498123169, + "learning_rate": 3.159828184135917e-06, + "loss": 0.3207, + "step": 158 + }, + { + "epoch": 0.032025620496397116, + "grad_norm": 2.1933043003082275, + "learning_rate": 3.17002918729432e-06, + "loss": 0.1495, + "step": 160 + }, + { + "epoch": 0.03282626100880705, + "grad_norm": 1.0111573934555054, + "learning_rate": 3.1802435295336908e-06, + "loss": 0.1088, + "step": 162 + }, + { + "epoch": 0.03282626100880705, + "grad_norm": 3.774712562561035, + "learning_rate": 3.1904711909051967e-06, + "loss": 0.1792, + "step": 164 + }, + { + "epoch": 0.03362690152121697, + "grad_norm": 6.037362575531006, + "learning_rate": 3.2007121514339924e-06, + "loss": 0.5076, + "step": 166 + }, + { + "epoch": 0.03362690152121697, + "grad_norm": 3.338298797607422, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.2926, + "step": 168 + }, + { + "epoch": 0.0344275420336269, + "grad_norm": 0.3564451336860657, + "learning_rate": 3.221233889934239e-06, + "loss": 0.4351, + "step": 170 + }, + { + "epoch": 0.0344275420336269, + "grad_norm": 2.6110756397247314, + "learning_rate": 3.231514627826302e-06, + "loss": 0.2575, + "step": 172 + }, + { + "epoch": 0.03522818254603683, + "grad_norm": 5.543386936187744, + "learning_rate": 3.2418085847169344e-06, + "loss": 0.2186, + "step": 174 + }, + { + "epoch": 0.03522818254603683, + "grad_norm": 4.181483268737793, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.2351, + "step": 176 + }, + { + "epoch": 0.036028823058446756, + "grad_norm": 3.3522531986236572, + "learning_rate": 3.2624360750508457e-06, + "loss": 0.2813, + "step": 178 + }, + { + "epoch": 0.036028823058446756, + "grad_norm": 3.137221097946167, + "learning_rate": 3.2727695682081897e-06, + "loss": 0.1642, + "step": 180 + }, + { + "epoch": 0.03682946357085669, + "grad_norm": 5.348705768585205, + "learning_rate": 3.28311619979231e-06, + "loss": 0.5282, + "step": 182 + }, + { + "epoch": 0.03682946357085669, + "grad_norm": 3.829209327697754, + "learning_rate": 3.293475949595998e-06, + "loss": 0.3967, + "step": 184 + }, + { + "epoch": 0.03763010408326661, + "grad_norm": 3.3031723499298096, + "learning_rate": 3.303848797386465e-06, + "loss": 0.2631, + "step": 186 + }, + { + "epoch": 0.03763010408326661, + "grad_norm": 3.7475264072418213, + "learning_rate": 3.314234722905302e-06, + "loss": 0.4508, + "step": 188 + }, + { + "epoch": 0.03843074459567654, + "grad_norm": 2.758577585220337, + "learning_rate": 3.3246337058685697e-06, + "loss": 0.3271, + "step": 190 + }, + { + "epoch": 0.03843074459567654, + "grad_norm": 9.843360900878906, + "learning_rate": 3.335045725966829e-06, + "loss": 0.2703, + "step": 192 + }, + { + "epoch": 0.03923138510808647, + "grad_norm": 0.5049953460693359, + "learning_rate": 3.3454707628651806e-06, + "loss": 0.0742, + "step": 194 + }, + { + "epoch": 0.03923138510808647, + "grad_norm": 1.1400386095046997, + "learning_rate": 3.355908796203301e-06, + "loss": 0.0706, + "step": 196 + }, + { + "epoch": 0.040032025620496396, + "grad_norm": 1.0002307891845703, + "learning_rate": 3.3663598055954716e-06, + "loss": 0.2139, + "step": 198 + }, + { + "epoch": 0.040032025620496396, + "grad_norm": 1.704769253730774, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.0902, + "step": 200 + }, + { + "epoch": 0.04083266613290633, + "grad_norm": 6.201662540435791, + "learning_rate": 3.3873006708725365e-06, + "loss": 0.5102, + "step": 202 + }, + { + "epoch": 0.04083266613290633, + "grad_norm": 1.1764291524887085, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.1941, + "step": 204 + }, + { + "epoch": 0.041633306645316254, + "grad_norm": 3.1756532192230225, + "learning_rate": 3.408293195104586e-06, + "loss": 0.1844, + "step": 206 + }, + { + "epoch": 0.041633306645316254, + "grad_norm": 3.113288402557373, + "learning_rate": 3.418808778095917e-06, + "loss": 0.2461, + "step": 208 + }, + { + "epoch": 0.04243394715772618, + "grad_norm": 9.711479187011719, + "learning_rate": 3.4293372142962845e-06, + "loss": 0.3796, + "step": 210 + }, + { + "epoch": 0.04243394715772618, + "grad_norm": 2.1500513553619385, + "learning_rate": 3.4398784831434097e-06, + "loss": 0.1691, + "step": 212 + }, + { + "epoch": 0.04323458767013611, + "grad_norm": 3.55709171295166, + "learning_rate": 3.4504325640499936e-06, + "loss": 0.4393, + "step": 214 + }, + { + "epoch": 0.04323458767013611, + "grad_norm": 0.9907680153846741, + "learning_rate": 3.460999436403676e-06, + "loss": 0.0655, + "step": 216 + }, + { + "epoch": 0.044035228182546036, + "grad_norm": 0.0016694199293851852, + "learning_rate": 3.4715790795671232e-06, + "loss": 0.1897, + "step": 218 + }, + { + "epoch": 0.044035228182546036, + "grad_norm": 4.552898406982422, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.2185, + "step": 220 + }, + { + "epoch": 0.04483586869495596, + "grad_norm": 2.558058977127075, + "learning_rate": 3.4927765956493276e-06, + "loss": 0.134, + "step": 222 + }, + { + "epoch": 0.04483586869495596, + "grad_norm": 3.040621280670166, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.1859, + "step": 224 + }, + { + "epoch": 0.045636509207365894, + "grad_norm": 11.246976852416992, + "learning_rate": 3.514024946699842e-06, + "loss": 0.3341, + "step": 226 + }, + { + "epoch": 0.045636509207365894, + "grad_norm": 1.6681205034255981, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.1785, + "step": 228 + }, + { + "epoch": 0.04643714971977582, + "grad_norm": 0.5771212577819824, + "learning_rate": 3.535323966724814e-06, + "loss": 0.2073, + "step": 230 + }, + { + "epoch": 0.04643714971977582, + "grad_norm": 1.4882224798202515, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.0574, + "step": 232 + }, + { + "epoch": 0.04723779023218575, + "grad_norm": 1.4970464706420898, + "learning_rate": 3.556673489334522e-06, + "loss": 0.2701, + "step": 234 + }, + { + "epoch": 0.04723779023218575, + "grad_norm": 1.0679023265838623, + "learning_rate": 3.567367137003953e-06, + "loss": 0.0675, + "step": 236 + }, + { + "epoch": 0.04803843074459568, + "grad_norm": 4.779284954071045, + "learning_rate": 3.5780733477447127e-06, + "loss": 0.685, + "step": 238 + }, + { + "epoch": 0.04803843074459568, + "grad_norm": 4.353030681610107, + "learning_rate": 3.588792100647368e-06, + "loss": 0.3437, + "step": 240 + }, + { + "epoch": 0.0488390712570056, + "grad_norm": 2.0363659858703613, + "learning_rate": 3.5995233747779467e-06, + "loss": 0.0576, + "step": 242 + }, + { + "epoch": 0.0488390712570056, + "grad_norm": 5.35968017578125, + "learning_rate": 3.6102671491780393e-06, + "loss": 0.2609, + "step": 244 + }, + { + "epoch": 0.049639711769415534, + "grad_norm": 4.360085964202881, + "learning_rate": 3.6210234028648216e-06, + "loss": 0.2499, + "step": 246 + }, + { + "epoch": 0.049639711769415534, + "grad_norm": 4.029691696166992, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.1418, + "step": 248 + }, + { + "epoch": 0.05044035228182546, + "grad_norm": 3.182138204574585, + "learning_rate": 3.6425732640453235e-06, + "loss": 0.3934, + "step": 250 + }, + { + "epoch": 0.05044035228182546, + "grad_norm": 0.958607017993927, + "learning_rate": 3.653366829451711e-06, + "loss": 0.0233, + "step": 252 + }, + { + "epoch": 0.051240992794235385, + "grad_norm": 4.669559478759766, + "learning_rate": 3.6641727899701795e-06, + "loss": 0.2187, + "step": 254 + }, + { + "epoch": 0.051240992794235385, + "grad_norm": 5.583956241607666, + "learning_rate": 3.674991124496452e-06, + "loss": 0.229, + "step": 256 + }, + { + "epoch": 0.05204163330664532, + "grad_norm": 0.8557757139205933, + "learning_rate": 3.6858218119020884e-06, + "loss": 0.109, + "step": 258 + }, + { + "epoch": 0.05204163330664532, + "grad_norm": 0.9255304932594299, + "learning_rate": 3.696664831034521e-06, + "loss": 0.0201, + "step": 260 + }, + { + "epoch": 0.05284227381905524, + "grad_norm": 0.49434441328048706, + "learning_rate": 3.7075201607170997e-06, + "loss": 0.1875, + "step": 262 + }, + { + "epoch": 0.05284227381905524, + "grad_norm": 0.0047366018407046795, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.0213, + "step": 264 + }, + { + "epoch": 0.053642914331465175, + "grad_norm": 3.8905997276306152, + "learning_rate": 3.729267666905899e-06, + "loss": 0.2159, + "step": 266 + }, + { + "epoch": 0.053642914331465175, + "grad_norm": 3.2119369506835938, + "learning_rate": 3.740159800938784e-06, + "loss": 0.1354, + "step": 268 + }, + { + "epoch": 0.0544435548438751, + "grad_norm": 3.109204053878784, + "learning_rate": 3.751064160575195e-06, + "loss": 0.1056, + "step": 270 + }, + { + "epoch": 0.0544435548438751, + "grad_norm": 0.8103315830230713, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.0213, + "step": 272 + }, + { + "epoch": 0.055244195356285025, + "grad_norm": 2.416623115539551, + "learning_rate": 3.772909471448959e-06, + "loss": 0.639, + "step": 274 + }, + { + "epoch": 0.055244195356285025, + "grad_norm": 1.0840078592300415, + "learning_rate": 3.783850380021933e-06, + "loss": 0.0377, + "step": 276 + }, + { + "epoch": 0.05604483586869496, + "grad_norm": 4.788816928863525, + "learning_rate": 3.794803428869799e-06, + "loss": 0.6006, + "step": 278 + }, + { + "epoch": 0.05604483586869496, + "grad_norm": 1.0210392475128174, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.2236, + "step": 280 + }, + { + "epoch": 0.05684547638110488, + "grad_norm": 8.340980529785156, + "learning_rate": 3.816745861800334e-06, + "loss": 0.1798, + "step": 282 + }, + { + "epoch": 0.05684547638110488, + "grad_norm": 1.4198607206344604, + "learning_rate": 3.827735203028956e-06, + "loss": 0.062, + "step": 284 + }, + { + "epoch": 0.057646116893514815, + "grad_norm": 3.2383360862731934, + "learning_rate": 3.838736598824446e-06, + "loss": 0.1045, + "step": 286 + }, + { + "epoch": 0.057646116893514815, + "grad_norm": 0.5152770280838013, + "learning_rate": 3.849750027700842e-06, + "loss": 0.0629, + "step": 288 + }, + { + "epoch": 0.05844675740592474, + "grad_norm": 0.8122913837432861, + "learning_rate": 3.860775468148662e-06, + "loss": 0.2958, + "step": 290 + }, + { + "epoch": 0.05844675740592474, + "grad_norm": 0.5061805844306946, + "learning_rate": 3.871812898635011e-06, + "loss": 0.0704, + "step": 292 + }, + { + "epoch": 0.059247397918334666, + "grad_norm": 15.378660202026367, + "learning_rate": 3.882862297603536e-06, + "loss": 0.8723, + "step": 294 + }, + { + "epoch": 0.059247397918334666, + "grad_norm": 0.9841398000717163, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.0604, + "step": 296 + }, + { + "epoch": 0.0600480384307446, + "grad_norm": 1.0767229795455933, + "learning_rate": 3.904996914644913e-06, + "loss": 0.0851, + "step": 298 + }, + { + "epoch": 0.0600480384307446, + "grad_norm": 1.4478883743286133, + "learning_rate": 3.916082089488379e-06, + "loss": 0.1477, + "step": 300 + }, + { + "epoch": 0.06084867894315452, + "grad_norm": 2.182016372680664, + "learning_rate": 3.927179146355317e-06, + "loss": 0.261, + "step": 302 + }, + { + "epoch": 0.06084867894315452, + "grad_norm": 1.4225610494613647, + "learning_rate": 3.938288063572962e-06, + "loss": 0.0477, + "step": 304 + }, + { + "epoch": 0.06164931945556445, + "grad_norm": 2.983097791671753, + "learning_rate": 3.949408819445345e-06, + "loss": 0.3181, + "step": 306 + }, + { + "epoch": 0.06164931945556445, + "grad_norm": 2.370604991912842, + "learning_rate": 3.960541392253387e-06, + "loss": 0.1255, + "step": 308 + }, + { + "epoch": 0.06244995996797438, + "grad_norm": 0.704069197177887, + "learning_rate": 3.971685760254933e-06, + "loss": 0.0223, + "step": 310 + }, + { + "epoch": 0.06244995996797438, + "grad_norm": 0.35000357031822205, + "learning_rate": 3.982841901684792e-06, + "loss": 0.046, + "step": 312 + }, + { + "epoch": 0.0632506004803843, + "grad_norm": 0.21540145576000214, + "learning_rate": 3.994009794754777e-06, + "loss": 0.1235, + "step": 314 + }, + { + "epoch": 0.0632506004803843, + "grad_norm": 3.3525726795196533, + "learning_rate": 4.005189417653737e-06, + "loss": 0.158, + "step": 316 + }, + { + "epoch": 0.06405124099279423, + "grad_norm": 3.1712722778320312, + "learning_rate": 4.016380748547654e-06, + "loss": 0.2699, + "step": 318 + }, + { + "epoch": 0.06405124099279423, + "grad_norm": 0.3713848888874054, + "learning_rate": 4.027583765579601e-06, + "loss": 0.0896, + "step": 320 + }, + { + "epoch": 0.06485188150520416, + "grad_norm": 3.6907074451446533, + "learning_rate": 4.038798446869847e-06, + "loss": 0.2111, + "step": 322 + }, + { + "epoch": 0.06485188150520416, + "grad_norm": 1.9553935527801514, + "learning_rate": 4.050024770515873e-06, + "loss": 0.0636, + "step": 324 + }, + { + "epoch": 0.0656525220176141, + "grad_norm": 3.838704824447632, + "learning_rate": 4.061262714592426e-06, + "loss": 0.215, + "step": 326 + }, + { + "epoch": 0.0656525220176141, + "grad_norm": 0.7191022038459778, + "learning_rate": 4.072512257151546e-06, + "loss": 0.0954, + "step": 328 + }, + { + "epoch": 0.06645316253002402, + "grad_norm": 0.0013650758191943169, + "learning_rate": 4.0837733762226584e-06, + "loss": 0.0253, + "step": 330 + }, + { + "epoch": 0.06645316253002402, + "grad_norm": 2.2045044898986816, + "learning_rate": 4.095046049812541e-06, + "loss": 0.1508, + "step": 332 + }, + { + "epoch": 0.06725380304243395, + "grad_norm": 5.439484119415283, + "learning_rate": 4.106330255905417e-06, + "loss": 0.4173, + "step": 334 + }, + { + "epoch": 0.06725380304243395, + "grad_norm": 1.9524422883987427, + "learning_rate": 4.117625972462988e-06, + "loss": 0.0328, + "step": 336 + }, + { + "epoch": 0.06805444355484387, + "grad_norm": 3.610140800476074, + "learning_rate": 4.128933177424475e-06, + "loss": 0.1338, + "step": 338 + }, + { + "epoch": 0.06805444355484387, + "grad_norm": 0.6187162399291992, + "learning_rate": 4.1402518487066624e-06, + "loss": 0.0414, + "step": 340 + }, + { + "epoch": 0.0688550840672538, + "grad_norm": 0.10598500818014145, + "learning_rate": 4.151581964203924e-06, + "loss": 0.1605, + "step": 342 + }, + { + "epoch": 0.0688550840672538, + "grad_norm": 2.9833741188049316, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.0695, + "step": 344 + }, + { + "epoch": 0.06965572457966374, + "grad_norm": 0.3621214032173157, + "learning_rate": 4.174276439309593e-06, + "loss": 0.2941, + "step": 346 + }, + { + "epoch": 0.06965572457966374, + "grad_norm": 0.1364554464817047, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.0041, + "step": 348 + }, + { + "epoch": 0.07045636509207366, + "grad_norm": 3.573472738265991, + "learning_rate": 4.197016425450347e-06, + "loss": 0.4392, + "step": 350 + }, + { + "epoch": 0.07045636509207366, + "grad_norm": 0.03819806128740311, + "learning_rate": 4.208403429658151e-06, + "loss": 0.0367, + "step": 352 + }, + { + "epoch": 0.07125700560448359, + "grad_norm": 4.656810760498047, + "learning_rate": 4.219801744979517e-06, + "loss": 0.1947, + "step": 354 + }, + { + "epoch": 0.07125700560448359, + "grad_norm": 3.929865598678589, + "learning_rate": 4.2312113491533145e-06, + "loss": 0.1714, + "step": 356 + }, + { + "epoch": 0.07205764611689351, + "grad_norm": 2.81821870803833, + "learning_rate": 4.242632219896328e-06, + "loss": 0.0289, + "step": 358 + }, + { + "epoch": 0.07205764611689351, + "grad_norm": 3.1115880012512207, + "learning_rate": 4.254064334903347e-06, + "loss": 0.0789, + "step": 360 + }, + { + "epoch": 0.07285828662930344, + "grad_norm": 1.952222228050232, + "learning_rate": 4.2655076718472045e-06, + "loss": 0.2546, + "step": 362 + }, + { + "epoch": 0.07285828662930344, + "grad_norm": 1.7833361625671387, + "learning_rate": 4.276962208378814e-06, + "loss": 0.1435, + "step": 364 + }, + { + "epoch": 0.07365892714171338, + "grad_norm": 5.8015899658203125, + "learning_rate": 4.28842792212722e-06, + "loss": 0.4115, + "step": 366 + }, + { + "epoch": 0.07365892714171338, + "grad_norm": 3.002747058868408, + "learning_rate": 4.299904790699619e-06, + "loss": 0.3407, + "step": 368 + }, + { + "epoch": 0.0744595676541233, + "grad_norm": 3.9715540409088135, + "learning_rate": 4.3113927916814665e-06, + "loss": 0.55, + "step": 370 + }, + { + "epoch": 0.0744595676541233, + "grad_norm": 3.350931406021118, + "learning_rate": 4.3228919026364345e-06, + "loss": 0.4648, + "step": 372 + }, + { + "epoch": 0.07526020816653323, + "grad_norm": 6.578980445861816, + "learning_rate": 4.33440210110651e-06, + "loss": 0.2615, + "step": 374 + }, + { + "epoch": 0.07526020816653323, + "grad_norm": 3.5558462142944336, + "learning_rate": 4.345923364612024e-06, + "loss": 0.1022, + "step": 376 + }, + { + "epoch": 0.07606084867894315, + "grad_norm": 4.011159896850586, + "learning_rate": 4.3574556706517035e-06, + "loss": 0.1939, + "step": 378 + }, + { + "epoch": 0.07606084867894315, + "grad_norm": 0.7525694966316223, + "learning_rate": 4.368998996702686e-06, + "loss": 0.059, + "step": 380 + }, + { + "epoch": 0.07686148919135308, + "grad_norm": 4.68489408493042, + "learning_rate": 4.380553320220638e-06, + "loss": 0.2316, + "step": 382 + }, + { + "epoch": 0.07686148919135308, + "grad_norm": 5.436942100524902, + "learning_rate": 4.392118618639698e-06, + "loss": 0.1868, + "step": 384 + }, + { + "epoch": 0.07766212970376302, + "grad_norm": 3.324280261993408, + "learning_rate": 4.403694869372589e-06, + "loss": 0.142, + "step": 386 + }, + { + "epoch": 0.07766212970376302, + "grad_norm": 0.2706053555011749, + "learning_rate": 4.415282049810643e-06, + "loss": 0.0066, + "step": 388 + }, + { + "epoch": 0.07846277021617294, + "grad_norm": 0.00690037664026022, + "learning_rate": 4.4268801373238454e-06, + "loss": 0.0785, + "step": 390 + }, + { + "epoch": 0.07846277021617294, + "grad_norm": 0.3114011585712433, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.0552, + "step": 392 + }, + { + "epoch": 0.07926341072858287, + "grad_norm": 8.371416091918945, + "learning_rate": 4.450108942949158e-06, + "loss": 1.1418, + "step": 394 + }, + { + "epoch": 0.07926341072858287, + "grad_norm": 1.8441888093948364, + "learning_rate": 4.461739615694921e-06, + "loss": 0.1222, + "step": 396 + }, + { + "epoch": 0.08006405124099279, + "grad_norm": 2.0632243156433105, + "learning_rate": 4.473381104783201e-06, + "loss": 0.0742, + "step": 398 + }, + { + "epoch": 0.08006405124099279, + "grad_norm": 2.144061326980591, + "learning_rate": 4.485033387477915e-06, + "loss": 0.081, + "step": 400 + }, + { + "epoch": 0.08086469175340272, + "grad_norm": 8.206186294555664, + "learning_rate": 4.496696441021904e-06, + "loss": 0.6784, + "step": 402 + }, + { + "epoch": 0.08086469175340272, + "grad_norm": 1.6857935190200806, + "learning_rate": 4.5083702426369715e-06, + "loss": 0.1904, + "step": 404 + }, + { + "epoch": 0.08166533226581266, + "grad_norm": 3.4324777126312256, + "learning_rate": 4.520054769523929e-06, + "loss": 0.4319, + "step": 406 + }, + { + "epoch": 0.08166533226581266, + "grad_norm": 1.3632526397705078, + "learning_rate": 4.531749998862628e-06, + "loss": 0.1527, + "step": 408 + }, + { + "epoch": 0.08246597277822258, + "grad_norm": 0.7287818193435669, + "learning_rate": 4.543455907812063e-06, + "loss": 0.3408, + "step": 410 + }, + { + "epoch": 0.08246597277822258, + "grad_norm": 0.004486845340579748, + "learning_rate": 4.555172473510324e-06, + "loss": 0.0453, + "step": 412 + }, + { + "epoch": 0.08326661329063251, + "grad_norm": 2.2924399375915527, + "learning_rate": 4.566899673074706e-06, + "loss": 0.507, + "step": 414 + }, + { + "epoch": 0.08326661329063251, + "grad_norm": 0.49359390139579773, + "learning_rate": 4.578637483601732e-06, + "loss": 0.1031, + "step": 416 + }, + { + "epoch": 0.08406725380304243, + "grad_norm": 2.275036573410034, + "learning_rate": 4.590385882167206e-06, + "loss": 0.5959, + "step": 418 + }, + { + "epoch": 0.08406725380304243, + "grad_norm": 1.7156637907028198, + "learning_rate": 4.602144845826234e-06, + "loss": 0.0454, + "step": 420 + }, + { + "epoch": 0.08486789431545236, + "grad_norm": 1.9523868560791016, + "learning_rate": 4.613914351613337e-06, + "loss": 0.182, + "step": 422 + }, + { + "epoch": 0.08486789431545236, + "grad_norm": 1.2702696323394775, + "learning_rate": 4.625694376542399e-06, + "loss": 0.0989, + "step": 424 + }, + { + "epoch": 0.08566853482786228, + "grad_norm": 6.898612976074219, + "learning_rate": 4.637484897606777e-06, + "loss": 0.4022, + "step": 426 + }, + { + "epoch": 0.08566853482786228, + "grad_norm": 0.8967157602310181, + "learning_rate": 4.649285891779326e-06, + "loss": 0.1672, + "step": 428 + }, + { + "epoch": 0.08646917534027222, + "grad_norm": 3.0510785579681396, + "learning_rate": 4.661097336012451e-06, + "loss": 0.2349, + "step": 430 + }, + { + "epoch": 0.08646917534027222, + "grad_norm": 0.7351176142692566, + "learning_rate": 4.672919207238145e-06, + "loss": 0.0625, + "step": 432 + }, + { + "epoch": 0.08726981585268215, + "grad_norm": 4.772363185882568, + "learning_rate": 4.684751482368022e-06, + "loss": 0.5077, + "step": 434 + }, + { + "epoch": 0.08726981585268215, + "grad_norm": 0.27374622225761414, + "learning_rate": 4.696594138293421e-06, + "loss": 0.0651, + "step": 436 + }, + { + "epoch": 0.08807045636509207, + "grad_norm": 4.84055757522583, + "learning_rate": 4.7084471518853656e-06, + "loss": 0.4405, + "step": 438 + }, + { + "epoch": 0.08807045636509207, + "grad_norm": 0.15434280037879944, + "learning_rate": 4.720310499994664e-06, + "loss": 0.2465, + "step": 440 + }, + { + "epoch": 0.088871096877502, + "grad_norm": 0.6090686917304993, + "learning_rate": 4.732184159451937e-06, + "loss": 0.3461, + "step": 442 + }, + { + "epoch": 0.088871096877502, + "grad_norm": 2.2852623462677, + "learning_rate": 4.744068107067673e-06, + "loss": 0.1187, + "step": 444 + }, + { + "epoch": 0.08967173738991192, + "grad_norm": 2.948848247528076, + "learning_rate": 4.755962319632249e-06, + "loss": 0.1004, + "step": 446 + }, + { + "epoch": 0.08967173738991192, + "grad_norm": 0.6760413646697998, + "learning_rate": 4.767866773916041e-06, + "loss": 0.0213, + "step": 448 + }, + { + "epoch": 0.09047237790232186, + "grad_norm": 1.8621052503585815, + "learning_rate": 4.779781446669376e-06, + "loss": 0.3018, + "step": 450 + }, + { + "epoch": 0.09047237790232186, + "grad_norm": 9.793964385986328, + "learning_rate": 4.79170631462264e-06, + "loss": 0.1144, + "step": 452 + }, + { + "epoch": 0.09127301841473179, + "grad_norm": 4.344635963439941, + "learning_rate": 4.8036413544863095e-06, + "loss": 0.3749, + "step": 454 + }, + { + "epoch": 0.09127301841473179, + "grad_norm": 2.6749088764190674, + "learning_rate": 4.81558654295099e-06, + "loss": 0.3496, + "step": 456 + }, + { + "epoch": 0.09207365892714171, + "grad_norm": 2.9184677600860596, + "learning_rate": 4.827541856687471e-06, + "loss": 0.3398, + "step": 458 + }, + { + "epoch": 0.09207365892714171, + "grad_norm": 2.0620532035827637, + "learning_rate": 4.839507272346751e-06, + "loss": 0.0881, + "step": 460 + }, + { + "epoch": 0.09287429943955164, + "grad_norm": 3.8525278568267822, + "learning_rate": 4.8514827665601425e-06, + "loss": 0.3378, + "step": 462 + }, + { + "epoch": 0.09287429943955164, + "grad_norm": 1.8385846614837646, + "learning_rate": 4.863468315939234e-06, + "loss": 0.0584, + "step": 464 + }, + { + "epoch": 0.09367493995196156, + "grad_norm": 7.008689880371094, + "learning_rate": 4.875463897075985e-06, + "loss": 0.2859, + "step": 466 + }, + { + "epoch": 0.09367493995196156, + "grad_norm": 0.12091206759214401, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.0731, + "step": 468 + }, + { + "epoch": 0.0944755804643715, + "grad_norm": 3.547574043273926, + "learning_rate": 4.899485060892404e-06, + "loss": 0.2499, + "step": 470 + }, + { + "epoch": 0.0944755804643715, + "grad_norm": 4.135746479034424, + "learning_rate": 4.911510596658202e-06, + "loss": 0.2317, + "step": 472 + }, + { + "epoch": 0.09527622097678143, + "grad_norm": 7.44768762588501, + "learning_rate": 4.9235460703540615e-06, + "loss": 0.3947, + "step": 474 + }, + { + "epoch": 0.09527622097678143, + "grad_norm": 2.8448238372802734, + "learning_rate": 4.935591458474425e-06, + "loss": 0.0571, + "step": 476 + }, + { + "epoch": 0.09607686148919135, + "grad_norm": 7.297703742980957, + "learning_rate": 4.947646737494389e-06, + "loss": 0.3168, + "step": 478 + }, + { + "epoch": 0.09607686148919135, + "grad_norm": 4.3133955001831055, + "learning_rate": 4.959711883869734e-06, + "loss": 0.1136, + "step": 480 + }, + { + "epoch": 0.09687750200160128, + "grad_norm": 19.307701110839844, + "learning_rate": 4.9717868740369645e-06, + "loss": 0.4686, + "step": 482 + }, + { + "epoch": 0.09687750200160128, + "grad_norm": 3.329108953475952, + "learning_rate": 4.9838716844133665e-06, + "loss": 0.0624, + "step": 484 + }, + { + "epoch": 0.0976781425140112, + "grad_norm": 3.526294469833374, + "learning_rate": 4.9959662913970254e-06, + "loss": 0.1892, + "step": 486 + }, + { + "epoch": 0.0976781425140112, + "grad_norm": 11.25652027130127, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.1676, + "step": 488 + }, + { + "epoch": 0.09847878302642114, + "grad_norm": 4.035782814025879, + "learning_rate": 5.02018480068299e-06, + "loss": 0.1669, + "step": 490 + }, + { + "epoch": 0.09847878302642114, + "grad_norm": 0.7984126806259155, + "learning_rate": 5.032308655686007e-06, + "loss": 0.0199, + "step": 492 + }, + { + "epoch": 0.09927942353883107, + "grad_norm": 7.922055721282959, + "learning_rate": 5.044442212697842e-06, + "loss": 0.3514, + "step": 494 + }, + { + "epoch": 0.09927942353883107, + "grad_norm": 0.6124506592750549, + "learning_rate": 5.056585448021398e-06, + "loss": 0.0249, + "step": 496 + }, + { + "epoch": 0.100080064051241, + "grad_norm": 2.0416152477264404, + "learning_rate": 5.068738337940655e-06, + "loss": 0.4598, + "step": 498 + }, + { + "epoch": 0.100080064051241, + "grad_norm": 3.7704429626464844, + "learning_rate": 5.080900858720789e-06, + "loss": 0.1714, + "step": 500 + }, + { + "epoch": 0.10088070456365092, + "grad_norm": 2.6705102920532227, + "learning_rate": 5.093072986608116e-06, + "loss": 0.3694, + "step": 502 + }, + { + "epoch": 0.10088070456365092, + "grad_norm": 0.7138646245002747, + "learning_rate": 5.105254697830208e-06, + "loss": 0.0348, + "step": 504 + }, + { + "epoch": 0.10168134507606084, + "grad_norm": 2.989302396774292, + "learning_rate": 5.1174459685959175e-06, + "loss": 0.6368, + "step": 506 + }, + { + "epoch": 0.10168134507606084, + "grad_norm": 0.6363722681999207, + "learning_rate": 5.129646775095432e-06, + "loss": 0.2314, + "step": 508 + }, + { + "epoch": 0.10248198558847077, + "grad_norm": 2.041794776916504, + "learning_rate": 5.141857093500307e-06, + "loss": 0.2319, + "step": 510 + }, + { + "epoch": 0.10248198558847077, + "grad_norm": 1.4195246696472168, + "learning_rate": 5.154076899963514e-06, + "loss": 0.0371, + "step": 512 + }, + { + "epoch": 0.10328262610088071, + "grad_norm": 1.662986397743225, + "learning_rate": 5.166306170619537e-06, + "loss": 0.1527, + "step": 514 + }, + { + "epoch": 0.10328262610088071, + "grad_norm": 0.5833873748779297, + "learning_rate": 5.178544881584328e-06, + "loss": 0.0541, + "step": 516 + }, + { + "epoch": 0.10408326661329063, + "grad_norm": 1.906090259552002, + "learning_rate": 5.190793008955421e-06, + "loss": 0.0663, + "step": 518 + }, + { + "epoch": 0.10408326661329063, + "grad_norm": 1.2730053663253784, + "learning_rate": 5.203050528811959e-06, + "loss": 0.0452, + "step": 520 + }, + { + "epoch": 0.10488390712570056, + "grad_norm": 0.12163811177015305, + "learning_rate": 5.215317417214739e-06, + "loss": 0.0055, + "step": 522 + }, + { + "epoch": 0.10488390712570056, + "grad_norm": 2.297805070877075, + "learning_rate": 5.227593650206246e-06, + "loss": 0.0991, + "step": 524 + }, + { + "epoch": 0.10568454763811048, + "grad_norm": 0.15003959834575653, + "learning_rate": 5.239879203810763e-06, + "loss": 0.0092, + "step": 526 + }, + { + "epoch": 0.10568454763811048, + "grad_norm": 0.20931299030780792, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.0546, + "step": 528 + }, + { + "epoch": 0.10648518815052041, + "grad_norm": 2.3057427406311035, + "learning_rate": 5.264478176864811e-06, + "loss": 0.2502, + "step": 530 + }, + { + "epoch": 0.10648518815052041, + "grad_norm": 2.547064781188965, + "learning_rate": 5.2767915482720164e-06, + "loss": 0.1143, + "step": 532 + }, + { + "epoch": 0.10728582866293035, + "grad_norm": 4.4723687171936035, + "learning_rate": 5.289114144207656e-06, + "loss": 0.7623, + "step": 534 + }, + { + "epoch": 0.10728582866293035, + "grad_norm": 0.26206138730049133, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.1224, + "step": 536 + }, + { + "epoch": 0.10808646917534027, + "grad_norm": 2.698770761489868, + "learning_rate": 5.313786913381061e-06, + "loss": 0.2918, + "step": 538 + }, + { + "epoch": 0.10808646917534027, + "grad_norm": 0.07038102298974991, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.0728, + "step": 540 + }, + { + "epoch": 0.1088871096877502, + "grad_norm": 5.863232135772705, + "learning_rate": 5.338496291639341e-06, + "loss": 0.574, + "step": 542 + }, + { + "epoch": 0.1088871096877502, + "grad_norm": 6.8894147872924805, + "learning_rate": 5.350864648864026e-06, + "loss": 0.4944, + "step": 544 + }, + { + "epoch": 0.10968775020016013, + "grad_norm": 2.8473637104034424, + "learning_rate": 5.363242085950773e-06, + "loss": 0.7928, + "step": 546 + }, + { + "epoch": 0.10968775020016013, + "grad_norm": 0.36154070496559143, + "learning_rate": 5.375628578726181e-06, + "loss": 0.0223, + "step": 548 + }, + { + "epoch": 0.11048839071257005, + "grad_norm": 3.2261953353881836, + "learning_rate": 5.3880241029991434e-06, + "loss": 0.4926, + "step": 550 + }, + { + "epoch": 0.11048839071257005, + "grad_norm": 3.307530641555786, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.3628, + "step": 552 + }, + { + "epoch": 0.11128903122497999, + "grad_norm": 0.45561233162879944, + "learning_rate": 5.412842149185316e-06, + "loss": 0.1134, + "step": 554 + }, + { + "epoch": 0.11128903122497999, + "grad_norm": 1.3241045475006104, + "learning_rate": 5.425264622628326e-06, + "loss": 0.0526, + "step": 556 + }, + { + "epoch": 0.11208967173738991, + "grad_norm": 4.672948837280273, + "learning_rate": 5.437696030628639e-06, + "loss": 0.2406, + "step": 558 + }, + { + "epoch": 0.11208967173738991, + "grad_norm": 3.461677312850952, + "learning_rate": 5.450136348907444e-06, + "loss": 0.0956, + "step": 560 + }, + { + "epoch": 0.11289031224979984, + "grad_norm": 2.0890707969665527, + "learning_rate": 5.462585553168532e-06, + "loss": 0.1779, + "step": 562 + }, + { + "epoch": 0.11289031224979984, + "grad_norm": 3.645028591156006, + "learning_rate": 5.475043619098321e-06, + "loss": 0.1496, + "step": 564 + }, + { + "epoch": 0.11369095276220977, + "grad_norm": 2.728060483932495, + "learning_rate": 5.487510522365969e-06, + "loss": 0.2928, + "step": 566 + }, + { + "epoch": 0.11369095276220977, + "grad_norm": 3.371488332748413, + "learning_rate": 5.499986238623329e-06, + "loss": 0.1749, + "step": 568 + }, + { + "epoch": 0.11449159327461969, + "grad_norm": 6.001924991607666, + "learning_rate": 5.512470743505057e-06, + "loss": 0.2886, + "step": 570 + }, + { + "epoch": 0.11449159327461969, + "grad_norm": 1.4761625528335571, + "learning_rate": 5.524964012628644e-06, + "loss": 0.0828, + "step": 572 + }, + { + "epoch": 0.11529223378702963, + "grad_norm": 3.3271560668945312, + "learning_rate": 5.537466021594464e-06, + "loss": 0.2359, + "step": 574 + }, + { + "epoch": 0.11529223378702963, + "grad_norm": 2.1610708236694336, + "learning_rate": 5.549976745985809e-06, + "loss": 0.1595, + "step": 576 + }, + { + "epoch": 0.11609287429943956, + "grad_norm": 2.2066574096679688, + "learning_rate": 5.5624961613689934e-06, + "loss": 0.2919, + "step": 578 + }, + { + "epoch": 0.11609287429943956, + "grad_norm": 0.0030274728778749704, + "learning_rate": 5.57502424329331e-06, + "loss": 0.0001, + "step": 580 + }, + { + "epoch": 0.11689351481184948, + "grad_norm": 3.5111703872680664, + "learning_rate": 5.5875609672911465e-06, + "loss": 0.3048, + "step": 582 + }, + { + "epoch": 0.11689351481184948, + "grad_norm": 0.9855237007141113, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.0442, + "step": 584 + }, + { + "epoch": 0.1176941553242594, + "grad_norm": 1.908501386642456, + "learning_rate": 5.6126602435525725e-06, + "loss": 0.4251, + "step": 586 + }, + { + "epoch": 0.1176941553242594, + "grad_norm": 2.4515469074249268, + "learning_rate": 5.62522274679673e-06, + "loss": 0.0586, + "step": 588 + }, + { + "epoch": 0.11849479583666933, + "grad_norm": 2.755725622177124, + "learning_rate": 5.637793794075625e-06, + "loss": 0.1336, + "step": 590 + }, + { + "epoch": 0.11849479583666933, + "grad_norm": 0.6445931792259216, + "learning_rate": 5.650373360837763e-06, + "loss": 0.02, + "step": 592 + }, + { + "epoch": 0.11929543634907927, + "grad_norm": 2.6321585178375244, + "learning_rate": 5.662961422514961e-06, + "loss": 0.2347, + "step": 594 + }, + { + "epoch": 0.11929543634907927, + "grad_norm": 0.3918767273426056, + "learning_rate": 5.675557954522462e-06, + "loss": 0.0981, + "step": 596 + }, + { + "epoch": 0.1200960768614892, + "grad_norm": 5.6721086502075195, + "learning_rate": 5.688162932258965e-06, + "loss": 0.3988, + "step": 598 + }, + { + "epoch": 0.1200960768614892, + "grad_norm": 1.5852898359298706, + "learning_rate": 5.700776331106674e-06, + "loss": 0.2889, + "step": 600 + }, + { + "epoch": 0.12089671737389912, + "grad_norm": 2.2258803844451904, + "learning_rate": 5.713398126431353e-06, + "loss": 0.3564, + "step": 602 + }, + { + "epoch": 0.12089671737389912, + "grad_norm": 0.20809262990951538, + "learning_rate": 5.726028293582342e-06, + "loss": 0.0282, + "step": 604 + }, + { + "epoch": 0.12169735788630905, + "grad_norm": 6.963326930999756, + "learning_rate": 5.738666807892684e-06, + "loss": 0.0805, + "step": 606 + }, + { + "epoch": 0.12169735788630905, + "grad_norm": 0.14865165948867798, + "learning_rate": 5.751313644679071e-06, + "loss": 0.0069, + "step": 608 + }, + { + "epoch": 0.12249799839871897, + "grad_norm": 5.027193069458008, + "learning_rate": 5.763968779241957e-06, + "loss": 0.338, + "step": 610 + }, + { + "epoch": 0.12249799839871897, + "grad_norm": 0.31596672534942627, + "learning_rate": 5.776632186865589e-06, + "loss": 0.0416, + "step": 612 + }, + { + "epoch": 0.1232986389111289, + "grad_norm": 5.708477020263672, + "learning_rate": 5.7893038428180584e-06, + "loss": 0.4375, + "step": 614 + }, + { + "epoch": 0.1232986389111289, + "grad_norm": 0.05977749451994896, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.0295, + "step": 616 + }, + { + "epoch": 0.12409927942353884, + "grad_norm": 4.153718948364258, + "learning_rate": 5.814671800701357e-06, + "loss": 0.533, + "step": 618 + }, + { + "epoch": 0.12409927942353884, + "grad_norm": 2.6187288761138916, + "learning_rate": 5.827368053088032e-06, + "loss": 0.2281, + "step": 620 + }, + { + "epoch": 0.12489991993594876, + "grad_norm": 42.35293960571289, + "learning_rate": 5.840072454715297e-06, + "loss": 0.7318, + "step": 622 + }, + { + "epoch": 0.12489991993594876, + "grad_norm": 1.4397735595703125, + "learning_rate": 5.852784980771182e-06, + "loss": 0.1394, + "step": 624 + }, + { + "epoch": 0.1257005604483587, + "grad_norm": 2.9732463359832764, + "learning_rate": 5.865505606427848e-06, + "loss": 0.2941, + "step": 626 + }, + { + "epoch": 0.1257005604483587, + "grad_norm": 8.152449607849121, + "learning_rate": 5.878234306841637e-06, + "loss": 0.1235, + "step": 628 + }, + { + "epoch": 0.1265012009607686, + "grad_norm": 3.109363555908203, + "learning_rate": 5.890971057153105e-06, + "loss": 0.307, + "step": 630 + }, + { + "epoch": 0.1265012009607686, + "grad_norm": 2.906020164489746, + "learning_rate": 5.903715832487138e-06, + "loss": 0.1763, + "step": 632 + }, + { + "epoch": 0.12730184147317855, + "grad_norm": 5.7242536544799805, + "learning_rate": 5.916468607952892e-06, + "loss": 0.2457, + "step": 634 + }, + { + "epoch": 0.12730184147317855, + "grad_norm": 2.758869171142578, + "learning_rate": 5.929229358643925e-06, + "loss": 0.2111, + "step": 636 + }, + { + "epoch": 0.12810248198558846, + "grad_norm": 2.89686918258667, + "learning_rate": 5.941998059638212e-06, + "loss": 0.8186, + "step": 638 + }, + { + "epoch": 0.12810248198558846, + "grad_norm": 1.7656415700912476, + "learning_rate": 5.954774685998206e-06, + "loss": 0.1131, + "step": 640 + }, + { + "epoch": 0.1289031224979984, + "grad_norm": 1.976163387298584, + "learning_rate": 5.9675592127708585e-06, + "loss": 0.1784, + "step": 642 + }, + { + "epoch": 0.1289031224979984, + "grad_norm": 1.2660081386566162, + "learning_rate": 5.9803516149877475e-06, + "loss": 0.0507, + "step": 644 + }, + { + "epoch": 0.1297037630104083, + "grad_norm": 3.1521048545837402, + "learning_rate": 5.993151867665015e-06, + "loss": 0.4912, + "step": 646 + }, + { + "epoch": 0.1297037630104083, + "grad_norm": 3.5050647258758545, + "learning_rate": 6.005959945803494e-06, + "loss": 0.2161, + "step": 648 + }, + { + "epoch": 0.13050440352281825, + "grad_norm": 9.943366050720215, + "learning_rate": 6.01877582438873e-06, + "loss": 0.5698, + "step": 650 + }, + { + "epoch": 0.13050440352281825, + "grad_norm": 1.4234579801559448, + "learning_rate": 6.03159947839103e-06, + "loss": 0.291, + "step": 652 + }, + { + "epoch": 0.1313050440352282, + "grad_norm": 1.6343657970428467, + "learning_rate": 6.0444308827655265e-06, + "loss": 2.6554, + "step": 654 + }, + { + "epoch": 0.1313050440352282, + "grad_norm": 5.449174880981445, + "learning_rate": 6.057270012452186e-06, + "loss": 0.201, + "step": 656 + }, + { + "epoch": 0.1321056845476381, + "grad_norm": 0.7384207248687744, + "learning_rate": 6.070116842375947e-06, + "loss": 0.4364, + "step": 658 + }, + { + "epoch": 0.1321056845476381, + "grad_norm": 1.3327181339263916, + "learning_rate": 6.082971347446654e-06, + "loss": 0.0986, + "step": 660 + }, + { + "epoch": 0.13290632506004804, + "grad_norm": 5.545157432556152, + "learning_rate": 6.095833502559182e-06, + "loss": 0.4319, + "step": 662 + }, + { + "epoch": 0.13290632506004804, + "grad_norm": 2.0596842765808105, + "learning_rate": 6.108703282593461e-06, + "loss": 0.2318, + "step": 664 + }, + { + "epoch": 0.13370696557245795, + "grad_norm": 2.6190128326416016, + "learning_rate": 6.121580662414533e-06, + "loss": 0.4226, + "step": 666 + }, + { + "epoch": 0.13370696557245795, + "grad_norm": 0.8955932855606079, + "learning_rate": 6.13446561687258e-06, + "loss": 0.0496, + "step": 668 + }, + { + "epoch": 0.1345076060848679, + "grad_norm": 4.162669658660889, + "learning_rate": 6.147358120803041e-06, + "loss": 0.4689, + "step": 670 + }, + { + "epoch": 0.1345076060848679, + "grad_norm": 3.648500919342041, + "learning_rate": 6.160258149026557e-06, + "loss": 0.2154, + "step": 672 + }, + { + "epoch": 0.13530824659727783, + "grad_norm": 4.180898666381836, + "learning_rate": 6.173165676349095e-06, + "loss": 0.3165, + "step": 674 + }, + { + "epoch": 0.13530824659727783, + "grad_norm": 2.4379935264587402, + "learning_rate": 6.186080677561974e-06, + "loss": 0.1416, + "step": 676 + }, + { + "epoch": 0.13610888710968774, + "grad_norm": 4.984748840332031, + "learning_rate": 6.1990031274419186e-06, + "loss": 0.5725, + "step": 678 + }, + { + "epoch": 0.13610888710968774, + "grad_norm": 1.3582208156585693, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.2579, + "step": 680 + }, + { + "epoch": 0.13690952762209768, + "grad_norm": 3.549015998840332, + "learning_rate": 6.224870272237185e-06, + "loss": 0.2828, + "step": 682 + }, + { + "epoch": 0.13690952762209768, + "grad_norm": 6.787088394165039, + "learning_rate": 6.237814916633431e-06, + "loss": 0.1774, + "step": 684 + }, + { + "epoch": 0.1377101681345076, + "grad_norm": 0.4546138346195221, + "learning_rate": 6.250766908658652e-06, + "loss": 0.0219, + "step": 686 + }, + { + "epoch": 0.1377101681345076, + "grad_norm": 0.4279613792896271, + "learning_rate": 6.263726223017326e-06, + "loss": 0.1299, + "step": 688 + }, + { + "epoch": 0.13851080864691753, + "grad_norm": 2.2189102172851562, + "learning_rate": 6.2766928343996314e-06, + "loss": 0.3314, + "step": 690 + }, + { + "epoch": 0.13851080864691753, + "grad_norm": 1.8106606006622314, + "learning_rate": 6.289666717481496e-06, + "loss": 0.1953, + "step": 692 + }, + { + "epoch": 0.13931144915932747, + "grad_norm": 2.5571038722991943, + "learning_rate": 6.3026478469246285e-06, + "loss": 0.2154, + "step": 694 + }, + { + "epoch": 0.13931144915932747, + "grad_norm": 2.476729154586792, + "learning_rate": 6.315636197376634e-06, + "loss": 0.2463, + "step": 696 + }, + { + "epoch": 0.14011208967173738, + "grad_norm": 5.744518756866455, + "learning_rate": 6.328631743470968e-06, + "loss": 0.3904, + "step": 698 + }, + { + "epoch": 0.14011208967173738, + "grad_norm": 2.996471643447876, + "learning_rate": 6.341634459827044e-06, + "loss": 0.3209, + "step": 700 + }, + { + "epoch": 0.14091273018414732, + "grad_norm": 2.5936036109924316, + "learning_rate": 6.354644321050279e-06, + "loss": 0.2709, + "step": 702 + }, + { + "epoch": 0.14091273018414732, + "grad_norm": 1.3608124256134033, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.1018, + "step": 704 + }, + { + "epoch": 0.14171337069655723, + "grad_norm": 5.774235725402832, + "learning_rate": 6.380685376450153e-06, + "loss": 0.2636, + "step": 706 + }, + { + "epoch": 0.14171337069655723, + "grad_norm": 2.632924795150757, + "learning_rate": 6.393716519768032e-06, + "loss": 0.2305, + "step": 708 + }, + { + "epoch": 0.14251401120896717, + "grad_norm": 2.1998963356018066, + "learning_rate": 6.406754706235692e-06, + "loss": 0.2869, + "step": 710 + }, + { + "epoch": 0.14251401120896717, + "grad_norm": 3.664700984954834, + "learning_rate": 6.419799910389257e-06, + "loss": 0.0757, + "step": 712 + }, + { + "epoch": 0.1433146517213771, + "grad_norm": 2.3955607414245605, + "learning_rate": 6.432852106751162e-06, + "loss": 0.3887, + "step": 714 + }, + { + "epoch": 0.1433146517213771, + "grad_norm": 0.5279382467269897, + "learning_rate": 6.445911269830183e-06, + "loss": 0.0265, + "step": 716 + }, + { + "epoch": 0.14411529223378702, + "grad_norm": 0.711703360080719, + "learning_rate": 6.458977374121492e-06, + "loss": 0.4934, + "step": 718 + }, + { + "epoch": 0.14411529223378702, + "grad_norm": 2.8265082836151123, + "learning_rate": 6.472050394106689e-06, + "loss": 0.3173, + "step": 720 + }, + { + "epoch": 0.14491593274619696, + "grad_norm": 0.870319128036499, + "learning_rate": 6.485130304253915e-06, + "loss": 0.2654, + "step": 722 + }, + { + "epoch": 0.14491593274619696, + "grad_norm": 2.929919958114624, + "learning_rate": 6.498217079017806e-06, + "loss": 0.2062, + "step": 724 + }, + { + "epoch": 0.14571657325860687, + "grad_norm": 2.9711148738861084, + "learning_rate": 6.511310692839605e-06, + "loss": 0.397, + "step": 726 + }, + { + "epoch": 0.14571657325860687, + "grad_norm": 3.196951150894165, + "learning_rate": 6.524411120147204e-06, + "loss": 0.3308, + "step": 728 + }, + { + "epoch": 0.1465172137710168, + "grad_norm": 2.7800328731536865, + "learning_rate": 6.537518335355182e-06, + "loss": 0.4962, + "step": 730 + }, + { + "epoch": 0.1465172137710168, + "grad_norm": 3.3612918853759766, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.4025, + "step": 732 + }, + { + "epoch": 0.14731785428342675, + "grad_norm": 3.04292893409729, + "learning_rate": 6.563753027064355e-06, + "loss": 0.2776, + "step": 734 + }, + { + "epoch": 0.14731785428342675, + "grad_norm": 3.164085626602173, + "learning_rate": 6.576880452328645e-06, + "loss": 0.2632, + "step": 736 + }, + { + "epoch": 0.14811849479583666, + "grad_norm": 8.899778366088867, + "learning_rate": 6.590014563019571e-06, + "loss": 0.3483, + "step": 738 + }, + { + "epoch": 0.14811849479583666, + "grad_norm": 2.2495205402374268, + "learning_rate": 6.603155333485934e-06, + "loss": 0.2524, + "step": 740 + }, + { + "epoch": 0.1489191353082466, + "grad_norm": 2.032424211502075, + "learning_rate": 6.61630273806352e-06, + "loss": 0.1068, + "step": 742 + }, + { + "epoch": 0.1489191353082466, + "grad_norm": 7.539082050323486, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.3048, + "step": 744 + }, + { + "epoch": 0.14971977582065651, + "grad_norm": 4.074437618255615, + "learning_rate": 6.642617346830784e-06, + "loss": 0.2444, + "step": 746 + }, + { + "epoch": 0.14971977582065651, + "grad_norm": 3.6053149700164795, + "learning_rate": 6.655784499627476e-06, + "loss": 0.3325, + "step": 748 + }, + { + "epoch": 0.15052041633306645, + "grad_norm": 7.177565097808838, + "learning_rate": 6.6689581837494925e-06, + "loss": 0.4536, + "step": 750 + }, + { + "epoch": 0.15052041633306645, + "grad_norm": 3.257957696914673, + "learning_rate": 6.682138373468341e-06, + "loss": 0.4648, + "step": 752 + }, + { + "epoch": 0.1513210568454764, + "grad_norm": 1.3324891328811646, + "learning_rate": 6.695325043042827e-06, + "loss": 0.0759, + "step": 754 + }, + { + "epoch": 0.1513210568454764, + "grad_norm": 0.7022082209587097, + "learning_rate": 6.7085181667191e-06, + "loss": 0.0464, + "step": 756 + }, + { + "epoch": 0.1521216973578863, + "grad_norm": 0.26039180159568787, + "learning_rate": 6.7217177187307e-06, + "loss": 0.1664, + "step": 758 + }, + { + "epoch": 0.1521216973578863, + "grad_norm": 0.4494319558143616, + "learning_rate": 6.734923673298605e-06, + "loss": 0.0234, + "step": 760 + }, + { + "epoch": 0.15292233787029624, + "grad_norm": 2.069424867630005, + "learning_rate": 6.748136004631327e-06, + "loss": 0.1672, + "step": 762 + }, + { + "epoch": 0.15292233787029624, + "grad_norm": 3.3659703731536865, + "learning_rate": 6.761354686924883e-06, + "loss": 0.1027, + "step": 764 + }, + { + "epoch": 0.15372297838270615, + "grad_norm": 4.30006217956543, + "learning_rate": 6.774579694362902e-06, + "loss": 0.4147, + "step": 766 + }, + { + "epoch": 0.15372297838270615, + "grad_norm": 1.989828109741211, + "learning_rate": 6.787811001116654e-06, + "loss": 0.2636, + "step": 768 + }, + { + "epoch": 0.1545236188951161, + "grad_norm": 0.29658645391464233, + "learning_rate": 6.801048581345113e-06, + "loss": 0.4599, + "step": 770 + }, + { + "epoch": 0.1545236188951161, + "grad_norm": 10.556374549865723, + "learning_rate": 6.8142924091949955e-06, + "loss": 0.2149, + "step": 772 + }, + { + "epoch": 0.15532425940752603, + "grad_norm": 2.6858749389648438, + "learning_rate": 6.827542458800804e-06, + "loss": 0.1765, + "step": 774 + }, + { + "epoch": 0.15532425940752603, + "grad_norm": 0.1258305013179779, + "learning_rate": 6.840798704284939e-06, + "loss": 0.0172, + "step": 776 + }, + { + "epoch": 0.15612489991993594, + "grad_norm": 3.187528133392334, + "learning_rate": 6.854061119757647e-06, + "loss": 0.341, + "step": 778 + }, + { + "epoch": 0.15612489991993594, + "grad_norm": 6.165633678436279, + "learning_rate": 6.867329679317144e-06, + "loss": 0.2499, + "step": 780 + }, + { + "epoch": 0.15692554043234588, + "grad_norm": 3.798178195953369, + "learning_rate": 6.880604357049646e-06, + "loss": 0.9506, + "step": 782 + }, + { + "epoch": 0.15692554043234588, + "grad_norm": 1.3152011632919312, + "learning_rate": 6.893885127029419e-06, + "loss": 0.079, + "step": 784 + }, + { + "epoch": 0.1577261809447558, + "grad_norm": 5.850693702697754, + "learning_rate": 6.907171963318815e-06, + "loss": 0.3967, + "step": 786 + }, + { + "epoch": 0.1577261809447558, + "grad_norm": 0.9051020741462708, + "learning_rate": 6.920464839968391e-06, + "loss": 0.1762, + "step": 788 + }, + { + "epoch": 0.15852682145716573, + "grad_norm": 3.7642264366149902, + "learning_rate": 6.9337637310168494e-06, + "loss": 0.1893, + "step": 790 + }, + { + "epoch": 0.15852682145716573, + "grad_norm": 3.01886248588562, + "learning_rate": 6.94706861049117e-06, + "loss": 0.142, + "step": 792 + }, + { + "epoch": 0.15932746196957567, + "grad_norm": 0.789147138595581, + "learning_rate": 6.960379452406636e-06, + "loss": 0.1361, + "step": 794 + }, + { + "epoch": 0.15932746196957567, + "grad_norm": 4.018482208251953, + "learning_rate": 6.973696230766884e-06, + "loss": 0.1123, + "step": 796 + }, + { + "epoch": 0.16012810248198558, + "grad_norm": 12.77450942993164, + "learning_rate": 6.9870189195639595e-06, + "loss": 0.9882, + "step": 798 + }, + { + "epoch": 0.16012810248198558, + "grad_norm": 0.2976391911506653, + "learning_rate": 7.000347492778341e-06, + "loss": 0.4599, + "step": 800 + }, + { + "epoch": 0.16092874299439552, + "grad_norm": 14.961612701416016, + "learning_rate": 7.013681924379073e-06, + "loss": 0.4251, + "step": 802 + }, + { + "epoch": 0.16092874299439552, + "grad_norm": 1.0256636142730713, + "learning_rate": 7.027022188323704e-06, + "loss": 0.0971, + "step": 804 + }, + { + "epoch": 0.16172938350680544, + "grad_norm": 2.6278669834136963, + "learning_rate": 7.040368258558412e-06, + "loss": 0.4792, + "step": 806 + }, + { + "epoch": 0.16172938350680544, + "grad_norm": 1.1832668781280518, + "learning_rate": 7.05372010901803e-06, + "loss": 0.1644, + "step": 808 + }, + { + "epoch": 0.16253002401921537, + "grad_norm": 1.853737235069275, + "learning_rate": 7.0670777136261035e-06, + "loss": 0.151, + "step": 810 + }, + { + "epoch": 0.16253002401921537, + "grad_norm": 2.166440010070801, + "learning_rate": 7.080441046294945e-06, + "loss": 0.1784, + "step": 812 + }, + { + "epoch": 0.1633306645316253, + "grad_norm": 3.9319541454315186, + "learning_rate": 7.093810080925657e-06, + "loss": 0.2655, + "step": 814 + }, + { + "epoch": 0.1633306645316253, + "grad_norm": 0.1573193073272705, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.0295, + "step": 816 + }, + { + "epoch": 0.16413130504403523, + "grad_norm": 4.863795757293701, + "learning_rate": 7.120565151621638e-06, + "loss": 0.2713, + "step": 818 + }, + { + "epoch": 0.16413130504403523, + "grad_norm": 0.4033626914024353, + "learning_rate": 7.133951135433656e-06, + "loss": 0.0689, + "step": 820 + }, + { + "epoch": 0.16493194555644516, + "grad_norm": 3.697798013687134, + "learning_rate": 7.1473427167012e-06, + "loss": 0.7959, + "step": 822 + }, + { + "epoch": 0.16493194555644516, + "grad_norm": 2.8913819789886475, + "learning_rate": 7.160739869270219e-06, + "loss": 0.303, + "step": 824 + }, + { + "epoch": 0.16573258606885508, + "grad_norm": 2.190490961074829, + "learning_rate": 7.1741425669757854e-06, + "loss": 0.2639, + "step": 826 + }, + { + "epoch": 0.16573258606885508, + "grad_norm": 7.330079555511475, + "learning_rate": 7.18755078364214e-06, + "loss": 0.3086, + "step": 828 + }, + { + "epoch": 0.16653322658126501, + "grad_norm": 4.309293270111084, + "learning_rate": 7.200964493082727e-06, + "loss": 0.6679, + "step": 830 + }, + { + "epoch": 0.16653322658126501, + "grad_norm": 4.358169078826904, + "learning_rate": 7.214383669100317e-06, + "loss": 0.3836, + "step": 832 + }, + { + "epoch": 0.16733386709367493, + "grad_norm": 3.6948628425598145, + "learning_rate": 7.227808285486952e-06, + "loss": 0.4708, + "step": 834 + }, + { + "epoch": 0.16733386709367493, + "grad_norm": 1.2524821758270264, + "learning_rate": 7.241238316024064e-06, + "loss": 0.1293, + "step": 836 + }, + { + "epoch": 0.16813450760608487, + "grad_norm": 1.5137959718704224, + "learning_rate": 7.254673734482513e-06, + "loss": 0.3005, + "step": 838 + }, + { + "epoch": 0.16813450760608487, + "grad_norm": 1.1660137176513672, + "learning_rate": 7.268114514622635e-06, + "loss": 0.0623, + "step": 840 + }, + { + "epoch": 0.1689351481184948, + "grad_norm": 1.8562008142471313, + "learning_rate": 7.2815606301942945e-06, + "loss": 0.2447, + "step": 842 + }, + { + "epoch": 0.1689351481184948, + "grad_norm": 2.818376064300537, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.1822, + "step": 844 + }, + { + "epoch": 0.16973578863090472, + "grad_norm": 2.179291248321533, + "learning_rate": 7.308468762579623e-06, + "loss": 0.4027, + "step": 846 + }, + { + "epoch": 0.16973578863090472, + "grad_norm": 0.8923459649085999, + "learning_rate": 7.321930726841144e-06, + "loss": 0.2402, + "step": 848 + }, + { + "epoch": 0.17053642914331466, + "grad_norm": 2.08050799369812, + "learning_rate": 7.3353979214299765e-06, + "loss": 0.4259, + "step": 850 + }, + { + "epoch": 0.17053642914331466, + "grad_norm": 1.8339521884918213, + "learning_rate": 7.348870320044395e-06, + "loss": 0.29, + "step": 852 + }, + { + "epoch": 0.17133706965572457, + "grad_norm": 2.4560959339141846, + "learning_rate": 7.362347896372515e-06, + "loss": 0.2389, + "step": 854 + }, + { + "epoch": 0.17133706965572457, + "grad_norm": 0.4332774579524994, + "learning_rate": 7.375830624092336e-06, + "loss": 0.1193, + "step": 856 + }, + { + "epoch": 0.1721377101681345, + "grad_norm": 6.64344596862793, + "learning_rate": 7.389318476871784e-06, + "loss": 0.5356, + "step": 858 + }, + { + "epoch": 0.1721377101681345, + "grad_norm": 1.6086114645004272, + "learning_rate": 7.402811428368824e-06, + "loss": 0.1548, + "step": 860 + }, + { + "epoch": 0.17293835068054444, + "grad_norm": 3.3224079608917236, + "learning_rate": 7.416309452231411e-06, + "loss": 0.4092, + "step": 862 + }, + { + "epoch": 0.17293835068054444, + "grad_norm": 0.8371086716651917, + "learning_rate": 7.429812522097613e-06, + "loss": 0.0857, + "step": 864 + }, + { + "epoch": 0.17373899119295436, + "grad_norm": 1.9845166206359863, + "learning_rate": 7.443320611595641e-06, + "loss": 0.3146, + "step": 866 + }, + { + "epoch": 0.17373899119295436, + "grad_norm": 1.9986348152160645, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.0981, + "step": 868 + }, + { + "epoch": 0.1745396317053643, + "grad_norm": 2.900111198425293, + "learning_rate": 7.470351743951061e-06, + "loss": 0.4399, + "step": 870 + }, + { + "epoch": 0.1745396317053643, + "grad_norm": 0.7706056237220764, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.034, + "step": 872 + }, + { + "epoch": 0.1753402722177742, + "grad_norm": 3.21415376663208, + "learning_rate": 7.497402638128209e-06, + "loss": 0.3657, + "step": 874 + }, + { + "epoch": 0.1753402722177742, + "grad_norm": 0.9737322926521301, + "learning_rate": 7.510935429867233e-06, + "loss": 0.1066, + "step": 876 + }, + { + "epoch": 0.17614091273018415, + "grad_norm": 2.5714962482452393, + "learning_rate": 7.52447308280329e-06, + "loss": 0.1227, + "step": 878 + }, + { + "epoch": 0.17614091273018415, + "grad_norm": 1.5249933004379272, + "learning_rate": 7.538015570497046e-06, + "loss": 0.1436, + "step": 880 + }, + { + "epoch": 0.17694155324259409, + "grad_norm": 3.4244651794433594, + "learning_rate": 7.551562866499732e-06, + "loss": 0.3713, + "step": 882 + }, + { + "epoch": 0.17694155324259409, + "grad_norm": 2.2829418182373047, + "learning_rate": 7.5651149443531846e-06, + "loss": 0.2373, + "step": 884 + }, + { + "epoch": 0.177742193755004, + "grad_norm": 1.0020244121551514, + "learning_rate": 7.578671777589884e-06, + "loss": 0.1396, + "step": 886 + }, + { + "epoch": 0.177742193755004, + "grad_norm": 1.226610779762268, + "learning_rate": 7.592233339733077e-06, + "loss": 0.0584, + "step": 888 + }, + { + "epoch": 0.17854283426741394, + "grad_norm": 3.2434868812561035, + "learning_rate": 7.605799604296721e-06, + "loss": 0.3829, + "step": 890 + }, + { + "epoch": 0.17854283426741394, + "grad_norm": 2.9036407470703125, + "learning_rate": 7.619370544785608e-06, + "loss": 0.3487, + "step": 892 + }, + { + "epoch": 0.17934347477982385, + "grad_norm": 1.4213128089904785, + "learning_rate": 7.632946134695396e-06, + "loss": 0.3793, + "step": 894 + }, + { + "epoch": 0.17934347477982385, + "grad_norm": 3.093550443649292, + "learning_rate": 7.646526347512665e-06, + "loss": 0.1953, + "step": 896 + }, + { + "epoch": 0.1801441152922338, + "grad_norm": 1.0177700519561768, + "learning_rate": 7.660111156714964e-06, + "loss": 0.104, + "step": 898 + }, + { + "epoch": 0.1801441152922338, + "grad_norm": 0.48696082830429077, + "learning_rate": 7.67370053577085e-06, + "loss": 0.0444, + "step": 900 + }, + { + "epoch": 0.18094475580464373, + "grad_norm": 7.853082180023193, + "learning_rate": 7.687294458140006e-06, + "loss": 0.2372, + "step": 902 + }, + { + "epoch": 0.18094475580464373, + "grad_norm": 0.3764018416404724, + "learning_rate": 7.70089289727319e-06, + "loss": 0.168, + "step": 904 + }, + { + "epoch": 0.18174539631705364, + "grad_norm": 1.3835159540176392, + "learning_rate": 7.714495826612353e-06, + "loss": 0.1824, + "step": 906 + }, + { + "epoch": 0.18174539631705364, + "grad_norm": 0.04649856686592102, + "learning_rate": 7.728103219590684e-06, + "loss": 0.0351, + "step": 908 + }, + { + "epoch": 0.18254603682946358, + "grad_norm": 2.408698797225952, + "learning_rate": 7.741715049632646e-06, + "loss": 0.2415, + "step": 910 + }, + { + "epoch": 0.18254603682946358, + "grad_norm": 1.6619751453399658, + "learning_rate": 7.755331290154041e-06, + "loss": 0.1025, + "step": 912 + }, + { + "epoch": 0.1833466773418735, + "grad_norm": 0.0678478479385376, + "learning_rate": 7.76895191456204e-06, + "loss": 0.1207, + "step": 914 + }, + { + "epoch": 0.1833466773418735, + "grad_norm": 1.2040098905563354, + "learning_rate": 7.7825768962553e-06, + "loss": 0.1593, + "step": 916 + }, + { + "epoch": 0.18414731785428343, + "grad_norm": 1.0690350532531738, + "learning_rate": 7.796206208623925e-06, + "loss": 0.1747, + "step": 918 + }, + { + "epoch": 0.18414731785428343, + "grad_norm": 1.979194164276123, + "learning_rate": 7.809839825049565e-06, + "loss": 0.1189, + "step": 920 + }, + { + "epoch": 0.18494795836669337, + "grad_norm": 2.3235013484954834, + "learning_rate": 7.82347771890548e-06, + "loss": 0.1018, + "step": 922 + }, + { + "epoch": 0.18494795836669337, + "grad_norm": 3.4005424976348877, + "learning_rate": 7.83711986355656e-06, + "loss": 0.2318, + "step": 924 + }, + { + "epoch": 0.18574859887910328, + "grad_norm": 3.8044965267181396, + "learning_rate": 7.850766232359408e-06, + "loss": 0.1039, + "step": 926 + }, + { + "epoch": 0.18574859887910328, + "grad_norm": 0.40569543838500977, + "learning_rate": 7.864416798662347e-06, + "loss": 0.1088, + "step": 928 + }, + { + "epoch": 0.18654923939151322, + "grad_norm": 1.9753904342651367, + "learning_rate": 7.878071535805564e-06, + "loss": 0.1759, + "step": 930 + }, + { + "epoch": 0.18654923939151322, + "grad_norm": 4.064412593841553, + "learning_rate": 7.891730417121043e-06, + "loss": 0.1899, + "step": 932 + }, + { + "epoch": 0.18734987990392313, + "grad_norm": 1.7872743606567383, + "learning_rate": 7.90539341593269e-06, + "loss": 0.0979, + "step": 934 + }, + { + "epoch": 0.18734987990392313, + "grad_norm": 3.029320240020752, + "learning_rate": 7.919060505556376e-06, + "loss": 0.1098, + "step": 936 + }, + { + "epoch": 0.18815052041633307, + "grad_norm": 0.2298375368118286, + "learning_rate": 7.932731659299978e-06, + "loss": 0.0426, + "step": 938 + }, + { + "epoch": 0.18815052041633307, + "grad_norm": 1.5646917819976807, + "learning_rate": 7.946406850463435e-06, + "loss": 0.0715, + "step": 940 + }, + { + "epoch": 0.188951160928743, + "grad_norm": 5.812572956085205, + "learning_rate": 7.960086052338788e-06, + "loss": 0.4252, + "step": 942 + }, + { + "epoch": 0.188951160928743, + "grad_norm": 0.33225002884864807, + "learning_rate": 7.973769238210291e-06, + "loss": 0.0319, + "step": 944 + }, + { + "epoch": 0.18975180144115292, + "grad_norm": 4.8428449630737305, + "learning_rate": 7.987456381354371e-06, + "loss": 0.5564, + "step": 946 + }, + { + "epoch": 0.18975180144115292, + "grad_norm": 7.3845672607421875, + "learning_rate": 8.001147455039737e-06, + "loss": 0.1352, + "step": 948 + }, + { + "epoch": 0.19055244195356286, + "grad_norm": 3.5620193481445312, + "learning_rate": 8.01484243252743e-06, + "loss": 0.5418, + "step": 950 + }, + { + "epoch": 0.19055244195356286, + "grad_norm": 0.9196518063545227, + "learning_rate": 8.028541287070858e-06, + "loss": 0.0904, + "step": 952 + }, + { + "epoch": 0.19135308246597277, + "grad_norm": 3.536456823348999, + "learning_rate": 8.042243991915866e-06, + "loss": 0.1849, + "step": 954 + }, + { + "epoch": 0.19135308246597277, + "grad_norm": 0.7981649041175842, + "learning_rate": 8.055950520300756e-06, + "loss": 0.0749, + "step": 956 + }, + { + "epoch": 0.1921537229783827, + "grad_norm": 5.120934009552002, + "learning_rate": 8.069660845456411e-06, + "loss": 0.5655, + "step": 958 + }, + { + "epoch": 0.1921537229783827, + "grad_norm": 7.370887756347656, + "learning_rate": 8.083374940606256e-06, + "loss": 0.1637, + "step": 960 + }, + { + "epoch": 0.19295436349079265, + "grad_norm": 2.3627796173095703, + "learning_rate": 8.097092778966364e-06, + "loss": 0.3923, + "step": 962 + }, + { + "epoch": 0.19295436349079265, + "grad_norm": 2.3376715183258057, + "learning_rate": 8.110814333745503e-06, + "loss": 0.1234, + "step": 964 + }, + { + "epoch": 0.19375500400320256, + "grad_norm": 0.18484675884246826, + "learning_rate": 8.124539578145176e-06, + "loss": 0.0765, + "step": 966 + }, + { + "epoch": 0.19375500400320256, + "grad_norm": 1.591740608215332, + "learning_rate": 8.138268485359684e-06, + "loss": 0.0558, + "step": 968 + }, + { + "epoch": 0.1945556445156125, + "grad_norm": 2.6780142784118652, + "learning_rate": 8.152001028576158e-06, + "loss": 0.2163, + "step": 970 + }, + { + "epoch": 0.1945556445156125, + "grad_norm": 4.302670955657959, + "learning_rate": 8.165737180974676e-06, + "loss": 0.247, + "step": 972 + }, + { + "epoch": 0.1953562850280224, + "grad_norm": 2.10835862159729, + "learning_rate": 8.179476915728217e-06, + "loss": 0.1195, + "step": 974 + }, + { + "epoch": 0.1953562850280224, + "grad_norm": 0.6982696056365967, + "learning_rate": 8.193220206002785e-06, + "loss": 0.1106, + "step": 976 + }, + { + "epoch": 0.19615692554043235, + "grad_norm": 6.05687141418457, + "learning_rate": 8.206967024957432e-06, + "loss": 0.2927, + "step": 978 + }, + { + "epoch": 0.19615692554043235, + "grad_norm": 0.801679253578186, + "learning_rate": 8.220717345744326e-06, + "loss": 0.107, + "step": 980 + }, + { + "epoch": 0.1969575660528423, + "grad_norm": 4.5012688636779785, + "learning_rate": 8.234471141508773e-06, + "loss": 0.3884, + "step": 982 + }, + { + "epoch": 0.1969575660528423, + "grad_norm": 3.2769672870635986, + "learning_rate": 8.248228385389349e-06, + "loss": 0.0586, + "step": 984 + }, + { + "epoch": 0.1977582065652522, + "grad_norm": 5.125418663024902, + "learning_rate": 8.261989050517841e-06, + "loss": 0.4373, + "step": 986 + }, + { + "epoch": 0.1977582065652522, + "grad_norm": 5.005241870880127, + "learning_rate": 8.275753110019367e-06, + "loss": 0.3654, + "step": 988 + }, + { + "epoch": 0.19855884707766214, + "grad_norm": 3.142179012298584, + "learning_rate": 8.289520537012428e-06, + "loss": 0.2895, + "step": 990 + }, + { + "epoch": 0.19855884707766214, + "grad_norm": 3.512333393096924, + "learning_rate": 8.303291304608936e-06, + "loss": 0.1599, + "step": 992 + }, + { + "epoch": 0.19935948759007205, + "grad_norm": 6.499198913574219, + "learning_rate": 8.317065385914285e-06, + "loss": 0.8007, + "step": 994 + }, + { + "epoch": 0.19935948759007205, + "grad_norm": 5.462932109832764, + "learning_rate": 8.330842754027378e-06, + "loss": 0.318, + "step": 996 + }, + { + "epoch": 0.200160128102482, + "grad_norm": 6.850478172302246, + "learning_rate": 8.344623382040752e-06, + "loss": 0.4536, + "step": 998 + }, + { + "epoch": 0.200160128102482, + "grad_norm": 4.702681541442871, + "learning_rate": 8.358407243040524e-06, + "loss": 0.2158, + "step": 1000 + }, + { + "epoch": 0.20096076861489193, + "grad_norm": 4.148573875427246, + "learning_rate": 8.372194310106515e-06, + "loss": 0.2161, + "step": 1002 + }, + { + "epoch": 0.20096076861489193, + "grad_norm": 2.430013418197632, + "learning_rate": 8.385984556312285e-06, + "loss": 0.1863, + "step": 1004 + }, + { + "epoch": 0.20176140912730184, + "grad_norm": 3.773601531982422, + "learning_rate": 8.399777954725183e-06, + "loss": 0.5343, + "step": 1006 + }, + { + "epoch": 0.20176140912730184, + "grad_norm": 0.6426126956939697, + "learning_rate": 8.413574478406386e-06, + "loss": 0.1874, + "step": 1008 + }, + { + "epoch": 0.20256204963971178, + "grad_norm": 2.144646167755127, + "learning_rate": 8.427374100411022e-06, + "loss": 0.1282, + "step": 1010 + }, + { + "epoch": 0.20256204963971178, + "grad_norm": 1.1615400314331055, + "learning_rate": 8.441176793788106e-06, + "loss": 0.2243, + "step": 1012 + }, + { + "epoch": 0.2033626901521217, + "grad_norm": 3.0456247329711914, + "learning_rate": 8.454982531580687e-06, + "loss": 0.5957, + "step": 1014 + }, + { + "epoch": 0.2033626901521217, + "grad_norm": 3.518455982208252, + "learning_rate": 8.468791286825856e-06, + "loss": 0.0896, + "step": 1016 + }, + { + "epoch": 0.20416333066453163, + "grad_norm": 4.13201904296875, + "learning_rate": 8.482603032554812e-06, + "loss": 0.2386, + "step": 1018 + }, + { + "epoch": 0.20416333066453163, + "grad_norm": 6.20754337310791, + "learning_rate": 8.496417741792922e-06, + "loss": 0.29, + "step": 1020 + }, + { + "epoch": 0.20496397117694154, + "grad_norm": 9.310336112976074, + "learning_rate": 8.510235387559738e-06, + "loss": 0.2621, + "step": 1022 + }, + { + "epoch": 0.20496397117694154, + "grad_norm": 3.823458433151245, + "learning_rate": 8.524055942869135e-06, + "loss": 0.1145, + "step": 1024 + }, + { + "epoch": 0.20576461168935148, + "grad_norm": 0.7818806767463684, + "learning_rate": 8.537879380729254e-06, + "loss": 0.1802, + "step": 1026 + }, + { + "epoch": 0.20576461168935148, + "grad_norm": 1.430079698562622, + "learning_rate": 8.551705674142616e-06, + "loss": 0.0412, + "step": 1028 + }, + { + "epoch": 0.20656525220176142, + "grad_norm": 3.6036953926086426, + "learning_rate": 8.565534796106175e-06, + "loss": 0.2504, + "step": 1030 + }, + { + "epoch": 0.20656525220176142, + "grad_norm": 4.325464725494385, + "learning_rate": 8.579366719611353e-06, + "loss": 0.0455, + "step": 1032 + }, + { + "epoch": 0.20736589271417133, + "grad_norm": 0.6585804224014282, + "learning_rate": 8.593201417644091e-06, + "loss": 0.1014, + "step": 1034 + }, + { + "epoch": 0.20736589271417133, + "grad_norm": 0.6883473992347717, + "learning_rate": 8.607038863184952e-06, + "loss": 0.0792, + "step": 1036 + }, + { + "epoch": 0.20816653322658127, + "grad_norm": 3.171746015548706, + "learning_rate": 8.620879029209093e-06, + "loss": 0.1969, + "step": 1038 + }, + { + "epoch": 0.20816653322658127, + "grad_norm": 0.020111123099923134, + "learning_rate": 8.634721888686368e-06, + "loss": 0.008, + "step": 1040 + }, + { + "epoch": 0.20896717373899118, + "grad_norm": 3.7079508304595947, + "learning_rate": 8.648567414581372e-06, + "loss": 0.3042, + "step": 1042 + }, + { + "epoch": 0.20896717373899118, + "grad_norm": 2.0614211559295654, + "learning_rate": 8.662415579853495e-06, + "loss": 0.0846, + "step": 1044 + }, + { + "epoch": 0.20976781425140112, + "grad_norm": 5.929505825042725, + "learning_rate": 8.676266357456968e-06, + "loss": 0.3344, + "step": 1046 + }, + { + "epoch": 0.20976781425140112, + "grad_norm": 0.08310666680335999, + "learning_rate": 8.690119720340907e-06, + "loss": 0.0039, + "step": 1048 + }, + { + "epoch": 0.21056845476381106, + "grad_norm": 4.560184478759766, + "learning_rate": 8.703975641449426e-06, + "loss": 0.2615, + "step": 1050 + }, + { + "epoch": 0.21056845476381106, + "grad_norm": 6.260319709777832, + "learning_rate": 8.717834093721598e-06, + "loss": 0.3146, + "step": 1052 + }, + { + "epoch": 0.21136909527622097, + "grad_norm": 0.3967616856098175, + "learning_rate": 8.731695050091561e-06, + "loss": 0.145, + "step": 1054 + }, + { + "epoch": 0.21136909527622097, + "grad_norm": 0.4248598515987396, + "learning_rate": 8.74555848348857e-06, + "loss": 0.1165, + "step": 1056 + }, + { + "epoch": 0.2121697357886309, + "grad_norm": 2.145010471343994, + "learning_rate": 8.759424366837035e-06, + "loss": 0.3624, + "step": 1058 + }, + { + "epoch": 0.2121697357886309, + "grad_norm": 0.18818777799606323, + "learning_rate": 8.773292673056572e-06, + "loss": 0.0625, + "step": 1060 + }, + { + "epoch": 0.21297037630104082, + "grad_norm": 9.511555671691895, + "learning_rate": 8.787163375062113e-06, + "loss": 0.1141, + "step": 1062 + }, + { + "epoch": 0.21297037630104082, + "grad_norm": 1.2696008682250977, + "learning_rate": 8.801036445763858e-06, + "loss": 0.101, + "step": 1064 + }, + { + "epoch": 0.21377101681345076, + "grad_norm": 2.169360876083374, + "learning_rate": 8.8149118580674e-06, + "loss": 0.0951, + "step": 1066 + }, + { + "epoch": 0.21377101681345076, + "grad_norm": 2.1604015827178955, + "learning_rate": 8.828789584873757e-06, + "loss": 0.054, + "step": 1068 + }, + { + "epoch": 0.2145716573258607, + "grad_norm": 2.564650774002075, + "learning_rate": 8.84266959907943e-06, + "loss": 0.2855, + "step": 1070 + }, + { + "epoch": 0.2145716573258607, + "grad_norm": 0.012313512153923512, + "learning_rate": 8.856551873576448e-06, + "loss": 0.0405, + "step": 1072 + }, + { + "epoch": 0.2153722978382706, + "grad_norm": 5.476550579071045, + "learning_rate": 8.870436381252412e-06, + "loss": 0.5742, + "step": 1074 + }, + { + "epoch": 0.2153722978382706, + "grad_norm": 5.92363166809082, + "learning_rate": 8.884323094990613e-06, + "loss": 0.1948, + "step": 1076 + }, + { + "epoch": 0.21617293835068055, + "grad_norm": 6.86102294921875, + "learning_rate": 8.89821198766998e-06, + "loss": 0.5966, + "step": 1078 + }, + { + "epoch": 0.21617293835068055, + "grad_norm": 0.6846101880073547, + "learning_rate": 8.912103032165206e-06, + "loss": 0.0705, + "step": 1080 + }, + { + "epoch": 0.21697357886309046, + "grad_norm": 2.0893683433532715, + "learning_rate": 8.925996201346779e-06, + "loss": 0.1944, + "step": 1082 + }, + { + "epoch": 0.21697357886309046, + "grad_norm": 0.12105356901884079, + "learning_rate": 8.939891468081036e-06, + "loss": 0.0665, + "step": 1084 + }, + { + "epoch": 0.2177742193755004, + "grad_norm": 10.61878490447998, + "learning_rate": 8.953788805230209e-06, + "loss": 0.5432, + "step": 1086 + }, + { + "epoch": 0.2177742193755004, + "grad_norm": 0.505480945110321, + "learning_rate": 8.967688185652527e-06, + "loss": 0.1518, + "step": 1088 + }, + { + "epoch": 0.21857485988791034, + "grad_norm": 2.392848491668701, + "learning_rate": 8.981589582202184e-06, + "loss": 0.0818, + "step": 1090 + }, + { + "epoch": 0.21857485988791034, + "grad_norm": 1.290427803993225, + "learning_rate": 8.995492967729449e-06, + "loss": 0.0442, + "step": 1092 + }, + { + "epoch": 0.21937550040032025, + "grad_norm": 4.185362339019775, + "learning_rate": 9.009398315080712e-06, + "loss": 0.2219, + "step": 1094 + }, + { + "epoch": 0.21937550040032025, + "grad_norm": 9.187676429748535, + "learning_rate": 9.023305597098526e-06, + "loss": 0.2305, + "step": 1096 + }, + { + "epoch": 0.2201761409127302, + "grad_norm": 9.177149772644043, + "learning_rate": 9.037214786621669e-06, + "loss": 0.2941, + "step": 1098 + }, + { + "epoch": 0.2201761409127302, + "grad_norm": 0.5715648531913757, + "learning_rate": 9.051125856485175e-06, + "loss": 0.1305, + "step": 1100 + }, + { + "epoch": 0.2209767814251401, + "grad_norm": 0.11211008578538895, + "learning_rate": 9.065038779520457e-06, + "loss": 0.161, + "step": 1102 + }, + { + "epoch": 0.2209767814251401, + "grad_norm": 0.12706512212753296, + "learning_rate": 9.078953528555258e-06, + "loss": 0.0049, + "step": 1104 + }, + { + "epoch": 0.22177742193755004, + "grad_norm": 0.7942839860916138, + "learning_rate": 9.092870076413771e-06, + "loss": 0.1224, + "step": 1106 + }, + { + "epoch": 0.22177742193755004, + "grad_norm": 3.527724027633667, + "learning_rate": 9.106788395916682e-06, + "loss": 0.0899, + "step": 1108 + }, + { + "epoch": 0.22257806244995998, + "grad_norm": 16.31683921813965, + "learning_rate": 9.120708459881203e-06, + "loss": 1.1568, + "step": 1110 + }, + { + "epoch": 0.22257806244995998, + "grad_norm": 0.2121962606906891, + "learning_rate": 9.134630241121135e-06, + "loss": 0.1518, + "step": 1112 + }, + { + "epoch": 0.2233787029623699, + "grad_norm": 10.086137771606445, + "learning_rate": 9.148553712446971e-06, + "loss": 0.6349, + "step": 1114 + }, + { + "epoch": 0.2233787029623699, + "grad_norm": 1.0592637062072754, + "learning_rate": 9.162478846665854e-06, + "loss": 0.0442, + "step": 1116 + }, + { + "epoch": 0.22417934347477983, + "grad_norm": 0.29470095038414, + "learning_rate": 9.176405616581694e-06, + "loss": 0.346, + "step": 1118 + }, + { + "epoch": 0.22417934347477983, + "grad_norm": 2.5368611812591553, + "learning_rate": 9.190333994995208e-06, + "loss": 0.1507, + "step": 1120 + }, + { + "epoch": 0.22497998398718974, + "grad_norm": 2.134645700454712, + "learning_rate": 9.20426395470397e-06, + "loss": 0.3898, + "step": 1122 + }, + { + "epoch": 0.22497998398718974, + "grad_norm": 4.081484317779541, + "learning_rate": 9.218195468502469e-06, + "loss": 0.1065, + "step": 1124 + }, + { + "epoch": 0.22578062449959968, + "grad_norm": 8.43867301940918, + "learning_rate": 9.232128509182136e-06, + "loss": 0.3258, + "step": 1126 + }, + { + "epoch": 0.22578062449959968, + "grad_norm": 1.711873173713684, + "learning_rate": 9.24606304953148e-06, + "loss": 0.2295, + "step": 1128 + }, + { + "epoch": 0.22658126501200962, + "grad_norm": 1.0991772413253784, + "learning_rate": 9.259999062336021e-06, + "loss": 0.214, + "step": 1130 + }, + { + "epoch": 0.22658126501200962, + "grad_norm": 1.1201461553573608, + "learning_rate": 9.273936520378426e-06, + "loss": 0.0325, + "step": 1132 + }, + { + "epoch": 0.22738190552441953, + "grad_norm": 9.665533065795898, + "learning_rate": 9.287875396438536e-06, + "loss": 0.7842, + "step": 1134 + }, + { + "epoch": 0.22738190552441953, + "grad_norm": 10.207245826721191, + "learning_rate": 9.301815663293426e-06, + "loss": 0.175, + "step": 1136 + }, + { + "epoch": 0.22818254603682947, + "grad_norm": 10.602134704589844, + "learning_rate": 9.315757293717432e-06, + "loss": 0.2688, + "step": 1138 + }, + { + "epoch": 0.22818254603682947, + "grad_norm": 1.964490532875061, + "learning_rate": 9.329700260482286e-06, + "loss": 0.1586, + "step": 1140 + }, + { + "epoch": 0.22898318654923938, + "grad_norm": 7.414196014404297, + "learning_rate": 9.343644536357053e-06, + "loss": 0.363, + "step": 1142 + }, + { + "epoch": 0.22898318654923938, + "grad_norm": 0.5532780289649963, + "learning_rate": 9.35759009410826e-06, + "loss": 0.0342, + "step": 1144 + }, + { + "epoch": 0.22978382706164932, + "grad_norm": 0.3870878219604492, + "learning_rate": 9.37153690649993e-06, + "loss": 0.1993, + "step": 1146 + }, + { + "epoch": 0.22978382706164932, + "grad_norm": 2.6077816486358643, + "learning_rate": 9.38548494629364e-06, + "loss": 0.0886, + "step": 1148 + }, + { + "epoch": 0.23058446757405926, + "grad_norm": 3.366546154022217, + "learning_rate": 9.39943418624856e-06, + "loss": 0.2613, + "step": 1150 + }, + { + "epoch": 0.23058446757405926, + "grad_norm": 0.46467190980911255, + "learning_rate": 9.41338459912151e-06, + "loss": 0.0182, + "step": 1152 + }, + { + "epoch": 0.23138510808646917, + "grad_norm": 7.289209365844727, + "learning_rate": 9.427336157667062e-06, + "loss": 0.8186, + "step": 1154 + }, + { + "epoch": 0.23138510808646917, + "grad_norm": 0.9739195108413696, + "learning_rate": 9.441288834637507e-06, + "loss": 0.0988, + "step": 1156 + }, + { + "epoch": 0.2321857485988791, + "grad_norm": 4.7051496505737305, + "learning_rate": 9.45524260278296e-06, + "loss": 0.5344, + "step": 1158 + }, + { + "epoch": 0.2321857485988791, + "grad_norm": 2.1622889041900635, + "learning_rate": 9.469197434851414e-06, + "loss": 0.1894, + "step": 1160 + }, + { + "epoch": 0.23298638911128902, + "grad_norm": 1.6194286346435547, + "learning_rate": 9.483153303588777e-06, + "loss": 0.1812, + "step": 1162 + }, + { + "epoch": 0.23298638911128902, + "grad_norm": 3.3453094959259033, + "learning_rate": 9.497110181738935e-06, + "loss": 0.096, + "step": 1164 + }, + { + "epoch": 0.23378702962369896, + "grad_norm": 2.0823328495025635, + "learning_rate": 9.511068042043785e-06, + "loss": 0.2526, + "step": 1166 + }, + { + "epoch": 0.23378702962369896, + "grad_norm": 0.12816272675991058, + "learning_rate": 9.52502685724336e-06, + "loss": 0.0995, + "step": 1168 + }, + { + "epoch": 0.2345876701361089, + "grad_norm": 14.618685722351074, + "learning_rate": 9.538986600075773e-06, + "loss": 0.3515, + "step": 1170 + }, + { + "epoch": 0.2345876701361089, + "grad_norm": 1.7445372343063354, + "learning_rate": 9.552947243277342e-06, + "loss": 0.0535, + "step": 1172 + }, + { + "epoch": 0.2353883106485188, + "grad_norm": 3.57304310798645, + "learning_rate": 9.566908759582633e-06, + "loss": 0.2778, + "step": 1174 + }, + { + "epoch": 0.2353883106485188, + "grad_norm": 0.3465813398361206, + "learning_rate": 9.580871121724498e-06, + "loss": 0.0847, + "step": 1176 + }, + { + "epoch": 0.23618895116092875, + "grad_norm": 0.0434630811214447, + "learning_rate": 9.594834302434123e-06, + "loss": 0.427, + "step": 1178 + }, + { + "epoch": 0.23618895116092875, + "grad_norm": 2.558215856552124, + "learning_rate": 9.608798274441153e-06, + "loss": 0.2927, + "step": 1180 + }, + { + "epoch": 0.23698959167333866, + "grad_norm": 3.400484800338745, + "learning_rate": 9.622763010473628e-06, + "loss": 0.2076, + "step": 1182 + }, + { + "epoch": 0.23698959167333866, + "grad_norm": 0.5033531785011292, + "learning_rate": 9.636728483258116e-06, + "loss": 0.0831, + "step": 1184 + }, + { + "epoch": 0.2377902321857486, + "grad_norm": 12.481202125549316, + "learning_rate": 9.650694665519747e-06, + "loss": 0.298, + "step": 1186 + }, + { + "epoch": 0.2377902321857486, + "grad_norm": 3.7285196781158447, + "learning_rate": 9.664661529982263e-06, + "loss": 0.1479, + "step": 1188 + }, + { + "epoch": 0.23859087269815854, + "grad_norm": 0.2910246253013611, + "learning_rate": 9.678629049368077e-06, + "loss": 0.1264, + "step": 1190 + }, + { + "epoch": 0.23859087269815854, + "grad_norm": 0.3428088426589966, + "learning_rate": 9.692597196398302e-06, + "loss": 0.1045, + "step": 1192 + }, + { + "epoch": 0.23939151321056845, + "grad_norm": 1.9182852506637573, + "learning_rate": 9.706565943792879e-06, + "loss": 0.1359, + "step": 1194 + }, + { + "epoch": 0.23939151321056845, + "grad_norm": 0.059658270329236984, + "learning_rate": 9.720535264270526e-06, + "loss": 0.0041, + "step": 1196 + }, + { + "epoch": 0.2401921537229784, + "grad_norm": 5.064812660217285, + "learning_rate": 9.734505130548855e-06, + "loss": 0.4141, + "step": 1198 + }, + { + "epoch": 0.2401921537229784, + "grad_norm": 1.7536243200302124, + "learning_rate": 9.748475515344416e-06, + "loss": 0.1159, + "step": 1200 + }, + { + "epoch": 0.2409927942353883, + "grad_norm": 5.998963356018066, + "learning_rate": 9.762446391372746e-06, + "loss": 0.4537, + "step": 1202 + }, + { + "epoch": 0.2409927942353883, + "grad_norm": 2.863370895385742, + "learning_rate": 9.776417731348403e-06, + "loss": 0.2778, + "step": 1204 + }, + { + "epoch": 0.24179343474779824, + "grad_norm": 4.320647239685059, + "learning_rate": 9.790389507985091e-06, + "loss": 0.3499, + "step": 1206 + }, + { + "epoch": 0.24179343474779824, + "grad_norm": 2.023000478744507, + "learning_rate": 9.80436169399561e-06, + "loss": 0.0937, + "step": 1208 + }, + { + "epoch": 0.24259407526020815, + "grad_norm": 3.0558021068573, + "learning_rate": 9.81833426209198e-06, + "loss": 0.2607, + "step": 1210 + }, + { + "epoch": 0.24259407526020815, + "grad_norm": 0.22003920376300812, + "learning_rate": 9.832307184985473e-06, + "loss": 0.1016, + "step": 1212 + }, + { + "epoch": 0.2433947157726181, + "grad_norm": 0.8381786346435547, + "learning_rate": 9.846280435386668e-06, + "loss": 0.1283, + "step": 1214 + }, + { + "epoch": 0.2433947157726181, + "grad_norm": 10.45914077758789, + "learning_rate": 9.8602539860055e-06, + "loss": 0.13, + "step": 1216 + }, + { + "epoch": 0.24419535628502803, + "grad_norm": 4.897336959838867, + "learning_rate": 9.874227809551307e-06, + "loss": 0.0989, + "step": 1218 + }, + { + "epoch": 0.24419535628502803, + "grad_norm": 1.4955857992172241, + "learning_rate": 9.888201878732946e-06, + "loss": 0.0461, + "step": 1220 + }, + { + "epoch": 0.24499599679743794, + "grad_norm": 10.073846817016602, + "learning_rate": 9.902176166258738e-06, + "loss": 0.6944, + "step": 1222 + }, + { + "epoch": 0.24499599679743794, + "grad_norm": 0.07852977514266968, + "learning_rate": 9.916150644836596e-06, + "loss": 0.1462, + "step": 1224 + }, + { + "epoch": 0.24579663730984788, + "grad_norm": 9.66071605682373, + "learning_rate": 9.930125287174061e-06, + "loss": 0.7779, + "step": 1226 + }, + { + "epoch": 0.24579663730984788, + "grad_norm": 0.19854483008384705, + "learning_rate": 9.944100065978354e-06, + "loss": 0.1965, + "step": 1228 + }, + { + "epoch": 0.2465972778222578, + "grad_norm": 11.14420223236084, + "learning_rate": 9.958074953956413e-06, + "loss": 0.409, + "step": 1230 + }, + { + "epoch": 0.2465972778222578, + "grad_norm": 1.1786837577819824, + "learning_rate": 9.972049923815011e-06, + "loss": 0.0884, + "step": 1232 + }, + { + "epoch": 0.24739791833466773, + "grad_norm": 0.15301698446273804, + "learning_rate": 9.986024948260714e-06, + "loss": 0.163, + "step": 1234 + }, + { + "epoch": 0.24739791833466773, + "grad_norm": 1.4263633489608765, + "learning_rate": 9.999999999999996e-06, + "loss": 0.1613, + "step": 1236 + }, + { + "epoch": 0.24819855884707767, + "grad_norm": 2.948676824569702, + "learning_rate": 1.0013975051739277e-05, + "loss": 0.1797, + "step": 1238 + }, + { + "epoch": 0.24819855884707767, + "grad_norm": 0.24510517716407776, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.0151, + "step": 1240 + }, + { + "epoch": 0.24899919935948758, + "grad_norm": 4.115630626678467, + "learning_rate": 1.004192504604358e-05, + "loss": 0.1662, + "step": 1242 + }, + { + "epoch": 0.24899919935948758, + "grad_norm": 0.48996153473854065, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.0226, + "step": 1244 + }, + { + "epoch": 0.24979983987189752, + "grad_norm": 11.069146156311035, + "learning_rate": 1.006987471282593e-05, + "loss": 0.3487, + "step": 1246 + }, + { + "epoch": 0.24979983987189752, + "grad_norm": 1.264935851097107, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.0357, + "step": 1248 + }, + { + "epoch": 0.25060048038430743, + "grad_norm": 6.976475238800049, + "learning_rate": 1.0097823833741255e-05, + "loss": 0.5996, + "step": 1250 + }, + { + "epoch": 0.25060048038430743, + "grad_norm": 0.8853281736373901, + "learning_rate": 1.0111798121267047e-05, + "loss": 0.0855, + "step": 1252 + }, + { + "epoch": 0.2514011208967174, + "grad_norm": 7.458795547485352, + "learning_rate": 1.0125772190448686e-05, + "loss": 0.5907, + "step": 1254 + }, + { + "epoch": 0.2514011208967174, + "grad_norm": 0.061160456389188766, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.0067, + "step": 1256 + }, + { + "epoch": 0.2522017614091273, + "grad_norm": 1.25249183177948, + "learning_rate": 1.0153719564613327e-05, + "loss": 0.1768, + "step": 1258 + }, + { + "epoch": 0.2522017614091273, + "grad_norm": 0.27487096190452576, + "learning_rate": 1.016769281501452e-05, + "loss": 0.0406, + "step": 1260 + }, + { + "epoch": 0.2530024019215372, + "grad_norm": 3.1278483867645264, + "learning_rate": 1.018166573790801e-05, + "loss": 0.372, + "step": 1262 + }, + { + "epoch": 0.2530024019215372, + "grad_norm": 0.6599329113960266, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.0707, + "step": 1264 + }, + { + "epoch": 0.25380304243394713, + "grad_norm": 5.169724464416504, + "learning_rate": 1.0209610492014904e-05, + "loss": 0.4707, + "step": 1266 + }, + { + "epoch": 0.25380304243394713, + "grad_norm": 8.746573448181152, + "learning_rate": 1.022358226865159e-05, + "loss": 0.5131, + "step": 1268 + }, + { + "epoch": 0.2546036829463571, + "grad_norm": 11.251683235168457, + "learning_rate": 1.0237553608627247e-05, + "loss": 0.6075, + "step": 1270 + }, + { + "epoch": 0.2546036829463571, + "grad_norm": 5.261743545532227, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.3654, + "step": 1272 + }, + { + "epoch": 0.255404323458767, + "grad_norm": 4.347501277923584, + "learning_rate": 1.0265494869451138e-05, + "loss": 0.2268, + "step": 1274 + }, + { + "epoch": 0.255404323458767, + "grad_norm": 1.454378604888916, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.0318, + "step": 1276 + }, + { + "epoch": 0.2562049639711769, + "grad_norm": 1.0934926271438599, + "learning_rate": 1.0293434056207114e-05, + "loss": 0.0443, + "step": 1278 + }, + { + "epoch": 0.2562049639711769, + "grad_norm": 0.43896493315696716, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.0644, + "step": 1280 + }, + { + "epoch": 0.2570056044835869, + "grad_norm": 5.103718280792236, + "learning_rate": 1.0321370950631918e-05, + "loss": 0.3208, + "step": 1282 + }, + { + "epoch": 0.2570056044835869, + "grad_norm": 4.0250935554504395, + "learning_rate": 1.033533847001773e-05, + "loss": 0.2683, + "step": 1284 + }, + { + "epoch": 0.2578062449959968, + "grad_norm": 15.71651840209961, + "learning_rate": 1.0349305334480246e-05, + "loss": 0.8891, + "step": 1286 + }, + { + "epoch": 0.2578062449959968, + "grad_norm": 1.3387471437454224, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.1287, + "step": 1288 + }, + { + "epoch": 0.2586068855084067, + "grad_norm": 6.212350845336914, + "learning_rate": 1.0377236989526366e-05, + "loss": 0.2639, + "step": 1290 + }, + { + "epoch": 0.2586068855084067, + "grad_norm": 2.084254503250122, + "learning_rate": 1.039120172555884e-05, + "loss": 0.1341, + "step": 1292 + }, + { + "epoch": 0.2594075260208166, + "grad_norm": 8.175564765930176, + "learning_rate": 1.0405165697565868e-05, + "loss": 0.2107, + "step": 1294 + }, + { + "epoch": 0.2594075260208166, + "grad_norm": 13.565658569335938, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.2053, + "step": 1296 + }, + { + "epoch": 0.2602081665332266, + "grad_norm": 2.6404342651367188, + "learning_rate": 1.0433091240417362e-05, + "loss": 0.115, + "step": 1298 + }, + { + "epoch": 0.2602081665332266, + "grad_norm": 4.348731994628906, + "learning_rate": 1.0447052756722651e-05, + "loss": 0.2392, + "step": 1300 + }, + { + "epoch": 0.2610088070456365, + "grad_norm": 3.9827799797058105, + "learning_rate": 1.046101339992422e-05, + "loss": 0.0893, + "step": 1302 + }, + { + "epoch": 0.2610088070456365, + "grad_norm": 1.1935129165649414, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.0341, + "step": 1304 + }, + { + "epoch": 0.2618094475580464, + "grad_norm": 7.56549596786499, + "learning_rate": 1.0488931957956208e-05, + "loss": 0.6954, + "step": 1306 + }, + { + "epoch": 0.2618094475580464, + "grad_norm": 1.6071922779083252, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.0575, + "step": 1308 + }, + { + "epoch": 0.2626100880704564, + "grad_norm": 6.4388427734375, + "learning_rate": 1.0516846696411216e-05, + "loss": 0.4832, + "step": 1310 + }, + { + "epoch": 0.2626100880704564, + "grad_norm": 4.401688098907471, + "learning_rate": 1.053080256514858e-05, + "loss": 0.0887, + "step": 1312 + }, + { + "epoch": 0.2634107285828663, + "grad_norm": 5.868397235870361, + "learning_rate": 1.054475739721703e-05, + "loss": 0.5277, + "step": 1314 + }, + { + "epoch": 0.2634107285828663, + "grad_norm": 0.41200825572013855, + "learning_rate": 1.0558711165362488e-05, + "loss": 0.0661, + "step": 1316 + }, + { + "epoch": 0.2642113690952762, + "grad_norm": 2.306913375854492, + "learning_rate": 1.0572663842332931e-05, + "loss": 0.0742, + "step": 1318 + }, + { + "epoch": 0.2642113690952762, + "grad_norm": 1.91344153881073, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.1581, + "step": 1320 + }, + { + "epoch": 0.26501200960768617, + "grad_norm": 12.408026695251465, + "learning_rate": 1.0600565813751433e-05, + "loss": 0.5655, + "step": 1322 + }, + { + "epoch": 0.26501200960768617, + "grad_norm": 3.2770888805389404, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.1089, + "step": 1324 + }, + { + "epoch": 0.2658126501200961, + "grad_norm": 3.828705072402954, + "learning_rate": 1.0628463093500063e-05, + "loss": 0.3695, + "step": 1326 + }, + { + "epoch": 0.2658126501200961, + "grad_norm": 2.186295986175537, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.2688, + "step": 1328 + }, + { + "epoch": 0.266613290632506, + "grad_norm": 11.682731628417969, + "learning_rate": 1.065635546364294e-05, + "loss": 0.2247, + "step": 1330 + }, + { + "epoch": 0.266613290632506, + "grad_norm": 1.7910422086715698, + "learning_rate": 1.0670299739517706e-05, + "loss": 0.1822, + "step": 1332 + }, + { + "epoch": 0.2674139311449159, + "grad_norm": 2.368471384048462, + "learning_rate": 1.0684242706282562e-05, + "loss": 0.0887, + "step": 1334 + }, + { + "epoch": 0.2674139311449159, + "grad_norm": 6.8485565185546875, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.1845, + "step": 1336 + }, + { + "epoch": 0.2682145716573259, + "grad_norm": 1.7841463088989258, + "learning_rate": 1.0712124603561457e-05, + "loss": 0.3093, + "step": 1338 + }, + { + "epoch": 0.2682145716573259, + "grad_norm": 3.9047093391418457, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.3537, + "step": 1340 + }, + { + "epoch": 0.2690152121697358, + "grad_norm": 5.913300514221191, + "learning_rate": 1.0740000937663972e-05, + "loss": 0.3656, + "step": 1342 + }, + { + "epoch": 0.2690152121697358, + "grad_norm": 4.769776821136475, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.0737, + "step": 1344 + }, + { + "epoch": 0.2698158526821457, + "grad_norm": 5.208962917327881, + "learning_rate": 1.0767871490817856e-05, + "loss": 0.2701, + "step": 1346 + }, + { + "epoch": 0.2698158526821457, + "grad_norm": 1.9023667573928833, + "learning_rate": 1.0781804531497525e-05, + "loss": 0.0739, + "step": 1348 + }, + { + "epoch": 0.27061649319455566, + "grad_norm": 7.180617332458496, + "learning_rate": 1.0795736045296023e-05, + "loss": 0.2853, + "step": 1350 + }, + { + "epoch": 0.27061649319455566, + "grad_norm": 3.9352126121520996, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.047, + "step": 1352 + }, + { + "epoch": 0.2714171337069656, + "grad_norm": 9.09389591217041, + "learning_rate": 1.08235943834183e-05, + "loss": 1.0918, + "step": 1354 + }, + { + "epoch": 0.2714171337069656, + "grad_norm": 1.996163249015808, + "learning_rate": 1.083752115333414e-05, + "loss": 0.2219, + "step": 1356 + }, + { + "epoch": 0.2722177742193755, + "grad_norm": 4.9678874015808105, + "learning_rate": 1.0851446287553022e-05, + "loss": 0.4298, + "step": 1358 + }, + { + "epoch": 0.2722177742193755, + "grad_norm": 2.471865177154541, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.0572, + "step": 1360 + }, + { + "epoch": 0.27301841473178545, + "grad_norm": 0.099858358502388, + "learning_rate": 1.087929154011879e-05, + "loss": 0.2659, + "step": 1362 + }, + { + "epoch": 0.27301841473178545, + "grad_norm": 0.18502700328826904, + "learning_rate": 1.0893211604083311e-05, + "loss": 0.0145, + "step": 1364 + }, + { + "epoch": 0.27381905524419536, + "grad_norm": 2.0469491481781006, + "learning_rate": 1.090712992358622e-05, + "loss": 0.6105, + "step": 1366 + }, + { + "epoch": 0.27381905524419536, + "grad_norm": 4.166383743286133, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.1498, + "step": 1368 + }, + { + "epoch": 0.2746196957566053, + "grad_norm": 6.3169708251953125, + "learning_rate": 1.0934961220479537e-05, + "loss": 0.7767, + "step": 1370 + }, + { + "epoch": 0.2746196957566053, + "grad_norm": 5.1938371658325195, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.381, + "step": 1372 + }, + { + "epoch": 0.2754203362690152, + "grad_norm": 2.8779289722442627, + "learning_rate": 1.0962785213378325e-05, + "loss": 0.4444, + "step": 1374 + }, + { + "epoch": 0.2754203362690152, + "grad_norm": 0.8294830322265625, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.0542, + "step": 1376 + }, + { + "epoch": 0.27622097678142515, + "grad_norm": 3.150517225265503, + "learning_rate": 1.0990601684919282e-05, + "loss": 0.3955, + "step": 1378 + }, + { + "epoch": 0.27622097678142515, + "grad_norm": 1.9811937808990479, + "learning_rate": 1.1004507032270544e-05, + "loss": 0.2195, + "step": 1380 + }, + { + "epoch": 0.27702161729383507, + "grad_norm": 2.6061928272247314, + "learning_rate": 1.1018410417797809e-05, + "loss": 0.2098, + "step": 1382 + }, + { + "epoch": 0.27702161729383507, + "grad_norm": 0.23917898535728455, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.0873, + "step": 1384 + }, + { + "epoch": 0.277822257806245, + "grad_norm": 3.9853761196136475, + "learning_rate": 1.1046211194769784e-05, + "loss": 0.5113, + "step": 1386 + }, + { + "epoch": 0.277822257806245, + "grad_norm": 0.9220866560935974, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.126, + "step": 1388 + }, + { + "epoch": 0.27862289831865494, + "grad_norm": 1.6102864742279053, + "learning_rate": 1.1074003798653215e-05, + "loss": 0.1584, + "step": 1390 + }, + { + "epoch": 0.27862289831865494, + "grad_norm": 5.952439785003662, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.1666, + "step": 1392 + }, + { + "epoch": 0.27942353883106485, + "grad_norm": 1.1672078371047974, + "learning_rate": 1.1101788012330013e-05, + "loss": 0.0928, + "step": 1394 + }, + { + "epoch": 0.27942353883106485, + "grad_norm": 1.0884426832199097, + "learning_rate": 1.111567690500938e-05, + "loss": 0.0372, + "step": 1396 + }, + { + "epoch": 0.28022417934347477, + "grad_norm": 1.9977093935012817, + "learning_rate": 1.1129563618747581e-05, + "loss": 0.2666, + "step": 1398 + }, + { + "epoch": 0.28022417934347477, + "grad_norm": 1.2261021137237549, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.0645, + "step": 1400 + }, + { + "epoch": 0.28102481985588473, + "grad_norm": 1.6730952262878418, + "learning_rate": 1.1157330400920563e-05, + "loss": 0.0908, + "step": 1402 + }, + { + "epoch": 0.28102481985588473, + "grad_norm": 2.7741458415985107, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.0681, + "step": 1404 + }, + { + "epoch": 0.28182546036829464, + "grad_norm": 3.170379161834717, + "learning_rate": 1.1185088141932594e-05, + "loss": 0.4166, + "step": 1406 + }, + { + "epoch": 0.28182546036829464, + "grad_norm": 0.5818377733230591, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.016, + "step": 1408 + }, + { + "epoch": 0.28262610088070456, + "grad_norm": 3.0156638622283936, + "learning_rate": 1.121283662493788e-05, + "loss": 0.4653, + "step": 1410 + }, + { + "epoch": 0.28262610088070456, + "grad_norm": 2.903531551361084, + "learning_rate": 1.122670732694342e-05, + "loss": 0.1899, + "step": 1412 + }, + { + "epoch": 0.28342674139311447, + "grad_norm": 0.16344138979911804, + "learning_rate": 1.1240575633162958e-05, + "loss": 1.1636, + "step": 1414 + }, + { + "epoch": 0.28342674139311447, + "grad_norm": 5.756230354309082, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.2231, + "step": 1416 + }, + { + "epoch": 0.28422738190552443, + "grad_norm": 3.4955687522888184, + "learning_rate": 1.1268304949908434e-05, + "loss": 0.3955, + "step": 1418 + }, + { + "epoch": 0.28422738190552443, + "grad_norm": 1.4507321119308472, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.143, + "step": 1420 + }, + { + "epoch": 0.28502802241793435, + "grad_norm": 7.562114715576172, + "learning_rate": 1.1296024358550565e-05, + "loss": 0.8236, + "step": 1422 + }, + { + "epoch": 0.28502802241793435, + "grad_norm": 4.746067523956299, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.4503, + "step": 1424 + }, + { + "epoch": 0.28582866293034426, + "grad_norm": 1.788559079170227, + "learning_rate": 1.1323733642543024e-05, + "loss": 0.1427, + "step": 1426 + }, + { + "epoch": 0.28582866293034426, + "grad_norm": 4.756726264953613, + "learning_rate": 1.1337584420146496e-05, + "loss": 0.3218, + "step": 1428 + }, + { + "epoch": 0.2866293034427542, + "grad_norm": 3.14638352394104, + "learning_rate": 1.135143258541862e-05, + "loss": 0.3888, + "step": 1430 + }, + { + "epoch": 0.2866293034427542, + "grad_norm": 3.249009609222412, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.2692, + "step": 1432 + }, + { + "epoch": 0.28742994395516414, + "grad_norm": 12.097743034362793, + "learning_rate": 1.13791209707909e-05, + "loss": 0.4226, + "step": 1434 + }, + { + "epoch": 0.28742994395516414, + "grad_norm": 0.6259809732437134, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.1032, + "step": 1436 + }, + { + "epoch": 0.28823058446757405, + "grad_norm": 1.6407843828201294, + "learning_rate": 1.1406798582355902e-05, + "loss": 0.1886, + "step": 1438 + }, + { + "epoch": 0.28823058446757405, + "grad_norm": 1.9160236120224, + "learning_rate": 1.142063328038864e-05, + "loss": 0.1072, + "step": 1440 + }, + { + "epoch": 0.289031224979984, + "grad_norm": 4.209094524383545, + "learning_rate": 1.1434465203893818e-05, + "loss": 0.8518, + "step": 1442 + }, + { + "epoch": 0.289031224979984, + "grad_norm": 21.491573333740234, + "learning_rate": 1.1448294325857377e-05, + "loss": 0.2704, + "step": 1444 + }, + { + "epoch": 0.2898318654923939, + "grad_norm": 2.229045867919922, + "learning_rate": 1.146212061927074e-05, + "loss": 0.1501, + "step": 1446 + }, + { + "epoch": 0.2898318654923939, + "grad_norm": 2.8173532485961914, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.1882, + "step": 1448 + }, + { + "epoch": 0.29063250600480384, + "grad_norm": 0.9258885383605957, + "learning_rate": 1.1489764612440255e-05, + "loss": 0.3805, + "step": 1450 + }, + { + "epoch": 0.29063250600480384, + "grad_norm": 2.1413919925689697, + "learning_rate": 1.150358225820707e-05, + "loss": 0.2775, + "step": 1452 + }, + { + "epoch": 0.29143314651721375, + "grad_norm": 3.008946418762207, + "learning_rate": 1.151739696744518e-05, + "loss": 0.2246, + "step": 1454 + }, + { + "epoch": 0.29143314651721375, + "grad_norm": 3.55814790725708, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.1444, + "step": 1456 + }, + { + "epoch": 0.2922337870296237, + "grad_norm": 1.7388416528701782, + "learning_rate": 1.1545017468419307e-05, + "loss": 0.2346, + "step": 1458 + }, + { + "epoch": 0.2922337870296237, + "grad_norm": 0.8970092535018921, + "learning_rate": 1.1558823206211887e-05, + "loss": 0.0279, + "step": 1460 + }, + { + "epoch": 0.2930344275420336, + "grad_norm": 3.4738340377807617, + "learning_rate": 1.1572625899588972e-05, + "loss": 0.5423, + "step": 1462 + }, + { + "epoch": 0.2930344275420336, + "grad_norm": 0.1386861354112625, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.031, + "step": 1464 + }, + { + "epoch": 0.29383506805444354, + "grad_norm": 1.7958307266235352, + "learning_rate": 1.1600222045274809e-05, + "loss": 0.3701, + "step": 1466 + }, + { + "epoch": 0.29383506805444354, + "grad_norm": 1.520005702972412, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.1542, + "step": 1468 + }, + { + "epoch": 0.2946357085668535, + "grad_norm": 1.8117802143096924, + "learning_rate": 1.1627805689893478e-05, + "loss": 0.3152, + "step": 1470 + }, + { + "epoch": 0.2946357085668535, + "grad_norm": 4.086404323577881, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.2562, + "step": 1472 + }, + { + "epoch": 0.2954363490792634, + "grad_norm": 5.325958251953125, + "learning_rate": 1.1655376617959239e-05, + "loss": 0.6251, + "step": 1474 + }, + { + "epoch": 0.2954363490792634, + "grad_norm": 3.3030588626861572, + "learning_rate": 1.1669157245972616e-05, + "loss": 0.381, + "step": 1476 + }, + { + "epoch": 0.2962369895916733, + "grad_norm": 2.3996694087982178, + "learning_rate": 1.1682934614085708e-05, + "loss": 0.3419, + "step": 1478 + }, + { + "epoch": 0.2962369895916733, + "grad_norm": 1.6195110082626343, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.1148, + "step": 1480 + }, + { + "epoch": 0.29703763010408324, + "grad_norm": 1.3251314163208008, + "learning_rate": 1.1710479462987565e-05, + "loss": 0.2776, + "step": 1482 + }, + { + "epoch": 0.29703763010408324, + "grad_norm": 1.019709587097168, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.1191, + "step": 1484 + }, + { + "epoch": 0.2978382706164932, + "grad_norm": 25.56956672668457, + "learning_rate": 1.1738010949482152e-05, + "loss": 0.8036, + "step": 1486 + }, + { + "epoch": 0.2978382706164932, + "grad_norm": 3.8367459774017334, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.2416, + "step": 1488 + }, + { + "epoch": 0.2986389111289031, + "grad_norm": 7.264700412750244, + "learning_rate": 1.176552885849122e-05, + "loss": 0.5181, + "step": 1490 + }, + { + "epoch": 0.2986389111289031, + "grad_norm": 2.8028981685638428, + "learning_rate": 1.1779282654255668e-05, + "loss": 0.2613, + "step": 1492 + }, + { + "epoch": 0.29943955164131303, + "grad_norm": 3.7328431606292725, + "learning_rate": 1.1793032975042563e-05, + "loss": 0.6009, + "step": 1494 + }, + { + "epoch": 0.29943955164131303, + "grad_norm": 2.6228582859039307, + "learning_rate": 1.180677979399721e-05, + "loss": 0.256, + "step": 1496 + }, + { + "epoch": 0.300240192153723, + "grad_norm": 2.234867811203003, + "learning_rate": 1.1820523084271775e-05, + "loss": 0.2061, + "step": 1498 + }, + { + "epoch": 0.300240192153723, + "grad_norm": 3.857367753982544, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.2094, + "step": 1500 + }, + { + "epoch": 0.3010408326661329, + "grad_norm": 6.610302925109863, + "learning_rate": 1.1847998971423835e-05, + "loss": 0.742, + "step": 1502 + }, + { + "epoch": 0.3010408326661329, + "grad_norm": 3.172524929046631, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.2823, + "step": 1504 + }, + { + "epoch": 0.3018414731785428, + "grad_norm": 1.522755742073059, + "learning_rate": 1.1875460421854816e-05, + "loss": 0.4273, + "step": 1506 + }, + { + "epoch": 0.3018414731785428, + "grad_norm": 3.1445794105529785, + "learning_rate": 1.188918566625449e-05, + "loss": 0.2712, + "step": 1508 + }, + { + "epoch": 0.3026421136909528, + "grad_norm": 6.25697135925293, + "learning_rate": 1.1902907221033629e-05, + "loss": 0.3829, + "step": 1510 + }, + { + "epoch": 0.3026421136909528, + "grad_norm": 3.6653525829315186, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.1849, + "step": 1512 + }, + { + "epoch": 0.3034427542033627, + "grad_norm": 1.423952341079712, + "learning_rate": 1.1930339154543582e-05, + "loss": 0.2468, + "step": 1514 + }, + { + "epoch": 0.3034427542033627, + "grad_norm": 3.3148374557495117, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.3187, + "step": 1516 + }, + { + "epoch": 0.3042433947157726, + "grad_norm": 4.138520240783691, + "learning_rate": 1.1957756008084127e-05, + "loss": 0.1753, + "step": 1518 + }, + { + "epoch": 0.3042433947157726, + "grad_norm": 0.7803539633750916, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.1258, + "step": 1520 + }, + { + "epoch": 0.3050440352281825, + "grad_norm": 1.4368648529052734, + "learning_rate": 1.1985157567472563e-05, + "loss": 0.2325, + "step": 1522 + }, + { + "epoch": 0.3050440352281825, + "grad_norm": 1.1482466459274292, + "learning_rate": 1.1998852544960256e-05, + "loss": 0.0684, + "step": 1524 + }, + { + "epoch": 0.3058446757405925, + "grad_norm": 5.689779281616211, + "learning_rate": 1.2012543618645622e-05, + "loss": 1.0397, + "step": 1526 + }, + { + "epoch": 0.3058446757405925, + "grad_norm": 4.361678600311279, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.4284, + "step": 1528 + }, + { + "epoch": 0.3066453162530024, + "grad_norm": 3.0908913612365723, + "learning_rate": 1.2039913947661205e-05, + "loss": 0.2284, + "step": 1530 + }, + { + "epoch": 0.3066453162530024, + "grad_norm": 1.6493077278137207, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.2373, + "step": 1532 + }, + { + "epoch": 0.3074459567654123, + "grad_norm": 1.6853619813919067, + "learning_rate": 1.2067268340700016e-05, + "loss": 0.3855, + "step": 1534 + }, + { + "epoch": 0.3074459567654123, + "grad_norm": 1.6147798299789429, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.1678, + "step": 1536 + }, + { + "epoch": 0.3082465972778223, + "grad_norm": 4.845739841461182, + "learning_rate": 1.2094606584067304e-05, + "loss": 0.38, + "step": 1538 + }, + { + "epoch": 0.3082465972778223, + "grad_norm": 4.380086898803711, + "learning_rate": 1.210826958287895e-05, + "loss": 0.2506, + "step": 1540 + }, + { + "epoch": 0.3090472377902322, + "grad_norm": 3.657365322113037, + "learning_rate": 1.212192846419443e-05, + "loss": 0.4586, + "step": 1542 + }, + { + "epoch": 0.3090472377902322, + "grad_norm": 1.8320730924606323, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.2537, + "step": 1544 + }, + { + "epoch": 0.3098478783026421, + "grad_norm": 2.191260576248169, + "learning_rate": 1.2149233767640587e-05, + "loss": 0.2064, + "step": 1546 + }, + { + "epoch": 0.3098478783026421, + "grad_norm": 2.480032444000244, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.063, + "step": 1548 + }, + { + "epoch": 0.31064851881505207, + "grad_norm": 2.8678643703460693, + "learning_rate": 1.2176522281094514e-05, + "loss": 0.219, + "step": 1550 + }, + { + "epoch": 0.31064851881505207, + "grad_norm": 4.9773101806640625, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.1552, + "step": 1552 + }, + { + "epoch": 0.311449159327462, + "grad_norm": 6.838129997253418, + "learning_rate": 1.220379379137607e-05, + "loss": 1.0265, + "step": 1554 + }, + { + "epoch": 0.311449159327462, + "grad_norm": 3.47139048576355, + "learning_rate": 1.2217423103744692e-05, + "loss": 0.3573, + "step": 1556 + }, + { + "epoch": 0.3122497998398719, + "grad_norm": 2.8829493522644043, + "learning_rate": 1.2231048085437953e-05, + "loss": 0.2067, + "step": 1558 + }, + { + "epoch": 0.3122497998398719, + "grad_norm": 1.5416744947433472, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.2534, + "step": 1560 + }, + { + "epoch": 0.3130504403522818, + "grad_norm": 3.1662230491638184, + "learning_rate": 1.2258284950367347e-05, + "loss": 0.5289, + "step": 1562 + }, + { + "epoch": 0.3130504403522818, + "grad_norm": 7.046089172363281, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.2892, + "step": 1564 + }, + { + "epoch": 0.31385108086469177, + "grad_norm": 4.849677085876465, + "learning_rate": 1.228550417338764e-05, + "loss": 0.5858, + "step": 1566 + }, + { + "epoch": 0.31385108086469177, + "grad_norm": 2.2673544883728027, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.2162, + "step": 1568 + }, + { + "epoch": 0.3146517213771017, + "grad_norm": 0.23894120752811432, + "learning_rate": 1.2312705541859985e-05, + "loss": 0.2134, + "step": 1570 + }, + { + "epoch": 0.3146517213771017, + "grad_norm": 5.36576509475708, + "learning_rate": 1.2326299464229143e-05, + "loss": 0.1952, + "step": 1572 + }, + { + "epoch": 0.3154523618895116, + "grad_norm": 1.4862465858459473, + "learning_rate": 1.2339888843285029e-05, + "loss": 0.6665, + "step": 1574 + }, + { + "epoch": 0.3154523618895116, + "grad_norm": 3.9165492057800293, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.3657, + "step": 1576 + }, + { + "epoch": 0.31625300240192156, + "grad_norm": 3.018007516860962, + "learning_rate": 1.2367053865304597e-05, + "loss": 0.4114, + "step": 1578 + }, + { + "epoch": 0.31625300240192156, + "grad_norm": 7.270834445953369, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.3657, + "step": 1580 + }, + { + "epoch": 0.31705364291433147, + "grad_norm": 3.3254470825195312, + "learning_rate": 1.2394200395703273e-05, + "loss": 0.2961, + "step": 1582 + }, + { + "epoch": 0.31705364291433147, + "grad_norm": 3.4595348834991455, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.2775, + "step": 1584 + }, + { + "epoch": 0.3178542834267414, + "grad_norm": 11.118377685546875, + "learning_rate": 1.2421328222410109e-05, + "loss": 0.4151, + "step": 1586 + }, + { + "epoch": 0.3178542834267414, + "grad_norm": 2.36765718460083, + "learning_rate": 1.2434885055646808e-05, + "loss": 0.2766, + "step": 1588 + }, + { + "epoch": 0.31865492393915135, + "grad_norm": 10.415533065795898, + "learning_rate": 1.2448437133500262e-05, + "loss": 0.5508, + "step": 1590 + }, + { + "epoch": 0.31865492393915135, + "grad_norm": 2.7754740715026855, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.2236, + "step": 1592 + }, + { + "epoch": 0.31945556445156126, + "grad_norm": 2.4158527851104736, + "learning_rate": 1.2475526917196703e-05, + "loss": 0.217, + "step": 1594 + }, + { + "epoch": 0.31945556445156126, + "grad_norm": 2.3118739128112793, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.2269, + "step": 1596 + }, + { + "epoch": 0.32025620496397117, + "grad_norm": 3.554960250854492, + "learning_rate": 1.2502597361871787e-05, + "loss": 0.1872, + "step": 1598 + }, + { + "epoch": 0.32025620496397117, + "grad_norm": 1.3752363920211792, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.202, + "step": 1600 + }, + { + "epoch": 0.3210568454763811, + "grad_norm": 2.0233864784240723, + "learning_rate": 1.2529648256048931e-05, + "loss": 0.3636, + "step": 1602 + }, + { + "epoch": 0.3210568454763811, + "grad_norm": 2.3633856773376465, + "learning_rate": 1.2543166305656089e-05, + "loss": 0.205, + "step": 1604 + }, + { + "epoch": 0.32185748598879105, + "grad_norm": 2.483638048171997, + "learning_rate": 1.2556679388404351e-05, + "loss": 0.1576, + "step": 1606 + }, + { + "epoch": 0.32185748598879105, + "grad_norm": 1.838841199874878, + "learning_rate": 1.257018747790238e-05, + "loss": 0.1496, + "step": 1608 + }, + { + "epoch": 0.32265812650120096, + "grad_norm": 3.385293483734131, + "learning_rate": 1.2583690547768584e-05, + "loss": 0.339, + "step": 1610 + }, + { + "epoch": 0.32265812650120096, + "grad_norm": 1.0684871673583984, + "learning_rate": 1.259718857163117e-05, + "loss": 0.1672, + "step": 1612 + }, + { + "epoch": 0.32345876701361087, + "grad_norm": 4.998286247253418, + "learning_rate": 1.261068152312821e-05, + "loss": 0.3185, + "step": 1614 + }, + { + "epoch": 0.32345876701361087, + "grad_norm": 0.60429447889328, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.1041, + "step": 1616 + }, + { + "epoch": 0.32425940752602084, + "grad_norm": 3.3847968578338623, + "learning_rate": 1.2637652103627481e-05, + "loss": 0.376, + "step": 1618 + }, + { + "epoch": 0.32425940752602084, + "grad_norm": 5.785454273223877, + "learning_rate": 1.2651129679955598e-05, + "loss": 0.2388, + "step": 1620 + }, + { + "epoch": 0.32506004803843075, + "grad_norm": 6.866513252258301, + "learning_rate": 1.2664602078570017e-05, + "loss": 0.5959, + "step": 1622 + }, + { + "epoch": 0.32506004803843075, + "grad_norm": 3.054244041442871, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.3486, + "step": 1624 + }, + { + "epoch": 0.32586068855084066, + "grad_norm": 3.9487979412078857, + "learning_rate": 1.2691531237420369e-05, + "loss": 0.3053, + "step": 1626 + }, + { + "epoch": 0.32586068855084066, + "grad_norm": 2.9807450771331787, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.2632, + "step": 1628 + }, + { + "epoch": 0.3266613290632506, + "grad_norm": 0.5733102560043335, + "learning_rate": 1.27184393698057e-05, + "loss": 0.1909, + "step": 1630 + }, + { + "epoch": 0.3266613290632506, + "grad_norm": 2.753782272338867, + "learning_rate": 1.273188548537736e-05, + "loss": 0.2145, + "step": 1632 + }, + { + "epoch": 0.32746196957566054, + "grad_norm": 2.272684335708618, + "learning_rate": 1.2745326265517481e-05, + "loss": 0.3716, + "step": 1634 + }, + { + "epoch": 0.32746196957566054, + "grad_norm": 14.032787322998047, + "learning_rate": 1.2758761683975929e-05, + "loss": 0.451, + "step": 1636 + }, + { + "epoch": 0.32826261008807045, + "grad_norm": 11.034360885620117, + "learning_rate": 1.277219171451304e-05, + "loss": 0.2824, + "step": 1638 + }, + { + "epoch": 0.32826261008807045, + "grad_norm": 5.266378402709961, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.1759, + "step": 1640 + }, + { + "epoch": 0.32906325060048036, + "grad_norm": 5.297228813171387, + "learning_rate": 1.2799035506917265e-05, + "loss": 0.4151, + "step": 1642 + }, + { + "epoch": 0.32906325060048036, + "grad_norm": 2.733729839324951, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.0767, + "step": 1644 + }, + { + "epoch": 0.32986389111289033, + "grad_norm": 2.2371137142181396, + "learning_rate": 1.2825857433024208e-05, + "loss": 0.6494, + "step": 1646 + }, + { + "epoch": 0.32986389111289033, + "grad_norm": 2.9887843132019043, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.2264, + "step": 1648 + }, + { + "epoch": 0.33066453162530024, + "grad_norm": 5.566283226013184, + "learning_rate": 1.2852657283298794e-05, + "loss": 0.139, + "step": 1650 + }, + { + "epoch": 0.33066453162530024, + "grad_norm": 5.828705310821533, + "learning_rate": 1.2866048864566336e-05, + "loss": 0.2267, + "step": 1652 + }, + { + "epoch": 0.33146517213771015, + "grad_norm": 3.9273178577423096, + "learning_rate": 1.2879434848378356e-05, + "loss": 0.3569, + "step": 1654 + }, + { + "epoch": 0.33146517213771015, + "grad_norm": 1.1598875522613525, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.0368, + "step": 1656 + }, + { + "epoch": 0.3322658126501201, + "grad_norm": 12.594849586486816, + "learning_rate": 1.2906189919074336e-05, + "loss": 0.9051, + "step": 1658 + }, + { + "epoch": 0.3322658126501201, + "grad_norm": 6.461385726928711, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.3968, + "step": 1660 + }, + { + "epoch": 0.33306645316253003, + "grad_norm": 3.95434832572937, + "learning_rate": 1.293292228637389e-05, + "loss": 0.4388, + "step": 1662 + }, + { + "epoch": 0.33306645316253003, + "grad_norm": 4.212325572967529, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.115, + "step": 1664 + }, + { + "epoch": 0.33386709367493994, + "grad_norm": 4.356366157531738, + "learning_rate": 1.2959631741441583e-05, + "loss": 0.2463, + "step": 1666 + }, + { + "epoch": 0.33386709367493994, + "grad_norm": 3.189866304397583, + "learning_rate": 1.2972977811676289e-05, + "loss": 0.2023, + "step": 1668 + }, + { + "epoch": 0.33466773418734985, + "grad_norm": 1.5567737817764282, + "learning_rate": 1.298631807562092e-05, + "loss": 0.4088, + "step": 1670 + }, + { + "epoch": 0.33466773418734985, + "grad_norm": 3.2250163555145264, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.2419, + "step": 1672 + }, + { + "epoch": 0.3354683746997598, + "grad_norm": 3.260206460952759, + "learning_rate": 1.3012981080436036e-05, + "loss": 1.0441, + "step": 1674 + }, + { + "epoch": 0.3354683746997598, + "grad_norm": 2.3090832233428955, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.2049, + "step": 1676 + }, + { + "epoch": 0.33626901521216973, + "grad_norm": 3.001978635787964, + "learning_rate": 1.3039620547593357e-05, + "loss": 0.3837, + "step": 1678 + }, + { + "epoch": 0.33626901521216973, + "grad_norm": 2.3867292404174805, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.1538, + "step": 1680 + }, + { + "epoch": 0.33706965572457964, + "grad_norm": 3.1937811374664307, + "learning_rate": 1.3066236268983143e-05, + "loss": 0.2644, + "step": 1682 + }, + { + "epoch": 0.33706965572457964, + "grad_norm": 1.58845055103302, + "learning_rate": 1.3079535160031601e-05, + "loss": 0.15, + "step": 1684 + }, + { + "epoch": 0.3378702962369896, + "grad_norm": 4.976072788238525, + "learning_rate": 1.3092828036681178e-05, + "loss": 0.3349, + "step": 1686 + }, + { + "epoch": 0.3378702962369896, + "grad_norm": 5.401527404785156, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.2765, + "step": 1688 + }, + { + "epoch": 0.3386709367493995, + "grad_norm": 9.37804889678955, + "learning_rate": 1.3119395642950348e-05, + "loss": 0.5206, + "step": 1690 + }, + { + "epoch": 0.3386709367493995, + "grad_norm": 1.0056020021438599, + "learning_rate": 1.313267032068285e-05, + "loss": 0.1857, + "step": 1692 + }, + { + "epoch": 0.33947157726180943, + "grad_norm": 5.005178451538086, + "learning_rate": 1.3145938880242346e-05, + "loss": 0.365, + "step": 1694 + }, + { + "epoch": 0.33947157726180943, + "grad_norm": 1.5336145162582397, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.2083, + "step": 1696 + }, + { + "epoch": 0.3402722177742194, + "grad_norm": 2.9032723903656006, + "learning_rate": 1.3172457541199188e-05, + "loss": 0.1668, + "step": 1698 + }, + { + "epoch": 0.3402722177742194, + "grad_norm": 0.31457871198654175, + "learning_rate": 1.3185707590804997e-05, + "loss": 0.0936, + "step": 1700 + }, + { + "epoch": 0.3410728582866293, + "grad_norm": 4.354226589202881, + "learning_rate": 1.3198951418654882e-05, + "loss": 0.2759, + "step": 1702 + }, + { + "epoch": 0.3410728582866293, + "grad_norm": 1.306763768196106, + "learning_rate": 1.321218899888334e-05, + "loss": 0.2099, + "step": 1704 + }, + { + "epoch": 0.3418734987990392, + "grad_norm": 2.587979316711426, + "learning_rate": 1.322542030563709e-05, + "loss": 0.1763, + "step": 1706 + }, + { + "epoch": 0.3418734987990392, + "grad_norm": 0.968517005443573, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.0994, + "step": 1708 + }, + { + "epoch": 0.34267413931144913, + "grad_norm": 3.2493691444396973, + "learning_rate": 1.3251863995368665e-05, + "loss": 0.5215, + "step": 1710 + }, + { + "epoch": 0.34267413931144913, + "grad_norm": 11.355826377868652, + "learning_rate": 1.326507632670139e-05, + "loss": 0.2894, + "step": 1712 + }, + { + "epoch": 0.3434747798238591, + "grad_norm": 0.45056354999542236, + "learning_rate": 1.3278282281269293e-05, + "loss": 0.2224, + "step": 1714 + }, + { + "epoch": 0.3434747798238591, + "grad_norm": 1.7059470415115356, + "learning_rate": 1.3291481833280894e-05, + "loss": 0.1276, + "step": 1716 + }, + { + "epoch": 0.344275420336269, + "grad_norm": 3.9045169353485107, + "learning_rate": 1.3304674956957167e-05, + "loss": 0.7475, + "step": 1718 + }, + { + "epoch": 0.344275420336269, + "grad_norm": 0.6166409254074097, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.2077, + "step": 1720 + }, + { + "epoch": 0.3450760608486789, + "grad_norm": 6.520531177520752, + "learning_rate": 1.3331041816250503e-05, + "loss": 0.5418, + "step": 1722 + }, + { + "epoch": 0.3450760608486789, + "grad_norm": 2.984787702560425, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.1875, + "step": 1724 + }, + { + "epoch": 0.3458767013610889, + "grad_norm": 2.09181547164917, + "learning_rate": 1.335738265316921e-05, + "loss": 0.1357, + "step": 1726 + }, + { + "epoch": 0.3458767013610889, + "grad_norm": 2.283186435699463, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.0655, + "step": 1728 + }, + { + "epoch": 0.3466773418734988, + "grad_norm": 9.593862533569336, + "learning_rate": 1.3383697261936472e-05, + "loss": 0.3325, + "step": 1730 + }, + { + "epoch": 0.3466773418734988, + "grad_norm": 1.1370501518249512, + "learning_rate": 1.3396844666514062e-05, + "loss": 0.0547, + "step": 1732 + }, + { + "epoch": 0.3474779823859087, + "grad_norm": 4.796603679656982, + "learning_rate": 1.3409985436980422e-05, + "loss": 0.8121, + "step": 1734 + }, + { + "epoch": 0.3474779823859087, + "grad_norm": 0.9535071849822998, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.2726, + "step": 1736 + }, + { + "epoch": 0.3482786228983187, + "grad_norm": 7.567232131958008, + "learning_rate": 1.3436246972935638e-05, + "loss": 0.1715, + "step": 1738 + }, + { + "epoch": 0.3482786228983187, + "grad_norm": 2.1800146102905273, + "learning_rate": 1.344936768713513e-05, + "loss": 0.1978, + "step": 1740 + }, + { + "epoch": 0.3490792634107286, + "grad_norm": 5.999657154083252, + "learning_rate": 1.346248166464481e-05, + "loss": 0.2978, + "step": 1742 + }, + { + "epoch": 0.3490792634107286, + "grad_norm": 1.3640875816345215, + "learning_rate": 1.347558887985279e-05, + "loss": 0.1879, + "step": 1744 + }, + { + "epoch": 0.3498799039231385, + "grad_norm": 1.0128766298294067, + "learning_rate": 1.348868930716039e-05, + "loss": 0.1805, + "step": 1746 + }, + { + "epoch": 0.3498799039231385, + "grad_norm": 2.3596715927124023, + "learning_rate": 1.3501782920982189e-05, + "loss": 0.2373, + "step": 1748 + }, + { + "epoch": 0.3506805444355484, + "grad_norm": 3.8849594593048096, + "learning_rate": 1.3514869695746078e-05, + "loss": 0.3986, + "step": 1750 + }, + { + "epoch": 0.3506805444355484, + "grad_norm": 2.088613271713257, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.1952, + "step": 1752 + }, + { + "epoch": 0.3514811849479584, + "grad_norm": 6.86254358291626, + "learning_rate": 1.3541022625878501e-05, + "loss": 0.3655, + "step": 1754 + }, + { + "epoch": 0.3514811849479584, + "grad_norm": 1.3669837713241577, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.1275, + "step": 1756 + }, + { + "epoch": 0.3522818254603683, + "grad_norm": 5.027182579040527, + "learning_rate": 1.3567147893248833e-05, + "loss": 0.6851, + "step": 1758 + }, + { + "epoch": 0.3522818254603683, + "grad_norm": 10.089662551879883, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.2627, + "step": 1760 + }, + { + "epoch": 0.3530824659727782, + "grad_norm": 3.5067386627197266, + "learning_rate": 1.3593245293764303e-05, + "loss": 0.2765, + "step": 1762 + }, + { + "epoch": 0.3530824659727782, + "grad_norm": 0.4109172523021698, + "learning_rate": 1.3606283480231962e-05, + "loss": 0.2444, + "step": 1764 + }, + { + "epoch": 0.35388310648518817, + "grad_norm": 6.979426383972168, + "learning_rate": 1.361931462354984e-05, + "loss": 0.601, + "step": 1766 + }, + { + "epoch": 0.35388310648518817, + "grad_norm": 1.6104693412780762, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.2026, + "step": 1768 + }, + { + "epoch": 0.3546837469975981, + "grad_norm": 4.281369209289551, + "learning_rate": 1.3645355678949715e-05, + "loss": 0.2573, + "step": 1770 + }, + { + "epoch": 0.3546837469975981, + "grad_norm": 2.8880181312561035, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.1719, + "step": 1772 + }, + { + "epoch": 0.355484387510008, + "grad_norm": 4.36014461517334, + "learning_rate": 1.3671368256529026e-05, + "loss": 0.3034, + "step": 1774 + }, + { + "epoch": 0.355484387510008, + "grad_norm": 3.7101047039031982, + "learning_rate": 1.368436380262336e-05, + "loss": 0.2419, + "step": 1776 + }, + { + "epoch": 0.35628502802241796, + "grad_norm": 2.430959939956665, + "learning_rate": 1.3697352153075365e-05, + "loss": 0.2364, + "step": 1778 + }, + { + "epoch": 0.35628502802241796, + "grad_norm": 1.1185076236724854, + "learning_rate": 1.3710333282518497e-05, + "loss": 0.1009, + "step": 1780 + }, + { + "epoch": 0.35708566853482787, + "grad_norm": 3.602783679962158, + "learning_rate": 1.3723307165600361e-05, + "loss": 0.597, + "step": 1782 + }, + { + "epoch": 0.35708566853482787, + "grad_norm": 1.8709505796432495, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.2098, + "step": 1784 + }, + { + "epoch": 0.3578863090472378, + "grad_norm": 6.458472728729248, + "learning_rate": 1.3749233091341344e-05, + "loss": 0.8885, + "step": 1786 + }, + { + "epoch": 0.3578863090472378, + "grad_norm": 0.865864098072052, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.1945, + "step": 1788 + }, + { + "epoch": 0.3586869495596477, + "grad_norm": 2.808619260787964, + "learning_rate": 1.3775129727762808e-05, + "loss": 0.3048, + "step": 1790 + }, + { + "epoch": 0.3586869495596477, + "grad_norm": 1.6234745979309082, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.154, + "step": 1792 + }, + { + "epoch": 0.35948759007205766, + "grad_norm": 1.2265236377716064, + "learning_rate": 1.3800996872558075e-05, + "loss": 0.2702, + "step": 1794 + }, + { + "epoch": 0.35948759007205766, + "grad_norm": 2.1746463775634766, + "learning_rate": 1.3813919322438018e-05, + "loss": 0.3148, + "step": 1796 + }, + { + "epoch": 0.3602882305844676, + "grad_norm": 5.96541690826416, + "learning_rate": 1.3826834323650899e-05, + "loss": 0.4324, + "step": 1798 + }, + { + "epoch": 0.3602882305844676, + "grad_norm": 1.113220453262329, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.1295, + "step": 1800 + }, + { + "epoch": 0.3610888710968775, + "grad_norm": 0.5807571411132812, + "learning_rate": 1.3852641879196952e-05, + "loss": 0.3071, + "step": 1802 + }, + { + "epoch": 0.3610888710968775, + "grad_norm": 1.3971216678619385, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.2609, + "step": 1804 + }, + { + "epoch": 0.36188951160928745, + "grad_norm": 2.055445432662964, + "learning_rate": 1.387841933758546e-05, + "loss": 0.1949, + "step": 1806 + }, + { + "epoch": 0.36188951160928745, + "grad_norm": 5.560498237609863, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.1196, + "step": 1808 + }, + { + "epoch": 0.36269015212169736, + "grad_norm": 3.2385313510894775, + "learning_rate": 1.3904166497440812e-05, + "loss": 0.1955, + "step": 1810 + }, + { + "epoch": 0.36269015212169736, + "grad_norm": 0.392408162355423, + "learning_rate": 1.391702865255334e-05, + "loss": 0.0486, + "step": 1812 + }, + { + "epoch": 0.3634907926341073, + "grad_norm": 9.849658012390137, + "learning_rate": 1.3929883157624046e-05, + "loss": 0.488, + "step": 1814 + }, + { + "epoch": 0.3634907926341073, + "grad_norm": 2.220021963119507, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.1864, + "step": 1816 + }, + { + "epoch": 0.36429143314651724, + "grad_norm": 4.8420000076293945, + "learning_rate": 1.3955569117234468e-05, + "loss": 0.5286, + "step": 1818 + }, + { + "epoch": 0.36429143314651724, + "grad_norm": 0.3795752227306366, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.1409, + "step": 1820 + }, + { + "epoch": 0.36509207365892715, + "grad_norm": 2.0654520988464355, + "learning_rate": 1.3981224175611265e-05, + "loss": 0.0993, + "step": 1822 + }, + { + "epoch": 0.36509207365892715, + "grad_norm": 2.3335392475128174, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.1953, + "step": 1824 + }, + { + "epoch": 0.36589271417133706, + "grad_norm": 0.18284086883068085, + "learning_rate": 1.4006848132334979e-05, + "loss": 0.2493, + "step": 1826 + }, + { + "epoch": 0.36589271417133706, + "grad_norm": 4.558528900146484, + "learning_rate": 1.4019648385012245e-05, + "loss": 0.1297, + "step": 1828 + }, + { + "epoch": 0.366693354683747, + "grad_norm": 5.870181083679199, + "learning_rate": 1.4032440787229135e-05, + "loss": 0.381, + "step": 1830 + }, + { + "epoch": 0.366693354683747, + "grad_norm": 0.8284279704093933, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.0297, + "step": 1832 + }, + { + "epoch": 0.36749399519615694, + "grad_norm": 11.55343246459961, + "learning_rate": 1.4058001940361781e-05, + "loss": 0.4583, + "step": 1834 + }, + { + "epoch": 0.36749399519615694, + "grad_norm": 1.067719578742981, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.1302, + "step": 1836 + }, + { + "epoch": 0.36829463570856685, + "grad_norm": 2.8894267082214355, + "learning_rate": 1.40835313920471e-05, + "loss": 0.5593, + "step": 1838 + }, + { + "epoch": 0.36829463570856685, + "grad_norm": 0.8451491594314575, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.1333, + "step": 1840 + }, + { + "epoch": 0.36909527622097676, + "grad_norm": 0.1553415209054947, + "learning_rate": 1.4109028942846888e-05, + "loss": 0.4827, + "step": 1842 + }, + { + "epoch": 0.36909527622097676, + "grad_norm": 2.1367712020874023, + "learning_rate": 1.4121765693158355e-05, + "loss": 0.2267, + "step": 1844 + }, + { + "epoch": 0.36989591673338673, + "grad_norm": 5.240777492523193, + "learning_rate": 1.4134494393572146e-05, + "loss": 0.3185, + "step": 1846 + }, + { + "epoch": 0.36989591673338673, + "grad_norm": 3.2826931476593018, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.2818, + "step": 1848 + }, + { + "epoch": 0.37069655724579664, + "grad_norm": 2.879746437072754, + "learning_rate": 1.4159927545284697e-05, + "loss": 0.6019, + "step": 1850 + }, + { + "epoch": 0.37069655724579664, + "grad_norm": 3.6365630626678467, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.3564, + "step": 1852 + }, + { + "epoch": 0.37149719775820655, + "grad_norm": 2.3527262210845947, + "learning_rate": 1.4185328199298636e-05, + "loss": 0.3315, + "step": 1854 + }, + { + "epoch": 0.37149719775820655, + "grad_norm": 1.749263882637024, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.222, + "step": 1856 + }, + { + "epoch": 0.37229783827061647, + "grad_norm": 5.344796657562256, + "learning_rate": 1.4210696157181936e-05, + "loss": 0.4231, + "step": 1858 + }, + { + "epoch": 0.37229783827061647, + "grad_norm": 2.8102540969848633, + "learning_rate": 1.4223367813134406e-05, + "loss": 0.2392, + "step": 1860 + }, + { + "epoch": 0.37309847878302643, + "grad_norm": 4.470452785491943, + "learning_rate": 1.4236031220758037e-05, + "loss": 0.404, + "step": 1862 + }, + { + "epoch": 0.37309847878302643, + "grad_norm": 1.2124143838882446, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.1194, + "step": 1864 + }, + { + "epoch": 0.37389911929543634, + "grad_norm": 5.91017484664917, + "learning_rate": 1.426133319210731e-05, + "loss": 0.3331, + "step": 1866 + }, + { + "epoch": 0.37389911929543634, + "grad_norm": 2.3426077365875244, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.2424, + "step": 1868 + }, + { + "epoch": 0.37469975980784626, + "grad_norm": 10.299975395202637, + "learning_rate": 1.4286601873568642e-05, + "loss": 0.2948, + "step": 1870 + }, + { + "epoch": 0.37469975980784626, + "grad_norm": 1.400730848312378, + "learning_rate": 1.429922366889332e-05, + "loss": 0.2071, + "step": 1872 + }, + { + "epoch": 0.3755004003202562, + "grad_norm": 1.9408386945724487, + "learning_rate": 1.431183706774103e-05, + "loss": 0.1829, + "step": 1874 + }, + { + "epoch": 0.3755004003202562, + "grad_norm": 0.4981194734573364, + "learning_rate": 1.4324442045477534e-05, + "loss": 0.1494, + "step": 1876 + }, + { + "epoch": 0.37630104083266613, + "grad_norm": 6.186419486999512, + "learning_rate": 1.4337038577485035e-05, + "loss": 0.6449, + "step": 1878 + }, + { + "epoch": 0.37630104083266613, + "grad_norm": 2.76420521736145, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.2718, + "step": 1880 + }, + { + "epoch": 0.37710168134507605, + "grad_norm": 1.2049068212509155, + "learning_rate": 1.436220620592437e-05, + "loss": 0.5888, + "step": 1882 + }, + { + "epoch": 0.37710168134507605, + "grad_norm": 3.2432661056518555, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.1772, + "step": 1884 + }, + { + "epoch": 0.377902321857486, + "grad_norm": 3.1232874393463135, + "learning_rate": 1.4387339756447422e-05, + "loss": 0.2164, + "step": 1886 + }, + { + "epoch": 0.377902321857486, + "grad_norm": 1.9856919050216675, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.0746, + "step": 1888 + }, + { + "epoch": 0.3787029623698959, + "grad_norm": 7.944924831390381, + "learning_rate": 1.4412439032708848e-05, + "loss": 0.6036, + "step": 1890 + }, + { + "epoch": 0.3787029623698959, + "grad_norm": 1.1771142482757568, + "learning_rate": 1.4424975756706684e-05, + "loss": 0.0906, + "step": 1892 + }, + { + "epoch": 0.37950360288230583, + "grad_norm": 3.2103841304779053, + "learning_rate": 1.4437503838631002e-05, + "loss": 0.178, + "step": 1894 + }, + { + "epoch": 0.37950360288230583, + "grad_norm": 1.5288552045822144, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.1693, + "step": 1896 + }, + { + "epoch": 0.38030424339471575, + "grad_norm": 0.67243891954422, + "learning_rate": 1.4462533978405529e-05, + "loss": 0.354, + "step": 1898 + }, + { + "epoch": 0.38030424339471575, + "grad_norm": 1.871901273727417, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.1023, + "step": 1900 + }, + { + "epoch": 0.3811048839071257, + "grad_norm": 3.1831471920013428, + "learning_rate": 1.4487529256494937e-05, + "loss": 0.3783, + "step": 1902 + }, + { + "epoch": 0.3811048839071257, + "grad_norm": 0.7014806866645813, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.1199, + "step": 1904 + }, + { + "epoch": 0.3819055244195356, + "grad_norm": 5.5787248611450195, + "learning_rate": 1.4512489477634024e-05, + "loss": 0.5345, + "step": 1906 + }, + { + "epoch": 0.3819055244195356, + "grad_norm": 2.289496421813965, + "learning_rate": 1.4524956380901674e-05, + "loss": 0.2774, + "step": 1908 + }, + { + "epoch": 0.38270616493194554, + "grad_norm": 4.628843784332275, + "learning_rate": 1.4537414446831461e-05, + "loss": 0.4179, + "step": 1910 + }, + { + "epoch": 0.38270616493194554, + "grad_norm": 1.688610315322876, + "learning_rate": 1.454986365109255e-05, + "loss": 0.2777, + "step": 1912 + }, + { + "epoch": 0.3835068054443555, + "grad_norm": 10.364228248596191, + "learning_rate": 1.4562303969371357e-05, + "loss": 0.8515, + "step": 1914 + }, + { + "epoch": 0.3835068054443555, + "grad_norm": 4.6316819190979, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.4511, + "step": 1916 + }, + { + "epoch": 0.3843074459567654, + "grad_norm": 6.410867214202881, + "learning_rate": 1.4587157850814679e-05, + "loss": 0.7381, + "step": 1918 + }, + { + "epoch": 0.3843074459567654, + "grad_norm": 4.943200588226318, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.3484, + "step": 1920 + }, + { + "epoch": 0.3851080864691753, + "grad_norm": 3.092768907546997, + "learning_rate": 1.4611975897000849e-05, + "loss": 0.2494, + "step": 1922 + }, + { + "epoch": 0.3851080864691753, + "grad_norm": 4.496169090270996, + "learning_rate": 1.4624371421273812e-05, + "loss": 0.1826, + "step": 1924 + }, + { + "epoch": 0.3859087269815853, + "grad_norm": 4.158265590667725, + "learning_rate": 1.463675791404922e-05, + "loss": 0.2561, + "step": 1926 + }, + { + "epoch": 0.3859087269815853, + "grad_norm": 2.875715732574463, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.1898, + "step": 1928 + }, + { + "epoch": 0.3867093674939952, + "grad_norm": 1.390495777130127, + "learning_rate": 1.4661503708360652e-05, + "loss": 0.1161, + "step": 1930 + }, + { + "epoch": 0.3867093674939952, + "grad_norm": 2.294372081756592, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.1488, + "step": 1932 + }, + { + "epoch": 0.3875100080064051, + "grad_norm": 2.8940486907958984, + "learning_rate": 1.4686213086618932e-05, + "loss": 0.4377, + "step": 1934 + }, + { + "epoch": 0.3875100080064051, + "grad_norm": 1.221592903137207, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.1528, + "step": 1936 + }, + { + "epoch": 0.388310648518815, + "grad_norm": 4.837406635284424, + "learning_rate": 1.4710885855792338e-05, + "loss": 0.4296, + "step": 1938 + }, + { + "epoch": 0.388310648518815, + "grad_norm": 2.385875701904297, + "learning_rate": 1.4723208451727977e-05, + "loss": 0.1688, + "step": 1940 + }, + { + "epoch": 0.389111289031225, + "grad_norm": 7.758847713470459, + "learning_rate": 1.4735521823135184e-05, + "loss": 0.9032, + "step": 1942 + }, + { + "epoch": 0.389111289031225, + "grad_norm": 4.20594596862793, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.438, + "step": 1944 + }, + { + "epoch": 0.3899119295436349, + "grad_norm": 3.9570648670196533, + "learning_rate": 1.4760120796189233e-05, + "loss": 0.2607, + "step": 1946 + }, + { + "epoch": 0.3899119295436349, + "grad_norm": 1.4451581239700317, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.1419, + "step": 1948 + }, + { + "epoch": 0.3907125700560448, + "grad_norm": 0.852167010307312, + "learning_rate": 1.4784682582785254e-05, + "loss": 0.1849, + "step": 1950 + }, + { + "epoch": 0.3907125700560448, + "grad_norm": 0.7290820479393005, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.0481, + "step": 1952 + }, + { + "epoch": 0.3915132105684548, + "grad_norm": 6.995693206787109, + "learning_rate": 1.4809206991044571e-05, + "loss": 0.7037, + "step": 1954 + }, + { + "epoch": 0.3915132105684548, + "grad_norm": 2.9630699157714844, + "learning_rate": 1.4821455118415666e-05, + "loss": 0.3217, + "step": 1956 + }, + { + "epoch": 0.3923138510808647, + "grad_norm": 4.018960952758789, + "learning_rate": 1.4833693829380458e-05, + "loss": 0.4376, + "step": 1958 + }, + { + "epoch": 0.3923138510808647, + "grad_norm": 3.409332036972046, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.3956, + "step": 1960 + }, + { + "epoch": 0.3931144915932746, + "grad_norm": 3.9635257720947266, + "learning_rate": 1.4858142906499686e-05, + "loss": 0.5189, + "step": 1962 + }, + { + "epoch": 0.3931144915932746, + "grad_norm": 2.3552675247192383, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.3094, + "step": 1964 + }, + { + "epoch": 0.3939151321056846, + "grad_norm": 3.1630258560180664, + "learning_rate": 1.4882554031404075e-05, + "loss": 0.4508, + "step": 1966 + }, + { + "epoch": 0.3939151321056846, + "grad_norm": 1.5279723405838013, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.1749, + "step": 1968 + }, + { + "epoch": 0.3947157726180945, + "grad_norm": 2.285238742828369, + "learning_rate": 1.4906927013391879e-05, + "loss": 0.2244, + "step": 1970 + }, + { + "epoch": 0.3947157726180945, + "grad_norm": 1.1655954122543335, + "learning_rate": 1.4919099141279205e-05, + "loss": 0.1201, + "step": 1972 + }, + { + "epoch": 0.3955164131305044, + "grad_norm": 4.015878677368164, + "learning_rate": 1.4931261662059338e-05, + "loss": 0.3609, + "step": 1974 + }, + { + "epoch": 0.3955164131305044, + "grad_norm": 1.706411600112915, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.0848, + "step": 1976 + }, + { + "epoch": 0.3963170536429143, + "grad_norm": 4.617360591888428, + "learning_rate": 1.4955557787302151e-05, + "loss": 0.3482, + "step": 1978 + }, + { + "epoch": 0.3963170536429143, + "grad_norm": 5.708012580871582, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.5706, + "step": 1980 + }, + { + "epoch": 0.3971176941553243, + "grad_norm": 7.054632186889648, + "learning_rate": 1.4979815199317005e-05, + "loss": 0.7108, + "step": 1982 + }, + { + "epoch": 0.3971176941553243, + "grad_norm": 3.0773961544036865, + "learning_rate": 1.499192932863305e-05, + "loss": 0.5087, + "step": 1984 + }, + { + "epoch": 0.3979183346677342, + "grad_norm": 0.8716643452644348, + "learning_rate": 1.5004033708602967e-05, + "loss": 0.2438, + "step": 1986 + }, + { + "epoch": 0.3979183346677342, + "grad_norm": 0.8150566220283508, + "learning_rate": 1.5016128315586626e-05, + "loss": 0.1152, + "step": 1988 + }, + { + "epoch": 0.3987189751801441, + "grad_norm": 2.8830676078796387, + "learning_rate": 1.5028213125963029e-05, + "loss": 0.3223, + "step": 1990 + }, + { + "epoch": 0.3987189751801441, + "grad_norm": 3.063448667526245, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.3031, + "step": 1992 + }, + { + "epoch": 0.39951961569255406, + "grad_norm": 2.6819210052490234, + "learning_rate": 1.5052353262505603e-05, + "loss": 0.3632, + "step": 1994 + }, + { + "epoch": 0.39951961569255406, + "grad_norm": 1.4269779920578003, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.1693, + "step": 1996 + }, + { + "epoch": 0.400320256204964, + "grad_norm": 1.5757626295089722, + "learning_rate": 1.5076453929645933e-05, + "loss": 0.1441, + "step": 1998 + }, + { + "epoch": 0.400320256204964, + "grad_norm": 0.4093298614025116, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.0914, + "step": 2000 + }, + { + "epoch": 0.4011208967173739, + "grad_norm": 2.644867181777954, + "learning_rate": 1.510051493910759e-05, + "loss": 0.4167, + "step": 2002 + }, + { + "epoch": 0.4011208967173739, + "grad_norm": 2.368976354598999, + "learning_rate": 1.5112530513457229e-05, + "loss": 0.1665, + "step": 2004 + }, + { + "epoch": 0.40192153722978385, + "grad_norm": 5.054984092712402, + "learning_rate": 1.512453610292401e-05, + "loss": 0.3844, + "step": 2006 + }, + { + "epoch": 0.40192153722978385, + "grad_norm": 1.8217912912368774, + "learning_rate": 1.513653168406076e-05, + "loss": 0.2233, + "step": 2008 + }, + { + "epoch": 0.40272217774219377, + "grad_norm": 2.8398332595825195, + "learning_rate": 1.514851723343985e-05, + "loss": 0.278, + "step": 2010 + }, + { + "epoch": 0.40272217774219377, + "grad_norm": 0.6333589553833008, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.0854, + "step": 2012 + }, + { + "epoch": 0.4035228182546037, + "grad_norm": 2.3953187465667725, + "learning_rate": 1.5172458143312522e-05, + "loss": 0.1608, + "step": 2014 + }, + { + "epoch": 0.4035228182546037, + "grad_norm": 1.013063907623291, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.1517, + "step": 2016 + }, + { + "epoch": 0.4043234587670136, + "grad_norm": 5.675294876098633, + "learning_rate": 1.5196358645513685e-05, + "loss": 1.0658, + "step": 2018 + }, + { + "epoch": 0.4043234587670136, + "grad_norm": 3.441972255706787, + "learning_rate": 1.5208293685377354e-05, + "loss": 0.3216, + "step": 2020 + }, + { + "epoch": 0.40512409927942356, + "grad_norm": 2.4654810428619385, + "learning_rate": 1.5220218553330618e-05, + "loss": 0.333, + "step": 2022 + }, + { + "epoch": 0.40512409927942356, + "grad_norm": 1.3125483989715576, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.1549, + "step": 2024 + }, + { + "epoch": 0.40592473979183347, + "grad_norm": 3.1168930530548096, + "learning_rate": 1.5244037680367744e-05, + "loss": 0.4801, + "step": 2026 + }, + { + "epoch": 0.40592473979183347, + "grad_norm": 0.12399838119745255, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.1941, + "step": 2028 + }, + { + "epoch": 0.4067253803042434, + "grad_norm": 1.5968430042266846, + "learning_rate": 1.5267815840548057e-05, + "loss": 0.2458, + "step": 2030 + }, + { + "epoch": 0.4067253803042434, + "grad_norm": 1.087570071220398, + "learning_rate": 1.527968950000533e-05, + "loss": 0.153, + "step": 2032 + }, + { + "epoch": 0.40752602081665334, + "grad_norm": 4.070639610290527, + "learning_rate": 1.529155284811463e-05, + "loss": 0.3481, + "step": 2034 + }, + { + "epoch": 0.40752602081665334, + "grad_norm": 3.6515493392944336, + "learning_rate": 1.5303405861706574e-05, + "loss": 0.2504, + "step": 2036 + }, + { + "epoch": 0.40832666132906326, + "grad_norm": 0.18392695486545563, + "learning_rate": 1.5315248517631975e-05, + "loss": 0.1244, + "step": 2038 + }, + { + "epoch": 0.40832666132906326, + "grad_norm": 5.477138042449951, + "learning_rate": 1.532708079276185e-05, + "loss": 0.3188, + "step": 2040 + }, + { + "epoch": 0.40912730184147317, + "grad_norm": 1.9146369695663452, + "learning_rate": 1.5338902663987544e-05, + "loss": 0.1446, + "step": 2042 + }, + { + "epoch": 0.40912730184147317, + "grad_norm": 1.2914267778396606, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.0778, + "step": 2044 + }, + { + "epoch": 0.4099279423538831, + "grad_norm": 4.73802375793457, + "learning_rate": 1.5362515102393217e-05, + "loss": 0.8244, + "step": 2046 + }, + { + "epoch": 0.4099279423538831, + "grad_norm": 0.6808088421821594, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.0539, + "step": 2048 + }, + { + "epoch": 0.41072858286629305, + "grad_norm": 0.9782052040100098, + "learning_rate": 1.5386085648386656e-05, + "loss": 0.1852, + "step": 2050 + }, + { + "epoch": 0.41072858286629305, + "grad_norm": 3.792337417602539, + "learning_rate": 1.539785515417376e-05, + "loss": 0.1604, + "step": 2052 + }, + { + "epoch": 0.41152922337870296, + "grad_norm": 3.3044750690460205, + "learning_rate": 1.540961411783279e-05, + "loss": 0.6711, + "step": 2054 + }, + { + "epoch": 0.41152922337870296, + "grad_norm": 4.935807228088379, + "learning_rate": 1.542136251639826e-05, + "loss": 0.2862, + "step": 2056 + }, + { + "epoch": 0.41232986389111287, + "grad_norm": 8.285751342773438, + "learning_rate": 1.5433100326925288e-05, + "loss": 0.7106, + "step": 2058 + }, + { + "epoch": 0.41232986389111287, + "grad_norm": 7.807729244232178, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.4334, + "step": 2060 + }, + { + "epoch": 0.41313050440352284, + "grad_norm": 0.8675726652145386, + "learning_rate": 1.545654409218793e-05, + "loss": 0.069, + "step": 2062 + }, + { + "epoch": 0.41313050440352284, + "grad_norm": 2.5095322132110596, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.1242, + "step": 2064 + }, + { + "epoch": 0.41393114491593275, + "grad_norm": 2.9617068767547607, + "learning_rate": 1.5479945230476066e-05, + "loss": 0.2696, + "step": 2066 + }, + { + "epoch": 0.41393114491593275, + "grad_norm": 2.4880943298339844, + "learning_rate": 1.5491629757363026e-05, + "loss": 0.1959, + "step": 2068 + }, + { + "epoch": 0.41473178542834266, + "grad_norm": 2.362239360809326, + "learning_rate": 1.550330355897809e-05, + "loss": 0.2706, + "step": 2070 + }, + { + "epoch": 0.41473178542834266, + "grad_norm": 0.26938533782958984, + "learning_rate": 1.551496661252208e-05, + "loss": 0.0581, + "step": 2072 + }, + { + "epoch": 0.4155324259407526, + "grad_norm": 4.136385917663574, + "learning_rate": 1.5526618895216793e-05, + "loss": 0.8413, + "step": 2074 + }, + { + "epoch": 0.4155324259407526, + "grad_norm": 4.035400390625, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.2942, + "step": 2076 + }, + { + "epoch": 0.41633306645316254, + "grad_norm": 1.3763575553894043, + "learning_rate": 1.5549891057050837e-05, + "loss": 0.1386, + "step": 2078 + }, + { + "epoch": 0.41633306645316254, + "grad_norm": 2.1700727939605713, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.1584, + "step": 2080 + }, + { + "epoch": 0.41713370696557245, + "grad_norm": 7.763471603393555, + "learning_rate": 1.557311986267615e-05, + "loss": 0.195, + "step": 2082 + }, + { + "epoch": 0.41713370696557245, + "grad_norm": 1.9990510940551758, + "learning_rate": 1.5584717950189353e-05, + "loss": 0.1239, + "step": 2084 + }, + { + "epoch": 0.41793434747798236, + "grad_norm": 3.986832857131958, + "learning_rate": 1.5596305130627404e-05, + "loss": 0.163, + "step": 2086 + }, + { + "epoch": 0.41793434747798236, + "grad_norm": 0.9359152913093567, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.1329, + "step": 2088 + }, + { + "epoch": 0.4187349879903923, + "grad_norm": 14.315071105957031, + "learning_rate": 1.5619446679779357e-05, + "loss": 0.4512, + "step": 2090 + }, + { + "epoch": 0.4187349879903923, + "grad_norm": 6.952315807342529, + "learning_rate": 1.563100100329731e-05, + "loss": 0.3098, + "step": 2092 + }, + { + "epoch": 0.41953562850280224, + "grad_norm": 9.741835594177246, + "learning_rate": 1.564254432934829e-05, + "loss": 1.0647, + "step": 2094 + }, + { + "epoch": 0.41953562850280224, + "grad_norm": 1.629775881767273, + "learning_rate": 1.565407663538797e-05, + "loss": 0.1542, + "step": 2096 + }, + { + "epoch": 0.42033626901521215, + "grad_norm": 8.892681121826172, + "learning_rate": 1.5665597898893484e-05, + "loss": 0.7428, + "step": 2098 + }, + { + "epoch": 0.42033626901521215, + "grad_norm": 1.469712495803833, + "learning_rate": 1.567710809736356e-05, + "loss": 0.2374, + "step": 2100 + }, + { + "epoch": 0.4211369095276221, + "grad_norm": 7.1044392585754395, + "learning_rate": 1.568860720831853e-05, + "loss": 0.3994, + "step": 2102 + }, + { + "epoch": 0.4211369095276221, + "grad_norm": 1.001214623451233, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.1193, + "step": 2104 + }, + { + "epoch": 0.42193755004003203, + "grad_norm": 2.5383987426757812, + "learning_rate": 1.5711572077872774e-05, + "loss": 0.3133, + "step": 2106 + }, + { + "epoch": 0.42193755004003203, + "grad_norm": 1.5843006372451782, + "learning_rate": 1.572303779162118e-05, + "loss": 0.2083, + "step": 2108 + }, + { + "epoch": 0.42273819055244194, + "grad_norm": 3.0574893951416016, + "learning_rate": 1.573449232815279e-05, + "loss": 0.404, + "step": 2110 + }, + { + "epoch": 0.42273819055244194, + "grad_norm": 0.5393478274345398, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.0762, + "step": 2112 + }, + { + "epoch": 0.4235388310648519, + "grad_norm": 6.206510543823242, + "learning_rate": 1.5757367780103666e-05, + "loss": 0.5351, + "step": 2114 + }, + { + "epoch": 0.4235388310648519, + "grad_norm": 4.198579788208008, + "learning_rate": 1.5768788650846677e-05, + "loss": 0.2392, + "step": 2116 + }, + { + "epoch": 0.4243394715772618, + "grad_norm": 3.425102949142456, + "learning_rate": 1.5780198255020478e-05, + "loss": 0.4585, + "step": 2118 + }, + { + "epoch": 0.4243394715772618, + "grad_norm": 3.4158027172088623, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.4153, + "step": 2120 + }, + { + "epoch": 0.42514011208967173, + "grad_norm": 3.4007251262664795, + "learning_rate": 1.580298357454965e-05, + "loss": 0.4489, + "step": 2122 + }, + { + "epoch": 0.42514011208967173, + "grad_norm": 3.418968677520752, + "learning_rate": 1.581435924540481e-05, + "loss": 0.2647, + "step": 2124 + }, + { + "epoch": 0.42594075260208164, + "grad_norm": 3.218684673309326, + "learning_rate": 1.5825723560690403e-05, + "loss": 0.4378, + "step": 2126 + }, + { + "epoch": 0.42594075260208164, + "grad_norm": 2.818122386932373, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.1448, + "step": 2128 + }, + { + "epoch": 0.4267413931144916, + "grad_norm": 2.469158172607422, + "learning_rate": 1.5848418035796068e-05, + "loss": 0.2856, + "step": 2130 + }, + { + "epoch": 0.4267413931144916, + "grad_norm": 4.149569511413574, + "learning_rate": 1.5859748151293333e-05, + "loss": 0.1366, + "step": 2132 + }, + { + "epoch": 0.4275420336269015, + "grad_norm": 7.417590141296387, + "learning_rate": 1.587106682257552e-05, + "loss": 0.4947, + "step": 2134 + }, + { + "epoch": 0.4275420336269015, + "grad_norm": 2.009355306625366, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.1376, + "step": 2136 + }, + { + "epoch": 0.42834267413931143, + "grad_norm": 2.082042932510376, + "learning_rate": 1.5893669744094577e-05, + "loss": 0.3478, + "step": 2138 + }, + { + "epoch": 0.42834267413931143, + "grad_norm": 1.4400086402893066, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.1863, + "step": 2140 + }, + { + "epoch": 0.4291433146517214, + "grad_norm": 3.4386940002441406, + "learning_rate": 1.591622662377734e-05, + "loss": 0.3213, + "step": 2142 + }, + { + "epoch": 0.4291433146517214, + "grad_norm": 0.6285686492919922, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.1819, + "step": 2144 + }, + { + "epoch": 0.4299439551641313, + "grad_norm": 7.449779510498047, + "learning_rate": 1.5938737285407567e-05, + "loss": 0.7516, + "step": 2146 + }, + { + "epoch": 0.4299439551641313, + "grad_norm": 5.1106648445129395, + "learning_rate": 1.594997522948412e-05, + "loss": 0.3061, + "step": 2148 + }, + { + "epoch": 0.4307445956765412, + "grad_norm": 3.5748400688171387, + "learning_rate": 1.5961201553130148e-05, + "loss": 0.2521, + "step": 2150 + }, + { + "epoch": 0.4307445956765412, + "grad_norm": 1.4178043603897095, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.1483, + "step": 2152 + }, + { + "epoch": 0.4315452361889512, + "grad_norm": 1.5531257390975952, + "learning_rate": 1.598361925145234e-05, + "loss": 0.2492, + "step": 2154 + }, + { + "epoch": 0.4315452361889512, + "grad_norm": 1.9048939943313599, + "learning_rate": 1.599481058234626e-05, + "loss": 0.2421, + "step": 2156 + }, + { + "epoch": 0.4323458767013611, + "grad_norm": 3.1559202671051025, + "learning_rate": 1.6005990205245216e-05, + "loss": 0.3058, + "step": 2158 + }, + { + "epoch": 0.4323458767013611, + "grad_norm": 1.1827607154846191, + "learning_rate": 1.60171580983152e-05, + "loss": 0.2221, + "step": 2160 + }, + { + "epoch": 0.433146517213771, + "grad_norm": 3.0863430500030518, + "learning_rate": 1.602831423974506e-05, + "loss": 0.3908, + "step": 2162 + }, + { + "epoch": 0.433146517213771, + "grad_norm": 3.1559300422668457, + "learning_rate": 1.6039458607746607e-05, + "loss": 0.2268, + "step": 2164 + }, + { + "epoch": 0.4339471577261809, + "grad_norm": 3.0364270210266113, + "learning_rate": 1.6050591180554648e-05, + "loss": 0.2373, + "step": 2166 + }, + { + "epoch": 0.4339471577261809, + "grad_norm": 2.0479319095611572, + "learning_rate": 1.606171193642703e-05, + "loss": 0.1067, + "step": 2168 + }, + { + "epoch": 0.4347477982385909, + "grad_norm": 6.445245265960693, + "learning_rate": 1.6072820853644677e-05, + "loss": 0.8615, + "step": 2170 + }, + { + "epoch": 0.4347477982385909, + "grad_norm": 3.085031270980835, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.2522, + "step": 2172 + }, + { + "epoch": 0.4355484387510008, + "grad_norm": 6.956778526306152, + "learning_rate": 1.6095003085355082e-05, + "loss": 0.403, + "step": 2174 + }, + { + "epoch": 0.4355484387510008, + "grad_norm": 1.4653464555740356, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.2099, + "step": 2176 + }, + { + "epoch": 0.4363490792634107, + "grad_norm": 3.943056106567383, + "learning_rate": 1.611713770239646e-05, + "loss": 0.3931, + "step": 2178 + }, + { + "epoch": 0.4363490792634107, + "grad_norm": 5.623725891113281, + "learning_rate": 1.6128187101364982e-05, + "loss": 0.2777, + "step": 2180 + }, + { + "epoch": 0.4371497197758207, + "grad_norm": 2.521444320678711, + "learning_rate": 1.6139224531851332e-05, + "loss": 0.2418, + "step": 2182 + }, + { + "epoch": 0.4371497197758207, + "grad_norm": 1.7543635368347168, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.1576, + "step": 2184 + }, + { + "epoch": 0.4379503602882306, + "grad_norm": 7.471405029296875, + "learning_rate": 1.616126340117555e-05, + "loss": 0.4141, + "step": 2186 + }, + { + "epoch": 0.4379503602882306, + "grad_norm": 5.538974761962891, + "learning_rate": 1.617226479697104e-05, + "loss": 0.2907, + "step": 2188 + }, + { + "epoch": 0.4387510008006405, + "grad_norm": 4.0698466300964355, + "learning_rate": 1.618325413819966e-05, + "loss": 0.3324, + "step": 2190 + }, + { + "epoch": 0.4387510008006405, + "grad_norm": 1.1350922584533691, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.1112, + "step": 2192 + }, + { + "epoch": 0.43955164131305047, + "grad_norm": 1.6271010637283325, + "learning_rate": 1.6205196571130194e-05, + "loss": 0.1331, + "step": 2194 + }, + { + "epoch": 0.43955164131305047, + "grad_norm": 1.3089637756347656, + "learning_rate": 1.621614961997806e-05, + "loss": 0.153, + "step": 2196 + }, + { + "epoch": 0.4403522818254604, + "grad_norm": 1.6775732040405273, + "learning_rate": 1.6227090528551034e-05, + "loss": 0.2376, + "step": 2198 + }, + { + "epoch": 0.4403522818254604, + "grad_norm": 2.240344762802124, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.1216, + "step": 2200 + }, + { + "epoch": 0.4411529223378703, + "grad_norm": 3.7298552989959717, + "learning_rate": 1.62489358394248e-05, + "loss": 0.3228, + "step": 2202 + }, + { + "epoch": 0.4411529223378703, + "grad_norm": 2.486121654510498, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.2287, + "step": 2204 + }, + { + "epoch": 0.4419535628502802, + "grad_norm": 2.7446866035461426, + "learning_rate": 1.6270732333094095e-05, + "loss": 0.3045, + "step": 2206 + }, + { + "epoch": 0.4419535628502802, + "grad_norm": 1.4923310279846191, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.1387, + "step": 2208 + }, + { + "epoch": 0.44275420336269017, + "grad_norm": 4.746013164520264, + "learning_rate": 1.6292479839282897e-05, + "loss": 0.7242, + "step": 2210 + }, + { + "epoch": 0.44275420336269017, + "grad_norm": 5.777390480041504, + "learning_rate": 1.6303335168965474e-05, + "loss": 0.2066, + "step": 2212 + }, + { + "epoch": 0.4435548438751001, + "grad_norm": 10.444276809692383, + "learning_rate": 1.6314178188097907e-05, + "loss": 0.1613, + "step": 2214 + }, + { + "epoch": 0.4435548438751001, + "grad_norm": 7.228999614715576, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.2503, + "step": 2216 + }, + { + "epoch": 0.44435548438751, + "grad_norm": 2.351062059402466, + "learning_rate": 1.6335827210029816e-05, + "loss": 0.2322, + "step": 2218 + }, + { + "epoch": 0.44435548438751, + "grad_norm": 0.5046015381813049, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.1937, + "step": 2220 + }, + { + "epoch": 0.44515612489991996, + "grad_norm": 0.6082844138145447, + "learning_rate": 1.635742673595467e-05, + "loss": 0.1165, + "step": 2222 + }, + { + "epoch": 0.44515612489991996, + "grad_norm": 4.14862585067749, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.1586, + "step": 2224 + }, + { + "epoch": 0.44595676541232987, + "grad_norm": 15.102370262145996, + "learning_rate": 1.6378976597135173e-05, + "loss": 0.3269, + "step": 2226 + }, + { + "epoch": 0.44595676541232987, + "grad_norm": 1.7139651775360107, + "learning_rate": 1.6389732850821957e-05, + "loss": 0.1728, + "step": 2228 + }, + { + "epoch": 0.4467574059247398, + "grad_norm": 5.743370056152344, + "learning_rate": 1.640047662522205e-05, + "loss": 0.2605, + "step": 2230 + }, + { + "epoch": 0.4467574059247398, + "grad_norm": 4.587190628051758, + "learning_rate": 1.641120789935263e-05, + "loss": 0.2756, + "step": 2232 + }, + { + "epoch": 0.4475580464371497, + "grad_norm": 5.637535095214844, + "learning_rate": 1.6421926652255282e-05, + "loss": 0.3641, + "step": 2234 + }, + { + "epoch": 0.4475580464371497, + "grad_norm": 3.9835045337677, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.0888, + "step": 2236 + }, + { + "epoch": 0.44835868694955966, + "grad_norm": 1.2728132009506226, + "learning_rate": 1.6443326510665474e-05, + "loss": 0.1524, + "step": 2238 + }, + { + "epoch": 0.44835868694955966, + "grad_norm": 4.906287670135498, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.0907, + "step": 2240 + }, + { + "epoch": 0.44915932746196957, + "grad_norm": 1.2475829124450684, + "learning_rate": 1.646467603327518e-05, + "loss": 0.057, + "step": 2242 + }, + { + "epoch": 0.44915932746196957, + "grad_norm": 1.3707671165466309, + "learning_rate": 1.6475331866519377e-05, + "loss": 0.0984, + "step": 2244 + }, + { + "epoch": 0.4499599679743795, + "grad_norm": 3.9742984771728516, + "learning_rate": 1.6485975053300154e-05, + "loss": 0.4599, + "step": 2246 + }, + { + "epoch": 0.4499599679743795, + "grad_norm": 0.3879615068435669, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.0482, + "step": 2248 + }, + { + "epoch": 0.45076060848678945, + "grad_norm": 1.877485990524292, + "learning_rate": 1.650722340435067e-05, + "loss": 0.1151, + "step": 2250 + }, + { + "epoch": 0.45076060848678945, + "grad_norm": 0.7177757620811462, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.0577, + "step": 2252 + }, + { + "epoch": 0.45156124899919936, + "grad_norm": 1.3681731224060059, + "learning_rate": 1.652842092043287e-05, + "loss": 0.1435, + "step": 2254 + }, + { + "epoch": 0.45156124899919936, + "grad_norm": 0.13087652623653412, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.0109, + "step": 2256 + }, + { + "epoch": 0.45236188951160927, + "grad_norm": 3.8824405670166016, + "learning_rate": 1.6549567435950004e-05, + "loss": 0.4716, + "step": 2258 + }, + { + "epoch": 0.45236188951160927, + "grad_norm": 2.1784331798553467, + "learning_rate": 1.6560121516856586e-05, + "loss": 0.0419, + "step": 2260 + }, + { + "epoch": 0.45316253002401924, + "grad_norm": 8.197497367858887, + "learning_rate": 1.6570662785703713e-05, + "loss": 0.7485, + "step": 2262 + }, + { + "epoch": 0.45316253002401924, + "grad_norm": 2.758165121078491, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.0822, + "step": 2264 + }, + { + "epoch": 0.45396317053642915, + "grad_norm": 6.236542224884033, + "learning_rate": 1.6591706804895408e-05, + "loss": 0.409, + "step": 2266 + }, + { + "epoch": 0.45396317053642915, + "grad_norm": 0.3710917532444, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.0656, + "step": 2268 + }, + { + "epoch": 0.45476381104883906, + "grad_norm": 11.119144439697266, + "learning_rate": 1.6612699329127457e-05, + "loss": 0.2257, + "step": 2270 + }, + { + "epoch": 0.45476381104883906, + "grad_norm": 1.4367777109146118, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.1516, + "step": 2272 + }, + { + "epoch": 0.455564451561249, + "grad_norm": 4.863069534301758, + "learning_rate": 1.6633640194404523e-05, + "loss": 0.6148, + "step": 2274 + }, + { + "epoch": 0.455564451561249, + "grad_norm": 4.463088035583496, + "learning_rate": 1.6644091203796694e-05, + "loss": 0.2555, + "step": 2276 + }, + { + "epoch": 0.45636509207365894, + "grad_norm": 0.8924980759620667, + "learning_rate": 1.6654529237134816e-05, + "loss": 0.0583, + "step": 2278 + }, + { + "epoch": 0.45636509207365894, + "grad_norm": 1.1015686988830566, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.0507, + "step": 2280 + }, + { + "epoch": 0.45716573258606885, + "grad_norm": 0.5944099426269531, + "learning_rate": 1.667536629413143e-05, + "loss": 0.3839, + "step": 2282 + }, + { + "epoch": 0.45716573258606885, + "grad_norm": 2.5762908458709717, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.0904, + "step": 2284 + }, + { + "epoch": 0.45796637309847876, + "grad_norm": 7.209502696990967, + "learning_rate": 1.6696151202613527e-05, + "loss": 0.6175, + "step": 2286 + }, + { + "epoch": 0.45796637309847876, + "grad_norm": 3.0320687294006348, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.2009, + "step": 2288 + }, + { + "epoch": 0.45876701361088873, + "grad_norm": 4.820057392120361, + "learning_rate": 1.6716883800207685e-05, + "loss": 0.4419, + "step": 2290 + }, + { + "epoch": 0.45876701361088873, + "grad_norm": 2.6655876636505127, + "learning_rate": 1.6727230431791806e-05, + "loss": 0.2465, + "step": 2292 + }, + { + "epoch": 0.45956765412329864, + "grad_norm": 16.09896469116211, + "learning_rate": 1.673756392494915e-05, + "loss": 0.3331, + "step": 2294 + }, + { + "epoch": 0.45956765412329864, + "grad_norm": 0.8991698026657104, + "learning_rate": 1.674788425949818e-05, + "loss": 0.1508, + "step": 2296 + }, + { + "epoch": 0.46036829463570855, + "grad_norm": 2.0800375938415527, + "learning_rate": 1.6758191415283063e-05, + "loss": 0.1113, + "step": 2298 + }, + { + "epoch": 0.46036829463570855, + "grad_norm": 1.6421167850494385, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.1364, + "step": 2300 + }, + { + "epoch": 0.4611689351481185, + "grad_norm": 1.4822124242782593, + "learning_rate": 1.6778766110065755e-05, + "loss": 0.1378, + "step": 2302 + }, + { + "epoch": 0.4611689351481185, + "grad_norm": 2.3338429927825928, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.1092, + "step": 2304 + }, + { + "epoch": 0.46196957566052843, + "grad_norm": 10.819419860839844, + "learning_rate": 1.6799287848566e-05, + "loss": 1.1841, + "step": 2306 + }, + { + "epoch": 0.46196957566052843, + "grad_norm": 0.12212887406349182, + "learning_rate": 1.6809528809094798e-05, + "loss": 0.1225, + "step": 2308 + }, + { + "epoch": 0.46277021617293834, + "grad_norm": 2.0545661449432373, + "learning_rate": 1.6819756470466305e-05, + "loss": 0.5412, + "step": 2310 + }, + { + "epoch": 0.46277021617293834, + "grad_norm": 2.592833995819092, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.3103, + "step": 2312 + }, + { + "epoch": 0.46357085668534825, + "grad_norm": 1.0092791318893433, + "learning_rate": 1.684017181586408e-05, + "loss": 0.2568, + "step": 2314 + }, + { + "epoch": 0.46357085668534825, + "grad_norm": 0.3566044569015503, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.0602, + "step": 2316 + }, + { + "epoch": 0.4643714971977582, + "grad_norm": 7.5693678855896, + "learning_rate": 1.6860533725272943e-05, + "loss": 0.7765, + "step": 2318 + }, + { + "epoch": 0.4643714971977582, + "grad_norm": 1.3673146963119507, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.1604, + "step": 2320 + }, + { + "epoch": 0.46517213771016813, + "grad_norm": 6.080178737640381, + "learning_rate": 1.6880842039624e-05, + "loss": 0.5968, + "step": 2322 + }, + { + "epoch": 0.46517213771016813, + "grad_norm": 8.150190353393555, + "learning_rate": 1.689097604905826e-05, + "loss": 0.2065, + "step": 2324 + }, + { + "epoch": 0.46597277822257804, + "grad_norm": 0.9336661100387573, + "learning_rate": 1.6901096600267e-05, + "loss": 0.4711, + "step": 2326 + }, + { + "epoch": 0.46597277822257804, + "grad_norm": 2.2692923545837402, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.2382, + "step": 2328 + }, + { + "epoch": 0.466773418734988, + "grad_norm": 2.6562089920043945, + "learning_rate": 1.6921297248971645e-05, + "loss": 0.545, + "step": 2330 + }, + { + "epoch": 0.466773418734988, + "grad_norm": 6.607024192810059, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.321, + "step": 2332 + }, + { + "epoch": 0.4675740592473979, + "grad_norm": 1.2300655841827393, + "learning_rate": 1.6941443827928778e-05, + "loss": 0.2077, + "step": 2334 + }, + { + "epoch": 0.4675740592473979, + "grad_norm": 2.7884891033172607, + "learning_rate": 1.695149679205214e-05, + "loss": 0.2076, + "step": 2336 + }, + { + "epoch": 0.46837469975980783, + "grad_norm": 0.9983940124511719, + "learning_rate": 1.6961536179751672e-05, + "loss": 0.3984, + "step": 2338 + }, + { + "epoch": 0.46837469975980783, + "grad_norm": 3.441647529602051, + "learning_rate": 1.6971561971420222e-05, + "loss": 0.3795, + "step": 2340 + }, + { + "epoch": 0.4691753402722178, + "grad_norm": 2.8214025497436523, + "learning_rate": 1.6981574147477204e-05, + "loss": 0.4377, + "step": 2342 + }, + { + "epoch": 0.4691753402722178, + "grad_norm": 1.9519199132919312, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.1256, + "step": 2344 + }, + { + "epoch": 0.4699759807846277, + "grad_norm": 4.035347938537598, + "learning_rate": 1.70015575745671e-05, + "loss": 0.3636, + "step": 2346 + }, + { + "epoch": 0.4699759807846277, + "grad_norm": 0.9321067929267883, + "learning_rate": 1.701152878657196e-05, + "loss": 0.1368, + "step": 2348 + }, + { + "epoch": 0.4707766212970376, + "grad_norm": 3.558406352996826, + "learning_rate": 1.7021486304909196e-05, + "loss": 0.4537, + "step": 2350 + }, + { + "epoch": 0.4707766212970376, + "grad_norm": 3.0133163928985596, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.2049, + "step": 2352 + }, + { + "epoch": 0.47157726180944753, + "grad_norm": 4.378469467163086, + "learning_rate": 1.7041360182818583e-05, + "loss": 0.4138, + "step": 2354 + }, + { + "epoch": 0.47157726180944753, + "grad_norm": 0.4633253514766693, + "learning_rate": 1.705127650357662e-05, + "loss": 0.1537, + "step": 2356 + }, + { + "epoch": 0.4723779023218575, + "grad_norm": 1.3094156980514526, + "learning_rate": 1.7061179053038887e-05, + "loss": 0.1857, + "step": 2358 + }, + { + "epoch": 0.4723779023218575, + "grad_norm": 0.789790153503418, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.0728, + "step": 2360 + }, + { + "epoch": 0.4731785428342674, + "grad_norm": 0.19915759563446045, + "learning_rate": 1.708094276074343e-05, + "loss": 0.5375, + "step": 2362 + }, + { + "epoch": 0.4731785428342674, + "grad_norm": 0.7729790210723877, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.1038, + "step": 2364 + }, + { + "epoch": 0.4739791833466773, + "grad_norm": 1.3428341150283813, + "learning_rate": 1.7100651151536525e-05, + "loss": 0.1201, + "step": 2366 + }, + { + "epoch": 0.4739791833466773, + "grad_norm": 1.4603532552719116, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.1082, + "step": 2368 + }, + { + "epoch": 0.4747798238590873, + "grad_norm": 1.408722162246704, + "learning_rate": 1.712030407145457e-05, + "loss": 0.3275, + "step": 2370 + }, + { + "epoch": 0.4747798238590873, + "grad_norm": 1.4733169078826904, + "learning_rate": 1.713010968184029e-05, + "loss": 0.1447, + "step": 2372 + }, + { + "epoch": 0.4755804643714972, + "grad_norm": 3.4286673069000244, + "learning_rate": 1.7139901366967332e-05, + "loss": 0.7155, + "step": 2374 + }, + { + "epoch": 0.4755804643714972, + "grad_norm": 2.239189386367798, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.0683, + "step": 2376 + }, + { + "epoch": 0.4763811048839071, + "grad_norm": 4.584059238433838, + "learning_rate": 1.71594428849791e-05, + "loss": 0.3346, + "step": 2378 + }, + { + "epoch": 0.4763811048839071, + "grad_norm": 7.055164337158203, + "learning_rate": 1.716919267969883e-05, + "loss": 0.3488, + "step": 2380 + }, + { + "epoch": 0.4771817453963171, + "grad_norm": 3.2347970008850098, + "learning_rate": 1.717892847282994e-05, + "loss": 0.3658, + "step": 2382 + }, + { + "epoch": 0.4771817453963171, + "grad_norm": 1.707948088645935, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.3147, + "step": 2384 + }, + { + "epoch": 0.477982385908727, + "grad_norm": 3.585576295852661, + "learning_rate": 1.7198357978296817e-05, + "loss": 0.1852, + "step": 2386 + }, + { + "epoch": 0.477982385908727, + "grad_norm": 2.0706615447998047, + "learning_rate": 1.7208051652686338e-05, + "loss": 0.1725, + "step": 2388 + }, + { + "epoch": 0.4787830264211369, + "grad_norm": 3.843757152557373, + "learning_rate": 1.721773124959481e-05, + "loss": 0.304, + "step": 2390 + }, + { + "epoch": 0.4787830264211369, + "grad_norm": 0.9396626949310303, + "learning_rate": 1.722739675011779e-05, + "loss": 0.2406, + "step": 2392 + }, + { + "epoch": 0.4795836669335468, + "grad_norm": 2.6398236751556396, + "learning_rate": 1.723704813537834e-05, + "loss": 0.4965, + "step": 2394 + }, + { + "epoch": 0.4795836669335468, + "grad_norm": 1.556936264038086, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.198, + "step": 2396 + }, + { + "epoch": 0.4803843074459568, + "grad_norm": 3.1677372455596924, + "learning_rate": 1.725630848474229e-05, + "loss": 0.473, + "step": 2398 + }, + { + "epoch": 0.4803843074459568, + "grad_norm": 2.35331392288208, + "learning_rate": 1.726591741122981e-05, + "loss": 0.188, + "step": 2400 + }, + { + "epoch": 0.4811849479583667, + "grad_norm": 2.03420090675354, + "learning_rate": 1.727551214722321e-05, + "loss": 0.3617, + "step": 2402 + }, + { + "epoch": 0.4811849479583667, + "grad_norm": 0.8967152833938599, + "learning_rate": 1.7285092673983753e-05, + "loss": 0.2393, + "step": 2404 + }, + { + "epoch": 0.4819855884707766, + "grad_norm": 1.2701177597045898, + "learning_rate": 1.7294658972800488e-05, + "loss": 0.154, + "step": 2406 + }, + { + "epoch": 0.4819855884707766, + "grad_norm": 6.1322526931762695, + "learning_rate": 1.730421102499021e-05, + "loss": 0.155, + "step": 2408 + }, + { + "epoch": 0.48278622898318657, + "grad_norm": 2.2564525604248047, + "learning_rate": 1.7313748811897558e-05, + "loss": 0.297, + "step": 2410 + }, + { + "epoch": 0.48278622898318657, + "grad_norm": 0.08073009550571442, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.0729, + "step": 2412 + }, + { + "epoch": 0.4835868694955965, + "grad_norm": 3.875516891479492, + "learning_rate": 1.7332781515383003e-05, + "loss": 0.3041, + "step": 2414 + }, + { + "epoch": 0.4835868694955965, + "grad_norm": 2.8866536617279053, + "learning_rate": 1.734227639478982e-05, + "loss": 0.1382, + "step": 2416 + }, + { + "epoch": 0.4843875100080064, + "grad_norm": 7.96664571762085, + "learning_rate": 1.7351756934571758e-05, + "loss": 0.3989, + "step": 2418 + }, + { + "epoch": 0.4843875100080064, + "grad_norm": 0.27675095200538635, + "learning_rate": 1.736122311621314e-05, + "loss": 0.0583, + "step": 2420 + }, + { + "epoch": 0.4851881505204163, + "grad_norm": 8.195183753967285, + "learning_rate": 1.7370674921226296e-05, + "loss": 1.0319, + "step": 2422 + }, + { + "epoch": 0.4851881505204163, + "grad_norm": 4.048768520355225, + "learning_rate": 1.738011233115165e-05, + "loss": 0.2816, + "step": 2424 + }, + { + "epoch": 0.4859887910328263, + "grad_norm": 10.002546310424805, + "learning_rate": 1.7389535327557733e-05, + "loss": 0.9037, + "step": 2426 + }, + { + "epoch": 0.4859887910328263, + "grad_norm": 2.108306884765625, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.1944, + "step": 2428 + }, + { + "epoch": 0.4867894315452362, + "grad_norm": 4.3191118240356445, + "learning_rate": 1.7408338006227005e-05, + "loss": 0.4968, + "step": 2430 + }, + { + "epoch": 0.4867894315452362, + "grad_norm": 0.4869023859500885, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.1393, + "step": 2432 + }, + { + "epoch": 0.4875900720576461, + "grad_norm": 6.874834060668945, + "learning_rate": 1.7427082810346018e-05, + "loss": 0.5135, + "step": 2434 + }, + { + "epoch": 0.4875900720576461, + "grad_norm": 1.5692269802093506, + "learning_rate": 1.743643346367026e-05, + "loss": 0.2538, + "step": 2436 + }, + { + "epoch": 0.48839071257005606, + "grad_norm": 4.374287128448486, + "learning_rate": 1.744576959347884e-05, + "loss": 0.4713, + "step": 2438 + }, + { + "epoch": 0.48839071257005606, + "grad_norm": 3.195158004760742, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.0935, + "step": 2440 + }, + { + "epoch": 0.489191353082466, + "grad_norm": 2.4633028507232666, + "learning_rate": 1.746439820964275e-05, + "loss": 0.343, + "step": 2442 + }, + { + "epoch": 0.489191353082466, + "grad_norm": 6.474045276641846, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.1495, + "step": 2444 + }, + { + "epoch": 0.4899919935948759, + "grad_norm": 2.9856784343719482, + "learning_rate": 1.748296851330945e-05, + "loss": 0.2533, + "step": 2446 + }, + { + "epoch": 0.4899919935948759, + "grad_norm": 1.8277050256729126, + "learning_rate": 1.74922317526033e-05, + "loss": 0.2471, + "step": 2448 + }, + { + "epoch": 0.49079263410728585, + "grad_norm": 2.0432722568511963, + "learning_rate": 1.7501480359406217e-05, + "loss": 0.2378, + "step": 2450 + }, + { + "epoch": 0.49079263410728585, + "grad_norm": 0.5191777944564819, + "learning_rate": 1.7510714315655474e-05, + "loss": 0.1393, + "step": 2452 + }, + { + "epoch": 0.49159327461969576, + "grad_norm": 1.4057385921478271, + "learning_rate": 1.7519933603316955e-05, + "loss": 0.296, + "step": 2454 + }, + { + "epoch": 0.49159327461969576, + "grad_norm": 1.0543240308761597, + "learning_rate": 1.752913820438519e-05, + "loss": 0.0645, + "step": 2456 + }, + { + "epoch": 0.4923939151321057, + "grad_norm": 4.538109302520752, + "learning_rate": 1.7538328100883397e-05, + "loss": 0.5136, + "step": 2458 + }, + { + "epoch": 0.4923939151321057, + "grad_norm": 3.8467319011688232, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.3972, + "step": 2460 + }, + { + "epoch": 0.4931945556445156, + "grad_norm": 1.5259850025177002, + "learning_rate": 1.7556663708406193e-05, + "loss": 0.1898, + "step": 2462 + }, + { + "epoch": 0.4931945556445156, + "grad_norm": 1.7327384948730469, + "learning_rate": 1.756580938362096e-05, + "loss": 0.1585, + "step": 2464 + }, + { + "epoch": 0.49399519615692555, + "grad_norm": 7.1075334548950195, + "learning_rate": 1.7574940282646085e-05, + "loss": 0.6449, + "step": 2466 + }, + { + "epoch": 0.49399519615692555, + "grad_norm": 0.27413666248321533, + "learning_rate": 1.758405638764873e-05, + "loss": 0.1978, + "step": 2468 + }, + { + "epoch": 0.49479583666933546, + "grad_norm": 4.352715015411377, + "learning_rate": 1.7593157680824946e-05, + "loss": 0.2902, + "step": 2470 + }, + { + "epoch": 0.49479583666933546, + "grad_norm": 2.5614006519317627, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.1663, + "step": 2472 + }, + { + "epoch": 0.4955964771817454, + "grad_norm": 6.705410480499268, + "learning_rate": 1.761131576062694e-05, + "loss": 0.7571, + "step": 2474 + }, + { + "epoch": 0.4955964771817454, + "grad_norm": 1.084343433380127, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.2377, + "step": 2476 + }, + { + "epoch": 0.49639711769415534, + "grad_norm": 6.248439311981201, + "learning_rate": 1.7629414380199662e-05, + "loss": 0.219, + "step": 2478 + }, + { + "epoch": 0.49639711769415534, + "grad_norm": 5.443384170532227, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.1282, + "step": 2480 + }, + { + "epoch": 0.49719775820656525, + "grad_norm": 7.800180435180664, + "learning_rate": 1.7647453398155194e-05, + "loss": 0.4507, + "step": 2482 + }, + { + "epoch": 0.49719775820656525, + "grad_norm": 1.5663750171661377, + "learning_rate": 1.7656450512470077e-05, + "loss": 0.0647, + "step": 2484 + }, + { + "epoch": 0.49799839871897517, + "grad_norm": 5.80070686340332, + "learning_rate": 1.7665432673571218e-05, + "loss": 0.8264, + "step": 2486 + }, + { + "epoch": 0.49799839871897517, + "grad_norm": 0.6345700025558472, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.1908, + "step": 2488 + }, + { + "epoch": 0.49879903923138513, + "grad_norm": 10.182790756225586, + "learning_rate": 1.768335206599217e-05, + "loss": 1.0563, + "step": 2490 + }, + { + "epoch": 0.49879903923138513, + "grad_norm": 2.9547836780548096, + "learning_rate": 1.7692289262315e-05, + "loss": 0.3394, + "step": 2492 + }, + { + "epoch": 0.49959967974379504, + "grad_norm": 2.4862353801727295, + "learning_rate": 1.7701211435430256e-05, + "loss": 0.3833, + "step": 2494 + }, + { + "epoch": 0.49959967974379504, + "grad_norm": 2.324763536453247, + "learning_rate": 1.771011856791273e-05, + "loss": 0.2563, + "step": 2496 + }, + { + "epoch": 0.500400320256205, + "grad_norm": 2.483809471130371, + "learning_rate": 1.771901064236659e-05, + "loss": 0.2903, + "step": 2498 + }, + { + "epoch": 0.500400320256205, + "grad_norm": 0.9356859922409058, + "learning_rate": 1.7727887641425448e-05, + "loss": 0.0811, + "step": 2500 + }, + { + "epoch": 0.5012009607686149, + "grad_norm": 5.978772163391113, + "learning_rate": 1.773674954775232e-05, + "loss": 0.4202, + "step": 2502 + }, + { + "epoch": 0.5012009607686149, + "grad_norm": 2.630692958831787, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.1821, + "step": 2504 + }, + { + "epoch": 0.5020016012810248, + "grad_norm": 4.402559280395508, + "learning_rate": 1.7754428013009637e-05, + "loss": 0.2226, + "step": 2506 + }, + { + "epoch": 0.5020016012810248, + "grad_norm": 2.897047281265259, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.2155, + "step": 2508 + }, + { + "epoch": 0.5028022417934348, + "grad_norm": 2.68210506439209, + "learning_rate": 1.77720459000329e-05, + "loss": 0.3834, + "step": 2510 + }, + { + "epoch": 0.5028022417934348, + "grad_norm": 2.750938892364502, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.3047, + "step": 2512 + }, + { + "epoch": 0.5036028823058447, + "grad_norm": 1.8552926778793335, + "learning_rate": 1.7789603071189712e-05, + "loss": 0.1072, + "step": 2514 + }, + { + "epoch": 0.5036028823058447, + "grad_norm": 1.0018991231918335, + "learning_rate": 1.7798358845437754e-05, + "loss": 0.1853, + "step": 2516 + }, + { + "epoch": 0.5044035228182546, + "grad_norm": 1.8009657859802246, + "learning_rate": 1.780709938932202e-05, + "loss": 0.1868, + "step": 2518 + }, + { + "epoch": 0.5044035228182546, + "grad_norm": 0.051243290305137634, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.1754, + "step": 2520 + }, + { + "epoch": 0.5052041633306645, + "grad_norm": 7.955173015594482, + "learning_rate": 1.7824534717747115e-05, + "loss": 0.4096, + "step": 2522 + }, + { + "epoch": 0.5052041633306645, + "grad_norm": 1.14535653591156, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.1483, + "step": 2524 + }, + { + "epoch": 0.5060048038430744, + "grad_norm": 10.273138046264648, + "learning_rate": 1.7841908920258767e-05, + "loss": 0.6373, + "step": 2526 + }, + { + "epoch": 0.5060048038430744, + "grad_norm": 6.1071882247924805, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.4884, + "step": 2528 + }, + { + "epoch": 0.5068054443554844, + "grad_norm": 1.9639047384262085, + "learning_rate": 1.7859221861128284e-05, + "loss": 0.2467, + "step": 2530 + }, + { + "epoch": 0.5068054443554844, + "grad_norm": 1.9897651672363281, + "learning_rate": 1.786785531616285e-05, + "loss": 0.2782, + "step": 2532 + }, + { + "epoch": 0.5076060848678943, + "grad_norm": 4.995655536651611, + "learning_rate": 1.7876473405105528e-05, + "loss": 0.5775, + "step": 2534 + }, + { + "epoch": 0.5076060848678943, + "grad_norm": 4.143044948577881, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.2505, + "step": 2536 + }, + { + "epoch": 0.5084067253803043, + "grad_norm": 4.438729763031006, + "learning_rate": 1.7893663417419995e-05, + "loss": 0.3481, + "step": 2538 + }, + { + "epoch": 0.5084067253803043, + "grad_norm": 3.323035955429077, + "learning_rate": 1.790223530721933e-05, + "loss": 0.3092, + "step": 2540 + }, + { + "epoch": 0.5092073658927142, + "grad_norm": 8.580005645751953, + "learning_rate": 1.791079176378191e-05, + "loss": 0.4576, + "step": 2542 + }, + { + "epoch": 0.5092073658927142, + "grad_norm": 1.0588291883468628, + "learning_rate": 1.791933277039679e-05, + "loss": 0.2014, + "step": 2544 + }, + { + "epoch": 0.5100080064051241, + "grad_norm": 3.6984071731567383, + "learning_rate": 1.7927858310383202e-05, + "loss": 0.1978, + "step": 2546 + }, + { + "epoch": 0.5100080064051241, + "grad_norm": 0.2238209992647171, + "learning_rate": 1.7936368367090577e-05, + "loss": 0.1694, + "step": 2548 + }, + { + "epoch": 0.510808646917534, + "grad_norm": 3.318337917327881, + "learning_rate": 1.794486292389858e-05, + "loss": 0.2446, + "step": 2550 + }, + { + "epoch": 0.510808646917534, + "grad_norm": 5.371819972991943, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.2902, + "step": 2552 + }, + { + "epoch": 0.5116092874299439, + "grad_norm": 4.489548206329346, + "learning_rate": 1.7961805471486618e-05, + "loss": 0.3701, + "step": 2554 + }, + { + "epoch": 0.5116092874299439, + "grad_norm": 3.927816867828369, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.1942, + "step": 2556 + }, + { + "epoch": 0.5124099279423538, + "grad_norm": 1.2281385660171509, + "learning_rate": 1.797868582079072e-05, + "loss": 0.1255, + "step": 2558 + }, + { + "epoch": 0.5124099279423538, + "grad_norm": 0.37840694189071655, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.0917, + "step": 2560 + }, + { + "epoch": 0.5132105684547638, + "grad_norm": 1.2677125930786133, + "learning_rate": 1.7995503839940197e-05, + "loss": 0.0595, + "step": 2562 + }, + { + "epoch": 0.5132105684547638, + "grad_norm": 1.294288992881775, + "learning_rate": 1.800388943463047e-05, + "loss": 0.2284, + "step": 2564 + }, + { + "epoch": 0.5140112089671738, + "grad_norm": 2.4906344413757324, + "learning_rate": 1.8012259397551283e-05, + "loss": 0.1217, + "step": 2566 + }, + { + "epoch": 0.5140112089671738, + "grad_norm": 0.6130330562591553, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.0587, + "step": 2568 + }, + { + "epoch": 0.5148118494795837, + "grad_norm": 1.9814350605010986, + "learning_rate": 1.8028952362728197e-05, + "loss": 0.1586, + "step": 2570 + }, + { + "epoch": 0.5148118494795837, + "grad_norm": 1.3547848463058472, + "learning_rate": 1.803727533238257e-05, + "loss": 0.1029, + "step": 2572 + }, + { + "epoch": 0.5156124899919936, + "grad_norm": 3.740375280380249, + "learning_rate": 1.804558260506409e-05, + "loss": 0.2901, + "step": 2574 + }, + { + "epoch": 0.5156124899919936, + "grad_norm": 2.0951552391052246, + "learning_rate": 1.805387416454847e-05, + "loss": 0.1761, + "step": 2576 + }, + { + "epoch": 0.5164131305044035, + "grad_norm": 5.398137092590332, + "learning_rate": 1.8062149994642135e-05, + "loss": 0.3799, + "step": 2578 + }, + { + "epoch": 0.5164131305044035, + "grad_norm": 2.7941036224365234, + "learning_rate": 1.8070410079182195e-05, + "loss": 0.3573, + "step": 2580 + }, + { + "epoch": 0.5172137710168134, + "grad_norm": 6.3737568855285645, + "learning_rate": 1.8078654402036526e-05, + "loss": 0.153, + "step": 2582 + }, + { + "epoch": 0.5172137710168134, + "grad_norm": 0.9410229921340942, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.0538, + "step": 2584 + }, + { + "epoch": 0.5180144115292233, + "grad_norm": 8.123785972595215, + "learning_rate": 1.8095095698313452e-05, + "loss": 0.2778, + "step": 2586 + }, + { + "epoch": 0.5180144115292233, + "grad_norm": 0.5919559001922607, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.1416, + "step": 2588 + }, + { + "epoch": 0.5188150520416333, + "grad_norm": 1.1283785104751587, + "learning_rate": 1.811147375503214e-05, + "loss": 0.353, + "step": 2590 + }, + { + "epoch": 0.5188150520416333, + "grad_norm": 6.769442081451416, + "learning_rate": 1.811963902855447e-05, + "loss": 0.1069, + "step": 2592 + }, + { + "epoch": 0.5196156925540433, + "grad_norm": 6.771814346313477, + "learning_rate": 1.812778844424587e-05, + "loss": 0.552, + "step": 2594 + }, + { + "epoch": 0.5196156925540433, + "grad_norm": 2.2799465656280518, + "learning_rate": 1.813592198619035e-05, + "loss": 0.3411, + "step": 2596 + }, + { + "epoch": 0.5204163330664532, + "grad_norm": 5.2547149658203125, + "learning_rate": 1.814403963850293e-05, + "loss": 0.6363, + "step": 2598 + }, + { + "epoch": 0.5204163330664532, + "grad_norm": 0.09771018475294113, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.0444, + "step": 2600 + }, + { + "epoch": 0.5212169735788631, + "grad_norm": 2.01021409034729, + "learning_rate": 1.8160227210847636e-05, + "loss": 0.1992, + "step": 2602 + }, + { + "epoch": 0.5212169735788631, + "grad_norm": 2.247833490371704, + "learning_rate": 1.816829709926509e-05, + "loss": 0.0658, + "step": 2604 + }, + { + "epoch": 0.522017614091273, + "grad_norm": 2.6412012577056885, + "learning_rate": 1.8176351034821345e-05, + "loss": 0.1646, + "step": 2606 + }, + { + "epoch": 0.522017614091273, + "grad_norm": 0.15333357453346252, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.0369, + "step": 2608 + }, + { + "epoch": 0.5228182546036829, + "grad_norm": 8.662123680114746, + "learning_rate": 1.819241098446341e-05, + "loss": 0.7808, + "step": 2610 + }, + { + "epoch": 0.5228182546036829, + "grad_norm": 3.445805549621582, + "learning_rate": 1.8200416967183785e-05, + "loss": 0.0792, + "step": 2612 + }, + { + "epoch": 0.5236188951160928, + "grad_norm": 6.853911399841309, + "learning_rate": 1.8208406934312167e-05, + "loss": 0.2299, + "step": 2614 + }, + { + "epoch": 0.5236188951160928, + "grad_norm": 1.2726572751998901, + "learning_rate": 1.821638087024396e-05, + "loss": 0.1062, + "step": 2616 + }, + { + "epoch": 0.5244195356285029, + "grad_norm": 17.77869987487793, + "learning_rate": 1.8224338759405917e-05, + "loss": 0.6223, + "step": 2618 + }, + { + "epoch": 0.5244195356285029, + "grad_norm": 2.5284688472747803, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.1198, + "step": 2620 + }, + { + "epoch": 0.5252201761409128, + "grad_norm": 2.2292234897613525, + "learning_rate": 1.8240206335283947e-05, + "loss": 0.225, + "step": 2622 + }, + { + "epoch": 0.5252201761409128, + "grad_norm": 6.948677062988281, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.1953, + "step": 2624 + }, + { + "epoch": 0.5260208166533227, + "grad_norm": 1.9778485298156738, + "learning_rate": 1.825600953798743e-05, + "loss": 0.2896, + "step": 2626 + }, + { + "epoch": 0.5260208166533227, + "grad_norm": 5.957225799560547, + "learning_rate": 1.8263886960799055e-05, + "loss": 0.0928, + "step": 2628 + }, + { + "epoch": 0.5268214571657326, + "grad_norm": 3.9544565677642822, + "learning_rate": 1.8271748244060426e-05, + "loss": 0.3481, + "step": 2630 + }, + { + "epoch": 0.5268214571657326, + "grad_norm": 0.31938138604164124, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.1422, + "step": 2632 + }, + { + "epoch": 0.5276220976781425, + "grad_norm": 7.159646987915039, + "learning_rate": 1.8287422330550878e-05, + "loss": 0.3043, + "step": 2634 + }, + { + "epoch": 0.5276220976781425, + "grad_norm": 4.154680252075195, + "learning_rate": 1.829523510316813e-05, + "loss": 0.2057, + "step": 2636 + }, + { + "epoch": 0.5284227381905524, + "grad_norm": 8.218935012817383, + "learning_rate": 1.8303031675011515e-05, + "loss": 0.982, + "step": 2638 + }, + { + "epoch": 0.5284227381905524, + "grad_norm": 1.475289225578308, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.2325, + "step": 2640 + }, + { + "epoch": 0.5292233787029623, + "grad_norm": 6.610187530517578, + "learning_rate": 1.8318576155500838e-05, + "loss": 0.6174, + "step": 2642 + }, + { + "epoch": 0.5292233787029623, + "grad_norm": 0.15218967199325562, + "learning_rate": 1.832632403378808e-05, + "loss": 0.2298, + "step": 2644 + }, + { + "epoch": 0.5300240192153723, + "grad_norm": 6.905365943908691, + "learning_rate": 1.8334055650584094e-05, + "loss": 0.2874, + "step": 2646 + }, + { + "epoch": 0.5300240192153723, + "grad_norm": 2.7616374492645264, + "learning_rate": 1.834177099078887e-05, + "loss": 0.2309, + "step": 2648 + }, + { + "epoch": 0.5308246597277823, + "grad_norm": 1.1723989248275757, + "learning_rate": 1.8349470039334173e-05, + "loss": 0.2478, + "step": 2650 + }, + { + "epoch": 0.5308246597277823, + "grad_norm": 3.5114617347717285, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.1367, + "step": 2652 + }, + { + "epoch": 0.5316253002401922, + "grad_norm": 7.585908889770508, + "learning_rate": 1.83648192013326e-05, + "loss": 0.7299, + "step": 2654 + }, + { + "epoch": 0.5316253002401922, + "grad_norm": 3.80045485496521, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.216, + "step": 2656 + }, + { + "epoch": 0.5324259407526021, + "grad_norm": 2.580702543258667, + "learning_rate": 1.8380103016670437e-05, + "loss": 0.3801, + "step": 2658 + }, + { + "epoch": 0.5324259407526021, + "grad_norm": 4.191203594207764, + "learning_rate": 1.8387720382009665e-05, + "loss": 0.16, + "step": 2660 + }, + { + "epoch": 0.533226581265012, + "grad_norm": 7.475527286529541, + "learning_rate": 1.839532136594927e-05, + "loss": 0.3637, + "step": 2662 + }, + { + "epoch": 0.533226581265012, + "grad_norm": 4.064618110656738, + "learning_rate": 1.840290595364436e-05, + "loss": 0.1681, + "step": 2664 + }, + { + "epoch": 0.5340272217774219, + "grad_norm": 3.054591655731201, + "learning_rate": 1.8410474130282085e-05, + "loss": 0.3955, + "step": 2666 + }, + { + "epoch": 0.5340272217774219, + "grad_norm": 2.7351677417755127, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.0994, + "step": 2668 + }, + { + "epoch": 0.5348278622898318, + "grad_norm": 2.752753734588623, + "learning_rate": 1.8425561191294217e-05, + "loss": 0.2845, + "step": 2670 + }, + { + "epoch": 0.5348278622898318, + "grad_norm": 0.2480597198009491, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.0646, + "step": 2672 + }, + { + "epoch": 0.5356285028022418, + "grad_norm": 5.378298282623291, + "learning_rate": 1.8440582431124325e-05, + "loss": 0.6358, + "step": 2674 + }, + { + "epoch": 0.5356285028022418, + "grad_norm": 1.8441389799118042, + "learning_rate": 1.844806833140501e-05, + "loss": 0.2376, + "step": 2676 + }, + { + "epoch": 0.5364291433146517, + "grad_norm": 3.1350090503692627, + "learning_rate": 1.8455537732425223e-05, + "loss": 0.4086, + "step": 2678 + }, + { + "epoch": 0.5364291433146517, + "grad_norm": 3.6409738063812256, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.2522, + "step": 2680 + }, + { + "epoch": 0.5372297838270617, + "grad_norm": 3.5256221294403076, + "learning_rate": 1.847042697836485e-05, + "loss": 0.3031, + "step": 2682 + }, + { + "epoch": 0.5372297838270617, + "grad_norm": 1.7345776557922363, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.1071, + "step": 2684 + }, + { + "epoch": 0.5380304243394716, + "grad_norm": 1.1631652116775513, + "learning_rate": 1.84852500526272e-05, + "loss": 0.2389, + "step": 2686 + }, + { + "epoch": 0.5380304243394716, + "grad_norm": 0.9893761277198792, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.1218, + "step": 2688 + }, + { + "epoch": 0.5388310648518815, + "grad_norm": 3.71940279006958, + "learning_rate": 1.8500006839413183e-05, + "loss": 0.5025, + "step": 2690 + }, + { + "epoch": 0.5388310648518815, + "grad_norm": 1.243133306503296, + "learning_rate": 1.85073603389569e-05, + "loss": 0.0996, + "step": 2692 + }, + { + "epoch": 0.5396317053642914, + "grad_norm": 2.8305013179779053, + "learning_rate": 1.851469722344155e-05, + "loss": 0.1329, + "step": 2694 + }, + { + "epoch": 0.5396317053642914, + "grad_norm": 1.6187266111373901, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.1173, + "step": 2696 + }, + { + "epoch": 0.5404323458767014, + "grad_norm": 12.070884704589844, + "learning_rate": 1.8529321089949817e-05, + "loss": 1.0966, + "step": 2698 + }, + { + "epoch": 0.5404323458767014, + "grad_norm": 1.944111704826355, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.3127, + "step": 2700 + }, + { + "epoch": 0.5412329863891113, + "grad_norm": 2.8052823543548584, + "learning_rate": 1.8543878324695122e-05, + "loss": 0.1712, + "step": 2702 + }, + { + "epoch": 0.5412329863891113, + "grad_norm": 2.412444591522217, + "learning_rate": 1.855113191959808e-05, + "loss": 0.0908, + "step": 2704 + }, + { + "epoch": 0.5420336269015212, + "grad_norm": 1.242977261543274, + "learning_rate": 1.8558368813955143e-05, + "loss": 0.155, + "step": 2706 + }, + { + "epoch": 0.5420336269015212, + "grad_norm": 0.28852835297584534, + "learning_rate": 1.856558899363248e-05, + "loss": 0.0339, + "step": 2708 + }, + { + "epoch": 0.5428342674139311, + "grad_norm": 7.488397121429443, + "learning_rate": 1.857279244452896e-05, + "loss": 0.5704, + "step": 2710 + }, + { + "epoch": 0.5428342674139311, + "grad_norm": 3.18495512008667, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.1747, + "step": 2712 + }, + { + "epoch": 0.5436349079263411, + "grad_norm": 15.97856616973877, + "learning_rate": 1.8587149103738e-05, + "loss": 0.6608, + "step": 2714 + }, + { + "epoch": 0.5436349079263411, + "grad_norm": 2.240490198135376, + "learning_rate": 1.85943022840117e-05, + "loss": 0.297, + "step": 2716 + }, + { + "epoch": 0.544435548438751, + "grad_norm": 6.026638031005859, + "learning_rate": 1.8601438679426847e-05, + "loss": 0.4804, + "step": 2718 + }, + { + "epoch": 0.544435548438751, + "grad_norm": 1.4259699583053589, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.1898, + "step": 2720 + }, + { + "epoch": 0.5452361889511609, + "grad_norm": 9.018102645874023, + "learning_rate": 1.8615661059964134e-05, + "loss": 0.8301, + "step": 2722 + }, + { + "epoch": 0.5452361889511609, + "grad_norm": 1.8973910808563232, + "learning_rate": 1.862274701730967e-05, + "loss": 0.1767, + "step": 2724 + }, + { + "epoch": 0.5460368294635709, + "grad_norm": 1.8889931440353394, + "learning_rate": 1.862981613424347e-05, + "loss": 0.3943, + "step": 2726 + }, + { + "epoch": 0.5460368294635709, + "grad_norm": 2.367530345916748, + "learning_rate": 1.86368683969594e-05, + "loss": 0.1896, + "step": 2728 + }, + { + "epoch": 0.5468374699759808, + "grad_norm": 2.292177677154541, + "learning_rate": 1.864390379168423e-05, + "loss": 0.3615, + "step": 2730 + }, + { + "epoch": 0.5468374699759808, + "grad_norm": 1.510378360748291, + "learning_rate": 1.865092230467769e-05, + "loss": 0.0997, + "step": 2732 + }, + { + "epoch": 0.5476381104883907, + "grad_norm": 2.777294158935547, + "learning_rate": 1.8657923922232464e-05, + "loss": 0.293, + "step": 2734 + }, + { + "epoch": 0.5476381104883907, + "grad_norm": 1.9353588819503784, + "learning_rate": 1.866490863067425e-05, + "loss": 0.1718, + "step": 2736 + }, + { + "epoch": 0.5484387510008006, + "grad_norm": 2.5130770206451416, + "learning_rate": 1.8671876416361763e-05, + "loss": 0.2463, + "step": 2738 + }, + { + "epoch": 0.5484387510008006, + "grad_norm": 4.156477451324463, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.3576, + "step": 2740 + }, + { + "epoch": 0.5492393915132106, + "grad_norm": 2.3990230560302734, + "learning_rate": 1.8685761165074073e-05, + "loss": 0.4041, + "step": 2742 + }, + { + "epoch": 0.5492393915132106, + "grad_norm": 1.1864171028137207, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.1908, + "step": 2744 + }, + { + "epoch": 0.5500400320256205, + "grad_norm": 6.416761875152588, + "learning_rate": 1.869957805990059e-05, + "loss": 0.5133, + "step": 2746 + }, + { + "epoch": 0.5500400320256205, + "grad_norm": 1.625580072402954, + "learning_rate": 1.87064610283551e-05, + "loss": 0.2295, + "step": 2748 + }, + { + "epoch": 0.5508406725380304, + "grad_norm": 3.9066593647003174, + "learning_rate": 1.87133269929026e-05, + "loss": 0.2668, + "step": 2750 + }, + { + "epoch": 0.5508406725380304, + "grad_norm": 2.3005428314208984, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.0991, + "step": 2752 + }, + { + "epoch": 0.5516413130504404, + "grad_norm": 1.6140111684799194, + "learning_rate": 1.8727007856672285e-05, + "loss": 0.1863, + "step": 2754 + }, + { + "epoch": 0.5516413130504404, + "grad_norm": 1.3977134227752686, + "learning_rate": 1.873382272917545e-05, + "loss": 0.1073, + "step": 2756 + }, + { + "epoch": 0.5524419535628503, + "grad_norm": 7.789316654205322, + "learning_rate": 1.8740620544333607e-05, + "loss": 0.6326, + "step": 2758 + }, + { + "epoch": 0.5524419535628503, + "grad_norm": 5.34277868270874, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.5368, + "step": 2760 + }, + { + "epoch": 0.5532425940752602, + "grad_norm": 4.427399635314941, + "learning_rate": 1.875416494954312e-05, + "loss": 0.4881, + "step": 2762 + }, + { + "epoch": 0.5532425940752602, + "grad_norm": 1.3412363529205322, + "learning_rate": 1.876091151314196e-05, + "loss": 0.2306, + "step": 2764 + }, + { + "epoch": 0.5540432345876701, + "grad_norm": 0.48319554328918457, + "learning_rate": 1.8767640966490813e-05, + "loss": 0.1813, + "step": 2766 + }, + { + "epoch": 0.5540432345876701, + "grad_norm": 1.793357014656067, + "learning_rate": 1.877435329644691e-05, + "loss": 0.1239, + "step": 2768 + }, + { + "epoch": 0.55484387510008, + "grad_norm": 3.294755697250366, + "learning_rate": 1.878104848990093e-05, + "loss": 0.4842, + "step": 2770 + }, + { + "epoch": 0.55484387510008, + "grad_norm": 0.652742326259613, + "learning_rate": 1.8787726533776996e-05, + "loss": 0.1399, + "step": 2772 + }, + { + "epoch": 0.55564451561249, + "grad_norm": 7.438967704772949, + "learning_rate": 1.879438741503277e-05, + "loss": 0.5913, + "step": 2774 + }, + { + "epoch": 0.55564451561249, + "grad_norm": 3.6939921379089355, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.2894, + "step": 2776 + }, + { + "epoch": 0.5564451561248999, + "grad_norm": 1.395262360572815, + "learning_rate": 1.8807657637681563e-05, + "loss": 0.2249, + "step": 2778 + }, + { + "epoch": 0.5564451561248999, + "grad_norm": 2.047849178314209, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.1502, + "step": 2780 + }, + { + "epoch": 0.5572457966373099, + "grad_norm": 0.19455352425575256, + "learning_rate": 1.8820859054179225e-05, + "loss": 0.0343, + "step": 2782 + }, + { + "epoch": 0.5572457966373099, + "grad_norm": 1.5211355686187744, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.0711, + "step": 2784 + }, + { + "epoch": 0.5580464371497198, + "grad_norm": 1.8171402215957642, + "learning_rate": 1.883399156139519e-05, + "loss": 0.1788, + "step": 2786 + }, + { + "epoch": 0.5580464371497198, + "grad_norm": 5.655097961425781, + "learning_rate": 1.8840531941941415e-05, + "loss": 0.1504, + "step": 2788 + }, + { + "epoch": 0.5588470776621297, + "grad_norm": 0.14688794314861298, + "learning_rate": 1.8847055056737233e-05, + "loss": 0.3796, + "step": 2790 + }, + { + "epoch": 0.5588470776621297, + "grad_norm": 1.424973964691162, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.0859, + "step": 2792 + }, + { + "epoch": 0.5596477181745396, + "grad_norm": 3.447580337524414, + "learning_rate": 1.8860049438152244e-05, + "loss": 0.2983, + "step": 2794 + }, + { + "epoch": 0.5596477181745396, + "grad_norm": 1.249214768409729, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.1386, + "step": 2796 + }, + { + "epoch": 0.5604483586869495, + "grad_norm": 1.9398133754730225, + "learning_rate": 1.8872974604127025e-05, + "loss": 0.5774, + "step": 2798 + }, + { + "epoch": 0.5604483586869495, + "grad_norm": 2.1248080730438232, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.1954, + "step": 2800 + }, + { + "epoch": 0.5612489991993594, + "grad_norm": 1.1995235681533813, + "learning_rate": 1.8885830453689132e-05, + "loss": 0.3845, + "step": 2802 + }, + { + "epoch": 0.5612489991993594, + "grad_norm": 2.1997199058532715, + "learning_rate": 1.889223235340958e-05, + "loss": 0.1024, + "step": 2804 + }, + { + "epoch": 0.5620496397117695, + "grad_norm": 3.4237496852874756, + "learning_rate": 1.889861688640759e-05, + "loss": 0.6296, + "step": 2806 + }, + { + "epoch": 0.5620496397117695, + "grad_norm": 1.9968397617340088, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.0503, + "step": 2808 + }, + { + "epoch": 0.5628502802241794, + "grad_norm": 1.5554423332214355, + "learning_rate": 1.891133380239373e-05, + "loss": 0.1767, + "step": 2810 + }, + { + "epoch": 0.5628502802241794, + "grad_norm": 2.529503583908081, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.1951, + "step": 2812 + }, + { + "epoch": 0.5636509207365893, + "grad_norm": 7.815764427185059, + "learning_rate": 1.892398110230194e-05, + "loss": 0.6825, + "step": 2814 + }, + { + "epoch": 0.5636509207365893, + "grad_norm": 0.7336897253990173, + "learning_rate": 1.893027861533002e-05, + "loss": 0.1701, + "step": 2816 + }, + { + "epoch": 0.5644515612489992, + "grad_norm": 2.923396348953247, + "learning_rate": 1.8936558687330485e-05, + "loss": 0.3319, + "step": 2818 + }, + { + "epoch": 0.5644515612489992, + "grad_norm": 0.7274232506752014, + "learning_rate": 1.894282130603823e-05, + "loss": 0.1364, + "step": 2820 + }, + { + "epoch": 0.5652522017614091, + "grad_norm": 5.261726379394531, + "learning_rate": 1.8949066459222217e-05, + "loss": 0.4967, + "step": 2822 + }, + { + "epoch": 0.5652522017614091, + "grad_norm": 1.5002192258834839, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.291, + "step": 2824 + }, + { + "epoch": 0.566052842273819, + "grad_norm": 5.200399875640869, + "learning_rate": 1.8961504320265382e-05, + "loss": 0.2528, + "step": 2826 + }, + { + "epoch": 0.566052842273819, + "grad_norm": 2.592994213104248, + "learning_rate": 1.896769700383315e-05, + "loss": 0.2797, + "step": 2828 + }, + { + "epoch": 0.5668534827862289, + "grad_norm": 4.205753326416016, + "learning_rate": 1.897387217329439e-05, + "loss": 0.4974, + "step": 2830 + }, + { + "epoch": 0.5668534827862289, + "grad_norm": 6.040187835693359, + "learning_rate": 1.898002981658886e-05, + "loss": 0.2391, + "step": 2832 + }, + { + "epoch": 0.567654123298639, + "grad_norm": 3.838240146636963, + "learning_rate": 1.8986169921690543e-05, + "loss": 0.2189, + "step": 2834 + }, + { + "epoch": 0.567654123298639, + "grad_norm": 1.9351215362548828, + "learning_rate": 1.899229247660769e-05, + "loss": 0.2158, + "step": 2836 + }, + { + "epoch": 0.5684547638110489, + "grad_norm": 3.7071828842163086, + "learning_rate": 1.899839746938281e-05, + "loss": 0.3356, + "step": 2838 + }, + { + "epoch": 0.5684547638110489, + "grad_norm": 6.713110446929932, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.3326, + "step": 2840 + }, + { + "epoch": 0.5692554043234588, + "grad_norm": 1.2103016376495361, + "learning_rate": 1.9010554720848577e-05, + "loss": 0.1302, + "step": 2842 + }, + { + "epoch": 0.5692554043234588, + "grad_norm": 2.7963967323303223, + "learning_rate": 1.901660695579585e-05, + "loss": 0.1194, + "step": 2844 + }, + { + "epoch": 0.5700560448358687, + "grad_norm": 10.494611740112305, + "learning_rate": 1.9022641581114392e-05, + "loss": 0.5695, + "step": 2846 + }, + { + "epoch": 0.5700560448358687, + "grad_norm": 6.353857517242432, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.3051, + "step": 2848 + }, + { + "epoch": 0.5708566853482786, + "grad_norm": 2.013359546661377, + "learning_rate": 1.9034657955756695e-05, + "loss": 0.7324, + "step": 2850 + }, + { + "epoch": 0.5708566853482786, + "grad_norm": 1.9182790517807007, + "learning_rate": 1.9040639681612212e-05, + "loss": 0.2265, + "step": 2852 + }, + { + "epoch": 0.5716573258606885, + "grad_norm": 2.5857932567596436, + "learning_rate": 1.904660375090257e-05, + "loss": 0.558, + "step": 2854 + }, + { + "epoch": 0.5716573258606885, + "grad_norm": 1.587411880493164, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.1097, + "step": 2856 + }, + { + "epoch": 0.5724579663730984, + "grad_norm": 8.514728546142578, + "learning_rate": 1.905847887323049e-05, + "loss": 0.508, + "step": 2858 + }, + { + "epoch": 0.5724579663730984, + "grad_norm": 8.016975402832031, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.7032, + "step": 2860 + }, + { + "epoch": 0.5732586068855084, + "grad_norm": 3.605529546737671, + "learning_rate": 1.9070283229971007e-05, + "loss": 0.5493, + "step": 2862 + }, + { + "epoch": 0.5732586068855084, + "grad_norm": 9.602310180664062, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.3054, + "step": 2864 + }, + { + "epoch": 0.5740592473979184, + "grad_norm": 4.3133416175842285, + "learning_rate": 1.9082016728907496e-05, + "loss": 0.7125, + "step": 2866 + }, + { + "epoch": 0.5740592473979184, + "grad_norm": 0.5951982736587524, + "learning_rate": 1.9087856878032886e-05, + "loss": 0.1477, + "step": 2868 + }, + { + "epoch": 0.5748598879103283, + "grad_norm": 3.9494829177856445, + "learning_rate": 1.909367927837691e-05, + "loss": 0.532, + "step": 2870 + }, + { + "epoch": 0.5748598879103283, + "grad_norm": 1.879807710647583, + "learning_rate": 1.909948391856829e-05, + "loss": 0.1099, + "step": 2872 + }, + { + "epoch": 0.5756605284227382, + "grad_norm": 2.7843101024627686, + "learning_rate": 1.910527078727044e-05, + "loss": 0.2271, + "step": 2874 + }, + { + "epoch": 0.5756605284227382, + "grad_norm": 2.8905253410339355, + "learning_rate": 1.911103987318148e-05, + "loss": 0.3174, + "step": 2876 + }, + { + "epoch": 0.5764611689351481, + "grad_norm": 9.05832576751709, + "learning_rate": 1.911679116503425e-05, + "loss": 0.366, + "step": 2878 + }, + { + "epoch": 0.5764611689351481, + "grad_norm": 3.2533252239227295, + "learning_rate": 1.912252465159637e-05, + "loss": 0.2527, + "step": 2880 + }, + { + "epoch": 0.577261809447558, + "grad_norm": 11.567764282226562, + "learning_rate": 1.9128240321670208e-05, + "loss": 0.4381, + "step": 2882 + }, + { + "epoch": 0.577261809447558, + "grad_norm": 2.3477237224578857, + "learning_rate": 1.913393816409294e-05, + "loss": 0.3212, + "step": 2884 + }, + { + "epoch": 0.578062449959968, + "grad_norm": 4.319459915161133, + "learning_rate": 1.913961816773655e-05, + "loss": 0.333, + "step": 2886 + }, + { + "epoch": 0.578062449959968, + "grad_norm": 0.8835427761077881, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.1298, + "step": 2888 + }, + { + "epoch": 0.5788630904723779, + "grad_norm": 3.786252498626709, + "learning_rate": 1.9150924614348588e-05, + "loss": 0.3831, + "step": 2890 + }, + { + "epoch": 0.5788630904723779, + "grad_norm": 2.3339285850524902, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.2192, + "step": 2892 + }, + { + "epoch": 0.5796637309847879, + "grad_norm": 0.8181456923484802, + "learning_rate": 1.916215957317944e-05, + "loss": 0.4055, + "step": 2894 + }, + { + "epoch": 0.5796637309847879, + "grad_norm": 1.3828552961349487, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.2608, + "step": 2896 + }, + { + "epoch": 0.5804643714971978, + "grad_norm": 0.6247013807296753, + "learning_rate": 1.9173322956460675e-05, + "loss": 0.1337, + "step": 2898 + }, + { + "epoch": 0.5804643714971978, + "grad_norm": 0.2891789972782135, + "learning_rate": 1.9178877779995423e-05, + "loss": 0.0382, + "step": 2900 + }, + { + "epoch": 0.5812650120096077, + "grad_norm": 2.018420696258545, + "learning_rate": 1.9184414676983006e-05, + "loss": 0.4635, + "step": 2902 + }, + { + "epoch": 0.5812650120096077, + "grad_norm": 1.6878528594970703, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.2082, + "step": 2904 + }, + { + "epoch": 0.5820656525220176, + "grad_norm": 6.274703502655029, + "learning_rate": 1.9195434648097003e-05, + "loss": 0.4751, + "step": 2906 + }, + { + "epoch": 0.5820656525220176, + "grad_norm": 3.7055654525756836, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.3517, + "step": 2908 + }, + { + "epoch": 0.5828662930344275, + "grad_norm": 3.4332404136657715, + "learning_rate": 1.9206382783713738e-05, + "loss": 0.3248, + "step": 2910 + }, + { + "epoch": 0.5828662930344275, + "grad_norm": 1.7410038709640503, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.2614, + "step": 2912 + }, + { + "epoch": 0.5836669335468375, + "grad_norm": 1.0471842288970947, + "learning_rate": 1.921725899830547e-05, + "loss": 0.3797, + "step": 2914 + }, + { + "epoch": 0.5836669335468375, + "grad_norm": 1.8819118738174438, + "learning_rate": 1.9222670108643146e-05, + "loss": 0.0977, + "step": 2916 + }, + { + "epoch": 0.5844675740592474, + "grad_norm": 4.601459503173828, + "learning_rate": 1.92280632069063e-05, + "loss": 0.4341, + "step": 2918 + }, + { + "epoch": 0.5844675740592474, + "grad_norm": 6.487861156463623, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.2716, + "step": 2920 + }, + { + "epoch": 0.5852682145716573, + "grad_norm": 4.183629035949707, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.3331, + "step": 2922 + }, + { + "epoch": 0.5852682145716573, + "grad_norm": 1.4620354175567627, + "learning_rate": 1.924413432409622e-05, + "loss": 0.2026, + "step": 2924 + }, + { + "epoch": 0.5860688550840673, + "grad_norm": 3.362947702407837, + "learning_rate": 1.924945526908497e-05, + "loss": 0.3699, + "step": 2926 + }, + { + "epoch": 0.5860688550840673, + "grad_norm": 0.9566460847854614, + "learning_rate": 1.925475814968719e-05, + "loss": 0.1282, + "step": 2928 + }, + { + "epoch": 0.5868694955964772, + "grad_norm": 4.243828296661377, + "learning_rate": 1.9260042955546237e-05, + "loss": 0.2976, + "step": 2930 + }, + { + "epoch": 0.5868694955964772, + "grad_norm": 1.1008321046829224, + "learning_rate": 1.926530967634078e-05, + "loss": 0.1163, + "step": 2932 + }, + { + "epoch": 0.5876701361088871, + "grad_norm": 2.894547939300537, + "learning_rate": 1.9270558301784795e-05, + "loss": 0.419, + "step": 2934 + }, + { + "epoch": 0.5876701361088871, + "grad_norm": 1.0936301946640015, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.2044, + "step": 2936 + }, + { + "epoch": 0.588470776621297, + "grad_norm": 2.6102609634399414, + "learning_rate": 1.9281001225653887e-05, + "loss": 0.309, + "step": 2938 + }, + { + "epoch": 0.588470776621297, + "grad_norm": 1.5045009851455688, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.1761, + "step": 2940 + }, + { + "epoch": 0.589271417133707, + "grad_norm": 3.4876372814178467, + "learning_rate": 1.9291371645572517e-05, + "loss": 0.2267, + "step": 2942 + }, + { + "epoch": 0.589271417133707, + "grad_norm": 1.5613679885864258, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.1078, + "step": 2944 + }, + { + "epoch": 0.5900720576461169, + "grad_norm": 7.721551895141602, + "learning_rate": 1.9301669480526115e-05, + "loss": 1.2198, + "step": 2946 + }, + { + "epoch": 0.5900720576461169, + "grad_norm": 2.614772319793701, + "learning_rate": 1.9306791153479004e-05, + "loss": 0.3227, + "step": 2948 + }, + { + "epoch": 0.5908726981585268, + "grad_norm": 1.9564096927642822, + "learning_rate": 1.931189465006714e-05, + "loss": 0.1285, + "step": 2950 + }, + { + "epoch": 0.5908726981585268, + "grad_norm": 6.130621433258057, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.251, + "step": 2952 + }, + { + "epoch": 0.5916733386709367, + "grad_norm": 4.636622428894043, + "learning_rate": 1.9322047074315717e-05, + "loss": 0.3075, + "step": 2954 + }, + { + "epoch": 0.5916733386709367, + "grad_norm": 5.665347576141357, + "learning_rate": 1.932709598214825e-05, + "loss": 0.2386, + "step": 2956 + }, + { + "epoch": 0.5924739791833467, + "grad_norm": 4.946974277496338, + "learning_rate": 1.9332126673960262e-05, + "loss": 0.2464, + "step": 2958 + }, + { + "epoch": 0.5924739791833467, + "grad_norm": 3.491076946258545, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.1439, + "step": 2960 + }, + { + "epoch": 0.5932746196957566, + "grad_norm": 1.5862833261489868, + "learning_rate": 1.934213337025812e-05, + "loss": 0.1435, + "step": 2962 + }, + { + "epoch": 0.5932746196957566, + "grad_norm": 0.6689804792404175, + "learning_rate": 1.9347109355200672e-05, + "loss": 0.0572, + "step": 2964 + }, + { + "epoch": 0.5940752602081665, + "grad_norm": 3.518786668777466, + "learning_rate": 1.9352067085036145e-05, + "loss": 0.1541, + "step": 2966 + }, + { + "epoch": 0.5940752602081665, + "grad_norm": 0.5915401577949524, + "learning_rate": 1.935700655008199e-05, + "loss": 0.0335, + "step": 2968 + }, + { + "epoch": 0.5948759007205765, + "grad_norm": 3.39398193359375, + "learning_rate": 1.9361927740691327e-05, + "loss": 0.5767, + "step": 2970 + }, + { + "epoch": 0.5948759007205765, + "grad_norm": 0.9166590571403503, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.0399, + "step": 2972 + }, + { + "epoch": 0.5956765412329864, + "grad_norm": 0.6669431924819946, + "learning_rate": 1.937171526019142e-05, + "loss": 0.4084, + "step": 2974 + }, + { + "epoch": 0.5956765412329864, + "grad_norm": 0.7640724182128906, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.0539, + "step": 2976 + }, + { + "epoch": 0.5964771817453963, + "grad_norm": 17.298851013183594, + "learning_rate": 1.9381429567075504e-05, + "loss": 0.4878, + "step": 2978 + }, + { + "epoch": 0.5964771817453963, + "grad_norm": 4.555431365966797, + "learning_rate": 1.9386259242048883e-05, + "loss": 0.1439, + "step": 2980 + }, + { + "epoch": 0.5972778222578062, + "grad_norm": 6.310392379760742, + "learning_rate": 1.93910705854546e-05, + "loss": 0.6138, + "step": 2982 + }, + { + "epoch": 0.5972778222578062, + "grad_norm": 0.6291956305503845, + "learning_rate": 1.939586358789602e-05, + "loss": 0.1479, + "step": 2984 + }, + { + "epoch": 0.5980784627702161, + "grad_norm": 11.875497817993164, + "learning_rate": 1.9400638240012294e-05, + "loss": 0.6129, + "step": 2986 + }, + { + "epoch": 0.5980784627702161, + "grad_norm": 3.024225950241089, + "learning_rate": 1.940539453247842e-05, + "loss": 0.1024, + "step": 2988 + }, + { + "epoch": 0.5988791032826261, + "grad_norm": 2.0448012351989746, + "learning_rate": 1.9410132456005262e-05, + "loss": 0.6668, + "step": 2990 + }, + { + "epoch": 0.5988791032826261, + "grad_norm": 2.347778797149658, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.1131, + "step": 2992 + }, + { + "epoch": 0.5996797437950361, + "grad_norm": 6.083974838256836, + "learning_rate": 1.9419553159263896e-05, + "loss": 0.1818, + "step": 2994 + }, + { + "epoch": 0.5996797437950361, + "grad_norm": 3.5000579357147217, + "learning_rate": 1.9424235920596863e-05, + "loss": 0.1789, + "step": 2996 + }, + { + "epoch": 0.600480384307446, + "grad_norm": 9.262458801269531, + "learning_rate": 1.94289002761929e-05, + "loss": 0.1898, + "step": 2998 + }, + { + "epoch": 0.600480384307446, + "grad_norm": 0.4429357945919037, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.113, + "step": 3000 + }, + { + "epoch": 0.6012810248198559, + "grad_norm": 1.3951910734176636, + "learning_rate": 1.943817373377181e-05, + "loss": 0.2012, + "step": 3002 + }, + { + "epoch": 0.6012810248198559, + "grad_norm": 1.0302425622940063, + "learning_rate": 1.944278281764342e-05, + "loss": 0.1088, + "step": 3004 + }, + { + "epoch": 0.6020816653322658, + "grad_norm": 4.28736686706543, + "learning_rate": 1.944737345955561e-05, + "loss": 0.3329, + "step": 3006 + }, + { + "epoch": 0.6020816653322658, + "grad_norm": 7.48621129989624, + "learning_rate": 1.945194565054276e-05, + "loss": 0.1878, + "step": 3008 + }, + { + "epoch": 0.6028823058446757, + "grad_norm": 0.16736437380313873, + "learning_rate": 1.945649938167528e-05, + "loss": 0.196, + "step": 3010 + }, + { + "epoch": 0.6028823058446757, + "grad_norm": 2.363410472869873, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.1897, + "step": 3012 + }, + { + "epoch": 0.6036829463570856, + "grad_norm": 4.7391815185546875, + "learning_rate": 1.946555142883836e-05, + "loss": 0.3439, + "step": 3014 + }, + { + "epoch": 0.6036829463570856, + "grad_norm": 2.5998456478118896, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.1952, + "step": 3016 + }, + { + "epoch": 0.6044835868694955, + "grad_norm": 6.695474147796631, + "learning_rate": 1.9474529530329507e-05, + "loss": 0.3986, + "step": 3018 + }, + { + "epoch": 0.6044835868694955, + "grad_norm": 3.2983670234680176, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.2721, + "step": 3020 + }, + { + "epoch": 0.6052842273819056, + "grad_norm": 2.0716960430145264, + "learning_rate": 1.9483433616011047e-05, + "loss": 0.8095, + "step": 3022 + }, + { + "epoch": 0.6052842273819056, + "grad_norm": 18.4224853515625, + "learning_rate": 1.948785788116329e-05, + "loss": 0.347, + "step": 3024 + }, + { + "epoch": 0.6060848678943155, + "grad_norm": 6.321012020111084, + "learning_rate": 1.9492263616323533e-05, + "loss": 0.6075, + "step": 3026 + }, + { + "epoch": 0.6060848678943155, + "grad_norm": 22.40316390991211, + "learning_rate": 1.9496650812887286e-05, + "loss": 0.2656, + "step": 3028 + }, + { + "epoch": 0.6068855084067254, + "grad_norm": 1.3848379850387573, + "learning_rate": 1.9501019462286263e-05, + "loss": 0.1138, + "step": 3030 + }, + { + "epoch": 0.6068855084067254, + "grad_norm": 0.9045932292938232, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.1076, + "step": 3032 + }, + { + "epoch": 0.6076861489191353, + "grad_norm": 10.81799602508545, + "learning_rate": 1.9509701085497842e-05, + "loss": 0.4693, + "step": 3034 + }, + { + "epoch": 0.6076861489191353, + "grad_norm": 3.8105218410491943, + "learning_rate": 1.951401404235505e-05, + "loss": 0.3971, + "step": 3036 + }, + { + "epoch": 0.6084867894315452, + "grad_norm": 10.101715087890625, + "learning_rate": 1.9518308418136718e-05, + "loss": 0.6609, + "step": 3038 + }, + { + "epoch": 0.6084867894315452, + "grad_norm": 3.1276614665985107, + "learning_rate": 1.952258420445583e-05, + "loss": 0.352, + "step": 3040 + }, + { + "epoch": 0.6092874299439551, + "grad_norm": 1.6298216581344604, + "learning_rate": 1.952684139296169e-05, + "loss": 0.3467, + "step": 3042 + }, + { + "epoch": 0.6092874299439551, + "grad_norm": 0.8027591109275818, + "learning_rate": 1.9531079975339912e-05, + "loss": 0.0523, + "step": 3044 + }, + { + "epoch": 0.610088070456365, + "grad_norm": 7.5043416023254395, + "learning_rate": 1.9535299943312455e-05, + "loss": 0.6331, + "step": 3046 + }, + { + "epoch": 0.610088070456365, + "grad_norm": 5.957387447357178, + "learning_rate": 1.953950128863762e-05, + "loss": 0.3642, + "step": 3048 + }, + { + "epoch": 0.6108887109687751, + "grad_norm": 3.3469066619873047, + "learning_rate": 1.9543684003110105e-05, + "loss": 0.1301, + "step": 3050 + }, + { + "epoch": 0.6108887109687751, + "grad_norm": 8.220071792602539, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.136, + "step": 3052 + }, + { + "epoch": 0.611689351481185, + "grad_norm": 2.5740127563476562, + "learning_rate": 1.9551993506857688e-05, + "loss": 0.3418, + "step": 3054 + }, + { + "epoch": 0.611689351481185, + "grad_norm": 0.5017393827438354, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.0554, + "step": 3056 + }, + { + "epoch": 0.6124899919935949, + "grad_norm": 2.3689324855804443, + "learning_rate": 1.9560228389640664e-05, + "loss": 0.2242, + "step": 3058 + }, + { + "epoch": 0.6124899919935949, + "grad_norm": 0.6158408522605896, + "learning_rate": 1.956431782804402e-05, + "loss": 0.1007, + "step": 3060 + }, + { + "epoch": 0.6132906325060048, + "grad_norm": 2.515407085418701, + "learning_rate": 1.956838858712744e-05, + "loss": 0.2325, + "step": 3062 + }, + { + "epoch": 0.6132906325060048, + "grad_norm": 3.014709234237671, + "learning_rate": 1.957244065894066e-05, + "loss": 0.1499, + "step": 3064 + }, + { + "epoch": 0.6140912730184147, + "grad_norm": 10.147005081176758, + "learning_rate": 1.9576474035569892e-05, + "loss": 0.287, + "step": 3066 + }, + { + "epoch": 0.6140912730184147, + "grad_norm": 7.067925453186035, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.2166, + "step": 3068 + }, + { + "epoch": 0.6148919135308246, + "grad_norm": 5.905797004699707, + "learning_rate": 1.9584484671803818e-05, + "loss": 0.3882, + "step": 3070 + }, + { + "epoch": 0.6148919135308246, + "grad_norm": 5.907956123352051, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.2394, + "step": 3072 + }, + { + "epoch": 0.6156925540432346, + "grad_norm": 3.8632469177246094, + "learning_rate": 1.9592420433249462e-05, + "loss": 0.3786, + "step": 3074 + }, + { + "epoch": 0.6156925540432346, + "grad_norm": 2.7870635986328125, + "learning_rate": 1.9596360216530436e-05, + "loss": 0.2818, + "step": 3076 + }, + { + "epoch": 0.6164931945556446, + "grad_norm": 1.6616604328155518, + "learning_rate": 1.9600281257912e-05, + "loss": 0.2613, + "step": 3078 + }, + { + "epoch": 0.6164931945556446, + "grad_norm": 1.3428399562835693, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.1478, + "step": 3080 + }, + { + "epoch": 0.6172938350680545, + "grad_norm": 7.343536376953125, + "learning_rate": 1.960806708438202e-05, + "loss": 0.9067, + "step": 3082 + }, + { + "epoch": 0.6172938350680545, + "grad_norm": 3.0228700637817383, + "learning_rate": 1.961193185426459e-05, + "loss": 0.2309, + "step": 3084 + }, + { + "epoch": 0.6180944755804644, + "grad_norm": 15.891963005065918, + "learning_rate": 1.9615777851836003e-05, + "loss": 0.7119, + "step": 3086 + }, + { + "epoch": 0.6180944755804644, + "grad_norm": 2.4570014476776123, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.1956, + "step": 3088 + }, + { + "epoch": 0.6188951160928743, + "grad_norm": 1.477921962738037, + "learning_rate": 1.962341350003679e-05, + "loss": 0.2316, + "step": 3090 + }, + { + "epoch": 0.6188951160928743, + "grad_norm": 1.3735840320587158, + "learning_rate": 1.9627203135753573e-05, + "loss": 0.146, + "step": 3092 + }, + { + "epoch": 0.6196957566052842, + "grad_norm": 7.117755889892578, + "learning_rate": 1.9630973969334068e-05, + "loss": 0.6586, + "step": 3094 + }, + { + "epoch": 0.6196957566052842, + "grad_norm": 4.376545429229736, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.2933, + "step": 3096 + }, + { + "epoch": 0.6204963971176941, + "grad_norm": 3.648791551589966, + "learning_rate": 1.9638459200664822e-05, + "loss": 0.2525, + "step": 3098 + }, + { + "epoch": 0.6204963971176941, + "grad_norm": 2.8253984451293945, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.276, + "step": 3100 + }, + { + "epoch": 0.6212970376301041, + "grad_norm": 6.303553581237793, + "learning_rate": 1.9645869135553806e-05, + "loss": 0.614, + "step": 3102 + }, + { + "epoch": 0.6212970376301041, + "grad_norm": 0.6833983063697815, + "learning_rate": 1.964954584871995e-05, + "loss": 0.2093, + "step": 3104 + }, + { + "epoch": 0.622097678142514, + "grad_norm": 0.9761205315589905, + "learning_rate": 1.965320371611399e-05, + "loss": 0.1254, + "step": 3106 + }, + { + "epoch": 0.622097678142514, + "grad_norm": 1.015260934829712, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.0669, + "step": 3108 + }, + { + "epoch": 0.622898318654924, + "grad_norm": 3.8411953449249268, + "learning_rate": 1.9660462885047032e-05, + "loss": 0.4806, + "step": 3110 + }, + { + "epoch": 0.622898318654924, + "grad_norm": 0.33261629939079285, + "learning_rate": 1.966406417240872e-05, + "loss": 0.0715, + "step": 3112 + }, + { + "epoch": 0.6236989591673339, + "grad_norm": 11.829501152038574, + "learning_rate": 1.9667646585643703e-05, + "loss": 0.4805, + "step": 3114 + }, + { + "epoch": 0.6236989591673339, + "grad_norm": 1.3749874830245972, + "learning_rate": 1.967121011775546e-05, + "loss": 0.0642, + "step": 3116 + }, + { + "epoch": 0.6244995996797438, + "grad_norm": 0.4603588283061981, + "learning_rate": 1.967475476178433e-05, + "loss": 0.3725, + "step": 3118 + }, + { + "epoch": 0.6244995996797438, + "grad_norm": 1.6110888719558716, + "learning_rate": 1.967828051080755e-05, + "loss": 0.1078, + "step": 3120 + }, + { + "epoch": 0.6253002401921537, + "grad_norm": 1.5357049703598022, + "learning_rate": 1.9681787357939254e-05, + "loss": 0.2824, + "step": 3122 + }, + { + "epoch": 0.6253002401921537, + "grad_norm": 2.2445566654205322, + "learning_rate": 1.9685275296330497e-05, + "loss": 0.11, + "step": 3124 + }, + { + "epoch": 0.6261008807045636, + "grad_norm": 10.892742156982422, + "learning_rate": 1.968874431916926e-05, + "loss": 0.4655, + "step": 3126 + }, + { + "epoch": 0.6261008807045636, + "grad_norm": 3.176837682723999, + "learning_rate": 1.969219441968046e-05, + "loss": 0.2538, + "step": 3128 + }, + { + "epoch": 0.6269015212169736, + "grad_norm": 7.471782207489014, + "learning_rate": 1.969562559112598e-05, + "loss": 0.5327, + "step": 3130 + }, + { + "epoch": 0.6269015212169736, + "grad_norm": 1.5007481575012207, + "learning_rate": 1.969903782680467e-05, + "loss": 0.2191, + "step": 3132 + }, + { + "epoch": 0.6277021617293835, + "grad_norm": 7.751013278961182, + "learning_rate": 1.970243112005235e-05, + "loss": 1.3041, + "step": 3134 + }, + { + "epoch": 0.6277021617293835, + "grad_norm": 2.98165225982666, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.5235, + "step": 3136 + }, + { + "epoch": 0.6285028022417934, + "grad_norm": 5.294494152069092, + "learning_rate": 1.970916085278302e-05, + "loss": 0.6085, + "step": 3138 + }, + { + "epoch": 0.6285028022417934, + "grad_norm": 3.9178731441497803, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.3128, + "step": 3140 + }, + { + "epoch": 0.6293034427542034, + "grad_norm": 2.0843327045440674, + "learning_rate": 1.9715814736744755e-05, + "loss": 0.0936, + "step": 3142 + }, + { + "epoch": 0.6293034427542034, + "grad_norm": 1.3904253244400024, + "learning_rate": 1.971911321917015e-05, + "loss": 0.0923, + "step": 3144 + }, + { + "epoch": 0.6301040832666133, + "grad_norm": 2.990675210952759, + "learning_rate": 1.9722392719956864e-05, + "loss": 0.5139, + "step": 3146 + }, + { + "epoch": 0.6301040832666133, + "grad_norm": 4.385390281677246, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.3192, + "step": 3148 + }, + { + "epoch": 0.6309047237790232, + "grad_norm": 5.511481285095215, + "learning_rate": 1.9728894751031595e-05, + "loss": 0.637, + "step": 3150 + }, + { + "epoch": 0.6309047237790232, + "grad_norm": 2.6941189765930176, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.3194, + "step": 3152 + }, + { + "epoch": 0.6317053642914331, + "grad_norm": 3.8107783794403076, + "learning_rate": 1.9735320779174545e-05, + "loss": 0.4714, + "step": 3154 + }, + { + "epoch": 0.6317053642914331, + "grad_norm": 1.2685428857803345, + "learning_rate": 1.9738505276435692e-05, + "loss": 0.1498, + "step": 3156 + }, + { + "epoch": 0.6325060048038431, + "grad_norm": 2.9755051136016846, + "learning_rate": 1.974167075418505e-05, + "loss": 0.5085, + "step": 3158 + }, + { + "epoch": 0.6325060048038431, + "grad_norm": 1.3344436883926392, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.1953, + "step": 3160 + }, + { + "epoch": 0.633306645316253, + "grad_norm": 2.6920711994171143, + "learning_rate": 1.9747944626456577e-05, + "loss": 0.2161, + "step": 3162 + }, + { + "epoch": 0.633306645316253, + "grad_norm": 2.8614935874938965, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.1481, + "step": 3164 + }, + { + "epoch": 0.6341072858286629, + "grad_norm": 1.2153960466384888, + "learning_rate": 1.975414234697712e-05, + "loss": 0.1831, + "step": 3166 + }, + { + "epoch": 0.6341072858286629, + "grad_norm": 1.483147382736206, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.1756, + "step": 3168 + }, + { + "epoch": 0.6349079263410728, + "grad_norm": 3.512660503387451, + "learning_rate": 1.9760263867329568e-05, + "loss": 0.7189, + "step": 3170 + }, + { + "epoch": 0.6349079263410728, + "grad_norm": 2.323740243911743, + "learning_rate": 1.9763296037475174e-05, + "loss": 0.2177, + "step": 3172 + }, + { + "epoch": 0.6357085668534828, + "grad_norm": 3.339843511581421, + "learning_rate": 1.97663091396921e-05, + "loss": 0.2952, + "step": 3174 + }, + { + "epoch": 0.6357085668534828, + "grad_norm": 1.9967347383499146, + "learning_rate": 1.976930316809569e-05, + "loss": 0.2524, + "step": 3176 + }, + { + "epoch": 0.6365092073658927, + "grad_norm": 1.625946044921875, + "learning_rate": 1.9772278116838543e-05, + "loss": 0.382, + "step": 3178 + }, + { + "epoch": 0.6365092073658927, + "grad_norm": 1.7274712324142456, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.2821, + "step": 3180 + }, + { + "epoch": 0.6373098478783027, + "grad_norm": 7.581551551818848, + "learning_rate": 1.977817075213876e-05, + "loss": 0.3036, + "step": 3182 + }, + { + "epoch": 0.6373098478783027, + "grad_norm": 3.336413621902466, + "learning_rate": 1.978108842718768e-05, + "loss": 0.3497, + "step": 3184 + }, + { + "epoch": 0.6381104883907126, + "grad_norm": 6.7421464920043945, + "learning_rate": 1.9783986999558994e-05, + "loss": 0.1786, + "step": 3186 + }, + { + "epoch": 0.6381104883907126, + "grad_norm": 2.1065635681152344, + "learning_rate": 1.9786866463591732e-05, + "loss": 0.0968, + "step": 3188 + }, + { + "epoch": 0.6389111289031225, + "grad_norm": 3.2261300086975098, + "learning_rate": 1.9789726813662233e-05, + "loss": 0.4603, + "step": 3190 + }, + { + "epoch": 0.6389111289031225, + "grad_norm": 0.3843856453895569, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.2251, + "step": 3192 + }, + { + "epoch": 0.6397117694155324, + "grad_norm": 3.6769278049468994, + "learning_rate": 1.979539014960858e-05, + "loss": 0.2167, + "step": 3194 + }, + { + "epoch": 0.6397117694155324, + "grad_norm": 1.5556795597076416, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.0976, + "step": 3196 + }, + { + "epoch": 0.6405124099279423, + "grad_norm": 5.852959156036377, + "learning_rate": 1.9800976963155584e-05, + "loss": 0.7634, + "step": 3198 + }, + { + "epoch": 0.6405124099279423, + "grad_norm": 0.7414265275001526, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.052, + "step": 3200 + }, + { + "epoch": 0.6413130504403523, + "grad_norm": 3.859779119491577, + "learning_rate": 1.980648721065859e-05, + "loss": 0.4719, + "step": 3202 + }, + { + "epoch": 0.6413130504403523, + "grad_norm": 0.9730708599090576, + "learning_rate": 1.9809213608668185e-05, + "loss": 0.1115, + "step": 3204 + }, + { + "epoch": 0.6421136909527622, + "grad_norm": 0.43513768911361694, + "learning_rate": 1.9811920849071092e-05, + "loss": 0.2411, + "step": 3206 + }, + { + "epoch": 0.6421136909527622, + "grad_norm": 0.5433670282363892, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.1331, + "step": 3208 + }, + { + "epoch": 0.6429143314651722, + "grad_norm": 2.92608380317688, + "learning_rate": 1.9817277835945057e-05, + "loss": 0.4049, + "step": 3210 + }, + { + "epoch": 0.6429143314651722, + "grad_norm": 2.241579532623291, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.2291, + "step": 3212 + }, + { + "epoch": 0.6437149719775821, + "grad_norm": 0.6054361462593079, + "learning_rate": 1.9822558129431263e-05, + "loss": 0.5887, + "step": 3214 + }, + { + "epoch": 0.6437149719775821, + "grad_norm": 1.036758542060852, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.1219, + "step": 3216 + }, + { + "epoch": 0.644515612489992, + "grad_norm": 0.720119833946228, + "learning_rate": 1.9827761688279606e-05, + "loss": 0.3169, + "step": 3218 + }, + { + "epoch": 0.644515612489992, + "grad_norm": 2.73846435546875, + "learning_rate": 1.983033467948784e-05, + "loss": 0.1333, + "step": 3220 + }, + { + "epoch": 0.6453162530024019, + "grad_norm": 2.896918535232544, + "learning_rate": 1.983288847183947e-05, + "loss": 0.2224, + "step": 3222 + }, + { + "epoch": 0.6453162530024019, + "grad_norm": 0.20758797228336334, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.165, + "step": 3224 + }, + { + "epoch": 0.6461168935148118, + "grad_norm": 2.470942974090576, + "learning_rate": 1.9837938440059992e-05, + "loss": 0.1249, + "step": 3226 + }, + { + "epoch": 0.6461168935148118, + "grad_norm": 0.1916690468788147, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.0863, + "step": 3228 + }, + { + "epoch": 0.6469175340272217, + "grad_norm": 4.079509258270264, + "learning_rate": 1.9842911553490392e-05, + "loss": 0.3339, + "step": 3230 + }, + { + "epoch": 0.6469175340272217, + "grad_norm": 3.589513063430786, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.2275, + "step": 3232 + }, + { + "epoch": 0.6477181745396317, + "grad_norm": 7.674187183380127, + "learning_rate": 1.984780777328031e-05, + "loss": 0.3639, + "step": 3234 + }, + { + "epoch": 0.6477181745396317, + "grad_norm": 2.8400983810424805, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.2762, + "step": 3236 + }, + { + "epoch": 0.6485188150520417, + "grad_norm": 5.75058126449585, + "learning_rate": 1.985262706118007e-05, + "loss": 0.5191, + "step": 3238 + }, + { + "epoch": 0.6485188150520417, + "grad_norm": 6.608194828033447, + "learning_rate": 1.985500784388244e-05, + "loss": 0.2452, + "step": 3240 + }, + { + "epoch": 0.6493194555644516, + "grad_norm": 7.193440914154053, + "learning_rate": 1.9857369379540982e-05, + "loss": 0.6204, + "step": 3242 + }, + { + "epoch": 0.6493194555644516, + "grad_norm": 0.6048113703727722, + "learning_rate": 1.985971166354357e-05, + "loss": 0.2027, + "step": 3244 + }, + { + "epoch": 0.6501200960768615, + "grad_norm": 1.540043592453003, + "learning_rate": 1.986203469131567e-05, + "loss": 0.0925, + "step": 3246 + }, + { + "epoch": 0.6501200960768615, + "grad_norm": 3.159672260284424, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.1402, + "step": 3248 + }, + { + "epoch": 0.6509207365892714, + "grad_norm": 4.399783611297607, + "learning_rate": 1.986662296005834e-05, + "loss": 0.2615, + "step": 3250 + }, + { + "epoch": 0.6509207365892714, + "grad_norm": 2.528106927871704, + "learning_rate": 1.986888819206792e-05, + "loss": 0.2179, + "step": 3252 + }, + { + "epoch": 0.6517213771016813, + "grad_norm": 3.4460391998291016, + "learning_rate": 1.987113414992505e-05, + "loss": 0.5084, + "step": 3254 + }, + { + "epoch": 0.6517213771016813, + "grad_norm": 4.448776721954346, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.2069, + "step": 3256 + }, + { + "epoch": 0.6525220176140912, + "grad_norm": 6.823792934417725, + "learning_rate": 1.9875568225674e-05, + "loss": 0.1768, + "step": 3258 + }, + { + "epoch": 0.6525220176140912, + "grad_norm": 1.5018579959869385, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.1272, + "step": 3260 + }, + { + "epoch": 0.6533226581265013, + "grad_norm": 1.7634000778198242, + "learning_rate": 1.9879925152665845e-05, + "loss": 0.3614, + "step": 3262 + }, + { + "epoch": 0.6533226581265013, + "grad_norm": 3.571075916290283, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.1429, + "step": 3264 + }, + { + "epoch": 0.6541232986389112, + "grad_norm": 10.773670196533203, + "learning_rate": 1.9884204896863895e-05, + "loss": 0.4884, + "step": 3266 + }, + { + "epoch": 0.6541232986389112, + "grad_norm": 5.742667198181152, + "learning_rate": 1.988631581494365e-05, + "loss": 0.2474, + "step": 3268 + }, + { + "epoch": 0.6549239391513211, + "grad_norm": 11.128340721130371, + "learning_rate": 1.9888407424834433e-05, + "loss": 0.6831, + "step": 3270 + }, + { + "epoch": 0.6549239391513211, + "grad_norm": 9.647910118103027, + "learning_rate": 1.989047972245129e-05, + "loss": 0.3361, + "step": 3272 + }, + { + "epoch": 0.655724579663731, + "grad_norm": 0.9570426940917969, + "learning_rate": 1.989253270374697e-05, + "loss": 0.06, + "step": 3274 + }, + { + "epoch": 0.655724579663731, + "grad_norm": 0.8649162650108337, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.059, + "step": 3276 + }, + { + "epoch": 0.6565252201761409, + "grad_norm": 2.561380624771118, + "learning_rate": 1.9896580701374482e-05, + "loss": 0.3837, + "step": 3278 + }, + { + "epoch": 0.6565252201761409, + "grad_norm": 0.9337728023529053, + "learning_rate": 1.989857570980049e-05, + "loss": 0.0816, + "step": 3280 + }, + { + "epoch": 0.6573258606885508, + "grad_norm": 3.6111676692962646, + "learning_rate": 1.9900551386093677e-05, + "loss": 0.3226, + "step": 3282 + }, + { + "epoch": 0.6573258606885508, + "grad_norm": 2.4519143104553223, + "learning_rate": 1.990250772639552e-05, + "loss": 0.2213, + "step": 3284 + }, + { + "epoch": 0.6581265012009607, + "grad_norm": 4.7809648513793945, + "learning_rate": 1.9904444726885236e-05, + "loss": 0.6564, + "step": 3286 + }, + { + "epoch": 0.6581265012009607, + "grad_norm": 0.5325888991355896, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.1942, + "step": 3288 + }, + { + "epoch": 0.6589271417133707, + "grad_norm": 5.1001667976379395, + "learning_rate": 1.990826069333406e-05, + "loss": 0.6371, + "step": 3290 + }, + { + "epoch": 0.6589271417133707, + "grad_norm": 4.062544345855713, + "learning_rate": 1.99101396518405e-05, + "loss": 0.626, + "step": 3292 + }, + { + "epoch": 0.6597277822257807, + "grad_norm": 4.053399562835693, + "learning_rate": 1.99119992556295e-05, + "loss": 0.7691, + "step": 3294 + }, + { + "epoch": 0.6597277822257807, + "grad_norm": 2.865462064743042, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.4089, + "step": 3296 + }, + { + "epoch": 0.6605284227381906, + "grad_norm": 10.615232467651367, + "learning_rate": 1.99156603845656e-05, + "loss": 0.56, + "step": 3298 + }, + { + "epoch": 0.6605284227381906, + "grad_norm": 0.6358562707901001, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.2027, + "step": 3300 + }, + { + "epoch": 0.6613290632506005, + "grad_norm": 2.806757926940918, + "learning_rate": 1.9919244051541315e-05, + "loss": 0.4751, + "step": 3302 + }, + { + "epoch": 0.6613290632506005, + "grad_norm": 2.0853638648986816, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.2331, + "step": 3304 + }, + { + "epoch": 0.6621297037630104, + "grad_norm": 4.294143199920654, + "learning_rate": 1.9922750228560746e-05, + "loss": 0.6831, + "step": 3306 + }, + { + "epoch": 0.6621297037630104, + "grad_norm": 2.80135178565979, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.3942, + "step": 3308 + }, + { + "epoch": 0.6629303442754203, + "grad_norm": 8.292777061462402, + "learning_rate": 1.9926178888233344e-05, + "loss": 0.8614, + "step": 3310 + }, + { + "epoch": 0.6629303442754203, + "grad_norm": 1.775967001914978, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.3431, + "step": 3312 + }, + { + "epoch": 0.6637309847878302, + "grad_norm": 3.270768165588379, + "learning_rate": 1.9929530003774133e-05, + "loss": 0.2905, + "step": 3314 + }, + { + "epoch": 0.6637309847878302, + "grad_norm": 1.7941439151763916, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.209, + "step": 3316 + }, + { + "epoch": 0.6645316253002402, + "grad_norm": 1.5572370290756226, + "learning_rate": 1.993280354900393e-05, + "loss": 0.3053, + "step": 3318 + }, + { + "epoch": 0.6645316253002402, + "grad_norm": 2.272634267807007, + "learning_rate": 1.99344112247369e-05, + "loss": 0.1651, + "step": 3320 + }, + { + "epoch": 0.6653322658126501, + "grad_norm": 1.7969772815704346, + "learning_rate": 1.9935999498349518e-05, + "loss": 0.1788, + "step": 3322 + }, + { + "epoch": 0.6653322658126501, + "grad_norm": 1.9660718441009521, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.2642, + "step": 3324 + }, + { + "epoch": 0.6661329063250601, + "grad_norm": 1.312108039855957, + "learning_rate": 1.9939117826843883e-05, + "loss": 0.3432, + "step": 3326 + }, + { + "epoch": 0.6661329063250601, + "grad_norm": 3.3726367950439453, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.3703, + "step": 3328 + }, + { + "epoch": 0.66693354683747, + "grad_norm": 1.7417798042297363, + "learning_rate": 1.9942158510126384e-05, + "loss": 0.2785, + "step": 3330 + }, + { + "epoch": 0.66693354683747, + "grad_norm": 2.327225923538208, + "learning_rate": 1.9943649727366335e-05, + "loss": 0.2516, + "step": 3332 + }, + { + "epoch": 0.6677341873498799, + "grad_norm": 1.7290884256362915, + "learning_rate": 1.9945121524442944e-05, + "loss": 0.2057, + "step": 3334 + }, + { + "epoch": 0.6677341873498799, + "grad_norm": 1.8311511278152466, + "learning_rate": 1.994657389848176e-05, + "loss": 0.2161, + "step": 3336 + }, + { + "epoch": 0.6685348278622898, + "grad_norm": 3.088359832763672, + "learning_rate": 1.9948006846646262e-05, + "loss": 0.3705, + "step": 3338 + }, + { + "epoch": 0.6685348278622898, + "grad_norm": 2.5623486042022705, + "learning_rate": 1.994942036613787e-05, + "loss": 0.1336, + "step": 3340 + }, + { + "epoch": 0.6693354683746997, + "grad_norm": 6.902070045471191, + "learning_rate": 1.9950814454195953e-05, + "loss": 0.5965, + "step": 3342 + }, + { + "epoch": 0.6693354683746997, + "grad_norm": 1.4973150491714478, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.2532, + "step": 3344 + }, + { + "epoch": 0.6701361088871097, + "grad_norm": 7.57047176361084, + "learning_rate": 1.9953544325158755e-05, + "loss": 0.6205, + "step": 3346 + }, + { + "epoch": 0.6701361088871097, + "grad_norm": 0.6709346175193787, + "learning_rate": 1.995488010273198e-05, + "loss": 0.1262, + "step": 3348 + }, + { + "epoch": 0.6709367493995196, + "grad_norm": 4.095053195953369, + "learning_rate": 1.9956196438208693e-05, + "loss": 0.5722, + "step": 3350 + }, + { + "epoch": 0.6709367493995196, + "grad_norm": 3.511445999145508, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.1855, + "step": 3352 + }, + { + "epoch": 0.6717373899119295, + "grad_norm": 1.17584228515625, + "learning_rate": 1.9958770772627236e-05, + "loss": 0.1089, + "step": 3354 + }, + { + "epoch": 0.6717373899119295, + "grad_norm": 2.8529558181762695, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.1515, + "step": 3356 + }, + { + "epoch": 0.6725380304243395, + "grad_norm": 5.495058059692383, + "learning_rate": 1.9961267308303473e-05, + "loss": 0.3329, + "step": 3358 + }, + { + "epoch": 0.6725380304243395, + "grad_norm": 3.322465419769287, + "learning_rate": 1.996248639549475e-05, + "loss": 0.1698, + "step": 3360 + }, + { + "epoch": 0.6733386709367494, + "grad_norm": 2.7710249423980713, + "learning_rate": 1.9963686025734262e-05, + "loss": 0.3387, + "step": 3362 + }, + { + "epoch": 0.6733386709367494, + "grad_norm": 1.4199219942092896, + "learning_rate": 1.9964866196679105e-05, + "loss": 0.1578, + "step": 3364 + }, + { + "epoch": 0.6741393114491593, + "grad_norm": 2.0797665119171143, + "learning_rate": 1.9966026906024377e-05, + "loss": 0.2275, + "step": 3366 + }, + { + "epoch": 0.6741393114491593, + "grad_norm": 1.277516484260559, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.1036, + "step": 3368 + }, + { + "epoch": 0.6749399519615693, + "grad_norm": 0.712367057800293, + "learning_rate": 1.9968289930886675e-05, + "loss": 0.1402, + "step": 3370 + }, + { + "epoch": 0.6749399519615693, + "grad_norm": 0.9500345587730408, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.0819, + "step": 3372 + }, + { + "epoch": 0.6757405924739792, + "grad_norm": 3.673982858657837, + "learning_rate": 1.997047508264221e-05, + "loss": 0.4345, + "step": 3374 + }, + { + "epoch": 0.6757405924739792, + "grad_norm": 6.356472969055176, + "learning_rate": 1.997153845074662e-05, + "loss": 0.2664, + "step": 3376 + }, + { + "epoch": 0.6765412329863891, + "grad_norm": 5.5342607498168945, + "learning_rate": 1.99725823442204e-05, + "loss": 0.5749, + "step": 3378 + }, + { + "epoch": 0.6765412329863891, + "grad_norm": 1.8448833227157593, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.1166, + "step": 3380 + }, + { + "epoch": 0.677341873498799, + "grad_norm": 0.5776091814041138, + "learning_rate": 1.9974611699159142e-05, + "loss": 0.0497, + "step": 3382 + }, + { + "epoch": 0.677341873498799, + "grad_norm": 2.9415884017944336, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0954, + "step": 3384 + }, + { + "epoch": 0.678142514011209, + "grad_norm": 5.971640110015869, + "learning_rate": 1.9976563131604945e-05, + "loss": 0.2396, + "step": 3386 + }, + { + "epoch": 0.678142514011209, + "grad_norm": 2.4153146743774414, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.2588, + "step": 3388 + }, + { + "epoch": 0.6789431545236189, + "grad_norm": 4.5485148429870605, + "learning_rate": 1.9978436626313065e-05, + "loss": 0.6162, + "step": 3390 + }, + { + "epoch": 0.6789431545236189, + "grad_norm": 1.9180742502212524, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.1516, + "step": 3392 + }, + { + "epoch": 0.6797437950360288, + "grad_norm": 1.402932047843933, + "learning_rate": 1.99802321686476e-05, + "loss": 0.4575, + "step": 3394 + }, + { + "epoch": 0.6797437950360288, + "grad_norm": 2.1443257331848145, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.4041, + "step": 3396 + }, + { + "epoch": 0.6805444355484388, + "grad_norm": 5.4469194412231445, + "learning_rate": 1.9981949744581622e-05, + "loss": 0.5085, + "step": 3398 + }, + { + "epoch": 0.6805444355484388, + "grad_norm": 0.653738796710968, + "learning_rate": 1.998277929093157e-05, + "loss": 0.078, + "step": 3400 + }, + { + "epoch": 0.6813450760608487, + "grad_norm": 5.865527153015137, + "learning_rate": 1.9983589340697288e-05, + "loss": 0.5277, + "step": 3402 + }, + { + "epoch": 0.6813450760608487, + "grad_norm": 0.6982697248458862, + "learning_rate": 1.998437989229673e-05, + "loss": 0.0547, + "step": 3404 + }, + { + "epoch": 0.6821457165732586, + "grad_norm": 1.5117623805999756, + "learning_rate": 1.998515094418594e-05, + "loss": 0.1091, + "step": 3406 + }, + { + "epoch": 0.6821457165732586, + "grad_norm": 1.4895625114440918, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.1608, + "step": 3408 + }, + { + "epoch": 0.6829463570856685, + "grad_norm": 1.8693584203720093, + "learning_rate": 1.99866345428482e-05, + "loss": 0.2272, + "step": 3410 + }, + { + "epoch": 0.6829463570856685, + "grad_norm": 1.0386098623275757, + "learning_rate": 1.998734708672375e-05, + "loss": 0.0818, + "step": 3412 + }, + { + "epoch": 0.6837469975980784, + "grad_norm": 3.2201948165893555, + "learning_rate": 1.998804012509407e-05, + "loss": 0.3817, + "step": 3414 + }, + { + "epoch": 0.6837469975980784, + "grad_norm": 2.477696657180786, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.2527, + "step": 3416 + }, + { + "epoch": 0.6845476381104884, + "grad_norm": 2.7365195751190186, + "learning_rate": 1.9989367679943025e-05, + "loss": 0.2527, + "step": 3418 + }, + { + "epoch": 0.6845476381104884, + "grad_norm": 3.402339220046997, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.2582, + "step": 3420 + }, + { + "epoch": 0.6853482786228983, + "grad_norm": 3.8126344680786133, + "learning_rate": 1.9990617197024103e-05, + "loss": 0.5975, + "step": 3422 + }, + { + "epoch": 0.6853482786228983, + "grad_norm": 6.469859600067139, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.2424, + "step": 3424 + }, + { + "epoch": 0.6861489191353083, + "grad_norm": 7.670634746551514, + "learning_rate": 1.999178866657597e-05, + "loss": 0.6257, + "step": 3426 + }, + { + "epoch": 0.6861489191353083, + "grad_norm": 6.907395839691162, + "learning_rate": 1.9992345130644747e-05, + "loss": 0.1686, + "step": 3428 + }, + { + "epoch": 0.6869495596477182, + "grad_norm": 1.9114350080490112, + "learning_rate": 1.999288207944701e-05, + "loss": 0.308, + "step": 3430 + }, + { + "epoch": 0.6869495596477182, + "grad_norm": 2.290929079055786, + "learning_rate": 1.999339951193407e-05, + "loss": 0.1714, + "step": 3432 + }, + { + "epoch": 0.6877502001601281, + "grad_norm": 3.496532678604126, + "learning_rate": 1.999389742709538e-05, + "loss": 0.282, + "step": 3434 + }, + { + "epoch": 0.6877502001601281, + "grad_norm": 0.8751980066299438, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.1441, + "step": 3436 + }, + { + "epoch": 0.688550840672538, + "grad_norm": 0.18534532189369202, + "learning_rate": 1.9994834701589113e-05, + "loss": 0.4851, + "step": 3438 + }, + { + "epoch": 0.688550840672538, + "grad_norm": 0.9383301734924316, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.1372, + "step": 3440 + }, + { + "epoch": 0.6893514811849479, + "grad_norm": 3.8426496982574463, + "learning_rate": 1.999569389560614e-05, + "loss": 0.4748, + "step": 3442 + }, + { + "epoch": 0.6893514811849479, + "grad_norm": 1.7869377136230469, + "learning_rate": 1.999609421031453e-05, + "loss": 0.1828, + "step": 3444 + }, + { + "epoch": 0.6901521216973578, + "grad_norm": 3.9936063289642334, + "learning_rate": 1.9996475002434365e-05, + "loss": 0.6589, + "step": 3446 + }, + { + "epoch": 0.6901521216973578, + "grad_norm": 1.927378535270691, + "learning_rate": 1.999683627122195e-05, + "loss": 0.2462, + "step": 3448 + }, + { + "epoch": 0.6909527622097679, + "grad_norm": 1.7020918130874634, + "learning_rate": 1.999717801597172e-05, + "loss": 0.4396, + "step": 3450 + }, + { + "epoch": 0.6909527622097679, + "grad_norm": 1.733694076538086, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.2288, + "step": 3452 + }, + { + "epoch": 0.6917534027221778, + "grad_norm": 1.6692661046981812, + "learning_rate": 1.9997802930726195e-05, + "loss": 0.4264, + "step": 3454 + }, + { + "epoch": 0.6917534027221778, + "grad_norm": 2.4237611293792725, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.2562, + "step": 3456 + }, + { + "epoch": 0.6925540432345877, + "grad_norm": 0.642198383808136, + "learning_rate": 1.9998349741815916e-05, + "loss": 0.0573, + "step": 3458 + }, + { + "epoch": 0.6925540432345877, + "grad_norm": 2.3655800819396973, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.1827, + "step": 3460 + }, + { + "epoch": 0.6933546837469976, + "grad_norm": 2.866231679916382, + "learning_rate": 1.999881844496914e-05, + "loss": 0.3358, + "step": 3462 + }, + { + "epoch": 0.6933546837469976, + "grad_norm": 0.8496546745300293, + "learning_rate": 1.99990235049015e-05, + "loss": 0.128, + "step": 3464 + }, + { + "epoch": 0.6941553242594075, + "grad_norm": 6.452030181884766, + "learning_rate": 1.9999209036524326e-05, + "loss": 0.6258, + "step": 3466 + }, + { + "epoch": 0.6941553242594075, + "grad_norm": 1.5651286840438843, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.2689, + "step": 3468 + }, + { + "epoch": 0.6949559647718174, + "grad_norm": 1.011326789855957, + "learning_rate": 1.999952151343014e-05, + "loss": 0.3485, + "step": 3470 + }, + { + "epoch": 0.6949559647718174, + "grad_norm": 2.012007713317871, + "learning_rate": 1.999964845810285e-05, + "loss": 0.2388, + "step": 3472 + }, + { + "epoch": 0.6957566052842273, + "grad_norm": 3.229433059692383, + "learning_rate": 1.9999755873245484e-05, + "loss": 0.5839, + "step": 3474 + }, + { + "epoch": 0.6957566052842273, + "grad_norm": 1.3683396577835083, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.1765, + "step": 3476 + }, + { + "epoch": 0.6965572457966374, + "grad_norm": 5.020935535430908, + "learning_rate": 1.999991211413952e-05, + "loss": 0.4717, + "step": 3478 + }, + { + "epoch": 0.6965572457966374, + "grad_norm": 1.844724416732788, + "learning_rate": 1.999996093958578e-05, + "loss": 0.2312, + "step": 3480 + }, + { + "epoch": 0.6973578863090473, + "grad_norm": 5.299980640411377, + "learning_rate": 1.9999990234891677e-05, + "loss": 0.5322, + "step": 3482 + }, + { + "epoch": 0.6973578863090473, + "grad_norm": 1.536635398864746, + "learning_rate": 2e-05, + "loss": 0.115, + "step": 3484 + }, + { + "epoch": 0.6981585268214572, + "grad_norm": 4.766470432281494, + "learning_rate": 1.999999023489168e-05, + "loss": 0.3659, + "step": 3486 + }, + { + "epoch": 0.6981585268214572, + "grad_norm": 2.565600872039795, + "learning_rate": 1.999996093958578e-05, + "loss": 0.2714, + "step": 3488 + }, + { + "epoch": 0.6989591673338671, + "grad_norm": 2.3254637718200684, + "learning_rate": 1.999991211413952e-05, + "loss": 0.2142, + "step": 3490 + }, + { + "epoch": 0.6989591673338671, + "grad_norm": 1.2487372159957886, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.1272, + "step": 3492 + }, + { + "epoch": 0.699759807846277, + "grad_norm": 1.2230582237243652, + "learning_rate": 1.9999755873245484e-05, + "loss": 0.6108, + "step": 3494 + }, + { + "epoch": 0.699759807846277, + "grad_norm": 5.416754722595215, + "learning_rate": 1.999964845810285e-05, + "loss": 0.216, + "step": 3496 + }, + { + "epoch": 0.7005604483586869, + "grad_norm": 1.9604777097702026, + "learning_rate": 1.999952151343014e-05, + "loss": 0.2379, + "step": 3498 + }, + { + "epoch": 0.7005604483586869, + "grad_norm": 1.3195626735687256, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.2227, + "step": 3500 + }, + { + "epoch": 0.7013610888710968, + "grad_norm": 3.3220343589782715, + "learning_rate": 1.9999209036524326e-05, + "loss": 0.3349, + "step": 3502 + }, + { + "epoch": 0.7013610888710968, + "grad_norm": 4.593979835510254, + "learning_rate": 1.99990235049015e-05, + "loss": 0.1695, + "step": 3504 + }, + { + "epoch": 0.7021617293835068, + "grad_norm": 0.5859458446502686, + "learning_rate": 1.999881844496914e-05, + "loss": 0.0691, + "step": 3506 + }, + { + "epoch": 0.7021617293835068, + "grad_norm": 8.339945793151855, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.19, + "step": 3508 + }, + { + "epoch": 0.7029623698959168, + "grad_norm": 4.157390594482422, + "learning_rate": 1.9998349741815916e-05, + "loss": 0.5323, + "step": 3510 + }, + { + "epoch": 0.7029623698959168, + "grad_norm": 0.7790760397911072, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.0512, + "step": 3512 + }, + { + "epoch": 0.7037630104083267, + "grad_norm": 5.302454471588135, + "learning_rate": 1.9997802930726195e-05, + "loss": 0.8322, + "step": 3514 + }, + { + "epoch": 0.7037630104083267, + "grad_norm": 0.9014292359352112, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.107, + "step": 3516 + }, + { + "epoch": 0.7045636509207366, + "grad_norm": 1.6731563806533813, + "learning_rate": 1.999717801597172e-05, + "loss": 0.3048, + "step": 3518 + }, + { + "epoch": 0.7045636509207366, + "grad_norm": 1.35103178024292, + "learning_rate": 1.999683627122195e-05, + "loss": 0.0897, + "step": 3520 + }, + { + "epoch": 0.7053642914331465, + "grad_norm": 1.2724040746688843, + "learning_rate": 1.9996475002434365e-05, + "loss": 0.3501, + "step": 3522 + }, + { + "epoch": 0.7053642914331465, + "grad_norm": 0.9164237380027771, + "learning_rate": 1.999609421031453e-05, + "loss": 0.1284, + "step": 3524 + }, + { + "epoch": 0.7061649319455564, + "grad_norm": 2.2598679065704346, + "learning_rate": 1.999569389560614e-05, + "loss": 0.5202, + "step": 3526 + }, + { + "epoch": 0.7061649319455564, + "grad_norm": 3.098735809326172, + "learning_rate": 1.999527405909102e-05, + "loss": 0.2923, + "step": 3528 + }, + { + "epoch": 0.7069655724579663, + "grad_norm": 6.341969966888428, + "learning_rate": 1.9994834701589113e-05, + "loss": 0.5575, + "step": 3530 + }, + { + "epoch": 0.7069655724579663, + "grad_norm": 1.0890014171600342, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.1078, + "step": 3532 + }, + { + "epoch": 0.7077662129703763, + "grad_norm": 1.7618839740753174, + "learning_rate": 1.9993897427095378e-05, + "loss": 0.3418, + "step": 3534 + }, + { + "epoch": 0.7077662129703763, + "grad_norm": 3.084141731262207, + "learning_rate": 1.999339951193407e-05, + "loss": 0.2586, + "step": 3536 + }, + { + "epoch": 0.7085668534827863, + "grad_norm": 9.997249603271484, + "learning_rate": 1.999288207944701e-05, + "loss": 0.4383, + "step": 3538 + }, + { + "epoch": 0.7085668534827863, + "grad_norm": 4.620125770568848, + "learning_rate": 1.999234513064475e-05, + "loss": 0.2158, + "step": 3540 + }, + { + "epoch": 0.7093674939951962, + "grad_norm": 1.859646201133728, + "learning_rate": 1.999178866657597e-05, + "loss": 0.3818, + "step": 3542 + }, + { + "epoch": 0.7093674939951962, + "grad_norm": 5.292738437652588, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.2714, + "step": 3544 + }, + { + "epoch": 0.7101681345076061, + "grad_norm": 1.4272756576538086, + "learning_rate": 1.9990617197024103e-05, + "loss": 0.23, + "step": 3546 + }, + { + "epoch": 0.7101681345076061, + "grad_norm": 2.260686159133911, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.2982, + "step": 3548 + }, + { + "epoch": 0.710968775020016, + "grad_norm": 0.508266031742096, + "learning_rate": 1.998936767994303e-05, + "loss": 0.0794, + "step": 3550 + }, + { + "epoch": 0.710968775020016, + "grad_norm": 1.8599307537078857, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.1545, + "step": 3552 + }, + { + "epoch": 0.7117694155324259, + "grad_norm": 4.1258697509765625, + "learning_rate": 1.998804012509407e-05, + "loss": 0.2786, + "step": 3554 + }, + { + "epoch": 0.7117694155324259, + "grad_norm": 8.66682243347168, + "learning_rate": 1.998734708672375e-05, + "loss": 0.2538, + "step": 3556 + }, + { + "epoch": 0.7125700560448359, + "grad_norm": 13.872123718261719, + "learning_rate": 1.99866345428482e-05, + "loss": 0.3036, + "step": 3558 + }, + { + "epoch": 0.7125700560448359, + "grad_norm": 0.5863029956817627, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.0839, + "step": 3560 + }, + { + "epoch": 0.7133706965572458, + "grad_norm": 1.5728942155838013, + "learning_rate": 1.998515094418594e-05, + "loss": 0.0978, + "step": 3562 + }, + { + "epoch": 0.7133706965572458, + "grad_norm": 2.1630947589874268, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.1985, + "step": 3564 + }, + { + "epoch": 0.7141713370696557, + "grad_norm": 6.5758233070373535, + "learning_rate": 1.9983589340697288e-05, + "loss": 0.5337, + "step": 3566 + }, + { + "epoch": 0.7141713370696557, + "grad_norm": 0.550474226474762, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.1266, + "step": 3568 + }, + { + "epoch": 0.7149719775820657, + "grad_norm": 0.14604683220386505, + "learning_rate": 1.9981949744581622e-05, + "loss": 0.102, + "step": 3570 + }, + { + "epoch": 0.7149719775820657, + "grad_norm": 0.9340583086013794, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.1365, + "step": 3572 + }, + { + "epoch": 0.7157726180944756, + "grad_norm": 6.124854564666748, + "learning_rate": 1.99802321686476e-05, + "loss": 0.3893, + "step": 3574 + }, + { + "epoch": 0.7157726180944756, + "grad_norm": 2.7216408252716064, + "learning_rate": 1.997934414241799e-05, + "loss": 0.2907, + "step": 3576 + }, + { + "epoch": 0.7165732586068855, + "grad_norm": 1.9788362979888916, + "learning_rate": 1.9978436626313068e-05, + "loss": 0.5624, + "step": 3578 + }, + { + "epoch": 0.7165732586068855, + "grad_norm": 1.7733237743377686, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.0817, + "step": 3580 + }, + { + "epoch": 0.7173738991192954, + "grad_norm": 5.567836761474609, + "learning_rate": 1.997656313160495e-05, + "loss": 0.335, + "step": 3582 + }, + { + "epoch": 0.7173738991192954, + "grad_norm": 6.661506175994873, + "learning_rate": 1.997559715666073e-05, + "loss": 0.1983, + "step": 3584 + }, + { + "epoch": 0.7181745396317054, + "grad_norm": 8.47241497039795, + "learning_rate": 1.9974611699159142e-05, + "loss": 0.7288, + "step": 3586 + }, + { + "epoch": 0.7181745396317054, + "grad_norm": 0.9757118225097656, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.2496, + "step": 3588 + }, + { + "epoch": 0.7189751801441153, + "grad_norm": 3.987398624420166, + "learning_rate": 1.99725823442204e-05, + "loss": 0.1877, + "step": 3590 + }, + { + "epoch": 0.7189751801441153, + "grad_norm": 1.101069688796997, + "learning_rate": 1.997153845074662e-05, + "loss": 0.1282, + "step": 3592 + }, + { + "epoch": 0.7197758206565252, + "grad_norm": 6.1263017654418945, + "learning_rate": 1.9970475082642212e-05, + "loss": 0.4653, + "step": 3594 + }, + { + "epoch": 0.7197758206565252, + "grad_norm": 2.4558095932006836, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.1665, + "step": 3596 + }, + { + "epoch": 0.7205764611689351, + "grad_norm": 2.907579183578491, + "learning_rate": 1.9968289930886675e-05, + "loss": 0.0941, + "step": 3598 + }, + { + "epoch": 0.7205764611689351, + "grad_norm": 0.7757080793380737, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.0455, + "step": 3600 + }, + { + "epoch": 0.7213771016813451, + "grad_norm": 1.5536539554595947, + "learning_rate": 1.9966026906024377e-05, + "loss": 0.4461, + "step": 3602 + }, + { + "epoch": 0.7213771016813451, + "grad_norm": 0.9385843276977539, + "learning_rate": 1.996486619667911e-05, + "loss": 0.2845, + "step": 3604 + }, + { + "epoch": 0.722177742193755, + "grad_norm": 1.3489714860916138, + "learning_rate": 1.9963686025734262e-05, + "loss": 0.0954, + "step": 3606 + }, + { + "epoch": 0.722177742193755, + "grad_norm": 0.3645288646221161, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.102, + "step": 3608 + }, + { + "epoch": 0.7229783827061649, + "grad_norm": 5.252882957458496, + "learning_rate": 1.9961267308303473e-05, + "loss": 0.5137, + "step": 3610 + }, + { + "epoch": 0.7229783827061649, + "grad_norm": 10.87562084197998, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.2522, + "step": 3612 + }, + { + "epoch": 0.7237790232185749, + "grad_norm": 3.775437831878662, + "learning_rate": 1.9958770772627236e-05, + "loss": 0.2528, + "step": 3614 + }, + { + "epoch": 0.7237790232185749, + "grad_norm": 4.813497066497803, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.3755, + "step": 3616 + }, + { + "epoch": 0.7245796637309848, + "grad_norm": 2.8341798782348633, + "learning_rate": 1.9956196438208693e-05, + "loss": 0.2904, + "step": 3618 + }, + { + "epoch": 0.7245796637309848, + "grad_norm": 2.42409348487854, + "learning_rate": 1.995488010273198e-05, + "loss": 0.1183, + "step": 3620 + }, + { + "epoch": 0.7253803042433947, + "grad_norm": 1.028183102607727, + "learning_rate": 1.9953544325158755e-05, + "loss": 0.185, + "step": 3622 + }, + { + "epoch": 0.7253803042433947, + "grad_norm": 1.4447120428085327, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.1608, + "step": 3624 + }, + { + "epoch": 0.7261809447558046, + "grad_norm": 8.240716934204102, + "learning_rate": 1.9950814454195953e-05, + "loss": 0.471, + "step": 3626 + }, + { + "epoch": 0.7261809447558046, + "grad_norm": 0.680472195148468, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.072, + "step": 3628 + }, + { + "epoch": 0.7269815852682145, + "grad_norm": 1.092917561531067, + "learning_rate": 1.9948006846646262e-05, + "loss": 0.2038, + "step": 3630 + }, + { + "epoch": 0.7269815852682145, + "grad_norm": 0.3755256235599518, + "learning_rate": 1.994657389848176e-05, + "loss": 0.0427, + "step": 3632 + }, + { + "epoch": 0.7277822257806245, + "grad_norm": 5.257049560546875, + "learning_rate": 1.9945121524442947e-05, + "loss": 0.6136, + "step": 3634 + }, + { + "epoch": 0.7277822257806245, + "grad_norm": 0.129800945520401, + "learning_rate": 1.994364972736634e-05, + "loss": 0.0443, + "step": 3636 + }, + { + "epoch": 0.7285828662930345, + "grad_norm": 5.304008483886719, + "learning_rate": 1.9942158510126384e-05, + "loss": 0.704, + "step": 3638 + }, + { + "epoch": 0.7285828662930345, + "grad_norm": 1.296195387840271, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.2637, + "step": 3640 + }, + { + "epoch": 0.7293835068054444, + "grad_norm": 20.882789611816406, + "learning_rate": 1.9939117826843887e-05, + "loss": 0.758, + "step": 3642 + }, + { + "epoch": 0.7293835068054444, + "grad_norm": 4.087749481201172, + "learning_rate": 1.993756836673986e-05, + "loss": 0.2872, + "step": 3644 + }, + { + "epoch": 0.7301841473178543, + "grad_norm": 13.557066917419434, + "learning_rate": 1.9935999498349525e-05, + "loss": 0.778, + "step": 3646 + }, + { + "epoch": 0.7301841473178543, + "grad_norm": 3.781580686569214, + "learning_rate": 1.99344112247369e-05, + "loss": 0.4292, + "step": 3648 + }, + { + "epoch": 0.7309847878302642, + "grad_norm": 9.510293960571289, + "learning_rate": 1.9932803549003932e-05, + "loss": 0.3415, + "step": 3650 + }, + { + "epoch": 0.7309847878302642, + "grad_norm": 3.6086361408233643, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.1407, + "step": 3652 + }, + { + "epoch": 0.7317854283426741, + "grad_norm": 7.780667781829834, + "learning_rate": 1.9929530003774136e-05, + "loss": 0.5098, + "step": 3654 + }, + { + "epoch": 0.7317854283426741, + "grad_norm": 8.015007019042969, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.1708, + "step": 3656 + }, + { + "epoch": 0.732586068855084, + "grad_norm": 3.186073064804077, + "learning_rate": 1.9926178888233344e-05, + "loss": 0.2273, + "step": 3658 + }, + { + "epoch": 0.732586068855084, + "grad_norm": 1.322554111480713, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.1423, + "step": 3660 + }, + { + "epoch": 0.733386709367494, + "grad_norm": 3.9222300052642822, + "learning_rate": 1.9922750228560746e-05, + "loss": 0.5509, + "step": 3662 + }, + { + "epoch": 0.733386709367494, + "grad_norm": 2.252254009246826, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.2073, + "step": 3664 + }, + { + "epoch": 0.734187349879904, + "grad_norm": 9.622203826904297, + "learning_rate": 1.9919244051541315e-05, + "loss": 0.2758, + "step": 3666 + }, + { + "epoch": 0.734187349879904, + "grad_norm": 2.5079874992370605, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.2848, + "step": 3668 + }, + { + "epoch": 0.7349879903923139, + "grad_norm": 3.8380258083343506, + "learning_rate": 1.9915660384565603e-05, + "loss": 0.3051, + "step": 3670 + }, + { + "epoch": 0.7349879903923139, + "grad_norm": 2.39021372795105, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.1278, + "step": 3672 + }, + { + "epoch": 0.7357886309047238, + "grad_norm": 1.283332347869873, + "learning_rate": 1.9911999255629504e-05, + "loss": 0.231, + "step": 3674 + }, + { + "epoch": 0.7357886309047238, + "grad_norm": 0.40193960070610046, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.102, + "step": 3676 + }, + { + "epoch": 0.7365892714171337, + "grad_norm": 4.273895263671875, + "learning_rate": 1.990826069333406e-05, + "loss": 0.511, + "step": 3678 + }, + { + "epoch": 0.7365892714171337, + "grad_norm": 1.3366891145706177, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.1733, + "step": 3680 + }, + { + "epoch": 0.7373899119295436, + "grad_norm": 5.002992153167725, + "learning_rate": 1.9904444726885236e-05, + "loss": 0.1647, + "step": 3682 + }, + { + "epoch": 0.7373899119295436, + "grad_norm": 1.2469110488891602, + "learning_rate": 1.9902507726395524e-05, + "loss": 0.1267, + "step": 3684 + }, + { + "epoch": 0.7381905524419535, + "grad_norm": 3.7404067516326904, + "learning_rate": 1.9900551386093677e-05, + "loss": 0.3189, + "step": 3686 + }, + { + "epoch": 0.7381905524419535, + "grad_norm": 2.4834072589874268, + "learning_rate": 1.989857570980049e-05, + "loss": 0.2638, + "step": 3688 + }, + { + "epoch": 0.7389911929543634, + "grad_norm": 1.070765495300293, + "learning_rate": 1.9896580701374482e-05, + "loss": 0.222, + "step": 3690 + }, + { + "epoch": 0.7389911929543634, + "grad_norm": 2.5947673320770264, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0381, + "step": 3692 + }, + { + "epoch": 0.7397918334667735, + "grad_norm": 3.993537425994873, + "learning_rate": 1.9892532703746977e-05, + "loss": 0.2905, + "step": 3694 + }, + { + "epoch": 0.7397918334667735, + "grad_norm": 0.6930636763572693, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.1303, + "step": 3696 + }, + { + "epoch": 0.7405924739791834, + "grad_norm": 6.0091705322265625, + "learning_rate": 1.9888407424834437e-05, + "loss": 0.8112, + "step": 3698 + }, + { + "epoch": 0.7405924739791834, + "grad_norm": 8.96474552154541, + "learning_rate": 1.988631581494365e-05, + "loss": 0.4382, + "step": 3700 + }, + { + "epoch": 0.7413931144915933, + "grad_norm": 3.9382643699645996, + "learning_rate": 1.9884204896863895e-05, + "loss": 0.5187, + "step": 3702 + }, + { + "epoch": 0.7413931144915933, + "grad_norm": 3.3014440536499023, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.3033, + "step": 3704 + }, + { + "epoch": 0.7421937550040032, + "grad_norm": 1.3698487281799316, + "learning_rate": 1.9879925152665845e-05, + "loss": 0.1697, + "step": 3706 + }, + { + "epoch": 0.7421937550040032, + "grad_norm": 1.2222024202346802, + "learning_rate": 1.987775633490599e-05, + "loss": 0.0686, + "step": 3708 + }, + { + "epoch": 0.7429943955164131, + "grad_norm": 12.103696823120117, + "learning_rate": 1.9875568225674005e-05, + "loss": 0.2762, + "step": 3710 + }, + { + "epoch": 0.7429943955164131, + "grad_norm": 1.8887919187545776, + "learning_rate": 1.987336082924333e-05, + "loss": 0.0596, + "step": 3712 + }, + { + "epoch": 0.743795036028823, + "grad_norm": 2.2258410453796387, + "learning_rate": 1.987113414992505e-05, + "loss": 0.4156, + "step": 3714 + }, + { + "epoch": 0.743795036028823, + "grad_norm": 2.362119197845459, + "learning_rate": 1.986888819206792e-05, + "loss": 0.1715, + "step": 3716 + }, + { + "epoch": 0.7445956765412329, + "grad_norm": 6.44940185546875, + "learning_rate": 1.986662296005834e-05, + "loss": 0.5354, + "step": 3718 + }, + { + "epoch": 0.7445956765412329, + "grad_norm": 1.9616944789886475, + "learning_rate": 1.986433845832037e-05, + "loss": 0.2775, + "step": 3720 + }, + { + "epoch": 0.745396317053643, + "grad_norm": 2.132735013961792, + "learning_rate": 1.9862034691315678e-05, + "loss": 0.1141, + "step": 3722 + }, + { + "epoch": 0.745396317053643, + "grad_norm": 0.7432072758674622, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.1825, + "step": 3724 + }, + { + "epoch": 0.7461969575660529, + "grad_norm": 3.108621597290039, + "learning_rate": 1.9857369379540985e-05, + "loss": 0.2163, + "step": 3726 + }, + { + "epoch": 0.7461969575660529, + "grad_norm": 2.6190829277038574, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.2069, + "step": 3728 + }, + { + "epoch": 0.7469975980784628, + "grad_norm": 2.666851758956909, + "learning_rate": 1.985262706118007e-05, + "loss": 0.2612, + "step": 3730 + }, + { + "epoch": 0.7469975980784628, + "grad_norm": 1.0152792930603027, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.0894, + "step": 3732 + }, + { + "epoch": 0.7477982385908727, + "grad_norm": 1.685050129890442, + "learning_rate": 1.9847807773280314e-05, + "loss": 0.5226, + "step": 3734 + }, + { + "epoch": 0.7477982385908727, + "grad_norm": 0.05818062648177147, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.0991, + "step": 3736 + }, + { + "epoch": 0.7485988791032826, + "grad_norm": 8.969691276550293, + "learning_rate": 1.9842911553490396e-05, + "loss": 1.3194, + "step": 3738 + }, + { + "epoch": 0.7485988791032826, + "grad_norm": 3.6384401321411133, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.3924, + "step": 3740 + }, + { + "epoch": 0.7493995196156925, + "grad_norm": 4.435880661010742, + "learning_rate": 1.983793844005999e-05, + "loss": 0.3098, + "step": 3742 + }, + { + "epoch": 0.7493995196156925, + "grad_norm": 2.0602378845214844, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.1515, + "step": 3744 + }, + { + "epoch": 0.7502001601281025, + "grad_norm": 4.728958606719971, + "learning_rate": 1.9832888471839475e-05, + "loss": 0.6146, + "step": 3746 + }, + { + "epoch": 0.7502001601281025, + "grad_norm": 4.3305840492248535, + "learning_rate": 1.983033467948784e-05, + "loss": 0.3697, + "step": 3748 + }, + { + "epoch": 0.7510008006405124, + "grad_norm": 4.002319812774658, + "learning_rate": 1.9827761688279613e-05, + "loss": 0.339, + "step": 3750 + }, + { + "epoch": 0.7510008006405124, + "grad_norm": 2.359212875366211, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.1585, + "step": 3752 + }, + { + "epoch": 0.7518014411529224, + "grad_norm": 3.0198311805725098, + "learning_rate": 1.9822558129431263e-05, + "loss": 0.298, + "step": 3754 + }, + { + "epoch": 0.7518014411529224, + "grad_norm": 3.044529676437378, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.2762, + "step": 3756 + }, + { + "epoch": 0.7526020816653323, + "grad_norm": 1.946054458618164, + "learning_rate": 1.981727783594506e-05, + "loss": 0.1077, + "step": 3758 + }, + { + "epoch": 0.7526020816653323, + "grad_norm": 0.5331586003303528, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.0723, + "step": 3760 + }, + { + "epoch": 0.7534027221777422, + "grad_norm": 3.217803955078125, + "learning_rate": 1.9811920849071092e-05, + "loss": 0.4161, + "step": 3762 + }, + { + "epoch": 0.7534027221777422, + "grad_norm": 2.459212303161621, + "learning_rate": 1.980921360866819e-05, + "loss": 0.2505, + "step": 3764 + }, + { + "epoch": 0.7542033626901521, + "grad_norm": 4.551301002502441, + "learning_rate": 1.980648721065859e-05, + "loss": 0.475, + "step": 3766 + }, + { + "epoch": 0.7542033626901521, + "grad_norm": 3.3786633014678955, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.1823, + "step": 3768 + }, + { + "epoch": 0.755004003202562, + "grad_norm": 0.8134878277778625, + "learning_rate": 1.980097696315558e-05, + "loss": 0.0592, + "step": 3770 + }, + { + "epoch": 0.755004003202562, + "grad_norm": 3.469376564025879, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.1143, + "step": 3772 + }, + { + "epoch": 0.755804643714972, + "grad_norm": 3.5760271549224854, + "learning_rate": 1.979539014960858e-05, + "loss": 0.1957, + "step": 3774 + }, + { + "epoch": 0.755804643714972, + "grad_norm": 0.9199708700180054, + "learning_rate": 1.979256804418418e-05, + "loss": 0.195, + "step": 3776 + }, + { + "epoch": 0.7566052842273819, + "grad_norm": 4.32065486907959, + "learning_rate": 1.9789726813662233e-05, + "loss": 0.5965, + "step": 3778 + }, + { + "epoch": 0.7566052842273819, + "grad_norm": 0.6332926154136658, + "learning_rate": 1.978686646359173e-05, + "loss": 0.0966, + "step": 3780 + }, + { + "epoch": 0.7574059247397918, + "grad_norm": 3.9632043838500977, + "learning_rate": 1.9783986999558994e-05, + "loss": 0.4714, + "step": 3782 + }, + { + "epoch": 0.7574059247397918, + "grad_norm": 4.317626953125, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.2505, + "step": 3784 + }, + { + "epoch": 0.7582065652522018, + "grad_norm": 2.513829469680786, + "learning_rate": 1.9778170752138763e-05, + "loss": 0.2297, + "step": 3786 + }, + { + "epoch": 0.7582065652522018, + "grad_norm": 1.435736060142517, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.095, + "step": 3788 + }, + { + "epoch": 0.7590072057646117, + "grad_norm": 2.9074862003326416, + "learning_rate": 1.9772278116838546e-05, + "loss": 0.2164, + "step": 3790 + }, + { + "epoch": 0.7590072057646117, + "grad_norm": 1.617477297782898, + "learning_rate": 1.976930316809569e-05, + "loss": 0.1114, + "step": 3792 + }, + { + "epoch": 0.7598078462770216, + "grad_norm": 3.273162603378296, + "learning_rate": 1.97663091396921e-05, + "loss": 0.6029, + "step": 3794 + }, + { + "epoch": 0.7598078462770216, + "grad_norm": 0.3599972128868103, + "learning_rate": 1.9763296037475177e-05, + "loss": 0.0775, + "step": 3796 + }, + { + "epoch": 0.7606084867894315, + "grad_norm": 5.136079788208008, + "learning_rate": 1.976026386732957e-05, + "loss": 0.5703, + "step": 3798 + }, + { + "epoch": 0.7606084867894315, + "grad_norm": 2.607485055923462, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.2279, + "step": 3800 + }, + { + "epoch": 0.7614091273018415, + "grad_norm": 3.238182544708252, + "learning_rate": 1.9754142346977122e-05, + "loss": 0.1403, + "step": 3802 + }, + { + "epoch": 0.7614091273018415, + "grad_norm": 0.668150007724762, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.0887, + "step": 3804 + }, + { + "epoch": 0.7622097678142514, + "grad_norm": 2.8943252563476562, + "learning_rate": 1.9747944626456577e-05, + "loss": 0.3632, + "step": 3806 + }, + { + "epoch": 0.7622097678142514, + "grad_norm": 1.5986707210540771, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.1669, + "step": 3808 + }, + { + "epoch": 0.7630104083266613, + "grad_norm": 6.4006876945495605, + "learning_rate": 1.9741670754185054e-05, + "loss": 0.6576, + "step": 3810 + }, + { + "epoch": 0.7630104083266613, + "grad_norm": 4.9398884773254395, + "learning_rate": 1.9738505276435695e-05, + "loss": 0.1955, + "step": 3812 + }, + { + "epoch": 0.7638110488390712, + "grad_norm": 7.481172561645508, + "learning_rate": 1.9735320779174548e-05, + "loss": 1.0954, + "step": 3814 + }, + { + "epoch": 0.7638110488390712, + "grad_norm": 4.712623596191406, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.2102, + "step": 3816 + }, + { + "epoch": 0.7646116893514812, + "grad_norm": 5.762992858886719, + "learning_rate": 1.9728894751031595e-05, + "loss": 0.3798, + "step": 3818 + }, + { + "epoch": 0.7646116893514812, + "grad_norm": 1.1963118314743042, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.173, + "step": 3820 + }, + { + "epoch": 0.7654123298638911, + "grad_norm": 1.3511197566986084, + "learning_rate": 1.972239271995686e-05, + "loss": 0.264, + "step": 3822 + }, + { + "epoch": 0.7654123298638911, + "grad_norm": 1.7166374921798706, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.1575, + "step": 3824 + }, + { + "epoch": 0.7662129703763011, + "grad_norm": 10.516679763793945, + "learning_rate": 1.9715814736744758e-05, + "loss": 0.3345, + "step": 3826 + }, + { + "epoch": 0.7662129703763011, + "grad_norm": 2.7944583892822266, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.3329, + "step": 3828 + }, + { + "epoch": 0.767013610888711, + "grad_norm": 5.933850288391113, + "learning_rate": 1.9709160852783022e-05, + "loss": 0.2105, + "step": 3830 + }, + { + "epoch": 0.767013610888711, + "grad_norm": 1.307266116142273, + "learning_rate": 1.970580546424186e-05, + "loss": 0.0922, + "step": 3832 + }, + { + "epoch": 0.7678142514011209, + "grad_norm": 4.155867099761963, + "learning_rate": 1.9702431120052352e-05, + "loss": 0.1103, + "step": 3834 + }, + { + "epoch": 0.7678142514011209, + "grad_norm": 1.3520747423171997, + "learning_rate": 1.969903782680467e-05, + "loss": 0.1358, + "step": 3836 + }, + { + "epoch": 0.7686148919135308, + "grad_norm": 3.2739996910095215, + "learning_rate": 1.9695625591125984e-05, + "loss": 0.4837, + "step": 3838 + }, + { + "epoch": 0.7686148919135308, + "grad_norm": 0.7425373196601868, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.2359, + "step": 3840 + }, + { + "epoch": 0.7694155324259407, + "grad_norm": 4.643808841705322, + "learning_rate": 1.968874431916926e-05, + "loss": 0.2259, + "step": 3842 + }, + { + "epoch": 0.7694155324259407, + "grad_norm": 2.800455093383789, + "learning_rate": 1.96852752963305e-05, + "loss": 0.0797, + "step": 3844 + }, + { + "epoch": 0.7702161729383507, + "grad_norm": 2.4526994228363037, + "learning_rate": 1.9681787357939257e-05, + "loss": 0.3343, + "step": 3846 + }, + { + "epoch": 0.7702161729383507, + "grad_norm": 1.5955270528793335, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.0814, + "step": 3848 + }, + { + "epoch": 0.7710168134507606, + "grad_norm": 2.30515456199646, + "learning_rate": 1.9674754761784334e-05, + "loss": 0.2376, + "step": 3850 + }, + { + "epoch": 0.7710168134507606, + "grad_norm": 0.2819700241088867, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.1087, + "step": 3852 + }, + { + "epoch": 0.7718174539631706, + "grad_norm": 9.822226524353027, + "learning_rate": 1.9667646585643706e-05, + "loss": 0.4163, + "step": 3854 + }, + { + "epoch": 0.7718174539631706, + "grad_norm": 5.362023830413818, + "learning_rate": 1.966406417240872e-05, + "loss": 0.3329, + "step": 3856 + }, + { + "epoch": 0.7726180944755805, + "grad_norm": 5.430173873901367, + "learning_rate": 1.966046288504704e-05, + "loss": 0.2723, + "step": 3858 + }, + { + "epoch": 0.7726180944755805, + "grad_norm": 0.14677520096302032, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.0828, + "step": 3860 + }, + { + "epoch": 0.7734187349879904, + "grad_norm": 5.093488693237305, + "learning_rate": 1.965320371611399e-05, + "loss": 0.2713, + "step": 3862 + }, + { + "epoch": 0.7734187349879904, + "grad_norm": 10.066317558288574, + "learning_rate": 1.964954584871995e-05, + "loss": 0.1766, + "step": 3864 + }, + { + "epoch": 0.7742193755004003, + "grad_norm": 3.6446893215179443, + "learning_rate": 1.964586913555381e-05, + "loss": 0.4609, + "step": 3866 + }, + { + "epoch": 0.7742193755004003, + "grad_norm": 3.108360767364502, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.2066, + "step": 3868 + }, + { + "epoch": 0.7750200160128102, + "grad_norm": 11.169960975646973, + "learning_rate": 1.9638459200664822e-05, + "loss": 0.559, + "step": 3870 + }, + { + "epoch": 0.7750200160128102, + "grad_norm": 8.972526550292969, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.3643, + "step": 3872 + }, + { + "epoch": 0.7758206565252201, + "grad_norm": 13.28365707397461, + "learning_rate": 1.9630973969334068e-05, + "loss": 0.4182, + "step": 3874 + }, + { + "epoch": 0.7758206565252201, + "grad_norm": 0.06249832734465599, + "learning_rate": 1.9627203135753576e-05, + "loss": 0.1648, + "step": 3876 + }, + { + "epoch": 0.77662129703763, + "grad_norm": 4.370636463165283, + "learning_rate": 1.9623413500036795e-05, + "loss": 0.2322, + "step": 3878 + }, + { + "epoch": 0.77662129703763, + "grad_norm": 2.740384578704834, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.1215, + "step": 3880 + }, + { + "epoch": 0.7774219375500401, + "grad_norm": 8.083786010742188, + "learning_rate": 1.9615777851836007e-05, + "loss": 0.5904, + "step": 3882 + }, + { + "epoch": 0.7774219375500401, + "grad_norm": 3.0525734424591064, + "learning_rate": 1.961193185426459e-05, + "loss": 0.1512, + "step": 3884 + }, + { + "epoch": 0.77822257806245, + "grad_norm": 0.7152694463729858, + "learning_rate": 1.9608067084382025e-05, + "loss": 0.0367, + "step": 3886 + }, + { + "epoch": 0.77822257806245, + "grad_norm": 0.2191184163093567, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.0756, + "step": 3888 + }, + { + "epoch": 0.7790232185748599, + "grad_norm": 6.865670680999756, + "learning_rate": 1.9600281257912002e-05, + "loss": 0.8803, + "step": 3890 + }, + { + "epoch": 0.7790232185748599, + "grad_norm": 0.10013610869646072, + "learning_rate": 1.959636021653044e-05, + "loss": 0.0812, + "step": 3892 + }, + { + "epoch": 0.7798238590872698, + "grad_norm": 9.476913452148438, + "learning_rate": 1.9592420433249465e-05, + "loss": 0.8296, + "step": 3894 + }, + { + "epoch": 0.7798238590872698, + "grad_norm": 3.9510371685028076, + "learning_rate": 1.958846191576357e-05, + "loss": 0.1758, + "step": 3896 + }, + { + "epoch": 0.7806244995996797, + "grad_norm": 6.593127250671387, + "learning_rate": 1.958448467180382e-05, + "loss": 1.0322, + "step": 3898 + }, + { + "epoch": 0.7806244995996797, + "grad_norm": 1.6347088813781738, + "learning_rate": 1.958048870913786e-05, + "loss": 0.2294, + "step": 3900 + }, + { + "epoch": 0.7814251401120896, + "grad_norm": 1.689170241355896, + "learning_rate": 1.9576474035569895e-05, + "loss": 0.2327, + "step": 3902 + }, + { + "epoch": 0.7814251401120896, + "grad_norm": 5.172220230102539, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.0908, + "step": 3904 + }, + { + "epoch": 0.7822257806244995, + "grad_norm": 1.653778076171875, + "learning_rate": 1.9568388587127448e-05, + "loss": 0.6591, + "step": 3906 + }, + { + "epoch": 0.7822257806244995, + "grad_norm": 4.496364116668701, + "learning_rate": 1.9564317828044022e-05, + "loss": 0.2292, + "step": 3908 + }, + { + "epoch": 0.7830264211369096, + "grad_norm": 4.258923053741455, + "learning_rate": 1.9560228389640668e-05, + "loss": 0.404, + "step": 3910 + }, + { + "epoch": 0.7830264211369096, + "grad_norm": 3.3120033740997314, + "learning_rate": 1.955612027990415e-05, + "loss": 0.1683, + "step": 3912 + }, + { + "epoch": 0.7838270616493195, + "grad_norm": 3.3662068843841553, + "learning_rate": 1.955199350685769e-05, + "loss": 0.6128, + "step": 3914 + }, + { + "epoch": 0.7838270616493195, + "grad_norm": 3.126394748687744, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.1955, + "step": 3916 + }, + { + "epoch": 0.7846277021617294, + "grad_norm": 3.094393253326416, + "learning_rate": 1.954368400311011e-05, + "loss": 0.5204, + "step": 3918 + }, + { + "epoch": 0.7846277021617294, + "grad_norm": 1.532977819442749, + "learning_rate": 1.953950128863763e-05, + "loss": 0.1501, + "step": 3920 + }, + { + "epoch": 0.7854283426741393, + "grad_norm": 5.69983434677124, + "learning_rate": 1.9535299943312455e-05, + "loss": 0.4328, + "step": 3922 + }, + { + "epoch": 0.7854283426741393, + "grad_norm": 1.675588607788086, + "learning_rate": 1.9531079975339915e-05, + "loss": 0.1274, + "step": 3924 + }, + { + "epoch": 0.7862289831865492, + "grad_norm": 0.29629939794540405, + "learning_rate": 1.9526841392961694e-05, + "loss": 0.0601, + "step": 3926 + }, + { + "epoch": 0.7862289831865492, + "grad_norm": 0.9080084562301636, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.0566, + "step": 3928 + }, + { + "epoch": 0.7870296236989591, + "grad_norm": 1.2637550830841064, + "learning_rate": 1.9518308418136728e-05, + "loss": 0.4773, + "step": 3930 + }, + { + "epoch": 0.7870296236989591, + "grad_norm": 1.6753275394439697, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.0689, + "step": 3932 + }, + { + "epoch": 0.7878302642113691, + "grad_norm": 4.526984214782715, + "learning_rate": 1.9509701085497852e-05, + "loss": 0.5968, + "step": 3934 + }, + { + "epoch": 0.7878302642113691, + "grad_norm": 2.4850504398345947, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.2402, + "step": 3936 + }, + { + "epoch": 0.7886309047237791, + "grad_norm": 3.779095411300659, + "learning_rate": 1.9501019462286266e-05, + "loss": 0.5457, + "step": 3938 + }, + { + "epoch": 0.7886309047237791, + "grad_norm": 2.7290821075439453, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.2278, + "step": 3940 + }, + { + "epoch": 0.789431545236189, + "grad_norm": 6.2700581550598145, + "learning_rate": 1.9492263616323536e-05, + "loss": 0.4925, + "step": 3942 + }, + { + "epoch": 0.789431545236189, + "grad_norm": 1.0756261348724365, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.2072, + "step": 3944 + }, + { + "epoch": 0.7902321857485989, + "grad_norm": 2.7320897579193115, + "learning_rate": 1.948343361601105e-05, + "loss": 0.2277, + "step": 3946 + }, + { + "epoch": 0.7902321857485989, + "grad_norm": 3.579179048538208, + "learning_rate": 1.947899082950751e-05, + "loss": 0.2238, + "step": 3948 + }, + { + "epoch": 0.7910328262610088, + "grad_norm": 2.995529890060425, + "learning_rate": 1.947452953032951e-05, + "loss": 0.3037, + "step": 3950 + }, + { + "epoch": 0.7910328262610088, + "grad_norm": 1.0693085193634033, + "learning_rate": 1.947004972719008e-05, + "loss": 0.0978, + "step": 3952 + }, + { + "epoch": 0.7918334667734187, + "grad_norm": 1.4225482940673828, + "learning_rate": 1.9465551428838363e-05, + "loss": 0.2095, + "step": 3954 + }, + { + "epoch": 0.7918334667734187, + "grad_norm": 4.264603137969971, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.1361, + "step": 3956 + }, + { + "epoch": 0.7926341072858286, + "grad_norm": 0.8472277522087097, + "learning_rate": 1.9456499381675285e-05, + "loss": 0.1378, + "step": 3958 + }, + { + "epoch": 0.7926341072858286, + "grad_norm": 0.07638305425643921, + "learning_rate": 1.945194565054276e-05, + "loss": 0.0735, + "step": 3960 + }, + { + "epoch": 0.7934347477982386, + "grad_norm": 2.322277069091797, + "learning_rate": 1.9447373459555617e-05, + "loss": 0.1981, + "step": 3962 + }, + { + "epoch": 0.7934347477982386, + "grad_norm": 1.682759404182434, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.0536, + "step": 3964 + }, + { + "epoch": 0.7942353883106485, + "grad_norm": 7.871078968048096, + "learning_rate": 1.9438173733771814e-05, + "loss": 0.8039, + "step": 3966 + }, + { + "epoch": 0.7942353883106485, + "grad_norm": 0.13976740837097168, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.1629, + "step": 3968 + }, + { + "epoch": 0.7950360288230585, + "grad_norm": 7.626214504241943, + "learning_rate": 1.9428900276192903e-05, + "loss": 0.6161, + "step": 3970 + }, + { + "epoch": 0.7950360288230585, + "grad_norm": 4.12851095199585, + "learning_rate": 1.942423592059687e-05, + "loss": 0.3329, + "step": 3972 + }, + { + "epoch": 0.7958366693354684, + "grad_norm": 3.2808837890625, + "learning_rate": 1.94195531592639e-05, + "loss": 0.2324, + "step": 3974 + }, + { + "epoch": 0.7958366693354684, + "grad_norm": 1.5002015829086304, + "learning_rate": 1.941485200133955e-05, + "loss": 0.0879, + "step": 3976 + }, + { + "epoch": 0.7966373098478783, + "grad_norm": 2.6623218059539795, + "learning_rate": 1.9410132456005262e-05, + "loss": 0.3187, + "step": 3978 + }, + { + "epoch": 0.7966373098478783, + "grad_norm": 1.2459443807601929, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.0734, + "step": 3980 + }, + { + "epoch": 0.7974379503602882, + "grad_norm": 2.6879022121429443, + "learning_rate": 1.94006382400123e-05, + "loss": 0.3447, + "step": 3982 + }, + { + "epoch": 0.7974379503602882, + "grad_norm": 2.133051633834839, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.1031, + "step": 3984 + }, + { + "epoch": 0.7982385908726981, + "grad_norm": 5.9016594886779785, + "learning_rate": 1.939107058545461e-05, + "loss": 0.3327, + "step": 3986 + }, + { + "epoch": 0.7982385908726981, + "grad_norm": 0.587170422077179, + "learning_rate": 1.938625924204888e-05, + "loss": 0.0925, + "step": 3988 + }, + { + "epoch": 0.7990392313851081, + "grad_norm": 9.500762939453125, + "learning_rate": 1.9381429567075507e-05, + "loss": 0.763, + "step": 3990 + }, + { + "epoch": 0.7990392313851081, + "grad_norm": 14.768190383911133, + "learning_rate": 1.937658156996694e-05, + "loss": 0.5698, + "step": 3992 + }, + { + "epoch": 0.799839871897518, + "grad_norm": 0.844855546951294, + "learning_rate": 1.9371715260191425e-05, + "loss": 0.066, + "step": 3994 + }, + { + "epoch": 0.799839871897518, + "grad_norm": 0.4304735064506531, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.0366, + "step": 3996 + }, + { + "epoch": 0.800640512409928, + "grad_norm": 0.24239462614059448, + "learning_rate": 1.936192774069133e-05, + "loss": 0.1535, + "step": 3998 + }, + { + "epoch": 0.800640512409928, + "grad_norm": 5.14066743850708, + "learning_rate": 1.9357006550082e-05, + "loss": 0.1802, + "step": 4000 + }, + { + "epoch": 0.8014411529223379, + "grad_norm": 6.906435489654541, + "learning_rate": 1.9352067085036145e-05, + "loss": 0.6827, + "step": 4002 + }, + { + "epoch": 0.8014411529223379, + "grad_norm": 3.075216770172119, + "learning_rate": 1.9347109355200676e-05, + "loss": 0.1073, + "step": 4004 + }, + { + "epoch": 0.8022417934347478, + "grad_norm": 2.60537052154541, + "learning_rate": 1.9342133370258124e-05, + "loss": 0.167, + "step": 4006 + }, + { + "epoch": 0.8022417934347478, + "grad_norm": 0.4861251413822174, + "learning_rate": 1.933713913992671e-05, + "loss": 0.0317, + "step": 4008 + }, + { + "epoch": 0.8030424339471577, + "grad_norm": 3.1748709678649902, + "learning_rate": 1.9332126673960276e-05, + "loss": 0.4548, + "step": 4010 + }, + { + "epoch": 0.8030424339471577, + "grad_norm": 0.6028808951377869, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.1449, + "step": 4012 + }, + { + "epoch": 0.8038430744595677, + "grad_norm": 1.5240721702575684, + "learning_rate": 1.932204707431572e-05, + "loss": 0.3923, + "step": 4014 + }, + { + "epoch": 0.8038430744595677, + "grad_norm": 6.58589506149292, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.2712, + "step": 4016 + }, + { + "epoch": 0.8046437149719776, + "grad_norm": 3.9160521030426025, + "learning_rate": 1.9311894650067146e-05, + "loss": 1.1145, + "step": 4018 + }, + { + "epoch": 0.8046437149719776, + "grad_norm": 4.43244743347168, + "learning_rate": 1.9306791153479017e-05, + "loss": 0.3659, + "step": 4020 + }, + { + "epoch": 0.8054443554843875, + "grad_norm": 8.40818977355957, + "learning_rate": 1.9301669480526118e-05, + "loss": 0.6901, + "step": 4022 + }, + { + "epoch": 0.8054443554843875, + "grad_norm": 1.5860828161239624, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.2701, + "step": 4024 + }, + { + "epoch": 0.8062449959967974, + "grad_norm": 6.084012985229492, + "learning_rate": 1.929137164557252e-05, + "loss": 0.4654, + "step": 4026 + }, + { + "epoch": 0.8062449959967974, + "grad_norm": 1.9051108360290527, + "learning_rate": 1.928619550368371e-05, + "loss": 0.1712, + "step": 4028 + }, + { + "epoch": 0.8070456365092074, + "grad_norm": 4.623058319091797, + "learning_rate": 1.9281001225653883e-05, + "loss": 0.4661, + "step": 4030 + }, + { + "epoch": 0.8070456365092074, + "grad_norm": 2.973703145980835, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.2105, + "step": 4032 + }, + { + "epoch": 0.8078462770216173, + "grad_norm": 0.5655364394187927, + "learning_rate": 1.9270558301784808e-05, + "loss": 0.1005, + "step": 4034 + }, + { + "epoch": 0.8078462770216173, + "grad_norm": 1.5539458990097046, + "learning_rate": 1.9265309676340783e-05, + "loss": 0.1166, + "step": 4036 + }, + { + "epoch": 0.8086469175340272, + "grad_norm": 5.289125919342041, + "learning_rate": 1.9260042955546247e-05, + "loss": 0.4507, + "step": 4038 + }, + { + "epoch": 0.8086469175340272, + "grad_norm": 1.0275965929031372, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.2438, + "step": 4040 + }, + { + "epoch": 0.8094475580464372, + "grad_norm": 5.605642318725586, + "learning_rate": 1.9249455269084972e-05, + "loss": 0.4496, + "step": 4042 + }, + { + "epoch": 0.8094475580464372, + "grad_norm": 5.4897847175598145, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.2935, + "step": 4044 + }, + { + "epoch": 0.8102481985588471, + "grad_norm": 4.178155422210693, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.1997, + "step": 4046 + }, + { + "epoch": 0.8102481985588471, + "grad_norm": 4.780753135681152, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.2939, + "step": 4048 + }, + { + "epoch": 0.811048839071257, + "grad_norm": 2.229004383087158, + "learning_rate": 1.9228063206906302e-05, + "loss": 0.2291, + "step": 4050 + }, + { + "epoch": 0.811048839071257, + "grad_norm": 0.6525901556015015, + "learning_rate": 1.9222670108643156e-05, + "loss": 0.0794, + "step": 4052 + }, + { + "epoch": 0.8118494795836669, + "grad_norm": 5.27366828918457, + "learning_rate": 1.9217258998305464e-05, + "loss": 0.7226, + "step": 4054 + }, + { + "epoch": 0.8118494795836669, + "grad_norm": 0.8718607425689697, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.1078, + "step": 4056 + }, + { + "epoch": 0.8126501200960768, + "grad_norm": 6.0721893310546875, + "learning_rate": 1.9206382783713735e-05, + "loss": 1.0324, + "step": 4058 + }, + { + "epoch": 0.8126501200960768, + "grad_norm": 3.7400641441345215, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.4326, + "step": 4060 + }, + { + "epoch": 0.8134507606084868, + "grad_norm": 1.947516918182373, + "learning_rate": 1.9195434648097013e-05, + "loss": 0.1915, + "step": 4062 + }, + { + "epoch": 0.8134507606084868, + "grad_norm": 0.5172745585441589, + "learning_rate": 1.918993363660975e-05, + "loss": 0.1037, + "step": 4064 + }, + { + "epoch": 0.8142514011208967, + "grad_norm": 1.3781492710113525, + "learning_rate": 1.9184414676983013e-05, + "loss": 0.1666, + "step": 4066 + }, + { + "epoch": 0.8142514011208967, + "grad_norm": 1.409956932067871, + "learning_rate": 1.9178877779995416e-05, + "loss": 0.2224, + "step": 4068 + }, + { + "epoch": 0.8150520416333067, + "grad_norm": 3.40158748626709, + "learning_rate": 1.9173322956460678e-05, + "loss": 0.4155, + "step": 4070 + }, + { + "epoch": 0.8150520416333067, + "grad_norm": 2.1078975200653076, + "learning_rate": 1.916775021722745e-05, + "loss": 0.3047, + "step": 4072 + }, + { + "epoch": 0.8158526821457166, + "grad_norm": 2.7200639247894287, + "learning_rate": 1.9162159573179446e-05, + "loss": 0.2423, + "step": 4074 + }, + { + "epoch": 0.8158526821457166, + "grad_norm": 1.8931523561477661, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.1714, + "step": 4076 + }, + { + "epoch": 0.8166533226581265, + "grad_norm": 3.82817006111145, + "learning_rate": 1.915092461434859e-05, + "loss": 0.4274, + "step": 4078 + }, + { + "epoch": 0.8166533226581265, + "grad_norm": 1.7903690338134766, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.1886, + "step": 4080 + }, + { + "epoch": 0.8174539631705364, + "grad_norm": 5.318288803100586, + "learning_rate": 1.9139618167736547e-05, + "loss": 0.6775, + "step": 4082 + }, + { + "epoch": 0.8174539631705364, + "grad_norm": 2.358879327774048, + "learning_rate": 1.9133938164092942e-05, + "loss": 0.1756, + "step": 4084 + }, + { + "epoch": 0.8182546036829463, + "grad_norm": 8.168492317199707, + "learning_rate": 1.912824032167022e-05, + "loss": 0.4714, + "step": 4086 + }, + { + "epoch": 0.8182546036829463, + "grad_norm": 3.081153631210327, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.3974, + "step": 4088 + }, + { + "epoch": 0.8190552441953562, + "grad_norm": 3.3065969944000244, + "learning_rate": 1.911679116503426e-05, + "loss": 0.2781, + "step": 4090 + }, + { + "epoch": 0.8190552441953562, + "grad_norm": 2.8107573986053467, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.1478, + "step": 4092 + }, + { + "epoch": 0.8198558847077662, + "grad_norm": 6.291133403778076, + "learning_rate": 1.9105270787270446e-05, + "loss": 0.5193, + "step": 4094 + }, + { + "epoch": 0.8198558847077662, + "grad_norm": 3.3825061321258545, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.3757, + "step": 4096 + }, + { + "epoch": 0.8206565252201762, + "grad_norm": 6.831535816192627, + "learning_rate": 1.9093679278376913e-05, + "loss": 0.2156, + "step": 4098 + }, + { + "epoch": 0.8206565252201762, + "grad_norm": 9.504416465759277, + "learning_rate": 1.90878568780329e-05, + "loss": 0.3092, + "step": 4100 + }, + { + "epoch": 0.8214571657325861, + "grad_norm": 1.0027188062667847, + "learning_rate": 1.90820167289075e-05, + "loss": 0.2187, + "step": 4102 + }, + { + "epoch": 0.8214571657325861, + "grad_norm": 1.8388631343841553, + "learning_rate": 1.907615884240668e-05, + "loss": 0.1951, + "step": 4104 + }, + { + "epoch": 0.822257806244996, + "grad_norm": 1.5271998643875122, + "learning_rate": 1.9070283229971003e-05, + "loss": 0.3556, + "step": 4106 + }, + { + "epoch": 0.822257806244996, + "grad_norm": 0.8319740295410156, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.1399, + "step": 4108 + }, + { + "epoch": 0.8230584467574059, + "grad_norm": 3.888397693634033, + "learning_rate": 1.9058478873230487e-05, + "loss": 0.4495, + "step": 4110 + }, + { + "epoch": 0.8230584467574059, + "grad_norm": 2.0186636447906494, + "learning_rate": 1.905255015197982e-05, + "loss": 0.1573, + "step": 4112 + }, + { + "epoch": 0.8238590872698158, + "grad_norm": 2.8909239768981934, + "learning_rate": 1.9046603750902585e-05, + "loss": 0.2237, + "step": 4114 + }, + { + "epoch": 0.8238590872698158, + "grad_norm": 4.778850555419922, + "learning_rate": 1.9040639681612216e-05, + "loss": 0.51, + "step": 4116 + }, + { + "epoch": 0.8246597277822257, + "grad_norm": 4.81637716293335, + "learning_rate": 1.9034657955756702e-05, + "loss": 0.2506, + "step": 4118 + }, + { + "epoch": 0.8246597277822257, + "grad_norm": 1.5094337463378906, + "learning_rate": 1.902865858501845e-05, + "loss": 0.1067, + "step": 4120 + }, + { + "epoch": 0.8254603682946358, + "grad_norm": 4.91127872467041, + "learning_rate": 1.9022641581114396e-05, + "loss": 0.3034, + "step": 4122 + }, + { + "epoch": 0.8254603682946358, + "grad_norm": 2.1556341648101807, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.1617, + "step": 4124 + }, + { + "epoch": 0.8262610088070457, + "grad_norm": 2.643125057220459, + "learning_rate": 1.901055472084858e-05, + "loss": 0.4226, + "step": 4126 + }, + { + "epoch": 0.8262610088070457, + "grad_norm": 4.4631147384643555, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.2902, + "step": 4128 + }, + { + "epoch": 0.8270616493194556, + "grad_norm": 2.3088715076446533, + "learning_rate": 1.8998397469382812e-05, + "loss": 0.3332, + "step": 4130 + }, + { + "epoch": 0.8270616493194556, + "grad_norm": 1.289046287536621, + "learning_rate": 1.8992292476607695e-05, + "loss": 0.0797, + "step": 4132 + }, + { + "epoch": 0.8278622898318655, + "grad_norm": 2.61971116065979, + "learning_rate": 1.898616992169054e-05, + "loss": 0.3041, + "step": 4134 + }, + { + "epoch": 0.8278622898318655, + "grad_norm": 2.6609787940979004, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.1522, + "step": 4136 + }, + { + "epoch": 0.8286629303442754, + "grad_norm": 6.440356731414795, + "learning_rate": 1.89738721732944e-05, + "loss": 0.2527, + "step": 4138 + }, + { + "epoch": 0.8286629303442754, + "grad_norm": 0.53935706615448, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.0761, + "step": 4140 + }, + { + "epoch": 0.8294635708566853, + "grad_norm": 3.784952402114868, + "learning_rate": 1.8961504320265392e-05, + "loss": 0.3442, + "step": 4142 + }, + { + "epoch": 0.8294635708566853, + "grad_norm": 1.6281559467315674, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.1023, + "step": 4144 + }, + { + "epoch": 0.8302642113690952, + "grad_norm": 1.3377023935317993, + "learning_rate": 1.8949066459222224e-05, + "loss": 0.2951, + "step": 4146 + }, + { + "epoch": 0.8302642113690952, + "grad_norm": 2.442554235458374, + "learning_rate": 1.8942821306038227e-05, + "loss": 0.0931, + "step": 4148 + }, + { + "epoch": 0.8310648518815053, + "grad_norm": 1.2032828330993652, + "learning_rate": 1.8936558687330492e-05, + "loss": 0.1255, + "step": 4150 + }, + { + "epoch": 0.8310648518815053, + "grad_norm": 1.0676536560058594, + "learning_rate": 1.893027861533003e-05, + "loss": 0.083, + "step": 4152 + }, + { + "epoch": 0.8318654923939152, + "grad_norm": 0.6579300761222839, + "learning_rate": 1.8923981102301944e-05, + "loss": 0.0746, + "step": 4154 + }, + { + "epoch": 0.8318654923939152, + "grad_norm": 3.157085657119751, + "learning_rate": 1.891766616054545e-05, + "loss": 0.0965, + "step": 4156 + }, + { + "epoch": 0.8326661329063251, + "grad_norm": 9.201611518859863, + "learning_rate": 1.8911333802393725e-05, + "loss": 0.8063, + "step": 4158 + }, + { + "epoch": 0.8326661329063251, + "grad_norm": 3.266364574432373, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.2929, + "step": 4160 + }, + { + "epoch": 0.833466773418735, + "grad_norm": 8.91170883178711, + "learning_rate": 1.8898616886407588e-05, + "loss": 0.1701, + "step": 4162 + }, + { + "epoch": 0.833466773418735, + "grad_norm": 0.1476612687110901, + "learning_rate": 1.8892232353409582e-05, + "loss": 0.0842, + "step": 4164 + }, + { + "epoch": 0.8342674139311449, + "grad_norm": 1.3913941383361816, + "learning_rate": 1.8885830453689146e-05, + "loss": 0.1442, + "step": 4166 + }, + { + "epoch": 0.8342674139311449, + "grad_norm": 1.2783845663070679, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.1091, + "step": 4168 + }, + { + "epoch": 0.8350680544435548, + "grad_norm": 8.897526741027832, + "learning_rate": 1.8872974604127038e-05, + "loss": 0.8793, + "step": 4170 + }, + { + "epoch": 0.8350680544435548, + "grad_norm": 4.882668972015381, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.2544, + "step": 4172 + }, + { + "epoch": 0.8358686949559647, + "grad_norm": 3.751826763153076, + "learning_rate": 1.8860049438152247e-05, + "loss": 0.3973, + "step": 4174 + }, + { + "epoch": 0.8358686949559647, + "grad_norm": 0.04675702750682831, + "learning_rate": 1.885356089304285e-05, + "loss": 0.0045, + "step": 4176 + }, + { + "epoch": 0.8366693354683747, + "grad_norm": 3.0347652435302734, + "learning_rate": 1.8847055056737236e-05, + "loss": 0.3834, + "step": 4178 + }, + { + "epoch": 0.8366693354683747, + "grad_norm": 0.2955886125564575, + "learning_rate": 1.884053194194143e-05, + "loss": 0.0735, + "step": 4180 + }, + { + "epoch": 0.8374699759807847, + "grad_norm": 1.5000163316726685, + "learning_rate": 1.8833991561395194e-05, + "loss": 0.8193, + "step": 4182 + }, + { + "epoch": 0.8374699759807847, + "grad_norm": 3.0538628101348877, + "learning_rate": 1.882743392787207e-05, + "loss": 0.2289, + "step": 4184 + }, + { + "epoch": 0.8382706164931946, + "grad_norm": 0.2960163652896881, + "learning_rate": 1.8820859054179225e-05, + "loss": 0.1824, + "step": 4186 + }, + { + "epoch": 0.8382706164931946, + "grad_norm": 0.512634813785553, + "learning_rate": 1.881426695315756e-05, + "loss": 0.1031, + "step": 4188 + }, + { + "epoch": 0.8390712570056045, + "grad_norm": 2.0235650539398193, + "learning_rate": 1.8807657637681577e-05, + "loss": 0.222, + "step": 4190 + }, + { + "epoch": 0.8390712570056045, + "grad_norm": 1.1077826023101807, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.1175, + "step": 4192 + }, + { + "epoch": 0.8398718975180144, + "grad_norm": 1.8474644422531128, + "learning_rate": 1.8794387415032783e-05, + "loss": 0.2857, + "step": 4194 + }, + { + "epoch": 0.8398718975180144, + "grad_norm": 0.7331463694572449, + "learning_rate": 1.8787726533777003e-05, + "loss": 0.0354, + "step": 4196 + }, + { + "epoch": 0.8406725380304243, + "grad_norm": 1.9994101524353027, + "learning_rate": 1.8781048489900936e-05, + "loss": 0.2709, + "step": 4198 + }, + { + "epoch": 0.8406725380304243, + "grad_norm": 6.4489850997924805, + "learning_rate": 1.877435329644691e-05, + "loss": 0.5083, + "step": 4200 + }, + { + "epoch": 0.8414731785428343, + "grad_norm": 7.723071098327637, + "learning_rate": 1.876764096649082e-05, + "loss": 0.7548, + "step": 4202 + }, + { + "epoch": 0.8414731785428343, + "grad_norm": 3.1097967624664307, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.1609, + "step": 4204 + }, + { + "epoch": 0.8422738190552442, + "grad_norm": 3.140307903289795, + "learning_rate": 1.8754164949543123e-05, + "loss": 0.1153, + "step": 4206 + }, + { + "epoch": 0.8422738190552442, + "grad_norm": 0.1402135193347931, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.1119, + "step": 4208 + }, + { + "epoch": 0.8430744595676541, + "grad_norm": 2.1135613918304443, + "learning_rate": 1.8740620544333604e-05, + "loss": 0.2158, + "step": 4210 + }, + { + "epoch": 0.8430744595676541, + "grad_norm": 0.2727619409561157, + "learning_rate": 1.8733822729175455e-05, + "loss": 0.0417, + "step": 4212 + }, + { + "epoch": 0.8438751000800641, + "grad_norm": 6.628386497497559, + "learning_rate": 1.872700785667228e-05, + "loss": 0.2528, + "step": 4214 + }, + { + "epoch": 0.8438751000800641, + "grad_norm": 1.814210057258606, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.0935, + "step": 4216 + }, + { + "epoch": 0.844675740592474, + "grad_norm": 6.386908054351807, + "learning_rate": 1.8713326992902612e-05, + "loss": 0.1477, + "step": 4218 + }, + { + "epoch": 0.844675740592474, + "grad_norm": 0.7650297284126282, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.0411, + "step": 4220 + }, + { + "epoch": 0.8454763811048839, + "grad_norm": 16.5892276763916, + "learning_rate": 1.8699578059900604e-05, + "loss": 0.5706, + "step": 4222 + }, + { + "epoch": 0.8454763811048839, + "grad_norm": 6.4100341796875, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.3077, + "step": 4224 + }, + { + "epoch": 0.8462770216172938, + "grad_norm": 3.6774182319641113, + "learning_rate": 1.868576116507408e-05, + "loss": 0.3726, + "step": 4226 + }, + { + "epoch": 0.8462770216172938, + "grad_norm": 0.7024332284927368, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.0831, + "step": 4228 + }, + { + "epoch": 0.8470776621297038, + "grad_norm": 7.781497001647949, + "learning_rate": 1.8671876416361767e-05, + "loss": 0.6259, + "step": 4230 + }, + { + "epoch": 0.8470776621297038, + "grad_norm": 1.9756361246109009, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.1497, + "step": 4232 + }, + { + "epoch": 0.8478783026421137, + "grad_norm": 2.0836331844329834, + "learning_rate": 1.8657923922232467e-05, + "loss": 0.2464, + "step": 4234 + }, + { + "epoch": 0.8478783026421137, + "grad_norm": 0.7938516736030579, + "learning_rate": 1.86509223046777e-05, + "loss": 0.0934, + "step": 4236 + }, + { + "epoch": 0.8486789431545236, + "grad_norm": 5.330477714538574, + "learning_rate": 1.8643903791684228e-05, + "loss": 1.0395, + "step": 4238 + }, + { + "epoch": 0.8486789431545236, + "grad_norm": 4.775360107421875, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.1384, + "step": 4240 + }, + { + "epoch": 0.8494795836669335, + "grad_norm": 6.992191314697266, + "learning_rate": 1.8629816134243466e-05, + "loss": 0.6367, + "step": 4242 + }, + { + "epoch": 0.8494795836669335, + "grad_norm": 6.897693157196045, + "learning_rate": 1.8622747017309676e-05, + "loss": 0.5288, + "step": 4244 + }, + { + "epoch": 0.8502802241793435, + "grad_norm": 3.5853328704833984, + "learning_rate": 1.8615661059964148e-05, + "loss": 0.3042, + "step": 4246 + }, + { + "epoch": 0.8502802241793435, + "grad_norm": 1.2119287252426147, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.1336, + "step": 4248 + }, + { + "epoch": 0.8510808646917534, + "grad_norm": 3.5143659114837646, + "learning_rate": 1.860143867942685e-05, + "loss": 0.3698, + "step": 4250 + }, + { + "epoch": 0.8510808646917534, + "grad_norm": 1.4036415815353394, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.1139, + "step": 4252 + }, + { + "epoch": 0.8518815052041633, + "grad_norm": 3.1153054237365723, + "learning_rate": 1.8587149103738006e-05, + "loss": 0.1332, + "step": 4254 + }, + { + "epoch": 0.8518815052041633, + "grad_norm": 0.10829364508390427, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.1347, + "step": 4256 + }, + { + "epoch": 0.8526821457165733, + "grad_norm": 4.164417266845703, + "learning_rate": 1.8572792444528963e-05, + "loss": 0.4713, + "step": 4258 + }, + { + "epoch": 0.8526821457165733, + "grad_norm": 1.020261526107788, + "learning_rate": 1.8565588993632498e-05, + "loss": 0.1819, + "step": 4260 + }, + { + "epoch": 0.8534827862289832, + "grad_norm": 12.097563743591309, + "learning_rate": 1.8558368813955136e-05, + "loss": 0.4811, + "step": 4262 + }, + { + "epoch": 0.8534827862289832, + "grad_norm": 1.0365298986434937, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.0648, + "step": 4264 + }, + { + "epoch": 0.8542834267413931, + "grad_norm": 3.474332332611084, + "learning_rate": 1.854387832469512e-05, + "loss": 0.4187, + "step": 4266 + }, + { + "epoch": 0.8542834267413931, + "grad_norm": 0.7461860179901123, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.1301, + "step": 4268 + }, + { + "epoch": 0.855084067253803, + "grad_norm": 2.8793938159942627, + "learning_rate": 1.8529321089949833e-05, + "loss": 0.2629, + "step": 4270 + }, + { + "epoch": 0.855084067253803, + "grad_norm": 23.552536010742188, + "learning_rate": 1.852201747853807e-05, + "loss": 0.2615, + "step": 4272 + }, + { + "epoch": 0.855884707766213, + "grad_norm": 4.63593864440918, + "learning_rate": 1.8514697223441565e-05, + "loss": 0.5317, + "step": 4274 + }, + { + "epoch": 0.855884707766213, + "grad_norm": 3.3408241271972656, + "learning_rate": 1.8507360338956896e-05, + "loss": 0.2509, + "step": 4276 + }, + { + "epoch": 0.8566853482786229, + "grad_norm": 0.9087585210800171, + "learning_rate": 1.850000683941319e-05, + "loss": 0.0814, + "step": 4278 + }, + { + "epoch": 0.8566853482786229, + "grad_norm": 1.9450879096984863, + "learning_rate": 1.849263673917196e-05, + "loss": 0.1216, + "step": 4280 + }, + { + "epoch": 0.8574859887910328, + "grad_norm": 11.352913856506348, + "learning_rate": 1.8485250052627205e-05, + "loss": 0.454, + "step": 4282 + }, + { + "epoch": 0.8574859887910328, + "grad_norm": 0.735169529914856, + "learning_rate": 1.847784679420527e-05, + "loss": 0.1134, + "step": 4284 + }, + { + "epoch": 0.8582866293034428, + "grad_norm": 12.785299301147461, + "learning_rate": 1.8470426978364857e-05, + "loss": 0.38, + "step": 4286 + }, + { + "epoch": 0.8582866293034428, + "grad_norm": 3.7504265308380127, + "learning_rate": 1.846299061959706e-05, + "loss": 0.1826, + "step": 4288 + }, + { + "epoch": 0.8590872698158527, + "grad_norm": 2.6693971157073975, + "learning_rate": 1.845553773242522e-05, + "loss": 0.1848, + "step": 4290 + }, + { + "epoch": 0.8590872698158527, + "grad_norm": 3.759401798248291, + "learning_rate": 1.8448068331405018e-05, + "loss": 0.1756, + "step": 4292 + }, + { + "epoch": 0.8598879103282626, + "grad_norm": 11.394953727722168, + "learning_rate": 1.8440582431124322e-05, + "loss": 0.4151, + "step": 4294 + }, + { + "epoch": 0.8598879103282626, + "grad_norm": 0.3875369727611542, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.0478, + "step": 4296 + }, + { + "epoch": 0.8606885508406725, + "grad_norm": 7.473723888397217, + "learning_rate": 1.842556119129423e-05, + "loss": 0.8271, + "step": 4298 + }, + { + "epoch": 0.8606885508406725, + "grad_norm": 5.955150604248047, + "learning_rate": 1.841802588108161e-05, + "loss": 0.2163, + "step": 4300 + }, + { + "epoch": 0.8614891913530824, + "grad_norm": 6.920459270477295, + "learning_rate": 1.841047413028209e-05, + "loss": 0.1257, + "step": 4302 + }, + { + "epoch": 0.8614891913530824, + "grad_norm": 0.3053504526615143, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.0764, + "step": 4304 + }, + { + "epoch": 0.8622898318654924, + "grad_norm": 4.715519905090332, + "learning_rate": 1.8395321365949273e-05, + "loss": 0.2977, + "step": 4306 + }, + { + "epoch": 0.8622898318654924, + "grad_norm": 2.0216898918151855, + "learning_rate": 1.838772038200968e-05, + "loss": 0.1138, + "step": 4308 + }, + { + "epoch": 0.8630904723779024, + "grad_norm": 0.4010469913482666, + "learning_rate": 1.838010301667044e-05, + "loss": 0.0516, + "step": 4310 + }, + { + "epoch": 0.8630904723779024, + "grad_norm": 0.15106599032878876, + "learning_rate": 1.837246928480848e-05, + "loss": 0.0678, + "step": 4312 + }, + { + "epoch": 0.8638911128903123, + "grad_norm": 6.648336887359619, + "learning_rate": 1.8364819201332596e-05, + "loss": 0.6769, + "step": 4314 + }, + { + "epoch": 0.8638911128903123, + "grad_norm": 2.521239995956421, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.1328, + "step": 4316 + }, + { + "epoch": 0.8646917534027222, + "grad_norm": 16.9267635345459, + "learning_rate": 1.834947003933417e-05, + "loss": 0.7034, + "step": 4318 + }, + { + "epoch": 0.8646917534027222, + "grad_norm": 0.15159545838832855, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.0119, + "step": 4320 + }, + { + "epoch": 0.8654923939151321, + "grad_norm": 0.7200623154640198, + "learning_rate": 1.8334055650584107e-05, + "loss": 0.1393, + "step": 4322 + }, + { + "epoch": 0.8654923939151321, + "grad_norm": 0.16369254887104034, + "learning_rate": 1.8326324033788087e-05, + "loss": 0.0407, + "step": 4324 + }, + { + "epoch": 0.866293034427542, + "grad_norm": 12.05486011505127, + "learning_rate": 1.8318576155500855e-05, + "loss": 0.7322, + "step": 4326 + }, + { + "epoch": 0.866293034427542, + "grad_norm": 3.9058122634887695, + "learning_rate": 1.831081203085415e-05, + "loss": 0.2254, + "step": 4328 + }, + { + "epoch": 0.8670936749399519, + "grad_norm": 17.48163414001465, + "learning_rate": 1.830303167501152e-05, + "loss": 0.7283, + "step": 4330 + }, + { + "epoch": 0.8670936749399519, + "grad_norm": 2.5653655529022217, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.0606, + "step": 4332 + }, + { + "epoch": 0.8678943154523618, + "grad_norm": 10.272154808044434, + "learning_rate": 1.8287422330550885e-05, + "loss": 0.3834, + "step": 4334 + }, + { + "epoch": 0.8678943154523618, + "grad_norm": 0.7604150772094727, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.2263, + "step": 4336 + }, + { + "epoch": 0.8686949559647719, + "grad_norm": 1.405009150505066, + "learning_rate": 1.827174824406043e-05, + "loss": 0.1442, + "step": 4338 + }, + { + "epoch": 0.8686949559647719, + "grad_norm": 4.079202651977539, + "learning_rate": 1.8263886960799072e-05, + "loss": 0.0913, + "step": 4340 + }, + { + "epoch": 0.8694955964771818, + "grad_norm": 15.166433334350586, + "learning_rate": 1.8256009537987424e-05, + "loss": 1.1257, + "step": 4342 + }, + { + "epoch": 0.8694955964771818, + "grad_norm": 4.590144634246826, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.1886, + "step": 4344 + }, + { + "epoch": 0.8702962369895917, + "grad_norm": 3.939413547515869, + "learning_rate": 1.8240206335283943e-05, + "loss": 0.4438, + "step": 4346 + }, + { + "epoch": 0.8702962369895917, + "grad_norm": 0.2689581513404846, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.0195, + "step": 4348 + }, + { + "epoch": 0.8710968775020016, + "grad_norm": 0.3303256332874298, + "learning_rate": 1.8224338759405934e-05, + "loss": 0.4104, + "step": 4350 + }, + { + "epoch": 0.8710968775020016, + "grad_norm": 0.8884463906288147, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.0343, + "step": 4352 + }, + { + "epoch": 0.8718975180144115, + "grad_norm": 6.118797779083252, + "learning_rate": 1.820840693431217e-05, + "loss": 0.4334, + "step": 4354 + }, + { + "epoch": 0.8718975180144115, + "grad_norm": 2.843533754348755, + "learning_rate": 1.820041696718378e-05, + "loss": 0.1445, + "step": 4356 + }, + { + "epoch": 0.8726981585268214, + "grad_norm": 5.390256404876709, + "learning_rate": 1.8192410984463416e-05, + "loss": 0.5593, + "step": 4358 + }, + { + "epoch": 0.8726981585268214, + "grad_norm": 7.177663326263428, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.5136, + "step": 4360 + }, + { + "epoch": 0.8734987990392313, + "grad_norm": 2.272752523422241, + "learning_rate": 1.8176351034821352e-05, + "loss": 0.2394, + "step": 4362 + }, + { + "epoch": 0.8734987990392313, + "grad_norm": 1.052101969718933, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.123, + "step": 4364 + }, + { + "epoch": 0.8742994395516414, + "grad_norm": 9.083051681518555, + "learning_rate": 1.8160227210847642e-05, + "loss": 0.7044, + "step": 4366 + }, + { + "epoch": 0.8742994395516414, + "grad_norm": 3.21313214302063, + "learning_rate": 1.815214138532966e-05, + "loss": 0.2956, + "step": 4368 + }, + { + "epoch": 0.8751000800640513, + "grad_norm": 2.7512950897216797, + "learning_rate": 1.8144039638502927e-05, + "loss": 0.2384, + "step": 4370 + }, + { + "epoch": 0.8751000800640513, + "grad_norm": 1.744044542312622, + "learning_rate": 1.8135921986190358e-05, + "loss": 0.1852, + "step": 4372 + }, + { + "epoch": 0.8759007205764612, + "grad_norm": 8.501399040222168, + "learning_rate": 1.8127788444245884e-05, + "loss": 0.6362, + "step": 4374 + }, + { + "epoch": 0.8759007205764612, + "grad_norm": 1.4244064092636108, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.2159, + "step": 4376 + }, + { + "epoch": 0.8767013610888711, + "grad_norm": 5.590197563171387, + "learning_rate": 1.8111473755032152e-05, + "loss": 0.6374, + "step": 4378 + }, + { + "epoch": 0.8767013610888711, + "grad_norm": 6.809462547302246, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.3844, + "step": 4380 + }, + { + "epoch": 0.877502001601281, + "grad_norm": 7.559084415435791, + "learning_rate": 1.8095095698313456e-05, + "loss": 0.3156, + "step": 4382 + }, + { + "epoch": 0.877502001601281, + "grad_norm": 0.5014828443527222, + "learning_rate": 1.808688294710378e-05, + "loss": 0.1165, + "step": 4384 + }, + { + "epoch": 0.8783026421136909, + "grad_norm": 5.606857776641846, + "learning_rate": 1.807865440203653e-05, + "loss": 0.7284, + "step": 4386 + }, + { + "epoch": 0.8783026421136909, + "grad_norm": 3.6239657402038574, + "learning_rate": 1.807041007918221e-05, + "loss": 0.5365, + "step": 4388 + }, + { + "epoch": 0.8791032826261009, + "grad_norm": 1.944422960281372, + "learning_rate": 1.806214999464214e-05, + "loss": 0.1362, + "step": 4390 + }, + { + "epoch": 0.8791032826261009, + "grad_norm": 1.2786122560501099, + "learning_rate": 1.805387416454849e-05, + "loss": 0.2102, + "step": 4392 + }, + { + "epoch": 0.8799039231385108, + "grad_norm": 2.410829782485962, + "learning_rate": 1.8045582605064087e-05, + "loss": 0.3807, + "step": 4394 + }, + { + "epoch": 0.8799039231385108, + "grad_norm": 2.332818031311035, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.2471, + "step": 4396 + }, + { + "epoch": 0.8807045636509208, + "grad_norm": 3.1832051277160645, + "learning_rate": 1.802895236272819e-05, + "loss": 0.1542, + "step": 4398 + }, + { + "epoch": 0.8807045636509208, + "grad_norm": 1.2044411897659302, + "learning_rate": 1.802061371235592e-05, + "loss": 0.088, + "step": 4400 + }, + { + "epoch": 0.8815052041633307, + "grad_norm": 5.065806865692139, + "learning_rate": 1.80122593975513e-05, + "loss": 0.3235, + "step": 4402 + }, + { + "epoch": 0.8815052041633307, + "grad_norm": 6.310384273529053, + "learning_rate": 1.8003889434630476e-05, + "loss": 0.1903, + "step": 4404 + }, + { + "epoch": 0.8823058446757406, + "grad_norm": 3.7290170192718506, + "learning_rate": 1.7995503839940204e-05, + "loss": 0.3519, + "step": 4406 + }, + { + "epoch": 0.8823058446757406, + "grad_norm": 0.24426941573619843, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.1394, + "step": 4408 + }, + { + "epoch": 0.8831064851881505, + "grad_norm": 1.3304563760757446, + "learning_rate": 1.7978685820790725e-05, + "loss": 0.161, + "step": 4410 + }, + { + "epoch": 0.8831064851881505, + "grad_norm": 2.761929750442505, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.2465, + "step": 4412 + }, + { + "epoch": 0.8839071257005604, + "grad_norm": 4.869638919830322, + "learning_rate": 1.796180547148662e-05, + "loss": 0.3812, + "step": 4414 + }, + { + "epoch": 0.8839071257005604, + "grad_norm": 2.1477580070495605, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.1105, + "step": 4416 + }, + { + "epoch": 0.8847077662129704, + "grad_norm": 5.9751176834106445, + "learning_rate": 1.7944862923898586e-05, + "loss": 0.6366, + "step": 4418 + }, + { + "epoch": 0.8847077662129704, + "grad_norm": 4.221614837646484, + "learning_rate": 1.7936368367090583e-05, + "loss": 0.2167, + "step": 4420 + }, + { + "epoch": 0.8855084067253803, + "grad_norm": 1.4528051614761353, + "learning_rate": 1.7927858310383196e-05, + "loss": 0.2299, + "step": 4422 + }, + { + "epoch": 0.8855084067253803, + "grad_norm": 1.0621566772460938, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.1428, + "step": 4424 + }, + { + "epoch": 0.8863090472377902, + "grad_norm": 1.2208244800567627, + "learning_rate": 1.7910791763781928e-05, + "loss": 0.1841, + "step": 4426 + }, + { + "epoch": 0.8863090472377902, + "grad_norm": 3.941770553588867, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.13, + "step": 4428 + }, + { + "epoch": 0.8871096877502002, + "grad_norm": 1.9176123142242432, + "learning_rate": 1.789366341742001e-05, + "loss": 0.1498, + "step": 4430 + }, + { + "epoch": 0.8871096877502002, + "grad_norm": 0.3550972640514374, + "learning_rate": 1.7885076111125e-05, + "loss": 0.0414, + "step": 4432 + }, + { + "epoch": 0.8879103282626101, + "grad_norm": 0.18521283566951752, + "learning_rate": 1.7876473405105535e-05, + "loss": 0.2311, + "step": 4434 + }, + { + "epoch": 0.8879103282626101, + "grad_norm": 3.084169864654541, + "learning_rate": 1.7867855316162846e-05, + "loss": 0.2157, + "step": 4436 + }, + { + "epoch": 0.88871096877502, + "grad_norm": 3.4583911895751953, + "learning_rate": 1.785922186112829e-05, + "loss": 0.3644, + "step": 4438 + }, + { + "epoch": 0.88871096877502, + "grad_norm": 1.5633032321929932, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.0777, + "step": 4440 + }, + { + "epoch": 0.8895116092874299, + "grad_norm": 3.6711835861206055, + "learning_rate": 1.7841908920258774e-05, + "loss": 0.4033, + "step": 4442 + }, + { + "epoch": 0.8895116092874299, + "grad_norm": 1.9865062236785889, + "learning_rate": 1.783322946823638e-05, + "loss": 0.189, + "step": 4444 + }, + { + "epoch": 0.8903122497998399, + "grad_norm": 3.987778902053833, + "learning_rate": 1.782453471774711e-05, + "loss": 0.5501, + "step": 4446 + }, + { + "epoch": 0.8903122497998399, + "grad_norm": 1.0028119087219238, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.1466, + "step": 4448 + }, + { + "epoch": 0.8911128903122498, + "grad_norm": 2.177347421646118, + "learning_rate": 1.7807099389322013e-05, + "loss": 0.0998, + "step": 4450 + }, + { + "epoch": 0.8911128903122498, + "grad_norm": 10.552443504333496, + "learning_rate": 1.779835884543776e-05, + "loss": 0.1588, + "step": 4452 + }, + { + "epoch": 0.8919135308246597, + "grad_norm": 10.352629661560059, + "learning_rate": 1.7789603071189733e-05, + "loss": 0.2416, + "step": 4454 + }, + { + "epoch": 0.8919135308246597, + "grad_norm": 1.3962016105651855, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.1114, + "step": 4456 + }, + { + "epoch": 0.8927141713370697, + "grad_norm": 12.268455505371094, + "learning_rate": 1.7772045900032912e-05, + "loss": 0.5102, + "step": 4458 + }, + { + "epoch": 0.8927141713370697, + "grad_norm": 2.938213348388672, + "learning_rate": 1.776324453741365e-05, + "loss": 0.1984, + "step": 4460 + }, + { + "epoch": 0.8935148118494796, + "grad_norm": 10.112679481506348, + "learning_rate": 1.7754428013009644e-05, + "loss": 0.4544, + "step": 4462 + }, + { + "epoch": 0.8935148118494796, + "grad_norm": 4.299936771392822, + "learning_rate": 1.774559634403971e-05, + "loss": 0.1851, + "step": 4464 + }, + { + "epoch": 0.8943154523618895, + "grad_norm": 10.568756103515625, + "learning_rate": 1.7736749547752327e-05, + "loss": 0.6589, + "step": 4466 + }, + { + "epoch": 0.8943154523618895, + "grad_norm": 0.0730956569314003, + "learning_rate": 1.7727887641425465e-05, + "loss": 0.0344, + "step": 4468 + }, + { + "epoch": 0.8951160928742994, + "grad_norm": 10.706608772277832, + "learning_rate": 1.7719010642366597e-05, + "loss": 1.046, + "step": 4470 + }, + { + "epoch": 0.8951160928742994, + "grad_norm": 3.822122812271118, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.2084, + "step": 4472 + }, + { + "epoch": 0.8959167333867094, + "grad_norm": 4.0429368019104, + "learning_rate": 1.770121143543025e-05, + "loss": 0.6684, + "step": 4474 + }, + { + "epoch": 0.8959167333867094, + "grad_norm": 0.15353582799434662, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.2292, + "step": 4476 + }, + { + "epoch": 0.8967173738991193, + "grad_norm": 4.702209949493408, + "learning_rate": 1.7683352065992174e-05, + "loss": 0.2667, + "step": 4478 + }, + { + "epoch": 0.8967173738991193, + "grad_norm": 2.6245276927948, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.1792, + "step": 4480 + }, + { + "epoch": 0.8975180144115292, + "grad_norm": 2.475062847137451, + "learning_rate": 1.7665432673571238e-05, + "loss": 0.3248, + "step": 4482 + }, + { + "epoch": 0.8975180144115292, + "grad_norm": 1.158987045288086, + "learning_rate": 1.765645051247007e-05, + "loss": 0.1244, + "step": 4484 + }, + { + "epoch": 0.8983186549239391, + "grad_norm": 4.340035915374756, + "learning_rate": 1.7647453398155204e-05, + "loss": 0.3524, + "step": 4486 + }, + { + "epoch": 0.8983186549239391, + "grad_norm": 4.592500686645508, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.2398, + "step": 4488 + }, + { + "epoch": 0.899119295436349, + "grad_norm": 5.115654945373535, + "learning_rate": 1.7629414380199672e-05, + "loss": 0.3664, + "step": 4490 + }, + { + "epoch": 0.899119295436349, + "grad_norm": 1.94034743309021, + "learning_rate": 1.762037251178961e-05, + "loss": 0.1593, + "step": 4492 + }, + { + "epoch": 0.899919935948759, + "grad_norm": 1.2473502159118652, + "learning_rate": 1.7611315760626943e-05, + "loss": 0.1302, + "step": 4494 + }, + { + "epoch": 0.899919935948759, + "grad_norm": 3.430352210998535, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.1419, + "step": 4496 + }, + { + "epoch": 0.900720576461169, + "grad_norm": 3.510406494140625, + "learning_rate": 1.7593157680824943e-05, + "loss": 0.5754, + "step": 4498 + }, + { + "epoch": 0.900720576461169, + "grad_norm": 0.5526865124702454, + "learning_rate": 1.7584056387648738e-05, + "loss": 0.0388, + "step": 4500 + }, + { + "epoch": 0.9015212169735789, + "grad_norm": 1.2678951025009155, + "learning_rate": 1.757494028264608e-05, + "loss": 0.2556, + "step": 4502 + }, + { + "epoch": 0.9015212169735789, + "grad_norm": 1.5873442888259888, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.1442, + "step": 4504 + }, + { + "epoch": 0.9023218574859888, + "grad_norm": 7.450283050537109, + "learning_rate": 1.7556663708406203e-05, + "loss": 0.5697, + "step": 4506 + }, + { + "epoch": 0.9023218574859888, + "grad_norm": 0.3743077218532562, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.0461, + "step": 4508 + }, + { + "epoch": 0.9031224979983987, + "grad_norm": 1.4675856828689575, + "learning_rate": 1.7538328100883404e-05, + "loss": 0.2862, + "step": 4510 + }, + { + "epoch": 0.9031224979983987, + "grad_norm": 0.7564343214035034, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.0402, + "step": 4512 + }, + { + "epoch": 0.9039231385108086, + "grad_norm": 1.6252456903457642, + "learning_rate": 1.7519933603316962e-05, + "loss": 0.2472, + "step": 4514 + }, + { + "epoch": 0.9039231385108086, + "grad_norm": 2.030759572982788, + "learning_rate": 1.7510714315655467e-05, + "loss": 0.2309, + "step": 4516 + }, + { + "epoch": 0.9047237790232185, + "grad_norm": 7.029176235198975, + "learning_rate": 1.750148035940622e-05, + "loss": 0.6137, + "step": 4518 + }, + { + "epoch": 0.9047237790232185, + "grad_norm": 3.088568687438965, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.2508, + "step": 4520 + }, + { + "epoch": 0.9055244195356285, + "grad_norm": 4.27255392074585, + "learning_rate": 1.7482968513309458e-05, + "loss": 0.7033, + "step": 4522 + }, + { + "epoch": 0.9055244195356285, + "grad_norm": 1.064418077468872, + "learning_rate": 1.7473690659616e-05, + "loss": 0.1519, + "step": 4524 + }, + { + "epoch": 0.9063250600480385, + "grad_norm": 2.100539207458496, + "learning_rate": 1.7464398209642744e-05, + "loss": 0.3077, + "step": 4526 + }, + { + "epoch": 0.9063250600480385, + "grad_norm": 6.6347455978393555, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.2563, + "step": 4528 + }, + { + "epoch": 0.9071257005604484, + "grad_norm": 0.8639596104621887, + "learning_rate": 1.7445769593478842e-05, + "loss": 0.4072, + "step": 4530 + }, + { + "epoch": 0.9071257005604484, + "grad_norm": 1.288110613822937, + "learning_rate": 1.743643346367027e-05, + "loss": 0.1275, + "step": 4532 + }, + { + "epoch": 0.9079263410728583, + "grad_norm": 11.604455947875977, + "learning_rate": 1.7427082810346024e-05, + "loss": 0.2306, + "step": 4534 + }, + { + "epoch": 0.9079263410728583, + "grad_norm": 3.4507527351379395, + "learning_rate": 1.741771765176815e-05, + "loss": 0.1666, + "step": 4536 + }, + { + "epoch": 0.9087269815852682, + "grad_norm": 2.0574159622192383, + "learning_rate": 1.740833800622701e-05, + "loss": 0.3304, + "step": 4538 + }, + { + "epoch": 0.9087269815852682, + "grad_norm": 2.1205101013183594, + "learning_rate": 1.739894389204122e-05, + "loss": 0.0864, + "step": 4540 + }, + { + "epoch": 0.9095276220976781, + "grad_norm": 3.3198814392089844, + "learning_rate": 1.738953532755774e-05, + "loss": 0.2561, + "step": 4542 + }, + { + "epoch": 0.9095276220976781, + "grad_norm": 3.7939276695251465, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.2153, + "step": 4544 + }, + { + "epoch": 0.910328262610088, + "grad_norm": 5.092320919036865, + "learning_rate": 1.7370674921226306e-05, + "loss": 0.2615, + "step": 4546 + }, + { + "epoch": 0.910328262610088, + "grad_norm": 4.087704658508301, + "learning_rate": 1.7361223116213146e-05, + "loss": 0.3388, + "step": 4548 + }, + { + "epoch": 0.911128903122498, + "grad_norm": 1.2574706077575684, + "learning_rate": 1.7351756934571764e-05, + "loss": 0.2042, + "step": 4550 + }, + { + "epoch": 0.911128903122498, + "grad_norm": 1.8466956615447998, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.2227, + "step": 4552 + }, + { + "epoch": 0.911929543634908, + "grad_norm": 7.945632457733154, + "learning_rate": 1.7332781515382996e-05, + "loss": 0.6855, + "step": 4554 + }, + { + "epoch": 0.911929543634908, + "grad_norm": 11.640820503234863, + "learning_rate": 1.732327231489503e-05, + "loss": 0.4181, + "step": 4556 + }, + { + "epoch": 0.9127301841473179, + "grad_norm": 2.405789613723755, + "learning_rate": 1.7313748811897564e-05, + "loss": 0.1434, + "step": 4558 + }, + { + "epoch": 0.9127301841473179, + "grad_norm": 1.8656169176101685, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.0609, + "step": 4560 + }, + { + "epoch": 0.9135308246597278, + "grad_norm": 1.0396745204925537, + "learning_rate": 1.7294658972800495e-05, + "loss": 0.0684, + "step": 4562 + }, + { + "epoch": 0.9135308246597278, + "grad_norm": 0.7443292140960693, + "learning_rate": 1.728509267398376e-05, + "loss": 0.0645, + "step": 4564 + }, + { + "epoch": 0.9143314651721377, + "grad_norm": 8.072303771972656, + "learning_rate": 1.727551214722322e-05, + "loss": 0.9357, + "step": 4566 + }, + { + "epoch": 0.9143314651721377, + "grad_norm": 4.648294925689697, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.454, + "step": 4568 + }, + { + "epoch": 0.9151321056845476, + "grad_norm": 4.65705680847168, + "learning_rate": 1.72563084847423e-05, + "loss": 0.38, + "step": 4570 + }, + { + "epoch": 0.9151321056845476, + "grad_norm": 4.9178972244262695, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.0985, + "step": 4572 + }, + { + "epoch": 0.9159327461969575, + "grad_norm": 5.160616874694824, + "learning_rate": 1.723704813537835e-05, + "loss": 0.716, + "step": 4574 + }, + { + "epoch": 0.9159327461969575, + "grad_norm": 0.7092592120170593, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.2101, + "step": 4576 + }, + { + "epoch": 0.9167333867093675, + "grad_norm": 2.8270206451416016, + "learning_rate": 1.7217731249594817e-05, + "loss": 0.4839, + "step": 4578 + }, + { + "epoch": 0.9167333867093675, + "grad_norm": 2.6069400310516357, + "learning_rate": 1.7208051652686348e-05, + "loss": 0.1666, + "step": 4580 + }, + { + "epoch": 0.9175340272217775, + "grad_norm": 4.075070858001709, + "learning_rate": 1.7198357978296827e-05, + "loss": 0.3838, + "step": 4582 + }, + { + "epoch": 0.9175340272217775, + "grad_norm": 3.559901714324951, + "learning_rate": 1.718865024535822e-05, + "loss": 0.1201, + "step": 4584 + }, + { + "epoch": 0.9183346677341874, + "grad_norm": 2.241713762283325, + "learning_rate": 1.717892847282995e-05, + "loss": 0.4384, + "step": 4586 + }, + { + "epoch": 0.9183346677341874, + "grad_norm": 2.623020887374878, + "learning_rate": 1.716919267969884e-05, + "loss": 0.1765, + "step": 4588 + }, + { + "epoch": 0.9191353082465973, + "grad_norm": 1.2983473539352417, + "learning_rate": 1.715944288497911e-05, + "loss": 0.2727, + "step": 4590 + }, + { + "epoch": 0.9191353082465973, + "grad_norm": 0.7214236855506897, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.1408, + "step": 4592 + }, + { + "epoch": 0.9199359487590072, + "grad_norm": 5.305279731750488, + "learning_rate": 1.713990136696734e-05, + "loss": 0.6255, + "step": 4594 + }, + { + "epoch": 0.9199359487590072, + "grad_norm": 4.059255599975586, + "learning_rate": 1.7130109681840298e-05, + "loss": 0.2933, + "step": 4596 + }, + { + "epoch": 0.9207365892714171, + "grad_norm": 7.456916809082031, + "learning_rate": 1.7120304071454578e-05, + "loss": 0.399, + "step": 4598 + }, + { + "epoch": 0.9207365892714171, + "grad_norm": 1.7160990238189697, + "learning_rate": 1.711048455496075e-05, + "loss": 0.103, + "step": 4600 + }, + { + "epoch": 0.921537229783827, + "grad_norm": 6.855238437652588, + "learning_rate": 1.7100651151536532e-05, + "loss": 0.5929, + "step": 4602 + }, + { + "epoch": 0.921537229783827, + "grad_norm": 4.361216068267822, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.3211, + "step": 4604 + }, + { + "epoch": 0.922337870296237, + "grad_norm": 2.7488791942596436, + "learning_rate": 1.708094276074344e-05, + "loss": 0.1849, + "step": 4606 + }, + { + "epoch": 0.922337870296237, + "grad_norm": 1.7108547687530518, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.2376, + "step": 4608 + }, + { + "epoch": 0.923138510808647, + "grad_norm": 5.2297186851501465, + "learning_rate": 1.7061179053038894e-05, + "loss": 0.5598, + "step": 4610 + }, + { + "epoch": 0.923138510808647, + "grad_norm": 1.9463510513305664, + "learning_rate": 1.705127650357663e-05, + "loss": 0.1216, + "step": 4612 + }, + { + "epoch": 0.9239391513210569, + "grad_norm": 3.6582741737365723, + "learning_rate": 1.704136018281859e-05, + "loss": 0.4258, + "step": 4614 + }, + { + "epoch": 0.9239391513210569, + "grad_norm": 2.2200405597686768, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.1589, + "step": 4616 + }, + { + "epoch": 0.9247397918334668, + "grad_norm": 3.939946174621582, + "learning_rate": 1.7021486304909202e-05, + "loss": 0.1722, + "step": 4618 + }, + { + "epoch": 0.9247397918334668, + "grad_norm": 2.5155560970306396, + "learning_rate": 1.701152878657197e-05, + "loss": 0.2053, + "step": 4620 + }, + { + "epoch": 0.9255404323458767, + "grad_norm": 3.4614078998565674, + "learning_rate": 1.700155757456711e-05, + "loss": 0.4806, + "step": 4622 + }, + { + "epoch": 0.9255404323458767, + "grad_norm": 0.43212881684303284, + "learning_rate": 1.699157268836863e-05, + "loss": 0.0994, + "step": 4624 + }, + { + "epoch": 0.9263410728582866, + "grad_norm": 4.3865065574646, + "learning_rate": 1.6981574147477214e-05, + "loss": 0.366, + "step": 4626 + }, + { + "epoch": 0.9263410728582866, + "grad_norm": 3.265779972076416, + "learning_rate": 1.697156197142023e-05, + "loss": 0.2326, + "step": 4628 + }, + { + "epoch": 0.9271417133706965, + "grad_norm": 4.096377849578857, + "learning_rate": 1.696153617975168e-05, + "loss": 0.1845, + "step": 4630 + }, + { + "epoch": 0.9271417133706965, + "grad_norm": 1.1044899225234985, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.058, + "step": 4632 + }, + { + "epoch": 0.9279423538831065, + "grad_norm": 3.874562978744507, + "learning_rate": 1.694144382792878e-05, + "loss": 0.2056, + "step": 4634 + }, + { + "epoch": 0.9279423538831065, + "grad_norm": 6.0627641677856445, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.1276, + "step": 4636 + }, + { + "epoch": 0.9287429943955164, + "grad_norm": 0.5523784160614014, + "learning_rate": 1.6921297248971652e-05, + "loss": 0.4398, + "step": 4638 + }, + { + "epoch": 0.9287429943955164, + "grad_norm": 3.3879754543304443, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.2609, + "step": 4640 + }, + { + "epoch": 0.9295436349079264, + "grad_norm": 7.5655741691589355, + "learning_rate": 1.690109660026701e-05, + "loss": 0.1514, + "step": 4642 + }, + { + "epoch": 0.9295436349079264, + "grad_norm": 3.76267409324646, + "learning_rate": 1.6890976049058267e-05, + "loss": 0.167, + "step": 4644 + }, + { + "epoch": 0.9303442754203363, + "grad_norm": 0.516728937625885, + "learning_rate": 1.688084203962401e-05, + "loss": 0.1966, + "step": 4646 + }, + { + "epoch": 0.9303442754203363, + "grad_norm": 3.6739399433135986, + "learning_rate": 1.687069459175619e-05, + "loss": 0.1202, + "step": 4648 + }, + { + "epoch": 0.9311449159327462, + "grad_norm": 5.179727554321289, + "learning_rate": 1.6860533725272953e-05, + "loss": 0.8038, + "step": 4650 + }, + { + "epoch": 0.9311449159327462, + "grad_norm": 2.8255844116210938, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.2617, + "step": 4652 + }, + { + "epoch": 0.9319455564451561, + "grad_norm": 6.140084743499756, + "learning_rate": 1.6840171815864085e-05, + "loss": 0.3173, + "step": 4654 + }, + { + "epoch": 0.9319455564451561, + "grad_norm": 1.772773265838623, + "learning_rate": 1.682997081270568e-05, + "loss": 0.1899, + "step": 4656 + }, + { + "epoch": 0.932746196957566, + "grad_norm": 3.7942843437194824, + "learning_rate": 1.681975647046631e-05, + "loss": 0.3973, + "step": 4658 + }, + { + "epoch": 0.932746196957566, + "grad_norm": 2.3360884189605713, + "learning_rate": 1.6809528809094805e-05, + "loss": 0.2638, + "step": 4660 + }, + { + "epoch": 0.933546837469976, + "grad_norm": 3.009964942932129, + "learning_rate": 1.6799287848566024e-05, + "loss": 0.4603, + "step": 4662 + }, + { + "epoch": 0.933546837469976, + "grad_norm": 2.0106537342071533, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.1201, + "step": 4664 + }, + { + "epoch": 0.9343474779823859, + "grad_norm": 3.8360259532928467, + "learning_rate": 1.6778766110065765e-05, + "loss": 0.4224, + "step": 4666 + }, + { + "epoch": 0.9343474779823859, + "grad_norm": 7.135696887969971, + "learning_rate": 1.67684853721737e-05, + "loss": 0.133, + "step": 4668 + }, + { + "epoch": 0.9351481184947958, + "grad_norm": 3.0786213874816895, + "learning_rate": 1.6758191415283066e-05, + "loss": 0.248, + "step": 4670 + }, + { + "epoch": 0.9351481184947958, + "grad_norm": 1.00486421585083, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.124, + "step": 4672 + }, + { + "epoch": 0.9359487590072058, + "grad_norm": 3.8432722091674805, + "learning_rate": 1.673756392494916e-05, + "loss": 0.4832, + "step": 4674 + }, + { + "epoch": 0.9359487590072058, + "grad_norm": 6.025712013244629, + "learning_rate": 1.6727230431791826e-05, + "loss": 0.1446, + "step": 4676 + }, + { + "epoch": 0.9367493995196157, + "grad_norm": 6.561376571655273, + "learning_rate": 1.671688380020769e-05, + "loss": 0.4294, + "step": 4678 + }, + { + "epoch": 0.9367493995196157, + "grad_norm": 2.1361560821533203, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.1445, + "step": 4680 + }, + { + "epoch": 0.9375500400320256, + "grad_norm": 2.1833434104919434, + "learning_rate": 1.6696151202613537e-05, + "loss": 0.456, + "step": 4682 + }, + { + "epoch": 0.9375500400320256, + "grad_norm": 0.835757851600647, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.2081, + "step": 4684 + }, + { + "epoch": 0.9383506805444356, + "grad_norm": 2.665493965148926, + "learning_rate": 1.6675366294131432e-05, + "loss": 0.17, + "step": 4686 + }, + { + "epoch": 0.9383506805444356, + "grad_norm": 0.19338200986385345, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.0618, + "step": 4688 + }, + { + "epoch": 0.9391513210568455, + "grad_norm": 3.116933584213257, + "learning_rate": 1.6654529237134833e-05, + "loss": 0.1809, + "step": 4690 + }, + { + "epoch": 0.9391513210568455, + "grad_norm": 0.945227861404419, + "learning_rate": 1.66440912037967e-05, + "loss": 0.087, + "step": 4692 + }, + { + "epoch": 0.9399519615692554, + "grad_norm": 6.845286846160889, + "learning_rate": 1.663364019440453e-05, + "loss": 0.6875, + "step": 4694 + }, + { + "epoch": 0.9399519615692554, + "grad_norm": 0.663843035697937, + "learning_rate": 1.662317622936933e-05, + "loss": 0.1139, + "step": 4696 + }, + { + "epoch": 0.9407526020816653, + "grad_norm": 6.5559515953063965, + "learning_rate": 1.6612699329127467e-05, + "loss": 0.3562, + "step": 4698 + }, + { + "epoch": 0.9407526020816653, + "grad_norm": 4.440352916717529, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.4444, + "step": 4700 + }, + { + "epoch": 0.9415532425940752, + "grad_norm": 6.407455921173096, + "learning_rate": 1.6591706804895415e-05, + "loss": 0.3479, + "step": 4702 + }, + { + "epoch": 0.9415532425940752, + "grad_norm": 0.3754770755767822, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.1806, + "step": 4704 + }, + { + "epoch": 0.9423538831064852, + "grad_norm": 3.4751529693603516, + "learning_rate": 1.6570662785703716e-05, + "loss": 0.3811, + "step": 4706 + }, + { + "epoch": 0.9423538831064852, + "grad_norm": 0.576911985874176, + "learning_rate": 1.6560121516856592e-05, + "loss": 0.0948, + "step": 4708 + }, + { + "epoch": 0.9431545236188951, + "grad_norm": 10.397960662841797, + "learning_rate": 1.654956743595001e-05, + "loss": 0.4343, + "step": 4710 + }, + { + "epoch": 0.9431545236188951, + "grad_norm": 2.0833613872528076, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.3504, + "step": 4712 + }, + { + "epoch": 0.9439551641313051, + "grad_norm": 5.1058735847473145, + "learning_rate": 1.6528420920432893e-05, + "loss": 0.1954, + "step": 4714 + }, + { + "epoch": 0.9439551641313051, + "grad_norm": 1.614511489868164, + "learning_rate": 1.651782852712194e-05, + "loss": 0.2023, + "step": 4716 + }, + { + "epoch": 0.944755804643715, + "grad_norm": 9.391684532165527, + "learning_rate": 1.6507223404350686e-05, + "loss": 0.26, + "step": 4718 + }, + { + "epoch": 0.944755804643715, + "grad_norm": 0.9295578002929688, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.0922, + "step": 4720 + }, + { + "epoch": 0.9455564451561249, + "grad_norm": 5.3317742347717285, + "learning_rate": 1.648597505330016e-05, + "loss": 0.3746, + "step": 4722 + }, + { + "epoch": 0.9455564451561249, + "grad_norm": 1.5213603973388672, + "learning_rate": 1.6475331866519387e-05, + "loss": 0.0869, + "step": 4724 + }, + { + "epoch": 0.9463570856685348, + "grad_norm": 6.223171710968018, + "learning_rate": 1.6464676033275187e-05, + "loss": 0.1703, + "step": 4726 + }, + { + "epoch": 0.9463570856685348, + "grad_norm": 1.0867812633514404, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.1502, + "step": 4728 + }, + { + "epoch": 0.9471577261809447, + "grad_norm": 5.643699645996094, + "learning_rate": 1.644332651066548e-05, + "loss": 1.0651, + "step": 4730 + }, + { + "epoch": 0.9471577261809447, + "grad_norm": 9.412582397460938, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.613, + "step": 4732 + }, + { + "epoch": 0.9479583666933546, + "grad_norm": 7.2638773918151855, + "learning_rate": 1.6421926652255275e-05, + "loss": 0.4951, + "step": 4734 + }, + { + "epoch": 0.9479583666933546, + "grad_norm": 0.8472667932510376, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.0507, + "step": 4736 + }, + { + "epoch": 0.9487590072057646, + "grad_norm": 6.039588451385498, + "learning_rate": 1.6400476625222057e-05, + "loss": 0.5594, + "step": 4738 + }, + { + "epoch": 0.9487590072057646, + "grad_norm": 4.380916118621826, + "learning_rate": 1.6389732850821964e-05, + "loss": 0.3329, + "step": 4740 + }, + { + "epoch": 0.9495596477181746, + "grad_norm": 1.4710291624069214, + "learning_rate": 1.6378976597135193e-05, + "loss": 0.1611, + "step": 4742 + }, + { + "epoch": 0.9495596477181746, + "grad_norm": 1.8565295934677124, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.1224, + "step": 4744 + }, + { + "epoch": 0.9503602882305845, + "grad_norm": 5.433703422546387, + "learning_rate": 1.635742673595468e-05, + "loss": 0.6163, + "step": 4746 + }, + { + "epoch": 0.9503602882305845, + "grad_norm": 2.4298794269561768, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.2157, + "step": 4748 + }, + { + "epoch": 0.9511609287429944, + "grad_norm": 1.5381369590759277, + "learning_rate": 1.6335827210029823e-05, + "loss": 0.0889, + "step": 4750 + }, + { + "epoch": 0.9511609287429944, + "grad_norm": 1.8349841833114624, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.0657, + "step": 4752 + }, + { + "epoch": 0.9519615692554043, + "grad_norm": 3.6086432933807373, + "learning_rate": 1.6314178188097917e-05, + "loss": 0.2311, + "step": 4754 + }, + { + "epoch": 0.9519615692554043, + "grad_norm": 1.8219592571258545, + "learning_rate": 1.6303335168965495e-05, + "loss": 0.0747, + "step": 4756 + }, + { + "epoch": 0.9527622097678142, + "grad_norm": 7.754385471343994, + "learning_rate": 1.6292479839282904e-05, + "loss": 0.3329, + "step": 4758 + }, + { + "epoch": 0.9527622097678142, + "grad_norm": 0.22768713533878326, + "learning_rate": 1.628161222025089e-05, + "loss": 0.1501, + "step": 4760 + }, + { + "epoch": 0.9535628502802241, + "grad_norm": 4.880794048309326, + "learning_rate": 1.627073233309409e-05, + "loss": 0.2932, + "step": 4762 + }, + { + "epoch": 0.9535628502802241, + "grad_norm": 4.875804901123047, + "learning_rate": 1.625984019906122e-05, + "loss": 0.2166, + "step": 4764 + }, + { + "epoch": 0.9543634907926342, + "grad_norm": 5.037921905517578, + "learning_rate": 1.624893583942482e-05, + "loss": 0.3184, + "step": 4766 + }, + { + "epoch": 0.9543634907926342, + "grad_norm": 1.5768930912017822, + "learning_rate": 1.623801927548132e-05, + "loss": 0.1344, + "step": 4768 + }, + { + "epoch": 0.9551641313050441, + "grad_norm": 9.83228874206543, + "learning_rate": 1.6227090528551058e-05, + "loss": 0.7947, + "step": 4770 + }, + { + "epoch": 0.9551641313050441, + "grad_norm": 2.605186700820923, + "learning_rate": 1.6216149619978057e-05, + "loss": 0.1227, + "step": 4772 + }, + { + "epoch": 0.955964771817454, + "grad_norm": 7.0235137939453125, + "learning_rate": 1.6205196571130204e-05, + "loss": 0.2603, + "step": 4774 + }, + { + "epoch": 0.955964771817454, + "grad_norm": 3.386622905731201, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.0829, + "step": 4776 + }, + { + "epoch": 0.9567654123298639, + "grad_norm": 3.9102277755737305, + "learning_rate": 1.618325413819967e-05, + "loss": 0.2221, + "step": 4778 + }, + { + "epoch": 0.9567654123298639, + "grad_norm": 2.3071391582489014, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.0818, + "step": 4780 + }, + { + "epoch": 0.9575660528422738, + "grad_norm": 13.980855941772461, + "learning_rate": 1.6161263401175555e-05, + "loss": 0.49, + "step": 4782 + }, + { + "epoch": 0.9575660528422738, + "grad_norm": 4.941564083099365, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.4838, + "step": 4784 + }, + { + "epoch": 0.9583666933546837, + "grad_norm": 3.666306257247925, + "learning_rate": 1.613922453185133e-05, + "loss": 0.7617, + "step": 4786 + }, + { + "epoch": 0.9583666933546837, + "grad_norm": 1.7290914058685303, + "learning_rate": 1.612818710136499e-05, + "loss": 0.112, + "step": 4788 + }, + { + "epoch": 0.9591673338670936, + "grad_norm": 5.280057907104492, + "learning_rate": 1.6117137702396454e-05, + "loss": 0.64, + "step": 4790 + }, + { + "epoch": 0.9591673338670936, + "grad_norm": 1.2270598411560059, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.225, + "step": 4792 + }, + { + "epoch": 0.9599679743795037, + "grad_norm": 3.735119104385376, + "learning_rate": 1.6095003085355103e-05, + "loss": 0.4691, + "step": 4794 + }, + { + "epoch": 0.9599679743795037, + "grad_norm": 1.3581197261810303, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.074, + "step": 4796 + }, + { + "epoch": 0.9607686148919136, + "grad_norm": 7.077986717224121, + "learning_rate": 1.6072820853644688e-05, + "loss": 0.3638, + "step": 4798 + }, + { + "epoch": 0.9607686148919136, + "grad_norm": 1.6393439769744873, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.1585, + "step": 4800 + }, + { + "epoch": 0.9615692554043235, + "grad_norm": 1.6192734241485596, + "learning_rate": 1.6050591180554658e-05, + "loss": 0.1576, + "step": 4802 + }, + { + "epoch": 0.9615692554043235, + "grad_norm": 1.9862310886383057, + "learning_rate": 1.60394586077466e-05, + "loss": 0.1899, + "step": 4804 + }, + { + "epoch": 0.9623698959167334, + "grad_norm": 1.4536337852478027, + "learning_rate": 1.6028314239745068e-05, + "loss": 0.1866, + "step": 4806 + }, + { + "epoch": 0.9623698959167334, + "grad_norm": 1.2065790891647339, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.1022, + "step": 4808 + }, + { + "epoch": 0.9631705364291433, + "grad_norm": 0.7055391669273376, + "learning_rate": 1.6005990205245226e-05, + "loss": 0.2447, + "step": 4810 + }, + { + "epoch": 0.9631705364291433, + "grad_norm": 2.4689550399780273, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.2166, + "step": 4812 + }, + { + "epoch": 0.9639711769415532, + "grad_norm": 8.619380950927734, + "learning_rate": 1.5983619251452334e-05, + "loss": 0.6178, + "step": 4814 + }, + { + "epoch": 0.9639711769415532, + "grad_norm": 3.7860257625579834, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.1955, + "step": 4816 + }, + { + "epoch": 0.9647718174539631, + "grad_norm": 1.8001779317855835, + "learning_rate": 1.596120155313017e-05, + "loss": 0.7861, + "step": 4818 + }, + { + "epoch": 0.9647718174539631, + "grad_norm": 0.4968314468860626, + "learning_rate": 1.594997522948413e-05, + "loss": 0.0166, + "step": 4820 + }, + { + "epoch": 0.9655724579663731, + "grad_norm": 5.843556880950928, + "learning_rate": 1.593873728540759e-05, + "loss": 0.293, + "step": 4822 + }, + { + "epoch": 0.9655724579663731, + "grad_norm": 2.7355475425720215, + "learning_rate": 1.592748774284844e-05, + "loss": 0.1333, + "step": 4824 + }, + { + "epoch": 0.966373098478783, + "grad_norm": 3.0672571659088135, + "learning_rate": 1.5916226623777346e-05, + "loss": 0.3329, + "step": 4826 + }, + { + "epoch": 0.966373098478783, + "grad_norm": 3.2818126678466797, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.1759, + "step": 4828 + }, + { + "epoch": 0.967173738991193, + "grad_norm": 2.6799046993255615, + "learning_rate": 1.5893669744094587e-05, + "loss": 0.3416, + "step": 4830 + }, + { + "epoch": 0.967173738991193, + "grad_norm": 1.3844380378723145, + "learning_rate": 1.588237402753703e-05, + "loss": 0.1647, + "step": 4832 + }, + { + "epoch": 0.9679743795036029, + "grad_norm": 5.174304485321045, + "learning_rate": 1.5871066822575526e-05, + "loss": 0.7819, + "step": 4834 + }, + { + "epoch": 0.9679743795036029, + "grad_norm": 0.8069095015525818, + "learning_rate": 1.5859748151293354e-05, + "loss": 0.0709, + "step": 4836 + }, + { + "epoch": 0.9687750200160128, + "grad_norm": 3.2471611499786377, + "learning_rate": 1.5848418035796064e-05, + "loss": 0.2321, + "step": 4838 + }, + { + "epoch": 0.9687750200160128, + "grad_norm": 2.423759937286377, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.1121, + "step": 4840 + }, + { + "epoch": 0.9695756605284227, + "grad_norm": 5.163968563079834, + "learning_rate": 1.5825723560690396e-05, + "loss": 0.38, + "step": 4842 + }, + { + "epoch": 0.9695756605284227, + "grad_norm": 0.4930083453655243, + "learning_rate": 1.581435924540482e-05, + "loss": 0.1255, + "step": 4844 + }, + { + "epoch": 0.9703763010408326, + "grad_norm": 6.762197971343994, + "learning_rate": 1.580298357454967e-05, + "loss": 0.2928, + "step": 4846 + }, + { + "epoch": 0.9703763010408326, + "grad_norm": 0.6415007710456848, + "learning_rate": 1.579159657034185e-05, + "loss": 0.0841, + "step": 4848 + }, + { + "epoch": 0.9711769415532426, + "grad_norm": 1.8433983325958252, + "learning_rate": 1.5780198255020485e-05, + "loss": 0.1499, + "step": 4850 + }, + { + "epoch": 0.9711769415532426, + "grad_norm": 3.0712270736694336, + "learning_rate": 1.5768788650846674e-05, + "loss": 0.1422, + "step": 4852 + }, + { + "epoch": 0.9719775820656525, + "grad_norm": 1.3497483730316162, + "learning_rate": 1.5757367780103672e-05, + "loss": 0.1542, + "step": 4854 + }, + { + "epoch": 0.9719775820656525, + "grad_norm": 2.3422505855560303, + "learning_rate": 1.574593566509664e-05, + "loss": 0.1305, + "step": 4856 + }, + { + "epoch": 0.9727782225780625, + "grad_norm": 2.2825849056243896, + "learning_rate": 1.5734492328152796e-05, + "loss": 0.1624, + "step": 4858 + }, + { + "epoch": 0.9727782225780625, + "grad_norm": 1.3143624067306519, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.1084, + "step": 4860 + }, + { + "epoch": 0.9735788630904724, + "grad_norm": 0.9252250790596008, + "learning_rate": 1.5711572077872784e-05, + "loss": 0.2048, + "step": 4862 + }, + { + "epoch": 0.9735788630904724, + "grad_norm": 1.363356113433838, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.1148, + "step": 4864 + }, + { + "epoch": 0.9743795036028823, + "grad_norm": 5.8834686279296875, + "learning_rate": 1.568860720831852e-05, + "loss": 0.1839, + "step": 4866 + }, + { + "epoch": 0.9743795036028823, + "grad_norm": 1.4760607481002808, + "learning_rate": 1.5677108097363565e-05, + "loss": 0.0453, + "step": 4868 + }, + { + "epoch": 0.9751801441152922, + "grad_norm": 0.8130593299865723, + "learning_rate": 1.5665597898893508e-05, + "loss": 0.1397, + "step": 4870 + }, + { + "epoch": 0.9751801441152922, + "grad_norm": 2.410952568054199, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.0847, + "step": 4872 + }, + { + "epoch": 0.9759807846277022, + "grad_norm": 5.913552761077881, + "learning_rate": 1.5642544329348316e-05, + "loss": 0.3316, + "step": 4874 + }, + { + "epoch": 0.9759807846277022, + "grad_norm": 0.4340955913066864, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.1424, + "step": 4876 + }, + { + "epoch": 0.9767814251401121, + "grad_norm": 6.584893703460693, + "learning_rate": 1.5619446679779367e-05, + "loss": 0.7558, + "step": 4878 + }, + { + "epoch": 0.9767814251401121, + "grad_norm": 6.198083877563477, + "learning_rate": 1.560788138136029e-05, + "loss": 0.1625, + "step": 4880 + }, + { + "epoch": 0.977582065652522, + "grad_norm": 5.276307582855225, + "learning_rate": 1.5596305130627414e-05, + "loss": 0.2479, + "step": 4882 + }, + { + "epoch": 0.977582065652522, + "grad_norm": 0.23203061521053314, + "learning_rate": 1.5584717950189373e-05, + "loss": 0.0319, + "step": 4884 + }, + { + "epoch": 0.978382706164932, + "grad_norm": 3.199326515197754, + "learning_rate": 1.5573119862676155e-05, + "loss": 0.3272, + "step": 4886 + }, + { + "epoch": 0.978382706164932, + "grad_norm": 14.296629905700684, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.2558, + "step": 4888 + }, + { + "epoch": 0.9791833466773419, + "grad_norm": 0.33210670948028564, + "learning_rate": 1.554989105705083e-05, + "loss": 0.1996, + "step": 4890 + }, + { + "epoch": 0.9791833466773419, + "grad_norm": 0.22034180164337158, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.0621, + "step": 4892 + }, + { + "epoch": 0.9799839871897518, + "grad_norm": 2.1669843196868896, + "learning_rate": 1.5526618895216786e-05, + "loss": 0.5022, + "step": 4894 + }, + { + "epoch": 0.9799839871897518, + "grad_norm": 7.05861759185791, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.1217, + "step": 4896 + }, + { + "epoch": 0.9807846277021617, + "grad_norm": 12.10041618347168, + "learning_rate": 1.5503303558978112e-05, + "loss": 0.5138, + "step": 4898 + }, + { + "epoch": 0.9807846277021617, + "grad_norm": 2.97271466255188, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.239, + "step": 4900 + }, + { + "epoch": 0.9815852682145717, + "grad_norm": 0.8525753617286682, + "learning_rate": 1.547994523047609e-05, + "loss": 0.051, + "step": 4902 + }, + { + "epoch": 0.9815852682145717, + "grad_norm": 0.21690915524959564, + "learning_rate": 1.546825000113736e-05, + "loss": 0.1326, + "step": 4904 + }, + { + "epoch": 0.9823859087269816, + "grad_norm": 9.991996765136719, + "learning_rate": 1.545654409218794e-05, + "loss": 0.9844, + "step": 4906 + }, + { + "epoch": 0.9823859087269816, + "grad_norm": 12.783656120300293, + "learning_rate": 1.544482752648966e-05, + "loss": 0.1865, + "step": 4908 + }, + { + "epoch": 0.9831865492393915, + "grad_norm": 1.5402576923370361, + "learning_rate": 1.5433100326925298e-05, + "loss": 0.1847, + "step": 4910 + }, + { + "epoch": 0.9831865492393915, + "grad_norm": 2.537280797958374, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.1242, + "step": 4912 + }, + { + "epoch": 0.9839871897518014, + "grad_norm": 7.171698570251465, + "learning_rate": 1.5409614117832797e-05, + "loss": 0.5489, + "step": 4914 + }, + { + "epoch": 0.9839871897518014, + "grad_norm": 0.5166508555412292, + "learning_rate": 1.539785515417377e-05, + "loss": 0.047, + "step": 4916 + }, + { + "epoch": 0.9847878302642114, + "grad_norm": 3.0343897342681885, + "learning_rate": 1.538608564838665e-05, + "loss": 0.2059, + "step": 4918 + }, + { + "epoch": 0.9847878302642114, + "grad_norm": 3.2183854579925537, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.1007, + "step": 4920 + }, + { + "epoch": 0.9855884707766213, + "grad_norm": 9.68044376373291, + "learning_rate": 1.5362515102393244e-05, + "loss": 0.5589, + "step": 4922 + }, + { + "epoch": 0.9855884707766213, + "grad_norm": 1.0447769165039062, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.3349, + "step": 4924 + }, + { + "epoch": 0.9863891112890312, + "grad_norm": 0.4139030873775482, + "learning_rate": 1.5338902663987564e-05, + "loss": 0.2146, + "step": 4926 + }, + { + "epoch": 0.9863891112890312, + "grad_norm": 0.0896034836769104, + "learning_rate": 1.532708079276186e-05, + "loss": 0.11, + "step": 4928 + }, + { + "epoch": 0.9871897518014412, + "grad_norm": 0.13244567811489105, + "learning_rate": 1.531524851763198e-05, + "loss": 0.1479, + "step": 4930 + }, + { + "epoch": 0.9871897518014412, + "grad_norm": 1.3198281526565552, + "learning_rate": 1.5303405861706567e-05, + "loss": 0.145, + "step": 4932 + }, + { + "epoch": 0.9879903923138511, + "grad_norm": 6.030502796173096, + "learning_rate": 1.529155284811464e-05, + "loss": 0.5539, + "step": 4934 + }, + { + "epoch": 0.9879903923138511, + "grad_norm": 1.2341971397399902, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.2151, + "step": 4936 + }, + { + "epoch": 0.988791032826261, + "grad_norm": 9.124841690063477, + "learning_rate": 1.5267815840548067e-05, + "loss": 0.418, + "step": 4938 + }, + { + "epoch": 0.988791032826261, + "grad_norm": 1.8590996265411377, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.2189, + "step": 4940 + }, + { + "epoch": 0.9895916733386709, + "grad_norm": 0.7875673174858093, + "learning_rate": 1.5244037680367739e-05, + "loss": 0.132, + "step": 4942 + }, + { + "epoch": 0.9895916733386709, + "grad_norm": 2.1960654258728027, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.2946, + "step": 4944 + }, + { + "epoch": 0.9903923138510808, + "grad_norm": 3.7766363620758057, + "learning_rate": 1.522021855333061e-05, + "loss": 0.409, + "step": 4946 + }, + { + "epoch": 0.9903923138510808, + "grad_norm": 2.5850186347961426, + "learning_rate": 1.5208293685377362e-05, + "loss": 0.0827, + "step": 4948 + }, + { + "epoch": 0.9911929543634908, + "grad_norm": 1.026114583015442, + "learning_rate": 1.519635864551371e-05, + "loss": 0.0664, + "step": 4950 + }, + { + "epoch": 0.9911929543634908, + "grad_norm": 0.5107403993606567, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.0652, + "step": 4952 + }, + { + "epoch": 0.9919935948759008, + "grad_norm": 0.5458845496177673, + "learning_rate": 1.5172458143312548e-05, + "loss": 0.1662, + "step": 4954 + }, + { + "epoch": 0.9919935948759008, + "grad_norm": 1.0446908473968506, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.172, + "step": 4956 + }, + { + "epoch": 0.9927942353883107, + "grad_norm": 18.559038162231445, + "learning_rate": 1.5148517233439858e-05, + "loss": 1.0372, + "step": 4958 + }, + { + "epoch": 0.9927942353883107, + "grad_norm": 5.7443084716796875, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.4142, + "step": 4960 + }, + { + "epoch": 0.9935948759007206, + "grad_norm": 0.2382277399301529, + "learning_rate": 1.512453610292402e-05, + "loss": 0.0945, + "step": 4962 + }, + { + "epoch": 0.9935948759007206, + "grad_norm": 1.581146478652954, + "learning_rate": 1.5112530513457251e-05, + "loss": 0.1246, + "step": 4964 + }, + { + "epoch": 0.9943955164131305, + "grad_norm": 4.015986919403076, + "learning_rate": 1.5100514939107598e-05, + "loss": 0.2673, + "step": 4966 + }, + { + "epoch": 0.9943955164131305, + "grad_norm": 0.8544518947601318, + "learning_rate": 1.50884894033418e-05, + "loss": 0.0405, + "step": 4968 + }, + { + "epoch": 0.9951961569255404, + "grad_norm": 8.923181533813477, + "learning_rate": 1.5076453929645927e-05, + "loss": 0.7578, + "step": 4970 + }, + { + "epoch": 0.9951961569255404, + "grad_norm": 0.7633010745048523, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.0887, + "step": 4972 + }, + { + "epoch": 0.9959967974379503, + "grad_norm": 4.297773838043213, + "learning_rate": 1.505235326250563e-05, + "loss": 0.3815, + "step": 4974 + }, + { + "epoch": 0.9959967974379503, + "grad_norm": 0.25559738278388977, + "learning_rate": 1.504028811613027e-05, + "loss": 0.0342, + "step": 4976 + }, + { + "epoch": 0.9967974379503602, + "grad_norm": 1.2921879291534424, + "learning_rate": 1.5028213125963054e-05, + "loss": 0.1516, + "step": 4978 + }, + { + "epoch": 0.9967974379503602, + "grad_norm": 1.1732321977615356, + "learning_rate": 1.5016128315586636e-05, + "loss": 0.1335, + "step": 4980 + }, + { + "epoch": 0.9975980784627703, + "grad_norm": 0.02667553536593914, + "learning_rate": 1.5004033708602977e-05, + "loss": 0.191, + "step": 4982 + }, + { + "epoch": 0.9975980784627703, + "grad_norm": 0.4958198666572571, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.0611, + "step": 4984 + }, + { + "epoch": 0.9983987189751802, + "grad_norm": 3.287203550338745, + "learning_rate": 1.4979815199317011e-05, + "loss": 0.6667, + "step": 4986 + }, + { + "epoch": 0.9983987189751802, + "grad_norm": 0.12712432444095612, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.0722, + "step": 4988 + }, + { + "epoch": 0.9991993594875901, + "grad_norm": 0.716675341129303, + "learning_rate": 1.495555778730216e-05, + "loss": 0.3779, + "step": 4990 + }, + { + "epoch": 0.9991993594875901, + "grad_norm": 1.487331748008728, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.0819, + "step": 4992 + }, + { + "epoch": 1.0, + "grad_norm": 10.063554763793945, + "learning_rate": 1.4931261662059333e-05, + "loss": 0.9096, + "step": 4994 + }, + { + "epoch": 1.0, + "grad_norm": 1.4471527338027954, + "learning_rate": 1.4919099141279214e-05, + "loss": 0.1358, + "step": 4996 + }, + { + "epoch": 1.0, + "step": 4996, + "total_flos": 2.9232688791552e+16, + "train_loss": 0.2778742440145095, + "train_runtime": 12921.7401, + "train_samples_per_second": 1.547, + "train_steps_per_second": 0.387 + } + ], + "logging_steps": 2, + "max_steps": 4996, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 2.9232688791552e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c4850e17f6b1d746b90c26a9143c83557832c37 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc69835549761b75a710614ef71d52a0c3b2206b5b36711325dc9bf5e76dc9ef +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..55da68661ad61af6e55745b614db1b7b548d1f5d --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b69f3fed1a6de2ee44d7229ae8bd8d509535066b051fdba6f00db7a7b654e37 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd5f52e74438dc0de10438b59332cbc39072bcbc --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9eae89a70b3dcb234c97873d3e2a413a1f296750a59cd35d19b3717bf195928 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a57c163bd1a78429ecba3369ddc782426231d5a6 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_gradnorm_scenario12_new_10000_random0_25_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46bc12b9a0a1dcfbf4bf5bfbb37173660fac7a5a390063828967ee72a57d5b16 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..00f8dea970ac6bf935df12cdbf13720e01eb7044 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,3776 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1249, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.2873, + "step": 2 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.3187, + "step": 4 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.3698, + "step": 6 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.3329, + "step": 8 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.4027, + "step": 10 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.3515, + "step": 12 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.4139, + "step": 14 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.3033, + "step": 16 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.3034, + "step": 18 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.3214, + "step": 20 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.3889, + "step": 22 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.2902, + "step": 24 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.3517, + "step": 26 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.3639, + "step": 28 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.2902, + "step": 30 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.3802, + "step": 32 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.2903, + "step": 34 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.2894, + "step": 36 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.3188, + "step": 38 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.3032, + "step": 40 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.3344, + "step": 42 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.3036, + "step": 44 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.4138, + "step": 46 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.3349, + "step": 48 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.4503, + "step": 50 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.3346, + "step": 52 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.3641, + "step": 54 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.276, + "step": 56 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.3186, + "step": 58 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.2757, + "step": 60 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.3185, + "step": 62 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.3327, + "step": 64 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.3488, + "step": 66 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.3175, + "step": 68 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.3886, + "step": 70 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.384, + "step": 72 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.3177, + "step": 74 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.3994, + "step": 76 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.3297, + "step": 78 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.3639, + "step": 80 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.3185, + "step": 82 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.3034, + "step": 84 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.3053, + "step": 86 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.3487, + "step": 88 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.3347, + "step": 90 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.3342, + "step": 92 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.3212, + "step": 94 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.3519, + "step": 96 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.3346, + "step": 98 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.3031, + "step": 100 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.3639, + "step": 102 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.3325, + "step": 104 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.3536, + "step": 106 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.3659, + "step": 108 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.305, + "step": 110 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.2889, + "step": 112 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.3052, + "step": 114 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.3799, + "step": 116 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.3637, + "step": 118 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.3328, + "step": 120 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.29, + "step": 122 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.3639, + "step": 124 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.3173, + "step": 126 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.3517, + "step": 128 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.2889, + "step": 130 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.3658, + "step": 132 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.289, + "step": 134 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.4089, + "step": 136 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.3886, + "step": 138 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.2892, + "step": 140 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.3297, + "step": 142 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.3812, + "step": 144 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.2876, + "step": 146 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.4152, + "step": 148 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.3808, + "step": 150 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.3476, + "step": 152 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.3637, + "step": 154 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.3174, + "step": 156 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.2898, + "step": 158 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.3183, + "step": 160 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.399, + "step": 162 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.4584, + "step": 164 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.3185, + "step": 166 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.3803, + "step": 168 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.3183, + "step": 170 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.4025, + "step": 172 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.2899, + "step": 174 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.3034, + "step": 176 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.3173, + "step": 178 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.4692, + "step": 180 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.3811, + "step": 182 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.2763, + "step": 184 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.3658, + "step": 186 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.3487, + "step": 188 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.3837, + "step": 190 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.3489, + "step": 192 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.3186, + "step": 194 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.3485, + "step": 196 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.3189, + "step": 198 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.3886, + "step": 200 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.38, + "step": 202 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.3641, + "step": 204 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.3328, + "step": 206 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.3643, + "step": 208 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.3659, + "step": 210 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.3809, + "step": 212 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.4804, + "step": 214 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.3487, + "step": 216 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.3519, + "step": 218 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.4227, + "step": 220 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.3057, + "step": 222 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.3637, + "step": 224 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.3036, + "step": 226 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.3347, + "step": 228 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.2894, + "step": 230 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.2507, + "step": 232 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.3037, + "step": 234 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.3383, + "step": 236 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.3973, + "step": 238 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.3217, + "step": 240 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.3094, + "step": 242 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.3188, + "step": 244 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.4585, + "step": 246 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.3348, + "step": 248 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.5072, + "step": 250 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.2892, + "step": 252 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.3638, + "step": 254 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.289, + "step": 256 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.3992, + "step": 258 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.3491, + "step": 260 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.4324, + "step": 262 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.3183, + "step": 264 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.3641, + "step": 266 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.2926, + "step": 268 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.3662, + "step": 270 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.3697, + "step": 272 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.2904, + "step": 274 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.2759, + "step": 276 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.3566, + "step": 278 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.4144, + "step": 280 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.4542, + "step": 282 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.3218, + "step": 284 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.4542, + "step": 286 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.4297, + "step": 288 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.3659, + "step": 290 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.3216, + "step": 292 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.3188, + "step": 294 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.3385, + "step": 296 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.3054, + "step": 298 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.3187, + "step": 300 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.3189, + "step": 302 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.4183, + "step": 304 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.3384, + "step": 306 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.335, + "step": 308 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.3187, + "step": 310 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.2734, + "step": 312 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.4145, + "step": 314 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.3536, + "step": 316 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.3834, + "step": 318 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.3639, + "step": 320 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.3348, + "step": 322 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.2639, + "step": 324 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.2572, + "step": 326 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.271, + "step": 328 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.3146, + "step": 330 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.3026, + "step": 332 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.5543, + "step": 334 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.3037, + "step": 336 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.3356, + "step": 338 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.3694, + "step": 340 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.3837, + "step": 342 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.3387, + "step": 344 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.4292, + "step": 346 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.2892, + "step": 348 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.3327, + "step": 350 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.3033, + "step": 352 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.3328, + "step": 354 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.3054, + "step": 356 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.2598, + "step": 358 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.3633, + "step": 360 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.6129, + "step": 362 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.3481, + "step": 364 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.304, + "step": 366 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.3187, + "step": 368 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.2632, + "step": 370 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.3988, + "step": 372 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.3298, + "step": 374 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.314, + "step": 376 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.3975, + "step": 378 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.4027, + "step": 380 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.2758, + "step": 382 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.3029, + "step": 384 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.2493, + "step": 386 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.3348, + "step": 388 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.4747, + "step": 390 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.2845, + "step": 392 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.3489, + "step": 394 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.4029, + "step": 396 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.3804, + "step": 398 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.2729, + "step": 400 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.3662, + "step": 402 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.3036, + "step": 404 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.409, + "step": 406 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.3258, + "step": 408 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.4164, + "step": 410 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.381, + "step": 412 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.3217, + "step": 414 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.2929, + "step": 416 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.2477, + "step": 418 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.3386, + "step": 420 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.3516, + "step": 422 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.3214, + "step": 424 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.3521, + "step": 426 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.3094, + "step": 428 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.2961, + "step": 430 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.3518, + "step": 432 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.2935, + "step": 434 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.3642, + "step": 436 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.6361, + "step": 438 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.6814, + "step": 440 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.3518, + "step": 442 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.4718, + "step": 444 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.2935, + "step": 446 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.3482, + "step": 448 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.3038, + "step": 450 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.3653, + "step": 452 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.3568, + "step": 454 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.4697, + "step": 456 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.2799, + "step": 458 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.352, + "step": 460 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.4387, + "step": 462 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.3261, + "step": 464 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.3142, + "step": 466 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.3842, + "step": 468 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.3189, + "step": 470 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.2498, + "step": 472 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.4698, + "step": 474 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.2919, + "step": 476 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.3662, + "step": 478 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.2848, + "step": 480 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.3643, + "step": 482 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.3494, + "step": 484 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.3891, + "step": 486 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.2509, + "step": 488 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.5709, + "step": 490 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.4518, + "step": 492 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.3704, + "step": 494 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.3701, + "step": 496 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.3494, + "step": 498 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.3333, + "step": 500 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.3522, + "step": 502 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.339, + "step": 504 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.3977, + "step": 506 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.3494, + "step": 508 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.2764, + "step": 510 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.3349, + "step": 512 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.3483, + "step": 514 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.3569, + "step": 516 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.3189, + "step": 518 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.3482, + "step": 520 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.2895, + "step": 522 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.3333, + "step": 524 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.3387, + "step": 526 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.3331, + "step": 528 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.3989, + "step": 530 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.319, + "step": 532 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.339, + "step": 534 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.3696, + "step": 536 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.2508, + "step": 538 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.2726, + "step": 540 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.3635, + "step": 542 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.5845, + "step": 544 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.3092, + "step": 546 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.4232, + "step": 548 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.3215, + "step": 550 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.3632, + "step": 552 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.4436, + "step": 554 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.3371, + "step": 556 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.3547, + "step": 558 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.3893, + "step": 560 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.2633, + "step": 562 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.342, + "step": 564 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.3223, + "step": 566 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.2961, + "step": 568 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.3393, + "step": 570 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.322, + "step": 572 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.2849, + "step": 574 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.3266, + "step": 576 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.4029, + "step": 578 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.3353, + "step": 580 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.484, + "step": 582 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.3523, + "step": 584 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.5135, + "step": 586 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.5288, + "step": 588 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.3354, + "step": 590 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.2517, + "step": 592 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.306, + "step": 594 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.3709, + "step": 596 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.2763, + "step": 598 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.3649, + "step": 600 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.365, + "step": 602 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.3194, + "step": 604 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.3499, + "step": 606 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.3486, + "step": 608 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.3497, + "step": 610 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.3932, + "step": 612 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.4385, + "step": 614 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.3704, + "step": 616 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.2767, + "step": 618 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.3637, + "step": 620 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.3036, + "step": 622 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.4168, + "step": 624 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.3571, + "step": 626 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.2763, + "step": 628 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.4086, + "step": 630 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.3418, + "step": 632 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.2845, + "step": 634 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.314, + "step": 636 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.3139, + "step": 638 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.3189, + "step": 640 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.4228, + "step": 642 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.4131, + "step": 644 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.3519, + "step": 646 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.4185, + "step": 648 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.4594, + "step": 650 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.3351, + "step": 652 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.3592, + "step": 654 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.3808, + "step": 656 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.3351, + "step": 658 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.3061, + "step": 660 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.4518, + "step": 662 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.3976, + "step": 664 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.3041, + "step": 666 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.3646, + "step": 668 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.2631, + "step": 670 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.2764, + "step": 672 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.3219, + "step": 674 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.3038, + "step": 676 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.2643, + "step": 678 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.3661, + "step": 680 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.2784, + "step": 682 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.2601, + "step": 684 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.3093, + "step": 686 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.3642, + "step": 688 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.3519, + "step": 690 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.4804, + "step": 692 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.4653, + "step": 694 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.5541, + "step": 696 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.4749, + "step": 698 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.3702, + "step": 700 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.2736, + "step": 702 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.4034, + "step": 704 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.5294, + "step": 706 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.4544, + "step": 708 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.3038, + "step": 710 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.3353, + "step": 712 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.3638, + "step": 714 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.3541, + "step": 716 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.4153, + "step": 718 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.5033, + "step": 720 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.4325, + "step": 722 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.3646, + "step": 724 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.2635, + "step": 726 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.4895, + "step": 728 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.5323, + "step": 730 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.2904, + "step": 732 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.3037, + "step": 734 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.339, + "step": 736 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.3521, + "step": 738 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.2962, + "step": 740 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.3419, + "step": 742 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.4441, + "step": 744 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.4761, + "step": 746 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.3496, + "step": 748 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.3392, + "step": 750 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.3194, + "step": 752 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.3486, + "step": 754 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.3189, + "step": 756 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.3059, + "step": 758 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.4351, + "step": 760 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.3848, + "step": 762 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.3353, + "step": 764 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.3807, + "step": 766 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.4382, + "step": 768 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.396, + "step": 770 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.3643, + "step": 772 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.4389, + "step": 774 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.3889, + "step": 776 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.3964, + "step": 778 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.3664, + "step": 780 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.2899, + "step": 782 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.3194, + "step": 784 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.3043, + "step": 786 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.5937, + "step": 788 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.3968, + "step": 790 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.4511, + "step": 792 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.4443, + "step": 794 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.29, + "step": 796 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.4037, + "step": 798 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.3576, + "step": 800 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.3665, + "step": 802 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.4153, + "step": 804 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.3395, + "step": 806 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.2903, + "step": 808 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.404, + "step": 810 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.2719, + "step": 812 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.4353, + "step": 814 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.3651, + "step": 816 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.3485, + "step": 818 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.4699, + "step": 820 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.3762, + "step": 822 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.3392, + "step": 824 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.3391, + "step": 826 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.3423, + "step": 828 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.3334, + "step": 830 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.3664, + "step": 832 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.3335, + "step": 834 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.3993, + "step": 836 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.291, + "step": 838 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.3646, + "step": 840 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.2765, + "step": 842 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.3475, + "step": 844 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.3331, + "step": 846 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.3665, + "step": 848 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.3805, + "step": 850 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.3222, + "step": 852 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.291, + "step": 854 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.3354, + "step": 856 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.2719, + "step": 858 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.3333, + "step": 860 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.3331, + "step": 862 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.3521, + "step": 864 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.339, + "step": 866 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.2899, + "step": 868 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.2905, + "step": 870 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.263, + "step": 872 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.299, + "step": 874 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.551, + "step": 876 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.3568, + "step": 878 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.2896, + "step": 880 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.529, + "step": 882 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.3568, + "step": 884 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.4383, + "step": 886 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.3351, + "step": 888 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.3996, + "step": 890 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.365, + "step": 892 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.6902, + "step": 894 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.451, + "step": 896 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.5664, + "step": 898 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.304, + "step": 900 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.3489, + "step": 902 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.3487, + "step": 904 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.4516, + "step": 906 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.4327, + "step": 908 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.415, + "step": 910 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.3475, + "step": 912 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.353, + "step": 914 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.4727, + "step": 916 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.3146, + "step": 918 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.416, + "step": 920 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.3653, + "step": 922 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.382, + "step": 924 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.4006, + "step": 926 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.3192, + "step": 928 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.4334, + "step": 930 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.2692, + "step": 932 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.4331, + "step": 934 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.4194, + "step": 936 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.3824, + "step": 938 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.4003, + "step": 940 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.3241, + "step": 942 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.3049, + "step": 944 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.3361, + "step": 946 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.4166, + "step": 948 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.6862, + "step": 950 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.4047, + "step": 952 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.3664, + "step": 954 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.3503, + "step": 956 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.3191, + "step": 958 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.3344, + "step": 960 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.3069, + "step": 962 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.453, + "step": 964 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.4358, + "step": 966 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.3505, + "step": 968 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.3047, + "step": 970 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.3191, + "step": 972 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.3983, + "step": 974 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.3196, + "step": 976 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.3498, + "step": 978 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.3358, + "step": 980 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.4039, + "step": 982 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.349, + "step": 984 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.4353, + "step": 986 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.4551, + "step": 988 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.4238, + "step": 990 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.4513, + "step": 992 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.3818, + "step": 994 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.3496, + "step": 996 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.4976, + "step": 998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.4524, + "step": 1000 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.2902, + "step": 1002 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.3186, + "step": 1004 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.3358, + "step": 1006 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.3358, + "step": 1008 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.3822, + "step": 1010 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.365, + "step": 1012 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.4044, + "step": 1014 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.3198, + "step": 1016 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.3651, + "step": 1018 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.3337, + "step": 1020 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.3668, + "step": 1022 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.3338, + "step": 1024 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.3575, + "step": 1026 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.3046, + "step": 1028 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.3811, + "step": 1030 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.3063, + "step": 1032 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.2768, + "step": 1034 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.398, + "step": 1036 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.291, + "step": 1038 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.4332, + "step": 1040 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.2898, + "step": 1042 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.3062, + "step": 1044 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.3493, + "step": 1046 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.4809, + "step": 1048 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.4896, + "step": 1050 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.3667, + "step": 1052 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.3806, + "step": 1054 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.3815, + "step": 1056 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.4344, + "step": 1058 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.4188, + "step": 1060 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.4158, + "step": 1062 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.29, + "step": 1064 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.4158, + "step": 1066 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.4971, + "step": 1068 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.3338, + "step": 1070 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.2902, + "step": 1072 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.3196, + "step": 1074 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.4152, + "step": 1076 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.3998, + "step": 1078 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.3525, + "step": 1080 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.3487, + "step": 1082 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.3485, + "step": 1084 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.3337, + "step": 1086 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.3709, + "step": 1088 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.4161, + "step": 1090 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.3334, + "step": 1092 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.3335, + "step": 1094 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.3978, + "step": 1096 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.4445, + "step": 1098 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.3488, + "step": 1100 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.3496, + "step": 1102 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.3999, + "step": 1104 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.3193, + "step": 1106 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.3354, + "step": 1108 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.3042, + "step": 1110 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.3816, + "step": 1112 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.2764, + "step": 1114 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.3573, + "step": 1116 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.4972, + "step": 1118 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.3495, + "step": 1120 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.5779, + "step": 1122 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.435, + "step": 1124 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.3489, + "step": 1126 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.3847, + "step": 1128 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.4351, + "step": 1130 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.365, + "step": 1132 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.3193, + "step": 1134 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.3668, + "step": 1136 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.3811, + "step": 1138 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.29, + "step": 1140 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.3487, + "step": 1142 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.4161, + "step": 1144 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.3352, + "step": 1146 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.3668, + "step": 1148 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.3494, + "step": 1150 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.4036, + "step": 1152 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.3644, + "step": 1154 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.4591, + "step": 1156 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.3894, + "step": 1158 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.3493, + "step": 1160 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.3977, + "step": 1162 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.3484, + "step": 1164 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.3668, + "step": 1166 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.3487, + "step": 1168 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.2903, + "step": 1170 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.3339, + "step": 1172 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.3526, + "step": 1174 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.3495, + "step": 1176 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.3523, + "step": 1178 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.4548, + "step": 1180 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.3664, + "step": 1182 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.4093, + "step": 1184 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.3703, + "step": 1186 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.3183, + "step": 1188 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.3042, + "step": 1190 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.5296, + "step": 1192 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.382, + "step": 1194 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.3644, + "step": 1196 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.3183, + "step": 1198 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.4037, + "step": 1200 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.3335, + "step": 1202 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.435, + "step": 1204 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.3494, + "step": 1206 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.3064, + "step": 1208 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.3494, + "step": 1210 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.2899, + "step": 1212 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.3524, + "step": 1214 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.4521, + "step": 1216 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.3334, + "step": 1218 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.3484, + "step": 1220 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.3189, + "step": 1222 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.3352, + "step": 1224 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.3333, + "step": 1226 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.3809, + "step": 1228 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.3039, + "step": 1230 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.2764, + "step": 1232 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.4146, + "step": 1234 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.2898, + "step": 1236 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.304, + "step": 1238 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.2898, + "step": 1240 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.335, + "step": 1242 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.3189, + "step": 1244 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.335, + "step": 1246 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.3219, + "step": 1248 + }, + { + "epoch": 1.0, + "step": 1249, + "total_flos": 6978372445929472.0, + "train_loss": 0.35933564053333883, + "train_runtime": 5585.8147, + "train_samples_per_second": 3.578, + "train_steps_per_second": 0.224 + } + ], + "logging_steps": 2, + "max_steps": 1249, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 6978372445929472.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f5ddb162693035b26cbd4d1ea345e092f7865c8 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57ebbb09ec965bdcef301b271e2c3de83677ae40d951179e532a12ff279bee73 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c883fc70d9cb63a5c912740c62047be18963d57 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:264d208572e7f4ae752cd03d0e5bb670ae19a32d07e51c0f7daa7ce4c091d495 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..940ca863fcb2419d8c2b2d4df577b01b0cf1c543 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfb473a8fde6e39ae3be0ad455640424c3c2fb3b92524dd4378fcf663b77f506 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..becb241cf4ca262b925398a538c50a79bb03a104 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adea41cdcc7907f10369fb0eff99f94a2c6efffbcf995a3c557bf84c10904712 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a6eb6b864a91a95964e62c37f9c7aebdc8ec1b8d --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/0_trainer_state.json @@ -0,0 +1,7526 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2498, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.415943612351265e-06, + "loss": 0.2756, + "step": 2 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.2151, + "step": 4 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.2355, + "step": 6 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.214, + "step": 8 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4892856843445236e-06, + "loss": 0.3439, + "step": 10 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.2356, + "step": 12 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.2351, + "step": 14 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.2235, + "step": 16 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5635665363297356e-06, + "loss": 0.2776, + "step": 18 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.4376, + "step": 20 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.2612, + "step": 22 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.2037, + "step": 24 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.6387768837868565e-06, + "loss": 0.2046, + "step": 26 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.2814, + "step": 28 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.2265, + "step": 30 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.2336, + "step": 32 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.3145, + "step": 34 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.2965, + "step": 36 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.3014, + "step": 38 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.3884, + "step": 40 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.7919483473136555e-06, + "loss": 0.2357, + "step": 42 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.2242, + "step": 44 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.2161, + "step": 46 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.3325, + "step": 48 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.8698903181597026e-06, + "loss": 0.2653, + "step": 50 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.2454, + "step": 52 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.909196119613218e-06, + "loss": 0.3342, + "step": 54 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.2512, + "step": 56 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9487234964233724e-06, + "loss": 0.2701, + "step": 58 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.2378, + "step": 60 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.988471213428035e-06, + "loss": 0.2463, + "step": 62 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.4374, + "step": 64 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0284380285797733e-06, + "loss": 0.2833, + "step": 66 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.2903, + "step": 68 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.068622692984767e-06, + "loss": 0.2464, + "step": 70 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.2579, + "step": 72 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.2356, + "step": 74 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.3012, + "step": 76 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.2926, + "step": 78 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.2333, + "step": 80 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1904711909051967e-06, + "loss": 0.3325, + "step": 82 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.3607, + "step": 84 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.231514627826302e-06, + "loss": 0.4014, + "step": 86 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.2479, + "step": 88 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2727695682081897e-06, + "loss": 0.4036, + "step": 90 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.3172, + "step": 92 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.314234722905302e-06, + "loss": 0.3172, + "step": 94 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.3912, + "step": 96 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.355908796203301e-06, + "loss": 0.2965, + "step": 98 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.3913, + "step": 100 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.2623, + "step": 102 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.2355, + "step": 104 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4398784831434097e-06, + "loss": 0.2141, + "step": 106 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.2558, + "step": 108 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.3561, + "step": 110 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.2863, + "step": 112 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.2379, + "step": 114 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.2717, + "step": 116 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.567367137003953e-06, + "loss": 0.2585, + "step": 118 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.2132, + "step": 120 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.6102671491780393e-06, + "loss": 0.2378, + "step": 122 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.2132, + "step": 124 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.653366829451711e-06, + "loss": 0.3816, + "step": 126 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.2359, + "step": 128 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.696664831034521e-06, + "loss": 0.2662, + "step": 130 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.2831, + "step": 132 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.740159800938784e-06, + "loss": 0.27, + "step": 134 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.2973, + "step": 136 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.783850380021933e-06, + "loss": 0.4026, + "step": 138 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.4373, + "step": 140 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.827735203028956e-06, + "loss": 0.2441, + "step": 142 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.2234, + "step": 144 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.871812898635011e-06, + "loss": 0.2355, + "step": 146 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.2234, + "step": 148 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.916082089488379e-06, + "loss": 0.3627, + "step": 150 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.3171, + "step": 152 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.960541392253387e-06, + "loss": 0.2576, + "step": 154 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.2502, + "step": 156 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 4.005189417653737e-06, + "loss": 0.3694, + "step": 158 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.3561, + "step": 160 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.050024770515873e-06, + "loss": 0.3912, + "step": 162 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.2244, + "step": 164 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.095046049812541e-06, + "loss": 0.3473, + "step": 166 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.3439, + "step": 168 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.1402518487066624e-06, + "loss": 0.244, + "step": 170 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.251, + "step": 172 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.4175, + "step": 174 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.2242, + "step": 176 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.2312113491533145e-06, + "loss": 0.2263, + "step": 178 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.3208, + "step": 180 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.276962208378814e-06, + "loss": 0.2233, + "step": 182 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.251, + "step": 184 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3228919026364345e-06, + "loss": 0.3703, + "step": 186 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.2338, + "step": 188 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.368998996702686e-06, + "loss": 0.3209, + "step": 190 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.3627, + "step": 192 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.415282049810643e-06, + "loss": 0.3473, + "step": 194 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.3244, + "step": 196 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.461739615694921e-06, + "loss": 0.496, + "step": 198 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.3015, + "step": 200 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.5083702426369715e-06, + "loss": 0.3627, + "step": 202 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.282, + "step": 204 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.555172473510324e-06, + "loss": 0.3379, + "step": 206 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.266, + "step": 208 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.602144845826234e-06, + "loss": 0.2462, + "step": 210 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.3524, + "step": 212 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.649285891779326e-06, + "loss": 0.2386, + "step": 214 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.3088, + "step": 216 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.696594138293421e-06, + "loss": 0.4024, + "step": 218 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.3417, + "step": 220 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.744068107067673e-06, + "loss": 0.3343, + "step": 222 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.2634, + "step": 224 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.79170631462264e-06, + "loss": 0.4177, + "step": 226 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.4508, + "step": 228 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.839507272346751e-06, + "loss": 0.4138, + "step": 230 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.4176, + "step": 232 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.3948, + "step": 234 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.4148, + "step": 236 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.935591458474425e-06, + "loss": 0.2718, + "step": 238 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.4705, + "step": 240 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9838716844133665e-06, + "loss": 0.2814, + "step": 242 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.286, + "step": 244 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.032308655686007e-06, + "loss": 0.2586, + "step": 246 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.2861, + "step": 248 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.080900858720789e-06, + "loss": 0.2611, + "step": 250 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.3475, + "step": 252 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.129646775095432e-06, + "loss": 0.2334, + "step": 254 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.3639, + "step": 256 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.178544881584328e-06, + "loss": 0.2334, + "step": 258 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.2378, + "step": 260 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.3174, + "step": 262 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.2442, + "step": 264 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.2767915482720164e-06, + "loss": 0.3012, + "step": 266 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.2254, + "step": 268 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.2549, + "step": 270 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.3209, + "step": 272 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.375628578726181e-06, + "loss": 0.3694, + "step": 274 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.2752, + "step": 276 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.425264622628326e-06, + "loss": 0.2585, + "step": 278 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.2479, + "step": 280 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.475043619098321e-06, + "loss": 0.3341, + "step": 282 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.2478, + "step": 284 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.524964012628644e-06, + "loss": 0.2232, + "step": 286 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.3562, + "step": 288 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.57502424329331e-06, + "loss": 0.2331, + "step": 290 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.3012, + "step": 292 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.62522274679673e-06, + "loss": 0.25, + "step": 294 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.3028, + "step": 296 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.675557954522462e-06, + "loss": 0.334, + "step": 298 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.275, + "step": 300 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.726028293582342e-06, + "loss": 0.3656, + "step": 302 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.2889, + "step": 304 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.776632186865589e-06, + "loss": 0.2609, + "step": 306 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.244, + "step": 308 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.827368053088032e-06, + "loss": 0.2354, + "step": 310 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.2547, + "step": 312 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.878234306841637e-06, + "loss": 0.2925, + "step": 314 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.3985, + "step": 316 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.929229358643925e-06, + "loss": 0.2477, + "step": 318 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.2478, + "step": 320 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9803516149877475e-06, + "loss": 0.3343, + "step": 322 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.213, + "step": 324 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.03159947839103e-06, + "loss": 0.2898, + "step": 326 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.687, + "step": 328 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.082971347446654e-06, + "loss": 0.4118, + "step": 330 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.3833, + "step": 332 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.13446561687258e-06, + "loss": 0.2509, + "step": 334 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.2697, + "step": 336 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.186080677561974e-06, + "loss": 0.3913, + "step": 338 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.4176, + "step": 340 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.237814916633431e-06, + "loss": 0.261, + "step": 342 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.2757, + "step": 344 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.289666717481496e-06, + "loss": 0.3439, + "step": 346 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.2757, + "step": 348 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.341634459827044e-06, + "loss": 0.469, + "step": 350 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.3637, + "step": 352 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.393716519768032e-06, + "loss": 0.3323, + "step": 354 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.3561, + "step": 356 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.445911269830183e-06, + "loss": 0.3523, + "step": 358 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.303, + "step": 360 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.498217079017806e-06, + "loss": 0.2611, + "step": 362 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.2609, + "step": 364 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.2463, + "step": 366 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.2717, + "step": 368 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.603155333485934e-06, + "loss": 0.25, + "step": 370 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.2332, + "step": 372 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.655784499627476e-06, + "loss": 0.2754, + "step": 374 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.2355, + "step": 376 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.7085181667191e-06, + "loss": 0.2355, + "step": 378 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.3342, + "step": 380 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.761354686924883e-06, + "loss": 0.2374, + "step": 382 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.2717, + "step": 384 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.8142924091949955e-06, + "loss": 0.4011, + "step": 386 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.2378, + "step": 388 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.867329679317144e-06, + "loss": 0.2331, + "step": 390 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.3063, + "step": 392 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.920464839968391e-06, + "loss": 0.2585, + "step": 394 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.2263, + "step": 396 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.973696230766884e-06, + "loss": 0.2609, + "step": 398 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.2802, + "step": 400 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.027022188323704e-06, + "loss": 0.4224, + "step": 402 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.2519, + "step": 404 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.080441046294945e-06, + "loss": 0.2462, + "step": 406 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.4118, + "step": 408 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.133951135433656e-06, + "loss": 0.3342, + "step": 410 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.3009, + "step": 412 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.18755078364214e-06, + "loss": 0.261, + "step": 414 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.2329, + "step": 416 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.241238316024064e-06, + "loss": 0.4011, + "step": 418 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.2437, + "step": 420 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.3133, + "step": 422 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.252, + "step": 424 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.348870320044395e-06, + "loss": 0.2437, + "step": 426 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.2499, + "step": 428 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.402811428368824e-06, + "loss": 0.2229, + "step": 430 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.2336, + "step": 432 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.3132, + "step": 434 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.2228, + "step": 436 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.510935429867233e-06, + "loss": 0.3009, + "step": 438 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.2436, + "step": 440 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.5651149443531846e-06, + "loss": 0.3233, + "step": 442 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.2974, + "step": 444 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.619370544785608e-06, + "loss": 0.3511, + "step": 446 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.4251, + "step": 448 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.67370053577085e-06, + "loss": 0.3182, + "step": 450 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.25, + "step": 452 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.728103219590684e-06, + "loss": 0.2771, + "step": 454 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.2894, + "step": 456 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.7825768962553e-06, + "loss": 0.3833, + "step": 458 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.2479, + "step": 460 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.83711986355656e-06, + "loss": 0.2438, + "step": 462 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.2657, + "step": 464 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.891730417121043e-06, + "loss": 0.334, + "step": 466 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.2773, + "step": 468 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.946406850463435e-06, + "loss": 0.3234, + "step": 470 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.2331, + "step": 472 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 8.001147455039737e-06, + "loss": 0.391, + "step": 474 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.3949, + "step": 476 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.055950520300756e-06, + "loss": 0.2973, + "step": 478 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.5024, + "step": 480 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.110814333745503e-06, + "loss": 0.2772, + "step": 482 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.3048, + "step": 484 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.165737180974676e-06, + "loss": 0.3182, + "step": 486 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.233, + "step": 488 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.220717345744326e-06, + "loss": 0.3484, + "step": 490 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.2478, + "step": 492 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.275753110019367e-06, + "loss": 0.3323, + "step": 494 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.3322, + "step": 496 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.330842754027378e-06, + "loss": 0.3806, + "step": 498 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.2831, + "step": 500 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.385984556312285e-06, + "loss": 0.301, + "step": 502 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.318, + "step": 504 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.441176793788106e-06, + "loss": 0.2461, + "step": 506 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.3802, + "step": 508 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.496417741792922e-06, + "loss": 0.2477, + "step": 510 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.283, + "step": 512 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.551705674142616e-06, + "loss": 0.2717, + "step": 514 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.2388, + "step": 516 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.607038863184952e-06, + "loss": 0.2388, + "step": 518 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.2388, + "step": 520 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.662415579853495e-06, + "loss": 0.2546, + "step": 522 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.2388, + "step": 524 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.717834093721598e-06, + "loss": 0.2126, + "step": 526 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.2354, + "step": 528 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.773292673056572e-06, + "loss": 0.2575, + "step": 530 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.4342, + "step": 532 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.828789584873757e-06, + "loss": 0.2544, + "step": 534 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.334, + "step": 536 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.884323094990613e-06, + "loss": 0.303, + "step": 538 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.3693, + "step": 540 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.939891468081036e-06, + "loss": 0.2327, + "step": 542 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.2329, + "step": 544 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.995492967729449e-06, + "loss": 0.3605, + "step": 546 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.2715, + "step": 548 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.051125856485175e-06, + "loss": 0.4475, + "step": 550 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.2352, + "step": 552 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.106788395916682e-06, + "loss": 0.2436, + "step": 554 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.5077, + "step": 556 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.162478846665854e-06, + "loss": 0.261, + "step": 558 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.3314, + "step": 560 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.218195468502469e-06, + "loss": 0.2354, + "step": 562 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.3482, + "step": 564 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.273936520378426e-06, + "loss": 0.338, + "step": 566 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.5067, + "step": 568 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.329700260482286e-06, + "loss": 0.2857, + "step": 570 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.2437, + "step": 572 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.38548494629364e-06, + "loss": 0.3795, + "step": 574 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.2632, + "step": 576 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.441288834637507e-06, + "loss": 0.3806, + "step": 578 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.2435, + "step": 580 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.497110181738935e-06, + "loss": 0.3413, + "step": 582 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.2462, + "step": 584 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.552947243277342e-06, + "loss": 0.309, + "step": 586 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.2662, + "step": 588 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.608798274441153e-06, + "loss": 0.318, + "step": 590 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.3749, + "step": 592 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.664661529982263e-06, + "loss": 0.2896, + "step": 594 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.2583, + "step": 596 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.720535264270526e-06, + "loss": 0.3909, + "step": 598 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.5285, + "step": 600 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.776417731348403e-06, + "loss": 0.4036, + "step": 602 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.4084, + "step": 604 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.832307184985473e-06, + "loss": 0.4128, + "step": 606 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.3284, + "step": 608 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.888201878732946e-06, + "loss": 0.2697, + "step": 610 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.2829, + "step": 612 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.944100065978354e-06, + "loss": 0.3382, + "step": 614 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.2477, + "step": 616 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.999999999999996e-06, + "loss": 0.2926, + "step": 618 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.2356, + "step": 620 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.2348, + "step": 622 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.3884, + "step": 624 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0111798121267047e-05, + "loss": 0.2387, + "step": 626 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.2543, + "step": 628 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.016769281501452e-05, + "loss": 0.275, + "step": 630 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.3909, + "step": 632 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.022358226865159e-05, + "loss": 0.2924, + "step": 634 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.2657, + "step": 636 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.3804, + "step": 638 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.4709, + "step": 640 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.033533847001773e-05, + "loss": 0.3562, + "step": 642 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.2756, + "step": 644 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.039120172555884e-05, + "loss": 0.3181, + "step": 646 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.2773, + "step": 648 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0447052756722651e-05, + "loss": 0.3563, + "step": 650 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.2906, + "step": 652 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.4025, + "step": 654 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.3341, + "step": 656 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.0558711165362488e-05, + "loss": 0.401, + "step": 658 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.2499, + "step": 660 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.371, + "step": 662 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.4158, + "step": 664 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.0670299739517706e-05, + "loss": 0.2476, + "step": 666 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.3985, + "step": 668 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.3172, + "step": 670 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.3212, + "step": 672 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0781804531497525e-05, + "loss": 0.25, + "step": 674 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.3171, + "step": 676 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.083752115333414e-05, + "loss": 0.5279, + "step": 678 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.3324, + "step": 680 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.0893211604083311e-05, + "loss": 0.3342, + "step": 682 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.2625, + "step": 684 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.4335, + "step": 686 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.2859, + "step": 688 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.1004507032270544e-05, + "loss": 0.261, + "step": 690 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.4028, + "step": 692 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.3416, + "step": 694 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.363, + "step": 696 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.111567690500938e-05, + "loss": 0.2355, + "step": 698 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.2355, + "step": 700 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.2544, + "step": 702 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.3441, + "step": 704 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.122670732694342e-05, + "loss": 0.2585, + "step": 706 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.2586, + "step": 708 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.3135, + "step": 710 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.3261, + "step": 712 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1337584420146496e-05, + "loss": 0.238, + "step": 714 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.2477, + "step": 716 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.3806, + "step": 718 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.2502, + "step": 720 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1448294325857377e-05, + "loss": 0.5078, + "step": 722 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.2972, + "step": 724 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.150358225820707e-05, + "loss": 0.2585, + "step": 726 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.2583, + "step": 728 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1558823206211887e-05, + "loss": 0.2329, + "step": 730 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.2752, + "step": 732 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.3134, + "step": 734 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.4148, + "step": 736 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1669157245972616e-05, + "loss": 0.3178, + "step": 738 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.2699, + "step": 740 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.3088, + "step": 742 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.3473, + "step": 744 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.1779282654255668e-05, + "loss": 0.301, + "step": 746 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.2716, + "step": 748 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.391, + "step": 750 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.2585, + "step": 752 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.188918566625449e-05, + "loss": 0.4375, + "step": 754 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.2576, + "step": 756 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.3757, + "step": 758 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.2354, + "step": 760 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1998852544960256e-05, + "loss": 0.5926, + "step": 762 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.7027, + "step": 764 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.2226, + "step": 766 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.3235, + "step": 768 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.210826958287895e-05, + "loss": 0.3285, + "step": 770 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.3969, + "step": 772 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.2389, + "step": 774 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.2461, + "step": 776 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.2217423103744692e-05, + "loss": 0.2757, + "step": 778 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.3382, + "step": 780 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.3214, + "step": 782 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.2902, + "step": 784 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2326299464229143e-05, + "loss": 0.301, + "step": 786 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.3033, + "step": 788 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.2463, + "step": 790 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.3342, + "step": 792 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2434885055646808e-05, + "loss": 0.4321, + "step": 794 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.293, + "step": 796 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.2348, + "step": 798 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.2462, + "step": 800 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2543166305656089e-05, + "loss": 0.4026, + "step": 802 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.3063, + "step": 804 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.259718857163117e-05, + "loss": 0.3603, + "step": 806 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.2335, + "step": 808 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2651129679955598e-05, + "loss": 0.3636, + "step": 810 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.2756, + "step": 812 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.2503, + "step": 814 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.2824, + "step": 816 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2758761683975929e-05, + "loss": 0.3211, + "step": 818 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.4016, + "step": 820 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.3078, + "step": 822 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.278, + "step": 824 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2866048864566336e-05, + "loss": 0.2901, + "step": 826 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.3347, + "step": 828 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.384, + "step": 830 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.3637, + "step": 832 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2972977811676289e-05, + "loss": 0.3341, + "step": 834 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.4014, + "step": 836 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.2436, + "step": 838 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.2861, + "step": 840 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3079535160031601e-05, + "loss": 0.2229, + "step": 842 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.3064, + "step": 844 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.313267032068285e-05, + "loss": 0.2437, + "step": 846 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.238, + "step": 848 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3185707590804997e-05, + "loss": 0.2717, + "step": 850 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.2584, + "step": 852 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.2462, + "step": 854 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.5502, + "step": 856 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3291481833280894e-05, + "loss": 0.3825, + "step": 858 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.3031, + "step": 860 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.5317, + "step": 862 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.2625, + "step": 864 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3396844666514062e-05, + "loss": 0.2502, + "step": 866 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.5829, + "step": 868 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.344936768713513e-05, + "loss": 0.2584, + "step": 870 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.3798, + "step": 872 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.3501782920982189e-05, + "loss": 0.3658, + "step": 874 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.2664, + "step": 876 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.4161, + "step": 878 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.6126, + "step": 880 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3606283480231962e-05, + "loss": 0.3236, + "step": 882 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.5203, + "step": 884 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.5262, + "step": 886 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.2904, + "step": 888 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3710333282518497e-05, + "loss": 0.4229, + "step": 890 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.3065, + "step": 892 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.38, + "step": 894 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.2892, + "step": 896 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3813919322438018e-05, + "loss": 0.2779, + "step": 898 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.3185, + "step": 900 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.3287, + "step": 902 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.3011, + "step": 904 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.391702865255334e-05, + "loss": 0.3343, + "step": 906 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.2636, + "step": 908 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.3969, + "step": 910 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.4086, + "step": 912 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4019648385012245e-05, + "loss": 0.3185, + "step": 914 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.2702, + "step": 916 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.2701, + "step": 918 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.3186, + "step": 920 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4121765693158355e-05, + "loss": 0.2637, + "step": 922 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.3523, + "step": 924 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.3052, + "step": 926 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.2778, + "step": 928 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4223367813134406e-05, + "loss": 0.2892, + "step": 930 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.3755, + "step": 932 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.4336, + "step": 934 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.3517, + "step": 936 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.4324442045477534e-05, + "loss": 0.286, + "step": 938 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.3064, + "step": 940 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.3346, + "step": 942 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.4968, + "step": 944 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4424975756706684e-05, + "loss": 0.4041, + "step": 946 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.3458, + "step": 948 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.4747, + "step": 950 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.3971, + "step": 952 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4524956380901674e-05, + "loss": 0.4228, + "step": 954 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.2759, + "step": 956 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.3125, + "step": 958 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.3564, + "step": 960 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4624371421273812e-05, + "loss": 0.3955, + "step": 962 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.3885, + "step": 964 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.2778, + "step": 966 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.2635, + "step": 968 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4723208451727977e-05, + "loss": 0.3486, + "step": 970 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.4228, + "step": 972 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.3326, + "step": 974 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.3346, + "step": 976 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4821455118415666e-05, + "loss": 0.2586, + "step": 978 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.3184, + "step": 980 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.3346, + "step": 982 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.3184, + "step": 984 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4919099141279205e-05, + "loss": 0.276, + "step": 986 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.3053, + "step": 988 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.3326, + "step": 990 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.508, + "step": 992 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5016128315586626e-05, + "loss": 0.5928, + "step": 994 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.2901, + "step": 996 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.2438, + "step": 998 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.5027, + "step": 1000 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.5112530513457229e-05, + "loss": 0.3238, + "step": 1002 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.4142, + "step": 1004 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.3488, + "step": 1006 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.3487, + "step": 1008 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5208293685377354e-05, + "loss": 0.4379, + "step": 1010 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.3176, + "step": 1012 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.583, + "step": 1014 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.348, + "step": 1016 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.5303405861706574e-05, + "loss": 0.2635, + "step": 1018 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.2719, + "step": 1020 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.3991, + "step": 1022 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.2352, + "step": 1024 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.539785515417376e-05, + "loss": 0.3328, + "step": 1026 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.3993, + "step": 1028 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.3187, + "step": 1030 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.3179, + "step": 1032 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5491629757363026e-05, + "loss": 0.3345, + "step": 1034 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.3973, + "step": 1036 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.264, + "step": 1038 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.3328, + "step": 1040 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.5584717950189353e-05, + "loss": 0.2722, + "step": 1042 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.3216, + "step": 1044 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.563100100329731e-05, + "loss": 0.3518, + "step": 1046 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.389, + "step": 1048 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.567710809736356e-05, + "loss": 0.2721, + "step": 1050 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.3347, + "step": 1052 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.572303779162118e-05, + "loss": 0.3328, + "step": 1054 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.2719, + "step": 1056 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5768788650846677e-05, + "loss": 0.3216, + "step": 1058 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.4087, + "step": 1060 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.581435924540481e-05, + "loss": 0.3756, + "step": 1062 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.364, + "step": 1064 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5859748151293333e-05, + "loss": 0.4542, + "step": 1066 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.3972, + "step": 1068 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.3971, + "step": 1070 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.3658, + "step": 1072 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.594997522948412e-05, + "loss": 0.3186, + "step": 1074 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.2637, + "step": 1076 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.599481058234626e-05, + "loss": 0.3034, + "step": 1078 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.2892, + "step": 1080 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.6039458607746607e-05, + "loss": 0.3808, + "step": 1082 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.3032, + "step": 1084 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.3347, + "step": 1086 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.4341, + "step": 1088 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.6128187101364982e-05, + "loss": 0.3033, + "step": 1090 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.3384, + "step": 1092 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.617226479697104e-05, + "loss": 0.454, + "step": 1094 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.3033, + "step": 1096 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.621614961997806e-05, + "loss": 0.2719, + "step": 1098 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.4964, + "step": 1100 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.3347, + "step": 1102 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.2613, + "step": 1104 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6303335168965474e-05, + "loss": 0.3972, + "step": 1106 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.3418, + "step": 1108 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.3031, + "step": 1110 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.2548, + "step": 1112 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6389732850821957e-05, + "loss": 0.3054, + "step": 1114 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.2505, + "step": 1116 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.364, + "step": 1118 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.349, + "step": 1120 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.6475331866519377e-05, + "loss": 0.4737, + "step": 1122 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.451, + "step": 1124 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.329, + "step": 1126 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.2554, + "step": 1128 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6560121516856586e-05, + "loss": 0.3178, + "step": 1130 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.2661, + "step": 1132 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.2721, + "step": 1134 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.2862, + "step": 1136 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6644091203796694e-05, + "loss": 0.3956, + "step": 1138 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.364, + "step": 1140 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.6647, + "step": 1142 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.2466, + "step": 1144 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6727230431791806e-05, + "loss": 0.37, + "step": 1146 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.2591, + "step": 1148 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.3569, + "step": 1150 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.263, + "step": 1152 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6809528809094798e-05, + "loss": 0.4031, + "step": 1154 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.3973, + "step": 1156 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.3812, + "step": 1158 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.2762, + "step": 1160 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.689097604905826e-05, + "loss": 0.2776, + "step": 1162 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.2721, + "step": 1164 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.3216, + "step": 1166 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.3889, + "step": 1168 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6971561971420222e-05, + "loss": 0.4182, + "step": 1170 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.2701, + "step": 1172 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.701152878657196e-05, + "loss": 0.3186, + "step": 1174 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.2902, + "step": 1176 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.705127650357662e-05, + "loss": 0.3518, + "step": 1178 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.3215, + "step": 1180 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.3035, + "step": 1182 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.3328, + "step": 1184 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.713010968184029e-05, + "loss": 0.3034, + "step": 1186 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.2903, + "step": 1188 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.716919267969883e-05, + "loss": 0.403, + "step": 1190 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.293, + "step": 1192 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7208051652686338e-05, + "loss": 0.2835, + "step": 1194 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.3524, + "step": 1196 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.3185, + "step": 1198 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.29, + "step": 1200 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.7285092673983753e-05, + "loss": 0.364, + "step": 1202 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.3032, + "step": 1204 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.2628, + "step": 1206 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.3176, + "step": 1208 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.736122311621314e-05, + "loss": 0.282, + "step": 1210 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.4321, + "step": 1212 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.3384, + "step": 1214 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.412, + "step": 1216 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.743643346367026e-05, + "loss": 0.4882, + "step": 1218 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.248, + "step": 1220 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.4256, + "step": 1222 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.3478, + "step": 1224 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7510714315655474e-05, + "loss": 0.2892, + "step": 1226 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.3238, + "step": 1228 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.4968, + "step": 1230 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.4338, + "step": 1232 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.758405638764873e-05, + "loss": 0.3915, + "step": 1234 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.3487, + "step": 1236 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.3476, + "step": 1238 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.3489, + "step": 1240 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7656450512470077e-05, + "loss": 0.2931, + "step": 1242 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.2614, + "step": 1244 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.7692289262315e-05, + "loss": 0.3387, + "step": 1246 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.3757, + "step": 1248 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.7727887641425448e-05, + "loss": 0.2863, + "step": 1250 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.3128, + "step": 1252 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.2891, + "step": 1254 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.2834, + "step": 1256 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7798358845437754e-05, + "loss": 0.4748, + "step": 1258 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.3289, + "step": 1260 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.2506, + "step": 1262 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.4031, + "step": 1264 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.786785531616285e-05, + "loss": 0.3567, + "step": 1266 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.3033, + "step": 1268 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.790223530721933e-05, + "loss": 0.381, + "step": 1270 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.2758, + "step": 1272 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7936368367090577e-05, + "loss": 0.3518, + "step": 1274 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.536, + "step": 1276 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.2442, + "step": 1278 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.3326, + "step": 1280 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.800388943463047e-05, + "loss": 0.2639, + "step": 1282 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.3264, + "step": 1284 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.803727533238257e-05, + "loss": 0.3186, + "step": 1286 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.3641, + "step": 1288 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8070410079182195e-05, + "loss": 0.2779, + "step": 1290 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.2615, + "step": 1292 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.2904, + "step": 1294 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.276, + "step": 1296 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.813592198619035e-05, + "loss": 0.3346, + "step": 1298 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.3421, + "step": 1300 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.816829709926509e-05, + "loss": 0.2549, + "step": 1302 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.2892, + "step": 1304 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.8200416967183785e-05, + "loss": 0.2626, + "step": 1306 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.2932, + "step": 1308 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.2758, + "step": 1310 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.3639, + "step": 1312 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.8263886960799055e-05, + "loss": 0.4411, + "step": 1314 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.2956, + "step": 1316 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.829523510316813e-05, + "loss": 0.438, + "step": 1318 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.4748, + "step": 1320 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.832632403378808e-05, + "loss": 0.4153, + "step": 1322 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.3034, + "step": 1324 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.4228, + "step": 1326 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.4455, + "step": 1328 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8387720382009665e-05, + "loss": 0.2909, + "step": 1330 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.3639, + "step": 1332 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.3188, + "step": 1334 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.3216, + "step": 1336 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.844806833140501e-05, + "loss": 0.3023, + "step": 1338 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.2705, + "step": 1340 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.2552, + "step": 1342 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.3347, + "step": 1344 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.85073603389569e-05, + "loss": 0.3326, + "step": 1346 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.2956, + "step": 1348 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.37, + "step": 1350 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.3375, + "step": 1352 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.856558899363248e-05, + "loss": 0.3185, + "step": 1354 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.2701, + "step": 1356 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.85943022840117e-05, + "loss": 0.3954, + "step": 1358 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.4511, + "step": 1360 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.862274701730967e-05, + "loss": 0.3215, + "step": 1362 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.3016, + "step": 1364 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.865092230467769e-05, + "loss": 0.3345, + "step": 1366 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.4379, + "step": 1368 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.2615, + "step": 1370 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.5318, + "step": 1372 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.87064610283551e-05, + "loss": 0.5962, + "step": 1374 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.3659, + "step": 1376 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.873382272917545e-05, + "loss": 0.3185, + "step": 1378 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.4515, + "step": 1380 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.876091151314196e-05, + "loss": 0.3016, + "step": 1382 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.3218, + "step": 1384 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.8787726533776996e-05, + "loss": 0.2554, + "step": 1386 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.349, + "step": 1388 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.3038, + "step": 1390 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.3519, + "step": 1392 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.8840531941941415e-05, + "loss": 0.3409, + "step": 1394 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.3762, + "step": 1396 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.3718, + "step": 1398 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.3326, + "step": 1400 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.889223235340958e-05, + "loss": 0.5488, + "step": 1402 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.2629, + "step": 1404 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.3633, + "step": 1406 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.3098, + "step": 1408 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.894282130603823e-05, + "loss": 0.313, + "step": 1410 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.3891, + "step": 1412 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.896769700383315e-05, + "loss": 0.3177, + "step": 1414 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.475, + "step": 1416 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.899229247660769e-05, + "loss": 0.3249, + "step": 1418 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.2708, + "step": 1420 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.901660695579585e-05, + "loss": 0.3329, + "step": 1422 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.3812, + "step": 1424 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9040639681612212e-05, + "loss": 0.2524, + "step": 1426 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.2481, + "step": 1428 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.3032, + "step": 1430 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.3386, + "step": 1432 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9087856878032886e-05, + "loss": 0.3036, + "step": 1434 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.3016, + "step": 1436 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.911103987318148e-05, + "loss": 0.4087, + "step": 1438 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.3569, + "step": 1440 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.913393816409294e-05, + "loss": 0.4836, + "step": 1442 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.2485, + "step": 1444 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.5207, + "step": 1446 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.2781, + "step": 1448 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9178877779995423e-05, + "loss": 0.2593, + "step": 1450 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.4711, + "step": 1452 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.2628, + "step": 1454 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.4229, + "step": 1456 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.9222670108643146e-05, + "loss": 0.2981, + "step": 1458 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.3328, + "step": 1460 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.924413432409622e-05, + "loss": 0.4322, + "step": 1462 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.3832, + "step": 1464 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.926530967634078e-05, + "loss": 0.3489, + "step": 1466 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.3035, + "step": 1468 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.3095, + "step": 1470 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.366, + "step": 1472 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9306791153479004e-05, + "loss": 0.5746, + "step": 1474 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.5915, + "step": 1476 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.932709598214825e-05, + "loss": 0.3833, + "step": 1478 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.3351, + "step": 1480 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.9347109355200672e-05, + "loss": 0.2593, + "step": 1482 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.3037, + "step": 1484 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.3973, + "step": 1486 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.3056, + "step": 1488 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9386259242048883e-05, + "loss": 0.3178, + "step": 1490 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.333, + "step": 1492 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.940539453247842e-05, + "loss": 0.2839, + "step": 1494 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.4591, + "step": 1496 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9424235920596863e-05, + "loss": 0.2598, + "step": 1498 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.302, + "step": 1500 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.944278281764342e-05, + "loss": 0.3571, + "step": 1502 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.3522, + "step": 1504 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.3491, + "step": 1506 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.2827, + "step": 1508 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.3217, + "step": 1510 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.3178, + "step": 1512 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9496650812887286e-05, + "loss": 0.3662, + "step": 1514 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.3349, + "step": 1516 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.951401404235505e-05, + "loss": 0.2509, + "step": 1518 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.2582, + "step": 1520 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.9531079975339912e-05, + "loss": 0.2782, + "step": 1522 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.4381, + "step": 1524 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.2585, + "step": 1526 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.2961, + "step": 1528 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.956431782804402e-05, + "loss": 0.3813, + "step": 1530 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.3464, + "step": 1532 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.389, + "step": 1534 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.2761, + "step": 1536 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9596360216530436e-05, + "loss": 0.2723, + "step": 1538 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.2762, + "step": 1540 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.961193185426459e-05, + "loss": 0.5205, + "step": 1542 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.3054, + "step": 1544 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.9627203135753573e-05, + "loss": 0.4504, + "step": 1546 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.3567, + "step": 1548 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.333, + "step": 1550 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.3891, + "step": 1552 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.3889, + "step": 1554 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.324, + "step": 1556 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.967121011775546e-05, + "loss": 0.4295, + "step": 1558 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.2761, + "step": 1560 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9685275296330497e-05, + "loss": 0.4152, + "step": 1562 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.3015, + "step": 1564 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969903782680467e-05, + "loss": 0.4326, + "step": 1566 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.3718, + "step": 1568 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.4032, + "step": 1570 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.2781, + "step": 1572 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.4516, + "step": 1574 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.3815, + "step": 1576 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9738505276435692e-05, + "loss": 0.3568, + "step": 1578 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.3721, + "step": 1580 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.2723, + "step": 1582 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.264, + "step": 1584 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9763296037475174e-05, + "loss": 0.3491, + "step": 1586 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.2724, + "step": 1588 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.3188, + "step": 1590 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.3351, + "step": 1592 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9786866463591732e-05, + "loss": 0.2396, + "step": 1594 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.3974, + "step": 1596 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.2362, + "step": 1598 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.7299, + "step": 1600 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.9809213608668185e-05, + "loss": 0.3057, + "step": 1602 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.3918, + "step": 1604 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.376, + "step": 1606 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.318, + "step": 1608 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.983033467948784e-05, + "loss": 0.4145, + "step": 1610 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.3388, + "step": 1612 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.3348, + "step": 1614 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.4328, + "step": 1616 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.3993, + "step": 1618 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.3349, + "step": 1620 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.985971166354357e-05, + "loss": 0.3388, + "step": 1622 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.2554, + "step": 1624 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986888819206792e-05, + "loss": 0.4236, + "step": 1626 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.2766, + "step": 1628 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.3491, + "step": 1630 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.4151, + "step": 1632 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.988631581494365e-05, + "loss": 0.3037, + "step": 1634 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.4243, + "step": 1636 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.3189, + "step": 1638 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.2724, + "step": 1640 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.990250772639552e-05, + "loss": 0.3757, + "step": 1642 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.4091, + "step": 1644 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.99101396518405e-05, + "loss": 0.3521, + "step": 1646 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.4032, + "step": 1648 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.3569, + "step": 1650 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.6106, + "step": 1652 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.2982, + "step": 1654 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.3056, + "step": 1656 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.3841, + "step": 1658 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.4383, + "step": 1660 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.2906, + "step": 1662 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.2868, + "step": 1664 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9943649727366335e-05, + "loss": 0.4186, + "step": 1666 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.2908, + "step": 1668 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.994942036613787e-05, + "loss": 0.357, + "step": 1670 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.3524, + "step": 1672 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.995488010273198e-05, + "loss": 0.3492, + "step": 1674 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.4568, + "step": 1676 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.2362, + "step": 1678 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.361, + "step": 1680 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9964866196679105e-05, + "loss": 0.3519, + "step": 1682 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.3803, + "step": 1684 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.3251, + "step": 1686 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.319, + "step": 1688 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.4514, + "step": 1690 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.3103, + "step": 1692 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.2839, + "step": 1694 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.3293, + "step": 1696 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.3483, + "step": 1698 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.3764, + "step": 1700 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.998437989229673e-05, + "loss": 0.4716, + "step": 1702 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.3085, + "step": 1704 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.998734708672375e-05, + "loss": 0.3646, + "step": 1706 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.2764, + "step": 1708 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.4442, + "step": 1710 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.3025, + "step": 1712 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.9992345130644747e-05, + "loss": 0.3191, + "step": 1714 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.4127, + "step": 1716 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.3294, + "step": 1718 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.4606, + "step": 1720 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999609421031453e-05, + "loss": 0.2955, + "step": 1722 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.4345, + "step": 1724 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.4518, + "step": 1726 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.466, + "step": 1728 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.3487, + "step": 1730 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.3526, + "step": 1732 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.3355, + "step": 1734 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.3392, + "step": 1736 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.3394, + "step": 1738 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.4811, + "step": 1740 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 2e-05, + "loss": 0.2599, + "step": 1742 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.3574, + "step": 1744 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.3183, + "step": 1746 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.5702, + "step": 1748 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.365, + "step": 1750 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.335, + "step": 1752 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.3535, + "step": 1754 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.306, + "step": 1756 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.251, + "step": 1758 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.3725, + "step": 1760 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.999609421031453e-05, + "loss": 0.3846, + "step": 1762 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.234, + "step": 1764 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.4096, + "step": 1766 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.2667, + "step": 1768 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999234513064475e-05, + "loss": 0.4168, + "step": 1770 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.3356, + "step": 1772 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.2765, + "step": 1774 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.3191, + "step": 1776 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998734708672375e-05, + "loss": 0.3191, + "step": 1778 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.306, + "step": 1780 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.3996, + "step": 1782 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.3894, + "step": 1784 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.3354, + "step": 1786 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.3182, + "step": 1788 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.376, + "step": 1790 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.3646, + "step": 1792 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.3219, + "step": 1794 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.3191, + "step": 1796 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.409, + "step": 1798 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.304, + "step": 1800 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.996486619667911e-05, + "loss": 0.2722, + "step": 1802 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.2723, + "step": 1804 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.3189, + "step": 1806 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.4887, + "step": 1808 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.995488010273198e-05, + "loss": 0.4591, + "step": 1810 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.3219, + "step": 1812 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.4327, + "step": 1814 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.4145, + "step": 1816 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.994364972736634e-05, + "loss": 0.3957, + "step": 1818 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.4925, + "step": 1820 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.993756836673986e-05, + "loss": 0.3189, + "step": 1822 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.4737, + "step": 1824 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.3705, + "step": 1826 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.3643, + "step": 1828 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.3522, + "step": 1830 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.3189, + "step": 1832 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.3352, + "step": 1834 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.2903, + "step": 1836 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.2632, + "step": 1838 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.4184, + "step": 1840 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9902507726395524e-05, + "loss": 0.3834, + "step": 1842 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.2593, + "step": 1844 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.3036, + "step": 1846 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.333, + "step": 1848 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.988631581494365e-05, + "loss": 0.2725, + "step": 1850 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.3643, + "step": 1852 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.987775633490599e-05, + "loss": 0.3218, + "step": 1854 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.2668, + "step": 1856 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.986888819206792e-05, + "loss": 0.3758, + "step": 1858 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.3644, + "step": 1860 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.3072, + "step": 1862 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.2723, + "step": 1864 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.3522, + "step": 1866 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.3956, + "step": 1868 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.4153, + "step": 1870 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.3036, + "step": 1872 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.983033467948784e-05, + "loss": 0.543, + "step": 1874 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.3661, + "step": 1876 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.3293, + "step": 1878 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.2707, + "step": 1880 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.980921360866819e-05, + "loss": 0.3036, + "step": 1882 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.4381, + "step": 1884 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.2723, + "step": 1886 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.3491, + "step": 1888 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.978686646359173e-05, + "loss": 0.3891, + "step": 1890 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.3349, + "step": 1892 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.3349, + "step": 1894 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.2905, + "step": 1896 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.9763296037475177e-05, + "loss": 0.4696, + "step": 1898 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.3332, + "step": 1900 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.2906, + "step": 1902 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.3242, + "step": 1904 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9738505276435695e-05, + "loss": 0.4033, + "step": 1906 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.2708, + "step": 1908 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.2935, + "step": 1910 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.376, + "step": 1912 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.4971, + "step": 1914 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.3761, + "step": 1916 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.969903782680467e-05, + "loss": 0.4232, + "step": 1918 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.4158, + "step": 1920 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.96852752963305e-05, + "loss": 0.3703, + "step": 1922 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.2896, + "step": 1924 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.3702, + "step": 1926 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.2765, + "step": 1928 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.3037, + "step": 1930 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.2767, + "step": 1932 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.2725, + "step": 1934 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.2511, + "step": 1936 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9627203135753576e-05, + "loss": 0.3663, + "step": 1938 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.4146, + "step": 1940 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.961193185426459e-05, + "loss": 0.348, + "step": 1942 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.2981, + "step": 1944 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.959636021653044e-05, + "loss": 0.3993, + "step": 1946 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.4592, + "step": 1948 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958048870913786e-05, + "loss": 0.2763, + "step": 1950 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.2867, + "step": 1952 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9564317828044022e-05, + "loss": 0.3071, + "step": 1954 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.4512, + "step": 1956 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.2896, + "step": 1958 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.5083, + "step": 1960 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9531079975339915e-05, + "loss": 0.318, + "step": 1962 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.2616, + "step": 1964 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.3181, + "step": 1966 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.4518, + "step": 1968 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.3192, + "step": 1970 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.339, + "step": 1972 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.947899082950751e-05, + "loss": 0.4566, + "step": 1974 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.2592, + "step": 1976 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.2248, + "step": 1978 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.3057, + "step": 1980 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.2957, + "step": 1982 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.4151, + "step": 1984 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.942423592059687e-05, + "loss": 0.3973, + "step": 1986 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.4031, + "step": 1988 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.3348, + "step": 1990 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.3643, + "step": 1992 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.938625924204888e-05, + "loss": 0.2706, + "step": 1994 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.4382, + "step": 1996 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.2668, + "step": 1998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.2488, + "step": 2000 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9347109355200676e-05, + "loss": 0.409, + "step": 2002 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.2446, + "step": 2004 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.5494, + "step": 2006 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.3843, + "step": 2008 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9306791153479017e-05, + "loss": 0.4545, + "step": 2010 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.3332, + "step": 2012 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.928619550368371e-05, + "loss": 0.3353, + "step": 2014 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.2591, + "step": 2016 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9265309676340783e-05, + "loss": 0.3422, + "step": 2018 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.366, + "step": 2020 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.335, + "step": 2022 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.3266, + "step": 2024 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9222670108643156e-05, + "loss": 0.3058, + "step": 2026 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.4282, + "step": 2028 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.4034, + "step": 2030 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.3252, + "step": 2032 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9178877779995416e-05, + "loss": 0.3639, + "step": 2034 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.339, + "step": 2036 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.2631, + "step": 2038 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.2726, + "step": 2040 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9133938164092942e-05, + "loss": 0.2762, + "step": 2042 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.3331, + "step": 2044 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.291, + "step": 2046 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.2763, + "step": 2048 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.90878568780329e-05, + "loss": 0.3533, + "step": 2050 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.4127, + "step": 2052 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.3387, + "step": 2054 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.4971, + "step": 2056 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9040639681612216e-05, + "loss": 0.2594, + "step": 2058 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.2632, + "step": 2060 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.4505, + "step": 2062 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.3349, + "step": 2064 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8992292476607695e-05, + "loss": 0.3016, + "step": 2066 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.2897, + "step": 2068 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.3331, + "step": 2070 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.2476, + "step": 2072 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8942821306038227e-05, + "loss": 0.2707, + "step": 2074 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.0879, + "step": 2076 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.891766616054545e-05, + "loss": 0.4261, + "step": 2078 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.3832, + "step": 2080 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8892232353409582e-05, + "loss": 0.5707, + "step": 2082 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.3643, + "step": 2084 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.3977, + "step": 2086 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.4348, + "step": 2088 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.884053194194143e-05, + "loss": 0.2906, + "step": 2090 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.2764, + "step": 2092 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.881426695315756e-05, + "loss": 0.2867, + "step": 2094 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.3387, + "step": 2096 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8787726533777003e-05, + "loss": 0.2723, + "step": 2098 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.3018, + "step": 2100 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.3037, + "step": 2102 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.4381, + "step": 2104 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8733822729175455e-05, + "loss": 0.2443, + "step": 2106 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.3491, + "step": 2108 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.2838, + "step": 2110 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.481, + "step": 2112 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.3643, + "step": 2114 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.396, + "step": 2116 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.86509223046777e-05, + "loss": 0.3348, + "step": 2118 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.5572, + "step": 2120 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8622747017309676e-05, + "loss": 0.4381, + "step": 2122 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.2445, + "step": 2124 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.307, + "step": 2126 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.3523, + "step": 2128 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8565588993632498e-05, + "loss": 0.3039, + "step": 2130 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.2764, + "step": 2132 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.3059, + "step": 2134 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.2949, + "step": 2136 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8507360338956896e-05, + "loss": 0.3841, + "step": 2138 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.3082, + "step": 2140 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.847784679420527e-05, + "loss": 0.2896, + "step": 2142 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.4167, + "step": 2144 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.8448068331405018e-05, + "loss": 0.4517, + "step": 2146 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.3191, + "step": 2148 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.841802588108161e-05, + "loss": 0.2867, + "step": 2150 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.3018, + "step": 2152 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.838772038200968e-05, + "loss": 0.4657, + "step": 2154 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.3804, + "step": 2156 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.335, + "step": 2158 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.335, + "step": 2160 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8326324033788087e-05, + "loss": 0.3056, + "step": 2162 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.4184, + "step": 2164 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.3352, + "step": 2166 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.2643, + "step": 2168 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.8263886960799072e-05, + "loss": 0.319, + "step": 2170 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.3448, + "step": 2172 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.4971, + "step": 2174 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.3812, + "step": 2176 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820041696718378e-05, + "loss": 0.4165, + "step": 2178 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.4754, + "step": 2180 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.2248, + "step": 2182 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.3421, + "step": 2184 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8135921986190358e-05, + "loss": 0.339, + "step": 2186 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.3181, + "step": 2188 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.3096, + "step": 2190 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.2472, + "step": 2192 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807041007918221e-05, + "loss": 0.497, + "step": 2194 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.3222, + "step": 2196 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.4511, + "step": 2198 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.4518, + "step": 2200 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.8003889434630476e-05, + "loss": 0.2472, + "step": 2202 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.3218, + "step": 2204 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.3571, + "step": 2206 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.4545, + "step": 2208 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7936368367090583e-05, + "loss": 0.2276, + "step": 2210 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.2632, + "step": 2212 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.2592, + "step": 2214 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.2552, + "step": 2216 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7867855316162846e-05, + "loss": 0.3061, + "step": 2218 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.3352, + "step": 2220 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.783322946823638e-05, + "loss": 0.5295, + "step": 2222 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.5512, + "step": 2224 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.779835884543776e-05, + "loss": 0.251, + "step": 2226 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.2619, + "step": 2228 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.776324453741365e-05, + "loss": 0.3061, + "step": 2230 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.2509, + "step": 2232 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7727887641425465e-05, + "loss": 0.3094, + "step": 2234 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.3037, + "step": 2236 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.7035, + "step": 2238 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.2512, + "step": 2240 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.765645051247007e-05, + "loss": 0.2468, + "step": 2242 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.3524, + "step": 2244 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.762037251178961e-05, + "loss": 0.5079, + "step": 2246 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.2768, + "step": 2248 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7584056387648738e-05, + "loss": 0.5345, + "step": 2250 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.2911, + "step": 2252 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.278, + "step": 2254 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.4899, + "step": 2256 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7510714315655467e-05, + "loss": 0.6137, + "step": 2258 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.2593, + "step": 2260 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7473690659616e-05, + "loss": 0.3514, + "step": 2262 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.3356, + "step": 2264 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.743643346367027e-05, + "loss": 0.2842, + "step": 2266 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.3819, + "step": 2268 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.739894389204122e-05, + "loss": 0.2595, + "step": 2270 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.3394, + "step": 2272 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7361223116213146e-05, + "loss": 0.2646, + "step": 2274 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.2765, + "step": 2276 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.732327231489503e-05, + "loss": 0.602, + "step": 2278 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.3394, + "step": 2280 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.728509267398376e-05, + "loss": 0.2278, + "step": 2282 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.4161, + "step": 2284 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.247, + "step": 2286 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.3571, + "step": 2288 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7208051652686348e-05, + "loss": 0.2584, + "step": 2290 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.2911, + "step": 2292 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.716919267969884e-05, + "loss": 0.2642, + "step": 2294 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.302, + "step": 2296 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.7130109681840298e-05, + "loss": 0.3332, + "step": 2298 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.2984, + "step": 2300 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.571, + "step": 2302 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.3348, + "step": 2304 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.705127650357663e-05, + "loss": 0.3219, + "step": 2306 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.393, + "step": 2308 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.701152878657197e-05, + "loss": 0.2365, + "step": 2310 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.3764, + "step": 2312 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.697156197142023e-05, + "loss": 0.3193, + "step": 2314 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.3763, + "step": 2316 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.4259, + "step": 2318 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.3334, + "step": 2320 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.6890976049058267e-05, + "loss": 0.3182, + "step": 2322 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.3243, + "step": 2324 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.2896, + "step": 2326 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.2766, + "step": 2328 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.6809528809094805e-05, + "loss": 0.3179, + "step": 2330 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.4158, + "step": 2332 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.67684853721737e-05, + "loss": 0.3266, + "step": 2334 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.3493, + "step": 2336 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.6727230431791826e-05, + "loss": 0.306, + "step": 2338 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.2386, + "step": 2340 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.4329, + "step": 2342 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.2764, + "step": 2344 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.66440912037967e-05, + "loss": 0.3817, + "step": 2346 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.251, + "step": 2348 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.3993, + "step": 2350 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.3811, + "step": 2352 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6560121516856592e-05, + "loss": 0.2761, + "step": 2354 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.3643, + "step": 2356 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.651782852712194e-05, + "loss": 0.3349, + "step": 2358 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.3058, + "step": 2360 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.6475331866519387e-05, + "loss": 0.3215, + "step": 2362 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.1995, + "step": 2364 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.5351, + "step": 2366 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.3183, + "step": 2368 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6389732850821964e-05, + "loss": 0.335, + "step": 2370 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.3188, + "step": 2372 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.3918, + "step": 2374 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.2716, + "step": 2376 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6303335168965495e-05, + "loss": 0.335, + "step": 2378 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.4043, + "step": 2380 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.625984019906122e-05, + "loss": 0.2359, + "step": 2382 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.2905, + "step": 2384 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6216149619978057e-05, + "loss": 0.3152, + "step": 2386 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.7502, + "step": 2388 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.2725, + "step": 2390 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.5086, + "step": 2392 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.612818710136499e-05, + "loss": 0.3527, + "step": 2394 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.3842, + "step": 2396 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.3036, + "step": 2398 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.2935, + "step": 2400 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.60394586077466e-05, + "loss": 0.2629, + "step": 2402 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.5324, + "step": 2404 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.3351, + "step": 2406 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.3332, + "step": 2408 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.594997522948413e-05, + "loss": 0.2777, + "step": 2410 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.2641, + "step": 2412 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.4737, + "step": 2414 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.2516, + "step": 2416 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5859748151293354e-05, + "loss": 0.3222, + "step": 2418 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.3818, + "step": 2420 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.581435924540482e-05, + "loss": 0.5602, + "step": 2422 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.2985, + "step": 2424 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5768788650846674e-05, + "loss": 0.2645, + "step": 2426 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.2471, + "step": 2428 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.327, + "step": 2430 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.2906, + "step": 2432 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.5677108097363565e-05, + "loss": 0.2786, + "step": 2434 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.251, + "step": 2436 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.329, + "step": 2438 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.4895, + "step": 2440 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5584717950189373e-05, + "loss": 0.4519, + "step": 2442 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.238, + "step": 2444 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.5112, + "step": 2446 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.3349, + "step": 2448 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.2723, + "step": 2450 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.3349, + "step": 2452 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.544482752648966e-05, + "loss": 0.2896, + "step": 2454 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.2616, + "step": 2456 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.539785515417377e-05, + "loss": 0.3391, + "step": 2458 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.3241, + "step": 2460 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.7162, + "step": 2462 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.2615, + "step": 2464 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.5303405861706567e-05, + "loss": 0.2528, + "step": 2466 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.2335, + "step": 2468 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.2909, + "step": 2470 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.3188, + "step": 2472 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.5208293685377362e-05, + "loss": 0.5323, + "step": 2474 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.3289, + "step": 2476 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.3289, + "step": 2478 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.3483, + "step": 2480 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.5112530513457251e-05, + "loss": 0.3389, + "step": 2482 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.304, + "step": 2484 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.4043, + "step": 2486 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.2484, + "step": 2488 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5016128315586636e-05, + "loss": 0.2721, + "step": 2490 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.2723, + "step": 2492 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.446, + "step": 2494 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.3806, + "step": 2496 + }, + { + "epoch": 1.0, + "learning_rate": 1.4919099141279214e-05, + "loss": 0.3191, + "step": 2498 + }, + { + "epoch": 1.0, + "step": 2498, + "total_flos": 1.5157693679403008e+16, + "train_loss": 0.33343404956800254, + "train_runtime": 6963.9531, + "train_samples_per_second": 2.87, + "train_steps_per_second": 0.359 + } + ], + "logging_steps": 2, + "max_steps": 2498, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1.5157693679403008e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9252de45ffe1eec4de4ab89ac853545167362654 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f4fd412ee33d1bd79fbb88773909c9e0c7ff98318bf516f923a7cbcb6459bf +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..92b8d3c6a355c8f3cdb742b4768a7df1905d49d5 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0a9f09a5caa1710a28a3cfcfb1b93cac4a7d774d59b385791506e28e09300c5 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..32718e8a4d70cdee6c5139952b8b3084a23f27c5 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76a57ecfcd0384eece7460055782ead19d01efe2166d8879aa383016d1f963e5 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b27b83cf664d7f1b4965e9a7a6434d3f0468985d --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_125_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2bda0ebd181847d4833267fd6c95ae55f1d1d4c0e92bd12526d8ba2019fad26 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f0bee82b498540bef3897a3b0cee4dd10ad88dc5 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/0_trainer_state.json @@ -0,0 +1,15020 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4996, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.406842319175051e-06, + "loss": 0.1837, + "step": 2 + }, + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.415943612351265e-06, + "loss": 0.137, + "step": 4 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4250597173539104e-06, + "loss": 0.1388, + "step": 6 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.3247, + "step": 8 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.443336291593801e-06, + "loss": 0.1085, + "step": 10 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.1875, + "step": 12 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.461671899116598e-06, + "loss": 0.164, + "step": 14 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.079, + "step": 16 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4800663966830417e-06, + "loss": 0.2091, + "step": 18 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4892856843445236e-06, + "loss": 0.1944, + "step": 20 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.4985196405937807e-06, + "loss": 0.1327, + "step": 22 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.2811, + "step": 24 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5170314866905443e-06, + "loss": 0.1624, + "step": 26 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.1611, + "step": 28 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.535601790357246e-06, + "loss": 0.214, + "step": 30 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.0992, + "step": 32 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5542304065211578e-06, + "loss": 0.0958, + "step": 34 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5635665363297356e-06, + "loss": 0.2021, + "step": 36 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5729171896539763e-06, + "loss": 0.3269, + "step": 38 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.0858, + "step": 40 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.5916619937729915e-06, + "loss": 0.1152, + "step": 42 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.078, + "step": 44 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6104646724422643e-06, + "loss": 0.2577, + "step": 46 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.083, + "step": 48 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.629325078773699e-06, + "loss": 0.6694, + "step": 50 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.6387768837868565e-06, + "loss": 0.1245, + "step": 52 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.648243065428239e-06, + "loss": 0.2879, + "step": 54 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.1754, + "step": 56 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6672184846169934e-06, + "loss": 0.2798, + "step": 58 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.091, + "step": 60 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.686251188102439e-06, + "loss": 0.1506, + "step": 62 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.2014, + "step": 64 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7053410271995085e-06, + "loss": 0.1241, + "step": 66 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.219, + "step": 68 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.724487852776785e-06, + "loss": 0.1256, + "step": 70 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.3652, + "step": 72 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7436915152577038e-06, + "loss": 0.1133, + "step": 74 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.1085, + "step": 76 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.7629518646216522e-06, + "loss": 0.5842, + "step": 78 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.6253, + "step": 80 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.782268750405185e-06, + "loss": 0.1312, + "step": 82 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.7919483473136555e-06, + "loss": 0.1036, + "step": 84 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.801642021703177e-06, + "loss": 0.1071, + "step": 86 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.209, + "step": 88 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.821071527170053e-06, + "loss": 0.1211, + "step": 90 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.1012, + "step": 92 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.8405571150208945e-06, + "loss": 0.3513, + "step": 94 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.2698, + "step": 96 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.860098633032663e-06, + "loss": 0.2184, + "step": 98 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.8698903181597026e-06, + "loss": 0.1107, + "step": 100 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.879695928545424e-06, + "loss": 0.1169, + "step": 102 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.1433, + "step": 104 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.899348848463471e-06, + "loss": 0.0968, + "step": 106 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.909196119613218e-06, + "loss": 0.1201, + "step": 108 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.9190572392565643e-06, + "loss": 0.2138, + "step": 110 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.1597, + "step": 112 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9388209469611093e-06, + "loss": 0.0958, + "step": 114 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9487234964233724e-06, + "loss": 0.1017, + "step": 116 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9586398171814114e-06, + "loss": 0.1107, + "step": 118 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.04, + "step": 120 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.9785136950907987e-06, + "loss": 0.3738, + "step": 122 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.988471213428035e-06, + "loss": 0.0265, + "step": 124 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 2.9984424254328936e-06, + "loss": 0.1569, + "step": 126 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.5498, + "step": 128 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0184258525227895e-06, + "loss": 0.6811, + "step": 130 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0284380285797733e-06, + "loss": 0.0504, + "step": 132 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.038463820248324e-06, + "loss": 0.2216, + "step": 134 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.0719, + "step": 136 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.0585561720712207e-06, + "loss": 0.1581, + "step": 138 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.068622692984767e-06, + "loss": 0.0228, + "step": 140 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0787027510283495e-06, + "loss": 0.1019, + "step": 142 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.209, + "step": 144 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.098903399732992e-06, + "loss": 0.4918, + "step": 146 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.0516, + "step": 148 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.1191579603759946e-06, + "loss": 0.1138, + "step": 150 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.0958, + "step": 152 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.139466274727052e-06, + "loss": 0.1987, + "step": 154 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.1433, + "step": 156 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.159828184135917e-06, + "loss": 0.3302, + "step": 158 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.04, + "step": 160 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1802435295336908e-06, + "loss": 0.1231, + "step": 162 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1904711909051967e-06, + "loss": 0.2238, + "step": 164 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2007121514339924e-06, + "loss": 0.1847, + "step": 166 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.4912, + "step": 168 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.221233889934239e-06, + "loss": 0.4588, + "step": 170 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.231514627826302e-06, + "loss": 0.2498, + "step": 172 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2418085847169344e-06, + "loss": 0.0887, + "step": 174 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.0328, + "step": 176 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2624360750508457e-06, + "loss": 0.2531, + "step": 178 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2727695682081897e-06, + "loss": 0.4032, + "step": 180 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.28311619979231e-06, + "loss": 0.2236, + "step": 182 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.6358, + "step": 184 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.303848797386465e-06, + "loss": 0.1072, + "step": 186 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.314234722905302e-06, + "loss": 0.1338, + "step": 188 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.3246337058685697e-06, + "loss": 0.3992, + "step": 190 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.0423, + "step": 192 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.3454707628651806e-06, + "loss": 0.3688, + "step": 194 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.355908796203301e-06, + "loss": 0.0216, + "step": 196 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3663598055954716e-06, + "loss": 0.1035, + "step": 198 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.4789, + "step": 200 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3873006708725365e-06, + "loss": 0.1658, + "step": 202 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.4219, + "step": 204 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.408293195104586e-06, + "loss": 0.1471, + "step": 206 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.3087, + "step": 208 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4293372142962845e-06, + "loss": 0.1581, + "step": 210 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4398784831434097e-06, + "loss": 0.1016, + "step": 212 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.4504325640499936e-06, + "loss": 0.2027, + "step": 214 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.105, + "step": 216 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4715790795671232e-06, + "loss": 0.144, + "step": 218 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.4032, + "step": 220 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.4927765956493276e-06, + "loss": 0.0924, + "step": 222 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.1438, + "step": 224 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.514024946699842e-06, + "loss": 0.2157, + "step": 226 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.2133, + "step": 228 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.535323966724814e-06, + "loss": 0.0215, + "step": 230 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.028, + "step": 232 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.556673489334522e-06, + "loss": 0.1222, + "step": 234 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.567367137003953e-06, + "loss": 0.2998, + "step": 236 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.5780733477447127e-06, + "loss": 0.6442, + "step": 238 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.3422, + "step": 240 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.5995233747779467e-06, + "loss": 0.078, + "step": 242 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.6102671491780393e-06, + "loss": 0.2073, + "step": 244 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6210234028648216e-06, + "loss": 0.0957, + "step": 246 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.0095, + "step": 248 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.6425732640453235e-06, + "loss": 0.1492, + "step": 250 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.653366829451711e-06, + "loss": 0.0762, + "step": 252 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.6641727899701795e-06, + "loss": 0.209, + "step": 254 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.1668, + "step": 256 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.6858218119020884e-06, + "loss": 0.078, + "step": 258 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.696664831034521e-06, + "loss": 0.0654, + "step": 260 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7075201607170997e-06, + "loss": 0.0909, + "step": 262 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.0376, + "step": 264 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.729267666905899e-06, + "loss": 0.3431, + "step": 266 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.740159800938784e-06, + "loss": 0.0255, + "step": 268 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.751064160575195e-06, + "loss": 0.4538, + "step": 270 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.0347, + "step": 272 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.772909471448959e-06, + "loss": 0.6067, + "step": 274 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.783850380021933e-06, + "loss": 0.0212, + "step": 276 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.794803428869799e-06, + "loss": 0.7026, + "step": 278 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.3088, + "step": 280 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.816745861800334e-06, + "loss": 0.158, + "step": 282 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.827735203028956e-06, + "loss": 0.1093, + "step": 284 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.838736598824446e-06, + "loss": 0.3071, + "step": 286 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.0633, + "step": 288 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.860775468148662e-06, + "loss": 0.3126, + "step": 290 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.871812898635011e-06, + "loss": 0.0558, + "step": 292 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.882862297603536e-06, + "loss": 0.6192, + "step": 294 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.1351, + "step": 296 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.904996914644913e-06, + "loss": 0.1579, + "step": 298 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.916082089488379e-06, + "loss": 0.0492, + "step": 300 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.927179146355317e-06, + "loss": 0.5427, + "step": 302 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.0958, + "step": 304 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.949408819445345e-06, + "loss": 0.2609, + "step": 306 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.960541392253387e-06, + "loss": 0.27, + "step": 308 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.971685760254933e-06, + "loss": 0.4258, + "step": 310 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.0762, + "step": 312 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 3.994009794754777e-06, + "loss": 0.0854, + "step": 314 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 4.005189417653737e-06, + "loss": 0.2072, + "step": 316 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.016380748547654e-06, + "loss": 0.1933, + "step": 318 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.1992, + "step": 320 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.038798446869847e-06, + "loss": 0.2866, + "step": 322 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.050024770515873e-06, + "loss": 0.0506, + "step": 324 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.061262714592426e-06, + "loss": 0.1326, + "step": 326 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.0592, + "step": 328 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.0837733762226584e-06, + "loss": 0.1291, + "step": 330 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.095046049812541e-06, + "loss": 0.2255, + "step": 332 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.106330255905417e-06, + "loss": 0.4503, + "step": 334 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.1545, + "step": 336 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.128933177424475e-06, + "loss": 0.0849, + "step": 338 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.1402518487066624e-06, + "loss": 0.005, + "step": 340 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.151581964203924e-06, + "loss": 0.1231, + "step": 342 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.0066, + "step": 344 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.174276439309593e-06, + "loss": 0.2019, + "step": 346 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.0196, + "step": 348 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.197016425450347e-06, + "loss": 0.1949, + "step": 350 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.3205, + "step": 352 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.219801744979517e-06, + "loss": 0.098, + "step": 354 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.2312113491533145e-06, + "loss": 0.0923, + "step": 356 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.242632219896328e-06, + "loss": 0.1154, + "step": 358 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.0383, + "step": 360 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.2655076718472045e-06, + "loss": 0.2207, + "step": 362 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.276962208378814e-06, + "loss": 0.0514, + "step": 364 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.28842792212722e-06, + "loss": 0.5628, + "step": 366 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.3137, + "step": 368 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3113927916814665e-06, + "loss": 0.3749, + "step": 370 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3228919026364345e-06, + "loss": 0.2242, + "step": 372 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.33440210110651e-06, + "loss": 0.3419, + "step": 374 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.0688, + "step": 376 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.3574556706517035e-06, + "loss": 0.0983, + "step": 378 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.368998996702686e-06, + "loss": 0.0157, + "step": 380 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.380553320220638e-06, + "loss": 0.0847, + "step": 382 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.4016, + "step": 384 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.403694869372589e-06, + "loss": 0.138, + "step": 386 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.415282049810643e-06, + "loss": 0.0347, + "step": 388 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4268801373238454e-06, + "loss": 0.0908, + "step": 390 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.0011, + "step": 392 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.450108942949158e-06, + "loss": 0.5788, + "step": 394 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.461739615694921e-06, + "loss": 0.667, + "step": 396 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.473381104783201e-06, + "loss": 0.167, + "step": 398 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.1626, + "step": 400 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.496696441021904e-06, + "loss": 1.0366, + "step": 402 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.5083702426369715e-06, + "loss": 0.3269, + "step": 404 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.520054769523929e-06, + "loss": 0.3985, + "step": 406 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.1938, + "step": 408 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.543455907812063e-06, + "loss": 0.3644, + "step": 410 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.555172473510324e-06, + "loss": 0.0716, + "step": 412 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.566899673074706e-06, + "loss": 0.1137, + "step": 414 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.4658, + "step": 416 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.590385882167206e-06, + "loss": 0.3881, + "step": 418 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.602144845826234e-06, + "loss": 0.5698, + "step": 420 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.613914351613337e-06, + "loss": 0.1149, + "step": 422 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.149, + "step": 424 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.637484897606777e-06, + "loss": 0.4396, + "step": 426 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.649285891779326e-06, + "loss": 0.5493, + "step": 428 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.661097336012451e-06, + "loss": 0.4238, + "step": 430 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.4509, + "step": 432 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.684751482368022e-06, + "loss": 0.2188, + "step": 434 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.696594138293421e-06, + "loss": 0.1277, + "step": 436 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.7084471518853656e-06, + "loss": 0.4742, + "step": 438 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.3137, + "step": 440 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.732184159451937e-06, + "loss": 0.3476, + "step": 442 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.744068107067673e-06, + "loss": 0.141, + "step": 444 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.755962319632249e-06, + "loss": 0.3123, + "step": 446 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.0884, + "step": 448 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.779781446669376e-06, + "loss": 0.2609, + "step": 450 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.79170631462264e-06, + "loss": 0.2777, + "step": 452 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.8036413544863095e-06, + "loss": 0.3376, + "step": 454 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.3037, + "step": 456 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.827541856687471e-06, + "loss": 0.2645, + "step": 458 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.839507272346751e-06, + "loss": 0.2243, + "step": 460 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.8514827665601425e-06, + "loss": 0.1133, + "step": 462 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.0677, + "step": 464 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.875463897075985e-06, + "loss": 0.1072, + "step": 466 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.1586, + "step": 468 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.899485060892404e-06, + "loss": 0.2633, + "step": 470 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.0976, + "step": 472 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.9235460703540615e-06, + "loss": 0.1016, + "step": 474 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.935591458474425e-06, + "loss": 0.7014, + "step": 476 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.947646737494389e-06, + "loss": 0.5678, + "step": 478 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.267, + "step": 480 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9717868740369645e-06, + "loss": 0.2347, + "step": 482 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9838716844133665e-06, + "loss": 0.1177, + "step": 484 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 4.9959662913970254e-06, + "loss": 0.1108, + "step": 486 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.0902, + "step": 488 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.02018480068299e-06, + "loss": 0.1663, + "step": 490 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.032308655686007e-06, + "loss": 0.105, + "step": 492 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.044442212697842e-06, + "loss": 0.2408, + "step": 494 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.0514, + "step": 496 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.068738337940655e-06, + "loss": 0.2519, + "step": 498 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.080900858720789e-06, + "loss": 0.158, + "step": 500 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.093072986608116e-06, + "loss": 0.3834, + "step": 502 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.0758, + "step": 504 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.1174459685959175e-06, + "loss": 0.5506, + "step": 506 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.129646775095432e-06, + "loss": 0.2481, + "step": 508 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.141857093500307e-06, + "loss": 0.4852, + "step": 510 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.1585, + "step": 512 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.166306170619537e-06, + "loss": 0.1898, + "step": 514 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.178544881584328e-06, + "loss": 0.1211, + "step": 516 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.190793008955421e-06, + "loss": 0.2775, + "step": 518 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.0987, + "step": 520 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.215317417214739e-06, + "loss": 0.0789, + "step": 522 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.0017, + "step": 524 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.239879203810763e-06, + "loss": 0.0573, + "step": 526 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.0253, + "step": 528 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.264478176864811e-06, + "loss": 0.2421, + "step": 530 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.2767915482720164e-06, + "loss": 0.117, + "step": 532 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.289114144207656e-06, + "loss": 0.1657, + "step": 534 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.3285, + "step": 536 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.313786913381061e-06, + "loss": 0.1668, + "step": 538 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.2319, + "step": 540 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.338496291639341e-06, + "loss": 0.956, + "step": 542 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.2241, + "step": 544 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.363242085950773e-06, + "loss": 0.1962, + "step": 546 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.375628578726181e-06, + "loss": 0.5747, + "step": 548 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.3880241029991434e-06, + "loss": 0.232, + "step": 550 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.4421, + "step": 552 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.412842149185316e-06, + "loss": 0.1223, + "step": 554 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.425264622628326e-06, + "loss": 0.1072, + "step": 556 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.437696030628639e-06, + "loss": 0.1282, + "step": 558 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.046, + "step": 560 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.462585553168532e-06, + "loss": 0.1456, + "step": 562 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.475043619098321e-06, + "loss": 0.2147, + "step": 564 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.487510522365969e-06, + "loss": 0.2151, + "step": 566 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.2098, + "step": 568 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.512470743505057e-06, + "loss": 0.1212, + "step": 570 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.524964012628644e-06, + "loss": 0.1138, + "step": 572 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.537466021594464e-06, + "loss": 0.3512, + "step": 574 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.0874, + "step": 576 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.5624961613689934e-06, + "loss": 0.2841, + "step": 578 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.57502424329331e-06, + "loss": 0.0634, + "step": 580 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.5875609672911465e-06, + "loss": 0.1715, + "step": 582 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.684, + "step": 584 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.6126602435525725e-06, + "loss": 0.2229, + "step": 586 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.62522274679673e-06, + "loss": 0.7892, + "step": 588 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.637793794075625e-06, + "loss": 0.1324, + "step": 590 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.2225, + "step": 592 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.662961422514961e-06, + "loss": 0.6194, + "step": 594 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.675557954522462e-06, + "loss": 0.0789, + "step": 596 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.688162932258965e-06, + "loss": 0.1673, + "step": 598 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.3653, + "step": 600 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.713398126431353e-06, + "loss": 0.3828, + "step": 602 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.726028293582342e-06, + "loss": 0.6325, + "step": 604 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.738666807892684e-06, + "loss": 0.2609, + "step": 606 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.0512, + "step": 608 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.763968779241957e-06, + "loss": 0.1663, + "step": 610 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.776632186865589e-06, + "loss": 0.4288, + "step": 612 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.7893038428180584e-06, + "loss": 0.3353, + "step": 614 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.1313, + "step": 616 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.814671800701357e-06, + "loss": 0.3143, + "step": 618 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.827368053088032e-06, + "loss": 0.1862, + "step": 620 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.840072454715297e-06, + "loss": 0.5415, + "step": 622 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.7485, + "step": 624 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.865505606427848e-06, + "loss": 0.3478, + "step": 626 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.878234306841637e-06, + "loss": 0.1675, + "step": 628 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.890971057153105e-06, + "loss": 0.2724, + "step": 630 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.0508, + "step": 632 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.916468607952892e-06, + "loss": 0.3178, + "step": 634 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.929229358643925e-06, + "loss": 0.1944, + "step": 636 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.941998059638212e-06, + "loss": 0.1959, + "step": 638 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.6868, + "step": 640 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9675592127708585e-06, + "loss": 0.1083, + "step": 642 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9803516149877475e-06, + "loss": 0.2595, + "step": 644 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 5.993151867665015e-06, + "loss": 0.5629, + "step": 646 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.1435, + "step": 648 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.01877582438873e-06, + "loss": 0.4032, + "step": 650 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.03159947839103e-06, + "loss": 0.0958, + "step": 652 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.0444308827655265e-06, + "loss": 2.0009, + "step": 654 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.1283, + "step": 656 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.070116842375947e-06, + "loss": 0.4762, + "step": 658 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.082971347446654e-06, + "loss": 0.1662, + "step": 660 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.095833502559182e-06, + "loss": 0.2811, + "step": 662 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.142, + "step": 664 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.121580662414533e-06, + "loss": 0.2816, + "step": 666 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.13446561687258e-06, + "loss": 0.1697, + "step": 668 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.147358120803041e-06, + "loss": 0.2465, + "step": 670 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.2944, + "step": 672 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.173165676349095e-06, + "loss": 0.1375, + "step": 674 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.186080677561974e-06, + "loss": 0.1281, + "step": 676 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.1990031274419186e-06, + "loss": 0.3145, + "step": 678 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.5827, + "step": 680 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.224870272237185e-06, + "loss": 0.1274, + "step": 682 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.237814916633431e-06, + "loss": 0.1176, + "step": 684 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.250766908658652e-06, + "loss": 0.232, + "step": 686 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.1949, + "step": 688 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.2766928343996314e-06, + "loss": 0.4071, + "step": 690 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.289666717481496e-06, + "loss": 0.1494, + "step": 692 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.3026478469246285e-06, + "loss": 0.237, + "step": 694 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.2303, + "step": 696 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.328631743470968e-06, + "loss": 0.3638, + "step": 698 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.341634459827044e-06, + "loss": 0.2009, + "step": 700 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.354644321050279e-06, + "loss": 0.1447, + "step": 702 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.2465, + "step": 704 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.380685376450153e-06, + "loss": 0.1154, + "step": 706 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.393716519768032e-06, + "loss": 0.2026, + "step": 708 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.406754706235692e-06, + "loss": 0.2759, + "step": 710 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.2683, + "step": 712 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.432852106751162e-06, + "loss": 0.0986, + "step": 714 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.445911269830183e-06, + "loss": 0.3629, + "step": 716 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.458977374121492e-06, + "loss": 0.4507, + "step": 718 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.1346, + "step": 720 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.485130304253915e-06, + "loss": 0.1281, + "step": 722 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.498217079017806e-06, + "loss": 0.3212, + "step": 724 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.511310692839605e-06, + "loss": 0.4745, + "step": 726 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.232, + "step": 728 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.537518335355182e-06, + "loss": 0.3634, + "step": 730 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.1944, + "step": 732 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.563753027064355e-06, + "loss": 0.2973, + "step": 734 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.2412, + "step": 736 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.590014563019571e-06, + "loss": 0.3632, + "step": 738 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.603155333485934e-06, + "loss": 0.4961, + "step": 740 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.61630273806352e-06, + "loss": 0.2074, + "step": 742 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.2529, + "step": 744 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.642617346830784e-06, + "loss": 0.3805, + "step": 746 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.655784499627476e-06, + "loss": 0.1138, + "step": 748 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.6689581837494925e-06, + "loss": 0.5477, + "step": 750 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.5494, + "step": 752 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.695325043042827e-06, + "loss": 0.1011, + "step": 754 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.7085181667191e-06, + "loss": 0.1012, + "step": 756 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.7217177187307e-06, + "loss": 0.1505, + "step": 758 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.2461, + "step": 760 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.748136004631327e-06, + "loss": 0.0736, + "step": 762 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.761354686924883e-06, + "loss": 0.0537, + "step": 764 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.774579694362902e-06, + "loss": 0.3171, + "step": 766 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.2944, + "step": 768 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.801048581345113e-06, + "loss": 0.4731, + "step": 770 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.8142924091949955e-06, + "loss": 0.215, + "step": 772 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.827542458800804e-06, + "loss": 0.1031, + "step": 774 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.0111, + "step": 776 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.854061119757647e-06, + "loss": 0.129, + "step": 778 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.867329679317144e-06, + "loss": 0.3653, + "step": 780 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.880604357049646e-06, + "loss": 0.6752, + "step": 782 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.0583, + "step": 784 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.907171963318815e-06, + "loss": 0.2816, + "step": 786 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.920464839968391e-06, + "loss": 0.2771, + "step": 788 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.9337637310168494e-06, + "loss": 0.263, + "step": 790 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.052, + "step": 792 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.960379452406636e-06, + "loss": 0.1439, + "step": 794 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.973696230766884e-06, + "loss": 0.0985, + "step": 796 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 6.9870189195639595e-06, + "loss": 0.6513, + "step": 798 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.2266, + "step": 800 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.013681924379073e-06, + "loss": 0.1196, + "step": 802 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.027022188323704e-06, + "loss": 0.0512, + "step": 804 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.040368258558412e-06, + "loss": 0.2521, + "step": 806 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.0772, + "step": 808 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.0670777136261035e-06, + "loss": 0.0958, + "step": 810 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.080441046294945e-06, + "loss": 0.4974, + "step": 812 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.093810080925657e-06, + "loss": 0.6324, + "step": 814 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.1765, + "step": 816 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.120565151621638e-06, + "loss": 0.2464, + "step": 818 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.133951135433656e-06, + "loss": 0.0276, + "step": 820 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.1473427167012e-06, + "loss": 0.166, + "step": 822 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.7568, + "step": 824 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.1741425669757854e-06, + "loss": 0.5495, + "step": 826 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.18755078364214e-06, + "loss": 0.2303, + "step": 828 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.200964493082727e-06, + "loss": 0.1223, + "step": 830 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.1789, + "step": 832 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.227808285486952e-06, + "loss": 0.1132, + "step": 834 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.241238316024064e-06, + "loss": 0.2302, + "step": 836 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.254673734482513e-06, + "loss": 0.1941, + "step": 838 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.1538, + "step": 840 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2815606301942945e-06, + "loss": 0.246, + "step": 842 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.1976, + "step": 844 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.308468762579623e-06, + "loss": 0.3776, + "step": 846 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.229, + "step": 848 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.3353979214299765e-06, + "loss": 0.1439, + "step": 850 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.348870320044395e-06, + "loss": 0.2369, + "step": 852 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.362347896372515e-06, + "loss": 0.2018, + "step": 854 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.0117, + "step": 856 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.389318476871784e-06, + "loss": 0.2814, + "step": 858 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.402811428368824e-06, + "loss": 0.0235, + "step": 860 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.416309452231411e-06, + "loss": 0.1867, + "step": 862 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.2095, + "step": 864 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.443320611595641e-06, + "loss": 0.307, + "step": 866 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.0347, + "step": 868 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.470351743951061e-06, + "loss": 0.1052, + "step": 870 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.0678, + "step": 872 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.497402638128209e-06, + "loss": 0.3855, + "step": 874 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.510935429867233e-06, + "loss": 0.2218, + "step": 876 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.52447308280329e-06, + "loss": 0.2914, + "step": 878 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.0428, + "step": 880 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.551562866499732e-06, + "loss": 0.2521, + "step": 882 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.5651149443531846e-06, + "loss": 0.1713, + "step": 884 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.578671777589884e-06, + "loss": 0.114, + "step": 886 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.0543, + "step": 888 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.605799604296721e-06, + "loss": 0.2854, + "step": 890 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.619370544785608e-06, + "loss": 0.4686, + "step": 892 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.632946134695396e-06, + "loss": 0.6396, + "step": 894 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.0534, + "step": 896 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.660111156714964e-06, + "loss": 0.6108, + "step": 898 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.67370053577085e-06, + "loss": 0.2915, + "step": 900 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.687294458140006e-06, + "loss": 0.281, + "step": 902 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.1215, + "step": 904 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.714495826612353e-06, + "loss": 0.246, + "step": 906 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.728103219590684e-06, + "loss": 0.123, + "step": 908 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.741715049632646e-06, + "loss": 0.1279, + "step": 910 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.0398, + "step": 912 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.76895191456204e-06, + "loss": 0.2773, + "step": 914 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.7825768962553e-06, + "loss": 0.0247, + "step": 916 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.796206208623925e-06, + "loss": 0.3029, + "step": 918 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.016, + "step": 920 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.82347771890548e-06, + "loss": 0.1394, + "step": 922 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.83711986355656e-06, + "loss": 0.0625, + "step": 924 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.850766232359408e-06, + "loss": 0.2673, + "step": 926 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.0296, + "step": 928 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.878071535805564e-06, + "loss": 0.2151, + "step": 930 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.891730417121043e-06, + "loss": 0.0957, + "step": 932 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.90539341593269e-06, + "loss": 0.1187, + "step": 934 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.0193, + "step": 936 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.932731659299978e-06, + "loss": 0.2859, + "step": 938 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.946406850463435e-06, + "loss": 0.0515, + "step": 940 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.960086052338788e-06, + "loss": 0.3203, + "step": 942 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.028, + "step": 944 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 7.987456381354371e-06, + "loss": 0.5219, + "step": 946 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 8.001147455039737e-06, + "loss": 0.0784, + "step": 948 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.01484243252743e-06, + "loss": 0.4221, + "step": 950 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.0357, + "step": 952 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.042243991915866e-06, + "loss": 0.0908, + "step": 954 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.055950520300756e-06, + "loss": 0.0167, + "step": 956 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.069660845456411e-06, + "loss": 0.6893, + "step": 958 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.3342, + "step": 960 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.097092778966364e-06, + "loss": 0.5962, + "step": 962 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.110814333745503e-06, + "loss": 0.4965, + "step": 964 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.124539578145176e-06, + "loss": 0.1091, + "step": 966 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.0374, + "step": 968 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.152001028576158e-06, + "loss": 0.2603, + "step": 970 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.165737180974676e-06, + "loss": 0.2151, + "step": 972 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.179476915728217e-06, + "loss": 0.4335, + "step": 974 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.0152, + "step": 976 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.206967024957432e-06, + "loss": 0.3212, + "step": 978 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.220717345744326e-06, + "loss": 0.079, + "step": 980 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.234471141508773e-06, + "loss": 0.3126, + "step": 982 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.145, + "step": 984 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.261989050517841e-06, + "loss": 0.6945, + "step": 986 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.275753110019367e-06, + "loss": 0.2941, + "step": 988 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.289520537012428e-06, + "loss": 0.2412, + "step": 990 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.0904, + "step": 992 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.317065385914285e-06, + "loss": 0.5552, + "step": 994 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.330842754027378e-06, + "loss": 0.4889, + "step": 996 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.344623382040752e-06, + "loss": 0.1383, + "step": 998 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.88, + "step": 1000 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.372194310106515e-06, + "loss": 0.1949, + "step": 1002 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.385984556312285e-06, + "loss": 0.1013, + "step": 1004 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.399777954725183e-06, + "loss": 0.158, + "step": 1006 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.1358, + "step": 1008 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.427374100411022e-06, + "loss": 0.1601, + "step": 1010 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.441176793788106e-06, + "loss": 0.0809, + "step": 1012 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.454982531580687e-06, + "loss": 0.1624, + "step": 1014 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.8361, + "step": 1016 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.482603032554812e-06, + "loss": 0.2148, + "step": 1018 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.496417741792922e-06, + "loss": 0.5845, + "step": 1020 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.510235387559738e-06, + "loss": 0.1581, + "step": 1022 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.1794, + "step": 1024 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.537879380729254e-06, + "loss": 0.2084, + "step": 1026 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.551705674142616e-06, + "loss": 0.2956, + "step": 1028 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.565534796106175e-06, + "loss": 0.5345, + "step": 1030 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.2775, + "step": 1032 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.593201417644091e-06, + "loss": 0.1895, + "step": 1034 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.607038863184952e-06, + "loss": 0.069, + "step": 1036 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.620879029209093e-06, + "loss": 0.1286, + "step": 1038 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.4137, + "step": 1040 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.648567414581372e-06, + "loss": 0.2577, + "step": 1042 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.662415579853495e-06, + "loss": 0.1521, + "step": 1044 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.676266357456968e-06, + "loss": 0.2616, + "step": 1046 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.114, + "step": 1048 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.703975641449426e-06, + "loss": 0.1495, + "step": 1050 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.717834093721598e-06, + "loss": 0.6946, + "step": 1052 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.731695050091561e-06, + "loss": 0.1582, + "step": 1054 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.0481, + "step": 1056 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.759424366837035e-06, + "loss": 0.1945, + "step": 1058 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.773292673056572e-06, + "loss": 0.0642, + "step": 1060 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.787163375062113e-06, + "loss": 0.3856, + "step": 1062 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.2244, + "step": 1064 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.8149118580674e-06, + "loss": 0.0743, + "step": 1066 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.828789584873757e-06, + "loss": 0.0157, + "step": 1068 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.84266959907943e-06, + "loss": 0.3323, + "step": 1070 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.0831, + "step": 1072 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.870436381252412e-06, + "loss": 0.3486, + "step": 1074 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.884323094990613e-06, + "loss": 1.0135, + "step": 1076 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.89821198766998e-06, + "loss": 0.2306, + "step": 1078 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.4392, + "step": 1080 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.925996201346779e-06, + "loss": 0.2244, + "step": 1082 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.939891468081036e-06, + "loss": 0.4113, + "step": 1084 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.953788805230209e-06, + "loss": 0.2755, + "step": 1086 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.0494, + "step": 1088 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.981589582202184e-06, + "loss": 0.1752, + "step": 1090 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.995492967729449e-06, + "loss": 0.1019, + "step": 1092 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.009398315080712e-06, + "loss": 0.1083, + "step": 1094 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.2046, + "step": 1096 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.037214786621669e-06, + "loss": 0.1159, + "step": 1098 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.051125856485175e-06, + "loss": 0.1266, + "step": 1100 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.065038779520457e-06, + "loss": 0.381, + "step": 1102 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.4569, + "step": 1104 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.092870076413771e-06, + "loss": 0.1823, + "step": 1106 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.106788395916682e-06, + "loss": 0.155, + "step": 1108 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.120708459881203e-06, + "loss": 0.2775, + "step": 1110 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.235, + "step": 1112 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.148553712446971e-06, + "loss": 0.0803, + "step": 1114 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.162478846665854e-06, + "loss": 0.3807, + "step": 1116 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.176405616581694e-06, + "loss": 0.3145, + "step": 1118 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.4833, + "step": 1120 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.20426395470397e-06, + "loss": 0.3565, + "step": 1122 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.218195468502469e-06, + "loss": 0.222, + "step": 1124 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.232128509182136e-06, + "loss": 0.1821, + "step": 1126 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.2193, + "step": 1128 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.259999062336021e-06, + "loss": 0.2552, + "step": 1130 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.273936520378426e-06, + "loss": 0.1217, + "step": 1132 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.287875396438536e-06, + "loss": 0.223, + "step": 1134 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.6569, + "step": 1136 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.315757293717432e-06, + "loss": 0.2378, + "step": 1138 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.329700260482286e-06, + "loss": 0.153, + "step": 1140 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.343644536357053e-06, + "loss": 0.4707, + "step": 1142 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.1035, + "step": 1144 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.37153690649993e-06, + "loss": 0.2942, + "step": 1146 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.38548494629364e-06, + "loss": 0.252, + "step": 1148 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.39943418624856e-06, + "loss": 0.2149, + "step": 1150 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.2158, + "step": 1152 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.427336157667062e-06, + "loss": 0.3848, + "step": 1154 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.441288834637507e-06, + "loss": 0.6956, + "step": 1156 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.45524260278296e-06, + "loss": 0.182, + "step": 1158 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.1821, + "step": 1160 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.483153303588777e-06, + "loss": 0.2151, + "step": 1162 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.497110181738935e-06, + "loss": 0.1823, + "step": 1164 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.511068042043785e-06, + "loss": 0.2086, + "step": 1166 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.1339, + "step": 1168 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.538986600075773e-06, + "loss": 0.1976, + "step": 1170 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.552947243277342e-06, + "loss": 0.2006, + "step": 1172 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.566908759582633e-06, + "loss": 0.2605, + "step": 1174 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.0686, + "step": 1176 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.594834302434123e-06, + "loss": 0.5887, + "step": 1178 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.608798274441153e-06, + "loss": 0.6413, + "step": 1180 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.622763010473628e-06, + "loss": 0.2964, + "step": 1182 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.0442, + "step": 1184 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.650694665519747e-06, + "loss": 0.2219, + "step": 1186 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.664661529982263e-06, + "loss": 0.3407, + "step": 1188 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.678629049368077e-06, + "loss": 0.1783, + "step": 1190 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.0235, + "step": 1192 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.706565943792879e-06, + "loss": 0.1508, + "step": 1194 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.720535264270526e-06, + "loss": 0.0139, + "step": 1196 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.734505130548855e-06, + "loss": 0.1419, + "step": 1198 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.7775, + "step": 1200 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.762446391372746e-06, + "loss": 0.2755, + "step": 1202 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.776417731348403e-06, + "loss": 0.253, + "step": 1204 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.790389507985091e-06, + "loss": 0.508, + "step": 1206 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.6494, + "step": 1208 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.81833426209198e-06, + "loss": 0.2596, + "step": 1210 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.832307184985473e-06, + "loss": 0.235, + "step": 1212 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.846280435386668e-06, + "loss": 0.0825, + "step": 1214 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.0159, + "step": 1216 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.874227809551307e-06, + "loss": 0.6583, + "step": 1218 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.888201878732946e-06, + "loss": 0.0958, + "step": 1220 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.902176166258738e-06, + "loss": 0.2326, + "step": 1222 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.3552, + "step": 1224 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.930125287174061e-06, + "loss": 0.5497, + "step": 1226 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.944100065978354e-06, + "loss": 0.1444, + "step": 1228 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.958074953956413e-06, + "loss": 0.1661, + "step": 1230 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.5448, + "step": 1232 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.986024948260714e-06, + "loss": 0.2388, + "step": 1234 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.999999999999996e-06, + "loss": 0.2446, + "step": 1236 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0013975051739277e-05, + "loss": 0.2243, + "step": 1238 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.3724, + "step": 1240 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.004192504604358e-05, + "loss": 0.1837, + "step": 1242 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.3612, + "step": 1244 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.006987471282593e-05, + "loss": 0.124, + "step": 1246 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.3383, + "step": 1248 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0097823833741255e-05, + "loss": 0.3006, + "step": 1250 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0111798121267047e-05, + "loss": 0.3046, + "step": 1252 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0125772190448686e-05, + "loss": 0.3415, + "step": 1254 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.2615, + "step": 1256 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.0153719564613327e-05, + "loss": 0.0314, + "step": 1258 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.016769281501452e-05, + "loss": 0.0604, + "step": 1260 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.018166573790801e-05, + "loss": 0.315, + "step": 1262 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.0887, + "step": 1264 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.0209610492014904e-05, + "loss": 0.5316, + "step": 1266 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.022358226865159e-05, + "loss": 0.3689, + "step": 1268 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0237553608627247e-05, + "loss": 0.1677, + "step": 1270 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.3043, + "step": 1272 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0265494869451138e-05, + "loss": 0.2319, + "step": 1274 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.2709, + "step": 1276 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0293434056207114e-05, + "loss": 0.215, + "step": 1278 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.1662, + "step": 1280 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.0321370950631918e-05, + "loss": 0.1603, + "step": 1282 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.033533847001773e-05, + "loss": 0.3312, + "step": 1284 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0349305334480246e-05, + "loss": 0.3263, + "step": 1286 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.2638, + "step": 1288 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.0377236989526366e-05, + "loss": 0.1641, + "step": 1290 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.039120172555884e-05, + "loss": 0.4289, + "step": 1292 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0405165697565868e-05, + "loss": 0.0813, + "step": 1294 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.0068, + "step": 1296 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0433091240417362e-05, + "loss": 0.1943, + "step": 1298 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0447052756722651e-05, + "loss": 0.1301, + "step": 1300 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.046101339992422e-05, + "loss": 0.1204, + "step": 1302 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.3643, + "step": 1304 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0488931957956208e-05, + "loss": 0.5458, + "step": 1306 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.0403, + "step": 1308 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.0516846696411216e-05, + "loss": 0.2424, + "step": 1310 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.1271, + "step": 1312 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.054475739721703e-05, + "loss": 0.8296, + "step": 1314 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.0558711165362488e-05, + "loss": 0.0591, + "step": 1316 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0572663842332931e-05, + "loss": 0.142, + "step": 1318 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.1677, + "step": 1320 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0600565813751433e-05, + "loss": 0.2612, + "step": 1322 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.11, + "step": 1324 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0628463093500063e-05, + "loss": 0.4709, + "step": 1326 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.1879, + "step": 1328 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.065635546364294e-05, + "loss": 0.1199, + "step": 1330 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.0670299739517706e-05, + "loss": 0.109, + "step": 1332 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0684242706282562e-05, + "loss": 0.3833, + "step": 1334 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.2179, + "step": 1336 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0712124603561457e-05, + "loss": 0.2701, + "step": 1338 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.3134, + "step": 1340 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0740000937663972e-05, + "loss": 0.3418, + "step": 1342 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.1581, + "step": 1344 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0767871490817856e-05, + "loss": 0.2163, + "step": 1346 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0781804531497525e-05, + "loss": 0.138, + "step": 1348 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0795736045296023e-05, + "loss": 0.3642, + "step": 1350 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.0183, + "step": 1352 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.08235943834183e-05, + "loss": 0.5841, + "step": 1354 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.083752115333414e-05, + "loss": 0.201, + "step": 1356 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0851446287553022e-05, + "loss": 0.4889, + "step": 1358 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.0572, + "step": 1360 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.087929154011879e-05, + "loss": 0.0833, + "step": 1362 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.0893211604083311e-05, + "loss": 0.0278, + "step": 1364 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.090712992358622e-05, + "loss": 0.1132, + "step": 1366 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.4376, + "step": 1368 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0934961220479537e-05, + "loss": 0.1689, + "step": 1370 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.8235, + "step": 1372 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0962785213378325e-05, + "loss": 0.3316, + "step": 1374 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.492, + "step": 1376 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.0990601684919282e-05, + "loss": 0.1865, + "step": 1378 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.1004507032270544e-05, + "loss": 0.3552, + "step": 1380 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1018410417797809e-05, + "loss": 0.7587, + "step": 1382 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.3838, + "step": 1384 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1046211194769784e-05, + "loss": 0.3149, + "step": 1386 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.46, + "step": 1388 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1074003798653215e-05, + "loss": 0.1758, + "step": 1390 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.0706, + "step": 1392 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.1101788012330013e-05, + "loss": 0.1362, + "step": 1394 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.111567690500938e-05, + "loss": 0.1505, + "step": 1396 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1129563618747581e-05, + "loss": 0.1617, + "step": 1398 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.1319, + "step": 1400 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1157330400920563e-05, + "loss": 0.3119, + "step": 1402 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.1945, + "step": 1404 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1185088141932594e-05, + "loss": 0.363, + "step": 1406 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.1512, + "step": 1408 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.121283662493788e-05, + "loss": 0.6766, + "step": 1410 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.122670732694342e-05, + "loss": 0.3273, + "step": 1412 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1240575633162958e-05, + "loss": 0.791, + "step": 1414 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.0908, + "step": 1416 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1268304949908434e-05, + "loss": 0.4785, + "step": 1418 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.1865, + "step": 1420 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1296024358550565e-05, + "loss": 0.5856, + "step": 1422 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.3426, + "step": 1424 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1323733642543024e-05, + "loss": 0.2391, + "step": 1426 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1337584420146496e-05, + "loss": 0.1511, + "step": 1428 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.135143258541862e-05, + "loss": 0.2774, + "step": 1430 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.3206, + "step": 1432 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.13791209707909e-05, + "loss": 0.3635, + "step": 1434 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.2231, + "step": 1436 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.1406798582355902e-05, + "loss": 0.1714, + "step": 1438 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.134, + "step": 1440 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1434465203893818e-05, + "loss": 0.2081, + "step": 1442 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1448294325857377e-05, + "loss": 0.2605, + "step": 1444 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.146212061927074e-05, + "loss": 0.2458, + "step": 1446 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.1125, + "step": 1448 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.1489764612440255e-05, + "loss": 0.4706, + "step": 1450 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.150358225820707e-05, + "loss": 0.1494, + "step": 1452 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.151739696744518e-05, + "loss": 0.1354, + "step": 1454 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.2099, + "step": 1456 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1545017468419307e-05, + "loss": 0.2054, + "step": 1458 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1558823206211887e-05, + "loss": 0.0943, + "step": 1460 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1572625899588972e-05, + "loss": 0.3127, + "step": 1462 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.1961, + "step": 1464 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1600222045274809e-05, + "loss": 0.2268, + "step": 1466 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.0103, + "step": 1468 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1627805689893478e-05, + "loss": 0.2393, + "step": 1470 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.5504, + "step": 1472 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1655376617959239e-05, + "loss": 0.7182, + "step": 1474 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1669157245972616e-05, + "loss": 0.6106, + "step": 1476 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1682934614085708e-05, + "loss": 0.4883, + "step": 1478 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.2193, + "step": 1480 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1710479462987565e-05, + "loss": 0.188, + "step": 1482 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.2708, + "step": 1484 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1738010949482152e-05, + "loss": 0.2946, + "step": 1486 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.4407, + "step": 1488 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.176552885849122e-05, + "loss": 0.7374, + "step": 1490 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.1779282654255668e-05, + "loss": 0.2638, + "step": 1492 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.1793032975042563e-05, + "loss": 0.5854, + "step": 1494 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.344, + "step": 1496 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1820523084271775e-05, + "loss": 0.3481, + "step": 1498 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.202, + "step": 1500 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1847998971423835e-05, + "loss": 0.2194, + "step": 1502 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.1423, + "step": 1504 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.1875460421854816e-05, + "loss": 0.5414, + "step": 1506 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.188918566625449e-05, + "loss": 0.2507, + "step": 1508 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1902907221033629e-05, + "loss": 0.2284, + "step": 1510 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.3953, + "step": 1512 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1930339154543582e-05, + "loss": 0.1421, + "step": 1514 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.1371, + "step": 1516 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1957756008084127e-05, + "loss": 0.134, + "step": 1518 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.1444, + "step": 1520 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1985157567472563e-05, + "loss": 0.3414, + "step": 1522 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1998852544960256e-05, + "loss": 0.1708, + "step": 1524 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2012543618645622e-05, + "loss": 0.4088, + "step": 1526 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.5856, + "step": 1528 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2039913947661205e-05, + "loss": 0.1267, + "step": 1530 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.1749, + "step": 1532 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2067268340700016e-05, + "loss": 0.171, + "step": 1534 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.5205, + "step": 1536 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.2094606584067304e-05, + "loss": 0.2663, + "step": 1538 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.210826958287895e-05, + "loss": 0.5045, + "step": 1540 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.212192846419443e-05, + "loss": 0.2819, + "step": 1542 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.5187, + "step": 1544 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2149233767640587e-05, + "loss": 0.1424, + "step": 1546 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.0775, + "step": 1548 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2176522281094514e-05, + "loss": 0.2619, + "step": 1550 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.3382, + "step": 1552 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.220379379137607e-05, + "loss": 0.1415, + "step": 1554 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.2217423103744692e-05, + "loss": 0.8941, + "step": 1556 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2231048085437953e-05, + "loss": 0.3341, + "step": 1558 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.1666, + "step": 1560 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2258284950367347e-05, + "loss": 0.557, + "step": 1562 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.3639, + "step": 1564 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.228550417338764e-05, + "loss": 0.2868, + "step": 1566 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.4484, + "step": 1568 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2312705541859985e-05, + "loss": 0.4792, + "step": 1570 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2326299464229143e-05, + "loss": 0.4259, + "step": 1572 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2339888843285029e-05, + "loss": 0.2203, + "step": 1574 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.2445, + "step": 1576 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2367053865304597e-05, + "loss": 0.1356, + "step": 1578 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.1783, + "step": 1580 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2394200395703273e-05, + "loss": 0.4181, + "step": 1582 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.5074, + "step": 1584 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2421328222410109e-05, + "loss": 0.2392, + "step": 1586 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2434885055646808e-05, + "loss": 0.2884, + "step": 1588 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2448437133500262e-05, + "loss": 0.2466, + "step": 1590 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.2529, + "step": 1592 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2475526917196703e-05, + "loss": 0.204, + "step": 1594 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.1597, + "step": 1596 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2502597361871787e-05, + "loss": 0.1865, + "step": 1598 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.5956, + "step": 1600 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2529648256048931e-05, + "loss": 0.4371, + "step": 1602 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2543166305656089e-05, + "loss": 0.4267, + "step": 1604 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.2556679388404351e-05, + "loss": 0.1836, + "step": 1606 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.107, + "step": 1608 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.2583690547768584e-05, + "loss": 0.2521, + "step": 1610 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.259718857163117e-05, + "loss": 0.0604, + "step": 1612 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.261068152312821e-05, + "loss": 0.4678, + "step": 1614 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.3044, + "step": 1616 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2637652103627481e-05, + "loss": 0.2477, + "step": 1618 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2651129679955598e-05, + "loss": 0.1585, + "step": 1620 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2664602078570017e-05, + "loss": 0.1879, + "step": 1622 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.1581, + "step": 1624 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2691531237420369e-05, + "loss": 0.352, + "step": 1626 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.2161, + "step": 1628 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.27184393698057e-05, + "loss": 0.2447, + "step": 1630 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.1586, + "step": 1632 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2745326265517481e-05, + "loss": 0.2529, + "step": 1634 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2758761683975929e-05, + "loss": 0.2019, + "step": 1636 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.277219171451304e-05, + "loss": 0.2159, + "step": 1638 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.347, + "step": 1640 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2799035506917265e-05, + "loss": 0.2066, + "step": 1642 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.0917, + "step": 1644 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2825857433024208e-05, + "loss": 0.4873, + "step": 1646 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.151, + "step": 1648 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2852657283298794e-05, + "loss": 0.4022, + "step": 1650 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2866048864566336e-05, + "loss": 0.0321, + "step": 1652 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2879434848378356e-05, + "loss": 0.3497, + "step": 1654 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.4356, + "step": 1656 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2906189919074336e-05, + "loss": 0.2773, + "step": 1658 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.7321, + "step": 1660 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.293292228637389e-05, + "loss": 0.5336, + "step": 1662 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.1866, + "step": 1664 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2959631741441583e-05, + "loss": 0.4319, + "step": 1666 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2972977811676289e-05, + "loss": 0.0908, + "step": 1668 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.298631807562092e-05, + "loss": 0.1757, + "step": 1670 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.1134, + "step": 1672 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3012981080436036e-05, + "loss": 0.9963, + "step": 1674 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.1905, + "step": 1676 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3039620547593357e-05, + "loss": 0.1582, + "step": 1678 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.3927, + "step": 1680 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3066236268983143e-05, + "loss": 0.2775, + "step": 1682 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3079535160031601e-05, + "loss": 0.3273, + "step": 1684 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3092828036681178e-05, + "loss": 0.3939, + "step": 1686 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.6645, + "step": 1688 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.3119395642950348e-05, + "loss": 0.5927, + "step": 1690 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.313267032068285e-05, + "loss": 0.1866, + "step": 1692 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3145938880242346e-05, + "loss": 0.5036, + "step": 1694 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.2308, + "step": 1696 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3172457541199188e-05, + "loss": 0.1594, + "step": 1698 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3185707590804997e-05, + "loss": 0.1036, + "step": 1700 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.3198951418654882e-05, + "loss": 0.1588, + "step": 1702 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.3016, + "step": 1704 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.322542030563709e-05, + "loss": 0.1278, + "step": 1706 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.198, + "step": 1708 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.3251863995368665e-05, + "loss": 0.3702, + "step": 1710 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.3422, + "step": 1712 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3278282281269293e-05, + "loss": 0.1581, + "step": 1714 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3291481833280894e-05, + "loss": 0.1753, + "step": 1716 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3304674956957167e-05, + "loss": 0.1498, + "step": 1718 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.9579, + "step": 1720 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3331041816250503e-05, + "loss": 0.3371, + "step": 1722 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.29, + "step": 1724 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.335738265316921e-05, + "loss": 0.1687, + "step": 1726 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.0662, + "step": 1728 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3383697261936472e-05, + "loss": 0.1648, + "step": 1730 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3396844666514062e-05, + "loss": 0.1584, + "step": 1732 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3409985436980422e-05, + "loss": 0.6786, + "step": 1734 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.1425, + "step": 1736 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.3436246972935638e-05, + "loss": 0.1852, + "step": 1738 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.344936768713513e-05, + "loss": 0.1847, + "step": 1740 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.346248166464481e-05, + "loss": 0.3442, + "step": 1742 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.2026, + "step": 1744 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.348868930716039e-05, + "loss": 0.1953, + "step": 1746 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.3501782920982189e-05, + "loss": 0.2775, + "step": 1748 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3514869695746078e-05, + "loss": 0.7842, + "step": 1750 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.1103, + "step": 1752 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3541022625878501e-05, + "loss": 0.4597, + "step": 1754 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.155, + "step": 1756 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3567147893248833e-05, + "loss": 0.3697, + "step": 1758 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.3635, + "step": 1760 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3593245293764303e-05, + "loss": 0.2324, + "step": 1762 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3606283480231962e-05, + "loss": 0.1206, + "step": 1764 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.361931462354984e-05, + "loss": 0.4571, + "step": 1766 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.2093, + "step": 1768 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3645355678949715e-05, + "loss": 0.2104, + "step": 1770 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.1509, + "step": 1772 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.3671368256529026e-05, + "loss": 0.3641, + "step": 1774 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.4026, + "step": 1776 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3697352153075365e-05, + "loss": 0.2054, + "step": 1778 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3710333282518497e-05, + "loss": 0.3873, + "step": 1780 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3723307165600361e-05, + "loss": 0.2098, + "step": 1782 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.5413, + "step": 1784 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3749233091341344e-05, + "loss": 0.6091, + "step": 1786 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.363, + "step": 1788 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3775129727762808e-05, + "loss": 0.1509, + "step": 1790 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.5257, + "step": 1792 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3800996872558075e-05, + "loss": 0.1339, + "step": 1794 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3813919322438018e-05, + "loss": 0.2294, + "step": 1796 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3826834323650899e-05, + "loss": 0.1752, + "step": 1798 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.2616, + "step": 1800 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3852641879196952e-05, + "loss": 0.223, + "step": 1802 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.2532, + "step": 1804 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.387841933758546e-05, + "loss": 0.6906, + "step": 1806 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.1677, + "step": 1808 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.3904166497440812e-05, + "loss": 0.382, + "step": 1810 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.391702865255334e-05, + "loss": 0.0419, + "step": 1812 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3929883157624046e-05, + "loss": 0.2186, + "step": 1814 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.1339, + "step": 1816 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3955569117234468e-05, + "loss": 0.2884, + "step": 1818 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.4636, + "step": 1820 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3981224175611265e-05, + "loss": 0.1977, + "step": 1822 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.1427, + "step": 1824 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4006848132334979e-05, + "loss": 0.2076, + "step": 1826 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4019648385012245e-05, + "loss": 0.2774, + "step": 1828 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4032440787229135e-05, + "loss": 0.2814, + "step": 1830 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.0472, + "step": 1832 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4058001940361781e-05, + "loss": 0.1134, + "step": 1834 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.1165, + "step": 1836 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.40835313920471e-05, + "loss": 0.5222, + "step": 1838 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.093, + "step": 1840 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4109028942846888e-05, + "loss": 0.3191, + "step": 1842 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4121765693158355e-05, + "loss": 0.0727, + "step": 1844 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4134494393572146e-05, + "loss": 0.4688, + "step": 1846 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.473, + "step": 1848 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4159927545284697e-05, + "loss": 0.3694, + "step": 1850 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.3039, + "step": 1852 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4185328199298636e-05, + "loss": 0.5704, + "step": 1854 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.2662, + "step": 1856 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4210696157181936e-05, + "loss": 0.1665, + "step": 1858 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4223367813134406e-05, + "loss": 0.3147, + "step": 1860 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4236031220758037e-05, + "loss": 0.3042, + "step": 1862 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.508, + "step": 1864 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.426133319210731e-05, + "loss": 0.382, + "step": 1866 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.2046, + "step": 1868 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.4286601873568642e-05, + "loss": 0.2612, + "step": 1870 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.2237, + "step": 1872 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.431183706774103e-05, + "loss": 0.1515, + "step": 1874 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.4324442045477534e-05, + "loss": 0.1314, + "step": 1876 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4337038577485035e-05, + "loss": 0.3572, + "step": 1878 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.2457, + "step": 1880 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.436220620592437e-05, + "loss": 0.2285, + "step": 1882 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.1202, + "step": 1884 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4387339756447422e-05, + "loss": 0.1896, + "step": 1886 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.2798, + "step": 1888 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4412439032708848e-05, + "loss": 0.4395, + "step": 1890 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4424975756706684e-05, + "loss": 0.7018, + "step": 1892 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4437503838631002e-05, + "loss": 0.1509, + "step": 1894 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.2884, + "step": 1896 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4462533978405529e-05, + "loss": 0.1681, + "step": 1898 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.6103, + "step": 1900 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4487529256494937e-05, + "loss": 0.4381, + "step": 1902 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.0341, + "step": 1904 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4512489477634024e-05, + "loss": 0.2503, + "step": 1906 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4524956380901674e-05, + "loss": 0.5279, + "step": 1908 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.4537414446831461e-05, + "loss": 0.3645, + "step": 1910 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.2444, + "step": 1912 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4562303969371357e-05, + "loss": 0.6764, + "step": 1914 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.1493, + "step": 1916 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4587157850814679e-05, + "loss": 0.4251, + "step": 1918 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.5829, + "step": 1920 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4611975897000849e-05, + "loss": 0.1838, + "step": 1922 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4624371421273812e-05, + "loss": 0.4492, + "step": 1924 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.463675791404922e-05, + "loss": 0.1978, + "step": 1926 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.2612, + "step": 1928 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4661503708360652e-05, + "loss": 0.2757, + "step": 1930 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.1295, + "step": 1932 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4686213086618932e-05, + "loss": 0.1662, + "step": 1934 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.5028, + "step": 1936 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4710885855792338e-05, + "loss": 0.3159, + "step": 1938 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4723208451727977e-05, + "loss": 0.2233, + "step": 1940 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4735521823135184e-05, + "loss": 0.7031, + "step": 1942 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.5282, + "step": 1944 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4760120796189233e-05, + "loss": 0.2355, + "step": 1946 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.2714, + "step": 1948 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4784682582785254e-05, + "loss": 0.098, + "step": 1950 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.0441, + "step": 1952 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4809206991044571e-05, + "loss": 0.4596, + "step": 1954 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4821455118415666e-05, + "loss": 0.48, + "step": 1956 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4833693829380458e-05, + "loss": 0.602, + "step": 1958 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.2665, + "step": 1960 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4858142906499686e-05, + "loss": 0.2872, + "step": 1962 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.2957, + "step": 1964 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4882554031404075e-05, + "loss": 0.3213, + "step": 1966 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.4493, + "step": 1968 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4906927013391879e-05, + "loss": 0.1186, + "step": 1970 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4919099141279205e-05, + "loss": 0.0231, + "step": 1972 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4931261662059338e-05, + "loss": 0.5704, + "step": 1974 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.1527, + "step": 1976 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4955557787302151e-05, + "loss": 0.2607, + "step": 1978 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.4968, + "step": 1980 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.4979815199317005e-05, + "loss": 0.3033, + "step": 1982 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.3226, + "step": 1984 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5004033708602967e-05, + "loss": 0.2159, + "step": 1986 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5016128315586626e-05, + "loss": 0.3821, + "step": 1988 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5028213125963029e-05, + "loss": 0.3776, + "step": 1990 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.3598, + "step": 1992 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5052353262505603e-05, + "loss": 0.1422, + "step": 1994 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.2193, + "step": 1996 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5076453929645933e-05, + "loss": 0.2446, + "step": 1998 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.091, + "step": 2000 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.510051493910759e-05, + "loss": 0.1945, + "step": 2002 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.5112530513457229e-05, + "loss": 0.4264, + "step": 2004 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.512453610292401e-05, + "loss": 0.3213, + "step": 2006 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.4588, + "step": 2008 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.514851723343985e-05, + "loss": 0.2394, + "step": 2010 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.2162, + "step": 2012 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5172458143312522e-05, + "loss": 0.2277, + "step": 2014 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.1239, + "step": 2016 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5196358645513685e-05, + "loss": 0.3698, + "step": 2018 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5208293685377354e-05, + "loss": 0.6308, + "step": 2020 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5220218553330618e-05, + "loss": 0.1513, + "step": 2022 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.1977, + "step": 2024 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5244037680367744e-05, + "loss": 0.1753, + "step": 2026 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.154, + "step": 2028 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.5267815840548057e-05, + "loss": 0.352, + "step": 2030 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.1966, + "step": 2032 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.529155284811463e-05, + "loss": 0.2521, + "step": 2034 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.5303405861706574e-05, + "loss": 0.3046, + "step": 2036 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.5315248517631975e-05, + "loss": 0.6125, + "step": 2038 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.0514, + "step": 2040 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5338902663987544e-05, + "loss": 0.1347, + "step": 2042 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.0644, + "step": 2044 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5362515102393217e-05, + "loss": 0.7223, + "step": 2046 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.0498, + "step": 2048 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.5386085648386656e-05, + "loss": 0.151, + "step": 2050 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.539785515417376e-05, + "loss": 0.2163, + "step": 2052 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.540961411783279e-05, + "loss": 0.3696, + "step": 2054 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.4409, + "step": 2056 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5433100326925288e-05, + "loss": 0.4636, + "step": 2058 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.4201, + "step": 2060 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.545654409218793e-05, + "loss": 0.1682, + "step": 2062 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.133, + "step": 2064 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5479945230476066e-05, + "loss": 0.1679, + "step": 2066 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5491629757363026e-05, + "loss": 0.0951, + "step": 2068 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.550330355897809e-05, + "loss": 0.1947, + "step": 2070 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.3252, + "step": 2072 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5526618895216793e-05, + "loss": 0.3031, + "step": 2074 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.6415, + "step": 2076 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5549891057050837e-05, + "loss": 0.2067, + "step": 2078 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.0631, + "step": 2080 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.557311986267615e-05, + "loss": 0.5288, + "step": 2082 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.5584717950189353e-05, + "loss": 0.5654, + "step": 2084 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5596305130627404e-05, + "loss": 0.3229, + "step": 2086 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.2014, + "step": 2088 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.5619446679779357e-05, + "loss": 0.2087, + "step": 2090 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.563100100329731e-05, + "loss": 0.4383, + "step": 2092 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.564254432934829e-05, + "loss": 0.8945, + "step": 2094 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.0433, + "step": 2096 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.5665597898893484e-05, + "loss": 0.4326, + "step": 2098 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.567710809736356e-05, + "loss": 0.2864, + "step": 2100 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.568860720831853e-05, + "loss": 0.3651, + "step": 2102 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.1667, + "step": 2104 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.5711572077872774e-05, + "loss": 0.3225, + "step": 2106 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.572303779162118e-05, + "loss": 0.2241, + "step": 2108 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.573449232815279e-05, + "loss": 0.3348, + "step": 2110 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.1496, + "step": 2112 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5757367780103666e-05, + "loss": 0.2714, + "step": 2114 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5768788650846677e-05, + "loss": 0.4636, + "step": 2116 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5780198255020478e-05, + "loss": 0.397, + "step": 2118 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.1761, + "step": 2120 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.580298357454965e-05, + "loss": 0.2564, + "step": 2122 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.581435924540481e-05, + "loss": 0.4445, + "step": 2124 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5825723560690403e-05, + "loss": 0.238, + "step": 2126 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.3033, + "step": 2128 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5848418035796068e-05, + "loss": 0.2188, + "step": 2130 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5859748151293333e-05, + "loss": 0.3723, + "step": 2132 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.587106682257552e-05, + "loss": 0.2064, + "step": 2134 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.226, + "step": 2136 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5893669744094577e-05, + "loss": 0.1439, + "step": 2138 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.0465, + "step": 2140 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.591622662377734e-05, + "loss": 0.3633, + "step": 2142 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.1977, + "step": 2144 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.5938737285407567e-05, + "loss": 0.3223, + "step": 2146 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.594997522948412e-05, + "loss": 0.8772, + "step": 2148 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5961201553130148e-05, + "loss": 0.1245, + "step": 2150 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.0522, + "step": 2152 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.598361925145234e-05, + "loss": 0.1581, + "step": 2154 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.599481058234626e-05, + "loss": 0.2856, + "step": 2156 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.6005990205245216e-05, + "loss": 0.415, + "step": 2158 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.5578, + "step": 2160 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.602831423974506e-05, + "loss": 0.1504, + "step": 2162 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.6039458607746607e-05, + "loss": 0.4326, + "step": 2164 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.6050591180554648e-05, + "loss": 0.1494, + "step": 2166 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.1186, + "step": 2168 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6072820853644677e-05, + "loss": 0.1952, + "step": 2170 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.5925, + "step": 2172 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6095003085355082e-05, + "loss": 0.2538, + "step": 2174 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.343, + "step": 2176 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.611713770239646e-05, + "loss": 0.2065, + "step": 2178 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.6128187101364982e-05, + "loss": 0.5187, + "step": 2180 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6139224531851332e-05, + "loss": 0.1274, + "step": 2182 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.0868, + "step": 2184 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.616126340117555e-05, + "loss": 0.3844, + "step": 2186 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.617226479697104e-05, + "loss": 0.3451, + "step": 2188 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.618325413819966e-05, + "loss": 0.1991, + "step": 2190 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.0863, + "step": 2192 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.6205196571130194e-05, + "loss": 0.2099, + "step": 2194 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.621614961997806e-05, + "loss": 0.6277, + "step": 2196 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6227090528551034e-05, + "loss": 0.0712, + "step": 2198 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.0873, + "step": 2200 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.62489358394248e-05, + "loss": 0.256, + "step": 2202 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.0645, + "step": 2204 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6270732333094095e-05, + "loss": 0.2258, + "step": 2206 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.4092, + "step": 2208 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6292479839282897e-05, + "loss": 0.2853, + "step": 2210 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6303335168965474e-05, + "loss": 0.7046, + "step": 2212 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6314178188097907e-05, + "loss": 0.5072, + "step": 2214 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.2541, + "step": 2216 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6335827210029816e-05, + "loss": 0.6665, + "step": 2218 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.5315, + "step": 2220 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.635742673595467e-05, + "loss": 0.545, + "step": 2222 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.0607, + "step": 2224 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6378976597135173e-05, + "loss": 0.1846, + "step": 2226 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6389732850821957e-05, + "loss": 0.4225, + "step": 2228 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.640047662522205e-05, + "loss": 0.3091, + "step": 2230 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.2481, + "step": 2232 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6421926652255282e-05, + "loss": 0.2102, + "step": 2234 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.1568, + "step": 2236 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6443326510665474e-05, + "loss": 0.2707, + "step": 2238 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.0835, + "step": 2240 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.646467603327518e-05, + "loss": 0.219, + "step": 2242 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.6475331866519377e-05, + "loss": 0.2712, + "step": 2244 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6485975053300154e-05, + "loss": 0.4228, + "step": 2246 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.4714, + "step": 2248 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.650722340435067e-05, + "loss": 0.5088, + "step": 2250 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.1424, + "step": 2252 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.652842092043287e-05, + "loss": 0.1763, + "step": 2254 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.1936, + "step": 2256 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6549567435950004e-05, + "loss": 0.8804, + "step": 2258 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6560121516856586e-05, + "loss": 0.5306, + "step": 2260 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6570662785703713e-05, + "loss": 0.2049, + "step": 2262 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.8036, + "step": 2264 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6591706804895408e-05, + "loss": 0.2407, + "step": 2266 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.1464, + "step": 2268 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6612699329127457e-05, + "loss": 0.198, + "step": 2270 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.1756, + "step": 2272 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6633640194404523e-05, + "loss": 0.5481, + "step": 2274 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6644091203796694e-05, + "loss": 0.0634, + "step": 2276 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6654529237134816e-05, + "loss": 0.1461, + "step": 2278 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.217, + "step": 2280 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.667536629413143e-05, + "loss": 0.1683, + "step": 2282 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.4839, + "step": 2284 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6696151202613527e-05, + "loss": 0.1511, + "step": 2286 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.5398, + "step": 2288 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6716883800207685e-05, + "loss": 0.2436, + "step": 2290 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6727230431791806e-05, + "loss": 0.2422, + "step": 2292 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.673756392494915e-05, + "loss": 0.1677, + "step": 2294 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.1576, + "step": 2296 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6758191415283063e-05, + "loss": 0.4196, + "step": 2298 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.1937, + "step": 2300 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6778766110065755e-05, + "loss": 0.1729, + "step": 2302 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.2618, + "step": 2304 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6799287848566e-05, + "loss": 0.5152, + "step": 2306 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6809528809094798e-05, + "loss": 0.6898, + "step": 2308 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6819756470466305e-05, + "loss": 0.4229, + "step": 2310 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.2503, + "step": 2312 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.684017181586408e-05, + "loss": 0.1583, + "step": 2314 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.2153, + "step": 2316 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6860533725272943e-05, + "loss": 0.1977, + "step": 2318 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.1865, + "step": 2320 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.6880842039624e-05, + "loss": 0.1416, + "step": 2322 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.689097604905826e-05, + "loss": 0.3322, + "step": 2324 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6901096600267e-05, + "loss": 0.202, + "step": 2326 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.206, + "step": 2328 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6921297248971645e-05, + "loss": 0.1342, + "step": 2330 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.2286, + "step": 2332 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.6941443827928778e-05, + "loss": 0.3483, + "step": 2334 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.0824, + "step": 2336 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6961536179751672e-05, + "loss": 0.5755, + "step": 2338 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6971561971420222e-05, + "loss": 0.1356, + "step": 2340 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6981574147477204e-05, + "loss": 0.3308, + "step": 2342 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.255, + "step": 2344 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.70015575745671e-05, + "loss": 0.8065, + "step": 2346 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.701152878657196e-05, + "loss": 0.3164, + "step": 2348 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7021486304909196e-05, + "loss": 0.2188, + "step": 2350 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.4395, + "step": 2352 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.7041360182818583e-05, + "loss": 0.1784, + "step": 2354 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.705127650357662e-05, + "loss": 0.246, + "step": 2356 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7061179053038887e-05, + "loss": 0.6699, + "step": 2358 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.4094, + "step": 2360 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.708094276074343e-05, + "loss": 0.9815, + "step": 2362 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.0879, + "step": 2364 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7100651151536525e-05, + "loss": 0.1681, + "step": 2366 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.2034, + "step": 2368 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.712030407145457e-05, + "loss": 0.4398, + "step": 2370 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.713010968184029e-05, + "loss": 0.1583, + "step": 2372 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7139901366967332e-05, + "loss": 0.2618, + "step": 2374 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.0996, + "step": 2376 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.71594428849791e-05, + "loss": 0.3188, + "step": 2378 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.716919267969883e-05, + "loss": 0.3078, + "step": 2380 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.717892847282994e-05, + "loss": 0.298, + "step": 2382 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.4512, + "step": 2384 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7198357978296817e-05, + "loss": 0.2024, + "step": 2386 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7208051652686338e-05, + "loss": 0.1444, + "step": 2388 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.721773124959481e-05, + "loss": 0.3035, + "step": 2390 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.1482, + "step": 2392 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.723704813537834e-05, + "loss": 0.2527, + "step": 2394 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.2582, + "step": 2396 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.725630848474229e-05, + "loss": 0.5233, + "step": 2398 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.0761, + "step": 2400 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.727551214722321e-05, + "loss": 0.232, + "step": 2402 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.7285092673983753e-05, + "loss": 0.74, + "step": 2404 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.7294658972800488e-05, + "loss": 0.2872, + "step": 2406 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.1952, + "step": 2408 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7313748811897558e-05, + "loss": 0.4487, + "step": 2410 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.1421, + "step": 2412 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.7332781515383003e-05, + "loss": 0.2526, + "step": 2414 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.1204, + "step": 2416 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.7351756934571758e-05, + "loss": 0.3345, + "step": 2418 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.736122311621314e-05, + "loss": 0.3129, + "step": 2420 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.7370674921226296e-05, + "loss": 0.4167, + "step": 2422 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.4551, + "step": 2424 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7389535327557733e-05, + "loss": 0.554, + "step": 2426 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.4983, + "step": 2428 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7408338006227005e-05, + "loss": 0.4439, + "step": 2430 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.1658, + "step": 2432 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.7427082810346018e-05, + "loss": 0.2466, + "step": 2434 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.743643346367026e-05, + "loss": 0.2512, + "step": 2436 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.744576959347884e-05, + "loss": 0.238, + "step": 2438 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.2271, + "step": 2440 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.746439820964275e-05, + "loss": 0.1682, + "step": 2442 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.1304, + "step": 2444 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.748296851330945e-05, + "loss": 0.205, + "step": 2446 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.1145, + "step": 2448 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7501480359406217e-05, + "loss": 0.1598, + "step": 2450 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7510714315655474e-05, + "loss": 0.1798, + "step": 2452 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.7519933603316955e-05, + "loss": 0.4794, + "step": 2454 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.0358, + "step": 2456 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7538328100883397e-05, + "loss": 0.5099, + "step": 2458 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.4349, + "step": 2460 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.7556663708406193e-05, + "loss": 0.1685, + "step": 2462 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.202, + "step": 2464 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.7574940282646085e-05, + "loss": 0.4309, + "step": 2466 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.758405638764873e-05, + "loss": 0.5622, + "step": 2468 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7593157680824946e-05, + "loss": 0.1961, + "step": 2470 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.1869, + "step": 2472 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.761131576062694e-05, + "loss": 0.1668, + "step": 2474 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.2749, + "step": 2476 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7629414380199662e-05, + "loss": 0.2235, + "step": 2478 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.1031, + "step": 2480 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7647453398155194e-05, + "loss": 0.3745, + "step": 2482 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7656450512470077e-05, + "loss": 0.1008, + "step": 2484 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7665432673571218e-05, + "loss": 0.7476, + "step": 2486 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 1.0966, + "step": 2488 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.768335206599217e-05, + "loss": 0.775, + "step": 2490 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.7692289262315e-05, + "loss": 0.2311, + "step": 2492 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.7701211435430256e-05, + "loss": 0.391, + "step": 2494 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.5734, + "step": 2496 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.771901064236659e-05, + "loss": 0.2815, + "step": 2498 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.7727887641425448e-05, + "loss": 0.2396, + "step": 2500 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.773674954775232e-05, + "loss": 0.1344, + "step": 2502 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.3452, + "step": 2504 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7754428013009637e-05, + "loss": 0.1867, + "step": 2506 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.3293, + "step": 2508 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.77720459000329e-05, + "loss": 0.2863, + "step": 2510 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.3385, + "step": 2512 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7789603071189712e-05, + "loss": 0.3413, + "step": 2514 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7798358845437754e-05, + "loss": 0.3642, + "step": 2516 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.780709938932202e-05, + "loss": 0.2378, + "step": 2518 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.2243, + "step": 2520 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7824534717747115e-05, + "loss": 0.1958, + "step": 2522 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.4071, + "step": 2524 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7841908920258767e-05, + "loss": 0.5398, + "step": 2526 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.3774, + "step": 2528 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.7859221861128284e-05, + "loss": 0.1713, + "step": 2530 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.786785531616285e-05, + "loss": 0.535, + "step": 2532 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7876473405105528e-05, + "loss": 0.3927, + "step": 2534 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.1844, + "step": 2536 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.7893663417419995e-05, + "loss": 0.3155, + "step": 2538 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.790223530721933e-05, + "loss": 0.2506, + "step": 2540 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791079176378191e-05, + "loss": 0.2709, + "step": 2542 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.349, + "step": 2544 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7927858310383202e-05, + "loss": 0.3345, + "step": 2546 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7936368367090577e-05, + "loss": 0.1962, + "step": 2548 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.794486292389858e-05, + "loss": 0.3233, + "step": 2550 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.2564, + "step": 2552 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7961805471486618e-05, + "loss": 0.2419, + "step": 2554 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.4263, + "step": 2556 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.797868582079072e-05, + "loss": 0.5227, + "step": 2558 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.091, + "step": 2560 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.7995503839940197e-05, + "loss": 0.3153, + "step": 2562 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.800388943463047e-05, + "loss": 0.0395, + "step": 2564 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8012259397551283e-05, + "loss": 0.1586, + "step": 2566 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.1021, + "step": 2568 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.8028952362728197e-05, + "loss": 0.0406, + "step": 2570 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.803727533238257e-05, + "loss": 0.0778, + "step": 2572 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.804558260506409e-05, + "loss": 0.194, + "step": 2574 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.1681, + "step": 2576 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8062149994642135e-05, + "loss": 0.1868, + "step": 2578 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8070410079182195e-05, + "loss": 0.0327, + "step": 2580 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8078654402036526e-05, + "loss": 0.1116, + "step": 2582 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.0425, + "step": 2584 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8095095698313452e-05, + "loss": 0.1516, + "step": 2586 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.1115, + "step": 2588 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811147375503214e-05, + "loss": 1.0942, + "step": 2590 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.0614, + "step": 2592 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.812778844424587e-05, + "loss": 0.2463, + "step": 2594 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.813592198619035e-05, + "loss": 0.472, + "step": 2596 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.814403963850293e-05, + "loss": 0.1681, + "step": 2598 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.1759, + "step": 2600 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.8160227210847636e-05, + "loss": 0.144, + "step": 2602 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.816829709926509e-05, + "loss": 0.1328, + "step": 2604 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8176351034821345e-05, + "loss": 0.3637, + "step": 2606 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.3628, + "step": 2608 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.819241098446341e-05, + "loss": 0.5551, + "step": 2610 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.8200416967183785e-05, + "loss": 0.1671, + "step": 2612 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.8208406934312167e-05, + "loss": 0.3377, + "step": 2614 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.1464, + "step": 2616 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8224338759405917e-05, + "loss": 0.733, + "step": 2618 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.1686, + "step": 2620 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8240206335283947e-05, + "loss": 0.29, + "step": 2622 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.362, + "step": 2624 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.825600953798743e-05, + "loss": 0.4373, + "step": 2626 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.8263886960799055e-05, + "loss": 0.038, + "step": 2628 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8271748244060426e-05, + "loss": 0.2377, + "step": 2630 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.2393, + "step": 2632 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.8287422330550878e-05, + "loss": 0.6519, + "step": 2634 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.829523510316813e-05, + "loss": 0.4658, + "step": 2636 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8303031675011515e-05, + "loss": 0.3502, + "step": 2638 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.3361, + "step": 2640 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.8318576155500838e-05, + "loss": 0.2481, + "step": 2642 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.832632403378808e-05, + "loss": 0.3521, + "step": 2644 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.8334055650584094e-05, + "loss": 0.1969, + "step": 2646 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.1755, + "step": 2648 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8349470039334173e-05, + "loss": 0.5506, + "step": 2650 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.2311, + "step": 2652 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.83648192013326e-05, + "loss": 0.2634, + "step": 2654 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.3706, + "step": 2656 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8380103016670437e-05, + "loss": 0.5143, + "step": 2658 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8387720382009665e-05, + "loss": 0.1756, + "step": 2660 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.839532136594927e-05, + "loss": 0.2278, + "step": 2662 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.1966, + "step": 2664 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8410474130282085e-05, + "loss": 0.5232, + "step": 2666 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.2224, + "step": 2668 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8425561191294217e-05, + "loss": 0.1512, + "step": 2670 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.2844, + "step": 2672 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.8440582431124325e-05, + "loss": 0.1787, + "step": 2674 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.844806833140501e-05, + "loss": 0.2778, + "step": 2676 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8455537732425223e-05, + "loss": 0.2382, + "step": 2678 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.1623, + "step": 2680 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.847042697836485e-05, + "loss": 0.2065, + "step": 2682 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.1673, + "step": 2684 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.84852500526272e-05, + "loss": 0.4928, + "step": 2686 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.1676, + "step": 2688 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.8500006839413183e-05, + "loss": 0.1712, + "step": 2690 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.85073603389569e-05, + "loss": 0.0793, + "step": 2692 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.851469722344155e-05, + "loss": 0.1865, + "step": 2694 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.1241, + "step": 2696 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8529321089949817e-05, + "loss": 1.0657, + "step": 2698 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.3722, + "step": 2700 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.8543878324695122e-05, + "loss": 0.2222, + "step": 2702 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.1297, + "step": 2704 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.8558368813955143e-05, + "loss": 0.1843, + "step": 2706 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.856558899363248e-05, + "loss": 0.0395, + "step": 2708 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.857279244452896e-05, + "loss": 0.177, + "step": 2710 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.1148, + "step": 2712 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.8587149103738e-05, + "loss": 0.5083, + "step": 2714 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.85943022840117e-05, + "loss": 0.0513, + "step": 2716 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8601438679426847e-05, + "loss": 0.5416, + "step": 2718 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.1071, + "step": 2720 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.8615661059964134e-05, + "loss": 0.3568, + "step": 2722 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.862274701730967e-05, + "loss": 0.2019, + "step": 2724 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.862981613424347e-05, + "loss": 0.2101, + "step": 2726 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.1669, + "step": 2728 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.864390379168423e-05, + "loss": 0.3327, + "step": 2730 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.865092230467769e-05, + "loss": 0.4152, + "step": 2732 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.8657923922232464e-05, + "loss": 0.6162, + "step": 2734 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.1787, + "step": 2736 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8671876416361763e-05, + "loss": 0.1753, + "step": 2738 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.1863, + "step": 2740 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8685761165074073e-05, + "loss": 0.2162, + "step": 2742 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.3516, + "step": 2744 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.869957805990059e-05, + "loss": 0.4707, + "step": 2746 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.87064610283551e-05, + "loss": 0.4434, + "step": 2748 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.87133269929026e-05, + "loss": 0.3514, + "step": 2750 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.4731, + "step": 2752 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.8727007856672285e-05, + "loss": 0.1661, + "step": 2754 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.873382272917545e-05, + "loss": 0.2822, + "step": 2756 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8740620544333607e-05, + "loss": 0.1664, + "step": 2758 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.6414, + "step": 2760 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.875416494954312e-05, + "loss": 0.1342, + "step": 2762 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.876091151314196e-05, + "loss": 0.256, + "step": 2764 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.8767640966490813e-05, + "loss": 0.6766, + "step": 2766 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.0962, + "step": 2768 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.878104848990093e-05, + "loss": 0.4713, + "step": 2770 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.8787726533776996e-05, + "loss": 0.042, + "step": 2772 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.879438741503277e-05, + "loss": 0.6157, + "step": 2774 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.3977, + "step": 2776 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8807657637681563e-05, + "loss": 0.3495, + "step": 2778 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.3486, + "step": 2780 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8820859054179225e-05, + "loss": 0.3993, + "step": 2782 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.1276, + "step": 2784 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.883399156139519e-05, + "loss": 0.1528, + "step": 2786 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.8840531941941415e-05, + "loss": 0.0514, + "step": 2788 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8847055056737233e-05, + "loss": 0.1769, + "step": 2790 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.1586, + "step": 2792 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8860049438152244e-05, + "loss": 0.1765, + "step": 2794 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.27, + "step": 2796 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8872974604127025e-05, + "loss": 0.1669, + "step": 2798 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.1201, + "step": 2800 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.8885830453689132e-05, + "loss": 0.5391, + "step": 2802 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.889223235340958e-05, + "loss": 0.1809, + "step": 2804 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.889861688640759e-05, + "loss": 0.2612, + "step": 2806 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.5066, + "step": 2808 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.891133380239373e-05, + "loss": 0.2505, + "step": 2810 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.2059, + "step": 2812 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.892398110230194e-05, + "loss": 0.5197, + "step": 2814 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.2614, + "step": 2816 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.8936558687330485e-05, + "loss": 0.2859, + "step": 2818 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.894282130603823e-05, + "loss": 0.1955, + "step": 2820 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8949066459222217e-05, + "loss": 0.4254, + "step": 2822 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.3996, + "step": 2824 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.8961504320265382e-05, + "loss": 0.3123, + "step": 2826 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.896769700383315e-05, + "loss": 0.5108, + "step": 2828 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.897387217329439e-05, + "loss": 0.2395, + "step": 2830 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.1866, + "step": 2832 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.8986169921690543e-05, + "loss": 0.224, + "step": 2834 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.899229247660769e-05, + "loss": 0.0974, + "step": 2836 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.899839746938281e-05, + "loss": 0.2569, + "step": 2838 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.1579, + "step": 2840 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.9010554720848577e-05, + "loss": 0.3427, + "step": 2842 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.901660695579585e-05, + "loss": 0.0411, + "step": 2844 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9022641581114392e-05, + "loss": 0.5222, + "step": 2846 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.3942, + "step": 2848 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9034657955756695e-05, + "loss": 0.2259, + "step": 2850 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9040639681612212e-05, + "loss": 0.2525, + "step": 2852 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.904660375090257e-05, + "loss": 0.5569, + "step": 2854 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.2262, + "step": 2856 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.905847887323049e-05, + "loss": 0.2463, + "step": 2858 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.2776, + "step": 2860 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9070283229971007e-05, + "loss": 0.5323, + "step": 2862 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.2051, + "step": 2864 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9082016728907496e-05, + "loss": 0.9686, + "step": 2866 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9087856878032886e-05, + "loss": 0.3131, + "step": 2868 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909367927837691e-05, + "loss": 0.4471, + "step": 2870 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.2458, + "step": 2872 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.910527078727044e-05, + "loss": 0.2416, + "step": 2874 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.911103987318148e-05, + "loss": 0.2777, + "step": 2876 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.911679116503425e-05, + "loss": 0.3449, + "step": 2878 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.2106, + "step": 2880 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.9128240321670208e-05, + "loss": 0.2166, + "step": 2882 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.913393816409294e-05, + "loss": 0.3659, + "step": 2884 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.913961816773655e-05, + "loss": 0.4227, + "step": 2886 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.1546, + "step": 2888 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9150924614348588e-05, + "loss": 0.1763, + "step": 2890 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.55, + "step": 2892 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.916215957317944e-05, + "loss": 0.6128, + "step": 2894 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.0501, + "step": 2896 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9173322956460675e-05, + "loss": 0.1496, + "step": 2898 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9178877779995423e-05, + "loss": 0.0263, + "step": 2900 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9184414676983006e-05, + "loss": 0.4598, + "step": 2902 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.2697, + "step": 2904 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9195434648097003e-05, + "loss": 0.4029, + "step": 2906 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.1956, + "step": 2908 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9206382783713738e-05, + "loss": 0.2976, + "step": 2910 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.1694, + "step": 2912 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.921725899830547e-05, + "loss": 0.3611, + "step": 2914 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.9222670108643146e-05, + "loss": 0.2782, + "step": 2916 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.92280632069063e-05, + "loss": 0.2271, + "step": 2918 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.135, + "step": 2920 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.1275, + "step": 2922 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.924413432409622e-05, + "loss": 0.3228, + "step": 2924 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.924945526908497e-05, + "loss": 0.519, + "step": 2926 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.121, + "step": 2928 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.9260042955546237e-05, + "loss": 0.6254, + "step": 2930 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.926530967634078e-05, + "loss": 0.1059, + "step": 2932 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9270558301784795e-05, + "loss": 0.4969, + "step": 2934 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.1946, + "step": 2936 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9281001225653887e-05, + "loss": 0.3149, + "step": 2938 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.4259, + "step": 2940 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9291371645572517e-05, + "loss": 0.4693, + "step": 2942 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.0127, + "step": 2944 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9301669480526115e-05, + "loss": 0.4233, + "step": 2946 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9306791153479004e-05, + "loss": 0.984, + "step": 2948 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.931189465006714e-05, + "loss": 0.2328, + "step": 2950 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.2282, + "step": 2952 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.9322047074315717e-05, + "loss": 0.6037, + "step": 2954 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.932709598214825e-05, + "loss": 0.144, + "step": 2956 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9332126673960262e-05, + "loss": 0.2022, + "step": 2958 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.1427, + "step": 2960 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.934213337025812e-05, + "loss": 0.0921, + "step": 2962 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.9347109355200672e-05, + "loss": 0.0734, + "step": 2964 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.9352067085036145e-05, + "loss": 0.1581, + "step": 2966 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.1153, + "step": 2968 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9361927740691327e-05, + "loss": 0.2135, + "step": 2970 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.1675, + "step": 2972 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.937171526019142e-05, + "loss": 0.4079, + "step": 2974 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.1283, + "step": 2976 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9381429567075504e-05, + "loss": 0.2776, + "step": 2978 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9386259242048883e-05, + "loss": 0.3433, + "step": 2980 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.93910705854546e-05, + "loss": 0.5167, + "step": 2982 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.2453, + "step": 2984 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.9400638240012294e-05, + "loss": 0.2479, + "step": 2986 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.940539453247842e-05, + "loss": 0.1008, + "step": 2988 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9410132456005262e-05, + "loss": 0.8446, + "step": 2990 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.0862, + "step": 2992 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9419553159263896e-05, + "loss": 0.2936, + "step": 2994 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9424235920596863e-05, + "loss": 0.0187, + "step": 2996 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.94289002761929e-05, + "loss": 0.3349, + "step": 2998 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.0936, + "step": 3000 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.943817373377181e-05, + "loss": 0.3229, + "step": 3002 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.944278281764342e-05, + "loss": 0.1426, + "step": 3004 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.944737345955561e-05, + "loss": 0.2377, + "step": 3006 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.1163, + "step": 3008 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.945649938167528e-05, + "loss": 0.1426, + "step": 3010 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.1146, + "step": 3012 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.946555142883836e-05, + "loss": 0.3716, + "step": 3014 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.1954, + "step": 3016 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9474529530329507e-05, + "loss": 0.4179, + "step": 3018 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.2286, + "step": 3020 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.9483433616011047e-05, + "loss": 0.5044, + "step": 3022 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.4513, + "step": 3024 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9492263616323533e-05, + "loss": 0.3038, + "step": 3026 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9496650812887286e-05, + "loss": 0.2418, + "step": 3028 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9501019462286263e-05, + "loss": 0.6364, + "step": 3030 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.0595, + "step": 3032 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.9509701085497842e-05, + "loss": 0.4716, + "step": 3034 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.951401404235505e-05, + "loss": 0.749, + "step": 3036 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.9518308418136718e-05, + "loss": 0.3639, + "step": 3038 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.7821, + "step": 3040 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.952684139296169e-05, + "loss": 0.2778, + "step": 3042 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.9531079975339912e-05, + "loss": 0.0734, + "step": 3044 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.9535299943312455e-05, + "loss": 0.3615, + "step": 3046 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.1937, + "step": 3048 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9543684003110105e-05, + "loss": 0.1667, + "step": 3050 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.261, + "step": 3052 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9551993506857688e-05, + "loss": 0.1349, + "step": 3054 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.5711, + "step": 3056 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.9560228389640664e-05, + "loss": 0.2478, + "step": 3058 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.956431782804402e-05, + "loss": 0.1349, + "step": 3060 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.956838858712744e-05, + "loss": 0.2274, + "step": 3062 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.1054, + "step": 3064 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9576474035569892e-05, + "loss": 0.3092, + "step": 3066 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.1817, + "step": 3068 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9584484671803818e-05, + "loss": 0.2106, + "step": 3070 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.1778, + "step": 3072 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9592420433249462e-05, + "loss": 0.2018, + "step": 3074 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9596360216530436e-05, + "loss": 0.5225, + "step": 3076 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9600281257912e-05, + "loss": 0.1496, + "step": 3078 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.208, + "step": 3080 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.960806708438202e-05, + "loss": 0.2119, + "step": 3082 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.961193185426459e-05, + "loss": 0.5395, + "step": 3084 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9615777851836003e-05, + "loss": 0.2192, + "step": 3086 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.4713, + "step": 3088 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.962341350003679e-05, + "loss": 0.6812, + "step": 3090 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.9627203135753573e-05, + "loss": 0.1647, + "step": 3092 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9630973969334068e-05, + "loss": 0.4365, + "step": 3094 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.205, + "step": 3096 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9638459200664822e-05, + "loss": 0.1851, + "step": 3098 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.1681, + "step": 3100 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.9645869135553806e-05, + "loss": 0.2529, + "step": 3102 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.1761, + "step": 3104 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.965320371611399e-05, + "loss": 0.2018, + "step": 3106 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.0646, + "step": 3108 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.9660462885047032e-05, + "loss": 0.1787, + "step": 3110 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.2307, + "step": 3112 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.9667646585643703e-05, + "loss": 0.5142, + "step": 3114 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.967121011775546e-05, + "loss": 0.0361, + "step": 3116 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967475476178433e-05, + "loss": 0.4309, + "step": 3118 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.2464, + "step": 3120 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9681787357939254e-05, + "loss": 0.6014, + "step": 3122 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9685275296330497e-05, + "loss": 0.5234, + "step": 3124 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.968874431916926e-05, + "loss": 0.4965, + "step": 3126 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.091, + "step": 3128 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969562559112598e-05, + "loss": 0.2164, + "step": 3130 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969903782680467e-05, + "loss": 0.716, + "step": 3132 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.970243112005235e-05, + "loss": 0.2961, + "step": 3134 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.3265, + "step": 3136 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.970916085278302e-05, + "loss": 0.2236, + "step": 3138 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.5319, + "step": 3140 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.9715814736744755e-05, + "loss": 0.1162, + "step": 3142 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.0386, + "step": 3144 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9722392719956864e-05, + "loss": 0.1987, + "step": 3146 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.2876, + "step": 3148 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9728894751031595e-05, + "loss": 0.3842, + "step": 3150 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.7951, + "step": 3152 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9735320779174545e-05, + "loss": 0.3069, + "step": 3154 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9738505276435692e-05, + "loss": 0.1682, + "step": 3156 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.974167075418505e-05, + "loss": 0.2534, + "step": 3158 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.2263, + "step": 3160 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9747944626456577e-05, + "loss": 0.1977, + "step": 3162 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.1496, + "step": 3164 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.975414234697712e-05, + "loss": 0.0671, + "step": 3166 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.0523, + "step": 3168 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9760263867329568e-05, + "loss": 0.1675, + "step": 3170 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9763296037475174e-05, + "loss": 0.6791, + "step": 3172 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.97663091396921e-05, + "loss": 0.1423, + "step": 3174 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.3445, + "step": 3176 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9772278116838543e-05, + "loss": 0.2066, + "step": 3178 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.1515, + "step": 3180 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.977817075213876e-05, + "loss": 0.4071, + "step": 3182 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.2463, + "step": 3184 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9783986999558994e-05, + "loss": 0.1663, + "step": 3186 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9786866463591732e-05, + "loss": 0.62, + "step": 3188 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9789726813662233e-05, + "loss": 0.1948, + "step": 3190 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.4769, + "step": 3192 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.979539014960858e-05, + "loss": 0.1599, + "step": 3194 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.1502, + "step": 3196 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9800976963155584e-05, + "loss": 0.549, + "step": 3198 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.1827, + "step": 3200 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.980648721065859e-05, + "loss": 0.2668, + "step": 3202 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.9809213608668185e-05, + "loss": 0.2382, + "step": 3204 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9811920849071092e-05, + "loss": 0.2692, + "step": 3206 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.1033, + "step": 3208 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9817277835945057e-05, + "loss": 0.5423, + "step": 3210 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.4381, + "step": 3212 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9822558129431263e-05, + "loss": 0.929, + "step": 3214 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.1673, + "step": 3216 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.9827761688279606e-05, + "loss": 0.2194, + "step": 3218 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.983033467948784e-05, + "loss": 0.0304, + "step": 3220 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.983288847183947e-05, + "loss": 0.2982, + "step": 3222 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.2106, + "step": 3224 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9837938440059992e-05, + "loss": 0.4267, + "step": 3226 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.1768, + "step": 3228 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9842911553490392e-05, + "loss": 0.5597, + "step": 3230 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.1497, + "step": 3232 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.984780777328031e-05, + "loss": 0.2465, + "step": 3234 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.1608, + "step": 3236 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985262706118007e-05, + "loss": 0.2411, + "step": 3238 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.2069, + "step": 3240 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.9857369379540982e-05, + "loss": 0.3757, + "step": 3242 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.985971166354357e-05, + "loss": 0.3775, + "step": 3244 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.986203469131567e-05, + "loss": 0.5834, + "step": 3246 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.5152, + "step": 3248 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986662296005834e-05, + "loss": 0.5086, + "step": 3250 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986888819206792e-05, + "loss": 0.5967, + "step": 3252 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.987113414992505e-05, + "loss": 0.4372, + "step": 3254 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.2861, + "step": 3256 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9875568225674e-05, + "loss": 0.1581, + "step": 3258 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.1872, + "step": 3260 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9879925152665845e-05, + "loss": 0.5262, + "step": 3262 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.183, + "step": 3264 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.9884204896863895e-05, + "loss": 0.3277, + "step": 3266 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.988631581494365e-05, + "loss": 0.1612, + "step": 3268 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.9888407424834433e-05, + "loss": 0.4648, + "step": 3270 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.3959, + "step": 3272 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.989253270374697e-05, + "loss": 0.2024, + "step": 3274 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0771, + "step": 3276 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.9896580701374482e-05, + "loss": 0.4497, + "step": 3278 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.2759, + "step": 3280 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.9900551386093677e-05, + "loss": 0.1272, + "step": 3282 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.990250772639552e-05, + "loss": 0.3815, + "step": 3284 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9904444726885236e-05, + "loss": 0.4654, + "step": 3286 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.4346, + "step": 3288 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.990826069333406e-05, + "loss": 0.3647, + "step": 3290 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.99101396518405e-05, + "loss": 0.4385, + "step": 3292 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.99119992556295e-05, + "loss": 0.352, + "step": 3294 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.3491, + "step": 3296 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.99156603845656e-05, + "loss": 0.3054, + "step": 3298 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.2773, + "step": 3300 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9919244051541315e-05, + "loss": 0.2163, + "step": 3302 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.264, + "step": 3304 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9922750228560746e-05, + "loss": 0.2482, + "step": 3306 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.696, + "step": 3308 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9926178888233344e-05, + "loss": 0.1611, + "step": 3310 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.5024, + "step": 3312 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9929530003774133e-05, + "loss": 0.3944, + "step": 3314 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.2715, + "step": 3316 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.993280354900393e-05, + "loss": 0.4165, + "step": 3318 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.4235, + "step": 3320 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9935999498349518e-05, + "loss": 0.1793, + "step": 3322 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.1954, + "step": 3324 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9939117826843883e-05, + "loss": 0.1881, + "step": 3326 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.3484, + "step": 3328 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9942158510126384e-05, + "loss": 0.165, + "step": 3330 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9943649727366335e-05, + "loss": 0.2826, + "step": 3332 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.9945121524442944e-05, + "loss": 0.2468, + "step": 3334 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.1852, + "step": 3336 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.9948006846646262e-05, + "loss": 0.6371, + "step": 3338 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.994942036613787e-05, + "loss": 0.1481, + "step": 3340 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9950814454195953e-05, + "loss": 0.1688, + "step": 3342 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.6612, + "step": 3344 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.9953544325158755e-05, + "loss": 0.7063, + "step": 3346 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.995488010273198e-05, + "loss": 0.0452, + "step": 3348 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9956196438208693e-05, + "loss": 0.5025, + "step": 3350 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.1038, + "step": 3352 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9958770772627236e-05, + "loss": 0.1697, + "step": 3354 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.1717, + "step": 3356 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.9961267308303473e-05, + "loss": 0.2192, + "step": 3358 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.1963, + "step": 3360 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9963686025734262e-05, + "loss": 0.2069, + "step": 3362 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9964866196679105e-05, + "loss": 0.2617, + "step": 3364 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9966026906024377e-05, + "loss": 0.3077, + "step": 3366 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.2767, + "step": 3368 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9968289930886675e-05, + "loss": 0.2471, + "step": 3370 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.0995, + "step": 3372 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997047508264221e-05, + "loss": 0.3756, + "step": 3374 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.4216, + "step": 3376 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.99725823442204e-05, + "loss": 0.5415, + "step": 3378 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.1883, + "step": 3380 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.9974611699159142e-05, + "loss": 0.1124, + "step": 3382 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0647, + "step": 3384 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9976563131604945e-05, + "loss": 0.1507, + "step": 3386 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.2612, + "step": 3388 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9978436626313065e-05, + "loss": 0.7532, + "step": 3390 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.3349, + "step": 3392 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.99802321686476e-05, + "loss": 0.5495, + "step": 3394 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.0274, + "step": 3396 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.9981949744581622e-05, + "loss": 0.542, + "step": 3398 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.0597, + "step": 3400 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.9983589340697288e-05, + "loss": 0.6511, + "step": 3402 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.998437989229673e-05, + "loss": 0.3231, + "step": 3404 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.998515094418594e-05, + "loss": 0.1761, + "step": 3406 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.239, + "step": 3408 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.99866345428482e-05, + "loss": 0.2069, + "step": 3410 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.998734708672375e-05, + "loss": 0.2792, + "step": 3412 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.998804012509407e-05, + "loss": 0.4495, + "step": 3414 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.5576, + "step": 3416 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9989367679943025e-05, + "loss": 0.3214, + "step": 3418 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.3858, + "step": 3420 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9990617197024103e-05, + "loss": 0.4545, + "step": 3422 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.2006, + "step": 3424 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.999178866657597e-05, + "loss": 0.1961, + "step": 3426 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.9992345130644747e-05, + "loss": 0.4886, + "step": 3428 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999288207944701e-05, + "loss": 0.1273, + "step": 3430 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.1237, + "step": 3432 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.999389742709538e-05, + "loss": 0.2531, + "step": 3434 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.1505, + "step": 3436 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9994834701589113e-05, + "loss": 0.5149, + "step": 3438 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.2244, + "step": 3440 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999569389560614e-05, + "loss": 0.4143, + "step": 3442 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999609421031453e-05, + "loss": 0.2074, + "step": 3444 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.9996475002434365e-05, + "loss": 0.1683, + "step": 3446 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.5138, + "step": 3448 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.999717801597172e-05, + "loss": 0.5701, + "step": 3450 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.423, + "step": 3452 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9997802930726195e-05, + "loss": 0.3856, + "step": 3454 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.2978, + "step": 3456 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998349741815916e-05, + "loss": 0.3094, + "step": 3458 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.1577, + "step": 3460 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.999881844496914e-05, + "loss": 0.211, + "step": 3462 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.2351, + "step": 3464 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999209036524326e-05, + "loss": 0.3274, + "step": 3466 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.3728, + "step": 3468 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999952151343014e-05, + "loss": 0.1854, + "step": 3470 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.4345, + "step": 3472 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999755873245484e-05, + "loss": 0.2237, + "step": 3474 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.1767, + "step": 3476 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999991211413952e-05, + "loss": 0.5157, + "step": 3478 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.3625, + "step": 3480 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 1.9999990234891677e-05, + "loss": 0.4346, + "step": 3482 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 2e-05, + "loss": 0.1571, + "step": 3484 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999999023489168e-05, + "loss": 0.1579, + "step": 3486 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.1955, + "step": 3488 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.999991211413952e-05, + "loss": 0.1647, + "step": 3490 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.0961, + "step": 3492 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.9999755873245484e-05, + "loss": 0.44, + "step": 3494 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.9857, + "step": 3496 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.999952151343014e-05, + "loss": 0.1852, + "step": 3498 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.3013, + "step": 3500 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.9999209036524326e-05, + "loss": 0.6867, + "step": 3502 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.2531, + "step": 3504 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.999881844496914e-05, + "loss": 0.2156, + "step": 3506 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.2191, + "step": 3508 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998349741815916e-05, + "loss": 0.282, + "step": 3510 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.3974, + "step": 3512 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997802930726195e-05, + "loss": 0.4265, + "step": 3514 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.1428, + "step": 3516 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999717801597172e-05, + "loss": 0.3773, + "step": 3518 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.1995, + "step": 3520 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.9996475002434365e-05, + "loss": 0.4602, + "step": 3522 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.999609421031453e-05, + "loss": 0.2064, + "step": 3524 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999569389560614e-05, + "loss": 0.1755, + "step": 3526 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.5047, + "step": 3528 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994834701589113e-05, + "loss": 0.3721, + "step": 3530 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.1685, + "step": 3532 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.9993897427095378e-05, + "loss": 0.1963, + "step": 3534 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.3616, + "step": 3536 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999288207944701e-05, + "loss": 0.4263, + "step": 3538 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999234513064475e-05, + "loss": 0.1666, + "step": 3540 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.999178866657597e-05, + "loss": 0.5756, + "step": 3542 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.1601, + "step": 3544 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990617197024103e-05, + "loss": 0.176, + "step": 3546 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.1501, + "step": 3548 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.998936767994303e-05, + "loss": 0.1513, + "step": 3550 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0448, + "step": 3552 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998804012509407e-05, + "loss": 0.2052, + "step": 3554 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998734708672375e-05, + "loss": 0.1983, + "step": 3556 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.99866345428482e-05, + "loss": 0.1946, + "step": 3558 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.1585, + "step": 3560 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.998515094418594e-05, + "loss": 0.278, + "step": 3562 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.0863, + "step": 3564 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9983589340697288e-05, + "loss": 0.5755, + "step": 3566 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.0929, + "step": 3568 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981949744581622e-05, + "loss": 0.1784, + "step": 3570 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.0969, + "step": 3572 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.99802321686476e-05, + "loss": 0.229, + "step": 3574 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.4519, + "step": 3576 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9978436626313068e-05, + "loss": 0.9369, + "step": 3578 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.0366, + "step": 3580 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997656313160495e-05, + "loss": 0.059, + "step": 3582 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0156, + "step": 3584 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9974611699159142e-05, + "loss": 0.4294, + "step": 3586 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.233, + "step": 3588 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.99725823442204e-05, + "loss": 0.168, + "step": 3590 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.0714, + "step": 3592 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9970475082642212e-05, + "loss": 0.1761, + "step": 3594 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.198, + "step": 3596 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9968289930886675e-05, + "loss": 0.0076, + "step": 3598 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.0173, + "step": 3600 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.9966026906024377e-05, + "loss": 0.2869, + "step": 3602 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.996486619667911e-05, + "loss": 0.5903, + "step": 3604 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9963686025734262e-05, + "loss": 0.1797, + "step": 3606 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.044, + "step": 3608 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9961267308303473e-05, + "loss": 0.6855, + "step": 3610 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.6294, + "step": 3612 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9958770772627236e-05, + "loss": 0.3414, + "step": 3614 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.2379, + "step": 3616 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.9956196438208693e-05, + "loss": 0.455, + "step": 3618 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.995488010273198e-05, + "loss": 0.2024, + "step": 3620 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9953544325158755e-05, + "loss": 0.1667, + "step": 3622 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.471, + "step": 3624 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9950814454195953e-05, + "loss": 0.3317, + "step": 3626 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.3127, + "step": 3628 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.9948006846646262e-05, + "loss": 0.168, + "step": 3630 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.7609, + "step": 3632 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.9945121524442947e-05, + "loss": 0.4163, + "step": 3634 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.994364972736634e-05, + "loss": 0.3349, + "step": 3636 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9942158510126384e-05, + "loss": 0.5226, + "step": 3638 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.2286, + "step": 3640 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.9939117826843887e-05, + "loss": 0.5597, + "step": 3642 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.993756836673986e-05, + "loss": 0.2156, + "step": 3644 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.9935999498349525e-05, + "loss": 0.3924, + "step": 3646 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.5538, + "step": 3648 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9932803549003932e-05, + "loss": 0.2051, + "step": 3650 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.3971, + "step": 3652 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9929530003774136e-05, + "loss": 0.1989, + "step": 3654 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.2877, + "step": 3656 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9926178888233344e-05, + "loss": 0.3857, + "step": 3658 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.4158, + "step": 3660 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9922750228560746e-05, + "loss": 0.3614, + "step": 3662 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.2104, + "step": 3664 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9919244051541315e-05, + "loss": 0.3535, + "step": 3666 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.1195, + "step": 3668 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9915660384565603e-05, + "loss": 0.3613, + "step": 3670 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.166, + "step": 3672 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9911999255629504e-05, + "loss": 0.1989, + "step": 3674 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.2196, + "step": 3676 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.990826069333406e-05, + "loss": 0.4604, + "step": 3678 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.1984, + "step": 3680 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9904444726885236e-05, + "loss": 0.1669, + "step": 3682 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9902507726395524e-05, + "loss": 0.082, + "step": 3684 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.9900551386093677e-05, + "loss": 0.2931, + "step": 3686 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.2164, + "step": 3688 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9896580701374482e-05, + "loss": 0.1678, + "step": 3690 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0201, + "step": 3692 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9892532703746977e-05, + "loss": 0.3052, + "step": 3694 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.3427, + "step": 3696 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.9888407424834437e-05, + "loss": 0.4775, + "step": 3698 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.988631581494365e-05, + "loss": 0.4975, + "step": 3700 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9884204896863895e-05, + "loss": 0.4598, + "step": 3702 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.3838, + "step": 3704 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.9879925152665845e-05, + "loss": 0.1849, + "step": 3706 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.987775633490599e-05, + "loss": 0.2017, + "step": 3708 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.9875568225674005e-05, + "loss": 0.1008, + "step": 3710 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.0474, + "step": 3712 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.987113414992505e-05, + "loss": 0.1762, + "step": 3714 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.986888819206792e-05, + "loss": 0.432, + "step": 3716 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986662296005834e-05, + "loss": 0.465, + "step": 3718 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.0209, + "step": 3720 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9862034691315678e-05, + "loss": 0.2399, + "step": 3722 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.1987, + "step": 3724 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9857369379540985e-05, + "loss": 0.5046, + "step": 3726 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.2266, + "step": 3728 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.985262706118007e-05, + "loss": 0.2417, + "step": 3730 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.1269, + "step": 3732 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9847807773280314e-05, + "loss": 0.1664, + "step": 3734 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.1948, + "step": 3736 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9842911553490396e-05, + "loss": 0.3393, + "step": 3738 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9840434606066186e-05, + "loss": 1.2607, + "step": 3740 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.983793844005999e-05, + "loss": 0.0891, + "step": 3742 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.0716, + "step": 3744 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.9832888471839475e-05, + "loss": 0.3133, + "step": 3746 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.983033467948784e-05, + "loss": 0.3731, + "step": 3748 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9827761688279613e-05, + "loss": 0.2168, + "step": 3750 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.3082, + "step": 3752 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9822558129431263e-05, + "loss": 0.5453, + "step": 3754 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.2924, + "step": 3756 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.981727783594506e-05, + "loss": 0.1903, + "step": 3758 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.0273, + "step": 3760 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.9811920849071092e-05, + "loss": 0.2325, + "step": 3762 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.980921360866819e-05, + "loss": 0.4156, + "step": 3764 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.980648721065859e-05, + "loss": 0.3152, + "step": 3766 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.1907, + "step": 3768 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.980097696315558e-05, + "loss": 0.3841, + "step": 3770 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.0873, + "step": 3772 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979539014960858e-05, + "loss": 0.4258, + "step": 3774 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.1078, + "step": 3776 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.9789726813662233e-05, + "loss": 0.3728, + "step": 3778 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.978686646359173e-05, + "loss": 0.3641, + "step": 3780 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9783986999558994e-05, + "loss": 0.5143, + "step": 3782 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.4478, + "step": 3784 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9778170752138763e-05, + "loss": 0.1869, + "step": 3786 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.064, + "step": 3788 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.9772278116838546e-05, + "loss": 0.1504, + "step": 3790 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.0338, + "step": 3792 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.97663091396921e-05, + "loss": 0.4835, + "step": 3794 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.9763296037475177e-05, + "loss": 1.1043, + "step": 3796 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.976026386732957e-05, + "loss": 0.3634, + "step": 3798 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.4011, + "step": 3800 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9754142346977122e-05, + "loss": 0.1789, + "step": 3802 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.0058, + "step": 3804 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9747944626456577e-05, + "loss": 0.6204, + "step": 3806 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.2192, + "step": 3808 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9741670754185054e-05, + "loss": 0.1905, + "step": 3810 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9738505276435695e-05, + "loss": 0.7769, + "step": 3812 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9735320779174548e-05, + "loss": 0.571, + "step": 3814 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.5202, + "step": 3816 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9728894751031595e-05, + "loss": 0.5054, + "step": 3818 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.2107, + "step": 3820 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.972239271995686e-05, + "loss": 0.2162, + "step": 3822 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.6044, + "step": 3824 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9715814736744758e-05, + "loss": 0.2619, + "step": 3826 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.1986, + "step": 3828 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.9709160852783022e-05, + "loss": 0.229, + "step": 3830 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.4985, + "step": 3832 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.9702431120052352e-05, + "loss": 0.1767, + "step": 3834 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.969903782680467e-05, + "loss": 0.1224, + "step": 3836 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9695625591125984e-05, + "loss": 0.4602, + "step": 3838 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.3979, + "step": 3840 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.968874431916926e-05, + "loss": 0.3618, + "step": 3842 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.96852752963305e-05, + "loss": 0.1771, + "step": 3844 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9681787357939257e-05, + "loss": 0.2169, + "step": 3846 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.6681, + "step": 3848 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9674754761784334e-05, + "loss": 0.1674, + "step": 3850 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.3857, + "step": 3852 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.9667646585643706e-05, + "loss": 0.4017, + "step": 3854 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.298, + "step": 3856 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.966046288504704e-05, + "loss": 0.2642, + "step": 3858 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.1519, + "step": 3860 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.965320371611399e-05, + "loss": 0.1718, + "step": 3862 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.393, + "step": 3864 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.964586913555381e-05, + "loss": 0.23, + "step": 3866 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.1498, + "step": 3868 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9638459200664822e-05, + "loss": 0.1593, + "step": 3870 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.2157, + "step": 3872 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9630973969334068e-05, + "loss": 0.3213, + "step": 3874 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9627203135753576e-05, + "loss": 0.0939, + "step": 3876 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9623413500036795e-05, + "loss": 0.3834, + "step": 3878 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.253, + "step": 3880 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.9615777851836007e-05, + "loss": 0.224, + "step": 3882 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.961193185426459e-05, + "loss": 0.3822, + "step": 3884 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9608067084382025e-05, + "loss": 0.2108, + "step": 3886 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.1212, + "step": 3888 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.9600281257912002e-05, + "loss": 0.2463, + "step": 3890 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.959636021653044e-05, + "loss": 0.198, + "step": 3892 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.9592420433249465e-05, + "loss": 0.3772, + "step": 3894 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.1502, + "step": 3896 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958448467180382e-05, + "loss": 0.3887, + "step": 3898 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958048870913786e-05, + "loss": 1.0372, + "step": 3900 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9576474035569895e-05, + "loss": 0.2238, + "step": 3902 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.0693, + "step": 3904 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9568388587127448e-05, + "loss": 0.5662, + "step": 3906 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9564317828044022e-05, + "loss": 0.333, + "step": 3908 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.9560228389640668e-05, + "loss": 0.4041, + "step": 3910 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.2359, + "step": 3912 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.955199350685769e-05, + "loss": 0.4231, + "step": 3914 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.1687, + "step": 3916 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.954368400311011e-05, + "loss": 0.1581, + "step": 3918 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.3495, + "step": 3920 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9535299943312455e-05, + "loss": 0.3632, + "step": 3922 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9531079975339915e-05, + "loss": 0.2938, + "step": 3924 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9526841392961694e-05, + "loss": 0.104, + "step": 3926 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.0706, + "step": 3928 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9518308418136728e-05, + "loss": 0.2953, + "step": 3930 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.2781, + "step": 3932 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9509701085497852e-05, + "loss": 0.3613, + "step": 3934 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.8307, + "step": 3936 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9501019462286266e-05, + "loss": 0.2325, + "step": 3938 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.3829, + "step": 3940 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9492263616323536e-05, + "loss": 0.2862, + "step": 3942 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.1871, + "step": 3944 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.948343361601105e-05, + "loss": 0.2966, + "step": 3946 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.947899082950751e-05, + "loss": 0.3186, + "step": 3948 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947452953032951e-05, + "loss": 0.2294, + "step": 3950 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.335, + "step": 3952 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9465551428838363e-05, + "loss": 0.1959, + "step": 3954 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.534, + "step": 3956 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.9456499381675285e-05, + "loss": 0.1668, + "step": 3958 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.1361, + "step": 3960 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9447373459555617e-05, + "loss": 0.2156, + "step": 3962 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.1359, + "step": 3964 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9438173733771814e-05, + "loss": 0.168, + "step": 3966 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.596, + "step": 3968 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.9428900276192903e-05, + "loss": 0.2781, + "step": 3970 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.942423592059687e-05, + "loss": 0.7508, + "step": 3972 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.94195531592639e-05, + "loss": 0.2635, + "step": 3974 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.0822, + "step": 3976 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9410132456005262e-05, + "loss": 0.2761, + "step": 3978 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.5745, + "step": 3980 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.94006382400123e-05, + "loss": 0.3888, + "step": 3982 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.1216, + "step": 3984 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.939107058545461e-05, + "loss": 0.1513, + "step": 3986 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.938625924204888e-05, + "loss": 0.0319, + "step": 3988 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.9381429567075507e-05, + "loss": 0.503, + "step": 3990 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.4072, + "step": 3992 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9371715260191425e-05, + "loss": 0.112, + "step": 3994 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.1082, + "step": 3996 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.936192774069133e-05, + "loss": 0.3999, + "step": 3998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.1527, + "step": 4000 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9352067085036145e-05, + "loss": 0.3613, + "step": 4002 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9347109355200676e-05, + "loss": 0.1511, + "step": 4004 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.9342133370258124e-05, + "loss": 0.2964, + "step": 4006 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.0088, + "step": 4008 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9332126673960276e-05, + "loss": 0.4076, + "step": 4010 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.2023, + "step": 4012 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.932204707431572e-05, + "loss": 0.3831, + "step": 4014 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.3714, + "step": 4016 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9311894650067146e-05, + "loss": 0.6496, + "step": 4018 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9306791153479017e-05, + "loss": 0.6367, + "step": 4020 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9301669480526118e-05, + "loss": 0.8116, + "step": 4022 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.7964, + "step": 4024 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.929137164557252e-05, + "loss": 0.1502, + "step": 4026 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.928619550368371e-05, + "loss": 0.2104, + "step": 4028 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9281001225653883e-05, + "loss": 0.7397, + "step": 4030 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.5979, + "step": 4032 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9270558301784808e-05, + "loss": 0.3664, + "step": 4034 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9265309676340783e-05, + "loss": 0.1687, + "step": 4036 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9260042955546247e-05, + "loss": 0.5936, + "step": 4038 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.1326, + "step": 4040 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9249455269084972e-05, + "loss": 0.3855, + "step": 4042 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.229, + "step": 4044 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.2195, + "step": 4046 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.4748, + "step": 4048 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9228063206906302e-05, + "loss": 0.2376, + "step": 4050 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9222670108643156e-05, + "loss": 0.2422, + "step": 4052 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9217258998305464e-05, + "loss": 0.5496, + "step": 4054 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.1845, + "step": 4056 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9206382783713735e-05, + "loss": 0.1581, + "step": 4058 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.3557, + "step": 4060 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.9195434648097013e-05, + "loss": 0.2271, + "step": 4062 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.2764, + "step": 4064 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9184414676983013e-05, + "loss": 0.1428, + "step": 4066 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9178877779995416e-05, + "loss": 0.2323, + "step": 4068 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.9173322956460678e-05, + "loss": 0.4734, + "step": 4070 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.6129, + "step": 4072 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9162159573179446e-05, + "loss": 0.2513, + "step": 4074 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.0878, + "step": 4076 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.915092461434859e-05, + "loss": 0.1764, + "step": 4078 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.1586, + "step": 4080 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9139618167736547e-05, + "loss": 0.3045, + "step": 4082 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9133938164092942e-05, + "loss": 0.1982, + "step": 4084 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.912824032167022e-05, + "loss": 0.2082, + "step": 4086 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.2981, + "step": 4088 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.911679116503426e-05, + "loss": 0.2617, + "step": 4090 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.2895, + "step": 4092 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9105270787270446e-05, + "loss": 0.5754, + "step": 4094 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.3637, + "step": 4096 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.9093679278376913e-05, + "loss": 0.2712, + "step": 4098 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.90878568780329e-05, + "loss": 0.3055, + "step": 4100 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.90820167289075e-05, + "loss": 0.1704, + "step": 4102 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.1078, + "step": 4104 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9070283229971003e-05, + "loss": 0.1762, + "step": 4106 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.2959, + "step": 4108 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.9058478873230487e-05, + "loss": 0.5579, + "step": 4110 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.1564, + "step": 4112 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9046603750902585e-05, + "loss": 0.2448, + "step": 4114 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9040639681612216e-05, + "loss": 0.1979, + "step": 4116 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.9034657955756702e-05, + "loss": 0.13, + "step": 4118 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.1185, + "step": 4120 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9022641581114396e-05, + "loss": 0.3634, + "step": 4122 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.2813, + "step": 4124 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.901055472084858e-05, + "loss": 0.2416, + "step": 4126 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.2049, + "step": 4128 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8998397469382812e-05, + "loss": 0.2322, + "step": 4130 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8992292476607695e-05, + "loss": 0.1342, + "step": 4132 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.898616992169054e-05, + "loss": 0.2779, + "step": 4134 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.1142, + "step": 4136 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.89738721732944e-05, + "loss": 0.1825, + "step": 4138 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.1075, + "step": 4140 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8961504320265392e-05, + "loss": 0.6788, + "step": 4142 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.0285, + "step": 4144 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8949066459222224e-05, + "loss": 0.1704, + "step": 4146 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8942821306038227e-05, + "loss": 0.0975, + "step": 4148 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.8936558687330492e-05, + "loss": 0.3483, + "step": 4150 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.0197, + "step": 4152 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.8923981102301944e-05, + "loss": 0.5074, + "step": 4154 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.891766616054545e-05, + "loss": 0.017, + "step": 4156 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8911333802393725e-05, + "loss": 0.3443, + "step": 4158 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 1.0473, + "step": 4160 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8898616886407588e-05, + "loss": 0.0773, + "step": 4162 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8892232353409582e-05, + "loss": 0.0464, + "step": 4164 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8885830453689146e-05, + "loss": 0.176, + "step": 4166 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.0313, + "step": 4168 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8872974604127038e-05, + "loss": 0.3711, + "step": 4170 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.6589, + "step": 4172 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.8860049438152247e-05, + "loss": 0.2154, + "step": 4174 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.2633, + "step": 4176 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.8847055056737236e-05, + "loss": 0.5028, + "step": 4178 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.884053194194143e-05, + "loss": 0.3358, + "step": 4180 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.8833991561395194e-05, + "loss": 0.2323, + "step": 4182 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.5659, + "step": 4184 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.8820859054179225e-05, + "loss": 0.0664, + "step": 4186 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.881426695315756e-05, + "loss": 0.0416, + "step": 4188 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8807657637681577e-05, + "loss": 0.2099, + "step": 4190 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.1769, + "step": 4192 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8794387415032783e-05, + "loss": 0.2049, + "step": 4194 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8787726533777003e-05, + "loss": 0.5707, + "step": 4196 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.8781048489900936e-05, + "loss": 0.3429, + "step": 4198 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.5196, + "step": 4200 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.876764096649082e-05, + "loss": 0.2776, + "step": 4202 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.1203, + "step": 4204 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8754164949543123e-05, + "loss": 0.4077, + "step": 4206 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.629, + "step": 4208 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8740620544333604e-05, + "loss": 0.0998, + "step": 4210 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8733822729175455e-05, + "loss": 0.0205, + "step": 4212 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.872700785667228e-05, + "loss": 0.1761, + "step": 4214 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.4468, + "step": 4216 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8713326992902612e-05, + "loss": 0.2239, + "step": 4218 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.1588, + "step": 4220 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8699578059900604e-05, + "loss": 0.8129, + "step": 4222 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.0581, + "step": 4224 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.868576116507408e-05, + "loss": 0.2173, + "step": 4226 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.0585, + "step": 4228 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8671876416361767e-05, + "loss": 0.2385, + "step": 4230 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.6322, + "step": 4232 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.8657923922232467e-05, + "loss": 0.3056, + "step": 4234 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.86509223046777e-05, + "loss": 0.3098, + "step": 4236 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8643903791684228e-05, + "loss": 0.8131, + "step": 4238 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.4263, + "step": 4240 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8629816134243466e-05, + "loss": 0.5489, + "step": 4242 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8622747017309676e-05, + "loss": 0.4834, + "step": 4244 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8615661059964148e-05, + "loss": 0.3243, + "step": 4246 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.1349, + "step": 4248 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.860143867942685e-05, + "loss": 0.4736, + "step": 4250 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.1684, + "step": 4252 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8587149103738006e-05, + "loss": 0.2282, + "step": 4254 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.187, + "step": 4256 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8572792444528963e-05, + "loss": 0.3356, + "step": 4258 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8565588993632498e-05, + "loss": 0.1199, + "step": 4260 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8558368813955136e-05, + "loss": 0.4171, + "step": 4262 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.1853, + "step": 4264 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.854387832469512e-05, + "loss": 0.2303, + "step": 4266 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.3686, + "step": 4268 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.8529321089949833e-05, + "loss": 0.7221, + "step": 4270 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.2138, + "step": 4272 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8514697223441565e-05, + "loss": 0.2426, + "step": 4274 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8507360338956896e-05, + "loss": 0.6411, + "step": 4276 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.850000683941319e-05, + "loss": 0.291, + "step": 4278 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.0517, + "step": 4280 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.8485250052627205e-05, + "loss": 0.217, + "step": 4282 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.847784679420527e-05, + "loss": 0.1541, + "step": 4284 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.8470426978364857e-05, + "loss": 0.1623, + "step": 4286 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.3731, + "step": 4288 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.845553773242522e-05, + "loss": 0.2323, + "step": 4290 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.8448068331405018e-05, + "loss": 0.2715, + "step": 4292 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8440582431124322e-05, + "loss": 0.6596, + "step": 4294 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.2975, + "step": 4296 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.842556119129423e-05, + "loss": 0.1853, + "step": 4298 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.841802588108161e-05, + "loss": 0.0966, + "step": 4300 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.841047413028209e-05, + "loss": 0.159, + "step": 4302 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.0753, + "step": 4304 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.8395321365949273e-05, + "loss": 0.5291, + "step": 4306 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.838772038200968e-05, + "loss": 0.1877, + "step": 4308 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.838010301667044e-05, + "loss": 0.1699, + "step": 4310 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.0826, + "step": 4312 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8364819201332596e-05, + "loss": 0.2274, + "step": 4314 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.3852, + "step": 4316 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.834947003933417e-05, + "loss": 0.532, + "step": 4318 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.073, + "step": 4320 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8334055650584107e-05, + "loss": 0.2157, + "step": 4322 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8326324033788087e-05, + "loss": 0.0201, + "step": 4324 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.8318576155500855e-05, + "loss": 0.2419, + "step": 4326 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.0429, + "step": 4328 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.830303167501152e-05, + "loss": 0.4884, + "step": 4330 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.0053, + "step": 4332 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8287422330550885e-05, + "loss": 0.6323, + "step": 4334 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.0937, + "step": 4336 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.827174824406043e-05, + "loss": 0.3153, + "step": 4338 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.8263886960799072e-05, + "loss": 0.0579, + "step": 4340 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8256009537987424e-05, + "loss": 0.2198, + "step": 4342 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.3042, + "step": 4344 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8240206335283943e-05, + "loss": 0.207, + "step": 4346 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.3416, + "step": 4348 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8224338759405934e-05, + "loss": 0.9246, + "step": 4350 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.0775, + "step": 4352 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820840693431217e-05, + "loss": 0.2935, + "step": 4354 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820041696718378e-05, + "loss": 0.6678, + "step": 4356 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8192410984463416e-05, + "loss": 0.1903, + "step": 4358 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.6652, + "step": 4360 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8176351034821352e-05, + "loss": 0.2199, + "step": 4362 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.1135, + "step": 4364 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.8160227210847642e-05, + "loss": 0.2087, + "step": 4366 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.4475, + "step": 4368 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8144039638502927e-05, + "loss": 0.1587, + "step": 4370 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8135921986190358e-05, + "loss": 0.0433, + "step": 4372 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8127788444245884e-05, + "loss": 0.6798, + "step": 4374 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.6588, + "step": 4376 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8111473755032152e-05, + "loss": 0.4657, + "step": 4378 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.3322, + "step": 4380 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.8095095698313456e-05, + "loss": 0.211, + "step": 4382 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.189, + "step": 4384 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807865440203653e-05, + "loss": 0.8527, + "step": 4386 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807041007918221e-05, + "loss": 0.5408, + "step": 4388 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.806214999464214e-05, + "loss": 0.1502, + "step": 4390 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.5762, + "step": 4392 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8045582605064087e-05, + "loss": 0.2893, + "step": 4394 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.376, + "step": 4396 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802895236272819e-05, + "loss": 0.3249, + "step": 4398 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.3931, + "step": 4400 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.80122593975513e-05, + "loss": 0.2533, + "step": 4402 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.8003889434630476e-05, + "loss": 0.2072, + "step": 4404 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7995503839940204e-05, + "loss": 0.1587, + "step": 4406 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.3521, + "step": 4408 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7978685820790725e-05, + "loss": 0.3734, + "step": 4410 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.2753, + "step": 4412 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.796180547148662e-05, + "loss": 0.1587, + "step": 4414 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.4716, + "step": 4416 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7944862923898586e-05, + "loss": 0.3863, + "step": 4418 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7936368367090583e-05, + "loss": 0.1854, + "step": 4420 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7927858310383196e-05, + "loss": 0.3268, + "step": 4422 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.1508, + "step": 4424 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7910791763781928e-05, + "loss": 0.1719, + "step": 4426 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.1763, + "step": 4428 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.789366341742001e-05, + "loss": 0.1335, + "step": 4430 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.0577, + "step": 4432 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7876473405105535e-05, + "loss": 0.3524, + "step": 4434 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7867855316162846e-05, + "loss": 0.6029, + "step": 4436 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.785922186112829e-05, + "loss": 0.3894, + "step": 4438 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.3637, + "step": 4440 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.7841908920258774e-05, + "loss": 0.176, + "step": 4442 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.783322946823638e-05, + "loss": 0.2878, + "step": 4444 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.782453471774711e-05, + "loss": 0.1513, + "step": 4446 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.4897, + "step": 4448 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.7807099389322013e-05, + "loss": 0.3776, + "step": 4450 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.779835884543776e-05, + "loss": 0.2187, + "step": 4452 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7789603071189733e-05, + "loss": 0.2069, + "step": 4454 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.1773, + "step": 4456 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.7772045900032912e-05, + "loss": 0.1587, + "step": 4458 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.776324453741365e-05, + "loss": 0.438, + "step": 4460 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.7754428013009644e-05, + "loss": 0.4262, + "step": 4462 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.122, + "step": 4464 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7736749547752327e-05, + "loss": 0.2068, + "step": 4466 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7727887641425465e-05, + "loss": 0.5509, + "step": 4468 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7719010642366597e-05, + "loss": 0.1901, + "step": 4470 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.418, + "step": 4472 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.770121143543025e-05, + "loss": 0.4342, + "step": 4474 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.4755, + "step": 4476 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7683352065992174e-05, + "loss": 0.3034, + "step": 4478 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.2379, + "step": 4480 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.7665432673571238e-05, + "loss": 0.2379, + "step": 4482 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.765645051247007e-05, + "loss": 0.1511, + "step": 4484 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7647453398155204e-05, + "loss": 0.2849, + "step": 4486 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.0123, + "step": 4488 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.7629414380199672e-05, + "loss": 0.1866, + "step": 4490 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.762037251178961e-05, + "loss": 0.4598, + "step": 4492 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7611315760626943e-05, + "loss": 0.1496, + "step": 4494 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.0817, + "step": 4496 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7593157680824943e-05, + "loss": 0.4548, + "step": 4498 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7584056387648738e-05, + "loss": 0.1285, + "step": 4500 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.757494028264608e-05, + "loss": 0.2961, + "step": 4502 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.1537, + "step": 4504 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7556663708406203e-05, + "loss": 0.2138, + "step": 4506 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.13, + "step": 4508 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7538328100883404e-05, + "loss": 0.4411, + "step": 4510 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.1029, + "step": 4512 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7519933603316962e-05, + "loss": 0.4227, + "step": 4514 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7510714315655467e-05, + "loss": 0.6255, + "step": 4516 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.750148035940622e-05, + "loss": 0.1786, + "step": 4518 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.426, + "step": 4520 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7482968513309458e-05, + "loss": 0.5727, + "step": 4522 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7473690659616e-05, + "loss": 0.0447, + "step": 4524 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7464398209642744e-05, + "loss": 0.333, + "step": 4526 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.5703, + "step": 4528 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.7445769593478842e-05, + "loss": 0.2161, + "step": 4530 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.743643346367027e-05, + "loss": 0.2706, + "step": 4532 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.7427082810346024e-05, + "loss": 0.1665, + "step": 4534 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.2637, + "step": 4536 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.740833800622701e-05, + "loss": 0.3771, + "step": 4538 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.739894389204122e-05, + "loss": 0.0067, + "step": 4540 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.738953532755774e-05, + "loss": 0.3853, + "step": 4542 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.2051, + "step": 4544 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7370674921226306e-05, + "loss": 0.1955, + "step": 4546 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7361223116213146e-05, + "loss": 0.2711, + "step": 4548 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7351756934571764e-05, + "loss": 0.2293, + "step": 4550 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.1236, + "step": 4552 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.7332781515382996e-05, + "loss": 0.51, + "step": 4554 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.732327231489503e-05, + "loss": 0.7844, + "step": 4556 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7313748811897564e-05, + "loss": 0.2334, + "step": 4558 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.2295, + "step": 4560 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.7294658972800495e-05, + "loss": 0.1496, + "step": 4562 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.728509267398376e-05, + "loss": 0.0667, + "step": 4564 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.727551214722322e-05, + "loss": 0.3487, + "step": 4566 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.0767, + "step": 4568 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.72563084847423e-05, + "loss": 0.3192, + "step": 4570 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.3517, + "step": 4572 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.723704813537835e-05, + "loss": 0.2294, + "step": 4574 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.0792, + "step": 4576 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7217731249594817e-05, + "loss": 0.6158, + "step": 4578 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7208051652686348e-05, + "loss": 0.1384, + "step": 4580 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.7198357978296827e-05, + "loss": 0.4153, + "step": 4582 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.0014, + "step": 4584 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.717892847282995e-05, + "loss": 0.2328, + "step": 4586 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.716919267969884e-05, + "loss": 0.3754, + "step": 4588 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.715944288497911e-05, + "loss": 0.1695, + "step": 4590 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.2707, + "step": 4592 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.713990136696734e-05, + "loss": 0.3555, + "step": 4594 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.7130109681840298e-05, + "loss": 0.4731, + "step": 4596 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.7120304071454578e-05, + "loss": 0.0322, + "step": 4598 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.0551, + "step": 4600 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7100651151536532e-05, + "loss": 0.8615, + "step": 4602 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.1954, + "step": 4604 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.708094276074344e-05, + "loss": 0.2265, + "step": 4606 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.1821, + "step": 4608 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.7061179053038894e-05, + "loss": 0.5494, + "step": 4610 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.705127650357663e-05, + "loss": 0.1867, + "step": 4612 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.704136018281859e-05, + "loss": 0.1756, + "step": 4614 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.6573, + "step": 4616 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.7021486304909202e-05, + "loss": 0.1342, + "step": 4618 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.701152878657197e-05, + "loss": 0.1762, + "step": 4620 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.700155757456711e-05, + "loss": 0.1346, + "step": 4622 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.2233, + "step": 4624 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.6981574147477214e-05, + "loss": 0.1991, + "step": 4626 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.697156197142023e-05, + "loss": 0.0719, + "step": 4628 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.696153617975168e-05, + "loss": 0.2535, + "step": 4630 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.0063, + "step": 4632 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.694144382792878e-05, + "loss": 0.2162, + "step": 4634 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.1016, + "step": 4636 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6921297248971652e-05, + "loss": 0.3514, + "step": 4638 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.1953, + "step": 4640 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.690109660026701e-05, + "loss": 0.3386, + "step": 4642 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.6890976049058267e-05, + "loss": 0.0428, + "step": 4644 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.688084203962401e-05, + "loss": 0.0296, + "step": 4646 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.1098, + "step": 4648 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6860533725272953e-05, + "loss": 0.4078, + "step": 4650 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.3788, + "step": 4652 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.6840171815864085e-05, + "loss": 0.4026, + "step": 4654 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.0298, + "step": 4656 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.681975647046631e-05, + "loss": 0.3147, + "step": 4658 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.6809528809094805e-05, + "loss": 0.3, + "step": 4660 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6799287848566024e-05, + "loss": 0.6517, + "step": 4662 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.238, + "step": 4664 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.6778766110065765e-05, + "loss": 0.1606, + "step": 4666 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.67684853721737e-05, + "loss": 0.4368, + "step": 4668 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6758191415283066e-05, + "loss": 0.3007, + "step": 4670 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.0737, + "step": 4672 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.673756392494916e-05, + "loss": 0.5567, + "step": 4674 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.6727230431791826e-05, + "loss": 0.1951, + "step": 4676 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.671688380020769e-05, + "loss": 0.202, + "step": 4678 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.2464, + "step": 4680 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6696151202613537e-05, + "loss": 0.6446, + "step": 4682 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.2138, + "step": 4684 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6675366294131432e-05, + "loss": 0.0839, + "step": 4686 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.0096, + "step": 4688 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.6654529237134833e-05, + "loss": 0.1432, + "step": 4690 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.66440912037967e-05, + "loss": 0.0993, + "step": 4692 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.663364019440453e-05, + "loss": 0.5957, + "step": 4694 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.1596, + "step": 4696 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6612699329127467e-05, + "loss": 0.4715, + "step": 4698 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.7418, + "step": 4700 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6591706804895415e-05, + "loss": 0.2233, + "step": 4702 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.1497, + "step": 4704 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6570662785703716e-05, + "loss": 0.1512, + "step": 4706 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6560121516856592e-05, + "loss": 0.1666, + "step": 4708 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.654956743595001e-05, + "loss": 0.1528, + "step": 4710 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.228, + "step": 4712 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.6528420920432893e-05, + "loss": 0.3298, + "step": 4714 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.651782852712194e-05, + "loss": 0.0243, + "step": 4716 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6507223404350686e-05, + "loss": 0.635, + "step": 4718 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.1585, + "step": 4720 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.648597505330016e-05, + "loss": 0.4467, + "step": 4722 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.6475331866519387e-05, + "loss": 0.0793, + "step": 4724 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6464676033275187e-05, + "loss": 0.2946, + "step": 4726 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.0438, + "step": 4728 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.644332651066548e-05, + "loss": 0.5859, + "step": 4730 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.2021, + "step": 4732 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6421926652255275e-05, + "loss": 0.1627, + "step": 4734 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.3356, + "step": 4736 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6400476625222057e-05, + "loss": 0.3043, + "step": 4738 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6389732850821964e-05, + "loss": 0.256, + "step": 4740 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6378976597135193e-05, + "loss": 0.3328, + "step": 4742 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.0391, + "step": 4744 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.635742673595468e-05, + "loss": 0.67, + "step": 4746 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.378, + "step": 4748 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6335827210029823e-05, + "loss": 0.089, + "step": 4750 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.0366, + "step": 4752 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6314178188097917e-05, + "loss": 0.4012, + "step": 4754 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6303335168965495e-05, + "loss": 0.1774, + "step": 4756 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.6292479839282904e-05, + "loss": 0.2536, + "step": 4758 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.2885, + "step": 4760 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.627073233309409e-05, + "loss": 0.2917, + "step": 4762 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.625984019906122e-05, + "loss": 0.0923, + "step": 4764 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.624893583942482e-05, + "loss": 0.3349, + "step": 4766 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.0433, + "step": 4768 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6227090528551058e-05, + "loss": 0.1419, + "step": 4770 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6216149619978057e-05, + "loss": 0.1378, + "step": 4772 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6205196571130204e-05, + "loss": 0.2879, + "step": 4774 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.1541, + "step": 4776 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.618325413819967e-05, + "loss": 0.411, + "step": 4778 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.0572, + "step": 4780 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6161263401175555e-05, + "loss": 0.5453, + "step": 4782 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.1949, + "step": 4784 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.613922453185133e-05, + "loss": 0.7139, + "step": 4786 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.612818710136499e-05, + "loss": 0.4795, + "step": 4788 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6117137702396454e-05, + "loss": 0.5209, + "step": 4790 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.4714, + "step": 4792 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6095003085355103e-05, + "loss": 0.3584, + "step": 4794 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.3463, + "step": 4796 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6072820853644688e-05, + "loss": 0.7911, + "step": 4798 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.2156, + "step": 4800 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.6050591180554658e-05, + "loss": 0.183, + "step": 4802 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.60394586077466e-05, + "loss": 0.1345, + "step": 4804 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6028314239745068e-05, + "loss": 0.1276, + "step": 4806 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.2641, + "step": 4808 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.6005990205245226e-05, + "loss": 0.4003, + "step": 4810 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.0812, + "step": 4812 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5983619251452334e-05, + "loss": 0.1702, + "step": 4814 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.1665, + "step": 4816 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.596120155313017e-05, + "loss": 0.2824, + "step": 4818 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.594997522948413e-05, + "loss": 0.0134, + "step": 4820 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.593873728540759e-05, + "loss": 0.1138, + "step": 4822 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.1343, + "step": 4824 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5916226623777346e-05, + "loss": 0.2533, + "step": 4826 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.1226, + "step": 4828 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.5893669744094587e-05, + "loss": 0.1512, + "step": 4830 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.2555, + "step": 4832 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5871066822575526e-05, + "loss": 0.2378, + "step": 4834 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5859748151293354e-05, + "loss": 0.7478, + "step": 4836 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5848418035796064e-05, + "loss": 0.152, + "step": 4838 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.0552, + "step": 4840 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.5825723560690396e-05, + "loss": 0.4747, + "step": 4842 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.581435924540482e-05, + "loss": 0.2445, + "step": 4844 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.580298357454967e-05, + "loss": 0.3226, + "step": 4846 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.0665, + "step": 4848 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5780198255020485e-05, + "loss": 0.2761, + "step": 4850 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5768788650846674e-05, + "loss": 0.007, + "step": 4852 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.5757367780103672e-05, + "loss": 0.1608, + "step": 4854 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.1789, + "step": 4856 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5734492328152796e-05, + "loss": 0.0919, + "step": 4858 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.0056, + "step": 4860 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5711572077872784e-05, + "loss": 0.1635, + "step": 4862 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.0626, + "step": 4864 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.568860720831852e-05, + "loss": 0.5397, + "step": 4866 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.5677108097363565e-05, + "loss": 0.0692, + "step": 4868 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5665597898893508e-05, + "loss": 0.1807, + "step": 4870 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.029, + "step": 4872 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5642544329348316e-05, + "loss": 0.3771, + "step": 4874 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.1662, + "step": 4876 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.5619446679779367e-05, + "loss": 0.5357, + "step": 4878 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.5486, + "step": 4880 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5596305130627414e-05, + "loss": 0.2382, + "step": 4882 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5584717950189373e-05, + "loss": 0.1727, + "step": 4884 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5573119862676155e-05, + "loss": 0.1686, + "step": 4886 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.0837, + "step": 4888 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.554989105705083e-05, + "loss": 0.1311, + "step": 4890 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.0826, + "step": 4892 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5526618895216786e-05, + "loss": 0.1105, + "step": 4894 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.0088, + "step": 4896 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5503303558978112e-05, + "loss": 0.5397, + "step": 4898 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.6078, + "step": 4900 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.547994523047609e-05, + "loss": 0.1693, + "step": 4902 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.1275, + "step": 4904 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.545654409218794e-05, + "loss": 0.6113, + "step": 4906 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.544482752648966e-05, + "loss": 0.2083, + "step": 4908 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5433100326925298e-05, + "loss": 0.0843, + "step": 4910 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.0569, + "step": 4912 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.5409614117832797e-05, + "loss": 0.4227, + "step": 4914 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.539785515417377e-05, + "loss": 0.0934, + "step": 4916 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.538608564838665e-05, + "loss": 0.153, + "step": 4918 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.0904, + "step": 4920 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5362515102393244e-05, + "loss": 0.1268, + "step": 4922 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.5589, + "step": 4924 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.5338902663987564e-05, + "loss": 0.1444, + "step": 4926 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.1909, + "step": 4928 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.531524851763198e-05, + "loss": 0.1268, + "step": 4930 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.5303405861706567e-05, + "loss": 0.1504, + "step": 4932 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.529155284811464e-05, + "loss": 0.1542, + "step": 4934 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.0221, + "step": 4936 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5267815840548067e-05, + "loss": 0.4188, + "step": 4938 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.3513, + "step": 4940 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5244037680367739e-05, + "loss": 0.1664, + "step": 4942 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.1242, + "step": 4944 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.522021855333061e-05, + "loss": 0.1204, + "step": 4946 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.5208293685377362e-05, + "loss": 0.2119, + "step": 4948 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.519635864551371e-05, + "loss": 0.1162, + "step": 4950 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.0457, + "step": 4952 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5172458143312548e-05, + "loss": 0.2387, + "step": 4954 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.034, + "step": 4956 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5148517233439858e-05, + "loss": 0.2563, + "step": 4958 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.2303, + "step": 4960 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.512453610292402e-05, + "loss": 0.3777, + "step": 4962 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.5112530513457251e-05, + "loss": 0.0183, + "step": 4964 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.5100514939107598e-05, + "loss": 0.1849, + "step": 4966 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.0986, + "step": 4968 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5076453929645927e-05, + "loss": 0.697, + "step": 4970 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.0108, + "step": 4972 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.505235326250563e-05, + "loss": 0.4589, + "step": 4974 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.0098, + "step": 4976 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5028213125963054e-05, + "loss": 0.3555, + "step": 4978 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5016128315586636e-05, + "loss": 0.2235, + "step": 4980 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.5004033708602977e-05, + "loss": 0.1274, + "step": 4982 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.0136, + "step": 4984 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4979815199317011e-05, + "loss": 0.9098, + "step": 4986 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.1939, + "step": 4988 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.495555778730216e-05, + "loss": 0.6106, + "step": 4990 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.017, + "step": 4992 + }, + { + "epoch": 1.0, + "learning_rate": 1.4931261662059333e-05, + "loss": 0.1969, + "step": 4994 + }, + { + "epoch": 1.0, + "learning_rate": 1.4919099141279214e-05, + "loss": 0.7851, + "step": 4996 + }, + { + "epoch": 1.0, + "step": 4996, + "total_flos": 2.927145290039296e+16, + "train_loss": 0.27889094776088064, + "train_runtime": 8503.2779, + "train_samples_per_second": 2.35, + "train_steps_per_second": 0.588 + } + ], + "logging_steps": 2, + "max_steps": 4996, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 2.927145290039296e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..4929dcb1326f376d0a13570f2ce34ce154591bef --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1429b8638f8f26d39670c2c450d26e04ee5e6aacd66d17bcbeb21b4bbed2dd21 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2a86b1642ac9562118e2b50ee561acd5e307e618 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f433e41c0774412e2d54624d75e19252ea86c1417a4ab460e9092764d81bb2c +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1117fbe79c6733118e13c14e174db95b3f81b67 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c858add8c9c9bc9679ef6ea731c6e25961adf98717bd17aba1b4a143012839b +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..e4db193800c1d3a9c89a75437e6ba7485f96c043 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_infoBatch_scenario12_new_10000_random0_25_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a3b7595208c3e5400a1cf7e2264a15968c0770649c7ef2df76f2045cda7c2e8 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7af0442430f59d3aebc7b99f4c80fe8249a2924c --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,3776 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1249, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.0115, + "step": 2 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.0613, + "step": 4 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.0314, + "step": 6 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.1098, + "step": 8 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.0442, + "step": 10 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.0075, + "step": 12 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.019, + "step": 14 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.0177, + "step": 16 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.0717, + "step": 18 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.0304, + "step": 20 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.0935, + "step": 22 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.0096, + "step": 24 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.0836, + "step": 26 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.0077, + "step": 28 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.0155, + "step": 30 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.0711, + "step": 32 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.1398, + "step": 34 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.0613, + "step": 36 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.4299, + "step": 38 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.3248, + "step": 40 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.251, + "step": 42 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.0688, + "step": 44 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.0092, + "step": 46 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.4842, + "step": 48 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.2113, + "step": 50 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.0297, + "step": 52 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.0452, + "step": 54 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.5119, + "step": 56 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.0449, + "step": 58 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.0173, + "step": 60 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.0322, + "step": 62 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.0341, + "step": 64 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.0166, + "step": 66 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.0369, + "step": 68 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.0651, + "step": 70 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.0364, + "step": 72 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.0152, + "step": 74 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.0282, + "step": 76 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.4835, + "step": 78 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.2517, + "step": 80 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.0416, + "step": 82 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.0257, + "step": 84 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.0276, + "step": 86 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.0505, + "step": 88 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.0186, + "step": 90 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.0364, + "step": 92 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.0523, + "step": 94 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.0166, + "step": 96 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.211, + "step": 98 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.075, + "step": 100 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.0279, + "step": 102 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.0037, + "step": 104 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.0193, + "step": 106 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.0071, + "step": 108 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.1025, + "step": 110 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.0123, + "step": 112 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.0521, + "step": 114 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.0082, + "step": 116 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.0071, + "step": 118 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.0047, + "step": 120 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.0162, + "step": 122 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.0991, + "step": 124 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.0075, + "step": 126 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.2876, + "step": 128 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.0301, + "step": 130 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.0061, + "step": 132 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.0076, + "step": 134 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.0591, + "step": 136 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.1107, + "step": 138 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.0711, + "step": 140 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.5889, + "step": 142 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.1377, + "step": 144 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.0098, + "step": 146 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.0374, + "step": 148 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.0817, + "step": 150 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.0105, + "step": 152 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.0128, + "step": 154 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.0087, + "step": 156 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.0367, + "step": 158 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.2613, + "step": 160 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.1632, + "step": 162 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.0613, + "step": 164 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.044, + "step": 166 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.0222, + "step": 168 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.0747, + "step": 170 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.066, + "step": 172 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.0046, + "step": 174 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.0141, + "step": 176 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.01, + "step": 178 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.0232, + "step": 180 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.3321, + "step": 182 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.0035, + "step": 184 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.0711, + "step": 186 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 1.048, + "step": 188 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.0059, + "step": 190 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.0337, + "step": 192 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.0089, + "step": 194 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.1451, + "step": 196 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.2645, + "step": 198 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.0101, + "step": 200 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.0022, + "step": 202 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.0912, + "step": 204 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.2989, + "step": 206 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.0103, + "step": 208 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.2877, + "step": 210 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.0287, + "step": 212 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.2467, + "step": 214 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.0859, + "step": 216 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.0267, + "step": 218 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.0165, + "step": 220 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.0217, + "step": 222 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.098, + "step": 224 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.0082, + "step": 226 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.3836, + "step": 228 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.0131, + "step": 230 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.0069, + "step": 232 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.0101, + "step": 234 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.0022, + "step": 236 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.1303, + "step": 238 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.0004, + "step": 240 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.0076, + "step": 242 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.0099, + "step": 244 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.3315, + "step": 246 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.0084, + "step": 248 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.0105, + "step": 250 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.3407, + "step": 252 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.5892, + "step": 254 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.0045, + "step": 256 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.4009, + "step": 258 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.0144, + "step": 260 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.1179, + "step": 262 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.0654, + "step": 264 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.0525, + "step": 266 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.0884, + "step": 268 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.0164, + "step": 270 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.0241, + "step": 272 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.1003, + "step": 274 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.0034, + "step": 276 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.2084, + "step": 278 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.0057, + "step": 280 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.4558, + "step": 282 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.013, + "step": 284 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.0066, + "step": 286 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.2102, + "step": 288 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.0801, + "step": 290 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.0085, + "step": 292 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.0431, + "step": 294 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.0225, + "step": 296 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.0309, + "step": 298 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.0128, + "step": 300 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.0213, + "step": 302 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.2513, + "step": 304 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.0577, + "step": 306 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.0175, + "step": 308 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.0767, + "step": 310 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.0132, + "step": 312 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.1218, + "step": 314 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.0037, + "step": 316 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.0073, + "step": 318 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.011, + "step": 320 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.004, + "step": 322 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.0018, + "step": 324 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.0055, + "step": 326 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.0488, + "step": 328 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.0015, + "step": 330 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.0329, + "step": 332 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.0018, + "step": 334 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.0977, + "step": 336 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.0729, + "step": 338 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.0824, + "step": 340 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.2659, + "step": 342 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.9178, + "step": 344 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.0049, + "step": 346 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.05, + "step": 348 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.038, + "step": 350 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.026, + "step": 352 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.0229, + "step": 354 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.0376, + "step": 356 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.0176, + "step": 358 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 1.1606, + "step": 360 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.0118, + "step": 362 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.1219, + "step": 364 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.0283, + "step": 366 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.0223, + "step": 368 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.4123, + "step": 370 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.0068, + "step": 372 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.0158, + "step": 374 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.0095, + "step": 376 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.0251, + "step": 378 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.0583, + "step": 380 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.0195, + "step": 382 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.0327, + "step": 384 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.3051, + "step": 386 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.2871, + "step": 388 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.5664, + "step": 390 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.2616, + "step": 392 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.0139, + "step": 394 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.0156, + "step": 396 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.3355, + "step": 398 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.1616, + "step": 400 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.0382, + "step": 402 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.0462, + "step": 404 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.3156, + "step": 406 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.0206, + "step": 408 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.2361, + "step": 410 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.2106, + "step": 412 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.0083, + "step": 414 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.0327, + "step": 416 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.0155, + "step": 418 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.2529, + "step": 420 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.0028, + "step": 422 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.0766, + "step": 424 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.012, + "step": 426 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.3979, + "step": 428 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.0081, + "step": 430 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.1765, + "step": 432 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.4849, + "step": 434 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.0628, + "step": 436 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.0067, + "step": 438 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.0149, + "step": 440 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.2761, + "step": 442 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.0693, + "step": 444 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.0683, + "step": 446 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.0024, + "step": 448 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.0881, + "step": 450 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.134, + "step": 452 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.3643, + "step": 454 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.1769, + "step": 456 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.3219, + "step": 458 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.3227, + "step": 460 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.7582, + "step": 462 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.1347, + "step": 464 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.4261, + "step": 466 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.0435, + "step": 468 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.5976, + "step": 470 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.1774, + "step": 472 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.3105, + "step": 474 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.1354, + "step": 476 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.1534, + "step": 478 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.1461, + "step": 480 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.0107, + "step": 482 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.1937, + "step": 484 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.1782, + "step": 486 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.175, + "step": 488 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.1427, + "step": 490 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.0768, + "step": 492 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.0369, + "step": 494 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.005, + "step": 496 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.0398, + "step": 498 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.1466, + "step": 500 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.2476, + "step": 502 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.0099, + "step": 504 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.025, + "step": 506 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.0748, + "step": 508 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.8313, + "step": 510 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.7859, + "step": 512 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.0223, + "step": 514 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.254, + "step": 516 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.2607, + "step": 518 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.0586, + "step": 520 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.0156, + "step": 522 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.7957, + "step": 524 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.623, + "step": 526 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.1845, + "step": 528 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.2489, + "step": 530 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.0799, + "step": 532 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.0038, + "step": 534 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.6771, + "step": 536 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.0516, + "step": 538 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.1406, + "step": 540 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.083, + "step": 542 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.0496, + "step": 544 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.0667, + "step": 546 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.1212, + "step": 548 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.0511, + "step": 550 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.511, + "step": 552 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.0295, + "step": 554 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.5725, + "step": 556 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.1666, + "step": 558 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.0335, + "step": 560 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.2936, + "step": 562 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.0672, + "step": 564 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.0445, + "step": 566 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.0843, + "step": 568 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.4623, + "step": 570 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.2791, + "step": 572 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.4381, + "step": 574 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.0804, + "step": 576 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.009, + "step": 578 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.0544, + "step": 580 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.248, + "step": 582 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.4181, + "step": 584 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.2582, + "step": 586 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.1539, + "step": 588 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.0001, + "step": 590 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.15, + "step": 592 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.114, + "step": 594 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.2967, + "step": 596 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.0317, + "step": 598 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.0016, + "step": 600 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.1812, + "step": 602 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.011, + "step": 604 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.1, + "step": 606 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.1107, + "step": 608 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.0264, + "step": 610 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.0127, + "step": 612 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.0264, + "step": 614 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.0274, + "step": 616 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.2283, + "step": 618 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.0032, + "step": 620 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.2706, + "step": 622 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 1.2877, + "step": 624 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.0047, + "step": 626 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.1743, + "step": 628 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.0114, + "step": 630 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.0297, + "step": 632 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.1904, + "step": 634 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.3071, + "step": 636 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 1.8352, + "step": 638 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.1905, + "step": 640 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.829, + "step": 642 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.1595, + "step": 644 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.6371, + "step": 646 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.0154, + "step": 648 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.2388, + "step": 650 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.2164, + "step": 652 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.1124, + "step": 654 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.2451, + "step": 656 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.3718, + "step": 658 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.135, + "step": 660 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.11, + "step": 662 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.0877, + "step": 664 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.0271, + "step": 666 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.0719, + "step": 668 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.3165, + "step": 670 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.0657, + "step": 672 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.6174, + "step": 674 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.5329, + "step": 676 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.0865, + "step": 678 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.3533, + "step": 680 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.2774, + "step": 682 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.024, + "step": 684 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.3237, + "step": 686 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.4843, + "step": 688 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.5326, + "step": 690 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.0711, + "step": 692 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.1451, + "step": 694 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.2235, + "step": 696 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.0714, + "step": 698 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.3484, + "step": 700 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.0289, + "step": 702 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.2299, + "step": 704 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.1907, + "step": 706 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.0711, + "step": 708 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.9791, + "step": 710 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.1713, + "step": 712 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.0567, + "step": 714 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.1118, + "step": 716 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.4813, + "step": 718 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.0189, + "step": 720 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.0205, + "step": 722 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.0192, + "step": 724 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.1019, + "step": 726 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.3529, + "step": 728 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.1614, + "step": 730 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.1906, + "step": 732 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.0572, + "step": 734 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.1369, + "step": 736 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.3382, + "step": 738 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.0321, + "step": 740 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.189, + "step": 742 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.14, + "step": 744 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.4193, + "step": 746 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.1203, + "step": 748 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.0388, + "step": 750 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 1.3002, + "step": 752 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.1144, + "step": 754 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.2157, + "step": 756 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.109, + "step": 758 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 1.3681, + "step": 760 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.3985, + "step": 762 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.1771, + "step": 764 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.0865, + "step": 766 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.1155, + "step": 768 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.2388, + "step": 770 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.4045, + "step": 772 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.2719, + "step": 774 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.0497, + "step": 776 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.1746, + "step": 778 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.1996, + "step": 780 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.0518, + "step": 782 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.3466, + "step": 784 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.2456, + "step": 786 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.4983, + "step": 788 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.1538, + "step": 790 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.1137, + "step": 792 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.2823, + "step": 794 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.0788, + "step": 796 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.1359, + "step": 798 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.1467, + "step": 800 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.232, + "step": 802 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.3375, + "step": 804 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.0389, + "step": 806 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.5332, + "step": 808 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.0348, + "step": 810 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.2578, + "step": 812 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.0191, + "step": 814 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.2958, + "step": 816 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.498, + "step": 818 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.0116, + "step": 820 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.0858, + "step": 822 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.46, + "step": 824 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.0252, + "step": 826 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.0383, + "step": 828 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.2299, + "step": 830 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.0138, + "step": 832 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.2264, + "step": 834 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.0547, + "step": 836 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.5327, + "step": 838 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.1396, + "step": 840 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.1442, + "step": 842 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.2086, + "step": 844 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.9181, + "step": 846 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.2542, + "step": 848 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.8229, + "step": 850 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.0208, + "step": 852 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.2371, + "step": 854 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.1165, + "step": 856 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.1002, + "step": 858 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.0635, + "step": 860 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.3659, + "step": 862 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.1194, + "step": 864 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.1655, + "step": 866 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.2137, + "step": 868 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.7598, + "step": 870 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.0014, + "step": 872 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.0453, + "step": 874 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.0608, + "step": 876 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.0868, + "step": 878 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.1059, + "step": 880 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.1273, + "step": 882 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.2293, + "step": 884 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.4514, + "step": 886 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.1176, + "step": 888 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.1113, + "step": 890 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.0194, + "step": 892 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.0459, + "step": 894 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.1905, + "step": 896 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.4555, + "step": 898 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.1606, + "step": 900 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.4599, + "step": 902 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.5361, + "step": 904 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.1093, + "step": 906 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.1691, + "step": 908 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.0625, + "step": 910 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.0687, + "step": 912 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.0299, + "step": 914 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.0043, + "step": 916 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.4055, + "step": 918 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.2288, + "step": 920 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.0064, + "step": 922 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.0055, + "step": 924 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.2182, + "step": 926 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.2219, + "step": 928 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.2671, + "step": 930 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.0311, + "step": 932 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.0492, + "step": 934 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.0139, + "step": 936 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.0928, + "step": 938 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.1338, + "step": 940 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.1548, + "step": 942 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.0043, + "step": 944 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.0023, + "step": 946 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.0889, + "step": 948 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.1767, + "step": 950 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.2258, + "step": 952 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.0001, + "step": 954 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.3736, + "step": 956 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.0531, + "step": 958 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.0288, + "step": 960 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.4798, + "step": 962 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.0031, + "step": 964 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.3823, + "step": 966 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.0076, + "step": 968 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.873, + "step": 970 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.6369, + "step": 972 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.0925, + "step": 974 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.3141, + "step": 976 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.384, + "step": 978 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.0391, + "step": 980 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.0014, + "step": 982 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.0192, + "step": 984 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.0366, + "step": 986 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.0459, + "step": 988 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.0674, + "step": 990 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.0146, + "step": 992 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.1375, + "step": 994 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.068, + "step": 996 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.099, + "step": 998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.0079, + "step": 1000 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.0543, + "step": 1002 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 1.0646, + "step": 1004 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.2528, + "step": 1006 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.476, + "step": 1008 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.2421, + "step": 1010 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.2072, + "step": 1012 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.0423, + "step": 1014 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.237, + "step": 1016 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.0965, + "step": 1018 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.0664, + "step": 1020 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.0601, + "step": 1022 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.8941, + "step": 1024 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.1116, + "step": 1026 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.3439, + "step": 1028 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.2088, + "step": 1030 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.1967, + "step": 1032 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.2113, + "step": 1034 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.0119, + "step": 1036 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.1752, + "step": 1038 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.1147, + "step": 1040 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.037, + "step": 1042 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.0085, + "step": 1044 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.0267, + "step": 1046 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.6913, + "step": 1048 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.0912, + "step": 1050 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.02, + "step": 1052 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.3667, + "step": 1054 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.3677, + "step": 1056 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.0142, + "step": 1058 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.1325, + "step": 1060 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.1181, + "step": 1062 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.3335, + "step": 1064 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.1456, + "step": 1066 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.0368, + "step": 1068 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.1909, + "step": 1070 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.1086, + "step": 1072 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.041, + "step": 1074 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.0564, + "step": 1076 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.0264, + "step": 1078 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.1879, + "step": 1080 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.1047, + "step": 1082 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.1068, + "step": 1084 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.319, + "step": 1086 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.0001, + "step": 1088 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.2298, + "step": 1090 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.048, + "step": 1092 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.0002, + "step": 1094 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.0071, + "step": 1096 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.033, + "step": 1098 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.0605, + "step": 1100 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.0408, + "step": 1102 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.1925, + "step": 1104 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.0207, + "step": 1106 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.1387, + "step": 1108 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.0176, + "step": 1110 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.0212, + "step": 1112 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.0108, + "step": 1114 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.005, + "step": 1116 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.0017, + "step": 1118 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.0897, + "step": 1120 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.5343, + "step": 1122 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.2319, + "step": 1124 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.0114, + "step": 1126 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.1284, + "step": 1128 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.5426, + "step": 1130 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.0196, + "step": 1132 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.0165, + "step": 1134 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 1.0646, + "step": 1136 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.0055, + "step": 1138 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.5121, + "step": 1140 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.1019, + "step": 1142 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.0352, + "step": 1144 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 1.2906, + "step": 1146 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.1183, + "step": 1148 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.5906, + "step": 1150 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.0863, + "step": 1152 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.0848, + "step": 1154 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.1044, + "step": 1156 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.1233, + "step": 1158 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.2436, + "step": 1160 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.2321, + "step": 1162 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.2943, + "step": 1164 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.1561, + "step": 1166 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.2669, + "step": 1168 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.0965, + "step": 1170 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.0963, + "step": 1172 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.1669, + "step": 1174 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.1955, + "step": 1176 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.1589, + "step": 1178 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.2771, + "step": 1180 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.0502, + "step": 1182 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.2301, + "step": 1184 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.1753, + "step": 1186 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.0301, + "step": 1188 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.1463, + "step": 1190 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.0355, + "step": 1192 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.0165, + "step": 1194 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.0333, + "step": 1196 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.2193, + "step": 1198 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.1044, + "step": 1200 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.0319, + "step": 1202 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.03, + "step": 1204 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.0717, + "step": 1206 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.0346, + "step": 1208 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.9439, + "step": 1210 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.159, + "step": 1212 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.0223, + "step": 1214 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.01, + "step": 1216 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.0783, + "step": 1218 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.2691, + "step": 1220 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.1013, + "step": 1222 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.0088, + "step": 1224 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.8085, + "step": 1226 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.0106, + "step": 1228 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.036, + "step": 1230 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.0525, + "step": 1232 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.0354, + "step": 1234 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.0953, + "step": 1236 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.0238, + "step": 1238 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.0918, + "step": 1240 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.0118, + "step": 1242 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.0056, + "step": 1244 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.006, + "step": 1246 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.0297, + "step": 1248 + }, + { + "epoch": 1.0, + "step": 1249, + "total_flos": 7643157994930176.0, + "train_loss": 0.16662734626208595, + "train_runtime": 6819.6813, + "train_samples_per_second": 2.93, + "train_steps_per_second": 0.183 + } + ], + "logging_steps": 2, + "max_steps": 1249, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 7643157994930176.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..1aef4832055f3013c27058b1dd9dee1b2f37bdc1 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ced5520b21b14f16b6dcbabc9f4080e0798e3d1eee912b6b647d58c8c3e3ecb6 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..c6c5d754556a355da6c67d27a62814ebece1acd8 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16dcf1ce62c6bd34f581a9319fbd9e8e654746f4c386413eec4319bc3e6afa1d +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..70ba552d48962752f438f8403de880c0e70a9f9e --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8611975ec35e50509d43cd93aa101a582e6de5d7faca70f209ba00266d661d4c +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..844fc1190de9cd0dcfa64fd5716a71e02cd4bfce --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ada9a6f76796e3d2e0236714938ab276dd2b1948ff0b51c9203ac27caf5cb48a +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..74cf8f0e1ca0952ffd4f45a076782fd9fc92b322 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/0_trainer_state.json @@ -0,0 +1,7526 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2498, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.415943612351265e-06, + "loss": 0.0801, + "step": 2 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.0636, + "step": 4 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.0109, + "step": 6 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.0714, + "step": 8 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4892856843445236e-06, + "loss": 0.0178, + "step": 10 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.0077, + "step": 12 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.0544, + "step": 14 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.093, + "step": 16 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5635665363297356e-06, + "loss": 0.0157, + "step": 18 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.007, + "step": 20 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.0183, + "step": 22 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.0513, + "step": 24 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.6387768837868565e-06, + "loss": 0.0127, + "step": 26 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.766, + "step": 28 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.0409, + "step": 30 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.0198, + "step": 32 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.0116, + "step": 34 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.5641, + "step": 36 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.0051, + "step": 38 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.0596, + "step": 40 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.7919483473136555e-06, + "loss": 0.0329, + "step": 42 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.0711, + "step": 44 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.0047, + "step": 46 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.0039, + "step": 48 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.8698903181597026e-06, + "loss": 0.0098, + "step": 50 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.0996, + "step": 52 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.909196119613218e-06, + "loss": 0.0009, + "step": 54 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.0125, + "step": 56 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9487234964233724e-06, + "loss": 0.0325, + "step": 58 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.0232, + "step": 60 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.988471213428035e-06, + "loss": 0.0029, + "step": 62 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.082, + "step": 64 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0284380285797733e-06, + "loss": 0.2249, + "step": 66 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.0633, + "step": 68 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.068622692984767e-06, + "loss": 0.0254, + "step": 70 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.1402, + "step": 72 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.6191, + "step": 74 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.0021, + "step": 76 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.5928, + "step": 78 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.5687, + "step": 80 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1904711909051967e-06, + "loss": 0.0452, + "step": 82 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.4084, + "step": 84 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.231514627826302e-06, + "loss": 0.009, + "step": 86 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.2046, + "step": 88 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2727695682081897e-06, + "loss": 0.0461, + "step": 90 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.0119, + "step": 92 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.314234722905302e-06, + "loss": 0.1393, + "step": 94 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.0595, + "step": 96 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.355908796203301e-06, + "loss": 0.1344, + "step": 98 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.3163, + "step": 100 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.053, + "step": 102 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.0187, + "step": 104 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4398784831434097e-06, + "loss": 0.024, + "step": 106 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.523, + "step": 108 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.0103, + "step": 110 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.1431, + "step": 112 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.0203, + "step": 114 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.0149, + "step": 116 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.567367137003953e-06, + "loss": 0.0503, + "step": 118 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.5323, + "step": 120 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.6102671491780393e-06, + "loss": 0.026, + "step": 122 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.0589, + "step": 124 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.653366829451711e-06, + "loss": 0.0157, + "step": 126 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.1824, + "step": 128 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.696664831034521e-06, + "loss": 0.0091, + "step": 130 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.0625, + "step": 132 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.740159800938784e-06, + "loss": 0.0989, + "step": 134 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.0136, + "step": 136 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.783850380021933e-06, + "loss": 0.0133, + "step": 138 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.0049, + "step": 140 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.827735203028956e-06, + "loss": 0.0162, + "step": 142 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.0516, + "step": 144 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.871812898635011e-06, + "loss": 0.2893, + "step": 146 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.0133, + "step": 148 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.916082089488379e-06, + "loss": 0.1542, + "step": 150 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.0238, + "step": 152 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.960541392253387e-06, + "loss": 0.0078, + "step": 154 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.0882, + "step": 156 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 4.005189417653737e-06, + "loss": 0.0507, + "step": 158 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.2705, + "step": 160 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.050024770515873e-06, + "loss": 0.153, + "step": 162 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.0157, + "step": 164 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.095046049812541e-06, + "loss": 0.0147, + "step": 166 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.0087, + "step": 168 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.1402518487066624e-06, + "loss": 0.0056, + "step": 170 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.0126, + "step": 172 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.1593, + "step": 174 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.0592, + "step": 176 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.2312113491533145e-06, + "loss": 0.0233, + "step": 178 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.0091, + "step": 180 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.276962208378814e-06, + "loss": 0.0825, + "step": 182 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.0135, + "step": 184 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3228919026364345e-06, + "loss": 0.3794, + "step": 186 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.0753, + "step": 188 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.368998996702686e-06, + "loss": 0.0168, + "step": 190 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.012, + "step": 192 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.415282049810643e-06, + "loss": 0.0039, + "step": 194 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.008, + "step": 196 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.461739615694921e-06, + "loss": 0.0182, + "step": 198 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.0389, + "step": 200 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.5083702426369715e-06, + "loss": 1.0716, + "step": 202 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.014, + "step": 204 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.555172473510324e-06, + "loss": 0.0264, + "step": 206 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.0367, + "step": 208 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.602144845826234e-06, + "loss": 0.0028, + "step": 210 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.0031, + "step": 212 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.649285891779326e-06, + "loss": 0.0039, + "step": 214 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.0153, + "step": 216 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.696594138293421e-06, + "loss": 0.0665, + "step": 218 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.0246, + "step": 220 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.744068107067673e-06, + "loss": 0.0043, + "step": 222 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.0031, + "step": 224 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.79170631462264e-06, + "loss": 0.1456, + "step": 226 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.2295, + "step": 228 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.839507272346751e-06, + "loss": 0.0266, + "step": 230 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.8542, + "step": 232 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.0158, + "step": 234 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.1082, + "step": 236 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.935591458474425e-06, + "loss": 0.6745, + "step": 238 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.0052, + "step": 240 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9838716844133665e-06, + "loss": 0.0408, + "step": 242 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.1533, + "step": 244 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.032308655686007e-06, + "loss": 0.0549, + "step": 246 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.5651, + "step": 248 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.080900858720789e-06, + "loss": 0.0364, + "step": 250 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.0167, + "step": 252 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.129646775095432e-06, + "loss": 0.1313, + "step": 254 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.0055, + "step": 256 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.178544881584328e-06, + "loss": 0.115, + "step": 258 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.0175, + "step": 260 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.0305, + "step": 262 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.016, + "step": 264 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.2767915482720164e-06, + "loss": 0.0629, + "step": 266 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.0793, + "step": 268 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.0657, + "step": 270 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.1417, + "step": 272 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.375628578726181e-06, + "loss": 0.0241, + "step": 274 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.0104, + "step": 276 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.425264622628326e-06, + "loss": 0.0458, + "step": 278 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.0316, + "step": 280 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.475043619098321e-06, + "loss": 0.5709, + "step": 282 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.1028, + "step": 284 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.524964012628644e-06, + "loss": 0.0452, + "step": 286 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.381, + "step": 288 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.57502424329331e-06, + "loss": 0.0211, + "step": 290 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.3771, + "step": 292 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.62522274679673e-06, + "loss": 0.0511, + "step": 294 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.0066, + "step": 296 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.675557954522462e-06, + "loss": 0.3528, + "step": 298 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.1767, + "step": 300 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.726028293582342e-06, + "loss": 0.0526, + "step": 302 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.01, + "step": 304 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.776632186865589e-06, + "loss": 0.0026, + "step": 306 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.2308, + "step": 308 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.827368053088032e-06, + "loss": 0.0724, + "step": 310 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.0546, + "step": 312 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.878234306841637e-06, + "loss": 0.1785, + "step": 314 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.1457, + "step": 316 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.929229358643925e-06, + "loss": 0.0273, + "step": 318 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.3551, + "step": 320 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9803516149877475e-06, + "loss": 0.1019, + "step": 322 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.1084, + "step": 324 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.03159947839103e-06, + "loss": 0.1593, + "step": 326 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.0407, + "step": 328 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.082971347446654e-06, + "loss": 0.0104, + "step": 330 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.0114, + "step": 332 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.13446561687258e-06, + "loss": 0.0222, + "step": 334 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.0107, + "step": 336 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.186080677561974e-06, + "loss": 0.085, + "step": 338 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.0965, + "step": 340 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.237814916633431e-06, + "loss": 0.0155, + "step": 342 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.7712, + "step": 344 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.289666717481496e-06, + "loss": 0.0059, + "step": 346 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.0032, + "step": 348 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.341634459827044e-06, + "loss": 0.209, + "step": 350 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.001, + "step": 352 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.393716519768032e-06, + "loss": 0.0077, + "step": 354 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.1633, + "step": 356 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.445911269830183e-06, + "loss": 0.0056, + "step": 358 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.0763, + "step": 360 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.498217079017806e-06, + "loss": 0.0595, + "step": 362 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.1156, + "step": 364 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.0515, + "step": 366 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.0587, + "step": 368 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.603155333485934e-06, + "loss": 0.1753, + "step": 370 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.0584, + "step": 372 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.655784499627476e-06, + "loss": 0.0484, + "step": 374 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.5351, + "step": 376 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.7085181667191e-06, + "loss": 0.0509, + "step": 378 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.0203, + "step": 380 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.761354686924883e-06, + "loss": 0.0055, + "step": 382 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.0507, + "step": 384 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.8142924091949955e-06, + "loss": 0.0457, + "step": 386 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.0008, + "step": 388 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.867329679317144e-06, + "loss": 0.111, + "step": 390 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.0012, + "step": 392 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.920464839968391e-06, + "loss": 0.057, + "step": 394 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.0036, + "step": 396 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.973696230766884e-06, + "loss": 0.0013, + "step": 398 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.122, + "step": 400 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.027022188323704e-06, + "loss": 0.0021, + "step": 402 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.0366, + "step": 404 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.080441046294945e-06, + "loss": 0.0183, + "step": 406 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.4039, + "step": 408 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.133951135433656e-06, + "loss": 0.0141, + "step": 410 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.071, + "step": 412 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.18755078364214e-06, + "loss": 0.0051, + "step": 414 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.0014, + "step": 416 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.241238316024064e-06, + "loss": 0.1901, + "step": 418 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.0003, + "step": 420 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.0016, + "step": 422 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.048, + "step": 424 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.348870320044395e-06, + "loss": 0.0357, + "step": 426 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.0307, + "step": 428 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.402811428368824e-06, + "loss": 0.3541, + "step": 430 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.0202, + "step": 432 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.0166, + "step": 434 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.0033, + "step": 436 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.510935429867233e-06, + "loss": 0.01, + "step": 438 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.0228, + "step": 440 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.5651149443531846e-06, + "loss": 0.244, + "step": 442 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.0895, + "step": 444 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.619370544785608e-06, + "loss": 0.0068, + "step": 446 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.0033, + "step": 448 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.67370053577085e-06, + "loss": 0.007, + "step": 450 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.0078, + "step": 452 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.728103219590684e-06, + "loss": 0.0256, + "step": 454 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.0128, + "step": 456 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.7825768962553e-06, + "loss": 0.1078, + "step": 458 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.0023, + "step": 460 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.83711986355656e-06, + "loss": 0.0016, + "step": 462 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.0728, + "step": 464 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.891730417121043e-06, + "loss": 0.0018, + "step": 466 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.0004, + "step": 468 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.946406850463435e-06, + "loss": 0.0081, + "step": 470 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.0021, + "step": 472 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 8.001147455039737e-06, + "loss": 0.0462, + "step": 474 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.0005, + "step": 476 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.055950520300756e-06, + "loss": 0.0013, + "step": 478 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.0003, + "step": 480 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.110814333745503e-06, + "loss": 0.4547, + "step": 482 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.0077, + "step": 484 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.165737180974676e-06, + "loss": 0.2209, + "step": 486 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.0026, + "step": 488 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.220717345744326e-06, + "loss": 0.19, + "step": 490 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.0007, + "step": 492 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.275753110019367e-06, + "loss": 0.029, + "step": 494 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.0084, + "step": 496 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.330842754027378e-06, + "loss": 0.0143, + "step": 498 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.002, + "step": 500 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.385984556312285e-06, + "loss": 0.3068, + "step": 502 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.1498, + "step": 504 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.441176793788106e-06, + "loss": 0.0003, + "step": 506 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.7949, + "step": 508 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.496417741792922e-06, + "loss": 0.0368, + "step": 510 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.0015, + "step": 512 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.551705674142616e-06, + "loss": 0.0042, + "step": 514 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.0018, + "step": 516 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.607038863184952e-06, + "loss": 0.0016, + "step": 518 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.0053, + "step": 520 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.662415579853495e-06, + "loss": 0.0019, + "step": 522 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.8234, + "step": 524 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.717834093721598e-06, + "loss": 0.0163, + "step": 526 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.0077, + "step": 528 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.773292673056572e-06, + "loss": 0.0571, + "step": 530 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.011, + "step": 532 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.828789584873757e-06, + "loss": 0.0056, + "step": 534 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.5161, + "step": 536 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.884323094990613e-06, + "loss": 0.0389, + "step": 538 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.235, + "step": 540 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.939891468081036e-06, + "loss": 0.0074, + "step": 542 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.0219, + "step": 544 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.995492967729449e-06, + "loss": 0.0257, + "step": 546 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.004, + "step": 548 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.051125856485175e-06, + "loss": 0.0074, + "step": 550 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.0726, + "step": 552 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.106788395916682e-06, + "loss": 0.0409, + "step": 554 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.0014, + "step": 556 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.162478846665854e-06, + "loss": 0.0871, + "step": 558 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.0046, + "step": 560 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.218195468502469e-06, + "loss": 0.2511, + "step": 562 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.0144, + "step": 564 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.273936520378426e-06, + "loss": 0.0059, + "step": 566 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.0734, + "step": 568 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.329700260482286e-06, + "loss": 0.1193, + "step": 570 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.0016, + "step": 572 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.38548494629364e-06, + "loss": 0.6529, + "step": 574 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.0062, + "step": 576 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.441288834637507e-06, + "loss": 0.0759, + "step": 578 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.3349, + "step": 580 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.497110181738935e-06, + "loss": 0.0217, + "step": 582 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.0152, + "step": 584 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.552947243277342e-06, + "loss": 0.1907, + "step": 586 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.0195, + "step": 588 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.608798274441153e-06, + "loss": 0.3192, + "step": 590 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.0097, + "step": 592 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.664661529982263e-06, + "loss": 0.0077, + "step": 594 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.0575, + "step": 596 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.720535264270526e-06, + "loss": 0.0041, + "step": 598 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.6202, + "step": 600 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.776417731348403e-06, + "loss": 0.2202, + "step": 602 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.0034, + "step": 604 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.832307184985473e-06, + "loss": 0.2163, + "step": 606 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.039, + "step": 608 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.888201878732946e-06, + "loss": 0.0361, + "step": 610 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.174, + "step": 612 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.944100065978354e-06, + "loss": 0.3124, + "step": 614 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.0976, + "step": 616 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.999999999999996e-06, + "loss": 0.0829, + "step": 618 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.0903, + "step": 620 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.0449, + "step": 622 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.0409, + "step": 624 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0111798121267047e-05, + "loss": 0.0182, + "step": 626 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.1285, + "step": 628 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.016769281501452e-05, + "loss": 0.0086, + "step": 630 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.1102, + "step": 632 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.022358226865159e-05, + "loss": 0.033, + "step": 634 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.0007, + "step": 636 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.733, + "step": 638 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.0816, + "step": 640 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.033533847001773e-05, + "loss": 0.0039, + "step": 642 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.0218, + "step": 644 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.039120172555884e-05, + "loss": 0.3085, + "step": 646 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.1332, + "step": 648 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0447052756722651e-05, + "loss": 0.3762, + "step": 650 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.0293, + "step": 652 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.0458, + "step": 654 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.0076, + "step": 656 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.0558711165362488e-05, + "loss": 0.0099, + "step": 658 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.0241, + "step": 660 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.1594, + "step": 662 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.2788, + "step": 664 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.0670299739517706e-05, + "loss": 0.0126, + "step": 666 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.0182, + "step": 668 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.0018, + "step": 670 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.3308, + "step": 672 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0781804531497525e-05, + "loss": 0.1904, + "step": 674 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.0374, + "step": 676 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.083752115333414e-05, + "loss": 0.0323, + "step": 678 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.3418, + "step": 680 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.0893211604083311e-05, + "loss": 0.4836, + "step": 682 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.2694, + "step": 684 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.5604, + "step": 686 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.0983, + "step": 688 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.1004507032270544e-05, + "loss": 0.1592, + "step": 690 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.0023, + "step": 692 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.149, + "step": 694 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.0161, + "step": 696 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.111567690500938e-05, + "loss": 0.0116, + "step": 698 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.0542, + "step": 700 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.0016, + "step": 702 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.0007, + "step": 704 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.122670732694342e-05, + "loss": 0.0162, + "step": 706 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.057, + "step": 708 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.0012, + "step": 710 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.0791, + "step": 712 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1337584420146496e-05, + "loss": 0.0477, + "step": 714 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.0105, + "step": 716 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.1392961136815041e-05, + "loss": 1.0666, + "step": 718 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.0089, + "step": 720 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1448294325857377e-05, + "loss": 0.2864, + "step": 722 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.041, + "step": 724 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.150358225820707e-05, + "loss": 0.0033, + "step": 726 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.0103, + "step": 728 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1558823206211887e-05, + "loss": 0.0007, + "step": 730 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.3845, + "step": 732 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.011, + "step": 734 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.0733, + "step": 736 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1669157245972616e-05, + "loss": 0.2447, + "step": 738 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.026, + "step": 740 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.0012, + "step": 742 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.0115, + "step": 744 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.1779282654255668e-05, + "loss": 0.0289, + "step": 746 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.0277, + "step": 748 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.0003, + "step": 750 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.0023, + "step": 752 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.188918566625449e-05, + "loss": 0.0097, + "step": 754 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.0408, + "step": 756 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.1744, + "step": 758 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.0218, + "step": 760 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1998852544960256e-05, + "loss": 0.058, + "step": 762 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.0017, + "step": 764 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.1017, + "step": 766 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.3067, + "step": 768 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.210826958287895e-05, + "loss": 0.0826, + "step": 770 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.6656, + "step": 772 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.0384, + "step": 774 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.064, + "step": 776 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.2217423103744692e-05, + "loss": 0.3208, + "step": 778 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.0568, + "step": 780 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.3526, + "step": 782 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.0351, + "step": 784 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2326299464229143e-05, + "loss": 0.0207, + "step": 786 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.3532, + "step": 788 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.498, + "step": 790 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.0005, + "step": 792 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2434885055646808e-05, + "loss": 0.041, + "step": 794 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.0457, + "step": 796 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.0324, + "step": 798 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.1037, + "step": 800 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2543166305656089e-05, + "loss": 0.0206, + "step": 802 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.1009, + "step": 804 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.259718857163117e-05, + "loss": 0.0801, + "step": 806 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.0913, + "step": 808 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2651129679955598e-05, + "loss": 0.2667, + "step": 810 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.0852, + "step": 812 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.0776, + "step": 814 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.1268, + "step": 816 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2758761683975929e-05, + "loss": 0.1433, + "step": 818 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.4018, + "step": 820 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.0984, + "step": 822 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.116, + "step": 824 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2866048864566336e-05, + "loss": 0.0015, + "step": 826 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.2462, + "step": 828 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.0347, + "step": 830 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.2647, + "step": 832 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2972977811676289e-05, + "loss": 0.0067, + "step": 834 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.0167, + "step": 836 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.2641, + "step": 838 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.2266, + "step": 840 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3079535160031601e-05, + "loss": 0.0007, + "step": 842 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.2083, + "step": 844 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.313267032068285e-05, + "loss": 0.1587, + "step": 846 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.0112, + "step": 848 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3185707590804997e-05, + "loss": 0.0541, + "step": 850 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.2232, + "step": 852 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.1321, + "step": 854 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.1404, + "step": 856 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3291481833280894e-05, + "loss": 0.0021, + "step": 858 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.1083, + "step": 860 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.265, + "step": 862 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.0018, + "step": 864 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3396844666514062e-05, + "loss": 0.1455, + "step": 866 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.0043, + "step": 868 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.344936768713513e-05, + "loss": 0.1776, + "step": 870 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.0068, + "step": 872 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.3501782920982189e-05, + "loss": 0.0104, + "step": 874 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.0027, + "step": 876 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.0455, + "step": 878 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.0677, + "step": 880 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3606283480231962e-05, + "loss": 0.0043, + "step": 882 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.1621, + "step": 884 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.0121, + "step": 886 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.0267, + "step": 888 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3710333282518497e-05, + "loss": 0.0043, + "step": 890 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.0549, + "step": 892 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.0054, + "step": 894 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.0181, + "step": 896 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3813919322438018e-05, + "loss": 0.6881, + "step": 898 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.071, + "step": 900 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.0473, + "step": 902 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.1588, + "step": 904 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.391702865255334e-05, + "loss": 0.0878, + "step": 906 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.2021, + "step": 908 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.3762, + "step": 910 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.8518, + "step": 912 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4019648385012245e-05, + "loss": 0.0181, + "step": 914 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.1592, + "step": 916 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.1322, + "step": 918 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.1366, + "step": 920 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4121765693158355e-05, + "loss": 0.8325, + "step": 922 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.2463, + "step": 924 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.058, + "step": 926 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.0059, + "step": 928 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4223367813134406e-05, + "loss": 0.1082, + "step": 930 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.0008, + "step": 932 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.0051, + "step": 934 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.6201, + "step": 936 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.4324442045477534e-05, + "loss": 0.2649, + "step": 938 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.4172, + "step": 940 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.0442, + "step": 942 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.0649, + "step": 944 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4424975756706684e-05, + "loss": 0.1761, + "step": 946 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.0072, + "step": 948 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.1921, + "step": 950 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.0901, + "step": 952 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4524956380901674e-05, + "loss": 0.0454, + "step": 954 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.0542, + "step": 956 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.4236, + "step": 958 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.4265, + "step": 960 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4624371421273812e-05, + "loss": 0.0224, + "step": 962 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.0246, + "step": 964 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.1201, + "step": 966 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.023, + "step": 968 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4723208451727977e-05, + "loss": 0.383, + "step": 970 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.0058, + "step": 972 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.0061, + "step": 974 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.034, + "step": 976 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4821455118415666e-05, + "loss": 0.5899, + "step": 978 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.1603, + "step": 980 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.2285, + "step": 982 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.3768, + "step": 984 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4919099141279205e-05, + "loss": 0.0903, + "step": 986 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.0292, + "step": 988 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.8227, + "step": 990 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.0124, + "step": 992 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5016128315586626e-05, + "loss": 0.2691, + "step": 994 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.4409, + "step": 996 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.0088, + "step": 998 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.089, + "step": 1000 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.5112530513457229e-05, + "loss": 0.0009, + "step": 1002 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.7044, + "step": 1004 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.7035, + "step": 1006 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.1875, + "step": 1008 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5208293685377354e-05, + "loss": 0.0098, + "step": 1010 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.0197, + "step": 1012 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.2072, + "step": 1014 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.0416, + "step": 1016 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.5303405861706574e-05, + "loss": 0.1154, + "step": 1018 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.5119, + "step": 1020 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.1093, + "step": 1022 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.8024, + "step": 1024 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.539785515417376e-05, + "loss": 0.0614, + "step": 1026 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.0484, + "step": 1028 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.0458, + "step": 1030 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.1341, + "step": 1032 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5491629757363026e-05, + "loss": 0.0131, + "step": 1034 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.213, + "step": 1036 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.1959, + "step": 1038 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.0839, + "step": 1040 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.5584717950189353e-05, + "loss": 0.1903, + "step": 1042 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.0977, + "step": 1044 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.563100100329731e-05, + "loss": 0.2273, + "step": 1046 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.0025, + "step": 1048 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.567710809736356e-05, + "loss": 0.3549, + "step": 1050 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.2379, + "step": 1052 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.572303779162118e-05, + "loss": 0.0229, + "step": 1054 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.2703, + "step": 1056 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5768788650846677e-05, + "loss": 0.2931, + "step": 1058 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.1855, + "step": 1060 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.581435924540481e-05, + "loss": 0.0015, + "step": 1062 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.0475, + "step": 1064 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5859748151293333e-05, + "loss": 0.7034, + "step": 1066 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.0162, + "step": 1068 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.2178, + "step": 1070 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.1326, + "step": 1072 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.594997522948412e-05, + "loss": 0.0228, + "step": 1074 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.0204, + "step": 1076 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.599481058234626e-05, + "loss": 0.1959, + "step": 1078 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.1799, + "step": 1080 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.6039458607746607e-05, + "loss": 0.0039, + "step": 1082 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.0226, + "step": 1084 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.3353, + "step": 1086 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.0478, + "step": 1088 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.6128187101364982e-05, + "loss": 0.0055, + "step": 1090 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.0331, + "step": 1092 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.617226479697104e-05, + "loss": 0.2329, + "step": 1094 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.0068, + "step": 1096 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.621614961997806e-05, + "loss": 0.0051, + "step": 1098 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.1169, + "step": 1100 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.036, + "step": 1102 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.2857, + "step": 1104 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6303335168965474e-05, + "loss": 0.2708, + "step": 1106 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.175, + "step": 1108 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.4169, + "step": 1110 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.0723, + "step": 1112 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6389732850821957e-05, + "loss": 0.0994, + "step": 1114 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.1522, + "step": 1116 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.0611, + "step": 1118 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.0052, + "step": 1120 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.6475331866519377e-05, + "loss": 0.0285, + "step": 1122 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.3768, + "step": 1124 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.3831, + "step": 1126 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.034, + "step": 1128 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6560121516856586e-05, + "loss": 0.1842, + "step": 1130 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.014, + "step": 1132 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.0154, + "step": 1134 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.0072, + "step": 1136 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6644091203796694e-05, + "loss": 0.3086, + "step": 1138 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.2891, + "step": 1140 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.4844, + "step": 1142 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.1555, + "step": 1144 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6727230431791806e-05, + "loss": 0.0794, + "step": 1146 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.0244, + "step": 1148 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.1516, + "step": 1150 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.1434, + "step": 1152 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6809528809094798e-05, + "loss": 0.0315, + "step": 1154 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.0286, + "step": 1156 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.2848, + "step": 1158 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.01, + "step": 1160 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.689097604905826e-05, + "loss": 0.414, + "step": 1162 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.2419, + "step": 1164 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.0608, + "step": 1166 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.0715, + "step": 1168 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6971561971420222e-05, + "loss": 0.0714, + "step": 1170 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.2044, + "step": 1172 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.701152878657196e-05, + "loss": 0.0005, + "step": 1174 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.5394, + "step": 1176 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.705127650357662e-05, + "loss": 0.071, + "step": 1178 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.0164, + "step": 1180 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.3293, + "step": 1182 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.0002, + "step": 1184 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.713010968184029e-05, + "loss": 0.0311, + "step": 1186 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.0367, + "step": 1188 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.716919267969883e-05, + "loss": 0.1595, + "step": 1190 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.0407, + "step": 1192 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7208051652686338e-05, + "loss": 0.0003, + "step": 1194 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.0018, + "step": 1196 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.0405, + "step": 1198 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.4007, + "step": 1200 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.7285092673983753e-05, + "loss": 0.0304, + "step": 1202 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.0034, + "step": 1204 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.0027, + "step": 1206 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.1113, + "step": 1208 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.736122311621314e-05, + "loss": 0.0414, + "step": 1210 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.2865, + "step": 1212 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.237, + "step": 1214 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.0062, + "step": 1216 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.743643346367026e-05, + "loss": 0.1424, + "step": 1218 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.0031, + "step": 1220 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.0813, + "step": 1222 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.0447, + "step": 1224 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7510714315655474e-05, + "loss": 0.0468, + "step": 1226 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.001, + "step": 1228 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7547503274863495e-05, + "loss": 1.1001, + "step": 1230 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.4151, + "step": 1232 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.758405638764873e-05, + "loss": 0.071, + "step": 1234 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.0568, + "step": 1236 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.0461, + "step": 1238 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.0024, + "step": 1240 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7656450512470077e-05, + "loss": 0.02, + "step": 1242 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.1453, + "step": 1244 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.7692289262315e-05, + "loss": 0.5886, + "step": 1246 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.2157, + "step": 1248 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.7727887641425448e-05, + "loss": 0.1327, + "step": 1250 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.9044, + "step": 1252 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.3071, + "step": 1254 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.6591, + "step": 1256 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7798358845437754e-05, + "loss": 0.3133, + "step": 1258 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.0013, + "step": 1260 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.0022, + "step": 1262 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.7067, + "step": 1264 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.786785531616285e-05, + "loss": 0.3535, + "step": 1266 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.0113, + "step": 1268 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.790223530721933e-05, + "loss": 0.1172, + "step": 1270 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.1106, + "step": 1272 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7936368367090577e-05, + "loss": 0.2106, + "step": 1274 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.0005, + "step": 1276 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.0089, + "step": 1278 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.3194, + "step": 1280 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.800388943463047e-05, + "loss": 0.5188, + "step": 1282 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.0203, + "step": 1284 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.803727533238257e-05, + "loss": 0.0003, + "step": 1286 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.0806, + "step": 1288 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8070410079182195e-05, + "loss": 0.0052, + "step": 1290 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.0723, + "step": 1292 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.0202, + "step": 1294 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.0011, + "step": 1296 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.813592198619035e-05, + "loss": 0.0015, + "step": 1298 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 1.0666, + "step": 1300 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.816829709926509e-05, + "loss": 0.4121, + "step": 1302 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.0388, + "step": 1304 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.8200416967183785e-05, + "loss": 0.0572, + "step": 1306 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.2627, + "step": 1308 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.0204, + "step": 1310 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.6114, + "step": 1312 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.8263886960799055e-05, + "loss": 0.5067, + "step": 1314 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.1669, + "step": 1316 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.829523510316813e-05, + "loss": 0.0117, + "step": 1318 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 1.2405, + "step": 1320 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.832632403378808e-05, + "loss": 0.0463, + "step": 1322 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.1823, + "step": 1324 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.1365, + "step": 1326 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.0824, + "step": 1328 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8387720382009665e-05, + "loss": 0.0296, + "step": 1330 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.4264, + "step": 1332 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.0419, + "step": 1334 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.0008, + "step": 1336 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.844806833140501e-05, + "loss": 0.0093, + "step": 1338 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.1367, + "step": 1340 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.2671, + "step": 1342 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.4726, + "step": 1344 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.85073603389569e-05, + "loss": 0.1133, + "step": 1346 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.0364, + "step": 1348 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.5208, + "step": 1350 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.0647, + "step": 1352 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.856558899363248e-05, + "loss": 0.3076, + "step": 1354 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.1763, + "step": 1356 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.85943022840117e-05, + "loss": 0.2104, + "step": 1358 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.1484, + "step": 1360 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.862274701730967e-05, + "loss": 0.0616, + "step": 1362 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.137, + "step": 1364 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.865092230467769e-05, + "loss": 0.0258, + "step": 1366 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.0026, + "step": 1368 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.576, + "step": 1370 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.2616, + "step": 1372 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.87064610283551e-05, + "loss": 0.2087, + "step": 1374 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.3665, + "step": 1376 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.873382272917545e-05, + "loss": 0.0261, + "step": 1378 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.1751, + "step": 1380 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.876091151314196e-05, + "loss": 0.0816, + "step": 1382 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.3357, + "step": 1384 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.8787726533776996e-05, + "loss": 0.0258, + "step": 1386 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.0184, + "step": 1388 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.1036, + "step": 1390 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.0367, + "step": 1392 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.8840531941941415e-05, + "loss": 0.0078, + "step": 1394 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.0865, + "step": 1396 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.1166, + "step": 1398 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.1748, + "step": 1400 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.889223235340958e-05, + "loss": 0.0104, + "step": 1402 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.0223, + "step": 1404 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.2058, + "step": 1406 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.1483, + "step": 1408 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.894282130603823e-05, + "loss": 0.0009, + "step": 1410 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.085, + "step": 1412 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.896769700383315e-05, + "loss": 0.0829, + "step": 1414 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.0484, + "step": 1416 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.899229247660769e-05, + "loss": 0.0887, + "step": 1418 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.0026, + "step": 1420 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.901660695579585e-05, + "loss": 0.0261, + "step": 1422 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.0455, + "step": 1424 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9040639681612212e-05, + "loss": 0.0646, + "step": 1426 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.4623, + "step": 1428 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.9076, + "step": 1430 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.4521, + "step": 1432 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9087856878032886e-05, + "loss": 0.5421, + "step": 1434 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.233, + "step": 1436 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.911103987318148e-05, + "loss": 0.3778, + "step": 1438 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.0183, + "step": 1440 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.913393816409294e-05, + "loss": 0.3066, + "step": 1442 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.2076, + "step": 1444 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.0097, + "step": 1446 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.0277, + "step": 1448 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9178877779995423e-05, + "loss": 0.0006, + "step": 1450 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.0798, + "step": 1452 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.0329, + "step": 1454 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.0512, + "step": 1456 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.9222670108643146e-05, + "loss": 0.2518, + "step": 1458 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.0258, + "step": 1460 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.924413432409622e-05, + "loss": 0.1864, + "step": 1462 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.0999, + "step": 1464 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.926530967634078e-05, + "loss": 0.2455, + "step": 1466 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.0058, + "step": 1468 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.0325, + "step": 1470 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.1963, + "step": 1472 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9306791153479004e-05, + "loss": 0.285, + "step": 1474 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.0647, + "step": 1476 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.932709598214825e-05, + "loss": 0.2397, + "step": 1478 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.0307, + "step": 1480 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.9347109355200672e-05, + "loss": 0.0144, + "step": 1482 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.043, + "step": 1484 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.0332, + "step": 1486 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.2263, + "step": 1488 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9386259242048883e-05, + "loss": 0.2772, + "step": 1490 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.2263, + "step": 1492 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.940539453247842e-05, + "loss": 0.059, + "step": 1494 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.087, + "step": 1496 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9424235920596863e-05, + "loss": 0.1917, + "step": 1498 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.012, + "step": 1500 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.944278281764342e-05, + "loss": 0.5585, + "step": 1502 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.5613, + "step": 1504 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.0463, + "step": 1506 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.0108, + "step": 1508 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.0188, + "step": 1510 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.0289, + "step": 1512 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9496650812887286e-05, + "loss": 0.2851, + "step": 1514 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.4127, + "step": 1516 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.951401404235505e-05, + "loss": 0.0305, + "step": 1518 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 1.2352, + "step": 1520 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.9531079975339912e-05, + "loss": 0.1052, + "step": 1522 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.7068, + "step": 1524 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.2665, + "step": 1526 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.1457, + "step": 1528 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.956431782804402e-05, + "loss": 0.3066, + "step": 1530 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.1436, + "step": 1532 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.0271, + "step": 1534 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.1551, + "step": 1536 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9596360216530436e-05, + "loss": 0.208, + "step": 1538 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.2467, + "step": 1540 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.961193185426459e-05, + "loss": 0.1003, + "step": 1542 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.1313, + "step": 1544 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.9627203135753573e-05, + "loss": 0.6395, + "step": 1546 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.0611, + "step": 1548 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.1077, + "step": 1550 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.1798, + "step": 1552 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.2445, + "step": 1554 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.1383, + "step": 1556 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.967121011775546e-05, + "loss": 0.4587, + "step": 1558 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.1159, + "step": 1560 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9685275296330497e-05, + "loss": 0.1475, + "step": 1562 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.1966, + "step": 1564 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969903782680467e-05, + "loss": 0.0793, + "step": 1566 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.2738, + "step": 1568 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.324, + "step": 1570 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.0414, + "step": 1572 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.6407, + "step": 1574 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.2573, + "step": 1576 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9738505276435692e-05, + "loss": 0.1595, + "step": 1578 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.0171, + "step": 1580 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.0161, + "step": 1582 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.1234, + "step": 1584 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9763296037475174e-05, + "loss": 0.0682, + "step": 1586 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.0004, + "step": 1588 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.0794, + "step": 1590 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.0015, + "step": 1592 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9786866463591732e-05, + "loss": 0.0004, + "step": 1594 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.0039, + "step": 1596 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.0006, + "step": 1598 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 1.0098, + "step": 1600 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.9809213608668185e-05, + "loss": 0.2293, + "step": 1602 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.0008, + "step": 1604 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.0945, + "step": 1606 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.0001, + "step": 1608 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.983033467948784e-05, + "loss": 0.0111, + "step": 1610 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.287, + "step": 1612 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.0001, + "step": 1614 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.6377, + "step": 1616 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.1366, + "step": 1618 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.4037, + "step": 1620 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.985971166354357e-05, + "loss": 0.0514, + "step": 1622 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.1403, + "step": 1624 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986888819206792e-05, + "loss": 0.3532, + "step": 1626 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.0647, + "step": 1628 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.2665, + "step": 1630 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.0796, + "step": 1632 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.988631581494365e-05, + "loss": 0.2642, + "step": 1634 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.6593, + "step": 1636 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0056, + "step": 1638 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.0001, + "step": 1640 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.990250772639552e-05, + "loss": 0.0549, + "step": 1642 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.385, + "step": 1644 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.99101396518405e-05, + "loss": 0.6175, + "step": 1646 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.0549, + "step": 1648 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.244, + "step": 1650 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.2337, + "step": 1652 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.0053, + "step": 1654 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.112, + "step": 1656 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.5662, + "step": 1658 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.2069, + "step": 1660 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.0612, + "step": 1662 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.0775, + "step": 1664 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9943649727366335e-05, + "loss": 0.1098, + "step": 1666 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.0182, + "step": 1668 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.994942036613787e-05, + "loss": 0.1451, + "step": 1670 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.1388, + "step": 1672 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.995488010273198e-05, + "loss": 0.6024, + "step": 1674 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.0001, + "step": 1676 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.0327, + "step": 1678 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.0524, + "step": 1680 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9964866196679105e-05, + "loss": 0.226, + "step": 1682 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.2091, + "step": 1684 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.2656, + "step": 1686 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.137, + "step": 1688 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.2958, + "step": 1690 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.2331, + "step": 1692 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.0886, + "step": 1694 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.0793, + "step": 1696 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.3767, + "step": 1698 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.5332, + "step": 1700 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.998437989229673e-05, + "loss": 0.0276, + "step": 1702 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.0048, + "step": 1704 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.998734708672375e-05, + "loss": 0.2072, + "step": 1706 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0324, + "step": 1708 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.2072, + "step": 1710 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.1363, + "step": 1712 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.9992345130644747e-05, + "loss": 0.074, + "step": 1714 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.0117, + "step": 1716 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.0054, + "step": 1718 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.0035, + "step": 1720 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999609421031453e-05, + "loss": 0.0076, + "step": 1722 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.057, + "step": 1724 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.2443, + "step": 1726 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.0951, + "step": 1728 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.0034, + "step": 1730 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.3352, + "step": 1732 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.1609, + "step": 1734 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.0943, + "step": 1736 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.04, + "step": 1738 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 1.0398, + "step": 1740 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 2e-05, + "loss": 0.0, + "step": 1742 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.0001, + "step": 1744 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.4942, + "step": 1746 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.437, + "step": 1748 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.6542, + "step": 1750 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.0511, + "step": 1752 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.129, + "step": 1754 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.0515, + "step": 1756 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.0468, + "step": 1758 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.0317, + "step": 1760 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.999609421031453e-05, + "loss": 0.0642, + "step": 1762 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.2104, + "step": 1764 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.0037, + "step": 1766 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.9791, + "step": 1768 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999234513064475e-05, + "loss": 0.151, + "step": 1770 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.0518, + "step": 1772 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.3055, + "step": 1774 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.1985, + "step": 1776 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998734708672375e-05, + "loss": 0.2032, + "step": 1778 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.0645, + "step": 1780 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.0838, + "step": 1782 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.0237, + "step": 1784 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.1042, + "step": 1786 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.4274, + "step": 1788 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.4342, + "step": 1790 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0642, + "step": 1792 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.0001, + "step": 1794 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.1242, + "step": 1796 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.5144, + "step": 1798 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.018, + "step": 1800 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.996486619667911e-05, + "loss": 0.3147, + "step": 1802 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.0839, + "step": 1804 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.2524, + "step": 1806 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.0291, + "step": 1808 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.995488010273198e-05, + "loss": 0.0327, + "step": 1810 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.1003, + "step": 1812 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.1213, + "step": 1814 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.1595, + "step": 1816 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.994364972736634e-05, + "loss": 0.0425, + "step": 1818 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.3022, + "step": 1820 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.993756836673986e-05, + "loss": 0.0623, + "step": 1822 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.072, + "step": 1824 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.0061, + "step": 1826 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.1777, + "step": 1828 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.0074, + "step": 1830 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.0852, + "step": 1832 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.1199, + "step": 1834 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.3061, + "step": 1836 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.245, + "step": 1838 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.1088, + "step": 1840 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9902507726395524e-05, + "loss": 0.0967, + "step": 1842 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.0053, + "step": 1844 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0575, + "step": 1846 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.1528, + "step": 1848 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.988631581494365e-05, + "loss": 0.1595, + "step": 1850 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.0333, + "step": 1852 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.987775633490599e-05, + "loss": 0.3554, + "step": 1854 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.2075, + "step": 1856 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.986888819206792e-05, + "loss": 0.0049, + "step": 1858 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.7761, + "step": 1860 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.0058, + "step": 1862 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.0043, + "step": 1864 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.085, + "step": 1866 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.4881, + "step": 1868 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.0032, + "step": 1870 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.4315, + "step": 1872 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.983033467948784e-05, + "loss": 0.1179, + "step": 1874 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.3049, + "step": 1876 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.4883, + "step": 1878 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.016, + "step": 1880 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.980921360866819e-05, + "loss": 0.2162, + "step": 1882 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.0303, + "step": 1884 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.1446, + "step": 1886 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.0763, + "step": 1888 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.978686646359173e-05, + "loss": 0.0216, + "step": 1890 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.0079, + "step": 1892 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.1249, + "step": 1894 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.049, + "step": 1896 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.9763296037475177e-05, + "loss": 0.0049, + "step": 1898 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.4573, + "step": 1900 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.0006, + "step": 1902 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.8807, + "step": 1904 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9738505276435695e-05, + "loss": 0.002, + "step": 1906 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.1593, + "step": 1908 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.1775, + "step": 1910 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.1082, + "step": 1912 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.6593, + "step": 1914 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.0108, + "step": 1916 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.969903782680467e-05, + "loss": 0.0004, + "step": 1918 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.5401, + "step": 1920 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.96852752963305e-05, + "loss": 0.035, + "step": 1922 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.493, + "step": 1924 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.0044, + "step": 1926 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.2809, + "step": 1928 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.3123, + "step": 1930 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.2852, + "step": 1932 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.034, + "step": 1934 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.1179, + "step": 1936 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9627203135753576e-05, + "loss": 0.5843, + "step": 1938 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.3006, + "step": 1940 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.961193185426459e-05, + "loss": 0.2783, + "step": 1942 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.1199, + "step": 1944 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.959636021653044e-05, + "loss": 0.0503, + "step": 1946 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.0891, + "step": 1948 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958048870913786e-05, + "loss": 0.0513, + "step": 1950 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.0511, + "step": 1952 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9564317828044022e-05, + "loss": 0.1877, + "step": 1954 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.6169, + "step": 1956 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.0882, + "step": 1958 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.0981, + "step": 1960 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9531079975339915e-05, + "loss": 0.2532, + "step": 1962 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.2072, + "step": 1964 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.0466, + "step": 1966 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.0137, + "step": 1968 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.2165, + "step": 1970 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.0011, + "step": 1972 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.947899082950751e-05, + "loss": 0.0164, + "step": 1974 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.0203, + "step": 1976 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.0281, + "step": 1978 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.1098, + "step": 1980 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.0039, + "step": 1982 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.0008, + "step": 1984 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.942423592059687e-05, + "loss": 0.018, + "step": 1986 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.0747, + "step": 1988 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.1602, + "step": 1990 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.0003, + "step": 1992 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.938625924204888e-05, + "loss": 0.1318, + "step": 1994 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.1455, + "step": 1996 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.0093, + "step": 1998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.0004, + "step": 2000 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9347109355200676e-05, + "loss": 0.0004, + "step": 2002 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.0145, + "step": 2004 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.0362, + "step": 2006 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.0801, + "step": 2008 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9306791153479017e-05, + "loss": 1.5897, + "step": 2010 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.1746, + "step": 2012 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.928619550368371e-05, + "loss": 0.1106, + "step": 2014 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.1474, + "step": 2016 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9265309676340783e-05, + "loss": 0.0476, + "step": 2018 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.2874, + "step": 2020 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.0621, + "step": 2022 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.029, + "step": 2024 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9222670108643156e-05, + "loss": 0.1902, + "step": 2026 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.0029, + "step": 2028 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.0412, + "step": 2030 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.1754, + "step": 2032 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9178877779995416e-05, + "loss": 0.549, + "step": 2034 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.0605, + "step": 2036 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.016, + "step": 2038 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.0589, + "step": 2040 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9133938164092942e-05, + "loss": 0.0003, + "step": 2042 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.1746, + "step": 2044 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.0019, + "step": 2046 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.0479, + "step": 2048 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.90878568780329e-05, + "loss": 1.0782, + "step": 2050 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.0231, + "step": 2052 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.1322, + "step": 2054 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.027, + "step": 2056 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9040639681612216e-05, + "loss": 0.2416, + "step": 2058 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.0458, + "step": 2060 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.102, + "step": 2062 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.01, + "step": 2064 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8992292476607695e-05, + "loss": 0.1245, + "step": 2066 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.0033, + "step": 2068 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.0778, + "step": 2070 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.0348, + "step": 2072 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8942821306038227e-05, + "loss": 0.1087, + "step": 2074 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.0668, + "step": 2076 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.891766616054545e-05, + "loss": 0.0428, + "step": 2078 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.0036, + "step": 2080 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8892232353409582e-05, + "loss": 0.1369, + "step": 2082 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.0157, + "step": 2084 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.0054, + "step": 2086 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.2856, + "step": 2088 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.884053194194143e-05, + "loss": 0.0027, + "step": 2090 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.0271, + "step": 2092 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.881426695315756e-05, + "loss": 0.8077, + "step": 2094 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.0015, + "step": 2096 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8787726533777003e-05, + "loss": 0.0077, + "step": 2098 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.1006, + "step": 2100 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.265, + "step": 2102 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.0466, + "step": 2104 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8733822729175455e-05, + "loss": 0.0131, + "step": 2106 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.1588, + "step": 2108 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.1333, + "step": 2110 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.1233, + "step": 2112 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.005, + "step": 2114 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.0406, + "step": 2116 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.86509223046777e-05, + "loss": 0.3074, + "step": 2118 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.1088, + "step": 2120 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8622747017309676e-05, + "loss": 0.0509, + "step": 2122 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.007, + "step": 2124 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.1313, + "step": 2126 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.1767, + "step": 2128 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8565588993632498e-05, + "loss": 0.0047, + "step": 2130 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.0034, + "step": 2132 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.088, + "step": 2134 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.0974, + "step": 2136 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8507360338956896e-05, + "loss": 0.0255, + "step": 2138 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.8135, + "step": 2140 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.847784679420527e-05, + "loss": 0.0076, + "step": 2142 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.3762, + "step": 2144 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.8448068331405018e-05, + "loss": 0.0019, + "step": 2146 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.0317, + "step": 2148 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.841802588108161e-05, + "loss": 0.0818, + "step": 2150 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.1322, + "step": 2152 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.838772038200968e-05, + "loss": 0.0925, + "step": 2154 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.1591, + "step": 2156 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.2538, + "step": 2158 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.0033, + "step": 2160 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8326324033788087e-05, + "loss": 0.007, + "step": 2162 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.0362, + "step": 2164 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.1039, + "step": 2166 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.0329, + "step": 2168 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.8263886960799072e-05, + "loss": 0.0004, + "step": 2170 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.016, + "step": 2172 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.0007, + "step": 2174 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.0007, + "step": 2176 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820041696718378e-05, + "loss": 0.041, + "step": 2178 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.0255, + "step": 2180 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.1702, + "step": 2182 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.733, + "step": 2184 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8135921986190358e-05, + "loss": 0.3067, + "step": 2186 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.2254, + "step": 2188 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.0031, + "step": 2190 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.0065, + "step": 2192 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807041007918221e-05, + "loss": 0.4527, + "step": 2194 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.4783, + "step": 2196 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.3824, + "step": 2198 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.0258, + "step": 2200 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.8003889434630476e-05, + "loss": 0.0221, + "step": 2202 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.0649, + "step": 2204 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.0714, + "step": 2206 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.1458, + "step": 2208 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7936368367090583e-05, + "loss": 0.0122, + "step": 2210 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.0576, + "step": 2212 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.0085, + "step": 2214 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.2006, + "step": 2216 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7867855316162846e-05, + "loss": 0.1233, + "step": 2218 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.0155, + "step": 2220 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.783322946823638e-05, + "loss": 0.0066, + "step": 2222 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.1181, + "step": 2224 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.779835884543776e-05, + "loss": 0.0099, + "step": 2226 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.028, + "step": 2228 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.776324453741365e-05, + "loss": 0.0002, + "step": 2230 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.0485, + "step": 2232 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7727887641425465e-05, + "loss": 0.0289, + "step": 2234 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.0235, + "step": 2236 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.0717, + "step": 2238 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.0068, + "step": 2240 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.765645051247007e-05, + "loss": 0.0089, + "step": 2242 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.0212, + "step": 2244 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.762037251178961e-05, + "loss": 0.0933, + "step": 2246 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.0025, + "step": 2248 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7584056387648738e-05, + "loss": 0.0097, + "step": 2250 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.0406, + "step": 2252 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.2641, + "step": 2254 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.0989, + "step": 2256 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7510714315655467e-05, + "loss": 0.0455, + "step": 2258 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.8535, + "step": 2260 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7473690659616e-05, + "loss": 0.0571, + "step": 2262 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.1448, + "step": 2264 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.743643346367027e-05, + "loss": 0.0013, + "step": 2266 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.0275, + "step": 2268 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.739894389204122e-05, + "loss": 0.019, + "step": 2270 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.3946, + "step": 2272 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7361223116213146e-05, + "loss": 0.0001, + "step": 2274 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.0013, + "step": 2276 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.732327231489503e-05, + "loss": 0.4521, + "step": 2278 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.0288, + "step": 2280 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.728509267398376e-05, + "loss": 0.0658, + "step": 2282 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.3208, + "step": 2284 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.1613, + "step": 2286 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.1949, + "step": 2288 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7208051652686348e-05, + "loss": 0.1907, + "step": 2290 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.0098, + "step": 2292 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.716919267969884e-05, + "loss": 0.4879, + "step": 2294 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.0111, + "step": 2296 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.7130109681840298e-05, + "loss": 0.066, + "step": 2298 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.0002, + "step": 2300 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.3147, + "step": 2302 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.0248, + "step": 2304 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.705127650357663e-05, + "loss": 0.0227, + "step": 2306 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.789, + "step": 2308 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.701152878657197e-05, + "loss": 0.001, + "step": 2310 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.2123, + "step": 2312 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.697156197142023e-05, + "loss": 0.0002, + "step": 2314 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.2305, + "step": 2316 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.0877, + "step": 2318 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.0371, + "step": 2320 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.6890976049058267e-05, + "loss": 0.0258, + "step": 2322 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.0002, + "step": 2324 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.0036, + "step": 2326 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.1881, + "step": 2328 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.6809528809094805e-05, + "loss": 0.3493, + "step": 2330 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.1645, + "step": 2332 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.67684853721737e-05, + "loss": 0.1463, + "step": 2334 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.8528, + "step": 2336 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.6727230431791826e-05, + "loss": 0.0117, + "step": 2338 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 1.9378, + "step": 2340 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.0039, + "step": 2342 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.4301, + "step": 2344 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.66440912037967e-05, + "loss": 0.0202, + "step": 2346 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.0949, + "step": 2348 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.4167, + "step": 2350 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.0196, + "step": 2352 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6560121516856592e-05, + "loss": 0.1803, + "step": 2354 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.2539, + "step": 2356 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.651782852712194e-05, + "loss": 0.1134, + "step": 2358 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.0166, + "step": 2360 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.6475331866519387e-05, + "loss": 0.073, + "step": 2362 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.1229, + "step": 2364 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.7828, + "step": 2366 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.0024, + "step": 2368 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6389732850821964e-05, + "loss": 0.029, + "step": 2370 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.2533, + "step": 2372 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.0467, + "step": 2374 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.3091, + "step": 2376 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6303335168965495e-05, + "loss": 0.0408, + "step": 2378 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.6963, + "step": 2380 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.625984019906122e-05, + "loss": 0.0389, + "step": 2382 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.1206, + "step": 2384 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6216149619978057e-05, + "loss": 0.5189, + "step": 2386 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.0083, + "step": 2388 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.0977, + "step": 2390 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.0825, + "step": 2392 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.612818710136499e-05, + "loss": 0.0081, + "step": 2394 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.576, + "step": 2396 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.1246, + "step": 2398 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.3569, + "step": 2400 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.60394586077466e-05, + "loss": 0.2307, + "step": 2402 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.1766, + "step": 2404 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.2258, + "step": 2406 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.0256, + "step": 2408 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.594997522948413e-05, + "loss": 0.3328, + "step": 2410 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.1389, + "step": 2412 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.143, + "step": 2414 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.014, + "step": 2416 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5859748151293354e-05, + "loss": 1.4022, + "step": 2418 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.0127, + "step": 2420 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.581435924540482e-05, + "loss": 0.7333, + "step": 2422 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.0675, + "step": 2424 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5768788650846674e-05, + "loss": 0.0202, + "step": 2426 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.1511, + "step": 2428 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.0173, + "step": 2430 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.0188, + "step": 2432 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.5677108097363565e-05, + "loss": 0.0922, + "step": 2434 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.1461, + "step": 2436 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.2591, + "step": 2438 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.1226, + "step": 2440 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5584717950189373e-05, + "loss": 0.2198, + "step": 2442 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.2688, + "step": 2444 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.052, + "step": 2446 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.0048, + "step": 2448 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.4595, + "step": 2450 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.0701, + "step": 2452 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.544482752648966e-05, + "loss": 0.1642, + "step": 2454 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.0853, + "step": 2456 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.539785515417377e-05, + "loss": 0.0321, + "step": 2458 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.0576, + "step": 2460 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.0124, + "step": 2462 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.6578, + "step": 2464 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.5303405861706567e-05, + "loss": 0.0207, + "step": 2466 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.007, + "step": 2468 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.1112, + "step": 2470 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.039, + "step": 2472 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.5208293685377362e-05, + "loss": 0.0893, + "step": 2474 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.0713, + "step": 2476 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.0533, + "step": 2478 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.3525, + "step": 2480 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.5112530513457251e-05, + "loss": 0.1159, + "step": 2482 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.1732, + "step": 2484 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.5729, + "step": 2486 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.1649, + "step": 2488 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5016128315586636e-05, + "loss": 0.108, + "step": 2490 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.0274, + "step": 2492 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.0882, + "step": 2494 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.1361, + "step": 2496 + }, + { + "epoch": 1.0, + "learning_rate": 1.4919099141279214e-05, + "loss": 0.0709, + "step": 2498 + }, + { + "epoch": 1.0, + "step": 2498, + "total_flos": 1.5680061851041792e+16, + "train_loss": 0.1484192200493866, + "train_runtime": 8093.8434, + "train_samples_per_second": 2.469, + "train_steps_per_second": 0.309 + } + ], + "logging_steps": 2, + "max_steps": 2498, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1.5680061851041792e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..4acacd3669129aa61e63c5e2f45636e26637a9fc --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3c1d41afac2406d86f62eb09cb6a6aacf1b76a9715e16ab2cbe6348c2f5de19 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3b6cb40713fd1691712f910d5976c98394ac17aa --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37d581ef7e214b4500e730670e177c50f76cd34fbae9a255d56fdddfd60240a8 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ccc3919b6638ba9acec18a1581897c4bb9950f6 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd4b560626ff313aebea9d6b8d0f8df854d6434ed52184b30915957fb4cd8759 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f57ddb08a3bf168473dc23f1a354f15789e61fe8 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_125_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad126653757c3f703fec9e65536b73349b72c2b5bba725f11f86a6ffd54c3e1 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4b397e1f07f4e3237cb5caad5f2dbf89e1ac91df --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/0_trainer_state.json @@ -0,0 +1,15020 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4996, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.406842319175051e-06, + "loss": 0.0124, + "step": 2 + }, + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.415943612351265e-06, + "loss": 0.0468, + "step": 4 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4250597173539104e-06, + "loss": 0.1542, + "step": 6 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.2499, + "step": 8 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.443336291593801e-06, + "loss": 0.012, + "step": 10 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.0064, + "step": 12 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.461671899116598e-06, + "loss": 0.0266, + "step": 14 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.0396, + "step": 16 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4800663966830417e-06, + "loss": 0.0376, + "step": 18 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4892856843445236e-06, + "loss": 0.0082, + "step": 20 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.4985196405937807e-06, + "loss": 0.1764, + "step": 22 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.0097, + "step": 24 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5170314866905443e-06, + "loss": 0.0026, + "step": 26 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.2071, + "step": 28 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.535601790357246e-06, + "loss": 0.1081, + "step": 30 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.1143, + "step": 32 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5542304065211578e-06, + "loss": 0.0458, + "step": 34 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5635665363297356e-06, + "loss": 0.0043, + "step": 36 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5729171896539763e-06, + "loss": 0.5069, + "step": 38 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.0199, + "step": 40 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.5916619937729915e-06, + "loss": 0.0124, + "step": 42 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.0043, + "step": 44 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6104646724422643e-06, + "loss": 0.3063, + "step": 46 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.0083, + "step": 48 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.629325078773699e-06, + "loss": 0.0043, + "step": 50 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.6387768837868565e-06, + "loss": 0.5353, + "step": 52 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.648243065428239e-06, + "loss": 0.5936, + "step": 54 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.0345, + "step": 56 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6672184846169934e-06, + "loss": 0.0859, + "step": 58 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.0182, + "step": 60 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.686251188102439e-06, + "loss": 0.0819, + "step": 62 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.0407, + "step": 64 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7053410271995085e-06, + "loss": 0.0141, + "step": 66 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.1275, + "step": 68 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.724487852776785e-06, + "loss": 0.3795, + "step": 70 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.1963, + "step": 72 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7436915152577038e-06, + "loss": 0.0001, + "step": 74 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.0065, + "step": 76 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.7629518646216522e-06, + "loss": 0.0206, + "step": 78 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.2184, + "step": 80 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.782268750405185e-06, + "loss": 0.0038, + "step": 82 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.7919483473136555e-06, + "loss": 0.1508, + "step": 84 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.801642021703177e-06, + "loss": 0.0097, + "step": 86 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.0708, + "step": 88 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.821071527170053e-06, + "loss": 0.0118, + "step": 90 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.0987, + "step": 92 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.8405571150208945e-06, + "loss": 0.1529, + "step": 94 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.0367, + "step": 96 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.860098633032663e-06, + "loss": 0.5394, + "step": 98 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.8698903181597026e-06, + "loss": 0.0086, + "step": 100 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.879695928545424e-06, + "loss": 0.1052, + "step": 102 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.0719, + "step": 104 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.899348848463471e-06, + "loss": 0.0002, + "step": 106 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.909196119613218e-06, + "loss": 0.0001, + "step": 108 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.9190572392565643e-06, + "loss": 0.0079, + "step": 110 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.0064, + "step": 112 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9388209469611093e-06, + "loss": 0.054, + "step": 114 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9487234964233724e-06, + "loss": 0.0064, + "step": 116 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9586398171814114e-06, + "loss": 0.0286, + "step": 118 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.0074, + "step": 120 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.9785136950907987e-06, + "loss": 0.0042, + "step": 122 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.988471213428035e-06, + "loss": 0.0124, + "step": 124 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 2.9984424254328936e-06, + "loss": 0.047, + "step": 126 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.0845, + "step": 128 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0184258525227895e-06, + "loss": 0.6016, + "step": 130 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0284380285797733e-06, + "loss": 0.0037, + "step": 132 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.038463820248324e-06, + "loss": 0.0089, + "step": 134 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.0517, + "step": 136 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.0585561720712207e-06, + "loss": 0.007, + "step": 138 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.068622692984767e-06, + "loss": 0.001, + "step": 140 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0787027510283495e-06, + "loss": 0.1047, + "step": 142 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.0054, + "step": 144 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.098903399732992e-06, + "loss": 0.6199, + "step": 146 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.0035, + "step": 148 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.1191579603759946e-06, + "loss": 0.1587, + "step": 150 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.0073, + "step": 152 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.139466274727052e-06, + "loss": 0.2852, + "step": 154 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.1353, + "step": 156 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.159828184135917e-06, + "loss": 0.0874, + "step": 158 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.3596, + "step": 160 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1802435295336908e-06, + "loss": 0.1581, + "step": 162 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1904711909051967e-06, + "loss": 0.0162, + "step": 164 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2007121514339924e-06, + "loss": 0.2441, + "step": 166 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.3694, + "step": 168 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.221233889934239e-06, + "loss": 0.0042, + "step": 170 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.231514627826302e-06, + "loss": 0.0141, + "step": 172 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2418085847169344e-06, + "loss": 0.041, + "step": 174 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.0469, + "step": 176 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2624360750508457e-06, + "loss": 0.3182, + "step": 178 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2727695682081897e-06, + "loss": 0.011, + "step": 180 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.28311619979231e-06, + "loss": 0.1958, + "step": 182 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.0616, + "step": 184 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.303848797386465e-06, + "loss": 0.0543, + "step": 186 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.314234722905302e-06, + "loss": 0.052, + "step": 188 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.3246337058685697e-06, + "loss": 0.0218, + "step": 190 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.007, + "step": 192 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.3454707628651806e-06, + "loss": 0.352, + "step": 194 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.355908796203301e-06, + "loss": 0.0047, + "step": 196 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3663598055954716e-06, + "loss": 0.0968, + "step": 198 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.0036, + "step": 200 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3873006708725365e-06, + "loss": 0.2149, + "step": 202 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.0395, + "step": 204 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.408293195104586e-06, + "loss": 0.0461, + "step": 206 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.0076, + "step": 208 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4293372142962845e-06, + "loss": 0.036, + "step": 210 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4398784831434097e-06, + "loss": 0.038, + "step": 212 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.4504325640499936e-06, + "loss": 0.5063, + "step": 214 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.0376, + "step": 216 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4715790795671232e-06, + "loss": 0.0055, + "step": 218 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.0033, + "step": 220 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.4927765956493276e-06, + "loss": 0.0219, + "step": 222 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.0069, + "step": 224 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.514024946699842e-06, + "loss": 0.1642, + "step": 226 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.0404, + "step": 228 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.535323966724814e-06, + "loss": 0.0511, + "step": 230 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.0125, + "step": 232 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.556673489334522e-06, + "loss": 0.0042, + "step": 234 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.567367137003953e-06, + "loss": 0.0115, + "step": 236 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.5780733477447127e-06, + "loss": 0.0099, + "step": 238 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.5879, + "step": 240 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.5995233747779467e-06, + "loss": 0.0124, + "step": 242 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.6102671491780393e-06, + "loss": 0.0043, + "step": 244 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6210234028648216e-06, + "loss": 0.1926, + "step": 246 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.0164, + "step": 248 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.6425732640453235e-06, + "loss": 0.0038, + "step": 250 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.653366829451711e-06, + "loss": 0.2777, + "step": 252 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.6641727899701795e-06, + "loss": 0.0879, + "step": 254 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.0582, + "step": 256 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.6858218119020884e-06, + "loss": 0.0125, + "step": 258 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.696664831034521e-06, + "loss": 0.0075, + "step": 260 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7075201607170997e-06, + "loss": 0.0223, + "step": 262 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.0062, + "step": 264 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.729267666905899e-06, + "loss": 0.0001, + "step": 266 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.740159800938784e-06, + "loss": 0.1066, + "step": 268 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.751064160575195e-06, + "loss": 0.0004, + "step": 270 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.0124, + "step": 272 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.772909471448959e-06, + "loss": 0.008, + "step": 274 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.783850380021933e-06, + "loss": 0.0118, + "step": 276 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.794803428869799e-06, + "loss": 0.0017, + "step": 278 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.2145, + "step": 280 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.816745861800334e-06, + "loss": 0.0013, + "step": 282 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.827735203028956e-06, + "loss": 0.0096, + "step": 284 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.838736598824446e-06, + "loss": 0.0248, + "step": 286 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.0026, + "step": 288 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.860775468148662e-06, + "loss": 0.0092, + "step": 290 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.871812898635011e-06, + "loss": 0.0053, + "step": 292 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.882862297603536e-06, + "loss": 0.003, + "step": 294 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 1.0124, + "step": 296 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.904996914644913e-06, + "loss": 0.0663, + "step": 298 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.916082089488379e-06, + "loss": 0.0094, + "step": 300 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.927179146355317e-06, + "loss": 0.0254, + "step": 302 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.1662, + "step": 304 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.949408819445345e-06, + "loss": 0.0634, + "step": 306 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.960541392253387e-06, + "loss": 0.0423, + "step": 308 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.971685760254933e-06, + "loss": 0.014, + "step": 310 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.0111, + "step": 312 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 3.994009794754777e-06, + "loss": 0.3424, + "step": 314 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 4.005189417653737e-06, + "loss": 0.0118, + "step": 316 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.016380748547654e-06, + "loss": 0.1344, + "step": 318 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.2704, + "step": 320 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.038798446869847e-06, + "loss": 0.0144, + "step": 322 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.050024770515873e-06, + "loss": 0.0188, + "step": 324 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.061262714592426e-06, + "loss": 0.3915, + "step": 326 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.0002, + "step": 328 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.0837733762226584e-06, + "loss": 0.0971, + "step": 330 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.095046049812541e-06, + "loss": 0.0226, + "step": 332 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.106330255905417e-06, + "loss": 0.0048, + "step": 334 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.0081, + "step": 336 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.128933177424475e-06, + "loss": 0.0116, + "step": 338 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.1402518487066624e-06, + "loss": 0.2303, + "step": 340 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.151581964203924e-06, + "loss": 0.4936, + "step": 342 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.0634, + "step": 344 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.174276439309593e-06, + "loss": 0.0048, + "step": 346 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.0272, + "step": 348 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.197016425450347e-06, + "loss": 0.0048, + "step": 350 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.0693, + "step": 352 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.219801744979517e-06, + "loss": 0.0498, + "step": 354 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.2312113491533145e-06, + "loss": 0.0359, + "step": 356 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.242632219896328e-06, + "loss": 0.0133, + "step": 358 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.0023, + "step": 360 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.2655076718472045e-06, + "loss": 0.0074, + "step": 362 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.276962208378814e-06, + "loss": 0.146, + "step": 364 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.28842792212722e-06, + "loss": 0.0067, + "step": 366 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.0001, + "step": 368 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3113927916814665e-06, + "loss": 0.0109, + "step": 370 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3228919026364345e-06, + "loss": 0.226, + "step": 372 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.33440210110651e-06, + "loss": 0.0199, + "step": 374 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.0234, + "step": 376 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.3574556706517035e-06, + "loss": 0.2675, + "step": 378 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.368998996702686e-06, + "loss": 0.0128, + "step": 380 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.380553320220638e-06, + "loss": 0.0078, + "step": 382 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.4056, + "step": 384 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.403694869372589e-06, + "loss": 0.0026, + "step": 386 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.415282049810643e-06, + "loss": 0.0076, + "step": 388 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4268801373238454e-06, + "loss": 0.0059, + "step": 390 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.1038, + "step": 392 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.450108942949158e-06, + "loss": 0.0319, + "step": 394 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.461739615694921e-06, + "loss": 0.0038, + "step": 396 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.473381104783201e-06, + "loss": 0.0055, + "step": 398 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.0023, + "step": 400 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.496696441021904e-06, + "loss": 0.6122, + "step": 402 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.5083702426369715e-06, + "loss": 0.0124, + "step": 404 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.520054769523929e-06, + "loss": 0.0155, + "step": 406 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.0027, + "step": 408 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.543455907812063e-06, + "loss": 0.0188, + "step": 410 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.555172473510324e-06, + "loss": 0.2692, + "step": 412 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.566899673074706e-06, + "loss": 0.0086, + "step": 414 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.2757, + "step": 416 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.590385882167206e-06, + "loss": 0.0002, + "step": 418 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.602144845826234e-06, + "loss": 0.0062, + "step": 420 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.613914351613337e-06, + "loss": 0.0016, + "step": 422 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.0519, + "step": 424 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.637484897606777e-06, + "loss": 0.0075, + "step": 426 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.649285891779326e-06, + "loss": 0.0103, + "step": 428 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.661097336012451e-06, + "loss": 0.0046, + "step": 430 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.0641, + "step": 432 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.684751482368022e-06, + "loss": 0.0222, + "step": 434 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.696594138293421e-06, + "loss": 0.008, + "step": 436 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.7084471518853656e-06, + "loss": 0.0087, + "step": 438 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.0134, + "step": 440 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.732184159451937e-06, + "loss": 0.0165, + "step": 442 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.744068107067673e-06, + "loss": 0.0019, + "step": 444 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.755962319632249e-06, + "loss": 0.0001, + "step": 446 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.0509, + "step": 448 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.779781446669376e-06, + "loss": 0.0065, + "step": 450 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.79170631462264e-06, + "loss": 0.3302, + "step": 452 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.8036413544863095e-06, + "loss": 0.3878, + "step": 454 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.0009, + "step": 456 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.827541856687471e-06, + "loss": 0.1511, + "step": 458 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.839507272346751e-06, + "loss": 0.0086, + "step": 460 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.8514827665601425e-06, + "loss": 0.3543, + "step": 462 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.0225, + "step": 464 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.875463897075985e-06, + "loss": 0.0179, + "step": 466 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.0097, + "step": 468 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.899485060892404e-06, + "loss": 0.1328, + "step": 470 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.071, + "step": 472 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.9235460703540615e-06, + "loss": 0.6449, + "step": 474 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.935591458474425e-06, + "loss": 0.0063, + "step": 476 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.947646737494389e-06, + "loss": 0.0023, + "step": 478 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.7921, + "step": 480 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9717868740369645e-06, + "loss": 0.017, + "step": 482 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9838716844133665e-06, + "loss": 0.0013, + "step": 484 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 4.9959662913970254e-06, + "loss": 0.1608, + "step": 486 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.0678, + "step": 488 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.02018480068299e-06, + "loss": 0.0107, + "step": 490 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.032308655686007e-06, + "loss": 0.0177, + "step": 492 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.044442212697842e-06, + "loss": 0.3409, + "step": 494 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.0333, + "step": 496 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.068738337940655e-06, + "loss": 0.0097, + "step": 498 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.080900858720789e-06, + "loss": 0.1292, + "step": 500 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.093072986608116e-06, + "loss": 0.0248, + "step": 502 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.1738, + "step": 504 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.1174459685959175e-06, + "loss": 0.0005, + "step": 506 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.129646775095432e-06, + "loss": 0.1787, + "step": 508 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.141857093500307e-06, + "loss": 0.1812, + "step": 510 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.0042, + "step": 512 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.166306170619537e-06, + "loss": 0.0815, + "step": 514 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.178544881584328e-06, + "loss": 0.2327, + "step": 516 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.190793008955421e-06, + "loss": 0.0199, + "step": 518 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.0888, + "step": 520 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.215317417214739e-06, + "loss": 0.0933, + "step": 522 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.0462, + "step": 524 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.239879203810763e-06, + "loss": 0.0037, + "step": 526 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.01, + "step": 528 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.264478176864811e-06, + "loss": 0.003, + "step": 530 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.2767915482720164e-06, + "loss": 0.0106, + "step": 532 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.289114144207656e-06, + "loss": 0.0042, + "step": 534 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.0148, + "step": 536 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.313786913381061e-06, + "loss": 0.0282, + "step": 538 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.0591, + "step": 540 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.338496291639341e-06, + "loss": 0.0691, + "step": 542 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.1038, + "step": 544 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.363242085950773e-06, + "loss": 0.0097, + "step": 546 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.375628578726181e-06, + "loss": 0.1313, + "step": 548 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.3880241029991434e-06, + "loss": 0.1904, + "step": 550 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.02, + "step": 552 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.412842149185316e-06, + "loss": 0.0029, + "step": 554 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.425264622628326e-06, + "loss": 0.0568, + "step": 556 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.437696030628639e-06, + "loss": 0.0052, + "step": 558 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.0047, + "step": 560 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.462585553168532e-06, + "loss": 0.6187, + "step": 562 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.475043619098321e-06, + "loss": 0.4302, + "step": 564 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.487510522365969e-06, + "loss": 0.4455, + "step": 566 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.0035, + "step": 568 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.512470743505057e-06, + "loss": 0.046, + "step": 570 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.524964012628644e-06, + "loss": 0.0047, + "step": 572 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.537466021594464e-06, + "loss": 0.1496, + "step": 574 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.0236, + "step": 576 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.5624961613689934e-06, + "loss": 0.0067, + "step": 578 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.57502424329331e-06, + "loss": 0.0634, + "step": 580 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.5875609672911465e-06, + "loss": 0.1327, + "step": 582 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.1738, + "step": 584 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.6126602435525725e-06, + "loss": 0.0235, + "step": 586 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.62522274679673e-06, + "loss": 0.0192, + "step": 588 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.637793794075625e-06, + "loss": 0.0069, + "step": 590 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.0068, + "step": 592 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.662961422514961e-06, + "loss": 0.0065, + "step": 594 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.675557954522462e-06, + "loss": 0.0424, + "step": 596 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.688162932258965e-06, + "loss": 0.2188, + "step": 598 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.1761, + "step": 600 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.713398126431353e-06, + "loss": 0.0107, + "step": 602 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.726028293582342e-06, + "loss": 0.0112, + "step": 604 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.738666807892684e-06, + "loss": 0.0057, + "step": 606 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.0011, + "step": 608 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.763968779241957e-06, + "loss": 0.2857, + "step": 610 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.776632186865589e-06, + "loss": 0.02, + "step": 612 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.7893038428180584e-06, + "loss": 0.2251, + "step": 614 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.0079, + "step": 616 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.814671800701357e-06, + "loss": 0.1822, + "step": 618 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.827368053088032e-06, + "loss": 0.0182, + "step": 620 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.840072454715297e-06, + "loss": 0.0281, + "step": 622 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.2302, + "step": 624 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.865505606427848e-06, + "loss": 0.1837, + "step": 626 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.878234306841637e-06, + "loss": 0.0129, + "step": 628 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.890971057153105e-06, + "loss": 0.0023, + "step": 630 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.1906, + "step": 632 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.916468607952892e-06, + "loss": 0.0155, + "step": 634 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.929229358643925e-06, + "loss": 0.0289, + "step": 636 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.941998059638212e-06, + "loss": 0.0114, + "step": 638 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.1762, + "step": 640 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9675592127708585e-06, + "loss": 0.0078, + "step": 642 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9803516149877475e-06, + "loss": 0.1084, + "step": 644 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 5.993151867665015e-06, + "loss": 0.4576, + "step": 646 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.0033, + "step": 648 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.01877582438873e-06, + "loss": 0.3519, + "step": 650 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.03159947839103e-06, + "loss": 0.1319, + "step": 652 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.0444308827655265e-06, + "loss": 0.0125, + "step": 654 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.0042, + "step": 656 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.070116842375947e-06, + "loss": 0.007, + "step": 658 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.082971347446654e-06, + "loss": 0.0001, + "step": 660 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.095833502559182e-06, + "loss": 0.0013, + "step": 662 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.3794, + "step": 664 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.121580662414533e-06, + "loss": 0.0033, + "step": 666 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.13446561687258e-06, + "loss": 0.0095, + "step": 668 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.147358120803041e-06, + "loss": 0.207, + "step": 670 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.0165, + "step": 672 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.173165676349095e-06, + "loss": 0.0074, + "step": 674 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.186080677561974e-06, + "loss": 0.0319, + "step": 676 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.1990031274419186e-06, + "loss": 0.0015, + "step": 678 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.0404, + "step": 680 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.224870272237185e-06, + "loss": 0.3826, + "step": 682 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.237814916633431e-06, + "loss": 0.0185, + "step": 684 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.250766908658652e-06, + "loss": 0.0045, + "step": 686 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.645, + "step": 688 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.2766928343996314e-06, + "loss": 0.002, + "step": 690 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.289666717481496e-06, + "loss": 0.0663, + "step": 692 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.3026478469246285e-06, + "loss": 0.6162, + "step": 694 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.0324, + "step": 696 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.328631743470968e-06, + "loss": 0.4025, + "step": 698 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.341634459827044e-06, + "loss": 0.0092, + "step": 700 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.354644321050279e-06, + "loss": 0.0001, + "step": 702 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.222, + "step": 704 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.380685376450153e-06, + "loss": 0.0036, + "step": 706 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.393716519768032e-06, + "loss": 0.0344, + "step": 708 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.406754706235692e-06, + "loss": 0.0256, + "step": 710 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.03, + "step": 712 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.432852106751162e-06, + "loss": 0.0124, + "step": 714 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.445911269830183e-06, + "loss": 0.0029, + "step": 716 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.458977374121492e-06, + "loss": 0.0756, + "step": 718 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.0108, + "step": 720 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.485130304253915e-06, + "loss": 0.0658, + "step": 722 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.498217079017806e-06, + "loss": 0.2104, + "step": 724 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.511310692839605e-06, + "loss": 0.2439, + "step": 726 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.0013, + "step": 728 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.537518335355182e-06, + "loss": 0.225, + "step": 730 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.1093, + "step": 732 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.563753027064355e-06, + "loss": 0.2603, + "step": 734 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.0286, + "step": 736 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.590014563019571e-06, + "loss": 0.0971, + "step": 738 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.603155333485934e-06, + "loss": 0.0164, + "step": 740 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.61630273806352e-06, + "loss": 0.0407, + "step": 742 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.0532, + "step": 744 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.642617346830784e-06, + "loss": 0.4811, + "step": 746 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.655784499627476e-06, + "loss": 0.0148, + "step": 748 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.6689581837494925e-06, + "loss": 0.5057, + "step": 750 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.4743, + "step": 752 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.695325043042827e-06, + "loss": 0.0143, + "step": 754 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.7085181667191e-06, + "loss": 0.0789, + "step": 756 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.7217177187307e-06, + "loss": 0.0253, + "step": 758 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.0122, + "step": 760 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.748136004631327e-06, + "loss": 0.0019, + "step": 762 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.761354686924883e-06, + "loss": 0.0023, + "step": 764 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.774579694362902e-06, + "loss": 0.0657, + "step": 766 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.0123, + "step": 768 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.801048581345113e-06, + "loss": 0.0151, + "step": 770 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.8142924091949955e-06, + "loss": 0.3758, + "step": 772 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.827542458800804e-06, + "loss": 0.0004, + "step": 774 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.0008, + "step": 776 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.854061119757647e-06, + "loss": 0.0212, + "step": 778 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.867329679317144e-06, + "loss": 0.1763, + "step": 780 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.880604357049646e-06, + "loss": 0.0023, + "step": 782 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.0002, + "step": 784 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.907171963318815e-06, + "loss": 0.285, + "step": 786 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.920464839968391e-06, + "loss": 1.0925, + "step": 788 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.9337637310168494e-06, + "loss": 0.1321, + "step": 790 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.0052, + "step": 792 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.960379452406636e-06, + "loss": 0.0576, + "step": 794 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.973696230766884e-06, + "loss": 0.0037, + "step": 796 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 6.9870189195639595e-06, + "loss": 0.0834, + "step": 798 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.5316, + "step": 800 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.013681924379073e-06, + "loss": 0.0709, + "step": 802 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.027022188323704e-06, + "loss": 0.0547, + "step": 804 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.040368258558412e-06, + "loss": 0.0436, + "step": 806 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.0033, + "step": 808 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.0670777136261035e-06, + "loss": 0.0219, + "step": 810 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.080441046294945e-06, + "loss": 0.2076, + "step": 812 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.093810080925657e-06, + "loss": 0.0112, + "step": 814 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.0787, + "step": 816 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.120565151621638e-06, + "loss": 0.0063, + "step": 818 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.133951135433656e-06, + "loss": 0.171, + "step": 820 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.1473427167012e-06, + "loss": 0.1278, + "step": 822 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.1191, + "step": 824 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.1741425669757854e-06, + "loss": 0.2681, + "step": 826 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.18755078364214e-06, + "loss": 0.0504, + "step": 828 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.200964493082727e-06, + "loss": 0.0017, + "step": 830 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.0168, + "step": 832 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.227808285486952e-06, + "loss": 0.2642, + "step": 834 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.241238316024064e-06, + "loss": 0.0739, + "step": 836 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.254673734482513e-06, + "loss": 0.0124, + "step": 838 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.02, + "step": 840 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2815606301942945e-06, + "loss": 0.0068, + "step": 842 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.0766, + "step": 844 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.308468762579623e-06, + "loss": 0.0271, + "step": 846 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.0354, + "step": 848 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.3353979214299765e-06, + "loss": 0.1633, + "step": 850 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.348870320044395e-06, + "loss": 0.2286, + "step": 852 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.362347896372515e-06, + "loss": 0.07, + "step": 854 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.0042, + "step": 856 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.389318476871784e-06, + "loss": 0.1077, + "step": 858 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.402811428368824e-06, + "loss": 0.0127, + "step": 860 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.416309452231411e-06, + "loss": 0.0125, + "step": 862 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.009, + "step": 864 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.443320611595641e-06, + "loss": 0.0399, + "step": 866 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.03, + "step": 868 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.470351743951061e-06, + "loss": 0.0054, + "step": 870 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.0225, + "step": 872 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.497402638128209e-06, + "loss": 0.005, + "step": 874 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.510935429867233e-06, + "loss": 0.0158, + "step": 876 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.52447308280329e-06, + "loss": 0.0043, + "step": 878 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.8699, + "step": 880 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.551562866499732e-06, + "loss": 0.0018, + "step": 882 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.5651149443531846e-06, + "loss": 0.119, + "step": 884 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.578671777589884e-06, + "loss": 0.0244, + "step": 886 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.0129, + "step": 888 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.605799604296721e-06, + "loss": 0.0007, + "step": 890 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.619370544785608e-06, + "loss": 0.2501, + "step": 892 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.632946134695396e-06, + "loss": 0.011, + "step": 894 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.0721, + "step": 896 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.660111156714964e-06, + "loss": 0.0205, + "step": 898 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.67370053577085e-06, + "loss": 0.0774, + "step": 900 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.687294458140006e-06, + "loss": 0.0042, + "step": 902 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.0271, + "step": 904 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.714495826612353e-06, + "loss": 0.2146, + "step": 906 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.728103219590684e-06, + "loss": 0.0083, + "step": 908 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.741715049632646e-06, + "loss": 0.0221, + "step": 910 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.0002, + "step": 912 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.76895191456204e-06, + "loss": 0.2641, + "step": 914 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.7825768962553e-06, + "loss": 0.0115, + "step": 916 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.796206208623925e-06, + "loss": 0.1835, + "step": 918 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.0022, + "step": 920 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.82347771890548e-06, + "loss": 0.0129, + "step": 922 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.83711986355656e-06, + "loss": 0.0336, + "step": 924 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.850766232359408e-06, + "loss": 0.0061, + "step": 926 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.003, + "step": 928 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.878071535805564e-06, + "loss": 0.0048, + "step": 930 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.891730417121043e-06, + "loss": 0.0876, + "step": 932 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.90539341593269e-06, + "loss": 0.0362, + "step": 934 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.0001, + "step": 936 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.932731659299978e-06, + "loss": 0.006, + "step": 938 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.946406850463435e-06, + "loss": 0.0123, + "step": 940 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.960086052338788e-06, + "loss": 0.0076, + "step": 942 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.0359, + "step": 944 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 7.987456381354371e-06, + "loss": 0.005, + "step": 946 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 8.001147455039737e-06, + "loss": 0.1388, + "step": 948 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.01484243252743e-06, + "loss": 0.0012, + "step": 950 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.0023, + "step": 952 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.042243991915866e-06, + "loss": 0.0006, + "step": 954 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.055950520300756e-06, + "loss": 0.0023, + "step": 956 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.069660845456411e-06, + "loss": 0.0002, + "step": 958 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.0509, + "step": 960 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.097092778966364e-06, + "loss": 0.0662, + "step": 962 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.110814333745503e-06, + "loss": 0.007, + "step": 964 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.124539578145176e-06, + "loss": 0.0253, + "step": 966 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.001, + "step": 968 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.152001028576158e-06, + "loss": 0.0055, + "step": 970 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.165737180974676e-06, + "loss": 0.042, + "step": 972 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.179476915728217e-06, + "loss": 0.0382, + "step": 974 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.0005, + "step": 976 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.206967024957432e-06, + "loss": 0.0452, + "step": 978 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.220717345744326e-06, + "loss": 0.4027, + "step": 980 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.234471141508773e-06, + "loss": 0.2069, + "step": 982 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.0566, + "step": 984 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.261989050517841e-06, + "loss": 0.0132, + "step": 986 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.275753110019367e-06, + "loss": 0.02, + "step": 988 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.289520537012428e-06, + "loss": 0.1235, + "step": 990 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.1076, + "step": 992 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.317065385914285e-06, + "loss": 1.5592, + "step": 994 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.330842754027378e-06, + "loss": 0.0078, + "step": 996 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.344623382040752e-06, + "loss": 0.0047, + "step": 998 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.1183, + "step": 1000 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.372194310106515e-06, + "loss": 0.1148, + "step": 1002 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.385984556312285e-06, + "loss": 0.2857, + "step": 1004 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.399777954725183e-06, + "loss": 0.2654, + "step": 1006 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.0111, + "step": 1008 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.427374100411022e-06, + "loss": 0.021, + "step": 1010 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.441176793788106e-06, + "loss": 0.0047, + "step": 1012 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.454982531580687e-06, + "loss": 0.675, + "step": 1014 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.0455, + "step": 1016 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.482603032554812e-06, + "loss": 0.1333, + "step": 1018 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.496417741792922e-06, + "loss": 0.0228, + "step": 1020 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.510235387559738e-06, + "loss": 0.0322, + "step": 1022 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.0827, + "step": 1024 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.537879380729254e-06, + "loss": 0.1964, + "step": 1026 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.551705674142616e-06, + "loss": 0.0001, + "step": 1028 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.565534796106175e-06, + "loss": 0.0236, + "step": 1030 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.0117, + "step": 1032 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.593201417644091e-06, + "loss": 0.0163, + "step": 1034 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.607038863184952e-06, + "loss": 0.4011, + "step": 1036 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.620879029209093e-06, + "loss": 0.0001, + "step": 1038 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.4028, + "step": 1040 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.648567414581372e-06, + "loss": 0.6762, + "step": 1042 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.662415579853495e-06, + "loss": 0.0086, + "step": 1044 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.676266357456968e-06, + "loss": 0.0029, + "step": 1046 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.2297, + "step": 1048 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.703975641449426e-06, + "loss": 0.0227, + "step": 1050 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.717834093721598e-06, + "loss": 0.0663, + "step": 1052 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.731695050091561e-06, + "loss": 0.0054, + "step": 1054 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.3183, + "step": 1056 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.759424366837035e-06, + "loss": 0.0216, + "step": 1058 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.773292673056572e-06, + "loss": 0.1549, + "step": 1060 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.787163375062113e-06, + "loss": 0.265, + "step": 1062 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.0157, + "step": 1064 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.8149118580674e-06, + "loss": 0.0508, + "step": 1066 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.828789584873757e-06, + "loss": 0.0071, + "step": 1068 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.84266959907943e-06, + "loss": 0.0027, + "step": 1070 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.1521, + "step": 1072 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.870436381252412e-06, + "loss": 0.4029, + "step": 1074 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.884323094990613e-06, + "loss": 0.0343, + "step": 1076 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.89821198766998e-06, + "loss": 0.0801, + "step": 1078 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.0156, + "step": 1080 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.925996201346779e-06, + "loss": 0.0096, + "step": 1082 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.939891468081036e-06, + "loss": 0.0406, + "step": 1084 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.953788805230209e-06, + "loss": 0.1328, + "step": 1086 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.0001, + "step": 1088 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.981589582202184e-06, + "loss": 0.0609, + "step": 1090 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.995492967729449e-06, + "loss": 0.0708, + "step": 1092 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.009398315080712e-06, + "loss": 0.3147, + "step": 1094 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.0002, + "step": 1096 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.037214786621669e-06, + "loss": 0.0059, + "step": 1098 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.051125856485175e-06, + "loss": 0.0643, + "step": 1100 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.065038779520457e-06, + "loss": 0.1204, + "step": 1102 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.0049, + "step": 1104 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.092870076413771e-06, + "loss": 0.0201, + "step": 1106 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.106788395916682e-06, + "loss": 0.1813, + "step": 1108 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.120708459881203e-06, + "loss": 0.0022, + "step": 1110 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.0035, + "step": 1112 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.148553712446971e-06, + "loss": 0.0185, + "step": 1114 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.162478846665854e-06, + "loss": 0.0262, + "step": 1116 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.176405616581694e-06, + "loss": 0.0012, + "step": 1118 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.3057, + "step": 1120 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.20426395470397e-06, + "loss": 0.3783, + "step": 1122 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.218195468502469e-06, + "loss": 0.0032, + "step": 1124 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.232128509182136e-06, + "loss": 0.1114, + "step": 1126 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.027, + "step": 1128 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.259999062336021e-06, + "loss": 0.0039, + "step": 1130 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.273936520378426e-06, + "loss": 0.0097, + "step": 1132 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.287875396438536e-06, + "loss": 0.0009, + "step": 1134 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.4672, + "step": 1136 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.315757293717432e-06, + "loss": 0.0007, + "step": 1138 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.329700260482286e-06, + "loss": 0.0029, + "step": 1140 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.343644536357053e-06, + "loss": 0.0016, + "step": 1142 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.0015, + "step": 1144 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.37153690649993e-06, + "loss": 0.0352, + "step": 1146 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.38548494629364e-06, + "loss": 0.0001, + "step": 1148 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.39943418624856e-06, + "loss": 0.0047, + "step": 1150 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.0054, + "step": 1152 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.427336157667062e-06, + "loss": 0.0971, + "step": 1154 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.441288834637507e-06, + "loss": 0.0015, + "step": 1156 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.45524260278296e-06, + "loss": 0.1742, + "step": 1158 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.0024, + "step": 1160 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.483153303588777e-06, + "loss": 0.0304, + "step": 1162 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.497110181738935e-06, + "loss": 0.0711, + "step": 1164 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.511068042043785e-06, + "loss": 0.0151, + "step": 1166 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.0083, + "step": 1168 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.538986600075773e-06, + "loss": 0.0023, + "step": 1170 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.552947243277342e-06, + "loss": 0.0024, + "step": 1172 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.566908759582633e-06, + "loss": 0.0003, + "step": 1174 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.0158, + "step": 1176 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.594834302434123e-06, + "loss": 1.2354, + "step": 1178 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.608798274441153e-06, + "loss": 0.2439, + "step": 1180 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.622763010473628e-06, + "loss": 0.0076, + "step": 1182 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.0538, + "step": 1184 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.650694665519747e-06, + "loss": 0.1541, + "step": 1186 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.664661529982263e-06, + "loss": 0.0051, + "step": 1188 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.678629049368077e-06, + "loss": 0.0061, + "step": 1190 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.0028, + "step": 1192 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.706565943792879e-06, + "loss": 0.0011, + "step": 1194 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.720535264270526e-06, + "loss": 0.0016, + "step": 1196 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.734505130548855e-06, + "loss": 0.0053, + "step": 1198 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.2317, + "step": 1200 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.762446391372746e-06, + "loss": 0.3768, + "step": 1202 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.776417731348403e-06, + "loss": 0.0638, + "step": 1204 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.790389507985091e-06, + "loss": 0.0224, + "step": 1206 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.6163, + "step": 1208 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.81833426209198e-06, + "loss": 0.1619, + "step": 1210 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.832307184985473e-06, + "loss": 1.0514, + "step": 1212 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.846280435386668e-06, + "loss": 0.0469, + "step": 1214 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.0026, + "step": 1216 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.874227809551307e-06, + "loss": 0.0139, + "step": 1218 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.888201878732946e-06, + "loss": 0.0706, + "step": 1220 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.902176166258738e-06, + "loss": 0.0159, + "step": 1222 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.0508, + "step": 1224 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.930125287174061e-06, + "loss": 0.0043, + "step": 1226 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.944100065978354e-06, + "loss": 0.0157, + "step": 1228 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.958074953956413e-06, + "loss": 0.1768, + "step": 1230 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.1292, + "step": 1232 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.986024948260714e-06, + "loss": 0.4027, + "step": 1234 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.999999999999996e-06, + "loss": 0.1062, + "step": 1236 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0013975051739277e-05, + "loss": 0.2018, + "step": 1238 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.5756, + "step": 1240 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.004192504604358e-05, + "loss": 0.1901, + "step": 1242 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.1314, + "step": 1244 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.006987471282593e-05, + "loss": 0.1218, + "step": 1246 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.0185, + "step": 1248 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0097823833741255e-05, + "loss": 0.056, + "step": 1250 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0111798121267047e-05, + "loss": 0.0098, + "step": 1252 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0125772190448686e-05, + "loss": 0.296, + "step": 1254 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.3305, + "step": 1256 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.0153719564613327e-05, + "loss": 0.0053, + "step": 1258 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.016769281501452e-05, + "loss": 0.0002, + "step": 1260 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.018166573790801e-05, + "loss": 0.2638, + "step": 1262 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.0095, + "step": 1264 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.0209610492014904e-05, + "loss": 0.0036, + "step": 1266 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.022358226865159e-05, + "loss": 0.0507, + "step": 1268 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0237553608627247e-05, + "loss": 0.0157, + "step": 1270 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.0097, + "step": 1272 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0265494869451138e-05, + "loss": 0.4044, + "step": 1274 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.0255, + "step": 1276 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0293434056207114e-05, + "loss": 0.0413, + "step": 1278 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.0708, + "step": 1280 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.0321370950631918e-05, + "loss": 0.0082, + "step": 1282 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.033533847001773e-05, + "loss": 0.0979, + "step": 1284 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0349305334480246e-05, + "loss": 0.024, + "step": 1286 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.0049, + "step": 1288 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.0377236989526366e-05, + "loss": 0.4549, + "step": 1290 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.039120172555884e-05, + "loss": 0.1824, + "step": 1292 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0405165697565868e-05, + "loss": 0.0123, + "step": 1294 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.0001, + "step": 1296 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0433091240417362e-05, + "loss": 0.3147, + "step": 1298 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0447052756722651e-05, + "loss": 0.0024, + "step": 1300 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.046101339992422e-05, + "loss": 0.2615, + "step": 1302 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.0047, + "step": 1304 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0488931957956208e-05, + "loss": 0.0818, + "step": 1306 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.8834, + "step": 1308 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.0516846696411216e-05, + "loss": 0.0074, + "step": 1310 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.0186, + "step": 1312 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.054475739721703e-05, + "loss": 0.0147, + "step": 1314 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.0558711165362488e-05, + "loss": 0.0044, + "step": 1316 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0572663842332931e-05, + "loss": 0.0638, + "step": 1318 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.0455, + "step": 1320 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0600565813751433e-05, + "loss": 0.1191, + "step": 1322 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.0373, + "step": 1324 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0628463093500063e-05, + "loss": 0.0711, + "step": 1326 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.1191, + "step": 1328 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.065635546364294e-05, + "loss": 0.0202, + "step": 1330 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.0670299739517706e-05, + "loss": 0.119, + "step": 1332 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0684242706282562e-05, + "loss": 0.1446, + "step": 1334 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.0021, + "step": 1336 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0712124603561457e-05, + "loss": 0.0013, + "step": 1338 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.1297, + "step": 1340 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0740000937663972e-05, + "loss": 0.0415, + "step": 1342 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.1632, + "step": 1344 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0767871490817856e-05, + "loss": 0.0419, + "step": 1346 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0781804531497525e-05, + "loss": 0.0225, + "step": 1348 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0795736045296023e-05, + "loss": 0.0055, + "step": 1350 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.0011, + "step": 1352 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.08235943834183e-05, + "loss": 0.0305, + "step": 1354 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.083752115333414e-05, + "loss": 0.0226, + "step": 1356 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0851446287553022e-05, + "loss": 0.0009, + "step": 1358 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.0466, + "step": 1360 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.087929154011879e-05, + "loss": 0.0004, + "step": 1362 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.0893211604083311e-05, + "loss": 0.0507, + "step": 1364 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.090712992358622e-05, + "loss": 1.1479, + "step": 1366 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.007, + "step": 1368 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0934961220479537e-05, + "loss": 0.0464, + "step": 1370 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.4602, + "step": 1372 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0962785213378325e-05, + "loss": 0.006, + "step": 1374 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.0879, + "step": 1376 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.0990601684919282e-05, + "loss": 1.7215, + "step": 1378 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.1004507032270544e-05, + "loss": 0.0079, + "step": 1380 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1018410417797809e-05, + "loss": 0.4258, + "step": 1382 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.1218, + "step": 1384 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1046211194769784e-05, + "loss": 0.0016, + "step": 1386 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.0848, + "step": 1388 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1074003798653215e-05, + "loss": 0.0004, + "step": 1390 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.1128, + "step": 1392 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.1101788012330013e-05, + "loss": 0.0096, + "step": 1394 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.111567690500938e-05, + "loss": 0.0433, + "step": 1396 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1129563618747581e-05, + "loss": 0.019, + "step": 1398 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.0183, + "step": 1400 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1157330400920563e-05, + "loss": 0.0111, + "step": 1402 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.0527, + "step": 1404 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1185088141932594e-05, + "loss": 0.1079, + "step": 1406 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.0076, + "step": 1408 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.121283662493788e-05, + "loss": 0.0302, + "step": 1410 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.122670732694342e-05, + "loss": 0.079, + "step": 1412 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1240575633162958e-05, + "loss": 0.0028, + "step": 1414 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.1096, + "step": 1416 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1268304949908434e-05, + "loss": 0.0789, + "step": 1418 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.0037, + "step": 1420 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1296024358550565e-05, + "loss": 0.71, + "step": 1422 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.8524, + "step": 1424 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1323733642543024e-05, + "loss": 0.0028, + "step": 1426 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1337584420146496e-05, + "loss": 0.0129, + "step": 1428 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.135143258541862e-05, + "loss": 0.0019, + "step": 1430 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.1217, + "step": 1432 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.13791209707909e-05, + "loss": 0.0311, + "step": 1434 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.9133, + "step": 1436 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.1406798582355902e-05, + "loss": 0.0407, + "step": 1438 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.0598, + "step": 1440 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1434465203893818e-05, + "loss": 0.0158, + "step": 1442 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1448294325857377e-05, + "loss": 0.2428, + "step": 1444 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.146212061927074e-05, + "loss": 0.0863, + "step": 1446 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.0918, + "step": 1448 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.1489764612440255e-05, + "loss": 0.0146, + "step": 1450 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.150358225820707e-05, + "loss": 0.0125, + "step": 1452 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.151739696744518e-05, + "loss": 0.2841, + "step": 1454 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.384, + "step": 1456 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1545017468419307e-05, + "loss": 0.0113, + "step": 1458 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1558823206211887e-05, + "loss": 0.2674, + "step": 1460 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1572625899588972e-05, + "loss": 0.0314, + "step": 1462 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.2442, + "step": 1464 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1600222045274809e-05, + "loss": 0.0208, + "step": 1466 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.0482, + "step": 1468 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1627805689893478e-05, + "loss": 0.0378, + "step": 1470 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.0718, + "step": 1472 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1655376617959239e-05, + "loss": 0.3383, + "step": 1474 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1669157245972616e-05, + "loss": 0.0243, + "step": 1476 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1682934614085708e-05, + "loss": 0.0155, + "step": 1478 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.1174, + "step": 1480 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1710479462987565e-05, + "loss": 0.0099, + "step": 1482 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.1034, + "step": 1484 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1738010949482152e-05, + "loss": 0.0328, + "step": 1486 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.0187, + "step": 1488 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.176552885849122e-05, + "loss": 0.3593, + "step": 1490 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.1779282654255668e-05, + "loss": 0.0395, + "step": 1492 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.1793032975042563e-05, + "loss": 0.4046, + "step": 1494 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.069, + "step": 1496 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1820523084271775e-05, + "loss": 0.0002, + "step": 1498 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.0378, + "step": 1500 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1847998971423835e-05, + "loss": 0.0053, + "step": 1502 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.0302, + "step": 1504 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.1875460421854816e-05, + "loss": 0.0225, + "step": 1506 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.188918566625449e-05, + "loss": 0.7327, + "step": 1508 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1902907221033629e-05, + "loss": 0.0287, + "step": 1510 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.2359, + "step": 1512 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1930339154543582e-05, + "loss": 0.6283, + "step": 1514 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.0001, + "step": 1516 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1957756008084127e-05, + "loss": 0.0621, + "step": 1518 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.0677, + "step": 1520 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1985157567472563e-05, + "loss": 0.1494, + "step": 1522 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1998852544960256e-05, + "loss": 0.0369, + "step": 1524 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2012543618645622e-05, + "loss": 0.009, + "step": 1526 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.8969, + "step": 1528 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2039913947661205e-05, + "loss": 0.0192, + "step": 1530 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.0113, + "step": 1532 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2067268340700016e-05, + "loss": 0.0321, + "step": 1534 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.1448, + "step": 1536 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.2094606584067304e-05, + "loss": 0.0322, + "step": 1538 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.210826958287895e-05, + "loss": 0.2451, + "step": 1540 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.212192846419443e-05, + "loss": 0.2352, + "step": 1542 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.3716, + "step": 1544 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2149233767640587e-05, + "loss": 0.0358, + "step": 1546 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.0953, + "step": 1548 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2176522281094514e-05, + "loss": 0.0027, + "step": 1550 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.0706, + "step": 1552 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.220379379137607e-05, + "loss": 0.0801, + "step": 1554 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.2217423103744692e-05, + "loss": 0.0084, + "step": 1556 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2231048085437953e-05, + "loss": 0.02, + "step": 1558 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.122, + "step": 1560 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2258284950367347e-05, + "loss": 0.0077, + "step": 1562 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.3968, + "step": 1564 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.228550417338764e-05, + "loss": 0.0218, + "step": 1566 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.3134, + "step": 1568 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2312705541859985e-05, + "loss": 0.0632, + "step": 1570 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2326299464229143e-05, + "loss": 0.036, + "step": 1572 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2339888843285029e-05, + "loss": 0.3441, + "step": 1574 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.0453, + "step": 1576 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2367053865304597e-05, + "loss": 0.0018, + "step": 1578 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.0709, + "step": 1580 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2394200395703273e-05, + "loss": 0.2462, + "step": 1582 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.1447, + "step": 1584 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2421328222410109e-05, + "loss": 0.0208, + "step": 1586 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2434885055646808e-05, + "loss": 0.2909, + "step": 1588 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2448437133500262e-05, + "loss": 0.1078, + "step": 1590 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.0, + "step": 1592 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2475526917196703e-05, + "loss": 0.174, + "step": 1594 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.0124, + "step": 1596 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2502597361871787e-05, + "loss": 0.329, + "step": 1598 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.2101, + "step": 1600 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2529648256048931e-05, + "loss": 0.0204, + "step": 1602 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2543166305656089e-05, + "loss": 0.6452, + "step": 1604 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.2556679388404351e-05, + "loss": 0.0279, + "step": 1606 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.026, + "step": 1608 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.2583690547768584e-05, + "loss": 0.0113, + "step": 1610 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.259718857163117e-05, + "loss": 0.6554, + "step": 1612 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.261068152312821e-05, + "loss": 0.0125, + "step": 1614 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.0381, + "step": 1616 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2637652103627481e-05, + "loss": 0.1913, + "step": 1618 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2651129679955598e-05, + "loss": 0.1094, + "step": 1620 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2664602078570017e-05, + "loss": 0.024, + "step": 1622 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.1114, + "step": 1624 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2691531237420369e-05, + "loss": 0.1001, + "step": 1626 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.5097, + "step": 1628 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.27184393698057e-05, + "loss": 0.0397, + "step": 1630 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.0038, + "step": 1632 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2745326265517481e-05, + "loss": 0.1333, + "step": 1634 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2758761683975929e-05, + "loss": 0.089, + "step": 1636 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.277219171451304e-05, + "loss": 0.2364, + "step": 1638 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.0045, + "step": 1640 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2799035506917265e-05, + "loss": 0.008, + "step": 1642 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.133, + "step": 1644 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2825857433024208e-05, + "loss": 0.1808, + "step": 1646 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.0614, + "step": 1648 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2852657283298794e-05, + "loss": 0.001, + "step": 1650 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2866048864566336e-05, + "loss": 0.1106, + "step": 1652 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2879434848378356e-05, + "loss": 0.1039, + "step": 1654 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.0012, + "step": 1656 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2906189919074336e-05, + "loss": 0.0025, + "step": 1658 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.0237, + "step": 1660 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.293292228637389e-05, + "loss": 0.0507, + "step": 1662 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.3569, + "step": 1664 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2959631741441583e-05, + "loss": 0.1179, + "step": 1666 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2972977811676289e-05, + "loss": 0.002, + "step": 1668 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.298631807562092e-05, + "loss": 0.004, + "step": 1670 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.0764, + "step": 1672 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3012981080436036e-05, + "loss": 0.7035, + "step": 1674 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.0254, + "step": 1676 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3039620547593357e-05, + "loss": 0.1793, + "step": 1678 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.08, + "step": 1680 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3066236268983143e-05, + "loss": 0.0032, + "step": 1682 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3079535160031601e-05, + "loss": 0.0057, + "step": 1684 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3092828036681178e-05, + "loss": 0.0526, + "step": 1686 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.0027, + "step": 1688 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.3119395642950348e-05, + "loss": 0.0026, + "step": 1690 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.313267032068285e-05, + "loss": 0.0617, + "step": 1692 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3145938880242346e-05, + "loss": 0.0256, + "step": 1694 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.081, + "step": 1696 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3172457541199188e-05, + "loss": 0.0095, + "step": 1698 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3185707590804997e-05, + "loss": 0.0127, + "step": 1700 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.3198951418654882e-05, + "loss": 0.1822, + "step": 1702 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.0913, + "step": 1704 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.322542030563709e-05, + "loss": 0.376, + "step": 1706 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.674, + "step": 1708 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.3251863995368665e-05, + "loss": 0.2442, + "step": 1710 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.2295, + "step": 1712 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3278282281269293e-05, + "loss": 0.06, + "step": 1714 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3291481833280894e-05, + "loss": 0.003, + "step": 1716 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3304674956957167e-05, + "loss": 0.1585, + "step": 1718 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.2851, + "step": 1720 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3331041816250503e-05, + "loss": 0.0849, + "step": 1722 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.4279, + "step": 1724 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.335738265316921e-05, + "loss": 0.098, + "step": 1726 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.0016, + "step": 1728 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3383697261936472e-05, + "loss": 0.0007, + "step": 1730 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3396844666514062e-05, + "loss": 0.0321, + "step": 1732 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3409985436980422e-05, + "loss": 0.3558, + "step": 1734 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.0006, + "step": 1736 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.3436246972935638e-05, + "loss": 0.1764, + "step": 1738 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.344936768713513e-05, + "loss": 0.326, + "step": 1740 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.346248166464481e-05, + "loss": 0.0051, + "step": 1742 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.0812, + "step": 1744 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.348868930716039e-05, + "loss": 0.6387, + "step": 1746 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.3501782920982189e-05, + "loss": 0.0364, + "step": 1748 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3514869695746078e-05, + "loss": 0.054, + "step": 1750 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.0278, + "step": 1752 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3541022625878501e-05, + "loss": 0.0158, + "step": 1754 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.0547, + "step": 1756 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3567147893248833e-05, + "loss": 0.1633, + "step": 1758 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.0136, + "step": 1760 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3593245293764303e-05, + "loss": 0.0055, + "step": 1762 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3606283480231962e-05, + "loss": 0.3808, + "step": 1764 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.361931462354984e-05, + "loss": 0.3163, + "step": 1766 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.0847, + "step": 1768 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3645355678949715e-05, + "loss": 0.0497, + "step": 1770 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.1675, + "step": 1772 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.3671368256529026e-05, + "loss": 0.3889, + "step": 1774 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.1003, + "step": 1776 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3697352153075365e-05, + "loss": 0.0167, + "step": 1778 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3710333282518497e-05, + "loss": 0.0002, + "step": 1780 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3723307165600361e-05, + "loss": 0.0255, + "step": 1782 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.0046, + "step": 1784 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3749233091341344e-05, + "loss": 0.1334, + "step": 1786 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.0015, + "step": 1788 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3775129727762808e-05, + "loss": 0.2922, + "step": 1790 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.3249, + "step": 1792 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3800996872558075e-05, + "loss": 0.006, + "step": 1794 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3813919322438018e-05, + "loss": 0.1777, + "step": 1796 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3826834323650899e-05, + "loss": 0.6483, + "step": 1798 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.0801, + "step": 1800 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3852641879196952e-05, + "loss": 1.1504, + "step": 1802 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.0328, + "step": 1804 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.387841933758546e-05, + "loss": 0.014, + "step": 1806 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.3022, + "step": 1808 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.3904166497440812e-05, + "loss": 0.0043, + "step": 1810 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.391702865255334e-05, + "loss": 0.1017, + "step": 1812 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3929883157624046e-05, + "loss": 0.3802, + "step": 1814 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.0272, + "step": 1816 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3955569117234468e-05, + "loss": 0.338, + "step": 1818 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.4794, + "step": 1820 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3981224175611265e-05, + "loss": 0.159, + "step": 1822 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.0287, + "step": 1824 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4006848132334979e-05, + "loss": 0.1178, + "step": 1826 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4019648385012245e-05, + "loss": 0.046, + "step": 1828 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4032440787229135e-05, + "loss": 0.9434, + "step": 1830 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.0236, + "step": 1832 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4058001940361781e-05, + "loss": 0.2076, + "step": 1834 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.1673, + "step": 1836 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.40835313920471e-05, + "loss": 0.009, + "step": 1838 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.0247, + "step": 1840 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4109028942846888e-05, + "loss": 0.3863, + "step": 1842 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4121765693158355e-05, + "loss": 0.0835, + "step": 1844 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4134494393572146e-05, + "loss": 1.2366, + "step": 1846 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.382, + "step": 1848 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4159927545284697e-05, + "loss": 0.7613, + "step": 1850 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.2838, + "step": 1852 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4185328199298636e-05, + "loss": 0.0351, + "step": 1854 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.1954, + "step": 1856 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4210696157181936e-05, + "loss": 0.1594, + "step": 1858 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4223367813134406e-05, + "loss": 0.1246, + "step": 1860 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4236031220758037e-05, + "loss": 0.2904, + "step": 1862 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.0602, + "step": 1864 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.426133319210731e-05, + "loss": 0.0206, + "step": 1866 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.1116, + "step": 1868 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.4286601873568642e-05, + "loss": 0.398, + "step": 1870 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.039, + "step": 1872 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.431183706774103e-05, + "loss": 0.1361, + "step": 1874 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.4324442045477534e-05, + "loss": 0.0969, + "step": 1876 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4337038577485035e-05, + "loss": 0.1901, + "step": 1878 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.0163, + "step": 1880 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.436220620592437e-05, + "loss": 0.3444, + "step": 1882 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.0795, + "step": 1884 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4387339756447422e-05, + "loss": 0.0613, + "step": 1886 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.0055, + "step": 1888 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4412439032708848e-05, + "loss": 0.0933, + "step": 1890 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4424975756706684e-05, + "loss": 0.0308, + "step": 1892 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4437503838631002e-05, + "loss": 0.0713, + "step": 1894 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.6564, + "step": 1896 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4462533978405529e-05, + "loss": 0.0697, + "step": 1898 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.0828, + "step": 1900 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4487529256494937e-05, + "loss": 0.0324, + "step": 1902 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.1798, + "step": 1904 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4512489477634024e-05, + "loss": 0.1994, + "step": 1906 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4524956380901674e-05, + "loss": 0.1323, + "step": 1908 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.4537414446831461e-05, + "loss": 0.1734, + "step": 1910 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.1042, + "step": 1912 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4562303969371357e-05, + "loss": 0.0162, + "step": 1914 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.2452, + "step": 1916 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4587157850814679e-05, + "loss": 0.0244, + "step": 1918 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.0557, + "step": 1920 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4611975897000849e-05, + "loss": 0.1608, + "step": 1922 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4624371421273812e-05, + "loss": 0.0084, + "step": 1924 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.463675791404922e-05, + "loss": 0.0101, + "step": 1926 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.2455, + "step": 1928 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4661503708360652e-05, + "loss": 0.0636, + "step": 1930 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.1304, + "step": 1932 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4686213086618932e-05, + "loss": 0.0104, + "step": 1934 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.0265, + "step": 1936 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4710885855792338e-05, + "loss": 0.0287, + "step": 1938 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4723208451727977e-05, + "loss": 0.4397, + "step": 1940 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4735521823135184e-05, + "loss": 0.3584, + "step": 1942 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.0183, + "step": 1944 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4760120796189233e-05, + "loss": 0.0125, + "step": 1946 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.1148, + "step": 1948 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4784682582785254e-05, + "loss": 0.4781, + "step": 1950 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.0397, + "step": 1952 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4809206991044571e-05, + "loss": 0.1333, + "step": 1954 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4821455118415666e-05, + "loss": 0.3427, + "step": 1956 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4833693829380458e-05, + "loss": 0.6474, + "step": 1958 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.0392, + "step": 1960 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4858142906499686e-05, + "loss": 0.1987, + "step": 1962 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.0469, + "step": 1964 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4882554031404075e-05, + "loss": 0.0123, + "step": 1966 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.1139, + "step": 1968 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4906927013391879e-05, + "loss": 0.0309, + "step": 1970 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4919099141279205e-05, + "loss": 0.0996, + "step": 1972 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4931261662059338e-05, + "loss": 0.4467, + "step": 1974 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.0347, + "step": 1976 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4955557787302151e-05, + "loss": 0.3419, + "step": 1978 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.0121, + "step": 1980 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.4979815199317005e-05, + "loss": 0.151, + "step": 1982 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.7055, + "step": 1984 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5004033708602967e-05, + "loss": 0.1461, + "step": 1986 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5016128315586626e-05, + "loss": 0.3682, + "step": 1988 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5028213125963029e-05, + "loss": 0.0185, + "step": 1990 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.0257, + "step": 1992 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5052353262505603e-05, + "loss": 0.0424, + "step": 1994 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.1249, + "step": 1996 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5076453929645933e-05, + "loss": 0.1904, + "step": 1998 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.13, + "step": 2000 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.510051493910759e-05, + "loss": 0.0543, + "step": 2002 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.5112530513457229e-05, + "loss": 0.0179, + "step": 2004 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.512453610292401e-05, + "loss": 0.2063, + "step": 2006 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.229, + "step": 2008 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.514851723343985e-05, + "loss": 0.5445, + "step": 2010 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.0732, + "step": 2012 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5172458143312522e-05, + "loss": 0.1578, + "step": 2014 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.1167, + "step": 2016 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5196358645513685e-05, + "loss": 0.0296, + "step": 2018 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5208293685377354e-05, + "loss": 0.159, + "step": 2020 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5220218553330618e-05, + "loss": 0.1219, + "step": 2022 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.1297, + "step": 2024 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5244037680367744e-05, + "loss": 1.0696, + "step": 2026 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.2073, + "step": 2028 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.5267815840548057e-05, + "loss": 0.0482, + "step": 2030 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.1281, + "step": 2032 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.529155284811463e-05, + "loss": 0.0209, + "step": 2034 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.5303405861706574e-05, + "loss": 0.061, + "step": 2036 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.5315248517631975e-05, + "loss": 0.0069, + "step": 2038 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.2448, + "step": 2040 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5338902663987544e-05, + "loss": 0.0856, + "step": 2042 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.0286, + "step": 2044 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5362515102393217e-05, + "loss": 0.5656, + "step": 2046 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.0285, + "step": 2048 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.5386085648386656e-05, + "loss": 0.5049, + "step": 2050 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.539785515417376e-05, + "loss": 0.1583, + "step": 2052 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.540961411783279e-05, + "loss": 0.1726, + "step": 2054 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.282, + "step": 2056 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5433100326925288e-05, + "loss": 0.1953, + "step": 2058 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.1136, + "step": 2060 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.545654409218793e-05, + "loss": 0.2845, + "step": 2062 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.0307, + "step": 2064 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5479945230476066e-05, + "loss": 0.0196, + "step": 2066 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5491629757363026e-05, + "loss": 0.0371, + "step": 2068 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.550330355897809e-05, + "loss": 0.2373, + "step": 2070 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.0222, + "step": 2072 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5526618895216793e-05, + "loss": 0.1051, + "step": 2074 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.1114, + "step": 2076 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5549891057050837e-05, + "loss": 0.0282, + "step": 2078 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.0141, + "step": 2080 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.557311986267615e-05, + "loss": 0.0124, + "step": 2082 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.5584717950189353e-05, + "loss": 0.665, + "step": 2084 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5596305130627404e-05, + "loss": 0.0323, + "step": 2086 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.0234, + "step": 2088 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.5619446679779357e-05, + "loss": 0.4162, + "step": 2090 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.563100100329731e-05, + "loss": 0.0203, + "step": 2092 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.564254432934829e-05, + "loss": 0.02, + "step": 2094 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.0569, + "step": 2096 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.5665597898893484e-05, + "loss": 0.7525, + "step": 2098 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.567710809736356e-05, + "loss": 0.015, + "step": 2100 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.568860720831853e-05, + "loss": 0.4037, + "step": 2102 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.1641, + "step": 2104 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.5711572077872774e-05, + "loss": 0.0304, + "step": 2106 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.572303779162118e-05, + "loss": 0.0101, + "step": 2108 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.573449232815279e-05, + "loss": 0.0134, + "step": 2110 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.2475, + "step": 2112 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5757367780103666e-05, + "loss": 0.0065, + "step": 2114 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5768788650846677e-05, + "loss": 0.0154, + "step": 2116 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5780198255020478e-05, + "loss": 0.1334, + "step": 2118 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.4546, + "step": 2120 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.580298357454965e-05, + "loss": 0.3265, + "step": 2122 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.581435924540481e-05, + "loss": 0.0071, + "step": 2124 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5825723560690403e-05, + "loss": 0.3637, + "step": 2126 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.4373, + "step": 2128 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5848418035796068e-05, + "loss": 0.1638, + "step": 2130 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5859748151293333e-05, + "loss": 0.0089, + "step": 2132 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.587106682257552e-05, + "loss": 0.0026, + "step": 2134 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.0373, + "step": 2136 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5893669744094577e-05, + "loss": 0.2046, + "step": 2138 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.1908, + "step": 2140 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.591622662377734e-05, + "loss": 0.1365, + "step": 2142 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.0186, + "step": 2144 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.5938737285407567e-05, + "loss": 0.0866, + "step": 2146 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.594997522948412e-05, + "loss": 0.1113, + "step": 2148 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5961201553130148e-05, + "loss": 0.0457, + "step": 2150 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.0713, + "step": 2152 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.598361925145234e-05, + "loss": 0.0415, + "step": 2154 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.599481058234626e-05, + "loss": 0.1127, + "step": 2156 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.6005990205245216e-05, + "loss": 0.086, + "step": 2158 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.3256, + "step": 2160 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.602831423974506e-05, + "loss": 0.0196, + "step": 2162 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.6039458607746607e-05, + "loss": 0.0099, + "step": 2164 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.6050591180554648e-05, + "loss": 0.2868, + "step": 2166 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.0038, + "step": 2168 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6072820853644677e-05, + "loss": 0.0039, + "step": 2170 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.3634, + "step": 2172 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6095003085355082e-05, + "loss": 0.0057, + "step": 2174 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.3305, + "step": 2176 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.611713770239646e-05, + "loss": 0.01, + "step": 2178 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.6128187101364982e-05, + "loss": 0.0174, + "step": 2180 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6139224531851332e-05, + "loss": 0.0774, + "step": 2182 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.7148, + "step": 2184 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.616126340117555e-05, + "loss": 0.0735, + "step": 2186 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.617226479697104e-05, + "loss": 0.1394, + "step": 2188 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.618325413819966e-05, + "loss": 0.0026, + "step": 2190 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.0028, + "step": 2192 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.6205196571130194e-05, + "loss": 0.009, + "step": 2194 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.621614961997806e-05, + "loss": 0.0024, + "step": 2196 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6227090528551034e-05, + "loss": 0.0893, + "step": 2198 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.0225, + "step": 2200 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.62489358394248e-05, + "loss": 0.0511, + "step": 2202 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.4014, + "step": 2204 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6270732333094095e-05, + "loss": 0.0208, + "step": 2206 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.855, + "step": 2208 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6292479839282897e-05, + "loss": 0.3202, + "step": 2210 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6303335168965474e-05, + "loss": 0.027, + "step": 2212 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6314178188097907e-05, + "loss": 0.0908, + "step": 2214 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.7935, + "step": 2216 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6335827210029816e-05, + "loss": 0.0358, + "step": 2218 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.4517, + "step": 2220 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.635742673595467e-05, + "loss": 0.2085, + "step": 2222 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.2866, + "step": 2224 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6378976597135173e-05, + "loss": 0.0245, + "step": 2226 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6389732850821957e-05, + "loss": 0.07, + "step": 2228 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.640047662522205e-05, + "loss": 0.0939, + "step": 2230 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.057, + "step": 2232 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6421926652255282e-05, + "loss": 0.0133, + "step": 2234 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.0283, + "step": 2236 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6443326510665474e-05, + "loss": 0.0353, + "step": 2238 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.0102, + "step": 2240 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.646467603327518e-05, + "loss": 0.0271, + "step": 2242 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.6475331866519377e-05, + "loss": 0.0159, + "step": 2244 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6485975053300154e-05, + "loss": 0.003, + "step": 2246 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.7262, + "step": 2248 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.650722340435067e-05, + "loss": 0.1244, + "step": 2250 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.2164, + "step": 2252 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.652842092043287e-05, + "loss": 0.0116, + "step": 2254 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.602, + "step": 2256 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6549567435950004e-05, + "loss": 0.0196, + "step": 2258 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6560121516856586e-05, + "loss": 0.4808, + "step": 2260 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6570662785703713e-05, + "loss": 0.1026, + "step": 2262 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.0156, + "step": 2264 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6591706804895408e-05, + "loss": 0.0464, + "step": 2266 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.3453, + "step": 2268 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6612699329127457e-05, + "loss": 0.0475, + "step": 2270 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.0636, + "step": 2272 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6633640194404523e-05, + "loss": 0.2875, + "step": 2274 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6644091203796694e-05, + "loss": 0.1648, + "step": 2276 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6654529237134816e-05, + "loss": 0.0353, + "step": 2278 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.1624, + "step": 2280 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.667536629413143e-05, + "loss": 0.0195, + "step": 2282 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.1872, + "step": 2284 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6696151202613527e-05, + "loss": 0.2271, + "step": 2286 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.2824, + "step": 2288 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6716883800207685e-05, + "loss": 0.4806, + "step": 2290 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6727230431791806e-05, + "loss": 0.0637, + "step": 2292 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.673756392494915e-05, + "loss": 0.0146, + "step": 2294 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.0079, + "step": 2296 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6758191415283063e-05, + "loss": 0.0137, + "step": 2298 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.8585, + "step": 2300 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6778766110065755e-05, + "loss": 0.0481, + "step": 2302 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.1793, + "step": 2304 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6799287848566e-05, + "loss": 0.0462, + "step": 2306 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6809528809094798e-05, + "loss": 0.222, + "step": 2308 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6819756470466305e-05, + "loss": 0.0098, + "step": 2310 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.238, + "step": 2312 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.684017181586408e-05, + "loss": 0.0638, + "step": 2314 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.1134, + "step": 2316 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6860533725272943e-05, + "loss": 0.6497, + "step": 2318 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.0104, + "step": 2320 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.6880842039624e-05, + "loss": 0.4154, + "step": 2322 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.689097604905826e-05, + "loss": 0.1901, + "step": 2324 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6901096600267e-05, + "loss": 0.0914, + "step": 2326 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.2309, + "step": 2328 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6921297248971645e-05, + "loss": 0.3031, + "step": 2330 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.4301, + "step": 2332 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.6941443827928778e-05, + "loss": 0.0525, + "step": 2334 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.0117, + "step": 2336 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6961536179751672e-05, + "loss": 0.7849, + "step": 2338 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6971561971420222e-05, + "loss": 0.4011, + "step": 2340 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6981574147477204e-05, + "loss": 0.0145, + "step": 2342 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.059, + "step": 2344 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.70015575745671e-05, + "loss": 0.149, + "step": 2346 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.701152878657196e-05, + "loss": 0.1325, + "step": 2348 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7021486304909196e-05, + "loss": 0.1303, + "step": 2350 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.0644, + "step": 2352 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.7041360182818583e-05, + "loss": 0.1645, + "step": 2354 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.705127650357662e-05, + "loss": 0.015, + "step": 2356 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7061179053038887e-05, + "loss": 0.285, + "step": 2358 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.0134, + "step": 2360 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.708094276074343e-05, + "loss": 0.2169, + "step": 2362 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.3543, + "step": 2364 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7100651151536525e-05, + "loss": 0.022, + "step": 2366 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.0975, + "step": 2368 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.712030407145457e-05, + "loss": 0.0716, + "step": 2370 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.713010968184029e-05, + "loss": 0.0178, + "step": 2372 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7139901366967332e-05, + "loss": 0.1925, + "step": 2374 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.0857, + "step": 2376 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.71594428849791e-05, + "loss": 0.0038, + "step": 2378 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.716919267969883e-05, + "loss": 0.1088, + "step": 2380 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.717892847282994e-05, + "loss": 0.5358, + "step": 2382 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.0914, + "step": 2384 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7198357978296817e-05, + "loss": 0.0448, + "step": 2386 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7208051652686338e-05, + "loss": 0.1806, + "step": 2388 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.721773124959481e-05, + "loss": 0.2372, + "step": 2390 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.0127, + "step": 2392 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.723704813537834e-05, + "loss": 0.1994, + "step": 2394 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.1591, + "step": 2396 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.725630848474229e-05, + "loss": 0.0396, + "step": 2398 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.1951, + "step": 2400 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.727551214722321e-05, + "loss": 0.133, + "step": 2402 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.7285092673983753e-05, + "loss": 0.1767, + "step": 2404 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.7294658972800488e-05, + "loss": 0.0795, + "step": 2406 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.2287, + "step": 2408 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7313748811897558e-05, + "loss": 0.0007, + "step": 2410 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.0942, + "step": 2412 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.7332781515383003e-05, + "loss": 0.0268, + "step": 2414 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.276, + "step": 2416 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.7351756934571758e-05, + "loss": 0.9855, + "step": 2418 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.736122311621314e-05, + "loss": 0.0287, + "step": 2420 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.7370674921226296e-05, + "loss": 0.1416, + "step": 2422 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.056, + "step": 2424 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7389535327557733e-05, + "loss": 0.0553, + "step": 2426 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.2669, + "step": 2428 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7408338006227005e-05, + "loss": 0.016, + "step": 2430 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.02, + "step": 2432 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.7427082810346018e-05, + "loss": 0.0886, + "step": 2434 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.743643346367026e-05, + "loss": 0.1204, + "step": 2436 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.744576959347884e-05, + "loss": 0.133, + "step": 2438 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.0286, + "step": 2440 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.746439820964275e-05, + "loss": 0.0292, + "step": 2442 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.7473690659615992e-05, + "loss": 1.0368, + "step": 2444 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.748296851330945e-05, + "loss": 0.1221, + "step": 2446 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.0219, + "step": 2448 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7501480359406217e-05, + "loss": 0.0234, + "step": 2450 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7510714315655474e-05, + "loss": 0.0121, + "step": 2452 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.7519933603316955e-05, + "loss": 0.0088, + "step": 2454 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.0004, + "step": 2456 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7538328100883397e-05, + "loss": 0.2305, + "step": 2458 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.6014, + "step": 2460 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.7556663708406193e-05, + "loss": 0.0856, + "step": 2462 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.0128, + "step": 2464 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.7574940282646085e-05, + "loss": 0.0296, + "step": 2466 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.758405638764873e-05, + "loss": 0.0363, + "step": 2468 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7593157680824946e-05, + "loss": 0.1121, + "step": 2470 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.1606, + "step": 2472 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.761131576062694e-05, + "loss": 0.2643, + "step": 2474 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.0031, + "step": 2476 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7629414380199662e-05, + "loss": 0.0521, + "step": 2478 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.0305, + "step": 2480 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7647453398155194e-05, + "loss": 0.1942, + "step": 2482 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7656450512470077e-05, + "loss": 0.0309, + "step": 2484 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7665432673571218e-05, + "loss": 0.3348, + "step": 2486 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.0001, + "step": 2488 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.768335206599217e-05, + "loss": 0.042, + "step": 2490 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.7692289262315e-05, + "loss": 0.5685, + "step": 2492 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.7701211435430256e-05, + "loss": 0.7338, + "step": 2494 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.1241, + "step": 2496 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.771901064236659e-05, + "loss": 0.3829, + "step": 2498 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.7727887641425448e-05, + "loss": 0.2672, + "step": 2500 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.773674954775232e-05, + "loss": 0.1601, + "step": 2502 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.207, + "step": 2504 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7754428013009637e-05, + "loss": 0.0658, + "step": 2506 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.1175, + "step": 2508 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.77720459000329e-05, + "loss": 1.0147, + "step": 2510 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.2297, + "step": 2512 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7789603071189712e-05, + "loss": 0.4516, + "step": 2514 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7798358845437754e-05, + "loss": 0.1235, + "step": 2516 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.780709938932202e-05, + "loss": 0.0326, + "step": 2518 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.0185, + "step": 2520 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7824534717747115e-05, + "loss": 0.0823, + "step": 2522 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.1625, + "step": 2524 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7841908920258767e-05, + "loss": 1.5316, + "step": 2526 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.0284, + "step": 2528 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.7859221861128284e-05, + "loss": 0.0694, + "step": 2530 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.786785531616285e-05, + "loss": 0.1843, + "step": 2532 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7876473405105528e-05, + "loss": 0.1289, + "step": 2534 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.0074, + "step": 2536 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.7893663417419995e-05, + "loss": 0.6023, + "step": 2538 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.790223530721933e-05, + "loss": 0.4515, + "step": 2540 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791079176378191e-05, + "loss": 0.0013, + "step": 2542 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.0035, + "step": 2544 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7927858310383202e-05, + "loss": 0.1259, + "step": 2546 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7936368367090577e-05, + "loss": 0.1562, + "step": 2548 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.794486292389858e-05, + "loss": 0.2035, + "step": 2550 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.0035, + "step": 2552 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7961805471486618e-05, + "loss": 0.0289, + "step": 2554 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.0662, + "step": 2556 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.797868582079072e-05, + "loss": 0.2284, + "step": 2558 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.0877, + "step": 2560 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.7995503839940197e-05, + "loss": 0.0254, + "step": 2562 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.800388943463047e-05, + "loss": 0.1117, + "step": 2564 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8012259397551283e-05, + "loss": 0.0172, + "step": 2566 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.0625, + "step": 2568 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.8028952362728197e-05, + "loss": 0.0984, + "step": 2570 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.803727533238257e-05, + "loss": 0.0542, + "step": 2572 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.804558260506409e-05, + "loss": 0.0295, + "step": 2574 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.0542, + "step": 2576 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8062149994642135e-05, + "loss": 0.2656, + "step": 2578 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8070410079182195e-05, + "loss": 0.0682, + "step": 2580 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8078654402036526e-05, + "loss": 0.0354, + "step": 2582 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.0021, + "step": 2584 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8095095698313452e-05, + "loss": 0.0113, + "step": 2586 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.0015, + "step": 2588 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811147375503214e-05, + "loss": 0.0023, + "step": 2590 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.0136, + "step": 2592 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.812778844424587e-05, + "loss": 0.0158, + "step": 2594 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.813592198619035e-05, + "loss": 0.3767, + "step": 2596 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.814403963850293e-05, + "loss": 0.008, + "step": 2598 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.1032, + "step": 2600 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.8160227210847636e-05, + "loss": 0.0768, + "step": 2602 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.816829709926509e-05, + "loss": 0.0852, + "step": 2604 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8176351034821345e-05, + "loss": 0.0371, + "step": 2606 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.0081, + "step": 2608 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.819241098446341e-05, + "loss": 0.0154, + "step": 2610 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.8200416967183785e-05, + "loss": 0.8303, + "step": 2612 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.8208406934312167e-05, + "loss": 0.0053, + "step": 2614 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.2696, + "step": 2616 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8224338759405917e-05, + "loss": 1.1945, + "step": 2618 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.0303, + "step": 2620 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8240206335283947e-05, + "loss": 0.3651, + "step": 2622 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.1745, + "step": 2624 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.825600953798743e-05, + "loss": 0.0037, + "step": 2626 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.8263886960799055e-05, + "loss": 0.0585, + "step": 2628 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8271748244060426e-05, + "loss": 0.2044, + "step": 2630 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.0222, + "step": 2632 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.8287422330550878e-05, + "loss": 0.0334, + "step": 2634 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.829523510316813e-05, + "loss": 0.4768, + "step": 2636 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8303031675011515e-05, + "loss": 0.5087, + "step": 2638 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.0276, + "step": 2640 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.8318576155500838e-05, + "loss": 0.0739, + "step": 2642 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.832632403378808e-05, + "loss": 0.2453, + "step": 2644 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.8334055650584094e-05, + "loss": 0.2957, + "step": 2646 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 1.1322, + "step": 2648 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8349470039334173e-05, + "loss": 0.0669, + "step": 2650 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.0444, + "step": 2652 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.83648192013326e-05, + "loss": 0.009, + "step": 2654 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.1286, + "step": 2656 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8380103016670437e-05, + "loss": 0.4698, + "step": 2658 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8387720382009665e-05, + "loss": 0.0061, + "step": 2660 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.839532136594927e-05, + "loss": 0.3172, + "step": 2662 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.4881, + "step": 2664 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8410474130282085e-05, + "loss": 0.1238, + "step": 2666 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.0232, + "step": 2668 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8425561191294217e-05, + "loss": 0.0298, + "step": 2670 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.041, + "step": 2672 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.8440582431124325e-05, + "loss": 0.2252, + "step": 2674 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.844806833140501e-05, + "loss": 0.0052, + "step": 2676 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8455537732425223e-05, + "loss": 0.187, + "step": 2678 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.077, + "step": 2680 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.847042697836485e-05, + "loss": 0.064, + "step": 2682 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.1342, + "step": 2684 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.84852500526272e-05, + "loss": 0.0454, + "step": 2686 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.0796, + "step": 2688 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.8500006839413183e-05, + "loss": 0.017, + "step": 2690 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.85073603389569e-05, + "loss": 0.2529, + "step": 2692 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.851469722344155e-05, + "loss": 0.0471, + "step": 2694 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.0083, + "step": 2696 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8529321089949817e-05, + "loss": 0.1084, + "step": 2698 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.8031, + "step": 2700 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.8543878324695122e-05, + "loss": 0.3296, + "step": 2702 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.4151, + "step": 2704 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.8558368813955143e-05, + "loss": 0.4803, + "step": 2706 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.856558899363248e-05, + "loss": 0.4022, + "step": 2708 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.857279244452896e-05, + "loss": 0.1194, + "step": 2710 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.1475, + "step": 2712 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.8587149103738e-05, + "loss": 0.0167, + "step": 2714 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.85943022840117e-05, + "loss": 0.12, + "step": 2716 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8601438679426847e-05, + "loss": 0.0096, + "step": 2718 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.7054, + "step": 2720 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.8615661059964134e-05, + "loss": 0.1492, + "step": 2722 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.862274701730967e-05, + "loss": 0.0551, + "step": 2724 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.862981613424347e-05, + "loss": 0.15, + "step": 2726 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.0171, + "step": 2728 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.864390379168423e-05, + "loss": 0.0123, + "step": 2730 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.865092230467769e-05, + "loss": 0.0278, + "step": 2732 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.8657923922232464e-05, + "loss": 0.0356, + "step": 2734 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.0032, + "step": 2736 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8671876416361763e-05, + "loss": 0.2349, + "step": 2738 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.161, + "step": 2740 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8685761165074073e-05, + "loss": 0.3891, + "step": 2742 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.0099, + "step": 2744 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.869957805990059e-05, + "loss": 0.0716, + "step": 2746 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.87064610283551e-05, + "loss": 0.6634, + "step": 2748 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.87133269929026e-05, + "loss": 0.1744, + "step": 2750 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.0658, + "step": 2752 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.8727007856672285e-05, + "loss": 0.0021, + "step": 2754 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.873382272917545e-05, + "loss": 0.01, + "step": 2756 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8740620544333607e-05, + "loss": 0.3089, + "step": 2758 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.0774, + "step": 2760 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.875416494954312e-05, + "loss": 0.0993, + "step": 2762 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.876091151314196e-05, + "loss": 0.0775, + "step": 2764 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.8767640966490813e-05, + "loss": 0.0705, + "step": 2766 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.6201, + "step": 2768 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.878104848990093e-05, + "loss": 0.0131, + "step": 2770 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.8787726533776996e-05, + "loss": 0.1657, + "step": 2772 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.879438741503277e-05, + "loss": 0.0588, + "step": 2774 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.5328, + "step": 2776 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8807657637681563e-05, + "loss": 0.1229, + "step": 2778 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.0041, + "step": 2780 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8820859054179225e-05, + "loss": 0.043, + "step": 2782 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.041, + "step": 2784 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.883399156139519e-05, + "loss": 0.0802, + "step": 2786 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.8840531941941415e-05, + "loss": 0.033, + "step": 2788 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8847055056737233e-05, + "loss": 0.0894, + "step": 2790 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.2089, + "step": 2792 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8860049438152244e-05, + "loss": 0.0083, + "step": 2794 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.0368, + "step": 2796 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8872974604127025e-05, + "loss": 0.1915, + "step": 2798 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.0319, + "step": 2800 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.8885830453689132e-05, + "loss": 0.0045, + "step": 2802 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.889223235340958e-05, + "loss": 0.0009, + "step": 2804 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.889861688640759e-05, + "loss": 0.2281, + "step": 2806 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.4018, + "step": 2808 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.891133380239373e-05, + "loss": 0.0375, + "step": 2810 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.1588, + "step": 2812 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.892398110230194e-05, + "loss": 0.4025, + "step": 2814 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.0127, + "step": 2816 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.8936558687330485e-05, + "loss": 0.0379, + "step": 2818 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.894282130603823e-05, + "loss": 0.0099, + "step": 2820 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8949066459222217e-05, + "loss": 0.0191, + "step": 2822 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.183, + "step": 2824 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.8961504320265382e-05, + "loss": 0.0289, + "step": 2826 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.896769700383315e-05, + "loss": 0.037, + "step": 2828 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.897387217329439e-05, + "loss": 0.3413, + "step": 2830 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.2194, + "step": 2832 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.8986169921690543e-05, + "loss": 0.2104, + "step": 2834 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.899229247660769e-05, + "loss": 0.0153, + "step": 2836 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.899839746938281e-05, + "loss": 0.0063, + "step": 2838 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.8226, + "step": 2840 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.9010554720848577e-05, + "loss": 0.2321, + "step": 2842 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.901660695579585e-05, + "loss": 0.1893, + "step": 2844 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9022641581114392e-05, + "loss": 0.13, + "step": 2846 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.0017, + "step": 2848 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9034657955756695e-05, + "loss": 0.0121, + "step": 2850 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9040639681612212e-05, + "loss": 0.0172, + "step": 2852 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.904660375090257e-05, + "loss": 0.0371, + "step": 2854 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.04, + "step": 2856 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.905847887323049e-05, + "loss": 0.0279, + "step": 2858 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.7048, + "step": 2860 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9070283229971007e-05, + "loss": 0.0281, + "step": 2862 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.1451, + "step": 2864 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9082016728907496e-05, + "loss": 0.1812, + "step": 2866 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9087856878032886e-05, + "loss": 0.0162, + "step": 2868 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909367927837691e-05, + "loss": 0.0287, + "step": 2870 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.2915, + "step": 2872 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.910527078727044e-05, + "loss": 0.1596, + "step": 2874 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.911103987318148e-05, + "loss": 0.0746, + "step": 2876 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.911679116503425e-05, + "loss": 0.0227, + "step": 2878 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.024, + "step": 2880 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.9128240321670208e-05, + "loss": 0.1517, + "step": 2882 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.913393816409294e-05, + "loss": 0.0867, + "step": 2884 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.913961816773655e-05, + "loss": 0.0977, + "step": 2886 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.078, + "step": 2888 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9150924614348588e-05, + "loss": 0.0221, + "step": 2890 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.0232, + "step": 2892 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.916215957317944e-05, + "loss": 0.0087, + "step": 2894 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.0026, + "step": 2896 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9173322956460675e-05, + "loss": 0.0038, + "step": 2898 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9178877779995423e-05, + "loss": 0.0075, + "step": 2900 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9184414676983006e-05, + "loss": 0.0083, + "step": 2902 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.0088, + "step": 2904 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9195434648097003e-05, + "loss": 0.0053, + "step": 2906 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.3749, + "step": 2908 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9206382783713738e-05, + "loss": 0.0041, + "step": 2910 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 1.006, + "step": 2912 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.921725899830547e-05, + "loss": 0.282, + "step": 2914 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.9222670108643146e-05, + "loss": 0.7329, + "step": 2916 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.92280632069063e-05, + "loss": 0.5726, + "step": 2918 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.0054, + "step": 2920 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.0152, + "step": 2922 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.924413432409622e-05, + "loss": 0.016, + "step": 2924 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.924945526908497e-05, + "loss": 0.071, + "step": 2926 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.0073, + "step": 2928 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.9260042955546237e-05, + "loss": 0.0126, + "step": 2930 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.926530967634078e-05, + "loss": 0.191, + "step": 2932 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9270558301784795e-05, + "loss": 0.4624, + "step": 2934 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.011, + "step": 2936 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9281001225653887e-05, + "loss": 0.6002, + "step": 2938 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.9091, + "step": 2940 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9291371645572517e-05, + "loss": 0.3852, + "step": 2942 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.0481, + "step": 2944 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9301669480526115e-05, + "loss": 0.0323, + "step": 2946 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9306791153479004e-05, + "loss": 0.0799, + "step": 2948 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.931189465006714e-05, + "loss": 0.291, + "step": 2950 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.0387, + "step": 2952 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.9322047074315717e-05, + "loss": 0.043, + "step": 2954 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.932709598214825e-05, + "loss": 0.0539, + "step": 2956 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9332126673960262e-05, + "loss": 0.0134, + "step": 2958 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.0123, + "step": 2960 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.934213337025812e-05, + "loss": 0.0066, + "step": 2962 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.9347109355200672e-05, + "loss": 0.0668, + "step": 2964 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.9352067085036145e-05, + "loss": 0.0528, + "step": 2966 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.0368, + "step": 2968 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9361927740691327e-05, + "loss": 0.0704, + "step": 2970 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.2066, + "step": 2972 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.937171526019142e-05, + "loss": 0.1714, + "step": 2974 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.271, + "step": 2976 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9381429567075504e-05, + "loss": 0.855, + "step": 2978 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9386259242048883e-05, + "loss": 0.3778, + "step": 2980 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.93910705854546e-05, + "loss": 0.0572, + "step": 2982 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.2539, + "step": 2984 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.9400638240012294e-05, + "loss": 0.3678, + "step": 2986 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.940539453247842e-05, + "loss": 0.0024, + "step": 2988 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9410132456005262e-05, + "loss": 0.5758, + "step": 2990 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.0101, + "step": 2992 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9419553159263896e-05, + "loss": 0.0215, + "step": 2994 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9424235920596863e-05, + "loss": 0.0641, + "step": 2996 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.94289002761929e-05, + "loss": 0.0931, + "step": 2998 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.2934, + "step": 3000 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.943817373377181e-05, + "loss": 0.0107, + "step": 3002 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.944278281764342e-05, + "loss": 0.747, + "step": 3004 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.944737345955561e-05, + "loss": 0.2638, + "step": 3006 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.0703, + "step": 3008 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.945649938167528e-05, + "loss": 0.2821, + "step": 3010 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.082, + "step": 3012 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.946555142883836e-05, + "loss": 0.0637, + "step": 3014 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.0145, + "step": 3016 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9474529530329507e-05, + "loss": 0.0379, + "step": 3018 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.4971, + "step": 3020 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.9483433616011047e-05, + "loss": 0.0077, + "step": 3022 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.122, + "step": 3024 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9492263616323533e-05, + "loss": 0.0685, + "step": 3026 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9496650812887286e-05, + "loss": 0.0079, + "step": 3028 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9501019462286263e-05, + "loss": 0.107, + "step": 3030 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.0711, + "step": 3032 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.9509701085497842e-05, + "loss": 0.0729, + "step": 3034 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.951401404235505e-05, + "loss": 0.0918, + "step": 3036 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.9518308418136718e-05, + "loss": 0.2476, + "step": 3038 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.1612, + "step": 3040 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.952684139296169e-05, + "loss": 0.0385, + "step": 3042 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.9531079975339912e-05, + "loss": 0.0995, + "step": 3044 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.9535299943312455e-05, + "loss": 0.4713, + "step": 3046 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.6503, + "step": 3048 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9543684003110105e-05, + "loss": 0.0304, + "step": 3050 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.0078, + "step": 3052 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9551993506857688e-05, + "loss": 0.2751, + "step": 3054 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.0956, + "step": 3056 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.9560228389640664e-05, + "loss": 0.0287, + "step": 3058 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.956431782804402e-05, + "loss": 0.1193, + "step": 3060 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.956838858712744e-05, + "loss": 0.0186, + "step": 3062 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.0283, + "step": 3064 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9576474035569892e-05, + "loss": 0.0099, + "step": 3066 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.0166, + "step": 3068 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9584484671803818e-05, + "loss": 0.3175, + "step": 3070 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.0513, + "step": 3072 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9592420433249462e-05, + "loss": 0.0038, + "step": 3074 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9596360216530436e-05, + "loss": 0.4266, + "step": 3076 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9600281257912e-05, + "loss": 0.0064, + "step": 3078 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.0861, + "step": 3080 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.960806708438202e-05, + "loss": 0.1805, + "step": 3082 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.961193185426459e-05, + "loss": 0.8899, + "step": 3084 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9615777851836003e-05, + "loss": 0.0271, + "step": 3086 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.453, + "step": 3088 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.962341350003679e-05, + "loss": 0.0084, + "step": 3090 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.9627203135753573e-05, + "loss": 1.3815, + "step": 3092 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9630973969334068e-05, + "loss": 0.5617, + "step": 3094 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.0022, + "step": 3096 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9638459200664822e-05, + "loss": 0.5165, + "step": 3098 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.01, + "step": 3100 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.9645869135553806e-05, + "loss": 0.0466, + "step": 3102 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.0054, + "step": 3104 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.965320371611399e-05, + "loss": 0.0045, + "step": 3106 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.1016, + "step": 3108 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.9660462885047032e-05, + "loss": 0.0887, + "step": 3110 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.0589, + "step": 3112 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.9667646585643703e-05, + "loss": 0.7948, + "step": 3114 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.967121011775546e-05, + "loss": 0.0584, + "step": 3116 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967475476178433e-05, + "loss": 0.2049, + "step": 3118 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.0487, + "step": 3120 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9681787357939254e-05, + "loss": 0.0081, + "step": 3122 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9685275296330497e-05, + "loss": 0.1137, + "step": 3124 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.968874431916926e-05, + "loss": 0.353, + "step": 3126 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.0147, + "step": 3128 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969562559112598e-05, + "loss": 0.041, + "step": 3130 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969903782680467e-05, + "loss": 0.0981, + "step": 3132 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.970243112005235e-05, + "loss": 0.0737, + "step": 3134 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.0311, + "step": 3136 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.970916085278302e-05, + "loss": 0.2111, + "step": 3138 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.1726, + "step": 3140 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.9715814736744755e-05, + "loss": 0.0226, + "step": 3142 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.0579, + "step": 3144 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9722392719956864e-05, + "loss": 0.2623, + "step": 3146 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.285, + "step": 3148 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9728894751031595e-05, + "loss": 0.025, + "step": 3150 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.3016, + "step": 3152 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9735320779174545e-05, + "loss": 0.2965, + "step": 3154 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9738505276435692e-05, + "loss": 0.0027, + "step": 3156 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.974167075418505e-05, + "loss": 0.498, + "step": 3158 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.0222, + "step": 3160 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9747944626456577e-05, + "loss": 0.023, + "step": 3162 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.0554, + "step": 3164 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.975414234697712e-05, + "loss": 0.3539, + "step": 3166 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.0299, + "step": 3168 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9760263867329568e-05, + "loss": 0.152, + "step": 3170 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9763296037475174e-05, + "loss": 0.5325, + "step": 3172 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.97663091396921e-05, + "loss": 0.0711, + "step": 3174 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.1907, + "step": 3176 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9772278116838543e-05, + "loss": 0.1301, + "step": 3178 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.2146, + "step": 3180 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.977817075213876e-05, + "loss": 0.0059, + "step": 3182 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.0545, + "step": 3184 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9783986999558994e-05, + "loss": 0.1227, + "step": 3186 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9786866463591732e-05, + "loss": 0.09, + "step": 3188 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9789726813662233e-05, + "loss": 0.0849, + "step": 3190 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.0077, + "step": 3192 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.979539014960858e-05, + "loss": 0.0007, + "step": 3194 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.4283, + "step": 3196 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9800976963155584e-05, + "loss": 0.0089, + "step": 3198 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.0636, + "step": 3200 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.980648721065859e-05, + "loss": 0.6478, + "step": 3202 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.9809213608668185e-05, + "loss": 0.0164, + "step": 3204 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9811920849071092e-05, + "loss": 0.0198, + "step": 3206 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.0143, + "step": 3208 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9817277835945057e-05, + "loss": 0.176, + "step": 3210 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.1418, + "step": 3212 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9822558129431263e-05, + "loss": 0.0247, + "step": 3214 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.0744, + "step": 3216 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.9827761688279606e-05, + "loss": 0.0111, + "step": 3218 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.983033467948784e-05, + "loss": 0.124, + "step": 3220 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.983288847183947e-05, + "loss": 0.0159, + "step": 3222 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.017, + "step": 3224 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9837938440059992e-05, + "loss": 0.696, + "step": 3226 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.0008, + "step": 3228 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9842911553490392e-05, + "loss": 0.8495, + "step": 3230 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.0913, + "step": 3232 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.984780777328031e-05, + "loss": 0.4274, + "step": 3234 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.0729, + "step": 3236 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985262706118007e-05, + "loss": 0.5578, + "step": 3238 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.1194, + "step": 3240 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.9857369379540982e-05, + "loss": 0.2974, + "step": 3242 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.985971166354357e-05, + "loss": 0.2707, + "step": 3244 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.986203469131567e-05, + "loss": 0.0881, + "step": 3246 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.0624, + "step": 3248 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986662296005834e-05, + "loss": 0.0781, + "step": 3250 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986888819206792e-05, + "loss": 0.7957, + "step": 3252 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.987113414992505e-05, + "loss": 0.1136, + "step": 3254 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.0822, + "step": 3256 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9875568225674e-05, + "loss": 0.1393, + "step": 3258 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.0616, + "step": 3260 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9879925152665845e-05, + "loss": 0.2217, + "step": 3262 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.4713, + "step": 3264 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.9884204896863895e-05, + "loss": 0.2647, + "step": 3266 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.988631581494365e-05, + "loss": 0.0853, + "step": 3268 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.9888407424834433e-05, + "loss": 0.134, + "step": 3270 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.6033, + "step": 3272 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.989253270374697e-05, + "loss": 0.0012, + "step": 3274 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0911, + "step": 3276 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.9896580701374482e-05, + "loss": 0.2662, + "step": 3278 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.0069, + "step": 3280 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.9900551386093677e-05, + "loss": 0.2149, + "step": 3282 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.990250772639552e-05, + "loss": 0.1645, + "step": 3284 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9904444726885236e-05, + "loss": 0.2298, + "step": 3286 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.6133, + "step": 3288 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.990826069333406e-05, + "loss": 0.1084, + "step": 3290 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.99101396518405e-05, + "loss": 0.3521, + "step": 3292 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.99119992556295e-05, + "loss": 0.0314, + "step": 3294 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.2525, + "step": 3296 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.99156603845656e-05, + "loss": 0.2435, + "step": 3298 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.1624, + "step": 3300 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9919244051541315e-05, + "loss": 0.0461, + "step": 3302 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.2821, + "step": 3304 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9922750228560746e-05, + "loss": 0.0831, + "step": 3306 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.0512, + "step": 3308 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9926178888233344e-05, + "loss": 0.1599, + "step": 3310 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.0228, + "step": 3312 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9929530003774133e-05, + "loss": 0.651, + "step": 3314 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.024, + "step": 3316 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.993280354900393e-05, + "loss": 0.0712, + "step": 3318 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.1585, + "step": 3320 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9935999498349518e-05, + "loss": 0.0323, + "step": 3322 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.0438, + "step": 3324 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9939117826843883e-05, + "loss": 0.2793, + "step": 3326 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.0484, + "step": 3328 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9942158510126384e-05, + "loss": 0.0095, + "step": 3330 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9943649727366335e-05, + "loss": 0.0006, + "step": 3332 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.9945121524442944e-05, + "loss": 0.2856, + "step": 3334 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.2441, + "step": 3336 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.9948006846646262e-05, + "loss": 0.0011, + "step": 3338 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.994942036613787e-05, + "loss": 0.3692, + "step": 3340 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9950814454195953e-05, + "loss": 0.0364, + "step": 3342 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.0177, + "step": 3344 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.9953544325158755e-05, + "loss": 0.0339, + "step": 3346 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.995488010273198e-05, + "loss": 0.357, + "step": 3348 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9956196438208693e-05, + "loss": 0.4034, + "step": 3350 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.0012, + "step": 3352 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9958770772627236e-05, + "loss": 0.7196, + "step": 3354 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.0016, + "step": 3356 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.9961267308303473e-05, + "loss": 0.0476, + "step": 3358 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.0137, + "step": 3360 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9963686025734262e-05, + "loss": 0.0712, + "step": 3362 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9964866196679105e-05, + "loss": 0.0611, + "step": 3364 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9966026906024377e-05, + "loss": 0.121, + "step": 3366 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.2023, + "step": 3368 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9968289930886675e-05, + "loss": 0.1194, + "step": 3370 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.211, + "step": 3372 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997047508264221e-05, + "loss": 0.041, + "step": 3374 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.5754, + "step": 3376 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.99725823442204e-05, + "loss": 0.1386, + "step": 3378 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.6039, + "step": 3380 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.9974611699159142e-05, + "loss": 0.0799, + "step": 3382 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0324, + "step": 3384 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9976563131604945e-05, + "loss": 0.1016, + "step": 3386 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.1315, + "step": 3388 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9978436626313065e-05, + "loss": 0.1319, + "step": 3390 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.0379, + "step": 3392 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.99802321686476e-05, + "loss": 0.1742, + "step": 3394 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.0183, + "step": 3396 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.9981949744581622e-05, + "loss": 0.1333, + "step": 3398 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.4586, + "step": 3400 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.9983589340697288e-05, + "loss": 0.0989, + "step": 3402 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.998437989229673e-05, + "loss": 0.0247, + "step": 3404 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.998515094418594e-05, + "loss": 0.0246, + "step": 3406 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.016, + "step": 3408 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.99866345428482e-05, + "loss": 0.0073, + "step": 3410 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.998734708672375e-05, + "loss": 0.0428, + "step": 3412 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.998804012509407e-05, + "loss": 0.0134, + "step": 3414 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0158, + "step": 3416 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9989367679943025e-05, + "loss": 0.0187, + "step": 3418 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.071, + "step": 3420 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9990617197024103e-05, + "loss": 0.0827, + "step": 3422 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.0888, + "step": 3424 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.999178866657597e-05, + "loss": 0.0974, + "step": 3426 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.9992345130644747e-05, + "loss": 0.0226, + "step": 3428 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999288207944701e-05, + "loss": 1.0742, + "step": 3430 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.0425, + "step": 3432 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.999389742709538e-05, + "loss": 0.0032, + "step": 3434 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.0406, + "step": 3436 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9994834701589113e-05, + "loss": 0.1616, + "step": 3438 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.129, + "step": 3440 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999569389560614e-05, + "loss": 0.0004, + "step": 3442 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999609421031453e-05, + "loss": 0.0617, + "step": 3444 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.9996475002434365e-05, + "loss": 0.0975, + "step": 3446 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.0126, + "step": 3448 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.999717801597172e-05, + "loss": 0.7388, + "step": 3450 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.0205, + "step": 3452 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9997802930726195e-05, + "loss": 0.7978, + "step": 3454 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.0108, + "step": 3456 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998349741815916e-05, + "loss": 0.0343, + "step": 3458 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.088, + "step": 3460 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.999881844496914e-05, + "loss": 0.0514, + "step": 3462 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.0795, + "step": 3464 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999209036524326e-05, + "loss": 0.1069, + "step": 3466 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.0334, + "step": 3468 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999952151343014e-05, + "loss": 0.0335, + "step": 3470 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.0203, + "step": 3472 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999755873245484e-05, + "loss": 0.0747, + "step": 3474 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.0606, + "step": 3476 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999991211413952e-05, + "loss": 0.0444, + "step": 3478 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.3763, + "step": 3480 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 1.9999990234891677e-05, + "loss": 0.0199, + "step": 3482 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 2e-05, + "loss": 0.0067, + "step": 3484 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999999023489168e-05, + "loss": 0.0017, + "step": 3486 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.064, + "step": 3488 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.999991211413952e-05, + "loss": 0.0495, + "step": 3490 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.1926, + "step": 3492 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.9999755873245484e-05, + "loss": 0.073, + "step": 3494 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.2266, + "step": 3496 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.999952151343014e-05, + "loss": 0.0027, + "step": 3498 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.1246, + "step": 3500 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.9999209036524326e-05, + "loss": 0.024, + "step": 3502 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.0106, + "step": 3504 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.999881844496914e-05, + "loss": 0.0264, + "step": 3506 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.0803, + "step": 3508 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998349741815916e-05, + "loss": 0.016, + "step": 3510 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.0256, + "step": 3512 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997802930726195e-05, + "loss": 0.0268, + "step": 3514 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.6471, + "step": 3516 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999717801597172e-05, + "loss": 0.0331, + "step": 3518 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.1949, + "step": 3520 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.9996475002434365e-05, + "loss": 0.0649, + "step": 3522 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.999609421031453e-05, + "loss": 0.1302, + "step": 3524 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999569389560614e-05, + "loss": 0.0398, + "step": 3526 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.0271, + "step": 3528 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994834701589113e-05, + "loss": 0.0007, + "step": 3530 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.011, + "step": 3532 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.9993897427095378e-05, + "loss": 0.0028, + "step": 3534 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 1.4264, + "step": 3536 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999288207944701e-05, + "loss": 0.0055, + "step": 3538 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999234513064475e-05, + "loss": 0.2434, + "step": 3540 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.999178866657597e-05, + "loss": 0.0111, + "step": 3542 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.0421, + "step": 3544 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990617197024103e-05, + "loss": 0.1452, + "step": 3546 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.0265, + "step": 3548 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.998936767994303e-05, + "loss": 0.452, + "step": 3550 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0135, + "step": 3552 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998804012509407e-05, + "loss": 0.0364, + "step": 3554 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998734708672375e-05, + "loss": 0.0142, + "step": 3556 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.99866345428482e-05, + "loss": 0.03, + "step": 3558 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.0912, + "step": 3560 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.998515094418594e-05, + "loss": 0.3941, + "step": 3562 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.0278, + "step": 3564 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9983589340697288e-05, + "loss": 1.3174, + "step": 3566 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.0103, + "step": 3568 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981949744581622e-05, + "loss": 0.0181, + "step": 3570 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.0114, + "step": 3572 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.99802321686476e-05, + "loss": 0.2928, + "step": 3574 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.3292, + "step": 3576 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9978436626313068e-05, + "loss": 0.3294, + "step": 3578 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.0036, + "step": 3580 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997656313160495e-05, + "loss": 0.141, + "step": 3582 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0365, + "step": 3584 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9974611699159142e-05, + "loss": 0.0256, + "step": 3586 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.5297, + "step": 3588 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.99725823442204e-05, + "loss": 0.0255, + "step": 3590 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.0923, + "step": 3592 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9970475082642212e-05, + "loss": 0.1213, + "step": 3594 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.2126, + "step": 3596 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9968289930886675e-05, + "loss": 0.4013, + "step": 3598 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.0238, + "step": 3600 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.9966026906024377e-05, + "loss": 0.2295, + "step": 3602 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.996486619667911e-05, + "loss": 0.5845, + "step": 3604 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9963686025734262e-05, + "loss": 0.0981, + "step": 3606 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.2808, + "step": 3608 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9961267308303473e-05, + "loss": 0.2619, + "step": 3610 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.088, + "step": 3612 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9958770772627236e-05, + "loss": 0.0797, + "step": 3614 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.3006, + "step": 3616 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.9956196438208693e-05, + "loss": 0.0977, + "step": 3618 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.995488010273198e-05, + "loss": 0.0033, + "step": 3620 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9953544325158755e-05, + "loss": 0.2783, + "step": 3622 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.0614, + "step": 3624 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9950814454195953e-05, + "loss": 0.0494, + "step": 3626 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.0233, + "step": 3628 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.9948006846646262e-05, + "loss": 0.108, + "step": 3630 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.0575, + "step": 3632 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.9945121524442947e-05, + "loss": 0.0198, + "step": 3634 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.994364972736634e-05, + "loss": 0.7067, + "step": 3636 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9942158510126384e-05, + "loss": 0.0772, + "step": 3638 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.0078, + "step": 3640 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.9939117826843887e-05, + "loss": 0.082, + "step": 3642 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.993756836673986e-05, + "loss": 0.1418, + "step": 3644 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.9935999498349525e-05, + "loss": 0.0029, + "step": 3646 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.6537, + "step": 3648 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9932803549003932e-05, + "loss": 0.0256, + "step": 3650 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.0115, + "step": 3652 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9929530003774136e-05, + "loss": 0.0227, + "step": 3654 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.0302, + "step": 3656 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9926178888233344e-05, + "loss": 0.3431, + "step": 3658 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.359, + "step": 3660 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9922750228560746e-05, + "loss": 0.003, + "step": 3662 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.296, + "step": 3664 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9919244051541315e-05, + "loss": 0.0363, + "step": 3666 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.2255, + "step": 3668 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9915660384565603e-05, + "loss": 0.3326, + "step": 3670 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.0584, + "step": 3672 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9911999255629504e-05, + "loss": 0.1587, + "step": 3674 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.0184, + "step": 3676 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.990826069333406e-05, + "loss": 0.0451, + "step": 3678 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.1308, + "step": 3680 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9904444726885236e-05, + "loss": 0.0514, + "step": 3682 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9902507726395524e-05, + "loss": 0.1201, + "step": 3684 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.9900551386093677e-05, + "loss": 0.0547, + "step": 3686 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.1376, + "step": 3688 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9896580701374482e-05, + "loss": 0.02, + "step": 3690 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.0044, + "step": 3692 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9892532703746977e-05, + "loss": 0.1619, + "step": 3694 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.1794, + "step": 3696 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.9888407424834437e-05, + "loss": 0.8193, + "step": 3698 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.988631581494365e-05, + "loss": 0.0038, + "step": 3700 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9884204896863895e-05, + "loss": 0.0375, + "step": 3702 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.046, + "step": 3704 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.9879925152665845e-05, + "loss": 0.4429, + "step": 3706 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.987775633490599e-05, + "loss": 0.0111, + "step": 3708 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.9875568225674005e-05, + "loss": 0.0486, + "step": 3710 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.0105, + "step": 3712 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.987113414992505e-05, + "loss": 0.009, + "step": 3714 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.986888819206792e-05, + "loss": 0.0064, + "step": 3716 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986662296005834e-05, + "loss": 0.5356, + "step": 3718 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.013, + "step": 3720 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9862034691315678e-05, + "loss": 0.0066, + "step": 3722 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.0013, + "step": 3724 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9857369379540985e-05, + "loss": 0.0483, + "step": 3726 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.002, + "step": 3728 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.985262706118007e-05, + "loss": 0.1197, + "step": 3730 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.0395, + "step": 3732 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9847807773280314e-05, + "loss": 0.0164, + "step": 3734 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.0667, + "step": 3736 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9842911553490396e-05, + "loss": 0.0812, + "step": 3738 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.0015, + "step": 3740 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.983793844005999e-05, + "loss": 0.0902, + "step": 3742 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.0793, + "step": 3744 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.9832888471839475e-05, + "loss": 0.0423, + "step": 3746 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.983033467948784e-05, + "loss": 0.0928, + "step": 3748 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9827761688279613e-05, + "loss": 0.0407, + "step": 3750 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.0255, + "step": 3752 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9822558129431263e-05, + "loss": 0.0532, + "step": 3754 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.0099, + "step": 3756 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.981727783594506e-05, + "loss": 0.0375, + "step": 3758 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.0027, + "step": 3760 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.9811920849071092e-05, + "loss": 0.0141, + "step": 3762 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.980921360866819e-05, + "loss": 0.0408, + "step": 3764 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.980648721065859e-05, + "loss": 0.0614, + "step": 3766 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.0522, + "step": 3768 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.980097696315558e-05, + "loss": 0.8226, + "step": 3770 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.0034, + "step": 3772 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979539014960858e-05, + "loss": 0.0378, + "step": 3774 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.0038, + "step": 3776 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.9789726813662233e-05, + "loss": 0.0201, + "step": 3778 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.978686646359173e-05, + "loss": 0.0462, + "step": 3780 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9783986999558994e-05, + "loss": 0.089, + "step": 3782 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 1.1025, + "step": 3784 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9778170752138763e-05, + "loss": 0.0273, + "step": 3786 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.0299, + "step": 3788 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.9772278116838546e-05, + "loss": 0.0053, + "step": 3790 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.0467, + "step": 3792 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.97663091396921e-05, + "loss": 0.3065, + "step": 3794 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.9763296037475177e-05, + "loss": 0.0302, + "step": 3796 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.976026386732957e-05, + "loss": 0.5116, + "step": 3798 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.044, + "step": 3800 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9754142346977122e-05, + "loss": 0.0023, + "step": 3802 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.0136, + "step": 3804 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9747944626456577e-05, + "loss": 0.0191, + "step": 3806 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.0047, + "step": 3808 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9741670754185054e-05, + "loss": 1.19, + "step": 3810 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9738505276435695e-05, + "loss": 0.0193, + "step": 3812 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9735320779174548e-05, + "loss": 0.08, + "step": 3814 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.3559, + "step": 3816 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9728894751031595e-05, + "loss": 0.05, + "step": 3818 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.2171, + "step": 3820 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.972239271995686e-05, + "loss": 0.0985, + "step": 3822 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.0556, + "step": 3824 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9715814736744758e-05, + "loss": 0.4366, + "step": 3826 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.5142, + "step": 3828 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.9709160852783022e-05, + "loss": 0.0031, + "step": 3830 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.0812, + "step": 3832 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.9702431120052352e-05, + "loss": 0.0012, + "step": 3834 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.969903782680467e-05, + "loss": 0.0458, + "step": 3836 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9695625591125984e-05, + "loss": 0.1476, + "step": 3838 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.0744, + "step": 3840 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.968874431916926e-05, + "loss": 0.0793, + "step": 3842 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.96852752963305e-05, + "loss": 0.003, + "step": 3844 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9681787357939257e-05, + "loss": 0.0201, + "step": 3846 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.0467, + "step": 3848 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9674754761784334e-05, + "loss": 0.0116, + "step": 3850 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.2011, + "step": 3852 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.9667646585643706e-05, + "loss": 0.495, + "step": 3854 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.6857, + "step": 3856 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.966046288504704e-05, + "loss": 0.3206, + "step": 3858 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.0226, + "step": 3860 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.965320371611399e-05, + "loss": 0.0588, + "step": 3862 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.0725, + "step": 3864 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.964586913555381e-05, + "loss": 0.0961, + "step": 3866 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.0093, + "step": 3868 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9638459200664822e-05, + "loss": 0.0441, + "step": 3870 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.0545, + "step": 3872 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9630973969334068e-05, + "loss": 0.3289, + "step": 3874 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9627203135753576e-05, + "loss": 0.0544, + "step": 3876 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9623413500036795e-05, + "loss": 0.4045, + "step": 3878 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.1174, + "step": 3880 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.9615777851836007e-05, + "loss": 0.2256, + "step": 3882 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.961193185426459e-05, + "loss": 0.1943, + "step": 3884 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9608067084382025e-05, + "loss": 0.1133, + "step": 3886 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.6401, + "step": 3888 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.9600281257912002e-05, + "loss": 0.0102, + "step": 3890 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.959636021653044e-05, + "loss": 0.1598, + "step": 3892 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.9592420433249465e-05, + "loss": 0.0388, + "step": 3894 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.0071, + "step": 3896 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958448467180382e-05, + "loss": 0.0228, + "step": 3898 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958048870913786e-05, + "loss": 0.907, + "step": 3900 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9576474035569895e-05, + "loss": 0.4162, + "step": 3902 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.0094, + "step": 3904 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9568388587127448e-05, + "loss": 0.1328, + "step": 3906 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9564317828044022e-05, + "loss": 0.0156, + "step": 3908 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.9560228389640668e-05, + "loss": 0.1191, + "step": 3910 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.8233, + "step": 3912 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.955199350685769e-05, + "loss": 0.0377, + "step": 3914 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.3297, + "step": 3916 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.954368400311011e-05, + "loss": 0.0139, + "step": 3918 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.0004, + "step": 3920 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9535299943312455e-05, + "loss": 0.1538, + "step": 3922 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9531079975339915e-05, + "loss": 0.5767, + "step": 3924 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9526841392961694e-05, + "loss": 0.4006, + "step": 3926 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.0412, + "step": 3928 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9518308418136728e-05, + "loss": 0.0729, + "step": 3930 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.1101, + "step": 3932 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9509701085497852e-05, + "loss": 0.0084, + "step": 3934 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.3932, + "step": 3936 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9501019462286266e-05, + "loss": 0.1323, + "step": 3938 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.0714, + "step": 3940 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9492263616323536e-05, + "loss": 0.0741, + "step": 3942 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 1.1999, + "step": 3944 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.948343361601105e-05, + "loss": 0.0226, + "step": 3946 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.947899082950751e-05, + "loss": 0.0049, + "step": 3948 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947452953032951e-05, + "loss": 0.1352, + "step": 3950 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.053, + "step": 3952 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9465551428838363e-05, + "loss": 0.0293, + "step": 3954 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.0567, + "step": 3956 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.9456499381675285e-05, + "loss": 0.078, + "step": 3958 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.0113, + "step": 3960 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9447373459555617e-05, + "loss": 0.0588, + "step": 3962 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.0202, + "step": 3964 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9438173733771814e-05, + "loss": 0.0325, + "step": 3966 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.4341, + "step": 3968 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.9428900276192903e-05, + "loss": 0.0511, + "step": 3970 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.942423592059687e-05, + "loss": 0.0575, + "step": 3972 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.94195531592639e-05, + "loss": 0.0405, + "step": 3974 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.0602, + "step": 3976 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9410132456005262e-05, + "loss": 0.4265, + "step": 3978 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.0111, + "step": 3980 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.94006382400123e-05, + "loss": 0.0204, + "step": 3982 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.0502, + "step": 3984 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.939107058545461e-05, + "loss": 0.6034, + "step": 3986 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.938625924204888e-05, + "loss": 0.0163, + "step": 3988 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.9381429567075507e-05, + "loss": 0.5857, + "step": 3990 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.1292, + "step": 3992 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9371715260191425e-05, + "loss": 0.0339, + "step": 3994 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.02, + "step": 3996 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.936192774069133e-05, + "loss": 0.005, + "step": 3998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.0514, + "step": 4000 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9352067085036145e-05, + "loss": 0.0056, + "step": 4002 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9347109355200676e-05, + "loss": 0.0779, + "step": 4004 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.9342133370258124e-05, + "loss": 0.009, + "step": 4006 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.1081, + "step": 4008 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9332126673960276e-05, + "loss": 0.0164, + "step": 4010 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.0816, + "step": 4012 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.932204707431572e-05, + "loss": 0.0004, + "step": 4014 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.0319, + "step": 4016 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9311894650067146e-05, + "loss": 0.0976, + "step": 4018 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9306791153479017e-05, + "loss": 0.0045, + "step": 4020 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9301669480526118e-05, + "loss": 0.0304, + "step": 4022 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.0241, + "step": 4024 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.929137164557252e-05, + "loss": 0.0169, + "step": 4026 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.928619550368371e-05, + "loss": 0.4489, + "step": 4028 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9281001225653883e-05, + "loss": 0.4399, + "step": 4030 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.0562, + "step": 4032 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9270558301784808e-05, + "loss": 0.496, + "step": 4034 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9265309676340783e-05, + "loss": 0.0058, + "step": 4036 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9260042955546247e-05, + "loss": 0.0575, + "step": 4038 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.2253, + "step": 4040 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9249455269084972e-05, + "loss": 0.0287, + "step": 4042 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.0181, + "step": 4044 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.1058, + "step": 4046 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.0144, + "step": 4048 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9228063206906302e-05, + "loss": 0.0141, + "step": 4050 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9222670108643156e-05, + "loss": 0.0062, + "step": 4052 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9217258998305464e-05, + "loss": 0.0046, + "step": 4054 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.0682, + "step": 4056 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9206382783713735e-05, + "loss": 0.1747, + "step": 4058 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.3619, + "step": 4060 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.9195434648097013e-05, + "loss": 0.2962, + "step": 4062 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.1878, + "step": 4064 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9184414676983013e-05, + "loss": 0.0142, + "step": 4066 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9178877779995416e-05, + "loss": 0.0337, + "step": 4068 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.9173322956460678e-05, + "loss": 0.1193, + "step": 4070 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.1193, + "step": 4072 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9162159573179446e-05, + "loss": 0.0004, + "step": 4074 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.0406, + "step": 4076 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.915092461434859e-05, + "loss": 0.0058, + "step": 4078 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.1995, + "step": 4080 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9139618167736547e-05, + "loss": 0.103, + "step": 4082 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9133938164092942e-05, + "loss": 0.0731, + "step": 4084 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.912824032167022e-05, + "loss": 0.4261, + "step": 4086 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.0008, + "step": 4088 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.911679116503426e-05, + "loss": 0.0974, + "step": 4090 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.2107, + "step": 4092 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9105270787270446e-05, + "loss": 0.0032, + "step": 4094 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.0267, + "step": 4096 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.9093679278376913e-05, + "loss": 0.0172, + "step": 4098 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.90878568780329e-05, + "loss": 0.284, + "step": 4100 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.90820167289075e-05, + "loss": 0.0089, + "step": 4102 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.0094, + "step": 4104 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9070283229971003e-05, + "loss": 0.1062, + "step": 4106 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.1342, + "step": 4108 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.9058478873230487e-05, + "loss": 0.0792, + "step": 4110 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.064, + "step": 4112 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9046603750902585e-05, + "loss": 0.6229, + "step": 4114 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9040639681612216e-05, + "loss": 0.0139, + "step": 4116 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.9034657955756702e-05, + "loss": 0.0099, + "step": 4118 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.0462, + "step": 4120 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9022641581114396e-05, + "loss": 0.0364, + "step": 4122 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.0359, + "step": 4124 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.901055472084858e-05, + "loss": 0.0055, + "step": 4126 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.0301, + "step": 4128 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8998397469382812e-05, + "loss": 0.0504, + "step": 4130 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8992292476607695e-05, + "loss": 0.124, + "step": 4132 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.898616992169054e-05, + "loss": 0.0087, + "step": 4134 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.0058, + "step": 4136 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.89738721732944e-05, + "loss": 0.0006, + "step": 4138 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.0389, + "step": 4140 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8961504320265392e-05, + "loss": 0.005, + "step": 4142 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 1.4727, + "step": 4144 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8949066459222224e-05, + "loss": 0.0052, + "step": 4146 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8942821306038227e-05, + "loss": 0.0153, + "step": 4148 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.8936558687330492e-05, + "loss": 0.0205, + "step": 4150 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.1392, + "step": 4152 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.8923981102301944e-05, + "loss": 0.0462, + "step": 4154 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.891766616054545e-05, + "loss": 0.1698, + "step": 4156 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8911333802393725e-05, + "loss": 0.0635, + "step": 4158 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.0978, + "step": 4160 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8898616886407588e-05, + "loss": 0.0318, + "step": 4162 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8892232353409582e-05, + "loss": 0.0249, + "step": 4164 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8885830453689146e-05, + "loss": 0.1745, + "step": 4166 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.011, + "step": 4168 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8872974604127038e-05, + "loss": 0.0179, + "step": 4170 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.0688, + "step": 4172 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.8860049438152247e-05, + "loss": 0.0002, + "step": 4174 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.2442, + "step": 4176 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.8847055056737236e-05, + "loss": 0.005, + "step": 4178 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.884053194194143e-05, + "loss": 0.0001, + "step": 4180 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.8833991561395194e-05, + "loss": 0.0088, + "step": 4182 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.171, + "step": 4184 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.8820859054179225e-05, + "loss": 1.3908, + "step": 4186 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.881426695315756e-05, + "loss": 0.8332, + "step": 4188 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8807657637681577e-05, + "loss": 0.0101, + "step": 4190 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.0876, + "step": 4192 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8794387415032783e-05, + "loss": 0.0458, + "step": 4194 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8787726533777003e-05, + "loss": 0.041, + "step": 4196 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.8781048489900936e-05, + "loss": 0.1129, + "step": 4198 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 1.159, + "step": 4200 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.876764096649082e-05, + "loss": 0.0362, + "step": 4202 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.1786, + "step": 4204 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8754164949543123e-05, + "loss": 1.2519, + "step": 4206 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.1029, + "step": 4208 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8740620544333604e-05, + "loss": 0.1514, + "step": 4210 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8733822729175455e-05, + "loss": 0.043, + "step": 4212 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.872700785667228e-05, + "loss": 0.2023, + "step": 4214 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.0281, + "step": 4216 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8713326992902612e-05, + "loss": 0.0266, + "step": 4218 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.272, + "step": 4220 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8699578059900604e-05, + "loss": 0.1336, + "step": 4222 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.0293, + "step": 4224 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.868576116507408e-05, + "loss": 0.0607, + "step": 4226 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.0313, + "step": 4228 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8671876416361767e-05, + "loss": 0.0413, + "step": 4230 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.5999, + "step": 4232 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.8657923922232467e-05, + "loss": 0.2102, + "step": 4234 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.86509223046777e-05, + "loss": 0.0128, + "step": 4236 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8643903791684228e-05, + "loss": 0.0258, + "step": 4238 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.624, + "step": 4240 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8629816134243466e-05, + "loss": 0.138, + "step": 4242 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8622747017309676e-05, + "loss": 0.7055, + "step": 4244 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8615661059964148e-05, + "loss": 0.1903, + "step": 4246 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.0412, + "step": 4248 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.860143867942685e-05, + "loss": 0.057, + "step": 4250 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.0583, + "step": 4252 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8587149103738006e-05, + "loss": 0.1445, + "step": 4254 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.2106, + "step": 4256 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8572792444528963e-05, + "loss": 0.0133, + "step": 4258 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8565588993632498e-05, + "loss": 0.073, + "step": 4260 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8558368813955136e-05, + "loss": 0.0716, + "step": 4262 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.0109, + "step": 4264 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.854387832469512e-05, + "loss": 0.3256, + "step": 4266 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.2274, + "step": 4268 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.8529321089949833e-05, + "loss": 0.0542, + "step": 4270 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.0933, + "step": 4272 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8514697223441565e-05, + "loss": 0.0857, + "step": 4274 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8507360338956896e-05, + "loss": 0.0269, + "step": 4276 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.850000683941319e-05, + "loss": 0.5278, + "step": 4278 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.0669, + "step": 4280 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.8485250052627205e-05, + "loss": 0.0227, + "step": 4282 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.847784679420527e-05, + "loss": 0.4488, + "step": 4284 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.8470426978364857e-05, + "loss": 0.0222, + "step": 4286 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.1453, + "step": 4288 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.845553773242522e-05, + "loss": 0.0228, + "step": 4290 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.8448068331405018e-05, + "loss": 0.0546, + "step": 4292 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8440582431124322e-05, + "loss": 0.375, + "step": 4294 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.0364, + "step": 4296 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.842556119129423e-05, + "loss": 0.0201, + "step": 4298 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.841802588108161e-05, + "loss": 0.177, + "step": 4300 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.841047413028209e-05, + "loss": 0.0409, + "step": 4302 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.0856, + "step": 4304 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.8395321365949273e-05, + "loss": 0.1299, + "step": 4306 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.838772038200968e-05, + "loss": 0.262, + "step": 4308 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.838010301667044e-05, + "loss": 0.0226, + "step": 4310 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.0459, + "step": 4312 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8364819201332596e-05, + "loss": 0.0162, + "step": 4314 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.5129, + "step": 4316 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.834947003933417e-05, + "loss": 0.0115, + "step": 4318 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.032, + "step": 4320 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8334055650584107e-05, + "loss": 0.0105, + "step": 4322 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8326324033788087e-05, + "loss": 0.0711, + "step": 4324 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.8318576155500855e-05, + "loss": 0.8233, + "step": 4326 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.0293, + "step": 4328 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.830303167501152e-05, + "loss": 0.1085, + "step": 4330 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.0672, + "step": 4332 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8287422330550885e-05, + "loss": 0.0076, + "step": 4334 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.1378, + "step": 4336 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.827174824406043e-05, + "loss": 0.0077, + "step": 4338 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.8263886960799072e-05, + "loss": 0.0113, + "step": 4340 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8256009537987424e-05, + "loss": 0.0158, + "step": 4342 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.0264, + "step": 4344 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8240206335283943e-05, + "loss": 0.0024, + "step": 4346 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.0255, + "step": 4348 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8224338759405934e-05, + "loss": 0.0242, + "step": 4350 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.053, + "step": 4352 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820840693431217e-05, + "loss": 0.009, + "step": 4354 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820041696718378e-05, + "loss": 0.1606, + "step": 4356 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8192410984463416e-05, + "loss": 0.0078, + "step": 4358 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.1673, + "step": 4360 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8176351034821352e-05, + "loss": 0.0249, + "step": 4362 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.0125, + "step": 4364 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.8160227210847642e-05, + "loss": 0.6164, + "step": 4366 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.2749, + "step": 4368 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8144039638502927e-05, + "loss": 0.0153, + "step": 4370 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8135921986190358e-05, + "loss": 0.6254, + "step": 4372 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8127788444245884e-05, + "loss": 0.0137, + "step": 4374 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.791, + "step": 4376 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8111473755032152e-05, + "loss": 0.0118, + "step": 4378 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.18, + "step": 4380 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.8095095698313456e-05, + "loss": 0.0101, + "step": 4382 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.042, + "step": 4384 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807865440203653e-05, + "loss": 0.2267, + "step": 4386 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807041007918221e-05, + "loss": 0.5602, + "step": 4388 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.806214999464214e-05, + "loss": 0.0074, + "step": 4390 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.0994, + "step": 4392 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8045582605064087e-05, + "loss": 0.2399, + "step": 4394 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.3572, + "step": 4396 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802895236272819e-05, + "loss": 0.2666, + "step": 4398 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.3552, + "step": 4400 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.80122593975513e-05, + "loss": 0.044, + "step": 4402 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.8003889434630476e-05, + "loss": 0.0903, + "step": 4404 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7995503839940204e-05, + "loss": 0.0964, + "step": 4406 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.0123, + "step": 4408 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7978685820790725e-05, + "loss": 0.1195, + "step": 4410 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.014, + "step": 4412 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.796180547148662e-05, + "loss": 0.1408, + "step": 4414 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.0225, + "step": 4416 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7944862923898586e-05, + "loss": 0.1834, + "step": 4418 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7936368367090583e-05, + "loss": 0.5575, + "step": 4420 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7927858310383196e-05, + "loss": 0.1227, + "step": 4422 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.1641, + "step": 4424 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7910791763781928e-05, + "loss": 0.0946, + "step": 4426 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.05, + "step": 4428 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.789366341742001e-05, + "loss": 0.0605, + "step": 4430 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.0413, + "step": 4432 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7876473405105535e-05, + "loss": 0.0594, + "step": 4434 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7867855316162846e-05, + "loss": 0.0796, + "step": 4436 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.785922186112829e-05, + "loss": 0.0061, + "step": 4438 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.0128, + "step": 4440 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.7841908920258774e-05, + "loss": 0.1015, + "step": 4442 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.783322946823638e-05, + "loss": 0.2583, + "step": 4444 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.782453471774711e-05, + "loss": 0.8147, + "step": 4446 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.8269, + "step": 4448 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.7807099389322013e-05, + "loss": 0.0473, + "step": 4450 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.779835884543776e-05, + "loss": 0.0294, + "step": 4452 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7789603071189733e-05, + "loss": 0.135, + "step": 4454 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.002, + "step": 4456 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.7772045900032912e-05, + "loss": 0.0052, + "step": 4458 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.776324453741365e-05, + "loss": 0.2706, + "step": 4460 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.7754428013009644e-05, + "loss": 0.2102, + "step": 4462 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.602, + "step": 4464 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7736749547752327e-05, + "loss": 0.0216, + "step": 4466 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7727887641425465e-05, + "loss": 0.6029, + "step": 4468 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7719010642366597e-05, + "loss": 0.0458, + "step": 4470 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.3132, + "step": 4472 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.770121143543025e-05, + "loss": 0.0946, + "step": 4474 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.2963, + "step": 4476 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7683352065992174e-05, + "loss": 0.1634, + "step": 4478 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.0138, + "step": 4480 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.7665432673571238e-05, + "loss": 0.1079, + "step": 4482 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.765645051247007e-05, + "loss": 0.2162, + "step": 4484 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7647453398155204e-05, + "loss": 0.0719, + "step": 4486 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.036, + "step": 4488 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.7629414380199672e-05, + "loss": 0.2451, + "step": 4490 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.762037251178961e-05, + "loss": 0.0772, + "step": 4492 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7611315760626943e-05, + "loss": 0.0441, + "step": 4494 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.0409, + "step": 4496 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7593157680824943e-05, + "loss": 0.0565, + "step": 4498 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7584056387648738e-05, + "loss": 0.0094, + "step": 4500 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.757494028264608e-05, + "loss": 0.0841, + "step": 4502 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.0904, + "step": 4504 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7556663708406203e-05, + "loss": 0.0635, + "step": 4506 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.0018, + "step": 4508 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7538328100883404e-05, + "loss": 0.0075, + "step": 4510 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.5073, + "step": 4512 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7519933603316962e-05, + "loss": 0.1192, + "step": 4514 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7510714315655467e-05, + "loss": 0.0521, + "step": 4516 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.750148035940622e-05, + "loss": 0.0775, + "step": 4518 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.2796, + "step": 4520 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7482968513309458e-05, + "loss": 0.0709, + "step": 4522 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7473690659616e-05, + "loss": 0.0054, + "step": 4524 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7464398209642744e-05, + "loss": 0.2577, + "step": 4526 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.1193, + "step": 4528 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.7445769593478842e-05, + "loss": 0.0419, + "step": 4530 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.743643346367027e-05, + "loss": 0.0178, + "step": 4532 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.7427082810346024e-05, + "loss": 0.1569, + "step": 4534 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.0034, + "step": 4536 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.740833800622701e-05, + "loss": 0.0444, + "step": 4538 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.739894389204122e-05, + "loss": 0.1744, + "step": 4540 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.738953532755774e-05, + "loss": 0.2178, + "step": 4542 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.201, + "step": 4544 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7370674921226306e-05, + "loss": 0.0068, + "step": 4546 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7361223116213146e-05, + "loss": 0.0051, + "step": 4548 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7351756934571764e-05, + "loss": 0.0059, + "step": 4550 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.0079, + "step": 4552 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.7332781515382996e-05, + "loss": 1.5021, + "step": 4554 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.732327231489503e-05, + "loss": 0.6782, + "step": 4556 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7313748811897564e-05, + "loss": 0.0343, + "step": 4558 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.1984, + "step": 4560 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.7294658972800495e-05, + "loss": 0.0487, + "step": 4562 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.728509267398376e-05, + "loss": 0.0635, + "step": 4564 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.727551214722322e-05, + "loss": 0.5774, + "step": 4566 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.0164, + "step": 4568 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.72563084847423e-05, + "loss": 0.0068, + "step": 4570 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.0708, + "step": 4572 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.723704813537835e-05, + "loss": 0.3606, + "step": 4574 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.0364, + "step": 4576 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7217731249594817e-05, + "loss": 0.4258, + "step": 4578 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7208051652686348e-05, + "loss": 0.4878, + "step": 4580 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.7198357978296827e-05, + "loss": 0.0141, + "step": 4582 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.031, + "step": 4584 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.717892847282995e-05, + "loss": 0.1539, + "step": 4586 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.716919267969884e-05, + "loss": 0.1482, + "step": 4588 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.715944288497911e-05, + "loss": 0.0196, + "step": 4590 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.1897, + "step": 4592 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.713990136696734e-05, + "loss": 0.0511, + "step": 4594 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.7130109681840298e-05, + "loss": 0.7649, + "step": 4596 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.7120304071454578e-05, + "loss": 0.0045, + "step": 4598 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.0049, + "step": 4600 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7100651151536532e-05, + "loss": 0.6462, + "step": 4602 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.0741, + "step": 4604 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.708094276074344e-05, + "loss": 0.012, + "step": 4606 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.5408, + "step": 4608 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.7061179053038894e-05, + "loss": 0.0201, + "step": 4610 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.705127650357663e-05, + "loss": 0.0403, + "step": 4612 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.704136018281859e-05, + "loss": 0.034, + "step": 4614 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.0217, + "step": 4616 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.7021486304909202e-05, + "loss": 0.4436, + "step": 4618 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.701152878657197e-05, + "loss": 0.0038, + "step": 4620 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.700155757456711e-05, + "loss": 0.2702, + "step": 4622 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.4658, + "step": 4624 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.6981574147477214e-05, + "loss": 0.0201, + "step": 4626 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.697156197142023e-05, + "loss": 0.2149, + "step": 4628 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.696153617975168e-05, + "loss": 0.0615, + "step": 4630 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.0693, + "step": 4632 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.694144382792878e-05, + "loss": 0.0667, + "step": 4634 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.3292, + "step": 4636 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6921297248971652e-05, + "loss": 0.2767, + "step": 4638 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.0091, + "step": 4640 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.690109660026701e-05, + "loss": 0.5111, + "step": 4642 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.6890976049058267e-05, + "loss": 0.0762, + "step": 4644 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.688084203962401e-05, + "loss": 0.003, + "step": 4646 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.014, + "step": 4648 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6860533725272953e-05, + "loss": 0.058, + "step": 4650 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.019, + "step": 4652 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.6840171815864085e-05, + "loss": 0.2688, + "step": 4654 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.1099, + "step": 4656 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.681975647046631e-05, + "loss": 0.1455, + "step": 4658 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.6809528809094805e-05, + "loss": 0.0089, + "step": 4660 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6799287848566024e-05, + "loss": 0.2025, + "step": 4662 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.077, + "step": 4664 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.6778766110065765e-05, + "loss": 0.0513, + "step": 4666 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.67684853721737e-05, + "loss": 0.0679, + "step": 4668 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6758191415283066e-05, + "loss": 0.3074, + "step": 4670 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.0022, + "step": 4672 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.673756392494916e-05, + "loss": 0.0161, + "step": 4674 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.6727230431791826e-05, + "loss": 0.4532, + "step": 4676 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.671688380020769e-05, + "loss": 0.4219, + "step": 4678 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.0553, + "step": 4680 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6696151202613537e-05, + "loss": 0.0008, + "step": 4682 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.0504, + "step": 4684 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6675366294131432e-05, + "loss": 0.033, + "step": 4686 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.1768, + "step": 4688 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.6654529237134833e-05, + "loss": 0.0003, + "step": 4690 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.66440912037967e-05, + "loss": 0.0312, + "step": 4692 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.663364019440453e-05, + "loss": 0.06, + "step": 4694 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.0337, + "step": 4696 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6612699329127467e-05, + "loss": 0.0322, + "step": 4698 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.3268, + "step": 4700 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6591706804895415e-05, + "loss": 0.0634, + "step": 4702 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.3512, + "step": 4704 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6570662785703716e-05, + "loss": 0.0303, + "step": 4706 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6560121516856592e-05, + "loss": 0.089, + "step": 4708 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.654956743595001e-05, + "loss": 0.5975, + "step": 4710 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.7379, + "step": 4712 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.6528420920432893e-05, + "loss": 0.0426, + "step": 4714 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.651782852712194e-05, + "loss": 0.0634, + "step": 4716 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6507223404350686e-05, + "loss": 0.019, + "step": 4718 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.0107, + "step": 4720 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.648597505330016e-05, + "loss": 0.0001, + "step": 4722 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.6475331866519387e-05, + "loss": 0.0632, + "step": 4724 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6464676033275187e-05, + "loss": 0.0381, + "step": 4726 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.0013, + "step": 4728 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.644332651066548e-05, + "loss": 0.2155, + "step": 4730 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.4494, + "step": 4732 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6421926652255275e-05, + "loss": 0.0412, + "step": 4734 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.0017, + "step": 4736 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6400476625222057e-05, + "loss": 0.0191, + "step": 4738 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6389732850821964e-05, + "loss": 0.0903, + "step": 4740 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6378976597135193e-05, + "loss": 0.1674, + "step": 4742 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.2763, + "step": 4744 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.635742673595468e-05, + "loss": 0.1764, + "step": 4746 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.1059, + "step": 4748 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6335827210029823e-05, + "loss": 0.2641, + "step": 4750 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.0188, + "step": 4752 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6314178188097917e-05, + "loss": 0.2504, + "step": 4754 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6303335168965495e-05, + "loss": 0.0194, + "step": 4756 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.6292479839282904e-05, + "loss": 0.9506, + "step": 4758 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.2162, + "step": 4760 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.627073233309409e-05, + "loss": 0.0594, + "step": 4762 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.625984019906122e-05, + "loss": 0.0178, + "step": 4764 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.624893583942482e-05, + "loss": 0.8116, + "step": 4766 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.0255, + "step": 4768 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6227090528551058e-05, + "loss": 0.0184, + "step": 4770 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6216149619978057e-05, + "loss": 0.0068, + "step": 4772 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6205196571130204e-05, + "loss": 0.009, + "step": 4774 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.0093, + "step": 4776 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.618325413819967e-05, + "loss": 0.0605, + "step": 4778 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.0874, + "step": 4780 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6161263401175555e-05, + "loss": 0.0056, + "step": 4782 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.0133, + "step": 4784 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.613922453185133e-05, + "loss": 0.0406, + "step": 4786 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.612818710136499e-05, + "loss": 0.0583, + "step": 4788 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6117137702396454e-05, + "loss": 0.3693, + "step": 4790 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.0209, + "step": 4792 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6095003085355103e-05, + "loss": 0.3374, + "step": 4794 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.0363, + "step": 4796 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6072820853644688e-05, + "loss": 0.2257, + "step": 4798 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.2098, + "step": 4800 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.6050591180554658e-05, + "loss": 0.0954, + "step": 4802 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.60394586077466e-05, + "loss": 0.0024, + "step": 4804 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6028314239745068e-05, + "loss": 0.0109, + "step": 4806 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.0302, + "step": 4808 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.6005990205245226e-05, + "loss": 0.0014, + "step": 4810 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.0011, + "step": 4812 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5983619251452334e-05, + "loss": 0.0007, + "step": 4814 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.3353, + "step": 4816 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.596120155313017e-05, + "loss": 0.0633, + "step": 4818 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.594997522948413e-05, + "loss": 0.1231, + "step": 4820 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.593873728540759e-05, + "loss": 0.0024, + "step": 4822 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.0585, + "step": 4824 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5916226623777346e-05, + "loss": 0.029, + "step": 4826 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.0177, + "step": 4828 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.5893669744094587e-05, + "loss": 0.0026, + "step": 4830 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.3826, + "step": 4832 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5871066822575526e-05, + "loss": 1.2153, + "step": 4834 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5859748151293354e-05, + "loss": 0.0079, + "step": 4836 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5848418035796064e-05, + "loss": 0.2626, + "step": 4838 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.0112, + "step": 4840 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.5825723560690396e-05, + "loss": 0.0035, + "step": 4842 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.581435924540482e-05, + "loss": 0.0111, + "step": 4844 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.580298357454967e-05, + "loss": 0.0132, + "step": 4846 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.0945, + "step": 4848 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5780198255020485e-05, + "loss": 0.0068, + "step": 4850 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5768788650846674e-05, + "loss": 0.1996, + "step": 4852 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.5757367780103672e-05, + "loss": 0.0134, + "step": 4854 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.0389, + "step": 4856 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5734492328152796e-05, + "loss": 0.0292, + "step": 4858 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.0077, + "step": 4860 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5711572077872784e-05, + "loss": 0.0352, + "step": 4862 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.0008, + "step": 4864 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.568860720831852e-05, + "loss": 0.0034, + "step": 4866 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.5677108097363565e-05, + "loss": 0.0024, + "step": 4868 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5665597898893508e-05, + "loss": 0.0978, + "step": 4870 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.0575, + "step": 4872 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5642544329348316e-05, + "loss": 0.7043, + "step": 4874 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.5892, + "step": 4876 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.5619446679779367e-05, + "loss": 0.3581, + "step": 4878 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.7062, + "step": 4880 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5596305130627414e-05, + "loss": 0.0094, + "step": 4882 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5584717950189373e-05, + "loss": 0.1169, + "step": 4884 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5573119862676155e-05, + "loss": 0.5629, + "step": 4886 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.1904, + "step": 4888 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.554989105705083e-05, + "loss": 0.0532, + "step": 4890 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.2299, + "step": 4892 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5526618895216786e-05, + "loss": 0.0014, + "step": 4894 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.0223, + "step": 4896 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5503303558978112e-05, + "loss": 0.0085, + "step": 4898 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.5192, + "step": 4900 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.547994523047609e-05, + "loss": 0.0459, + "step": 4902 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.0219, + "step": 4904 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.545654409218794e-05, + "loss": 0.0471, + "step": 4906 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.544482752648966e-05, + "loss": 0.2349, + "step": 4908 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5433100326925298e-05, + "loss": 0.0419, + "step": 4910 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.0446, + "step": 4912 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.5409614117832797e-05, + "loss": 0.6763, + "step": 4914 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.539785515417377e-05, + "loss": 0.1647, + "step": 4916 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.538608564838665e-05, + "loss": 0.007, + "step": 4918 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.0132, + "step": 4920 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5362515102393244e-05, + "loss": 0.1768, + "step": 4922 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.122, + "step": 4924 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.5338902663987564e-05, + "loss": 0.0353, + "step": 4926 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.0595, + "step": 4928 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.531524851763198e-05, + "loss": 0.0515, + "step": 4930 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.5303405861706567e-05, + "loss": 0.0561, + "step": 4932 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.529155284811464e-05, + "loss": 0.0006, + "step": 4934 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.1927, + "step": 4936 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5267815840548067e-05, + "loss": 0.1452, + "step": 4938 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.1241, + "step": 4940 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5244037680367739e-05, + "loss": 0.2923, + "step": 4942 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.0122, + "step": 4944 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.522021855333061e-05, + "loss": 0.0179, + "step": 4946 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.5208293685377362e-05, + "loss": 0.0205, + "step": 4948 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.519635864551371e-05, + "loss": 0.0546, + "step": 4950 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.0221, + "step": 4952 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5172458143312548e-05, + "loss": 0.0111, + "step": 4954 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.0411, + "step": 4956 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5148517233439858e-05, + "loss": 0.4544, + "step": 4958 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.2525, + "step": 4960 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.512453610292402e-05, + "loss": 0.1624, + "step": 4962 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.5112530513457251e-05, + "loss": 0.022, + "step": 4964 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.5100514939107598e-05, + "loss": 0.0145, + "step": 4966 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.217, + "step": 4968 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5076453929645927e-05, + "loss": 0.6618, + "step": 4970 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.0385, + "step": 4972 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.505235326250563e-05, + "loss": 0.0009, + "step": 4974 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.0876, + "step": 4976 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5028213125963054e-05, + "loss": 0.0999, + "step": 4978 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5016128315586636e-05, + "loss": 0.0986, + "step": 4980 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.5004033708602977e-05, + "loss": 0.0139, + "step": 4982 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.0097, + "step": 4984 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4979815199317011e-05, + "loss": 0.3621, + "step": 4986 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.011, + "step": 4988 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.495555778730216e-05, + "loss": 0.1634, + "step": 4990 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.599, + "step": 4992 + }, + { + "epoch": 1.0, + "learning_rate": 1.4931261662059333e-05, + "loss": 0.0124, + "step": 4994 + }, + { + "epoch": 1.0, + "learning_rate": 1.4919099141279214e-05, + "loss": 0.0722, + "step": 4996 + }, + { + "epoch": 1.0, + "step": 4996, + "total_flos": 3.162009226235085e+16, + "train_loss": 0.14234216435637692, + "train_runtime": 10130.4583, + "train_samples_per_second": 1.973, + "train_steps_per_second": 0.493 + } + ], + "logging_steps": 2, + "max_steps": 4996, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 3.162009226235085e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..14214eccc698a1f6529e3344b594a406e2abad08 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:026908a1a6cabee317d77b59ad4094b037d47a0c2f07775571896a8b61cd24f4 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2851943c5deaf22da7b54dd32e8c600ddb719321 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a19015762eb8cb440555fa4fe326b31d12fc46f51586de17bc0c15e67bacd47 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..38b2701720af48a98a09d3a88fb67ded5238902b --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:301a03e1f7687bd3ab835c0fd8c2d27b212c6bf6a8ae40891acdb8007af2a1fc +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..809330859cce811896484b9162f479d091eae6ef --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_selfsup_scenario12_new_10000_random0_25_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b357ee354d17ca21606e65ed753f16fef6a150a4021c8541290521afd0ee6884 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..49bcb20dc69f506c32ff6b5e22798f4a6eef72e5 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/0_trainer_state.json @@ -0,0 +1,35032 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002, + "grad_norm": 9.764071464538574, + "learning_rate": 4e-08, + "loss": 0.9425, + "step": 2 + }, + { + "epoch": 0.0004, + "grad_norm": 9.944977760314941, + "learning_rate": 8e-08, + "loss": 0.438, + "step": 4 + }, + { + "epoch": 0.0006, + "grad_norm": 3.53376841545105, + "learning_rate": 1.2000000000000002e-07, + "loss": 0.1292, + "step": 6 + }, + { + "epoch": 0.0008, + "grad_norm": 11.00562858581543, + "learning_rate": 1.6e-07, + "loss": 0.8584, + "step": 8 + }, + { + "epoch": 0.001, + "grad_norm": 6.5821614265441895, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.5299, + "step": 10 + }, + { + "epoch": 0.0012, + "grad_norm": 9.923234939575195, + "learning_rate": 2.4000000000000003e-07, + "loss": 0.4719, + "step": 12 + }, + { + "epoch": 0.0014, + "grad_norm": 13.532042503356934, + "learning_rate": 2.8e-07, + "loss": 0.4847, + "step": 14 + }, + { + "epoch": 0.0016, + "grad_norm": 9.777961730957031, + "learning_rate": 3.2e-07, + "loss": 0.7231, + "step": 16 + }, + { + "epoch": 0.0018, + "grad_norm": 10.388928413391113, + "learning_rate": 3.6e-07, + "loss": 0.4402, + "step": 18 + }, + { + "epoch": 0.002, + "grad_norm": 4.022494792938232, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.5561, + "step": 20 + }, + { + "epoch": 0.0022, + "grad_norm": 12.663888931274414, + "learning_rate": 4.4e-07, + "loss": 0.582, + "step": 22 + }, + { + "epoch": 0.0024, + "grad_norm": 4.6402907371521, + "learning_rate": 4.800000000000001e-07, + "loss": 0.4905, + "step": 24 + }, + { + "epoch": 0.0026, + "grad_norm": 9.031705856323242, + "learning_rate": 5.2e-07, + "loss": 0.6423, + "step": 26 + }, + { + "epoch": 0.0028, + "grad_norm": 15.259504318237305, + "learning_rate": 5.6e-07, + "loss": 0.8211, + "step": 28 + }, + { + "epoch": 0.003, + "grad_norm": 3.2614762783050537, + "learning_rate": 6.000000000000001e-07, + "loss": 0.4804, + "step": 30 + }, + { + "epoch": 0.0032, + "grad_norm": 3.9314475059509277, + "learning_rate": 6.4e-07, + "loss": 0.2322, + "step": 32 + }, + { + "epoch": 0.0034, + "grad_norm": 5.211013317108154, + "learning_rate": 6.800000000000001e-07, + "loss": 0.4562, + "step": 34 + }, + { + "epoch": 0.0036, + "grad_norm": 9.006333351135254, + "learning_rate": 7.2e-07, + "loss": 0.6425, + "step": 36 + }, + { + "epoch": 0.0038, + "grad_norm": 3.156242847442627, + "learning_rate": 7.6e-07, + "loss": 0.3204, + "step": 38 + }, + { + "epoch": 0.004, + "grad_norm": 3.8118786811828613, + "learning_rate": 8.000000000000001e-07, + "loss": 0.3482, + "step": 40 + }, + { + "epoch": 0.0042, + "grad_norm": 10.38118839263916, + "learning_rate": 8.400000000000001e-07, + "loss": 0.3742, + "step": 42 + }, + { + "epoch": 0.0044, + "grad_norm": 3.7640886306762695, + "learning_rate": 8.8e-07, + "loss": 0.1999, + "step": 44 + }, + { + "epoch": 0.0046, + "grad_norm": 4.93544340133667, + "learning_rate": 9.200000000000001e-07, + "loss": 0.2311, + "step": 46 + }, + { + "epoch": 0.0048, + "grad_norm": 3.8757948875427246, + "learning_rate": 9.600000000000001e-07, + "loss": 0.4838, + "step": 48 + }, + { + "epoch": 0.005, + "grad_norm": 12.478875160217285, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.5005, + "step": 50 + }, + { + "epoch": 0.0052, + "grad_norm": 6.216131210327148, + "learning_rate": 1.04e-06, + "loss": 0.3286, + "step": 52 + }, + { + "epoch": 0.0054, + "grad_norm": 9.227232933044434, + "learning_rate": 1.08e-06, + "loss": 0.5249, + "step": 54 + }, + { + "epoch": 0.0056, + "grad_norm": 4.609793186187744, + "learning_rate": 1.12e-06, + "loss": 0.3539, + "step": 56 + }, + { + "epoch": 0.0058, + "grad_norm": 4.001669883728027, + "learning_rate": 1.1600000000000001e-06, + "loss": 0.1375, + "step": 58 + }, + { + "epoch": 0.006, + "grad_norm": 4.001186847686768, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.1716, + "step": 60 + }, + { + "epoch": 0.0062, + "grad_norm": 3.9459457397460938, + "learning_rate": 1.2400000000000002e-06, + "loss": 0.3606, + "step": 62 + }, + { + "epoch": 0.0064, + "grad_norm": 7.238277435302734, + "learning_rate": 1.28e-06, + "loss": 0.4921, + "step": 64 + }, + { + "epoch": 0.0066, + "grad_norm": 4.067897796630859, + "learning_rate": 1.32e-06, + "loss": 0.331, + "step": 66 + }, + { + "epoch": 0.0068, + "grad_norm": 7.326686382293701, + "learning_rate": 1.3600000000000001e-06, + "loss": 0.4619, + "step": 68 + }, + { + "epoch": 0.007, + "grad_norm": 4.921950340270996, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.2673, + "step": 70 + }, + { + "epoch": 0.0072, + "grad_norm": 6.056000232696533, + "learning_rate": 1.44e-06, + "loss": 0.2887, + "step": 72 + }, + { + "epoch": 0.0074, + "grad_norm": 5.459908485412598, + "learning_rate": 1.48e-06, + "loss": 0.2885, + "step": 74 + }, + { + "epoch": 0.0076, + "grad_norm": 5.349428176879883, + "learning_rate": 1.52e-06, + "loss": 0.2642, + "step": 76 + }, + { + "epoch": 0.0078, + "grad_norm": 5.5520830154418945, + "learning_rate": 1.56e-06, + "loss": 0.3448, + "step": 78 + }, + { + "epoch": 0.008, + "grad_norm": 7.404011249542236, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.352, + "step": 80 + }, + { + "epoch": 0.0082, + "grad_norm": 5.249072074890137, + "learning_rate": 1.6400000000000002e-06, + "loss": 0.3017, + "step": 82 + }, + { + "epoch": 0.0084, + "grad_norm": 4.045840740203857, + "learning_rate": 1.6800000000000002e-06, + "loss": 0.2111, + "step": 84 + }, + { + "epoch": 0.0086, + "grad_norm": 6.694685935974121, + "learning_rate": 1.72e-06, + "loss": 0.3106, + "step": 86 + }, + { + "epoch": 0.0088, + "grad_norm": 5.175083160400391, + "learning_rate": 1.76e-06, + "loss": 0.3105, + "step": 88 + }, + { + "epoch": 0.009, + "grad_norm": 5.499259948730469, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.3019, + "step": 90 + }, + { + "epoch": 0.0092, + "grad_norm": 4.50314474105835, + "learning_rate": 1.8400000000000002e-06, + "loss": 0.2748, + "step": 92 + }, + { + "epoch": 0.0094, + "grad_norm": 5.685649871826172, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.329, + "step": 94 + }, + { + "epoch": 0.0096, + "grad_norm": 5.692402362823486, + "learning_rate": 1.9200000000000003e-06, + "loss": 0.2771, + "step": 96 + }, + { + "epoch": 0.0098, + "grad_norm": 5.952978134155273, + "learning_rate": 1.9600000000000003e-06, + "loss": 0.2859, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 7.373020648956299, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.3573, + "step": 100 + }, + { + "epoch": 0.0102, + "grad_norm": 8.39665412902832, + "learning_rate": 2.04e-06, + "loss": 0.5364, + "step": 102 + }, + { + "epoch": 0.0104, + "grad_norm": 6.55374813079834, + "learning_rate": 2.08e-06, + "loss": 0.3328, + "step": 104 + }, + { + "epoch": 0.0106, + "grad_norm": 4.950131893157959, + "learning_rate": 2.12e-06, + "loss": 0.1964, + "step": 106 + }, + { + "epoch": 0.0108, + "grad_norm": 6.390681266784668, + "learning_rate": 2.16e-06, + "loss": 0.2763, + "step": 108 + }, + { + "epoch": 0.011, + "grad_norm": 4.370520114898682, + "learning_rate": 2.2e-06, + "loss": 0.2708, + "step": 110 + }, + { + "epoch": 0.0112, + "grad_norm": 11.066506385803223, + "learning_rate": 2.24e-06, + "loss": 0.5564, + "step": 112 + }, + { + "epoch": 0.0114, + "grad_norm": 6.519148826599121, + "learning_rate": 2.28e-06, + "loss": 0.3506, + "step": 114 + }, + { + "epoch": 0.0116, + "grad_norm": 3.7491276264190674, + "learning_rate": 2.3200000000000002e-06, + "loss": 0.3092, + "step": 116 + }, + { + "epoch": 0.0118, + "grad_norm": 5.18808650970459, + "learning_rate": 2.3600000000000003e-06, + "loss": 0.2998, + "step": 118 + }, + { + "epoch": 0.012, + "grad_norm": 2.809255838394165, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.2053, + "step": 120 + }, + { + "epoch": 0.0122, + "grad_norm": 6.506760597229004, + "learning_rate": 2.4400000000000004e-06, + "loss": 0.2904, + "step": 122 + }, + { + "epoch": 0.0124, + "grad_norm": 9.282499313354492, + "learning_rate": 2.4800000000000004e-06, + "loss": 0.3573, + "step": 124 + }, + { + "epoch": 0.0126, + "grad_norm": 14.685383796691895, + "learning_rate": 2.52e-06, + "loss": 0.4336, + "step": 126 + }, + { + "epoch": 0.0128, + "grad_norm": 3.468036413192749, + "learning_rate": 2.56e-06, + "loss": 0.2645, + "step": 128 + }, + { + "epoch": 0.013, + "grad_norm": 5.894038200378418, + "learning_rate": 2.6e-06, + "loss": 0.2562, + "step": 130 + }, + { + "epoch": 0.0132, + "grad_norm": 11.968144416809082, + "learning_rate": 2.64e-06, + "loss": 0.4186, + "step": 132 + }, + { + "epoch": 0.0134, + "grad_norm": 6.168442249298096, + "learning_rate": 2.68e-06, + "loss": 0.4185, + "step": 134 + }, + { + "epoch": 0.0136, + "grad_norm": 2.6652610301971436, + "learning_rate": 2.7200000000000002e-06, + "loss": 0.1096, + "step": 136 + }, + { + "epoch": 0.0138, + "grad_norm": 10.757227897644043, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.4201, + "step": 138 + }, + { + "epoch": 0.014, + "grad_norm": 8.377145767211914, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.3729, + "step": 140 + }, + { + "epoch": 0.0142, + "grad_norm": 12.025701522827148, + "learning_rate": 2.84e-06, + "loss": 0.623, + "step": 142 + }, + { + "epoch": 0.0144, + "grad_norm": 8.233043670654297, + "learning_rate": 2.88e-06, + "loss": 0.2133, + "step": 144 + }, + { + "epoch": 0.0146, + "grad_norm": 8.333831787109375, + "learning_rate": 2.92e-06, + "loss": 0.6405, + "step": 146 + }, + { + "epoch": 0.0148, + "grad_norm": 8.141560554504395, + "learning_rate": 2.96e-06, + "loss": 0.6846, + "step": 148 + }, + { + "epoch": 0.015, + "grad_norm": 7.344973564147949, + "learning_rate": 3e-06, + "loss": 0.2501, + "step": 150 + }, + { + "epoch": 0.0152, + "grad_norm": 7.519558429718018, + "learning_rate": 3.04e-06, + "loss": 0.6396, + "step": 152 + }, + { + "epoch": 0.0154, + "grad_norm": 4.9677228927612305, + "learning_rate": 3.08e-06, + "loss": 0.3145, + "step": 154 + }, + { + "epoch": 0.0156, + "grad_norm": 3.8736705780029297, + "learning_rate": 3.12e-06, + "loss": 0.1666, + "step": 156 + }, + { + "epoch": 0.0158, + "grad_norm": 5.810587406158447, + "learning_rate": 3.1600000000000002e-06, + "loss": 0.4425, + "step": 158 + }, + { + "epoch": 0.016, + "grad_norm": 3.833611011505127, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.3994, + "step": 160 + }, + { + "epoch": 0.0162, + "grad_norm": 8.689177513122559, + "learning_rate": 3.2400000000000003e-06, + "loss": 0.3992, + "step": 162 + }, + { + "epoch": 0.0164, + "grad_norm": 8.393973350524902, + "learning_rate": 3.2800000000000004e-06, + "loss": 0.3345, + "step": 164 + }, + { + "epoch": 0.0166, + "grad_norm": 7.708757400512695, + "learning_rate": 3.3200000000000004e-06, + "loss": 0.2776, + "step": 166 + }, + { + "epoch": 0.0168, + "grad_norm": 9.31840991973877, + "learning_rate": 3.3600000000000004e-06, + "loss": 0.5534, + "step": 168 + }, + { + "epoch": 0.017, + "grad_norm": 4.801684379577637, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.0172, + "grad_norm": 7.417541027069092, + "learning_rate": 3.44e-06, + "loss": 0.2809, + "step": 172 + }, + { + "epoch": 0.0174, + "grad_norm": 4.078112602233887, + "learning_rate": 3.48e-06, + "loss": 0.2368, + "step": 174 + }, + { + "epoch": 0.0176, + "grad_norm": 5.178045749664307, + "learning_rate": 3.52e-06, + "loss": 0.3392, + "step": 176 + }, + { + "epoch": 0.0178, + "grad_norm": 5.377458095550537, + "learning_rate": 3.5600000000000002e-06, + "loss": 0.2452, + "step": 178 + }, + { + "epoch": 0.018, + "grad_norm": 7.346227169036865, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.3387, + "step": 180 + }, + { + "epoch": 0.0182, + "grad_norm": 5.363344192504883, + "learning_rate": 3.6400000000000003e-06, + "loss": 0.3995, + "step": 182 + }, + { + "epoch": 0.0184, + "grad_norm": 4.735292434692383, + "learning_rate": 3.6800000000000003e-06, + "loss": 0.3473, + "step": 184 + }, + { + "epoch": 0.0186, + "grad_norm": 4.726678848266602, + "learning_rate": 3.7200000000000004e-06, + "loss": 0.3735, + "step": 186 + }, + { + "epoch": 0.0188, + "grad_norm": 8.389480590820312, + "learning_rate": 3.7600000000000004e-06, + "loss": 0.5939, + "step": 188 + }, + { + "epoch": 0.019, + "grad_norm": 11.936482429504395, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.4122, + "step": 190 + }, + { + "epoch": 0.0192, + "grad_norm": 6.911277770996094, + "learning_rate": 3.8400000000000005e-06, + "loss": 0.4327, + "step": 192 + }, + { + "epoch": 0.0194, + "grad_norm": 6.574550151824951, + "learning_rate": 3.88e-06, + "loss": 0.3415, + "step": 194 + }, + { + "epoch": 0.0196, + "grad_norm": 3.653127670288086, + "learning_rate": 3.920000000000001e-06, + "loss": 0.3247, + "step": 196 + }, + { + "epoch": 0.0198, + "grad_norm": 3.1536202430725098, + "learning_rate": 3.96e-06, + "loss": 0.3159, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 5.709495544433594, + "learning_rate": 4.000000000000001e-06, + "loss": 0.3737, + "step": 200 + }, + { + "epoch": 0.0202, + "grad_norm": 5.63132905960083, + "learning_rate": 4.04e-06, + "loss": 0.3549, + "step": 202 + }, + { + "epoch": 0.0204, + "grad_norm": 5.41639518737793, + "learning_rate": 4.08e-06, + "loss": 0.2749, + "step": 204 + }, + { + "epoch": 0.0206, + "grad_norm": 4.386246681213379, + "learning_rate": 4.12e-06, + "loss": 0.2231, + "step": 206 + }, + { + "epoch": 0.0208, + "grad_norm": 5.446130752563477, + "learning_rate": 4.16e-06, + "loss": 0.3543, + "step": 208 + }, + { + "epoch": 0.021, + "grad_norm": 4.440978050231934, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.3204, + "step": 210 + }, + { + "epoch": 0.0212, + "grad_norm": 10.089569091796875, + "learning_rate": 4.24e-06, + "loss": 0.385, + "step": 212 + }, + { + "epoch": 0.0214, + "grad_norm": 3.741953134536743, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.2093, + "step": 214 + }, + { + "epoch": 0.0216, + "grad_norm": 6.1820220947265625, + "learning_rate": 4.32e-06, + "loss": 0.3928, + "step": 216 + }, + { + "epoch": 0.0218, + "grad_norm": 5.93937873840332, + "learning_rate": 4.360000000000001e-06, + "loss": 0.3707, + "step": 218 + }, + { + "epoch": 0.022, + "grad_norm": 3.3084259033203125, + "learning_rate": 4.4e-06, + "loss": 0.1391, + "step": 220 + }, + { + "epoch": 0.0222, + "grad_norm": 6.119719982147217, + "learning_rate": 4.440000000000001e-06, + "loss": 0.6732, + "step": 222 + }, + { + "epoch": 0.0224, + "grad_norm": 11.847949028015137, + "learning_rate": 4.48e-06, + "loss": 0.7336, + "step": 224 + }, + { + "epoch": 0.0226, + "grad_norm": 6.498089790344238, + "learning_rate": 4.520000000000001e-06, + "loss": 0.4007, + "step": 226 + }, + { + "epoch": 0.0228, + "grad_norm": 4.350018501281738, + "learning_rate": 4.56e-06, + "loss": 0.2255, + "step": 228 + }, + { + "epoch": 0.023, + "grad_norm": 4.452190399169922, + "learning_rate": 4.600000000000001e-06, + "loss": 0.2201, + "step": 230 + }, + { + "epoch": 0.0232, + "grad_norm": 4.962041854858398, + "learning_rate": 4.6400000000000005e-06, + "loss": 0.2433, + "step": 232 + }, + { + "epoch": 0.0234, + "grad_norm": 4.02195930480957, + "learning_rate": 4.680000000000001e-06, + "loss": 0.3519, + "step": 234 + }, + { + "epoch": 0.0236, + "grad_norm": 4.45338773727417, + "learning_rate": 4.7200000000000005e-06, + "loss": 0.3064, + "step": 236 + }, + { + "epoch": 0.0238, + "grad_norm": 5.782873153686523, + "learning_rate": 4.76e-06, + "loss": 0.3545, + "step": 238 + }, + { + "epoch": 0.024, + "grad_norm": 8.59780216217041, + "learning_rate": 4.800000000000001e-06, + "loss": 0.3874, + "step": 240 + }, + { + "epoch": 0.0242, + "grad_norm": 5.609553813934326, + "learning_rate": 4.84e-06, + "loss": 0.2898, + "step": 242 + }, + { + "epoch": 0.0244, + "grad_norm": 6.145511150360107, + "learning_rate": 4.880000000000001e-06, + "loss": 0.3831, + "step": 244 + }, + { + "epoch": 0.0246, + "grad_norm": 6.866811752319336, + "learning_rate": 4.92e-06, + "loss": 0.3541, + "step": 246 + }, + { + "epoch": 0.0248, + "grad_norm": 5.022074222564697, + "learning_rate": 4.960000000000001e-06, + "loss": 0.3407, + "step": 248 + }, + { + "epoch": 0.025, + "grad_norm": 4.811051845550537, + "learning_rate": 5e-06, + "loss": 0.4056, + "step": 250 + }, + { + "epoch": 0.0252, + "grad_norm": 5.733877182006836, + "learning_rate": 5.04e-06, + "loss": 0.3683, + "step": 252 + }, + { + "epoch": 0.0254, + "grad_norm": 5.9673004150390625, + "learning_rate": 5.0800000000000005e-06, + "loss": 0.4361, + "step": 254 + }, + { + "epoch": 0.0256, + "grad_norm": 4.748039722442627, + "learning_rate": 5.12e-06, + "loss": 0.3408, + "step": 256 + }, + { + "epoch": 0.0258, + "grad_norm": 5.454189777374268, + "learning_rate": 5.1600000000000006e-06, + "loss": 0.3659, + "step": 258 + }, + { + "epoch": 0.026, + "grad_norm": 3.880958080291748, + "learning_rate": 5.2e-06, + "loss": 0.2292, + "step": 260 + }, + { + "epoch": 0.0262, + "grad_norm": 6.126842975616455, + "learning_rate": 5.240000000000001e-06, + "loss": 0.4171, + "step": 262 + }, + { + "epoch": 0.0264, + "grad_norm": 2.772209882736206, + "learning_rate": 5.28e-06, + "loss": 0.1459, + "step": 264 + }, + { + "epoch": 0.0266, + "grad_norm": 4.241418361663818, + "learning_rate": 5.320000000000001e-06, + "loss": 0.1681, + "step": 266 + }, + { + "epoch": 0.0268, + "grad_norm": 4.367356300354004, + "learning_rate": 5.36e-06, + "loss": 0.1885, + "step": 268 + }, + { + "epoch": 0.027, + "grad_norm": 2.496178388595581, + "learning_rate": 5.400000000000001e-06, + "loss": 0.2797, + "step": 270 + }, + { + "epoch": 0.0272, + "grad_norm": 3.762805223464966, + "learning_rate": 5.4400000000000004e-06, + "loss": 0.0592, + "step": 272 + }, + { + "epoch": 0.0274, + "grad_norm": 5.019501209259033, + "learning_rate": 5.480000000000001e-06, + "loss": 0.3407, + "step": 274 + }, + { + "epoch": 0.0276, + "grad_norm": 6.736423969268799, + "learning_rate": 5.5200000000000005e-06, + "loss": 0.418, + "step": 276 + }, + { + "epoch": 0.0278, + "grad_norm": 8.573018074035645, + "learning_rate": 5.560000000000001e-06, + "loss": 0.4247, + "step": 278 + }, + { + "epoch": 0.028, + "grad_norm": 9.654558181762695, + "learning_rate": 5.600000000000001e-06, + "loss": 0.2248, + "step": 280 + }, + { + "epoch": 0.0282, + "grad_norm": 2.6493446826934814, + "learning_rate": 5.64e-06, + "loss": 0.365, + "step": 282 + }, + { + "epoch": 0.0284, + "grad_norm": 5.378200054168701, + "learning_rate": 5.68e-06, + "loss": 0.2793, + "step": 284 + }, + { + "epoch": 0.0286, + "grad_norm": 10.591124534606934, + "learning_rate": 5.72e-06, + "loss": 0.4103, + "step": 286 + }, + { + "epoch": 0.0288, + "grad_norm": 1.9863722324371338, + "learning_rate": 5.76e-06, + "loss": 0.2624, + "step": 288 + }, + { + "epoch": 0.029, + "grad_norm": 1.6954416036605835, + "learning_rate": 5.8e-06, + "loss": 0.1919, + "step": 290 + }, + { + "epoch": 0.0292, + "grad_norm": 3.587008476257324, + "learning_rate": 5.84e-06, + "loss": 0.2182, + "step": 292 + }, + { + "epoch": 0.0294, + "grad_norm": 8.539349555969238, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.4392, + "step": 294 + }, + { + "epoch": 0.0296, + "grad_norm": 5.729363441467285, + "learning_rate": 5.92e-06, + "loss": 0.2728, + "step": 296 + }, + { + "epoch": 0.0298, + "grad_norm": 10.105363845825195, + "learning_rate": 5.9600000000000005e-06, + "loss": 0.3168, + "step": 298 + }, + { + "epoch": 0.03, + "grad_norm": 6.039941310882568, + "learning_rate": 6e-06, + "loss": 0.3587, + "step": 300 + }, + { + "epoch": 0.0302, + "grad_norm": 5.653912544250488, + "learning_rate": 6.040000000000001e-06, + "loss": 0.2525, + "step": 302 + }, + { + "epoch": 0.0304, + "grad_norm": 3.596804618835449, + "learning_rate": 6.08e-06, + "loss": 0.1843, + "step": 304 + }, + { + "epoch": 0.0306, + "grad_norm": 5.026602268218994, + "learning_rate": 6.120000000000001e-06, + "loss": 0.1599, + "step": 306 + }, + { + "epoch": 0.0308, + "grad_norm": 8.492005348205566, + "learning_rate": 6.16e-06, + "loss": 0.3782, + "step": 308 + }, + { + "epoch": 0.031, + "grad_norm": 10.179569244384766, + "learning_rate": 6.200000000000001e-06, + "loss": 0.3066, + "step": 310 + }, + { + "epoch": 0.0312, + "grad_norm": 9.979721069335938, + "learning_rate": 6.24e-06, + "loss": 0.6111, + "step": 312 + }, + { + "epoch": 0.0314, + "grad_norm": 5.467297077178955, + "learning_rate": 6.280000000000001e-06, + "loss": 0.1676, + "step": 314 + }, + { + "epoch": 0.0316, + "grad_norm": 5.762682914733887, + "learning_rate": 6.3200000000000005e-06, + "loss": 0.4174, + "step": 316 + }, + { + "epoch": 0.0318, + "grad_norm": 3.668522596359253, + "learning_rate": 6.360000000000001e-06, + "loss": 0.211, + "step": 318 + }, + { + "epoch": 0.032, + "grad_norm": 23.567941665649414, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.5527, + "step": 320 + }, + { + "epoch": 0.0322, + "grad_norm": 2.52738881111145, + "learning_rate": 6.440000000000001e-06, + "loss": 0.4521, + "step": 322 + }, + { + "epoch": 0.0324, + "grad_norm": 4.090083599090576, + "learning_rate": 6.480000000000001e-06, + "loss": 0.2908, + "step": 324 + }, + { + "epoch": 0.0326, + "grad_norm": 6.881717681884766, + "learning_rate": 6.520000000000001e-06, + "loss": 0.1169, + "step": 326 + }, + { + "epoch": 0.0328, + "grad_norm": 11.318687438964844, + "learning_rate": 6.560000000000001e-06, + "loss": 0.5724, + "step": 328 + }, + { + "epoch": 0.033, + "grad_norm": 8.0609712600708, + "learning_rate": 6.600000000000001e-06, + "loss": 0.3943, + "step": 330 + }, + { + "epoch": 0.0332, + "grad_norm": 3.913357973098755, + "learning_rate": 6.640000000000001e-06, + "loss": 0.166, + "step": 332 + }, + { + "epoch": 0.0334, + "grad_norm": 6.234376430511475, + "learning_rate": 6.680000000000001e-06, + "loss": 0.3245, + "step": 334 + }, + { + "epoch": 0.0336, + "grad_norm": 7.855494022369385, + "learning_rate": 6.720000000000001e-06, + "loss": 0.2794, + "step": 336 + }, + { + "epoch": 0.0338, + "grad_norm": 11.271302223205566, + "learning_rate": 6.760000000000001e-06, + "loss": 0.4311, + "step": 338 + }, + { + "epoch": 0.034, + "grad_norm": 6.490280628204346, + "learning_rate": 6.800000000000001e-06, + "loss": 0.3097, + "step": 340 + }, + { + "epoch": 0.0342, + "grad_norm": 6.3122172355651855, + "learning_rate": 6.8400000000000014e-06, + "loss": 0.1649, + "step": 342 + }, + { + "epoch": 0.0344, + "grad_norm": 3.0100767612457275, + "learning_rate": 6.88e-06, + "loss": 0.4413, + "step": 344 + }, + { + "epoch": 0.0346, + "grad_norm": 3.841132879257202, + "learning_rate": 6.92e-06, + "loss": 0.1521, + "step": 346 + }, + { + "epoch": 0.0348, + "grad_norm": 18.788040161132812, + "learning_rate": 6.96e-06, + "loss": 0.561, + "step": 348 + }, + { + "epoch": 0.035, + "grad_norm": 6.728890419006348, + "learning_rate": 7e-06, + "loss": 0.4278, + "step": 350 + }, + { + "epoch": 0.0352, + "grad_norm": 5.574136257171631, + "learning_rate": 7.04e-06, + "loss": 0.4963, + "step": 352 + }, + { + "epoch": 0.0354, + "grad_norm": 1.8246036767959595, + "learning_rate": 7.08e-06, + "loss": 0.2424, + "step": 354 + }, + { + "epoch": 0.0356, + "grad_norm": 7.485132217407227, + "learning_rate": 7.1200000000000004e-06, + "loss": 0.5604, + "step": 356 + }, + { + "epoch": 0.0358, + "grad_norm": 3.701840400695801, + "learning_rate": 7.16e-06, + "loss": 0.3829, + "step": 358 + }, + { + "epoch": 0.036, + "grad_norm": 4.048810005187988, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.2283, + "step": 360 + }, + { + "epoch": 0.0362, + "grad_norm": 2.488981246948242, + "learning_rate": 7.24e-06, + "loss": 0.168, + "step": 362 + }, + { + "epoch": 0.0364, + "grad_norm": 2.0678000450134277, + "learning_rate": 7.280000000000001e-06, + "loss": 0.1005, + "step": 364 + }, + { + "epoch": 0.0366, + "grad_norm": 6.043478012084961, + "learning_rate": 7.32e-06, + "loss": 0.4452, + "step": 366 + }, + { + "epoch": 0.0368, + "grad_norm": 6.154824256896973, + "learning_rate": 7.360000000000001e-06, + "loss": 0.4155, + "step": 368 + }, + { + "epoch": 0.037, + "grad_norm": 3.0539345741271973, + "learning_rate": 7.4e-06, + "loss": 0.1777, + "step": 370 + }, + { + "epoch": 0.0372, + "grad_norm": 3.944918155670166, + "learning_rate": 7.440000000000001e-06, + "loss": 0.4181, + "step": 372 + }, + { + "epoch": 0.0374, + "grad_norm": 5.592339515686035, + "learning_rate": 7.48e-06, + "loss": 0.2033, + "step": 374 + }, + { + "epoch": 0.0376, + "grad_norm": 4.391637325286865, + "learning_rate": 7.520000000000001e-06, + "loss": 0.321, + "step": 376 + }, + { + "epoch": 0.0378, + "grad_norm": 6.2351202964782715, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.2951, + "step": 378 + }, + { + "epoch": 0.038, + "grad_norm": 3.8407304286956787, + "learning_rate": 7.600000000000001e-06, + "loss": 0.2406, + "step": 380 + }, + { + "epoch": 0.0382, + "grad_norm": 0.8013114333152771, + "learning_rate": 7.640000000000001e-06, + "loss": 0.1429, + "step": 382 + }, + { + "epoch": 0.0384, + "grad_norm": 2.5438413619995117, + "learning_rate": 7.680000000000001e-06, + "loss": 0.178, + "step": 384 + }, + { + "epoch": 0.0386, + "grad_norm": 6.3677778244018555, + "learning_rate": 7.72e-06, + "loss": 0.4738, + "step": 386 + }, + { + "epoch": 0.0388, + "grad_norm": 9.787010192871094, + "learning_rate": 7.76e-06, + "loss": 0.4175, + "step": 388 + }, + { + "epoch": 0.039, + "grad_norm": 2.7699735164642334, + "learning_rate": 7.800000000000002e-06, + "loss": 0.2058, + "step": 390 + }, + { + "epoch": 0.0392, + "grad_norm": 2.655912160873413, + "learning_rate": 7.840000000000001e-06, + "loss": 0.1689, + "step": 392 + }, + { + "epoch": 0.0394, + "grad_norm": 3.9969513416290283, + "learning_rate": 7.88e-06, + "loss": 0.3466, + "step": 394 + }, + { + "epoch": 0.0396, + "grad_norm": 14.64122200012207, + "learning_rate": 7.92e-06, + "loss": 0.486, + "step": 396 + }, + { + "epoch": 0.0398, + "grad_norm": 17.54070281982422, + "learning_rate": 7.960000000000002e-06, + "loss": 0.6391, + "step": 398 + }, + { + "epoch": 0.04, + "grad_norm": 10.914463996887207, + "learning_rate": 8.000000000000001e-06, + "loss": 0.2643, + "step": 400 + }, + { + "epoch": 0.0402, + "grad_norm": 2.7129647731781006, + "learning_rate": 8.040000000000001e-06, + "loss": 0.1061, + "step": 402 + }, + { + "epoch": 0.0404, + "grad_norm": 5.4490509033203125, + "learning_rate": 8.08e-06, + "loss": 0.3766, + "step": 404 + }, + { + "epoch": 0.0406, + "grad_norm": 9.029168128967285, + "learning_rate": 8.120000000000002e-06, + "loss": 0.3249, + "step": 406 + }, + { + "epoch": 0.0408, + "grad_norm": 3.629739284515381, + "learning_rate": 8.16e-06, + "loss": 0.2834, + "step": 408 + }, + { + "epoch": 0.041, + "grad_norm": 7.094827175140381, + "learning_rate": 8.2e-06, + "loss": 0.3534, + "step": 410 + }, + { + "epoch": 0.0412, + "grad_norm": 10.501252174377441, + "learning_rate": 8.24e-06, + "loss": 0.5134, + "step": 412 + }, + { + "epoch": 0.0414, + "grad_norm": 8.407830238342285, + "learning_rate": 8.28e-06, + "loss": 0.5513, + "step": 414 + }, + { + "epoch": 0.0416, + "grad_norm": 5.202540397644043, + "learning_rate": 8.32e-06, + "loss": 0.4558, + "step": 416 + }, + { + "epoch": 0.0418, + "grad_norm": 3.0795340538024902, + "learning_rate": 8.36e-06, + "loss": 0.3164, + "step": 418 + }, + { + "epoch": 0.042, + "grad_norm": 4.212881565093994, + "learning_rate": 8.400000000000001e-06, + "loss": 0.2685, + "step": 420 + }, + { + "epoch": 0.0422, + "grad_norm": 2.7910046577453613, + "learning_rate": 8.44e-06, + "loss": 0.1965, + "step": 422 + }, + { + "epoch": 0.0424, + "grad_norm": 4.316739082336426, + "learning_rate": 8.48e-06, + "loss": 0.2685, + "step": 424 + }, + { + "epoch": 0.0426, + "grad_norm": 2.7433853149414062, + "learning_rate": 8.52e-06, + "loss": 0.2538, + "step": 426 + }, + { + "epoch": 0.0428, + "grad_norm": 5.061600208282471, + "learning_rate": 8.560000000000001e-06, + "loss": 0.2647, + "step": 428 + }, + { + "epoch": 0.043, + "grad_norm": 2.7244014739990234, + "learning_rate": 8.6e-06, + "loss": 0.1086, + "step": 430 + }, + { + "epoch": 0.0432, + "grad_norm": 1.887459397315979, + "learning_rate": 8.64e-06, + "loss": 0.5405, + "step": 432 + }, + { + "epoch": 0.0434, + "grad_norm": 1.606137752532959, + "learning_rate": 8.68e-06, + "loss": 0.0983, + "step": 434 + }, + { + "epoch": 0.0436, + "grad_norm": 8.65583610534668, + "learning_rate": 8.720000000000001e-06, + "loss": 0.6833, + "step": 436 + }, + { + "epoch": 0.0438, + "grad_norm": 6.704258441925049, + "learning_rate": 8.76e-06, + "loss": 0.3705, + "step": 438 + }, + { + "epoch": 0.044, + "grad_norm": 1.9846571683883667, + "learning_rate": 8.8e-06, + "loss": 0.3492, + "step": 440 + }, + { + "epoch": 0.0442, + "grad_norm": 7.24104118347168, + "learning_rate": 8.84e-06, + "loss": 0.4128, + "step": 442 + }, + { + "epoch": 0.0444, + "grad_norm": 3.8112471103668213, + "learning_rate": 8.880000000000001e-06, + "loss": 0.4394, + "step": 444 + }, + { + "epoch": 0.0446, + "grad_norm": 2.7524023056030273, + "learning_rate": 8.920000000000001e-06, + "loss": 0.2644, + "step": 446 + }, + { + "epoch": 0.0448, + "grad_norm": 3.708747625350952, + "learning_rate": 8.96e-06, + "loss": 0.3286, + "step": 448 + }, + { + "epoch": 0.045, + "grad_norm": 2.4635205268859863, + "learning_rate": 9e-06, + "loss": 0.2414, + "step": 450 + }, + { + "epoch": 0.0452, + "grad_norm": 8.522350311279297, + "learning_rate": 9.040000000000002e-06, + "loss": 0.4008, + "step": 452 + }, + { + "epoch": 0.0454, + "grad_norm": 3.589963674545288, + "learning_rate": 9.080000000000001e-06, + "loss": 0.1776, + "step": 454 + }, + { + "epoch": 0.0456, + "grad_norm": 3.990734815597534, + "learning_rate": 9.12e-06, + "loss": 0.4894, + "step": 456 + }, + { + "epoch": 0.0458, + "grad_norm": 2.326188564300537, + "learning_rate": 9.16e-06, + "loss": 0.2346, + "step": 458 + }, + { + "epoch": 0.046, + "grad_norm": 3.448143482208252, + "learning_rate": 9.200000000000002e-06, + "loss": 0.3656, + "step": 460 + }, + { + "epoch": 0.0462, + "grad_norm": 6.038661003112793, + "learning_rate": 9.240000000000001e-06, + "loss": 0.3719, + "step": 462 + }, + { + "epoch": 0.0464, + "grad_norm": 4.314684867858887, + "learning_rate": 9.280000000000001e-06, + "loss": 0.3206, + "step": 464 + }, + { + "epoch": 0.0466, + "grad_norm": 1.5061438083648682, + "learning_rate": 9.32e-06, + "loss": 0.1776, + "step": 466 + }, + { + "epoch": 0.0468, + "grad_norm": 3.306666374206543, + "learning_rate": 9.360000000000002e-06, + "loss": 0.2183, + "step": 468 + }, + { + "epoch": 0.047, + "grad_norm": 3.068347454071045, + "learning_rate": 9.4e-06, + "loss": 0.3071, + "step": 470 + }, + { + "epoch": 0.0472, + "grad_norm": 4.404678821563721, + "learning_rate": 9.440000000000001e-06, + "loss": 0.2944, + "step": 472 + }, + { + "epoch": 0.0474, + "grad_norm": 2.2805864810943604, + "learning_rate": 9.48e-06, + "loss": 0.1209, + "step": 474 + }, + { + "epoch": 0.0476, + "grad_norm": 2.899704694747925, + "learning_rate": 9.52e-06, + "loss": 0.1295, + "step": 476 + }, + { + "epoch": 0.0478, + "grad_norm": 7.905484676361084, + "learning_rate": 9.56e-06, + "loss": 0.3953, + "step": 478 + }, + { + "epoch": 0.048, + "grad_norm": 1.7650407552719116, + "learning_rate": 9.600000000000001e-06, + "loss": 0.3649, + "step": 480 + }, + { + "epoch": 0.0482, + "grad_norm": 5.348580837249756, + "learning_rate": 9.640000000000001e-06, + "loss": 0.4726, + "step": 482 + }, + { + "epoch": 0.0484, + "grad_norm": 1.8222765922546387, + "learning_rate": 9.68e-06, + "loss": 0.2489, + "step": 484 + }, + { + "epoch": 0.0486, + "grad_norm": 7.086293697357178, + "learning_rate": 9.72e-06, + "loss": 0.3363, + "step": 486 + }, + { + "epoch": 0.0488, + "grad_norm": 4.211742401123047, + "learning_rate": 9.760000000000001e-06, + "loss": 0.1436, + "step": 488 + }, + { + "epoch": 0.049, + "grad_norm": 3.0763492584228516, + "learning_rate": 9.800000000000001e-06, + "loss": 0.2209, + "step": 490 + }, + { + "epoch": 0.0492, + "grad_norm": 5.404379367828369, + "learning_rate": 9.84e-06, + "loss": 0.4551, + "step": 492 + }, + { + "epoch": 0.0494, + "grad_norm": 8.962662696838379, + "learning_rate": 9.88e-06, + "loss": 0.324, + "step": 494 + }, + { + "epoch": 0.0496, + "grad_norm": 2.5670766830444336, + "learning_rate": 9.920000000000002e-06, + "loss": 0.2821, + "step": 496 + }, + { + "epoch": 0.0498, + "grad_norm": 4.070441722869873, + "learning_rate": 9.960000000000001e-06, + "loss": 0.3154, + "step": 498 + }, + { + "epoch": 0.05, + "grad_norm": 3.5727906227111816, + "learning_rate": 1e-05, + "loss": 0.3551, + "step": 500 + }, + { + "epoch": 0.0502, + "grad_norm": 3.0645854473114014, + "learning_rate": 1.004e-05, + "loss": 0.1595, + "step": 502 + }, + { + "epoch": 0.0504, + "grad_norm": 5.211047172546387, + "learning_rate": 1.008e-05, + "loss": 0.4167, + "step": 504 + }, + { + "epoch": 0.0506, + "grad_norm": 3.592796564102173, + "learning_rate": 1.0120000000000001e-05, + "loss": 0.1594, + "step": 506 + }, + { + "epoch": 0.0508, + "grad_norm": 10.408934593200684, + "learning_rate": 1.0160000000000001e-05, + "loss": 0.3959, + "step": 508 + }, + { + "epoch": 0.051, + "grad_norm": 1.2011529207229614, + "learning_rate": 1.02e-05, + "loss": 0.2076, + "step": 510 + }, + { + "epoch": 0.0512, + "grad_norm": 2.624803304672241, + "learning_rate": 1.024e-05, + "loss": 0.22, + "step": 512 + }, + { + "epoch": 0.0514, + "grad_norm": 7.004764556884766, + "learning_rate": 1.0280000000000002e-05, + "loss": 0.452, + "step": 514 + }, + { + "epoch": 0.0516, + "grad_norm": 11.438862800598145, + "learning_rate": 1.0320000000000001e-05, + "loss": 0.2892, + "step": 516 + }, + { + "epoch": 0.0518, + "grad_norm": 4.021786212921143, + "learning_rate": 1.036e-05, + "loss": 0.2794, + "step": 518 + }, + { + "epoch": 0.052, + "grad_norm": 10.561925888061523, + "learning_rate": 1.04e-05, + "loss": 0.3333, + "step": 520 + }, + { + "epoch": 0.0522, + "grad_norm": 2.437946319580078, + "learning_rate": 1.0440000000000002e-05, + "loss": 0.1966, + "step": 522 + }, + { + "epoch": 0.0524, + "grad_norm": 5.286338806152344, + "learning_rate": 1.0480000000000001e-05, + "loss": 0.2432, + "step": 524 + }, + { + "epoch": 0.0526, + "grad_norm": 9.232484817504883, + "learning_rate": 1.0520000000000001e-05, + "loss": 0.9895, + "step": 526 + }, + { + "epoch": 0.0528, + "grad_norm": 6.0050554275512695, + "learning_rate": 1.056e-05, + "loss": 0.2766, + "step": 528 + }, + { + "epoch": 0.053, + "grad_norm": 2.3595387935638428, + "learning_rate": 1.0600000000000002e-05, + "loss": 0.4241, + "step": 530 + }, + { + "epoch": 0.0532, + "grad_norm": 5.617124557495117, + "learning_rate": 1.0640000000000001e-05, + "loss": 0.2619, + "step": 532 + }, + { + "epoch": 0.0534, + "grad_norm": 2.9883148670196533, + "learning_rate": 1.0680000000000001e-05, + "loss": 0.3449, + "step": 534 + }, + { + "epoch": 0.0536, + "grad_norm": 12.66818618774414, + "learning_rate": 1.072e-05, + "loss": 0.5941, + "step": 536 + }, + { + "epoch": 0.0538, + "grad_norm": 2.316157579421997, + "learning_rate": 1.0760000000000002e-05, + "loss": 0.2772, + "step": 538 + }, + { + "epoch": 0.054, + "grad_norm": 6.273144721984863, + "learning_rate": 1.0800000000000002e-05, + "loss": 0.3348, + "step": 540 + }, + { + "epoch": 0.0542, + "grad_norm": 7.677483558654785, + "learning_rate": 1.0840000000000001e-05, + "loss": 0.4903, + "step": 542 + }, + { + "epoch": 0.0544, + "grad_norm": 2.3350231647491455, + "learning_rate": 1.0880000000000001e-05, + "loss": 0.1406, + "step": 544 + }, + { + "epoch": 0.0546, + "grad_norm": 2.1038222312927246, + "learning_rate": 1.0920000000000002e-05, + "loss": 0.32, + "step": 546 + }, + { + "epoch": 0.0548, + "grad_norm": 2.5570850372314453, + "learning_rate": 1.0960000000000002e-05, + "loss": 0.1057, + "step": 548 + }, + { + "epoch": 0.055, + "grad_norm": 3.6344656944274902, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1243, + "step": 550 + }, + { + "epoch": 0.0552, + "grad_norm": 4.729857444763184, + "learning_rate": 1.1040000000000001e-05, + "loss": 0.4348, + "step": 552 + }, + { + "epoch": 0.0554, + "grad_norm": 3.4328370094299316, + "learning_rate": 1.1080000000000002e-05, + "loss": 0.2065, + "step": 554 + }, + { + "epoch": 0.0556, + "grad_norm": 4.555666446685791, + "learning_rate": 1.1120000000000002e-05, + "loss": 0.4766, + "step": 556 + }, + { + "epoch": 0.0558, + "grad_norm": 19.07379913330078, + "learning_rate": 1.1160000000000002e-05, + "loss": 0.6257, + "step": 558 + }, + { + "epoch": 0.056, + "grad_norm": 3.3283989429473877, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.1331, + "step": 560 + }, + { + "epoch": 0.0562, + "grad_norm": 15.191646575927734, + "learning_rate": 1.1240000000000002e-05, + "loss": 0.7594, + "step": 562 + }, + { + "epoch": 0.0564, + "grad_norm": 2.675227403640747, + "learning_rate": 1.128e-05, + "loss": 0.3172, + "step": 564 + }, + { + "epoch": 0.0566, + "grad_norm": 4.9325971603393555, + "learning_rate": 1.132e-05, + "loss": 0.1877, + "step": 566 + }, + { + "epoch": 0.0568, + "grad_norm": 7.129478454589844, + "learning_rate": 1.136e-05, + "loss": 0.5756, + "step": 568 + }, + { + "epoch": 0.057, + "grad_norm": 0.9107065200805664, + "learning_rate": 1.14e-05, + "loss": 0.2605, + "step": 570 + }, + { + "epoch": 0.0572, + "grad_norm": 1.8293330669403076, + "learning_rate": 1.144e-05, + "loss": 0.3962, + "step": 572 + }, + { + "epoch": 0.0574, + "grad_norm": 4.287289619445801, + "learning_rate": 1.148e-05, + "loss": 0.3247, + "step": 574 + }, + { + "epoch": 0.0576, + "grad_norm": 1.3618651628494263, + "learning_rate": 1.152e-05, + "loss": 0.4328, + "step": 576 + }, + { + "epoch": 0.0578, + "grad_norm": 1.5392404794692993, + "learning_rate": 1.156e-05, + "loss": 0.4184, + "step": 578 + }, + { + "epoch": 0.058, + "grad_norm": 3.0153794288635254, + "learning_rate": 1.16e-05, + "loss": 0.1564, + "step": 580 + }, + { + "epoch": 0.0582, + "grad_norm": 1.700326919555664, + "learning_rate": 1.164e-05, + "loss": 0.1783, + "step": 582 + }, + { + "epoch": 0.0584, + "grad_norm": 1.4888060092926025, + "learning_rate": 1.168e-05, + "loss": 0.3787, + "step": 584 + }, + { + "epoch": 0.0586, + "grad_norm": 4.6906914710998535, + "learning_rate": 1.172e-05, + "loss": 0.6027, + "step": 586 + }, + { + "epoch": 0.0588, + "grad_norm": 1.5849099159240723, + "learning_rate": 1.1760000000000001e-05, + "loss": 0.168, + "step": 588 + }, + { + "epoch": 0.059, + "grad_norm": 4.271294116973877, + "learning_rate": 1.18e-05, + "loss": 0.4621, + "step": 590 + }, + { + "epoch": 0.0592, + "grad_norm": 1.8925687074661255, + "learning_rate": 1.184e-05, + "loss": 0.3745, + "step": 592 + }, + { + "epoch": 0.0594, + "grad_norm": 1.4834446907043457, + "learning_rate": 1.188e-05, + "loss": 0.2795, + "step": 594 + }, + { + "epoch": 0.0596, + "grad_norm": 3.356538772583008, + "learning_rate": 1.1920000000000001e-05, + "loss": 0.3432, + "step": 596 + }, + { + "epoch": 0.0598, + "grad_norm": 2.518808126449585, + "learning_rate": 1.196e-05, + "loss": 0.2212, + "step": 598 + }, + { + "epoch": 0.06, + "grad_norm": 3.09200119972229, + "learning_rate": 1.2e-05, + "loss": 0.3283, + "step": 600 + }, + { + "epoch": 0.0602, + "grad_norm": 2.8231143951416016, + "learning_rate": 1.204e-05, + "loss": 0.2808, + "step": 602 + }, + { + "epoch": 0.0604, + "grad_norm": 1.7705836296081543, + "learning_rate": 1.2080000000000001e-05, + "loss": 0.1727, + "step": 604 + }, + { + "epoch": 0.0606, + "grad_norm": 1.242443323135376, + "learning_rate": 1.2120000000000001e-05, + "loss": 0.1737, + "step": 606 + }, + { + "epoch": 0.0608, + "grad_norm": 1.5449870824813843, + "learning_rate": 1.216e-05, + "loss": 0.3445, + "step": 608 + }, + { + "epoch": 0.061, + "grad_norm": 1.7644752264022827, + "learning_rate": 1.22e-05, + "loss": 0.4618, + "step": 610 + }, + { + "epoch": 0.0612, + "grad_norm": 1.0457683801651, + "learning_rate": 1.2240000000000001e-05, + "loss": 0.1966, + "step": 612 + }, + { + "epoch": 0.0614, + "grad_norm": 4.240916728973389, + "learning_rate": 1.2280000000000001e-05, + "loss": 0.4176, + "step": 614 + }, + { + "epoch": 0.0616, + "grad_norm": 3.6039907932281494, + "learning_rate": 1.232e-05, + "loss": 0.418, + "step": 616 + }, + { + "epoch": 0.0618, + "grad_norm": 2.0494260787963867, + "learning_rate": 1.236e-05, + "loss": 0.1904, + "step": 618 + }, + { + "epoch": 0.062, + "grad_norm": 4.09948205947876, + "learning_rate": 1.2400000000000002e-05, + "loss": 0.2737, + "step": 620 + }, + { + "epoch": 0.0622, + "grad_norm": 6.71242094039917, + "learning_rate": 1.2440000000000001e-05, + "loss": 0.7132, + "step": 622 + }, + { + "epoch": 0.0624, + "grad_norm": 3.8860206604003906, + "learning_rate": 1.248e-05, + "loss": 0.405, + "step": 624 + }, + { + "epoch": 0.0626, + "grad_norm": 4.04345703125, + "learning_rate": 1.252e-05, + "loss": 0.2307, + "step": 626 + }, + { + "epoch": 0.0628, + "grad_norm": 3.0688107013702393, + "learning_rate": 1.2560000000000002e-05, + "loss": 0.4367, + "step": 628 + }, + { + "epoch": 0.063, + "grad_norm": 4.542538642883301, + "learning_rate": 1.2600000000000001e-05, + "loss": 0.23, + "step": 630 + }, + { + "epoch": 0.0632, + "grad_norm": 2.0085527896881104, + "learning_rate": 1.2640000000000001e-05, + "loss": 0.2096, + "step": 632 + }, + { + "epoch": 0.0634, + "grad_norm": 2.4783897399902344, + "learning_rate": 1.268e-05, + "loss": 0.2793, + "step": 634 + }, + { + "epoch": 0.0636, + "grad_norm": 4.451127052307129, + "learning_rate": 1.2720000000000002e-05, + "loss": 0.4051, + "step": 636 + }, + { + "epoch": 0.0638, + "grad_norm": 6.084064483642578, + "learning_rate": 1.2760000000000001e-05, + "loss": 0.3647, + "step": 638 + }, + { + "epoch": 0.064, + "grad_norm": 1.6697360277175903, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.1782, + "step": 640 + }, + { + "epoch": 0.0642, + "grad_norm": 2.6502139568328857, + "learning_rate": 1.284e-05, + "loss": 0.289, + "step": 642 + }, + { + "epoch": 0.0644, + "grad_norm": 4.108283042907715, + "learning_rate": 1.2880000000000002e-05, + "loss": 0.2194, + "step": 644 + }, + { + "epoch": 0.0646, + "grad_norm": 3.5031793117523193, + "learning_rate": 1.2920000000000002e-05, + "loss": 0.2405, + "step": 646 + }, + { + "epoch": 0.0648, + "grad_norm": 3.830467462539673, + "learning_rate": 1.2960000000000001e-05, + "loss": 0.2857, + "step": 648 + }, + { + "epoch": 0.065, + "grad_norm": 2.2974984645843506, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.1051, + "step": 650 + }, + { + "epoch": 0.0652, + "grad_norm": 13.95319652557373, + "learning_rate": 1.3040000000000002e-05, + "loss": 0.6515, + "step": 652 + }, + { + "epoch": 0.0654, + "grad_norm": 7.540467739105225, + "learning_rate": 1.3080000000000002e-05, + "loss": 0.533, + "step": 654 + }, + { + "epoch": 0.0656, + "grad_norm": 6.608221054077148, + "learning_rate": 1.3120000000000001e-05, + "loss": 0.4732, + "step": 656 + }, + { + "epoch": 0.0658, + "grad_norm": 3.665257692337036, + "learning_rate": 1.3160000000000001e-05, + "loss": 0.3353, + "step": 658 + }, + { + "epoch": 0.066, + "grad_norm": 1.7245553731918335, + "learning_rate": 1.3200000000000002e-05, + "loss": 0.1177, + "step": 660 + }, + { + "epoch": 0.0662, + "grad_norm": 6.330013751983643, + "learning_rate": 1.3240000000000002e-05, + "loss": 0.4171, + "step": 662 + }, + { + "epoch": 0.0664, + "grad_norm": 5.119513988494873, + "learning_rate": 1.3280000000000002e-05, + "loss": 0.2887, + "step": 664 + }, + { + "epoch": 0.0666, + "grad_norm": 2.25529408454895, + "learning_rate": 1.3320000000000001e-05, + "loss": 0.2127, + "step": 666 + }, + { + "epoch": 0.0668, + "grad_norm": 2.802395820617676, + "learning_rate": 1.3360000000000003e-05, + "loss": 0.4379, + "step": 668 + }, + { + "epoch": 0.067, + "grad_norm": 10.639803886413574, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.3951, + "step": 670 + }, + { + "epoch": 0.0672, + "grad_norm": 11.272892951965332, + "learning_rate": 1.3440000000000002e-05, + "loss": 0.478, + "step": 672 + }, + { + "epoch": 0.0674, + "grad_norm": 2.3102333545684814, + "learning_rate": 1.3480000000000001e-05, + "loss": 0.3116, + "step": 674 + }, + { + "epoch": 0.0676, + "grad_norm": 3.9925613403320312, + "learning_rate": 1.3520000000000003e-05, + "loss": 0.2735, + "step": 676 + }, + { + "epoch": 0.0678, + "grad_norm": 2.68086838722229, + "learning_rate": 1.3560000000000002e-05, + "loss": 0.2085, + "step": 678 + }, + { + "epoch": 0.068, + "grad_norm": 5.605576992034912, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.4051, + "step": 680 + }, + { + "epoch": 0.0682, + "grad_norm": 4.015842437744141, + "learning_rate": 1.3640000000000002e-05, + "loss": 0.4105, + "step": 682 + }, + { + "epoch": 0.0684, + "grad_norm": 6.048739433288574, + "learning_rate": 1.3680000000000003e-05, + "loss": 0.42, + "step": 684 + }, + { + "epoch": 0.0686, + "grad_norm": 2.316864013671875, + "learning_rate": 1.3720000000000002e-05, + "loss": 0.2338, + "step": 686 + }, + { + "epoch": 0.0688, + "grad_norm": 1.5960725545883179, + "learning_rate": 1.376e-05, + "loss": 0.1922, + "step": 688 + }, + { + "epoch": 0.069, + "grad_norm": 7.173309326171875, + "learning_rate": 1.38e-05, + "loss": 0.3548, + "step": 690 + }, + { + "epoch": 0.0692, + "grad_norm": 5.948266506195068, + "learning_rate": 1.384e-05, + "loss": 0.3427, + "step": 692 + }, + { + "epoch": 0.0694, + "grad_norm": 3.3483481407165527, + "learning_rate": 1.3880000000000001e-05, + "loss": 0.2685, + "step": 694 + }, + { + "epoch": 0.0696, + "grad_norm": 10.619876861572266, + "learning_rate": 1.392e-05, + "loss": 0.4993, + "step": 696 + }, + { + "epoch": 0.0698, + "grad_norm": 2.084138870239258, + "learning_rate": 1.396e-05, + "loss": 0.197, + "step": 698 + }, + { + "epoch": 0.07, + "grad_norm": 2.8195457458496094, + "learning_rate": 1.4e-05, + "loss": 0.4225, + "step": 700 + }, + { + "epoch": 0.0702, + "grad_norm": 0.8430588841438293, + "learning_rate": 1.4040000000000001e-05, + "loss": 0.1047, + "step": 702 + }, + { + "epoch": 0.0704, + "grad_norm": 6.262668609619141, + "learning_rate": 1.408e-05, + "loss": 0.2665, + "step": 704 + }, + { + "epoch": 0.0706, + "grad_norm": 5.194781303405762, + "learning_rate": 1.412e-05, + "loss": 0.2892, + "step": 706 + }, + { + "epoch": 0.0708, + "grad_norm": 5.186453819274902, + "learning_rate": 1.416e-05, + "loss": 0.2916, + "step": 708 + }, + { + "epoch": 0.071, + "grad_norm": 3.7561216354370117, + "learning_rate": 1.4200000000000001e-05, + "loss": 0.292, + "step": 710 + }, + { + "epoch": 0.0712, + "grad_norm": 5.085936069488525, + "learning_rate": 1.4240000000000001e-05, + "loss": 0.1566, + "step": 712 + }, + { + "epoch": 0.0714, + "grad_norm": 10.693710327148438, + "learning_rate": 1.428e-05, + "loss": 0.2897, + "step": 714 + }, + { + "epoch": 0.0716, + "grad_norm": 7.7803778648376465, + "learning_rate": 1.432e-05, + "loss": 0.1747, + "step": 716 + }, + { + "epoch": 0.0718, + "grad_norm": 8.919317245483398, + "learning_rate": 1.4360000000000001e-05, + "loss": 0.2993, + "step": 718 + }, + { + "epoch": 0.072, + "grad_norm": 13.645888328552246, + "learning_rate": 1.4400000000000001e-05, + "loss": 0.578, + "step": 720 + }, + { + "epoch": 0.0722, + "grad_norm": 2.2947018146514893, + "learning_rate": 1.444e-05, + "loss": 0.4581, + "step": 722 + }, + { + "epoch": 0.0724, + "grad_norm": 5.488033771514893, + "learning_rate": 1.448e-05, + "loss": 0.5018, + "step": 724 + }, + { + "epoch": 0.0726, + "grad_norm": 5.155704498291016, + "learning_rate": 1.4520000000000002e-05, + "loss": 0.3656, + "step": 726 + }, + { + "epoch": 0.0728, + "grad_norm": 5.664485454559326, + "learning_rate": 1.4560000000000001e-05, + "loss": 0.5728, + "step": 728 + }, + { + "epoch": 0.073, + "grad_norm": 12.935782432556152, + "learning_rate": 1.46e-05, + "loss": 0.5468, + "step": 730 + }, + { + "epoch": 0.0732, + "grad_norm": 4.303211212158203, + "learning_rate": 1.464e-05, + "loss": 0.1428, + "step": 732 + }, + { + "epoch": 0.0734, + "grad_norm": 8.540749549865723, + "learning_rate": 1.4680000000000002e-05, + "loss": 0.2191, + "step": 734 + }, + { + "epoch": 0.0736, + "grad_norm": 13.570290565490723, + "learning_rate": 1.4720000000000001e-05, + "loss": 0.5424, + "step": 736 + }, + { + "epoch": 0.0738, + "grad_norm": 4.624420166015625, + "learning_rate": 1.4760000000000001e-05, + "loss": 0.4112, + "step": 738 + }, + { + "epoch": 0.074, + "grad_norm": 1.3473963737487793, + "learning_rate": 1.48e-05, + "loss": 0.293, + "step": 740 + }, + { + "epoch": 0.0742, + "grad_norm": 6.875557899475098, + "learning_rate": 1.4840000000000002e-05, + "loss": 0.256, + "step": 742 + }, + { + "epoch": 0.0744, + "grad_norm": 3.568737745285034, + "learning_rate": 1.4880000000000002e-05, + "loss": 0.1365, + "step": 744 + }, + { + "epoch": 0.0746, + "grad_norm": 3.3630902767181396, + "learning_rate": 1.4920000000000001e-05, + "loss": 0.3406, + "step": 746 + }, + { + "epoch": 0.0748, + "grad_norm": 2.2788608074188232, + "learning_rate": 1.496e-05, + "loss": 0.2174, + "step": 748 + }, + { + "epoch": 0.075, + "grad_norm": 3.803896188735962, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.538, + "step": 750 + }, + { + "epoch": 0.0752, + "grad_norm": 5.575863361358643, + "learning_rate": 1.5040000000000002e-05, + "loss": 0.5524, + "step": 752 + }, + { + "epoch": 0.0754, + "grad_norm": 2.0472638607025146, + "learning_rate": 1.5080000000000001e-05, + "loss": 0.1966, + "step": 754 + }, + { + "epoch": 0.0756, + "grad_norm": 2.323838949203491, + "learning_rate": 1.5120000000000001e-05, + "loss": 0.3205, + "step": 756 + }, + { + "epoch": 0.0758, + "grad_norm": 2.7926442623138428, + "learning_rate": 1.516e-05, + "loss": 0.4361, + "step": 758 + }, + { + "epoch": 0.076, + "grad_norm": 2.5403661727905273, + "learning_rate": 1.5200000000000002e-05, + "loss": 0.3107, + "step": 760 + }, + { + "epoch": 0.0762, + "grad_norm": 2.2995612621307373, + "learning_rate": 1.5240000000000001e-05, + "loss": 0.3362, + "step": 762 + }, + { + "epoch": 0.0764, + "grad_norm": 3.46327805519104, + "learning_rate": 1.5280000000000003e-05, + "loss": 0.4748, + "step": 764 + }, + { + "epoch": 0.0766, + "grad_norm": 3.513758420944214, + "learning_rate": 1.5320000000000002e-05, + "loss": 0.4906, + "step": 766 + }, + { + "epoch": 0.0768, + "grad_norm": 4.197177886962891, + "learning_rate": 1.5360000000000002e-05, + "loss": 0.4169, + "step": 768 + }, + { + "epoch": 0.077, + "grad_norm": 1.75531804561615, + "learning_rate": 1.54e-05, + "loss": 0.2399, + "step": 770 + }, + { + "epoch": 0.0772, + "grad_norm": 2.0469040870666504, + "learning_rate": 1.544e-05, + "loss": 0.1659, + "step": 772 + }, + { + "epoch": 0.0774, + "grad_norm": 2.402507781982422, + "learning_rate": 1.548e-05, + "loss": 0.2538, + "step": 774 + }, + { + "epoch": 0.0776, + "grad_norm": 1.7749147415161133, + "learning_rate": 1.552e-05, + "loss": 0.3653, + "step": 776 + }, + { + "epoch": 0.0778, + "grad_norm": 2.535987615585327, + "learning_rate": 1.556e-05, + "loss": 0.3533, + "step": 778 + }, + { + "epoch": 0.078, + "grad_norm": 3.3212859630584717, + "learning_rate": 1.5600000000000003e-05, + "loss": 0.3547, + "step": 780 + }, + { + "epoch": 0.0782, + "grad_norm": 3.9531798362731934, + "learning_rate": 1.5640000000000003e-05, + "loss": 0.3333, + "step": 782 + }, + { + "epoch": 0.0784, + "grad_norm": 1.504765272140503, + "learning_rate": 1.5680000000000002e-05, + "loss": 0.2166, + "step": 784 + }, + { + "epoch": 0.0786, + "grad_norm": 2.441934823989868, + "learning_rate": 1.5720000000000002e-05, + "loss": 0.2792, + "step": 786 + }, + { + "epoch": 0.0788, + "grad_norm": 3.6252458095550537, + "learning_rate": 1.576e-05, + "loss": 0.2232, + "step": 788 + }, + { + "epoch": 0.079, + "grad_norm": 1.9582695960998535, + "learning_rate": 1.58e-05, + "loss": 0.2908, + "step": 790 + }, + { + "epoch": 0.0792, + "grad_norm": 3.5418097972869873, + "learning_rate": 1.584e-05, + "loss": 0.1774, + "step": 792 + }, + { + "epoch": 0.0794, + "grad_norm": 1.6849431991577148, + "learning_rate": 1.588e-05, + "loss": 0.3786, + "step": 794 + }, + { + "epoch": 0.0796, + "grad_norm": 1.3276718854904175, + "learning_rate": 1.5920000000000003e-05, + "loss": 0.2967, + "step": 796 + }, + { + "epoch": 0.0798, + "grad_norm": 1.991226315498352, + "learning_rate": 1.5960000000000003e-05, + "loss": 0.3626, + "step": 798 + }, + { + "epoch": 0.08, + "grad_norm": 4.020994186401367, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.6377, + "step": 800 + }, + { + "epoch": 0.0802, + "grad_norm": 1.2026379108428955, + "learning_rate": 1.6040000000000002e-05, + "loss": 0.3706, + "step": 802 + }, + { + "epoch": 0.0804, + "grad_norm": 5.320672512054443, + "learning_rate": 1.6080000000000002e-05, + "loss": 0.3406, + "step": 804 + }, + { + "epoch": 0.0806, + "grad_norm": 2.1307332515716553, + "learning_rate": 1.612e-05, + "loss": 0.169, + "step": 806 + }, + { + "epoch": 0.0808, + "grad_norm": 2.065152168273926, + "learning_rate": 1.616e-05, + "loss": 0.0804, + "step": 808 + }, + { + "epoch": 0.081, + "grad_norm": 7.465230464935303, + "learning_rate": 1.62e-05, + "loss": 0.2584, + "step": 810 + }, + { + "epoch": 0.0812, + "grad_norm": 14.860204696655273, + "learning_rate": 1.6240000000000004e-05, + "loss": 0.5783, + "step": 812 + }, + { + "epoch": 0.0814, + "grad_norm": 2.41607403755188, + "learning_rate": 1.628e-05, + "loss": 0.3046, + "step": 814 + }, + { + "epoch": 0.0816, + "grad_norm": 2.6218061447143555, + "learning_rate": 1.632e-05, + "loss": 0.2542, + "step": 816 + }, + { + "epoch": 0.0818, + "grad_norm": 2.31086802482605, + "learning_rate": 1.636e-05, + "loss": 0.3103, + "step": 818 + }, + { + "epoch": 0.082, + "grad_norm": 1.2792166471481323, + "learning_rate": 1.64e-05, + "loss": 0.0481, + "step": 820 + }, + { + "epoch": 0.0822, + "grad_norm": 2.9201133251190186, + "learning_rate": 1.6440000000000002e-05, + "loss": 0.3536, + "step": 822 + }, + { + "epoch": 0.0824, + "grad_norm": 4.756605625152588, + "learning_rate": 1.648e-05, + "loss": 0.4055, + "step": 824 + }, + { + "epoch": 0.0826, + "grad_norm": 4.143870830535889, + "learning_rate": 1.652e-05, + "loss": 0.3645, + "step": 826 + }, + { + "epoch": 0.0828, + "grad_norm": 3.0887601375579834, + "learning_rate": 1.656e-05, + "loss": 0.3054, + "step": 828 + }, + { + "epoch": 0.083, + "grad_norm": 2.5197861194610596, + "learning_rate": 1.66e-05, + "loss": 0.2786, + "step": 830 + }, + { + "epoch": 0.0832, + "grad_norm": 3.4697773456573486, + "learning_rate": 1.664e-05, + "loss": 0.1768, + "step": 832 + }, + { + "epoch": 0.0834, + "grad_norm": 1.4943238496780396, + "learning_rate": 1.668e-05, + "loss": 0.2712, + "step": 834 + }, + { + "epoch": 0.0836, + "grad_norm": 1.9805855751037598, + "learning_rate": 1.672e-05, + "loss": 0.1991, + "step": 836 + }, + { + "epoch": 0.0838, + "grad_norm": 1.770057201385498, + "learning_rate": 1.6760000000000002e-05, + "loss": 0.2456, + "step": 838 + }, + { + "epoch": 0.084, + "grad_norm": 7.828678607940674, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.3646, + "step": 840 + }, + { + "epoch": 0.0842, + "grad_norm": 3.2110707759857178, + "learning_rate": 1.684e-05, + "loss": 0.1967, + "step": 842 + }, + { + "epoch": 0.0844, + "grad_norm": 1.4341624975204468, + "learning_rate": 1.688e-05, + "loss": 0.0525, + "step": 844 + }, + { + "epoch": 0.0846, + "grad_norm": 3.6250362396240234, + "learning_rate": 1.692e-05, + "loss": 0.1121, + "step": 846 + }, + { + "epoch": 0.0848, + "grad_norm": 1.0750113725662231, + "learning_rate": 1.696e-05, + "loss": 0.7173, + "step": 848 + }, + { + "epoch": 0.085, + "grad_norm": 8.220694541931152, + "learning_rate": 1.7e-05, + "loss": 0.7772, + "step": 850 + }, + { + "epoch": 0.0852, + "grad_norm": 5.71378231048584, + "learning_rate": 1.704e-05, + "loss": 0.5119, + "step": 852 + }, + { + "epoch": 0.0854, + "grad_norm": 2.5791015625, + "learning_rate": 1.7080000000000002e-05, + "loss": 0.1544, + "step": 854 + }, + { + "epoch": 0.0856, + "grad_norm": 3.6199216842651367, + "learning_rate": 1.7120000000000002e-05, + "loss": 0.4515, + "step": 856 + }, + { + "epoch": 0.0858, + "grad_norm": 2.3736722469329834, + "learning_rate": 1.7160000000000002e-05, + "loss": 0.1294, + "step": 858 + }, + { + "epoch": 0.086, + "grad_norm": 3.4732065200805664, + "learning_rate": 1.72e-05, + "loss": 0.1877, + "step": 860 + }, + { + "epoch": 0.0862, + "grad_norm": 3.112140417098999, + "learning_rate": 1.724e-05, + "loss": 0.1394, + "step": 862 + }, + { + "epoch": 0.0864, + "grad_norm": 6.514045715332031, + "learning_rate": 1.728e-05, + "loss": 0.2649, + "step": 864 + }, + { + "epoch": 0.0866, + "grad_norm": 5.7469635009765625, + "learning_rate": 1.732e-05, + "loss": 0.3671, + "step": 866 + }, + { + "epoch": 0.0868, + "grad_norm": 1.6792031526565552, + "learning_rate": 1.736e-05, + "loss": 0.118, + "step": 868 + }, + { + "epoch": 0.087, + "grad_norm": 4.0161051750183105, + "learning_rate": 1.7400000000000003e-05, + "loss": 0.367, + "step": 870 + }, + { + "epoch": 0.0872, + "grad_norm": 3.5848679542541504, + "learning_rate": 1.7440000000000002e-05, + "loss": 0.4887, + "step": 872 + }, + { + "epoch": 0.0874, + "grad_norm": 2.125319004058838, + "learning_rate": 1.7480000000000002e-05, + "loss": 0.2152, + "step": 874 + }, + { + "epoch": 0.0876, + "grad_norm": 5.302021503448486, + "learning_rate": 1.752e-05, + "loss": 0.2204, + "step": 876 + }, + { + "epoch": 0.0878, + "grad_norm": 2.218912124633789, + "learning_rate": 1.756e-05, + "loss": 0.2887, + "step": 878 + }, + { + "epoch": 0.088, + "grad_norm": 1.7471247911453247, + "learning_rate": 1.76e-05, + "loss": 0.2513, + "step": 880 + }, + { + "epoch": 0.0882, + "grad_norm": 11.370123863220215, + "learning_rate": 1.764e-05, + "loss": 0.24, + "step": 882 + }, + { + "epoch": 0.0884, + "grad_norm": 1.1529287099838257, + "learning_rate": 1.768e-05, + "loss": 0.0155, + "step": 884 + }, + { + "epoch": 0.0886, + "grad_norm": 5.260316848754883, + "learning_rate": 1.7720000000000003e-05, + "loss": 0.4722, + "step": 886 + }, + { + "epoch": 0.0888, + "grad_norm": 3.3896920680999756, + "learning_rate": 1.7760000000000003e-05, + "loss": 0.3932, + "step": 888 + }, + { + "epoch": 0.089, + "grad_norm": 11.877079963684082, + "learning_rate": 1.7800000000000002e-05, + "loss": 0.2936, + "step": 890 + }, + { + "epoch": 0.0892, + "grad_norm": 1.6488037109375, + "learning_rate": 1.7840000000000002e-05, + "loss": 0.2648, + "step": 892 + }, + { + "epoch": 0.0894, + "grad_norm": 9.130402565002441, + "learning_rate": 1.788e-05, + "loss": 0.5358, + "step": 894 + }, + { + "epoch": 0.0896, + "grad_norm": 2.8646466732025146, + "learning_rate": 1.792e-05, + "loss": 0.6679, + "step": 896 + }, + { + "epoch": 0.0898, + "grad_norm": 2.074549436569214, + "learning_rate": 1.796e-05, + "loss": 0.2477, + "step": 898 + }, + { + "epoch": 0.09, + "grad_norm": 2.271467924118042, + "learning_rate": 1.8e-05, + "loss": 0.263, + "step": 900 + }, + { + "epoch": 0.0902, + "grad_norm": 2.3133039474487305, + "learning_rate": 1.8040000000000003e-05, + "loss": 0.2656, + "step": 902 + }, + { + "epoch": 0.0904, + "grad_norm": 3.192239761352539, + "learning_rate": 1.8080000000000003e-05, + "loss": 0.3501, + "step": 904 + }, + { + "epoch": 0.0906, + "grad_norm": 1.4195775985717773, + "learning_rate": 1.8120000000000003e-05, + "loss": 0.2444, + "step": 906 + }, + { + "epoch": 0.0908, + "grad_norm": 0.1636224240064621, + "learning_rate": 1.8160000000000002e-05, + "loss": 0.5358, + "step": 908 + }, + { + "epoch": 0.091, + "grad_norm": 5.323878765106201, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.4759, + "step": 910 + }, + { + "epoch": 0.0912, + "grad_norm": 11.152843475341797, + "learning_rate": 1.824e-05, + "loss": 1.0258, + "step": 912 + }, + { + "epoch": 0.0914, + "grad_norm": 7.822467803955078, + "learning_rate": 1.828e-05, + "loss": 0.2277, + "step": 914 + }, + { + "epoch": 0.0916, + "grad_norm": 3.452258586883545, + "learning_rate": 1.832e-05, + "loss": 0.6606, + "step": 916 + }, + { + "epoch": 0.0918, + "grad_norm": 1.7406905889511108, + "learning_rate": 1.8360000000000004e-05, + "loss": 0.2287, + "step": 918 + }, + { + "epoch": 0.092, + "grad_norm": 3.8792717456817627, + "learning_rate": 1.8400000000000003e-05, + "loss": 0.3061, + "step": 920 + }, + { + "epoch": 0.0922, + "grad_norm": 2.573408842086792, + "learning_rate": 1.8440000000000003e-05, + "loss": 0.4168, + "step": 922 + }, + { + "epoch": 0.0924, + "grad_norm": 3.2529759407043457, + "learning_rate": 1.8480000000000003e-05, + "loss": 0.2217, + "step": 924 + }, + { + "epoch": 0.0926, + "grad_norm": 2.6694247722625732, + "learning_rate": 1.8520000000000002e-05, + "loss": 0.4172, + "step": 926 + }, + { + "epoch": 0.0928, + "grad_norm": 0.6444530487060547, + "learning_rate": 1.8560000000000002e-05, + "loss": 0.1467, + "step": 928 + }, + { + "epoch": 0.093, + "grad_norm": 8.258158683776855, + "learning_rate": 1.86e-05, + "loss": 0.4201, + "step": 930 + }, + { + "epoch": 0.0932, + "grad_norm": 2.6051578521728516, + "learning_rate": 1.864e-05, + "loss": 0.3661, + "step": 932 + }, + { + "epoch": 0.0934, + "grad_norm": 1.696272373199463, + "learning_rate": 1.8680000000000004e-05, + "loss": 0.2802, + "step": 934 + }, + { + "epoch": 0.0936, + "grad_norm": 1.8856561183929443, + "learning_rate": 1.8720000000000004e-05, + "loss": 0.266, + "step": 936 + }, + { + "epoch": 0.0938, + "grad_norm": 1.753676414489746, + "learning_rate": 1.876e-05, + "loss": 0.3466, + "step": 938 + }, + { + "epoch": 0.094, + "grad_norm": 5.285353183746338, + "learning_rate": 1.88e-05, + "loss": 0.2688, + "step": 940 + }, + { + "epoch": 0.0942, + "grad_norm": 2.821622848510742, + "learning_rate": 1.884e-05, + "loss": 0.4742, + "step": 942 + }, + { + "epoch": 0.0944, + "grad_norm": 2.9793028831481934, + "learning_rate": 1.8880000000000002e-05, + "loss": 0.3719, + "step": 944 + }, + { + "epoch": 0.0946, + "grad_norm": 1.7065761089324951, + "learning_rate": 1.8920000000000002e-05, + "loss": 0.1214, + "step": 946 + }, + { + "epoch": 0.0948, + "grad_norm": 1.053374171257019, + "learning_rate": 1.896e-05, + "loss": 0.0693, + "step": 948 + }, + { + "epoch": 0.095, + "grad_norm": 5.787185192108154, + "learning_rate": 1.9e-05, + "loss": 0.3862, + "step": 950 + }, + { + "epoch": 0.0952, + "grad_norm": 2.4752702713012695, + "learning_rate": 1.904e-05, + "loss": 0.1616, + "step": 952 + }, + { + "epoch": 0.0954, + "grad_norm": 3.2430596351623535, + "learning_rate": 1.908e-05, + "loss": 0.5093, + "step": 954 + }, + { + "epoch": 0.0956, + "grad_norm": 0.32520344853401184, + "learning_rate": 1.912e-05, + "loss": 0.197, + "step": 956 + }, + { + "epoch": 0.0958, + "grad_norm": 1.7791827917099, + "learning_rate": 1.916e-05, + "loss": 0.3058, + "step": 958 + }, + { + "epoch": 0.096, + "grad_norm": 1.9858050346374512, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.1157, + "step": 960 + }, + { + "epoch": 0.0962, + "grad_norm": 1.8496408462524414, + "learning_rate": 1.9240000000000002e-05, + "loss": 0.2306, + "step": 962 + }, + { + "epoch": 0.0964, + "grad_norm": 4.734987735748291, + "learning_rate": 1.9280000000000002e-05, + "loss": 0.321, + "step": 964 + }, + { + "epoch": 0.0966, + "grad_norm": 5.535330295562744, + "learning_rate": 1.932e-05, + "loss": 0.2441, + "step": 966 + }, + { + "epoch": 0.0968, + "grad_norm": 2.800863027572632, + "learning_rate": 1.936e-05, + "loss": 0.1227, + "step": 968 + }, + { + "epoch": 0.097, + "grad_norm": 2.3475358486175537, + "learning_rate": 1.94e-05, + "loss": 0.3432, + "step": 970 + }, + { + "epoch": 0.0972, + "grad_norm": 0.266855925321579, + "learning_rate": 1.944e-05, + "loss": 0.1363, + "step": 972 + }, + { + "epoch": 0.0974, + "grad_norm": 4.758463382720947, + "learning_rate": 1.948e-05, + "loss": 0.4616, + "step": 974 + }, + { + "epoch": 0.0976, + "grad_norm": 2.1494803428649902, + "learning_rate": 1.9520000000000003e-05, + "loss": 0.4562, + "step": 976 + }, + { + "epoch": 0.0978, + "grad_norm": 0.46634259819984436, + "learning_rate": 1.9560000000000002e-05, + "loss": 0.29, + "step": 978 + }, + { + "epoch": 0.098, + "grad_norm": 2.6168723106384277, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.1206, + "step": 980 + }, + { + "epoch": 0.0982, + "grad_norm": 3.4922008514404297, + "learning_rate": 1.9640000000000002e-05, + "loss": 0.2654, + "step": 982 + }, + { + "epoch": 0.0984, + "grad_norm": 14.384154319763184, + "learning_rate": 1.968e-05, + "loss": 0.6712, + "step": 984 + }, + { + "epoch": 0.0986, + "grad_norm": 3.357034206390381, + "learning_rate": 1.972e-05, + "loss": 0.1208, + "step": 986 + }, + { + "epoch": 0.0988, + "grad_norm": 4.389461040496826, + "learning_rate": 1.976e-05, + "loss": 0.1771, + "step": 988 + }, + { + "epoch": 0.099, + "grad_norm": 1.9919044971466064, + "learning_rate": 1.98e-05, + "loss": 0.2784, + "step": 990 + }, + { + "epoch": 0.0992, + "grad_norm": 0.9896438717842102, + "learning_rate": 1.9840000000000003e-05, + "loss": 0.1512, + "step": 992 + }, + { + "epoch": 0.0994, + "grad_norm": 5.203512191772461, + "learning_rate": 1.9880000000000003e-05, + "loss": 0.4723, + "step": 994 + }, + { + "epoch": 0.0996, + "grad_norm": 0.9966242909431458, + "learning_rate": 1.9920000000000002e-05, + "loss": 0.1568, + "step": 996 + }, + { + "epoch": 0.0998, + "grad_norm": 2.5107908248901367, + "learning_rate": 1.9960000000000002e-05, + "loss": 0.2385, + "step": 998 + }, + { + "epoch": 0.1, + "grad_norm": 2.243304491043091, + "learning_rate": 2e-05, + "loss": 0.2178, + "step": 1000 + }, + { + "epoch": 0.1002, + "grad_norm": 7.936058044433594, + "learning_rate": 1.9999997563060744e-05, + "loss": 0.3063, + "step": 1002 + }, + { + "epoch": 0.1004, + "grad_norm": 1.2027560472488403, + "learning_rate": 1.9999990252244153e-05, + "loss": 0.1356, + "step": 1004 + }, + { + "epoch": 0.1006, + "grad_norm": 5.095545768737793, + "learning_rate": 1.9999978067553796e-05, + "loss": 0.4409, + "step": 1006 + }, + { + "epoch": 0.1008, + "grad_norm": 3.966773748397827, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.7738, + "step": 1008 + }, + { + "epoch": 0.101, + "grad_norm": 6.3062520027160645, + "learning_rate": 1.9999939076577906e-05, + "loss": 0.1484, + "step": 1010 + }, + { + "epoch": 0.1012, + "grad_norm": 7.847177028656006, + "learning_rate": 1.9999912270311376e-05, + "loss": 0.2157, + "step": 1012 + }, + { + "epoch": 0.1014, + "grad_norm": 5.203221797943115, + "learning_rate": 1.999988059020909e-05, + "loss": 0.6518, + "step": 1014 + }, + { + "epoch": 0.1016, + "grad_norm": 4.474412441253662, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.6624, + "step": 1016 + }, + { + "epoch": 0.1018, + "grad_norm": 3.9571986198425293, + "learning_rate": 1.999980260856137e-05, + "loss": 0.3649, + "step": 1018 + }, + { + "epoch": 0.102, + "grad_norm": 2.8950016498565674, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.1851, + "step": 1020 + }, + { + "epoch": 0.1022, + "grad_norm": 4.417191982269287, + "learning_rate": 1.999970513178678e-05, + "loss": 0.4653, + "step": 1022 + }, + { + "epoch": 0.1024, + "grad_norm": 5.8457512855529785, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.4689, + "step": 1024 + }, + { + "epoch": 0.1026, + "grad_norm": 1.773913860321045, + "learning_rate": 1.999958816007535e-05, + "loss": 0.1149, + "step": 1026 + }, + { + "epoch": 0.1028, + "grad_norm": 8.849010467529297, + "learning_rate": 1.99995223636881e-05, + "loss": 0.6119, + "step": 1028 + }, + { + "epoch": 0.103, + "grad_norm": 1.3597086668014526, + "learning_rate": 1.9999451693655125e-05, + "loss": 0.073, + "step": 1030 + }, + { + "epoch": 0.1032, + "grad_norm": 6.329043388366699, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.4018, + "step": 1032 + }, + { + "epoch": 0.1034, + "grad_norm": 1.7063261270523071, + "learning_rate": 1.9999295732792146e-05, + "loss": 0.1132, + "step": 1034 + }, + { + "epoch": 0.1036, + "grad_norm": 0.4906080663204193, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.2906, + "step": 1036 + }, + { + "epoch": 0.1038, + "grad_norm": 4.902503490447998, + "learning_rate": 1.9999120277790477e-05, + "loss": 0.1871, + "step": 1038 + }, + { + "epoch": 0.104, + "grad_norm": 2.528104066848755, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.4115, + "step": 1040 + }, + { + "epoch": 0.1042, + "grad_norm": 3.6741113662719727, + "learning_rate": 1.9998925328992175e-05, + "loss": 0.3922, + "step": 1042 + }, + { + "epoch": 0.1044, + "grad_norm": 8.448480606079102, + "learning_rate": 1.999882054453657e-05, + "loss": 0.4474, + "step": 1044 + }, + { + "epoch": 0.1046, + "grad_norm": 3.878326654434204, + "learning_rate": 1.9998710886777298e-05, + "loss": 0.3649, + "step": 1046 + }, + { + "epoch": 0.1048, + "grad_norm": 3.3463120460510254, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.3674, + "step": 1048 + }, + { + "epoch": 0.105, + "grad_norm": 0.20032286643981934, + "learning_rate": 1.9998476951563914e-05, + "loss": 0.1771, + "step": 1050 + }, + { + "epoch": 0.1052, + "grad_norm": 2.8902528285980225, + "learning_rate": 1.9998352674223816e-05, + "loss": 0.2158, + "step": 1052 + }, + { + "epoch": 0.1054, + "grad_norm": 4.226684093475342, + "learning_rate": 1.9998223523808092e-05, + "loss": 0.3462, + "step": 1054 + }, + { + "epoch": 0.1056, + "grad_norm": 3.6894328594207764, + "learning_rate": 1.999808950037968e-05, + "loss": 0.2377, + "step": 1056 + }, + { + "epoch": 0.1058, + "grad_norm": 8.067360877990723, + "learning_rate": 1.99979506040039e-05, + "loss": 0.2469, + "step": 1058 + }, + { + "epoch": 0.106, + "grad_norm": 2.3406012058258057, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.1461, + "step": 1060 + }, + { + "epoch": 0.1062, + "grad_norm": 42.54692077636719, + "learning_rate": 1.9997658192683412e-05, + "loss": 0.8543, + "step": 1062 + }, + { + "epoch": 0.1064, + "grad_norm": 3.0934934616088867, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.3246, + "step": 1064 + }, + { + "epoch": 0.1066, + "grad_norm": 3.524845838546753, + "learning_rate": 1.9997346290416703e-05, + "loss": 0.2294, + "step": 1066 + }, + { + "epoch": 0.1068, + "grad_norm": 2.8621084690093994, + "learning_rate": 1.999718303036705e-05, + "loss": 0.3838, + "step": 1068 + }, + { + "epoch": 0.107, + "grad_norm": 5.3311543464660645, + "learning_rate": 1.9997014897811834e-05, + "loss": 0.494, + "step": 1070 + }, + { + "epoch": 0.1072, + "grad_norm": 5.529619216918945, + "learning_rate": 1.9996841892833e-05, + "loss": 0.7313, + "step": 1072 + }, + { + "epoch": 0.1074, + "grad_norm": 3.156148910522461, + "learning_rate": 1.999666401551487e-05, + "loss": 0.2459, + "step": 1074 + }, + { + "epoch": 0.1076, + "grad_norm": 2.2328834533691406, + "learning_rate": 1.9996481265944146e-05, + "loss": 0.2536, + "step": 1076 + }, + { + "epoch": 0.1078, + "grad_norm": 1.4742876291275024, + "learning_rate": 1.9996293644209886e-05, + "loss": 0.346, + "step": 1078 + }, + { + "epoch": 0.108, + "grad_norm": 1.8225466012954712, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.3858, + "step": 1080 + }, + { + "epoch": 0.1082, + "grad_norm": 1.5976028442382812, + "learning_rate": 1.9995903784618936e-05, + "loss": 0.1695, + "step": 1082 + }, + { + "epoch": 0.1084, + "grad_norm": 4.241769790649414, + "learning_rate": 1.9995701546952252e-05, + "loss": 0.4113, + "step": 1084 + }, + { + "epoch": 0.1086, + "grad_norm": 5.970701694488525, + "learning_rate": 1.9995494437502064e-05, + "loss": 0.2785, + "step": 1086 + }, + { + "epoch": 0.1088, + "grad_norm": 3.879070281982422, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.7099, + "step": 1088 + }, + { + "epoch": 0.109, + "grad_norm": 1.6936259269714355, + "learning_rate": 1.9995065603657317e-05, + "loss": 0.199, + "step": 1090 + }, + { + "epoch": 0.1092, + "grad_norm": 2.3477957248687744, + "learning_rate": 1.999484387947177e-05, + "loss": 0.4569, + "step": 1092 + }, + { + "epoch": 0.1094, + "grad_norm": 2.124924421310425, + "learning_rate": 1.999461728392073e-05, + "loss": 0.3084, + "step": 1094 + }, + { + "epoch": 0.1096, + "grad_norm": 1.3436256647109985, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.3739, + "step": 1096 + }, + { + "epoch": 0.1098, + "grad_norm": 1.4334663152694702, + "learning_rate": 1.9994149479166324e-05, + "loss": 0.4167, + "step": 1098 + }, + { + "epoch": 0.11, + "grad_norm": 2.850574493408203, + "learning_rate": 1.999390827019096e-05, + "loss": 0.4187, + "step": 1100 + }, + { + "epoch": 0.1102, + "grad_norm": 2.5010666847229004, + "learning_rate": 1.999366219030611e-05, + "loss": 0.5938, + "step": 1102 + }, + { + "epoch": 0.1104, + "grad_norm": 1.167473554611206, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.1877, + "step": 1104 + }, + { + "epoch": 0.1106, + "grad_norm": 1.3288588523864746, + "learning_rate": 1.999315541829008e-05, + "loss": 0.3861, + "step": 1106 + }, + { + "epoch": 0.1108, + "grad_norm": 2.0968332290649414, + "learning_rate": 1.9992894726405894e-05, + "loss": 0.4529, + "step": 1108 + }, + { + "epoch": 0.111, + "grad_norm": 2.272514820098877, + "learning_rate": 1.999262916410621e-05, + "loss": 0.4349, + "step": 1110 + }, + { + "epoch": 0.1112, + "grad_norm": 2.101304531097412, + "learning_rate": 1.999235873152047e-05, + "loss": 0.3511, + "step": 1112 + }, + { + "epoch": 0.1114, + "grad_norm": 2.384021043777466, + "learning_rate": 1.999208342878047e-05, + "loss": 0.4007, + "step": 1114 + }, + { + "epoch": 0.1116, + "grad_norm": 2.565124273300171, + "learning_rate": 1.9991803256020393e-05, + "loss": 0.3922, + "step": 1116 + }, + { + "epoch": 0.1118, + "grad_norm": 1.2576755285263062, + "learning_rate": 1.9991518213376787e-05, + "loss": 0.1789, + "step": 1118 + }, + { + "epoch": 0.112, + "grad_norm": 1.7330803871154785, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.2808, + "step": 1120 + }, + { + "epoch": 0.1122, + "grad_norm": 5.82855224609375, + "learning_rate": 1.9990933518997086e-05, + "loss": 0.3644, + "step": 1122 + }, + { + "epoch": 0.1124, + "grad_norm": 3.679081916809082, + "learning_rate": 1.9990633867545956e-05, + "loss": 0.3893, + "step": 1124 + }, + { + "epoch": 0.1126, + "grad_norm": 1.5280896425247192, + "learning_rate": 1.999032934678125e-05, + "loss": 0.1786, + "step": 1126 + }, + { + "epoch": 0.1128, + "grad_norm": 1.7678289413452148, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.1638, + "step": 1128 + }, + { + "epoch": 0.113, + "grad_norm": 3.922325849533081, + "learning_rate": 1.998970569790715e-05, + "loss": 0.5529, + "step": 1130 + }, + { + "epoch": 0.1132, + "grad_norm": 2.1396260261535645, + "learning_rate": 1.9989386570101716e-05, + "loss": 0.3746, + "step": 1132 + }, + { + "epoch": 0.1134, + "grad_norm": 2.1597821712493896, + "learning_rate": 1.9989062573590618e-05, + "loss": 0.4284, + "step": 1134 + }, + { + "epoch": 0.1136, + "grad_norm": 1.638731837272644, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.3072, + "step": 1136 + }, + { + "epoch": 0.1138, + "grad_norm": 1.7607803344726562, + "learning_rate": 1.998839997508546e-05, + "loss": 0.2417, + "step": 1138 + }, + { + "epoch": 0.114, + "grad_norm": 2.4384148120880127, + "learning_rate": 1.9988061373414342e-05, + "loss": 0.2552, + "step": 1140 + }, + { + "epoch": 0.1142, + "grad_norm": 2.9188060760498047, + "learning_rate": 1.9987717903683447e-05, + "loss": 0.2736, + "step": 1142 + }, + { + "epoch": 0.1144, + "grad_norm": 2.5725526809692383, + "learning_rate": 1.998736956606018e-05, + "loss": 0.1782, + "step": 1144 + }, + { + "epoch": 0.1146, + "grad_norm": 2.622135877609253, + "learning_rate": 1.9987016360714307e-05, + "loss": 0.2121, + "step": 1146 + }, + { + "epoch": 0.1148, + "grad_norm": 0.02353403903543949, + "learning_rate": 1.998665828781799e-05, + "loss": 0.3302, + "step": 1148 + }, + { + "epoch": 0.115, + "grad_norm": 0.9851248860359192, + "learning_rate": 1.9986295347545738e-05, + "loss": 0.2835, + "step": 1150 + }, + { + "epoch": 0.1152, + "grad_norm": 15.313615798950195, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.6685, + "step": 1152 + }, + { + "epoch": 0.1154, + "grad_norm": 2.5001044273376465, + "learning_rate": 1.9985554865583394e-05, + "loss": 0.371, + "step": 1154 + }, + { + "epoch": 0.1156, + "grad_norm": 4.0548906326293945, + "learning_rate": 1.99851773242542e-05, + "loss": 0.3357, + "step": 1156 + }, + { + "epoch": 0.1158, + "grad_norm": 2.1948277950286865, + "learning_rate": 1.9984794916270876e-05, + "loss": 0.1735, + "step": 1158 + }, + { + "epoch": 0.116, + "grad_norm": 2.3124876022338867, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.3371, + "step": 1160 + }, + { + "epoch": 0.1162, + "grad_norm": 2.079962730407715, + "learning_rate": 1.998401550108975e-05, + "loss": 0.2956, + "step": 1162 + }, + { + "epoch": 0.1164, + "grad_norm": 1.415584921836853, + "learning_rate": 1.9983618494271825e-05, + "loss": 0.1465, + "step": 1164 + }, + { + "epoch": 0.1166, + "grad_norm": 1.5980442762374878, + "learning_rate": 1.9983216621559525e-05, + "loss": 0.1742, + "step": 1166 + }, + { + "epoch": 0.1168, + "grad_norm": 4.508903980255127, + "learning_rate": 1.998280988314872e-05, + "loss": 0.6484, + "step": 1168 + }, + { + "epoch": 0.117, + "grad_norm": 0.8330200910568237, + "learning_rate": 1.9982398279237657e-05, + "loss": 0.0887, + "step": 1170 + }, + { + "epoch": 0.1172, + "grad_norm": 0.6710852980613708, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.0524, + "step": 1172 + }, + { + "epoch": 0.1174, + "grad_norm": 1.4977208375930786, + "learning_rate": 1.998156047571954e-05, + "loss": 0.18, + "step": 1174 + }, + { + "epoch": 0.1176, + "grad_norm": 0.446388840675354, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.0482, + "step": 1176 + }, + { + "epoch": 0.1178, + "grad_norm": 0.6544026732444763, + "learning_rate": 1.9980703212638522e-05, + "loss": 0.2893, + "step": 1178 + }, + { + "epoch": 0.118, + "grad_norm": 7.165531635284424, + "learning_rate": 1.9980267284282718e-05, + "loss": 0.5686, + "step": 1180 + }, + { + "epoch": 0.1182, + "grad_norm": 0.6907548308372498, + "learning_rate": 1.997982649166588e-05, + "loss": 0.0304, + "step": 1182 + }, + { + "epoch": 0.1184, + "grad_norm": 5.37349271774292, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.4501, + "step": 1184 + }, + { + "epoch": 0.1186, + "grad_norm": 2.4432289600372314, + "learning_rate": 1.9978930314510826e-05, + "loss": 0.3677, + "step": 1186 + }, + { + "epoch": 0.1188, + "grad_norm": 4.382468223571777, + "learning_rate": 1.9978474930409396e-05, + "loss": 0.3382, + "step": 1188 + }, + { + "epoch": 0.119, + "grad_norm": 1.569029688835144, + "learning_rate": 1.9978014682920503e-05, + "loss": 0.1226, + "step": 1190 + }, + { + "epoch": 0.1192, + "grad_norm": 2.4853107929229736, + "learning_rate": 1.997754957226847e-05, + "loss": 0.4029, + "step": 1192 + }, + { + "epoch": 0.1194, + "grad_norm": 1.7556354999542236, + "learning_rate": 1.9977079598679978e-05, + "loss": 0.2051, + "step": 1194 + }, + { + "epoch": 0.1196, + "grad_norm": 5.743977069854736, + "learning_rate": 1.99766047623841e-05, + "loss": 0.4405, + "step": 1196 + }, + { + "epoch": 0.1198, + "grad_norm": 3.657247543334961, + "learning_rate": 1.9976125063612254e-05, + "loss": 0.3461, + "step": 1198 + }, + { + "epoch": 0.12, + "grad_norm": 0.12951353192329407, + "learning_rate": 1.9975640502598243e-05, + "loss": 0.3547, + "step": 1200 + }, + { + "epoch": 0.1202, + "grad_norm": 3.153754949569702, + "learning_rate": 1.9975151079578238e-05, + "loss": 0.4008, + "step": 1202 + }, + { + "epoch": 0.1204, + "grad_norm": 13.018710136413574, + "learning_rate": 1.9974656794790777e-05, + "loss": 0.6959, + "step": 1204 + }, + { + "epoch": 0.1206, + "grad_norm": 2.685300350189209, + "learning_rate": 1.9974157648476768e-05, + "loss": 0.2577, + "step": 1206 + }, + { + "epoch": 0.1208, + "grad_norm": 2.52923846244812, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.3363, + "step": 1208 + }, + { + "epoch": 0.121, + "grad_norm": 1.5434337854385376, + "learning_rate": 1.997314477224458e-05, + "loss": 0.0957, + "step": 1210 + }, + { + "epoch": 0.1212, + "grad_norm": 2.3144333362579346, + "learning_rate": 1.997263104282007e-05, + "loss": 0.2291, + "step": 1212 + }, + { + "epoch": 0.1214, + "grad_norm": 2.285275936126709, + "learning_rate": 1.997211245285634e-05, + "loss": 0.3344, + "step": 1214 + }, + { + "epoch": 0.1216, + "grad_norm": 4.451172828674316, + "learning_rate": 1.997158900260614e-05, + "loss": 0.292, + "step": 1216 + }, + { + "epoch": 0.1218, + "grad_norm": 1.2649403810501099, + "learning_rate": 1.99710606923246e-05, + "loss": 0.1712, + "step": 1218 + }, + { + "epoch": 0.122, + "grad_norm": 3.3218584060668945, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.2587, + "step": 1220 + }, + { + "epoch": 0.1222, + "grad_norm": 3.7014427185058594, + "learning_rate": 1.996998949269982e-05, + "loss": 0.4171, + "step": 1222 + }, + { + "epoch": 0.1224, + "grad_norm": 2.507833480834961, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.7084, + "step": 1224 + }, + { + "epoch": 0.1226, + "grad_norm": 2.320350170135498, + "learning_rate": 1.996889885607036e-05, + "loss": 0.334, + "step": 1226 + }, + { + "epoch": 0.1228, + "grad_norm": 0.660672664642334, + "learning_rate": 1.9968346249541848e-05, + "loss": 0.1746, + "step": 1228 + }, + { + "epoch": 0.123, + "grad_norm": 7.4112229347229, + "learning_rate": 1.9967788784562474e-05, + "loss": 0.4116, + "step": 1230 + }, + { + "epoch": 0.1232, + "grad_norm": 1.431082844734192, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.0994, + "step": 1232 + }, + { + "epoch": 0.1234, + "grad_norm": 2.889305353164673, + "learning_rate": 1.99666592803403e-05, + "loss": 0.5512, + "step": 1234 + }, + { + "epoch": 0.1236, + "grad_norm": 1.2020742893218994, + "learning_rate": 1.996608724164801e-05, + "loss": 0.072, + "step": 1236 + }, + { + "epoch": 0.1238, + "grad_norm": 0.8130085468292236, + "learning_rate": 1.9965510345605866e-05, + "loss": 0.3148, + "step": 1238 + }, + { + "epoch": 0.124, + "grad_norm": 1.7790873050689697, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.1367, + "step": 1240 + }, + { + "epoch": 0.1242, + "grad_norm": 2.1053359508514404, + "learning_rate": 1.996434198259908e-05, + "loss": 0.268, + "step": 1242 + }, + { + "epoch": 0.1244, + "grad_norm": 3.7255001068115234, + "learning_rate": 1.9963750516203887e-05, + "loss": 0.3862, + "step": 1244 + }, + { + "epoch": 0.1246, + "grad_norm": 1.040077805519104, + "learning_rate": 1.9963154193597728e-05, + "loss": 0.186, + "step": 1246 + }, + { + "epoch": 0.1248, + "grad_norm": 1.0406478643417358, + "learning_rate": 1.996255301507125e-05, + "loss": 0.1043, + "step": 1248 + }, + { + "epoch": 0.125, + "grad_norm": 1.262279987335205, + "learning_rate": 1.9961946980917457e-05, + "loss": 0.1391, + "step": 1250 + }, + { + "epoch": 0.1252, + "grad_norm": 2.7806077003479004, + "learning_rate": 1.9961336091431728e-05, + "loss": 0.2948, + "step": 1252 + }, + { + "epoch": 0.1254, + "grad_norm": 4.46557092666626, + "learning_rate": 1.9960720346911798e-05, + "loss": 0.3146, + "step": 1254 + }, + { + "epoch": 0.1256, + "grad_norm": 17.333044052124023, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.8044, + "step": 1256 + }, + { + "epoch": 0.1258, + "grad_norm": 3.1158394813537598, + "learning_rate": 1.995947429397213e-05, + "loss": 0.2211, + "step": 1258 + }, + { + "epoch": 0.126, + "grad_norm": 2.538512706756592, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.1927, + "step": 1260 + }, + { + "epoch": 0.1262, + "grad_norm": 0.6797531247138977, + "learning_rate": 1.9958208824527702e-05, + "loss": 0.1065, + "step": 1262 + }, + { + "epoch": 0.1264, + "grad_norm": 8.962922096252441, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.594, + "step": 1264 + }, + { + "epoch": 0.1266, + "grad_norm": 1.5102475881576538, + "learning_rate": 1.9956923941045613e-05, + "loss": 0.146, + "step": 1266 + }, + { + "epoch": 0.1268, + "grad_norm": 1.8735897541046143, + "learning_rate": 1.995627421982176e-05, + "loss": 0.2208, + "step": 1268 + }, + { + "epoch": 0.127, + "grad_norm": 6.8968706130981445, + "learning_rate": 1.99556196460308e-05, + "loss": 0.4163, + "step": 1270 + }, + { + "epoch": 0.1272, + "grad_norm": 2.854938507080078, + "learning_rate": 1.995496021999177e-05, + "loss": 0.371, + "step": 1272 + }, + { + "epoch": 0.1274, + "grad_norm": 3.0465095043182373, + "learning_rate": 1.9954295942026065e-05, + "loss": 0.2117, + "step": 1274 + }, + { + "epoch": 0.1276, + "grad_norm": 1.289006233215332, + "learning_rate": 1.995362681245744e-05, + "loss": 0.1596, + "step": 1276 + }, + { + "epoch": 0.1278, + "grad_norm": 3.165769338607788, + "learning_rate": 1.9952952831612027e-05, + "loss": 0.3187, + "step": 1278 + }, + { + "epoch": 0.128, + "grad_norm": 3.098623037338257, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.3283, + "step": 1280 + }, + { + "epoch": 0.1282, + "grad_norm": 5.1990275382995605, + "learning_rate": 1.9951590317407152e-05, + "loss": 0.3727, + "step": 1282 + }, + { + "epoch": 0.1284, + "grad_norm": 1.5409075021743774, + "learning_rate": 1.9950901784711765e-05, + "loss": 0.2245, + "step": 1284 + }, + { + "epoch": 0.1286, + "grad_norm": 2.215860366821289, + "learning_rate": 1.9950208402067735e-05, + "loss": 0.0981, + "step": 1286 + }, + { + "epoch": 0.1288, + "grad_norm": 3.108517646789551, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.2175, + "step": 1288 + }, + { + "epoch": 0.129, + "grad_norm": 1.4311892986297607, + "learning_rate": 1.9948807088287884e-05, + "loss": 0.3358, + "step": 1290 + }, + { + "epoch": 0.1292, + "grad_norm": 4.196983337402344, + "learning_rate": 1.994809915783505e-05, + "loss": 0.3603, + "step": 1292 + }, + { + "epoch": 0.1294, + "grad_norm": 1.011609435081482, + "learning_rate": 1.9947386378799534e-05, + "loss": 0.0598, + "step": 1294 + }, + { + "epoch": 0.1296, + "grad_norm": 3.8195748329162598, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.4816, + "step": 1296 + }, + { + "epoch": 0.1298, + "grad_norm": 1.7454580068588257, + "learning_rate": 1.9945946276372435e-05, + "loss": 0.2833, + "step": 1298 + }, + { + "epoch": 0.13, + "grad_norm": 7.0121965408325195, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.7535, + "step": 1300 + }, + { + "epoch": 0.1302, + "grad_norm": 0.49667003750801086, + "learning_rate": 1.9944486783814135e-05, + "loss": 0.0287, + "step": 1302 + }, + { + "epoch": 0.1304, + "grad_norm": 7.058590888977051, + "learning_rate": 1.994374976712348e-05, + "loss": 0.2545, + "step": 1304 + }, + { + "epoch": 0.1306, + "grad_norm": 4.977167129516602, + "learning_rate": 1.994300790396999e-05, + "loss": 0.2785, + "step": 1306 + }, + { + "epoch": 0.1308, + "grad_norm": 3.231706380844116, + "learning_rate": 1.9942261194715236e-05, + "loss": 0.1342, + "step": 1308 + }, + { + "epoch": 0.131, + "grad_norm": 3.3855912685394287, + "learning_rate": 1.9941509639723155e-05, + "loss": 0.1309, + "step": 1310 + }, + { + "epoch": 0.1312, + "grad_norm": 3.204033136367798, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.2405, + "step": 1312 + }, + { + "epoch": 0.1314, + "grad_norm": 2.9671733379364014, + "learning_rate": 1.993999199399457e-05, + "loss": 0.1058, + "step": 1314 + }, + { + "epoch": 0.1316, + "grad_norm": 0.49726471304893494, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.0939, + "step": 1316 + }, + { + "epoch": 0.1318, + "grad_norm": 5.56477165222168, + "learning_rate": 1.993845496974297e-05, + "loss": 1.0439, + "step": 1318 + }, + { + "epoch": 0.132, + "grad_norm": 2.9033894538879395, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.232, + "step": 1320 + }, + { + "epoch": 0.1322, + "grad_norm": 2.5996317863464355, + "learning_rate": 1.993689856996485e-05, + "loss": 0.3188, + "step": 1322 + }, + { + "epoch": 0.1324, + "grad_norm": 9.907886505126953, + "learning_rate": 1.9936113105200085e-05, + "loss": 0.3239, + "step": 1324 + }, + { + "epoch": 0.1326, + "grad_norm": 0.9099286794662476, + "learning_rate": 1.99353227976945e-05, + "loss": 0.2309, + "step": 1326 + }, + { + "epoch": 0.1328, + "grad_norm": 1.2554891109466553, + "learning_rate": 1.9934527647833276e-05, + "loss": 0.2442, + "step": 1328 + }, + { + "epoch": 0.133, + "grad_norm": 3.8082666397094727, + "learning_rate": 1.9933727656003964e-05, + "loss": 0.1225, + "step": 1330 + }, + { + "epoch": 0.1332, + "grad_norm": 8.989752769470215, + "learning_rate": 1.993292282259647e-05, + "loss": 0.2571, + "step": 1332 + }, + { + "epoch": 0.1334, + "grad_norm": 8.651959419250488, + "learning_rate": 1.9932113148003057e-05, + "loss": 0.2168, + "step": 1334 + }, + { + "epoch": 0.1336, + "grad_norm": 1.7457717657089233, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.0699, + "step": 1336 + }, + { + "epoch": 0.1338, + "grad_norm": 6.537881851196289, + "learning_rate": 1.9930479276839347e-05, + "loss": 0.2875, + "step": 1338 + }, + { + "epoch": 0.134, + "grad_norm": 0.31225481629371643, + "learning_rate": 1.992965508106537e-05, + "loss": 0.3355, + "step": 1340 + }, + { + "epoch": 0.1342, + "grad_norm": 1.059474229812622, + "learning_rate": 1.9928826045698138e-05, + "loss": 0.0539, + "step": 1342 + }, + { + "epoch": 0.1344, + "grad_norm": 3.188612461090088, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.1961, + "step": 1344 + }, + { + "epoch": 0.1346, + "grad_norm": 4.713750839233398, + "learning_rate": 1.99271534578025e-05, + "loss": 0.3323, + "step": 1346 + }, + { + "epoch": 0.1348, + "grad_norm": 13.073081970214844, + "learning_rate": 1.992630990608929e-05, + "loss": 0.4336, + "step": 1348 + }, + { + "epoch": 0.135, + "grad_norm": 0.0421074740588665, + "learning_rate": 1.9925461516413224e-05, + "loss": 0.2262, + "step": 1350 + }, + { + "epoch": 0.1352, + "grad_norm": 1.6329069137573242, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.4746, + "step": 1352 + }, + { + "epoch": 0.1354, + "grad_norm": 38.43767166137695, + "learning_rate": 1.9923750224828833e-05, + "loss": 0.5785, + "step": 1354 + }, + { + "epoch": 0.1356, + "grad_norm": 4.894640922546387, + "learning_rate": 1.992288732375458e-05, + "loss": 0.511, + "step": 1356 + }, + { + "epoch": 0.1358, + "grad_norm": 1.3244742155075073, + "learning_rate": 1.9922019586385587e-05, + "loss": 0.0584, + "step": 1358 + }, + { + "epoch": 0.136, + "grad_norm": 1.8731356859207153, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.1063, + "step": 1360 + }, + { + "epoch": 0.1362, + "grad_norm": 0.3128760755062103, + "learning_rate": 1.9920269604457444e-05, + "loss": 0.1954, + "step": 1362 + }, + { + "epoch": 0.1364, + "grad_norm": 1.081976056098938, + "learning_rate": 1.9919387360751216e-05, + "loss": 0.0546, + "step": 1364 + }, + { + "epoch": 0.1366, + "grad_norm": 2.917098045349121, + "learning_rate": 1.991850028245609e-05, + "loss": 0.1876, + "step": 1366 + }, + { + "epoch": 0.1368, + "grad_norm": 1.1691385507583618, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.1578, + "step": 1368 + }, + { + "epoch": 0.137, + "grad_norm": 0.5017140507698059, + "learning_rate": 1.9916711623830904e-05, + "loss": 0.2335, + "step": 1370 + }, + { + "epoch": 0.1372, + "grad_norm": 4.038936138153076, + "learning_rate": 1.9915810044372618e-05, + "loss": 0.384, + "step": 1372 + }, + { + "epoch": 0.1374, + "grad_norm": 3.8408236503601074, + "learning_rate": 1.9914903632068975e-05, + "loss": 0.2884, + "step": 1374 + }, + { + "epoch": 0.1376, + "grad_norm": 2.549910545349121, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.1968, + "step": 1376 + }, + { + "epoch": 0.1378, + "grad_norm": 2.5776498317718506, + "learning_rate": 1.9913076310695068e-05, + "loss": 0.7194, + "step": 1378 + }, + { + "epoch": 0.138, + "grad_norm": 2.939225435256958, + "learning_rate": 1.991215540251542e-05, + "loss": 0.3662, + "step": 1380 + }, + { + "epoch": 0.1382, + "grad_norm": 5.966836929321289, + "learning_rate": 1.991122966327164e-05, + "loss": 0.2981, + "step": 1382 + }, + { + "epoch": 0.1384, + "grad_norm": 0.09505891054868698, + "learning_rate": 1.991029909341493e-05, + "loss": 0.0805, + "step": 1384 + }, + { + "epoch": 0.1386, + "grad_norm": 1.3459240198135376, + "learning_rate": 1.9909363693398828e-05, + "loss": 0.076, + "step": 1386 + }, + { + "epoch": 0.1388, + "grad_norm": 1.947648286819458, + "learning_rate": 1.9908423463679246e-05, + "loss": 0.294, + "step": 1388 + }, + { + "epoch": 0.139, + "grad_norm": 0.028451183810830116, + "learning_rate": 1.9907478404714438e-05, + "loss": 0.0514, + "step": 1390 + }, + { + "epoch": 0.1392, + "grad_norm": 3.398876905441284, + "learning_rate": 1.990652851696501e-05, + "loss": 0.2771, + "step": 1392 + }, + { + "epoch": 0.1394, + "grad_norm": 9.974687576293945, + "learning_rate": 1.990557380089393e-05, + "loss": 0.3337, + "step": 1394 + }, + { + "epoch": 0.1396, + "grad_norm": 2.080087184906006, + "learning_rate": 1.9904614256966514e-05, + "loss": 0.094, + "step": 1396 + }, + { + "epoch": 0.1398, + "grad_norm": 2.082339286804199, + "learning_rate": 1.990364988565043e-05, + "loss": 0.1282, + "step": 1398 + }, + { + "epoch": 0.14, + "grad_norm": 1.3997812271118164, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.2421, + "step": 1400 + }, + { + "epoch": 0.1402, + "grad_norm": 4.3650922775268555, + "learning_rate": 1.990170666273471e-05, + "loss": 0.266, + "step": 1402 + }, + { + "epoch": 0.1404, + "grad_norm": 0.019374210387468338, + "learning_rate": 1.9900727812082177e-05, + "loss": 0.0643, + "step": 1404 + }, + { + "epoch": 0.1406, + "grad_norm": 4.7919464111328125, + "learning_rate": 1.989974413593518e-05, + "loss": 0.2242, + "step": 1406 + }, + { + "epoch": 0.1408, + "grad_norm": 0.9335508942604065, + "learning_rate": 1.989875563477316e-05, + "loss": 0.0975, + "step": 1408 + }, + { + "epoch": 0.141, + "grad_norm": 4.366420745849609, + "learning_rate": 1.989776230907789e-05, + "loss": 0.2236, + "step": 1410 + }, + { + "epoch": 0.1412, + "grad_norm": 7.192631721496582, + "learning_rate": 1.989676415933351e-05, + "loss": 0.3732, + "step": 1412 + }, + { + "epoch": 0.1414, + "grad_norm": 0.0033727081026881933, + "learning_rate": 1.989576118602651e-05, + "loss": 0.1198, + "step": 1414 + }, + { + "epoch": 0.1416, + "grad_norm": 0.1634206473827362, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.0052, + "step": 1416 + }, + { + "epoch": 0.1418, + "grad_norm": 5.796917915344238, + "learning_rate": 1.9893740770682334e-05, + "loss": 0.2858, + "step": 1418 + }, + { + "epoch": 0.142, + "grad_norm": 1.7806518077850342, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.027, + "step": 1420 + }, + { + "epoch": 0.1422, + "grad_norm": 9.39648723602295, + "learning_rate": 1.9891701066984264e-05, + "loss": 0.5113, + "step": 1422 + }, + { + "epoch": 0.1424, + "grad_norm": 0.057938843965530396, + "learning_rate": 1.9890673983243708e-05, + "loss": 0.0185, + "step": 1424 + }, + { + "epoch": 0.1426, + "grad_norm": 0.06769731640815735, + "learning_rate": 1.9889642078908805e-05, + "loss": 0.0032, + "step": 1426 + }, + { + "epoch": 0.1428, + "grad_norm": 12.803485870361328, + "learning_rate": 1.9888605354482494e-05, + "loss": 0.8554, + "step": 1428 + }, + { + "epoch": 0.143, + "grad_norm": 2.690295457839966, + "learning_rate": 1.988756381047006e-05, + "loss": 0.0443, + "step": 1430 + }, + { + "epoch": 0.1432, + "grad_norm": 0.590094268321991, + "learning_rate": 1.988651744737914e-05, + "loss": 0.0165, + "step": 1432 + }, + { + "epoch": 0.1434, + "grad_norm": 2.412403106689453, + "learning_rate": 1.9885466265719723e-05, + "loss": 0.0571, + "step": 1434 + }, + { + "epoch": 0.1436, + "grad_norm": 0.004821226466447115, + "learning_rate": 1.9884410266004134e-05, + "loss": 0.0363, + "step": 1436 + }, + { + "epoch": 0.1438, + "grad_norm": 30.626766204833984, + "learning_rate": 1.988334944874706e-05, + "loss": 1.969, + "step": 1438 + }, + { + "epoch": 0.144, + "grad_norm": 1.8576127290725708, + "learning_rate": 1.988228381446553e-05, + "loss": 0.0577, + "step": 1440 + }, + { + "epoch": 0.1442, + "grad_norm": 1.1660624742507935, + "learning_rate": 1.988121336367892e-05, + "loss": 0.0875, + "step": 1442 + }, + { + "epoch": 0.1444, + "grad_norm": 3.105055093765259, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.1555, + "step": 1444 + }, + { + "epoch": 0.1446, + "grad_norm": 7.1764445304870605, + "learning_rate": 1.9879058014679704e-05, + "loss": 0.3719, + "step": 1446 + }, + { + "epoch": 0.1448, + "grad_norm": 0.9168652296066284, + "learning_rate": 1.987797311751759e-05, + "loss": 0.1669, + "step": 1448 + }, + { + "epoch": 0.145, + "grad_norm": 1.4997260570526123, + "learning_rate": 1.9876883405951378e-05, + "loss": 0.2634, + "step": 1450 + }, + { + "epoch": 0.1452, + "grad_norm": 1.8381479978561401, + "learning_rate": 1.9875788880512183e-05, + "loss": 0.1969, + "step": 1452 + }, + { + "epoch": 0.1454, + "grad_norm": 11.696943283081055, + "learning_rate": 1.9874689541733455e-05, + "loss": 0.4188, + "step": 1454 + }, + { + "epoch": 0.1456, + "grad_norm": 0.9196287393569946, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.079, + "step": 1456 + }, + { + "epoch": 0.1458, + "grad_norm": 1.472731351852417, + "learning_rate": 1.9872476426302983e-05, + "loss": 0.1503, + "step": 1458 + }, + { + "epoch": 0.146, + "grad_norm": 5.7633185386657715, + "learning_rate": 1.987136265072988e-05, + "loss": 0.7784, + "step": 1460 + }, + { + "epoch": 0.1462, + "grad_norm": 17.18065643310547, + "learning_rate": 1.987024406397454e-05, + "loss": 0.7173, + "step": 1462 + }, + { + "epoch": 0.1464, + "grad_norm": 12.929783821105957, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.497, + "step": 1464 + }, + { + "epoch": 0.1466, + "grad_norm": 1.2644630670547485, + "learning_rate": 1.986799245910024e-05, + "loss": 0.1864, + "step": 1466 + }, + { + "epoch": 0.1468, + "grad_norm": 3.6712048053741455, + "learning_rate": 1.986685944207868e-05, + "loss": 0.4937, + "step": 1468 + }, + { + "epoch": 0.147, + "grad_norm": 4.558094501495361, + "learning_rate": 1.9865721616069695e-05, + "loss": 0.5774, + "step": 1470 + }, + { + "epoch": 0.1472, + "grad_norm": 3.0761606693267822, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.5931, + "step": 1472 + }, + { + "epoch": 0.1474, + "grad_norm": 2.1664791107177734, + "learning_rate": 1.9863431539310033e-05, + "loss": 0.329, + "step": 1474 + }, + { + "epoch": 0.1476, + "grad_norm": 4.522461891174316, + "learning_rate": 1.986227928967551e-05, + "loss": 0.4055, + "step": 1476 + }, + { + "epoch": 0.1478, + "grad_norm": 1.7971429824829102, + "learning_rate": 1.9861122233285873e-05, + "loss": 0.185, + "step": 1478 + }, + { + "epoch": 0.148, + "grad_norm": 1.1392816305160522, + "learning_rate": 1.985996037070505e-05, + "loss": 0.2802, + "step": 1480 + }, + { + "epoch": 0.1482, + "grad_norm": 3.1489346027374268, + "learning_rate": 1.9858793702499322e-05, + "loss": 0.2398, + "step": 1482 + }, + { + "epoch": 0.1484, + "grad_norm": 1.506956696510315, + "learning_rate": 1.9857622229237315e-05, + "loss": 0.2046, + "step": 1484 + }, + { + "epoch": 0.1486, + "grad_norm": 3.1549041271209717, + "learning_rate": 1.9856445951489984e-05, + "loss": 0.4631, + "step": 1486 + }, + { + "epoch": 0.1488, + "grad_norm": 1.1688741445541382, + "learning_rate": 1.985526486983063e-05, + "loss": 0.2885, + "step": 1488 + }, + { + "epoch": 0.149, + "grad_norm": 1.551353096961975, + "learning_rate": 1.9854078984834904e-05, + "loss": 0.198, + "step": 1490 + }, + { + "epoch": 0.1492, + "grad_norm": 3.8020401000976562, + "learning_rate": 1.985288829708079e-05, + "loss": 0.4165, + "step": 1492 + }, + { + "epoch": 0.1494, + "grad_norm": 1.9078748226165771, + "learning_rate": 1.9851692807148612e-05, + "loss": 0.3587, + "step": 1494 + }, + { + "epoch": 0.1496, + "grad_norm": 3.0070862770080566, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.2969, + "step": 1496 + }, + { + "epoch": 0.1498, + "grad_norm": 0.8299723267555237, + "learning_rate": 1.984928742308308e-05, + "loss": 0.2672, + "step": 1498 + }, + { + "epoch": 0.15, + "grad_norm": 1.6822665929794312, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.2406, + "step": 1500 + }, + { + "epoch": 0.1502, + "grad_norm": 3.0565226078033447, + "learning_rate": 1.9846862837327733e-05, + "loss": 0.2979, + "step": 1502 + }, + { + "epoch": 0.1504, + "grad_norm": 2.247175693511963, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.2548, + "step": 1504 + }, + { + "epoch": 0.1506, + "grad_norm": 1.9976131916046143, + "learning_rate": 1.9844419054609418e-05, + "loss": 0.2548, + "step": 1506 + }, + { + "epoch": 0.1508, + "grad_norm": 3.588804244995117, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.6414, + "step": 1508 + }, + { + "epoch": 0.151, + "grad_norm": 2.832587242126465, + "learning_rate": 1.984195607969242e-05, + "loss": 0.4714, + "step": 1510 + }, + { + "epoch": 0.1512, + "grad_norm": 2.203768730163574, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.1562, + "step": 1512 + }, + { + "epoch": 0.1514, + "grad_norm": 2.4130680561065674, + "learning_rate": 1.9839473917378432e-05, + "loss": 0.401, + "step": 1514 + }, + { + "epoch": 0.1516, + "grad_norm": 1.0958037376403809, + "learning_rate": 1.983822564245833e-05, + "loss": 0.2483, + "step": 1516 + }, + { + "epoch": 0.1518, + "grad_norm": 5.898284435272217, + "learning_rate": 1.9836972572506557e-05, + "loss": 0.3206, + "step": 1518 + }, + { + "epoch": 0.152, + "grad_norm": 2.1390392780303955, + "learning_rate": 1.983571470813386e-05, + "loss": 0.2339, + "step": 1520 + }, + { + "epoch": 0.1522, + "grad_norm": 6.76456880569458, + "learning_rate": 1.98344520499533e-05, + "loss": 0.3838, + "step": 1522 + }, + { + "epoch": 0.1524, + "grad_norm": 3.1156163215637207, + "learning_rate": 1.983318459858028e-05, + "loss": 0.3573, + "step": 1524 + }, + { + "epoch": 0.1526, + "grad_norm": 1.9338505268096924, + "learning_rate": 1.9831912354632537e-05, + "loss": 0.1835, + "step": 1526 + }, + { + "epoch": 0.1528, + "grad_norm": 2.1358370780944824, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.2251, + "step": 1528 + }, + { + "epoch": 0.153, + "grad_norm": 2.8573877811431885, + "learning_rate": 1.9829353491495545e-05, + "loss": 0.1836, + "step": 1530 + }, + { + "epoch": 0.1532, + "grad_norm": 8.940711975097656, + "learning_rate": 1.982806687355345e-05, + "loss": 0.279, + "step": 1532 + }, + { + "epoch": 0.1534, + "grad_norm": 12.19266128540039, + "learning_rate": 1.982677546553095e-05, + "loss": 0.4935, + "step": 1534 + }, + { + "epoch": 0.1536, + "grad_norm": 0.9413098692893982, + "learning_rate": 1.982547926805747e-05, + "loss": 0.4725, + "step": 1536 + }, + { + "epoch": 0.1538, + "grad_norm": 4.005331993103027, + "learning_rate": 1.9824178281764753e-05, + "loss": 0.311, + "step": 1538 + }, + { + "epoch": 0.154, + "grad_norm": 2.7039694786071777, + "learning_rate": 1.982287250728689e-05, + "loss": 0.3165, + "step": 1540 + }, + { + "epoch": 0.1542, + "grad_norm": 6.060277938842773, + "learning_rate": 1.9821561945260292e-05, + "loss": 0.2096, + "step": 1542 + }, + { + "epoch": 0.1544, + "grad_norm": 4.918139934539795, + "learning_rate": 1.982024659632372e-05, + "loss": 0.5119, + "step": 1544 + }, + { + "epoch": 0.1546, + "grad_norm": 1.1218396425247192, + "learning_rate": 1.9818926461118254e-05, + "loss": 0.1681, + "step": 1546 + }, + { + "epoch": 0.1548, + "grad_norm": 3.193166732788086, + "learning_rate": 1.981760154028731e-05, + "loss": 0.4849, + "step": 1548 + }, + { + "epoch": 0.155, + "grad_norm": 4.193605899810791, + "learning_rate": 1.9816271834476642e-05, + "loss": 0.3056, + "step": 1550 + }, + { + "epoch": 0.1552, + "grad_norm": 1.5969318151474, + "learning_rate": 1.981493734433433e-05, + "loss": 0.3846, + "step": 1552 + }, + { + "epoch": 0.1554, + "grad_norm": 1.4828364849090576, + "learning_rate": 1.981359807051079e-05, + "loss": 0.4157, + "step": 1554 + }, + { + "epoch": 0.1556, + "grad_norm": 7.221129417419434, + "learning_rate": 1.981225401365877e-05, + "loss": 0.427, + "step": 1556 + }, + { + "epoch": 0.1558, + "grad_norm": 2.8227264881134033, + "learning_rate": 1.981090517443334e-05, + "loss": 0.4526, + "step": 1558 + }, + { + "epoch": 0.156, + "grad_norm": 2.7908616065979004, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.2966, + "step": 1560 + }, + { + "epoch": 0.1562, + "grad_norm": 5.243305206298828, + "learning_rate": 1.9808193151494233e-05, + "loss": 0.2207, + "step": 1562 + }, + { + "epoch": 0.1564, + "grad_norm": 0.8531282544136047, + "learning_rate": 1.9806829969102356e-05, + "loss": 0.4045, + "step": 1564 + }, + { + "epoch": 0.1566, + "grad_norm": 0.9345164895057678, + "learning_rate": 1.9805462006980688e-05, + "loss": 0.1106, + "step": 1566 + }, + { + "epoch": 0.1568, + "grad_norm": 1.0322470664978027, + "learning_rate": 1.980408926579596e-05, + "loss": 0.163, + "step": 1568 + }, + { + "epoch": 0.157, + "grad_norm": 0.7693102359771729, + "learning_rate": 1.9802711746217222e-05, + "loss": 0.1342, + "step": 1570 + }, + { + "epoch": 0.1572, + "grad_norm": 0.41420799493789673, + "learning_rate": 1.9801329448915863e-05, + "loss": 0.4219, + "step": 1572 + }, + { + "epoch": 0.1574, + "grad_norm": 1.1019107103347778, + "learning_rate": 1.9799942374565597e-05, + "loss": 0.5703, + "step": 1574 + }, + { + "epoch": 0.1576, + "grad_norm": 1.2007523775100708, + "learning_rate": 1.979855052384247e-05, + "loss": 0.4324, + "step": 1576 + }, + { + "epoch": 0.1578, + "grad_norm": 2.1199240684509277, + "learning_rate": 1.9797153897424854e-05, + "loss": 0.3202, + "step": 1578 + }, + { + "epoch": 0.158, + "grad_norm": 1.3200700283050537, + "learning_rate": 1.979575249599344e-05, + "loss": 0.2796, + "step": 1580 + }, + { + "epoch": 0.1582, + "grad_norm": 1.2451176643371582, + "learning_rate": 1.9794346320231265e-05, + "loss": 0.2434, + "step": 1582 + }, + { + "epoch": 0.1584, + "grad_norm": 3.0071558952331543, + "learning_rate": 1.9792935370823676e-05, + "loss": 0.289, + "step": 1584 + }, + { + "epoch": 0.1586, + "grad_norm": 1.2129672765731812, + "learning_rate": 1.9791519648458352e-05, + "loss": 0.2962, + "step": 1586 + }, + { + "epoch": 0.1588, + "grad_norm": 3.2925422191619873, + "learning_rate": 1.97900991538253e-05, + "loss": 0.473, + "step": 1588 + }, + { + "epoch": 0.159, + "grad_norm": 1.7789535522460938, + "learning_rate": 1.9788673887616852e-05, + "loss": 0.2536, + "step": 1590 + }, + { + "epoch": 0.1592, + "grad_norm": 1.2831358909606934, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.2084, + "step": 1592 + }, + { + "epoch": 0.1594, + "grad_norm": 3.914043426513672, + "learning_rate": 1.978580904325472e-05, + "loss": 0.2993, + "step": 1594 + }, + { + "epoch": 0.1596, + "grad_norm": 3.8544373512268066, + "learning_rate": 1.9784369466497333e-05, + "loss": 0.4852, + "step": 1596 + }, + { + "epoch": 0.1598, + "grad_norm": 2.7286629676818848, + "learning_rate": 1.9782925120957123e-05, + "loss": 0.3651, + "step": 1598 + }, + { + "epoch": 0.16, + "grad_norm": 2.1046531200408936, + "learning_rate": 1.9781476007338058e-05, + "loss": 0.3068, + "step": 1600 + }, + { + "epoch": 0.1602, + "grad_norm": 4.084563255310059, + "learning_rate": 1.9780022126346413e-05, + "loss": 0.3531, + "step": 1602 + }, + { + "epoch": 0.1604, + "grad_norm": 2.7153377532958984, + "learning_rate": 1.977856347869079e-05, + "loss": 0.2341, + "step": 1604 + }, + { + "epoch": 0.1606, + "grad_norm": 3.0560450553894043, + "learning_rate": 1.977710006508212e-05, + "loss": 0.3709, + "step": 1606 + }, + { + "epoch": 0.1608, + "grad_norm": 0.9854336380958557, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.1542, + "step": 1608 + }, + { + "epoch": 0.161, + "grad_norm": 3.0378315448760986, + "learning_rate": 1.9774158942860962e-05, + "loss": 0.3854, + "step": 1610 + }, + { + "epoch": 0.1612, + "grad_norm": 1.7450065612792969, + "learning_rate": 1.9772681235681936e-05, + "loss": 0.3934, + "step": 1612 + }, + { + "epoch": 0.1614, + "grad_norm": 1.3985697031021118, + "learning_rate": 1.97711987654168e-05, + "loss": 0.1091, + "step": 1614 + }, + { + "epoch": 0.1616, + "grad_norm": 4.88847017288208, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.3139, + "step": 1616 + }, + { + "epoch": 0.1618, + "grad_norm": 1.611147165298462, + "learning_rate": 1.976821953852065e-05, + "loss": 0.1838, + "step": 1618 + }, + { + "epoch": 0.162, + "grad_norm": 4.096453666687012, + "learning_rate": 1.9766722783341682e-05, + "loss": 0.2318, + "step": 1620 + }, + { + "epoch": 0.1622, + "grad_norm": 2.8079702854156494, + "learning_rate": 1.9765221267980675e-05, + "loss": 0.7501, + "step": 1622 + }, + { + "epoch": 0.1624, + "grad_norm": 1.018310546875, + "learning_rate": 1.976371499316945e-05, + "loss": 0.1656, + "step": 1624 + }, + { + "epoch": 0.1626, + "grad_norm": 0.6901879906654358, + "learning_rate": 1.976220395964215e-05, + "loss": 0.301, + "step": 1626 + }, + { + "epoch": 0.1628, + "grad_norm": 5.263514995574951, + "learning_rate": 1.9760688168135233e-05, + "loss": 0.4604, + "step": 1628 + }, + { + "epoch": 0.163, + "grad_norm": 1.400829792022705, + "learning_rate": 1.9759167619387474e-05, + "loss": 0.1965, + "step": 1630 + }, + { + "epoch": 0.1632, + "grad_norm": 2.3807971477508545, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.2154, + "step": 1632 + }, + { + "epoch": 0.1634, + "grad_norm": 1.3182570934295654, + "learning_rate": 1.9756112253136154e-05, + "loss": 0.1086, + "step": 1634 + }, + { + "epoch": 0.1636, + "grad_norm": 3.390212297439575, + "learning_rate": 1.9754577437121733e-05, + "loss": 0.263, + "step": 1636 + }, + { + "epoch": 0.1638, + "grad_norm": 5.501956939697266, + "learning_rate": 1.975303786684477e-05, + "loss": 0.2715, + "step": 1638 + }, + { + "epoch": 0.164, + "grad_norm": 0.8549824953079224, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.1884, + "step": 1640 + }, + { + "epoch": 0.1642, + "grad_norm": 1.40828275680542, + "learning_rate": 1.9749944466507007e-05, + "loss": 0.2232, + "step": 1642 + }, + { + "epoch": 0.1644, + "grad_norm": 12.29401683807373, + "learning_rate": 1.974839063795389e-05, + "loss": 0.4702, + "step": 1644 + }, + { + "epoch": 0.1646, + "grad_norm": 2.6862800121307373, + "learning_rate": 1.9746832058153602e-05, + "loss": 0.4742, + "step": 1646 + }, + { + "epoch": 0.1648, + "grad_norm": 1.8436191082000732, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.3423, + "step": 1648 + }, + { + "epoch": 0.165, + "grad_norm": 0.9999091625213623, + "learning_rate": 1.9743700647852356e-05, + "loss": 0.4772, + "step": 1650 + }, + { + "epoch": 0.1652, + "grad_norm": 1.85992431640625, + "learning_rate": 1.9742127818877605e-05, + "loss": 0.2391, + "step": 1652 + }, + { + "epoch": 0.1654, + "grad_norm": 2.7800650596618652, + "learning_rate": 1.974055024170811e-05, + "loss": 0.2648, + "step": 1654 + }, + { + "epoch": 0.1656, + "grad_norm": 2.8990421295166016, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.3623, + "step": 1656 + }, + { + "epoch": 0.1658, + "grad_norm": 3.8204612731933594, + "learning_rate": 1.9737380845862745e-05, + "loss": 0.3397, + "step": 1658 + }, + { + "epoch": 0.166, + "grad_norm": 2.406428098678589, + "learning_rate": 1.9735789028731603e-05, + "loss": 0.3729, + "step": 1660 + }, + { + "epoch": 0.1662, + "grad_norm": 3.8778858184814453, + "learning_rate": 1.9734192466495162e-05, + "loss": 0.3491, + "step": 1662 + }, + { + "epoch": 0.1664, + "grad_norm": 3.1791062355041504, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.3811, + "step": 1664 + }, + { + "epoch": 0.1666, + "grad_norm": 0.6689931154251099, + "learning_rate": 1.9730985109821268e-05, + "loss": 0.1246, + "step": 1666 + }, + { + "epoch": 0.1668, + "grad_norm": 1.7325247526168823, + "learning_rate": 1.972937431694704e-05, + "loss": 0.2826, + "step": 1668 + }, + { + "epoch": 0.167, + "grad_norm": 0.5912852883338928, + "learning_rate": 1.972775878209397e-05, + "loss": 0.2163, + "step": 1670 + }, + { + "epoch": 0.1672, + "grad_norm": 4.946029186248779, + "learning_rate": 1.9726138506049438e-05, + "loss": 0.2389, + "step": 1672 + }, + { + "epoch": 0.1674, + "grad_norm": 4.266664028167725, + "learning_rate": 1.9724513489603153e-05, + "loss": 0.4746, + "step": 1674 + }, + { + "epoch": 0.1676, + "grad_norm": 1.1441905498504639, + "learning_rate": 1.9722883733547128e-05, + "loss": 0.1454, + "step": 1676 + }, + { + "epoch": 0.1678, + "grad_norm": 0.6392074227333069, + "learning_rate": 1.9721249238675688e-05, + "loss": 0.108, + "step": 1678 + }, + { + "epoch": 0.168, + "grad_norm": 6.0953521728515625, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.3447, + "step": 1680 + }, + { + "epoch": 0.1682, + "grad_norm": 0.5466648936271667, + "learning_rate": 1.97179660356754e-05, + "loss": 0.0969, + "step": 1682 + }, + { + "epoch": 0.1684, + "grad_norm": 3.31120228767395, + "learning_rate": 1.971631732914674e-05, + "loss": 0.4889, + "step": 1684 + }, + { + "epoch": 0.1686, + "grad_norm": 5.490817546844482, + "learning_rate": 1.9714663887003055e-05, + "loss": 0.7067, + "step": 1686 + }, + { + "epoch": 0.1688, + "grad_norm": 2.013009786605835, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.2768, + "step": 1688 + }, + { + "epoch": 0.169, + "grad_norm": 2.1413989067077637, + "learning_rate": 1.971134279909636e-05, + "loss": 0.4444, + "step": 1690 + }, + { + "epoch": 0.1692, + "grad_norm": 1.684460163116455, + "learning_rate": 1.9709675154952017e-05, + "loss": 0.196, + "step": 1692 + }, + { + "epoch": 0.1694, + "grad_norm": 4.373110294342041, + "learning_rate": 1.9708002778429957e-05, + "loss": 0.5287, + "step": 1694 + }, + { + "epoch": 0.1696, + "grad_norm": 1.8100032806396484, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.2472, + "step": 1696 + }, + { + "epoch": 0.1698, + "grad_norm": 2.924220085144043, + "learning_rate": 1.9704643831515377e-05, + "loss": 0.3394, + "step": 1698 + }, + { + "epoch": 0.17, + "grad_norm": 1.932181715965271, + "learning_rate": 1.9702957262759964e-05, + "loss": 0.2398, + "step": 1700 + }, + { + "epoch": 0.1702, + "grad_norm": 3.639941453933716, + "learning_rate": 1.970126596490106e-05, + "loss": 0.3528, + "step": 1702 + }, + { + "epoch": 0.1704, + "grad_norm": 1.6618781089782715, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.2674, + "step": 1704 + }, + { + "epoch": 0.1706, + "grad_norm": 1.3492673635482788, + "learning_rate": 1.969786918517233e-05, + "loss": 0.2953, + "step": 1706 + }, + { + "epoch": 0.1708, + "grad_norm": 3.717430830001831, + "learning_rate": 1.969616370495806e-05, + "loss": 0.29, + "step": 1708 + }, + { + "epoch": 0.171, + "grad_norm": 1.0908899307250977, + "learning_rate": 1.9694453498951392e-05, + "loss": 0.1483, + "step": 1710 + }, + { + "epoch": 0.1712, + "grad_norm": 1.8479453325271606, + "learning_rate": 1.9692738567985853e-05, + "loss": 0.2272, + "step": 1712 + }, + { + "epoch": 0.1714, + "grad_norm": 5.521978855133057, + "learning_rate": 1.9691018912897285e-05, + "loss": 0.3332, + "step": 1714 + }, + { + "epoch": 0.1716, + "grad_norm": 1.9934576749801636, + "learning_rate": 1.968929453452383e-05, + "loss": 0.4163, + "step": 1716 + }, + { + "epoch": 0.1718, + "grad_norm": 3.190218925476074, + "learning_rate": 1.9687565433705926e-05, + "loss": 0.1996, + "step": 1718 + }, + { + "epoch": 0.172, + "grad_norm": 1.3450467586517334, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.1514, + "step": 1720 + }, + { + "epoch": 0.1722, + "grad_norm": 1.4487247467041016, + "learning_rate": 1.968409306811004e-05, + "loss": 0.2081, + "step": 1722 + }, + { + "epoch": 0.1724, + "grad_norm": 4.730585098266602, + "learning_rate": 1.9682349805024447e-05, + "loss": 0.4145, + "step": 1724 + }, + { + "epoch": 0.1726, + "grad_norm": 0.5015780329704285, + "learning_rate": 1.968060182287918e-05, + "loss": 0.26, + "step": 1726 + }, + { + "epoch": 0.1728, + "grad_norm": 2.476351261138916, + "learning_rate": 1.967884912252619e-05, + "loss": 0.2691, + "step": 1728 + }, + { + "epoch": 0.173, + "grad_norm": 2.9409899711608887, + "learning_rate": 1.9677091704819714e-05, + "loss": 0.4077, + "step": 1730 + }, + { + "epoch": 0.1732, + "grad_norm": 2.049117088317871, + "learning_rate": 1.96753295706163e-05, + "loss": 0.1334, + "step": 1732 + }, + { + "epoch": 0.1734, + "grad_norm": 0.21300305426120758, + "learning_rate": 1.9673562720774792e-05, + "loss": 0.0111, + "step": 1734 + }, + { + "epoch": 0.1736, + "grad_norm": 0.14174099266529083, + "learning_rate": 1.967179115615633e-05, + "loss": 0.021, + "step": 1736 + }, + { + "epoch": 0.1738, + "grad_norm": 0.6144225001335144, + "learning_rate": 1.9670014877624353e-05, + "loss": 1.1746, + "step": 1738 + }, + { + "epoch": 0.174, + "grad_norm": 8.191797256469727, + "learning_rate": 1.9668233886044597e-05, + "loss": 1.129, + "step": 1740 + }, + { + "epoch": 0.1742, + "grad_norm": 0.21766650676727295, + "learning_rate": 1.9666448182285095e-05, + "loss": 0.0375, + "step": 1742 + }, + { + "epoch": 0.1744, + "grad_norm": 0.23818178474903107, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.1182, + "step": 1744 + }, + { + "epoch": 0.1746, + "grad_norm": 0.33612170815467834, + "learning_rate": 1.966286264171047e-05, + "loss": 0.0402, + "step": 1746 + }, + { + "epoch": 0.1748, + "grad_norm": 6.028461456298828, + "learning_rate": 1.9661062806642903e-05, + "loss": 0.7083, + "step": 1748 + }, + { + "epoch": 0.175, + "grad_norm": 2.8726863861083984, + "learning_rate": 1.9659258262890683e-05, + "loss": 0.2716, + "step": 1750 + }, + { + "epoch": 0.1752, + "grad_norm": 3.2170355319976807, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.2892, + "step": 1752 + }, + { + "epoch": 0.1754, + "grad_norm": 3.97408127784729, + "learning_rate": 1.9655635052852648e-05, + "loss": 0.3466, + "step": 1754 + }, + { + "epoch": 0.1756, + "grad_norm": 2.32251238822937, + "learning_rate": 1.965381638833274e-05, + "loss": 0.1498, + "step": 1756 + }, + { + "epoch": 0.1758, + "grad_norm": 4.862124919891357, + "learning_rate": 1.9651993018660002e-05, + "loss": 0.4459, + "step": 1758 + }, + { + "epoch": 0.176, + "grad_norm": 5.118532180786133, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.4514, + "step": 1760 + }, + { + "epoch": 0.1762, + "grad_norm": 2.4550209045410156, + "learning_rate": 1.9648332167413067e-05, + "loss": 0.4973, + "step": 1762 + }, + { + "epoch": 0.1764, + "grad_norm": 4.505826950073242, + "learning_rate": 1.9646494687623135e-05, + "loss": 0.7787, + "step": 1764 + }, + { + "epoch": 0.1766, + "grad_norm": 3.2299411296844482, + "learning_rate": 1.9644652506248872e-05, + "loss": 0.2543, + "step": 1766 + }, + { + "epoch": 0.1768, + "grad_norm": 1.5251811742782593, + "learning_rate": 1.964280562418815e-05, + "loss": 0.188, + "step": 1768 + }, + { + "epoch": 0.177, + "grad_norm": 2.6859261989593506, + "learning_rate": 1.96409540423411e-05, + "loss": 0.4168, + "step": 1770 + }, + { + "epoch": 0.1772, + "grad_norm": 2.011946439743042, + "learning_rate": 1.9639097761610174e-05, + "loss": 0.335, + "step": 1772 + }, + { + "epoch": 0.1774, + "grad_norm": 3.4656167030334473, + "learning_rate": 1.96372367829001e-05, + "loss": 0.3717, + "step": 1774 + }, + { + "epoch": 0.1776, + "grad_norm": 2.638150453567505, + "learning_rate": 1.963537110711789e-05, + "loss": 0.2311, + "step": 1776 + }, + { + "epoch": 0.1778, + "grad_norm": 10.94227409362793, + "learning_rate": 1.963350073517285e-05, + "loss": 0.3993, + "step": 1778 + }, + { + "epoch": 0.178, + "grad_norm": 1.3827954530715942, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.2335, + "step": 1780 + }, + { + "epoch": 0.1782, + "grad_norm": 4.098860740661621, + "learning_rate": 1.9629745906442973e-05, + "loss": 0.3241, + "step": 1782 + }, + { + "epoch": 0.1784, + "grad_norm": 2.212566375732422, + "learning_rate": 1.962786145148819e-05, + "loss": 0.3716, + "step": 1784 + }, + { + "epoch": 0.1786, + "grad_norm": 1.2276434898376465, + "learning_rate": 1.9625972304030697e-05, + "loss": 0.3107, + "step": 1786 + }, + { + "epoch": 0.1788, + "grad_norm": 2.220611333847046, + "learning_rate": 1.962407846499124e-05, + "loss": 0.2975, + "step": 1788 + }, + { + "epoch": 0.179, + "grad_norm": 2.4787049293518066, + "learning_rate": 1.9622179935292855e-05, + "loss": 0.2975, + "step": 1790 + }, + { + "epoch": 0.1792, + "grad_norm": 1.326407551765442, + "learning_rate": 1.962027671586086e-05, + "loss": 0.317, + "step": 1792 + }, + { + "epoch": 0.1794, + "grad_norm": 1.0714318752288818, + "learning_rate": 1.9618368807622863e-05, + "loss": 0.082, + "step": 1794 + }, + { + "epoch": 0.1796, + "grad_norm": 1.8859285116195679, + "learning_rate": 1.9616456211508756e-05, + "loss": 0.3346, + "step": 1796 + }, + { + "epoch": 0.1798, + "grad_norm": 1.436424732208252, + "learning_rate": 1.961453892845071e-05, + "loss": 0.1959, + "step": 1798 + }, + { + "epoch": 0.18, + "grad_norm": 4.176055908203125, + "learning_rate": 1.961261695938319e-05, + "loss": 0.3629, + "step": 1800 + }, + { + "epoch": 0.1802, + "grad_norm": 1.8395518064498901, + "learning_rate": 1.961069030524294e-05, + "loss": 0.2542, + "step": 1802 + }, + { + "epoch": 0.1804, + "grad_norm": 2.211261749267578, + "learning_rate": 1.9608758966968987e-05, + "loss": 0.3579, + "step": 1804 + }, + { + "epoch": 0.1806, + "grad_norm": 0.49207645654678345, + "learning_rate": 1.9606822945502642e-05, + "loss": 0.2007, + "step": 1806 + }, + { + "epoch": 0.1808, + "grad_norm": 0.8354406952857971, + "learning_rate": 1.96048822417875e-05, + "loss": 0.2319, + "step": 1808 + }, + { + "epoch": 0.181, + "grad_norm": 4.064897060394287, + "learning_rate": 1.9602936856769432e-05, + "loss": 0.2716, + "step": 1810 + }, + { + "epoch": 0.1812, + "grad_norm": 1.08663809299469, + "learning_rate": 1.96009867913966e-05, + "loss": 0.2832, + "step": 1812 + }, + { + "epoch": 0.1814, + "grad_norm": 3.1925015449523926, + "learning_rate": 1.9599032046619437e-05, + "loss": 0.3346, + "step": 1814 + }, + { + "epoch": 0.1816, + "grad_norm": 2.5724434852600098, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.1622, + "step": 1816 + }, + { + "epoch": 0.1818, + "grad_norm": 2.447040557861328, + "learning_rate": 1.959510852266529e-05, + "loss": 0.2478, + "step": 1818 + }, + { + "epoch": 0.182, + "grad_norm": 2.6632285118103027, + "learning_rate": 1.9593139745400575e-05, + "loss": 0.3535, + "step": 1820 + }, + { + "epoch": 0.1822, + "grad_norm": 2.013444185256958, + "learning_rate": 1.9591166292556093e-05, + "loss": 0.2429, + "step": 1822 + }, + { + "epoch": 0.1824, + "grad_norm": 3.198054790496826, + "learning_rate": 1.958918816509367e-05, + "loss": 0.3708, + "step": 1824 + }, + { + "epoch": 0.1826, + "grad_norm": 0.9017783999443054, + "learning_rate": 1.9587205363977428e-05, + "loss": 0.1341, + "step": 1826 + }, + { + "epoch": 0.1828, + "grad_norm": 8.48698616027832, + "learning_rate": 1.958521789017376e-05, + "loss": 0.5348, + "step": 1828 + }, + { + "epoch": 0.183, + "grad_norm": 9.196399688720703, + "learning_rate": 1.9583225744651334e-05, + "loss": 0.5586, + "step": 1830 + }, + { + "epoch": 0.1832, + "grad_norm": 2.1795856952667236, + "learning_rate": 1.95812289283811e-05, + "loss": 0.6276, + "step": 1832 + }, + { + "epoch": 0.1834, + "grad_norm": 1.0605894327163696, + "learning_rate": 1.9579227442336276e-05, + "loss": 0.1833, + "step": 1834 + }, + { + "epoch": 0.1836, + "grad_norm": 8.42657470703125, + "learning_rate": 1.9577221287492368e-05, + "loss": 0.43, + "step": 1836 + }, + { + "epoch": 0.1838, + "grad_norm": 1.1949400901794434, + "learning_rate": 1.957521046482715e-05, + "loss": 0.0394, + "step": 1838 + }, + { + "epoch": 0.184, + "grad_norm": 3.319857597351074, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.8953, + "step": 1840 + }, + { + "epoch": 0.1842, + "grad_norm": 1.508025884628296, + "learning_rate": 1.9571174819955264e-05, + "loss": 0.5361, + "step": 1842 + }, + { + "epoch": 0.1844, + "grad_norm": 1.8951834440231323, + "learning_rate": 1.9569149999715514e-05, + "loss": 0.2028, + "step": 1844 + }, + { + "epoch": 0.1846, + "grad_norm": 1.1553112268447876, + "learning_rate": 1.9567120515588307e-05, + "loss": 0.0716, + "step": 1846 + }, + { + "epoch": 0.1848, + "grad_norm": 2.803413152694702, + "learning_rate": 1.956508636856278e-05, + "loss": 0.3075, + "step": 1848 + }, + { + "epoch": 0.185, + "grad_norm": 1.6093330383300781, + "learning_rate": 1.9563047559630356e-05, + "loss": 0.4178, + "step": 1850 + }, + { + "epoch": 0.1852, + "grad_norm": 0.859183669090271, + "learning_rate": 1.9561004089784726e-05, + "loss": 0.2203, + "step": 1852 + }, + { + "epoch": 0.1854, + "grad_norm": 1.1464929580688477, + "learning_rate": 1.9558955960021847e-05, + "loss": 0.1222, + "step": 1854 + }, + { + "epoch": 0.1856, + "grad_norm": 2.0434157848358154, + "learning_rate": 1.9556903171339963e-05, + "loss": 0.2675, + "step": 1856 + }, + { + "epoch": 0.1858, + "grad_norm": 1.771394968032837, + "learning_rate": 1.9554845724739565e-05, + "loss": 0.3965, + "step": 1858 + }, + { + "epoch": 0.186, + "grad_norm": 2.5208611488342285, + "learning_rate": 1.9552783621223437e-05, + "loss": 0.3232, + "step": 1860 + }, + { + "epoch": 0.1862, + "grad_norm": 2.8431284427642822, + "learning_rate": 1.9550716861796623e-05, + "loss": 0.3022, + "step": 1862 + }, + { + "epoch": 0.1864, + "grad_norm": 3.32989764213562, + "learning_rate": 1.9548645447466433e-05, + "loss": 0.248, + "step": 1864 + }, + { + "epoch": 0.1866, + "grad_norm": 1.3159457445144653, + "learning_rate": 1.9546569379242446e-05, + "loss": 0.1876, + "step": 1866 + }, + { + "epoch": 0.1868, + "grad_norm": 2.01220440864563, + "learning_rate": 1.9544488658136522e-05, + "loss": 0.3531, + "step": 1868 + }, + { + "epoch": 0.187, + "grad_norm": 2.909235954284668, + "learning_rate": 1.954240328516277e-05, + "loss": 0.3733, + "step": 1870 + }, + { + "epoch": 0.1872, + "grad_norm": 5.561924457550049, + "learning_rate": 1.954031326133758e-05, + "loss": 0.3403, + "step": 1872 + }, + { + "epoch": 0.1874, + "grad_norm": 0.6475815176963806, + "learning_rate": 1.9538218587679605e-05, + "loss": 0.1566, + "step": 1874 + }, + { + "epoch": 0.1876, + "grad_norm": 7.410195350646973, + "learning_rate": 1.9536119265209763e-05, + "loss": 0.4527, + "step": 1876 + }, + { + "epoch": 0.1878, + "grad_norm": 5.607536315917969, + "learning_rate": 1.9534015294951235e-05, + "loss": 0.3399, + "step": 1878 + }, + { + "epoch": 0.188, + "grad_norm": 1.8265537023544312, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.2382, + "step": 1880 + }, + { + "epoch": 0.1882, + "grad_norm": 5.324478626251221, + "learning_rate": 1.952979341517219e-05, + "loss": 0.2543, + "step": 1882 + }, + { + "epoch": 0.1884, + "grad_norm": 2.792299747467041, + "learning_rate": 1.9527675507709368e-05, + "loss": 0.3969, + "step": 1884 + }, + { + "epoch": 0.1886, + "grad_norm": 0.7094467282295227, + "learning_rate": 1.9525552956573244e-05, + "loss": 0.1208, + "step": 1886 + }, + { + "epoch": 0.1888, + "grad_norm": 0.4998934864997864, + "learning_rate": 1.9523425762798328e-05, + "loss": 0.0484, + "step": 1888 + }, + { + "epoch": 0.189, + "grad_norm": 0.5838341116905212, + "learning_rate": 1.9521293927421388e-05, + "loss": 0.1127, + "step": 1890 + }, + { + "epoch": 0.1892, + "grad_norm": 0.4313047528266907, + "learning_rate": 1.9519157451481453e-05, + "loss": 0.0642, + "step": 1892 + }, + { + "epoch": 0.1894, + "grad_norm": 1.8632011413574219, + "learning_rate": 1.9517016336019817e-05, + "loss": 0.2233, + "step": 1894 + }, + { + "epoch": 0.1896, + "grad_norm": 1.7288683652877808, + "learning_rate": 1.951487058208003e-05, + "loss": 0.7258, + "step": 1896 + }, + { + "epoch": 0.1898, + "grad_norm": 0.6498861908912659, + "learning_rate": 1.9512720190707915e-05, + "loss": 0.4593, + "step": 1898 + }, + { + "epoch": 0.19, + "grad_norm": 2.036729574203491, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.1794, + "step": 1900 + }, + { + "epoch": 0.1902, + "grad_norm": 3.0258665084838867, + "learning_rate": 1.9508405499861235e-05, + "loss": 0.3069, + "step": 1902 + }, + { + "epoch": 0.1904, + "grad_norm": 4.652884006500244, + "learning_rate": 1.95062412024896e-05, + "loss": 0.2362, + "step": 1904 + }, + { + "epoch": 0.1906, + "grad_norm": 11.214337348937988, + "learning_rate": 1.9504072271891486e-05, + "loss": 0.4895, + "step": 1906 + }, + { + "epoch": 0.1908, + "grad_norm": 0.4942970871925354, + "learning_rate": 1.950189870912401e-05, + "loss": 0.2109, + "step": 1908 + }, + { + "epoch": 0.191, + "grad_norm": 1.9469482898712158, + "learning_rate": 1.9499720515246524e-05, + "loss": 0.1443, + "step": 1910 + }, + { + "epoch": 0.1912, + "grad_norm": 7.804234981536865, + "learning_rate": 1.949753769132067e-05, + "loss": 0.301, + "step": 1912 + }, + { + "epoch": 0.1914, + "grad_norm": 0.5478846430778503, + "learning_rate": 1.949535023841032e-05, + "loss": 0.1948, + "step": 1914 + }, + { + "epoch": 0.1916, + "grad_norm": 0.17917288839817047, + "learning_rate": 1.9493158157581617e-05, + "loss": 0.1814, + "step": 1916 + }, + { + "epoch": 0.1918, + "grad_norm": 4.5715460777282715, + "learning_rate": 1.9490961449902946e-05, + "loss": 0.3966, + "step": 1918 + }, + { + "epoch": 0.192, + "grad_norm": 4.014777660369873, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.0875, + "step": 1920 + }, + { + "epoch": 0.1922, + "grad_norm": 2.9256680011749268, + "learning_rate": 1.9486554158280576e-05, + "loss": 0.5458, + "step": 1922 + }, + { + "epoch": 0.1924, + "grad_norm": 4.948172569274902, + "learning_rate": 1.9484343576484935e-05, + "loss": 0.5975, + "step": 1924 + }, + { + "epoch": 0.1926, + "grad_norm": 0.9122314453125, + "learning_rate": 1.9482128372135446e-05, + "loss": 0.3474, + "step": 1926 + }, + { + "epoch": 0.1928, + "grad_norm": 3.249664783477783, + "learning_rate": 1.9479908546311783e-05, + "loss": 0.1365, + "step": 1928 + }, + { + "epoch": 0.193, + "grad_norm": 0.5036922097206116, + "learning_rate": 1.947768410009586e-05, + "loss": 0.1682, + "step": 1930 + }, + { + "epoch": 0.1932, + "grad_norm": 2.9884300231933594, + "learning_rate": 1.947545503457184e-05, + "loss": 0.1452, + "step": 1932 + }, + { + "epoch": 0.1934, + "grad_norm": 0.3490000069141388, + "learning_rate": 1.9473221350826145e-05, + "loss": 0.0219, + "step": 1934 + }, + { + "epoch": 0.1936, + "grad_norm": 0.834568202495575, + "learning_rate": 1.9470983049947446e-05, + "loss": 0.0811, + "step": 1936 + }, + { + "epoch": 0.1938, + "grad_norm": 1.1328246593475342, + "learning_rate": 1.946874013302666e-05, + "loss": 0.3221, + "step": 1938 + }, + { + "epoch": 0.194, + "grad_norm": 1.3655518293380737, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.5573, + "step": 1940 + }, + { + "epoch": 0.1942, + "grad_norm": 1.4792650938034058, + "learning_rate": 1.9464240455433775e-05, + "loss": 0.5247, + "step": 1942 + }, + { + "epoch": 0.1944, + "grad_norm": 9.961508750915527, + "learning_rate": 1.946198369695476e-05, + "loss": 0.4159, + "step": 1944 + }, + { + "epoch": 0.1946, + "grad_norm": 1.1929376125335693, + "learning_rate": 1.945972232681984e-05, + "loss": 0.0886, + "step": 1946 + }, + { + "epoch": 0.1948, + "grad_norm": 3.923212766647339, + "learning_rate": 1.945745634613117e-05, + "loss": 0.3532, + "step": 1948 + }, + { + "epoch": 0.195, + "grad_norm": 1.2186851501464844, + "learning_rate": 1.945518575599317e-05, + "loss": 0.3153, + "step": 1950 + }, + { + "epoch": 0.1952, + "grad_norm": 1.8021100759506226, + "learning_rate": 1.9452910557512497e-05, + "loss": 0.1129, + "step": 1952 + }, + { + "epoch": 0.1954, + "grad_norm": 8.920308113098145, + "learning_rate": 1.945063075179805e-05, + "loss": 0.6946, + "step": 1954 + }, + { + "epoch": 0.1956, + "grad_norm": 1.535203218460083, + "learning_rate": 1.9448346339960984e-05, + "loss": 0.1665, + "step": 1956 + }, + { + "epoch": 0.1958, + "grad_norm": 0.50552898645401, + "learning_rate": 1.944605732311469e-05, + "loss": 0.249, + "step": 1958 + }, + { + "epoch": 0.196, + "grad_norm": 4.8526177406311035, + "learning_rate": 1.944376370237481e-05, + "loss": 0.5473, + "step": 1960 + }, + { + "epoch": 0.1962, + "grad_norm": 1.3613243103027344, + "learning_rate": 1.944146547885923e-05, + "loss": 0.2036, + "step": 1962 + }, + { + "epoch": 0.1964, + "grad_norm": 0.20680193603038788, + "learning_rate": 1.9439162653688066e-05, + "loss": 0.022, + "step": 1964 + }, + { + "epoch": 0.1966, + "grad_norm": 4.232306957244873, + "learning_rate": 1.9436855227983695e-05, + "loss": 0.2023, + "step": 1966 + }, + { + "epoch": 0.1968, + "grad_norm": 0.49645450711250305, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.1384, + "step": 1968 + }, + { + "epoch": 0.197, + "grad_norm": 0.2080843448638916, + "learning_rate": 1.943222657947601e-05, + "loss": 0.0337, + "step": 1970 + }, + { + "epoch": 0.1972, + "grad_norm": 0.09921710938215256, + "learning_rate": 1.9429905358928648e-05, + "loss": 0.0404, + "step": 1972 + }, + { + "epoch": 0.1974, + "grad_norm": 0.9430707097053528, + "learning_rate": 1.9427579542359966e-05, + "loss": 0.0599, + "step": 1974 + }, + { + "epoch": 0.1976, + "grad_norm": 0.8971462845802307, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.4562, + "step": 1976 + }, + { + "epoch": 0.1978, + "grad_norm": 4.605910301208496, + "learning_rate": 1.942291412569519e-05, + "loss": 0.2827, + "step": 1978 + }, + { + "epoch": 0.198, + "grad_norm": 0.6378897428512573, + "learning_rate": 1.942057452787297e-05, + "loss": 0.7732, + "step": 1980 + }, + { + "epoch": 0.1982, + "grad_norm": 9.134836196899414, + "learning_rate": 1.9418230338577164e-05, + "loss": 0.3802, + "step": 1982 + }, + { + "epoch": 0.1984, + "grad_norm": 2.806412935256958, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.2635, + "step": 1984 + }, + { + "epoch": 0.1986, + "grad_norm": 0.4183618724346161, + "learning_rate": 1.9413528190137158e-05, + "loss": 0.42, + "step": 1986 + }, + { + "epoch": 0.1988, + "grad_norm": 2.899543046951294, + "learning_rate": 1.9411170233284728e-05, + "loss": 0.2184, + "step": 1988 + }, + { + "epoch": 0.199, + "grad_norm": 3.531592607498169, + "learning_rate": 1.9408807689542257e-05, + "loss": 0.3861, + "step": 1990 + }, + { + "epoch": 0.1992, + "grad_norm": 1.1969672441482544, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.137, + "step": 1992 + }, + { + "epoch": 0.1994, + "grad_norm": 4.070821762084961, + "learning_rate": 1.9404068845995317e-05, + "loss": 0.3435, + "step": 1994 + }, + { + "epoch": 0.1996, + "grad_norm": 4.573517799377441, + "learning_rate": 1.9401692548500504e-05, + "loss": 0.4986, + "step": 1996 + }, + { + "epoch": 0.1998, + "grad_norm": 6.381196975708008, + "learning_rate": 1.9399311668734957e-05, + "loss": 0.4656, + "step": 1998 + }, + { + "epoch": 0.2, + "grad_norm": 7.725571632385254, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.4158, + "step": 2000 + }, + { + "epoch": 0.2002, + "grad_norm": 0.7697179913520813, + "learning_rate": 1.9394536167035535e-05, + "loss": 0.3583, + "step": 2002 + }, + { + "epoch": 0.2004, + "grad_norm": 1.105743408203125, + "learning_rate": 1.9392141547429183e-05, + "loss": 0.3313, + "step": 2004 + }, + { + "epoch": 0.2006, + "grad_norm": 1.8378835916519165, + "learning_rate": 1.938974235020714e-05, + "loss": 0.1544, + "step": 2006 + }, + { + "epoch": 0.2008, + "grad_norm": 3.367936611175537, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.2239, + "step": 2008 + }, + { + "epoch": 0.201, + "grad_norm": 3.3693974018096924, + "learning_rate": 1.938493022759556e-05, + "loss": 0.3206, + "step": 2010 + }, + { + "epoch": 0.2012, + "grad_norm": 5.410327911376953, + "learning_rate": 1.9382517304551397e-05, + "loss": 0.3436, + "step": 2012 + }, + { + "epoch": 0.2014, + "grad_norm": 3.018688678741455, + "learning_rate": 1.9380099808582278e-05, + "loss": 0.3053, + "step": 2014 + }, + { + "epoch": 0.2016, + "grad_norm": 1.8961783647537231, + "learning_rate": 1.937767774086646e-05, + "loss": 0.2286, + "step": 2016 + }, + { + "epoch": 0.2018, + "grad_norm": 2.1470301151275635, + "learning_rate": 1.9375251102584438e-05, + "loss": 0.258, + "step": 2018 + }, + { + "epoch": 0.202, + "grad_norm": 1.7527621984481812, + "learning_rate": 1.937281989491892e-05, + "loss": 0.1489, + "step": 2020 + }, + { + "epoch": 0.2022, + "grad_norm": 1.7808074951171875, + "learning_rate": 1.937038411905484e-05, + "loss": 0.1682, + "step": 2022 + }, + { + "epoch": 0.2024, + "grad_norm": 5.5131001472473145, + "learning_rate": 1.936794377617938e-05, + "loss": 0.405, + "step": 2024 + }, + { + "epoch": 0.2026, + "grad_norm": 0.7011245489120483, + "learning_rate": 1.9365498867481926e-05, + "loss": 0.2164, + "step": 2026 + }, + { + "epoch": 0.2028, + "grad_norm": 1.4794191122055054, + "learning_rate": 1.9363049394154095e-05, + "loss": 0.0828, + "step": 2028 + }, + { + "epoch": 0.203, + "grad_norm": 2.29217529296875, + "learning_rate": 1.9360595357389735e-05, + "loss": 0.2403, + "step": 2030 + }, + { + "epoch": 0.2032, + "grad_norm": 6.054064750671387, + "learning_rate": 1.935813675838491e-05, + "loss": 0.6241, + "step": 2032 + }, + { + "epoch": 0.2034, + "grad_norm": 0.6936021447181702, + "learning_rate": 1.9355673598337916e-05, + "loss": 0.043, + "step": 2034 + }, + { + "epoch": 0.2036, + "grad_norm": 2.0169079303741455, + "learning_rate": 1.935320587844926e-05, + "loss": 0.2029, + "step": 2036 + }, + { + "epoch": 0.2038, + "grad_norm": 0.5693888068199158, + "learning_rate": 1.9350733599921684e-05, + "loss": 0.0459, + "step": 2038 + }, + { + "epoch": 0.204, + "grad_norm": 3.6083478927612305, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.4192, + "step": 2040 + }, + { + "epoch": 0.2042, + "grad_norm": 1.317873239517212, + "learning_rate": 1.9345775371771826e-05, + "loss": 0.0819, + "step": 2042 + }, + { + "epoch": 0.2044, + "grad_norm": 2.3501064777374268, + "learning_rate": 1.9343289424566122e-05, + "loss": 0.2203, + "step": 2044 + }, + { + "epoch": 0.2046, + "grad_norm": 3.941293239593506, + "learning_rate": 1.9340798923554657e-05, + "loss": 0.2383, + "step": 2046 + }, + { + "epoch": 0.2048, + "grad_norm": 6.952541828155518, + "learning_rate": 1.933830386995127e-05, + "loss": 0.6451, + "step": 2048 + }, + { + "epoch": 0.205, + "grad_norm": 0.26788848638534546, + "learning_rate": 1.9335804264972018e-05, + "loss": 0.0995, + "step": 2050 + }, + { + "epoch": 0.2052, + "grad_norm": 5.568851470947266, + "learning_rate": 1.9333300109835182e-05, + "loss": 0.9051, + "step": 2052 + }, + { + "epoch": 0.2054, + "grad_norm": 1.8068602085113525, + "learning_rate": 1.9330791405761254e-05, + "loss": 0.1361, + "step": 2054 + }, + { + "epoch": 0.2056, + "grad_norm": 3.7755544185638428, + "learning_rate": 1.9328278153972947e-05, + "loss": 0.2311, + "step": 2056 + }, + { + "epoch": 0.2058, + "grad_norm": 1.8204047679901123, + "learning_rate": 1.932576035569519e-05, + "loss": 0.2719, + "step": 2058 + }, + { + "epoch": 0.206, + "grad_norm": 3.0625641345977783, + "learning_rate": 1.9323238012155125e-05, + "loss": 0.4162, + "step": 2060 + }, + { + "epoch": 0.2062, + "grad_norm": 5.864592552185059, + "learning_rate": 1.932071112458211e-05, + "loss": 0.5113, + "step": 2062 + }, + { + "epoch": 0.2064, + "grad_norm": 0.9500077962875366, + "learning_rate": 1.9318179694207726e-05, + "loss": 0.1433, + "step": 2064 + }, + { + "epoch": 0.2066, + "grad_norm": 1.7119433879852295, + "learning_rate": 1.931564372226576e-05, + "loss": 0.2974, + "step": 2066 + }, + { + "epoch": 0.2068, + "grad_norm": 1.7745660543441772, + "learning_rate": 1.9313103209992205e-05, + "loss": 0.4103, + "step": 2068 + }, + { + "epoch": 0.207, + "grad_norm": 4.024250030517578, + "learning_rate": 1.9310558158625286e-05, + "loss": 0.3505, + "step": 2070 + }, + { + "epoch": 0.2072, + "grad_norm": 6.866841793060303, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.5717, + "step": 2072 + }, + { + "epoch": 0.2074, + "grad_norm": 1.488175392150879, + "learning_rate": 1.930545444357526e-05, + "loss": 0.23, + "step": 2074 + }, + { + "epoch": 0.2076, + "grad_norm": 3.6349337100982666, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.4239, + "step": 2076 + }, + { + "epoch": 0.2078, + "grad_norm": 4.0324625968933105, + "learning_rate": 1.9300332587065644e-05, + "loss": 0.4604, + "step": 2078 + }, + { + "epoch": 0.208, + "grad_norm": 3.160834312438965, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.2369, + "step": 2080 + }, + { + "epoch": 0.2082, + "grad_norm": 1.2871387004852295, + "learning_rate": 1.9295192599081747e-05, + "loss": 0.1101, + "step": 2082 + }, + { + "epoch": 0.2084, + "grad_norm": 10.18492317199707, + "learning_rate": 1.9292615808917027e-05, + "loss": 0.5612, + "step": 2084 + }, + { + "epoch": 0.2086, + "grad_norm": 4.529983043670654, + "learning_rate": 1.9290034489644247e-05, + "loss": 0.2172, + "step": 2086 + }, + { + "epoch": 0.2088, + "grad_norm": 0.6203173995018005, + "learning_rate": 1.9287448642521513e-05, + "loss": 0.0938, + "step": 2088 + }, + { + "epoch": 0.209, + "grad_norm": 0.9068770408630371, + "learning_rate": 1.9284858268809135e-05, + "loss": 0.1924, + "step": 2090 + }, + { + "epoch": 0.2092, + "grad_norm": 4.39232063293457, + "learning_rate": 1.9282263369769633e-05, + "loss": 0.3716, + "step": 2092 + }, + { + "epoch": 0.2094, + "grad_norm": 1.2752331495285034, + "learning_rate": 1.927966394666773e-05, + "loss": 0.123, + "step": 2094 + }, + { + "epoch": 0.2096, + "grad_norm": 7.545273780822754, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.8336, + "step": 2096 + }, + { + "epoch": 0.2098, + "grad_norm": 4.347425937652588, + "learning_rate": 1.9274451533346617e-05, + "loss": 0.3493, + "step": 2098 + }, + { + "epoch": 0.21, + "grad_norm": 1.7225444316864014, + "learning_rate": 1.9271838545667876e-05, + "loss": 0.2028, + "step": 2100 + }, + { + "epoch": 0.2102, + "grad_norm": 1.1817072629928589, + "learning_rate": 1.9269221039007666e-05, + "loss": 0.2104, + "step": 2102 + }, + { + "epoch": 0.2104, + "grad_norm": 7.7895660400390625, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.4159, + "step": 2104 + }, + { + "epoch": 0.2106, + "grad_norm": 4.645057201385498, + "learning_rate": 1.9263972473847995e-05, + "loss": 0.3643, + "step": 2106 + }, + { + "epoch": 0.2108, + "grad_norm": 3.385549545288086, + "learning_rate": 1.9261341417906622e-05, + "loss": 0.2456, + "step": 2108 + }, + { + "epoch": 0.211, + "grad_norm": 2.3285717964172363, + "learning_rate": 1.925870584809995e-05, + "loss": 0.239, + "step": 2110 + }, + { + "epoch": 0.2112, + "grad_norm": 3.7075090408325195, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.2442, + "step": 2112 + }, + { + "epoch": 0.2114, + "grad_norm": 2.6956043243408203, + "learning_rate": 1.9253421172031086e-05, + "loss": 0.3581, + "step": 2114 + }, + { + "epoch": 0.2116, + "grad_norm": 1.945876955986023, + "learning_rate": 1.925077206834458e-05, + "loss": 0.0939, + "step": 2116 + }, + { + "epoch": 0.2118, + "grad_norm": 0.8783878684043884, + "learning_rate": 1.9248118455944153e-05, + "loss": 0.08, + "step": 2118 + }, + { + "epoch": 0.212, + "grad_norm": 8.454212188720703, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.3968, + "step": 2120 + }, + { + "epoch": 0.2122, + "grad_norm": 7.248368740081787, + "learning_rate": 1.924279771017706e-05, + "loss": 0.5501, + "step": 2122 + }, + { + "epoch": 0.2124, + "grad_norm": 2.020746946334839, + "learning_rate": 1.924013057940367e-05, + "loss": 0.4845, + "step": 2124 + }, + { + "epoch": 0.2126, + "grad_norm": 5.9256062507629395, + "learning_rate": 1.923745894510288e-05, + "loss": 0.1801, + "step": 2126 + }, + { + "epoch": 0.2128, + "grad_norm": 0.6204491257667542, + "learning_rate": 1.9234782808576823e-05, + "loss": 0.1551, + "step": 2128 + }, + { + "epoch": 0.213, + "grad_norm": 4.826442718505859, + "learning_rate": 1.923210217112981e-05, + "loss": 0.432, + "step": 2130 + }, + { + "epoch": 0.2132, + "grad_norm": 5.50948429107666, + "learning_rate": 1.9229417034068352e-05, + "loss": 0.3181, + "step": 2132 + }, + { + "epoch": 0.2134, + "grad_norm": 6.578877925872803, + "learning_rate": 1.922672739870115e-05, + "loss": 0.5531, + "step": 2134 + }, + { + "epoch": 0.2136, + "grad_norm": 2.2794864177703857, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.3427, + "step": 2136 + }, + { + "epoch": 0.2138, + "grad_norm": 3.8102433681488037, + "learning_rate": 1.9221334638295296e-05, + "loss": 0.3289, + "step": 2138 + }, + { + "epoch": 0.214, + "grad_norm": 0.6609979867935181, + "learning_rate": 1.9218631515885007e-05, + "loss": 0.3705, + "step": 2140 + }, + { + "epoch": 0.2142, + "grad_norm": 5.3612141609191895, + "learning_rate": 1.921592390042571e-05, + "loss": 0.2962, + "step": 2142 + }, + { + "epoch": 0.2144, + "grad_norm": 4.051450252532959, + "learning_rate": 1.9213211793237056e-05, + "loss": 0.3457, + "step": 2144 + }, + { + "epoch": 0.2146, + "grad_norm": 4.750330924987793, + "learning_rate": 1.9210495195640895e-05, + "loss": 0.561, + "step": 2146 + }, + { + "epoch": 0.2148, + "grad_norm": 2.542323350906372, + "learning_rate": 1.9207774108961273e-05, + "loss": 0.3103, + "step": 2148 + }, + { + "epoch": 0.215, + "grad_norm": 3.5887413024902344, + "learning_rate": 1.9205048534524405e-05, + "loss": 0.1887, + "step": 2150 + }, + { + "epoch": 0.2152, + "grad_norm": 2.8819026947021484, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.3064, + "step": 2152 + }, + { + "epoch": 0.2154, + "grad_norm": 2.9269802570343018, + "learning_rate": 1.9199583927694775e-05, + "loss": 0.4179, + "step": 2154 + }, + { + "epoch": 0.2156, + "grad_norm": 4.345792770385742, + "learning_rate": 1.9196844897965393e-05, + "loss": 0.2656, + "step": 2156 + }, + { + "epoch": 0.2158, + "grad_norm": 2.7425544261932373, + "learning_rate": 1.919410138580553e-05, + "loss": 0.2731, + "step": 2158 + }, + { + "epoch": 0.216, + "grad_norm": 4.56603479385376, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.3114, + "step": 2160 + }, + { + "epoch": 0.2162, + "grad_norm": 3.8861048221588135, + "learning_rate": 1.9188600919545176e-05, + "loss": 0.4015, + "step": 2162 + }, + { + "epoch": 0.2164, + "grad_norm": 0.8570343852043152, + "learning_rate": 1.9185843968125543e-05, + "loss": 0.1782, + "step": 2164 + }, + { + "epoch": 0.2166, + "grad_norm": 1.6527063846588135, + "learning_rate": 1.918308253963715e-05, + "loss": 0.3073, + "step": 2166 + }, + { + "epoch": 0.2168, + "grad_norm": 2.9277172088623047, + "learning_rate": 1.9180316635425883e-05, + "loss": 0.3536, + "step": 2168 + }, + { + "epoch": 0.217, + "grad_norm": 4.0611138343811035, + "learning_rate": 1.9177546256839814e-05, + "loss": 0.3591, + "step": 2170 + }, + { + "epoch": 0.2172, + "grad_norm": 2.2460410594940186, + "learning_rate": 1.9174771405229187e-05, + "loss": 0.477, + "step": 2172 + }, + { + "epoch": 0.2174, + "grad_norm": 2.247309446334839, + "learning_rate": 1.9171992081946436e-05, + "loss": 0.2311, + "step": 2174 + }, + { + "epoch": 0.2176, + "grad_norm": 1.9866015911102295, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.1679, + "step": 2176 + }, + { + "epoch": 0.2178, + "grad_norm": 2.710228443145752, + "learning_rate": 1.9166420025785165e-05, + "loss": 0.4708, + "step": 2178 + }, + { + "epoch": 0.218, + "grad_norm": 2.224778890609741, + "learning_rate": 1.9163627295622397e-05, + "loss": 0.2916, + "step": 2180 + }, + { + "epoch": 0.2182, + "grad_norm": 0.49396270513534546, + "learning_rate": 1.9160830099219007e-05, + "loss": 0.1888, + "step": 2182 + }, + { + "epoch": 0.2184, + "grad_norm": 1.4679275751113892, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.2679, + "step": 2184 + }, + { + "epoch": 0.2186, + "grad_norm": 3.170015335083008, + "learning_rate": 1.9155222313145817e-05, + "loss": 0.2001, + "step": 2186 + }, + { + "epoch": 0.2188, + "grad_norm": 3.070338726043701, + "learning_rate": 1.9152411726209176e-05, + "loss": 0.4666, + "step": 2188 + }, + { + "epoch": 0.219, + "grad_norm": 1.60456383228302, + "learning_rate": 1.914959667849825e-05, + "loss": 0.3065, + "step": 2190 + }, + { + "epoch": 0.2192, + "grad_norm": 1.6688004732131958, + "learning_rate": 1.914677717138505e-05, + "loss": 0.2207, + "step": 2192 + }, + { + "epoch": 0.2194, + "grad_norm": 1.2315086126327515, + "learning_rate": 1.9143953206243778e-05, + "loss": 0.2238, + "step": 2194 + }, + { + "epoch": 0.2196, + "grad_norm": 1.6530859470367432, + "learning_rate": 1.914112478445079e-05, + "loss": 0.1957, + "step": 2196 + }, + { + "epoch": 0.2198, + "grad_norm": 1.018897294998169, + "learning_rate": 1.9138291907384632e-05, + "loss": 0.1438, + "step": 2198 + }, + { + "epoch": 0.22, + "grad_norm": 0.568054735660553, + "learning_rate": 1.913545457642601e-05, + "loss": 0.1086, + "step": 2200 + }, + { + "epoch": 0.2202, + "grad_norm": 2.352857828140259, + "learning_rate": 1.9132612792957808e-05, + "loss": 0.2172, + "step": 2202 + }, + { + "epoch": 0.2204, + "grad_norm": 5.448037147521973, + "learning_rate": 1.9129766558365076e-05, + "loss": 0.7537, + "step": 2204 + }, + { + "epoch": 0.2206, + "grad_norm": 7.410750389099121, + "learning_rate": 1.912691587403503e-05, + "loss": 0.3504, + "step": 2206 + }, + { + "epoch": 0.2208, + "grad_norm": 2.9737045764923096, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.2365, + "step": 2208 + }, + { + "epoch": 0.221, + "grad_norm": 14.594645500183105, + "learning_rate": 1.9121201161722732e-05, + "loss": 0.1663, + "step": 2210 + }, + { + "epoch": 0.2212, + "grad_norm": 5.113057613372803, + "learning_rate": 1.911833713652576e-05, + "loss": 0.2164, + "step": 2212 + }, + { + "epoch": 0.2214, + "grad_norm": 1.5725576877593994, + "learning_rate": 1.9115468667162038e-05, + "loss": 0.1083, + "step": 2214 + }, + { + "epoch": 0.2216, + "grad_norm": 2.8892300128936768, + "learning_rate": 1.9112595755029625e-05, + "loss": 0.4323, + "step": 2216 + }, + { + "epoch": 0.2218, + "grad_norm": 9.797536849975586, + "learning_rate": 1.9109718401528742e-05, + "loss": 0.4719, + "step": 2218 + }, + { + "epoch": 0.222, + "grad_norm": 1.193596363067627, + "learning_rate": 1.910683660806177e-05, + "loss": 0.2651, + "step": 2220 + }, + { + "epoch": 0.2222, + "grad_norm": 3.525181293487549, + "learning_rate": 1.9103950376033276e-05, + "loss": 0.3835, + "step": 2222 + }, + { + "epoch": 0.2224, + "grad_norm": 1.8001726865768433, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.1917, + "step": 2224 + }, + { + "epoch": 0.2226, + "grad_norm": 1.23676598072052, + "learning_rate": 1.9098164601920702e-05, + "loss": 0.1026, + "step": 2226 + }, + { + "epoch": 0.2228, + "grad_norm": 0.7590399384498596, + "learning_rate": 1.9095265062656546e-05, + "loss": 0.0834, + "step": 2228 + }, + { + "epoch": 0.223, + "grad_norm": 3.177039623260498, + "learning_rate": 1.9092361090470688e-05, + "loss": 0.5969, + "step": 2230 + }, + { + "epoch": 0.2232, + "grad_norm": 6.914273738861084, + "learning_rate": 1.908945268677849e-05, + "loss": 0.4044, + "step": 2232 + }, + { + "epoch": 0.2234, + "grad_norm": 3.6700997352600098, + "learning_rate": 1.908653985299747e-05, + "loss": 0.2621, + "step": 2234 + }, + { + "epoch": 0.2236, + "grad_norm": 2.781306266784668, + "learning_rate": 1.9083622590547313e-05, + "loss": 0.0856, + "step": 2236 + }, + { + "epoch": 0.2238, + "grad_norm": 0.5730105638504028, + "learning_rate": 1.9080700900849855e-05, + "loss": 0.158, + "step": 2238 + }, + { + "epoch": 0.224, + "grad_norm": 2.2577922344207764, + "learning_rate": 1.907777478532909e-05, + "loss": 0.5402, + "step": 2240 + }, + { + "epoch": 0.2242, + "grad_norm": 8.960272789001465, + "learning_rate": 1.907484424541117e-05, + "loss": 0.3066, + "step": 2242 + }, + { + "epoch": 0.2244, + "grad_norm": 1.23610520362854, + "learning_rate": 1.907190928252441e-05, + "loss": 0.1084, + "step": 2244 + }, + { + "epoch": 0.2246, + "grad_norm": 1.216396689414978, + "learning_rate": 1.906896989809927e-05, + "loss": 0.08, + "step": 2246 + }, + { + "epoch": 0.2248, + "grad_norm": 1.5912705659866333, + "learning_rate": 1.906602609356838e-05, + "loss": 0.2828, + "step": 2248 + }, + { + "epoch": 0.225, + "grad_norm": 8.795039176940918, + "learning_rate": 1.9063077870366504e-05, + "loss": 0.452, + "step": 2250 + }, + { + "epoch": 0.2252, + "grad_norm": 13.420876502990723, + "learning_rate": 1.9060125229930572e-05, + "loss": 0.5585, + "step": 2252 + }, + { + "epoch": 0.2254, + "grad_norm": 8.277320861816406, + "learning_rate": 1.9057168173699664e-05, + "loss": 0.6882, + "step": 2254 + }, + { + "epoch": 0.2256, + "grad_norm": 2.512808084487915, + "learning_rate": 1.905420670311502e-05, + "loss": 0.1678, + "step": 2256 + }, + { + "epoch": 0.2258, + "grad_norm": 1.3870428800582886, + "learning_rate": 1.9051240819620018e-05, + "loss": 0.2426, + "step": 2258 + }, + { + "epoch": 0.226, + "grad_norm": 3.784214496612549, + "learning_rate": 1.9048270524660197e-05, + "loss": 0.1877, + "step": 2260 + }, + { + "epoch": 0.2262, + "grad_norm": 3.1591155529022217, + "learning_rate": 1.904529581968324e-05, + "loss": 0.4008, + "step": 2262 + }, + { + "epoch": 0.2264, + "grad_norm": 2.3281054496765137, + "learning_rate": 1.9042316706138987e-05, + "loss": 0.0737, + "step": 2264 + }, + { + "epoch": 0.2266, + "grad_norm": 5.332728862762451, + "learning_rate": 1.903933318547942e-05, + "loss": 0.8179, + "step": 2266 + }, + { + "epoch": 0.2268, + "grad_norm": 5.935715198516846, + "learning_rate": 1.9036345259158667e-05, + "loss": 0.3398, + "step": 2268 + }, + { + "epoch": 0.227, + "grad_norm": 2.2499592304229736, + "learning_rate": 1.903335292863301e-05, + "loss": 0.2083, + "step": 2270 + }, + { + "epoch": 0.2272, + "grad_norm": 2.7253258228302, + "learning_rate": 1.9030356195360875e-05, + "loss": 0.2307, + "step": 2272 + }, + { + "epoch": 0.2274, + "grad_norm": 3.3170642852783203, + "learning_rate": 1.902735506080283e-05, + "loss": 0.4014, + "step": 2274 + }, + { + "epoch": 0.2276, + "grad_norm": 0.7801094651222229, + "learning_rate": 1.9024349526421596e-05, + "loss": 0.0983, + "step": 2276 + }, + { + "epoch": 0.2278, + "grad_norm": 3.3817012310028076, + "learning_rate": 1.902133959368203e-05, + "loss": 0.4979, + "step": 2278 + }, + { + "epoch": 0.228, + "grad_norm": 2.579627752304077, + "learning_rate": 1.901832526405114e-05, + "loss": 0.2299, + "step": 2280 + }, + { + "epoch": 0.2282, + "grad_norm": 0.620607316493988, + "learning_rate": 1.901530653899807e-05, + "loss": 0.1357, + "step": 2282 + }, + { + "epoch": 0.2284, + "grad_norm": 2.6820576190948486, + "learning_rate": 1.9012283419994115e-05, + "loss": 0.2284, + "step": 2284 + }, + { + "epoch": 0.2286, + "grad_norm": 2.1231746673583984, + "learning_rate": 1.9009255908512704e-05, + "loss": 0.1772, + "step": 2286 + }, + { + "epoch": 0.2288, + "grad_norm": 9.799442291259766, + "learning_rate": 1.9006224006029404e-05, + "loss": 0.8271, + "step": 2288 + }, + { + "epoch": 0.229, + "grad_norm": 1.157966136932373, + "learning_rate": 1.9003187714021936e-05, + "loss": 0.105, + "step": 2290 + }, + { + "epoch": 0.2292, + "grad_norm": 0.5955468416213989, + "learning_rate": 1.9000147033970148e-05, + "loss": 0.0733, + "step": 2292 + }, + { + "epoch": 0.2294, + "grad_norm": 1.6986724138259888, + "learning_rate": 1.899710196735603e-05, + "loss": 0.1387, + "step": 2294 + }, + { + "epoch": 0.2296, + "grad_norm": 5.613572597503662, + "learning_rate": 1.899405251566371e-05, + "loss": 0.3847, + "step": 2296 + }, + { + "epoch": 0.2298, + "grad_norm": 1.9490113258361816, + "learning_rate": 1.8990998680379458e-05, + "loss": 0.3138, + "step": 2298 + }, + { + "epoch": 0.23, + "grad_norm": 4.346127510070801, + "learning_rate": 1.8987940462991673e-05, + "loss": 0.9862, + "step": 2300 + }, + { + "epoch": 0.2302, + "grad_norm": 1.7949714660644531, + "learning_rate": 1.8984877864990888e-05, + "loss": 0.2174, + "step": 2302 + }, + { + "epoch": 0.2304, + "grad_norm": 1.1416171789169312, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.1225, + "step": 2304 + }, + { + "epoch": 0.2306, + "grad_norm": 2.664747953414917, + "learning_rate": 1.897873953312317e-05, + "loss": 0.2414, + "step": 2306 + }, + { + "epoch": 0.2308, + "grad_norm": 2.913633346557617, + "learning_rate": 1.8975663802247978e-05, + "loss": 0.1675, + "step": 2308 + }, + { + "epoch": 0.231, + "grad_norm": 1.904273509979248, + "learning_rate": 1.8972583696743284e-05, + "loss": 0.3357, + "step": 2310 + }, + { + "epoch": 0.2312, + "grad_norm": 1.570477843284607, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.1428, + "step": 2312 + }, + { + "epoch": 0.2314, + "grad_norm": 8.871370315551758, + "learning_rate": 1.896641036785236e-05, + "loss": 0.3214, + "step": 2314 + }, + { + "epoch": 0.2316, + "grad_norm": 1.9939604997634888, + "learning_rate": 1.896331714747493e-05, + "loss": 0.1795, + "step": 2316 + }, + { + "epoch": 0.2318, + "grad_norm": 1.4739173650741577, + "learning_rate": 1.896021955848561e-05, + "loss": 0.3734, + "step": 2318 + }, + { + "epoch": 0.232, + "grad_norm": 1.5394773483276367, + "learning_rate": 1.895711760239413e-05, + "loss": 0.0867, + "step": 2320 + }, + { + "epoch": 0.2322, + "grad_norm": 1.3247002363204956, + "learning_rate": 1.895401128071234e-05, + "loss": 0.1396, + "step": 2322 + }, + { + "epoch": 0.2324, + "grad_norm": 2.4758243560791016, + "learning_rate": 1.8950900594954226e-05, + "loss": 0.394, + "step": 2324 + }, + { + "epoch": 0.2326, + "grad_norm": 3.9967737197875977, + "learning_rate": 1.8947785546635905e-05, + "loss": 0.331, + "step": 2326 + }, + { + "epoch": 0.2328, + "grad_norm": 1.4319734573364258, + "learning_rate": 1.89446661372756e-05, + "loss": 0.1522, + "step": 2328 + }, + { + "epoch": 0.233, + "grad_norm": 0.741013765335083, + "learning_rate": 1.8941542368393683e-05, + "loss": 0.1115, + "step": 2330 + }, + { + "epoch": 0.2332, + "grad_norm": 0.3116653263568878, + "learning_rate": 1.893841424151264e-05, + "loss": 0.2966, + "step": 2332 + }, + { + "epoch": 0.2334, + "grad_norm": 8.678589820861816, + "learning_rate": 1.893528175815708e-05, + "loss": 0.9459, + "step": 2334 + }, + { + "epoch": 0.2336, + "grad_norm": 7.20061731338501, + "learning_rate": 1.893214491985374e-05, + "loss": 0.2346, + "step": 2336 + }, + { + "epoch": 0.2338, + "grad_norm": 2.1576573848724365, + "learning_rate": 1.892900372813147e-05, + "loss": 0.1367, + "step": 2338 + }, + { + "epoch": 0.234, + "grad_norm": 1.3124428987503052, + "learning_rate": 1.892585818452126e-05, + "loss": 0.3639, + "step": 2340 + }, + { + "epoch": 0.2342, + "grad_norm": 0.5636441111564636, + "learning_rate": 1.8922708290556197e-05, + "loss": 0.1522, + "step": 2342 + }, + { + "epoch": 0.2344, + "grad_norm": 1.7306102514266968, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.3843, + "step": 2344 + }, + { + "epoch": 0.2346, + "grad_norm": 4.045849800109863, + "learning_rate": 1.8916395457704536e-05, + "loss": 0.323, + "step": 2346 + }, + { + "epoch": 0.2348, + "grad_norm": 2.0259652137756348, + "learning_rate": 1.8913232521894734e-05, + "loss": 0.16, + "step": 2348 + }, + { + "epoch": 0.235, + "grad_norm": 3.4000399112701416, + "learning_rate": 1.891006524188368e-05, + "loss": 0.5722, + "step": 2350 + }, + { + "epoch": 0.2352, + "grad_norm": 4.522225379943848, + "learning_rate": 1.890689361921507e-05, + "loss": 0.237, + "step": 2352 + }, + { + "epoch": 0.2354, + "grad_norm": 1.5163888931274414, + "learning_rate": 1.8903717655434708e-05, + "loss": 0.105, + "step": 2354 + }, + { + "epoch": 0.2356, + "grad_norm": 1.8625366687774658, + "learning_rate": 1.8900537352090523e-05, + "loss": 0.1839, + "step": 2356 + }, + { + "epoch": 0.2358, + "grad_norm": 6.078539848327637, + "learning_rate": 1.8897352710732564e-05, + "loss": 0.2088, + "step": 2358 + }, + { + "epoch": 0.236, + "grad_norm": 2.1609041690826416, + "learning_rate": 1.889416373291298e-05, + "loss": 0.1967, + "step": 2360 + }, + { + "epoch": 0.2362, + "grad_norm": 1.6443394422531128, + "learning_rate": 1.8890970420186035e-05, + "loss": 0.337, + "step": 2362 + }, + { + "epoch": 0.2364, + "grad_norm": 7.876880168914795, + "learning_rate": 1.8887772774108116e-05, + "loss": 0.3482, + "step": 2364 + }, + { + "epoch": 0.2366, + "grad_norm": 3.1669762134552, + "learning_rate": 1.888457079623772e-05, + "loss": 0.2606, + "step": 2366 + }, + { + "epoch": 0.2368, + "grad_norm": 0.794026792049408, + "learning_rate": 1.8881364488135448e-05, + "loss": 0.0916, + "step": 2368 + }, + { + "epoch": 0.237, + "grad_norm": 0.3224738836288452, + "learning_rate": 1.8878153851364013e-05, + "loss": 0.0544, + "step": 2370 + }, + { + "epoch": 0.2372, + "grad_norm": 6.185129642486572, + "learning_rate": 1.887493888748825e-05, + "loss": 0.2579, + "step": 2372 + }, + { + "epoch": 0.2374, + "grad_norm": 4.117551803588867, + "learning_rate": 1.8871719598075083e-05, + "loss": 0.5555, + "step": 2374 + }, + { + "epoch": 0.2376, + "grad_norm": 1.4950443506240845, + "learning_rate": 1.886849598469356e-05, + "loss": 0.0274, + "step": 2376 + }, + { + "epoch": 0.2378, + "grad_norm": 1.8144456148147583, + "learning_rate": 1.8865268048914828e-05, + "loss": 0.2093, + "step": 2378 + }, + { + "epoch": 0.238, + "grad_norm": 4.777124881744385, + "learning_rate": 1.8862035792312148e-05, + "loss": 0.7292, + "step": 2380 + }, + { + "epoch": 0.2382, + "grad_norm": 7.393581867218018, + "learning_rate": 1.8858799216460883e-05, + "loss": 0.3138, + "step": 2382 + }, + { + "epoch": 0.2384, + "grad_norm": 0.45316171646118164, + "learning_rate": 1.8855558322938492e-05, + "loss": 0.281, + "step": 2384 + }, + { + "epoch": 0.2386, + "grad_norm": 5.675621509552002, + "learning_rate": 1.8852313113324553e-05, + "loss": 0.246, + "step": 2386 + }, + { + "epoch": 0.2388, + "grad_norm": 3.13859224319458, + "learning_rate": 1.8849063589200744e-05, + "loss": 0.2551, + "step": 2388 + }, + { + "epoch": 0.239, + "grad_norm": 1.3566502332687378, + "learning_rate": 1.884580975215084e-05, + "loss": 0.1713, + "step": 2390 + }, + { + "epoch": 0.2392, + "grad_norm": 3.6396286487579346, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.1503, + "step": 2392 + }, + { + "epoch": 0.2394, + "grad_norm": 5.951265335083008, + "learning_rate": 1.8839289145618378e-05, + "loss": 0.2646, + "step": 2394 + }, + { + "epoch": 0.2396, + "grad_norm": 4.037349700927734, + "learning_rate": 1.8836022379313884e-05, + "loss": 0.204, + "step": 2396 + }, + { + "epoch": 0.2398, + "grad_norm": 0.6894005537033081, + "learning_rate": 1.883275130643942e-05, + "loss": 0.2981, + "step": 2398 + }, + { + "epoch": 0.24, + "grad_norm": 0.6646224856376648, + "learning_rate": 1.8829475928589272e-05, + "loss": 0.0333, + "step": 2400 + }, + { + "epoch": 0.2402, + "grad_norm": 1.464653730392456, + "learning_rate": 1.882619624735982e-05, + "loss": 0.1248, + "step": 2402 + }, + { + "epoch": 0.2404, + "grad_norm": 0.43089112639427185, + "learning_rate": 1.8822912264349535e-05, + "loss": 0.0191, + "step": 2404 + }, + { + "epoch": 0.2406, + "grad_norm": 5.105642318725586, + "learning_rate": 1.8819623981158996e-05, + "loss": 0.214, + "step": 2406 + }, + { + "epoch": 0.2408, + "grad_norm": 1.400454044342041, + "learning_rate": 1.881633139939087e-05, + "loss": 0.0869, + "step": 2408 + }, + { + "epoch": 0.241, + "grad_norm": 14.481117248535156, + "learning_rate": 1.8813034520649923e-05, + "loss": 1.3508, + "step": 2410 + }, + { + "epoch": 0.2412, + "grad_norm": 5.723416328430176, + "learning_rate": 1.8809733346543013e-05, + "loss": 0.3961, + "step": 2412 + }, + { + "epoch": 0.2414, + "grad_norm": 2.051590919494629, + "learning_rate": 1.880642787867909e-05, + "loss": 0.1662, + "step": 2414 + }, + { + "epoch": 0.2416, + "grad_norm": 1.9778416156768799, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.3836, + "step": 2416 + }, + { + "epoch": 0.2418, + "grad_norm": 6.291645526885986, + "learning_rate": 1.8799804068126487e-05, + "loss": 0.6904, + "step": 2418 + }, + { + "epoch": 0.242, + "grad_norm": 5.266213893890381, + "learning_rate": 1.879648572866617e-05, + "loss": 0.3335, + "step": 2420 + }, + { + "epoch": 0.2422, + "grad_norm": 1.7489036321640015, + "learning_rate": 1.8793163101905562e-05, + "loss": 0.3163, + "step": 2422 + }, + { + "epoch": 0.2424, + "grad_norm": 2.6449134349823, + "learning_rate": 1.878983618946409e-05, + "loss": 0.3497, + "step": 2424 + }, + { + "epoch": 0.2426, + "grad_norm": 1.2462626695632935, + "learning_rate": 1.878650499296323e-05, + "loss": 0.1234, + "step": 2426 + }, + { + "epoch": 0.2428, + "grad_norm": 12.666828155517578, + "learning_rate": 1.878316951402658e-05, + "loss": 0.4002, + "step": 2428 + }, + { + "epoch": 0.243, + "grad_norm": 1.419054388999939, + "learning_rate": 1.8779829754279806e-05, + "loss": 0.3486, + "step": 2430 + }, + { + "epoch": 0.2432, + "grad_norm": 0.8969396948814392, + "learning_rate": 1.8776485715350672e-05, + "loss": 0.2963, + "step": 2432 + }, + { + "epoch": 0.2434, + "grad_norm": 3.6412370204925537, + "learning_rate": 1.8773137398869017e-05, + "loss": 0.29, + "step": 2434 + }, + { + "epoch": 0.2436, + "grad_norm": 2.437797784805298, + "learning_rate": 1.8769784806466768e-05, + "loss": 0.3726, + "step": 2436 + }, + { + "epoch": 0.2438, + "grad_norm": 1.869241714477539, + "learning_rate": 1.8766427939777943e-05, + "loss": 0.1588, + "step": 2438 + }, + { + "epoch": 0.244, + "grad_norm": 2.2472105026245117, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.2516, + "step": 2440 + }, + { + "epoch": 0.2442, + "grad_norm": 0.7342290282249451, + "learning_rate": 1.8759701390087026e-05, + "loss": 0.0635, + "step": 2442 + }, + { + "epoch": 0.2444, + "grad_norm": 3.004732608795166, + "learning_rate": 1.8756331710363375e-05, + "loss": 0.1729, + "step": 2444 + }, + { + "epoch": 0.2446, + "grad_norm": 1.395782709121704, + "learning_rate": 1.8752957762910016e-05, + "loss": 0.2649, + "step": 2446 + }, + { + "epoch": 0.2448, + "grad_norm": 3.2434980869293213, + "learning_rate": 1.874957954937138e-05, + "loss": 0.2173, + "step": 2448 + }, + { + "epoch": 0.245, + "grad_norm": 2.856022357940674, + "learning_rate": 1.874619707139396e-05, + "loss": 0.3358, + "step": 2450 + }, + { + "epoch": 0.2452, + "grad_norm": 3.0424506664276123, + "learning_rate": 1.8742810330626338e-05, + "loss": 0.267, + "step": 2452 + }, + { + "epoch": 0.2454, + "grad_norm": 2.229801893234253, + "learning_rate": 1.873941932871917e-05, + "loss": 0.3336, + "step": 2454 + }, + { + "epoch": 0.2456, + "grad_norm": 5.333440780639648, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.7694, + "step": 2456 + }, + { + "epoch": 0.2458, + "grad_norm": 2.4231739044189453, + "learning_rate": 1.8732624548099204e-05, + "loss": 0.1336, + "step": 2458 + }, + { + "epoch": 0.246, + "grad_norm": 3.4374513626098633, + "learning_rate": 1.8729220772698096e-05, + "loss": 0.3841, + "step": 2460 + }, + { + "epoch": 0.2462, + "grad_norm": 7.259360313415527, + "learning_rate": 1.8725812742780832e-05, + "loss": 0.2827, + "step": 2462 + }, + { + "epoch": 0.2464, + "grad_norm": 2.9198765754699707, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.2378, + "step": 2464 + }, + { + "epoch": 0.2466, + "grad_norm": 0.7190779447555542, + "learning_rate": 1.871898392604402e-05, + "loss": 0.0795, + "step": 2466 + }, + { + "epoch": 0.2468, + "grad_norm": 2.157228469848633, + "learning_rate": 1.8715563142552758e-05, + "loss": 0.4149, + "step": 2468 + }, + { + "epoch": 0.247, + "grad_norm": 2.939298629760742, + "learning_rate": 1.8712138111201898e-05, + "loss": 0.2158, + "step": 2470 + }, + { + "epoch": 0.2472, + "grad_norm": 2.2276322841644287, + "learning_rate": 1.8708708833660755e-05, + "loss": 0.2085, + "step": 2472 + }, + { + "epoch": 0.2474, + "grad_norm": 0.6593524813652039, + "learning_rate": 1.8705275311600724e-05, + "loss": 0.074, + "step": 2474 + }, + { + "epoch": 0.2476, + "grad_norm": 0.588624119758606, + "learning_rate": 1.870183754669526e-05, + "loss": 0.6651, + "step": 2476 + }, + { + "epoch": 0.2478, + "grad_norm": 0.8389158248901367, + "learning_rate": 1.8698395540619883e-05, + "loss": 0.1817, + "step": 2478 + }, + { + "epoch": 0.248, + "grad_norm": 1.4358712434768677, + "learning_rate": 1.869494929505219e-05, + "loss": 0.0888, + "step": 2480 + }, + { + "epoch": 0.2482, + "grad_norm": 2.224890947341919, + "learning_rate": 1.869149881167184e-05, + "loss": 0.5602, + "step": 2482 + }, + { + "epoch": 0.2484, + "grad_norm": 4.835969924926758, + "learning_rate": 1.8688044092160554e-05, + "loss": 0.1846, + "step": 2484 + }, + { + "epoch": 0.2486, + "grad_norm": 2.859402894973755, + "learning_rate": 1.8684585138202122e-05, + "loss": 0.2151, + "step": 2486 + }, + { + "epoch": 0.2488, + "grad_norm": 2.17842173576355, + "learning_rate": 1.8681121951482397e-05, + "loss": 0.23, + "step": 2488 + }, + { + "epoch": 0.249, + "grad_norm": 1.0750844478607178, + "learning_rate": 1.8677654533689287e-05, + "loss": 0.215, + "step": 2490 + }, + { + "epoch": 0.2492, + "grad_norm": 6.691223621368408, + "learning_rate": 1.8674182886512776e-05, + "loss": 0.3645, + "step": 2492 + }, + { + "epoch": 0.2494, + "grad_norm": 3.2773327827453613, + "learning_rate": 1.86707070116449e-05, + "loss": 0.2645, + "step": 2494 + }, + { + "epoch": 0.2496, + "grad_norm": 4.642674446105957, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.1997, + "step": 2496 + }, + { + "epoch": 0.2498, + "grad_norm": 9.599454879760742, + "learning_rate": 1.866374258561352e-05, + "loss": 0.4696, + "step": 2498 + }, + { + "epoch": 0.25, + "grad_norm": 0.17933738231658936, + "learning_rate": 1.866025403784439e-05, + "loss": 0.0166, + "step": 2500 + }, + { + "epoch": 0.2502, + "grad_norm": 3.293269395828247, + "learning_rate": 1.8656761269172645e-05, + "loss": 0.3994, + "step": 2502 + }, + { + "epoch": 0.2504, + "grad_norm": 1.116142749786377, + "learning_rate": 1.8653264281300622e-05, + "loss": 0.2531, + "step": 2504 + }, + { + "epoch": 0.2506, + "grad_norm": 3.7051804065704346, + "learning_rate": 1.864976307593271e-05, + "loss": 0.0713, + "step": 2506 + }, + { + "epoch": 0.2508, + "grad_norm": 3.2398624420166016, + "learning_rate": 1.864625765477535e-05, + "loss": 0.1651, + "step": 2508 + }, + { + "epoch": 0.251, + "grad_norm": 0.44193923473358154, + "learning_rate": 1.864274801953705e-05, + "loss": 0.0647, + "step": 2510 + }, + { + "epoch": 0.2512, + "grad_norm": 4.042020320892334, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.5935, + "step": 2512 + }, + { + "epoch": 0.2514, + "grad_norm": 0.14937610924243927, + "learning_rate": 1.8635716113661876e-05, + "loss": 0.1651, + "step": 2514 + }, + { + "epoch": 0.2516, + "grad_norm": 2.1045565605163574, + "learning_rate": 1.863219384645227e-05, + "loss": 0.167, + "step": 2516 + }, + { + "epoch": 0.2518, + "grad_norm": 3.2725329399108887, + "learning_rate": 1.862866737201625e-05, + "loss": 0.1275, + "step": 2518 + }, + { + "epoch": 0.252, + "grad_norm": 3.9612643718719482, + "learning_rate": 1.8625136692072577e-05, + "loss": 0.2292, + "step": 2520 + }, + { + "epoch": 0.2522, + "grad_norm": 1.4020951986312866, + "learning_rate": 1.862160180834206e-05, + "loss": 0.8248, + "step": 2522 + }, + { + "epoch": 0.2524, + "grad_norm": 5.059605598449707, + "learning_rate": 1.861806272254755e-05, + "loss": 0.1421, + "step": 2524 + }, + { + "epoch": 0.2526, + "grad_norm": 2.6634531021118164, + "learning_rate": 1.8614519436413968e-05, + "loss": 0.2388, + "step": 2526 + }, + { + "epoch": 0.2528, + "grad_norm": 3.2796871662139893, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.1549, + "step": 2528 + }, + { + "epoch": 0.253, + "grad_norm": 0.31612658500671387, + "learning_rate": 1.860742027003944e-05, + "loss": 0.0167, + "step": 2530 + }, + { + "epoch": 0.2532, + "grad_norm": 8.121637344360352, + "learning_rate": 1.8603864393258534e-05, + "loss": 0.3577, + "step": 2532 + }, + { + "epoch": 0.2534, + "grad_norm": 3.1375465393066406, + "learning_rate": 1.860030432305865e-05, + "loss": 0.8208, + "step": 2534 + }, + { + "epoch": 0.2536, + "grad_norm": 3.7270150184631348, + "learning_rate": 1.8596740061174912e-05, + "loss": 0.1235, + "step": 2536 + }, + { + "epoch": 0.2538, + "grad_norm": 0.9021314382553101, + "learning_rate": 1.8593171609344505e-05, + "loss": 0.0874, + "step": 2538 + }, + { + "epoch": 0.254, + "grad_norm": 0.6309481263160706, + "learning_rate": 1.8589598969306646e-05, + "loss": 0.1779, + "step": 2540 + }, + { + "epoch": 0.2542, + "grad_norm": 2.456204891204834, + "learning_rate": 1.8586022142802597e-05, + "loss": 0.4833, + "step": 2542 + }, + { + "epoch": 0.2544, + "grad_norm": 4.076435565948486, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.2473, + "step": 2544 + }, + { + "epoch": 0.2546, + "grad_norm": 1.524857759475708, + "learning_rate": 1.8578855937371176e-05, + "loss": 0.095, + "step": 2546 + }, + { + "epoch": 0.2548, + "grad_norm": 0.6109943389892578, + "learning_rate": 1.8575266561936526e-05, + "loss": 0.0344, + "step": 2548 + }, + { + "epoch": 0.255, + "grad_norm": 3.9478862285614014, + "learning_rate": 1.8571673007021124e-05, + "loss": 0.6644, + "step": 2550 + }, + { + "epoch": 0.2552, + "grad_norm": 2.1006197929382324, + "learning_rate": 1.856807527437643e-05, + "loss": 0.1487, + "step": 2552 + }, + { + "epoch": 0.2554, + "grad_norm": 5.350639820098877, + "learning_rate": 1.8564473365755936e-05, + "loss": 0.6361, + "step": 2554 + }, + { + "epoch": 0.2556, + "grad_norm": 0.7489902377128601, + "learning_rate": 1.8560867282915164e-05, + "loss": 0.1201, + "step": 2556 + }, + { + "epoch": 0.2558, + "grad_norm": 2.0265865325927734, + "learning_rate": 1.8557257027611677e-05, + "loss": 0.1962, + "step": 2558 + }, + { + "epoch": 0.256, + "grad_norm": 4.616270065307617, + "learning_rate": 1.855364260160507e-05, + "loss": 0.3977, + "step": 2560 + }, + { + "epoch": 0.2562, + "grad_norm": 0.9611383080482483, + "learning_rate": 1.8550024006656967e-05, + "loss": 0.0963, + "step": 2562 + }, + { + "epoch": 0.2564, + "grad_norm": 1.8750972747802734, + "learning_rate": 1.854640124453103e-05, + "loss": 0.5964, + "step": 2564 + }, + { + "epoch": 0.2566, + "grad_norm": 11.153924942016602, + "learning_rate": 1.8542774316992953e-05, + "loss": 0.4442, + "step": 2566 + }, + { + "epoch": 0.2568, + "grad_norm": 0.6988123059272766, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.3519, + "step": 2568 + }, + { + "epoch": 0.257, + "grad_norm": 1.264974594116211, + "learning_rate": 1.8535507972753275e-05, + "loss": 0.2455, + "step": 2570 + }, + { + "epoch": 0.2572, + "grad_norm": 4.065680027008057, + "learning_rate": 1.8531868559593205e-05, + "loss": 0.4812, + "step": 2572 + }, + { + "epoch": 0.2574, + "grad_norm": 2.869530200958252, + "learning_rate": 1.8528224988104044e-05, + "loss": 0.2885, + "step": 2574 + }, + { + "epoch": 0.2576, + "grad_norm": 12.683579444885254, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.2457, + "step": 2576 + }, + { + "epoch": 0.2578, + "grad_norm": 0.61281818151474, + "learning_rate": 1.8520925377243812e-05, + "loss": 0.0877, + "step": 2578 + }, + { + "epoch": 0.258, + "grad_norm": 2.710731029510498, + "learning_rate": 1.851726934143048e-05, + "loss": 0.3345, + "step": 2580 + }, + { + "epoch": 0.2582, + "grad_norm": 3.4396347999572754, + "learning_rate": 1.8513609154403535e-05, + "loss": 0.536, + "step": 2582 + }, + { + "epoch": 0.2584, + "grad_norm": 6.010830402374268, + "learning_rate": 1.850994481794692e-05, + "loss": 0.5352, + "step": 2584 + }, + { + "epoch": 0.2586, + "grad_norm": 4.101052761077881, + "learning_rate": 1.850627633384658e-05, + "loss": 0.1489, + "step": 2586 + }, + { + "epoch": 0.2588, + "grad_norm": 4.959023952484131, + "learning_rate": 1.8502603703890488e-05, + "loss": 0.2165, + "step": 2588 + }, + { + "epoch": 0.259, + "grad_norm": 2.055767297744751, + "learning_rate": 1.849892692986864e-05, + "loss": 0.102, + "step": 2590 + }, + { + "epoch": 0.2592, + "grad_norm": 1.1151221990585327, + "learning_rate": 1.8495246013573057e-05, + "loss": 0.1486, + "step": 2592 + }, + { + "epoch": 0.2594, + "grad_norm": 0.9500857591629028, + "learning_rate": 1.8491560956797766e-05, + "loss": 0.0774, + "step": 2594 + }, + { + "epoch": 0.2596, + "grad_norm": 0.7054685950279236, + "learning_rate": 1.848787176133882e-05, + "loss": 0.6057, + "step": 2596 + }, + { + "epoch": 0.2598, + "grad_norm": 2.021289348602295, + "learning_rate": 1.848417842899429e-05, + "loss": 0.1044, + "step": 2598 + }, + { + "epoch": 0.26, + "grad_norm": 1.2079353332519531, + "learning_rate": 1.848048096156426e-05, + "loss": 0.1437, + "step": 2600 + }, + { + "epoch": 0.2602, + "grad_norm": 2.195878267288208, + "learning_rate": 1.8476779360850833e-05, + "loss": 0.3016, + "step": 2602 + }, + { + "epoch": 0.2604, + "grad_norm": 1.5879536867141724, + "learning_rate": 1.8473073628658123e-05, + "loss": 0.117, + "step": 2604 + }, + { + "epoch": 0.2606, + "grad_norm": 3.478729009628296, + "learning_rate": 1.8469363766792258e-05, + "loss": 0.4095, + "step": 2606 + }, + { + "epoch": 0.2608, + "grad_norm": 3.6574208736419678, + "learning_rate": 1.8465649777061377e-05, + "loss": 0.2104, + "step": 2608 + }, + { + "epoch": 0.261, + "grad_norm": 3.8282949924468994, + "learning_rate": 1.8461931661275642e-05, + "loss": 0.1451, + "step": 2610 + }, + { + "epoch": 0.2612, + "grad_norm": 3.6004621982574463, + "learning_rate": 1.8458209421247208e-05, + "loss": 0.1364, + "step": 2612 + }, + { + "epoch": 0.2614, + "grad_norm": 1.4721136093139648, + "learning_rate": 1.8454483058790254e-05, + "loss": 0.1309, + "step": 2614 + }, + { + "epoch": 0.2616, + "grad_norm": 3.0431156158447266, + "learning_rate": 1.8450752575720967e-05, + "loss": 0.1098, + "step": 2616 + }, + { + "epoch": 0.2618, + "grad_norm": 1.8569884300231934, + "learning_rate": 1.844701797385753e-05, + "loss": 0.2693, + "step": 2618 + }, + { + "epoch": 0.262, + "grad_norm": 13.084210395812988, + "learning_rate": 1.8443279255020153e-05, + "loss": 0.5709, + "step": 2620 + }, + { + "epoch": 0.2622, + "grad_norm": 2.537609577178955, + "learning_rate": 1.8439536421031035e-05, + "loss": 0.6623, + "step": 2622 + }, + { + "epoch": 0.2624, + "grad_norm": 0.24397003650665283, + "learning_rate": 1.843578947371439e-05, + "loss": 0.0699, + "step": 2624 + }, + { + "epoch": 0.2626, + "grad_norm": 5.735052108764648, + "learning_rate": 1.8432038414896432e-05, + "loss": 0.3698, + "step": 2626 + }, + { + "epoch": 0.2628, + "grad_norm": 0.5779203772544861, + "learning_rate": 1.842828324640539e-05, + "loss": 0.1258, + "step": 2628 + }, + { + "epoch": 0.263, + "grad_norm": 7.6135406494140625, + "learning_rate": 1.842452397007148e-05, + "loss": 0.3473, + "step": 2630 + }, + { + "epoch": 0.2632, + "grad_norm": 0.24288348853588104, + "learning_rate": 1.8420760587726925e-05, + "loss": 0.4375, + "step": 2632 + }, + { + "epoch": 0.2634, + "grad_norm": 0.3086508512496948, + "learning_rate": 1.8416993101205957e-05, + "loss": 0.053, + "step": 2634 + }, + { + "epoch": 0.2636, + "grad_norm": 0.24247707426548004, + "learning_rate": 1.8413221512344805e-05, + "loss": 0.2964, + "step": 2636 + }, + { + "epoch": 0.2638, + "grad_norm": 3.4181265830993652, + "learning_rate": 1.8409445822981694e-05, + "loss": 0.3037, + "step": 2638 + }, + { + "epoch": 0.264, + "grad_norm": 4.177505016326904, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.4888, + "step": 2640 + }, + { + "epoch": 0.2642, + "grad_norm": 3.3590710163116455, + "learning_rate": 1.8401882150112485e-05, + "loss": 0.1655, + "step": 2642 + }, + { + "epoch": 0.2644, + "grad_norm": 2.6430881023406982, + "learning_rate": 1.839809417029283e-05, + "loss": 0.0775, + "step": 2644 + }, + { + "epoch": 0.2646, + "grad_norm": 4.68912935256958, + "learning_rate": 1.8394302097344103e-05, + "loss": 0.308, + "step": 2646 + }, + { + "epoch": 0.2648, + "grad_norm": 1.9956631660461426, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.1043, + "step": 2648 + }, + { + "epoch": 0.265, + "grad_norm": 3.1871337890625, + "learning_rate": 1.8386705679454243e-05, + "loss": 0.1459, + "step": 2650 + }, + { + "epoch": 0.2652, + "grad_norm": 2.985778570175171, + "learning_rate": 1.8382901338215515e-05, + "loss": 0.1399, + "step": 2652 + }, + { + "epoch": 0.2654, + "grad_norm": 1.06570303440094, + "learning_rate": 1.8379092911252515e-05, + "loss": 0.0783, + "step": 2654 + }, + { + "epoch": 0.2656, + "grad_norm": 10.251187324523926, + "learning_rate": 1.837528040042142e-05, + "loss": 0.5091, + "step": 2656 + }, + { + "epoch": 0.2658, + "grad_norm": 0.11516684293746948, + "learning_rate": 1.83714638075804e-05, + "loss": 0.0308, + "step": 2658 + }, + { + "epoch": 0.266, + "grad_norm": 3.02200984954834, + "learning_rate": 1.836764313458962e-05, + "loss": 0.3531, + "step": 2660 + }, + { + "epoch": 0.2662, + "grad_norm": 3.413304328918457, + "learning_rate": 1.8363818383311226e-05, + "loss": 0.3046, + "step": 2662 + }, + { + "epoch": 0.2664, + "grad_norm": 0.218239888548851, + "learning_rate": 1.8359989555609355e-05, + "loss": 0.0191, + "step": 2664 + }, + { + "epoch": 0.2666, + "grad_norm": 1.0864304304122925, + "learning_rate": 1.8356156653350138e-05, + "loss": 0.211, + "step": 2666 + }, + { + "epoch": 0.2668, + "grad_norm": 4.5643839836120605, + "learning_rate": 1.8352319678401677e-05, + "loss": 0.2343, + "step": 2668 + }, + { + "epoch": 0.267, + "grad_norm": 0.9928192496299744, + "learning_rate": 1.8348478632634067e-05, + "loss": 0.1405, + "step": 2670 + }, + { + "epoch": 0.2672, + "grad_norm": 0.9407457709312439, + "learning_rate": 1.834463351791939e-05, + "loss": 0.0384, + "step": 2672 + }, + { + "epoch": 0.2674, + "grad_norm": 0.21397624909877777, + "learning_rate": 1.8340784336131715e-05, + "loss": 0.1285, + "step": 2674 + }, + { + "epoch": 0.2676, + "grad_norm": 8.079825401306152, + "learning_rate": 1.8336931089147076e-05, + "loss": 0.8778, + "step": 2676 + }, + { + "epoch": 0.2678, + "grad_norm": 0.681880533695221, + "learning_rate": 1.83330737788435e-05, + "loss": 0.0269, + "step": 2678 + }, + { + "epoch": 0.268, + "grad_norm": 4.60369873046875, + "learning_rate": 1.8329212407100996e-05, + "loss": 0.2473, + "step": 2680 + }, + { + "epoch": 0.2682, + "grad_norm": 0.333812415599823, + "learning_rate": 1.832534697580155e-05, + "loss": 0.118, + "step": 2682 + }, + { + "epoch": 0.2684, + "grad_norm": 4.275298595428467, + "learning_rate": 1.8321477486829128e-05, + "loss": 0.2421, + "step": 2684 + }, + { + "epoch": 0.2686, + "grad_norm": 2.6665358543395996, + "learning_rate": 1.8317603942069665e-05, + "loss": 0.1535, + "step": 2686 + }, + { + "epoch": 0.2688, + "grad_norm": 12.264452934265137, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.6006, + "step": 2688 + }, + { + "epoch": 0.269, + "grad_norm": 7.388026714324951, + "learning_rate": 1.8309844692743283e-05, + "loss": 0.3705, + "step": 2690 + }, + { + "epoch": 0.2692, + "grad_norm": 0.12849552929401398, + "learning_rate": 1.830595899195813e-05, + "loss": 0.1132, + "step": 2692 + }, + { + "epoch": 0.2694, + "grad_norm": 7.135030746459961, + "learning_rate": 1.830206924294946e-05, + "loss": 0.4163, + "step": 2694 + }, + { + "epoch": 0.2696, + "grad_norm": 10.465075492858887, + "learning_rate": 1.82981754476131e-05, + "loss": 0.1827, + "step": 2696 + }, + { + "epoch": 0.2698, + "grad_norm": 2.744213342666626, + "learning_rate": 1.8294277607846834e-05, + "loss": 0.237, + "step": 2698 + }, + { + "epoch": 0.27, + "grad_norm": 8.057718276977539, + "learning_rate": 1.8290375725550417e-05, + "loss": 0.4172, + "step": 2700 + }, + { + "epoch": 0.2702, + "grad_norm": 1.7675005197525024, + "learning_rate": 1.828646980262559e-05, + "loss": 0.1601, + "step": 2702 + }, + { + "epoch": 0.2704, + "grad_norm": 0.918673038482666, + "learning_rate": 1.8282559840976043e-05, + "loss": 0.1499, + "step": 2704 + }, + { + "epoch": 0.2706, + "grad_norm": 0.34883373975753784, + "learning_rate": 1.8278645842507448e-05, + "loss": 0.0566, + "step": 2706 + }, + { + "epoch": 0.2708, + "grad_norm": 6.1770758628845215, + "learning_rate": 1.827472780912744e-05, + "loss": 0.2398, + "step": 2708 + }, + { + "epoch": 0.271, + "grad_norm": 10.502192497253418, + "learning_rate": 1.827080574274562e-05, + "loss": 0.3826, + "step": 2710 + }, + { + "epoch": 0.2712, + "grad_norm": 1.2738440036773682, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.0405, + "step": 2712 + }, + { + "epoch": 0.2714, + "grad_norm": 2.473457098007202, + "learning_rate": 1.826294951862478e-05, + "loss": 0.3268, + "step": 2714 + }, + { + "epoch": 0.2716, + "grad_norm": 1.4379875659942627, + "learning_rate": 1.8259015364714786e-05, + "loss": 0.1772, + "step": 2716 + }, + { + "epoch": 0.2718, + "grad_norm": 0.387317031621933, + "learning_rate": 1.825507718546104e-05, + "loss": 0.0211, + "step": 2718 + }, + { + "epoch": 0.272, + "grad_norm": 2.845128059387207, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.4719, + "step": 2720 + }, + { + "epoch": 0.2722, + "grad_norm": 0.22480414807796478, + "learning_rate": 1.8247188758601912e-05, + "loss": 0.0347, + "step": 2722 + }, + { + "epoch": 0.2724, + "grad_norm": 2.8036043643951416, + "learning_rate": 1.824323851484126e-05, + "loss": 0.1484, + "step": 2724 + }, + { + "epoch": 0.2726, + "grad_norm": 0.08101589232683182, + "learning_rate": 1.8239284253426294e-05, + "loss": 0.0392, + "step": 2726 + }, + { + "epoch": 0.2728, + "grad_norm": 30.116926193237305, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.8905, + "step": 2728 + }, + { + "epoch": 0.273, + "grad_norm": 1.6454282999038696, + "learning_rate": 1.8231363685344422e-05, + "loss": 0.0559, + "step": 2730 + }, + { + "epoch": 0.2732, + "grad_norm": 3.2071683406829834, + "learning_rate": 1.82273973825379e-05, + "loss": 0.2473, + "step": 2732 + }, + { + "epoch": 0.2734, + "grad_norm": 10.213172912597656, + "learning_rate": 1.8223427069797845e-05, + "loss": 0.2269, + "step": 2734 + }, + { + "epoch": 0.2736, + "grad_norm": 7.517613410949707, + "learning_rate": 1.8219452749059332e-05, + "loss": 0.3452, + "step": 2736 + }, + { + "epoch": 0.2738, + "grad_norm": 1.1272602081298828, + "learning_rate": 1.8215474422259403e-05, + "loss": 0.1776, + "step": 2738 + }, + { + "epoch": 0.274, + "grad_norm": 0.9250140190124512, + "learning_rate": 1.821149209133704e-05, + "loss": 0.1877, + "step": 2740 + }, + { + "epoch": 0.2742, + "grad_norm": 1.1862109899520874, + "learning_rate": 1.820750575823319e-05, + "loss": 0.0252, + "step": 2742 + }, + { + "epoch": 0.2744, + "grad_norm": 5.114722728729248, + "learning_rate": 1.8203515424890738e-05, + "loss": 0.5117, + "step": 2744 + }, + { + "epoch": 0.2746, + "grad_norm": 7.5531086921691895, + "learning_rate": 1.8199521093254524e-05, + "loss": 0.8118, + "step": 2746 + }, + { + "epoch": 0.2748, + "grad_norm": 19.06239128112793, + "learning_rate": 1.819552276527134e-05, + "loss": 0.7622, + "step": 2748 + }, + { + "epoch": 0.275, + "grad_norm": 2.5143494606018066, + "learning_rate": 1.819152044288992e-05, + "loss": 0.2913, + "step": 2750 + }, + { + "epoch": 0.2752, + "grad_norm": 1.308100938796997, + "learning_rate": 1.8187514128060946e-05, + "loss": 0.1769, + "step": 2752 + }, + { + "epoch": 0.2754, + "grad_norm": 6.6905837059021, + "learning_rate": 1.818350382273705e-05, + "loss": 0.7301, + "step": 2754 + }, + { + "epoch": 0.2756, + "grad_norm": 1.9593067169189453, + "learning_rate": 1.8179489528872808e-05, + "loss": 0.4642, + "step": 2756 + }, + { + "epoch": 0.2758, + "grad_norm": 7.618997573852539, + "learning_rate": 1.817547124842473e-05, + "loss": 0.3285, + "step": 2758 + }, + { + "epoch": 0.276, + "grad_norm": 4.072129726409912, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.1919, + "step": 2760 + }, + { + "epoch": 0.2762, + "grad_norm": 0.42512863874435425, + "learning_rate": 1.8167422735612877e-05, + "loss": 0.1735, + "step": 2762 + }, + { + "epoch": 0.2764, + "grad_norm": 3.5821197032928467, + "learning_rate": 1.816339250717184e-05, + "loss": 0.31, + "step": 2764 + }, + { + "epoch": 0.2766, + "grad_norm": 1.7279516458511353, + "learning_rate": 1.815935829999247e-05, + "loss": 0.2082, + "step": 2766 + }, + { + "epoch": 0.2768, + "grad_norm": 3.1246533393859863, + "learning_rate": 1.8155320116040983e-05, + "loss": 0.5841, + "step": 2768 + }, + { + "epoch": 0.277, + "grad_norm": 5.0204033851623535, + "learning_rate": 1.815127795728554e-05, + "loss": 0.3567, + "step": 2770 + }, + { + "epoch": 0.2772, + "grad_norm": 4.998348236083984, + "learning_rate": 1.814723182569625e-05, + "loss": 0.3827, + "step": 2772 + }, + { + "epoch": 0.2774, + "grad_norm": 1.3034894466400146, + "learning_rate": 1.814318172324514e-05, + "loss": 0.135, + "step": 2774 + }, + { + "epoch": 0.2776, + "grad_norm": 2.203009605407715, + "learning_rate": 1.8139127651906183e-05, + "loss": 0.2641, + "step": 2776 + }, + { + "epoch": 0.2778, + "grad_norm": 0.6253824234008789, + "learning_rate": 1.813506961365528e-05, + "loss": 0.2624, + "step": 2778 + }, + { + "epoch": 0.278, + "grad_norm": 3.8880536556243896, + "learning_rate": 1.8131007610470278e-05, + "loss": 0.6378, + "step": 2780 + }, + { + "epoch": 0.2782, + "grad_norm": 4.024691581726074, + "learning_rate": 1.812694164433094e-05, + "loss": 0.435, + "step": 2782 + }, + { + "epoch": 0.2784, + "grad_norm": 2.6701204776763916, + "learning_rate": 1.812287171721897e-05, + "loss": 0.2644, + "step": 2784 + }, + { + "epoch": 0.2786, + "grad_norm": 2.7724533081054688, + "learning_rate": 1.811879783111801e-05, + "loss": 0.2479, + "step": 2786 + }, + { + "epoch": 0.2788, + "grad_norm": 3.696347713470459, + "learning_rate": 1.8114719988013612e-05, + "loss": 0.3645, + "step": 2788 + }, + { + "epoch": 0.279, + "grad_norm": 3.484485387802124, + "learning_rate": 1.8110638189893267e-05, + "loss": 0.5195, + "step": 2790 + }, + { + "epoch": 0.2792, + "grad_norm": 1.3852870464324951, + "learning_rate": 1.81065524387464e-05, + "loss": 0.196, + "step": 2792 + }, + { + "epoch": 0.2794, + "grad_norm": 2.3284411430358887, + "learning_rate": 1.8102462736564355e-05, + "loss": 0.2065, + "step": 2794 + }, + { + "epoch": 0.2796, + "grad_norm": 3.9490487575531006, + "learning_rate": 1.80983690853404e-05, + "loss": 0.2899, + "step": 2796 + }, + { + "epoch": 0.2798, + "grad_norm": 2.1606624126434326, + "learning_rate": 1.8094271487069733e-05, + "loss": 0.35, + "step": 2798 + }, + { + "epoch": 0.28, + "grad_norm": 1.99848473072052, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.2914, + "step": 2800 + }, + { + "epoch": 0.2802, + "grad_norm": 1.7992031574249268, + "learning_rate": 1.8086064457378667e-05, + "loss": 0.2941, + "step": 2802 + }, + { + "epoch": 0.2804, + "grad_norm": 3.0396595001220703, + "learning_rate": 1.8081955029958272e-05, + "loss": 0.2792, + "step": 2804 + }, + { + "epoch": 0.2806, + "grad_norm": 5.012045860290527, + "learning_rate": 1.8077841663491174e-05, + "loss": 0.5432, + "step": 2806 + }, + { + "epoch": 0.2808, + "grad_norm": 0.9668693542480469, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.1341, + "step": 2808 + }, + { + "epoch": 0.281, + "grad_norm": 5.441640377044678, + "learning_rate": 1.806960312143802e-05, + "loss": 0.5328, + "step": 2810 + }, + { + "epoch": 0.2812, + "grad_norm": 5.7581915855407715, + "learning_rate": 1.8065477949867327e-05, + "loss": 0.4756, + "step": 2812 + }, + { + "epoch": 0.2814, + "grad_norm": 1.5176331996917725, + "learning_rate": 1.806134884728066e-05, + "loss": 0.2715, + "step": 2814 + }, + { + "epoch": 0.2816, + "grad_norm": 1.4910598993301392, + "learning_rate": 1.8057215815690494e-05, + "loss": 0.199, + "step": 2816 + }, + { + "epoch": 0.2818, + "grad_norm": 0.67167729139328, + "learning_rate": 1.8053078857111218e-05, + "loss": 0.1406, + "step": 2818 + }, + { + "epoch": 0.282, + "grad_norm": 2.93373441696167, + "learning_rate": 1.804893797355914e-05, + "loss": 0.3527, + "step": 2820 + }, + { + "epoch": 0.2822, + "grad_norm": 2.3212578296661377, + "learning_rate": 1.8044793167052476e-05, + "loss": 0.291, + "step": 2822 + }, + { + "epoch": 0.2824, + "grad_norm": 2.2266130447387695, + "learning_rate": 1.8040644439611348e-05, + "loss": 0.3054, + "step": 2824 + }, + { + "epoch": 0.2826, + "grad_norm": 2.300508975982666, + "learning_rate": 1.80364917932578e-05, + "loss": 0.172, + "step": 2826 + }, + { + "epoch": 0.2828, + "grad_norm": 4.062469482421875, + "learning_rate": 1.803233523001578e-05, + "loss": 0.346, + "step": 2828 + }, + { + "epoch": 0.283, + "grad_norm": 5.238489151000977, + "learning_rate": 1.8028174751911147e-05, + "loss": 0.3054, + "step": 2830 + }, + { + "epoch": 0.2832, + "grad_norm": 2.0214619636535645, + "learning_rate": 1.802401036097167e-05, + "loss": 0.4486, + "step": 2832 + }, + { + "epoch": 0.2834, + "grad_norm": 2.875058650970459, + "learning_rate": 1.801984205922701e-05, + "loss": 0.4042, + "step": 2834 + }, + { + "epoch": 0.2836, + "grad_norm": 2.097720146179199, + "learning_rate": 1.8015669848708768e-05, + "loss": 0.1986, + "step": 2836 + }, + { + "epoch": 0.2838, + "grad_norm": 3.98968768119812, + "learning_rate": 1.8011493731450412e-05, + "loss": 0.3101, + "step": 2838 + }, + { + "epoch": 0.284, + "grad_norm": 1.5358871221542358, + "learning_rate": 1.8007313709487334e-05, + "loss": 0.1871, + "step": 2840 + }, + { + "epoch": 0.2842, + "grad_norm": 3.126748561859131, + "learning_rate": 1.8003129784856832e-05, + "loss": 0.3711, + "step": 2842 + }, + { + "epoch": 0.2844, + "grad_norm": 5.3859663009643555, + "learning_rate": 1.7998941959598097e-05, + "loss": 0.3809, + "step": 2844 + }, + { + "epoch": 0.2846, + "grad_norm": 1.550809621810913, + "learning_rate": 1.799475023575222e-05, + "loss": 0.115, + "step": 2846 + }, + { + "epoch": 0.2848, + "grad_norm": 2.163062334060669, + "learning_rate": 1.79905546153622e-05, + "loss": 0.3043, + "step": 2848 + }, + { + "epoch": 0.285, + "grad_norm": 3.707369089126587, + "learning_rate": 1.798635510047293e-05, + "loss": 0.3336, + "step": 2850 + }, + { + "epoch": 0.2852, + "grad_norm": 2.546422004699707, + "learning_rate": 1.7982151693131206e-05, + "loss": 0.24, + "step": 2852 + }, + { + "epoch": 0.2854, + "grad_norm": 1.8271149396896362, + "learning_rate": 1.7977944395385713e-05, + "loss": 0.1079, + "step": 2854 + }, + { + "epoch": 0.2856, + "grad_norm": 2.235872983932495, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.2626, + "step": 2856 + }, + { + "epoch": 0.2858, + "grad_norm": 1.9475841522216797, + "learning_rate": 1.7969518136887664e-05, + "loss": 0.4162, + "step": 2858 + }, + { + "epoch": 0.286, + "grad_norm": 12.460562705993652, + "learning_rate": 1.7965299180241963e-05, + "loss": 0.4977, + "step": 2860 + }, + { + "epoch": 0.2862, + "grad_norm": 1.9023628234863281, + "learning_rate": 1.796107634140621e-05, + "loss": 0.1802, + "step": 2862 + }, + { + "epoch": 0.2864, + "grad_norm": 6.973464012145996, + "learning_rate": 1.7956849622438554e-05, + "loss": 0.2649, + "step": 2864 + }, + { + "epoch": 0.2866, + "grad_norm": 2.3440234661102295, + "learning_rate": 1.795261902539906e-05, + "loss": 0.4795, + "step": 2866 + }, + { + "epoch": 0.2868, + "grad_norm": 0.9532572031021118, + "learning_rate": 1.794838455234966e-05, + "loss": 0.0521, + "step": 2868 + }, + { + "epoch": 0.287, + "grad_norm": 2.5809245109558105, + "learning_rate": 1.7944146205354182e-05, + "loss": 0.2669, + "step": 2870 + }, + { + "epoch": 0.2872, + "grad_norm": 0.8381490111351013, + "learning_rate": 1.7939903986478354e-05, + "loss": 0.1954, + "step": 2872 + }, + { + "epoch": 0.2874, + "grad_norm": 3.8047308921813965, + "learning_rate": 1.793565789778978e-05, + "loss": 0.1407, + "step": 2874 + }, + { + "epoch": 0.2876, + "grad_norm": 7.672700881958008, + "learning_rate": 1.793140794135795e-05, + "loss": 0.3965, + "step": 2876 + }, + { + "epoch": 0.2878, + "grad_norm": 1.6298463344573975, + "learning_rate": 1.7927154119254234e-05, + "loss": 0.3281, + "step": 2878 + }, + { + "epoch": 0.288, + "grad_norm": 3.7426095008850098, + "learning_rate": 1.792289643355191e-05, + "loss": 0.2764, + "step": 2880 + }, + { + "epoch": 0.2882, + "grad_norm": 1.113887071609497, + "learning_rate": 1.791863488632611e-05, + "loss": 0.1367, + "step": 2882 + }, + { + "epoch": 0.2884, + "grad_norm": 1.272702932357788, + "learning_rate": 1.7914369479653858e-05, + "loss": 0.3475, + "step": 2884 + }, + { + "epoch": 0.2886, + "grad_norm": 4.129271030426025, + "learning_rate": 1.791010021561407e-05, + "loss": 0.3334, + "step": 2886 + }, + { + "epoch": 0.2888, + "grad_norm": 0.776145339012146, + "learning_rate": 1.7905827096287532e-05, + "loss": 0.4549, + "step": 2888 + }, + { + "epoch": 0.289, + "grad_norm": 9.77563190460205, + "learning_rate": 1.7901550123756906e-05, + "loss": 0.3053, + "step": 2890 + }, + { + "epoch": 0.2892, + "grad_norm": 0.27968287467956543, + "learning_rate": 1.789726930010674e-05, + "loss": 0.0412, + "step": 2892 + }, + { + "epoch": 0.2894, + "grad_norm": 2.696039915084839, + "learning_rate": 1.789298462742345e-05, + "loss": 0.1878, + "step": 2894 + }, + { + "epoch": 0.2896, + "grad_norm": 3.5772898197174072, + "learning_rate": 1.7888696107795343e-05, + "loss": 0.3708, + "step": 2896 + }, + { + "epoch": 0.2898, + "grad_norm": 12.661288261413574, + "learning_rate": 1.7884403743312583e-05, + "loss": 0.7315, + "step": 2898 + }, + { + "epoch": 0.29, + "grad_norm": 3.7118217945098877, + "learning_rate": 1.788010753606722e-05, + "loss": 0.3332, + "step": 2900 + }, + { + "epoch": 0.2902, + "grad_norm": 4.836172580718994, + "learning_rate": 1.7875807488153173e-05, + "loss": 0.4514, + "step": 2902 + }, + { + "epoch": 0.2904, + "grad_norm": 2.0384280681610107, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.2527, + "step": 2904 + }, + { + "epoch": 0.2906, + "grad_norm": 3.133139133453369, + "learning_rate": 1.7867195878704062e-05, + "loss": 0.2019, + "step": 2906 + }, + { + "epoch": 0.2908, + "grad_norm": 5.681478977203369, + "learning_rate": 1.786288432136619e-05, + "loss": 0.3337, + "step": 2908 + }, + { + "epoch": 0.291, + "grad_norm": 0.49043047428131104, + "learning_rate": 1.785856893175402e-05, + "loss": 0.553, + "step": 2910 + }, + { + "epoch": 0.2912, + "grad_norm": 9.701278686523438, + "learning_rate": 1.785424971197082e-05, + "loss": 0.5853, + "step": 2912 + }, + { + "epoch": 0.2914, + "grad_norm": 4.400559902191162, + "learning_rate": 1.7849926664121726e-05, + "loss": 0.65, + "step": 2914 + }, + { + "epoch": 0.2916, + "grad_norm": 1.6105750799179077, + "learning_rate": 1.7845599790313735e-05, + "loss": 0.2619, + "step": 2916 + }, + { + "epoch": 0.2918, + "grad_norm": 1.8967746496200562, + "learning_rate": 1.7841269092655714e-05, + "loss": 0.3451, + "step": 2918 + }, + { + "epoch": 0.292, + "grad_norm": 0.6434007287025452, + "learning_rate": 1.78369345732584e-05, + "loss": 0.0525, + "step": 2920 + }, + { + "epoch": 0.2922, + "grad_norm": 0.34453585743904114, + "learning_rate": 1.7832596234234376e-05, + "loss": 0.4628, + "step": 2922 + }, + { + "epoch": 0.2924, + "grad_norm": 1.4182170629501343, + "learning_rate": 1.78282540776981e-05, + "loss": 0.2698, + "step": 2924 + }, + { + "epoch": 0.2926, + "grad_norm": 3.1153039932250977, + "learning_rate": 1.7823908105765883e-05, + "loss": 0.6024, + "step": 2926 + }, + { + "epoch": 0.2928, + "grad_norm": 1.7974677085876465, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.1241, + "step": 2928 + }, + { + "epoch": 0.293, + "grad_norm": 4.069811820983887, + "learning_rate": 1.781520472418819e-05, + "loss": 0.3049, + "step": 2930 + }, + { + "epoch": 0.2932, + "grad_norm": 2.902940511703491, + "learning_rate": 1.7810847318784632e-05, + "loss": 0.3398, + "step": 2932 + }, + { + "epoch": 0.2934, + "grad_norm": 1.2253791093826294, + "learning_rate": 1.7806486106468983e-05, + "loss": 0.2873, + "step": 2934 + }, + { + "epoch": 0.2936, + "grad_norm": 3.0444533824920654, + "learning_rate": 1.780212108936684e-05, + "loss": 0.4166, + "step": 2936 + }, + { + "epoch": 0.2938, + "grad_norm": 0.773672342300415, + "learning_rate": 1.7797752269605654e-05, + "loss": 0.078, + "step": 2938 + }, + { + "epoch": 0.294, + "grad_norm": 2.4871091842651367, + "learning_rate": 1.7793379649314743e-05, + "loss": 0.2172, + "step": 2940 + }, + { + "epoch": 0.2942, + "grad_norm": 3.405329465866089, + "learning_rate": 1.7789003230625266e-05, + "loss": 0.3668, + "step": 2942 + }, + { + "epoch": 0.2944, + "grad_norm": 1.986344575881958, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.283, + "step": 2944 + }, + { + "epoch": 0.2946, + "grad_norm": 2.402453660964966, + "learning_rate": 1.7780239006584515e-05, + "loss": 0.24, + "step": 2946 + }, + { + "epoch": 0.2948, + "grad_norm": 1.1132586002349854, + "learning_rate": 1.7775851205504823e-05, + "loss": 0.2025, + "step": 2948 + }, + { + "epoch": 0.295, + "grad_norm": 2.0582642555236816, + "learning_rate": 1.777145961456971e-05, + "loss": 0.3532, + "step": 2950 + }, + { + "epoch": 0.2952, + "grad_norm": 6.109958171844482, + "learning_rate": 1.7767064235919594e-05, + "loss": 0.5146, + "step": 2952 + }, + { + "epoch": 0.2954, + "grad_norm": 3.3962695598602295, + "learning_rate": 1.776266507169672e-05, + "loss": 0.2486, + "step": 2954 + }, + { + "epoch": 0.2956, + "grad_norm": 1.3351765871047974, + "learning_rate": 1.7758262124045195e-05, + "loss": 0.2825, + "step": 2956 + }, + { + "epoch": 0.2958, + "grad_norm": 1.4624029397964478, + "learning_rate": 1.775385539511096e-05, + "loss": 0.2314, + "step": 2958 + }, + { + "epoch": 0.296, + "grad_norm": 4.090035915374756, + "learning_rate": 1.7749444887041797e-05, + "loss": 0.3225, + "step": 2960 + }, + { + "epoch": 0.2962, + "grad_norm": 1.4231541156768799, + "learning_rate": 1.7745030601987338e-05, + "loss": 0.1721, + "step": 2962 + }, + { + "epoch": 0.2964, + "grad_norm": 5.056805610656738, + "learning_rate": 1.7740612542099054e-05, + "loss": 0.4595, + "step": 2964 + }, + { + "epoch": 0.2966, + "grad_norm": 3.250394821166992, + "learning_rate": 1.773619070953025e-05, + "loss": 0.206, + "step": 2966 + }, + { + "epoch": 0.2968, + "grad_norm": 0.8660505414009094, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.1521, + "step": 2968 + }, + { + "epoch": 0.297, + "grad_norm": 1.8636387586593628, + "learning_rate": 1.7727335734973512e-05, + "loss": 0.1836, + "step": 2970 + }, + { + "epoch": 0.2972, + "grad_norm": 4.643412113189697, + "learning_rate": 1.7722902597301385e-05, + "loss": 0.1407, + "step": 2972 + }, + { + "epoch": 0.2974, + "grad_norm": 1.1150456666946411, + "learning_rate": 1.771846569558035e-05, + "loss": 0.1831, + "step": 2974 + }, + { + "epoch": 0.2976, + "grad_norm": 2.3759777545928955, + "learning_rate": 1.7714025031972904e-05, + "loss": 0.1285, + "step": 2976 + }, + { + "epoch": 0.2978, + "grad_norm": 1.7916673421859741, + "learning_rate": 1.7709580608643364e-05, + "loss": 0.0837, + "step": 2978 + }, + { + "epoch": 0.298, + "grad_norm": 1.0765917301177979, + "learning_rate": 1.7705132427757895e-05, + "loss": 0.1999, + "step": 2980 + }, + { + "epoch": 0.2982, + "grad_norm": 1.5508619546890259, + "learning_rate": 1.770068049148448e-05, + "loss": 0.3139, + "step": 2982 + }, + { + "epoch": 0.2984, + "grad_norm": 12.907669067382812, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.4983, + "step": 2984 + }, + { + "epoch": 0.2986, + "grad_norm": 4.619217872619629, + "learning_rate": 1.769176536145494e-05, + "loss": 0.2586, + "step": 2986 + }, + { + "epoch": 0.2988, + "grad_norm": 2.5080182552337646, + "learning_rate": 1.7687302172043933e-05, + "loss": 0.2059, + "step": 2988 + }, + { + "epoch": 0.299, + "grad_norm": 1.2511807680130005, + "learning_rate": 1.7682835235935236e-05, + "loss": 0.0525, + "step": 2990 + }, + { + "epoch": 0.2992, + "grad_norm": 0.4213349223136902, + "learning_rate": 1.767836455530598e-05, + "loss": 0.3435, + "step": 2992 + }, + { + "epoch": 0.2994, + "grad_norm": 0.5199392437934875, + "learning_rate": 1.767389013233511e-05, + "loss": 0.0413, + "step": 2994 + }, + { + "epoch": 0.2996, + "grad_norm": 0.1493908017873764, + "learning_rate": 1.7669411969203417e-05, + "loss": 0.2308, + "step": 2996 + }, + { + "epoch": 0.2998, + "grad_norm": 1.2655545473098755, + "learning_rate": 1.76649300680935e-05, + "loss": 0.3616, + "step": 2998 + }, + { + "epoch": 0.3, + "grad_norm": 15.426005363464355, + "learning_rate": 1.766044443118978e-05, + "loss": 0.4761, + "step": 3000 + }, + { + "epoch": 0.3002, + "grad_norm": 0.12226205319166183, + "learning_rate": 1.7655955060678508e-05, + "loss": 0.0079, + "step": 3002 + }, + { + "epoch": 0.3004, + "grad_norm": 7.998121738433838, + "learning_rate": 1.7651461958747745e-05, + "loss": 0.534, + "step": 3004 + }, + { + "epoch": 0.3006, + "grad_norm": 9.500176429748535, + "learning_rate": 1.7646965127587373e-05, + "loss": 0.2503, + "step": 3006 + }, + { + "epoch": 0.3008, + "grad_norm": 0.16198256611824036, + "learning_rate": 1.764246456938909e-05, + "loss": 0.0702, + "step": 3008 + }, + { + "epoch": 0.301, + "grad_norm": 0.3742484748363495, + "learning_rate": 1.7637960286346423e-05, + "loss": 0.0841, + "step": 3010 + }, + { + "epoch": 0.3012, + "grad_norm": 1.5521295070648193, + "learning_rate": 1.76334522806547e-05, + "loss": 0.1713, + "step": 3012 + }, + { + "epoch": 0.3014, + "grad_norm": 7.680184364318848, + "learning_rate": 1.7628940554511064e-05, + "loss": 0.3485, + "step": 3014 + }, + { + "epoch": 0.3016, + "grad_norm": 1.2801295518875122, + "learning_rate": 1.762442511011448e-05, + "loss": 0.1394, + "step": 3016 + }, + { + "epoch": 0.3018, + "grad_norm": 1.2674964666366577, + "learning_rate": 1.761990594966572e-05, + "loss": 0.0929, + "step": 3018 + }, + { + "epoch": 0.302, + "grad_norm": 0.7616429924964905, + "learning_rate": 1.761538307536737e-05, + "loss": 0.0545, + "step": 3020 + }, + { + "epoch": 0.3022, + "grad_norm": 0.02697041630744934, + "learning_rate": 1.761085648942382e-05, + "loss": 0.0134, + "step": 3022 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4015868604183197, + "learning_rate": 1.7606326194041274e-05, + "loss": 0.0608, + "step": 3024 + }, + { + "epoch": 0.3026, + "grad_norm": 0.45227134227752686, + "learning_rate": 1.760179219142774e-05, + "loss": 0.0304, + "step": 3026 + }, + { + "epoch": 0.3028, + "grad_norm": 13.296224594116211, + "learning_rate": 1.759725448379305e-05, + "loss": 1.2617, + "step": 3028 + }, + { + "epoch": 0.303, + "grad_norm": 1.1419142484664917, + "learning_rate": 1.759271307334881e-05, + "loss": 0.1038, + "step": 3030 + }, + { + "epoch": 0.3032, + "grad_norm": 0.5938009023666382, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.0384, + "step": 3032 + }, + { + "epoch": 0.3034, + "grad_norm": 7.754266738891602, + "learning_rate": 1.7583619152887222e-05, + "loss": 0.2409, + "step": 3034 + }, + { + "epoch": 0.3036, + "grad_norm": 14.645008087158203, + "learning_rate": 1.7579066647302134e-05, + "loss": 0.2994, + "step": 3036 + }, + { + "epoch": 0.3038, + "grad_norm": 6.396594524383545, + "learning_rate": 1.757451044777204e-05, + "loss": 0.188, + "step": 3038 + }, + { + "epoch": 0.304, + "grad_norm": 9.380064010620117, + "learning_rate": 1.7569950556517566e-05, + "loss": 0.3296, + "step": 3040 + }, + { + "epoch": 0.3042, + "grad_norm": 0.5132370591163635, + "learning_rate": 1.756538697576115e-05, + "loss": 0.046, + "step": 3042 + }, + { + "epoch": 0.3044, + "grad_norm": 7.086660385131836, + "learning_rate": 1.7560819707727034e-05, + "loss": 0.5502, + "step": 3044 + }, + { + "epoch": 0.3046, + "grad_norm": 7.914719581604004, + "learning_rate": 1.7556248754641237e-05, + "loss": 0.185, + "step": 3046 + }, + { + "epoch": 0.3048, + "grad_norm": 1.2060405015945435, + "learning_rate": 1.7551674118731592e-05, + "loss": 0.0744, + "step": 3048 + }, + { + "epoch": 0.305, + "grad_norm": 1.1407266855239868, + "learning_rate": 1.7547095802227723e-05, + "loss": 0.0768, + "step": 3050 + }, + { + "epoch": 0.3052, + "grad_norm": 5.713343143463135, + "learning_rate": 1.754251380736104e-05, + "loss": 0.2001, + "step": 3052 + }, + { + "epoch": 0.3054, + "grad_norm": 2.7940566539764404, + "learning_rate": 1.7537928136364756e-05, + "loss": 0.161, + "step": 3054 + }, + { + "epoch": 0.3056, + "grad_norm": 9.509251594543457, + "learning_rate": 1.7533338791473872e-05, + "loss": 0.5435, + "step": 3056 + }, + { + "epoch": 0.3058, + "grad_norm": 1.95154869556427, + "learning_rate": 1.7528745774925175e-05, + "loss": 0.0854, + "step": 3058 + }, + { + "epoch": 0.306, + "grad_norm": 0.5906040072441101, + "learning_rate": 1.7524149088957244e-05, + "loss": 0.0576, + "step": 3060 + }, + { + "epoch": 0.3062, + "grad_norm": 5.8458123207092285, + "learning_rate": 1.7519548735810456e-05, + "loss": 0.3191, + "step": 3062 + }, + { + "epoch": 0.3064, + "grad_norm": 0.5129405856132507, + "learning_rate": 1.7514944717726962e-05, + "loss": 0.2281, + "step": 3064 + }, + { + "epoch": 0.3066, + "grad_norm": 12.572365760803223, + "learning_rate": 1.7510337036950703e-05, + "loss": 0.2576, + "step": 3066 + }, + { + "epoch": 0.3068, + "grad_norm": 11.874238967895508, + "learning_rate": 1.7505725695727414e-05, + "loss": 0.3525, + "step": 3068 + }, + { + "epoch": 0.307, + "grad_norm": 19.977888107299805, + "learning_rate": 1.7501110696304598e-05, + "loss": 0.6393, + "step": 3070 + }, + { + "epoch": 0.3072, + "grad_norm": 6.059708595275879, + "learning_rate": 1.749649204093155e-05, + "loss": 0.7491, + "step": 3072 + }, + { + "epoch": 0.3074, + "grad_norm": 9.04761028289795, + "learning_rate": 1.7491869731859353e-05, + "loss": 0.8544, + "step": 3074 + }, + { + "epoch": 0.3076, + "grad_norm": 7.814766883850098, + "learning_rate": 1.7487243771340862e-05, + "loss": 0.6478, + "step": 3076 + }, + { + "epoch": 0.3078, + "grad_norm": 6.138884544372559, + "learning_rate": 1.7482614161630714e-05, + "loss": 0.2966, + "step": 3078 + }, + { + "epoch": 0.308, + "grad_norm": 4.770823001861572, + "learning_rate": 1.747798090498532e-05, + "loss": 0.3932, + "step": 3080 + }, + { + "epoch": 0.3082, + "grad_norm": 0.6917981505393982, + "learning_rate": 1.7473344003662877e-05, + "loss": 0.3745, + "step": 3082 + }, + { + "epoch": 0.3084, + "grad_norm": 2.5041441917419434, + "learning_rate": 1.746870345992336e-05, + "loss": 0.1908, + "step": 3084 + }, + { + "epoch": 0.3086, + "grad_norm": 2.975079298019409, + "learning_rate": 1.7464059276028497e-05, + "loss": 0.4302, + "step": 3086 + }, + { + "epoch": 0.3088, + "grad_norm": 4.797851085662842, + "learning_rate": 1.7459411454241822e-05, + "loss": 0.405, + "step": 3088 + }, + { + "epoch": 0.309, + "grad_norm": 6.202813148498535, + "learning_rate": 1.7454759996828622e-05, + "loss": 0.3905, + "step": 3090 + }, + { + "epoch": 0.3092, + "grad_norm": 1.709937572479248, + "learning_rate": 1.7450104906055963e-05, + "loss": 0.1908, + "step": 3092 + }, + { + "epoch": 0.3094, + "grad_norm": 1.5948448181152344, + "learning_rate": 1.7445446184192674e-05, + "loss": 0.1985, + "step": 3094 + }, + { + "epoch": 0.3096, + "grad_norm": 1.171077013015747, + "learning_rate": 1.7440783833509366e-05, + "loss": 0.4235, + "step": 3096 + }, + { + "epoch": 0.3098, + "grad_norm": 2.7316155433654785, + "learning_rate": 1.743611785627841e-05, + "loss": 0.3016, + "step": 3098 + }, + { + "epoch": 0.31, + "grad_norm": 3.5063345432281494, + "learning_rate": 1.7431448254773943e-05, + "loss": 0.5326, + "step": 3100 + }, + { + "epoch": 0.3102, + "grad_norm": 3.122267723083496, + "learning_rate": 1.7426775031271876e-05, + "loss": 0.3102, + "step": 3102 + }, + { + "epoch": 0.3104, + "grad_norm": 1.8012405633926392, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.2646, + "step": 3104 + }, + { + "epoch": 0.3106, + "grad_norm": 2.9155118465423584, + "learning_rate": 1.7417417727387392e-05, + "loss": 0.2306, + "step": 3106 + }, + { + "epoch": 0.3108, + "grad_norm": 3.5996010303497314, + "learning_rate": 1.741273365156561e-05, + "loss": 0.2397, + "step": 3108 + }, + { + "epoch": 0.311, + "grad_norm": 1.927994728088379, + "learning_rate": 1.74080459628675e-05, + "loss": 0.3195, + "step": 3110 + }, + { + "epoch": 0.3112, + "grad_norm": 3.0896668434143066, + "learning_rate": 1.7403354663577782e-05, + "loss": 0.4756, + "step": 3112 + }, + { + "epoch": 0.3114, + "grad_norm": 1.3881851434707642, + "learning_rate": 1.7398659755982937e-05, + "loss": 0.1761, + "step": 3114 + }, + { + "epoch": 0.3116, + "grad_norm": 0.5820208787918091, + "learning_rate": 1.7393961242371203e-05, + "loss": 0.1075, + "step": 3116 + }, + { + "epoch": 0.3118, + "grad_norm": 2.9963629245758057, + "learning_rate": 1.738925912503259e-05, + "loss": 0.2637, + "step": 3118 + }, + { + "epoch": 0.312, + "grad_norm": 6.876772403717041, + "learning_rate": 1.7384553406258842e-05, + "loss": 0.555, + "step": 3120 + }, + { + "epoch": 0.3122, + "grad_norm": 0.7099496722221375, + "learning_rate": 1.737984408834347e-05, + "loss": 0.1047, + "step": 3122 + }, + { + "epoch": 0.3124, + "grad_norm": 0.9901650547981262, + "learning_rate": 1.737513117358174e-05, + "loss": 0.1047, + "step": 3124 + }, + { + "epoch": 0.3126, + "grad_norm": 0.6682717800140381, + "learning_rate": 1.7370414664270675e-05, + "loss": 0.1003, + "step": 3126 + }, + { + "epoch": 0.3128, + "grad_norm": 4.307413101196289, + "learning_rate": 1.7365694562709034e-05, + "loss": 0.7904, + "step": 3128 + }, + { + "epoch": 0.313, + "grad_norm": 2.7778141498565674, + "learning_rate": 1.7360970871197347e-05, + "loss": 0.196, + "step": 3130 + }, + { + "epoch": 0.3132, + "grad_norm": 3.014732599258423, + "learning_rate": 1.7356243592037876e-05, + "loss": 0.5836, + "step": 3132 + }, + { + "epoch": 0.3134, + "grad_norm": 2.9615843296051025, + "learning_rate": 1.7351512727534645e-05, + "loss": 0.1688, + "step": 3134 + }, + { + "epoch": 0.3136, + "grad_norm": 3.2080817222595215, + "learning_rate": 1.7346778279993417e-05, + "loss": 0.4759, + "step": 3136 + }, + { + "epoch": 0.3138, + "grad_norm": 3.235415458679199, + "learning_rate": 1.7342040251721702e-05, + "loss": 0.2024, + "step": 3138 + }, + { + "epoch": 0.314, + "grad_norm": 3.4966700077056885, + "learning_rate": 1.7337298645028764e-05, + "loss": 0.5113, + "step": 3140 + }, + { + "epoch": 0.3142, + "grad_norm": 5.868491172790527, + "learning_rate": 1.7332553462225604e-05, + "loss": 0.2197, + "step": 3142 + }, + { + "epoch": 0.3144, + "grad_norm": 1.2447623014450073, + "learning_rate": 1.732780470562496e-05, + "loss": 0.1517, + "step": 3144 + }, + { + "epoch": 0.3146, + "grad_norm": 6.204957485198975, + "learning_rate": 1.732305237754132e-05, + "loss": 0.2987, + "step": 3146 + }, + { + "epoch": 0.3148, + "grad_norm": 7.630340576171875, + "learning_rate": 1.7318296480290912e-05, + "loss": 0.7533, + "step": 3148 + }, + { + "epoch": 0.315, + "grad_norm": 4.492129325866699, + "learning_rate": 1.7313537016191706e-05, + "loss": 0.4725, + "step": 3150 + }, + { + "epoch": 0.3152, + "grad_norm": 2.534719705581665, + "learning_rate": 1.7308773987563406e-05, + "loss": 0.2635, + "step": 3152 + }, + { + "epoch": 0.3154, + "grad_norm": 1.4121227264404297, + "learning_rate": 1.730400739672745e-05, + "loss": 0.047, + "step": 3154 + }, + { + "epoch": 0.3156, + "grad_norm": 2.2694878578186035, + "learning_rate": 1.7299237246007018e-05, + "loss": 0.2333, + "step": 3156 + }, + { + "epoch": 0.3158, + "grad_norm": 1.1745939254760742, + "learning_rate": 1.7294463537727026e-05, + "loss": 0.2301, + "step": 3158 + }, + { + "epoch": 0.316, + "grad_norm": 2.122953414916992, + "learning_rate": 1.7289686274214116e-05, + "loss": 0.3808, + "step": 3160 + }, + { + "epoch": 0.3162, + "grad_norm": 2.3149425983428955, + "learning_rate": 1.7284905457796678e-05, + "loss": 0.1769, + "step": 3162 + }, + { + "epoch": 0.3164, + "grad_norm": 1.4895646572113037, + "learning_rate": 1.7280121090804813e-05, + "loss": 0.2938, + "step": 3164 + }, + { + "epoch": 0.3166, + "grad_norm": 2.608558416366577, + "learning_rate": 1.727533317557037e-05, + "loss": 0.1796, + "step": 3166 + }, + { + "epoch": 0.3168, + "grad_norm": 0.8740285038948059, + "learning_rate": 1.727054171442692e-05, + "loss": 0.2198, + "step": 3168 + }, + { + "epoch": 0.317, + "grad_norm": 1.3282686471939087, + "learning_rate": 1.7265746709709762e-05, + "loss": 0.238, + "step": 3170 + }, + { + "epoch": 0.3172, + "grad_norm": 0.8472639322280884, + "learning_rate": 1.7260948163755918e-05, + "loss": 0.181, + "step": 3172 + }, + { + "epoch": 0.3174, + "grad_norm": 1.1054245233535767, + "learning_rate": 1.7256146078904153e-05, + "loss": 0.1794, + "step": 3174 + }, + { + "epoch": 0.3176, + "grad_norm": 4.823938369750977, + "learning_rate": 1.7251340457494934e-05, + "loss": 0.2542, + "step": 3176 + }, + { + "epoch": 0.3178, + "grad_norm": 2.0330159664154053, + "learning_rate": 1.7246531301870467e-05, + "loss": 0.2275, + "step": 3178 + }, + { + "epoch": 0.318, + "grad_norm": 1.2845816612243652, + "learning_rate": 1.7241718614374678e-05, + "loss": 0.167, + "step": 3180 + }, + { + "epoch": 0.3182, + "grad_norm": 5.321610450744629, + "learning_rate": 1.7236902397353204e-05, + "loss": 0.3923, + "step": 3182 + }, + { + "epoch": 0.3184, + "grad_norm": 6.098850250244141, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.3428, + "step": 3184 + }, + { + "epoch": 0.3186, + "grad_norm": 3.4308173656463623, + "learning_rate": 1.7227259384124408e-05, + "loss": 0.2522, + "step": 3186 + }, + { + "epoch": 0.3188, + "grad_norm": 2.07170033454895, + "learning_rate": 1.722243259261697e-05, + "loss": 0.0544, + "step": 3188 + }, + { + "epoch": 0.319, + "grad_norm": 6.323230743408203, + "learning_rate": 1.7217602280983622e-05, + "loss": 0.2378, + "step": 3190 + }, + { + "epoch": 0.3192, + "grad_norm": 1.0970321893692017, + "learning_rate": 1.721276845157861e-05, + "loss": 0.1003, + "step": 3192 + }, + { + "epoch": 0.3194, + "grad_norm": 4.3397417068481445, + "learning_rate": 1.7207931106757867e-05, + "loss": 0.3048, + "step": 3194 + }, + { + "epoch": 0.3196, + "grad_norm": 4.523487567901611, + "learning_rate": 1.720309024887907e-05, + "loss": 0.3935, + "step": 3196 + }, + { + "epoch": 0.3198, + "grad_norm": 1.1057226657867432, + "learning_rate": 1.719824588030159e-05, + "loss": 0.2253, + "step": 3198 + }, + { + "epoch": 0.32, + "grad_norm": 0.8647271990776062, + "learning_rate": 1.7193398003386514e-05, + "loss": 0.0516, + "step": 3200 + }, + { + "epoch": 0.3202, + "grad_norm": 9.974818229675293, + "learning_rate": 1.7188546620496634e-05, + "loss": 0.4832, + "step": 3202 + }, + { + "epoch": 0.3204, + "grad_norm": 5.866600036621094, + "learning_rate": 1.7183691733996463e-05, + "loss": 0.5054, + "step": 3204 + }, + { + "epoch": 0.3206, + "grad_norm": 2.262077808380127, + "learning_rate": 1.7178833346252208e-05, + "loss": 0.3856, + "step": 3206 + }, + { + "epoch": 0.3208, + "grad_norm": 0.6471142768859863, + "learning_rate": 1.717397145963179e-05, + "loss": 0.0427, + "step": 3208 + }, + { + "epoch": 0.321, + "grad_norm": 7.609720230102539, + "learning_rate": 1.716910607650483e-05, + "loss": 0.3773, + "step": 3210 + }, + { + "epoch": 0.3212, + "grad_norm": 0.24256661534309387, + "learning_rate": 1.716423719924266e-05, + "loss": 0.406, + "step": 3212 + }, + { + "epoch": 0.3214, + "grad_norm": 4.75200080871582, + "learning_rate": 1.7159364830218312e-05, + "loss": 0.6638, + "step": 3214 + }, + { + "epoch": 0.3216, + "grad_norm": 0.49972012639045715, + "learning_rate": 1.715448897180652e-05, + "loss": 0.0683, + "step": 3216 + }, + { + "epoch": 0.3218, + "grad_norm": 2.9366774559020996, + "learning_rate": 1.7149609626383718e-05, + "loss": 0.2645, + "step": 3218 + }, + { + "epoch": 0.322, + "grad_norm": 5.320857048034668, + "learning_rate": 1.7144726796328034e-05, + "loss": 0.5113, + "step": 3220 + }, + { + "epoch": 0.3222, + "grad_norm": 0.9561947584152222, + "learning_rate": 1.713984048401931e-05, + "loss": 0.1351, + "step": 3222 + }, + { + "epoch": 0.3224, + "grad_norm": 1.1076703071594238, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.2083, + "step": 3224 + }, + { + "epoch": 0.3226, + "grad_norm": 2.9401724338531494, + "learning_rate": 1.713005742217053e-05, + "loss": 0.3333, + "step": 3226 + }, + { + "epoch": 0.3228, + "grad_norm": 2.359201192855835, + "learning_rate": 1.7125160677398625e-05, + "loss": 0.2423, + "step": 3228 + }, + { + "epoch": 0.323, + "grad_norm": 2.298027753829956, + "learning_rate": 1.712026045990997e-05, + "loss": 0.1988, + "step": 3230 + }, + { + "epoch": 0.3232, + "grad_norm": 2.3732001781463623, + "learning_rate": 1.7115356772092858e-05, + "loss": 0.2781, + "step": 3232 + }, + { + "epoch": 0.3234, + "grad_norm": 2.1123545169830322, + "learning_rate": 1.711044961633729e-05, + "loss": 0.2166, + "step": 3234 + }, + { + "epoch": 0.3236, + "grad_norm": 5.132606506347656, + "learning_rate": 1.710553899503496e-05, + "loss": 0.3188, + "step": 3236 + }, + { + "epoch": 0.3238, + "grad_norm": 2.4402053356170654, + "learning_rate": 1.710062491057925e-05, + "loss": 0.2149, + "step": 3238 + }, + { + "epoch": 0.324, + "grad_norm": 0.36511123180389404, + "learning_rate": 1.709570736536521e-05, + "loss": 0.1253, + "step": 3240 + }, + { + "epoch": 0.3242, + "grad_norm": 3.223181962966919, + "learning_rate": 1.7090786361789602e-05, + "loss": 0.1951, + "step": 3242 + }, + { + "epoch": 0.3244, + "grad_norm": 4.848666191101074, + "learning_rate": 1.7085861902250864e-05, + "loss": 0.2238, + "step": 3244 + }, + { + "epoch": 0.3246, + "grad_norm": 3.4667813777923584, + "learning_rate": 1.7080933989149112e-05, + "loss": 0.1601, + "step": 3246 + }, + { + "epoch": 0.3248, + "grad_norm": 0.26786476373672485, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.0714, + "step": 3248 + }, + { + "epoch": 0.325, + "grad_norm": 0.3877842128276825, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.0468, + "step": 3250 + }, + { + "epoch": 0.3252, + "grad_norm": 3.3650619983673096, + "learning_rate": 1.706612955249225e-05, + "loss": 0.2864, + "step": 3252 + }, + { + "epoch": 0.3254, + "grad_norm": 2.8891162872314453, + "learning_rate": 1.7061187849173318e-05, + "loss": 0.1336, + "step": 3254 + }, + { + "epoch": 0.3256, + "grad_norm": 6.979137897491455, + "learning_rate": 1.705624270431721e-05, + "loss": 0.2786, + "step": 3256 + }, + { + "epoch": 0.3258, + "grad_norm": 3.881531000137329, + "learning_rate": 1.7051294120334126e-05, + "loss": 0.0658, + "step": 3258 + }, + { + "epoch": 0.326, + "grad_norm": 5.460346698760986, + "learning_rate": 1.7046342099635948e-05, + "loss": 0.305, + "step": 3260 + }, + { + "epoch": 0.3262, + "grad_norm": 3.068619728088379, + "learning_rate": 1.704138664463623e-05, + "loss": 0.1378, + "step": 3262 + }, + { + "epoch": 0.3264, + "grad_norm": 5.6236348152160645, + "learning_rate": 1.7036427757750205e-05, + "loss": 0.3764, + "step": 3264 + }, + { + "epoch": 0.3266, + "grad_norm": 2.543750286102295, + "learning_rate": 1.7031465441394766e-05, + "loss": 0.4227, + "step": 3266 + }, + { + "epoch": 0.3268, + "grad_norm": 11.370311737060547, + "learning_rate": 1.7026499697988496e-05, + "loss": 0.2537, + "step": 3268 + }, + { + "epoch": 0.327, + "grad_norm": 13.120868682861328, + "learning_rate": 1.7021530529951627e-05, + "loss": 0.6162, + "step": 3270 + }, + { + "epoch": 0.3272, + "grad_norm": 1.6002190113067627, + "learning_rate": 1.7016557939706075e-05, + "loss": 0.0586, + "step": 3272 + }, + { + "epoch": 0.3274, + "grad_norm": 1.2236037254333496, + "learning_rate": 1.7011581929675424e-05, + "loss": 0.0544, + "step": 3274 + }, + { + "epoch": 0.3276, + "grad_norm": 0.3369058668613434, + "learning_rate": 1.700660250228492e-05, + "loss": 0.1353, + "step": 3276 + }, + { + "epoch": 0.3278, + "grad_norm": 0.7809661030769348, + "learning_rate": 1.7001619659961467e-05, + "loss": 0.0492, + "step": 3278 + }, + { + "epoch": 0.328, + "grad_norm": 0.18442396819591522, + "learning_rate": 1.6996633405133656e-05, + "loss": 0.0925, + "step": 3280 + }, + { + "epoch": 0.3282, + "grad_norm": 7.330276012420654, + "learning_rate": 1.6991643740231714e-05, + "loss": 0.2889, + "step": 3282 + }, + { + "epoch": 0.3284, + "grad_norm": 9.17273998260498, + "learning_rate": 1.6986650667687552e-05, + "loss": 0.8195, + "step": 3284 + }, + { + "epoch": 0.3286, + "grad_norm": 21.544525146484375, + "learning_rate": 1.698165418993473e-05, + "loss": 0.5598, + "step": 3286 + }, + { + "epoch": 0.3288, + "grad_norm": 0.36321714520454407, + "learning_rate": 1.6976654309408464e-05, + "loss": 0.0293, + "step": 3288 + }, + { + "epoch": 0.329, + "grad_norm": 3.4584152698516846, + "learning_rate": 1.697165102854565e-05, + "loss": 0.6392, + "step": 3290 + }, + { + "epoch": 0.3292, + "grad_norm": 16.990114212036133, + "learning_rate": 1.696664434978481e-05, + "loss": 0.8241, + "step": 3292 + }, + { + "epoch": 0.3294, + "grad_norm": 1.1038914918899536, + "learning_rate": 1.6961634275566147e-05, + "loss": 0.3694, + "step": 3294 + }, + { + "epoch": 0.3296, + "grad_norm": 22.60733985900879, + "learning_rate": 1.695662080833151e-05, + "loss": 0.7523, + "step": 3296 + }, + { + "epoch": 0.3298, + "grad_norm": 1.9794248342514038, + "learning_rate": 1.69516039505244e-05, + "loss": 0.132, + "step": 3298 + }, + { + "epoch": 0.33, + "grad_norm": 3.316636323928833, + "learning_rate": 1.6946583704589973e-05, + "loss": 0.1118, + "step": 3300 + }, + { + "epoch": 0.3302, + "grad_norm": 3.919255495071411, + "learning_rate": 1.694156007297504e-05, + "loss": 0.2171, + "step": 3302 + }, + { + "epoch": 0.3304, + "grad_norm": 6.12785530090332, + "learning_rate": 1.693653305812805e-05, + "loss": 0.2894, + "step": 3304 + }, + { + "epoch": 0.3306, + "grad_norm": 2.1033217906951904, + "learning_rate": 1.6931502662499116e-05, + "loss": 0.353, + "step": 3306 + }, + { + "epoch": 0.3308, + "grad_norm": 3.4226372241973877, + "learning_rate": 1.6926468888539988e-05, + "loss": 0.4925, + "step": 3308 + }, + { + "epoch": 0.331, + "grad_norm": 0.7176029086112976, + "learning_rate": 1.692143173870407e-05, + "loss": 0.0965, + "step": 3310 + }, + { + "epoch": 0.3312, + "grad_norm": 0.9549070000648499, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.2476, + "step": 3312 + }, + { + "epoch": 0.3314, + "grad_norm": 0.7443346977233887, + "learning_rate": 1.691134732122368e-05, + "loss": 0.0894, + "step": 3314 + }, + { + "epoch": 0.3316, + "grad_norm": 1.9046714305877686, + "learning_rate": 1.690630005849423e-05, + "loss": 0.2987, + "step": 3316 + }, + { + "epoch": 0.3318, + "grad_norm": 6.774964809417725, + "learning_rate": 1.6901249429718033e-05, + "loss": 0.3818, + "step": 3318 + }, + { + "epoch": 0.332, + "grad_norm": 0.614512026309967, + "learning_rate": 1.68961954373567e-05, + "loss": 0.0817, + "step": 3320 + }, + { + "epoch": 0.3322, + "grad_norm": 4.263340473175049, + "learning_rate": 1.6891138083873486e-05, + "loss": 0.2783, + "step": 3322 + }, + { + "epoch": 0.3324, + "grad_norm": 3.259214401245117, + "learning_rate": 1.6886077371733285e-05, + "loss": 0.4038, + "step": 3324 + }, + { + "epoch": 0.3326, + "grad_norm": 4.660262107849121, + "learning_rate": 1.688101330340263e-05, + "loss": 0.2532, + "step": 3326 + }, + { + "epoch": 0.3328, + "grad_norm": 1.9322415590286255, + "learning_rate": 1.6875945881349676e-05, + "loss": 0.2767, + "step": 3328 + }, + { + "epoch": 0.333, + "grad_norm": 0.463273823261261, + "learning_rate": 1.6870875108044233e-05, + "loss": 0.2107, + "step": 3330 + }, + { + "epoch": 0.3332, + "grad_norm": 4.211151123046875, + "learning_rate": 1.686580098595773e-05, + "loss": 0.1572, + "step": 3332 + }, + { + "epoch": 0.3334, + "grad_norm": 2.751866340637207, + "learning_rate": 1.6860723517563232e-05, + "loss": 0.1841, + "step": 3334 + }, + { + "epoch": 0.3336, + "grad_norm": 3.77817702293396, + "learning_rate": 1.6855642705335438e-05, + "loss": 0.3232, + "step": 3336 + }, + { + "epoch": 0.3338, + "grad_norm": 2.7535836696624756, + "learning_rate": 1.685055855175067e-05, + "loss": 0.3995, + "step": 3338 + }, + { + "epoch": 0.334, + "grad_norm": 0.7860167026519775, + "learning_rate": 1.684547105928689e-05, + "loss": 0.2158, + "step": 3340 + }, + { + "epoch": 0.3342, + "grad_norm": 2.2274563312530518, + "learning_rate": 1.684038023042367e-05, + "loss": 0.2932, + "step": 3342 + }, + { + "epoch": 0.3344, + "grad_norm": 2.3566975593566895, + "learning_rate": 1.6835286067642228e-05, + "loss": 0.6856, + "step": 3344 + }, + { + "epoch": 0.3346, + "grad_norm": 3.9676597118377686, + "learning_rate": 1.683018857342539e-05, + "loss": 0.342, + "step": 3346 + }, + { + "epoch": 0.3348, + "grad_norm": 0.7061996459960938, + "learning_rate": 1.6825087750257617e-05, + "loss": 0.059, + "step": 3348 + }, + { + "epoch": 0.335, + "grad_norm": 2.5069754123687744, + "learning_rate": 1.6819983600624986e-05, + "loss": 0.3191, + "step": 3350 + }, + { + "epoch": 0.3352, + "grad_norm": 4.84499454498291, + "learning_rate": 1.68148761270152e-05, + "loss": 0.3663, + "step": 3352 + }, + { + "epoch": 0.3354, + "grad_norm": 1.5344300270080566, + "learning_rate": 1.6809765331917576e-05, + "loss": 0.2779, + "step": 3354 + }, + { + "epoch": 0.3356, + "grad_norm": 1.8808579444885254, + "learning_rate": 1.6804651217823055e-05, + "loss": 0.1952, + "step": 3356 + }, + { + "epoch": 0.3358, + "grad_norm": 1.2605701684951782, + "learning_rate": 1.6799533787224192e-05, + "loss": 0.136, + "step": 3358 + }, + { + "epoch": 0.336, + "grad_norm": 4.161028861999512, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.3158, + "step": 3360 + }, + { + "epoch": 0.3362, + "grad_norm": 1.3958619832992554, + "learning_rate": 1.6789288986491764e-05, + "loss": 0.1019, + "step": 3362 + }, + { + "epoch": 0.3364, + "grad_norm": 1.6565730571746826, + "learning_rate": 1.6784161621351384e-05, + "loss": 0.3282, + "step": 3364 + }, + { + "epoch": 0.3366, + "grad_norm": 6.114418983459473, + "learning_rate": 1.6779030949693044e-05, + "loss": 0.4221, + "step": 3366 + }, + { + "epoch": 0.3368, + "grad_norm": 0.26107677817344666, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.3181, + "step": 3368 + }, + { + "epoch": 0.337, + "grad_norm": 0.5302995443344116, + "learning_rate": 1.6768759696826608e-05, + "loss": 0.0771, + "step": 3370 + }, + { + "epoch": 0.3372, + "grad_norm": 1.2331658601760864, + "learning_rate": 1.6763619120624595e-05, + "loss": 0.2486, + "step": 3372 + }, + { + "epoch": 0.3374, + "grad_norm": 7.8458638191223145, + "learning_rate": 1.6758475247916786e-05, + "loss": 0.2402, + "step": 3374 + }, + { + "epoch": 0.3376, + "grad_norm": 1.7576193809509277, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.2278, + "step": 3376 + }, + { + "epoch": 0.3378, + "grad_norm": 2.137834072113037, + "learning_rate": 1.6748177623013638e-05, + "loss": 0.1299, + "step": 3378 + }, + { + "epoch": 0.338, + "grad_norm": 0.8487266898155212, + "learning_rate": 1.6743023875837233e-05, + "loss": 0.1535, + "step": 3380 + }, + { + "epoch": 0.3382, + "grad_norm": 5.636786937713623, + "learning_rate": 1.6737866842192908e-05, + "loss": 0.2395, + "step": 3382 + }, + { + "epoch": 0.3384, + "grad_norm": 2.355649709701538, + "learning_rate": 1.6732706524594138e-05, + "loss": 0.2331, + "step": 3384 + }, + { + "epoch": 0.3386, + "grad_norm": 14.39785099029541, + "learning_rate": 1.6727542925556e-05, + "loss": 0.5892, + "step": 3386 + }, + { + "epoch": 0.3388, + "grad_norm": 1.0866690874099731, + "learning_rate": 1.6722376047595163e-05, + "loss": 0.0522, + "step": 3388 + }, + { + "epoch": 0.339, + "grad_norm": 0.37535929679870605, + "learning_rate": 1.6717205893229904e-05, + "loss": 0.0635, + "step": 3390 + }, + { + "epoch": 0.3392, + "grad_norm": 1.3040165901184082, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.2208, + "step": 3392 + }, + { + "epoch": 0.3394, + "grad_norm": 13.495378494262695, + "learning_rate": 1.6706855765367202e-05, + "loss": 0.3082, + "step": 3394 + }, + { + "epoch": 0.3396, + "grad_norm": 0.13619691133499146, + "learning_rate": 1.6701675796914284e-05, + "loss": 0.6807, + "step": 3396 + }, + { + "epoch": 0.3398, + "grad_norm": 9.520256042480469, + "learning_rate": 1.6696492562145996e-05, + "loss": 0.5294, + "step": 3398 + }, + { + "epoch": 0.34, + "grad_norm": 0.12433121353387833, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.1136, + "step": 3400 + }, + { + "epoch": 0.3402, + "grad_norm": 0.5479094386100769, + "learning_rate": 1.6686116303769884e-05, + "loss": 0.2608, + "step": 3402 + }, + { + "epoch": 0.3404, + "grad_norm": 2.6236729621887207, + "learning_rate": 1.668092328521932e-05, + "loss": 0.1077, + "step": 3404 + }, + { + "epoch": 0.3406, + "grad_norm": 1.3510371446609497, + "learning_rate": 1.667572701046791e-05, + "loss": 0.0614, + "step": 3406 + }, + { + "epoch": 0.3408, + "grad_norm": 5.808706283569336, + "learning_rate": 1.6670527482048246e-05, + "loss": 0.2, + "step": 3408 + }, + { + "epoch": 0.341, + "grad_norm": 4.246831893920898, + "learning_rate": 1.6665324702494524e-05, + "loss": 0.2651, + "step": 3410 + }, + { + "epoch": 0.3412, + "grad_norm": 5.977890491485596, + "learning_rate": 1.666011867434252e-05, + "loss": 0.6084, + "step": 3412 + }, + { + "epoch": 0.3414, + "grad_norm": 3.3720288276672363, + "learning_rate": 1.6654909400129575e-05, + "loss": 0.35, + "step": 3414 + }, + { + "epoch": 0.3416, + "grad_norm": 1.3278477191925049, + "learning_rate": 1.6649696882394635e-05, + "loss": 0.1254, + "step": 3416 + }, + { + "epoch": 0.3418, + "grad_norm": 2.6452057361602783, + "learning_rate": 1.664448112367822e-05, + "loss": 0.4388, + "step": 3418 + }, + { + "epoch": 0.342, + "grad_norm": 2.9542412757873535, + "learning_rate": 1.6639262126522417e-05, + "loss": 0.265, + "step": 3420 + }, + { + "epoch": 0.3422, + "grad_norm": 2.2736623287200928, + "learning_rate": 1.6634039893470912e-05, + "loss": 0.4964, + "step": 3422 + }, + { + "epoch": 0.3424, + "grad_norm": 3.569699764251709, + "learning_rate": 1.6628814427068954e-05, + "loss": 0.417, + "step": 3424 + }, + { + "epoch": 0.3426, + "grad_norm": 2.3799965381622314, + "learning_rate": 1.662358572986337e-05, + "loss": 0.2568, + "step": 3426 + }, + { + "epoch": 0.3428, + "grad_norm": 1.7090212106704712, + "learning_rate": 1.6618353804402567e-05, + "loss": 0.0942, + "step": 3428 + }, + { + "epoch": 0.343, + "grad_norm": 3.780691623687744, + "learning_rate": 1.661311865323652e-05, + "loss": 0.3526, + "step": 3430 + }, + { + "epoch": 0.3432, + "grad_norm": 0.5062385201454163, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.263, + "step": 3432 + }, + { + "epoch": 0.3434, + "grad_norm": 1.1650316715240479, + "learning_rate": 1.6602638683996462e-05, + "loss": 0.3027, + "step": 3434 + }, + { + "epoch": 0.3436, + "grad_norm": 0.6821444630622864, + "learning_rate": 1.6597393871030264e-05, + "loss": 0.0894, + "step": 3436 + }, + { + "epoch": 0.3438, + "grad_norm": 1.539177417755127, + "learning_rate": 1.6592145842574433e-05, + "loss": 0.3136, + "step": 3438 + }, + { + "epoch": 0.344, + "grad_norm": 2.4481747150421143, + "learning_rate": 1.6586894601186804e-05, + "loss": 0.6027, + "step": 3440 + }, + { + "epoch": 0.3442, + "grad_norm": 2.0303781032562256, + "learning_rate": 1.6581640149426766e-05, + "loss": 0.4846, + "step": 3442 + }, + { + "epoch": 0.3444, + "grad_norm": 3.0587966442108154, + "learning_rate": 1.6576382489855274e-05, + "loss": 0.4885, + "step": 3444 + }, + { + "epoch": 0.3446, + "grad_norm": 3.5459413528442383, + "learning_rate": 1.6571121625034847e-05, + "loss": 0.1965, + "step": 3446 + }, + { + "epoch": 0.3448, + "grad_norm": 4.49768590927124, + "learning_rate": 1.6565857557529567e-05, + "loss": 0.3805, + "step": 3448 + }, + { + "epoch": 0.345, + "grad_norm": 2.3127448558807373, + "learning_rate": 1.6560590289905074e-05, + "loss": 0.2428, + "step": 3450 + }, + { + "epoch": 0.3452, + "grad_norm": 2.375755786895752, + "learning_rate": 1.6555319824728577e-05, + "loss": 0.3813, + "step": 3452 + }, + { + "epoch": 0.3454, + "grad_norm": 2.860966205596924, + "learning_rate": 1.6550046164568827e-05, + "loss": 0.22, + "step": 3454 + }, + { + "epoch": 0.3456, + "grad_norm": 0.785335123538971, + "learning_rate": 1.654476931199615e-05, + "loss": 0.3244, + "step": 3456 + }, + { + "epoch": 0.3458, + "grad_norm": 5.482293128967285, + "learning_rate": 1.6539489269582414e-05, + "loss": 0.355, + "step": 3458 + }, + { + "epoch": 0.346, + "grad_norm": 1.8880510330200195, + "learning_rate": 1.6534206039901057e-05, + "loss": 0.2107, + "step": 3460 + }, + { + "epoch": 0.3462, + "grad_norm": 3.3759422302246094, + "learning_rate": 1.652891962552705e-05, + "loss": 0.1886, + "step": 3462 + }, + { + "epoch": 0.3464, + "grad_norm": 1.6883141994476318, + "learning_rate": 1.652363002903693e-05, + "loss": 0.1366, + "step": 3464 + }, + { + "epoch": 0.3466, + "grad_norm": 3.052422523498535, + "learning_rate": 1.651833725300879e-05, + "loss": 0.2075, + "step": 3466 + }, + { + "epoch": 0.3468, + "grad_norm": 5.999207019805908, + "learning_rate": 1.6513041300022253e-05, + "loss": 0.4262, + "step": 3468 + }, + { + "epoch": 0.347, + "grad_norm": 2.577103614807129, + "learning_rate": 1.650774217265851e-05, + "loss": 0.2379, + "step": 3470 + }, + { + "epoch": 0.3472, + "grad_norm": 1.9377018213272095, + "learning_rate": 1.650243987350029e-05, + "loss": 0.2078, + "step": 3472 + }, + { + "epoch": 0.3474, + "grad_norm": 3.668597936630249, + "learning_rate": 1.649713440513187e-05, + "loss": 0.3841, + "step": 3474 + }, + { + "epoch": 0.3476, + "grad_norm": 3.8214313983917236, + "learning_rate": 1.649182577013906e-05, + "loss": 0.3653, + "step": 3476 + }, + { + "epoch": 0.3478, + "grad_norm": 0.12296484410762787, + "learning_rate": 1.6486513971109245e-05, + "loss": 0.1347, + "step": 3478 + }, + { + "epoch": 0.348, + "grad_norm": 0.17810428142547607, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.3119, + "step": 3480 + }, + { + "epoch": 0.3482, + "grad_norm": 3.598045587539673, + "learning_rate": 1.6475880891295716e-05, + "loss": 0.435, + "step": 3482 + }, + { + "epoch": 0.3484, + "grad_norm": 1.9006749391555786, + "learning_rate": 1.6470559615694445e-05, + "loss": 0.2713, + "step": 3484 + }, + { + "epoch": 0.3486, + "grad_norm": 1.1732110977172852, + "learning_rate": 1.6465235186421024e-05, + "loss": 0.2761, + "step": 3486 + }, + { + "epoch": 0.3488, + "grad_norm": 1.243195652961731, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.2378, + "step": 3488 + }, + { + "epoch": 0.349, + "grad_norm": 5.203609466552734, + "learning_rate": 1.645457687723951e-05, + "loss": 0.2329, + "step": 3490 + }, + { + "epoch": 0.3492, + "grad_norm": 0.6536909341812134, + "learning_rate": 1.6449243002526146e-05, + "loss": 0.146, + "step": 3492 + }, + { + "epoch": 0.3494, + "grad_norm": 1.0517759323120117, + "learning_rate": 1.6443905984530092e-05, + "loss": 0.248, + "step": 3494 + }, + { + "epoch": 0.3496, + "grad_norm": 4.450293064117432, + "learning_rate": 1.643856582585254e-05, + "loss": 0.1211, + "step": 3496 + }, + { + "epoch": 0.3498, + "grad_norm": 6.06393575668335, + "learning_rate": 1.643322252909622e-05, + "loss": 0.1812, + "step": 3498 + }, + { + "epoch": 0.35, + "grad_norm": 14.939154624938965, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.6337, + "step": 3500 + }, + { + "epoch": 0.3502, + "grad_norm": 9.705696105957031, + "learning_rate": 1.6422526531765846e-05, + "loss": 0.2512, + "step": 3502 + }, + { + "epoch": 0.3504, + "grad_norm": 0.4772564172744751, + "learning_rate": 1.6417173836404888e-05, + "loss": 0.0452, + "step": 3504 + }, + { + "epoch": 0.3506, + "grad_norm": 1.406052827835083, + "learning_rate": 1.6411818013391357e-05, + "loss": 0.0858, + "step": 3506 + }, + { + "epoch": 0.3508, + "grad_norm": 4.45255708694458, + "learning_rate": 1.6406459065335616e-05, + "loss": 0.4186, + "step": 3508 + }, + { + "epoch": 0.351, + "grad_norm": 0.6481139063835144, + "learning_rate": 1.6401096994849558e-05, + "loss": 0.0258, + "step": 3510 + }, + { + "epoch": 0.3512, + "grad_norm": 2.550693988800049, + "learning_rate": 1.6395731804546582e-05, + "loss": 0.2511, + "step": 3512 + }, + { + "epoch": 0.3514, + "grad_norm": 0.5232470631599426, + "learning_rate": 1.639036349704162e-05, + "loss": 0.3921, + "step": 3514 + }, + { + "epoch": 0.3516, + "grad_norm": 5.614489555358887, + "learning_rate": 1.6384992074951124e-05, + "loss": 0.3194, + "step": 3516 + }, + { + "epoch": 0.3518, + "grad_norm": 4.220565319061279, + "learning_rate": 1.6379617540893056e-05, + "loss": 0.6583, + "step": 3518 + }, + { + "epoch": 0.352, + "grad_norm": 1.9188789129257202, + "learning_rate": 1.63742398974869e-05, + "loss": 0.164, + "step": 3520 + }, + { + "epoch": 0.3522, + "grad_norm": 3.3095650672912598, + "learning_rate": 1.636885914735365e-05, + "loss": 0.3449, + "step": 3522 + }, + { + "epoch": 0.3524, + "grad_norm": 2.2452034950256348, + "learning_rate": 1.6363475293115824e-05, + "loss": 0.4188, + "step": 3524 + }, + { + "epoch": 0.3526, + "grad_norm": 4.4046502113342285, + "learning_rate": 1.6358088337397444e-05, + "loss": 0.3422, + "step": 3526 + }, + { + "epoch": 0.3528, + "grad_norm": 0.4539504945278168, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.1482, + "step": 3528 + }, + { + "epoch": 0.353, + "grad_norm": 4.516602993011475, + "learning_rate": 1.6347305132022677e-05, + "loss": 0.7846, + "step": 3530 + }, + { + "epoch": 0.3532, + "grad_norm": 0.9014936685562134, + "learning_rate": 1.6341908887621894e-05, + "loss": 0.1575, + "step": 3532 + }, + { + "epoch": 0.3534, + "grad_norm": 0.9652045965194702, + "learning_rate": 1.6336509552251766e-05, + "loss": 0.0967, + "step": 3534 + }, + { + "epoch": 0.3536, + "grad_norm": 7.963252544403076, + "learning_rate": 1.6331107128543856e-05, + "loss": 0.4663, + "step": 3536 + }, + { + "epoch": 0.3538, + "grad_norm": 0.24469590187072754, + "learning_rate": 1.6325701619131246e-05, + "loss": 0.0155, + "step": 3538 + }, + { + "epoch": 0.354, + "grad_norm": 1.3035551309585571, + "learning_rate": 1.632029302664851e-05, + "loss": 0.1583, + "step": 3540 + }, + { + "epoch": 0.3542, + "grad_norm": 7.612825870513916, + "learning_rate": 1.6314881353731733e-05, + "loss": 0.3325, + "step": 3542 + }, + { + "epoch": 0.3544, + "grad_norm": 4.63937520980835, + "learning_rate": 1.6309466603018497e-05, + "loss": 0.3373, + "step": 3544 + }, + { + "epoch": 0.3546, + "grad_norm": 0.5010870099067688, + "learning_rate": 1.630404877714789e-05, + "loss": 0.0411, + "step": 3546 + }, + { + "epoch": 0.3548, + "grad_norm": 2.981733798980713, + "learning_rate": 1.6298627878760488e-05, + "loss": 0.4967, + "step": 3548 + }, + { + "epoch": 0.355, + "grad_norm": 2.681283473968506, + "learning_rate": 1.6293203910498375e-05, + "loss": 0.2423, + "step": 3550 + }, + { + "epoch": 0.3552, + "grad_norm": 0.5167466402053833, + "learning_rate": 1.628777687500513e-05, + "loss": 0.1421, + "step": 3552 + }, + { + "epoch": 0.3554, + "grad_norm": 2.602158784866333, + "learning_rate": 1.6282346774925816e-05, + "loss": 0.4159, + "step": 3554 + }, + { + "epoch": 0.3556, + "grad_norm": 2.3419411182403564, + "learning_rate": 1.6276913612907005e-05, + "loss": 0.1963, + "step": 3556 + }, + { + "epoch": 0.3558, + "grad_norm": 2.164745569229126, + "learning_rate": 1.6271477391596754e-05, + "loss": 0.5493, + "step": 3558 + }, + { + "epoch": 0.356, + "grad_norm": 4.72236967086792, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.4045, + "step": 3560 + }, + { + "epoch": 0.3562, + "grad_norm": 0.7726926803588867, + "learning_rate": 1.6260595781701605e-05, + "loss": 0.053, + "step": 3562 + }, + { + "epoch": 0.3564, + "grad_norm": 2.298553705215454, + "learning_rate": 1.6255150398420273e-05, + "loss": 0.1679, + "step": 3564 + }, + { + "epoch": 0.3566, + "grad_norm": 2.657900094985962, + "learning_rate": 1.6249701966454626e-05, + "loss": 0.4898, + "step": 3566 + }, + { + "epoch": 0.3568, + "grad_norm": 1.9502123594284058, + "learning_rate": 1.624425048846016e-05, + "loss": 0.0867, + "step": 3568 + }, + { + "epoch": 0.357, + "grad_norm": 6.40615177154541, + "learning_rate": 1.6238795967093865e-05, + "loss": 0.2648, + "step": 3570 + }, + { + "epoch": 0.3572, + "grad_norm": 3.7713780403137207, + "learning_rate": 1.6233338405014204e-05, + "loss": 0.1439, + "step": 3572 + }, + { + "epoch": 0.3574, + "grad_norm": 17.840335845947266, + "learning_rate": 1.6227877804881126e-05, + "loss": 0.5372, + "step": 3574 + }, + { + "epoch": 0.3576, + "grad_norm": 5.364625453948975, + "learning_rate": 1.6222414169356066e-05, + "loss": 0.3174, + "step": 3576 + }, + { + "epoch": 0.3578, + "grad_norm": 0.6341979503631592, + "learning_rate": 1.621694750110193e-05, + "loss": 0.0279, + "step": 3578 + }, + { + "epoch": 0.358, + "grad_norm": 13.716168403625488, + "learning_rate": 1.6211477802783105e-05, + "loss": 0.1555, + "step": 3580 + }, + { + "epoch": 0.3582, + "grad_norm": 1.6421754360198975, + "learning_rate": 1.6206005077065457e-05, + "loss": 0.3526, + "step": 3582 + }, + { + "epoch": 0.3584, + "grad_norm": 1.4549051523208618, + "learning_rate": 1.620052932661633e-05, + "loss": 0.2161, + "step": 3584 + }, + { + "epoch": 0.3586, + "grad_norm": 0.8946252465248108, + "learning_rate": 1.619505055410453e-05, + "loss": 0.0711, + "step": 3586 + }, + { + "epoch": 0.3588, + "grad_norm": 1.496496558189392, + "learning_rate": 1.618956876220035e-05, + "loss": 0.2532, + "step": 3588 + }, + { + "epoch": 0.359, + "grad_norm": 4.217204570770264, + "learning_rate": 1.6184083953575543e-05, + "loss": 0.5193, + "step": 3590 + }, + { + "epoch": 0.3592, + "grad_norm": 3.3577992916107178, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.332, + "step": 3592 + }, + { + "epoch": 0.3594, + "grad_norm": 4.581401824951172, + "learning_rate": 1.617310529685845e-05, + "loss": 0.3098, + "step": 3594 + }, + { + "epoch": 0.3596, + "grad_norm": 1.468532681465149, + "learning_rate": 1.6167611454117027e-05, + "loss": 0.2716, + "step": 3596 + }, + { + "epoch": 0.3598, + "grad_norm": 2.526750087738037, + "learning_rate": 1.6162114605356704e-05, + "loss": 0.1019, + "step": 3598 + }, + { + "epoch": 0.36, + "grad_norm": 4.494863510131836, + "learning_rate": 1.6156614753256583e-05, + "loss": 0.4508, + "step": 3600 + }, + { + "epoch": 0.3602, + "grad_norm": 7.188631057739258, + "learning_rate": 1.6151111900497225e-05, + "loss": 0.2786, + "step": 3602 + }, + { + "epoch": 0.3604, + "grad_norm": 0.12311619520187378, + "learning_rate": 1.6145606049760644e-05, + "loss": 0.1231, + "step": 3604 + }, + { + "epoch": 0.3606, + "grad_norm": 2.878835916519165, + "learning_rate": 1.614009720373034e-05, + "loss": 0.2159, + "step": 3606 + }, + { + "epoch": 0.3608, + "grad_norm": 4.008289337158203, + "learning_rate": 1.6134585365091243e-05, + "loss": 0.3265, + "step": 3608 + }, + { + "epoch": 0.361, + "grad_norm": 9.09296989440918, + "learning_rate": 1.6129070536529767e-05, + "loss": 0.5868, + "step": 3610 + }, + { + "epoch": 0.3612, + "grad_norm": 0.6591078639030457, + "learning_rate": 1.6123552720733767e-05, + "loss": 0.1135, + "step": 3612 + }, + { + "epoch": 0.3614, + "grad_norm": 7.2387776374816895, + "learning_rate": 1.611803192039256e-05, + "loss": 0.5552, + "step": 3614 + }, + { + "epoch": 0.3616, + "grad_norm": 1.0447754859924316, + "learning_rate": 1.611250813819692e-05, + "loss": 0.0905, + "step": 3616 + }, + { + "epoch": 0.3618, + "grad_norm": 0.32504233717918396, + "learning_rate": 1.6106981376839064e-05, + "loss": 0.2769, + "step": 3618 + }, + { + "epoch": 0.362, + "grad_norm": 0.13239331543445587, + "learning_rate": 1.610145163901268e-05, + "loss": 0.1779, + "step": 3620 + }, + { + "epoch": 0.3622, + "grad_norm": 4.09377384185791, + "learning_rate": 1.6095918927412883e-05, + "loss": 0.4895, + "step": 3622 + }, + { + "epoch": 0.3624, + "grad_norm": 0.5278245210647583, + "learning_rate": 1.6090383244736256e-05, + "loss": 0.0574, + "step": 3624 + }, + { + "epoch": 0.3626, + "grad_norm": 1.5962798595428467, + "learning_rate": 1.608484459368082e-05, + "loss": 0.2272, + "step": 3626 + }, + { + "epoch": 0.3628, + "grad_norm": 1.548728108406067, + "learning_rate": 1.6079302976946055e-05, + "loss": 0.1516, + "step": 3628 + }, + { + "epoch": 0.363, + "grad_norm": 0.58984375, + "learning_rate": 1.607375839723287e-05, + "loss": 0.0471, + "step": 3630 + }, + { + "epoch": 0.3632, + "grad_norm": 2.533109188079834, + "learning_rate": 1.6068210857243625e-05, + "loss": 0.2645, + "step": 3632 + }, + { + "epoch": 0.3634, + "grad_norm": 1.0608601570129395, + "learning_rate": 1.6062660359682124e-05, + "loss": 0.0836, + "step": 3634 + }, + { + "epoch": 0.3636, + "grad_norm": 0.8962275385856628, + "learning_rate": 1.6057106907253617e-05, + "loss": 0.2712, + "step": 3636 + }, + { + "epoch": 0.3638, + "grad_norm": 0.9705060124397278, + "learning_rate": 1.605155050266478e-05, + "loss": 0.0432, + "step": 3638 + }, + { + "epoch": 0.364, + "grad_norm": 0.469296932220459, + "learning_rate": 1.6045991148623752e-05, + "loss": 0.0277, + "step": 3640 + }, + { + "epoch": 0.3642, + "grad_norm": 7.464212894439697, + "learning_rate": 1.6040428847840078e-05, + "loss": 0.299, + "step": 3642 + }, + { + "epoch": 0.3644, + "grad_norm": 8.813636779785156, + "learning_rate": 1.6034863603024768e-05, + "loss": 0.1953, + "step": 3644 + }, + { + "epoch": 0.3646, + "grad_norm": 0.05101361498236656, + "learning_rate": 1.602929541689025e-05, + "loss": 0.4533, + "step": 3646 + }, + { + "epoch": 0.3648, + "grad_norm": 2.4205265045166016, + "learning_rate": 1.6023724292150387e-05, + "loss": 0.1117, + "step": 3648 + }, + { + "epoch": 0.365, + "grad_norm": 0.7351101636886597, + "learning_rate": 1.6018150231520486e-05, + "loss": 0.7586, + "step": 3650 + }, + { + "epoch": 0.3652, + "grad_norm": 5.8306660652160645, + "learning_rate": 1.601257323771727e-05, + "loss": 0.3829, + "step": 3652 + }, + { + "epoch": 0.3654, + "grad_norm": 4.073424816131592, + "learning_rate": 1.6006993313458896e-05, + "loss": 0.2342, + "step": 3654 + }, + { + "epoch": 0.3656, + "grad_norm": 1.179059386253357, + "learning_rate": 1.6001410461464955e-05, + "loss": 0.2532, + "step": 3656 + }, + { + "epoch": 0.3658, + "grad_norm": 2.658978223800659, + "learning_rate": 1.5995824684456465e-05, + "loss": 0.1382, + "step": 3658 + }, + { + "epoch": 0.366, + "grad_norm": 5.247921943664551, + "learning_rate": 1.599023598515586e-05, + "loss": 0.4882, + "step": 3660 + }, + { + "epoch": 0.3662, + "grad_norm": 2.0549850463867188, + "learning_rate": 1.5984644366287007e-05, + "loss": 0.0794, + "step": 3662 + }, + { + "epoch": 0.3664, + "grad_norm": 1.7399955987930298, + "learning_rate": 1.597904983057519e-05, + "loss": 0.1591, + "step": 3664 + }, + { + "epoch": 0.3666, + "grad_norm": 0.16353943943977356, + "learning_rate": 1.5973452380747125e-05, + "loss": 0.3096, + "step": 3666 + }, + { + "epoch": 0.3668, + "grad_norm": 2.4816179275512695, + "learning_rate": 1.596785201953093e-05, + "loss": 0.339, + "step": 3668 + }, + { + "epoch": 0.367, + "grad_norm": 1.7445335388183594, + "learning_rate": 1.5962248749656158e-05, + "loss": 0.4233, + "step": 3670 + }, + { + "epoch": 0.3672, + "grad_norm": 0.8461546301841736, + "learning_rate": 1.5956642573853784e-05, + "loss": 0.1953, + "step": 3672 + }, + { + "epoch": 0.3674, + "grad_norm": 3.33308744430542, + "learning_rate": 1.5951033494856174e-05, + "loss": 0.2383, + "step": 3674 + }, + { + "epoch": 0.3676, + "grad_norm": 0.781283438205719, + "learning_rate": 1.5945421515397135e-05, + "loss": 0.1867, + "step": 3676 + }, + { + "epoch": 0.3678, + "grad_norm": 0.8204110860824585, + "learning_rate": 1.593980663821187e-05, + "loss": 0.0526, + "step": 3678 + }, + { + "epoch": 0.368, + "grad_norm": 3.5501744747161865, + "learning_rate": 1.5934188866037017e-05, + "loss": 0.2674, + "step": 3680 + }, + { + "epoch": 0.3682, + "grad_norm": 0.8411276936531067, + "learning_rate": 1.5928568201610593e-05, + "loss": 0.0695, + "step": 3682 + }, + { + "epoch": 0.3684, + "grad_norm": 0.03108889050781727, + "learning_rate": 1.592294464767205e-05, + "loss": 0.0207, + "step": 3684 + }, + { + "epoch": 0.3686, + "grad_norm": 0.9112809896469116, + "learning_rate": 1.591731820696224e-05, + "loss": 0.1682, + "step": 3686 + }, + { + "epoch": 0.3688, + "grad_norm": 7.845537185668945, + "learning_rate": 1.591168888222342e-05, + "loss": 0.7915, + "step": 3688 + }, + { + "epoch": 0.369, + "grad_norm": 1.5495846271514893, + "learning_rate": 1.5906056676199256e-05, + "loss": 0.139, + "step": 3690 + }, + { + "epoch": 0.3692, + "grad_norm": 2.4437646865844727, + "learning_rate": 1.5900421591634813e-05, + "loss": 0.2255, + "step": 3692 + }, + { + "epoch": 0.3694, + "grad_norm": 0.13954026997089386, + "learning_rate": 1.589478363127657e-05, + "loss": 0.0154, + "step": 3694 + }, + { + "epoch": 0.3696, + "grad_norm": 0.07694923132658005, + "learning_rate": 1.5889142797872387e-05, + "loss": 0.003, + "step": 3696 + }, + { + "epoch": 0.3698, + "grad_norm": 0.413431853055954, + "learning_rate": 1.5883499094171556e-05, + "loss": 0.3009, + "step": 3698 + }, + { + "epoch": 0.37, + "grad_norm": 1.5404397249221802, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.0912, + "step": 3700 + }, + { + "epoch": 0.3702, + "grad_norm": 1.875856876373291, + "learning_rate": 1.5872203086883996e-05, + "loss": 0.1244, + "step": 3702 + }, + { + "epoch": 0.3704, + "grad_norm": 1.2042292356491089, + "learning_rate": 1.5866550788802815e-05, + "loss": 0.3364, + "step": 3704 + }, + { + "epoch": 0.3706, + "grad_norm": 0.47809141874313354, + "learning_rate": 1.5860895631436044e-05, + "loss": 0.0837, + "step": 3706 + }, + { + "epoch": 0.3708, + "grad_norm": 1.3752598762512207, + "learning_rate": 1.5855237617539943e-05, + "loss": 0.082, + "step": 3708 + }, + { + "epoch": 0.371, + "grad_norm": 8.835253715515137, + "learning_rate": 1.584957674987216e-05, + "loss": 0.3299, + "step": 3710 + }, + { + "epoch": 0.3712, + "grad_norm": 5.762791156768799, + "learning_rate": 1.5843913031191722e-05, + "loss": 1.0693, + "step": 3712 + }, + { + "epoch": 0.3714, + "grad_norm": 4.154807090759277, + "learning_rate": 1.583824646425907e-05, + "loss": 0.3423, + "step": 3714 + }, + { + "epoch": 0.3716, + "grad_norm": 2.5459964275360107, + "learning_rate": 1.5832577051836016e-05, + "loss": 0.545, + "step": 3716 + }, + { + "epoch": 0.3718, + "grad_norm": 1.3168872594833374, + "learning_rate": 1.5826904796685763e-05, + "loss": 0.3078, + "step": 3718 + }, + { + "epoch": 0.372, + "grad_norm": 2.768369674682617, + "learning_rate": 1.5821229701572897e-05, + "loss": 0.384, + "step": 3720 + }, + { + "epoch": 0.3722, + "grad_norm": 1.582114577293396, + "learning_rate": 1.5815551769263387e-05, + "loss": 0.1048, + "step": 3722 + }, + { + "epoch": 0.3724, + "grad_norm": 0.04572045058012009, + "learning_rate": 1.5809871002524602e-05, + "loss": 0.2859, + "step": 3724 + }, + { + "epoch": 0.3726, + "grad_norm": 1.7575660943984985, + "learning_rate": 1.580418740412526e-05, + "loss": 0.2709, + "step": 3726 + }, + { + "epoch": 0.3728, + "grad_norm": 2.0812900066375732, + "learning_rate": 1.5798500976835493e-05, + "loss": 0.1329, + "step": 3728 + }, + { + "epoch": 0.373, + "grad_norm": 2.7206802368164062, + "learning_rate": 1.5792811723426787e-05, + "loss": 0.3221, + "step": 3730 + }, + { + "epoch": 0.3732, + "grad_norm": 71.73616027832031, + "learning_rate": 1.5787119646672025e-05, + "loss": 0.8704, + "step": 3732 + }, + { + "epoch": 0.3734, + "grad_norm": 1.9784677028656006, + "learning_rate": 1.5781424749345447e-05, + "loss": 0.3354, + "step": 3734 + }, + { + "epoch": 0.3736, + "grad_norm": 2.831066370010376, + "learning_rate": 1.5775727034222675e-05, + "loss": 0.3486, + "step": 3736 + }, + { + "epoch": 0.3738, + "grad_norm": 3.0011062622070312, + "learning_rate": 1.577002650408072e-05, + "loss": 0.4082, + "step": 3738 + }, + { + "epoch": 0.374, + "grad_norm": 4.472220420837402, + "learning_rate": 1.5764323161697933e-05, + "loss": 0.3217, + "step": 3740 + }, + { + "epoch": 0.3742, + "grad_norm": 1.3101431131362915, + "learning_rate": 1.5758617009854068e-05, + "loss": 0.1758, + "step": 3742 + }, + { + "epoch": 0.3744, + "grad_norm": 2.1950812339782715, + "learning_rate": 1.575290805133023e-05, + "loss": 0.2638, + "step": 3744 + }, + { + "epoch": 0.3746, + "grad_norm": 5.400842666625977, + "learning_rate": 1.5747196288908887e-05, + "loss": 0.2665, + "step": 3746 + }, + { + "epoch": 0.3748, + "grad_norm": 1.4637328386306763, + "learning_rate": 1.57414817253739e-05, + "loss": 0.1101, + "step": 3748 + }, + { + "epoch": 0.375, + "grad_norm": 2.6760220527648926, + "learning_rate": 1.573576436351046e-05, + "loss": 0.1874, + "step": 3750 + }, + { + "epoch": 0.3752, + "grad_norm": 1.647377848625183, + "learning_rate": 1.5730044206105156e-05, + "loss": 0.265, + "step": 3752 + }, + { + "epoch": 0.3754, + "grad_norm": 0.910101056098938, + "learning_rate": 1.572432125594591e-05, + "loss": 0.1446, + "step": 3754 + }, + { + "epoch": 0.3756, + "grad_norm": 2.8350636959075928, + "learning_rate": 1.5718595515822027e-05, + "loss": 0.21, + "step": 3756 + }, + { + "epoch": 0.3758, + "grad_norm": 1.6864395141601562, + "learning_rate": 1.5712866988524157e-05, + "loss": 0.1502, + "step": 3758 + }, + { + "epoch": 0.376, + "grad_norm": 2.9960713386535645, + "learning_rate": 1.570713567684432e-05, + "loss": 0.0729, + "step": 3760 + }, + { + "epoch": 0.3762, + "grad_norm": 4.44609260559082, + "learning_rate": 1.5701401583575883e-05, + "loss": 0.4096, + "step": 3762 + }, + { + "epoch": 0.3764, + "grad_norm": 1.7341505289077759, + "learning_rate": 1.5695664711513575e-05, + "loss": 0.3617, + "step": 3764 + }, + { + "epoch": 0.3766, + "grad_norm": 4.759166717529297, + "learning_rate": 1.5689925063453483e-05, + "loss": 0.3052, + "step": 3766 + }, + { + "epoch": 0.3768, + "grad_norm": 1.633755087852478, + "learning_rate": 1.568418264219303e-05, + "loss": 0.1502, + "step": 3768 + }, + { + "epoch": 0.377, + "grad_norm": 5.2214674949646, + "learning_rate": 1.5678437450531014e-05, + "loss": 0.6165, + "step": 3770 + }, + { + "epoch": 0.3772, + "grad_norm": 4.60928201675415, + "learning_rate": 1.567268949126757e-05, + "loss": 1.0418, + "step": 3772 + }, + { + "epoch": 0.3774, + "grad_norm": 2.6201138496398926, + "learning_rate": 1.5666938767204173e-05, + "loss": 0.1831, + "step": 3774 + }, + { + "epoch": 0.3776, + "grad_norm": 4.502782821655273, + "learning_rate": 1.5661185281143666e-05, + "loss": 0.4843, + "step": 3776 + }, + { + "epoch": 0.3778, + "grad_norm": 2.0136187076568604, + "learning_rate": 1.565542903589023e-05, + "loss": 0.1082, + "step": 3778 + }, + { + "epoch": 0.378, + "grad_norm": 6.40091609954834, + "learning_rate": 1.564967003424938e-05, + "loss": 0.3153, + "step": 3780 + }, + { + "epoch": 0.3782, + "grad_norm": 4.387092113494873, + "learning_rate": 1.5643908279027994e-05, + "loss": 0.4898, + "step": 3782 + }, + { + "epoch": 0.3784, + "grad_norm": 5.05969762802124, + "learning_rate": 1.5638143773034268e-05, + "loss": 0.1299, + "step": 3784 + }, + { + "epoch": 0.3786, + "grad_norm": 2.419077157974243, + "learning_rate": 1.563237651907777e-05, + "loss": 0.1766, + "step": 3786 + }, + { + "epoch": 0.3788, + "grad_norm": 2.8465256690979004, + "learning_rate": 1.562660651996937e-05, + "loss": 0.2252, + "step": 3788 + }, + { + "epoch": 0.379, + "grad_norm": 4.09852409362793, + "learning_rate": 1.5620833778521306e-05, + "loss": 0.2779, + "step": 3790 + }, + { + "epoch": 0.3792, + "grad_norm": 2.0170750617980957, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.2225, + "step": 3792 + }, + { + "epoch": 0.3794, + "grad_norm": 2.328763484954834, + "learning_rate": 1.560928007986178e-05, + "loss": 0.2018, + "step": 3794 + }, + { + "epoch": 0.3796, + "grad_norm": 0.9967830181121826, + "learning_rate": 1.5603499128281447e-05, + "loss": 0.1166, + "step": 3796 + }, + { + "epoch": 0.3798, + "grad_norm": 2.3807787895202637, + "learning_rate": 1.5597715445623714e-05, + "loss": 0.2158, + "step": 3798 + }, + { + "epoch": 0.38, + "grad_norm": 0.8536610007286072, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.0831, + "step": 3800 + }, + { + "epoch": 0.3802, + "grad_norm": 2.241077184677124, + "learning_rate": 1.558613989835295e-05, + "loss": 0.2934, + "step": 3802 + }, + { + "epoch": 0.3804, + "grad_norm": 2.8639750480651855, + "learning_rate": 1.55803480393817e-05, + "loss": 0.2933, + "step": 3804 + }, + { + "epoch": 0.3806, + "grad_norm": 3.1483120918273926, + "learning_rate": 1.5574553460616608e-05, + "loss": 0.2378, + "step": 3806 + }, + { + "epoch": 0.3808, + "grad_norm": 1.4069608449935913, + "learning_rate": 1.556875616488188e-05, + "loss": 0.0931, + "step": 3808 + }, + { + "epoch": 0.381, + "grad_norm": 8.284597396850586, + "learning_rate": 1.556295615500305e-05, + "loss": 0.2399, + "step": 3810 + }, + { + "epoch": 0.3812, + "grad_norm": 2.229138135910034, + "learning_rate": 1.5557153433806967e-05, + "loss": 0.426, + "step": 3812 + }, + { + "epoch": 0.3814, + "grad_norm": 8.875584602355957, + "learning_rate": 1.555134800412181e-05, + "loss": 0.6106, + "step": 3814 + }, + { + "epoch": 0.3816, + "grad_norm": 3.3714170455932617, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.4836, + "step": 3816 + }, + { + "epoch": 0.3818, + "grad_norm": 0.41438645124435425, + "learning_rate": 1.5539729030603574e-05, + "loss": 0.0368, + "step": 3818 + }, + { + "epoch": 0.382, + "grad_norm": 5.786258220672607, + "learning_rate": 1.553391549243344e-05, + "loss": 0.3662, + "step": 3820 + }, + { + "epoch": 0.3822, + "grad_norm": 5.490174770355225, + "learning_rate": 1.5528099257100126e-05, + "loss": 0.2019, + "step": 3822 + }, + { + "epoch": 0.3824, + "grad_norm": 2.847789764404297, + "learning_rate": 1.5522280327438388e-05, + "loss": 0.1019, + "step": 3824 + }, + { + "epoch": 0.3826, + "grad_norm": 5.841432094573975, + "learning_rate": 1.5516458706284306e-05, + "loss": 0.3944, + "step": 3826 + }, + { + "epoch": 0.3828, + "grad_norm": 0.4141913652420044, + "learning_rate": 1.5510634396475262e-05, + "loss": 0.1238, + "step": 3828 + }, + { + "epoch": 0.383, + "grad_norm": 0.48015329241752625, + "learning_rate": 1.5504807400849957e-05, + "loss": 0.1493, + "step": 3830 + }, + { + "epoch": 0.3832, + "grad_norm": 4.403670310974121, + "learning_rate": 1.54989777222484e-05, + "loss": 0.3038, + "step": 3832 + }, + { + "epoch": 0.3834, + "grad_norm": 0.3943869471549988, + "learning_rate": 1.54931453635119e-05, + "loss": 0.1589, + "step": 3834 + }, + { + "epoch": 0.3836, + "grad_norm": 1.3341339826583862, + "learning_rate": 1.5487310327483087e-05, + "loss": 0.0713, + "step": 3836 + }, + { + "epoch": 0.3838, + "grad_norm": 1.9282124042510986, + "learning_rate": 1.5481472617005878e-05, + "loss": 0.1589, + "step": 3838 + }, + { + "epoch": 0.384, + "grad_norm": 0.19355642795562744, + "learning_rate": 1.5475632234925505e-05, + "loss": 0.0473, + "step": 3840 + }, + { + "epoch": 0.3842, + "grad_norm": 6.0235443115234375, + "learning_rate": 1.5469789184088498e-05, + "loss": 0.1132, + "step": 3842 + }, + { + "epoch": 0.3844, + "grad_norm": 3.6379761695861816, + "learning_rate": 1.5463943467342694e-05, + "loss": 0.3128, + "step": 3844 + }, + { + "epoch": 0.3846, + "grad_norm": 7.515876770019531, + "learning_rate": 1.5458095087537216e-05, + "loss": 0.227, + "step": 3846 + }, + { + "epoch": 0.3848, + "grad_norm": 0.5887134075164795, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.0465, + "step": 3848 + }, + { + "epoch": 0.385, + "grad_norm": 7.400668144226074, + "learning_rate": 1.5446390350150272e-05, + "loss": 0.3327, + "step": 3850 + }, + { + "epoch": 0.3852, + "grad_norm": 10.498122215270996, + "learning_rate": 1.544053399827355e-05, + "loss": 0.5829, + "step": 3852 + }, + { + "epoch": 0.3854, + "grad_norm": 4.243395805358887, + "learning_rate": 1.543467499474665e-05, + "loss": 0.6758, + "step": 3854 + }, + { + "epoch": 0.3856, + "grad_norm": 1.7674437761306763, + "learning_rate": 1.5428813342425177e-05, + "loss": 0.0365, + "step": 3856 + }, + { + "epoch": 0.3858, + "grad_norm": 5.80580472946167, + "learning_rate": 1.542294904416603e-05, + "loss": 0.4156, + "step": 3858 + }, + { + "epoch": 0.386, + "grad_norm": 6.784333229064941, + "learning_rate": 1.54170821028274e-05, + "loss": 1.0451, + "step": 3860 + }, + { + "epoch": 0.3862, + "grad_norm": 1.5565739870071411, + "learning_rate": 1.541121252126876e-05, + "loss": 0.6517, + "step": 3862 + }, + { + "epoch": 0.3864, + "grad_norm": 3.8387246131896973, + "learning_rate": 1.540534030235087e-05, + "loss": 0.1277, + "step": 3864 + }, + { + "epoch": 0.3866, + "grad_norm": 1.721896767616272, + "learning_rate": 1.5399465448935788e-05, + "loss": 0.1244, + "step": 3866 + }, + { + "epoch": 0.3868, + "grad_norm": 3.545351028442383, + "learning_rate": 1.5393587963886837e-05, + "loss": 0.2779, + "step": 3868 + }, + { + "epoch": 0.387, + "grad_norm": 5.585663795471191, + "learning_rate": 1.5387707850068633e-05, + "loss": 0.2539, + "step": 3870 + }, + { + "epoch": 0.3872, + "grad_norm": 2.1478636264801025, + "learning_rate": 1.5381825110347072e-05, + "loss": 0.1685, + "step": 3872 + }, + { + "epoch": 0.3874, + "grad_norm": 5.317318916320801, + "learning_rate": 1.5375939747589334e-05, + "loss": 0.46, + "step": 3874 + }, + { + "epoch": 0.3876, + "grad_norm": 5.378169059753418, + "learning_rate": 1.5370051764663872e-05, + "loss": 1.0945, + "step": 3876 + }, + { + "epoch": 0.3878, + "grad_norm": 3.7742326259613037, + "learning_rate": 1.5364161164440413e-05, + "loss": 0.37, + "step": 3878 + }, + { + "epoch": 0.388, + "grad_norm": 3.6353495121002197, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.5662, + "step": 3880 + }, + { + "epoch": 0.3882, + "grad_norm": 1.2449723482131958, + "learning_rate": 1.5352372123584816e-05, + "loss": 0.1093, + "step": 3882 + }, + { + "epoch": 0.3884, + "grad_norm": 6.004200458526611, + "learning_rate": 1.5346473688698514e-05, + "loss": 0.2528, + "step": 3884 + }, + { + "epoch": 0.3886, + "grad_norm": 3.6601991653442383, + "learning_rate": 1.5340572648005887e-05, + "loss": 0.4231, + "step": 3886 + }, + { + "epoch": 0.3888, + "grad_norm": 1.393237829208374, + "learning_rate": 1.533466900438303e-05, + "loss": 0.1163, + "step": 3888 + }, + { + "epoch": 0.389, + "grad_norm": 1.5533102750778198, + "learning_rate": 1.53287627607073e-05, + "loss": 0.2865, + "step": 3890 + }, + { + "epoch": 0.3892, + "grad_norm": 4.148789405822754, + "learning_rate": 1.532285391985734e-05, + "loss": 0.2199, + "step": 3892 + }, + { + "epoch": 0.3894, + "grad_norm": 2.0072567462921143, + "learning_rate": 1.5316942484713043e-05, + "loss": 0.2389, + "step": 3894 + }, + { + "epoch": 0.3896, + "grad_norm": 1.068885087966919, + "learning_rate": 1.5311028458155567e-05, + "loss": 0.1165, + "step": 3896 + }, + { + "epoch": 0.3898, + "grad_norm": 2.0256881713867188, + "learning_rate": 1.5305111843067343e-05, + "loss": 0.13, + "step": 3898 + }, + { + "epoch": 0.39, + "grad_norm": 7.451221942901611, + "learning_rate": 1.529919264233205e-05, + "loss": 0.4402, + "step": 3900 + }, + { + "epoch": 0.3902, + "grad_norm": 2.4691104888916016, + "learning_rate": 1.5293270858834643e-05, + "loss": 0.1703, + "step": 3902 + }, + { + "epoch": 0.3904, + "grad_norm": 2.645686626434326, + "learning_rate": 1.528734649546132e-05, + "loss": 0.1431, + "step": 3904 + }, + { + "epoch": 0.3906, + "grad_norm": 2.4536731243133545, + "learning_rate": 1.5281419555099547e-05, + "loss": 0.2159, + "step": 3906 + }, + { + "epoch": 0.3908, + "grad_norm": 4.8123369216918945, + "learning_rate": 1.5275490040638038e-05, + "loss": 0.3612, + "step": 3908 + }, + { + "epoch": 0.391, + "grad_norm": 3.0149338245391846, + "learning_rate": 1.5269557954966777e-05, + "loss": 0.6132, + "step": 3910 + }, + { + "epoch": 0.3912, + "grad_norm": 1.004098653793335, + "learning_rate": 1.526362330097698e-05, + "loss": 0.1771, + "step": 3912 + }, + { + "epoch": 0.3914, + "grad_norm": 1.6169664859771729, + "learning_rate": 1.5257686081561134e-05, + "loss": 0.1448, + "step": 3914 + }, + { + "epoch": 0.3916, + "grad_norm": 1.3510079383850098, + "learning_rate": 1.5251746299612959e-05, + "loss": 0.165, + "step": 3916 + }, + { + "epoch": 0.3918, + "grad_norm": 4.330749034881592, + "learning_rate": 1.5245803958027434e-05, + "loss": 0.1816, + "step": 3918 + }, + { + "epoch": 0.392, + "grad_norm": 0.5666859149932861, + "learning_rate": 1.5239859059700794e-05, + "loss": 0.2065, + "step": 3920 + }, + { + "epoch": 0.3922, + "grad_norm": 1.8087283372879028, + "learning_rate": 1.5233911607530499e-05, + "loss": 0.3486, + "step": 3922 + }, + { + "epoch": 0.3924, + "grad_norm": 0.2810194194316864, + "learning_rate": 1.5227961604415266e-05, + "loss": 0.2413, + "step": 3924 + }, + { + "epoch": 0.3926, + "grad_norm": 0.4464583396911621, + "learning_rate": 1.5222009053255061e-05, + "loss": 0.128, + "step": 3926 + }, + { + "epoch": 0.3928, + "grad_norm": 2.6019718647003174, + "learning_rate": 1.5216053956951081e-05, + "loss": 0.2469, + "step": 3928 + }, + { + "epoch": 0.393, + "grad_norm": 1.1385291814804077, + "learning_rate": 1.5210096318405768e-05, + "loss": 0.1115, + "step": 3930 + }, + { + "epoch": 0.3932, + "grad_norm": 3.9281117916107178, + "learning_rate": 1.5204136140522799e-05, + "loss": 0.4973, + "step": 3932 + }, + { + "epoch": 0.3934, + "grad_norm": 3.227299451828003, + "learning_rate": 1.5198173426207095e-05, + "loss": 0.5417, + "step": 3934 + }, + { + "epoch": 0.3936, + "grad_norm": 11.265265464782715, + "learning_rate": 1.5192208178364815e-05, + "loss": 0.2764, + "step": 3936 + }, + { + "epoch": 0.3938, + "grad_norm": 0.23566818237304688, + "learning_rate": 1.5186240399903343e-05, + "loss": 0.0762, + "step": 3938 + }, + { + "epoch": 0.394, + "grad_norm": 0.31829413771629333, + "learning_rate": 1.5180270093731305e-05, + "loss": 0.0969, + "step": 3940 + }, + { + "epoch": 0.3942, + "grad_norm": 3.7859840393066406, + "learning_rate": 1.5174297262758551e-05, + "loss": 0.1426, + "step": 3942 + }, + { + "epoch": 0.3944, + "grad_norm": 3.925340175628662, + "learning_rate": 1.5168321909896171e-05, + "loss": 0.4155, + "step": 3944 + }, + { + "epoch": 0.3946, + "grad_norm": 2.304399251937866, + "learning_rate": 1.5162344038056476e-05, + "loss": 0.1443, + "step": 3946 + }, + { + "epoch": 0.3948, + "grad_norm": 8.289799690246582, + "learning_rate": 1.5156363650153012e-05, + "loss": 0.4155, + "step": 3948 + }, + { + "epoch": 0.395, + "grad_norm": 3.2277050018310547, + "learning_rate": 1.5150380749100545e-05, + "loss": 0.1057, + "step": 3950 + }, + { + "epoch": 0.3952, + "grad_norm": 6.432862758636475, + "learning_rate": 1.5144395337815066e-05, + "loss": 0.5778, + "step": 3952 + }, + { + "epoch": 0.3954, + "grad_norm": 3.8260443210601807, + "learning_rate": 1.5138407419213797e-05, + "loss": 0.2106, + "step": 3954 + }, + { + "epoch": 0.3956, + "grad_norm": 0.7724792957305908, + "learning_rate": 1.5132416996215171e-05, + "loss": 0.2766, + "step": 3956 + }, + { + "epoch": 0.3958, + "grad_norm": 1.293936848640442, + "learning_rate": 1.5126424071738853e-05, + "loss": 0.1828, + "step": 3958 + }, + { + "epoch": 0.396, + "grad_norm": 1.1515394449234009, + "learning_rate": 1.5120428648705716e-05, + "loss": 0.2477, + "step": 3960 + }, + { + "epoch": 0.3962, + "grad_norm": 0.5131187438964844, + "learning_rate": 1.511443073003786e-05, + "loss": 0.0488, + "step": 3962 + }, + { + "epoch": 0.3964, + "grad_norm": 0.6063569784164429, + "learning_rate": 1.51084303186586e-05, + "loss": 0.2398, + "step": 3964 + }, + { + "epoch": 0.3966, + "grad_norm": 0.7434547543525696, + "learning_rate": 1.510242741749246e-05, + "loss": 0.3268, + "step": 3966 + }, + { + "epoch": 0.3968, + "grad_norm": 4.418087959289551, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.2472, + "step": 3968 + }, + { + "epoch": 0.397, + "grad_norm": 1.6690775156021118, + "learning_rate": 1.5090414157503715e-05, + "loss": 0.2785, + "step": 3970 + }, + { + "epoch": 0.3972, + "grad_norm": 5.34909725189209, + "learning_rate": 1.508440380453623e-05, + "loss": 0.2782, + "step": 3972 + }, + { + "epoch": 0.3974, + "grad_norm": 2.4384002685546875, + "learning_rate": 1.5078390973492094e-05, + "loss": 0.1958, + "step": 3974 + }, + { + "epoch": 0.3976, + "grad_norm": 0.29154571890830994, + "learning_rate": 1.5072375667301893e-05, + "loss": 0.1558, + "step": 3976 + }, + { + "epoch": 0.3978, + "grad_norm": 1.5846441984176636, + "learning_rate": 1.506635788889741e-05, + "loss": 0.066, + "step": 3978 + }, + { + "epoch": 0.398, + "grad_norm": 8.921165466308594, + "learning_rate": 1.5060337641211637e-05, + "loss": 0.2358, + "step": 3980 + }, + { + "epoch": 0.3982, + "grad_norm": 4.063834190368652, + "learning_rate": 1.5054314927178779e-05, + "loss": 0.0888, + "step": 3982 + }, + { + "epoch": 0.3984, + "grad_norm": 3.3818576335906982, + "learning_rate": 1.504828974973422e-05, + "loss": 0.3059, + "step": 3984 + }, + { + "epoch": 0.3986, + "grad_norm": 0.7056012153625488, + "learning_rate": 1.5042262111814566e-05, + "loss": 0.0873, + "step": 3986 + }, + { + "epoch": 0.3988, + "grad_norm": 0.42469051480293274, + "learning_rate": 1.503623201635761e-05, + "loss": 0.0385, + "step": 3988 + }, + { + "epoch": 0.399, + "grad_norm": 0.20332230627536774, + "learning_rate": 1.5030199466302354e-05, + "loss": 0.0384, + "step": 3990 + }, + { + "epoch": 0.3992, + "grad_norm": 0.7149085998535156, + "learning_rate": 1.5024164464588982e-05, + "loss": 0.0613, + "step": 3992 + }, + { + "epoch": 0.3994, + "grad_norm": 3.9178693294525146, + "learning_rate": 1.5018127014158886e-05, + "loss": 0.3323, + "step": 3994 + }, + { + "epoch": 0.3996, + "grad_norm": 0.21910303831100464, + "learning_rate": 1.5012087117954643e-05, + "loss": 0.0186, + "step": 3996 + }, + { + "epoch": 0.3998, + "grad_norm": 2.0214710235595703, + "learning_rate": 1.5006044778920028e-05, + "loss": 0.1199, + "step": 3998 + }, + { + "epoch": 0.4, + "grad_norm": 10.891829490661621, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.6022, + "step": 4000 + }, + { + "epoch": 0.4002, + "grad_norm": 8.957113265991211, + "learning_rate": 1.4993952784140716e-05, + "loss": 0.8945, + "step": 4002 + }, + { + "epoch": 0.4004, + "grad_norm": 6.367103576660156, + "learning_rate": 1.498790313428951e-05, + "loss": 0.9924, + "step": 4004 + }, + { + "epoch": 0.4006, + "grad_norm": 2.3334765434265137, + "learning_rate": 1.498185105339491e-05, + "loss": 0.1447, + "step": 4006 + }, + { + "epoch": 0.4008, + "grad_norm": 6.412349700927734, + "learning_rate": 1.4975796544406627e-05, + "loss": 0.3843, + "step": 4008 + }, + { + "epoch": 0.401, + "grad_norm": 5.949571132659912, + "learning_rate": 1.4969739610275556e-05, + "loss": 0.2046, + "step": 4010 + }, + { + "epoch": 0.4012, + "grad_norm": 3.0599565505981445, + "learning_rate": 1.496368025395377e-05, + "loss": 0.3331, + "step": 4012 + }, + { + "epoch": 0.4014, + "grad_norm": 6.018095970153809, + "learning_rate": 1.4957618478394529e-05, + "loss": 0.8022, + "step": 4014 + }, + { + "epoch": 0.4016, + "grad_norm": 1.3438384532928467, + "learning_rate": 1.4951554286552266e-05, + "loss": 0.0767, + "step": 4016 + }, + { + "epoch": 0.4018, + "grad_norm": 0.9696942567825317, + "learning_rate": 1.4945487681382597e-05, + "loss": 0.1764, + "step": 4018 + }, + { + "epoch": 0.402, + "grad_norm": 0.21510916948318481, + "learning_rate": 1.493941866584231e-05, + "loss": 0.0221, + "step": 4020 + }, + { + "epoch": 0.4022, + "grad_norm": 0.23609517514705658, + "learning_rate": 1.4933347242889371e-05, + "loss": 0.217, + "step": 4022 + }, + { + "epoch": 0.4024, + "grad_norm": 4.969806671142578, + "learning_rate": 1.4927273415482916e-05, + "loss": 0.4544, + "step": 4024 + }, + { + "epoch": 0.4026, + "grad_norm": 1.2433180809020996, + "learning_rate": 1.4921197186583256e-05, + "loss": 0.1105, + "step": 4026 + }, + { + "epoch": 0.4028, + "grad_norm": 2.923341989517212, + "learning_rate": 1.4915118559151871e-05, + "loss": 0.2273, + "step": 4028 + }, + { + "epoch": 0.403, + "grad_norm": 1.4066460132598877, + "learning_rate": 1.490903753615141e-05, + "loss": 0.2611, + "step": 4030 + }, + { + "epoch": 0.4032, + "grad_norm": 0.39710551500320435, + "learning_rate": 1.4902954120545687e-05, + "loss": 0.0931, + "step": 4032 + }, + { + "epoch": 0.4034, + "grad_norm": 0.1472305953502655, + "learning_rate": 1.4896868315299692e-05, + "loss": 0.5377, + "step": 4034 + }, + { + "epoch": 0.4036, + "grad_norm": 2.7603299617767334, + "learning_rate": 1.4890780123379565e-05, + "loss": 0.5137, + "step": 4036 + }, + { + "epoch": 0.4038, + "grad_norm": 3.2153124809265137, + "learning_rate": 1.488468954775262e-05, + "loss": 0.484, + "step": 4038 + }, + { + "epoch": 0.404, + "grad_norm": 1.6375248432159424, + "learning_rate": 1.4878596591387329e-05, + "loss": 0.4955, + "step": 4040 + }, + { + "epoch": 0.4042, + "grad_norm": 3.399106740951538, + "learning_rate": 1.4872501257253325e-05, + "loss": 0.3153, + "step": 4042 + }, + { + "epoch": 0.4044, + "grad_norm": 0.6359753012657166, + "learning_rate": 1.4866403548321402e-05, + "loss": 0.2364, + "step": 4044 + }, + { + "epoch": 0.4046, + "grad_norm": 1.5492560863494873, + "learning_rate": 1.4860303467563504e-05, + "loss": 0.1671, + "step": 4046 + }, + { + "epoch": 0.4048, + "grad_norm": 1.0665173530578613, + "learning_rate": 1.485420101795274e-05, + "loss": 0.2165, + "step": 4048 + }, + { + "epoch": 0.405, + "grad_norm": 3.6870357990264893, + "learning_rate": 1.4848096202463373e-05, + "loss": 0.1688, + "step": 4050 + }, + { + "epoch": 0.4052, + "grad_norm": 0.9266182780265808, + "learning_rate": 1.4841989024070809e-05, + "loss": 0.2438, + "step": 4052 + }, + { + "epoch": 0.4054, + "grad_norm": 4.050524711608887, + "learning_rate": 1.4835879485751617e-05, + "loss": 0.2883, + "step": 4054 + }, + { + "epoch": 0.4056, + "grad_norm": 0.8568427562713623, + "learning_rate": 1.4829767590483508e-05, + "loss": 0.2807, + "step": 4056 + }, + { + "epoch": 0.4058, + "grad_norm": 1.4737147092819214, + "learning_rate": 1.4823653341245353e-05, + "loss": 0.1027, + "step": 4058 + }, + { + "epoch": 0.406, + "grad_norm": 1.3100484609603882, + "learning_rate": 1.4817536741017153e-05, + "loss": 0.4833, + "step": 4060 + }, + { + "epoch": 0.4062, + "grad_norm": 4.747870922088623, + "learning_rate": 1.4811417792780074e-05, + "loss": 0.4516, + "step": 4062 + }, + { + "epoch": 0.4064, + "grad_norm": 0.919586718082428, + "learning_rate": 1.4805296499516408e-05, + "loss": 0.0713, + "step": 4064 + }, + { + "epoch": 0.4066, + "grad_norm": 1.7173786163330078, + "learning_rate": 1.4799172864209607e-05, + "loss": 0.2624, + "step": 4066 + }, + { + "epoch": 0.4068, + "grad_norm": 1.2244130373001099, + "learning_rate": 1.4793046889844252e-05, + "loss": 0.3218, + "step": 4068 + }, + { + "epoch": 0.407, + "grad_norm": 3.5205001831054688, + "learning_rate": 1.478691857940607e-05, + "loss": 0.1591, + "step": 4070 + }, + { + "epoch": 0.4072, + "grad_norm": 0.8904837965965271, + "learning_rate": 1.4780787935881925e-05, + "loss": 0.1167, + "step": 4072 + }, + { + "epoch": 0.4074, + "grad_norm": 2.976209878921509, + "learning_rate": 1.4774654962259813e-05, + "loss": 0.4511, + "step": 4074 + }, + { + "epoch": 0.4076, + "grad_norm": 6.929346084594727, + "learning_rate": 1.4768519661528879e-05, + "loss": 0.3354, + "step": 4076 + }, + { + "epoch": 0.4078, + "grad_norm": 1.2232463359832764, + "learning_rate": 1.4762382036679393e-05, + "loss": 0.2437, + "step": 4078 + }, + { + "epoch": 0.408, + "grad_norm": 11.10765552520752, + "learning_rate": 1.4756242090702756e-05, + "loss": 0.7635, + "step": 4080 + }, + { + "epoch": 0.4082, + "grad_norm": 0.9831032156944275, + "learning_rate": 1.47500998265915e-05, + "loss": 0.1388, + "step": 4082 + }, + { + "epoch": 0.4084, + "grad_norm": 2.8441288471221924, + "learning_rate": 1.4743955247339292e-05, + "loss": 0.067, + "step": 4084 + }, + { + "epoch": 0.4086, + "grad_norm": 3.623744010925293, + "learning_rate": 1.4737808355940932e-05, + "loss": 0.2966, + "step": 4086 + }, + { + "epoch": 0.4088, + "grad_norm": 4.100814342498779, + "learning_rate": 1.4731659155392332e-05, + "loss": 0.305, + "step": 4088 + }, + { + "epoch": 0.409, + "grad_norm": 2.667576789855957, + "learning_rate": 1.4725507648690542e-05, + "loss": 0.285, + "step": 4090 + }, + { + "epoch": 0.4092, + "grad_norm": 1.5522990226745605, + "learning_rate": 1.4719353838833729e-05, + "loss": 0.2671, + "step": 4092 + }, + { + "epoch": 0.4094, + "grad_norm": 2.0260627269744873, + "learning_rate": 1.4713197728821185e-05, + "loss": 0.0989, + "step": 4094 + }, + { + "epoch": 0.4096, + "grad_norm": 2.175962209701538, + "learning_rate": 1.470703932165333e-05, + "loss": 0.2047, + "step": 4096 + }, + { + "epoch": 0.4098, + "grad_norm": 0.8743038177490234, + "learning_rate": 1.4700878620331684e-05, + "loss": 0.062, + "step": 4098 + }, + { + "epoch": 0.41, + "grad_norm": 0.4797155261039734, + "learning_rate": 1.469471562785891e-05, + "loss": 0.177, + "step": 4100 + }, + { + "epoch": 0.4102, + "grad_norm": 3.1618378162384033, + "learning_rate": 1.468855034723877e-05, + "loss": 0.1361, + "step": 4102 + }, + { + "epoch": 0.4104, + "grad_norm": 1.7610321044921875, + "learning_rate": 1.4682382781476146e-05, + "loss": 0.352, + "step": 4104 + }, + { + "epoch": 0.4106, + "grad_norm": 3.922696113586426, + "learning_rate": 1.467621293357704e-05, + "loss": 0.216, + "step": 4106 + }, + { + "epoch": 0.4108, + "grad_norm": 1.982114315032959, + "learning_rate": 1.4670040806548555e-05, + "loss": 0.1477, + "step": 4108 + }, + { + "epoch": 0.411, + "grad_norm": 1.4725905656814575, + "learning_rate": 1.4663866403398915e-05, + "loss": 0.1389, + "step": 4110 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3389441967010498, + "learning_rate": 1.4657689727137443e-05, + "loss": 0.0464, + "step": 4112 + }, + { + "epoch": 0.4114, + "grad_norm": 3.0283443927764893, + "learning_rate": 1.4651510780774585e-05, + "loss": 0.1501, + "step": 4114 + }, + { + "epoch": 0.4116, + "grad_norm": 3.1762940883636475, + "learning_rate": 1.464532956732188e-05, + "loss": 0.306, + "step": 4116 + }, + { + "epoch": 0.4118, + "grad_norm": 5.786334991455078, + "learning_rate": 1.4639146089791972e-05, + "loss": 1.1591, + "step": 4118 + }, + { + "epoch": 0.412, + "grad_norm": 0.40633296966552734, + "learning_rate": 1.463296035119862e-05, + "loss": 0.0245, + "step": 4120 + }, + { + "epoch": 0.4122, + "grad_norm": 0.9293982982635498, + "learning_rate": 1.462677235455667e-05, + "loss": 0.637, + "step": 4122 + }, + { + "epoch": 0.4124, + "grad_norm": 5.606644630432129, + "learning_rate": 1.4620582102882088e-05, + "loss": 0.3226, + "step": 4124 + }, + { + "epoch": 0.4126, + "grad_norm": 19.793380737304688, + "learning_rate": 1.4614389599191917e-05, + "loss": 0.7696, + "step": 4126 + }, + { + "epoch": 0.4128, + "grad_norm": 0.6690611839294434, + "learning_rate": 1.4608194846504311e-05, + "loss": 0.1382, + "step": 4128 + }, + { + "epoch": 0.413, + "grad_norm": 0.2825923264026642, + "learning_rate": 1.4601997847838518e-05, + "loss": 0.0902, + "step": 4130 + }, + { + "epoch": 0.4132, + "grad_norm": 0.736546516418457, + "learning_rate": 1.4595798606214882e-05, + "loss": 0.0449, + "step": 4132 + }, + { + "epoch": 0.4134, + "grad_norm": 1.4477406740188599, + "learning_rate": 1.4589597124654834e-05, + "loss": 0.364, + "step": 4134 + }, + { + "epoch": 0.4136, + "grad_norm": 2.479720115661621, + "learning_rate": 1.4583393406180898e-05, + "loss": 0.2541, + "step": 4136 + }, + { + "epoch": 0.4138, + "grad_norm": 0.9851766228675842, + "learning_rate": 1.4577187453816702e-05, + "loss": 0.3305, + "step": 4138 + }, + { + "epoch": 0.414, + "grad_norm": 3.515544891357422, + "learning_rate": 1.4570979270586944e-05, + "loss": 0.3577, + "step": 4140 + }, + { + "epoch": 0.4142, + "grad_norm": 6.137422561645508, + "learning_rate": 1.4564768859517417e-05, + "loss": 0.6612, + "step": 4142 + }, + { + "epoch": 0.4144, + "grad_norm": 0.7248104810714722, + "learning_rate": 1.4558556223635004e-05, + "loss": 0.5142, + "step": 4144 + }, + { + "epoch": 0.4146, + "grad_norm": 4.483447074890137, + "learning_rate": 1.455234136596766e-05, + "loss": 0.5309, + "step": 4146 + }, + { + "epoch": 0.4148, + "grad_norm": 1.0987902879714966, + "learning_rate": 1.454612428954444e-05, + "loss": 0.2048, + "step": 4148 + }, + { + "epoch": 0.415, + "grad_norm": 1.8370847702026367, + "learning_rate": 1.4539904997395468e-05, + "loss": 0.1904, + "step": 4150 + }, + { + "epoch": 0.4152, + "grad_norm": 0.8449780941009521, + "learning_rate": 1.4533683492551954e-05, + "loss": 0.1286, + "step": 4152 + }, + { + "epoch": 0.4154, + "grad_norm": 3.485955238342285, + "learning_rate": 1.452745977804618e-05, + "loss": 0.2953, + "step": 4154 + }, + { + "epoch": 0.4156, + "grad_norm": 4.550891876220703, + "learning_rate": 1.4521233856911507e-05, + "loss": 0.3446, + "step": 4156 + }, + { + "epoch": 0.4158, + "grad_norm": 3.756093978881836, + "learning_rate": 1.4515005732182384e-05, + "loss": 0.4926, + "step": 4158 + }, + { + "epoch": 0.416, + "grad_norm": 1.980782151222229, + "learning_rate": 1.4508775406894308e-05, + "loss": 0.2295, + "step": 4160 + }, + { + "epoch": 0.4162, + "grad_norm": 3.505768060684204, + "learning_rate": 1.4502542884083876e-05, + "loss": 0.3816, + "step": 4162 + }, + { + "epoch": 0.4164, + "grad_norm": 1.7373032569885254, + "learning_rate": 1.449630816678874e-05, + "loss": 0.3496, + "step": 4164 + }, + { + "epoch": 0.4166, + "grad_norm": 0.533490002155304, + "learning_rate": 1.4490071258047625e-05, + "loss": 0.3428, + "step": 4166 + }, + { + "epoch": 0.4168, + "grad_norm": 0.658152163028717, + "learning_rate": 1.4483832160900326e-05, + "loss": 0.2436, + "step": 4168 + }, + { + "epoch": 0.417, + "grad_norm": 4.593430995941162, + "learning_rate": 1.4477590878387697e-05, + "loss": 0.3639, + "step": 4170 + }, + { + "epoch": 0.4172, + "grad_norm": 3.0169713497161865, + "learning_rate": 1.4471347413551673e-05, + "loss": 0.47, + "step": 4172 + }, + { + "epoch": 0.4174, + "grad_norm": 5.3525238037109375, + "learning_rate": 1.4465101769435235e-05, + "loss": 0.1481, + "step": 4174 + }, + { + "epoch": 0.4176, + "grad_norm": 0.6148722767829895, + "learning_rate": 1.4458853949082443e-05, + "loss": 0.0541, + "step": 4176 + }, + { + "epoch": 0.4178, + "grad_norm": 2.856746196746826, + "learning_rate": 1.4452603955538397e-05, + "loss": 0.2195, + "step": 4178 + }, + { + "epoch": 0.418, + "grad_norm": 3.749067544937134, + "learning_rate": 1.4446351791849276e-05, + "loss": 0.2198, + "step": 4180 + }, + { + "epoch": 0.4182, + "grad_norm": 2.0266122817993164, + "learning_rate": 1.4440097461062308e-05, + "loss": 0.251, + "step": 4182 + }, + { + "epoch": 0.4184, + "grad_norm": 2.569699287414551, + "learning_rate": 1.4433840966225772e-05, + "loss": 0.3335, + "step": 4184 + }, + { + "epoch": 0.4186, + "grad_norm": 4.705577850341797, + "learning_rate": 1.442758231038902e-05, + "loss": 0.3522, + "step": 4186 + }, + { + "epoch": 0.4188, + "grad_norm": 4.342922687530518, + "learning_rate": 1.4421321496602428e-05, + "loss": 0.3434, + "step": 4188 + }, + { + "epoch": 0.419, + "grad_norm": 3.5279994010925293, + "learning_rate": 1.4415058527917454e-05, + "loss": 0.2378, + "step": 4190 + }, + { + "epoch": 0.4192, + "grad_norm": 2.5113978385925293, + "learning_rate": 1.4408793407386587e-05, + "loss": 0.5709, + "step": 4192 + }, + { + "epoch": 0.4194, + "grad_norm": 3.447615623474121, + "learning_rate": 1.4402526138063373e-05, + "loss": 0.1886, + "step": 4194 + }, + { + "epoch": 0.4196, + "grad_norm": 8.978830337524414, + "learning_rate": 1.43962567230024e-05, + "loss": 0.6332, + "step": 4196 + }, + { + "epoch": 0.4198, + "grad_norm": 5.255710601806641, + "learning_rate": 1.4389985165259308e-05, + "loss": 0.2853, + "step": 4198 + }, + { + "epoch": 0.42, + "grad_norm": 3.1465976238250732, + "learning_rate": 1.4383711467890776e-05, + "loss": 0.3569, + "step": 4200 + }, + { + "epoch": 0.4202, + "grad_norm": 1.8027558326721191, + "learning_rate": 1.4377435633954528e-05, + "loss": 0.4716, + "step": 4202 + }, + { + "epoch": 0.4204, + "grad_norm": 3.689570665359497, + "learning_rate": 1.437115766650933e-05, + "loss": 0.2541, + "step": 4204 + }, + { + "epoch": 0.4206, + "grad_norm": 1.088626742362976, + "learning_rate": 1.436487756861499e-05, + "loss": 0.3486, + "step": 4206 + }, + { + "epoch": 0.4208, + "grad_norm": 2.568286895751953, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.3805, + "step": 4208 + }, + { + "epoch": 0.421, + "grad_norm": 0.5302088260650635, + "learning_rate": 1.4352310993723277e-05, + "loss": 0.2574, + "step": 4210 + }, + { + "epoch": 0.4212, + "grad_norm": 1.1194888353347778, + "learning_rate": 1.4346024522850704e-05, + "loss": 0.2784, + "step": 4212 + }, + { + "epoch": 0.4214, + "grad_norm": 2.576188564300537, + "learning_rate": 1.4339735933778576e-05, + "loss": 0.2149, + "step": 4214 + }, + { + "epoch": 0.4216, + "grad_norm": 2.296700954437256, + "learning_rate": 1.4333445229571874e-05, + "loss": 0.2108, + "step": 4216 + }, + { + "epoch": 0.4218, + "grad_norm": 2.821171998977661, + "learning_rate": 1.4327152413296607e-05, + "loss": 0.2705, + "step": 4218 + }, + { + "epoch": 0.422, + "grad_norm": 1.341805338859558, + "learning_rate": 1.4320857488019826e-05, + "loss": 0.2299, + "step": 4220 + }, + { + "epoch": 0.4222, + "grad_norm": 1.6580400466918945, + "learning_rate": 1.4314560456809592e-05, + "loss": 0.2275, + "step": 4222 + }, + { + "epoch": 0.4224, + "grad_norm": 2.51485538482666, + "learning_rate": 1.4308261322735006e-05, + "loss": 0.1539, + "step": 4224 + }, + { + "epoch": 0.4226, + "grad_norm": 3.0908501148223877, + "learning_rate": 1.4301960088866187e-05, + "loss": 0.2641, + "step": 4226 + }, + { + "epoch": 0.4228, + "grad_norm": 1.4558466672897339, + "learning_rate": 1.4295656758274283e-05, + "loss": 0.1485, + "step": 4228 + }, + { + "epoch": 0.423, + "grad_norm": 0.8807215094566345, + "learning_rate": 1.4289351334031461e-05, + "loss": 0.1382, + "step": 4230 + }, + { + "epoch": 0.4232, + "grad_norm": 3.433450222015381, + "learning_rate": 1.4283043819210905e-05, + "loss": 0.2419, + "step": 4232 + }, + { + "epoch": 0.4234, + "grad_norm": 3.9653942584991455, + "learning_rate": 1.4276734216886823e-05, + "loss": 0.3388, + "step": 4234 + }, + { + "epoch": 0.4236, + "grad_norm": 0.6012662053108215, + "learning_rate": 1.4270422530134433e-05, + "loss": 0.1638, + "step": 4236 + }, + { + "epoch": 0.4238, + "grad_norm": 4.257350444793701, + "learning_rate": 1.4264108762029989e-05, + "loss": 0.4439, + "step": 4238 + }, + { + "epoch": 0.424, + "grad_norm": 1.5174967050552368, + "learning_rate": 1.4257792915650728e-05, + "loss": 0.1754, + "step": 4240 + }, + { + "epoch": 0.4242, + "grad_norm": 7.645261764526367, + "learning_rate": 1.4251474994074927e-05, + "loss": 0.5975, + "step": 4242 + }, + { + "epoch": 0.4244, + "grad_norm": 5.972664833068848, + "learning_rate": 1.424515500038186e-05, + "loss": 0.7073, + "step": 4244 + }, + { + "epoch": 0.4246, + "grad_norm": 3.2304024696350098, + "learning_rate": 1.4238832937651816e-05, + "loss": 0.4696, + "step": 4246 + }, + { + "epoch": 0.4248, + "grad_norm": 6.339985370635986, + "learning_rate": 1.4232508808966097e-05, + "loss": 0.4738, + "step": 4248 + }, + { + "epoch": 0.425, + "grad_norm": 3.145146369934082, + "learning_rate": 1.4226182617406996e-05, + "loss": 0.3977, + "step": 4250 + }, + { + "epoch": 0.4252, + "grad_norm": 1.8933545351028442, + "learning_rate": 1.4219854366057831e-05, + "loss": 0.2783, + "step": 4252 + }, + { + "epoch": 0.4254, + "grad_norm": 1.86481511592865, + "learning_rate": 1.421352405800291e-05, + "loss": 0.1429, + "step": 4254 + }, + { + "epoch": 0.4256, + "grad_norm": 3.7003817558288574, + "learning_rate": 1.420719169632755e-05, + "loss": 0.3104, + "step": 4256 + }, + { + "epoch": 0.4258, + "grad_norm": 3.7516672611236572, + "learning_rate": 1.4200857284118067e-05, + "loss": 0.263, + "step": 4258 + }, + { + "epoch": 0.426, + "grad_norm": 3.487356662750244, + "learning_rate": 1.4194520824461773e-05, + "loss": 0.2935, + "step": 4260 + }, + { + "epoch": 0.4262, + "grad_norm": 1.4015233516693115, + "learning_rate": 1.4188182320446985e-05, + "loss": 0.089, + "step": 4262 + }, + { + "epoch": 0.4264, + "grad_norm": 5.351632118225098, + "learning_rate": 1.4181841775163014e-05, + "loss": 0.6129, + "step": 4264 + }, + { + "epoch": 0.4266, + "grad_norm": 2.0895864963531494, + "learning_rate": 1.4175499191700169e-05, + "loss": 0.1672, + "step": 4266 + }, + { + "epoch": 0.4268, + "grad_norm": 0.6308691501617432, + "learning_rate": 1.4169154573149737e-05, + "loss": 0.2103, + "step": 4268 + }, + { + "epoch": 0.427, + "grad_norm": 6.3098931312561035, + "learning_rate": 1.4162807922604014e-05, + "loss": 0.8448, + "step": 4270 + }, + { + "epoch": 0.4272, + "grad_norm": 0.8682838082313538, + "learning_rate": 1.415645924315628e-05, + "loss": 0.2584, + "step": 4272 + }, + { + "epoch": 0.4274, + "grad_norm": 0.5748447179794312, + "learning_rate": 1.4150108537900805e-05, + "loss": 0.0549, + "step": 4274 + }, + { + "epoch": 0.4276, + "grad_norm": 0.7322406768798828, + "learning_rate": 1.4143755809932843e-05, + "loss": 0.0724, + "step": 4276 + }, + { + "epoch": 0.4278, + "grad_norm": 0.9429101943969727, + "learning_rate": 1.4137401062348639e-05, + "loss": 0.1079, + "step": 4278 + }, + { + "epoch": 0.428, + "grad_norm": 4.213616371154785, + "learning_rate": 1.413104429824542e-05, + "loss": 0.2394, + "step": 4280 + }, + { + "epoch": 0.4282, + "grad_norm": 1.118756651878357, + "learning_rate": 1.4124685520721393e-05, + "loss": 0.1362, + "step": 4282 + }, + { + "epoch": 0.4284, + "grad_norm": 1.353657603263855, + "learning_rate": 1.411832473287575e-05, + "loss": 0.5419, + "step": 4284 + }, + { + "epoch": 0.4286, + "grad_norm": 11.473264694213867, + "learning_rate": 1.4111961937808665e-05, + "loss": 0.6813, + "step": 4286 + }, + { + "epoch": 0.4288, + "grad_norm": 1.0107816457748413, + "learning_rate": 1.4105597138621281e-05, + "loss": 0.0693, + "step": 4288 + }, + { + "epoch": 0.429, + "grad_norm": 1.0048671960830688, + "learning_rate": 1.4099230338415728e-05, + "loss": 0.5027, + "step": 4290 + }, + { + "epoch": 0.4292, + "grad_norm": 1.6013520956039429, + "learning_rate": 1.4092861540295109e-05, + "loss": 0.1948, + "step": 4292 + }, + { + "epoch": 0.4294, + "grad_norm": 3.0976076126098633, + "learning_rate": 1.4086490747363492e-05, + "loss": 0.308, + "step": 4294 + }, + { + "epoch": 0.4296, + "grad_norm": 2.074516534805298, + "learning_rate": 1.4080117962725929e-05, + "loss": 0.4809, + "step": 4296 + }, + { + "epoch": 0.4298, + "grad_norm": 4.255273342132568, + "learning_rate": 1.4073743189488436e-05, + "loss": 0.4263, + "step": 4298 + }, + { + "epoch": 0.43, + "grad_norm": 3.379931688308716, + "learning_rate": 1.4067366430758004e-05, + "loss": 0.4439, + "step": 4300 + }, + { + "epoch": 0.4302, + "grad_norm": 2.2280704975128174, + "learning_rate": 1.4060987689642581e-05, + "loss": 0.2899, + "step": 4302 + }, + { + "epoch": 0.4304, + "grad_norm": 0.8708080649375916, + "learning_rate": 1.4054606969251095e-05, + "loss": 0.1702, + "step": 4304 + }, + { + "epoch": 0.4306, + "grad_norm": 1.5616799592971802, + "learning_rate": 1.4048224272693426e-05, + "loss": 0.1668, + "step": 4306 + }, + { + "epoch": 0.4308, + "grad_norm": 0.6246001720428467, + "learning_rate": 1.4041839603080423e-05, + "loss": 0.2129, + "step": 4308 + }, + { + "epoch": 0.431, + "grad_norm": 2.34782075881958, + "learning_rate": 1.4035452963523903e-05, + "loss": 0.306, + "step": 4310 + }, + { + "epoch": 0.4312, + "grad_norm": 3.670454502105713, + "learning_rate": 1.4029064357136628e-05, + "loss": 0.1998, + "step": 4312 + }, + { + "epoch": 0.4314, + "grad_norm": 1.1732332706451416, + "learning_rate": 1.4022673787032333e-05, + "loss": 0.1998, + "step": 4314 + }, + { + "epoch": 0.4316, + "grad_norm": 1.231430172920227, + "learning_rate": 1.4016281256325702e-05, + "loss": 0.29, + "step": 4316 + }, + { + "epoch": 0.4318, + "grad_norm": 1.6575500965118408, + "learning_rate": 1.4009886768132375e-05, + "loss": 0.1086, + "step": 4318 + }, + { + "epoch": 0.432, + "grad_norm": 1.3014055490493774, + "learning_rate": 1.4003490325568953e-05, + "loss": 0.1284, + "step": 4320 + }, + { + "epoch": 0.4322, + "grad_norm": 3.9094247817993164, + "learning_rate": 1.3997091931752978e-05, + "loss": 0.2985, + "step": 4322 + }, + { + "epoch": 0.4324, + "grad_norm": 1.7926738262176514, + "learning_rate": 1.3990691589802955e-05, + "loss": 0.1984, + "step": 4324 + }, + { + "epoch": 0.4326, + "grad_norm": 4.71518087387085, + "learning_rate": 1.3984289302838327e-05, + "loss": 0.2352, + "step": 4326 + }, + { + "epoch": 0.4328, + "grad_norm": 2.0654563903808594, + "learning_rate": 1.39778850739795e-05, + "loss": 0.2763, + "step": 4328 + }, + { + "epoch": 0.433, + "grad_norm": 1.1821035146713257, + "learning_rate": 1.3971478906347806e-05, + "loss": 0.2379, + "step": 4330 + }, + { + "epoch": 0.4332, + "grad_norm": 3.9508731365203857, + "learning_rate": 1.3965070803065543e-05, + "loss": 0.3058, + "step": 4332 + }, + { + "epoch": 0.4334, + "grad_norm": 2.8939359188079834, + "learning_rate": 1.3958660767255938e-05, + "loss": 0.1403, + "step": 4334 + }, + { + "epoch": 0.4336, + "grad_norm": 3.3140013217926025, + "learning_rate": 1.3952248802043166e-05, + "loss": 0.396, + "step": 4336 + }, + { + "epoch": 0.4338, + "grad_norm": 0.5250727534294128, + "learning_rate": 1.394583491055234e-05, + "loss": 0.2509, + "step": 4338 + }, + { + "epoch": 0.434, + "grad_norm": 2.751217842102051, + "learning_rate": 1.3939419095909513e-05, + "loss": 0.11, + "step": 4340 + }, + { + "epoch": 0.4342, + "grad_norm": 17.103477478027344, + "learning_rate": 1.3933001361241674e-05, + "loss": 0.4216, + "step": 4342 + }, + { + "epoch": 0.4344, + "grad_norm": 2.4573686122894287, + "learning_rate": 1.3926581709676752e-05, + "loss": 0.1298, + "step": 4344 + }, + { + "epoch": 0.4346, + "grad_norm": 1.7624469995498657, + "learning_rate": 1.3920160144343604e-05, + "loss": 0.5927, + "step": 4346 + }, + { + "epoch": 0.4348, + "grad_norm": 0.6752290725708008, + "learning_rate": 1.3913736668372027e-05, + "loss": 0.0777, + "step": 4348 + }, + { + "epoch": 0.435, + "grad_norm": 0.29604077339172363, + "learning_rate": 1.3907311284892737e-05, + "loss": 0.3622, + "step": 4350 + }, + { + "epoch": 0.4352, + "grad_norm": 3.3036954402923584, + "learning_rate": 1.3900883997037398e-05, + "loss": 0.2823, + "step": 4352 + }, + { + "epoch": 0.4354, + "grad_norm": 0.9429232478141785, + "learning_rate": 1.3894454807938587e-05, + "loss": 0.1332, + "step": 4354 + }, + { + "epoch": 0.4356, + "grad_norm": 1.3312541246414185, + "learning_rate": 1.388802372072981e-05, + "loss": 0.1339, + "step": 4356 + }, + { + "epoch": 0.4358, + "grad_norm": 0.22178970277309418, + "learning_rate": 1.3881590738545508e-05, + "loss": 0.0866, + "step": 4358 + }, + { + "epoch": 0.436, + "grad_norm": 2.4089691638946533, + "learning_rate": 1.3875155864521031e-05, + "loss": 0.1949, + "step": 4360 + }, + { + "epoch": 0.4362, + "grad_norm": 0.2310796082019806, + "learning_rate": 1.3868719101792664e-05, + "loss": 0.0257, + "step": 4362 + }, + { + "epoch": 0.4364, + "grad_norm": 0.4447331428527832, + "learning_rate": 1.3862280453497601e-05, + "loss": 0.1239, + "step": 4364 + }, + { + "epoch": 0.4366, + "grad_norm": 3.2367382049560547, + "learning_rate": 1.3855839922773968e-05, + "loss": 0.5702, + "step": 4366 + }, + { + "epoch": 0.4368, + "grad_norm": 0.2621062994003296, + "learning_rate": 1.3849397512760797e-05, + "loss": 0.1701, + "step": 4368 + }, + { + "epoch": 0.437, + "grad_norm": 0.5088450312614441, + "learning_rate": 1.3842953226598036e-05, + "loss": 0.1748, + "step": 4370 + }, + { + "epoch": 0.4372, + "grad_norm": 2.5639703273773193, + "learning_rate": 1.3836507067426565e-05, + "loss": 0.5089, + "step": 4372 + }, + { + "epoch": 0.4374, + "grad_norm": 4.540256977081299, + "learning_rate": 1.3830059038388153e-05, + "loss": 0.3391, + "step": 4374 + }, + { + "epoch": 0.4376, + "grad_norm": 1.2504281997680664, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.1764, + "step": 4376 + }, + { + "epoch": 0.4378, + "grad_norm": 2.077064275741577, + "learning_rate": 1.3817157383282184e-05, + "loss": 0.2641, + "step": 4378 + }, + { + "epoch": 0.438, + "grad_norm": 2.877821922302246, + "learning_rate": 1.3810703763502744e-05, + "loss": 0.2643, + "step": 4380 + }, + { + "epoch": 0.4382, + "grad_norm": 2.6588351726531982, + "learning_rate": 1.3804248286432577e-05, + "loss": 0.3157, + "step": 4382 + }, + { + "epoch": 0.4384, + "grad_norm": 2.2303574085235596, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.1827, + "step": 4384 + }, + { + "epoch": 0.4386, + "grad_norm": 1.0769734382629395, + "learning_rate": 1.3791331773006272e-05, + "loss": 0.0583, + "step": 4386 + }, + { + "epoch": 0.4388, + "grad_norm": 4.092939853668213, + "learning_rate": 1.3784870742945482e-05, + "loss": 0.342, + "step": 4388 + }, + { + "epoch": 0.439, + "grad_norm": 13.305526733398438, + "learning_rate": 1.3778407868184674e-05, + "loss": 0.4739, + "step": 4390 + }, + { + "epoch": 0.4392, + "grad_norm": 4.96666955947876, + "learning_rate": 1.3771943151873768e-05, + "loss": 0.148, + "step": 4392 + }, + { + "epoch": 0.4394, + "grad_norm": 1.0552475452423096, + "learning_rate": 1.3765476597163595e-05, + "loss": 0.0892, + "step": 4394 + }, + { + "epoch": 0.4396, + "grad_norm": 4.445632457733154, + "learning_rate": 1.3759008207205869e-05, + "loss": 0.168, + "step": 4396 + }, + { + "epoch": 0.4398, + "grad_norm": 2.272061347961426, + "learning_rate": 1.375253798515321e-05, + "loss": 0.2574, + "step": 4398 + }, + { + "epoch": 0.44, + "grad_norm": 4.156055450439453, + "learning_rate": 1.3746065934159123e-05, + "loss": 0.3417, + "step": 4400 + }, + { + "epoch": 0.4402, + "grad_norm": 4.669026851654053, + "learning_rate": 1.3739592057378005e-05, + "loss": 0.384, + "step": 4402 + }, + { + "epoch": 0.4404, + "grad_norm": 0.7264893054962158, + "learning_rate": 1.373311635796515e-05, + "loss": 0.0634, + "step": 4404 + }, + { + "epoch": 0.4406, + "grad_norm": 2.1156728267669678, + "learning_rate": 1.3726638839076732e-05, + "loss": 0.0626, + "step": 4406 + }, + { + "epoch": 0.4408, + "grad_norm": 0.8322746157646179, + "learning_rate": 1.3720159503869816e-05, + "loss": 0.1353, + "step": 4408 + }, + { + "epoch": 0.441, + "grad_norm": 1.7972571849822998, + "learning_rate": 1.371367835550235e-05, + "loss": 0.2104, + "step": 4410 + }, + { + "epoch": 0.4412, + "grad_norm": 0.7260698080062866, + "learning_rate": 1.3707195397133165e-05, + "loss": 0.083, + "step": 4412 + }, + { + "epoch": 0.4414, + "grad_norm": 3.586761474609375, + "learning_rate": 1.3700710631921984e-05, + "loss": 0.3492, + "step": 4414 + }, + { + "epoch": 0.4416, + "grad_norm": 0.515049934387207, + "learning_rate": 1.3694224063029396e-05, + "loss": 0.0287, + "step": 4416 + }, + { + "epoch": 0.4418, + "grad_norm": 5.745823383331299, + "learning_rate": 1.3687735693616876e-05, + "loss": 0.3418, + "step": 4418 + }, + { + "epoch": 0.442, + "grad_norm": 4.4258527755737305, + "learning_rate": 1.3681245526846782e-05, + "loss": 0.3725, + "step": 4420 + }, + { + "epoch": 0.4422, + "grad_norm": 5.310042858123779, + "learning_rate": 1.3674753565882336e-05, + "loss": 0.1922, + "step": 4422 + }, + { + "epoch": 0.4424, + "grad_norm": 3.406984329223633, + "learning_rate": 1.3668259813887644e-05, + "loss": 0.2159, + "step": 4424 + }, + { + "epoch": 0.4426, + "grad_norm": 1.211945652961731, + "learning_rate": 1.3661764274027678e-05, + "loss": 0.1426, + "step": 4426 + }, + { + "epoch": 0.4428, + "grad_norm": 15.098991394042969, + "learning_rate": 1.365526694946829e-05, + "loss": 0.8702, + "step": 4428 + }, + { + "epoch": 0.443, + "grad_norm": 0.11946279555559158, + "learning_rate": 1.3648767843376196e-05, + "loss": 0.0219, + "step": 4430 + }, + { + "epoch": 0.4432, + "grad_norm": 0.8467361330986023, + "learning_rate": 1.3642266958918985e-05, + "loss": 0.1151, + "step": 4432 + }, + { + "epoch": 0.4434, + "grad_norm": 2.349670171737671, + "learning_rate": 1.36357642992651e-05, + "loss": 0.0579, + "step": 4434 + }, + { + "epoch": 0.4436, + "grad_norm": 4.7028021812438965, + "learning_rate": 1.3629259867583864e-05, + "loss": 0.195, + "step": 4436 + }, + { + "epoch": 0.4438, + "grad_norm": 0.13971567153930664, + "learning_rate": 1.3622753667045459e-05, + "loss": 0.7082, + "step": 4438 + }, + { + "epoch": 0.444, + "grad_norm": 5.856100559234619, + "learning_rate": 1.3616245700820922e-05, + "loss": 0.5195, + "step": 4440 + }, + { + "epoch": 0.4442, + "grad_norm": 0.21128733456134796, + "learning_rate": 1.3609735972082168e-05, + "loss": 0.0187, + "step": 4442 + }, + { + "epoch": 0.4444, + "grad_norm": 1.6402634382247925, + "learning_rate": 1.3603224484001949e-05, + "loss": 0.2706, + "step": 4444 + }, + { + "epoch": 0.4446, + "grad_norm": 0.3805604875087738, + "learning_rate": 1.3596711239753889e-05, + "loss": 0.0265, + "step": 4446 + }, + { + "epoch": 0.4448, + "grad_norm": 3.242457866668701, + "learning_rate": 1.3590196242512463e-05, + "loss": 0.8415, + "step": 4448 + }, + { + "epoch": 0.445, + "grad_norm": 0.1798551231622696, + "learning_rate": 1.3583679495453e-05, + "loss": 0.0174, + "step": 4450 + }, + { + "epoch": 0.4452, + "grad_norm": 0.7201522588729858, + "learning_rate": 1.3577161001751696e-05, + "loss": 0.0379, + "step": 4452 + }, + { + "epoch": 0.4454, + "grad_norm": 1.3044639825820923, + "learning_rate": 1.3570640764585567e-05, + "loss": 0.0479, + "step": 4454 + }, + { + "epoch": 0.4456, + "grad_norm": 1.681814432144165, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.1764, + "step": 4456 + }, + { + "epoch": 0.4458, + "grad_norm": 10.636478424072266, + "learning_rate": 1.355759507257125e-05, + "loss": 0.4167, + "step": 4458 + }, + { + "epoch": 0.446, + "grad_norm": 1.5288593769073486, + "learning_rate": 1.3551069624081372e-05, + "loss": 0.2781, + "step": 4460 + }, + { + "epoch": 0.4462, + "grad_norm": 12.233014106750488, + "learning_rate": 1.3544542444843298e-05, + "loss": 0.3005, + "step": 4462 + }, + { + "epoch": 0.4464, + "grad_norm": 0.7780667543411255, + "learning_rate": 1.3538013538038295e-05, + "loss": 0.1043, + "step": 4464 + }, + { + "epoch": 0.4466, + "grad_norm": 5.992543697357178, + "learning_rate": 1.3531482906848474e-05, + "loss": 0.9806, + "step": 4466 + }, + { + "epoch": 0.4468, + "grad_norm": 0.5585418939590454, + "learning_rate": 1.3524950554456786e-05, + "loss": 0.0482, + "step": 4468 + }, + { + "epoch": 0.447, + "grad_norm": 0.19406765699386597, + "learning_rate": 1.3518416484047018e-05, + "loss": 0.0288, + "step": 4470 + }, + { + "epoch": 0.4472, + "grad_norm": 0.6975449323654175, + "learning_rate": 1.3511880698803801e-05, + "loss": 0.052, + "step": 4472 + }, + { + "epoch": 0.4474, + "grad_norm": 3.8871660232543945, + "learning_rate": 1.350534320191259e-05, + "loss": 0.4922, + "step": 4474 + }, + { + "epoch": 0.4476, + "grad_norm": 7.435347080230713, + "learning_rate": 1.349880399655969e-05, + "loss": 0.4402, + "step": 4476 + }, + { + "epoch": 0.4478, + "grad_norm": 1.4978870153427124, + "learning_rate": 1.3492263085932224e-05, + "loss": 0.0975, + "step": 4478 + }, + { + "epoch": 0.448, + "grad_norm": 3.503523111343384, + "learning_rate": 1.3485720473218153e-05, + "loss": 0.1164, + "step": 4480 + }, + { + "epoch": 0.4482, + "grad_norm": 1.3144464492797852, + "learning_rate": 1.3479176161606269e-05, + "loss": 0.0814, + "step": 4482 + }, + { + "epoch": 0.4484, + "grad_norm": 8.831536293029785, + "learning_rate": 1.347263015428619e-05, + "loss": 0.4117, + "step": 4484 + }, + { + "epoch": 0.4486, + "grad_norm": 0.7191663980484009, + "learning_rate": 1.3466082454448364e-05, + "loss": 0.0615, + "step": 4486 + }, + { + "epoch": 0.4488, + "grad_norm": 0.8180520534515381, + "learning_rate": 1.3459533065284049e-05, + "loss": 0.1596, + "step": 4488 + }, + { + "epoch": 0.449, + "grad_norm": 1.0952298641204834, + "learning_rate": 1.3452981989985347e-05, + "loss": 0.1145, + "step": 4490 + }, + { + "epoch": 0.4492, + "grad_norm": 5.344959259033203, + "learning_rate": 1.344642923174517e-05, + "loss": 0.4885, + "step": 4492 + }, + { + "epoch": 0.4494, + "grad_norm": 0.7012404203414917, + "learning_rate": 1.3439874793757255e-05, + "loss": 0.0374, + "step": 4494 + }, + { + "epoch": 0.4496, + "grad_norm": 1.7969279289245605, + "learning_rate": 1.3433318679216154e-05, + "loss": 0.0969, + "step": 4496 + }, + { + "epoch": 0.4498, + "grad_norm": 3.4431591033935547, + "learning_rate": 1.3426760891317236e-05, + "loss": 0.1684, + "step": 4498 + }, + { + "epoch": 0.45, + "grad_norm": 0.4079241454601288, + "learning_rate": 1.342020143325669e-05, + "loss": 0.1238, + "step": 4500 + }, + { + "epoch": 0.4502, + "grad_norm": 7.1976494789123535, + "learning_rate": 1.3413640308231511e-05, + "loss": 0.8776, + "step": 4502 + }, + { + "epoch": 0.4504, + "grad_norm": 5.702585697174072, + "learning_rate": 1.340707751943952e-05, + "loss": 0.7548, + "step": 4504 + }, + { + "epoch": 0.4506, + "grad_norm": 0.8267924189567566, + "learning_rate": 1.340051307007933e-05, + "loss": 0.0613, + "step": 4506 + }, + { + "epoch": 0.4508, + "grad_norm": 3.3813209533691406, + "learning_rate": 1.3393946963350381e-05, + "loss": 0.2019, + "step": 4508 + }, + { + "epoch": 0.451, + "grad_norm": 3.3949952125549316, + "learning_rate": 1.3387379202452917e-05, + "loss": 0.2614, + "step": 4510 + }, + { + "epoch": 0.4512, + "grad_norm": 3.781489133834839, + "learning_rate": 1.3380809790587975e-05, + "loss": 0.1569, + "step": 4512 + }, + { + "epoch": 0.4514, + "grad_norm": 6.145360946655273, + "learning_rate": 1.3374238730957414e-05, + "loss": 0.3989, + "step": 4514 + }, + { + "epoch": 0.4516, + "grad_norm": 1.7985860109329224, + "learning_rate": 1.3367666026763884e-05, + "loss": 0.0914, + "step": 4516 + }, + { + "epoch": 0.4518, + "grad_norm": 0.6553221940994263, + "learning_rate": 1.3361091681210846e-05, + "loss": 0.2104, + "step": 4518 + }, + { + "epoch": 0.452, + "grad_norm": 2.0449023246765137, + "learning_rate": 1.3354515697502552e-05, + "loss": 0.3615, + "step": 4520 + }, + { + "epoch": 0.4522, + "grad_norm": 5.652247905731201, + "learning_rate": 1.3347938078844058e-05, + "loss": 0.3575, + "step": 4522 + }, + { + "epoch": 0.4524, + "grad_norm": 0.7181721925735474, + "learning_rate": 1.3341358828441217e-05, + "loss": 0.2058, + "step": 4524 + }, + { + "epoch": 0.4526, + "grad_norm": 2.09092378616333, + "learning_rate": 1.3334777949500673e-05, + "loss": 0.3929, + "step": 4526 + }, + { + "epoch": 0.4528, + "grad_norm": 2.021484136581421, + "learning_rate": 1.3328195445229869e-05, + "loss": 0.1164, + "step": 4528 + }, + { + "epoch": 0.453, + "grad_norm": 9.18801498413086, + "learning_rate": 1.3321611318837033e-05, + "loss": 0.3833, + "step": 4530 + }, + { + "epoch": 0.4532, + "grad_norm": 1.6194324493408203, + "learning_rate": 1.3315025573531198e-05, + "loss": 0.3235, + "step": 4532 + }, + { + "epoch": 0.4534, + "grad_norm": 1.9135050773620605, + "learning_rate": 1.3308438212522164e-05, + "loss": 0.1294, + "step": 4534 + }, + { + "epoch": 0.4536, + "grad_norm": 0.5339848399162292, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.335, + "step": 4536 + }, + { + "epoch": 0.4538, + "grad_norm": 7.554966449737549, + "learning_rate": 1.3295258656237703e-05, + "loss": 0.4233, + "step": 4538 + }, + { + "epoch": 0.454, + "grad_norm": 0.40024474263191223, + "learning_rate": 1.3288666467385834e-05, + "loss": 0.0428, + "step": 4540 + }, + { + "epoch": 0.4542, + "grad_norm": 3.326446771621704, + "learning_rate": 1.328207267567788e-05, + "loss": 0.1716, + "step": 4542 + }, + { + "epoch": 0.4544, + "grad_norm": 4.032560348510742, + "learning_rate": 1.327547728432757e-05, + "loss": 0.3212, + "step": 4544 + }, + { + "epoch": 0.4546, + "grad_norm": 3.3528456687927246, + "learning_rate": 1.3268880296549424e-05, + "loss": 0.1767, + "step": 4546 + }, + { + "epoch": 0.4548, + "grad_norm": 3.565134048461914, + "learning_rate": 1.3262281715558736e-05, + "loss": 0.4088, + "step": 4548 + }, + { + "epoch": 0.455, + "grad_norm": 1.5116472244262695, + "learning_rate": 1.3255681544571568e-05, + "loss": 0.3278, + "step": 4550 + }, + { + "epoch": 0.4552, + "grad_norm": 1.2029272317886353, + "learning_rate": 1.3249079786804765e-05, + "loss": 0.0831, + "step": 4552 + }, + { + "epoch": 0.4554, + "grad_norm": 0.9115913510322571, + "learning_rate": 1.3242476445475945e-05, + "loss": 0.0856, + "step": 4554 + }, + { + "epoch": 0.4556, + "grad_norm": 2.716320514678955, + "learning_rate": 1.3235871523803496e-05, + "loss": 0.3041, + "step": 4556 + }, + { + "epoch": 0.4558, + "grad_norm": 1.10104238986969, + "learning_rate": 1.3229265025006577e-05, + "loss": 0.2896, + "step": 4558 + }, + { + "epoch": 0.456, + "grad_norm": 4.140308380126953, + "learning_rate": 1.3222656952305113e-05, + "loss": 0.3321, + "step": 4560 + }, + { + "epoch": 0.4562, + "grad_norm": 9.226702690124512, + "learning_rate": 1.32160473089198e-05, + "loss": 0.4642, + "step": 4562 + }, + { + "epoch": 0.4564, + "grad_norm": 3.8276524543762207, + "learning_rate": 1.3209436098072095e-05, + "loss": 0.4088, + "step": 4564 + }, + { + "epoch": 0.4566, + "grad_norm": 2.3744852542877197, + "learning_rate": 1.3202823322984228e-05, + "loss": 0.1129, + "step": 4566 + }, + { + "epoch": 0.4568, + "grad_norm": 2.9120137691497803, + "learning_rate": 1.319620898687918e-05, + "loss": 0.3815, + "step": 4568 + }, + { + "epoch": 0.457, + "grad_norm": 0.21498461067676544, + "learning_rate": 1.3189593092980701e-05, + "loss": 0.0963, + "step": 4570 + }, + { + "epoch": 0.4572, + "grad_norm": 1.476844072341919, + "learning_rate": 1.3182975644513296e-05, + "loss": 0.1884, + "step": 4572 + }, + { + "epoch": 0.4574, + "grad_norm": 3.6069154739379883, + "learning_rate": 1.3176356644702225e-05, + "loss": 0.3639, + "step": 4574 + }, + { + "epoch": 0.4576, + "grad_norm": 1.5537594556808472, + "learning_rate": 1.316973609677352e-05, + "loss": 0.4397, + "step": 4576 + }, + { + "epoch": 0.4578, + "grad_norm": 0.195475772023201, + "learning_rate": 1.316311400395394e-05, + "loss": 0.1533, + "step": 4578 + }, + { + "epoch": 0.458, + "grad_norm": 19.879558563232422, + "learning_rate": 1.3156490369471026e-05, + "loss": 0.4033, + "step": 4580 + }, + { + "epoch": 0.4582, + "grad_norm": 6.942025184631348, + "learning_rate": 1.3149865196553049e-05, + "loss": 0.2903, + "step": 4582 + }, + { + "epoch": 0.4584, + "grad_norm": 6.008035659790039, + "learning_rate": 1.3143238488429042e-05, + "loss": 0.6943, + "step": 4584 + }, + { + "epoch": 0.4586, + "grad_norm": 5.712240695953369, + "learning_rate": 1.3136610248328779e-05, + "loss": 0.7642, + "step": 4586 + }, + { + "epoch": 0.4588, + "grad_norm": 8.281588554382324, + "learning_rate": 1.3129980479482783e-05, + "loss": 0.7582, + "step": 4588 + }, + { + "epoch": 0.459, + "grad_norm": 2.6330337524414062, + "learning_rate": 1.3123349185122328e-05, + "loss": 0.3013, + "step": 4590 + }, + { + "epoch": 0.4592, + "grad_norm": 2.2823619842529297, + "learning_rate": 1.3116716368479418e-05, + "loss": 0.1497, + "step": 4592 + }, + { + "epoch": 0.4594, + "grad_norm": 13.647124290466309, + "learning_rate": 1.311008203278682e-05, + "loss": 0.2903, + "step": 4594 + }, + { + "epoch": 0.4596, + "grad_norm": 1.137437105178833, + "learning_rate": 1.3103446181278015e-05, + "loss": 0.0794, + "step": 4596 + }, + { + "epoch": 0.4598, + "grad_norm": 3.2951269149780273, + "learning_rate": 1.3096808817187243e-05, + "loss": 0.5453, + "step": 4598 + }, + { + "epoch": 0.46, + "grad_norm": 2.1046576499938965, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.2424, + "step": 4600 + }, + { + "epoch": 0.4602, + "grad_norm": 0.30875059962272644, + "learning_rate": 1.3083529564200417e-05, + "loss": 0.1424, + "step": 4602 + }, + { + "epoch": 0.4604, + "grad_norm": 2.4026083946228027, + "learning_rate": 1.3076887681776509e-05, + "loss": 0.4591, + "step": 4604 + }, + { + "epoch": 0.4606, + "grad_norm": 2.2609951496124268, + "learning_rate": 1.307024429971492e-05, + "loss": 0.4516, + "step": 4606 + }, + { + "epoch": 0.4608, + "grad_norm": 2.571723699569702, + "learning_rate": 1.306359942125356e-05, + "loss": 0.4602, + "step": 4608 + }, + { + "epoch": 0.461, + "grad_norm": 0.9200714826583862, + "learning_rate": 1.3056953049631059e-05, + "loss": 0.1334, + "step": 4610 + }, + { + "epoch": 0.4612, + "grad_norm": 5.180070400238037, + "learning_rate": 1.3050305188086778e-05, + "loss": 0.271, + "step": 4612 + }, + { + "epoch": 0.4614, + "grad_norm": 1.4828917980194092, + "learning_rate": 1.3043655839860803e-05, + "loss": 0.4984, + "step": 4614 + }, + { + "epoch": 0.4616, + "grad_norm": 3.8686635494232178, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.1791, + "step": 4616 + }, + { + "epoch": 0.4618, + "grad_norm": 7.988800048828125, + "learning_rate": 1.3030352696327741e-05, + "loss": 0.4604, + "step": 4618 + }, + { + "epoch": 0.462, + "grad_norm": 1.8882746696472168, + "learning_rate": 1.3023698907504447e-05, + "loss": 0.2785, + "step": 4620 + }, + { + "epoch": 0.4622, + "grad_norm": 2.2560644149780273, + "learning_rate": 1.3017043644967036e-05, + "loss": 0.2397, + "step": 4622 + }, + { + "epoch": 0.4624, + "grad_norm": 2.9536828994750977, + "learning_rate": 1.3010386911959207e-05, + "loss": 0.0889, + "step": 4624 + }, + { + "epoch": 0.4626, + "grad_norm": 1.4018107652664185, + "learning_rate": 1.3003728711725364e-05, + "loss": 0.2709, + "step": 4626 + }, + { + "epoch": 0.4628, + "grad_norm": 1.1318714618682861, + "learning_rate": 1.299706904751064e-05, + "loss": 0.124, + "step": 4628 + }, + { + "epoch": 0.463, + "grad_norm": 2.40834903717041, + "learning_rate": 1.2990407922560869e-05, + "loss": 0.2764, + "step": 4630 + }, + { + "epoch": 0.4632, + "grad_norm": 2.9622387886047363, + "learning_rate": 1.2983745340122604e-05, + "loss": 0.497, + "step": 4632 + }, + { + "epoch": 0.4634, + "grad_norm": 2.1214396953582764, + "learning_rate": 1.2977081303443107e-05, + "loss": 0.2936, + "step": 4634 + }, + { + "epoch": 0.4636, + "grad_norm": 3.405961751937866, + "learning_rate": 1.297041581577035e-05, + "loss": 0.1272, + "step": 4636 + }, + { + "epoch": 0.4638, + "grad_norm": 7.951139450073242, + "learning_rate": 1.2963748880353011e-05, + "loss": 0.3942, + "step": 4638 + }, + { + "epoch": 0.464, + "grad_norm": 0.768947422504425, + "learning_rate": 1.2957080500440469e-05, + "loss": 0.1542, + "step": 4640 + }, + { + "epoch": 0.4642, + "grad_norm": 3.987706184387207, + "learning_rate": 1.2950410679282815e-05, + "loss": 0.3095, + "step": 4642 + }, + { + "epoch": 0.4644, + "grad_norm": 1.3505041599273682, + "learning_rate": 1.2943739420130837e-05, + "loss": 0.167, + "step": 4644 + }, + { + "epoch": 0.4646, + "grad_norm": 5.908804893493652, + "learning_rate": 1.2937066726236029e-05, + "loss": 0.6514, + "step": 4646 + }, + { + "epoch": 0.4648, + "grad_norm": 0.9972499012947083, + "learning_rate": 1.2930392600850574e-05, + "loss": 0.1135, + "step": 4648 + }, + { + "epoch": 0.465, + "grad_norm": 0.5467854142189026, + "learning_rate": 1.2923717047227368e-05, + "loss": 0.0996, + "step": 4650 + }, + { + "epoch": 0.4652, + "grad_norm": 0.36453452706336975, + "learning_rate": 1.291704006861999e-05, + "loss": 0.0523, + "step": 4652 + }, + { + "epoch": 0.4654, + "grad_norm": 5.138363838195801, + "learning_rate": 1.2910361668282718e-05, + "loss": 0.3058, + "step": 4654 + }, + { + "epoch": 0.4656, + "grad_norm": 8.793717384338379, + "learning_rate": 1.2903681849470528e-05, + "loss": 0.3927, + "step": 4656 + }, + { + "epoch": 0.4658, + "grad_norm": 0.568292498588562, + "learning_rate": 1.2897000615439075e-05, + "loss": 0.0487, + "step": 4658 + }, + { + "epoch": 0.466, + "grad_norm": 6.576045036315918, + "learning_rate": 1.2890317969444716e-05, + "loss": 0.1911, + "step": 4660 + }, + { + "epoch": 0.4662, + "grad_norm": 1.279620885848999, + "learning_rate": 1.2883633914744493e-05, + "loss": 0.13, + "step": 4662 + }, + { + "epoch": 0.4664, + "grad_norm": 7.587808609008789, + "learning_rate": 1.287694845459613e-05, + "loss": 0.2992, + "step": 4664 + }, + { + "epoch": 0.4666, + "grad_norm": 0.5267515778541565, + "learning_rate": 1.2870261592258038e-05, + "loss": 0.5826, + "step": 4666 + }, + { + "epoch": 0.4668, + "grad_norm": 3.7106411457061768, + "learning_rate": 1.2863573330989315e-05, + "loss": 0.1517, + "step": 4668 + }, + { + "epoch": 0.467, + "grad_norm": 0.8642452955245972, + "learning_rate": 1.2856883674049736e-05, + "loss": 0.3979, + "step": 4670 + }, + { + "epoch": 0.4672, + "grad_norm": 1.4815387725830078, + "learning_rate": 1.2850192624699762e-05, + "loss": 0.1488, + "step": 4672 + }, + { + "epoch": 0.4674, + "grad_norm": 3.971844434738159, + "learning_rate": 1.2843500186200529e-05, + "loss": 0.437, + "step": 4674 + }, + { + "epoch": 0.4676, + "grad_norm": 1.3343275785446167, + "learning_rate": 1.2836806361813846e-05, + "loss": 0.6733, + "step": 4676 + }, + { + "epoch": 0.4678, + "grad_norm": 0.30849894881248474, + "learning_rate": 1.2830111154802203e-05, + "loss": 0.1154, + "step": 4678 + }, + { + "epoch": 0.468, + "grad_norm": 5.704225063323975, + "learning_rate": 1.2823414568428767e-05, + "loss": 0.0947, + "step": 4680 + }, + { + "epoch": 0.4682, + "grad_norm": 0.44716399908065796, + "learning_rate": 1.2816716605957366e-05, + "loss": 0.0796, + "step": 4682 + }, + { + "epoch": 0.4684, + "grad_norm": 2.058727502822876, + "learning_rate": 1.2810017270652513e-05, + "loss": 0.2565, + "step": 4684 + }, + { + "epoch": 0.4686, + "grad_norm": 0.17953366041183472, + "learning_rate": 1.2803316565779378e-05, + "loss": 0.0746, + "step": 4686 + }, + { + "epoch": 0.4688, + "grad_norm": 6.307948589324951, + "learning_rate": 1.27966144946038e-05, + "loss": 0.4968, + "step": 4688 + }, + { + "epoch": 0.469, + "grad_norm": 1.7330063581466675, + "learning_rate": 1.2789911060392295e-05, + "loss": 0.1734, + "step": 4690 + }, + { + "epoch": 0.4692, + "grad_norm": 0.7902920842170715, + "learning_rate": 1.278320626641203e-05, + "loss": 0.0994, + "step": 4692 + }, + { + "epoch": 0.4694, + "grad_norm": 3.979994535446167, + "learning_rate": 1.2776500115930842e-05, + "loss": 0.4838, + "step": 4694 + }, + { + "epoch": 0.4696, + "grad_norm": 1.4033863544464111, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.1499, + "step": 4696 + }, + { + "epoch": 0.4698, + "grad_norm": 8.530102729797363, + "learning_rate": 1.2763083758540337e-05, + "loss": 0.5596, + "step": 4698 + }, + { + "epoch": 0.47, + "grad_norm": 0.12680093944072723, + "learning_rate": 1.2756373558169992e-05, + "loss": 0.2668, + "step": 4700 + }, + { + "epoch": 0.4702, + "grad_norm": 4.972046375274658, + "learning_rate": 1.2749662014376662e-05, + "loss": 0.2761, + "step": 4702 + }, + { + "epoch": 0.4704, + "grad_norm": 1.2459627389907837, + "learning_rate": 1.2742949130431468e-05, + "loss": 0.2717, + "step": 4704 + }, + { + "epoch": 0.4706, + "grad_norm": 0.7692914009094238, + "learning_rate": 1.2736234909606186e-05, + "loss": 0.2248, + "step": 4706 + }, + { + "epoch": 0.4708, + "grad_norm": 4.040436267852783, + "learning_rate": 1.2729519355173254e-05, + "loss": 0.3096, + "step": 4708 + }, + { + "epoch": 0.471, + "grad_norm": 0.7477133274078369, + "learning_rate": 1.2722802470405744e-05, + "loss": 0.1542, + "step": 4710 + }, + { + "epoch": 0.4712, + "grad_norm": 0.8529167771339417, + "learning_rate": 1.2716084258577388e-05, + "loss": 0.1952, + "step": 4712 + }, + { + "epoch": 0.4714, + "grad_norm": 1.7446985244750977, + "learning_rate": 1.270936472296256e-05, + "loss": 0.2879, + "step": 4714 + }, + { + "epoch": 0.4716, + "grad_norm": 4.860806941986084, + "learning_rate": 1.270264386683628e-05, + "loss": 0.1701, + "step": 4716 + }, + { + "epoch": 0.4718, + "grad_norm": 3.566110134124756, + "learning_rate": 1.2695921693474211e-05, + "loss": 0.2043, + "step": 4718 + }, + { + "epoch": 0.472, + "grad_norm": 0.756203293800354, + "learning_rate": 1.2689198206152657e-05, + "loss": 0.0727, + "step": 4720 + }, + { + "epoch": 0.4722, + "grad_norm": 3.5518932342529297, + "learning_rate": 1.268247340814857e-05, + "loss": 0.2896, + "step": 4722 + }, + { + "epoch": 0.4724, + "grad_norm": 4.219339847564697, + "learning_rate": 1.2675747302739528e-05, + "loss": 0.356, + "step": 4724 + }, + { + "epoch": 0.4726, + "grad_norm": 3.6728062629699707, + "learning_rate": 1.2669019893203758e-05, + "loss": 0.1228, + "step": 4726 + }, + { + "epoch": 0.4728, + "grad_norm": 1.1308871507644653, + "learning_rate": 1.2662291182820115e-05, + "loss": 0.066, + "step": 4728 + }, + { + "epoch": 0.473, + "grad_norm": 0.743052065372467, + "learning_rate": 1.265556117486809e-05, + "loss": 0.0474, + "step": 4730 + }, + { + "epoch": 0.4732, + "grad_norm": 0.6818839311599731, + "learning_rate": 1.2648829872627809e-05, + "loss": 0.0794, + "step": 4732 + }, + { + "epoch": 0.4734, + "grad_norm": 0.8317766785621643, + "learning_rate": 1.2642097279380025e-05, + "loss": 0.1244, + "step": 4734 + }, + { + "epoch": 0.4736, + "grad_norm": 8.465240478515625, + "learning_rate": 1.263536339840613e-05, + "loss": 0.4768, + "step": 4736 + }, + { + "epoch": 0.4738, + "grad_norm": 5.4631500244140625, + "learning_rate": 1.2628628232988123e-05, + "loss": 0.3418, + "step": 4738 + }, + { + "epoch": 0.474, + "grad_norm": 0.4513447880744934, + "learning_rate": 1.2621891786408648e-05, + "loss": 0.0826, + "step": 4740 + }, + { + "epoch": 0.4742, + "grad_norm": 6.356937885284424, + "learning_rate": 1.261515406195097e-05, + "loss": 0.1114, + "step": 4742 + }, + { + "epoch": 0.4744, + "grad_norm": 4.374515056610107, + "learning_rate": 1.2608415062898971e-05, + "loss": 0.4186, + "step": 4744 + }, + { + "epoch": 0.4746, + "grad_norm": 0.6138001680374146, + "learning_rate": 1.2601674792537157e-05, + "loss": 0.0385, + "step": 4746 + }, + { + "epoch": 0.4748, + "grad_norm": 2.5472042560577393, + "learning_rate": 1.2594933254150654e-05, + "loss": 0.2535, + "step": 4748 + }, + { + "epoch": 0.475, + "grad_norm": 0.45860275626182556, + "learning_rate": 1.2588190451025209e-05, + "loss": 0.0479, + "step": 4750 + }, + { + "epoch": 0.4752, + "grad_norm": 2.6113266944885254, + "learning_rate": 1.2581446386447178e-05, + "loss": 0.2669, + "step": 4752 + }, + { + "epoch": 0.4754, + "grad_norm": 0.6229885220527649, + "learning_rate": 1.257470106370354e-05, + "loss": 0.0634, + "step": 4754 + }, + { + "epoch": 0.4756, + "grad_norm": 1.8717930316925049, + "learning_rate": 1.256795448608188e-05, + "loss": 0.1955, + "step": 4756 + }, + { + "epoch": 0.4758, + "grad_norm": 4.671133041381836, + "learning_rate": 1.2561206656870397e-05, + "loss": 0.226, + "step": 4758 + }, + { + "epoch": 0.476, + "grad_norm": 1.8825098276138306, + "learning_rate": 1.2554457579357906e-05, + "loss": 0.098, + "step": 4760 + }, + { + "epoch": 0.4762, + "grad_norm": 1.5538884401321411, + "learning_rate": 1.2547707256833823e-05, + "loss": 0.0928, + "step": 4762 + }, + { + "epoch": 0.4764, + "grad_norm": 10.048849105834961, + "learning_rate": 1.2540955692588173e-05, + "loss": 0.3526, + "step": 4764 + }, + { + "epoch": 0.4766, + "grad_norm": 3.797873020172119, + "learning_rate": 1.2534202889911584e-05, + "loss": 0.3803, + "step": 4766 + }, + { + "epoch": 0.4768, + "grad_norm": 2.88969349861145, + "learning_rate": 1.2527448852095295e-05, + "loss": 0.2962, + "step": 4768 + }, + { + "epoch": 0.477, + "grad_norm": 22.298255920410156, + "learning_rate": 1.252069358243114e-05, + "loss": 0.433, + "step": 4770 + }, + { + "epoch": 0.4772, + "grad_norm": 2.18404483795166, + "learning_rate": 1.251393708421155e-05, + "loss": 0.0932, + "step": 4772 + }, + { + "epoch": 0.4774, + "grad_norm": 13.801117897033691, + "learning_rate": 1.2507179360729569e-05, + "loss": 1.7289, + "step": 4774 + }, + { + "epoch": 0.4776, + "grad_norm": 2.155973196029663, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.6096, + "step": 4776 + }, + { + "epoch": 0.4778, + "grad_norm": 1.8601332902908325, + "learning_rate": 1.249366025115354e-05, + "loss": 0.2264, + "step": 4778 + }, + { + "epoch": 0.478, + "grad_norm": 0.8336498737335205, + "learning_rate": 1.2486898871648552e-05, + "loss": 0.1144, + "step": 4780 + }, + { + "epoch": 0.4782, + "grad_norm": 1.2063336372375488, + "learning_rate": 1.2480136280059256e-05, + "loss": 0.1902, + "step": 4782 + }, + { + "epoch": 0.4784, + "grad_norm": 1.0924508571624756, + "learning_rate": 1.2473372479681671e-05, + "loss": 0.2104, + "step": 4784 + }, + { + "epoch": 0.4786, + "grad_norm": 0.3655213713645935, + "learning_rate": 1.2466607473812386e-05, + "loss": 0.4938, + "step": 4786 + }, + { + "epoch": 0.4788, + "grad_norm": 1.13620126247406, + "learning_rate": 1.2459841265748582e-05, + "loss": 0.3745, + "step": 4788 + }, + { + "epoch": 0.479, + "grad_norm": 4.346567153930664, + "learning_rate": 1.2453073858788027e-05, + "loss": 0.4344, + "step": 4790 + }, + { + "epoch": 0.4792, + "grad_norm": 4.898146152496338, + "learning_rate": 1.2446305256229074e-05, + "loss": 0.3322, + "step": 4792 + }, + { + "epoch": 0.4794, + "grad_norm": 3.408001184463501, + "learning_rate": 1.2439535461370658e-05, + "loss": 0.3726, + "step": 4794 + }, + { + "epoch": 0.4796, + "grad_norm": 2.2589142322540283, + "learning_rate": 1.2432764477512294e-05, + "loss": 0.3322, + "step": 4796 + }, + { + "epoch": 0.4798, + "grad_norm": 2.55334734916687, + "learning_rate": 1.2425992307954075e-05, + "loss": 0.2193, + "step": 4798 + }, + { + "epoch": 0.48, + "grad_norm": 3.9360060691833496, + "learning_rate": 1.2419218955996677e-05, + "loss": 0.3636, + "step": 4800 + }, + { + "epoch": 0.4802, + "grad_norm": 1.6219794750213623, + "learning_rate": 1.241244442494135e-05, + "loss": 0.2615, + "step": 4802 + }, + { + "epoch": 0.4804, + "grad_norm": 2.602032423019409, + "learning_rate": 1.2405668718089918e-05, + "loss": 0.2529, + "step": 4804 + }, + { + "epoch": 0.4806, + "grad_norm": 3.4659502506256104, + "learning_rate": 1.2398891838744777e-05, + "loss": 0.3215, + "step": 4806 + }, + { + "epoch": 0.4808, + "grad_norm": 0.5597580075263977, + "learning_rate": 1.2392113790208895e-05, + "loss": 0.1335, + "step": 4808 + }, + { + "epoch": 0.481, + "grad_norm": 3.62618088722229, + "learning_rate": 1.238533457578581e-05, + "loss": 0.2044, + "step": 4810 + }, + { + "epoch": 0.4812, + "grad_norm": 7.121874809265137, + "learning_rate": 1.2378554198779632e-05, + "loss": 0.645, + "step": 4812 + }, + { + "epoch": 0.4814, + "grad_norm": 1.9943015575408936, + "learning_rate": 1.2371772662495031e-05, + "loss": 0.1648, + "step": 4814 + }, + { + "epoch": 0.4816, + "grad_norm": 4.707725524902344, + "learning_rate": 1.236498997023725e-05, + "loss": 0.535, + "step": 4816 + }, + { + "epoch": 0.4818, + "grad_norm": 0.6350975036621094, + "learning_rate": 1.2358206125312085e-05, + "loss": 0.3051, + "step": 4818 + }, + { + "epoch": 0.482, + "grad_norm": 3.3710074424743652, + "learning_rate": 1.23514211310259e-05, + "loss": 0.3812, + "step": 4820 + }, + { + "epoch": 0.4822, + "grad_norm": 3.4677767753601074, + "learning_rate": 1.2344634990685624e-05, + "loss": 0.4696, + "step": 4822 + }, + { + "epoch": 0.4824, + "grad_norm": 5.760736465454102, + "learning_rate": 1.2337847707598738e-05, + "loss": 0.3321, + "step": 4824 + }, + { + "epoch": 0.4826, + "grad_norm": 2.551616907119751, + "learning_rate": 1.233105928507328e-05, + "loss": 0.2196, + "step": 4826 + }, + { + "epoch": 0.4828, + "grad_norm": 2.9629039764404297, + "learning_rate": 1.2324269726417841e-05, + "loss": 0.2422, + "step": 4828 + }, + { + "epoch": 0.483, + "grad_norm": 1.829493522644043, + "learning_rate": 1.2317479034941572e-05, + "loss": 0.2412, + "step": 4830 + }, + { + "epoch": 0.4832, + "grad_norm": 4.351972579956055, + "learning_rate": 1.2310687213954182e-05, + "loss": 0.4925, + "step": 4832 + }, + { + "epoch": 0.4834, + "grad_norm": 1.0646053552627563, + "learning_rate": 1.2303894266765908e-05, + "loss": 0.1301, + "step": 4834 + }, + { + "epoch": 0.4836, + "grad_norm": 1.32081937789917, + "learning_rate": 1.2297100196687557e-05, + "loss": 0.2613, + "step": 4836 + }, + { + "epoch": 0.4838, + "grad_norm": 2.7847952842712402, + "learning_rate": 1.2290305007030479e-05, + "loss": 0.2106, + "step": 4838 + }, + { + "epoch": 0.484, + "grad_norm": 1.2059736251831055, + "learning_rate": 1.2283508701106559e-05, + "loss": 0.253, + "step": 4840 + }, + { + "epoch": 0.4842, + "grad_norm": 4.007351398468018, + "learning_rate": 1.2276711282228241e-05, + "loss": 0.3576, + "step": 4842 + }, + { + "epoch": 0.4844, + "grad_norm": 2.497727155685425, + "learning_rate": 1.2269912753708502e-05, + "loss": 0.1995, + "step": 4844 + }, + { + "epoch": 0.4846, + "grad_norm": 3.7762722969055176, + "learning_rate": 1.226311311886086e-05, + "loss": 0.3804, + "step": 4846 + }, + { + "epoch": 0.4848, + "grad_norm": 3.1748766899108887, + "learning_rate": 1.2256312380999376e-05, + "loss": 0.3264, + "step": 4848 + }, + { + "epoch": 0.485, + "grad_norm": 0.7367700338363647, + "learning_rate": 1.2249510543438652e-05, + "loss": 0.0811, + "step": 4850 + }, + { + "epoch": 0.4852, + "grad_norm": 4.718523979187012, + "learning_rate": 1.2242707609493814e-05, + "loss": 0.4739, + "step": 4852 + }, + { + "epoch": 0.4854, + "grad_norm": 3.5550971031188965, + "learning_rate": 1.223590358248053e-05, + "loss": 0.3278, + "step": 4854 + }, + { + "epoch": 0.4856, + "grad_norm": 6.616255760192871, + "learning_rate": 1.2229098465715005e-05, + "loss": 0.3634, + "step": 4856 + }, + { + "epoch": 0.4858, + "grad_norm": 0.28856417536735535, + "learning_rate": 1.2222292262513967e-05, + "loss": 0.2585, + "step": 4858 + }, + { + "epoch": 0.486, + "grad_norm": 1.486482858657837, + "learning_rate": 1.2215484976194675e-05, + "loss": 0.2053, + "step": 4860 + }, + { + "epoch": 0.4862, + "grad_norm": 4.479843616485596, + "learning_rate": 1.220867661007492e-05, + "loss": 0.2471, + "step": 4862 + }, + { + "epoch": 0.4864, + "grad_norm": 1.5909696817398071, + "learning_rate": 1.2201867167473015e-05, + "loss": 0.2694, + "step": 4864 + }, + { + "epoch": 0.4866, + "grad_norm": 3.2860543727874756, + "learning_rate": 1.2195056651707806e-05, + "loss": 0.3171, + "step": 4866 + }, + { + "epoch": 0.4868, + "grad_norm": 2.208601951599121, + "learning_rate": 1.2188245066098647e-05, + "loss": 0.2621, + "step": 4868 + }, + { + "epoch": 0.487, + "grad_norm": 3.530874252319336, + "learning_rate": 1.2181432413965428e-05, + "loss": 0.2851, + "step": 4870 + }, + { + "epoch": 0.4872, + "grad_norm": 6.662416458129883, + "learning_rate": 1.217461869862855e-05, + "loss": 0.4442, + "step": 4872 + }, + { + "epoch": 0.4874, + "grad_norm": 1.6005032062530518, + "learning_rate": 1.2167803923408935e-05, + "loss": 0.1588, + "step": 4874 + }, + { + "epoch": 0.4876, + "grad_norm": 0.7576032876968384, + "learning_rate": 1.2160988091628023e-05, + "loss": 0.0858, + "step": 4876 + }, + { + "epoch": 0.4878, + "grad_norm": 2.0747220516204834, + "learning_rate": 1.2154171206607765e-05, + "loss": 0.3306, + "step": 4878 + }, + { + "epoch": 0.488, + "grad_norm": 0.4090907871723175, + "learning_rate": 1.2147353271670634e-05, + "loss": 0.0825, + "step": 4880 + }, + { + "epoch": 0.4882, + "grad_norm": 8.020939826965332, + "learning_rate": 1.2140534290139601e-05, + "loss": 0.4124, + "step": 4882 + }, + { + "epoch": 0.4884, + "grad_norm": 2.0946292877197266, + "learning_rate": 1.2133714265338162e-05, + "loss": 0.1291, + "step": 4884 + }, + { + "epoch": 0.4886, + "grad_norm": 1.0317192077636719, + "learning_rate": 1.2126893200590309e-05, + "loss": 0.2709, + "step": 4886 + }, + { + "epoch": 0.4888, + "grad_norm": 2.738006830215454, + "learning_rate": 1.212007109922055e-05, + "loss": 0.3096, + "step": 4888 + }, + { + "epoch": 0.489, + "grad_norm": 5.016908168792725, + "learning_rate": 1.211324796455389e-05, + "loss": 0.224, + "step": 4890 + }, + { + "epoch": 0.4892, + "grad_norm": 3.0704073905944824, + "learning_rate": 1.2106423799915841e-05, + "loss": 0.2982, + "step": 4892 + }, + { + "epoch": 0.4894, + "grad_norm": 2.519353151321411, + "learning_rate": 1.2099598608632427e-05, + "loss": 0.2024, + "step": 4894 + }, + { + "epoch": 0.4896, + "grad_norm": 4.714383602142334, + "learning_rate": 1.2092772394030153e-05, + "loss": 0.225, + "step": 4896 + }, + { + "epoch": 0.4898, + "grad_norm": 1.9803663492202759, + "learning_rate": 1.208594515943604e-05, + "loss": 0.2821, + "step": 4898 + }, + { + "epoch": 0.49, + "grad_norm": 2.8631489276885986, + "learning_rate": 1.2079116908177592e-05, + "loss": 0.2933, + "step": 4900 + }, + { + "epoch": 0.4902, + "grad_norm": 3.78094482421875, + "learning_rate": 1.2072287643582825e-05, + "loss": 0.3228, + "step": 4902 + }, + { + "epoch": 0.4904, + "grad_norm": 0.649927020072937, + "learning_rate": 1.2065457368980236e-05, + "loss": 0.1816, + "step": 4904 + }, + { + "epoch": 0.4906, + "grad_norm": 2.3347017765045166, + "learning_rate": 1.2058626087698814e-05, + "loss": 0.1335, + "step": 4906 + }, + { + "epoch": 0.4908, + "grad_norm": 12.218737602233887, + "learning_rate": 1.2051793803068046e-05, + "loss": 0.2615, + "step": 4908 + }, + { + "epoch": 0.491, + "grad_norm": 2.6934714317321777, + "learning_rate": 1.2044960518417902e-05, + "loss": 0.0698, + "step": 4910 + }, + { + "epoch": 0.4912, + "grad_norm": 5.911588668823242, + "learning_rate": 1.203812623707885e-05, + "loss": 0.1769, + "step": 4912 + }, + { + "epoch": 0.4914, + "grad_norm": 1.2723027467727661, + "learning_rate": 1.2031290962381823e-05, + "loss": 0.2784, + "step": 4914 + }, + { + "epoch": 0.4916, + "grad_norm": 2.6851680278778076, + "learning_rate": 1.202445469765826e-05, + "loss": 0.122, + "step": 4916 + }, + { + "epoch": 0.4918, + "grad_norm": 12.322037696838379, + "learning_rate": 1.201761744624007e-05, + "loss": 0.5492, + "step": 4918 + }, + { + "epoch": 0.492, + "grad_norm": 1.8838659524917603, + "learning_rate": 1.2010779211459649e-05, + "loss": 0.2019, + "step": 4920 + }, + { + "epoch": 0.4922, + "grad_norm": 3.2860896587371826, + "learning_rate": 1.2003939996649864e-05, + "loss": 0.6389, + "step": 4922 + }, + { + "epoch": 0.4924, + "grad_norm": 4.185286045074463, + "learning_rate": 1.1997099805144071e-05, + "loss": 0.1259, + "step": 4924 + }, + { + "epoch": 0.4926, + "grad_norm": 0.35707786679267883, + "learning_rate": 1.1990258640276094e-05, + "loss": 0.1473, + "step": 4926 + }, + { + "epoch": 0.4928, + "grad_norm": 7.650276184082031, + "learning_rate": 1.1983416505380234e-05, + "loss": 0.3051, + "step": 4928 + }, + { + "epoch": 0.493, + "grad_norm": 2.1662490367889404, + "learning_rate": 1.1976573403791263e-05, + "loss": 0.4718, + "step": 4930 + }, + { + "epoch": 0.4932, + "grad_norm": 1.566079020500183, + "learning_rate": 1.1969729338844429e-05, + "loss": 0.6309, + "step": 4932 + }, + { + "epoch": 0.4934, + "grad_norm": 10.979264259338379, + "learning_rate": 1.196288431387544e-05, + "loss": 0.8418, + "step": 4934 + }, + { + "epoch": 0.4936, + "grad_norm": 2.2012057304382324, + "learning_rate": 1.1956038332220484e-05, + "loss": 0.1607, + "step": 4936 + }, + { + "epoch": 0.4938, + "grad_norm": 1.2891920804977417, + "learning_rate": 1.1949191397216207e-05, + "loss": 0.3559, + "step": 4938 + }, + { + "epoch": 0.494, + "grad_norm": 0.40306150913238525, + "learning_rate": 1.194234351219972e-05, + "loss": 0.2646, + "step": 4940 + }, + { + "epoch": 0.4942, + "grad_norm": 6.400496006011963, + "learning_rate": 1.1935494680508606e-05, + "loss": 0.5964, + "step": 4942 + }, + { + "epoch": 0.4944, + "grad_norm": 8.132110595703125, + "learning_rate": 1.192864490548089e-05, + "loss": 0.2707, + "step": 4944 + }, + { + "epoch": 0.4946, + "grad_norm": 3.541283369064331, + "learning_rate": 1.1921794190455082e-05, + "loss": 0.1452, + "step": 4946 + }, + { + "epoch": 0.4948, + "grad_norm": 0.5335226655006409, + "learning_rate": 1.191494253877013e-05, + "loss": 0.055, + "step": 4948 + }, + { + "epoch": 0.495, + "grad_norm": 0.22899696230888367, + "learning_rate": 1.190808995376545e-05, + "loss": 0.0327, + "step": 4950 + }, + { + "epoch": 0.4952, + "grad_norm": 1.873138666152954, + "learning_rate": 1.1901236438780902e-05, + "loss": 0.5085, + "step": 4952 + }, + { + "epoch": 0.4954, + "grad_norm": 2.6754817962646484, + "learning_rate": 1.1894381997156814e-05, + "loss": 0.4655, + "step": 4954 + }, + { + "epoch": 0.4956, + "grad_norm": 1.4942432641983032, + "learning_rate": 1.1887526632233954e-05, + "loss": 0.408, + "step": 4956 + }, + { + "epoch": 0.4958, + "grad_norm": 2.0801193714141846, + "learning_rate": 1.188067034735354e-05, + "loss": 0.2377, + "step": 4958 + }, + { + "epoch": 0.496, + "grad_norm": 0.8183286786079407, + "learning_rate": 1.187381314585725e-05, + "loss": 0.2576, + "step": 4960 + }, + { + "epoch": 0.4962, + "grad_norm": 2.015312433242798, + "learning_rate": 1.186695503108719e-05, + "loss": 0.1385, + "step": 4962 + }, + { + "epoch": 0.4964, + "grad_norm": 0.37125837802886963, + "learning_rate": 1.186009600638593e-05, + "loss": 0.1791, + "step": 4964 + }, + { + "epoch": 0.4966, + "grad_norm": 2.391964912414551, + "learning_rate": 1.1853236075096474e-05, + "loss": 0.1684, + "step": 4966 + }, + { + "epoch": 0.4968, + "grad_norm": 3.7674620151519775, + "learning_rate": 1.184637524056227e-05, + "loss": 0.2311, + "step": 4968 + }, + { + "epoch": 0.497, + "grad_norm": 1.9172714948654175, + "learning_rate": 1.1839513506127202e-05, + "loss": 0.262, + "step": 4970 + }, + { + "epoch": 0.4972, + "grad_norm": 1.6699622869491577, + "learning_rate": 1.1832650875135599e-05, + "loss": 0.6171, + "step": 4972 + }, + { + "epoch": 0.4974, + "grad_norm": 5.323099613189697, + "learning_rate": 1.1825787350932224e-05, + "loss": 0.2709, + "step": 4974 + }, + { + "epoch": 0.4976, + "grad_norm": 2.321369171142578, + "learning_rate": 1.181892293686227e-05, + "loss": 0.2397, + "step": 4976 + }, + { + "epoch": 0.4978, + "grad_norm": 0.5214923024177551, + "learning_rate": 1.1812057636271374e-05, + "loss": 0.0691, + "step": 4978 + }, + { + "epoch": 0.498, + "grad_norm": 1.8914241790771484, + "learning_rate": 1.1805191452505602e-05, + "loss": 0.1367, + "step": 4980 + }, + { + "epoch": 0.4982, + "grad_norm": 1.8577405214309692, + "learning_rate": 1.1798324388911445e-05, + "loss": 0.1514, + "step": 4982 + }, + { + "epoch": 0.4984, + "grad_norm": 1.6009234189987183, + "learning_rate": 1.1791456448835825e-05, + "loss": 0.1502, + "step": 4984 + }, + { + "epoch": 0.4986, + "grad_norm": 3.4801230430603027, + "learning_rate": 1.1784587635626095e-05, + "loss": 0.1543, + "step": 4986 + }, + { + "epoch": 0.4988, + "grad_norm": 0.8861472606658936, + "learning_rate": 1.1777717952630033e-05, + "loss": 0.0727, + "step": 4988 + }, + { + "epoch": 0.499, + "grad_norm": 3.323774576187134, + "learning_rate": 1.1770847403195836e-05, + "loss": 0.3391, + "step": 4990 + }, + { + "epoch": 0.4992, + "grad_norm": 2.776662826538086, + "learning_rate": 1.1763975990672125e-05, + "loss": 0.2052, + "step": 4992 + }, + { + "epoch": 0.4994, + "grad_norm": 1.915566325187683, + "learning_rate": 1.1757103718407948e-05, + "loss": 0.1024, + "step": 4994 + }, + { + "epoch": 0.4996, + "grad_norm": 0.9988976120948792, + "learning_rate": 1.1750230589752763e-05, + "loss": 0.0481, + "step": 4996 + }, + { + "epoch": 0.4998, + "grad_norm": 1.4704638719558716, + "learning_rate": 1.1743356608056448e-05, + "loss": 0.3702, + "step": 4998 + }, + { + "epoch": 0.5, + "grad_norm": 3.5187931060791016, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.1541, + "step": 5000 + }, + { + "epoch": 0.5002, + "grad_norm": 10.545181274414062, + "learning_rate": 1.1729606098942039e-05, + "loss": 0.4656, + "step": 5002 + }, + { + "epoch": 0.5004, + "grad_norm": 4.681562423706055, + "learning_rate": 1.1722729578225769e-05, + "loss": 0.1586, + "step": 5004 + }, + { + "epoch": 0.5006, + "grad_norm": 0.18547940254211426, + "learning_rate": 1.171585221787203e-05, + "loss": 0.0265, + "step": 5006 + }, + { + "epoch": 0.5008, + "grad_norm": 0.3785983622074127, + "learning_rate": 1.1708974021232768e-05, + "loss": 0.0794, + "step": 5008 + }, + { + "epoch": 0.501, + "grad_norm": 1.6861804723739624, + "learning_rate": 1.1702094991660326e-05, + "loss": 0.1442, + "step": 5010 + }, + { + "epoch": 0.5012, + "grad_norm": 1.0058417320251465, + "learning_rate": 1.1695215132507465e-05, + "loss": 0.2526, + "step": 5012 + }, + { + "epoch": 0.5014, + "grad_norm": 1.2421131134033203, + "learning_rate": 1.1688334447127338e-05, + "loss": 0.0687, + "step": 5014 + }, + { + "epoch": 0.5016, + "grad_norm": 0.7239051461219788, + "learning_rate": 1.1681452938873516e-05, + "loss": 0.05, + "step": 5016 + }, + { + "epoch": 0.5018, + "grad_norm": 0.2878114879131317, + "learning_rate": 1.1674570611099956e-05, + "loss": 0.0184, + "step": 5018 + }, + { + "epoch": 0.502, + "grad_norm": 0.5549318194389343, + "learning_rate": 1.1667687467161025e-05, + "loss": 0.0567, + "step": 5020 + }, + { + "epoch": 0.5022, + "grad_norm": 32.02801513671875, + "learning_rate": 1.166080351041148e-05, + "loss": 0.7712, + "step": 5022 + }, + { + "epoch": 0.5024, + "grad_norm": 7.336417198181152, + "learning_rate": 1.1653918744206478e-05, + "loss": 0.38, + "step": 5024 + }, + { + "epoch": 0.5026, + "grad_norm": 7.323688983917236, + "learning_rate": 1.1647033171901573e-05, + "loss": 0.3488, + "step": 5026 + }, + { + "epoch": 0.5028, + "grad_norm": 11.263627052307129, + "learning_rate": 1.1640146796852711e-05, + "loss": 0.6629, + "step": 5028 + }, + { + "epoch": 0.503, + "grad_norm": 2.3518600463867188, + "learning_rate": 1.1633259622416224e-05, + "loss": 0.0565, + "step": 5030 + }, + { + "epoch": 0.5032, + "grad_norm": 4.096634387969971, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.4042, + "step": 5032 + }, + { + "epoch": 0.5034, + "grad_norm": 5.597692012786865, + "learning_rate": 1.1619482888807662e-05, + "loss": 0.1442, + "step": 5034 + }, + { + "epoch": 0.5036, + "grad_norm": 2.7219624519348145, + "learning_rate": 1.1612593336350209e-05, + "loss": 0.2567, + "step": 5036 + }, + { + "epoch": 0.5038, + "grad_norm": 2.9304747581481934, + "learning_rate": 1.1605702997934345e-05, + "loss": 0.1678, + "step": 5038 + }, + { + "epoch": 0.504, + "grad_norm": 4.590115547180176, + "learning_rate": 1.159881187691835e-05, + "loss": 0.6637, + "step": 5040 + }, + { + "epoch": 0.5042, + "grad_norm": 1.9717835187911987, + "learning_rate": 1.1591919976660867e-05, + "loss": 0.0661, + "step": 5042 + }, + { + "epoch": 0.5044, + "grad_norm": 7.50637149810791, + "learning_rate": 1.158502730052093e-05, + "loss": 0.528, + "step": 5044 + }, + { + "epoch": 0.5046, + "grad_norm": 3.6813573837280273, + "learning_rate": 1.157813385185794e-05, + "loss": 0.3634, + "step": 5046 + }, + { + "epoch": 0.5048, + "grad_norm": 1.8091933727264404, + "learning_rate": 1.157123963403168e-05, + "loss": 0.5598, + "step": 5048 + }, + { + "epoch": 0.505, + "grad_norm": 5.6625285148620605, + "learning_rate": 1.156434465040231e-05, + "loss": 0.6609, + "step": 5050 + }, + { + "epoch": 0.5052, + "grad_norm": 2.433624744415283, + "learning_rate": 1.1557448904330362e-05, + "loss": 0.2103, + "step": 5052 + }, + { + "epoch": 0.5054, + "grad_norm": 6.526553630828857, + "learning_rate": 1.155055239917674e-05, + "loss": 0.3942, + "step": 5054 + }, + { + "epoch": 0.5056, + "grad_norm": 3.104905843734741, + "learning_rate": 1.1543655138302714e-05, + "loss": 0.157, + "step": 5056 + }, + { + "epoch": 0.5058, + "grad_norm": 1.997605800628662, + "learning_rate": 1.1536757125069924e-05, + "loss": 0.1611, + "step": 5058 + }, + { + "epoch": 0.506, + "grad_norm": 3.541027545928955, + "learning_rate": 1.1529858362840383e-05, + "loss": 0.4229, + "step": 5060 + }, + { + "epoch": 0.5062, + "grad_norm": 1.113247275352478, + "learning_rate": 1.1522958854976458e-05, + "loss": 0.0693, + "step": 5062 + }, + { + "epoch": 0.5064, + "grad_norm": 2.698312520980835, + "learning_rate": 1.1516058604840891e-05, + "loss": 0.352, + "step": 5064 + }, + { + "epoch": 0.5066, + "grad_norm": 3.8935041427612305, + "learning_rate": 1.1509157615796775e-05, + "loss": 0.333, + "step": 5066 + }, + { + "epoch": 0.5068, + "grad_norm": 1.933345913887024, + "learning_rate": 1.1502255891207572e-05, + "loss": 0.177, + "step": 5068 + }, + { + "epoch": 0.507, + "grad_norm": 2.175575017929077, + "learning_rate": 1.1495353434437098e-05, + "loss": 0.3227, + "step": 5070 + }, + { + "epoch": 0.5072, + "grad_norm": 2.8641340732574463, + "learning_rate": 1.1488450248849523e-05, + "loss": 0.3776, + "step": 5072 + }, + { + "epoch": 0.5074, + "grad_norm": 0.450950026512146, + "learning_rate": 1.1481546337809381e-05, + "loss": 0.1198, + "step": 5074 + }, + { + "epoch": 0.5076, + "grad_norm": 2.249995231628418, + "learning_rate": 1.1474641704681551e-05, + "loss": 0.1867, + "step": 5076 + }, + { + "epoch": 0.5078, + "grad_norm": 1.4288171529769897, + "learning_rate": 1.1467736352831266e-05, + "loss": 0.2049, + "step": 5078 + }, + { + "epoch": 0.508, + "grad_norm": 6.7950849533081055, + "learning_rate": 1.1460830285624119e-05, + "loss": 0.2071, + "step": 5080 + }, + { + "epoch": 0.5082, + "grad_norm": 1.6567192077636719, + "learning_rate": 1.1453923506426032e-05, + "loss": 0.2619, + "step": 5082 + }, + { + "epoch": 0.5084, + "grad_norm": 4.376550674438477, + "learning_rate": 1.1447016018603293e-05, + "loss": 0.5149, + "step": 5084 + }, + { + "epoch": 0.5086, + "grad_norm": 4.108633518218994, + "learning_rate": 1.1440107825522522e-05, + "loss": 0.5742, + "step": 5086 + }, + { + "epoch": 0.5088, + "grad_norm": 0.39270296692848206, + "learning_rate": 1.1433198930550694e-05, + "loss": 0.077, + "step": 5088 + }, + { + "epoch": 0.509, + "grad_norm": 3.7546846866607666, + "learning_rate": 1.1426289337055119e-05, + "loss": 0.3094, + "step": 5090 + }, + { + "epoch": 0.5092, + "grad_norm": 1.5506293773651123, + "learning_rate": 1.1419379048403446e-05, + "loss": 0.1387, + "step": 5092 + }, + { + "epoch": 0.5094, + "grad_norm": 3.3189609050750732, + "learning_rate": 1.141246806796367e-05, + "loss": 0.4039, + "step": 5094 + }, + { + "epoch": 0.5096, + "grad_norm": 0.3084852993488312, + "learning_rate": 1.140555639910411e-05, + "loss": 0.3243, + "step": 5096 + }, + { + "epoch": 0.5098, + "grad_norm": 4.5742974281311035, + "learning_rate": 1.1398644045193443e-05, + "loss": 0.2295, + "step": 5098 + }, + { + "epoch": 0.51, + "grad_norm": 1.374224305152893, + "learning_rate": 1.1391731009600655e-05, + "loss": 0.1345, + "step": 5100 + }, + { + "epoch": 0.5102, + "grad_norm": 1.041952133178711, + "learning_rate": 1.1384817295695083e-05, + "loss": 0.2014, + "step": 5102 + }, + { + "epoch": 0.5104, + "grad_norm": 0.7786287069320679, + "learning_rate": 1.137790290684638e-05, + "loss": 0.054, + "step": 5104 + }, + { + "epoch": 0.5106, + "grad_norm": 6.352116584777832, + "learning_rate": 1.1370987846424547e-05, + "loss": 0.5079, + "step": 5106 + }, + { + "epoch": 0.5108, + "grad_norm": 3.2295758724212646, + "learning_rate": 1.1364072117799884e-05, + "loss": 0.21, + "step": 5108 + }, + { + "epoch": 0.511, + "grad_norm": 3.325671434402466, + "learning_rate": 1.1357155724343046e-05, + "loss": 0.3416, + "step": 5110 + }, + { + "epoch": 0.5112, + "grad_norm": 1.5319480895996094, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.1152, + "step": 5112 + }, + { + "epoch": 0.5114, + "grad_norm": 0.48372405767440796, + "learning_rate": 1.1343320956417015e-05, + "loss": 0.0303, + "step": 5114 + }, + { + "epoch": 0.5116, + "grad_norm": 4.1439208984375, + "learning_rate": 1.1336402588690727e-05, + "loss": 0.3147, + "step": 5116 + }, + { + "epoch": 0.5118, + "grad_norm": 0.7744665741920471, + "learning_rate": 1.1329483569618045e-05, + "loss": 0.2894, + "step": 5118 + }, + { + "epoch": 0.512, + "grad_norm": 2.9621939659118652, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.4501, + "step": 5120 + }, + { + "epoch": 0.5122, + "grad_norm": 12.531291007995605, + "learning_rate": 1.1315643590922827e-05, + "loss": 0.6957, + "step": 5122 + }, + { + "epoch": 0.5124, + "grad_norm": 2.1413521766662598, + "learning_rate": 1.1308722638045724e-05, + "loss": 0.2234, + "step": 5124 + }, + { + "epoch": 0.5126, + "grad_norm": 1.0899955034255981, + "learning_rate": 1.1301801047313106e-05, + "loss": 0.3651, + "step": 5126 + }, + { + "epoch": 0.5128, + "grad_norm": 1.2548089027404785, + "learning_rate": 1.129487882209847e-05, + "loss": 0.1753, + "step": 5128 + }, + { + "epoch": 0.513, + "grad_norm": 2.552690029144287, + "learning_rate": 1.128795596577563e-05, + "loss": 0.1358, + "step": 5130 + }, + { + "epoch": 0.5132, + "grad_norm": 0.7208943963050842, + "learning_rate": 1.1281032481718696e-05, + "loss": 0.1045, + "step": 5132 + }, + { + "epoch": 0.5134, + "grad_norm": 2.9018023014068604, + "learning_rate": 1.1274108373302095e-05, + "loss": 0.5192, + "step": 5134 + }, + { + "epoch": 0.5136, + "grad_norm": 4.502842903137207, + "learning_rate": 1.1267183643900548e-05, + "loss": 0.2988, + "step": 5136 + }, + { + "epoch": 0.5138, + "grad_norm": 2.7287912368774414, + "learning_rate": 1.1260258296889086e-05, + "loss": 0.4227, + "step": 5138 + }, + { + "epoch": 0.514, + "grad_norm": 2.8690152168273926, + "learning_rate": 1.1253332335643043e-05, + "loss": 0.7353, + "step": 5140 + }, + { + "epoch": 0.5142, + "grad_norm": 2.918837070465088, + "learning_rate": 1.1246405763538047e-05, + "loss": 0.1644, + "step": 5142 + }, + { + "epoch": 0.5144, + "grad_norm": 3.0693583488464355, + "learning_rate": 1.1239478583950019e-05, + "loss": 0.3262, + "step": 5144 + }, + { + "epoch": 0.5146, + "grad_norm": 2.313800096511841, + "learning_rate": 1.1232550800255188e-05, + "loss": 0.2819, + "step": 5146 + }, + { + "epoch": 0.5148, + "grad_norm": 2.738335371017456, + "learning_rate": 1.1225622415830068e-05, + "loss": 0.2566, + "step": 5148 + }, + { + "epoch": 0.515, + "grad_norm": 3.20343279838562, + "learning_rate": 1.1218693434051475e-05, + "loss": 0.315, + "step": 5150 + }, + { + "epoch": 0.5152, + "grad_norm": 4.581728935241699, + "learning_rate": 1.1211763858296507e-05, + "loss": 0.8277, + "step": 5152 + }, + { + "epoch": 0.5154, + "grad_norm": 2.8552238941192627, + "learning_rate": 1.1204833691942553e-05, + "loss": 0.231, + "step": 5154 + }, + { + "epoch": 0.5156, + "grad_norm": 1.448453426361084, + "learning_rate": 1.1197902938367297e-05, + "loss": 0.4167, + "step": 5156 + }, + { + "epoch": 0.5158, + "grad_norm": 1.927646517753601, + "learning_rate": 1.11909716009487e-05, + "loss": 0.3321, + "step": 5158 + }, + { + "epoch": 0.516, + "grad_norm": 2.886883497238159, + "learning_rate": 1.1184039683065014e-05, + "loss": 0.1949, + "step": 5160 + }, + { + "epoch": 0.5162, + "grad_norm": 1.1667296886444092, + "learning_rate": 1.1177107188094765e-05, + "loss": 0.1721, + "step": 5162 + }, + { + "epoch": 0.5164, + "grad_norm": 4.947206497192383, + "learning_rate": 1.1170174119416778e-05, + "loss": 0.4381, + "step": 5164 + }, + { + "epoch": 0.5166, + "grad_norm": 1.4906156063079834, + "learning_rate": 1.1163240480410136e-05, + "loss": 0.1865, + "step": 5166 + }, + { + "epoch": 0.5168, + "grad_norm": 0.3602636456489563, + "learning_rate": 1.1156306274454218e-05, + "loss": 0.2432, + "step": 5168 + }, + { + "epoch": 0.517, + "grad_norm": 1.1904546022415161, + "learning_rate": 1.1149371504928667e-05, + "loss": 0.1282, + "step": 5170 + }, + { + "epoch": 0.5172, + "grad_norm": 2.302222967147827, + "learning_rate": 1.1142436175213409e-05, + "loss": 0.2189, + "step": 5172 + }, + { + "epoch": 0.5174, + "grad_norm": 2.1418564319610596, + "learning_rate": 1.1135500288688636e-05, + "loss": 0.2525, + "step": 5174 + }, + { + "epoch": 0.5176, + "grad_norm": 3.9979140758514404, + "learning_rate": 1.1128563848734817e-05, + "loss": 0.2287, + "step": 5176 + }, + { + "epoch": 0.5178, + "grad_norm": 0.7333757281303406, + "learning_rate": 1.112162685873269e-05, + "loss": 0.2324, + "step": 5178 + }, + { + "epoch": 0.518, + "grad_norm": 2.792938709259033, + "learning_rate": 1.1114689322063255e-05, + "loss": 0.2417, + "step": 5180 + }, + { + "epoch": 0.5182, + "grad_norm": 3.2576472759246826, + "learning_rate": 1.1107751242107786e-05, + "loss": 0.2067, + "step": 5182 + }, + { + "epoch": 0.5184, + "grad_norm": 1.4043841361999512, + "learning_rate": 1.1100812622247823e-05, + "loss": 0.3635, + "step": 5184 + }, + { + "epoch": 0.5186, + "grad_norm": 2.334397792816162, + "learning_rate": 1.1093873465865156e-05, + "loss": 0.6853, + "step": 5186 + }, + { + "epoch": 0.5188, + "grad_norm": 1.3160724639892578, + "learning_rate": 1.1086933776341853e-05, + "loss": 0.2894, + "step": 5188 + }, + { + "epoch": 0.519, + "grad_norm": 3.0317788124084473, + "learning_rate": 1.1079993557060228e-05, + "loss": 0.2191, + "step": 5190 + }, + { + "epoch": 0.5192, + "grad_norm": 0.5317188501358032, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.1624, + "step": 5192 + }, + { + "epoch": 0.5194, + "grad_norm": 1.4004173278808594, + "learning_rate": 1.10661115427526e-05, + "loss": 0.1076, + "step": 5194 + }, + { + "epoch": 0.5196, + "grad_norm": 1.4237269163131714, + "learning_rate": 1.105916975449252e-05, + "loss": 0.2774, + "step": 5196 + }, + { + "epoch": 0.5198, + "grad_norm": 4.21946382522583, + "learning_rate": 1.1052227450005968e-05, + "loss": 0.3441, + "step": 5198 + }, + { + "epoch": 0.52, + "grad_norm": 2.8351097106933594, + "learning_rate": 1.1045284632676535e-05, + "loss": 0.3183, + "step": 5200 + }, + { + "epoch": 0.5202, + "grad_norm": 3.9457435607910156, + "learning_rate": 1.1038341305888074e-05, + "loss": 0.1682, + "step": 5202 + }, + { + "epoch": 0.5204, + "grad_norm": 0.7267175912857056, + "learning_rate": 1.1031397473024674e-05, + "loss": 0.3001, + "step": 5204 + }, + { + "epoch": 0.5206, + "grad_norm": 4.980691432952881, + "learning_rate": 1.1024453137470677e-05, + "loss": 0.4378, + "step": 5206 + }, + { + "epoch": 0.5208, + "grad_norm": 2.33803391456604, + "learning_rate": 1.1017508302610665e-05, + "loss": 0.2163, + "step": 5208 + }, + { + "epoch": 0.521, + "grad_norm": 1.5834604501724243, + "learning_rate": 1.1010562971829464e-05, + "loss": 0.1846, + "step": 5210 + }, + { + "epoch": 0.5212, + "grad_norm": 6.831141948699951, + "learning_rate": 1.1003617148512149e-05, + "loss": 0.3799, + "step": 5212 + }, + { + "epoch": 0.5214, + "grad_norm": 0.1933005303144455, + "learning_rate": 1.099667083604403e-05, + "loss": 0.0886, + "step": 5214 + }, + { + "epoch": 0.5216, + "grad_norm": 4.8129801750183105, + "learning_rate": 1.0989724037810651e-05, + "loss": 0.4731, + "step": 5216 + }, + { + "epoch": 0.5218, + "grad_norm": 1.5570693016052246, + "learning_rate": 1.0982776757197799e-05, + "loss": 0.154, + "step": 5218 + }, + { + "epoch": 0.522, + "grad_norm": 1.7834153175354004, + "learning_rate": 1.0975828997591496e-05, + "loss": 0.1951, + "step": 5220 + }, + { + "epoch": 0.5222, + "grad_norm": 2.690514326095581, + "learning_rate": 1.0968880762377994e-05, + "loss": 0.2503, + "step": 5222 + }, + { + "epoch": 0.5224, + "grad_norm": 3.863245964050293, + "learning_rate": 1.0961932054943778e-05, + "loss": 0.1513, + "step": 5224 + }, + { + "epoch": 0.5226, + "grad_norm": 4.68580436706543, + "learning_rate": 1.0954982878675564e-05, + "loss": 0.2635, + "step": 5226 + }, + { + "epoch": 0.5228, + "grad_norm": 0.3717038929462433, + "learning_rate": 1.0948033236960294e-05, + "loss": 0.0278, + "step": 5228 + }, + { + "epoch": 0.523, + "grad_norm": 5.103582382202148, + "learning_rate": 1.0941083133185146e-05, + "loss": 0.7104, + "step": 5230 + }, + { + "epoch": 0.5232, + "grad_norm": 1.4912970066070557, + "learning_rate": 1.0934132570737508e-05, + "loss": 0.0857, + "step": 5232 + }, + { + "epoch": 0.5234, + "grad_norm": 0.5822873711585999, + "learning_rate": 1.0927181553005001e-05, + "loss": 0.051, + "step": 5234 + }, + { + "epoch": 0.5236, + "grad_norm": 5.243893146514893, + "learning_rate": 1.0920230083375474e-05, + "loss": 0.4634, + "step": 5236 + }, + { + "epoch": 0.5238, + "grad_norm": 0.2628829777240753, + "learning_rate": 1.0913278165236977e-05, + "loss": 0.1421, + "step": 5238 + }, + { + "epoch": 0.524, + "grad_norm": 1.3764586448669434, + "learning_rate": 1.0906325801977804e-05, + "loss": 0.2579, + "step": 5240 + }, + { + "epoch": 0.5242, + "grad_norm": 0.4548317790031433, + "learning_rate": 1.0899372996986439e-05, + "loss": 0.1786, + "step": 5242 + }, + { + "epoch": 0.5244, + "grad_norm": 1.579666018486023, + "learning_rate": 1.0892419753651606e-05, + "loss": 0.1214, + "step": 5244 + }, + { + "epoch": 0.5246, + "grad_norm": 5.32271146774292, + "learning_rate": 1.0885466075362224e-05, + "loss": 0.3691, + "step": 5246 + }, + { + "epoch": 0.5248, + "grad_norm": 0.7099839448928833, + "learning_rate": 1.0878511965507435e-05, + "loss": 0.4836, + "step": 5248 + }, + { + "epoch": 0.525, + "grad_norm": 4.263265132904053, + "learning_rate": 1.0871557427476585e-05, + "loss": 0.1453, + "step": 5250 + }, + { + "epoch": 0.5252, + "grad_norm": 5.137057781219482, + "learning_rate": 1.086460246465923e-05, + "loss": 0.396, + "step": 5252 + }, + { + "epoch": 0.5254, + "grad_norm": 0.6662480235099792, + "learning_rate": 1.085764708044514e-05, + "loss": 0.1027, + "step": 5254 + }, + { + "epoch": 0.5256, + "grad_norm": 1.446975827217102, + "learning_rate": 1.0850691278224282e-05, + "loss": 0.1716, + "step": 5256 + }, + { + "epoch": 0.5258, + "grad_norm": 6.435297012329102, + "learning_rate": 1.0843735061386829e-05, + "loss": 0.4833, + "step": 5258 + }, + { + "epoch": 0.526, + "grad_norm": 3.8096625804901123, + "learning_rate": 1.083677843332316e-05, + "loss": 0.7626, + "step": 5260 + }, + { + "epoch": 0.5262, + "grad_norm": 4.71964693069458, + "learning_rate": 1.082982139742384e-05, + "loss": 0.2772, + "step": 5262 + }, + { + "epoch": 0.5264, + "grad_norm": 1.9194263219833374, + "learning_rate": 1.0822863957079657e-05, + "loss": 0.0518, + "step": 5264 + }, + { + "epoch": 0.5266, + "grad_norm": 1.3000229597091675, + "learning_rate": 1.0815906115681579e-05, + "loss": 0.0539, + "step": 5266 + }, + { + "epoch": 0.5268, + "grad_norm": 0.8757956624031067, + "learning_rate": 1.0808947876620768e-05, + "loss": 0.1293, + "step": 5268 + }, + { + "epoch": 0.527, + "grad_norm": 10.030099868774414, + "learning_rate": 1.0801989243288588e-05, + "loss": 0.4619, + "step": 5270 + }, + { + "epoch": 0.5272, + "grad_norm": 2.5447182655334473, + "learning_rate": 1.07950302190766e-05, + "loss": 0.1577, + "step": 5272 + }, + { + "epoch": 0.5274, + "grad_norm": 4.358039379119873, + "learning_rate": 1.0788070807376536e-05, + "loss": 0.415, + "step": 5274 + }, + { + "epoch": 0.5276, + "grad_norm": 0.30928120017051697, + "learning_rate": 1.0781111011580336e-05, + "loss": 0.1454, + "step": 5276 + }, + { + "epoch": 0.5278, + "grad_norm": 1.2690770626068115, + "learning_rate": 1.0774150835080119e-05, + "loss": 0.2656, + "step": 5278 + }, + { + "epoch": 0.528, + "grad_norm": 1.9747623205184937, + "learning_rate": 1.0767190281268187e-05, + "loss": 0.2154, + "step": 5280 + }, + { + "epoch": 0.5282, + "grad_norm": 1.15560781955719, + "learning_rate": 1.0760229353537032e-05, + "loss": 0.2223, + "step": 5282 + }, + { + "epoch": 0.5284, + "grad_norm": 2.972447395324707, + "learning_rate": 1.0753268055279328e-05, + "loss": 0.2706, + "step": 5284 + }, + { + "epoch": 0.5286, + "grad_norm": 2.1051435470581055, + "learning_rate": 1.0746306389887924e-05, + "loss": 0.185, + "step": 5286 + }, + { + "epoch": 0.5288, + "grad_norm": 6.672732830047607, + "learning_rate": 1.0739344360755853e-05, + "loss": 0.6127, + "step": 5288 + }, + { + "epoch": 0.529, + "grad_norm": 6.798135280609131, + "learning_rate": 1.0732381971276318e-05, + "loss": 0.4165, + "step": 5290 + }, + { + "epoch": 0.5292, + "grad_norm": 0.3041224479675293, + "learning_rate": 1.072541922484271e-05, + "loss": 0.2363, + "step": 5292 + }, + { + "epoch": 0.5294, + "grad_norm": 2.4529871940612793, + "learning_rate": 1.0718456124848584e-05, + "loss": 0.2421, + "step": 5294 + }, + { + "epoch": 0.5296, + "grad_norm": 3.2821011543273926, + "learning_rate": 1.071149267468767e-05, + "loss": 0.1604, + "step": 5296 + }, + { + "epoch": 0.5298, + "grad_norm": 0.4039662480354309, + "learning_rate": 1.070452887775387e-05, + "loss": 0.0836, + "step": 5298 + }, + { + "epoch": 0.53, + "grad_norm": 0.47644442319869995, + "learning_rate": 1.0697564737441254e-05, + "loss": 0.0988, + "step": 5300 + }, + { + "epoch": 0.5302, + "grad_norm": 1.9763269424438477, + "learning_rate": 1.0690600257144062e-05, + "loss": 0.1296, + "step": 5302 + }, + { + "epoch": 0.5304, + "grad_norm": 0.29676106572151184, + "learning_rate": 1.0683635440256689e-05, + "loss": 0.0394, + "step": 5304 + }, + { + "epoch": 0.5306, + "grad_norm": 0.6531403064727783, + "learning_rate": 1.067667029017371e-05, + "loss": 0.1511, + "step": 5306 + }, + { + "epoch": 0.5308, + "grad_norm": 7.38726806640625, + "learning_rate": 1.0669704810289852e-05, + "loss": 0.3429, + "step": 5308 + }, + { + "epoch": 0.531, + "grad_norm": 2.7487611770629883, + "learning_rate": 1.0662739004000005e-05, + "loss": 0.0811, + "step": 5310 + }, + { + "epoch": 0.5312, + "grad_norm": 6.0154948234558105, + "learning_rate": 1.0655772874699217e-05, + "loss": 0.3917, + "step": 5312 + }, + { + "epoch": 0.5314, + "grad_norm": 0.3283737897872925, + "learning_rate": 1.0648806425782697e-05, + "loss": 0.0234, + "step": 5314 + }, + { + "epoch": 0.5316, + "grad_norm": 4.758686065673828, + "learning_rate": 1.0641839660645806e-05, + "loss": 0.3442, + "step": 5316 + }, + { + "epoch": 0.5318, + "grad_norm": 3.613036870956421, + "learning_rate": 1.0634872582684062e-05, + "loss": 0.6257, + "step": 5318 + }, + { + "epoch": 0.532, + "grad_norm": 1.8253846168518066, + "learning_rate": 1.0627905195293135e-05, + "loss": 0.7782, + "step": 5320 + }, + { + "epoch": 0.5322, + "grad_norm": 0.34326809644699097, + "learning_rate": 1.0620937501868842e-05, + "loss": 0.0544, + "step": 5322 + }, + { + "epoch": 0.5324, + "grad_norm": 3.089259386062622, + "learning_rate": 1.0613969505807157e-05, + "loss": 0.2295, + "step": 5324 + }, + { + "epoch": 0.5326, + "grad_norm": 3.1751766204833984, + "learning_rate": 1.060700121050419e-05, + "loss": 0.1822, + "step": 5326 + }, + { + "epoch": 0.5328, + "grad_norm": 1.1119434833526611, + "learning_rate": 1.0600032619356208e-05, + "loss": 0.2949, + "step": 5328 + }, + { + "epoch": 0.533, + "grad_norm": 4.910400867462158, + "learning_rate": 1.0593063735759619e-05, + "loss": 0.2013, + "step": 5330 + }, + { + "epoch": 0.5332, + "grad_norm": 0.7539941072463989, + "learning_rate": 1.0586094563110965e-05, + "loss": 0.0395, + "step": 5332 + }, + { + "epoch": 0.5334, + "grad_norm": 0.8246897459030151, + "learning_rate": 1.0579125104806944e-05, + "loss": 0.21, + "step": 5334 + }, + { + "epoch": 0.5336, + "grad_norm": 0.7192615866661072, + "learning_rate": 1.0572155364244383e-05, + "loss": 0.3574, + "step": 5336 + }, + { + "epoch": 0.5338, + "grad_norm": 2.327643871307373, + "learning_rate": 1.0565185344820248e-05, + "loss": 0.1074, + "step": 5338 + }, + { + "epoch": 0.534, + "grad_norm": 1.2634823322296143, + "learning_rate": 1.055821504993164e-05, + "loss": 0.446, + "step": 5340 + }, + { + "epoch": 0.5342, + "grad_norm": 5.884777545928955, + "learning_rate": 1.0551244482975798e-05, + "loss": 0.6362, + "step": 5342 + }, + { + "epoch": 0.5344, + "grad_norm": 3.332662582397461, + "learning_rate": 1.0544273647350091e-05, + "loss": 0.1954, + "step": 5344 + }, + { + "epoch": 0.5346, + "grad_norm": 1.2004437446594238, + "learning_rate": 1.0537302546452022e-05, + "loss": 0.2801, + "step": 5346 + }, + { + "epoch": 0.5348, + "grad_norm": 4.3835906982421875, + "learning_rate": 1.053033118367922e-05, + "loss": 0.1277, + "step": 5348 + }, + { + "epoch": 0.535, + "grad_norm": 1.2073575258255005, + "learning_rate": 1.0523359562429441e-05, + "loss": 0.063, + "step": 5350 + }, + { + "epoch": 0.5352, + "grad_norm": 5.010507583618164, + "learning_rate": 1.0516387686100566e-05, + "loss": 0.8351, + "step": 5352 + }, + { + "epoch": 0.5354, + "grad_norm": 2.311480760574341, + "learning_rate": 1.050941555809061e-05, + "loss": 0.1137, + "step": 5354 + }, + { + "epoch": 0.5356, + "grad_norm": 1.1763269901275635, + "learning_rate": 1.0502443181797696e-05, + "loss": 0.0604, + "step": 5356 + }, + { + "epoch": 0.5358, + "grad_norm": 3.2539279460906982, + "learning_rate": 1.0495470560620082e-05, + "loss": 0.1584, + "step": 5358 + }, + { + "epoch": 0.536, + "grad_norm": 4.3426103591918945, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.2378, + "step": 5360 + }, + { + "epoch": 0.5362, + "grad_norm": 0.42861494421958923, + "learning_rate": 1.0481524597204342e-05, + "loss": 0.0836, + "step": 5362 + }, + { + "epoch": 0.5364, + "grad_norm": 1.93867826461792, + "learning_rate": 1.0474551261763315e-05, + "loss": 0.0793, + "step": 5364 + }, + { + "epoch": 0.5366, + "grad_norm": 0.8945069909095764, + "learning_rate": 1.0467577695031763e-05, + "loss": 0.1586, + "step": 5366 + }, + { + "epoch": 0.5368, + "grad_norm": 1.738525390625, + "learning_rate": 1.0460603900408523e-05, + "loss": 0.111, + "step": 5368 + }, + { + "epoch": 0.537, + "grad_norm": 3.9387948513031006, + "learning_rate": 1.0453629881292537e-05, + "loss": 0.2268, + "step": 5370 + }, + { + "epoch": 0.5372, + "grad_norm": 0.48827239871025085, + "learning_rate": 1.0446655641082864e-05, + "loss": 0.1789, + "step": 5372 + }, + { + "epoch": 0.5374, + "grad_norm": 2.451982259750366, + "learning_rate": 1.043968118317865e-05, + "loss": 0.1427, + "step": 5374 + }, + { + "epoch": 0.5376, + "grad_norm": 3.535067081451416, + "learning_rate": 1.0432706510979172e-05, + "loss": 0.1645, + "step": 5376 + }, + { + "epoch": 0.5378, + "grad_norm": 0.500156819820404, + "learning_rate": 1.0425731627883798e-05, + "loss": 0.0563, + "step": 5378 + }, + { + "epoch": 0.538, + "grad_norm": 0.39092183113098145, + "learning_rate": 1.0418756537291996e-05, + "loss": 0.0393, + "step": 5380 + }, + { + "epoch": 0.5382, + "grad_norm": 1.293672800064087, + "learning_rate": 1.0411781242603352e-05, + "loss": 0.0726, + "step": 5382 + }, + { + "epoch": 0.5384, + "grad_norm": 1.3737905025482178, + "learning_rate": 1.0404805747217525e-05, + "loss": 0.1821, + "step": 5384 + }, + { + "epoch": 0.5386, + "grad_norm": 3.212954044342041, + "learning_rate": 1.03978300545343e-05, + "loss": 0.173, + "step": 5386 + }, + { + "epoch": 0.5388, + "grad_norm": 0.3977322280406952, + "learning_rate": 1.0390854167953537e-05, + "loss": 0.0653, + "step": 5388 + }, + { + "epoch": 0.539, + "grad_norm": 2.624960422515869, + "learning_rate": 1.03838780908752e-05, + "loss": 0.3128, + "step": 5390 + }, + { + "epoch": 0.5392, + "grad_norm": 0.23187890648841858, + "learning_rate": 1.0376901826699349e-05, + "loss": 0.4356, + "step": 5392 + }, + { + "epoch": 0.5394, + "grad_norm": 7.972472667694092, + "learning_rate": 1.036992537882612e-05, + "loss": 0.634, + "step": 5394 + }, + { + "epoch": 0.5396, + "grad_norm": 5.79369592666626, + "learning_rate": 1.036294875065576e-05, + "loss": 0.3007, + "step": 5396 + }, + { + "epoch": 0.5398, + "grad_norm": 4.575866222381592, + "learning_rate": 1.0355971945588586e-05, + "loss": 0.503, + "step": 5398 + }, + { + "epoch": 0.54, + "grad_norm": 0.757382333278656, + "learning_rate": 1.0348994967025012e-05, + "loss": 0.0441, + "step": 5400 + }, + { + "epoch": 0.5402, + "grad_norm": 1.208433985710144, + "learning_rate": 1.034201781836553e-05, + "loss": 0.0761, + "step": 5402 + }, + { + "epoch": 0.5404, + "grad_norm": 0.3613438308238983, + "learning_rate": 1.0335040503010715e-05, + "loss": 0.0484, + "step": 5404 + }, + { + "epoch": 0.5406, + "grad_norm": 0.47342249751091003, + "learning_rate": 1.0328063024361232e-05, + "loss": 0.0908, + "step": 5406 + }, + { + "epoch": 0.5408, + "grad_norm": 0.23493477702140808, + "learning_rate": 1.0321085385817818e-05, + "loss": 0.0285, + "step": 5408 + }, + { + "epoch": 0.541, + "grad_norm": 1.002082109451294, + "learning_rate": 1.0314107590781284e-05, + "loss": 0.0527, + "step": 5410 + }, + { + "epoch": 0.5412, + "grad_norm": 1.0541203022003174, + "learning_rate": 1.030712964265253e-05, + "loss": 0.2639, + "step": 5412 + }, + { + "epoch": 0.5414, + "grad_norm": 0.23920251429080963, + "learning_rate": 1.0300151544832513e-05, + "loss": 0.3844, + "step": 5414 + }, + { + "epoch": 0.5416, + "grad_norm": 0.9785172343254089, + "learning_rate": 1.0293173300722286e-05, + "loss": 0.0381, + "step": 5416 + }, + { + "epoch": 0.5418, + "grad_norm": 6.392991542816162, + "learning_rate": 1.0286194913722948e-05, + "loss": 0.3411, + "step": 5418 + }, + { + "epoch": 0.542, + "grad_norm": 6.4486212730407715, + "learning_rate": 1.0279216387235691e-05, + "loss": 0.4293, + "step": 5420 + }, + { + "epoch": 0.5422, + "grad_norm": 4.611568450927734, + "learning_rate": 1.0272237724661753e-05, + "loss": 0.2609, + "step": 5422 + }, + { + "epoch": 0.5424, + "grad_norm": 1.3526670932769775, + "learning_rate": 1.026525892940246e-05, + "loss": 0.3318, + "step": 5424 + }, + { + "epoch": 0.5426, + "grad_norm": 13.549226760864258, + "learning_rate": 1.0258280004859189e-05, + "loss": 0.6163, + "step": 5426 + }, + { + "epoch": 0.5428, + "grad_norm": 6.072121620178223, + "learning_rate": 1.0251300954433377e-05, + "loss": 0.1965, + "step": 5428 + }, + { + "epoch": 0.543, + "grad_norm": 1.6811012029647827, + "learning_rate": 1.0244321781526533e-05, + "loss": 0.1948, + "step": 5430 + }, + { + "epoch": 0.5432, + "grad_norm": 0.9693351984024048, + "learning_rate": 1.0237342489540221e-05, + "loss": 0.0511, + "step": 5432 + }, + { + "epoch": 0.5434, + "grad_norm": 2.153236150741577, + "learning_rate": 1.0230363081876065e-05, + "loss": 0.3854, + "step": 5434 + }, + { + "epoch": 0.5436, + "grad_norm": 2.002882719039917, + "learning_rate": 1.0223383561935738e-05, + "loss": 0.1239, + "step": 5436 + }, + { + "epoch": 0.5438, + "grad_norm": 7.6168317794799805, + "learning_rate": 1.0216403933120979e-05, + "loss": 0.508, + "step": 5438 + }, + { + "epoch": 0.544, + "grad_norm": 2.206270217895508, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.0926, + "step": 5440 + }, + { + "epoch": 0.5442, + "grad_norm": 5.6418046951293945, + "learning_rate": 1.0202444362475352e-05, + "loss": 0.2732, + "step": 5442 + }, + { + "epoch": 0.5444, + "grad_norm": 0.865811824798584, + "learning_rate": 1.0195464427448213e-05, + "loss": 0.0559, + "step": 5444 + }, + { + "epoch": 0.5446, + "grad_norm": 4.651216983795166, + "learning_rate": 1.0188484397154083e-05, + "loss": 0.2961, + "step": 5446 + }, + { + "epoch": 0.5448, + "grad_norm": 0.6321510672569275, + "learning_rate": 1.0181504274994949e-05, + "loss": 0.5825, + "step": 5448 + }, + { + "epoch": 0.545, + "grad_norm": 2.6170382499694824, + "learning_rate": 1.0174524064372837e-05, + "loss": 0.0812, + "step": 5450 + }, + { + "epoch": 0.5452, + "grad_norm": 1.7075893878936768, + "learning_rate": 1.0167543768689816e-05, + "loss": 0.2371, + "step": 5452 + }, + { + "epoch": 0.5454, + "grad_norm": 0.1967419981956482, + "learning_rate": 1.0160563391347998e-05, + "loss": 0.0169, + "step": 5454 + }, + { + "epoch": 0.5456, + "grad_norm": 4.951149940490723, + "learning_rate": 1.0153582935749531e-05, + "loss": 0.4964, + "step": 5456 + }, + { + "epoch": 0.5458, + "grad_norm": 0.544441819190979, + "learning_rate": 1.0146602405296608e-05, + "loss": 0.2055, + "step": 5458 + }, + { + "epoch": 0.546, + "grad_norm": 1.6451481580734253, + "learning_rate": 1.0139621803391454e-05, + "loss": 0.1421, + "step": 5460 + }, + { + "epoch": 0.5462, + "grad_norm": 0.9093581438064575, + "learning_rate": 1.013264113343633e-05, + "loss": 0.1113, + "step": 5462 + }, + { + "epoch": 0.5464, + "grad_norm": 1.6740262508392334, + "learning_rate": 1.0125660398833528e-05, + "loss": 0.2447, + "step": 5464 + }, + { + "epoch": 0.5466, + "grad_norm": 2.269050121307373, + "learning_rate": 1.0118679602985373e-05, + "loss": 0.2573, + "step": 5466 + }, + { + "epoch": 0.5468, + "grad_norm": 3.88667893409729, + "learning_rate": 1.0111698749294223e-05, + "loss": 0.1863, + "step": 5468 + }, + { + "epoch": 0.547, + "grad_norm": 7.155335903167725, + "learning_rate": 1.010471784116246e-05, + "loss": 0.1543, + "step": 5470 + }, + { + "epoch": 0.5472, + "grad_norm": 8.719377517700195, + "learning_rate": 1.0097736881992492e-05, + "loss": 0.2405, + "step": 5472 + }, + { + "epoch": 0.5474, + "grad_norm": 0.29813048243522644, + "learning_rate": 1.0090755875186752e-05, + "loss": 0.1863, + "step": 5474 + }, + { + "epoch": 0.5476, + "grad_norm": 5.552900791168213, + "learning_rate": 1.0083774824147707e-05, + "loss": 0.6516, + "step": 5476 + }, + { + "epoch": 0.5478, + "grad_norm": 5.134792327880859, + "learning_rate": 1.007679373227783e-05, + "loss": 0.5312, + "step": 5478 + }, + { + "epoch": 0.548, + "grad_norm": 6.84963846206665, + "learning_rate": 1.0069812602979617e-05, + "loss": 0.2222, + "step": 5480 + }, + { + "epoch": 0.5482, + "grad_norm": 8.094292640686035, + "learning_rate": 1.0062831439655591e-05, + "loss": 0.3967, + "step": 5482 + }, + { + "epoch": 0.5484, + "grad_norm": 4.508349418640137, + "learning_rate": 1.0055850245708283e-05, + "loss": 0.3347, + "step": 5484 + }, + { + "epoch": 0.5486, + "grad_norm": 1.2131792306900024, + "learning_rate": 1.0048869024540247e-05, + "loss": 0.1731, + "step": 5486 + }, + { + "epoch": 0.5488, + "grad_norm": 4.423066139221191, + "learning_rate": 1.0041887779554041e-05, + "loss": 0.1537, + "step": 5488 + }, + { + "epoch": 0.549, + "grad_norm": 3.6289877891540527, + "learning_rate": 1.0034906514152239e-05, + "loss": 0.2014, + "step": 5490 + }, + { + "epoch": 0.5492, + "grad_norm": 5.620269298553467, + "learning_rate": 1.0027925231737428e-05, + "loss": 0.4636, + "step": 5492 + }, + { + "epoch": 0.5494, + "grad_norm": 1.2765752077102661, + "learning_rate": 1.0020943935712193e-05, + "loss": 0.3698, + "step": 5494 + }, + { + "epoch": 0.5496, + "grad_norm": 2.390381336212158, + "learning_rate": 1.0013962629479145e-05, + "loss": 0.7038, + "step": 5496 + }, + { + "epoch": 0.5498, + "grad_norm": 2.6334850788116455, + "learning_rate": 1.0006981316440876e-05, + "loss": 0.1019, + "step": 5498 + }, + { + "epoch": 0.55, + "grad_norm": 0.8703633546829224, + "learning_rate": 1e-05, + "loss": 0.0761, + "step": 5500 + }, + { + "epoch": 0.5502, + "grad_norm": 5.997681617736816, + "learning_rate": 9.993018683559126e-06, + "loss": 0.2475, + "step": 5502 + }, + { + "epoch": 0.5504, + "grad_norm": 4.221917629241943, + "learning_rate": 9.986037370520856e-06, + "loss": 0.1759, + "step": 5504 + }, + { + "epoch": 0.5506, + "grad_norm": 1.3594822883605957, + "learning_rate": 9.979056064287807e-06, + "loss": 0.0683, + "step": 5506 + }, + { + "epoch": 0.5508, + "grad_norm": 5.480632305145264, + "learning_rate": 9.972074768262576e-06, + "loss": 0.2428, + "step": 5508 + }, + { + "epoch": 0.551, + "grad_norm": 6.61918306350708, + "learning_rate": 9.965093485847766e-06, + "loss": 0.232, + "step": 5510 + }, + { + "epoch": 0.5512, + "grad_norm": 0.4201000928878784, + "learning_rate": 9.958112220445964e-06, + "loss": 0.0339, + "step": 5512 + }, + { + "epoch": 0.5514, + "grad_norm": 0.6229251027107239, + "learning_rate": 9.951130975459758e-06, + "loss": 0.0706, + "step": 5514 + }, + { + "epoch": 0.5516, + "grad_norm": 1.7897288799285889, + "learning_rate": 9.944149754291719e-06, + "loss": 0.3076, + "step": 5516 + }, + { + "epoch": 0.5518, + "grad_norm": 1.9947421550750732, + "learning_rate": 9.937168560344412e-06, + "loss": 0.0887, + "step": 5518 + }, + { + "epoch": 0.552, + "grad_norm": 0.4789806306362152, + "learning_rate": 9.930187397020385e-06, + "loss": 0.1312, + "step": 5520 + }, + { + "epoch": 0.5522, + "grad_norm": 11.537650108337402, + "learning_rate": 9.923206267722173e-06, + "loss": 0.3162, + "step": 5522 + }, + { + "epoch": 0.5524, + "grad_norm": 0.5356634855270386, + "learning_rate": 9.916225175852295e-06, + "loss": 0.1275, + "step": 5524 + }, + { + "epoch": 0.5526, + "grad_norm": 3.299595594406128, + "learning_rate": 9.909244124813246e-06, + "loss": 0.3921, + "step": 5526 + }, + { + "epoch": 0.5528, + "grad_norm": 0.2184961438179016, + "learning_rate": 9.902263118007513e-06, + "loss": 0.0791, + "step": 5528 + }, + { + "epoch": 0.553, + "grad_norm": 0.5442588925361633, + "learning_rate": 9.895282158837545e-06, + "loss": 0.0945, + "step": 5530 + }, + { + "epoch": 0.5532, + "grad_norm": 2.0755014419555664, + "learning_rate": 9.88830125070578e-06, + "loss": 0.1582, + "step": 5532 + }, + { + "epoch": 0.5534, + "grad_norm": 11.323128700256348, + "learning_rate": 9.88132039701463e-06, + "loss": 0.538, + "step": 5534 + }, + { + "epoch": 0.5536, + "grad_norm": 9.2882719039917, + "learning_rate": 9.874339601166474e-06, + "loss": 0.6632, + "step": 5536 + }, + { + "epoch": 0.5538, + "grad_norm": 0.6790152192115784, + "learning_rate": 9.867358866563674e-06, + "loss": 0.0729, + "step": 5538 + }, + { + "epoch": 0.554, + "grad_norm": 0.15450569987297058, + "learning_rate": 9.860378196608549e-06, + "loss": 0.0157, + "step": 5540 + }, + { + "epoch": 0.5542, + "grad_norm": 1.2751222848892212, + "learning_rate": 9.853397594703394e-06, + "loss": 0.0363, + "step": 5542 + }, + { + "epoch": 0.5544, + "grad_norm": 3.109269142150879, + "learning_rate": 9.84641706425047e-06, + "loss": 0.133, + "step": 5544 + }, + { + "epoch": 0.5546, + "grad_norm": 6.382877349853516, + "learning_rate": 9.839436608652007e-06, + "loss": 0.3205, + "step": 5546 + }, + { + "epoch": 0.5548, + "grad_norm": 6.267333507537842, + "learning_rate": 9.832456231310189e-06, + "loss": 0.3972, + "step": 5548 + }, + { + "epoch": 0.555, + "grad_norm": 4.109254837036133, + "learning_rate": 9.825475935627165e-06, + "loss": 0.2258, + "step": 5550 + }, + { + "epoch": 0.5552, + "grad_norm": 0.35155296325683594, + "learning_rate": 9.818495725005053e-06, + "loss": 0.0221, + "step": 5552 + }, + { + "epoch": 0.5554, + "grad_norm": 7.902141571044922, + "learning_rate": 9.81151560284592e-06, + "loss": 0.4038, + "step": 5554 + }, + { + "epoch": 0.5556, + "grad_norm": 9.28659725189209, + "learning_rate": 9.80453557255179e-06, + "loss": 0.7114, + "step": 5556 + }, + { + "epoch": 0.5558, + "grad_norm": 2.433978319168091, + "learning_rate": 9.79755563752465e-06, + "loss": 0.0916, + "step": 5558 + }, + { + "epoch": 0.556, + "grad_norm": 6.768426895141602, + "learning_rate": 9.790575801166432e-06, + "loss": 0.1521, + "step": 5560 + }, + { + "epoch": 0.5562, + "grad_norm": 2.8815722465515137, + "learning_rate": 9.783596066879023e-06, + "loss": 0.1254, + "step": 5562 + }, + { + "epoch": 0.5564, + "grad_norm": 1.2171070575714111, + "learning_rate": 9.776616438064265e-06, + "loss": 0.2038, + "step": 5564 + }, + { + "epoch": 0.5566, + "grad_norm": 6.547184944152832, + "learning_rate": 9.76963691812394e-06, + "loss": 0.3087, + "step": 5566 + }, + { + "epoch": 0.5568, + "grad_norm": 8.248235702514648, + "learning_rate": 9.762657510459784e-06, + "loss": 0.3095, + "step": 5568 + }, + { + "epoch": 0.557, + "grad_norm": 3.7473275661468506, + "learning_rate": 9.75567821847347e-06, + "loss": 0.2416, + "step": 5570 + }, + { + "epoch": 0.5572, + "grad_norm": 2.0188026428222656, + "learning_rate": 9.748699045566626e-06, + "loss": 0.1758, + "step": 5572 + }, + { + "epoch": 0.5574, + "grad_norm": 3.9386141300201416, + "learning_rate": 9.741719995140814e-06, + "loss": 0.4177, + "step": 5574 + }, + { + "epoch": 0.5576, + "grad_norm": 0.8044818043708801, + "learning_rate": 9.73474107059754e-06, + "loss": 0.2057, + "step": 5576 + }, + { + "epoch": 0.5578, + "grad_norm": 2.8582823276519775, + "learning_rate": 9.727762275338246e-06, + "loss": 0.1112, + "step": 5578 + }, + { + "epoch": 0.558, + "grad_norm": 0.13508324325084686, + "learning_rate": 9.720783612764314e-06, + "loss": 0.0367, + "step": 5580 + }, + { + "epoch": 0.5582, + "grad_norm": 0.6011978983879089, + "learning_rate": 9.713805086277055e-06, + "loss": 0.133, + "step": 5582 + }, + { + "epoch": 0.5584, + "grad_norm": 0.6880024671554565, + "learning_rate": 9.706826699277719e-06, + "loss": 0.0477, + "step": 5584 + }, + { + "epoch": 0.5586, + "grad_norm": 3.2880308628082275, + "learning_rate": 9.699848455167489e-06, + "loss": 0.158, + "step": 5586 + }, + { + "epoch": 0.5588, + "grad_norm": 0.7876872420310974, + "learning_rate": 9.692870357347474e-06, + "loss": 0.0993, + "step": 5588 + }, + { + "epoch": 0.559, + "grad_norm": 2.1944057941436768, + "learning_rate": 9.685892409218718e-06, + "loss": 0.1224, + "step": 5590 + }, + { + "epoch": 0.5592, + "grad_norm": 3.757805109024048, + "learning_rate": 9.678914614182185e-06, + "loss": 0.0569, + "step": 5592 + }, + { + "epoch": 0.5594, + "grad_norm": 1.552512526512146, + "learning_rate": 9.671936975638768e-06, + "loss": 0.2959, + "step": 5594 + }, + { + "epoch": 0.5596, + "grad_norm": 0.35562002658843994, + "learning_rate": 9.664959496989286e-06, + "loss": 0.0099, + "step": 5596 + }, + { + "epoch": 0.5598, + "grad_norm": 3.7570388317108154, + "learning_rate": 9.657982181634476e-06, + "loss": 0.099, + "step": 5598 + }, + { + "epoch": 0.56, + "grad_norm": 0.24765044450759888, + "learning_rate": 9.651005032974994e-06, + "loss": 0.0118, + "step": 5600 + }, + { + "epoch": 0.5602, + "grad_norm": 0.11889337003231049, + "learning_rate": 9.644028054411416e-06, + "loss": 0.2118, + "step": 5602 + }, + { + "epoch": 0.5604, + "grad_norm": 10.561895370483398, + "learning_rate": 9.637051249344244e-06, + "loss": 0.2151, + "step": 5604 + }, + { + "epoch": 0.5606, + "grad_norm": 2.1115548610687256, + "learning_rate": 9.630074621173882e-06, + "loss": 0.0985, + "step": 5606 + }, + { + "epoch": 0.5608, + "grad_norm": 6.145288467407227, + "learning_rate": 9.623098173300655e-06, + "loss": 0.3696, + "step": 5608 + }, + { + "epoch": 0.561, + "grad_norm": 0.15574859082698822, + "learning_rate": 9.616121909124801e-06, + "loss": 0.1026, + "step": 5610 + }, + { + "epoch": 0.5612, + "grad_norm": 0.25423797965049744, + "learning_rate": 9.609145832046465e-06, + "loss": 0.0921, + "step": 5612 + }, + { + "epoch": 0.5614, + "grad_norm": 4.780964374542236, + "learning_rate": 9.602169945465702e-06, + "loss": 0.2264, + "step": 5614 + }, + { + "epoch": 0.5616, + "grad_norm": 19.124801635742188, + "learning_rate": 9.595194252782476e-06, + "loss": 0.5846, + "step": 5616 + }, + { + "epoch": 0.5618, + "grad_norm": 1.7042936086654663, + "learning_rate": 9.588218757396655e-06, + "loss": 0.1141, + "step": 5618 + }, + { + "epoch": 0.562, + "grad_norm": 12.202101707458496, + "learning_rate": 9.581243462708007e-06, + "loss": 0.3699, + "step": 5620 + }, + { + "epoch": 0.5622, + "grad_norm": 9.323801040649414, + "learning_rate": 9.574268372116205e-06, + "loss": 0.5854, + "step": 5622 + }, + { + "epoch": 0.5624, + "grad_norm": 0.5356630086898804, + "learning_rate": 9.567293489020831e-06, + "loss": 0.0355, + "step": 5624 + }, + { + "epoch": 0.5626, + "grad_norm": 4.084611892700195, + "learning_rate": 9.560318816821354e-06, + "loss": 0.0694, + "step": 5626 + }, + { + "epoch": 0.5628, + "grad_norm": 2.5714404582977295, + "learning_rate": 9.553344358917141e-06, + "loss": 0.3795, + "step": 5628 + }, + { + "epoch": 0.563, + "grad_norm": 0.9806063771247864, + "learning_rate": 9.546370118707463e-06, + "loss": 0.1012, + "step": 5630 + }, + { + "epoch": 0.5632, + "grad_norm": 4.210886001586914, + "learning_rate": 9.539396099591477e-06, + "loss": 0.1072, + "step": 5632 + }, + { + "epoch": 0.5634, + "grad_norm": 0.04868179187178612, + "learning_rate": 9.532422304968243e-06, + "loss": 0.0044, + "step": 5634 + }, + { + "epoch": 0.5636, + "grad_norm": 0.12026279419660568, + "learning_rate": 9.525448738236691e-06, + "loss": 0.4811, + "step": 5636 + }, + { + "epoch": 0.5638, + "grad_norm": 0.65245121717453, + "learning_rate": 9.518475402795661e-06, + "loss": 0.048, + "step": 5638 + }, + { + "epoch": 0.564, + "grad_norm": 0.6515442132949829, + "learning_rate": 9.511502302043867e-06, + "loss": 0.2165, + "step": 5640 + }, + { + "epoch": 0.5642, + "grad_norm": 0.07892456650733948, + "learning_rate": 9.504529439379921e-06, + "loss": 0.1923, + "step": 5642 + }, + { + "epoch": 0.5644, + "grad_norm": 7.27810001373291, + "learning_rate": 9.497556818202306e-06, + "loss": 0.5354, + "step": 5644 + }, + { + "epoch": 0.5646, + "grad_norm": 1.095073938369751, + "learning_rate": 9.490584441909392e-06, + "loss": 0.2569, + "step": 5646 + }, + { + "epoch": 0.5648, + "grad_norm": 0.07596687972545624, + "learning_rate": 9.483612313899436e-06, + "loss": 0.0473, + "step": 5648 + }, + { + "epoch": 0.565, + "grad_norm": 1.8427543640136719, + "learning_rate": 9.476640437570562e-06, + "loss": 0.31, + "step": 5650 + }, + { + "epoch": 0.5652, + "grad_norm": 4.190150737762451, + "learning_rate": 9.469668816320785e-06, + "loss": 0.3226, + "step": 5652 + }, + { + "epoch": 0.5654, + "grad_norm": 6.057847499847412, + "learning_rate": 9.46269745354798e-06, + "loss": 0.2074, + "step": 5654 + }, + { + "epoch": 0.5656, + "grad_norm": 0.5225338339805603, + "learning_rate": 9.45572635264991e-06, + "loss": 0.0643, + "step": 5656 + }, + { + "epoch": 0.5658, + "grad_norm": 0.07028944045305252, + "learning_rate": 9.448755517024207e-06, + "loss": 0.0988, + "step": 5658 + }, + { + "epoch": 0.566, + "grad_norm": 0.03929111361503601, + "learning_rate": 9.441784950068362e-06, + "loss": 0.0414, + "step": 5660 + }, + { + "epoch": 0.5662, + "grad_norm": 5.339606761932373, + "learning_rate": 9.434814655179756e-06, + "loss": 0.2464, + "step": 5662 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4925324022769928, + "learning_rate": 9.42784463575562e-06, + "loss": 0.0943, + "step": 5664 + }, + { + "epoch": 0.5666, + "grad_norm": 1.6658002138137817, + "learning_rate": 9.420874895193056e-06, + "loss": 0.0519, + "step": 5666 + }, + { + "epoch": 0.5668, + "grad_norm": 6.131579875946045, + "learning_rate": 9.413905436889035e-06, + "loss": 0.4975, + "step": 5668 + }, + { + "epoch": 0.567, + "grad_norm": 4.469518661499023, + "learning_rate": 9.406936264240386e-06, + "loss": 0.2887, + "step": 5670 + }, + { + "epoch": 0.5672, + "grad_norm": 1.5558600425720215, + "learning_rate": 9.399967380643795e-06, + "loss": 0.0825, + "step": 5672 + }, + { + "epoch": 0.5674, + "grad_norm": 1.241970181465149, + "learning_rate": 9.392998789495813e-06, + "loss": 0.0288, + "step": 5674 + }, + { + "epoch": 0.5676, + "grad_norm": 0.10019036382436752, + "learning_rate": 9.386030494192847e-06, + "loss": 0.0894, + "step": 5676 + }, + { + "epoch": 0.5678, + "grad_norm": 1.0895448923110962, + "learning_rate": 9.379062498131161e-06, + "loss": 0.0477, + "step": 5678 + }, + { + "epoch": 0.568, + "grad_norm": 2.124887466430664, + "learning_rate": 9.372094804706867e-06, + "loss": 0.2063, + "step": 5680 + }, + { + "epoch": 0.5682, + "grad_norm": 7.130063533782959, + "learning_rate": 9.36512741731594e-06, + "loss": 0.1913, + "step": 5682 + }, + { + "epoch": 0.5684, + "grad_norm": 1.2900716066360474, + "learning_rate": 9.358160339354194e-06, + "loss": 0.0245, + "step": 5684 + }, + { + "epoch": 0.5686, + "grad_norm": 3.6203160285949707, + "learning_rate": 9.351193574217305e-06, + "loss": 0.1876, + "step": 5686 + }, + { + "epoch": 0.5688, + "grad_norm": 6.919813632965088, + "learning_rate": 9.344227125300788e-06, + "loss": 0.3559, + "step": 5688 + }, + { + "epoch": 0.569, + "grad_norm": 20.454391479492188, + "learning_rate": 9.337260996000002e-06, + "loss": 0.9943, + "step": 5690 + }, + { + "epoch": 0.5692, + "grad_norm": 0.051797837018966675, + "learning_rate": 9.330295189710153e-06, + "loss": 0.0802, + "step": 5692 + }, + { + "epoch": 0.5694, + "grad_norm": 1.9271719455718994, + "learning_rate": 9.323329709826294e-06, + "loss": 0.7893, + "step": 5694 + }, + { + "epoch": 0.5696, + "grad_norm": 3.520754814147949, + "learning_rate": 9.316364559743315e-06, + "loss": 0.1442, + "step": 5696 + }, + { + "epoch": 0.5698, + "grad_norm": 0.2470025271177292, + "learning_rate": 9.309399742855943e-06, + "loss": 0.0419, + "step": 5698 + }, + { + "epoch": 0.57, + "grad_norm": 12.007941246032715, + "learning_rate": 9.302435262558748e-06, + "loss": 0.8235, + "step": 5700 + }, + { + "epoch": 0.5702, + "grad_norm": 0.4118618369102478, + "learning_rate": 9.295471122246131e-06, + "loss": 0.1172, + "step": 5702 + }, + { + "epoch": 0.5704, + "grad_norm": 12.362785339355469, + "learning_rate": 9.288507325312334e-06, + "loss": 0.7321, + "step": 5704 + }, + { + "epoch": 0.5706, + "grad_norm": 0.07712121307849884, + "learning_rate": 9.281543875151419e-06, + "loss": 0.0904, + "step": 5706 + }, + { + "epoch": 0.5708, + "grad_norm": 3.6140520572662354, + "learning_rate": 9.274580775157294e-06, + "loss": 0.1147, + "step": 5708 + }, + { + "epoch": 0.571, + "grad_norm": 1.9162027835845947, + "learning_rate": 9.267618028723687e-06, + "loss": 0.2014, + "step": 5710 + }, + { + "epoch": 0.5712, + "grad_norm": 0.05598197877407074, + "learning_rate": 9.260655639244152e-06, + "loss": 0.2088, + "step": 5712 + }, + { + "epoch": 0.5714, + "grad_norm": 1.4517903327941895, + "learning_rate": 9.253693610112079e-06, + "loss": 0.3074, + "step": 5714 + }, + { + "epoch": 0.5716, + "grad_norm": 4.21862268447876, + "learning_rate": 9.246731944720675e-06, + "loss": 0.4507, + "step": 5716 + }, + { + "epoch": 0.5718, + "grad_norm": 7.834107398986816, + "learning_rate": 9.239770646462968e-06, + "loss": 0.2748, + "step": 5718 + }, + { + "epoch": 0.572, + "grad_norm": 1.5172889232635498, + "learning_rate": 9.232809718731815e-06, + "loss": 0.204, + "step": 5720 + }, + { + "epoch": 0.5722, + "grad_norm": 7.010932445526123, + "learning_rate": 9.225849164919886e-06, + "loss": 0.1896, + "step": 5722 + }, + { + "epoch": 0.5724, + "grad_norm": 1.542572021484375, + "learning_rate": 9.218888988419668e-06, + "loss": 0.1283, + "step": 5724 + }, + { + "epoch": 0.5726, + "grad_norm": 4.257505893707275, + "learning_rate": 9.211929192623466e-06, + "loss": 0.2406, + "step": 5726 + }, + { + "epoch": 0.5728, + "grad_norm": 0.5160626173019409, + "learning_rate": 9.204969780923404e-06, + "loss": 0.0266, + "step": 5728 + }, + { + "epoch": 0.573, + "grad_norm": 0.2926258146762848, + "learning_rate": 9.198010756711413e-06, + "loss": 0.4613, + "step": 5730 + }, + { + "epoch": 0.5732, + "grad_norm": 1.1502536535263062, + "learning_rate": 9.191052123379234e-06, + "loss": 0.0544, + "step": 5732 + }, + { + "epoch": 0.5734, + "grad_norm": 1.2872200012207031, + "learning_rate": 9.184093884318426e-06, + "loss": 0.0926, + "step": 5734 + }, + { + "epoch": 0.5736, + "grad_norm": 7.215987682342529, + "learning_rate": 9.177136042920344e-06, + "loss": 0.523, + "step": 5736 + }, + { + "epoch": 0.5738, + "grad_norm": 4.551417350769043, + "learning_rate": 9.170178602576161e-06, + "loss": 0.3972, + "step": 5738 + }, + { + "epoch": 0.574, + "grad_norm": 6.941018581390381, + "learning_rate": 9.163221566676847e-06, + "loss": 0.257, + "step": 5740 + }, + { + "epoch": 0.5742, + "grad_norm": 0.9171852469444275, + "learning_rate": 9.156264938613176e-06, + "loss": 0.1488, + "step": 5742 + }, + { + "epoch": 0.5744, + "grad_norm": 7.771508693695068, + "learning_rate": 9.14930872177572e-06, + "loss": 0.4082, + "step": 5744 + }, + { + "epoch": 0.5746, + "grad_norm": 4.728274345397949, + "learning_rate": 9.142352919554862e-06, + "loss": 0.4162, + "step": 5746 + }, + { + "epoch": 0.5748, + "grad_norm": 1.2125329971313477, + "learning_rate": 9.135397535340773e-06, + "loss": 0.0506, + "step": 5748 + }, + { + "epoch": 0.575, + "grad_norm": 0.35776785016059875, + "learning_rate": 9.128442572523418e-06, + "loss": 0.1014, + "step": 5750 + }, + { + "epoch": 0.5752, + "grad_norm": 0.7517110109329224, + "learning_rate": 9.121488034492569e-06, + "loss": 0.1362, + "step": 5752 + }, + { + "epoch": 0.5754, + "grad_norm": 0.5347161293029785, + "learning_rate": 9.114533924637778e-06, + "loss": 0.055, + "step": 5754 + }, + { + "epoch": 0.5756, + "grad_norm": 5.745047092437744, + "learning_rate": 9.107580246348395e-06, + "loss": 0.2425, + "step": 5756 + }, + { + "epoch": 0.5758, + "grad_norm": 2.0244877338409424, + "learning_rate": 9.100627003013563e-06, + "loss": 0.1161, + "step": 5758 + }, + { + "epoch": 0.576, + "grad_norm": 4.185141563415527, + "learning_rate": 9.093674198022201e-06, + "loss": 0.2761, + "step": 5760 + }, + { + "epoch": 0.5762, + "grad_norm": 0.28924861550331116, + "learning_rate": 9.086721834763024e-06, + "loss": 0.1814, + "step": 5762 + }, + { + "epoch": 0.5764, + "grad_norm": 1.832138180732727, + "learning_rate": 9.07976991662453e-06, + "loss": 0.1158, + "step": 5764 + }, + { + "epoch": 0.5766, + "grad_norm": 5.484676837921143, + "learning_rate": 9.072818446995e-06, + "loss": 0.4584, + "step": 5766 + }, + { + "epoch": 0.5768, + "grad_norm": 1.711653709411621, + "learning_rate": 9.065867429262497e-06, + "loss": 0.142, + "step": 5768 + }, + { + "epoch": 0.577, + "grad_norm": 0.09732580929994583, + "learning_rate": 9.058916866814857e-06, + "loss": 0.0436, + "step": 5770 + }, + { + "epoch": 0.5772, + "grad_norm": 1.575200080871582, + "learning_rate": 9.051966763039706e-06, + "loss": 0.0504, + "step": 5772 + }, + { + "epoch": 0.5774, + "grad_norm": 0.27907487750053406, + "learning_rate": 9.045017121324438e-06, + "loss": 0.3629, + "step": 5774 + }, + { + "epoch": 0.5776, + "grad_norm": 1.129930853843689, + "learning_rate": 9.038067945056229e-06, + "loss": 0.1223, + "step": 5776 + }, + { + "epoch": 0.5778, + "grad_norm": 11.245157241821289, + "learning_rate": 9.031119237622011e-06, + "loss": 0.9116, + "step": 5778 + }, + { + "epoch": 0.578, + "grad_norm": 1.047930359840393, + "learning_rate": 9.024171002408507e-06, + "loss": 0.2759, + "step": 5780 + }, + { + "epoch": 0.5782, + "grad_norm": 1.6102540493011475, + "learning_rate": 9.017223242802205e-06, + "loss": 0.0922, + "step": 5782 + }, + { + "epoch": 0.5784, + "grad_norm": 9.020341873168945, + "learning_rate": 9.01027596218935e-06, + "loss": 0.4412, + "step": 5784 + }, + { + "epoch": 0.5786, + "grad_norm": 0.9257790446281433, + "learning_rate": 9.003329163955973e-06, + "loss": 0.2503, + "step": 5786 + }, + { + "epoch": 0.5788, + "grad_norm": 5.681929111480713, + "learning_rate": 8.996382851487851e-06, + "loss": 0.361, + "step": 5788 + }, + { + "epoch": 0.579, + "grad_norm": 11.629603385925293, + "learning_rate": 8.989437028170537e-06, + "loss": 0.4963, + "step": 5790 + }, + { + "epoch": 0.5792, + "grad_norm": 0.9616450071334839, + "learning_rate": 8.982491697389339e-06, + "loss": 0.1669, + "step": 5792 + }, + { + "epoch": 0.5794, + "grad_norm": 0.029870731756091118, + "learning_rate": 8.975546862529328e-06, + "loss": 0.0057, + "step": 5794 + }, + { + "epoch": 0.5796, + "grad_norm": 2.1211860179901123, + "learning_rate": 8.968602526975329e-06, + "loss": 0.1398, + "step": 5796 + }, + { + "epoch": 0.5798, + "grad_norm": 0.4901007413864136, + "learning_rate": 8.961658694111929e-06, + "loss": 0.0854, + "step": 5798 + }, + { + "epoch": 0.58, + "grad_norm": 0.09442697465419769, + "learning_rate": 8.954715367323468e-06, + "loss": 0.4805, + "step": 5800 + }, + { + "epoch": 0.5802, + "grad_norm": 0.3534424602985382, + "learning_rate": 8.947772549994037e-06, + "loss": 0.0971, + "step": 5802 + }, + { + "epoch": 0.5804, + "grad_norm": 2.669002056121826, + "learning_rate": 8.940830245507483e-06, + "loss": 0.1474, + "step": 5804 + }, + { + "epoch": 0.5806, + "grad_norm": 2.31103515625, + "learning_rate": 8.933888457247402e-06, + "loss": 0.4307, + "step": 5806 + }, + { + "epoch": 0.5808, + "grad_norm": 3.212576389312744, + "learning_rate": 8.926947188597133e-06, + "loss": 0.1425, + "step": 5808 + }, + { + "epoch": 0.581, + "grad_norm": 6.982884407043457, + "learning_rate": 8.920006442939772e-06, + "loss": 0.2473, + "step": 5810 + }, + { + "epoch": 0.5812, + "grad_norm": 1.680696725845337, + "learning_rate": 8.913066223658152e-06, + "loss": 0.0931, + "step": 5812 + }, + { + "epoch": 0.5814, + "grad_norm": 2.809116840362549, + "learning_rate": 8.906126534134849e-06, + "loss": 0.5566, + "step": 5814 + }, + { + "epoch": 0.5816, + "grad_norm": 1.5567501783370972, + "learning_rate": 8.89918737775218e-06, + "loss": 0.0734, + "step": 5816 + }, + { + "epoch": 0.5818, + "grad_norm": 1.1401926279067993, + "learning_rate": 8.892248757892215e-06, + "loss": 0.2152, + "step": 5818 + }, + { + "epoch": 0.582, + "grad_norm": 0.044570647180080414, + "learning_rate": 8.885310677936746e-06, + "loss": 0.0718, + "step": 5820 + }, + { + "epoch": 0.5822, + "grad_norm": 0.18106873333454132, + "learning_rate": 8.878373141267312e-06, + "loss": 0.2317, + "step": 5822 + }, + { + "epoch": 0.5824, + "grad_norm": 0.7613741755485535, + "learning_rate": 8.871436151265183e-06, + "loss": 0.1328, + "step": 5824 + }, + { + "epoch": 0.5826, + "grad_norm": 1.127523422241211, + "learning_rate": 8.864499711311362e-06, + "loss": 0.1147, + "step": 5826 + }, + { + "epoch": 0.5828, + "grad_norm": 3.430119514465332, + "learning_rate": 8.857563824786598e-06, + "loss": 0.237, + "step": 5828 + }, + { + "epoch": 0.583, + "grad_norm": 9.98499584197998, + "learning_rate": 8.850628495071336e-06, + "loss": 0.3146, + "step": 5830 + }, + { + "epoch": 0.5832, + "grad_norm": 0.18927647173404694, + "learning_rate": 8.843693725545787e-06, + "loss": 0.1256, + "step": 5832 + }, + { + "epoch": 0.5834, + "grad_norm": 2.992692708969116, + "learning_rate": 8.836759519589869e-06, + "loss": 0.0931, + "step": 5834 + }, + { + "epoch": 0.5836, + "grad_norm": 0.5743416547775269, + "learning_rate": 8.829825880583228e-06, + "loss": 0.2362, + "step": 5836 + }, + { + "epoch": 0.5838, + "grad_norm": 2.497803211212158, + "learning_rate": 8.822892811905237e-06, + "loss": 0.3325, + "step": 5838 + }, + { + "epoch": 0.584, + "grad_norm": 0.8220381736755371, + "learning_rate": 8.815960316934991e-06, + "loss": 0.1587, + "step": 5840 + }, + { + "epoch": 0.5842, + "grad_norm": 1.0505242347717285, + "learning_rate": 8.809028399051302e-06, + "loss": 0.0545, + "step": 5842 + }, + { + "epoch": 0.5844, + "grad_norm": 6.089834213256836, + "learning_rate": 8.802097061632706e-06, + "loss": 0.3515, + "step": 5844 + }, + { + "epoch": 0.5846, + "grad_norm": 1.1309839487075806, + "learning_rate": 8.79516630805745e-06, + "loss": 0.0559, + "step": 5846 + }, + { + "epoch": 0.5848, + "grad_norm": 4.785970687866211, + "learning_rate": 8.788236141703498e-06, + "loss": 0.3206, + "step": 5848 + }, + { + "epoch": 0.585, + "grad_norm": 5.744917392730713, + "learning_rate": 8.781306565948528e-06, + "loss": 0.3439, + "step": 5850 + }, + { + "epoch": 0.5852, + "grad_norm": 6.862751483917236, + "learning_rate": 8.774377584169934e-06, + "loss": 0.6106, + "step": 5852 + }, + { + "epoch": 0.5854, + "grad_norm": 1.5213000774383545, + "learning_rate": 8.767449199744813e-06, + "loss": 0.0602, + "step": 5854 + }, + { + "epoch": 0.5856, + "grad_norm": 1.8845092058181763, + "learning_rate": 8.760521416049983e-06, + "loss": 0.1527, + "step": 5856 + }, + { + "epoch": 0.5858, + "grad_norm": 0.3676110506057739, + "learning_rate": 8.753594236461957e-06, + "loss": 0.1528, + "step": 5858 + }, + { + "epoch": 0.586, + "grad_norm": 1.7047942876815796, + "learning_rate": 8.746667664356957e-06, + "loss": 0.5413, + "step": 5860 + }, + { + "epoch": 0.5862, + "grad_norm": 6.21046781539917, + "learning_rate": 8.739741703110914e-06, + "loss": 0.3948, + "step": 5862 + }, + { + "epoch": 0.5864, + "grad_norm": 0.18333947658538818, + "learning_rate": 8.732816356099455e-06, + "loss": 0.1476, + "step": 5864 + }, + { + "epoch": 0.5866, + "grad_norm": 1.8171154260635376, + "learning_rate": 8.725891626697912e-06, + "loss": 0.3479, + "step": 5866 + }, + { + "epoch": 0.5868, + "grad_norm": 8.80384635925293, + "learning_rate": 8.718967518281307e-06, + "loss": 0.372, + "step": 5868 + }, + { + "epoch": 0.587, + "grad_norm": 2.155397415161133, + "learning_rate": 8.712044034224374e-06, + "loss": 0.0776, + "step": 5870 + }, + { + "epoch": 0.5872, + "grad_norm": 0.9116203188896179, + "learning_rate": 8.705121177901532e-06, + "loss": 0.1145, + "step": 5872 + }, + { + "epoch": 0.5874, + "grad_norm": 5.382971286773682, + "learning_rate": 8.698198952686896e-06, + "loss": 0.0964, + "step": 5874 + }, + { + "epoch": 0.5876, + "grad_norm": 8.131940841674805, + "learning_rate": 8.69127736195428e-06, + "loss": 0.161, + "step": 5876 + }, + { + "epoch": 0.5878, + "grad_norm": 1.7002097368240356, + "learning_rate": 8.684356409077177e-06, + "loss": 0.0528, + "step": 5878 + }, + { + "epoch": 0.588, + "grad_norm": 5.0608015060424805, + "learning_rate": 8.677436097428775e-06, + "loss": 0.5042, + "step": 5880 + }, + { + "epoch": 0.5882, + "grad_norm": 0.6331314444541931, + "learning_rate": 8.670516430381958e-06, + "loss": 0.2024, + "step": 5882 + }, + { + "epoch": 0.5884, + "grad_norm": 6.277942180633545, + "learning_rate": 8.663597411309278e-06, + "loss": 0.57, + "step": 5884 + }, + { + "epoch": 0.5886, + "grad_norm": 4.617282390594482, + "learning_rate": 8.656679043582986e-06, + "loss": 0.1508, + "step": 5886 + }, + { + "epoch": 0.5888, + "grad_norm": 0.8449786901473999, + "learning_rate": 8.649761330575009e-06, + "loss": 0.0568, + "step": 5888 + }, + { + "epoch": 0.589, + "grad_norm": 1.5337581634521484, + "learning_rate": 8.642844275656957e-06, + "loss": 0.0927, + "step": 5890 + }, + { + "epoch": 0.5892, + "grad_norm": 1.3424409627914429, + "learning_rate": 8.635927882200117e-06, + "loss": 0.0452, + "step": 5892 + }, + { + "epoch": 0.5894, + "grad_norm": 2.5922720432281494, + "learning_rate": 8.629012153575458e-06, + "loss": 0.1277, + "step": 5894 + }, + { + "epoch": 0.5896, + "grad_norm": 3.2734134197235107, + "learning_rate": 8.62209709315362e-06, + "loss": 0.0683, + "step": 5896 + }, + { + "epoch": 0.5898, + "grad_norm": 12.5712251663208, + "learning_rate": 8.615182704304918e-06, + "loss": 0.3184, + "step": 5898 + }, + { + "epoch": 0.59, + "grad_norm": 6.313766002655029, + "learning_rate": 8.60826899039935e-06, + "loss": 0.3087, + "step": 5900 + }, + { + "epoch": 0.5902, + "grad_norm": 10.46423053741455, + "learning_rate": 8.601355954806562e-06, + "loss": 0.2269, + "step": 5902 + }, + { + "epoch": 0.5904, + "grad_norm": 6.785037517547607, + "learning_rate": 8.594443600895892e-06, + "loss": 0.9763, + "step": 5904 + }, + { + "epoch": 0.5906, + "grad_norm": 4.364811897277832, + "learning_rate": 8.587531932036334e-06, + "loss": 0.1698, + "step": 5906 + }, + { + "epoch": 0.5908, + "grad_norm": 1.0478793382644653, + "learning_rate": 8.580620951596556e-06, + "loss": 0.02, + "step": 5908 + }, + { + "epoch": 0.591, + "grad_norm": 1.854924201965332, + "learning_rate": 8.573710662944884e-06, + "loss": 0.0278, + "step": 5910 + }, + { + "epoch": 0.5912, + "grad_norm": 1.1792523860931396, + "learning_rate": 8.566801069449307e-06, + "loss": 0.151, + "step": 5912 + }, + { + "epoch": 0.5914, + "grad_norm": 0.6186419129371643, + "learning_rate": 8.559892174477478e-06, + "loss": 0.4143, + "step": 5914 + }, + { + "epoch": 0.5916, + "grad_norm": 1.95673668384552, + "learning_rate": 8.552983981396709e-06, + "loss": 0.1944, + "step": 5916 + }, + { + "epoch": 0.5918, + "grad_norm": 0.6506578922271729, + "learning_rate": 8.546076493573973e-06, + "loss": 0.111, + "step": 5918 + }, + { + "epoch": 0.592, + "grad_norm": 4.647477149963379, + "learning_rate": 8.539169714375885e-06, + "loss": 0.9959, + "step": 5920 + }, + { + "epoch": 0.5922, + "grad_norm": 0.5279133915901184, + "learning_rate": 8.532263647168735e-06, + "loss": 0.0743, + "step": 5922 + }, + { + "epoch": 0.5924, + "grad_norm": 0.44719964265823364, + "learning_rate": 8.525358295318454e-06, + "loss": 0.2246, + "step": 5924 + }, + { + "epoch": 0.5926, + "grad_norm": 11.765973091125488, + "learning_rate": 8.518453662190622e-06, + "loss": 0.3513, + "step": 5926 + }, + { + "epoch": 0.5928, + "grad_norm": 1.9848308563232422, + "learning_rate": 8.511549751150478e-06, + "loss": 0.091, + "step": 5928 + }, + { + "epoch": 0.593, + "grad_norm": 0.9513328671455383, + "learning_rate": 8.504646565562907e-06, + "loss": 0.1068, + "step": 5930 + }, + { + "epoch": 0.5932, + "grad_norm": 1.886859655380249, + "learning_rate": 8.49774410879243e-06, + "loss": 0.1013, + "step": 5932 + }, + { + "epoch": 0.5934, + "grad_norm": 11.905098915100098, + "learning_rate": 8.490842384203227e-06, + "loss": 0.4596, + "step": 5934 + }, + { + "epoch": 0.5936, + "grad_norm": 1.031836748123169, + "learning_rate": 8.483941395159114e-06, + "loss": 0.0484, + "step": 5936 + }, + { + "epoch": 0.5938, + "grad_norm": 7.270950794219971, + "learning_rate": 8.477041145023546e-06, + "loss": 0.3856, + "step": 5938 + }, + { + "epoch": 0.594, + "grad_norm": 0.21065405011177063, + "learning_rate": 8.47014163715962e-06, + "loss": 0.1151, + "step": 5940 + }, + { + "epoch": 0.5942, + "grad_norm": 0.6360710263252258, + "learning_rate": 8.46324287493008e-06, + "loss": 0.1312, + "step": 5942 + }, + { + "epoch": 0.5944, + "grad_norm": 2.67958664894104, + "learning_rate": 8.45634486169729e-06, + "loss": 0.2663, + "step": 5944 + }, + { + "epoch": 0.5946, + "grad_norm": 5.203149795532227, + "learning_rate": 8.449447600823262e-06, + "loss": 0.3043, + "step": 5946 + }, + { + "epoch": 0.5948, + "grad_norm": 6.2318902015686035, + "learning_rate": 8.44255109566964e-06, + "loss": 0.2922, + "step": 5948 + }, + { + "epoch": 0.595, + "grad_norm": 0.07047893106937408, + "learning_rate": 8.43565534959769e-06, + "loss": 0.0592, + "step": 5950 + }, + { + "epoch": 0.5952, + "grad_norm": 0.6556594371795654, + "learning_rate": 8.428760365968327e-06, + "loss": 0.0706, + "step": 5952 + }, + { + "epoch": 0.5954, + "grad_norm": 2.105936050415039, + "learning_rate": 8.421866148142066e-06, + "loss": 0.515, + "step": 5954 + }, + { + "epoch": 0.5956, + "grad_norm": 0.2687493860721588, + "learning_rate": 8.414972699479076e-06, + "loss": 0.1267, + "step": 5956 + }, + { + "epoch": 0.5958, + "grad_norm": 0.21977975964546204, + "learning_rate": 8.408080023339134e-06, + "loss": 0.0285, + "step": 5958 + }, + { + "epoch": 0.596, + "grad_norm": 3.7959821224212646, + "learning_rate": 8.401188123081653e-06, + "loss": 0.2266, + "step": 5960 + }, + { + "epoch": 0.5962, + "grad_norm": 12.434759140014648, + "learning_rate": 8.394297002065658e-06, + "loss": 0.4253, + "step": 5962 + }, + { + "epoch": 0.5964, + "grad_norm": 6.410341739654541, + "learning_rate": 8.387406663649796e-06, + "loss": 0.2416, + "step": 5964 + }, + { + "epoch": 0.5966, + "grad_norm": 1.6149542331695557, + "learning_rate": 8.380517111192336e-06, + "loss": 0.3849, + "step": 5966 + }, + { + "epoch": 0.5968, + "grad_norm": 1.3850603103637695, + "learning_rate": 8.373628348051165e-06, + "loss": 0.0552, + "step": 5968 + }, + { + "epoch": 0.597, + "grad_norm": 1.1955461502075195, + "learning_rate": 8.366740377583781e-06, + "loss": 0.0371, + "step": 5970 + }, + { + "epoch": 0.5972, + "grad_norm": 1.7642918825149536, + "learning_rate": 8.35985320314729e-06, + "loss": 0.0431, + "step": 5972 + }, + { + "epoch": 0.5974, + "grad_norm": 3.1367437839508057, + "learning_rate": 8.352966828098428e-06, + "loss": 0.2268, + "step": 5974 + }, + { + "epoch": 0.5976, + "grad_norm": 0.24941672384738922, + "learning_rate": 8.346081255793524e-06, + "loss": 0.138, + "step": 5976 + }, + { + "epoch": 0.5978, + "grad_norm": 7.082655906677246, + "learning_rate": 8.339196489588522e-06, + "loss": 1.0368, + "step": 5978 + }, + { + "epoch": 0.598, + "grad_norm": 5.729485034942627, + "learning_rate": 8.332312532838978e-06, + "loss": 0.2427, + "step": 5980 + }, + { + "epoch": 0.5982, + "grad_norm": 4.708596229553223, + "learning_rate": 8.325429388900046e-06, + "loss": 0.5311, + "step": 5982 + }, + { + "epoch": 0.5984, + "grad_norm": 0.9880713224411011, + "learning_rate": 8.318547061126485e-06, + "loss": 0.0602, + "step": 5984 + }, + { + "epoch": 0.5986, + "grad_norm": 3.1158249378204346, + "learning_rate": 8.311665552872662e-06, + "loss": 0.877, + "step": 5986 + }, + { + "epoch": 0.5988, + "grad_norm": 1.9009720087051392, + "learning_rate": 8.30478486749254e-06, + "loss": 0.0603, + "step": 5988 + }, + { + "epoch": 0.599, + "grad_norm": 3.269432544708252, + "learning_rate": 8.297905008339677e-06, + "loss": 0.2775, + "step": 5990 + }, + { + "epoch": 0.5992, + "grad_norm": 0.5768991708755493, + "learning_rate": 8.291025978767236e-06, + "loss": 0.021, + "step": 5992 + }, + { + "epoch": 0.5994, + "grad_norm": 9.754294395446777, + "learning_rate": 8.284147782127971e-06, + "loss": 0.6907, + "step": 5994 + }, + { + "epoch": 0.5996, + "grad_norm": 2.802686929702759, + "learning_rate": 8.277270421774234e-06, + "loss": 0.0823, + "step": 5996 + }, + { + "epoch": 0.5998, + "grad_norm": 4.230920314788818, + "learning_rate": 8.270393901057964e-06, + "loss": 0.2503, + "step": 5998 + }, + { + "epoch": 0.6, + "grad_norm": 1.3337135314941406, + "learning_rate": 8.263518223330698e-06, + "loss": 0.1294, + "step": 6000 + }, + { + "epoch": 0.6002, + "grad_norm": 3.277832508087158, + "learning_rate": 8.25664339194355e-06, + "loss": 0.2523, + "step": 6002 + }, + { + "epoch": 0.6004, + "grad_norm": 1.6613192558288574, + "learning_rate": 8.249769410247239e-06, + "loss": 0.0576, + "step": 6004 + }, + { + "epoch": 0.6006, + "grad_norm": 1.440618634223938, + "learning_rate": 8.242896281592057e-06, + "loss": 0.042, + "step": 6006 + }, + { + "epoch": 0.6008, + "grad_norm": 3.478546142578125, + "learning_rate": 8.236024009327879e-06, + "loss": 0.096, + "step": 6008 + }, + { + "epoch": 0.601, + "grad_norm": 2.2007217407226562, + "learning_rate": 8.22915259680417e-06, + "loss": 0.426, + "step": 6010 + }, + { + "epoch": 0.6012, + "grad_norm": 3.627232313156128, + "learning_rate": 8.222282047369972e-06, + "loss": 0.151, + "step": 6012 + }, + { + "epoch": 0.6014, + "grad_norm": 9.818490982055664, + "learning_rate": 8.215412364373908e-06, + "loss": 0.7419, + "step": 6014 + }, + { + "epoch": 0.6016, + "grad_norm": 0.20535032451152802, + "learning_rate": 8.208543551164178e-06, + "loss": 0.0427, + "step": 6016 + }, + { + "epoch": 0.6018, + "grad_norm": 0.5736970901489258, + "learning_rate": 8.201675611088558e-06, + "loss": 0.0201, + "step": 6018 + }, + { + "epoch": 0.602, + "grad_norm": 2.5250141620635986, + "learning_rate": 8.194808547494401e-06, + "loss": 0.2161, + "step": 6020 + }, + { + "epoch": 0.6022, + "grad_norm": 2.8653597831726074, + "learning_rate": 8.187942363728626e-06, + "loss": 0.2858, + "step": 6022 + }, + { + "epoch": 0.6024, + "grad_norm": 8.24601936340332, + "learning_rate": 8.181077063137733e-06, + "loss": 0.5391, + "step": 6024 + }, + { + "epoch": 0.6026, + "grad_norm": 6.710006237030029, + "learning_rate": 8.174212649067781e-06, + "loss": 0.3632, + "step": 6026 + }, + { + "epoch": 0.6028, + "grad_norm": 3.0282130241394043, + "learning_rate": 8.167349124864406e-06, + "loss": 0.3561, + "step": 6028 + }, + { + "epoch": 0.603, + "grad_norm": 5.083648204803467, + "learning_rate": 8.1604864938728e-06, + "loss": 0.6705, + "step": 6030 + }, + { + "epoch": 0.6032, + "grad_norm": 3.9779200553894043, + "learning_rate": 8.153624759437733e-06, + "loss": 0.2408, + "step": 6032 + }, + { + "epoch": 0.6034, + "grad_norm": 1.0914925336837769, + "learning_rate": 8.146763924903527e-06, + "loss": 0.054, + "step": 6034 + }, + { + "epoch": 0.6036, + "grad_norm": 2.0767483711242676, + "learning_rate": 8.139903993614069e-06, + "loss": 0.1224, + "step": 6036 + }, + { + "epoch": 0.6038, + "grad_norm": 3.407792329788208, + "learning_rate": 8.133044968912811e-06, + "loss": 0.2844, + "step": 6038 + }, + { + "epoch": 0.604, + "grad_norm": 1.3327842950820923, + "learning_rate": 8.126186854142752e-06, + "loss": 0.0759, + "step": 6040 + }, + { + "epoch": 0.6042, + "grad_norm": 0.5044893622398376, + "learning_rate": 8.119329652646463e-06, + "loss": 0.0478, + "step": 6042 + }, + { + "epoch": 0.6044, + "grad_norm": 0.3862454295158386, + "learning_rate": 8.112473367766051e-06, + "loss": 0.2195, + "step": 6044 + }, + { + "epoch": 0.6046, + "grad_norm": 5.248617172241211, + "learning_rate": 8.10561800284319e-06, + "loss": 0.1853, + "step": 6046 + }, + { + "epoch": 0.6048, + "grad_norm": 6.637161731719971, + "learning_rate": 8.098763561219101e-06, + "loss": 0.4027, + "step": 6048 + }, + { + "epoch": 0.605, + "grad_norm": 2.726423501968384, + "learning_rate": 8.091910046234552e-06, + "loss": 0.2475, + "step": 6050 + }, + { + "epoch": 0.6052, + "grad_norm": 0.29223111271858215, + "learning_rate": 8.08505746122987e-06, + "loss": 0.1684, + "step": 6052 + }, + { + "epoch": 0.6054, + "grad_norm": 0.2127991020679474, + "learning_rate": 8.078205809544918e-06, + "loss": 0.1041, + "step": 6054 + }, + { + "epoch": 0.6056, + "grad_norm": 5.637780666351318, + "learning_rate": 8.07135509451911e-06, + "loss": 0.1509, + "step": 6056 + }, + { + "epoch": 0.6058, + "grad_norm": 7.364511489868164, + "learning_rate": 8.064505319491398e-06, + "loss": 0.5227, + "step": 6058 + }, + { + "epoch": 0.606, + "grad_norm": 0.5771879553794861, + "learning_rate": 8.057656487800283e-06, + "loss": 0.0431, + "step": 6060 + }, + { + "epoch": 0.6062, + "grad_norm": 5.031026363372803, + "learning_rate": 8.050808602783797e-06, + "loss": 0.4639, + "step": 6062 + }, + { + "epoch": 0.6064, + "grad_norm": 1.6729713678359985, + "learning_rate": 8.04396166777952e-06, + "loss": 0.1015, + "step": 6064 + }, + { + "epoch": 0.6066, + "grad_norm": 3.456418514251709, + "learning_rate": 8.037115686124564e-06, + "loss": 0.2897, + "step": 6066 + }, + { + "epoch": 0.6068, + "grad_norm": 1.2114489078521729, + "learning_rate": 8.030270661155575e-06, + "loss": 0.184, + "step": 6068 + }, + { + "epoch": 0.607, + "grad_norm": 0.8111811280250549, + "learning_rate": 8.023426596208739e-06, + "loss": 0.1071, + "step": 6070 + }, + { + "epoch": 0.6072, + "grad_norm": 5.223745346069336, + "learning_rate": 8.016583494619769e-06, + "loss": 0.2464, + "step": 6072 + }, + { + "epoch": 0.6074, + "grad_norm": 0.5979897379875183, + "learning_rate": 8.009741359723906e-06, + "loss": 0.0536, + "step": 6074 + }, + { + "epoch": 0.6076, + "grad_norm": 3.057410717010498, + "learning_rate": 8.00290019485593e-06, + "loss": 0.2663, + "step": 6076 + }, + { + "epoch": 0.6078, + "grad_norm": 8.98874568939209, + "learning_rate": 7.996060003350139e-06, + "loss": 0.4038, + "step": 6078 + }, + { + "epoch": 0.608, + "grad_norm": 2.019909143447876, + "learning_rate": 7.989220788540356e-06, + "loss": 0.2614, + "step": 6080 + }, + { + "epoch": 0.6082, + "grad_norm": 1.117266297340393, + "learning_rate": 7.982382553759931e-06, + "loss": 0.1803, + "step": 6082 + }, + { + "epoch": 0.6084, + "grad_norm": 0.3230209946632385, + "learning_rate": 7.975545302341743e-06, + "loss": 0.0184, + "step": 6084 + }, + { + "epoch": 0.6086, + "grad_norm": 0.5938141942024231, + "learning_rate": 7.96870903761818e-06, + "loss": 0.1415, + "step": 6086 + }, + { + "epoch": 0.6088, + "grad_norm": 3.0296449661254883, + "learning_rate": 7.961873762921153e-06, + "loss": 0.2389, + "step": 6088 + }, + { + "epoch": 0.609, + "grad_norm": 0.7365408539772034, + "learning_rate": 7.955039481582098e-06, + "loss": 0.0379, + "step": 6090 + }, + { + "epoch": 0.6092, + "grad_norm": 0.5453080534934998, + "learning_rate": 7.948206196931953e-06, + "loss": 0.0793, + "step": 6092 + }, + { + "epoch": 0.6094, + "grad_norm": 1.3529847860336304, + "learning_rate": 7.94137391230119e-06, + "loss": 0.2729, + "step": 6094 + }, + { + "epoch": 0.6096, + "grad_norm": 0.12948718667030334, + "learning_rate": 7.934542631019767e-06, + "loss": 0.3117, + "step": 6096 + }, + { + "epoch": 0.6098, + "grad_norm": 0.7626124024391174, + "learning_rate": 7.927712356417176e-06, + "loss": 0.0575, + "step": 6098 + }, + { + "epoch": 0.61, + "grad_norm": 0.9935643076896667, + "learning_rate": 7.92088309182241e-06, + "loss": 0.1075, + "step": 6100 + }, + { + "epoch": 0.6102, + "grad_norm": 3.6225016117095947, + "learning_rate": 7.914054840563962e-06, + "loss": 0.2636, + "step": 6102 + }, + { + "epoch": 0.6104, + "grad_norm": 3.182295083999634, + "learning_rate": 7.907227605969849e-06, + "loss": 0.1113, + "step": 6104 + }, + { + "epoch": 0.6106, + "grad_norm": 1.5943310260772705, + "learning_rate": 7.900401391367576e-06, + "loss": 0.1879, + "step": 6106 + }, + { + "epoch": 0.6108, + "grad_norm": 1.5261863470077515, + "learning_rate": 7.89357620008416e-06, + "loss": 0.097, + "step": 6108 + }, + { + "epoch": 0.611, + "grad_norm": 1.603388786315918, + "learning_rate": 7.886752035446116e-06, + "loss": 0.0552, + "step": 6110 + }, + { + "epoch": 0.6112, + "grad_norm": 16.40922737121582, + "learning_rate": 7.879928900779457e-06, + "loss": 0.7746, + "step": 6112 + }, + { + "epoch": 0.6114, + "grad_norm": 1.2380625009536743, + "learning_rate": 7.873106799409696e-06, + "loss": 0.0664, + "step": 6114 + }, + { + "epoch": 0.6116, + "grad_norm": 2.5200071334838867, + "learning_rate": 7.866285734661842e-06, + "loss": 0.0929, + "step": 6116 + }, + { + "epoch": 0.6118, + "grad_norm": 8.377872467041016, + "learning_rate": 7.8594657098604e-06, + "loss": 0.6399, + "step": 6118 + }, + { + "epoch": 0.612, + "grad_norm": 0.8822484612464905, + "learning_rate": 7.852646728329368e-06, + "loss": 0.1643, + "step": 6120 + }, + { + "epoch": 0.6122, + "grad_norm": 4.1035943031311035, + "learning_rate": 7.845828793392236e-06, + "loss": 0.0767, + "step": 6122 + }, + { + "epoch": 0.6124, + "grad_norm": 0.15757961571216583, + "learning_rate": 7.83901190837198e-06, + "loss": 0.1134, + "step": 6124 + }, + { + "epoch": 0.6126, + "grad_norm": 7.610534191131592, + "learning_rate": 7.832196076591067e-06, + "loss": 0.1631, + "step": 6126 + }, + { + "epoch": 0.6128, + "grad_norm": 2.196582794189453, + "learning_rate": 7.825381301371452e-06, + "loss": 0.3466, + "step": 6128 + }, + { + "epoch": 0.613, + "grad_norm": 16.313907623291016, + "learning_rate": 7.818567586034578e-06, + "loss": 0.8345, + "step": 6130 + }, + { + "epoch": 0.6132, + "grad_norm": 0.2136334776878357, + "learning_rate": 7.811754933901358e-06, + "loss": 0.0184, + "step": 6132 + }, + { + "epoch": 0.6134, + "grad_norm": 2.917362928390503, + "learning_rate": 7.804943348292197e-06, + "loss": 0.3853, + "step": 6134 + }, + { + "epoch": 0.6136, + "grad_norm": 3.8938848972320557, + "learning_rate": 7.798132832526986e-06, + "loss": 0.151, + "step": 6136 + }, + { + "epoch": 0.6138, + "grad_norm": 10.318235397338867, + "learning_rate": 7.791323389925084e-06, + "loss": 0.4374, + "step": 6138 + }, + { + "epoch": 0.614, + "grad_norm": 5.075292110443115, + "learning_rate": 7.784515023805328e-06, + "loss": 0.2024, + "step": 6140 + }, + { + "epoch": 0.6142, + "grad_norm": 1.769653081893921, + "learning_rate": 7.777707737486036e-06, + "loss": 0.2305, + "step": 6142 + }, + { + "epoch": 0.6144, + "grad_norm": 0.38961276412010193, + "learning_rate": 7.770901534284996e-06, + "loss": 0.0244, + "step": 6144 + }, + { + "epoch": 0.6146, + "grad_norm": 6.886707782745361, + "learning_rate": 7.76409641751947e-06, + "loss": 0.2324, + "step": 6146 + }, + { + "epoch": 0.6148, + "grad_norm": 2.2723593711853027, + "learning_rate": 7.757292390506191e-06, + "loss": 0.4546, + "step": 6148 + }, + { + "epoch": 0.615, + "grad_norm": 6.352449417114258, + "learning_rate": 7.750489456561351e-06, + "loss": 0.0964, + "step": 6150 + }, + { + "epoch": 0.6152, + "grad_norm": 0.22507260739803314, + "learning_rate": 7.743687619000625e-06, + "loss": 0.2324, + "step": 6152 + }, + { + "epoch": 0.6154, + "grad_norm": 10.877449035644531, + "learning_rate": 7.736886881139143e-06, + "loss": 0.7085, + "step": 6154 + }, + { + "epoch": 0.6156, + "grad_norm": 0.5149603486061096, + "learning_rate": 7.730087246291503e-06, + "loss": 0.0162, + "step": 6156 + }, + { + "epoch": 0.6158, + "grad_norm": 1.9323742389678955, + "learning_rate": 7.72328871777176e-06, + "loss": 0.064, + "step": 6158 + }, + { + "epoch": 0.616, + "grad_norm": 0.1945025473833084, + "learning_rate": 7.716491298893443e-06, + "loss": 0.2493, + "step": 6160 + }, + { + "epoch": 0.6162, + "grad_norm": 6.315744876861572, + "learning_rate": 7.709694992969525e-06, + "loss": 0.2843, + "step": 6162 + }, + { + "epoch": 0.6164, + "grad_norm": 7.368813514709473, + "learning_rate": 7.702899803312443e-06, + "loss": 0.899, + "step": 6164 + }, + { + "epoch": 0.6166, + "grad_norm": 1.155861496925354, + "learning_rate": 7.696105733234099e-06, + "loss": 0.0572, + "step": 6166 + }, + { + "epoch": 0.6168, + "grad_norm": 9.395236015319824, + "learning_rate": 7.689312786045823e-06, + "loss": 0.5854, + "step": 6168 + }, + { + "epoch": 0.617, + "grad_norm": 0.42361947894096375, + "learning_rate": 7.68252096505843e-06, + "loss": 0.0423, + "step": 6170 + }, + { + "epoch": 0.6172, + "grad_norm": 4.428504943847656, + "learning_rate": 7.67573027358216e-06, + "loss": 0.1632, + "step": 6172 + }, + { + "epoch": 0.6174, + "grad_norm": 1.0758732557296753, + "learning_rate": 7.668940714926724e-06, + "loss": 0.0445, + "step": 6174 + }, + { + "epoch": 0.6176, + "grad_norm": 3.276918649673462, + "learning_rate": 7.662152292401265e-06, + "loss": 0.5318, + "step": 6176 + }, + { + "epoch": 0.6178, + "grad_norm": 4.808465003967285, + "learning_rate": 7.655365009314375e-06, + "loss": 0.3634, + "step": 6178 + }, + { + "epoch": 0.618, + "grad_norm": 1.6113462448120117, + "learning_rate": 7.6485788689741e-06, + "loss": 0.0811, + "step": 6180 + }, + { + "epoch": 0.6182, + "grad_norm": 0.6430481672286987, + "learning_rate": 7.641793874687918e-06, + "loss": 0.1697, + "step": 6182 + }, + { + "epoch": 0.6184, + "grad_norm": 3.379060745239258, + "learning_rate": 7.635010029762755e-06, + "loss": 0.1696, + "step": 6184 + }, + { + "epoch": 0.6186, + "grad_norm": 6.845134258270264, + "learning_rate": 7.628227337504972e-06, + "loss": 0.2693, + "step": 6186 + }, + { + "epoch": 0.6188, + "grad_norm": 5.948103427886963, + "learning_rate": 7.621445801220372e-06, + "loss": 0.3983, + "step": 6188 + }, + { + "epoch": 0.619, + "grad_norm": 0.7051439881324768, + "learning_rate": 7.6146654242141935e-06, + "loss": 0.2817, + "step": 6190 + }, + { + "epoch": 0.6192, + "grad_norm": 1.2736669778823853, + "learning_rate": 7.6078862097911075e-06, + "loss": 0.4124, + "step": 6192 + }, + { + "epoch": 0.6194, + "grad_norm": 6.3394269943237305, + "learning_rate": 7.6011081612552265e-06, + "loss": 0.3842, + "step": 6194 + }, + { + "epoch": 0.6196, + "grad_norm": 11.581921577453613, + "learning_rate": 7.594331281910082e-06, + "loss": 0.5024, + "step": 6196 + }, + { + "epoch": 0.6198, + "grad_norm": 2.410388946533203, + "learning_rate": 7.58755557505865e-06, + "loss": 0.0592, + "step": 6198 + }, + { + "epoch": 0.62, + "grad_norm": 11.532902717590332, + "learning_rate": 7.580781044003324e-06, + "loss": 0.4964, + "step": 6200 + }, + { + "epoch": 0.6202, + "grad_norm": 1.9948935508728027, + "learning_rate": 7.574007692045928e-06, + "loss": 0.1354, + "step": 6202 + }, + { + "epoch": 0.6204, + "grad_norm": 5.309185981750488, + "learning_rate": 7.5672355224877115e-06, + "loss": 0.3939, + "step": 6204 + }, + { + "epoch": 0.6206, + "grad_norm": 3.3470730781555176, + "learning_rate": 7.560464538629345e-06, + "loss": 0.3385, + "step": 6206 + }, + { + "epoch": 0.6208, + "grad_norm": 0.4483829438686371, + "learning_rate": 7.553694743770928e-06, + "loss": 0.1669, + "step": 6208 + }, + { + "epoch": 0.621, + "grad_norm": 1.7434008121490479, + "learning_rate": 7.546926141211975e-06, + "loss": 0.2152, + "step": 6210 + }, + { + "epoch": 0.6212, + "grad_norm": 0.5405002236366272, + "learning_rate": 7.54015873425142e-06, + "loss": 0.1391, + "step": 6212 + }, + { + "epoch": 0.6214, + "grad_norm": 1.2131593227386475, + "learning_rate": 7.533392526187617e-06, + "loss": 0.1028, + "step": 6214 + }, + { + "epoch": 0.6216, + "grad_norm": 5.1227521896362305, + "learning_rate": 7.526627520318329e-06, + "loss": 0.7419, + "step": 6216 + }, + { + "epoch": 0.6218, + "grad_norm": 5.810759544372559, + "learning_rate": 7.519863719940748e-06, + "loss": 0.4937, + "step": 6218 + }, + { + "epoch": 0.622, + "grad_norm": 0.5882106423377991, + "learning_rate": 7.513101128351454e-06, + "loss": 0.1699, + "step": 6220 + }, + { + "epoch": 0.6222, + "grad_norm": 3.601806163787842, + "learning_rate": 7.506339748846461e-06, + "loss": 0.451, + "step": 6222 + }, + { + "epoch": 0.6224, + "grad_norm": 2.0747272968292236, + "learning_rate": 7.49957958472118e-06, + "loss": 0.2814, + "step": 6224 + }, + { + "epoch": 0.6226, + "grad_norm": 0.6846380233764648, + "learning_rate": 7.492820639270435e-06, + "loss": 0.6483, + "step": 6226 + }, + { + "epoch": 0.6228, + "grad_norm": 0.303364098072052, + "learning_rate": 7.486062915788453e-06, + "loss": 0.3658, + "step": 6228 + }, + { + "epoch": 0.623, + "grad_norm": 0.49666541814804077, + "learning_rate": 7.4793064175688635e-06, + "loss": 0.1197, + "step": 6230 + }, + { + "epoch": 0.6232, + "grad_norm": 0.505020797252655, + "learning_rate": 7.472551147904708e-06, + "loss": 0.2864, + "step": 6232 + }, + { + "epoch": 0.6234, + "grad_norm": 2.9315707683563232, + "learning_rate": 7.465797110088417e-06, + "loss": 0.1622, + "step": 6234 + }, + { + "epoch": 0.6236, + "grad_norm": 2.3778023719787598, + "learning_rate": 7.4590443074118325e-06, + "loss": 0.2191, + "step": 6236 + }, + { + "epoch": 0.6238, + "grad_norm": 2.1516520977020264, + "learning_rate": 7.4522927431661805e-06, + "loss": 0.3007, + "step": 6238 + }, + { + "epoch": 0.624, + "grad_norm": 1.108674168586731, + "learning_rate": 7.445542420642097e-06, + "loss": 0.243, + "step": 6240 + }, + { + "epoch": 0.6242, + "grad_norm": 0.6472600102424622, + "learning_rate": 7.438793343129605e-06, + "loss": 0.0424, + "step": 6242 + }, + { + "epoch": 0.6244, + "grad_norm": 4.663537502288818, + "learning_rate": 7.432045513918122e-06, + "loss": 0.4507, + "step": 6244 + }, + { + "epoch": 0.6246, + "grad_norm": 4.220770835876465, + "learning_rate": 7.4252989362964635e-06, + "loss": 0.2757, + "step": 6246 + }, + { + "epoch": 0.6248, + "grad_norm": 0.5799724459648132, + "learning_rate": 7.418553613552824e-06, + "loss": 0.0613, + "step": 6248 + }, + { + "epoch": 0.625, + "grad_norm": 2.179457187652588, + "learning_rate": 7.411809548974792e-06, + "loss": 0.2187, + "step": 6250 + }, + { + "epoch": 0.6252, + "grad_norm": 4.666337490081787, + "learning_rate": 7.405066745849347e-06, + "loss": 0.2373, + "step": 6252 + }, + { + "epoch": 0.6254, + "grad_norm": 3.8348867893218994, + "learning_rate": 7.398325207462846e-06, + "loss": 0.3517, + "step": 6254 + }, + { + "epoch": 0.6256, + "grad_norm": 1.0409650802612305, + "learning_rate": 7.391584937101034e-06, + "loss": 0.3424, + "step": 6256 + }, + { + "epoch": 0.6258, + "grad_norm": 1.4916540384292603, + "learning_rate": 7.384845938049033e-06, + "loss": 0.1239, + "step": 6258 + }, + { + "epoch": 0.626, + "grad_norm": 3.6912012100219727, + "learning_rate": 7.378108213591355e-06, + "loss": 0.187, + "step": 6260 + }, + { + "epoch": 0.6262, + "grad_norm": 4.102867126464844, + "learning_rate": 7.37137176701188e-06, + "loss": 0.3724, + "step": 6262 + }, + { + "epoch": 0.6264, + "grad_norm": 2.246795177459717, + "learning_rate": 7.364636601593875e-06, + "loss": 0.2154, + "step": 6264 + }, + { + "epoch": 0.6266, + "grad_norm": 5.188350200653076, + "learning_rate": 7.357902720619976e-06, + "loss": 0.5025, + "step": 6266 + }, + { + "epoch": 0.6268, + "grad_norm": 6.827349662780762, + "learning_rate": 7.351170127372191e-06, + "loss": 0.2871, + "step": 6268 + }, + { + "epoch": 0.627, + "grad_norm": 0.5769883990287781, + "learning_rate": 7.344438825131912e-06, + "loss": 0.1363, + "step": 6270 + }, + { + "epoch": 0.6272, + "grad_norm": 7.351644515991211, + "learning_rate": 7.33770881717989e-06, + "loss": 0.4259, + "step": 6272 + }, + { + "epoch": 0.6274, + "grad_norm": 0.504842221736908, + "learning_rate": 7.330980106796247e-06, + "loss": 0.2406, + "step": 6274 + }, + { + "epoch": 0.6276, + "grad_norm": 2.1691486835479736, + "learning_rate": 7.324252697260475e-06, + "loss": 0.1584, + "step": 6276 + }, + { + "epoch": 0.6278, + "grad_norm": 5.3428168296813965, + "learning_rate": 7.3175265918514335e-06, + "loss": 0.4572, + "step": 6278 + }, + { + "epoch": 0.628, + "grad_norm": 0.8083658218383789, + "learning_rate": 7.310801793847344e-06, + "loss": 0.1964, + "step": 6280 + }, + { + "epoch": 0.6282, + "grad_norm": 5.816895008087158, + "learning_rate": 7.3040783065257906e-06, + "loss": 0.3818, + "step": 6282 + }, + { + "epoch": 0.6284, + "grad_norm": 1.2228367328643799, + "learning_rate": 7.297356133163722e-06, + "loss": 0.3803, + "step": 6284 + }, + { + "epoch": 0.6286, + "grad_norm": 5.408708095550537, + "learning_rate": 7.290635277037442e-06, + "loss": 0.3071, + "step": 6286 + }, + { + "epoch": 0.6288, + "grad_norm": 2.6939327716827393, + "learning_rate": 7.283915741422611e-06, + "loss": 0.2189, + "step": 6288 + }, + { + "epoch": 0.629, + "grad_norm": 2.5225424766540527, + "learning_rate": 7.277197529594257e-06, + "loss": 0.1895, + "step": 6290 + }, + { + "epoch": 0.6292, + "grad_norm": 3.8928442001342773, + "learning_rate": 7.27048064482675e-06, + "loss": 0.426, + "step": 6292 + }, + { + "epoch": 0.6294, + "grad_norm": 3.936983346939087, + "learning_rate": 7.263765090393817e-06, + "loss": 0.2462, + "step": 6294 + }, + { + "epoch": 0.6296, + "grad_norm": 7.017339706420898, + "learning_rate": 7.257050869568536e-06, + "loss": 0.3515, + "step": 6296 + }, + { + "epoch": 0.6298, + "grad_norm": 0.6321125030517578, + "learning_rate": 7.250337985623342e-06, + "loss": 0.151, + "step": 6298 + }, + { + "epoch": 0.63, + "grad_norm": 0.9765869379043579, + "learning_rate": 7.243626441830009e-06, + "loss": 0.1242, + "step": 6300 + }, + { + "epoch": 0.6302, + "grad_norm": 1.507826328277588, + "learning_rate": 7.236916241459664e-06, + "loss": 0.2533, + "step": 6302 + }, + { + "epoch": 0.6304, + "grad_norm": 0.21494236588478088, + "learning_rate": 7.2302073877827775e-06, + "loss": 0.3355, + "step": 6304 + }, + { + "epoch": 0.6306, + "grad_norm": 4.314621925354004, + "learning_rate": 7.22349988406916e-06, + "loss": 0.1436, + "step": 6306 + }, + { + "epoch": 0.6308, + "grad_norm": 5.294309139251709, + "learning_rate": 7.216793733587976e-06, + "loss": 0.2639, + "step": 6308 + }, + { + "epoch": 0.631, + "grad_norm": 6.2738237380981445, + "learning_rate": 7.210088939607709e-06, + "loss": 0.4507, + "step": 6310 + }, + { + "epoch": 0.6312, + "grad_norm": 2.386147975921631, + "learning_rate": 7.203385505396203e-06, + "loss": 0.1782, + "step": 6312 + }, + { + "epoch": 0.6314, + "grad_norm": 3.7573795318603516, + "learning_rate": 7.196683434220626e-06, + "loss": 0.1361, + "step": 6314 + }, + { + "epoch": 0.6316, + "grad_norm": 3.1449546813964844, + "learning_rate": 7.189982729347491e-06, + "loss": 0.2464, + "step": 6316 + }, + { + "epoch": 0.6318, + "grad_norm": 2.318754196166992, + "learning_rate": 7.1832833940426346e-06, + "loss": 0.1945, + "step": 6318 + }, + { + "epoch": 0.632, + "grad_norm": 2.0608620643615723, + "learning_rate": 7.176585431571235e-06, + "loss": 0.2532, + "step": 6320 + }, + { + "epoch": 0.6322, + "grad_norm": 0.38129159808158875, + "learning_rate": 7.169888845197798e-06, + "loss": 0.1788, + "step": 6322 + }, + { + "epoch": 0.6324, + "grad_norm": 0.8667468428611755, + "learning_rate": 7.163193638186159e-06, + "loss": 0.0585, + "step": 6324 + }, + { + "epoch": 0.6326, + "grad_norm": 2.90531849861145, + "learning_rate": 7.156499813799477e-06, + "loss": 0.2161, + "step": 6326 + }, + { + "epoch": 0.6328, + "grad_norm": 3.12203311920166, + "learning_rate": 7.149807375300239e-06, + "loss": 0.1944, + "step": 6328 + }, + { + "epoch": 0.633, + "grad_norm": 0.3441455662250519, + "learning_rate": 7.143116325950266e-06, + "loss": 0.0591, + "step": 6330 + }, + { + "epoch": 0.6332, + "grad_norm": 0.7827467322349548, + "learning_rate": 7.13642666901069e-06, + "loss": 0.1366, + "step": 6332 + }, + { + "epoch": 0.6334, + "grad_norm": 2.813955783843994, + "learning_rate": 7.129738407741964e-06, + "loss": 0.3517, + "step": 6334 + }, + { + "epoch": 0.6336, + "grad_norm": 5.7530059814453125, + "learning_rate": 7.123051545403874e-06, + "loss": 0.5317, + "step": 6336 + }, + { + "epoch": 0.6338, + "grad_norm": 9.31773853302002, + "learning_rate": 7.116366085255511e-06, + "loss": 0.3408, + "step": 6338 + }, + { + "epoch": 0.634, + "grad_norm": 10.848834037780762, + "learning_rate": 7.109682030555283e-06, + "loss": 0.4887, + "step": 6340 + }, + { + "epoch": 0.6342, + "grad_norm": 1.1706467866897583, + "learning_rate": 7.102999384560927e-06, + "loss": 0.1145, + "step": 6342 + }, + { + "epoch": 0.6344, + "grad_norm": 1.2128900289535522, + "learning_rate": 7.096318150529476e-06, + "loss": 0.3285, + "step": 6344 + }, + { + "epoch": 0.6346, + "grad_norm": 2.967592239379883, + "learning_rate": 7.0896383317172845e-06, + "loss": 0.2152, + "step": 6346 + }, + { + "epoch": 0.6348, + "grad_norm": 7.55496072769165, + "learning_rate": 7.082959931380011e-06, + "loss": 0.2608, + "step": 6348 + }, + { + "epoch": 0.635, + "grad_norm": 2.1344854831695557, + "learning_rate": 7.076282952772634e-06, + "loss": 0.1355, + "step": 6350 + }, + { + "epoch": 0.6352, + "grad_norm": 4.66903829574585, + "learning_rate": 7.069607399149427e-06, + "loss": 0.3225, + "step": 6352 + }, + { + "epoch": 0.6354, + "grad_norm": 1.3044642210006714, + "learning_rate": 7.062933273763974e-06, + "loss": 0.0703, + "step": 6354 + }, + { + "epoch": 0.6356, + "grad_norm": 1.803135633468628, + "learning_rate": 7.056260579869165e-06, + "loss": 0.1677, + "step": 6356 + }, + { + "epoch": 0.6358, + "grad_norm": 1.6618908643722534, + "learning_rate": 7.049589320717186e-06, + "loss": 0.2463, + "step": 6358 + }, + { + "epoch": 0.636, + "grad_norm": 3.369168996810913, + "learning_rate": 7.042919499559538e-06, + "loss": 0.3322, + "step": 6360 + }, + { + "epoch": 0.6362, + "grad_norm": 3.565838098526001, + "learning_rate": 7.036251119646993e-06, + "loss": 0.1285, + "step": 6362 + }, + { + "epoch": 0.6364, + "grad_norm": 0.2952379286289215, + "learning_rate": 7.029584184229653e-06, + "loss": 0.3883, + "step": 6364 + }, + { + "epoch": 0.6366, + "grad_norm": 3.9717161655426025, + "learning_rate": 7.022918696556896e-06, + "loss": 0.1468, + "step": 6366 + }, + { + "epoch": 0.6368, + "grad_norm": 0.8217795491218567, + "learning_rate": 7.016254659877398e-06, + "loss": 0.4011, + "step": 6368 + }, + { + "epoch": 0.637, + "grad_norm": 6.282580375671387, + "learning_rate": 7.009592077439135e-06, + "loss": 0.695, + "step": 6370 + }, + { + "epoch": 0.6372, + "grad_norm": 39.09699630737305, + "learning_rate": 7.002930952489362e-06, + "loss": 0.7929, + "step": 6372 + }, + { + "epoch": 0.6374, + "grad_norm": 8.828478813171387, + "learning_rate": 6.996271288274636e-06, + "loss": 0.3987, + "step": 6374 + }, + { + "epoch": 0.6376, + "grad_norm": 3.965907096862793, + "learning_rate": 6.9896130880407965e-06, + "loss": 0.2976, + "step": 6376 + }, + { + "epoch": 0.6378, + "grad_norm": 1.0318095684051514, + "learning_rate": 6.982956355032968e-06, + "loss": 0.0791, + "step": 6378 + }, + { + "epoch": 0.638, + "grad_norm": 2.649975061416626, + "learning_rate": 6.976301092495556e-06, + "loss": 0.3148, + "step": 6380 + }, + { + "epoch": 0.6382, + "grad_norm": 3.1535093784332275, + "learning_rate": 6.969647303672262e-06, + "loss": 0.3323, + "step": 6382 + }, + { + "epoch": 0.6384, + "grad_norm": 3.042024612426758, + "learning_rate": 6.962994991806059e-06, + "loss": 0.3316, + "step": 6384 + }, + { + "epoch": 0.6386, + "grad_norm": 1.9274065494537354, + "learning_rate": 6.956344160139201e-06, + "loss": 0.1584, + "step": 6386 + }, + { + "epoch": 0.6388, + "grad_norm": 1.788570761680603, + "learning_rate": 6.949694811913226e-06, + "loss": 0.2079, + "step": 6388 + }, + { + "epoch": 0.639, + "grad_norm": 6.205123424530029, + "learning_rate": 6.943046950368944e-06, + "loss": 0.3384, + "step": 6390 + }, + { + "epoch": 0.6392, + "grad_norm": 4.549469947814941, + "learning_rate": 6.9364005787464406e-06, + "loss": 0.2976, + "step": 6392 + }, + { + "epoch": 0.6394, + "grad_norm": 4.198714256286621, + "learning_rate": 6.929755700285082e-06, + "loss": 0.2392, + "step": 6394 + }, + { + "epoch": 0.6396, + "grad_norm": 0.4947677254676819, + "learning_rate": 6.923112318223497e-06, + "loss": 0.0766, + "step": 6396 + }, + { + "epoch": 0.6398, + "grad_norm": 3.3645591735839844, + "learning_rate": 6.9164704357995874e-06, + "loss": 0.1538, + "step": 6398 + }, + { + "epoch": 0.64, + "grad_norm": 2.3637948036193848, + "learning_rate": 6.909830056250527e-06, + "loss": 0.2284, + "step": 6400 + }, + { + "epoch": 0.6402, + "grad_norm": 0.23701079189777374, + "learning_rate": 6.903191182812759e-06, + "loss": 0.1152, + "step": 6402 + }, + { + "epoch": 0.6404, + "grad_norm": 4.584061145782471, + "learning_rate": 6.896553818721989e-06, + "loss": 0.2817, + "step": 6404 + }, + { + "epoch": 0.6406, + "grad_norm": 1.1131246089935303, + "learning_rate": 6.889917967213184e-06, + "loss": 0.0679, + "step": 6406 + }, + { + "epoch": 0.6408, + "grad_norm": 2.722517251968384, + "learning_rate": 6.883283631520582e-06, + "loss": 0.2221, + "step": 6408 + }, + { + "epoch": 0.641, + "grad_norm": 4.360770225524902, + "learning_rate": 6.876650814877675e-06, + "loss": 0.3344, + "step": 6410 + }, + { + "epoch": 0.6412, + "grad_norm": 1.0865607261657715, + "learning_rate": 6.870019520517217e-06, + "loss": 0.0763, + "step": 6412 + }, + { + "epoch": 0.6414, + "grad_norm": 6.864331245422363, + "learning_rate": 6.863389751671225e-06, + "loss": 0.4139, + "step": 6414 + }, + { + "epoch": 0.6416, + "grad_norm": 2.472266674041748, + "learning_rate": 6.856761511570963e-06, + "loss": 0.1757, + "step": 6416 + }, + { + "epoch": 0.6418, + "grad_norm": 3.6313846111297607, + "learning_rate": 6.850134803446955e-06, + "loss": 0.1482, + "step": 6418 + }, + { + "epoch": 0.642, + "grad_norm": 1.0550991296768188, + "learning_rate": 6.843509630528977e-06, + "loss": 0.3351, + "step": 6420 + }, + { + "epoch": 0.6422, + "grad_norm": 5.514056205749512, + "learning_rate": 6.836885996046061e-06, + "loss": 0.399, + "step": 6422 + }, + { + "epoch": 0.6424, + "grad_norm": 0.3298647403717041, + "learning_rate": 6.830263903226483e-06, + "loss": 0.0232, + "step": 6424 + }, + { + "epoch": 0.6426, + "grad_norm": 6.18208122253418, + "learning_rate": 6.823643355297774e-06, + "loss": 0.7748, + "step": 6426 + }, + { + "epoch": 0.6428, + "grad_norm": 3.237090826034546, + "learning_rate": 6.8170243554867065e-06, + "loss": 0.1665, + "step": 6428 + }, + { + "epoch": 0.643, + "grad_norm": 1.1156134605407715, + "learning_rate": 6.8104069070193e-06, + "loss": 0.3347, + "step": 6430 + }, + { + "epoch": 0.6432, + "grad_norm": 5.968198776245117, + "learning_rate": 6.803791013120822e-06, + "loss": 0.1728, + "step": 6432 + }, + { + "epoch": 0.6434, + "grad_norm": 0.2075730860233307, + "learning_rate": 6.797176677015775e-06, + "loss": 0.0224, + "step": 6434 + }, + { + "epoch": 0.6436, + "grad_norm": 1.2049905061721802, + "learning_rate": 6.790563901927907e-06, + "loss": 0.1032, + "step": 6436 + }, + { + "epoch": 0.6438, + "grad_norm": 1.3712271451950073, + "learning_rate": 6.783952691080203e-06, + "loss": 0.0718, + "step": 6438 + }, + { + "epoch": 0.644, + "grad_norm": 3.43179988861084, + "learning_rate": 6.777343047694891e-06, + "loss": 0.1669, + "step": 6440 + }, + { + "epoch": 0.6442, + "grad_norm": 11.397260665893555, + "learning_rate": 6.770734974993427e-06, + "loss": 0.1948, + "step": 6442 + }, + { + "epoch": 0.6444, + "grad_norm": 1.9390872716903687, + "learning_rate": 6.764128476196505e-06, + "loss": 0.0765, + "step": 6444 + }, + { + "epoch": 0.6446, + "grad_norm": 6.401348114013672, + "learning_rate": 6.757523554524056e-06, + "loss": 0.2845, + "step": 6446 + }, + { + "epoch": 0.6448, + "grad_norm": 3.1447227001190186, + "learning_rate": 6.750920213195238e-06, + "loss": 0.4319, + "step": 6448 + }, + { + "epoch": 0.645, + "grad_norm": 4.017346382141113, + "learning_rate": 6.744318455428436e-06, + "loss": 0.1031, + "step": 6450 + }, + { + "epoch": 0.6452, + "grad_norm": 1.3796619176864624, + "learning_rate": 6.737718284441267e-06, + "loss": 0.0477, + "step": 6452 + }, + { + "epoch": 0.6454, + "grad_norm": 3.182954788208008, + "learning_rate": 6.731119703450577e-06, + "loss": 0.2814, + "step": 6454 + }, + { + "epoch": 0.6456, + "grad_norm": 5.484886169433594, + "learning_rate": 6.7245227156724324e-06, + "loss": 0.4641, + "step": 6456 + }, + { + "epoch": 0.6458, + "grad_norm": 5.996239185333252, + "learning_rate": 6.717927324322124e-06, + "loss": 0.1677, + "step": 6458 + }, + { + "epoch": 0.646, + "grad_norm": 3.2010064125061035, + "learning_rate": 6.711333532614168e-06, + "loss": 0.0858, + "step": 6460 + }, + { + "epoch": 0.6462, + "grad_norm": 3.4453189373016357, + "learning_rate": 6.704741343762296e-06, + "loss": 0.5131, + "step": 6462 + }, + { + "epoch": 0.6464, + "grad_norm": 4.104238033294678, + "learning_rate": 6.698150760979463e-06, + "loss": 0.2305, + "step": 6464 + }, + { + "epoch": 0.6466, + "grad_norm": 4.207293510437012, + "learning_rate": 6.69156178747784e-06, + "loss": 0.1783, + "step": 6466 + }, + { + "epoch": 0.6468, + "grad_norm": 3.592609405517578, + "learning_rate": 6.684974426468809e-06, + "loss": 0.1946, + "step": 6468 + }, + { + "epoch": 0.647, + "grad_norm": 11.923818588256836, + "learning_rate": 6.67838868116297e-06, + "loss": 0.3818, + "step": 6470 + }, + { + "epoch": 0.6472, + "grad_norm": 6.490572929382324, + "learning_rate": 6.671804554770135e-06, + "loss": 0.5202, + "step": 6472 + }, + { + "epoch": 0.6474, + "grad_norm": 4.420390605926514, + "learning_rate": 6.6652220504993305e-06, + "loss": 0.166, + "step": 6474 + }, + { + "epoch": 0.6476, + "grad_norm": 5.631129741668701, + "learning_rate": 6.658641171558785e-06, + "loss": 0.464, + "step": 6476 + }, + { + "epoch": 0.6478, + "grad_norm": 2.996126651763916, + "learning_rate": 6.6520619211559435e-06, + "loss": 0.0729, + "step": 6478 + }, + { + "epoch": 0.648, + "grad_norm": 10.55669116973877, + "learning_rate": 6.645484302497452e-06, + "loss": 0.6677, + "step": 6480 + }, + { + "epoch": 0.6482, + "grad_norm": 8.567784309387207, + "learning_rate": 6.638908318789156e-06, + "loss": 0.5501, + "step": 6482 + }, + { + "epoch": 0.6484, + "grad_norm": 1.7309556007385254, + "learning_rate": 6.63233397323612e-06, + "loss": 0.081, + "step": 6484 + }, + { + "epoch": 0.6486, + "grad_norm": 0.9151462912559509, + "learning_rate": 6.62576126904259e-06, + "loss": 0.1594, + "step": 6486 + }, + { + "epoch": 0.6488, + "grad_norm": 5.116731643676758, + "learning_rate": 6.6191902094120295e-06, + "loss": 0.1946, + "step": 6488 + }, + { + "epoch": 0.649, + "grad_norm": 0.12371377646923065, + "learning_rate": 6.612620797547087e-06, + "loss": 0.0674, + "step": 6490 + }, + { + "epoch": 0.6492, + "grad_norm": 3.6194655895233154, + "learning_rate": 6.60605303664962e-06, + "loss": 0.4025, + "step": 6492 + }, + { + "epoch": 0.6494, + "grad_norm": 3.267848253250122, + "learning_rate": 6.5994869299206736e-06, + "loss": 0.3439, + "step": 6494 + }, + { + "epoch": 0.6496, + "grad_norm": 6.965150833129883, + "learning_rate": 6.5929224805604845e-06, + "loss": 0.0908, + "step": 6496 + }, + { + "epoch": 0.6498, + "grad_norm": 10.681925773620605, + "learning_rate": 6.58635969176849e-06, + "loss": 0.3249, + "step": 6498 + }, + { + "epoch": 0.65, + "grad_norm": 9.039934158325195, + "learning_rate": 6.579798566743314e-06, + "loss": 0.7083, + "step": 6500 + }, + { + "epoch": 0.6502, + "grad_norm": 4.179249286651611, + "learning_rate": 6.573239108682769e-06, + "loss": 0.4508, + "step": 6502 + }, + { + "epoch": 0.6504, + "grad_norm": 8.542593955993652, + "learning_rate": 6.566681320783849e-06, + "loss": 0.6101, + "step": 6504 + }, + { + "epoch": 0.6506, + "grad_norm": 5.450608730316162, + "learning_rate": 6.560125206242746e-06, + "loss": 0.6664, + "step": 6506 + }, + { + "epoch": 0.6508, + "grad_norm": 5.862269878387451, + "learning_rate": 6.553570768254831e-06, + "loss": 0.4157, + "step": 6508 + }, + { + "epoch": 0.651, + "grad_norm": 3.7710630893707275, + "learning_rate": 6.547018010014654e-06, + "loss": 0.3263, + "step": 6510 + }, + { + "epoch": 0.6512, + "grad_norm": 2.708002805709839, + "learning_rate": 6.540466934715953e-06, + "loss": 0.1032, + "step": 6512 + }, + { + "epoch": 0.6514, + "grad_norm": 2.859264373779297, + "learning_rate": 6.53391754555164e-06, + "loss": 0.2352, + "step": 6514 + }, + { + "epoch": 0.6516, + "grad_norm": 4.711730003356934, + "learning_rate": 6.52736984571381e-06, + "loss": 0.2064, + "step": 6516 + }, + { + "epoch": 0.6518, + "grad_norm": 3.9252982139587402, + "learning_rate": 6.520823838393732e-06, + "loss": 0.3695, + "step": 6518 + }, + { + "epoch": 0.652, + "grad_norm": 1.9727703332901, + "learning_rate": 6.5142795267818505e-06, + "loss": 0.7446, + "step": 6520 + }, + { + "epoch": 0.6522, + "grad_norm": 6.077550411224365, + "learning_rate": 6.5077369140677815e-06, + "loss": 0.3251, + "step": 6522 + }, + { + "epoch": 0.6524, + "grad_norm": 3.48382568359375, + "learning_rate": 6.501196003440313e-06, + "loss": 0.216, + "step": 6524 + }, + { + "epoch": 0.6526, + "grad_norm": 3.106964349746704, + "learning_rate": 6.494656798087412e-06, + "loss": 0.2092, + "step": 6526 + }, + { + "epoch": 0.6528, + "grad_norm": 0.22142212092876434, + "learning_rate": 6.488119301196201e-06, + "loss": 0.2506, + "step": 6528 + }, + { + "epoch": 0.653, + "grad_norm": 3.1787192821502686, + "learning_rate": 6.481583515952983e-06, + "loss": 0.199, + "step": 6530 + }, + { + "epoch": 0.6532, + "grad_norm": 0.7442744374275208, + "learning_rate": 6.475049445543215e-06, + "loss": 0.0764, + "step": 6532 + }, + { + "epoch": 0.6534, + "grad_norm": 0.9041438102722168, + "learning_rate": 6.468517093151525e-06, + "loss": 0.2355, + "step": 6534 + }, + { + "epoch": 0.6536, + "grad_norm": 2.789360523223877, + "learning_rate": 6.461986461961706e-06, + "loss": 0.2319, + "step": 6536 + }, + { + "epoch": 0.6538, + "grad_norm": 1.9806697368621826, + "learning_rate": 6.455457555156706e-06, + "loss": 0.1697, + "step": 6538 + }, + { + "epoch": 0.654, + "grad_norm": 1.935733437538147, + "learning_rate": 6.448930375918632e-06, + "loss": 0.0851, + "step": 6540 + }, + { + "epoch": 0.6542, + "grad_norm": 3.469123363494873, + "learning_rate": 6.442404927428751e-06, + "loss": 0.4734, + "step": 6542 + }, + { + "epoch": 0.6544, + "grad_norm": 5.11306095123291, + "learning_rate": 6.435881212867494e-06, + "loss": 0.1036, + "step": 6544 + }, + { + "epoch": 0.6546, + "grad_norm": 1.3808881044387817, + "learning_rate": 6.4293592354144365e-06, + "loss": 0.0615, + "step": 6546 + }, + { + "epoch": 0.6548, + "grad_norm": 0.44541388750076294, + "learning_rate": 6.422838998248308e-06, + "loss": 0.1286, + "step": 6548 + }, + { + "epoch": 0.655, + "grad_norm": 0.8696237802505493, + "learning_rate": 6.4163205045469975e-06, + "loss": 0.2923, + "step": 6550 + }, + { + "epoch": 0.6552, + "grad_norm": 2.508239507675171, + "learning_rate": 6.409803757487539e-06, + "loss": 0.3836, + "step": 6552 + }, + { + "epoch": 0.6554, + "grad_norm": 0.8963150382041931, + "learning_rate": 6.403288760246112e-06, + "loss": 0.0613, + "step": 6554 + }, + { + "epoch": 0.6556, + "grad_norm": 0.27460727095603943, + "learning_rate": 6.396775515998055e-06, + "loss": 0.1789, + "step": 6556 + }, + { + "epoch": 0.6558, + "grad_norm": 3.072242498397827, + "learning_rate": 6.390264027917836e-06, + "loss": 0.12, + "step": 6558 + }, + { + "epoch": 0.656, + "grad_norm": 0.7918325662612915, + "learning_rate": 6.383754299179079e-06, + "loss": 0.2098, + "step": 6560 + }, + { + "epoch": 0.6562, + "grad_norm": 1.016815423965454, + "learning_rate": 6.377246332954544e-06, + "loss": 0.0952, + "step": 6562 + }, + { + "epoch": 0.6564, + "grad_norm": 0.5786592960357666, + "learning_rate": 6.370740132416138e-06, + "loss": 0.1838, + "step": 6564 + }, + { + "epoch": 0.6566, + "grad_norm": 0.9609250426292419, + "learning_rate": 6.364235700734903e-06, + "loss": 0.2258, + "step": 6566 + }, + { + "epoch": 0.6568, + "grad_norm": 2.461491107940674, + "learning_rate": 6.357733041081018e-06, + "loss": 0.2283, + "step": 6568 + }, + { + "epoch": 0.657, + "grad_norm": 0.3781920373439789, + "learning_rate": 6.351232156623803e-06, + "loss": 0.0417, + "step": 6570 + }, + { + "epoch": 0.6572, + "grad_norm": 0.31984400749206543, + "learning_rate": 6.344733050531713e-06, + "loss": 0.038, + "step": 6572 + }, + { + "epoch": 0.6574, + "grad_norm": 1.297649621963501, + "learning_rate": 6.338235725972326e-06, + "loss": 0.1905, + "step": 6574 + }, + { + "epoch": 0.6576, + "grad_norm": 1.1986666917800903, + "learning_rate": 6.33174018611236e-06, + "loss": 0.3647, + "step": 6576 + }, + { + "epoch": 0.6578, + "grad_norm": 4.630595684051514, + "learning_rate": 6.325246434117669e-06, + "loss": 0.2464, + "step": 6578 + }, + { + "epoch": 0.658, + "grad_norm": 4.778238773345947, + "learning_rate": 6.318754473153221e-06, + "loss": 0.2855, + "step": 6580 + }, + { + "epoch": 0.6582, + "grad_norm": 2.4158637523651123, + "learning_rate": 6.3122643063831245e-06, + "loss": 0.1694, + "step": 6582 + }, + { + "epoch": 0.6584, + "grad_norm": 2.764516592025757, + "learning_rate": 6.305775936970606e-06, + "loss": 0.1444, + "step": 6584 + }, + { + "epoch": 0.6586, + "grad_norm": 2.75319242477417, + "learning_rate": 6.299289368078016e-06, + "loss": 0.1751, + "step": 6586 + }, + { + "epoch": 0.6588, + "grad_norm": 6.393750190734863, + "learning_rate": 6.292804602866833e-06, + "loss": 0.4834, + "step": 6588 + }, + { + "epoch": 0.659, + "grad_norm": 8.444443702697754, + "learning_rate": 6.286321644497655e-06, + "loss": 0.4481, + "step": 6590 + }, + { + "epoch": 0.6592, + "grad_norm": 1.2433717250823975, + "learning_rate": 6.27984049613019e-06, + "loss": 0.0856, + "step": 6592 + }, + { + "epoch": 0.6594, + "grad_norm": 1.6846582889556885, + "learning_rate": 6.273361160923271e-06, + "loss": 0.0643, + "step": 6594 + }, + { + "epoch": 0.6596, + "grad_norm": 7.735212802886963, + "learning_rate": 6.2668836420348535e-06, + "loss": 0.6197, + "step": 6596 + }, + { + "epoch": 0.6598, + "grad_norm": 0.14022639393806458, + "learning_rate": 6.260407942621998e-06, + "loss": 0.1622, + "step": 6598 + }, + { + "epoch": 0.66, + "grad_norm": 7.273261547088623, + "learning_rate": 6.25393406584088e-06, + "loss": 0.3486, + "step": 6600 + }, + { + "epoch": 0.6602, + "grad_norm": 0.1943744421005249, + "learning_rate": 6.247462014846793e-06, + "loss": 0.0077, + "step": 6602 + }, + { + "epoch": 0.6604, + "grad_norm": 3.5570404529571533, + "learning_rate": 6.240991792794133e-06, + "loss": 0.0857, + "step": 6604 + }, + { + "epoch": 0.6606, + "grad_norm": 2.653998374938965, + "learning_rate": 6.234523402836408e-06, + "loss": 0.2019, + "step": 6606 + }, + { + "epoch": 0.6608, + "grad_norm": 2.0648839473724365, + "learning_rate": 6.228056848126236e-06, + "loss": 0.1286, + "step": 6608 + }, + { + "epoch": 0.661, + "grad_norm": 7.750906944274902, + "learning_rate": 6.22159213181533e-06, + "loss": 0.5501, + "step": 6610 + }, + { + "epoch": 0.6612, + "grad_norm": 10.969327926635742, + "learning_rate": 6.2151292570545215e-06, + "loss": 0.5064, + "step": 6612 + }, + { + "epoch": 0.6614, + "grad_norm": 0.7909489274024963, + "learning_rate": 6.208668226993731e-06, + "loss": 0.2372, + "step": 6614 + }, + { + "epoch": 0.6616, + "grad_norm": 0.6686945557594299, + "learning_rate": 6.202209044781991e-06, + "loss": 0.0234, + "step": 6616 + }, + { + "epoch": 0.6618, + "grad_norm": 0.22913715243339539, + "learning_rate": 6.195751713567426e-06, + "loss": 0.1632, + "step": 6618 + }, + { + "epoch": 0.662, + "grad_norm": 4.912769317626953, + "learning_rate": 6.18929623649726e-06, + "loss": 0.1905, + "step": 6620 + }, + { + "epoch": 0.6622, + "grad_norm": 1.8873761892318726, + "learning_rate": 6.182842616717817e-06, + "loss": 0.5043, + "step": 6622 + }, + { + "epoch": 0.6624, + "grad_norm": 3.2329585552215576, + "learning_rate": 6.176390857374508e-06, + "loss": 0.4633, + "step": 6624 + }, + { + "epoch": 0.6626, + "grad_norm": 2.178720474243164, + "learning_rate": 6.169940961611853e-06, + "loss": 0.5955, + "step": 6626 + }, + { + "epoch": 0.6628, + "grad_norm": 2.3060073852539062, + "learning_rate": 6.1634929325734385e-06, + "loss": 0.1896, + "step": 6628 + }, + { + "epoch": 0.663, + "grad_norm": 0.46830904483795166, + "learning_rate": 6.157046773401964e-06, + "loss": 0.3444, + "step": 6630 + }, + { + "epoch": 0.6632, + "grad_norm": 1.9609344005584717, + "learning_rate": 6.150602487239207e-06, + "loss": 0.1196, + "step": 6632 + }, + { + "epoch": 0.6634, + "grad_norm": 0.8874260783195496, + "learning_rate": 6.144160077226035e-06, + "loss": 0.5097, + "step": 6634 + }, + { + "epoch": 0.6636, + "grad_norm": 1.2160536050796509, + "learning_rate": 6.137719546502401e-06, + "loss": 0.0515, + "step": 6636 + }, + { + "epoch": 0.6638, + "grad_norm": 2.285950183868408, + "learning_rate": 6.131280898207339e-06, + "loss": 0.7818, + "step": 6638 + }, + { + "epoch": 0.664, + "grad_norm": 3.6035027503967285, + "learning_rate": 6.124844135478971e-06, + "loss": 0.2123, + "step": 6640 + }, + { + "epoch": 0.6642, + "grad_norm": 2.13199520111084, + "learning_rate": 6.118409261454494e-06, + "loss": 0.114, + "step": 6642 + }, + { + "epoch": 0.6644, + "grad_norm": 0.69740229845047, + "learning_rate": 6.1119762792701935e-06, + "loss": 0.1644, + "step": 6644 + }, + { + "epoch": 0.6646, + "grad_norm": 0.3627305030822754, + "learning_rate": 6.1055451920614165e-06, + "loss": 0.0964, + "step": 6646 + }, + { + "epoch": 0.6648, + "grad_norm": 1.7360920906066895, + "learning_rate": 6.099116002962604e-06, + "loss": 0.0732, + "step": 6648 + }, + { + "epoch": 0.665, + "grad_norm": 0.34170255064964294, + "learning_rate": 6.092688715107265e-06, + "loss": 0.0883, + "step": 6650 + }, + { + "epoch": 0.6652, + "grad_norm": 2.9521923065185547, + "learning_rate": 6.086263331627976e-06, + "loss": 0.2461, + "step": 6652 + }, + { + "epoch": 0.6654, + "grad_norm": 0.4287213981151581, + "learning_rate": 6.079839855656397e-06, + "loss": 0.0339, + "step": 6654 + }, + { + "epoch": 0.6656, + "grad_norm": 1.477308988571167, + "learning_rate": 6.073418290323251e-06, + "loss": 0.2821, + "step": 6656 + }, + { + "epoch": 0.6658, + "grad_norm": 1.6898819208145142, + "learning_rate": 6.066998638758326e-06, + "loss": 0.1759, + "step": 6658 + }, + { + "epoch": 0.666, + "grad_norm": 0.5654078722000122, + "learning_rate": 6.06058090409049e-06, + "loss": 0.0985, + "step": 6660 + }, + { + "epoch": 0.6662, + "grad_norm": 1.0496320724487305, + "learning_rate": 6.054165089447663e-06, + "loss": 0.119, + "step": 6662 + }, + { + "epoch": 0.6664, + "grad_norm": 6.669126033782959, + "learning_rate": 6.047751197956838e-06, + "loss": 0.4707, + "step": 6664 + }, + { + "epoch": 0.6666, + "grad_norm": 4.0150980949401855, + "learning_rate": 6.0413392327440635e-06, + "loss": 0.0858, + "step": 6666 + }, + { + "epoch": 0.6668, + "grad_norm": 2.4150116443634033, + "learning_rate": 6.0349291969344595e-06, + "loss": 0.1345, + "step": 6668 + }, + { + "epoch": 0.667, + "grad_norm": 3.9722697734832764, + "learning_rate": 6.028521093652195e-06, + "loss": 0.2708, + "step": 6670 + }, + { + "epoch": 0.6672, + "grad_norm": 0.6189969778060913, + "learning_rate": 6.022114926020504e-06, + "loss": 0.2323, + "step": 6672 + }, + { + "epoch": 0.6674, + "grad_norm": 2.4969310760498047, + "learning_rate": 6.015710697161674e-06, + "loss": 0.1239, + "step": 6674 + }, + { + "epoch": 0.6676, + "grad_norm": 0.6136898994445801, + "learning_rate": 6.009308410197048e-06, + "loss": 0.1073, + "step": 6676 + }, + { + "epoch": 0.6678, + "grad_norm": 8.434051513671875, + "learning_rate": 6.002908068247024e-06, + "loss": 0.3262, + "step": 6678 + }, + { + "epoch": 0.668, + "grad_norm": 3.5076684951782227, + "learning_rate": 5.996509674431053e-06, + "loss": 0.063, + "step": 6680 + }, + { + "epoch": 0.6682, + "grad_norm": 4.880193710327148, + "learning_rate": 5.990113231867629e-06, + "loss": 0.187, + "step": 6682 + }, + { + "epoch": 0.6684, + "grad_norm": 3.125481367111206, + "learning_rate": 5.983718743674302e-06, + "loss": 0.2459, + "step": 6684 + }, + { + "epoch": 0.6686, + "grad_norm": 2.44232439994812, + "learning_rate": 5.977326212967671e-06, + "loss": 0.2576, + "step": 6686 + }, + { + "epoch": 0.6688, + "grad_norm": 4.743481636047363, + "learning_rate": 5.970935642863375e-06, + "loss": 0.3382, + "step": 6688 + }, + { + "epoch": 0.669, + "grad_norm": 1.0559309720993042, + "learning_rate": 5.9645470364761e-06, + "loss": 0.0992, + "step": 6690 + }, + { + "epoch": 0.6692, + "grad_norm": 0.4226183295249939, + "learning_rate": 5.958160396919577e-06, + "loss": 0.0208, + "step": 6692 + }, + { + "epoch": 0.6694, + "grad_norm": 5.542026042938232, + "learning_rate": 5.951775727306577e-06, + "loss": 0.2516, + "step": 6694 + }, + { + "epoch": 0.6696, + "grad_norm": 1.0688104629516602, + "learning_rate": 5.94539303074891e-06, + "loss": 0.1158, + "step": 6696 + }, + { + "epoch": 0.6698, + "grad_norm": 0.3828153908252716, + "learning_rate": 5.939012310357422e-06, + "loss": 0.0273, + "step": 6698 + }, + { + "epoch": 0.67, + "grad_norm": 2.848578453063965, + "learning_rate": 5.932633569242e-06, + "loss": 0.195, + "step": 6700 + }, + { + "epoch": 0.6702, + "grad_norm": 6.917814254760742, + "learning_rate": 5.926256810511566e-06, + "loss": 0.3184, + "step": 6702 + }, + { + "epoch": 0.6704, + "grad_norm": 1.2026150226593018, + "learning_rate": 5.9198820372740726e-06, + "loss": 0.0604, + "step": 6704 + }, + { + "epoch": 0.6706, + "grad_norm": 0.36246877908706665, + "learning_rate": 5.913509252636511e-06, + "loss": 0.0423, + "step": 6706 + }, + { + "epoch": 0.6708, + "grad_norm": 4.020970344543457, + "learning_rate": 5.907138459704895e-06, + "loss": 0.096, + "step": 6708 + }, + { + "epoch": 0.671, + "grad_norm": 1.7861522436141968, + "learning_rate": 5.900769661584273e-06, + "loss": 0.2243, + "step": 6710 + }, + { + "epoch": 0.6712, + "grad_norm": 0.3854908049106598, + "learning_rate": 5.894402861378721e-06, + "loss": 0.2225, + "step": 6712 + }, + { + "epoch": 0.6714, + "grad_norm": 4.2171406745910645, + "learning_rate": 5.88803806219134e-06, + "loss": 0.3086, + "step": 6714 + }, + { + "epoch": 0.6716, + "grad_norm": 0.433533638715744, + "learning_rate": 5.881675267124254e-06, + "loss": 0.0149, + "step": 6716 + }, + { + "epoch": 0.6718, + "grad_norm": 0.182979017496109, + "learning_rate": 5.8753144792786095e-06, + "loss": 0.0205, + "step": 6718 + }, + { + "epoch": 0.672, + "grad_norm": 7.810181140899658, + "learning_rate": 5.868955701754584e-06, + "loss": 0.4034, + "step": 6720 + }, + { + "epoch": 0.6722, + "grad_norm": 7.797017574310303, + "learning_rate": 5.862598937651364e-06, + "loss": 0.3482, + "step": 6722 + }, + { + "epoch": 0.6724, + "grad_norm": 3.13995361328125, + "learning_rate": 5.85624419006716e-06, + "loss": 0.2856, + "step": 6724 + }, + { + "epoch": 0.6726, + "grad_norm": 19.26747703552246, + "learning_rate": 5.849891462099199e-06, + "loss": 0.1823, + "step": 6726 + }, + { + "epoch": 0.6728, + "grad_norm": 0.3839501440525055, + "learning_rate": 5.843540756843722e-06, + "loss": 0.036, + "step": 6728 + }, + { + "epoch": 0.673, + "grad_norm": 0.5213305354118347, + "learning_rate": 5.83719207739599e-06, + "loss": 0.7155, + "step": 6730 + }, + { + "epoch": 0.6732, + "grad_norm": 4.890913486480713, + "learning_rate": 5.830845426850268e-06, + "loss": 0.375, + "step": 6732 + }, + { + "epoch": 0.6734, + "grad_norm": 7.376688003540039, + "learning_rate": 5.824500808299836e-06, + "loss": 0.6251, + "step": 6734 + }, + { + "epoch": 0.6736, + "grad_norm": 3.774182081222534, + "learning_rate": 5.818158224836987e-06, + "loss": 0.2319, + "step": 6736 + }, + { + "epoch": 0.6738, + "grad_norm": 1.116968035697937, + "learning_rate": 5.811817679553018e-06, + "loss": 0.032, + "step": 6738 + }, + { + "epoch": 0.674, + "grad_norm": 8.257487297058105, + "learning_rate": 5.8054791755382286e-06, + "loss": 0.6366, + "step": 6740 + }, + { + "epoch": 0.6742, + "grad_norm": 2.180586338043213, + "learning_rate": 5.799142715881938e-06, + "loss": 0.3145, + "step": 6742 + }, + { + "epoch": 0.6744, + "grad_norm": 1.4056130647659302, + "learning_rate": 5.792808303672454e-06, + "loss": 0.1479, + "step": 6744 + }, + { + "epoch": 0.6746, + "grad_norm": 2.5428872108459473, + "learning_rate": 5.786475941997094e-06, + "loss": 0.1678, + "step": 6746 + }, + { + "epoch": 0.6748, + "grad_norm": 3.146336317062378, + "learning_rate": 5.780145633942173e-06, + "loss": 0.8402, + "step": 6748 + }, + { + "epoch": 0.675, + "grad_norm": 1.531263828277588, + "learning_rate": 5.773817382593008e-06, + "loss": 0.3072, + "step": 6750 + }, + { + "epoch": 0.6752, + "grad_norm": 4.328049182891846, + "learning_rate": 5.7674911910339094e-06, + "loss": 0.2623, + "step": 6752 + }, + { + "epoch": 0.6754, + "grad_norm": 0.20466819405555725, + "learning_rate": 5.761167062348187e-06, + "loss": 0.1022, + "step": 6754 + }, + { + "epoch": 0.6756, + "grad_norm": 1.17844557762146, + "learning_rate": 5.754844999618144e-06, + "loss": 0.0592, + "step": 6756 + }, + { + "epoch": 0.6758, + "grad_norm": 0.48447659611701965, + "learning_rate": 5.748525005925074e-06, + "loss": 0.0321, + "step": 6758 + }, + { + "epoch": 0.676, + "grad_norm": 0.338146448135376, + "learning_rate": 5.742207084349274e-06, + "loss": 0.0152, + "step": 6760 + }, + { + "epoch": 0.6762, + "grad_norm": 7.008608341217041, + "learning_rate": 5.735891237970015e-06, + "loss": 0.4365, + "step": 6762 + }, + { + "epoch": 0.6764, + "grad_norm": 1.3529415130615234, + "learning_rate": 5.729577469865566e-06, + "loss": 0.3298, + "step": 6764 + }, + { + "epoch": 0.6766, + "grad_norm": 0.6421126127243042, + "learning_rate": 5.723265783113181e-06, + "loss": 0.0723, + "step": 6766 + }, + { + "epoch": 0.6768, + "grad_norm": 1.0155773162841797, + "learning_rate": 5.716956180789098e-06, + "loss": 0.1375, + "step": 6768 + }, + { + "epoch": 0.677, + "grad_norm": 5.478870391845703, + "learning_rate": 5.710648665968543e-06, + "loss": 0.2054, + "step": 6770 + }, + { + "epoch": 0.6772, + "grad_norm": 0.3022187054157257, + "learning_rate": 5.704343241725719e-06, + "loss": 0.0435, + "step": 6772 + }, + { + "epoch": 0.6774, + "grad_norm": 2.3352699279785156, + "learning_rate": 5.698039911133816e-06, + "loss": 0.1641, + "step": 6774 + }, + { + "epoch": 0.6776, + "grad_norm": 1.8896830081939697, + "learning_rate": 5.691738677265e-06, + "loss": 0.214, + "step": 6776 + }, + { + "epoch": 0.6778, + "grad_norm": 3.3402657508850098, + "learning_rate": 5.685439543190409e-06, + "loss": 0.1277, + "step": 6778 + }, + { + "epoch": 0.678, + "grad_norm": 0.10768085718154907, + "learning_rate": 5.679142511980176e-06, + "loss": 0.0531, + "step": 6780 + }, + { + "epoch": 0.6782, + "grad_norm": 2.3789055347442627, + "learning_rate": 5.672847586703393e-06, + "loss": 0.2266, + "step": 6782 + }, + { + "epoch": 0.6784, + "grad_norm": 13.007341384887695, + "learning_rate": 5.666554770428129e-06, + "loss": 1.0258, + "step": 6784 + }, + { + "epoch": 0.6786, + "grad_norm": 1.5721607208251953, + "learning_rate": 5.660264066221426e-06, + "loss": 0.2291, + "step": 6786 + }, + { + "epoch": 0.6788, + "grad_norm": 0.27759233117103577, + "learning_rate": 5.653975477149298e-06, + "loss": 0.0488, + "step": 6788 + }, + { + "epoch": 0.679, + "grad_norm": 15.781034469604492, + "learning_rate": 5.647689006276727e-06, + "loss": 0.5497, + "step": 6790 + }, + { + "epoch": 0.6792, + "grad_norm": 2.3610780239105225, + "learning_rate": 5.641404656667661e-06, + "loss": 1.4326, + "step": 6792 + }, + { + "epoch": 0.6794, + "grad_norm": 0.435750812292099, + "learning_rate": 5.6351224313850165e-06, + "loss": 0.2405, + "step": 6794 + }, + { + "epoch": 0.6796, + "grad_norm": 1.499695062637329, + "learning_rate": 5.628842333490674e-06, + "loss": 0.079, + "step": 6796 + }, + { + "epoch": 0.6798, + "grad_norm": 1.5173518657684326, + "learning_rate": 5.622564366045472e-06, + "loss": 0.1822, + "step": 6798 + }, + { + "epoch": 0.68, + "grad_norm": 3.159726142883301, + "learning_rate": 5.616288532109225e-06, + "loss": 0.0673, + "step": 6800 + }, + { + "epoch": 0.6802, + "grad_norm": 0.5936082005500793, + "learning_rate": 5.610014834740694e-06, + "loss": 0.0429, + "step": 6802 + }, + { + "epoch": 0.6804, + "grad_norm": 2.3683040142059326, + "learning_rate": 5.603743276997607e-06, + "loss": 0.1291, + "step": 6804 + }, + { + "epoch": 0.6806, + "grad_norm": 0.25923216342926025, + "learning_rate": 5.59747386193663e-06, + "loss": 0.142, + "step": 6806 + }, + { + "epoch": 0.6808, + "grad_norm": 1.9230754375457764, + "learning_rate": 5.591206592613416e-06, + "loss": 0.2857, + "step": 6808 + }, + { + "epoch": 0.681, + "grad_norm": 3.1759021282196045, + "learning_rate": 5.584941472082549e-06, + "loss": 0.1605, + "step": 6810 + }, + { + "epoch": 0.6812, + "grad_norm": 0.4876266121864319, + "learning_rate": 5.5786785033975745e-06, + "loss": 0.0313, + "step": 6812 + }, + { + "epoch": 0.6814, + "grad_norm": 1.1194491386413574, + "learning_rate": 5.572417689610987e-06, + "loss": 0.0535, + "step": 6814 + }, + { + "epoch": 0.6816, + "grad_norm": 5.640929222106934, + "learning_rate": 5.5661590337742255e-06, + "loss": 0.4412, + "step": 6816 + }, + { + "epoch": 0.6818, + "grad_norm": 0.3013049364089966, + "learning_rate": 5.559902538937694e-06, + "loss": 0.0192, + "step": 6818 + }, + { + "epoch": 0.682, + "grad_norm": 0.7727065682411194, + "learning_rate": 5.553648208150728e-06, + "loss": 0.081, + "step": 6820 + }, + { + "epoch": 0.6822, + "grad_norm": 1.2568910121917725, + "learning_rate": 5.5473960444616085e-06, + "loss": 0.1037, + "step": 6822 + }, + { + "epoch": 0.6824, + "grad_norm": 14.181533813476562, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.5728, + "step": 6824 + }, + { + "epoch": 0.6826, + "grad_norm": 1.0913207530975342, + "learning_rate": 5.534898230564765e-06, + "loss": 0.1549, + "step": 6826 + }, + { + "epoch": 0.6828, + "grad_norm": 4.411632537841797, + "learning_rate": 5.5286525864483285e-06, + "loss": 0.361, + "step": 6828 + }, + { + "epoch": 0.683, + "grad_norm": 0.9821171164512634, + "learning_rate": 5.522409121612304e-06, + "loss": 0.354, + "step": 6830 + }, + { + "epoch": 0.6832, + "grad_norm": 1.930647850036621, + "learning_rate": 5.516167839099679e-06, + "loss": 0.0832, + "step": 6832 + }, + { + "epoch": 0.6834, + "grad_norm": 1.1893230676651, + "learning_rate": 5.50992874195238e-06, + "loss": 0.3005, + "step": 6834 + }, + { + "epoch": 0.6836, + "grad_norm": 5.5413818359375, + "learning_rate": 5.50369183321126e-06, + "loss": 0.4395, + "step": 6836 + }, + { + "epoch": 0.6838, + "grad_norm": 1.5309711694717407, + "learning_rate": 5.497457115916127e-06, + "loss": 0.0862, + "step": 6838 + }, + { + "epoch": 0.684, + "grad_norm": 2.800269603729248, + "learning_rate": 5.491224593105695e-06, + "loss": 0.2162, + "step": 6840 + }, + { + "epoch": 0.6842, + "grad_norm": 1.8054580688476562, + "learning_rate": 5.484994267817624e-06, + "loss": 0.2352, + "step": 6842 + }, + { + "epoch": 0.6844, + "grad_norm": 0.9581074118614197, + "learning_rate": 5.478766143088492e-06, + "loss": 0.0732, + "step": 6844 + }, + { + "epoch": 0.6846, + "grad_norm": 3.1997978687286377, + "learning_rate": 5.472540221953824e-06, + "loss": 0.2303, + "step": 6846 + }, + { + "epoch": 0.6848, + "grad_norm": 7.008697032928467, + "learning_rate": 5.466316507448049e-06, + "loss": 0.3342, + "step": 6848 + }, + { + "epoch": 0.685, + "grad_norm": 6.805210113525391, + "learning_rate": 5.460095002604533e-06, + "loss": 0.2441, + "step": 6850 + }, + { + "epoch": 0.6852, + "grad_norm": 4.608808994293213, + "learning_rate": 5.453875710455562e-06, + "loss": 0.4509, + "step": 6852 + }, + { + "epoch": 0.6854, + "grad_norm": 5.074594497680664, + "learning_rate": 5.447658634032338e-06, + "loss": 0.1942, + "step": 6854 + }, + { + "epoch": 0.6856, + "grad_norm": 4.137449741363525, + "learning_rate": 5.441443776365003e-06, + "loss": 0.1601, + "step": 6856 + }, + { + "epoch": 0.6858, + "grad_norm": 3.4195563793182373, + "learning_rate": 5.435231140482588e-06, + "loss": 0.2321, + "step": 6858 + }, + { + "epoch": 0.686, + "grad_norm": 0.45667970180511475, + "learning_rate": 5.429020729413062e-06, + "loss": 0.0998, + "step": 6860 + }, + { + "epoch": 0.6862, + "grad_norm": 0.1905176043510437, + "learning_rate": 5.4228125461833026e-06, + "loss": 0.0172, + "step": 6862 + }, + { + "epoch": 0.6864, + "grad_norm": 1.4143213033676147, + "learning_rate": 5.416606593819102e-06, + "loss": 0.1596, + "step": 6864 + }, + { + "epoch": 0.6866, + "grad_norm": 2.1148502826690674, + "learning_rate": 5.41040287534517e-06, + "loss": 0.1712, + "step": 6866 + }, + { + "epoch": 0.6868, + "grad_norm": 2.0880091190338135, + "learning_rate": 5.404201393785123e-06, + "loss": 0.3721, + "step": 6868 + }, + { + "epoch": 0.687, + "grad_norm": 0.9828031659126282, + "learning_rate": 5.398002152161484e-06, + "loss": 0.2181, + "step": 6870 + }, + { + "epoch": 0.6872, + "grad_norm": 0.577071487903595, + "learning_rate": 5.391805153495693e-06, + "loss": 0.0363, + "step": 6872 + }, + { + "epoch": 0.6874, + "grad_norm": 3.337368965148926, + "learning_rate": 5.385610400808088e-06, + "loss": 0.195, + "step": 6874 + }, + { + "epoch": 0.6876, + "grad_norm": 0.715030312538147, + "learning_rate": 5.379417897117917e-06, + "loss": 0.0274, + "step": 6876 + }, + { + "epoch": 0.6878, + "grad_norm": 10.844812393188477, + "learning_rate": 5.373227645443332e-06, + "loss": 0.565, + "step": 6878 + }, + { + "epoch": 0.688, + "grad_norm": 2.0936427116394043, + "learning_rate": 5.367039648801386e-06, + "loss": 0.1264, + "step": 6880 + }, + { + "epoch": 0.6882, + "grad_norm": 0.39196038246154785, + "learning_rate": 5.360853910208028e-06, + "loss": 0.0103, + "step": 6882 + }, + { + "epoch": 0.6884, + "grad_norm": 0.8453869819641113, + "learning_rate": 5.354670432678124e-06, + "loss": 0.0214, + "step": 6884 + }, + { + "epoch": 0.6886, + "grad_norm": 8.596561431884766, + "learning_rate": 5.348489219225417e-06, + "loss": 0.4598, + "step": 6886 + }, + { + "epoch": 0.6888, + "grad_norm": 1.050215721130371, + "learning_rate": 5.342310272862558e-06, + "loss": 0.1023, + "step": 6888 + }, + { + "epoch": 0.689, + "grad_norm": 2.4091074466705322, + "learning_rate": 5.336133596601089e-06, + "loss": 0.166, + "step": 6890 + }, + { + "epoch": 0.6892, + "grad_norm": 2.257788896560669, + "learning_rate": 5.3299591934514485e-06, + "loss": 0.2011, + "step": 6892 + }, + { + "epoch": 0.6894, + "grad_norm": 1.3363113403320312, + "learning_rate": 5.323787066422964e-06, + "loss": 0.0741, + "step": 6894 + }, + { + "epoch": 0.6896, + "grad_norm": 1.8734533786773682, + "learning_rate": 5.317617218523856e-06, + "loss": 0.0331, + "step": 6896 + }, + { + "epoch": 0.6898, + "grad_norm": 0.5821602940559387, + "learning_rate": 5.311449652761235e-06, + "loss": 0.0243, + "step": 6898 + }, + { + "epoch": 0.69, + "grad_norm": 1.5080395936965942, + "learning_rate": 5.305284372141095e-06, + "loss": 0.0443, + "step": 6900 + }, + { + "epoch": 0.6902, + "grad_norm": 7.157463550567627, + "learning_rate": 5.299121379668316e-06, + "loss": 0.5418, + "step": 6902 + }, + { + "epoch": 0.6904, + "grad_norm": 3.624070882797241, + "learning_rate": 5.292960678346674e-06, + "loss": 0.0914, + "step": 6904 + }, + { + "epoch": 0.6906, + "grad_norm": 7.520585536956787, + "learning_rate": 5.286802271178815e-06, + "loss": 0.2748, + "step": 6906 + }, + { + "epoch": 0.6908, + "grad_norm": 4.01132345199585, + "learning_rate": 5.280646161166274e-06, + "loss": 0.1677, + "step": 6908 + }, + { + "epoch": 0.691, + "grad_norm": 4.212621212005615, + "learning_rate": 5.274492351309462e-06, + "loss": 0.1503, + "step": 6910 + }, + { + "epoch": 0.6912, + "grad_norm": 1.2125356197357178, + "learning_rate": 5.26834084460767e-06, + "loss": 0.0416, + "step": 6912 + }, + { + "epoch": 0.6914, + "grad_norm": 0.6544078588485718, + "learning_rate": 5.262191644059071e-06, + "loss": 0.1146, + "step": 6914 + }, + { + "epoch": 0.6916, + "grad_norm": 9.290487289428711, + "learning_rate": 5.256044752660709e-06, + "loss": 0.5129, + "step": 6916 + }, + { + "epoch": 0.6918, + "grad_norm": 3.518091917037964, + "learning_rate": 5.2499001734085045e-06, + "loss": 0.2186, + "step": 6918 + }, + { + "epoch": 0.692, + "grad_norm": 2.023345470428467, + "learning_rate": 5.243757909297247e-06, + "loss": 0.1433, + "step": 6920 + }, + { + "epoch": 0.6922, + "grad_norm": 0.4197022020816803, + "learning_rate": 5.237617963320608e-06, + "loss": 0.4863, + "step": 6922 + }, + { + "epoch": 0.6924, + "grad_norm": 0.0955604761838913, + "learning_rate": 5.23148033847112e-06, + "loss": 0.0189, + "step": 6924 + }, + { + "epoch": 0.6926, + "grad_norm": 3.6617543697357178, + "learning_rate": 5.225345037740186e-06, + "loss": 0.4081, + "step": 6926 + }, + { + "epoch": 0.6928, + "grad_norm": 1.3725416660308838, + "learning_rate": 5.219212064118079e-06, + "loss": 0.0469, + "step": 6928 + }, + { + "epoch": 0.693, + "grad_norm": 0.051624320447444916, + "learning_rate": 5.213081420593933e-06, + "loss": 0.0269, + "step": 6930 + }, + { + "epoch": 0.6932, + "grad_norm": 5.8161420822143555, + "learning_rate": 5.2069531101557505e-06, + "loss": 0.2151, + "step": 6932 + }, + { + "epoch": 0.6934, + "grad_norm": 0.5259977579116821, + "learning_rate": 5.200827135790396e-06, + "loss": 0.0242, + "step": 6934 + }, + { + "epoch": 0.6936, + "grad_norm": 0.2075728476047516, + "learning_rate": 5.194703500483593e-06, + "loss": 0.3593, + "step": 6936 + }, + { + "epoch": 0.6938, + "grad_norm": 2.1747379302978516, + "learning_rate": 5.188582207219931e-06, + "loss": 0.0741, + "step": 6938 + }, + { + "epoch": 0.694, + "grad_norm": 6.509223461151123, + "learning_rate": 5.1824632589828465e-06, + "loss": 0.4486, + "step": 6940 + }, + { + "epoch": 0.6942, + "grad_norm": 0.1724066138267517, + "learning_rate": 5.176346658754648e-06, + "loss": 0.0089, + "step": 6942 + }, + { + "epoch": 0.6944, + "grad_norm": 2.1357898712158203, + "learning_rate": 5.1702324095164955e-06, + "loss": 0.1819, + "step": 6944 + }, + { + "epoch": 0.6946, + "grad_norm": 5.606626033782959, + "learning_rate": 5.16412051424839e-06, + "loss": 0.0844, + "step": 6946 + }, + { + "epoch": 0.6948, + "grad_norm": 0.07445748150348663, + "learning_rate": 5.158010975929193e-06, + "loss": 0.4281, + "step": 6948 + }, + { + "epoch": 0.695, + "grad_norm": 0.19951319694519043, + "learning_rate": 5.151903797536631e-06, + "loss": 0.1484, + "step": 6950 + }, + { + "epoch": 0.6952, + "grad_norm": 2.2031755447387695, + "learning_rate": 5.145798982047261e-06, + "loss": 0.0674, + "step": 6952 + }, + { + "epoch": 0.6954, + "grad_norm": 6.818058967590332, + "learning_rate": 5.139696532436499e-06, + "loss": 0.4728, + "step": 6954 + }, + { + "epoch": 0.6956, + "grad_norm": 5.2996697425842285, + "learning_rate": 5.133596451678603e-06, + "loss": 0.3653, + "step": 6956 + }, + { + "epoch": 0.6958, + "grad_norm": 5.317918300628662, + "learning_rate": 5.127498742746675e-06, + "loss": 0.3029, + "step": 6958 + }, + { + "epoch": 0.696, + "grad_norm": 0.40016287565231323, + "learning_rate": 5.121403408612672e-06, + "loss": 0.0308, + "step": 6960 + }, + { + "epoch": 0.6962, + "grad_norm": 0.2948208153247833, + "learning_rate": 5.115310452247386e-06, + "loss": 0.1552, + "step": 6962 + }, + { + "epoch": 0.6964, + "grad_norm": 0.5148042440414429, + "learning_rate": 5.109219876620441e-06, + "loss": 0.4395, + "step": 6964 + }, + { + "epoch": 0.6966, + "grad_norm": 3.921583890914917, + "learning_rate": 5.103131684700315e-06, + "loss": 0.8475, + "step": 6966 + }, + { + "epoch": 0.6968, + "grad_norm": 0.3607581853866577, + "learning_rate": 5.0970458794543135e-06, + "loss": 0.0781, + "step": 6968 + }, + { + "epoch": 0.697, + "grad_norm": 0.604478657245636, + "learning_rate": 5.090962463848592e-06, + "loss": 0.2735, + "step": 6970 + }, + { + "epoch": 0.6972, + "grad_norm": 1.5579429864883423, + "learning_rate": 5.0848814408481305e-06, + "loss": 0.0991, + "step": 6972 + }, + { + "epoch": 0.6974, + "grad_norm": 0.4879496991634369, + "learning_rate": 5.078802813416746e-06, + "loss": 0.1993, + "step": 6974 + }, + { + "epoch": 0.6976, + "grad_norm": 3.4359116554260254, + "learning_rate": 5.072726584517086e-06, + "loss": 0.3558, + "step": 6976 + }, + { + "epoch": 0.6978, + "grad_norm": 0.48656967282295227, + "learning_rate": 5.066652757110628e-06, + "loss": 0.0152, + "step": 6978 + }, + { + "epoch": 0.698, + "grad_norm": 0.8122151494026184, + "learning_rate": 5.060581334157693e-06, + "loss": 0.0308, + "step": 6980 + }, + { + "epoch": 0.6982, + "grad_norm": 0.46794000267982483, + "learning_rate": 5.054512318617406e-06, + "loss": 0.0349, + "step": 6982 + }, + { + "epoch": 0.6984, + "grad_norm": 4.2167253494262695, + "learning_rate": 5.048445713447738e-06, + "loss": 0.2101, + "step": 6984 + }, + { + "epoch": 0.6986, + "grad_norm": 0.8956387042999268, + "learning_rate": 5.042381521605473e-06, + "loss": 0.1146, + "step": 6986 + }, + { + "epoch": 0.6988, + "grad_norm": 6.124754905700684, + "learning_rate": 5.036319746046232e-06, + "loss": 0.1551, + "step": 6988 + }, + { + "epoch": 0.699, + "grad_norm": 0.5178934931755066, + "learning_rate": 5.030260389724447e-06, + "loss": 0.1198, + "step": 6990 + }, + { + "epoch": 0.6992, + "grad_norm": 0.3754532039165497, + "learning_rate": 5.024203455593375e-06, + "loss": 0.1311, + "step": 6992 + }, + { + "epoch": 0.6994, + "grad_norm": 0.9165704250335693, + "learning_rate": 5.018148946605092e-06, + "loss": 0.322, + "step": 6994 + }, + { + "epoch": 0.6996, + "grad_norm": 9.615159034729004, + "learning_rate": 5.012096865710494e-06, + "loss": 0.5707, + "step": 6996 + }, + { + "epoch": 0.6998, + "grad_norm": 0.6602795720100403, + "learning_rate": 5.0060472158592885e-06, + "loss": 0.3285, + "step": 6998 + }, + { + "epoch": 0.7, + "grad_norm": 5.50363826751709, + "learning_rate": 5.000000000000003e-06, + "loss": 0.1895, + "step": 7000 + }, + { + "epoch": 0.7002, + "grad_norm": 0.8959852457046509, + "learning_rate": 4.993955221079976e-06, + "loss": 0.0584, + "step": 7002 + }, + { + "epoch": 0.7004, + "grad_norm": 9.781309127807617, + "learning_rate": 4.98791288204536e-06, + "loss": 0.2282, + "step": 7004 + }, + { + "epoch": 0.7006, + "grad_norm": 4.310182571411133, + "learning_rate": 4.981872985841115e-06, + "loss": 0.3222, + "step": 7006 + }, + { + "epoch": 0.7008, + "grad_norm": 0.21829384565353394, + "learning_rate": 4.97583553541102e-06, + "loss": 0.0175, + "step": 7008 + }, + { + "epoch": 0.701, + "grad_norm": 2.875472068786621, + "learning_rate": 4.96980053369765e-06, + "loss": 0.1013, + "step": 7010 + }, + { + "epoch": 0.7012, + "grad_norm": 4.273564338684082, + "learning_rate": 4.9637679836423926e-06, + "loss": 0.471, + "step": 7012 + }, + { + "epoch": 0.7014, + "grad_norm": 0.2526986002922058, + "learning_rate": 4.957737888185439e-06, + "loss": 0.0137, + "step": 7014 + }, + { + "epoch": 0.7016, + "grad_norm": 13.692468643188477, + "learning_rate": 4.951710250265785e-06, + "loss": 0.3629, + "step": 7016 + }, + { + "epoch": 0.7018, + "grad_norm": 3.544919967651367, + "learning_rate": 4.945685072821227e-06, + "loss": 0.3146, + "step": 7018 + }, + { + "epoch": 0.702, + "grad_norm": 0.9518455862998962, + "learning_rate": 4.939662358788364e-06, + "loss": 0.0349, + "step": 7020 + }, + { + "epoch": 0.7022, + "grad_norm": 1.358994722366333, + "learning_rate": 4.933642111102595e-06, + "loss": 0.3464, + "step": 7022 + }, + { + "epoch": 0.7024, + "grad_norm": 0.28896257281303406, + "learning_rate": 4.927624332698109e-06, + "loss": 0.4332, + "step": 7024 + }, + { + "epoch": 0.7026, + "grad_norm": 1.3150631189346313, + "learning_rate": 4.921609026507907e-06, + "loss": 0.1012, + "step": 7026 + }, + { + "epoch": 0.7028, + "grad_norm": 3.0952517986297607, + "learning_rate": 4.915596195463773e-06, + "loss": 0.1159, + "step": 7028 + }, + { + "epoch": 0.703, + "grad_norm": 8.56776237487793, + "learning_rate": 4.909585842496287e-06, + "loss": 0.4481, + "step": 7030 + }, + { + "epoch": 0.7032, + "grad_norm": 4.326989650726318, + "learning_rate": 4.903577970534823e-06, + "loss": 0.5961, + "step": 7032 + }, + { + "epoch": 0.7034, + "grad_norm": 0.44338536262512207, + "learning_rate": 4.897572582507544e-06, + "loss": 0.0441, + "step": 7034 + }, + { + "epoch": 0.7036, + "grad_norm": 4.049919605255127, + "learning_rate": 4.891569681341403e-06, + "loss": 0.1573, + "step": 7036 + }, + { + "epoch": 0.7038, + "grad_norm": 2.871220588684082, + "learning_rate": 4.885569269962142e-06, + "loss": 0.0992, + "step": 7038 + }, + { + "epoch": 0.704, + "grad_norm": 3.602328300476074, + "learning_rate": 4.879571351294287e-06, + "loss": 0.0885, + "step": 7040 + }, + { + "epoch": 0.7042, + "grad_norm": 2.9096200466156006, + "learning_rate": 4.873575928261151e-06, + "loss": 0.3381, + "step": 7042 + }, + { + "epoch": 0.7044, + "grad_norm": 10.685242652893066, + "learning_rate": 4.8675830037848295e-06, + "loss": 0.5725, + "step": 7044 + }, + { + "epoch": 0.7046, + "grad_norm": 0.29387927055358887, + "learning_rate": 4.861592580786205e-06, + "loss": 0.1823, + "step": 7046 + }, + { + "epoch": 0.7048, + "grad_norm": 1.045341968536377, + "learning_rate": 4.855604662184935e-06, + "loss": 0.2183, + "step": 7048 + }, + { + "epoch": 0.705, + "grad_norm": 2.8172101974487305, + "learning_rate": 4.849619250899458e-06, + "loss": 0.079, + "step": 7050 + }, + { + "epoch": 0.7052, + "grad_norm": 0.4235807955265045, + "learning_rate": 4.843636349846991e-06, + "loss": 0.1467, + "step": 7052 + }, + { + "epoch": 0.7054, + "grad_norm": 5.555002212524414, + "learning_rate": 4.837655961943526e-06, + "loss": 0.2891, + "step": 7054 + }, + { + "epoch": 0.7056, + "grad_norm": 2.7532591819763184, + "learning_rate": 4.831678090103832e-06, + "loss": 0.1085, + "step": 7056 + }, + { + "epoch": 0.7058, + "grad_norm": 6.321671485900879, + "learning_rate": 4.825702737241452e-06, + "loss": 0.451, + "step": 7058 + }, + { + "epoch": 0.706, + "grad_norm": 9.150782585144043, + "learning_rate": 4.8197299062687e-06, + "loss": 0.6507, + "step": 7060 + }, + { + "epoch": 0.7062, + "grad_norm": 0.3610908091068268, + "learning_rate": 4.813759600096661e-06, + "loss": 0.0108, + "step": 7062 + }, + { + "epoch": 0.7064, + "grad_norm": 0.5376060605049133, + "learning_rate": 4.807791821635186e-06, + "loss": 0.0706, + "step": 7064 + }, + { + "epoch": 0.7066, + "grad_norm": 4.280257225036621, + "learning_rate": 4.801826573792905e-06, + "loss": 0.2387, + "step": 7066 + }, + { + "epoch": 0.7068, + "grad_norm": 1.741957664489746, + "learning_rate": 4.795863859477207e-06, + "loss": 0.0676, + "step": 7068 + }, + { + "epoch": 0.707, + "grad_norm": 2.9873931407928467, + "learning_rate": 4.78990368159424e-06, + "loss": 0.4374, + "step": 7070 + }, + { + "epoch": 0.7072, + "grad_norm": 5.084872722625732, + "learning_rate": 4.783946043048922e-06, + "loss": 0.1662, + "step": 7072 + }, + { + "epoch": 0.7074, + "grad_norm": 1.2855349779129028, + "learning_rate": 4.7779909467449416e-06, + "loss": 0.037, + "step": 7074 + }, + { + "epoch": 0.7076, + "grad_norm": 0.22427190840244293, + "learning_rate": 4.772038395584735e-06, + "loss": 0.0253, + "step": 7076 + }, + { + "epoch": 0.7078, + "grad_norm": 1.9484387636184692, + "learning_rate": 4.7660883924695055e-06, + "loss": 0.0749, + "step": 7078 + }, + { + "epoch": 0.708, + "grad_norm": 7.093513488769531, + "learning_rate": 4.76014094029921e-06, + "loss": 0.2063, + "step": 7080 + }, + { + "epoch": 0.7082, + "grad_norm": 5.635471820831299, + "learning_rate": 4.754196041972563e-06, + "loss": 0.6093, + "step": 7082 + }, + { + "epoch": 0.7084, + "grad_norm": 0.8144509196281433, + "learning_rate": 4.7482537003870425e-06, + "loss": 0.0629, + "step": 7084 + }, + { + "epoch": 0.7086, + "grad_norm": 1.7809778451919556, + "learning_rate": 4.7423139184388725e-06, + "loss": 0.1764, + "step": 7086 + }, + { + "epoch": 0.7088, + "grad_norm": 0.5040732026100159, + "learning_rate": 4.736376699023023e-06, + "loss": 0.0944, + "step": 7088 + }, + { + "epoch": 0.709, + "grad_norm": 11.135908126831055, + "learning_rate": 4.7304420450332244e-06, + "loss": 0.6293, + "step": 7090 + }, + { + "epoch": 0.7092, + "grad_norm": 6.820241928100586, + "learning_rate": 4.724509959361961e-06, + "loss": 0.5284, + "step": 7092 + }, + { + "epoch": 0.7094, + "grad_norm": 8.259851455688477, + "learning_rate": 4.718580444900457e-06, + "loss": 0.3561, + "step": 7094 + }, + { + "epoch": 0.7096, + "grad_norm": 0.8531981110572815, + "learning_rate": 4.712653504538684e-06, + "loss": 0.0569, + "step": 7096 + }, + { + "epoch": 0.7098, + "grad_norm": 0.4144521951675415, + "learning_rate": 4.706729141165362e-06, + "loss": 0.1069, + "step": 7098 + }, + { + "epoch": 0.71, + "grad_norm": 16.445377349853516, + "learning_rate": 4.700807357667953e-06, + "loss": 0.5095, + "step": 7100 + }, + { + "epoch": 0.7102, + "grad_norm": 3.0783047676086426, + "learning_rate": 4.694888156932657e-06, + "loss": 0.2776, + "step": 7102 + }, + { + "epoch": 0.7104, + "grad_norm": 0.6945970058441162, + "learning_rate": 4.688971541844436e-06, + "loss": 0.1367, + "step": 7104 + }, + { + "epoch": 0.7106, + "grad_norm": 0.2581179440021515, + "learning_rate": 4.6830575152869615e-06, + "loss": 0.0418, + "step": 7106 + }, + { + "epoch": 0.7108, + "grad_norm": 0.5937615036964417, + "learning_rate": 4.677146080142664e-06, + "loss": 0.2794, + "step": 7108 + }, + { + "epoch": 0.711, + "grad_norm": 0.056709665805101395, + "learning_rate": 4.671237239292699e-06, + "loss": 0.089, + "step": 7110 + }, + { + "epoch": 0.7112, + "grad_norm": 7.397021770477295, + "learning_rate": 4.6653309956169745e-06, + "loss": 0.243, + "step": 7112 + }, + { + "epoch": 0.7114, + "grad_norm": 0.5573907494544983, + "learning_rate": 4.659427351994116e-06, + "loss": 0.1894, + "step": 7114 + }, + { + "epoch": 0.7116, + "grad_norm": 3.7200567722320557, + "learning_rate": 4.6535263113014885e-06, + "loss": 0.1008, + "step": 7116 + }, + { + "epoch": 0.7118, + "grad_norm": 1.995788812637329, + "learning_rate": 4.647627876415186e-06, + "loss": 0.0443, + "step": 7118 + }, + { + "epoch": 0.712, + "grad_norm": 3.602559804916382, + "learning_rate": 4.641732050210032e-06, + "loss": 0.1642, + "step": 7120 + }, + { + "epoch": 0.7122, + "grad_norm": 5.970663070678711, + "learning_rate": 4.635838835559591e-06, + "loss": 0.3561, + "step": 7122 + }, + { + "epoch": 0.7124, + "grad_norm": 1.4692498445510864, + "learning_rate": 4.629948235336133e-06, + "loss": 0.0337, + "step": 7124 + }, + { + "epoch": 0.7126, + "grad_norm": 3.8772687911987305, + "learning_rate": 4.62406025241067e-06, + "loss": 0.3143, + "step": 7126 + }, + { + "epoch": 0.7128, + "grad_norm": 2.1334939002990723, + "learning_rate": 4.618174889652928e-06, + "loss": 0.1762, + "step": 7128 + }, + { + "epoch": 0.713, + "grad_norm": 0.8207858800888062, + "learning_rate": 4.612292149931369e-06, + "loss": 0.4204, + "step": 7130 + }, + { + "epoch": 0.7132, + "grad_norm": 6.681224822998047, + "learning_rate": 4.606412036113166e-06, + "loss": 0.2724, + "step": 7132 + }, + { + "epoch": 0.7134, + "grad_norm": 0.1131209284067154, + "learning_rate": 4.600534551064215e-06, + "loss": 0.1622, + "step": 7134 + }, + { + "epoch": 0.7136, + "grad_norm": 1.109412431716919, + "learning_rate": 4.59465969764913e-06, + "loss": 0.4406, + "step": 7136 + }, + { + "epoch": 0.7138, + "grad_norm": 2.3327438831329346, + "learning_rate": 4.588787478731242e-06, + "loss": 0.3689, + "step": 7138 + }, + { + "epoch": 0.714, + "grad_norm": 0.6009097099304199, + "learning_rate": 4.582917897172603e-06, + "loss": 0.0212, + "step": 7140 + }, + { + "epoch": 0.7142, + "grad_norm": 0.3480912744998932, + "learning_rate": 4.577050955833972e-06, + "loss": 0.1552, + "step": 7142 + }, + { + "epoch": 0.7144, + "grad_norm": 0.06893940269947052, + "learning_rate": 4.571186657574828e-06, + "loss": 0.1757, + "step": 7144 + }, + { + "epoch": 0.7146, + "grad_norm": 2.029660940170288, + "learning_rate": 4.565325005253356e-06, + "loss": 0.1494, + "step": 7146 + }, + { + "epoch": 0.7148, + "grad_norm": 6.288959503173828, + "learning_rate": 4.559466001726451e-06, + "loss": 0.8475, + "step": 7148 + }, + { + "epoch": 0.715, + "grad_norm": 7.001792907714844, + "learning_rate": 4.5536096498497295e-06, + "loss": 0.4407, + "step": 7150 + }, + { + "epoch": 0.7152, + "grad_norm": 0.5368174910545349, + "learning_rate": 4.5477559524775e-06, + "loss": 0.022, + "step": 7152 + }, + { + "epoch": 0.7154, + "grad_norm": 2.0567626953125, + "learning_rate": 4.541904912462785e-06, + "loss": 0.0641, + "step": 7154 + }, + { + "epoch": 0.7156, + "grad_norm": 0.3701546788215637, + "learning_rate": 4.53605653265731e-06, + "loss": 0.3833, + "step": 7156 + }, + { + "epoch": 0.7158, + "grad_norm": 0.25160813331604004, + "learning_rate": 4.530210815911504e-06, + "loss": 0.0564, + "step": 7158 + }, + { + "epoch": 0.716, + "grad_norm": 4.81494140625, + "learning_rate": 4.524367765074499e-06, + "loss": 0.3381, + "step": 7160 + }, + { + "epoch": 0.7162, + "grad_norm": 2.9374840259552, + "learning_rate": 4.518527382994127e-06, + "loss": 0.5827, + "step": 7162 + }, + { + "epoch": 0.7164, + "grad_norm": 4.938754081726074, + "learning_rate": 4.512689672516918e-06, + "loss": 0.1147, + "step": 7164 + }, + { + "epoch": 0.7166, + "grad_norm": 1.4536011219024658, + "learning_rate": 4.506854636488103e-06, + "loss": 0.1146, + "step": 7166 + }, + { + "epoch": 0.7168, + "grad_norm": 5.965865612030029, + "learning_rate": 4.501022277751602e-06, + "loss": 0.1293, + "step": 7168 + }, + { + "epoch": 0.717, + "grad_norm": 1.4396333694458008, + "learning_rate": 4.495192599150045e-06, + "loss": 0.5723, + "step": 7170 + }, + { + "epoch": 0.7172, + "grad_norm": 3.689840078353882, + "learning_rate": 4.48936560352474e-06, + "loss": 0.2974, + "step": 7172 + }, + { + "epoch": 0.7174, + "grad_norm": 0.9893630146980286, + "learning_rate": 4.483541293715699e-06, + "loss": 0.1549, + "step": 7174 + }, + { + "epoch": 0.7176, + "grad_norm": 0.15630078315734863, + "learning_rate": 4.477719672561615e-06, + "loss": 0.124, + "step": 7176 + }, + { + "epoch": 0.7178, + "grad_norm": 4.798556804656982, + "learning_rate": 4.471900742899876e-06, + "loss": 0.1642, + "step": 7178 + }, + { + "epoch": 0.718, + "grad_norm": 1.0073946714401245, + "learning_rate": 4.46608450756656e-06, + "loss": 0.1668, + "step": 7180 + }, + { + "epoch": 0.7182, + "grad_norm": 3.948152542114258, + "learning_rate": 4.4602709693964296e-06, + "loss": 0.2925, + "step": 7182 + }, + { + "epoch": 0.7184, + "grad_norm": 0.5123436450958252, + "learning_rate": 4.4544601312229295e-06, + "loss": 0.1049, + "step": 7184 + }, + { + "epoch": 0.7186, + "grad_norm": 0.9494496583938599, + "learning_rate": 4.44865199587819e-06, + "loss": 0.0885, + "step": 7186 + }, + { + "epoch": 0.7188, + "grad_norm": 1.3536053895950317, + "learning_rate": 4.442846566193034e-06, + "loss": 0.5825, + "step": 7188 + }, + { + "epoch": 0.719, + "grad_norm": 0.1603391468524933, + "learning_rate": 4.437043844996952e-06, + "loss": 0.2672, + "step": 7190 + }, + { + "epoch": 0.7192, + "grad_norm": 3.7431447505950928, + "learning_rate": 4.4312438351181246e-06, + "loss": 0.237, + "step": 7192 + }, + { + "epoch": 0.7194, + "grad_norm": 1.563745141029358, + "learning_rate": 4.425446539383394e-06, + "loss": 0.1013, + "step": 7194 + }, + { + "epoch": 0.7196, + "grad_norm": 7.135501861572266, + "learning_rate": 4.419651960618302e-06, + "loss": 0.5496, + "step": 7196 + }, + { + "epoch": 0.7198, + "grad_norm": 6.194867134094238, + "learning_rate": 4.413860101647055e-06, + "loss": 0.3628, + "step": 7198 + }, + { + "epoch": 0.72, + "grad_norm": 1.2806998491287231, + "learning_rate": 4.408070965292534e-06, + "loss": 0.1819, + "step": 7200 + }, + { + "epoch": 0.7202, + "grad_norm": 0.24364517629146576, + "learning_rate": 4.402284554376292e-06, + "loss": 0.4099, + "step": 7202 + }, + { + "epoch": 0.7204, + "grad_norm": 3.373668670654297, + "learning_rate": 4.3965008717185555e-06, + "loss": 0.0592, + "step": 7204 + }, + { + "epoch": 0.7206, + "grad_norm": 4.231733798980713, + "learning_rate": 4.39071992013822e-06, + "loss": 0.1743, + "step": 7206 + }, + { + "epoch": 0.7208, + "grad_norm": 0.0793970376253128, + "learning_rate": 4.384941702452856e-06, + "loss": 0.2091, + "step": 7208 + }, + { + "epoch": 0.721, + "grad_norm": 4.582828521728516, + "learning_rate": 4.379166221478697e-06, + "loss": 0.2754, + "step": 7210 + }, + { + "epoch": 0.7212, + "grad_norm": 3.898827314376831, + "learning_rate": 4.373393480030637e-06, + "loss": 0.1661, + "step": 7212 + }, + { + "epoch": 0.7214, + "grad_norm": 2.7667651176452637, + "learning_rate": 4.367623480922236e-06, + "loss": 0.5318, + "step": 7214 + }, + { + "epoch": 0.7216, + "grad_norm": 6.110058784484863, + "learning_rate": 4.361856226965733e-06, + "loss": 0.2386, + "step": 7216 + }, + { + "epoch": 0.7218, + "grad_norm": 3.9693098068237305, + "learning_rate": 4.356091720972011e-06, + "loss": 0.305, + "step": 7218 + }, + { + "epoch": 0.722, + "grad_norm": 3.088503360748291, + "learning_rate": 4.350329965750622e-06, + "loss": 0.2019, + "step": 7220 + }, + { + "epoch": 0.7222, + "grad_norm": 0.4486554265022278, + "learning_rate": 4.344570964109775e-06, + "loss": 0.2358, + "step": 7222 + }, + { + "epoch": 0.7224, + "grad_norm": 0.21986593306064606, + "learning_rate": 4.338814718856333e-06, + "loss": 0.0351, + "step": 7224 + }, + { + "epoch": 0.7226, + "grad_norm": 0.633152961730957, + "learning_rate": 4.3330612327958265e-06, + "loss": 0.0176, + "step": 7226 + }, + { + "epoch": 0.7228, + "grad_norm": 0.10452810674905777, + "learning_rate": 4.3273105087324375e-06, + "loss": 0.1476, + "step": 7228 + }, + { + "epoch": 0.723, + "grad_norm": 0.32841554284095764, + "learning_rate": 4.321562549468991e-06, + "loss": 0.0231, + "step": 7230 + }, + { + "epoch": 0.7232, + "grad_norm": 8.153350830078125, + "learning_rate": 4.315817357806974e-06, + "loss": 0.3258, + "step": 7232 + }, + { + "epoch": 0.7234, + "grad_norm": 1.528338074684143, + "learning_rate": 4.310074936546521e-06, + "loss": 0.2473, + "step": 7234 + }, + { + "epoch": 0.7236, + "grad_norm": 0.2629214823246002, + "learning_rate": 4.304335288486426e-06, + "loss": 0.0527, + "step": 7236 + }, + { + "epoch": 0.7238, + "grad_norm": 5.62484073638916, + "learning_rate": 4.29859841642412e-06, + "loss": 0.3511, + "step": 7238 + }, + { + "epoch": 0.724, + "grad_norm": 7.763027667999268, + "learning_rate": 4.292864323155684e-06, + "loss": 0.3321, + "step": 7240 + }, + { + "epoch": 0.7242, + "grad_norm": 2.332965612411499, + "learning_rate": 4.287133011475847e-06, + "loss": 0.0581, + "step": 7242 + }, + { + "epoch": 0.7244, + "grad_norm": 5.321888446807861, + "learning_rate": 4.281404484177974e-06, + "loss": 0.4498, + "step": 7244 + }, + { + "epoch": 0.7246, + "grad_norm": 2.2592239379882812, + "learning_rate": 4.275678744054094e-06, + "loss": 0.0996, + "step": 7246 + }, + { + "epoch": 0.7248, + "grad_norm": 4.337447166442871, + "learning_rate": 4.26995579389485e-06, + "loss": 0.338, + "step": 7248 + }, + { + "epoch": 0.725, + "grad_norm": 1.7880980968475342, + "learning_rate": 4.264235636489542e-06, + "loss": 0.0456, + "step": 7250 + }, + { + "epoch": 0.7252, + "grad_norm": 0.20464052259922028, + "learning_rate": 4.258518274626103e-06, + "loss": 0.0675, + "step": 7252 + }, + { + "epoch": 0.7254, + "grad_norm": 0.826458752155304, + "learning_rate": 4.2528037110911126e-06, + "loss": 0.099, + "step": 7254 + }, + { + "epoch": 0.7256, + "grad_norm": 2.311230182647705, + "learning_rate": 4.247091948669775e-06, + "loss": 0.2943, + "step": 7256 + }, + { + "epoch": 0.7258, + "grad_norm": 0.26947686076164246, + "learning_rate": 4.2413829901459345e-06, + "loss": 0.035, + "step": 7258 + }, + { + "epoch": 0.726, + "grad_norm": 0.87807297706604, + "learning_rate": 4.235676838302069e-06, + "loss": 0.0321, + "step": 7260 + }, + { + "epoch": 0.7262, + "grad_norm": 3.356156587600708, + "learning_rate": 4.229973495919286e-06, + "loss": 0.137, + "step": 7262 + }, + { + "epoch": 0.7264, + "grad_norm": 10.963428497314453, + "learning_rate": 4.224272965777326e-06, + "loss": 0.3514, + "step": 7264 + }, + { + "epoch": 0.7266, + "grad_norm": 15.127537727355957, + "learning_rate": 4.218575250654559e-06, + "loss": 0.9836, + "step": 7266 + }, + { + "epoch": 0.7268, + "grad_norm": 1.970811367034912, + "learning_rate": 4.21288035332798e-06, + "loss": 0.5302, + "step": 7268 + }, + { + "epoch": 0.727, + "grad_norm": 2.847975492477417, + "learning_rate": 4.207188276573214e-06, + "loss": 0.2283, + "step": 7270 + }, + { + "epoch": 0.7272, + "grad_norm": 6.228306770324707, + "learning_rate": 4.201499023164508e-06, + "loss": 0.3249, + "step": 7272 + }, + { + "epoch": 0.7274, + "grad_norm": 1.0361886024475098, + "learning_rate": 4.19581259587474e-06, + "loss": 0.0476, + "step": 7274 + }, + { + "epoch": 0.7276, + "grad_norm": 1.1239997148513794, + "learning_rate": 4.190128997475402e-06, + "loss": 0.1325, + "step": 7276 + }, + { + "epoch": 0.7278, + "grad_norm": 7.582710266113281, + "learning_rate": 4.184448230736613e-06, + "loss": 0.3437, + "step": 7278 + }, + { + "epoch": 0.728, + "grad_norm": 5.733713150024414, + "learning_rate": 4.178770298427107e-06, + "loss": 0.3833, + "step": 7280 + }, + { + "epoch": 0.7282, + "grad_norm": 2.6746461391448975, + "learning_rate": 4.173095203314241e-06, + "loss": 0.3767, + "step": 7282 + }, + { + "epoch": 0.7284, + "grad_norm": 2.7200767993927, + "learning_rate": 4.167422948163986e-06, + "loss": 0.3634, + "step": 7284 + }, + { + "epoch": 0.7286, + "grad_norm": 10.713414192199707, + "learning_rate": 4.161753535740932e-06, + "loss": 0.5955, + "step": 7286 + }, + { + "epoch": 0.7288, + "grad_norm": 2.8930485248565674, + "learning_rate": 4.15608696880828e-06, + "loss": 0.1845, + "step": 7288 + }, + { + "epoch": 0.729, + "grad_norm": 4.219006538391113, + "learning_rate": 4.150423250127846e-06, + "loss": 0.2152, + "step": 7290 + }, + { + "epoch": 0.7292, + "grad_norm": 1.6658515930175781, + "learning_rate": 4.144762382460059e-06, + "loss": 0.0663, + "step": 7292 + }, + { + "epoch": 0.7294, + "grad_norm": 2.018024444580078, + "learning_rate": 4.1391043685639576e-06, + "loss": 0.0845, + "step": 7294 + }, + { + "epoch": 0.7296, + "grad_norm": 1.3039379119873047, + "learning_rate": 4.133449211197188e-06, + "loss": 0.1763, + "step": 7296 + }, + { + "epoch": 0.7298, + "grad_norm": 0.04561923071742058, + "learning_rate": 4.127796913116004e-06, + "loss": 0.4526, + "step": 7298 + }, + { + "epoch": 0.73, + "grad_norm": 0.16378910839557648, + "learning_rate": 4.12214747707527e-06, + "loss": 0.1248, + "step": 7300 + }, + { + "epoch": 0.7302, + "grad_norm": 3.0852584838867188, + "learning_rate": 4.1165009058284496e-06, + "loss": 0.1895, + "step": 7302 + }, + { + "epoch": 0.7304, + "grad_norm": 2.6093788146972656, + "learning_rate": 4.110857202127615e-06, + "loss": 0.6472, + "step": 7304 + }, + { + "epoch": 0.7306, + "grad_norm": 16.765865325927734, + "learning_rate": 4.105216368723437e-06, + "loss": 1.0097, + "step": 7306 + }, + { + "epoch": 0.7308, + "grad_norm": 0.8194018602371216, + "learning_rate": 4.099578408365192e-06, + "loss": 0.3287, + "step": 7308 + }, + { + "epoch": 0.731, + "grad_norm": 0.8269910216331482, + "learning_rate": 4.093943323800746e-06, + "loss": 0.2121, + "step": 7310 + }, + { + "epoch": 0.7312, + "grad_norm": 0.49718788266181946, + "learning_rate": 4.08831111777658e-06, + "loss": 0.0862, + "step": 7312 + }, + { + "epoch": 0.7314, + "grad_norm": 5.906735897064209, + "learning_rate": 4.08268179303776e-06, + "loss": 0.2519, + "step": 7314 + }, + { + "epoch": 0.7316, + "grad_norm": 5.983996868133545, + "learning_rate": 4.0770553523279535e-06, + "loss": 0.196, + "step": 7316 + }, + { + "epoch": 0.7318, + "grad_norm": 3.696990966796875, + "learning_rate": 4.071431798389408e-06, + "loss": 0.0676, + "step": 7318 + }, + { + "epoch": 0.732, + "grad_norm": 0.17371319234371185, + "learning_rate": 4.065811133962987e-06, + "loss": 0.0328, + "step": 7320 + }, + { + "epoch": 0.7322, + "grad_norm": 6.114076614379883, + "learning_rate": 4.06019336178813e-06, + "loss": 0.3351, + "step": 7322 + }, + { + "epoch": 0.7324, + "grad_norm": 2.1939380168914795, + "learning_rate": 4.05457848460287e-06, + "loss": 0.1325, + "step": 7324 + }, + { + "epoch": 0.7326, + "grad_norm": 5.219820976257324, + "learning_rate": 4.048966505143831e-06, + "loss": 0.2921, + "step": 7326 + }, + { + "epoch": 0.7328, + "grad_norm": 1.6827659606933594, + "learning_rate": 4.04335742614622e-06, + "loss": 0.2882, + "step": 7328 + }, + { + "epoch": 0.733, + "grad_norm": 1.075720191001892, + "learning_rate": 4.037751250343841e-06, + "loss": 0.3421, + "step": 7330 + }, + { + "epoch": 0.7332, + "grad_norm": 8.861807823181152, + "learning_rate": 4.032147980469072e-06, + "loss": 0.5202, + "step": 7332 + }, + { + "epoch": 0.7334, + "grad_norm": 0.703471839427948, + "learning_rate": 4.026547619252883e-06, + "loss": 0.1274, + "step": 7334 + }, + { + "epoch": 0.7336, + "grad_norm": 3.5199646949768066, + "learning_rate": 4.020950169424815e-06, + "loss": 0.1508, + "step": 7336 + }, + { + "epoch": 0.7338, + "grad_norm": 9.400798797607422, + "learning_rate": 4.015355633712996e-06, + "loss": 0.4051, + "step": 7338 + }, + { + "epoch": 0.734, + "grad_norm": 0.41200411319732666, + "learning_rate": 4.009764014844143e-06, + "loss": 0.0663, + "step": 7340 + }, + { + "epoch": 0.7342, + "grad_norm": 1.5544410943984985, + "learning_rate": 4.004175315543538e-06, + "loss": 0.1237, + "step": 7342 + }, + { + "epoch": 0.7344, + "grad_norm": 0.5806523561477661, + "learning_rate": 3.998589538535046e-06, + "loss": 0.8079, + "step": 7344 + }, + { + "epoch": 0.7346, + "grad_norm": 3.1126832962036133, + "learning_rate": 3.993006686541108e-06, + "loss": 0.246, + "step": 7346 + }, + { + "epoch": 0.7348, + "grad_norm": 1.1309667825698853, + "learning_rate": 3.987426762282733e-06, + "loss": 0.0722, + "step": 7348 + }, + { + "epoch": 0.735, + "grad_norm": 0.7706969976425171, + "learning_rate": 3.981849768479516e-06, + "loss": 0.0369, + "step": 7350 + }, + { + "epoch": 0.7352, + "grad_norm": 1.772402048110962, + "learning_rate": 3.976275707849616e-06, + "loss": 0.1417, + "step": 7352 + }, + { + "epoch": 0.7354, + "grad_norm": 8.549421310424805, + "learning_rate": 3.970704583109755e-06, + "loss": 0.6583, + "step": 7354 + }, + { + "epoch": 0.7356, + "grad_norm": 6.193266868591309, + "learning_rate": 3.965136396975235e-06, + "loss": 0.3443, + "step": 7356 + }, + { + "epoch": 0.7358, + "grad_norm": 5.833011150360107, + "learning_rate": 3.959571152159922e-06, + "loss": 0.3261, + "step": 7358 + }, + { + "epoch": 0.736, + "grad_norm": 5.78825044631958, + "learning_rate": 3.954008851376252e-06, + "loss": 0.6156, + "step": 7360 + }, + { + "epoch": 0.7362, + "grad_norm": 0.9694762229919434, + "learning_rate": 3.94844949733522e-06, + "loss": 0.2691, + "step": 7362 + }, + { + "epoch": 0.7364, + "grad_norm": 3.630249261856079, + "learning_rate": 3.942893092746387e-06, + "loss": 0.2774, + "step": 7364 + }, + { + "epoch": 0.7366, + "grad_norm": 1.5751121044158936, + "learning_rate": 3.937339640317879e-06, + "loss": 0.2635, + "step": 7366 + }, + { + "epoch": 0.7368, + "grad_norm": 2.2820370197296143, + "learning_rate": 3.931789142756377e-06, + "loss": 0.1373, + "step": 7368 + }, + { + "epoch": 0.737, + "grad_norm": 1.4606951475143433, + "learning_rate": 3.9262416027671354e-06, + "loss": 0.2637, + "step": 7370 + }, + { + "epoch": 0.7372, + "grad_norm": 4.829554557800293, + "learning_rate": 3.920697023053949e-06, + "loss": 0.1562, + "step": 7372 + }, + { + "epoch": 0.7374, + "grad_norm": 3.779395341873169, + "learning_rate": 3.915155406319181e-06, + "loss": 0.1847, + "step": 7374 + }, + { + "epoch": 0.7376, + "grad_norm": 0.2849827706813812, + "learning_rate": 3.9096167552637454e-06, + "loss": 0.0205, + "step": 7376 + }, + { + "epoch": 0.7378, + "grad_norm": 5.890630722045898, + "learning_rate": 3.90408107258712e-06, + "loss": 0.1962, + "step": 7378 + }, + { + "epoch": 0.738, + "grad_norm": 5.627165794372559, + "learning_rate": 3.898548360987325e-06, + "loss": 0.2038, + "step": 7380 + }, + { + "epoch": 0.7382, + "grad_norm": 0.3232470154762268, + "learning_rate": 3.893018623160938e-06, + "loss": 0.202, + "step": 7382 + }, + { + "epoch": 0.7384, + "grad_norm": 2.35629940032959, + "learning_rate": 3.887491861803085e-06, + "loss": 0.3088, + "step": 7384 + }, + { + "epoch": 0.7386, + "grad_norm": 2.417980909347534, + "learning_rate": 3.88196807960744e-06, + "loss": 0.1067, + "step": 7386 + }, + { + "epoch": 0.7388, + "grad_norm": 1.5987173318862915, + "learning_rate": 3.876447279266238e-06, + "loss": 0.0989, + "step": 7388 + }, + { + "epoch": 0.739, + "grad_norm": 0.7247028350830078, + "learning_rate": 3.8709294634702374e-06, + "loss": 0.0273, + "step": 7390 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3843022584915161, + "learning_rate": 3.86541463490876e-06, + "loss": 0.0423, + "step": 7392 + }, + { + "epoch": 0.7394, + "grad_norm": 0.4488237500190735, + "learning_rate": 3.859902796269664e-06, + "loss": 0.1251, + "step": 7394 + }, + { + "epoch": 0.7396, + "grad_norm": 5.436135768890381, + "learning_rate": 3.854393950239356e-06, + "loss": 0.5959, + "step": 7396 + }, + { + "epoch": 0.7398, + "grad_norm": 0.24551723897457123, + "learning_rate": 3.848888099502779e-06, + "loss": 0.0573, + "step": 7398 + }, + { + "epoch": 0.74, + "grad_norm": 1.992569088935852, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.1945, + "step": 7400 + }, + { + "epoch": 0.7402, + "grad_norm": 0.3705749213695526, + "learning_rate": 3.8378853946432956e-06, + "loss": 0.0615, + "step": 7402 + }, + { + "epoch": 0.7404, + "grad_norm": 5.893743515014648, + "learning_rate": 3.832388545882975e-06, + "loss": 0.3697, + "step": 7404 + }, + { + "epoch": 0.7406, + "grad_norm": 3.2105772495269775, + "learning_rate": 3.826894703141552e-06, + "loss": 0.2225, + "step": 7406 + }, + { + "epoch": 0.7408, + "grad_norm": 3.265939474105835, + "learning_rate": 3.821403869096658e-06, + "loss": 0.1254, + "step": 7408 + }, + { + "epoch": 0.741, + "grad_norm": 0.13449662923812866, + "learning_rate": 3.81591604642446e-06, + "loss": 0.033, + "step": 7410 + }, + { + "epoch": 0.7412, + "grad_norm": 4.905856132507324, + "learning_rate": 3.810431237799657e-06, + "loss": 0.2463, + "step": 7412 + }, + { + "epoch": 0.7414, + "grad_norm": 5.741507053375244, + "learning_rate": 3.804949445895473e-06, + "loss": 0.5184, + "step": 7414 + }, + { + "epoch": 0.7416, + "grad_norm": 4.996687412261963, + "learning_rate": 3.7994706733836738e-06, + "loss": 0.4688, + "step": 7416 + }, + { + "epoch": 0.7418, + "grad_norm": 3.5090458393096924, + "learning_rate": 3.793994922934544e-06, + "loss": 0.2623, + "step": 7418 + }, + { + "epoch": 0.742, + "grad_norm": 5.09272575378418, + "learning_rate": 3.7885221972168974e-06, + "loss": 0.1697, + "step": 7420 + }, + { + "epoch": 0.7422, + "grad_norm": 8.586466789245605, + "learning_rate": 3.783052498898073e-06, + "loss": 0.6702, + "step": 7422 + }, + { + "epoch": 0.7424, + "grad_norm": 3.0619256496429443, + "learning_rate": 3.7775858306439374e-06, + "loss": 0.0882, + "step": 7424 + }, + { + "epoch": 0.7426, + "grad_norm": 0.3999852240085602, + "learning_rate": 3.772122195118877e-06, + "loss": 0.0406, + "step": 7426 + }, + { + "epoch": 0.7428, + "grad_norm": 1.1625382900238037, + "learning_rate": 3.766661594985801e-06, + "loss": 0.4262, + "step": 7428 + }, + { + "epoch": 0.743, + "grad_norm": 2.7032430171966553, + "learning_rate": 3.7612040329061405e-06, + "loss": 0.2149, + "step": 7430 + }, + { + "epoch": 0.7432, + "grad_norm": 3.90956974029541, + "learning_rate": 3.7557495115398446e-06, + "loss": 0.235, + "step": 7432 + }, + { + "epoch": 0.7434, + "grad_norm": 0.5231888294219971, + "learning_rate": 3.7502980335453777e-06, + "loss": 0.0234, + "step": 7434 + }, + { + "epoch": 0.7436, + "grad_norm": 3.803154230117798, + "learning_rate": 3.7448496015797296e-06, + "loss": 0.1961, + "step": 7436 + }, + { + "epoch": 0.7438, + "grad_norm": 0.6651385426521301, + "learning_rate": 3.7394042182983983e-06, + "loss": 0.056, + "step": 7438 + }, + { + "epoch": 0.744, + "grad_norm": 3.2893691062927246, + "learning_rate": 3.7339618863553983e-06, + "loss": 0.1413, + "step": 7440 + }, + { + "epoch": 0.7442, + "grad_norm": 5.508923530578613, + "learning_rate": 3.728522608403249e-06, + "loss": 0.2369, + "step": 7442 + }, + { + "epoch": 0.7444, + "grad_norm": 1.8549280166625977, + "learning_rate": 3.723086387092997e-06, + "loss": 0.0925, + "step": 7444 + }, + { + "epoch": 0.7446, + "grad_norm": 2.6159822940826416, + "learning_rate": 3.7176532250741857e-06, + "loss": 0.1264, + "step": 7446 + }, + { + "epoch": 0.7448, + "grad_norm": 1.460924744606018, + "learning_rate": 3.7122231249948747e-06, + "loss": 0.3205, + "step": 7448 + }, + { + "epoch": 0.745, + "grad_norm": 9.486297607421875, + "learning_rate": 3.7067960895016277e-06, + "loss": 0.2925, + "step": 7450 + }, + { + "epoch": 0.7452, + "grad_norm": 12.142578125, + "learning_rate": 3.7013721212395128e-06, + "loss": 1.0319, + "step": 7452 + }, + { + "epoch": 0.7454, + "grad_norm": 4.772895812988281, + "learning_rate": 3.6959512228521123e-06, + "loss": 0.2794, + "step": 7454 + }, + { + "epoch": 0.7456, + "grad_norm": 3.2273812294006348, + "learning_rate": 3.6905333969815038e-06, + "loss": 0.116, + "step": 7456 + }, + { + "epoch": 0.7458, + "grad_norm": 3.9867875576019287, + "learning_rate": 3.685118646268272e-06, + "loss": 0.5447, + "step": 7458 + }, + { + "epoch": 0.746, + "grad_norm": 4.0842695236206055, + "learning_rate": 3.679706973351491e-06, + "loss": 0.3256, + "step": 7460 + }, + { + "epoch": 0.7462, + "grad_norm": 0.20465590059757233, + "learning_rate": 3.674298380868756e-06, + "loss": 0.2121, + "step": 7462 + }, + { + "epoch": 0.7464, + "grad_norm": 2.601588249206543, + "learning_rate": 3.6688928714561444e-06, + "loss": 0.0735, + "step": 7464 + }, + { + "epoch": 0.7466, + "grad_norm": 2.6060516834259033, + "learning_rate": 3.663490447748236e-06, + "loss": 0.1846, + "step": 7466 + }, + { + "epoch": 0.7468, + "grad_norm": 7.137617111206055, + "learning_rate": 3.658091112378106e-06, + "loss": 0.4228, + "step": 7468 + }, + { + "epoch": 0.747, + "grad_norm": 1.635891318321228, + "learning_rate": 3.6526948679773256e-06, + "loss": 0.0652, + "step": 7470 + }, + { + "epoch": 0.7472, + "grad_norm": 1.0558104515075684, + "learning_rate": 3.6473017171759563e-06, + "loss": 0.2608, + "step": 7472 + }, + { + "epoch": 0.7474, + "grad_norm": 4.349795341491699, + "learning_rate": 3.6419116626025585e-06, + "loss": 0.2159, + "step": 7474 + }, + { + "epoch": 0.7476, + "grad_norm": 2.1088030338287354, + "learning_rate": 3.636524706884181e-06, + "loss": 0.0887, + "step": 7476 + }, + { + "epoch": 0.7478, + "grad_norm": 3.063671350479126, + "learning_rate": 3.6311408526463554e-06, + "loss": 0.2046, + "step": 7478 + }, + { + "epoch": 0.748, + "grad_norm": 5.153581142425537, + "learning_rate": 3.625760102513103e-06, + "loss": 0.5587, + "step": 7480 + }, + { + "epoch": 0.7482, + "grad_norm": 1.8185757398605347, + "learning_rate": 3.620382459106946e-06, + "loss": 0.0693, + "step": 7482 + }, + { + "epoch": 0.7484, + "grad_norm": 2.1349024772644043, + "learning_rate": 3.615007925048878e-06, + "loss": 0.1603, + "step": 7484 + }, + { + "epoch": 0.7486, + "grad_norm": 3.968214511871338, + "learning_rate": 3.6096365029583803e-06, + "loss": 0.0917, + "step": 7486 + }, + { + "epoch": 0.7488, + "grad_norm": 13.594572067260742, + "learning_rate": 3.604268195453421e-06, + "loss": 0.8473, + "step": 7488 + }, + { + "epoch": 0.749, + "grad_norm": 0.21978281438350677, + "learning_rate": 3.598903005150444e-06, + "loss": 0.0538, + "step": 7490 + }, + { + "epoch": 0.7492, + "grad_norm": 3.5831823348999023, + "learning_rate": 3.5935409346643835e-06, + "loss": 0.5994, + "step": 7492 + }, + { + "epoch": 0.7494, + "grad_norm": 0.9887539744377136, + "learning_rate": 3.5881819866086485e-06, + "loss": 0.1037, + "step": 7494 + }, + { + "epoch": 0.7496, + "grad_norm": 2.43330979347229, + "learning_rate": 3.582826163595119e-06, + "loss": 0.3313, + "step": 7496 + }, + { + "epoch": 0.7498, + "grad_norm": 1.0807483196258545, + "learning_rate": 3.5774734682341563e-06, + "loss": 0.0565, + "step": 7498 + }, + { + "epoch": 0.75, + "grad_norm": 3.955461263656616, + "learning_rate": 3.5721239031346067e-06, + "loss": 0.252, + "step": 7500 + }, + { + "epoch": 0.7502, + "grad_norm": 7.01019811630249, + "learning_rate": 3.5667774709037804e-06, + "loss": 0.2712, + "step": 7502 + }, + { + "epoch": 0.7504, + "grad_norm": 1.899630069732666, + "learning_rate": 3.5614341741474633e-06, + "loss": 0.25, + "step": 7504 + }, + { + "epoch": 0.7506, + "grad_norm": 6.450450420379639, + "learning_rate": 3.5560940154699133e-06, + "loss": 0.3653, + "step": 7506 + }, + { + "epoch": 0.7508, + "grad_norm": 3.4716596603393555, + "learning_rate": 3.5507569974738575e-06, + "loss": 0.1413, + "step": 7508 + }, + { + "epoch": 0.751, + "grad_norm": 2.042548656463623, + "learning_rate": 3.545423122760493e-06, + "loss": 0.253, + "step": 7510 + }, + { + "epoch": 0.7512, + "grad_norm": 4.139739513397217, + "learning_rate": 3.540092393929494e-06, + "loss": 0.3048, + "step": 7512 + }, + { + "epoch": 0.7514, + "grad_norm": 4.704060077667236, + "learning_rate": 3.5347648135789823e-06, + "loss": 0.1582, + "step": 7514 + }, + { + "epoch": 0.7516, + "grad_norm": 3.7871150970458984, + "learning_rate": 3.5294403843055604e-06, + "loss": 0.3029, + "step": 7516 + }, + { + "epoch": 0.7518, + "grad_norm": 3.179260015487671, + "learning_rate": 3.524119108704286e-06, + "loss": 0.3322, + "step": 7518 + }, + { + "epoch": 0.752, + "grad_norm": 1.6317273378372192, + "learning_rate": 3.5188009893686916e-06, + "loss": 0.0705, + "step": 7520 + }, + { + "epoch": 0.7522, + "grad_norm": 3.757275342941284, + "learning_rate": 3.5134860288907602e-06, + "loss": 0.2243, + "step": 7522 + }, + { + "epoch": 0.7524, + "grad_norm": 6.376817226409912, + "learning_rate": 3.50817422986094e-06, + "loss": 0.3426, + "step": 7524 + }, + { + "epoch": 0.7526, + "grad_norm": 4.533647060394287, + "learning_rate": 3.502865594868136e-06, + "loss": 0.2345, + "step": 7526 + }, + { + "epoch": 0.7528, + "grad_norm": 6.27448034286499, + "learning_rate": 3.4975601264997094e-06, + "loss": 0.2772, + "step": 7528 + }, + { + "epoch": 0.753, + "grad_norm": 0.7733517289161682, + "learning_rate": 3.492257827341492e-06, + "loss": 0.5571, + "step": 7530 + }, + { + "epoch": 0.7532, + "grad_norm": 2.7193760871887207, + "learning_rate": 3.4869586999777492e-06, + "loss": 0.253, + "step": 7532 + }, + { + "epoch": 0.7534, + "grad_norm": 1.0692601203918457, + "learning_rate": 3.4816627469912147e-06, + "loss": 0.2663, + "step": 7534 + }, + { + "epoch": 0.7536, + "grad_norm": 1.4887052774429321, + "learning_rate": 3.476369970963072e-06, + "loss": 0.0625, + "step": 7536 + }, + { + "epoch": 0.7538, + "grad_norm": 0.2186392843723297, + "learning_rate": 3.4710803744729517e-06, + "loss": 0.1661, + "step": 7538 + }, + { + "epoch": 0.754, + "grad_norm": 11.471294403076172, + "learning_rate": 3.4657939600989453e-06, + "loss": 0.3028, + "step": 7540 + }, + { + "epoch": 0.7542, + "grad_norm": 7.724924564361572, + "learning_rate": 3.4605107304175855e-06, + "loss": 0.3071, + "step": 7542 + }, + { + "epoch": 0.7544, + "grad_norm": 0.25846266746520996, + "learning_rate": 3.455230688003852e-06, + "loss": 0.3382, + "step": 7544 + }, + { + "epoch": 0.7546, + "grad_norm": 3.6899945735931396, + "learning_rate": 3.4499538354311757e-06, + "loss": 0.1519, + "step": 7546 + }, + { + "epoch": 0.7548, + "grad_norm": 0.47127366065979004, + "learning_rate": 3.4446801752714287e-06, + "loss": 0.7449, + "step": 7548 + }, + { + "epoch": 0.755, + "grad_norm": 6.568058013916016, + "learning_rate": 3.4394097100949286e-06, + "loss": 0.5588, + "step": 7550 + }, + { + "epoch": 0.7552, + "grad_norm": 3.423339366912842, + "learning_rate": 3.4341424424704373e-06, + "loss": 0.246, + "step": 7552 + }, + { + "epoch": 0.7554, + "grad_norm": 5.854997158050537, + "learning_rate": 3.4288783749651568e-06, + "loss": 0.2707, + "step": 7554 + }, + { + "epoch": 0.7556, + "grad_norm": 1.2402218580245972, + "learning_rate": 3.4236175101447265e-06, + "loss": 0.1549, + "step": 7556 + }, + { + "epoch": 0.7558, + "grad_norm": 2.647789239883423, + "learning_rate": 3.418359850573234e-06, + "loss": 0.1877, + "step": 7558 + }, + { + "epoch": 0.756, + "grad_norm": 0.24767683446407318, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.2346, + "step": 7560 + }, + { + "epoch": 0.7562, + "grad_norm": 1.0218244791030884, + "learning_rate": 3.4078541574255664e-06, + "loss": 0.0422, + "step": 7562 + }, + { + "epoch": 0.7564, + "grad_norm": 0.5697231888771057, + "learning_rate": 3.4026061289697397e-06, + "loss": 0.0355, + "step": 7564 + }, + { + "epoch": 0.7566, + "grad_norm": 3.8422534465789795, + "learning_rate": 3.397361316003539e-06, + "loss": 0.2972, + "step": 7566 + }, + { + "epoch": 0.7568, + "grad_norm": 1.0109690427780151, + "learning_rate": 3.3921197210832235e-06, + "loss": 0.0391, + "step": 7568 + }, + { + "epoch": 0.757, + "grad_norm": 0.33508145809173584, + "learning_rate": 3.3868813467634833e-06, + "loss": 0.2207, + "step": 7570 + }, + { + "epoch": 0.7572, + "grad_norm": 1.0386849641799927, + "learning_rate": 3.381646195597437e-06, + "loss": 0.1039, + "step": 7572 + }, + { + "epoch": 0.7574, + "grad_norm": 2.322230100631714, + "learning_rate": 3.376414270136633e-06, + "loss": 0.1012, + "step": 7574 + }, + { + "epoch": 0.7576, + "grad_norm": 1.7650905847549438, + "learning_rate": 3.3711855729310482e-06, + "loss": 0.1339, + "step": 7576 + }, + { + "epoch": 0.7578, + "grad_norm": 1.1933482885360718, + "learning_rate": 3.3659601065290893e-06, + "loss": 0.0957, + "step": 7578 + }, + { + "epoch": 0.758, + "grad_norm": 7.438645362854004, + "learning_rate": 3.360737873477584e-06, + "loss": 0.4709, + "step": 7580 + }, + { + "epoch": 0.7582, + "grad_norm": 4.3193206787109375, + "learning_rate": 3.355518876321787e-06, + "loss": 0.2567, + "step": 7582 + }, + { + "epoch": 0.7584, + "grad_norm": 3.5236244201660156, + "learning_rate": 3.3503031176053657e-06, + "loss": 0.2556, + "step": 7584 + }, + { + "epoch": 0.7586, + "grad_norm": 2.769192695617676, + "learning_rate": 3.3450905998704274e-06, + "loss": 0.116, + "step": 7586 + }, + { + "epoch": 0.7588, + "grad_norm": 2.10605788230896, + "learning_rate": 3.3398813256574847e-06, + "loss": 0.0803, + "step": 7588 + }, + { + "epoch": 0.759, + "grad_norm": 1.3237371444702148, + "learning_rate": 3.3346752975054763e-06, + "loss": 0.1382, + "step": 7590 + }, + { + "epoch": 0.7592, + "grad_norm": 4.293743133544922, + "learning_rate": 3.3294725179517573e-06, + "loss": 0.4597, + "step": 7592 + }, + { + "epoch": 0.7594, + "grad_norm": 1.6884580850601196, + "learning_rate": 3.3242729895320945e-06, + "loss": 0.3316, + "step": 7594 + }, + { + "epoch": 0.7596, + "grad_norm": 1.9161299467086792, + "learning_rate": 3.3190767147806825e-06, + "loss": 0.144, + "step": 7596 + }, + { + "epoch": 0.7598, + "grad_norm": 4.738112449645996, + "learning_rate": 3.3138836962301192e-06, + "loss": 1.066, + "step": 7598 + }, + { + "epoch": 0.76, + "grad_norm": 7.374011039733887, + "learning_rate": 3.308693936411421e-06, + "loss": 0.4288, + "step": 7600 + }, + { + "epoch": 0.7602, + "grad_norm": 3.388742446899414, + "learning_rate": 3.3035074378540087e-06, + "loss": 0.2868, + "step": 7602 + }, + { + "epoch": 0.7604, + "grad_norm": 2.061094284057617, + "learning_rate": 3.2983242030857177e-06, + "loss": 0.0958, + "step": 7604 + }, + { + "epoch": 0.7606, + "grad_norm": 0.68804532289505, + "learning_rate": 3.2931442346328e-06, + "loss": 0.0963, + "step": 7606 + }, + { + "epoch": 0.7608, + "grad_norm": 4.723433494567871, + "learning_rate": 3.287967535019908e-06, + "loss": 0.27, + "step": 7608 + }, + { + "epoch": 0.761, + "grad_norm": 7.151556968688965, + "learning_rate": 3.2827941067700996e-06, + "loss": 0.2443, + "step": 7610 + }, + { + "epoch": 0.7612, + "grad_norm": 2.5145318508148193, + "learning_rate": 3.2776239524048426e-06, + "loss": 0.0949, + "step": 7612 + }, + { + "epoch": 0.7614, + "grad_norm": 1.1050833463668823, + "learning_rate": 3.272457074444003e-06, + "loss": 0.0723, + "step": 7614 + }, + { + "epoch": 0.7616, + "grad_norm": 0.25515884160995483, + "learning_rate": 3.2672934754058615e-06, + "loss": 0.0381, + "step": 7616 + }, + { + "epoch": 0.7618, + "grad_norm": 4.879878997802734, + "learning_rate": 3.2621331578070936e-06, + "loss": 0.3514, + "step": 7618 + }, + { + "epoch": 0.762, + "grad_norm": 7.729321479797363, + "learning_rate": 3.2569761241627694e-06, + "loss": 0.1847, + "step": 7620 + }, + { + "epoch": 0.7622, + "grad_norm": 0.5615931749343872, + "learning_rate": 3.2518223769863633e-06, + "loss": 0.147, + "step": 7622 + }, + { + "epoch": 0.7624, + "grad_norm": 2.997455358505249, + "learning_rate": 3.2466719187897555e-06, + "loss": 0.2631, + "step": 7624 + }, + { + "epoch": 0.7626, + "grad_norm": 5.220663070678711, + "learning_rate": 3.241524752083215e-06, + "loss": 0.5106, + "step": 7626 + }, + { + "epoch": 0.7628, + "grad_norm": 0.9151387810707092, + "learning_rate": 3.2363808793754082e-06, + "loss": 0.0723, + "step": 7628 + }, + { + "epoch": 0.763, + "grad_norm": 0.4029538333415985, + "learning_rate": 3.2312403031733943e-06, + "loss": 0.5185, + "step": 7630 + }, + { + "epoch": 0.7632, + "grad_norm": 0.1734284609556198, + "learning_rate": 3.2261030259826287e-06, + "loss": 0.0105, + "step": 7632 + }, + { + "epoch": 0.7634, + "grad_norm": 0.36074119806289673, + "learning_rate": 3.2209690503069545e-06, + "loss": 0.2006, + "step": 7634 + }, + { + "epoch": 0.7636, + "grad_norm": 0.17010685801506042, + "learning_rate": 3.2158383786486204e-06, + "loss": 0.0518, + "step": 7636 + }, + { + "epoch": 0.7638, + "grad_norm": 1.4421687126159668, + "learning_rate": 3.210711013508242e-06, + "loss": 0.0705, + "step": 7638 + }, + { + "epoch": 0.764, + "grad_norm": 4.090760231018066, + "learning_rate": 3.2055869573848374e-06, + "loss": 0.3086, + "step": 7640 + }, + { + "epoch": 0.7642, + "grad_norm": 3.3188939094543457, + "learning_rate": 3.200466212775808e-06, + "loss": 0.222, + "step": 7642 + }, + { + "epoch": 0.7644, + "grad_norm": 5.38986349105835, + "learning_rate": 3.195348782176948e-06, + "loss": 0.2957, + "step": 7644 + }, + { + "epoch": 0.7646, + "grad_norm": 4.537854194641113, + "learning_rate": 3.190234668082427e-06, + "loss": 0.1945, + "step": 7646 + }, + { + "epoch": 0.7648, + "grad_norm": 2.830826997756958, + "learning_rate": 3.1851238729848033e-06, + "loss": 0.4799, + "step": 7648 + }, + { + "epoch": 0.765, + "grad_norm": 0.488627165555954, + "learning_rate": 3.1800163993750166e-06, + "loss": 0.039, + "step": 7650 + }, + { + "epoch": 0.7652, + "grad_norm": 5.108675003051758, + "learning_rate": 3.174912249742382e-06, + "loss": 0.507, + "step": 7652 + }, + { + "epoch": 0.7654, + "grad_norm": 2.660773515701294, + "learning_rate": 3.1698114265746126e-06, + "loss": 0.3272, + "step": 7654 + }, + { + "epoch": 0.7656, + "grad_norm": 0.19988323748111725, + "learning_rate": 3.164713932357776e-06, + "loss": 0.03, + "step": 7656 + }, + { + "epoch": 0.7658, + "grad_norm": 4.397059917449951, + "learning_rate": 3.159619769576333e-06, + "loss": 0.2799, + "step": 7658 + }, + { + "epoch": 0.766, + "grad_norm": 1.7151198387145996, + "learning_rate": 3.1545289407131128e-06, + "loss": 0.2152, + "step": 7660 + }, + { + "epoch": 0.7662, + "grad_norm": 12.055477142333984, + "learning_rate": 3.149441448249331e-06, + "loss": 0.3206, + "step": 7662 + }, + { + "epoch": 0.7664, + "grad_norm": 1.3097885847091675, + "learning_rate": 3.144357294664565e-06, + "loss": 0.0675, + "step": 7664 + }, + { + "epoch": 0.7666, + "grad_norm": 6.521742820739746, + "learning_rate": 3.1392764824367706e-06, + "loss": 0.4178, + "step": 7666 + }, + { + "epoch": 0.7668, + "grad_norm": 1.9586526155471802, + "learning_rate": 3.134199014042274e-06, + "loss": 0.0519, + "step": 7668 + }, + { + "epoch": 0.767, + "grad_norm": 3.467979907989502, + "learning_rate": 3.1291248919557717e-06, + "loss": 0.507, + "step": 7670 + }, + { + "epoch": 0.7672, + "grad_norm": 1.622187852859497, + "learning_rate": 3.124054118650327e-06, + "loss": 0.0758, + "step": 7672 + }, + { + "epoch": 0.7674, + "grad_norm": 0.5499117970466614, + "learning_rate": 3.118986696597377e-06, + "loss": 0.3218, + "step": 7674 + }, + { + "epoch": 0.7676, + "grad_norm": 5.223560810089111, + "learning_rate": 3.113922628266718e-06, + "loss": 0.1011, + "step": 7676 + }, + { + "epoch": 0.7678, + "grad_norm": 6.723410129547119, + "learning_rate": 3.108861916126518e-06, + "loss": 0.5974, + "step": 7678 + }, + { + "epoch": 0.768, + "grad_norm": 2.7158892154693604, + "learning_rate": 3.103804562643302e-06, + "loss": 0.8255, + "step": 7680 + }, + { + "epoch": 0.7682, + "grad_norm": 1.3051215410232544, + "learning_rate": 3.0987505702819687e-06, + "loss": 0.1375, + "step": 7682 + }, + { + "epoch": 0.7684, + "grad_norm": 7.147759914398193, + "learning_rate": 3.0936999415057712e-06, + "loss": 0.3297, + "step": 7684 + }, + { + "epoch": 0.7686, + "grad_norm": 0.43112847208976746, + "learning_rate": 3.0886526787763237e-06, + "loss": 0.0791, + "step": 7686 + }, + { + "epoch": 0.7688, + "grad_norm": 1.9429917335510254, + "learning_rate": 3.0836087845536e-06, + "loss": 0.0857, + "step": 7688 + }, + { + "epoch": 0.769, + "grad_norm": 0.15028110146522522, + "learning_rate": 3.0785682612959334e-06, + "loss": 0.2128, + "step": 7690 + }, + { + "epoch": 0.7692, + "grad_norm": 5.368011951446533, + "learning_rate": 3.073531111460013e-06, + "loss": 0.289, + "step": 7692 + }, + { + "epoch": 0.7694, + "grad_norm": 1.1854615211486816, + "learning_rate": 3.0684973375008865e-06, + "loss": 0.4659, + "step": 7694 + }, + { + "epoch": 0.7696, + "grad_norm": 1.3451170921325684, + "learning_rate": 3.063466941871952e-06, + "loss": 0.2533, + "step": 7696 + }, + { + "epoch": 0.7698, + "grad_norm": 1.4906580448150635, + "learning_rate": 3.058439927024962e-06, + "loss": 0.2369, + "step": 7698 + }, + { + "epoch": 0.77, + "grad_norm": 6.6543288230896, + "learning_rate": 3.0534162954100264e-06, + "loss": 0.2301, + "step": 7700 + }, + { + "epoch": 0.7702, + "grad_norm": 1.2466717958450317, + "learning_rate": 3.0483960494756017e-06, + "loss": 0.1669, + "step": 7702 + }, + { + "epoch": 0.7704, + "grad_norm": 2.087963104248047, + "learning_rate": 3.043379191668492e-06, + "loss": 0.0867, + "step": 7704 + }, + { + "epoch": 0.7706, + "grad_norm": 6.269555568695068, + "learning_rate": 3.038365724433858e-06, + "loss": 0.3828, + "step": 7706 + }, + { + "epoch": 0.7708, + "grad_norm": 0.803800642490387, + "learning_rate": 3.033355650215193e-06, + "loss": 0.2073, + "step": 7708 + }, + { + "epoch": 0.771, + "grad_norm": 2.7078990936279297, + "learning_rate": 3.028348971454356e-06, + "loss": 0.3627, + "step": 7710 + }, + { + "epoch": 0.7712, + "grad_norm": 7.800130844116211, + "learning_rate": 3.023345690591537e-06, + "loss": 0.3513, + "step": 7712 + }, + { + "epoch": 0.7714, + "grad_norm": 8.936089515686035, + "learning_rate": 3.0183458100652752e-06, + "loss": 0.2995, + "step": 7714 + }, + { + "epoch": 0.7716, + "grad_norm": 3.8381824493408203, + "learning_rate": 3.013349332312451e-06, + "loss": 0.2555, + "step": 7716 + }, + { + "epoch": 0.7718, + "grad_norm": 0.9715508818626404, + "learning_rate": 3.008356259768285e-06, + "loss": 0.1394, + "step": 7718 + }, + { + "epoch": 0.772, + "grad_norm": 3.474364995956421, + "learning_rate": 3.003366594866345e-06, + "loss": 0.3261, + "step": 7720 + }, + { + "epoch": 0.7722, + "grad_norm": 0.5183166265487671, + "learning_rate": 2.9983803400385313e-06, + "loss": 0.0924, + "step": 7722 + }, + { + "epoch": 0.7724, + "grad_norm": 2.9176604747772217, + "learning_rate": 2.993397497715086e-06, + "loss": 0.3314, + "step": 7724 + }, + { + "epoch": 0.7726, + "grad_norm": 0.30745816230773926, + "learning_rate": 2.988418070324577e-06, + "loss": 0.0895, + "step": 7726 + }, + { + "epoch": 0.7728, + "grad_norm": 1.3748500347137451, + "learning_rate": 2.983442060293926e-06, + "loss": 0.2188, + "step": 7728 + }, + { + "epoch": 0.773, + "grad_norm": 6.958530426025391, + "learning_rate": 2.978469470048376e-06, + "loss": 0.2685, + "step": 7730 + }, + { + "epoch": 0.7732, + "grad_norm": 0.34124264121055603, + "learning_rate": 2.9735003020115095e-06, + "loss": 0.0231, + "step": 7732 + }, + { + "epoch": 0.7734, + "grad_norm": 2.9157497882843018, + "learning_rate": 2.968534558605236e-06, + "loss": 0.1051, + "step": 7734 + }, + { + "epoch": 0.7736, + "grad_norm": 1.6747865676879883, + "learning_rate": 2.963572242249799e-06, + "loss": 0.0955, + "step": 7736 + }, + { + "epoch": 0.7738, + "grad_norm": 3.752796173095703, + "learning_rate": 2.9586133553637687e-06, + "loss": 0.4489, + "step": 7738 + }, + { + "epoch": 0.774, + "grad_norm": 1.3958207368850708, + "learning_rate": 2.953657900364053e-06, + "loss": 0.3089, + "step": 7740 + }, + { + "epoch": 0.7742, + "grad_norm": 9.147198677062988, + "learning_rate": 2.9487058796658785e-06, + "loss": 0.8566, + "step": 7742 + }, + { + "epoch": 0.7744, + "grad_norm": 3.1543757915496826, + "learning_rate": 2.9437572956827965e-06, + "loss": 0.2603, + "step": 7744 + }, + { + "epoch": 0.7746, + "grad_norm": 8.0980224609375, + "learning_rate": 2.938812150826684e-06, + "loss": 0.5841, + "step": 7746 + }, + { + "epoch": 0.7748, + "grad_norm": 1.2012873888015747, + "learning_rate": 2.9338704475077527e-06, + "loss": 0.1133, + "step": 7748 + }, + { + "epoch": 0.775, + "grad_norm": 1.4469947814941406, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.23, + "step": 7750 + }, + { + "epoch": 0.7752, + "grad_norm": 3.8727734088897705, + "learning_rate": 2.9239973751138495e-06, + "loss": 0.2771, + "step": 7752 + }, + { + "epoch": 0.7754, + "grad_norm": 5.018026351928711, + "learning_rate": 2.919066010850892e-06, + "loss": 0.265, + "step": 7754 + }, + { + "epoch": 0.7756, + "grad_norm": 10.791189193725586, + "learning_rate": 2.9141380977491373e-06, + "loss": 0.3409, + "step": 7756 + }, + { + "epoch": 0.7758, + "grad_norm": 0.5848069190979004, + "learning_rate": 2.9092136382103976e-06, + "loss": 0.1038, + "step": 7758 + }, + { + "epoch": 0.776, + "grad_norm": 0.9100140929222107, + "learning_rate": 2.9042926346347932e-06, + "loss": 0.1025, + "step": 7760 + }, + { + "epoch": 0.7762, + "grad_norm": 3.145876407623291, + "learning_rate": 2.8993750894207563e-06, + "loss": 0.2009, + "step": 7762 + }, + { + "epoch": 0.7764, + "grad_norm": 10.342663764953613, + "learning_rate": 2.8944610049650377e-06, + "loss": 0.1755, + "step": 7764 + }, + { + "epoch": 0.7766, + "grad_norm": 3.6500184535980225, + "learning_rate": 2.8895503836627105e-06, + "loss": 0.2499, + "step": 7766 + }, + { + "epoch": 0.7768, + "grad_norm": 3.66215443611145, + "learning_rate": 2.884643227907147e-06, + "loss": 0.2378, + "step": 7768 + }, + { + "epoch": 0.777, + "grad_norm": 6.356958866119385, + "learning_rate": 2.8797395400900362e-06, + "loss": 0.392, + "step": 7770 + }, + { + "epoch": 0.7772, + "grad_norm": 1.7171930074691772, + "learning_rate": 2.874839322601375e-06, + "loss": 0.0542, + "step": 7772 + }, + { + "epoch": 0.7774, + "grad_norm": 1.5658248662948608, + "learning_rate": 2.869942577829471e-06, + "loss": 0.1327, + "step": 7774 + }, + { + "epoch": 0.7776, + "grad_norm": 2.393611192703247, + "learning_rate": 2.8650493081609344e-06, + "loss": 0.1592, + "step": 7776 + }, + { + "epoch": 0.7778, + "grad_norm": 0.157082661986351, + "learning_rate": 2.860159515980695e-06, + "loss": 0.01, + "step": 7778 + }, + { + "epoch": 0.778, + "grad_norm": 0.13591240346431732, + "learning_rate": 2.855273203671969e-06, + "loss": 0.3548, + "step": 7780 + }, + { + "epoch": 0.7782, + "grad_norm": 5.553399562835693, + "learning_rate": 2.8503903736162876e-06, + "loss": 0.0906, + "step": 7782 + }, + { + "epoch": 0.7784, + "grad_norm": 8.80829906463623, + "learning_rate": 2.8455110281934804e-06, + "loss": 0.7296, + "step": 7784 + }, + { + "epoch": 0.7786, + "grad_norm": 2.5882441997528076, + "learning_rate": 2.840635169781688e-06, + "loss": 0.1067, + "step": 7786 + }, + { + "epoch": 0.7788, + "grad_norm": 6.0271525382995605, + "learning_rate": 2.8357628007573412e-06, + "loss": 0.5078, + "step": 7788 + }, + { + "epoch": 0.779, + "grad_norm": 4.135467529296875, + "learning_rate": 2.830893923495173e-06, + "loss": 0.286, + "step": 7790 + }, + { + "epoch": 0.7792, + "grad_norm": 1.4640638828277588, + "learning_rate": 2.8260285403682153e-06, + "loss": 0.1987, + "step": 7792 + }, + { + "epoch": 0.7794, + "grad_norm": 0.12559951841831207, + "learning_rate": 2.821166653747793e-06, + "loss": 0.0116, + "step": 7794 + }, + { + "epoch": 0.7796, + "grad_norm": 0.5013250708580017, + "learning_rate": 2.816308266003541e-06, + "loss": 0.2179, + "step": 7796 + }, + { + "epoch": 0.7798, + "grad_norm": 3.451265573501587, + "learning_rate": 2.8114533795033685e-06, + "loss": 0.216, + "step": 7798 + }, + { + "epoch": 0.78, + "grad_norm": 3.4470465183258057, + "learning_rate": 2.8066019966134907e-06, + "loss": 0.3379, + "step": 7800 + }, + { + "epoch": 0.7802, + "grad_norm": 7.247532367706299, + "learning_rate": 2.8017541196984144e-06, + "loss": 0.1299, + "step": 7802 + }, + { + "epoch": 0.7804, + "grad_norm": 0.3349640965461731, + "learning_rate": 2.796909751120931e-06, + "loss": 0.0615, + "step": 7804 + }, + { + "epoch": 0.7806, + "grad_norm": 4.7989115715026855, + "learning_rate": 2.7920688932421337e-06, + "loss": 0.1949, + "step": 7806 + }, + { + "epoch": 0.7808, + "grad_norm": 3.163360834121704, + "learning_rate": 2.7872315484213954e-06, + "loss": 0.195, + "step": 7808 + }, + { + "epoch": 0.781, + "grad_norm": 0.12293023616075516, + "learning_rate": 2.7823977190163788e-06, + "loss": 0.0105, + "step": 7810 + }, + { + "epoch": 0.7812, + "grad_norm": 3.708956718444824, + "learning_rate": 2.7775674073830337e-06, + "loss": 0.3693, + "step": 7812 + }, + { + "epoch": 0.7814, + "grad_norm": 0.28274333477020264, + "learning_rate": 2.7727406158755943e-06, + "loss": 0.2333, + "step": 7814 + }, + { + "epoch": 0.7816, + "grad_norm": 2.8027565479278564, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.096, + "step": 7816 + }, + { + "epoch": 0.7818, + "grad_norm": 1.210662603378296, + "learning_rate": 2.763097602646797e-06, + "loss": 0.1441, + "step": 7818 + }, + { + "epoch": 0.782, + "grad_norm": 2.714014768600464, + "learning_rate": 2.7582813856253276e-06, + "loss": 0.1896, + "step": 7820 + }, + { + "epoch": 0.7822, + "grad_norm": 0.1570204496383667, + "learning_rate": 2.7534686981295335e-06, + "loss": 0.0126, + "step": 7822 + }, + { + "epoch": 0.7824, + "grad_norm": 0.35673847794532776, + "learning_rate": 2.7486595425050667e-06, + "loss": 0.0182, + "step": 7824 + }, + { + "epoch": 0.7826, + "grad_norm": 0.13164272904396057, + "learning_rate": 2.7438539210958483e-06, + "loss": 0.3324, + "step": 7826 + }, + { + "epoch": 0.7828, + "grad_norm": 0.8464373350143433, + "learning_rate": 2.739051836244081e-06, + "loss": 0.085, + "step": 7828 + }, + { + "epoch": 0.783, + "grad_norm": 3.887058973312378, + "learning_rate": 2.7342532902902418e-06, + "loss": 0.2955, + "step": 7830 + }, + { + "epoch": 0.7832, + "grad_norm": 0.10574262589216232, + "learning_rate": 2.7294582855730835e-06, + "loss": 0.706, + "step": 7832 + }, + { + "epoch": 0.7834, + "grad_norm": 0.8982234001159668, + "learning_rate": 2.7246668244296328e-06, + "loss": 0.0375, + "step": 7834 + }, + { + "epoch": 0.7836, + "grad_norm": 0.49985769391059875, + "learning_rate": 2.7198789091951903e-06, + "loss": 0.1127, + "step": 7836 + }, + { + "epoch": 0.7838, + "grad_norm": 3.048163890838623, + "learning_rate": 2.715094542203327e-06, + "loss": 0.3689, + "step": 7838 + }, + { + "epoch": 0.784, + "grad_norm": 4.1949462890625, + "learning_rate": 2.7103137257858867e-06, + "loss": 0.4137, + "step": 7840 + }, + { + "epoch": 0.7842, + "grad_norm": 0.7722543478012085, + "learning_rate": 2.7055364622729772e-06, + "loss": 0.2098, + "step": 7842 + }, + { + "epoch": 0.7844, + "grad_norm": 4.925539970397949, + "learning_rate": 2.7007627539929847e-06, + "loss": 0.2441, + "step": 7844 + }, + { + "epoch": 0.7846, + "grad_norm": 0.34558627009391785, + "learning_rate": 2.6959926032725537e-06, + "loss": 0.0674, + "step": 7846 + }, + { + "epoch": 0.7848, + "grad_norm": 0.9057009220123291, + "learning_rate": 2.6912260124366007e-06, + "loss": 0.2503, + "step": 7848 + }, + { + "epoch": 0.785, + "grad_norm": 3.2463221549987793, + "learning_rate": 2.6864629838082957e-06, + "loss": 0.3178, + "step": 7850 + }, + { + "epoch": 0.7852, + "grad_norm": 2.9099795818328857, + "learning_rate": 2.6817035197090892e-06, + "loss": 0.1778, + "step": 7852 + }, + { + "epoch": 0.7854, + "grad_norm": 2.679835796356201, + "learning_rate": 2.676947622458683e-06, + "loss": 0.2264, + "step": 7854 + }, + { + "epoch": 0.7856, + "grad_norm": 4.151165008544922, + "learning_rate": 2.672195294375045e-06, + "loss": 0.2017, + "step": 7856 + }, + { + "epoch": 0.7858, + "grad_norm": 3.742879629135132, + "learning_rate": 2.667446537774402e-06, + "loss": 0.4023, + "step": 7858 + }, + { + "epoch": 0.786, + "grad_norm": 5.815006256103516, + "learning_rate": 2.6627013549712355e-06, + "loss": 0.3625, + "step": 7860 + }, + { + "epoch": 0.7862, + "grad_norm": 0.18976163864135742, + "learning_rate": 2.6579597482782972e-06, + "loss": 0.0172, + "step": 7862 + }, + { + "epoch": 0.7864, + "grad_norm": 1.0491454601287842, + "learning_rate": 2.6532217200065856e-06, + "loss": 0.0989, + "step": 7864 + }, + { + "epoch": 0.7866, + "grad_norm": 3.8395323753356934, + "learning_rate": 2.648487272465361e-06, + "loss": 0.1781, + "step": 7866 + }, + { + "epoch": 0.7868, + "grad_norm": 0.9457539916038513, + "learning_rate": 2.643756407962127e-06, + "loss": 0.151, + "step": 7868 + }, + { + "epoch": 0.787, + "grad_norm": 1.8462740182876587, + "learning_rate": 2.639029128802657e-06, + "loss": 0.2444, + "step": 7870 + }, + { + "epoch": 0.7872, + "grad_norm": 0.6774052381515503, + "learning_rate": 2.634305437290968e-06, + "loss": 0.0397, + "step": 7872 + }, + { + "epoch": 0.7874, + "grad_norm": 3.060145378112793, + "learning_rate": 2.62958533572933e-06, + "loss": 0.1035, + "step": 7874 + }, + { + "epoch": 0.7876, + "grad_norm": 5.339869499206543, + "learning_rate": 2.624868826418262e-06, + "loss": 0.2997, + "step": 7876 + }, + { + "epoch": 0.7878, + "grad_norm": 0.31255042552948, + "learning_rate": 2.6201559116565346e-06, + "loss": 0.0391, + "step": 7878 + }, + { + "epoch": 0.788, + "grad_norm": 0.1894853115081787, + "learning_rate": 2.615446593741161e-06, + "loss": 0.2134, + "step": 7880 + }, + { + "epoch": 0.7882, + "grad_norm": 4.494018077850342, + "learning_rate": 2.6107408749674125e-06, + "loss": 0.408, + "step": 7882 + }, + { + "epoch": 0.7884, + "grad_norm": 0.5511333346366882, + "learning_rate": 2.6060387576287983e-06, + "loss": 0.0218, + "step": 7884 + }, + { + "epoch": 0.7886, + "grad_norm": 4.548969268798828, + "learning_rate": 2.6013402440170676e-06, + "loss": 0.2709, + "step": 7886 + }, + { + "epoch": 0.7888, + "grad_norm": 3.9842917919158936, + "learning_rate": 2.596645336422219e-06, + "loss": 0.2017, + "step": 7888 + }, + { + "epoch": 0.789, + "grad_norm": 4.895910263061523, + "learning_rate": 2.5919540371325005e-06, + "loss": 0.1567, + "step": 7890 + }, + { + "epoch": 0.7892, + "grad_norm": 0.821453332901001, + "learning_rate": 2.5872663484343887e-06, + "loss": 0.196, + "step": 7892 + }, + { + "epoch": 0.7894, + "grad_norm": 4.148651123046875, + "learning_rate": 2.5825822726126095e-06, + "loss": 0.3145, + "step": 7894 + }, + { + "epoch": 0.7896, + "grad_norm": 1.74871027469635, + "learning_rate": 2.577901811950121e-06, + "loss": 0.0741, + "step": 7896 + }, + { + "epoch": 0.7898, + "grad_norm": 2.6653354167938232, + "learning_rate": 2.5732249687281228e-06, + "loss": 0.1143, + "step": 7898 + }, + { + "epoch": 0.79, + "grad_norm": 0.20164629817008972, + "learning_rate": 2.5685517452260566e-06, + "loss": 0.1359, + "step": 7900 + }, + { + "epoch": 0.7902, + "grad_norm": 1.2214750051498413, + "learning_rate": 2.5638821437215944e-06, + "loss": 0.1106, + "step": 7902 + }, + { + "epoch": 0.7904, + "grad_norm": 7.396397113800049, + "learning_rate": 2.5592161664906366e-06, + "loss": 0.2595, + "step": 7904 + }, + { + "epoch": 0.7906, + "grad_norm": 0.7835617661476135, + "learning_rate": 2.5545538158073278e-06, + "loss": 0.0929, + "step": 7906 + }, + { + "epoch": 0.7908, + "grad_norm": 2.3000311851501465, + "learning_rate": 2.549895093944039e-06, + "loss": 0.158, + "step": 7908 + }, + { + "epoch": 0.791, + "grad_norm": 13.420740127563477, + "learning_rate": 2.5452400031713786e-06, + "loss": 0.4876, + "step": 7910 + }, + { + "epoch": 0.7912, + "grad_norm": 2.1948201656341553, + "learning_rate": 2.5405885457581793e-06, + "loss": 0.4024, + "step": 7912 + }, + { + "epoch": 0.7914, + "grad_norm": 2.8323709964752197, + "learning_rate": 2.535940723971505e-06, + "loss": 0.2149, + "step": 7914 + }, + { + "epoch": 0.7916, + "grad_norm": 0.1663455367088318, + "learning_rate": 2.5312965400766475e-06, + "loss": 0.0463, + "step": 7916 + }, + { + "epoch": 0.7918, + "grad_norm": 4.295587539672852, + "learning_rate": 2.5266559963371216e-06, + "loss": 0.1475, + "step": 7918 + }, + { + "epoch": 0.792, + "grad_norm": 8.373339653015137, + "learning_rate": 2.522019095014683e-06, + "loss": 0.5993, + "step": 7920 + }, + { + "epoch": 0.7922, + "grad_norm": 2.3885338306427, + "learning_rate": 2.5173858383692906e-06, + "loss": 0.144, + "step": 7922 + }, + { + "epoch": 0.7924, + "grad_norm": 3.684830665588379, + "learning_rate": 2.512756228659141e-06, + "loss": 0.1669, + "step": 7924 + }, + { + "epoch": 0.7926, + "grad_norm": 0.12432601302862167, + "learning_rate": 2.5081302681406463e-06, + "loss": 0.1234, + "step": 7926 + }, + { + "epoch": 0.7928, + "grad_norm": 9.326316833496094, + "learning_rate": 2.5035079590684496e-06, + "loss": 0.4175, + "step": 7928 + }, + { + "epoch": 0.793, + "grad_norm": 6.411750793457031, + "learning_rate": 2.4988893036954045e-06, + "loss": 0.266, + "step": 7930 + }, + { + "epoch": 0.7932, + "grad_norm": 4.573802947998047, + "learning_rate": 2.494274304272589e-06, + "loss": 0.2836, + "step": 7932 + }, + { + "epoch": 0.7934, + "grad_norm": 1.9375247955322266, + "learning_rate": 2.4896629630492974e-06, + "loss": 0.0476, + "step": 7934 + }, + { + "epoch": 0.7936, + "grad_norm": 19.559894561767578, + "learning_rate": 2.48505528227304e-06, + "loss": 0.3947, + "step": 7936 + }, + { + "epoch": 0.7938, + "grad_norm": 1.6352777481079102, + "learning_rate": 2.480451264189546e-06, + "loss": 0.0728, + "step": 7938 + }, + { + "epoch": 0.794, + "grad_norm": 4.7284417152404785, + "learning_rate": 2.4758509110427576e-06, + "loss": 0.2635, + "step": 7940 + }, + { + "epoch": 0.7942, + "grad_norm": 2.101707935333252, + "learning_rate": 2.4712542250748305e-06, + "loss": 0.0733, + "step": 7942 + }, + { + "epoch": 0.7944, + "grad_norm": 4.9326653480529785, + "learning_rate": 2.4666612085261344e-06, + "loss": 0.0805, + "step": 7944 + }, + { + "epoch": 0.7946, + "grad_norm": 4.117537498474121, + "learning_rate": 2.4620718636352457e-06, + "loss": 0.125, + "step": 7946 + }, + { + "epoch": 0.7948, + "grad_norm": 1.3585597276687622, + "learning_rate": 2.4574861926389615e-06, + "loss": 0.1669, + "step": 7948 + }, + { + "epoch": 0.795, + "grad_norm": 3.219125509262085, + "learning_rate": 2.45290419777228e-06, + "loss": 0.0678, + "step": 7950 + }, + { + "epoch": 0.7952, + "grad_norm": 9.064671516418457, + "learning_rate": 2.4483258812684096e-06, + "loss": 1.1121, + "step": 7952 + }, + { + "epoch": 0.7954, + "grad_norm": 0.8070541024208069, + "learning_rate": 2.4437512453587653e-06, + "loss": 0.0757, + "step": 7954 + }, + { + "epoch": 0.7956, + "grad_norm": 8.062519073486328, + "learning_rate": 2.4391802922729703e-06, + "loss": 0.4704, + "step": 7956 + }, + { + "epoch": 0.7958, + "grad_norm": 4.856522560119629, + "learning_rate": 2.43461302423885e-06, + "loss": 0.2941, + "step": 7958 + }, + { + "epoch": 0.796, + "grad_norm": 2.124919891357422, + "learning_rate": 2.4300494434824373e-06, + "loss": 0.1072, + "step": 7960 + }, + { + "epoch": 0.7962, + "grad_norm": 0.1602659523487091, + "learning_rate": 2.4254895522279642e-06, + "loss": 0.0142, + "step": 7962 + }, + { + "epoch": 0.7964, + "grad_norm": 3.1875383853912354, + "learning_rate": 2.420933352697865e-06, + "loss": 0.2516, + "step": 7964 + }, + { + "epoch": 0.7966, + "grad_norm": 7.108877658843994, + "learning_rate": 2.4163808471127815e-06, + "loss": 0.3333, + "step": 7966 + }, + { + "epoch": 0.7968, + "grad_norm": 7.877798557281494, + "learning_rate": 2.411832037691545e-06, + "loss": 0.1343, + "step": 7968 + }, + { + "epoch": 0.797, + "grad_norm": 5.251255989074707, + "learning_rate": 2.407286926651192e-06, + "loss": 0.2369, + "step": 7970 + }, + { + "epoch": 0.7972, + "grad_norm": 0.3538439869880676, + "learning_rate": 2.4027455162069567e-06, + "loss": 0.0394, + "step": 7972 + }, + { + "epoch": 0.7974, + "grad_norm": 6.786355018615723, + "learning_rate": 2.398207808572258e-06, + "loss": 0.4022, + "step": 7974 + }, + { + "epoch": 0.7976, + "grad_norm": 1.8221917152404785, + "learning_rate": 2.3936738059587284e-06, + "loss": 0.1904, + "step": 7976 + }, + { + "epoch": 0.7978, + "grad_norm": 1.782202959060669, + "learning_rate": 2.3891435105761838e-06, + "loss": 0.1351, + "step": 7978 + }, + { + "epoch": 0.798, + "grad_norm": 2.1170389652252197, + "learning_rate": 2.3846169246326345e-06, + "loss": 0.058, + "step": 7980 + }, + { + "epoch": 0.7982, + "grad_norm": 5.142927169799805, + "learning_rate": 2.380094050334283e-06, + "loss": 0.2754, + "step": 7982 + }, + { + "epoch": 0.7984, + "grad_norm": 3.9593749046325684, + "learning_rate": 2.37557488988552e-06, + "loss": 0.1126, + "step": 7984 + }, + { + "epoch": 0.7986, + "grad_norm": 7.0533952713012695, + "learning_rate": 2.371059445488938e-06, + "loss": 0.2498, + "step": 7986 + }, + { + "epoch": 0.7988, + "grad_norm": 2.134976863861084, + "learning_rate": 2.3665477193453037e-06, + "loss": 0.1074, + "step": 7988 + }, + { + "epoch": 0.799, + "grad_norm": 13.151493072509766, + "learning_rate": 2.362039713653581e-06, + "loss": 0.4085, + "step": 7990 + }, + { + "epoch": 0.7992, + "grad_norm": 4.264260768890381, + "learning_rate": 2.35753543061091e-06, + "loss": 0.1675, + "step": 7992 + }, + { + "epoch": 0.7994, + "grad_norm": 5.193064212799072, + "learning_rate": 2.3530348724126304e-06, + "loss": 0.425, + "step": 7994 + }, + { + "epoch": 0.7996, + "grad_norm": 15.95190143585205, + "learning_rate": 2.3485380412522586e-06, + "loss": 0.2093, + "step": 7996 + }, + { + "epoch": 0.7998, + "grad_norm": 1.0041838884353638, + "learning_rate": 2.3440449393214947e-06, + "loss": 0.0422, + "step": 7998 + }, + { + "epoch": 0.8, + "grad_norm": 0.7500092387199402, + "learning_rate": 2.339555568810221e-06, + "loss": 0.0355, + "step": 8000 + }, + { + "epoch": 0.8002, + "grad_norm": 1.1933802366256714, + "learning_rate": 2.335069931906503e-06, + "loss": 0.0886, + "step": 8002 + }, + { + "epoch": 0.8004, + "grad_norm": 2.907106399536133, + "learning_rate": 2.3305880307965834e-06, + "loss": 0.1782, + "step": 8004 + }, + { + "epoch": 0.8006, + "grad_norm": 0.5065482258796692, + "learning_rate": 2.3261098676648908e-06, + "loss": 0.4637, + "step": 8006 + }, + { + "epoch": 0.8008, + "grad_norm": 4.437660217285156, + "learning_rate": 2.321635444694028e-06, + "loss": 0.1764, + "step": 8008 + }, + { + "epoch": 0.801, + "grad_norm": 1.1391597986221313, + "learning_rate": 2.317164764064769e-06, + "loss": 0.06, + "step": 8010 + }, + { + "epoch": 0.8012, + "grad_norm": 6.571604251861572, + "learning_rate": 2.3126978279560687e-06, + "loss": 0.1285, + "step": 8012 + }, + { + "epoch": 0.8014, + "grad_norm": 10.12667465209961, + "learning_rate": 2.308234638545064e-06, + "loss": 0.3026, + "step": 8014 + }, + { + "epoch": 0.8016, + "grad_norm": 0.5088629722595215, + "learning_rate": 2.3037751980070557e-06, + "loss": 0.5753, + "step": 8016 + }, + { + "epoch": 0.8018, + "grad_norm": 8.639359474182129, + "learning_rate": 2.2993195085155205e-06, + "loss": 0.461, + "step": 8018 + }, + { + "epoch": 0.802, + "grad_norm": 2.384979724884033, + "learning_rate": 2.2948675722421086e-06, + "loss": 0.1944, + "step": 8020 + }, + { + "epoch": 0.8022, + "grad_norm": 4.261360168457031, + "learning_rate": 2.2904193913566363e-06, + "loss": 0.1183, + "step": 8022 + }, + { + "epoch": 0.8024, + "grad_norm": 6.391547203063965, + "learning_rate": 2.2859749680270983e-06, + "loss": 0.2723, + "step": 8024 + }, + { + "epoch": 0.8026, + "grad_norm": 0.2326156347990036, + "learning_rate": 2.2815343044196523e-06, + "loss": 0.059, + "step": 8026 + }, + { + "epoch": 0.8028, + "grad_norm": 2.5378270149230957, + "learning_rate": 2.277097402698619e-06, + "loss": 0.1009, + "step": 8028 + }, + { + "epoch": 0.803, + "grad_norm": 2.2833187580108643, + "learning_rate": 2.27266426502649e-06, + "loss": 0.1379, + "step": 8030 + }, + { + "epoch": 0.8032, + "grad_norm": 1.6517970561981201, + "learning_rate": 2.2682348935639274e-06, + "loss": 0.0571, + "step": 8032 + }, + { + "epoch": 0.8034, + "grad_norm": 0.4379899501800537, + "learning_rate": 2.2638092904697516e-06, + "loss": 0.0948, + "step": 8034 + }, + { + "epoch": 0.8036, + "grad_norm": 3.20393705368042, + "learning_rate": 2.259387457900948e-06, + "loss": 0.4337, + "step": 8036 + }, + { + "epoch": 0.8038, + "grad_norm": 5.775723934173584, + "learning_rate": 2.254969398012663e-06, + "loss": 0.2632, + "step": 8038 + }, + { + "epoch": 0.804, + "grad_norm": 11.485030174255371, + "learning_rate": 2.2505551129582047e-06, + "loss": 0.2097, + "step": 8040 + }, + { + "epoch": 0.8042, + "grad_norm": 3.3185768127441406, + "learning_rate": 2.2461446048890424e-06, + "loss": 0.2217, + "step": 8042 + }, + { + "epoch": 0.8044, + "grad_norm": 2.0337345600128174, + "learning_rate": 2.241737875954808e-06, + "loss": 0.1073, + "step": 8044 + }, + { + "epoch": 0.8046, + "grad_norm": 6.489532947540283, + "learning_rate": 2.237334928303283e-06, + "loss": 0.2334, + "step": 8046 + }, + { + "epoch": 0.8048, + "grad_norm": 0.6451210975646973, + "learning_rate": 2.2329357640804118e-06, + "loss": 0.1251, + "step": 8048 + }, + { + "epoch": 0.805, + "grad_norm": 2.1306240558624268, + "learning_rate": 2.2285403854302912e-06, + "loss": 0.1478, + "step": 8050 + }, + { + "epoch": 0.8052, + "grad_norm": 2.2630186080932617, + "learning_rate": 2.22414879449518e-06, + "loss": 0.1547, + "step": 8052 + }, + { + "epoch": 0.8054, + "grad_norm": 5.361601829528809, + "learning_rate": 2.219760993415485e-06, + "loss": 0.2371, + "step": 8054 + }, + { + "epoch": 0.8056, + "grad_norm": 3.898195266723633, + "learning_rate": 2.215376984329767e-06, + "loss": 0.0899, + "step": 8056 + }, + { + "epoch": 0.8058, + "grad_norm": 5.97847843170166, + "learning_rate": 2.210996769374737e-06, + "loss": 0.215, + "step": 8058 + }, + { + "epoch": 0.806, + "grad_norm": 4.480817794799805, + "learning_rate": 2.206620350685257e-06, + "loss": 0.2317, + "step": 8060 + }, + { + "epoch": 0.8062, + "grad_norm": 2.113557815551758, + "learning_rate": 2.202247730394349e-06, + "loss": 0.0643, + "step": 8062 + }, + { + "epoch": 0.8064, + "grad_norm": 0.6442761421203613, + "learning_rate": 2.1978789106331666e-06, + "loss": 0.317, + "step": 8064 + }, + { + "epoch": 0.8066, + "grad_norm": 0.929008424282074, + "learning_rate": 2.1935138935310208e-06, + "loss": 0.0557, + "step": 8066 + }, + { + "epoch": 0.8068, + "grad_norm": 6.465555191040039, + "learning_rate": 2.1891526812153674e-06, + "loss": 0.8697, + "step": 8068 + }, + { + "epoch": 0.807, + "grad_norm": 1.485868215560913, + "learning_rate": 2.1847952758118118e-06, + "loss": 0.1232, + "step": 8070 + }, + { + "epoch": 0.8072, + "grad_norm": 0.1912083625793457, + "learning_rate": 2.1804416794441e-06, + "loss": 0.1263, + "step": 8072 + }, + { + "epoch": 0.8074, + "grad_norm": 17.426420211791992, + "learning_rate": 2.1760918942341193e-06, + "loss": 0.2441, + "step": 8074 + }, + { + "epoch": 0.8076, + "grad_norm": 5.261832237243652, + "learning_rate": 2.171745922301903e-06, + "loss": 0.2615, + "step": 8076 + }, + { + "epoch": 0.8078, + "grad_norm": 9.06939697265625, + "learning_rate": 2.1674037657656265e-06, + "loss": 0.2521, + "step": 8078 + }, + { + "epoch": 0.808, + "grad_norm": 0.17624813318252563, + "learning_rate": 2.163065426741603e-06, + "loss": 0.0133, + "step": 8080 + }, + { + "epoch": 0.8082, + "grad_norm": 2.1648619174957275, + "learning_rate": 2.1587309073442865e-06, + "loss": 0.1188, + "step": 8082 + }, + { + "epoch": 0.8084, + "grad_norm": 1.3124631643295288, + "learning_rate": 2.154400209686268e-06, + "loss": 0.0704, + "step": 8084 + }, + { + "epoch": 0.8086, + "grad_norm": 5.429103374481201, + "learning_rate": 2.1500733358782786e-06, + "loss": 0.1953, + "step": 8086 + }, + { + "epoch": 0.8088, + "grad_norm": 2.696139097213745, + "learning_rate": 2.1457502880291815e-06, + "loss": 0.266, + "step": 8088 + }, + { + "epoch": 0.809, + "grad_norm": 0.5570796132087708, + "learning_rate": 2.1414310682459805e-06, + "loss": 0.0997, + "step": 8090 + }, + { + "epoch": 0.8092, + "grad_norm": 5.007327556610107, + "learning_rate": 2.1371156786338108e-06, + "loss": 0.3881, + "step": 8092 + }, + { + "epoch": 0.8094, + "grad_norm": 1.707683801651001, + "learning_rate": 2.1328041212959403e-06, + "loss": 0.0374, + "step": 8094 + }, + { + "epoch": 0.8096, + "grad_norm": 1.902909517288208, + "learning_rate": 2.128496398333768e-06, + "loss": 0.0581, + "step": 8096 + }, + { + "epoch": 0.8098, + "grad_norm": 8.681625366210938, + "learning_rate": 2.1241925118468288e-06, + "loss": 0.1594, + "step": 8098 + }, + { + "epoch": 0.81, + "grad_norm": 2.1168951988220215, + "learning_rate": 2.119892463932781e-06, + "loss": 0.0866, + "step": 8100 + }, + { + "epoch": 0.8102, + "grad_norm": 6.4936089515686035, + "learning_rate": 2.115596256687419e-06, + "loss": 0.2356, + "step": 8102 + }, + { + "epoch": 0.8104, + "grad_norm": 5.968657493591309, + "learning_rate": 2.1113038922046603e-06, + "loss": 0.4135, + "step": 8104 + }, + { + "epoch": 0.8106, + "grad_norm": 13.196752548217773, + "learning_rate": 2.107015372576552e-06, + "loss": 0.1992, + "step": 8106 + }, + { + "epoch": 0.8108, + "grad_norm": 12.746481895446777, + "learning_rate": 2.102730699893263e-06, + "loss": 0.2177, + "step": 8108 + }, + { + "epoch": 0.811, + "grad_norm": 5.609739303588867, + "learning_rate": 2.098449876243096e-06, + "loss": 0.2217, + "step": 8110 + }, + { + "epoch": 0.8112, + "grad_norm": 17.9269962310791, + "learning_rate": 2.09417290371247e-06, + "loss": 0.3688, + "step": 8112 + }, + { + "epoch": 0.8114, + "grad_norm": 0.23069728910923004, + "learning_rate": 2.0898997843859338e-06, + "loss": 0.0567, + "step": 8114 + }, + { + "epoch": 0.8116, + "grad_norm": 2.662022352218628, + "learning_rate": 2.0856305203461436e-06, + "loss": 0.0904, + "step": 8116 + }, + { + "epoch": 0.8118, + "grad_norm": 0.5518839955329895, + "learning_rate": 2.0813651136738957e-06, + "loss": 0.1891, + "step": 8118 + }, + { + "epoch": 0.812, + "grad_norm": 5.4912004470825195, + "learning_rate": 2.0771035664480944e-06, + "loss": 0.1643, + "step": 8120 + }, + { + "epoch": 0.8122, + "grad_norm": 2.444000720977783, + "learning_rate": 2.072845880745766e-06, + "loss": 0.09, + "step": 8122 + }, + { + "epoch": 0.8124, + "grad_norm": 2.6053221225738525, + "learning_rate": 2.0685920586420562e-06, + "loss": 0.0643, + "step": 8124 + }, + { + "epoch": 0.8126, + "grad_norm": 5.628448486328125, + "learning_rate": 2.0643421022102216e-06, + "loss": 0.0985, + "step": 8126 + }, + { + "epoch": 0.8128, + "grad_norm": 13.905454635620117, + "learning_rate": 2.0600960135216463e-06, + "loss": 0.9268, + "step": 8128 + }, + { + "epoch": 0.813, + "grad_norm": 4.43077278137207, + "learning_rate": 2.0558537946458177e-06, + "loss": 0.0638, + "step": 8130 + }, + { + "epoch": 0.8132, + "grad_norm": 0.7337204813957214, + "learning_rate": 2.051615447650347e-06, + "loss": 0.0134, + "step": 8132 + }, + { + "epoch": 0.8134, + "grad_norm": 11.077897071838379, + "learning_rate": 2.0473809746009444e-06, + "loss": 0.2375, + "step": 8134 + }, + { + "epoch": 0.8136, + "grad_norm": 1.7207975387573242, + "learning_rate": 2.0431503775614457e-06, + "loss": 0.1157, + "step": 8136 + }, + { + "epoch": 0.8138, + "grad_norm": 0.9682816863059998, + "learning_rate": 2.0389236585937944e-06, + "loss": 0.0703, + "step": 8138 + }, + { + "epoch": 0.814, + "grad_norm": 20.45258140563965, + "learning_rate": 2.0347008197580376e-06, + "loss": 0.5498, + "step": 8140 + }, + { + "epoch": 0.8142, + "grad_norm": 0.15246739983558655, + "learning_rate": 2.0304818631123393e-06, + "loss": 0.0497, + "step": 8142 + }, + { + "epoch": 0.8144, + "grad_norm": 0.7339113354682922, + "learning_rate": 2.026266790712965e-06, + "loss": 0.7229, + "step": 8144 + }, + { + "epoch": 0.8146, + "grad_norm": 4.4856953620910645, + "learning_rate": 2.022055604614289e-06, + "loss": 0.3711, + "step": 8146 + }, + { + "epoch": 0.8148, + "grad_norm": 4.958371639251709, + "learning_rate": 2.017848306868797e-06, + "loss": 0.095, + "step": 8148 + }, + { + "epoch": 0.815, + "grad_norm": 4.343465328216553, + "learning_rate": 2.013644899527074e-06, + "loss": 0.1051, + "step": 8150 + }, + { + "epoch": 0.8152, + "grad_norm": 4.310601234436035, + "learning_rate": 2.009445384637805e-06, + "loss": 0.4592, + "step": 8152 + }, + { + "epoch": 0.8154, + "grad_norm": 0.3304789066314697, + "learning_rate": 2.005249764247783e-06, + "loss": 0.1708, + "step": 8154 + }, + { + "epoch": 0.8156, + "grad_norm": 12.035998344421387, + "learning_rate": 2.0010580404019066e-06, + "loss": 1.0256, + "step": 8156 + }, + { + "epoch": 0.8158, + "grad_norm": 0.8940761685371399, + "learning_rate": 1.9968702151431697e-06, + "loss": 0.2576, + "step": 8158 + }, + { + "epoch": 0.816, + "grad_norm": 6.480496406555176, + "learning_rate": 1.9926862905126663e-06, + "loss": 0.2413, + "step": 8160 + }, + { + "epoch": 0.8162, + "grad_norm": 4.20681619644165, + "learning_rate": 1.9885062685495905e-06, + "loss": 0.1505, + "step": 8162 + }, + { + "epoch": 0.8164, + "grad_norm": 7.966033458709717, + "learning_rate": 1.984330151291233e-06, + "loss": 0.6626, + "step": 8164 + }, + { + "epoch": 0.8166, + "grad_norm": 2.311863422393799, + "learning_rate": 1.9801579407729866e-06, + "loss": 0.2352, + "step": 8166 + }, + { + "epoch": 0.8168, + "grad_norm": 0.1070103719830513, + "learning_rate": 1.9759896390283362e-06, + "loss": 0.2285, + "step": 8168 + }, + { + "epoch": 0.817, + "grad_norm": 0.841128945350647, + "learning_rate": 1.9718252480888567e-06, + "loss": 0.0346, + "step": 8170 + }, + { + "epoch": 0.8172, + "grad_norm": 6.678462505340576, + "learning_rate": 1.9676647699842246e-06, + "loss": 0.2207, + "step": 8172 + }, + { + "epoch": 0.8174, + "grad_norm": 0.3968384861946106, + "learning_rate": 1.963508206742202e-06, + "loss": 0.0861, + "step": 8174 + }, + { + "epoch": 0.8176, + "grad_norm": 2.3897364139556885, + "learning_rate": 1.959355560388654e-06, + "loss": 0.0547, + "step": 8176 + }, + { + "epoch": 0.8178, + "grad_norm": 5.454402446746826, + "learning_rate": 1.955206832947526e-06, + "loss": 0.4253, + "step": 8178 + }, + { + "epoch": 0.818, + "grad_norm": 0.2833423912525177, + "learning_rate": 1.95106202644086e-06, + "loss": 0.0564, + "step": 8180 + }, + { + "epoch": 0.8182, + "grad_norm": 11.492815971374512, + "learning_rate": 1.9469211428887813e-06, + "loss": 0.3832, + "step": 8182 + }, + { + "epoch": 0.8184, + "grad_norm": 6.505232810974121, + "learning_rate": 1.9427841843095063e-06, + "loss": 0.1378, + "step": 8184 + }, + { + "epoch": 0.8186, + "grad_norm": 0.16747082769870758, + "learning_rate": 1.938651152719344e-06, + "loss": 0.0386, + "step": 8186 + }, + { + "epoch": 0.8188, + "grad_norm": 4.235581398010254, + "learning_rate": 1.934522050132678e-06, + "loss": 0.086, + "step": 8188 + }, + { + "epoch": 0.819, + "grad_norm": 0.6762155294418335, + "learning_rate": 1.930396878561983e-06, + "loss": 0.1468, + "step": 8190 + }, + { + "epoch": 0.8192, + "grad_norm": 8.331721305847168, + "learning_rate": 1.9262756400178163e-06, + "loss": 0.5536, + "step": 8192 + }, + { + "epoch": 0.8194, + "grad_norm": 0.3674505054950714, + "learning_rate": 1.9221583365088246e-06, + "loss": 0.0641, + "step": 8194 + }, + { + "epoch": 0.8196, + "grad_norm": 0.9401031732559204, + "learning_rate": 1.918044970041729e-06, + "loss": 0.0332, + "step": 8196 + }, + { + "epoch": 0.8198, + "grad_norm": 0.7040270566940308, + "learning_rate": 1.9139355426213346e-06, + "loss": 0.2095, + "step": 8198 + }, + { + "epoch": 0.82, + "grad_norm": 1.7334318161010742, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.0689, + "step": 8200 + }, + { + "epoch": 0.8202, + "grad_norm": 10.744850158691406, + "learning_rate": 1.9057285129302682e-06, + "loss": 0.4225, + "step": 8202 + }, + { + "epoch": 0.8204, + "grad_norm": 6.588041305541992, + "learning_rate": 1.9016309146596024e-06, + "loss": 0.1476, + "step": 8204 + }, + { + "epoch": 0.8206, + "grad_norm": 4.8214006423950195, + "learning_rate": 1.8975372634356481e-06, + "loss": 0.2854, + "step": 8206 + }, + { + "epoch": 0.8208, + "grad_norm": 7.505956172943115, + "learning_rate": 1.8934475612536019e-06, + "loss": 0.2608, + "step": 8208 + }, + { + "epoch": 0.821, + "grad_norm": 0.3560364246368408, + "learning_rate": 1.8893618101067357e-06, + "loss": 0.3383, + "step": 8210 + }, + { + "epoch": 0.8212, + "grad_norm": 1.4640743732452393, + "learning_rate": 1.8852800119863912e-06, + "loss": 0.5571, + "step": 8212 + }, + { + "epoch": 0.8214, + "grad_norm": 5.696381568908691, + "learning_rate": 1.8812021688819914e-06, + "loss": 0.2372, + "step": 8214 + }, + { + "epoch": 0.8216, + "grad_norm": 9.422380447387695, + "learning_rate": 1.8771282827810278e-06, + "loss": 0.3421, + "step": 8216 + }, + { + "epoch": 0.8218, + "grad_norm": 4.830421447753906, + "learning_rate": 1.8730583556690607e-06, + "loss": 0.1528, + "step": 8218 + }, + { + "epoch": 0.822, + "grad_norm": 4.197458744049072, + "learning_rate": 1.8689923895297247e-06, + "loss": 0.1224, + "step": 8220 + }, + { + "epoch": 0.8222, + "grad_norm": 0.2913706600666046, + "learning_rate": 1.86493038634472e-06, + "loss": 0.0204, + "step": 8222 + }, + { + "epoch": 0.8224, + "grad_norm": 0.30853477120399475, + "learning_rate": 1.8608723480938207e-06, + "loss": 0.4112, + "step": 8224 + }, + { + "epoch": 0.8226, + "grad_norm": 0.29658961296081543, + "learning_rate": 1.8568182767548626e-06, + "loss": 0.1297, + "step": 8226 + }, + { + "epoch": 0.8228, + "grad_norm": 4.049558162689209, + "learning_rate": 1.8527681743037518e-06, + "loss": 0.1895, + "step": 8228 + }, + { + "epoch": 0.823, + "grad_norm": 1.6998740434646606, + "learning_rate": 1.848722042714457e-06, + "loss": 0.058, + "step": 8230 + }, + { + "epoch": 0.8232, + "grad_norm": 1.2197293043136597, + "learning_rate": 1.8446798839590186e-06, + "loss": 0.129, + "step": 8232 + }, + { + "epoch": 0.8234, + "grad_norm": 2.849874973297119, + "learning_rate": 1.8406417000075327e-06, + "loss": 0.0772, + "step": 8234 + }, + { + "epoch": 0.8236, + "grad_norm": 0.18925133347511292, + "learning_rate": 1.8366074928281608e-06, + "loss": 0.5394, + "step": 8236 + }, + { + "epoch": 0.8238, + "grad_norm": 1.239287257194519, + "learning_rate": 1.8325772643871264e-06, + "loss": 0.0308, + "step": 8238 + }, + { + "epoch": 0.824, + "grad_norm": 0.07912616431713104, + "learning_rate": 1.8285510166487154e-06, + "loss": 0.0095, + "step": 8240 + }, + { + "epoch": 0.8242, + "grad_norm": 0.38481104373931885, + "learning_rate": 1.8245287515752708e-06, + "loss": 0.0689, + "step": 8242 + }, + { + "epoch": 0.8244, + "grad_norm": 5.340576171875, + "learning_rate": 1.820510471127196e-06, + "loss": 0.0928, + "step": 8244 + }, + { + "epoch": 0.8246, + "grad_norm": 3.5329058170318604, + "learning_rate": 1.816496177262952e-06, + "loss": 0.1505, + "step": 8246 + }, + { + "epoch": 0.8248, + "grad_norm": 7.170869827270508, + "learning_rate": 1.812485871939056e-06, + "loss": 0.4645, + "step": 8248 + }, + { + "epoch": 0.825, + "grad_norm": 0.5429093837738037, + "learning_rate": 1.808479557110081e-06, + "loss": 0.0204, + "step": 8250 + }, + { + "epoch": 0.8252, + "grad_norm": 12.564099311828613, + "learning_rate": 1.804477234728661e-06, + "loss": 0.4839, + "step": 8252 + }, + { + "epoch": 0.8254, + "grad_norm": 0.41515710949897766, + "learning_rate": 1.8004789067454763e-06, + "loss": 0.0193, + "step": 8254 + }, + { + "epoch": 0.8256, + "grad_norm": 1.364174723625183, + "learning_rate": 1.7964845751092663e-06, + "loss": 0.0349, + "step": 8256 + }, + { + "epoch": 0.8258, + "grad_norm": 10.970721244812012, + "learning_rate": 1.7924942417668113e-06, + "loss": 0.8617, + "step": 8258 + }, + { + "epoch": 0.826, + "grad_norm": 3.7013909816741943, + "learning_rate": 1.7885079086629598e-06, + "loss": 0.2062, + "step": 8260 + }, + { + "epoch": 0.8262, + "grad_norm": 0.14584970474243164, + "learning_rate": 1.7845255777406e-06, + "loss": 0.4566, + "step": 8262 + }, + { + "epoch": 0.8264, + "grad_norm": 2.0850746631622314, + "learning_rate": 1.7805472509406695e-06, + "loss": 0.0316, + "step": 8264 + }, + { + "epoch": 0.8266, + "grad_norm": 2.8157272338867188, + "learning_rate": 1.7765729302021596e-06, + "loss": 0.3312, + "step": 8266 + }, + { + "epoch": 0.8268, + "grad_norm": 3.817713737487793, + "learning_rate": 1.7726026174621004e-06, + "loss": 0.124, + "step": 8268 + }, + { + "epoch": 0.827, + "grad_norm": 0.31467053294181824, + "learning_rate": 1.7686363146555807e-06, + "loss": 0.3866, + "step": 8270 + }, + { + "epoch": 0.8272, + "grad_norm": 5.159049987792969, + "learning_rate": 1.7646740237157256e-06, + "loss": 0.2723, + "step": 8272 + }, + { + "epoch": 0.8274, + "grad_norm": 0.17223969101905823, + "learning_rate": 1.760715746573709e-06, + "loss": 0.6213, + "step": 8274 + }, + { + "epoch": 0.8276, + "grad_norm": 0.869417130947113, + "learning_rate": 1.7567614851587444e-06, + "loss": 0.1959, + "step": 8276 + }, + { + "epoch": 0.8278, + "grad_norm": 0.09731350094079971, + "learning_rate": 1.7528112413980892e-06, + "loss": 0.039, + "step": 8278 + }, + { + "epoch": 0.828, + "grad_norm": 0.10571722686290741, + "learning_rate": 1.7488650172170496e-06, + "loss": 0.026, + "step": 8280 + }, + { + "epoch": 0.8282, + "grad_norm": 0.14359119534492493, + "learning_rate": 1.744922814538964e-06, + "loss": 0.4055, + "step": 8282 + }, + { + "epoch": 0.8284, + "grad_norm": 4.226179599761963, + "learning_rate": 1.7409846352852144e-06, + "loss": 0.2005, + "step": 8284 + }, + { + "epoch": 0.8286, + "grad_norm": 2.4682788848876953, + "learning_rate": 1.7370504813752232e-06, + "loss": 0.2957, + "step": 8286 + }, + { + "epoch": 0.8288, + "grad_norm": 6.003852367401123, + "learning_rate": 1.7331203547264452e-06, + "loss": 0.3771, + "step": 8288 + }, + { + "epoch": 0.829, + "grad_norm": 3.303440570831299, + "learning_rate": 1.7291942572543806e-06, + "loss": 0.16, + "step": 8290 + }, + { + "epoch": 0.8292, + "grad_norm": 11.249002456665039, + "learning_rate": 1.7252721908725633e-06, + "loss": 0.1816, + "step": 8292 + }, + { + "epoch": 0.8294, + "grad_norm": 7.379795551300049, + "learning_rate": 1.7213541574925551e-06, + "loss": 0.5586, + "step": 8294 + }, + { + "epoch": 0.8296, + "grad_norm": 4.637264728546143, + "learning_rate": 1.7174401590239587e-06, + "loss": 0.1753, + "step": 8296 + }, + { + "epoch": 0.8298, + "grad_norm": 0.27080294489860535, + "learning_rate": 1.7135301973744122e-06, + "loss": 0.0743, + "step": 8298 + }, + { + "epoch": 0.83, + "grad_norm": 4.673574924468994, + "learning_rate": 1.709624274449584e-06, + "loss": 0.5041, + "step": 8300 + }, + { + "epoch": 0.8302, + "grad_norm": 1.5337480306625366, + "learning_rate": 1.7057223921531706e-06, + "loss": 0.065, + "step": 8302 + }, + { + "epoch": 0.8304, + "grad_norm": 0.2709592580795288, + "learning_rate": 1.7018245523869038e-06, + "loss": 0.0272, + "step": 8304 + }, + { + "epoch": 0.8306, + "grad_norm": 0.19115617871284485, + "learning_rate": 1.6979307570505422e-06, + "loss": 0.0359, + "step": 8306 + }, + { + "epoch": 0.8308, + "grad_norm": 18.649524688720703, + "learning_rate": 1.6940410080418723e-06, + "loss": 0.502, + "step": 8308 + }, + { + "epoch": 0.831, + "grad_norm": 4.902451515197754, + "learning_rate": 1.6901553072567189e-06, + "loss": 0.1491, + "step": 8310 + }, + { + "epoch": 0.8312, + "grad_norm": 2.2369213104248047, + "learning_rate": 1.686273656588917e-06, + "loss": 0.5228, + "step": 8312 + }, + { + "epoch": 0.8314, + "grad_norm": 0.0848495364189148, + "learning_rate": 1.6823960579303378e-06, + "loss": 0.11, + "step": 8314 + }, + { + "epoch": 0.8316, + "grad_norm": 7.232333183288574, + "learning_rate": 1.6785225131708749e-06, + "loss": 0.1549, + "step": 8316 + }, + { + "epoch": 0.8318, + "grad_norm": 2.585928440093994, + "learning_rate": 1.6746530241984504e-06, + "loss": 0.2036, + "step": 8318 + }, + { + "epoch": 0.832, + "grad_norm": 0.2001282125711441, + "learning_rate": 1.6707875928990059e-06, + "loss": 0.0291, + "step": 8320 + }, + { + "epoch": 0.8322, + "grad_norm": 0.3768687844276428, + "learning_rate": 1.666926221156503e-06, + "loss": 0.1171, + "step": 8322 + }, + { + "epoch": 0.8324, + "grad_norm": 0.1788954883813858, + "learning_rate": 1.6630689108529286e-06, + "loss": 0.0852, + "step": 8324 + }, + { + "epoch": 0.8326, + "grad_norm": 0.16985097527503967, + "learning_rate": 1.6592156638682887e-06, + "loss": 0.3345, + "step": 8326 + }, + { + "epoch": 0.8328, + "grad_norm": 0.16741661727428436, + "learning_rate": 1.6553664820806102e-06, + "loss": 0.0088, + "step": 8328 + }, + { + "epoch": 0.833, + "grad_norm": 3.5786192417144775, + "learning_rate": 1.651521367365936e-06, + "loss": 0.1254, + "step": 8330 + }, + { + "epoch": 0.8332, + "grad_norm": 0.21325922012329102, + "learning_rate": 1.6476803215983295e-06, + "loss": 0.0651, + "step": 8332 + }, + { + "epoch": 0.8334, + "grad_norm": 2.695995569229126, + "learning_rate": 1.643843346649866e-06, + "loss": 0.1237, + "step": 8334 + }, + { + "epoch": 0.8336, + "grad_norm": 1.4966139793395996, + "learning_rate": 1.6400104443906463e-06, + "loss": 0.2533, + "step": 8336 + }, + { + "epoch": 0.8338, + "grad_norm": 26.25848960876465, + "learning_rate": 1.6361816166887768e-06, + "loss": 0.2478, + "step": 8338 + }, + { + "epoch": 0.834, + "grad_norm": 0.9069409370422363, + "learning_rate": 1.6323568654103838e-06, + "loss": 0.1012, + "step": 8340 + }, + { + "epoch": 0.8342, + "grad_norm": 4.875728607177734, + "learning_rate": 1.6285361924196031e-06, + "loss": 0.2097, + "step": 8342 + }, + { + "epoch": 0.8344, + "grad_norm": 0.28729021549224854, + "learning_rate": 1.6247195995785836e-06, + "loss": 0.0386, + "step": 8344 + }, + { + "epoch": 0.8346, + "grad_norm": 5.494871616363525, + "learning_rate": 1.6209070887474876e-06, + "loss": 0.0883, + "step": 8346 + }, + { + "epoch": 0.8348, + "grad_norm": 1.2850302457809448, + "learning_rate": 1.6170986617844864e-06, + "loss": 0.1835, + "step": 8348 + }, + { + "epoch": 0.835, + "grad_norm": 4.9740824699401855, + "learning_rate": 1.6132943205457607e-06, + "loss": 0.0997, + "step": 8350 + }, + { + "epoch": 0.8352, + "grad_norm": 0.9731609225273132, + "learning_rate": 1.6094940668855008e-06, + "loss": 0.0729, + "step": 8352 + }, + { + "epoch": 0.8354, + "grad_norm": 14.27662467956543, + "learning_rate": 1.6056979026559005e-06, + "loss": 0.3876, + "step": 8354 + }, + { + "epoch": 0.8356, + "grad_norm": 0.26496079564094543, + "learning_rate": 1.601905829707171e-06, + "loss": 0.4609, + "step": 8356 + }, + { + "epoch": 0.8358, + "grad_norm": 2.022000789642334, + "learning_rate": 1.5981178498875182e-06, + "loss": 0.2229, + "step": 8358 + }, + { + "epoch": 0.836, + "grad_norm": 3.4274892807006836, + "learning_rate": 1.5943339650431578e-06, + "loss": 0.0811, + "step": 8360 + }, + { + "epoch": 0.8362, + "grad_norm": 2.035132884979248, + "learning_rate": 1.5905541770183096e-06, + "loss": 0.0536, + "step": 8362 + }, + { + "epoch": 0.8364, + "grad_norm": 0.0898500382900238, + "learning_rate": 1.5867784876551973e-06, + "loss": 0.3546, + "step": 8364 + }, + { + "epoch": 0.8366, + "grad_norm": 3.79764461517334, + "learning_rate": 1.583006898794044e-06, + "loss": 0.1274, + "step": 8366 + }, + { + "epoch": 0.8368, + "grad_norm": 0.35276874899864197, + "learning_rate": 1.579239412273078e-06, + "loss": 0.0559, + "step": 8368 + }, + { + "epoch": 0.837, + "grad_norm": 3.3567020893096924, + "learning_rate": 1.5754760299285255e-06, + "loss": 0.2602, + "step": 8370 + }, + { + "epoch": 0.8372, + "grad_norm": 0.7121863961219788, + "learning_rate": 1.5717167535946142e-06, + "loss": 0.0245, + "step": 8372 + }, + { + "epoch": 0.8374, + "grad_norm": 1.3630940914154053, + "learning_rate": 1.5679615851035669e-06, + "loss": 0.0883, + "step": 8374 + }, + { + "epoch": 0.8376, + "grad_norm": 0.1518147587776184, + "learning_rate": 1.5642105262856122e-06, + "loss": 0.0161, + "step": 8376 + }, + { + "epoch": 0.8378, + "grad_norm": 3.1829521656036377, + "learning_rate": 1.560463578968967e-06, + "loss": 0.1411, + "step": 8378 + }, + { + "epoch": 0.838, + "grad_norm": 0.14414629340171814, + "learning_rate": 1.5567207449798517e-06, + "loss": 0.1782, + "step": 8380 + }, + { + "epoch": 0.8382, + "grad_norm": 7.52355432510376, + "learning_rate": 1.55298202614247e-06, + "loss": 0.2177, + "step": 8382 + }, + { + "epoch": 0.8384, + "grad_norm": 0.13803809881210327, + "learning_rate": 1.5492474242790368e-06, + "loss": 0.0203, + "step": 8384 + }, + { + "epoch": 0.8386, + "grad_norm": 3.773780107498169, + "learning_rate": 1.545516941209747e-06, + "loss": 0.5587, + "step": 8386 + }, + { + "epoch": 0.8388, + "grad_norm": 1.6508004665374756, + "learning_rate": 1.5417905787527943e-06, + "loss": 0.0969, + "step": 8388 + }, + { + "epoch": 0.839, + "grad_norm": 0.6833041310310364, + "learning_rate": 1.538068338724361e-06, + "loss": 0.0174, + "step": 8390 + }, + { + "epoch": 0.8392, + "grad_norm": 0.3119729459285736, + "learning_rate": 1.5343502229386209e-06, + "loss": 0.1972, + "step": 8392 + }, + { + "epoch": 0.8394, + "grad_norm": 1.1000158786773682, + "learning_rate": 1.530636233207743e-06, + "loss": 0.0951, + "step": 8394 + }, + { + "epoch": 0.8396, + "grad_norm": 5.032218933105469, + "learning_rate": 1.526926371341878e-06, + "loss": 0.116, + "step": 8396 + }, + { + "epoch": 0.8398, + "grad_norm": 9.26672077178955, + "learning_rate": 1.52322063914917e-06, + "loss": 0.436, + "step": 8398 + }, + { + "epoch": 0.84, + "grad_norm": 2.531115770339966, + "learning_rate": 1.5195190384357405e-06, + "loss": 0.0862, + "step": 8400 + }, + { + "epoch": 0.8402, + "grad_norm": 4.474094390869141, + "learning_rate": 1.5158215710057123e-06, + "loss": 0.1846, + "step": 8402 + }, + { + "epoch": 0.8404, + "grad_norm": 0.07837464660406113, + "learning_rate": 1.5121282386611823e-06, + "loss": 0.4535, + "step": 8404 + }, + { + "epoch": 0.8406, + "grad_norm": 0.5868538618087769, + "learning_rate": 1.5084390432022377e-06, + "loss": 0.0602, + "step": 8406 + }, + { + "epoch": 0.8408, + "grad_norm": 3.8681325912475586, + "learning_rate": 1.5047539864269477e-06, + "loss": 0.2573, + "step": 8408 + }, + { + "epoch": 0.841, + "grad_norm": 5.475268840789795, + "learning_rate": 1.5010730701313626e-06, + "loss": 0.318, + "step": 8410 + }, + { + "epoch": 0.8412, + "grad_norm": 0.7951678037643433, + "learning_rate": 1.4973962961095135e-06, + "loss": 0.4143, + "step": 8412 + }, + { + "epoch": 0.8414, + "grad_norm": 0.3612942099571228, + "learning_rate": 1.4937236661534227e-06, + "loss": 0.0151, + "step": 8414 + }, + { + "epoch": 0.8416, + "grad_norm": 1.371131181716919, + "learning_rate": 1.490055182053083e-06, + "loss": 0.0514, + "step": 8416 + }, + { + "epoch": 0.8418, + "grad_norm": 1.293357491493225, + "learning_rate": 1.486390845596466e-06, + "loss": 0.1479, + "step": 8418 + }, + { + "epoch": 0.842, + "grad_norm": 3.9676291942596436, + "learning_rate": 1.4827306585695234e-06, + "loss": 0.4359, + "step": 8420 + }, + { + "epoch": 0.8422, + "grad_norm": 5.730264663696289, + "learning_rate": 1.4790746227561925e-06, + "loss": 0.3804, + "step": 8422 + }, + { + "epoch": 0.8424, + "grad_norm": 8.599799156188965, + "learning_rate": 1.4754227399383758e-06, + "loss": 0.1955, + "step": 8424 + }, + { + "epoch": 0.8426, + "grad_norm": 3.0166702270507812, + "learning_rate": 1.4717750118959583e-06, + "loss": 0.2811, + "step": 8426 + }, + { + "epoch": 0.8428, + "grad_norm": 1.6973257064819336, + "learning_rate": 1.468131440406798e-06, + "loss": 0.0342, + "step": 8428 + }, + { + "epoch": 0.843, + "grad_norm": 9.70492935180664, + "learning_rate": 1.4644920272467245e-06, + "loss": 0.6054, + "step": 8430 + }, + { + "epoch": 0.8432, + "grad_norm": 0.14582639932632446, + "learning_rate": 1.4608567741895496e-06, + "loss": 0.0126, + "step": 8432 + }, + { + "epoch": 0.8434, + "grad_norm": 12.29815673828125, + "learning_rate": 1.4572256830070497e-06, + "loss": 0.5944, + "step": 8434 + }, + { + "epoch": 0.8436, + "grad_norm": 0.3100648522377014, + "learning_rate": 1.4535987554689712e-06, + "loss": 0.0137, + "step": 8436 + }, + { + "epoch": 0.8438, + "grad_norm": 3.742044448852539, + "learning_rate": 1.4499759933430347e-06, + "loss": 0.4081, + "step": 8438 + }, + { + "epoch": 0.844, + "grad_norm": 0.8604146838188171, + "learning_rate": 1.446357398394934e-06, + "loss": 0.0284, + "step": 8440 + }, + { + "epoch": 0.8442, + "grad_norm": 7.660444736480713, + "learning_rate": 1.4427429723883256e-06, + "loss": 0.5169, + "step": 8442 + }, + { + "epoch": 0.8444, + "grad_norm": 4.253939151763916, + "learning_rate": 1.439132717084839e-06, + "loss": 0.3255, + "step": 8444 + }, + { + "epoch": 0.8446, + "grad_norm": 1.5887898206710815, + "learning_rate": 1.4355266342440678e-06, + "loss": 0.1989, + "step": 8446 + }, + { + "epoch": 0.8448, + "grad_norm": 1.2329578399658203, + "learning_rate": 1.4319247256235713e-06, + "loss": 0.0498, + "step": 8448 + }, + { + "epoch": 0.845, + "grad_norm": 8.110052108764648, + "learning_rate": 1.4283269929788779e-06, + "loss": 0.8563, + "step": 8450 + }, + { + "epoch": 0.8452, + "grad_norm": 0.1626647412776947, + "learning_rate": 1.4247334380634792e-06, + "loss": 0.0104, + "step": 8452 + }, + { + "epoch": 0.8454, + "grad_norm": 0.47181060910224915, + "learning_rate": 1.4211440626288286e-06, + "loss": 0.1994, + "step": 8454 + }, + { + "epoch": 0.8456, + "grad_norm": 0.9436598420143127, + "learning_rate": 1.4175588684243447e-06, + "loss": 0.582, + "step": 8456 + }, + { + "epoch": 0.8458, + "grad_norm": 2.261943817138672, + "learning_rate": 1.413977857197405e-06, + "loss": 0.158, + "step": 8458 + }, + { + "epoch": 0.846, + "grad_norm": 2.8878419399261475, + "learning_rate": 1.4104010306933558e-06, + "loss": 0.1351, + "step": 8460 + }, + { + "epoch": 0.8462, + "grad_norm": 19.130945205688477, + "learning_rate": 1.4068283906554969e-06, + "loss": 1.0846, + "step": 8462 + }, + { + "epoch": 0.8464, + "grad_norm": 0.24062322080135345, + "learning_rate": 1.40325993882509e-06, + "loss": 0.1067, + "step": 8464 + }, + { + "epoch": 0.8466, + "grad_norm": 11.721121788024902, + "learning_rate": 1.399695676941354e-06, + "loss": 0.5738, + "step": 8466 + }, + { + "epoch": 0.8468, + "grad_norm": 14.206520080566406, + "learning_rate": 1.3961356067414667e-06, + "loss": 0.392, + "step": 8468 + }, + { + "epoch": 0.847, + "grad_norm": 1.0727972984313965, + "learning_rate": 1.3925797299605649e-06, + "loss": 0.1466, + "step": 8470 + }, + { + "epoch": 0.8472, + "grad_norm": 0.12908236682415009, + "learning_rate": 1.3890280483317375e-06, + "loss": 0.1115, + "step": 8472 + }, + { + "epoch": 0.8474, + "grad_norm": 6.654333114624023, + "learning_rate": 1.3854805635860335e-06, + "loss": 0.4729, + "step": 8474 + }, + { + "epoch": 0.8476, + "grad_norm": 1.1645723581314087, + "learning_rate": 1.381937277452451e-06, + "loss": 0.1326, + "step": 8476 + }, + { + "epoch": 0.8478, + "grad_norm": 0.21057263016700745, + "learning_rate": 1.3783981916579448e-06, + "loss": 0.1115, + "step": 8478 + }, + { + "epoch": 0.848, + "grad_norm": 8.1213960647583, + "learning_rate": 1.3748633079274254e-06, + "loss": 0.0788, + "step": 8480 + }, + { + "epoch": 0.8482, + "grad_norm": 0.8701608777046204, + "learning_rate": 1.3713326279837502e-06, + "loss": 0.0475, + "step": 8482 + }, + { + "epoch": 0.8484, + "grad_norm": 4.4492011070251465, + "learning_rate": 1.3678061535477305e-06, + "loss": 0.3627, + "step": 8484 + }, + { + "epoch": 0.8486, + "grad_norm": 0.3874255120754242, + "learning_rate": 1.3642838863381258e-06, + "loss": 0.1014, + "step": 8486 + }, + { + "epoch": 0.8488, + "grad_norm": 0.7445206046104431, + "learning_rate": 1.3607658280716474e-06, + "loss": 0.1836, + "step": 8488 + }, + { + "epoch": 0.849, + "grad_norm": 3.221233606338501, + "learning_rate": 1.3572519804629537e-06, + "loss": 0.1352, + "step": 8490 + }, + { + "epoch": 0.8492, + "grad_norm": 2.725607395172119, + "learning_rate": 1.3537423452246522e-06, + "loss": 0.5147, + "step": 8492 + }, + { + "epoch": 0.8494, + "grad_norm": 0.7844517230987549, + "learning_rate": 1.3502369240672941e-06, + "loss": 0.1037, + "step": 8494 + }, + { + "epoch": 0.8496, + "grad_norm": 12.240203857421875, + "learning_rate": 1.3467357186993802e-06, + "loss": 0.4531, + "step": 8496 + }, + { + "epoch": 0.8498, + "grad_norm": 0.11654934287071228, + "learning_rate": 1.3432387308273576e-06, + "loss": 0.0444, + "step": 8498 + }, + { + "epoch": 0.85, + "grad_norm": 9.027752876281738, + "learning_rate": 1.339745962155613e-06, + "loss": 0.1193, + "step": 8500 + }, + { + "epoch": 0.8502, + "grad_norm": 0.1004432961344719, + "learning_rate": 1.3362574143864816e-06, + "loss": 0.0259, + "step": 8502 + }, + { + "epoch": 0.8504, + "grad_norm": 9.088797569274902, + "learning_rate": 1.3327730892202384e-06, + "loss": 0.7773, + "step": 8504 + }, + { + "epoch": 0.8506, + "grad_norm": 16.662147521972656, + "learning_rate": 1.3292929883550998e-06, + "loss": 1.1887, + "step": 8506 + }, + { + "epoch": 0.8508, + "grad_norm": 2.511016845703125, + "learning_rate": 1.3258171134872267e-06, + "loss": 0.0684, + "step": 8508 + }, + { + "epoch": 0.851, + "grad_norm": 6.43282413482666, + "learning_rate": 1.322345466310717e-06, + "loss": 0.2753, + "step": 8510 + }, + { + "epoch": 0.8512, + "grad_norm": 6.069318771362305, + "learning_rate": 1.3188780485176089e-06, + "loss": 0.2224, + "step": 8512 + }, + { + "epoch": 0.8514, + "grad_norm": 0.11646804958581924, + "learning_rate": 1.3154148617978813e-06, + "loss": 0.1111, + "step": 8514 + }, + { + "epoch": 0.8516, + "grad_norm": 0.15227827429771423, + "learning_rate": 1.3119559078394462e-06, + "loss": 0.0602, + "step": 8516 + }, + { + "epoch": 0.8518, + "grad_norm": 2.996511936187744, + "learning_rate": 1.3085011883281606e-06, + "loss": 0.178, + "step": 8518 + }, + { + "epoch": 0.852, + "grad_norm": 1.617392897605896, + "learning_rate": 1.30505070494781e-06, + "loss": 0.0411, + "step": 8520 + }, + { + "epoch": 0.8522, + "grad_norm": 0.6266528367996216, + "learning_rate": 1.3016044593801202e-06, + "loss": 0.0815, + "step": 8522 + }, + { + "epoch": 0.8524, + "grad_norm": 0.45551830530166626, + "learning_rate": 1.2981624533047432e-06, + "loss": 0.0814, + "step": 8524 + }, + { + "epoch": 0.8526, + "grad_norm": 0.08374336361885071, + "learning_rate": 1.294724688399278e-06, + "loss": 0.0151, + "step": 8526 + }, + { + "epoch": 0.8528, + "grad_norm": 18.812379837036133, + "learning_rate": 1.2912911663392468e-06, + "loss": 0.5639, + "step": 8528 + }, + { + "epoch": 0.853, + "grad_norm": 3.7929346561431885, + "learning_rate": 1.2878618887981064e-06, + "loss": 0.1083, + "step": 8530 + }, + { + "epoch": 0.8532, + "grad_norm": 9.206286430358887, + "learning_rate": 1.2844368574472454e-06, + "loss": 0.4566, + "step": 8532 + }, + { + "epoch": 0.8534, + "grad_norm": 0.2521537244319916, + "learning_rate": 1.2810160739559797e-06, + "loss": 0.0133, + "step": 8534 + }, + { + "epoch": 0.8536, + "grad_norm": 3.801011800765991, + "learning_rate": 1.277599539991563e-06, + "loss": 0.0926, + "step": 8536 + }, + { + "epoch": 0.8538, + "grad_norm": 2.9231858253479004, + "learning_rate": 1.2741872572191684e-06, + "loss": 0.4151, + "step": 8538 + }, + { + "epoch": 0.854, + "grad_norm": 2.4219069480895996, + "learning_rate": 1.2707792273019049e-06, + "loss": 0.0882, + "step": 8540 + }, + { + "epoch": 0.8542, + "grad_norm": 6.455507278442383, + "learning_rate": 1.2673754519008008e-06, + "loss": 0.4174, + "step": 8542 + }, + { + "epoch": 0.8544, + "grad_norm": 4.968897819519043, + "learning_rate": 1.2639759326748136e-06, + "loss": 0.1519, + "step": 8544 + }, + { + "epoch": 0.8546, + "grad_norm": 4.769510746002197, + "learning_rate": 1.2605806712808322e-06, + "loss": 0.2923, + "step": 8546 + }, + { + "epoch": 0.8548, + "grad_norm": 6.388027667999268, + "learning_rate": 1.257189669373664e-06, + "loss": 0.6673, + "step": 8548 + }, + { + "epoch": 0.855, + "grad_norm": 1.0057413578033447, + "learning_rate": 1.2538029286060428e-06, + "loss": 0.357, + "step": 8550 + }, + { + "epoch": 0.8552, + "grad_norm": 5.030423641204834, + "learning_rate": 1.2504204506286244e-06, + "loss": 0.2417, + "step": 8552 + }, + { + "epoch": 0.8554, + "grad_norm": 2.626772880554199, + "learning_rate": 1.2470422370899838e-06, + "loss": 0.1291, + "step": 8554 + }, + { + "epoch": 0.8556, + "grad_norm": 0.3825162947177887, + "learning_rate": 1.2436682896366282e-06, + "loss": 0.0741, + "step": 8556 + }, + { + "epoch": 0.8558, + "grad_norm": 0.7216492891311646, + "learning_rate": 1.2402986099129765e-06, + "loss": 0.0308, + "step": 8558 + }, + { + "epoch": 0.856, + "grad_norm": 3.302356004714966, + "learning_rate": 1.2369331995613664e-06, + "loss": 0.1157, + "step": 8560 + }, + { + "epoch": 0.8562, + "grad_norm": 9.541150093078613, + "learning_rate": 1.233572060222057e-06, + "loss": 0.6162, + "step": 8562 + }, + { + "epoch": 0.8564, + "grad_norm": 5.261593818664551, + "learning_rate": 1.230215193533233e-06, + "loss": 0.5069, + "step": 8564 + }, + { + "epoch": 0.8566, + "grad_norm": 3.9012904167175293, + "learning_rate": 1.2268626011309858e-06, + "loss": 0.3833, + "step": 8566 + }, + { + "epoch": 0.8568, + "grad_norm": 6.2977824211120605, + "learning_rate": 1.223514284649331e-06, + "loss": 0.2207, + "step": 8568 + }, + { + "epoch": 0.857, + "grad_norm": 13.93101692199707, + "learning_rate": 1.2201702457201948e-06, + "loss": 0.6161, + "step": 8570 + }, + { + "epoch": 0.8572, + "grad_norm": 1.678261399269104, + "learning_rate": 1.2168304859734226e-06, + "loss": 0.2573, + "step": 8572 + }, + { + "epoch": 0.8574, + "grad_norm": 1.060602068901062, + "learning_rate": 1.2134950070367723e-06, + "loss": 0.0264, + "step": 8574 + }, + { + "epoch": 0.8576, + "grad_norm": 0.56810063123703, + "learning_rate": 1.210163810535917e-06, + "loss": 0.0277, + "step": 8576 + }, + { + "epoch": 0.8578, + "grad_norm": 0.9596744775772095, + "learning_rate": 1.206836898094439e-06, + "loss": 0.3743, + "step": 8578 + }, + { + "epoch": 0.858, + "grad_norm": 2.2686612606048584, + "learning_rate": 1.2035142713338366e-06, + "loss": 0.3463, + "step": 8580 + }, + { + "epoch": 0.8582, + "grad_norm": 2.6643917560577393, + "learning_rate": 1.2001959318735158e-06, + "loss": 0.1157, + "step": 8582 + }, + { + "epoch": 0.8584, + "grad_norm": 0.3418812155723572, + "learning_rate": 1.196881881330798e-06, + "loss": 0.0231, + "step": 8584 + }, + { + "epoch": 0.8586, + "grad_norm": 0.1340498924255371, + "learning_rate": 1.1935721213209106e-06, + "loss": 0.4049, + "step": 8586 + }, + { + "epoch": 0.8588, + "grad_norm": 1.7400144338607788, + "learning_rate": 1.1902666534569884e-06, + "loss": 0.0579, + "step": 8588 + }, + { + "epoch": 0.859, + "grad_norm": 4.910536289215088, + "learning_rate": 1.1869654793500784e-06, + "loss": 0.131, + "step": 8590 + }, + { + "epoch": 0.8592, + "grad_norm": 5.080787181854248, + "learning_rate": 1.1836686006091313e-06, + "loss": 0.195, + "step": 8592 + }, + { + "epoch": 0.8594, + "grad_norm": 2.8592584133148193, + "learning_rate": 1.1803760188410074e-06, + "loss": 0.1372, + "step": 8594 + }, + { + "epoch": 0.8596, + "grad_norm": 7.400374412536621, + "learning_rate": 1.1770877356504684e-06, + "loss": 0.2299, + "step": 8596 + }, + { + "epoch": 0.8598, + "grad_norm": 0.6476640105247498, + "learning_rate": 1.1738037526401857e-06, + "loss": 0.101, + "step": 8598 + }, + { + "epoch": 0.86, + "grad_norm": 0.409222811460495, + "learning_rate": 1.1705240714107301e-06, + "loss": 0.1831, + "step": 8600 + }, + { + "epoch": 0.8602, + "grad_norm": 0.21666644513607025, + "learning_rate": 1.167248693560583e-06, + "loss": 0.0241, + "step": 8602 + }, + { + "epoch": 0.8604, + "grad_norm": 3.258988380432129, + "learning_rate": 1.1639776206861197e-06, + "loss": 0.2755, + "step": 8604 + }, + { + "epoch": 0.8606, + "grad_norm": 0.4320504367351532, + "learning_rate": 1.1607108543816247e-06, + "loss": 0.3407, + "step": 8606 + }, + { + "epoch": 0.8608, + "grad_norm": 0.08888380974531174, + "learning_rate": 1.1574483962392768e-06, + "loss": 0.1613, + "step": 8608 + }, + { + "epoch": 0.861, + "grad_norm": 1.8909862041473389, + "learning_rate": 1.1541902478491607e-06, + "loss": 0.0855, + "step": 8610 + }, + { + "epoch": 0.8612, + "grad_norm": 1.26923406124115, + "learning_rate": 1.1509364107992582e-06, + "loss": 0.2139, + "step": 8612 + }, + { + "epoch": 0.8614, + "grad_norm": 3.7628116607666016, + "learning_rate": 1.1476868866754488e-06, + "loss": 0.2089, + "step": 8614 + }, + { + "epoch": 0.8616, + "grad_norm": 1.1959969997406006, + "learning_rate": 1.1444416770615118e-06, + "loss": 0.0405, + "step": 8616 + }, + { + "epoch": 0.8618, + "grad_norm": 2.2536253929138184, + "learning_rate": 1.1412007835391237e-06, + "loss": 0.1271, + "step": 8618 + }, + { + "epoch": 0.862, + "grad_norm": 0.6073092222213745, + "learning_rate": 1.1379642076878528e-06, + "loss": 0.0252, + "step": 8620 + }, + { + "epoch": 0.8622, + "grad_norm": 1.5299214124679565, + "learning_rate": 1.1347319510851718e-06, + "loss": 0.0549, + "step": 8622 + }, + { + "epoch": 0.8624, + "grad_norm": 8.978793144226074, + "learning_rate": 1.1315040153064416e-06, + "loss": 0.181, + "step": 8624 + }, + { + "epoch": 0.8626, + "grad_norm": 0.5768134593963623, + "learning_rate": 1.1282804019249183e-06, + "loss": 0.0583, + "step": 8626 + }, + { + "epoch": 0.8628, + "grad_norm": 8.614498138427734, + "learning_rate": 1.1250611125117527e-06, + "loss": 0.317, + "step": 8628 + }, + { + "epoch": 0.863, + "grad_norm": 1.30638587474823, + "learning_rate": 1.1218461486359878e-06, + "loss": 0.1944, + "step": 8630 + }, + { + "epoch": 0.8632, + "grad_norm": 1.0229755640029907, + "learning_rate": 1.1186355118645552e-06, + "loss": 0.0243, + "step": 8632 + }, + { + "epoch": 0.8634, + "grad_norm": 5.84527063369751, + "learning_rate": 1.1154292037622838e-06, + "loss": 0.5331, + "step": 8634 + }, + { + "epoch": 0.8636, + "grad_norm": 1.1791433095932007, + "learning_rate": 1.1122272258918864e-06, + "loss": 0.0654, + "step": 8636 + }, + { + "epoch": 0.8638, + "grad_norm": 0.2796035706996918, + "learning_rate": 1.1090295798139672e-06, + "loss": 0.0499, + "step": 8638 + }, + { + "epoch": 0.864, + "grad_norm": 4.677618503570557, + "learning_rate": 1.1058362670870248e-06, + "loss": 0.5094, + "step": 8640 + }, + { + "epoch": 0.8642, + "grad_norm": 0.09749685972929001, + "learning_rate": 1.102647289267438e-06, + "loss": 0.0818, + "step": 8642 + }, + { + "epoch": 0.8644, + "grad_norm": 0.3355250060558319, + "learning_rate": 1.0994626479094749e-06, + "loss": 0.1821, + "step": 8644 + }, + { + "epoch": 0.8646, + "grad_norm": 0.9195314049720764, + "learning_rate": 1.096282344565296e-06, + "loss": 0.0604, + "step": 8646 + }, + { + "epoch": 0.8648, + "grad_norm": 5.510916233062744, + "learning_rate": 1.093106380784934e-06, + "loss": 0.2344, + "step": 8648 + }, + { + "epoch": 0.865, + "grad_norm": 0.17866003513336182, + "learning_rate": 1.0899347581163222e-06, + "loss": 0.0469, + "step": 8650 + }, + { + "epoch": 0.8652, + "grad_norm": 1.5016472339630127, + "learning_rate": 1.0867674781052683e-06, + "loss": 0.1255, + "step": 8652 + }, + { + "epoch": 0.8654, + "grad_norm": 5.103957176208496, + "learning_rate": 1.0836045422954665e-06, + "loss": 0.2302, + "step": 8654 + }, + { + "epoch": 0.8656, + "grad_norm": 1.4927504062652588, + "learning_rate": 1.0804459522284927e-06, + "loss": 0.0705, + "step": 8656 + }, + { + "epoch": 0.8658, + "grad_norm": 0.6989368200302124, + "learning_rate": 1.0772917094438052e-06, + "loss": 0.0307, + "step": 8658 + }, + { + "epoch": 0.866, + "grad_norm": 3.5680062770843506, + "learning_rate": 1.0741418154787443e-06, + "loss": 0.1241, + "step": 8660 + }, + { + "epoch": 0.8662, + "grad_norm": 3.0718746185302734, + "learning_rate": 1.0709962718685318e-06, + "loss": 0.0849, + "step": 8662 + }, + { + "epoch": 0.8664, + "grad_norm": 4.636090278625488, + "learning_rate": 1.0678550801462662e-06, + "loss": 0.5831, + "step": 8664 + }, + { + "epoch": 0.8666, + "grad_norm": 4.417630195617676, + "learning_rate": 1.0647182418429224e-06, + "loss": 0.1949, + "step": 8666 + }, + { + "epoch": 0.8668, + "grad_norm": 0.33263662457466125, + "learning_rate": 1.0615857584873624e-06, + "loss": 0.1433, + "step": 8668 + }, + { + "epoch": 0.867, + "grad_norm": 3.1588594913482666, + "learning_rate": 1.058457631606319e-06, + "loss": 0.1752, + "step": 8670 + }, + { + "epoch": 0.8672, + "grad_norm": 2.322646379470825, + "learning_rate": 1.0553338627244026e-06, + "loss": 0.2061, + "step": 8672 + }, + { + "epoch": 0.8674, + "grad_norm": 0.7177225351333618, + "learning_rate": 1.0522144533641e-06, + "loss": 0.0197, + "step": 8674 + }, + { + "epoch": 0.8676, + "grad_norm": 0.4758107364177704, + "learning_rate": 1.0490994050457748e-06, + "loss": 0.1093, + "step": 8676 + }, + { + "epoch": 0.8678, + "grad_norm": 0.5186434388160706, + "learning_rate": 1.0459887192876595e-06, + "loss": 0.0234, + "step": 8678 + }, + { + "epoch": 0.868, + "grad_norm": 9.482634544372559, + "learning_rate": 1.042882397605871e-06, + "loss": 0.2491, + "step": 8680 + }, + { + "epoch": 0.8682, + "grad_norm": 7.123345851898193, + "learning_rate": 1.039780441514391e-06, + "loss": 0.3047, + "step": 8682 + }, + { + "epoch": 0.8684, + "grad_norm": 0.9874138236045837, + "learning_rate": 1.0366828525250728e-06, + "loss": 0.0312, + "step": 8684 + }, + { + "epoch": 0.8686, + "grad_norm": 1.7774295806884766, + "learning_rate": 1.0335896321476413e-06, + "loss": 0.2216, + "step": 8686 + }, + { + "epoch": 0.8688, + "grad_norm": 1.3534643650054932, + "learning_rate": 1.0305007818897006e-06, + "loss": 0.1727, + "step": 8688 + }, + { + "epoch": 0.869, + "grad_norm": 6.799195289611816, + "learning_rate": 1.0274163032567165e-06, + "loss": 0.2699, + "step": 8690 + }, + { + "epoch": 0.8692, + "grad_norm": 0.2826516628265381, + "learning_rate": 1.024336197752025e-06, + "loss": 0.1378, + "step": 8692 + }, + { + "epoch": 0.8694, + "grad_norm": 8.372241973876953, + "learning_rate": 1.0212604668768343e-06, + "loss": 0.3352, + "step": 8694 + }, + { + "epoch": 0.8696, + "grad_norm": 6.669303894042969, + "learning_rate": 1.0181891121302145e-06, + "loss": 0.2608, + "step": 8696 + }, + { + "epoch": 0.8698, + "grad_norm": 19.5428409576416, + "learning_rate": 1.0151221350091134e-06, + "loss": 0.3696, + "step": 8698 + }, + { + "epoch": 0.87, + "grad_norm": 3.6535227298736572, + "learning_rate": 1.012059537008332e-06, + "loss": 0.1802, + "step": 8700 + }, + { + "epoch": 0.8702, + "grad_norm": 0.23964762687683105, + "learning_rate": 1.009001319620545e-06, + "loss": 0.0764, + "step": 8702 + }, + { + "epoch": 0.8704, + "grad_norm": 3.83192777633667, + "learning_rate": 1.0059474843362893e-06, + "loss": 0.1016, + "step": 8704 + }, + { + "epoch": 0.8706, + "grad_norm": 0.5659181475639343, + "learning_rate": 1.0028980326439708e-06, + "loss": 0.0315, + "step": 8706 + }, + { + "epoch": 0.8708, + "grad_norm": 0.3113324046134949, + "learning_rate": 9.99852966029854e-07, + "loss": 0.0298, + "step": 8708 + }, + { + "epoch": 0.871, + "grad_norm": 2.3000476360321045, + "learning_rate": 9.968122859780648e-07, + "loss": 0.1292, + "step": 8710 + }, + { + "epoch": 0.8712, + "grad_norm": 10.24364185333252, + "learning_rate": 9.93775993970597e-07, + "loss": 0.3409, + "step": 8712 + }, + { + "epoch": 0.8714, + "grad_norm": 6.997418403625488, + "learning_rate": 9.907440914873e-07, + "loss": 0.3218, + "step": 8714 + }, + { + "epoch": 0.8716, + "grad_norm": 6.09216833114624, + "learning_rate": 9.877165800058874e-07, + "loss": 0.5068, + "step": 8716 + }, + { + "epoch": 0.8718, + "grad_norm": 10.486473083496094, + "learning_rate": 9.84693461001932e-07, + "loss": 0.4262, + "step": 8718 + }, + { + "epoch": 0.872, + "grad_norm": 1.4312385320663452, + "learning_rate": 9.816747359488632e-07, + "loss": 0.033, + "step": 8720 + }, + { + "epoch": 0.8722, + "grad_norm": 5.312750816345215, + "learning_rate": 9.786604063179728e-07, + "loss": 0.2659, + "step": 8722 + }, + { + "epoch": 0.8724, + "grad_norm": 0.6861959099769592, + "learning_rate": 9.756504735784067e-07, + "loss": 0.0534, + "step": 8724 + }, + { + "epoch": 0.8726, + "grad_norm": 0.7620915770530701, + "learning_rate": 9.726449391971716e-07, + "loss": 0.0764, + "step": 8726 + }, + { + "epoch": 0.8728, + "grad_norm": 1.8566800355911255, + "learning_rate": 9.696438046391288e-07, + "loss": 0.2138, + "step": 8728 + }, + { + "epoch": 0.873, + "grad_norm": 0.5982115268707275, + "learning_rate": 9.666470713669918e-07, + "loss": 0.2266, + "step": 8730 + }, + { + "epoch": 0.8732, + "grad_norm": 2.208495855331421, + "learning_rate": 9.636547408413355e-07, + "loss": 0.2699, + "step": 8732 + }, + { + "epoch": 0.8734, + "grad_norm": 0.8084271550178528, + "learning_rate": 9.606668145205833e-07, + "loss": 0.0984, + "step": 8734 + }, + { + "epoch": 0.8736, + "grad_norm": 0.5741896033287048, + "learning_rate": 9.576832938610137e-07, + "loss": 0.065, + "step": 8736 + }, + { + "epoch": 0.8738, + "grad_norm": 2.5139265060424805, + "learning_rate": 9.547041803167601e-07, + "loss": 0.6472, + "step": 8738 + }, + { + "epoch": 0.874, + "grad_norm": 1.5510939359664917, + "learning_rate": 9.517294753398066e-07, + "loss": 0.1477, + "step": 8740 + }, + { + "epoch": 0.8742, + "grad_norm": 3.824021100997925, + "learning_rate": 9.487591803799856e-07, + "loss": 0.5652, + "step": 8742 + }, + { + "epoch": 0.8744, + "grad_norm": 0.2421092838048935, + "learning_rate": 9.457932968849826e-07, + "loss": 0.0591, + "step": 8744 + }, + { + "epoch": 0.8746, + "grad_norm": 2.1888046264648438, + "learning_rate": 9.428318263003378e-07, + "loss": 0.1762, + "step": 8746 + }, + { + "epoch": 0.8748, + "grad_norm": 0.650844156742096, + "learning_rate": 9.398747700694322e-07, + "loss": 0.073, + "step": 8748 + }, + { + "epoch": 0.875, + "grad_norm": 1.4648793935775757, + "learning_rate": 9.369221296335007e-07, + "loss": 0.1441, + "step": 8750 + }, + { + "epoch": 0.8752, + "grad_norm": 0.7553773522377014, + "learning_rate": 9.339739064316233e-07, + "loss": 0.3239, + "step": 8752 + }, + { + "epoch": 0.8754, + "grad_norm": 7.857962131500244, + "learning_rate": 9.310301019007284e-07, + "loss": 0.5094, + "step": 8754 + }, + { + "epoch": 0.8756, + "grad_norm": 0.09507651627063751, + "learning_rate": 9.280907174755916e-07, + "loss": 0.0181, + "step": 8756 + }, + { + "epoch": 0.8758, + "grad_norm": 0.216440349817276, + "learning_rate": 9.251557545888312e-07, + "loss": 0.0207, + "step": 8758 + }, + { + "epoch": 0.876, + "grad_norm": 3.5091652870178223, + "learning_rate": 9.222252146709143e-07, + "loss": 0.1509, + "step": 8760 + }, + { + "epoch": 0.8762, + "grad_norm": 1.7899590730667114, + "learning_rate": 9.192990991501483e-07, + "loss": 0.0957, + "step": 8762 + }, + { + "epoch": 0.8764, + "grad_norm": 0.14606180787086487, + "learning_rate": 9.16377409452689e-07, + "loss": 0.0184, + "step": 8764 + }, + { + "epoch": 0.8766, + "grad_norm": 4.735692501068115, + "learning_rate": 9.134601470025306e-07, + "loss": 0.6098, + "step": 8766 + }, + { + "epoch": 0.8768, + "grad_norm": 7.663984775543213, + "learning_rate": 9.105473132215126e-07, + "loss": 0.4473, + "step": 8768 + }, + { + "epoch": 0.877, + "grad_norm": 0.26074671745300293, + "learning_rate": 9.076389095293148e-07, + "loss": 0.1274, + "step": 8770 + }, + { + "epoch": 0.8772, + "grad_norm": 1.6866629123687744, + "learning_rate": 9.047349373434566e-07, + "loss": 0.2995, + "step": 8772 + }, + { + "epoch": 0.8774, + "grad_norm": 7.11013650894165, + "learning_rate": 9.018353980792993e-07, + "loss": 1.0223, + "step": 8774 + }, + { + "epoch": 0.8776, + "grad_norm": 0.47749584913253784, + "learning_rate": 8.989402931500434e-07, + "loss": 0.8661, + "step": 8776 + }, + { + "epoch": 0.8778, + "grad_norm": 1.8943768739700317, + "learning_rate": 8.960496239667282e-07, + "loss": 0.2866, + "step": 8778 + }, + { + "epoch": 0.878, + "grad_norm": 0.1955864131450653, + "learning_rate": 8.931633919382299e-07, + "loss": 0.1811, + "step": 8780 + }, + { + "epoch": 0.8782, + "grad_norm": 0.1676880419254303, + "learning_rate": 8.902815984712621e-07, + "loss": 0.051, + "step": 8782 + }, + { + "epoch": 0.8784, + "grad_norm": 4.439838886260986, + "learning_rate": 8.874042449703779e-07, + "loss": 0.1016, + "step": 8784 + }, + { + "epoch": 0.8786, + "grad_norm": 10.822925567626953, + "learning_rate": 8.845313328379635e-07, + "loss": 0.5447, + "step": 8786 + }, + { + "epoch": 0.8788, + "grad_norm": 0.13560588657855988, + "learning_rate": 8.816628634742441e-07, + "loss": 0.0612, + "step": 8788 + }, + { + "epoch": 0.879, + "grad_norm": 0.5245721340179443, + "learning_rate": 8.787988382772705e-07, + "loss": 0.0627, + "step": 8790 + }, + { + "epoch": 0.8792, + "grad_norm": 9.82371711730957, + "learning_rate": 8.759392586429394e-07, + "loss": 0.1599, + "step": 8792 + }, + { + "epoch": 0.8794, + "grad_norm": 6.036669731140137, + "learning_rate": 8.730841259649725e-07, + "loss": 0.4581, + "step": 8794 + }, + { + "epoch": 0.8796, + "grad_norm": 0.2324625551700592, + "learning_rate": 8.702334416349279e-07, + "loss": 0.1248, + "step": 8796 + }, + { + "epoch": 0.8798, + "grad_norm": 3.4358489513397217, + "learning_rate": 8.67387207042194e-07, + "loss": 0.2447, + "step": 8798 + }, + { + "epoch": 0.88, + "grad_norm": 8.857840538024902, + "learning_rate": 8.645454235739903e-07, + "loss": 0.292, + "step": 8800 + }, + { + "epoch": 0.8802, + "grad_norm": 0.6323832273483276, + "learning_rate": 8.617080926153698e-07, + "loss": 0.0321, + "step": 8802 + }, + { + "epoch": 0.8804, + "grad_norm": 1.7104426622390747, + "learning_rate": 8.58875215549212e-07, + "loss": 0.0991, + "step": 8804 + }, + { + "epoch": 0.8806, + "grad_norm": 0.7574736475944519, + "learning_rate": 8.560467937562278e-07, + "loss": 0.2594, + "step": 8806 + }, + { + "epoch": 0.8808, + "grad_norm": 0.7536328434944153, + "learning_rate": 8.532228286149502e-07, + "loss": 0.0428, + "step": 8808 + }, + { + "epoch": 0.881, + "grad_norm": 0.9132006168365479, + "learning_rate": 8.504033215017527e-07, + "loss": 0.0261, + "step": 8810 + }, + { + "epoch": 0.8812, + "grad_norm": 2.563493251800537, + "learning_rate": 8.475882737908248e-07, + "loss": 0.2973, + "step": 8812 + }, + { + "epoch": 0.8814, + "grad_norm": 3.4220328330993652, + "learning_rate": 8.447776868541879e-07, + "loss": 0.8106, + "step": 8814 + }, + { + "epoch": 0.8816, + "grad_norm": 2.286410093307495, + "learning_rate": 8.419715620616875e-07, + "loss": 0.1213, + "step": 8816 + }, + { + "epoch": 0.8818, + "grad_norm": 5.153921604156494, + "learning_rate": 8.39169900780995e-07, + "loss": 0.2371, + "step": 8818 + }, + { + "epoch": 0.882, + "grad_norm": 7.493651866912842, + "learning_rate": 8.363727043776037e-07, + "loss": 0.3881, + "step": 8820 + }, + { + "epoch": 0.8822, + "grad_norm": 3.6763105392456055, + "learning_rate": 8.335799742148387e-07, + "loss": 0.2317, + "step": 8822 + }, + { + "epoch": 0.8824, + "grad_norm": 0.9999001622200012, + "learning_rate": 8.307917116538378e-07, + "loss": 0.032, + "step": 8824 + }, + { + "epoch": 0.8826, + "grad_norm": 2.328443765640259, + "learning_rate": 8.280079180535672e-07, + "loss": 0.3312, + "step": 8826 + }, + { + "epoch": 0.8828, + "grad_norm": 0.4259248673915863, + "learning_rate": 8.252285947708139e-07, + "loss": 0.0235, + "step": 8828 + }, + { + "epoch": 0.883, + "grad_norm": 0.877259373664856, + "learning_rate": 8.224537431601886e-07, + "loss": 0.1412, + "step": 8830 + }, + { + "epoch": 0.8832, + "grad_norm": 1.9359627962112427, + "learning_rate": 8.196833645741187e-07, + "loss": 0.0547, + "step": 8832 + }, + { + "epoch": 0.8834, + "grad_norm": 2.3607661724090576, + "learning_rate": 8.169174603628538e-07, + "loss": 0.1364, + "step": 8834 + }, + { + "epoch": 0.8836, + "grad_norm": 0.549034833908081, + "learning_rate": 8.141560318744601e-07, + "loss": 0.0193, + "step": 8836 + }, + { + "epoch": 0.8838, + "grad_norm": 1.0467787981033325, + "learning_rate": 8.113990804548244e-07, + "loss": 0.0407, + "step": 8838 + }, + { + "epoch": 0.884, + "grad_norm": 0.7858067750930786, + "learning_rate": 8.086466074476562e-07, + "loss": 0.0476, + "step": 8840 + }, + { + "epoch": 0.8842, + "grad_norm": 4.689967632293701, + "learning_rate": 8.058986141944724e-07, + "loss": 0.1071, + "step": 8842 + }, + { + "epoch": 0.8844, + "grad_norm": 1.6969621181488037, + "learning_rate": 8.031551020346129e-07, + "loss": 0.2138, + "step": 8844 + }, + { + "epoch": 0.8846, + "grad_norm": 0.24360613524913788, + "learning_rate": 8.004160723052312e-07, + "loss": 0.1045, + "step": 8846 + }, + { + "epoch": 0.8848, + "grad_norm": 0.34203270077705383, + "learning_rate": 7.976815263412963e-07, + "loss": 0.0525, + "step": 8848 + }, + { + "epoch": 0.885, + "grad_norm": 0.1314094215631485, + "learning_rate": 7.949514654755963e-07, + "loss": 0.0112, + "step": 8850 + }, + { + "epoch": 0.8852, + "grad_norm": 13.25346851348877, + "learning_rate": 7.922258910387282e-07, + "loss": 0.8078, + "step": 8852 + }, + { + "epoch": 0.8854, + "grad_norm": 1.7381771802902222, + "learning_rate": 7.895048043591036e-07, + "loss": 0.1546, + "step": 8854 + }, + { + "epoch": 0.8856, + "grad_norm": 7.075968265533447, + "learning_rate": 7.867882067629473e-07, + "loss": 0.253, + "step": 8856 + }, + { + "epoch": 0.8858, + "grad_norm": 0.3180268108844757, + "learning_rate": 7.840760995742946e-07, + "loss": 0.1396, + "step": 8858 + }, + { + "epoch": 0.886, + "grad_norm": 0.30556178092956543, + "learning_rate": 7.81368484114996e-07, + "loss": 0.0337, + "step": 8860 + }, + { + "epoch": 0.8862, + "grad_norm": 5.166200637817383, + "learning_rate": 7.78665361704708e-07, + "loss": 0.0707, + "step": 8862 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3859630525112152, + "learning_rate": 7.759667336609011e-07, + "loss": 0.2357, + "step": 8864 + }, + { + "epoch": 0.8866, + "grad_norm": 4.401634693145752, + "learning_rate": 7.732726012988512e-07, + "loss": 0.2811, + "step": 8866 + }, + { + "epoch": 0.8868, + "grad_norm": 4.760586261749268, + "learning_rate": 7.7058296593165e-07, + "loss": 0.0862, + "step": 8868 + }, + { + "epoch": 0.887, + "grad_norm": 0.8563665747642517, + "learning_rate": 7.678978288701911e-07, + "loss": 0.1357, + "step": 8870 + }, + { + "epoch": 0.8872, + "grad_norm": 9.573128700256348, + "learning_rate": 7.652171914231777e-07, + "loss": 0.3311, + "step": 8872 + }, + { + "epoch": 0.8874, + "grad_norm": 2.106950283050537, + "learning_rate": 7.62541054897119e-07, + "loss": 0.0805, + "step": 8874 + }, + { + "epoch": 0.8876, + "grad_norm": 1.3430426120758057, + "learning_rate": 7.598694205963331e-07, + "loss": 0.0348, + "step": 8876 + }, + { + "epoch": 0.8878, + "grad_norm": 1.226119875907898, + "learning_rate": 7.572022898229403e-07, + "loss": 0.1104, + "step": 8878 + }, + { + "epoch": 0.888, + "grad_norm": 0.6710270047187805, + "learning_rate": 7.545396638768698e-07, + "loss": 0.0422, + "step": 8880 + }, + { + "epoch": 0.8882, + "grad_norm": 7.481780529022217, + "learning_rate": 7.518815440558514e-07, + "loss": 0.627, + "step": 8882 + }, + { + "epoch": 0.8884, + "grad_norm": 1.3092851638793945, + "learning_rate": 7.492279316554207e-07, + "loss": 0.0734, + "step": 8884 + }, + { + "epoch": 0.8886, + "grad_norm": 9.828453063964844, + "learning_rate": 7.465788279689156e-07, + "loss": 0.1894, + "step": 8886 + }, + { + "epoch": 0.8888, + "grad_norm": 4.360773086547852, + "learning_rate": 7.439342342874789e-07, + "loss": 0.289, + "step": 8888 + }, + { + "epoch": 0.889, + "grad_norm": 6.791991710662842, + "learning_rate": 7.412941519000527e-07, + "loss": 0.2009, + "step": 8890 + }, + { + "epoch": 0.8892, + "grad_norm": 0.7145469188690186, + "learning_rate": 7.386585820933812e-07, + "loss": 0.151, + "step": 8892 + }, + { + "epoch": 0.8894, + "grad_norm": 4.417191028594971, + "learning_rate": 7.360275261520078e-07, + "loss": 0.249, + "step": 8894 + }, + { + "epoch": 0.8896, + "grad_norm": 3.2947254180908203, + "learning_rate": 7.334009853582791e-07, + "loss": 0.2522, + "step": 8896 + }, + { + "epoch": 0.8898, + "grad_norm": 0.9451799392700195, + "learning_rate": 7.307789609923377e-07, + "loss": 0.1988, + "step": 8898 + }, + { + "epoch": 0.89, + "grad_norm": 13.138494491577148, + "learning_rate": 7.281614543321269e-07, + "loss": 0.5752, + "step": 8900 + }, + { + "epoch": 0.8902, + "grad_norm": 8.464405059814453, + "learning_rate": 7.255484666533874e-07, + "loss": 0.2775, + "step": 8902 + }, + { + "epoch": 0.8904, + "grad_norm": 7.05691385269165, + "learning_rate": 7.22939999229657e-07, + "loss": 0.1372, + "step": 8904 + }, + { + "epoch": 0.8906, + "grad_norm": 15.607992172241211, + "learning_rate": 7.203360533322734e-07, + "loss": 0.2523, + "step": 8906 + }, + { + "epoch": 0.8908, + "grad_norm": 3.907414197921753, + "learning_rate": 7.177366302303667e-07, + "loss": 0.4798, + "step": 8908 + }, + { + "epoch": 0.891, + "grad_norm": 2.463498830795288, + "learning_rate": 7.151417311908648e-07, + "loss": 0.253, + "step": 8910 + }, + { + "epoch": 0.8912, + "grad_norm": 8.083518028259277, + "learning_rate": 7.125513574784904e-07, + "loss": 0.3566, + "step": 8912 + }, + { + "epoch": 0.8914, + "grad_norm": 0.22113361954689026, + "learning_rate": 7.099655103557557e-07, + "loss": 0.1036, + "step": 8914 + }, + { + "epoch": 0.8916, + "grad_norm": 1.580735683441162, + "learning_rate": 7.073841910829771e-07, + "loss": 0.0369, + "step": 8916 + }, + { + "epoch": 0.8918, + "grad_norm": 0.3952212929725647, + "learning_rate": 7.048074009182548e-07, + "loss": 0.2005, + "step": 8918 + }, + { + "epoch": 0.892, + "grad_norm": 0.2849358022212982, + "learning_rate": 7.022351411174866e-07, + "loss": 0.1149, + "step": 8920 + }, + { + "epoch": 0.8922, + "grad_norm": 2.3384382724761963, + "learning_rate": 6.996674129343606e-07, + "loss": 0.1538, + "step": 8922 + }, + { + "epoch": 0.8924, + "grad_norm": 6.078530788421631, + "learning_rate": 6.971042176203535e-07, + "loss": 0.2348, + "step": 8924 + }, + { + "epoch": 0.8926, + "grad_norm": 0.3761364221572876, + "learning_rate": 6.945455564247394e-07, + "loss": 0.1078, + "step": 8926 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3557867109775543, + "learning_rate": 6.919914305945774e-07, + "loss": 0.7187, + "step": 8928 + }, + { + "epoch": 0.893, + "grad_norm": 2.6539289951324463, + "learning_rate": 6.894418413747183e-07, + "loss": 0.1974, + "step": 8930 + }, + { + "epoch": 0.8932, + "grad_norm": 10.30247974395752, + "learning_rate": 6.868967900077972e-07, + "loss": 0.3038, + "step": 8932 + }, + { + "epoch": 0.8934, + "grad_norm": 7.911545753479004, + "learning_rate": 6.84356277734245e-07, + "loss": 0.1893, + "step": 8934 + }, + { + "epoch": 0.8936, + "grad_norm": 1.4160538911819458, + "learning_rate": 6.818203057922756e-07, + "loss": 0.081, + "step": 8936 + }, + { + "epoch": 0.8938, + "grad_norm": 7.049499988555908, + "learning_rate": 6.792888754178906e-07, + "loss": 0.732, + "step": 8938 + }, + { + "epoch": 0.894, + "grad_norm": 0.5967305302619934, + "learning_rate": 6.767619878448783e-07, + "loss": 0.3509, + "step": 8940 + }, + { + "epoch": 0.8942, + "grad_norm": 0.4135792553424835, + "learning_rate": 6.742396443048138e-07, + "loss": 0.5184, + "step": 8942 + }, + { + "epoch": 0.8944, + "grad_norm": 0.22147655487060547, + "learning_rate": 6.717218460270536e-07, + "loss": 0.047, + "step": 8944 + }, + { + "epoch": 0.8946, + "grad_norm": 0.33166933059692383, + "learning_rate": 6.692085942387483e-07, + "loss": 0.4609, + "step": 8946 + }, + { + "epoch": 0.8948, + "grad_norm": 3.6744141578674316, + "learning_rate": 6.666998901648203e-07, + "loss": 0.1215, + "step": 8948 + }, + { + "epoch": 0.895, + "grad_norm": 2.804565668106079, + "learning_rate": 6.641957350279838e-07, + "loss": 0.0926, + "step": 8950 + }, + { + "epoch": 0.8952, + "grad_norm": 7.827357292175293, + "learning_rate": 6.616961300487323e-07, + "loss": 0.2135, + "step": 8952 + }, + { + "epoch": 0.8954, + "grad_norm": 0.3853585124015808, + "learning_rate": 6.592010764453449e-07, + "loss": 0.0498, + "step": 8954 + }, + { + "epoch": 0.8956, + "grad_norm": 6.187767028808594, + "learning_rate": 6.567105754338798e-07, + "loss": 0.2772, + "step": 8956 + }, + { + "epoch": 0.8958, + "grad_norm": 2.0838916301727295, + "learning_rate": 6.542246282281772e-07, + "loss": 0.1949, + "step": 8958 + }, + { + "epoch": 0.896, + "grad_norm": 0.30169788002967834, + "learning_rate": 6.517432360398556e-07, + "loss": 0.0308, + "step": 8960 + }, + { + "epoch": 0.8962, + "grad_norm": 7.098496437072754, + "learning_rate": 6.492664000783166e-07, + "loss": 0.2693, + "step": 8962 + }, + { + "epoch": 0.8964, + "grad_norm": 0.6426255106925964, + "learning_rate": 6.467941215507434e-07, + "loss": 0.1009, + "step": 8964 + }, + { + "epoch": 0.8966, + "grad_norm": 1.5507556200027466, + "learning_rate": 6.443264016620887e-07, + "loss": 0.1762, + "step": 8966 + }, + { + "epoch": 0.8968, + "grad_norm": 1.261578917503357, + "learning_rate": 6.418632416150927e-07, + "loss": 0.4873, + "step": 8968 + }, + { + "epoch": 0.897, + "grad_norm": 4.74484920501709, + "learning_rate": 6.394046426102673e-07, + "loss": 0.263, + "step": 8970 + }, + { + "epoch": 0.8972, + "grad_norm": 7.151072978973389, + "learning_rate": 6.369506058459063e-07, + "loss": 0.9232, + "step": 8972 + }, + { + "epoch": 0.8974, + "grad_norm": 12.998397827148438, + "learning_rate": 6.345011325180772e-07, + "loss": 0.337, + "step": 8974 + }, + { + "epoch": 0.8976, + "grad_norm": 0.23520877957344055, + "learning_rate": 6.320562238206218e-07, + "loss": 0.1264, + "step": 8976 + }, + { + "epoch": 0.8978, + "grad_norm": 3.649014949798584, + "learning_rate": 6.296158809451602e-07, + "loss": 0.584, + "step": 8978 + }, + { + "epoch": 0.898, + "grad_norm": 5.990115642547607, + "learning_rate": 6.271801050810856e-07, + "loss": 0.3047, + "step": 8980 + }, + { + "epoch": 0.8982, + "grad_norm": 0.6255689859390259, + "learning_rate": 6.247488974155657e-07, + "loss": 0.045, + "step": 8982 + }, + { + "epoch": 0.8984, + "grad_norm": 0.7009949684143066, + "learning_rate": 6.223222591335409e-07, + "loss": 0.038, + "step": 8984 + }, + { + "epoch": 0.8986, + "grad_norm": 1.1112905740737915, + "learning_rate": 6.199001914177261e-07, + "loss": 0.3876, + "step": 8986 + }, + { + "epoch": 0.8988, + "grad_norm": 1.7793662548065186, + "learning_rate": 6.174826954486069e-07, + "loss": 0.1145, + "step": 8988 + }, + { + "epoch": 0.899, + "grad_norm": 4.442110061645508, + "learning_rate": 6.150697724044407e-07, + "loss": 0.1285, + "step": 8990 + }, + { + "epoch": 0.8992, + "grad_norm": 4.742364883422852, + "learning_rate": 6.126614234612593e-07, + "loss": 0.1946, + "step": 8992 + }, + { + "epoch": 0.8994, + "grad_norm": 3.5769293308258057, + "learning_rate": 6.102576497928614e-07, + "loss": 0.1948, + "step": 8994 + }, + { + "epoch": 0.8996, + "grad_norm": 1.6691123247146606, + "learning_rate": 6.078584525708175e-07, + "loss": 0.3096, + "step": 8996 + }, + { + "epoch": 0.8998, + "grad_norm": 6.02407169342041, + "learning_rate": 6.054638329644658e-07, + "loss": 0.2622, + "step": 8998 + }, + { + "epoch": 0.9, + "grad_norm": 2.7169809341430664, + "learning_rate": 6.030737921409169e-07, + "loss": 0.0958, + "step": 9000 + }, + { + "epoch": 0.9002, + "grad_norm": 6.10143518447876, + "learning_rate": 6.006883312650458e-07, + "loss": 0.3443, + "step": 9002 + }, + { + "epoch": 0.9004, + "grad_norm": 0.9577600359916687, + "learning_rate": 5.98307451499498e-07, + "loss": 0.0401, + "step": 9004 + }, + { + "epoch": 0.9006, + "grad_norm": 0.5463529825210571, + "learning_rate": 5.959311540046863e-07, + "loss": 0.0247, + "step": 9006 + }, + { + "epoch": 0.9008, + "grad_norm": 0.9867238402366638, + "learning_rate": 5.935594399387856e-07, + "loss": 0.0368, + "step": 9008 + }, + { + "epoch": 0.901, + "grad_norm": 1.9127979278564453, + "learning_rate": 5.911923104577455e-07, + "loss": 0.1973, + "step": 9010 + }, + { + "epoch": 0.9012, + "grad_norm": 1.0905245542526245, + "learning_rate": 5.888297667152731e-07, + "loss": 0.1066, + "step": 9012 + }, + { + "epoch": 0.9014, + "grad_norm": 3.840873956680298, + "learning_rate": 5.864718098628441e-07, + "loss": 0.2942, + "step": 9014 + }, + { + "epoch": 0.9016, + "grad_norm": 5.040812015533447, + "learning_rate": 5.841184410496992e-07, + "loss": 0.1893, + "step": 9016 + }, + { + "epoch": 0.9018, + "grad_norm": 2.9948835372924805, + "learning_rate": 5.817696614228396e-07, + "loss": 0.2377, + "step": 9018 + }, + { + "epoch": 0.902, + "grad_norm": 8.731274604797363, + "learning_rate": 5.794254721270331e-07, + "loss": 0.5482, + "step": 9020 + }, + { + "epoch": 0.9022, + "grad_norm": 1.1173052787780762, + "learning_rate": 5.770858743048091e-07, + "loss": 0.1591, + "step": 9022 + }, + { + "epoch": 0.9024, + "grad_norm": 0.0907478854060173, + "learning_rate": 5.747508690964599e-07, + "loss": 0.0537, + "step": 9024 + }, + { + "epoch": 0.9026, + "grad_norm": 3.6706032752990723, + "learning_rate": 5.724204576400372e-07, + "loss": 0.1535, + "step": 9026 + }, + { + "epoch": 0.9028, + "grad_norm": 1.2252155542373657, + "learning_rate": 5.700946410713548e-07, + "loss": 0.0574, + "step": 9028 + }, + { + "epoch": 0.903, + "grad_norm": 1.3831772804260254, + "learning_rate": 5.677734205239904e-07, + "loss": 0.2351, + "step": 9030 + }, + { + "epoch": 0.9032, + "grad_norm": 3.352851629257202, + "learning_rate": 5.654567971292757e-07, + "loss": 0.2075, + "step": 9032 + }, + { + "epoch": 0.9034, + "grad_norm": 6.667214870452881, + "learning_rate": 5.631447720163074e-07, + "loss": 0.3069, + "step": 9034 + }, + { + "epoch": 0.9036, + "grad_norm": 2.4762961864471436, + "learning_rate": 5.608373463119354e-07, + "loss": 0.2814, + "step": 9036 + }, + { + "epoch": 0.9038, + "grad_norm": 10.705140113830566, + "learning_rate": 5.585345211407734e-07, + "loss": 0.6676, + "step": 9038 + }, + { + "epoch": 0.904, + "grad_norm": 1.668321132659912, + "learning_rate": 5.562362976251901e-07, + "loss": 0.824, + "step": 9040 + }, + { + "epoch": 0.9042, + "grad_norm": 1.437806248664856, + "learning_rate": 5.539426768853107e-07, + "loss": 0.2036, + "step": 9042 + }, + { + "epoch": 0.9044, + "grad_norm": 11.145874977111816, + "learning_rate": 5.516536600390188e-07, + "loss": 0.2152, + "step": 9044 + }, + { + "epoch": 0.9046, + "grad_norm": 3.641624689102173, + "learning_rate": 5.49369248201953e-07, + "loss": 0.3478, + "step": 9046 + }, + { + "epoch": 0.9048, + "grad_norm": 6.1957173347473145, + "learning_rate": 5.470894424875062e-07, + "loss": 0.261, + "step": 9048 + }, + { + "epoch": 0.905, + "grad_norm": 9.722752571105957, + "learning_rate": 5.448142440068316e-07, + "loss": 0.2333, + "step": 9050 + }, + { + "epoch": 0.9052, + "grad_norm": 0.5121570229530334, + "learning_rate": 5.425436538688322e-07, + "loss": 0.0575, + "step": 9052 + }, + { + "epoch": 0.9054, + "grad_norm": 14.574369430541992, + "learning_rate": 5.402776731801662e-07, + "loss": 0.4301, + "step": 9054 + }, + { + "epoch": 0.9056, + "grad_norm": 5.041574954986572, + "learning_rate": 5.380163030452412e-07, + "loss": 0.1387, + "step": 9056 + }, + { + "epoch": 0.9058, + "grad_norm": 1.1122899055480957, + "learning_rate": 5.357595445662267e-07, + "loss": 0.0299, + "step": 9058 + }, + { + "epoch": 0.906, + "grad_norm": 0.5967110991477966, + "learning_rate": 5.335073988430373e-07, + "loss": 0.1013, + "step": 9060 + }, + { + "epoch": 0.9062, + "grad_norm": 0.7109056115150452, + "learning_rate": 5.312598669733404e-07, + "loss": 0.0314, + "step": 9062 + }, + { + "epoch": 0.9064, + "grad_norm": 9.292826652526855, + "learning_rate": 5.290169500525577e-07, + "loss": 0.2885, + "step": 9064 + }, + { + "epoch": 0.9066, + "grad_norm": 2.955451250076294, + "learning_rate": 5.267786491738569e-07, + "loss": 0.2973, + "step": 9066 + }, + { + "epoch": 0.9068, + "grad_norm": 3.7839009761810303, + "learning_rate": 5.245449654281632e-07, + "loss": 0.1537, + "step": 9068 + }, + { + "epoch": 0.907, + "grad_norm": 0.7996234893798828, + "learning_rate": 5.223158999041444e-07, + "loss": 0.2861, + "step": 9070 + }, + { + "epoch": 0.9072, + "grad_norm": 0.6307675242424011, + "learning_rate": 5.200914536882184e-07, + "loss": 0.2357, + "step": 9072 + }, + { + "epoch": 0.9074, + "grad_norm": 1.5047610998153687, + "learning_rate": 5.178716278645534e-07, + "loss": 0.0817, + "step": 9074 + }, + { + "epoch": 0.9076, + "grad_norm": 5.476268768310547, + "learning_rate": 5.156564235150686e-07, + "loss": 0.0852, + "step": 9076 + }, + { + "epoch": 0.9078, + "grad_norm": 1.3548669815063477, + "learning_rate": 5.134458417194255e-07, + "loss": 0.1526, + "step": 9078 + }, + { + "epoch": 0.908, + "grad_norm": 3.146357774734497, + "learning_rate": 5.112398835550348e-07, + "loss": 0.575, + "step": 9080 + }, + { + "epoch": 0.9082, + "grad_norm": 5.740618705749512, + "learning_rate": 5.090385500970551e-07, + "loss": 0.2923, + "step": 9082 + }, + { + "epoch": 0.9084, + "grad_norm": 1.865789532661438, + "learning_rate": 5.068418424183874e-07, + "loss": 0.3268, + "step": 9084 + }, + { + "epoch": 0.9086, + "grad_norm": 1.4210994243621826, + "learning_rate": 5.046497615896806e-07, + "loss": 0.4263, + "step": 9086 + }, + { + "epoch": 0.9088, + "grad_norm": 0.9923996925354004, + "learning_rate": 5.024623086793323e-07, + "loss": 0.0368, + "step": 9088 + }, + { + "epoch": 0.909, + "grad_norm": 15.388869285583496, + "learning_rate": 5.002794847534765e-07, + "loss": 0.4873, + "step": 9090 + }, + { + "epoch": 0.9092, + "grad_norm": 2.243638038635254, + "learning_rate": 4.981012908759941e-07, + "loss": 0.1421, + "step": 9092 + }, + { + "epoch": 0.9094, + "grad_norm": 6.630256175994873, + "learning_rate": 4.959277281085128e-07, + "loss": 0.2241, + "step": 9094 + }, + { + "epoch": 0.9096, + "grad_norm": 15.035537719726562, + "learning_rate": 4.937587975103997e-07, + "loss": 0.2109, + "step": 9096 + }, + { + "epoch": 0.9098, + "grad_norm": 2.8710217475891113, + "learning_rate": 4.915945001387668e-07, + "loss": 0.1147, + "step": 9098 + }, + { + "epoch": 0.91, + "grad_norm": 3.780961275100708, + "learning_rate": 4.894348370484648e-07, + "loss": 0.1379, + "step": 9100 + }, + { + "epoch": 0.9102, + "grad_norm": 0.2578640878200531, + "learning_rate": 4.872798092920871e-07, + "loss": 0.1539, + "step": 9102 + }, + { + "epoch": 0.9104, + "grad_norm": 7.014750003814697, + "learning_rate": 4.851294179199673e-07, + "loss": 0.5589, + "step": 9104 + }, + { + "epoch": 0.9106, + "grad_norm": 1.5886447429656982, + "learning_rate": 4.829836639801844e-07, + "loss": 0.1947, + "step": 9106 + }, + { + "epoch": 0.9108, + "grad_norm": 0.3105440139770508, + "learning_rate": 4.808425485185486e-07, + "loss": 0.3382, + "step": 9108 + }, + { + "epoch": 0.911, + "grad_norm": 3.3445379734039307, + "learning_rate": 4.787060725786141e-07, + "loss": 0.8113, + "step": 9110 + }, + { + "epoch": 0.9112, + "grad_norm": 3.2984158992767334, + "learning_rate": 4.765742372016735e-07, + "loss": 0.341, + "step": 9112 + }, + { + "epoch": 0.9114, + "grad_norm": 0.3938648998737335, + "learning_rate": 4.7444704342675673e-07, + "loss": 0.1296, + "step": 9114 + }, + { + "epoch": 0.9116, + "grad_norm": 1.916056513786316, + "learning_rate": 4.723244922906356e-07, + "loss": 0.1869, + "step": 9116 + }, + { + "epoch": 0.9118, + "grad_norm": 5.108635902404785, + "learning_rate": 4.702065848278126e-07, + "loss": 0.2594, + "step": 9118 + }, + { + "epoch": 0.912, + "grad_norm": 0.20744815468788147, + "learning_rate": 4.6809332207053083e-07, + "loss": 0.115, + "step": 9120 + }, + { + "epoch": 0.9122, + "grad_norm": 1.5848251581192017, + "learning_rate": 4.659847050487687e-07, + "loss": 0.1016, + "step": 9122 + }, + { + "epoch": 0.9124, + "grad_norm": 4.9793195724487305, + "learning_rate": 4.638807347902408e-07, + "loss": 0.3609, + "step": 9124 + }, + { + "epoch": 0.9126, + "grad_norm": 2.232839345932007, + "learning_rate": 4.6178141232039676e-07, + "loss": 0.5852, + "step": 9126 + }, + { + "epoch": 0.9128, + "grad_norm": 2.484161376953125, + "learning_rate": 4.596867386624215e-07, + "loss": 0.081, + "step": 9128 + }, + { + "epoch": 0.913, + "grad_norm": 3.8492953777313232, + "learning_rate": 4.575967148372318e-07, + "loss": 0.3482, + "step": 9130 + }, + { + "epoch": 0.9132, + "grad_norm": 0.17895986139774323, + "learning_rate": 4.5551134186348045e-07, + "loss": 0.0142, + "step": 9132 + }, + { + "epoch": 0.9134, + "grad_norm": 0.8326396346092224, + "learning_rate": 4.534306207575545e-07, + "loss": 0.0251, + "step": 9134 + }, + { + "epoch": 0.9136, + "grad_norm": 0.6321288347244263, + "learning_rate": 4.5135455253357053e-07, + "loss": 0.1052, + "step": 9136 + }, + { + "epoch": 0.9138, + "grad_norm": 2.631474018096924, + "learning_rate": 4.492831382033791e-07, + "loss": 0.182, + "step": 9138 + }, + { + "epoch": 0.914, + "grad_norm": 4.831506252288818, + "learning_rate": 4.4721637877656377e-07, + "loss": 0.1378, + "step": 9140 + }, + { + "epoch": 0.9142, + "grad_norm": 2.639197587966919, + "learning_rate": 4.451542752604365e-07, + "loss": 0.1675, + "step": 9142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.32563915848731995, + "learning_rate": 4.4309682866004124e-07, + "loss": 0.08, + "step": 9144 + }, + { + "epoch": 0.9146, + "grad_norm": 0.16226278245449066, + "learning_rate": 4.4104403997815346e-07, + "loss": 0.4837, + "step": 9146 + }, + { + "epoch": 0.9148, + "grad_norm": 3.0618181228637695, + "learning_rate": 4.3899591021527743e-07, + "loss": 0.0988, + "step": 9148 + }, + { + "epoch": 0.915, + "grad_norm": 2.047734260559082, + "learning_rate": 4.3695244036964567e-07, + "loss": 0.1506, + "step": 9150 + }, + { + "epoch": 0.9152, + "grad_norm": 1.268881916999817, + "learning_rate": 4.349136314372204e-07, + "loss": 0.1065, + "step": 9152 + }, + { + "epoch": 0.9154, + "grad_norm": 0.46407896280288696, + "learning_rate": 4.3287948441169457e-07, + "loss": 0.046, + "step": 9154 + }, + { + "epoch": 0.9156, + "grad_norm": 11.145973205566406, + "learning_rate": 4.308500002844862e-07, + "loss": 0.6582, + "step": 9156 + }, + { + "epoch": 0.9158, + "grad_norm": 0.12012653797864914, + "learning_rate": 4.288251800447385e-07, + "loss": 0.3101, + "step": 9158 + }, + { + "epoch": 0.916, + "grad_norm": 5.793189525604248, + "learning_rate": 4.268050246793276e-07, + "loss": 0.6394, + "step": 9160 + }, + { + "epoch": 0.9162, + "grad_norm": 5.403175354003906, + "learning_rate": 4.247895351728504e-07, + "loss": 0.1406, + "step": 9162 + }, + { + "epoch": 0.9164, + "grad_norm": 0.7142015099525452, + "learning_rate": 4.2277871250763327e-07, + "loss": 0.0447, + "step": 9164 + }, + { + "epoch": 0.9166, + "grad_norm": 0.2476159632205963, + "learning_rate": 4.207725576637256e-07, + "loss": 0.0234, + "step": 9166 + }, + { + "epoch": 0.9168, + "grad_norm": 0.2634108364582062, + "learning_rate": 4.1877107161890416e-07, + "loss": 0.0205, + "step": 9168 + }, + { + "epoch": 0.917, + "grad_norm": 5.445647716522217, + "learning_rate": 4.167742553486676e-07, + "loss": 0.1158, + "step": 9170 + }, + { + "epoch": 0.9172, + "grad_norm": 7.49146842956543, + "learning_rate": 4.1478210982624055e-07, + "loss": 0.2188, + "step": 9172 + }, + { + "epoch": 0.9174, + "grad_norm": 7.379105567932129, + "learning_rate": 4.1279463602257207e-07, + "loss": 1.4145, + "step": 9174 + }, + { + "epoch": 0.9176, + "grad_norm": 1.9952174425125122, + "learning_rate": 4.108118349063306e-07, + "loss": 0.3313, + "step": 9176 + }, + { + "epoch": 0.9178, + "grad_norm": 3.296851634979248, + "learning_rate": 4.0883370744390973e-07, + "loss": 0.1949, + "step": 9178 + }, + { + "epoch": 0.918, + "grad_norm": 2.446139335632324, + "learning_rate": 4.068602545994249e-07, + "loss": 0.6512, + "step": 9180 + }, + { + "epoch": 0.9182, + "grad_norm": 0.7570074200630188, + "learning_rate": 4.0489147733471347e-07, + "loss": 0.1071, + "step": 9182 + }, + { + "epoch": 0.9184, + "grad_norm": 4.818393230438232, + "learning_rate": 4.0292737660933335e-07, + "loss": 0.0855, + "step": 9184 + }, + { + "epoch": 0.9186, + "grad_norm": 2.7449300289154053, + "learning_rate": 4.009679533805633e-07, + "loss": 0.103, + "step": 9186 + }, + { + "epoch": 0.9188, + "grad_norm": 1.9089161157608032, + "learning_rate": 3.990132086034026e-07, + "loss": 0.0909, + "step": 9188 + }, + { + "epoch": 0.919, + "grad_norm": 2.486797571182251, + "learning_rate": 3.9706314323056936e-07, + "loss": 0.2706, + "step": 9190 + }, + { + "epoch": 0.9192, + "grad_norm": 7.002142906188965, + "learning_rate": 3.9511775821250206e-07, + "loss": 0.3144, + "step": 9192 + }, + { + "epoch": 0.9194, + "grad_norm": 2.1433913707733154, + "learning_rate": 3.931770544973601e-07, + "loss": 0.2152, + "step": 9194 + }, + { + "epoch": 0.9196, + "grad_norm": 2.2112672328948975, + "learning_rate": 3.912410330310157e-07, + "loss": 0.0754, + "step": 9196 + }, + { + "epoch": 0.9198, + "grad_norm": 0.29214268922805786, + "learning_rate": 3.8930969475706183e-07, + "loss": 0.0283, + "step": 9198 + }, + { + "epoch": 0.92, + "grad_norm": 0.9526484608650208, + "learning_rate": 3.8738304061681107e-07, + "loss": 0.4767, + "step": 9200 + }, + { + "epoch": 0.9202, + "grad_norm": 1.2230300903320312, + "learning_rate": 3.854610715492924e-07, + "loss": 0.247, + "step": 9202 + }, + { + "epoch": 0.9204, + "grad_norm": 1.3556287288665771, + "learning_rate": 3.835437884912474e-07, + "loss": 0.1231, + "step": 9204 + }, + { + "epoch": 0.9206, + "grad_norm": 1.384960651397705, + "learning_rate": 3.8163119237713877e-07, + "loss": 0.3736, + "step": 9206 + }, + { + "epoch": 0.9208, + "grad_norm": 0.28678151965141296, + "learning_rate": 3.7972328413914074e-07, + "loss": 0.0591, + "step": 9208 + }, + { + "epoch": 0.921, + "grad_norm": 8.080974578857422, + "learning_rate": 3.7782006470714614e-07, + "loss": 0.4534, + "step": 9210 + }, + { + "epoch": 0.9212, + "grad_norm": 0.3178415596485138, + "learning_rate": 3.759215350087619e-07, + "loss": 0.0454, + "step": 9212 + }, + { + "epoch": 0.9214, + "grad_norm": 0.1744065284729004, + "learning_rate": 3.7402769596930567e-07, + "loss": 0.0519, + "step": 9214 + }, + { + "epoch": 0.9216, + "grad_norm": 0.691257119178772, + "learning_rate": 3.721385485118123e-07, + "loss": 0.0704, + "step": 9216 + }, + { + "epoch": 0.9218, + "grad_norm": 1.417859673500061, + "learning_rate": 3.7025409355702977e-07, + "loss": 0.1942, + "step": 9218 + }, + { + "epoch": 0.922, + "grad_norm": 4.610188007354736, + "learning_rate": 3.68374332023419e-07, + "loss": 0.1276, + "step": 9220 + }, + { + "epoch": 0.9222, + "grad_norm": 3.9682674407958984, + "learning_rate": 3.664992648271526e-07, + "loss": 0.0983, + "step": 9222 + }, + { + "epoch": 0.9224, + "grad_norm": 2.1390271186828613, + "learning_rate": 3.646288928821151e-07, + "loss": 0.5884, + "step": 9224 + }, + { + "epoch": 0.9226, + "grad_norm": 0.6454818844795227, + "learning_rate": 3.627632170999029e-07, + "loss": 0.1146, + "step": 9226 + }, + { + "epoch": 0.9228, + "grad_norm": 0.5614191293716431, + "learning_rate": 3.609022383898242e-07, + "loss": 0.1891, + "step": 9228 + }, + { + "epoch": 0.923, + "grad_norm": 1.5165932178497314, + "learning_rate": 3.590459576589e-07, + "loss": 0.1064, + "step": 9230 + }, + { + "epoch": 0.9232, + "grad_norm": 1.2543209791183472, + "learning_rate": 3.571943758118546e-07, + "loss": 0.0495, + "step": 9232 + }, + { + "epoch": 0.9234, + "grad_norm": 0.3526496887207031, + "learning_rate": 3.553474937511281e-07, + "loss": 0.0261, + "step": 9234 + }, + { + "epoch": 0.9236, + "grad_norm": 0.7017860412597656, + "learning_rate": 3.5350531237686723e-07, + "loss": 0.1667, + "step": 9236 + }, + { + "epoch": 0.9238, + "grad_norm": 9.07026195526123, + "learning_rate": 3.516678325869316e-07, + "loss": 0.3207, + "step": 9238 + }, + { + "epoch": 0.924, + "grad_norm": 1.7451468706130981, + "learning_rate": 3.498350552768859e-07, + "loss": 0.1638, + "step": 9240 + }, + { + "epoch": 0.9242, + "grad_norm": 3.9782984256744385, + "learning_rate": 3.480069813400022e-07, + "loss": 0.2281, + "step": 9242 + }, + { + "epoch": 0.9244, + "grad_norm": 13.583698272705078, + "learning_rate": 3.4618361166726123e-07, + "loss": 0.3689, + "step": 9244 + }, + { + "epoch": 0.9246, + "grad_norm": 10.711908340454102, + "learning_rate": 3.4436494714735313e-07, + "loss": 0.1943, + "step": 9246 + }, + { + "epoch": 0.9248, + "grad_norm": 5.447337627410889, + "learning_rate": 3.4255098866667114e-07, + "loss": 0.2006, + "step": 9248 + }, + { + "epoch": 0.925, + "grad_norm": 1.3043588399887085, + "learning_rate": 3.4074173710931804e-07, + "loss": 0.1601, + "step": 9250 + }, + { + "epoch": 0.9252, + "grad_norm": 4.052359580993652, + "learning_rate": 3.3893719335709953e-07, + "loss": 0.3128, + "step": 9252 + }, + { + "epoch": 0.9254, + "grad_norm": 4.520719051361084, + "learning_rate": 3.3713735828952985e-07, + "loss": 0.1904, + "step": 9254 + }, + { + "epoch": 0.9256, + "grad_norm": 7.345173358917236, + "learning_rate": 3.3534223278382405e-07, + "loss": 0.4249, + "step": 9256 + }, + { + "epoch": 0.9258, + "grad_norm": 13.759790420532227, + "learning_rate": 3.3355181771490776e-07, + "loss": 0.5365, + "step": 9258 + }, + { + "epoch": 0.926, + "grad_norm": 2.6156070232391357, + "learning_rate": 3.3176611395540625e-07, + "loss": 0.4022, + "step": 9260 + }, + { + "epoch": 0.9262, + "grad_norm": 0.465053528547287, + "learning_rate": 3.2998512237565005e-07, + "loss": 0.4431, + "step": 9262 + }, + { + "epoch": 0.9264, + "grad_norm": 4.7531633377075195, + "learning_rate": 3.282088438436715e-07, + "loss": 0.4305, + "step": 9264 + }, + { + "epoch": 0.9266, + "grad_norm": 0.6710368990898132, + "learning_rate": 3.2643727922520905e-07, + "loss": 0.0422, + "step": 9266 + }, + { + "epoch": 0.9268, + "grad_norm": 0.6355507969856262, + "learning_rate": 3.246704293837011e-07, + "loss": 0.0247, + "step": 9268 + }, + { + "epoch": 0.927, + "grad_norm": 8.88353157043457, + "learning_rate": 3.2290829518028867e-07, + "loss": 0.2426, + "step": 9270 + }, + { + "epoch": 0.9272, + "grad_norm": 4.166189670562744, + "learning_rate": 3.211508774738137e-07, + "loss": 0.8182, + "step": 9272 + }, + { + "epoch": 0.9274, + "grad_norm": 0.3736756443977356, + "learning_rate": 3.19398177120821e-07, + "loss": 0.1581, + "step": 9274 + }, + { + "epoch": 0.9276, + "grad_norm": 4.107546806335449, + "learning_rate": 3.1765019497555617e-07, + "loss": 0.2722, + "step": 9276 + }, + { + "epoch": 0.9278, + "grad_norm": 0.40480706095695496, + "learning_rate": 3.1590693188996324e-07, + "loss": 0.1067, + "step": 9278 + }, + { + "epoch": 0.928, + "grad_norm": 0.11424301564693451, + "learning_rate": 3.1416838871368925e-07, + "loss": 0.1117, + "step": 9280 + }, + { + "epoch": 0.9282, + "grad_norm": 12.249424934387207, + "learning_rate": 3.1243456629407644e-07, + "loss": 0.327, + "step": 9282 + }, + { + "epoch": 0.9284, + "grad_norm": 0.6208549737930298, + "learning_rate": 3.10705465476171e-07, + "loss": 0.0741, + "step": 9284 + }, + { + "epoch": 0.9286, + "grad_norm": 1.4867558479309082, + "learning_rate": 3.0898108710271437e-07, + "loss": 0.1357, + "step": 9286 + }, + { + "epoch": 0.9288, + "grad_norm": 1.7721288204193115, + "learning_rate": 3.072614320141487e-07, + "loss": 0.513, + "step": 9288 + }, + { + "epoch": 0.929, + "grad_norm": 0.18412937223911285, + "learning_rate": 3.0554650104861137e-07, + "loss": 0.1263, + "step": 9290 + }, + { + "epoch": 0.9292, + "grad_norm": 3.8390698432922363, + "learning_rate": 3.0383629504194047e-07, + "loss": 0.3437, + "step": 9292 + }, + { + "epoch": 0.9294, + "grad_norm": 3.4544551372528076, + "learning_rate": 3.0213081482766803e-07, + "loss": 0.1249, + "step": 9294 + }, + { + "epoch": 0.9296, + "grad_norm": 2.021573781967163, + "learning_rate": 3.00430061237027e-07, + "loss": 0.1049, + "step": 9296 + }, + { + "epoch": 0.9298, + "grad_norm": 1.6916600465774536, + "learning_rate": 2.987340350989421e-07, + "loss": 0.2387, + "step": 9298 + }, + { + "epoch": 0.93, + "grad_norm": 1.6648565530776978, + "learning_rate": 2.970427372400353e-07, + "loss": 0.1667, + "step": 9300 + }, + { + "epoch": 0.9302, + "grad_norm": 1.1259887218475342, + "learning_rate": 2.9535616848462624e-07, + "loss": 0.0498, + "step": 9302 + }, + { + "epoch": 0.9304, + "grad_norm": 0.27142009139060974, + "learning_rate": 2.936743296547273e-07, + "loss": 0.3613, + "step": 9304 + }, + { + "epoch": 0.9306, + "grad_norm": 0.3698432147502899, + "learning_rate": 2.919972215700462e-07, + "loss": 0.1159, + "step": 9306 + }, + { + "epoch": 0.9308, + "grad_norm": 1.1540350914001465, + "learning_rate": 2.9032484504798454e-07, + "loss": 0.0517, + "step": 9308 + }, + { + "epoch": 0.931, + "grad_norm": 0.17629878222942352, + "learning_rate": 2.8865720090364037e-07, + "loss": 0.0134, + "step": 9310 + }, + { + "epoch": 0.9312, + "grad_norm": 1.5601814985275269, + "learning_rate": 2.8699428994980017e-07, + "loss": 0.1761, + "step": 9312 + }, + { + "epoch": 0.9314, + "grad_norm": 1.1591155529022217, + "learning_rate": 2.8533611299694784e-07, + "loss": 0.0425, + "step": 9314 + }, + { + "epoch": 0.9316, + "grad_norm": 0.7394058704376221, + "learning_rate": 2.836826708532603e-07, + "loss": 0.4539, + "step": 9316 + }, + { + "epoch": 0.9318, + "grad_norm": 1.3765168190002441, + "learning_rate": 2.8203396432460507e-07, + "loss": 0.2529, + "step": 9318 + }, + { + "epoch": 0.932, + "grad_norm": 1.0161367654800415, + "learning_rate": 2.8038999421453827e-07, + "loss": 0.0405, + "step": 9320 + }, + { + "epoch": 0.9322, + "grad_norm": 0.35694417357444763, + "learning_rate": 2.7875076132431344e-07, + "loss": 0.0627, + "step": 9322 + }, + { + "epoch": 0.9324, + "grad_norm": 0.09108936786651611, + "learning_rate": 2.771162664528726e-07, + "loss": 0.355, + "step": 9324 + }, + { + "epoch": 0.9326, + "grad_norm": 3.0780293941497803, + "learning_rate": 2.7548651039684847e-07, + "loss": 0.1859, + "step": 9326 + }, + { + "epoch": 0.9328, + "grad_norm": 1.1846190690994263, + "learning_rate": 2.7386149395056463e-07, + "loss": 0.0929, + "step": 9328 + }, + { + "epoch": 0.933, + "grad_norm": 1.7170050144195557, + "learning_rate": 2.7224121790603517e-07, + "loss": 0.0958, + "step": 9330 + }, + { + "epoch": 0.9332, + "grad_norm": 2.312399387359619, + "learning_rate": 2.7062568305295967e-07, + "loss": 0.0761, + "step": 9332 + }, + { + "epoch": 0.9334, + "grad_norm": 5.200381278991699, + "learning_rate": 2.6901489017873375e-07, + "loss": 0.1406, + "step": 9334 + }, + { + "epoch": 0.9336, + "grad_norm": 7.906890869140625, + "learning_rate": 2.6740884006843826e-07, + "loss": 0.2035, + "step": 9336 + }, + { + "epoch": 0.9338, + "grad_norm": 6.422906398773193, + "learning_rate": 2.6580753350484044e-07, + "loss": 0.4393, + "step": 9338 + }, + { + "epoch": 0.934, + "grad_norm": 0.9677523970603943, + "learning_rate": 2.6421097126839714e-07, + "loss": 0.0257, + "step": 9340 + }, + { + "epoch": 0.9342, + "grad_norm": 2.017104148864746, + "learning_rate": 2.626191541372558e-07, + "loss": 0.1713, + "step": 9342 + }, + { + "epoch": 0.9344, + "grad_norm": 0.31043335795402527, + "learning_rate": 2.6103208288724815e-07, + "loss": 0.1182, + "step": 9344 + }, + { + "epoch": 0.9346, + "grad_norm": 2.1246702671051025, + "learning_rate": 2.59449758291892e-07, + "loss": 0.0771, + "step": 9346 + }, + { + "epoch": 0.9348, + "grad_norm": 6.336101531982422, + "learning_rate": 2.57872181122395e-07, + "loss": 0.2289, + "step": 9348 + }, + { + "epoch": 0.935, + "grad_norm": 6.408915996551514, + "learning_rate": 2.5629935214764866e-07, + "loss": 0.1875, + "step": 9350 + }, + { + "epoch": 0.9352, + "grad_norm": 3.4270594120025635, + "learning_rate": 2.547312721342277e-07, + "loss": 0.2943, + "step": 9352 + }, + { + "epoch": 0.9354, + "grad_norm": 14.34856128692627, + "learning_rate": 2.5316794184640056e-07, + "loss": 0.5495, + "step": 9354 + }, + { + "epoch": 0.9356, + "grad_norm": 1.6976382732391357, + "learning_rate": 2.516093620461124e-07, + "loss": 0.0924, + "step": 9356 + }, + { + "epoch": 0.9358, + "grad_norm": 4.692202568054199, + "learning_rate": 2.500555334929955e-07, + "loss": 0.2369, + "step": 9358 + }, + { + "epoch": 0.936, + "grad_norm": 7.7424163818359375, + "learning_rate": 2.4850645694436736e-07, + "loss": 0.4472, + "step": 9360 + }, + { + "epoch": 0.9362, + "grad_norm": 2.114579439163208, + "learning_rate": 2.4696213315523074e-07, + "loss": 0.0455, + "step": 9362 + }, + { + "epoch": 0.9364, + "grad_norm": 6.997885704040527, + "learning_rate": 2.4542256287826915e-07, + "loss": 0.4503, + "step": 9364 + }, + { + "epoch": 0.9366, + "grad_norm": 1.9285922050476074, + "learning_rate": 2.4388774686385007e-07, + "loss": 0.0991, + "step": 9366 + }, + { + "epoch": 0.9368, + "grad_norm": 0.7891079783439636, + "learning_rate": 2.423576858600252e-07, + "loss": 0.1727, + "step": 9368 + }, + { + "epoch": 0.937, + "grad_norm": 1.4144967794418335, + "learning_rate": 2.4083238061252565e-07, + "loss": 0.081, + "step": 9370 + }, + { + "epoch": 0.9372, + "grad_norm": 3.649247169494629, + "learning_rate": 2.3931183186477026e-07, + "loss": 0.178, + "step": 9372 + }, + { + "epoch": 0.9374, + "grad_norm": 0.721006453037262, + "learning_rate": 2.3779604035785277e-07, + "loss": 0.032, + "step": 9374 + }, + { + "epoch": 0.9376, + "grad_norm": 12.150331497192383, + "learning_rate": 2.3628500683055222e-07, + "loss": 0.5952, + "step": 9376 + }, + { + "epoch": 0.9378, + "grad_norm": 2.2269814014434814, + "learning_rate": 2.3477873201932733e-07, + "loss": 0.0655, + "step": 9378 + }, + { + "epoch": 0.938, + "grad_norm": 1.6421831846237183, + "learning_rate": 2.332772166583208e-07, + "loss": 0.209, + "step": 9380 + }, + { + "epoch": 0.9382, + "grad_norm": 1.6179054975509644, + "learning_rate": 2.3178046147935173e-07, + "loss": 0.2942, + "step": 9382 + }, + { + "epoch": 0.9384, + "grad_norm": 5.707662582397461, + "learning_rate": 2.3028846721191878e-07, + "loss": 0.1943, + "step": 9384 + }, + { + "epoch": 0.9386, + "grad_norm": 2.314422607421875, + "learning_rate": 2.288012345832047e-07, + "loss": 0.1106, + "step": 9386 + }, + { + "epoch": 0.9388, + "grad_norm": 0.728227436542511, + "learning_rate": 2.273187643180652e-07, + "loss": 0.1072, + "step": 9388 + }, + { + "epoch": 0.939, + "grad_norm": 3.335394859313965, + "learning_rate": 2.2584105713904126e-07, + "loss": 0.1746, + "step": 9390 + }, + { + "epoch": 0.9392, + "grad_norm": 1.0184483528137207, + "learning_rate": 2.2436811376634893e-07, + "loss": 0.269, + "step": 9392 + }, + { + "epoch": 0.9394, + "grad_norm": 1.709597110748291, + "learning_rate": 2.2289993491788065e-07, + "loss": 0.0742, + "step": 9394 + }, + { + "epoch": 0.9396, + "grad_norm": 5.197251319885254, + "learning_rate": 2.214365213092118e-07, + "loss": 0.2691, + "step": 9396 + }, + { + "epoch": 0.9398, + "grad_norm": 2.905419111251831, + "learning_rate": 2.1997787365358958e-07, + "loss": 0.3948, + "step": 9398 + }, + { + "epoch": 0.94, + "grad_norm": 1.5577483177185059, + "learning_rate": 2.1852399266194312e-07, + "loss": 0.5724, + "step": 9400 + }, + { + "epoch": 0.9402, + "grad_norm": 8.983512878417969, + "learning_rate": 2.1707487904287672e-07, + "loss": 0.1844, + "step": 9402 + }, + { + "epoch": 0.9404, + "grad_norm": 0.6903245449066162, + "learning_rate": 2.1563053350266983e-07, + "loss": 0.0375, + "step": 9404 + }, + { + "epoch": 0.9406, + "grad_norm": 0.7482632398605347, + "learning_rate": 2.1419095674527934e-07, + "loss": 0.0407, + "step": 9406 + }, + { + "epoch": 0.9408, + "grad_norm": 0.43498390913009644, + "learning_rate": 2.1275614947233624e-07, + "loss": 0.0422, + "step": 9408 + }, + { + "epoch": 0.941, + "grad_norm": 1.5286457538604736, + "learning_rate": 2.1132611238315004e-07, + "loss": 0.1231, + "step": 9410 + }, + { + "epoch": 0.9412, + "grad_norm": 1.092687964439392, + "learning_rate": 2.0990084617470207e-07, + "loss": 0.2721, + "step": 9412 + }, + { + "epoch": 0.9414, + "grad_norm": 0.6243681311607361, + "learning_rate": 2.0848035154165113e-07, + "loss": 0.0942, + "step": 9414 + }, + { + "epoch": 0.9416, + "grad_norm": 11.332029342651367, + "learning_rate": 2.0706462917632676e-07, + "loss": 0.5404, + "step": 9416 + }, + { + "epoch": 0.9418, + "grad_norm": 3.2058374881744385, + "learning_rate": 2.0565367976873584e-07, + "loss": 0.6119, + "step": 9418 + }, + { + "epoch": 0.942, + "grad_norm": 1.2014843225479126, + "learning_rate": 2.0424750400655947e-07, + "loss": 0.2055, + "step": 9420 + }, + { + "epoch": 0.9422, + "grad_norm": 1.5237919092178345, + "learning_rate": 2.0284610257514936e-07, + "loss": 0.3038, + "step": 9422 + }, + { + "epoch": 0.9424, + "grad_norm": 2.9245107173919678, + "learning_rate": 2.014494761575314e-07, + "loss": 0.064, + "step": 9424 + }, + { + "epoch": 0.9426, + "grad_norm": 3.997873306274414, + "learning_rate": 2.0005762543440444e-07, + "loss": 0.196, + "step": 9426 + }, + { + "epoch": 0.9428, + "grad_norm": 2.3620665073394775, + "learning_rate": 1.9867055108414023e-07, + "loss": 0.1192, + "step": 9428 + }, + { + "epoch": 0.943, + "grad_norm": 0.36273330450057983, + "learning_rate": 1.9728825378278248e-07, + "loss": 0.5184, + "step": 9430 + }, + { + "epoch": 0.9432, + "grad_norm": 5.0667724609375, + "learning_rate": 1.9591073420404338e-07, + "loss": 0.3436, + "step": 9432 + }, + { + "epoch": 0.9434, + "grad_norm": 4.220357418060303, + "learning_rate": 1.9453799301931253e-07, + "loss": 0.3509, + "step": 9434 + }, + { + "epoch": 0.9436, + "grad_norm": 2.589963674545288, + "learning_rate": 1.9317003089764365e-07, + "loss": 0.4543, + "step": 9436 + }, + { + "epoch": 0.9438, + "grad_norm": 1.2228903770446777, + "learning_rate": 1.9180684850576893e-07, + "loss": 0.0519, + "step": 9438 + }, + { + "epoch": 0.944, + "grad_norm": 3.4193882942199707, + "learning_rate": 1.9044844650808468e-07, + "loss": 0.3143, + "step": 9440 + }, + { + "epoch": 0.9442, + "grad_norm": 10.965206146240234, + "learning_rate": 1.8909482556666026e-07, + "loss": 0.3791, + "step": 9442 + }, + { + "epoch": 0.9444, + "grad_norm": 0.1702074110507965, + "learning_rate": 1.877459863412323e-07, + "loss": 0.0699, + "step": 9444 + }, + { + "epoch": 0.9446, + "grad_norm": 0.7900124788284302, + "learning_rate": 1.8640192948921053e-07, + "loss": 0.0831, + "step": 9446 + }, + { + "epoch": 0.9448, + "grad_norm": 0.2814807593822479, + "learning_rate": 1.8506265566567095e-07, + "loss": 0.0813, + "step": 9448 + }, + { + "epoch": 0.945, + "grad_norm": 0.4825044274330139, + "learning_rate": 1.8372816552336025e-07, + "loss": 0.1724, + "step": 9450 + }, + { + "epoch": 0.9452, + "grad_norm": 1.1220015287399292, + "learning_rate": 1.8239845971269266e-07, + "loss": 0.1845, + "step": 9452 + }, + { + "epoch": 0.9454, + "grad_norm": 0.938378632068634, + "learning_rate": 1.8107353888175083e-07, + "loss": 0.0557, + "step": 9454 + }, + { + "epoch": 0.9456, + "grad_norm": 7.510448455810547, + "learning_rate": 1.7975340367628269e-07, + "loss": 0.3087, + "step": 9456 + }, + { + "epoch": 0.9458, + "grad_norm": 2.3348512649536133, + "learning_rate": 1.7843805473970798e-07, + "loss": 0.1024, + "step": 9458 + }, + { + "epoch": 0.946, + "grad_norm": 1.1380234956741333, + "learning_rate": 1.7712749271311392e-07, + "loss": 0.4039, + "step": 9460 + }, + { + "epoch": 0.9462, + "grad_norm": 12.127257347106934, + "learning_rate": 1.758217182352495e-07, + "loss": 0.5023, + "step": 9462 + }, + { + "epoch": 0.9464, + "grad_norm": 2.942239761352539, + "learning_rate": 1.7452073194253237e-07, + "loss": 0.4316, + "step": 9464 + }, + { + "epoch": 0.9466, + "grad_norm": 3.3086655139923096, + "learning_rate": 1.7322453446905084e-07, + "loss": 0.3217, + "step": 9466 + }, + { + "epoch": 0.9468, + "grad_norm": 4.713417053222656, + "learning_rate": 1.719331264465529e-07, + "loss": 0.3297, + "step": 9468 + }, + { + "epoch": 0.947, + "grad_norm": 6.800234794616699, + "learning_rate": 1.706465085044584e-07, + "loss": 0.7568, + "step": 9470 + }, + { + "epoch": 0.9472, + "grad_norm": 1.5973906517028809, + "learning_rate": 1.6936468126984573e-07, + "loss": 0.2369, + "step": 9472 + }, + { + "epoch": 0.9474, + "grad_norm": 3.635608196258545, + "learning_rate": 1.680876453674629e-07, + "loss": 0.4136, + "step": 9474 + }, + { + "epoch": 0.9476, + "grad_norm": 3.39072585105896, + "learning_rate": 1.668154014197243e-07, + "loss": 0.1495, + "step": 9476 + }, + { + "epoch": 0.9478, + "grad_norm": 0.17828910052776337, + "learning_rate": 1.6554795004670389e-07, + "loss": 0.0169, + "step": 9478 + }, + { + "epoch": 0.948, + "grad_norm": 0.3283783197402954, + "learning_rate": 1.6428529186614195e-07, + "loss": 0.0142, + "step": 9480 + }, + { + "epoch": 0.9482, + "grad_norm": 1.1432490348815918, + "learning_rate": 1.6302742749344292e-07, + "loss": 0.0661, + "step": 9482 + }, + { + "epoch": 0.9484, + "grad_norm": 11.012632369995117, + "learning_rate": 1.6177435754167413e-07, + "loss": 0.3046, + "step": 9484 + }, + { + "epoch": 0.9486, + "grad_norm": 3.006629228591919, + "learning_rate": 1.605260826215682e-07, + "loss": 0.4635, + "step": 9486 + }, + { + "epoch": 0.9488, + "grad_norm": 4.772534370422363, + "learning_rate": 1.5928260334151847e-07, + "loss": 0.4031, + "step": 9488 + }, + { + "epoch": 0.949, + "grad_norm": 4.740313529968262, + "learning_rate": 1.580439203075812e-07, + "loss": 0.348, + "step": 9490 + }, + { + "epoch": 0.9492, + "grad_norm": 4.013779640197754, + "learning_rate": 1.5681003412347573e-07, + "loss": 0.2184, + "step": 9492 + }, + { + "epoch": 0.9494, + "grad_norm": 0.9187645316123962, + "learning_rate": 1.555809453905821e-07, + "loss": 0.085, + "step": 9494 + }, + { + "epoch": 0.9496, + "grad_norm": 6.6079888343811035, + "learning_rate": 1.543566547079467e-07, + "loss": 0.6487, + "step": 9496 + }, + { + "epoch": 0.9498, + "grad_norm": 13.015205383300781, + "learning_rate": 1.5313716267226997e-07, + "loss": 0.1811, + "step": 9498 + }, + { + "epoch": 0.95, + "grad_norm": 13.922344207763672, + "learning_rate": 1.519224698779198e-07, + "loss": 0.7319, + "step": 9500 + }, + { + "epoch": 0.9502, + "grad_norm": 2.130119800567627, + "learning_rate": 1.5071257691692153e-07, + "loss": 0.0894, + "step": 9502 + }, + { + "epoch": 0.9504, + "grad_norm": 2.932265043258667, + "learning_rate": 1.4950748437896235e-07, + "loss": 0.1572, + "step": 9504 + }, + { + "epoch": 0.9506, + "grad_norm": 2.5185437202453613, + "learning_rate": 1.483071928513913e-07, + "loss": 0.1029, + "step": 9506 + }, + { + "epoch": 0.9508, + "grad_norm": 0.13796466588974, + "learning_rate": 1.4711170291921485e-07, + "loss": 0.101, + "step": 9508 + }, + { + "epoch": 0.951, + "grad_norm": 0.9758581519126892, + "learning_rate": 1.4592101516509916e-07, + "loss": 0.4009, + "step": 9510 + }, + { + "epoch": 0.9512, + "grad_norm": 1.509272813796997, + "learning_rate": 1.4473513016937223e-07, + "loss": 0.1397, + "step": 9512 + }, + { + "epoch": 0.9514, + "grad_norm": 2.80757474899292, + "learning_rate": 1.4355404851001953e-07, + "loss": 0.0688, + "step": 9514 + }, + { + "epoch": 0.9516, + "grad_norm": 1.9046063423156738, + "learning_rate": 1.4237777076268723e-07, + "loss": 0.3478, + "step": 9516 + }, + { + "epoch": 0.9518, + "grad_norm": 6.674467086791992, + "learning_rate": 1.4120629750067672e-07, + "loss": 0.5129, + "step": 9518 + }, + { + "epoch": 0.952, + "grad_norm": 0.06908193230628967, + "learning_rate": 1.400396292949513e-07, + "loss": 0.0471, + "step": 9520 + }, + { + "epoch": 0.9522, + "grad_norm": 4.298490047454834, + "learning_rate": 1.3887776671412943e-07, + "loss": 0.7775, + "step": 9522 + }, + { + "epoch": 0.9524, + "grad_norm": 7.143821716308594, + "learning_rate": 1.377207103244904e-07, + "loss": 0.9376, + "step": 9524 + }, + { + "epoch": 0.9526, + "grad_norm": 0.9515742659568787, + "learning_rate": 1.3656846068996976e-07, + "loss": 0.1145, + "step": 9526 + }, + { + "epoch": 0.9528, + "grad_norm": 1.649501919746399, + "learning_rate": 1.3542101837215826e-07, + "loss": 0.0771, + "step": 9528 + }, + { + "epoch": 0.953, + "grad_norm": 0.16056782007217407, + "learning_rate": 1.3427838393030634e-07, + "loss": 0.2682, + "step": 9530 + }, + { + "epoch": 0.9532, + "grad_norm": 3.7667806148529053, + "learning_rate": 1.3314055792131964e-07, + "loss": 0.7065, + "step": 9532 + }, + { + "epoch": 0.9534, + "grad_norm": 2.8477046489715576, + "learning_rate": 1.320075408997612e-07, + "loss": 0.0956, + "step": 9534 + }, + { + "epoch": 0.9536, + "grad_norm": 3.0819661617279053, + "learning_rate": 1.308793334178493e-07, + "loss": 0.1071, + "step": 9536 + }, + { + "epoch": 0.9538, + "grad_norm": 8.947070121765137, + "learning_rate": 1.2975593602545966e-07, + "loss": 0.3435, + "step": 9538 + }, + { + "epoch": 0.954, + "grad_norm": 7.083137512207031, + "learning_rate": 1.2863734927012094e-07, + "loss": 0.1876, + "step": 9540 + }, + { + "epoch": 0.9542, + "grad_norm": 0.32494667172431946, + "learning_rate": 1.275235736970193e-07, + "loss": 0.0181, + "step": 9542 + }, + { + "epoch": 0.9544, + "grad_norm": 0.9536388516426086, + "learning_rate": 1.26414609848996e-07, + "loss": 0.4255, + "step": 9544 + }, + { + "epoch": 0.9546, + "grad_norm": 1.7586272954940796, + "learning_rate": 1.2531045826654652e-07, + "loss": 0.7243, + "step": 9546 + }, + { + "epoch": 0.9548, + "grad_norm": 0.24037855863571167, + "learning_rate": 1.242111194878215e-07, + "loss": 0.0882, + "step": 9548 + }, + { + "epoch": 0.955, + "grad_norm": 2.1069369316101074, + "learning_rate": 1.231165940486234e-07, + "loss": 0.1506, + "step": 9550 + }, + { + "epoch": 0.9552, + "grad_norm": 8.59726619720459, + "learning_rate": 1.2202688248241113e-07, + "loss": 0.2942, + "step": 9552 + }, + { + "epoch": 0.9554, + "grad_norm": 3.4142167568206787, + "learning_rate": 1.2094198532029754e-07, + "loss": 0.1641, + "step": 9554 + }, + { + "epoch": 0.9556, + "grad_norm": 2.260650396347046, + "learning_rate": 1.1986190309104861e-07, + "loss": 0.0956, + "step": 9556 + }, + { + "epoch": 0.9558, + "grad_norm": 4.3381781578063965, + "learning_rate": 1.1878663632108322e-07, + "loss": 0.2458, + "step": 9558 + }, + { + "epoch": 0.956, + "grad_norm": 2.2918076515197754, + "learning_rate": 1.1771618553447217e-07, + "loss": 0.1084, + "step": 9560 + }, + { + "epoch": 0.9562, + "grad_norm": 3.524341106414795, + "learning_rate": 1.1665055125294033e-07, + "loss": 0.1892, + "step": 9562 + }, + { + "epoch": 0.9564, + "grad_norm": 10.671085357666016, + "learning_rate": 1.1558973399586671e-07, + "loss": 0.2007, + "step": 9564 + }, + { + "epoch": 0.9566, + "grad_norm": 0.1121668815612793, + "learning_rate": 1.1453373428027992e-07, + "loss": 0.3554, + "step": 9566 + }, + { + "epoch": 0.9568, + "grad_norm": 5.018359661102295, + "learning_rate": 1.134825526208605e-07, + "loss": 0.1903, + "step": 9568 + }, + { + "epoch": 0.957, + "grad_norm": 8.825321197509766, + "learning_rate": 1.1243618952994195e-07, + "loss": 0.4227, + "step": 9570 + }, + { + "epoch": 0.9572, + "grad_norm": 13.738285064697266, + "learning_rate": 1.1139464551750857e-07, + "loss": 0.4071, + "step": 9572 + }, + { + "epoch": 0.9574, + "grad_norm": 8.4378080368042, + "learning_rate": 1.1035792109119758e-07, + "loss": 0.7153, + "step": 9574 + }, + { + "epoch": 0.9576, + "grad_norm": 0.18691451847553253, + "learning_rate": 1.0932601675629595e-07, + "loss": 0.0161, + "step": 9576 + }, + { + "epoch": 0.9578, + "grad_norm": 3.1172142028808594, + "learning_rate": 1.0829893301573913e-07, + "loss": 0.1145, + "step": 9578 + }, + { + "epoch": 0.958, + "grad_norm": 2.9549384117126465, + "learning_rate": 1.0727667037011668e-07, + "loss": 0.0998, + "step": 9580 + }, + { + "epoch": 0.9582, + "grad_norm": 1.5426806211471558, + "learning_rate": 1.0625922931766786e-07, + "loss": 0.2008, + "step": 9582 + }, + { + "epoch": 0.9584, + "grad_norm": 0.07867155224084854, + "learning_rate": 1.052466103542793e-07, + "loss": 0.4032, + "step": 9584 + }, + { + "epoch": 0.9586, + "grad_norm": 7.504363059997559, + "learning_rate": 1.0423881397349067e-07, + "loss": 0.338, + "step": 9586 + }, + { + "epoch": 0.9588, + "grad_norm": 1.8432539701461792, + "learning_rate": 1.0323584066648795e-07, + "loss": 0.0581, + "step": 9588 + }, + { + "epoch": 0.959, + "grad_norm": 2.228245973587036, + "learning_rate": 1.0223769092211012e-07, + "loss": 0.0833, + "step": 9590 + }, + { + "epoch": 0.9592, + "grad_norm": 0.18137522041797638, + "learning_rate": 1.0124436522684244e-07, + "loss": 0.0418, + "step": 9592 + }, + { + "epoch": 0.9594, + "grad_norm": 2.1846201419830322, + "learning_rate": 1.002558640648199e-07, + "loss": 0.2866, + "step": 9594 + }, + { + "epoch": 0.9596, + "grad_norm": 0.4588221311569214, + "learning_rate": 9.9272187917826e-08, + "loss": 0.0184, + "step": 9596 + }, + { + "epoch": 0.9598, + "grad_norm": 1.0526036024093628, + "learning_rate": 9.829333726529056e-08, + "loss": 0.2255, + "step": 9598 + }, + { + "epoch": 0.96, + "grad_norm": 4.9529595375061035, + "learning_rate": 9.731931258429638e-08, + "loss": 0.1756, + "step": 9600 + }, + { + "epoch": 0.9602, + "grad_norm": 3.1921467781066895, + "learning_rate": 9.635011434957153e-08, + "loss": 0.1417, + "step": 9602 + }, + { + "epoch": 0.9604, + "grad_norm": 11.636567115783691, + "learning_rate": 9.538574303348813e-08, + "loss": 0.3339, + "step": 9604 + }, + { + "epoch": 0.9606, + "grad_norm": 0.3219669461250305, + "learning_rate": 9.442619910607131e-08, + "loss": 0.2386, + "step": 9606 + }, + { + "epoch": 0.9608, + "grad_norm": 0.3353015184402466, + "learning_rate": 9.347148303499143e-08, + "loss": 0.0265, + "step": 9608 + }, + { + "epoch": 0.961, + "grad_norm": 0.11074426770210266, + "learning_rate": 9.252159528556404e-08, + "loss": 0.0107, + "step": 9610 + }, + { + "epoch": 0.9612, + "grad_norm": 0.2862573564052582, + "learning_rate": 9.157653632075435e-08, + "loss": 0.0478, + "step": 9612 + }, + { + "epoch": 0.9614, + "grad_norm": 1.241227149963379, + "learning_rate": 9.063630660117172e-08, + "loss": 0.06, + "step": 9614 + }, + { + "epoch": 0.9616, + "grad_norm": 2.3700037002563477, + "learning_rate": 8.970090658507291e-08, + "loss": 0.4255, + "step": 9616 + }, + { + "epoch": 0.9618, + "grad_norm": 0.17784151434898376, + "learning_rate": 8.877033672835988e-08, + "loss": 0.0504, + "step": 9618 + }, + { + "epoch": 0.962, + "grad_norm": 4.671106815338135, + "learning_rate": 8.784459748458318e-08, + "loss": 0.3169, + "step": 9620 + }, + { + "epoch": 0.9622, + "grad_norm": 2.704710006713867, + "learning_rate": 8.692368930493522e-08, + "loss": 0.0925, + "step": 9622 + }, + { + "epoch": 0.9624, + "grad_norm": 6.560164928436279, + "learning_rate": 8.600761263825475e-08, + "loss": 0.2771, + "step": 9624 + }, + { + "epoch": 0.9626, + "grad_norm": 1.6860800981521606, + "learning_rate": 8.509636793102683e-08, + "loss": 0.0394, + "step": 9626 + }, + { + "epoch": 0.9628, + "grad_norm": 1.3864340782165527, + "learning_rate": 8.418995562738286e-08, + "loss": 0.0535, + "step": 9628 + }, + { + "epoch": 0.963, + "grad_norm": 9.820911407470703, + "learning_rate": 8.328837616909612e-08, + "loss": 0.4767, + "step": 9630 + }, + { + "epoch": 0.9632, + "grad_norm": 4.679340362548828, + "learning_rate": 8.239162999558403e-08, + "loss": 0.0703, + "step": 9632 + }, + { + "epoch": 0.9634, + "grad_norm": 0.45922529697418213, + "learning_rate": 8.149971754391251e-08, + "loss": 0.0235, + "step": 9634 + }, + { + "epoch": 0.9636, + "grad_norm": 0.8535955548286438, + "learning_rate": 8.061263924878604e-08, + "loss": 0.045, + "step": 9636 + }, + { + "epoch": 0.9638, + "grad_norm": 0.26557549834251404, + "learning_rate": 7.973039554255768e-08, + "loss": 0.3614, + "step": 9638 + }, + { + "epoch": 0.964, + "grad_norm": 2.1100032329559326, + "learning_rate": 7.885298685522235e-08, + "loss": 0.1253, + "step": 9640 + }, + { + "epoch": 0.9642, + "grad_norm": 6.088329792022705, + "learning_rate": 7.798041361441688e-08, + "loss": 0.1539, + "step": 9642 + }, + { + "epoch": 0.9644, + "grad_norm": 1.8984109163284302, + "learning_rate": 7.71126762454233e-08, + "loss": 0.1754, + "step": 9644 + }, + { + "epoch": 0.9646, + "grad_norm": 4.350412845611572, + "learning_rate": 7.624977517116772e-08, + "loss": 0.4887, + "step": 9646 + }, + { + "epoch": 0.9648, + "grad_norm": 2.890630006790161, + "learning_rate": 7.539171081221597e-08, + "loss": 0.127, + "step": 9648 + }, + { + "epoch": 0.965, + "grad_norm": 0.7202578186988831, + "learning_rate": 7.453848358678018e-08, + "loss": 0.2149, + "step": 9650 + }, + { + "epoch": 0.9652, + "grad_norm": 0.24183593690395355, + "learning_rate": 7.369009391070992e-08, + "loss": 0.0359, + "step": 9652 + }, + { + "epoch": 0.9654, + "grad_norm": 3.1709718704223633, + "learning_rate": 7.284654219750332e-08, + "loss": 0.129, + "step": 9654 + }, + { + "epoch": 0.9656, + "grad_norm": 0.2316875159740448, + "learning_rate": 7.200782885829482e-08, + "loss": 0.0852, + "step": 9656 + }, + { + "epoch": 0.9658, + "grad_norm": 7.452415466308594, + "learning_rate": 7.117395430186414e-08, + "loss": 0.6604, + "step": 9658 + }, + { + "epoch": 0.966, + "grad_norm": 0.2630626857280731, + "learning_rate": 7.034491893463059e-08, + "loss": 0.1078, + "step": 9660 + }, + { + "epoch": 0.9662, + "grad_norm": 2.013852834701538, + "learning_rate": 6.95207231606576e-08, + "loss": 0.3513, + "step": 9662 + }, + { + "epoch": 0.9664, + "grad_norm": 0.7996952533721924, + "learning_rate": 6.870136738164612e-08, + "loss": 0.0371, + "step": 9664 + }, + { + "epoch": 0.9666, + "grad_norm": 0.5594776272773743, + "learning_rate": 6.788685199694222e-08, + "loss": 0.0379, + "step": 9666 + }, + { + "epoch": 0.9668, + "grad_norm": 0.6196044087409973, + "learning_rate": 6.707717740353059e-08, + "loss": 0.0252, + "step": 9668 + }, + { + "epoch": 0.967, + "grad_norm": 0.9031921029090881, + "learning_rate": 6.627234399603554e-08, + "loss": 0.2291, + "step": 9670 + }, + { + "epoch": 0.9672, + "grad_norm": 0.9752335548400879, + "learning_rate": 6.547235216672443e-08, + "loss": 0.1411, + "step": 9672 + }, + { + "epoch": 0.9674, + "grad_norm": 7.242069244384766, + "learning_rate": 6.4677202305502e-08, + "loss": 0.6848, + "step": 9674 + }, + { + "epoch": 0.9676, + "grad_norm": 0.6806180477142334, + "learning_rate": 6.388689479991606e-08, + "loss": 0.1599, + "step": 9676 + }, + { + "epoch": 0.9678, + "grad_norm": 2.3557963371276855, + "learning_rate": 6.310143003515179e-08, + "loss": 0.0969, + "step": 9678 + }, + { + "epoch": 0.968, + "grad_norm": 0.5674247145652771, + "learning_rate": 6.232080839403631e-08, + "loss": 0.0332, + "step": 9680 + }, + { + "epoch": 0.9682, + "grad_norm": 0.48817774653434753, + "learning_rate": 6.154503025703418e-08, + "loss": 0.0392, + "step": 9682 + }, + { + "epoch": 0.9684, + "grad_norm": 9.133881568908691, + "learning_rate": 6.07740960022507e-08, + "loss": 0.363, + "step": 9684 + }, + { + "epoch": 0.9686, + "grad_norm": 1.9140713214874268, + "learning_rate": 6.000800600542977e-08, + "loss": 0.0603, + "step": 9686 + }, + { + "epoch": 0.9688, + "grad_norm": 0.18984802067279816, + "learning_rate": 5.9246760639953824e-08, + "loss": 0.0151, + "step": 9688 + }, + { + "epoch": 0.969, + "grad_norm": 15.66620922088623, + "learning_rate": 5.849036027684607e-08, + "loss": 0.67, + "step": 9690 + }, + { + "epoch": 0.9692, + "grad_norm": 6.994088649749756, + "learning_rate": 5.7738805284764945e-08, + "loss": 0.2015, + "step": 9692 + }, + { + "epoch": 0.9694, + "grad_norm": 6.2559814453125, + "learning_rate": 5.699209603001077e-08, + "loss": 0.2773, + "step": 9694 + }, + { + "epoch": 0.9696, + "grad_norm": 4.182510852813721, + "learning_rate": 5.625023287652021e-08, + "loss": 0.1821, + "step": 9696 + }, + { + "epoch": 0.9698, + "grad_norm": 2.9471592903137207, + "learning_rate": 5.5513216185867356e-08, + "loss": 0.6584, + "step": 9698 + }, + { + "epoch": 0.97, + "grad_norm": 3.13179349899292, + "learning_rate": 5.4781046317267103e-08, + "loss": 0.4222, + "step": 9700 + }, + { + "epoch": 0.9702, + "grad_norm": 0.88803631067276, + "learning_rate": 5.4053723627567336e-08, + "loss": 0.0626, + "step": 9702 + }, + { + "epoch": 0.9704, + "grad_norm": 8.642060279846191, + "learning_rate": 5.3331248471258926e-08, + "loss": 0.3816, + "step": 9704 + }, + { + "epoch": 0.9706, + "grad_norm": 8.145577430725098, + "learning_rate": 5.261362120046687e-08, + "loss": 0.4915, + "step": 9706 + }, + { + "epoch": 0.9708, + "grad_norm": 11.031943321228027, + "learning_rate": 5.190084216495361e-08, + "loss": 0.2499, + "step": 9708 + }, + { + "epoch": 0.971, + "grad_norm": 0.41914913058280945, + "learning_rate": 5.119291171211793e-08, + "loss": 0.0295, + "step": 9710 + }, + { + "epoch": 0.9712, + "grad_norm": 14.286233901977539, + "learning_rate": 5.048983018699827e-08, + "loss": 0.4567, + "step": 9712 + }, + { + "epoch": 0.9714, + "grad_norm": 5.607082843780518, + "learning_rate": 4.979159793226718e-08, + "loss": 0.3608, + "step": 9714 + }, + { + "epoch": 0.9716, + "grad_norm": 0.12273362278938293, + "learning_rate": 4.9098215288235776e-08, + "loss": 0.1234, + "step": 9716 + }, + { + "epoch": 0.9718, + "grad_norm": 2.1781163215637207, + "learning_rate": 4.840968259284817e-08, + "loss": 0.2815, + "step": 9718 + }, + { + "epoch": 0.972, + "grad_norm": 1.4138092994689941, + "learning_rate": 4.772600018168816e-08, + "loss": 0.0389, + "step": 9720 + }, + { + "epoch": 0.9722, + "grad_norm": 3.8508095741271973, + "learning_rate": 4.704716838797363e-08, + "loss": 0.3178, + "step": 9722 + }, + { + "epoch": 0.9724, + "grad_norm": 3.2986457347869873, + "learning_rate": 4.6373187542561036e-08, + "loss": 0.2007, + "step": 9724 + }, + { + "epoch": 0.9726, + "grad_norm": 0.3651931583881378, + "learning_rate": 4.570405797393762e-08, + "loss": 0.0333, + "step": 9726 + }, + { + "epoch": 0.9728, + "grad_norm": 2.7965714931488037, + "learning_rate": 4.503978000823028e-08, + "loss": 0.2152, + "step": 9728 + }, + { + "epoch": 0.973, + "grad_norm": 3.7738075256347656, + "learning_rate": 4.438035396920004e-08, + "loss": 0.2776, + "step": 9730 + }, + { + "epoch": 0.9732, + "grad_norm": 0.6697690486907959, + "learning_rate": 4.3725780178243135e-08, + "loss": 0.1858, + "step": 9732 + }, + { + "epoch": 0.9734, + "grad_norm": 3.466055393218994, + "learning_rate": 4.3076058954391045e-08, + "loss": 0.2152, + "step": 9734 + }, + { + "epoch": 0.9736, + "grad_norm": 2.604447364807129, + "learning_rate": 4.2431190614309334e-08, + "loss": 0.1509, + "step": 9736 + }, + { + "epoch": 0.9738, + "grad_norm": 4.867729663848877, + "learning_rate": 4.179117547229883e-08, + "loss": 0.1675, + "step": 9738 + }, + { + "epoch": 0.974, + "grad_norm": 1.4093992710113525, + "learning_rate": 4.115601384029666e-08, + "loss": 0.0541, + "step": 9740 + }, + { + "epoch": 0.9742, + "grad_norm": 0.4829404056072235, + "learning_rate": 4.052570602787076e-08, + "loss": 0.0379, + "step": 9742 + }, + { + "epoch": 0.9744, + "grad_norm": 0.5871360898017883, + "learning_rate": 3.990025234222872e-08, + "loss": 0.0168, + "step": 9744 + }, + { + "epoch": 0.9746, + "grad_norm": 6.366359233856201, + "learning_rate": 3.927965308820558e-08, + "loss": 0.1639, + "step": 9746 + }, + { + "epoch": 0.9748, + "grad_norm": 1.2487432956695557, + "learning_rate": 3.866390856827495e-08, + "loss": 0.2607, + "step": 9748 + }, + { + "epoch": 0.975, + "grad_norm": 4.889294147491455, + "learning_rate": 3.805301908254455e-08, + "loss": 0.6153, + "step": 9750 + }, + { + "epoch": 0.9752, + "grad_norm": 9.182682037353516, + "learning_rate": 3.7446984928753984e-08, + "loss": 0.9006, + "step": 9752 + }, + { + "epoch": 0.9754, + "grad_norm": 0.11603279411792755, + "learning_rate": 3.684580640227586e-08, + "loss": 0.1099, + "step": 9754 + }, + { + "epoch": 0.9756, + "grad_norm": 7.929275035858154, + "learning_rate": 3.6249483796116924e-08, + "loss": 0.4111, + "step": 9756 + }, + { + "epoch": 0.9758, + "grad_norm": 6.309351444244385, + "learning_rate": 3.565801740092023e-08, + "loss": 0.439, + "step": 9758 + }, + { + "epoch": 0.976, + "grad_norm": 3.1064319610595703, + "learning_rate": 3.50714075049563e-08, + "loss": 0.0893, + "step": 9760 + }, + { + "epoch": 0.9762, + "grad_norm": 1.8642123937606812, + "learning_rate": 3.4489654394134206e-08, + "loss": 0.0884, + "step": 9762 + }, + { + "epoch": 0.9764, + "grad_norm": 2.798891305923462, + "learning_rate": 3.391275835199159e-08, + "loss": 0.1579, + "step": 9764 + }, + { + "epoch": 0.9766, + "grad_norm": 2.3120760917663574, + "learning_rate": 3.3340719659701315e-08, + "loss": 0.27, + "step": 9766 + }, + { + "epoch": 0.9768, + "grad_norm": 4.136335849761963, + "learning_rate": 3.2773538596068134e-08, + "loss": 0.375, + "step": 9768 + }, + { + "epoch": 0.977, + "grad_norm": 1.28151535987854, + "learning_rate": 3.22112154375287e-08, + "loss": 0.0582, + "step": 9770 + }, + { + "epoch": 0.9772, + "grad_norm": 10.763650894165039, + "learning_rate": 3.165375045815266e-08, + "loss": 0.415, + "step": 9772 + }, + { + "epoch": 0.9774, + "grad_norm": 0.3777977526187897, + "learning_rate": 3.110114392964159e-08, + "loss": 0.054, + "step": 9774 + }, + { + "epoch": 0.9776, + "grad_norm": 6.499661445617676, + "learning_rate": 3.0553396121330015e-08, + "loss": 0.3437, + "step": 9776 + }, + { + "epoch": 0.9778, + "grad_norm": 4.057175159454346, + "learning_rate": 3.001050730018218e-08, + "loss": 0.2224, + "step": 9778 + }, + { + "epoch": 0.978, + "grad_norm": 4.867879867553711, + "learning_rate": 2.947247773079753e-08, + "loss": 0.3553, + "step": 9780 + }, + { + "epoch": 0.9782, + "grad_norm": 0.8889520168304443, + "learning_rate": 2.8939307675402983e-08, + "loss": 0.065, + "step": 9782 + }, + { + "epoch": 0.9784, + "grad_norm": 1.4241443872451782, + "learning_rate": 2.8410997393860663e-08, + "loss": 0.0686, + "step": 9784 + }, + { + "epoch": 0.9786, + "grad_norm": 3.431974172592163, + "learning_rate": 2.7887547143662375e-08, + "loss": 0.073, + "step": 9786 + }, + { + "epoch": 0.9788, + "grad_norm": 1.2433518171310425, + "learning_rate": 2.7368957179929602e-08, + "loss": 0.055, + "step": 9788 + }, + { + "epoch": 0.979, + "grad_norm": 2.2261812686920166, + "learning_rate": 2.6855227755419046e-08, + "loss": 0.1137, + "step": 9790 + }, + { + "epoch": 0.9792, + "grad_norm": 5.207066535949707, + "learning_rate": 2.6346359120514863e-08, + "loss": 0.6123, + "step": 9792 + }, + { + "epoch": 0.9794, + "grad_norm": 0.7248803377151489, + "learning_rate": 2.584235152323422e-08, + "loss": 0.04, + "step": 9794 + }, + { + "epoch": 0.9796, + "grad_norm": 3.2365875244140625, + "learning_rate": 2.5343205209225062e-08, + "loss": 0.0703, + "step": 9796 + }, + { + "epoch": 0.9798, + "grad_norm": 2.847076177597046, + "learning_rate": 2.484892042176279e-08, + "loss": 0.1534, + "step": 9798 + }, + { + "epoch": 0.98, + "grad_norm": 1.534439206123352, + "learning_rate": 2.4359497401758026e-08, + "loss": 0.0504, + "step": 9800 + }, + { + "epoch": 0.9802, + "grad_norm": 3.3688812255859375, + "learning_rate": 2.3874936387747738e-08, + "loss": 0.1394, + "step": 9802 + }, + { + "epoch": 0.9804, + "grad_norm": 6.893819808959961, + "learning_rate": 2.339523761590301e-08, + "loss": 0.4262, + "step": 9804 + }, + { + "epoch": 0.9806, + "grad_norm": 8.194034576416016, + "learning_rate": 2.292040132002238e-08, + "loss": 0.6087, + "step": 9806 + }, + { + "epoch": 0.9808, + "grad_norm": 7.140226364135742, + "learning_rate": 2.2450427731534052e-08, + "loss": 0.3477, + "step": 9808 + }, + { + "epoch": 0.981, + "grad_norm": 4.06059455871582, + "learning_rate": 2.1985317079500358e-08, + "loss": 0.1755, + "step": 9810 + }, + { + "epoch": 0.9812, + "grad_norm": 2.7062673568725586, + "learning_rate": 2.152506959060774e-08, + "loss": 0.3711, + "step": 9812 + }, + { + "epoch": 0.9814, + "grad_norm": 1.1393840312957764, + "learning_rate": 2.1069685489176762e-08, + "loss": 0.1641, + "step": 9814 + }, + { + "epoch": 0.9816, + "grad_norm": 1.0249760150909424, + "learning_rate": 2.061916499715544e-08, + "loss": 0.061, + "step": 9816 + }, + { + "epoch": 0.9818, + "grad_norm": 2.36454701423645, + "learning_rate": 2.017350833412146e-08, + "loss": 0.1419, + "step": 9818 + }, + { + "epoch": 0.982, + "grad_norm": 3.5024983882904053, + "learning_rate": 1.973271571728441e-08, + "loss": 0.5129, + "step": 9820 + }, + { + "epoch": 0.9822, + "grad_norm": 2.8188674449920654, + "learning_rate": 1.929678736148022e-08, + "loss": 0.2377, + "step": 9822 + }, + { + "epoch": 0.9824, + "grad_norm": 0.208138570189476, + "learning_rate": 1.886572347917337e-08, + "loss": 0.1149, + "step": 9824 + }, + { + "epoch": 0.9826, + "grad_norm": 2.8546626567840576, + "learning_rate": 1.8439524280462474e-08, + "loss": 0.0988, + "step": 9826 + }, + { + "epoch": 0.9828, + "grad_norm": 1.4325743913650513, + "learning_rate": 1.8018189973069144e-08, + "loss": 0.0853, + "step": 9828 + }, + { + "epoch": 0.983, + "grad_norm": 0.15609712898731232, + "learning_rate": 1.7601720762346895e-08, + "loss": 0.4836, + "step": 9830 + }, + { + "epoch": 0.9832, + "grad_norm": 6.382482051849365, + "learning_rate": 1.7190116851280024e-08, + "loss": 0.7423, + "step": 9832 + }, + { + "epoch": 0.9834, + "grad_norm": 0.5732066631317139, + "learning_rate": 1.678337844047695e-08, + "loss": 0.0295, + "step": 9834 + }, + { + "epoch": 0.9836, + "grad_norm": 0.1851583570241928, + "learning_rate": 1.6381505728176872e-08, + "loss": 0.1802, + "step": 9836 + }, + { + "epoch": 0.9838, + "grad_norm": 2.7513115406036377, + "learning_rate": 1.5984498910249778e-08, + "loss": 0.2413, + "step": 9838 + }, + { + "epoch": 0.984, + "grad_norm": 0.7874372601509094, + "learning_rate": 1.5592358180189782e-08, + "loss": 0.1008, + "step": 9840 + }, + { + "epoch": 0.9842, + "grad_norm": 0.08096729218959808, + "learning_rate": 1.5205083729122883e-08, + "loss": 0.0204, + "step": 9842 + }, + { + "epoch": 0.9844, + "grad_norm": 0.3383121192455292, + "learning_rate": 1.482267574580143e-08, + "loss": 0.0406, + "step": 9844 + }, + { + "epoch": 0.9846, + "grad_norm": 6.935275077819824, + "learning_rate": 1.4445134416607442e-08, + "loss": 0.5588, + "step": 9846 + }, + { + "epoch": 0.9848, + "grad_norm": 4.365992069244385, + "learning_rate": 1.4072459925548176e-08, + "loss": 0.0995, + "step": 9848 + }, + { + "epoch": 0.985, + "grad_norm": 7.135626792907715, + "learning_rate": 1.370465245426167e-08, + "loss": 0.399, + "step": 9850 + }, + { + "epoch": 0.9852, + "grad_norm": 2.5722439289093018, + "learning_rate": 1.3341712182012301e-08, + "loss": 0.3142, + "step": 9852 + }, + { + "epoch": 0.9854, + "grad_norm": 2.8778364658355713, + "learning_rate": 1.2983639285693018e-08, + "loss": 0.0941, + "step": 9854 + }, + { + "epoch": 0.9856, + "grad_norm": 0.5499218106269836, + "learning_rate": 1.2630433939825326e-08, + "loss": 0.0261, + "step": 9856 + }, + { + "epoch": 0.9858, + "grad_norm": 2.184906244277954, + "learning_rate": 1.2282096316554858e-08, + "loss": 0.2371, + "step": 9858 + }, + { + "epoch": 0.986, + "grad_norm": 4.24810791015625, + "learning_rate": 1.1938626585660252e-08, + "loss": 0.1213, + "step": 9860 + }, + { + "epoch": 0.9862, + "grad_norm": 0.13639876246452332, + "learning_rate": 1.1600024914540931e-08, + "loss": 0.162, + "step": 9862 + }, + { + "epoch": 0.9864, + "grad_norm": 1.0226075649261475, + "learning_rate": 1.126629146822933e-08, + "loss": 0.1835, + "step": 9864 + }, + { + "epoch": 0.9866, + "grad_norm": 5.670531749725342, + "learning_rate": 1.0937426409384223e-08, + "loss": 0.1565, + "step": 9866 + }, + { + "epoch": 0.9868, + "grad_norm": 2.708902359008789, + "learning_rate": 1.0613429898287397e-08, + "loss": 0.2074, + "step": 9868 + }, + { + "epoch": 0.987, + "grad_norm": 4.95836067199707, + "learning_rate": 1.0294302092853647e-08, + "loss": 0.1264, + "step": 9870 + }, + { + "epoch": 0.9872, + "grad_norm": 0.756247878074646, + "learning_rate": 9.980043148619668e-09, + "loss": 0.2474, + "step": 9872 + }, + { + "epoch": 0.9874, + "grad_norm": 8.26154899597168, + "learning_rate": 9.670653218752935e-09, + "loss": 0.2008, + "step": 9874 + }, + { + "epoch": 0.9876, + "grad_norm": 1.1014680862426758, + "learning_rate": 9.366132454046162e-09, + "loss": 0.1595, + "step": 9876 + }, + { + "epoch": 0.9878, + "grad_norm": 0.7164192795753479, + "learning_rate": 9.066481002918403e-09, + "loss": 0.3894, + "step": 9878 + }, + { + "epoch": 0.988, + "grad_norm": 0.30404090881347656, + "learning_rate": 8.771699011416169e-09, + "loss": 0.0574, + "step": 9880 + }, + { + "epoch": 0.9882, + "grad_norm": 1.8645484447479248, + "learning_rate": 8.481786623214527e-09, + "loss": 0.081, + "step": 9882 + }, + { + "epoch": 0.9884, + "grad_norm": 0.24885350465774536, + "learning_rate": 8.196743979610455e-09, + "loss": 0.0219, + "step": 9884 + }, + { + "epoch": 0.9886, + "grad_norm": 4.053718090057373, + "learning_rate": 7.916571219531711e-09, + "loss": 0.5991, + "step": 9886 + }, + { + "epoch": 0.9888, + "grad_norm": 3.8242852687835693, + "learning_rate": 7.641268479531283e-09, + "loss": 0.2385, + "step": 9888 + }, + { + "epoch": 0.989, + "grad_norm": 2.6425750255584717, + "learning_rate": 7.370835893788508e-09, + "loss": 0.1506, + "step": 9890 + }, + { + "epoch": 0.9892, + "grad_norm": 0.8418349623680115, + "learning_rate": 7.105273594107953e-09, + "loss": 0.1275, + "step": 9892 + }, + { + "epoch": 0.9894, + "grad_norm": 15.08019733428955, + "learning_rate": 6.844581709921639e-09, + "loss": 0.513, + "step": 9894 + }, + { + "epoch": 0.9896, + "grad_norm": 1.3763177394866943, + "learning_rate": 6.588760368287928e-09, + "loss": 0.2575, + "step": 9896 + }, + { + "epoch": 0.9898, + "grad_norm": 1.497840166091919, + "learning_rate": 6.3378096938915276e-09, + "loss": 0.1713, + "step": 9898 + }, + { + "epoch": 0.99, + "grad_norm": 0.1797395646572113, + "learning_rate": 6.091729809042379e-09, + "loss": 0.3127, + "step": 9900 + }, + { + "epoch": 0.9902, + "grad_norm": 2.952615261077881, + "learning_rate": 5.850520833676765e-09, + "loss": 0.1675, + "step": 9902 + }, + { + "epoch": 0.9904, + "grad_norm": 9.471525192260742, + "learning_rate": 5.614182885357311e-09, + "loss": 0.5343, + "step": 9904 + }, + { + "epoch": 0.9906, + "grad_norm": 0.18879730999469757, + "learning_rate": 5.382716079271877e-09, + "loss": 0.4588, + "step": 9906 + }, + { + "epoch": 0.9908, + "grad_norm": 7.449692249298096, + "learning_rate": 5.156120528233555e-09, + "loss": 0.2441, + "step": 9908 + }, + { + "epoch": 0.991, + "grad_norm": 4.774081707000732, + "learning_rate": 4.9343963426840006e-09, + "loss": 0.3846, + "step": 9910 + }, + { + "epoch": 0.9912, + "grad_norm": 3.5647811889648438, + "learning_rate": 4.717543630688992e-09, + "loss": 0.1017, + "step": 9912 + }, + { + "epoch": 0.9914, + "grad_norm": 7.341500282287598, + "learning_rate": 4.505562497938431e-09, + "loss": 0.2137, + "step": 9914 + }, + { + "epoch": 0.9916, + "grad_norm": 2.1906352043151855, + "learning_rate": 4.298453047749674e-09, + "loss": 0.1064, + "step": 9916 + }, + { + "epoch": 0.9918, + "grad_norm": 0.22140319645404816, + "learning_rate": 4.096215381066415e-09, + "loss": 0.1126, + "step": 9918 + }, + { + "epoch": 0.992, + "grad_norm": 1.5637964010238647, + "learning_rate": 3.898849596456477e-09, + "loss": 0.3645, + "step": 9920 + }, + { + "epoch": 0.9922, + "grad_norm": 0.23849308490753174, + "learning_rate": 3.7063557901129144e-09, + "loss": 0.0801, + "step": 9922 + }, + { + "epoch": 0.9924, + "grad_norm": 0.20337949693202972, + "learning_rate": 3.518734055855122e-09, + "loss": 0.1369, + "step": 9924 + }, + { + "epoch": 0.9926, + "grad_norm": 5.049282073974609, + "learning_rate": 3.3359844851277302e-09, + "loss": 0.1563, + "step": 9926 + }, + { + "epoch": 0.9928, + "grad_norm": 2.2264084815979004, + "learning_rate": 3.1581071670006013e-09, + "loss": 0.0718, + "step": 9928 + }, + { + "epoch": 0.993, + "grad_norm": 8.412311553955078, + "learning_rate": 2.9851021881688314e-09, + "loss": 0.9753, + "step": 9930 + }, + { + "epoch": 0.9932, + "grad_norm": 5.903042316436768, + "learning_rate": 2.8169696329527484e-09, + "loss": 0.1658, + "step": 9932 + }, + { + "epoch": 0.9934, + "grad_norm": 7.086763381958008, + "learning_rate": 2.6537095832990247e-09, + "loss": 0.2677, + "step": 9934 + }, + { + "epoch": 0.9936, + "grad_norm": 6.07136869430542, + "learning_rate": 2.495322118778454e-09, + "loss": 0.3038, + "step": 9936 + }, + { + "epoch": 0.9938, + "grad_norm": 2.2851791381835938, + "learning_rate": 2.341807316587064e-09, + "loss": 0.0961, + "step": 9938 + }, + { + "epoch": 0.994, + "grad_norm": 3.2269370555877686, + "learning_rate": 2.193165251545004e-09, + "loss": 0.4876, + "step": 9940 + }, + { + "epoch": 0.9942, + "grad_norm": 0.12575875222682953, + "learning_rate": 2.049395996099879e-09, + "loss": 0.1011, + "step": 9942 + }, + { + "epoch": 0.9944, + "grad_norm": 1.0080852508544922, + "learning_rate": 1.910499620322304e-09, + "loss": 0.0951, + "step": 9944 + }, + { + "epoch": 0.9946, + "grad_norm": 0.2542726993560791, + "learning_rate": 1.776476191910348e-09, + "loss": 0.0167, + "step": 9946 + }, + { + "epoch": 0.9948, + "grad_norm": 1.096764087677002, + "learning_rate": 1.647325776182873e-09, + "loss": 0.1597, + "step": 9948 + }, + { + "epoch": 0.995, + "grad_norm": 7.207360744476318, + "learning_rate": 1.5230484360873043e-09, + "loss": 0.5457, + "step": 9950 + }, + { + "epoch": 0.9952, + "grad_norm": 0.6455267071723938, + "learning_rate": 1.4036442321962995e-09, + "loss": 0.1146, + "step": 9952 + }, + { + "epoch": 0.9954, + "grad_norm": 0.5692222118377686, + "learning_rate": 1.2891132227033087e-09, + "loss": 0.1892, + "step": 9954 + }, + { + "epoch": 0.9956, + "grad_norm": 1.5371720790863037, + "learning_rate": 1.1794554634314558e-09, + "loss": 0.0908, + "step": 9956 + }, + { + "epoch": 0.9958, + "grad_norm": 4.621524810791016, + "learning_rate": 1.0746710078257673e-09, + "loss": 0.3409, + "step": 9958 + }, + { + "epoch": 0.996, + "grad_norm": 9.077204704284668, + "learning_rate": 9.74759906957612e-10, + "loss": 0.2413, + "step": 9960 + }, + { + "epoch": 0.9962, + "grad_norm": 4.821218013763428, + "learning_rate": 8.797222095224822e-10, + "loss": 0.3123, + "step": 9962 + }, + { + "epoch": 0.9964, + "grad_norm": 0.2627652585506439, + "learning_rate": 7.895579618388827e-10, + "loss": 0.1693, + "step": 9964 + }, + { + "epoch": 0.9966, + "grad_norm": 0.2788083255290985, + "learning_rate": 7.042672078527712e-10, + "loss": 0.0791, + "step": 9966 + }, + { + "epoch": 0.9968, + "grad_norm": 0.1699613332748413, + "learning_rate": 6.238499891353389e-10, + "loss": 0.094, + "step": 9968 + }, + { + "epoch": 0.997, + "grad_norm": 11.463567733764648, + "learning_rate": 5.483063448785686e-10, + "loss": 0.6006, + "step": 9970 + }, + { + "epoch": 0.9972, + "grad_norm": 3.7742180824279785, + "learning_rate": 4.77636311903007e-10, + "loss": 0.1755, + "step": 9972 + }, + { + "epoch": 0.9974, + "grad_norm": 4.965467929840088, + "learning_rate": 4.118399246522131e-10, + "loss": 0.2412, + "step": 9974 + }, + { + "epoch": 0.9976, + "grad_norm": 5.833639621734619, + "learning_rate": 3.509172151938689e-10, + "loss": 0.4581, + "step": 9976 + }, + { + "epoch": 0.9978, + "grad_norm": 6.718730926513672, + "learning_rate": 2.948682132208891e-10, + "loss": 0.288, + "step": 9978 + }, + { + "epoch": 0.998, + "grad_norm": 0.35264289379119873, + "learning_rate": 2.436929460525317e-10, + "loss": 0.2178, + "step": 9980 + }, + { + "epoch": 0.9982, + "grad_norm": 0.4536125659942627, + "learning_rate": 1.9739143862884668e-10, + "loss": 0.0466, + "step": 9982 + }, + { + "epoch": 0.9984, + "grad_norm": 0.48696693778038025, + "learning_rate": 1.559637135173375e-10, + "loss": 0.054, + "step": 9984 + }, + { + "epoch": 0.9986, + "grad_norm": 3.2200417518615723, + "learning_rate": 1.1940979091074056e-10, + "loss": 0.4729, + "step": 9986 + }, + { + "epoch": 0.9988, + "grad_norm": 0.5005107522010803, + "learning_rate": 8.772968862369447e-11, + "loss": 0.0722, + "step": 9988 + }, + { + "epoch": 0.999, + "grad_norm": 6.185525417327881, + "learning_rate": 6.092342209607083e-11, + "loss": 0.3804, + "step": 9990 + }, + { + "epoch": 0.9992, + "grad_norm": 0.2855339050292969, + "learning_rate": 3.899100439408443e-11, + "loss": 0.0198, + "step": 9992 + }, + { + "epoch": 0.9994, + "grad_norm": 1.6095556020736694, + "learning_rate": 2.1932446206962556e-11, + "loss": 0.247, + "step": 9994 + }, + { + "epoch": 0.9996, + "grad_norm": 0.13488978147506714, + "learning_rate": 9.74775584916543e-12, + "loss": 0.0455, + "step": 9996 + }, + { + "epoch": 0.9998, + "grad_norm": 2.9248945713043213, + "learning_rate": 2.4369392592760166e-12, + "loss": 0.1325, + "step": 9998 + }, + { + "epoch": 1.0, + "grad_norm": 2.105154037475586, + "learning_rate": 0.0, + "loss": 0.1369, + "step": 10000 + }, + { + "epoch": 1.0, + "step": 10000, + "total_flos": 5.057206687996314e+16, + "train_loss": 0.2676136633721646, + "train_runtime": 5765.2454, + "train_samples_per_second": 1.735, + "train_steps_per_second": 1.735 + } + ], + "logging_steps": 2, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 5.057206687996314e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..577e62dc3edd7936e24563f2a003c1c1a94d686e --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c15489f49a90fdded8919fbb8ebc570b124d54d2018f83913a15cd74185afb9d +size 3837841200 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/global_step10000/mp_rank_00_model_states.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/global_step10000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f727f6f86822e733bfe52f987c5817edc3c4ea78 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/global_step10000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f27d50b648a1b01d6976aadad7c37de66a6e5a4317832888e94a1404c52b910c +size 639989420 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/latest b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/latest new file mode 100644 index 0000000000000000000000000000000000000000..25c776ee3abcad1c4d1e16e8275e4e00984a237c --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/latest @@ -0,0 +1 @@ +global_step10000 \ No newline at end of file diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/scheduler.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..25eb571999231336d0a702eb239de6c45b7611dd --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ec7493baff042202ebbde234df8c237e604474ce9fde178ca98bdf8572092 +size 1064 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/zero_to_fp32.py b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..e93cb1c95f15c1474642edb1978714075361bc04 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/client_0/zero_to_fp32.py @@ -0,0 +1,758 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: + shared_tensor = state_dict[converted_tensors[tensor_id]] + state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + state_dict[name] = tensor.contiguous() + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in shard_state_dict: + del state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b7cf4fc3f9ff9aefc001ba203161ab095e7204b7 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario12_new_10000_nosampling_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c17e752afd7a58c48c285ac03d46527c50f29299301875ef66b17dff9795325a +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1488136f8506c90756555062bcd8889a78877b0a --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/0_trainer_state.json @@ -0,0 +1,280025 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 79998, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.500062501562539e-05, + "grad_norm": 9.10953426361084, + "learning_rate": 5e-09, + "loss": 1.3562, + "step": 2 + }, + { + "epoch": 5.000125003125078e-05, + "grad_norm": 4.277276992797852, + "learning_rate": 1e-08, + "loss": 1.0142, + "step": 4 + }, + { + "epoch": 7.500187504687617e-05, + "grad_norm": 10.494938850402832, + "learning_rate": 1.5000000000000002e-08, + "loss": 1.7856, + "step": 6 + }, + { + "epoch": 0.00010000250006250156, + "grad_norm": 8.096179008483887, + "learning_rate": 2e-08, + "loss": 4.1155, + "step": 8 + }, + { + "epoch": 0.00012500312507812694, + "grad_norm": 4.419346809387207, + "learning_rate": 2.5000000000000002e-08, + "loss": 0.105, + "step": 10 + }, + { + "epoch": 0.00015000375009375234, + "grad_norm": 3.601125478744507, + "learning_rate": 3.0000000000000004e-08, + "loss": 1.5986, + "step": 12 + }, + { + "epoch": 0.00017500437510937773, + "grad_norm": 8.975794792175293, + "learning_rate": 3.5e-08, + "loss": 2.3227, + "step": 14 + }, + { + "epoch": 0.00020000500012500312, + "grad_norm": 6.112163066864014, + "learning_rate": 4e-08, + "loss": 1.8024, + "step": 16 + }, + { + "epoch": 0.00022500562514062852, + "grad_norm": 11.981054306030273, + "learning_rate": 4.5e-08, + "loss": 1.3717, + "step": 18 + }, + { + "epoch": 0.0002500062501562539, + "grad_norm": 4.937902927398682, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.6723, + "step": 20 + }, + { + "epoch": 0.0002750068751718793, + "grad_norm": 3.265590190887451, + "learning_rate": 5.5e-08, + "loss": 1.1271, + "step": 22 + }, + { + "epoch": 0.00030000750018750467, + "grad_norm": 7.635922908782959, + "learning_rate": 6.000000000000001e-08, + "loss": 2.5472, + "step": 24 + }, + { + "epoch": 0.0003250081252031301, + "grad_norm": 3.672375202178955, + "learning_rate": 6.5e-08, + "loss": 1.7466, + "step": 26 + }, + { + "epoch": 0.00035000875021875546, + "grad_norm": 8.024916648864746, + "learning_rate": 7e-08, + "loss": 1.0598, + "step": 28 + }, + { + "epoch": 0.0003750093752343809, + "grad_norm": 6.571441173553467, + "learning_rate": 7.500000000000001e-08, + "loss": 1.3687, + "step": 30 + }, + { + "epoch": 0.00040001000025000625, + "grad_norm": 10.959146499633789, + "learning_rate": 8e-08, + "loss": 1.9713, + "step": 32 + }, + { + "epoch": 0.0004250106252656316, + "grad_norm": 7.638070106506348, + "learning_rate": 8.500000000000001e-08, + "loss": 1.1454, + "step": 34 + }, + { + "epoch": 0.00045001125028125703, + "grad_norm": 10.974169731140137, + "learning_rate": 9e-08, + "loss": 1.0309, + "step": 36 + }, + { + "epoch": 0.0004750118752968824, + "grad_norm": 6.3398661613464355, + "learning_rate": 9.5e-08, + "loss": 0.8875, + "step": 38 + }, + { + "epoch": 0.0005000125003125078, + "grad_norm": 5.970608711242676, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.1999, + "step": 40 + }, + { + "epoch": 0.0005250131253281332, + "grad_norm": 8.671695709228516, + "learning_rate": 1.0500000000000001e-07, + "loss": 2.0603, + "step": 42 + }, + { + "epoch": 0.0005500137503437586, + "grad_norm": 6.7004899978637695, + "learning_rate": 1.1e-07, + "loss": 1.6211, + "step": 44 + }, + { + "epoch": 0.000575014375359384, + "grad_norm": 8.571113586425781, + "learning_rate": 1.1500000000000001e-07, + "loss": 1.6288, + "step": 46 + }, + { + "epoch": 0.0006000150003750093, + "grad_norm": 10.264274597167969, + "learning_rate": 1.2000000000000002e-07, + "loss": 2.7799, + "step": 48 + }, + { + "epoch": 0.0006250156253906348, + "grad_norm": 3.2826316356658936, + "learning_rate": 1.2500000000000002e-07, + "loss": 1.1192, + "step": 50 + }, + { + "epoch": 0.0006500162504062602, + "grad_norm": 3.405529022216797, + "learning_rate": 1.3e-07, + "loss": 1.8694, + "step": 52 + }, + { + "epoch": 0.0006750168754218856, + "grad_norm": 13.13711929321289, + "learning_rate": 1.35e-07, + "loss": 6.8851, + "step": 54 + }, + { + "epoch": 0.0007000175004375109, + "grad_norm": 18.245697021484375, + "learning_rate": 1.4e-07, + "loss": 0.9101, + "step": 56 + }, + { + "epoch": 0.0007250181254531363, + "grad_norm": 9.195446968078613, + "learning_rate": 1.4500000000000001e-07, + "loss": 1.7172, + "step": 58 + }, + { + "epoch": 0.0007500187504687618, + "grad_norm": 4.365952014923096, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.0999, + "step": 60 + }, + { + "epoch": 0.0007750193754843871, + "grad_norm": 40.13249969482422, + "learning_rate": 1.5500000000000002e-07, + "loss": 6.4126, + "step": 62 + }, + { + "epoch": 0.0008000200005000125, + "grad_norm": 11.006204605102539, + "learning_rate": 1.6e-07, + "loss": 1.7386, + "step": 64 + }, + { + "epoch": 0.0008250206255156379, + "grad_norm": 4.893359661102295, + "learning_rate": 1.65e-07, + "loss": 2.3432, + "step": 66 + }, + { + "epoch": 0.0008500212505312632, + "grad_norm": 10.647500038146973, + "learning_rate": 1.7000000000000001e-07, + "loss": 2.0898, + "step": 68 + }, + { + "epoch": 0.0008750218755468887, + "grad_norm": 4.967966556549072, + "learning_rate": 1.7500000000000002e-07, + "loss": 1.4541, + "step": 70 + }, + { + "epoch": 0.0009000225005625141, + "grad_norm": 4.602876663208008, + "learning_rate": 1.8e-07, + "loss": 1.1361, + "step": 72 + }, + { + "epoch": 0.0009250231255781394, + "grad_norm": 10.445550918579102, + "learning_rate": 1.85e-07, + "loss": 1.1014, + "step": 74 + }, + { + "epoch": 0.0009500237505937648, + "grad_norm": 17.332305908203125, + "learning_rate": 1.9e-07, + "loss": 1.9133, + "step": 76 + }, + { + "epoch": 0.0009750243756093903, + "grad_norm": 14.35498046875, + "learning_rate": 1.95e-07, + "loss": 2.9009, + "step": 78 + }, + { + "epoch": 0.0010000250006250155, + "grad_norm": 10.465797424316406, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.3169, + "step": 80 + }, + { + "epoch": 0.001025025625640641, + "grad_norm": 5.130842685699463, + "learning_rate": 2.0500000000000002e-07, + "loss": 0.1114, + "step": 82 + }, + { + "epoch": 0.0010500262506562665, + "grad_norm": 8.74416446685791, + "learning_rate": 2.1000000000000003e-07, + "loss": 1.4988, + "step": 84 + }, + { + "epoch": 0.0010750268756718917, + "grad_norm": 4.54466438293457, + "learning_rate": 2.15e-07, + "loss": 1.4445, + "step": 86 + }, + { + "epoch": 0.0011000275006875172, + "grad_norm": 10.986504554748535, + "learning_rate": 2.2e-07, + "loss": 2.1573, + "step": 88 + }, + { + "epoch": 0.0011250281257031425, + "grad_norm": 7.103388786315918, + "learning_rate": 2.2500000000000002e-07, + "loss": 1.0223, + "step": 90 + }, + { + "epoch": 0.001150028750718768, + "grad_norm": 3.085089921951294, + "learning_rate": 2.3000000000000002e-07, + "loss": 1.48, + "step": 92 + }, + { + "epoch": 0.0011750293757343934, + "grad_norm": 9.340208053588867, + "learning_rate": 2.3500000000000003e-07, + "loss": 1.2128, + "step": 94 + }, + { + "epoch": 0.0012000300007500187, + "grad_norm": 5.5756988525390625, + "learning_rate": 2.4000000000000003e-07, + "loss": 3.2061, + "step": 96 + }, + { + "epoch": 0.0012250306257656442, + "grad_norm": 10.820189476013184, + "learning_rate": 2.4500000000000004e-07, + "loss": 1.9864, + "step": 98 + }, + { + "epoch": 0.0012500312507812696, + "grad_norm": 11.40755558013916, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.6401, + "step": 100 + }, + { + "epoch": 0.001275031875796895, + "grad_norm": 11.35473918914795, + "learning_rate": 2.55e-07, + "loss": 1.6673, + "step": 102 + }, + { + "epoch": 0.0013000325008125204, + "grad_norm": 6.034008026123047, + "learning_rate": 2.6e-07, + "loss": 0.6481, + "step": 104 + }, + { + "epoch": 0.0013250331258281456, + "grad_norm": 2.167680263519287, + "learning_rate": 2.65e-07, + "loss": 1.2366, + "step": 106 + }, + { + "epoch": 0.001350033750843771, + "grad_norm": 14.133890151977539, + "learning_rate": 2.7e-07, + "loss": 1.3436, + "step": 108 + }, + { + "epoch": 0.0013750343758593966, + "grad_norm": 30.125118255615234, + "learning_rate": 2.75e-07, + "loss": 2.4306, + "step": 110 + }, + { + "epoch": 0.0014000350008750218, + "grad_norm": 2.651886224746704, + "learning_rate": 2.8e-07, + "loss": 0.8265, + "step": 112 + }, + { + "epoch": 0.0014250356258906473, + "grad_norm": 2.9862895011901855, + "learning_rate": 2.85e-07, + "loss": 1.3058, + "step": 114 + }, + { + "epoch": 0.0014500362509062726, + "grad_norm": 10.043912887573242, + "learning_rate": 2.9000000000000003e-07, + "loss": 0.7187, + "step": 116 + }, + { + "epoch": 0.001475036875921898, + "grad_norm": 4.150693893432617, + "learning_rate": 2.9500000000000003e-07, + "loss": 1.2217, + "step": 118 + }, + { + "epoch": 0.0015000375009375235, + "grad_norm": 9.48703670501709, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.1388, + "step": 120 + }, + { + "epoch": 0.0015250381259531488, + "grad_norm": 15.437480926513672, + "learning_rate": 3.0500000000000004e-07, + "loss": 2.5312, + "step": 122 + }, + { + "epoch": 0.0015500387509687743, + "grad_norm": 5.259402275085449, + "learning_rate": 3.1000000000000005e-07, + "loss": 1.7713, + "step": 124 + }, + { + "epoch": 0.0015750393759843995, + "grad_norm": 6.1952128410339355, + "learning_rate": 3.15e-07, + "loss": 1.8568, + "step": 126 + }, + { + "epoch": 0.001600040001000025, + "grad_norm": 12.739459037780762, + "learning_rate": 3.2e-07, + "loss": 2.2193, + "step": 128 + }, + { + "epoch": 0.0016250406260156505, + "grad_norm": 11.742515563964844, + "learning_rate": 3.25e-07, + "loss": 2.6286, + "step": 130 + }, + { + "epoch": 0.0016500412510312757, + "grad_norm": 9.547414779663086, + "learning_rate": 3.3e-07, + "loss": 2.6615, + "step": 132 + }, + { + "epoch": 0.0016750418760469012, + "grad_norm": 6.4586591720581055, + "learning_rate": 3.35e-07, + "loss": 0.9742, + "step": 134 + }, + { + "epoch": 0.0017000425010625265, + "grad_norm": 5.598713397979736, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.8947, + "step": 136 + }, + { + "epoch": 0.001725043126078152, + "grad_norm": 7.771640300750732, + "learning_rate": 3.4500000000000003e-07, + "loss": 1.957, + "step": 138 + }, + { + "epoch": 0.0017500437510937774, + "grad_norm": 6.083402633666992, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.7639, + "step": 140 + }, + { + "epoch": 0.0017750443761094027, + "grad_norm": 5.319943428039551, + "learning_rate": 3.55e-07, + "loss": 1.0745, + "step": 142 + }, + { + "epoch": 0.0018000450011250281, + "grad_norm": 3.0289969444274902, + "learning_rate": 3.6e-07, + "loss": 1.1636, + "step": 144 + }, + { + "epoch": 0.0018250456261406536, + "grad_norm": 6.366456031799316, + "learning_rate": 3.65e-07, + "loss": 1.2255, + "step": 146 + }, + { + "epoch": 0.0018500462511562789, + "grad_norm": 25.093582153320312, + "learning_rate": 3.7e-07, + "loss": 1.8611, + "step": 148 + }, + { + "epoch": 0.0018750468761719043, + "grad_norm": 10.580937385559082, + "learning_rate": 3.75e-07, + "loss": 1.2777, + "step": 150 + }, + { + "epoch": 0.0019000475011875296, + "grad_norm": 2.7177059650421143, + "learning_rate": 3.8e-07, + "loss": 1.9572, + "step": 152 + }, + { + "epoch": 0.001925048126203155, + "grad_norm": 5.789566993713379, + "learning_rate": 3.85e-07, + "loss": 3.1367, + "step": 154 + }, + { + "epoch": 0.0019500487512187806, + "grad_norm": 12.800239562988281, + "learning_rate": 3.9e-07, + "loss": 1.9611, + "step": 156 + }, + { + "epoch": 0.001975049376234406, + "grad_norm": 7.2754974365234375, + "learning_rate": 3.9500000000000003e-07, + "loss": 1.9298, + "step": 158 + }, + { + "epoch": 0.002000050001250031, + "grad_norm": 14.125917434692383, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.7521, + "step": 160 + }, + { + "epoch": 0.0020250506262656565, + "grad_norm": 6.67465877532959, + "learning_rate": 4.0500000000000004e-07, + "loss": 2.8835, + "step": 162 + }, + { + "epoch": 0.002050051251281282, + "grad_norm": 9.034854888916016, + "learning_rate": 4.1000000000000004e-07, + "loss": 1.5379, + "step": 164 + }, + { + "epoch": 0.0020750518762969075, + "grad_norm": 12.550370216369629, + "learning_rate": 4.1500000000000005e-07, + "loss": 2.192, + "step": 166 + }, + { + "epoch": 0.002100052501312533, + "grad_norm": 5.637836456298828, + "learning_rate": 4.2000000000000006e-07, + "loss": 2.2696, + "step": 168 + }, + { + "epoch": 0.002125053126328158, + "grad_norm": 4.0295000076293945, + "learning_rate": 4.2500000000000006e-07, + "loss": 2.8032, + "step": 170 + }, + { + "epoch": 0.0021500537513437835, + "grad_norm": 10.379382133483887, + "learning_rate": 4.3e-07, + "loss": 2.9539, + "step": 172 + }, + { + "epoch": 0.002175054376359409, + "grad_norm": 10.702367782592773, + "learning_rate": 4.35e-07, + "loss": 1.1856, + "step": 174 + }, + { + "epoch": 0.0022000550013750344, + "grad_norm": 8.943778991699219, + "learning_rate": 4.4e-07, + "loss": 3.5013, + "step": 176 + }, + { + "epoch": 0.00222505562639066, + "grad_norm": 21.945314407348633, + "learning_rate": 4.4500000000000003e-07, + "loss": 1.7304, + "step": 178 + }, + { + "epoch": 0.002250056251406285, + "grad_norm": 6.392901420593262, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.8074, + "step": 180 + }, + { + "epoch": 0.0022750568764219104, + "grad_norm": 7.220922946929932, + "learning_rate": 4.5500000000000004e-07, + "loss": 0.7176, + "step": 182 + }, + { + "epoch": 0.002300057501437536, + "grad_norm": 3.247453212738037, + "learning_rate": 4.6000000000000004e-07, + "loss": 1.4551, + "step": 184 + }, + { + "epoch": 0.0023250581264531614, + "grad_norm": 13.586856842041016, + "learning_rate": 4.6500000000000005e-07, + "loss": 2.642, + "step": 186 + }, + { + "epoch": 0.002350058751468787, + "grad_norm": 8.42886734008789, + "learning_rate": 4.7000000000000005e-07, + "loss": 2.2443, + "step": 188 + }, + { + "epoch": 0.002375059376484412, + "grad_norm": 6.977269649505615, + "learning_rate": 4.7500000000000006e-07, + "loss": 1.3546, + "step": 190 + }, + { + "epoch": 0.0024000600015000374, + "grad_norm": 3.5643160343170166, + "learning_rate": 4.800000000000001e-07, + "loss": 0.7587, + "step": 192 + }, + { + "epoch": 0.002425060626515663, + "grad_norm": 16.670740127563477, + "learning_rate": 4.85e-07, + "loss": 2.3639, + "step": 194 + }, + { + "epoch": 0.0024500612515312883, + "grad_norm": 8.806687355041504, + "learning_rate": 4.900000000000001e-07, + "loss": 2.1679, + "step": 196 + }, + { + "epoch": 0.002475061876546914, + "grad_norm": 8.036782264709473, + "learning_rate": 4.95e-07, + "loss": 1.374, + "step": 198 + }, + { + "epoch": 0.0025000625015625393, + "grad_norm": 3.798902750015259, + "learning_rate": 5.000000000000001e-07, + "loss": 0.182, + "step": 200 + }, + { + "epoch": 0.0025250631265781643, + "grad_norm": 6.094936847686768, + "learning_rate": 5.05e-07, + "loss": 0.5915, + "step": 202 + }, + { + "epoch": 0.00255006375159379, + "grad_norm": 7.284603118896484, + "learning_rate": 5.1e-07, + "loss": 2.0227, + "step": 204 + }, + { + "epoch": 0.0025750643766094153, + "grad_norm": 8.53467845916748, + "learning_rate": 5.15e-07, + "loss": 0.7789, + "step": 206 + }, + { + "epoch": 0.0026000650016250407, + "grad_norm": 4.439601898193359, + "learning_rate": 5.2e-07, + "loss": 0.4206, + "step": 208 + }, + { + "epoch": 0.0026250656266406662, + "grad_norm": 9.506946563720703, + "learning_rate": 5.250000000000001e-07, + "loss": 2.8145, + "step": 210 + }, + { + "epoch": 0.0026500662516562913, + "grad_norm": 4.11452054977417, + "learning_rate": 5.3e-07, + "loss": 0.9603, + "step": 212 + }, + { + "epoch": 0.0026750668766719167, + "grad_norm": 8.581072807312012, + "learning_rate": 5.350000000000001e-07, + "loss": 2.0863, + "step": 214 + }, + { + "epoch": 0.002700067501687542, + "grad_norm": 8.475007057189941, + "learning_rate": 5.4e-07, + "loss": 1.574, + "step": 216 + }, + { + "epoch": 0.0027250681267031677, + "grad_norm": 13.420190811157227, + "learning_rate": 5.450000000000001e-07, + "loss": 1.5178, + "step": 218 + }, + { + "epoch": 0.002750068751718793, + "grad_norm": 13.653470039367676, + "learning_rate": 5.5e-07, + "loss": 2.7023, + "step": 220 + }, + { + "epoch": 0.002775069376734418, + "grad_norm": 3.4038825035095215, + "learning_rate": 5.550000000000001e-07, + "loss": 1.3424, + "step": 222 + }, + { + "epoch": 0.0028000700017500437, + "grad_norm": 11.086280822753906, + "learning_rate": 5.6e-07, + "loss": 1.5798, + "step": 224 + }, + { + "epoch": 0.002825070626765669, + "grad_norm": 3.722223997116089, + "learning_rate": 5.650000000000001e-07, + "loss": 1.8047, + "step": 226 + }, + { + "epoch": 0.0028500712517812946, + "grad_norm": 18.389009475708008, + "learning_rate": 5.7e-07, + "loss": 1.4043, + "step": 228 + }, + { + "epoch": 0.00287507187679692, + "grad_norm": 6.792549133300781, + "learning_rate": 5.750000000000001e-07, + "loss": 1.7856, + "step": 230 + }, + { + "epoch": 0.002900072501812545, + "grad_norm": 10.503623962402344, + "learning_rate": 5.800000000000001e-07, + "loss": 2.1977, + "step": 232 + }, + { + "epoch": 0.0029250731268281706, + "grad_norm": 7.503457546234131, + "learning_rate": 5.850000000000001e-07, + "loss": 2.1474, + "step": 234 + }, + { + "epoch": 0.002950073751843796, + "grad_norm": 6.735136985778809, + "learning_rate": 5.900000000000001e-07, + "loss": 1.0773, + "step": 236 + }, + { + "epoch": 0.0029750743768594216, + "grad_norm": 2.936291456222534, + "learning_rate": 5.95e-07, + "loss": 0.872, + "step": 238 + }, + { + "epoch": 0.003000075001875047, + "grad_norm": 6.5450825691223145, + "learning_rate": 6.000000000000001e-07, + "loss": 0.7117, + "step": 240 + }, + { + "epoch": 0.003025075626890672, + "grad_norm": 8.68979263305664, + "learning_rate": 6.05e-07, + "loss": 2.116, + "step": 242 + }, + { + "epoch": 0.0030500762519062976, + "grad_norm": 8.381768226623535, + "learning_rate": 6.100000000000001e-07, + "loss": 1.657, + "step": 244 + }, + { + "epoch": 0.003075076876921923, + "grad_norm": 3.1986751556396484, + "learning_rate": 6.15e-07, + "loss": 1.3, + "step": 246 + }, + { + "epoch": 0.0031000775019375485, + "grad_norm": 8.349403381347656, + "learning_rate": 6.200000000000001e-07, + "loss": 1.3512, + "step": 248 + }, + { + "epoch": 0.003125078126953174, + "grad_norm": 1.6555742025375366, + "learning_rate": 6.25e-07, + "loss": 1.5399, + "step": 250 + }, + { + "epoch": 0.003150078751968799, + "grad_norm": 8.092061042785645, + "learning_rate": 6.3e-07, + "loss": 0.7617, + "step": 252 + }, + { + "epoch": 0.0031750793769844245, + "grad_norm": 6.886405944824219, + "learning_rate": 6.350000000000001e-07, + "loss": 0.7556, + "step": 254 + }, + { + "epoch": 0.00320008000200005, + "grad_norm": 8.233000755310059, + "learning_rate": 6.4e-07, + "loss": 0.2078, + "step": 256 + }, + { + "epoch": 0.0032250806270156755, + "grad_norm": 14.32558536529541, + "learning_rate": 6.450000000000001e-07, + "loss": 3.3267, + "step": 258 + }, + { + "epoch": 0.003250081252031301, + "grad_norm": 4.809398174285889, + "learning_rate": 6.5e-07, + "loss": 1.7054, + "step": 260 + }, + { + "epoch": 0.003275081877046926, + "grad_norm": 12.562570571899414, + "learning_rate": 6.550000000000001e-07, + "loss": 1.9012, + "step": 262 + }, + { + "epoch": 0.0033000825020625514, + "grad_norm": 11.717635154724121, + "learning_rate": 6.6e-07, + "loss": 1.6971, + "step": 264 + }, + { + "epoch": 0.003325083127078177, + "grad_norm": 8.578349113464355, + "learning_rate": 6.650000000000001e-07, + "loss": 1.4478, + "step": 266 + }, + { + "epoch": 0.0033500837520938024, + "grad_norm": 13.884634017944336, + "learning_rate": 6.7e-07, + "loss": 2.5596, + "step": 268 + }, + { + "epoch": 0.003375084377109428, + "grad_norm": 2.9229888916015625, + "learning_rate": 6.750000000000001e-07, + "loss": 3.2278, + "step": 270 + }, + { + "epoch": 0.003400085002125053, + "grad_norm": 10.349213600158691, + "learning_rate": 6.800000000000001e-07, + "loss": 1.71, + "step": 272 + }, + { + "epoch": 0.0034250856271406784, + "grad_norm": 8.61713981628418, + "learning_rate": 6.850000000000001e-07, + "loss": 1.5798, + "step": 274 + }, + { + "epoch": 0.003450086252156304, + "grad_norm": 2.823537826538086, + "learning_rate": 6.900000000000001e-07, + "loss": 1.3178, + "step": 276 + }, + { + "epoch": 0.0034750868771719293, + "grad_norm": 4.692976474761963, + "learning_rate": 6.950000000000001e-07, + "loss": 1.7855, + "step": 278 + }, + { + "epoch": 0.003500087502187555, + "grad_norm": 6.038337707519531, + "learning_rate": 7.000000000000001e-07, + "loss": 2.0927, + "step": 280 + }, + { + "epoch": 0.0035250881272031803, + "grad_norm": 4.686464309692383, + "learning_rate": 7.05e-07, + "loss": 1.0902, + "step": 282 + }, + { + "epoch": 0.0035500887522188053, + "grad_norm": 3.4980618953704834, + "learning_rate": 7.1e-07, + "loss": 1.2562, + "step": 284 + }, + { + "epoch": 0.003575089377234431, + "grad_norm": 6.342993259429932, + "learning_rate": 7.15e-07, + "loss": 1.6267, + "step": 286 + }, + { + "epoch": 0.0036000900022500563, + "grad_norm": 5.872858047485352, + "learning_rate": 7.2e-07, + "loss": 1.5201, + "step": 288 + }, + { + "epoch": 0.0036250906272656818, + "grad_norm": 15.673853874206543, + "learning_rate": 7.25e-07, + "loss": 1.9658, + "step": 290 + }, + { + "epoch": 0.0036500912522813072, + "grad_norm": 11.721842765808105, + "learning_rate": 7.3e-07, + "loss": 2.5797, + "step": 292 + }, + { + "epoch": 0.0036750918772969323, + "grad_norm": 6.166496753692627, + "learning_rate": 7.350000000000001e-07, + "loss": 0.8161, + "step": 294 + }, + { + "epoch": 0.0037000925023125577, + "grad_norm": 9.478219032287598, + "learning_rate": 7.4e-07, + "loss": 1.8239, + "step": 296 + }, + { + "epoch": 0.0037250931273281832, + "grad_norm": 6.063263416290283, + "learning_rate": 7.450000000000001e-07, + "loss": 0.9695, + "step": 298 + }, + { + "epoch": 0.0037500937523438087, + "grad_norm": 8.272128105163574, + "learning_rate": 7.5e-07, + "loss": 0.9945, + "step": 300 + }, + { + "epoch": 0.003775094377359434, + "grad_norm": 13.598306655883789, + "learning_rate": 7.550000000000001e-07, + "loss": 2.4199, + "step": 302 + }, + { + "epoch": 0.003800095002375059, + "grad_norm": 6.2680182456970215, + "learning_rate": 7.6e-07, + "loss": 2.5816, + "step": 304 + }, + { + "epoch": 0.0038250956273906847, + "grad_norm": 11.7550048828125, + "learning_rate": 7.650000000000001e-07, + "loss": 2.4056, + "step": 306 + }, + { + "epoch": 0.00385009625240631, + "grad_norm": 10.040814399719238, + "learning_rate": 7.7e-07, + "loss": 6.2, + "step": 308 + }, + { + "epoch": 0.0038750968774219356, + "grad_norm": 5.964242458343506, + "learning_rate": 7.750000000000001e-07, + "loss": 0.9787, + "step": 310 + }, + { + "epoch": 0.003900097502437561, + "grad_norm": 3.5168240070343018, + "learning_rate": 7.8e-07, + "loss": 1.0192, + "step": 312 + }, + { + "epoch": 0.003925098127453186, + "grad_norm": 2.7713115215301514, + "learning_rate": 7.850000000000001e-07, + "loss": 0.8154, + "step": 314 + }, + { + "epoch": 0.003950098752468812, + "grad_norm": 9.018936157226562, + "learning_rate": 7.900000000000001e-07, + "loss": 1.9761, + "step": 316 + }, + { + "epoch": 0.003975099377484437, + "grad_norm": 4.354487419128418, + "learning_rate": 7.950000000000001e-07, + "loss": 0.7656, + "step": 318 + }, + { + "epoch": 0.004000100002500062, + "grad_norm": 4.679379463195801, + "learning_rate": 8.000000000000001e-07, + "loss": 2.7064, + "step": 320 + }, + { + "epoch": 0.004025100627515688, + "grad_norm": 46.494964599609375, + "learning_rate": 8.050000000000001e-07, + "loss": 1.9047, + "step": 322 + }, + { + "epoch": 0.004050101252531313, + "grad_norm": 30.388660430908203, + "learning_rate": 8.100000000000001e-07, + "loss": 3.9908, + "step": 324 + }, + { + "epoch": 0.004075101877546939, + "grad_norm": 5.437876224517822, + "learning_rate": 8.150000000000001e-07, + "loss": 0.7249, + "step": 326 + }, + { + "epoch": 0.004100102502562564, + "grad_norm": 14.222411155700684, + "learning_rate": 8.200000000000001e-07, + "loss": 1.2022, + "step": 328 + }, + { + "epoch": 0.004125103127578189, + "grad_norm": 16.85289764404297, + "learning_rate": 8.250000000000001e-07, + "loss": 1.7883, + "step": 330 + }, + { + "epoch": 0.004150103752593815, + "grad_norm": 3.9653873443603516, + "learning_rate": 8.300000000000001e-07, + "loss": 1.1939, + "step": 332 + }, + { + "epoch": 0.00417510437760944, + "grad_norm": 5.823245525360107, + "learning_rate": 8.350000000000002e-07, + "loss": 2.2807, + "step": 334 + }, + { + "epoch": 0.004200105002625066, + "grad_norm": 5.081576824188232, + "learning_rate": 8.400000000000001e-07, + "loss": 0.7584, + "step": 336 + }, + { + "epoch": 0.004225105627640691, + "grad_norm": 7.313120365142822, + "learning_rate": 8.450000000000002e-07, + "loss": 2.4182, + "step": 338 + }, + { + "epoch": 0.004250106252656316, + "grad_norm": 6.43465518951416, + "learning_rate": 8.500000000000001e-07, + "loss": 1.8604, + "step": 340 + }, + { + "epoch": 0.004275106877671942, + "grad_norm": 7.002227783203125, + "learning_rate": 8.550000000000002e-07, + "loss": 1.4556, + "step": 342 + }, + { + "epoch": 0.004300107502687567, + "grad_norm": 5.551004409790039, + "learning_rate": 8.6e-07, + "loss": 2.3573, + "step": 344 + }, + { + "epoch": 0.004325108127703193, + "grad_norm": 2.653164863586426, + "learning_rate": 8.65e-07, + "loss": 1.7408, + "step": 346 + }, + { + "epoch": 0.004350108752718818, + "grad_norm": 11.326807975769043, + "learning_rate": 8.7e-07, + "loss": 1.4879, + "step": 348 + }, + { + "epoch": 0.004375109377734443, + "grad_norm": 4.709672927856445, + "learning_rate": 8.75e-07, + "loss": 1.2884, + "step": 350 + }, + { + "epoch": 0.004400110002750069, + "grad_norm": 11.570570945739746, + "learning_rate": 8.8e-07, + "loss": 1.7833, + "step": 352 + }, + { + "epoch": 0.004425110627765694, + "grad_norm": 2.526841402053833, + "learning_rate": 8.85e-07, + "loss": 1.0093, + "step": 354 + }, + { + "epoch": 0.00445011125278132, + "grad_norm": 11.940034866333008, + "learning_rate": 8.900000000000001e-07, + "loss": 5.8959, + "step": 356 + }, + { + "epoch": 0.004475111877796945, + "grad_norm": 6.214504241943359, + "learning_rate": 8.95e-07, + "loss": 1.392, + "step": 358 + }, + { + "epoch": 0.00450011250281257, + "grad_norm": 3.7771918773651123, + "learning_rate": 9.000000000000001e-07, + "loss": 2.0682, + "step": 360 + }, + { + "epoch": 0.004525113127828196, + "grad_norm": 5.635478973388672, + "learning_rate": 9.05e-07, + "loss": 1.7436, + "step": 362 + }, + { + "epoch": 0.004550113752843821, + "grad_norm": 6.157444953918457, + "learning_rate": 9.100000000000001e-07, + "loss": 2.4436, + "step": 364 + }, + { + "epoch": 0.004575114377859447, + "grad_norm": 12.759936332702637, + "learning_rate": 9.15e-07, + "loss": 0.9732, + "step": 366 + }, + { + "epoch": 0.004600115002875072, + "grad_norm": 3.1614365577697754, + "learning_rate": 9.200000000000001e-07, + "loss": 1.2453, + "step": 368 + }, + { + "epoch": 0.004625115627890697, + "grad_norm": 1.8694403171539307, + "learning_rate": 9.25e-07, + "loss": 0.9357, + "step": 370 + }, + { + "epoch": 0.004650116252906323, + "grad_norm": 7.228161334991455, + "learning_rate": 9.300000000000001e-07, + "loss": 1.6778, + "step": 372 + }, + { + "epoch": 0.004675116877921948, + "grad_norm": 19.838768005371094, + "learning_rate": 9.35e-07, + "loss": 2.2389, + "step": 374 + }, + { + "epoch": 0.004700117502937574, + "grad_norm": 3.5310277938842773, + "learning_rate": 9.400000000000001e-07, + "loss": 1.1192, + "step": 376 + }, + { + "epoch": 0.004725118127953199, + "grad_norm": 5.9445295333862305, + "learning_rate": 9.450000000000001e-07, + "loss": 1.3381, + "step": 378 + }, + { + "epoch": 0.004750118752968824, + "grad_norm": 10.761473655700684, + "learning_rate": 9.500000000000001e-07, + "loss": 2.66, + "step": 380 + }, + { + "epoch": 0.00477511937798445, + "grad_norm": 8.883493423461914, + "learning_rate": 9.550000000000002e-07, + "loss": 1.4983, + "step": 382 + }, + { + "epoch": 0.004800120003000075, + "grad_norm": 4.1593804359436035, + "learning_rate": 9.600000000000001e-07, + "loss": 0.2306, + "step": 384 + }, + { + "epoch": 0.004825120628015701, + "grad_norm": 7.821424961090088, + "learning_rate": 9.65e-07, + "loss": 4.8936, + "step": 386 + }, + { + "epoch": 0.004850121253031326, + "grad_norm": 4.511836528778076, + "learning_rate": 9.7e-07, + "loss": 1.223, + "step": 388 + }, + { + "epoch": 0.004875121878046951, + "grad_norm": 9.362104415893555, + "learning_rate": 9.750000000000002e-07, + "loss": 1.033, + "step": 390 + }, + { + "epoch": 0.004900122503062577, + "grad_norm": 6.17924690246582, + "learning_rate": 9.800000000000001e-07, + "loss": 1.1223, + "step": 392 + }, + { + "epoch": 0.004925123128078202, + "grad_norm": 3.6198747158050537, + "learning_rate": 9.85e-07, + "loss": 0.8841, + "step": 394 + }, + { + "epoch": 0.004950123753093828, + "grad_norm": 4.041724681854248, + "learning_rate": 9.9e-07, + "loss": 1.3778, + "step": 396 + }, + { + "epoch": 0.004975124378109453, + "grad_norm": 15.402488708496094, + "learning_rate": 9.950000000000002e-07, + "loss": 2.0737, + "step": 398 + }, + { + "epoch": 0.0050001250031250786, + "grad_norm": 5.228225231170654, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.6854, + "step": 400 + }, + { + "epoch": 0.005025125628140704, + "grad_norm": 3.868774652481079, + "learning_rate": 1.0050000000000001e-06, + "loss": 2.1385, + "step": 402 + }, + { + "epoch": 0.005050126253156329, + "grad_norm": 3.8328213691711426, + "learning_rate": 1.01e-06, + "loss": 1.3596, + "step": 404 + }, + { + "epoch": 0.0050751268781719545, + "grad_norm": 7.911139488220215, + "learning_rate": 1.0150000000000002e-06, + "loss": 1.4573, + "step": 406 + }, + { + "epoch": 0.00510012750318758, + "grad_norm": 8.13247013092041, + "learning_rate": 1.02e-06, + "loss": 2.0496, + "step": 408 + }, + { + "epoch": 0.0051251281282032055, + "grad_norm": 6.317373275756836, + "learning_rate": 1.025e-06, + "loss": 0.8005, + "step": 410 + }, + { + "epoch": 0.0051501287532188305, + "grad_norm": 8.27928352355957, + "learning_rate": 1.03e-06, + "loss": 1.0288, + "step": 412 + }, + { + "epoch": 0.005175129378234456, + "grad_norm": 2.8289296627044678, + "learning_rate": 1.035e-06, + "loss": 0.9177, + "step": 414 + }, + { + "epoch": 0.0052001300032500815, + "grad_norm": 5.547508716583252, + "learning_rate": 1.04e-06, + "loss": 1.9, + "step": 416 + }, + { + "epoch": 0.0052251306282657065, + "grad_norm": 5.506785869598389, + "learning_rate": 1.045e-06, + "loss": 1.2021, + "step": 418 + }, + { + "epoch": 0.0052501312532813324, + "grad_norm": 20.587112426757812, + "learning_rate": 1.0500000000000001e-06, + "loss": 2.0741, + "step": 420 + }, + { + "epoch": 0.0052751318782969575, + "grad_norm": 4.9863996505737305, + "learning_rate": 1.055e-06, + "loss": 1.0863, + "step": 422 + }, + { + "epoch": 0.0053001325033125825, + "grad_norm": 10.174978256225586, + "learning_rate": 1.06e-06, + "loss": 1.4585, + "step": 424 + }, + { + "epoch": 0.005325133128328208, + "grad_norm": 3.614572763442993, + "learning_rate": 1.065e-06, + "loss": 0.4775, + "step": 426 + }, + { + "epoch": 0.0053501337533438335, + "grad_norm": 3.4551374912261963, + "learning_rate": 1.0700000000000001e-06, + "loss": 1.591, + "step": 428 + }, + { + "epoch": 0.005375134378359459, + "grad_norm": 4.610350608825684, + "learning_rate": 1.075e-06, + "loss": 1.7014, + "step": 430 + }, + { + "epoch": 0.005400135003375084, + "grad_norm": 4.36111307144165, + "learning_rate": 1.08e-06, + "loss": 1.4813, + "step": 432 + }, + { + "epoch": 0.0054251356283907095, + "grad_norm": 10.763853073120117, + "learning_rate": 1.085e-06, + "loss": 1.6313, + "step": 434 + }, + { + "epoch": 0.005450136253406335, + "grad_norm": 5.085967540740967, + "learning_rate": 1.0900000000000002e-06, + "loss": 1.767, + "step": 436 + }, + { + "epoch": 0.00547513687842196, + "grad_norm": 8.775583267211914, + "learning_rate": 1.095e-06, + "loss": 1.2062, + "step": 438 + }, + { + "epoch": 0.005500137503437586, + "grad_norm": 13.859451293945312, + "learning_rate": 1.1e-06, + "loss": 3.2093, + "step": 440 + }, + { + "epoch": 0.005525138128453211, + "grad_norm": 3.8971145153045654, + "learning_rate": 1.105e-06, + "loss": 0.3102, + "step": 442 + }, + { + "epoch": 0.005550138753468836, + "grad_norm": 6.583864212036133, + "learning_rate": 1.1100000000000002e-06, + "loss": 1.9425, + "step": 444 + }, + { + "epoch": 0.005575139378484462, + "grad_norm": 10.968184471130371, + "learning_rate": 1.1150000000000001e-06, + "loss": 1.1159, + "step": 446 + }, + { + "epoch": 0.005600140003500087, + "grad_norm": 5.108273983001709, + "learning_rate": 1.12e-06, + "loss": 1.6558, + "step": 448 + }, + { + "epoch": 0.005625140628515713, + "grad_norm": 10.358725547790527, + "learning_rate": 1.125e-06, + "loss": 2.8564, + "step": 450 + }, + { + "epoch": 0.005650141253531338, + "grad_norm": 10.342994689941406, + "learning_rate": 1.1300000000000002e-06, + "loss": 1.3708, + "step": 452 + }, + { + "epoch": 0.005675141878546963, + "grad_norm": 11.328195571899414, + "learning_rate": 1.1350000000000001e-06, + "loss": 1.543, + "step": 454 + }, + { + "epoch": 0.005700142503562589, + "grad_norm": 3.854583263397217, + "learning_rate": 1.14e-06, + "loss": 1.4969, + "step": 456 + }, + { + "epoch": 0.005725143128578214, + "grad_norm": 5.283221244812012, + "learning_rate": 1.145e-06, + "loss": 4.5461, + "step": 458 + }, + { + "epoch": 0.00575014375359384, + "grad_norm": 5.430889129638672, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.1141, + "step": 460 + }, + { + "epoch": 0.005775144378609465, + "grad_norm": 3.587857484817505, + "learning_rate": 1.1550000000000002e-06, + "loss": 1.0674, + "step": 462 + }, + { + "epoch": 0.00580014500362509, + "grad_norm": 7.317820072174072, + "learning_rate": 1.1600000000000001e-06, + "loss": 2.0334, + "step": 464 + }, + { + "epoch": 0.005825145628640716, + "grad_norm": 21.977798461914062, + "learning_rate": 1.165e-06, + "loss": 0.9824, + "step": 466 + }, + { + "epoch": 0.005850146253656341, + "grad_norm": 1.979722023010254, + "learning_rate": 1.1700000000000002e-06, + "loss": 0.939, + "step": 468 + }, + { + "epoch": 0.005875146878671967, + "grad_norm": 8.561959266662598, + "learning_rate": 1.175e-06, + "loss": 2.2519, + "step": 470 + }, + { + "epoch": 0.005900147503687592, + "grad_norm": 3.572202682495117, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.3357, + "step": 472 + }, + { + "epoch": 0.005925148128703217, + "grad_norm": 13.112444877624512, + "learning_rate": 1.185e-06, + "loss": 0.5918, + "step": 474 + }, + { + "epoch": 0.005950148753718843, + "grad_norm": 6.500751972198486, + "learning_rate": 1.19e-06, + "loss": 0.8054, + "step": 476 + }, + { + "epoch": 0.005975149378734468, + "grad_norm": 11.101346969604492, + "learning_rate": 1.195e-06, + "loss": 2.0138, + "step": 478 + }, + { + "epoch": 0.006000150003750094, + "grad_norm": 4.307151794433594, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.1459, + "step": 480 + }, + { + "epoch": 0.006025150628765719, + "grad_norm": 33.30113220214844, + "learning_rate": 1.2050000000000001e-06, + "loss": 3.4699, + "step": 482 + }, + { + "epoch": 0.006050151253781344, + "grad_norm": 5.7049665451049805, + "learning_rate": 1.21e-06, + "loss": 1.1121, + "step": 484 + }, + { + "epoch": 0.00607515187879697, + "grad_norm": 4.785853385925293, + "learning_rate": 1.215e-06, + "loss": 1.7469, + "step": 486 + }, + { + "epoch": 0.006100152503812595, + "grad_norm": 14.286933898925781, + "learning_rate": 1.2200000000000002e-06, + "loss": 2.6714, + "step": 488 + }, + { + "epoch": 0.006125153128828221, + "grad_norm": 9.786770820617676, + "learning_rate": 1.2250000000000001e-06, + "loss": 2.2167, + "step": 490 + }, + { + "epoch": 0.006150153753843846, + "grad_norm": 6.505812168121338, + "learning_rate": 1.23e-06, + "loss": 2.0869, + "step": 492 + }, + { + "epoch": 0.006175154378859471, + "grad_norm": 17.94192123413086, + "learning_rate": 1.235e-06, + "loss": 1.1309, + "step": 494 + }, + { + "epoch": 0.006200155003875097, + "grad_norm": 3.3607358932495117, + "learning_rate": 1.2400000000000002e-06, + "loss": 0.8632, + "step": 496 + }, + { + "epoch": 0.006225155628890722, + "grad_norm": 17.593101501464844, + "learning_rate": 1.2450000000000002e-06, + "loss": 2.5768, + "step": 498 + }, + { + "epoch": 0.006250156253906348, + "grad_norm": 6.647007465362549, + "learning_rate": 1.25e-06, + "loss": 1.3838, + "step": 500 + }, + { + "epoch": 0.006275156878921973, + "grad_norm": 2.627236843109131, + "learning_rate": 1.255e-06, + "loss": 1.5962, + "step": 502 + }, + { + "epoch": 0.006300157503937598, + "grad_norm": 7.892225742340088, + "learning_rate": 1.26e-06, + "loss": 1.7434, + "step": 504 + }, + { + "epoch": 0.006325158128953224, + "grad_norm": 0.8347011804580688, + "learning_rate": 1.2650000000000002e-06, + "loss": 1.0636, + "step": 506 + }, + { + "epoch": 0.006350158753968849, + "grad_norm": 5.538966655731201, + "learning_rate": 1.2700000000000001e-06, + "loss": 0.7759, + "step": 508 + }, + { + "epoch": 0.006375159378984475, + "grad_norm": 6.003100872039795, + "learning_rate": 1.275e-06, + "loss": 2.0137, + "step": 510 + }, + { + "epoch": 0.0064001600040001, + "grad_norm": 4.337047100067139, + "learning_rate": 1.28e-06, + "loss": 1.4915, + "step": 512 + }, + { + "epoch": 0.006425160629015725, + "grad_norm": 7.312321662902832, + "learning_rate": 1.2850000000000002e-06, + "loss": 0.3707, + "step": 514 + }, + { + "epoch": 0.006450161254031351, + "grad_norm": 5.820616245269775, + "learning_rate": 1.2900000000000001e-06, + "loss": 0.551, + "step": 516 + }, + { + "epoch": 0.006475161879046976, + "grad_norm": 5.87538480758667, + "learning_rate": 1.295e-06, + "loss": 1.1051, + "step": 518 + }, + { + "epoch": 0.006500162504062602, + "grad_norm": 6.441608428955078, + "learning_rate": 1.3e-06, + "loss": 2.158, + "step": 520 + }, + { + "epoch": 0.006525163129078227, + "grad_norm": 2.904430866241455, + "learning_rate": 1.3050000000000002e-06, + "loss": 0.7867, + "step": 522 + }, + { + "epoch": 0.006550163754093852, + "grad_norm": 4.0248847007751465, + "learning_rate": 1.3100000000000002e-06, + "loss": 1.292, + "step": 524 + }, + { + "epoch": 0.006575164379109478, + "grad_norm": 5.7484211921691895, + "learning_rate": 1.3150000000000001e-06, + "loss": 1.7676, + "step": 526 + }, + { + "epoch": 0.006600165004125103, + "grad_norm": 5.288037300109863, + "learning_rate": 1.32e-06, + "loss": 0.8018, + "step": 528 + }, + { + "epoch": 0.006625165629140729, + "grad_norm": 4.952310085296631, + "learning_rate": 1.3250000000000002e-06, + "loss": 1.7474, + "step": 530 + }, + { + "epoch": 0.006650166254156354, + "grad_norm": 8.312854766845703, + "learning_rate": 1.3300000000000002e-06, + "loss": 1.2259, + "step": 532 + }, + { + "epoch": 0.006675166879171979, + "grad_norm": 1.0683197975158691, + "learning_rate": 1.3350000000000001e-06, + "loss": 0.5158, + "step": 534 + }, + { + "epoch": 0.006700167504187605, + "grad_norm": 3.465127944946289, + "learning_rate": 1.34e-06, + "loss": 0.1919, + "step": 536 + }, + { + "epoch": 0.00672516812920323, + "grad_norm": 4.5318284034729, + "learning_rate": 1.3450000000000003e-06, + "loss": 1.4503, + "step": 538 + }, + { + "epoch": 0.006750168754218856, + "grad_norm": 6.064114093780518, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.2118, + "step": 540 + }, + { + "epoch": 0.006775169379234481, + "grad_norm": 4.092135906219482, + "learning_rate": 1.3550000000000002e-06, + "loss": 1.5028, + "step": 542 + }, + { + "epoch": 0.006800170004250106, + "grad_norm": 5.132049560546875, + "learning_rate": 1.3600000000000001e-06, + "loss": 1.382, + "step": 544 + }, + { + "epoch": 0.006825170629265732, + "grad_norm": 6.346365928649902, + "learning_rate": 1.3650000000000003e-06, + "loss": 0.4173, + "step": 546 + }, + { + "epoch": 0.006850171254281357, + "grad_norm": 3.279660701751709, + "learning_rate": 1.3700000000000002e-06, + "loss": 0.9729, + "step": 548 + }, + { + "epoch": 0.006875171879296983, + "grad_norm": 4.9538254737854, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.9624, + "step": 550 + }, + { + "epoch": 0.006900172504312608, + "grad_norm": 4.265155792236328, + "learning_rate": 1.3800000000000001e-06, + "loss": 1.4648, + "step": 552 + }, + { + "epoch": 0.006925173129328234, + "grad_norm": 5.316565990447998, + "learning_rate": 1.3850000000000003e-06, + "loss": 1.4558, + "step": 554 + }, + { + "epoch": 0.006950173754343859, + "grad_norm": 5.798628330230713, + "learning_rate": 1.3900000000000002e-06, + "loss": 1.5937, + "step": 556 + }, + { + "epoch": 0.006975174379359484, + "grad_norm": 5.853082180023193, + "learning_rate": 1.3950000000000002e-06, + "loss": 1.7947, + "step": 558 + }, + { + "epoch": 0.00700017500437511, + "grad_norm": 5.917539119720459, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.1862, + "step": 560 + }, + { + "epoch": 0.007025175629390735, + "grad_norm": 5.523014068603516, + "learning_rate": 1.4050000000000003e-06, + "loss": 1.7219, + "step": 562 + }, + { + "epoch": 0.007050176254406361, + "grad_norm": 4.503792762756348, + "learning_rate": 1.41e-06, + "loss": 1.3286, + "step": 564 + }, + { + "epoch": 0.007075176879421986, + "grad_norm": 7.250819206237793, + "learning_rate": 1.415e-06, + "loss": 2.4015, + "step": 566 + }, + { + "epoch": 0.007100177504437611, + "grad_norm": 5.084353923797607, + "learning_rate": 1.42e-06, + "loss": 1.3148, + "step": 568 + }, + { + "epoch": 0.0071251781294532366, + "grad_norm": 7.611924648284912, + "learning_rate": 1.425e-06, + "loss": 0.9332, + "step": 570 + }, + { + "epoch": 0.007150178754468862, + "grad_norm": 7.6248321533203125, + "learning_rate": 1.43e-06, + "loss": 0.5842, + "step": 572 + }, + { + "epoch": 0.0071751793794844875, + "grad_norm": 4.9991888999938965, + "learning_rate": 1.435e-06, + "loss": 2.1038, + "step": 574 + }, + { + "epoch": 0.0072001800045001126, + "grad_norm": 2.5932374000549316, + "learning_rate": 1.44e-06, + "loss": 0.0566, + "step": 576 + }, + { + "epoch": 0.007225180629515738, + "grad_norm": 9.56359577178955, + "learning_rate": 1.445e-06, + "loss": 0.3889, + "step": 578 + }, + { + "epoch": 0.0072501812545313635, + "grad_norm": 4.7285685539245605, + "learning_rate": 1.45e-06, + "loss": 1.044, + "step": 580 + }, + { + "epoch": 0.0072751818795469885, + "grad_norm": 4.471067905426025, + "learning_rate": 1.455e-06, + "loss": 1.3358, + "step": 582 + }, + { + "epoch": 0.0073001825045626145, + "grad_norm": 7.512258052825928, + "learning_rate": 1.46e-06, + "loss": 2.8928, + "step": 584 + }, + { + "epoch": 0.0073251831295782395, + "grad_norm": 4.855992317199707, + "learning_rate": 1.465e-06, + "loss": 1.2245, + "step": 586 + }, + { + "epoch": 0.0073501837545938645, + "grad_norm": 5.347456455230713, + "learning_rate": 1.4700000000000001e-06, + "loss": 0.5532, + "step": 588 + }, + { + "epoch": 0.0073751843796094904, + "grad_norm": 6.008098125457764, + "learning_rate": 1.475e-06, + "loss": 1.2851, + "step": 590 + }, + { + "epoch": 0.0074001850046251155, + "grad_norm": 6.099523544311523, + "learning_rate": 1.48e-06, + "loss": 0.844, + "step": 592 + }, + { + "epoch": 0.007425185629640741, + "grad_norm": 13.111811637878418, + "learning_rate": 1.485e-06, + "loss": 0.587, + "step": 594 + }, + { + "epoch": 0.0074501862546563664, + "grad_norm": 3.3862411975860596, + "learning_rate": 1.4900000000000001e-06, + "loss": 1.0881, + "step": 596 + }, + { + "epoch": 0.0074751868796719915, + "grad_norm": 4.262303829193115, + "learning_rate": 1.495e-06, + "loss": 1.3302, + "step": 598 + }, + { + "epoch": 0.007500187504687617, + "grad_norm": 6.9670891761779785, + "learning_rate": 1.5e-06, + "loss": 0.7781, + "step": 600 + }, + { + "epoch": 0.007525188129703242, + "grad_norm": 4.490973949432373, + "learning_rate": 1.505e-06, + "loss": 0.7733, + "step": 602 + }, + { + "epoch": 0.007550188754718868, + "grad_norm": 4.654324531555176, + "learning_rate": 1.5100000000000002e-06, + "loss": 1.6913, + "step": 604 + }, + { + "epoch": 0.007575189379734493, + "grad_norm": 2.579848527908325, + "learning_rate": 1.5150000000000001e-06, + "loss": 0.6833, + "step": 606 + }, + { + "epoch": 0.007600190004750118, + "grad_norm": 6.991676330566406, + "learning_rate": 1.52e-06, + "loss": 1.2766, + "step": 608 + }, + { + "epoch": 0.007625190629765744, + "grad_norm": 4.186803340911865, + "learning_rate": 1.525e-06, + "loss": 1.463, + "step": 610 + }, + { + "epoch": 0.007650191254781369, + "grad_norm": 4.537936210632324, + "learning_rate": 1.5300000000000002e-06, + "loss": 1.1111, + "step": 612 + }, + { + "epoch": 0.007675191879796995, + "grad_norm": 15.422035217285156, + "learning_rate": 1.5350000000000001e-06, + "loss": 1.0212, + "step": 614 + }, + { + "epoch": 0.00770019250481262, + "grad_norm": 8.738069534301758, + "learning_rate": 1.54e-06, + "loss": 0.7057, + "step": 616 + }, + { + "epoch": 0.007725193129828245, + "grad_norm": 6.117671489715576, + "learning_rate": 1.545e-06, + "loss": 1.6551, + "step": 618 + }, + { + "epoch": 0.007750193754843871, + "grad_norm": 9.27519416809082, + "learning_rate": 1.5500000000000002e-06, + "loss": 2.3343, + "step": 620 + }, + { + "epoch": 0.007775194379859496, + "grad_norm": 6.824201583862305, + "learning_rate": 1.5550000000000001e-06, + "loss": 1.6888, + "step": 622 + }, + { + "epoch": 0.007800195004875122, + "grad_norm": 6.599346160888672, + "learning_rate": 1.56e-06, + "loss": 1.2648, + "step": 624 + }, + { + "epoch": 0.007825195629890747, + "grad_norm": 10.640985488891602, + "learning_rate": 1.565e-06, + "loss": 1.1629, + "step": 626 + }, + { + "epoch": 0.007850196254906372, + "grad_norm": 3.08182692527771, + "learning_rate": 1.5700000000000002e-06, + "loss": 1.2796, + "step": 628 + }, + { + "epoch": 0.007875196879921997, + "grad_norm": 5.700413227081299, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.97, + "step": 630 + }, + { + "epoch": 0.007900197504937624, + "grad_norm": 4.171158313751221, + "learning_rate": 1.5800000000000001e-06, + "loss": 1.7143, + "step": 632 + }, + { + "epoch": 0.00792519812995325, + "grad_norm": 8.289078712463379, + "learning_rate": 1.585e-06, + "loss": 2.4225, + "step": 634 + }, + { + "epoch": 0.007950198754968874, + "grad_norm": 5.203309059143066, + "learning_rate": 1.5900000000000002e-06, + "loss": 1.6301, + "step": 636 + }, + { + "epoch": 0.0079751993799845, + "grad_norm": 4.121235370635986, + "learning_rate": 1.5950000000000002e-06, + "loss": 0.9198, + "step": 638 + }, + { + "epoch": 0.008000200005000124, + "grad_norm": 3.1465415954589844, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.5194, + "step": 640 + }, + { + "epoch": 0.008025200630015751, + "grad_norm": 4.335588455200195, + "learning_rate": 1.605e-06, + "loss": 1.2643, + "step": 642 + }, + { + "epoch": 0.008050201255031376, + "grad_norm": 7.183367729187012, + "learning_rate": 1.6100000000000003e-06, + "loss": 0.8657, + "step": 644 + }, + { + "epoch": 0.008075201880047001, + "grad_norm": 2.8048744201660156, + "learning_rate": 1.6150000000000002e-06, + "loss": 0.6697, + "step": 646 + }, + { + "epoch": 0.008100202505062626, + "grad_norm": 2.506573438644409, + "learning_rate": 1.6200000000000002e-06, + "loss": 0.4757, + "step": 648 + }, + { + "epoch": 0.008125203130078251, + "grad_norm": 5.256616592407227, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.6025, + "step": 650 + }, + { + "epoch": 0.008150203755093878, + "grad_norm": 0.8061256408691406, + "learning_rate": 1.6300000000000003e-06, + "loss": 0.3725, + "step": 652 + }, + { + "epoch": 0.008175204380109503, + "grad_norm": 3.9800705909729004, + "learning_rate": 1.6350000000000002e-06, + "loss": 1.0366, + "step": 654 + }, + { + "epoch": 0.008200205005125128, + "grad_norm": 1.0533578395843506, + "learning_rate": 1.6400000000000002e-06, + "loss": 0.1994, + "step": 656 + }, + { + "epoch": 0.008225205630140753, + "grad_norm": 3.69706130027771, + "learning_rate": 1.6450000000000001e-06, + "loss": 1.7964, + "step": 658 + }, + { + "epoch": 0.008250206255156378, + "grad_norm": 4.251678466796875, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.3015, + "step": 660 + }, + { + "epoch": 0.008275206880172005, + "grad_norm": 7.5849080085754395, + "learning_rate": 1.6550000000000002e-06, + "loss": 2.6218, + "step": 662 + }, + { + "epoch": 0.00830020750518763, + "grad_norm": 0.8258697390556335, + "learning_rate": 1.6600000000000002e-06, + "loss": 0.6527, + "step": 664 + }, + { + "epoch": 0.008325208130203255, + "grad_norm": 2.919700860977173, + "learning_rate": 1.6650000000000002e-06, + "loss": 1.7214, + "step": 666 + }, + { + "epoch": 0.00835020875521888, + "grad_norm": 6.385138511657715, + "learning_rate": 1.6700000000000003e-06, + "loss": 1.8234, + "step": 668 + }, + { + "epoch": 0.008375209380234505, + "grad_norm": 2.9821155071258545, + "learning_rate": 1.6750000000000003e-06, + "loss": 1.6259, + "step": 670 + }, + { + "epoch": 0.008400210005250132, + "grad_norm": 3.827873468399048, + "learning_rate": 1.6800000000000002e-06, + "loss": 1.2892, + "step": 672 + }, + { + "epoch": 0.008425210630265757, + "grad_norm": 3.3445932865142822, + "learning_rate": 1.6850000000000002e-06, + "loss": 2.0928, + "step": 674 + }, + { + "epoch": 0.008450211255281382, + "grad_norm": 4.439575672149658, + "learning_rate": 1.6900000000000003e-06, + "loss": 0.8951, + "step": 676 + }, + { + "epoch": 0.008475211880297007, + "grad_norm": 3.614839792251587, + "learning_rate": 1.6950000000000003e-06, + "loss": 1.3391, + "step": 678 + }, + { + "epoch": 0.008500212505312632, + "grad_norm": 5.783501148223877, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.6656, + "step": 680 + }, + { + "epoch": 0.008525213130328259, + "grad_norm": 2.859832763671875, + "learning_rate": 1.7050000000000002e-06, + "loss": 1.2557, + "step": 682 + }, + { + "epoch": 0.008550213755343884, + "grad_norm": 0.26662182807922363, + "learning_rate": 1.7100000000000004e-06, + "loss": 0.4091, + "step": 684 + }, + { + "epoch": 0.008575214380359509, + "grad_norm": 4.332157611846924, + "learning_rate": 1.7150000000000003e-06, + "loss": 2.6103, + "step": 686 + }, + { + "epoch": 0.008600215005375134, + "grad_norm": 0.5395259261131287, + "learning_rate": 1.72e-06, + "loss": 0.5641, + "step": 688 + }, + { + "epoch": 0.008625215630390759, + "grad_norm": 8.743202209472656, + "learning_rate": 1.725e-06, + "loss": 0.8723, + "step": 690 + }, + { + "epoch": 0.008650216255406386, + "grad_norm": 1.033158540725708, + "learning_rate": 1.73e-06, + "loss": 1.0801, + "step": 692 + }, + { + "epoch": 0.00867521688042201, + "grad_norm": 2.683718681335449, + "learning_rate": 1.7350000000000001e-06, + "loss": 0.714, + "step": 694 + }, + { + "epoch": 0.008700217505437636, + "grad_norm": 7.722630500793457, + "learning_rate": 1.74e-06, + "loss": 1.2563, + "step": 696 + }, + { + "epoch": 0.008725218130453261, + "grad_norm": 4.934688568115234, + "learning_rate": 1.745e-06, + "loss": 1.9024, + "step": 698 + }, + { + "epoch": 0.008750218755468886, + "grad_norm": 5.102988243103027, + "learning_rate": 1.75e-06, + "loss": 1.1822, + "step": 700 + }, + { + "epoch": 0.008775219380484513, + "grad_norm": 3.996642827987671, + "learning_rate": 1.7550000000000001e-06, + "loss": 1.0211, + "step": 702 + }, + { + "epoch": 0.008800220005500138, + "grad_norm": 5.963805198669434, + "learning_rate": 1.76e-06, + "loss": 1.4326, + "step": 704 + }, + { + "epoch": 0.008825220630515763, + "grad_norm": 6.239928245544434, + "learning_rate": 1.765e-06, + "loss": 1.6553, + "step": 706 + }, + { + "epoch": 0.008850221255531388, + "grad_norm": 5.7539143562316895, + "learning_rate": 1.77e-06, + "loss": 2.0468, + "step": 708 + }, + { + "epoch": 0.008875221880547013, + "grad_norm": 4.368383407592773, + "learning_rate": 1.7750000000000002e-06, + "loss": 1.4361, + "step": 710 + }, + { + "epoch": 0.00890022250556264, + "grad_norm": 4.084403038024902, + "learning_rate": 1.7800000000000001e-06, + "loss": 1.1644, + "step": 712 + }, + { + "epoch": 0.008925223130578265, + "grad_norm": 3.7587578296661377, + "learning_rate": 1.785e-06, + "loss": 1.534, + "step": 714 + }, + { + "epoch": 0.00895022375559389, + "grad_norm": 12.978184700012207, + "learning_rate": 1.79e-06, + "loss": 1.8533, + "step": 716 + }, + { + "epoch": 0.008975224380609515, + "grad_norm": 9.845887184143066, + "learning_rate": 1.7950000000000002e-06, + "loss": 1.1487, + "step": 718 + }, + { + "epoch": 0.00900022500562514, + "grad_norm": 5.7648468017578125, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.4371, + "step": 720 + }, + { + "epoch": 0.009025225630640767, + "grad_norm": 3.760190486907959, + "learning_rate": 1.805e-06, + "loss": 0.4344, + "step": 722 + }, + { + "epoch": 0.009050226255656392, + "grad_norm": 4.998908996582031, + "learning_rate": 1.81e-06, + "loss": 1.1268, + "step": 724 + }, + { + "epoch": 0.009075226880672017, + "grad_norm": 7.742924690246582, + "learning_rate": 1.8150000000000002e-06, + "loss": 0.6189, + "step": 726 + }, + { + "epoch": 0.009100227505687642, + "grad_norm": 5.8751397132873535, + "learning_rate": 1.8200000000000002e-06, + "loss": 1.0597, + "step": 728 + }, + { + "epoch": 0.009125228130703267, + "grad_norm": 3.819308280944824, + "learning_rate": 1.825e-06, + "loss": 2.2132, + "step": 730 + }, + { + "epoch": 0.009150228755718894, + "grad_norm": 0.23192673921585083, + "learning_rate": 1.83e-06, + "loss": 0.0057, + "step": 732 + }, + { + "epoch": 0.009175229380734519, + "grad_norm": 3.653209924697876, + "learning_rate": 1.8350000000000002e-06, + "loss": 0.6894, + "step": 734 + }, + { + "epoch": 0.009200230005750144, + "grad_norm": 5.011002063751221, + "learning_rate": 1.8400000000000002e-06, + "loss": 1.8376, + "step": 736 + }, + { + "epoch": 0.009225230630765769, + "grad_norm": 11.620277404785156, + "learning_rate": 1.8450000000000001e-06, + "loss": 1.8572, + "step": 738 + }, + { + "epoch": 0.009250231255781394, + "grad_norm": 3.8466925621032715, + "learning_rate": 1.85e-06, + "loss": 0.728, + "step": 740 + }, + { + "epoch": 0.00927523188079702, + "grad_norm": 1.199435830116272, + "learning_rate": 1.8550000000000002e-06, + "loss": 0.943, + "step": 742 + }, + { + "epoch": 0.009300232505812646, + "grad_norm": 0.6834016442298889, + "learning_rate": 1.8600000000000002e-06, + "loss": 0.5071, + "step": 744 + }, + { + "epoch": 0.00932523313082827, + "grad_norm": 17.77370262145996, + "learning_rate": 1.8650000000000001e-06, + "loss": 2.395, + "step": 746 + }, + { + "epoch": 0.009350233755843896, + "grad_norm": 3.1378698348999023, + "learning_rate": 1.87e-06, + "loss": 1.3104, + "step": 748 + }, + { + "epoch": 0.00937523438085952, + "grad_norm": 5.31392765045166, + "learning_rate": 1.8750000000000003e-06, + "loss": 1.2364, + "step": 750 + }, + { + "epoch": 0.009400235005875147, + "grad_norm": 0.21278852224349976, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.6353, + "step": 752 + }, + { + "epoch": 0.009425235630890772, + "grad_norm": 1.9773966073989868, + "learning_rate": 1.8850000000000002e-06, + "loss": 1.1431, + "step": 754 + }, + { + "epoch": 0.009450236255906398, + "grad_norm": 4.1202473640441895, + "learning_rate": 1.8900000000000001e-06, + "loss": 0.3373, + "step": 756 + }, + { + "epoch": 0.009475236880922023, + "grad_norm": 3.57170033454895, + "learning_rate": 1.895e-06, + "loss": 0.8217, + "step": 758 + }, + { + "epoch": 0.009500237505937648, + "grad_norm": 5.236710548400879, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.35, + "step": 760 + }, + { + "epoch": 0.009525238130953274, + "grad_norm": 0.34660035371780396, + "learning_rate": 1.9050000000000002e-06, + "loss": 0.6903, + "step": 762 + }, + { + "epoch": 0.0095502387559689, + "grad_norm": 3.407133102416992, + "learning_rate": 1.9100000000000003e-06, + "loss": 0.5382, + "step": 764 + }, + { + "epoch": 0.009575239380984524, + "grad_norm": 4.042250633239746, + "learning_rate": 1.9150000000000003e-06, + "loss": 1.3548, + "step": 766 + }, + { + "epoch": 0.00960024000600015, + "grad_norm": 3.9384372234344482, + "learning_rate": 1.9200000000000003e-06, + "loss": 1.5146, + "step": 768 + }, + { + "epoch": 0.009625240631015775, + "grad_norm": 12.852288246154785, + "learning_rate": 1.925e-06, + "loss": 0.8815, + "step": 770 + }, + { + "epoch": 0.009650241256031401, + "grad_norm": 4.059468746185303, + "learning_rate": 1.93e-06, + "loss": 1.1549, + "step": 772 + }, + { + "epoch": 0.009675241881047026, + "grad_norm": 3.1180481910705566, + "learning_rate": 1.935e-06, + "loss": 1.3414, + "step": 774 + }, + { + "epoch": 0.009700242506062651, + "grad_norm": 13.970375061035156, + "learning_rate": 1.94e-06, + "loss": 1.3807, + "step": 776 + }, + { + "epoch": 0.009725243131078276, + "grad_norm": 0.22146618366241455, + "learning_rate": 1.945e-06, + "loss": 1.5227, + "step": 778 + }, + { + "epoch": 0.009750243756093901, + "grad_norm": 0.41104286909103394, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.9061, + "step": 780 + }, + { + "epoch": 0.009775244381109528, + "grad_norm": 0.18161647021770477, + "learning_rate": 1.9550000000000003e-06, + "loss": 0.8113, + "step": 782 + }, + { + "epoch": 0.009800245006125153, + "grad_norm": 3.5634982585906982, + "learning_rate": 1.9600000000000003e-06, + "loss": 1.424, + "step": 784 + }, + { + "epoch": 0.009825245631140778, + "grad_norm": 5.891641139984131, + "learning_rate": 1.9650000000000002e-06, + "loss": 0.4904, + "step": 786 + }, + { + "epoch": 0.009850246256156403, + "grad_norm": 12.032763481140137, + "learning_rate": 1.97e-06, + "loss": 1.6748, + "step": 788 + }, + { + "epoch": 0.00987524688117203, + "grad_norm": 4.118125915527344, + "learning_rate": 1.975e-06, + "loss": 0.6602, + "step": 790 + }, + { + "epoch": 0.009900247506187655, + "grad_norm": 4.380794048309326, + "learning_rate": 1.98e-06, + "loss": 0.5172, + "step": 792 + }, + { + "epoch": 0.00992524813120328, + "grad_norm": 3.5340628623962402, + "learning_rate": 1.985e-06, + "loss": 0.1781, + "step": 794 + }, + { + "epoch": 0.009950248756218905, + "grad_norm": 3.2963688373565674, + "learning_rate": 1.9900000000000004e-06, + "loss": 0.3872, + "step": 796 + }, + { + "epoch": 0.00997524938123453, + "grad_norm": 3.995626926422119, + "learning_rate": 1.9950000000000004e-06, + "loss": 1.0936, + "step": 798 + }, + { + "epoch": 0.010000250006250157, + "grad_norm": 2.6736490726470947, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.115, + "step": 800 + }, + { + "epoch": 0.010025250631265782, + "grad_norm": 3.1102588176727295, + "learning_rate": 2.0050000000000003e-06, + "loss": 0.9541, + "step": 802 + }, + { + "epoch": 0.010050251256281407, + "grad_norm": 3.1124751567840576, + "learning_rate": 2.0100000000000002e-06, + "loss": 1.4004, + "step": 804 + }, + { + "epoch": 0.010075251881297032, + "grad_norm": 3.863781213760376, + "learning_rate": 2.015e-06, + "loss": 0.8348, + "step": 806 + }, + { + "epoch": 0.010100252506312657, + "grad_norm": 3.337034225463867, + "learning_rate": 2.02e-06, + "loss": 0.5222, + "step": 808 + }, + { + "epoch": 0.010125253131328284, + "grad_norm": 8.514750480651855, + "learning_rate": 2.025e-06, + "loss": 1.3634, + "step": 810 + }, + { + "epoch": 0.010150253756343909, + "grad_norm": 6.084957122802734, + "learning_rate": 2.0300000000000005e-06, + "loss": 0.8715, + "step": 812 + }, + { + "epoch": 0.010175254381359534, + "grad_norm": 3.255967855453491, + "learning_rate": 2.035e-06, + "loss": 1.5972, + "step": 814 + }, + { + "epoch": 0.01020025500637516, + "grad_norm": 5.915607929229736, + "learning_rate": 2.04e-06, + "loss": 1.3685, + "step": 816 + }, + { + "epoch": 0.010225255631390784, + "grad_norm": 3.348132371902466, + "learning_rate": 2.045e-06, + "loss": 0.5846, + "step": 818 + }, + { + "epoch": 0.010250256256406411, + "grad_norm": 6.627892017364502, + "learning_rate": 2.05e-06, + "loss": 1.3934, + "step": 820 + }, + { + "epoch": 0.010275256881422036, + "grad_norm": 4.578691482543945, + "learning_rate": 2.0550000000000002e-06, + "loss": 1.3853, + "step": 822 + }, + { + "epoch": 0.010300257506437661, + "grad_norm": 12.857481956481934, + "learning_rate": 2.06e-06, + "loss": 3.443, + "step": 824 + }, + { + "epoch": 0.010325258131453286, + "grad_norm": 0.8202765583992004, + "learning_rate": 2.065e-06, + "loss": 0.1688, + "step": 826 + }, + { + "epoch": 0.010350258756468911, + "grad_norm": 3.618351697921753, + "learning_rate": 2.07e-06, + "loss": 1.2454, + "step": 828 + }, + { + "epoch": 0.010375259381484538, + "grad_norm": 3.2046687602996826, + "learning_rate": 2.075e-06, + "loss": 0.9207, + "step": 830 + }, + { + "epoch": 0.010400260006500163, + "grad_norm": 4.173259258270264, + "learning_rate": 2.08e-06, + "loss": 2.7434, + "step": 832 + }, + { + "epoch": 0.010425260631515788, + "grad_norm": 10.082711219787598, + "learning_rate": 2.085e-06, + "loss": 1.1723, + "step": 834 + }, + { + "epoch": 0.010450261256531413, + "grad_norm": 5.8999552726745605, + "learning_rate": 2.09e-06, + "loss": 1.4379, + "step": 836 + }, + { + "epoch": 0.010475261881547038, + "grad_norm": 4.135981559753418, + "learning_rate": 2.0950000000000003e-06, + "loss": 0.6508, + "step": 838 + }, + { + "epoch": 0.010500262506562665, + "grad_norm": 3.899548292160034, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.8831, + "step": 840 + }, + { + "epoch": 0.01052526313157829, + "grad_norm": 4.531285762786865, + "learning_rate": 2.105e-06, + "loss": 0.803, + "step": 842 + }, + { + "epoch": 0.010550263756593915, + "grad_norm": 12.343222618103027, + "learning_rate": 2.11e-06, + "loss": 0.9256, + "step": 844 + }, + { + "epoch": 0.01057526438160954, + "grad_norm": 3.7680232524871826, + "learning_rate": 2.115e-06, + "loss": 1.7116, + "step": 846 + }, + { + "epoch": 0.010600265006625165, + "grad_norm": 4.3078460693359375, + "learning_rate": 2.12e-06, + "loss": 1.9364, + "step": 848 + }, + { + "epoch": 0.010625265631640792, + "grad_norm": 11.737281799316406, + "learning_rate": 2.125e-06, + "loss": 1.0589, + "step": 850 + }, + { + "epoch": 0.010650266256656417, + "grad_norm": 4.949865341186523, + "learning_rate": 2.13e-06, + "loss": 0.7845, + "step": 852 + }, + { + "epoch": 0.010675266881672042, + "grad_norm": 2.6895337104797363, + "learning_rate": 2.1350000000000003e-06, + "loss": 0.7287, + "step": 854 + }, + { + "epoch": 0.010700267506687667, + "grad_norm": 6.307435512542725, + "learning_rate": 2.1400000000000003e-06, + "loss": 1.0251, + "step": 856 + }, + { + "epoch": 0.010725268131703292, + "grad_norm": 5.423127174377441, + "learning_rate": 2.1450000000000002e-06, + "loss": 0.7872, + "step": 858 + }, + { + "epoch": 0.010750268756718919, + "grad_norm": 14.334004402160645, + "learning_rate": 2.15e-06, + "loss": 1.7159, + "step": 860 + }, + { + "epoch": 0.010775269381734544, + "grad_norm": 7.292761325836182, + "learning_rate": 2.155e-06, + "loss": 1.0143, + "step": 862 + }, + { + "epoch": 0.010800270006750169, + "grad_norm": 3.5362658500671387, + "learning_rate": 2.16e-06, + "loss": 1.1284, + "step": 864 + }, + { + "epoch": 0.010825270631765794, + "grad_norm": 2.0610127449035645, + "learning_rate": 2.165e-06, + "loss": 0.5962, + "step": 866 + }, + { + "epoch": 0.010850271256781419, + "grad_norm": 26.933931350708008, + "learning_rate": 2.17e-06, + "loss": 0.7432, + "step": 868 + }, + { + "epoch": 0.010875271881797046, + "grad_norm": 5.389805316925049, + "learning_rate": 2.1750000000000004e-06, + "loss": 1.7428, + "step": 870 + }, + { + "epoch": 0.01090027250681267, + "grad_norm": 10.040491104125977, + "learning_rate": 2.1800000000000003e-06, + "loss": 2.2054, + "step": 872 + }, + { + "epoch": 0.010925273131828296, + "grad_norm": 2.442275047302246, + "learning_rate": 2.1850000000000003e-06, + "loss": 1.943, + "step": 874 + }, + { + "epoch": 0.01095027375684392, + "grad_norm": 4.041282653808594, + "learning_rate": 2.19e-06, + "loss": 0.3115, + "step": 876 + }, + { + "epoch": 0.010975274381859546, + "grad_norm": 6.313303470611572, + "learning_rate": 2.195e-06, + "loss": 1.4218, + "step": 878 + }, + { + "epoch": 0.011000275006875173, + "grad_norm": 5.4759345054626465, + "learning_rate": 2.2e-06, + "loss": 1.6564, + "step": 880 + }, + { + "epoch": 0.011025275631890798, + "grad_norm": 3.1310904026031494, + "learning_rate": 2.205e-06, + "loss": 0.3696, + "step": 882 + }, + { + "epoch": 0.011050276256906423, + "grad_norm": 10.480581283569336, + "learning_rate": 2.21e-06, + "loss": 1.1846, + "step": 884 + }, + { + "epoch": 0.011075276881922048, + "grad_norm": 6.60823917388916, + "learning_rate": 2.2150000000000004e-06, + "loss": 0.9186, + "step": 886 + }, + { + "epoch": 0.011100277506937673, + "grad_norm": 9.154681205749512, + "learning_rate": 2.2200000000000003e-06, + "loss": 1.2621, + "step": 888 + }, + { + "epoch": 0.0111252781319533, + "grad_norm": 8.224906921386719, + "learning_rate": 2.2250000000000003e-06, + "loss": 1.004, + "step": 890 + }, + { + "epoch": 0.011150278756968925, + "grad_norm": 0.33573994040489197, + "learning_rate": 2.2300000000000002e-06, + "loss": 0.419, + "step": 892 + }, + { + "epoch": 0.01117527938198455, + "grad_norm": 4.896464824676514, + "learning_rate": 2.235e-06, + "loss": 1.3605, + "step": 894 + }, + { + "epoch": 0.011200280007000175, + "grad_norm": 3.795398235321045, + "learning_rate": 2.24e-06, + "loss": 0.1774, + "step": 896 + }, + { + "epoch": 0.0112252806320158, + "grad_norm": 10.528375625610352, + "learning_rate": 2.245e-06, + "loss": 1.5144, + "step": 898 + }, + { + "epoch": 0.011250281257031427, + "grad_norm": 4.215242385864258, + "learning_rate": 2.25e-06, + "loss": 1.7875, + "step": 900 + }, + { + "epoch": 0.011275281882047052, + "grad_norm": 7.752352237701416, + "learning_rate": 2.2550000000000004e-06, + "loss": 1.3018, + "step": 902 + }, + { + "epoch": 0.011300282507062677, + "grad_norm": 14.028157234191895, + "learning_rate": 2.2600000000000004e-06, + "loss": 0.553, + "step": 904 + }, + { + "epoch": 0.011325283132078302, + "grad_norm": 8.061020851135254, + "learning_rate": 2.2650000000000003e-06, + "loss": 1.3524, + "step": 906 + }, + { + "epoch": 0.011350283757093927, + "grad_norm": 0.23997358977794647, + "learning_rate": 2.2700000000000003e-06, + "loss": 0.1291, + "step": 908 + }, + { + "epoch": 0.011375284382109553, + "grad_norm": 3.0996932983398438, + "learning_rate": 2.2750000000000002e-06, + "loss": 1.2642, + "step": 910 + }, + { + "epoch": 0.011400285007125179, + "grad_norm": 3.4303176403045654, + "learning_rate": 2.28e-06, + "loss": 0.2539, + "step": 912 + }, + { + "epoch": 0.011425285632140804, + "grad_norm": 3.048133373260498, + "learning_rate": 2.285e-06, + "loss": 0.9501, + "step": 914 + }, + { + "epoch": 0.011450286257156429, + "grad_norm": 16.673311233520508, + "learning_rate": 2.29e-06, + "loss": 0.6721, + "step": 916 + }, + { + "epoch": 0.011475286882172054, + "grad_norm": 7.4937896728515625, + "learning_rate": 2.2950000000000005e-06, + "loss": 2.3355, + "step": 918 + }, + { + "epoch": 0.01150028750718768, + "grad_norm": 4.909395217895508, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.2916, + "step": 920 + }, + { + "epoch": 0.011525288132203305, + "grad_norm": 3.994419813156128, + "learning_rate": 2.3050000000000004e-06, + "loss": 1.0582, + "step": 922 + }, + { + "epoch": 0.01155028875721893, + "grad_norm": 7.697213649749756, + "learning_rate": 2.3100000000000003e-06, + "loss": 2.8152, + "step": 924 + }, + { + "epoch": 0.011575289382234556, + "grad_norm": 5.459895133972168, + "learning_rate": 2.3150000000000003e-06, + "loss": 0.9411, + "step": 926 + }, + { + "epoch": 0.01160029000725018, + "grad_norm": 5.377702713012695, + "learning_rate": 2.3200000000000002e-06, + "loss": 1.509, + "step": 928 + }, + { + "epoch": 0.011625290632265807, + "grad_norm": 5.655515193939209, + "learning_rate": 2.325e-06, + "loss": 0.8831, + "step": 930 + }, + { + "epoch": 0.011650291257281432, + "grad_norm": 4.841042995452881, + "learning_rate": 2.33e-06, + "loss": 1.6479, + "step": 932 + }, + { + "epoch": 0.011675291882297057, + "grad_norm": 4.868343830108643, + "learning_rate": 2.3350000000000005e-06, + "loss": 0.5754, + "step": 934 + }, + { + "epoch": 0.011700292507312682, + "grad_norm": 4.551144599914551, + "learning_rate": 2.3400000000000005e-06, + "loss": 1.3783, + "step": 936 + }, + { + "epoch": 0.011725293132328308, + "grad_norm": 7.021554946899414, + "learning_rate": 2.345e-06, + "loss": 0.8515, + "step": 938 + }, + { + "epoch": 0.011750293757343934, + "grad_norm": 6.163313865661621, + "learning_rate": 2.35e-06, + "loss": 1.8586, + "step": 940 + }, + { + "epoch": 0.01177529438235956, + "grad_norm": 3.955118417739868, + "learning_rate": 2.355e-06, + "loss": 0.6217, + "step": 942 + }, + { + "epoch": 0.011800295007375184, + "grad_norm": 41.02714538574219, + "learning_rate": 2.3600000000000003e-06, + "loss": 3.1245, + "step": 944 + }, + { + "epoch": 0.01182529563239081, + "grad_norm": 3.282771348953247, + "learning_rate": 2.3650000000000002e-06, + "loss": 1.108, + "step": 946 + }, + { + "epoch": 0.011850296257406434, + "grad_norm": 4.802145004272461, + "learning_rate": 2.37e-06, + "loss": 1.1994, + "step": 948 + }, + { + "epoch": 0.011875296882422061, + "grad_norm": 3.0384304523468018, + "learning_rate": 2.375e-06, + "loss": 1.1195, + "step": 950 + }, + { + "epoch": 0.011900297507437686, + "grad_norm": 6.495284557342529, + "learning_rate": 2.38e-06, + "loss": 1.4161, + "step": 952 + }, + { + "epoch": 0.011925298132453311, + "grad_norm": 5.515675067901611, + "learning_rate": 2.385e-06, + "loss": 0.8649, + "step": 954 + }, + { + "epoch": 0.011950298757468936, + "grad_norm": 6.726687431335449, + "learning_rate": 2.39e-06, + "loss": 1.6277, + "step": 956 + }, + { + "epoch": 0.011975299382484561, + "grad_norm": 4.550925254821777, + "learning_rate": 2.395e-06, + "loss": 1.0882, + "step": 958 + }, + { + "epoch": 0.012000300007500188, + "grad_norm": 11.862398147583008, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.0875, + "step": 960 + }, + { + "epoch": 0.012025300632515813, + "grad_norm": 0.46115854382514954, + "learning_rate": 2.4050000000000003e-06, + "loss": 0.743, + "step": 962 + }, + { + "epoch": 0.012050301257531438, + "grad_norm": 3.122760772705078, + "learning_rate": 2.4100000000000002e-06, + "loss": 1.5929, + "step": 964 + }, + { + "epoch": 0.012075301882547063, + "grad_norm": 12.31058120727539, + "learning_rate": 2.415e-06, + "loss": 1.065, + "step": 966 + }, + { + "epoch": 0.012100302507562688, + "grad_norm": 0.7038207054138184, + "learning_rate": 2.42e-06, + "loss": 1.2093, + "step": 968 + }, + { + "epoch": 0.012125303132578315, + "grad_norm": 4.592223644256592, + "learning_rate": 2.425e-06, + "loss": 1.3211, + "step": 970 + }, + { + "epoch": 0.01215030375759394, + "grad_norm": 0.5584082007408142, + "learning_rate": 2.43e-06, + "loss": 0.5414, + "step": 972 + }, + { + "epoch": 0.012175304382609565, + "grad_norm": 3.691969394683838, + "learning_rate": 2.435e-06, + "loss": 0.5535, + "step": 974 + }, + { + "epoch": 0.01220030500762519, + "grad_norm": 4.328186988830566, + "learning_rate": 2.4400000000000004e-06, + "loss": 1.9441, + "step": 976 + }, + { + "epoch": 0.012225305632640815, + "grad_norm": 3.381350040435791, + "learning_rate": 2.4450000000000003e-06, + "loss": 0.7166, + "step": 978 + }, + { + "epoch": 0.012250306257656442, + "grad_norm": 15.042677879333496, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.5468, + "step": 980 + }, + { + "epoch": 0.012275306882672067, + "grad_norm": 3.2268598079681396, + "learning_rate": 2.4550000000000002e-06, + "loss": 2.0614, + "step": 982 + }, + { + "epoch": 0.012300307507687692, + "grad_norm": 3.9779319763183594, + "learning_rate": 2.46e-06, + "loss": 1.5837, + "step": 984 + }, + { + "epoch": 0.012325308132703317, + "grad_norm": 4.754467964172363, + "learning_rate": 2.465e-06, + "loss": 1.6366, + "step": 986 + }, + { + "epoch": 0.012350308757718942, + "grad_norm": 3.734445333480835, + "learning_rate": 2.47e-06, + "loss": 1.105, + "step": 988 + }, + { + "epoch": 0.012375309382734569, + "grad_norm": 10.688468933105469, + "learning_rate": 2.475e-06, + "loss": 1.0517, + "step": 990 + }, + { + "epoch": 0.012400310007750194, + "grad_norm": 0.14100292325019836, + "learning_rate": 2.4800000000000004e-06, + "loss": 0.3127, + "step": 992 + }, + { + "epoch": 0.012425310632765819, + "grad_norm": 5.202918529510498, + "learning_rate": 2.4850000000000003e-06, + "loss": 1.5709, + "step": 994 + }, + { + "epoch": 0.012450311257781444, + "grad_norm": 5.831029891967773, + "learning_rate": 2.4900000000000003e-06, + "loss": 1.5779, + "step": 996 + }, + { + "epoch": 0.01247531188279707, + "grad_norm": 3.2113211154937744, + "learning_rate": 2.4950000000000003e-06, + "loss": 1.247, + "step": 998 + }, + { + "epoch": 0.012500312507812696, + "grad_norm": 5.479981899261475, + "learning_rate": 2.5e-06, + "loss": 0.8057, + "step": 1000 + }, + { + "epoch": 0.012525313132828321, + "grad_norm": 4.340950965881348, + "learning_rate": 2.505e-06, + "loss": 1.3998, + "step": 1002 + }, + { + "epoch": 0.012550313757843946, + "grad_norm": 10.610760688781738, + "learning_rate": 2.51e-06, + "loss": 1.1513, + "step": 1004 + }, + { + "epoch": 0.012575314382859571, + "grad_norm": 0.19870977103710175, + "learning_rate": 2.515e-06, + "loss": 1.1037, + "step": 1006 + }, + { + "epoch": 0.012600315007875196, + "grad_norm": 6.342980861663818, + "learning_rate": 2.52e-06, + "loss": 2.328, + "step": 1008 + }, + { + "epoch": 0.012625315632890823, + "grad_norm": 0.27722764015197754, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.5445, + "step": 1010 + }, + { + "epoch": 0.012650316257906448, + "grad_norm": 11.05344009399414, + "learning_rate": 2.5300000000000003e-06, + "loss": 1.7635, + "step": 1012 + }, + { + "epoch": 0.012675316882922073, + "grad_norm": 4.570568561553955, + "learning_rate": 2.5350000000000003e-06, + "loss": 1.144, + "step": 1014 + }, + { + "epoch": 0.012700317507937698, + "grad_norm": 4.211488723754883, + "learning_rate": 2.5400000000000002e-06, + "loss": 0.8674, + "step": 1016 + }, + { + "epoch": 0.012725318132953323, + "grad_norm": 3.8137221336364746, + "learning_rate": 2.545e-06, + "loss": 1.0963, + "step": 1018 + }, + { + "epoch": 0.01275031875796895, + "grad_norm": 5.12852144241333, + "learning_rate": 2.55e-06, + "loss": 2.104, + "step": 1020 + }, + { + "epoch": 0.012775319382984575, + "grad_norm": 4.089845180511475, + "learning_rate": 2.555e-06, + "loss": 1.1064, + "step": 1022 + }, + { + "epoch": 0.0128003200080002, + "grad_norm": 4.739445686340332, + "learning_rate": 2.56e-06, + "loss": 1.1742, + "step": 1024 + }, + { + "epoch": 0.012825320633015825, + "grad_norm": 3.0971593856811523, + "learning_rate": 2.5650000000000004e-06, + "loss": 1.1522, + "step": 1026 + }, + { + "epoch": 0.01285032125803145, + "grad_norm": 2.3294100761413574, + "learning_rate": 2.5700000000000004e-06, + "loss": 0.6562, + "step": 1028 + }, + { + "epoch": 0.012875321883047077, + "grad_norm": 4.193419933319092, + "learning_rate": 2.5750000000000003e-06, + "loss": 0.6659, + "step": 1030 + }, + { + "epoch": 0.012900322508062702, + "grad_norm": 3.854617118835449, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.6426, + "step": 1032 + }, + { + "epoch": 0.012925323133078327, + "grad_norm": 23.08792495727539, + "learning_rate": 2.5850000000000002e-06, + "loss": 1.0584, + "step": 1034 + }, + { + "epoch": 0.012950323758093952, + "grad_norm": 7.1056904792785645, + "learning_rate": 2.59e-06, + "loss": 1.391, + "step": 1036 + }, + { + "epoch": 0.012975324383109577, + "grad_norm": 9.727740287780762, + "learning_rate": 2.595e-06, + "loss": 1.7748, + "step": 1038 + }, + { + "epoch": 0.013000325008125204, + "grad_norm": 4.528501510620117, + "learning_rate": 2.6e-06, + "loss": 1.2461, + "step": 1040 + }, + { + "epoch": 0.013025325633140829, + "grad_norm": 6.548806667327881, + "learning_rate": 2.6050000000000005e-06, + "loss": 0.9203, + "step": 1042 + }, + { + "epoch": 0.013050326258156454, + "grad_norm": 6.892823696136475, + "learning_rate": 2.6100000000000004e-06, + "loss": 1.6551, + "step": 1044 + }, + { + "epoch": 0.013075326883172079, + "grad_norm": 0.3558691740036011, + "learning_rate": 2.6150000000000004e-06, + "loss": 0.5024, + "step": 1046 + }, + { + "epoch": 0.013100327508187704, + "grad_norm": 8.417325973510742, + "learning_rate": 2.6200000000000003e-06, + "loss": 1.3123, + "step": 1048 + }, + { + "epoch": 0.01312532813320333, + "grad_norm": 6.438705921173096, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.6587, + "step": 1050 + }, + { + "epoch": 0.013150328758218956, + "grad_norm": 9.366304397583008, + "learning_rate": 2.6300000000000002e-06, + "loss": 1.0239, + "step": 1052 + }, + { + "epoch": 0.01317532938323458, + "grad_norm": 3.5579311847686768, + "learning_rate": 2.635e-06, + "loss": 0.5384, + "step": 1054 + }, + { + "epoch": 0.013200330008250206, + "grad_norm": 3.4501254558563232, + "learning_rate": 2.64e-06, + "loss": 0.8873, + "step": 1056 + }, + { + "epoch": 0.01322533063326583, + "grad_norm": 3.1107115745544434, + "learning_rate": 2.6450000000000005e-06, + "loss": 1.5969, + "step": 1058 + }, + { + "epoch": 0.013250331258281458, + "grad_norm": 3.9892334938049316, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.0436, + "step": 1060 + }, + { + "epoch": 0.013275331883297083, + "grad_norm": 3.2023842334747314, + "learning_rate": 2.6550000000000004e-06, + "loss": 1.2938, + "step": 1062 + }, + { + "epoch": 0.013300332508312708, + "grad_norm": 7.776435375213623, + "learning_rate": 2.6600000000000004e-06, + "loss": 1.719, + "step": 1064 + }, + { + "epoch": 0.013325333133328333, + "grad_norm": 4.061999320983887, + "learning_rate": 2.6650000000000003e-06, + "loss": 1.5743, + "step": 1066 + }, + { + "epoch": 0.013350333758343958, + "grad_norm": 5.124392986297607, + "learning_rate": 2.6700000000000003e-06, + "loss": 1.5789, + "step": 1068 + }, + { + "epoch": 0.013375334383359585, + "grad_norm": 4.740284442901611, + "learning_rate": 2.6750000000000002e-06, + "loss": 1.8057, + "step": 1070 + }, + { + "epoch": 0.01340033500837521, + "grad_norm": 14.306984901428223, + "learning_rate": 2.68e-06, + "loss": 0.7411, + "step": 1072 + }, + { + "epoch": 0.013425335633390835, + "grad_norm": 5.512692451477051, + "learning_rate": 2.6850000000000006e-06, + "loss": 1.2573, + "step": 1074 + }, + { + "epoch": 0.01345033625840646, + "grad_norm": 3.5710535049438477, + "learning_rate": 2.6900000000000005e-06, + "loss": 1.4402, + "step": 1076 + }, + { + "epoch": 0.013475336883422085, + "grad_norm": 5.931448936462402, + "learning_rate": 2.6950000000000005e-06, + "loss": 1.0309, + "step": 1078 + }, + { + "epoch": 0.013500337508437711, + "grad_norm": 4.336605548858643, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.7933, + "step": 1080 + }, + { + "epoch": 0.013525338133453337, + "grad_norm": 10.116353034973145, + "learning_rate": 2.7050000000000004e-06, + "loss": 2.2696, + "step": 1082 + }, + { + "epoch": 0.013550338758468962, + "grad_norm": 7.179591178894043, + "learning_rate": 2.7100000000000003e-06, + "loss": 0.5088, + "step": 1084 + }, + { + "epoch": 0.013575339383484587, + "grad_norm": 8.177982330322266, + "learning_rate": 2.7150000000000003e-06, + "loss": 0.7589, + "step": 1086 + }, + { + "epoch": 0.013600340008500212, + "grad_norm": 3.3890984058380127, + "learning_rate": 2.7200000000000002e-06, + "loss": 1.0135, + "step": 1088 + }, + { + "epoch": 0.013625340633515838, + "grad_norm": 4.494238376617432, + "learning_rate": 2.7250000000000006e-06, + "loss": 0.4444, + "step": 1090 + }, + { + "epoch": 0.013650341258531463, + "grad_norm": 4.238345146179199, + "learning_rate": 2.7300000000000005e-06, + "loss": 0.803, + "step": 1092 + }, + { + "epoch": 0.013675341883547089, + "grad_norm": 0.16566190123558044, + "learning_rate": 2.7350000000000005e-06, + "loss": 0.2075, + "step": 1094 + }, + { + "epoch": 0.013700342508562714, + "grad_norm": 5.369298934936523, + "learning_rate": 2.7400000000000004e-06, + "loss": 1.3597, + "step": 1096 + }, + { + "epoch": 0.01372534313357834, + "grad_norm": 6.487485885620117, + "learning_rate": 2.7450000000000004e-06, + "loss": 1.7246, + "step": 1098 + }, + { + "epoch": 0.013750343758593965, + "grad_norm": 16.829069137573242, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.7874, + "step": 1100 + }, + { + "epoch": 0.01377534438360959, + "grad_norm": 5.856969833374023, + "learning_rate": 2.7550000000000003e-06, + "loss": 1.366, + "step": 1102 + }, + { + "epoch": 0.013800345008625215, + "grad_norm": 3.294729471206665, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.6347, + "step": 1104 + }, + { + "epoch": 0.01382534563364084, + "grad_norm": 3.0470845699310303, + "learning_rate": 2.7650000000000006e-06, + "loss": 0.7902, + "step": 1106 + }, + { + "epoch": 0.013850346258656467, + "grad_norm": 2.976728916168213, + "learning_rate": 2.7700000000000006e-06, + "loss": 0.8529, + "step": 1108 + }, + { + "epoch": 0.013875346883672092, + "grad_norm": 6.306283473968506, + "learning_rate": 2.7750000000000005e-06, + "loss": 1.8359, + "step": 1110 + }, + { + "epoch": 0.013900347508687717, + "grad_norm": 9.050816535949707, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.8542, + "step": 1112 + }, + { + "epoch": 0.013925348133703342, + "grad_norm": 5.706435203552246, + "learning_rate": 2.7850000000000004e-06, + "loss": 1.1891, + "step": 1114 + }, + { + "epoch": 0.013950348758718967, + "grad_norm": 13.866663932800293, + "learning_rate": 2.7900000000000004e-06, + "loss": 0.9958, + "step": 1116 + }, + { + "epoch": 0.013975349383734594, + "grad_norm": 7.099430084228516, + "learning_rate": 2.7950000000000003e-06, + "loss": 2.016, + "step": 1118 + }, + { + "epoch": 0.01400035000875022, + "grad_norm": 3.194931745529175, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.0094, + "step": 1120 + }, + { + "epoch": 0.014025350633765844, + "grad_norm": 7.3075127601623535, + "learning_rate": 2.8050000000000007e-06, + "loss": 1.5759, + "step": 1122 + }, + { + "epoch": 0.01405035125878147, + "grad_norm": 7.173995018005371, + "learning_rate": 2.8100000000000006e-06, + "loss": 1.1102, + "step": 1124 + }, + { + "epoch": 0.014075351883797094, + "grad_norm": 3.3852152824401855, + "learning_rate": 2.815e-06, + "loss": 0.6986, + "step": 1126 + }, + { + "epoch": 0.014100352508812721, + "grad_norm": 5.147909164428711, + "learning_rate": 2.82e-06, + "loss": 1.2867, + "step": 1128 + }, + { + "epoch": 0.014125353133828346, + "grad_norm": 0.1418209969997406, + "learning_rate": 2.825e-06, + "loss": 0.0034, + "step": 1130 + }, + { + "epoch": 0.014150353758843971, + "grad_norm": 5.483434200286865, + "learning_rate": 2.83e-06, + "loss": 0.7983, + "step": 1132 + }, + { + "epoch": 0.014175354383859596, + "grad_norm": 5.476966381072998, + "learning_rate": 2.835e-06, + "loss": 1.7943, + "step": 1134 + }, + { + "epoch": 0.014200355008875221, + "grad_norm": 5.187030792236328, + "learning_rate": 2.84e-06, + "loss": 1.247, + "step": 1136 + }, + { + "epoch": 0.014225355633890848, + "grad_norm": 6.0968146324157715, + "learning_rate": 2.845e-06, + "loss": 1.0338, + "step": 1138 + }, + { + "epoch": 0.014250356258906473, + "grad_norm": 7.593626499176025, + "learning_rate": 2.85e-06, + "loss": 1.3721, + "step": 1140 + }, + { + "epoch": 0.014275356883922098, + "grad_norm": 4.604891777038574, + "learning_rate": 2.855e-06, + "loss": 1.0799, + "step": 1142 + }, + { + "epoch": 0.014300357508937723, + "grad_norm": 0.6648197174072266, + "learning_rate": 2.86e-06, + "loss": 0.1657, + "step": 1144 + }, + { + "epoch": 0.014325358133953348, + "grad_norm": 7.02484655380249, + "learning_rate": 2.865e-06, + "loss": 1.0677, + "step": 1146 + }, + { + "epoch": 0.014350358758968975, + "grad_norm": 1.2220983505249023, + "learning_rate": 2.87e-06, + "loss": 0.4948, + "step": 1148 + }, + { + "epoch": 0.0143753593839846, + "grad_norm": 6.066830158233643, + "learning_rate": 2.875e-06, + "loss": 0.5099, + "step": 1150 + }, + { + "epoch": 0.014400360009000225, + "grad_norm": 4.201080799102783, + "learning_rate": 2.88e-06, + "loss": 0.8648, + "step": 1152 + }, + { + "epoch": 0.01442536063401585, + "grad_norm": 2.500391960144043, + "learning_rate": 2.885e-06, + "loss": 0.2667, + "step": 1154 + }, + { + "epoch": 0.014450361259031475, + "grad_norm": 4.715792179107666, + "learning_rate": 2.89e-06, + "loss": 0.9756, + "step": 1156 + }, + { + "epoch": 0.014475361884047102, + "grad_norm": 4.868462085723877, + "learning_rate": 2.8950000000000002e-06, + "loss": 0.5492, + "step": 1158 + }, + { + "epoch": 0.014500362509062727, + "grad_norm": 5.4695024490356445, + "learning_rate": 2.9e-06, + "loss": 1.3598, + "step": 1160 + }, + { + "epoch": 0.014525363134078352, + "grad_norm": 3.0060160160064697, + "learning_rate": 2.905e-06, + "loss": 0.4066, + "step": 1162 + }, + { + "epoch": 0.014550363759093977, + "grad_norm": 0.15218128263950348, + "learning_rate": 2.91e-06, + "loss": 0.2608, + "step": 1164 + }, + { + "epoch": 0.014575364384109602, + "grad_norm": 4.584559917449951, + "learning_rate": 2.915e-06, + "loss": 0.3256, + "step": 1166 + }, + { + "epoch": 0.014600365009125229, + "grad_norm": 26.527301788330078, + "learning_rate": 2.92e-06, + "loss": 1.7359, + "step": 1168 + }, + { + "epoch": 0.014625365634140854, + "grad_norm": 5.023931980133057, + "learning_rate": 2.925e-06, + "loss": 1.401, + "step": 1170 + }, + { + "epoch": 0.014650366259156479, + "grad_norm": 11.880655288696289, + "learning_rate": 2.93e-06, + "loss": 1.6074, + "step": 1172 + }, + { + "epoch": 0.014675366884172104, + "grad_norm": 18.785945892333984, + "learning_rate": 2.9350000000000003e-06, + "loss": 1.4875, + "step": 1174 + }, + { + "epoch": 0.014700367509187729, + "grad_norm": 9.395803451538086, + "learning_rate": 2.9400000000000002e-06, + "loss": 0.9491, + "step": 1176 + }, + { + "epoch": 0.014725368134203356, + "grad_norm": 2.9056105613708496, + "learning_rate": 2.945e-06, + "loss": 1.1664, + "step": 1178 + }, + { + "epoch": 0.014750368759218981, + "grad_norm": 6.0602521896362305, + "learning_rate": 2.95e-06, + "loss": 1.0575, + "step": 1180 + }, + { + "epoch": 0.014775369384234606, + "grad_norm": 0.39890220761299133, + "learning_rate": 2.955e-06, + "loss": 0.491, + "step": 1182 + }, + { + "epoch": 0.014800370009250231, + "grad_norm": 3.5646121501922607, + "learning_rate": 2.96e-06, + "loss": 0.7147, + "step": 1184 + }, + { + "epoch": 0.014825370634265856, + "grad_norm": 7.619149208068848, + "learning_rate": 2.965e-06, + "loss": 0.093, + "step": 1186 + }, + { + "epoch": 0.014850371259281483, + "grad_norm": 4.79898738861084, + "learning_rate": 2.97e-06, + "loss": 1.4848, + "step": 1188 + }, + { + "epoch": 0.014875371884297108, + "grad_norm": 4.806220531463623, + "learning_rate": 2.9750000000000003e-06, + "loss": 1.2268, + "step": 1190 + }, + { + "epoch": 0.014900372509312733, + "grad_norm": 5.043309211730957, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.8618, + "step": 1192 + }, + { + "epoch": 0.014925373134328358, + "grad_norm": 4.642086029052734, + "learning_rate": 2.9850000000000002e-06, + "loss": 1.5208, + "step": 1194 + }, + { + "epoch": 0.014950373759343983, + "grad_norm": 3.228925943374634, + "learning_rate": 2.99e-06, + "loss": 0.8403, + "step": 1196 + }, + { + "epoch": 0.01497537438435961, + "grad_norm": 4.017519950866699, + "learning_rate": 2.995e-06, + "loss": 0.8314, + "step": 1198 + }, + { + "epoch": 0.015000375009375235, + "grad_norm": 2.968623638153076, + "learning_rate": 3e-06, + "loss": 0.9964, + "step": 1200 + }, + { + "epoch": 0.01502537563439086, + "grad_norm": 0.1229841411113739, + "learning_rate": 3.005e-06, + "loss": 1.5469, + "step": 1202 + }, + { + "epoch": 0.015050376259406485, + "grad_norm": 3.1273839473724365, + "learning_rate": 3.01e-06, + "loss": 0.7363, + "step": 1204 + }, + { + "epoch": 0.01507537688442211, + "grad_norm": 6.599337100982666, + "learning_rate": 3.0150000000000004e-06, + "loss": 1.6366, + "step": 1206 + }, + { + "epoch": 0.015100377509437737, + "grad_norm": 4.008113861083984, + "learning_rate": 3.0200000000000003e-06, + "loss": 0.9569, + "step": 1208 + }, + { + "epoch": 0.015125378134453362, + "grad_norm": 16.209644317626953, + "learning_rate": 3.0250000000000003e-06, + "loss": 1.9214, + "step": 1210 + }, + { + "epoch": 0.015150378759468987, + "grad_norm": 5.620543479919434, + "learning_rate": 3.0300000000000002e-06, + "loss": 2.1213, + "step": 1212 + }, + { + "epoch": 0.015175379384484612, + "grad_norm": 5.173812389373779, + "learning_rate": 3.035e-06, + "loss": 1.078, + "step": 1214 + }, + { + "epoch": 0.015200380009500237, + "grad_norm": 3.1929264068603516, + "learning_rate": 3.04e-06, + "loss": 1.0803, + "step": 1216 + }, + { + "epoch": 0.015225380634515864, + "grad_norm": 3.4273109436035156, + "learning_rate": 3.045e-06, + "loss": 1.134, + "step": 1218 + }, + { + "epoch": 0.015250381259531489, + "grad_norm": 6.023685932159424, + "learning_rate": 3.05e-06, + "loss": 1.6894, + "step": 1220 + }, + { + "epoch": 0.015275381884547114, + "grad_norm": 5.07805061340332, + "learning_rate": 3.0550000000000004e-06, + "loss": 0.8394, + "step": 1222 + }, + { + "epoch": 0.015300382509562739, + "grad_norm": 3.7447378635406494, + "learning_rate": 3.0600000000000003e-06, + "loss": 0.6172, + "step": 1224 + }, + { + "epoch": 0.015325383134578364, + "grad_norm": 4.3148579597473145, + "learning_rate": 3.0650000000000003e-06, + "loss": 0.4221, + "step": 1226 + }, + { + "epoch": 0.01535038375959399, + "grad_norm": 3.3067684173583984, + "learning_rate": 3.0700000000000003e-06, + "loss": 0.7994, + "step": 1228 + }, + { + "epoch": 0.015375384384609616, + "grad_norm": 5.438954830169678, + "learning_rate": 3.075e-06, + "loss": 1.9003, + "step": 1230 + }, + { + "epoch": 0.01540038500962524, + "grad_norm": 3.499305486679077, + "learning_rate": 3.08e-06, + "loss": 1.4909, + "step": 1232 + }, + { + "epoch": 0.015425385634640866, + "grad_norm": 12.879094123840332, + "learning_rate": 3.085e-06, + "loss": 0.6062, + "step": 1234 + }, + { + "epoch": 0.01545038625965649, + "grad_norm": 8.531977653503418, + "learning_rate": 3.09e-06, + "loss": 1.3207, + "step": 1236 + }, + { + "epoch": 0.015475386884672118, + "grad_norm": 4.731712341308594, + "learning_rate": 3.0950000000000004e-06, + "loss": 0.9014, + "step": 1238 + }, + { + "epoch": 0.015500387509687743, + "grad_norm": 0.289911150932312, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.096, + "step": 1240 + }, + { + "epoch": 0.015525388134703368, + "grad_norm": 5.386617660522461, + "learning_rate": 3.1050000000000003e-06, + "loss": 1.6194, + "step": 1242 + }, + { + "epoch": 0.015550388759718993, + "grad_norm": 4.4977240562438965, + "learning_rate": 3.1100000000000003e-06, + "loss": 1.279, + "step": 1244 + }, + { + "epoch": 0.015575389384734618, + "grad_norm": 10.347302436828613, + "learning_rate": 3.1150000000000002e-06, + "loss": 1.5511, + "step": 1246 + }, + { + "epoch": 0.015600390009750244, + "grad_norm": 6.5990424156188965, + "learning_rate": 3.12e-06, + "loss": 0.5518, + "step": 1248 + }, + { + "epoch": 0.015625390634765868, + "grad_norm": 5.8512749671936035, + "learning_rate": 3.125e-06, + "loss": 0.9088, + "step": 1250 + }, + { + "epoch": 0.015650391259781495, + "grad_norm": 17.053953170776367, + "learning_rate": 3.13e-06, + "loss": 1.0699, + "step": 1252 + }, + { + "epoch": 0.01567539188479712, + "grad_norm": 5.6494245529174805, + "learning_rate": 3.135e-06, + "loss": 0.8771, + "step": 1254 + }, + { + "epoch": 0.015700392509812745, + "grad_norm": 7.19549560546875, + "learning_rate": 3.1400000000000004e-06, + "loss": 0.5063, + "step": 1256 + }, + { + "epoch": 0.01572539313482837, + "grad_norm": 2.865764856338501, + "learning_rate": 3.1450000000000004e-06, + "loss": 1.2885, + "step": 1258 + }, + { + "epoch": 0.015750393759843995, + "grad_norm": 6.2678141593933105, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.6355, + "step": 1260 + }, + { + "epoch": 0.01577539438485962, + "grad_norm": 2.1995689868927, + "learning_rate": 3.1550000000000003e-06, + "loss": 0.9619, + "step": 1262 + }, + { + "epoch": 0.015800395009875248, + "grad_norm": 4.936195373535156, + "learning_rate": 3.1600000000000002e-06, + "loss": 1.5352, + "step": 1264 + }, + { + "epoch": 0.01582539563489087, + "grad_norm": 6.365655422210693, + "learning_rate": 3.165e-06, + "loss": 1.4741, + "step": 1266 + }, + { + "epoch": 0.0158503962599065, + "grad_norm": 7.503091812133789, + "learning_rate": 3.17e-06, + "loss": 2.976, + "step": 1268 + }, + { + "epoch": 0.01587539688492212, + "grad_norm": 4.077455997467041, + "learning_rate": 3.175e-06, + "loss": 0.7737, + "step": 1270 + }, + { + "epoch": 0.01590039750993775, + "grad_norm": 4.369293212890625, + "learning_rate": 3.1800000000000005e-06, + "loss": 1.7981, + "step": 1272 + }, + { + "epoch": 0.015925398134953375, + "grad_norm": 3.328395128250122, + "learning_rate": 3.1850000000000004e-06, + "loss": 0.4272, + "step": 1274 + }, + { + "epoch": 0.015950398759969, + "grad_norm": 5.359509468078613, + "learning_rate": 3.1900000000000004e-06, + "loss": 1.5715, + "step": 1276 + }, + { + "epoch": 0.015975399384984625, + "grad_norm": 3.025557279586792, + "learning_rate": 3.1950000000000003e-06, + "loss": 1.0401, + "step": 1278 + }, + { + "epoch": 0.01600040001000025, + "grad_norm": 5.117575168609619, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.1705, + "step": 1280 + }, + { + "epoch": 0.016025400635015875, + "grad_norm": 9.797975540161133, + "learning_rate": 3.2050000000000002e-06, + "loss": 1.9416, + "step": 1282 + }, + { + "epoch": 0.016050401260031502, + "grad_norm": 5.871378421783447, + "learning_rate": 3.21e-06, + "loss": 1.6602, + "step": 1284 + }, + { + "epoch": 0.016075401885047125, + "grad_norm": 8.106194496154785, + "learning_rate": 3.215e-06, + "loss": 1.0055, + "step": 1286 + }, + { + "epoch": 0.016100402510062752, + "grad_norm": 3.3406853675842285, + "learning_rate": 3.2200000000000005e-06, + "loss": 1.1463, + "step": 1288 + }, + { + "epoch": 0.016125403135078376, + "grad_norm": 4.021951675415039, + "learning_rate": 3.2250000000000005e-06, + "loss": 1.2182, + "step": 1290 + }, + { + "epoch": 0.016150403760094002, + "grad_norm": 0.22995100915431976, + "learning_rate": 3.2300000000000004e-06, + "loss": 0.5202, + "step": 1292 + }, + { + "epoch": 0.01617540438510963, + "grad_norm": 4.728019714355469, + "learning_rate": 3.2350000000000004e-06, + "loss": 1.5589, + "step": 1294 + }, + { + "epoch": 0.016200405010125252, + "grad_norm": 3.7595396041870117, + "learning_rate": 3.2400000000000003e-06, + "loss": 0.6402, + "step": 1296 + }, + { + "epoch": 0.01622540563514088, + "grad_norm": 7.198408126831055, + "learning_rate": 3.2450000000000003e-06, + "loss": 2.21, + "step": 1298 + }, + { + "epoch": 0.016250406260156502, + "grad_norm": 0.09762974083423615, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.1135, + "step": 1300 + }, + { + "epoch": 0.01627540688517213, + "grad_norm": 0.3188939392566681, + "learning_rate": 3.255e-06, + "loss": 0.0618, + "step": 1302 + }, + { + "epoch": 0.016300407510187756, + "grad_norm": 7.949304580688477, + "learning_rate": 3.2600000000000006e-06, + "loss": 0.9843, + "step": 1304 + }, + { + "epoch": 0.01632540813520338, + "grad_norm": 9.324291229248047, + "learning_rate": 3.2650000000000005e-06, + "loss": 0.7826, + "step": 1306 + }, + { + "epoch": 0.016350408760219006, + "grad_norm": 5.566682815551758, + "learning_rate": 3.2700000000000005e-06, + "loss": 1.6977, + "step": 1308 + }, + { + "epoch": 0.01637540938523463, + "grad_norm": 8.505062103271484, + "learning_rate": 3.2750000000000004e-06, + "loss": 1.6805, + "step": 1310 + }, + { + "epoch": 0.016400410010250256, + "grad_norm": 4.138638019561768, + "learning_rate": 3.2800000000000004e-06, + "loss": 0.9272, + "step": 1312 + }, + { + "epoch": 0.016425410635265883, + "grad_norm": 10.258434295654297, + "learning_rate": 3.2850000000000003e-06, + "loss": 0.8302, + "step": 1314 + }, + { + "epoch": 0.016450411260281506, + "grad_norm": 5.136192321777344, + "learning_rate": 3.2900000000000003e-06, + "loss": 0.9856, + "step": 1316 + }, + { + "epoch": 0.016475411885297133, + "grad_norm": 7.025705814361572, + "learning_rate": 3.2950000000000002e-06, + "loss": 0.7363, + "step": 1318 + }, + { + "epoch": 0.016500412510312756, + "grad_norm": 4.630895614624023, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.37, + "step": 1320 + }, + { + "epoch": 0.016525413135328383, + "grad_norm": 5.529006004333496, + "learning_rate": 3.3050000000000005e-06, + "loss": 0.7665, + "step": 1322 + }, + { + "epoch": 0.01655041376034401, + "grad_norm": 5.844700336456299, + "learning_rate": 3.3100000000000005e-06, + "loss": 1.4975, + "step": 1324 + }, + { + "epoch": 0.016575414385359633, + "grad_norm": 6.703248023986816, + "learning_rate": 3.3150000000000004e-06, + "loss": 0.1618, + "step": 1326 + }, + { + "epoch": 0.01660041501037526, + "grad_norm": 9.463605880737305, + "learning_rate": 3.3200000000000004e-06, + "loss": 1.5852, + "step": 1328 + }, + { + "epoch": 0.016625415635390883, + "grad_norm": 4.576881408691406, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.5847, + "step": 1330 + }, + { + "epoch": 0.01665041626040651, + "grad_norm": 4.7824015617370605, + "learning_rate": 3.3300000000000003e-06, + "loss": 1.8043, + "step": 1332 + }, + { + "epoch": 0.016675416885422137, + "grad_norm": 5.8089799880981445, + "learning_rate": 3.3350000000000003e-06, + "loss": 2.2828, + "step": 1334 + }, + { + "epoch": 0.01670041751043776, + "grad_norm": 5.973447799682617, + "learning_rate": 3.3400000000000006e-06, + "loss": 1.2801, + "step": 1336 + }, + { + "epoch": 0.016725418135453387, + "grad_norm": 3.6551427841186523, + "learning_rate": 3.3450000000000006e-06, + "loss": 1.3617, + "step": 1338 + }, + { + "epoch": 0.01675041876046901, + "grad_norm": 5.50007438659668, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.9842, + "step": 1340 + }, + { + "epoch": 0.016775419385484637, + "grad_norm": 7.179790496826172, + "learning_rate": 3.3550000000000005e-06, + "loss": 2.0944, + "step": 1342 + }, + { + "epoch": 0.016800420010500264, + "grad_norm": 5.729517936706543, + "learning_rate": 3.3600000000000004e-06, + "loss": 1.9411, + "step": 1344 + }, + { + "epoch": 0.016825420635515887, + "grad_norm": 4.25435209274292, + "learning_rate": 3.3650000000000004e-06, + "loss": 0.9978, + "step": 1346 + }, + { + "epoch": 0.016850421260531514, + "grad_norm": 0.8669453263282776, + "learning_rate": 3.3700000000000003e-06, + "loss": 1.127, + "step": 1348 + }, + { + "epoch": 0.016875421885547137, + "grad_norm": 5.843264102935791, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.4672, + "step": 1350 + }, + { + "epoch": 0.016900422510562764, + "grad_norm": 3.359283685684204, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.5827, + "step": 1352 + }, + { + "epoch": 0.01692542313557839, + "grad_norm": 12.13847541809082, + "learning_rate": 3.3850000000000006e-06, + "loss": 2.4511, + "step": 1354 + }, + { + "epoch": 0.016950423760594014, + "grad_norm": 17.446638107299805, + "learning_rate": 3.3900000000000006e-06, + "loss": 0.9059, + "step": 1356 + }, + { + "epoch": 0.01697542438560964, + "grad_norm": 7.590035915374756, + "learning_rate": 3.3950000000000005e-06, + "loss": 0.9706, + "step": 1358 + }, + { + "epoch": 0.017000425010625264, + "grad_norm": 5.265120983123779, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.1441, + "step": 1360 + }, + { + "epoch": 0.01702542563564089, + "grad_norm": 5.063991546630859, + "learning_rate": 3.4050000000000004e-06, + "loss": 0.9517, + "step": 1362 + }, + { + "epoch": 0.017050426260656518, + "grad_norm": 5.018148422241211, + "learning_rate": 3.4100000000000004e-06, + "loss": 1.2932, + "step": 1364 + }, + { + "epoch": 0.01707542688567214, + "grad_norm": 3.4425129890441895, + "learning_rate": 3.4150000000000003e-06, + "loss": 1.4431, + "step": 1366 + }, + { + "epoch": 0.017100427510687768, + "grad_norm": 5.751655101776123, + "learning_rate": 3.4200000000000007e-06, + "loss": 0.6068, + "step": 1368 + }, + { + "epoch": 0.01712542813570339, + "grad_norm": 0.2651554048061371, + "learning_rate": 3.4250000000000007e-06, + "loss": 0.8223, + "step": 1370 + }, + { + "epoch": 0.017150428760719018, + "grad_norm": 4.681083679199219, + "learning_rate": 3.4300000000000006e-06, + "loss": 1.4579, + "step": 1372 + }, + { + "epoch": 0.017175429385734645, + "grad_norm": 19.17090606689453, + "learning_rate": 3.4350000000000006e-06, + "loss": 1.3703, + "step": 1374 + }, + { + "epoch": 0.017200430010750268, + "grad_norm": 0.46469059586524963, + "learning_rate": 3.44e-06, + "loss": 0.5433, + "step": 1376 + }, + { + "epoch": 0.017225430635765895, + "grad_norm": 4.817959308624268, + "learning_rate": 3.445e-06, + "loss": 0.6223, + "step": 1378 + }, + { + "epoch": 0.017250431260781518, + "grad_norm": 8.736249923706055, + "learning_rate": 3.45e-06, + "loss": 0.7035, + "step": 1380 + }, + { + "epoch": 0.017275431885797145, + "grad_norm": 0.14607450366020203, + "learning_rate": 3.455e-06, + "loss": 0.8148, + "step": 1382 + }, + { + "epoch": 0.01730043251081277, + "grad_norm": 7.322673320770264, + "learning_rate": 3.46e-06, + "loss": 1.0179, + "step": 1384 + }, + { + "epoch": 0.017325433135828395, + "grad_norm": 5.91670036315918, + "learning_rate": 3.465e-06, + "loss": 0.4713, + "step": 1386 + }, + { + "epoch": 0.01735043376084402, + "grad_norm": 3.164785623550415, + "learning_rate": 3.4700000000000002e-06, + "loss": 0.7291, + "step": 1388 + }, + { + "epoch": 0.017375434385859645, + "grad_norm": 4.55758810043335, + "learning_rate": 3.475e-06, + "loss": 0.1845, + "step": 1390 + }, + { + "epoch": 0.01740043501087527, + "grad_norm": 7.367198467254639, + "learning_rate": 3.48e-06, + "loss": 0.4978, + "step": 1392 + }, + { + "epoch": 0.0174254356358909, + "grad_norm": 4.787137031555176, + "learning_rate": 3.485e-06, + "loss": 0.9527, + "step": 1394 + }, + { + "epoch": 0.017450436260906522, + "grad_norm": 5.335115909576416, + "learning_rate": 3.49e-06, + "loss": 0.9254, + "step": 1396 + }, + { + "epoch": 0.01747543688592215, + "grad_norm": 3.02467679977417, + "learning_rate": 3.495e-06, + "loss": 1.2214, + "step": 1398 + }, + { + "epoch": 0.017500437510937772, + "grad_norm": 5.329033374786377, + "learning_rate": 3.5e-06, + "loss": 0.6142, + "step": 1400 + }, + { + "epoch": 0.0175254381359534, + "grad_norm": 4.608700275421143, + "learning_rate": 3.505e-06, + "loss": 0.7836, + "step": 1402 + }, + { + "epoch": 0.017550438760969025, + "grad_norm": 7.003199577331543, + "learning_rate": 3.5100000000000003e-06, + "loss": 1.3531, + "step": 1404 + }, + { + "epoch": 0.01757543938598465, + "grad_norm": 5.282921314239502, + "learning_rate": 3.5150000000000002e-06, + "loss": 1.0315, + "step": 1406 + }, + { + "epoch": 0.017600440011000276, + "grad_norm": 6.001785755157471, + "learning_rate": 3.52e-06, + "loss": 1.3406, + "step": 1408 + }, + { + "epoch": 0.0176254406360159, + "grad_norm": 5.970971584320068, + "learning_rate": 3.525e-06, + "loss": 1.5702, + "step": 1410 + }, + { + "epoch": 0.017650441261031526, + "grad_norm": 7.11763858795166, + "learning_rate": 3.53e-06, + "loss": 1.5037, + "step": 1412 + }, + { + "epoch": 0.017675441886047152, + "grad_norm": 6.464713096618652, + "learning_rate": 3.535e-06, + "loss": 1.098, + "step": 1414 + }, + { + "epoch": 0.017700442511062776, + "grad_norm": 5.916917324066162, + "learning_rate": 3.54e-06, + "loss": 0.6355, + "step": 1416 + }, + { + "epoch": 0.017725443136078402, + "grad_norm": 7.0150041580200195, + "learning_rate": 3.545e-06, + "loss": 1.3383, + "step": 1418 + }, + { + "epoch": 0.017750443761094026, + "grad_norm": 10.330873489379883, + "learning_rate": 3.5500000000000003e-06, + "loss": 1.2338, + "step": 1420 + }, + { + "epoch": 0.017775444386109653, + "grad_norm": 1.4415345191955566, + "learning_rate": 3.5550000000000003e-06, + "loss": 0.3711, + "step": 1422 + }, + { + "epoch": 0.01780044501112528, + "grad_norm": 11.529770851135254, + "learning_rate": 3.5600000000000002e-06, + "loss": 2.239, + "step": 1424 + }, + { + "epoch": 0.017825445636140903, + "grad_norm": 3.3611042499542236, + "learning_rate": 3.565e-06, + "loss": 0.8432, + "step": 1426 + }, + { + "epoch": 0.01785044626115653, + "grad_norm": 3.689052104949951, + "learning_rate": 3.57e-06, + "loss": 1.5685, + "step": 1428 + }, + { + "epoch": 0.017875446886172153, + "grad_norm": 9.880084991455078, + "learning_rate": 3.575e-06, + "loss": 1.0432, + "step": 1430 + }, + { + "epoch": 0.01790044751118778, + "grad_norm": 8.54449462890625, + "learning_rate": 3.58e-06, + "loss": 0.5986, + "step": 1432 + }, + { + "epoch": 0.017925448136203406, + "grad_norm": 5.063869476318359, + "learning_rate": 3.585e-06, + "loss": 1.4963, + "step": 1434 + }, + { + "epoch": 0.01795044876121903, + "grad_norm": 11.441155433654785, + "learning_rate": 3.5900000000000004e-06, + "loss": 0.1501, + "step": 1436 + }, + { + "epoch": 0.017975449386234656, + "grad_norm": 6.692288398742676, + "learning_rate": 3.5950000000000003e-06, + "loss": 3.1963, + "step": 1438 + }, + { + "epoch": 0.01800045001125028, + "grad_norm": 3.2029430866241455, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7895, + "step": 1440 + }, + { + "epoch": 0.018025450636265906, + "grad_norm": 7.209542751312256, + "learning_rate": 3.6050000000000002e-06, + "loss": 1.5115, + "step": 1442 + }, + { + "epoch": 0.018050451261281533, + "grad_norm": 2.5491597652435303, + "learning_rate": 3.61e-06, + "loss": 0.6852, + "step": 1444 + }, + { + "epoch": 0.018075451886297157, + "grad_norm": 3.5385613441467285, + "learning_rate": 3.615e-06, + "loss": 1.6268, + "step": 1446 + }, + { + "epoch": 0.018100452511312783, + "grad_norm": 8.47276496887207, + "learning_rate": 3.62e-06, + "loss": 1.346, + "step": 1448 + }, + { + "epoch": 0.018125453136328407, + "grad_norm": 3.3489065170288086, + "learning_rate": 3.625e-06, + "loss": 0.8224, + "step": 1450 + }, + { + "epoch": 0.018150453761344033, + "grad_norm": 5.739799976348877, + "learning_rate": 3.6300000000000004e-06, + "loss": 1.6266, + "step": 1452 + }, + { + "epoch": 0.01817545438635966, + "grad_norm": 10.512731552124023, + "learning_rate": 3.6350000000000003e-06, + "loss": 1.4308, + "step": 1454 + }, + { + "epoch": 0.018200455011375283, + "grad_norm": 3.004159688949585, + "learning_rate": 3.6400000000000003e-06, + "loss": 1.6361, + "step": 1456 + }, + { + "epoch": 0.01822545563639091, + "grad_norm": 5.707242012023926, + "learning_rate": 3.6450000000000003e-06, + "loss": 1.0315, + "step": 1458 + }, + { + "epoch": 0.018250456261406534, + "grad_norm": 4.99793004989624, + "learning_rate": 3.65e-06, + "loss": 1.3165, + "step": 1460 + }, + { + "epoch": 0.01827545688642216, + "grad_norm": 16.01923370361328, + "learning_rate": 3.655e-06, + "loss": 0.6928, + "step": 1462 + }, + { + "epoch": 0.018300457511437787, + "grad_norm": 7.288991451263428, + "learning_rate": 3.66e-06, + "loss": 1.4485, + "step": 1464 + }, + { + "epoch": 0.01832545813645341, + "grad_norm": 4.432981014251709, + "learning_rate": 3.665e-06, + "loss": 1.9894, + "step": 1466 + }, + { + "epoch": 0.018350458761469037, + "grad_norm": 3.3517932891845703, + "learning_rate": 3.6700000000000004e-06, + "loss": 0.5998, + "step": 1468 + }, + { + "epoch": 0.01837545938648466, + "grad_norm": 4.958065986633301, + "learning_rate": 3.6750000000000004e-06, + "loss": 1.8911, + "step": 1470 + }, + { + "epoch": 0.018400460011500287, + "grad_norm": 1.6755849123001099, + "learning_rate": 3.6800000000000003e-06, + "loss": 0.5996, + "step": 1472 + }, + { + "epoch": 0.018425460636515914, + "grad_norm": 5.23293399810791, + "learning_rate": 3.6850000000000003e-06, + "loss": 1.4153, + "step": 1474 + }, + { + "epoch": 0.018450461261531537, + "grad_norm": 5.890183448791504, + "learning_rate": 3.6900000000000002e-06, + "loss": 1.9285, + "step": 1476 + }, + { + "epoch": 0.018475461886547164, + "grad_norm": 6.712188720703125, + "learning_rate": 3.695e-06, + "loss": 0.1872, + "step": 1478 + }, + { + "epoch": 0.018500462511562787, + "grad_norm": 4.382708549499512, + "learning_rate": 3.7e-06, + "loss": 0.551, + "step": 1480 + }, + { + "epoch": 0.018525463136578414, + "grad_norm": 2.600803852081299, + "learning_rate": 3.705e-06, + "loss": 1.9873, + "step": 1482 + }, + { + "epoch": 0.01855046376159404, + "grad_norm": 7.491966724395752, + "learning_rate": 3.7100000000000005e-06, + "loss": 1.3555, + "step": 1484 + }, + { + "epoch": 0.018575464386609664, + "grad_norm": 5.425282001495361, + "learning_rate": 3.7150000000000004e-06, + "loss": 1.0287, + "step": 1486 + }, + { + "epoch": 0.01860046501162529, + "grad_norm": 4.35009765625, + "learning_rate": 3.7200000000000004e-06, + "loss": 1.2867, + "step": 1488 + }, + { + "epoch": 0.018625465636640914, + "grad_norm": 5.396328926086426, + "learning_rate": 3.7250000000000003e-06, + "loss": 0.9474, + "step": 1490 + }, + { + "epoch": 0.01865046626165654, + "grad_norm": 2.302100419998169, + "learning_rate": 3.7300000000000003e-06, + "loss": 0.6464, + "step": 1492 + }, + { + "epoch": 0.018675466886672168, + "grad_norm": 1.1674795150756836, + "learning_rate": 3.7350000000000002e-06, + "loss": 0.936, + "step": 1494 + }, + { + "epoch": 0.01870046751168779, + "grad_norm": 5.182133197784424, + "learning_rate": 3.74e-06, + "loss": 1.6, + "step": 1496 + }, + { + "epoch": 0.018725468136703418, + "grad_norm": 2.1812498569488525, + "learning_rate": 3.745e-06, + "loss": 1.7459, + "step": 1498 + }, + { + "epoch": 0.01875046876171904, + "grad_norm": 3.823646306991577, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.441, + "step": 1500 + }, + { + "epoch": 0.018775469386734668, + "grad_norm": 1.6065586805343628, + "learning_rate": 3.7550000000000005e-06, + "loss": 0.8477, + "step": 1502 + }, + { + "epoch": 0.018800470011750295, + "grad_norm": 0.13129039108753204, + "learning_rate": 3.7600000000000004e-06, + "loss": 0.573, + "step": 1504 + }, + { + "epoch": 0.018825470636765918, + "grad_norm": 3.782097339630127, + "learning_rate": 3.7650000000000004e-06, + "loss": 1.2036, + "step": 1506 + }, + { + "epoch": 0.018850471261781545, + "grad_norm": 5.120966911315918, + "learning_rate": 3.7700000000000003e-06, + "loss": 0.8436, + "step": 1508 + }, + { + "epoch": 0.018875471886797168, + "grad_norm": 3.4724977016448975, + "learning_rate": 3.7750000000000003e-06, + "loss": 0.4969, + "step": 1510 + }, + { + "epoch": 0.018900472511812795, + "grad_norm": 4.051347732543945, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.4617, + "step": 1512 + }, + { + "epoch": 0.018925473136828422, + "grad_norm": 11.29021167755127, + "learning_rate": 3.785e-06, + "loss": 1.1598, + "step": 1514 + }, + { + "epoch": 0.018950473761844045, + "grad_norm": 8.412789344787598, + "learning_rate": 3.79e-06, + "loss": 1.1885, + "step": 1516 + }, + { + "epoch": 0.018975474386859672, + "grad_norm": 5.402141094207764, + "learning_rate": 3.7950000000000005e-06, + "loss": 0.1993, + "step": 1518 + }, + { + "epoch": 0.019000475011875295, + "grad_norm": 9.354228019714355, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.1845, + "step": 1520 + }, + { + "epoch": 0.019025475636890922, + "grad_norm": 0.18336676061153412, + "learning_rate": 3.8050000000000004e-06, + "loss": 0.6342, + "step": 1522 + }, + { + "epoch": 0.01905047626190655, + "grad_norm": 5.147608757019043, + "learning_rate": 3.8100000000000004e-06, + "loss": 2.0047, + "step": 1524 + }, + { + "epoch": 0.019075476886922172, + "grad_norm": 17.84864044189453, + "learning_rate": 3.815000000000001e-06, + "loss": 0.4512, + "step": 1526 + }, + { + "epoch": 0.0191004775119378, + "grad_norm": 3.2128236293792725, + "learning_rate": 3.820000000000001e-06, + "loss": 0.9759, + "step": 1528 + }, + { + "epoch": 0.019125478136953422, + "grad_norm": 0.4679546058177948, + "learning_rate": 3.825000000000001e-06, + "loss": 0.2713, + "step": 1530 + }, + { + "epoch": 0.01915047876196905, + "grad_norm": 3.107508420944214, + "learning_rate": 3.830000000000001e-06, + "loss": 1.3044, + "step": 1532 + }, + { + "epoch": 0.019175479386984676, + "grad_norm": 3.275045156478882, + "learning_rate": 3.8350000000000006e-06, + "loss": 2.0141, + "step": 1534 + }, + { + "epoch": 0.0192004800120003, + "grad_norm": 7.310245037078857, + "learning_rate": 3.8400000000000005e-06, + "loss": 2.0546, + "step": 1536 + }, + { + "epoch": 0.019225480637015926, + "grad_norm": 3.457474708557129, + "learning_rate": 3.8450000000000005e-06, + "loss": 1.4345, + "step": 1538 + }, + { + "epoch": 0.01925048126203155, + "grad_norm": 7.814455032348633, + "learning_rate": 3.85e-06, + "loss": 1.0304, + "step": 1540 + }, + { + "epoch": 0.019275481887047176, + "grad_norm": 6.828482627868652, + "learning_rate": 3.855e-06, + "loss": 0.4697, + "step": 1542 + }, + { + "epoch": 0.019300482512062803, + "grad_norm": 4.276771068572998, + "learning_rate": 3.86e-06, + "loss": 1.8474, + "step": 1544 + }, + { + "epoch": 0.019325483137078426, + "grad_norm": 6.095361709594727, + "learning_rate": 3.865e-06, + "loss": 1.6391, + "step": 1546 + }, + { + "epoch": 0.019350483762094053, + "grad_norm": 4.690300941467285, + "learning_rate": 3.87e-06, + "loss": 1.127, + "step": 1548 + }, + { + "epoch": 0.019375484387109676, + "grad_norm": 19.6389102935791, + "learning_rate": 3.875e-06, + "loss": 0.1467, + "step": 1550 + }, + { + "epoch": 0.019400485012125303, + "grad_norm": 5.230897903442383, + "learning_rate": 3.88e-06, + "loss": 1.4272, + "step": 1552 + }, + { + "epoch": 0.01942548563714093, + "grad_norm": 5.714294910430908, + "learning_rate": 3.885e-06, + "loss": 1.526, + "step": 1554 + }, + { + "epoch": 0.019450486262156553, + "grad_norm": 0.6870523691177368, + "learning_rate": 3.89e-06, + "loss": 0.0051, + "step": 1556 + }, + { + "epoch": 0.01947548688717218, + "grad_norm": 7.913646697998047, + "learning_rate": 3.895000000000001e-06, + "loss": 1.2528, + "step": 1558 + }, + { + "epoch": 0.019500487512187803, + "grad_norm": 5.22213888168335, + "learning_rate": 3.900000000000001e-06, + "loss": 0.4742, + "step": 1560 + }, + { + "epoch": 0.01952548813720343, + "grad_norm": 2.0015101432800293, + "learning_rate": 3.905000000000001e-06, + "loss": 0.5129, + "step": 1562 + }, + { + "epoch": 0.019550488762219057, + "grad_norm": 15.245219230651855, + "learning_rate": 3.910000000000001e-06, + "loss": 2.0334, + "step": 1564 + }, + { + "epoch": 0.01957548938723468, + "grad_norm": 5.413457870483398, + "learning_rate": 3.915000000000001e-06, + "loss": 1.6559, + "step": 1566 + }, + { + "epoch": 0.019600490012250307, + "grad_norm": 9.123312950134277, + "learning_rate": 3.920000000000001e-06, + "loss": 1.763, + "step": 1568 + }, + { + "epoch": 0.019625490637265933, + "grad_norm": 5.879669666290283, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.7586, + "step": 1570 + }, + { + "epoch": 0.019650491262281557, + "grad_norm": 3.072995185852051, + "learning_rate": 3.9300000000000005e-06, + "loss": 0.6832, + "step": 1572 + }, + { + "epoch": 0.019675491887297183, + "grad_norm": 0.7766585946083069, + "learning_rate": 3.9350000000000004e-06, + "loss": 0.3304, + "step": 1574 + }, + { + "epoch": 0.019700492512312807, + "grad_norm": 5.376052379608154, + "learning_rate": 3.94e-06, + "loss": 1.1929, + "step": 1576 + }, + { + "epoch": 0.019725493137328434, + "grad_norm": 11.517060279846191, + "learning_rate": 3.945e-06, + "loss": 0.8003, + "step": 1578 + }, + { + "epoch": 0.01975049376234406, + "grad_norm": 5.206607341766357, + "learning_rate": 3.95e-06, + "loss": 1.0408, + "step": 1580 + }, + { + "epoch": 0.019775494387359684, + "grad_norm": 4.695005416870117, + "learning_rate": 3.955e-06, + "loss": 0.774, + "step": 1582 + }, + { + "epoch": 0.01980049501237531, + "grad_norm": 3.244180679321289, + "learning_rate": 3.96e-06, + "loss": 1.1695, + "step": 1584 + }, + { + "epoch": 0.019825495637390934, + "grad_norm": 5.081257343292236, + "learning_rate": 3.965e-06, + "loss": 0.7717, + "step": 1586 + }, + { + "epoch": 0.01985049626240656, + "grad_norm": 7.281128406524658, + "learning_rate": 3.97e-06, + "loss": 1.4347, + "step": 1588 + }, + { + "epoch": 0.019875496887422187, + "grad_norm": 8.94804573059082, + "learning_rate": 3.975000000000001e-06, + "loss": 0.3398, + "step": 1590 + }, + { + "epoch": 0.01990049751243781, + "grad_norm": 4.35692834854126, + "learning_rate": 3.980000000000001e-06, + "loss": 0.8955, + "step": 1592 + }, + { + "epoch": 0.019925498137453437, + "grad_norm": 17.141939163208008, + "learning_rate": 3.985000000000001e-06, + "loss": 0.6445, + "step": 1594 + }, + { + "epoch": 0.01995049876246906, + "grad_norm": 0.3205932676792145, + "learning_rate": 3.990000000000001e-06, + "loss": 0.5055, + "step": 1596 + }, + { + "epoch": 0.019975499387484687, + "grad_norm": 0.38185539841651917, + "learning_rate": 3.995000000000001e-06, + "loss": 0.2592, + "step": 1598 + }, + { + "epoch": 0.020000500012500314, + "grad_norm": 6.062458038330078, + "learning_rate": 4.000000000000001e-06, + "loss": 1.8216, + "step": 1600 + }, + { + "epoch": 0.020025500637515938, + "grad_norm": 5.592759609222412, + "learning_rate": 4.005000000000001e-06, + "loss": 1.0001, + "step": 1602 + }, + { + "epoch": 0.020050501262531564, + "grad_norm": 6.06761360168457, + "learning_rate": 4.0100000000000006e-06, + "loss": 0.8174, + "step": 1604 + }, + { + "epoch": 0.020075501887547188, + "grad_norm": 8.678956985473633, + "learning_rate": 4.0150000000000005e-06, + "loss": 1.4174, + "step": 1606 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 7.352917671203613, + "learning_rate": 4.0200000000000005e-06, + "loss": 1.0101, + "step": 1608 + }, + { + "epoch": 0.02012550313757844, + "grad_norm": 4.8034515380859375, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.9868, + "step": 1610 + }, + { + "epoch": 0.020150503762594064, + "grad_norm": 28.54228401184082, + "learning_rate": 4.03e-06, + "loss": 1.6249, + "step": 1612 + }, + { + "epoch": 0.02017550438760969, + "grad_norm": 6.471149921417236, + "learning_rate": 4.035e-06, + "loss": 1.3049, + "step": 1614 + }, + { + "epoch": 0.020200505012625315, + "grad_norm": 3.335527181625366, + "learning_rate": 4.04e-06, + "loss": 0.6382, + "step": 1616 + }, + { + "epoch": 0.02022550563764094, + "grad_norm": 6.9388427734375, + "learning_rate": 4.045e-06, + "loss": 1.1179, + "step": 1618 + }, + { + "epoch": 0.020250506262656568, + "grad_norm": 10.575672149658203, + "learning_rate": 4.05e-06, + "loss": 1.0496, + "step": 1620 + }, + { + "epoch": 0.02027550688767219, + "grad_norm": 4.838927745819092, + "learning_rate": 4.055000000000001e-06, + "loss": 1.0602, + "step": 1622 + }, + { + "epoch": 0.020300507512687818, + "grad_norm": 17.77973747253418, + "learning_rate": 4.060000000000001e-06, + "loss": 0.4789, + "step": 1624 + }, + { + "epoch": 0.02032550813770344, + "grad_norm": 11.179519653320312, + "learning_rate": 4.065e-06, + "loss": 1.123, + "step": 1626 + }, + { + "epoch": 0.020350508762719068, + "grad_norm": 5.714705467224121, + "learning_rate": 4.07e-06, + "loss": 1.8803, + "step": 1628 + }, + { + "epoch": 0.020375509387734695, + "grad_norm": 3.4289746284484863, + "learning_rate": 4.075e-06, + "loss": 0.5127, + "step": 1630 + }, + { + "epoch": 0.02040051001275032, + "grad_norm": 5.1447978019714355, + "learning_rate": 4.08e-06, + "loss": 1.5572, + "step": 1632 + }, + { + "epoch": 0.020425510637765945, + "grad_norm": 11.60823917388916, + "learning_rate": 4.085e-06, + "loss": 1.167, + "step": 1634 + }, + { + "epoch": 0.02045051126278157, + "grad_norm": 8.25395393371582, + "learning_rate": 4.09e-06, + "loss": 3.4028, + "step": 1636 + }, + { + "epoch": 0.020475511887797195, + "grad_norm": 1.8757425546646118, + "learning_rate": 4.095e-06, + "loss": 0.4763, + "step": 1638 + }, + { + "epoch": 0.020500512512812822, + "grad_norm": 5.837463855743408, + "learning_rate": 4.1e-06, + "loss": 1.8455, + "step": 1640 + }, + { + "epoch": 0.020525513137828445, + "grad_norm": 4.056856632232666, + "learning_rate": 4.1050000000000005e-06, + "loss": 1.2215, + "step": 1642 + }, + { + "epoch": 0.020550513762844072, + "grad_norm": 6.948950290679932, + "learning_rate": 4.1100000000000005e-06, + "loss": 1.0706, + "step": 1644 + }, + { + "epoch": 0.020575514387859695, + "grad_norm": 5.434691429138184, + "learning_rate": 4.115e-06, + "loss": 1.5309, + "step": 1646 + }, + { + "epoch": 0.020600515012875322, + "grad_norm": 4.153079032897949, + "learning_rate": 4.12e-06, + "loss": 0.9039, + "step": 1648 + }, + { + "epoch": 0.02062551563789095, + "grad_norm": 5.327980041503906, + "learning_rate": 4.125e-06, + "loss": 0.6095, + "step": 1650 + }, + { + "epoch": 0.020650516262906572, + "grad_norm": 8.792791366577148, + "learning_rate": 4.13e-06, + "loss": 1.236, + "step": 1652 + }, + { + "epoch": 0.0206755168879222, + "grad_norm": 2.849252462387085, + "learning_rate": 4.135e-06, + "loss": 0.441, + "step": 1654 + }, + { + "epoch": 0.020700517512937822, + "grad_norm": 19.992656707763672, + "learning_rate": 4.14e-06, + "loss": 1.5014, + "step": 1656 + }, + { + "epoch": 0.02072551813795345, + "grad_norm": 6.2397356033325195, + "learning_rate": 4.145e-06, + "loss": 1.521, + "step": 1658 + }, + { + "epoch": 0.020750518762969076, + "grad_norm": 4.158559799194336, + "learning_rate": 4.15e-06, + "loss": 0.1311, + "step": 1660 + }, + { + "epoch": 0.0207755193879847, + "grad_norm": 20.387958526611328, + "learning_rate": 4.155e-06, + "loss": 0.5206, + "step": 1662 + }, + { + "epoch": 0.020800520013000326, + "grad_norm": 4.728527069091797, + "learning_rate": 4.16e-06, + "loss": 2.2764, + "step": 1664 + }, + { + "epoch": 0.02082552063801595, + "grad_norm": 5.974364280700684, + "learning_rate": 4.165e-06, + "loss": 1.2143, + "step": 1666 + }, + { + "epoch": 0.020850521263031576, + "grad_norm": 5.907720565795898, + "learning_rate": 4.17e-06, + "loss": 1.5744, + "step": 1668 + }, + { + "epoch": 0.020875521888047203, + "grad_norm": 3.619338274002075, + "learning_rate": 4.175e-06, + "loss": 1.1206, + "step": 1670 + }, + { + "epoch": 0.020900522513062826, + "grad_norm": 8.586311340332031, + "learning_rate": 4.18e-06, + "loss": 0.9719, + "step": 1672 + }, + { + "epoch": 0.020925523138078453, + "grad_norm": 3.6847963333129883, + "learning_rate": 4.185000000000001e-06, + "loss": 0.4386, + "step": 1674 + }, + { + "epoch": 0.020950523763094076, + "grad_norm": 0.1761302947998047, + "learning_rate": 4.1900000000000005e-06, + "loss": 0.5247, + "step": 1676 + }, + { + "epoch": 0.020975524388109703, + "grad_norm": 6.226249694824219, + "learning_rate": 4.1950000000000005e-06, + "loss": 0.4753, + "step": 1678 + }, + { + "epoch": 0.02100052501312533, + "grad_norm": 9.165534973144531, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.6314, + "step": 1680 + }, + { + "epoch": 0.021025525638140953, + "grad_norm": 0.7852228879928589, + "learning_rate": 4.205e-06, + "loss": 0.5704, + "step": 1682 + }, + { + "epoch": 0.02105052626315658, + "grad_norm": 4.28530740737915, + "learning_rate": 4.21e-06, + "loss": 1.3988, + "step": 1684 + }, + { + "epoch": 0.021075526888172203, + "grad_norm": 4.502480506896973, + "learning_rate": 4.215e-06, + "loss": 0.0956, + "step": 1686 + }, + { + "epoch": 0.02110052751318783, + "grad_norm": 13.522788047790527, + "learning_rate": 4.22e-06, + "loss": 1.8208, + "step": 1688 + }, + { + "epoch": 0.021125528138203457, + "grad_norm": 8.479382514953613, + "learning_rate": 4.225e-06, + "loss": 0.6763, + "step": 1690 + }, + { + "epoch": 0.02115052876321908, + "grad_norm": 8.093138694763184, + "learning_rate": 4.23e-06, + "loss": 2.7042, + "step": 1692 + }, + { + "epoch": 0.021175529388234707, + "grad_norm": 0.08447739481925964, + "learning_rate": 4.235e-06, + "loss": 0.8753, + "step": 1694 + }, + { + "epoch": 0.02120053001325033, + "grad_norm": 7.573391437530518, + "learning_rate": 4.24e-06, + "loss": 1.384, + "step": 1696 + }, + { + "epoch": 0.021225530638265957, + "grad_norm": 1.912915587425232, + "learning_rate": 4.245e-06, + "loss": 0.6336, + "step": 1698 + }, + { + "epoch": 0.021250531263281584, + "grad_norm": 13.668278694152832, + "learning_rate": 4.25e-06, + "loss": 0.4222, + "step": 1700 + }, + { + "epoch": 0.021275531888297207, + "grad_norm": 3.632990837097168, + "learning_rate": 4.255e-06, + "loss": 1.1892, + "step": 1702 + }, + { + "epoch": 0.021300532513312834, + "grad_norm": 3.399291515350342, + "learning_rate": 4.26e-06, + "loss": 0.5358, + "step": 1704 + }, + { + "epoch": 0.021325533138328457, + "grad_norm": 4.140748977661133, + "learning_rate": 4.265000000000001e-06, + "loss": 1.2509, + "step": 1706 + }, + { + "epoch": 0.021350533763344084, + "grad_norm": 5.467991352081299, + "learning_rate": 4.270000000000001e-06, + "loss": 1.2874, + "step": 1708 + }, + { + "epoch": 0.02137553438835971, + "grad_norm": 5.809440612792969, + "learning_rate": 4.2750000000000006e-06, + "loss": 1.6392, + "step": 1710 + }, + { + "epoch": 0.021400535013375334, + "grad_norm": 9.520228385925293, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.1143, + "step": 1712 + }, + { + "epoch": 0.02142553563839096, + "grad_norm": 5.714423179626465, + "learning_rate": 4.2850000000000005e-06, + "loss": 1.8939, + "step": 1714 + }, + { + "epoch": 0.021450536263406584, + "grad_norm": 5.93101167678833, + "learning_rate": 4.2900000000000004e-06, + "loss": 1.3323, + "step": 1716 + }, + { + "epoch": 0.02147553688842221, + "grad_norm": 7.559731960296631, + "learning_rate": 4.295e-06, + "loss": 0.5531, + "step": 1718 + }, + { + "epoch": 0.021500537513437838, + "grad_norm": 1.8286492824554443, + "learning_rate": 4.3e-06, + "loss": 1.2097, + "step": 1720 + }, + { + "epoch": 0.02152553813845346, + "grad_norm": 5.059815406799316, + "learning_rate": 4.305e-06, + "loss": 1.4767, + "step": 1722 + }, + { + "epoch": 0.021550538763469088, + "grad_norm": 4.438327789306641, + "learning_rate": 4.31e-06, + "loss": 1.1234, + "step": 1724 + }, + { + "epoch": 0.02157553938848471, + "grad_norm": 4.635177135467529, + "learning_rate": 4.315e-06, + "loss": 1.0718, + "step": 1726 + }, + { + "epoch": 0.021600540013500338, + "grad_norm": 10.60247802734375, + "learning_rate": 4.32e-06, + "loss": 0.8391, + "step": 1728 + }, + { + "epoch": 0.021625540638515964, + "grad_norm": 6.18443489074707, + "learning_rate": 4.325e-06, + "loss": 1.2539, + "step": 1730 + }, + { + "epoch": 0.021650541263531588, + "grad_norm": 5.132458209991455, + "learning_rate": 4.33e-06, + "loss": 1.0436, + "step": 1732 + }, + { + "epoch": 0.021675541888547215, + "grad_norm": 2.096583127975464, + "learning_rate": 4.335e-06, + "loss": 0.1959, + "step": 1734 + }, + { + "epoch": 0.021700542513562838, + "grad_norm": 0.25161150097846985, + "learning_rate": 4.34e-06, + "loss": 0.7696, + "step": 1736 + }, + { + "epoch": 0.021725543138578465, + "grad_norm": 12.427597045898438, + "learning_rate": 4.345000000000001e-06, + "loss": 0.5883, + "step": 1738 + }, + { + "epoch": 0.02175054376359409, + "grad_norm": 4.851448059082031, + "learning_rate": 4.350000000000001e-06, + "loss": 0.4471, + "step": 1740 + }, + { + "epoch": 0.021775544388609715, + "grad_norm": 9.657316207885742, + "learning_rate": 4.355000000000001e-06, + "loss": 1.321, + "step": 1742 + }, + { + "epoch": 0.02180054501362534, + "grad_norm": 3.3799076080322266, + "learning_rate": 4.360000000000001e-06, + "loss": 1.0189, + "step": 1744 + }, + { + "epoch": 0.021825545638640965, + "grad_norm": 6.16371488571167, + "learning_rate": 4.3650000000000006e-06, + "loss": 1.1922, + "step": 1746 + }, + { + "epoch": 0.02185054626365659, + "grad_norm": 0.19362249970436096, + "learning_rate": 4.3700000000000005e-06, + "loss": 0.1331, + "step": 1748 + }, + { + "epoch": 0.02187554688867222, + "grad_norm": 6.0354743003845215, + "learning_rate": 4.3750000000000005e-06, + "loss": 1.3944, + "step": 1750 + }, + { + "epoch": 0.02190054751368784, + "grad_norm": 4.702625751495361, + "learning_rate": 4.38e-06, + "loss": 1.852, + "step": 1752 + }, + { + "epoch": 0.02192554813870347, + "grad_norm": 5.822668075561523, + "learning_rate": 4.385e-06, + "loss": 1.0173, + "step": 1754 + }, + { + "epoch": 0.02195054876371909, + "grad_norm": 5.000926494598389, + "learning_rate": 4.39e-06, + "loss": 1.2995, + "step": 1756 + }, + { + "epoch": 0.02197554938873472, + "grad_norm": 5.577164649963379, + "learning_rate": 4.395e-06, + "loss": 1.2711, + "step": 1758 + }, + { + "epoch": 0.022000550013750345, + "grad_norm": 10.147957801818848, + "learning_rate": 4.4e-06, + "loss": 1.7199, + "step": 1760 + }, + { + "epoch": 0.02202555063876597, + "grad_norm": 5.149634838104248, + "learning_rate": 4.405e-06, + "loss": 1.8854, + "step": 1762 + }, + { + "epoch": 0.022050551263781595, + "grad_norm": 5.4645867347717285, + "learning_rate": 4.41e-06, + "loss": 1.4749, + "step": 1764 + }, + { + "epoch": 0.02207555188879722, + "grad_norm": 0.038768790662288666, + "learning_rate": 4.415e-06, + "loss": 0.5171, + "step": 1766 + }, + { + "epoch": 0.022100552513812845, + "grad_norm": 7.952024936676025, + "learning_rate": 4.42e-06, + "loss": 1.4092, + "step": 1768 + }, + { + "epoch": 0.022125553138828472, + "grad_norm": 6.446439266204834, + "learning_rate": 4.425e-06, + "loss": 1.664, + "step": 1770 + }, + { + "epoch": 0.022150553763844096, + "grad_norm": 6.178738117218018, + "learning_rate": 4.430000000000001e-06, + "loss": 1.4694, + "step": 1772 + }, + { + "epoch": 0.022175554388859722, + "grad_norm": 4.139686584472656, + "learning_rate": 4.435000000000001e-06, + "loss": 0.8245, + "step": 1774 + }, + { + "epoch": 0.022200555013875346, + "grad_norm": 3.3275973796844482, + "learning_rate": 4.440000000000001e-06, + "loss": 1.5682, + "step": 1776 + }, + { + "epoch": 0.022225555638890972, + "grad_norm": 5.361576080322266, + "learning_rate": 4.445000000000001e-06, + "loss": 1.5856, + "step": 1778 + }, + { + "epoch": 0.0222505562639066, + "grad_norm": 5.487329483032227, + "learning_rate": 4.450000000000001e-06, + "loss": 1.0759, + "step": 1780 + }, + { + "epoch": 0.022275556888922222, + "grad_norm": 7.934917449951172, + "learning_rate": 4.4550000000000005e-06, + "loss": 1.3343, + "step": 1782 + }, + { + "epoch": 0.02230055751393785, + "grad_norm": 5.0733747482299805, + "learning_rate": 4.4600000000000005e-06, + "loss": 1.0897, + "step": 1784 + }, + { + "epoch": 0.022325558138953473, + "grad_norm": 5.945675373077393, + "learning_rate": 4.4650000000000004e-06, + "loss": 1.252, + "step": 1786 + }, + { + "epoch": 0.0223505587639691, + "grad_norm": 5.091784477233887, + "learning_rate": 4.47e-06, + "loss": 1.1608, + "step": 1788 + }, + { + "epoch": 0.022375559388984726, + "grad_norm": 9.925880432128906, + "learning_rate": 4.475e-06, + "loss": 0.0912, + "step": 1790 + }, + { + "epoch": 0.02240056001400035, + "grad_norm": 4.9283127784729, + "learning_rate": 4.48e-06, + "loss": 0.5905, + "step": 1792 + }, + { + "epoch": 0.022425560639015976, + "grad_norm": 3.611419677734375, + "learning_rate": 4.485e-06, + "loss": 1.2309, + "step": 1794 + }, + { + "epoch": 0.0224505612640316, + "grad_norm": 3.2002058029174805, + "learning_rate": 4.49e-06, + "loss": 0.5874, + "step": 1796 + }, + { + "epoch": 0.022475561889047226, + "grad_norm": 4.83671760559082, + "learning_rate": 4.495e-06, + "loss": 1.4785, + "step": 1798 + }, + { + "epoch": 0.022500562514062853, + "grad_norm": 12.390768051147461, + "learning_rate": 4.5e-06, + "loss": 1.8503, + "step": 1800 + }, + { + "epoch": 0.022525563139078476, + "grad_norm": 6.23271369934082, + "learning_rate": 4.505e-06, + "loss": 0.3725, + "step": 1802 + }, + { + "epoch": 0.022550563764094103, + "grad_norm": 2.919323444366455, + "learning_rate": 4.510000000000001e-06, + "loss": 0.8251, + "step": 1804 + }, + { + "epoch": 0.022575564389109726, + "grad_norm": 12.005715370178223, + "learning_rate": 4.515000000000001e-06, + "loss": 0.9222, + "step": 1806 + }, + { + "epoch": 0.022600565014125353, + "grad_norm": 6.110873222351074, + "learning_rate": 4.520000000000001e-06, + "loss": 1.4457, + "step": 1808 + }, + { + "epoch": 0.02262556563914098, + "grad_norm": 11.622342109680176, + "learning_rate": 4.525000000000001e-06, + "loss": 2.7071, + "step": 1810 + }, + { + "epoch": 0.022650566264156603, + "grad_norm": 4.146234035491943, + "learning_rate": 4.530000000000001e-06, + "loss": 0.9982, + "step": 1812 + }, + { + "epoch": 0.02267556688917223, + "grad_norm": 12.843299865722656, + "learning_rate": 4.535000000000001e-06, + "loss": 1.4061, + "step": 1814 + }, + { + "epoch": 0.022700567514187853, + "grad_norm": 7.355935096740723, + "learning_rate": 4.540000000000001e-06, + "loss": 1.0978, + "step": 1816 + }, + { + "epoch": 0.02272556813920348, + "grad_norm": 0.07495700567960739, + "learning_rate": 4.5450000000000005e-06, + "loss": 0.489, + "step": 1818 + }, + { + "epoch": 0.022750568764219107, + "grad_norm": 2.8531012535095215, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.6683, + "step": 1820 + }, + { + "epoch": 0.02277556938923473, + "grad_norm": 4.403096675872803, + "learning_rate": 4.5550000000000004e-06, + "loss": 1.0079, + "step": 1822 + }, + { + "epoch": 0.022800570014250357, + "grad_norm": 27.825340270996094, + "learning_rate": 4.56e-06, + "loss": 0.5352, + "step": 1824 + }, + { + "epoch": 0.02282557063926598, + "grad_norm": 4.201235294342041, + "learning_rate": 4.565e-06, + "loss": 1.027, + "step": 1826 + }, + { + "epoch": 0.022850571264281607, + "grad_norm": 6.478667259216309, + "learning_rate": 4.57e-06, + "loss": 0.8169, + "step": 1828 + }, + { + "epoch": 0.022875571889297234, + "grad_norm": 4.57330846786499, + "learning_rate": 4.575e-06, + "loss": 1.2228, + "step": 1830 + }, + { + "epoch": 0.022900572514312857, + "grad_norm": 5.403872966766357, + "learning_rate": 4.58e-06, + "loss": 1.1627, + "step": 1832 + }, + { + "epoch": 0.022925573139328484, + "grad_norm": 3.013397216796875, + "learning_rate": 4.585e-06, + "loss": 0.4145, + "step": 1834 + }, + { + "epoch": 0.022950573764344107, + "grad_norm": 0.11450282484292984, + "learning_rate": 4.590000000000001e-06, + "loss": 0.1416, + "step": 1836 + }, + { + "epoch": 0.022975574389359734, + "grad_norm": 3.6899635791778564, + "learning_rate": 4.595000000000001e-06, + "loss": 1.5243, + "step": 1838 + }, + { + "epoch": 0.02300057501437536, + "grad_norm": 9.465859413146973, + "learning_rate": 4.600000000000001e-06, + "loss": 0.9515, + "step": 1840 + }, + { + "epoch": 0.023025575639390984, + "grad_norm": 7.667425155639648, + "learning_rate": 4.605000000000001e-06, + "loss": 1.4502, + "step": 1842 + }, + { + "epoch": 0.02305057626440661, + "grad_norm": 0.1172448992729187, + "learning_rate": 4.610000000000001e-06, + "loss": 0.4584, + "step": 1844 + }, + { + "epoch": 0.023075576889422234, + "grad_norm": 2.1584503650665283, + "learning_rate": 4.615000000000001e-06, + "loss": 0.147, + "step": 1846 + }, + { + "epoch": 0.02310057751443786, + "grad_norm": 5.825212478637695, + "learning_rate": 4.620000000000001e-06, + "loss": 0.8334, + "step": 1848 + }, + { + "epoch": 0.023125578139453488, + "grad_norm": 2.1255860328674316, + "learning_rate": 4.625000000000001e-06, + "loss": 1.6242, + "step": 1850 + }, + { + "epoch": 0.02315057876446911, + "grad_norm": 14.240114212036133, + "learning_rate": 4.6300000000000006e-06, + "loss": 0.5643, + "step": 1852 + }, + { + "epoch": 0.023175579389484738, + "grad_norm": 3.8280746936798096, + "learning_rate": 4.6350000000000005e-06, + "loss": 0.6096, + "step": 1854 + }, + { + "epoch": 0.02320058001450036, + "grad_norm": 7.26607608795166, + "learning_rate": 4.6400000000000005e-06, + "loss": 0.8639, + "step": 1856 + }, + { + "epoch": 0.023225580639515988, + "grad_norm": 6.664104461669922, + "learning_rate": 4.645e-06, + "loss": 0.9025, + "step": 1858 + }, + { + "epoch": 0.023250581264531615, + "grad_norm": 2.808490514755249, + "learning_rate": 4.65e-06, + "loss": 0.4382, + "step": 1860 + }, + { + "epoch": 0.023275581889547238, + "grad_norm": 5.821073055267334, + "learning_rate": 4.655e-06, + "loss": 1.8215, + "step": 1862 + }, + { + "epoch": 0.023300582514562865, + "grad_norm": 9.595924377441406, + "learning_rate": 4.66e-06, + "loss": 2.3263, + "step": 1864 + }, + { + "epoch": 0.023325583139578488, + "grad_norm": 5.422000885009766, + "learning_rate": 4.665e-06, + "loss": 0.771, + "step": 1866 + }, + { + "epoch": 0.023350583764594115, + "grad_norm": 8.986736297607422, + "learning_rate": 4.670000000000001e-06, + "loss": 1.1111, + "step": 1868 + }, + { + "epoch": 0.02337558438960974, + "grad_norm": 5.5983991622924805, + "learning_rate": 4.675000000000001e-06, + "loss": 0.8591, + "step": 1870 + }, + { + "epoch": 0.023400585014625365, + "grad_norm": 3.9468092918395996, + "learning_rate": 4.680000000000001e-06, + "loss": 0.8171, + "step": 1872 + }, + { + "epoch": 0.02342558563964099, + "grad_norm": 6.161650657653809, + "learning_rate": 4.685000000000001e-06, + "loss": 1.1663, + "step": 1874 + }, + { + "epoch": 0.023450586264656615, + "grad_norm": 0.04434753954410553, + "learning_rate": 4.69e-06, + "loss": 0.4904, + "step": 1876 + }, + { + "epoch": 0.023475586889672242, + "grad_norm": 2.869147300720215, + "learning_rate": 4.695e-06, + "loss": 0.6887, + "step": 1878 + }, + { + "epoch": 0.02350058751468787, + "grad_norm": 4.325875759124756, + "learning_rate": 4.7e-06, + "loss": 0.6088, + "step": 1880 + }, + { + "epoch": 0.023525588139703492, + "grad_norm": 6.7650299072265625, + "learning_rate": 4.705e-06, + "loss": 1.5615, + "step": 1882 + }, + { + "epoch": 0.02355058876471912, + "grad_norm": 4.7980451583862305, + "learning_rate": 4.71e-06, + "loss": 0.4455, + "step": 1884 + }, + { + "epoch": 0.023575589389734742, + "grad_norm": 2.885120391845703, + "learning_rate": 4.715e-06, + "loss": 1.0763, + "step": 1886 + }, + { + "epoch": 0.02360059001475037, + "grad_norm": 4.579460144042969, + "learning_rate": 4.7200000000000005e-06, + "loss": 0.8845, + "step": 1888 + }, + { + "epoch": 0.023625590639765996, + "grad_norm": 5.013570308685303, + "learning_rate": 4.7250000000000005e-06, + "loss": 1.7075, + "step": 1890 + }, + { + "epoch": 0.02365059126478162, + "grad_norm": 3.8787293434143066, + "learning_rate": 4.7300000000000005e-06, + "loss": 0.579, + "step": 1892 + }, + { + "epoch": 0.023675591889797246, + "grad_norm": 5.264076232910156, + "learning_rate": 4.735e-06, + "loss": 1.6827, + "step": 1894 + }, + { + "epoch": 0.02370059251481287, + "grad_norm": 0.6900630593299866, + "learning_rate": 4.74e-06, + "loss": 0.2091, + "step": 1896 + }, + { + "epoch": 0.023725593139828496, + "grad_norm": 6.3414082527160645, + "learning_rate": 4.745e-06, + "loss": 2.8715, + "step": 1898 + }, + { + "epoch": 0.023750593764844122, + "grad_norm": 0.275383323431015, + "learning_rate": 4.75e-06, + "loss": 0.7954, + "step": 1900 + }, + { + "epoch": 0.023775594389859746, + "grad_norm": 6.359581470489502, + "learning_rate": 4.755e-06, + "loss": 1.0571, + "step": 1902 + }, + { + "epoch": 0.023800595014875373, + "grad_norm": 2.4573354721069336, + "learning_rate": 4.76e-06, + "loss": 1.195, + "step": 1904 + }, + { + "epoch": 0.023825595639890996, + "grad_norm": 8.996787071228027, + "learning_rate": 4.765e-06, + "loss": 0.2276, + "step": 1906 + }, + { + "epoch": 0.023850596264906623, + "grad_norm": 4.454753875732422, + "learning_rate": 4.77e-06, + "loss": 1.0496, + "step": 1908 + }, + { + "epoch": 0.02387559688992225, + "grad_norm": 3.552520513534546, + "learning_rate": 4.775e-06, + "loss": 0.2581, + "step": 1910 + }, + { + "epoch": 0.023900597514937873, + "grad_norm": 4.080337047576904, + "learning_rate": 4.78e-06, + "loss": 0.2007, + "step": 1912 + }, + { + "epoch": 0.0239255981399535, + "grad_norm": 0.56992506980896, + "learning_rate": 4.785e-06, + "loss": 0.0054, + "step": 1914 + }, + { + "epoch": 0.023950598764969123, + "grad_norm": 5.773391246795654, + "learning_rate": 4.79e-06, + "loss": 1.5971, + "step": 1916 + }, + { + "epoch": 0.02397559938998475, + "grad_norm": 3.2008235454559326, + "learning_rate": 4.795e-06, + "loss": 1.3304, + "step": 1918 + }, + { + "epoch": 0.024000600015000376, + "grad_norm": 5.9241251945495605, + "learning_rate": 4.800000000000001e-06, + "loss": 1.6981, + "step": 1920 + }, + { + "epoch": 0.024025600640016, + "grad_norm": 2.5579755306243896, + "learning_rate": 4.805000000000001e-06, + "loss": 0.7156, + "step": 1922 + }, + { + "epoch": 0.024050601265031626, + "grad_norm": 5.834214687347412, + "learning_rate": 4.8100000000000005e-06, + "loss": 1.6484, + "step": 1924 + }, + { + "epoch": 0.02407560189004725, + "grad_norm": 9.029654502868652, + "learning_rate": 4.8150000000000005e-06, + "loss": 1.5595, + "step": 1926 + }, + { + "epoch": 0.024100602515062877, + "grad_norm": 4.785956382751465, + "learning_rate": 4.8200000000000004e-06, + "loss": 1.5334, + "step": 1928 + }, + { + "epoch": 0.024125603140078503, + "grad_norm": 7.770689010620117, + "learning_rate": 4.825e-06, + "loss": 2.452, + "step": 1930 + }, + { + "epoch": 0.024150603765094127, + "grad_norm": 6.4121785163879395, + "learning_rate": 4.83e-06, + "loss": 1.7692, + "step": 1932 + }, + { + "epoch": 0.024175604390109753, + "grad_norm": 5.294864654541016, + "learning_rate": 4.835e-06, + "loss": 1.1464, + "step": 1934 + }, + { + "epoch": 0.024200605015125377, + "grad_norm": 9.703957557678223, + "learning_rate": 4.84e-06, + "loss": 1.7385, + "step": 1936 + }, + { + "epoch": 0.024225605640141003, + "grad_norm": 4.286798000335693, + "learning_rate": 4.845e-06, + "loss": 0.3776, + "step": 1938 + }, + { + "epoch": 0.02425060626515663, + "grad_norm": 4.179155349731445, + "learning_rate": 4.85e-06, + "loss": 0.9998, + "step": 1940 + }, + { + "epoch": 0.024275606890172254, + "grad_norm": 1.094642162322998, + "learning_rate": 4.855e-06, + "loss": 0.0497, + "step": 1942 + }, + { + "epoch": 0.02430060751518788, + "grad_norm": 12.948932647705078, + "learning_rate": 4.86e-06, + "loss": 0.4983, + "step": 1944 + }, + { + "epoch": 0.024325608140203504, + "grad_norm": 14.276754379272461, + "learning_rate": 4.865e-06, + "loss": 1.0307, + "step": 1946 + }, + { + "epoch": 0.02435060876521913, + "grad_norm": 3.7512359619140625, + "learning_rate": 4.87e-06, + "loss": 1.1584, + "step": 1948 + }, + { + "epoch": 0.024375609390234757, + "grad_norm": 6.091309547424316, + "learning_rate": 4.875e-06, + "loss": 0.8401, + "step": 1950 + }, + { + "epoch": 0.02440061001525038, + "grad_norm": 8.864042282104492, + "learning_rate": 4.880000000000001e-06, + "loss": 1.3154, + "step": 1952 + }, + { + "epoch": 0.024425610640266007, + "grad_norm": 4.9369635581970215, + "learning_rate": 4.885000000000001e-06, + "loss": 1.0454, + "step": 1954 + }, + { + "epoch": 0.02445061126528163, + "grad_norm": 6.221229076385498, + "learning_rate": 4.890000000000001e-06, + "loss": 1.3434, + "step": 1956 + }, + { + "epoch": 0.024475611890297257, + "grad_norm": 0.06183359771966934, + "learning_rate": 4.8950000000000006e-06, + "loss": 0.6417, + "step": 1958 + }, + { + "epoch": 0.024500612515312884, + "grad_norm": 3.0544850826263428, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.2203, + "step": 1960 + }, + { + "epoch": 0.024525613140328507, + "grad_norm": 9.81173038482666, + "learning_rate": 4.9050000000000005e-06, + "loss": 1.2989, + "step": 1962 + }, + { + "epoch": 0.024550613765344134, + "grad_norm": 3.5229787826538086, + "learning_rate": 4.9100000000000004e-06, + "loss": 1.1164, + "step": 1964 + }, + { + "epoch": 0.024575614390359758, + "grad_norm": 3.2806143760681152, + "learning_rate": 4.915e-06, + "loss": 0.7414, + "step": 1966 + }, + { + "epoch": 0.024600615015375384, + "grad_norm": 4.711001873016357, + "learning_rate": 4.92e-06, + "loss": 1.2481, + "step": 1968 + }, + { + "epoch": 0.02462561564039101, + "grad_norm": 3.3577377796173096, + "learning_rate": 4.925e-06, + "loss": 0.7405, + "step": 1970 + }, + { + "epoch": 0.024650616265406634, + "grad_norm": 3.835336208343506, + "learning_rate": 4.93e-06, + "loss": 1.9958, + "step": 1972 + }, + { + "epoch": 0.02467561689042226, + "grad_norm": 4.798999786376953, + "learning_rate": 4.935e-06, + "loss": 1.6506, + "step": 1974 + }, + { + "epoch": 0.024700617515437884, + "grad_norm": 12.432356834411621, + "learning_rate": 4.94e-06, + "loss": 0.7373, + "step": 1976 + }, + { + "epoch": 0.02472561814045351, + "grad_norm": 4.940716743469238, + "learning_rate": 4.945e-06, + "loss": 1.7292, + "step": 1978 + }, + { + "epoch": 0.024750618765469138, + "grad_norm": 4.116450309753418, + "learning_rate": 4.95e-06, + "loss": 0.9944, + "step": 1980 + }, + { + "epoch": 0.02477561939048476, + "grad_norm": 5.585843086242676, + "learning_rate": 4.955e-06, + "loss": 0.8259, + "step": 1982 + }, + { + "epoch": 0.024800620015500388, + "grad_norm": 5.660200595855713, + "learning_rate": 4.960000000000001e-06, + "loss": 1.6088, + "step": 1984 + }, + { + "epoch": 0.02482562064051601, + "grad_norm": 4.157122611999512, + "learning_rate": 4.965000000000001e-06, + "loss": 1.3935, + "step": 1986 + }, + { + "epoch": 0.024850621265531638, + "grad_norm": 4.775374889373779, + "learning_rate": 4.970000000000001e-06, + "loss": 0.9111, + "step": 1988 + }, + { + "epoch": 0.024875621890547265, + "grad_norm": 6.9031572341918945, + "learning_rate": 4.975000000000001e-06, + "loss": 1.879, + "step": 1990 + }, + { + "epoch": 0.024900622515562888, + "grad_norm": 4.134537220001221, + "learning_rate": 4.980000000000001e-06, + "loss": 0.7117, + "step": 1992 + }, + { + "epoch": 0.024925623140578515, + "grad_norm": 5.312216281890869, + "learning_rate": 4.9850000000000006e-06, + "loss": 2.1114, + "step": 1994 + }, + { + "epoch": 0.02495062376559414, + "grad_norm": 4.475594520568848, + "learning_rate": 4.9900000000000005e-06, + "loss": 1.6537, + "step": 1996 + }, + { + "epoch": 0.024975624390609765, + "grad_norm": 3.065894603729248, + "learning_rate": 4.9950000000000005e-06, + "loss": 0.976, + "step": 1998 + }, + { + "epoch": 0.025000625015625392, + "grad_norm": 5.835387706756592, + "learning_rate": 5e-06, + "loss": 0.7601, + "step": 2000 + }, + { + "epoch": 0.025025625640641015, + "grad_norm": 7.646363735198975, + "learning_rate": 5.0049999999999995e-06, + "loss": 0.9823, + "step": 2002 + }, + { + "epoch": 0.025050626265656642, + "grad_norm": 6.621387004852295, + "learning_rate": 5.01e-06, + "loss": 0.4404, + "step": 2004 + }, + { + "epoch": 0.025075626890672265, + "grad_norm": 6.917049407958984, + "learning_rate": 5.015e-06, + "loss": 1.2786, + "step": 2006 + }, + { + "epoch": 0.025100627515687892, + "grad_norm": 3.381976366043091, + "learning_rate": 5.02e-06, + "loss": 0.1479, + "step": 2008 + }, + { + "epoch": 0.02512562814070352, + "grad_norm": 3.872148036956787, + "learning_rate": 5.025e-06, + "loss": 1.4982, + "step": 2010 + }, + { + "epoch": 0.025150628765719142, + "grad_norm": 8.944808006286621, + "learning_rate": 5.03e-06, + "loss": 2.6316, + "step": 2012 + }, + { + "epoch": 0.02517562939073477, + "grad_norm": 8.206018447875977, + "learning_rate": 5.035e-06, + "loss": 2.3599, + "step": 2014 + }, + { + "epoch": 0.025200630015750392, + "grad_norm": 8.056687355041504, + "learning_rate": 5.04e-06, + "loss": 1.3191, + "step": 2016 + }, + { + "epoch": 0.02522563064076602, + "grad_norm": 6.090352535247803, + "learning_rate": 5.045e-06, + "loss": 2.1333, + "step": 2018 + }, + { + "epoch": 0.025250631265781646, + "grad_norm": 6.081569194793701, + "learning_rate": 5.050000000000001e-06, + "loss": 1.8517, + "step": 2020 + }, + { + "epoch": 0.02527563189079727, + "grad_norm": 4.579004764556885, + "learning_rate": 5.055e-06, + "loss": 0.4994, + "step": 2022 + }, + { + "epoch": 0.025300632515812896, + "grad_norm": 5.618497848510742, + "learning_rate": 5.060000000000001e-06, + "loss": 1.2945, + "step": 2024 + }, + { + "epoch": 0.02532563314082852, + "grad_norm": 6.517106533050537, + "learning_rate": 5.065e-06, + "loss": 1.7763, + "step": 2026 + }, + { + "epoch": 0.025350633765844146, + "grad_norm": 9.73573112487793, + "learning_rate": 5.070000000000001e-06, + "loss": 0.8226, + "step": 2028 + }, + { + "epoch": 0.025375634390859773, + "grad_norm": 4.583073616027832, + "learning_rate": 5.075e-06, + "loss": 1.6633, + "step": 2030 + }, + { + "epoch": 0.025400635015875396, + "grad_norm": 4.365197658538818, + "learning_rate": 5.0800000000000005e-06, + "loss": 0.3702, + "step": 2032 + }, + { + "epoch": 0.025425635640891023, + "grad_norm": 3.9542250633239746, + "learning_rate": 5.085e-06, + "loss": 0.53, + "step": 2034 + }, + { + "epoch": 0.025450636265906646, + "grad_norm": 7.2433180809021, + "learning_rate": 5.09e-06, + "loss": 0.5442, + "step": 2036 + }, + { + "epoch": 0.025475636890922273, + "grad_norm": 14.216182708740234, + "learning_rate": 5.095e-06, + "loss": 1.4089, + "step": 2038 + }, + { + "epoch": 0.0255006375159379, + "grad_norm": 2.754319906234741, + "learning_rate": 5.1e-06, + "loss": 1.1479, + "step": 2040 + }, + { + "epoch": 0.025525638140953523, + "grad_norm": 10.385480880737305, + "learning_rate": 5.105e-06, + "loss": 1.6345, + "step": 2042 + }, + { + "epoch": 0.02555063876596915, + "grad_norm": 4.231367111206055, + "learning_rate": 5.11e-06, + "loss": 0.9401, + "step": 2044 + }, + { + "epoch": 0.025575639390984773, + "grad_norm": 8.052108764648438, + "learning_rate": 5.115e-06, + "loss": 1.1679, + "step": 2046 + }, + { + "epoch": 0.0256006400160004, + "grad_norm": 4.916394233703613, + "learning_rate": 5.12e-06, + "loss": 1.8118, + "step": 2048 + }, + { + "epoch": 0.025625640641016027, + "grad_norm": 3.533641815185547, + "learning_rate": 5.125e-06, + "loss": 1.3236, + "step": 2050 + }, + { + "epoch": 0.02565064126603165, + "grad_norm": 0.43872904777526855, + "learning_rate": 5.130000000000001e-06, + "loss": 0.7651, + "step": 2052 + }, + { + "epoch": 0.025675641891047277, + "grad_norm": 3.4683520793914795, + "learning_rate": 5.135e-06, + "loss": 0.6913, + "step": 2054 + }, + { + "epoch": 0.0257006425160629, + "grad_norm": 4.748582363128662, + "learning_rate": 5.140000000000001e-06, + "loss": 0.3736, + "step": 2056 + }, + { + "epoch": 0.025725643141078527, + "grad_norm": 5.036345958709717, + "learning_rate": 5.145e-06, + "loss": 0.6167, + "step": 2058 + }, + { + "epoch": 0.025750643766094154, + "grad_norm": 3.5474090576171875, + "learning_rate": 5.150000000000001e-06, + "loss": 1.0767, + "step": 2060 + }, + { + "epoch": 0.025775644391109777, + "grad_norm": 4.333105564117432, + "learning_rate": 5.155e-06, + "loss": 0.5127, + "step": 2062 + }, + { + "epoch": 0.025800645016125404, + "grad_norm": 14.635791778564453, + "learning_rate": 5.1600000000000006e-06, + "loss": 0.4447, + "step": 2064 + }, + { + "epoch": 0.025825645641141027, + "grad_norm": 11.28579044342041, + "learning_rate": 5.165e-06, + "loss": 2.0662, + "step": 2066 + }, + { + "epoch": 0.025850646266156654, + "grad_norm": 0.1681651473045349, + "learning_rate": 5.1700000000000005e-06, + "loss": 0.1968, + "step": 2068 + }, + { + "epoch": 0.02587564689117228, + "grad_norm": 0.1906391680240631, + "learning_rate": 5.1750000000000004e-06, + "loss": 0.3165, + "step": 2070 + }, + { + "epoch": 0.025900647516187904, + "grad_norm": 0.27196604013442993, + "learning_rate": 5.18e-06, + "loss": 0.9088, + "step": 2072 + }, + { + "epoch": 0.02592564814120353, + "grad_norm": 10.85153865814209, + "learning_rate": 5.185e-06, + "loss": 0.9195, + "step": 2074 + }, + { + "epoch": 0.025950648766219154, + "grad_norm": 3.9174373149871826, + "learning_rate": 5.19e-06, + "loss": 0.691, + "step": 2076 + }, + { + "epoch": 0.02597564939123478, + "grad_norm": 2.0227959156036377, + "learning_rate": 5.195e-06, + "loss": 0.2234, + "step": 2078 + }, + { + "epoch": 0.026000650016250407, + "grad_norm": 6.444292068481445, + "learning_rate": 5.2e-06, + "loss": 1.0602, + "step": 2080 + }, + { + "epoch": 0.02602565064126603, + "grad_norm": 11.03789234161377, + "learning_rate": 5.205e-06, + "loss": 1.165, + "step": 2082 + }, + { + "epoch": 0.026050651266281658, + "grad_norm": 4.738453388214111, + "learning_rate": 5.210000000000001e-06, + "loss": 0.8328, + "step": 2084 + }, + { + "epoch": 0.02607565189129728, + "grad_norm": 4.6145524978637695, + "learning_rate": 5.215e-06, + "loss": 0.3539, + "step": 2086 + }, + { + "epoch": 0.026100652516312908, + "grad_norm": 7.50054407119751, + "learning_rate": 5.220000000000001e-06, + "loss": 0.9711, + "step": 2088 + }, + { + "epoch": 0.026125653141328534, + "grad_norm": 9.907612800598145, + "learning_rate": 5.225e-06, + "loss": 0.6239, + "step": 2090 + }, + { + "epoch": 0.026150653766344158, + "grad_norm": 4.190488338470459, + "learning_rate": 5.230000000000001e-06, + "loss": 1.676, + "step": 2092 + }, + { + "epoch": 0.026175654391359784, + "grad_norm": 4.253824234008789, + "learning_rate": 5.235e-06, + "loss": 0.6995, + "step": 2094 + }, + { + "epoch": 0.026200655016375408, + "grad_norm": 6.73296594619751, + "learning_rate": 5.240000000000001e-06, + "loss": 0.5164, + "step": 2096 + }, + { + "epoch": 0.026225655641391035, + "grad_norm": 4.414495468139648, + "learning_rate": 5.245e-06, + "loss": 1.9428, + "step": 2098 + }, + { + "epoch": 0.02625065626640666, + "grad_norm": 4.8120503425598145, + "learning_rate": 5.2500000000000006e-06, + "loss": 1.1413, + "step": 2100 + }, + { + "epoch": 0.026275656891422285, + "grad_norm": 4.861554145812988, + "learning_rate": 5.2550000000000005e-06, + "loss": 1.4916, + "step": 2102 + }, + { + "epoch": 0.02630065751643791, + "grad_norm": 4.96120023727417, + "learning_rate": 5.2600000000000005e-06, + "loss": 0.4816, + "step": 2104 + }, + { + "epoch": 0.026325658141453535, + "grad_norm": 3.600991725921631, + "learning_rate": 5.265e-06, + "loss": 0.075, + "step": 2106 + }, + { + "epoch": 0.02635065876646916, + "grad_norm": 2.2043607234954834, + "learning_rate": 5.27e-06, + "loss": 0.8624, + "step": 2108 + }, + { + "epoch": 0.026375659391484788, + "grad_norm": 4.5086894035339355, + "learning_rate": 5.275e-06, + "loss": 1.5192, + "step": 2110 + }, + { + "epoch": 0.02640066001650041, + "grad_norm": 7.038097381591797, + "learning_rate": 5.28e-06, + "loss": 1.4916, + "step": 2112 + }, + { + "epoch": 0.02642566064151604, + "grad_norm": 3.946685552597046, + "learning_rate": 5.285e-06, + "loss": 0.8528, + "step": 2114 + }, + { + "epoch": 0.02645066126653166, + "grad_norm": 4.990451335906982, + "learning_rate": 5.290000000000001e-06, + "loss": 1.0666, + "step": 2116 + }, + { + "epoch": 0.02647566189154729, + "grad_norm": 4.360968112945557, + "learning_rate": 5.295e-06, + "loss": 1.2182, + "step": 2118 + }, + { + "epoch": 0.026500662516562915, + "grad_norm": 6.694272994995117, + "learning_rate": 5.300000000000001e-06, + "loss": 0.14, + "step": 2120 + }, + { + "epoch": 0.02652566314157854, + "grad_norm": 6.103180885314941, + "learning_rate": 5.305e-06, + "loss": 1.6706, + "step": 2122 + }, + { + "epoch": 0.026550663766594165, + "grad_norm": 6.777509689331055, + "learning_rate": 5.310000000000001e-06, + "loss": 1.1468, + "step": 2124 + }, + { + "epoch": 0.02657566439160979, + "grad_norm": 4.821200847625732, + "learning_rate": 5.315e-06, + "loss": 0.9905, + "step": 2126 + }, + { + "epoch": 0.026600665016625415, + "grad_norm": 5.798837184906006, + "learning_rate": 5.320000000000001e-06, + "loss": 2.1109, + "step": 2128 + }, + { + "epoch": 0.026625665641641042, + "grad_norm": 5.2463274002075195, + "learning_rate": 5.325e-06, + "loss": 0.8042, + "step": 2130 + }, + { + "epoch": 0.026650666266656665, + "grad_norm": 4.901463031768799, + "learning_rate": 5.330000000000001e-06, + "loss": 1.0389, + "step": 2132 + }, + { + "epoch": 0.026675666891672292, + "grad_norm": 6.472564697265625, + "learning_rate": 5.335000000000001e-06, + "loss": 1.0946, + "step": 2134 + }, + { + "epoch": 0.026700667516687916, + "grad_norm": 7.759603500366211, + "learning_rate": 5.3400000000000005e-06, + "loss": 2.0892, + "step": 2136 + }, + { + "epoch": 0.026725668141703542, + "grad_norm": 3.4049911499023438, + "learning_rate": 5.3450000000000005e-06, + "loss": 1.0054, + "step": 2138 + }, + { + "epoch": 0.02675066876671917, + "grad_norm": 4.674206733703613, + "learning_rate": 5.3500000000000004e-06, + "loss": 0.552, + "step": 2140 + }, + { + "epoch": 0.026775669391734792, + "grad_norm": 9.121920585632324, + "learning_rate": 5.355e-06, + "loss": 1.9816, + "step": 2142 + }, + { + "epoch": 0.02680067001675042, + "grad_norm": 5.535032749176025, + "learning_rate": 5.36e-06, + "loss": 0.7323, + "step": 2144 + }, + { + "epoch": 0.026825670641766042, + "grad_norm": 3.231968641281128, + "learning_rate": 5.365e-06, + "loss": 0.9972, + "step": 2146 + }, + { + "epoch": 0.02685067126678167, + "grad_norm": 0.04954713210463524, + "learning_rate": 5.370000000000001e-06, + "loss": 0.9383, + "step": 2148 + }, + { + "epoch": 0.026875671891797296, + "grad_norm": 8.32050609588623, + "learning_rate": 5.375e-06, + "loss": 2.625, + "step": 2150 + }, + { + "epoch": 0.02690067251681292, + "grad_norm": 3.4185352325439453, + "learning_rate": 5.380000000000001e-06, + "loss": 0.843, + "step": 2152 + }, + { + "epoch": 0.026925673141828546, + "grad_norm": 10.27455997467041, + "learning_rate": 5.385e-06, + "loss": 0.8078, + "step": 2154 + }, + { + "epoch": 0.02695067376684417, + "grad_norm": 0.15812020003795624, + "learning_rate": 5.390000000000001e-06, + "loss": 0.0134, + "step": 2156 + }, + { + "epoch": 0.026975674391859796, + "grad_norm": 3.000312089920044, + "learning_rate": 5.395e-06, + "loss": 0.4025, + "step": 2158 + }, + { + "epoch": 0.027000675016875423, + "grad_norm": 5.952627658843994, + "learning_rate": 5.400000000000001e-06, + "loss": 1.2348, + "step": 2160 + }, + { + "epoch": 0.027025675641891046, + "grad_norm": 0.03933969885110855, + "learning_rate": 5.405e-06, + "loss": 0.5278, + "step": 2162 + }, + { + "epoch": 0.027050676266906673, + "grad_norm": 0.7225239872932434, + "learning_rate": 5.410000000000001e-06, + "loss": 0.2235, + "step": 2164 + }, + { + "epoch": 0.027075676891922296, + "grad_norm": 4.1895856857299805, + "learning_rate": 5.415000000000001e-06, + "loss": 0.0969, + "step": 2166 + }, + { + "epoch": 0.027100677516937923, + "grad_norm": 8.278556823730469, + "learning_rate": 5.420000000000001e-06, + "loss": 1.7954, + "step": 2168 + }, + { + "epoch": 0.02712567814195355, + "grad_norm": 7.107349872589111, + "learning_rate": 5.4250000000000006e-06, + "loss": 1.7241, + "step": 2170 + }, + { + "epoch": 0.027150678766969173, + "grad_norm": 8.848883628845215, + "learning_rate": 5.4300000000000005e-06, + "loss": 2.1696, + "step": 2172 + }, + { + "epoch": 0.0271756793919848, + "grad_norm": 4.311103820800781, + "learning_rate": 5.4350000000000005e-06, + "loss": 1.6275, + "step": 2174 + }, + { + "epoch": 0.027200680017000423, + "grad_norm": 3.7365317344665527, + "learning_rate": 5.4400000000000004e-06, + "loss": 0.5409, + "step": 2176 + }, + { + "epoch": 0.02722568064201605, + "grad_norm": 8.499143600463867, + "learning_rate": 5.445e-06, + "loss": 1.6131, + "step": 2178 + }, + { + "epoch": 0.027250681267031677, + "grad_norm": 5.0548906326293945, + "learning_rate": 5.450000000000001e-06, + "loss": 1.2088, + "step": 2180 + }, + { + "epoch": 0.0272756818920473, + "grad_norm": 5.285542011260986, + "learning_rate": 5.455e-06, + "loss": 0.8103, + "step": 2182 + }, + { + "epoch": 0.027300682517062927, + "grad_norm": 4.882510185241699, + "learning_rate": 5.460000000000001e-06, + "loss": 0.9154, + "step": 2184 + }, + { + "epoch": 0.02732568314207855, + "grad_norm": 16.13987922668457, + "learning_rate": 5.465e-06, + "loss": 0.8004, + "step": 2186 + }, + { + "epoch": 0.027350683767094177, + "grad_norm": 4.188985824584961, + "learning_rate": 5.470000000000001e-06, + "loss": 0.2445, + "step": 2188 + }, + { + "epoch": 0.027375684392109804, + "grad_norm": 3.8194949626922607, + "learning_rate": 5.475e-06, + "loss": 1.06, + "step": 2190 + }, + { + "epoch": 0.027400685017125427, + "grad_norm": 6.821078300476074, + "learning_rate": 5.480000000000001e-06, + "loss": 0.9286, + "step": 2192 + }, + { + "epoch": 0.027425685642141054, + "grad_norm": 0.0921761691570282, + "learning_rate": 5.485e-06, + "loss": 0.0007, + "step": 2194 + }, + { + "epoch": 0.02745068626715668, + "grad_norm": 4.501262664794922, + "learning_rate": 5.490000000000001e-06, + "loss": 1.1299, + "step": 2196 + }, + { + "epoch": 0.027475686892172304, + "grad_norm": 3.5581271648406982, + "learning_rate": 5.495000000000001e-06, + "loss": 1.0867, + "step": 2198 + }, + { + "epoch": 0.02750068751718793, + "grad_norm": 5.33985710144043, + "learning_rate": 5.500000000000001e-06, + "loss": 1.7376, + "step": 2200 + }, + { + "epoch": 0.027525688142203554, + "grad_norm": 1.6471614837646484, + "learning_rate": 5.505000000000001e-06, + "loss": 1.0648, + "step": 2202 + }, + { + "epoch": 0.02755068876721918, + "grad_norm": 5.4546709060668945, + "learning_rate": 5.510000000000001e-06, + "loss": 0.4454, + "step": 2204 + }, + { + "epoch": 0.027575689392234808, + "grad_norm": 19.664087295532227, + "learning_rate": 5.5150000000000006e-06, + "loss": 1.0344, + "step": 2206 + }, + { + "epoch": 0.02760069001725043, + "grad_norm": 5.715367317199707, + "learning_rate": 5.5200000000000005e-06, + "loss": 0.8404, + "step": 2208 + }, + { + "epoch": 0.027625690642266058, + "grad_norm": 5.5654377937316895, + "learning_rate": 5.5250000000000005e-06, + "loss": 1.9351, + "step": 2210 + }, + { + "epoch": 0.02765069126728168, + "grad_norm": 4.666059494018555, + "learning_rate": 5.530000000000001e-06, + "loss": 0.8882, + "step": 2212 + }, + { + "epoch": 0.027675691892297308, + "grad_norm": 4.684323787689209, + "learning_rate": 5.535e-06, + "loss": 0.9132, + "step": 2214 + }, + { + "epoch": 0.027700692517312935, + "grad_norm": 16.057464599609375, + "learning_rate": 5.540000000000001e-06, + "loss": 1.4413, + "step": 2216 + }, + { + "epoch": 0.027725693142328558, + "grad_norm": 0.27155429124832153, + "learning_rate": 5.545e-06, + "loss": 0.5072, + "step": 2218 + }, + { + "epoch": 0.027750693767344185, + "grad_norm": 11.454776763916016, + "learning_rate": 5.550000000000001e-06, + "loss": 1.075, + "step": 2220 + }, + { + "epoch": 0.027775694392359808, + "grad_norm": 6.096161365509033, + "learning_rate": 5.555e-06, + "loss": 0.278, + "step": 2222 + }, + { + "epoch": 0.027800695017375435, + "grad_norm": 3.5517168045043945, + "learning_rate": 5.560000000000001e-06, + "loss": 1.0995, + "step": 2224 + }, + { + "epoch": 0.02782569564239106, + "grad_norm": 10.152116775512695, + "learning_rate": 5.565e-06, + "loss": 0.9194, + "step": 2226 + }, + { + "epoch": 0.027850696267406685, + "grad_norm": 3.8360707759857178, + "learning_rate": 5.570000000000001e-06, + "loss": 0.609, + "step": 2228 + }, + { + "epoch": 0.02787569689242231, + "grad_norm": 4.1753082275390625, + "learning_rate": 5.575000000000001e-06, + "loss": 1.4194, + "step": 2230 + }, + { + "epoch": 0.027900697517437935, + "grad_norm": 8.640860557556152, + "learning_rate": 5.580000000000001e-06, + "loss": 1.0523, + "step": 2232 + }, + { + "epoch": 0.02792569814245356, + "grad_norm": 8.430807113647461, + "learning_rate": 5.585000000000001e-06, + "loss": 1.2719, + "step": 2234 + }, + { + "epoch": 0.02795069876746919, + "grad_norm": 7.717026233673096, + "learning_rate": 5.590000000000001e-06, + "loss": 0.525, + "step": 2236 + }, + { + "epoch": 0.02797569939248481, + "grad_norm": 4.275600910186768, + "learning_rate": 5.595000000000001e-06, + "loss": 0.8755, + "step": 2238 + }, + { + "epoch": 0.02800070001750044, + "grad_norm": 6.3469438552856445, + "learning_rate": 5.600000000000001e-06, + "loss": 1.4235, + "step": 2240 + }, + { + "epoch": 0.028025700642516062, + "grad_norm": 0.6414796710014343, + "learning_rate": 5.6050000000000005e-06, + "loss": 0.0069, + "step": 2242 + }, + { + "epoch": 0.02805070126753169, + "grad_norm": 4.465810775756836, + "learning_rate": 5.610000000000001e-06, + "loss": 0.8178, + "step": 2244 + }, + { + "epoch": 0.028075701892547315, + "grad_norm": 3.456735849380493, + "learning_rate": 5.6150000000000005e-06, + "loss": 0.208, + "step": 2246 + }, + { + "epoch": 0.02810070251756294, + "grad_norm": 3.6327226161956787, + "learning_rate": 5.620000000000001e-06, + "loss": 2.2941, + "step": 2248 + }, + { + "epoch": 0.028125703142578565, + "grad_norm": 4.474798202514648, + "learning_rate": 5.625e-06, + "loss": 0.6801, + "step": 2250 + }, + { + "epoch": 0.02815070376759419, + "grad_norm": 6.238933563232422, + "learning_rate": 5.63e-06, + "loss": 1.7026, + "step": 2252 + }, + { + "epoch": 0.028175704392609816, + "grad_norm": 5.636078357696533, + "learning_rate": 5.635e-06, + "loss": 1.6064, + "step": 2254 + }, + { + "epoch": 0.028200705017625442, + "grad_norm": 6.921169757843018, + "learning_rate": 5.64e-06, + "loss": 0.6299, + "step": 2256 + }, + { + "epoch": 0.028225705642641066, + "grad_norm": 6.471092700958252, + "learning_rate": 5.645e-06, + "loss": 1.6961, + "step": 2258 + }, + { + "epoch": 0.028250706267656692, + "grad_norm": 3.305781364440918, + "learning_rate": 5.65e-06, + "loss": 0.8453, + "step": 2260 + }, + { + "epoch": 0.028275706892672316, + "grad_norm": 4.17460823059082, + "learning_rate": 5.655e-06, + "loss": 1.0784, + "step": 2262 + }, + { + "epoch": 0.028300707517687942, + "grad_norm": 11.989883422851562, + "learning_rate": 5.66e-06, + "loss": 1.2761, + "step": 2264 + }, + { + "epoch": 0.02832570814270357, + "grad_norm": 3.245103120803833, + "learning_rate": 5.665000000000001e-06, + "loss": 0.8431, + "step": 2266 + }, + { + "epoch": 0.028350708767719193, + "grad_norm": 10.453003883361816, + "learning_rate": 5.67e-06, + "loss": 1.2213, + "step": 2268 + }, + { + "epoch": 0.02837570939273482, + "grad_norm": 5.607085704803467, + "learning_rate": 5.675000000000001e-06, + "loss": 0.9761, + "step": 2270 + }, + { + "epoch": 0.028400710017750443, + "grad_norm": 4.158308029174805, + "learning_rate": 5.68e-06, + "loss": 0.6439, + "step": 2272 + }, + { + "epoch": 0.02842571064276607, + "grad_norm": 4.805854320526123, + "learning_rate": 5.685000000000001e-06, + "loss": 1.0865, + "step": 2274 + }, + { + "epoch": 0.028450711267781696, + "grad_norm": 3.8572998046875, + "learning_rate": 5.69e-06, + "loss": 1.2085, + "step": 2276 + }, + { + "epoch": 0.02847571189279732, + "grad_norm": 0.2261519879102707, + "learning_rate": 5.6950000000000005e-06, + "loss": 0.263, + "step": 2278 + }, + { + "epoch": 0.028500712517812946, + "grad_norm": 5.243706703186035, + "learning_rate": 5.7e-06, + "loss": 1.3681, + "step": 2280 + }, + { + "epoch": 0.02852571314282857, + "grad_norm": 6.366832256317139, + "learning_rate": 5.7050000000000004e-06, + "loss": 0.6719, + "step": 2282 + }, + { + "epoch": 0.028550713767844196, + "grad_norm": 4.4748029708862305, + "learning_rate": 5.71e-06, + "loss": 0.7813, + "step": 2284 + }, + { + "epoch": 0.028575714392859823, + "grad_norm": 9.471955299377441, + "learning_rate": 5.715e-06, + "loss": 2.483, + "step": 2286 + }, + { + "epoch": 0.028600715017875446, + "grad_norm": 5.899610996246338, + "learning_rate": 5.72e-06, + "loss": 1.043, + "step": 2288 + }, + { + "epoch": 0.028625715642891073, + "grad_norm": 5.636706352233887, + "learning_rate": 5.725e-06, + "loss": 2.163, + "step": 2290 + }, + { + "epoch": 0.028650716267906697, + "grad_norm": 10.387925148010254, + "learning_rate": 5.73e-06, + "loss": 2.0899, + "step": 2292 + }, + { + "epoch": 0.028675716892922323, + "grad_norm": 3.9123189449310303, + "learning_rate": 5.735e-06, + "loss": 0.7559, + "step": 2294 + }, + { + "epoch": 0.02870071751793795, + "grad_norm": 7.1627349853515625, + "learning_rate": 5.74e-06, + "loss": 2.4827, + "step": 2296 + }, + { + "epoch": 0.028725718142953573, + "grad_norm": 5.864351272583008, + "learning_rate": 5.745000000000001e-06, + "loss": 0.1901, + "step": 2298 + }, + { + "epoch": 0.0287507187679692, + "grad_norm": 4.512864589691162, + "learning_rate": 5.75e-06, + "loss": 1.6996, + "step": 2300 + }, + { + "epoch": 0.028775719392984823, + "grad_norm": 4.856689453125, + "learning_rate": 5.755000000000001e-06, + "loss": 0.7754, + "step": 2302 + }, + { + "epoch": 0.02880072001800045, + "grad_norm": 3.05476450920105, + "learning_rate": 5.76e-06, + "loss": 0.7172, + "step": 2304 + }, + { + "epoch": 0.028825720643016077, + "grad_norm": 5.0839667320251465, + "learning_rate": 5.765000000000001e-06, + "loss": 1.2751, + "step": 2306 + }, + { + "epoch": 0.0288507212680317, + "grad_norm": 8.98061752319336, + "learning_rate": 5.77e-06, + "loss": 1.5915, + "step": 2308 + }, + { + "epoch": 0.028875721893047327, + "grad_norm": 10.795464515686035, + "learning_rate": 5.775000000000001e-06, + "loss": 1.4323, + "step": 2310 + }, + { + "epoch": 0.02890072251806295, + "grad_norm": 2.7692549228668213, + "learning_rate": 5.78e-06, + "loss": 0.3136, + "step": 2312 + }, + { + "epoch": 0.028925723143078577, + "grad_norm": 5.351258754730225, + "learning_rate": 5.7850000000000005e-06, + "loss": 0.8963, + "step": 2314 + }, + { + "epoch": 0.028950723768094204, + "grad_norm": 8.406754493713379, + "learning_rate": 5.7900000000000005e-06, + "loss": 1.9606, + "step": 2316 + }, + { + "epoch": 0.028975724393109827, + "grad_norm": 4.999108791351318, + "learning_rate": 5.795e-06, + "loss": 1.1157, + "step": 2318 + }, + { + "epoch": 0.029000725018125454, + "grad_norm": 9.518075942993164, + "learning_rate": 5.8e-06, + "loss": 1.7686, + "step": 2320 + }, + { + "epoch": 0.029025725643141077, + "grad_norm": 6.753932952880859, + "learning_rate": 5.805e-06, + "loss": 1.3628, + "step": 2322 + }, + { + "epoch": 0.029050726268156704, + "grad_norm": 5.2788591384887695, + "learning_rate": 5.81e-06, + "loss": 0.9041, + "step": 2324 + }, + { + "epoch": 0.02907572689317233, + "grad_norm": 1.2340563535690308, + "learning_rate": 5.815e-06, + "loss": 0.2034, + "step": 2326 + }, + { + "epoch": 0.029100727518187954, + "grad_norm": 3.6275014877319336, + "learning_rate": 5.82e-06, + "loss": 1.4798, + "step": 2328 + }, + { + "epoch": 0.02912572814320358, + "grad_norm": 13.993865013122559, + "learning_rate": 5.825000000000001e-06, + "loss": 0.7736, + "step": 2330 + }, + { + "epoch": 0.029150728768219204, + "grad_norm": 0.052005164325237274, + "learning_rate": 5.83e-06, + "loss": 1.108, + "step": 2332 + }, + { + "epoch": 0.02917572939323483, + "grad_norm": 2.693632125854492, + "learning_rate": 5.835000000000001e-06, + "loss": 1.0546, + "step": 2334 + }, + { + "epoch": 0.029200730018250458, + "grad_norm": 7.233316421508789, + "learning_rate": 5.84e-06, + "loss": 1.8614, + "step": 2336 + }, + { + "epoch": 0.02922573064326608, + "grad_norm": 4.0142951011657715, + "learning_rate": 5.845000000000001e-06, + "loss": 1.2623, + "step": 2338 + }, + { + "epoch": 0.029250731268281708, + "grad_norm": 3.904695749282837, + "learning_rate": 5.85e-06, + "loss": 0.8432, + "step": 2340 + }, + { + "epoch": 0.02927573189329733, + "grad_norm": 1.1383227109909058, + "learning_rate": 5.855000000000001e-06, + "loss": 0.126, + "step": 2342 + }, + { + "epoch": 0.029300732518312958, + "grad_norm": 3.8363537788391113, + "learning_rate": 5.86e-06, + "loss": 0.6731, + "step": 2344 + }, + { + "epoch": 0.029325733143328585, + "grad_norm": 4.413968563079834, + "learning_rate": 5.865000000000001e-06, + "loss": 0.9989, + "step": 2346 + }, + { + "epoch": 0.029350733768344208, + "grad_norm": 3.0730628967285156, + "learning_rate": 5.8700000000000005e-06, + "loss": 0.4398, + "step": 2348 + }, + { + "epoch": 0.029375734393359835, + "grad_norm": 8.357575416564941, + "learning_rate": 5.8750000000000005e-06, + "loss": 0.8683, + "step": 2350 + }, + { + "epoch": 0.029400735018375458, + "grad_norm": 9.158495903015137, + "learning_rate": 5.8800000000000005e-06, + "loss": 1.4435, + "step": 2352 + }, + { + "epoch": 0.029425735643391085, + "grad_norm": 10.568608283996582, + "learning_rate": 5.885e-06, + "loss": 1.8231, + "step": 2354 + }, + { + "epoch": 0.02945073626840671, + "grad_norm": 3.8882126808166504, + "learning_rate": 5.89e-06, + "loss": 1.1327, + "step": 2356 + }, + { + "epoch": 0.029475736893422335, + "grad_norm": 0.16841398179531097, + "learning_rate": 5.895e-06, + "loss": 0.7595, + "step": 2358 + }, + { + "epoch": 0.029500737518437962, + "grad_norm": 4.190512180328369, + "learning_rate": 5.9e-06, + "loss": 0.9014, + "step": 2360 + }, + { + "epoch": 0.029525738143453585, + "grad_norm": 0.22429198026657104, + "learning_rate": 5.905000000000001e-06, + "loss": 1.0426, + "step": 2362 + }, + { + "epoch": 0.029550738768469212, + "grad_norm": 0.7323446273803711, + "learning_rate": 5.91e-06, + "loss": 1.3779, + "step": 2364 + }, + { + "epoch": 0.02957573939348484, + "grad_norm": 4.235392093658447, + "learning_rate": 5.915000000000001e-06, + "loss": 0.6514, + "step": 2366 + }, + { + "epoch": 0.029600740018500462, + "grad_norm": 3.8192131519317627, + "learning_rate": 5.92e-06, + "loss": 0.719, + "step": 2368 + }, + { + "epoch": 0.02962574064351609, + "grad_norm": 5.825703144073486, + "learning_rate": 5.925000000000001e-06, + "loss": 0.3287, + "step": 2370 + }, + { + "epoch": 0.029650741268531712, + "grad_norm": 6.584100723266602, + "learning_rate": 5.93e-06, + "loss": 1.3403, + "step": 2372 + }, + { + "epoch": 0.02967574189354734, + "grad_norm": 8.471826553344727, + "learning_rate": 5.935000000000001e-06, + "loss": 0.5076, + "step": 2374 + }, + { + "epoch": 0.029700742518562966, + "grad_norm": 0.30985331535339355, + "learning_rate": 5.94e-06, + "loss": 0.7696, + "step": 2376 + }, + { + "epoch": 0.02972574314357859, + "grad_norm": 2.876852512359619, + "learning_rate": 5.945000000000001e-06, + "loss": 0.2492, + "step": 2378 + }, + { + "epoch": 0.029750743768594216, + "grad_norm": 6.576855659484863, + "learning_rate": 5.950000000000001e-06, + "loss": 0.2731, + "step": 2380 + }, + { + "epoch": 0.02977574439360984, + "grad_norm": 7.288631916046143, + "learning_rate": 5.955000000000001e-06, + "loss": 0.3539, + "step": 2382 + }, + { + "epoch": 0.029800745018625466, + "grad_norm": 0.03666597232222557, + "learning_rate": 5.9600000000000005e-06, + "loss": 1.1401, + "step": 2384 + }, + { + "epoch": 0.029825745643641093, + "grad_norm": 6.053106784820557, + "learning_rate": 5.9650000000000005e-06, + "loss": 0.869, + "step": 2386 + }, + { + "epoch": 0.029850746268656716, + "grad_norm": 3.956155300140381, + "learning_rate": 5.9700000000000004e-06, + "loss": 0.8385, + "step": 2388 + }, + { + "epoch": 0.029875746893672343, + "grad_norm": 8.785298347473145, + "learning_rate": 5.975e-06, + "loss": 0.5926, + "step": 2390 + }, + { + "epoch": 0.029900747518687966, + "grad_norm": 4.294055938720703, + "learning_rate": 5.98e-06, + "loss": 1.6949, + "step": 2392 + }, + { + "epoch": 0.029925748143703593, + "grad_norm": 4.248189449310303, + "learning_rate": 5.985000000000001e-06, + "loss": 0.9238, + "step": 2394 + }, + { + "epoch": 0.02995074876871922, + "grad_norm": 9.797625541687012, + "learning_rate": 5.99e-06, + "loss": 1.9539, + "step": 2396 + }, + { + "epoch": 0.029975749393734843, + "grad_norm": 3.241093635559082, + "learning_rate": 5.995000000000001e-06, + "loss": 1.1553, + "step": 2398 + }, + { + "epoch": 0.03000075001875047, + "grad_norm": 3.72636079788208, + "learning_rate": 6e-06, + "loss": 1.3025, + "step": 2400 + }, + { + "epoch": 0.030025750643766093, + "grad_norm": 7.281135559082031, + "learning_rate": 6.005000000000001e-06, + "loss": 1.5605, + "step": 2402 + }, + { + "epoch": 0.03005075126878172, + "grad_norm": 5.149214267730713, + "learning_rate": 6.01e-06, + "loss": 1.0218, + "step": 2404 + }, + { + "epoch": 0.030075751893797346, + "grad_norm": 3.9751505851745605, + "learning_rate": 6.015000000000001e-06, + "loss": 1.2631, + "step": 2406 + }, + { + "epoch": 0.03010075251881297, + "grad_norm": 4.235726356506348, + "learning_rate": 6.02e-06, + "loss": 1.8793, + "step": 2408 + }, + { + "epoch": 0.030125753143828597, + "grad_norm": 0.3110506534576416, + "learning_rate": 6.025000000000001e-06, + "loss": 0.0021, + "step": 2410 + }, + { + "epoch": 0.03015075376884422, + "grad_norm": 5.348988056182861, + "learning_rate": 6.030000000000001e-06, + "loss": 1.0821, + "step": 2412 + }, + { + "epoch": 0.030175754393859847, + "grad_norm": 12.525580406188965, + "learning_rate": 6.035000000000001e-06, + "loss": 0.6795, + "step": 2414 + }, + { + "epoch": 0.030200755018875473, + "grad_norm": 5.484158039093018, + "learning_rate": 6.040000000000001e-06, + "loss": 1.3373, + "step": 2416 + }, + { + "epoch": 0.030225755643891097, + "grad_norm": 5.681865692138672, + "learning_rate": 6.0450000000000006e-06, + "loss": 1.3632, + "step": 2418 + }, + { + "epoch": 0.030250756268906723, + "grad_norm": 0.023452112451195717, + "learning_rate": 6.0500000000000005e-06, + "loss": 0.0007, + "step": 2420 + }, + { + "epoch": 0.030275756893922347, + "grad_norm": 10.985302925109863, + "learning_rate": 6.0550000000000005e-06, + "loss": 0.9052, + "step": 2422 + }, + { + "epoch": 0.030300757518937974, + "grad_norm": 0.02472861483693123, + "learning_rate": 6.0600000000000004e-06, + "loss": 0.6602, + "step": 2424 + }, + { + "epoch": 0.0303257581439536, + "grad_norm": 16.6547908782959, + "learning_rate": 6.065000000000001e-06, + "loss": 1.8035, + "step": 2426 + }, + { + "epoch": 0.030350758768969224, + "grad_norm": 3.0191874504089355, + "learning_rate": 6.07e-06, + "loss": 0.1472, + "step": 2428 + }, + { + "epoch": 0.03037575939398485, + "grad_norm": 0.042866624891757965, + "learning_rate": 6.075000000000001e-06, + "loss": 0.3842, + "step": 2430 + }, + { + "epoch": 0.030400760019000474, + "grad_norm": 4.505115509033203, + "learning_rate": 6.08e-06, + "loss": 0.8946, + "step": 2432 + }, + { + "epoch": 0.0304257606440161, + "grad_norm": 5.291789531707764, + "learning_rate": 6.085000000000001e-06, + "loss": 1.344, + "step": 2434 + }, + { + "epoch": 0.030450761269031727, + "grad_norm": 7.880792617797852, + "learning_rate": 6.09e-06, + "loss": 1.9575, + "step": 2436 + }, + { + "epoch": 0.03047576189404735, + "grad_norm": 3.7092607021331787, + "learning_rate": 6.095000000000001e-06, + "loss": 0.5946, + "step": 2438 + }, + { + "epoch": 0.030500762519062977, + "grad_norm": 11.078755378723145, + "learning_rate": 6.1e-06, + "loss": 1.3182, + "step": 2440 + }, + { + "epoch": 0.0305257631440786, + "grad_norm": 6.928043842315674, + "learning_rate": 6.105000000000001e-06, + "loss": 0.6459, + "step": 2442 + }, + { + "epoch": 0.030550763769094227, + "grad_norm": 12.467026710510254, + "learning_rate": 6.110000000000001e-06, + "loss": 0.5759, + "step": 2444 + }, + { + "epoch": 0.030575764394109854, + "grad_norm": 7.340199947357178, + "learning_rate": 6.115000000000001e-06, + "loss": 0.2798, + "step": 2446 + }, + { + "epoch": 0.030600765019125478, + "grad_norm": 3.4943902492523193, + "learning_rate": 6.120000000000001e-06, + "loss": 1.3135, + "step": 2448 + }, + { + "epoch": 0.030625765644141104, + "grad_norm": 8.287038803100586, + "learning_rate": 6.125000000000001e-06, + "loss": 1.2177, + "step": 2450 + }, + { + "epoch": 0.030650766269156728, + "grad_norm": 9.297810554504395, + "learning_rate": 6.130000000000001e-06, + "loss": 0.9463, + "step": 2452 + }, + { + "epoch": 0.030675766894172354, + "grad_norm": 6.060208320617676, + "learning_rate": 6.1350000000000006e-06, + "loss": 1.1161, + "step": 2454 + }, + { + "epoch": 0.03070076751918798, + "grad_norm": 26.001070022583008, + "learning_rate": 6.1400000000000005e-06, + "loss": 0.2381, + "step": 2456 + }, + { + "epoch": 0.030725768144203604, + "grad_norm": 4.437205791473389, + "learning_rate": 6.145000000000001e-06, + "loss": 0.3449, + "step": 2458 + }, + { + "epoch": 0.03075076876921923, + "grad_norm": 0.13883619010448456, + "learning_rate": 6.15e-06, + "loss": 0.9196, + "step": 2460 + }, + { + "epoch": 0.030775769394234855, + "grad_norm": 8.112532615661621, + "learning_rate": 6.155000000000001e-06, + "loss": 1.4519, + "step": 2462 + }, + { + "epoch": 0.03080077001925048, + "grad_norm": 7.462016582489014, + "learning_rate": 6.16e-06, + "loss": 1.2783, + "step": 2464 + }, + { + "epoch": 0.030825770644266108, + "grad_norm": 7.516724109649658, + "learning_rate": 6.165000000000001e-06, + "loss": 0.9495, + "step": 2466 + }, + { + "epoch": 0.03085077126928173, + "grad_norm": 4.6826043128967285, + "learning_rate": 6.17e-06, + "loss": 0.7415, + "step": 2468 + }, + { + "epoch": 0.030875771894297358, + "grad_norm": 5.466147422790527, + "learning_rate": 6.175000000000001e-06, + "loss": 0.4621, + "step": 2470 + }, + { + "epoch": 0.03090077251931298, + "grad_norm": 5.346714019775391, + "learning_rate": 6.18e-06, + "loss": 1.3445, + "step": 2472 + }, + { + "epoch": 0.030925773144328608, + "grad_norm": 0.11633971333503723, + "learning_rate": 6.185000000000001e-06, + "loss": 0.0457, + "step": 2474 + }, + { + "epoch": 0.030950773769344235, + "grad_norm": 4.22144889831543, + "learning_rate": 6.190000000000001e-06, + "loss": 1.0138, + "step": 2476 + }, + { + "epoch": 0.03097577439435986, + "grad_norm": 4.582967758178711, + "learning_rate": 6.195000000000001e-06, + "loss": 1.4842, + "step": 2478 + }, + { + "epoch": 0.031000775019375485, + "grad_norm": 2.51350474357605, + "learning_rate": 6.200000000000001e-06, + "loss": 0.513, + "step": 2480 + }, + { + "epoch": 0.03102577564439111, + "grad_norm": 4.016600608825684, + "learning_rate": 6.205000000000001e-06, + "loss": 1.4541, + "step": 2482 + }, + { + "epoch": 0.031050776269406735, + "grad_norm": 0.022885000333189964, + "learning_rate": 6.210000000000001e-06, + "loss": 0.4493, + "step": 2484 + }, + { + "epoch": 0.031075776894422362, + "grad_norm": 2.980998992919922, + "learning_rate": 6.215000000000001e-06, + "loss": 0.6628, + "step": 2486 + }, + { + "epoch": 0.031100777519437985, + "grad_norm": 5.5346760749816895, + "learning_rate": 6.220000000000001e-06, + "loss": 1.629, + "step": 2488 + }, + { + "epoch": 0.031125778144453612, + "grad_norm": 8.248862266540527, + "learning_rate": 6.225000000000001e-06, + "loss": 0.3893, + "step": 2490 + }, + { + "epoch": 0.031150778769469235, + "grad_norm": 4.027735233306885, + "learning_rate": 6.2300000000000005e-06, + "loss": 0.3704, + "step": 2492 + }, + { + "epoch": 0.031175779394484862, + "grad_norm": 3.2482030391693115, + "learning_rate": 6.235000000000001e-06, + "loss": 1.1907, + "step": 2494 + }, + { + "epoch": 0.03120078001950049, + "grad_norm": 5.064302444458008, + "learning_rate": 6.24e-06, + "loss": 0.9176, + "step": 2496 + }, + { + "epoch": 0.031225780644516112, + "grad_norm": 10.997283935546875, + "learning_rate": 6.245000000000001e-06, + "loss": 0.4796, + "step": 2498 + }, + { + "epoch": 0.031250781269531736, + "grad_norm": 2.588047504425049, + "learning_rate": 6.25e-06, + "loss": 0.9449, + "step": 2500 + }, + { + "epoch": 0.03127578189454736, + "grad_norm": 3.4472458362579346, + "learning_rate": 6.255e-06, + "loss": 0.772, + "step": 2502 + }, + { + "epoch": 0.03130078251956299, + "grad_norm": 2.7451562881469727, + "learning_rate": 6.26e-06, + "loss": 0.062, + "step": 2504 + }, + { + "epoch": 0.031325783144578616, + "grad_norm": 5.221157550811768, + "learning_rate": 6.265e-06, + "loss": 0.4356, + "step": 2506 + }, + { + "epoch": 0.03135078376959424, + "grad_norm": 12.827893257141113, + "learning_rate": 6.27e-06, + "loss": 1.1739, + "step": 2508 + }, + { + "epoch": 0.03137578439460986, + "grad_norm": 2.891601800918579, + "learning_rate": 6.275e-06, + "loss": 0.9366, + "step": 2510 + }, + { + "epoch": 0.03140078501962549, + "grad_norm": 5.1522135734558105, + "learning_rate": 6.280000000000001e-06, + "loss": 1.2048, + "step": 2512 + }, + { + "epoch": 0.031425785644641116, + "grad_norm": 5.263957500457764, + "learning_rate": 6.285e-06, + "loss": 0.2948, + "step": 2514 + }, + { + "epoch": 0.03145078626965674, + "grad_norm": 4.557277679443359, + "learning_rate": 6.290000000000001e-06, + "loss": 1.6266, + "step": 2516 + }, + { + "epoch": 0.03147578689467237, + "grad_norm": 9.105693817138672, + "learning_rate": 6.295e-06, + "loss": 1.2869, + "step": 2518 + }, + { + "epoch": 0.03150078751968799, + "grad_norm": 7.184720039367676, + "learning_rate": 6.300000000000001e-06, + "loss": 0.2952, + "step": 2520 + }, + { + "epoch": 0.031525788144703616, + "grad_norm": 53.161041259765625, + "learning_rate": 6.305e-06, + "loss": 2.2574, + "step": 2522 + }, + { + "epoch": 0.03155078876971924, + "grad_norm": 10.601176261901855, + "learning_rate": 6.3100000000000006e-06, + "loss": 1.2868, + "step": 2524 + }, + { + "epoch": 0.03157578939473487, + "grad_norm": 9.227924346923828, + "learning_rate": 6.315e-06, + "loss": 0.6411, + "step": 2526 + }, + { + "epoch": 0.031600790019750497, + "grad_norm": 10.339025497436523, + "learning_rate": 6.3200000000000005e-06, + "loss": 0.5186, + "step": 2528 + }, + { + "epoch": 0.031625790644766116, + "grad_norm": 4.939643383026123, + "learning_rate": 6.3250000000000004e-06, + "loss": 1.9799, + "step": 2530 + }, + { + "epoch": 0.03165079126978174, + "grad_norm": 3.153491258621216, + "learning_rate": 6.33e-06, + "loss": 0.1834, + "step": 2532 + }, + { + "epoch": 0.03167579189479737, + "grad_norm": 4.147980213165283, + "learning_rate": 6.335e-06, + "loss": 0.5444, + "step": 2534 + }, + { + "epoch": 0.031700792519813, + "grad_norm": 6.9137349128723145, + "learning_rate": 6.34e-06, + "loss": 1.7475, + "step": 2536 + }, + { + "epoch": 0.03172579314482862, + "grad_norm": 5.190703392028809, + "learning_rate": 6.345e-06, + "loss": 0.9891, + "step": 2538 + }, + { + "epoch": 0.03175079376984424, + "grad_norm": 6.255692005157471, + "learning_rate": 6.35e-06, + "loss": 0.6417, + "step": 2540 + }, + { + "epoch": 0.03177579439485987, + "grad_norm": 12.341623306274414, + "learning_rate": 6.355e-06, + "loss": 1.0168, + "step": 2542 + }, + { + "epoch": 0.0318007950198755, + "grad_norm": 4.253443717956543, + "learning_rate": 6.360000000000001e-06, + "loss": 0.8109, + "step": 2544 + }, + { + "epoch": 0.031825795644891124, + "grad_norm": 7.253037929534912, + "learning_rate": 6.365e-06, + "loss": 1.3927, + "step": 2546 + }, + { + "epoch": 0.03185079626990675, + "grad_norm": 7.883599758148193, + "learning_rate": 6.370000000000001e-06, + "loss": 0.3701, + "step": 2548 + }, + { + "epoch": 0.03187579689492237, + "grad_norm": 8.865267753601074, + "learning_rate": 6.375e-06, + "loss": 1.3392, + "step": 2550 + }, + { + "epoch": 0.031900797519938, + "grad_norm": 6.526254653930664, + "learning_rate": 6.380000000000001e-06, + "loss": 0.8804, + "step": 2552 + }, + { + "epoch": 0.031925798144953624, + "grad_norm": 0.038441527634859085, + "learning_rate": 6.385e-06, + "loss": 0.7753, + "step": 2554 + }, + { + "epoch": 0.03195079876996925, + "grad_norm": 4.247732162475586, + "learning_rate": 6.390000000000001e-06, + "loss": 0.9191, + "step": 2556 + }, + { + "epoch": 0.03197579939498488, + "grad_norm": 6.576589107513428, + "learning_rate": 6.395e-06, + "loss": 0.5795, + "step": 2558 + }, + { + "epoch": 0.0320008000200005, + "grad_norm": 11.15737533569336, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.9716, + "step": 2560 + }, + { + "epoch": 0.032025800645016124, + "grad_norm": 7.941177845001221, + "learning_rate": 6.4050000000000005e-06, + "loss": 0.8124, + "step": 2562 + }, + { + "epoch": 0.03205080127003175, + "grad_norm": 6.498569488525391, + "learning_rate": 6.4100000000000005e-06, + "loss": 2.8092, + "step": 2564 + }, + { + "epoch": 0.03207580189504738, + "grad_norm": 3.206751823425293, + "learning_rate": 6.415e-06, + "loss": 0.5281, + "step": 2566 + }, + { + "epoch": 0.032100802520063004, + "grad_norm": 0.6988910436630249, + "learning_rate": 6.42e-06, + "loss": 0.2953, + "step": 2568 + }, + { + "epoch": 0.032125803145078624, + "grad_norm": 6.090380668640137, + "learning_rate": 6.425e-06, + "loss": 0.3564, + "step": 2570 + }, + { + "epoch": 0.03215080377009425, + "grad_norm": 5.788209915161133, + "learning_rate": 6.43e-06, + "loss": 1.9636, + "step": 2572 + }, + { + "epoch": 0.03217580439510988, + "grad_norm": 0.01147556770592928, + "learning_rate": 6.435e-06, + "loss": 0.3521, + "step": 2574 + }, + { + "epoch": 0.032200805020125504, + "grad_norm": 11.426464080810547, + "learning_rate": 6.440000000000001e-06, + "loss": 0.2885, + "step": 2576 + }, + { + "epoch": 0.03222580564514113, + "grad_norm": 4.505030632019043, + "learning_rate": 6.445e-06, + "loss": 1.2963, + "step": 2578 + }, + { + "epoch": 0.03225080627015675, + "grad_norm": 4.920653820037842, + "learning_rate": 6.450000000000001e-06, + "loss": 0.7505, + "step": 2580 + }, + { + "epoch": 0.03227580689517238, + "grad_norm": 6.057285785675049, + "learning_rate": 6.455e-06, + "loss": 0.7958, + "step": 2582 + }, + { + "epoch": 0.032300807520188005, + "grad_norm": 6.884401798248291, + "learning_rate": 6.460000000000001e-06, + "loss": 0.319, + "step": 2584 + }, + { + "epoch": 0.03232580814520363, + "grad_norm": 0.004592082463204861, + "learning_rate": 6.465e-06, + "loss": 0.0029, + "step": 2586 + }, + { + "epoch": 0.03235080877021926, + "grad_norm": 6.064556121826172, + "learning_rate": 6.470000000000001e-06, + "loss": 0.2844, + "step": 2588 + }, + { + "epoch": 0.03237580939523488, + "grad_norm": 12.709674835205078, + "learning_rate": 6.475e-06, + "loss": 1.3676, + "step": 2590 + }, + { + "epoch": 0.032400810020250505, + "grad_norm": 5.655536651611328, + "learning_rate": 6.480000000000001e-06, + "loss": 1.9006, + "step": 2592 + }, + { + "epoch": 0.03242581064526613, + "grad_norm": 8.687150955200195, + "learning_rate": 6.485000000000001e-06, + "loss": 1.0535, + "step": 2594 + }, + { + "epoch": 0.03245081127028176, + "grad_norm": 0.5481643676757812, + "learning_rate": 6.4900000000000005e-06, + "loss": 0.4948, + "step": 2596 + }, + { + "epoch": 0.032475811895297385, + "grad_norm": 7.712373733520508, + "learning_rate": 6.4950000000000005e-06, + "loss": 0.9442, + "step": 2598 + }, + { + "epoch": 0.032500812520313005, + "grad_norm": 0.011831271462142467, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.0534, + "step": 2600 + }, + { + "epoch": 0.03252581314532863, + "grad_norm": 4.6645588874816895, + "learning_rate": 6.505e-06, + "loss": 1.566, + "step": 2602 + }, + { + "epoch": 0.03255081377034426, + "grad_norm": 9.526412963867188, + "learning_rate": 6.51e-06, + "loss": 1.3658, + "step": 2604 + }, + { + "epoch": 0.032575814395359885, + "grad_norm": 3.4479153156280518, + "learning_rate": 6.515e-06, + "loss": 1.548, + "step": 2606 + }, + { + "epoch": 0.03260081502037551, + "grad_norm": 3.363678216934204, + "learning_rate": 6.520000000000001e-06, + "loss": 0.7245, + "step": 2608 + }, + { + "epoch": 0.03262581564539113, + "grad_norm": 16.802295684814453, + "learning_rate": 6.525e-06, + "loss": 2.4912, + "step": 2610 + }, + { + "epoch": 0.03265081627040676, + "grad_norm": 3.1246726512908936, + "learning_rate": 6.530000000000001e-06, + "loss": 0.166, + "step": 2612 + }, + { + "epoch": 0.032675816895422385, + "grad_norm": 8.144326210021973, + "learning_rate": 6.535e-06, + "loss": 1.248, + "step": 2614 + }, + { + "epoch": 0.03270081752043801, + "grad_norm": 5.606773853302002, + "learning_rate": 6.540000000000001e-06, + "loss": 0.8174, + "step": 2616 + }, + { + "epoch": 0.03272581814545364, + "grad_norm": 6.52263879776001, + "learning_rate": 6.545e-06, + "loss": 2.0074, + "step": 2618 + }, + { + "epoch": 0.03275081877046926, + "grad_norm": 5.085580348968506, + "learning_rate": 6.550000000000001e-06, + "loss": 0.0651, + "step": 2620 + }, + { + "epoch": 0.032775819395484886, + "grad_norm": 6.398718357086182, + "learning_rate": 6.555e-06, + "loss": 0.4126, + "step": 2622 + }, + { + "epoch": 0.03280082002050051, + "grad_norm": 4.680537223815918, + "learning_rate": 6.560000000000001e-06, + "loss": 0.2459, + "step": 2624 + }, + { + "epoch": 0.03282582064551614, + "grad_norm": 6.770750045776367, + "learning_rate": 6.565000000000001e-06, + "loss": 0.841, + "step": 2626 + }, + { + "epoch": 0.032850821270531766, + "grad_norm": 0.020891103893518448, + "learning_rate": 6.570000000000001e-06, + "loss": 0.6268, + "step": 2628 + }, + { + "epoch": 0.032875821895547386, + "grad_norm": 12.632043838500977, + "learning_rate": 6.5750000000000006e-06, + "loss": 1.3502, + "step": 2630 + }, + { + "epoch": 0.03290082252056301, + "grad_norm": 41.076786041259766, + "learning_rate": 6.5800000000000005e-06, + "loss": 1.1467, + "step": 2632 + }, + { + "epoch": 0.03292582314557864, + "grad_norm": 8.26309585571289, + "learning_rate": 6.5850000000000005e-06, + "loss": 0.7228, + "step": 2634 + }, + { + "epoch": 0.032950823770594266, + "grad_norm": 8.217491149902344, + "learning_rate": 6.5900000000000004e-06, + "loss": 1.1643, + "step": 2636 + }, + { + "epoch": 0.03297582439560989, + "grad_norm": 4.41325569152832, + "learning_rate": 6.595e-06, + "loss": 0.4396, + "step": 2638 + }, + { + "epoch": 0.03300082502062551, + "grad_norm": 7.234860897064209, + "learning_rate": 6.600000000000001e-06, + "loss": 1.0945, + "step": 2640 + }, + { + "epoch": 0.03302582564564114, + "grad_norm": 6.35221004486084, + "learning_rate": 6.605e-06, + "loss": 1.2343, + "step": 2642 + }, + { + "epoch": 0.033050826270656766, + "grad_norm": 3.84236216545105, + "learning_rate": 6.610000000000001e-06, + "loss": 0.7797, + "step": 2644 + }, + { + "epoch": 0.03307582689567239, + "grad_norm": 0.028526922687888145, + "learning_rate": 6.615e-06, + "loss": 0.8441, + "step": 2646 + }, + { + "epoch": 0.03310082752068802, + "grad_norm": 4.007584095001221, + "learning_rate": 6.620000000000001e-06, + "loss": 0.3482, + "step": 2648 + }, + { + "epoch": 0.03312582814570364, + "grad_norm": 4.573972702026367, + "learning_rate": 6.625e-06, + "loss": 0.3261, + "step": 2650 + }, + { + "epoch": 0.033150828770719266, + "grad_norm": 6.677540302276611, + "learning_rate": 6.630000000000001e-06, + "loss": 1.5584, + "step": 2652 + }, + { + "epoch": 0.03317582939573489, + "grad_norm": 8.114127159118652, + "learning_rate": 6.635e-06, + "loss": 2.6039, + "step": 2654 + }, + { + "epoch": 0.03320083002075052, + "grad_norm": 5.318251609802246, + "learning_rate": 6.640000000000001e-06, + "loss": 0.3432, + "step": 2656 + }, + { + "epoch": 0.03322583064576615, + "grad_norm": 7.024054527282715, + "learning_rate": 6.645000000000001e-06, + "loss": 0.6906, + "step": 2658 + }, + { + "epoch": 0.03325083127078177, + "grad_norm": 0.026516510173678398, + "learning_rate": 6.650000000000001e-06, + "loss": 0.0003, + "step": 2660 + }, + { + "epoch": 0.03327583189579739, + "grad_norm": 0.04151776805520058, + "learning_rate": 6.655000000000001e-06, + "loss": 0.918, + "step": 2662 + }, + { + "epoch": 0.03330083252081302, + "grad_norm": 6.160072326660156, + "learning_rate": 6.660000000000001e-06, + "loss": 1.7258, + "step": 2664 + }, + { + "epoch": 0.03332583314582865, + "grad_norm": 4.479306697845459, + "learning_rate": 6.6650000000000006e-06, + "loss": 1.0749, + "step": 2666 + }, + { + "epoch": 0.033350833770844274, + "grad_norm": 3.8900797367095947, + "learning_rate": 6.6700000000000005e-06, + "loss": 0.9448, + "step": 2668 + }, + { + "epoch": 0.033375834395859894, + "grad_norm": 5.310650825500488, + "learning_rate": 6.6750000000000005e-06, + "loss": 0.5137, + "step": 2670 + }, + { + "epoch": 0.03340083502087552, + "grad_norm": 3.437760591506958, + "learning_rate": 6.680000000000001e-06, + "loss": 1.0435, + "step": 2672 + }, + { + "epoch": 0.03342583564589115, + "grad_norm": 3.8973138332366943, + "learning_rate": 6.685e-06, + "loss": 1.0834, + "step": 2674 + }, + { + "epoch": 0.033450836270906774, + "grad_norm": 12.792060852050781, + "learning_rate": 6.690000000000001e-06, + "loss": 0.2749, + "step": 2676 + }, + { + "epoch": 0.0334758368959224, + "grad_norm": 4.996336936950684, + "learning_rate": 6.695e-06, + "loss": 1.3888, + "step": 2678 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.773564875125885, + "learning_rate": 6.700000000000001e-06, + "loss": 0.3204, + "step": 2680 + }, + { + "epoch": 0.03352583814595365, + "grad_norm": 1.7255170345306396, + "learning_rate": 6.705e-06, + "loss": 0.3931, + "step": 2682 + }, + { + "epoch": 0.033550838770969274, + "grad_norm": 1.4148492813110352, + "learning_rate": 6.710000000000001e-06, + "loss": 0.4433, + "step": 2684 + }, + { + "epoch": 0.0335758393959849, + "grad_norm": 5.338280200958252, + "learning_rate": 6.715e-06, + "loss": 1.2712, + "step": 2686 + }, + { + "epoch": 0.03360084002100053, + "grad_norm": 3.2887823581695557, + "learning_rate": 6.720000000000001e-06, + "loss": 1.2403, + "step": 2688 + }, + { + "epoch": 0.03362584064601615, + "grad_norm": 15.576343536376953, + "learning_rate": 6.725000000000001e-06, + "loss": 1.2548, + "step": 2690 + }, + { + "epoch": 0.033650841271031774, + "grad_norm": 6.611833572387695, + "learning_rate": 6.730000000000001e-06, + "loss": 1.4474, + "step": 2692 + }, + { + "epoch": 0.0336758418960474, + "grad_norm": 4.930817127227783, + "learning_rate": 6.735000000000001e-06, + "loss": 1.2457, + "step": 2694 + }, + { + "epoch": 0.03370084252106303, + "grad_norm": 6.38845157623291, + "learning_rate": 6.740000000000001e-06, + "loss": 0.4852, + "step": 2696 + }, + { + "epoch": 0.033725843146078655, + "grad_norm": 4.707619667053223, + "learning_rate": 6.745000000000001e-06, + "loss": 0.9517, + "step": 2698 + }, + { + "epoch": 0.033750843771094274, + "grad_norm": 1.1827305555343628, + "learning_rate": 6.750000000000001e-06, + "loss": 0.1648, + "step": 2700 + }, + { + "epoch": 0.0337758443961099, + "grad_norm": 3.911485433578491, + "learning_rate": 6.7550000000000005e-06, + "loss": 0.6918, + "step": 2702 + }, + { + "epoch": 0.03380084502112553, + "grad_norm": 7.5196967124938965, + "learning_rate": 6.760000000000001e-06, + "loss": 1.5078, + "step": 2704 + }, + { + "epoch": 0.033825845646141155, + "grad_norm": 5.664214134216309, + "learning_rate": 6.7650000000000005e-06, + "loss": 0.9487, + "step": 2706 + }, + { + "epoch": 0.03385084627115678, + "grad_norm": 0.20362648367881775, + "learning_rate": 6.770000000000001e-06, + "loss": 0.8347, + "step": 2708 + }, + { + "epoch": 0.0338758468961724, + "grad_norm": 4.751779556274414, + "learning_rate": 6.775e-06, + "loss": 2.0469, + "step": 2710 + }, + { + "epoch": 0.03390084752118803, + "grad_norm": 4.293074607849121, + "learning_rate": 6.780000000000001e-06, + "loss": 0.2919, + "step": 2712 + }, + { + "epoch": 0.033925848146203655, + "grad_norm": 6.152959823608398, + "learning_rate": 6.785e-06, + "loss": 0.8181, + "step": 2714 + }, + { + "epoch": 0.03395084877121928, + "grad_norm": 0.12863671779632568, + "learning_rate": 6.790000000000001e-06, + "loss": 0.8152, + "step": 2716 + }, + { + "epoch": 0.03397584939623491, + "grad_norm": 4.148372173309326, + "learning_rate": 6.795e-06, + "loss": 1.6642, + "step": 2718 + }, + { + "epoch": 0.03400085002125053, + "grad_norm": 5.00449275970459, + "learning_rate": 6.800000000000001e-06, + "loss": 0.7862, + "step": 2720 + }, + { + "epoch": 0.034025850646266155, + "grad_norm": 0.05570381134748459, + "learning_rate": 6.805000000000001e-06, + "loss": 0.1882, + "step": 2722 + }, + { + "epoch": 0.03405085127128178, + "grad_norm": 2.7990055084228516, + "learning_rate": 6.810000000000001e-06, + "loss": 1.4135, + "step": 2724 + }, + { + "epoch": 0.03407585189629741, + "grad_norm": 11.376603126525879, + "learning_rate": 6.815000000000001e-06, + "loss": 0.5181, + "step": 2726 + }, + { + "epoch": 0.034100852521313035, + "grad_norm": 5.934099197387695, + "learning_rate": 6.820000000000001e-06, + "loss": 1.7763, + "step": 2728 + }, + { + "epoch": 0.034125853146328655, + "grad_norm": 7.437645435333252, + "learning_rate": 6.825000000000001e-06, + "loss": 0.8785, + "step": 2730 + }, + { + "epoch": 0.03415085377134428, + "grad_norm": 11.254440307617188, + "learning_rate": 6.830000000000001e-06, + "loss": 0.9358, + "step": 2732 + }, + { + "epoch": 0.03417585439635991, + "grad_norm": 0.17300310730934143, + "learning_rate": 6.835000000000001e-06, + "loss": 0.8953, + "step": 2734 + }, + { + "epoch": 0.034200855021375536, + "grad_norm": 5.934886455535889, + "learning_rate": 6.8400000000000014e-06, + "loss": 0.5002, + "step": 2736 + }, + { + "epoch": 0.03422585564639116, + "grad_norm": 3.043666362762451, + "learning_rate": 6.8450000000000005e-06, + "loss": 0.3556, + "step": 2738 + }, + { + "epoch": 0.03425085627140678, + "grad_norm": 6.189864635467529, + "learning_rate": 6.850000000000001e-06, + "loss": 1.214, + "step": 2740 + }, + { + "epoch": 0.03427585689642241, + "grad_norm": 6.139181613922119, + "learning_rate": 6.8550000000000004e-06, + "loss": 0.6134, + "step": 2742 + }, + { + "epoch": 0.034300857521438036, + "grad_norm": 0.03840470314025879, + "learning_rate": 6.860000000000001e-06, + "loss": 0.0007, + "step": 2744 + }, + { + "epoch": 0.03432585814645366, + "grad_norm": 3.5640811920166016, + "learning_rate": 6.865e-06, + "loss": 1.0157, + "step": 2746 + }, + { + "epoch": 0.03435085877146929, + "grad_norm": 3.202876567840576, + "learning_rate": 6.870000000000001e-06, + "loss": 0.5992, + "step": 2748 + }, + { + "epoch": 0.03437585939648491, + "grad_norm": 3.4599058628082275, + "learning_rate": 6.875e-06, + "loss": 1.031, + "step": 2750 + }, + { + "epoch": 0.034400860021500536, + "grad_norm": 4.990593910217285, + "learning_rate": 6.88e-06, + "loss": 1.3524, + "step": 2752 + }, + { + "epoch": 0.03442586064651616, + "grad_norm": 4.264657974243164, + "learning_rate": 6.885e-06, + "loss": 0.1708, + "step": 2754 + }, + { + "epoch": 0.03445086127153179, + "grad_norm": 4.766145706176758, + "learning_rate": 6.89e-06, + "loss": 0.8321, + "step": 2756 + }, + { + "epoch": 0.034475861896547416, + "grad_norm": 9.040043830871582, + "learning_rate": 6.895000000000001e-06, + "loss": 0.6294, + "step": 2758 + }, + { + "epoch": 0.034500862521563036, + "grad_norm": 8.887944221496582, + "learning_rate": 6.9e-06, + "loss": 1.1816, + "step": 2760 + }, + { + "epoch": 0.03452586314657866, + "grad_norm": 5.017195701599121, + "learning_rate": 6.905000000000001e-06, + "loss": 1.5461, + "step": 2762 + }, + { + "epoch": 0.03455086377159429, + "grad_norm": 5.02701473236084, + "learning_rate": 6.91e-06, + "loss": 1.2359, + "step": 2764 + }, + { + "epoch": 0.034575864396609916, + "grad_norm": 0.021266615018248558, + "learning_rate": 6.915000000000001e-06, + "loss": 0.0025, + "step": 2766 + }, + { + "epoch": 0.03460086502162554, + "grad_norm": 16.775798797607422, + "learning_rate": 6.92e-06, + "loss": 1.3353, + "step": 2768 + }, + { + "epoch": 0.03462586564664116, + "grad_norm": 7.73445463180542, + "learning_rate": 6.925000000000001e-06, + "loss": 1.6702, + "step": 2770 + }, + { + "epoch": 0.03465086627165679, + "grad_norm": 7.804163455963135, + "learning_rate": 6.93e-06, + "loss": 1.4626, + "step": 2772 + }, + { + "epoch": 0.034675866896672417, + "grad_norm": 3.3420042991638184, + "learning_rate": 6.9350000000000005e-06, + "loss": 0.5248, + "step": 2774 + }, + { + "epoch": 0.03470086752168804, + "grad_norm": 14.682146072387695, + "learning_rate": 6.9400000000000005e-06, + "loss": 1.0457, + "step": 2776 + }, + { + "epoch": 0.03472586814670367, + "grad_norm": 4.563251972198486, + "learning_rate": 6.945e-06, + "loss": 0.9045, + "step": 2778 + }, + { + "epoch": 0.03475086877171929, + "grad_norm": 5.63018798828125, + "learning_rate": 6.95e-06, + "loss": 0.8334, + "step": 2780 + }, + { + "epoch": 0.03477586939673492, + "grad_norm": 6.586276054382324, + "learning_rate": 6.955e-06, + "loss": 1.335, + "step": 2782 + }, + { + "epoch": 0.03480087002175054, + "grad_norm": 4.763210773468018, + "learning_rate": 6.96e-06, + "loss": 2.0361, + "step": 2784 + }, + { + "epoch": 0.03482587064676617, + "grad_norm": 0.04200545325875282, + "learning_rate": 6.965e-06, + "loss": 0.4984, + "step": 2786 + }, + { + "epoch": 0.0348508712717818, + "grad_norm": 13.75141716003418, + "learning_rate": 6.97e-06, + "loss": 1.7427, + "step": 2788 + }, + { + "epoch": 0.03487587189679742, + "grad_norm": 5.66323709487915, + "learning_rate": 6.975000000000001e-06, + "loss": 1.4786, + "step": 2790 + }, + { + "epoch": 0.034900872521813044, + "grad_norm": 3.6241815090179443, + "learning_rate": 6.98e-06, + "loss": 1.5458, + "step": 2792 + }, + { + "epoch": 0.03492587314682867, + "grad_norm": 5.057778835296631, + "learning_rate": 6.985000000000001e-06, + "loss": 1.5742, + "step": 2794 + }, + { + "epoch": 0.0349508737718443, + "grad_norm": 2.93557071685791, + "learning_rate": 6.99e-06, + "loss": 0.7523, + "step": 2796 + }, + { + "epoch": 0.034975874396859924, + "grad_norm": 0.040501777082681656, + "learning_rate": 6.995000000000001e-06, + "loss": 0.0006, + "step": 2798 + }, + { + "epoch": 0.035000875021875544, + "grad_norm": 6.847982883453369, + "learning_rate": 7e-06, + "loss": 1.5512, + "step": 2800 + }, + { + "epoch": 0.03502587564689117, + "grad_norm": 9.861618995666504, + "learning_rate": 7.005000000000001e-06, + "loss": 1.6385, + "step": 2802 + }, + { + "epoch": 0.0350508762719068, + "grad_norm": 6.023784160614014, + "learning_rate": 7.01e-06, + "loss": 1.4752, + "step": 2804 + }, + { + "epoch": 0.035075876896922424, + "grad_norm": 6.4545578956604, + "learning_rate": 7.015000000000001e-06, + "loss": 0.3322, + "step": 2806 + }, + { + "epoch": 0.03510087752193805, + "grad_norm": 4.406044960021973, + "learning_rate": 7.0200000000000006e-06, + "loss": 1.5391, + "step": 2808 + }, + { + "epoch": 0.03512587814695367, + "grad_norm": 4.374566078186035, + "learning_rate": 7.0250000000000005e-06, + "loss": 0.9792, + "step": 2810 + }, + { + "epoch": 0.0351508787719693, + "grad_norm": 6.540809154510498, + "learning_rate": 7.0300000000000005e-06, + "loss": 1.8819, + "step": 2812 + }, + { + "epoch": 0.035175879396984924, + "grad_norm": 3.837221384048462, + "learning_rate": 7.035e-06, + "loss": 1.0567, + "step": 2814 + }, + { + "epoch": 0.03520088002200055, + "grad_norm": 4.344392776489258, + "learning_rate": 7.04e-06, + "loss": 0.7571, + "step": 2816 + }, + { + "epoch": 0.03522588064701618, + "grad_norm": 0.014932217076420784, + "learning_rate": 7.045e-06, + "loss": 0.5852, + "step": 2818 + }, + { + "epoch": 0.0352508812720318, + "grad_norm": 2.2536113262176514, + "learning_rate": 7.05e-06, + "loss": 1.0692, + "step": 2820 + }, + { + "epoch": 0.035275881897047424, + "grad_norm": 8.604109764099121, + "learning_rate": 7.055000000000001e-06, + "loss": 2.361, + "step": 2822 + }, + { + "epoch": 0.03530088252206305, + "grad_norm": 6.282094478607178, + "learning_rate": 7.06e-06, + "loss": 0.6559, + "step": 2824 + }, + { + "epoch": 0.03532588314707868, + "grad_norm": 4.906512260437012, + "learning_rate": 7.065000000000001e-06, + "loss": 1.0454, + "step": 2826 + }, + { + "epoch": 0.035350883772094305, + "grad_norm": 5.615875244140625, + "learning_rate": 7.07e-06, + "loss": 0.8451, + "step": 2828 + }, + { + "epoch": 0.035375884397109925, + "grad_norm": 4.234293460845947, + "learning_rate": 7.075000000000001e-06, + "loss": 0.9658, + "step": 2830 + }, + { + "epoch": 0.03540088502212555, + "grad_norm": 0.01459707971662283, + "learning_rate": 7.08e-06, + "loss": 0.0003, + "step": 2832 + }, + { + "epoch": 0.03542588564714118, + "grad_norm": 4.959712028503418, + "learning_rate": 7.085000000000001e-06, + "loss": 1.5673, + "step": 2834 + }, + { + "epoch": 0.035450886272156805, + "grad_norm": 1.562484622001648, + "learning_rate": 7.09e-06, + "loss": 0.5737, + "step": 2836 + }, + { + "epoch": 0.03547588689717243, + "grad_norm": 8.04096794128418, + "learning_rate": 7.095000000000001e-06, + "loss": 0.976, + "step": 2838 + }, + { + "epoch": 0.03550088752218805, + "grad_norm": 7.855259418487549, + "learning_rate": 7.100000000000001e-06, + "loss": 1.3652, + "step": 2840 + }, + { + "epoch": 0.03552588814720368, + "grad_norm": 8.733099937438965, + "learning_rate": 7.105000000000001e-06, + "loss": 2.071, + "step": 2842 + }, + { + "epoch": 0.035550888772219305, + "grad_norm": 5.674595355987549, + "learning_rate": 7.1100000000000005e-06, + "loss": 1.3029, + "step": 2844 + }, + { + "epoch": 0.03557588939723493, + "grad_norm": 6.5674848556518555, + "learning_rate": 7.1150000000000005e-06, + "loss": 1.4486, + "step": 2846 + }, + { + "epoch": 0.03560089002225056, + "grad_norm": 0.06636912375688553, + "learning_rate": 7.1200000000000004e-06, + "loss": 0.309, + "step": 2848 + }, + { + "epoch": 0.03562589064726618, + "grad_norm": 4.4237446784973145, + "learning_rate": 7.125e-06, + "loss": 1.7609, + "step": 2850 + }, + { + "epoch": 0.035650891272281805, + "grad_norm": 0.016361841931939125, + "learning_rate": 7.13e-06, + "loss": 1.0045, + "step": 2852 + }, + { + "epoch": 0.03567589189729743, + "grad_norm": 3.6763110160827637, + "learning_rate": 7.135000000000001e-06, + "loss": 0.452, + "step": 2854 + }, + { + "epoch": 0.03570089252231306, + "grad_norm": 2.8856966495513916, + "learning_rate": 7.14e-06, + "loss": 0.487, + "step": 2856 + }, + { + "epoch": 0.035725893147328686, + "grad_norm": 4.236402988433838, + "learning_rate": 7.145000000000001e-06, + "loss": 0.6795, + "step": 2858 + }, + { + "epoch": 0.035750893772344305, + "grad_norm": 4.155874729156494, + "learning_rate": 7.15e-06, + "loss": 1.0916, + "step": 2860 + }, + { + "epoch": 0.03577589439735993, + "grad_norm": 3.7652604579925537, + "learning_rate": 7.155000000000001e-06, + "loss": 1.2324, + "step": 2862 + }, + { + "epoch": 0.03580089502237556, + "grad_norm": 0.05281085893511772, + "learning_rate": 7.16e-06, + "loss": 0.6252, + "step": 2864 + }, + { + "epoch": 0.035825895647391186, + "grad_norm": 5.9304094314575195, + "learning_rate": 7.165000000000001e-06, + "loss": 1.9629, + "step": 2866 + }, + { + "epoch": 0.03585089627240681, + "grad_norm": 4.399301528930664, + "learning_rate": 7.17e-06, + "loss": 2.4613, + "step": 2868 + }, + { + "epoch": 0.03587589689742243, + "grad_norm": 5.857614517211914, + "learning_rate": 7.175000000000001e-06, + "loss": 0.811, + "step": 2870 + }, + { + "epoch": 0.03590089752243806, + "grad_norm": 0.026969073340296745, + "learning_rate": 7.180000000000001e-06, + "loss": 1.1204, + "step": 2872 + }, + { + "epoch": 0.035925898147453686, + "grad_norm": 4.559090614318848, + "learning_rate": 7.185000000000001e-06, + "loss": 1.0526, + "step": 2874 + }, + { + "epoch": 0.03595089877246931, + "grad_norm": 6.13804817199707, + "learning_rate": 7.190000000000001e-06, + "loss": 1.1185, + "step": 2876 + }, + { + "epoch": 0.03597589939748494, + "grad_norm": 12.288576126098633, + "learning_rate": 7.1950000000000006e-06, + "loss": 1.3562, + "step": 2878 + }, + { + "epoch": 0.03600090002250056, + "grad_norm": 5.746531009674072, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.332, + "step": 2880 + }, + { + "epoch": 0.036025900647516186, + "grad_norm": 3.956899642944336, + "learning_rate": 7.2050000000000005e-06, + "loss": 0.6442, + "step": 2882 + }, + { + "epoch": 0.03605090127253181, + "grad_norm": 0.022011060267686844, + "learning_rate": 7.2100000000000004e-06, + "loss": 0.7014, + "step": 2884 + }, + { + "epoch": 0.03607590189754744, + "grad_norm": 4.233859539031982, + "learning_rate": 7.215000000000001e-06, + "loss": 1.7416, + "step": 2886 + }, + { + "epoch": 0.036100902522563066, + "grad_norm": 0.8156386017799377, + "learning_rate": 7.22e-06, + "loss": 0.0237, + "step": 2888 + }, + { + "epoch": 0.036125903147578686, + "grad_norm": 5.019678592681885, + "learning_rate": 7.225000000000001e-06, + "loss": 1.3612, + "step": 2890 + }, + { + "epoch": 0.03615090377259431, + "grad_norm": 7.83676290512085, + "learning_rate": 7.23e-06, + "loss": 1.3717, + "step": 2892 + }, + { + "epoch": 0.03617590439760994, + "grad_norm": 4.322533130645752, + "learning_rate": 7.235000000000001e-06, + "loss": 1.4856, + "step": 2894 + }, + { + "epoch": 0.03620090502262557, + "grad_norm": 3.0753724575042725, + "learning_rate": 7.24e-06, + "loss": 0.0447, + "step": 2896 + }, + { + "epoch": 0.03622590564764119, + "grad_norm": 3.389192581176758, + "learning_rate": 7.245000000000001e-06, + "loss": 1.7366, + "step": 2898 + }, + { + "epoch": 0.03625090627265681, + "grad_norm": 4.072106838226318, + "learning_rate": 7.25e-06, + "loss": 0.9612, + "step": 2900 + }, + { + "epoch": 0.03627590689767244, + "grad_norm": 6.059340953826904, + "learning_rate": 7.255000000000001e-06, + "loss": 0.8813, + "step": 2902 + }, + { + "epoch": 0.03630090752268807, + "grad_norm": 4.328372955322266, + "learning_rate": 7.260000000000001e-06, + "loss": 0.8627, + "step": 2904 + }, + { + "epoch": 0.036325908147703694, + "grad_norm": 5.388106346130371, + "learning_rate": 7.265000000000001e-06, + "loss": 0.5552, + "step": 2906 + }, + { + "epoch": 0.03635090877271932, + "grad_norm": 0.21982748806476593, + "learning_rate": 7.270000000000001e-06, + "loss": 0.7044, + "step": 2908 + }, + { + "epoch": 0.03637590939773494, + "grad_norm": 12.954534530639648, + "learning_rate": 7.275000000000001e-06, + "loss": 1.4669, + "step": 2910 + }, + { + "epoch": 0.03640091002275057, + "grad_norm": 2.9420273303985596, + "learning_rate": 7.280000000000001e-06, + "loss": 1.2061, + "step": 2912 + }, + { + "epoch": 0.036425910647766194, + "grad_norm": 3.685250759124756, + "learning_rate": 7.2850000000000006e-06, + "loss": 1.1035, + "step": 2914 + }, + { + "epoch": 0.03645091127278182, + "grad_norm": 6.035146713256836, + "learning_rate": 7.2900000000000005e-06, + "loss": 1.1109, + "step": 2916 + }, + { + "epoch": 0.03647591189779745, + "grad_norm": 2.9242324829101562, + "learning_rate": 7.295000000000001e-06, + "loss": 0.859, + "step": 2918 + }, + { + "epoch": 0.03650091252281307, + "grad_norm": 4.717278480529785, + "learning_rate": 7.3e-06, + "loss": 0.2232, + "step": 2920 + }, + { + "epoch": 0.036525913147828694, + "grad_norm": 4.172697067260742, + "learning_rate": 7.305000000000001e-06, + "loss": 0.9086, + "step": 2922 + }, + { + "epoch": 0.03655091377284432, + "grad_norm": 8.582135200500488, + "learning_rate": 7.31e-06, + "loss": 0.1352, + "step": 2924 + }, + { + "epoch": 0.03657591439785995, + "grad_norm": 4.3623480796813965, + "learning_rate": 7.315000000000001e-06, + "loss": 0.9628, + "step": 2926 + }, + { + "epoch": 0.036600915022875574, + "grad_norm": 5.837160587310791, + "learning_rate": 7.32e-06, + "loss": 1.1176, + "step": 2928 + }, + { + "epoch": 0.036625915647891194, + "grad_norm": 6.281996726989746, + "learning_rate": 7.325000000000001e-06, + "loss": 0.5213, + "step": 2930 + }, + { + "epoch": 0.03665091627290682, + "grad_norm": 4.745972156524658, + "learning_rate": 7.33e-06, + "loss": 0.8126, + "step": 2932 + }, + { + "epoch": 0.03667591689792245, + "grad_norm": 4.878743648529053, + "learning_rate": 7.335000000000001e-06, + "loss": 0.9108, + "step": 2934 + }, + { + "epoch": 0.036700917522938074, + "grad_norm": 2.6627025604248047, + "learning_rate": 7.340000000000001e-06, + "loss": 0.1109, + "step": 2936 + }, + { + "epoch": 0.0367259181479537, + "grad_norm": 6.575930118560791, + "learning_rate": 7.345000000000001e-06, + "loss": 1.3095, + "step": 2938 + }, + { + "epoch": 0.03675091877296932, + "grad_norm": 0.39766925573349, + "learning_rate": 7.350000000000001e-06, + "loss": 1.048, + "step": 2940 + }, + { + "epoch": 0.03677591939798495, + "grad_norm": 3.042567014694214, + "learning_rate": 7.355000000000001e-06, + "loss": 1.4655, + "step": 2942 + }, + { + "epoch": 0.036800920023000575, + "grad_norm": 1.9676668643951416, + "learning_rate": 7.360000000000001e-06, + "loss": 1.304, + "step": 2944 + }, + { + "epoch": 0.0368259206480162, + "grad_norm": 6.257710933685303, + "learning_rate": 7.365000000000001e-06, + "loss": 2.1128, + "step": 2946 + }, + { + "epoch": 0.03685092127303183, + "grad_norm": 1.2944086790084839, + "learning_rate": 7.370000000000001e-06, + "loss": 0.3725, + "step": 2948 + }, + { + "epoch": 0.03687592189804745, + "grad_norm": 5.711312770843506, + "learning_rate": 7.375000000000001e-06, + "loss": 1.6197, + "step": 2950 + }, + { + "epoch": 0.036900922523063075, + "grad_norm": 2.81827974319458, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.7097, + "step": 2952 + }, + { + "epoch": 0.0369259231480787, + "grad_norm": 0.05580814182758331, + "learning_rate": 7.385000000000001e-06, + "loss": 1.2153, + "step": 2954 + }, + { + "epoch": 0.03695092377309433, + "grad_norm": 2.696845293045044, + "learning_rate": 7.39e-06, + "loss": 0.9865, + "step": 2956 + }, + { + "epoch": 0.036975924398109955, + "grad_norm": 0.08971463888883591, + "learning_rate": 7.395000000000001e-06, + "loss": 0.0265, + "step": 2958 + }, + { + "epoch": 0.037000925023125575, + "grad_norm": 5.531738758087158, + "learning_rate": 7.4e-06, + "loss": 0.5547, + "step": 2960 + }, + { + "epoch": 0.0370259256481412, + "grad_norm": 5.681268215179443, + "learning_rate": 7.405000000000001e-06, + "loss": 0.9103, + "step": 2962 + }, + { + "epoch": 0.03705092627315683, + "grad_norm": 3.2199721336364746, + "learning_rate": 7.41e-06, + "loss": 1.8267, + "step": 2964 + }, + { + "epoch": 0.037075926898172455, + "grad_norm": 3.7309446334838867, + "learning_rate": 7.415000000000001e-06, + "loss": 0.5434, + "step": 2966 + }, + { + "epoch": 0.03710092752318808, + "grad_norm": 2.0099685192108154, + "learning_rate": 7.420000000000001e-06, + "loss": 0.0417, + "step": 2968 + }, + { + "epoch": 0.0371259281482037, + "grad_norm": 4.707449436187744, + "learning_rate": 7.425000000000001e-06, + "loss": 1.3847, + "step": 2970 + }, + { + "epoch": 0.03715092877321933, + "grad_norm": 3.641953468322754, + "learning_rate": 7.430000000000001e-06, + "loss": 0.4942, + "step": 2972 + }, + { + "epoch": 0.037175929398234955, + "grad_norm": 7.7616448402404785, + "learning_rate": 7.435000000000001e-06, + "loss": 1.1816, + "step": 2974 + }, + { + "epoch": 0.03720093002325058, + "grad_norm": 1.614349126815796, + "learning_rate": 7.440000000000001e-06, + "loss": 0.9704, + "step": 2976 + }, + { + "epoch": 0.03722593064826621, + "grad_norm": 2.8974273204803467, + "learning_rate": 7.445000000000001e-06, + "loss": 0.2783, + "step": 2978 + }, + { + "epoch": 0.03725093127328183, + "grad_norm": 4.788438320159912, + "learning_rate": 7.450000000000001e-06, + "loss": 1.26, + "step": 2980 + }, + { + "epoch": 0.037275931898297456, + "grad_norm": 5.886223316192627, + "learning_rate": 7.4550000000000015e-06, + "loss": 1.0671, + "step": 2982 + }, + { + "epoch": 0.03730093252331308, + "grad_norm": 5.336127758026123, + "learning_rate": 7.4600000000000006e-06, + "loss": 0.9611, + "step": 2984 + }, + { + "epoch": 0.03732593314832871, + "grad_norm": 4.135542392730713, + "learning_rate": 7.465000000000001e-06, + "loss": 1.5144, + "step": 2986 + }, + { + "epoch": 0.037350933773344336, + "grad_norm": 4.94944429397583, + "learning_rate": 7.4700000000000005e-06, + "loss": 1.3451, + "step": 2988 + }, + { + "epoch": 0.037375934398359956, + "grad_norm": 4.214056491851807, + "learning_rate": 7.475000000000001e-06, + "loss": 1.7084, + "step": 2990 + }, + { + "epoch": 0.03740093502337558, + "grad_norm": 9.02291488647461, + "learning_rate": 7.48e-06, + "loss": 0.6492, + "step": 2992 + }, + { + "epoch": 0.03742593564839121, + "grad_norm": 4.999314308166504, + "learning_rate": 7.485000000000001e-06, + "loss": 1.5492, + "step": 2994 + }, + { + "epoch": 0.037450936273406836, + "grad_norm": 5.897023677825928, + "learning_rate": 7.49e-06, + "loss": 1.208, + "step": 2996 + }, + { + "epoch": 0.03747593689842246, + "grad_norm": 4.5441155433654785, + "learning_rate": 7.495000000000001e-06, + "loss": 0.8229, + "step": 2998 + }, + { + "epoch": 0.03750093752343808, + "grad_norm": 2.6825382709503174, + "learning_rate": 7.500000000000001e-06, + "loss": 1.3305, + "step": 3000 + }, + { + "epoch": 0.03752593814845371, + "grad_norm": 4.659523963928223, + "learning_rate": 7.505e-06, + "loss": 1.2384, + "step": 3002 + }, + { + "epoch": 0.037550938773469336, + "grad_norm": 3.52498722076416, + "learning_rate": 7.510000000000001e-06, + "loss": 1.4162, + "step": 3004 + }, + { + "epoch": 0.03757593939848496, + "grad_norm": 2.0715880393981934, + "learning_rate": 7.515e-06, + "loss": 1.3278, + "step": 3006 + }, + { + "epoch": 0.03760094002350059, + "grad_norm": 9.604924201965332, + "learning_rate": 7.520000000000001e-06, + "loss": 0.714, + "step": 3008 + }, + { + "epoch": 0.03762594064851621, + "grad_norm": 5.529014587402344, + "learning_rate": 7.525e-06, + "loss": 0.5975, + "step": 3010 + }, + { + "epoch": 0.037650941273531836, + "grad_norm": 5.690021514892578, + "learning_rate": 7.530000000000001e-06, + "loss": 0.9849, + "step": 3012 + }, + { + "epoch": 0.03767594189854746, + "grad_norm": 2.713961124420166, + "learning_rate": 7.535e-06, + "loss": 1.3199, + "step": 3014 + }, + { + "epoch": 0.03770094252356309, + "grad_norm": 7.209783554077148, + "learning_rate": 7.540000000000001e-06, + "loss": 1.0112, + "step": 3016 + }, + { + "epoch": 0.03772594314857872, + "grad_norm": 6.64795446395874, + "learning_rate": 7.545e-06, + "loss": 0.3179, + "step": 3018 + }, + { + "epoch": 0.037750943773594337, + "grad_norm": 0.005271784029901028, + "learning_rate": 7.5500000000000006e-06, + "loss": 1.0285, + "step": 3020 + }, + { + "epoch": 0.03777594439860996, + "grad_norm": 3.6539618968963623, + "learning_rate": 7.5550000000000005e-06, + "loss": 0.4298, + "step": 3022 + }, + { + "epoch": 0.03780094502362559, + "grad_norm": 8.54588508605957, + "learning_rate": 7.5600000000000005e-06, + "loss": 1.9668, + "step": 3024 + }, + { + "epoch": 0.03782594564864122, + "grad_norm": 0.0630137175321579, + "learning_rate": 7.565e-06, + "loss": 0.7789, + "step": 3026 + }, + { + "epoch": 0.037850946273656844, + "grad_norm": 0.6236500144004822, + "learning_rate": 7.57e-06, + "loss": 0.7799, + "step": 3028 + }, + { + "epoch": 0.03787594689867246, + "grad_norm": 4.770979881286621, + "learning_rate": 7.575e-06, + "loss": 1.5283, + "step": 3030 + }, + { + "epoch": 0.03790094752368809, + "grad_norm": 4.890214443206787, + "learning_rate": 7.58e-06, + "loss": 0.9487, + "step": 3032 + }, + { + "epoch": 0.03792594814870372, + "grad_norm": 0.14671552181243896, + "learning_rate": 7.585e-06, + "loss": 0.8367, + "step": 3034 + }, + { + "epoch": 0.037950948773719344, + "grad_norm": 0.012554190121591091, + "learning_rate": 7.590000000000001e-06, + "loss": 0.8284, + "step": 3036 + }, + { + "epoch": 0.03797594939873497, + "grad_norm": 5.1874308586120605, + "learning_rate": 7.595e-06, + "loss": 1.9699, + "step": 3038 + }, + { + "epoch": 0.03800095002375059, + "grad_norm": 5.3716959953308105, + "learning_rate": 7.600000000000001e-06, + "loss": 0.6899, + "step": 3040 + }, + { + "epoch": 0.03802595064876622, + "grad_norm": 6.829140663146973, + "learning_rate": 7.605e-06, + "loss": 1.834, + "step": 3042 + }, + { + "epoch": 0.038050951273781844, + "grad_norm": 4.608462810516357, + "learning_rate": 7.610000000000001e-06, + "loss": 1.7389, + "step": 3044 + }, + { + "epoch": 0.03807595189879747, + "grad_norm": 4.711689472198486, + "learning_rate": 7.615e-06, + "loss": 0.1864, + "step": 3046 + }, + { + "epoch": 0.0381009525238131, + "grad_norm": 8.308494567871094, + "learning_rate": 7.620000000000001e-06, + "loss": 0.3202, + "step": 3048 + }, + { + "epoch": 0.03812595314882872, + "grad_norm": 71.2898941040039, + "learning_rate": 7.625e-06, + "loss": 2.1335, + "step": 3050 + }, + { + "epoch": 0.038150953773844344, + "grad_norm": 2.4166176319122314, + "learning_rate": 7.630000000000001e-06, + "loss": 0.0828, + "step": 3052 + }, + { + "epoch": 0.03817595439885997, + "grad_norm": 4.786639213562012, + "learning_rate": 7.635e-06, + "loss": 0.7589, + "step": 3054 + }, + { + "epoch": 0.0382009550238756, + "grad_norm": 13.891875267028809, + "learning_rate": 7.640000000000001e-06, + "loss": 0.882, + "step": 3056 + }, + { + "epoch": 0.038225955648891224, + "grad_norm": 3.6730051040649414, + "learning_rate": 7.645e-06, + "loss": 1.2897, + "step": 3058 + }, + { + "epoch": 0.038250956273906844, + "grad_norm": 0.012468565255403519, + "learning_rate": 7.650000000000001e-06, + "loss": 0.0002, + "step": 3060 + }, + { + "epoch": 0.03827595689892247, + "grad_norm": 5.820043563842773, + "learning_rate": 7.655e-06, + "loss": 0.8864, + "step": 3062 + }, + { + "epoch": 0.0383009575239381, + "grad_norm": 4.924468994140625, + "learning_rate": 7.660000000000001e-06, + "loss": 1.2221, + "step": 3064 + }, + { + "epoch": 0.038325958148953725, + "grad_norm": 4.109971523284912, + "learning_rate": 7.665e-06, + "loss": 2.1839, + "step": 3066 + }, + { + "epoch": 0.03835095877396935, + "grad_norm": 20.914234161376953, + "learning_rate": 7.670000000000001e-06, + "loss": 1.0852, + "step": 3068 + }, + { + "epoch": 0.03837595939898497, + "grad_norm": 4.888038635253906, + "learning_rate": 7.675e-06, + "loss": 0.8628, + "step": 3070 + }, + { + "epoch": 0.0384009600240006, + "grad_norm": 5.6631999015808105, + "learning_rate": 7.680000000000001e-06, + "loss": 1.8933, + "step": 3072 + }, + { + "epoch": 0.038425960649016225, + "grad_norm": 4.7948198318481445, + "learning_rate": 7.685e-06, + "loss": 1.0834, + "step": 3074 + }, + { + "epoch": 0.03845096127403185, + "grad_norm": 3.6432442665100098, + "learning_rate": 7.690000000000001e-06, + "loss": 0.7626, + "step": 3076 + }, + { + "epoch": 0.03847596189904748, + "grad_norm": 5.138932228088379, + "learning_rate": 7.695e-06, + "loss": 0.424, + "step": 3078 + }, + { + "epoch": 0.0385009625240631, + "grad_norm": 3.433587074279785, + "learning_rate": 7.7e-06, + "loss": 0.6769, + "step": 3080 + }, + { + "epoch": 0.038525963149078725, + "grad_norm": 8.983166694641113, + "learning_rate": 7.705e-06, + "loss": 0.9857, + "step": 3082 + }, + { + "epoch": 0.03855096377409435, + "grad_norm": 3.1411032676696777, + "learning_rate": 7.71e-06, + "loss": 0.1408, + "step": 3084 + }, + { + "epoch": 0.03857596439910998, + "grad_norm": 5.6806111335754395, + "learning_rate": 7.715e-06, + "loss": 1.3939, + "step": 3086 + }, + { + "epoch": 0.038600965024125605, + "grad_norm": 8.368226051330566, + "learning_rate": 7.72e-06, + "loss": 0.8631, + "step": 3088 + }, + { + "epoch": 0.038625965649141225, + "grad_norm": 8.66380786895752, + "learning_rate": 7.725e-06, + "loss": 2.2565, + "step": 3090 + }, + { + "epoch": 0.03865096627415685, + "grad_norm": 5.127974033355713, + "learning_rate": 7.73e-06, + "loss": 0.735, + "step": 3092 + }, + { + "epoch": 0.03867596689917248, + "grad_norm": 4.571539878845215, + "learning_rate": 7.735e-06, + "loss": 0.3808, + "step": 3094 + }, + { + "epoch": 0.038700967524188105, + "grad_norm": 4.147434711456299, + "learning_rate": 7.74e-06, + "loss": 0.7473, + "step": 3096 + }, + { + "epoch": 0.03872596814920373, + "grad_norm": 0.43413490056991577, + "learning_rate": 7.745e-06, + "loss": 0.0445, + "step": 3098 + }, + { + "epoch": 0.03875096877421935, + "grad_norm": 1.5235369205474854, + "learning_rate": 7.75e-06, + "loss": 0.5235, + "step": 3100 + }, + { + "epoch": 0.03877596939923498, + "grad_norm": 4.498712062835693, + "learning_rate": 7.755000000000001e-06, + "loss": 0.9312, + "step": 3102 + }, + { + "epoch": 0.038800970024250606, + "grad_norm": 10.690627098083496, + "learning_rate": 7.76e-06, + "loss": 0.7095, + "step": 3104 + }, + { + "epoch": 0.03882597064926623, + "grad_norm": 2.941678524017334, + "learning_rate": 7.765000000000001e-06, + "loss": 1.5115, + "step": 3106 + }, + { + "epoch": 0.03885097127428186, + "grad_norm": 2.795644521713257, + "learning_rate": 7.77e-06, + "loss": 2.2712, + "step": 3108 + }, + { + "epoch": 0.03887597189929748, + "grad_norm": 4.479545593261719, + "learning_rate": 7.775000000000001e-06, + "loss": 1.2595, + "step": 3110 + }, + { + "epoch": 0.038900972524313106, + "grad_norm": 0.12481077015399933, + "learning_rate": 7.78e-06, + "loss": 0.3471, + "step": 3112 + }, + { + "epoch": 0.03892597314932873, + "grad_norm": 2.505723476409912, + "learning_rate": 7.785000000000001e-06, + "loss": 0.0989, + "step": 3114 + }, + { + "epoch": 0.03895097377434436, + "grad_norm": 3.3953206539154053, + "learning_rate": 7.790000000000002e-06, + "loss": 1.2723, + "step": 3116 + }, + { + "epoch": 0.038975974399359986, + "grad_norm": 4.498967170715332, + "learning_rate": 7.795e-06, + "loss": 1.3062, + "step": 3118 + }, + { + "epoch": 0.039000975024375606, + "grad_norm": 13.805456161499023, + "learning_rate": 7.800000000000002e-06, + "loss": 0.7654, + "step": 3120 + }, + { + "epoch": 0.03902597564939123, + "grad_norm": 5.627472877502441, + "learning_rate": 7.805e-06, + "loss": 0.247, + "step": 3122 + }, + { + "epoch": 0.03905097627440686, + "grad_norm": 3.042403221130371, + "learning_rate": 7.810000000000001e-06, + "loss": 1.006, + "step": 3124 + }, + { + "epoch": 0.039075976899422486, + "grad_norm": 6.646275520324707, + "learning_rate": 7.815e-06, + "loss": 0.7557, + "step": 3126 + }, + { + "epoch": 0.03910097752443811, + "grad_norm": 6.5288825035095215, + "learning_rate": 7.820000000000001e-06, + "loss": 0.6692, + "step": 3128 + }, + { + "epoch": 0.03912597814945374, + "grad_norm": 1.5890268087387085, + "learning_rate": 7.825e-06, + "loss": 0.0092, + "step": 3130 + }, + { + "epoch": 0.03915097877446936, + "grad_norm": 2.5704598426818848, + "learning_rate": 7.830000000000001e-06, + "loss": 0.0648, + "step": 3132 + }, + { + "epoch": 0.039175979399484986, + "grad_norm": 6.478513240814209, + "learning_rate": 7.835e-06, + "loss": 1.9754, + "step": 3134 + }, + { + "epoch": 0.03920098002450061, + "grad_norm": 4.210014343261719, + "learning_rate": 7.840000000000001e-06, + "loss": 1.5037, + "step": 3136 + }, + { + "epoch": 0.03922598064951624, + "grad_norm": 4.053020477294922, + "learning_rate": 7.845e-06, + "loss": 1.3147, + "step": 3138 + }, + { + "epoch": 0.03925098127453187, + "grad_norm": 2.411008834838867, + "learning_rate": 7.850000000000001e-06, + "loss": 0.2331, + "step": 3140 + }, + { + "epoch": 0.03927598189954749, + "grad_norm": 5.297018527984619, + "learning_rate": 7.855e-06, + "loss": 1.2256, + "step": 3142 + }, + { + "epoch": 0.03930098252456311, + "grad_norm": 4.443320274353027, + "learning_rate": 7.860000000000001e-06, + "loss": 0.1513, + "step": 3144 + }, + { + "epoch": 0.03932598314957874, + "grad_norm": 4.742912292480469, + "learning_rate": 7.865e-06, + "loss": 1.4568, + "step": 3146 + }, + { + "epoch": 0.03935098377459437, + "grad_norm": 0.24962304532527924, + "learning_rate": 7.870000000000001e-06, + "loss": 0.8339, + "step": 3148 + }, + { + "epoch": 0.039375984399609994, + "grad_norm": 4.0105366706848145, + "learning_rate": 7.875e-06, + "loss": 1.1744, + "step": 3150 + }, + { + "epoch": 0.039400985024625614, + "grad_norm": 0.004162886645644903, + "learning_rate": 7.88e-06, + "loss": 0.0031, + "step": 3152 + }, + { + "epoch": 0.03942598564964124, + "grad_norm": 3.8302602767944336, + "learning_rate": 7.885e-06, + "loss": 1.1275, + "step": 3154 + }, + { + "epoch": 0.03945098627465687, + "grad_norm": 5.25543212890625, + "learning_rate": 7.89e-06, + "loss": 0.8627, + "step": 3156 + }, + { + "epoch": 0.039475986899672494, + "grad_norm": 4.055466175079346, + "learning_rate": 7.895e-06, + "loss": 1.4553, + "step": 3158 + }, + { + "epoch": 0.03950098752468812, + "grad_norm": 0.030296772718429565, + "learning_rate": 7.9e-06, + "loss": 0.92, + "step": 3160 + }, + { + "epoch": 0.03952598814970374, + "grad_norm": 2.1424686908721924, + "learning_rate": 7.905e-06, + "loss": 0.7827, + "step": 3162 + }, + { + "epoch": 0.03955098877471937, + "grad_norm": 0.005567642394453287, + "learning_rate": 7.91e-06, + "loss": 0.2377, + "step": 3164 + }, + { + "epoch": 0.039575989399734994, + "grad_norm": 7.64077091217041, + "learning_rate": 7.915000000000001e-06, + "loss": 1.5926, + "step": 3166 + }, + { + "epoch": 0.03960099002475062, + "grad_norm": 3.691272497177124, + "learning_rate": 7.92e-06, + "loss": 0.5745, + "step": 3168 + }, + { + "epoch": 0.03962599064976625, + "grad_norm": 2.0251479148864746, + "learning_rate": 7.925000000000001e-06, + "loss": 0.4543, + "step": 3170 + }, + { + "epoch": 0.03965099127478187, + "grad_norm": 4.874791145324707, + "learning_rate": 7.93e-06, + "loss": 1.1473, + "step": 3172 + }, + { + "epoch": 0.039675991899797494, + "grad_norm": 0.027473000809550285, + "learning_rate": 7.935000000000001e-06, + "loss": 0.531, + "step": 3174 + }, + { + "epoch": 0.03970099252481312, + "grad_norm": 0.004184312652796507, + "learning_rate": 7.94e-06, + "loss": 0.1475, + "step": 3176 + }, + { + "epoch": 0.03972599314982875, + "grad_norm": 3.478414297103882, + "learning_rate": 7.945000000000001e-06, + "loss": 0.7527, + "step": 3178 + }, + { + "epoch": 0.039750993774844375, + "grad_norm": 4.370884895324707, + "learning_rate": 7.950000000000002e-06, + "loss": 1.6137, + "step": 3180 + }, + { + "epoch": 0.039775994399859994, + "grad_norm": 6.095135688781738, + "learning_rate": 7.955000000000001e-06, + "loss": 0.7543, + "step": 3182 + }, + { + "epoch": 0.03980099502487562, + "grad_norm": 5.089912414550781, + "learning_rate": 7.960000000000002e-06, + "loss": 0.9081, + "step": 3184 + }, + { + "epoch": 0.03982599564989125, + "grad_norm": 5.301812648773193, + "learning_rate": 7.965e-06, + "loss": 0.727, + "step": 3186 + }, + { + "epoch": 0.039850996274906875, + "grad_norm": 3.00443172454834, + "learning_rate": 7.970000000000002e-06, + "loss": 1.4278, + "step": 3188 + }, + { + "epoch": 0.0398759968999225, + "grad_norm": 19.833654403686523, + "learning_rate": 7.975e-06, + "loss": 1.591, + "step": 3190 + }, + { + "epoch": 0.03990099752493812, + "grad_norm": 3.9512417316436768, + "learning_rate": 7.980000000000002e-06, + "loss": 1.0295, + "step": 3192 + }, + { + "epoch": 0.03992599814995375, + "grad_norm": 2.0110037326812744, + "learning_rate": 7.985e-06, + "loss": 0.8705, + "step": 3194 + }, + { + "epoch": 0.039950998774969375, + "grad_norm": 0.009400258772075176, + "learning_rate": 7.990000000000001e-06, + "loss": 0.4625, + "step": 3196 + }, + { + "epoch": 0.039975999399985, + "grad_norm": 5.2756171226501465, + "learning_rate": 7.995e-06, + "loss": 1.0594, + "step": 3198 + }, + { + "epoch": 0.04000100002500063, + "grad_norm": 22.519563674926758, + "learning_rate": 8.000000000000001e-06, + "loss": 0.069, + "step": 3200 + }, + { + "epoch": 0.04002600065001625, + "grad_norm": 3.3606748580932617, + "learning_rate": 8.005e-06, + "loss": 0.4742, + "step": 3202 + }, + { + "epoch": 0.040051001275031875, + "grad_norm": 5.350225925445557, + "learning_rate": 8.010000000000001e-06, + "loss": 0.4014, + "step": 3204 + }, + { + "epoch": 0.0400760019000475, + "grad_norm": 4.091855525970459, + "learning_rate": 8.015e-06, + "loss": 0.7907, + "step": 3206 + }, + { + "epoch": 0.04010100252506313, + "grad_norm": 7.806231498718262, + "learning_rate": 8.020000000000001e-06, + "loss": 0.977, + "step": 3208 + }, + { + "epoch": 0.040126003150078755, + "grad_norm": 7.151938438415527, + "learning_rate": 8.025e-06, + "loss": 0.7951, + "step": 3210 + }, + { + "epoch": 0.040151003775094375, + "grad_norm": 6.878537654876709, + "learning_rate": 8.030000000000001e-06, + "loss": 2.313, + "step": 3212 + }, + { + "epoch": 0.04017600440011, + "grad_norm": 5.751062393188477, + "learning_rate": 8.035e-06, + "loss": 2.0603, + "step": 3214 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 4.5364203453063965, + "learning_rate": 8.040000000000001e-06, + "loss": 1.6482, + "step": 3216 + }, + { + "epoch": 0.040226005650141256, + "grad_norm": 8.14684009552002, + "learning_rate": 8.045e-06, + "loss": 0.6771, + "step": 3218 + }, + { + "epoch": 0.04025100627515688, + "grad_norm": 1.9341986179351807, + "learning_rate": 8.050000000000001e-06, + "loss": 1.4838, + "step": 3220 + }, + { + "epoch": 0.0402760069001725, + "grad_norm": 3.2384719848632812, + "learning_rate": 8.055e-06, + "loss": 1.004, + "step": 3222 + }, + { + "epoch": 0.04030100752518813, + "grad_norm": 3.803372621536255, + "learning_rate": 8.06e-06, + "loss": 1.6429, + "step": 3224 + }, + { + "epoch": 0.040326008150203756, + "grad_norm": 2.8616371154785156, + "learning_rate": 8.065e-06, + "loss": 1.2348, + "step": 3226 + }, + { + "epoch": 0.04035100877521938, + "grad_norm": 8.214056968688965, + "learning_rate": 8.07e-06, + "loss": 1.52, + "step": 3228 + }, + { + "epoch": 0.04037600940023501, + "grad_norm": 3.033876895904541, + "learning_rate": 8.075000000000001e-06, + "loss": 1.1242, + "step": 3230 + }, + { + "epoch": 0.04040101002525063, + "grad_norm": 3.445988416671753, + "learning_rate": 8.08e-06, + "loss": 0.8352, + "step": 3232 + }, + { + "epoch": 0.040426010650266256, + "grad_norm": 2.0803349018096924, + "learning_rate": 8.085000000000001e-06, + "loss": 0.3178, + "step": 3234 + }, + { + "epoch": 0.04045101127528188, + "grad_norm": 8.515774726867676, + "learning_rate": 8.09e-06, + "loss": 1.5724, + "step": 3236 + }, + { + "epoch": 0.04047601190029751, + "grad_norm": 3.7071876525878906, + "learning_rate": 8.095000000000001e-06, + "loss": 1.3664, + "step": 3238 + }, + { + "epoch": 0.040501012525313136, + "grad_norm": 5.219249248504639, + "learning_rate": 8.1e-06, + "loss": 0.7756, + "step": 3240 + }, + { + "epoch": 0.040526013150328756, + "grad_norm": 8.448517799377441, + "learning_rate": 8.105000000000001e-06, + "loss": 0.7698, + "step": 3242 + }, + { + "epoch": 0.04055101377534438, + "grad_norm": 5.775199890136719, + "learning_rate": 8.110000000000002e-06, + "loss": 1.4858, + "step": 3244 + }, + { + "epoch": 0.04057601440036001, + "grad_norm": 3.6367270946502686, + "learning_rate": 8.115000000000001e-06, + "loss": 1.8485, + "step": 3246 + }, + { + "epoch": 0.040601015025375636, + "grad_norm": 0.8241586089134216, + "learning_rate": 8.120000000000002e-06, + "loss": 1.0667, + "step": 3248 + }, + { + "epoch": 0.04062601565039126, + "grad_norm": 5.742926120758057, + "learning_rate": 8.125000000000001e-06, + "loss": 1.3957, + "step": 3250 + }, + { + "epoch": 0.04065101627540688, + "grad_norm": 5.807712554931641, + "learning_rate": 8.13e-06, + "loss": 1.2552, + "step": 3252 + }, + { + "epoch": 0.04067601690042251, + "grad_norm": 7.096923828125, + "learning_rate": 8.135000000000001e-06, + "loss": 0.8578, + "step": 3254 + }, + { + "epoch": 0.040701017525438137, + "grad_norm": 6.153586387634277, + "learning_rate": 8.14e-06, + "loss": 1.7153, + "step": 3256 + }, + { + "epoch": 0.04072601815045376, + "grad_norm": 4.283786773681641, + "learning_rate": 8.145e-06, + "loss": 1.7977, + "step": 3258 + }, + { + "epoch": 0.04075101877546939, + "grad_norm": 2.8568782806396484, + "learning_rate": 8.15e-06, + "loss": 0.9665, + "step": 3260 + }, + { + "epoch": 0.04077601940048501, + "grad_norm": 4.635697841644287, + "learning_rate": 8.155e-06, + "loss": 1.0432, + "step": 3262 + }, + { + "epoch": 0.04080102002550064, + "grad_norm": 3.463465690612793, + "learning_rate": 8.16e-06, + "loss": 0.9626, + "step": 3264 + }, + { + "epoch": 0.04082602065051626, + "grad_norm": 9.578557968139648, + "learning_rate": 8.165e-06, + "loss": 2.1497, + "step": 3266 + }, + { + "epoch": 0.04085102127553189, + "grad_norm": 7.432554244995117, + "learning_rate": 8.17e-06, + "loss": 1.2087, + "step": 3268 + }, + { + "epoch": 0.04087602190054752, + "grad_norm": 4.395415306091309, + "learning_rate": 8.175e-06, + "loss": 1.6482, + "step": 3270 + }, + { + "epoch": 0.04090102252556314, + "grad_norm": 3.764599561691284, + "learning_rate": 8.18e-06, + "loss": 1.3953, + "step": 3272 + }, + { + "epoch": 0.040926023150578764, + "grad_norm": 6.785884380340576, + "learning_rate": 8.185e-06, + "loss": 2.2027, + "step": 3274 + }, + { + "epoch": 0.04095102377559439, + "grad_norm": 5.606931209564209, + "learning_rate": 8.19e-06, + "loss": 1.9227, + "step": 3276 + }, + { + "epoch": 0.04097602440061002, + "grad_norm": 6.052057266235352, + "learning_rate": 8.195e-06, + "loss": 1.091, + "step": 3278 + }, + { + "epoch": 0.041001025025625644, + "grad_norm": 4.158267021179199, + "learning_rate": 8.2e-06, + "loss": 1.7586, + "step": 3280 + }, + { + "epoch": 0.041026025650641264, + "grad_norm": 32.76613998413086, + "learning_rate": 8.205e-06, + "loss": 0.7744, + "step": 3282 + }, + { + "epoch": 0.04105102627565689, + "grad_norm": 5.925457000732422, + "learning_rate": 8.210000000000001e-06, + "loss": 1.4031, + "step": 3284 + }, + { + "epoch": 0.04107602690067252, + "grad_norm": 0.04624738171696663, + "learning_rate": 8.215e-06, + "loss": 0.7485, + "step": 3286 + }, + { + "epoch": 0.041101027525688144, + "grad_norm": 8.831624984741211, + "learning_rate": 8.220000000000001e-06, + "loss": 1.1722, + "step": 3288 + }, + { + "epoch": 0.04112602815070377, + "grad_norm": 4.057882308959961, + "learning_rate": 8.225e-06, + "loss": 1.4271, + "step": 3290 + }, + { + "epoch": 0.04115102877571939, + "grad_norm": 4.308293342590332, + "learning_rate": 8.23e-06, + "loss": 1.4693, + "step": 3292 + }, + { + "epoch": 0.04117602940073502, + "grad_norm": 7.025139808654785, + "learning_rate": 8.235e-06, + "loss": 1.7481, + "step": 3294 + }, + { + "epoch": 0.041201030025750644, + "grad_norm": 2.8643240928649902, + "learning_rate": 8.24e-06, + "loss": 0.783, + "step": 3296 + }, + { + "epoch": 0.04122603065076627, + "grad_norm": 3.9126169681549072, + "learning_rate": 8.245000000000002e-06, + "loss": 0.7036, + "step": 3298 + }, + { + "epoch": 0.0412510312757819, + "grad_norm": 4.661065578460693, + "learning_rate": 8.25e-06, + "loss": 1.599, + "step": 3300 + }, + { + "epoch": 0.04127603190079752, + "grad_norm": 5.061479091644287, + "learning_rate": 8.255000000000001e-06, + "loss": 0.6097, + "step": 3302 + }, + { + "epoch": 0.041301032525813144, + "grad_norm": 2.870706558227539, + "learning_rate": 8.26e-06, + "loss": 0.8294, + "step": 3304 + }, + { + "epoch": 0.04132603315082877, + "grad_norm": 1.7775028944015503, + "learning_rate": 8.265000000000001e-06, + "loss": 0.4237, + "step": 3306 + }, + { + "epoch": 0.0413510337758444, + "grad_norm": 0.8572032451629639, + "learning_rate": 8.27e-06, + "loss": 0.0218, + "step": 3308 + }, + { + "epoch": 0.041376034400860025, + "grad_norm": 3.739403009414673, + "learning_rate": 8.275000000000001e-06, + "loss": 0.9363, + "step": 3310 + }, + { + "epoch": 0.041401035025875645, + "grad_norm": 1.775026559829712, + "learning_rate": 8.28e-06, + "loss": 0.1252, + "step": 3312 + }, + { + "epoch": 0.04142603565089127, + "grad_norm": 5.021397590637207, + "learning_rate": 8.285000000000001e-06, + "loss": 1.2827, + "step": 3314 + }, + { + "epoch": 0.0414510362759069, + "grad_norm": 0.036953896284103394, + "learning_rate": 8.29e-06, + "loss": 0.3276, + "step": 3316 + }, + { + "epoch": 0.041476036900922525, + "grad_norm": 4.888428211212158, + "learning_rate": 8.295000000000001e-06, + "loss": 1.6621, + "step": 3318 + }, + { + "epoch": 0.04150103752593815, + "grad_norm": 3.8050999641418457, + "learning_rate": 8.3e-06, + "loss": 1.2285, + "step": 3320 + }, + { + "epoch": 0.04152603815095377, + "grad_norm": 0.03632926568388939, + "learning_rate": 8.305000000000001e-06, + "loss": 0.9005, + "step": 3322 + }, + { + "epoch": 0.0415510387759694, + "grad_norm": 8.93421745300293, + "learning_rate": 8.31e-06, + "loss": 1.171, + "step": 3324 + }, + { + "epoch": 0.041576039400985025, + "grad_norm": 4.893916606903076, + "learning_rate": 8.315000000000001e-06, + "loss": 1.4243, + "step": 3326 + }, + { + "epoch": 0.04160104002600065, + "grad_norm": 6.963260173797607, + "learning_rate": 8.32e-06, + "loss": 0.2436, + "step": 3328 + }, + { + "epoch": 0.04162604065101628, + "grad_norm": 5.868851184844971, + "learning_rate": 8.325e-06, + "loss": 0.3398, + "step": 3330 + }, + { + "epoch": 0.0416510412760319, + "grad_norm": 3.1450419425964355, + "learning_rate": 8.33e-06, + "loss": 1.3491, + "step": 3332 + }, + { + "epoch": 0.041676041901047525, + "grad_norm": 4.6553730964660645, + "learning_rate": 8.335e-06, + "loss": 0.6261, + "step": 3334 + }, + { + "epoch": 0.04170104252606315, + "grad_norm": 4.7025227546691895, + "learning_rate": 8.34e-06, + "loss": 0.9468, + "step": 3336 + }, + { + "epoch": 0.04172604315107878, + "grad_norm": 5.508669853210449, + "learning_rate": 8.345e-06, + "loss": 1.9945, + "step": 3338 + }, + { + "epoch": 0.041751043776094406, + "grad_norm": 2.1616952419281006, + "learning_rate": 8.35e-06, + "loss": 0.5679, + "step": 3340 + }, + { + "epoch": 0.041776044401110025, + "grad_norm": 1.0341033935546875, + "learning_rate": 8.355e-06, + "loss": 0.0279, + "step": 3342 + }, + { + "epoch": 0.04180104502612565, + "grad_norm": 6.622447490692139, + "learning_rate": 8.36e-06, + "loss": 0.165, + "step": 3344 + }, + { + "epoch": 0.04182604565114128, + "grad_norm": 2.532137870788574, + "learning_rate": 8.365e-06, + "loss": 1.2035, + "step": 3346 + }, + { + "epoch": 0.041851046276156906, + "grad_norm": 4.904038906097412, + "learning_rate": 8.370000000000001e-06, + "loss": 1.6108, + "step": 3348 + }, + { + "epoch": 0.04187604690117253, + "grad_norm": 3.4851877689361572, + "learning_rate": 8.375e-06, + "loss": 0.4055, + "step": 3350 + }, + { + "epoch": 0.04190104752618815, + "grad_norm": 3.3354389667510986, + "learning_rate": 8.380000000000001e-06, + "loss": 0.9365, + "step": 3352 + }, + { + "epoch": 0.04192604815120378, + "grad_norm": 5.350419521331787, + "learning_rate": 8.385e-06, + "loss": 1.6035, + "step": 3354 + }, + { + "epoch": 0.041951048776219406, + "grad_norm": 3.672447681427002, + "learning_rate": 8.390000000000001e-06, + "loss": 0.0916, + "step": 3356 + }, + { + "epoch": 0.04197604940123503, + "grad_norm": 14.955153465270996, + "learning_rate": 8.395e-06, + "loss": 1.7438, + "step": 3358 + }, + { + "epoch": 0.04200105002625066, + "grad_norm": 5.552403926849365, + "learning_rate": 8.400000000000001e-06, + "loss": 0.99, + "step": 3360 + }, + { + "epoch": 0.04202605065126628, + "grad_norm": 4.977944374084473, + "learning_rate": 8.405000000000002e-06, + "loss": 1.0682, + "step": 3362 + }, + { + "epoch": 0.042051051276281906, + "grad_norm": 5.088313102722168, + "learning_rate": 8.41e-06, + "loss": 1.0421, + "step": 3364 + }, + { + "epoch": 0.04207605190129753, + "grad_norm": 3.640071392059326, + "learning_rate": 8.415000000000002e-06, + "loss": 0.7838, + "step": 3366 + }, + { + "epoch": 0.04210105252631316, + "grad_norm": 5.408273220062256, + "learning_rate": 8.42e-06, + "loss": 0.6264, + "step": 3368 + }, + { + "epoch": 0.042126053151328786, + "grad_norm": 4.827228546142578, + "learning_rate": 8.425000000000001e-06, + "loss": 1.1611, + "step": 3370 + }, + { + "epoch": 0.042151053776344406, + "grad_norm": 4.780372619628906, + "learning_rate": 8.43e-06, + "loss": 1.1495, + "step": 3372 + }, + { + "epoch": 0.04217605440136003, + "grad_norm": 2.6831729412078857, + "learning_rate": 8.435000000000001e-06, + "loss": 0.408, + "step": 3374 + }, + { + "epoch": 0.04220105502637566, + "grad_norm": 0.9503455758094788, + "learning_rate": 8.44e-06, + "loss": 0.181, + "step": 3376 + }, + { + "epoch": 0.04222605565139129, + "grad_norm": 3.54843807220459, + "learning_rate": 8.445000000000001e-06, + "loss": 1.1575, + "step": 3378 + }, + { + "epoch": 0.04225105627640691, + "grad_norm": 5.5384745597839355, + "learning_rate": 8.45e-06, + "loss": 1.7335, + "step": 3380 + }, + { + "epoch": 0.04227605690142253, + "grad_norm": 4.653634071350098, + "learning_rate": 8.455000000000001e-06, + "loss": 1.3065, + "step": 3382 + }, + { + "epoch": 0.04230105752643816, + "grad_norm": 4.413963317871094, + "learning_rate": 8.46e-06, + "loss": 1.8738, + "step": 3384 + }, + { + "epoch": 0.04232605815145379, + "grad_norm": 3.7276604175567627, + "learning_rate": 8.465000000000001e-06, + "loss": 0.9594, + "step": 3386 + }, + { + "epoch": 0.042351058776469414, + "grad_norm": 4.22770881652832, + "learning_rate": 8.47e-06, + "loss": 0.6968, + "step": 3388 + }, + { + "epoch": 0.04237605940148504, + "grad_norm": 11.203651428222656, + "learning_rate": 8.475000000000001e-06, + "loss": 1.5584, + "step": 3390 + }, + { + "epoch": 0.04240106002650066, + "grad_norm": 0.026642078533768654, + "learning_rate": 8.48e-06, + "loss": 0.9027, + "step": 3392 + }, + { + "epoch": 0.04242606065151629, + "grad_norm": 0.02065647952258587, + "learning_rate": 8.485000000000001e-06, + "loss": 0.3838, + "step": 3394 + }, + { + "epoch": 0.042451061276531914, + "grad_norm": 1.7919121980667114, + "learning_rate": 8.49e-06, + "loss": 1.1991, + "step": 3396 + }, + { + "epoch": 0.04247606190154754, + "grad_norm": 5.05344820022583, + "learning_rate": 8.495e-06, + "loss": 1.2098, + "step": 3398 + }, + { + "epoch": 0.04250106252656317, + "grad_norm": 3.5608155727386475, + "learning_rate": 8.5e-06, + "loss": 1.6045, + "step": 3400 + }, + { + "epoch": 0.04252606315157879, + "grad_norm": 4.9673075675964355, + "learning_rate": 8.505e-06, + "loss": 0.8954, + "step": 3402 + }, + { + "epoch": 0.042551063776594414, + "grad_norm": 4.824519157409668, + "learning_rate": 8.51e-06, + "loss": 1.9495, + "step": 3404 + }, + { + "epoch": 0.04257606440161004, + "grad_norm": 4.010466575622559, + "learning_rate": 8.515e-06, + "loss": 0.578, + "step": 3406 + }, + { + "epoch": 0.04260106502662567, + "grad_norm": 4.120970249176025, + "learning_rate": 8.52e-06, + "loss": 1.1281, + "step": 3408 + }, + { + "epoch": 0.042626065651641294, + "grad_norm": 0.030818870291113853, + "learning_rate": 8.525e-06, + "loss": 0.901, + "step": 3410 + }, + { + "epoch": 0.042651066276656914, + "grad_norm": 7.183474540710449, + "learning_rate": 8.530000000000001e-06, + "loss": 1.0595, + "step": 3412 + }, + { + "epoch": 0.04267606690167254, + "grad_norm": 0.989984929561615, + "learning_rate": 8.535e-06, + "loss": 0.1595, + "step": 3414 + }, + { + "epoch": 0.04270106752668817, + "grad_norm": 4.458256721496582, + "learning_rate": 8.540000000000001e-06, + "loss": 1.4542, + "step": 3416 + }, + { + "epoch": 0.042726068151703794, + "grad_norm": 7.237722396850586, + "learning_rate": 8.545e-06, + "loss": 0.7032, + "step": 3418 + }, + { + "epoch": 0.04275106877671942, + "grad_norm": 4.159117698669434, + "learning_rate": 8.550000000000001e-06, + "loss": 0.9269, + "step": 3420 + }, + { + "epoch": 0.04277606940173504, + "grad_norm": 5.920365333557129, + "learning_rate": 8.555e-06, + "loss": 1.4253, + "step": 3422 + }, + { + "epoch": 0.04280107002675067, + "grad_norm": 4.1454548835754395, + "learning_rate": 8.560000000000001e-06, + "loss": 1.708, + "step": 3424 + }, + { + "epoch": 0.042826070651766295, + "grad_norm": 1.6588190793991089, + "learning_rate": 8.565000000000002e-06, + "loss": 0.8429, + "step": 3426 + }, + { + "epoch": 0.04285107127678192, + "grad_norm": 11.843850135803223, + "learning_rate": 8.570000000000001e-06, + "loss": 1.3317, + "step": 3428 + }, + { + "epoch": 0.04287607190179755, + "grad_norm": 5.472208023071289, + "learning_rate": 8.575000000000002e-06, + "loss": 1.1016, + "step": 3430 + }, + { + "epoch": 0.04290107252681317, + "grad_norm": 5.702726364135742, + "learning_rate": 8.580000000000001e-06, + "loss": 1.6483, + "step": 3432 + }, + { + "epoch": 0.042926073151828795, + "grad_norm": 6.637176990509033, + "learning_rate": 8.585000000000002e-06, + "loss": 0.5839, + "step": 3434 + }, + { + "epoch": 0.04295107377684442, + "grad_norm": 1.7858339548110962, + "learning_rate": 8.59e-06, + "loss": 0.3593, + "step": 3436 + }, + { + "epoch": 0.04297607440186005, + "grad_norm": 4.133267402648926, + "learning_rate": 8.595000000000002e-06, + "loss": 1.0957, + "step": 3438 + }, + { + "epoch": 0.043001075026875675, + "grad_norm": 7.730734348297119, + "learning_rate": 8.6e-06, + "loss": 0.639, + "step": 3440 + }, + { + "epoch": 0.043026075651891295, + "grad_norm": 6.727161407470703, + "learning_rate": 8.605000000000001e-06, + "loss": 0.5923, + "step": 3442 + }, + { + "epoch": 0.04305107627690692, + "grad_norm": 15.354159355163574, + "learning_rate": 8.61e-06, + "loss": 1.611, + "step": 3444 + }, + { + "epoch": 0.04307607690192255, + "grad_norm": 3.2283973693847656, + "learning_rate": 8.615000000000001e-06, + "loss": 1.0156, + "step": 3446 + }, + { + "epoch": 0.043101077526938175, + "grad_norm": 4.524585247039795, + "learning_rate": 8.62e-06, + "loss": 1.7165, + "step": 3448 + }, + { + "epoch": 0.0431260781519538, + "grad_norm": 4.9305925369262695, + "learning_rate": 8.625000000000001e-06, + "loss": 1.0178, + "step": 3450 + }, + { + "epoch": 0.04315107877696942, + "grad_norm": 4.355340480804443, + "learning_rate": 8.63e-06, + "loss": 1.5057, + "step": 3452 + }, + { + "epoch": 0.04317607940198505, + "grad_norm": 0.05263165384531021, + "learning_rate": 8.635000000000001e-06, + "loss": 0.6491, + "step": 3454 + }, + { + "epoch": 0.043201080027000675, + "grad_norm": 6.910388946533203, + "learning_rate": 8.64e-06, + "loss": 0.1862, + "step": 3456 + }, + { + "epoch": 0.0432260806520163, + "grad_norm": 3.719341278076172, + "learning_rate": 8.645000000000001e-06, + "loss": 0.1236, + "step": 3458 + }, + { + "epoch": 0.04325108127703193, + "grad_norm": 2.9715237617492676, + "learning_rate": 8.65e-06, + "loss": 0.8723, + "step": 3460 + }, + { + "epoch": 0.04327608190204755, + "grad_norm": 5.920607089996338, + "learning_rate": 8.655000000000001e-06, + "loss": 1.2903, + "step": 3462 + }, + { + "epoch": 0.043301082527063176, + "grad_norm": 0.03376586362719536, + "learning_rate": 8.66e-06, + "loss": 0.4325, + "step": 3464 + }, + { + "epoch": 0.0433260831520788, + "grad_norm": 2.527961492538452, + "learning_rate": 8.665000000000001e-06, + "loss": 1.3487, + "step": 3466 + }, + { + "epoch": 0.04335108377709443, + "grad_norm": 4.114664077758789, + "learning_rate": 8.67e-06, + "loss": 1.5245, + "step": 3468 + }, + { + "epoch": 0.043376084402110056, + "grad_norm": 3.7701189517974854, + "learning_rate": 8.675e-06, + "loss": 1.0583, + "step": 3470 + }, + { + "epoch": 0.043401085027125676, + "grad_norm": 4.32952880859375, + "learning_rate": 8.68e-06, + "loss": 1.2277, + "step": 3472 + }, + { + "epoch": 0.0434260856521413, + "grad_norm": 11.320393562316895, + "learning_rate": 8.685e-06, + "loss": 0.6764, + "step": 3474 + }, + { + "epoch": 0.04345108627715693, + "grad_norm": 2.9113852977752686, + "learning_rate": 8.690000000000002e-06, + "loss": 0.8665, + "step": 3476 + }, + { + "epoch": 0.043476086902172556, + "grad_norm": 7.251353740692139, + "learning_rate": 8.695e-06, + "loss": 0.9994, + "step": 3478 + }, + { + "epoch": 0.04350108752718818, + "grad_norm": 9.18433952331543, + "learning_rate": 8.700000000000001e-06, + "loss": 1.1538, + "step": 3480 + }, + { + "epoch": 0.0435260881522038, + "grad_norm": 7.593164920806885, + "learning_rate": 8.705e-06, + "loss": 0.753, + "step": 3482 + }, + { + "epoch": 0.04355108877721943, + "grad_norm": 1.078994631767273, + "learning_rate": 8.710000000000001e-06, + "loss": 0.5038, + "step": 3484 + }, + { + "epoch": 0.043576089402235056, + "grad_norm": 6.909257888793945, + "learning_rate": 8.715e-06, + "loss": 0.5893, + "step": 3486 + }, + { + "epoch": 0.04360109002725068, + "grad_norm": 5.704934597015381, + "learning_rate": 8.720000000000001e-06, + "loss": 1.5866, + "step": 3488 + }, + { + "epoch": 0.04362609065226631, + "grad_norm": 0.3102138936519623, + "learning_rate": 8.725000000000002e-06, + "loss": 0.2894, + "step": 3490 + }, + { + "epoch": 0.04365109127728193, + "grad_norm": 0.2760644853115082, + "learning_rate": 8.730000000000001e-06, + "loss": 0.6125, + "step": 3492 + }, + { + "epoch": 0.043676091902297556, + "grad_norm": 0.31071844696998596, + "learning_rate": 8.735000000000002e-06, + "loss": 0.0694, + "step": 3494 + }, + { + "epoch": 0.04370109252731318, + "grad_norm": 4.55494499206543, + "learning_rate": 8.740000000000001e-06, + "loss": 1.1008, + "step": 3496 + }, + { + "epoch": 0.04372609315232881, + "grad_norm": 4.609190464019775, + "learning_rate": 8.745000000000002e-06, + "loss": 1.0706, + "step": 3498 + }, + { + "epoch": 0.04375109377734444, + "grad_norm": 5.1607890129089355, + "learning_rate": 8.750000000000001e-06, + "loss": 1.6407, + "step": 3500 + }, + { + "epoch": 0.043776094402360057, + "grad_norm": 3.990158796310425, + "learning_rate": 8.755e-06, + "loss": 0.044, + "step": 3502 + }, + { + "epoch": 0.04380109502737568, + "grad_norm": 2.220517635345459, + "learning_rate": 8.76e-06, + "loss": 0.5007, + "step": 3504 + }, + { + "epoch": 0.04382609565239131, + "grad_norm": 4.3973236083984375, + "learning_rate": 8.765e-06, + "loss": 0.7669, + "step": 3506 + }, + { + "epoch": 0.04385109627740694, + "grad_norm": 10.870895385742188, + "learning_rate": 8.77e-06, + "loss": 1.3005, + "step": 3508 + }, + { + "epoch": 0.043876096902422564, + "grad_norm": 0.22286581993103027, + "learning_rate": 8.775e-06, + "loss": 0.0506, + "step": 3510 + }, + { + "epoch": 0.04390109752743818, + "grad_norm": 0.237738698720932, + "learning_rate": 8.78e-06, + "loss": 0.5614, + "step": 3512 + }, + { + "epoch": 0.04392609815245381, + "grad_norm": 7.295814037322998, + "learning_rate": 8.785e-06, + "loss": 1.0989, + "step": 3514 + }, + { + "epoch": 0.04395109877746944, + "grad_norm": 5.839472770690918, + "learning_rate": 8.79e-06, + "loss": 1.5846, + "step": 3516 + }, + { + "epoch": 0.043976099402485064, + "grad_norm": 0.0106419138610363, + "learning_rate": 8.795e-06, + "loss": 0.616, + "step": 3518 + }, + { + "epoch": 0.04400110002750069, + "grad_norm": 8.766353607177734, + "learning_rate": 8.8e-06, + "loss": 1.9624, + "step": 3520 + }, + { + "epoch": 0.04402610065251631, + "grad_norm": 4.115775108337402, + "learning_rate": 8.805e-06, + "loss": 1.4107, + "step": 3522 + }, + { + "epoch": 0.04405110127753194, + "grad_norm": 3.0331504344940186, + "learning_rate": 8.81e-06, + "loss": 1.159, + "step": 3524 + }, + { + "epoch": 0.044076101902547564, + "grad_norm": 11.42657470703125, + "learning_rate": 8.815e-06, + "loss": 0.8669, + "step": 3526 + }, + { + "epoch": 0.04410110252756319, + "grad_norm": 0.8035628795623779, + "learning_rate": 8.82e-06, + "loss": 0.1036, + "step": 3528 + }, + { + "epoch": 0.04412610315257882, + "grad_norm": 10.04200553894043, + "learning_rate": 8.825000000000001e-06, + "loss": 0.8282, + "step": 3530 + }, + { + "epoch": 0.04415110377759444, + "grad_norm": 3.260572910308838, + "learning_rate": 8.83e-06, + "loss": 0.235, + "step": 3532 + }, + { + "epoch": 0.044176104402610064, + "grad_norm": 3.272277355194092, + "learning_rate": 8.835000000000001e-06, + "loss": 1.2694, + "step": 3534 + }, + { + "epoch": 0.04420110502762569, + "grad_norm": 2.861950397491455, + "learning_rate": 8.84e-06, + "loss": 0.5227, + "step": 3536 + }, + { + "epoch": 0.04422610565264132, + "grad_norm": 6.04716157913208, + "learning_rate": 8.845000000000001e-06, + "loss": 1.7256, + "step": 3538 + }, + { + "epoch": 0.044251106277656944, + "grad_norm": 0.15482813119888306, + "learning_rate": 8.85e-06, + "loss": 0.7584, + "step": 3540 + }, + { + "epoch": 0.044276106902672564, + "grad_norm": 3.095668077468872, + "learning_rate": 8.855e-06, + "loss": 0.7356, + "step": 3542 + }, + { + "epoch": 0.04430110752768819, + "grad_norm": 6.38306188583374, + "learning_rate": 8.860000000000002e-06, + "loss": 1.6943, + "step": 3544 + }, + { + "epoch": 0.04432610815270382, + "grad_norm": 4.114628314971924, + "learning_rate": 8.865e-06, + "loss": 0.2557, + "step": 3546 + }, + { + "epoch": 0.044351108777719445, + "grad_norm": 2.0164546966552734, + "learning_rate": 8.870000000000001e-06, + "loss": 0.5118, + "step": 3548 + }, + { + "epoch": 0.04437610940273507, + "grad_norm": 6.2539167404174805, + "learning_rate": 8.875e-06, + "loss": 0.4113, + "step": 3550 + }, + { + "epoch": 0.04440111002775069, + "grad_norm": 0.6525101661682129, + "learning_rate": 8.880000000000001e-06, + "loss": 0.4539, + "step": 3552 + }, + { + "epoch": 0.04442611065276632, + "grad_norm": 5.580752372741699, + "learning_rate": 8.885e-06, + "loss": 1.0732, + "step": 3554 + }, + { + "epoch": 0.044451111277781945, + "grad_norm": 4.501588344573975, + "learning_rate": 8.890000000000001e-06, + "loss": 1.2032, + "step": 3556 + }, + { + "epoch": 0.04447611190279757, + "grad_norm": 10.75645637512207, + "learning_rate": 8.895e-06, + "loss": 0.1728, + "step": 3558 + }, + { + "epoch": 0.0445011125278132, + "grad_norm": 0.6131554841995239, + "learning_rate": 8.900000000000001e-06, + "loss": 0.6261, + "step": 3560 + }, + { + "epoch": 0.04452611315282882, + "grad_norm": 5.735292911529541, + "learning_rate": 8.905e-06, + "loss": 0.8691, + "step": 3562 + }, + { + "epoch": 0.044551113777844445, + "grad_norm": 5.8910651206970215, + "learning_rate": 8.910000000000001e-06, + "loss": 1.0891, + "step": 3564 + }, + { + "epoch": 0.04457611440286007, + "grad_norm": 4.9456305503845215, + "learning_rate": 8.915e-06, + "loss": 1.4591, + "step": 3566 + }, + { + "epoch": 0.0446011150278757, + "grad_norm": 3.8192830085754395, + "learning_rate": 8.920000000000001e-06, + "loss": 0.7399, + "step": 3568 + }, + { + "epoch": 0.044626115652891325, + "grad_norm": 4.057834148406982, + "learning_rate": 8.925e-06, + "loss": 0.708, + "step": 3570 + }, + { + "epoch": 0.044651116277906945, + "grad_norm": 3.046861171722412, + "learning_rate": 8.930000000000001e-06, + "loss": 1.4623, + "step": 3572 + }, + { + "epoch": 0.04467611690292257, + "grad_norm": 4.308742046356201, + "learning_rate": 8.935e-06, + "loss": 1.095, + "step": 3574 + }, + { + "epoch": 0.0447011175279382, + "grad_norm": 2.8533992767333984, + "learning_rate": 8.94e-06, + "loss": 1.0997, + "step": 3576 + }, + { + "epoch": 0.044726118152953825, + "grad_norm": 0.6283295154571533, + "learning_rate": 8.945e-06, + "loss": 0.9555, + "step": 3578 + }, + { + "epoch": 0.04475111877796945, + "grad_norm": 7.594255447387695, + "learning_rate": 8.95e-06, + "loss": 1.2809, + "step": 3580 + }, + { + "epoch": 0.04477611940298507, + "grad_norm": 6.400524616241455, + "learning_rate": 8.955e-06, + "loss": 1.2546, + "step": 3582 + }, + { + "epoch": 0.0448011200280007, + "grad_norm": 0.1285165697336197, + "learning_rate": 8.96e-06, + "loss": 0.0055, + "step": 3584 + }, + { + "epoch": 0.044826120653016326, + "grad_norm": 10.067387580871582, + "learning_rate": 8.965e-06, + "loss": 0.7929, + "step": 3586 + }, + { + "epoch": 0.04485112127803195, + "grad_norm": 2.0935990810394287, + "learning_rate": 8.97e-06, + "loss": 0.2336, + "step": 3588 + }, + { + "epoch": 0.04487612190304758, + "grad_norm": 7.242427825927734, + "learning_rate": 8.975e-06, + "loss": 2.0288, + "step": 3590 + }, + { + "epoch": 0.0449011225280632, + "grad_norm": 8.047736167907715, + "learning_rate": 8.98e-06, + "loss": 1.1137, + "step": 3592 + }, + { + "epoch": 0.044926123153078826, + "grad_norm": 4.420129299163818, + "learning_rate": 8.985000000000001e-06, + "loss": 0.5931, + "step": 3594 + }, + { + "epoch": 0.04495112377809445, + "grad_norm": 4.294179439544678, + "learning_rate": 8.99e-06, + "loss": 1.7957, + "step": 3596 + }, + { + "epoch": 0.04497612440311008, + "grad_norm": 2.0426993370056152, + "learning_rate": 8.995000000000001e-06, + "loss": 0.2601, + "step": 3598 + }, + { + "epoch": 0.045001125028125706, + "grad_norm": 4.958590030670166, + "learning_rate": 9e-06, + "loss": 1.2187, + "step": 3600 + }, + { + "epoch": 0.045026125653141326, + "grad_norm": 3.965308427810669, + "learning_rate": 9.005000000000001e-06, + "loss": 1.4264, + "step": 3602 + }, + { + "epoch": 0.04505112627815695, + "grad_norm": 5.754740238189697, + "learning_rate": 9.01e-06, + "loss": 1.3099, + "step": 3604 + }, + { + "epoch": 0.04507612690317258, + "grad_norm": 10.888871192932129, + "learning_rate": 9.015000000000001e-06, + "loss": 1.2145, + "step": 3606 + }, + { + "epoch": 0.045101127528188206, + "grad_norm": 7.042413711547852, + "learning_rate": 9.020000000000002e-06, + "loss": 1.116, + "step": 3608 + }, + { + "epoch": 0.04512612815320383, + "grad_norm": 5.227004528045654, + "learning_rate": 9.025e-06, + "loss": 0.4964, + "step": 3610 + }, + { + "epoch": 0.04515112877821945, + "grad_norm": 4.490574359893799, + "learning_rate": 9.030000000000002e-06, + "loss": 0.5419, + "step": 3612 + }, + { + "epoch": 0.04517612940323508, + "grad_norm": 8.669466972351074, + "learning_rate": 9.035e-06, + "loss": 2.284, + "step": 3614 + }, + { + "epoch": 0.045201130028250706, + "grad_norm": 4.265906810760498, + "learning_rate": 9.040000000000002e-06, + "loss": 0.8805, + "step": 3616 + }, + { + "epoch": 0.04522613065326633, + "grad_norm": 6.2480082511901855, + "learning_rate": 9.045e-06, + "loss": 1.6405, + "step": 3618 + }, + { + "epoch": 0.04525113127828196, + "grad_norm": 4.422083854675293, + "learning_rate": 9.050000000000001e-06, + "loss": 1.3815, + "step": 3620 + }, + { + "epoch": 0.04527613190329758, + "grad_norm": 6.5013017654418945, + "learning_rate": 9.055e-06, + "loss": 2.0068, + "step": 3622 + }, + { + "epoch": 0.04530113252831321, + "grad_norm": 5.516392707824707, + "learning_rate": 9.060000000000001e-06, + "loss": 1.0328, + "step": 3624 + }, + { + "epoch": 0.04532613315332883, + "grad_norm": 17.669261932373047, + "learning_rate": 9.065e-06, + "loss": 1.2343, + "step": 3626 + }, + { + "epoch": 0.04535113377834446, + "grad_norm": 8.906614303588867, + "learning_rate": 9.070000000000001e-06, + "loss": 1.3014, + "step": 3628 + }, + { + "epoch": 0.04537613440336009, + "grad_norm": 2.815326452255249, + "learning_rate": 9.075e-06, + "loss": 0.6278, + "step": 3630 + }, + { + "epoch": 0.04540113502837571, + "grad_norm": 4.411290168762207, + "learning_rate": 9.080000000000001e-06, + "loss": 0.138, + "step": 3632 + }, + { + "epoch": 0.045426135653391334, + "grad_norm": 4.201783180236816, + "learning_rate": 9.085e-06, + "loss": 1.2991, + "step": 3634 + }, + { + "epoch": 0.04545113627840696, + "grad_norm": 4.524484634399414, + "learning_rate": 9.090000000000001e-06, + "loss": 1.2469, + "step": 3636 + }, + { + "epoch": 0.04547613690342259, + "grad_norm": 3.784005641937256, + "learning_rate": 9.095e-06, + "loss": 1.3036, + "step": 3638 + }, + { + "epoch": 0.045501137528438214, + "grad_norm": 6.2912750244140625, + "learning_rate": 9.100000000000001e-06, + "loss": 2.2516, + "step": 3640 + }, + { + "epoch": 0.045526138153453834, + "grad_norm": 8.563497543334961, + "learning_rate": 9.105e-06, + "loss": 0.1762, + "step": 3642 + }, + { + "epoch": 0.04555113877846946, + "grad_norm": 0.21464793384075165, + "learning_rate": 9.110000000000001e-06, + "loss": 0.0029, + "step": 3644 + }, + { + "epoch": 0.04557613940348509, + "grad_norm": 10.573267936706543, + "learning_rate": 9.115e-06, + "loss": 1.304, + "step": 3646 + }, + { + "epoch": 0.045601140028500714, + "grad_norm": 3.9790561199188232, + "learning_rate": 9.12e-06, + "loss": 0.729, + "step": 3648 + }, + { + "epoch": 0.04562614065351634, + "grad_norm": 7.457300662994385, + "learning_rate": 9.125e-06, + "loss": 2.3112, + "step": 3650 + }, + { + "epoch": 0.04565114127853196, + "grad_norm": 4.312674522399902, + "learning_rate": 9.13e-06, + "loss": 1.7488, + "step": 3652 + }, + { + "epoch": 0.04567614190354759, + "grad_norm": 1.857483148574829, + "learning_rate": 9.135e-06, + "loss": 0.9165, + "step": 3654 + }, + { + "epoch": 0.045701142528563214, + "grad_norm": 5.696174144744873, + "learning_rate": 9.14e-06, + "loss": 0.0461, + "step": 3656 + }, + { + "epoch": 0.04572614315357884, + "grad_norm": 3.906691074371338, + "learning_rate": 9.145000000000001e-06, + "loss": 0.6746, + "step": 3658 + }, + { + "epoch": 0.04575114377859447, + "grad_norm": 5.1564764976501465, + "learning_rate": 9.15e-06, + "loss": 1.8625, + "step": 3660 + }, + { + "epoch": 0.04577614440361009, + "grad_norm": 3.8954453468322754, + "learning_rate": 9.155000000000001e-06, + "loss": 1.1892, + "step": 3662 + }, + { + "epoch": 0.045801145028625714, + "grad_norm": 0.21977996826171875, + "learning_rate": 9.16e-06, + "loss": 0.319, + "step": 3664 + }, + { + "epoch": 0.04582614565364134, + "grad_norm": 5.466187953948975, + "learning_rate": 9.165000000000001e-06, + "loss": 1.271, + "step": 3666 + }, + { + "epoch": 0.04585114627865697, + "grad_norm": 3.4836246967315674, + "learning_rate": 9.17e-06, + "loss": 0.64, + "step": 3668 + }, + { + "epoch": 0.045876146903672595, + "grad_norm": 3.6487534046173096, + "learning_rate": 9.175000000000001e-06, + "loss": 1.264, + "step": 3670 + }, + { + "epoch": 0.045901147528688215, + "grad_norm": 4.946775913238525, + "learning_rate": 9.180000000000002e-06, + "loss": 1.218, + "step": 3672 + }, + { + "epoch": 0.04592614815370384, + "grad_norm": 6.665493488311768, + "learning_rate": 9.185000000000001e-06, + "loss": 1.4331, + "step": 3674 + }, + { + "epoch": 0.04595114877871947, + "grad_norm": 0.16356809437274933, + "learning_rate": 9.190000000000002e-06, + "loss": 0.0524, + "step": 3676 + }, + { + "epoch": 0.045976149403735095, + "grad_norm": 3.043720245361328, + "learning_rate": 9.195000000000001e-06, + "loss": 0.4532, + "step": 3678 + }, + { + "epoch": 0.04600115002875072, + "grad_norm": 5.28151798248291, + "learning_rate": 9.200000000000002e-06, + "loss": 1.732, + "step": 3680 + }, + { + "epoch": 0.04602615065376634, + "grad_norm": 5.329622268676758, + "learning_rate": 9.205e-06, + "loss": 0.5001, + "step": 3682 + }, + { + "epoch": 0.04605115127878197, + "grad_norm": 0.0388028509914875, + "learning_rate": 9.210000000000002e-06, + "loss": 0.6328, + "step": 3684 + }, + { + "epoch": 0.046076151903797595, + "grad_norm": 4.3600945472717285, + "learning_rate": 9.215e-06, + "loss": 2.1671, + "step": 3686 + }, + { + "epoch": 0.04610115252881322, + "grad_norm": 3.361593008041382, + "learning_rate": 9.220000000000002e-06, + "loss": 0.6705, + "step": 3688 + }, + { + "epoch": 0.04612615315382885, + "grad_norm": 1.6765902042388916, + "learning_rate": 9.225e-06, + "loss": 0.0352, + "step": 3690 + }, + { + "epoch": 0.04615115377884447, + "grad_norm": 9.26447582244873, + "learning_rate": 9.230000000000001e-06, + "loss": 0.5594, + "step": 3692 + }, + { + "epoch": 0.046176154403860095, + "grad_norm": 0.16325438022613525, + "learning_rate": 9.235e-06, + "loss": 0.0012, + "step": 3694 + }, + { + "epoch": 0.04620115502887572, + "grad_norm": 4.286826133728027, + "learning_rate": 9.240000000000001e-06, + "loss": 1.3662, + "step": 3696 + }, + { + "epoch": 0.04622615565389135, + "grad_norm": 3.227947473526001, + "learning_rate": 9.245e-06, + "loss": 0.2871, + "step": 3698 + }, + { + "epoch": 0.046251156278906976, + "grad_norm": 0.5296881198883057, + "learning_rate": 9.250000000000001e-06, + "loss": 1.1517, + "step": 3700 + }, + { + "epoch": 0.046276156903922595, + "grad_norm": 4.056119441986084, + "learning_rate": 9.255e-06, + "loss": 1.7512, + "step": 3702 + }, + { + "epoch": 0.04630115752893822, + "grad_norm": 6.373730182647705, + "learning_rate": 9.260000000000001e-06, + "loss": 0.8561, + "step": 3704 + }, + { + "epoch": 0.04632615815395385, + "grad_norm": 9.878593444824219, + "learning_rate": 9.265e-06, + "loss": 0.6608, + "step": 3706 + }, + { + "epoch": 0.046351158778969476, + "grad_norm": 3.2729361057281494, + "learning_rate": 9.270000000000001e-06, + "loss": 1.2716, + "step": 3708 + }, + { + "epoch": 0.0463761594039851, + "grad_norm": 6.087862968444824, + "learning_rate": 9.275e-06, + "loss": 1.2441, + "step": 3710 + }, + { + "epoch": 0.04640116002900072, + "grad_norm": 4.818881511688232, + "learning_rate": 9.280000000000001e-06, + "loss": 0.8677, + "step": 3712 + }, + { + "epoch": 0.04642616065401635, + "grad_norm": 4.376165866851807, + "learning_rate": 9.285e-06, + "loss": 0.2216, + "step": 3714 + }, + { + "epoch": 0.046451161279031976, + "grad_norm": 3.116837978363037, + "learning_rate": 9.29e-06, + "loss": 1.3146, + "step": 3716 + }, + { + "epoch": 0.0464761619040476, + "grad_norm": 6.060060501098633, + "learning_rate": 9.295e-06, + "loss": 1.6463, + "step": 3718 + }, + { + "epoch": 0.04650116252906323, + "grad_norm": 0.024685664102435112, + "learning_rate": 9.3e-06, + "loss": 1.0428, + "step": 3720 + }, + { + "epoch": 0.04652616315407885, + "grad_norm": 8.158454895019531, + "learning_rate": 9.305000000000002e-06, + "loss": 1.6095, + "step": 3722 + }, + { + "epoch": 0.046551163779094476, + "grad_norm": 3.320040702819824, + "learning_rate": 9.31e-06, + "loss": 1.0, + "step": 3724 + }, + { + "epoch": 0.0465761644041101, + "grad_norm": 4.316228866577148, + "learning_rate": 9.315000000000001e-06, + "loss": 1.2891, + "step": 3726 + }, + { + "epoch": 0.04660116502912573, + "grad_norm": 6.198246955871582, + "learning_rate": 9.32e-06, + "loss": 0.4956, + "step": 3728 + }, + { + "epoch": 0.046626165654141356, + "grad_norm": 6.164811134338379, + "learning_rate": 9.325000000000001e-06, + "loss": 1.6084, + "step": 3730 + }, + { + "epoch": 0.046651166279156976, + "grad_norm": 2.9118592739105225, + "learning_rate": 9.33e-06, + "loss": 1.4979, + "step": 3732 + }, + { + "epoch": 0.0466761669041726, + "grad_norm": 6.580169200897217, + "learning_rate": 9.335000000000001e-06, + "loss": 1.9742, + "step": 3734 + }, + { + "epoch": 0.04670116752918823, + "grad_norm": 4.7730231285095215, + "learning_rate": 9.340000000000002e-06, + "loss": 0.7024, + "step": 3736 + }, + { + "epoch": 0.046726168154203856, + "grad_norm": 3.0225701332092285, + "learning_rate": 9.345000000000001e-06, + "loss": 1.2248, + "step": 3738 + }, + { + "epoch": 0.04675116877921948, + "grad_norm": 6.0205864906311035, + "learning_rate": 9.350000000000002e-06, + "loss": 1.2827, + "step": 3740 + }, + { + "epoch": 0.0467761694042351, + "grad_norm": 4.125958442687988, + "learning_rate": 9.355000000000001e-06, + "loss": 0.6813, + "step": 3742 + }, + { + "epoch": 0.04680117002925073, + "grad_norm": 2.154783248901367, + "learning_rate": 9.360000000000002e-06, + "loss": 1.4922, + "step": 3744 + }, + { + "epoch": 0.04682617065426636, + "grad_norm": 3.913991928100586, + "learning_rate": 9.365000000000001e-06, + "loss": 0.6382, + "step": 3746 + }, + { + "epoch": 0.04685117127928198, + "grad_norm": 4.113982200622559, + "learning_rate": 9.370000000000002e-06, + "loss": 0.928, + "step": 3748 + }, + { + "epoch": 0.04687617190429761, + "grad_norm": 0.013694973662495613, + "learning_rate": 9.375000000000001e-06, + "loss": 0.4826, + "step": 3750 + }, + { + "epoch": 0.04690117252931323, + "grad_norm": 5.786873817443848, + "learning_rate": 9.38e-06, + "loss": 1.6592, + "step": 3752 + }, + { + "epoch": 0.04692617315432886, + "grad_norm": 3.684112310409546, + "learning_rate": 9.385e-06, + "loss": 0.5587, + "step": 3754 + }, + { + "epoch": 0.046951173779344484, + "grad_norm": 2.7932889461517334, + "learning_rate": 9.39e-06, + "loss": 0.7946, + "step": 3756 + }, + { + "epoch": 0.04697617440436011, + "grad_norm": 9.036128997802734, + "learning_rate": 9.395e-06, + "loss": 1.4766, + "step": 3758 + }, + { + "epoch": 0.04700117502937574, + "grad_norm": 9.491774559020996, + "learning_rate": 9.4e-06, + "loss": 3.5812, + "step": 3760 + }, + { + "epoch": 0.04702617565439136, + "grad_norm": 4.8207926750183105, + "learning_rate": 9.405e-06, + "loss": 1.0372, + "step": 3762 + }, + { + "epoch": 0.047051176279406984, + "grad_norm": 0.007848971523344517, + "learning_rate": 9.41e-06, + "loss": 0.282, + "step": 3764 + }, + { + "epoch": 0.04707617690442261, + "grad_norm": 4.114239692687988, + "learning_rate": 9.415e-06, + "loss": 0.4582, + "step": 3766 + }, + { + "epoch": 0.04710117752943824, + "grad_norm": 4.620325088500977, + "learning_rate": 9.42e-06, + "loss": 0.5335, + "step": 3768 + }, + { + "epoch": 0.047126178154453864, + "grad_norm": 5.1304545402526855, + "learning_rate": 9.425e-06, + "loss": 1.0082, + "step": 3770 + }, + { + "epoch": 0.047151178779469484, + "grad_norm": 4.506119728088379, + "learning_rate": 9.43e-06, + "loss": 1.4463, + "step": 3772 + }, + { + "epoch": 0.04717617940448511, + "grad_norm": 20.541982650756836, + "learning_rate": 9.435e-06, + "loss": 0.7193, + "step": 3774 + }, + { + "epoch": 0.04720118002950074, + "grad_norm": 0.5517037510871887, + "learning_rate": 9.440000000000001e-06, + "loss": 0.0321, + "step": 3776 + }, + { + "epoch": 0.047226180654516364, + "grad_norm": 2.8127548694610596, + "learning_rate": 9.445e-06, + "loss": 0.0822, + "step": 3778 + }, + { + "epoch": 0.04725118127953199, + "grad_norm": 6.370314121246338, + "learning_rate": 9.450000000000001e-06, + "loss": 0.9981, + "step": 3780 + }, + { + "epoch": 0.04727618190454761, + "grad_norm": 4.978466033935547, + "learning_rate": 9.455e-06, + "loss": 2.0786, + "step": 3782 + }, + { + "epoch": 0.04730118252956324, + "grad_norm": 2.932413101196289, + "learning_rate": 9.460000000000001e-06, + "loss": 0.7371, + "step": 3784 + }, + { + "epoch": 0.047326183154578864, + "grad_norm": 5.279724597930908, + "learning_rate": 9.465e-06, + "loss": 1.4961, + "step": 3786 + }, + { + "epoch": 0.04735118377959449, + "grad_norm": 0.09683757275342941, + "learning_rate": 9.47e-06, + "loss": 0.1805, + "step": 3788 + }, + { + "epoch": 0.04737618440461012, + "grad_norm": 7.100369930267334, + "learning_rate": 9.475000000000002e-06, + "loss": 0.3008, + "step": 3790 + }, + { + "epoch": 0.04740118502962574, + "grad_norm": 5.9402618408203125, + "learning_rate": 9.48e-06, + "loss": 1.8401, + "step": 3792 + }, + { + "epoch": 0.047426185654641365, + "grad_norm": 4.707239627838135, + "learning_rate": 9.485000000000002e-06, + "loss": 0.9946, + "step": 3794 + }, + { + "epoch": 0.04745118627965699, + "grad_norm": 3.7591938972473145, + "learning_rate": 9.49e-06, + "loss": 1.8341, + "step": 3796 + }, + { + "epoch": 0.04747618690467262, + "grad_norm": 3.188807725906372, + "learning_rate": 9.495000000000001e-06, + "loss": 0.8975, + "step": 3798 + }, + { + "epoch": 0.047501187529688245, + "grad_norm": 3.429738998413086, + "learning_rate": 9.5e-06, + "loss": 0.1289, + "step": 3800 + }, + { + "epoch": 0.047526188154703865, + "grad_norm": 4.066949844360352, + "learning_rate": 9.505000000000001e-06, + "loss": 1.5314, + "step": 3802 + }, + { + "epoch": 0.04755118877971949, + "grad_norm": 12.893852233886719, + "learning_rate": 9.51e-06, + "loss": 1.2919, + "step": 3804 + }, + { + "epoch": 0.04757618940473512, + "grad_norm": 5.143531322479248, + "learning_rate": 9.515000000000001e-06, + "loss": 1.2178, + "step": 3806 + }, + { + "epoch": 0.047601190029750745, + "grad_norm": 3.244065761566162, + "learning_rate": 9.52e-06, + "loss": 0.9631, + "step": 3808 + }, + { + "epoch": 0.04762619065476637, + "grad_norm": 4.874557971954346, + "learning_rate": 9.525000000000001e-06, + "loss": 1.4569, + "step": 3810 + }, + { + "epoch": 0.04765119127978199, + "grad_norm": 5.702967643737793, + "learning_rate": 9.53e-06, + "loss": 0.8411, + "step": 3812 + }, + { + "epoch": 0.04767619190479762, + "grad_norm": 0.33279210329055786, + "learning_rate": 9.535000000000001e-06, + "loss": 0.7554, + "step": 3814 + }, + { + "epoch": 0.047701192529813245, + "grad_norm": 1.7354540824890137, + "learning_rate": 9.54e-06, + "loss": 0.0784, + "step": 3816 + }, + { + "epoch": 0.04772619315482887, + "grad_norm": 0.0518982969224453, + "learning_rate": 9.545000000000001e-06, + "loss": 0.4129, + "step": 3818 + }, + { + "epoch": 0.0477511937798445, + "grad_norm": 3.8598263263702393, + "learning_rate": 9.55e-06, + "loss": 0.7802, + "step": 3820 + }, + { + "epoch": 0.04777619440486012, + "grad_norm": 6.61614990234375, + "learning_rate": 9.555e-06, + "loss": 0.743, + "step": 3822 + }, + { + "epoch": 0.047801195029875745, + "grad_norm": 5.751448154449463, + "learning_rate": 9.56e-06, + "loss": 0.945, + "step": 3824 + }, + { + "epoch": 0.04782619565489137, + "grad_norm": 9.1668701171875, + "learning_rate": 9.565e-06, + "loss": 0.9369, + "step": 3826 + }, + { + "epoch": 0.047851196279907, + "grad_norm": 0.3845333158969879, + "learning_rate": 9.57e-06, + "loss": 0.6179, + "step": 3828 + }, + { + "epoch": 0.047876196904922626, + "grad_norm": 3.0926990509033203, + "learning_rate": 9.575e-06, + "loss": 1.5557, + "step": 3830 + }, + { + "epoch": 0.047901197529938246, + "grad_norm": 4.694002151489258, + "learning_rate": 9.58e-06, + "loss": 1.3454, + "step": 3832 + }, + { + "epoch": 0.04792619815495387, + "grad_norm": 2.2252230644226074, + "learning_rate": 9.585e-06, + "loss": 1.1173, + "step": 3834 + }, + { + "epoch": 0.0479511987799695, + "grad_norm": 3.86765456199646, + "learning_rate": 9.59e-06, + "loss": 0.7808, + "step": 3836 + }, + { + "epoch": 0.047976199404985126, + "grad_norm": 2.7008440494537354, + "learning_rate": 9.595e-06, + "loss": 0.7695, + "step": 3838 + }, + { + "epoch": 0.04800120003000075, + "grad_norm": 3.771148443222046, + "learning_rate": 9.600000000000001e-06, + "loss": 0.7704, + "step": 3840 + }, + { + "epoch": 0.04802620065501637, + "grad_norm": 7.279515743255615, + "learning_rate": 9.605e-06, + "loss": 0.4847, + "step": 3842 + }, + { + "epoch": 0.048051201280032, + "grad_norm": 4.737011432647705, + "learning_rate": 9.610000000000001e-06, + "loss": 1.4043, + "step": 3844 + }, + { + "epoch": 0.048076201905047626, + "grad_norm": 3.2025742530822754, + "learning_rate": 9.615e-06, + "loss": 1.5066, + "step": 3846 + }, + { + "epoch": 0.04810120253006325, + "grad_norm": 3.0945885181427, + "learning_rate": 9.620000000000001e-06, + "loss": 0.9274, + "step": 3848 + }, + { + "epoch": 0.04812620315507888, + "grad_norm": 4.882870674133301, + "learning_rate": 9.625e-06, + "loss": 0.284, + "step": 3850 + }, + { + "epoch": 0.0481512037800945, + "grad_norm": 0.006784007418900728, + "learning_rate": 9.630000000000001e-06, + "loss": 1.005, + "step": 3852 + }, + { + "epoch": 0.048176204405110126, + "grad_norm": 7.651547908782959, + "learning_rate": 9.635000000000002e-06, + "loss": 2.3955, + "step": 3854 + }, + { + "epoch": 0.04820120503012575, + "grad_norm": 0.5199981331825256, + "learning_rate": 9.640000000000001e-06, + "loss": 0.9174, + "step": 3856 + }, + { + "epoch": 0.04822620565514138, + "grad_norm": 3.537665367126465, + "learning_rate": 9.645000000000002e-06, + "loss": 1.1379, + "step": 3858 + }, + { + "epoch": 0.04825120628015701, + "grad_norm": 7.168635368347168, + "learning_rate": 9.65e-06, + "loss": 0.5019, + "step": 3860 + }, + { + "epoch": 0.048276206905172626, + "grad_norm": 3.6024422645568848, + "learning_rate": 9.655000000000002e-06, + "loss": 1.5012, + "step": 3862 + }, + { + "epoch": 0.04830120753018825, + "grad_norm": 0.019366472959518433, + "learning_rate": 9.66e-06, + "loss": 0.7031, + "step": 3864 + }, + { + "epoch": 0.04832620815520388, + "grad_norm": 5.455802917480469, + "learning_rate": 9.665000000000001e-06, + "loss": 1.1567, + "step": 3866 + }, + { + "epoch": 0.04835120878021951, + "grad_norm": 8.457307815551758, + "learning_rate": 9.67e-06, + "loss": 0.8586, + "step": 3868 + }, + { + "epoch": 0.048376209405235134, + "grad_norm": 4.7850518226623535, + "learning_rate": 9.675000000000001e-06, + "loss": 1.5386, + "step": 3870 + }, + { + "epoch": 0.04840121003025075, + "grad_norm": 4.182426452636719, + "learning_rate": 9.68e-06, + "loss": 1.2979, + "step": 3872 + }, + { + "epoch": 0.04842621065526638, + "grad_norm": 4.103063583374023, + "learning_rate": 9.685000000000001e-06, + "loss": 0.1126, + "step": 3874 + }, + { + "epoch": 0.04845121128028201, + "grad_norm": 15.27856159210205, + "learning_rate": 9.69e-06, + "loss": 1.0342, + "step": 3876 + }, + { + "epoch": 0.048476211905297634, + "grad_norm": 9.451139450073242, + "learning_rate": 9.695000000000001e-06, + "loss": 1.2316, + "step": 3878 + }, + { + "epoch": 0.04850121253031326, + "grad_norm": 4.251308441162109, + "learning_rate": 9.7e-06, + "loss": 1.5189, + "step": 3880 + }, + { + "epoch": 0.04852621315532888, + "grad_norm": 3.3344993591308594, + "learning_rate": 9.705000000000001e-06, + "loss": 1.8255, + "step": 3882 + }, + { + "epoch": 0.04855121378034451, + "grad_norm": 6.152163028717041, + "learning_rate": 9.71e-06, + "loss": 1.4118, + "step": 3884 + }, + { + "epoch": 0.048576214405360134, + "grad_norm": 6.31234073638916, + "learning_rate": 9.715000000000001e-06, + "loss": 1.3424, + "step": 3886 + }, + { + "epoch": 0.04860121503037576, + "grad_norm": 2.7133703231811523, + "learning_rate": 9.72e-06, + "loss": 1.6415, + "step": 3888 + }, + { + "epoch": 0.04862621565539139, + "grad_norm": 2.705239772796631, + "learning_rate": 9.725000000000001e-06, + "loss": 0.9562, + "step": 3890 + }, + { + "epoch": 0.04865121628040701, + "grad_norm": 4.877296447753906, + "learning_rate": 9.73e-06, + "loss": 1.3037, + "step": 3892 + }, + { + "epoch": 0.048676216905422634, + "grad_norm": 4.973609924316406, + "learning_rate": 9.735e-06, + "loss": 0.6701, + "step": 3894 + }, + { + "epoch": 0.04870121753043826, + "grad_norm": 4.005776405334473, + "learning_rate": 9.74e-06, + "loss": 1.9133, + "step": 3896 + }, + { + "epoch": 0.04872621815545389, + "grad_norm": 4.474048614501953, + "learning_rate": 9.745e-06, + "loss": 0.6307, + "step": 3898 + }, + { + "epoch": 0.048751218780469514, + "grad_norm": 3.473562240600586, + "learning_rate": 9.75e-06, + "loss": 0.8894, + "step": 3900 + }, + { + "epoch": 0.048776219405485134, + "grad_norm": 4.642077445983887, + "learning_rate": 9.755e-06, + "loss": 1.459, + "step": 3902 + }, + { + "epoch": 0.04880122003050076, + "grad_norm": 2.1750476360321045, + "learning_rate": 9.760000000000001e-06, + "loss": 0.4042, + "step": 3904 + }, + { + "epoch": 0.04882622065551639, + "grad_norm": 3.0496745109558105, + "learning_rate": 9.765e-06, + "loss": 0.5237, + "step": 3906 + }, + { + "epoch": 0.048851221280532015, + "grad_norm": 4.164435386657715, + "learning_rate": 9.770000000000001e-06, + "loss": 1.5871, + "step": 3908 + }, + { + "epoch": 0.04887622190554764, + "grad_norm": 7.792657852172852, + "learning_rate": 9.775e-06, + "loss": 1.1913, + "step": 3910 + }, + { + "epoch": 0.04890122253056326, + "grad_norm": 8.115327835083008, + "learning_rate": 9.780000000000001e-06, + "loss": 1.3622, + "step": 3912 + }, + { + "epoch": 0.04892622315557889, + "grad_norm": 16.785419464111328, + "learning_rate": 9.785e-06, + "loss": 0.1674, + "step": 3914 + }, + { + "epoch": 0.048951223780594515, + "grad_norm": 5.298471927642822, + "learning_rate": 9.790000000000001e-06, + "loss": 1.2663, + "step": 3916 + }, + { + "epoch": 0.04897622440561014, + "grad_norm": 5.3461503982543945, + "learning_rate": 9.795000000000002e-06, + "loss": 0.7488, + "step": 3918 + }, + { + "epoch": 0.04900122503062577, + "grad_norm": 4.069904804229736, + "learning_rate": 9.800000000000001e-06, + "loss": 1.1913, + "step": 3920 + }, + { + "epoch": 0.04902622565564139, + "grad_norm": 5.606521129608154, + "learning_rate": 9.805000000000002e-06, + "loss": 1.2897, + "step": 3922 + }, + { + "epoch": 0.049051226280657015, + "grad_norm": 4.238129138946533, + "learning_rate": 9.810000000000001e-06, + "loss": 1.0447, + "step": 3924 + }, + { + "epoch": 0.04907622690567264, + "grad_norm": 2.3340163230895996, + "learning_rate": 9.815000000000002e-06, + "loss": 1.0458, + "step": 3926 + }, + { + "epoch": 0.04910122753068827, + "grad_norm": 0.6509089469909668, + "learning_rate": 9.820000000000001e-06, + "loss": 0.0894, + "step": 3928 + }, + { + "epoch": 0.049126228155703895, + "grad_norm": 3.5736336708068848, + "learning_rate": 9.825000000000002e-06, + "loss": 0.7586, + "step": 3930 + }, + { + "epoch": 0.049151228780719515, + "grad_norm": 0.008986346423625946, + "learning_rate": 9.83e-06, + "loss": 0.4781, + "step": 3932 + }, + { + "epoch": 0.04917622940573514, + "grad_norm": 5.45517635345459, + "learning_rate": 9.835000000000002e-06, + "loss": 1.424, + "step": 3934 + }, + { + "epoch": 0.04920123003075077, + "grad_norm": 2.318965196609497, + "learning_rate": 9.84e-06, + "loss": 1.2414, + "step": 3936 + }, + { + "epoch": 0.049226230655766395, + "grad_norm": 5.341896057128906, + "learning_rate": 9.845000000000001e-06, + "loss": 1.6732, + "step": 3938 + }, + { + "epoch": 0.04925123128078202, + "grad_norm": 4.257114410400391, + "learning_rate": 9.85e-06, + "loss": 1.11, + "step": 3940 + }, + { + "epoch": 0.04927623190579764, + "grad_norm": 0.004891535267233849, + "learning_rate": 9.855000000000001e-06, + "loss": 0.082, + "step": 3942 + }, + { + "epoch": 0.04930123253081327, + "grad_norm": 4.193057060241699, + "learning_rate": 9.86e-06, + "loss": 0.813, + "step": 3944 + }, + { + "epoch": 0.049326233155828896, + "grad_norm": 12.542588233947754, + "learning_rate": 9.865000000000001e-06, + "loss": 0.9648, + "step": 3946 + }, + { + "epoch": 0.04935123378084452, + "grad_norm": 2.625563859939575, + "learning_rate": 9.87e-06, + "loss": 1.0541, + "step": 3948 + }, + { + "epoch": 0.04937623440586015, + "grad_norm": 5.57685661315918, + "learning_rate": 9.875000000000001e-06, + "loss": 1.6464, + "step": 3950 + }, + { + "epoch": 0.04940123503087577, + "grad_norm": 2.9129021167755127, + "learning_rate": 9.88e-06, + "loss": 1.1753, + "step": 3952 + }, + { + "epoch": 0.049426235655891396, + "grad_norm": 0.0008301262278109789, + "learning_rate": 9.885000000000001e-06, + "loss": 0.1242, + "step": 3954 + }, + { + "epoch": 0.04945123628090702, + "grad_norm": 2.6666767597198486, + "learning_rate": 9.89e-06, + "loss": 1.2829, + "step": 3956 + }, + { + "epoch": 0.04947623690592265, + "grad_norm": 14.842369079589844, + "learning_rate": 9.895000000000001e-06, + "loss": 0.8936, + "step": 3958 + }, + { + "epoch": 0.049501237530938276, + "grad_norm": 4.271240711212158, + "learning_rate": 9.9e-06, + "loss": 1.47, + "step": 3960 + }, + { + "epoch": 0.049526238155953896, + "grad_norm": 7.711320877075195, + "learning_rate": 9.905000000000001e-06, + "loss": 1.3852, + "step": 3962 + }, + { + "epoch": 0.04955123878096952, + "grad_norm": 0.00043176155304536223, + "learning_rate": 9.91e-06, + "loss": 0.0001, + "step": 3964 + }, + { + "epoch": 0.04957623940598515, + "grad_norm": 0.013421537354588509, + "learning_rate": 9.915e-06, + "loss": 0.6485, + "step": 3966 + }, + { + "epoch": 0.049601240031000776, + "grad_norm": 2.7949023246765137, + "learning_rate": 9.920000000000002e-06, + "loss": 0.1609, + "step": 3968 + }, + { + "epoch": 0.0496262406560164, + "grad_norm": 2.448202610015869, + "learning_rate": 9.925e-06, + "loss": 0.7353, + "step": 3970 + }, + { + "epoch": 0.04965124128103202, + "grad_norm": 4.806116104125977, + "learning_rate": 9.930000000000001e-06, + "loss": 0.1419, + "step": 3972 + }, + { + "epoch": 0.04967624190604765, + "grad_norm": 5.67274808883667, + "learning_rate": 9.935e-06, + "loss": 1.3685, + "step": 3974 + }, + { + "epoch": 0.049701242531063276, + "grad_norm": 2.982246160507202, + "learning_rate": 9.940000000000001e-06, + "loss": 0.5979, + "step": 3976 + }, + { + "epoch": 0.0497262431560789, + "grad_norm": 5.332012176513672, + "learning_rate": 9.945e-06, + "loss": 1.0913, + "step": 3978 + }, + { + "epoch": 0.04975124378109453, + "grad_norm": 6.123744964599609, + "learning_rate": 9.950000000000001e-06, + "loss": 2.3978, + "step": 3980 + }, + { + "epoch": 0.04977624440611015, + "grad_norm": 5.540482521057129, + "learning_rate": 9.955000000000002e-06, + "loss": 1.5334, + "step": 3982 + }, + { + "epoch": 0.049801245031125776, + "grad_norm": 5.122238636016846, + "learning_rate": 9.960000000000001e-06, + "loss": 1.1093, + "step": 3984 + }, + { + "epoch": 0.0498262456561414, + "grad_norm": 10.091115951538086, + "learning_rate": 9.965000000000002e-06, + "loss": 1.5061, + "step": 3986 + }, + { + "epoch": 0.04985124628115703, + "grad_norm": 5.338637351989746, + "learning_rate": 9.970000000000001e-06, + "loss": 0.8662, + "step": 3988 + }, + { + "epoch": 0.04987624690617266, + "grad_norm": 1.8781379461288452, + "learning_rate": 9.975000000000002e-06, + "loss": 0.0726, + "step": 3990 + }, + { + "epoch": 0.04990124753118828, + "grad_norm": 12.79279613494873, + "learning_rate": 9.980000000000001e-06, + "loss": 1.6996, + "step": 3992 + }, + { + "epoch": 0.0499262481562039, + "grad_norm": 2.5353453159332275, + "learning_rate": 9.985000000000002e-06, + "loss": 0.5605, + "step": 3994 + }, + { + "epoch": 0.04995124878121953, + "grad_norm": 4.272116661071777, + "learning_rate": 9.990000000000001e-06, + "loss": 2.0273, + "step": 3996 + }, + { + "epoch": 0.04997624940623516, + "grad_norm": 3.0113022327423096, + "learning_rate": 9.995000000000002e-06, + "loss": 1.041, + "step": 3998 + }, + { + "epoch": 0.050001250031250784, + "grad_norm": 0.3005222976207733, + "learning_rate": 1e-05, + "loss": 0.0471, + "step": 4000 + }, + { + "epoch": 0.050026250656266404, + "grad_norm": 8.04782772064209, + "learning_rate": 1.0005e-05, + "loss": 0.8013, + "step": 4002 + }, + { + "epoch": 0.05005125128128203, + "grad_norm": 2.7679054737091064, + "learning_rate": 1.0009999999999999e-05, + "loss": 0.0884, + "step": 4004 + }, + { + "epoch": 0.05007625190629766, + "grad_norm": 2.6924736499786377, + "learning_rate": 1.0015000000000002e-05, + "loss": 1.3407, + "step": 4006 + }, + { + "epoch": 0.050101252531313284, + "grad_norm": 2.7913057804107666, + "learning_rate": 1.002e-05, + "loss": 0.5468, + "step": 4008 + }, + { + "epoch": 0.05012625315632891, + "grad_norm": 9.182475090026855, + "learning_rate": 1.0025e-05, + "loss": 1.2007, + "step": 4010 + }, + { + "epoch": 0.05015125378134453, + "grad_norm": 4.994586944580078, + "learning_rate": 1.003e-05, + "loss": 1.079, + "step": 4012 + }, + { + "epoch": 0.05017625440636016, + "grad_norm": 3.8627967834472656, + "learning_rate": 1.0035000000000001e-05, + "loss": 1.1909, + "step": 4014 + }, + { + "epoch": 0.050201255031375784, + "grad_norm": 3.115783929824829, + "learning_rate": 1.004e-05, + "loss": 1.8102, + "step": 4016 + }, + { + "epoch": 0.05022625565639141, + "grad_norm": 7.0019612312316895, + "learning_rate": 1.0045e-05, + "loss": 1.0321, + "step": 4018 + }, + { + "epoch": 0.05025125628140704, + "grad_norm": 6.81225061416626, + "learning_rate": 1.005e-05, + "loss": 1.5425, + "step": 4020 + }, + { + "epoch": 0.05027625690642266, + "grad_norm": 23.190290451049805, + "learning_rate": 1.0055000000000001e-05, + "loss": 0.898, + "step": 4022 + }, + { + "epoch": 0.050301257531438284, + "grad_norm": 3.793278217315674, + "learning_rate": 1.006e-05, + "loss": 1.2056, + "step": 4024 + }, + { + "epoch": 0.05032625815645391, + "grad_norm": 6.3201189041137695, + "learning_rate": 1.0065000000000001e-05, + "loss": 1.6195, + "step": 4026 + }, + { + "epoch": 0.05035125878146954, + "grad_norm": 3.7001073360443115, + "learning_rate": 1.007e-05, + "loss": 2.162, + "step": 4028 + }, + { + "epoch": 0.050376259406485165, + "grad_norm": 3.6296274662017822, + "learning_rate": 1.0075000000000001e-05, + "loss": 1.9738, + "step": 4030 + }, + { + "epoch": 0.050401260031500784, + "grad_norm": 4.027717590332031, + "learning_rate": 1.008e-05, + "loss": 0.7993, + "step": 4032 + }, + { + "epoch": 0.05042626065651641, + "grad_norm": 3.0308332443237305, + "learning_rate": 1.0085000000000001e-05, + "loss": 0.9776, + "step": 4034 + }, + { + "epoch": 0.05045126128153204, + "grad_norm": 3.5659799575805664, + "learning_rate": 1.009e-05, + "loss": 1.1584, + "step": 4036 + }, + { + "epoch": 0.050476261906547665, + "grad_norm": 3.899202346801758, + "learning_rate": 1.0095e-05, + "loss": 1.044, + "step": 4038 + }, + { + "epoch": 0.05050126253156329, + "grad_norm": 2.9484097957611084, + "learning_rate": 1.0100000000000002e-05, + "loss": 0.8521, + "step": 4040 + }, + { + "epoch": 0.05052626315657891, + "grad_norm": 3.0226166248321533, + "learning_rate": 1.0105e-05, + "loss": 1.037, + "step": 4042 + }, + { + "epoch": 0.05055126378159454, + "grad_norm": 0.004222841467708349, + "learning_rate": 1.011e-05, + "loss": 0.0046, + "step": 4044 + }, + { + "epoch": 0.050576264406610165, + "grad_norm": 6.8351521492004395, + "learning_rate": 1.0115000000000002e-05, + "loss": 1.1844, + "step": 4046 + }, + { + "epoch": 0.05060126503162579, + "grad_norm": 3.310295581817627, + "learning_rate": 1.0120000000000001e-05, + "loss": 0.5047, + "step": 4048 + }, + { + "epoch": 0.05062626565664142, + "grad_norm": 2.6742799282073975, + "learning_rate": 1.0125e-05, + "loss": 0.6928, + "step": 4050 + }, + { + "epoch": 0.05065126628165704, + "grad_norm": 3.4607887268066406, + "learning_rate": 1.013e-05, + "loss": 1.0932, + "step": 4052 + }, + { + "epoch": 0.050676266906672665, + "grad_norm": 7.498687267303467, + "learning_rate": 1.0135000000000002e-05, + "loss": 1.3279, + "step": 4054 + }, + { + "epoch": 0.05070126753168829, + "grad_norm": 8.608054161071777, + "learning_rate": 1.0140000000000001e-05, + "loss": 1.506, + "step": 4056 + }, + { + "epoch": 0.05072626815670392, + "grad_norm": 5.365523815155029, + "learning_rate": 1.0145e-05, + "loss": 0.8544, + "step": 4058 + }, + { + "epoch": 0.050751268781719545, + "grad_norm": 3.8050966262817383, + "learning_rate": 1.015e-05, + "loss": 1.8217, + "step": 4060 + }, + { + "epoch": 0.050776269406735165, + "grad_norm": 5.255065441131592, + "learning_rate": 1.0155000000000002e-05, + "loss": 1.6767, + "step": 4062 + }, + { + "epoch": 0.05080127003175079, + "grad_norm": 4.348194599151611, + "learning_rate": 1.0160000000000001e-05, + "loss": 1.4098, + "step": 4064 + }, + { + "epoch": 0.05082627065676642, + "grad_norm": 4.183195114135742, + "learning_rate": 1.0165e-05, + "loss": 1.1798, + "step": 4066 + }, + { + "epoch": 0.050851271281782046, + "grad_norm": 2.7589340209960938, + "learning_rate": 1.017e-05, + "loss": 1.1909, + "step": 4068 + }, + { + "epoch": 0.05087627190679767, + "grad_norm": 5.3432230949401855, + "learning_rate": 1.0175000000000002e-05, + "loss": 0.8132, + "step": 4070 + }, + { + "epoch": 0.05090127253181329, + "grad_norm": 4.443942546844482, + "learning_rate": 1.018e-05, + "loss": 0.7056, + "step": 4072 + }, + { + "epoch": 0.05092627315682892, + "grad_norm": 2.9527535438537598, + "learning_rate": 1.0185e-05, + "loss": 1.0494, + "step": 4074 + }, + { + "epoch": 0.050951273781844546, + "grad_norm": 9.12804126739502, + "learning_rate": 1.019e-05, + "loss": 1.1563, + "step": 4076 + }, + { + "epoch": 0.05097627440686017, + "grad_norm": 0.02220343053340912, + "learning_rate": 1.0195000000000001e-05, + "loss": 0.1846, + "step": 4078 + }, + { + "epoch": 0.0510012750318758, + "grad_norm": 4.6426286697387695, + "learning_rate": 1.02e-05, + "loss": 1.2121, + "step": 4080 + }, + { + "epoch": 0.05102627565689142, + "grad_norm": 5.094712734222412, + "learning_rate": 1.0205e-05, + "loss": 1.2412, + "step": 4082 + }, + { + "epoch": 0.051051276281907046, + "grad_norm": 5.713558197021484, + "learning_rate": 1.021e-05, + "loss": 1.7206, + "step": 4084 + }, + { + "epoch": 0.05107627690692267, + "grad_norm": 0.1552184820175171, + "learning_rate": 1.0215000000000001e-05, + "loss": 0.5805, + "step": 4086 + }, + { + "epoch": 0.0511012775319383, + "grad_norm": 4.183095455169678, + "learning_rate": 1.022e-05, + "loss": 1.0773, + "step": 4088 + }, + { + "epoch": 0.051126278156953926, + "grad_norm": 4.09202766418457, + "learning_rate": 1.0225000000000001e-05, + "loss": 0.6463, + "step": 4090 + }, + { + "epoch": 0.051151278781969546, + "grad_norm": 0.005458100698888302, + "learning_rate": 1.023e-05, + "loss": 0.7184, + "step": 4092 + }, + { + "epoch": 0.05117627940698517, + "grad_norm": 6.676637649536133, + "learning_rate": 1.0235000000000001e-05, + "loss": 0.5241, + "step": 4094 + }, + { + "epoch": 0.0512012800320008, + "grad_norm": 4.033051490783691, + "learning_rate": 1.024e-05, + "loss": 0.8674, + "step": 4096 + }, + { + "epoch": 0.051226280657016426, + "grad_norm": 7.432525157928467, + "learning_rate": 1.0245000000000001e-05, + "loss": 2.2875, + "step": 4098 + }, + { + "epoch": 0.05125128128203205, + "grad_norm": 0.0016994763864204288, + "learning_rate": 1.025e-05, + "loss": 0.1107, + "step": 4100 + }, + { + "epoch": 0.05127628190704767, + "grad_norm": 11.2993745803833, + "learning_rate": 1.0255000000000001e-05, + "loss": 1.5729, + "step": 4102 + }, + { + "epoch": 0.0513012825320633, + "grad_norm": 0.026323599740862846, + "learning_rate": 1.0260000000000002e-05, + "loss": 0.2488, + "step": 4104 + }, + { + "epoch": 0.05132628315707893, + "grad_norm": 8.399213790893555, + "learning_rate": 1.0265e-05, + "loss": 1.5271, + "step": 4106 + }, + { + "epoch": 0.05135128378209455, + "grad_norm": 27.97135353088379, + "learning_rate": 1.027e-05, + "loss": 0.4347, + "step": 4108 + }, + { + "epoch": 0.05137628440711018, + "grad_norm": 3.120980978012085, + "learning_rate": 1.0275000000000002e-05, + "loss": 0.862, + "step": 4110 + }, + { + "epoch": 0.0514012850321258, + "grad_norm": 3.0856096744537354, + "learning_rate": 1.0280000000000002e-05, + "loss": 0.7116, + "step": 4112 + }, + { + "epoch": 0.05142628565714143, + "grad_norm": 5.083218574523926, + "learning_rate": 1.0285e-05, + "loss": 1.0727, + "step": 4114 + }, + { + "epoch": 0.051451286282157054, + "grad_norm": 0.008883646689355373, + "learning_rate": 1.029e-05, + "loss": 2.0318, + "step": 4116 + }, + { + "epoch": 0.05147628690717268, + "grad_norm": 4.337731838226318, + "learning_rate": 1.0295000000000002e-05, + "loss": 0.5677, + "step": 4118 + }, + { + "epoch": 0.05150128753218831, + "grad_norm": 4.099752426147461, + "learning_rate": 1.0300000000000001e-05, + "loss": 0.3511, + "step": 4120 + }, + { + "epoch": 0.05152628815720393, + "grad_norm": 3.7414462566375732, + "learning_rate": 1.0305e-05, + "loss": 0.671, + "step": 4122 + }, + { + "epoch": 0.051551288782219554, + "grad_norm": 0.0035337633453309536, + "learning_rate": 1.031e-05, + "loss": 1.5977, + "step": 4124 + }, + { + "epoch": 0.05157628940723518, + "grad_norm": 0.016865242272615433, + "learning_rate": 1.0315000000000002e-05, + "loss": 0.3142, + "step": 4126 + }, + { + "epoch": 0.05160129003225081, + "grad_norm": 2.5167558193206787, + "learning_rate": 1.0320000000000001e-05, + "loss": 0.4235, + "step": 4128 + }, + { + "epoch": 0.051626290657266434, + "grad_norm": 1.2143880128860474, + "learning_rate": 1.0325e-05, + "loss": 0.613, + "step": 4130 + }, + { + "epoch": 0.051651291282282054, + "grad_norm": 6.03091287612915, + "learning_rate": 1.033e-05, + "loss": 1.4447, + "step": 4132 + }, + { + "epoch": 0.05167629190729768, + "grad_norm": 3.7254714965820312, + "learning_rate": 1.0335000000000002e-05, + "loss": 0.7647, + "step": 4134 + }, + { + "epoch": 0.05170129253231331, + "grad_norm": 3.069565534591675, + "learning_rate": 1.0340000000000001e-05, + "loss": 0.6356, + "step": 4136 + }, + { + "epoch": 0.051726293157328934, + "grad_norm": 10.254975318908691, + "learning_rate": 1.0345e-05, + "loss": 0.3547, + "step": 4138 + }, + { + "epoch": 0.05175129378234456, + "grad_norm": 4.35267972946167, + "learning_rate": 1.0350000000000001e-05, + "loss": 1.2276, + "step": 4140 + }, + { + "epoch": 0.05177629440736018, + "grad_norm": 3.9616732597351074, + "learning_rate": 1.0355000000000002e-05, + "loss": 0.6143, + "step": 4142 + }, + { + "epoch": 0.05180129503237581, + "grad_norm": 5.8100385665893555, + "learning_rate": 1.036e-05, + "loss": 1.1012, + "step": 4144 + }, + { + "epoch": 0.051826295657391434, + "grad_norm": 0.007611478213220835, + "learning_rate": 1.0365e-05, + "loss": 1.3781, + "step": 4146 + }, + { + "epoch": 0.05185129628240706, + "grad_norm": 5.190976619720459, + "learning_rate": 1.037e-05, + "loss": 1.4686, + "step": 4148 + }, + { + "epoch": 0.05187629690742269, + "grad_norm": 5.454734802246094, + "learning_rate": 1.0375000000000001e-05, + "loss": 0.8189, + "step": 4150 + }, + { + "epoch": 0.05190129753243831, + "grad_norm": 3.3223118782043457, + "learning_rate": 1.038e-05, + "loss": 0.8859, + "step": 4152 + }, + { + "epoch": 0.051926298157453935, + "grad_norm": 4.496255397796631, + "learning_rate": 1.0385000000000001e-05, + "loss": 1.3718, + "step": 4154 + }, + { + "epoch": 0.05195129878246956, + "grad_norm": 3.0585291385650635, + "learning_rate": 1.039e-05, + "loss": 2.1815, + "step": 4156 + }, + { + "epoch": 0.05197629940748519, + "grad_norm": 5.067299842834473, + "learning_rate": 1.0395000000000001e-05, + "loss": 0.2427, + "step": 4158 + }, + { + "epoch": 0.052001300032500815, + "grad_norm": 4.641566753387451, + "learning_rate": 1.04e-05, + "loss": 1.7848, + "step": 4160 + }, + { + "epoch": 0.052026300657516435, + "grad_norm": 8.957743644714355, + "learning_rate": 1.0405000000000001e-05, + "loss": 0.647, + "step": 4162 + }, + { + "epoch": 0.05205130128253206, + "grad_norm": 5.748458385467529, + "learning_rate": 1.041e-05, + "loss": 1.1701, + "step": 4164 + }, + { + "epoch": 0.05207630190754769, + "grad_norm": 2.838620185852051, + "learning_rate": 1.0415000000000001e-05, + "loss": 0.7176, + "step": 4166 + }, + { + "epoch": 0.052101302532563315, + "grad_norm": 5.509230613708496, + "learning_rate": 1.0420000000000002e-05, + "loss": 1.1117, + "step": 4168 + }, + { + "epoch": 0.05212630315757894, + "grad_norm": 3.2971138954162598, + "learning_rate": 1.0425000000000001e-05, + "loss": 1.5554, + "step": 4170 + }, + { + "epoch": 0.05215130378259456, + "grad_norm": 4.881767272949219, + "learning_rate": 1.043e-05, + "loss": 0.6797, + "step": 4172 + }, + { + "epoch": 0.05217630440761019, + "grad_norm": 3.2036826610565186, + "learning_rate": 1.0435000000000003e-05, + "loss": 0.7788, + "step": 4174 + }, + { + "epoch": 0.052201305032625815, + "grad_norm": 3.987278461456299, + "learning_rate": 1.0440000000000002e-05, + "loss": 0.2701, + "step": 4176 + }, + { + "epoch": 0.05222630565764144, + "grad_norm": 3.3789219856262207, + "learning_rate": 1.0445e-05, + "loss": 0.2398, + "step": 4178 + }, + { + "epoch": 0.05225130628265707, + "grad_norm": 0.004856455605477095, + "learning_rate": 1.045e-05, + "loss": 0.0001, + "step": 4180 + }, + { + "epoch": 0.05227630690767269, + "grad_norm": 0.0015958811854943633, + "learning_rate": 1.0455000000000002e-05, + "loss": 0.5683, + "step": 4182 + }, + { + "epoch": 0.052301307532688315, + "grad_norm": 3.686892509460449, + "learning_rate": 1.0460000000000001e-05, + "loss": 1.8323, + "step": 4184 + }, + { + "epoch": 0.05232630815770394, + "grad_norm": 0.03422735631465912, + "learning_rate": 1.0465e-05, + "loss": 0.6014, + "step": 4186 + }, + { + "epoch": 0.05235130878271957, + "grad_norm": 3.764528751373291, + "learning_rate": 1.047e-05, + "loss": 1.1699, + "step": 4188 + }, + { + "epoch": 0.052376309407735196, + "grad_norm": 3.3103644847869873, + "learning_rate": 1.0475000000000002e-05, + "loss": 0.5638, + "step": 4190 + }, + { + "epoch": 0.052401310032750816, + "grad_norm": 2.5440990924835205, + "learning_rate": 1.0480000000000001e-05, + "loss": 0.7379, + "step": 4192 + }, + { + "epoch": 0.05242631065776644, + "grad_norm": 8.419360160827637, + "learning_rate": 1.0485e-05, + "loss": 0.5871, + "step": 4194 + }, + { + "epoch": 0.05245131128278207, + "grad_norm": 3.47670841217041, + "learning_rate": 1.049e-05, + "loss": 0.5212, + "step": 4196 + }, + { + "epoch": 0.052476311907797696, + "grad_norm": 3.675471782684326, + "learning_rate": 1.0495000000000002e-05, + "loss": 0.705, + "step": 4198 + }, + { + "epoch": 0.05250131253281332, + "grad_norm": 8.438237190246582, + "learning_rate": 1.0500000000000001e-05, + "loss": 1.2089, + "step": 4200 + }, + { + "epoch": 0.05252631315782894, + "grad_norm": 5.863653659820557, + "learning_rate": 1.0505e-05, + "loss": 2.4342, + "step": 4202 + }, + { + "epoch": 0.05255131378284457, + "grad_norm": 0.0047645578160882, + "learning_rate": 1.0510000000000001e-05, + "loss": 0.8032, + "step": 4204 + }, + { + "epoch": 0.052576314407860196, + "grad_norm": 4.066840171813965, + "learning_rate": 1.0515000000000002e-05, + "loss": 0.8588, + "step": 4206 + }, + { + "epoch": 0.05260131503287582, + "grad_norm": 0.011098155751824379, + "learning_rate": 1.0520000000000001e-05, + "loss": 0.1853, + "step": 4208 + }, + { + "epoch": 0.05262631565789145, + "grad_norm": 2.303466558456421, + "learning_rate": 1.0525e-05, + "loss": 1.4147, + "step": 4210 + }, + { + "epoch": 0.05265131628290707, + "grad_norm": 2.4073643684387207, + "learning_rate": 1.053e-05, + "loss": 1.0496, + "step": 4212 + }, + { + "epoch": 0.052676316907922696, + "grad_norm": 3.4930267333984375, + "learning_rate": 1.0535000000000002e-05, + "loss": 0.942, + "step": 4214 + }, + { + "epoch": 0.05270131753293832, + "grad_norm": 5.754742622375488, + "learning_rate": 1.054e-05, + "loss": 1.1953, + "step": 4216 + }, + { + "epoch": 0.05272631815795395, + "grad_norm": 4.825891494750977, + "learning_rate": 1.0545000000000002e-05, + "loss": 1.0193, + "step": 4218 + }, + { + "epoch": 0.052751318782969576, + "grad_norm": 4.087713241577148, + "learning_rate": 1.055e-05, + "loss": 1.0665, + "step": 4220 + }, + { + "epoch": 0.052776319407985196, + "grad_norm": 4.260284423828125, + "learning_rate": 1.0555000000000001e-05, + "loss": 1.463, + "step": 4222 + }, + { + "epoch": 0.05280132003300082, + "grad_norm": 5.740818977355957, + "learning_rate": 1.056e-05, + "loss": 1.2798, + "step": 4224 + }, + { + "epoch": 0.05282632065801645, + "grad_norm": 3.401501417160034, + "learning_rate": 1.0565000000000001e-05, + "loss": 0.6416, + "step": 4226 + }, + { + "epoch": 0.05285132128303208, + "grad_norm": 7.1874189376831055, + "learning_rate": 1.057e-05, + "loss": 0.8956, + "step": 4228 + }, + { + "epoch": 0.0528763219080477, + "grad_norm": 5.492830276489258, + "learning_rate": 1.0575000000000001e-05, + "loss": 1.0992, + "step": 4230 + }, + { + "epoch": 0.05290132253306332, + "grad_norm": 7.225461483001709, + "learning_rate": 1.0580000000000002e-05, + "loss": 1.0262, + "step": 4232 + }, + { + "epoch": 0.05292632315807895, + "grad_norm": 5.6383256912231445, + "learning_rate": 1.0585000000000001e-05, + "loss": 0.3991, + "step": 4234 + }, + { + "epoch": 0.05295132378309458, + "grad_norm": 3.8595385551452637, + "learning_rate": 1.059e-05, + "loss": 1.5268, + "step": 4236 + }, + { + "epoch": 0.052976324408110204, + "grad_norm": 3.1857478618621826, + "learning_rate": 1.0595000000000003e-05, + "loss": 0.9425, + "step": 4238 + }, + { + "epoch": 0.05300132503312583, + "grad_norm": 4.104739189147949, + "learning_rate": 1.0600000000000002e-05, + "loss": 1.5917, + "step": 4240 + }, + { + "epoch": 0.05302632565814145, + "grad_norm": 2.320244550704956, + "learning_rate": 1.0605000000000001e-05, + "loss": 0.1585, + "step": 4242 + }, + { + "epoch": 0.05305132628315708, + "grad_norm": 3.854013681411743, + "learning_rate": 1.061e-05, + "loss": 2.3487, + "step": 4244 + }, + { + "epoch": 0.053076326908172704, + "grad_norm": 7.800434589385986, + "learning_rate": 1.0615000000000003e-05, + "loss": 1.3052, + "step": 4246 + }, + { + "epoch": 0.05310132753318833, + "grad_norm": 2.734036445617676, + "learning_rate": 1.0620000000000002e-05, + "loss": 0.9638, + "step": 4248 + }, + { + "epoch": 0.05312632815820396, + "grad_norm": 1.8221458196640015, + "learning_rate": 1.0625e-05, + "loss": 0.8095, + "step": 4250 + }, + { + "epoch": 0.05315132878321958, + "grad_norm": 0.5922107696533203, + "learning_rate": 1.063e-05, + "loss": 0.7191, + "step": 4252 + }, + { + "epoch": 0.053176329408235204, + "grad_norm": 6.208261013031006, + "learning_rate": 1.0634999999999999e-05, + "loss": 0.7274, + "step": 4254 + }, + { + "epoch": 0.05320133003325083, + "grad_norm": 4.171011447906494, + "learning_rate": 1.0640000000000001e-05, + "loss": 0.8125, + "step": 4256 + }, + { + "epoch": 0.05322633065826646, + "grad_norm": 3.4422311782836914, + "learning_rate": 1.0645e-05, + "loss": 0.7533, + "step": 4258 + }, + { + "epoch": 0.053251331283282084, + "grad_norm": 4.833932399749756, + "learning_rate": 1.065e-05, + "loss": 0.6798, + "step": 4260 + }, + { + "epoch": 0.053276331908297704, + "grad_norm": 2.8368492126464844, + "learning_rate": 1.0655e-05, + "loss": 1.1207, + "step": 4262 + }, + { + "epoch": 0.05330133253331333, + "grad_norm": 0.015488135628402233, + "learning_rate": 1.0660000000000001e-05, + "loss": 0.2177, + "step": 4264 + }, + { + "epoch": 0.05332633315832896, + "grad_norm": 6.211699485778809, + "learning_rate": 1.0665e-05, + "loss": 1.7464, + "step": 4266 + }, + { + "epoch": 0.053351333783344584, + "grad_norm": 0.00998991634696722, + "learning_rate": 1.0670000000000001e-05, + "loss": 0.0002, + "step": 4268 + }, + { + "epoch": 0.05337633440836021, + "grad_norm": 4.455796718597412, + "learning_rate": 1.0675e-05, + "loss": 0.9388, + "step": 4270 + }, + { + "epoch": 0.05340133503337583, + "grad_norm": 3.895749807357788, + "learning_rate": 1.0680000000000001e-05, + "loss": 1.0197, + "step": 4272 + }, + { + "epoch": 0.05342633565839146, + "grad_norm": 2.4777777194976807, + "learning_rate": 1.0685e-05, + "loss": 1.229, + "step": 4274 + }, + { + "epoch": 0.053451336283407085, + "grad_norm": 3.118570566177368, + "learning_rate": 1.0690000000000001e-05, + "loss": 1.1422, + "step": 4276 + }, + { + "epoch": 0.05347633690842271, + "grad_norm": 4.788069248199463, + "learning_rate": 1.0695e-05, + "loss": 0.9264, + "step": 4278 + }, + { + "epoch": 0.05350133753343834, + "grad_norm": 4.7682013511657715, + "learning_rate": 1.0700000000000001e-05, + "loss": 0.7739, + "step": 4280 + }, + { + "epoch": 0.05352633815845396, + "grad_norm": 4.33887243270874, + "learning_rate": 1.0705000000000002e-05, + "loss": 1.273, + "step": 4282 + }, + { + "epoch": 0.053551338783469585, + "grad_norm": 5.2668633460998535, + "learning_rate": 1.071e-05, + "loss": 0.9237, + "step": 4284 + }, + { + "epoch": 0.05357633940848521, + "grad_norm": 4.507604598999023, + "learning_rate": 1.0715e-05, + "loss": 1.4513, + "step": 4286 + }, + { + "epoch": 0.05360134003350084, + "grad_norm": 3.8806493282318115, + "learning_rate": 1.072e-05, + "loss": 1.2943, + "step": 4288 + }, + { + "epoch": 0.053626340658516465, + "grad_norm": 3.6713004112243652, + "learning_rate": 1.0725000000000001e-05, + "loss": 0.5122, + "step": 4290 + }, + { + "epoch": 0.053651341283532085, + "grad_norm": 5.98764181137085, + "learning_rate": 1.073e-05, + "loss": 0.4485, + "step": 4292 + }, + { + "epoch": 0.05367634190854771, + "grad_norm": 4.56107234954834, + "learning_rate": 1.0735e-05, + "loss": 0.7186, + "step": 4294 + }, + { + "epoch": 0.05370134253356334, + "grad_norm": 7.016373634338379, + "learning_rate": 1.0740000000000002e-05, + "loss": 2.2832, + "step": 4296 + }, + { + "epoch": 0.053726343158578965, + "grad_norm": 4.287487030029297, + "learning_rate": 1.0745000000000001e-05, + "loss": 1.7016, + "step": 4298 + }, + { + "epoch": 0.05375134378359459, + "grad_norm": 0.018393199890851974, + "learning_rate": 1.075e-05, + "loss": 0.4379, + "step": 4300 + }, + { + "epoch": 0.05377634440861021, + "grad_norm": 0.008583509363234043, + "learning_rate": 1.0755e-05, + "loss": 0.2898, + "step": 4302 + }, + { + "epoch": 0.05380134503362584, + "grad_norm": 0.5836949944496155, + "learning_rate": 1.0760000000000002e-05, + "loss": 0.7903, + "step": 4304 + }, + { + "epoch": 0.053826345658641465, + "grad_norm": 6.5625505447387695, + "learning_rate": 1.0765000000000001e-05, + "loss": 1.5732, + "step": 4306 + }, + { + "epoch": 0.05385134628365709, + "grad_norm": 4.426264762878418, + "learning_rate": 1.077e-05, + "loss": 1.0429, + "step": 4308 + }, + { + "epoch": 0.05387634690867272, + "grad_norm": 3.7584240436553955, + "learning_rate": 1.0775e-05, + "loss": 0.7473, + "step": 4310 + }, + { + "epoch": 0.05390134753368834, + "grad_norm": 6.758757591247559, + "learning_rate": 1.0780000000000002e-05, + "loss": 2.1162, + "step": 4312 + }, + { + "epoch": 0.053926348158703966, + "grad_norm": 3.7644619941711426, + "learning_rate": 1.0785000000000001e-05, + "loss": 0.9403, + "step": 4314 + }, + { + "epoch": 0.05395134878371959, + "grad_norm": 3.7855658531188965, + "learning_rate": 1.079e-05, + "loss": 1.2715, + "step": 4316 + }, + { + "epoch": 0.05397634940873522, + "grad_norm": 3.167618989944458, + "learning_rate": 1.0794999999999999e-05, + "loss": 1.682, + "step": 4318 + }, + { + "epoch": 0.054001350033750846, + "grad_norm": 0.09029392153024673, + "learning_rate": 1.0800000000000002e-05, + "loss": 0.1324, + "step": 4320 + }, + { + "epoch": 0.054026350658766466, + "grad_norm": 3.2998151779174805, + "learning_rate": 1.0805e-05, + "loss": 0.8363, + "step": 4322 + }, + { + "epoch": 0.05405135128378209, + "grad_norm": 7.820522308349609, + "learning_rate": 1.081e-05, + "loss": 0.3257, + "step": 4324 + }, + { + "epoch": 0.05407635190879772, + "grad_norm": 4.737775802612305, + "learning_rate": 1.0815e-05, + "loss": 1.1051, + "step": 4326 + }, + { + "epoch": 0.054101352533813346, + "grad_norm": 4.770135402679443, + "learning_rate": 1.0820000000000001e-05, + "loss": 1.6707, + "step": 4328 + }, + { + "epoch": 0.05412635315882897, + "grad_norm": 4.363837242126465, + "learning_rate": 1.0825e-05, + "loss": 1.2051, + "step": 4330 + }, + { + "epoch": 0.05415135378384459, + "grad_norm": 0.011559689417481422, + "learning_rate": 1.0830000000000001e-05, + "loss": 0.0002, + "step": 4332 + }, + { + "epoch": 0.05417635440886022, + "grad_norm": 10.534762382507324, + "learning_rate": 1.0835e-05, + "loss": 1.9475, + "step": 4334 + }, + { + "epoch": 0.054201355033875846, + "grad_norm": 4.951352119445801, + "learning_rate": 1.0840000000000001e-05, + "loss": 2.14, + "step": 4336 + }, + { + "epoch": 0.05422635565889147, + "grad_norm": 5.203623294830322, + "learning_rate": 1.0845e-05, + "loss": 1.3064, + "step": 4338 + }, + { + "epoch": 0.0542513562839071, + "grad_norm": 2.892613649368286, + "learning_rate": 1.0850000000000001e-05, + "loss": 1.1496, + "step": 4340 + }, + { + "epoch": 0.05427635690892272, + "grad_norm": 5.739965915679932, + "learning_rate": 1.0855e-05, + "loss": 0.7908, + "step": 4342 + }, + { + "epoch": 0.054301357533938346, + "grad_norm": 4.7814459800720215, + "learning_rate": 1.0860000000000001e-05, + "loss": 1.5147, + "step": 4344 + }, + { + "epoch": 0.05432635815895397, + "grad_norm": 6.22118616104126, + "learning_rate": 1.0865000000000002e-05, + "loss": 0.4936, + "step": 4346 + }, + { + "epoch": 0.0543513587839696, + "grad_norm": 7.698059558868408, + "learning_rate": 1.0870000000000001e-05, + "loss": 1.7131, + "step": 4348 + }, + { + "epoch": 0.05437635940898523, + "grad_norm": 3.644254207611084, + "learning_rate": 1.0875e-05, + "loss": 1.3024, + "step": 4350 + }, + { + "epoch": 0.05440136003400085, + "grad_norm": 4.089870452880859, + "learning_rate": 1.0880000000000001e-05, + "loss": 1.5633, + "step": 4352 + }, + { + "epoch": 0.05442636065901647, + "grad_norm": 6.0375237464904785, + "learning_rate": 1.0885000000000002e-05, + "loss": 1.2613, + "step": 4354 + }, + { + "epoch": 0.0544513612840321, + "grad_norm": 4.358163356781006, + "learning_rate": 1.089e-05, + "loss": 0.8283, + "step": 4356 + }, + { + "epoch": 0.05447636190904773, + "grad_norm": 0.00693094776943326, + "learning_rate": 1.0895e-05, + "loss": 0.0001, + "step": 4358 + }, + { + "epoch": 0.054501362534063354, + "grad_norm": 6.676548004150391, + "learning_rate": 1.0900000000000002e-05, + "loss": 1.7535, + "step": 4360 + }, + { + "epoch": 0.054526363159078974, + "grad_norm": 8.45041561126709, + "learning_rate": 1.0905000000000001e-05, + "loss": 1.5165, + "step": 4362 + }, + { + "epoch": 0.0545513637840946, + "grad_norm": 2.408940076828003, + "learning_rate": 1.091e-05, + "loss": 0.9522, + "step": 4364 + }, + { + "epoch": 0.05457636440911023, + "grad_norm": 0.009636417962610722, + "learning_rate": 1.0915e-05, + "loss": 0.0645, + "step": 4366 + }, + { + "epoch": 0.054601365034125854, + "grad_norm": 7.04083251953125, + "learning_rate": 1.0920000000000002e-05, + "loss": 0.935, + "step": 4368 + }, + { + "epoch": 0.05462636565914148, + "grad_norm": 4.451783657073975, + "learning_rate": 1.0925000000000001e-05, + "loss": 0.7303, + "step": 4370 + }, + { + "epoch": 0.0546513662841571, + "grad_norm": 4.102904319763184, + "learning_rate": 1.093e-05, + "loss": 1.1869, + "step": 4372 + }, + { + "epoch": 0.05467636690917273, + "grad_norm": 4.117753982543945, + "learning_rate": 1.0935e-05, + "loss": 0.3486, + "step": 4374 + }, + { + "epoch": 0.054701367534188354, + "grad_norm": 6.005593776702881, + "learning_rate": 1.0940000000000002e-05, + "loss": 1.1848, + "step": 4376 + }, + { + "epoch": 0.05472636815920398, + "grad_norm": 5.0356059074401855, + "learning_rate": 1.0945000000000001e-05, + "loss": 1.4891, + "step": 4378 + }, + { + "epoch": 0.05475136878421961, + "grad_norm": 3.6145169734954834, + "learning_rate": 1.095e-05, + "loss": 0.8319, + "step": 4380 + }, + { + "epoch": 0.054776369409235234, + "grad_norm": 4.355207443237305, + "learning_rate": 1.0955e-05, + "loss": 1.5479, + "step": 4382 + }, + { + "epoch": 0.054801370034250854, + "grad_norm": 11.534209251403809, + "learning_rate": 1.0960000000000002e-05, + "loss": 0.7902, + "step": 4384 + }, + { + "epoch": 0.05482637065926648, + "grad_norm": 3.270833969116211, + "learning_rate": 1.0965000000000001e-05, + "loss": 1.2287, + "step": 4386 + }, + { + "epoch": 0.05485137128428211, + "grad_norm": 0.32896289229393005, + "learning_rate": 1.097e-05, + "loss": 0.036, + "step": 4388 + }, + { + "epoch": 0.054876371909297735, + "grad_norm": 2.94071102142334, + "learning_rate": 1.0975e-05, + "loss": 0.4049, + "step": 4390 + }, + { + "epoch": 0.05490137253431336, + "grad_norm": 5.321509838104248, + "learning_rate": 1.0980000000000002e-05, + "loss": 1.1978, + "step": 4392 + }, + { + "epoch": 0.05492637315932898, + "grad_norm": 7.058454513549805, + "learning_rate": 1.0985e-05, + "loss": 1.8215, + "step": 4394 + }, + { + "epoch": 0.05495137378434461, + "grad_norm": 5.449026584625244, + "learning_rate": 1.0990000000000002e-05, + "loss": 2.3742, + "step": 4396 + }, + { + "epoch": 0.054976374409360235, + "grad_norm": 4.563514709472656, + "learning_rate": 1.0995e-05, + "loss": 0.9763, + "step": 4398 + }, + { + "epoch": 0.05500137503437586, + "grad_norm": 3.3131420612335205, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1871, + "step": 4400 + }, + { + "epoch": 0.05502637565939149, + "grad_norm": 4.581338882446289, + "learning_rate": 1.1005e-05, + "loss": 1.1456, + "step": 4402 + }, + { + "epoch": 0.05505137628440711, + "grad_norm": 4.1118597984313965, + "learning_rate": 1.1010000000000001e-05, + "loss": 1.6534, + "step": 4404 + }, + { + "epoch": 0.055076376909422735, + "grad_norm": 4.213473796844482, + "learning_rate": 1.1015e-05, + "loss": 0.8061, + "step": 4406 + }, + { + "epoch": 0.05510137753443836, + "grad_norm": 0.004527157172560692, + "learning_rate": 1.1020000000000001e-05, + "loss": 0.1046, + "step": 4408 + }, + { + "epoch": 0.05512637815945399, + "grad_norm": 1.5233052968978882, + "learning_rate": 1.1025000000000002e-05, + "loss": 0.2915, + "step": 4410 + }, + { + "epoch": 0.055151378784469615, + "grad_norm": 0.005988818593323231, + "learning_rate": 1.1030000000000001e-05, + "loss": 0.928, + "step": 4412 + }, + { + "epoch": 0.055176379409485235, + "grad_norm": 8.378430366516113, + "learning_rate": 1.1035e-05, + "loss": 0.7227, + "step": 4414 + }, + { + "epoch": 0.05520138003450086, + "grad_norm": 0.01935519650578499, + "learning_rate": 1.1040000000000001e-05, + "loss": 0.6321, + "step": 4416 + }, + { + "epoch": 0.05522638065951649, + "grad_norm": 0.0053795031271874905, + "learning_rate": 1.1045000000000002e-05, + "loss": 0.2208, + "step": 4418 + }, + { + "epoch": 0.055251381284532115, + "grad_norm": 5.001326084136963, + "learning_rate": 1.1050000000000001e-05, + "loss": 1.1628, + "step": 4420 + }, + { + "epoch": 0.05527638190954774, + "grad_norm": 8.33295726776123, + "learning_rate": 1.1055e-05, + "loss": 1.2861, + "step": 4422 + }, + { + "epoch": 0.05530138253456336, + "grad_norm": 0.011922050267457962, + "learning_rate": 1.1060000000000003e-05, + "loss": 0.7361, + "step": 4424 + }, + { + "epoch": 0.05532638315957899, + "grad_norm": 29.169069290161133, + "learning_rate": 1.1065000000000002e-05, + "loss": 1.6976, + "step": 4426 + }, + { + "epoch": 0.055351383784594616, + "grad_norm": 2.5972554683685303, + "learning_rate": 1.107e-05, + "loss": 0.6632, + "step": 4428 + }, + { + "epoch": 0.05537638440961024, + "grad_norm": 5.489377021789551, + "learning_rate": 1.1075e-05, + "loss": 0.692, + "step": 4430 + }, + { + "epoch": 0.05540138503462587, + "grad_norm": 2.7091193199157715, + "learning_rate": 1.1080000000000002e-05, + "loss": 0.7662, + "step": 4432 + }, + { + "epoch": 0.05542638565964149, + "grad_norm": 3.9820644855499268, + "learning_rate": 1.1085000000000001e-05, + "loss": 0.9812, + "step": 4434 + }, + { + "epoch": 0.055451386284657116, + "grad_norm": 4.775511741638184, + "learning_rate": 1.109e-05, + "loss": 0.8177, + "step": 4436 + }, + { + "epoch": 0.05547638690967274, + "grad_norm": 4.969636917114258, + "learning_rate": 1.1095e-05, + "loss": 1.9685, + "step": 4438 + }, + { + "epoch": 0.05550138753468837, + "grad_norm": 3.624345302581787, + "learning_rate": 1.1100000000000002e-05, + "loss": 1.2305, + "step": 4440 + }, + { + "epoch": 0.055526388159703996, + "grad_norm": 4.166261196136475, + "learning_rate": 1.1105000000000001e-05, + "loss": 1.0465, + "step": 4442 + }, + { + "epoch": 0.055551388784719616, + "grad_norm": 4.18754768371582, + "learning_rate": 1.111e-05, + "loss": 0.9986, + "step": 4444 + }, + { + "epoch": 0.05557638940973524, + "grad_norm": 3.1474356651306152, + "learning_rate": 1.1115e-05, + "loss": 0.1673, + "step": 4446 + }, + { + "epoch": 0.05560139003475087, + "grad_norm": 3.498842477798462, + "learning_rate": 1.1120000000000002e-05, + "loss": 1.4368, + "step": 4448 + }, + { + "epoch": 0.055626390659766496, + "grad_norm": 3.9852523803710938, + "learning_rate": 1.1125000000000001e-05, + "loss": 1.3905, + "step": 4450 + }, + { + "epoch": 0.05565139128478212, + "grad_norm": 3.704803228378296, + "learning_rate": 1.113e-05, + "loss": 1.0802, + "step": 4452 + }, + { + "epoch": 0.05567639190979774, + "grad_norm": 0.043415993452072144, + "learning_rate": 1.1135000000000001e-05, + "loss": 1.2176, + "step": 4454 + }, + { + "epoch": 0.05570139253481337, + "grad_norm": 0.4572738707065582, + "learning_rate": 1.1140000000000002e-05, + "loss": 0.0368, + "step": 4456 + }, + { + "epoch": 0.055726393159828996, + "grad_norm": 5.274864673614502, + "learning_rate": 1.1145000000000001e-05, + "loss": 1.3582, + "step": 4458 + }, + { + "epoch": 0.05575139378484462, + "grad_norm": 3.435398817062378, + "learning_rate": 1.1150000000000002e-05, + "loss": 1.0468, + "step": 4460 + }, + { + "epoch": 0.05577639440986025, + "grad_norm": 0.071133092045784, + "learning_rate": 1.1155e-05, + "loss": 0.6606, + "step": 4462 + }, + { + "epoch": 0.05580139503487587, + "grad_norm": 4.280425548553467, + "learning_rate": 1.1160000000000002e-05, + "loss": 1.2323, + "step": 4464 + }, + { + "epoch": 0.055826395659891496, + "grad_norm": 4.512796401977539, + "learning_rate": 1.1165e-05, + "loss": 1.6286, + "step": 4466 + }, + { + "epoch": 0.05585139628490712, + "grad_norm": 3.548604965209961, + "learning_rate": 1.1170000000000001e-05, + "loss": 0.8768, + "step": 4468 + }, + { + "epoch": 0.05587639690992275, + "grad_norm": 3.215566635131836, + "learning_rate": 1.1175e-05, + "loss": 0.1762, + "step": 4470 + }, + { + "epoch": 0.05590139753493838, + "grad_norm": 6.749237537384033, + "learning_rate": 1.1180000000000001e-05, + "loss": 1.0516, + "step": 4472 + }, + { + "epoch": 0.055926398159954, + "grad_norm": 8.638649940490723, + "learning_rate": 1.1185000000000002e-05, + "loss": 0.1306, + "step": 4474 + }, + { + "epoch": 0.05595139878496962, + "grad_norm": 4.1232523918151855, + "learning_rate": 1.1190000000000001e-05, + "loss": 0.1861, + "step": 4476 + }, + { + "epoch": 0.05597639940998525, + "grad_norm": 5.752701759338379, + "learning_rate": 1.1195e-05, + "loss": 1.1847, + "step": 4478 + }, + { + "epoch": 0.05600140003500088, + "grad_norm": 0.007523755542933941, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.1808, + "step": 4480 + }, + { + "epoch": 0.056026400660016504, + "grad_norm": 1.776896357536316, + "learning_rate": 1.1205000000000002e-05, + "loss": 0.5689, + "step": 4482 + }, + { + "epoch": 0.056051401285032124, + "grad_norm": 7.193055629730225, + "learning_rate": 1.1210000000000001e-05, + "loss": 0.1971, + "step": 4484 + }, + { + "epoch": 0.05607640191004775, + "grad_norm": 4.503664970397949, + "learning_rate": 1.1215e-05, + "loss": 1.9376, + "step": 4486 + }, + { + "epoch": 0.05610140253506338, + "grad_norm": 6.743425369262695, + "learning_rate": 1.1220000000000003e-05, + "loss": 1.3184, + "step": 4488 + }, + { + "epoch": 0.056126403160079004, + "grad_norm": 5.6398725509643555, + "learning_rate": 1.1225000000000002e-05, + "loss": 0.3997, + "step": 4490 + }, + { + "epoch": 0.05615140378509463, + "grad_norm": 6.8374505043029785, + "learning_rate": 1.1230000000000001e-05, + "loss": 0.3128, + "step": 4492 + }, + { + "epoch": 0.05617640441011025, + "grad_norm": 5.151124477386475, + "learning_rate": 1.1235e-05, + "loss": 1.2166, + "step": 4494 + }, + { + "epoch": 0.05620140503512588, + "grad_norm": 4.697113037109375, + "learning_rate": 1.1240000000000002e-05, + "loss": 0.7315, + "step": 4496 + }, + { + "epoch": 0.056226405660141504, + "grad_norm": 3.4984757900238037, + "learning_rate": 1.1245000000000002e-05, + "loss": 1.0155, + "step": 4498 + }, + { + "epoch": 0.05625140628515713, + "grad_norm": 4.928707122802734, + "learning_rate": 1.125e-05, + "loss": 1.2462, + "step": 4500 + }, + { + "epoch": 0.05627640691017276, + "grad_norm": 15.848565101623535, + "learning_rate": 1.1255e-05, + "loss": 1.6404, + "step": 4502 + }, + { + "epoch": 0.05630140753518838, + "grad_norm": 3.935544490814209, + "learning_rate": 1.126e-05, + "loss": 0.6266, + "step": 4504 + }, + { + "epoch": 0.056326408160204004, + "grad_norm": 14.15475082397461, + "learning_rate": 1.1265000000000001e-05, + "loss": 0.398, + "step": 4506 + }, + { + "epoch": 0.05635140878521963, + "grad_norm": 2.2898645401000977, + "learning_rate": 1.127e-05, + "loss": 0.1862, + "step": 4508 + }, + { + "epoch": 0.05637640941023526, + "grad_norm": 4.356060981750488, + "learning_rate": 1.1275e-05, + "loss": 2.5912, + "step": 4510 + }, + { + "epoch": 0.056401410035250885, + "grad_norm": 8.769369125366211, + "learning_rate": 1.128e-05, + "loss": 1.4014, + "step": 4512 + }, + { + "epoch": 0.056426410660266504, + "grad_norm": 0.00460837734863162, + "learning_rate": 1.1285000000000001e-05, + "loss": 0.0001, + "step": 4514 + }, + { + "epoch": 0.05645141128528213, + "grad_norm": 2.882509708404541, + "learning_rate": 1.129e-05, + "loss": 0.0986, + "step": 4516 + }, + { + "epoch": 0.05647641191029776, + "grad_norm": 3.9332492351531982, + "learning_rate": 1.1295000000000001e-05, + "loss": 0.7024, + "step": 4518 + }, + { + "epoch": 0.056501412535313385, + "grad_norm": 0.8339253067970276, + "learning_rate": 1.13e-05, + "loss": 0.7507, + "step": 4520 + }, + { + "epoch": 0.05652641316032901, + "grad_norm": 4.545960426330566, + "learning_rate": 1.1305000000000001e-05, + "loss": 1.0845, + "step": 4522 + }, + { + "epoch": 0.05655141378534463, + "grad_norm": 3.048832416534424, + "learning_rate": 1.131e-05, + "loss": 0.6467, + "step": 4524 + }, + { + "epoch": 0.05657641441036026, + "grad_norm": 1.9592115879058838, + "learning_rate": 1.1315000000000001e-05, + "loss": 0.5769, + "step": 4526 + }, + { + "epoch": 0.056601415035375885, + "grad_norm": 9.037142753601074, + "learning_rate": 1.132e-05, + "loss": 1.012, + "step": 4528 + }, + { + "epoch": 0.05662641566039151, + "grad_norm": 0.008064623922109604, + "learning_rate": 1.1325e-05, + "loss": 0.1129, + "step": 4530 + }, + { + "epoch": 0.05665141628540714, + "grad_norm": 0.010762929916381836, + "learning_rate": 1.1330000000000002e-05, + "loss": 0.3638, + "step": 4532 + }, + { + "epoch": 0.05667641691042276, + "grad_norm": 3.860914945602417, + "learning_rate": 1.1335e-05, + "loss": 0.9398, + "step": 4534 + }, + { + "epoch": 0.056701417535438385, + "grad_norm": 9.301783561706543, + "learning_rate": 1.134e-05, + "loss": 1.7513, + "step": 4536 + }, + { + "epoch": 0.05672641816045401, + "grad_norm": 6.558403491973877, + "learning_rate": 1.1345000000000002e-05, + "loss": 1.2477, + "step": 4538 + }, + { + "epoch": 0.05675141878546964, + "grad_norm": 10.047638893127441, + "learning_rate": 1.1350000000000001e-05, + "loss": 0.7676, + "step": 4540 + }, + { + "epoch": 0.056776419410485265, + "grad_norm": 3.130901336669922, + "learning_rate": 1.1355e-05, + "loss": 0.1134, + "step": 4542 + }, + { + "epoch": 0.056801420035500885, + "grad_norm": 10.091614723205566, + "learning_rate": 1.136e-05, + "loss": 0.8215, + "step": 4544 + }, + { + "epoch": 0.05682642066051651, + "grad_norm": 2.0530052185058594, + "learning_rate": 1.1365000000000002e-05, + "loss": 0.2924, + "step": 4546 + }, + { + "epoch": 0.05685142128553214, + "grad_norm": 5.677149295806885, + "learning_rate": 1.1370000000000001e-05, + "loss": 2.3821, + "step": 4548 + }, + { + "epoch": 0.056876421910547766, + "grad_norm": 3.870633602142334, + "learning_rate": 1.1375e-05, + "loss": 1.1717, + "step": 4550 + }, + { + "epoch": 0.05690142253556339, + "grad_norm": 39.84911346435547, + "learning_rate": 1.138e-05, + "loss": 0.6943, + "step": 4552 + }, + { + "epoch": 0.05692642316057901, + "grad_norm": 5.622470378875732, + "learning_rate": 1.1385000000000002e-05, + "loss": 0.4719, + "step": 4554 + }, + { + "epoch": 0.05695142378559464, + "grad_norm": 4.897730827331543, + "learning_rate": 1.1390000000000001e-05, + "loss": 1.8338, + "step": 4556 + }, + { + "epoch": 0.056976424410610266, + "grad_norm": 7.0723395347595215, + "learning_rate": 1.1395e-05, + "loss": 1.1132, + "step": 4558 + }, + { + "epoch": 0.05700142503562589, + "grad_norm": 0.07312867790460587, + "learning_rate": 1.14e-05, + "loss": 0.0005, + "step": 4560 + }, + { + "epoch": 0.05702642566064152, + "grad_norm": 9.883661270141602, + "learning_rate": 1.1405000000000002e-05, + "loss": 1.0612, + "step": 4562 + }, + { + "epoch": 0.05705142628565714, + "grad_norm": 6.777193546295166, + "learning_rate": 1.1410000000000001e-05, + "loss": 1.2427, + "step": 4564 + }, + { + "epoch": 0.057076426910672766, + "grad_norm": 1.2874574661254883, + "learning_rate": 1.1415e-05, + "loss": 0.5943, + "step": 4566 + }, + { + "epoch": 0.05710142753568839, + "grad_norm": 2.745412826538086, + "learning_rate": 1.142e-05, + "loss": 0.3309, + "step": 4568 + }, + { + "epoch": 0.05712642816070402, + "grad_norm": 2.090791940689087, + "learning_rate": 1.1425000000000002e-05, + "loss": 1.4928, + "step": 4570 + }, + { + "epoch": 0.057151428785719646, + "grad_norm": 0.02975476160645485, + "learning_rate": 1.143e-05, + "loss": 0.4473, + "step": 4572 + }, + { + "epoch": 0.057176429410735266, + "grad_norm": 3.651261568069458, + "learning_rate": 1.1435e-05, + "loss": 0.166, + "step": 4574 + }, + { + "epoch": 0.05720143003575089, + "grad_norm": 5.082508087158203, + "learning_rate": 1.144e-05, + "loss": 0.8077, + "step": 4576 + }, + { + "epoch": 0.05722643066076652, + "grad_norm": 3.0935068130493164, + "learning_rate": 1.1445000000000001e-05, + "loss": 1.747, + "step": 4578 + }, + { + "epoch": 0.057251431285782146, + "grad_norm": 6.204052448272705, + "learning_rate": 1.145e-05, + "loss": 0.7189, + "step": 4580 + }, + { + "epoch": 0.05727643191079777, + "grad_norm": 7.607250690460205, + "learning_rate": 1.1455000000000001e-05, + "loss": 3.6033, + "step": 4582 + }, + { + "epoch": 0.05730143253581339, + "grad_norm": 3.1059045791625977, + "learning_rate": 1.146e-05, + "loss": 0.9373, + "step": 4584 + }, + { + "epoch": 0.05732643316082902, + "grad_norm": 4.862497806549072, + "learning_rate": 1.1465000000000001e-05, + "loss": 0.9784, + "step": 4586 + }, + { + "epoch": 0.05735143378584465, + "grad_norm": 5.805476188659668, + "learning_rate": 1.147e-05, + "loss": 0.6665, + "step": 4588 + }, + { + "epoch": 0.05737643441086027, + "grad_norm": 2.7513580322265625, + "learning_rate": 1.1475000000000001e-05, + "loss": 1.3034, + "step": 4590 + }, + { + "epoch": 0.0574014350358759, + "grad_norm": 4.110196113586426, + "learning_rate": 1.148e-05, + "loss": 0.8897, + "step": 4592 + }, + { + "epoch": 0.05742643566089152, + "grad_norm": 3.8110220432281494, + "learning_rate": 1.1485000000000001e-05, + "loss": 1.582, + "step": 4594 + }, + { + "epoch": 0.05745143628590715, + "grad_norm": 8.532395362854004, + "learning_rate": 1.1490000000000002e-05, + "loss": 0.8029, + "step": 4596 + }, + { + "epoch": 0.057476436910922774, + "grad_norm": 4.930775165557861, + "learning_rate": 1.1495000000000001e-05, + "loss": 1.5172, + "step": 4598 + }, + { + "epoch": 0.0575014375359384, + "grad_norm": 3.767963409423828, + "learning_rate": 1.15e-05, + "loss": 0.9577, + "step": 4600 + }, + { + "epoch": 0.05752643816095403, + "grad_norm": 7.251204967498779, + "learning_rate": 1.1505000000000003e-05, + "loss": 0.9513, + "step": 4602 + }, + { + "epoch": 0.05755143878596965, + "grad_norm": 0.10701148211956024, + "learning_rate": 1.1510000000000002e-05, + "loss": 1.0613, + "step": 4604 + }, + { + "epoch": 0.057576439410985274, + "grad_norm": 3.9385154247283936, + "learning_rate": 1.1515e-05, + "loss": 1.3751, + "step": 4606 + }, + { + "epoch": 0.0576014400360009, + "grad_norm": 15.9191312789917, + "learning_rate": 1.152e-05, + "loss": 2.2394, + "step": 4608 + }, + { + "epoch": 0.05762644066101653, + "grad_norm": 14.963593482971191, + "learning_rate": 1.1525000000000002e-05, + "loss": 1.2255, + "step": 4610 + }, + { + "epoch": 0.057651441286032154, + "grad_norm": 7.072897434234619, + "learning_rate": 1.1530000000000001e-05, + "loss": 1.4429, + "step": 4612 + }, + { + "epoch": 0.057676441911047774, + "grad_norm": 6.15817928314209, + "learning_rate": 1.1535e-05, + "loss": 0.6072, + "step": 4614 + }, + { + "epoch": 0.0577014425360634, + "grad_norm": 2.607461452484131, + "learning_rate": 1.154e-05, + "loss": 0.2472, + "step": 4616 + }, + { + "epoch": 0.05772644316107903, + "grad_norm": 4.609371185302734, + "learning_rate": 1.1545000000000002e-05, + "loss": 1.1033, + "step": 4618 + }, + { + "epoch": 0.057751443786094654, + "grad_norm": 2.643686294555664, + "learning_rate": 1.1550000000000001e-05, + "loss": 1.2974, + "step": 4620 + }, + { + "epoch": 0.05777644441111028, + "grad_norm": 5.414684772491455, + "learning_rate": 1.1555e-05, + "loss": 0.5249, + "step": 4622 + }, + { + "epoch": 0.0578014450361259, + "grad_norm": 4.067637920379639, + "learning_rate": 1.156e-05, + "loss": 1.4866, + "step": 4624 + }, + { + "epoch": 0.05782644566114153, + "grad_norm": 3.3910207748413086, + "learning_rate": 1.1565000000000002e-05, + "loss": 0.8061, + "step": 4626 + }, + { + "epoch": 0.057851446286157154, + "grad_norm": 3.0440878868103027, + "learning_rate": 1.1570000000000001e-05, + "loss": 1.3771, + "step": 4628 + }, + { + "epoch": 0.05787644691117278, + "grad_norm": 3.2960147857666016, + "learning_rate": 1.1575e-05, + "loss": 1.0454, + "step": 4630 + }, + { + "epoch": 0.05790144753618841, + "grad_norm": 2.650787353515625, + "learning_rate": 1.1580000000000001e-05, + "loss": 1.5623, + "step": 4632 + }, + { + "epoch": 0.05792644816120403, + "grad_norm": 3.4648804664611816, + "learning_rate": 1.1585000000000002e-05, + "loss": 1.2438, + "step": 4634 + }, + { + "epoch": 0.057951448786219655, + "grad_norm": 0.7780069708824158, + "learning_rate": 1.159e-05, + "loss": 0.289, + "step": 4636 + }, + { + "epoch": 0.05797644941123528, + "grad_norm": 4.184265613555908, + "learning_rate": 1.1595e-05, + "loss": 0.9614, + "step": 4638 + }, + { + "epoch": 0.05800145003625091, + "grad_norm": 3.3219985961914062, + "learning_rate": 1.16e-05, + "loss": 0.6803, + "step": 4640 + }, + { + "epoch": 0.058026450661266535, + "grad_norm": 6.205493927001953, + "learning_rate": 1.1605000000000002e-05, + "loss": 0.778, + "step": 4642 + }, + { + "epoch": 0.058051451286282155, + "grad_norm": 3.0836048126220703, + "learning_rate": 1.161e-05, + "loss": 1.0471, + "step": 4644 + }, + { + "epoch": 0.05807645191129778, + "grad_norm": 0.09384278208017349, + "learning_rate": 1.1615000000000001e-05, + "loss": 0.07, + "step": 4646 + }, + { + "epoch": 0.05810145253631341, + "grad_norm": 1.5333961248397827, + "learning_rate": 1.162e-05, + "loss": 0.833, + "step": 4648 + }, + { + "epoch": 0.058126453161329035, + "grad_norm": 5.864470958709717, + "learning_rate": 1.1625000000000001e-05, + "loss": 0.794, + "step": 4650 + }, + { + "epoch": 0.05815145378634466, + "grad_norm": 4.763908386230469, + "learning_rate": 1.163e-05, + "loss": 1.0496, + "step": 4652 + }, + { + "epoch": 0.05817645441136028, + "grad_norm": 0.18388552963733673, + "learning_rate": 1.1635000000000001e-05, + "loss": 0.6801, + "step": 4654 + }, + { + "epoch": 0.05820145503637591, + "grad_norm": 5.611164093017578, + "learning_rate": 1.164e-05, + "loss": 1.5616, + "step": 4656 + }, + { + "epoch": 0.058226455661391535, + "grad_norm": 3.987881898880005, + "learning_rate": 1.1645000000000001e-05, + "loss": 1.014, + "step": 4658 + }, + { + "epoch": 0.05825145628640716, + "grad_norm": 1.2853097915649414, + "learning_rate": 1.1650000000000002e-05, + "loss": 0.459, + "step": 4660 + }, + { + "epoch": 0.05827645691142279, + "grad_norm": 5.1278276443481445, + "learning_rate": 1.1655000000000001e-05, + "loss": 0.2418, + "step": 4662 + }, + { + "epoch": 0.05830145753643841, + "grad_norm": 2.8792965412139893, + "learning_rate": 1.166e-05, + "loss": 0.8456, + "step": 4664 + }, + { + "epoch": 0.058326458161454035, + "grad_norm": 2.795562505722046, + "learning_rate": 1.1665000000000003e-05, + "loss": 0.7362, + "step": 4666 + }, + { + "epoch": 0.05835145878646966, + "grad_norm": 2.9465014934539795, + "learning_rate": 1.1670000000000002e-05, + "loss": 0.8441, + "step": 4668 + }, + { + "epoch": 0.05837645941148529, + "grad_norm": 7.110555648803711, + "learning_rate": 1.1675000000000001e-05, + "loss": 1.0714, + "step": 4670 + }, + { + "epoch": 0.058401460036500916, + "grad_norm": 3.764895439147949, + "learning_rate": 1.168e-05, + "loss": 0.1662, + "step": 4672 + }, + { + "epoch": 0.058426460661516535, + "grad_norm": 3.7152750492095947, + "learning_rate": 1.1685000000000002e-05, + "loss": 0.4332, + "step": 4674 + }, + { + "epoch": 0.05845146128653216, + "grad_norm": 0.05515047907829285, + "learning_rate": 1.1690000000000002e-05, + "loss": 0.6137, + "step": 4676 + }, + { + "epoch": 0.05847646191154779, + "grad_norm": 5.580020904541016, + "learning_rate": 1.1695e-05, + "loss": 0.9255, + "step": 4678 + }, + { + "epoch": 0.058501462536563416, + "grad_norm": 1.8970810174942017, + "learning_rate": 1.17e-05, + "loss": 1.0604, + "step": 4680 + }, + { + "epoch": 0.05852646316157904, + "grad_norm": 9.404041290283203, + "learning_rate": 1.1705000000000002e-05, + "loss": 1.5128, + "step": 4682 + }, + { + "epoch": 0.05855146378659466, + "grad_norm": 3.1354923248291016, + "learning_rate": 1.1710000000000001e-05, + "loss": 1.3676, + "step": 4684 + }, + { + "epoch": 0.05857646441161029, + "grad_norm": 12.890023231506348, + "learning_rate": 1.1715e-05, + "loss": 1.8063, + "step": 4686 + }, + { + "epoch": 0.058601465036625916, + "grad_norm": 4.085147380828857, + "learning_rate": 1.172e-05, + "loss": 1.9944, + "step": 4688 + }, + { + "epoch": 0.05862646566164154, + "grad_norm": 3.0967464447021484, + "learning_rate": 1.1725000000000002e-05, + "loss": 0.5604, + "step": 4690 + }, + { + "epoch": 0.05865146628665717, + "grad_norm": 0.022580290213227272, + "learning_rate": 1.1730000000000001e-05, + "loss": 0.0511, + "step": 4692 + }, + { + "epoch": 0.05867646691167279, + "grad_norm": 4.546712875366211, + "learning_rate": 1.1735e-05, + "loss": 1.745, + "step": 4694 + }, + { + "epoch": 0.058701467536688416, + "grad_norm": 4.032757759094238, + "learning_rate": 1.1740000000000001e-05, + "loss": 0.9997, + "step": 4696 + }, + { + "epoch": 0.05872646816170404, + "grad_norm": 3.225719451904297, + "learning_rate": 1.1745000000000002e-05, + "loss": 1.0514, + "step": 4698 + }, + { + "epoch": 0.05875146878671967, + "grad_norm": 5.814773082733154, + "learning_rate": 1.1750000000000001e-05, + "loss": 1.2486, + "step": 4700 + }, + { + "epoch": 0.058776469411735296, + "grad_norm": 0.691635251045227, + "learning_rate": 1.1755e-05, + "loss": 0.0344, + "step": 4702 + }, + { + "epoch": 0.058801470036750916, + "grad_norm": 2.61702823638916, + "learning_rate": 1.1760000000000001e-05, + "loss": 1.2408, + "step": 4704 + }, + { + "epoch": 0.05882647066176654, + "grad_norm": 6.900379180908203, + "learning_rate": 1.1765000000000002e-05, + "loss": 0.6444, + "step": 4706 + }, + { + "epoch": 0.05885147128678217, + "grad_norm": 0.6396856904029846, + "learning_rate": 1.177e-05, + "loss": 0.6357, + "step": 4708 + }, + { + "epoch": 0.0588764719117978, + "grad_norm": 2.6351096630096436, + "learning_rate": 1.1775000000000002e-05, + "loss": 1.0336, + "step": 4710 + }, + { + "epoch": 0.05890147253681342, + "grad_norm": 2.5753464698791504, + "learning_rate": 1.178e-05, + "loss": 1.1918, + "step": 4712 + }, + { + "epoch": 0.05892647316182904, + "grad_norm": 8.220812797546387, + "learning_rate": 1.1785000000000002e-05, + "loss": 1.7647, + "step": 4714 + }, + { + "epoch": 0.05895147378684467, + "grad_norm": 2.2495598793029785, + "learning_rate": 1.179e-05, + "loss": 0.3575, + "step": 4716 + }, + { + "epoch": 0.0589764744118603, + "grad_norm": 3.5990118980407715, + "learning_rate": 1.1795000000000001e-05, + "loss": 1.1961, + "step": 4718 + }, + { + "epoch": 0.059001475036875924, + "grad_norm": 2.641056537628174, + "learning_rate": 1.18e-05, + "loss": 0.196, + "step": 4720 + }, + { + "epoch": 0.05902647566189155, + "grad_norm": 7.888462543487549, + "learning_rate": 1.1805000000000001e-05, + "loss": 0.588, + "step": 4722 + }, + { + "epoch": 0.05905147628690717, + "grad_norm": 4.0829925537109375, + "learning_rate": 1.1810000000000002e-05, + "loss": 1.7426, + "step": 4724 + }, + { + "epoch": 0.0590764769119228, + "grad_norm": 0.02202942781150341, + "learning_rate": 1.1815000000000001e-05, + "loss": 0.3981, + "step": 4726 + }, + { + "epoch": 0.059101477536938424, + "grad_norm": 4.077978134155273, + "learning_rate": 1.182e-05, + "loss": 1.7036, + "step": 4728 + }, + { + "epoch": 0.05912647816195405, + "grad_norm": 3.3544816970825195, + "learning_rate": 1.1825000000000003e-05, + "loss": 1.3482, + "step": 4730 + }, + { + "epoch": 0.05915147878696968, + "grad_norm": 4.639157772064209, + "learning_rate": 1.1830000000000002e-05, + "loss": 1.1479, + "step": 4732 + }, + { + "epoch": 0.0591764794119853, + "grad_norm": 1.6753019094467163, + "learning_rate": 1.1835000000000001e-05, + "loss": 0.8169, + "step": 4734 + }, + { + "epoch": 0.059201480037000924, + "grad_norm": 0.15205904841423035, + "learning_rate": 1.184e-05, + "loss": 1.6752, + "step": 4736 + }, + { + "epoch": 0.05922648066201655, + "grad_norm": 1.3798288106918335, + "learning_rate": 1.1845000000000003e-05, + "loss": 0.4292, + "step": 4738 + }, + { + "epoch": 0.05925148128703218, + "grad_norm": 0.006622948218137026, + "learning_rate": 1.1850000000000002e-05, + "loss": 0.9752, + "step": 4740 + }, + { + "epoch": 0.059276481912047804, + "grad_norm": 8.633159637451172, + "learning_rate": 1.1855e-05, + "loss": 1.4574, + "step": 4742 + }, + { + "epoch": 0.059301482537063424, + "grad_norm": 4.209770202636719, + "learning_rate": 1.186e-05, + "loss": 0.8119, + "step": 4744 + }, + { + "epoch": 0.05932648316207905, + "grad_norm": 4.109338283538818, + "learning_rate": 1.1865000000000002e-05, + "loss": 1.6063, + "step": 4746 + }, + { + "epoch": 0.05935148378709468, + "grad_norm": 1.452834129333496, + "learning_rate": 1.1870000000000002e-05, + "loss": 0.0118, + "step": 4748 + }, + { + "epoch": 0.059376484412110304, + "grad_norm": 3.5280508995056152, + "learning_rate": 1.1875e-05, + "loss": 0.7812, + "step": 4750 + }, + { + "epoch": 0.05940148503712593, + "grad_norm": 4.228314399719238, + "learning_rate": 1.188e-05, + "loss": 0.7978, + "step": 4752 + }, + { + "epoch": 0.05942648566214155, + "grad_norm": 5.449346542358398, + "learning_rate": 1.1885e-05, + "loss": 0.1417, + "step": 4754 + }, + { + "epoch": 0.05945148628715718, + "grad_norm": 3.501904249191284, + "learning_rate": 1.1890000000000001e-05, + "loss": 0.8898, + "step": 4756 + }, + { + "epoch": 0.059476486912172805, + "grad_norm": 7.582802772521973, + "learning_rate": 1.1895e-05, + "loss": 0.917, + "step": 4758 + }, + { + "epoch": 0.05950148753718843, + "grad_norm": 0.5889860987663269, + "learning_rate": 1.1900000000000001e-05, + "loss": 0.2083, + "step": 4760 + }, + { + "epoch": 0.05952648816220406, + "grad_norm": 2.8103673458099365, + "learning_rate": 1.1905e-05, + "loss": 1.3981, + "step": 4762 + }, + { + "epoch": 0.05955148878721968, + "grad_norm": 4.9843926429748535, + "learning_rate": 1.1910000000000001e-05, + "loss": 2.1475, + "step": 4764 + }, + { + "epoch": 0.059576489412235305, + "grad_norm": 8.29718017578125, + "learning_rate": 1.1915e-05, + "loss": 1.128, + "step": 4766 + }, + { + "epoch": 0.05960149003725093, + "grad_norm": 4.142148971557617, + "learning_rate": 1.1920000000000001e-05, + "loss": 1.4008, + "step": 4768 + }, + { + "epoch": 0.05962649066226656, + "grad_norm": 0.008185683749616146, + "learning_rate": 1.1925e-05, + "loss": 0.6591, + "step": 4770 + }, + { + "epoch": 0.059651491287282185, + "grad_norm": 2.7272961139678955, + "learning_rate": 1.1930000000000001e-05, + "loss": 0.5135, + "step": 4772 + }, + { + "epoch": 0.059676491912297805, + "grad_norm": 4.610064506530762, + "learning_rate": 1.1935000000000002e-05, + "loss": 0.9997, + "step": 4774 + }, + { + "epoch": 0.05970149253731343, + "grad_norm": 1.7493804693222046, + "learning_rate": 1.1940000000000001e-05, + "loss": 0.67, + "step": 4776 + }, + { + "epoch": 0.05972649316232906, + "grad_norm": 3.160571813583374, + "learning_rate": 1.1945e-05, + "loss": 0.6109, + "step": 4778 + }, + { + "epoch": 0.059751493787344685, + "grad_norm": 3.6851093769073486, + "learning_rate": 1.195e-05, + "loss": 0.7738, + "step": 4780 + }, + { + "epoch": 0.05977649441236031, + "grad_norm": 6.506088733673096, + "learning_rate": 1.1955000000000002e-05, + "loss": 0.9845, + "step": 4782 + }, + { + "epoch": 0.05980149503737593, + "grad_norm": 23.519601821899414, + "learning_rate": 1.196e-05, + "loss": 1.8589, + "step": 4784 + }, + { + "epoch": 0.05982649566239156, + "grad_norm": 5.791053771972656, + "learning_rate": 1.1965e-05, + "loss": 0.4759, + "step": 4786 + }, + { + "epoch": 0.059851496287407185, + "grad_norm": 28.843006134033203, + "learning_rate": 1.1970000000000002e-05, + "loss": 3.0428, + "step": 4788 + }, + { + "epoch": 0.05987649691242281, + "grad_norm": 5.765805721282959, + "learning_rate": 1.1975000000000001e-05, + "loss": 1.1367, + "step": 4790 + }, + { + "epoch": 0.05990149753743844, + "grad_norm": 5.461038112640381, + "learning_rate": 1.198e-05, + "loss": 0.5866, + "step": 4792 + }, + { + "epoch": 0.05992649816245406, + "grad_norm": 5.597739219665527, + "learning_rate": 1.1985e-05, + "loss": 1.5203, + "step": 4794 + }, + { + "epoch": 0.059951498787469686, + "grad_norm": 0.026279335841536522, + "learning_rate": 1.1990000000000002e-05, + "loss": 1.1988, + "step": 4796 + }, + { + "epoch": 0.05997649941248531, + "grad_norm": 2.3421058654785156, + "learning_rate": 1.1995000000000001e-05, + "loss": 0.9099, + "step": 4798 + }, + { + "epoch": 0.06000150003750094, + "grad_norm": 4.280205249786377, + "learning_rate": 1.2e-05, + "loss": 0.6549, + "step": 4800 + }, + { + "epoch": 0.060026500662516566, + "grad_norm": 0.05943937972187996, + "learning_rate": 1.2005e-05, + "loss": 0.0016, + "step": 4802 + }, + { + "epoch": 0.060051501287532186, + "grad_norm": 5.813172340393066, + "learning_rate": 1.2010000000000002e-05, + "loss": 0.1172, + "step": 4804 + }, + { + "epoch": 0.06007650191254781, + "grad_norm": 4.0125041007995605, + "learning_rate": 1.2015000000000001e-05, + "loss": 0.0998, + "step": 4806 + }, + { + "epoch": 0.06010150253756344, + "grad_norm": 2.719089984893799, + "learning_rate": 1.202e-05, + "loss": 1.0536, + "step": 4808 + }, + { + "epoch": 0.060126503162579066, + "grad_norm": 3.6938974857330322, + "learning_rate": 1.2025e-05, + "loss": 0.9654, + "step": 4810 + }, + { + "epoch": 0.06015150378759469, + "grad_norm": 2.6361966133117676, + "learning_rate": 1.2030000000000002e-05, + "loss": 1.5032, + "step": 4812 + }, + { + "epoch": 0.06017650441261031, + "grad_norm": 2.384601354598999, + "learning_rate": 1.2035e-05, + "loss": 0.6271, + "step": 4814 + }, + { + "epoch": 0.06020150503762594, + "grad_norm": 5.322446346282959, + "learning_rate": 1.204e-05, + "loss": 0.2838, + "step": 4816 + }, + { + "epoch": 0.060226505662641566, + "grad_norm": 3.9606165885925293, + "learning_rate": 1.2045e-05, + "loss": 1.4374, + "step": 4818 + }, + { + "epoch": 0.06025150628765719, + "grad_norm": 6.897012710571289, + "learning_rate": 1.2050000000000002e-05, + "loss": 1.9205, + "step": 4820 + }, + { + "epoch": 0.06027650691267282, + "grad_norm": 5.336450576782227, + "learning_rate": 1.2055e-05, + "loss": 1.3609, + "step": 4822 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 4.26201057434082, + "learning_rate": 1.2060000000000001e-05, + "loss": 1.8433, + "step": 4824 + }, + { + "epoch": 0.060326508162704066, + "grad_norm": 2.9708120822906494, + "learning_rate": 1.2065e-05, + "loss": 1.5041, + "step": 4826 + }, + { + "epoch": 0.06035150878771969, + "grad_norm": 4.639847278594971, + "learning_rate": 1.2070000000000001e-05, + "loss": 1.23, + "step": 4828 + }, + { + "epoch": 0.06037650941273532, + "grad_norm": 6.1622772216796875, + "learning_rate": 1.2075e-05, + "loss": 2.1456, + "step": 4830 + }, + { + "epoch": 0.06040151003775095, + "grad_norm": 3.1917178630828857, + "learning_rate": 1.2080000000000001e-05, + "loss": 1.4859, + "step": 4832 + }, + { + "epoch": 0.06042651066276657, + "grad_norm": 1.6649030447006226, + "learning_rate": 1.2085e-05, + "loss": 0.1386, + "step": 4834 + }, + { + "epoch": 0.06045151128778219, + "grad_norm": 4.898342132568359, + "learning_rate": 1.2090000000000001e-05, + "loss": 1.5898, + "step": 4836 + }, + { + "epoch": 0.06047651191279782, + "grad_norm": 3.4244742393493652, + "learning_rate": 1.2095000000000002e-05, + "loss": 1.7685, + "step": 4838 + }, + { + "epoch": 0.06050151253781345, + "grad_norm": 3.2441580295562744, + "learning_rate": 1.2100000000000001e-05, + "loss": 1.4357, + "step": 4840 + }, + { + "epoch": 0.060526513162829074, + "grad_norm": 5.860594272613525, + "learning_rate": 1.2105e-05, + "loss": 1.6242, + "step": 4842 + }, + { + "epoch": 0.060551513787844694, + "grad_norm": 5.909303665161133, + "learning_rate": 1.2110000000000001e-05, + "loss": 1.697, + "step": 4844 + }, + { + "epoch": 0.06057651441286032, + "grad_norm": 5.704824924468994, + "learning_rate": 1.2115000000000002e-05, + "loss": 3.6954, + "step": 4846 + }, + { + "epoch": 0.06060151503787595, + "grad_norm": 2.5403590202331543, + "learning_rate": 1.2120000000000001e-05, + "loss": 0.5381, + "step": 4848 + }, + { + "epoch": 0.060626515662891574, + "grad_norm": 5.39560079574585, + "learning_rate": 1.2125e-05, + "loss": 2.1328, + "step": 4850 + }, + { + "epoch": 0.0606515162879072, + "grad_norm": 6.146430969238281, + "learning_rate": 1.2130000000000002e-05, + "loss": 0.6647, + "step": 4852 + }, + { + "epoch": 0.06067651691292282, + "grad_norm": 3.7731635570526123, + "learning_rate": 1.2135000000000002e-05, + "loss": 0.6631, + "step": 4854 + }, + { + "epoch": 0.06070151753793845, + "grad_norm": 11.737529754638672, + "learning_rate": 1.214e-05, + "loss": 0.4247, + "step": 4856 + }, + { + "epoch": 0.060726518162954074, + "grad_norm": 13.573443412780762, + "learning_rate": 1.2145e-05, + "loss": 1.8752, + "step": 4858 + }, + { + "epoch": 0.0607515187879697, + "grad_norm": 0.24150203168392181, + "learning_rate": 1.2150000000000002e-05, + "loss": 0.4729, + "step": 4860 + }, + { + "epoch": 0.06077651941298533, + "grad_norm": 1.4366511106491089, + "learning_rate": 1.2155000000000001e-05, + "loss": 0.4983, + "step": 4862 + }, + { + "epoch": 0.06080152003800095, + "grad_norm": 3.67195725440979, + "learning_rate": 1.216e-05, + "loss": 1.6629, + "step": 4864 + }, + { + "epoch": 0.060826520663016574, + "grad_norm": 2.9152183532714844, + "learning_rate": 1.2165e-05, + "loss": 0.57, + "step": 4866 + }, + { + "epoch": 0.0608515212880322, + "grad_norm": 2.877347946166992, + "learning_rate": 1.2170000000000002e-05, + "loss": 0.9778, + "step": 4868 + }, + { + "epoch": 0.06087652191304783, + "grad_norm": 5.8221540451049805, + "learning_rate": 1.2175000000000001e-05, + "loss": 0.4255, + "step": 4870 + }, + { + "epoch": 0.060901522538063455, + "grad_norm": 3.178727149963379, + "learning_rate": 1.218e-05, + "loss": 0.5727, + "step": 4872 + }, + { + "epoch": 0.060926523163079074, + "grad_norm": 5.636492729187012, + "learning_rate": 1.2185e-05, + "loss": 1.2527, + "step": 4874 + }, + { + "epoch": 0.0609515237880947, + "grad_norm": 3.7949647903442383, + "learning_rate": 1.2190000000000002e-05, + "loss": 0.5104, + "step": 4876 + }, + { + "epoch": 0.06097652441311033, + "grad_norm": 2.26055645942688, + "learning_rate": 1.2195000000000001e-05, + "loss": 1.0145, + "step": 4878 + }, + { + "epoch": 0.061001525038125955, + "grad_norm": 4.389135837554932, + "learning_rate": 1.22e-05, + "loss": 1.5884, + "step": 4880 + }, + { + "epoch": 0.06102652566314158, + "grad_norm": 2.214043617248535, + "learning_rate": 1.2205000000000001e-05, + "loss": 1.0809, + "step": 4882 + }, + { + "epoch": 0.0610515262881572, + "grad_norm": 3.285064220428467, + "learning_rate": 1.2210000000000002e-05, + "loss": 0.6649, + "step": 4884 + }, + { + "epoch": 0.06107652691317283, + "grad_norm": 5.381091594696045, + "learning_rate": 1.2215e-05, + "loss": 0.3766, + "step": 4886 + }, + { + "epoch": 0.061101527538188455, + "grad_norm": 1.8489261865615845, + "learning_rate": 1.2220000000000002e-05, + "loss": 0.5566, + "step": 4888 + }, + { + "epoch": 0.06112652816320408, + "grad_norm": 8.008090019226074, + "learning_rate": 1.2225e-05, + "loss": 1.5153, + "step": 4890 + }, + { + "epoch": 0.06115152878821971, + "grad_norm": 2.986175060272217, + "learning_rate": 1.2230000000000001e-05, + "loss": 0.624, + "step": 4892 + }, + { + "epoch": 0.06117652941323533, + "grad_norm": 6.002665996551514, + "learning_rate": 1.2235e-05, + "loss": 1.2961, + "step": 4894 + }, + { + "epoch": 0.061201530038250955, + "grad_norm": 4.431694984436035, + "learning_rate": 1.2240000000000001e-05, + "loss": 2.1538, + "step": 4896 + }, + { + "epoch": 0.06122653066326658, + "grad_norm": 7.731167793273926, + "learning_rate": 1.2245e-05, + "loss": 0.4237, + "step": 4898 + }, + { + "epoch": 0.06125153128828221, + "grad_norm": 4.569139003753662, + "learning_rate": 1.2250000000000001e-05, + "loss": 1.3519, + "step": 4900 + }, + { + "epoch": 0.061276531913297835, + "grad_norm": 8.868415832519531, + "learning_rate": 1.2255000000000002e-05, + "loss": 0.8992, + "step": 4902 + }, + { + "epoch": 0.061301532538313455, + "grad_norm": 3.66621470451355, + "learning_rate": 1.2260000000000001e-05, + "loss": 1.1933, + "step": 4904 + }, + { + "epoch": 0.06132653316332908, + "grad_norm": 6.999749183654785, + "learning_rate": 1.2265e-05, + "loss": 0.2786, + "step": 4906 + }, + { + "epoch": 0.06135153378834471, + "grad_norm": 0.40090084075927734, + "learning_rate": 1.2270000000000001e-05, + "loss": 0.587, + "step": 4908 + }, + { + "epoch": 0.061376534413360335, + "grad_norm": 4.920235633850098, + "learning_rate": 1.2275000000000002e-05, + "loss": 1.6908, + "step": 4910 + }, + { + "epoch": 0.06140153503837596, + "grad_norm": 4.927720069885254, + "learning_rate": 1.2280000000000001e-05, + "loss": 1.5082, + "step": 4912 + }, + { + "epoch": 0.06142653566339158, + "grad_norm": 2.9799416065216064, + "learning_rate": 1.2285e-05, + "loss": 1.6288, + "step": 4914 + }, + { + "epoch": 0.06145153628840721, + "grad_norm": 6.457995891571045, + "learning_rate": 1.2290000000000003e-05, + "loss": 0.9433, + "step": 4916 + }, + { + "epoch": 0.061476536913422836, + "grad_norm": 5.71627950668335, + "learning_rate": 1.2295000000000002e-05, + "loss": 1.3225, + "step": 4918 + }, + { + "epoch": 0.06150153753843846, + "grad_norm": 3.138878107070923, + "learning_rate": 1.23e-05, + "loss": 1.2972, + "step": 4920 + }, + { + "epoch": 0.06152653816345409, + "grad_norm": 3.194976568222046, + "learning_rate": 1.2305e-05, + "loss": 0.3407, + "step": 4922 + }, + { + "epoch": 0.06155153878846971, + "grad_norm": 2.69775652885437, + "learning_rate": 1.2310000000000002e-05, + "loss": 1.0856, + "step": 4924 + }, + { + "epoch": 0.061576539413485336, + "grad_norm": 2.07658314704895, + "learning_rate": 1.2315000000000002e-05, + "loss": 0.5872, + "step": 4926 + }, + { + "epoch": 0.06160154003850096, + "grad_norm": 4.223357200622559, + "learning_rate": 1.232e-05, + "loss": 1.9758, + "step": 4928 + }, + { + "epoch": 0.06162654066351659, + "grad_norm": 4.599226474761963, + "learning_rate": 1.2325e-05, + "loss": 1.968, + "step": 4930 + }, + { + "epoch": 0.061651541288532216, + "grad_norm": 5.579275131225586, + "learning_rate": 1.2330000000000002e-05, + "loss": 1.5716, + "step": 4932 + }, + { + "epoch": 0.061676541913547836, + "grad_norm": 2.319396495819092, + "learning_rate": 1.2335000000000001e-05, + "loss": 0.465, + "step": 4934 + }, + { + "epoch": 0.06170154253856346, + "grad_norm": 2.862675905227661, + "learning_rate": 1.234e-05, + "loss": 1.3194, + "step": 4936 + }, + { + "epoch": 0.06172654316357909, + "grad_norm": 3.056993246078491, + "learning_rate": 1.2345e-05, + "loss": 1.1485, + "step": 4938 + }, + { + "epoch": 0.061751543788594716, + "grad_norm": 0.024366529658436775, + "learning_rate": 1.2350000000000002e-05, + "loss": 0.2581, + "step": 4940 + }, + { + "epoch": 0.06177654441361034, + "grad_norm": 3.0621683597564697, + "learning_rate": 1.2355000000000001e-05, + "loss": 1.3409, + "step": 4942 + }, + { + "epoch": 0.06180154503862596, + "grad_norm": 3.558140277862549, + "learning_rate": 1.236e-05, + "loss": 0.9381, + "step": 4944 + }, + { + "epoch": 0.06182654566364159, + "grad_norm": 1.8321301937103271, + "learning_rate": 1.2365000000000001e-05, + "loss": 0.2595, + "step": 4946 + }, + { + "epoch": 0.061851546288657216, + "grad_norm": 3.6767985820770264, + "learning_rate": 1.2370000000000002e-05, + "loss": 1.3503, + "step": 4948 + }, + { + "epoch": 0.06187654691367284, + "grad_norm": 4.900521755218506, + "learning_rate": 1.2375000000000001e-05, + "loss": 0.9532, + "step": 4950 + }, + { + "epoch": 0.06190154753868847, + "grad_norm": 5.095187664031982, + "learning_rate": 1.2380000000000002e-05, + "loss": 1.9817, + "step": 4952 + }, + { + "epoch": 0.06192654816370409, + "grad_norm": 4.851664066314697, + "learning_rate": 1.2385000000000001e-05, + "loss": 2.0776, + "step": 4954 + }, + { + "epoch": 0.06195154878871972, + "grad_norm": 4.540231704711914, + "learning_rate": 1.2390000000000002e-05, + "loss": 1.2601, + "step": 4956 + }, + { + "epoch": 0.06197654941373534, + "grad_norm": 3.2999610900878906, + "learning_rate": 1.2395e-05, + "loss": 0.9753, + "step": 4958 + }, + { + "epoch": 0.06200155003875097, + "grad_norm": 4.881198406219482, + "learning_rate": 1.2400000000000002e-05, + "loss": 1.2264, + "step": 4960 + }, + { + "epoch": 0.0620265506637666, + "grad_norm": 2.3069663047790527, + "learning_rate": 1.2405e-05, + "loss": 1.0089, + "step": 4962 + }, + { + "epoch": 0.06205155128878222, + "grad_norm": 6.912741661071777, + "learning_rate": 1.2410000000000001e-05, + "loss": 1.0889, + "step": 4964 + }, + { + "epoch": 0.062076551913797844, + "grad_norm": 2.935694456100464, + "learning_rate": 1.2415000000000002e-05, + "loss": 0.7991, + "step": 4966 + }, + { + "epoch": 0.06210155253881347, + "grad_norm": 4.355788707733154, + "learning_rate": 1.2420000000000001e-05, + "loss": 1.0444, + "step": 4968 + }, + { + "epoch": 0.0621265531638291, + "grad_norm": 3.6408138275146484, + "learning_rate": 1.2425e-05, + "loss": 0.8721, + "step": 4970 + }, + { + "epoch": 0.062151553788844724, + "grad_norm": 4.332796096801758, + "learning_rate": 1.2430000000000001e-05, + "loss": 1.4369, + "step": 4972 + }, + { + "epoch": 0.062176554413860344, + "grad_norm": 6.679146766662598, + "learning_rate": 1.2435000000000002e-05, + "loss": 1.6738, + "step": 4974 + }, + { + "epoch": 0.06220155503887597, + "grad_norm": 6.122171401977539, + "learning_rate": 1.2440000000000001e-05, + "loss": 0.547, + "step": 4976 + }, + { + "epoch": 0.0622265556638916, + "grad_norm": 2.637140989303589, + "learning_rate": 1.2445e-05, + "loss": 0.9245, + "step": 4978 + }, + { + "epoch": 0.062251556288907224, + "grad_norm": 3.0191545486450195, + "learning_rate": 1.2450000000000003e-05, + "loss": 0.817, + "step": 4980 + }, + { + "epoch": 0.06227655691392285, + "grad_norm": 4.512000560760498, + "learning_rate": 1.2455000000000002e-05, + "loss": 1.6869, + "step": 4982 + }, + { + "epoch": 0.06230155753893847, + "grad_norm": 5.365052223205566, + "learning_rate": 1.2460000000000001e-05, + "loss": 0.376, + "step": 4984 + }, + { + "epoch": 0.0623265581639541, + "grad_norm": 2.574657678604126, + "learning_rate": 1.2465e-05, + "loss": 0.7578, + "step": 4986 + }, + { + "epoch": 0.062351558788969724, + "grad_norm": 2.59247088432312, + "learning_rate": 1.2470000000000003e-05, + "loss": 1.0489, + "step": 4988 + }, + { + "epoch": 0.06237655941398535, + "grad_norm": 2.4483909606933594, + "learning_rate": 1.2475000000000002e-05, + "loss": 0.331, + "step": 4990 + }, + { + "epoch": 0.06240156003900098, + "grad_norm": 3.2653987407684326, + "learning_rate": 1.248e-05, + "loss": 1.4966, + "step": 4992 + }, + { + "epoch": 0.0624265606640166, + "grad_norm": 6.310256004333496, + "learning_rate": 1.2485e-05, + "loss": 1.7637, + "step": 4994 + }, + { + "epoch": 0.062451561289032224, + "grad_norm": 9.360848426818848, + "learning_rate": 1.2490000000000002e-05, + "loss": 0.9264, + "step": 4996 + }, + { + "epoch": 0.06247656191404785, + "grad_norm": 3.683202028274536, + "learning_rate": 1.2495000000000001e-05, + "loss": 1.3832, + "step": 4998 + }, + { + "epoch": 0.06250156253906347, + "grad_norm": 3.1962876319885254, + "learning_rate": 1.25e-05, + "loss": 1.5406, + "step": 5000 + }, + { + "epoch": 0.0625265631640791, + "grad_norm": 3.0643508434295654, + "learning_rate": 1.2505e-05, + "loss": 0.736, + "step": 5002 + }, + { + "epoch": 0.06255156378909472, + "grad_norm": 0.504494309425354, + "learning_rate": 1.251e-05, + "loss": 0.8539, + "step": 5004 + }, + { + "epoch": 0.06257656441411036, + "grad_norm": 3.64640736579895, + "learning_rate": 1.2515000000000001e-05, + "loss": 0.6114, + "step": 5006 + }, + { + "epoch": 0.06260156503912598, + "grad_norm": 2.363543748855591, + "learning_rate": 1.252e-05, + "loss": 0.3501, + "step": 5008 + }, + { + "epoch": 0.0626265656641416, + "grad_norm": 6.383436679840088, + "learning_rate": 1.2525000000000001e-05, + "loss": 0.6746, + "step": 5010 + }, + { + "epoch": 0.06265156628915723, + "grad_norm": 3.589522361755371, + "learning_rate": 1.253e-05, + "loss": 1.2678, + "step": 5012 + }, + { + "epoch": 0.06267656691417285, + "grad_norm": 1.5025888681411743, + "learning_rate": 1.2535000000000001e-05, + "loss": 0.929, + "step": 5014 + }, + { + "epoch": 0.06270156753918849, + "grad_norm": 1.4463188648223877, + "learning_rate": 1.254e-05, + "loss": 0.6693, + "step": 5016 + }, + { + "epoch": 0.0627265681642041, + "grad_norm": 0.7504470944404602, + "learning_rate": 1.2545000000000001e-05, + "loss": 0.6707, + "step": 5018 + }, + { + "epoch": 0.06275156878921972, + "grad_norm": 3.2669990062713623, + "learning_rate": 1.255e-05, + "loss": 0.8343, + "step": 5020 + }, + { + "epoch": 0.06277656941423536, + "grad_norm": 15.374289512634277, + "learning_rate": 1.2555000000000001e-05, + "loss": 1.9309, + "step": 5022 + }, + { + "epoch": 0.06280157003925098, + "grad_norm": 1.8214046955108643, + "learning_rate": 1.2560000000000002e-05, + "loss": 1.1441, + "step": 5024 + }, + { + "epoch": 0.06282657066426661, + "grad_norm": 2.0772182941436768, + "learning_rate": 1.2565e-05, + "loss": 0.5828, + "step": 5026 + }, + { + "epoch": 0.06285157128928223, + "grad_norm": 2.99397349357605, + "learning_rate": 1.257e-05, + "loss": 1.4547, + "step": 5028 + }, + { + "epoch": 0.06287657191429785, + "grad_norm": 6.113977432250977, + "learning_rate": 1.2575000000000002e-05, + "loss": 1.9195, + "step": 5030 + }, + { + "epoch": 0.06290157253931349, + "grad_norm": 3.3195204734802246, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.8194, + "step": 5032 + }, + { + "epoch": 0.0629265731643291, + "grad_norm": 3.4799609184265137, + "learning_rate": 1.2585e-05, + "loss": 1.1276, + "step": 5034 + }, + { + "epoch": 0.06295157378934474, + "grad_norm": 3.299699544906616, + "learning_rate": 1.259e-05, + "loss": 0.2614, + "step": 5036 + }, + { + "epoch": 0.06297657441436036, + "grad_norm": 0.014058533124625683, + "learning_rate": 1.2595000000000002e-05, + "loss": 0.1691, + "step": 5038 + }, + { + "epoch": 0.06300157503937598, + "grad_norm": 4.461948394775391, + "learning_rate": 1.2600000000000001e-05, + "loss": 1.7636, + "step": 5040 + }, + { + "epoch": 0.06302657566439161, + "grad_norm": 2.206082344055176, + "learning_rate": 1.2605e-05, + "loss": 1.9909, + "step": 5042 + }, + { + "epoch": 0.06305157628940723, + "grad_norm": 4.479926109313965, + "learning_rate": 1.261e-05, + "loss": 0.9533, + "step": 5044 + }, + { + "epoch": 0.06307657691442287, + "grad_norm": 3.8508825302124023, + "learning_rate": 1.2615000000000002e-05, + "loss": 2.1423, + "step": 5046 + }, + { + "epoch": 0.06310157753943849, + "grad_norm": 7.504382133483887, + "learning_rate": 1.2620000000000001e-05, + "loss": 1.0818, + "step": 5048 + }, + { + "epoch": 0.0631265781644541, + "grad_norm": 6.994579315185547, + "learning_rate": 1.2625e-05, + "loss": 0.2263, + "step": 5050 + }, + { + "epoch": 0.06315157878946974, + "grad_norm": 4.189427852630615, + "learning_rate": 1.263e-05, + "loss": 1.7229, + "step": 5052 + }, + { + "epoch": 0.06317657941448536, + "grad_norm": 0.4332062602043152, + "learning_rate": 1.2635000000000002e-05, + "loss": 0.6882, + "step": 5054 + }, + { + "epoch": 0.06320158003950099, + "grad_norm": 10.196738243103027, + "learning_rate": 1.2640000000000001e-05, + "loss": 1.4969, + "step": 5056 + }, + { + "epoch": 0.06322658066451661, + "grad_norm": 4.138776779174805, + "learning_rate": 1.2645e-05, + "loss": 1.503, + "step": 5058 + }, + { + "epoch": 0.06325158128953223, + "grad_norm": 3.6701529026031494, + "learning_rate": 1.2650000000000001e-05, + "loss": 1.1558, + "step": 5060 + }, + { + "epoch": 0.06327658191454787, + "grad_norm": 1.8377426862716675, + "learning_rate": 1.2655000000000002e-05, + "loss": 1.0553, + "step": 5062 + }, + { + "epoch": 0.06330158253956349, + "grad_norm": 3.7303621768951416, + "learning_rate": 1.266e-05, + "loss": 2.1412, + "step": 5064 + }, + { + "epoch": 0.06332658316457912, + "grad_norm": 5.802850246429443, + "learning_rate": 1.2665e-05, + "loss": 0.7724, + "step": 5066 + }, + { + "epoch": 0.06335158378959474, + "grad_norm": 3.585294008255005, + "learning_rate": 1.267e-05, + "loss": 1.1799, + "step": 5068 + }, + { + "epoch": 0.06337658441461036, + "grad_norm": 0.03542392700910568, + "learning_rate": 1.2675000000000001e-05, + "loss": 0.7667, + "step": 5070 + }, + { + "epoch": 0.063401585039626, + "grad_norm": 3.946711778640747, + "learning_rate": 1.268e-05, + "loss": 1.6432, + "step": 5072 + }, + { + "epoch": 0.06342658566464161, + "grad_norm": 3.8171589374542236, + "learning_rate": 1.2685000000000001e-05, + "loss": 0.8515, + "step": 5074 + }, + { + "epoch": 0.06345158628965725, + "grad_norm": 0.009262521751224995, + "learning_rate": 1.269e-05, + "loss": 0.4737, + "step": 5076 + }, + { + "epoch": 0.06347658691467287, + "grad_norm": 0.008038688451051712, + "learning_rate": 1.2695000000000001e-05, + "loss": 0.5067, + "step": 5078 + }, + { + "epoch": 0.06350158753968849, + "grad_norm": 3.8470864295959473, + "learning_rate": 1.27e-05, + "loss": 1.9067, + "step": 5080 + }, + { + "epoch": 0.06352658816470412, + "grad_norm": 2.3321893215179443, + "learning_rate": 1.2705000000000001e-05, + "loss": 0.054, + "step": 5082 + }, + { + "epoch": 0.06355158878971974, + "grad_norm": 11.560994148254395, + "learning_rate": 1.271e-05, + "loss": 1.0931, + "step": 5084 + }, + { + "epoch": 0.06357658941473537, + "grad_norm": 5.223940849304199, + "learning_rate": 1.2715000000000001e-05, + "loss": 0.6913, + "step": 5086 + }, + { + "epoch": 0.063601590039751, + "grad_norm": 0.01419780496507883, + "learning_rate": 1.2720000000000002e-05, + "loss": 0.5966, + "step": 5088 + }, + { + "epoch": 0.06362659066476661, + "grad_norm": 5.391462802886963, + "learning_rate": 1.2725000000000001e-05, + "loss": 0.5593, + "step": 5090 + }, + { + "epoch": 0.06365159128978225, + "grad_norm": 3.3759689331054688, + "learning_rate": 1.273e-05, + "loss": 0.6553, + "step": 5092 + }, + { + "epoch": 0.06367659191479787, + "grad_norm": 0.020210158079862595, + "learning_rate": 1.2735000000000003e-05, + "loss": 0.4507, + "step": 5094 + }, + { + "epoch": 0.0637015925398135, + "grad_norm": 4.283379077911377, + "learning_rate": 1.2740000000000002e-05, + "loss": 0.5634, + "step": 5096 + }, + { + "epoch": 0.06372659316482912, + "grad_norm": 0.006007614079862833, + "learning_rate": 1.2745e-05, + "loss": 0.0001, + "step": 5098 + }, + { + "epoch": 0.06375159378984474, + "grad_norm": 2.151900053024292, + "learning_rate": 1.275e-05, + "loss": 1.178, + "step": 5100 + }, + { + "epoch": 0.06377659441486037, + "grad_norm": 2.121304512023926, + "learning_rate": 1.2755000000000002e-05, + "loss": 0.8648, + "step": 5102 + }, + { + "epoch": 0.063801595039876, + "grad_norm": 3.2889435291290283, + "learning_rate": 1.2760000000000001e-05, + "loss": 0.9286, + "step": 5104 + }, + { + "epoch": 0.06382659566489163, + "grad_norm": 0.004890061914920807, + "learning_rate": 1.2765e-05, + "loss": 1.177, + "step": 5106 + }, + { + "epoch": 0.06385159628990725, + "grad_norm": 4.631579399108887, + "learning_rate": 1.277e-05, + "loss": 1.9626, + "step": 5108 + }, + { + "epoch": 0.06387659691492287, + "grad_norm": 3.538170576095581, + "learning_rate": 1.2775000000000002e-05, + "loss": 1.6541, + "step": 5110 + }, + { + "epoch": 0.0639015975399385, + "grad_norm": 3.8116681575775146, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.5212, + "step": 5112 + }, + { + "epoch": 0.06392659816495412, + "grad_norm": 4.800282001495361, + "learning_rate": 1.2785e-05, + "loss": 1.3499, + "step": 5114 + }, + { + "epoch": 0.06395159878996975, + "grad_norm": 18.40309715270996, + "learning_rate": 1.279e-05, + "loss": 0.3984, + "step": 5116 + }, + { + "epoch": 0.06397659941498537, + "grad_norm": 3.477388381958008, + "learning_rate": 1.2795000000000002e-05, + "loss": 0.6955, + "step": 5118 + }, + { + "epoch": 0.064001600040001, + "grad_norm": 6.2731523513793945, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.633, + "step": 5120 + }, + { + "epoch": 0.06402660066501663, + "grad_norm": 7.2176995277404785, + "learning_rate": 1.2805e-05, + "loss": 2.2948, + "step": 5122 + }, + { + "epoch": 0.06405160129003225, + "grad_norm": 4.198620319366455, + "learning_rate": 1.2810000000000001e-05, + "loss": 1.3901, + "step": 5124 + }, + { + "epoch": 0.06407660191504788, + "grad_norm": 4.057900905609131, + "learning_rate": 1.2815000000000002e-05, + "loss": 1.6423, + "step": 5126 + }, + { + "epoch": 0.0641016025400635, + "grad_norm": 2.848660469055176, + "learning_rate": 1.2820000000000001e-05, + "loss": 0.9171, + "step": 5128 + }, + { + "epoch": 0.06412660316507912, + "grad_norm": 4.3583083152771, + "learning_rate": 1.2825e-05, + "loss": 1.0416, + "step": 5130 + }, + { + "epoch": 0.06415160379009475, + "grad_norm": 3.652839422225952, + "learning_rate": 1.283e-05, + "loss": 1.2163, + "step": 5132 + }, + { + "epoch": 0.06417660441511037, + "grad_norm": 3.3584048748016357, + "learning_rate": 1.2835000000000002e-05, + "loss": 1.2794, + "step": 5134 + }, + { + "epoch": 0.06420160504012601, + "grad_norm": 4.9309797286987305, + "learning_rate": 1.284e-05, + "loss": 0.6977, + "step": 5136 + }, + { + "epoch": 0.06422660566514163, + "grad_norm": 3.8987953662872314, + "learning_rate": 1.2845000000000002e-05, + "loss": 1.4911, + "step": 5138 + }, + { + "epoch": 0.06425160629015725, + "grad_norm": 1.241102933883667, + "learning_rate": 1.285e-05, + "loss": 0.4333, + "step": 5140 + }, + { + "epoch": 0.06427660691517288, + "grad_norm": 3.345367670059204, + "learning_rate": 1.2855000000000001e-05, + "loss": 0.77, + "step": 5142 + }, + { + "epoch": 0.0643016075401885, + "grad_norm": 5.498572826385498, + "learning_rate": 1.286e-05, + "loss": 1.0849, + "step": 5144 + }, + { + "epoch": 0.06432660816520414, + "grad_norm": 4.7769269943237305, + "learning_rate": 1.2865000000000001e-05, + "loss": 1.0044, + "step": 5146 + }, + { + "epoch": 0.06435160879021976, + "grad_norm": 2.937675714492798, + "learning_rate": 1.287e-05, + "loss": 1.5567, + "step": 5148 + }, + { + "epoch": 0.06437660941523538, + "grad_norm": 2.957576274871826, + "learning_rate": 1.2875000000000001e-05, + "loss": 1.3627, + "step": 5150 + }, + { + "epoch": 0.06440161004025101, + "grad_norm": 4.416356563568115, + "learning_rate": 1.2880000000000002e-05, + "loss": 0.6403, + "step": 5152 + }, + { + "epoch": 0.06442661066526663, + "grad_norm": 1.917302131652832, + "learning_rate": 1.2885000000000001e-05, + "loss": 0.9263, + "step": 5154 + }, + { + "epoch": 0.06445161129028226, + "grad_norm": 4.267118453979492, + "learning_rate": 1.289e-05, + "loss": 1.0656, + "step": 5156 + }, + { + "epoch": 0.06447661191529788, + "grad_norm": 0.013868744485080242, + "learning_rate": 1.2895000000000003e-05, + "loss": 0.4661, + "step": 5158 + }, + { + "epoch": 0.0645016125403135, + "grad_norm": 7.12719202041626, + "learning_rate": 1.2900000000000002e-05, + "loss": 0.5335, + "step": 5160 + }, + { + "epoch": 0.06452661316532914, + "grad_norm": 1.8388475179672241, + "learning_rate": 1.2905000000000001e-05, + "loss": 0.7703, + "step": 5162 + }, + { + "epoch": 0.06455161379034476, + "grad_norm": 2.1505939960479736, + "learning_rate": 1.291e-05, + "loss": 0.7735, + "step": 5164 + }, + { + "epoch": 0.06457661441536039, + "grad_norm": 3.2549355030059814, + "learning_rate": 1.2915000000000003e-05, + "loss": 1.5856, + "step": 5166 + }, + { + "epoch": 0.06460161504037601, + "grad_norm": 8.61493968963623, + "learning_rate": 1.2920000000000002e-05, + "loss": 0.7545, + "step": 5168 + }, + { + "epoch": 0.06462661566539163, + "grad_norm": 3.5610713958740234, + "learning_rate": 1.2925e-05, + "loss": 0.696, + "step": 5170 + }, + { + "epoch": 0.06465161629040726, + "grad_norm": 3.668062925338745, + "learning_rate": 1.293e-05, + "loss": 1.7608, + "step": 5172 + }, + { + "epoch": 0.06467661691542288, + "grad_norm": 2.541987180709839, + "learning_rate": 1.2935000000000002e-05, + "loss": 0.45, + "step": 5174 + }, + { + "epoch": 0.06470161754043852, + "grad_norm": 4.46262264251709, + "learning_rate": 1.2940000000000001e-05, + "loss": 0.3768, + "step": 5176 + }, + { + "epoch": 0.06472661816545414, + "grad_norm": 2.6135082244873047, + "learning_rate": 1.2945e-05, + "loss": 1.4663, + "step": 5178 + }, + { + "epoch": 0.06475161879046976, + "grad_norm": 3.0772714614868164, + "learning_rate": 1.295e-05, + "loss": 0.4688, + "step": 5180 + }, + { + "epoch": 0.06477661941548539, + "grad_norm": 5.8925957679748535, + "learning_rate": 1.2955000000000002e-05, + "loss": 2.0232, + "step": 5182 + }, + { + "epoch": 0.06480162004050101, + "grad_norm": 3.4610416889190674, + "learning_rate": 1.2960000000000001e-05, + "loss": 1.1308, + "step": 5184 + }, + { + "epoch": 0.06482662066551664, + "grad_norm": 3.024139881134033, + "learning_rate": 1.2965e-05, + "loss": 1.503, + "step": 5186 + }, + { + "epoch": 0.06485162129053226, + "grad_norm": 5.627942085266113, + "learning_rate": 1.2970000000000001e-05, + "loss": 2.0454, + "step": 5188 + }, + { + "epoch": 0.06487662191554788, + "grad_norm": 0.007647113408893347, + "learning_rate": 1.2975000000000002e-05, + "loss": 0.4532, + "step": 5190 + }, + { + "epoch": 0.06490162254056352, + "grad_norm": 0.28037288784980774, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.9522, + "step": 5192 + }, + { + "epoch": 0.06492662316557914, + "grad_norm": 68.25873565673828, + "learning_rate": 1.2985e-05, + "loss": 1.1591, + "step": 5194 + }, + { + "epoch": 0.06495162379059477, + "grad_norm": 1.5890377759933472, + "learning_rate": 1.2990000000000001e-05, + "loss": 0.8625, + "step": 5196 + }, + { + "epoch": 0.06497662441561039, + "grad_norm": 2.656705856323242, + "learning_rate": 1.2995000000000002e-05, + "loss": 0.4791, + "step": 5198 + }, + { + "epoch": 0.06500162504062601, + "grad_norm": 8.537793159484863, + "learning_rate": 1.3000000000000001e-05, + "loss": 2.4197, + "step": 5200 + }, + { + "epoch": 0.06502662566564164, + "grad_norm": 2.478283405303955, + "learning_rate": 1.3005000000000002e-05, + "loss": 0.703, + "step": 5202 + }, + { + "epoch": 0.06505162629065726, + "grad_norm": 5.803468704223633, + "learning_rate": 1.301e-05, + "loss": 0.6188, + "step": 5204 + }, + { + "epoch": 0.0650766269156729, + "grad_norm": 4.372042655944824, + "learning_rate": 1.3015000000000002e-05, + "loss": 2.0819, + "step": 5206 + }, + { + "epoch": 0.06510162754068852, + "grad_norm": 5.132164478302002, + "learning_rate": 1.302e-05, + "loss": 1.0199, + "step": 5208 + }, + { + "epoch": 0.06512662816570414, + "grad_norm": 5.9358296394348145, + "learning_rate": 1.3025000000000002e-05, + "loss": 0.8414, + "step": 5210 + }, + { + "epoch": 0.06515162879071977, + "grad_norm": 4.3023271560668945, + "learning_rate": 1.303e-05, + "loss": 0.9333, + "step": 5212 + }, + { + "epoch": 0.06517662941573539, + "grad_norm": 2.4768049716949463, + "learning_rate": 1.3035000000000001e-05, + "loss": 0.9344, + "step": 5214 + }, + { + "epoch": 0.06520163004075102, + "grad_norm": 3.4830782413482666, + "learning_rate": 1.3040000000000002e-05, + "loss": 1.3733, + "step": 5216 + }, + { + "epoch": 0.06522663066576664, + "grad_norm": 11.442657470703125, + "learning_rate": 1.3045000000000001e-05, + "loss": 0.96, + "step": 5218 + }, + { + "epoch": 0.06525163129078226, + "grad_norm": 5.024885654449463, + "learning_rate": 1.305e-05, + "loss": 1.7239, + "step": 5220 + }, + { + "epoch": 0.0652766319157979, + "grad_norm": 4.081914901733398, + "learning_rate": 1.3055000000000003e-05, + "loss": 0.996, + "step": 5222 + }, + { + "epoch": 0.06530163254081352, + "grad_norm": 3.498680591583252, + "learning_rate": 1.3060000000000002e-05, + "loss": 0.6429, + "step": 5224 + }, + { + "epoch": 0.06532663316582915, + "grad_norm": 5.712122440338135, + "learning_rate": 1.3065000000000001e-05, + "loss": 1.3851, + "step": 5226 + }, + { + "epoch": 0.06535163379084477, + "grad_norm": 2.4964723587036133, + "learning_rate": 1.307e-05, + "loss": 1.4489, + "step": 5228 + }, + { + "epoch": 0.06537663441586039, + "grad_norm": 6.472644329071045, + "learning_rate": 1.3075000000000003e-05, + "loss": 1.9743, + "step": 5230 + }, + { + "epoch": 0.06540163504087602, + "grad_norm": 3.906491279602051, + "learning_rate": 1.3080000000000002e-05, + "loss": 1.2234, + "step": 5232 + }, + { + "epoch": 0.06542663566589164, + "grad_norm": 12.9578275680542, + "learning_rate": 1.3085000000000001e-05, + "loss": 1.9306, + "step": 5234 + }, + { + "epoch": 0.06545163629090728, + "grad_norm": 2.3714709281921387, + "learning_rate": 1.309e-05, + "loss": 0.5844, + "step": 5236 + }, + { + "epoch": 0.0654766369159229, + "grad_norm": 4.1150336265563965, + "learning_rate": 1.3095000000000003e-05, + "loss": 0.7067, + "step": 5238 + }, + { + "epoch": 0.06550163754093852, + "grad_norm": 0.011923114769160748, + "learning_rate": 1.3100000000000002e-05, + "loss": 0.5747, + "step": 5240 + }, + { + "epoch": 0.06552663816595415, + "grad_norm": 2.3802831172943115, + "learning_rate": 1.3105e-05, + "loss": 0.9391, + "step": 5242 + }, + { + "epoch": 0.06555163879096977, + "grad_norm": 3.2324626445770264, + "learning_rate": 1.311e-05, + "loss": 1.3904, + "step": 5244 + }, + { + "epoch": 0.0655766394159854, + "grad_norm": 2.74312686920166, + "learning_rate": 1.3115000000000002e-05, + "loss": 1.2941, + "step": 5246 + }, + { + "epoch": 0.06560164004100102, + "grad_norm": 3.190629720687866, + "learning_rate": 1.3120000000000001e-05, + "loss": 1.0338, + "step": 5248 + }, + { + "epoch": 0.06562664066601664, + "grad_norm": 23.04907989501953, + "learning_rate": 1.3125e-05, + "loss": 1.2634, + "step": 5250 + }, + { + "epoch": 0.06565164129103228, + "grad_norm": 4.322298049926758, + "learning_rate": 1.3130000000000001e-05, + "loss": 2.1159, + "step": 5252 + }, + { + "epoch": 0.0656766419160479, + "grad_norm": 0.015301809646189213, + "learning_rate": 1.3135e-05, + "loss": 0.5706, + "step": 5254 + }, + { + "epoch": 0.06570164254106353, + "grad_norm": 2.791670560836792, + "learning_rate": 1.3140000000000001e-05, + "loss": 0.8244, + "step": 5256 + }, + { + "epoch": 0.06572664316607915, + "grad_norm": 12.24149227142334, + "learning_rate": 1.3145e-05, + "loss": 2.2641, + "step": 5258 + }, + { + "epoch": 0.06575164379109477, + "grad_norm": 2.446943998336792, + "learning_rate": 1.3150000000000001e-05, + "loss": 0.7336, + "step": 5260 + }, + { + "epoch": 0.0657766444161104, + "grad_norm": 0.015482913702726364, + "learning_rate": 1.3155e-05, + "loss": 0.0003, + "step": 5262 + }, + { + "epoch": 0.06580164504112603, + "grad_norm": 10.794549942016602, + "learning_rate": 1.3160000000000001e-05, + "loss": 1.3057, + "step": 5264 + }, + { + "epoch": 0.06582664566614166, + "grad_norm": 0.012663879431784153, + "learning_rate": 1.3165000000000002e-05, + "loss": 0.468, + "step": 5266 + }, + { + "epoch": 0.06585164629115728, + "grad_norm": 0.04487356171011925, + "learning_rate": 1.3170000000000001e-05, + "loss": 0.0005, + "step": 5268 + }, + { + "epoch": 0.0658766469161729, + "grad_norm": 5.709043979644775, + "learning_rate": 1.3175e-05, + "loss": 0.3642, + "step": 5270 + }, + { + "epoch": 0.06590164754118853, + "grad_norm": 4.630573272705078, + "learning_rate": 1.3180000000000001e-05, + "loss": 1.9074, + "step": 5272 + }, + { + "epoch": 0.06592664816620415, + "grad_norm": 4.691824913024902, + "learning_rate": 1.3185000000000002e-05, + "loss": 1.2305, + "step": 5274 + }, + { + "epoch": 0.06595164879121979, + "grad_norm": 18.94035530090332, + "learning_rate": 1.319e-05, + "loss": 0.5438, + "step": 5276 + }, + { + "epoch": 0.0659766494162354, + "grad_norm": 2.5455079078674316, + "learning_rate": 1.3195e-05, + "loss": 0.2534, + "step": 5278 + }, + { + "epoch": 0.06600165004125103, + "grad_norm": 2.6331145763397217, + "learning_rate": 1.3200000000000002e-05, + "loss": 0.886, + "step": 5280 + }, + { + "epoch": 0.06602665066626666, + "grad_norm": 2.2185513973236084, + "learning_rate": 1.3205000000000001e-05, + "loss": 0.8327, + "step": 5282 + }, + { + "epoch": 0.06605165129128228, + "grad_norm": 3.8141536712646484, + "learning_rate": 1.321e-05, + "loss": 0.6022, + "step": 5284 + }, + { + "epoch": 0.06607665191629791, + "grad_norm": 3.2657957077026367, + "learning_rate": 1.3215e-05, + "loss": 1.3566, + "step": 5286 + }, + { + "epoch": 0.06610165254131353, + "grad_norm": 3.1555964946746826, + "learning_rate": 1.3220000000000002e-05, + "loss": 0.4332, + "step": 5288 + }, + { + "epoch": 0.06612665316632915, + "grad_norm": 5.1790971755981445, + "learning_rate": 1.3225000000000001e-05, + "loss": 0.937, + "step": 5290 + }, + { + "epoch": 0.06615165379134479, + "grad_norm": 2.5301549434661865, + "learning_rate": 1.323e-05, + "loss": 0.1538, + "step": 5292 + }, + { + "epoch": 0.0661766544163604, + "grad_norm": 4.476517200469971, + "learning_rate": 1.3235e-05, + "loss": 0.8732, + "step": 5294 + }, + { + "epoch": 0.06620165504137604, + "grad_norm": 0.04831581190228462, + "learning_rate": 1.3240000000000002e-05, + "loss": 1.0445, + "step": 5296 + }, + { + "epoch": 0.06622665566639166, + "grad_norm": 2.978360176086426, + "learning_rate": 1.3245000000000001e-05, + "loss": 1.3727, + "step": 5298 + }, + { + "epoch": 0.06625165629140728, + "grad_norm": 3.1588656902313232, + "learning_rate": 1.325e-05, + "loss": 1.2786, + "step": 5300 + }, + { + "epoch": 0.06627665691642291, + "grad_norm": 4.079364776611328, + "learning_rate": 1.3255e-05, + "loss": 0.3728, + "step": 5302 + }, + { + "epoch": 0.06630165754143853, + "grad_norm": 4.0441670417785645, + "learning_rate": 1.3260000000000002e-05, + "loss": 1.3569, + "step": 5304 + }, + { + "epoch": 0.06632665816645417, + "grad_norm": 4.322773456573486, + "learning_rate": 1.3265000000000001e-05, + "loss": 1.3539, + "step": 5306 + }, + { + "epoch": 0.06635165879146979, + "grad_norm": 3.572227954864502, + "learning_rate": 1.327e-05, + "loss": 0.8727, + "step": 5308 + }, + { + "epoch": 0.0663766594164854, + "grad_norm": 4.905872821807861, + "learning_rate": 1.3275e-05, + "loss": 1.6421, + "step": 5310 + }, + { + "epoch": 0.06640166004150104, + "grad_norm": 0.533276379108429, + "learning_rate": 1.3280000000000002e-05, + "loss": 0.6606, + "step": 5312 + }, + { + "epoch": 0.06642666066651666, + "grad_norm": 0.24564504623413086, + "learning_rate": 1.3285e-05, + "loss": 0.165, + "step": 5314 + }, + { + "epoch": 0.0664516612915323, + "grad_norm": 2.483729600906372, + "learning_rate": 1.3290000000000002e-05, + "loss": 0.8043, + "step": 5316 + }, + { + "epoch": 0.06647666191654791, + "grad_norm": 4.288825988769531, + "learning_rate": 1.3295e-05, + "loss": 1.7416, + "step": 5318 + }, + { + "epoch": 0.06650166254156353, + "grad_norm": 15.899820327758789, + "learning_rate": 1.3300000000000001e-05, + "loss": 1.0287, + "step": 5320 + }, + { + "epoch": 0.06652666316657917, + "grad_norm": 3.9079744815826416, + "learning_rate": 1.3305e-05, + "loss": 1.1327, + "step": 5322 + }, + { + "epoch": 0.06655166379159479, + "grad_norm": 3.7283334732055664, + "learning_rate": 1.3310000000000001e-05, + "loss": 1.3132, + "step": 5324 + }, + { + "epoch": 0.06657666441661042, + "grad_norm": 0.4910105764865875, + "learning_rate": 1.3315e-05, + "loss": 0.4762, + "step": 5326 + }, + { + "epoch": 0.06660166504162604, + "grad_norm": 3.4657182693481445, + "learning_rate": 1.3320000000000001e-05, + "loss": 1.2101, + "step": 5328 + }, + { + "epoch": 0.06662666566664166, + "grad_norm": 3.7700488567352295, + "learning_rate": 1.3325000000000002e-05, + "loss": 1.4017, + "step": 5330 + }, + { + "epoch": 0.0666516662916573, + "grad_norm": 4.991322040557861, + "learning_rate": 1.3330000000000001e-05, + "loss": 1.8926, + "step": 5332 + }, + { + "epoch": 0.06667666691667291, + "grad_norm": 1.7076492309570312, + "learning_rate": 1.3335e-05, + "loss": 0.1774, + "step": 5334 + }, + { + "epoch": 0.06670166754168855, + "grad_norm": 4.48725700378418, + "learning_rate": 1.3340000000000001e-05, + "loss": 1.5111, + "step": 5336 + }, + { + "epoch": 0.06672666816670417, + "grad_norm": 5.62389612197876, + "learning_rate": 1.3345000000000002e-05, + "loss": 1.4862, + "step": 5338 + }, + { + "epoch": 0.06675166879171979, + "grad_norm": 6.668692111968994, + "learning_rate": 1.3350000000000001e-05, + "loss": 1.4911, + "step": 5340 + }, + { + "epoch": 0.06677666941673542, + "grad_norm": 4.121255397796631, + "learning_rate": 1.3355e-05, + "loss": 1.015, + "step": 5342 + }, + { + "epoch": 0.06680167004175104, + "grad_norm": 4.1432647705078125, + "learning_rate": 1.3360000000000003e-05, + "loss": 1.4529, + "step": 5344 + }, + { + "epoch": 0.06682667066676667, + "grad_norm": 3.6765594482421875, + "learning_rate": 1.3365000000000002e-05, + "loss": 0.561, + "step": 5346 + }, + { + "epoch": 0.0668516712917823, + "grad_norm": 3.5005695819854736, + "learning_rate": 1.337e-05, + "loss": 0.7997, + "step": 5348 + }, + { + "epoch": 0.06687667191679791, + "grad_norm": 5.6969404220581055, + "learning_rate": 1.3375e-05, + "loss": 0.7943, + "step": 5350 + }, + { + "epoch": 0.06690167254181355, + "grad_norm": 2.4674510955810547, + "learning_rate": 1.3380000000000002e-05, + "loss": 1.0008, + "step": 5352 + }, + { + "epoch": 0.06692667316682917, + "grad_norm": 4.946968078613281, + "learning_rate": 1.3385000000000001e-05, + "loss": 1.2871, + "step": 5354 + }, + { + "epoch": 0.0669516737918448, + "grad_norm": 1.8918465375900269, + "learning_rate": 1.339e-05, + "loss": 0.6943, + "step": 5356 + }, + { + "epoch": 0.06697667441686042, + "grad_norm": 4.5244622230529785, + "learning_rate": 1.3395e-05, + "loss": 1.3569, + "step": 5358 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 5.471523284912109, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.9133, + "step": 5360 + }, + { + "epoch": 0.06702667566689167, + "grad_norm": 1.6403917074203491, + "learning_rate": 1.3405000000000001e-05, + "loss": 0.8861, + "step": 5362 + }, + { + "epoch": 0.0670516762919073, + "grad_norm": 5.121341228485107, + "learning_rate": 1.341e-05, + "loss": 1.223, + "step": 5364 + }, + { + "epoch": 0.06707667691692293, + "grad_norm": 3.9504566192626953, + "learning_rate": 1.3415e-05, + "loss": 1.1601, + "step": 5366 + }, + { + "epoch": 0.06710167754193855, + "grad_norm": 2.8050482273101807, + "learning_rate": 1.3420000000000002e-05, + "loss": 0.513, + "step": 5368 + }, + { + "epoch": 0.06712667816695417, + "grad_norm": 3.0993728637695312, + "learning_rate": 1.3425000000000001e-05, + "loss": 0.8064, + "step": 5370 + }, + { + "epoch": 0.0671516787919698, + "grad_norm": 6.226204872131348, + "learning_rate": 1.343e-05, + "loss": 1.2706, + "step": 5372 + }, + { + "epoch": 0.06717667941698542, + "grad_norm": 1.7518919706344604, + "learning_rate": 1.3435000000000001e-05, + "loss": 1.9767, + "step": 5374 + }, + { + "epoch": 0.06720168004200106, + "grad_norm": 4.232812881469727, + "learning_rate": 1.3440000000000002e-05, + "loss": 1.3957, + "step": 5376 + }, + { + "epoch": 0.06722668066701667, + "grad_norm": 3.541093111038208, + "learning_rate": 1.3445000000000001e-05, + "loss": 0.9489, + "step": 5378 + }, + { + "epoch": 0.0672516812920323, + "grad_norm": 0.21263176202774048, + "learning_rate": 1.3450000000000002e-05, + "loss": 0.671, + "step": 5380 + }, + { + "epoch": 0.06727668191704793, + "grad_norm": 2.9274041652679443, + "learning_rate": 1.3455e-05, + "loss": 0.6844, + "step": 5382 + }, + { + "epoch": 0.06730168254206355, + "grad_norm": 2.2435576915740967, + "learning_rate": 1.3460000000000002e-05, + "loss": 0.7731, + "step": 5384 + }, + { + "epoch": 0.06732668316707918, + "grad_norm": 6.373210430145264, + "learning_rate": 1.3465e-05, + "loss": 1.6536, + "step": 5386 + }, + { + "epoch": 0.0673516837920948, + "grad_norm": 2.106490135192871, + "learning_rate": 1.3470000000000001e-05, + "loss": 0.8667, + "step": 5388 + }, + { + "epoch": 0.06737668441711042, + "grad_norm": 2.396792411804199, + "learning_rate": 1.3475e-05, + "loss": 0.8159, + "step": 5390 + }, + { + "epoch": 0.06740168504212606, + "grad_norm": 0.23261307179927826, + "learning_rate": 1.3480000000000001e-05, + "loss": 0.5125, + "step": 5392 + }, + { + "epoch": 0.06742668566714168, + "grad_norm": 0.07569403201341629, + "learning_rate": 1.3485000000000002e-05, + "loss": 0.061, + "step": 5394 + }, + { + "epoch": 0.06745168629215731, + "grad_norm": 6.797557353973389, + "learning_rate": 1.3490000000000001e-05, + "loss": 0.814, + "step": 5396 + }, + { + "epoch": 0.06747668691717293, + "grad_norm": 2.9188082218170166, + "learning_rate": 1.3495e-05, + "loss": 0.9959, + "step": 5398 + }, + { + "epoch": 0.06750168754218855, + "grad_norm": 2.547672748565674, + "learning_rate": 1.3500000000000001e-05, + "loss": 0.1924, + "step": 5400 + }, + { + "epoch": 0.06752668816720418, + "grad_norm": 3.770723819732666, + "learning_rate": 1.3505000000000002e-05, + "loss": 0.3583, + "step": 5402 + }, + { + "epoch": 0.0675516887922198, + "grad_norm": 3.2070436477661133, + "learning_rate": 1.3510000000000001e-05, + "loss": 1.653, + "step": 5404 + }, + { + "epoch": 0.06757668941723544, + "grad_norm": 4.841648101806641, + "learning_rate": 1.3515e-05, + "loss": 0.7966, + "step": 5406 + }, + { + "epoch": 0.06760169004225106, + "grad_norm": 6.2091145515441895, + "learning_rate": 1.3520000000000003e-05, + "loss": 0.3789, + "step": 5408 + }, + { + "epoch": 0.06762669066726668, + "grad_norm": 4.123163223266602, + "learning_rate": 1.3525000000000002e-05, + "loss": 1.9836, + "step": 5410 + }, + { + "epoch": 0.06765169129228231, + "grad_norm": 3.0826289653778076, + "learning_rate": 1.3530000000000001e-05, + "loss": 0.7622, + "step": 5412 + }, + { + "epoch": 0.06767669191729793, + "grad_norm": 4.264376640319824, + "learning_rate": 1.3535e-05, + "loss": 1.07, + "step": 5414 + }, + { + "epoch": 0.06770169254231356, + "grad_norm": 0.1332574337720871, + "learning_rate": 1.3540000000000003e-05, + "loss": 0.0029, + "step": 5416 + }, + { + "epoch": 0.06772669316732918, + "grad_norm": 0.32242724299430847, + "learning_rate": 1.3545000000000002e-05, + "loss": 0.6316, + "step": 5418 + }, + { + "epoch": 0.0677516937923448, + "grad_norm": 2.271904945373535, + "learning_rate": 1.355e-05, + "loss": 1.9271, + "step": 5420 + }, + { + "epoch": 0.06777669441736044, + "grad_norm": 0.2562756836414337, + "learning_rate": 1.3555e-05, + "loss": 0.7019, + "step": 5422 + }, + { + "epoch": 0.06780169504237606, + "grad_norm": 5.789306163787842, + "learning_rate": 1.3560000000000002e-05, + "loss": 0.753, + "step": 5424 + }, + { + "epoch": 0.06782669566739169, + "grad_norm": 0.1671445220708847, + "learning_rate": 1.3565000000000001e-05, + "loss": 0.0368, + "step": 5426 + }, + { + "epoch": 0.06785169629240731, + "grad_norm": 14.590506553649902, + "learning_rate": 1.357e-05, + "loss": 1.2226, + "step": 5428 + }, + { + "epoch": 0.06787669691742293, + "grad_norm": 0.39390069246292114, + "learning_rate": 1.3575e-05, + "loss": 0.9134, + "step": 5430 + }, + { + "epoch": 0.06790169754243856, + "grad_norm": 4.767691612243652, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.9514, + "step": 5432 + }, + { + "epoch": 0.06792669816745418, + "grad_norm": 4.005771636962891, + "learning_rate": 1.3585000000000001e-05, + "loss": 0.9251, + "step": 5434 + }, + { + "epoch": 0.06795169879246982, + "grad_norm": 3.446943521499634, + "learning_rate": 1.359e-05, + "loss": 1.5447, + "step": 5436 + }, + { + "epoch": 0.06797669941748544, + "grad_norm": 2.6904234886169434, + "learning_rate": 1.3595000000000001e-05, + "loss": 0.9411, + "step": 5438 + }, + { + "epoch": 0.06800170004250106, + "grad_norm": 2.5457417964935303, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.8451, + "step": 5440 + }, + { + "epoch": 0.06802670066751669, + "grad_norm": 4.49110746383667, + "learning_rate": 1.3605000000000001e-05, + "loss": 1.106, + "step": 5442 + }, + { + "epoch": 0.06805170129253231, + "grad_norm": 4.376088619232178, + "learning_rate": 1.3610000000000002e-05, + "loss": 1.3228, + "step": 5444 + }, + { + "epoch": 0.06807670191754794, + "grad_norm": 3.9319772720336914, + "learning_rate": 1.3615000000000001e-05, + "loss": 1.1354, + "step": 5446 + }, + { + "epoch": 0.06810170254256356, + "grad_norm": 2.5242440700531006, + "learning_rate": 1.3620000000000002e-05, + "loss": 1.5266, + "step": 5448 + }, + { + "epoch": 0.06812670316757918, + "grad_norm": 2.9383137226104736, + "learning_rate": 1.3625e-05, + "loss": 0.6152, + "step": 5450 + }, + { + "epoch": 0.06815170379259482, + "grad_norm": 2.9078211784362793, + "learning_rate": 1.3630000000000002e-05, + "loss": 0.9949, + "step": 5452 + }, + { + "epoch": 0.06817670441761044, + "grad_norm": 3.274296998977661, + "learning_rate": 1.3635e-05, + "loss": 0.5013, + "step": 5454 + }, + { + "epoch": 0.06820170504262607, + "grad_norm": 0.04056381434202194, + "learning_rate": 1.3640000000000002e-05, + "loss": 0.3659, + "step": 5456 + }, + { + "epoch": 0.06822670566764169, + "grad_norm": 2.5035080909729004, + "learning_rate": 1.3645000000000002e-05, + "loss": 0.5904, + "step": 5458 + }, + { + "epoch": 0.06825170629265731, + "grad_norm": 3.5308825969696045, + "learning_rate": 1.3650000000000001e-05, + "loss": 1.5573, + "step": 5460 + }, + { + "epoch": 0.06827670691767294, + "grad_norm": 4.604557037353516, + "learning_rate": 1.3655e-05, + "loss": 0.9636, + "step": 5462 + }, + { + "epoch": 0.06830170754268856, + "grad_norm": 2.0191380977630615, + "learning_rate": 1.3660000000000001e-05, + "loss": 0.6323, + "step": 5464 + }, + { + "epoch": 0.0683267081677042, + "grad_norm": 3.9146790504455566, + "learning_rate": 1.3665000000000002e-05, + "loss": 0.3972, + "step": 5466 + }, + { + "epoch": 0.06835170879271982, + "grad_norm": 3.4532408714294434, + "learning_rate": 1.3670000000000001e-05, + "loss": 0.9384, + "step": 5468 + }, + { + "epoch": 0.06837670941773544, + "grad_norm": 8.089213371276855, + "learning_rate": 1.3675e-05, + "loss": 1.0707, + "step": 5470 + }, + { + "epoch": 0.06840171004275107, + "grad_norm": 1.623664140701294, + "learning_rate": 1.3680000000000003e-05, + "loss": 0.5426, + "step": 5472 + }, + { + "epoch": 0.06842671066776669, + "grad_norm": 0.6708600521087646, + "learning_rate": 1.3685000000000002e-05, + "loss": 0.6548, + "step": 5474 + }, + { + "epoch": 0.06845171129278232, + "grad_norm": 4.496300220489502, + "learning_rate": 1.3690000000000001e-05, + "loss": 1.0091, + "step": 5476 + }, + { + "epoch": 0.06847671191779794, + "grad_norm": 4.167341232299805, + "learning_rate": 1.3695e-05, + "loss": 2.0289, + "step": 5478 + }, + { + "epoch": 0.06850171254281356, + "grad_norm": 5.616669178009033, + "learning_rate": 1.3700000000000003e-05, + "loss": 1.8052, + "step": 5480 + }, + { + "epoch": 0.0685267131678292, + "grad_norm": 4.418644905090332, + "learning_rate": 1.3705000000000002e-05, + "loss": 0.9534, + "step": 5482 + }, + { + "epoch": 0.06855171379284482, + "grad_norm": 1.575804591178894, + "learning_rate": 1.3710000000000001e-05, + "loss": 0.8129, + "step": 5484 + }, + { + "epoch": 0.06857671441786045, + "grad_norm": 0.07447556406259537, + "learning_rate": 1.3715e-05, + "loss": 0.2263, + "step": 5486 + }, + { + "epoch": 0.06860171504287607, + "grad_norm": 2.303117513656616, + "learning_rate": 1.3720000000000002e-05, + "loss": 0.7206, + "step": 5488 + }, + { + "epoch": 0.06862671566789169, + "grad_norm": 3.8939640522003174, + "learning_rate": 1.3725000000000002e-05, + "loss": 1.236, + "step": 5490 + }, + { + "epoch": 0.06865171629290732, + "grad_norm": 4.115503311157227, + "learning_rate": 1.373e-05, + "loss": 1.6263, + "step": 5492 + }, + { + "epoch": 0.06867671691792294, + "grad_norm": 3.1306309700012207, + "learning_rate": 1.3735e-05, + "loss": 1.5565, + "step": 5494 + }, + { + "epoch": 0.06870171754293858, + "grad_norm": 0.03903828561306, + "learning_rate": 1.3740000000000002e-05, + "loss": 0.477, + "step": 5496 + }, + { + "epoch": 0.0687267181679542, + "grad_norm": 9.09599494934082, + "learning_rate": 1.3745000000000001e-05, + "loss": 1.4436, + "step": 5498 + }, + { + "epoch": 0.06875171879296982, + "grad_norm": 5.3144683837890625, + "learning_rate": 1.375e-05, + "loss": 0.6957, + "step": 5500 + }, + { + "epoch": 0.06877671941798545, + "grad_norm": 2.123015880584717, + "learning_rate": 1.3755000000000001e-05, + "loss": 0.0782, + "step": 5502 + }, + { + "epoch": 0.06880172004300107, + "grad_norm": 4.244698524475098, + "learning_rate": 1.376e-05, + "loss": 0.4608, + "step": 5504 + }, + { + "epoch": 0.0688267206680167, + "grad_norm": 1.6605137586593628, + "learning_rate": 1.3765000000000001e-05, + "loss": 0.173, + "step": 5506 + }, + { + "epoch": 0.06885172129303233, + "grad_norm": 9.947172164916992, + "learning_rate": 1.377e-05, + "loss": 2.0115, + "step": 5508 + }, + { + "epoch": 0.06887672191804795, + "grad_norm": 6.686270713806152, + "learning_rate": 1.3775000000000001e-05, + "loss": 2.075, + "step": 5510 + }, + { + "epoch": 0.06890172254306358, + "grad_norm": 3.5042619705200195, + "learning_rate": 1.378e-05, + "loss": 0.5137, + "step": 5512 + }, + { + "epoch": 0.0689267231680792, + "grad_norm": 5.109180450439453, + "learning_rate": 1.3785000000000001e-05, + "loss": 1.2782, + "step": 5514 + }, + { + "epoch": 0.06895172379309483, + "grad_norm": 4.504278182983398, + "learning_rate": 1.3790000000000002e-05, + "loss": 0.3198, + "step": 5516 + }, + { + "epoch": 0.06897672441811045, + "grad_norm": 1.9377391338348389, + "learning_rate": 1.3795000000000001e-05, + "loss": 1.2069, + "step": 5518 + }, + { + "epoch": 0.06900172504312607, + "grad_norm": 4.982752799987793, + "learning_rate": 1.38e-05, + "loss": 1.233, + "step": 5520 + }, + { + "epoch": 0.0690267256681417, + "grad_norm": 5.958322048187256, + "learning_rate": 1.3805000000000003e-05, + "loss": 2.0734, + "step": 5522 + }, + { + "epoch": 0.06905172629315733, + "grad_norm": 4.042726516723633, + "learning_rate": 1.3810000000000002e-05, + "loss": 1.5891, + "step": 5524 + }, + { + "epoch": 0.06907672691817296, + "grad_norm": 2.5273587703704834, + "learning_rate": 1.3815e-05, + "loss": 1.0845, + "step": 5526 + }, + { + "epoch": 0.06910172754318858, + "grad_norm": 0.6104610562324524, + "learning_rate": 1.382e-05, + "loss": 0.6926, + "step": 5528 + }, + { + "epoch": 0.0691267281682042, + "grad_norm": 5.247182846069336, + "learning_rate": 1.3825000000000002e-05, + "loss": 1.7774, + "step": 5530 + }, + { + "epoch": 0.06915172879321983, + "grad_norm": 4.858921051025391, + "learning_rate": 1.3830000000000001e-05, + "loss": 0.8182, + "step": 5532 + }, + { + "epoch": 0.06917672941823545, + "grad_norm": 3.757960557937622, + "learning_rate": 1.3835e-05, + "loss": 2.1849, + "step": 5534 + }, + { + "epoch": 0.06920173004325109, + "grad_norm": 5.02151346206665, + "learning_rate": 1.384e-05, + "loss": 0.7296, + "step": 5536 + }, + { + "epoch": 0.0692267306682667, + "grad_norm": 7.487196445465088, + "learning_rate": 1.3845000000000002e-05, + "loss": 0.9716, + "step": 5538 + }, + { + "epoch": 0.06925173129328233, + "grad_norm": 2.3832030296325684, + "learning_rate": 1.3850000000000001e-05, + "loss": 1.3537, + "step": 5540 + }, + { + "epoch": 0.06927673191829796, + "grad_norm": 1.792105793952942, + "learning_rate": 1.3855e-05, + "loss": 0.2498, + "step": 5542 + }, + { + "epoch": 0.06930173254331358, + "grad_norm": 8.149185180664062, + "learning_rate": 1.386e-05, + "loss": 1.6278, + "step": 5544 + }, + { + "epoch": 0.06932673316832921, + "grad_norm": 4.761815547943115, + "learning_rate": 1.3865000000000002e-05, + "loss": 0.7999, + "step": 5546 + }, + { + "epoch": 0.06935173379334483, + "grad_norm": 0.0926562249660492, + "learning_rate": 1.3870000000000001e-05, + "loss": 1.3747, + "step": 5548 + }, + { + "epoch": 0.06937673441836045, + "grad_norm": 4.2354536056518555, + "learning_rate": 1.3875e-05, + "loss": 1.5686, + "step": 5550 + }, + { + "epoch": 0.06940173504337609, + "grad_norm": 5.701686859130859, + "learning_rate": 1.3880000000000001e-05, + "loss": 1.0451, + "step": 5552 + }, + { + "epoch": 0.0694267356683917, + "grad_norm": 0.029498104006052017, + "learning_rate": 1.3885000000000002e-05, + "loss": 0.073, + "step": 5554 + }, + { + "epoch": 0.06945173629340734, + "grad_norm": 4.863171100616455, + "learning_rate": 1.389e-05, + "loss": 1.8287, + "step": 5556 + }, + { + "epoch": 0.06947673691842296, + "grad_norm": 4.767584323883057, + "learning_rate": 1.3895e-05, + "loss": 0.8658, + "step": 5558 + }, + { + "epoch": 0.06950173754343858, + "grad_norm": 4.003279209136963, + "learning_rate": 1.39e-05, + "loss": 0.8088, + "step": 5560 + }, + { + "epoch": 0.06952673816845421, + "grad_norm": 5.190685272216797, + "learning_rate": 1.3905000000000002e-05, + "loss": 1.1567, + "step": 5562 + }, + { + "epoch": 0.06955173879346983, + "grad_norm": 4.016655445098877, + "learning_rate": 1.391e-05, + "loss": 1.1762, + "step": 5564 + }, + { + "epoch": 0.06957673941848547, + "grad_norm": 11.541500091552734, + "learning_rate": 1.3915000000000001e-05, + "loss": 0.8835, + "step": 5566 + }, + { + "epoch": 0.06960174004350109, + "grad_norm": 2.9469497203826904, + "learning_rate": 1.392e-05, + "loss": 1.0101, + "step": 5568 + }, + { + "epoch": 0.0696267406685167, + "grad_norm": 7.861421585083008, + "learning_rate": 1.3925000000000001e-05, + "loss": 0.9301, + "step": 5570 + }, + { + "epoch": 0.06965174129353234, + "grad_norm": 0.708583414554596, + "learning_rate": 1.393e-05, + "loss": 1.0494, + "step": 5572 + }, + { + "epoch": 0.06967674191854796, + "grad_norm": 2.111701250076294, + "learning_rate": 1.3935000000000001e-05, + "loss": 0.1214, + "step": 5574 + }, + { + "epoch": 0.0697017425435636, + "grad_norm": 3.9165115356445312, + "learning_rate": 1.394e-05, + "loss": 1.025, + "step": 5576 + }, + { + "epoch": 0.06972674316857921, + "grad_norm": 4.6791582107543945, + "learning_rate": 1.3945000000000001e-05, + "loss": 1.7476, + "step": 5578 + }, + { + "epoch": 0.06975174379359483, + "grad_norm": 2.0237927436828613, + "learning_rate": 1.3950000000000002e-05, + "loss": 0.5401, + "step": 5580 + }, + { + "epoch": 0.06977674441861047, + "grad_norm": 2.855517625808716, + "learning_rate": 1.3955000000000001e-05, + "loss": 1.9138, + "step": 5582 + }, + { + "epoch": 0.06980174504362609, + "grad_norm": 5.508934497833252, + "learning_rate": 1.396e-05, + "loss": 0.381, + "step": 5584 + }, + { + "epoch": 0.06982674566864172, + "grad_norm": 3.9004175662994385, + "learning_rate": 1.3965000000000003e-05, + "loss": 0.424, + "step": 5586 + }, + { + "epoch": 0.06985174629365734, + "grad_norm": 2.408926010131836, + "learning_rate": 1.3970000000000002e-05, + "loss": 0.9136, + "step": 5588 + }, + { + "epoch": 0.06987674691867296, + "grad_norm": 0.1660955846309662, + "learning_rate": 1.3975000000000001e-05, + "loss": 0.3874, + "step": 5590 + }, + { + "epoch": 0.0699017475436886, + "grad_norm": 4.780223846435547, + "learning_rate": 1.398e-05, + "loss": 1.0029, + "step": 5592 + }, + { + "epoch": 0.06992674816870421, + "grad_norm": 2.2875914573669434, + "learning_rate": 1.3985000000000002e-05, + "loss": 0.6365, + "step": 5594 + }, + { + "epoch": 0.06995174879371985, + "grad_norm": 6.122470378875732, + "learning_rate": 1.3990000000000002e-05, + "loss": 0.3949, + "step": 5596 + }, + { + "epoch": 0.06997674941873547, + "grad_norm": 5.703813076019287, + "learning_rate": 1.3995e-05, + "loss": 0.6266, + "step": 5598 + }, + { + "epoch": 0.07000175004375109, + "grad_norm": 2.7709319591522217, + "learning_rate": 1.4e-05, + "loss": 0.9357, + "step": 5600 + }, + { + "epoch": 0.07002675066876672, + "grad_norm": 3.6958978176116943, + "learning_rate": 1.4005000000000002e-05, + "loss": 1.1893, + "step": 5602 + }, + { + "epoch": 0.07005175129378234, + "grad_norm": 2.9481546878814697, + "learning_rate": 1.4010000000000001e-05, + "loss": 1.3291, + "step": 5604 + }, + { + "epoch": 0.07007675191879797, + "grad_norm": 4.007837772369385, + "learning_rate": 1.4015e-05, + "loss": 1.3773, + "step": 5606 + }, + { + "epoch": 0.0701017525438136, + "grad_norm": 5.995537757873535, + "learning_rate": 1.402e-05, + "loss": 0.4757, + "step": 5608 + }, + { + "epoch": 0.07012675316882921, + "grad_norm": 3.523074150085449, + "learning_rate": 1.4025000000000002e-05, + "loss": 0.3897, + "step": 5610 + }, + { + "epoch": 0.07015175379384485, + "grad_norm": 3.0652637481689453, + "learning_rate": 1.4030000000000001e-05, + "loss": 1.5704, + "step": 5612 + }, + { + "epoch": 0.07017675441886047, + "grad_norm": 3.4799482822418213, + "learning_rate": 1.4035e-05, + "loss": 1.2051, + "step": 5614 + }, + { + "epoch": 0.0702017550438761, + "grad_norm": 3.666051149368286, + "learning_rate": 1.4040000000000001e-05, + "loss": 1.6108, + "step": 5616 + }, + { + "epoch": 0.07022675566889172, + "grad_norm": 5.643507480621338, + "learning_rate": 1.4045000000000002e-05, + "loss": 1.7235, + "step": 5618 + }, + { + "epoch": 0.07025175629390734, + "grad_norm": 4.172011852264404, + "learning_rate": 1.4050000000000001e-05, + "loss": 1.2004, + "step": 5620 + }, + { + "epoch": 0.07027675691892298, + "grad_norm": 6.306371212005615, + "learning_rate": 1.4055e-05, + "loss": 1.1244, + "step": 5622 + }, + { + "epoch": 0.0703017575439386, + "grad_norm": 5.517643451690674, + "learning_rate": 1.4060000000000001e-05, + "loss": 0.4436, + "step": 5624 + }, + { + "epoch": 0.07032675816895423, + "grad_norm": 2.8153960704803467, + "learning_rate": 1.4065000000000002e-05, + "loss": 0.8212, + "step": 5626 + }, + { + "epoch": 0.07035175879396985, + "grad_norm": 1.8872697353363037, + "learning_rate": 1.407e-05, + "loss": 0.2784, + "step": 5628 + }, + { + "epoch": 0.07037675941898547, + "grad_norm": 3.750763177871704, + "learning_rate": 1.4075000000000002e-05, + "loss": 0.7997, + "step": 5630 + }, + { + "epoch": 0.0704017600440011, + "grad_norm": 0.02510424517095089, + "learning_rate": 1.408e-05, + "loss": 0.0661, + "step": 5632 + }, + { + "epoch": 0.07042676066901672, + "grad_norm": 3.7430129051208496, + "learning_rate": 1.4085000000000002e-05, + "loss": 1.3344, + "step": 5634 + }, + { + "epoch": 0.07045176129403236, + "grad_norm": 3.572880983352661, + "learning_rate": 1.409e-05, + "loss": 0.8131, + "step": 5636 + }, + { + "epoch": 0.07047676191904798, + "grad_norm": 1.9806150197982788, + "learning_rate": 1.4095000000000001e-05, + "loss": 0.6627, + "step": 5638 + }, + { + "epoch": 0.0705017625440636, + "grad_norm": 0.40514683723449707, + "learning_rate": 1.41e-05, + "loss": 0.8004, + "step": 5640 + }, + { + "epoch": 0.07052676316907923, + "grad_norm": 0.0057195862755179405, + "learning_rate": 1.4105000000000001e-05, + "loss": 0.0427, + "step": 5642 + }, + { + "epoch": 0.07055176379409485, + "grad_norm": 2.7050552368164062, + "learning_rate": 1.4110000000000002e-05, + "loss": 1.8917, + "step": 5644 + }, + { + "epoch": 0.07057676441911048, + "grad_norm": 2.388995409011841, + "learning_rate": 1.4115000000000001e-05, + "loss": 0.2347, + "step": 5646 + }, + { + "epoch": 0.0706017650441261, + "grad_norm": 2.1362805366516113, + "learning_rate": 1.412e-05, + "loss": 0.5759, + "step": 5648 + }, + { + "epoch": 0.07062676566914172, + "grad_norm": 4.846392631530762, + "learning_rate": 1.4125000000000003e-05, + "loss": 1.1995, + "step": 5650 + }, + { + "epoch": 0.07065176629415736, + "grad_norm": 2.240316390991211, + "learning_rate": 1.4130000000000002e-05, + "loss": 0.5944, + "step": 5652 + }, + { + "epoch": 0.07067676691917298, + "grad_norm": 0.21497489511966705, + "learning_rate": 1.4135000000000001e-05, + "loss": 0.0465, + "step": 5654 + }, + { + "epoch": 0.07070176754418861, + "grad_norm": 2.3900227546691895, + "learning_rate": 1.414e-05, + "loss": 0.8421, + "step": 5656 + }, + { + "epoch": 0.07072676816920423, + "grad_norm": 0.017480606213212013, + "learning_rate": 1.4145000000000003e-05, + "loss": 1.0633, + "step": 5658 + }, + { + "epoch": 0.07075176879421985, + "grad_norm": 0.01622958295047283, + "learning_rate": 1.4150000000000002e-05, + "loss": 0.3121, + "step": 5660 + }, + { + "epoch": 0.07077676941923548, + "grad_norm": 10.756186485290527, + "learning_rate": 1.4155000000000001e-05, + "loss": 1.0769, + "step": 5662 + }, + { + "epoch": 0.0708017700442511, + "grad_norm": 4.892404079437256, + "learning_rate": 1.416e-05, + "loss": 1.0412, + "step": 5664 + }, + { + "epoch": 0.07082677066926674, + "grad_norm": 4.450433254241943, + "learning_rate": 1.4165000000000002e-05, + "loss": 1.3467, + "step": 5666 + }, + { + "epoch": 0.07085177129428236, + "grad_norm": 15.382206916809082, + "learning_rate": 1.4170000000000002e-05, + "loss": 0.6395, + "step": 5668 + }, + { + "epoch": 0.07087677191929798, + "grad_norm": 5.577559471130371, + "learning_rate": 1.4175e-05, + "loss": 0.4821, + "step": 5670 + }, + { + "epoch": 0.07090177254431361, + "grad_norm": 27.927688598632812, + "learning_rate": 1.418e-05, + "loss": 1.0843, + "step": 5672 + }, + { + "epoch": 0.07092677316932923, + "grad_norm": 3.891792058944702, + "learning_rate": 1.4185000000000002e-05, + "loss": 0.6917, + "step": 5674 + }, + { + "epoch": 0.07095177379434486, + "grad_norm": 5.065155982971191, + "learning_rate": 1.4190000000000001e-05, + "loss": 1.219, + "step": 5676 + }, + { + "epoch": 0.07097677441936048, + "grad_norm": 2.9750895500183105, + "learning_rate": 1.4195e-05, + "loss": 0.943, + "step": 5678 + }, + { + "epoch": 0.0710017750443761, + "grad_norm": 3.269050121307373, + "learning_rate": 1.4200000000000001e-05, + "loss": 0.8435, + "step": 5680 + }, + { + "epoch": 0.07102677566939174, + "grad_norm": 3.763584613800049, + "learning_rate": 1.4205000000000002e-05, + "loss": 1.0005, + "step": 5682 + }, + { + "epoch": 0.07105177629440736, + "grad_norm": 4.479984283447266, + "learning_rate": 1.4210000000000001e-05, + "loss": 2.38, + "step": 5684 + }, + { + "epoch": 0.07107677691942299, + "grad_norm": 3.604527711868286, + "learning_rate": 1.4215e-05, + "loss": 0.1351, + "step": 5686 + }, + { + "epoch": 0.07110177754443861, + "grad_norm": 5.175136089324951, + "learning_rate": 1.4220000000000001e-05, + "loss": 0.9741, + "step": 5688 + }, + { + "epoch": 0.07112677816945423, + "grad_norm": 7.89168643951416, + "learning_rate": 1.4225000000000002e-05, + "loss": 2.2062, + "step": 5690 + }, + { + "epoch": 0.07115177879446986, + "grad_norm": 3.082977294921875, + "learning_rate": 1.4230000000000001e-05, + "loss": 0.4524, + "step": 5692 + }, + { + "epoch": 0.07117677941948548, + "grad_norm": 2.5243656635284424, + "learning_rate": 1.4235000000000002e-05, + "loss": 0.2262, + "step": 5694 + }, + { + "epoch": 0.07120178004450112, + "grad_norm": 5.924048900604248, + "learning_rate": 1.4240000000000001e-05, + "loss": 0.9524, + "step": 5696 + }, + { + "epoch": 0.07122678066951674, + "grad_norm": 3.5371923446655273, + "learning_rate": 1.4245000000000002e-05, + "loss": 1.3386, + "step": 5698 + }, + { + "epoch": 0.07125178129453236, + "grad_norm": 3.8750100135803223, + "learning_rate": 1.425e-05, + "loss": 1.1613, + "step": 5700 + }, + { + "epoch": 0.07127678191954799, + "grad_norm": 8.688925743103027, + "learning_rate": 1.4255000000000002e-05, + "loss": 0.3944, + "step": 5702 + }, + { + "epoch": 0.07130178254456361, + "grad_norm": 3.968660354614258, + "learning_rate": 1.426e-05, + "loss": 0.7795, + "step": 5704 + }, + { + "epoch": 0.07132678316957924, + "grad_norm": 0.42267462611198425, + "learning_rate": 1.4265000000000001e-05, + "loss": 0.6041, + "step": 5706 + }, + { + "epoch": 0.07135178379459486, + "grad_norm": 0.005914824549108744, + "learning_rate": 1.4270000000000002e-05, + "loss": 0.4155, + "step": 5708 + }, + { + "epoch": 0.07137678441961048, + "grad_norm": 4.61187744140625, + "learning_rate": 1.4275000000000001e-05, + "loss": 1.6382, + "step": 5710 + }, + { + "epoch": 0.07140178504462612, + "grad_norm": 4.851742744445801, + "learning_rate": 1.428e-05, + "loss": 0.2751, + "step": 5712 + }, + { + "epoch": 0.07142678566964174, + "grad_norm": 3.1354382038116455, + "learning_rate": 1.4285000000000003e-05, + "loss": 0.4265, + "step": 5714 + }, + { + "epoch": 0.07145178629465737, + "grad_norm": 2.544586658477783, + "learning_rate": 1.4290000000000002e-05, + "loss": 0.7938, + "step": 5716 + }, + { + "epoch": 0.07147678691967299, + "grad_norm": 2.073199987411499, + "learning_rate": 1.4295000000000001e-05, + "loss": 0.504, + "step": 5718 + }, + { + "epoch": 0.07150178754468861, + "grad_norm": 8.706510543823242, + "learning_rate": 1.43e-05, + "loss": 2.0374, + "step": 5720 + }, + { + "epoch": 0.07152678816970424, + "grad_norm": 3.1765711307525635, + "learning_rate": 1.4305000000000003e-05, + "loss": 1.2214, + "step": 5722 + }, + { + "epoch": 0.07155178879471986, + "grad_norm": 2.4039788246154785, + "learning_rate": 1.4310000000000002e-05, + "loss": 0.7972, + "step": 5724 + }, + { + "epoch": 0.0715767894197355, + "grad_norm": 6.751132965087891, + "learning_rate": 1.4315000000000001e-05, + "loss": 0.7789, + "step": 5726 + }, + { + "epoch": 0.07160179004475112, + "grad_norm": 5.270044326782227, + "learning_rate": 1.432e-05, + "loss": 1.4794, + "step": 5728 + }, + { + "epoch": 0.07162679066976674, + "grad_norm": 5.412566661834717, + "learning_rate": 1.4325000000000003e-05, + "loss": 0.6705, + "step": 5730 + }, + { + "epoch": 0.07165179129478237, + "grad_norm": 3.84804630279541, + "learning_rate": 1.4330000000000002e-05, + "loss": 1.5242, + "step": 5732 + }, + { + "epoch": 0.07167679191979799, + "grad_norm": 7.009835243225098, + "learning_rate": 1.4335e-05, + "loss": 2.4432, + "step": 5734 + }, + { + "epoch": 0.07170179254481363, + "grad_norm": 4.500400066375732, + "learning_rate": 1.434e-05, + "loss": 0.1439, + "step": 5736 + }, + { + "epoch": 0.07172679316982924, + "grad_norm": 4.768668174743652, + "learning_rate": 1.4345000000000002e-05, + "loss": 1.1323, + "step": 5738 + }, + { + "epoch": 0.07175179379484486, + "grad_norm": 2.465733766555786, + "learning_rate": 1.4350000000000002e-05, + "loss": 1.7825, + "step": 5740 + }, + { + "epoch": 0.0717767944198605, + "grad_norm": 2.7004001140594482, + "learning_rate": 1.4355e-05, + "loss": 1.813, + "step": 5742 + }, + { + "epoch": 0.07180179504487612, + "grad_norm": 4.248676776885986, + "learning_rate": 1.4360000000000001e-05, + "loss": 1.9509, + "step": 5744 + }, + { + "epoch": 0.07182679566989175, + "grad_norm": 2.2398524284362793, + "learning_rate": 1.4365000000000002e-05, + "loss": 0.2867, + "step": 5746 + }, + { + "epoch": 0.07185179629490737, + "grad_norm": 3.070368528366089, + "learning_rate": 1.4370000000000001e-05, + "loss": 1.7965, + "step": 5748 + }, + { + "epoch": 0.07187679691992299, + "grad_norm": 0.10156998038291931, + "learning_rate": 1.4375e-05, + "loss": 0.4202, + "step": 5750 + }, + { + "epoch": 0.07190179754493863, + "grad_norm": 5.607059955596924, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.084, + "step": 5752 + }, + { + "epoch": 0.07192679816995425, + "grad_norm": 4.833645343780518, + "learning_rate": 1.4385e-05, + "loss": 2.0244, + "step": 5754 + }, + { + "epoch": 0.07195179879496988, + "grad_norm": 3.5763816833496094, + "learning_rate": 1.4390000000000001e-05, + "loss": 1.5492, + "step": 5756 + }, + { + "epoch": 0.0719767994199855, + "grad_norm": 3.331782341003418, + "learning_rate": 1.4395000000000002e-05, + "loss": 0.6556, + "step": 5758 + }, + { + "epoch": 0.07200180004500112, + "grad_norm": 4.576390266418457, + "learning_rate": 1.4400000000000001e-05, + "loss": 0.6108, + "step": 5760 + }, + { + "epoch": 0.07202680067001675, + "grad_norm": 6.780724048614502, + "learning_rate": 1.4405e-05, + "loss": 2.0773, + "step": 5762 + }, + { + "epoch": 0.07205180129503237, + "grad_norm": 0.1621636301279068, + "learning_rate": 1.4410000000000001e-05, + "loss": 0.6167, + "step": 5764 + }, + { + "epoch": 0.072076801920048, + "grad_norm": 3.255492925643921, + "learning_rate": 1.4415000000000002e-05, + "loss": 0.7011, + "step": 5766 + }, + { + "epoch": 0.07210180254506363, + "grad_norm": 0.20154094696044922, + "learning_rate": 1.4420000000000001e-05, + "loss": 0.3441, + "step": 5768 + }, + { + "epoch": 0.07212680317007925, + "grad_norm": 3.1281187534332275, + "learning_rate": 1.4425e-05, + "loss": 1.1252, + "step": 5770 + }, + { + "epoch": 0.07215180379509488, + "grad_norm": 4.555350303649902, + "learning_rate": 1.4430000000000002e-05, + "loss": 1.3577, + "step": 5772 + }, + { + "epoch": 0.0721768044201105, + "grad_norm": 3.661146879196167, + "learning_rate": 1.4435000000000002e-05, + "loss": 1.044, + "step": 5774 + }, + { + "epoch": 0.07220180504512613, + "grad_norm": 9.524643898010254, + "learning_rate": 1.444e-05, + "loss": 1.6126, + "step": 5776 + }, + { + "epoch": 0.07222680567014175, + "grad_norm": 0.016223784536123276, + "learning_rate": 1.4445e-05, + "loss": 0.1707, + "step": 5778 + }, + { + "epoch": 0.07225180629515737, + "grad_norm": 4.767289161682129, + "learning_rate": 1.4450000000000002e-05, + "loss": 1.1938, + "step": 5780 + }, + { + "epoch": 0.072276806920173, + "grad_norm": 4.9431233406066895, + "learning_rate": 1.4455000000000001e-05, + "loss": 0.6246, + "step": 5782 + }, + { + "epoch": 0.07230180754518863, + "grad_norm": 4.499691486358643, + "learning_rate": 1.446e-05, + "loss": 1.5399, + "step": 5784 + }, + { + "epoch": 0.07232680817020426, + "grad_norm": 0.006743308622390032, + "learning_rate": 1.4465e-05, + "loss": 1.5286, + "step": 5786 + }, + { + "epoch": 0.07235180879521988, + "grad_norm": 9.130691528320312, + "learning_rate": 1.4470000000000002e-05, + "loss": 1.8935, + "step": 5788 + }, + { + "epoch": 0.0723768094202355, + "grad_norm": 3.55338716506958, + "learning_rate": 1.4475000000000001e-05, + "loss": 1.4222, + "step": 5790 + }, + { + "epoch": 0.07240181004525113, + "grad_norm": 19.39518928527832, + "learning_rate": 1.448e-05, + "loss": 1.9542, + "step": 5792 + }, + { + "epoch": 0.07242681067026675, + "grad_norm": 0.034189704805612564, + "learning_rate": 1.4485e-05, + "loss": 0.6051, + "step": 5794 + }, + { + "epoch": 0.07245181129528239, + "grad_norm": 4.817819118499756, + "learning_rate": 1.4490000000000002e-05, + "loss": 0.482, + "step": 5796 + }, + { + "epoch": 0.072476811920298, + "grad_norm": 2.5554964542388916, + "learning_rate": 1.4495000000000001e-05, + "loss": 0.9882, + "step": 5798 + }, + { + "epoch": 0.07250181254531363, + "grad_norm": 3.4247827529907227, + "learning_rate": 1.45e-05, + "loss": 1.357, + "step": 5800 + }, + { + "epoch": 0.07252681317032926, + "grad_norm": 3.4260408878326416, + "learning_rate": 1.4505000000000001e-05, + "loss": 0.7454, + "step": 5802 + }, + { + "epoch": 0.07255181379534488, + "grad_norm": 6.679296016693115, + "learning_rate": 1.4510000000000002e-05, + "loss": 0.9854, + "step": 5804 + }, + { + "epoch": 0.07257681442036051, + "grad_norm": 7.155430316925049, + "learning_rate": 1.4515e-05, + "loss": 1.2764, + "step": 5806 + }, + { + "epoch": 0.07260181504537613, + "grad_norm": 6.661107063293457, + "learning_rate": 1.4520000000000002e-05, + "loss": 1.0749, + "step": 5808 + }, + { + "epoch": 0.07262681567039175, + "grad_norm": 2.545562267303467, + "learning_rate": 1.4525e-05, + "loss": 0.5563, + "step": 5810 + }, + { + "epoch": 0.07265181629540739, + "grad_norm": 3.7004666328430176, + "learning_rate": 1.4530000000000001e-05, + "loss": 1.6948, + "step": 5812 + }, + { + "epoch": 0.072676816920423, + "grad_norm": 0.04733217507600784, + "learning_rate": 1.4535e-05, + "loss": 0.2177, + "step": 5814 + }, + { + "epoch": 0.07270181754543864, + "grad_norm": 2.4055745601654053, + "learning_rate": 1.4540000000000001e-05, + "loss": 0.6523, + "step": 5816 + }, + { + "epoch": 0.07272681817045426, + "grad_norm": 3.3508033752441406, + "learning_rate": 1.4545e-05, + "loss": 0.9708, + "step": 5818 + }, + { + "epoch": 0.07275181879546988, + "grad_norm": 4.25958776473999, + "learning_rate": 1.4550000000000001e-05, + "loss": 0.7993, + "step": 5820 + }, + { + "epoch": 0.07277681942048551, + "grad_norm": 3.634654998779297, + "learning_rate": 1.4555000000000002e-05, + "loss": 0.4755, + "step": 5822 + }, + { + "epoch": 0.07280182004550113, + "grad_norm": 6.925124645233154, + "learning_rate": 1.4560000000000001e-05, + "loss": 1.2323, + "step": 5824 + }, + { + "epoch": 0.07282682067051677, + "grad_norm": 0.25385338068008423, + "learning_rate": 1.4565e-05, + "loss": 0.1143, + "step": 5826 + }, + { + "epoch": 0.07285182129553239, + "grad_norm": 7.442173480987549, + "learning_rate": 1.4570000000000001e-05, + "loss": 0.9921, + "step": 5828 + }, + { + "epoch": 0.07287682192054801, + "grad_norm": 0.08282768726348877, + "learning_rate": 1.4575000000000002e-05, + "loss": 0.8985, + "step": 5830 + }, + { + "epoch": 0.07290182254556364, + "grad_norm": 4.065700054168701, + "learning_rate": 1.4580000000000001e-05, + "loss": 1.0421, + "step": 5832 + }, + { + "epoch": 0.07292682317057926, + "grad_norm": 4.070064067840576, + "learning_rate": 1.4585e-05, + "loss": 1.2258, + "step": 5834 + }, + { + "epoch": 0.0729518237955949, + "grad_norm": 4.417230606079102, + "learning_rate": 1.4590000000000003e-05, + "loss": 1.0401, + "step": 5836 + }, + { + "epoch": 0.07297682442061051, + "grad_norm": 2.305762767791748, + "learning_rate": 1.4595000000000002e-05, + "loss": 0.7064, + "step": 5838 + }, + { + "epoch": 0.07300182504562613, + "grad_norm": 4.5199737548828125, + "learning_rate": 1.46e-05, + "loss": 1.7789, + "step": 5840 + }, + { + "epoch": 0.07302682567064177, + "grad_norm": 3.7400062084198, + "learning_rate": 1.4605e-05, + "loss": 0.8244, + "step": 5842 + }, + { + "epoch": 0.07305182629565739, + "grad_norm": 1.472259759902954, + "learning_rate": 1.4610000000000002e-05, + "loss": 0.2731, + "step": 5844 + }, + { + "epoch": 0.07307682692067302, + "grad_norm": 4.034121990203857, + "learning_rate": 1.4615000000000002e-05, + "loss": 0.9385, + "step": 5846 + }, + { + "epoch": 0.07310182754568864, + "grad_norm": 3.6993303298950195, + "learning_rate": 1.462e-05, + "loss": 1.5199, + "step": 5848 + }, + { + "epoch": 0.07312682817070426, + "grad_norm": 9.626428604125977, + "learning_rate": 1.4625e-05, + "loss": 1.6134, + "step": 5850 + }, + { + "epoch": 0.0731518287957199, + "grad_norm": 4.704865455627441, + "learning_rate": 1.4630000000000002e-05, + "loss": 1.6632, + "step": 5852 + }, + { + "epoch": 0.07317682942073551, + "grad_norm": 3.3523569107055664, + "learning_rate": 1.4635000000000001e-05, + "loss": 0.8292, + "step": 5854 + }, + { + "epoch": 0.07320183004575115, + "grad_norm": 10.683959007263184, + "learning_rate": 1.464e-05, + "loss": 0.4956, + "step": 5856 + }, + { + "epoch": 0.07322683067076677, + "grad_norm": 5.1532745361328125, + "learning_rate": 1.4645e-05, + "loss": 0.9779, + "step": 5858 + }, + { + "epoch": 0.07325183129578239, + "grad_norm": 1.9939744472503662, + "learning_rate": 1.4650000000000002e-05, + "loss": 0.5598, + "step": 5860 + }, + { + "epoch": 0.07327683192079802, + "grad_norm": 2.9418251514434814, + "learning_rate": 1.4655000000000001e-05, + "loss": 1.2383, + "step": 5862 + }, + { + "epoch": 0.07330183254581364, + "grad_norm": 2.1196486949920654, + "learning_rate": 1.466e-05, + "loss": 1.092, + "step": 5864 + }, + { + "epoch": 0.07332683317082928, + "grad_norm": 3.6677112579345703, + "learning_rate": 1.4665000000000001e-05, + "loss": 1.5619, + "step": 5866 + }, + { + "epoch": 0.0733518337958449, + "grad_norm": 0.051397472620010376, + "learning_rate": 1.4670000000000002e-05, + "loss": 0.6941, + "step": 5868 + }, + { + "epoch": 0.07337683442086051, + "grad_norm": 4.445187568664551, + "learning_rate": 1.4675000000000001e-05, + "loss": 1.5119, + "step": 5870 + }, + { + "epoch": 0.07340183504587615, + "grad_norm": 3.3424665927886963, + "learning_rate": 1.4680000000000002e-05, + "loss": 0.5187, + "step": 5872 + }, + { + "epoch": 0.07342683567089177, + "grad_norm": 3.321091890335083, + "learning_rate": 1.4685000000000001e-05, + "loss": 1.6027, + "step": 5874 + }, + { + "epoch": 0.0734518362959074, + "grad_norm": 0.2316579669713974, + "learning_rate": 1.4690000000000002e-05, + "loss": 0.7533, + "step": 5876 + }, + { + "epoch": 0.07347683692092302, + "grad_norm": 4.258123874664307, + "learning_rate": 1.4695e-05, + "loss": 0.7121, + "step": 5878 + }, + { + "epoch": 0.07350183754593864, + "grad_norm": 3.577772378921509, + "learning_rate": 1.4700000000000002e-05, + "loss": 1.5375, + "step": 5880 + }, + { + "epoch": 0.07352683817095428, + "grad_norm": 5.136934757232666, + "learning_rate": 1.4705e-05, + "loss": 1.2144, + "step": 5882 + }, + { + "epoch": 0.0735518387959699, + "grad_norm": 4.69830846786499, + "learning_rate": 1.4710000000000001e-05, + "loss": 1.4851, + "step": 5884 + }, + { + "epoch": 0.07357683942098553, + "grad_norm": 8.356304168701172, + "learning_rate": 1.4715000000000002e-05, + "loss": 0.514, + "step": 5886 + }, + { + "epoch": 0.07360184004600115, + "grad_norm": 0.032104965299367905, + "learning_rate": 1.4720000000000001e-05, + "loss": 0.7359, + "step": 5888 + }, + { + "epoch": 0.07362684067101677, + "grad_norm": 4.6410064697265625, + "learning_rate": 1.4725e-05, + "loss": 0.5762, + "step": 5890 + }, + { + "epoch": 0.0736518412960324, + "grad_norm": 3.1130928993225098, + "learning_rate": 1.4730000000000001e-05, + "loss": 1.868, + "step": 5892 + }, + { + "epoch": 0.07367684192104802, + "grad_norm": 5.007471084594727, + "learning_rate": 1.4735000000000002e-05, + "loss": 1.6306, + "step": 5894 + }, + { + "epoch": 0.07370184254606366, + "grad_norm": 3.0867555141448975, + "learning_rate": 1.4740000000000001e-05, + "loss": 1.02, + "step": 5896 + }, + { + "epoch": 0.07372684317107928, + "grad_norm": 3.482889175415039, + "learning_rate": 1.4745e-05, + "loss": 1.0089, + "step": 5898 + }, + { + "epoch": 0.0737518437960949, + "grad_norm": 0.2667827308177948, + "learning_rate": 1.4750000000000003e-05, + "loss": 0.6005, + "step": 5900 + }, + { + "epoch": 0.07377684442111053, + "grad_norm": 3.1748530864715576, + "learning_rate": 1.4755000000000002e-05, + "loss": 1.0064, + "step": 5902 + }, + { + "epoch": 0.07380184504612615, + "grad_norm": 3.028831720352173, + "learning_rate": 1.4760000000000001e-05, + "loss": 0.6666, + "step": 5904 + }, + { + "epoch": 0.07382684567114178, + "grad_norm": 4.954610347747803, + "learning_rate": 1.4765e-05, + "loss": 1.1184, + "step": 5906 + }, + { + "epoch": 0.0738518462961574, + "grad_norm": 3.5273311138153076, + "learning_rate": 1.4770000000000003e-05, + "loss": 0.8731, + "step": 5908 + }, + { + "epoch": 0.07387684692117302, + "grad_norm": 0.1420164257287979, + "learning_rate": 1.4775000000000002e-05, + "loss": 0.853, + "step": 5910 + }, + { + "epoch": 0.07390184754618866, + "grad_norm": 3.1535794734954834, + "learning_rate": 1.478e-05, + "loss": 0.6217, + "step": 5912 + }, + { + "epoch": 0.07392684817120428, + "grad_norm": 3.815340995788574, + "learning_rate": 1.4785e-05, + "loss": 1.4746, + "step": 5914 + }, + { + "epoch": 0.07395184879621991, + "grad_norm": 3.8077311515808105, + "learning_rate": 1.4790000000000002e-05, + "loss": 1.0547, + "step": 5916 + }, + { + "epoch": 0.07397684942123553, + "grad_norm": 3.3903238773345947, + "learning_rate": 1.4795000000000001e-05, + "loss": 1.5612, + "step": 5918 + }, + { + "epoch": 0.07400185004625115, + "grad_norm": 3.136117696762085, + "learning_rate": 1.48e-05, + "loss": 0.9381, + "step": 5920 + }, + { + "epoch": 0.07402685067126678, + "grad_norm": 4.813618183135986, + "learning_rate": 1.4805e-05, + "loss": 0.8472, + "step": 5922 + }, + { + "epoch": 0.0740518512962824, + "grad_norm": 3.1439077854156494, + "learning_rate": 1.4810000000000002e-05, + "loss": 1.4143, + "step": 5924 + }, + { + "epoch": 0.07407685192129804, + "grad_norm": 0.4188769459724426, + "learning_rate": 1.4815000000000001e-05, + "loss": 0.599, + "step": 5926 + }, + { + "epoch": 0.07410185254631366, + "grad_norm": 2.7511422634124756, + "learning_rate": 1.482e-05, + "loss": 0.8247, + "step": 5928 + }, + { + "epoch": 0.07412685317132928, + "grad_norm": 2.182990789413452, + "learning_rate": 1.4825000000000001e-05, + "loss": 1.084, + "step": 5930 + }, + { + "epoch": 0.07415185379634491, + "grad_norm": 3.8788387775421143, + "learning_rate": 1.4830000000000002e-05, + "loss": 0.9928, + "step": 5932 + }, + { + "epoch": 0.07417685442136053, + "grad_norm": 3.4431381225585938, + "learning_rate": 1.4835000000000001e-05, + "loss": 1.2051, + "step": 5934 + }, + { + "epoch": 0.07420185504637616, + "grad_norm": 0.11806602030992508, + "learning_rate": 1.4840000000000002e-05, + "loss": 0.004, + "step": 5936 + }, + { + "epoch": 0.07422685567139178, + "grad_norm": 3.2315609455108643, + "learning_rate": 1.4845000000000001e-05, + "loss": 0.7603, + "step": 5938 + }, + { + "epoch": 0.0742518562964074, + "grad_norm": 4.073068618774414, + "learning_rate": 1.4850000000000002e-05, + "loss": 0.9691, + "step": 5940 + }, + { + "epoch": 0.07427685692142304, + "grad_norm": 2.956063985824585, + "learning_rate": 1.4855000000000001e-05, + "loss": 0.208, + "step": 5942 + }, + { + "epoch": 0.07430185754643866, + "grad_norm": 4.308083534240723, + "learning_rate": 1.4860000000000002e-05, + "loss": 1.0683, + "step": 5944 + }, + { + "epoch": 0.07432685817145429, + "grad_norm": 5.435683727264404, + "learning_rate": 1.4865e-05, + "loss": 0.9694, + "step": 5946 + }, + { + "epoch": 0.07435185879646991, + "grad_norm": 3.3087127208709717, + "learning_rate": 1.4870000000000002e-05, + "loss": 2.1427, + "step": 5948 + }, + { + "epoch": 0.07437685942148553, + "grad_norm": 2.3396358489990234, + "learning_rate": 1.4875000000000002e-05, + "loss": 1.3041, + "step": 5950 + }, + { + "epoch": 0.07440186004650116, + "grad_norm": 10.374894142150879, + "learning_rate": 1.4880000000000002e-05, + "loss": 1.8985, + "step": 5952 + }, + { + "epoch": 0.07442686067151678, + "grad_norm": 0.12714654207229614, + "learning_rate": 1.4885e-05, + "loss": 0.3657, + "step": 5954 + }, + { + "epoch": 0.07445186129653242, + "grad_norm": 3.3320324420928955, + "learning_rate": 1.4890000000000001e-05, + "loss": 1.7748, + "step": 5956 + }, + { + "epoch": 0.07447686192154804, + "grad_norm": 4.545119762420654, + "learning_rate": 1.4895000000000002e-05, + "loss": 0.6531, + "step": 5958 + }, + { + "epoch": 0.07450186254656366, + "grad_norm": 2.727966547012329, + "learning_rate": 1.4900000000000001e-05, + "loss": 1.0974, + "step": 5960 + }, + { + "epoch": 0.07452686317157929, + "grad_norm": 7.6952338218688965, + "learning_rate": 1.4905e-05, + "loss": 2.1752, + "step": 5962 + }, + { + "epoch": 0.07455186379659491, + "grad_norm": 4.202500343322754, + "learning_rate": 1.4910000000000003e-05, + "loss": 1.4689, + "step": 5964 + }, + { + "epoch": 0.07457686442161054, + "grad_norm": 0.045780714601278305, + "learning_rate": 1.4915000000000002e-05, + "loss": 0.8647, + "step": 5966 + }, + { + "epoch": 0.07460186504662616, + "grad_norm": 3.3521480560302734, + "learning_rate": 1.4920000000000001e-05, + "loss": 0.2877, + "step": 5968 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 3.4400413036346436, + "learning_rate": 1.4925e-05, + "loss": 0.0861, + "step": 5970 + }, + { + "epoch": 0.07465186629665742, + "grad_norm": 2.8373677730560303, + "learning_rate": 1.4930000000000003e-05, + "loss": 0.1916, + "step": 5972 + }, + { + "epoch": 0.07467686692167304, + "grad_norm": 7.7387824058532715, + "learning_rate": 1.4935000000000002e-05, + "loss": 2.1871, + "step": 5974 + }, + { + "epoch": 0.07470186754668867, + "grad_norm": 8.739375114440918, + "learning_rate": 1.4940000000000001e-05, + "loss": 1.9968, + "step": 5976 + }, + { + "epoch": 0.07472686817170429, + "grad_norm": 3.5557076930999756, + "learning_rate": 1.4945e-05, + "loss": 1.3066, + "step": 5978 + }, + { + "epoch": 0.07475186879671991, + "grad_norm": 2.896075963973999, + "learning_rate": 1.4950000000000003e-05, + "loss": 0.1363, + "step": 5980 + }, + { + "epoch": 0.07477686942173555, + "grad_norm": 3.46474552154541, + "learning_rate": 1.4955000000000002e-05, + "loss": 0.9555, + "step": 5982 + }, + { + "epoch": 0.07480187004675116, + "grad_norm": 2.725085496902466, + "learning_rate": 1.496e-05, + "loss": 0.6051, + "step": 5984 + }, + { + "epoch": 0.0748268706717668, + "grad_norm": 4.413187026977539, + "learning_rate": 1.4965e-05, + "loss": 0.7657, + "step": 5986 + }, + { + "epoch": 0.07485187129678242, + "grad_norm": 3.0320403575897217, + "learning_rate": 1.4970000000000002e-05, + "loss": 1.5269, + "step": 5988 + }, + { + "epoch": 0.07487687192179804, + "grad_norm": 4.004209041595459, + "learning_rate": 1.4975000000000001e-05, + "loss": 1.646, + "step": 5990 + }, + { + "epoch": 0.07490187254681367, + "grad_norm": 3.053811550140381, + "learning_rate": 1.498e-05, + "loss": 1.2853, + "step": 5992 + }, + { + "epoch": 0.07492687317182929, + "grad_norm": 2.278372049331665, + "learning_rate": 1.4985000000000001e-05, + "loss": 0.3758, + "step": 5994 + }, + { + "epoch": 0.07495187379684493, + "grad_norm": 4.919025897979736, + "learning_rate": 1.4990000000000002e-05, + "loss": 1.3494, + "step": 5996 + }, + { + "epoch": 0.07497687442186055, + "grad_norm": 0.08646853268146515, + "learning_rate": 1.4995000000000001e-05, + "loss": 0.6892, + "step": 5998 + }, + { + "epoch": 0.07500187504687617, + "grad_norm": 3.544647216796875, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.8409, + "step": 6000 + }, + { + "epoch": 0.0750268756718918, + "grad_norm": 2.7956013679504395, + "learning_rate": 1.5005000000000001e-05, + "loss": 1.6288, + "step": 6002 + }, + { + "epoch": 0.07505187629690742, + "grad_norm": 5.264979362487793, + "learning_rate": 1.501e-05, + "loss": 1.3765, + "step": 6004 + }, + { + "epoch": 0.07507687692192305, + "grad_norm": 2.2031474113464355, + "learning_rate": 1.5015000000000001e-05, + "loss": 0.0927, + "step": 6006 + }, + { + "epoch": 0.07510187754693867, + "grad_norm": 2.61281418800354, + "learning_rate": 1.5020000000000002e-05, + "loss": 1.2137, + "step": 6008 + }, + { + "epoch": 0.07512687817195429, + "grad_norm": 0.028922587633132935, + "learning_rate": 1.5025000000000001e-05, + "loss": 0.0006, + "step": 6010 + }, + { + "epoch": 0.07515187879696993, + "grad_norm": 3.5583744049072266, + "learning_rate": 1.503e-05, + "loss": 1.4721, + "step": 6012 + }, + { + "epoch": 0.07517687942198555, + "grad_norm": 2.184542417526245, + "learning_rate": 1.5035000000000003e-05, + "loss": 1.0101, + "step": 6014 + }, + { + "epoch": 0.07520188004700118, + "grad_norm": 4.942892074584961, + "learning_rate": 1.5040000000000002e-05, + "loss": 2.1769, + "step": 6016 + }, + { + "epoch": 0.0752268806720168, + "grad_norm": 2.8130035400390625, + "learning_rate": 1.5045e-05, + "loss": 0.8208, + "step": 6018 + }, + { + "epoch": 0.07525188129703242, + "grad_norm": 4.060965538024902, + "learning_rate": 1.505e-05, + "loss": 1.3758, + "step": 6020 + }, + { + "epoch": 0.07527688192204805, + "grad_norm": 0.12423186004161835, + "learning_rate": 1.5055000000000002e-05, + "loss": 0.2131, + "step": 6022 + }, + { + "epoch": 0.07530188254706367, + "grad_norm": 6.45147180557251, + "learning_rate": 1.5060000000000001e-05, + "loss": 0.4042, + "step": 6024 + }, + { + "epoch": 0.0753268831720793, + "grad_norm": 3.3085505962371826, + "learning_rate": 1.5065e-05, + "loss": 0.9275, + "step": 6026 + }, + { + "epoch": 0.07535188379709493, + "grad_norm": 3.148190975189209, + "learning_rate": 1.507e-05, + "loss": 1.3209, + "step": 6028 + }, + { + "epoch": 0.07537688442211055, + "grad_norm": 3.370033025741577, + "learning_rate": 1.5075000000000002e-05, + "loss": 0.7335, + "step": 6030 + }, + { + "epoch": 0.07540188504712618, + "grad_norm": 2.6522223949432373, + "learning_rate": 1.5080000000000001e-05, + "loss": 0.8306, + "step": 6032 + }, + { + "epoch": 0.0754268856721418, + "grad_norm": 0.09691770374774933, + "learning_rate": 1.5085e-05, + "loss": 0.4057, + "step": 6034 + }, + { + "epoch": 0.07545188629715743, + "grad_norm": 5.054294109344482, + "learning_rate": 1.509e-05, + "loss": 1.1893, + "step": 6036 + }, + { + "epoch": 0.07547688692217305, + "grad_norm": 3.7341854572296143, + "learning_rate": 1.5095000000000002e-05, + "loss": 0.3307, + "step": 6038 + }, + { + "epoch": 0.07550188754718867, + "grad_norm": 3.1876964569091797, + "learning_rate": 1.5100000000000001e-05, + "loss": 0.7923, + "step": 6040 + }, + { + "epoch": 0.0755268881722043, + "grad_norm": 2.296184539794922, + "learning_rate": 1.5105e-05, + "loss": 1.188, + "step": 6042 + }, + { + "epoch": 0.07555188879721993, + "grad_norm": 0.012538298964500427, + "learning_rate": 1.5110000000000001e-05, + "loss": 0.9464, + "step": 6044 + }, + { + "epoch": 0.07557688942223556, + "grad_norm": 2.6427247524261475, + "learning_rate": 1.5115000000000002e-05, + "loss": 0.4787, + "step": 6046 + }, + { + "epoch": 0.07560189004725118, + "grad_norm": 3.390165328979492, + "learning_rate": 1.5120000000000001e-05, + "loss": 0.7748, + "step": 6048 + }, + { + "epoch": 0.0756268906722668, + "grad_norm": 2.5342037677764893, + "learning_rate": 1.5125e-05, + "loss": 1.2928, + "step": 6050 + }, + { + "epoch": 0.07565189129728243, + "grad_norm": 7.746935844421387, + "learning_rate": 1.513e-05, + "loss": 0.6496, + "step": 6052 + }, + { + "epoch": 0.07567689192229805, + "grad_norm": 0.0427289716899395, + "learning_rate": 1.5135000000000002e-05, + "loss": 0.3811, + "step": 6054 + }, + { + "epoch": 0.07570189254731369, + "grad_norm": 2.3833463191986084, + "learning_rate": 1.514e-05, + "loss": 0.2377, + "step": 6056 + }, + { + "epoch": 0.07572689317232931, + "grad_norm": 3.0820822715759277, + "learning_rate": 1.5145000000000002e-05, + "loss": 1.0981, + "step": 6058 + }, + { + "epoch": 0.07575189379734493, + "grad_norm": 0.2922348380088806, + "learning_rate": 1.515e-05, + "loss": 0.0103, + "step": 6060 + }, + { + "epoch": 0.07577689442236056, + "grad_norm": 4.703051567077637, + "learning_rate": 1.5155000000000001e-05, + "loss": 0.8068, + "step": 6062 + }, + { + "epoch": 0.07580189504737618, + "grad_norm": 2.8563764095306396, + "learning_rate": 1.516e-05, + "loss": 1.1758, + "step": 6064 + }, + { + "epoch": 0.07582689567239181, + "grad_norm": 3.361607074737549, + "learning_rate": 1.5165000000000001e-05, + "loss": 0.575, + "step": 6066 + }, + { + "epoch": 0.07585189629740743, + "grad_norm": 3.5790507793426514, + "learning_rate": 1.517e-05, + "loss": 0.8671, + "step": 6068 + }, + { + "epoch": 0.07587689692242305, + "grad_norm": 7.891122341156006, + "learning_rate": 1.5175000000000001e-05, + "loss": 1.3703, + "step": 6070 + }, + { + "epoch": 0.07590189754743869, + "grad_norm": 0.4331344664096832, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.2069, + "step": 6072 + }, + { + "epoch": 0.07592689817245431, + "grad_norm": 1.1609762907028198, + "learning_rate": 1.5185000000000001e-05, + "loss": 0.5667, + "step": 6074 + }, + { + "epoch": 0.07595189879746994, + "grad_norm": 2.8923988342285156, + "learning_rate": 1.519e-05, + "loss": 1.3453, + "step": 6076 + }, + { + "epoch": 0.07597689942248556, + "grad_norm": 5.0222368240356445, + "learning_rate": 1.5195000000000003e-05, + "loss": 1.8477, + "step": 6078 + }, + { + "epoch": 0.07600190004750118, + "grad_norm": 8.969088554382324, + "learning_rate": 1.5200000000000002e-05, + "loss": 1.3866, + "step": 6080 + }, + { + "epoch": 0.07602690067251681, + "grad_norm": 5.152365684509277, + "learning_rate": 1.5205000000000001e-05, + "loss": 0.5286, + "step": 6082 + }, + { + "epoch": 0.07605190129753243, + "grad_norm": 1.8638427257537842, + "learning_rate": 1.521e-05, + "loss": 0.5428, + "step": 6084 + }, + { + "epoch": 0.07607690192254807, + "grad_norm": 3.1192855834960938, + "learning_rate": 1.5215000000000003e-05, + "loss": 1.7928, + "step": 6086 + }, + { + "epoch": 0.07610190254756369, + "grad_norm": 4.345695972442627, + "learning_rate": 1.5220000000000002e-05, + "loss": 1.1877, + "step": 6088 + }, + { + "epoch": 0.07612690317257931, + "grad_norm": 3.2302043437957764, + "learning_rate": 1.5225e-05, + "loss": 1.7727, + "step": 6090 + }, + { + "epoch": 0.07615190379759494, + "grad_norm": 6.252063751220703, + "learning_rate": 1.523e-05, + "loss": 0.8505, + "step": 6092 + }, + { + "epoch": 0.07617690442261056, + "grad_norm": 1.5384441614151, + "learning_rate": 1.5235000000000002e-05, + "loss": 0.9103, + "step": 6094 + }, + { + "epoch": 0.0762019050476262, + "grad_norm": 4.179102897644043, + "learning_rate": 1.5240000000000001e-05, + "loss": 2.1592, + "step": 6096 + }, + { + "epoch": 0.07622690567264181, + "grad_norm": 6.683974266052246, + "learning_rate": 1.5245e-05, + "loss": 1.1649, + "step": 6098 + }, + { + "epoch": 0.07625190629765743, + "grad_norm": 3.4672892093658447, + "learning_rate": 1.525e-05, + "loss": 1.4177, + "step": 6100 + }, + { + "epoch": 0.07627690692267307, + "grad_norm": 3.339707612991333, + "learning_rate": 1.5255000000000002e-05, + "loss": 0.1301, + "step": 6102 + }, + { + "epoch": 0.07630190754768869, + "grad_norm": 4.155212879180908, + "learning_rate": 1.5260000000000003e-05, + "loss": 1.152, + "step": 6104 + }, + { + "epoch": 0.07632690817270432, + "grad_norm": 0.4503400921821594, + "learning_rate": 1.5265e-05, + "loss": 0.6763, + "step": 6106 + }, + { + "epoch": 0.07635190879771994, + "grad_norm": 0.2600701153278351, + "learning_rate": 1.527e-05, + "loss": 0.7257, + "step": 6108 + }, + { + "epoch": 0.07637690942273556, + "grad_norm": 2.4424455165863037, + "learning_rate": 1.5275000000000002e-05, + "loss": 0.5661, + "step": 6110 + }, + { + "epoch": 0.0764019100477512, + "grad_norm": 9.703751564025879, + "learning_rate": 1.5280000000000003e-05, + "loss": 0.764, + "step": 6112 + }, + { + "epoch": 0.07642691067276682, + "grad_norm": 2.1242566108703613, + "learning_rate": 1.5285e-05, + "loss": 0.8592, + "step": 6114 + }, + { + "epoch": 0.07645191129778245, + "grad_norm": 3.7471985816955566, + "learning_rate": 1.529e-05, + "loss": 1.3415, + "step": 6116 + }, + { + "epoch": 0.07647691192279807, + "grad_norm": 0.015084381215274334, + "learning_rate": 1.5295000000000002e-05, + "loss": 0.4322, + "step": 6118 + }, + { + "epoch": 0.07650191254781369, + "grad_norm": 7.697505474090576, + "learning_rate": 1.5300000000000003e-05, + "loss": 2.6292, + "step": 6120 + }, + { + "epoch": 0.07652691317282932, + "grad_norm": 11.032160758972168, + "learning_rate": 1.5305e-05, + "loss": 1.8567, + "step": 6122 + }, + { + "epoch": 0.07655191379784494, + "grad_norm": 1.749113917350769, + "learning_rate": 1.531e-05, + "loss": 1.3934, + "step": 6124 + }, + { + "epoch": 0.07657691442286058, + "grad_norm": 2.1604065895080566, + "learning_rate": 1.5315e-05, + "loss": 1.2506, + "step": 6126 + }, + { + "epoch": 0.0766019150478762, + "grad_norm": 7.055737018585205, + "learning_rate": 1.5320000000000002e-05, + "loss": 0.9067, + "step": 6128 + }, + { + "epoch": 0.07662691567289182, + "grad_norm": 3.1871376037597656, + "learning_rate": 1.5325e-05, + "loss": 1.7996, + "step": 6130 + }, + { + "epoch": 0.07665191629790745, + "grad_norm": 5.799600601196289, + "learning_rate": 1.533e-05, + "loss": 2.2364, + "step": 6132 + }, + { + "epoch": 0.07667691692292307, + "grad_norm": 3.644379138946533, + "learning_rate": 1.5335e-05, + "loss": 0.7141, + "step": 6134 + }, + { + "epoch": 0.0767019175479387, + "grad_norm": 5.719942569732666, + "learning_rate": 1.5340000000000002e-05, + "loss": 1.8569, + "step": 6136 + }, + { + "epoch": 0.07672691817295432, + "grad_norm": 5.468663215637207, + "learning_rate": 1.5345e-05, + "loss": 1.7933, + "step": 6138 + }, + { + "epoch": 0.07675191879796994, + "grad_norm": 4.230454921722412, + "learning_rate": 1.535e-05, + "loss": 0.604, + "step": 6140 + }, + { + "epoch": 0.07677691942298558, + "grad_norm": 3.1101818084716797, + "learning_rate": 1.5355e-05, + "loss": 1.8381, + "step": 6142 + }, + { + "epoch": 0.0768019200480012, + "grad_norm": 2.2420382499694824, + "learning_rate": 1.5360000000000002e-05, + "loss": 1.3552, + "step": 6144 + }, + { + "epoch": 0.07682692067301683, + "grad_norm": 4.930817127227783, + "learning_rate": 1.5365e-05, + "loss": 1.0417, + "step": 6146 + }, + { + "epoch": 0.07685192129803245, + "grad_norm": 4.254498481750488, + "learning_rate": 1.537e-05, + "loss": 1.2607, + "step": 6148 + }, + { + "epoch": 0.07687692192304807, + "grad_norm": 2.256685972213745, + "learning_rate": 1.5375e-05, + "loss": 0.8668, + "step": 6150 + }, + { + "epoch": 0.0769019225480637, + "grad_norm": 3.0890865325927734, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.5606, + "step": 6152 + }, + { + "epoch": 0.07692692317307932, + "grad_norm": 7.646145820617676, + "learning_rate": 1.5385000000000003e-05, + "loss": 1.811, + "step": 6154 + }, + { + "epoch": 0.07695192379809496, + "grad_norm": 4.783312797546387, + "learning_rate": 1.539e-05, + "loss": 1.2321, + "step": 6156 + }, + { + "epoch": 0.07697692442311058, + "grad_norm": 2.411703109741211, + "learning_rate": 1.5395e-05, + "loss": 0.6426, + "step": 6158 + }, + { + "epoch": 0.0770019250481262, + "grad_norm": 2.5621275901794434, + "learning_rate": 1.54e-05, + "loss": 0.7563, + "step": 6160 + }, + { + "epoch": 0.07702692567314183, + "grad_norm": 2.152022361755371, + "learning_rate": 1.5405000000000002e-05, + "loss": 1.5062, + "step": 6162 + }, + { + "epoch": 0.07705192629815745, + "grad_norm": 3.8227155208587646, + "learning_rate": 1.541e-05, + "loss": 1.6294, + "step": 6164 + }, + { + "epoch": 0.07707692692317308, + "grad_norm": 4.226044654846191, + "learning_rate": 1.5415e-05, + "loss": 1.2288, + "step": 6166 + }, + { + "epoch": 0.0771019275481887, + "grad_norm": 4.545937538146973, + "learning_rate": 1.542e-05, + "loss": 0.872, + "step": 6168 + }, + { + "epoch": 0.07712692817320432, + "grad_norm": 14.892423629760742, + "learning_rate": 1.5425000000000002e-05, + "loss": 1.7063, + "step": 6170 + }, + { + "epoch": 0.07715192879821996, + "grad_norm": 0.10552778840065002, + "learning_rate": 1.543e-05, + "loss": 0.3441, + "step": 6172 + }, + { + "epoch": 0.07717692942323558, + "grad_norm": 3.6074442863464355, + "learning_rate": 1.5435000000000004e-05, + "loss": 1.3687, + "step": 6174 + }, + { + "epoch": 0.07720193004825121, + "grad_norm": 2.8173608779907227, + "learning_rate": 1.544e-05, + "loss": 0.6689, + "step": 6176 + }, + { + "epoch": 0.07722693067326683, + "grad_norm": 2.8197968006134033, + "learning_rate": 1.5445000000000002e-05, + "loss": 1.2121, + "step": 6178 + }, + { + "epoch": 0.07725193129828245, + "grad_norm": 5.062067985534668, + "learning_rate": 1.545e-05, + "loss": 0.9883, + "step": 6180 + }, + { + "epoch": 0.07727693192329808, + "grad_norm": 2.835326910018921, + "learning_rate": 1.5455000000000004e-05, + "loss": 1.0989, + "step": 6182 + }, + { + "epoch": 0.0773019325483137, + "grad_norm": 0.027962181717157364, + "learning_rate": 1.546e-05, + "loss": 1.0143, + "step": 6184 + }, + { + "epoch": 0.07732693317332934, + "grad_norm": 3.0645227432250977, + "learning_rate": 1.5465000000000002e-05, + "loss": 1.0751, + "step": 6186 + }, + { + "epoch": 0.07735193379834496, + "grad_norm": 2.149766445159912, + "learning_rate": 1.547e-05, + "loss": 0.0895, + "step": 6188 + }, + { + "epoch": 0.07737693442336058, + "grad_norm": 0.0683823898434639, + "learning_rate": 1.5475000000000003e-05, + "loss": 0.872, + "step": 6190 + }, + { + "epoch": 0.07740193504837621, + "grad_norm": 6.41599702835083, + "learning_rate": 1.548e-05, + "loss": 2.921, + "step": 6192 + }, + { + "epoch": 0.07742693567339183, + "grad_norm": 5.107203006744385, + "learning_rate": 1.5485e-05, + "loss": 1.408, + "step": 6194 + }, + { + "epoch": 0.07745193629840746, + "grad_norm": 1.4706990718841553, + "learning_rate": 1.549e-05, + "loss": 0.846, + "step": 6196 + }, + { + "epoch": 0.07747693692342308, + "grad_norm": 3.7693967819213867, + "learning_rate": 1.5495000000000003e-05, + "loss": 2.2226, + "step": 6198 + }, + { + "epoch": 0.0775019375484387, + "grad_norm": 3.5036683082580566, + "learning_rate": 1.55e-05, + "loss": 1.9318, + "step": 6200 + }, + { + "epoch": 0.07752693817345434, + "grad_norm": 5.316382884979248, + "learning_rate": 1.5505e-05, + "loss": 1.9147, + "step": 6202 + }, + { + "epoch": 0.07755193879846996, + "grad_norm": 3.0784902572631836, + "learning_rate": 1.5510000000000002e-05, + "loss": 0.5625, + "step": 6204 + }, + { + "epoch": 0.07757693942348559, + "grad_norm": 0.05599392205476761, + "learning_rate": 1.5515000000000003e-05, + "loss": 0.0024, + "step": 6206 + }, + { + "epoch": 0.07760194004850121, + "grad_norm": 2.859574317932129, + "learning_rate": 1.552e-05, + "loss": 1.1279, + "step": 6208 + }, + { + "epoch": 0.07762694067351683, + "grad_norm": 3.9155640602111816, + "learning_rate": 1.5525e-05, + "loss": 0.9673, + "step": 6210 + }, + { + "epoch": 0.07765194129853246, + "grad_norm": 4.873098373413086, + "learning_rate": 1.5530000000000002e-05, + "loss": 1.2949, + "step": 6212 + }, + { + "epoch": 0.07767694192354808, + "grad_norm": 3.5928127765655518, + "learning_rate": 1.5535000000000003e-05, + "loss": 0.9229, + "step": 6214 + }, + { + "epoch": 0.07770194254856372, + "grad_norm": 5.935606002807617, + "learning_rate": 1.554e-05, + "loss": 0.4339, + "step": 6216 + }, + { + "epoch": 0.07772694317357934, + "grad_norm": 4.660245418548584, + "learning_rate": 1.5545e-05, + "loss": 0.904, + "step": 6218 + }, + { + "epoch": 0.07775194379859496, + "grad_norm": 4.004770278930664, + "learning_rate": 1.5550000000000002e-05, + "loss": 1.0871, + "step": 6220 + }, + { + "epoch": 0.07777694442361059, + "grad_norm": 3.863802909851074, + "learning_rate": 1.5555000000000003e-05, + "loss": 1.7261, + "step": 6222 + }, + { + "epoch": 0.07780194504862621, + "grad_norm": 1.5066455602645874, + "learning_rate": 1.556e-05, + "loss": 0.0582, + "step": 6224 + }, + { + "epoch": 0.07782694567364185, + "grad_norm": 4.890381336212158, + "learning_rate": 1.5565e-05, + "loss": 1.4275, + "step": 6226 + }, + { + "epoch": 0.07785194629865747, + "grad_norm": 2.226935863494873, + "learning_rate": 1.5570000000000002e-05, + "loss": 0.4586, + "step": 6228 + }, + { + "epoch": 0.07787694692367308, + "grad_norm": 3.858130931854248, + "learning_rate": 1.5575000000000002e-05, + "loss": 0.7614, + "step": 6230 + }, + { + "epoch": 0.07790194754868872, + "grad_norm": 0.951988697052002, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.4724, + "step": 6232 + }, + { + "epoch": 0.07792694817370434, + "grad_norm": 2.5042572021484375, + "learning_rate": 1.5585e-05, + "loss": 0.5012, + "step": 6234 + }, + { + "epoch": 0.07795194879871997, + "grad_norm": 2.2088897228240967, + "learning_rate": 1.559e-05, + "loss": 2.1518, + "step": 6236 + }, + { + "epoch": 0.07797694942373559, + "grad_norm": 30.11112403869629, + "learning_rate": 1.5595000000000002e-05, + "loss": 1.3875, + "step": 6238 + }, + { + "epoch": 0.07800195004875121, + "grad_norm": 3.098590135574341, + "learning_rate": 1.5600000000000003e-05, + "loss": 0.6225, + "step": 6240 + }, + { + "epoch": 0.07802695067376685, + "grad_norm": 4.833183765411377, + "learning_rate": 1.5605e-05, + "loss": 1.5615, + "step": 6242 + }, + { + "epoch": 0.07805195129878247, + "grad_norm": 4.36461067199707, + "learning_rate": 1.561e-05, + "loss": 1.4023, + "step": 6244 + }, + { + "epoch": 0.0780769519237981, + "grad_norm": 4.379441261291504, + "learning_rate": 1.5615000000000002e-05, + "loss": 2.1073, + "step": 6246 + }, + { + "epoch": 0.07810195254881372, + "grad_norm": 4.185563087463379, + "learning_rate": 1.5620000000000003e-05, + "loss": 0.9592, + "step": 6248 + }, + { + "epoch": 0.07812695317382935, + "grad_norm": 2.4602577686309814, + "learning_rate": 1.5625e-05, + "loss": 1.2154, + "step": 6250 + }, + { + "epoch": 0.07815195379884497, + "grad_norm": 7.7013654708862305, + "learning_rate": 1.563e-05, + "loss": 0.5753, + "step": 6252 + }, + { + "epoch": 0.07817695442386059, + "grad_norm": 3.1141018867492676, + "learning_rate": 1.5635e-05, + "loss": 1.0214, + "step": 6254 + }, + { + "epoch": 0.07820195504887623, + "grad_norm": 3.7479677200317383, + "learning_rate": 1.5640000000000003e-05, + "loss": 0.2769, + "step": 6256 + }, + { + "epoch": 0.07822695567389185, + "grad_norm": 2.774585008621216, + "learning_rate": 1.5645e-05, + "loss": 0.6071, + "step": 6258 + }, + { + "epoch": 0.07825195629890748, + "grad_norm": 4.797704696655273, + "learning_rate": 1.565e-05, + "loss": 1.2603, + "step": 6260 + }, + { + "epoch": 0.0782769569239231, + "grad_norm": 2.571739435195923, + "learning_rate": 1.5655000000000002e-05, + "loss": 0.5542, + "step": 6262 + }, + { + "epoch": 0.07830195754893872, + "grad_norm": 10.62350845336914, + "learning_rate": 1.5660000000000003e-05, + "loss": 2.3362, + "step": 6264 + }, + { + "epoch": 0.07832695817395435, + "grad_norm": 3.7026870250701904, + "learning_rate": 1.5665e-05, + "loss": 0.7087, + "step": 6266 + }, + { + "epoch": 0.07835195879896997, + "grad_norm": 1.3013445138931274, + "learning_rate": 1.567e-05, + "loss": 0.3439, + "step": 6268 + }, + { + "epoch": 0.0783769594239856, + "grad_norm": 0.308180570602417, + "learning_rate": 1.5675e-05, + "loss": 0.0413, + "step": 6270 + }, + { + "epoch": 0.07840196004900123, + "grad_norm": 3.018742561340332, + "learning_rate": 1.5680000000000002e-05, + "loss": 1.1087, + "step": 6272 + }, + { + "epoch": 0.07842696067401685, + "grad_norm": 0.0675542950630188, + "learning_rate": 1.5685e-05, + "loss": 0.0965, + "step": 6274 + }, + { + "epoch": 0.07845196129903248, + "grad_norm": 5.797464370727539, + "learning_rate": 1.569e-05, + "loss": 1.4613, + "step": 6276 + }, + { + "epoch": 0.0784769619240481, + "grad_norm": 1.0968716144561768, + "learning_rate": 1.5695e-05, + "loss": 0.5427, + "step": 6278 + }, + { + "epoch": 0.07850196254906373, + "grad_norm": 4.376802444458008, + "learning_rate": 1.5700000000000002e-05, + "loss": 1.0427, + "step": 6280 + }, + { + "epoch": 0.07852696317407935, + "grad_norm": 2.604853868484497, + "learning_rate": 1.5705000000000003e-05, + "loss": 0.9951, + "step": 6282 + }, + { + "epoch": 0.07855196379909497, + "grad_norm": 4.389036655426025, + "learning_rate": 1.571e-05, + "loss": 1.6237, + "step": 6284 + }, + { + "epoch": 0.0785769644241106, + "grad_norm": 3.9546377658843994, + "learning_rate": 1.5715e-05, + "loss": 1.6091, + "step": 6286 + }, + { + "epoch": 0.07860196504912623, + "grad_norm": 0.02036736160516739, + "learning_rate": 1.5720000000000002e-05, + "loss": 0.2431, + "step": 6288 + }, + { + "epoch": 0.07862696567414186, + "grad_norm": 0.018569951876997948, + "learning_rate": 1.5725000000000003e-05, + "loss": 0.7315, + "step": 6290 + }, + { + "epoch": 0.07865196629915748, + "grad_norm": 3.2841551303863525, + "learning_rate": 1.573e-05, + "loss": 0.3822, + "step": 6292 + }, + { + "epoch": 0.0786769669241731, + "grad_norm": 0.025823678821325302, + "learning_rate": 1.5735e-05, + "loss": 1.7328, + "step": 6294 + }, + { + "epoch": 0.07870196754918873, + "grad_norm": 2.515458583831787, + "learning_rate": 1.5740000000000002e-05, + "loss": 0.9923, + "step": 6296 + }, + { + "epoch": 0.07872696817420435, + "grad_norm": 3.306114435195923, + "learning_rate": 1.5745000000000003e-05, + "loss": 0.8076, + "step": 6298 + }, + { + "epoch": 0.07875196879921999, + "grad_norm": 3.69868540763855, + "learning_rate": 1.575e-05, + "loss": 0.9459, + "step": 6300 + }, + { + "epoch": 0.07877696942423561, + "grad_norm": 3.4497945308685303, + "learning_rate": 1.5755e-05, + "loss": 1.1786, + "step": 6302 + }, + { + "epoch": 0.07880197004925123, + "grad_norm": 3.1598050594329834, + "learning_rate": 1.576e-05, + "loss": 1.3883, + "step": 6304 + }, + { + "epoch": 0.07882697067426686, + "grad_norm": 0.23641575872898102, + "learning_rate": 1.5765000000000002e-05, + "loss": 0.6576, + "step": 6306 + }, + { + "epoch": 0.07885197129928248, + "grad_norm": 2.7914092540740967, + "learning_rate": 1.577e-05, + "loss": 0.8635, + "step": 6308 + }, + { + "epoch": 0.07887697192429811, + "grad_norm": 3.2488434314727783, + "learning_rate": 1.5775e-05, + "loss": 1.2029, + "step": 6310 + }, + { + "epoch": 0.07890197254931373, + "grad_norm": 3.3841006755828857, + "learning_rate": 1.578e-05, + "loss": 1.2734, + "step": 6312 + }, + { + "epoch": 0.07892697317432935, + "grad_norm": 3.5751867294311523, + "learning_rate": 1.5785000000000002e-05, + "loss": 0.6574, + "step": 6314 + }, + { + "epoch": 0.07895197379934499, + "grad_norm": 3.980923652648926, + "learning_rate": 1.579e-05, + "loss": 1.3991, + "step": 6316 + }, + { + "epoch": 0.07897697442436061, + "grad_norm": 5.047551155090332, + "learning_rate": 1.5795e-05, + "loss": 0.4595, + "step": 6318 + }, + { + "epoch": 0.07900197504937624, + "grad_norm": 3.836648464202881, + "learning_rate": 1.58e-05, + "loss": 1.3433, + "step": 6320 + }, + { + "epoch": 0.07902697567439186, + "grad_norm": 2.9060566425323486, + "learning_rate": 1.5805000000000002e-05, + "loss": 1.5657, + "step": 6322 + }, + { + "epoch": 0.07905197629940748, + "grad_norm": 0.18733999133110046, + "learning_rate": 1.581e-05, + "loss": 0.6729, + "step": 6324 + }, + { + "epoch": 0.07907697692442311, + "grad_norm": 2.2480738162994385, + "learning_rate": 1.5815e-05, + "loss": 0.4053, + "step": 6326 + }, + { + "epoch": 0.07910197754943873, + "grad_norm": 4.179388999938965, + "learning_rate": 1.582e-05, + "loss": 1.0192, + "step": 6328 + }, + { + "epoch": 0.07912697817445437, + "grad_norm": 0.32367587089538574, + "learning_rate": 1.5825000000000002e-05, + "loss": 0.0491, + "step": 6330 + }, + { + "epoch": 0.07915197879946999, + "grad_norm": 7.667989730834961, + "learning_rate": 1.5830000000000003e-05, + "loss": 0.6016, + "step": 6332 + }, + { + "epoch": 0.07917697942448561, + "grad_norm": 2.558100700378418, + "learning_rate": 1.5835e-05, + "loss": 0.9878, + "step": 6334 + }, + { + "epoch": 0.07920198004950124, + "grad_norm": 5.470254898071289, + "learning_rate": 1.584e-05, + "loss": 0.3758, + "step": 6336 + }, + { + "epoch": 0.07922698067451686, + "grad_norm": 4.124131202697754, + "learning_rate": 1.5845e-05, + "loss": 2.0469, + "step": 6338 + }, + { + "epoch": 0.0792519812995325, + "grad_norm": 6.230991840362549, + "learning_rate": 1.5850000000000002e-05, + "loss": 1.9266, + "step": 6340 + }, + { + "epoch": 0.07927698192454811, + "grad_norm": 3.0132901668548584, + "learning_rate": 1.5855e-05, + "loss": 0.7466, + "step": 6342 + }, + { + "epoch": 0.07930198254956373, + "grad_norm": 6.468095779418945, + "learning_rate": 1.586e-05, + "loss": 0.2272, + "step": 6344 + }, + { + "epoch": 0.07932698317457937, + "grad_norm": 3.477569341659546, + "learning_rate": 1.5865e-05, + "loss": 0.2923, + "step": 6346 + }, + { + "epoch": 0.07935198379959499, + "grad_norm": 0.019682278856635094, + "learning_rate": 1.5870000000000002e-05, + "loss": 1.0693, + "step": 6348 + }, + { + "epoch": 0.07937698442461062, + "grad_norm": 0.6765910983085632, + "learning_rate": 1.5875e-05, + "loss": 0.0071, + "step": 6350 + }, + { + "epoch": 0.07940198504962624, + "grad_norm": 2.9775266647338867, + "learning_rate": 1.588e-05, + "loss": 1.4305, + "step": 6352 + }, + { + "epoch": 0.07942698567464186, + "grad_norm": 4.8738908767700195, + "learning_rate": 1.5885e-05, + "loss": 1.508, + "step": 6354 + }, + { + "epoch": 0.0794519862996575, + "grad_norm": 3.5635790824890137, + "learning_rate": 1.5890000000000002e-05, + "loss": 1.4775, + "step": 6356 + }, + { + "epoch": 0.07947698692467312, + "grad_norm": 3.3281376361846924, + "learning_rate": 1.5895e-05, + "loss": 1.2089, + "step": 6358 + }, + { + "epoch": 0.07950198754968875, + "grad_norm": 0.008561245165765285, + "learning_rate": 1.5900000000000004e-05, + "loss": 1.0754, + "step": 6360 + }, + { + "epoch": 0.07952698817470437, + "grad_norm": 0.9843417406082153, + "learning_rate": 1.5905e-05, + "loss": 0.5946, + "step": 6362 + }, + { + "epoch": 0.07955198879971999, + "grad_norm": 2.5440492630004883, + "learning_rate": 1.5910000000000002e-05, + "loss": 0.6426, + "step": 6364 + }, + { + "epoch": 0.07957698942473562, + "grad_norm": 1.508498191833496, + "learning_rate": 1.5915e-05, + "loss": 1.116, + "step": 6366 + }, + { + "epoch": 0.07960199004975124, + "grad_norm": 7.689567565917969, + "learning_rate": 1.5920000000000003e-05, + "loss": 2.5106, + "step": 6368 + }, + { + "epoch": 0.07962699067476688, + "grad_norm": 0.12907710671424866, + "learning_rate": 1.5925e-05, + "loss": 0.6317, + "step": 6370 + }, + { + "epoch": 0.0796519912997825, + "grad_norm": 1.831013560295105, + "learning_rate": 1.593e-05, + "loss": 0.1654, + "step": 6372 + }, + { + "epoch": 0.07967699192479812, + "grad_norm": 2.6658225059509277, + "learning_rate": 1.5935e-05, + "loss": 0.741, + "step": 6374 + }, + { + "epoch": 0.07970199254981375, + "grad_norm": 0.009507155045866966, + "learning_rate": 1.5940000000000003e-05, + "loss": 0.0721, + "step": 6376 + }, + { + "epoch": 0.07972699317482937, + "grad_norm": 6.320855140686035, + "learning_rate": 1.5945e-05, + "loss": 1.1953, + "step": 6378 + }, + { + "epoch": 0.079751993799845, + "grad_norm": 3.9747002124786377, + "learning_rate": 1.595e-05, + "loss": 0.9312, + "step": 6380 + }, + { + "epoch": 0.07977699442486062, + "grad_norm": 2.3284220695495605, + "learning_rate": 1.5955e-05, + "loss": 1.2267, + "step": 6382 + }, + { + "epoch": 0.07980199504987624, + "grad_norm": 4.898107528686523, + "learning_rate": 1.5960000000000003e-05, + "loss": 0.3165, + "step": 6384 + }, + { + "epoch": 0.07982699567489188, + "grad_norm": 0.010411192663013935, + "learning_rate": 1.5965e-05, + "loss": 0.0002, + "step": 6386 + }, + { + "epoch": 0.0798519962999075, + "grad_norm": 2.5484139919281006, + "learning_rate": 1.597e-05, + "loss": 0.9296, + "step": 6388 + }, + { + "epoch": 0.07987699692492313, + "grad_norm": 2.256868362426758, + "learning_rate": 1.5975000000000002e-05, + "loss": 0.4126, + "step": 6390 + }, + { + "epoch": 0.07990199754993875, + "grad_norm": 4.4089884757995605, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.6982, + "step": 6392 + }, + { + "epoch": 0.07992699817495437, + "grad_norm": 5.786546230316162, + "learning_rate": 1.5985e-05, + "loss": 1.478, + "step": 6394 + }, + { + "epoch": 0.07995199879997, + "grad_norm": 0.007963348180055618, + "learning_rate": 1.599e-05, + "loss": 0.6659, + "step": 6396 + }, + { + "epoch": 0.07997699942498562, + "grad_norm": 4.98027229309082, + "learning_rate": 1.5995000000000002e-05, + "loss": 1.1213, + "step": 6398 + }, + { + "epoch": 0.08000200005000126, + "grad_norm": 5.123515605926514, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.8167, + "step": 6400 + }, + { + "epoch": 0.08002700067501688, + "grad_norm": 3.028306722640991, + "learning_rate": 1.6005e-05, + "loss": 0.7846, + "step": 6402 + }, + { + "epoch": 0.0800520013000325, + "grad_norm": 4.2322516441345215, + "learning_rate": 1.601e-05, + "loss": 1.6914, + "step": 6404 + }, + { + "epoch": 0.08007700192504813, + "grad_norm": 2.470599889755249, + "learning_rate": 1.6015e-05, + "loss": 1.7962, + "step": 6406 + }, + { + "epoch": 0.08010200255006375, + "grad_norm": 3.1779897212982178, + "learning_rate": 1.6020000000000002e-05, + "loss": 1.8395, + "step": 6408 + }, + { + "epoch": 0.08012700317507938, + "grad_norm": 3.472973346710205, + "learning_rate": 1.6025000000000003e-05, + "loss": 0.8442, + "step": 6410 + }, + { + "epoch": 0.080152003800095, + "grad_norm": 0.2612605392932892, + "learning_rate": 1.603e-05, + "loss": 0.0439, + "step": 6412 + }, + { + "epoch": 0.08017700442511062, + "grad_norm": 2.7207183837890625, + "learning_rate": 1.6035e-05, + "loss": 0.859, + "step": 6414 + }, + { + "epoch": 0.08020200505012626, + "grad_norm": 2.9273431301116943, + "learning_rate": 1.6040000000000002e-05, + "loss": 0.5989, + "step": 6416 + }, + { + "epoch": 0.08022700567514188, + "grad_norm": 5.205406188964844, + "learning_rate": 1.6045000000000003e-05, + "loss": 1.0051, + "step": 6418 + }, + { + "epoch": 0.08025200630015751, + "grad_norm": 3.7784013748168945, + "learning_rate": 1.605e-05, + "loss": 0.6213, + "step": 6420 + }, + { + "epoch": 0.08027700692517313, + "grad_norm": 3.561469316482544, + "learning_rate": 1.6055e-05, + "loss": 0.8791, + "step": 6422 + }, + { + "epoch": 0.08030200755018875, + "grad_norm": 2.985535144805908, + "learning_rate": 1.6060000000000002e-05, + "loss": 0.635, + "step": 6424 + }, + { + "epoch": 0.08032700817520438, + "grad_norm": 5.941481590270996, + "learning_rate": 1.6065000000000003e-05, + "loss": 1.0272, + "step": 6426 + }, + { + "epoch": 0.08035200880022, + "grad_norm": 3.6850216388702393, + "learning_rate": 1.607e-05, + "loss": 1.067, + "step": 6428 + }, + { + "epoch": 0.08037700942523564, + "grad_norm": 4.966442108154297, + "learning_rate": 1.6075e-05, + "loss": 1.2971, + "step": 6430 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 5.7367682456970215, + "learning_rate": 1.6080000000000002e-05, + "loss": 2.3821, + "step": 6432 + }, + { + "epoch": 0.08042701067526688, + "grad_norm": 1.5509458780288696, + "learning_rate": 1.6085000000000003e-05, + "loss": 0.9707, + "step": 6434 + }, + { + "epoch": 0.08045201130028251, + "grad_norm": 2.842052698135376, + "learning_rate": 1.609e-05, + "loss": 0.2048, + "step": 6436 + }, + { + "epoch": 0.08047701192529813, + "grad_norm": 3.5242133140563965, + "learning_rate": 1.6095e-05, + "loss": 1.3883, + "step": 6438 + }, + { + "epoch": 0.08050201255031376, + "grad_norm": 3.777226686477661, + "learning_rate": 1.6100000000000002e-05, + "loss": 0.8548, + "step": 6440 + }, + { + "epoch": 0.08052701317532938, + "grad_norm": 4.970685005187988, + "learning_rate": 1.6105000000000003e-05, + "loss": 1.5743, + "step": 6442 + }, + { + "epoch": 0.080552013800345, + "grad_norm": 2.6300346851348877, + "learning_rate": 1.611e-05, + "loss": 0.8642, + "step": 6444 + }, + { + "epoch": 0.08057701442536064, + "grad_norm": 7.324147701263428, + "learning_rate": 1.6115e-05, + "loss": 0.6542, + "step": 6446 + }, + { + "epoch": 0.08060201505037626, + "grad_norm": 9.363580703735352, + "learning_rate": 1.612e-05, + "loss": 0.4793, + "step": 6448 + }, + { + "epoch": 0.08062701567539189, + "grad_norm": 3.0416743755340576, + "learning_rate": 1.6125000000000002e-05, + "loss": 1.331, + "step": 6450 + }, + { + "epoch": 0.08065201630040751, + "grad_norm": 6.619192123413086, + "learning_rate": 1.613e-05, + "loss": 1.4213, + "step": 6452 + }, + { + "epoch": 0.08067701692542313, + "grad_norm": 2.42354679107666, + "learning_rate": 1.6135e-05, + "loss": 0.4824, + "step": 6454 + }, + { + "epoch": 0.08070201755043876, + "grad_norm": 4.137476921081543, + "learning_rate": 1.614e-05, + "loss": 0.9277, + "step": 6456 + }, + { + "epoch": 0.08072701817545438, + "grad_norm": 2.070371150970459, + "learning_rate": 1.6145000000000002e-05, + "loss": 0.1118, + "step": 6458 + }, + { + "epoch": 0.08075201880047002, + "grad_norm": 10.95738697052002, + "learning_rate": 1.6150000000000003e-05, + "loss": 0.5588, + "step": 6460 + }, + { + "epoch": 0.08077701942548564, + "grad_norm": 4.307394981384277, + "learning_rate": 1.6155e-05, + "loss": 0.8677, + "step": 6462 + }, + { + "epoch": 0.08080202005050126, + "grad_norm": 4.05812406539917, + "learning_rate": 1.616e-05, + "loss": 1.2243, + "step": 6464 + }, + { + "epoch": 0.08082702067551689, + "grad_norm": 4.02498197555542, + "learning_rate": 1.6165000000000002e-05, + "loss": 1.3401, + "step": 6466 + }, + { + "epoch": 0.08085202130053251, + "grad_norm": 0.40783005952835083, + "learning_rate": 1.6170000000000003e-05, + "loss": 1.2822, + "step": 6468 + }, + { + "epoch": 0.08087702192554815, + "grad_norm": 7.86510705947876, + "learning_rate": 1.6175e-05, + "loss": 1.2511, + "step": 6470 + }, + { + "epoch": 0.08090202255056377, + "grad_norm": 7.8061299324035645, + "learning_rate": 1.618e-05, + "loss": 0.3677, + "step": 6472 + }, + { + "epoch": 0.08092702317557939, + "grad_norm": 4.66628360748291, + "learning_rate": 1.6185000000000002e-05, + "loss": 1.5714, + "step": 6474 + }, + { + "epoch": 0.08095202380059502, + "grad_norm": 3.6617610454559326, + "learning_rate": 1.6190000000000003e-05, + "loss": 0.8526, + "step": 6476 + }, + { + "epoch": 0.08097702442561064, + "grad_norm": 2.996654748916626, + "learning_rate": 1.6195e-05, + "loss": 0.3556, + "step": 6478 + }, + { + "epoch": 0.08100202505062627, + "grad_norm": 6.53071403503418, + "learning_rate": 1.62e-05, + "loss": 0.3423, + "step": 6480 + }, + { + "epoch": 0.08102702567564189, + "grad_norm": 3.31892991065979, + "learning_rate": 1.6205e-05, + "loss": 1.8391, + "step": 6482 + }, + { + "epoch": 0.08105202630065751, + "grad_norm": 1.985768437385559, + "learning_rate": 1.6210000000000002e-05, + "loss": 0.255, + "step": 6484 + }, + { + "epoch": 0.08107702692567315, + "grad_norm": 5.75151252746582, + "learning_rate": 1.6215e-05, + "loss": 1.7758, + "step": 6486 + }, + { + "epoch": 0.08110202755068877, + "grad_norm": 3.4886255264282227, + "learning_rate": 1.6220000000000004e-05, + "loss": 1.3908, + "step": 6488 + }, + { + "epoch": 0.0811270281757044, + "grad_norm": 3.380500555038452, + "learning_rate": 1.6225e-05, + "loss": 1.8459, + "step": 6490 + }, + { + "epoch": 0.08115202880072002, + "grad_norm": 1.2118381261825562, + "learning_rate": 1.6230000000000002e-05, + "loss": 2.041, + "step": 6492 + }, + { + "epoch": 0.08117702942573564, + "grad_norm": 2.8833138942718506, + "learning_rate": 1.6235e-05, + "loss": 1.4588, + "step": 6494 + }, + { + "epoch": 0.08120203005075127, + "grad_norm": 2.937425374984741, + "learning_rate": 1.6240000000000004e-05, + "loss": 2.2059, + "step": 6496 + }, + { + "epoch": 0.08122703067576689, + "grad_norm": 4.10045051574707, + "learning_rate": 1.6245e-05, + "loss": 0.9156, + "step": 6498 + }, + { + "epoch": 0.08125203130078253, + "grad_norm": 4.359787940979004, + "learning_rate": 1.6250000000000002e-05, + "loss": 1.5794, + "step": 6500 + }, + { + "epoch": 0.08127703192579815, + "grad_norm": 2.585869550704956, + "learning_rate": 1.6255e-05, + "loss": 0.7866, + "step": 6502 + }, + { + "epoch": 0.08130203255081377, + "grad_norm": 3.5534133911132812, + "learning_rate": 1.626e-05, + "loss": 1.3346, + "step": 6504 + }, + { + "epoch": 0.0813270331758294, + "grad_norm": 3.638360023498535, + "learning_rate": 1.6265e-05, + "loss": 1.4318, + "step": 6506 + }, + { + "epoch": 0.08135203380084502, + "grad_norm": 4.786435127258301, + "learning_rate": 1.6270000000000002e-05, + "loss": 1.1295, + "step": 6508 + }, + { + "epoch": 0.08137703442586065, + "grad_norm": 3.357837677001953, + "learning_rate": 1.6275e-05, + "loss": 1.7754, + "step": 6510 + }, + { + "epoch": 0.08140203505087627, + "grad_norm": 1.6644904613494873, + "learning_rate": 1.628e-05, + "loss": 0.6537, + "step": 6512 + }, + { + "epoch": 0.08142703567589189, + "grad_norm": 2.9780728816986084, + "learning_rate": 1.6285e-05, + "loss": 1.2674, + "step": 6514 + }, + { + "epoch": 0.08145203630090753, + "grad_norm": 4.838628768920898, + "learning_rate": 1.629e-05, + "loss": 0.7667, + "step": 6516 + }, + { + "epoch": 0.08147703692592315, + "grad_norm": 3.068437337875366, + "learning_rate": 1.6295000000000002e-05, + "loss": 1.176, + "step": 6518 + }, + { + "epoch": 0.08150203755093878, + "grad_norm": 3.907527446746826, + "learning_rate": 1.63e-05, + "loss": 2.2177, + "step": 6520 + }, + { + "epoch": 0.0815270381759544, + "grad_norm": 5.854668140411377, + "learning_rate": 1.6305e-05, + "loss": 1.6793, + "step": 6522 + }, + { + "epoch": 0.08155203880097002, + "grad_norm": 4.611363887786865, + "learning_rate": 1.631e-05, + "loss": 0.6753, + "step": 6524 + }, + { + "epoch": 0.08157703942598565, + "grad_norm": 2.90956449508667, + "learning_rate": 1.6315000000000002e-05, + "loss": 1.3057, + "step": 6526 + }, + { + "epoch": 0.08160204005100127, + "grad_norm": 7.477356910705566, + "learning_rate": 1.632e-05, + "loss": 0.7213, + "step": 6528 + }, + { + "epoch": 0.08162704067601691, + "grad_norm": 2.187490940093994, + "learning_rate": 1.6325e-05, + "loss": 1.0894, + "step": 6530 + }, + { + "epoch": 0.08165204130103253, + "grad_norm": 4.518975734710693, + "learning_rate": 1.633e-05, + "loss": 1.9229, + "step": 6532 + }, + { + "epoch": 0.08167704192604815, + "grad_norm": 4.45811128616333, + "learning_rate": 1.6335000000000002e-05, + "loss": 1.053, + "step": 6534 + }, + { + "epoch": 0.08170204255106378, + "grad_norm": 2.497856378555298, + "learning_rate": 1.634e-05, + "loss": 0.6987, + "step": 6536 + }, + { + "epoch": 0.0817270431760794, + "grad_norm": 4.060742378234863, + "learning_rate": 1.6345000000000004e-05, + "loss": 0.84, + "step": 6538 + }, + { + "epoch": 0.08175204380109503, + "grad_norm": 3.8847239017486572, + "learning_rate": 1.635e-05, + "loss": 2.5723, + "step": 6540 + }, + { + "epoch": 0.08177704442611065, + "grad_norm": 3.763054370880127, + "learning_rate": 1.6355000000000002e-05, + "loss": 0.4991, + "step": 6542 + }, + { + "epoch": 0.08180204505112627, + "grad_norm": 4.42936897277832, + "learning_rate": 1.636e-05, + "loss": 2.1293, + "step": 6544 + }, + { + "epoch": 0.08182704567614191, + "grad_norm": 0.7675483822822571, + "learning_rate": 1.6365000000000003e-05, + "loss": 0.0905, + "step": 6546 + }, + { + "epoch": 0.08185204630115753, + "grad_norm": 4.7199931144714355, + "learning_rate": 1.637e-05, + "loss": 0.824, + "step": 6548 + }, + { + "epoch": 0.08187704692617316, + "grad_norm": 0.8746941089630127, + "learning_rate": 1.6375e-05, + "loss": 0.4346, + "step": 6550 + }, + { + "epoch": 0.08190204755118878, + "grad_norm": 4.810782432556152, + "learning_rate": 1.638e-05, + "loss": 0.5213, + "step": 6552 + }, + { + "epoch": 0.0819270481762044, + "grad_norm": 0.008280335925519466, + "learning_rate": 1.6385000000000003e-05, + "loss": 0.2887, + "step": 6554 + }, + { + "epoch": 0.08195204880122003, + "grad_norm": 3.4039225578308105, + "learning_rate": 1.639e-05, + "loss": 1.2118, + "step": 6556 + }, + { + "epoch": 0.08197704942623565, + "grad_norm": 0.2754960060119629, + "learning_rate": 1.6395e-05, + "loss": 0.9251, + "step": 6558 + }, + { + "epoch": 0.08200205005125129, + "grad_norm": 2.3654236793518066, + "learning_rate": 1.64e-05, + "loss": 0.5288, + "step": 6560 + }, + { + "epoch": 0.08202705067626691, + "grad_norm": 5.641172885894775, + "learning_rate": 1.6405000000000003e-05, + "loss": 1.9675, + "step": 6562 + }, + { + "epoch": 0.08205205130128253, + "grad_norm": 1.9712241888046265, + "learning_rate": 1.641e-05, + "loss": 0.4505, + "step": 6564 + }, + { + "epoch": 0.08207705192629816, + "grad_norm": 1.9027931690216064, + "learning_rate": 1.6415e-05, + "loss": 1.6655, + "step": 6566 + }, + { + "epoch": 0.08210205255131378, + "grad_norm": 2.104173183441162, + "learning_rate": 1.6420000000000002e-05, + "loss": 0.8938, + "step": 6568 + }, + { + "epoch": 0.08212705317632941, + "grad_norm": 2.972053289413452, + "learning_rate": 1.6425000000000003e-05, + "loss": 0.3755, + "step": 6570 + }, + { + "epoch": 0.08215205380134503, + "grad_norm": 25.13939094543457, + "learning_rate": 1.643e-05, + "loss": 1.3785, + "step": 6572 + }, + { + "epoch": 0.08217705442636065, + "grad_norm": 2.2915356159210205, + "learning_rate": 1.6435e-05, + "loss": 0.6592, + "step": 6574 + }, + { + "epoch": 0.08220205505137629, + "grad_norm": 2.9002439975738525, + "learning_rate": 1.6440000000000002e-05, + "loss": 1.0946, + "step": 6576 + }, + { + "epoch": 0.08222705567639191, + "grad_norm": 3.0512936115264893, + "learning_rate": 1.6445000000000003e-05, + "loss": 1.5364, + "step": 6578 + }, + { + "epoch": 0.08225205630140754, + "grad_norm": 4.153448104858398, + "learning_rate": 1.645e-05, + "loss": 1.1484, + "step": 6580 + }, + { + "epoch": 0.08227705692642316, + "grad_norm": 4.733523368835449, + "learning_rate": 1.6455e-05, + "loss": 0.4696, + "step": 6582 + }, + { + "epoch": 0.08230205755143878, + "grad_norm": 1.1289572715759277, + "learning_rate": 1.646e-05, + "loss": 0.8198, + "step": 6584 + }, + { + "epoch": 0.08232705817645442, + "grad_norm": 2.1508023738861084, + "learning_rate": 1.6465000000000002e-05, + "loss": 1.0776, + "step": 6586 + }, + { + "epoch": 0.08235205880147003, + "grad_norm": 0.021750889718532562, + "learning_rate": 1.647e-05, + "loss": 0.7038, + "step": 6588 + }, + { + "epoch": 0.08237705942648567, + "grad_norm": 5.528804302215576, + "learning_rate": 1.6475e-05, + "loss": 1.1587, + "step": 6590 + }, + { + "epoch": 0.08240206005150129, + "grad_norm": 4.719750881195068, + "learning_rate": 1.648e-05, + "loss": 0.5244, + "step": 6592 + }, + { + "epoch": 0.08242706067651691, + "grad_norm": 4.523304462432861, + "learning_rate": 1.6485000000000002e-05, + "loss": 1.491, + "step": 6594 + }, + { + "epoch": 0.08245206130153254, + "grad_norm": 3.15633225440979, + "learning_rate": 1.6490000000000003e-05, + "loss": 1.2593, + "step": 6596 + }, + { + "epoch": 0.08247706192654816, + "grad_norm": 5.056468963623047, + "learning_rate": 1.6495e-05, + "loss": 0.7562, + "step": 6598 + }, + { + "epoch": 0.0825020625515638, + "grad_norm": 5.797367095947266, + "learning_rate": 1.65e-05, + "loss": 1.484, + "step": 6600 + }, + { + "epoch": 0.08252706317657942, + "grad_norm": 0.01581847481429577, + "learning_rate": 1.6505000000000002e-05, + "loss": 0.7314, + "step": 6602 + }, + { + "epoch": 0.08255206380159504, + "grad_norm": 3.670219898223877, + "learning_rate": 1.6510000000000003e-05, + "loss": 1.9328, + "step": 6604 + }, + { + "epoch": 0.08257706442661067, + "grad_norm": 3.3399698734283447, + "learning_rate": 1.6515e-05, + "loss": 0.535, + "step": 6606 + }, + { + "epoch": 0.08260206505162629, + "grad_norm": 0.20170316100120544, + "learning_rate": 1.652e-05, + "loss": 0.2434, + "step": 6608 + }, + { + "epoch": 0.08262706567664192, + "grad_norm": 3.6961071491241455, + "learning_rate": 1.6525000000000002e-05, + "loss": 1.0036, + "step": 6610 + }, + { + "epoch": 0.08265206630165754, + "grad_norm": 2.6814210414886475, + "learning_rate": 1.6530000000000003e-05, + "loss": 0.5597, + "step": 6612 + }, + { + "epoch": 0.08267706692667316, + "grad_norm": 5.257541179656982, + "learning_rate": 1.6535e-05, + "loss": 2.123, + "step": 6614 + }, + { + "epoch": 0.0827020675516888, + "grad_norm": 2.9762306213378906, + "learning_rate": 1.654e-05, + "loss": 0.6893, + "step": 6616 + }, + { + "epoch": 0.08272706817670442, + "grad_norm": 2.727189779281616, + "learning_rate": 1.6545e-05, + "loss": 1.5589, + "step": 6618 + }, + { + "epoch": 0.08275206880172005, + "grad_norm": 0.01517017837613821, + "learning_rate": 1.6550000000000002e-05, + "loss": 0.4361, + "step": 6620 + }, + { + "epoch": 0.08277706942673567, + "grad_norm": 4.586521625518799, + "learning_rate": 1.6555e-05, + "loss": 2.2284, + "step": 6622 + }, + { + "epoch": 0.08280207005175129, + "grad_norm": 0.016229068860411644, + "learning_rate": 1.656e-05, + "loss": 0.1772, + "step": 6624 + }, + { + "epoch": 0.08282707067676692, + "grad_norm": 0.008016274310648441, + "learning_rate": 1.6565e-05, + "loss": 0.9273, + "step": 6626 + }, + { + "epoch": 0.08285207130178254, + "grad_norm": 3.5795581340789795, + "learning_rate": 1.6570000000000002e-05, + "loss": 1.2807, + "step": 6628 + }, + { + "epoch": 0.08287707192679818, + "grad_norm": 3.5620672702789307, + "learning_rate": 1.6575e-05, + "loss": 0.9419, + "step": 6630 + }, + { + "epoch": 0.0829020725518138, + "grad_norm": 2.6697518825531006, + "learning_rate": 1.658e-05, + "loss": 0.7821, + "step": 6632 + }, + { + "epoch": 0.08292707317682942, + "grad_norm": 4.909217834472656, + "learning_rate": 1.6585e-05, + "loss": 0.741, + "step": 6634 + }, + { + "epoch": 0.08295207380184505, + "grad_norm": 3.9583311080932617, + "learning_rate": 1.6590000000000002e-05, + "loss": 0.8146, + "step": 6636 + }, + { + "epoch": 0.08297707442686067, + "grad_norm": 2.819674491882324, + "learning_rate": 1.6595e-05, + "loss": 0.2395, + "step": 6638 + }, + { + "epoch": 0.0830020750518763, + "grad_norm": 2.802665948867798, + "learning_rate": 1.66e-05, + "loss": 0.5572, + "step": 6640 + }, + { + "epoch": 0.08302707567689192, + "grad_norm": 2.643665313720703, + "learning_rate": 1.6605e-05, + "loss": 0.5078, + "step": 6642 + }, + { + "epoch": 0.08305207630190754, + "grad_norm": 4.825368881225586, + "learning_rate": 1.6610000000000002e-05, + "loss": 1.2515, + "step": 6644 + }, + { + "epoch": 0.08307707692692318, + "grad_norm": 2.7552030086517334, + "learning_rate": 1.6615000000000003e-05, + "loss": 1.5204, + "step": 6646 + }, + { + "epoch": 0.0831020775519388, + "grad_norm": 6.6470513343811035, + "learning_rate": 1.662e-05, + "loss": 0.2055, + "step": 6648 + }, + { + "epoch": 0.08312707817695443, + "grad_norm": 8.229659080505371, + "learning_rate": 1.6625e-05, + "loss": 0.8408, + "step": 6650 + }, + { + "epoch": 0.08315207880197005, + "grad_norm": 3.0828773975372314, + "learning_rate": 1.6630000000000002e-05, + "loss": 1.5184, + "step": 6652 + }, + { + "epoch": 0.08317707942698567, + "grad_norm": 0.00667998194694519, + "learning_rate": 1.6635000000000003e-05, + "loss": 1.7853, + "step": 6654 + }, + { + "epoch": 0.0832020800520013, + "grad_norm": 1.786126971244812, + "learning_rate": 1.664e-05, + "loss": 0.8754, + "step": 6656 + }, + { + "epoch": 0.08322708067701692, + "grad_norm": 1.8038780689239502, + "learning_rate": 1.6645e-05, + "loss": 1.2313, + "step": 6658 + }, + { + "epoch": 0.08325208130203256, + "grad_norm": 2.150458812713623, + "learning_rate": 1.665e-05, + "loss": 0.2185, + "step": 6660 + }, + { + "epoch": 0.08327708192704818, + "grad_norm": 3.5658605098724365, + "learning_rate": 1.6655000000000002e-05, + "loss": 1.4685, + "step": 6662 + }, + { + "epoch": 0.0833020825520638, + "grad_norm": 2.6527175903320312, + "learning_rate": 1.666e-05, + "loss": 0.8916, + "step": 6664 + }, + { + "epoch": 0.08332708317707943, + "grad_norm": 0.0073565104976296425, + "learning_rate": 1.6665000000000004e-05, + "loss": 0.3633, + "step": 6666 + }, + { + "epoch": 0.08335208380209505, + "grad_norm": 4.167141437530518, + "learning_rate": 1.667e-05, + "loss": 0.6358, + "step": 6668 + }, + { + "epoch": 0.08337708442711068, + "grad_norm": 0.005511096678674221, + "learning_rate": 1.6675000000000002e-05, + "loss": 0.7792, + "step": 6670 + }, + { + "epoch": 0.0834020850521263, + "grad_norm": 5.068268775939941, + "learning_rate": 1.668e-05, + "loss": 1.5392, + "step": 6672 + }, + { + "epoch": 0.08342708567714192, + "grad_norm": 2.6180360317230225, + "learning_rate": 1.6685000000000004e-05, + "loss": 1.076, + "step": 6674 + }, + { + "epoch": 0.08345208630215756, + "grad_norm": 3.1098198890686035, + "learning_rate": 1.669e-05, + "loss": 1.3505, + "step": 6676 + }, + { + "epoch": 0.08347708692717318, + "grad_norm": 4.666235446929932, + "learning_rate": 1.6695000000000002e-05, + "loss": 1.4318, + "step": 6678 + }, + { + "epoch": 0.08350208755218881, + "grad_norm": 10.06293773651123, + "learning_rate": 1.67e-05, + "loss": 1.124, + "step": 6680 + }, + { + "epoch": 0.08352708817720443, + "grad_norm": 0.005157919600605965, + "learning_rate": 1.6705000000000004e-05, + "loss": 0.7157, + "step": 6682 + }, + { + "epoch": 0.08355208880222005, + "grad_norm": 4.2999043464660645, + "learning_rate": 1.671e-05, + "loss": 1.1312, + "step": 6684 + }, + { + "epoch": 0.08357708942723568, + "grad_norm": 2.44915771484375, + "learning_rate": 1.6715000000000002e-05, + "loss": 1.302, + "step": 6686 + }, + { + "epoch": 0.0836020900522513, + "grad_norm": 4.517419815063477, + "learning_rate": 1.672e-05, + "loss": 0.4671, + "step": 6688 + }, + { + "epoch": 0.08362709067726694, + "grad_norm": 5.810604095458984, + "learning_rate": 1.6725000000000003e-05, + "loss": 1.3123, + "step": 6690 + }, + { + "epoch": 0.08365209130228256, + "grad_norm": 2.351547956466675, + "learning_rate": 1.673e-05, + "loss": 0.3775, + "step": 6692 + }, + { + "epoch": 0.08367709192729818, + "grad_norm": 4.571915626525879, + "learning_rate": 1.6735e-05, + "loss": 0.7775, + "step": 6694 + }, + { + "epoch": 0.08370209255231381, + "grad_norm": 6.28919792175293, + "learning_rate": 1.6740000000000002e-05, + "loss": 0.7973, + "step": 6696 + }, + { + "epoch": 0.08372709317732943, + "grad_norm": 3.858957052230835, + "learning_rate": 1.6745000000000003e-05, + "loss": 0.6365, + "step": 6698 + }, + { + "epoch": 0.08375209380234507, + "grad_norm": 3.097585678100586, + "learning_rate": 1.675e-05, + "loss": 0.8757, + "step": 6700 + }, + { + "epoch": 0.08377709442736068, + "grad_norm": 2.2211952209472656, + "learning_rate": 1.6755e-05, + "loss": 0.4275, + "step": 6702 + }, + { + "epoch": 0.0838020950523763, + "grad_norm": 3.5867714881896973, + "learning_rate": 1.6760000000000002e-05, + "loss": 1.4987, + "step": 6704 + }, + { + "epoch": 0.08382709567739194, + "grad_norm": 2.7537481784820557, + "learning_rate": 1.6765000000000003e-05, + "loss": 0.2482, + "step": 6706 + }, + { + "epoch": 0.08385209630240756, + "grad_norm": 2.7636594772338867, + "learning_rate": 1.677e-05, + "loss": 0.6484, + "step": 6708 + }, + { + "epoch": 0.08387709692742319, + "grad_norm": 5.284765720367432, + "learning_rate": 1.6775e-05, + "loss": 0.9452, + "step": 6710 + }, + { + "epoch": 0.08390209755243881, + "grad_norm": 4.0064215660095215, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.2284, + "step": 6712 + }, + { + "epoch": 0.08392709817745443, + "grad_norm": 6.688210487365723, + "learning_rate": 1.6785000000000003e-05, + "loss": 1.5618, + "step": 6714 + }, + { + "epoch": 0.08395209880247007, + "grad_norm": 3.1477108001708984, + "learning_rate": 1.679e-05, + "loss": 2.3791, + "step": 6716 + }, + { + "epoch": 0.08397709942748569, + "grad_norm": 0.01192394457757473, + "learning_rate": 1.6795e-05, + "loss": 0.2759, + "step": 6718 + }, + { + "epoch": 0.08400210005250132, + "grad_norm": 4.853465557098389, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.0883, + "step": 6720 + }, + { + "epoch": 0.08402710067751694, + "grad_norm": 2.678563117980957, + "learning_rate": 1.6805000000000003e-05, + "loss": 0.2678, + "step": 6722 + }, + { + "epoch": 0.08405210130253256, + "grad_norm": 1.7283872365951538, + "learning_rate": 1.6810000000000003e-05, + "loss": 1.5985, + "step": 6724 + }, + { + "epoch": 0.08407710192754819, + "grad_norm": 9.605424880981445, + "learning_rate": 1.6815e-05, + "loss": 1.2762, + "step": 6726 + }, + { + "epoch": 0.08410210255256381, + "grad_norm": 0.01419716328382492, + "learning_rate": 1.682e-05, + "loss": 0.0022, + "step": 6728 + }, + { + "epoch": 0.08412710317757945, + "grad_norm": 4.538006782531738, + "learning_rate": 1.6825000000000002e-05, + "loss": 1.2646, + "step": 6730 + }, + { + "epoch": 0.08415210380259507, + "grad_norm": 4.95416784286499, + "learning_rate": 1.6830000000000003e-05, + "loss": 1.4675, + "step": 6732 + }, + { + "epoch": 0.08417710442761069, + "grad_norm": 0.3135761022567749, + "learning_rate": 1.6835e-05, + "loss": 0.0057, + "step": 6734 + }, + { + "epoch": 0.08420210505262632, + "grad_norm": 0.007503182161599398, + "learning_rate": 1.684e-05, + "loss": 0.0847, + "step": 6736 + }, + { + "epoch": 0.08422710567764194, + "grad_norm": 3.193798542022705, + "learning_rate": 1.6845000000000002e-05, + "loss": 1.0229, + "step": 6738 + }, + { + "epoch": 0.08425210630265757, + "grad_norm": 3.077690839767456, + "learning_rate": 1.6850000000000003e-05, + "loss": 1.026, + "step": 6740 + }, + { + "epoch": 0.08427710692767319, + "grad_norm": 2.4088382720947266, + "learning_rate": 1.6855e-05, + "loss": 0.508, + "step": 6742 + }, + { + "epoch": 0.08430210755268881, + "grad_norm": 2.0929744243621826, + "learning_rate": 1.686e-05, + "loss": 0.7002, + "step": 6744 + }, + { + "epoch": 0.08432710817770445, + "grad_norm": 3.3036515712738037, + "learning_rate": 1.6865000000000002e-05, + "loss": 0.7769, + "step": 6746 + }, + { + "epoch": 0.08435210880272007, + "grad_norm": 3.2273213863372803, + "learning_rate": 1.6870000000000003e-05, + "loss": 0.5963, + "step": 6748 + }, + { + "epoch": 0.0843771094277357, + "grad_norm": 4.8012590408325195, + "learning_rate": 1.6875e-05, + "loss": 1.0002, + "step": 6750 + }, + { + "epoch": 0.08440211005275132, + "grad_norm": 0.6478224396705627, + "learning_rate": 1.688e-05, + "loss": 0.7996, + "step": 6752 + }, + { + "epoch": 0.08442711067776694, + "grad_norm": 2.277493715286255, + "learning_rate": 1.6885000000000002e-05, + "loss": 1.5342, + "step": 6754 + }, + { + "epoch": 0.08445211130278257, + "grad_norm": 2.241909980773926, + "learning_rate": 1.6890000000000003e-05, + "loss": 0.7581, + "step": 6756 + }, + { + "epoch": 0.08447711192779819, + "grad_norm": 1.7831724882125854, + "learning_rate": 1.6895e-05, + "loss": 0.112, + "step": 6758 + }, + { + "epoch": 0.08450211255281383, + "grad_norm": 4.645353317260742, + "learning_rate": 1.69e-05, + "loss": 1.7426, + "step": 6760 + }, + { + "epoch": 0.08452711317782945, + "grad_norm": 4.66052770614624, + "learning_rate": 1.6905e-05, + "loss": 1.4761, + "step": 6762 + }, + { + "epoch": 0.08455211380284507, + "grad_norm": 3.5181379318237305, + "learning_rate": 1.6910000000000002e-05, + "loss": 0.3186, + "step": 6764 + }, + { + "epoch": 0.0845771144278607, + "grad_norm": 2.717059373855591, + "learning_rate": 1.6915e-05, + "loss": 0.9221, + "step": 6766 + }, + { + "epoch": 0.08460211505287632, + "grad_norm": 2.446531057357788, + "learning_rate": 1.692e-05, + "loss": 0.9973, + "step": 6768 + }, + { + "epoch": 0.08462711567789195, + "grad_norm": 5.449763298034668, + "learning_rate": 1.6925e-05, + "loss": 1.5326, + "step": 6770 + }, + { + "epoch": 0.08465211630290757, + "grad_norm": 4.3568267822265625, + "learning_rate": 1.6930000000000002e-05, + "loss": 1.3515, + "step": 6772 + }, + { + "epoch": 0.0846771169279232, + "grad_norm": 3.2130866050720215, + "learning_rate": 1.6935000000000003e-05, + "loss": 0.6293, + "step": 6774 + }, + { + "epoch": 0.08470211755293883, + "grad_norm": 2.593045473098755, + "learning_rate": 1.694e-05, + "loss": 1.4488, + "step": 6776 + }, + { + "epoch": 0.08472711817795445, + "grad_norm": 5.285003662109375, + "learning_rate": 1.6945e-05, + "loss": 1.8411, + "step": 6778 + }, + { + "epoch": 0.08475211880297008, + "grad_norm": 1.244409441947937, + "learning_rate": 1.6950000000000002e-05, + "loss": 0.2608, + "step": 6780 + }, + { + "epoch": 0.0847771194279857, + "grad_norm": 3.5879175662994385, + "learning_rate": 1.6955000000000003e-05, + "loss": 1.3954, + "step": 6782 + }, + { + "epoch": 0.08480212005300132, + "grad_norm": 0.006590718869119883, + "learning_rate": 1.696e-05, + "loss": 0.4038, + "step": 6784 + }, + { + "epoch": 0.08482712067801695, + "grad_norm": 4.528584003448486, + "learning_rate": 1.6965e-05, + "loss": 1.0104, + "step": 6786 + }, + { + "epoch": 0.08485212130303257, + "grad_norm": 1.198104977607727, + "learning_rate": 1.6970000000000002e-05, + "loss": 0.0432, + "step": 6788 + }, + { + "epoch": 0.08487712192804821, + "grad_norm": 3.0933568477630615, + "learning_rate": 1.6975000000000003e-05, + "loss": 1.012, + "step": 6790 + }, + { + "epoch": 0.08490212255306383, + "grad_norm": 0.007433222606778145, + "learning_rate": 1.698e-05, + "loss": 0.6889, + "step": 6792 + }, + { + "epoch": 0.08492712317807945, + "grad_norm": 2.935041904449463, + "learning_rate": 1.6985e-05, + "loss": 1.1227, + "step": 6794 + }, + { + "epoch": 0.08495212380309508, + "grad_norm": 4.971336841583252, + "learning_rate": 1.699e-05, + "loss": 1.4073, + "step": 6796 + }, + { + "epoch": 0.0849771244281107, + "grad_norm": 3.3471860885620117, + "learning_rate": 1.6995000000000002e-05, + "loss": 1.4496, + "step": 6798 + }, + { + "epoch": 0.08500212505312633, + "grad_norm": 3.4907729625701904, + "learning_rate": 1.7e-05, + "loss": 1.4922, + "step": 6800 + }, + { + "epoch": 0.08502712567814195, + "grad_norm": 5.286045074462891, + "learning_rate": 1.7005e-05, + "loss": 0.8615, + "step": 6802 + }, + { + "epoch": 0.08505212630315757, + "grad_norm": 4.119279384613037, + "learning_rate": 1.701e-05, + "loss": 1.2698, + "step": 6804 + }, + { + "epoch": 0.08507712692817321, + "grad_norm": 3.9123520851135254, + "learning_rate": 1.7015000000000002e-05, + "loss": 1.1932, + "step": 6806 + }, + { + "epoch": 0.08510212755318883, + "grad_norm": 2.8649072647094727, + "learning_rate": 1.702e-05, + "loss": 1.2388, + "step": 6808 + }, + { + "epoch": 0.08512712817820446, + "grad_norm": 4.1610212326049805, + "learning_rate": 1.7025e-05, + "loss": 1.0066, + "step": 6810 + }, + { + "epoch": 0.08515212880322008, + "grad_norm": 3.5255796909332275, + "learning_rate": 1.703e-05, + "loss": 1.2743, + "step": 6812 + }, + { + "epoch": 0.0851771294282357, + "grad_norm": 3.1338250637054443, + "learning_rate": 1.7035000000000002e-05, + "loss": 0.9315, + "step": 6814 + }, + { + "epoch": 0.08520213005325133, + "grad_norm": 3.9721362590789795, + "learning_rate": 1.704e-05, + "loss": 1.0167, + "step": 6816 + }, + { + "epoch": 0.08522713067826695, + "grad_norm": 3.193751096725464, + "learning_rate": 1.7045e-05, + "loss": 0.94, + "step": 6818 + }, + { + "epoch": 0.08525213130328259, + "grad_norm": 3.091068983078003, + "learning_rate": 1.705e-05, + "loss": 1.1315, + "step": 6820 + }, + { + "epoch": 0.08527713192829821, + "grad_norm": 4.761346817016602, + "learning_rate": 1.7055000000000002e-05, + "loss": 0.9036, + "step": 6822 + }, + { + "epoch": 0.08530213255331383, + "grad_norm": 7.245108604431152, + "learning_rate": 1.7060000000000003e-05, + "loss": 0.5067, + "step": 6824 + }, + { + "epoch": 0.08532713317832946, + "grad_norm": 0.014678273350000381, + "learning_rate": 1.7065e-05, + "loss": 0.0002, + "step": 6826 + }, + { + "epoch": 0.08535213380334508, + "grad_norm": 3.8641958236694336, + "learning_rate": 1.707e-05, + "loss": 0.837, + "step": 6828 + }, + { + "epoch": 0.08537713442836072, + "grad_norm": 4.935196876525879, + "learning_rate": 1.7075e-05, + "loss": 1.0582, + "step": 6830 + }, + { + "epoch": 0.08540213505337634, + "grad_norm": 2.670936107635498, + "learning_rate": 1.7080000000000002e-05, + "loss": 0.8646, + "step": 6832 + }, + { + "epoch": 0.08542713567839195, + "grad_norm": 3.180347204208374, + "learning_rate": 1.7085e-05, + "loss": 1.0825, + "step": 6834 + }, + { + "epoch": 0.08545213630340759, + "grad_norm": 3.603001594543457, + "learning_rate": 1.709e-05, + "loss": 0.3806, + "step": 6836 + }, + { + "epoch": 0.08547713692842321, + "grad_norm": 2.004265069961548, + "learning_rate": 1.7095e-05, + "loss": 1.0911, + "step": 6838 + }, + { + "epoch": 0.08550213755343884, + "grad_norm": 3.5907649993896484, + "learning_rate": 1.7100000000000002e-05, + "loss": 0.7479, + "step": 6840 + }, + { + "epoch": 0.08552713817845446, + "grad_norm": 5.6488165855407715, + "learning_rate": 1.7105e-05, + "loss": 0.657, + "step": 6842 + }, + { + "epoch": 0.08555213880347008, + "grad_norm": 1.053591251373291, + "learning_rate": 1.711e-05, + "loss": 0.6308, + "step": 6844 + }, + { + "epoch": 0.08557713942848572, + "grad_norm": 4.0463385581970215, + "learning_rate": 1.7115e-05, + "loss": 1.2543, + "step": 6846 + }, + { + "epoch": 0.08560214005350134, + "grad_norm": 1.3688268661499023, + "learning_rate": 1.7120000000000002e-05, + "loss": 0.2616, + "step": 6848 + }, + { + "epoch": 0.08562714067851697, + "grad_norm": 2.7811567783355713, + "learning_rate": 1.7125e-05, + "loss": 1.16, + "step": 6850 + }, + { + "epoch": 0.08565214130353259, + "grad_norm": 5.293920516967773, + "learning_rate": 1.7130000000000004e-05, + "loss": 2.1824, + "step": 6852 + }, + { + "epoch": 0.08567714192854821, + "grad_norm": 3.852191209793091, + "learning_rate": 1.7135e-05, + "loss": 1.9432, + "step": 6854 + }, + { + "epoch": 0.08570214255356384, + "grad_norm": 2.339794635772705, + "learning_rate": 1.7140000000000002e-05, + "loss": 0.9214, + "step": 6856 + }, + { + "epoch": 0.08572714317857946, + "grad_norm": 3.3827056884765625, + "learning_rate": 1.7145e-05, + "loss": 1.072, + "step": 6858 + }, + { + "epoch": 0.0857521438035951, + "grad_norm": 3.7966678142547607, + "learning_rate": 1.7150000000000004e-05, + "loss": 1.5507, + "step": 6860 + }, + { + "epoch": 0.08577714442861072, + "grad_norm": 4.394893169403076, + "learning_rate": 1.7155e-05, + "loss": 0.8943, + "step": 6862 + }, + { + "epoch": 0.08580214505362634, + "grad_norm": 3.206866502761841, + "learning_rate": 1.7160000000000002e-05, + "loss": 1.3271, + "step": 6864 + }, + { + "epoch": 0.08582714567864197, + "grad_norm": 5.20649528503418, + "learning_rate": 1.7165e-05, + "loss": 0.8832, + "step": 6866 + }, + { + "epoch": 0.08585214630365759, + "grad_norm": 2.229609966278076, + "learning_rate": 1.7170000000000003e-05, + "loss": 0.5204, + "step": 6868 + }, + { + "epoch": 0.08587714692867322, + "grad_norm": 4.447109222412109, + "learning_rate": 1.7175e-05, + "loss": 1.2614, + "step": 6870 + }, + { + "epoch": 0.08590214755368884, + "grad_norm": 3.733642816543579, + "learning_rate": 1.718e-05, + "loss": 0.6022, + "step": 6872 + }, + { + "epoch": 0.08592714817870446, + "grad_norm": 3.9672045707702637, + "learning_rate": 1.7185e-05, + "loss": 1.8432, + "step": 6874 + }, + { + "epoch": 0.0859521488037201, + "grad_norm": 3.878300905227661, + "learning_rate": 1.7190000000000003e-05, + "loss": 1.4785, + "step": 6876 + }, + { + "epoch": 0.08597714942873572, + "grad_norm": 4.991541385650635, + "learning_rate": 1.7195e-05, + "loss": 0.3719, + "step": 6878 + }, + { + "epoch": 0.08600215005375135, + "grad_norm": 4.9941582679748535, + "learning_rate": 1.72e-05, + "loss": 1.3562, + "step": 6880 + }, + { + "epoch": 0.08602715067876697, + "grad_norm": 1.3625073432922363, + "learning_rate": 1.7205000000000002e-05, + "loss": 0.2948, + "step": 6882 + }, + { + "epoch": 0.08605215130378259, + "grad_norm": 6.295231342315674, + "learning_rate": 1.7210000000000003e-05, + "loss": 0.1764, + "step": 6884 + }, + { + "epoch": 0.08607715192879822, + "grad_norm": 2.6446354389190674, + "learning_rate": 1.7215e-05, + "loss": 0.6221, + "step": 6886 + }, + { + "epoch": 0.08610215255381384, + "grad_norm": 2.6815977096557617, + "learning_rate": 1.722e-05, + "loss": 0.783, + "step": 6888 + }, + { + "epoch": 0.08612715317882948, + "grad_norm": 1.4758661985397339, + "learning_rate": 1.7225000000000002e-05, + "loss": 1.9145, + "step": 6890 + }, + { + "epoch": 0.0861521538038451, + "grad_norm": 1.0117906332015991, + "learning_rate": 1.7230000000000003e-05, + "loss": 1.3145, + "step": 6892 + }, + { + "epoch": 0.08617715442886072, + "grad_norm": 2.4333417415618896, + "learning_rate": 1.7235e-05, + "loss": 1.1385, + "step": 6894 + }, + { + "epoch": 0.08620215505387635, + "grad_norm": 5.878175735473633, + "learning_rate": 1.724e-05, + "loss": 1.5256, + "step": 6896 + }, + { + "epoch": 0.08622715567889197, + "grad_norm": 0.007635308895260096, + "learning_rate": 1.7245000000000002e-05, + "loss": 0.0002, + "step": 6898 + }, + { + "epoch": 0.0862521563039076, + "grad_norm": 0.8108803033828735, + "learning_rate": 1.7250000000000003e-05, + "loss": 0.0136, + "step": 6900 + }, + { + "epoch": 0.08627715692892322, + "grad_norm": 3.09564208984375, + "learning_rate": 1.7255000000000003e-05, + "loss": 1.4222, + "step": 6902 + }, + { + "epoch": 0.08630215755393884, + "grad_norm": 0.760262668132782, + "learning_rate": 1.726e-05, + "loss": 0.4995, + "step": 6904 + }, + { + "epoch": 0.08632715817895448, + "grad_norm": 5.062269687652588, + "learning_rate": 1.7265e-05, + "loss": 1.7626, + "step": 6906 + }, + { + "epoch": 0.0863521588039701, + "grad_norm": 2.6966569423675537, + "learning_rate": 1.7270000000000002e-05, + "loss": 0.5687, + "step": 6908 + }, + { + "epoch": 0.08637715942898573, + "grad_norm": 3.971827983856201, + "learning_rate": 1.7275000000000003e-05, + "loss": 1.6043, + "step": 6910 + }, + { + "epoch": 0.08640216005400135, + "grad_norm": 4.249474048614502, + "learning_rate": 1.728e-05, + "loss": 0.8855, + "step": 6912 + }, + { + "epoch": 0.08642716067901697, + "grad_norm": 8.002070426940918, + "learning_rate": 1.7285e-05, + "loss": 1.1538, + "step": 6914 + }, + { + "epoch": 0.0864521613040326, + "grad_norm": 4.949833869934082, + "learning_rate": 1.7290000000000002e-05, + "loss": 1.3665, + "step": 6916 + }, + { + "epoch": 0.08647716192904822, + "grad_norm": 1.2490955591201782, + "learning_rate": 1.7295000000000003e-05, + "loss": 0.5657, + "step": 6918 + }, + { + "epoch": 0.08650216255406386, + "grad_norm": 0.009610850363969803, + "learning_rate": 1.73e-05, + "loss": 0.554, + "step": 6920 + }, + { + "epoch": 0.08652716317907948, + "grad_norm": 2.635662794113159, + "learning_rate": 1.7305e-05, + "loss": 0.1162, + "step": 6922 + }, + { + "epoch": 0.0865521638040951, + "grad_norm": 2.0745749473571777, + "learning_rate": 1.7310000000000002e-05, + "loss": 0.3941, + "step": 6924 + }, + { + "epoch": 0.08657716442911073, + "grad_norm": 0.005252284929156303, + "learning_rate": 1.7315000000000003e-05, + "loss": 0.4073, + "step": 6926 + }, + { + "epoch": 0.08660216505412635, + "grad_norm": 3.3801121711730957, + "learning_rate": 1.732e-05, + "loss": 1.0535, + "step": 6928 + }, + { + "epoch": 0.08662716567914198, + "grad_norm": 0.019221732392907143, + "learning_rate": 1.7325e-05, + "loss": 0.4644, + "step": 6930 + }, + { + "epoch": 0.0866521663041576, + "grad_norm": 1.3370519876480103, + "learning_rate": 1.7330000000000002e-05, + "loss": 0.8851, + "step": 6932 + }, + { + "epoch": 0.08667716692917322, + "grad_norm": 0.27778124809265137, + "learning_rate": 1.7335000000000003e-05, + "loss": 0.4109, + "step": 6934 + }, + { + "epoch": 0.08670216755418886, + "grad_norm": 5.521223545074463, + "learning_rate": 1.734e-05, + "loss": 2.4333, + "step": 6936 + }, + { + "epoch": 0.08672716817920448, + "grad_norm": 4.29935884475708, + "learning_rate": 1.7345e-05, + "loss": 0.9411, + "step": 6938 + }, + { + "epoch": 0.08675216880422011, + "grad_norm": 4.185783386230469, + "learning_rate": 1.735e-05, + "loss": 0.4332, + "step": 6940 + }, + { + "epoch": 0.08677716942923573, + "grad_norm": 0.6457161903381348, + "learning_rate": 1.7355000000000002e-05, + "loss": 0.4314, + "step": 6942 + }, + { + "epoch": 0.08680217005425135, + "grad_norm": 9.680858612060547, + "learning_rate": 1.736e-05, + "loss": 0.7967, + "step": 6944 + }, + { + "epoch": 0.08682717067926699, + "grad_norm": 9.652925491333008, + "learning_rate": 1.7365e-05, + "loss": 0.6733, + "step": 6946 + }, + { + "epoch": 0.0868521713042826, + "grad_norm": 11.504094123840332, + "learning_rate": 1.737e-05, + "loss": 0.8371, + "step": 6948 + }, + { + "epoch": 0.08687717192929824, + "grad_norm": 0.00911501795053482, + "learning_rate": 1.7375000000000002e-05, + "loss": 0.2048, + "step": 6950 + }, + { + "epoch": 0.08690217255431386, + "grad_norm": 5.202295303344727, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.6524, + "step": 6952 + }, + { + "epoch": 0.08692717317932948, + "grad_norm": 2.951737403869629, + "learning_rate": 1.7385e-05, + "loss": 1.471, + "step": 6954 + }, + { + "epoch": 0.08695217380434511, + "grad_norm": 3.655932664871216, + "learning_rate": 1.739e-05, + "loss": 0.7839, + "step": 6956 + }, + { + "epoch": 0.08697717442936073, + "grad_norm": 4.855175018310547, + "learning_rate": 1.7395000000000002e-05, + "loss": 0.3151, + "step": 6958 + }, + { + "epoch": 0.08700217505437637, + "grad_norm": 2.7927517890930176, + "learning_rate": 1.7400000000000003e-05, + "loss": 0.8356, + "step": 6960 + }, + { + "epoch": 0.08702717567939199, + "grad_norm": 0.09124696999788284, + "learning_rate": 1.7405e-05, + "loss": 0.6838, + "step": 6962 + }, + { + "epoch": 0.0870521763044076, + "grad_norm": 2.7594857215881348, + "learning_rate": 1.741e-05, + "loss": 1.2454, + "step": 6964 + }, + { + "epoch": 0.08707717692942324, + "grad_norm": 3.569260358810425, + "learning_rate": 1.7415000000000002e-05, + "loss": 1.102, + "step": 6966 + }, + { + "epoch": 0.08710217755443886, + "grad_norm": 1.9453279972076416, + "learning_rate": 1.7420000000000003e-05, + "loss": 1.6108, + "step": 6968 + }, + { + "epoch": 0.08712717817945449, + "grad_norm": 2.596170425415039, + "learning_rate": 1.7425e-05, + "loss": 0.9661, + "step": 6970 + }, + { + "epoch": 0.08715217880447011, + "grad_norm": 8.210526466369629, + "learning_rate": 1.743e-05, + "loss": 1.1409, + "step": 6972 + }, + { + "epoch": 0.08717717942948573, + "grad_norm": 6.281562328338623, + "learning_rate": 1.7435e-05, + "loss": 1.0045, + "step": 6974 + }, + { + "epoch": 0.08720218005450137, + "grad_norm": 9.528286933898926, + "learning_rate": 1.7440000000000002e-05, + "loss": 0.9667, + "step": 6976 + }, + { + "epoch": 0.08722718067951699, + "grad_norm": 2.6041243076324463, + "learning_rate": 1.7445e-05, + "loss": 0.3917, + "step": 6978 + }, + { + "epoch": 0.08725218130453262, + "grad_norm": 4.372514724731445, + "learning_rate": 1.7450000000000004e-05, + "loss": 1.3277, + "step": 6980 + }, + { + "epoch": 0.08727718192954824, + "grad_norm": 2.0342321395874023, + "learning_rate": 1.7455e-05, + "loss": 1.1606, + "step": 6982 + }, + { + "epoch": 0.08730218255456386, + "grad_norm": 3.8443453311920166, + "learning_rate": 1.7460000000000002e-05, + "loss": 0.8127, + "step": 6984 + }, + { + "epoch": 0.08732718317957949, + "grad_norm": 1.2075780630111694, + "learning_rate": 1.7465e-05, + "loss": 0.8568, + "step": 6986 + }, + { + "epoch": 0.08735218380459511, + "grad_norm": 3.1106679439544678, + "learning_rate": 1.7470000000000004e-05, + "loss": 0.2041, + "step": 6988 + }, + { + "epoch": 0.08737718442961075, + "grad_norm": 6.86865758895874, + "learning_rate": 1.7475e-05, + "loss": 0.6991, + "step": 6990 + }, + { + "epoch": 0.08740218505462637, + "grad_norm": 2.897477865219116, + "learning_rate": 1.7480000000000002e-05, + "loss": 0.7566, + "step": 6992 + }, + { + "epoch": 0.08742718567964199, + "grad_norm": 3.265144109725952, + "learning_rate": 1.7485e-05, + "loss": 0.6392, + "step": 6994 + }, + { + "epoch": 0.08745218630465762, + "grad_norm": 4.752957820892334, + "learning_rate": 1.7490000000000004e-05, + "loss": 1.597, + "step": 6996 + }, + { + "epoch": 0.08747718692967324, + "grad_norm": 3.357875108718872, + "learning_rate": 1.7495e-05, + "loss": 0.4507, + "step": 6998 + }, + { + "epoch": 0.08750218755468887, + "grad_norm": 3.307537794113159, + "learning_rate": 1.7500000000000002e-05, + "loss": 1.0832, + "step": 7000 + }, + { + "epoch": 0.0875271881797045, + "grad_norm": 4.157894134521484, + "learning_rate": 1.7505e-05, + "loss": 0.7735, + "step": 7002 + }, + { + "epoch": 0.08755218880472011, + "grad_norm": 4.833110809326172, + "learning_rate": 1.751e-05, + "loss": 1.2537, + "step": 7004 + }, + { + "epoch": 0.08757718942973575, + "grad_norm": 8.396344184875488, + "learning_rate": 1.7515e-05, + "loss": 0.2623, + "step": 7006 + }, + { + "epoch": 0.08760219005475137, + "grad_norm": 3.153474807739258, + "learning_rate": 1.752e-05, + "loss": 0.9474, + "step": 7008 + }, + { + "epoch": 0.087627190679767, + "grad_norm": 2.629051446914673, + "learning_rate": 1.7525000000000002e-05, + "loss": 0.4751, + "step": 7010 + }, + { + "epoch": 0.08765219130478262, + "grad_norm": 3.263666868209839, + "learning_rate": 1.753e-05, + "loss": 1.5861, + "step": 7012 + }, + { + "epoch": 0.08767719192979824, + "grad_norm": 1.319312572479248, + "learning_rate": 1.7535e-05, + "loss": 0.0688, + "step": 7014 + }, + { + "epoch": 0.08770219255481387, + "grad_norm": 0.28592565655708313, + "learning_rate": 1.754e-05, + "loss": 0.3845, + "step": 7016 + }, + { + "epoch": 0.0877271931798295, + "grad_norm": 3.029236316680908, + "learning_rate": 1.7545000000000002e-05, + "loss": 0.7389, + "step": 7018 + }, + { + "epoch": 0.08775219380484513, + "grad_norm": 0.011451010592281818, + "learning_rate": 1.755e-05, + "loss": 0.8708, + "step": 7020 + }, + { + "epoch": 0.08777719442986075, + "grad_norm": 4.1442437171936035, + "learning_rate": 1.7555e-05, + "loss": 1.1107, + "step": 7022 + }, + { + "epoch": 0.08780219505487637, + "grad_norm": 0.426169216632843, + "learning_rate": 1.756e-05, + "loss": 0.1039, + "step": 7024 + }, + { + "epoch": 0.087827195679892, + "grad_norm": 1.6667131185531616, + "learning_rate": 1.7565000000000002e-05, + "loss": 0.9761, + "step": 7026 + }, + { + "epoch": 0.08785219630490762, + "grad_norm": 0.009760293178260326, + "learning_rate": 1.757e-05, + "loss": 0.7092, + "step": 7028 + }, + { + "epoch": 0.08787719692992325, + "grad_norm": 2.3190524578094482, + "learning_rate": 1.7575000000000004e-05, + "loss": 0.5448, + "step": 7030 + }, + { + "epoch": 0.08790219755493887, + "grad_norm": 6.179687023162842, + "learning_rate": 1.758e-05, + "loss": 0.358, + "step": 7032 + }, + { + "epoch": 0.0879271981799545, + "grad_norm": 10.745038032531738, + "learning_rate": 1.7585000000000002e-05, + "loss": 1.8228, + "step": 7034 + }, + { + "epoch": 0.08795219880497013, + "grad_norm": 4.446945667266846, + "learning_rate": 1.759e-05, + "loss": 0.9547, + "step": 7036 + }, + { + "epoch": 0.08797719942998575, + "grad_norm": 3.2849044799804688, + "learning_rate": 1.7595000000000003e-05, + "loss": 1.145, + "step": 7038 + }, + { + "epoch": 0.08800220005500138, + "grad_norm": 3.672414779663086, + "learning_rate": 1.76e-05, + "loss": 1.1221, + "step": 7040 + }, + { + "epoch": 0.088027200680017, + "grad_norm": 0.22334301471710205, + "learning_rate": 1.7605000000000002e-05, + "loss": 0.5102, + "step": 7042 + }, + { + "epoch": 0.08805220130503262, + "grad_norm": 4.836431980133057, + "learning_rate": 1.761e-05, + "loss": 1.1549, + "step": 7044 + }, + { + "epoch": 0.08807720193004825, + "grad_norm": 3.662036418914795, + "learning_rate": 1.7615000000000003e-05, + "loss": 0.5735, + "step": 7046 + }, + { + "epoch": 0.08810220255506387, + "grad_norm": 7.0138840675354, + "learning_rate": 1.762e-05, + "loss": 0.927, + "step": 7048 + }, + { + "epoch": 0.08812720318007951, + "grad_norm": 3.755854368209839, + "learning_rate": 1.7625e-05, + "loss": 1.1083, + "step": 7050 + }, + { + "epoch": 0.08815220380509513, + "grad_norm": 2.280338764190674, + "learning_rate": 1.763e-05, + "loss": 0.3045, + "step": 7052 + }, + { + "epoch": 0.08817720443011075, + "grad_norm": 0.4658466875553131, + "learning_rate": 1.7635000000000003e-05, + "loss": 1.0692, + "step": 7054 + }, + { + "epoch": 0.08820220505512638, + "grad_norm": 3.135542392730713, + "learning_rate": 1.764e-05, + "loss": 1.1189, + "step": 7056 + }, + { + "epoch": 0.088227205680142, + "grad_norm": 3.8225486278533936, + "learning_rate": 1.7645e-05, + "loss": 0.7233, + "step": 7058 + }, + { + "epoch": 0.08825220630515763, + "grad_norm": 0.02153937704861164, + "learning_rate": 1.7650000000000002e-05, + "loss": 0.7514, + "step": 7060 + }, + { + "epoch": 0.08827720693017325, + "grad_norm": 2.5572080612182617, + "learning_rate": 1.7655000000000003e-05, + "loss": 0.3869, + "step": 7062 + }, + { + "epoch": 0.08830220755518887, + "grad_norm": 2.1050169467926025, + "learning_rate": 1.766e-05, + "loss": 0.5831, + "step": 7064 + }, + { + "epoch": 0.08832720818020451, + "grad_norm": 0.017944956198334694, + "learning_rate": 1.7665e-05, + "loss": 0.0893, + "step": 7066 + }, + { + "epoch": 0.08835220880522013, + "grad_norm": 4.200535297393799, + "learning_rate": 1.7670000000000002e-05, + "loss": 1.0677, + "step": 7068 + }, + { + "epoch": 0.08837720943023576, + "grad_norm": 0.9834591746330261, + "learning_rate": 1.7675000000000003e-05, + "loss": 0.0479, + "step": 7070 + }, + { + "epoch": 0.08840221005525138, + "grad_norm": 0.010727907530963421, + "learning_rate": 1.768e-05, + "loss": 0.6336, + "step": 7072 + }, + { + "epoch": 0.088427210680267, + "grad_norm": 2.9163198471069336, + "learning_rate": 1.7685e-05, + "loss": 1.4858, + "step": 7074 + }, + { + "epoch": 0.08845221130528264, + "grad_norm": 4.128634929656982, + "learning_rate": 1.7690000000000002e-05, + "loss": 1.1891, + "step": 7076 + }, + { + "epoch": 0.08847721193029826, + "grad_norm": 6.9572529792785645, + "learning_rate": 1.7695000000000003e-05, + "loss": 1.2747, + "step": 7078 + }, + { + "epoch": 0.08850221255531389, + "grad_norm": 2.16060733795166, + "learning_rate": 1.77e-05, + "loss": 1.0697, + "step": 7080 + }, + { + "epoch": 0.08852721318032951, + "grad_norm": 3.430983066558838, + "learning_rate": 1.7705e-05, + "loss": 1.6856, + "step": 7082 + }, + { + "epoch": 0.08855221380534513, + "grad_norm": 1.5508347749710083, + "learning_rate": 1.771e-05, + "loss": 0.4784, + "step": 7084 + }, + { + "epoch": 0.08857721443036076, + "grad_norm": 2.3397161960601807, + "learning_rate": 1.7715000000000002e-05, + "loss": 0.6453, + "step": 7086 + }, + { + "epoch": 0.08860221505537638, + "grad_norm": 3.058504343032837, + "learning_rate": 1.7720000000000003e-05, + "loss": 1.2231, + "step": 7088 + }, + { + "epoch": 0.08862721568039202, + "grad_norm": 3.203179121017456, + "learning_rate": 1.7725e-05, + "loss": 1.0987, + "step": 7090 + }, + { + "epoch": 0.08865221630540764, + "grad_norm": 4.110805511474609, + "learning_rate": 1.773e-05, + "loss": 1.3647, + "step": 7092 + }, + { + "epoch": 0.08867721693042326, + "grad_norm": 4.536674499511719, + "learning_rate": 1.7735000000000002e-05, + "loss": 1.441, + "step": 7094 + }, + { + "epoch": 0.08870221755543889, + "grad_norm": 2.025047779083252, + "learning_rate": 1.7740000000000003e-05, + "loss": 1.3378, + "step": 7096 + }, + { + "epoch": 0.08872721818045451, + "grad_norm": 4.242796897888184, + "learning_rate": 1.7745e-05, + "loss": 0.9917, + "step": 7098 + }, + { + "epoch": 0.08875221880547014, + "grad_norm": 2.9048168659210205, + "learning_rate": 1.775e-05, + "loss": 1.588, + "step": 7100 + }, + { + "epoch": 0.08877721943048576, + "grad_norm": 7.254281044006348, + "learning_rate": 1.7755000000000002e-05, + "loss": 1.336, + "step": 7102 + }, + { + "epoch": 0.08880222005550138, + "grad_norm": 1.4612329006195068, + "learning_rate": 1.7760000000000003e-05, + "loss": 0.0434, + "step": 7104 + }, + { + "epoch": 0.08882722068051702, + "grad_norm": 4.77567720413208, + "learning_rate": 1.7765e-05, + "loss": 1.1734, + "step": 7106 + }, + { + "epoch": 0.08885222130553264, + "grad_norm": 2.4750945568084717, + "learning_rate": 1.777e-05, + "loss": 0.5176, + "step": 7108 + }, + { + "epoch": 0.08887722193054827, + "grad_norm": 0.014750744216144085, + "learning_rate": 1.7775000000000002e-05, + "loss": 0.1302, + "step": 7110 + }, + { + "epoch": 0.08890222255556389, + "grad_norm": 0.012025171890854836, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0049, + "step": 7112 + }, + { + "epoch": 0.08892722318057951, + "grad_norm": 2.3654658794403076, + "learning_rate": 1.7785e-05, + "loss": 0.2673, + "step": 7114 + }, + { + "epoch": 0.08895222380559514, + "grad_norm": 3.9790282249450684, + "learning_rate": 1.779e-05, + "loss": 0.874, + "step": 7116 + }, + { + "epoch": 0.08897722443061076, + "grad_norm": 3.728808879852295, + "learning_rate": 1.7795e-05, + "loss": 0.9973, + "step": 7118 + }, + { + "epoch": 0.0890022250556264, + "grad_norm": 4.801133632659912, + "learning_rate": 1.7800000000000002e-05, + "loss": 1.0062, + "step": 7120 + }, + { + "epoch": 0.08902722568064202, + "grad_norm": 0.011529332958161831, + "learning_rate": 1.7805e-05, + "loss": 0.1576, + "step": 7122 + }, + { + "epoch": 0.08905222630565764, + "grad_norm": 7.326840877532959, + "learning_rate": 1.781e-05, + "loss": 1.5846, + "step": 7124 + }, + { + "epoch": 0.08907722693067327, + "grad_norm": 0.009710129350423813, + "learning_rate": 1.7815e-05, + "loss": 0.1772, + "step": 7126 + }, + { + "epoch": 0.08910222755568889, + "grad_norm": 4.38623046875, + "learning_rate": 1.7820000000000002e-05, + "loss": 0.9223, + "step": 7128 + }, + { + "epoch": 0.08912722818070452, + "grad_norm": 2.237874984741211, + "learning_rate": 1.7825e-05, + "loss": 1.9467, + "step": 7130 + }, + { + "epoch": 0.08915222880572014, + "grad_norm": 0.009139008820056915, + "learning_rate": 1.783e-05, + "loss": 0.5379, + "step": 7132 + }, + { + "epoch": 0.08917722943073576, + "grad_norm": 4.271993160247803, + "learning_rate": 1.7835e-05, + "loss": 0.9454, + "step": 7134 + }, + { + "epoch": 0.0892022300557514, + "grad_norm": 19.495996475219727, + "learning_rate": 1.7840000000000002e-05, + "loss": 2.2674, + "step": 7136 + }, + { + "epoch": 0.08922723068076702, + "grad_norm": 1.239255428314209, + "learning_rate": 1.7845000000000003e-05, + "loss": 0.8216, + "step": 7138 + }, + { + "epoch": 0.08925223130578265, + "grad_norm": 2.578010082244873, + "learning_rate": 1.785e-05, + "loss": 0.8071, + "step": 7140 + }, + { + "epoch": 0.08927723193079827, + "grad_norm": 2.032902717590332, + "learning_rate": 1.7855e-05, + "loss": 1.0442, + "step": 7142 + }, + { + "epoch": 0.08930223255581389, + "grad_norm": 5.165532112121582, + "learning_rate": 1.7860000000000002e-05, + "loss": 0.8948, + "step": 7144 + }, + { + "epoch": 0.08932723318082952, + "grad_norm": 4.705962181091309, + "learning_rate": 1.7865000000000003e-05, + "loss": 1.2438, + "step": 7146 + }, + { + "epoch": 0.08935223380584514, + "grad_norm": 5.653369426727295, + "learning_rate": 1.787e-05, + "loss": 0.5826, + "step": 7148 + }, + { + "epoch": 0.08937723443086078, + "grad_norm": 3.100085973739624, + "learning_rate": 1.7875e-05, + "loss": 0.9385, + "step": 7150 + }, + { + "epoch": 0.0894022350558764, + "grad_norm": 3.4530069828033447, + "learning_rate": 1.788e-05, + "loss": 1.369, + "step": 7152 + }, + { + "epoch": 0.08942723568089202, + "grad_norm": 0.01599852181971073, + "learning_rate": 1.7885000000000002e-05, + "loss": 0.1096, + "step": 7154 + }, + { + "epoch": 0.08945223630590765, + "grad_norm": 1.8738723993301392, + "learning_rate": 1.789e-05, + "loss": 1.0412, + "step": 7156 + }, + { + "epoch": 0.08947723693092327, + "grad_norm": 2.8758537769317627, + "learning_rate": 1.7895000000000004e-05, + "loss": 0.4432, + "step": 7158 + }, + { + "epoch": 0.0895022375559389, + "grad_norm": 5.3427276611328125, + "learning_rate": 1.79e-05, + "loss": 2.4668, + "step": 7160 + }, + { + "epoch": 0.08952723818095452, + "grad_norm": 0.017420480027794838, + "learning_rate": 1.7905000000000002e-05, + "loss": 0.2339, + "step": 7162 + }, + { + "epoch": 0.08955223880597014, + "grad_norm": 0.010813610628247261, + "learning_rate": 1.791e-05, + "loss": 0.7084, + "step": 7164 + }, + { + "epoch": 0.08957723943098578, + "grad_norm": 2.8653476238250732, + "learning_rate": 1.7915000000000004e-05, + "loss": 1.1289, + "step": 7166 + }, + { + "epoch": 0.0896022400560014, + "grad_norm": 5.224687576293945, + "learning_rate": 1.792e-05, + "loss": 1.4032, + "step": 7168 + }, + { + "epoch": 0.08962724068101703, + "grad_norm": 4.134850978851318, + "learning_rate": 1.7925000000000002e-05, + "loss": 0.7288, + "step": 7170 + }, + { + "epoch": 0.08965224130603265, + "grad_norm": 2.4694716930389404, + "learning_rate": 1.793e-05, + "loss": 1.2086, + "step": 7172 + }, + { + "epoch": 0.08967724193104827, + "grad_norm": 4.637180328369141, + "learning_rate": 1.7935000000000004e-05, + "loss": 0.706, + "step": 7174 + }, + { + "epoch": 0.0897022425560639, + "grad_norm": 6.045943260192871, + "learning_rate": 1.794e-05, + "loss": 1.0264, + "step": 7176 + }, + { + "epoch": 0.08972724318107952, + "grad_norm": 3.037900686264038, + "learning_rate": 1.7945000000000002e-05, + "loss": 0.96, + "step": 7178 + }, + { + "epoch": 0.08975224380609516, + "grad_norm": 0.10700883716344833, + "learning_rate": 1.795e-05, + "loss": 0.1629, + "step": 7180 + }, + { + "epoch": 0.08977724443111078, + "grad_norm": 5.4338579177856445, + "learning_rate": 1.7955000000000003e-05, + "loss": 0.859, + "step": 7182 + }, + { + "epoch": 0.0898022450561264, + "grad_norm": 0.010407590307295322, + "learning_rate": 1.796e-05, + "loss": 0.0396, + "step": 7184 + }, + { + "epoch": 0.08982724568114203, + "grad_norm": 2.4697325229644775, + "learning_rate": 1.7965e-05, + "loss": 0.5222, + "step": 7186 + }, + { + "epoch": 0.08985224630615765, + "grad_norm": 1.8618019819259644, + "learning_rate": 1.7970000000000002e-05, + "loss": 2.1827, + "step": 7188 + }, + { + "epoch": 0.08987724693117329, + "grad_norm": 1.485288381576538, + "learning_rate": 1.7975000000000003e-05, + "loss": 0.3829, + "step": 7190 + }, + { + "epoch": 0.0899022475561889, + "grad_norm": 2.8337173461914062, + "learning_rate": 1.798e-05, + "loss": 0.8421, + "step": 7192 + }, + { + "epoch": 0.08992724818120452, + "grad_norm": 2.3884661197662354, + "learning_rate": 1.7985e-05, + "loss": 0.9372, + "step": 7194 + }, + { + "epoch": 0.08995224880622016, + "grad_norm": 0.55622798204422, + "learning_rate": 1.7990000000000002e-05, + "loss": 0.0618, + "step": 7196 + }, + { + "epoch": 0.08997724943123578, + "grad_norm": 3.5924339294433594, + "learning_rate": 1.7995000000000003e-05, + "loss": 0.8931, + "step": 7198 + }, + { + "epoch": 0.09000225005625141, + "grad_norm": 3.7497947216033936, + "learning_rate": 1.8e-05, + "loss": 0.7169, + "step": 7200 + }, + { + "epoch": 0.09002725068126703, + "grad_norm": 3.228544235229492, + "learning_rate": 1.8005e-05, + "loss": 1.4919, + "step": 7202 + }, + { + "epoch": 0.09005225130628265, + "grad_norm": 6.076109886169434, + "learning_rate": 1.8010000000000002e-05, + "loss": 0.9653, + "step": 7204 + }, + { + "epoch": 0.09007725193129829, + "grad_norm": 3.375145435333252, + "learning_rate": 1.8015000000000003e-05, + "loss": 0.4093, + "step": 7206 + }, + { + "epoch": 0.0901022525563139, + "grad_norm": 2.3710241317749023, + "learning_rate": 1.802e-05, + "loss": 0.5086, + "step": 7208 + }, + { + "epoch": 0.09012725318132954, + "grad_norm": 5.625446319580078, + "learning_rate": 1.8025e-05, + "loss": 1.1451, + "step": 7210 + }, + { + "epoch": 0.09015225380634516, + "grad_norm": 4.645512580871582, + "learning_rate": 1.8030000000000002e-05, + "loss": 0.7028, + "step": 7212 + }, + { + "epoch": 0.09017725443136078, + "grad_norm": 2.719827890396118, + "learning_rate": 1.8035000000000003e-05, + "loss": 0.5532, + "step": 7214 + }, + { + "epoch": 0.09020225505637641, + "grad_norm": 3.6153564453125, + "learning_rate": 1.8040000000000003e-05, + "loss": 1.1515, + "step": 7216 + }, + { + "epoch": 0.09022725568139203, + "grad_norm": 2.392167568206787, + "learning_rate": 1.8045e-05, + "loss": 0.4263, + "step": 7218 + }, + { + "epoch": 0.09025225630640767, + "grad_norm": 3.153127431869507, + "learning_rate": 1.805e-05, + "loss": 0.5597, + "step": 7220 + }, + { + "epoch": 0.09027725693142329, + "grad_norm": 4.960170269012451, + "learning_rate": 1.8055000000000002e-05, + "loss": 1.2591, + "step": 7222 + }, + { + "epoch": 0.0903022575564389, + "grad_norm": 5.848571300506592, + "learning_rate": 1.8060000000000003e-05, + "loss": 0.6375, + "step": 7224 + }, + { + "epoch": 0.09032725818145454, + "grad_norm": 2.5413076877593994, + "learning_rate": 1.8065e-05, + "loss": 0.1112, + "step": 7226 + }, + { + "epoch": 0.09035225880647016, + "grad_norm": 7.239835739135742, + "learning_rate": 1.807e-05, + "loss": 1.3335, + "step": 7228 + }, + { + "epoch": 0.09037725943148579, + "grad_norm": 3.800232172012329, + "learning_rate": 1.8075000000000002e-05, + "loss": 0.5193, + "step": 7230 + }, + { + "epoch": 0.09040226005650141, + "grad_norm": 3.7557413578033447, + "learning_rate": 1.8080000000000003e-05, + "loss": 2.0868, + "step": 7232 + }, + { + "epoch": 0.09042726068151703, + "grad_norm": 3.02622127532959, + "learning_rate": 1.8085e-05, + "loss": 0.9259, + "step": 7234 + }, + { + "epoch": 0.09045226130653267, + "grad_norm": 3.7502522468566895, + "learning_rate": 1.809e-05, + "loss": 1.9156, + "step": 7236 + }, + { + "epoch": 0.09047726193154829, + "grad_norm": 0.0066614230163395405, + "learning_rate": 1.8095000000000002e-05, + "loss": 0.2414, + "step": 7238 + }, + { + "epoch": 0.09050226255656392, + "grad_norm": 6.953485012054443, + "learning_rate": 1.8100000000000003e-05, + "loss": 0.5922, + "step": 7240 + }, + { + "epoch": 0.09052726318157954, + "grad_norm": 1.687473177909851, + "learning_rate": 1.8105e-05, + "loss": 0.8192, + "step": 7242 + }, + { + "epoch": 0.09055226380659516, + "grad_norm": 3.490320920944214, + "learning_rate": 1.811e-05, + "loss": 0.9199, + "step": 7244 + }, + { + "epoch": 0.0905772644316108, + "grad_norm": 3.20817232131958, + "learning_rate": 1.8115000000000002e-05, + "loss": 0.8775, + "step": 7246 + }, + { + "epoch": 0.09060226505662641, + "grad_norm": 4.3680830001831055, + "learning_rate": 1.8120000000000003e-05, + "loss": 0.9332, + "step": 7248 + }, + { + "epoch": 0.09062726568164205, + "grad_norm": 6.784231185913086, + "learning_rate": 1.8125e-05, + "loss": 2.0372, + "step": 7250 + }, + { + "epoch": 0.09065226630665767, + "grad_norm": 4.649172782897949, + "learning_rate": 1.813e-05, + "loss": 1.7248, + "step": 7252 + }, + { + "epoch": 0.09067726693167329, + "grad_norm": 4.808923721313477, + "learning_rate": 1.8135000000000002e-05, + "loss": 1.8012, + "step": 7254 + }, + { + "epoch": 0.09070226755668892, + "grad_norm": 4.717310428619385, + "learning_rate": 1.8140000000000003e-05, + "loss": 1.3274, + "step": 7256 + }, + { + "epoch": 0.09072726818170454, + "grad_norm": 2.1856751441955566, + "learning_rate": 1.8145e-05, + "loss": 0.2126, + "step": 7258 + }, + { + "epoch": 0.09075226880672017, + "grad_norm": 0.03693837672472, + "learning_rate": 1.815e-05, + "loss": 0.748, + "step": 7260 + }, + { + "epoch": 0.0907772694317358, + "grad_norm": 1.8379549980163574, + "learning_rate": 1.8155e-05, + "loss": 0.3048, + "step": 7262 + }, + { + "epoch": 0.09080227005675141, + "grad_norm": 0.08576467633247375, + "learning_rate": 1.8160000000000002e-05, + "loss": 0.4175, + "step": 7264 + }, + { + "epoch": 0.09082727068176705, + "grad_norm": 2.63741135597229, + "learning_rate": 1.8165000000000003e-05, + "loss": 0.6172, + "step": 7266 + }, + { + "epoch": 0.09085227130678267, + "grad_norm": 0.008612658828496933, + "learning_rate": 1.817e-05, + "loss": 0.8987, + "step": 7268 + }, + { + "epoch": 0.0908772719317983, + "grad_norm": 2.6574907302856445, + "learning_rate": 1.8175e-05, + "loss": 0.9215, + "step": 7270 + }, + { + "epoch": 0.09090227255681392, + "grad_norm": 3.2113940715789795, + "learning_rate": 1.8180000000000002e-05, + "loss": 1.7161, + "step": 7272 + }, + { + "epoch": 0.09092727318182954, + "grad_norm": 2.0340609550476074, + "learning_rate": 1.8185000000000003e-05, + "loss": 0.8464, + "step": 7274 + }, + { + "epoch": 0.09095227380684517, + "grad_norm": 4.705951690673828, + "learning_rate": 1.819e-05, + "loss": 2.3205, + "step": 7276 + }, + { + "epoch": 0.0909772744318608, + "grad_norm": 3.598632574081421, + "learning_rate": 1.8195e-05, + "loss": 0.5665, + "step": 7278 + }, + { + "epoch": 0.09100227505687643, + "grad_norm": 3.583378553390503, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.5574, + "step": 7280 + }, + { + "epoch": 0.09102727568189205, + "grad_norm": 3.0108277797698975, + "learning_rate": 1.8205000000000003e-05, + "loss": 1.1547, + "step": 7282 + }, + { + "epoch": 0.09105227630690767, + "grad_norm": 1.9125834703445435, + "learning_rate": 1.821e-05, + "loss": 1.0473, + "step": 7284 + }, + { + "epoch": 0.0910772769319233, + "grad_norm": 11.369312286376953, + "learning_rate": 1.8215e-05, + "loss": 1.4276, + "step": 7286 + }, + { + "epoch": 0.09110227755693892, + "grad_norm": 4.132571697235107, + "learning_rate": 1.8220000000000002e-05, + "loss": 1.3796, + "step": 7288 + }, + { + "epoch": 0.09112727818195455, + "grad_norm": 3.3533685207366943, + "learning_rate": 1.8225000000000003e-05, + "loss": 0.7608, + "step": 7290 + }, + { + "epoch": 0.09115227880697017, + "grad_norm": 4.147027492523193, + "learning_rate": 1.823e-05, + "loss": 1.4031, + "step": 7292 + }, + { + "epoch": 0.0911772794319858, + "grad_norm": 2.5724401473999023, + "learning_rate": 1.8235e-05, + "loss": 1.7975, + "step": 7294 + }, + { + "epoch": 0.09120228005700143, + "grad_norm": 2.3095457553863525, + "learning_rate": 1.824e-05, + "loss": 1.692, + "step": 7296 + }, + { + "epoch": 0.09122728068201705, + "grad_norm": 4.711038112640381, + "learning_rate": 1.8245000000000002e-05, + "loss": 2.1333, + "step": 7298 + }, + { + "epoch": 0.09125228130703268, + "grad_norm": 0.9201238751411438, + "learning_rate": 1.825e-05, + "loss": 0.9974, + "step": 7300 + }, + { + "epoch": 0.0912772819320483, + "grad_norm": 2.1912097930908203, + "learning_rate": 1.8255e-05, + "loss": 0.7841, + "step": 7302 + }, + { + "epoch": 0.09130228255706392, + "grad_norm": 4.838287830352783, + "learning_rate": 1.826e-05, + "loss": 2.0942, + "step": 7304 + }, + { + "epoch": 0.09132728318207955, + "grad_norm": 3.6654365062713623, + "learning_rate": 1.8265000000000002e-05, + "loss": 0.768, + "step": 7306 + }, + { + "epoch": 0.09135228380709517, + "grad_norm": 1.5118129253387451, + "learning_rate": 1.827e-05, + "loss": 0.6801, + "step": 7308 + }, + { + "epoch": 0.09137728443211081, + "grad_norm": 2.089829206466675, + "learning_rate": 1.8275e-05, + "loss": 0.4609, + "step": 7310 + }, + { + "epoch": 0.09140228505712643, + "grad_norm": 4.293592929840088, + "learning_rate": 1.828e-05, + "loss": 1.546, + "step": 7312 + }, + { + "epoch": 0.09142728568214205, + "grad_norm": 1.5661789178848267, + "learning_rate": 1.8285000000000002e-05, + "loss": 0.6947, + "step": 7314 + }, + { + "epoch": 0.09145228630715768, + "grad_norm": 4.562685489654541, + "learning_rate": 1.8290000000000003e-05, + "loss": 1.6811, + "step": 7316 + }, + { + "epoch": 0.0914772869321733, + "grad_norm": 0.013615457341074944, + "learning_rate": 1.8295e-05, + "loss": 0.2178, + "step": 7318 + }, + { + "epoch": 0.09150228755718894, + "grad_norm": 4.118705749511719, + "learning_rate": 1.83e-05, + "loss": 1.7319, + "step": 7320 + }, + { + "epoch": 0.09152728818220456, + "grad_norm": 2.2092502117156982, + "learning_rate": 1.8305000000000002e-05, + "loss": 1.929, + "step": 7322 + }, + { + "epoch": 0.09155228880722018, + "grad_norm": 0.02670171484351158, + "learning_rate": 1.8310000000000003e-05, + "loss": 0.598, + "step": 7324 + }, + { + "epoch": 0.09157728943223581, + "grad_norm": 2.7380521297454834, + "learning_rate": 1.8315e-05, + "loss": 0.6348, + "step": 7326 + }, + { + "epoch": 0.09160229005725143, + "grad_norm": 2.0815486907958984, + "learning_rate": 1.832e-05, + "loss": 0.1222, + "step": 7328 + }, + { + "epoch": 0.09162729068226706, + "grad_norm": 1.5216268301010132, + "learning_rate": 1.8325e-05, + "loss": 0.6044, + "step": 7330 + }, + { + "epoch": 0.09165229130728268, + "grad_norm": 1.5207717418670654, + "learning_rate": 1.8330000000000002e-05, + "loss": 0.0584, + "step": 7332 + }, + { + "epoch": 0.0916772919322983, + "grad_norm": 2.9309651851654053, + "learning_rate": 1.8335e-05, + "loss": 0.7949, + "step": 7334 + }, + { + "epoch": 0.09170229255731394, + "grad_norm": 4.016786575317383, + "learning_rate": 1.834e-05, + "loss": 1.588, + "step": 7336 + }, + { + "epoch": 0.09172729318232956, + "grad_norm": 5.383805274963379, + "learning_rate": 1.8345e-05, + "loss": 1.5445, + "step": 7338 + }, + { + "epoch": 0.09175229380734519, + "grad_norm": 3.4780964851379395, + "learning_rate": 1.8350000000000002e-05, + "loss": 1.5464, + "step": 7340 + }, + { + "epoch": 0.09177729443236081, + "grad_norm": 7.073495388031006, + "learning_rate": 1.8355e-05, + "loss": 0.8427, + "step": 7342 + }, + { + "epoch": 0.09180229505737643, + "grad_norm": 2.862239122390747, + "learning_rate": 1.8360000000000004e-05, + "loss": 0.8444, + "step": 7344 + }, + { + "epoch": 0.09182729568239206, + "grad_norm": 6.581540584564209, + "learning_rate": 1.8365e-05, + "loss": 0.416, + "step": 7346 + }, + { + "epoch": 0.09185229630740768, + "grad_norm": 17.116708755493164, + "learning_rate": 1.8370000000000002e-05, + "loss": 1.8795, + "step": 7348 + }, + { + "epoch": 0.09187729693242332, + "grad_norm": 3.039503335952759, + "learning_rate": 1.8375e-05, + "loss": 1.6706, + "step": 7350 + }, + { + "epoch": 0.09190229755743894, + "grad_norm": 2.826995372772217, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.9162, + "step": 7352 + }, + { + "epoch": 0.09192729818245456, + "grad_norm": 6.087180137634277, + "learning_rate": 1.8385e-05, + "loss": 0.6315, + "step": 7354 + }, + { + "epoch": 0.09195229880747019, + "grad_norm": 3.3867900371551514, + "learning_rate": 1.8390000000000002e-05, + "loss": 0.2651, + "step": 7356 + }, + { + "epoch": 0.09197729943248581, + "grad_norm": 2.38263201713562, + "learning_rate": 1.8395e-05, + "loss": 0.5687, + "step": 7358 + }, + { + "epoch": 0.09200230005750144, + "grad_norm": 4.282219886779785, + "learning_rate": 1.8400000000000003e-05, + "loss": 1.3136, + "step": 7360 + }, + { + "epoch": 0.09202730068251706, + "grad_norm": 3.5693626403808594, + "learning_rate": 1.8405e-05, + "loss": 1.6662, + "step": 7362 + }, + { + "epoch": 0.09205230130753268, + "grad_norm": 2.2848169803619385, + "learning_rate": 1.841e-05, + "loss": 2.373, + "step": 7364 + }, + { + "epoch": 0.09207730193254832, + "grad_norm": 2.5418736934661865, + "learning_rate": 1.8415e-05, + "loss": 0.6224, + "step": 7366 + }, + { + "epoch": 0.09210230255756394, + "grad_norm": 5.116671562194824, + "learning_rate": 1.8420000000000003e-05, + "loss": 2.0136, + "step": 7368 + }, + { + "epoch": 0.09212730318257957, + "grad_norm": 0.020658791065216064, + "learning_rate": 1.8425e-05, + "loss": 0.7607, + "step": 7370 + }, + { + "epoch": 0.09215230380759519, + "grad_norm": 3.7834224700927734, + "learning_rate": 1.843e-05, + "loss": 0.9137, + "step": 7372 + }, + { + "epoch": 0.09217730443261081, + "grad_norm": 2.2376420497894287, + "learning_rate": 1.8435000000000002e-05, + "loss": 1.1577, + "step": 7374 + }, + { + "epoch": 0.09220230505762644, + "grad_norm": 15.282563209533691, + "learning_rate": 1.8440000000000003e-05, + "loss": 0.4669, + "step": 7376 + }, + { + "epoch": 0.09222730568264206, + "grad_norm": 2.9012417793273926, + "learning_rate": 1.8445e-05, + "loss": 1.1985, + "step": 7378 + }, + { + "epoch": 0.0922523063076577, + "grad_norm": 2.0886240005493164, + "learning_rate": 1.845e-05, + "loss": 0.3362, + "step": 7380 + }, + { + "epoch": 0.09227730693267332, + "grad_norm": 3.258913040161133, + "learning_rate": 1.8455000000000002e-05, + "loss": 1.6197, + "step": 7382 + }, + { + "epoch": 0.09230230755768894, + "grad_norm": 6.108672618865967, + "learning_rate": 1.8460000000000003e-05, + "loss": 2.7855, + "step": 7384 + }, + { + "epoch": 0.09232730818270457, + "grad_norm": 3.4537887573242188, + "learning_rate": 1.8465e-05, + "loss": 0.9693, + "step": 7386 + }, + { + "epoch": 0.09235230880772019, + "grad_norm": 0.00641750218346715, + "learning_rate": 1.847e-05, + "loss": 0.0524, + "step": 7388 + }, + { + "epoch": 0.09237730943273582, + "grad_norm": 1.8725690841674805, + "learning_rate": 1.8475000000000002e-05, + "loss": 0.2557, + "step": 7390 + }, + { + "epoch": 0.09240231005775144, + "grad_norm": 3.1266160011291504, + "learning_rate": 1.8480000000000003e-05, + "loss": 1.1116, + "step": 7392 + }, + { + "epoch": 0.09242731068276706, + "grad_norm": 2.0850493907928467, + "learning_rate": 1.8485000000000003e-05, + "loss": 1.3768, + "step": 7394 + }, + { + "epoch": 0.0924523113077827, + "grad_norm": 4.74484920501709, + "learning_rate": 1.849e-05, + "loss": 0.8875, + "step": 7396 + }, + { + "epoch": 0.09247731193279832, + "grad_norm": 3.5392916202545166, + "learning_rate": 1.8495e-05, + "loss": 1.1773, + "step": 7398 + }, + { + "epoch": 0.09250231255781395, + "grad_norm": 1.0876781940460205, + "learning_rate": 1.8500000000000002e-05, + "loss": 0.9523, + "step": 7400 + }, + { + "epoch": 0.09252731318282957, + "grad_norm": 0.5844667553901672, + "learning_rate": 1.8505000000000003e-05, + "loss": 0.0343, + "step": 7402 + }, + { + "epoch": 0.09255231380784519, + "grad_norm": 2.3450918197631836, + "learning_rate": 1.851e-05, + "loss": 0.9766, + "step": 7404 + }, + { + "epoch": 0.09257731443286082, + "grad_norm": 5.2139058113098145, + "learning_rate": 1.8515e-05, + "loss": 2.0226, + "step": 7406 + }, + { + "epoch": 0.09260231505787644, + "grad_norm": 9.229046821594238, + "learning_rate": 1.8520000000000002e-05, + "loss": 1.2052, + "step": 7408 + }, + { + "epoch": 0.09262731568289208, + "grad_norm": 3.0706422328948975, + "learning_rate": 1.8525000000000003e-05, + "loss": 0.4572, + "step": 7410 + }, + { + "epoch": 0.0926523163079077, + "grad_norm": 3.0030322074890137, + "learning_rate": 1.853e-05, + "loss": 0.7653, + "step": 7412 + }, + { + "epoch": 0.09267731693292332, + "grad_norm": 3.0502119064331055, + "learning_rate": 1.8535e-05, + "loss": 1.4654, + "step": 7414 + }, + { + "epoch": 0.09270231755793895, + "grad_norm": 3.3217720985412598, + "learning_rate": 1.8540000000000002e-05, + "loss": 0.8242, + "step": 7416 + }, + { + "epoch": 0.09272731818295457, + "grad_norm": 3.3223674297332764, + "learning_rate": 1.8545000000000003e-05, + "loss": 0.9715, + "step": 7418 + }, + { + "epoch": 0.0927523188079702, + "grad_norm": 1.9316670894622803, + "learning_rate": 1.855e-05, + "loss": 1.4951, + "step": 7420 + }, + { + "epoch": 0.09277731943298582, + "grad_norm": 6.0837907791137695, + "learning_rate": 1.8555e-05, + "loss": 1.0079, + "step": 7422 + }, + { + "epoch": 0.09280232005800144, + "grad_norm": 4.404510498046875, + "learning_rate": 1.8560000000000002e-05, + "loss": 0.8738, + "step": 7424 + }, + { + "epoch": 0.09282732068301708, + "grad_norm": 0.05984283983707428, + "learning_rate": 1.8565000000000003e-05, + "loss": 1.0509, + "step": 7426 + }, + { + "epoch": 0.0928523213080327, + "grad_norm": 2.8642327785491943, + "learning_rate": 1.857e-05, + "loss": 1.7192, + "step": 7428 + }, + { + "epoch": 0.09287732193304833, + "grad_norm": 0.003344278782606125, + "learning_rate": 1.8575e-05, + "loss": 0.5479, + "step": 7430 + }, + { + "epoch": 0.09290232255806395, + "grad_norm": 6.15834379196167, + "learning_rate": 1.858e-05, + "loss": 0.4558, + "step": 7432 + }, + { + "epoch": 0.09292732318307957, + "grad_norm": 5.312638759613037, + "learning_rate": 1.8585000000000002e-05, + "loss": 1.2463, + "step": 7434 + }, + { + "epoch": 0.0929523238080952, + "grad_norm": 3.1258599758148193, + "learning_rate": 1.859e-05, + "loss": 0.7175, + "step": 7436 + }, + { + "epoch": 0.09297732443311083, + "grad_norm": 1.88680899143219, + "learning_rate": 1.8595e-05, + "loss": 0.9286, + "step": 7438 + }, + { + "epoch": 0.09300232505812646, + "grad_norm": 2.221731424331665, + "learning_rate": 1.86e-05, + "loss": 0.9305, + "step": 7440 + }, + { + "epoch": 0.09302732568314208, + "grad_norm": 0.003357339883223176, + "learning_rate": 1.8605000000000002e-05, + "loss": 0.8214, + "step": 7442 + }, + { + "epoch": 0.0930523263081577, + "grad_norm": 2.5689914226531982, + "learning_rate": 1.8610000000000003e-05, + "loss": 1.0945, + "step": 7444 + }, + { + "epoch": 0.09307732693317333, + "grad_norm": 3.421220064163208, + "learning_rate": 1.8615e-05, + "loss": 0.8689, + "step": 7446 + }, + { + "epoch": 0.09310232755818895, + "grad_norm": 2.7246334552764893, + "learning_rate": 1.862e-05, + "loss": 1.5878, + "step": 7448 + }, + { + "epoch": 0.09312732818320459, + "grad_norm": 4.652991771697998, + "learning_rate": 1.8625000000000002e-05, + "loss": 0.5652, + "step": 7450 + }, + { + "epoch": 0.0931523288082202, + "grad_norm": 3.8341705799102783, + "learning_rate": 1.8630000000000003e-05, + "loss": 0.965, + "step": 7452 + }, + { + "epoch": 0.09317732943323583, + "grad_norm": 0.001383114606142044, + "learning_rate": 1.8635e-05, + "loss": 0.9286, + "step": 7454 + }, + { + "epoch": 0.09320233005825146, + "grad_norm": 3.779796838760376, + "learning_rate": 1.864e-05, + "loss": 0.981, + "step": 7456 + }, + { + "epoch": 0.09322733068326708, + "grad_norm": 2.3749470710754395, + "learning_rate": 1.8645000000000002e-05, + "loss": 1.8654, + "step": 7458 + }, + { + "epoch": 0.09325233130828271, + "grad_norm": 2.8779420852661133, + "learning_rate": 1.8650000000000003e-05, + "loss": 1.1554, + "step": 7460 + }, + { + "epoch": 0.09327733193329833, + "grad_norm": 1.4495558738708496, + "learning_rate": 1.8655e-05, + "loss": 0.5064, + "step": 7462 + }, + { + "epoch": 0.09330233255831395, + "grad_norm": 0.008773625828325748, + "learning_rate": 1.866e-05, + "loss": 0.4705, + "step": 7464 + }, + { + "epoch": 0.09332733318332959, + "grad_norm": 4.946811676025391, + "learning_rate": 1.8665000000000002e-05, + "loss": 1.5627, + "step": 7466 + }, + { + "epoch": 0.0933523338083452, + "grad_norm": 3.117414712905884, + "learning_rate": 1.8670000000000003e-05, + "loss": 1.5664, + "step": 7468 + }, + { + "epoch": 0.09337733443336084, + "grad_norm": 4.422914505004883, + "learning_rate": 1.8675e-05, + "loss": 1.8213, + "step": 7470 + }, + { + "epoch": 0.09340233505837646, + "grad_norm": 2.612898826599121, + "learning_rate": 1.8680000000000004e-05, + "loss": 0.6538, + "step": 7472 + }, + { + "epoch": 0.09342733568339208, + "grad_norm": 3.2434656620025635, + "learning_rate": 1.8685e-05, + "loss": 0.6208, + "step": 7474 + }, + { + "epoch": 0.09345233630840771, + "grad_norm": 1.7950912714004517, + "learning_rate": 1.8690000000000002e-05, + "loss": 0.3116, + "step": 7476 + }, + { + "epoch": 0.09347733693342333, + "grad_norm": 3.5421667098999023, + "learning_rate": 1.8695e-05, + "loss": 1.101, + "step": 7478 + }, + { + "epoch": 0.09350233755843897, + "grad_norm": 2.9518299102783203, + "learning_rate": 1.8700000000000004e-05, + "loss": 2.5807, + "step": 7480 + }, + { + "epoch": 0.09352733818345459, + "grad_norm": 0.11761701107025146, + "learning_rate": 1.8705e-05, + "loss": 0.5412, + "step": 7482 + }, + { + "epoch": 0.0935523388084702, + "grad_norm": 3.157954216003418, + "learning_rate": 1.8710000000000002e-05, + "loss": 0.6748, + "step": 7484 + }, + { + "epoch": 0.09357733943348584, + "grad_norm": 0.36059504747390747, + "learning_rate": 1.8715e-05, + "loss": 0.0068, + "step": 7486 + }, + { + "epoch": 0.09360234005850146, + "grad_norm": 6.301931381225586, + "learning_rate": 1.8720000000000004e-05, + "loss": 0.2505, + "step": 7488 + }, + { + "epoch": 0.0936273406835171, + "grad_norm": 0.07603830099105835, + "learning_rate": 1.8725e-05, + "loss": 0.6905, + "step": 7490 + }, + { + "epoch": 0.09365234130853271, + "grad_norm": 2.5641860961914062, + "learning_rate": 1.8730000000000002e-05, + "loss": 0.4733, + "step": 7492 + }, + { + "epoch": 0.09367734193354833, + "grad_norm": 0.1363946944475174, + "learning_rate": 1.8735e-05, + "loss": 0.4422, + "step": 7494 + }, + { + "epoch": 0.09370234255856397, + "grad_norm": 3.0675930976867676, + "learning_rate": 1.8740000000000004e-05, + "loss": 1.3222, + "step": 7496 + }, + { + "epoch": 0.09372734318357959, + "grad_norm": 5.060628414154053, + "learning_rate": 1.8745e-05, + "loss": 1.0908, + "step": 7498 + }, + { + "epoch": 0.09375234380859522, + "grad_norm": 0.02449469082057476, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.5043, + "step": 7500 + }, + { + "epoch": 0.09377734443361084, + "grad_norm": 2.1408097743988037, + "learning_rate": 1.8755000000000003e-05, + "loss": 0.0575, + "step": 7502 + }, + { + "epoch": 0.09380234505862646, + "grad_norm": 4.321648597717285, + "learning_rate": 1.876e-05, + "loss": 1.4255, + "step": 7504 + }, + { + "epoch": 0.0938273456836421, + "grad_norm": 2.4968318939208984, + "learning_rate": 1.8765e-05, + "loss": 1.4011, + "step": 7506 + }, + { + "epoch": 0.09385234630865771, + "grad_norm": 0.007783310487866402, + "learning_rate": 1.877e-05, + "loss": 0.0546, + "step": 7508 + }, + { + "epoch": 0.09387734693367335, + "grad_norm": 4.203286170959473, + "learning_rate": 1.8775000000000002e-05, + "loss": 0.9326, + "step": 7510 + }, + { + "epoch": 0.09390234755868897, + "grad_norm": 6.174130439758301, + "learning_rate": 1.878e-05, + "loss": 1.4819, + "step": 7512 + }, + { + "epoch": 0.09392734818370459, + "grad_norm": 2.7457592487335205, + "learning_rate": 1.8785e-05, + "loss": 0.9456, + "step": 7514 + }, + { + "epoch": 0.09395234880872022, + "grad_norm": 5.191500663757324, + "learning_rate": 1.879e-05, + "loss": 0.343, + "step": 7516 + }, + { + "epoch": 0.09397734943373584, + "grad_norm": 3.7411093711853027, + "learning_rate": 1.8795000000000002e-05, + "loss": 0.8163, + "step": 7518 + }, + { + "epoch": 0.09400235005875147, + "grad_norm": 2.7787082195281982, + "learning_rate": 1.88e-05, + "loss": 0.4941, + "step": 7520 + }, + { + "epoch": 0.0940273506837671, + "grad_norm": 4.3063645362854, + "learning_rate": 1.8805000000000004e-05, + "loss": 1.4864, + "step": 7522 + }, + { + "epoch": 0.09405235130878271, + "grad_norm": 3.459005355834961, + "learning_rate": 1.881e-05, + "loss": 1.1833, + "step": 7524 + }, + { + "epoch": 0.09407735193379835, + "grad_norm": 1.3731977939605713, + "learning_rate": 1.8815000000000002e-05, + "loss": 0.2648, + "step": 7526 + }, + { + "epoch": 0.09410235255881397, + "grad_norm": 4.203587532043457, + "learning_rate": 1.882e-05, + "loss": 0.4159, + "step": 7528 + }, + { + "epoch": 0.0941273531838296, + "grad_norm": 3.571911573410034, + "learning_rate": 1.8825000000000004e-05, + "loss": 1.1803, + "step": 7530 + }, + { + "epoch": 0.09415235380884522, + "grad_norm": 3.2152633666992188, + "learning_rate": 1.883e-05, + "loss": 1.9895, + "step": 7532 + }, + { + "epoch": 0.09417735443386084, + "grad_norm": 5.33611536026001, + "learning_rate": 1.8835000000000002e-05, + "loss": 1.1097, + "step": 7534 + }, + { + "epoch": 0.09420235505887647, + "grad_norm": 0.18149419128894806, + "learning_rate": 1.884e-05, + "loss": 0.7404, + "step": 7536 + }, + { + "epoch": 0.0942273556838921, + "grad_norm": 1.2558737993240356, + "learning_rate": 1.8845000000000003e-05, + "loss": 0.9605, + "step": 7538 + }, + { + "epoch": 0.09425235630890773, + "grad_norm": 4.659029960632324, + "learning_rate": 1.885e-05, + "loss": 1.9855, + "step": 7540 + }, + { + "epoch": 0.09427735693392335, + "grad_norm": 1.420508623123169, + "learning_rate": 1.8855e-05, + "loss": 1.3749, + "step": 7542 + }, + { + "epoch": 0.09430235755893897, + "grad_norm": 6.219637870788574, + "learning_rate": 1.886e-05, + "loss": 1.276, + "step": 7544 + }, + { + "epoch": 0.0943273581839546, + "grad_norm": 2.3624019622802734, + "learning_rate": 1.8865000000000003e-05, + "loss": 1.73, + "step": 7546 + }, + { + "epoch": 0.09435235880897022, + "grad_norm": 3.4969468116760254, + "learning_rate": 1.887e-05, + "loss": 1.4531, + "step": 7548 + }, + { + "epoch": 0.09437735943398586, + "grad_norm": 5.580578804016113, + "learning_rate": 1.8875e-05, + "loss": 1.1763, + "step": 7550 + }, + { + "epoch": 0.09440236005900147, + "grad_norm": 5.779135704040527, + "learning_rate": 1.8880000000000002e-05, + "loss": 0.8537, + "step": 7552 + }, + { + "epoch": 0.0944273606840171, + "grad_norm": 4.012996196746826, + "learning_rate": 1.8885000000000003e-05, + "loss": 0.7979, + "step": 7554 + }, + { + "epoch": 0.09445236130903273, + "grad_norm": 0.13943222165107727, + "learning_rate": 1.889e-05, + "loss": 0.0018, + "step": 7556 + }, + { + "epoch": 0.09447736193404835, + "grad_norm": 1.267192006111145, + "learning_rate": 1.8895e-05, + "loss": 0.0654, + "step": 7558 + }, + { + "epoch": 0.09450236255906398, + "grad_norm": 4.284609794616699, + "learning_rate": 1.8900000000000002e-05, + "loss": 0.2015, + "step": 7560 + }, + { + "epoch": 0.0945273631840796, + "grad_norm": 2.2355213165283203, + "learning_rate": 1.8905000000000003e-05, + "loss": 0.7346, + "step": 7562 + }, + { + "epoch": 0.09455236380909522, + "grad_norm": 1.7393134832382202, + "learning_rate": 1.891e-05, + "loss": 0.5107, + "step": 7564 + }, + { + "epoch": 0.09457736443411086, + "grad_norm": 5.216782093048096, + "learning_rate": 1.8915e-05, + "loss": 2.1373, + "step": 7566 + }, + { + "epoch": 0.09460236505912648, + "grad_norm": 2.8003854751586914, + "learning_rate": 1.8920000000000002e-05, + "loss": 0.1931, + "step": 7568 + }, + { + "epoch": 0.09462736568414211, + "grad_norm": 3.574507474899292, + "learning_rate": 1.8925000000000003e-05, + "loss": 0.674, + "step": 7570 + }, + { + "epoch": 0.09465236630915773, + "grad_norm": 3.117415189743042, + "learning_rate": 1.893e-05, + "loss": 0.6425, + "step": 7572 + }, + { + "epoch": 0.09467736693417335, + "grad_norm": 3.1697428226470947, + "learning_rate": 1.8935e-05, + "loss": 1.4555, + "step": 7574 + }, + { + "epoch": 0.09470236755918898, + "grad_norm": 0.004976862110197544, + "learning_rate": 1.894e-05, + "loss": 0.3863, + "step": 7576 + }, + { + "epoch": 0.0947273681842046, + "grad_norm": 0.004180252086371183, + "learning_rate": 1.8945000000000002e-05, + "loss": 0.0924, + "step": 7578 + }, + { + "epoch": 0.09475236880922024, + "grad_norm": 0.0027886806055903435, + "learning_rate": 1.8950000000000003e-05, + "loss": 0.0223, + "step": 7580 + }, + { + "epoch": 0.09477736943423586, + "grad_norm": 5.033118724822998, + "learning_rate": 1.8955e-05, + "loss": 0.23, + "step": 7582 + }, + { + "epoch": 0.09480237005925148, + "grad_norm": 8.153624534606934, + "learning_rate": 1.896e-05, + "loss": 1.2441, + "step": 7584 + }, + { + "epoch": 0.09482737068426711, + "grad_norm": 10.908885955810547, + "learning_rate": 1.8965000000000002e-05, + "loss": 0.5634, + "step": 7586 + }, + { + "epoch": 0.09485237130928273, + "grad_norm": 3.401996612548828, + "learning_rate": 1.8970000000000003e-05, + "loss": 0.3802, + "step": 7588 + }, + { + "epoch": 0.09487737193429836, + "grad_norm": 0.18894007802009583, + "learning_rate": 1.8975e-05, + "loss": 1.1881, + "step": 7590 + }, + { + "epoch": 0.09490237255931398, + "grad_norm": 4.889155864715576, + "learning_rate": 1.898e-05, + "loss": 1.5889, + "step": 7592 + }, + { + "epoch": 0.0949273731843296, + "grad_norm": 1.9251434803009033, + "learning_rate": 1.8985000000000002e-05, + "loss": 1.5269, + "step": 7594 + }, + { + "epoch": 0.09495237380934524, + "grad_norm": 9.64488410949707, + "learning_rate": 1.8990000000000003e-05, + "loss": 0.5258, + "step": 7596 + }, + { + "epoch": 0.09497737443436086, + "grad_norm": 5.142972946166992, + "learning_rate": 1.8995e-05, + "loss": 1.25, + "step": 7598 + }, + { + "epoch": 0.09500237505937649, + "grad_norm": 2.8690240383148193, + "learning_rate": 1.9e-05, + "loss": 0.797, + "step": 7600 + }, + { + "epoch": 0.09502737568439211, + "grad_norm": 5.448661804199219, + "learning_rate": 1.9005000000000002e-05, + "loss": 0.8556, + "step": 7602 + }, + { + "epoch": 0.09505237630940773, + "grad_norm": 2.2899067401885986, + "learning_rate": 1.9010000000000003e-05, + "loss": 0.9664, + "step": 7604 + }, + { + "epoch": 0.09507737693442336, + "grad_norm": 0.0014353537699207664, + "learning_rate": 1.9015e-05, + "loss": 1.3, + "step": 7606 + }, + { + "epoch": 0.09510237755943898, + "grad_norm": 4.284374237060547, + "learning_rate": 1.902e-05, + "loss": 0.8757, + "step": 7608 + }, + { + "epoch": 0.09512737818445462, + "grad_norm": 3.2372963428497314, + "learning_rate": 1.9025e-05, + "loss": 0.6913, + "step": 7610 + }, + { + "epoch": 0.09515237880947024, + "grad_norm": 4.3558197021484375, + "learning_rate": 1.9030000000000002e-05, + "loss": 1.9357, + "step": 7612 + }, + { + "epoch": 0.09517737943448586, + "grad_norm": 1.9063012599945068, + "learning_rate": 1.9035e-05, + "loss": 0.7692, + "step": 7614 + }, + { + "epoch": 0.09520238005950149, + "grad_norm": 0.0045616282150149345, + "learning_rate": 1.904e-05, + "loss": 0.5892, + "step": 7616 + }, + { + "epoch": 0.09522738068451711, + "grad_norm": 2.8112783432006836, + "learning_rate": 1.9045e-05, + "loss": 1.0005, + "step": 7618 + }, + { + "epoch": 0.09525238130953274, + "grad_norm": 4.878450393676758, + "learning_rate": 1.9050000000000002e-05, + "loss": 1.2666, + "step": 7620 + }, + { + "epoch": 0.09527738193454836, + "grad_norm": 8.186452865600586, + "learning_rate": 1.9055e-05, + "loss": 0.8703, + "step": 7622 + }, + { + "epoch": 0.09530238255956398, + "grad_norm": 3.065434217453003, + "learning_rate": 1.906e-05, + "loss": 0.8894, + "step": 7624 + }, + { + "epoch": 0.09532738318457962, + "grad_norm": 3.3862829208374023, + "learning_rate": 1.9065e-05, + "loss": 1.0937, + "step": 7626 + }, + { + "epoch": 0.09535238380959524, + "grad_norm": 2.2722439765930176, + "learning_rate": 1.9070000000000002e-05, + "loss": 0.8636, + "step": 7628 + }, + { + "epoch": 0.09537738443461087, + "grad_norm": 0.0018316985806450248, + "learning_rate": 1.9075000000000003e-05, + "loss": 0.7609, + "step": 7630 + }, + { + "epoch": 0.09540238505962649, + "grad_norm": 3.1129024028778076, + "learning_rate": 1.908e-05, + "loss": 1.1929, + "step": 7632 + }, + { + "epoch": 0.09542738568464211, + "grad_norm": 2.4755778312683105, + "learning_rate": 1.9085e-05, + "loss": 1.1539, + "step": 7634 + }, + { + "epoch": 0.09545238630965774, + "grad_norm": 3.1887478828430176, + "learning_rate": 1.9090000000000002e-05, + "loss": 0.9579, + "step": 7636 + }, + { + "epoch": 0.09547738693467336, + "grad_norm": 3.032487630844116, + "learning_rate": 1.9095000000000003e-05, + "loss": 1.3283, + "step": 7638 + }, + { + "epoch": 0.095502387559689, + "grad_norm": 5.85750675201416, + "learning_rate": 1.91e-05, + "loss": 1.6002, + "step": 7640 + }, + { + "epoch": 0.09552738818470462, + "grad_norm": 0.47066187858581543, + "learning_rate": 1.9105e-05, + "loss": 1.4425, + "step": 7642 + }, + { + "epoch": 0.09555238880972024, + "grad_norm": 3.806187868118286, + "learning_rate": 1.911e-05, + "loss": 0.7088, + "step": 7644 + }, + { + "epoch": 0.09557738943473587, + "grad_norm": 3.1882104873657227, + "learning_rate": 1.9115000000000002e-05, + "loss": 1.1261, + "step": 7646 + }, + { + "epoch": 0.09560239005975149, + "grad_norm": 3.6026790142059326, + "learning_rate": 1.912e-05, + "loss": 0.9467, + "step": 7648 + }, + { + "epoch": 0.09562739068476712, + "grad_norm": 7.172318458557129, + "learning_rate": 1.9125000000000004e-05, + "loss": 2.4455, + "step": 7650 + }, + { + "epoch": 0.09565239130978274, + "grad_norm": 3.4549167156219482, + "learning_rate": 1.913e-05, + "loss": 1.5192, + "step": 7652 + }, + { + "epoch": 0.09567739193479836, + "grad_norm": 2.5169570446014404, + "learning_rate": 1.9135000000000002e-05, + "loss": 0.9563, + "step": 7654 + }, + { + "epoch": 0.095702392559814, + "grad_norm": 4.860182762145996, + "learning_rate": 1.914e-05, + "loss": 0.2967, + "step": 7656 + }, + { + "epoch": 0.09572739318482962, + "grad_norm": 3.844191312789917, + "learning_rate": 1.9145000000000004e-05, + "loss": 1.138, + "step": 7658 + }, + { + "epoch": 0.09575239380984525, + "grad_norm": 2.505347490310669, + "learning_rate": 1.915e-05, + "loss": 0.6387, + "step": 7660 + }, + { + "epoch": 0.09577739443486087, + "grad_norm": 5.616109848022461, + "learning_rate": 1.9155000000000002e-05, + "loss": 0.2388, + "step": 7662 + }, + { + "epoch": 0.09580239505987649, + "grad_norm": 2.52962589263916, + "learning_rate": 1.916e-05, + "loss": 1.0925, + "step": 7664 + }, + { + "epoch": 0.09582739568489212, + "grad_norm": 9.725438117980957, + "learning_rate": 1.9165000000000004e-05, + "loss": 0.2783, + "step": 7666 + }, + { + "epoch": 0.09585239630990774, + "grad_norm": 2.0876283645629883, + "learning_rate": 1.917e-05, + "loss": 0.9501, + "step": 7668 + }, + { + "epoch": 0.09587739693492338, + "grad_norm": 2.4799935817718506, + "learning_rate": 1.9175000000000002e-05, + "loss": 0.4783, + "step": 7670 + }, + { + "epoch": 0.095902397559939, + "grad_norm": 3.642183303833008, + "learning_rate": 1.918e-05, + "loss": 1.1454, + "step": 7672 + }, + { + "epoch": 0.09592739818495462, + "grad_norm": 3.9936113357543945, + "learning_rate": 1.9185000000000004e-05, + "loss": 1.4483, + "step": 7674 + }, + { + "epoch": 0.09595239880997025, + "grad_norm": 4.214056968688965, + "learning_rate": 1.919e-05, + "loss": 0.9154, + "step": 7676 + }, + { + "epoch": 0.09597739943498587, + "grad_norm": 0.07001835107803345, + "learning_rate": 1.9195000000000002e-05, + "loss": 0.6978, + "step": 7678 + }, + { + "epoch": 0.0960024000600015, + "grad_norm": 3.9665956497192383, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.2828, + "step": 7680 + }, + { + "epoch": 0.09602740068501713, + "grad_norm": 5.224057674407959, + "learning_rate": 1.9205000000000003e-05, + "loss": 2.4101, + "step": 7682 + }, + { + "epoch": 0.09605240131003275, + "grad_norm": 15.576433181762695, + "learning_rate": 1.921e-05, + "loss": 1.1529, + "step": 7684 + }, + { + "epoch": 0.09607740193504838, + "grad_norm": 6.238667011260986, + "learning_rate": 1.9215e-05, + "loss": 1.6594, + "step": 7686 + }, + { + "epoch": 0.096102402560064, + "grad_norm": 4.686971187591553, + "learning_rate": 1.9220000000000002e-05, + "loss": 1.6234, + "step": 7688 + }, + { + "epoch": 0.09612740318507963, + "grad_norm": 8.96406364440918, + "learning_rate": 1.9225000000000003e-05, + "loss": 0.4865, + "step": 7690 + }, + { + "epoch": 0.09615240381009525, + "grad_norm": 3.3160953521728516, + "learning_rate": 1.923e-05, + "loss": 1.129, + "step": 7692 + }, + { + "epoch": 0.09617740443511087, + "grad_norm": 2.947144031524658, + "learning_rate": 1.9235e-05, + "loss": 0.8521, + "step": 7694 + }, + { + "epoch": 0.0962024050601265, + "grad_norm": 2.5408172607421875, + "learning_rate": 1.9240000000000002e-05, + "loss": 0.494, + "step": 7696 + }, + { + "epoch": 0.09622740568514213, + "grad_norm": 4.661709308624268, + "learning_rate": 1.9245000000000003e-05, + "loss": 1.8901, + "step": 7698 + }, + { + "epoch": 0.09625240631015776, + "grad_norm": 3.0241386890411377, + "learning_rate": 1.925e-05, + "loss": 0.7478, + "step": 7700 + }, + { + "epoch": 0.09627740693517338, + "grad_norm": 6.86907434463501, + "learning_rate": 1.9255e-05, + "loss": 0.8103, + "step": 7702 + }, + { + "epoch": 0.096302407560189, + "grad_norm": 2.95585560798645, + "learning_rate": 1.9260000000000002e-05, + "loss": 0.5785, + "step": 7704 + }, + { + "epoch": 0.09632740818520463, + "grad_norm": 0.5774003863334656, + "learning_rate": 1.9265000000000003e-05, + "loss": 0.0826, + "step": 7706 + }, + { + "epoch": 0.09635240881022025, + "grad_norm": 2.7115793228149414, + "learning_rate": 1.9270000000000004e-05, + "loss": 0.5269, + "step": 7708 + }, + { + "epoch": 0.09637740943523589, + "grad_norm": 1.1631834506988525, + "learning_rate": 1.9275e-05, + "loss": 0.4722, + "step": 7710 + }, + { + "epoch": 0.0964024100602515, + "grad_norm": 4.086895942687988, + "learning_rate": 1.9280000000000002e-05, + "loss": 1.2343, + "step": 7712 + }, + { + "epoch": 0.09642741068526713, + "grad_norm": 8.161751747131348, + "learning_rate": 1.9285000000000003e-05, + "loss": 0.8686, + "step": 7714 + }, + { + "epoch": 0.09645241131028276, + "grad_norm": 0.3024827837944031, + "learning_rate": 1.9290000000000003e-05, + "loss": 0.0264, + "step": 7716 + }, + { + "epoch": 0.09647741193529838, + "grad_norm": 3.5698254108428955, + "learning_rate": 1.9295e-05, + "loss": 1.0428, + "step": 7718 + }, + { + "epoch": 0.09650241256031401, + "grad_norm": 5.4688310623168945, + "learning_rate": 1.93e-05, + "loss": 0.7313, + "step": 7720 + }, + { + "epoch": 0.09652741318532963, + "grad_norm": 8.113683700561523, + "learning_rate": 1.9305000000000002e-05, + "loss": 1.454, + "step": 7722 + }, + { + "epoch": 0.09655241381034525, + "grad_norm": 2.458911418914795, + "learning_rate": 1.9310000000000003e-05, + "loss": 0.5154, + "step": 7724 + }, + { + "epoch": 0.09657741443536089, + "grad_norm": 4.047362804412842, + "learning_rate": 1.9315e-05, + "loss": 0.8453, + "step": 7726 + }, + { + "epoch": 0.0966024150603765, + "grad_norm": 6.037619113922119, + "learning_rate": 1.932e-05, + "loss": 0.1432, + "step": 7728 + }, + { + "epoch": 0.09662741568539214, + "grad_norm": 6.592851161956787, + "learning_rate": 1.9325000000000002e-05, + "loss": 0.9002, + "step": 7730 + }, + { + "epoch": 0.09665241631040776, + "grad_norm": 2.8709516525268555, + "learning_rate": 1.9330000000000003e-05, + "loss": 1.2122, + "step": 7732 + }, + { + "epoch": 0.09667741693542338, + "grad_norm": 3.649367094039917, + "learning_rate": 1.9335e-05, + "loss": 1.1027, + "step": 7734 + }, + { + "epoch": 0.09670241756043901, + "grad_norm": 5.457020282745361, + "learning_rate": 1.934e-05, + "loss": 2.5013, + "step": 7736 + }, + { + "epoch": 0.09672741818545463, + "grad_norm": 2.518810749053955, + "learning_rate": 1.9345000000000002e-05, + "loss": 2.4495, + "step": 7738 + }, + { + "epoch": 0.09675241881047027, + "grad_norm": 4.555356979370117, + "learning_rate": 1.9350000000000003e-05, + "loss": 1.3273, + "step": 7740 + }, + { + "epoch": 0.09677741943548589, + "grad_norm": 2.267362356185913, + "learning_rate": 1.9355e-05, + "loss": 1.1801, + "step": 7742 + }, + { + "epoch": 0.0968024200605015, + "grad_norm": 4.478208065032959, + "learning_rate": 1.936e-05, + "loss": 1.5922, + "step": 7744 + }, + { + "epoch": 0.09682742068551714, + "grad_norm": 4.169719696044922, + "learning_rate": 1.9365000000000002e-05, + "loss": 1.7893, + "step": 7746 + }, + { + "epoch": 0.09685242131053276, + "grad_norm": 4.476062774658203, + "learning_rate": 1.9370000000000003e-05, + "loss": 1.1997, + "step": 7748 + }, + { + "epoch": 0.0968774219355484, + "grad_norm": 5.076711177825928, + "learning_rate": 1.9375e-05, + "loss": 0.394, + "step": 7750 + }, + { + "epoch": 0.09690242256056401, + "grad_norm": 0.206266388297081, + "learning_rate": 1.938e-05, + "loss": 0.6898, + "step": 7752 + }, + { + "epoch": 0.09692742318557963, + "grad_norm": 2.493471384048462, + "learning_rate": 1.9385e-05, + "loss": 1.6117, + "step": 7754 + }, + { + "epoch": 0.09695242381059527, + "grad_norm": 6.000421524047852, + "learning_rate": 1.9390000000000002e-05, + "loss": 1.3662, + "step": 7756 + }, + { + "epoch": 0.09697742443561089, + "grad_norm": 3.9724621772766113, + "learning_rate": 1.9395000000000003e-05, + "loss": 0.8181, + "step": 7758 + }, + { + "epoch": 0.09700242506062652, + "grad_norm": 4.1684675216674805, + "learning_rate": 1.94e-05, + "loss": 1.688, + "step": 7760 + }, + { + "epoch": 0.09702742568564214, + "grad_norm": 8.481614112854004, + "learning_rate": 1.9405e-05, + "loss": 2.7419, + "step": 7762 + }, + { + "epoch": 0.09705242631065776, + "grad_norm": 0.007944322191178799, + "learning_rate": 1.9410000000000002e-05, + "loss": 0.5237, + "step": 7764 + }, + { + "epoch": 0.0970774269356734, + "grad_norm": 2.4139697551727295, + "learning_rate": 1.9415000000000003e-05, + "loss": 0.3091, + "step": 7766 + }, + { + "epoch": 0.09710242756068901, + "grad_norm": 3.7150731086730957, + "learning_rate": 1.942e-05, + "loss": 0.4741, + "step": 7768 + }, + { + "epoch": 0.09712742818570465, + "grad_norm": 1.0404974222183228, + "learning_rate": 1.9425e-05, + "loss": 0.9302, + "step": 7770 + }, + { + "epoch": 0.09715242881072027, + "grad_norm": 2.9811949729919434, + "learning_rate": 1.9430000000000002e-05, + "loss": 1.4491, + "step": 7772 + }, + { + "epoch": 0.09717742943573589, + "grad_norm": 2.4782280921936035, + "learning_rate": 1.9435000000000003e-05, + "loss": 1.0535, + "step": 7774 + }, + { + "epoch": 0.09720243006075152, + "grad_norm": 2.8370635509490967, + "learning_rate": 1.944e-05, + "loss": 0.6564, + "step": 7776 + }, + { + "epoch": 0.09722743068576714, + "grad_norm": 11.218090057373047, + "learning_rate": 1.9445e-05, + "loss": 1.9466, + "step": 7778 + }, + { + "epoch": 0.09725243131078277, + "grad_norm": 3.346363067626953, + "learning_rate": 1.9450000000000002e-05, + "loss": 0.5397, + "step": 7780 + }, + { + "epoch": 0.0972774319357984, + "grad_norm": 0.006490341387689114, + "learning_rate": 1.9455000000000003e-05, + "loss": 0.6409, + "step": 7782 + }, + { + "epoch": 0.09730243256081401, + "grad_norm": 3.0582244396209717, + "learning_rate": 1.946e-05, + "loss": 1.1032, + "step": 7784 + }, + { + "epoch": 0.09732743318582965, + "grad_norm": 3.137807607650757, + "learning_rate": 1.9465e-05, + "loss": 1.4249, + "step": 7786 + }, + { + "epoch": 0.09735243381084527, + "grad_norm": 2.74932599067688, + "learning_rate": 1.947e-05, + "loss": 1.2403, + "step": 7788 + }, + { + "epoch": 0.0973774344358609, + "grad_norm": 6.498315334320068, + "learning_rate": 1.9475000000000002e-05, + "loss": 1.3609, + "step": 7790 + }, + { + "epoch": 0.09740243506087652, + "grad_norm": 4.259742736816406, + "learning_rate": 1.948e-05, + "loss": 2.5068, + "step": 7792 + }, + { + "epoch": 0.09742743568589214, + "grad_norm": 0.2525150775909424, + "learning_rate": 1.9485e-05, + "loss": 0.6002, + "step": 7794 + }, + { + "epoch": 0.09745243631090778, + "grad_norm": 0.34866219758987427, + "learning_rate": 1.949e-05, + "loss": 0.002, + "step": 7796 + }, + { + "epoch": 0.0974774369359234, + "grad_norm": 0.39500951766967773, + "learning_rate": 1.9495000000000002e-05, + "loss": 0.8652, + "step": 7798 + }, + { + "epoch": 0.09750243756093903, + "grad_norm": 1.6070013046264648, + "learning_rate": 1.95e-05, + "loss": 0.2406, + "step": 7800 + }, + { + "epoch": 0.09752743818595465, + "grad_norm": 5.360443115234375, + "learning_rate": 1.9505e-05, + "loss": 0.4507, + "step": 7802 + }, + { + "epoch": 0.09755243881097027, + "grad_norm": 3.465703010559082, + "learning_rate": 1.951e-05, + "loss": 0.8274, + "step": 7804 + }, + { + "epoch": 0.0975774394359859, + "grad_norm": 2.903323173522949, + "learning_rate": 1.9515000000000002e-05, + "loss": 0.9423, + "step": 7806 + }, + { + "epoch": 0.09760244006100152, + "grad_norm": 4.591175079345703, + "learning_rate": 1.9520000000000003e-05, + "loss": 1.4544, + "step": 7808 + }, + { + "epoch": 0.09762744068601716, + "grad_norm": 7.206733226776123, + "learning_rate": 1.9525e-05, + "loss": 0.991, + "step": 7810 + }, + { + "epoch": 0.09765244131103278, + "grad_norm": 0.002558272797614336, + "learning_rate": 1.953e-05, + "loss": 0.9552, + "step": 7812 + }, + { + "epoch": 0.0976774419360484, + "grad_norm": 6.74169921875, + "learning_rate": 1.9535000000000002e-05, + "loss": 1.521, + "step": 7814 + }, + { + "epoch": 0.09770244256106403, + "grad_norm": 2.6082043647766113, + "learning_rate": 1.9540000000000003e-05, + "loss": 1.8551, + "step": 7816 + }, + { + "epoch": 0.09772744318607965, + "grad_norm": 3.6584184169769287, + "learning_rate": 1.9545e-05, + "loss": 1.5457, + "step": 7818 + }, + { + "epoch": 0.09775244381109528, + "grad_norm": 2.552528142929077, + "learning_rate": 1.955e-05, + "loss": 1.2286, + "step": 7820 + }, + { + "epoch": 0.0977774444361109, + "grad_norm": 4.108335971832275, + "learning_rate": 1.9555e-05, + "loss": 1.4722, + "step": 7822 + }, + { + "epoch": 0.09780244506112652, + "grad_norm": 0.7046869397163391, + "learning_rate": 1.9560000000000002e-05, + "loss": 0.4713, + "step": 7824 + }, + { + "epoch": 0.09782744568614216, + "grad_norm": 2.710967779159546, + "learning_rate": 1.9565e-05, + "loss": 0.4255, + "step": 7826 + }, + { + "epoch": 0.09785244631115778, + "grad_norm": 0.9405058026313782, + "learning_rate": 1.957e-05, + "loss": 0.3695, + "step": 7828 + }, + { + "epoch": 0.09787744693617341, + "grad_norm": 2.840632200241089, + "learning_rate": 1.9575e-05, + "loss": 0.6991, + "step": 7830 + }, + { + "epoch": 0.09790244756118903, + "grad_norm": 3.4714813232421875, + "learning_rate": 1.9580000000000002e-05, + "loss": 1.322, + "step": 7832 + }, + { + "epoch": 0.09792744818620465, + "grad_norm": 2.9572553634643555, + "learning_rate": 1.9585e-05, + "loss": 0.8779, + "step": 7834 + }, + { + "epoch": 0.09795244881122028, + "grad_norm": 9.035497665405273, + "learning_rate": 1.9590000000000004e-05, + "loss": 0.9798, + "step": 7836 + }, + { + "epoch": 0.0979774494362359, + "grad_norm": 3.2541253566741943, + "learning_rate": 1.9595e-05, + "loss": 1.2099, + "step": 7838 + }, + { + "epoch": 0.09800245006125154, + "grad_norm": 2.783633232116699, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.4363, + "step": 7840 + }, + { + "epoch": 0.09802745068626716, + "grad_norm": 7.497677803039551, + "learning_rate": 1.9605e-05, + "loss": 0.3586, + "step": 7842 + }, + { + "epoch": 0.09805245131128278, + "grad_norm": 2.679466962814331, + "learning_rate": 1.9610000000000004e-05, + "loss": 1.0975, + "step": 7844 + }, + { + "epoch": 0.09807745193629841, + "grad_norm": 3.7799179553985596, + "learning_rate": 1.9615e-05, + "loss": 1.4021, + "step": 7846 + }, + { + "epoch": 0.09810245256131403, + "grad_norm": 3.047788381576538, + "learning_rate": 1.9620000000000002e-05, + "loss": 1.6133, + "step": 7848 + }, + { + "epoch": 0.09812745318632966, + "grad_norm": 0.048793110996484756, + "learning_rate": 1.9625e-05, + "loss": 0.7075, + "step": 7850 + }, + { + "epoch": 0.09815245381134528, + "grad_norm": 3.189464807510376, + "learning_rate": 1.9630000000000003e-05, + "loss": 2.1312, + "step": 7852 + }, + { + "epoch": 0.0981774544363609, + "grad_norm": 1.8037264347076416, + "learning_rate": 1.9635e-05, + "loss": 0.5349, + "step": 7854 + }, + { + "epoch": 0.09820245506137654, + "grad_norm": 2.711570978164673, + "learning_rate": 1.9640000000000002e-05, + "loss": 0.9109, + "step": 7856 + }, + { + "epoch": 0.09822745568639216, + "grad_norm": 4.16025972366333, + "learning_rate": 1.9645e-05, + "loss": 0.755, + "step": 7858 + }, + { + "epoch": 0.09825245631140779, + "grad_norm": 3.774561882019043, + "learning_rate": 1.9650000000000003e-05, + "loss": 1.0041, + "step": 7860 + }, + { + "epoch": 0.09827745693642341, + "grad_norm": 4.691653251647949, + "learning_rate": 1.9655e-05, + "loss": 2.409, + "step": 7862 + }, + { + "epoch": 0.09830245756143903, + "grad_norm": 4.31721305847168, + "learning_rate": 1.966e-05, + "loss": 1.4619, + "step": 7864 + }, + { + "epoch": 0.09832745818645466, + "grad_norm": 3.2996060848236084, + "learning_rate": 1.9665000000000002e-05, + "loss": 0.9649, + "step": 7866 + }, + { + "epoch": 0.09835245881147028, + "grad_norm": 2.9347598552703857, + "learning_rate": 1.9670000000000003e-05, + "loss": 0.7828, + "step": 7868 + }, + { + "epoch": 0.09837745943648592, + "grad_norm": 5.495727062225342, + "learning_rate": 1.9675e-05, + "loss": 1.405, + "step": 7870 + }, + { + "epoch": 0.09840246006150154, + "grad_norm": 2.1010444164276123, + "learning_rate": 1.968e-05, + "loss": 0.1109, + "step": 7872 + }, + { + "epoch": 0.09842746068651716, + "grad_norm": 0.0017733756685629487, + "learning_rate": 1.9685000000000002e-05, + "loss": 0.7692, + "step": 7874 + }, + { + "epoch": 0.09845246131153279, + "grad_norm": 1.9166542291641235, + "learning_rate": 1.9690000000000003e-05, + "loss": 0.623, + "step": 7876 + }, + { + "epoch": 0.09847746193654841, + "grad_norm": 2.4093613624572754, + "learning_rate": 1.9695e-05, + "loss": 0.544, + "step": 7878 + }, + { + "epoch": 0.09850246256156404, + "grad_norm": 2.0898542404174805, + "learning_rate": 1.97e-05, + "loss": 1.2975, + "step": 7880 + }, + { + "epoch": 0.09852746318657966, + "grad_norm": 3.435154676437378, + "learning_rate": 1.9705000000000002e-05, + "loss": 1.291, + "step": 7882 + }, + { + "epoch": 0.09855246381159528, + "grad_norm": 10.967841148376465, + "learning_rate": 1.9710000000000003e-05, + "loss": 0.7867, + "step": 7884 + }, + { + "epoch": 0.09857746443661092, + "grad_norm": 0.7086451649665833, + "learning_rate": 1.9715000000000004e-05, + "loss": 0.0664, + "step": 7886 + }, + { + "epoch": 0.09860246506162654, + "grad_norm": 3.506997585296631, + "learning_rate": 1.972e-05, + "loss": 1.7112, + "step": 7888 + }, + { + "epoch": 0.09862746568664217, + "grad_norm": 3.032409429550171, + "learning_rate": 1.9725000000000002e-05, + "loss": 0.5143, + "step": 7890 + }, + { + "epoch": 0.09865246631165779, + "grad_norm": 1.5072962045669556, + "learning_rate": 1.9730000000000003e-05, + "loss": 0.012, + "step": 7892 + }, + { + "epoch": 0.09867746693667341, + "grad_norm": 6.0130228996276855, + "learning_rate": 1.9735000000000003e-05, + "loss": 1.1933, + "step": 7894 + }, + { + "epoch": 0.09870246756168904, + "grad_norm": 6.698263645172119, + "learning_rate": 1.974e-05, + "loss": 2.2177, + "step": 7896 + }, + { + "epoch": 0.09872746818670466, + "grad_norm": 3.6361701488494873, + "learning_rate": 1.9745e-05, + "loss": 0.8565, + "step": 7898 + }, + { + "epoch": 0.0987524688117203, + "grad_norm": 2.101027250289917, + "learning_rate": 1.9750000000000002e-05, + "loss": 0.6759, + "step": 7900 + }, + { + "epoch": 0.09877746943673592, + "grad_norm": 0.3020719587802887, + "learning_rate": 1.9755000000000003e-05, + "loss": 0.2169, + "step": 7902 + }, + { + "epoch": 0.09880247006175154, + "grad_norm": 1.4123690128326416, + "learning_rate": 1.976e-05, + "loss": 1.3874, + "step": 7904 + }, + { + "epoch": 0.09882747068676717, + "grad_norm": 3.531574249267578, + "learning_rate": 1.9765e-05, + "loss": 1.9539, + "step": 7906 + }, + { + "epoch": 0.09885247131178279, + "grad_norm": 0.012321005575358868, + "learning_rate": 1.9770000000000002e-05, + "loss": 1.1088, + "step": 7908 + }, + { + "epoch": 0.09887747193679843, + "grad_norm": 4.588141441345215, + "learning_rate": 1.9775000000000003e-05, + "loss": 1.7127, + "step": 7910 + }, + { + "epoch": 0.09890247256181404, + "grad_norm": 11.221750259399414, + "learning_rate": 1.978e-05, + "loss": 1.011, + "step": 7912 + }, + { + "epoch": 0.09892747318682966, + "grad_norm": 2.3037028312683105, + "learning_rate": 1.9785e-05, + "loss": 0.8128, + "step": 7914 + }, + { + "epoch": 0.0989524738118453, + "grad_norm": 0.0014238911680877209, + "learning_rate": 1.9790000000000002e-05, + "loss": 0.5816, + "step": 7916 + }, + { + "epoch": 0.09897747443686092, + "grad_norm": 1.4200636148452759, + "learning_rate": 1.9795000000000003e-05, + "loss": 0.1005, + "step": 7918 + }, + { + "epoch": 0.09900247506187655, + "grad_norm": 2.8483071327209473, + "learning_rate": 1.98e-05, + "loss": 1.5027, + "step": 7920 + }, + { + "epoch": 0.09902747568689217, + "grad_norm": 1.4581599235534668, + "learning_rate": 1.9805e-05, + "loss": 0.4348, + "step": 7922 + }, + { + "epoch": 0.09905247631190779, + "grad_norm": 4.092288017272949, + "learning_rate": 1.9810000000000002e-05, + "loss": 0.6856, + "step": 7924 + }, + { + "epoch": 0.09907747693692343, + "grad_norm": 4.305578708648682, + "learning_rate": 1.9815000000000003e-05, + "loss": 1.7118, + "step": 7926 + }, + { + "epoch": 0.09910247756193905, + "grad_norm": 4.031336784362793, + "learning_rate": 1.982e-05, + "loss": 1.7645, + "step": 7928 + }, + { + "epoch": 0.09912747818695468, + "grad_norm": 3.0142030715942383, + "learning_rate": 1.9825e-05, + "loss": 0.5669, + "step": 7930 + }, + { + "epoch": 0.0991524788119703, + "grad_norm": 2.1241400241851807, + "learning_rate": 1.983e-05, + "loss": 1.3444, + "step": 7932 + }, + { + "epoch": 0.09917747943698592, + "grad_norm": 18.21384048461914, + "learning_rate": 1.9835000000000002e-05, + "loss": 1.2145, + "step": 7934 + }, + { + "epoch": 0.09920248006200155, + "grad_norm": 3.447161912918091, + "learning_rate": 1.9840000000000003e-05, + "loss": 1.3628, + "step": 7936 + }, + { + "epoch": 0.09922748068701717, + "grad_norm": 2.9885475635528564, + "learning_rate": 1.9845e-05, + "loss": 0.3414, + "step": 7938 + }, + { + "epoch": 0.0992524813120328, + "grad_norm": 0.12325410544872284, + "learning_rate": 1.985e-05, + "loss": 0.2063, + "step": 7940 + }, + { + "epoch": 0.09927748193704843, + "grad_norm": 11.61297607421875, + "learning_rate": 1.9855000000000002e-05, + "loss": 0.8875, + "step": 7942 + }, + { + "epoch": 0.09930248256206405, + "grad_norm": 0.0016759526915848255, + "learning_rate": 1.9860000000000003e-05, + "loss": 0.6872, + "step": 7944 + }, + { + "epoch": 0.09932748318707968, + "grad_norm": 3.9891409873962402, + "learning_rate": 1.9865e-05, + "loss": 0.5981, + "step": 7946 + }, + { + "epoch": 0.0993524838120953, + "grad_norm": 0.3659355640411377, + "learning_rate": 1.987e-05, + "loss": 0.0123, + "step": 7948 + }, + { + "epoch": 0.09937748443711093, + "grad_norm": 3.619765043258667, + "learning_rate": 1.9875000000000002e-05, + "loss": 1.4629, + "step": 7950 + }, + { + "epoch": 0.09940248506212655, + "grad_norm": 3.5174314975738525, + "learning_rate": 1.9880000000000003e-05, + "loss": 0.9546, + "step": 7952 + }, + { + "epoch": 0.09942748568714217, + "grad_norm": 2.737373113632202, + "learning_rate": 1.9885e-05, + "loss": 1.3168, + "step": 7954 + }, + { + "epoch": 0.0994524863121578, + "grad_norm": 3.006273031234741, + "learning_rate": 1.989e-05, + "loss": 1.3061, + "step": 7956 + }, + { + "epoch": 0.09947748693717343, + "grad_norm": 2.508347272872925, + "learning_rate": 1.9895000000000002e-05, + "loss": 0.1993, + "step": 7958 + }, + { + "epoch": 0.09950248756218906, + "grad_norm": 8.643576622009277, + "learning_rate": 1.9900000000000003e-05, + "loss": 0.2548, + "step": 7960 + }, + { + "epoch": 0.09952748818720468, + "grad_norm": 2.8535687923431396, + "learning_rate": 1.9905e-05, + "loss": 1.0987, + "step": 7962 + }, + { + "epoch": 0.0995524888122203, + "grad_norm": 8.837233543395996, + "learning_rate": 1.9910000000000004e-05, + "loss": 1.3947, + "step": 7964 + }, + { + "epoch": 0.09957748943723593, + "grad_norm": 5.500988483428955, + "learning_rate": 1.9915e-05, + "loss": 1.3545, + "step": 7966 + }, + { + "epoch": 0.09960249006225155, + "grad_norm": 3.872314214706421, + "learning_rate": 1.9920000000000002e-05, + "loss": 0.8148, + "step": 7968 + }, + { + "epoch": 0.09962749068726719, + "grad_norm": 5.618670463562012, + "learning_rate": 1.9925e-05, + "loss": 0.7187, + "step": 7970 + }, + { + "epoch": 0.0996524913122828, + "grad_norm": 2.7309186458587646, + "learning_rate": 1.9930000000000004e-05, + "loss": 0.6812, + "step": 7972 + }, + { + "epoch": 0.09967749193729843, + "grad_norm": 0.43622463941574097, + "learning_rate": 1.9935e-05, + "loss": 0.9676, + "step": 7974 + }, + { + "epoch": 0.09970249256231406, + "grad_norm": 2.6010663509368896, + "learning_rate": 1.9940000000000002e-05, + "loss": 1.1266, + "step": 7976 + }, + { + "epoch": 0.09972749318732968, + "grad_norm": 6.452625751495361, + "learning_rate": 1.9945e-05, + "loss": 0.7079, + "step": 7978 + }, + { + "epoch": 0.09975249381234531, + "grad_norm": 4.253120422363281, + "learning_rate": 1.9950000000000004e-05, + "loss": 1.2098, + "step": 7980 + }, + { + "epoch": 0.09977749443736093, + "grad_norm": 5.54668664932251, + "learning_rate": 1.9955e-05, + "loss": 1.2837, + "step": 7982 + }, + { + "epoch": 0.09980249506237655, + "grad_norm": 5.388639450073242, + "learning_rate": 1.9960000000000002e-05, + "loss": 1.3685, + "step": 7984 + }, + { + "epoch": 0.09982749568739219, + "grad_norm": 1.0548150539398193, + "learning_rate": 1.9965e-05, + "loss": 1.0825, + "step": 7986 + }, + { + "epoch": 0.0998524963124078, + "grad_norm": 3.6350648403167725, + "learning_rate": 1.9970000000000004e-05, + "loss": 1.7553, + "step": 7988 + }, + { + "epoch": 0.09987749693742344, + "grad_norm": 4.1388092041015625, + "learning_rate": 1.9975e-05, + "loss": 1.6741, + "step": 7990 + }, + { + "epoch": 0.09990249756243906, + "grad_norm": 2.9568333625793457, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.7871, + "step": 7992 + }, + { + "epoch": 0.09992749818745468, + "grad_norm": 2.94905424118042, + "learning_rate": 1.9985000000000003e-05, + "loss": 0.6298, + "step": 7994 + }, + { + "epoch": 0.09995249881247031, + "grad_norm": 6.019285202026367, + "learning_rate": 1.9990000000000003e-05, + "loss": 1.1837, + "step": 7996 + }, + { + "epoch": 0.09997749943748593, + "grad_norm": 3.154879093170166, + "learning_rate": 1.9995e-05, + "loss": 0.8255, + "step": 7998 + }, + { + "epoch": 0.10000250006250157, + "grad_norm": 3.764787197113037, + "learning_rate": 2e-05, + "loss": 1.6462, + "step": 8000 + }, + { + "epoch": 0.10002750068751719, + "grad_norm": 3.0508480072021484, + "learning_rate": 1.9999999961920707e-05, + "loss": 0.2101, + "step": 8002 + }, + { + "epoch": 0.10005250131253281, + "grad_norm": 3.338843822479248, + "learning_rate": 1.999999984768283e-05, + "loss": 0.7097, + "step": 8004 + }, + { + "epoch": 0.10007750193754844, + "grad_norm": 4.1988630294799805, + "learning_rate": 1.9999999657286365e-05, + "loss": 1.2551, + "step": 8006 + }, + { + "epoch": 0.10010250256256406, + "grad_norm": 4.308163166046143, + "learning_rate": 1.999999939073132e-05, + "loss": 1.8995, + "step": 8008 + }, + { + "epoch": 0.1001275031875797, + "grad_norm": 2.9408326148986816, + "learning_rate": 1.9999999048017694e-05, + "loss": 0.4713, + "step": 8010 + }, + { + "epoch": 0.10015250381259531, + "grad_norm": 2.76578426361084, + "learning_rate": 1.9999998629145488e-05, + "loss": 1.2574, + "step": 8012 + }, + { + "epoch": 0.10017750443761093, + "grad_norm": 3.372544050216675, + "learning_rate": 1.9999998134114704e-05, + "loss": 1.834, + "step": 8014 + }, + { + "epoch": 0.10020250506262657, + "grad_norm": 12.9149751663208, + "learning_rate": 1.999999756292535e-05, + "loss": 1.3849, + "step": 8016 + }, + { + "epoch": 0.10022750568764219, + "grad_norm": 3.746652603149414, + "learning_rate": 1.9999996915577432e-05, + "loss": 1.0749, + "step": 8018 + }, + { + "epoch": 0.10025250631265782, + "grad_norm": 5.06020975112915, + "learning_rate": 1.999999619207095e-05, + "loss": 1.3759, + "step": 8020 + }, + { + "epoch": 0.10027750693767344, + "grad_norm": 1.3324875831604004, + "learning_rate": 1.999999539240591e-05, + "loss": 0.1267, + "step": 8022 + }, + { + "epoch": 0.10030250756268906, + "grad_norm": 3.455148220062256, + "learning_rate": 1.9999994516582316e-05, + "loss": 0.8114, + "step": 8024 + }, + { + "epoch": 0.1003275081877047, + "grad_norm": 3.6409270763397217, + "learning_rate": 1.999999356460018e-05, + "loss": 1.4174, + "step": 8026 + }, + { + "epoch": 0.10035250881272031, + "grad_norm": 7.012474060058594, + "learning_rate": 1.9999992536459508e-05, + "loss": 1.022, + "step": 8028 + }, + { + "epoch": 0.10037750943773595, + "grad_norm": 5.7251505851745605, + "learning_rate": 1.999999143216031e-05, + "loss": 0.0576, + "step": 8030 + }, + { + "epoch": 0.10040251006275157, + "grad_norm": 2.810333013534546, + "learning_rate": 1.9999990251702587e-05, + "loss": 0.9927, + "step": 8032 + }, + { + "epoch": 0.10042751068776719, + "grad_norm": 4.67891263961792, + "learning_rate": 1.9999988995086353e-05, + "loss": 0.849, + "step": 8034 + }, + { + "epoch": 0.10045251131278282, + "grad_norm": 3.46364426612854, + "learning_rate": 1.999998766231162e-05, + "loss": 0.5231, + "step": 8036 + }, + { + "epoch": 0.10047751193779844, + "grad_norm": 4.1578803062438965, + "learning_rate": 1.9999986253378393e-05, + "loss": 1.982, + "step": 8038 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 11.4835844039917, + "learning_rate": 1.9999984768286684e-05, + "loss": 2.5392, + "step": 8040 + }, + { + "epoch": 0.1005275131878297, + "grad_norm": 1.7805320024490356, + "learning_rate": 1.9999983207036508e-05, + "loss": 0.299, + "step": 8042 + }, + { + "epoch": 0.10055251381284531, + "grad_norm": 2.1779944896698, + "learning_rate": 1.999998156962787e-05, + "loss": 1.1299, + "step": 8044 + }, + { + "epoch": 0.10057751443786095, + "grad_norm": 4.717772960662842, + "learning_rate": 1.9999979856060788e-05, + "loss": 0.2705, + "step": 8046 + }, + { + "epoch": 0.10060251506287657, + "grad_norm": 0.7329453825950623, + "learning_rate": 1.9999978066335275e-05, + "loss": 1.0982, + "step": 8048 + }, + { + "epoch": 0.1006275156878922, + "grad_norm": 5.8311052322387695, + "learning_rate": 1.9999976200451342e-05, + "loss": 0.9737, + "step": 8050 + }, + { + "epoch": 0.10065251631290782, + "grad_norm": 4.848895072937012, + "learning_rate": 1.9999974258409004e-05, + "loss": 1.7576, + "step": 8052 + }, + { + "epoch": 0.10067751693792344, + "grad_norm": 2.9377565383911133, + "learning_rate": 1.999997224020828e-05, + "loss": 1.0069, + "step": 8054 + }, + { + "epoch": 0.10070251756293908, + "grad_norm": 3.162132978439331, + "learning_rate": 1.9999970145849173e-05, + "loss": 1.0939, + "step": 8056 + }, + { + "epoch": 0.1007275181879547, + "grad_norm": 2.359502077102661, + "learning_rate": 1.9999967975331715e-05, + "loss": 0.8318, + "step": 8058 + }, + { + "epoch": 0.10075251881297033, + "grad_norm": 0.004967877641320229, + "learning_rate": 1.999996572865591e-05, + "loss": 0.5357, + "step": 8060 + }, + { + "epoch": 0.10077751943798595, + "grad_norm": 5.524488925933838, + "learning_rate": 1.9999963405821786e-05, + "loss": 1.2397, + "step": 8062 + }, + { + "epoch": 0.10080252006300157, + "grad_norm": 0.28692254424095154, + "learning_rate": 1.999996100682935e-05, + "loss": 0.1342, + "step": 8064 + }, + { + "epoch": 0.1008275206880172, + "grad_norm": 4.909961223602295, + "learning_rate": 1.999995853167863e-05, + "loss": 1.3865, + "step": 8066 + }, + { + "epoch": 0.10085252131303282, + "grad_norm": 3.2153987884521484, + "learning_rate": 1.9999955980369633e-05, + "loss": 0.7177, + "step": 8068 + }, + { + "epoch": 0.10087752193804846, + "grad_norm": 3.2160398960113525, + "learning_rate": 1.999995335290239e-05, + "loss": 0.8984, + "step": 8070 + }, + { + "epoch": 0.10090252256306408, + "grad_norm": 4.1196770668029785, + "learning_rate": 1.9999950649276915e-05, + "loss": 0.9433, + "step": 8072 + }, + { + "epoch": 0.1009275231880797, + "grad_norm": 2.966582775115967, + "learning_rate": 1.9999947869493232e-05, + "loss": 0.7722, + "step": 8074 + }, + { + "epoch": 0.10095252381309533, + "grad_norm": 0.17023535072803497, + "learning_rate": 1.999994501355136e-05, + "loss": 0.8039, + "step": 8076 + }, + { + "epoch": 0.10097752443811095, + "grad_norm": 4.037571907043457, + "learning_rate": 1.999994208145132e-05, + "loss": 1.0575, + "step": 8078 + }, + { + "epoch": 0.10100252506312658, + "grad_norm": 3.51625657081604, + "learning_rate": 1.999993907319313e-05, + "loss": 0.9053, + "step": 8080 + }, + { + "epoch": 0.1010275256881422, + "grad_norm": 2.3074820041656494, + "learning_rate": 1.9999935988776827e-05, + "loss": 0.1367, + "step": 8082 + }, + { + "epoch": 0.10105252631315782, + "grad_norm": 7.190342426300049, + "learning_rate": 1.999993282820242e-05, + "loss": 1.0369, + "step": 8084 + }, + { + "epoch": 0.10107752693817346, + "grad_norm": 1.916694164276123, + "learning_rate": 1.999992959146994e-05, + "loss": 0.4106, + "step": 8086 + }, + { + "epoch": 0.10110252756318908, + "grad_norm": 1.4093636274337769, + "learning_rate": 1.999992627857941e-05, + "loss": 1.073, + "step": 8088 + }, + { + "epoch": 0.10112752818820471, + "grad_norm": 35.23579788208008, + "learning_rate": 1.999992288953086e-05, + "loss": 0.8775, + "step": 8090 + }, + { + "epoch": 0.10115252881322033, + "grad_norm": 1.1284552812576294, + "learning_rate": 1.9999919424324306e-05, + "loss": 0.1336, + "step": 8092 + }, + { + "epoch": 0.10117752943823595, + "grad_norm": 6.227737903594971, + "learning_rate": 1.999991588295978e-05, + "loss": 0.7819, + "step": 8094 + }, + { + "epoch": 0.10120253006325158, + "grad_norm": 3.2738521099090576, + "learning_rate": 1.999991226543731e-05, + "loss": 1.7546, + "step": 8096 + }, + { + "epoch": 0.1012275306882672, + "grad_norm": 2.9501562118530273, + "learning_rate": 1.9999908571756923e-05, + "loss": 1.0521, + "step": 8098 + }, + { + "epoch": 0.10125253131328284, + "grad_norm": 2.3650662899017334, + "learning_rate": 1.9999904801918647e-05, + "loss": 1.0414, + "step": 8100 + }, + { + "epoch": 0.10127753193829846, + "grad_norm": 2.2370824813842773, + "learning_rate": 1.9999900955922507e-05, + "loss": 1.4847, + "step": 8102 + }, + { + "epoch": 0.10130253256331408, + "grad_norm": 2.697239637374878, + "learning_rate": 1.9999897033768537e-05, + "loss": 0.8197, + "step": 8104 + }, + { + "epoch": 0.10132753318832971, + "grad_norm": 0.007952556945383549, + "learning_rate": 1.9999893035456764e-05, + "loss": 0.0002, + "step": 8106 + }, + { + "epoch": 0.10135253381334533, + "grad_norm": 3.2981507778167725, + "learning_rate": 1.9999888960987226e-05, + "loss": 1.4916, + "step": 8108 + }, + { + "epoch": 0.10137753443836096, + "grad_norm": 3.911701202392578, + "learning_rate": 1.9999884810359942e-05, + "loss": 0.6269, + "step": 8110 + }, + { + "epoch": 0.10140253506337658, + "grad_norm": 2.998880386352539, + "learning_rate": 1.999988058357495e-05, + "loss": 0.5745, + "step": 8112 + }, + { + "epoch": 0.1014275356883922, + "grad_norm": 8.223182678222656, + "learning_rate": 1.999987628063228e-05, + "loss": 1.1755, + "step": 8114 + }, + { + "epoch": 0.10145253631340784, + "grad_norm": 2.5592076778411865, + "learning_rate": 1.999987190153197e-05, + "loss": 0.5437, + "step": 8116 + }, + { + "epoch": 0.10147753693842346, + "grad_norm": 3.602639675140381, + "learning_rate": 1.999986744627405e-05, + "loss": 1.3192, + "step": 8118 + }, + { + "epoch": 0.10150253756343909, + "grad_norm": 4.109521865844727, + "learning_rate": 1.999986291485855e-05, + "loss": 1.4021, + "step": 8120 + }, + { + "epoch": 0.10152753818845471, + "grad_norm": 4.885420799255371, + "learning_rate": 1.999985830728551e-05, + "loss": 1.1535, + "step": 8122 + }, + { + "epoch": 0.10155253881347033, + "grad_norm": 2.921337127685547, + "learning_rate": 1.9999853623554966e-05, + "loss": 1.1145, + "step": 8124 + }, + { + "epoch": 0.10157753943848596, + "grad_norm": 3.024106502532959, + "learning_rate": 1.999984886366695e-05, + "loss": 1.559, + "step": 8126 + }, + { + "epoch": 0.10160254006350158, + "grad_norm": 10.97314167022705, + "learning_rate": 1.9999844027621495e-05, + "loss": 1.5232, + "step": 8128 + }, + { + "epoch": 0.10162754068851722, + "grad_norm": 3.2761127948760986, + "learning_rate": 1.9999839115418645e-05, + "loss": 0.6371, + "step": 8130 + }, + { + "epoch": 0.10165254131353284, + "grad_norm": 3.0610830783843994, + "learning_rate": 1.9999834127058433e-05, + "loss": 1.7235, + "step": 8132 + }, + { + "epoch": 0.10167754193854846, + "grad_norm": 5.33901834487915, + "learning_rate": 1.99998290625409e-05, + "loss": 1.6958, + "step": 8134 + }, + { + "epoch": 0.10170254256356409, + "grad_norm": 5.151320457458496, + "learning_rate": 1.999982392186608e-05, + "loss": 0.7368, + "step": 8136 + }, + { + "epoch": 0.10172754318857971, + "grad_norm": 5.4452056884765625, + "learning_rate": 1.999981870503402e-05, + "loss": 1.4898, + "step": 8138 + }, + { + "epoch": 0.10175254381359534, + "grad_norm": 3.123826026916504, + "learning_rate": 1.999981341204475e-05, + "loss": 1.4763, + "step": 8140 + }, + { + "epoch": 0.10177754443861096, + "grad_norm": 2.548845052719116, + "learning_rate": 1.9999808042898315e-05, + "loss": 0.5069, + "step": 8142 + }, + { + "epoch": 0.10180254506362658, + "grad_norm": 2.0390918254852295, + "learning_rate": 1.999980259759476e-05, + "loss": 0.5888, + "step": 8144 + }, + { + "epoch": 0.10182754568864222, + "grad_norm": 7.041058540344238, + "learning_rate": 1.9999797076134123e-05, + "loss": 0.8615, + "step": 8146 + }, + { + "epoch": 0.10185254631365784, + "grad_norm": 2.296977996826172, + "learning_rate": 1.999979147851644e-05, + "loss": 0.6428, + "step": 8148 + }, + { + "epoch": 0.10187754693867347, + "grad_norm": 0.5198347568511963, + "learning_rate": 1.9999785804741765e-05, + "loss": 0.0782, + "step": 8150 + }, + { + "epoch": 0.10190254756368909, + "grad_norm": 3.899540662765503, + "learning_rate": 1.9999780054810133e-05, + "loss": 1.5424, + "step": 8152 + }, + { + "epoch": 0.10192754818870471, + "grad_norm": 4.117657661437988, + "learning_rate": 1.9999774228721593e-05, + "loss": 0.7702, + "step": 8154 + }, + { + "epoch": 0.10195254881372035, + "grad_norm": 5.040409088134766, + "learning_rate": 1.999976832647618e-05, + "loss": 0.7861, + "step": 8156 + }, + { + "epoch": 0.10197754943873596, + "grad_norm": 14.72226619720459, + "learning_rate": 1.999976234807395e-05, + "loss": 0.8687, + "step": 8158 + }, + { + "epoch": 0.1020025500637516, + "grad_norm": 0.6122305393218994, + "learning_rate": 1.9999756293514942e-05, + "loss": 0.2254, + "step": 8160 + }, + { + "epoch": 0.10202755068876722, + "grad_norm": 5.5875959396362305, + "learning_rate": 1.999975016279921e-05, + "loss": 1.5922, + "step": 8162 + }, + { + "epoch": 0.10205255131378284, + "grad_norm": 0.0332639105618, + "learning_rate": 1.9999743955926787e-05, + "loss": 1.3015, + "step": 8164 + }, + { + "epoch": 0.10207755193879847, + "grad_norm": 0.8400448560714722, + "learning_rate": 1.999973767289773e-05, + "loss": 0.2331, + "step": 8166 + }, + { + "epoch": 0.10210255256381409, + "grad_norm": 4.079097747802734, + "learning_rate": 1.999973131371209e-05, + "loss": 1.6476, + "step": 8168 + }, + { + "epoch": 0.10212755318882973, + "grad_norm": 2.986262798309326, + "learning_rate": 1.9999724878369907e-05, + "loss": 1.2911, + "step": 8170 + }, + { + "epoch": 0.10215255381384535, + "grad_norm": 6.445075035095215, + "learning_rate": 1.9999718366871234e-05, + "loss": 0.6288, + "step": 8172 + }, + { + "epoch": 0.10217755443886097, + "grad_norm": 5.558152675628662, + "learning_rate": 1.999971177921612e-05, + "loss": 0.3174, + "step": 8174 + }, + { + "epoch": 0.1022025550638766, + "grad_norm": 4.009613990783691, + "learning_rate": 1.9999705115404612e-05, + "loss": 1.3716, + "step": 8176 + }, + { + "epoch": 0.10222755568889222, + "grad_norm": 2.253377914428711, + "learning_rate": 1.9999698375436765e-05, + "loss": 0.5976, + "step": 8178 + }, + { + "epoch": 0.10225255631390785, + "grad_norm": 0.018307600170373917, + "learning_rate": 1.999969155931263e-05, + "loss": 0.9278, + "step": 8180 + }, + { + "epoch": 0.10227755693892347, + "grad_norm": 3.658968925476074, + "learning_rate": 1.999968466703226e-05, + "loss": 1.6696, + "step": 8182 + }, + { + "epoch": 0.10230255756393909, + "grad_norm": 3.415332317352295, + "learning_rate": 1.9999677698595703e-05, + "loss": 0.9962, + "step": 8184 + }, + { + "epoch": 0.10232755818895473, + "grad_norm": 5.971807956695557, + "learning_rate": 1.9999670654003018e-05, + "loss": 1.6694, + "step": 8186 + }, + { + "epoch": 0.10235255881397035, + "grad_norm": 2.3960914611816406, + "learning_rate": 1.9999663533254254e-05, + "loss": 0.7621, + "step": 8188 + }, + { + "epoch": 0.10237755943898598, + "grad_norm": 3.139497995376587, + "learning_rate": 1.9999656336349466e-05, + "loss": 1.0639, + "step": 8190 + }, + { + "epoch": 0.1024025600640016, + "grad_norm": 5.898971080780029, + "learning_rate": 1.9999649063288707e-05, + "loss": 0.6896, + "step": 8192 + }, + { + "epoch": 0.10242756068901722, + "grad_norm": 4.4636993408203125, + "learning_rate": 1.999964171407204e-05, + "loss": 1.328, + "step": 8194 + }, + { + "epoch": 0.10245256131403285, + "grad_norm": 5.691183090209961, + "learning_rate": 1.9999634288699512e-05, + "loss": 1.4077, + "step": 8196 + }, + { + "epoch": 0.10247756193904847, + "grad_norm": 7.095579624176025, + "learning_rate": 1.9999626787171186e-05, + "loss": 1.1191, + "step": 8198 + }, + { + "epoch": 0.1025025625640641, + "grad_norm": 3.924360752105713, + "learning_rate": 1.9999619209487117e-05, + "loss": 1.2706, + "step": 8200 + }, + { + "epoch": 0.10252756318907973, + "grad_norm": 2.8427627086639404, + "learning_rate": 1.9999611555647356e-05, + "loss": 0.4274, + "step": 8202 + }, + { + "epoch": 0.10255256381409535, + "grad_norm": 2.3014156818389893, + "learning_rate": 1.999960382565197e-05, + "loss": 0.6797, + "step": 8204 + }, + { + "epoch": 0.10257756443911098, + "grad_norm": 4.057212829589844, + "learning_rate": 1.999959601950102e-05, + "loss": 1.1257, + "step": 8206 + }, + { + "epoch": 0.1026025650641266, + "grad_norm": 1.8656758069992065, + "learning_rate": 1.9999588137194558e-05, + "loss": 0.591, + "step": 8208 + }, + { + "epoch": 0.10262756568914223, + "grad_norm": 2.6916122436523438, + "learning_rate": 1.999958017873265e-05, + "loss": 1.3483, + "step": 8210 + }, + { + "epoch": 0.10265256631415785, + "grad_norm": 63.582237243652344, + "learning_rate": 1.9999572144115345e-05, + "loss": 0.3989, + "step": 8212 + }, + { + "epoch": 0.10267756693917347, + "grad_norm": 2.9195070266723633, + "learning_rate": 1.999956403334272e-05, + "loss": 1.5604, + "step": 8214 + }, + { + "epoch": 0.1027025675641891, + "grad_norm": 0.28848546743392944, + "learning_rate": 1.9999555846414824e-05, + "loss": 0.6406, + "step": 8216 + }, + { + "epoch": 0.10272756818920473, + "grad_norm": 2.6175570487976074, + "learning_rate": 1.9999547583331732e-05, + "loss": 0.2566, + "step": 8218 + }, + { + "epoch": 0.10275256881422036, + "grad_norm": 1.0776827335357666, + "learning_rate": 1.9999539244093493e-05, + "loss": 0.5, + "step": 8220 + }, + { + "epoch": 0.10277756943923598, + "grad_norm": 4.2871880531311035, + "learning_rate": 1.999953082870018e-05, + "loss": 2.1375, + "step": 8222 + }, + { + "epoch": 0.1028025700642516, + "grad_norm": 6.943802356719971, + "learning_rate": 1.9999522337151853e-05, + "loss": 0.548, + "step": 8224 + }, + { + "epoch": 0.10282757068926723, + "grad_norm": 2.431964159011841, + "learning_rate": 1.999951376944858e-05, + "loss": 0.6921, + "step": 8226 + }, + { + "epoch": 0.10285257131428285, + "grad_norm": 0.10115398466587067, + "learning_rate": 1.9999505125590423e-05, + "loss": 0.0173, + "step": 8228 + }, + { + "epoch": 0.10287757193929849, + "grad_norm": 5.498225688934326, + "learning_rate": 1.9999496405577447e-05, + "loss": 1.554, + "step": 8230 + }, + { + "epoch": 0.10290257256431411, + "grad_norm": 3.18149471282959, + "learning_rate": 1.9999487609409724e-05, + "loss": 0.8476, + "step": 8232 + }, + { + "epoch": 0.10292757318932973, + "grad_norm": 7.023778915405273, + "learning_rate": 1.9999478737087316e-05, + "loss": 0.7562, + "step": 8234 + }, + { + "epoch": 0.10295257381434536, + "grad_norm": 2.692497491836548, + "learning_rate": 1.9999469788610292e-05, + "loss": 1.6896, + "step": 8236 + }, + { + "epoch": 0.10297757443936098, + "grad_norm": 2.9792182445526123, + "learning_rate": 1.999946076397872e-05, + "loss": 0.7629, + "step": 8238 + }, + { + "epoch": 0.10300257506437661, + "grad_norm": 4.602653503417969, + "learning_rate": 1.999945166319267e-05, + "loss": 1.9024, + "step": 8240 + }, + { + "epoch": 0.10302757568939223, + "grad_norm": 2.5463504791259766, + "learning_rate": 1.9999442486252208e-05, + "loss": 1.1795, + "step": 8242 + }, + { + "epoch": 0.10305257631440785, + "grad_norm": 6.120151042938232, + "learning_rate": 1.9999433233157407e-05, + "loss": 1.8247, + "step": 8244 + }, + { + "epoch": 0.10307757693942349, + "grad_norm": 3.735633611679077, + "learning_rate": 1.9999423903908336e-05, + "loss": 1.6733, + "step": 8246 + }, + { + "epoch": 0.10310257756443911, + "grad_norm": 4.027077674865723, + "learning_rate": 1.9999414498505068e-05, + "loss": 1.1849, + "step": 8248 + }, + { + "epoch": 0.10312757818945474, + "grad_norm": 3.5637199878692627, + "learning_rate": 1.9999405016947673e-05, + "loss": 0.1676, + "step": 8250 + }, + { + "epoch": 0.10315257881447036, + "grad_norm": 3.2493999004364014, + "learning_rate": 1.9999395459236222e-05, + "loss": 0.941, + "step": 8252 + }, + { + "epoch": 0.10317757943948598, + "grad_norm": 3.269446849822998, + "learning_rate": 1.999938582537079e-05, + "loss": 1.14, + "step": 8254 + }, + { + "epoch": 0.10320258006450161, + "grad_norm": 5.696062088012695, + "learning_rate": 1.999937611535145e-05, + "loss": 1.5541, + "step": 8256 + }, + { + "epoch": 0.10322758068951723, + "grad_norm": 0.10382885485887527, + "learning_rate": 1.9999366329178275e-05, + "loss": 0.1467, + "step": 8258 + }, + { + "epoch": 0.10325258131453287, + "grad_norm": 3.6778533458709717, + "learning_rate": 1.999935646685134e-05, + "loss": 0.9893, + "step": 8260 + }, + { + "epoch": 0.10327758193954849, + "grad_norm": 2.657210350036621, + "learning_rate": 1.9999346528370722e-05, + "loss": 0.5717, + "step": 8262 + }, + { + "epoch": 0.10330258256456411, + "grad_norm": 4.253978729248047, + "learning_rate": 1.9999336513736492e-05, + "loss": 2.284, + "step": 8264 + }, + { + "epoch": 0.10332758318957974, + "grad_norm": 3.3748433589935303, + "learning_rate": 1.9999326422948732e-05, + "loss": 0.7414, + "step": 8266 + }, + { + "epoch": 0.10335258381459536, + "grad_norm": 1.1403871774673462, + "learning_rate": 1.9999316256007516e-05, + "loss": 0.5989, + "step": 8268 + }, + { + "epoch": 0.103377584439611, + "grad_norm": 1.7944897413253784, + "learning_rate": 1.999930601291292e-05, + "loss": 0.9054, + "step": 8270 + }, + { + "epoch": 0.10340258506462661, + "grad_norm": 5.714563369750977, + "learning_rate": 1.999929569366502e-05, + "loss": 1.7851, + "step": 8272 + }, + { + "epoch": 0.10342758568964223, + "grad_norm": 3.1799912452697754, + "learning_rate": 1.99992852982639e-05, + "loss": 0.5667, + "step": 8274 + }, + { + "epoch": 0.10345258631465787, + "grad_norm": 9.851259231567383, + "learning_rate": 1.9999274826709644e-05, + "loss": 1.3042, + "step": 8276 + }, + { + "epoch": 0.10347758693967349, + "grad_norm": 4.712583065032959, + "learning_rate": 1.9999264279002317e-05, + "loss": 2.2779, + "step": 8278 + }, + { + "epoch": 0.10350258756468912, + "grad_norm": 4.274397850036621, + "learning_rate": 1.999925365514201e-05, + "loss": 1.3037, + "step": 8280 + }, + { + "epoch": 0.10352758818970474, + "grad_norm": 3.6049561500549316, + "learning_rate": 1.99992429551288e-05, + "loss": 1.1061, + "step": 8282 + }, + { + "epoch": 0.10355258881472036, + "grad_norm": 1.1000769138336182, + "learning_rate": 1.9999232178962772e-05, + "loss": 0.7683, + "step": 8284 + }, + { + "epoch": 0.103577589439736, + "grad_norm": 2.2200210094451904, + "learning_rate": 1.9999221326644003e-05, + "loss": 1.0216, + "step": 8286 + }, + { + "epoch": 0.10360259006475162, + "grad_norm": 0.025302298367023468, + "learning_rate": 1.999921039817258e-05, + "loss": 1.0747, + "step": 8288 + }, + { + "epoch": 0.10362759068976725, + "grad_norm": 6.226258277893066, + "learning_rate": 1.9999199393548587e-05, + "loss": 1.813, + "step": 8290 + }, + { + "epoch": 0.10365259131478287, + "grad_norm": 2.033273696899414, + "learning_rate": 1.9999188312772103e-05, + "loss": 0.6304, + "step": 8292 + }, + { + "epoch": 0.10367759193979849, + "grad_norm": 0.11689481139183044, + "learning_rate": 1.9999177155843214e-05, + "loss": 0.0292, + "step": 8294 + }, + { + "epoch": 0.10370259256481412, + "grad_norm": 4.131185531616211, + "learning_rate": 1.9999165922762007e-05, + "loss": 0.9747, + "step": 8296 + }, + { + "epoch": 0.10372759318982974, + "grad_norm": 4.068981170654297, + "learning_rate": 1.9999154613528567e-05, + "loss": 0.593, + "step": 8298 + }, + { + "epoch": 0.10375259381484538, + "grad_norm": 4.408660888671875, + "learning_rate": 1.9999143228142975e-05, + "loss": 1.185, + "step": 8300 + }, + { + "epoch": 0.103777594439861, + "grad_norm": 0.5668891668319702, + "learning_rate": 1.9999131766605327e-05, + "loss": 0.2274, + "step": 8302 + }, + { + "epoch": 0.10380259506487662, + "grad_norm": 6.984584808349609, + "learning_rate": 1.9999120228915703e-05, + "loss": 1.6497, + "step": 8304 + }, + { + "epoch": 0.10382759568989225, + "grad_norm": 4.324365615844727, + "learning_rate": 1.9999108615074196e-05, + "loss": 1.7075, + "step": 8306 + }, + { + "epoch": 0.10385259631490787, + "grad_norm": 2.7971420288085938, + "learning_rate": 1.999909692508089e-05, + "loss": 1.128, + "step": 8308 + }, + { + "epoch": 0.1038775969399235, + "grad_norm": 3.7987613677978516, + "learning_rate": 1.9999085158935875e-05, + "loss": 1.1946, + "step": 8310 + }, + { + "epoch": 0.10390259756493912, + "grad_norm": 2.831923246383667, + "learning_rate": 1.9999073316639245e-05, + "loss": 0.4453, + "step": 8312 + }, + { + "epoch": 0.10392759818995474, + "grad_norm": 3.41990327835083, + "learning_rate": 1.9999061398191083e-05, + "loss": 0.9134, + "step": 8314 + }, + { + "epoch": 0.10395259881497038, + "grad_norm": 4.109755992889404, + "learning_rate": 1.9999049403591484e-05, + "loss": 1.5121, + "step": 8316 + }, + { + "epoch": 0.103977599439986, + "grad_norm": 2.230991840362549, + "learning_rate": 1.9999037332840537e-05, + "loss": 0.8564, + "step": 8318 + }, + { + "epoch": 0.10400260006500163, + "grad_norm": 3.2986786365509033, + "learning_rate": 1.999902518593834e-05, + "loss": 0.9423, + "step": 8320 + }, + { + "epoch": 0.10402760069001725, + "grad_norm": 4.635247230529785, + "learning_rate": 1.999901296288498e-05, + "loss": 1.239, + "step": 8322 + }, + { + "epoch": 0.10405260131503287, + "grad_norm": 4.297941207885742, + "learning_rate": 1.999900066368055e-05, + "loss": 0.7815, + "step": 8324 + }, + { + "epoch": 0.1040776019400485, + "grad_norm": 2.313415765762329, + "learning_rate": 1.9998988288325143e-05, + "loss": 0.7331, + "step": 8326 + }, + { + "epoch": 0.10410260256506412, + "grad_norm": 1.751887559890747, + "learning_rate": 1.999897583681886e-05, + "loss": 0.9038, + "step": 8328 + }, + { + "epoch": 0.10412760319007976, + "grad_norm": 1.832931637763977, + "learning_rate": 1.9998963309161786e-05, + "loss": 0.8931, + "step": 8330 + }, + { + "epoch": 0.10415260381509538, + "grad_norm": 3.3666555881500244, + "learning_rate": 1.9998950705354025e-05, + "loss": 0.4009, + "step": 8332 + }, + { + "epoch": 0.104177604440111, + "grad_norm": 3.3679981231689453, + "learning_rate": 1.9998938025395667e-05, + "loss": 1.2638, + "step": 8334 + }, + { + "epoch": 0.10420260506512663, + "grad_norm": 0.08637663722038269, + "learning_rate": 1.999892526928681e-05, + "loss": 0.6402, + "step": 8336 + }, + { + "epoch": 0.10422760569014225, + "grad_norm": 6.5320634841918945, + "learning_rate": 1.999891243702756e-05, + "loss": 0.6175, + "step": 8338 + }, + { + "epoch": 0.10425260631515788, + "grad_norm": 2.3478336334228516, + "learning_rate": 1.9998899528618e-05, + "loss": 0.9355, + "step": 8340 + }, + { + "epoch": 0.1042776069401735, + "grad_norm": 3.3978593349456787, + "learning_rate": 1.9998886544058235e-05, + "loss": 1.0972, + "step": 8342 + }, + { + "epoch": 0.10430260756518912, + "grad_norm": 6.152519702911377, + "learning_rate": 1.9998873483348366e-05, + "loss": 1.2619, + "step": 8344 + }, + { + "epoch": 0.10432760819020476, + "grad_norm": 0.020336223766207695, + "learning_rate": 1.999886034648849e-05, + "loss": 0.5153, + "step": 8346 + }, + { + "epoch": 0.10435260881522038, + "grad_norm": 2.1210224628448486, + "learning_rate": 1.9998847133478705e-05, + "loss": 0.6534, + "step": 8348 + }, + { + "epoch": 0.10437760944023601, + "grad_norm": 3.28605580329895, + "learning_rate": 1.9998833844319117e-05, + "loss": 0.6476, + "step": 8350 + }, + { + "epoch": 0.10440261006525163, + "grad_norm": 5.421499252319336, + "learning_rate": 1.9998820479009824e-05, + "loss": 0.1615, + "step": 8352 + }, + { + "epoch": 0.10442761069026725, + "grad_norm": 0.6191707849502563, + "learning_rate": 1.9998807037550927e-05, + "loss": 0.7635, + "step": 8354 + }, + { + "epoch": 0.10445261131528288, + "grad_norm": 4.281023979187012, + "learning_rate": 1.999879351994253e-05, + "loss": 0.9406, + "step": 8356 + }, + { + "epoch": 0.1044776119402985, + "grad_norm": 0.41516467928886414, + "learning_rate": 1.999877992618474e-05, + "loss": 0.0727, + "step": 8358 + }, + { + "epoch": 0.10450261256531414, + "grad_norm": 6.506966590881348, + "learning_rate": 1.999876625627765e-05, + "loss": 0.3162, + "step": 8360 + }, + { + "epoch": 0.10452761319032976, + "grad_norm": 0.0059335133992135525, + "learning_rate": 1.9998752510221373e-05, + "loss": 0.4288, + "step": 8362 + }, + { + "epoch": 0.10455261381534538, + "grad_norm": 5.68477201461792, + "learning_rate": 1.999873868801601e-05, + "loss": 1.1847, + "step": 8364 + }, + { + "epoch": 0.10457761444036101, + "grad_norm": 0.6466432809829712, + "learning_rate": 1.999872478966167e-05, + "loss": 0.6078, + "step": 8366 + }, + { + "epoch": 0.10460261506537663, + "grad_norm": 4.63156270980835, + "learning_rate": 1.999871081515845e-05, + "loss": 1.0999, + "step": 8368 + }, + { + "epoch": 0.10462761569039226, + "grad_norm": 2.88346791267395, + "learning_rate": 1.9998696764506467e-05, + "loss": 1.0257, + "step": 8370 + }, + { + "epoch": 0.10465261631540788, + "grad_norm": 4.811205863952637, + "learning_rate": 1.9998682637705823e-05, + "loss": 0.8324, + "step": 8372 + }, + { + "epoch": 0.1046776169404235, + "grad_norm": 0.22621853649616241, + "learning_rate": 1.9998668434756626e-05, + "loss": 0.9462, + "step": 8374 + }, + { + "epoch": 0.10470261756543914, + "grad_norm": 6.167718887329102, + "learning_rate": 1.999865415565898e-05, + "loss": 0.6207, + "step": 8376 + }, + { + "epoch": 0.10472761819045476, + "grad_norm": 4.111562252044678, + "learning_rate": 1.9998639800413e-05, + "loss": 1.0324, + "step": 8378 + }, + { + "epoch": 0.10475261881547039, + "grad_norm": 3.1849746704101562, + "learning_rate": 1.9998625369018798e-05, + "loss": 1.3209, + "step": 8380 + }, + { + "epoch": 0.10477761944048601, + "grad_norm": 6.593410491943359, + "learning_rate": 1.9998610861476474e-05, + "loss": 1.1689, + "step": 8382 + }, + { + "epoch": 0.10480262006550163, + "grad_norm": 3.9233651161193848, + "learning_rate": 1.9998596277786147e-05, + "loss": 1.3948, + "step": 8384 + }, + { + "epoch": 0.10482762069051726, + "grad_norm": 3.5545947551727295, + "learning_rate": 1.999858161794792e-05, + "loss": 0.8752, + "step": 8386 + }, + { + "epoch": 0.10485262131553288, + "grad_norm": 2.224771738052368, + "learning_rate": 1.9998566881961915e-05, + "loss": 0.1985, + "step": 8388 + }, + { + "epoch": 0.10487762194054852, + "grad_norm": 3.387538433074951, + "learning_rate": 1.9998552069828235e-05, + "loss": 0.9713, + "step": 8390 + }, + { + "epoch": 0.10490262256556414, + "grad_norm": 0.0029336584266275167, + "learning_rate": 1.9998537181546997e-05, + "loss": 0.7239, + "step": 8392 + }, + { + "epoch": 0.10492762319057976, + "grad_norm": 5.790914058685303, + "learning_rate": 1.9998522217118314e-05, + "loss": 1.3732, + "step": 8394 + }, + { + "epoch": 0.10495262381559539, + "grad_norm": 5.569971561431885, + "learning_rate": 1.9998507176542297e-05, + "loss": 1.2761, + "step": 8396 + }, + { + "epoch": 0.10497762444061101, + "grad_norm": 3.958761692047119, + "learning_rate": 1.9998492059819066e-05, + "loss": 2.159, + "step": 8398 + }, + { + "epoch": 0.10500262506562665, + "grad_norm": 4.659613132476807, + "learning_rate": 1.9998476866948734e-05, + "loss": 1.6248, + "step": 8400 + }, + { + "epoch": 0.10502762569064227, + "grad_norm": 4.550838947296143, + "learning_rate": 1.9998461597931415e-05, + "loss": 1.0829, + "step": 8402 + }, + { + "epoch": 0.10505262631565788, + "grad_norm": 6.740553379058838, + "learning_rate": 1.9998446252767226e-05, + "loss": 0.9246, + "step": 8404 + }, + { + "epoch": 0.10507762694067352, + "grad_norm": 3.752078056335449, + "learning_rate": 1.9998430831456286e-05, + "loss": 1.3665, + "step": 8406 + }, + { + "epoch": 0.10510262756568914, + "grad_norm": 5.0727152824401855, + "learning_rate": 1.999841533399871e-05, + "loss": 0.7314, + "step": 8408 + }, + { + "epoch": 0.10512762819070477, + "grad_norm": 2.619202136993408, + "learning_rate": 1.9998399760394615e-05, + "loss": 0.5299, + "step": 8410 + }, + { + "epoch": 0.10515262881572039, + "grad_norm": 2.8695895671844482, + "learning_rate": 1.9998384110644124e-05, + "loss": 0.9455, + "step": 8412 + }, + { + "epoch": 0.10517762944073601, + "grad_norm": 0.005487400572746992, + "learning_rate": 1.9998368384747353e-05, + "loss": 0.5799, + "step": 8414 + }, + { + "epoch": 0.10520263006575165, + "grad_norm": 3.026838541030884, + "learning_rate": 1.999835258270442e-05, + "loss": 0.4535, + "step": 8416 + }, + { + "epoch": 0.10522763069076727, + "grad_norm": 1.4052587747573853, + "learning_rate": 1.999833670451545e-05, + "loss": 0.7729, + "step": 8418 + }, + { + "epoch": 0.1052526313157829, + "grad_norm": 2.5566749572753906, + "learning_rate": 1.999832075018056e-05, + "loss": 0.9385, + "step": 8420 + }, + { + "epoch": 0.10527763194079852, + "grad_norm": 3.456432342529297, + "learning_rate": 1.9998304719699872e-05, + "loss": 0.5818, + "step": 8422 + }, + { + "epoch": 0.10530263256581414, + "grad_norm": 2.7935094833374023, + "learning_rate": 1.9998288613073515e-05, + "loss": 0.5934, + "step": 8424 + }, + { + "epoch": 0.10532763319082977, + "grad_norm": 2.919609546661377, + "learning_rate": 1.99982724303016e-05, + "loss": 0.8014, + "step": 8426 + }, + { + "epoch": 0.10535263381584539, + "grad_norm": 8.096268653869629, + "learning_rate": 1.9998256171384256e-05, + "loss": 1.1282, + "step": 8428 + }, + { + "epoch": 0.10537763444086103, + "grad_norm": 1.8452014923095703, + "learning_rate": 1.999823983632161e-05, + "loss": 1.2084, + "step": 8430 + }, + { + "epoch": 0.10540263506587665, + "grad_norm": 9.954158782958984, + "learning_rate": 1.9998223425113782e-05, + "loss": 0.9186, + "step": 8432 + }, + { + "epoch": 0.10542763569089227, + "grad_norm": 3.537421226501465, + "learning_rate": 1.9998206937760897e-05, + "loss": 0.6932, + "step": 8434 + }, + { + "epoch": 0.1054526363159079, + "grad_norm": 0.794554591178894, + "learning_rate": 1.9998190374263084e-05, + "loss": 0.262, + "step": 8436 + }, + { + "epoch": 0.10547763694092352, + "grad_norm": 4.082449913024902, + "learning_rate": 1.9998173734620464e-05, + "loss": 0.5304, + "step": 8438 + }, + { + "epoch": 0.10550263756593915, + "grad_norm": 2.791154384613037, + "learning_rate": 1.9998157018833168e-05, + "loss": 1.6654, + "step": 8440 + }, + { + "epoch": 0.10552763819095477, + "grad_norm": 4.3875908851623535, + "learning_rate": 1.9998140226901323e-05, + "loss": 0.4361, + "step": 8442 + }, + { + "epoch": 0.10555263881597039, + "grad_norm": 1.9095979928970337, + "learning_rate": 1.9998123358825054e-05, + "loss": 2.2847, + "step": 8444 + }, + { + "epoch": 0.10557763944098603, + "grad_norm": 2.3409314155578613, + "learning_rate": 1.9998106414604495e-05, + "loss": 1.4355, + "step": 8446 + }, + { + "epoch": 0.10560264006600165, + "grad_norm": 2.5876736640930176, + "learning_rate": 1.9998089394239767e-05, + "loss": 1.8404, + "step": 8448 + }, + { + "epoch": 0.10562764069101728, + "grad_norm": 5.62089729309082, + "learning_rate": 1.9998072297731008e-05, + "loss": 1.7296, + "step": 8450 + }, + { + "epoch": 0.1056526413160329, + "grad_norm": 4.028035640716553, + "learning_rate": 1.9998055125078344e-05, + "loss": 0.3475, + "step": 8452 + }, + { + "epoch": 0.10567764194104852, + "grad_norm": 3.2572622299194336, + "learning_rate": 1.9998037876281905e-05, + "loss": 0.4787, + "step": 8454 + }, + { + "epoch": 0.10570264256606415, + "grad_norm": 2.3585617542266846, + "learning_rate": 1.999802055134182e-05, + "loss": 0.6589, + "step": 8456 + }, + { + "epoch": 0.10572764319107977, + "grad_norm": 5.8609442710876465, + "learning_rate": 1.999800315025823e-05, + "loss": 0.6963, + "step": 8458 + }, + { + "epoch": 0.1057526438160954, + "grad_norm": 1.7414828538894653, + "learning_rate": 1.9997985673031258e-05, + "loss": 0.1902, + "step": 8460 + }, + { + "epoch": 0.10577764444111103, + "grad_norm": 2.710845947265625, + "learning_rate": 1.9997968119661042e-05, + "loss": 0.2276, + "step": 8462 + }, + { + "epoch": 0.10580264506612665, + "grad_norm": 0.002081798855215311, + "learning_rate": 1.9997950490147714e-05, + "loss": 0.3664, + "step": 8464 + }, + { + "epoch": 0.10582764569114228, + "grad_norm": 3.6442065238952637, + "learning_rate": 1.999793278449141e-05, + "loss": 1.3573, + "step": 8466 + }, + { + "epoch": 0.1058526463161579, + "grad_norm": 2.589599132537842, + "learning_rate": 1.9997915002692262e-05, + "loss": 1.1417, + "step": 8468 + }, + { + "epoch": 0.10587764694117353, + "grad_norm": 0.8480360507965088, + "learning_rate": 1.9997897144750408e-05, + "loss": 0.3829, + "step": 8470 + }, + { + "epoch": 0.10590264756618915, + "grad_norm": 3.691373348236084, + "learning_rate": 1.9997879210665984e-05, + "loss": 0.8943, + "step": 8472 + }, + { + "epoch": 0.10592764819120477, + "grad_norm": 0.8289151787757874, + "learning_rate": 1.9997861200439126e-05, + "loss": 0.3952, + "step": 8474 + }, + { + "epoch": 0.10595264881622041, + "grad_norm": 0.0024922024458646774, + "learning_rate": 1.999784311406997e-05, + "loss": 0.5752, + "step": 8476 + }, + { + "epoch": 0.10597764944123603, + "grad_norm": 4.790506839752197, + "learning_rate": 1.9997824951558655e-05, + "loss": 0.9923, + "step": 8478 + }, + { + "epoch": 0.10600265006625166, + "grad_norm": 3.5096395015716553, + "learning_rate": 1.999780671290532e-05, + "loss": 1.5369, + "step": 8480 + }, + { + "epoch": 0.10602765069126728, + "grad_norm": 6.240821838378906, + "learning_rate": 1.9997788398110104e-05, + "loss": 2.0483, + "step": 8482 + }, + { + "epoch": 0.1060526513162829, + "grad_norm": 0.8773292303085327, + "learning_rate": 1.999777000717314e-05, + "loss": 1.0451, + "step": 8484 + }, + { + "epoch": 0.10607765194129853, + "grad_norm": 2.413527250289917, + "learning_rate": 1.999775154009458e-05, + "loss": 1.0396, + "step": 8486 + }, + { + "epoch": 0.10610265256631415, + "grad_norm": 9.830681800842285, + "learning_rate": 1.9997732996874554e-05, + "loss": 2.301, + "step": 8488 + }, + { + "epoch": 0.10612765319132979, + "grad_norm": 3.7226176261901855, + "learning_rate": 1.9997714377513208e-05, + "loss": 1.0193, + "step": 8490 + }, + { + "epoch": 0.10615265381634541, + "grad_norm": 2.0206422805786133, + "learning_rate": 1.999769568201068e-05, + "loss": 0.299, + "step": 8492 + }, + { + "epoch": 0.10617765444136103, + "grad_norm": 5.087479591369629, + "learning_rate": 1.999767691036712e-05, + "loss": 0.7604, + "step": 8494 + }, + { + "epoch": 0.10620265506637666, + "grad_norm": 2.6474151611328125, + "learning_rate": 1.9997658062582665e-05, + "loss": 0.5325, + "step": 8496 + }, + { + "epoch": 0.10622765569139228, + "grad_norm": 3.5508975982666016, + "learning_rate": 1.999763913865746e-05, + "loss": 0.9298, + "step": 8498 + }, + { + "epoch": 0.10625265631640791, + "grad_norm": 3.018458366394043, + "learning_rate": 1.999762013859165e-05, + "loss": 1.4292, + "step": 8500 + }, + { + "epoch": 0.10627765694142353, + "grad_norm": 3.016869068145752, + "learning_rate": 1.999760106238538e-05, + "loss": 1.2559, + "step": 8502 + }, + { + "epoch": 0.10630265756643915, + "grad_norm": 2.22532057762146, + "learning_rate": 1.999758191003879e-05, + "loss": 0.7471, + "step": 8504 + }, + { + "epoch": 0.10632765819145479, + "grad_norm": 2.945199966430664, + "learning_rate": 1.9997562681552033e-05, + "loss": 0.8031, + "step": 8506 + }, + { + "epoch": 0.10635265881647041, + "grad_norm": 3.0537047386169434, + "learning_rate": 1.999754337692525e-05, + "loss": 1.1252, + "step": 8508 + }, + { + "epoch": 0.10637765944148604, + "grad_norm": 4.216436862945557, + "learning_rate": 1.9997523996158592e-05, + "loss": 2.2999, + "step": 8510 + }, + { + "epoch": 0.10640266006650166, + "grad_norm": 2.4028117656707764, + "learning_rate": 1.9997504539252207e-05, + "loss": 1.83, + "step": 8512 + }, + { + "epoch": 0.10642766069151728, + "grad_norm": 3.0676450729370117, + "learning_rate": 1.999748500620624e-05, + "loss": 1.1081, + "step": 8514 + }, + { + "epoch": 0.10645266131653291, + "grad_norm": 3.257905960083008, + "learning_rate": 1.999746539702084e-05, + "loss": 0.9452, + "step": 8516 + }, + { + "epoch": 0.10647766194154853, + "grad_norm": 2.758225917816162, + "learning_rate": 1.9997445711696157e-05, + "loss": 0.9869, + "step": 8518 + }, + { + "epoch": 0.10650266256656417, + "grad_norm": 5.829425811767578, + "learning_rate": 1.9997425950232342e-05, + "loss": 0.7103, + "step": 8520 + }, + { + "epoch": 0.10652766319157979, + "grad_norm": 3.741913080215454, + "learning_rate": 1.9997406112629543e-05, + "loss": 1.3094, + "step": 8522 + }, + { + "epoch": 0.10655266381659541, + "grad_norm": 3.6776442527770996, + "learning_rate": 1.9997386198887914e-05, + "loss": 1.7637, + "step": 8524 + }, + { + "epoch": 0.10657766444161104, + "grad_norm": 2.8479771614074707, + "learning_rate": 1.9997366209007605e-05, + "loss": 0.8179, + "step": 8526 + }, + { + "epoch": 0.10660266506662666, + "grad_norm": 2.7646775245666504, + "learning_rate": 1.999734614298877e-05, + "loss": 1.5475, + "step": 8528 + }, + { + "epoch": 0.1066276656916423, + "grad_norm": 2.6992151737213135, + "learning_rate": 1.999732600083156e-05, + "loss": 0.3244, + "step": 8530 + }, + { + "epoch": 0.10665266631665792, + "grad_norm": 4.284727573394775, + "learning_rate": 1.9997305782536128e-05, + "loss": 1.7576, + "step": 8532 + }, + { + "epoch": 0.10667766694167354, + "grad_norm": 6.380565166473389, + "learning_rate": 1.9997285488102632e-05, + "loss": 2.1431, + "step": 8534 + }, + { + "epoch": 0.10670266756668917, + "grad_norm": 7.055816650390625, + "learning_rate": 1.9997265117531222e-05, + "loss": 0.1112, + "step": 8536 + }, + { + "epoch": 0.10672766819170479, + "grad_norm": 2.7500245571136475, + "learning_rate": 1.999724467082205e-05, + "loss": 1.4863, + "step": 8538 + }, + { + "epoch": 0.10675266881672042, + "grad_norm": 2.394408702850342, + "learning_rate": 1.999722414797528e-05, + "loss": 0.8546, + "step": 8540 + }, + { + "epoch": 0.10677766944173604, + "grad_norm": 8.73987102508545, + "learning_rate": 1.9997203548991066e-05, + "loss": 1.5813, + "step": 8542 + }, + { + "epoch": 0.10680267006675166, + "grad_norm": 3.7897300720214844, + "learning_rate": 1.9997182873869563e-05, + "loss": 0.2453, + "step": 8544 + }, + { + "epoch": 0.1068276706917673, + "grad_norm": 0.23947031795978546, + "learning_rate": 1.9997162122610928e-05, + "loss": 0.6517, + "step": 8546 + }, + { + "epoch": 0.10685267131678292, + "grad_norm": 2.4592108726501465, + "learning_rate": 1.999714129521532e-05, + "loss": 0.6234, + "step": 8548 + }, + { + "epoch": 0.10687767194179855, + "grad_norm": 3.183070421218872, + "learning_rate": 1.99971203916829e-05, + "loss": 0.7107, + "step": 8550 + }, + { + "epoch": 0.10690267256681417, + "grad_norm": 3.366215944290161, + "learning_rate": 1.999709941201382e-05, + "loss": 1.1409, + "step": 8552 + }, + { + "epoch": 0.10692767319182979, + "grad_norm": 3.866760015487671, + "learning_rate": 1.999707835620825e-05, + "loss": 1.1213, + "step": 8554 + }, + { + "epoch": 0.10695267381684542, + "grad_norm": 5.221292018890381, + "learning_rate": 1.999705722426634e-05, + "loss": 1.1615, + "step": 8556 + }, + { + "epoch": 0.10697767444186104, + "grad_norm": 4.623573303222656, + "learning_rate": 1.9997036016188257e-05, + "loss": 0.5796, + "step": 8558 + }, + { + "epoch": 0.10700267506687668, + "grad_norm": 3.0881688594818115, + "learning_rate": 1.9997014731974164e-05, + "loss": 2.0136, + "step": 8560 + }, + { + "epoch": 0.1070276756918923, + "grad_norm": 3.2684547901153564, + "learning_rate": 1.999699337162422e-05, + "loss": 1.8487, + "step": 8562 + }, + { + "epoch": 0.10705267631690792, + "grad_norm": 3.094639778137207, + "learning_rate": 1.9996971935138586e-05, + "loss": 2.1408, + "step": 8564 + }, + { + "epoch": 0.10707767694192355, + "grad_norm": 1.1600468158721924, + "learning_rate": 1.999695042251743e-05, + "loss": 1.3829, + "step": 8566 + }, + { + "epoch": 0.10710267756693917, + "grad_norm": 0.16798920929431915, + "learning_rate": 1.999692883376091e-05, + "loss": 0.8044, + "step": 8568 + }, + { + "epoch": 0.1071276781919548, + "grad_norm": 4.446827411651611, + "learning_rate": 1.99969071688692e-05, + "loss": 1.6891, + "step": 8570 + }, + { + "epoch": 0.10715267881697042, + "grad_norm": 2.014998435974121, + "learning_rate": 1.999688542784245e-05, + "loss": 0.3029, + "step": 8572 + }, + { + "epoch": 0.10717767944198604, + "grad_norm": 2.048074245452881, + "learning_rate": 1.999686361068084e-05, + "loss": 0.522, + "step": 8574 + }, + { + "epoch": 0.10720268006700168, + "grad_norm": 5.400364875793457, + "learning_rate": 1.9996841717384526e-05, + "loss": 0.3762, + "step": 8576 + }, + { + "epoch": 0.1072276806920173, + "grad_norm": 3.8848001956939697, + "learning_rate": 1.9996819747953684e-05, + "loss": 0.5726, + "step": 8578 + }, + { + "epoch": 0.10725268131703293, + "grad_norm": 4.546293258666992, + "learning_rate": 1.9996797702388475e-05, + "loss": 1.7531, + "step": 8580 + }, + { + "epoch": 0.10727768194204855, + "grad_norm": 0.4653739333152771, + "learning_rate": 1.999677558068907e-05, + "loss": 0.1131, + "step": 8582 + }, + { + "epoch": 0.10730268256706417, + "grad_norm": 4.352059841156006, + "learning_rate": 1.9996753382855632e-05, + "loss": 2.0917, + "step": 8584 + }, + { + "epoch": 0.1073276831920798, + "grad_norm": 4.9104905128479, + "learning_rate": 1.9996731108888338e-05, + "loss": 1.2953, + "step": 8586 + }, + { + "epoch": 0.10735268381709542, + "grad_norm": 4.02164888381958, + "learning_rate": 1.999670875878735e-05, + "loss": 1.7355, + "step": 8588 + }, + { + "epoch": 0.10737768444211106, + "grad_norm": 4.761839866638184, + "learning_rate": 1.9996686332552843e-05, + "loss": 0.6184, + "step": 8590 + }, + { + "epoch": 0.10740268506712668, + "grad_norm": 4.5915303230285645, + "learning_rate": 1.9996663830184988e-05, + "loss": 1.1107, + "step": 8592 + }, + { + "epoch": 0.1074276856921423, + "grad_norm": 2.8470985889434814, + "learning_rate": 1.999664125168395e-05, + "loss": 1.0621, + "step": 8594 + }, + { + "epoch": 0.10745268631715793, + "grad_norm": 2.637505292892456, + "learning_rate": 1.999661859704991e-05, + "loss": 1.1703, + "step": 8596 + }, + { + "epoch": 0.10747768694217355, + "grad_norm": 2.111703395843506, + "learning_rate": 1.999659586628304e-05, + "loss": 0.476, + "step": 8598 + }, + { + "epoch": 0.10750268756718918, + "grad_norm": 3.888979434967041, + "learning_rate": 1.9996573059383505e-05, + "loss": 0.9778, + "step": 8600 + }, + { + "epoch": 0.1075276881922048, + "grad_norm": 1.5864492654800415, + "learning_rate": 1.999655017635148e-05, + "loss": 0.556, + "step": 8602 + }, + { + "epoch": 0.10755268881722042, + "grad_norm": 5.241070747375488, + "learning_rate": 1.9996527217187147e-05, + "loss": 0.0533, + "step": 8604 + }, + { + "epoch": 0.10757768944223606, + "grad_norm": 4.8804545402526855, + "learning_rate": 1.9996504181890674e-05, + "loss": 0.6878, + "step": 8606 + }, + { + "epoch": 0.10760269006725168, + "grad_norm": 3.396899700164795, + "learning_rate": 1.999648107046224e-05, + "loss": 0.9482, + "step": 8608 + }, + { + "epoch": 0.10762769069226731, + "grad_norm": 2.1200978755950928, + "learning_rate": 1.9996457882902024e-05, + "loss": 0.1152, + "step": 8610 + }, + { + "epoch": 0.10765269131728293, + "grad_norm": 3.246553421020508, + "learning_rate": 1.9996434619210192e-05, + "loss": 1.369, + "step": 8612 + }, + { + "epoch": 0.10767769194229855, + "grad_norm": 2.5575878620147705, + "learning_rate": 1.999641127938693e-05, + "loss": 1.0628, + "step": 8614 + }, + { + "epoch": 0.10770269256731418, + "grad_norm": 3.19140362739563, + "learning_rate": 1.9996387863432415e-05, + "loss": 0.6308, + "step": 8616 + }, + { + "epoch": 0.1077276931923298, + "grad_norm": 2.7795515060424805, + "learning_rate": 1.9996364371346822e-05, + "loss": 0.6461, + "step": 8618 + }, + { + "epoch": 0.10775269381734544, + "grad_norm": 9.938822746276855, + "learning_rate": 1.999634080313033e-05, + "loss": 0.7899, + "step": 8620 + }, + { + "epoch": 0.10777769444236106, + "grad_norm": 4.073063373565674, + "learning_rate": 1.9996317158783123e-05, + "loss": 1.3431, + "step": 8622 + }, + { + "epoch": 0.10780269506737668, + "grad_norm": 4.835875988006592, + "learning_rate": 1.9996293438305375e-05, + "loss": 1.5816, + "step": 8624 + }, + { + "epoch": 0.10782769569239231, + "grad_norm": 2.9899916648864746, + "learning_rate": 1.9996269641697272e-05, + "loss": 0.6075, + "step": 8626 + }, + { + "epoch": 0.10785269631740793, + "grad_norm": 1.7533563375473022, + "learning_rate": 1.9996245768958993e-05, + "loss": 0.2842, + "step": 8628 + }, + { + "epoch": 0.10787769694242356, + "grad_norm": 3.5288338661193848, + "learning_rate": 1.999622182009072e-05, + "loss": 0.2054, + "step": 8630 + }, + { + "epoch": 0.10790269756743918, + "grad_norm": 3.227205514907837, + "learning_rate": 1.9996197795092635e-05, + "loss": 0.6713, + "step": 8632 + }, + { + "epoch": 0.1079276981924548, + "grad_norm": 6.580834865570068, + "learning_rate": 1.9996173693964918e-05, + "loss": 1.8414, + "step": 8634 + }, + { + "epoch": 0.10795269881747044, + "grad_norm": 5.621514797210693, + "learning_rate": 1.999614951670776e-05, + "loss": 2.293, + "step": 8636 + }, + { + "epoch": 0.10797769944248606, + "grad_norm": 3.0369913578033447, + "learning_rate": 1.999612526332134e-05, + "loss": 0.6768, + "step": 8638 + }, + { + "epoch": 0.10800270006750169, + "grad_norm": 0.007209342438727617, + "learning_rate": 1.999610093380584e-05, + "loss": 0.2662, + "step": 8640 + }, + { + "epoch": 0.10802770069251731, + "grad_norm": 0.0020075598731637, + "learning_rate": 1.9996076528161456e-05, + "loss": 0.5532, + "step": 8642 + }, + { + "epoch": 0.10805270131753293, + "grad_norm": 5.16105318069458, + "learning_rate": 1.999605204638836e-05, + "loss": 1.3367, + "step": 8644 + }, + { + "epoch": 0.10807770194254857, + "grad_norm": 2.9039459228515625, + "learning_rate": 1.9996027488486747e-05, + "loss": 1.6275, + "step": 8646 + }, + { + "epoch": 0.10810270256756419, + "grad_norm": 2.474109411239624, + "learning_rate": 1.9996002854456803e-05, + "loss": 1.1002, + "step": 8648 + }, + { + "epoch": 0.10812770319257982, + "grad_norm": 0.0022197405342012644, + "learning_rate": 1.9995978144298716e-05, + "loss": 0.9632, + "step": 8650 + }, + { + "epoch": 0.10815270381759544, + "grad_norm": 4.158550262451172, + "learning_rate": 1.9995953358012673e-05, + "loss": 1.9348, + "step": 8652 + }, + { + "epoch": 0.10817770444261106, + "grad_norm": 0.005202939733862877, + "learning_rate": 1.9995928495598858e-05, + "loss": 0.4514, + "step": 8654 + }, + { + "epoch": 0.10820270506762669, + "grad_norm": 5.286029815673828, + "learning_rate": 1.999590355705747e-05, + "loss": 2.0522, + "step": 8656 + }, + { + "epoch": 0.10822770569264231, + "grad_norm": 2.3851382732391357, + "learning_rate": 1.999587854238869e-05, + "loss": 0.635, + "step": 8658 + }, + { + "epoch": 0.10825270631765795, + "grad_norm": 1.7288336753845215, + "learning_rate": 1.9995853451592716e-05, + "loss": 0.7083, + "step": 8660 + }, + { + "epoch": 0.10827770694267357, + "grad_norm": 3.695441722869873, + "learning_rate": 1.9995828284669738e-05, + "loss": 1.6094, + "step": 8662 + }, + { + "epoch": 0.10830270756768919, + "grad_norm": 0.00156793266069144, + "learning_rate": 1.999580304161994e-05, + "loss": 0.362, + "step": 8664 + }, + { + "epoch": 0.10832770819270482, + "grad_norm": 0.005685946438461542, + "learning_rate": 1.9995777722443525e-05, + "loss": 0.0111, + "step": 8666 + }, + { + "epoch": 0.10835270881772044, + "grad_norm": 2.778538942337036, + "learning_rate": 1.9995752327140673e-05, + "loss": 0.1043, + "step": 8668 + }, + { + "epoch": 0.10837770944273607, + "grad_norm": 2.3356735706329346, + "learning_rate": 1.9995726855711593e-05, + "loss": 1.4055, + "step": 8670 + }, + { + "epoch": 0.10840271006775169, + "grad_norm": 0.45935383439064026, + "learning_rate": 1.9995701308156466e-05, + "loss": 0.4373, + "step": 8672 + }, + { + "epoch": 0.10842771069276731, + "grad_norm": 0.0028182710520923138, + "learning_rate": 1.999567568447549e-05, + "loss": 0.0001, + "step": 8674 + }, + { + "epoch": 0.10845271131778295, + "grad_norm": 3.999875783920288, + "learning_rate": 1.9995649984668865e-05, + "loss": 0.0711, + "step": 8676 + }, + { + "epoch": 0.10847771194279857, + "grad_norm": 1.6762810945510864, + "learning_rate": 1.999562420873678e-05, + "loss": 1.0186, + "step": 8678 + }, + { + "epoch": 0.1085027125678142, + "grad_norm": 5.840904712677002, + "learning_rate": 1.9995598356679438e-05, + "loss": 0.9681, + "step": 8680 + }, + { + "epoch": 0.10852771319282982, + "grad_norm": 1.5674853324890137, + "learning_rate": 1.999557242849703e-05, + "loss": 1.0322, + "step": 8682 + }, + { + "epoch": 0.10855271381784544, + "grad_norm": 2.949693441390991, + "learning_rate": 1.9995546424189757e-05, + "loss": 0.8384, + "step": 8684 + }, + { + "epoch": 0.10857771444286107, + "grad_norm": 7.59528923034668, + "learning_rate": 1.9995520343757815e-05, + "loss": 2.7716, + "step": 8686 + }, + { + "epoch": 0.10860271506787669, + "grad_norm": 1.6038944721221924, + "learning_rate": 1.9995494187201405e-05, + "loss": 0.6667, + "step": 8688 + }, + { + "epoch": 0.10862771569289233, + "grad_norm": 1.153906226158142, + "learning_rate": 1.9995467954520725e-05, + "loss": 0.9501, + "step": 8690 + }, + { + "epoch": 0.10865271631790795, + "grad_norm": 3.4876444339752197, + "learning_rate": 1.999544164571597e-05, + "loss": 1.563, + "step": 8692 + }, + { + "epoch": 0.10867771694292357, + "grad_norm": 3.3080520629882812, + "learning_rate": 1.9995415260787353e-05, + "loss": 1.1664, + "step": 8694 + }, + { + "epoch": 0.1087027175679392, + "grad_norm": 2.598587989807129, + "learning_rate": 1.999538879973506e-05, + "loss": 1.1198, + "step": 8696 + }, + { + "epoch": 0.10872771819295482, + "grad_norm": 3.7058961391448975, + "learning_rate": 1.99953622625593e-05, + "loss": 1.9317, + "step": 8698 + }, + { + "epoch": 0.10875271881797045, + "grad_norm": 4.583491802215576, + "learning_rate": 1.999533564926028e-05, + "loss": 0.6205, + "step": 8700 + }, + { + "epoch": 0.10877771944298607, + "grad_norm": 6.047037124633789, + "learning_rate": 1.9995308959838193e-05, + "loss": 1.6626, + "step": 8702 + }, + { + "epoch": 0.1088027200680017, + "grad_norm": 3.1131834983825684, + "learning_rate": 1.9995282194293243e-05, + "loss": 0.9316, + "step": 8704 + }, + { + "epoch": 0.10882772069301733, + "grad_norm": 3.036146879196167, + "learning_rate": 1.9995255352625645e-05, + "loss": 1.4324, + "step": 8706 + }, + { + "epoch": 0.10885272131803295, + "grad_norm": 2.304093837738037, + "learning_rate": 1.999522843483559e-05, + "loss": 0.7908, + "step": 8708 + }, + { + "epoch": 0.10887772194304858, + "grad_norm": 5.726008415222168, + "learning_rate": 1.999520144092329e-05, + "loss": 1.4611, + "step": 8710 + }, + { + "epoch": 0.1089027225680642, + "grad_norm": 3.7214152812957764, + "learning_rate": 1.999517437088895e-05, + "loss": 1.7504, + "step": 8712 + }, + { + "epoch": 0.10892772319307982, + "grad_norm": 2.975881576538086, + "learning_rate": 1.9995147224732775e-05, + "loss": 0.8479, + "step": 8714 + }, + { + "epoch": 0.10895272381809545, + "grad_norm": 4.774179458618164, + "learning_rate": 1.999512000245497e-05, + "loss": 1.074, + "step": 8716 + }, + { + "epoch": 0.10897772444311107, + "grad_norm": 2.1110284328460693, + "learning_rate": 1.999509270405575e-05, + "loss": 0.5281, + "step": 8718 + }, + { + "epoch": 0.10900272506812671, + "grad_norm": 4.113692760467529, + "learning_rate": 1.9995065329535314e-05, + "loss": 1.4558, + "step": 8720 + }, + { + "epoch": 0.10902772569314233, + "grad_norm": 3.737053394317627, + "learning_rate": 1.9995037878893873e-05, + "loss": 1.1015, + "step": 8722 + }, + { + "epoch": 0.10905272631815795, + "grad_norm": 2.8864996433258057, + "learning_rate": 1.999501035213164e-05, + "loss": 1.2681, + "step": 8724 + }, + { + "epoch": 0.10907772694317358, + "grad_norm": 0.0014118037652224302, + "learning_rate": 1.9994982749248816e-05, + "loss": 0.6252, + "step": 8726 + }, + { + "epoch": 0.1091027275681892, + "grad_norm": 4.276555061340332, + "learning_rate": 1.9994955070245622e-05, + "loss": 1.3134, + "step": 8728 + }, + { + "epoch": 0.10912772819320483, + "grad_norm": 5.237215995788574, + "learning_rate": 1.999492731512226e-05, + "loss": 1.1467, + "step": 8730 + }, + { + "epoch": 0.10915272881822045, + "grad_norm": 1.193457007408142, + "learning_rate": 1.999489948387895e-05, + "loss": 0.6911, + "step": 8732 + }, + { + "epoch": 0.10917772944323607, + "grad_norm": 5.782459735870361, + "learning_rate": 1.9994871576515894e-05, + "loss": 1.2432, + "step": 8734 + }, + { + "epoch": 0.10920273006825171, + "grad_norm": 9.543431282043457, + "learning_rate": 1.9994843593033314e-05, + "loss": 0.9163, + "step": 8736 + }, + { + "epoch": 0.10922773069326733, + "grad_norm": 3.6345882415771484, + "learning_rate": 1.9994815533431414e-05, + "loss": 1.8629, + "step": 8738 + }, + { + "epoch": 0.10925273131828296, + "grad_norm": 3.525160789489746, + "learning_rate": 1.9994787397710414e-05, + "loss": 0.9125, + "step": 8740 + }, + { + "epoch": 0.10927773194329858, + "grad_norm": 2.4440808296203613, + "learning_rate": 1.9994759185870526e-05, + "loss": 0.1525, + "step": 8742 + }, + { + "epoch": 0.1093027325683142, + "grad_norm": 2.177870750427246, + "learning_rate": 1.999473089791197e-05, + "loss": 0.1684, + "step": 8744 + }, + { + "epoch": 0.10932773319332983, + "grad_norm": 3.759044885635376, + "learning_rate": 1.999470253383495e-05, + "loss": 0.7393, + "step": 8746 + }, + { + "epoch": 0.10935273381834545, + "grad_norm": 6.04836893081665, + "learning_rate": 1.999467409363969e-05, + "loss": 0.7312, + "step": 8748 + }, + { + "epoch": 0.10937773444336109, + "grad_norm": 3.5254085063934326, + "learning_rate": 1.999464557732641e-05, + "loss": 0.8719, + "step": 8750 + }, + { + "epoch": 0.10940273506837671, + "grad_norm": 3.4763646125793457, + "learning_rate": 1.9994616984895322e-05, + "loss": 0.8077, + "step": 8752 + }, + { + "epoch": 0.10942773569339234, + "grad_norm": 5.2054572105407715, + "learning_rate": 1.999458831634664e-05, + "loss": 1.0025, + "step": 8754 + }, + { + "epoch": 0.10945273631840796, + "grad_norm": 2.395176410675049, + "learning_rate": 1.999455957168059e-05, + "loss": 1.0676, + "step": 8756 + }, + { + "epoch": 0.10947773694342358, + "grad_norm": 5.683766841888428, + "learning_rate": 1.9994530750897386e-05, + "loss": 0.5464, + "step": 8758 + }, + { + "epoch": 0.10950273756843922, + "grad_norm": 2.632056474685669, + "learning_rate": 1.999450185399725e-05, + "loss": 1.501, + "step": 8760 + }, + { + "epoch": 0.10952773819345483, + "grad_norm": 0.5714048743247986, + "learning_rate": 1.99944728809804e-05, + "loss": 1.2522, + "step": 8762 + }, + { + "epoch": 0.10955273881847047, + "grad_norm": 3.911452054977417, + "learning_rate": 1.9994443831847063e-05, + "loss": 1.2635, + "step": 8764 + }, + { + "epoch": 0.10957773944348609, + "grad_norm": 3.9767026901245117, + "learning_rate": 1.999441470659745e-05, + "loss": 1.3653, + "step": 8766 + }, + { + "epoch": 0.10960274006850171, + "grad_norm": 10.06637954711914, + "learning_rate": 1.9994385505231787e-05, + "loss": 1.1495, + "step": 8768 + }, + { + "epoch": 0.10962774069351734, + "grad_norm": 2.403651714324951, + "learning_rate": 1.9994356227750303e-05, + "loss": 0.3172, + "step": 8770 + }, + { + "epoch": 0.10965274131853296, + "grad_norm": 4.1441569328308105, + "learning_rate": 1.9994326874153208e-05, + "loss": 1.3298, + "step": 8772 + }, + { + "epoch": 0.1096777419435486, + "grad_norm": 1.449007272720337, + "learning_rate": 1.999429744444074e-05, + "loss": 0.1303, + "step": 8774 + }, + { + "epoch": 0.10970274256856422, + "grad_norm": 2.417625904083252, + "learning_rate": 1.9994267938613112e-05, + "loss": 0.7634, + "step": 8776 + }, + { + "epoch": 0.10972774319357984, + "grad_norm": 2.4635705947875977, + "learning_rate": 1.9994238356670554e-05, + "loss": 1.5414, + "step": 8778 + }, + { + "epoch": 0.10975274381859547, + "grad_norm": 2.214463472366333, + "learning_rate": 1.9994208698613287e-05, + "loss": 0.7458, + "step": 8780 + }, + { + "epoch": 0.10977774444361109, + "grad_norm": 17.282045364379883, + "learning_rate": 1.9994178964441545e-05, + "loss": 2.4017, + "step": 8782 + }, + { + "epoch": 0.10980274506862672, + "grad_norm": 2.669461250305176, + "learning_rate": 1.9994149154155544e-05, + "loss": 0.848, + "step": 8784 + }, + { + "epoch": 0.10982774569364234, + "grad_norm": 3.793773889541626, + "learning_rate": 1.999411926775552e-05, + "loss": 2.127, + "step": 8786 + }, + { + "epoch": 0.10985274631865796, + "grad_norm": 2.2078192234039307, + "learning_rate": 1.9994089305241698e-05, + "loss": 0.8111, + "step": 8788 + }, + { + "epoch": 0.1098777469436736, + "grad_norm": 2.182671546936035, + "learning_rate": 1.9994059266614303e-05, + "loss": 1.2031, + "step": 8790 + }, + { + "epoch": 0.10990274756868922, + "grad_norm": 6.895944118499756, + "learning_rate": 1.9994029151873568e-05, + "loss": 1.3861, + "step": 8792 + }, + { + "epoch": 0.10992774819370485, + "grad_norm": 9.347264289855957, + "learning_rate": 1.9993998961019712e-05, + "loss": 1.8046, + "step": 8794 + }, + { + "epoch": 0.10995274881872047, + "grad_norm": 4.554777145385742, + "learning_rate": 1.999396869405298e-05, + "loss": 1.3224, + "step": 8796 + }, + { + "epoch": 0.10997774944373609, + "grad_norm": 3.679142951965332, + "learning_rate": 1.99939383509736e-05, + "loss": 0.8572, + "step": 8798 + }, + { + "epoch": 0.11000275006875172, + "grad_norm": 4.53116512298584, + "learning_rate": 1.999390793178179e-05, + "loss": 0.4917, + "step": 8800 + }, + { + "epoch": 0.11002775069376734, + "grad_norm": 3.4802258014678955, + "learning_rate": 1.9993877436477794e-05, + "loss": 0.9893, + "step": 8802 + }, + { + "epoch": 0.11005275131878298, + "grad_norm": 8.912053108215332, + "learning_rate": 1.999384686506184e-05, + "loss": 0.9584, + "step": 8804 + }, + { + "epoch": 0.1100777519437986, + "grad_norm": 1.2125959396362305, + "learning_rate": 1.9993816217534167e-05, + "loss": 0.0802, + "step": 8806 + }, + { + "epoch": 0.11010275256881422, + "grad_norm": 3.473487377166748, + "learning_rate": 1.9993785493895e-05, + "loss": 0.7336, + "step": 8808 + }, + { + "epoch": 0.11012775319382985, + "grad_norm": 2.6688406467437744, + "learning_rate": 1.999375469414457e-05, + "loss": 1.3908, + "step": 8810 + }, + { + "epoch": 0.11015275381884547, + "grad_norm": 3.901442527770996, + "learning_rate": 1.9993723818283125e-05, + "loss": 1.7638, + "step": 8812 + }, + { + "epoch": 0.1101777544438611, + "grad_norm": 5.368589401245117, + "learning_rate": 1.9993692866310888e-05, + "loss": 1.5308, + "step": 8814 + }, + { + "epoch": 0.11020275506887672, + "grad_norm": 3.697021245956421, + "learning_rate": 1.99936618382281e-05, + "loss": 1.4946, + "step": 8816 + }, + { + "epoch": 0.11022775569389234, + "grad_norm": 3.531233549118042, + "learning_rate": 1.9993630734034998e-05, + "loss": 1.674, + "step": 8818 + }, + { + "epoch": 0.11025275631890798, + "grad_norm": 3.576643228530884, + "learning_rate": 1.9993599553731816e-05, + "loss": 1.4474, + "step": 8820 + }, + { + "epoch": 0.1102777569439236, + "grad_norm": 2.9472451210021973, + "learning_rate": 1.9993568297318794e-05, + "loss": 0.9153, + "step": 8822 + }, + { + "epoch": 0.11030275756893923, + "grad_norm": 3.448808193206787, + "learning_rate": 1.999353696479617e-05, + "loss": 1.3814, + "step": 8824 + }, + { + "epoch": 0.11032775819395485, + "grad_norm": 7.4759345054626465, + "learning_rate": 1.9993505556164182e-05, + "loss": 2.2876, + "step": 8826 + }, + { + "epoch": 0.11035275881897047, + "grad_norm": 3.677600145339966, + "learning_rate": 1.9993474071423068e-05, + "loss": 1.7793, + "step": 8828 + }, + { + "epoch": 0.1103777594439861, + "grad_norm": 0.004939486738294363, + "learning_rate": 1.999344251057307e-05, + "loss": 1.0266, + "step": 8830 + }, + { + "epoch": 0.11040276006900172, + "grad_norm": 4.812320232391357, + "learning_rate": 1.9993410873614425e-05, + "loss": 0.4714, + "step": 8832 + }, + { + "epoch": 0.11042776069401736, + "grad_norm": 0.003563123755156994, + "learning_rate": 1.9993379160547374e-05, + "loss": 0.7995, + "step": 8834 + }, + { + "epoch": 0.11045276131903298, + "grad_norm": 2.305236577987671, + "learning_rate": 1.9993347371372166e-05, + "loss": 0.7643, + "step": 8836 + }, + { + "epoch": 0.1104777619440486, + "grad_norm": 0.2873709201812744, + "learning_rate": 1.999331550608903e-05, + "loss": 1.0463, + "step": 8838 + }, + { + "epoch": 0.11050276256906423, + "grad_norm": 2.217292070388794, + "learning_rate": 1.9993283564698225e-05, + "loss": 1.3199, + "step": 8840 + }, + { + "epoch": 0.11052776319407985, + "grad_norm": 3.5823843479156494, + "learning_rate": 1.999325154719998e-05, + "loss": 1.134, + "step": 8842 + }, + { + "epoch": 0.11055276381909548, + "grad_norm": 2.331099271774292, + "learning_rate": 1.9993219453594545e-05, + "loss": 0.9005, + "step": 8844 + }, + { + "epoch": 0.1105777644441111, + "grad_norm": 0.7522176504135132, + "learning_rate": 1.999318728388217e-05, + "loss": 1.0862, + "step": 8846 + }, + { + "epoch": 0.11060276506912672, + "grad_norm": 6.040470123291016, + "learning_rate": 1.9993155038063086e-05, + "loss": 1.7365, + "step": 8848 + }, + { + "epoch": 0.11062776569414236, + "grad_norm": 2.2471604347229004, + "learning_rate": 1.9993122716137548e-05, + "loss": 0.5417, + "step": 8850 + }, + { + "epoch": 0.11065276631915798, + "grad_norm": 2.6239542961120605, + "learning_rate": 1.99930903181058e-05, + "loss": 0.5902, + "step": 8852 + }, + { + "epoch": 0.11067776694417361, + "grad_norm": 2.007296562194824, + "learning_rate": 1.999305784396809e-05, + "loss": 1.5409, + "step": 8854 + }, + { + "epoch": 0.11070276756918923, + "grad_norm": 4.0742058753967285, + "learning_rate": 1.9993025293724665e-05, + "loss": 1.1549, + "step": 8856 + }, + { + "epoch": 0.11072776819420485, + "grad_norm": 3.005807399749756, + "learning_rate": 1.9992992667375773e-05, + "loss": 0.5864, + "step": 8858 + }, + { + "epoch": 0.11075276881922048, + "grad_norm": 1.7599133253097534, + "learning_rate": 1.9992959964921662e-05, + "loss": 1.1846, + "step": 8860 + }, + { + "epoch": 0.1107777694442361, + "grad_norm": 3.7507569789886475, + "learning_rate": 1.9992927186362583e-05, + "loss": 1.2111, + "step": 8862 + }, + { + "epoch": 0.11080277006925174, + "grad_norm": 1.9599560499191284, + "learning_rate": 1.999289433169878e-05, + "loss": 0.57, + "step": 8864 + }, + { + "epoch": 0.11082777069426736, + "grad_norm": 6.148538112640381, + "learning_rate": 1.999286140093051e-05, + "loss": 1.7554, + "step": 8866 + }, + { + "epoch": 0.11085277131928298, + "grad_norm": 0.004336345940828323, + "learning_rate": 1.999282839405802e-05, + "loss": 0.1198, + "step": 8868 + }, + { + "epoch": 0.11087777194429861, + "grad_norm": 2.016657829284668, + "learning_rate": 1.999279531108156e-05, + "loss": 0.8502, + "step": 8870 + }, + { + "epoch": 0.11090277256931423, + "grad_norm": 2.012422800064087, + "learning_rate": 1.9992762152001383e-05, + "loss": 0.3997, + "step": 8872 + }, + { + "epoch": 0.11092777319432987, + "grad_norm": 1.0132343769073486, + "learning_rate": 1.9992728916817748e-05, + "loss": 0.9663, + "step": 8874 + }, + { + "epoch": 0.11095277381934548, + "grad_norm": 1.6556780338287354, + "learning_rate": 1.99926956055309e-05, + "loss": 0.5182, + "step": 8876 + }, + { + "epoch": 0.1109777744443611, + "grad_norm": 0.8872689604759216, + "learning_rate": 1.9992662218141093e-05, + "loss": 0.0659, + "step": 8878 + }, + { + "epoch": 0.11100277506937674, + "grad_norm": 3.175771474838257, + "learning_rate": 1.9992628754648584e-05, + "loss": 1.3751, + "step": 8880 + }, + { + "epoch": 0.11102777569439236, + "grad_norm": 2.834428310394287, + "learning_rate": 1.999259521505363e-05, + "loss": 1.5515, + "step": 8882 + }, + { + "epoch": 0.11105277631940799, + "grad_norm": 1.9867111444473267, + "learning_rate": 1.9992561599356484e-05, + "loss": 0.1728, + "step": 8884 + }, + { + "epoch": 0.11107777694442361, + "grad_norm": 5.342217922210693, + "learning_rate": 1.99925279075574e-05, + "loss": 2.021, + "step": 8886 + }, + { + "epoch": 0.11110277756943923, + "grad_norm": 2.2776339054107666, + "learning_rate": 1.9992494139656636e-05, + "loss": 0.9854, + "step": 8888 + }, + { + "epoch": 0.11112777819445487, + "grad_norm": 4.839662075042725, + "learning_rate": 1.999246029565445e-05, + "loss": 2.3044, + "step": 8890 + }, + { + "epoch": 0.11115277881947049, + "grad_norm": 0.05872546508908272, + "learning_rate": 1.9992426375551105e-05, + "loss": 0.6225, + "step": 8892 + }, + { + "epoch": 0.11117777944448612, + "grad_norm": 3.4061532020568848, + "learning_rate": 1.9992392379346846e-05, + "loss": 0.3866, + "step": 8894 + }, + { + "epoch": 0.11120278006950174, + "grad_norm": 4.071114540100098, + "learning_rate": 1.9992358307041942e-05, + "loss": 0.2022, + "step": 8896 + }, + { + "epoch": 0.11122778069451736, + "grad_norm": 2.126877784729004, + "learning_rate": 1.999232415863665e-05, + "loss": 1.2573, + "step": 8898 + }, + { + "epoch": 0.11125278131953299, + "grad_norm": 1.769298791885376, + "learning_rate": 1.9992289934131236e-05, + "loss": 0.7951, + "step": 8900 + }, + { + "epoch": 0.11127778194454861, + "grad_norm": 1.0765987634658813, + "learning_rate": 1.999225563352595e-05, + "loss": 0.4787, + "step": 8902 + }, + { + "epoch": 0.11130278256956425, + "grad_norm": 3.7063870429992676, + "learning_rate": 1.9992221256821056e-05, + "loss": 1.6024, + "step": 8904 + }, + { + "epoch": 0.11132778319457987, + "grad_norm": 9.251067161560059, + "learning_rate": 1.9992186804016823e-05, + "loss": 1.1777, + "step": 8906 + }, + { + "epoch": 0.11135278381959549, + "grad_norm": 5.1279168128967285, + "learning_rate": 1.9992152275113503e-05, + "loss": 1.273, + "step": 8908 + }, + { + "epoch": 0.11137778444461112, + "grad_norm": 2.8209023475646973, + "learning_rate": 1.9992117670111367e-05, + "loss": 1.8064, + "step": 8910 + }, + { + "epoch": 0.11140278506962674, + "grad_norm": 2.2032527923583984, + "learning_rate": 1.9992082989010674e-05, + "loss": 0.673, + "step": 8912 + }, + { + "epoch": 0.11142778569464237, + "grad_norm": 9.948318481445312, + "learning_rate": 1.9992048231811693e-05, + "loss": 1.0457, + "step": 8914 + }, + { + "epoch": 0.11145278631965799, + "grad_norm": 0.005041980184614658, + "learning_rate": 1.9992013398514683e-05, + "loss": 0.5535, + "step": 8916 + }, + { + "epoch": 0.11147778694467361, + "grad_norm": 2.0648488998413086, + "learning_rate": 1.9991978489119915e-05, + "loss": 0.95, + "step": 8918 + }, + { + "epoch": 0.11150278756968925, + "grad_norm": 2.143698215484619, + "learning_rate": 1.9991943503627652e-05, + "loss": 0.1735, + "step": 8920 + }, + { + "epoch": 0.11152778819470487, + "grad_norm": 0.24493572115898132, + "learning_rate": 1.9991908442038156e-05, + "loss": 0.531, + "step": 8922 + }, + { + "epoch": 0.1115527888197205, + "grad_norm": 0.0019284107256680727, + "learning_rate": 1.99918733043517e-05, + "loss": 0.7559, + "step": 8924 + }, + { + "epoch": 0.11157778944473612, + "grad_norm": 1.9223113059997559, + "learning_rate": 1.9991838090568555e-05, + "loss": 0.8277, + "step": 8926 + }, + { + "epoch": 0.11160279006975174, + "grad_norm": 4.112198352813721, + "learning_rate": 1.999180280068898e-05, + "loss": 1.3973, + "step": 8928 + }, + { + "epoch": 0.11162779069476737, + "grad_norm": 4.042316436767578, + "learning_rate": 1.9991767434713247e-05, + "loss": 1.1776, + "step": 8930 + }, + { + "epoch": 0.11165279131978299, + "grad_norm": 3.423574447631836, + "learning_rate": 1.9991731992641626e-05, + "loss": 1.5733, + "step": 8932 + }, + { + "epoch": 0.11167779194479863, + "grad_norm": 4.532438278198242, + "learning_rate": 1.999169647447439e-05, + "loss": 1.5424, + "step": 8934 + }, + { + "epoch": 0.11170279256981425, + "grad_norm": 1.9837859869003296, + "learning_rate": 1.99916608802118e-05, + "loss": 1.4415, + "step": 8936 + }, + { + "epoch": 0.11172779319482987, + "grad_norm": 3.482279062271118, + "learning_rate": 1.999162520985414e-05, + "loss": 1.1566, + "step": 8938 + }, + { + "epoch": 0.1117527938198455, + "grad_norm": 1.6522834300994873, + "learning_rate": 1.9991589463401673e-05, + "loss": 0.994, + "step": 8940 + }, + { + "epoch": 0.11177779444486112, + "grad_norm": 2.370157241821289, + "learning_rate": 1.9991553640854674e-05, + "loss": 0.8754, + "step": 8942 + }, + { + "epoch": 0.11180279506987675, + "grad_norm": 0.20740152895450592, + "learning_rate": 1.9991517742213413e-05, + "loss": 0.0221, + "step": 8944 + }, + { + "epoch": 0.11182779569489237, + "grad_norm": 4.340074062347412, + "learning_rate": 1.999148176747817e-05, + "loss": 1.5099, + "step": 8946 + }, + { + "epoch": 0.111852796319908, + "grad_norm": 2.0175111293792725, + "learning_rate": 1.9991445716649213e-05, + "loss": 0.6911, + "step": 8948 + }, + { + "epoch": 0.11187779694492363, + "grad_norm": 3.4567923545837402, + "learning_rate": 1.999140958972682e-05, + "loss": 1.2169, + "step": 8950 + }, + { + "epoch": 0.11190279756993925, + "grad_norm": 0.7542278170585632, + "learning_rate": 1.999137338671126e-05, + "loss": 1.0435, + "step": 8952 + }, + { + "epoch": 0.11192779819495488, + "grad_norm": 3.8062288761138916, + "learning_rate": 1.9991337107602818e-05, + "loss": 1.739, + "step": 8954 + }, + { + "epoch": 0.1119527988199705, + "grad_norm": 3.589355230331421, + "learning_rate": 1.9991300752401762e-05, + "loss": 1.0466, + "step": 8956 + }, + { + "epoch": 0.11197779944498612, + "grad_norm": 4.562684535980225, + "learning_rate": 1.9991264321108375e-05, + "loss": 2.946, + "step": 8958 + }, + { + "epoch": 0.11200280007000175, + "grad_norm": 3.4159393310546875, + "learning_rate": 1.9991227813722933e-05, + "loss": 0.2421, + "step": 8960 + }, + { + "epoch": 0.11202780069501737, + "grad_norm": 3.660741090774536, + "learning_rate": 1.9991191230245708e-05, + "loss": 2.0596, + "step": 8962 + }, + { + "epoch": 0.11205280132003301, + "grad_norm": 3.1375091075897217, + "learning_rate": 1.9991154570676986e-05, + "loss": 0.7217, + "step": 8964 + }, + { + "epoch": 0.11207780194504863, + "grad_norm": 5.534066200256348, + "learning_rate": 1.9991117835017045e-05, + "loss": 1.8741, + "step": 8966 + }, + { + "epoch": 0.11210280257006425, + "grad_norm": 3.762519598007202, + "learning_rate": 1.9991081023266163e-05, + "loss": 0.9557, + "step": 8968 + }, + { + "epoch": 0.11212780319507988, + "grad_norm": 3.0593602657318115, + "learning_rate": 1.9991044135424616e-05, + "loss": 1.4869, + "step": 8970 + }, + { + "epoch": 0.1121528038200955, + "grad_norm": 5.572475910186768, + "learning_rate": 1.9991007171492694e-05, + "loss": 1.2011, + "step": 8972 + }, + { + "epoch": 0.11217780444511113, + "grad_norm": 5.696146488189697, + "learning_rate": 1.9990970131470674e-05, + "loss": 1.034, + "step": 8974 + }, + { + "epoch": 0.11220280507012675, + "grad_norm": 4.4723286628723145, + "learning_rate": 1.999093301535884e-05, + "loss": 1.3824, + "step": 8976 + }, + { + "epoch": 0.11222780569514237, + "grad_norm": 4.126735210418701, + "learning_rate": 1.999089582315747e-05, + "loss": 0.3629, + "step": 8978 + }, + { + "epoch": 0.11225280632015801, + "grad_norm": 4.393233299255371, + "learning_rate": 1.9990858554866852e-05, + "loss": 2.4125, + "step": 8980 + }, + { + "epoch": 0.11227780694517363, + "grad_norm": 8.296619415283203, + "learning_rate": 1.9990821210487266e-05, + "loss": 1.9947, + "step": 8982 + }, + { + "epoch": 0.11230280757018926, + "grad_norm": 3.126132011413574, + "learning_rate": 1.9990783790019002e-05, + "loss": 1.3837, + "step": 8984 + }, + { + "epoch": 0.11232780819520488, + "grad_norm": 0.2903481721878052, + "learning_rate": 1.999074629346234e-05, + "loss": 0.0247, + "step": 8986 + }, + { + "epoch": 0.1123528088202205, + "grad_norm": 5.165947437286377, + "learning_rate": 1.9990708720817567e-05, + "loss": 1.178, + "step": 8988 + }, + { + "epoch": 0.11237780944523613, + "grad_norm": 2.346177339553833, + "learning_rate": 1.999067107208497e-05, + "loss": 0.8067, + "step": 8990 + }, + { + "epoch": 0.11240281007025175, + "grad_norm": 2.6481316089630127, + "learning_rate": 1.999063334726483e-05, + "loss": 0.6684, + "step": 8992 + }, + { + "epoch": 0.11242781069526739, + "grad_norm": 4.204620361328125, + "learning_rate": 1.9990595546357445e-05, + "loss": 0.8392, + "step": 8994 + }, + { + "epoch": 0.11245281132028301, + "grad_norm": 2.343864917755127, + "learning_rate": 1.9990557669363094e-05, + "loss": 1.4258, + "step": 8996 + }, + { + "epoch": 0.11247781194529863, + "grad_norm": 4.504353046417236, + "learning_rate": 1.999051971628207e-05, + "loss": 1.8779, + "step": 8998 + }, + { + "epoch": 0.11250281257031426, + "grad_norm": 4.006065368652344, + "learning_rate": 1.999048168711466e-05, + "loss": 2.4357, + "step": 9000 + }, + { + "epoch": 0.11252781319532988, + "grad_norm": 3.123324394226074, + "learning_rate": 1.9990443581861157e-05, + "loss": 1.1696, + "step": 9002 + }, + { + "epoch": 0.11255281382034552, + "grad_norm": 3.0343399047851562, + "learning_rate": 1.9990405400521844e-05, + "loss": 1.1698, + "step": 9004 + }, + { + "epoch": 0.11257781444536114, + "grad_norm": 2.3884243965148926, + "learning_rate": 1.9990367143097018e-05, + "loss": 0.332, + "step": 9006 + }, + { + "epoch": 0.11260281507037675, + "grad_norm": 3.519104480743408, + "learning_rate": 1.999032880958697e-05, + "loss": 1.2494, + "step": 9008 + }, + { + "epoch": 0.11262781569539239, + "grad_norm": 4.300308704376221, + "learning_rate": 1.999029039999199e-05, + "loss": 1.1662, + "step": 9010 + }, + { + "epoch": 0.11265281632040801, + "grad_norm": 4.577869415283203, + "learning_rate": 1.9990251914312368e-05, + "loss": 1.5172, + "step": 9012 + }, + { + "epoch": 0.11267781694542364, + "grad_norm": 4.5808186531066895, + "learning_rate": 1.9990213352548404e-05, + "loss": 1.97, + "step": 9014 + }, + { + "epoch": 0.11270281757043926, + "grad_norm": 9.011072158813477, + "learning_rate": 1.9990174714700386e-05, + "loss": 1.5974, + "step": 9016 + }, + { + "epoch": 0.11272781819545488, + "grad_norm": 0.5166740417480469, + "learning_rate": 1.999013600076861e-05, + "loss": 2.1462, + "step": 9018 + }, + { + "epoch": 0.11275281882047052, + "grad_norm": 4.442983627319336, + "learning_rate": 1.9990097210753373e-05, + "loss": 2.0689, + "step": 9020 + }, + { + "epoch": 0.11277781944548614, + "grad_norm": 10.657615661621094, + "learning_rate": 1.9990058344654967e-05, + "loss": 1.4391, + "step": 9022 + }, + { + "epoch": 0.11280282007050177, + "grad_norm": 3.3505613803863525, + "learning_rate": 1.9990019402473687e-05, + "loss": 2.0694, + "step": 9024 + }, + { + "epoch": 0.11282782069551739, + "grad_norm": 0.009295869618654251, + "learning_rate": 1.9989980384209837e-05, + "loss": 1.3489, + "step": 9026 + }, + { + "epoch": 0.11285282132053301, + "grad_norm": 5.352908134460449, + "learning_rate": 1.9989941289863703e-05, + "loss": 0.5912, + "step": 9028 + }, + { + "epoch": 0.11287782194554864, + "grad_norm": 0.19410043954849243, + "learning_rate": 1.9989902119435595e-05, + "loss": 0.8329, + "step": 9030 + }, + { + "epoch": 0.11290282257056426, + "grad_norm": 2.503493547439575, + "learning_rate": 1.9989862872925802e-05, + "loss": 0.5982, + "step": 9032 + }, + { + "epoch": 0.1129278231955799, + "grad_norm": 2.386075019836426, + "learning_rate": 1.9989823550334625e-05, + "loss": 0.4206, + "step": 9034 + }, + { + "epoch": 0.11295282382059552, + "grad_norm": 3.044243097305298, + "learning_rate": 1.9989784151662366e-05, + "loss": 1.2318, + "step": 9036 + }, + { + "epoch": 0.11297782444561114, + "grad_norm": 3.0348312854766846, + "learning_rate": 1.9989744676909324e-05, + "loss": 0.7528, + "step": 9038 + }, + { + "epoch": 0.11300282507062677, + "grad_norm": 2.9402856826782227, + "learning_rate": 1.99897051260758e-05, + "loss": 1.0147, + "step": 9040 + }, + { + "epoch": 0.11302782569564239, + "grad_norm": 2.2932376861572266, + "learning_rate": 1.9989665499162093e-05, + "loss": 0.5557, + "step": 9042 + }, + { + "epoch": 0.11305282632065802, + "grad_norm": 4.314813613891602, + "learning_rate": 1.9989625796168504e-05, + "loss": 1.6852, + "step": 9044 + }, + { + "epoch": 0.11307782694567364, + "grad_norm": 2.2352027893066406, + "learning_rate": 1.998958601709534e-05, + "loss": 0.7787, + "step": 9046 + }, + { + "epoch": 0.11310282757068926, + "grad_norm": 3.5336341857910156, + "learning_rate": 1.9989546161942903e-05, + "loss": 0.6528, + "step": 9048 + }, + { + "epoch": 0.1131278281957049, + "grad_norm": 4.278752326965332, + "learning_rate": 1.9989506230711492e-05, + "loss": 0.7127, + "step": 9050 + }, + { + "epoch": 0.11315282882072052, + "grad_norm": 9.559503555297852, + "learning_rate": 1.9989466223401415e-05, + "loss": 2.369, + "step": 9052 + }, + { + "epoch": 0.11317782944573615, + "grad_norm": 1.0186767578125, + "learning_rate": 1.998942614001298e-05, + "loss": 0.2151, + "step": 9054 + }, + { + "epoch": 0.11320283007075177, + "grad_norm": 2.7055459022521973, + "learning_rate": 1.9989385980546486e-05, + "loss": 0.6172, + "step": 9056 + }, + { + "epoch": 0.11322783069576739, + "grad_norm": 2.764439821243286, + "learning_rate": 1.9989345745002237e-05, + "loss": 0.5574, + "step": 9058 + }, + { + "epoch": 0.11325283132078302, + "grad_norm": 2.5554323196411133, + "learning_rate": 1.9989305433380547e-05, + "loss": 1.5204, + "step": 9060 + }, + { + "epoch": 0.11327783194579864, + "grad_norm": 2.8379671573638916, + "learning_rate": 1.998926504568172e-05, + "loss": 1.4284, + "step": 9062 + }, + { + "epoch": 0.11330283257081428, + "grad_norm": 2.4912352561950684, + "learning_rate": 1.998922458190606e-05, + "loss": 0.2717, + "step": 9064 + }, + { + "epoch": 0.1133278331958299, + "grad_norm": 3.0484540462493896, + "learning_rate": 1.9989184042053883e-05, + "loss": 0.969, + "step": 9066 + }, + { + "epoch": 0.11335283382084552, + "grad_norm": 1.6548864841461182, + "learning_rate": 1.9989143426125487e-05, + "loss": 0.518, + "step": 9068 + }, + { + "epoch": 0.11337783444586115, + "grad_norm": 2.678161382675171, + "learning_rate": 1.998910273412119e-05, + "loss": 0.7755, + "step": 9070 + }, + { + "epoch": 0.11340283507087677, + "grad_norm": 6.269943714141846, + "learning_rate": 1.9989061966041298e-05, + "loss": 1.9161, + "step": 9072 + }, + { + "epoch": 0.1134278356958924, + "grad_norm": 7.5835161209106445, + "learning_rate": 1.9989021121886125e-05, + "loss": 1.5215, + "step": 9074 + }, + { + "epoch": 0.11345283632090802, + "grad_norm": 4.270866870880127, + "learning_rate": 1.998898020165598e-05, + "loss": 1.1841, + "step": 9076 + }, + { + "epoch": 0.11347783694592364, + "grad_norm": 0.03739044815301895, + "learning_rate": 1.9988939205351172e-05, + "loss": 0.692, + "step": 9078 + }, + { + "epoch": 0.11350283757093928, + "grad_norm": 3.5624918937683105, + "learning_rate": 1.998889813297202e-05, + "loss": 1.4776, + "step": 9080 + }, + { + "epoch": 0.1135278381959549, + "grad_norm": 3.7634613513946533, + "learning_rate": 1.9988856984518826e-05, + "loss": 1.4178, + "step": 9082 + }, + { + "epoch": 0.11355283882097053, + "grad_norm": 0.13620488345623016, + "learning_rate": 1.998881575999191e-05, + "loss": 0.0527, + "step": 9084 + }, + { + "epoch": 0.11357783944598615, + "grad_norm": 3.4293084144592285, + "learning_rate": 1.9988774459391588e-05, + "loss": 1.3603, + "step": 9086 + }, + { + "epoch": 0.11360284007100177, + "grad_norm": 3.582292318344116, + "learning_rate": 1.9988733082718176e-05, + "loss": 0.708, + "step": 9088 + }, + { + "epoch": 0.1136278406960174, + "grad_norm": 6.83648681640625, + "learning_rate": 1.9988691629971982e-05, + "loss": 1.2881, + "step": 9090 + }, + { + "epoch": 0.11365284132103302, + "grad_norm": 3.3722710609436035, + "learning_rate": 1.9988650101153326e-05, + "loss": 1.5618, + "step": 9092 + }, + { + "epoch": 0.11367784194604866, + "grad_norm": 3.621323585510254, + "learning_rate": 1.998860849626252e-05, + "loss": 0.6965, + "step": 9094 + }, + { + "epoch": 0.11370284257106428, + "grad_norm": 0.038942109793424606, + "learning_rate": 1.9988566815299885e-05, + "loss": 0.0005, + "step": 9096 + }, + { + "epoch": 0.1137278431960799, + "grad_norm": 1.6982028484344482, + "learning_rate": 1.998852505826574e-05, + "loss": 0.1954, + "step": 9098 + }, + { + "epoch": 0.11375284382109553, + "grad_norm": 4.749936103820801, + "learning_rate": 1.99884832251604e-05, + "loss": 1.0907, + "step": 9100 + }, + { + "epoch": 0.11377784444611115, + "grad_norm": 21.317106246948242, + "learning_rate": 1.9988441315984186e-05, + "loss": 1.6389, + "step": 9102 + }, + { + "epoch": 0.11380284507112678, + "grad_norm": 1.9805988073349, + "learning_rate": 1.9988399330737415e-05, + "loss": 0.4073, + "step": 9104 + }, + { + "epoch": 0.1138278456961424, + "grad_norm": 2.987569808959961, + "learning_rate": 1.9988357269420403e-05, + "loss": 0.6139, + "step": 9106 + }, + { + "epoch": 0.11385284632115802, + "grad_norm": 2.65421462059021, + "learning_rate": 1.998831513203348e-05, + "loss": 1.3996, + "step": 9108 + }, + { + "epoch": 0.11387784694617366, + "grad_norm": 2.223958730697632, + "learning_rate": 1.9988272918576958e-05, + "loss": 0.5138, + "step": 9110 + }, + { + "epoch": 0.11390284757118928, + "grad_norm": 6.821513652801514, + "learning_rate": 1.998823062905116e-05, + "loss": 2.1285, + "step": 9112 + }, + { + "epoch": 0.11392784819620491, + "grad_norm": 5.168257713317871, + "learning_rate": 1.9988188263456415e-05, + "loss": 1.2391, + "step": 9114 + }, + { + "epoch": 0.11395284882122053, + "grad_norm": 6.04434061050415, + "learning_rate": 1.9988145821793038e-05, + "loss": 1.0582, + "step": 9116 + }, + { + "epoch": 0.11397784944623615, + "grad_norm": 1.2991528511047363, + "learning_rate": 1.9988103304061356e-05, + "loss": 0.1738, + "step": 9118 + }, + { + "epoch": 0.11400285007125179, + "grad_norm": 0.012947358191013336, + "learning_rate": 1.998806071026169e-05, + "loss": 0.0003, + "step": 9120 + }, + { + "epoch": 0.1140278506962674, + "grad_norm": 3.158735513687134, + "learning_rate": 1.998801804039437e-05, + "loss": 1.2111, + "step": 9122 + }, + { + "epoch": 0.11405285132128304, + "grad_norm": 4.413255214691162, + "learning_rate": 1.998797529445971e-05, + "loss": 0.4065, + "step": 9124 + }, + { + "epoch": 0.11407785194629866, + "grad_norm": 3.0372135639190674, + "learning_rate": 1.998793247245805e-05, + "loss": 1.564, + "step": 9126 + }, + { + "epoch": 0.11410285257131428, + "grad_norm": 2.873039960861206, + "learning_rate": 1.9987889574389703e-05, + "loss": 0.2304, + "step": 9128 + }, + { + "epoch": 0.11412785319632991, + "grad_norm": 0.1558179259300232, + "learning_rate": 1.9987846600255002e-05, + "loss": 0.4016, + "step": 9130 + }, + { + "epoch": 0.11415285382134553, + "grad_norm": 2.9986183643341064, + "learning_rate": 1.9987803550054275e-05, + "loss": 1.6368, + "step": 9132 + }, + { + "epoch": 0.11417785444636117, + "grad_norm": 5.968321800231934, + "learning_rate": 1.998776042378785e-05, + "loss": 1.3064, + "step": 9134 + }, + { + "epoch": 0.11420285507137679, + "grad_norm": 2.8108983039855957, + "learning_rate": 1.9987717221456054e-05, + "loss": 0.6281, + "step": 9136 + }, + { + "epoch": 0.1142278556963924, + "grad_norm": 3.2461674213409424, + "learning_rate": 1.9987673943059214e-05, + "loss": 0.7353, + "step": 9138 + }, + { + "epoch": 0.11425285632140804, + "grad_norm": 2.7558703422546387, + "learning_rate": 1.9987630588597662e-05, + "loss": 0.5857, + "step": 9140 + }, + { + "epoch": 0.11427785694642366, + "grad_norm": 1.0605567693710327, + "learning_rate": 1.998758715807173e-05, + "loss": 0.8689, + "step": 9142 + }, + { + "epoch": 0.11430285757143929, + "grad_norm": 0.7763286232948303, + "learning_rate": 1.9987543651481743e-05, + "loss": 0.025, + "step": 9144 + }, + { + "epoch": 0.11432785819645491, + "grad_norm": 0.05912661552429199, + "learning_rate": 1.9987500068828042e-05, + "loss": 0.5323, + "step": 9146 + }, + { + "epoch": 0.11435285882147053, + "grad_norm": 3.434727430343628, + "learning_rate": 1.9987456410110945e-05, + "loss": 0.8817, + "step": 9148 + }, + { + "epoch": 0.11437785944648617, + "grad_norm": 2.971885919570923, + "learning_rate": 1.9987412675330797e-05, + "loss": 0.9009, + "step": 9150 + }, + { + "epoch": 0.11440286007150179, + "grad_norm": 1.474133849143982, + "learning_rate": 1.9987368864487928e-05, + "loss": 1.3331, + "step": 9152 + }, + { + "epoch": 0.11442786069651742, + "grad_norm": 0.008055760525166988, + "learning_rate": 1.9987324977582666e-05, + "loss": 0.0178, + "step": 9154 + }, + { + "epoch": 0.11445286132153304, + "grad_norm": 2.959416389465332, + "learning_rate": 1.9987281014615352e-05, + "loss": 0.7782, + "step": 9156 + }, + { + "epoch": 0.11447786194654866, + "grad_norm": 2.6257364749908447, + "learning_rate": 1.9987236975586313e-05, + "loss": 0.6904, + "step": 9158 + }, + { + "epoch": 0.11450286257156429, + "grad_norm": 2.317469358444214, + "learning_rate": 1.9987192860495898e-05, + "loss": 0.4828, + "step": 9160 + }, + { + "epoch": 0.11452786319657991, + "grad_norm": 3.465416431427002, + "learning_rate": 1.9987148669344427e-05, + "loss": 1.2839, + "step": 9162 + }, + { + "epoch": 0.11455286382159555, + "grad_norm": 2.0712087154388428, + "learning_rate": 1.998710440213225e-05, + "loss": 0.1287, + "step": 9164 + }, + { + "epoch": 0.11457786444661117, + "grad_norm": 4.924625873565674, + "learning_rate": 1.9987060058859694e-05, + "loss": 1.561, + "step": 9166 + }, + { + "epoch": 0.11460286507162679, + "grad_norm": 2.573763847351074, + "learning_rate": 1.9987015639527102e-05, + "loss": 0.7477, + "step": 9168 + }, + { + "epoch": 0.11462786569664242, + "grad_norm": 2.2560465335845947, + "learning_rate": 1.998697114413481e-05, + "loss": 0.6165, + "step": 9170 + }, + { + "epoch": 0.11465286632165804, + "grad_norm": 3.581772804260254, + "learning_rate": 1.998692657268316e-05, + "loss": 1.1502, + "step": 9172 + }, + { + "epoch": 0.11467786694667367, + "grad_norm": 2.886908769607544, + "learning_rate": 1.998688192517249e-05, + "loss": 1.5396, + "step": 9174 + }, + { + "epoch": 0.1147028675716893, + "grad_norm": 7.8885345458984375, + "learning_rate": 1.998683720160314e-05, + "loss": 1.04, + "step": 9176 + }, + { + "epoch": 0.11472786819670491, + "grad_norm": 3.6267871856689453, + "learning_rate": 1.9986792401975448e-05, + "loss": 0.948, + "step": 9178 + }, + { + "epoch": 0.11475286882172055, + "grad_norm": 2.8645260334014893, + "learning_rate": 1.9986747526289758e-05, + "loss": 1.4878, + "step": 9180 + }, + { + "epoch": 0.11477786944673617, + "grad_norm": 4.214166164398193, + "learning_rate": 1.998670257454641e-05, + "loss": 0.6931, + "step": 9182 + }, + { + "epoch": 0.1148028700717518, + "grad_norm": 2.579113483428955, + "learning_rate": 1.998665754674575e-05, + "loss": 1.3741, + "step": 9184 + }, + { + "epoch": 0.11482787069676742, + "grad_norm": 0.9919914603233337, + "learning_rate": 1.9986612442888113e-05, + "loss": 0.0556, + "step": 9186 + }, + { + "epoch": 0.11485287132178304, + "grad_norm": 4.867489814758301, + "learning_rate": 1.9986567262973856e-05, + "loss": 1.5555, + "step": 9188 + }, + { + "epoch": 0.11487787194679867, + "grad_norm": 2.168311357498169, + "learning_rate": 1.998652200700331e-05, + "loss": 1.5135, + "step": 9190 + }, + { + "epoch": 0.1149028725718143, + "grad_norm": 13.505965232849121, + "learning_rate": 1.9986476674976822e-05, + "loss": 1.5175, + "step": 9192 + }, + { + "epoch": 0.11492787319682993, + "grad_norm": 4.367381572723389, + "learning_rate": 1.9986431266894746e-05, + "loss": 1.0042, + "step": 9194 + }, + { + "epoch": 0.11495287382184555, + "grad_norm": 2.232832431793213, + "learning_rate": 1.998638578275742e-05, + "loss": 0.0685, + "step": 9196 + }, + { + "epoch": 0.11497787444686117, + "grad_norm": 0.006801709067076445, + "learning_rate": 1.998634022256519e-05, + "loss": 0.1643, + "step": 9198 + }, + { + "epoch": 0.1150028750718768, + "grad_norm": 3.7804737091064453, + "learning_rate": 1.9986294586318404e-05, + "loss": 1.8304, + "step": 9200 + }, + { + "epoch": 0.11502787569689242, + "grad_norm": 3.402268171310425, + "learning_rate": 1.9986248874017414e-05, + "loss": 0.9655, + "step": 9202 + }, + { + "epoch": 0.11505287632190805, + "grad_norm": 1.8220044374465942, + "learning_rate": 1.998620308566256e-05, + "loss": 1.0305, + "step": 9204 + }, + { + "epoch": 0.11507787694692367, + "grad_norm": 3.2264881134033203, + "learning_rate": 1.9986157221254202e-05, + "loss": 0.7433, + "step": 9206 + }, + { + "epoch": 0.1151028775719393, + "grad_norm": 3.3529441356658936, + "learning_rate": 1.9986111280792677e-05, + "loss": 0.6732, + "step": 9208 + }, + { + "epoch": 0.11512787819695493, + "grad_norm": 3.320216417312622, + "learning_rate": 1.9986065264278345e-05, + "loss": 1.1475, + "step": 9210 + }, + { + "epoch": 0.11515287882197055, + "grad_norm": 0.04895170405507088, + "learning_rate": 1.998601917171155e-05, + "loss": 1.1058, + "step": 9212 + }, + { + "epoch": 0.11517787944698618, + "grad_norm": 0.963151216506958, + "learning_rate": 1.9985973003092644e-05, + "loss": 0.1424, + "step": 9214 + }, + { + "epoch": 0.1152028800720018, + "grad_norm": 3.6890454292297363, + "learning_rate": 1.9985926758421984e-05, + "loss": 1.2415, + "step": 9216 + }, + { + "epoch": 0.11522788069701742, + "grad_norm": 3.2913625240325928, + "learning_rate": 1.9985880437699915e-05, + "loss": 1.3583, + "step": 9218 + }, + { + "epoch": 0.11525288132203305, + "grad_norm": 3.2302372455596924, + "learning_rate": 1.998583404092679e-05, + "loss": 1.2186, + "step": 9220 + }, + { + "epoch": 0.11527788194704867, + "grad_norm": 3.6733829975128174, + "learning_rate": 1.998578756810297e-05, + "loss": 1.4532, + "step": 9222 + }, + { + "epoch": 0.11530288257206431, + "grad_norm": 0.012842404656112194, + "learning_rate": 1.9985741019228803e-05, + "loss": 0.0016, + "step": 9224 + }, + { + "epoch": 0.11532788319707993, + "grad_norm": 4.554096698760986, + "learning_rate": 1.9985694394304643e-05, + "loss": 1.8398, + "step": 9226 + }, + { + "epoch": 0.11535288382209555, + "grad_norm": 1.353927731513977, + "learning_rate": 1.9985647693330846e-05, + "loss": 0.1906, + "step": 9228 + }, + { + "epoch": 0.11537788444711118, + "grad_norm": 8.534396171569824, + "learning_rate": 1.9985600916307768e-05, + "loss": 0.6246, + "step": 9230 + }, + { + "epoch": 0.1154028850721268, + "grad_norm": 3.9393067359924316, + "learning_rate": 1.9985554063235768e-05, + "loss": 0.7807, + "step": 9232 + }, + { + "epoch": 0.11542788569714243, + "grad_norm": 2.82891845703125, + "learning_rate": 1.9985507134115202e-05, + "loss": 0.599, + "step": 9234 + }, + { + "epoch": 0.11545288632215805, + "grad_norm": 0.0963844284415245, + "learning_rate": 1.9985460128946422e-05, + "loss": 0.5855, + "step": 9236 + }, + { + "epoch": 0.11547788694717367, + "grad_norm": 2.363783836364746, + "learning_rate": 1.998541304772979e-05, + "loss": 1.0275, + "step": 9238 + }, + { + "epoch": 0.11550288757218931, + "grad_norm": 2.7082066535949707, + "learning_rate": 1.9985365890465664e-05, + "loss": 0.8727, + "step": 9240 + }, + { + "epoch": 0.11552788819720493, + "grad_norm": 0.00853289756923914, + "learning_rate": 1.9985318657154404e-05, + "loss": 0.471, + "step": 9242 + }, + { + "epoch": 0.11555288882222056, + "grad_norm": 2.308331251144409, + "learning_rate": 1.998527134779637e-05, + "loss": 0.5894, + "step": 9244 + }, + { + "epoch": 0.11557788944723618, + "grad_norm": 3.567345142364502, + "learning_rate": 1.9985223962391922e-05, + "loss": 1.1415, + "step": 9246 + }, + { + "epoch": 0.1156028900722518, + "grad_norm": 5.699481964111328, + "learning_rate": 1.9985176500941418e-05, + "loss": 0.0815, + "step": 9248 + }, + { + "epoch": 0.11562789069726744, + "grad_norm": 4.036313056945801, + "learning_rate": 1.998512896344522e-05, + "loss": 1.6202, + "step": 9250 + }, + { + "epoch": 0.11565289132228306, + "grad_norm": 0.004710851702839136, + "learning_rate": 1.9985081349903697e-05, + "loss": 0.7825, + "step": 9252 + }, + { + "epoch": 0.11567789194729869, + "grad_norm": 2.826616048812866, + "learning_rate": 1.9985033660317205e-05, + "loss": 0.5272, + "step": 9254 + }, + { + "epoch": 0.11570289257231431, + "grad_norm": 4.331600189208984, + "learning_rate": 1.9984985894686108e-05, + "loss": 2.2702, + "step": 9256 + }, + { + "epoch": 0.11572789319732993, + "grad_norm": 5.784297466278076, + "learning_rate": 1.998493805301077e-05, + "loss": 0.4191, + "step": 9258 + }, + { + "epoch": 0.11575289382234556, + "grad_norm": 4.419131755828857, + "learning_rate": 1.9984890135291557e-05, + "loss": 1.1605, + "step": 9260 + }, + { + "epoch": 0.11577789444736118, + "grad_norm": 1.5953052043914795, + "learning_rate": 1.998484214152883e-05, + "loss": 0.6555, + "step": 9262 + }, + { + "epoch": 0.11580289507237682, + "grad_norm": 2.6202166080474854, + "learning_rate": 1.998479407172296e-05, + "loss": 0.5507, + "step": 9264 + }, + { + "epoch": 0.11582789569739244, + "grad_norm": 4.330026149749756, + "learning_rate": 1.998474592587431e-05, + "loss": 1.343, + "step": 9266 + }, + { + "epoch": 0.11585289632240806, + "grad_norm": 5.783157825469971, + "learning_rate": 1.998469770398325e-05, + "loss": 1.7966, + "step": 9268 + }, + { + "epoch": 0.11587789694742369, + "grad_norm": 1.807675838470459, + "learning_rate": 1.998464940605014e-05, + "loss": 0.6709, + "step": 9270 + }, + { + "epoch": 0.11590289757243931, + "grad_norm": 3.3085548877716064, + "learning_rate": 1.9984601032075353e-05, + "loss": 1.401, + "step": 9272 + }, + { + "epoch": 0.11592789819745494, + "grad_norm": 1.5706247091293335, + "learning_rate": 1.9984552582059256e-05, + "loss": 1.0967, + "step": 9274 + }, + { + "epoch": 0.11595289882247056, + "grad_norm": 2.674718141555786, + "learning_rate": 1.9984504056002216e-05, + "loss": 0.3377, + "step": 9276 + }, + { + "epoch": 0.11597789944748618, + "grad_norm": 5.265970706939697, + "learning_rate": 1.998445545390461e-05, + "loss": 0.6935, + "step": 9278 + }, + { + "epoch": 0.11600290007250182, + "grad_norm": 4.855103015899658, + "learning_rate": 1.99844067757668e-05, + "loss": 1.4463, + "step": 9280 + }, + { + "epoch": 0.11602790069751744, + "grad_norm": 5.648382663726807, + "learning_rate": 1.9984358021589162e-05, + "loss": 1.2895, + "step": 9282 + }, + { + "epoch": 0.11605290132253307, + "grad_norm": 0.00758650666102767, + "learning_rate": 1.9984309191372064e-05, + "loss": 0.0033, + "step": 9284 + }, + { + "epoch": 0.11607790194754869, + "grad_norm": 2.2397806644439697, + "learning_rate": 1.998426028511588e-05, + "loss": 1.312, + "step": 9286 + }, + { + "epoch": 0.11610290257256431, + "grad_norm": 4.464367389678955, + "learning_rate": 1.998421130282098e-05, + "loss": 0.9336, + "step": 9288 + }, + { + "epoch": 0.11612790319757994, + "grad_norm": 2.713841438293457, + "learning_rate": 1.998416224448774e-05, + "loss": 0.39, + "step": 9290 + }, + { + "epoch": 0.11615290382259556, + "grad_norm": 0.0024488631170243025, + "learning_rate": 1.998411311011653e-05, + "loss": 0.6526, + "step": 9292 + }, + { + "epoch": 0.1161779044476112, + "grad_norm": 3.8328092098236084, + "learning_rate": 1.998406389970773e-05, + "loss": 1.7043, + "step": 9294 + }, + { + "epoch": 0.11620290507262682, + "grad_norm": 4.246939182281494, + "learning_rate": 1.998401461326171e-05, + "loss": 0.2921, + "step": 9296 + }, + { + "epoch": 0.11622790569764244, + "grad_norm": 3.3404335975646973, + "learning_rate": 1.9983965250778844e-05, + "loss": 1.1719, + "step": 9298 + }, + { + "epoch": 0.11625290632265807, + "grad_norm": 2.9952590465545654, + "learning_rate": 1.9983915812259512e-05, + "loss": 1.1475, + "step": 9300 + }, + { + "epoch": 0.11627790694767369, + "grad_norm": 4.071412086486816, + "learning_rate": 1.9983866297704092e-05, + "loss": 1.2505, + "step": 9302 + }, + { + "epoch": 0.11630290757268932, + "grad_norm": 3.03240704536438, + "learning_rate": 1.9983816707112954e-05, + "loss": 1.3228, + "step": 9304 + }, + { + "epoch": 0.11632790819770494, + "grad_norm": 6.136419296264648, + "learning_rate": 1.9983767040486484e-05, + "loss": 1.5143, + "step": 9306 + }, + { + "epoch": 0.11635290882272056, + "grad_norm": 2.0731778144836426, + "learning_rate": 1.998371729782505e-05, + "loss": 1.455, + "step": 9308 + }, + { + "epoch": 0.1163779094477362, + "grad_norm": 0.05435887724161148, + "learning_rate": 1.998366747912904e-05, + "loss": 0.5469, + "step": 9310 + }, + { + "epoch": 0.11640291007275182, + "grad_norm": 2.8204386234283447, + "learning_rate": 1.998361758439883e-05, + "loss": 1.4486, + "step": 9312 + }, + { + "epoch": 0.11642791069776745, + "grad_norm": 6.894687175750732, + "learning_rate": 1.9983567613634803e-05, + "loss": 1.0192, + "step": 9314 + }, + { + "epoch": 0.11645291132278307, + "grad_norm": 6.286653518676758, + "learning_rate": 1.9983517566837338e-05, + "loss": 0.7298, + "step": 9316 + }, + { + "epoch": 0.11647791194779869, + "grad_norm": 4.335163593292236, + "learning_rate": 1.998346744400681e-05, + "loss": 1.1417, + "step": 9318 + }, + { + "epoch": 0.11650291257281432, + "grad_norm": 3.7817394733428955, + "learning_rate": 1.9983417245143606e-05, + "loss": 1.6739, + "step": 9320 + }, + { + "epoch": 0.11652791319782994, + "grad_norm": 0.9029420018196106, + "learning_rate": 1.998336697024811e-05, + "loss": 0.3156, + "step": 9322 + }, + { + "epoch": 0.11655291382284558, + "grad_norm": 4.162932872772217, + "learning_rate": 1.9983316619320702e-05, + "loss": 0.95, + "step": 9324 + }, + { + "epoch": 0.1165779144478612, + "grad_norm": 2.8559999465942383, + "learning_rate": 1.9983266192361768e-05, + "loss": 1.3111, + "step": 9326 + }, + { + "epoch": 0.11660291507287682, + "grad_norm": 2.9674124717712402, + "learning_rate": 1.998321568937169e-05, + "loss": 0.7605, + "step": 9328 + }, + { + "epoch": 0.11662791569789245, + "grad_norm": 2.3002703189849854, + "learning_rate": 1.9983165110350852e-05, + "loss": 0.9991, + "step": 9330 + }, + { + "epoch": 0.11665291632290807, + "grad_norm": 5.983363151550293, + "learning_rate": 1.998311445529964e-05, + "loss": 0.4502, + "step": 9332 + }, + { + "epoch": 0.1166779169479237, + "grad_norm": 2.440786361694336, + "learning_rate": 1.9983063724218442e-05, + "loss": 1.1568, + "step": 9334 + }, + { + "epoch": 0.11670291757293932, + "grad_norm": 3.7496676445007324, + "learning_rate": 1.9983012917107642e-05, + "loss": 1.9376, + "step": 9336 + }, + { + "epoch": 0.11672791819795494, + "grad_norm": 0.1258838027715683, + "learning_rate": 1.9982962033967625e-05, + "loss": 0.1392, + "step": 9338 + }, + { + "epoch": 0.11675291882297058, + "grad_norm": 0.005585241597145796, + "learning_rate": 1.998291107479878e-05, + "loss": 0.0071, + "step": 9340 + }, + { + "epoch": 0.1167779194479862, + "grad_norm": 3.95709228515625, + "learning_rate": 1.99828600396015e-05, + "loss": 1.0499, + "step": 9342 + }, + { + "epoch": 0.11680292007300183, + "grad_norm": 0.0031084695365279913, + "learning_rate": 1.9982808928376167e-05, + "loss": 0.7793, + "step": 9344 + }, + { + "epoch": 0.11682792069801745, + "grad_norm": 3.1329798698425293, + "learning_rate": 1.9982757741123177e-05, + "loss": 0.8948, + "step": 9346 + }, + { + "epoch": 0.11685292132303307, + "grad_norm": 0.003759341547265649, + "learning_rate": 1.998270647784291e-05, + "loss": 0.7651, + "step": 9348 + }, + { + "epoch": 0.1168779219480487, + "grad_norm": 8.520059585571289, + "learning_rate": 1.9982655138535767e-05, + "loss": 1.9166, + "step": 9350 + }, + { + "epoch": 0.11690292257306432, + "grad_norm": 5.827334403991699, + "learning_rate": 1.9982603723202133e-05, + "loss": 1.733, + "step": 9352 + }, + { + "epoch": 0.11692792319807996, + "grad_norm": 3.6277966499328613, + "learning_rate": 1.9982552231842397e-05, + "loss": 1.0374, + "step": 9354 + }, + { + "epoch": 0.11695292382309558, + "grad_norm": 0.0032397215254604816, + "learning_rate": 1.9982500664456958e-05, + "loss": 0.0709, + "step": 9356 + }, + { + "epoch": 0.1169779244481112, + "grad_norm": 0.006431614980101585, + "learning_rate": 1.9982449021046205e-05, + "loss": 0.4797, + "step": 9358 + }, + { + "epoch": 0.11700292507312683, + "grad_norm": 3.247748613357544, + "learning_rate": 1.9982397301610532e-05, + "loss": 0.763, + "step": 9360 + }, + { + "epoch": 0.11702792569814245, + "grad_norm": 2.572265148162842, + "learning_rate": 1.9982345506150335e-05, + "loss": 1.0453, + "step": 9362 + }, + { + "epoch": 0.11705292632315809, + "grad_norm": 3.3234033584594727, + "learning_rate": 1.9982293634666003e-05, + "loss": 1.3077, + "step": 9364 + }, + { + "epoch": 0.1170779269481737, + "grad_norm": 0.0017323584761470556, + "learning_rate": 1.9982241687157934e-05, + "loss": 0.3456, + "step": 9366 + }, + { + "epoch": 0.11710292757318932, + "grad_norm": 3.1734418869018555, + "learning_rate": 1.9982189663626526e-05, + "loss": 2.8575, + "step": 9368 + }, + { + "epoch": 0.11712792819820496, + "grad_norm": 0.7230618000030518, + "learning_rate": 1.9982137564072173e-05, + "loss": 0.6744, + "step": 9370 + }, + { + "epoch": 0.11715292882322058, + "grad_norm": 2.0108935832977295, + "learning_rate": 1.9982085388495268e-05, + "loss": 0.6375, + "step": 9372 + }, + { + "epoch": 0.11717792944823621, + "grad_norm": 3.225041151046753, + "learning_rate": 1.9982033136896218e-05, + "loss": 1.3692, + "step": 9374 + }, + { + "epoch": 0.11720293007325183, + "grad_norm": 3.179450035095215, + "learning_rate": 1.998198080927541e-05, + "loss": 1.9401, + "step": 9376 + }, + { + "epoch": 0.11722793069826745, + "grad_norm": 3.7126331329345703, + "learning_rate": 1.9981928405633253e-05, + "loss": 1.2368, + "step": 9378 + }, + { + "epoch": 0.11725293132328309, + "grad_norm": 2.986750602722168, + "learning_rate": 1.998187592597014e-05, + "loss": 1.1581, + "step": 9380 + }, + { + "epoch": 0.1172779319482987, + "grad_norm": 5.505577564239502, + "learning_rate": 1.998182337028647e-05, + "loss": 1.7052, + "step": 9382 + }, + { + "epoch": 0.11730293257331434, + "grad_norm": 2.787106990814209, + "learning_rate": 1.9981770738582644e-05, + "loss": 0.1903, + "step": 9384 + }, + { + "epoch": 0.11732793319832996, + "grad_norm": 2.578989028930664, + "learning_rate": 1.998171803085906e-05, + "loss": 1.5024, + "step": 9386 + }, + { + "epoch": 0.11735293382334558, + "grad_norm": 3.114821195602417, + "learning_rate": 1.998166524711613e-05, + "loss": 1.6128, + "step": 9388 + }, + { + "epoch": 0.11737793444836121, + "grad_norm": 2.4002320766448975, + "learning_rate": 1.9981612387354247e-05, + "loss": 0.5841, + "step": 9390 + }, + { + "epoch": 0.11740293507337683, + "grad_norm": 4.010537624359131, + "learning_rate": 1.9981559451573816e-05, + "loss": 1.0182, + "step": 9392 + }, + { + "epoch": 0.11742793569839247, + "grad_norm": 2.9755890369415283, + "learning_rate": 1.998150643977524e-05, + "loss": 0.8264, + "step": 9394 + }, + { + "epoch": 0.11745293632340809, + "grad_norm": 0.7973011136054993, + "learning_rate": 1.9981453351958917e-05, + "loss": 0.8339, + "step": 9396 + }, + { + "epoch": 0.1174779369484237, + "grad_norm": 4.109920501708984, + "learning_rate": 1.998140018812526e-05, + "loss": 0.9844, + "step": 9398 + }, + { + "epoch": 0.11750293757343934, + "grad_norm": 4.243297100067139, + "learning_rate": 1.9981346948274673e-05, + "loss": 0.4298, + "step": 9400 + }, + { + "epoch": 0.11752793819845496, + "grad_norm": 2.032414436340332, + "learning_rate": 1.998129363240756e-05, + "loss": 1.37, + "step": 9402 + }, + { + "epoch": 0.11755293882347059, + "grad_norm": 2.815365791320801, + "learning_rate": 1.9981240240524323e-05, + "loss": 1.6268, + "step": 9404 + }, + { + "epoch": 0.11757793944848621, + "grad_norm": 8.321577072143555, + "learning_rate": 1.998118677262537e-05, + "loss": 1.631, + "step": 9406 + }, + { + "epoch": 0.11760294007350183, + "grad_norm": 2.529353618621826, + "learning_rate": 1.9981133228711114e-05, + "loss": 0.6546, + "step": 9408 + }, + { + "epoch": 0.11762794069851747, + "grad_norm": 2.838003396987915, + "learning_rate": 1.998107960878196e-05, + "loss": 0.7179, + "step": 9410 + }, + { + "epoch": 0.11765294132353309, + "grad_norm": 2.3404626846313477, + "learning_rate": 1.9981025912838313e-05, + "loss": 0.8741, + "step": 9412 + }, + { + "epoch": 0.11767794194854872, + "grad_norm": 0.17203043401241302, + "learning_rate": 1.998097214088058e-05, + "loss": 0.7311, + "step": 9414 + }, + { + "epoch": 0.11770294257356434, + "grad_norm": 3.720689296722412, + "learning_rate": 1.998091829290918e-05, + "loss": 0.2601, + "step": 9416 + }, + { + "epoch": 0.11772794319857996, + "grad_norm": 4.6481428146362305, + "learning_rate": 1.9980864368924514e-05, + "loss": 0.2609, + "step": 9418 + }, + { + "epoch": 0.1177529438235956, + "grad_norm": 3.6856610774993896, + "learning_rate": 1.9980810368927e-05, + "loss": 0.8493, + "step": 9420 + }, + { + "epoch": 0.11777794444861121, + "grad_norm": 3.3869330883026123, + "learning_rate": 1.9980756292917044e-05, + "loss": 1.7871, + "step": 9422 + }, + { + "epoch": 0.11780294507362685, + "grad_norm": 0.0462748259305954, + "learning_rate": 1.998070214089506e-05, + "loss": 0.9527, + "step": 9424 + }, + { + "epoch": 0.11782794569864247, + "grad_norm": 4.599856376647949, + "learning_rate": 1.998064791286146e-05, + "loss": 1.8563, + "step": 9426 + }, + { + "epoch": 0.11785294632365809, + "grad_norm": 4.06142520904541, + "learning_rate": 1.9980593608816658e-05, + "loss": 2.4275, + "step": 9428 + }, + { + "epoch": 0.11787794694867372, + "grad_norm": 2.066086530685425, + "learning_rate": 1.998053922876106e-05, + "loss": 1.8133, + "step": 9430 + }, + { + "epoch": 0.11790294757368934, + "grad_norm": 3.9580492973327637, + "learning_rate": 1.9980484772695092e-05, + "loss": 1.6902, + "step": 9432 + }, + { + "epoch": 0.11792794819870497, + "grad_norm": 1.4937734603881836, + "learning_rate": 1.9980430240619165e-05, + "loss": 0.8111, + "step": 9434 + }, + { + "epoch": 0.1179529488237206, + "grad_norm": 2.0786848068237305, + "learning_rate": 1.998037563253369e-05, + "loss": 0.1335, + "step": 9436 + }, + { + "epoch": 0.11797794944873621, + "grad_norm": 2.991392135620117, + "learning_rate": 1.998032094843909e-05, + "loss": 0.5931, + "step": 9438 + }, + { + "epoch": 0.11800295007375185, + "grad_norm": 2.9731481075286865, + "learning_rate": 1.998026618833577e-05, + "loss": 0.9285, + "step": 9440 + }, + { + "epoch": 0.11802795069876747, + "grad_norm": 3.5893054008483887, + "learning_rate": 1.9980211352224156e-05, + "loss": 0.9158, + "step": 9442 + }, + { + "epoch": 0.1180529513237831, + "grad_norm": 3.672213554382324, + "learning_rate": 1.9980156440104664e-05, + "loss": 1.4235, + "step": 9444 + }, + { + "epoch": 0.11807795194879872, + "grad_norm": 5.301119327545166, + "learning_rate": 1.998010145197771e-05, + "loss": 2.1792, + "step": 9446 + }, + { + "epoch": 0.11810295257381434, + "grad_norm": 1.2500145435333252, + "learning_rate": 1.998004638784372e-05, + "loss": 0.1809, + "step": 9448 + }, + { + "epoch": 0.11812795319882997, + "grad_norm": 0.004947028122842312, + "learning_rate": 1.9979991247703104e-05, + "loss": 0.4177, + "step": 9450 + }, + { + "epoch": 0.1181529538238456, + "grad_norm": 4.158546447753906, + "learning_rate": 1.9979936031556287e-05, + "loss": 1.2527, + "step": 9452 + }, + { + "epoch": 0.11817795444886123, + "grad_norm": 3.741978883743286, + "learning_rate": 1.997988073940369e-05, + "loss": 1.4928, + "step": 9454 + }, + { + "epoch": 0.11820295507387685, + "grad_norm": 7.346352577209473, + "learning_rate": 1.9979825371245728e-05, + "loss": 0.7106, + "step": 9456 + }, + { + "epoch": 0.11822795569889247, + "grad_norm": 2.8825130462646484, + "learning_rate": 1.9979769927082833e-05, + "loss": 1.6952, + "step": 9458 + }, + { + "epoch": 0.1182529563239081, + "grad_norm": 0.0035512344911694527, + "learning_rate": 1.9979714406915417e-05, + "loss": 0.1119, + "step": 9460 + }, + { + "epoch": 0.11827795694892372, + "grad_norm": 0.6711738109588623, + "learning_rate": 1.997965881074391e-05, + "loss": 0.0198, + "step": 9462 + }, + { + "epoch": 0.11830295757393935, + "grad_norm": 3.1822879314422607, + "learning_rate": 1.997960313856873e-05, + "loss": 0.7485, + "step": 9464 + }, + { + "epoch": 0.11832795819895497, + "grad_norm": 3.4418604373931885, + "learning_rate": 1.997954739039031e-05, + "loss": 1.4146, + "step": 9466 + }, + { + "epoch": 0.1183529588239706, + "grad_norm": 2.9170219898223877, + "learning_rate": 1.9979491566209063e-05, + "loss": 0.6638, + "step": 9468 + }, + { + "epoch": 0.11837795944898623, + "grad_norm": 4.865732192993164, + "learning_rate": 1.997943566602542e-05, + "loss": 0.7883, + "step": 9470 + }, + { + "epoch": 0.11840296007400185, + "grad_norm": 1.5331426858901978, + "learning_rate": 1.9979379689839807e-05, + "loss": 0.3043, + "step": 9472 + }, + { + "epoch": 0.11842796069901748, + "grad_norm": 0.05426199361681938, + "learning_rate": 1.9979323637652647e-05, + "loss": 0.8145, + "step": 9474 + }, + { + "epoch": 0.1184529613240331, + "grad_norm": 3.477275848388672, + "learning_rate": 1.997926750946437e-05, + "loss": 1.2798, + "step": 9476 + }, + { + "epoch": 0.11847796194904872, + "grad_norm": 1.3787685632705688, + "learning_rate": 1.9979211305275408e-05, + "loss": 0.2895, + "step": 9478 + }, + { + "epoch": 0.11850296257406435, + "grad_norm": 2.61311674118042, + "learning_rate": 1.997915502508618e-05, + "loss": 0.8669, + "step": 9480 + }, + { + "epoch": 0.11852796319907997, + "grad_norm": 5.198050022125244, + "learning_rate": 1.997909866889712e-05, + "loss": 1.6172, + "step": 9482 + }, + { + "epoch": 0.11855296382409561, + "grad_norm": 2.202000379562378, + "learning_rate": 1.997904223670865e-05, + "loss": 0.5062, + "step": 9484 + }, + { + "epoch": 0.11857796444911123, + "grad_norm": 3.973099708557129, + "learning_rate": 1.9978985728521215e-05, + "loss": 0.6774, + "step": 9486 + }, + { + "epoch": 0.11860296507412685, + "grad_norm": 3.3173980712890625, + "learning_rate": 1.997892914433523e-05, + "loss": 2.4863, + "step": 9488 + }, + { + "epoch": 0.11862796569914248, + "grad_norm": 2.522921562194824, + "learning_rate": 1.997887248415113e-05, + "loss": 0.1833, + "step": 9490 + }, + { + "epoch": 0.1186529663241581, + "grad_norm": 4.911849021911621, + "learning_rate": 1.9978815747969354e-05, + "loss": 0.4328, + "step": 9492 + }, + { + "epoch": 0.11867796694917374, + "grad_norm": 2.0456228256225586, + "learning_rate": 1.9978758935790322e-05, + "loss": 1.055, + "step": 9494 + }, + { + "epoch": 0.11870296757418936, + "grad_norm": 11.542919158935547, + "learning_rate": 1.9978702047614478e-05, + "loss": 0.1315, + "step": 9496 + }, + { + "epoch": 0.11872796819920498, + "grad_norm": 0.004783821292221546, + "learning_rate": 1.997864508344225e-05, + "loss": 0.9915, + "step": 9498 + }, + { + "epoch": 0.11875296882422061, + "grad_norm": 5.356696605682373, + "learning_rate": 1.997858804327407e-05, + "loss": 2.3703, + "step": 9500 + }, + { + "epoch": 0.11877796944923623, + "grad_norm": 0.004517064429819584, + "learning_rate": 1.9978530927110375e-05, + "loss": 0.5694, + "step": 9502 + }, + { + "epoch": 0.11880297007425186, + "grad_norm": 3.417635202407837, + "learning_rate": 1.99784737349516e-05, + "loss": 0.2885, + "step": 9504 + }, + { + "epoch": 0.11882797069926748, + "grad_norm": 2.8272740840911865, + "learning_rate": 1.9978416466798182e-05, + "loss": 0.8636, + "step": 9506 + }, + { + "epoch": 0.1188529713242831, + "grad_norm": 3.8750596046447754, + "learning_rate": 1.9978359122650553e-05, + "loss": 1.3336, + "step": 9508 + }, + { + "epoch": 0.11887797194929874, + "grad_norm": 4.456325531005859, + "learning_rate": 1.9978301702509154e-05, + "loss": 1.3362, + "step": 9510 + }, + { + "epoch": 0.11890297257431436, + "grad_norm": 2.073902130126953, + "learning_rate": 1.9978244206374416e-05, + "loss": 0.7988, + "step": 9512 + }, + { + "epoch": 0.11892797319932999, + "grad_norm": 3.641404390335083, + "learning_rate": 1.9978186634246785e-05, + "loss": 1.5283, + "step": 9514 + }, + { + "epoch": 0.11895297382434561, + "grad_norm": 3.880868434906006, + "learning_rate": 1.9978128986126695e-05, + "loss": 0.7503, + "step": 9516 + }, + { + "epoch": 0.11897797444936123, + "grad_norm": 3.7111432552337646, + "learning_rate": 1.9978071262014584e-05, + "loss": 0.8481, + "step": 9518 + }, + { + "epoch": 0.11900297507437686, + "grad_norm": 3.6236753463745117, + "learning_rate": 1.9978013461910895e-05, + "loss": 0.9106, + "step": 9520 + }, + { + "epoch": 0.11902797569939248, + "grad_norm": 3.0252785682678223, + "learning_rate": 1.9977955585816065e-05, + "loss": 1.3016, + "step": 9522 + }, + { + "epoch": 0.11905297632440812, + "grad_norm": 2.715778350830078, + "learning_rate": 1.9977897633730538e-05, + "loss": 1.1216, + "step": 9524 + }, + { + "epoch": 0.11907797694942374, + "grad_norm": 6.14186954498291, + "learning_rate": 1.997783960565475e-05, + "loss": 1.0637, + "step": 9526 + }, + { + "epoch": 0.11910297757443936, + "grad_norm": 2.7687127590179443, + "learning_rate": 1.9977781501589152e-05, + "loss": 0.6646, + "step": 9528 + }, + { + "epoch": 0.11912797819945499, + "grad_norm": 3.2550621032714844, + "learning_rate": 1.9977723321534177e-05, + "loss": 1.2376, + "step": 9530 + }, + { + "epoch": 0.11915297882447061, + "grad_norm": 7.273950576782227, + "learning_rate": 1.9977665065490274e-05, + "loss": 0.5231, + "step": 9532 + }, + { + "epoch": 0.11917797944948624, + "grad_norm": 4.5078301429748535, + "learning_rate": 1.9977606733457887e-05, + "loss": 1.644, + "step": 9534 + }, + { + "epoch": 0.11920298007450186, + "grad_norm": 2.740295171737671, + "learning_rate": 1.9977548325437452e-05, + "loss": 0.805, + "step": 9536 + }, + { + "epoch": 0.11922798069951748, + "grad_norm": 1.7799196243286133, + "learning_rate": 1.9977489841429424e-05, + "loss": 1.4923, + "step": 9538 + }, + { + "epoch": 0.11925298132453312, + "grad_norm": 7.053293228149414, + "learning_rate": 1.9977431281434243e-05, + "loss": 1.488, + "step": 9540 + }, + { + "epoch": 0.11927798194954874, + "grad_norm": 1.641975998878479, + "learning_rate": 1.997737264545236e-05, + "loss": 1.2078, + "step": 9542 + }, + { + "epoch": 0.11930298257456437, + "grad_norm": 3.389402151107788, + "learning_rate": 1.9977313933484215e-05, + "loss": 0.2635, + "step": 9544 + }, + { + "epoch": 0.11932798319957999, + "grad_norm": 3.4096689224243164, + "learning_rate": 1.9977255145530258e-05, + "loss": 0.4015, + "step": 9546 + }, + { + "epoch": 0.11935298382459561, + "grad_norm": 2.84663987159729, + "learning_rate": 1.9977196281590935e-05, + "loss": 0.6091, + "step": 9548 + }, + { + "epoch": 0.11937798444961124, + "grad_norm": 4.637609481811523, + "learning_rate": 1.9977137341666697e-05, + "loss": 1.0217, + "step": 9550 + }, + { + "epoch": 0.11940298507462686, + "grad_norm": 4.445004463195801, + "learning_rate": 1.997707832575799e-05, + "loss": 1.0471, + "step": 9552 + }, + { + "epoch": 0.1194279856996425, + "grad_norm": 1.8474150896072388, + "learning_rate": 1.9977019233865272e-05, + "loss": 1.3675, + "step": 9554 + }, + { + "epoch": 0.11945298632465812, + "grad_norm": 0.08563531190156937, + "learning_rate": 1.9976960065988982e-05, + "loss": 0.0019, + "step": 9556 + }, + { + "epoch": 0.11947798694967374, + "grad_norm": 2.1254143714904785, + "learning_rate": 1.9976900822129574e-05, + "loss": 1.4193, + "step": 9558 + }, + { + "epoch": 0.11950298757468937, + "grad_norm": 6.011241912841797, + "learning_rate": 1.99768415022875e-05, + "loss": 0.6964, + "step": 9560 + }, + { + "epoch": 0.11952798819970499, + "grad_norm": 1.8282356262207031, + "learning_rate": 1.9976782106463214e-05, + "loss": 1.1897, + "step": 9562 + }, + { + "epoch": 0.11955298882472062, + "grad_norm": 0.0029259072616696358, + "learning_rate": 1.9976722634657167e-05, + "loss": 0.0001, + "step": 9564 + }, + { + "epoch": 0.11957798944973624, + "grad_norm": 1.3472154140472412, + "learning_rate": 1.9976663086869812e-05, + "loss": 0.4473, + "step": 9566 + }, + { + "epoch": 0.11960299007475186, + "grad_norm": 4.39417028427124, + "learning_rate": 1.99766034631016e-05, + "loss": 1.6526, + "step": 9568 + }, + { + "epoch": 0.1196279906997675, + "grad_norm": 4.080316066741943, + "learning_rate": 1.9976543763352985e-05, + "loss": 0.9911, + "step": 9570 + }, + { + "epoch": 0.11965299132478312, + "grad_norm": 4.363177299499512, + "learning_rate": 1.9976483987624426e-05, + "loss": 0.3769, + "step": 9572 + }, + { + "epoch": 0.11967799194979875, + "grad_norm": 2.1403048038482666, + "learning_rate": 1.9976424135916373e-05, + "loss": 0.4056, + "step": 9574 + }, + { + "epoch": 0.11970299257481437, + "grad_norm": 4.4617919921875, + "learning_rate": 1.9976364208229286e-05, + "loss": 0.8655, + "step": 9576 + }, + { + "epoch": 0.11972799319982999, + "grad_norm": 2.503819704055786, + "learning_rate": 1.9976304204563622e-05, + "loss": 0.7012, + "step": 9578 + }, + { + "epoch": 0.11975299382484562, + "grad_norm": 3.7417027950286865, + "learning_rate": 1.9976244124919834e-05, + "loss": 0.9881, + "step": 9580 + }, + { + "epoch": 0.11977799444986124, + "grad_norm": 3.7933788299560547, + "learning_rate": 1.9976183969298383e-05, + "loss": 1.3031, + "step": 9582 + }, + { + "epoch": 0.11980299507487688, + "grad_norm": 0.002726601669564843, + "learning_rate": 1.9976123737699723e-05, + "loss": 0.3283, + "step": 9584 + }, + { + "epoch": 0.1198279956998925, + "grad_norm": 3.72821044921875, + "learning_rate": 1.997606343012432e-05, + "loss": 0.5968, + "step": 9586 + }, + { + "epoch": 0.11985299632490812, + "grad_norm": 0.04044356197118759, + "learning_rate": 1.9976003046572624e-05, + "loss": 0.0007, + "step": 9588 + }, + { + "epoch": 0.11987799694992375, + "grad_norm": 6.331126689910889, + "learning_rate": 1.99759425870451e-05, + "loss": 0.9711, + "step": 9590 + }, + { + "epoch": 0.11990299757493937, + "grad_norm": 2.7950632572174072, + "learning_rate": 1.997588205154221e-05, + "loss": 0.8204, + "step": 9592 + }, + { + "epoch": 0.119927998199955, + "grad_norm": 1.630501627922058, + "learning_rate": 1.997582144006441e-05, + "loss": 0.3435, + "step": 9594 + }, + { + "epoch": 0.11995299882497062, + "grad_norm": 4.752024173736572, + "learning_rate": 1.997576075261217e-05, + "loss": 1.6473, + "step": 9596 + }, + { + "epoch": 0.11997799944998624, + "grad_norm": 0.6006292700767517, + "learning_rate": 1.9975699989185943e-05, + "loss": 0.6601, + "step": 9598 + }, + { + "epoch": 0.12000300007500188, + "grad_norm": 3.2553746700286865, + "learning_rate": 1.9975639149786195e-05, + "loss": 1.0977, + "step": 9600 + }, + { + "epoch": 0.1200280007000175, + "grad_norm": 5.688362121582031, + "learning_rate": 1.9975578234413392e-05, + "loss": 1.6842, + "step": 9602 + }, + { + "epoch": 0.12005300132503313, + "grad_norm": 4.872973918914795, + "learning_rate": 1.9975517243067995e-05, + "loss": 0.673, + "step": 9604 + }, + { + "epoch": 0.12007800195004875, + "grad_norm": 4.0238189697265625, + "learning_rate": 1.9975456175750468e-05, + "loss": 1.6428, + "step": 9606 + }, + { + "epoch": 0.12010300257506437, + "grad_norm": 0.004037375096231699, + "learning_rate": 1.997539503246128e-05, + "loss": 0.6706, + "step": 9608 + }, + { + "epoch": 0.12012800320008, + "grad_norm": 4.4913859367370605, + "learning_rate": 1.997533381320089e-05, + "loss": 1.8043, + "step": 9610 + }, + { + "epoch": 0.12015300382509563, + "grad_norm": 2.485861301422119, + "learning_rate": 1.9975272517969773e-05, + "loss": 0.7234, + "step": 9612 + }, + { + "epoch": 0.12017800445011126, + "grad_norm": 2.279430866241455, + "learning_rate": 1.997521114676839e-05, + "loss": 0.655, + "step": 9614 + }, + { + "epoch": 0.12020300507512688, + "grad_norm": 4.11374044418335, + "learning_rate": 1.997514969959721e-05, + "loss": 1.8683, + "step": 9616 + }, + { + "epoch": 0.1202280057001425, + "grad_norm": 8.039205551147461, + "learning_rate": 1.9975088176456697e-05, + "loss": 0.3637, + "step": 9618 + }, + { + "epoch": 0.12025300632515813, + "grad_norm": 2.9345011711120605, + "learning_rate": 1.9975026577347325e-05, + "loss": 0.9206, + "step": 9620 + }, + { + "epoch": 0.12027800695017375, + "grad_norm": 3.796212673187256, + "learning_rate": 1.997496490226956e-05, + "loss": 1.1303, + "step": 9622 + }, + { + "epoch": 0.12030300757518939, + "grad_norm": 4.440600872039795, + "learning_rate": 1.9974903151223874e-05, + "loss": 0.2962, + "step": 9624 + }, + { + "epoch": 0.120328008200205, + "grad_norm": 4.218809604644775, + "learning_rate": 1.997484132421074e-05, + "loss": 1.0024, + "step": 9626 + }, + { + "epoch": 0.12035300882522063, + "grad_norm": 0.5785406231880188, + "learning_rate": 1.997477942123062e-05, + "loss": 0.2282, + "step": 9628 + }, + { + "epoch": 0.12037800945023626, + "grad_norm": 0.004147273022681475, + "learning_rate": 1.997471744228399e-05, + "loss": 0.7891, + "step": 9630 + }, + { + "epoch": 0.12040301007525188, + "grad_norm": 21.67119026184082, + "learning_rate": 1.9974655387371325e-05, + "loss": 2.3866, + "step": 9632 + }, + { + "epoch": 0.12042801070026751, + "grad_norm": 4.343683242797852, + "learning_rate": 1.9974593256493097e-05, + "loss": 0.2874, + "step": 9634 + }, + { + "epoch": 0.12045301132528313, + "grad_norm": 3.1207144260406494, + "learning_rate": 1.9974531049649776e-05, + "loss": 0.8704, + "step": 9636 + }, + { + "epoch": 0.12047801195029875, + "grad_norm": 9.16373348236084, + "learning_rate": 1.9974468766841833e-05, + "loss": 1.3413, + "step": 9638 + }, + { + "epoch": 0.12050301257531439, + "grad_norm": 2.798283338546753, + "learning_rate": 1.997440640806975e-05, + "loss": 0.6977, + "step": 9640 + }, + { + "epoch": 0.12052801320033, + "grad_norm": 5.1001105308532715, + "learning_rate": 1.9974343973334e-05, + "loss": 2.7841, + "step": 9642 + }, + { + "epoch": 0.12055301382534564, + "grad_norm": 4.298389434814453, + "learning_rate": 1.997428146263505e-05, + "loss": 1.5971, + "step": 9644 + }, + { + "epoch": 0.12057801445036126, + "grad_norm": 2.27646541595459, + "learning_rate": 1.9974218875973392e-05, + "loss": 1.5342, + "step": 9646 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 0.04860330745577812, + "learning_rate": 1.997415621334949e-05, + "loss": 0.0007, + "step": 9648 + }, + { + "epoch": 0.12062801570039251, + "grad_norm": 2.583371877670288, + "learning_rate": 1.997409347476382e-05, + "loss": 1.3608, + "step": 9650 + }, + { + "epoch": 0.12065301632540813, + "grad_norm": 2.9410738945007324, + "learning_rate": 1.9974030660216867e-05, + "loss": 0.4383, + "step": 9652 + }, + { + "epoch": 0.12067801695042377, + "grad_norm": 4.348761081695557, + "learning_rate": 1.997396776970911e-05, + "loss": 0.4795, + "step": 9654 + }, + { + "epoch": 0.12070301757543939, + "grad_norm": 0.006683460436761379, + "learning_rate": 1.997390480324102e-05, + "loss": 0.0001, + "step": 9656 + }, + { + "epoch": 0.120728018200455, + "grad_norm": 2.7585959434509277, + "learning_rate": 1.9973841760813085e-05, + "loss": 0.7003, + "step": 9658 + }, + { + "epoch": 0.12075301882547064, + "grad_norm": 2.9382483959198, + "learning_rate": 1.9973778642425784e-05, + "loss": 1.521, + "step": 9660 + }, + { + "epoch": 0.12077801945048626, + "grad_norm": 4.728212356567383, + "learning_rate": 1.997371544807959e-05, + "loss": 1.3755, + "step": 9662 + }, + { + "epoch": 0.1208030200755019, + "grad_norm": 0.006015690043568611, + "learning_rate": 1.9973652177774997e-05, + "loss": 0.7829, + "step": 9664 + }, + { + "epoch": 0.12082802070051751, + "grad_norm": 2.2675516605377197, + "learning_rate": 1.9973588831512473e-05, + "loss": 1.0654, + "step": 9666 + }, + { + "epoch": 0.12085302132553313, + "grad_norm": 1.8330339193344116, + "learning_rate": 1.9973525409292506e-05, + "loss": 0.2638, + "step": 9668 + }, + { + "epoch": 0.12087802195054877, + "grad_norm": 3.9074501991271973, + "learning_rate": 1.9973461911115585e-05, + "loss": 2.0216, + "step": 9670 + }, + { + "epoch": 0.12090302257556439, + "grad_norm": 3.8631043434143066, + "learning_rate": 1.9973398336982184e-05, + "loss": 0.8406, + "step": 9672 + }, + { + "epoch": 0.12092802320058002, + "grad_norm": 0.006133950315415859, + "learning_rate": 1.9973334686892794e-05, + "loss": 0.931, + "step": 9674 + }, + { + "epoch": 0.12095302382559564, + "grad_norm": 5.760914325714111, + "learning_rate": 1.9973270960847896e-05, + "loss": 0.3237, + "step": 9676 + }, + { + "epoch": 0.12097802445061126, + "grad_norm": 2.332244396209717, + "learning_rate": 1.9973207158847974e-05, + "loss": 0.363, + "step": 9678 + }, + { + "epoch": 0.1210030250756269, + "grad_norm": 3.2770345211029053, + "learning_rate": 1.997314328089352e-05, + "loss": 1.5945, + "step": 9680 + }, + { + "epoch": 0.12102802570064251, + "grad_norm": 9.274054527282715, + "learning_rate": 1.9973079326985017e-05, + "loss": 2.0368, + "step": 9682 + }, + { + "epoch": 0.12105302632565815, + "grad_norm": 4.931266784667969, + "learning_rate": 1.9973015297122954e-05, + "loss": 1.268, + "step": 9684 + }, + { + "epoch": 0.12107802695067377, + "grad_norm": 3.639538049697876, + "learning_rate": 1.9972951191307818e-05, + "loss": 1.3431, + "step": 9686 + }, + { + "epoch": 0.12110302757568939, + "grad_norm": 3.414260149002075, + "learning_rate": 1.997288700954009e-05, + "loss": 0.9697, + "step": 9688 + }, + { + "epoch": 0.12112802820070502, + "grad_norm": 0.008034078404307365, + "learning_rate": 1.9972822751820267e-05, + "loss": 0.4831, + "step": 9690 + }, + { + "epoch": 0.12115302882572064, + "grad_norm": 1.903446078300476, + "learning_rate": 1.9972758418148837e-05, + "loss": 0.0848, + "step": 9692 + }, + { + "epoch": 0.12117802945073627, + "grad_norm": 5.421220302581787, + "learning_rate": 1.997269400852629e-05, + "loss": 0.2716, + "step": 9694 + }, + { + "epoch": 0.1212030300757519, + "grad_norm": 4.512765407562256, + "learning_rate": 1.9972629522953116e-05, + "loss": 0.2524, + "step": 9696 + }, + { + "epoch": 0.12122803070076751, + "grad_norm": 4.346960067749023, + "learning_rate": 1.9972564961429803e-05, + "loss": 0.8426, + "step": 9698 + }, + { + "epoch": 0.12125303132578315, + "grad_norm": 3.9696121215820312, + "learning_rate": 1.9972500323956847e-05, + "loss": 1.312, + "step": 9700 + }, + { + "epoch": 0.12127803195079877, + "grad_norm": 0.050937898457050323, + "learning_rate": 1.9972435610534743e-05, + "loss": 0.913, + "step": 9702 + }, + { + "epoch": 0.1213030325758144, + "grad_norm": 3.6251754760742188, + "learning_rate": 1.9972370821163975e-05, + "loss": 0.7593, + "step": 9704 + }, + { + "epoch": 0.12132803320083002, + "grad_norm": 3.3959643840789795, + "learning_rate": 1.997230595584504e-05, + "loss": 0.6583, + "step": 9706 + }, + { + "epoch": 0.12135303382584564, + "grad_norm": 0.9117713570594788, + "learning_rate": 1.9972241014578434e-05, + "loss": 0.4182, + "step": 9708 + }, + { + "epoch": 0.12137803445086127, + "grad_norm": 2.813896417617798, + "learning_rate": 1.9972175997364653e-05, + "loss": 1.0567, + "step": 9710 + }, + { + "epoch": 0.1214030350758769, + "grad_norm": 2.680732011795044, + "learning_rate": 1.9972110904204187e-05, + "loss": 0.7839, + "step": 9712 + }, + { + "epoch": 0.12142803570089253, + "grad_norm": 0.005541524849832058, + "learning_rate": 1.9972045735097537e-05, + "loss": 0.6321, + "step": 9714 + }, + { + "epoch": 0.12145303632590815, + "grad_norm": 4.140967845916748, + "learning_rate": 1.9971980490045192e-05, + "loss": 1.1133, + "step": 9716 + }, + { + "epoch": 0.12147803695092377, + "grad_norm": 0.028493864461779594, + "learning_rate": 1.997191516904766e-05, + "loss": 0.0004, + "step": 9718 + }, + { + "epoch": 0.1215030375759394, + "grad_norm": 1.207177996635437, + "learning_rate": 1.997184977210543e-05, + "loss": 0.6568, + "step": 9720 + }, + { + "epoch": 0.12152803820095502, + "grad_norm": 1.3630037307739258, + "learning_rate": 1.9971784299219e-05, + "loss": 0.2497, + "step": 9722 + }, + { + "epoch": 0.12155303882597066, + "grad_norm": 2.2974743843078613, + "learning_rate": 1.9971718750388873e-05, + "loss": 0.8004, + "step": 9724 + }, + { + "epoch": 0.12157803945098627, + "grad_norm": 2.5006706714630127, + "learning_rate": 1.9971653125615547e-05, + "loss": 0.6358, + "step": 9726 + }, + { + "epoch": 0.1216030400760019, + "grad_norm": 2.6796488761901855, + "learning_rate": 1.9971587424899517e-05, + "loss": 0.7881, + "step": 9728 + }, + { + "epoch": 0.12162804070101753, + "grad_norm": 7.847442626953125, + "learning_rate": 1.997152164824129e-05, + "loss": 1.3083, + "step": 9730 + }, + { + "epoch": 0.12165304132603315, + "grad_norm": 0.836753249168396, + "learning_rate": 1.9971455795641365e-05, + "loss": 0.2036, + "step": 9732 + }, + { + "epoch": 0.12167804195104878, + "grad_norm": 3.5476467609405518, + "learning_rate": 1.9971389867100245e-05, + "loss": 1.4494, + "step": 9734 + }, + { + "epoch": 0.1217030425760644, + "grad_norm": 2.3143434524536133, + "learning_rate": 1.9971323862618427e-05, + "loss": 0.5101, + "step": 9736 + }, + { + "epoch": 0.12172804320108002, + "grad_norm": 6.362306594848633, + "learning_rate": 1.9971257782196413e-05, + "loss": 0.3691, + "step": 9738 + }, + { + "epoch": 0.12175304382609566, + "grad_norm": 0.10376796126365662, + "learning_rate": 1.9971191625834713e-05, + "loss": 0.0053, + "step": 9740 + }, + { + "epoch": 0.12177804445111128, + "grad_norm": 6.9299116134643555, + "learning_rate": 1.997112539353383e-05, + "loss": 1.5262, + "step": 9742 + }, + { + "epoch": 0.12180304507612691, + "grad_norm": 2.124271869659424, + "learning_rate": 1.9971059085294263e-05, + "loss": 1.276, + "step": 9744 + }, + { + "epoch": 0.12182804570114253, + "grad_norm": 3.9885241985321045, + "learning_rate": 1.9970992701116523e-05, + "loss": 1.4828, + "step": 9746 + }, + { + "epoch": 0.12185304632615815, + "grad_norm": 2.7882747650146484, + "learning_rate": 1.997092624100111e-05, + "loss": 0.6532, + "step": 9748 + }, + { + "epoch": 0.12187804695117378, + "grad_norm": 8.954148292541504, + "learning_rate": 1.9970859704948534e-05, + "loss": 1.1742, + "step": 9750 + }, + { + "epoch": 0.1219030475761894, + "grad_norm": 6.095761299133301, + "learning_rate": 1.9970793092959297e-05, + "loss": 1.9514, + "step": 9752 + }, + { + "epoch": 0.12192804820120504, + "grad_norm": 3.963679313659668, + "learning_rate": 1.9970726405033913e-05, + "loss": 0.8635, + "step": 9754 + }, + { + "epoch": 0.12195304882622066, + "grad_norm": 3.363199472427368, + "learning_rate": 1.997065964117289e-05, + "loss": 1.5144, + "step": 9756 + }, + { + "epoch": 0.12197804945123628, + "grad_norm": 0.17249582707881927, + "learning_rate": 1.9970592801376726e-05, + "loss": 0.6747, + "step": 9758 + }, + { + "epoch": 0.12200305007625191, + "grad_norm": 3.590219497680664, + "learning_rate": 1.997052588564594e-05, + "loss": 1.6085, + "step": 9760 + }, + { + "epoch": 0.12202805070126753, + "grad_norm": 6.649292469024658, + "learning_rate": 1.9970458893981042e-05, + "loss": 0.7402, + "step": 9762 + }, + { + "epoch": 0.12205305132628316, + "grad_norm": 1.990606665611267, + "learning_rate": 1.9970391826382535e-05, + "loss": 0.5103, + "step": 9764 + }, + { + "epoch": 0.12207805195129878, + "grad_norm": 1.7369245290756226, + "learning_rate": 1.9970324682850935e-05, + "loss": 0.4628, + "step": 9766 + }, + { + "epoch": 0.1221030525763144, + "grad_norm": 3.0494415760040283, + "learning_rate": 1.997025746338675e-05, + "loss": 0.9998, + "step": 9768 + }, + { + "epoch": 0.12212805320133004, + "grad_norm": 5.412056922912598, + "learning_rate": 1.99701901679905e-05, + "loss": 0.3697, + "step": 9770 + }, + { + "epoch": 0.12215305382634566, + "grad_norm": 4.335453987121582, + "learning_rate": 1.9970122796662687e-05, + "loss": 0.9451, + "step": 9772 + }, + { + "epoch": 0.12217805445136129, + "grad_norm": 3.3763296604156494, + "learning_rate": 1.997005534940383e-05, + "loss": 2.3015, + "step": 9774 + }, + { + "epoch": 0.12220305507637691, + "grad_norm": 2.3668224811553955, + "learning_rate": 1.996998782621444e-05, + "loss": 1.0914, + "step": 9776 + }, + { + "epoch": 0.12222805570139253, + "grad_norm": 2.767259359359741, + "learning_rate": 1.9969920227095034e-05, + "loss": 1.6499, + "step": 9778 + }, + { + "epoch": 0.12225305632640816, + "grad_norm": 2.1180081367492676, + "learning_rate": 1.9969852552046126e-05, + "loss": 0.3377, + "step": 9780 + }, + { + "epoch": 0.12227805695142378, + "grad_norm": 4.542524337768555, + "learning_rate": 1.996978480106823e-05, + "loss": 0.8169, + "step": 9782 + }, + { + "epoch": 0.12230305757643942, + "grad_norm": 3.542971611022949, + "learning_rate": 1.9969716974161862e-05, + "loss": 0.6659, + "step": 9784 + }, + { + "epoch": 0.12232805820145504, + "grad_norm": 0.44535353779792786, + "learning_rate": 1.996964907132754e-05, + "loss": 0.4545, + "step": 9786 + }, + { + "epoch": 0.12235305882647066, + "grad_norm": 2.3851966857910156, + "learning_rate": 1.996958109256578e-05, + "loss": 0.0651, + "step": 9788 + }, + { + "epoch": 0.12237805945148629, + "grad_norm": 0.014852515421807766, + "learning_rate": 1.9969513037877102e-05, + "loss": 0.3161, + "step": 9790 + }, + { + "epoch": 0.12240306007650191, + "grad_norm": 6.157809257507324, + "learning_rate": 1.996944490726202e-05, + "loss": 0.858, + "step": 9792 + }, + { + "epoch": 0.12242806070151754, + "grad_norm": 0.7968582510948181, + "learning_rate": 1.996937670072106e-05, + "loss": 0.9015, + "step": 9794 + }, + { + "epoch": 0.12245306132653316, + "grad_norm": 3.2523818016052246, + "learning_rate": 1.9969308418254733e-05, + "loss": 0.7721, + "step": 9796 + }, + { + "epoch": 0.12247806195154878, + "grad_norm": 5.6371989250183105, + "learning_rate": 1.9969240059863564e-05, + "loss": 1.2785, + "step": 9798 + }, + { + "epoch": 0.12250306257656442, + "grad_norm": 6.1843695640563965, + "learning_rate": 1.996917162554807e-05, + "loss": 1.9739, + "step": 9800 + }, + { + "epoch": 0.12252806320158004, + "grad_norm": 0.017400609329342842, + "learning_rate": 1.996910311530878e-05, + "loss": 0.9191, + "step": 9802 + }, + { + "epoch": 0.12255306382659567, + "grad_norm": 6.124422073364258, + "learning_rate": 1.9969034529146203e-05, + "loss": 0.5599, + "step": 9804 + }, + { + "epoch": 0.12257806445161129, + "grad_norm": 0.008154640905559063, + "learning_rate": 1.9968965867060874e-05, + "loss": 0.0945, + "step": 9806 + }, + { + "epoch": 0.12260306507662691, + "grad_norm": 0.09488064050674438, + "learning_rate": 1.996889712905331e-05, + "loss": 0.0702, + "step": 9808 + }, + { + "epoch": 0.12262806570164254, + "grad_norm": 4.408646583557129, + "learning_rate": 1.9968828315124033e-05, + "loss": 0.9456, + "step": 9810 + }, + { + "epoch": 0.12265306632665816, + "grad_norm": 1.0324009656906128, + "learning_rate": 1.9968759425273575e-05, + "loss": 0.9651, + "step": 9812 + }, + { + "epoch": 0.1226780669516738, + "grad_norm": 4.359233856201172, + "learning_rate": 1.9968690459502448e-05, + "loss": 1.8688, + "step": 9814 + }, + { + "epoch": 0.12270306757668942, + "grad_norm": 3.5653557777404785, + "learning_rate": 1.9968621417811186e-05, + "loss": 1.2975, + "step": 9816 + }, + { + "epoch": 0.12272806820170504, + "grad_norm": 17.740711212158203, + "learning_rate": 1.9968552300200317e-05, + "loss": 0.4255, + "step": 9818 + }, + { + "epoch": 0.12275306882672067, + "grad_norm": 2.4168920516967773, + "learning_rate": 1.9968483106670356e-05, + "loss": 1.0143, + "step": 9820 + }, + { + "epoch": 0.12277806945173629, + "grad_norm": 1.7490949630737305, + "learning_rate": 1.9968413837221845e-05, + "loss": 0.0246, + "step": 9822 + }, + { + "epoch": 0.12280307007675192, + "grad_norm": 3.7772040367126465, + "learning_rate": 1.9968344491855302e-05, + "loss": 0.4389, + "step": 9824 + }, + { + "epoch": 0.12282807070176754, + "grad_norm": 2.775487184524536, + "learning_rate": 1.9968275070571256e-05, + "loss": 1.0302, + "step": 9826 + }, + { + "epoch": 0.12285307132678316, + "grad_norm": 2.5215182304382324, + "learning_rate": 1.9968205573370232e-05, + "loss": 0.8288, + "step": 9828 + }, + { + "epoch": 0.1228780719517988, + "grad_norm": 6.044615268707275, + "learning_rate": 1.9968136000252773e-05, + "loss": 0.9187, + "step": 9830 + }, + { + "epoch": 0.12290307257681442, + "grad_norm": 0.004552275408059359, + "learning_rate": 1.9968066351219392e-05, + "loss": 0.2245, + "step": 9832 + }, + { + "epoch": 0.12292807320183005, + "grad_norm": 5.4448041915893555, + "learning_rate": 1.996799662627063e-05, + "loss": 0.6578, + "step": 9834 + }, + { + "epoch": 0.12295307382684567, + "grad_norm": 0.042286477982997894, + "learning_rate": 1.9967926825407017e-05, + "loss": 0.7045, + "step": 9836 + }, + { + "epoch": 0.12297807445186129, + "grad_norm": 10.950261116027832, + "learning_rate": 1.9967856948629083e-05, + "loss": 0.399, + "step": 9838 + }, + { + "epoch": 0.12300307507687692, + "grad_norm": 2.2116315364837646, + "learning_rate": 1.9967786995937355e-05, + "loss": 0.4304, + "step": 9840 + }, + { + "epoch": 0.12302807570189254, + "grad_norm": 2.5271706581115723, + "learning_rate": 1.9967716967332377e-05, + "loss": 0.7027, + "step": 9842 + }, + { + "epoch": 0.12305307632690818, + "grad_norm": 2.2857964038848877, + "learning_rate": 1.9967646862814673e-05, + "loss": 0.6773, + "step": 9844 + }, + { + "epoch": 0.1230780769519238, + "grad_norm": 6.646721839904785, + "learning_rate": 1.996757668238478e-05, + "loss": 1.462, + "step": 9846 + }, + { + "epoch": 0.12310307757693942, + "grad_norm": 2.850986957550049, + "learning_rate": 1.9967506426043235e-05, + "loss": 1.3927, + "step": 9848 + }, + { + "epoch": 0.12312807820195505, + "grad_norm": 3.103381633758545, + "learning_rate": 1.996743609379057e-05, + "loss": 0.5819, + "step": 9850 + }, + { + "epoch": 0.12315307882697067, + "grad_norm": 6.691488742828369, + "learning_rate": 1.996736568562732e-05, + "loss": 1.7228, + "step": 9852 + }, + { + "epoch": 0.1231780794519863, + "grad_norm": 6.11289119720459, + "learning_rate": 1.996729520155402e-05, + "loss": 0.6964, + "step": 9854 + }, + { + "epoch": 0.12320308007700193, + "grad_norm": 2.617405414581299, + "learning_rate": 1.9967224641571214e-05, + "loss": 1.0356, + "step": 9856 + }, + { + "epoch": 0.12322808070201755, + "grad_norm": 3.0337331295013428, + "learning_rate": 1.996715400567943e-05, + "loss": 1.4397, + "step": 9858 + }, + { + "epoch": 0.12325308132703318, + "grad_norm": 0.21227402985095978, + "learning_rate": 1.9967083293879213e-05, + "loss": 0.3143, + "step": 9860 + }, + { + "epoch": 0.1232780819520488, + "grad_norm": 4.134282112121582, + "learning_rate": 1.99670125061711e-05, + "loss": 0.8292, + "step": 9862 + }, + { + "epoch": 0.12330308257706443, + "grad_norm": 3.9248831272125244, + "learning_rate": 1.996694164255563e-05, + "loss": 1.1638, + "step": 9864 + }, + { + "epoch": 0.12332808320208005, + "grad_norm": 2.1378586292266846, + "learning_rate": 1.996687070303334e-05, + "loss": 0.7406, + "step": 9866 + }, + { + "epoch": 0.12335308382709567, + "grad_norm": 2.5743796825408936, + "learning_rate": 1.9966799687604772e-05, + "loss": 1.321, + "step": 9868 + }, + { + "epoch": 0.1233780844521113, + "grad_norm": 1.94943106174469, + "learning_rate": 1.996672859627047e-05, + "loss": 1.4295, + "step": 9870 + }, + { + "epoch": 0.12340308507712693, + "grad_norm": 4.778283596038818, + "learning_rate": 1.9966657429030965e-05, + "loss": 1.3274, + "step": 9872 + }, + { + "epoch": 0.12342808570214256, + "grad_norm": 3.7679014205932617, + "learning_rate": 1.9966586185886813e-05, + "loss": 1.1174, + "step": 9874 + }, + { + "epoch": 0.12345308632715818, + "grad_norm": 3.09955096244812, + "learning_rate": 1.9966514866838547e-05, + "loss": 1.3121, + "step": 9876 + }, + { + "epoch": 0.1234780869521738, + "grad_norm": 2.4219188690185547, + "learning_rate": 1.9966443471886717e-05, + "loss": 1.1912, + "step": 9878 + }, + { + "epoch": 0.12350308757718943, + "grad_norm": 9.270381927490234, + "learning_rate": 1.9966372001031858e-05, + "loss": 0.6086, + "step": 9880 + }, + { + "epoch": 0.12352808820220505, + "grad_norm": 0.0036700398195534945, + "learning_rate": 1.9966300454274522e-05, + "loss": 0.0001, + "step": 9882 + }, + { + "epoch": 0.12355308882722069, + "grad_norm": 0.22931736707687378, + "learning_rate": 1.9966228831615252e-05, + "loss": 0.0359, + "step": 9884 + }, + { + "epoch": 0.1235780894522363, + "grad_norm": 7.0567307472229, + "learning_rate": 1.9966157133054593e-05, + "loss": 1.9295, + "step": 9886 + }, + { + "epoch": 0.12360309007725193, + "grad_norm": 3.2701618671417236, + "learning_rate": 1.9966085358593086e-05, + "loss": 1.4508, + "step": 9888 + }, + { + "epoch": 0.12362809070226756, + "grad_norm": 0.0008383509120903909, + "learning_rate": 1.9966013508231285e-05, + "loss": 0.0176, + "step": 9890 + }, + { + "epoch": 0.12365309132728318, + "grad_norm": 0.2293071299791336, + "learning_rate": 1.9965941581969735e-05, + "loss": 0.0221, + "step": 9892 + }, + { + "epoch": 0.12367809195229881, + "grad_norm": 5.027838230133057, + "learning_rate": 1.996586957980898e-05, + "loss": 0.5592, + "step": 9894 + }, + { + "epoch": 0.12370309257731443, + "grad_norm": 2.2214643955230713, + "learning_rate": 1.9965797501749575e-05, + "loss": 0.1653, + "step": 9896 + }, + { + "epoch": 0.12372809320233005, + "grad_norm": 3.35888671875, + "learning_rate": 1.9965725347792066e-05, + "loss": 0.6331, + "step": 9898 + }, + { + "epoch": 0.12375309382734569, + "grad_norm": 2.9968574047088623, + "learning_rate": 1.9965653117937e-05, + "loss": 0.56, + "step": 9900 + }, + { + "epoch": 0.1237780944523613, + "grad_norm": 5.502317905426025, + "learning_rate": 1.9965580812184934e-05, + "loss": 0.7903, + "step": 9902 + }, + { + "epoch": 0.12380309507737694, + "grad_norm": 3.1487536430358887, + "learning_rate": 1.9965508430536407e-05, + "loss": 1.4258, + "step": 9904 + }, + { + "epoch": 0.12382809570239256, + "grad_norm": 3.6186070442199707, + "learning_rate": 1.996543597299198e-05, + "loss": 0.8398, + "step": 9906 + }, + { + "epoch": 0.12385309632740818, + "grad_norm": 3.872407913208008, + "learning_rate": 1.9965363439552202e-05, + "loss": 1.4322, + "step": 9908 + }, + { + "epoch": 0.12387809695242381, + "grad_norm": 3.7332489490509033, + "learning_rate": 1.996529083021762e-05, + "loss": 1.3845, + "step": 9910 + }, + { + "epoch": 0.12390309757743943, + "grad_norm": 3.2253780364990234, + "learning_rate": 1.99652181449888e-05, + "loss": 0.9781, + "step": 9912 + }, + { + "epoch": 0.12392809820245507, + "grad_norm": 4.342178821563721, + "learning_rate": 1.9965145383866288e-05, + "loss": 1.0023, + "step": 9914 + }, + { + "epoch": 0.12395309882747069, + "grad_norm": 1.402522325515747, + "learning_rate": 1.9965072546850633e-05, + "loss": 0.4648, + "step": 9916 + }, + { + "epoch": 0.1239780994524863, + "grad_norm": 3.3537189960479736, + "learning_rate": 1.9964999633942395e-05, + "loss": 0.8456, + "step": 9918 + }, + { + "epoch": 0.12400310007750194, + "grad_norm": 1.9270226955413818, + "learning_rate": 1.9964926645142133e-05, + "loss": 0.7867, + "step": 9920 + }, + { + "epoch": 0.12402810070251756, + "grad_norm": 4.779174327850342, + "learning_rate": 1.9964853580450395e-05, + "loss": 0.8565, + "step": 9922 + }, + { + "epoch": 0.1240531013275332, + "grad_norm": 28.540328979492188, + "learning_rate": 1.9964780439867744e-05, + "loss": 0.9697, + "step": 9924 + }, + { + "epoch": 0.12407810195254881, + "grad_norm": 3.2524349689483643, + "learning_rate": 1.9964707223394733e-05, + "loss": 0.9147, + "step": 9926 + }, + { + "epoch": 0.12410310257756443, + "grad_norm": 4.249830722808838, + "learning_rate": 1.9964633931031923e-05, + "loss": 1.2409, + "step": 9928 + }, + { + "epoch": 0.12412810320258007, + "grad_norm": 4.326256275177002, + "learning_rate": 1.9964560562779867e-05, + "loss": 0.3997, + "step": 9930 + }, + { + "epoch": 0.12415310382759569, + "grad_norm": 4.123647689819336, + "learning_rate": 1.9964487118639127e-05, + "loss": 0.9938, + "step": 9932 + }, + { + "epoch": 0.12417810445261132, + "grad_norm": 3.9203200340270996, + "learning_rate": 1.9964413598610263e-05, + "loss": 0.7068, + "step": 9934 + }, + { + "epoch": 0.12420310507762694, + "grad_norm": 3.954331874847412, + "learning_rate": 1.9964340002693837e-05, + "loss": 0.5129, + "step": 9936 + }, + { + "epoch": 0.12422810570264256, + "grad_norm": 5.106273174285889, + "learning_rate": 1.99642663308904e-05, + "loss": 1.3523, + "step": 9938 + }, + { + "epoch": 0.1242531063276582, + "grad_norm": 7.0933661460876465, + "learning_rate": 1.9964192583200527e-05, + "loss": 0.8513, + "step": 9940 + }, + { + "epoch": 0.12427810695267381, + "grad_norm": 1.1826133728027344, + "learning_rate": 1.9964118759624768e-05, + "loss": 0.0376, + "step": 9942 + }, + { + "epoch": 0.12430310757768945, + "grad_norm": 3.435314655303955, + "learning_rate": 1.996404486016369e-05, + "loss": 0.7225, + "step": 9944 + }, + { + "epoch": 0.12432810820270507, + "grad_norm": 3.210486650466919, + "learning_rate": 1.9963970884817856e-05, + "loss": 1.4305, + "step": 9946 + }, + { + "epoch": 0.12435310882772069, + "grad_norm": 5.972083568572998, + "learning_rate": 1.9963896833587827e-05, + "loss": 0.7699, + "step": 9948 + }, + { + "epoch": 0.12437810945273632, + "grad_norm": 3.9567182064056396, + "learning_rate": 1.9963822706474173e-05, + "loss": 1.1592, + "step": 9950 + }, + { + "epoch": 0.12440311007775194, + "grad_norm": 1.60283625125885, + "learning_rate": 1.9963748503477452e-05, + "loss": 0.2838, + "step": 9952 + }, + { + "epoch": 0.12442811070276757, + "grad_norm": 3.3641176223754883, + "learning_rate": 1.9963674224598232e-05, + "loss": 1.0637, + "step": 9954 + }, + { + "epoch": 0.1244531113277832, + "grad_norm": 0.9398790001869202, + "learning_rate": 1.9963599869837074e-05, + "loss": 0.3107, + "step": 9956 + }, + { + "epoch": 0.12447811195279881, + "grad_norm": 4.991335391998291, + "learning_rate": 1.9963525439194553e-05, + "loss": 1.5118, + "step": 9958 + }, + { + "epoch": 0.12450311257781445, + "grad_norm": 0.006810194347053766, + "learning_rate": 1.996345093267123e-05, + "loss": 1.3468, + "step": 9960 + }, + { + "epoch": 0.12452811320283007, + "grad_norm": 4.094231128692627, + "learning_rate": 1.9963376350267674e-05, + "loss": 0.9092, + "step": 9962 + }, + { + "epoch": 0.1245531138278457, + "grad_norm": 4.856958866119385, + "learning_rate": 1.9963301691984452e-05, + "loss": 1.1334, + "step": 9964 + }, + { + "epoch": 0.12457811445286132, + "grad_norm": 4.005449295043945, + "learning_rate": 1.9963226957822133e-05, + "loss": 0.9592, + "step": 9966 + }, + { + "epoch": 0.12460311507787694, + "grad_norm": 2.1348087787628174, + "learning_rate": 1.9963152147781287e-05, + "loss": 0.5458, + "step": 9968 + }, + { + "epoch": 0.12462811570289258, + "grad_norm": 1.730566143989563, + "learning_rate": 1.996307726186248e-05, + "loss": 0.7036, + "step": 9970 + }, + { + "epoch": 0.1246531163279082, + "grad_norm": 4.8900275230407715, + "learning_rate": 1.996300230006629e-05, + "loss": 1.6225, + "step": 9972 + }, + { + "epoch": 0.12467811695292383, + "grad_norm": 0.6601751446723938, + "learning_rate": 1.9962927262393282e-05, + "loss": 0.7148, + "step": 9974 + }, + { + "epoch": 0.12470311757793945, + "grad_norm": 2.3486738204956055, + "learning_rate": 1.9962852148844028e-05, + "loss": 0.6469, + "step": 9976 + }, + { + "epoch": 0.12472811820295507, + "grad_norm": 3.223385810852051, + "learning_rate": 1.9962776959419104e-05, + "loss": 0.9128, + "step": 9978 + }, + { + "epoch": 0.1247531188279707, + "grad_norm": 3.886068105697632, + "learning_rate": 1.996270169411908e-05, + "loss": 1.088, + "step": 9980 + }, + { + "epoch": 0.12477811945298632, + "grad_norm": 1.7536805868148804, + "learning_rate": 1.996262635294452e-05, + "loss": 0.0614, + "step": 9982 + }, + { + "epoch": 0.12480312007800196, + "grad_norm": 2.830183744430542, + "learning_rate": 1.9962550935896015e-05, + "loss": 1.5654, + "step": 9984 + }, + { + "epoch": 0.12482812070301758, + "grad_norm": 1.5416284799575806, + "learning_rate": 1.996247544297413e-05, + "loss": 0.5777, + "step": 9986 + }, + { + "epoch": 0.1248531213280332, + "grad_norm": 0.006594714242964983, + "learning_rate": 1.9962399874179436e-05, + "loss": 0.3366, + "step": 9988 + }, + { + "epoch": 0.12487812195304883, + "grad_norm": 3.5348877906799316, + "learning_rate": 1.9962324229512518e-05, + "loss": 1.1121, + "step": 9990 + }, + { + "epoch": 0.12490312257806445, + "grad_norm": 1.0679856538772583, + "learning_rate": 1.9962248508973948e-05, + "loss": 0.0328, + "step": 9992 + }, + { + "epoch": 0.12492812320308008, + "grad_norm": 4.447800159454346, + "learning_rate": 1.99621727125643e-05, + "loss": 1.2889, + "step": 9994 + }, + { + "epoch": 0.1249531238280957, + "grad_norm": 0.6015301942825317, + "learning_rate": 1.9962096840284154e-05, + "loss": 1.0884, + "step": 9996 + }, + { + "epoch": 0.12497812445311132, + "grad_norm": 2.6043429374694824, + "learning_rate": 1.9962020892134085e-05, + "loss": 0.4022, + "step": 9998 + }, + { + "epoch": 0.12500312507812694, + "grad_norm": 0.006536934059113264, + "learning_rate": 1.9961944868114672e-05, + "loss": 0.9407, + "step": 10000 + }, + { + "epoch": 0.1250281257031426, + "grad_norm": 6.8967604637146, + "learning_rate": 1.99618687682265e-05, + "loss": 2.4324, + "step": 10002 + }, + { + "epoch": 0.1250531263281582, + "grad_norm": 2.6695477962493896, + "learning_rate": 1.9961792592470145e-05, + "loss": 0.8085, + "step": 10004 + }, + { + "epoch": 0.12507812695317383, + "grad_norm": 1.895548701286316, + "learning_rate": 1.9961716340846186e-05, + "loss": 0.3897, + "step": 10006 + }, + { + "epoch": 0.12510312757818945, + "grad_norm": 4.310908794403076, + "learning_rate": 1.99616400133552e-05, + "loss": 1.5505, + "step": 10008 + }, + { + "epoch": 0.12512812820320507, + "grad_norm": 2.8418869972229004, + "learning_rate": 1.9961563609997778e-05, + "loss": 0.4016, + "step": 10010 + }, + { + "epoch": 0.12515312882822072, + "grad_norm": 3.163670301437378, + "learning_rate": 1.996148713077449e-05, + "loss": 0.7542, + "step": 10012 + }, + { + "epoch": 0.12517812945323634, + "grad_norm": 3.1791155338287354, + "learning_rate": 1.9961410575685928e-05, + "loss": 1.3297, + "step": 10014 + }, + { + "epoch": 0.12520313007825196, + "grad_norm": 2.64896297454834, + "learning_rate": 1.9961333944732673e-05, + "loss": 1.2715, + "step": 10016 + }, + { + "epoch": 0.12522813070326758, + "grad_norm": 0.0052061849273741245, + "learning_rate": 1.9961257237915306e-05, + "loss": 1.1252, + "step": 10018 + }, + { + "epoch": 0.1252531313282832, + "grad_norm": 3.6230337619781494, + "learning_rate": 1.996118045523441e-05, + "loss": 1.1751, + "step": 10020 + }, + { + "epoch": 0.12527813195329884, + "grad_norm": 6.6673431396484375, + "learning_rate": 1.9961103596690577e-05, + "loss": 1.9037, + "step": 10022 + }, + { + "epoch": 0.12530313257831446, + "grad_norm": 3.442073106765747, + "learning_rate": 1.9961026662284387e-05, + "loss": 1.3656, + "step": 10024 + }, + { + "epoch": 0.12532813320333008, + "grad_norm": 3.280486583709717, + "learning_rate": 1.9960949652016426e-05, + "loss": 0.8228, + "step": 10026 + }, + { + "epoch": 0.1253531338283457, + "grad_norm": 2.131772041320801, + "learning_rate": 1.996087256588728e-05, + "loss": 0.1302, + "step": 10028 + }, + { + "epoch": 0.12537813445336132, + "grad_norm": 1.7503741979599, + "learning_rate": 1.9960795403897534e-05, + "loss": 0.6964, + "step": 10030 + }, + { + "epoch": 0.12540313507837697, + "grad_norm": 3.901052236557007, + "learning_rate": 1.9960718166047785e-05, + "loss": 0.7864, + "step": 10032 + }, + { + "epoch": 0.1254281357033926, + "grad_norm": 2.0114762783050537, + "learning_rate": 1.9960640852338612e-05, + "loss": 0.9526, + "step": 10034 + }, + { + "epoch": 0.1254531363284082, + "grad_norm": 2.8936760425567627, + "learning_rate": 1.9960563462770605e-05, + "loss": 1.1284, + "step": 10036 + }, + { + "epoch": 0.12547813695342383, + "grad_norm": 2.015397548675537, + "learning_rate": 1.9960485997344357e-05, + "loss": 1.0398, + "step": 10038 + }, + { + "epoch": 0.12550313757843945, + "grad_norm": 2.2869198322296143, + "learning_rate": 1.9960408456060457e-05, + "loss": 0.9639, + "step": 10040 + }, + { + "epoch": 0.1255281382034551, + "grad_norm": 5.25559663772583, + "learning_rate": 1.996033083891949e-05, + "loss": 0.86, + "step": 10042 + }, + { + "epoch": 0.12555313882847072, + "grad_norm": 1.8558305501937866, + "learning_rate": 1.9960253145922063e-05, + "loss": 0.5092, + "step": 10044 + }, + { + "epoch": 0.12557813945348634, + "grad_norm": 3.017921209335327, + "learning_rate": 1.996017537706875e-05, + "loss": 0.3672, + "step": 10046 + }, + { + "epoch": 0.12560314007850196, + "grad_norm": 3.1130616664886475, + "learning_rate": 1.9960097532360147e-05, + "loss": 1.2657, + "step": 10048 + }, + { + "epoch": 0.12562814070351758, + "grad_norm": 2.5085017681121826, + "learning_rate": 1.9960019611796852e-05, + "loss": 1.1189, + "step": 10050 + }, + { + "epoch": 0.12565314132853322, + "grad_norm": 2.639178991317749, + "learning_rate": 1.9959941615379455e-05, + "loss": 1.1694, + "step": 10052 + }, + { + "epoch": 0.12567814195354884, + "grad_norm": 2.0404791831970215, + "learning_rate": 1.9959863543108555e-05, + "loss": 0.1398, + "step": 10054 + }, + { + "epoch": 0.12570314257856446, + "grad_norm": 3.0722427368164062, + "learning_rate": 1.9959785394984735e-05, + "loss": 1.3235, + "step": 10056 + }, + { + "epoch": 0.12572814320358008, + "grad_norm": 2.13956356048584, + "learning_rate": 1.9959707171008604e-05, + "loss": 0.0947, + "step": 10058 + }, + { + "epoch": 0.1257531438285957, + "grad_norm": 5.823610782623291, + "learning_rate": 1.9959628871180753e-05, + "loss": 1.1904, + "step": 10060 + }, + { + "epoch": 0.12577814445361135, + "grad_norm": 3.3817036151885986, + "learning_rate": 1.9959550495501773e-05, + "loss": 1.0616, + "step": 10062 + }, + { + "epoch": 0.12580314507862697, + "grad_norm": 3.40087890625, + "learning_rate": 1.9959472043972266e-05, + "loss": 1.5055, + "step": 10064 + }, + { + "epoch": 0.1258281457036426, + "grad_norm": 1.73658287525177, + "learning_rate": 1.9959393516592826e-05, + "loss": 0.1243, + "step": 10066 + }, + { + "epoch": 0.1258531463286582, + "grad_norm": 3.2620065212249756, + "learning_rate": 1.9959314913364055e-05, + "loss": 1.3103, + "step": 10068 + }, + { + "epoch": 0.12587814695367383, + "grad_norm": 2.2161648273468018, + "learning_rate": 1.9959236234286554e-05, + "loss": 0.4404, + "step": 10070 + }, + { + "epoch": 0.12590314757868948, + "grad_norm": 0.2650756239891052, + "learning_rate": 1.9959157479360915e-05, + "loss": 0.3105, + "step": 10072 + }, + { + "epoch": 0.1259281482037051, + "grad_norm": 3.166135787963867, + "learning_rate": 1.995907864858774e-05, + "loss": 1.6933, + "step": 10074 + }, + { + "epoch": 0.12595314882872072, + "grad_norm": 2.3678441047668457, + "learning_rate": 1.995899974196763e-05, + "loss": 1.1752, + "step": 10076 + }, + { + "epoch": 0.12597814945373634, + "grad_norm": 4.890764236450195, + "learning_rate": 1.9958920759501187e-05, + "loss": 1.6429, + "step": 10078 + }, + { + "epoch": 0.12600315007875196, + "grad_norm": 2.6968066692352295, + "learning_rate": 1.9958841701189017e-05, + "loss": 0.5303, + "step": 10080 + }, + { + "epoch": 0.1260281507037676, + "grad_norm": 3.9080893993377686, + "learning_rate": 1.9958762567031707e-05, + "loss": 0.3373, + "step": 10082 + }, + { + "epoch": 0.12605315132878323, + "grad_norm": 10.860376358032227, + "learning_rate": 1.995868335702988e-05, + "loss": 1.4087, + "step": 10084 + }, + { + "epoch": 0.12607815195379884, + "grad_norm": 2.0918962955474854, + "learning_rate": 1.995860407118412e-05, + "loss": 0.6281, + "step": 10086 + }, + { + "epoch": 0.12610315257881446, + "grad_norm": 1.285216212272644, + "learning_rate": 1.9958524709495045e-05, + "loss": 0.0345, + "step": 10088 + }, + { + "epoch": 0.12612815320383008, + "grad_norm": 0.9144445061683655, + "learning_rate": 1.9958445271963253e-05, + "loss": 0.7511, + "step": 10090 + }, + { + "epoch": 0.12615315382884573, + "grad_norm": 5.440208911895752, + "learning_rate": 1.9958365758589348e-05, + "loss": 0.3414, + "step": 10092 + }, + { + "epoch": 0.12617815445386135, + "grad_norm": 2.7002737522125244, + "learning_rate": 1.995828616937394e-05, + "loss": 0.7477, + "step": 10094 + }, + { + "epoch": 0.12620315507887697, + "grad_norm": 4.18523645401001, + "learning_rate": 1.995820650431763e-05, + "loss": 0.8762, + "step": 10096 + }, + { + "epoch": 0.1262281557038926, + "grad_norm": 2.0496928691864014, + "learning_rate": 1.9958126763421033e-05, + "loss": 1.131, + "step": 10098 + }, + { + "epoch": 0.1262531563289082, + "grad_norm": 3.7754409313201904, + "learning_rate": 1.9958046946684747e-05, + "loss": 0.5795, + "step": 10100 + }, + { + "epoch": 0.12627815695392386, + "grad_norm": 2.830573558807373, + "learning_rate": 1.995796705410938e-05, + "loss": 0.5478, + "step": 10102 + }, + { + "epoch": 0.12630315757893948, + "grad_norm": 0.046375107020139694, + "learning_rate": 1.9957887085695552e-05, + "loss": 0.1879, + "step": 10104 + }, + { + "epoch": 0.1263281582039551, + "grad_norm": 2.5668563842773438, + "learning_rate": 1.9957807041443858e-05, + "loss": 1.5016, + "step": 10106 + }, + { + "epoch": 0.12635315882897072, + "grad_norm": 2.3395369052886963, + "learning_rate": 1.9957726921354916e-05, + "loss": 0.3077, + "step": 10108 + }, + { + "epoch": 0.12637815945398634, + "grad_norm": 5.64462947845459, + "learning_rate": 1.9957646725429332e-05, + "loss": 1.8962, + "step": 10110 + }, + { + "epoch": 0.12640316007900199, + "grad_norm": 5.522331237792969, + "learning_rate": 1.9957566453667723e-05, + "loss": 0.8304, + "step": 10112 + }, + { + "epoch": 0.1264281607040176, + "grad_norm": 3.1479039192199707, + "learning_rate": 1.9957486106070695e-05, + "loss": 0.5312, + "step": 10114 + }, + { + "epoch": 0.12645316132903323, + "grad_norm": 2.1031007766723633, + "learning_rate": 1.9957405682638858e-05, + "loss": 0.1725, + "step": 10116 + }, + { + "epoch": 0.12647816195404885, + "grad_norm": 3.1160342693328857, + "learning_rate": 1.995732518337283e-05, + "loss": 0.7724, + "step": 10118 + }, + { + "epoch": 0.12650316257906447, + "grad_norm": 0.0036679685581475496, + "learning_rate": 1.9957244608273217e-05, + "loss": 0.3825, + "step": 10120 + }, + { + "epoch": 0.1265281632040801, + "grad_norm": 3.148259162902832, + "learning_rate": 1.9957163957340643e-05, + "loss": 0.6304, + "step": 10122 + }, + { + "epoch": 0.12655316382909573, + "grad_norm": 4.448102951049805, + "learning_rate": 1.995708323057571e-05, + "loss": 0.599, + "step": 10124 + }, + { + "epoch": 0.12657816445411135, + "grad_norm": 2.930136203765869, + "learning_rate": 1.9957002427979044e-05, + "loss": 0.8903, + "step": 10126 + }, + { + "epoch": 0.12660316507912697, + "grad_norm": 2.010751724243164, + "learning_rate": 1.9956921549551255e-05, + "loss": 0.7507, + "step": 10128 + }, + { + "epoch": 0.1266281657041426, + "grad_norm": 4.305495738983154, + "learning_rate": 1.995684059529296e-05, + "loss": 1.4702, + "step": 10130 + }, + { + "epoch": 0.12665316632915824, + "grad_norm": 2.9133901596069336, + "learning_rate": 1.9956759565204777e-05, + "loss": 0.6828, + "step": 10132 + }, + { + "epoch": 0.12667816695417386, + "grad_norm": 10.637284278869629, + "learning_rate": 1.9956678459287317e-05, + "loss": 1.3755, + "step": 10134 + }, + { + "epoch": 0.12670316757918948, + "grad_norm": 0.7518123984336853, + "learning_rate": 1.99565972775412e-05, + "loss": 0.5226, + "step": 10136 + }, + { + "epoch": 0.1267281682042051, + "grad_norm": 3.2255802154541016, + "learning_rate": 1.995651601996705e-05, + "loss": 0.1722, + "step": 10138 + }, + { + "epoch": 0.12675316882922072, + "grad_norm": 4.098382949829102, + "learning_rate": 1.9956434686565483e-05, + "loss": 1.5431, + "step": 10140 + }, + { + "epoch": 0.12677816945423637, + "grad_norm": 9.24551773071289, + "learning_rate": 1.9956353277337116e-05, + "loss": 2.0239, + "step": 10142 + }, + { + "epoch": 0.126803170079252, + "grad_norm": 3.286334753036499, + "learning_rate": 1.995627179228257e-05, + "loss": 0.3772, + "step": 10144 + }, + { + "epoch": 0.1268281707042676, + "grad_norm": 3.1053011417388916, + "learning_rate": 1.9956190231402462e-05, + "loss": 0.6661, + "step": 10146 + }, + { + "epoch": 0.12685317132928323, + "grad_norm": 4.441277503967285, + "learning_rate": 1.9956108594697423e-05, + "loss": 1.0173, + "step": 10148 + }, + { + "epoch": 0.12687817195429885, + "grad_norm": 4.397806167602539, + "learning_rate": 1.9956026882168068e-05, + "loss": 0.3246, + "step": 10150 + }, + { + "epoch": 0.1269031725793145, + "grad_norm": 1.5310269594192505, + "learning_rate": 1.9955945093815017e-05, + "loss": 0.1796, + "step": 10152 + }, + { + "epoch": 0.1269281732043301, + "grad_norm": 6.56746768951416, + "learning_rate": 1.9955863229638898e-05, + "loss": 0.826, + "step": 10154 + }, + { + "epoch": 0.12695317382934573, + "grad_norm": 6.342916011810303, + "learning_rate": 1.9955781289640336e-05, + "loss": 1.8078, + "step": 10156 + }, + { + "epoch": 0.12697817445436135, + "grad_norm": 3.2299320697784424, + "learning_rate": 1.9955699273819948e-05, + "loss": 1.096, + "step": 10158 + }, + { + "epoch": 0.12700317507937697, + "grad_norm": 0.04791725426912308, + "learning_rate": 1.995561718217836e-05, + "loss": 0.3077, + "step": 10160 + }, + { + "epoch": 0.12702817570439262, + "grad_norm": 3.478482246398926, + "learning_rate": 1.9955535014716206e-05, + "loss": 0.9769, + "step": 10162 + }, + { + "epoch": 0.12705317632940824, + "grad_norm": 4.888621807098389, + "learning_rate": 1.99554527714341e-05, + "loss": 1.0781, + "step": 10164 + }, + { + "epoch": 0.12707817695442386, + "grad_norm": 1.4805701971054077, + "learning_rate": 1.9955370452332673e-05, + "loss": 0.6872, + "step": 10166 + }, + { + "epoch": 0.12710317757943948, + "grad_norm": 6.013959884643555, + "learning_rate": 1.9955288057412553e-05, + "loss": 1.2716, + "step": 10168 + }, + { + "epoch": 0.1271281782044551, + "grad_norm": 0.0036495954263955355, + "learning_rate": 1.9955205586674368e-05, + "loss": 0.0316, + "step": 10170 + }, + { + "epoch": 0.12715317882947075, + "grad_norm": 4.1342692375183105, + "learning_rate": 1.9955123040118748e-05, + "loss": 1.6351, + "step": 10172 + }, + { + "epoch": 0.12717817945448637, + "grad_norm": 5.482421398162842, + "learning_rate": 1.9955040417746313e-05, + "loss": 1.6898, + "step": 10174 + }, + { + "epoch": 0.127203180079502, + "grad_norm": 3.7163732051849365, + "learning_rate": 1.99549577195577e-05, + "loss": 1.781, + "step": 10176 + }, + { + "epoch": 0.1272281807045176, + "grad_norm": 2.317493200302124, + "learning_rate": 1.9954874945553538e-05, + "loss": 0.7266, + "step": 10178 + }, + { + "epoch": 0.12725318132953323, + "grad_norm": 6.655475616455078, + "learning_rate": 1.9954792095734458e-05, + "loss": 1.2878, + "step": 10180 + }, + { + "epoch": 0.12727818195454887, + "grad_norm": 4.637360095977783, + "learning_rate": 1.9954709170101084e-05, + "loss": 2.0125, + "step": 10182 + }, + { + "epoch": 0.1273031825795645, + "grad_norm": 4.193637371063232, + "learning_rate": 1.995462616865406e-05, + "loss": 1.5218, + "step": 10184 + }, + { + "epoch": 0.12732818320458011, + "grad_norm": 0.0015320922248065472, + "learning_rate": 1.9954543091394002e-05, + "loss": 0.0193, + "step": 10186 + }, + { + "epoch": 0.12735318382959573, + "grad_norm": 3.5680508613586426, + "learning_rate": 1.995445993832156e-05, + "loss": 0.5615, + "step": 10188 + }, + { + "epoch": 0.12737818445461135, + "grad_norm": 2.1877965927124023, + "learning_rate": 1.9954376709437352e-05, + "loss": 1.207, + "step": 10190 + }, + { + "epoch": 0.127403185079627, + "grad_norm": 2.1890392303466797, + "learning_rate": 1.995429340474202e-05, + "loss": 0.7233, + "step": 10192 + }, + { + "epoch": 0.12742818570464262, + "grad_norm": 4.045754432678223, + "learning_rate": 1.99542100242362e-05, + "loss": 0.9611, + "step": 10194 + }, + { + "epoch": 0.12745318632965824, + "grad_norm": 3.0282697677612305, + "learning_rate": 1.995412656792052e-05, + "loss": 0.7677, + "step": 10196 + }, + { + "epoch": 0.12747818695467386, + "grad_norm": 2.1553213596343994, + "learning_rate": 1.9954043035795623e-05, + "loss": 1.2956, + "step": 10198 + }, + { + "epoch": 0.12750318757968948, + "grad_norm": 2.942748785018921, + "learning_rate": 1.9953959427862143e-05, + "loss": 0.6577, + "step": 10200 + }, + { + "epoch": 0.12752818820470513, + "grad_norm": 1.0344622135162354, + "learning_rate": 1.995387574412071e-05, + "loss": 0.5778, + "step": 10202 + }, + { + "epoch": 0.12755318882972075, + "grad_norm": 3.5235538482666016, + "learning_rate": 1.9953791984571973e-05, + "loss": 0.8083, + "step": 10204 + }, + { + "epoch": 0.12757818945473637, + "grad_norm": 1.9901173114776611, + "learning_rate": 1.9953708149216563e-05, + "loss": 1.358, + "step": 10206 + }, + { + "epoch": 0.127603190079752, + "grad_norm": 2.939897060394287, + "learning_rate": 1.9953624238055117e-05, + "loss": 1.1509, + "step": 10208 + }, + { + "epoch": 0.1276281907047676, + "grad_norm": 2.3401222229003906, + "learning_rate": 1.9953540251088274e-05, + "loss": 1.032, + "step": 10210 + }, + { + "epoch": 0.12765319132978326, + "grad_norm": 2.433065891265869, + "learning_rate": 1.9953456188316682e-05, + "loss": 0.1673, + "step": 10212 + }, + { + "epoch": 0.12767819195479888, + "grad_norm": 4.316909313201904, + "learning_rate": 1.995337204974097e-05, + "loss": 0.4684, + "step": 10214 + }, + { + "epoch": 0.1277031925798145, + "grad_norm": 3.752009391784668, + "learning_rate": 1.9953287835361787e-05, + "loss": 1.6566, + "step": 10216 + }, + { + "epoch": 0.12772819320483011, + "grad_norm": 2.5867836475372314, + "learning_rate": 1.9953203545179772e-05, + "loss": 0.8933, + "step": 10218 + }, + { + "epoch": 0.12775319382984573, + "grad_norm": 2.7241015434265137, + "learning_rate": 1.9953119179195565e-05, + "loss": 1.6459, + "step": 10220 + }, + { + "epoch": 0.12777819445486138, + "grad_norm": 2.643298387527466, + "learning_rate": 1.9953034737409808e-05, + "loss": 1.2364, + "step": 10222 + }, + { + "epoch": 0.127803195079877, + "grad_norm": 0.0018763417610898614, + "learning_rate": 1.995295021982315e-05, + "loss": 0.5932, + "step": 10224 + }, + { + "epoch": 0.12782819570489262, + "grad_norm": 0.0022839996963739395, + "learning_rate": 1.9952865626436228e-05, + "loss": 1.084, + "step": 10226 + }, + { + "epoch": 0.12785319632990824, + "grad_norm": 3.665409564971924, + "learning_rate": 1.995278095724969e-05, + "loss": 1.4362, + "step": 10228 + }, + { + "epoch": 0.12787819695492386, + "grad_norm": 6.175693511962891, + "learning_rate": 1.995269621226418e-05, + "loss": 0.3156, + "step": 10230 + }, + { + "epoch": 0.1279031975799395, + "grad_norm": 2.7614223957061768, + "learning_rate": 1.995261139148034e-05, + "loss": 0.5308, + "step": 10232 + }, + { + "epoch": 0.12792819820495513, + "grad_norm": 0.3217454254627228, + "learning_rate": 1.995252649489882e-05, + "loss": 1.0149, + "step": 10234 + }, + { + "epoch": 0.12795319882997075, + "grad_norm": 2.276906728744507, + "learning_rate": 1.9952441522520267e-05, + "loss": 0.92, + "step": 10236 + }, + { + "epoch": 0.12797819945498637, + "grad_norm": 2.2801754474639893, + "learning_rate": 1.9952356474345327e-05, + "loss": 0.458, + "step": 10238 + }, + { + "epoch": 0.128003200080002, + "grad_norm": 3.3826074600219727, + "learning_rate": 1.995227135037465e-05, + "loss": 0.9118, + "step": 10240 + }, + { + "epoch": 0.12802820070501764, + "grad_norm": 5.138126373291016, + "learning_rate": 1.9952186150608875e-05, + "loss": 1.578, + "step": 10242 + }, + { + "epoch": 0.12805320133003326, + "grad_norm": 3.160902261734009, + "learning_rate": 1.9952100875048662e-05, + "loss": 0.8893, + "step": 10244 + }, + { + "epoch": 0.12807820195504888, + "grad_norm": 3.1951546669006348, + "learning_rate": 1.9952015523694653e-05, + "loss": 0.2854, + "step": 10246 + }, + { + "epoch": 0.1281032025800645, + "grad_norm": 0.0006692304159514606, + "learning_rate": 1.9951930096547505e-05, + "loss": 0.5013, + "step": 10248 + }, + { + "epoch": 0.12812820320508012, + "grad_norm": 3.8105978965759277, + "learning_rate": 1.9951844593607863e-05, + "loss": 0.9536, + "step": 10250 + }, + { + "epoch": 0.12815320383009576, + "grad_norm": 6.867191314697266, + "learning_rate": 1.995175901487638e-05, + "loss": 1.1506, + "step": 10252 + }, + { + "epoch": 0.12817820445511138, + "grad_norm": 2.4636478424072266, + "learning_rate": 1.9951673360353708e-05, + "loss": 0.2358, + "step": 10254 + }, + { + "epoch": 0.128203205080127, + "grad_norm": 2.448369264602661, + "learning_rate": 1.99515876300405e-05, + "loss": 1.2795, + "step": 10256 + }, + { + "epoch": 0.12822820570514262, + "grad_norm": 2.325653553009033, + "learning_rate": 1.9951501823937404e-05, + "loss": 1.0592, + "step": 10258 + }, + { + "epoch": 0.12825320633015824, + "grad_norm": 6.189474582672119, + "learning_rate": 1.995141594204508e-05, + "loss": 1.528, + "step": 10260 + }, + { + "epoch": 0.1282782069551739, + "grad_norm": 2.1405482292175293, + "learning_rate": 1.995132998436418e-05, + "loss": 1.284, + "step": 10262 + }, + { + "epoch": 0.1283032075801895, + "grad_norm": 2.304414749145508, + "learning_rate": 1.995124395089536e-05, + "loss": 0.5592, + "step": 10264 + }, + { + "epoch": 0.12832820820520513, + "grad_norm": 0.09483987838029861, + "learning_rate": 1.995115784163927e-05, + "loss": 0.1572, + "step": 10266 + }, + { + "epoch": 0.12835320883022075, + "grad_norm": 0.52036052942276, + "learning_rate": 1.995107165659657e-05, + "loss": 1.0743, + "step": 10268 + }, + { + "epoch": 0.12837820945523637, + "grad_norm": 0.032198067754507065, + "learning_rate": 1.9950985395767913e-05, + "loss": 0.8091, + "step": 10270 + }, + { + "epoch": 0.12840321008025202, + "grad_norm": 0.09481395035982132, + "learning_rate": 1.9950899059153962e-05, + "loss": 0.6396, + "step": 10272 + }, + { + "epoch": 0.12842821070526764, + "grad_norm": 0.0009944025659933686, + "learning_rate": 1.995081264675537e-05, + "loss": 0.6313, + "step": 10274 + }, + { + "epoch": 0.12845321133028326, + "grad_norm": 2.3127048015594482, + "learning_rate": 1.9950726158572794e-05, + "loss": 1.5736, + "step": 10276 + }, + { + "epoch": 0.12847821195529888, + "grad_norm": 2.515085220336914, + "learning_rate": 1.99506395946069e-05, + "loss": 1.5098, + "step": 10278 + }, + { + "epoch": 0.1285032125803145, + "grad_norm": 3.779503107070923, + "learning_rate": 1.9950552954858338e-05, + "loss": 0.8021, + "step": 10280 + }, + { + "epoch": 0.12852821320533014, + "grad_norm": 3.8222885131835938, + "learning_rate": 1.9950466239327772e-05, + "loss": 1.6666, + "step": 10282 + }, + { + "epoch": 0.12855321383034576, + "grad_norm": 3.181817054748535, + "learning_rate": 1.9950379448015865e-05, + "loss": 0.6106, + "step": 10284 + }, + { + "epoch": 0.12857821445536138, + "grad_norm": 3.8973865509033203, + "learning_rate": 1.995029258092327e-05, + "loss": 0.7323, + "step": 10286 + }, + { + "epoch": 0.128603215080377, + "grad_norm": 5.770036220550537, + "learning_rate": 1.9950205638050655e-05, + "loss": 1.2896, + "step": 10288 + }, + { + "epoch": 0.12862821570539262, + "grad_norm": 2.6701576709747314, + "learning_rate": 1.9950118619398684e-05, + "loss": 1.0089, + "step": 10290 + }, + { + "epoch": 0.12865321633040827, + "grad_norm": 4.365085601806641, + "learning_rate": 1.9950031524968017e-05, + "loss": 0.7081, + "step": 10292 + }, + { + "epoch": 0.1286782169554239, + "grad_norm": 6.037912845611572, + "learning_rate": 1.9949944354759317e-05, + "loss": 0.9946, + "step": 10294 + }, + { + "epoch": 0.1287032175804395, + "grad_norm": 5.157447814941406, + "learning_rate": 1.9949857108773246e-05, + "loss": 3.0713, + "step": 10296 + }, + { + "epoch": 0.12872821820545513, + "grad_norm": 1.7500654458999634, + "learning_rate": 1.994976978701047e-05, + "loss": 0.2032, + "step": 10298 + }, + { + "epoch": 0.12875321883047075, + "grad_norm": 3.9918057918548584, + "learning_rate": 1.9949682389471655e-05, + "loss": 1.9332, + "step": 10300 + }, + { + "epoch": 0.1287782194554864, + "grad_norm": 3.234036445617676, + "learning_rate": 1.9949594916157463e-05, + "loss": 0.5938, + "step": 10302 + }, + { + "epoch": 0.12880322008050202, + "grad_norm": 3.420443534851074, + "learning_rate": 1.9949507367068567e-05, + "loss": 1.6263, + "step": 10304 + }, + { + "epoch": 0.12882822070551764, + "grad_norm": 4.517813205718994, + "learning_rate": 1.9949419742205628e-05, + "loss": 1.7796, + "step": 10306 + }, + { + "epoch": 0.12885322133053326, + "grad_norm": 12.394577026367188, + "learning_rate": 1.9949332041569317e-05, + "loss": 1.792, + "step": 10308 + }, + { + "epoch": 0.12887822195554888, + "grad_norm": 0.0012936943676322699, + "learning_rate": 1.99492442651603e-05, + "loss": 0.1956, + "step": 10310 + }, + { + "epoch": 0.12890322258056452, + "grad_norm": 3.5094809532165527, + "learning_rate": 1.9949156412979243e-05, + "loss": 0.9694, + "step": 10312 + }, + { + "epoch": 0.12892822320558014, + "grad_norm": 2.1276798248291016, + "learning_rate": 1.9949068485026818e-05, + "loss": 0.6636, + "step": 10314 + }, + { + "epoch": 0.12895322383059576, + "grad_norm": 0.0006174133741296828, + "learning_rate": 1.9948980481303697e-05, + "loss": 0.2999, + "step": 10316 + }, + { + "epoch": 0.12897822445561138, + "grad_norm": 2.0339548587799072, + "learning_rate": 1.9948892401810546e-05, + "loss": 0.1758, + "step": 10318 + }, + { + "epoch": 0.129003225080627, + "grad_norm": 2.014472007751465, + "learning_rate": 1.9948804246548037e-05, + "loss": 1.3086, + "step": 10320 + }, + { + "epoch": 0.12902822570564265, + "grad_norm": 3.289701461791992, + "learning_rate": 1.9948716015516845e-05, + "loss": 0.766, + "step": 10322 + }, + { + "epoch": 0.12905322633065827, + "grad_norm": 6.005667686462402, + "learning_rate": 1.9948627708717635e-05, + "loss": 2.3428, + "step": 10324 + }, + { + "epoch": 0.1290782269556739, + "grad_norm": 4.705220699310303, + "learning_rate": 1.9948539326151084e-05, + "loss": 0.2413, + "step": 10326 + }, + { + "epoch": 0.1291032275806895, + "grad_norm": 2.324585437774658, + "learning_rate": 1.9948450867817866e-05, + "loss": 1.2682, + "step": 10328 + }, + { + "epoch": 0.12912822820570513, + "grad_norm": 3.4469759464263916, + "learning_rate": 1.9948362333718648e-05, + "loss": 0.8939, + "step": 10330 + }, + { + "epoch": 0.12915322883072078, + "grad_norm": 3.721984386444092, + "learning_rate": 1.9948273723854116e-05, + "loss": 1.7097, + "step": 10332 + }, + { + "epoch": 0.1291782294557364, + "grad_norm": 0.0006888618227094412, + "learning_rate": 1.9948185038224936e-05, + "loss": 0.0, + "step": 10334 + }, + { + "epoch": 0.12920323008075202, + "grad_norm": 2.8925557136535645, + "learning_rate": 1.9948096276831782e-05, + "loss": 0.6733, + "step": 10336 + }, + { + "epoch": 0.12922823070576764, + "grad_norm": 3.42848539352417, + "learning_rate": 1.9948007439675337e-05, + "loss": 0.6123, + "step": 10338 + }, + { + "epoch": 0.12925323133078326, + "grad_norm": 4.389673233032227, + "learning_rate": 1.9947918526756273e-05, + "loss": 2.7942, + "step": 10340 + }, + { + "epoch": 0.1292782319557989, + "grad_norm": 5.910466194152832, + "learning_rate": 1.9947829538075266e-05, + "loss": 2.0115, + "step": 10342 + }, + { + "epoch": 0.12930323258081453, + "grad_norm": 3.637939214706421, + "learning_rate": 1.9947740473633002e-05, + "loss": 1.752, + "step": 10344 + }, + { + "epoch": 0.12932823320583015, + "grad_norm": 0.022756261751055717, + "learning_rate": 1.994765133343015e-05, + "loss": 0.0435, + "step": 10346 + }, + { + "epoch": 0.12935323383084577, + "grad_norm": 5.517354488372803, + "learning_rate": 1.994756211746739e-05, + "loss": 1.4091, + "step": 10348 + }, + { + "epoch": 0.12937823445586139, + "grad_norm": 2.606233835220337, + "learning_rate": 1.9947472825745405e-05, + "loss": 0.5561, + "step": 10350 + }, + { + "epoch": 0.12940323508087703, + "grad_norm": 7.305147171020508, + "learning_rate": 1.9947383458264874e-05, + "loss": 2.2999, + "step": 10352 + }, + { + "epoch": 0.12942823570589265, + "grad_norm": 2.5242574214935303, + "learning_rate": 1.994729401502648e-05, + "loss": 1.0702, + "step": 10354 + }, + { + "epoch": 0.12945323633090827, + "grad_norm": 4.3265061378479, + "learning_rate": 1.99472044960309e-05, + "loss": 1.0668, + "step": 10356 + }, + { + "epoch": 0.1294782369559239, + "grad_norm": 1.6977670192718506, + "learning_rate": 1.9947114901278817e-05, + "loss": 1.387, + "step": 10358 + }, + { + "epoch": 0.1295032375809395, + "grad_norm": 2.0992465019226074, + "learning_rate": 1.994702523077091e-05, + "loss": 0.9092, + "step": 10360 + }, + { + "epoch": 0.12952823820595516, + "grad_norm": 0.012768951244652271, + "learning_rate": 1.994693548450787e-05, + "loss": 0.9059, + "step": 10362 + }, + { + "epoch": 0.12955323883097078, + "grad_norm": 1.177789568901062, + "learning_rate": 1.9946845662490374e-05, + "loss": 0.6319, + "step": 10364 + }, + { + "epoch": 0.1295782394559864, + "grad_norm": 3.548574924468994, + "learning_rate": 1.994675576471911e-05, + "loss": 1.144, + "step": 10366 + }, + { + "epoch": 0.12960324008100202, + "grad_norm": 2.2716317176818848, + "learning_rate": 1.9946665791194758e-05, + "loss": 1.2994, + "step": 10368 + }, + { + "epoch": 0.12962824070601764, + "grad_norm": 2.260631561279297, + "learning_rate": 1.9946575741918006e-05, + "loss": 1.4031, + "step": 10370 + }, + { + "epoch": 0.1296532413310333, + "grad_norm": 0.0010595389176160097, + "learning_rate": 1.994648561688954e-05, + "loss": 0.46, + "step": 10372 + }, + { + "epoch": 0.1296782419560489, + "grad_norm": 3.7994465827941895, + "learning_rate": 1.9946395416110047e-05, + "loss": 1.0755, + "step": 10374 + }, + { + "epoch": 0.12970324258106453, + "grad_norm": 2.602461814880371, + "learning_rate": 1.9946305139580214e-05, + "loss": 0.3871, + "step": 10376 + }, + { + "epoch": 0.12972824320608015, + "grad_norm": 5.211521148681641, + "learning_rate": 1.9946214787300726e-05, + "loss": 1.6315, + "step": 10378 + }, + { + "epoch": 0.12975324383109577, + "grad_norm": 3.513948440551758, + "learning_rate": 1.9946124359272275e-05, + "loss": 0.466, + "step": 10380 + }, + { + "epoch": 0.1297782444561114, + "grad_norm": 1.8518900871276855, + "learning_rate": 1.9946033855495544e-05, + "loss": 0.3105, + "step": 10382 + }, + { + "epoch": 0.12980324508112703, + "grad_norm": 11.484301567077637, + "learning_rate": 1.9945943275971228e-05, + "loss": 1.5397, + "step": 10384 + }, + { + "epoch": 0.12982824570614265, + "grad_norm": 4.529183864593506, + "learning_rate": 1.994585262070001e-05, + "loss": 1.2744, + "step": 10386 + }, + { + "epoch": 0.12985324633115827, + "grad_norm": 2.522228956222534, + "learning_rate": 1.9945761889682592e-05, + "loss": 1.0044, + "step": 10388 + }, + { + "epoch": 0.1298782469561739, + "grad_norm": 0.0010953312739729881, + "learning_rate": 1.9945671082919652e-05, + "loss": 0.6112, + "step": 10390 + }, + { + "epoch": 0.12990324758118954, + "grad_norm": 3.256730794906616, + "learning_rate": 1.994558020041189e-05, + "loss": 1.4074, + "step": 10392 + }, + { + "epoch": 0.12992824820620516, + "grad_norm": 6.1583476066589355, + "learning_rate": 1.9945489242159998e-05, + "loss": 1.7586, + "step": 10394 + }, + { + "epoch": 0.12995324883122078, + "grad_norm": 3.3053200244903564, + "learning_rate": 1.994539820816466e-05, + "loss": 1.1524, + "step": 10396 + }, + { + "epoch": 0.1299782494562364, + "grad_norm": 3.4077653884887695, + "learning_rate": 1.9945307098426583e-05, + "loss": 1.2962, + "step": 10398 + }, + { + "epoch": 0.13000325008125202, + "grad_norm": 4.696837902069092, + "learning_rate": 1.994521591294645e-05, + "loss": 1.0432, + "step": 10400 + }, + { + "epoch": 0.13002825070626767, + "grad_norm": 0.00045300694182515144, + "learning_rate": 1.994512465172496e-05, + "loss": 0.9429, + "step": 10402 + }, + { + "epoch": 0.1300532513312833, + "grad_norm": 1.413819432258606, + "learning_rate": 1.9945033314762807e-05, + "loss": 0.6817, + "step": 10404 + }, + { + "epoch": 0.1300782519562989, + "grad_norm": 3.7752299308776855, + "learning_rate": 1.9944941902060685e-05, + "loss": 1.6994, + "step": 10406 + }, + { + "epoch": 0.13010325258131453, + "grad_norm": 3.1379880905151367, + "learning_rate": 1.9944850413619293e-05, + "loss": 0.2818, + "step": 10408 + }, + { + "epoch": 0.13012825320633015, + "grad_norm": 3.03373384475708, + "learning_rate": 1.9944758849439325e-05, + "loss": 1.2995, + "step": 10410 + }, + { + "epoch": 0.1301532538313458, + "grad_norm": 1.7733972072601318, + "learning_rate": 1.994466720952148e-05, + "loss": 0.5502, + "step": 10412 + }, + { + "epoch": 0.13017825445636141, + "grad_norm": 0.0007868458633311093, + "learning_rate": 1.9944575493866457e-05, + "loss": 0.0, + "step": 10414 + }, + { + "epoch": 0.13020325508137703, + "grad_norm": 4.339810848236084, + "learning_rate": 1.9944483702474952e-05, + "loss": 1.3844, + "step": 10416 + }, + { + "epoch": 0.13022825570639265, + "grad_norm": 1.7972633838653564, + "learning_rate": 1.9944391835347667e-05, + "loss": 0.9013, + "step": 10418 + }, + { + "epoch": 0.13025325633140827, + "grad_norm": 3.1556499004364014, + "learning_rate": 1.9944299892485303e-05, + "loss": 1.8005, + "step": 10420 + }, + { + "epoch": 0.13027825695642392, + "grad_norm": 0.0012093066470697522, + "learning_rate": 1.994420787388855e-05, + "loss": 0.0036, + "step": 10422 + }, + { + "epoch": 0.13030325758143954, + "grad_norm": 3.266497850418091, + "learning_rate": 1.9944115779558122e-05, + "loss": 0.5904, + "step": 10424 + }, + { + "epoch": 0.13032825820645516, + "grad_norm": 2.556922435760498, + "learning_rate": 1.994402360949471e-05, + "loss": 1.3679, + "step": 10426 + }, + { + "epoch": 0.13035325883147078, + "grad_norm": 1.982735276222229, + "learning_rate": 1.9943931363699026e-05, + "loss": 1.2074, + "step": 10428 + }, + { + "epoch": 0.1303782594564864, + "grad_norm": 2.0897388458251953, + "learning_rate": 1.994383904217176e-05, + "loss": 1.8339, + "step": 10430 + }, + { + "epoch": 0.13040326008150205, + "grad_norm": 2.836538314819336, + "learning_rate": 1.994374664491363e-05, + "loss": 0.9046, + "step": 10432 + }, + { + "epoch": 0.13042826070651767, + "grad_norm": 5.304127216339111, + "learning_rate": 1.9943654171925328e-05, + "loss": 1.1129, + "step": 10434 + }, + { + "epoch": 0.1304532613315333, + "grad_norm": 3.779973030090332, + "learning_rate": 1.994356162320756e-05, + "loss": 1.0179, + "step": 10436 + }, + { + "epoch": 0.1304782619565489, + "grad_norm": 1.7822483777999878, + "learning_rate": 1.9943468998761033e-05, + "loss": 0.0531, + "step": 10438 + }, + { + "epoch": 0.13050326258156453, + "grad_norm": 1.9061919450759888, + "learning_rate": 1.9943376298586453e-05, + "loss": 0.9744, + "step": 10440 + }, + { + "epoch": 0.13052826320658018, + "grad_norm": 4.946432590484619, + "learning_rate": 1.9943283522684526e-05, + "loss": 1.0979, + "step": 10442 + }, + { + "epoch": 0.1305532638315958, + "grad_norm": 1.897670030593872, + "learning_rate": 1.994319067105596e-05, + "loss": 1.583, + "step": 10444 + }, + { + "epoch": 0.13057826445661141, + "grad_norm": 1.9105641841888428, + "learning_rate": 1.9943097743701457e-05, + "loss": 1.9572, + "step": 10446 + }, + { + "epoch": 0.13060326508162703, + "grad_norm": 1.7412192821502686, + "learning_rate": 1.994300474062173e-05, + "loss": 0.58, + "step": 10448 + }, + { + "epoch": 0.13062826570664265, + "grad_norm": 2.207723379135132, + "learning_rate": 1.9942911661817482e-05, + "loss": 1.7383, + "step": 10450 + }, + { + "epoch": 0.1306532663316583, + "grad_norm": 2.99540376663208, + "learning_rate": 1.9942818507289425e-05, + "loss": 0.7807, + "step": 10452 + }, + { + "epoch": 0.13067826695667392, + "grad_norm": 6.371413230895996, + "learning_rate": 1.994272527703827e-05, + "loss": 0.9564, + "step": 10454 + }, + { + "epoch": 0.13070326758168954, + "grad_norm": 2.809054136276245, + "learning_rate": 1.9942631971064723e-05, + "loss": 1.123, + "step": 10456 + }, + { + "epoch": 0.13072826820670516, + "grad_norm": 0.03566500172019005, + "learning_rate": 1.99425385893695e-05, + "loss": 1.3067, + "step": 10458 + }, + { + "epoch": 0.13075326883172078, + "grad_norm": 4.605257034301758, + "learning_rate": 1.9942445131953307e-05, + "loss": 1.0592, + "step": 10460 + }, + { + "epoch": 0.13077826945673643, + "grad_norm": 0.7110816240310669, + "learning_rate": 1.994235159881686e-05, + "loss": 0.066, + "step": 10462 + }, + { + "epoch": 0.13080327008175205, + "grad_norm": 3.0790820121765137, + "learning_rate": 1.9942257989960866e-05, + "loss": 1.2608, + "step": 10464 + }, + { + "epoch": 0.13082827070676767, + "grad_norm": 3.147372245788574, + "learning_rate": 1.9942164305386042e-05, + "loss": 0.7586, + "step": 10466 + }, + { + "epoch": 0.1308532713317833, + "grad_norm": 3.474073886871338, + "learning_rate": 1.9942070545093103e-05, + "loss": 1.0169, + "step": 10468 + }, + { + "epoch": 0.1308782719567989, + "grad_norm": 1.7695848941802979, + "learning_rate": 1.994197670908276e-05, + "loss": 0.9444, + "step": 10470 + }, + { + "epoch": 0.13090327258181456, + "grad_norm": 1.105933427810669, + "learning_rate": 1.9941882797355726e-05, + "loss": 0.3807, + "step": 10472 + }, + { + "epoch": 0.13092827320683018, + "grad_norm": 3.661778688430786, + "learning_rate": 1.9941788809912718e-05, + "loss": 1.6141, + "step": 10474 + }, + { + "epoch": 0.1309532738318458, + "grad_norm": 3.1483654975891113, + "learning_rate": 1.9941694746754458e-05, + "loss": 1.1684, + "step": 10476 + }, + { + "epoch": 0.13097827445686142, + "grad_norm": 2.8321609497070312, + "learning_rate": 1.9941600607881654e-05, + "loss": 1.0996, + "step": 10478 + }, + { + "epoch": 0.13100327508187704, + "grad_norm": 1.4117426872253418, + "learning_rate": 1.994150639329502e-05, + "loss": 0.822, + "step": 10480 + }, + { + "epoch": 0.13102827570689268, + "grad_norm": 7.698520660400391, + "learning_rate": 1.9941412102995284e-05, + "loss": 0.3164, + "step": 10482 + }, + { + "epoch": 0.1310532763319083, + "grad_norm": 2.2626302242279053, + "learning_rate": 1.9941317736983162e-05, + "loss": 0.1462, + "step": 10484 + }, + { + "epoch": 0.13107827695692392, + "grad_norm": 3.13063645362854, + "learning_rate": 1.9941223295259366e-05, + "loss": 1.2912, + "step": 10486 + }, + { + "epoch": 0.13110327758193954, + "grad_norm": 4.754046440124512, + "learning_rate": 1.9941128777824617e-05, + "loss": 0.4049, + "step": 10488 + }, + { + "epoch": 0.13112827820695516, + "grad_norm": 2.648975133895874, + "learning_rate": 1.9941034184679644e-05, + "loss": 0.7683, + "step": 10490 + }, + { + "epoch": 0.1311532788319708, + "grad_norm": 1.0547717809677124, + "learning_rate": 1.9940939515825156e-05, + "loss": 0.093, + "step": 10492 + }, + { + "epoch": 0.13117827945698643, + "grad_norm": 3.7196216583251953, + "learning_rate": 1.994084477126188e-05, + "loss": 0.9451, + "step": 10494 + }, + { + "epoch": 0.13120328008200205, + "grad_norm": 5.826702117919922, + "learning_rate": 1.9940749950990535e-05, + "loss": 0.3291, + "step": 10496 + }, + { + "epoch": 0.13122828070701767, + "grad_norm": 4.127508163452148, + "learning_rate": 1.9940655055011843e-05, + "loss": 2.1043, + "step": 10498 + }, + { + "epoch": 0.1312532813320333, + "grad_norm": 2.017099618911743, + "learning_rate": 1.994056008332653e-05, + "loss": 0.7052, + "step": 10500 + }, + { + "epoch": 0.13127828195704894, + "grad_norm": 5.710960865020752, + "learning_rate": 1.9940465035935316e-05, + "loss": 0.9459, + "step": 10502 + }, + { + "epoch": 0.13130328258206456, + "grad_norm": 3.2404966354370117, + "learning_rate": 1.9940369912838925e-05, + "loss": 2.3608, + "step": 10504 + }, + { + "epoch": 0.13132828320708018, + "grad_norm": 3.165863275527954, + "learning_rate": 1.9940274714038087e-05, + "loss": 1.318, + "step": 10506 + }, + { + "epoch": 0.1313532838320958, + "grad_norm": 0.00668648025020957, + "learning_rate": 1.9940179439533516e-05, + "loss": 1.302, + "step": 10508 + }, + { + "epoch": 0.13137828445711142, + "grad_norm": 5.5109052658081055, + "learning_rate": 1.9940084089325945e-05, + "loss": 1.0391, + "step": 10510 + }, + { + "epoch": 0.13140328508212706, + "grad_norm": 3.506072759628296, + "learning_rate": 1.9939988663416106e-05, + "loss": 0.7364, + "step": 10512 + }, + { + "epoch": 0.13142828570714268, + "grad_norm": 0.00580528425052762, + "learning_rate": 1.9939893161804712e-05, + "loss": 0.3748, + "step": 10514 + }, + { + "epoch": 0.1314532863321583, + "grad_norm": 2.656494617462158, + "learning_rate": 1.99397975844925e-05, + "loss": 0.7497, + "step": 10516 + }, + { + "epoch": 0.13147828695717392, + "grad_norm": 5.039648056030273, + "learning_rate": 1.9939701931480195e-05, + "loss": 0.4068, + "step": 10518 + }, + { + "epoch": 0.13150328758218954, + "grad_norm": 3.768343448638916, + "learning_rate": 1.9939606202768528e-05, + "loss": 1.7793, + "step": 10520 + }, + { + "epoch": 0.1315282882072052, + "grad_norm": 1.910524606704712, + "learning_rate": 1.9939510398358218e-05, + "loss": 0.7263, + "step": 10522 + }, + { + "epoch": 0.1315532888322208, + "grad_norm": 2.5106499195098877, + "learning_rate": 1.993941451825001e-05, + "loss": 1.8246, + "step": 10524 + }, + { + "epoch": 0.13157828945723643, + "grad_norm": 3.3122522830963135, + "learning_rate": 1.9939318562444625e-05, + "loss": 1.0361, + "step": 10526 + }, + { + "epoch": 0.13160329008225205, + "grad_norm": 2.398197650909424, + "learning_rate": 1.9939222530942793e-05, + "loss": 1.506, + "step": 10528 + }, + { + "epoch": 0.13162829070726767, + "grad_norm": 0.141809344291687, + "learning_rate": 1.993912642374525e-05, + "loss": 0.0585, + "step": 10530 + }, + { + "epoch": 0.13165329133228332, + "grad_norm": 3.394850015640259, + "learning_rate": 1.9939030240852726e-05, + "loss": 0.5143, + "step": 10532 + }, + { + "epoch": 0.13167829195729894, + "grad_norm": 3.5168139934539795, + "learning_rate": 1.9938933982265952e-05, + "loss": 1.8286, + "step": 10534 + }, + { + "epoch": 0.13170329258231456, + "grad_norm": 2.9515390396118164, + "learning_rate": 1.9938837647985664e-05, + "loss": 1.1805, + "step": 10536 + }, + { + "epoch": 0.13172829320733018, + "grad_norm": 2.52701997756958, + "learning_rate": 1.993874123801259e-05, + "loss": 1.1742, + "step": 10538 + }, + { + "epoch": 0.1317532938323458, + "grad_norm": 4.300713539123535, + "learning_rate": 1.993864475234747e-05, + "loss": 0.1413, + "step": 10540 + }, + { + "epoch": 0.13177829445736144, + "grad_norm": 1.8771514892578125, + "learning_rate": 1.993854819099104e-05, + "loss": 0.9498, + "step": 10542 + }, + { + "epoch": 0.13180329508237706, + "grad_norm": 1.9274892807006836, + "learning_rate": 1.9938451553944028e-05, + "loss": 0.4446, + "step": 10544 + }, + { + "epoch": 0.13182829570739268, + "grad_norm": 4.335587501525879, + "learning_rate": 1.9938354841207176e-05, + "loss": 1.4567, + "step": 10546 + }, + { + "epoch": 0.1318532963324083, + "grad_norm": 2.931351900100708, + "learning_rate": 1.9938258052781222e-05, + "loss": 1.3873, + "step": 10548 + }, + { + "epoch": 0.13187829695742392, + "grad_norm": 6.361811637878418, + "learning_rate": 1.9938161188666897e-05, + "loss": 2.0186, + "step": 10550 + }, + { + "epoch": 0.13190329758243957, + "grad_norm": 0.08716489374637604, + "learning_rate": 1.993806424886494e-05, + "loss": 0.7238, + "step": 10552 + }, + { + "epoch": 0.1319282982074552, + "grad_norm": 2.6572682857513428, + "learning_rate": 1.9937967233376094e-05, + "loss": 0.6467, + "step": 10554 + }, + { + "epoch": 0.1319532988324708, + "grad_norm": 12.454322814941406, + "learning_rate": 1.9937870142201093e-05, + "loss": 1.3026, + "step": 10556 + }, + { + "epoch": 0.13197829945748643, + "grad_norm": 2.690656900405884, + "learning_rate": 1.9937772975340682e-05, + "loss": 0.7998, + "step": 10558 + }, + { + "epoch": 0.13200330008250205, + "grad_norm": 4.520631790161133, + "learning_rate": 1.9937675732795595e-05, + "loss": 2.4501, + "step": 10560 + }, + { + "epoch": 0.1320283007075177, + "grad_norm": 2.4380948543548584, + "learning_rate": 1.9937578414566575e-05, + "loss": 0.5785, + "step": 10562 + }, + { + "epoch": 0.13205330133253332, + "grad_norm": 0.0022478285245597363, + "learning_rate": 1.9937481020654365e-05, + "loss": 0.0686, + "step": 10564 + }, + { + "epoch": 0.13207830195754894, + "grad_norm": 2.4745256900787354, + "learning_rate": 1.99373835510597e-05, + "loss": 1.7371, + "step": 10566 + }, + { + "epoch": 0.13210330258256456, + "grad_norm": 3.2900238037109375, + "learning_rate": 1.993728600578333e-05, + "loss": 1.1115, + "step": 10568 + }, + { + "epoch": 0.13212830320758018, + "grad_norm": 3.3642332553863525, + "learning_rate": 1.9937188384825994e-05, + "loss": 1.6354, + "step": 10570 + }, + { + "epoch": 0.13215330383259583, + "grad_norm": 2.0828654766082764, + "learning_rate": 1.9937090688188437e-05, + "loss": 0.5853, + "step": 10572 + }, + { + "epoch": 0.13217830445761145, + "grad_norm": 2.5508968830108643, + "learning_rate": 1.9936992915871403e-05, + "loss": 1.5465, + "step": 10574 + }, + { + "epoch": 0.13220330508262707, + "grad_norm": 2.6045925617218018, + "learning_rate": 1.993689506787564e-05, + "loss": 0.5743, + "step": 10576 + }, + { + "epoch": 0.13222830570764268, + "grad_norm": 2.579315185546875, + "learning_rate": 1.993679714420188e-05, + "loss": 1.0626, + "step": 10578 + }, + { + "epoch": 0.1322533063326583, + "grad_norm": 2.066573143005371, + "learning_rate": 1.9936699144850886e-05, + "loss": 0.3712, + "step": 10580 + }, + { + "epoch": 0.13227830695767395, + "grad_norm": 1.2563824653625488, + "learning_rate": 1.9936601069823393e-05, + "loss": 0.8999, + "step": 10582 + }, + { + "epoch": 0.13230330758268957, + "grad_norm": 4.520228862762451, + "learning_rate": 1.9936502919120154e-05, + "loss": 1.5746, + "step": 10584 + }, + { + "epoch": 0.1323283082077052, + "grad_norm": 0.006561800837516785, + "learning_rate": 1.9936404692741914e-05, + "loss": 1.5556, + "step": 10586 + }, + { + "epoch": 0.1323533088327208, + "grad_norm": 3.72375226020813, + "learning_rate": 1.993630639068942e-05, + "loss": 1.328, + "step": 10588 + }, + { + "epoch": 0.13237830945773643, + "grad_norm": 2.9130208492279053, + "learning_rate": 1.993620801296342e-05, + "loss": 0.6768, + "step": 10590 + }, + { + "epoch": 0.13240331008275208, + "grad_norm": 2.7780587673187256, + "learning_rate": 1.9936109559564664e-05, + "loss": 2.0394, + "step": 10592 + }, + { + "epoch": 0.1324283107077677, + "grad_norm": 2.4662787914276123, + "learning_rate": 1.9936011030493905e-05, + "loss": 0.9524, + "step": 10594 + }, + { + "epoch": 0.13245331133278332, + "grad_norm": 3.744264841079712, + "learning_rate": 1.9935912425751895e-05, + "loss": 1.0828, + "step": 10596 + }, + { + "epoch": 0.13247831195779894, + "grad_norm": 7.033129692077637, + "learning_rate": 1.9935813745339373e-05, + "loss": 0.8642, + "step": 10598 + }, + { + "epoch": 0.13250331258281456, + "grad_norm": 0.5775741338729858, + "learning_rate": 1.9935714989257105e-05, + "loss": 0.1517, + "step": 10600 + }, + { + "epoch": 0.1325283132078302, + "grad_norm": 0.0043024844489991665, + "learning_rate": 1.9935616157505834e-05, + "loss": 0.9281, + "step": 10602 + }, + { + "epoch": 0.13255331383284583, + "grad_norm": 8.980567932128906, + "learning_rate": 1.9935517250086314e-05, + "loss": 0.804, + "step": 10604 + }, + { + "epoch": 0.13257831445786145, + "grad_norm": 1.4753522872924805, + "learning_rate": 1.99354182669993e-05, + "loss": 0.6878, + "step": 10606 + }, + { + "epoch": 0.13260331508287707, + "grad_norm": 2.364208459854126, + "learning_rate": 1.9935319208245548e-05, + "loss": 0.8542, + "step": 10608 + }, + { + "epoch": 0.13262831570789269, + "grad_norm": 4.280712127685547, + "learning_rate": 1.9935220073825806e-05, + "loss": 1.0613, + "step": 10610 + }, + { + "epoch": 0.13265331633290833, + "grad_norm": 2.1151466369628906, + "learning_rate": 1.9935120863740834e-05, + "loss": 0.641, + "step": 10612 + }, + { + "epoch": 0.13267831695792395, + "grad_norm": 1.9727147817611694, + "learning_rate": 1.9935021577991392e-05, + "loss": 0.906, + "step": 10614 + }, + { + "epoch": 0.13270331758293957, + "grad_norm": 0.02720089815557003, + "learning_rate": 1.9934922216578225e-05, + "loss": 0.241, + "step": 10616 + }, + { + "epoch": 0.1327283182079552, + "grad_norm": 3.7659738063812256, + "learning_rate": 1.9934822779502098e-05, + "loss": 0.1905, + "step": 10618 + }, + { + "epoch": 0.1327533188329708, + "grad_norm": 7.659479141235352, + "learning_rate": 1.9934723266763764e-05, + "loss": 1.7867, + "step": 10620 + }, + { + "epoch": 0.13277831945798646, + "grad_norm": 3.532243013381958, + "learning_rate": 1.993462367836398e-05, + "loss": 2.1119, + "step": 10622 + }, + { + "epoch": 0.13280332008300208, + "grad_norm": 0.02780592069029808, + "learning_rate": 1.9934524014303513e-05, + "loss": 0.3157, + "step": 10624 + }, + { + "epoch": 0.1328283207080177, + "grad_norm": 3.5079591274261475, + "learning_rate": 1.993442427458311e-05, + "loss": 1.3587, + "step": 10626 + }, + { + "epoch": 0.13285332133303332, + "grad_norm": 1.5971137285232544, + "learning_rate": 1.993432445920354e-05, + "loss": 1.1728, + "step": 10628 + }, + { + "epoch": 0.13287832195804894, + "grad_norm": 0.015440008603036404, + "learning_rate": 1.9934224568165563e-05, + "loss": 0.2795, + "step": 10630 + }, + { + "epoch": 0.1329033225830646, + "grad_norm": 7.200891017913818, + "learning_rate": 1.993412460146993e-05, + "loss": 0.8591, + "step": 10632 + }, + { + "epoch": 0.1329283232080802, + "grad_norm": 3.478715181350708, + "learning_rate": 1.993402455911741e-05, + "loss": 0.5745, + "step": 10634 + }, + { + "epoch": 0.13295332383309583, + "grad_norm": 0.9714349508285522, + "learning_rate": 1.9933924441108767e-05, + "loss": 0.7626, + "step": 10636 + }, + { + "epoch": 0.13297832445811145, + "grad_norm": 2.1823084354400635, + "learning_rate": 1.9933824247444756e-05, + "loss": 1.479, + "step": 10638 + }, + { + "epoch": 0.13300332508312707, + "grad_norm": 8.141046524047852, + "learning_rate": 1.993372397812615e-05, + "loss": 0.6226, + "step": 10640 + }, + { + "epoch": 0.13302832570814271, + "grad_norm": 2.47768235206604, + "learning_rate": 1.9933623633153704e-05, + "loss": 1.7235, + "step": 10642 + }, + { + "epoch": 0.13305332633315833, + "grad_norm": 0.43076637387275696, + "learning_rate": 1.9933523212528184e-05, + "loss": 0.1191, + "step": 10644 + }, + { + "epoch": 0.13307832695817395, + "grad_norm": 4.3174591064453125, + "learning_rate": 1.9933422716250357e-05, + "loss": 1.6018, + "step": 10646 + }, + { + "epoch": 0.13310332758318957, + "grad_norm": 6.346798419952393, + "learning_rate": 1.9933322144320985e-05, + "loss": 0.8778, + "step": 10648 + }, + { + "epoch": 0.1331283282082052, + "grad_norm": 2.667869806289673, + "learning_rate": 1.993322149674084e-05, + "loss": 1.0412, + "step": 10650 + }, + { + "epoch": 0.13315332883322084, + "grad_norm": 3.6371943950653076, + "learning_rate": 1.9933120773510683e-05, + "loss": 0.6621, + "step": 10652 + }, + { + "epoch": 0.13317832945823646, + "grad_norm": 1.4581705331802368, + "learning_rate": 1.993301997463128e-05, + "loss": 0.265, + "step": 10654 + }, + { + "epoch": 0.13320333008325208, + "grad_norm": 9.803730010986328, + "learning_rate": 1.9932919100103406e-05, + "loss": 1.1431, + "step": 10656 + }, + { + "epoch": 0.1332283307082677, + "grad_norm": 3.061859607696533, + "learning_rate": 1.9932818149927823e-05, + "loss": 1.0346, + "step": 10658 + }, + { + "epoch": 0.13325333133328332, + "grad_norm": 3.499021053314209, + "learning_rate": 1.9932717124105302e-05, + "loss": 1.2803, + "step": 10660 + }, + { + "epoch": 0.13327833195829897, + "grad_norm": 4.15720796585083, + "learning_rate": 1.9932616022636613e-05, + "loss": 2.113, + "step": 10662 + }, + { + "epoch": 0.1333033325833146, + "grad_norm": 3.964354991912842, + "learning_rate": 1.9932514845522522e-05, + "loss": 0.8508, + "step": 10664 + }, + { + "epoch": 0.1333283332083302, + "grad_norm": 0.8499677181243896, + "learning_rate": 1.9932413592763805e-05, + "loss": 0.3938, + "step": 10666 + }, + { + "epoch": 0.13335333383334583, + "grad_norm": 0.8495145440101624, + "learning_rate": 1.993231226436123e-05, + "loss": 0.2117, + "step": 10668 + }, + { + "epoch": 0.13337833445836145, + "grad_norm": 0.014268379658460617, + "learning_rate": 1.993221086031557e-05, + "loss": 0.0012, + "step": 10670 + }, + { + "epoch": 0.1334033350833771, + "grad_norm": 3.7388341426849365, + "learning_rate": 1.9932109380627597e-05, + "loss": 1.4851, + "step": 10672 + }, + { + "epoch": 0.13342833570839271, + "grad_norm": 0.5230211615562439, + "learning_rate": 1.993200782529808e-05, + "loss": 0.0124, + "step": 10674 + }, + { + "epoch": 0.13345333633340833, + "grad_norm": 2.0712954998016357, + "learning_rate": 1.99319061943278e-05, + "loss": 0.089, + "step": 10676 + }, + { + "epoch": 0.13347833695842395, + "grad_norm": 1.9255447387695312, + "learning_rate": 1.9931804487717524e-05, + "loss": 0.6152, + "step": 10678 + }, + { + "epoch": 0.13350333758343957, + "grad_norm": 1.7814075946807861, + "learning_rate": 1.993170270546803e-05, + "loss": 0.2364, + "step": 10680 + }, + { + "epoch": 0.13352833820845522, + "grad_norm": 0.008870307356119156, + "learning_rate": 1.9931600847580094e-05, + "loss": 0.0294, + "step": 10682 + }, + { + "epoch": 0.13355333883347084, + "grad_norm": 5.205301761627197, + "learning_rate": 1.993149891405449e-05, + "loss": 1.73, + "step": 10684 + }, + { + "epoch": 0.13357833945848646, + "grad_norm": 1.892233967781067, + "learning_rate": 1.9931396904891995e-05, + "loss": 1.4245, + "step": 10686 + }, + { + "epoch": 0.13360334008350208, + "grad_norm": 3.8934433460235596, + "learning_rate": 1.9931294820093387e-05, + "loss": 0.7652, + "step": 10688 + }, + { + "epoch": 0.1336283407085177, + "grad_norm": 3.866162061691284, + "learning_rate": 1.993119265965944e-05, + "loss": 0.8027, + "step": 10690 + }, + { + "epoch": 0.13365334133353335, + "grad_norm": 3.5364573001861572, + "learning_rate": 1.9931090423590934e-05, + "loss": 1.1054, + "step": 10692 + }, + { + "epoch": 0.13367834195854897, + "grad_norm": 2.7165982723236084, + "learning_rate": 1.9930988111888648e-05, + "loss": 0.2212, + "step": 10694 + }, + { + "epoch": 0.1337033425835646, + "grad_norm": 4.239438533782959, + "learning_rate": 1.993088572455336e-05, + "loss": 0.8589, + "step": 10696 + }, + { + "epoch": 0.1337283432085802, + "grad_norm": 2.6463029384613037, + "learning_rate": 1.993078326158585e-05, + "loss": 0.2802, + "step": 10698 + }, + { + "epoch": 0.13375334383359583, + "grad_norm": 3.6775054931640625, + "learning_rate": 1.9930680722986904e-05, + "loss": 0.7563, + "step": 10700 + }, + { + "epoch": 0.13377834445861148, + "grad_norm": 5.096325397491455, + "learning_rate": 1.9930578108757294e-05, + "loss": 0.4416, + "step": 10702 + }, + { + "epoch": 0.1338033450836271, + "grad_norm": 4.807310104370117, + "learning_rate": 1.9930475418897806e-05, + "loss": 0.6435, + "step": 10704 + }, + { + "epoch": 0.13382834570864272, + "grad_norm": 3.1279590129852295, + "learning_rate": 1.993037265340922e-05, + "loss": 0.5997, + "step": 10706 + }, + { + "epoch": 0.13385334633365834, + "grad_norm": 2.2846081256866455, + "learning_rate": 1.993026981229232e-05, + "loss": 0.7331, + "step": 10708 + }, + { + "epoch": 0.13387834695867395, + "grad_norm": 2.8391456604003906, + "learning_rate": 1.9930166895547892e-05, + "loss": 0.5068, + "step": 10710 + }, + { + "epoch": 0.1339033475836896, + "grad_norm": 3.96063232421875, + "learning_rate": 1.9930063903176716e-05, + "loss": 0.1862, + "step": 10712 + }, + { + "epoch": 0.13392834820870522, + "grad_norm": 1.8732694387435913, + "learning_rate": 1.992996083517958e-05, + "loss": 0.5176, + "step": 10714 + }, + { + "epoch": 0.13395334883372084, + "grad_norm": 2.327293872833252, + "learning_rate": 1.9929857691557264e-05, + "loss": 0.7304, + "step": 10716 + }, + { + "epoch": 0.13397834945873646, + "grad_norm": 2.2488203048706055, + "learning_rate": 1.9929754472310553e-05, + "loss": 1.3062, + "step": 10718 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 4.5176777839660645, + "learning_rate": 1.9929651177440242e-05, + "loss": 0.3313, + "step": 10720 + }, + { + "epoch": 0.13402835070876773, + "grad_norm": 3.6696689128875732, + "learning_rate": 1.9929547806947108e-05, + "loss": 0.2534, + "step": 10722 + }, + { + "epoch": 0.13405335133378335, + "grad_norm": 0.10305037349462509, + "learning_rate": 1.9929444360831946e-05, + "loss": 0.9644, + "step": 10724 + }, + { + "epoch": 0.13407835195879897, + "grad_norm": 5.151892185211182, + "learning_rate": 1.9929340839095536e-05, + "loss": 1.8571, + "step": 10726 + }, + { + "epoch": 0.1341033525838146, + "grad_norm": 3.532433271408081, + "learning_rate": 1.992923724173867e-05, + "loss": 1.3051, + "step": 10728 + }, + { + "epoch": 0.1341283532088302, + "grad_norm": 3.2765250205993652, + "learning_rate": 1.9929133568762142e-05, + "loss": 0.2429, + "step": 10730 + }, + { + "epoch": 0.13415335383384586, + "grad_norm": 2.7477316856384277, + "learning_rate": 1.9929029820166734e-05, + "loss": 1.3687, + "step": 10732 + }, + { + "epoch": 0.13417835445886148, + "grad_norm": 2.4104254245758057, + "learning_rate": 1.9928925995953235e-05, + "loss": 0.9935, + "step": 10734 + }, + { + "epoch": 0.1342033550838771, + "grad_norm": 4.027638912200928, + "learning_rate": 1.9928822096122446e-05, + "loss": 1.3796, + "step": 10736 + }, + { + "epoch": 0.13422835570889272, + "grad_norm": 6.664989948272705, + "learning_rate": 1.9928718120675148e-05, + "loss": 2.3376, + "step": 10738 + }, + { + "epoch": 0.13425335633390834, + "grad_norm": 4.110318183898926, + "learning_rate": 1.9928614069612138e-05, + "loss": 0.8433, + "step": 10740 + }, + { + "epoch": 0.13427835695892398, + "grad_norm": 4.717862129211426, + "learning_rate": 1.992850994293421e-05, + "loss": 0.6242, + "step": 10742 + }, + { + "epoch": 0.1343033575839396, + "grad_norm": 2.8981497287750244, + "learning_rate": 1.992840574064215e-05, + "loss": 1.2368, + "step": 10744 + }, + { + "epoch": 0.13432835820895522, + "grad_norm": 3.3402748107910156, + "learning_rate": 1.9928301462736755e-05, + "loss": 1.2973, + "step": 10746 + }, + { + "epoch": 0.13435335883397084, + "grad_norm": 0.035405393689870834, + "learning_rate": 1.9928197109218822e-05, + "loss": 0.9134, + "step": 10748 + }, + { + "epoch": 0.13437835945898646, + "grad_norm": 2.394937038421631, + "learning_rate": 1.9928092680089145e-05, + "loss": 0.3244, + "step": 10750 + }, + { + "epoch": 0.1344033600840021, + "grad_norm": 3.9301462173461914, + "learning_rate": 1.992798817534852e-05, + "loss": 1.2875, + "step": 10752 + }, + { + "epoch": 0.13442836070901773, + "grad_norm": 3.542001247406006, + "learning_rate": 1.9927883594997738e-05, + "loss": 1.4815, + "step": 10754 + }, + { + "epoch": 0.13445336133403335, + "grad_norm": 9.2101469039917, + "learning_rate": 1.9927778939037597e-05, + "loss": 0.278, + "step": 10756 + }, + { + "epoch": 0.13447836195904897, + "grad_norm": 17.323848724365234, + "learning_rate": 1.99276742074689e-05, + "loss": 1.3608, + "step": 10758 + }, + { + "epoch": 0.1345033625840646, + "grad_norm": 3.0124568939208984, + "learning_rate": 1.9927569400292437e-05, + "loss": 1.1908, + "step": 10760 + }, + { + "epoch": 0.13452836320908024, + "grad_norm": 3.5749170780181885, + "learning_rate": 1.9927464517509013e-05, + "loss": 1.3137, + "step": 10762 + }, + { + "epoch": 0.13455336383409586, + "grad_norm": 3.526505708694458, + "learning_rate": 1.9927359559119418e-05, + "loss": 0.2661, + "step": 10764 + }, + { + "epoch": 0.13457836445911148, + "grad_norm": 2.180734634399414, + "learning_rate": 1.9927254525124458e-05, + "loss": 1.3869, + "step": 10766 + }, + { + "epoch": 0.1346033650841271, + "grad_norm": 11.4994535446167, + "learning_rate": 1.9927149415524933e-05, + "loss": 0.7248, + "step": 10768 + }, + { + "epoch": 0.13462836570914272, + "grad_norm": 1.8763093948364258, + "learning_rate": 1.992704423032164e-05, + "loss": 0.1062, + "step": 10770 + }, + { + "epoch": 0.13465336633415836, + "grad_norm": 3.9050192832946777, + "learning_rate": 1.9926938969515383e-05, + "loss": 2.0548, + "step": 10772 + }, + { + "epoch": 0.13467836695917398, + "grad_norm": 2.7170352935791016, + "learning_rate": 1.9926833633106964e-05, + "loss": 0.7138, + "step": 10774 + }, + { + "epoch": 0.1347033675841896, + "grad_norm": 4.335831165313721, + "learning_rate": 1.9926728221097182e-05, + "loss": 1.6944, + "step": 10776 + }, + { + "epoch": 0.13472836820920522, + "grad_norm": 5.405806541442871, + "learning_rate": 1.992662273348684e-05, + "loss": 1.1161, + "step": 10778 + }, + { + "epoch": 0.13475336883422084, + "grad_norm": 3.6912267208099365, + "learning_rate": 1.9926517170276746e-05, + "loss": 1.1968, + "step": 10780 + }, + { + "epoch": 0.1347783694592365, + "grad_norm": 4.148090839385986, + "learning_rate": 1.99264115314677e-05, + "loss": 1.0971, + "step": 10782 + }, + { + "epoch": 0.1348033700842521, + "grad_norm": 2.1547255516052246, + "learning_rate": 1.9926305817060506e-05, + "loss": 0.6808, + "step": 10784 + }, + { + "epoch": 0.13482837070926773, + "grad_norm": 2.433112144470215, + "learning_rate": 1.9926200027055973e-05, + "loss": 0.401, + "step": 10786 + }, + { + "epoch": 0.13485337133428335, + "grad_norm": 13.182782173156738, + "learning_rate": 1.99260941614549e-05, + "loss": 1.8302, + "step": 10788 + }, + { + "epoch": 0.13487837195929897, + "grad_norm": 0.06264089792966843, + "learning_rate": 1.9925988220258103e-05, + "loss": 0.3674, + "step": 10790 + }, + { + "epoch": 0.13490337258431462, + "grad_norm": 17.210369110107422, + "learning_rate": 1.992588220346638e-05, + "loss": 1.8605, + "step": 10792 + }, + { + "epoch": 0.13492837320933024, + "grad_norm": 0.02001698687672615, + "learning_rate": 1.9925776111080544e-05, + "loss": 0.6698, + "step": 10794 + }, + { + "epoch": 0.13495337383434586, + "grad_norm": 2.2816522121429443, + "learning_rate": 1.99256699431014e-05, + "loss": 0.4919, + "step": 10796 + }, + { + "epoch": 0.13497837445936148, + "grad_norm": 3.2917425632476807, + "learning_rate": 1.9925563699529756e-05, + "loss": 0.8771, + "step": 10798 + }, + { + "epoch": 0.1350033750843771, + "grad_norm": 0.9334639310836792, + "learning_rate": 1.9925457380366423e-05, + "loss": 1.0576, + "step": 10800 + }, + { + "epoch": 0.13502837570939275, + "grad_norm": 3.750458240509033, + "learning_rate": 1.9925350985612214e-05, + "loss": 2.1796, + "step": 10802 + }, + { + "epoch": 0.13505337633440836, + "grad_norm": 2.532924175262451, + "learning_rate": 1.992524451526793e-05, + "loss": 0.3536, + "step": 10804 + }, + { + "epoch": 0.13507837695942398, + "grad_norm": 7.608557224273682, + "learning_rate": 1.992513796933439e-05, + "loss": 0.4321, + "step": 10806 + }, + { + "epoch": 0.1351033775844396, + "grad_norm": 4.7873854637146, + "learning_rate": 1.9925031347812404e-05, + "loss": 0.613, + "step": 10808 + }, + { + "epoch": 0.13512837820945522, + "grad_norm": 0.015955530107021332, + "learning_rate": 1.992492465070278e-05, + "loss": 0.2057, + "step": 10810 + }, + { + "epoch": 0.13515337883447087, + "grad_norm": 3.992802858352661, + "learning_rate": 1.9924817878006332e-05, + "loss": 1.4293, + "step": 10812 + }, + { + "epoch": 0.1351783794594865, + "grad_norm": 0.7111257910728455, + "learning_rate": 1.9924711029723876e-05, + "loss": 0.0197, + "step": 10814 + }, + { + "epoch": 0.1352033800845021, + "grad_norm": 6.752542972564697, + "learning_rate": 1.9924604105856228e-05, + "loss": 0.9045, + "step": 10816 + }, + { + "epoch": 0.13522838070951773, + "grad_norm": 0.352573424577713, + "learning_rate": 1.9924497106404195e-05, + "loss": 0.0053, + "step": 10818 + }, + { + "epoch": 0.13525338133453335, + "grad_norm": 4.1851091384887695, + "learning_rate": 1.9924390031368596e-05, + "loss": 1.2015, + "step": 10820 + }, + { + "epoch": 0.135278381959549, + "grad_norm": 5.114405632019043, + "learning_rate": 1.9924282880750248e-05, + "loss": 0.4937, + "step": 10822 + }, + { + "epoch": 0.13530338258456462, + "grad_norm": 6.532843589782715, + "learning_rate": 1.992417565454996e-05, + "loss": 2.1945, + "step": 10824 + }, + { + "epoch": 0.13532838320958024, + "grad_norm": 2.8349320888519287, + "learning_rate": 1.992406835276856e-05, + "loss": 0.5663, + "step": 10826 + }, + { + "epoch": 0.13535338383459586, + "grad_norm": 3.3369617462158203, + "learning_rate": 1.9923960975406853e-05, + "loss": 1.5343, + "step": 10828 + }, + { + "epoch": 0.13537838445961148, + "grad_norm": 3.9419689178466797, + "learning_rate": 1.992385352246567e-05, + "loss": 1.6729, + "step": 10830 + }, + { + "epoch": 0.13540338508462713, + "grad_norm": 2.8242573738098145, + "learning_rate": 1.9923745993945815e-05, + "loss": 0.4986, + "step": 10832 + }, + { + "epoch": 0.13542838570964275, + "grad_norm": 2.243293046951294, + "learning_rate": 1.9923638389848114e-05, + "loss": 0.9101, + "step": 10834 + }, + { + "epoch": 0.13545338633465837, + "grad_norm": 4.5255818367004395, + "learning_rate": 1.9923530710173387e-05, + "loss": 1.2579, + "step": 10836 + }, + { + "epoch": 0.13547838695967399, + "grad_norm": 0.007102082017809153, + "learning_rate": 1.9923422954922458e-05, + "loss": 0.0001, + "step": 10838 + }, + { + "epoch": 0.1355033875846896, + "grad_norm": 2.5434839725494385, + "learning_rate": 1.9923315124096138e-05, + "loss": 1.3875, + "step": 10840 + }, + { + "epoch": 0.13552838820970525, + "grad_norm": 2.0199098587036133, + "learning_rate": 1.9923207217695255e-05, + "loss": 0.9588, + "step": 10842 + }, + { + "epoch": 0.13555338883472087, + "grad_norm": 0.7212941646575928, + "learning_rate": 1.9923099235720633e-05, + "loss": 1.2129, + "step": 10844 + }, + { + "epoch": 0.1355783894597365, + "grad_norm": 0.39295125007629395, + "learning_rate": 1.9922991178173087e-05, + "loss": 0.8081, + "step": 10846 + }, + { + "epoch": 0.1356033900847521, + "grad_norm": 3.835935592651367, + "learning_rate": 1.992288304505344e-05, + "loss": 1.3952, + "step": 10848 + }, + { + "epoch": 0.13562839070976773, + "grad_norm": 4.114824295043945, + "learning_rate": 1.992277483636252e-05, + "loss": 2.0306, + "step": 10850 + }, + { + "epoch": 0.13565339133478338, + "grad_norm": 2.163813829421997, + "learning_rate": 1.9922666552101156e-05, + "loss": 0.3633, + "step": 10852 + }, + { + "epoch": 0.135678391959799, + "grad_norm": 1.3969228267669678, + "learning_rate": 1.9922558192270163e-05, + "loss": 0.9264, + "step": 10854 + }, + { + "epoch": 0.13570339258481462, + "grad_norm": 3.561185836791992, + "learning_rate": 1.992244975687037e-05, + "loss": 1.6798, + "step": 10856 + }, + { + "epoch": 0.13572839320983024, + "grad_norm": 0.07418389618396759, + "learning_rate": 1.9922341245902602e-05, + "loss": 0.1555, + "step": 10858 + }, + { + "epoch": 0.13575339383484586, + "grad_norm": 0.4667898118495941, + "learning_rate": 1.9922232659367686e-05, + "loss": 0.3369, + "step": 10860 + }, + { + "epoch": 0.1357783944598615, + "grad_norm": 2.5331673622131348, + "learning_rate": 1.992212399726645e-05, + "loss": 0.8083, + "step": 10862 + }, + { + "epoch": 0.13580339508487713, + "grad_norm": 0.5535283088684082, + "learning_rate": 1.992201525959972e-05, + "loss": 0.9645, + "step": 10864 + }, + { + "epoch": 0.13582839570989275, + "grad_norm": 5.306959629058838, + "learning_rate": 1.9921906446368328e-05, + "loss": 0.4728, + "step": 10866 + }, + { + "epoch": 0.13585339633490837, + "grad_norm": 0.3140881359577179, + "learning_rate": 1.9921797557573096e-05, + "loss": 0.5302, + "step": 10868 + }, + { + "epoch": 0.13587839695992399, + "grad_norm": 4.183764457702637, + "learning_rate": 1.992168859321486e-05, + "loss": 1.714, + "step": 10870 + }, + { + "epoch": 0.13590339758493963, + "grad_norm": 8.072193145751953, + "learning_rate": 1.9921579553294442e-05, + "loss": 1.5021, + "step": 10872 + }, + { + "epoch": 0.13592839820995525, + "grad_norm": 5.782841682434082, + "learning_rate": 1.9921470437812682e-05, + "loss": 1.2225, + "step": 10874 + }, + { + "epoch": 0.13595339883497087, + "grad_norm": 3.3738973140716553, + "learning_rate": 1.9921361246770403e-05, + "loss": 1.2843, + "step": 10876 + }, + { + "epoch": 0.1359783994599865, + "grad_norm": 3.081411361694336, + "learning_rate": 1.9921251980168442e-05, + "loss": 0.6316, + "step": 10878 + }, + { + "epoch": 0.1360034000850021, + "grad_norm": 2.052314519882202, + "learning_rate": 1.9921142638007628e-05, + "loss": 0.2492, + "step": 10880 + }, + { + "epoch": 0.13602840071001776, + "grad_norm": 0.52139812707901, + "learning_rate": 1.9921033220288795e-05, + "loss": 1.1188, + "step": 10882 + }, + { + "epoch": 0.13605340133503338, + "grad_norm": 1.656082272529602, + "learning_rate": 1.9920923727012777e-05, + "loss": 1.5722, + "step": 10884 + }, + { + "epoch": 0.136078401960049, + "grad_norm": 1.3377571105957031, + "learning_rate": 1.9920814158180403e-05, + "loss": 0.3788, + "step": 10886 + }, + { + "epoch": 0.13610340258506462, + "grad_norm": 1.6765882968902588, + "learning_rate": 1.9920704513792515e-05, + "loss": 1.0392, + "step": 10888 + }, + { + "epoch": 0.13612840321008024, + "grad_norm": 2.388669967651367, + "learning_rate": 1.9920594793849945e-05, + "loss": 0.3679, + "step": 10890 + }, + { + "epoch": 0.1361534038350959, + "grad_norm": 0.7388811707496643, + "learning_rate": 1.9920484998353523e-05, + "loss": 0.5491, + "step": 10892 + }, + { + "epoch": 0.1361784044601115, + "grad_norm": 2.463618755340576, + "learning_rate": 1.9920375127304093e-05, + "loss": 1.2795, + "step": 10894 + }, + { + "epoch": 0.13620340508512713, + "grad_norm": 4.6375603675842285, + "learning_rate": 1.992026518070249e-05, + "loss": 0.8645, + "step": 10896 + }, + { + "epoch": 0.13622840571014275, + "grad_norm": 3.06199312210083, + "learning_rate": 1.9920155158549548e-05, + "loss": 1.4232, + "step": 10898 + }, + { + "epoch": 0.13625340633515837, + "grad_norm": 4.177846431732178, + "learning_rate": 1.992004506084611e-05, + "loss": 1.8778, + "step": 10900 + }, + { + "epoch": 0.13627840696017401, + "grad_norm": 2.2017273902893066, + "learning_rate": 1.991993488759301e-05, + "loss": 0.5218, + "step": 10902 + }, + { + "epoch": 0.13630340758518963, + "grad_norm": 4.616024017333984, + "learning_rate": 1.9919824638791087e-05, + "loss": 1.2075, + "step": 10904 + }, + { + "epoch": 0.13632840821020525, + "grad_norm": 3.174346923828125, + "learning_rate": 1.991971431444118e-05, + "loss": 1.1767, + "step": 10906 + }, + { + "epoch": 0.13635340883522087, + "grad_norm": 3.479112148284912, + "learning_rate": 1.9919603914544143e-05, + "loss": 0.3918, + "step": 10908 + }, + { + "epoch": 0.1363784094602365, + "grad_norm": 3.0895888805389404, + "learning_rate": 1.9919493439100798e-05, + "loss": 1.3167, + "step": 10910 + }, + { + "epoch": 0.13640341008525214, + "grad_norm": 6.018475532531738, + "learning_rate": 1.991938288811199e-05, + "loss": 0.777, + "step": 10912 + }, + { + "epoch": 0.13642841071026776, + "grad_norm": 3.1564886569976807, + "learning_rate": 1.991927226157857e-05, + "loss": 1.7801, + "step": 10914 + }, + { + "epoch": 0.13645341133528338, + "grad_norm": 2.913485527038574, + "learning_rate": 1.9919161559501373e-05, + "loss": 1.2867, + "step": 10916 + }, + { + "epoch": 0.136478411960299, + "grad_norm": 5.8958892822265625, + "learning_rate": 1.9919050781881247e-05, + "loss": 1.063, + "step": 10918 + }, + { + "epoch": 0.13650341258531462, + "grad_norm": 2.8330330848693848, + "learning_rate": 1.991893992871903e-05, + "loss": 1.3759, + "step": 10920 + }, + { + "epoch": 0.13652841321033027, + "grad_norm": 2.595357656478882, + "learning_rate": 1.991882900001557e-05, + "loss": 0.6587, + "step": 10922 + }, + { + "epoch": 0.1365534138353459, + "grad_norm": 2.477501153945923, + "learning_rate": 1.9918717995771712e-05, + "loss": 2.0077, + "step": 10924 + }, + { + "epoch": 0.1365784144603615, + "grad_norm": 3.2280051708221436, + "learning_rate": 1.9918606915988296e-05, + "loss": 2.0554, + "step": 10926 + }, + { + "epoch": 0.13660341508537713, + "grad_norm": 3.3624134063720703, + "learning_rate": 1.9918495760666173e-05, + "loss": 1.1653, + "step": 10928 + }, + { + "epoch": 0.13662841571039275, + "grad_norm": 6.8725056648254395, + "learning_rate": 1.9918384529806195e-05, + "loss": 1.5503, + "step": 10930 + }, + { + "epoch": 0.1366534163354084, + "grad_norm": 4.426362037658691, + "learning_rate": 1.99182732234092e-05, + "loss": 1.7839, + "step": 10932 + }, + { + "epoch": 0.13667841696042402, + "grad_norm": 5.194019317626953, + "learning_rate": 1.9918161841476037e-05, + "loss": 0.3392, + "step": 10934 + }, + { + "epoch": 0.13670341758543963, + "grad_norm": 0.004913358483463526, + "learning_rate": 1.991805038400756e-05, + "loss": 1.3789, + "step": 10936 + }, + { + "epoch": 0.13672841821045525, + "grad_norm": 4.289793491363525, + "learning_rate": 1.991793885100461e-05, + "loss": 0.8961, + "step": 10938 + }, + { + "epoch": 0.13675341883547087, + "grad_norm": 2.656231164932251, + "learning_rate": 1.991782724246804e-05, + "loss": 0.702, + "step": 10940 + }, + { + "epoch": 0.13677841946048652, + "grad_norm": 2.4591526985168457, + "learning_rate": 1.99177155583987e-05, + "loss": 0.8378, + "step": 10942 + }, + { + "epoch": 0.13680342008550214, + "grad_norm": 2.088310718536377, + "learning_rate": 1.9917603798797442e-05, + "loss": 0.2521, + "step": 10944 + }, + { + "epoch": 0.13682842071051776, + "grad_norm": 0.39253008365631104, + "learning_rate": 1.9917491963665117e-05, + "loss": 0.435, + "step": 10946 + }, + { + "epoch": 0.13685342133553338, + "grad_norm": 4.9927496910095215, + "learning_rate": 1.9917380053002574e-05, + "loss": 1.4347, + "step": 10948 + }, + { + "epoch": 0.136878421960549, + "grad_norm": 0.6295644044876099, + "learning_rate": 1.9917268066810668e-05, + "loss": 0.7353, + "step": 10950 + }, + { + "epoch": 0.13690342258556465, + "grad_norm": 2.3015635013580322, + "learning_rate": 1.991715600509025e-05, + "loss": 1.1655, + "step": 10952 + }, + { + "epoch": 0.13692842321058027, + "grad_norm": 4.5128045082092285, + "learning_rate": 1.9917043867842175e-05, + "loss": 1.9824, + "step": 10954 + }, + { + "epoch": 0.1369534238355959, + "grad_norm": 5.037839889526367, + "learning_rate": 1.9916931655067296e-05, + "loss": 1.4899, + "step": 10956 + }, + { + "epoch": 0.1369784244606115, + "grad_norm": 2.7435100078582764, + "learning_rate": 1.991681936676647e-05, + "loss": 0.8474, + "step": 10958 + }, + { + "epoch": 0.13700342508562713, + "grad_norm": 2.3010644912719727, + "learning_rate": 1.9916707002940547e-05, + "loss": 1.01, + "step": 10960 + }, + { + "epoch": 0.13702842571064278, + "grad_norm": 2.0979299545288086, + "learning_rate": 1.991659456359039e-05, + "loss": 1.3647, + "step": 10962 + }, + { + "epoch": 0.1370534263356584, + "grad_norm": 2.7218799591064453, + "learning_rate": 1.9916482048716845e-05, + "loss": 1.7652, + "step": 10964 + }, + { + "epoch": 0.13707842696067402, + "grad_norm": 0.13445162773132324, + "learning_rate": 1.991636945832078e-05, + "loss": 0.0768, + "step": 10966 + }, + { + "epoch": 0.13710342758568964, + "grad_norm": 4.292069435119629, + "learning_rate": 1.9916256792403046e-05, + "loss": 1.0709, + "step": 10968 + }, + { + "epoch": 0.13712842821070526, + "grad_norm": 3.8814804553985596, + "learning_rate": 1.99161440509645e-05, + "loss": 0.9707, + "step": 10970 + }, + { + "epoch": 0.1371534288357209, + "grad_norm": 3.50545334815979, + "learning_rate": 1.9916031234006006e-05, + "loss": 1.2368, + "step": 10972 + }, + { + "epoch": 0.13717842946073652, + "grad_norm": 3.096729278564453, + "learning_rate": 1.991591834152842e-05, + "loss": 0.8268, + "step": 10974 + }, + { + "epoch": 0.13720343008575214, + "grad_norm": 5.8075103759765625, + "learning_rate": 1.9915805373532602e-05, + "loss": 1.549, + "step": 10976 + }, + { + "epoch": 0.13722843071076776, + "grad_norm": 0.0036009575705975294, + "learning_rate": 1.9915692330019413e-05, + "loss": 0.4095, + "step": 10978 + }, + { + "epoch": 0.13725343133578338, + "grad_norm": 4.048880100250244, + "learning_rate": 1.9915579210989712e-05, + "loss": 1.3804, + "step": 10980 + }, + { + "epoch": 0.13727843196079903, + "grad_norm": 2.880645275115967, + "learning_rate": 1.991546601644436e-05, + "loss": 0.7062, + "step": 10982 + }, + { + "epoch": 0.13730343258581465, + "grad_norm": 4.453183650970459, + "learning_rate": 1.9915352746384228e-05, + "loss": 1.4338, + "step": 10984 + }, + { + "epoch": 0.13732843321083027, + "grad_norm": 0.0016518976772204041, + "learning_rate": 1.991523940081017e-05, + "loss": 0.0001, + "step": 10986 + }, + { + "epoch": 0.1373534338358459, + "grad_norm": 3.598797559738159, + "learning_rate": 1.9915125979723044e-05, + "loss": 1.3237, + "step": 10988 + }, + { + "epoch": 0.1373784344608615, + "grad_norm": 2.769779920578003, + "learning_rate": 1.9915012483123725e-05, + "loss": 0.4976, + "step": 10990 + }, + { + "epoch": 0.13740343508587716, + "grad_norm": 2.2689759731292725, + "learning_rate": 1.9914898911013072e-05, + "loss": 1.5912, + "step": 10992 + }, + { + "epoch": 0.13742843571089278, + "grad_norm": 0.03649895265698433, + "learning_rate": 1.9914785263391953e-05, + "loss": 0.2209, + "step": 10994 + }, + { + "epoch": 0.1374534363359084, + "grad_norm": 4.782425880432129, + "learning_rate": 1.991467154026123e-05, + "loss": 1.3891, + "step": 10996 + }, + { + "epoch": 0.13747843696092402, + "grad_norm": 2.4052486419677734, + "learning_rate": 1.991455774162177e-05, + "loss": 1.2888, + "step": 10998 + }, + { + "epoch": 0.13750343758593964, + "grad_norm": 8.359009742736816, + "learning_rate": 1.9914443867474442e-05, + "loss": 0.6648, + "step": 11000 + }, + { + "epoch": 0.13752843821095528, + "grad_norm": 3.9733033180236816, + "learning_rate": 1.9914329917820113e-05, + "loss": 1.0185, + "step": 11002 + }, + { + "epoch": 0.1375534388359709, + "grad_norm": 2.199814796447754, + "learning_rate": 1.9914215892659645e-05, + "loss": 0.4874, + "step": 11004 + }, + { + "epoch": 0.13757843946098652, + "grad_norm": 2.5065133571624756, + "learning_rate": 1.9914101791993914e-05, + "loss": 0.4929, + "step": 11006 + }, + { + "epoch": 0.13760344008600214, + "grad_norm": 3.2151708602905273, + "learning_rate": 1.9913987615823785e-05, + "loss": 0.6848, + "step": 11008 + }, + { + "epoch": 0.13762844071101776, + "grad_norm": 2.4418914318084717, + "learning_rate": 1.991387336415013e-05, + "loss": 0.6601, + "step": 11010 + }, + { + "epoch": 0.1376534413360334, + "grad_norm": 3.15381121635437, + "learning_rate": 1.991375903697381e-05, + "loss": 1.3248, + "step": 11012 + }, + { + "epoch": 0.13767844196104903, + "grad_norm": 3.9036693572998047, + "learning_rate": 1.9913644634295712e-05, + "loss": 1.0266, + "step": 11014 + }, + { + "epoch": 0.13770344258606465, + "grad_norm": 3.41221284866333, + "learning_rate": 1.9913530156116697e-05, + "loss": 1.1197, + "step": 11016 + }, + { + "epoch": 0.13772844321108027, + "grad_norm": 3.9912731647491455, + "learning_rate": 1.9913415602437634e-05, + "loss": 0.2038, + "step": 11018 + }, + { + "epoch": 0.1377534438360959, + "grad_norm": 4.695002555847168, + "learning_rate": 1.99133009732594e-05, + "loss": 1.6729, + "step": 11020 + }, + { + "epoch": 0.13777844446111154, + "grad_norm": 3.116823196411133, + "learning_rate": 1.9913186268582868e-05, + "loss": 0.9958, + "step": 11022 + }, + { + "epoch": 0.13780344508612716, + "grad_norm": 2.1564760208129883, + "learning_rate": 1.9913071488408916e-05, + "loss": 1.325, + "step": 11024 + }, + { + "epoch": 0.13782844571114278, + "grad_norm": 3.132874011993408, + "learning_rate": 1.991295663273841e-05, + "loss": 1.0039, + "step": 11026 + }, + { + "epoch": 0.1378534463361584, + "grad_norm": 2.197901487350464, + "learning_rate": 1.991284170157223e-05, + "loss": 0.0384, + "step": 11028 + }, + { + "epoch": 0.13787844696117402, + "grad_norm": 5.347640037536621, + "learning_rate": 1.9912726694911245e-05, + "loss": 2.0563, + "step": 11030 + }, + { + "epoch": 0.13790344758618966, + "grad_norm": 2.9187827110290527, + "learning_rate": 1.991261161275634e-05, + "loss": 1.4143, + "step": 11032 + }, + { + "epoch": 0.13792844821120528, + "grad_norm": 0.002513280138373375, + "learning_rate": 1.9912496455108382e-05, + "loss": 0.2283, + "step": 11034 + }, + { + "epoch": 0.1379534488362209, + "grad_norm": 3.1987969875335693, + "learning_rate": 1.9912381221968254e-05, + "loss": 2.294, + "step": 11036 + }, + { + "epoch": 0.13797844946123652, + "grad_norm": 2.6629459857940674, + "learning_rate": 1.9912265913336836e-05, + "loss": 0.9116, + "step": 11038 + }, + { + "epoch": 0.13800345008625214, + "grad_norm": 2.892085075378418, + "learning_rate": 1.9912150529215e-05, + "loss": 0.3254, + "step": 11040 + }, + { + "epoch": 0.1380284507112678, + "grad_norm": 2.2450740337371826, + "learning_rate": 1.991203506960363e-05, + "loss": 0.5784, + "step": 11042 + }, + { + "epoch": 0.1380534513362834, + "grad_norm": 0.6861468553543091, + "learning_rate": 1.9911919534503597e-05, + "loss": 0.5077, + "step": 11044 + }, + { + "epoch": 0.13807845196129903, + "grad_norm": 3.178921937942505, + "learning_rate": 1.991180392391579e-05, + "loss": 0.3864, + "step": 11046 + }, + { + "epoch": 0.13810345258631465, + "grad_norm": 3.3858790397644043, + "learning_rate": 1.9911688237841087e-05, + "loss": 1.5444, + "step": 11048 + }, + { + "epoch": 0.13812845321133027, + "grad_norm": 2.9406497478485107, + "learning_rate": 1.9911572476280366e-05, + "loss": 1.1743, + "step": 11050 + }, + { + "epoch": 0.13815345383634592, + "grad_norm": 2.790912628173828, + "learning_rate": 1.991145663923451e-05, + "loss": 0.3744, + "step": 11052 + }, + { + "epoch": 0.13817845446136154, + "grad_norm": 4.8597917556762695, + "learning_rate": 1.9911340726704402e-05, + "loss": 0.675, + "step": 11054 + }, + { + "epoch": 0.13820345508637716, + "grad_norm": 1.4190031290054321, + "learning_rate": 1.9911224738690927e-05, + "loss": 1.1539, + "step": 11056 + }, + { + "epoch": 0.13822845571139278, + "grad_norm": 2.0652730464935303, + "learning_rate": 1.9911108675194965e-05, + "loss": 0.3053, + "step": 11058 + }, + { + "epoch": 0.1382534563364084, + "grad_norm": 3.5269739627838135, + "learning_rate": 1.99109925362174e-05, + "loss": 0.8802, + "step": 11060 + }, + { + "epoch": 0.13827845696142405, + "grad_norm": 2.721553087234497, + "learning_rate": 1.991087632175912e-05, + "loss": 0.3515, + "step": 11062 + }, + { + "epoch": 0.13830345758643967, + "grad_norm": 0.004100941587239504, + "learning_rate": 1.9910760031821005e-05, + "loss": 0.6047, + "step": 11064 + }, + { + "epoch": 0.13832845821145529, + "grad_norm": 3.354111671447754, + "learning_rate": 1.991064366640394e-05, + "loss": 1.3655, + "step": 11066 + }, + { + "epoch": 0.1383534588364709, + "grad_norm": 2.5347392559051514, + "learning_rate": 1.991052722550882e-05, + "loss": 1.0989, + "step": 11068 + }, + { + "epoch": 0.13837845946148652, + "grad_norm": 1.9347455501556396, + "learning_rate": 1.9910410709136525e-05, + "loss": 1.6744, + "step": 11070 + }, + { + "epoch": 0.13840346008650217, + "grad_norm": 5.599861145019531, + "learning_rate": 1.991029411728794e-05, + "loss": 0.702, + "step": 11072 + }, + { + "epoch": 0.1384284607115178, + "grad_norm": 0.0030359828379005194, + "learning_rate": 1.991017744996396e-05, + "loss": 1.1779, + "step": 11074 + }, + { + "epoch": 0.1384534613365334, + "grad_norm": 2.251044750213623, + "learning_rate": 1.9910060707165468e-05, + "loss": 0.388, + "step": 11076 + }, + { + "epoch": 0.13847846196154903, + "grad_norm": 4.623195171356201, + "learning_rate": 1.9909943888893358e-05, + "loss": 0.8508, + "step": 11078 + }, + { + "epoch": 0.13850346258656465, + "grad_norm": 4.140858173370361, + "learning_rate": 1.9909826995148515e-05, + "loss": 1.0719, + "step": 11080 + }, + { + "epoch": 0.1385284632115803, + "grad_norm": 5.77186393737793, + "learning_rate": 1.9909710025931826e-05, + "loss": 1.4075, + "step": 11082 + }, + { + "epoch": 0.13855346383659592, + "grad_norm": 4.415901184082031, + "learning_rate": 1.990959298124419e-05, + "loss": 0.5848, + "step": 11084 + }, + { + "epoch": 0.13857846446161154, + "grad_norm": 2.626784324645996, + "learning_rate": 1.99094758610865e-05, + "loss": 0.8746, + "step": 11086 + }, + { + "epoch": 0.13860346508662716, + "grad_norm": 3.162498950958252, + "learning_rate": 1.990935866545964e-05, + "loss": 0.4726, + "step": 11088 + }, + { + "epoch": 0.13862846571164278, + "grad_norm": 4.347777843475342, + "learning_rate": 1.9909241394364508e-05, + "loss": 0.5623, + "step": 11090 + }, + { + "epoch": 0.13865346633665843, + "grad_norm": 0.00996879767626524, + "learning_rate": 1.990912404780199e-05, + "loss": 0.1215, + "step": 11092 + }, + { + "epoch": 0.13867846696167405, + "grad_norm": 3.050621271133423, + "learning_rate": 1.9909006625772988e-05, + "loss": 1.624, + "step": 11094 + }, + { + "epoch": 0.13870346758668967, + "grad_norm": 2.718655824661255, + "learning_rate": 1.990888912827839e-05, + "loss": 1.8461, + "step": 11096 + }, + { + "epoch": 0.13872846821170529, + "grad_norm": 4.4662909507751465, + "learning_rate": 1.9908771555319095e-05, + "loss": 1.5081, + "step": 11098 + }, + { + "epoch": 0.1387534688367209, + "grad_norm": 3.6793594360351562, + "learning_rate": 1.9908653906895997e-05, + "loss": 1.1306, + "step": 11100 + }, + { + "epoch": 0.13877846946173655, + "grad_norm": 0.001600124523974955, + "learning_rate": 1.9908536183009996e-05, + "loss": 0.0003, + "step": 11102 + }, + { + "epoch": 0.13880347008675217, + "grad_norm": 5.630105972290039, + "learning_rate": 1.990841838366198e-05, + "loss": 0.4167, + "step": 11104 + }, + { + "epoch": 0.1388284707117678, + "grad_norm": 3.0023958683013916, + "learning_rate": 1.9908300508852848e-05, + "loss": 0.5534, + "step": 11106 + }, + { + "epoch": 0.1388534713367834, + "grad_norm": 5.601396083831787, + "learning_rate": 1.9908182558583505e-05, + "loss": 1.2481, + "step": 11108 + }, + { + "epoch": 0.13887847196179903, + "grad_norm": 2.419083595275879, + "learning_rate": 1.9908064532854846e-05, + "loss": 0.6047, + "step": 11110 + }, + { + "epoch": 0.13890347258681468, + "grad_norm": 2.834573984146118, + "learning_rate": 1.9907946431667765e-05, + "loss": 1.5245, + "step": 11112 + }, + { + "epoch": 0.1389284732118303, + "grad_norm": 3.479074001312256, + "learning_rate": 1.9907828255023168e-05, + "loss": 0.7048, + "step": 11114 + }, + { + "epoch": 0.13895347383684592, + "grad_norm": 5.734103202819824, + "learning_rate": 1.9907710002921952e-05, + "loss": 1.3192, + "step": 11116 + }, + { + "epoch": 0.13897847446186154, + "grad_norm": 2.5168867111206055, + "learning_rate": 1.9907591675365015e-05, + "loss": 1.2982, + "step": 11118 + }, + { + "epoch": 0.13900347508687716, + "grad_norm": 2.4847564697265625, + "learning_rate": 1.9907473272353264e-05, + "loss": 1.5448, + "step": 11120 + }, + { + "epoch": 0.1390284757118928, + "grad_norm": 0.005796385928988457, + "learning_rate": 1.9907354793887596e-05, + "loss": 1.0059, + "step": 11122 + }, + { + "epoch": 0.13905347633690843, + "grad_norm": 5.780514717102051, + "learning_rate": 1.9907236239968913e-05, + "loss": 0.3203, + "step": 11124 + }, + { + "epoch": 0.13907847696192405, + "grad_norm": 3.7186243534088135, + "learning_rate": 1.9907117610598124e-05, + "loss": 1.761, + "step": 11126 + }, + { + "epoch": 0.13910347758693967, + "grad_norm": 0.00823988951742649, + "learning_rate": 1.9906998905776124e-05, + "loss": 0.4185, + "step": 11128 + }, + { + "epoch": 0.1391284782119553, + "grad_norm": 0.008870826102793217, + "learning_rate": 1.9906880125503824e-05, + "loss": 0.0002, + "step": 11130 + }, + { + "epoch": 0.13915347883697093, + "grad_norm": 2.7059621810913086, + "learning_rate": 1.9906761269782125e-05, + "loss": 1.1385, + "step": 11132 + }, + { + "epoch": 0.13917847946198655, + "grad_norm": 2.221313714981079, + "learning_rate": 1.9906642338611936e-05, + "loss": 1.0908, + "step": 11134 + }, + { + "epoch": 0.13920348008700217, + "grad_norm": 2.974107027053833, + "learning_rate": 1.9906523331994157e-05, + "loss": 0.7419, + "step": 11136 + }, + { + "epoch": 0.1392284807120178, + "grad_norm": 4.849024295806885, + "learning_rate": 1.9906404249929696e-05, + "loss": 0.8164, + "step": 11138 + }, + { + "epoch": 0.1392534813370334, + "grad_norm": 3.2038679122924805, + "learning_rate": 1.9906285092419462e-05, + "loss": 0.9635, + "step": 11140 + }, + { + "epoch": 0.13927848196204906, + "grad_norm": 1.9682526588439941, + "learning_rate": 1.9906165859464362e-05, + "loss": 1.6008, + "step": 11142 + }, + { + "epoch": 0.13930348258706468, + "grad_norm": 2.2664437294006348, + "learning_rate": 1.9906046551065305e-05, + "loss": 1.2837, + "step": 11144 + }, + { + "epoch": 0.1393284832120803, + "grad_norm": 5.400205135345459, + "learning_rate": 1.99059271672232e-05, + "loss": 1.1046, + "step": 11146 + }, + { + "epoch": 0.13935348383709592, + "grad_norm": 3.7623674869537354, + "learning_rate": 1.9905807707938947e-05, + "loss": 1.2695, + "step": 11148 + }, + { + "epoch": 0.13937848446211154, + "grad_norm": 1.9780195951461792, + "learning_rate": 1.990568817321347e-05, + "loss": 1.8299, + "step": 11150 + }, + { + "epoch": 0.1394034850871272, + "grad_norm": 1.4267035722732544, + "learning_rate": 1.990556856304767e-05, + "loss": 0.8873, + "step": 11152 + }, + { + "epoch": 0.1394284857121428, + "grad_norm": 3.9336397647857666, + "learning_rate": 1.9905448877442462e-05, + "loss": 0.8232, + "step": 11154 + }, + { + "epoch": 0.13945348633715843, + "grad_norm": 0.742182195186615, + "learning_rate": 1.9905329116398756e-05, + "loss": 0.0404, + "step": 11156 + }, + { + "epoch": 0.13947848696217405, + "grad_norm": 2.509113311767578, + "learning_rate": 1.9905209279917462e-05, + "loss": 0.4436, + "step": 11158 + }, + { + "epoch": 0.13950348758718967, + "grad_norm": 2.4210715293884277, + "learning_rate": 1.99050893679995e-05, + "loss": 0.4724, + "step": 11160 + }, + { + "epoch": 0.13952848821220531, + "grad_norm": 4.7202935218811035, + "learning_rate": 1.990496938064577e-05, + "loss": 0.6599, + "step": 11162 + }, + { + "epoch": 0.13955348883722093, + "grad_norm": 2.3895797729492188, + "learning_rate": 1.99048493178572e-05, + "loss": 1.9279, + "step": 11164 + }, + { + "epoch": 0.13957848946223655, + "grad_norm": 2.0690882205963135, + "learning_rate": 1.9904729179634697e-05, + "loss": 0.276, + "step": 11166 + }, + { + "epoch": 0.13960349008725217, + "grad_norm": 1.8931005001068115, + "learning_rate": 1.9904608965979176e-05, + "loss": 0.121, + "step": 11168 + }, + { + "epoch": 0.1396284907122678, + "grad_norm": 3.7108893394470215, + "learning_rate": 1.9904488676891556e-05, + "loss": 0.3036, + "step": 11170 + }, + { + "epoch": 0.13965349133728344, + "grad_norm": 6.317174911499023, + "learning_rate": 1.990436831237275e-05, + "loss": 1.5987, + "step": 11172 + }, + { + "epoch": 0.13967849196229906, + "grad_norm": 1.4245857000350952, + "learning_rate": 1.9904247872423675e-05, + "loss": 0.0524, + "step": 11174 + }, + { + "epoch": 0.13970349258731468, + "grad_norm": 0.0170129407197237, + "learning_rate": 1.9904127357045247e-05, + "loss": 0.0531, + "step": 11176 + }, + { + "epoch": 0.1397284932123303, + "grad_norm": 0.6061741709709167, + "learning_rate": 1.9904006766238385e-05, + "loss": 0.4558, + "step": 11178 + }, + { + "epoch": 0.13975349383734592, + "grad_norm": 2.677748918533325, + "learning_rate": 1.9903886100004013e-05, + "loss": 0.7311, + "step": 11180 + }, + { + "epoch": 0.13977849446236157, + "grad_norm": 5.744621276855469, + "learning_rate": 1.990376535834304e-05, + "loss": 0.8309, + "step": 11182 + }, + { + "epoch": 0.1398034950873772, + "grad_norm": 0.002012468408793211, + "learning_rate": 1.9903644541256395e-05, + "loss": 0.5013, + "step": 11184 + }, + { + "epoch": 0.1398284957123928, + "grad_norm": 2.8419294357299805, + "learning_rate": 1.9903523648744994e-05, + "loss": 0.9772, + "step": 11186 + }, + { + "epoch": 0.13985349633740843, + "grad_norm": 6.690624713897705, + "learning_rate": 1.9903402680809754e-05, + "loss": 0.9022, + "step": 11188 + }, + { + "epoch": 0.13987849696242405, + "grad_norm": 2.8223986625671387, + "learning_rate": 1.99032816374516e-05, + "loss": 1.1685, + "step": 11190 + }, + { + "epoch": 0.1399034975874397, + "grad_norm": 2.650599956512451, + "learning_rate": 1.9903160518671454e-05, + "loss": 1.1764, + "step": 11192 + }, + { + "epoch": 0.13992849821245532, + "grad_norm": 3.6467201709747314, + "learning_rate": 1.9903039324470237e-05, + "loss": 0.8088, + "step": 11194 + }, + { + "epoch": 0.13995349883747094, + "grad_norm": 8.786619186401367, + "learning_rate": 1.9902918054848875e-05, + "loss": 0.3714, + "step": 11196 + }, + { + "epoch": 0.13997849946248656, + "grad_norm": 2.9778494834899902, + "learning_rate": 1.9902796709808288e-05, + "loss": 1.3214, + "step": 11198 + }, + { + "epoch": 0.14000350008750218, + "grad_norm": 6.295504570007324, + "learning_rate": 1.9902675289349402e-05, + "loss": 0.4939, + "step": 11200 + }, + { + "epoch": 0.14002850071251782, + "grad_norm": 4.342469692230225, + "learning_rate": 1.9902553793473143e-05, + "loss": 0.8551, + "step": 11202 + }, + { + "epoch": 0.14005350133753344, + "grad_norm": 4.440917491912842, + "learning_rate": 1.9902432222180434e-05, + "loss": 1.0646, + "step": 11204 + }, + { + "epoch": 0.14007850196254906, + "grad_norm": 0.0012876649852842093, + "learning_rate": 1.99023105754722e-05, + "loss": 1.408, + "step": 11206 + }, + { + "epoch": 0.14010350258756468, + "grad_norm": 6.8308424949646, + "learning_rate": 1.9902188853349368e-05, + "loss": 0.9166, + "step": 11208 + }, + { + "epoch": 0.1401285032125803, + "grad_norm": 1.0728638172149658, + "learning_rate": 1.990206705581287e-05, + "loss": 0.4434, + "step": 11210 + }, + { + "epoch": 0.14015350383759595, + "grad_norm": 0.0017349119298160076, + "learning_rate": 1.9901945182863625e-05, + "loss": 0.7793, + "step": 11212 + }, + { + "epoch": 0.14017850446261157, + "grad_norm": 2.5575873851776123, + "learning_rate": 1.990182323450257e-05, + "loss": 0.8562, + "step": 11214 + }, + { + "epoch": 0.1402035050876272, + "grad_norm": 2.8816261291503906, + "learning_rate": 1.990170121073063e-05, + "loss": 0.9048, + "step": 11216 + }, + { + "epoch": 0.1402285057126428, + "grad_norm": 3.40094256401062, + "learning_rate": 1.990157911154873e-05, + "loss": 2.0004, + "step": 11218 + }, + { + "epoch": 0.14025350633765843, + "grad_norm": 2.761230945587158, + "learning_rate": 1.9901456936957802e-05, + "loss": 1.2977, + "step": 11220 + }, + { + "epoch": 0.14027850696267408, + "grad_norm": 1.8202725648880005, + "learning_rate": 1.9901334686958782e-05, + "loss": 0.294, + "step": 11222 + }, + { + "epoch": 0.1403035075876897, + "grad_norm": 1.828254222869873, + "learning_rate": 1.9901212361552596e-05, + "loss": 1.4005, + "step": 11224 + }, + { + "epoch": 0.14032850821270532, + "grad_norm": 4.041133403778076, + "learning_rate": 1.990108996074018e-05, + "loss": 1.7601, + "step": 11226 + }, + { + "epoch": 0.14035350883772094, + "grad_norm": 2.56097412109375, + "learning_rate": 1.990096748452246e-05, + "loss": 0.6551, + "step": 11228 + }, + { + "epoch": 0.14037850946273656, + "grad_norm": 4.630360126495361, + "learning_rate": 1.9900844932900368e-05, + "loss": 1.7372, + "step": 11230 + }, + { + "epoch": 0.1404035100877522, + "grad_norm": 2.988250494003296, + "learning_rate": 1.9900722305874847e-05, + "loss": 1.2431, + "step": 11232 + }, + { + "epoch": 0.14042851071276782, + "grad_norm": 2.4891879558563232, + "learning_rate": 1.9900599603446822e-05, + "loss": 0.6373, + "step": 11234 + }, + { + "epoch": 0.14045351133778344, + "grad_norm": 0.37136125564575195, + "learning_rate": 1.9900476825617235e-05, + "loss": 0.9282, + "step": 11236 + }, + { + "epoch": 0.14047851196279906, + "grad_norm": 2.8725686073303223, + "learning_rate": 1.990035397238701e-05, + "loss": 0.348, + "step": 11238 + }, + { + "epoch": 0.14050351258781468, + "grad_norm": 5.052865982055664, + "learning_rate": 1.9900231043757093e-05, + "loss": 1.7017, + "step": 11240 + }, + { + "epoch": 0.14052851321283033, + "grad_norm": 3.3934459686279297, + "learning_rate": 1.9900108039728415e-05, + "loss": 0.3929, + "step": 11242 + }, + { + "epoch": 0.14055351383784595, + "grad_norm": 2.9321930408477783, + "learning_rate": 1.9899984960301915e-05, + "loss": 1.3106, + "step": 11244 + }, + { + "epoch": 0.14057851446286157, + "grad_norm": 3.4127197265625, + "learning_rate": 1.989986180547853e-05, + "loss": 0.7253, + "step": 11246 + }, + { + "epoch": 0.1406035150878772, + "grad_norm": 3.7348203659057617, + "learning_rate": 1.9899738575259196e-05, + "loss": 1.6156, + "step": 11248 + }, + { + "epoch": 0.1406285157128928, + "grad_norm": 0.5329709053039551, + "learning_rate": 1.9899615269644854e-05, + "loss": 0.3022, + "step": 11250 + }, + { + "epoch": 0.14065351633790846, + "grad_norm": 1.151768445968628, + "learning_rate": 1.989949188863644e-05, + "loss": 0.0377, + "step": 11252 + }, + { + "epoch": 0.14067851696292408, + "grad_norm": 2.806713819503784, + "learning_rate": 1.98993684322349e-05, + "loss": 1.1664, + "step": 11254 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 3.1594419479370117, + "learning_rate": 1.9899244900441168e-05, + "loss": 0.8357, + "step": 11256 + }, + { + "epoch": 0.14072851821295532, + "grad_norm": 0.10252418369054794, + "learning_rate": 1.9899121293256184e-05, + "loss": 1.2234, + "step": 11258 + }, + { + "epoch": 0.14075351883797094, + "grad_norm": 2.3625195026397705, + "learning_rate": 1.9898997610680894e-05, + "loss": 0.9124, + "step": 11260 + }, + { + "epoch": 0.14077851946298658, + "grad_norm": 2.182224750518799, + "learning_rate": 1.989887385271624e-05, + "loss": 0.744, + "step": 11262 + }, + { + "epoch": 0.1408035200880022, + "grad_norm": 2.0660486221313477, + "learning_rate": 1.9898750019363163e-05, + "loss": 0.4471, + "step": 11264 + }, + { + "epoch": 0.14082852071301782, + "grad_norm": 2.85976243019104, + "learning_rate": 1.9898626110622602e-05, + "loss": 1.4661, + "step": 11266 + }, + { + "epoch": 0.14085352133803344, + "grad_norm": 4.933412075042725, + "learning_rate": 1.9898502126495505e-05, + "loss": 1.0929, + "step": 11268 + }, + { + "epoch": 0.14087852196304906, + "grad_norm": 3.630918502807617, + "learning_rate": 1.989837806698282e-05, + "loss": 1.6097, + "step": 11270 + }, + { + "epoch": 0.1409035225880647, + "grad_norm": 2.6487598419189453, + "learning_rate": 1.989825393208548e-05, + "loss": 1.1909, + "step": 11272 + }, + { + "epoch": 0.14092852321308033, + "grad_norm": 3.5230860710144043, + "learning_rate": 1.9898129721804443e-05, + "loss": 0.876, + "step": 11274 + }, + { + "epoch": 0.14095352383809595, + "grad_norm": 4.921627521514893, + "learning_rate": 1.9898005436140648e-05, + "loss": 0.4315, + "step": 11276 + }, + { + "epoch": 0.14097852446311157, + "grad_norm": 4.0705742835998535, + "learning_rate": 1.9897881075095044e-05, + "loss": 1.2983, + "step": 11278 + }, + { + "epoch": 0.1410035250881272, + "grad_norm": 0.010091247968375683, + "learning_rate": 1.9897756638668576e-05, + "loss": 0.4754, + "step": 11280 + }, + { + "epoch": 0.14102852571314284, + "grad_norm": 4.04973840713501, + "learning_rate": 1.9897632126862193e-05, + "loss": 1.9846, + "step": 11282 + }, + { + "epoch": 0.14105352633815846, + "grad_norm": 2.7493321895599365, + "learning_rate": 1.9897507539676847e-05, + "loss": 1.672, + "step": 11284 + }, + { + "epoch": 0.14107852696317408, + "grad_norm": 2.9317848682403564, + "learning_rate": 1.989738287711348e-05, + "loss": 0.4572, + "step": 11286 + }, + { + "epoch": 0.1411035275881897, + "grad_norm": 5.25828218460083, + "learning_rate": 1.9897258139173045e-05, + "loss": 1.8991, + "step": 11288 + }, + { + "epoch": 0.14112852821320532, + "grad_norm": 2.808894157409668, + "learning_rate": 1.9897133325856492e-05, + "loss": 0.657, + "step": 11290 + }, + { + "epoch": 0.14115352883822097, + "grad_norm": 0.15364013612270355, + "learning_rate": 1.9897008437164774e-05, + "loss": 0.7847, + "step": 11292 + }, + { + "epoch": 0.14117852946323659, + "grad_norm": 1.996978998184204, + "learning_rate": 1.9896883473098834e-05, + "loss": 0.771, + "step": 11294 + }, + { + "epoch": 0.1412035300882522, + "grad_norm": 0.777292013168335, + "learning_rate": 1.989675843365963e-05, + "loss": 0.3662, + "step": 11296 + }, + { + "epoch": 0.14122853071326782, + "grad_norm": 1.7757328748703003, + "learning_rate": 1.9896633318848116e-05, + "loss": 0.7511, + "step": 11298 + }, + { + "epoch": 0.14125353133828344, + "grad_norm": 5.4035773277282715, + "learning_rate": 1.989650812866524e-05, + "loss": 0.8918, + "step": 11300 + }, + { + "epoch": 0.1412785319632991, + "grad_norm": 5.264878273010254, + "learning_rate": 1.989638286311196e-05, + "loss": 1.3031, + "step": 11302 + }, + { + "epoch": 0.1413035325883147, + "grad_norm": 0.01178129855543375, + "learning_rate": 1.9896257522189226e-05, + "loss": 0.5865, + "step": 11304 + }, + { + "epoch": 0.14132853321333033, + "grad_norm": 3.282809257507324, + "learning_rate": 1.9896132105897995e-05, + "loss": 0.5718, + "step": 11306 + }, + { + "epoch": 0.14135353383834595, + "grad_norm": 0.8527539968490601, + "learning_rate": 1.989600661423922e-05, + "loss": 0.0377, + "step": 11308 + }, + { + "epoch": 0.14137853446336157, + "grad_norm": 3.227424383163452, + "learning_rate": 1.989588104721386e-05, + "loss": 1.58, + "step": 11310 + }, + { + "epoch": 0.14140353508837722, + "grad_norm": 2.5177876949310303, + "learning_rate": 1.9895755404822867e-05, + "loss": 1.3808, + "step": 11312 + }, + { + "epoch": 0.14142853571339284, + "grad_norm": 5.6594719886779785, + "learning_rate": 1.9895629687067203e-05, + "loss": 1.3162, + "step": 11314 + }, + { + "epoch": 0.14145353633840846, + "grad_norm": 3.9468934535980225, + "learning_rate": 1.989550389394782e-05, + "loss": 1.7518, + "step": 11316 + }, + { + "epoch": 0.14147853696342408, + "grad_norm": 5.519913673400879, + "learning_rate": 1.989537802546568e-05, + "loss": 1.114, + "step": 11318 + }, + { + "epoch": 0.1415035375884397, + "grad_norm": 3.697131633758545, + "learning_rate": 1.989525208162174e-05, + "loss": 1.7394, + "step": 11320 + }, + { + "epoch": 0.14152853821345535, + "grad_norm": 9.63033676147461, + "learning_rate": 1.9895126062416956e-05, + "loss": 0.9419, + "step": 11322 + }, + { + "epoch": 0.14155353883847097, + "grad_norm": 5.274528980255127, + "learning_rate": 1.9894999967852296e-05, + "loss": 1.178, + "step": 11324 + }, + { + "epoch": 0.14157853946348659, + "grad_norm": 0.0031006166245788336, + "learning_rate": 1.9894873797928717e-05, + "loss": 0.5165, + "step": 11326 + }, + { + "epoch": 0.1416035400885022, + "grad_norm": 3.9736361503601074, + "learning_rate": 1.9894747552647175e-05, + "loss": 0.9243, + "step": 11328 + }, + { + "epoch": 0.14162854071351783, + "grad_norm": 3.537130832672119, + "learning_rate": 1.989462123200863e-05, + "loss": 0.5819, + "step": 11330 + }, + { + "epoch": 0.14165354133853347, + "grad_norm": 1.886799693107605, + "learning_rate": 1.989449483601406e-05, + "loss": 1.0959, + "step": 11332 + }, + { + "epoch": 0.1416785419635491, + "grad_norm": 1.904107928276062, + "learning_rate": 1.9894368364664407e-05, + "loss": 0.7336, + "step": 11334 + }, + { + "epoch": 0.1417035425885647, + "grad_norm": 3.710535764694214, + "learning_rate": 1.9894241817960648e-05, + "loss": 2.072, + "step": 11336 + }, + { + "epoch": 0.14172854321358033, + "grad_norm": 2.0507500171661377, + "learning_rate": 1.9894115195903746e-05, + "loss": 0.6791, + "step": 11338 + }, + { + "epoch": 0.14175354383859595, + "grad_norm": 2.2423505783081055, + "learning_rate": 1.9893988498494658e-05, + "loss": 0.8026, + "step": 11340 + }, + { + "epoch": 0.1417785444636116, + "grad_norm": 3.3746449947357178, + "learning_rate": 1.989386172573435e-05, + "loss": 0.8808, + "step": 11342 + }, + { + "epoch": 0.14180354508862722, + "grad_norm": 4.322576522827148, + "learning_rate": 1.989373487762379e-05, + "loss": 1.5702, + "step": 11344 + }, + { + "epoch": 0.14182854571364284, + "grad_norm": 3.1200857162475586, + "learning_rate": 1.9893607954163948e-05, + "loss": 0.8175, + "step": 11346 + }, + { + "epoch": 0.14185354633865846, + "grad_norm": 4.367031574249268, + "learning_rate": 1.9893480955355787e-05, + "loss": 1.5032, + "step": 11348 + }, + { + "epoch": 0.14187854696367408, + "grad_norm": 2.7845699787139893, + "learning_rate": 1.9893353881200274e-05, + "loss": 1.6701, + "step": 11350 + }, + { + "epoch": 0.14190354758868973, + "grad_norm": 0.0015445094322785735, + "learning_rate": 1.9893226731698375e-05, + "loss": 0.4509, + "step": 11352 + }, + { + "epoch": 0.14192854821370535, + "grad_norm": 2.4206695556640625, + "learning_rate": 1.9893099506851063e-05, + "loss": 1.3186, + "step": 11354 + }, + { + "epoch": 0.14195354883872097, + "grad_norm": 6.705798149108887, + "learning_rate": 1.9892972206659303e-05, + "loss": 2.488, + "step": 11356 + }, + { + "epoch": 0.1419785494637366, + "grad_norm": 2.825265407562256, + "learning_rate": 1.9892844831124063e-05, + "loss": 1.245, + "step": 11358 + }, + { + "epoch": 0.1420035500887522, + "grad_norm": 0.005312612280249596, + "learning_rate": 1.9892717380246317e-05, + "loss": 0.5825, + "step": 11360 + }, + { + "epoch": 0.14202855071376785, + "grad_norm": 5.094935894012451, + "learning_rate": 1.9892589854027037e-05, + "loss": 0.5611, + "step": 11362 + }, + { + "epoch": 0.14205355133878347, + "grad_norm": 2.532505512237549, + "learning_rate": 1.9892462252467188e-05, + "loss": 0.0835, + "step": 11364 + }, + { + "epoch": 0.1420785519637991, + "grad_norm": 2.6651387214660645, + "learning_rate": 1.989233457556775e-05, + "loss": 0.6013, + "step": 11366 + }, + { + "epoch": 0.1421035525888147, + "grad_norm": 9.632932662963867, + "learning_rate": 1.9892206823329688e-05, + "loss": 1.392, + "step": 11368 + }, + { + "epoch": 0.14212855321383033, + "grad_norm": 2.515209674835205, + "learning_rate": 1.9892078995753978e-05, + "loss": 1.0426, + "step": 11370 + }, + { + "epoch": 0.14215355383884598, + "grad_norm": 1.5155339241027832, + "learning_rate": 1.9891951092841592e-05, + "loss": 0.4843, + "step": 11372 + }, + { + "epoch": 0.1421785544638616, + "grad_norm": 1.733386754989624, + "learning_rate": 1.9891823114593506e-05, + "loss": 0.1232, + "step": 11374 + }, + { + "epoch": 0.14220355508887722, + "grad_norm": 2.2354071140289307, + "learning_rate": 1.9891695061010693e-05, + "loss": 0.2101, + "step": 11376 + }, + { + "epoch": 0.14222855571389284, + "grad_norm": 0.002280804794281721, + "learning_rate": 1.9891566932094132e-05, + "loss": 0.691, + "step": 11378 + }, + { + "epoch": 0.14225355633890846, + "grad_norm": 3.577545404434204, + "learning_rate": 1.9891438727844794e-05, + "loss": 1.501, + "step": 11380 + }, + { + "epoch": 0.1422785569639241, + "grad_norm": 0.007073083892464638, + "learning_rate": 1.989131044826366e-05, + "loss": 0.7078, + "step": 11382 + }, + { + "epoch": 0.14230355758893973, + "grad_norm": 3.3668763637542725, + "learning_rate": 1.9891182093351702e-05, + "loss": 0.6046, + "step": 11384 + }, + { + "epoch": 0.14232855821395535, + "grad_norm": 5.883259296417236, + "learning_rate": 1.9891053663109902e-05, + "loss": 2.1556, + "step": 11386 + }, + { + "epoch": 0.14235355883897097, + "grad_norm": 1.2125920057296753, + "learning_rate": 1.9890925157539232e-05, + "loss": 0.5086, + "step": 11388 + }, + { + "epoch": 0.1423785594639866, + "grad_norm": 4.228988170623779, + "learning_rate": 1.9890796576640676e-05, + "loss": 0.5096, + "step": 11390 + }, + { + "epoch": 0.14240356008900223, + "grad_norm": 4.203810214996338, + "learning_rate": 1.9890667920415212e-05, + "loss": 0.5176, + "step": 11392 + }, + { + "epoch": 0.14242856071401785, + "grad_norm": 2.53874135017395, + "learning_rate": 1.9890539188863823e-05, + "loss": 1.2253, + "step": 11394 + }, + { + "epoch": 0.14245356133903347, + "grad_norm": 0.021113652735948563, + "learning_rate": 1.989041038198748e-05, + "loss": 0.1001, + "step": 11396 + }, + { + "epoch": 0.1424785619640491, + "grad_norm": 3.201176404953003, + "learning_rate": 1.989028149978718e-05, + "loss": 1.0875, + "step": 11398 + }, + { + "epoch": 0.14250356258906471, + "grad_norm": 3.4604434967041016, + "learning_rate": 1.9890152542263886e-05, + "loss": 1.3617, + "step": 11400 + }, + { + "epoch": 0.14252856321408036, + "grad_norm": 4.032627582550049, + "learning_rate": 1.989002350941859e-05, + "loss": 1.1834, + "step": 11402 + }, + { + "epoch": 0.14255356383909598, + "grad_norm": 10.190657615661621, + "learning_rate": 1.9889894401252278e-05, + "loss": 1.3188, + "step": 11404 + }, + { + "epoch": 0.1425785644641116, + "grad_norm": 0.39151906967163086, + "learning_rate": 1.9889765217765925e-05, + "loss": 0.9966, + "step": 11406 + }, + { + "epoch": 0.14260356508912722, + "grad_norm": 0.00282789277844131, + "learning_rate": 1.9889635958960517e-05, + "loss": 1.1306, + "step": 11408 + }, + { + "epoch": 0.14262856571414284, + "grad_norm": 0.037775978446006775, + "learning_rate": 1.9889506624837045e-05, + "loss": 0.3318, + "step": 11410 + }, + { + "epoch": 0.1426535663391585, + "grad_norm": 2.833796501159668, + "learning_rate": 1.9889377215396487e-05, + "loss": 0.5169, + "step": 11412 + }, + { + "epoch": 0.1426785669641741, + "grad_norm": 3.3372178077697754, + "learning_rate": 1.988924773063983e-05, + "loss": 0.9346, + "step": 11414 + }, + { + "epoch": 0.14270356758918973, + "grad_norm": 4.865937232971191, + "learning_rate": 1.988911817056806e-05, + "loss": 1.2101, + "step": 11416 + }, + { + "epoch": 0.14272856821420535, + "grad_norm": 2.360161781311035, + "learning_rate": 1.9888988535182165e-05, + "loss": 0.7978, + "step": 11418 + }, + { + "epoch": 0.14275356883922097, + "grad_norm": 0.10818052291870117, + "learning_rate": 1.9888858824483135e-05, + "loss": 0.0021, + "step": 11420 + }, + { + "epoch": 0.14277856946423662, + "grad_norm": 6.477091312408447, + "learning_rate": 1.988872903847195e-05, + "loss": 1.1047, + "step": 11422 + }, + { + "epoch": 0.14280357008925224, + "grad_norm": 1.9710638523101807, + "learning_rate": 1.9888599177149604e-05, + "loss": 0.3325, + "step": 11424 + }, + { + "epoch": 0.14282857071426786, + "grad_norm": 2.6676185131073, + "learning_rate": 1.988846924051709e-05, + "loss": 1.8475, + "step": 11426 + }, + { + "epoch": 0.14285357133928347, + "grad_norm": 0.008048349060118198, + "learning_rate": 1.9888339228575385e-05, + "loss": 0.9173, + "step": 11428 + }, + { + "epoch": 0.1428785719642991, + "grad_norm": 3.7736332416534424, + "learning_rate": 1.988820914132549e-05, + "loss": 1.3035, + "step": 11430 + }, + { + "epoch": 0.14290357258931474, + "grad_norm": 2.7774007320404053, + "learning_rate": 1.9888078978768394e-05, + "loss": 1.4741, + "step": 11432 + }, + { + "epoch": 0.14292857321433036, + "grad_norm": 2.034879446029663, + "learning_rate": 1.9887948740905087e-05, + "loss": 0.8084, + "step": 11434 + }, + { + "epoch": 0.14295357383934598, + "grad_norm": 2.198782444000244, + "learning_rate": 1.9887818427736557e-05, + "loss": 1.5071, + "step": 11436 + }, + { + "epoch": 0.1429785744643616, + "grad_norm": 7.8413286209106445, + "learning_rate": 1.9887688039263806e-05, + "loss": 1.6487, + "step": 11438 + }, + { + "epoch": 0.14300357508937722, + "grad_norm": 3.8127408027648926, + "learning_rate": 1.9887557575487816e-05, + "loss": 1.3478, + "step": 11440 + }, + { + "epoch": 0.14302857571439287, + "grad_norm": 4.152053356170654, + "learning_rate": 1.988742703640959e-05, + "loss": 2.434, + "step": 11442 + }, + { + "epoch": 0.1430535763394085, + "grad_norm": 2.956578493118286, + "learning_rate": 1.988729642203011e-05, + "loss": 0.6052, + "step": 11444 + }, + { + "epoch": 0.1430785769644241, + "grad_norm": 4.711149215698242, + "learning_rate": 1.9887165732350384e-05, + "loss": 1.2873, + "step": 11446 + }, + { + "epoch": 0.14310357758943973, + "grad_norm": 3.2299115657806396, + "learning_rate": 1.9887034967371403e-05, + "loss": 1.2625, + "step": 11448 + }, + { + "epoch": 0.14312857821445535, + "grad_norm": 2.8450779914855957, + "learning_rate": 1.988690412709416e-05, + "loss": 0.5906, + "step": 11450 + }, + { + "epoch": 0.143153578839471, + "grad_norm": 2.652832269668579, + "learning_rate": 1.9886773211519655e-05, + "loss": 1.1079, + "step": 11452 + }, + { + "epoch": 0.14317857946448662, + "grad_norm": 2.005214214324951, + "learning_rate": 1.9886642220648883e-05, + "loss": 0.8983, + "step": 11454 + }, + { + "epoch": 0.14320358008950224, + "grad_norm": 0.016233647242188454, + "learning_rate": 1.9886511154482838e-05, + "loss": 0.7742, + "step": 11456 + }, + { + "epoch": 0.14322858071451786, + "grad_norm": 2.1595520973205566, + "learning_rate": 1.9886380013022527e-05, + "loss": 0.6325, + "step": 11458 + }, + { + "epoch": 0.14325358133953348, + "grad_norm": 3.046809673309326, + "learning_rate": 1.9886248796268942e-05, + "loss": 0.3319, + "step": 11460 + }, + { + "epoch": 0.14327858196454912, + "grad_norm": 4.800886631011963, + "learning_rate": 1.9886117504223086e-05, + "loss": 1.5351, + "step": 11462 + }, + { + "epoch": 0.14330358258956474, + "grad_norm": 3.626811981201172, + "learning_rate": 1.9885986136885956e-05, + "loss": 0.6683, + "step": 11464 + }, + { + "epoch": 0.14332858321458036, + "grad_norm": 3.2243967056274414, + "learning_rate": 1.9885854694258554e-05, + "loss": 1.0731, + "step": 11466 + }, + { + "epoch": 0.14335358383959598, + "grad_norm": 2.8201746940612793, + "learning_rate": 1.9885723176341876e-05, + "loss": 0.7243, + "step": 11468 + }, + { + "epoch": 0.1433785844646116, + "grad_norm": 0.7797150015830994, + "learning_rate": 1.9885591583136933e-05, + "loss": 0.0753, + "step": 11470 + }, + { + "epoch": 0.14340358508962725, + "grad_norm": 3.0527760982513428, + "learning_rate": 1.9885459914644722e-05, + "loss": 1.3506, + "step": 11472 + }, + { + "epoch": 0.14342858571464287, + "grad_norm": 0.6903699040412903, + "learning_rate": 1.9885328170866245e-05, + "loss": 0.9546, + "step": 11474 + }, + { + "epoch": 0.1434535863396585, + "grad_norm": 4.375216960906982, + "learning_rate": 1.988519635180251e-05, + "loss": 0.5361, + "step": 11476 + }, + { + "epoch": 0.1434785869646741, + "grad_norm": 4.716502666473389, + "learning_rate": 1.9885064457454514e-05, + "loss": 1.8533, + "step": 11478 + }, + { + "epoch": 0.14350358758968973, + "grad_norm": 2.5265159606933594, + "learning_rate": 1.9884932487823267e-05, + "loss": 2.2456, + "step": 11480 + }, + { + "epoch": 0.14352858821470538, + "grad_norm": 5.026211738586426, + "learning_rate": 1.9884800442909767e-05, + "loss": 0.7078, + "step": 11482 + }, + { + "epoch": 0.143553588839721, + "grad_norm": 0.043124690651893616, + "learning_rate": 1.988466832271503e-05, + "loss": 0.016, + "step": 11484 + }, + { + "epoch": 0.14357858946473662, + "grad_norm": 0.0016957555199041963, + "learning_rate": 1.9884536127240054e-05, + "loss": 0.1325, + "step": 11486 + }, + { + "epoch": 0.14360359008975224, + "grad_norm": 6.574573993682861, + "learning_rate": 1.988440385648585e-05, + "loss": 1.4083, + "step": 11488 + }, + { + "epoch": 0.14362859071476786, + "grad_norm": 1.7950420379638672, + "learning_rate": 1.9884271510453424e-05, + "loss": 0.1611, + "step": 11490 + }, + { + "epoch": 0.1436535913397835, + "grad_norm": 2.777780771255493, + "learning_rate": 1.9884139089143785e-05, + "loss": 0.8128, + "step": 11492 + }, + { + "epoch": 0.14367859196479912, + "grad_norm": 3.9742910861968994, + "learning_rate": 1.9884006592557942e-05, + "loss": 1.7164, + "step": 11494 + }, + { + "epoch": 0.14370359258981474, + "grad_norm": 3.172595500946045, + "learning_rate": 1.9883874020696898e-05, + "loss": 0.6577, + "step": 11496 + }, + { + "epoch": 0.14372859321483036, + "grad_norm": 5.765730381011963, + "learning_rate": 1.988374137356167e-05, + "loss": 1.7052, + "step": 11498 + }, + { + "epoch": 0.14375359383984598, + "grad_norm": 3.0007472038269043, + "learning_rate": 1.9883608651153264e-05, + "loss": 1.705, + "step": 11500 + }, + { + "epoch": 0.14377859446486163, + "grad_norm": 3.2214083671569824, + "learning_rate": 1.988347585347269e-05, + "loss": 1.9044, + "step": 11502 + }, + { + "epoch": 0.14380359508987725, + "grad_norm": 2.3533875942230225, + "learning_rate": 1.9883342980520967e-05, + "loss": 1.2169, + "step": 11504 + }, + { + "epoch": 0.14382859571489287, + "grad_norm": 6.581323146820068, + "learning_rate": 1.9883210032299095e-05, + "loss": 1.3683, + "step": 11506 + }, + { + "epoch": 0.1438535963399085, + "grad_norm": 5.460835933685303, + "learning_rate": 1.98830770088081e-05, + "loss": 0.4663, + "step": 11508 + }, + { + "epoch": 0.1438785969649241, + "grad_norm": 2.882432222366333, + "learning_rate": 1.9882943910048983e-05, + "loss": 1.5613, + "step": 11510 + }, + { + "epoch": 0.14390359758993976, + "grad_norm": 6.033037185668945, + "learning_rate": 1.9882810736022764e-05, + "loss": 0.6679, + "step": 11512 + }, + { + "epoch": 0.14392859821495538, + "grad_norm": 0.2563052475452423, + "learning_rate": 1.9882677486730457e-05, + "loss": 0.0585, + "step": 11514 + }, + { + "epoch": 0.143953598839971, + "grad_norm": 3.5500996112823486, + "learning_rate": 1.9882544162173076e-05, + "loss": 0.5966, + "step": 11516 + }, + { + "epoch": 0.14397859946498662, + "grad_norm": 3.6500089168548584, + "learning_rate": 1.9882410762351637e-05, + "loss": 1.037, + "step": 11518 + }, + { + "epoch": 0.14400360009000224, + "grad_norm": 3.1587281227111816, + "learning_rate": 1.988227728726715e-05, + "loss": 0.6041, + "step": 11520 + }, + { + "epoch": 0.14402860071501788, + "grad_norm": 2.9207570552825928, + "learning_rate": 1.9882143736920644e-05, + "loss": 0.6241, + "step": 11522 + }, + { + "epoch": 0.1440536013400335, + "grad_norm": 1.3943265676498413, + "learning_rate": 1.988201011131312e-05, + "loss": 0.4307, + "step": 11524 + }, + { + "epoch": 0.14407860196504912, + "grad_norm": 4.682505130767822, + "learning_rate": 1.988187641044561e-05, + "loss": 0.4967, + "step": 11526 + }, + { + "epoch": 0.14410360259006474, + "grad_norm": 1.9891510009765625, + "learning_rate": 1.9881742634319126e-05, + "loss": 0.6998, + "step": 11528 + }, + { + "epoch": 0.14412860321508036, + "grad_norm": 5.83788537979126, + "learning_rate": 1.988160878293469e-05, + "loss": 1.2801, + "step": 11530 + }, + { + "epoch": 0.144153603840096, + "grad_norm": 3.0808913707733154, + "learning_rate": 1.9881474856293316e-05, + "loss": 1.1176, + "step": 11532 + }, + { + "epoch": 0.14417860446511163, + "grad_norm": 2.662842035293579, + "learning_rate": 1.9881340854396027e-05, + "loss": 0.7609, + "step": 11534 + }, + { + "epoch": 0.14420360509012725, + "grad_norm": 0.25946882367134094, + "learning_rate": 1.9881206777243845e-05, + "loss": 0.9579, + "step": 11536 + }, + { + "epoch": 0.14422860571514287, + "grad_norm": 1.8815993070602417, + "learning_rate": 1.9881072624837786e-05, + "loss": 0.8357, + "step": 11538 + }, + { + "epoch": 0.1442536063401585, + "grad_norm": 10.50795841217041, + "learning_rate": 1.988093839717888e-05, + "loss": 2.5222, + "step": 11540 + }, + { + "epoch": 0.14427860696517414, + "grad_norm": 2.3195226192474365, + "learning_rate": 1.988080409426814e-05, + "loss": 0.2004, + "step": 11542 + }, + { + "epoch": 0.14430360759018976, + "grad_norm": 0.686137318611145, + "learning_rate": 1.9880669716106596e-05, + "loss": 0.7512, + "step": 11544 + }, + { + "epoch": 0.14432860821520538, + "grad_norm": 5.010406970977783, + "learning_rate": 1.9880535262695267e-05, + "loss": 2.0801, + "step": 11546 + }, + { + "epoch": 0.144353608840221, + "grad_norm": 0.002516453852877021, + "learning_rate": 1.9880400734035178e-05, + "loss": 0.6962, + "step": 11548 + }, + { + "epoch": 0.14437860946523662, + "grad_norm": 2.6517398357391357, + "learning_rate": 1.9880266130127358e-05, + "loss": 1.0816, + "step": 11550 + }, + { + "epoch": 0.14440361009025227, + "grad_norm": 1.7752230167388916, + "learning_rate": 1.9880131450972827e-05, + "loss": 1.0762, + "step": 11552 + }, + { + "epoch": 0.14442861071526789, + "grad_norm": 2.2347733974456787, + "learning_rate": 1.9879996696572613e-05, + "loss": 1.1391, + "step": 11554 + }, + { + "epoch": 0.1444536113402835, + "grad_norm": 1.1876122951507568, + "learning_rate": 1.9879861866927738e-05, + "loss": 0.6024, + "step": 11556 + }, + { + "epoch": 0.14447861196529913, + "grad_norm": 2.4195055961608887, + "learning_rate": 1.9879726962039232e-05, + "loss": 0.8385, + "step": 11558 + }, + { + "epoch": 0.14450361259031475, + "grad_norm": 4.134278774261475, + "learning_rate": 1.9879591981908125e-05, + "loss": 1.4259, + "step": 11560 + }, + { + "epoch": 0.1445286132153304, + "grad_norm": 3.5620837211608887, + "learning_rate": 1.9879456926535444e-05, + "loss": 1.2315, + "step": 11562 + }, + { + "epoch": 0.144553613840346, + "grad_norm": 3.184631824493408, + "learning_rate": 1.9879321795922212e-05, + "loss": 1.6378, + "step": 11564 + }, + { + "epoch": 0.14457861446536163, + "grad_norm": 2.7158734798431396, + "learning_rate": 1.9879186590069466e-05, + "loss": 0.5152, + "step": 11566 + }, + { + "epoch": 0.14460361509037725, + "grad_norm": 0.0023356215097010136, + "learning_rate": 1.987905130897823e-05, + "loss": 1.0516, + "step": 11568 + }, + { + "epoch": 0.14462861571539287, + "grad_norm": 2.3466315269470215, + "learning_rate": 1.9878915952649534e-05, + "loss": 0.9553, + "step": 11570 + }, + { + "epoch": 0.14465361634040852, + "grad_norm": 5.657113552093506, + "learning_rate": 1.9878780521084415e-05, + "loss": 0.4268, + "step": 11572 + }, + { + "epoch": 0.14467861696542414, + "grad_norm": 2.465510368347168, + "learning_rate": 1.9878645014283896e-05, + "loss": 1.2923, + "step": 11574 + }, + { + "epoch": 0.14470361759043976, + "grad_norm": 3.285418748855591, + "learning_rate": 1.9878509432249018e-05, + "loss": 0.8751, + "step": 11576 + }, + { + "epoch": 0.14472861821545538, + "grad_norm": 1.8846375942230225, + "learning_rate": 1.9878373774980807e-05, + "loss": 1.6039, + "step": 11578 + }, + { + "epoch": 0.144753618840471, + "grad_norm": 4.539454936981201, + "learning_rate": 1.98782380424803e-05, + "loss": 0.1119, + "step": 11580 + }, + { + "epoch": 0.14477861946548665, + "grad_norm": 4.846085548400879, + "learning_rate": 1.9878102234748528e-05, + "loss": 1.6996, + "step": 11582 + }, + { + "epoch": 0.14480362009050227, + "grad_norm": 3.298786163330078, + "learning_rate": 1.9877966351786527e-05, + "loss": 1.13, + "step": 11584 + }, + { + "epoch": 0.1448286207155179, + "grad_norm": 1.905288815498352, + "learning_rate": 1.987783039359533e-05, + "loss": 0.7592, + "step": 11586 + }, + { + "epoch": 0.1448536213405335, + "grad_norm": 2.9043006896972656, + "learning_rate": 1.9877694360175975e-05, + "loss": 1.3181, + "step": 11588 + }, + { + "epoch": 0.14487862196554913, + "grad_norm": 1.563126802444458, + "learning_rate": 1.9877558251529496e-05, + "loss": 0.9685, + "step": 11590 + }, + { + "epoch": 0.14490362259056477, + "grad_norm": 5.578498363494873, + "learning_rate": 1.987742206765693e-05, + "loss": 0.5187, + "step": 11592 + }, + { + "epoch": 0.1449286232155804, + "grad_norm": 2.9950802326202393, + "learning_rate": 1.987728580855931e-05, + "loss": 0.7247, + "step": 11594 + }, + { + "epoch": 0.144953623840596, + "grad_norm": 0.4737074077129364, + "learning_rate": 1.9877149474237685e-05, + "loss": 0.6607, + "step": 11596 + }, + { + "epoch": 0.14497862446561163, + "grad_norm": 3.329446315765381, + "learning_rate": 1.9877013064693082e-05, + "loss": 1.2945, + "step": 11598 + }, + { + "epoch": 0.14500362509062725, + "grad_norm": 2.542140007019043, + "learning_rate": 1.9876876579926546e-05, + "loss": 1.2677, + "step": 11600 + }, + { + "epoch": 0.1450286257156429, + "grad_norm": 0.25267457962036133, + "learning_rate": 1.9876740019939117e-05, + "loss": 0.1036, + "step": 11602 + }, + { + "epoch": 0.14505362634065852, + "grad_norm": 2.0780749320983887, + "learning_rate": 1.9876603384731828e-05, + "loss": 1.0774, + "step": 11604 + }, + { + "epoch": 0.14507862696567414, + "grad_norm": 0.16274365782737732, + "learning_rate": 1.9876466674305727e-05, + "loss": 0.4167, + "step": 11606 + }, + { + "epoch": 0.14510362759068976, + "grad_norm": 3.285040855407715, + "learning_rate": 1.9876329888661853e-05, + "loss": 1.7713, + "step": 11608 + }, + { + "epoch": 0.14512862821570538, + "grad_norm": 2.792147397994995, + "learning_rate": 1.9876193027801246e-05, + "loss": 1.3182, + "step": 11610 + }, + { + "epoch": 0.14515362884072103, + "grad_norm": 3.62593936920166, + "learning_rate": 1.9876056091724948e-05, + "loss": 0.7481, + "step": 11612 + }, + { + "epoch": 0.14517862946573665, + "grad_norm": 4.221423625946045, + "learning_rate": 1.9875919080434004e-05, + "loss": 0.8041, + "step": 11614 + }, + { + "epoch": 0.14520363009075227, + "grad_norm": 9.696172714233398, + "learning_rate": 1.9875781993929458e-05, + "loss": 1.7722, + "step": 11616 + }, + { + "epoch": 0.1452286307157679, + "grad_norm": 10.91207504272461, + "learning_rate": 1.9875644832212354e-05, + "loss": 1.0124, + "step": 11618 + }, + { + "epoch": 0.1452536313407835, + "grad_norm": 4.289856433868408, + "learning_rate": 1.9875507595283733e-05, + "loss": 2.3316, + "step": 11620 + }, + { + "epoch": 0.14527863196579915, + "grad_norm": 3.8038055896759033, + "learning_rate": 1.9875370283144643e-05, + "loss": 1.4761, + "step": 11622 + }, + { + "epoch": 0.14530363259081477, + "grad_norm": 0.36659663915634155, + "learning_rate": 1.9875232895796133e-05, + "loss": 0.5824, + "step": 11624 + }, + { + "epoch": 0.1453286332158304, + "grad_norm": 3.7325658798217773, + "learning_rate": 1.9875095433239245e-05, + "loss": 0.836, + "step": 11626 + }, + { + "epoch": 0.145353633840846, + "grad_norm": 3.3488078117370605, + "learning_rate": 1.9874957895475023e-05, + "loss": 0.5655, + "step": 11628 + }, + { + "epoch": 0.14537863446586163, + "grad_norm": 7.803639888763428, + "learning_rate": 1.987482028250452e-05, + "loss": 0.5872, + "step": 11630 + }, + { + "epoch": 0.14540363509087728, + "grad_norm": 5.599608421325684, + "learning_rate": 1.9874682594328784e-05, + "loss": 0.4571, + "step": 11632 + }, + { + "epoch": 0.1454286357158929, + "grad_norm": 0.9990139603614807, + "learning_rate": 1.9874544830948857e-05, + "loss": 0.6369, + "step": 11634 + }, + { + "epoch": 0.14545363634090852, + "grad_norm": 9.065752029418945, + "learning_rate": 1.9874406992365797e-05, + "loss": 0.9151, + "step": 11636 + }, + { + "epoch": 0.14547863696592414, + "grad_norm": 4.485179424285889, + "learning_rate": 1.9874269078580645e-05, + "loss": 2.4132, + "step": 11638 + }, + { + "epoch": 0.14550363759093976, + "grad_norm": 3.2622973918914795, + "learning_rate": 1.987413108959446e-05, + "loss": 0.6742, + "step": 11640 + }, + { + "epoch": 0.1455286382159554, + "grad_norm": 4.098448276519775, + "learning_rate": 1.987399302540829e-05, + "loss": 1.3874, + "step": 11642 + }, + { + "epoch": 0.14555363884097103, + "grad_norm": 1.7384854555130005, + "learning_rate": 1.9873854886023185e-05, + "loss": 0.291, + "step": 11644 + }, + { + "epoch": 0.14557863946598665, + "grad_norm": 2.0771121978759766, + "learning_rate": 1.9873716671440195e-05, + "loss": 1.2778, + "step": 11646 + }, + { + "epoch": 0.14560364009100227, + "grad_norm": 2.0247676372528076, + "learning_rate": 1.9873578381660377e-05, + "loss": 1.2139, + "step": 11648 + }, + { + "epoch": 0.1456286407160179, + "grad_norm": 2.4031994342803955, + "learning_rate": 1.9873440016684782e-05, + "loss": 0.8863, + "step": 11650 + }, + { + "epoch": 0.14565364134103354, + "grad_norm": 6.901354789733887, + "learning_rate": 1.9873301576514464e-05, + "loss": 1.1801, + "step": 11652 + }, + { + "epoch": 0.14567864196604915, + "grad_norm": 1.5943151712417603, + "learning_rate": 1.9873163061150477e-05, + "loss": 0.4995, + "step": 11654 + }, + { + "epoch": 0.14570364259106477, + "grad_norm": 4.1042304039001465, + "learning_rate": 1.9873024470593877e-05, + "loss": 2.1628, + "step": 11656 + }, + { + "epoch": 0.1457286432160804, + "grad_norm": 2.561305522918701, + "learning_rate": 1.9872885804845715e-05, + "loss": 1.008, + "step": 11658 + }, + { + "epoch": 0.14575364384109601, + "grad_norm": 3.9348013401031494, + "learning_rate": 1.9872747063907056e-05, + "loss": 1.0429, + "step": 11660 + }, + { + "epoch": 0.14577864446611166, + "grad_norm": 4.795505046844482, + "learning_rate": 1.9872608247778952e-05, + "loss": 0.9316, + "step": 11662 + }, + { + "epoch": 0.14580364509112728, + "grad_norm": 2.287092924118042, + "learning_rate": 1.9872469356462457e-05, + "loss": 0.3587, + "step": 11664 + }, + { + "epoch": 0.1458286457161429, + "grad_norm": 2.1218888759613037, + "learning_rate": 1.987233038995863e-05, + "loss": 1.1541, + "step": 11666 + }, + { + "epoch": 0.14585364634115852, + "grad_norm": 5.091196060180664, + "learning_rate": 1.9872191348268534e-05, + "loss": 1.4208, + "step": 11668 + }, + { + "epoch": 0.14587864696617414, + "grad_norm": 3.344137668609619, + "learning_rate": 1.987205223139322e-05, + "loss": 1.3793, + "step": 11670 + }, + { + "epoch": 0.1459036475911898, + "grad_norm": 6.136921405792236, + "learning_rate": 1.9871913039333758e-05, + "loss": 1.6866, + "step": 11672 + }, + { + "epoch": 0.1459286482162054, + "grad_norm": 5.252764701843262, + "learning_rate": 1.98717737720912e-05, + "loss": 0.5507, + "step": 11674 + }, + { + "epoch": 0.14595364884122103, + "grad_norm": 0.08913498371839523, + "learning_rate": 1.987163442966661e-05, + "loss": 0.3104, + "step": 11676 + }, + { + "epoch": 0.14597864946623665, + "grad_norm": 3.4096765518188477, + "learning_rate": 1.9871495012061045e-05, + "loss": 1.2822, + "step": 11678 + }, + { + "epoch": 0.14600365009125227, + "grad_norm": 4.162652969360352, + "learning_rate": 1.9871355519275576e-05, + "loss": 1.1276, + "step": 11680 + }, + { + "epoch": 0.14602865071626792, + "grad_norm": 1.572434663772583, + "learning_rate": 1.9871215951311253e-05, + "loss": 0.7609, + "step": 11682 + }, + { + "epoch": 0.14605365134128354, + "grad_norm": 2.64504337310791, + "learning_rate": 1.9871076308169148e-05, + "loss": 1.8373, + "step": 11684 + }, + { + "epoch": 0.14607865196629916, + "grad_norm": 3.481820821762085, + "learning_rate": 1.987093658985032e-05, + "loss": 1.5979, + "step": 11686 + }, + { + "epoch": 0.14610365259131478, + "grad_norm": 1.1132924556732178, + "learning_rate": 1.9870796796355835e-05, + "loss": 0.6239, + "step": 11688 + }, + { + "epoch": 0.1461286532163304, + "grad_norm": 2.3428494930267334, + "learning_rate": 1.987065692768676e-05, + "loss": 0.5272, + "step": 11690 + }, + { + "epoch": 0.14615365384134604, + "grad_norm": 4.077625751495361, + "learning_rate": 1.9870516983844156e-05, + "loss": 1.7906, + "step": 11692 + }, + { + "epoch": 0.14617865446636166, + "grad_norm": 0.4722968339920044, + "learning_rate": 1.9870376964829094e-05, + "loss": 0.5003, + "step": 11694 + }, + { + "epoch": 0.14620365509137728, + "grad_norm": 2.0597383975982666, + "learning_rate": 1.9870236870642632e-05, + "loss": 0.7716, + "step": 11696 + }, + { + "epoch": 0.1462286557163929, + "grad_norm": 2.7290120124816895, + "learning_rate": 1.9870096701285845e-05, + "loss": 0.8785, + "step": 11698 + }, + { + "epoch": 0.14625365634140852, + "grad_norm": 4.004464626312256, + "learning_rate": 1.9869956456759794e-05, + "loss": 0.4312, + "step": 11700 + }, + { + "epoch": 0.14627865696642417, + "grad_norm": 0.01887522079050541, + "learning_rate": 1.986981613706555e-05, + "loss": 0.0001, + "step": 11702 + }, + { + "epoch": 0.1463036575914398, + "grad_norm": 4.793057918548584, + "learning_rate": 1.9869675742204186e-05, + "loss": 1.3312, + "step": 11704 + }, + { + "epoch": 0.1463286582164554, + "grad_norm": 3.010096788406372, + "learning_rate": 1.9869535272176764e-05, + "loss": 0.2608, + "step": 11706 + }, + { + "epoch": 0.14635365884147103, + "grad_norm": 3.4759981632232666, + "learning_rate": 1.9869394726984357e-05, + "loss": 0.711, + "step": 11708 + }, + { + "epoch": 0.14637865946648665, + "grad_norm": 2.5750088691711426, + "learning_rate": 1.986925410662804e-05, + "loss": 0.7, + "step": 11710 + }, + { + "epoch": 0.1464036600915023, + "grad_norm": 0.12377990782260895, + "learning_rate": 1.9869113411108872e-05, + "loss": 0.0024, + "step": 11712 + }, + { + "epoch": 0.14642866071651792, + "grad_norm": 1.2094786167144775, + "learning_rate": 1.986897264042794e-05, + "loss": 0.5561, + "step": 11714 + }, + { + "epoch": 0.14645366134153354, + "grad_norm": 3.152358055114746, + "learning_rate": 1.98688317945863e-05, + "loss": 0.5725, + "step": 11716 + }, + { + "epoch": 0.14647866196654916, + "grad_norm": 0.6936199069023132, + "learning_rate": 1.9868690873585036e-05, + "loss": 0.0776, + "step": 11718 + }, + { + "epoch": 0.14650366259156478, + "grad_norm": 6.879578590393066, + "learning_rate": 1.986854987742522e-05, + "loss": 2.2038, + "step": 11720 + }, + { + "epoch": 0.14652866321658042, + "grad_norm": 4.046028137207031, + "learning_rate": 1.9868408806107923e-05, + "loss": 0.7119, + "step": 11722 + }, + { + "epoch": 0.14655366384159604, + "grad_norm": 3.0805511474609375, + "learning_rate": 1.986826765963422e-05, + "loss": 0.1104, + "step": 11724 + }, + { + "epoch": 0.14657866446661166, + "grad_norm": 2.47989821434021, + "learning_rate": 1.9868126438005184e-05, + "loss": 1.3929, + "step": 11726 + }, + { + "epoch": 0.14660366509162728, + "grad_norm": 3.867917537689209, + "learning_rate": 1.9867985141221893e-05, + "loss": 0.5447, + "step": 11728 + }, + { + "epoch": 0.1466286657166429, + "grad_norm": 1.4136279821395874, + "learning_rate": 1.9867843769285423e-05, + "loss": 0.5421, + "step": 11730 + }, + { + "epoch": 0.14665366634165855, + "grad_norm": 8.011677742004395, + "learning_rate": 1.986770232219685e-05, + "loss": 1.8755, + "step": 11732 + }, + { + "epoch": 0.14667866696667417, + "grad_norm": 3.6191484928131104, + "learning_rate": 1.9867560799957255e-05, + "loss": 1.4009, + "step": 11734 + }, + { + "epoch": 0.1467036675916898, + "grad_norm": 1.9685680866241455, + "learning_rate": 1.9867419202567707e-05, + "loss": 0.6909, + "step": 11736 + }, + { + "epoch": 0.1467286682167054, + "grad_norm": 0.0009654007153585553, + "learning_rate": 1.986727753002929e-05, + "loss": 1.0987, + "step": 11738 + }, + { + "epoch": 0.14675366884172103, + "grad_norm": 0.4464739263057709, + "learning_rate": 1.9867135782343087e-05, + "loss": 1.0828, + "step": 11740 + }, + { + "epoch": 0.14677866946673668, + "grad_norm": 0.0007818458252586424, + "learning_rate": 1.9866993959510173e-05, + "loss": 1.1075, + "step": 11742 + }, + { + "epoch": 0.1468036700917523, + "grad_norm": 0.7779210209846497, + "learning_rate": 1.9866852061531624e-05, + "loss": 0.4602, + "step": 11744 + }, + { + "epoch": 0.14682867071676792, + "grad_norm": 3.3308441638946533, + "learning_rate": 1.986671008840853e-05, + "loss": 1.1291, + "step": 11746 + }, + { + "epoch": 0.14685367134178354, + "grad_norm": 2.1724750995635986, + "learning_rate": 1.986656804014196e-05, + "loss": 0.5595, + "step": 11748 + }, + { + "epoch": 0.14687867196679916, + "grad_norm": 4.161025047302246, + "learning_rate": 1.9866425916733013e-05, + "loss": 0.8018, + "step": 11750 + }, + { + "epoch": 0.1469036725918148, + "grad_norm": 5.413883686065674, + "learning_rate": 1.9866283718182756e-05, + "loss": 0.4956, + "step": 11752 + }, + { + "epoch": 0.14692867321683042, + "grad_norm": 2.774839162826538, + "learning_rate": 1.9866141444492277e-05, + "loss": 0.1997, + "step": 11754 + }, + { + "epoch": 0.14695367384184604, + "grad_norm": 5.874181270599365, + "learning_rate": 1.986599909566266e-05, + "loss": 0.8764, + "step": 11756 + }, + { + "epoch": 0.14697867446686166, + "grad_norm": 2.430215358734131, + "learning_rate": 1.986585667169499e-05, + "loss": 0.7968, + "step": 11758 + }, + { + "epoch": 0.14700367509187728, + "grad_norm": 2.9434478282928467, + "learning_rate": 1.9865714172590353e-05, + "loss": 0.1226, + "step": 11760 + }, + { + "epoch": 0.14702867571689293, + "grad_norm": 0.003754597157239914, + "learning_rate": 1.9865571598349826e-05, + "loss": 0.1263, + "step": 11762 + }, + { + "epoch": 0.14705367634190855, + "grad_norm": 3.4421298503875732, + "learning_rate": 1.9865428948974505e-05, + "loss": 0.554, + "step": 11764 + }, + { + "epoch": 0.14707867696692417, + "grad_norm": 8.946772575378418, + "learning_rate": 1.9865286224465472e-05, + "loss": 0.8774, + "step": 11766 + }, + { + "epoch": 0.1471036775919398, + "grad_norm": 1.7407487630844116, + "learning_rate": 1.9865143424823813e-05, + "loss": 0.0601, + "step": 11768 + }, + { + "epoch": 0.1471286782169554, + "grad_norm": 4.019123554229736, + "learning_rate": 1.9865000550050616e-05, + "loss": 1.0033, + "step": 11770 + }, + { + "epoch": 0.14715367884197106, + "grad_norm": 3.6783525943756104, + "learning_rate": 1.9864857600146973e-05, + "loss": 0.9343, + "step": 11772 + }, + { + "epoch": 0.14717867946698668, + "grad_norm": 0.0016811976674944162, + "learning_rate": 1.986471457511397e-05, + "loss": 0.9578, + "step": 11774 + }, + { + "epoch": 0.1472036800920023, + "grad_norm": 1.0091277360916138, + "learning_rate": 1.986457147495269e-05, + "loss": 0.4624, + "step": 11776 + }, + { + "epoch": 0.14722868071701792, + "grad_norm": 4.340851783752441, + "learning_rate": 1.986442829966423e-05, + "loss": 0.6154, + "step": 11778 + }, + { + "epoch": 0.14725368134203354, + "grad_norm": 0.0008195005939342082, + "learning_rate": 1.986428504924968e-05, + "loss": 0.1614, + "step": 11780 + }, + { + "epoch": 0.14727868196704919, + "grad_norm": 1.886413812637329, + "learning_rate": 1.9864141723710135e-05, + "loss": 0.2343, + "step": 11782 + }, + { + "epoch": 0.1473036825920648, + "grad_norm": 0.0006614267476834357, + "learning_rate": 1.9863998323046676e-05, + "loss": 0.184, + "step": 11784 + }, + { + "epoch": 0.14732868321708043, + "grad_norm": 5.959045886993408, + "learning_rate": 1.9863854847260405e-05, + "loss": 0.9158, + "step": 11786 + }, + { + "epoch": 0.14735368384209604, + "grad_norm": 2.218925952911377, + "learning_rate": 1.9863711296352408e-05, + "loss": 0.8257, + "step": 11788 + }, + { + "epoch": 0.14737868446711166, + "grad_norm": 3.151592254638672, + "learning_rate": 1.986356767032378e-05, + "loss": 0.28, + "step": 11790 + }, + { + "epoch": 0.1474036850921273, + "grad_norm": 8.023509979248047, + "learning_rate": 1.9863423969175613e-05, + "loss": 0.7711, + "step": 11792 + }, + { + "epoch": 0.14742868571714293, + "grad_norm": 0.00039422709960490465, + "learning_rate": 1.9863280192909008e-05, + "loss": 0.739, + "step": 11794 + }, + { + "epoch": 0.14745368634215855, + "grad_norm": 1.2321462631225586, + "learning_rate": 1.9863136341525055e-05, + "loss": 0.2422, + "step": 11796 + }, + { + "epoch": 0.14747868696717417, + "grad_norm": 3.287266731262207, + "learning_rate": 1.986299241502485e-05, + "loss": 0.7726, + "step": 11798 + }, + { + "epoch": 0.1475036875921898, + "grad_norm": 2.835829257965088, + "learning_rate": 1.986284841340949e-05, + "loss": 0.6653, + "step": 11800 + }, + { + "epoch": 0.14752868821720544, + "grad_norm": 1.3831180334091187, + "learning_rate": 1.986270433668007e-05, + "loss": 0.3797, + "step": 11802 + }, + { + "epoch": 0.14755368884222106, + "grad_norm": 3.0039610862731934, + "learning_rate": 1.986256018483769e-05, + "loss": 1.3358, + "step": 11804 + }, + { + "epoch": 0.14757868946723668, + "grad_norm": 4.957677841186523, + "learning_rate": 1.9862415957883443e-05, + "loss": 0.9305, + "step": 11806 + }, + { + "epoch": 0.1476036900922523, + "grad_norm": 3.224078416824341, + "learning_rate": 1.9862271655818436e-05, + "loss": 0.7098, + "step": 11808 + }, + { + "epoch": 0.14762869071726792, + "grad_norm": 1.4278063774108887, + "learning_rate": 1.9862127278643758e-05, + "loss": 1.1177, + "step": 11810 + }, + { + "epoch": 0.14765369134228357, + "grad_norm": 4.055924892425537, + "learning_rate": 1.9861982826360517e-05, + "loss": 1.145, + "step": 11812 + }, + { + "epoch": 0.14767869196729919, + "grad_norm": 1.4889713525772095, + "learning_rate": 1.9861838298969807e-05, + "loss": 0.9148, + "step": 11814 + }, + { + "epoch": 0.1477036925923148, + "grad_norm": 1.677054524421692, + "learning_rate": 1.9861693696472733e-05, + "loss": 0.2761, + "step": 11816 + }, + { + "epoch": 0.14772869321733043, + "grad_norm": 2.423421621322632, + "learning_rate": 1.986154901887039e-05, + "loss": 0.624, + "step": 11818 + }, + { + "epoch": 0.14775369384234605, + "grad_norm": 3.4483509063720703, + "learning_rate": 1.9861404266163888e-05, + "loss": 1.7624, + "step": 11820 + }, + { + "epoch": 0.1477786944673617, + "grad_norm": 3.539793014526367, + "learning_rate": 1.9861259438354326e-05, + "loss": 2.2757, + "step": 11822 + }, + { + "epoch": 0.1478036950923773, + "grad_norm": 2.82049298286438, + "learning_rate": 1.9861114535442804e-05, + "loss": 1.3081, + "step": 11824 + }, + { + "epoch": 0.14782869571739293, + "grad_norm": 4.723892688751221, + "learning_rate": 1.9860969557430428e-05, + "loss": 1.1045, + "step": 11826 + }, + { + "epoch": 0.14785369634240855, + "grad_norm": 0.23846982419490814, + "learning_rate": 1.9860824504318308e-05, + "loss": 0.3813, + "step": 11828 + }, + { + "epoch": 0.14787869696742417, + "grad_norm": 7.366908073425293, + "learning_rate": 1.986067937610754e-05, + "loss": 1.5454, + "step": 11830 + }, + { + "epoch": 0.14790369759243982, + "grad_norm": 2.8148245811462402, + "learning_rate": 1.9860534172799228e-05, + "loss": 0.4488, + "step": 11832 + }, + { + "epoch": 0.14792869821745544, + "grad_norm": 0.0005545956664718688, + "learning_rate": 1.9860388894394487e-05, + "loss": 0.6284, + "step": 11834 + }, + { + "epoch": 0.14795369884247106, + "grad_norm": 2.9523935317993164, + "learning_rate": 1.9860243540894418e-05, + "loss": 1.1868, + "step": 11836 + }, + { + "epoch": 0.14797869946748668, + "grad_norm": 5.9075117111206055, + "learning_rate": 1.9860098112300127e-05, + "loss": 1.7042, + "step": 11838 + }, + { + "epoch": 0.1480037000925023, + "grad_norm": 5.1659345626831055, + "learning_rate": 1.9859952608612724e-05, + "loss": 1.5334, + "step": 11840 + }, + { + "epoch": 0.14802870071751795, + "grad_norm": 2.733780860900879, + "learning_rate": 1.9859807029833315e-05, + "loss": 0.1043, + "step": 11842 + }, + { + "epoch": 0.14805370134253357, + "grad_norm": 3.7859508991241455, + "learning_rate": 1.985966137596301e-05, + "loss": 1.3687, + "step": 11844 + }, + { + "epoch": 0.1480787019675492, + "grad_norm": 5.216110706329346, + "learning_rate": 1.985951564700292e-05, + "loss": 1.2484, + "step": 11846 + }, + { + "epoch": 0.1481037025925648, + "grad_norm": 1.3446069955825806, + "learning_rate": 1.9859369842954152e-05, + "loss": 0.0434, + "step": 11848 + }, + { + "epoch": 0.14812870321758043, + "grad_norm": 1.6499505043029785, + "learning_rate": 1.985922396381782e-05, + "loss": 0.1674, + "step": 11850 + }, + { + "epoch": 0.14815370384259607, + "grad_norm": 5.591148853302002, + "learning_rate": 1.985907800959503e-05, + "loss": 1.5209, + "step": 11852 + }, + { + "epoch": 0.1481787044676117, + "grad_norm": 1.423699140548706, + "learning_rate": 1.9858931980286895e-05, + "loss": 0.1735, + "step": 11854 + }, + { + "epoch": 0.1482037050926273, + "grad_norm": 6.368609428405762, + "learning_rate": 1.985878587589453e-05, + "loss": 0.3438, + "step": 11856 + }, + { + "epoch": 0.14822870571764293, + "grad_norm": 0.0007930226274766028, + "learning_rate": 1.9858639696419047e-05, + "loss": 0.4665, + "step": 11858 + }, + { + "epoch": 0.14825370634265855, + "grad_norm": 3.4904158115386963, + "learning_rate": 1.9858493441861558e-05, + "loss": 0.6949, + "step": 11860 + }, + { + "epoch": 0.1482787069676742, + "grad_norm": 0.0006819767877459526, + "learning_rate": 1.9858347112223173e-05, + "loss": 0.4832, + "step": 11862 + }, + { + "epoch": 0.14830370759268982, + "grad_norm": 5.3027262687683105, + "learning_rate": 1.9858200707505014e-05, + "loss": 1.2482, + "step": 11864 + }, + { + "epoch": 0.14832870821770544, + "grad_norm": 3.484182119369507, + "learning_rate": 1.9858054227708192e-05, + "loss": 1.3365, + "step": 11866 + }, + { + "epoch": 0.14835370884272106, + "grad_norm": 2.726562738418579, + "learning_rate": 1.9857907672833823e-05, + "loss": 0.5832, + "step": 11868 + }, + { + "epoch": 0.14837870946773668, + "grad_norm": 0.034903671592473984, + "learning_rate": 1.9857761042883023e-05, + "loss": 0.5658, + "step": 11870 + }, + { + "epoch": 0.14840371009275233, + "grad_norm": 2.2523515224456787, + "learning_rate": 1.985761433785691e-05, + "loss": 0.2285, + "step": 11872 + }, + { + "epoch": 0.14842871071776795, + "grad_norm": 3.425919532775879, + "learning_rate": 1.9857467557756597e-05, + "loss": 0.6285, + "step": 11874 + }, + { + "epoch": 0.14845371134278357, + "grad_norm": 0.5725084543228149, + "learning_rate": 1.9857320702583206e-05, + "loss": 0.2911, + "step": 11876 + }, + { + "epoch": 0.1484787119677992, + "grad_norm": 0.0004959352081641555, + "learning_rate": 1.9857173772337858e-05, + "loss": 0.0007, + "step": 11878 + }, + { + "epoch": 0.1485037125928148, + "grad_norm": 3.319896936416626, + "learning_rate": 1.9857026767021667e-05, + "loss": 0.2568, + "step": 11880 + }, + { + "epoch": 0.14852871321783045, + "grad_norm": 3.878710985183716, + "learning_rate": 1.985687968663575e-05, + "loss": 0.7876, + "step": 11882 + }, + { + "epoch": 0.14855371384284607, + "grad_norm": 2.5379862785339355, + "learning_rate": 1.985673253118124e-05, + "loss": 0.2149, + "step": 11884 + }, + { + "epoch": 0.1485787144678617, + "grad_norm": 2.900663137435913, + "learning_rate": 1.9856585300659242e-05, + "loss": 0.4821, + "step": 11886 + }, + { + "epoch": 0.14860371509287731, + "grad_norm": 3.4142982959747314, + "learning_rate": 1.9856437995070884e-05, + "loss": 0.7805, + "step": 11888 + }, + { + "epoch": 0.14862871571789293, + "grad_norm": 2.1773648262023926, + "learning_rate": 1.985629061441729e-05, + "loss": 1.3924, + "step": 11890 + }, + { + "epoch": 0.14865371634290858, + "grad_norm": 5.048615455627441, + "learning_rate": 1.985614315869958e-05, + "loss": 1.2986, + "step": 11892 + }, + { + "epoch": 0.1486787169679242, + "grad_norm": 2.0231525897979736, + "learning_rate": 1.9855995627918877e-05, + "loss": 1.2971, + "step": 11894 + }, + { + "epoch": 0.14870371759293982, + "grad_norm": 2.0925071239471436, + "learning_rate": 1.9855848022076306e-05, + "loss": 1.1753, + "step": 11896 + }, + { + "epoch": 0.14872871821795544, + "grad_norm": 3.5175538063049316, + "learning_rate": 1.985570034117299e-05, + "loss": 0.7264, + "step": 11898 + }, + { + "epoch": 0.14875371884297106, + "grad_norm": 2.088043212890625, + "learning_rate": 1.9855552585210054e-05, + "loss": 1.7415, + "step": 11900 + }, + { + "epoch": 0.1487787194679867, + "grad_norm": 3.656700849533081, + "learning_rate": 1.9855404754188623e-05, + "loss": 1.4158, + "step": 11902 + }, + { + "epoch": 0.14880372009300233, + "grad_norm": 4.499278545379639, + "learning_rate": 1.9855256848109823e-05, + "loss": 2.215, + "step": 11904 + }, + { + "epoch": 0.14882872071801795, + "grad_norm": 5.32634162902832, + "learning_rate": 1.985510886697478e-05, + "loss": 3.0328, + "step": 11906 + }, + { + "epoch": 0.14885372134303357, + "grad_norm": 5.117880821228027, + "learning_rate": 1.9854960810784622e-05, + "loss": 1.4718, + "step": 11908 + }, + { + "epoch": 0.1488787219680492, + "grad_norm": 4.53443717956543, + "learning_rate": 1.9854812679540477e-05, + "loss": 1.3198, + "step": 11910 + }, + { + "epoch": 0.14890372259306484, + "grad_norm": 0.2661478519439697, + "learning_rate": 1.9854664473243474e-05, + "loss": 0.0077, + "step": 11912 + }, + { + "epoch": 0.14892872321808046, + "grad_norm": 1.7135982513427734, + "learning_rate": 1.9854516191894736e-05, + "loss": 0.3466, + "step": 11914 + }, + { + "epoch": 0.14895372384309608, + "grad_norm": 0.6585392355918884, + "learning_rate": 1.9854367835495397e-05, + "loss": 0.704, + "step": 11916 + }, + { + "epoch": 0.1489787244681117, + "grad_norm": 9.773283958435059, + "learning_rate": 1.9854219404046587e-05, + "loss": 1.4937, + "step": 11918 + }, + { + "epoch": 0.14900372509312731, + "grad_norm": 2.2520430088043213, + "learning_rate": 1.9854070897549435e-05, + "loss": 1.5042, + "step": 11920 + }, + { + "epoch": 0.14902872571814296, + "grad_norm": 0.7548986077308655, + "learning_rate": 1.9853922316005072e-05, + "loss": 0.2229, + "step": 11922 + }, + { + "epoch": 0.14905372634315858, + "grad_norm": 2.4693708419799805, + "learning_rate": 1.9853773659414633e-05, + "loss": 1.9524, + "step": 11924 + }, + { + "epoch": 0.1490787269681742, + "grad_norm": 2.168133497238159, + "learning_rate": 1.985362492777924e-05, + "loss": 0.7769, + "step": 11926 + }, + { + "epoch": 0.14910372759318982, + "grad_norm": 3.946993589401245, + "learning_rate": 1.9853476121100043e-05, + "loss": 0.3173, + "step": 11928 + }, + { + "epoch": 0.14912872821820544, + "grad_norm": 4.674585819244385, + "learning_rate": 1.9853327239378157e-05, + "loss": 1.5029, + "step": 11930 + }, + { + "epoch": 0.1491537288432211, + "grad_norm": 2.4118127822875977, + "learning_rate": 1.985317828261473e-05, + "loss": 0.7654, + "step": 11932 + }, + { + "epoch": 0.1491787294682367, + "grad_norm": 4.411170959472656, + "learning_rate": 1.9853029250810888e-05, + "loss": 0.8948, + "step": 11934 + }, + { + "epoch": 0.14920373009325233, + "grad_norm": 3.904582977294922, + "learning_rate": 1.9852880143967767e-05, + "loss": 0.9226, + "step": 11936 + }, + { + "epoch": 0.14922873071826795, + "grad_norm": 4.753373622894287, + "learning_rate": 1.9852730962086506e-05, + "loss": 1.3539, + "step": 11938 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 3.458505630493164, + "learning_rate": 1.985258170516824e-05, + "loss": 1.1415, + "step": 11940 + }, + { + "epoch": 0.14927873196829922, + "grad_norm": 0.0007242323481477797, + "learning_rate": 1.9852432373214105e-05, + "loss": 1.249, + "step": 11942 + }, + { + "epoch": 0.14930373259331484, + "grad_norm": 5.749562740325928, + "learning_rate": 1.9852282966225235e-05, + "loss": 1.5673, + "step": 11944 + }, + { + "epoch": 0.14932873321833046, + "grad_norm": 2.882242202758789, + "learning_rate": 1.9852133484202775e-05, + "loss": 0.6089, + "step": 11946 + }, + { + "epoch": 0.14935373384334608, + "grad_norm": 2.791297197341919, + "learning_rate": 1.9851983927147857e-05, + "loss": 0.5017, + "step": 11948 + }, + { + "epoch": 0.1493787344683617, + "grad_norm": 2.2773213386535645, + "learning_rate": 1.9851834295061625e-05, + "loss": 0.425, + "step": 11950 + }, + { + "epoch": 0.14940373509337734, + "grad_norm": 1.3941481113433838, + "learning_rate": 1.9851684587945214e-05, + "loss": 0.374, + "step": 11952 + }, + { + "epoch": 0.14942873571839296, + "grad_norm": 7.184381484985352, + "learning_rate": 1.9851534805799767e-05, + "loss": 0.59, + "step": 11954 + }, + { + "epoch": 0.14945373634340858, + "grad_norm": 3.8006062507629395, + "learning_rate": 1.9851384948626425e-05, + "loss": 0.7633, + "step": 11956 + }, + { + "epoch": 0.1494787369684242, + "grad_norm": 3.3017613887786865, + "learning_rate": 1.9851235016426327e-05, + "loss": 1.2203, + "step": 11958 + }, + { + "epoch": 0.14950373759343982, + "grad_norm": 3.4992740154266357, + "learning_rate": 1.985108500920062e-05, + "loss": 1.1577, + "step": 11960 + }, + { + "epoch": 0.14952873821845547, + "grad_norm": 4.445028305053711, + "learning_rate": 1.9850934926950434e-05, + "loss": 0.0931, + "step": 11962 + }, + { + "epoch": 0.1495537388434711, + "grad_norm": 0.5720308423042297, + "learning_rate": 1.985078476967693e-05, + "loss": 0.4216, + "step": 11964 + }, + { + "epoch": 0.1495787394684867, + "grad_norm": 2.472127914428711, + "learning_rate": 1.9850634537381238e-05, + "loss": 1.2102, + "step": 11966 + }, + { + "epoch": 0.14960374009350233, + "grad_norm": 4.6942009925842285, + "learning_rate": 1.9850484230064508e-05, + "loss": 0.7658, + "step": 11968 + }, + { + "epoch": 0.14962874071851795, + "grad_norm": 2.5028727054595947, + "learning_rate": 1.985033384772788e-05, + "loss": 1.3056, + "step": 11970 + }, + { + "epoch": 0.1496537413435336, + "grad_norm": 0.0003280418459326029, + "learning_rate": 1.9850183390372507e-05, + "loss": 1.0433, + "step": 11972 + }, + { + "epoch": 0.14967874196854922, + "grad_norm": 1.9742763042449951, + "learning_rate": 1.9850032857999526e-05, + "loss": 0.8675, + "step": 11974 + }, + { + "epoch": 0.14970374259356484, + "grad_norm": 0.011451167985796928, + "learning_rate": 1.9849882250610088e-05, + "loss": 0.9706, + "step": 11976 + }, + { + "epoch": 0.14972874321858046, + "grad_norm": 2.6406524181365967, + "learning_rate": 1.9849731568205343e-05, + "loss": 0.9747, + "step": 11978 + }, + { + "epoch": 0.14975374384359608, + "grad_norm": 3.4703006744384766, + "learning_rate": 1.984958081078643e-05, + "loss": 0.6081, + "step": 11980 + }, + { + "epoch": 0.14977874446861172, + "grad_norm": 1.6843838691711426, + "learning_rate": 1.9849429978354505e-05, + "loss": 0.5468, + "step": 11982 + }, + { + "epoch": 0.14980374509362734, + "grad_norm": 3.336015224456787, + "learning_rate": 1.9849279070910717e-05, + "loss": 1.5723, + "step": 11984 + }, + { + "epoch": 0.14982874571864296, + "grad_norm": 4.747019290924072, + "learning_rate": 1.984912808845621e-05, + "loss": 1.3063, + "step": 11986 + }, + { + "epoch": 0.14985374634365858, + "grad_norm": 4.6439738273620605, + "learning_rate": 1.9848977030992136e-05, + "loss": 1.7218, + "step": 11988 + }, + { + "epoch": 0.1498787469686742, + "grad_norm": 2.5069310665130615, + "learning_rate": 1.9848825898519646e-05, + "loss": 0.5071, + "step": 11990 + }, + { + "epoch": 0.14990374759368985, + "grad_norm": 0.0005805697292089462, + "learning_rate": 1.984867469103989e-05, + "loss": 0.0, + "step": 11992 + }, + { + "epoch": 0.14992874821870547, + "grad_norm": 5.533572673797607, + "learning_rate": 1.984852340855402e-05, + "loss": 1.218, + "step": 11994 + }, + { + "epoch": 0.1499537488437211, + "grad_norm": 1.4932324886322021, + "learning_rate": 1.984837205106319e-05, + "loss": 1.2008, + "step": 11996 + }, + { + "epoch": 0.1499787494687367, + "grad_norm": 3.714975357055664, + "learning_rate": 1.9848220618568553e-05, + "loss": 1.6464, + "step": 11998 + }, + { + "epoch": 0.15000375009375233, + "grad_norm": 1.8235222101211548, + "learning_rate": 1.9848069111071257e-05, + "loss": 0.6965, + "step": 12000 + }, + { + "epoch": 0.15002875071876798, + "grad_norm": 2.7581100463867188, + "learning_rate": 1.984791752857246e-05, + "loss": 1.1809, + "step": 12002 + }, + { + "epoch": 0.1500537513437836, + "grad_norm": 4.161800861358643, + "learning_rate": 1.9847765871073315e-05, + "loss": 1.1309, + "step": 12004 + }, + { + "epoch": 0.15007875196879922, + "grad_norm": 3.9179527759552, + "learning_rate": 1.984761413857498e-05, + "loss": 1.786, + "step": 12006 + }, + { + "epoch": 0.15010375259381484, + "grad_norm": 1.2281394004821777, + "learning_rate": 1.9847462331078605e-05, + "loss": 0.7774, + "step": 12008 + }, + { + "epoch": 0.15012875321883046, + "grad_norm": 2.574235200881958, + "learning_rate": 1.9847310448585352e-05, + "loss": 1.9589, + "step": 12010 + }, + { + "epoch": 0.1501537538438461, + "grad_norm": 3.6848583221435547, + "learning_rate": 1.9847158491096374e-05, + "loss": 1.2798, + "step": 12012 + }, + { + "epoch": 0.15017875446886172, + "grad_norm": 0.17844648659229279, + "learning_rate": 1.984700645861283e-05, + "loss": 0.5346, + "step": 12014 + }, + { + "epoch": 0.15020375509387734, + "grad_norm": 9.68855094909668, + "learning_rate": 1.9846854351135877e-05, + "loss": 1.6646, + "step": 12016 + }, + { + "epoch": 0.15022875571889296, + "grad_norm": 2.870929718017578, + "learning_rate": 1.9846702168666675e-05, + "loss": 0.5963, + "step": 12018 + }, + { + "epoch": 0.15025375634390858, + "grad_norm": 3.2390148639678955, + "learning_rate": 1.984654991120638e-05, + "loss": 1.5013, + "step": 12020 + }, + { + "epoch": 0.15027875696892423, + "grad_norm": 2.2728538513183594, + "learning_rate": 1.9846397578756155e-05, + "loss": 1.017, + "step": 12022 + }, + { + "epoch": 0.15030375759393985, + "grad_norm": 6.262406349182129, + "learning_rate": 1.984624517131716e-05, + "loss": 1.859, + "step": 12024 + }, + { + "epoch": 0.15032875821895547, + "grad_norm": 4.632371425628662, + "learning_rate": 1.984609268889055e-05, + "loss": 0.9655, + "step": 12026 + }, + { + "epoch": 0.1503537588439711, + "grad_norm": 4.704301834106445, + "learning_rate": 1.9845940131477493e-05, + "loss": 0.9271, + "step": 12028 + }, + { + "epoch": 0.1503787594689867, + "grad_norm": 3.295097589492798, + "learning_rate": 1.9845787499079148e-05, + "loss": 1.6732, + "step": 12030 + }, + { + "epoch": 0.15040376009400236, + "grad_norm": 0.0007964130491018295, + "learning_rate": 1.9845634791696676e-05, + "loss": 0.8034, + "step": 12032 + }, + { + "epoch": 0.15042876071901798, + "grad_norm": 0.8674212098121643, + "learning_rate": 1.9845482009331243e-05, + "loss": 0.8102, + "step": 12034 + }, + { + "epoch": 0.1504537613440336, + "grad_norm": 2.854853868484497, + "learning_rate": 1.9845329151984013e-05, + "loss": 1.424, + "step": 12036 + }, + { + "epoch": 0.15047876196904922, + "grad_norm": 3.699810266494751, + "learning_rate": 1.9845176219656147e-05, + "loss": 1.8266, + "step": 12038 + }, + { + "epoch": 0.15050376259406484, + "grad_norm": 0.00545743340626359, + "learning_rate": 1.9845023212348812e-05, + "loss": 0.0001, + "step": 12040 + }, + { + "epoch": 0.15052876321908049, + "grad_norm": 3.8624491691589355, + "learning_rate": 1.9844870130063168e-05, + "loss": 1.3125, + "step": 12042 + }, + { + "epoch": 0.1505537638440961, + "grad_norm": 2.8811328411102295, + "learning_rate": 1.984471697280039e-05, + "loss": 1.961, + "step": 12044 + }, + { + "epoch": 0.15057876446911173, + "grad_norm": 1.7705845832824707, + "learning_rate": 1.9844563740561636e-05, + "loss": 0.0995, + "step": 12046 + }, + { + "epoch": 0.15060376509412735, + "grad_norm": 1.6469614505767822, + "learning_rate": 1.984441043334808e-05, + "loss": 0.8065, + "step": 12048 + }, + { + "epoch": 0.15062876571914297, + "grad_norm": 9.922962188720703, + "learning_rate": 1.9844257051160886e-05, + "loss": 0.7267, + "step": 12050 + }, + { + "epoch": 0.1506537663441586, + "grad_norm": 5.17098331451416, + "learning_rate": 1.9844103594001224e-05, + "loss": 0.6448, + "step": 12052 + }, + { + "epoch": 0.15067876696917423, + "grad_norm": 0.4449999928474426, + "learning_rate": 1.9843950061870256e-05, + "loss": 0.011, + "step": 12054 + }, + { + "epoch": 0.15070376759418985, + "grad_norm": 2.6510603427886963, + "learning_rate": 1.9843796454769157e-05, + "loss": 1.0201, + "step": 12056 + }, + { + "epoch": 0.15072876821920547, + "grad_norm": 3.980003595352173, + "learning_rate": 1.98436427726991e-05, + "loss": 1.4151, + "step": 12058 + }, + { + "epoch": 0.1507537688442211, + "grad_norm": 1.7370550632476807, + "learning_rate": 1.984348901566125e-05, + "loss": 1.39, + "step": 12060 + }, + { + "epoch": 0.15077876946923674, + "grad_norm": 0.0008303723880089819, + "learning_rate": 1.984333518365678e-05, + "loss": 0.0019, + "step": 12062 + }, + { + "epoch": 0.15080377009425236, + "grad_norm": 4.197431564331055, + "learning_rate": 1.984318127668686e-05, + "loss": 0.5408, + "step": 12064 + }, + { + "epoch": 0.15082877071926798, + "grad_norm": 1.669101357460022, + "learning_rate": 1.9843027294752666e-05, + "loss": 0.9045, + "step": 12066 + }, + { + "epoch": 0.1508537713442836, + "grad_norm": 0.42903703451156616, + "learning_rate": 1.9842873237855365e-05, + "loss": 0.4078, + "step": 12068 + }, + { + "epoch": 0.15087877196929922, + "grad_norm": 0.0007172105833888054, + "learning_rate": 1.9842719105996135e-05, + "loss": 0.0, + "step": 12070 + }, + { + "epoch": 0.15090377259431487, + "grad_norm": 2.629884719848633, + "learning_rate": 1.984256489917615e-05, + "loss": 1.4951, + "step": 12072 + }, + { + "epoch": 0.1509287732193305, + "grad_norm": 0.0013295868411660194, + "learning_rate": 1.9842410617396578e-05, + "loss": 0.6032, + "step": 12074 + }, + { + "epoch": 0.1509537738443461, + "grad_norm": 2.523071527481079, + "learning_rate": 1.9842256260658602e-05, + "loss": 0.6145, + "step": 12076 + }, + { + "epoch": 0.15097877446936173, + "grad_norm": 2.9145498275756836, + "learning_rate": 1.984210182896339e-05, + "loss": 0.919, + "step": 12078 + }, + { + "epoch": 0.15100377509437735, + "grad_norm": 2.775315523147583, + "learning_rate": 1.9841947322312126e-05, + "loss": 1.1772, + "step": 12080 + }, + { + "epoch": 0.151028775719393, + "grad_norm": 6.468705654144287, + "learning_rate": 1.9841792740705984e-05, + "loss": 1.6635, + "step": 12082 + }, + { + "epoch": 0.1510537763444086, + "grad_norm": 0.007172429468482733, + "learning_rate": 1.9841638084146137e-05, + "loss": 0.8961, + "step": 12084 + }, + { + "epoch": 0.15107877696942423, + "grad_norm": 2.871474504470825, + "learning_rate": 1.984148335263377e-05, + "loss": 0.7613, + "step": 12086 + }, + { + "epoch": 0.15110377759443985, + "grad_norm": 1.5600799322128296, + "learning_rate": 1.9841328546170056e-05, + "loss": 0.4099, + "step": 12088 + }, + { + "epoch": 0.15112877821945547, + "grad_norm": 4.032410144805908, + "learning_rate": 1.9841173664756172e-05, + "loss": 2.0763, + "step": 12090 + }, + { + "epoch": 0.15115377884447112, + "grad_norm": 8.643178939819336, + "learning_rate": 1.9841018708393306e-05, + "loss": 1.1442, + "step": 12092 + }, + { + "epoch": 0.15117877946948674, + "grad_norm": 3.525195360183716, + "learning_rate": 1.9840863677082628e-05, + "loss": 0.7164, + "step": 12094 + }, + { + "epoch": 0.15120378009450236, + "grad_norm": 7.565273284912109, + "learning_rate": 1.9840708570825325e-05, + "loss": 0.6419, + "step": 12096 + }, + { + "epoch": 0.15122878071951798, + "grad_norm": 3.96061110496521, + "learning_rate": 1.984055338962258e-05, + "loss": 1.891, + "step": 12098 + }, + { + "epoch": 0.1512537813445336, + "grad_norm": 3.313373327255249, + "learning_rate": 1.9840398133475572e-05, + "loss": 1.0921, + "step": 12100 + }, + { + "epoch": 0.15127878196954925, + "grad_norm": 6.12595272064209, + "learning_rate": 1.9840242802385483e-05, + "loss": 0.8637, + "step": 12102 + }, + { + "epoch": 0.15130378259456487, + "grad_norm": 0.9123494625091553, + "learning_rate": 1.9840087396353497e-05, + "loss": 0.1646, + "step": 12104 + }, + { + "epoch": 0.1513287832195805, + "grad_norm": 0.4488367438316345, + "learning_rate": 1.9839931915380792e-05, + "loss": 0.0089, + "step": 12106 + }, + { + "epoch": 0.1513537838445961, + "grad_norm": 5.342685222625732, + "learning_rate": 1.9839776359468562e-05, + "loss": 0.7957, + "step": 12108 + }, + { + "epoch": 0.15137878446961173, + "grad_norm": 2.6056690216064453, + "learning_rate": 1.9839620728617985e-05, + "loss": 1.54, + "step": 12110 + }, + { + "epoch": 0.15140378509462737, + "grad_norm": 2.0188863277435303, + "learning_rate": 1.9839465022830248e-05, + "loss": 0.3653, + "step": 12112 + }, + { + "epoch": 0.151428785719643, + "grad_norm": 3.574493885040283, + "learning_rate": 1.983930924210654e-05, + "loss": 0.0227, + "step": 12114 + }, + { + "epoch": 0.15145378634465861, + "grad_norm": 3.7627921104431152, + "learning_rate": 1.983915338644804e-05, + "loss": 0.7012, + "step": 12116 + }, + { + "epoch": 0.15147878696967423, + "grad_norm": 4.835992813110352, + "learning_rate": 1.983899745585594e-05, + "loss": 2.1203, + "step": 12118 + }, + { + "epoch": 0.15150378759468985, + "grad_norm": 3.5990490913391113, + "learning_rate": 1.983884145033143e-05, + "loss": 0.4809, + "step": 12120 + }, + { + "epoch": 0.1515287882197055, + "grad_norm": 2.4292638301849365, + "learning_rate": 1.983868536987569e-05, + "loss": 0.6622, + "step": 12122 + }, + { + "epoch": 0.15155378884472112, + "grad_norm": 3.1938815116882324, + "learning_rate": 1.9838529214489914e-05, + "loss": 0.813, + "step": 12124 + }, + { + "epoch": 0.15157878946973674, + "grad_norm": 5.113379001617432, + "learning_rate": 1.9838372984175293e-05, + "loss": 1.1936, + "step": 12126 + }, + { + "epoch": 0.15160379009475236, + "grad_norm": 2.2007391452789307, + "learning_rate": 1.9838216678933014e-05, + "loss": 0.1725, + "step": 12128 + }, + { + "epoch": 0.15162879071976798, + "grad_norm": 8.945024490356445, + "learning_rate": 1.9838060298764268e-05, + "loss": 1.2133, + "step": 12130 + }, + { + "epoch": 0.15165379134478363, + "grad_norm": 4.764930725097656, + "learning_rate": 1.9837903843670247e-05, + "loss": 0.277, + "step": 12132 + }, + { + "epoch": 0.15167879196979925, + "grad_norm": 2.6617629528045654, + "learning_rate": 1.983774731365214e-05, + "loss": 1.1164, + "step": 12134 + }, + { + "epoch": 0.15170379259481487, + "grad_norm": 0.005130556877702475, + "learning_rate": 1.983759070871114e-05, + "loss": 0.0287, + "step": 12136 + }, + { + "epoch": 0.1517287932198305, + "grad_norm": 3.1147620677948, + "learning_rate": 1.983743402884844e-05, + "loss": 1.7115, + "step": 12138 + }, + { + "epoch": 0.1517537938448461, + "grad_norm": 3.6851725578308105, + "learning_rate": 1.9837277274065236e-05, + "loss": 1.603, + "step": 12140 + }, + { + "epoch": 0.15177879446986176, + "grad_norm": 3.3778374195098877, + "learning_rate": 1.983712044436272e-05, + "loss": 0.61, + "step": 12142 + }, + { + "epoch": 0.15180379509487738, + "grad_norm": 4.864071846008301, + "learning_rate": 1.983696353974208e-05, + "loss": 0.9525, + "step": 12144 + }, + { + "epoch": 0.151828795719893, + "grad_norm": 1.2882513999938965, + "learning_rate": 1.9836806560204524e-05, + "loss": 0.6009, + "step": 12146 + }, + { + "epoch": 0.15185379634490861, + "grad_norm": 5.092992305755615, + "learning_rate": 1.9836649505751236e-05, + "loss": 0.216, + "step": 12148 + }, + { + "epoch": 0.15187879696992423, + "grad_norm": 3.2418789863586426, + "learning_rate": 1.9836492376383417e-05, + "loss": 1.8162, + "step": 12150 + }, + { + "epoch": 0.15190379759493988, + "grad_norm": 1.5525095462799072, + "learning_rate": 1.9836335172102266e-05, + "loss": 1.0269, + "step": 12152 + }, + { + "epoch": 0.1519287982199555, + "grad_norm": 1.7787935733795166, + "learning_rate": 1.9836177892908975e-05, + "loss": 0.9475, + "step": 12154 + }, + { + "epoch": 0.15195379884497112, + "grad_norm": 2.1755897998809814, + "learning_rate": 1.9836020538804745e-05, + "loss": 1.0178, + "step": 12156 + }, + { + "epoch": 0.15197879946998674, + "grad_norm": 13.65096378326416, + "learning_rate": 1.983586310979077e-05, + "loss": 1.8624, + "step": 12158 + }, + { + "epoch": 0.15200380009500236, + "grad_norm": 3.141150712966919, + "learning_rate": 1.9835705605868255e-05, + "loss": 1.3024, + "step": 12160 + }, + { + "epoch": 0.152028800720018, + "grad_norm": 2.7553951740264893, + "learning_rate": 1.9835548027038398e-05, + "loss": 2.1653, + "step": 12162 + }, + { + "epoch": 0.15205380134503363, + "grad_norm": 2.2622828483581543, + "learning_rate": 1.9835390373302398e-05, + "loss": 2.0317, + "step": 12164 + }, + { + "epoch": 0.15207880197004925, + "grad_norm": 3.3119893074035645, + "learning_rate": 1.9835232644661455e-05, + "loss": 1.4047, + "step": 12166 + }, + { + "epoch": 0.15210380259506487, + "grad_norm": 3.3122639656066895, + "learning_rate": 1.983507484111677e-05, + "loss": 0.8803, + "step": 12168 + }, + { + "epoch": 0.1521288032200805, + "grad_norm": 2.600353717803955, + "learning_rate": 1.983491696266955e-05, + "loss": 0.7943, + "step": 12170 + }, + { + "epoch": 0.15215380384509614, + "grad_norm": 2.093074083328247, + "learning_rate": 1.9834759009320987e-05, + "loss": 0.9333, + "step": 12172 + }, + { + "epoch": 0.15217880447011176, + "grad_norm": 4.213756084442139, + "learning_rate": 1.983460098107229e-05, + "loss": 0.7636, + "step": 12174 + }, + { + "epoch": 0.15220380509512738, + "grad_norm": 3.207235813140869, + "learning_rate": 1.983444287792467e-05, + "loss": 1.0137, + "step": 12176 + }, + { + "epoch": 0.152228805720143, + "grad_norm": 2.215423822402954, + "learning_rate": 1.983428469987932e-05, + "loss": 0.2199, + "step": 12178 + }, + { + "epoch": 0.15225380634515862, + "grad_norm": 2.818462610244751, + "learning_rate": 1.9834126446937448e-05, + "loss": 1.3694, + "step": 12180 + }, + { + "epoch": 0.15227880697017426, + "grad_norm": 3.2435920238494873, + "learning_rate": 1.983396811910026e-05, + "loss": 0.59, + "step": 12182 + }, + { + "epoch": 0.15230380759518988, + "grad_norm": 0.4251645505428314, + "learning_rate": 1.9833809716368958e-05, + "loss": 0.5087, + "step": 12184 + }, + { + "epoch": 0.1523288082202055, + "grad_norm": 1.821762204170227, + "learning_rate": 1.9833651238744756e-05, + "loss": 0.8931, + "step": 12186 + }, + { + "epoch": 0.15235380884522112, + "grad_norm": 0.08956807106733322, + "learning_rate": 1.9833492686228855e-05, + "loss": 0.2366, + "step": 12188 + }, + { + "epoch": 0.15237880947023674, + "grad_norm": 5.03751802444458, + "learning_rate": 1.9833334058822466e-05, + "loss": 0.3726, + "step": 12190 + }, + { + "epoch": 0.1524038100952524, + "grad_norm": 5.785824298858643, + "learning_rate": 1.9833175356526793e-05, + "loss": 0.7329, + "step": 12192 + }, + { + "epoch": 0.152428810720268, + "grad_norm": 2.52066707611084, + "learning_rate": 1.983301657934305e-05, + "loss": 0.5743, + "step": 12194 + }, + { + "epoch": 0.15245381134528363, + "grad_norm": 3.6531128883361816, + "learning_rate": 1.983285772727244e-05, + "loss": 1.1763, + "step": 12196 + }, + { + "epoch": 0.15247881197029925, + "grad_norm": 3.3388803005218506, + "learning_rate": 1.983269880031618e-05, + "loss": 1.7312, + "step": 12198 + }, + { + "epoch": 0.15250381259531487, + "grad_norm": 1.9052172899246216, + "learning_rate": 1.9832539798475473e-05, + "loss": 0.4932, + "step": 12200 + }, + { + "epoch": 0.15252881322033052, + "grad_norm": 2.2541451454162598, + "learning_rate": 1.9832380721751535e-05, + "loss": 1.2214, + "step": 12202 + }, + { + "epoch": 0.15255381384534614, + "grad_norm": 1.2454018592834473, + "learning_rate": 1.9832221570145575e-05, + "loss": 0.0903, + "step": 12204 + }, + { + "epoch": 0.15257881447036176, + "grad_norm": 4.403197765350342, + "learning_rate": 1.9832062343658806e-05, + "loss": 1.4895, + "step": 12206 + }, + { + "epoch": 0.15260381509537738, + "grad_norm": 3.322340488433838, + "learning_rate": 1.983190304229244e-05, + "loss": 0.8996, + "step": 12208 + }, + { + "epoch": 0.152628815720393, + "grad_norm": 0.5555328726768494, + "learning_rate": 1.983174366604769e-05, + "loss": 0.7268, + "step": 12210 + }, + { + "epoch": 0.15265381634540864, + "grad_norm": 3.512820243835449, + "learning_rate": 1.9831584214925775e-05, + "loss": 1.3048, + "step": 12212 + }, + { + "epoch": 0.15267881697042426, + "grad_norm": 2.662903308868408, + "learning_rate": 1.9831424688927902e-05, + "loss": 0.3414, + "step": 12214 + }, + { + "epoch": 0.15270381759543988, + "grad_norm": 2.4542765617370605, + "learning_rate": 1.9831265088055287e-05, + "loss": 0.301, + "step": 12216 + }, + { + "epoch": 0.1527288182204555, + "grad_norm": 2.607954263687134, + "learning_rate": 1.9831105412309148e-05, + "loss": 0.9542, + "step": 12218 + }, + { + "epoch": 0.15275381884547112, + "grad_norm": 3.733161211013794, + "learning_rate": 1.9830945661690703e-05, + "loss": 0.6445, + "step": 12220 + }, + { + "epoch": 0.15277881947048677, + "grad_norm": 3.163971424102783, + "learning_rate": 1.9830785836201167e-05, + "loss": 0.6131, + "step": 12222 + }, + { + "epoch": 0.1528038200955024, + "grad_norm": 5.549933433532715, + "learning_rate": 1.983062593584175e-05, + "loss": 1.5772, + "step": 12224 + }, + { + "epoch": 0.152828820720518, + "grad_norm": 2.609114170074463, + "learning_rate": 1.9830465960613682e-05, + "loss": 1.5574, + "step": 12226 + }, + { + "epoch": 0.15285382134553363, + "grad_norm": 0.004579655360430479, + "learning_rate": 1.9830305910518174e-05, + "loss": 0.3486, + "step": 12228 + }, + { + "epoch": 0.15287882197054925, + "grad_norm": 0.8078145384788513, + "learning_rate": 1.9830145785556446e-05, + "loss": 0.0189, + "step": 12230 + }, + { + "epoch": 0.1529038225955649, + "grad_norm": 9.006863594055176, + "learning_rate": 1.9829985585729718e-05, + "loss": 1.6677, + "step": 12232 + }, + { + "epoch": 0.15292882322058052, + "grad_norm": 3.1050078868865967, + "learning_rate": 1.982982531103921e-05, + "loss": 0.6223, + "step": 12234 + }, + { + "epoch": 0.15295382384559614, + "grad_norm": 2.957597494125366, + "learning_rate": 1.9829664961486142e-05, + "loss": 1.6741, + "step": 12236 + }, + { + "epoch": 0.15297882447061176, + "grad_norm": 2.4489645957946777, + "learning_rate": 1.9829504537071735e-05, + "loss": 0.8015, + "step": 12238 + }, + { + "epoch": 0.15300382509562738, + "grad_norm": 1.9351247549057007, + "learning_rate": 1.9829344037797213e-05, + "loss": 0.5062, + "step": 12240 + }, + { + "epoch": 0.15302882572064302, + "grad_norm": 0.4126090705394745, + "learning_rate": 1.9829183463663796e-05, + "loss": 1.0628, + "step": 12242 + }, + { + "epoch": 0.15305382634565864, + "grad_norm": 3.9562411308288574, + "learning_rate": 1.9829022814672707e-05, + "loss": 1.7523, + "step": 12244 + }, + { + "epoch": 0.15307882697067426, + "grad_norm": 3.357912540435791, + "learning_rate": 1.982886209082517e-05, + "loss": 1.1724, + "step": 12246 + }, + { + "epoch": 0.15310382759568988, + "grad_norm": 3.54248046875, + "learning_rate": 1.982870129212241e-05, + "loss": 1.3539, + "step": 12248 + }, + { + "epoch": 0.1531288282207055, + "grad_norm": 6.928567409515381, + "learning_rate": 1.9828540418565652e-05, + "loss": 1.6314, + "step": 12250 + }, + { + "epoch": 0.15315382884572115, + "grad_norm": 2.3825454711914062, + "learning_rate": 1.9828379470156118e-05, + "loss": 0.1433, + "step": 12252 + }, + { + "epoch": 0.15317882947073677, + "grad_norm": 5.577332019805908, + "learning_rate": 1.9828218446895037e-05, + "loss": 2.1779, + "step": 12254 + }, + { + "epoch": 0.1532038300957524, + "grad_norm": 0.09222619235515594, + "learning_rate": 1.982805734878363e-05, + "loss": 0.8393, + "step": 12256 + }, + { + "epoch": 0.153228830720768, + "grad_norm": 3.030247688293457, + "learning_rate": 1.9827896175823134e-05, + "loss": 0.6271, + "step": 12258 + }, + { + "epoch": 0.15325383134578363, + "grad_norm": 2.651611804962158, + "learning_rate": 1.9827734928014772e-05, + "loss": 1.0516, + "step": 12260 + }, + { + "epoch": 0.15327883197079928, + "grad_norm": 0.5342461466789246, + "learning_rate": 1.9827573605359766e-05, + "loss": 0.1118, + "step": 12262 + }, + { + "epoch": 0.1533038325958149, + "grad_norm": 2.9452617168426514, + "learning_rate": 1.982741220785935e-05, + "loss": 0.4613, + "step": 12264 + }, + { + "epoch": 0.15332883322083052, + "grad_norm": 9.992552757263184, + "learning_rate": 1.982725073551475e-05, + "loss": 1.1673, + "step": 12266 + }, + { + "epoch": 0.15335383384584614, + "grad_norm": 1.63748300075531, + "learning_rate": 1.9827089188327203e-05, + "loss": 0.1194, + "step": 12268 + }, + { + "epoch": 0.15337883447086176, + "grad_norm": 0.01349041610956192, + "learning_rate": 1.982692756629793e-05, + "loss": 1.1005, + "step": 12270 + }, + { + "epoch": 0.1534038350958774, + "grad_norm": 1.4863767623901367, + "learning_rate": 1.982676586942817e-05, + "loss": 1.4266, + "step": 12272 + }, + { + "epoch": 0.15342883572089303, + "grad_norm": 2.5739924907684326, + "learning_rate": 1.9826604097719147e-05, + "loss": 0.9982, + "step": 12274 + }, + { + "epoch": 0.15345383634590865, + "grad_norm": 0.004670318681746721, + "learning_rate": 1.98264422511721e-05, + "loss": 0.7969, + "step": 12276 + }, + { + "epoch": 0.15347883697092427, + "grad_norm": 1.7593718767166138, + "learning_rate": 1.9826280329788255e-05, + "loss": 0.8222, + "step": 12278 + }, + { + "epoch": 0.15350383759593988, + "grad_norm": 1.4867825508117676, + "learning_rate": 1.9826118333568855e-05, + "loss": 0.205, + "step": 12280 + }, + { + "epoch": 0.15352883822095553, + "grad_norm": 2.796107053756714, + "learning_rate": 1.982595626251512e-05, + "loss": 0.712, + "step": 12282 + }, + { + "epoch": 0.15355383884597115, + "grad_norm": 1.7574468851089478, + "learning_rate": 1.9825794116628298e-05, + "loss": 1.2663, + "step": 12284 + }, + { + "epoch": 0.15357883947098677, + "grad_norm": 0.07577385753393173, + "learning_rate": 1.982563189590961e-05, + "loss": 0.1776, + "step": 12286 + }, + { + "epoch": 0.1536038400960024, + "grad_norm": 2.9457645416259766, + "learning_rate": 1.9825469600360304e-05, + "loss": 0.6297, + "step": 12288 + }, + { + "epoch": 0.153628840721018, + "grad_norm": 0.015543724410235882, + "learning_rate": 1.9825307229981613e-05, + "loss": 0.298, + "step": 12290 + }, + { + "epoch": 0.15365384134603366, + "grad_norm": 0.016935458406805992, + "learning_rate": 1.9825144784774767e-05, + "loss": 0.3879, + "step": 12292 + }, + { + "epoch": 0.15367884197104928, + "grad_norm": 3.559288740158081, + "learning_rate": 1.982498226474101e-05, + "loss": 0.6103, + "step": 12294 + }, + { + "epoch": 0.1537038425960649, + "grad_norm": 0.8759868741035461, + "learning_rate": 1.9824819669881575e-05, + "loss": 0.9383, + "step": 12296 + }, + { + "epoch": 0.15372884322108052, + "grad_norm": 2.0878093242645264, + "learning_rate": 1.9824657000197707e-05, + "loss": 0.7792, + "step": 12298 + }, + { + "epoch": 0.15375384384609614, + "grad_norm": 2.611266613006592, + "learning_rate": 1.9824494255690637e-05, + "loss": 0.7694, + "step": 12300 + }, + { + "epoch": 0.1537788444711118, + "grad_norm": 5.4460248947143555, + "learning_rate": 1.9824331436361614e-05, + "loss": 1.3353, + "step": 12302 + }, + { + "epoch": 0.1538038450961274, + "grad_norm": 3.59495210647583, + "learning_rate": 1.9824168542211866e-05, + "loss": 0.8144, + "step": 12304 + }, + { + "epoch": 0.15382884572114303, + "grad_norm": 3.8152248859405518, + "learning_rate": 1.982400557324264e-05, + "loss": 0.3814, + "step": 12306 + }, + { + "epoch": 0.15385384634615865, + "grad_norm": 2.3891825675964355, + "learning_rate": 1.982384252945518e-05, + "loss": 0.1275, + "step": 12308 + }, + { + "epoch": 0.15387884697117427, + "grad_norm": 2.7128183841705322, + "learning_rate": 1.982367941085072e-05, + "loss": 0.6675, + "step": 12310 + }, + { + "epoch": 0.1539038475961899, + "grad_norm": 0.007342178840190172, + "learning_rate": 1.982351621743051e-05, + "loss": 0.333, + "step": 12312 + }, + { + "epoch": 0.15392884822120553, + "grad_norm": 0.07488783448934555, + "learning_rate": 1.9823352949195786e-05, + "loss": 0.6513, + "step": 12314 + }, + { + "epoch": 0.15395384884622115, + "grad_norm": 0.008443186990916729, + "learning_rate": 1.9823189606147802e-05, + "loss": 0.2091, + "step": 12316 + }, + { + "epoch": 0.15397884947123677, + "grad_norm": 3.464405059814453, + "learning_rate": 1.9823026188287788e-05, + "loss": 1.5773, + "step": 12318 + }, + { + "epoch": 0.1540038500962524, + "grad_norm": 1.664629340171814, + "learning_rate": 1.9822862695617e-05, + "loss": 0.1249, + "step": 12320 + }, + { + "epoch": 0.15402885072126804, + "grad_norm": 4.623106002807617, + "learning_rate": 1.9822699128136677e-05, + "loss": 0.6972, + "step": 12322 + }, + { + "epoch": 0.15405385134628366, + "grad_norm": 3.163147449493408, + "learning_rate": 1.9822535485848067e-05, + "loss": 0.8986, + "step": 12324 + }, + { + "epoch": 0.15407885197129928, + "grad_norm": 0.007055079098790884, + "learning_rate": 1.9822371768752418e-05, + "loss": 0.5162, + "step": 12326 + }, + { + "epoch": 0.1541038525963149, + "grad_norm": 2.602804660797119, + "learning_rate": 1.9822207976850972e-05, + "loss": 0.6571, + "step": 12328 + }, + { + "epoch": 0.15412885322133052, + "grad_norm": 0.502970278263092, + "learning_rate": 1.9822044110144978e-05, + "loss": 0.4091, + "step": 12330 + }, + { + "epoch": 0.15415385384634617, + "grad_norm": 1.6299177408218384, + "learning_rate": 1.9821880168635687e-05, + "loss": 1.29, + "step": 12332 + }, + { + "epoch": 0.1541788544713618, + "grad_norm": 0.01285543479025364, + "learning_rate": 1.9821716152324345e-05, + "loss": 0.1022, + "step": 12334 + }, + { + "epoch": 0.1542038550963774, + "grad_norm": 1.9032132625579834, + "learning_rate": 1.98215520612122e-05, + "loss": 0.3053, + "step": 12336 + }, + { + "epoch": 0.15422885572139303, + "grad_norm": 2.506065845489502, + "learning_rate": 1.9821387895300507e-05, + "loss": 0.7275, + "step": 12338 + }, + { + "epoch": 0.15425385634640865, + "grad_norm": 0.55763840675354, + "learning_rate": 1.982122365459051e-05, + "loss": 0.5385, + "step": 12340 + }, + { + "epoch": 0.1542788569714243, + "grad_norm": 2.1319074630737305, + "learning_rate": 1.982105933908346e-05, + "loss": 0.2285, + "step": 12342 + }, + { + "epoch": 0.15430385759643991, + "grad_norm": 3.8311710357666016, + "learning_rate": 1.9820894948780617e-05, + "loss": 1.6824, + "step": 12344 + }, + { + "epoch": 0.15432885822145553, + "grad_norm": 6.010688781738281, + "learning_rate": 1.9820730483683226e-05, + "loss": 0.4994, + "step": 12346 + }, + { + "epoch": 0.15435385884647115, + "grad_norm": 3.5871682167053223, + "learning_rate": 1.9820565943792535e-05, + "loss": 2.0986, + "step": 12348 + }, + { + "epoch": 0.15437885947148677, + "grad_norm": 0.004660353064537048, + "learning_rate": 1.9820401329109807e-05, + "loss": 1.0783, + "step": 12350 + }, + { + "epoch": 0.15440386009650242, + "grad_norm": 3.5480825901031494, + "learning_rate": 1.982023663963629e-05, + "loss": 2.0057, + "step": 12352 + }, + { + "epoch": 0.15442886072151804, + "grad_norm": 2.94834303855896, + "learning_rate": 1.982007187537324e-05, + "loss": 0.7747, + "step": 12354 + }, + { + "epoch": 0.15445386134653366, + "grad_norm": 3.508707046508789, + "learning_rate": 1.981990703632191e-05, + "loss": 0.5543, + "step": 12356 + }, + { + "epoch": 0.15447886197154928, + "grad_norm": 1.363485336303711, + "learning_rate": 1.981974212248356e-05, + "loss": 1.129, + "step": 12358 + }, + { + "epoch": 0.1545038625965649, + "grad_norm": 4.21367883682251, + "learning_rate": 1.981957713385944e-05, + "loss": 0.8677, + "step": 12360 + }, + { + "epoch": 0.15452886322158055, + "grad_norm": 2.5745551586151123, + "learning_rate": 1.9819412070450808e-05, + "loss": 0.7278, + "step": 12362 + }, + { + "epoch": 0.15455386384659617, + "grad_norm": 11.508672714233398, + "learning_rate": 1.9819246932258923e-05, + "loss": 0.7194, + "step": 12364 + }, + { + "epoch": 0.1545788644716118, + "grad_norm": 2.996885061264038, + "learning_rate": 1.9819081719285044e-05, + "loss": 0.7508, + "step": 12366 + }, + { + "epoch": 0.1546038650966274, + "grad_norm": 2.9654805660247803, + "learning_rate": 1.9818916431530427e-05, + "loss": 1.4897, + "step": 12368 + }, + { + "epoch": 0.15462886572164303, + "grad_norm": 3.6744301319122314, + "learning_rate": 1.981875106899633e-05, + "loss": 1.2507, + "step": 12370 + }, + { + "epoch": 0.15465386634665867, + "grad_norm": 3.6839680671691895, + "learning_rate": 1.981858563168401e-05, + "loss": 1.3087, + "step": 12372 + }, + { + "epoch": 0.1546788669716743, + "grad_norm": 0.7357863783836365, + "learning_rate": 1.9818420119594735e-05, + "loss": 0.2905, + "step": 12374 + }, + { + "epoch": 0.15470386759668991, + "grad_norm": 3.2557790279388428, + "learning_rate": 1.9818254532729758e-05, + "loss": 0.9177, + "step": 12376 + }, + { + "epoch": 0.15472886822170553, + "grad_norm": 2.8243019580841064, + "learning_rate": 1.9818088871090343e-05, + "loss": 0.7457, + "step": 12378 + }, + { + "epoch": 0.15475386884672115, + "grad_norm": 3.9571492671966553, + "learning_rate": 1.9817923134677756e-05, + "loss": 1.4745, + "step": 12380 + }, + { + "epoch": 0.1547788694717368, + "grad_norm": 1.6479016542434692, + "learning_rate": 1.981775732349325e-05, + "loss": 0.6759, + "step": 12382 + }, + { + "epoch": 0.15480387009675242, + "grad_norm": 4.113266944885254, + "learning_rate": 1.9817591437538095e-05, + "loss": 1.6069, + "step": 12384 + }, + { + "epoch": 0.15482887072176804, + "grad_norm": 0.007365765981376171, + "learning_rate": 1.981742547681355e-05, + "loss": 0.4429, + "step": 12386 + }, + { + "epoch": 0.15485387134678366, + "grad_norm": 7.197845935821533, + "learning_rate": 1.981725944132088e-05, + "loss": 1.7032, + "step": 12388 + }, + { + "epoch": 0.15487887197179928, + "grad_norm": 13.672384262084961, + "learning_rate": 1.9817093331061355e-05, + "loss": 0.3539, + "step": 12390 + }, + { + "epoch": 0.15490387259681493, + "grad_norm": 0.019316814839839935, + "learning_rate": 1.9816927146036233e-05, + "loss": 0.0014, + "step": 12392 + }, + { + "epoch": 0.15492887322183055, + "grad_norm": 4.470597743988037, + "learning_rate": 1.9816760886246782e-05, + "loss": 1.4966, + "step": 12394 + }, + { + "epoch": 0.15495387384684617, + "grad_norm": 0.02467506192624569, + "learning_rate": 1.981659455169427e-05, + "loss": 0.0106, + "step": 12396 + }, + { + "epoch": 0.1549788744718618, + "grad_norm": 0.6463467478752136, + "learning_rate": 1.981642814237996e-05, + "loss": 0.1959, + "step": 12398 + }, + { + "epoch": 0.1550038750968774, + "grad_norm": 4.117307662963867, + "learning_rate": 1.9816261658305118e-05, + "loss": 1.2702, + "step": 12400 + }, + { + "epoch": 0.15502887572189306, + "grad_norm": 3.5324208736419678, + "learning_rate": 1.981609509947102e-05, + "loss": 1.2799, + "step": 12402 + }, + { + "epoch": 0.15505387634690868, + "grad_norm": 0.024311920627951622, + "learning_rate": 1.981592846587893e-05, + "loss": 0.5766, + "step": 12404 + }, + { + "epoch": 0.1550788769719243, + "grad_norm": 2.630964517593384, + "learning_rate": 1.9815761757530114e-05, + "loss": 0.7299, + "step": 12406 + }, + { + "epoch": 0.15510387759693992, + "grad_norm": 3.8746702671051025, + "learning_rate": 1.9815594974425846e-05, + "loss": 1.1383, + "step": 12408 + }, + { + "epoch": 0.15512887822195554, + "grad_norm": 2.644148111343384, + "learning_rate": 1.9815428116567393e-05, + "loss": 1.2095, + "step": 12410 + }, + { + "epoch": 0.15515387884697118, + "grad_norm": 1.711533784866333, + "learning_rate": 1.981526118395603e-05, + "loss": 0.6893, + "step": 12412 + }, + { + "epoch": 0.1551788794719868, + "grad_norm": 4.065579414367676, + "learning_rate": 1.9815094176593027e-05, + "loss": 0.7468, + "step": 12414 + }, + { + "epoch": 0.15520388009700242, + "grad_norm": 0.007477100472897291, + "learning_rate": 1.9814927094479654e-05, + "loss": 0.2466, + "step": 12416 + }, + { + "epoch": 0.15522888072201804, + "grad_norm": 0.7898657321929932, + "learning_rate": 1.981475993761718e-05, + "loss": 0.244, + "step": 12418 + }, + { + "epoch": 0.15525388134703366, + "grad_norm": 4.884597301483154, + "learning_rate": 1.9814592706006888e-05, + "loss": 1.9214, + "step": 12420 + }, + { + "epoch": 0.1552788819720493, + "grad_norm": 1.696520209312439, + "learning_rate": 1.9814425399650043e-05, + "loss": 1.0015, + "step": 12422 + }, + { + "epoch": 0.15530388259706493, + "grad_norm": 1.6554991006851196, + "learning_rate": 1.9814258018547924e-05, + "loss": 0.1788, + "step": 12424 + }, + { + "epoch": 0.15532888322208055, + "grad_norm": 2.19197416305542, + "learning_rate": 1.98140905627018e-05, + "loss": 0.6311, + "step": 12426 + }, + { + "epoch": 0.15535388384709617, + "grad_norm": 4.0254807472229, + "learning_rate": 1.9813923032112953e-05, + "loss": 0.9669, + "step": 12428 + }, + { + "epoch": 0.1553788844721118, + "grad_norm": 8.639084815979004, + "learning_rate": 1.9813755426782653e-05, + "loss": 2.0041, + "step": 12430 + }, + { + "epoch": 0.15540388509712744, + "grad_norm": 5.638019561767578, + "learning_rate": 1.9813587746712185e-05, + "loss": 2.3692, + "step": 12432 + }, + { + "epoch": 0.15542888572214306, + "grad_norm": 2.740520715713501, + "learning_rate": 1.9813419991902816e-05, + "loss": 0.6395, + "step": 12434 + }, + { + "epoch": 0.15545388634715868, + "grad_norm": 2.91569447517395, + "learning_rate": 1.981325216235583e-05, + "loss": 1.1261, + "step": 12436 + }, + { + "epoch": 0.1554788869721743, + "grad_norm": 11.410320281982422, + "learning_rate": 1.9813084258072502e-05, + "loss": 1.2395, + "step": 12438 + }, + { + "epoch": 0.15550388759718992, + "grad_norm": 1.015902042388916, + "learning_rate": 1.9812916279054112e-05, + "loss": 1.1282, + "step": 12440 + }, + { + "epoch": 0.15552888822220556, + "grad_norm": 0.016230478882789612, + "learning_rate": 1.981274822530194e-05, + "loss": 0.8648, + "step": 12442 + }, + { + "epoch": 0.15555388884722118, + "grad_norm": 2.0466225147247314, + "learning_rate": 1.9812580096817268e-05, + "loss": 0.4831, + "step": 12444 + }, + { + "epoch": 0.1555788894722368, + "grad_norm": 4.4031982421875, + "learning_rate": 1.981241189360137e-05, + "loss": 0.9603, + "step": 12446 + }, + { + "epoch": 0.15560389009725242, + "grad_norm": 2.0228230953216553, + "learning_rate": 1.9812243615655527e-05, + "loss": 1.1206, + "step": 12448 + }, + { + "epoch": 0.15562889072226804, + "grad_norm": 1.9363411664962769, + "learning_rate": 1.981207526298103e-05, + "loss": 0.4413, + "step": 12450 + }, + { + "epoch": 0.1556538913472837, + "grad_norm": 2.193681001663208, + "learning_rate": 1.9811906835579157e-05, + "loss": 0.4896, + "step": 12452 + }, + { + "epoch": 0.1556788919722993, + "grad_norm": 2.5145411491394043, + "learning_rate": 1.9811738333451185e-05, + "loss": 0.4569, + "step": 12454 + }, + { + "epoch": 0.15570389259731493, + "grad_norm": 0.03102232702076435, + "learning_rate": 1.98115697565984e-05, + "loss": 0.0985, + "step": 12456 + }, + { + "epoch": 0.15572889322233055, + "grad_norm": 3.5796635150909424, + "learning_rate": 1.981140110502209e-05, + "loss": 1.576, + "step": 12458 + }, + { + "epoch": 0.15575389384734617, + "grad_norm": 3.6493778228759766, + "learning_rate": 1.9811232378723537e-05, + "loss": 0.6147, + "step": 12460 + }, + { + "epoch": 0.15577889447236182, + "grad_norm": 3.9348621368408203, + "learning_rate": 1.9811063577704025e-05, + "loss": 1.5268, + "step": 12462 + }, + { + "epoch": 0.15580389509737744, + "grad_norm": 5.240859031677246, + "learning_rate": 1.981089470196484e-05, + "loss": 1.5485, + "step": 12464 + }, + { + "epoch": 0.15582889572239306, + "grad_norm": 0.00894826091825962, + "learning_rate": 1.9810725751507267e-05, + "loss": 0.0221, + "step": 12466 + }, + { + "epoch": 0.15585389634740868, + "grad_norm": 4.977940082550049, + "learning_rate": 1.9810556726332595e-05, + "loss": 1.097, + "step": 12468 + }, + { + "epoch": 0.1558788969724243, + "grad_norm": 0.0051801917143166065, + "learning_rate": 1.981038762644211e-05, + "loss": 0.0001, + "step": 12470 + }, + { + "epoch": 0.15590389759743994, + "grad_norm": 5.420718193054199, + "learning_rate": 1.98102184518371e-05, + "loss": 1.7196, + "step": 12472 + }, + { + "epoch": 0.15592889822245556, + "grad_norm": 4.598221778869629, + "learning_rate": 1.9810049202518855e-05, + "loss": 2.0111, + "step": 12474 + }, + { + "epoch": 0.15595389884747118, + "grad_norm": 1.2954823970794678, + "learning_rate": 1.980987987848866e-05, + "loss": 0.5565, + "step": 12476 + }, + { + "epoch": 0.1559788994724868, + "grad_norm": 0.00824074074625969, + "learning_rate": 1.980971047974781e-05, + "loss": 0.8229, + "step": 12478 + }, + { + "epoch": 0.15600390009750242, + "grad_norm": 2.9313836097717285, + "learning_rate": 1.980954100629759e-05, + "loss": 0.8535, + "step": 12480 + }, + { + "epoch": 0.15602890072251807, + "grad_norm": 5.950128078460693, + "learning_rate": 1.9809371458139295e-05, + "loss": 0.9385, + "step": 12482 + }, + { + "epoch": 0.1560539013475337, + "grad_norm": 0.8367852568626404, + "learning_rate": 1.9809201835274215e-05, + "loss": 0.3294, + "step": 12484 + }, + { + "epoch": 0.1560789019725493, + "grad_norm": 2.417250871658325, + "learning_rate": 1.9809032137703637e-05, + "loss": 0.7852, + "step": 12486 + }, + { + "epoch": 0.15610390259756493, + "grad_norm": 3.333519697189331, + "learning_rate": 1.9808862365428864e-05, + "loss": 1.0642, + "step": 12488 + }, + { + "epoch": 0.15612890322258055, + "grad_norm": 0.8471091389656067, + "learning_rate": 1.9808692518451175e-05, + "loss": 0.191, + "step": 12490 + }, + { + "epoch": 0.1561539038475962, + "grad_norm": 8.283258438110352, + "learning_rate": 1.9808522596771874e-05, + "loss": 1.2362, + "step": 12492 + }, + { + "epoch": 0.15617890447261182, + "grad_norm": 1.621987223625183, + "learning_rate": 1.980835260039225e-05, + "loss": 0.6465, + "step": 12494 + }, + { + "epoch": 0.15620390509762744, + "grad_norm": 3.4845597743988037, + "learning_rate": 1.98081825293136e-05, + "loss": 1.5136, + "step": 12496 + }, + { + "epoch": 0.15622890572264306, + "grad_norm": 4.539579391479492, + "learning_rate": 1.9808012383537223e-05, + "loss": 1.2195, + "step": 12498 + }, + { + "epoch": 0.1562539063476587, + "grad_norm": 3.34869384765625, + "learning_rate": 1.980784216306441e-05, + "loss": 1.9837, + "step": 12500 + }, + { + "epoch": 0.15627890697267433, + "grad_norm": 3.278270959854126, + "learning_rate": 1.9807671867896454e-05, + "loss": 0.8375, + "step": 12502 + }, + { + "epoch": 0.15630390759768995, + "grad_norm": 1.2664058208465576, + "learning_rate": 1.9807501498034657e-05, + "loss": 0.0412, + "step": 12504 + }, + { + "epoch": 0.15632890822270556, + "grad_norm": 3.8790667057037354, + "learning_rate": 1.9807331053480316e-05, + "loss": 1.8207, + "step": 12506 + }, + { + "epoch": 0.15635390884772118, + "grad_norm": 3.5619618892669678, + "learning_rate": 1.980716053423473e-05, + "loss": 0.91, + "step": 12508 + }, + { + "epoch": 0.15637890947273683, + "grad_norm": 1.9417238235473633, + "learning_rate": 1.9806989940299195e-05, + "loss": 0.9324, + "step": 12510 + }, + { + "epoch": 0.15640391009775245, + "grad_norm": 5.075495719909668, + "learning_rate": 1.980681927167501e-05, + "loss": 1.1277, + "step": 12512 + }, + { + "epoch": 0.15642891072276807, + "grad_norm": 4.714689254760742, + "learning_rate": 1.9806648528363478e-05, + "loss": 1.3734, + "step": 12514 + }, + { + "epoch": 0.1564539113477837, + "grad_norm": 4.035572528839111, + "learning_rate": 1.9806477710365897e-05, + "loss": 0.9353, + "step": 12516 + }, + { + "epoch": 0.1564789119727993, + "grad_norm": 1.8162264823913574, + "learning_rate": 1.980630681768357e-05, + "loss": 1.0827, + "step": 12518 + }, + { + "epoch": 0.15650391259781496, + "grad_norm": 2.386483669281006, + "learning_rate": 1.9806135850317794e-05, + "loss": 0.1083, + "step": 12520 + }, + { + "epoch": 0.15652891322283058, + "grad_norm": 3.135364532470703, + "learning_rate": 1.9805964808269878e-05, + "loss": 0.3436, + "step": 12522 + }, + { + "epoch": 0.1565539138478462, + "grad_norm": 3.4594428539276123, + "learning_rate": 1.980579369154112e-05, + "loss": 2.3341, + "step": 12524 + }, + { + "epoch": 0.15657891447286182, + "grad_norm": 3.5737438201904297, + "learning_rate": 1.9805622500132822e-05, + "loss": 0.69, + "step": 12526 + }, + { + "epoch": 0.15660391509787744, + "grad_norm": 4.196369171142578, + "learning_rate": 1.980545123404629e-05, + "loss": 0.7454, + "step": 12528 + }, + { + "epoch": 0.1566289157228931, + "grad_norm": 0.454933762550354, + "learning_rate": 1.980527989328283e-05, + "loss": 0.0228, + "step": 12530 + }, + { + "epoch": 0.1566539163479087, + "grad_norm": 2.245018243789673, + "learning_rate": 1.9805108477843744e-05, + "loss": 0.5369, + "step": 12532 + }, + { + "epoch": 0.15667891697292433, + "grad_norm": 0.012954109348356724, + "learning_rate": 1.9804936987730333e-05, + "loss": 0.3534, + "step": 12534 + }, + { + "epoch": 0.15670391759793995, + "grad_norm": 5.190274238586426, + "learning_rate": 1.9804765422943915e-05, + "loss": 1.2264, + "step": 12536 + }, + { + "epoch": 0.15672891822295557, + "grad_norm": 3.333258628845215, + "learning_rate": 1.980459378348579e-05, + "loss": 1.1471, + "step": 12538 + }, + { + "epoch": 0.1567539188479712, + "grad_norm": 2.811014413833618, + "learning_rate": 1.980442206935726e-05, + "loss": 1.3835, + "step": 12540 + }, + { + "epoch": 0.15677891947298683, + "grad_norm": 0.006977601908147335, + "learning_rate": 1.9804250280559643e-05, + "loss": 0.0002, + "step": 12542 + }, + { + "epoch": 0.15680392009800245, + "grad_norm": 0.019372403621673584, + "learning_rate": 1.980407841709424e-05, + "loss": 0.7585, + "step": 12544 + }, + { + "epoch": 0.15682892072301807, + "grad_norm": 1.6684951782226562, + "learning_rate": 1.980390647896236e-05, + "loss": 0.4488, + "step": 12546 + }, + { + "epoch": 0.1568539213480337, + "grad_norm": 2.8881173133850098, + "learning_rate": 1.9803734466165322e-05, + "loss": 0.6838, + "step": 12548 + }, + { + "epoch": 0.15687892197304934, + "grad_norm": 3.7486085891723633, + "learning_rate": 1.9803562378704423e-05, + "loss": 1.9373, + "step": 12550 + }, + { + "epoch": 0.15690392259806496, + "grad_norm": 4.206179618835449, + "learning_rate": 1.9803390216580978e-05, + "loss": 1.1364, + "step": 12552 + }, + { + "epoch": 0.15692892322308058, + "grad_norm": 2.4520905017852783, + "learning_rate": 1.9803217979796305e-05, + "loss": 1.0423, + "step": 12554 + }, + { + "epoch": 0.1569539238480962, + "grad_norm": 0.01809891313314438, + "learning_rate": 1.980304566835171e-05, + "loss": 0.2228, + "step": 12556 + }, + { + "epoch": 0.15697892447311182, + "grad_norm": 2.6906015872955322, + "learning_rate": 1.9802873282248497e-05, + "loss": 0.6883, + "step": 12558 + }, + { + "epoch": 0.15700392509812747, + "grad_norm": 1.554964542388916, + "learning_rate": 1.9802700821487995e-05, + "loss": 1.2076, + "step": 12560 + }, + { + "epoch": 0.1570289257231431, + "grad_norm": 4.204789161682129, + "learning_rate": 1.9802528286071506e-05, + "loss": 1.2298, + "step": 12562 + }, + { + "epoch": 0.1570539263481587, + "grad_norm": 4.074195861816406, + "learning_rate": 1.980235567600035e-05, + "loss": 0.9968, + "step": 12564 + }, + { + "epoch": 0.15707892697317433, + "grad_norm": 5.558722496032715, + "learning_rate": 1.980218299127584e-05, + "loss": 1.3037, + "step": 12566 + }, + { + "epoch": 0.15710392759818995, + "grad_norm": 1.4701586961746216, + "learning_rate": 1.980201023189929e-05, + "loss": 0.1638, + "step": 12568 + }, + { + "epoch": 0.1571289282232056, + "grad_norm": 4.39614725112915, + "learning_rate": 1.9801837397872014e-05, + "loss": 0.7534, + "step": 12570 + }, + { + "epoch": 0.1571539288482212, + "grad_norm": 0.14914944767951965, + "learning_rate": 1.9801664489195332e-05, + "loss": 0.507, + "step": 12572 + }, + { + "epoch": 0.15717892947323683, + "grad_norm": 3.0810415744781494, + "learning_rate": 1.980149150587056e-05, + "loss": 0.4547, + "step": 12574 + }, + { + "epoch": 0.15720393009825245, + "grad_norm": 2.9056384563446045, + "learning_rate": 1.9801318447899014e-05, + "loss": 1.1211, + "step": 12576 + }, + { + "epoch": 0.15722893072326807, + "grad_norm": 0.005024840589612722, + "learning_rate": 1.980114531528201e-05, + "loss": 0.3016, + "step": 12578 + }, + { + "epoch": 0.15725393134828372, + "grad_norm": 12.11992073059082, + "learning_rate": 1.9800972108020874e-05, + "loss": 0.5564, + "step": 12580 + }, + { + "epoch": 0.15727893197329934, + "grad_norm": 3.6365883350372314, + "learning_rate": 1.980079882611692e-05, + "loss": 1.3769, + "step": 12582 + }, + { + "epoch": 0.15730393259831496, + "grad_norm": 4.2379961013793945, + "learning_rate": 1.9800625469571467e-05, + "loss": 1.3205, + "step": 12584 + }, + { + "epoch": 0.15732893322333058, + "grad_norm": 4.006270885467529, + "learning_rate": 1.9800452038385836e-05, + "loss": 1.2035, + "step": 12586 + }, + { + "epoch": 0.1573539338483462, + "grad_norm": 7.334877014160156, + "learning_rate": 1.9800278532561344e-05, + "loss": 1.2287, + "step": 12588 + }, + { + "epoch": 0.15737893447336185, + "grad_norm": 9.217230796813965, + "learning_rate": 1.9800104952099325e-05, + "loss": 1.7881, + "step": 12590 + }, + { + "epoch": 0.15740393509837747, + "grad_norm": 3.9330389499664307, + "learning_rate": 1.9799931297001086e-05, + "loss": 1.4444, + "step": 12592 + }, + { + "epoch": 0.1574289357233931, + "grad_norm": 6.097550868988037, + "learning_rate": 1.979975756726796e-05, + "loss": 1.7013, + "step": 12594 + }, + { + "epoch": 0.1574539363484087, + "grad_norm": 2.8994765281677246, + "learning_rate": 1.9799583762901264e-05, + "loss": 1.0353, + "step": 12596 + }, + { + "epoch": 0.15747893697342433, + "grad_norm": 4.228240489959717, + "learning_rate": 1.9799409883902326e-05, + "loss": 1.2018, + "step": 12598 + }, + { + "epoch": 0.15750393759843997, + "grad_norm": 9.139363288879395, + "learning_rate": 1.9799235930272465e-05, + "loss": 1.3608, + "step": 12600 + }, + { + "epoch": 0.1575289382234556, + "grad_norm": 2.272477865219116, + "learning_rate": 1.9799061902013007e-05, + "loss": 0.5455, + "step": 12602 + }, + { + "epoch": 0.15755393884847121, + "grad_norm": 3.1005301475524902, + "learning_rate": 1.9798887799125284e-05, + "loss": 1.6246, + "step": 12604 + }, + { + "epoch": 0.15757893947348683, + "grad_norm": 0.4308038353919983, + "learning_rate": 1.979871362161062e-05, + "loss": 0.1019, + "step": 12606 + }, + { + "epoch": 0.15760394009850245, + "grad_norm": 1.6962977647781372, + "learning_rate": 1.979853936947033e-05, + "loss": 0.2744, + "step": 12608 + }, + { + "epoch": 0.1576289407235181, + "grad_norm": 5.870669364929199, + "learning_rate": 1.9798365042705756e-05, + "loss": 0.398, + "step": 12610 + }, + { + "epoch": 0.15765394134853372, + "grad_norm": 5.527795791625977, + "learning_rate": 1.9798190641318218e-05, + "loss": 2.5861, + "step": 12612 + }, + { + "epoch": 0.15767894197354934, + "grad_norm": 3.1588056087493896, + "learning_rate": 1.9798016165309043e-05, + "loss": 0.2094, + "step": 12614 + }, + { + "epoch": 0.15770394259856496, + "grad_norm": 1.0665377378463745, + "learning_rate": 1.9797841614679566e-05, + "loss": 0.6162, + "step": 12616 + }, + { + "epoch": 0.15772894322358058, + "grad_norm": 7.022693634033203, + "learning_rate": 1.979766698943111e-05, + "loss": 1.7588, + "step": 12618 + }, + { + "epoch": 0.15775394384859623, + "grad_norm": 1.7799720764160156, + "learning_rate": 1.979749228956501e-05, + "loss": 0.8615, + "step": 12620 + }, + { + "epoch": 0.15777894447361185, + "grad_norm": 0.0419018529355526, + "learning_rate": 1.9797317515082596e-05, + "loss": 0.5722, + "step": 12622 + }, + { + "epoch": 0.15780394509862747, + "grad_norm": 3.097667694091797, + "learning_rate": 1.9797142665985193e-05, + "loss": 1.3043, + "step": 12624 + }, + { + "epoch": 0.1578289457236431, + "grad_norm": 5.581547737121582, + "learning_rate": 1.9796967742274142e-05, + "loss": 0.3807, + "step": 12626 + }, + { + "epoch": 0.1578539463486587, + "grad_norm": 0.021651750430464745, + "learning_rate": 1.9796792743950767e-05, + "loss": 0.5575, + "step": 12628 + }, + { + "epoch": 0.15787894697367436, + "grad_norm": 4.843774318695068, + "learning_rate": 1.9796617671016403e-05, + "loss": 1.6156, + "step": 12630 + }, + { + "epoch": 0.15790394759868998, + "grad_norm": 0.026420610025525093, + "learning_rate": 1.9796442523472383e-05, + "loss": 0.375, + "step": 12632 + }, + { + "epoch": 0.1579289482237056, + "grad_norm": 4.4724931716918945, + "learning_rate": 1.9796267301320043e-05, + "loss": 0.463, + "step": 12634 + }, + { + "epoch": 0.15795394884872122, + "grad_norm": 3.005629777908325, + "learning_rate": 1.979609200456072e-05, + "loss": 1.6247, + "step": 12636 + }, + { + "epoch": 0.15797894947373683, + "grad_norm": 2.3976287841796875, + "learning_rate": 1.979591663319574e-05, + "loss": 0.29, + "step": 12638 + }, + { + "epoch": 0.15800395009875248, + "grad_norm": 2.822486400604248, + "learning_rate": 1.9795741187226448e-05, + "loss": 1.5921, + "step": 12640 + }, + { + "epoch": 0.1580289507237681, + "grad_norm": 3.7313036918640137, + "learning_rate": 1.9795565666654176e-05, + "loss": 1.0478, + "step": 12642 + }, + { + "epoch": 0.15805395134878372, + "grad_norm": 1.6423592567443848, + "learning_rate": 1.9795390071480263e-05, + "loss": 0.044, + "step": 12644 + }, + { + "epoch": 0.15807895197379934, + "grad_norm": 1.3395366668701172, + "learning_rate": 1.979521440170604e-05, + "loss": 0.7571, + "step": 12646 + }, + { + "epoch": 0.15810395259881496, + "grad_norm": 1.6187134981155396, + "learning_rate": 1.979503865733285e-05, + "loss": 0.215, + "step": 12648 + }, + { + "epoch": 0.1581289532238306, + "grad_norm": 2.5829248428344727, + "learning_rate": 1.9794862838362036e-05, + "loss": 1.591, + "step": 12650 + }, + { + "epoch": 0.15815395384884623, + "grad_norm": 2.887537717819214, + "learning_rate": 1.9794686944794926e-05, + "loss": 0.2064, + "step": 12652 + }, + { + "epoch": 0.15817895447386185, + "grad_norm": 4.159685134887695, + "learning_rate": 1.9794510976632867e-05, + "loss": 1.7059, + "step": 12654 + }, + { + "epoch": 0.15820395509887747, + "grad_norm": 4.077271938323975, + "learning_rate": 1.97943349338772e-05, + "loss": 1.3698, + "step": 12656 + }, + { + "epoch": 0.1582289557238931, + "grad_norm": 4.041355609893799, + "learning_rate": 1.9794158816529257e-05, + "loss": 0.7297, + "step": 12658 + }, + { + "epoch": 0.15825395634890874, + "grad_norm": 3.288261890411377, + "learning_rate": 1.9793982624590394e-05, + "loss": 1.1969, + "step": 12660 + }, + { + "epoch": 0.15827895697392436, + "grad_norm": 3.955625295639038, + "learning_rate": 1.9793806358061938e-05, + "loss": 1.0765, + "step": 12662 + }, + { + "epoch": 0.15830395759893998, + "grad_norm": 0.012434059754014015, + "learning_rate": 1.979363001694524e-05, + "loss": 0.0252, + "step": 12664 + }, + { + "epoch": 0.1583289582239556, + "grad_norm": 2.5936267375946045, + "learning_rate": 1.979345360124164e-05, + "loss": 0.6705, + "step": 12666 + }, + { + "epoch": 0.15835395884897122, + "grad_norm": 0.023039381951093674, + "learning_rate": 1.9793277110952483e-05, + "loss": 0.9403, + "step": 12668 + }, + { + "epoch": 0.15837895947398686, + "grad_norm": 5.648407936096191, + "learning_rate": 1.979310054607911e-05, + "loss": 1.1798, + "step": 12670 + }, + { + "epoch": 0.15840396009900248, + "grad_norm": 4.412627696990967, + "learning_rate": 1.979292390662287e-05, + "loss": 0.3955, + "step": 12672 + }, + { + "epoch": 0.1584289607240181, + "grad_norm": 3.3834428787231445, + "learning_rate": 1.9792747192585107e-05, + "loss": 0.8287, + "step": 12674 + }, + { + "epoch": 0.15845396134903372, + "grad_norm": 4.763339519500732, + "learning_rate": 1.9792570403967165e-05, + "loss": 1.4163, + "step": 12676 + }, + { + "epoch": 0.15847896197404934, + "grad_norm": 3.7119085788726807, + "learning_rate": 1.9792393540770392e-05, + "loss": 0.8244, + "step": 12678 + }, + { + "epoch": 0.158503962599065, + "grad_norm": 4.593087196350098, + "learning_rate": 1.979221660299614e-05, + "loss": 1.2936, + "step": 12680 + }, + { + "epoch": 0.1585289632240806, + "grad_norm": 3.6986701488494873, + "learning_rate": 1.9792039590645743e-05, + "loss": 0.4506, + "step": 12682 + }, + { + "epoch": 0.15855396384909623, + "grad_norm": 0.006175042130053043, + "learning_rate": 1.9791862503720562e-05, + "loss": 1.0044, + "step": 12684 + }, + { + "epoch": 0.15857896447411185, + "grad_norm": 0.05605548992753029, + "learning_rate": 1.9791685342221935e-05, + "loss": 0.0016, + "step": 12686 + }, + { + "epoch": 0.15860396509912747, + "grad_norm": 3.8754613399505615, + "learning_rate": 1.979150810615122e-05, + "loss": 0.3865, + "step": 12688 + }, + { + "epoch": 0.15862896572414312, + "grad_norm": 3.5339787006378174, + "learning_rate": 1.9791330795509766e-05, + "loss": 1.4359, + "step": 12690 + }, + { + "epoch": 0.15865396634915874, + "grad_norm": 1.4299286603927612, + "learning_rate": 1.9791153410298924e-05, + "loss": 1.1812, + "step": 12692 + }, + { + "epoch": 0.15867896697417436, + "grad_norm": 0.3233015537261963, + "learning_rate": 1.9790975950520033e-05, + "loss": 0.065, + "step": 12694 + }, + { + "epoch": 0.15870396759918998, + "grad_norm": 3.3843841552734375, + "learning_rate": 1.979079841617446e-05, + "loss": 2.1426, + "step": 12696 + }, + { + "epoch": 0.1587289682242056, + "grad_norm": 1.3287855386734009, + "learning_rate": 1.9790620807263552e-05, + "loss": 0.9486, + "step": 12698 + }, + { + "epoch": 0.15875396884922124, + "grad_norm": 3.5970821380615234, + "learning_rate": 1.9790443123788655e-05, + "loss": 1.8009, + "step": 12700 + }, + { + "epoch": 0.15877896947423686, + "grad_norm": 3.0837748050689697, + "learning_rate": 1.9790265365751134e-05, + "loss": 1.5671, + "step": 12702 + }, + { + "epoch": 0.15880397009925248, + "grad_norm": 2.320068120956421, + "learning_rate": 1.9790087533152332e-05, + "loss": 0.5322, + "step": 12704 + }, + { + "epoch": 0.1588289707242681, + "grad_norm": 3.440141439437866, + "learning_rate": 1.978990962599361e-05, + "loss": 0.7052, + "step": 12706 + }, + { + "epoch": 0.15885397134928372, + "grad_norm": 3.614640235900879, + "learning_rate": 1.978973164427632e-05, + "loss": 1.8379, + "step": 12708 + }, + { + "epoch": 0.15887897197429937, + "grad_norm": 3.107881784439087, + "learning_rate": 1.9789553588001818e-05, + "loss": 1.3926, + "step": 12710 + }, + { + "epoch": 0.158903972599315, + "grad_norm": 3.6581408977508545, + "learning_rate": 1.978937545717146e-05, + "loss": 1.7182, + "step": 12712 + }, + { + "epoch": 0.1589289732243306, + "grad_norm": 0.37895604968070984, + "learning_rate": 1.9789197251786604e-05, + "loss": 0.822, + "step": 12714 + }, + { + "epoch": 0.15895397384934623, + "grad_norm": 3.2833681106567383, + "learning_rate": 1.9789018971848604e-05, + "loss": 1.3512, + "step": 12716 + }, + { + "epoch": 0.15897897447436185, + "grad_norm": 2.226379156112671, + "learning_rate": 1.978884061735882e-05, + "loss": 0.4822, + "step": 12718 + }, + { + "epoch": 0.1590039750993775, + "grad_norm": 7.961231708526611, + "learning_rate": 1.9788662188318614e-05, + "loss": 0.8647, + "step": 12720 + }, + { + "epoch": 0.15902897572439312, + "grad_norm": 3.0506744384765625, + "learning_rate": 1.9788483684729338e-05, + "loss": 1.546, + "step": 12722 + }, + { + "epoch": 0.15905397634940874, + "grad_norm": 1.8331266641616821, + "learning_rate": 1.9788305106592356e-05, + "loss": 0.0518, + "step": 12724 + }, + { + "epoch": 0.15907897697442436, + "grad_norm": 4.907299518585205, + "learning_rate": 1.9788126453909024e-05, + "loss": 1.604, + "step": 12726 + }, + { + "epoch": 0.15910397759943998, + "grad_norm": 3.224116325378418, + "learning_rate": 1.9787947726680708e-05, + "loss": 1.0198, + "step": 12728 + }, + { + "epoch": 0.15912897822445563, + "grad_norm": 0.6854885816574097, + "learning_rate": 1.9787768924908763e-05, + "loss": 0.0294, + "step": 12730 + }, + { + "epoch": 0.15915397884947124, + "grad_norm": 2.697793483734131, + "learning_rate": 1.9787590048594557e-05, + "loss": 0.7801, + "step": 12732 + }, + { + "epoch": 0.15917897947448686, + "grad_norm": 3.249218463897705, + "learning_rate": 1.9787411097739447e-05, + "loss": 1.0433, + "step": 12734 + }, + { + "epoch": 0.15920398009950248, + "grad_norm": 5.03335428237915, + "learning_rate": 1.97872320723448e-05, + "loss": 0.8972, + "step": 12736 + }, + { + "epoch": 0.1592289807245181, + "grad_norm": 0.0038543366827070713, + "learning_rate": 1.9787052972411978e-05, + "loss": 0.0075, + "step": 12738 + }, + { + "epoch": 0.15925398134953375, + "grad_norm": 2.504342794418335, + "learning_rate": 1.9786873797942344e-05, + "loss": 1.7558, + "step": 12740 + }, + { + "epoch": 0.15927898197454937, + "grad_norm": 0.18914173543453217, + "learning_rate": 1.978669454893726e-05, + "loss": 0.1472, + "step": 12742 + }, + { + "epoch": 0.159303982599565, + "grad_norm": 0.5384919047355652, + "learning_rate": 1.9786515225398095e-05, + "loss": 1.2997, + "step": 12744 + }, + { + "epoch": 0.1593289832245806, + "grad_norm": 0.0035114458296447992, + "learning_rate": 1.9786335827326217e-05, + "loss": 0.6804, + "step": 12746 + }, + { + "epoch": 0.15935398384959623, + "grad_norm": 1.3436434268951416, + "learning_rate": 1.978615635472299e-05, + "loss": 0.1427, + "step": 12748 + }, + { + "epoch": 0.15937898447461188, + "grad_norm": 2.746328592300415, + "learning_rate": 1.9785976807589775e-05, + "loss": 1.3493, + "step": 12750 + }, + { + "epoch": 0.1594039850996275, + "grad_norm": 3.4079198837280273, + "learning_rate": 1.9785797185927948e-05, + "loss": 1.085, + "step": 12752 + }, + { + "epoch": 0.15942898572464312, + "grad_norm": 2.230900287628174, + "learning_rate": 1.9785617489738874e-05, + "loss": 1.0129, + "step": 12754 + }, + { + "epoch": 0.15945398634965874, + "grad_norm": 6.489268779754639, + "learning_rate": 1.9785437719023917e-05, + "loss": 1.4497, + "step": 12756 + }, + { + "epoch": 0.15947898697467436, + "grad_norm": 3.312695264816284, + "learning_rate": 1.9785257873784454e-05, + "loss": 1.1122, + "step": 12758 + }, + { + "epoch": 0.15950398759969, + "grad_norm": 4.069403648376465, + "learning_rate": 1.9785077954021853e-05, + "loss": 0.4712, + "step": 12760 + }, + { + "epoch": 0.15952898822470563, + "grad_norm": 2.458832025527954, + "learning_rate": 1.9784897959737477e-05, + "loss": 1.2791, + "step": 12762 + }, + { + "epoch": 0.15955398884972125, + "grad_norm": 2.913846015930176, + "learning_rate": 1.9784717890932704e-05, + "loss": 0.56, + "step": 12764 + }, + { + "epoch": 0.15957898947473687, + "grad_norm": 9.33730411529541, + "learning_rate": 1.9784537747608902e-05, + "loss": 1.54, + "step": 12766 + }, + { + "epoch": 0.15960399009975249, + "grad_norm": 1.4278128147125244, + "learning_rate": 1.9784357529767445e-05, + "loss": 0.8931, + "step": 12768 + }, + { + "epoch": 0.15962899072476813, + "grad_norm": 5.210375785827637, + "learning_rate": 1.9784177237409704e-05, + "loss": 1.8793, + "step": 12770 + }, + { + "epoch": 0.15965399134978375, + "grad_norm": 2.656991720199585, + "learning_rate": 1.9783996870537056e-05, + "loss": 1.777, + "step": 12772 + }, + { + "epoch": 0.15967899197479937, + "grad_norm": 4.121217727661133, + "learning_rate": 1.9783816429150868e-05, + "loss": 0.8027, + "step": 12774 + }, + { + "epoch": 0.159703992599815, + "grad_norm": 2.9602792263031006, + "learning_rate": 1.978363591325252e-05, + "loss": 1.6673, + "step": 12776 + }, + { + "epoch": 0.1597289932248306, + "grad_norm": 0.8789263367652893, + "learning_rate": 1.978345532284338e-05, + "loss": 0.8062, + "step": 12778 + }, + { + "epoch": 0.15975399384984626, + "grad_norm": 2.358607053756714, + "learning_rate": 1.9783274657924832e-05, + "loss": 1.3227, + "step": 12780 + }, + { + "epoch": 0.15977899447486188, + "grad_norm": 6.066888332366943, + "learning_rate": 1.9783093918498247e-05, + "loss": 0.9642, + "step": 12782 + }, + { + "epoch": 0.1598039950998775, + "grad_norm": 1.100109338760376, + "learning_rate": 1.9782913104565003e-05, + "loss": 1.3617, + "step": 12784 + }, + { + "epoch": 0.15982899572489312, + "grad_norm": 3.369875907897949, + "learning_rate": 1.978273221612647e-05, + "loss": 0.728, + "step": 12786 + }, + { + "epoch": 0.15985399634990874, + "grad_norm": 4.011329650878906, + "learning_rate": 1.9782551253184042e-05, + "loss": 0.9976, + "step": 12788 + }, + { + "epoch": 0.1598789969749244, + "grad_norm": 4.356934070587158, + "learning_rate": 1.978237021573908e-05, + "loss": 0.14, + "step": 12790 + }, + { + "epoch": 0.15990399759994, + "grad_norm": 7.648773670196533, + "learning_rate": 1.978218910379297e-05, + "loss": 0.9325, + "step": 12792 + }, + { + "epoch": 0.15992899822495563, + "grad_norm": 3.0684256553649902, + "learning_rate": 1.9782007917347094e-05, + "loss": 0.7482, + "step": 12794 + }, + { + "epoch": 0.15995399884997125, + "grad_norm": 0.011595254763960838, + "learning_rate": 1.9781826656402828e-05, + "loss": 0.9796, + "step": 12796 + }, + { + "epoch": 0.15997899947498687, + "grad_norm": 0.006683704908937216, + "learning_rate": 1.9781645320961553e-05, + "loss": 0.7582, + "step": 12798 + }, + { + "epoch": 0.1600040001000025, + "grad_norm": 1.976731300354004, + "learning_rate": 1.978146391102465e-05, + "loss": 1.1906, + "step": 12800 + }, + { + "epoch": 0.16002900072501813, + "grad_norm": 3.9115335941314697, + "learning_rate": 1.97812824265935e-05, + "loss": 1.5611, + "step": 12802 + }, + { + "epoch": 0.16005400135003375, + "grad_norm": 4.4082417488098145, + "learning_rate": 1.9781100867669488e-05, + "loss": 1.132, + "step": 12804 + }, + { + "epoch": 0.16007900197504937, + "grad_norm": 0.0051499889232218266, + "learning_rate": 1.9780919234253996e-05, + "loss": 0.415, + "step": 12806 + }, + { + "epoch": 0.160104002600065, + "grad_norm": 6.360398292541504, + "learning_rate": 1.9780737526348404e-05, + "loss": 0.8216, + "step": 12808 + }, + { + "epoch": 0.16012900322508064, + "grad_norm": 13.590343475341797, + "learning_rate": 1.97805557439541e-05, + "loss": 0.9717, + "step": 12810 + }, + { + "epoch": 0.16015400385009626, + "grad_norm": 4.2297682762146, + "learning_rate": 1.9780373887072466e-05, + "loss": 1.7406, + "step": 12812 + }, + { + "epoch": 0.16017900447511188, + "grad_norm": 6.619110584259033, + "learning_rate": 1.978019195570489e-05, + "loss": 2.1299, + "step": 12814 + }, + { + "epoch": 0.1602040051001275, + "grad_norm": 3.157386541366577, + "learning_rate": 1.978000994985275e-05, + "loss": 2.1429, + "step": 12816 + }, + { + "epoch": 0.16022900572514312, + "grad_norm": 3.4061126708984375, + "learning_rate": 1.977982786951744e-05, + "loss": 1.3918, + "step": 12818 + }, + { + "epoch": 0.16025400635015877, + "grad_norm": 4.063413143157959, + "learning_rate": 1.9779645714700345e-05, + "loss": 0.4587, + "step": 12820 + }, + { + "epoch": 0.1602790069751744, + "grad_norm": 0.32007718086242676, + "learning_rate": 1.977946348540285e-05, + "loss": 0.963, + "step": 12822 + }, + { + "epoch": 0.16030400760019, + "grad_norm": 0.8284406065940857, + "learning_rate": 1.9779281181626344e-05, + "loss": 0.792, + "step": 12824 + }, + { + "epoch": 0.16032900822520563, + "grad_norm": 2.4053804874420166, + "learning_rate": 1.9779098803372213e-05, + "loss": 1.0954, + "step": 12826 + }, + { + "epoch": 0.16035400885022125, + "grad_norm": 2.8212087154388428, + "learning_rate": 1.9778916350641853e-05, + "loss": 0.5936, + "step": 12828 + }, + { + "epoch": 0.1603790094752369, + "grad_norm": 2.792898178100586, + "learning_rate": 1.9778733823436647e-05, + "loss": 1.2834, + "step": 12830 + }, + { + "epoch": 0.16040401010025251, + "grad_norm": 3.2242581844329834, + "learning_rate": 1.9778551221757986e-05, + "loss": 0.771, + "step": 12832 + }, + { + "epoch": 0.16042901072526813, + "grad_norm": 2.388458490371704, + "learning_rate": 1.9778368545607263e-05, + "loss": 0.7255, + "step": 12834 + }, + { + "epoch": 0.16045401135028375, + "grad_norm": 3.44648814201355, + "learning_rate": 1.9778185794985868e-05, + "loss": 1.4786, + "step": 12836 + }, + { + "epoch": 0.16047901197529937, + "grad_norm": 0.4696025848388672, + "learning_rate": 1.9778002969895193e-05, + "loss": 0.4981, + "step": 12838 + }, + { + "epoch": 0.16050401260031502, + "grad_norm": 2.836956024169922, + "learning_rate": 1.9777820070336627e-05, + "loss": 0.9456, + "step": 12840 + }, + { + "epoch": 0.16052901322533064, + "grad_norm": 2.8242671489715576, + "learning_rate": 1.977763709631157e-05, + "loss": 0.8903, + "step": 12842 + }, + { + "epoch": 0.16055401385034626, + "grad_norm": 1.0226373672485352, + "learning_rate": 1.977745404782141e-05, + "loss": 0.9074, + "step": 12844 + }, + { + "epoch": 0.16057901447536188, + "grad_norm": 4.220085144042969, + "learning_rate": 1.977727092486754e-05, + "loss": 2.3157, + "step": 12846 + }, + { + "epoch": 0.1606040151003775, + "grad_norm": 3.2205841541290283, + "learning_rate": 1.977708772745136e-05, + "loss": 0.9959, + "step": 12848 + }, + { + "epoch": 0.16062901572539315, + "grad_norm": 5.466496467590332, + "learning_rate": 1.9776904455574263e-05, + "loss": 1.0452, + "step": 12850 + }, + { + "epoch": 0.16065401635040877, + "grad_norm": 3.9736297130584717, + "learning_rate": 1.9776721109237642e-05, + "loss": 0.8573, + "step": 12852 + }, + { + "epoch": 0.1606790169754244, + "grad_norm": 3.969282865524292, + "learning_rate": 1.9776537688442898e-05, + "loss": 1.5715, + "step": 12854 + }, + { + "epoch": 0.16070401760044, + "grad_norm": 1.240299105644226, + "learning_rate": 1.9776354193191424e-05, + "loss": 0.0706, + "step": 12856 + }, + { + "epoch": 0.16072901822545563, + "grad_norm": 1.337903618812561, + "learning_rate": 1.9776170623484622e-05, + "loss": 0.9126, + "step": 12858 + }, + { + "epoch": 0.16075401885047128, + "grad_norm": 2.8966734409332275, + "learning_rate": 1.977598697932388e-05, + "loss": 1.699, + "step": 12860 + }, + { + "epoch": 0.1607790194754869, + "grad_norm": 3.3606600761413574, + "learning_rate": 1.977580326071061e-05, + "loss": 1.3626, + "step": 12862 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 5.0185160636901855, + "learning_rate": 1.9775619467646202e-05, + "loss": 1.817, + "step": 12864 + }, + { + "epoch": 0.16082902072551813, + "grad_norm": 2.9019970893859863, + "learning_rate": 1.9775435600132062e-05, + "loss": 0.1681, + "step": 12866 + }, + { + "epoch": 0.16085402135053375, + "grad_norm": 4.5165863037109375, + "learning_rate": 1.9775251658169584e-05, + "loss": 1.2296, + "step": 12868 + }, + { + "epoch": 0.1608790219755494, + "grad_norm": 2.4143476486206055, + "learning_rate": 1.977506764176017e-05, + "loss": 1.0467, + "step": 12870 + }, + { + "epoch": 0.16090402260056502, + "grad_norm": 4.2617573738098145, + "learning_rate": 1.9774883550905228e-05, + "loss": 1.1942, + "step": 12872 + }, + { + "epoch": 0.16092902322558064, + "grad_norm": 4.095648288726807, + "learning_rate": 1.977469938560615e-05, + "loss": 1.4096, + "step": 12874 + }, + { + "epoch": 0.16095402385059626, + "grad_norm": 6.798590660095215, + "learning_rate": 1.977451514586435e-05, + "loss": 1.2122, + "step": 12876 + }, + { + "epoch": 0.16097902447561188, + "grad_norm": 2.5034239292144775, + "learning_rate": 1.9774330831681217e-05, + "loss": 0.7891, + "step": 12878 + }, + { + "epoch": 0.16100402510062753, + "grad_norm": 2.309614658355713, + "learning_rate": 1.9774146443058168e-05, + "loss": 0.3653, + "step": 12880 + }, + { + "epoch": 0.16102902572564315, + "grad_norm": 3.2682063579559326, + "learning_rate": 1.9773961979996602e-05, + "loss": 0.695, + "step": 12882 + }, + { + "epoch": 0.16105402635065877, + "grad_norm": 1.7202885150909424, + "learning_rate": 1.977377744249792e-05, + "loss": 1.0344, + "step": 12884 + }, + { + "epoch": 0.1610790269756744, + "grad_norm": 4.964806079864502, + "learning_rate": 1.977359283056353e-05, + "loss": 1.0766, + "step": 12886 + }, + { + "epoch": 0.16110402760069, + "grad_norm": 7.753147125244141, + "learning_rate": 1.9773408144194845e-05, + "loss": 2.0617, + "step": 12888 + }, + { + "epoch": 0.16112902822570566, + "grad_norm": 3.055689573287964, + "learning_rate": 1.9773223383393263e-05, + "loss": 1.4715, + "step": 12890 + }, + { + "epoch": 0.16115402885072128, + "grad_norm": 0.4120856821537018, + "learning_rate": 1.9773038548160193e-05, + "loss": 1.2376, + "step": 12892 + }, + { + "epoch": 0.1611790294757369, + "grad_norm": 4.000949859619141, + "learning_rate": 1.9772853638497044e-05, + "loss": 0.5946, + "step": 12894 + }, + { + "epoch": 0.16120403010075252, + "grad_norm": 3.321989059448242, + "learning_rate": 1.9772668654405228e-05, + "loss": 1.3206, + "step": 12896 + }, + { + "epoch": 0.16122903072576814, + "grad_norm": 2.5117900371551514, + "learning_rate": 1.977248359588614e-05, + "loss": 1.2325, + "step": 12898 + }, + { + "epoch": 0.16125403135078378, + "grad_norm": 0.1308281421661377, + "learning_rate": 1.9772298462941207e-05, + "loss": 0.0598, + "step": 12900 + }, + { + "epoch": 0.1612790319757994, + "grad_norm": 4.238417148590088, + "learning_rate": 1.9772113255571828e-05, + "loss": 1.5809, + "step": 12902 + }, + { + "epoch": 0.16130403260081502, + "grad_norm": 2.3461763858795166, + "learning_rate": 1.9771927973779417e-05, + "loss": 1.222, + "step": 12904 + }, + { + "epoch": 0.16132903322583064, + "grad_norm": 0.0036447911988943815, + "learning_rate": 1.9771742617565383e-05, + "loss": 0.1214, + "step": 12906 + }, + { + "epoch": 0.16135403385084626, + "grad_norm": 2.40136456489563, + "learning_rate": 1.977155718693114e-05, + "loss": 1.4988, + "step": 12908 + }, + { + "epoch": 0.1613790344758619, + "grad_norm": 0.8275664448738098, + "learning_rate": 1.97713716818781e-05, + "loss": 0.024, + "step": 12910 + }, + { + "epoch": 0.16140403510087753, + "grad_norm": 3.9412546157836914, + "learning_rate": 1.9771186102407672e-05, + "loss": 0.3755, + "step": 12912 + }, + { + "epoch": 0.16142903572589315, + "grad_norm": 4.421278476715088, + "learning_rate": 1.9771000448521277e-05, + "loss": 1.5777, + "step": 12914 + }, + { + "epoch": 0.16145403635090877, + "grad_norm": 3.737786054611206, + "learning_rate": 1.9770814720220323e-05, + "loss": 0.9496, + "step": 12916 + }, + { + "epoch": 0.1614790369759244, + "grad_norm": 2.5996298789978027, + "learning_rate": 1.9770628917506224e-05, + "loss": 1.732, + "step": 12918 + }, + { + "epoch": 0.16150403760094004, + "grad_norm": 1.3306223154067993, + "learning_rate": 1.97704430403804e-05, + "loss": 0.4585, + "step": 12920 + }, + { + "epoch": 0.16152903822595566, + "grad_norm": 4.638648986816406, + "learning_rate": 1.977025708884426e-05, + "loss": 1.7763, + "step": 12922 + }, + { + "epoch": 0.16155403885097128, + "grad_norm": 0.7940484285354614, + "learning_rate": 1.9770071062899226e-05, + "loss": 1.1166, + "step": 12924 + }, + { + "epoch": 0.1615790394759869, + "grad_norm": 3.5176429748535156, + "learning_rate": 1.976988496254671e-05, + "loss": 0.9227, + "step": 12926 + }, + { + "epoch": 0.16160404010100252, + "grad_norm": 1.6016353368759155, + "learning_rate": 1.9769698787788135e-05, + "loss": 0.0803, + "step": 12928 + }, + { + "epoch": 0.16162904072601816, + "grad_norm": 0.6198147535324097, + "learning_rate": 1.9769512538624914e-05, + "loss": 0.0177, + "step": 12930 + }, + { + "epoch": 0.16165404135103378, + "grad_norm": 5.826296806335449, + "learning_rate": 1.9769326215058465e-05, + "loss": 1.6349, + "step": 12932 + }, + { + "epoch": 0.1616790419760494, + "grad_norm": 2.5197694301605225, + "learning_rate": 1.976913981709021e-05, + "loss": 0.3395, + "step": 12934 + }, + { + "epoch": 0.16170404260106502, + "grad_norm": 3.6328413486480713, + "learning_rate": 1.9768953344721567e-05, + "loss": 1.027, + "step": 12936 + }, + { + "epoch": 0.16172904322608064, + "grad_norm": 4.4370646476745605, + "learning_rate": 1.976876679795396e-05, + "loss": 1.5191, + "step": 12938 + }, + { + "epoch": 0.1617540438510963, + "grad_norm": 2.131683349609375, + "learning_rate": 1.9768580176788805e-05, + "loss": 1.1404, + "step": 12940 + }, + { + "epoch": 0.1617790444761119, + "grad_norm": 2.4698946475982666, + "learning_rate": 1.9768393481227525e-05, + "loss": 1.6608, + "step": 12942 + }, + { + "epoch": 0.16180404510112753, + "grad_norm": 4.986601829528809, + "learning_rate": 1.976820671127154e-05, + "loss": 1.671, + "step": 12944 + }, + { + "epoch": 0.16182904572614315, + "grad_norm": 4.134639739990234, + "learning_rate": 1.9768019866922277e-05, + "loss": 1.0474, + "step": 12946 + }, + { + "epoch": 0.16185404635115877, + "grad_norm": 2.2011828422546387, + "learning_rate": 1.976783294818115e-05, + "loss": 0.2928, + "step": 12948 + }, + { + "epoch": 0.16187904697617442, + "grad_norm": 4.031190395355225, + "learning_rate": 1.9767645955049594e-05, + "loss": 0.9469, + "step": 12950 + }, + { + "epoch": 0.16190404760119004, + "grad_norm": 0.5631057024002075, + "learning_rate": 1.976745888752903e-05, + "loss": 0.4085, + "step": 12952 + }, + { + "epoch": 0.16192904822620566, + "grad_norm": 3.3394854068756104, + "learning_rate": 1.9767271745620874e-05, + "loss": 0.5483, + "step": 12954 + }, + { + "epoch": 0.16195404885122128, + "grad_norm": 6.936605453491211, + "learning_rate": 1.9767084529326558e-05, + "loss": 1.1312, + "step": 12956 + }, + { + "epoch": 0.1619790494762369, + "grad_norm": 3.091722011566162, + "learning_rate": 1.9766897238647512e-05, + "loss": 0.5935, + "step": 12958 + }, + { + "epoch": 0.16200405010125254, + "grad_norm": 2.9946491718292236, + "learning_rate": 1.9766709873585156e-05, + "loss": 0.3395, + "step": 12960 + }, + { + "epoch": 0.16202905072626816, + "grad_norm": 15.80666732788086, + "learning_rate": 1.9766522434140918e-05, + "loss": 0.4925, + "step": 12962 + }, + { + "epoch": 0.16205405135128378, + "grad_norm": 3.382067918777466, + "learning_rate": 1.976633492031623e-05, + "loss": 0.783, + "step": 12964 + }, + { + "epoch": 0.1620790519762994, + "grad_norm": 3.107377052307129, + "learning_rate": 1.976614733211251e-05, + "loss": 0.6345, + "step": 12966 + }, + { + "epoch": 0.16210405260131502, + "grad_norm": 3.299093246459961, + "learning_rate": 1.9765959669531198e-05, + "loss": 0.5949, + "step": 12968 + }, + { + "epoch": 0.16212905322633067, + "grad_norm": 1.997373104095459, + "learning_rate": 1.9765771932573714e-05, + "loss": 1.4508, + "step": 12970 + }, + { + "epoch": 0.1621540538513463, + "grad_norm": 6.125792980194092, + "learning_rate": 1.9765584121241493e-05, + "loss": 0.1734, + "step": 12972 + }, + { + "epoch": 0.1621790544763619, + "grad_norm": 5.260492324829102, + "learning_rate": 1.9765396235535964e-05, + "loss": 0.5094, + "step": 12974 + }, + { + "epoch": 0.16220405510137753, + "grad_norm": 3.076002836227417, + "learning_rate": 1.976520827545856e-05, + "loss": 1.3919, + "step": 12976 + }, + { + "epoch": 0.16222905572639315, + "grad_norm": 5.579464435577393, + "learning_rate": 1.9765020241010712e-05, + "loss": 2.0519, + "step": 12978 + }, + { + "epoch": 0.1622540563514088, + "grad_norm": 2.821838617324829, + "learning_rate": 1.976483213219385e-05, + "loss": 0.7535, + "step": 12980 + }, + { + "epoch": 0.16227905697642442, + "grad_norm": 2.3653602600097656, + "learning_rate": 1.9764643949009407e-05, + "loss": 1.1205, + "step": 12982 + }, + { + "epoch": 0.16230405760144004, + "grad_norm": 9.381975173950195, + "learning_rate": 1.9764455691458813e-05, + "loss": 1.8673, + "step": 12984 + }, + { + "epoch": 0.16232905822645566, + "grad_norm": 2.5858371257781982, + "learning_rate": 1.976426735954351e-05, + "loss": 1.1391, + "step": 12986 + }, + { + "epoch": 0.16235405885147128, + "grad_norm": 2.540205955505371, + "learning_rate": 1.9764078953264922e-05, + "loss": 1.1381, + "step": 12988 + }, + { + "epoch": 0.16237905947648693, + "grad_norm": 2.670672655105591, + "learning_rate": 1.9763890472624494e-05, + "loss": 0.368, + "step": 12990 + }, + { + "epoch": 0.16240406010150255, + "grad_norm": 2.980656147003174, + "learning_rate": 1.9763701917623658e-05, + "loss": 0.4914, + "step": 12992 + }, + { + "epoch": 0.16242906072651817, + "grad_norm": 0.07960361242294312, + "learning_rate": 1.9763513288263845e-05, + "loss": 0.6272, + "step": 12994 + }, + { + "epoch": 0.16245406135153379, + "grad_norm": 2.451688289642334, + "learning_rate": 1.9763324584546497e-05, + "loss": 0.336, + "step": 12996 + }, + { + "epoch": 0.1624790619765494, + "grad_norm": 0.7345331907272339, + "learning_rate": 1.9763135806473052e-05, + "loss": 0.5296, + "step": 12998 + }, + { + "epoch": 0.16250406260156505, + "grad_norm": 4.697054386138916, + "learning_rate": 1.9762946954044944e-05, + "loss": 0.3555, + "step": 13000 + }, + { + "epoch": 0.16252906322658067, + "grad_norm": 3.2779698371887207, + "learning_rate": 1.976275802726361e-05, + "loss": 1.861, + "step": 13002 + }, + { + "epoch": 0.1625540638515963, + "grad_norm": 1.8486205339431763, + "learning_rate": 1.9762569026130497e-05, + "loss": 0.3839, + "step": 13004 + }, + { + "epoch": 0.1625790644766119, + "grad_norm": 3.2567014694213867, + "learning_rate": 1.9762379950647033e-05, + "loss": 1.0601, + "step": 13006 + }, + { + "epoch": 0.16260406510162753, + "grad_norm": 4.917303085327148, + "learning_rate": 1.9762190800814664e-05, + "loss": 1.1574, + "step": 13008 + }, + { + "epoch": 0.16262906572664318, + "grad_norm": 0.8227323293685913, + "learning_rate": 1.9762001576634834e-05, + "loss": 1.044, + "step": 13010 + }, + { + "epoch": 0.1626540663516588, + "grad_norm": 10.726448059082031, + "learning_rate": 1.9761812278108978e-05, + "loss": 0.5677, + "step": 13012 + }, + { + "epoch": 0.16267906697667442, + "grad_norm": 0.004661891143769026, + "learning_rate": 1.976162290523854e-05, + "loss": 0.6573, + "step": 13014 + }, + { + "epoch": 0.16270406760169004, + "grad_norm": 1.3834391832351685, + "learning_rate": 1.9761433458024966e-05, + "loss": 0.7807, + "step": 13016 + }, + { + "epoch": 0.16272906822670566, + "grad_norm": 0.1866205483675003, + "learning_rate": 1.9761243936469693e-05, + "loss": 0.4267, + "step": 13018 + }, + { + "epoch": 0.1627540688517213, + "grad_norm": 0.0014821174554526806, + "learning_rate": 1.9761054340574164e-05, + "loss": 0.0001, + "step": 13020 + }, + { + "epoch": 0.16277906947673693, + "grad_norm": 3.925262451171875, + "learning_rate": 1.976086467033983e-05, + "loss": 1.2873, + "step": 13022 + }, + { + "epoch": 0.16280407010175255, + "grad_norm": 0.023864656686782837, + "learning_rate": 1.976067492576813e-05, + "loss": 0.8386, + "step": 13024 + }, + { + "epoch": 0.16282907072676817, + "grad_norm": 0.0019192721229046583, + "learning_rate": 1.9760485106860505e-05, + "loss": 0.1158, + "step": 13026 + }, + { + "epoch": 0.16285407135178379, + "grad_norm": 2.7393126487731934, + "learning_rate": 1.976029521361841e-05, + "loss": 0.819, + "step": 13028 + }, + { + "epoch": 0.16287907197679943, + "grad_norm": 4.065953731536865, + "learning_rate": 1.9760105246043286e-05, + "loss": 0.6322, + "step": 13030 + }, + { + "epoch": 0.16290407260181505, + "grad_norm": 2.577838659286499, + "learning_rate": 1.975991520413658e-05, + "loss": 0.5922, + "step": 13032 + }, + { + "epoch": 0.16292907322683067, + "grad_norm": 4.056440353393555, + "learning_rate": 1.9759725087899745e-05, + "loss": 1.9158, + "step": 13034 + }, + { + "epoch": 0.1629540738518463, + "grad_norm": 2.7458252906799316, + "learning_rate": 1.9759534897334215e-05, + "loss": 0.6865, + "step": 13036 + }, + { + "epoch": 0.1629790744768619, + "grad_norm": 0.0013661925913766026, + "learning_rate": 1.9759344632441456e-05, + "loss": 0.5512, + "step": 13038 + }, + { + "epoch": 0.16300407510187756, + "grad_norm": 2.321118116378784, + "learning_rate": 1.9759154293222904e-05, + "loss": 0.6408, + "step": 13040 + }, + { + "epoch": 0.16302907572689318, + "grad_norm": 4.529439926147461, + "learning_rate": 1.975896387968001e-05, + "loss": 1.9511, + "step": 13042 + }, + { + "epoch": 0.1630540763519088, + "grad_norm": 1.5472328662872314, + "learning_rate": 1.9758773391814233e-05, + "loss": 0.452, + "step": 13044 + }, + { + "epoch": 0.16307907697692442, + "grad_norm": 4.047083377838135, + "learning_rate": 1.9758582829627015e-05, + "loss": 1.9008, + "step": 13046 + }, + { + "epoch": 0.16310407760194004, + "grad_norm": 2.96163272857666, + "learning_rate": 1.975839219311981e-05, + "loss": 0.8919, + "step": 13048 + }, + { + "epoch": 0.1631290782269557, + "grad_norm": 2.755837917327881, + "learning_rate": 1.975820148229407e-05, + "loss": 1.522, + "step": 13050 + }, + { + "epoch": 0.1631540788519713, + "grad_norm": 3.170506477355957, + "learning_rate": 1.975801069715125e-05, + "loss": 1.0719, + "step": 13052 + }, + { + "epoch": 0.16317907947698693, + "grad_norm": 1.764435052871704, + "learning_rate": 1.9757819837692797e-05, + "loss": 0.985, + "step": 13054 + }, + { + "epoch": 0.16320408010200255, + "grad_norm": 6.566181659698486, + "learning_rate": 1.9757628903920173e-05, + "loss": 2.2329, + "step": 13056 + }, + { + "epoch": 0.16322908072701817, + "grad_norm": 2.9903573989868164, + "learning_rate": 1.975743789583482e-05, + "loss": 1.0691, + "step": 13058 + }, + { + "epoch": 0.16325408135203381, + "grad_norm": 5.2855353355407715, + "learning_rate": 1.9757246813438205e-05, + "loss": 1.7753, + "step": 13060 + }, + { + "epoch": 0.16327908197704943, + "grad_norm": 3.1906750202178955, + "learning_rate": 1.9757055656731777e-05, + "loss": 1.5723, + "step": 13062 + }, + { + "epoch": 0.16330408260206505, + "grad_norm": 0.9265944957733154, + "learning_rate": 1.975686442571699e-05, + "loss": 1.3142, + "step": 13064 + }, + { + "epoch": 0.16332908322708067, + "grad_norm": 3.6976940631866455, + "learning_rate": 1.9756673120395308e-05, + "loss": 0.5797, + "step": 13066 + }, + { + "epoch": 0.1633540838520963, + "grad_norm": 3.374093770980835, + "learning_rate": 1.975648174076818e-05, + "loss": 0.766, + "step": 13068 + }, + { + "epoch": 0.16337908447711194, + "grad_norm": 3.9817686080932617, + "learning_rate": 1.9756290286837067e-05, + "loss": 1.1455, + "step": 13070 + }, + { + "epoch": 0.16340408510212756, + "grad_norm": 5.476555824279785, + "learning_rate": 1.975609875860343e-05, + "loss": 1.1699, + "step": 13072 + }, + { + "epoch": 0.16342908572714318, + "grad_norm": 7.631515026092529, + "learning_rate": 1.975590715606872e-05, + "loss": 1.5964, + "step": 13074 + }, + { + "epoch": 0.1634540863521588, + "grad_norm": 0.8311274647712708, + "learning_rate": 1.97557154792344e-05, + "loss": 0.4988, + "step": 13076 + }, + { + "epoch": 0.16347908697717442, + "grad_norm": 0.0019897327292710543, + "learning_rate": 1.9755523728101936e-05, + "loss": 0.8981, + "step": 13078 + }, + { + "epoch": 0.16350408760219007, + "grad_norm": 2.6983399391174316, + "learning_rate": 1.975533190267278e-05, + "loss": 0.8554, + "step": 13080 + }, + { + "epoch": 0.1635290882272057, + "grad_norm": 5.588770389556885, + "learning_rate": 1.9755140002948396e-05, + "loss": 1.9474, + "step": 13082 + }, + { + "epoch": 0.1635540888522213, + "grad_norm": 2.636693239212036, + "learning_rate": 1.9754948028930245e-05, + "loss": 0.7755, + "step": 13084 + }, + { + "epoch": 0.16357908947723693, + "grad_norm": 2.897196054458618, + "learning_rate": 1.975475598061979e-05, + "loss": 1.6355, + "step": 13086 + }, + { + "epoch": 0.16360409010225255, + "grad_norm": 6.750943660736084, + "learning_rate": 1.975456385801849e-05, + "loss": 0.3275, + "step": 13088 + }, + { + "epoch": 0.1636290907272682, + "grad_norm": 4.556063175201416, + "learning_rate": 1.9754371661127812e-05, + "loss": 1.4772, + "step": 13090 + }, + { + "epoch": 0.16365409135228381, + "grad_norm": 3.544004440307617, + "learning_rate": 1.9754179389949223e-05, + "loss": 0.9741, + "step": 13092 + }, + { + "epoch": 0.16367909197729943, + "grad_norm": 3.39568829536438, + "learning_rate": 1.975398704448418e-05, + "loss": 1.6291, + "step": 13094 + }, + { + "epoch": 0.16370409260231505, + "grad_norm": 4.072340488433838, + "learning_rate": 1.9753794624734152e-05, + "loss": 0.8229, + "step": 13096 + }, + { + "epoch": 0.16372909322733067, + "grad_norm": 3.954843759536743, + "learning_rate": 1.97536021307006e-05, + "loss": 1.8383, + "step": 13098 + }, + { + "epoch": 0.16375409385234632, + "grad_norm": 2.467728853225708, + "learning_rate": 1.9753409562385e-05, + "loss": 1.2443, + "step": 13100 + }, + { + "epoch": 0.16377909447736194, + "grad_norm": 4.4684672355651855, + "learning_rate": 1.9753216919788808e-05, + "loss": 1.0601, + "step": 13102 + }, + { + "epoch": 0.16380409510237756, + "grad_norm": 8.091965675354004, + "learning_rate": 1.9753024202913493e-05, + "loss": 0.7176, + "step": 13104 + }, + { + "epoch": 0.16382909572739318, + "grad_norm": 2.1532092094421387, + "learning_rate": 1.975283141176053e-05, + "loss": 0.8528, + "step": 13106 + }, + { + "epoch": 0.1638540963524088, + "grad_norm": 0.018264753744006157, + "learning_rate": 1.9752638546331377e-05, + "loss": 0.1331, + "step": 13108 + }, + { + "epoch": 0.16387909697742445, + "grad_norm": 3.515962600708008, + "learning_rate": 1.975244560662751e-05, + "loss": 0.5726, + "step": 13110 + }, + { + "epoch": 0.16390409760244007, + "grad_norm": 2.786180019378662, + "learning_rate": 1.97522525926504e-05, + "loss": 1.5222, + "step": 13112 + }, + { + "epoch": 0.1639290982274557, + "grad_norm": 6.4905877113342285, + "learning_rate": 1.9752059504401508e-05, + "loss": 0.4471, + "step": 13114 + }, + { + "epoch": 0.1639540988524713, + "grad_norm": 5.027302265167236, + "learning_rate": 1.975186634188231e-05, + "loss": 1.1453, + "step": 13116 + }, + { + "epoch": 0.16397909947748693, + "grad_norm": 3.891510248184204, + "learning_rate": 1.975167310509428e-05, + "loss": 1.6841, + "step": 13118 + }, + { + "epoch": 0.16400410010250258, + "grad_norm": 4.683281898498535, + "learning_rate": 1.9751479794038885e-05, + "loss": 2.47, + "step": 13120 + }, + { + "epoch": 0.1640291007275182, + "grad_norm": 5.194438457489014, + "learning_rate": 1.9751286408717602e-05, + "loss": 1.0428, + "step": 13122 + }, + { + "epoch": 0.16405410135253382, + "grad_norm": 0.0016222124686464667, + "learning_rate": 1.9751092949131896e-05, + "loss": 0.1062, + "step": 13124 + }, + { + "epoch": 0.16407910197754944, + "grad_norm": 2.5290017127990723, + "learning_rate": 1.975089941528325e-05, + "loss": 0.8907, + "step": 13126 + }, + { + "epoch": 0.16410410260256506, + "grad_norm": 1.9022181034088135, + "learning_rate": 1.9750705807173126e-05, + "loss": 0.6929, + "step": 13128 + }, + { + "epoch": 0.1641291032275807, + "grad_norm": 3.240288734436035, + "learning_rate": 1.9750512124803013e-05, + "loss": 0.7995, + "step": 13130 + }, + { + "epoch": 0.16415410385259632, + "grad_norm": 3.0101964473724365, + "learning_rate": 1.9750318368174372e-05, + "loss": 0.4448, + "step": 13132 + }, + { + "epoch": 0.16417910447761194, + "grad_norm": 2.3409926891326904, + "learning_rate": 1.9750124537288687e-05, + "loss": 1.4567, + "step": 13134 + }, + { + "epoch": 0.16420410510262756, + "grad_norm": 3.5114145278930664, + "learning_rate": 1.9749930632147435e-05, + "loss": 1.7387, + "step": 13136 + }, + { + "epoch": 0.16422910572764318, + "grad_norm": 1.4470285177230835, + "learning_rate": 1.974973665275209e-05, + "loss": 0.0673, + "step": 13138 + }, + { + "epoch": 0.16425410635265883, + "grad_norm": 2.8574893474578857, + "learning_rate": 1.9749542599104126e-05, + "loss": 0.4902, + "step": 13140 + }, + { + "epoch": 0.16427910697767445, + "grad_norm": 2.872610330581665, + "learning_rate": 1.9749348471205027e-05, + "loss": 0.0862, + "step": 13142 + }, + { + "epoch": 0.16430410760269007, + "grad_norm": 0.9179526567459106, + "learning_rate": 1.9749154269056266e-05, + "loss": 0.9764, + "step": 13144 + }, + { + "epoch": 0.1643291082277057, + "grad_norm": 2.4059414863586426, + "learning_rate": 1.974895999265933e-05, + "loss": 0.7642, + "step": 13146 + }, + { + "epoch": 0.1643541088527213, + "grad_norm": 2.8315958976745605, + "learning_rate": 1.9748765642015687e-05, + "loss": 0.4863, + "step": 13148 + }, + { + "epoch": 0.16437910947773696, + "grad_norm": 10.898630142211914, + "learning_rate": 1.974857121712683e-05, + "loss": 1.1918, + "step": 13150 + }, + { + "epoch": 0.16440411010275258, + "grad_norm": 0.000733657565433532, + "learning_rate": 1.974837671799423e-05, + "loss": 1.2945, + "step": 13152 + }, + { + "epoch": 0.1644291107277682, + "grad_norm": 0.015272476710379124, + "learning_rate": 1.974818214461937e-05, + "loss": 0.8511, + "step": 13154 + }, + { + "epoch": 0.16445411135278382, + "grad_norm": 2.518409013748169, + "learning_rate": 1.9747987497003733e-05, + "loss": 0.2431, + "step": 13156 + }, + { + "epoch": 0.16447911197779944, + "grad_norm": 3.768606662750244, + "learning_rate": 1.9747792775148804e-05, + "loss": 1.7206, + "step": 13158 + }, + { + "epoch": 0.16450411260281508, + "grad_norm": 4.075292110443115, + "learning_rate": 1.9747597979056065e-05, + "loss": 0.2585, + "step": 13160 + }, + { + "epoch": 0.1645291132278307, + "grad_norm": 4.590107440948486, + "learning_rate": 1.9747403108726994e-05, + "loss": 1.7697, + "step": 13162 + }, + { + "epoch": 0.16455411385284632, + "grad_norm": 3.7050156593322754, + "learning_rate": 1.974720816416308e-05, + "loss": 1.7687, + "step": 13164 + }, + { + "epoch": 0.16457911447786194, + "grad_norm": 2.8462743759155273, + "learning_rate": 1.974701314536581e-05, + "loss": 1.6677, + "step": 13166 + }, + { + "epoch": 0.16460411510287756, + "grad_norm": 0.0018741668900474906, + "learning_rate": 1.9746818052336666e-05, + "loss": 0.0001, + "step": 13168 + }, + { + "epoch": 0.1646291157278932, + "grad_norm": 5.658571720123291, + "learning_rate": 1.9746622885077132e-05, + "loss": 0.7203, + "step": 13170 + }, + { + "epoch": 0.16465411635290883, + "grad_norm": 0.0015844723675400019, + "learning_rate": 1.9746427643588696e-05, + "loss": 0.6287, + "step": 13172 + }, + { + "epoch": 0.16467911697792445, + "grad_norm": 2.5698788166046143, + "learning_rate": 1.974623232787285e-05, + "loss": 0.4865, + "step": 13174 + }, + { + "epoch": 0.16470411760294007, + "grad_norm": 5.004791736602783, + "learning_rate": 1.974603693793107e-05, + "loss": 1.9187, + "step": 13176 + }, + { + "epoch": 0.1647291182279557, + "grad_norm": 3.239147901535034, + "learning_rate": 1.9745841473764855e-05, + "loss": 0.184, + "step": 13178 + }, + { + "epoch": 0.16475411885297134, + "grad_norm": 0.3458101451396942, + "learning_rate": 1.9745645935375688e-05, + "loss": 0.0299, + "step": 13180 + }, + { + "epoch": 0.16477911947798696, + "grad_norm": 3.557575225830078, + "learning_rate": 1.974545032276506e-05, + "loss": 1.101, + "step": 13182 + }, + { + "epoch": 0.16480412010300258, + "grad_norm": 4.855677127838135, + "learning_rate": 1.974525463593446e-05, + "loss": 1.1821, + "step": 13184 + }, + { + "epoch": 0.1648291207280182, + "grad_norm": 5.388638019561768, + "learning_rate": 1.974505887488538e-05, + "loss": 2.5081, + "step": 13186 + }, + { + "epoch": 0.16485412135303382, + "grad_norm": 4.459591865539551, + "learning_rate": 1.9744863039619312e-05, + "loss": 0.9575, + "step": 13188 + }, + { + "epoch": 0.16487912197804946, + "grad_norm": 2.683711290359497, + "learning_rate": 1.974466713013774e-05, + "loss": 1.0166, + "step": 13190 + }, + { + "epoch": 0.16490412260306508, + "grad_norm": 5.677315711975098, + "learning_rate": 1.9744471146442165e-05, + "loss": 1.6194, + "step": 13192 + }, + { + "epoch": 0.1649291232280807, + "grad_norm": 2.1450321674346924, + "learning_rate": 1.9744275088534074e-05, + "loss": 1.4427, + "step": 13194 + }, + { + "epoch": 0.16495412385309632, + "grad_norm": 4.889382362365723, + "learning_rate": 1.974407895641496e-05, + "loss": 0.2335, + "step": 13196 + }, + { + "epoch": 0.16497912447811194, + "grad_norm": 4.240497589111328, + "learning_rate": 1.9743882750086323e-05, + "loss": 1.643, + "step": 13198 + }, + { + "epoch": 0.1650041251031276, + "grad_norm": 3.8747847080230713, + "learning_rate": 1.974368646954965e-05, + "loss": 0.8151, + "step": 13200 + }, + { + "epoch": 0.1650291257281432, + "grad_norm": 7.974327564239502, + "learning_rate": 1.974349011480644e-05, + "loss": 1.0003, + "step": 13202 + }, + { + "epoch": 0.16505412635315883, + "grad_norm": 1.6608339548110962, + "learning_rate": 1.974329368585819e-05, + "loss": 1.4405, + "step": 13204 + }, + { + "epoch": 0.16507912697817445, + "grad_norm": 2.3631021976470947, + "learning_rate": 1.9743097182706385e-05, + "loss": 0.1892, + "step": 13206 + }, + { + "epoch": 0.16510412760319007, + "grad_norm": 2.3252503871917725, + "learning_rate": 1.9742900605352537e-05, + "loss": 0.8776, + "step": 13208 + }, + { + "epoch": 0.16512912822820572, + "grad_norm": 3.3320624828338623, + "learning_rate": 1.974270395379813e-05, + "loss": 0.4679, + "step": 13210 + }, + { + "epoch": 0.16515412885322134, + "grad_norm": 0.0032268420327454805, + "learning_rate": 1.9742507228044673e-05, + "loss": 0.0686, + "step": 13212 + }, + { + "epoch": 0.16517912947823696, + "grad_norm": 1.9888734817504883, + "learning_rate": 1.9742310428093656e-05, + "loss": 0.7294, + "step": 13214 + }, + { + "epoch": 0.16520413010325258, + "grad_norm": 4.7209882736206055, + "learning_rate": 1.974211355394658e-05, + "loss": 1.6637, + "step": 13216 + }, + { + "epoch": 0.1652291307282682, + "grad_norm": 3.7897346019744873, + "learning_rate": 1.9741916605604947e-05, + "loss": 0.3996, + "step": 13218 + }, + { + "epoch": 0.16525413135328385, + "grad_norm": 3.7856674194335938, + "learning_rate": 1.974171958307025e-05, + "loss": 2.2561, + "step": 13220 + }, + { + "epoch": 0.16527913197829947, + "grad_norm": 2.4369330406188965, + "learning_rate": 1.9741522486344e-05, + "loss": 1.086, + "step": 13222 + }, + { + "epoch": 0.16530413260331508, + "grad_norm": 0.0025268406607210636, + "learning_rate": 1.9741325315427692e-05, + "loss": 0.7836, + "step": 13224 + }, + { + "epoch": 0.1653291332283307, + "grad_norm": 0.0007088163401931524, + "learning_rate": 1.9741128070322823e-05, + "loss": 0.4089, + "step": 13226 + }, + { + "epoch": 0.16535413385334632, + "grad_norm": 2.330016851425171, + "learning_rate": 1.9740930751030905e-05, + "loss": 1.0427, + "step": 13228 + }, + { + "epoch": 0.16537913447836197, + "grad_norm": 2.8455913066864014, + "learning_rate": 1.9740733357553433e-05, + "loss": 1.5183, + "step": 13230 + }, + { + "epoch": 0.1654041351033776, + "grad_norm": 0.07465513795614243, + "learning_rate": 1.9740535889891914e-05, + "loss": 1.1489, + "step": 13232 + }, + { + "epoch": 0.1654291357283932, + "grad_norm": 4.387430191040039, + "learning_rate": 1.974033834804785e-05, + "loss": 1.5875, + "step": 13234 + }, + { + "epoch": 0.16545413635340883, + "grad_norm": 3.20906400680542, + "learning_rate": 1.974014073202275e-05, + "loss": 1.5611, + "step": 13236 + }, + { + "epoch": 0.16547913697842445, + "grad_norm": 2.9614672660827637, + "learning_rate": 1.9739943041818113e-05, + "loss": 1.27, + "step": 13238 + }, + { + "epoch": 0.1655041376034401, + "grad_norm": 2.834685802459717, + "learning_rate": 1.9739745277435447e-05, + "loss": 1.0764, + "step": 13240 + }, + { + "epoch": 0.16552913822845572, + "grad_norm": 4.509789943695068, + "learning_rate": 1.973954743887626e-05, + "loss": 1.9203, + "step": 13242 + }, + { + "epoch": 0.16555413885347134, + "grad_norm": 3.600316047668457, + "learning_rate": 1.9739349526142054e-05, + "loss": 0.8753, + "step": 13244 + }, + { + "epoch": 0.16557913947848696, + "grad_norm": 2.4071247577667236, + "learning_rate": 1.973915153923434e-05, + "loss": 2.2704, + "step": 13246 + }, + { + "epoch": 0.16560414010350258, + "grad_norm": 3.1522881984710693, + "learning_rate": 1.9738953478154625e-05, + "loss": 1.0047, + "step": 13248 + }, + { + "epoch": 0.16562914072851823, + "grad_norm": 4.115470886230469, + "learning_rate": 1.973875534290442e-05, + "loss": 0.955, + "step": 13250 + }, + { + "epoch": 0.16565414135353385, + "grad_norm": 0.005168826784938574, + "learning_rate": 1.973855713348523e-05, + "loss": 0.3894, + "step": 13252 + }, + { + "epoch": 0.16567914197854947, + "grad_norm": 4.3382182121276855, + "learning_rate": 1.9738358849898567e-05, + "loss": 0.8396, + "step": 13254 + }, + { + "epoch": 0.16570414260356509, + "grad_norm": 0.0023323791101574898, + "learning_rate": 1.973816049214594e-05, + "loss": 0.7015, + "step": 13256 + }, + { + "epoch": 0.1657291432285807, + "grad_norm": 3.2671024799346924, + "learning_rate": 1.9737962060228856e-05, + "loss": 0.8987, + "step": 13258 + }, + { + "epoch": 0.16575414385359635, + "grad_norm": 2.861523389816284, + "learning_rate": 1.9737763554148834e-05, + "loss": 1.0173, + "step": 13260 + }, + { + "epoch": 0.16577914447861197, + "grad_norm": 9.529797554016113, + "learning_rate": 1.9737564973907384e-05, + "loss": 1.0364, + "step": 13262 + }, + { + "epoch": 0.1658041451036276, + "grad_norm": 4.296408653259277, + "learning_rate": 1.973736631950601e-05, + "loss": 0.3766, + "step": 13264 + }, + { + "epoch": 0.1658291457286432, + "grad_norm": 2.5125882625579834, + "learning_rate": 1.9737167590946235e-05, + "loss": 0.432, + "step": 13266 + }, + { + "epoch": 0.16585414635365883, + "grad_norm": 4.3758368492126465, + "learning_rate": 1.9736968788229565e-05, + "loss": 1.4095, + "step": 13268 + }, + { + "epoch": 0.16587914697867448, + "grad_norm": 2.4103140830993652, + "learning_rate": 1.9736769911357526e-05, + "loss": 1.2729, + "step": 13270 + }, + { + "epoch": 0.1659041476036901, + "grad_norm": 0.4314862787723541, + "learning_rate": 1.9736570960331618e-05, + "loss": 0.9063, + "step": 13272 + }, + { + "epoch": 0.16592914822870572, + "grad_norm": 3.999037981033325, + "learning_rate": 1.973637193515336e-05, + "loss": 1.3403, + "step": 13274 + }, + { + "epoch": 0.16595414885372134, + "grad_norm": 2.730804920196533, + "learning_rate": 1.9736172835824275e-05, + "loss": 0.1606, + "step": 13276 + }, + { + "epoch": 0.16597914947873696, + "grad_norm": 0.008034095168113708, + "learning_rate": 1.973597366234587e-05, + "loss": 0.1222, + "step": 13278 + }, + { + "epoch": 0.1660041501037526, + "grad_norm": 2.625502109527588, + "learning_rate": 1.9735774414719674e-05, + "loss": 1.2161, + "step": 13280 + }, + { + "epoch": 0.16602915072876823, + "grad_norm": 3.2824206352233887, + "learning_rate": 1.973557509294719e-05, + "loss": 0.3771, + "step": 13282 + }, + { + "epoch": 0.16605415135378385, + "grad_norm": 3.3737521171569824, + "learning_rate": 1.9735375697029947e-05, + "loss": 0.693, + "step": 13284 + }, + { + "epoch": 0.16607915197879947, + "grad_norm": 2.629755973815918, + "learning_rate": 1.9735176226969456e-05, + "loss": 0.6654, + "step": 13286 + }, + { + "epoch": 0.1661041526038151, + "grad_norm": 1.6175501346588135, + "learning_rate": 1.973497668276724e-05, + "loss": 0.8853, + "step": 13288 + }, + { + "epoch": 0.16612915322883073, + "grad_norm": 3.1164603233337402, + "learning_rate": 1.9734777064424818e-05, + "loss": 0.8531, + "step": 13290 + }, + { + "epoch": 0.16615415385384635, + "grad_norm": 2.318531036376953, + "learning_rate": 1.9734577371943713e-05, + "loss": 0.0523, + "step": 13292 + }, + { + "epoch": 0.16617915447886197, + "grad_norm": 4.476472854614258, + "learning_rate": 1.973437760532544e-05, + "loss": 1.3249, + "step": 13294 + }, + { + "epoch": 0.1662041551038776, + "grad_norm": 2.0628578662872314, + "learning_rate": 1.9734177764571527e-05, + "loss": 0.7285, + "step": 13296 + }, + { + "epoch": 0.1662291557288932, + "grad_norm": 3.7190308570861816, + "learning_rate": 1.973397784968349e-05, + "loss": 1.1787, + "step": 13298 + }, + { + "epoch": 0.16625415635390886, + "grad_norm": 1.5399532318115234, + "learning_rate": 1.9733777860662857e-05, + "loss": 0.1718, + "step": 13300 + }, + { + "epoch": 0.16627915697892448, + "grad_norm": 4.702518939971924, + "learning_rate": 1.9733577797511146e-05, + "loss": 1.3057, + "step": 13302 + }, + { + "epoch": 0.1663041576039401, + "grad_norm": 3.3574609756469727, + "learning_rate": 1.9733377660229884e-05, + "loss": 1.7571, + "step": 13304 + }, + { + "epoch": 0.16632915822895572, + "grad_norm": 4.196248531341553, + "learning_rate": 1.9733177448820597e-05, + "loss": 1.679, + "step": 13306 + }, + { + "epoch": 0.16635415885397134, + "grad_norm": 2.610762357711792, + "learning_rate": 1.97329771632848e-05, + "loss": 0.6535, + "step": 13308 + }, + { + "epoch": 0.166379159478987, + "grad_norm": 5.754736423492432, + "learning_rate": 1.973277680362403e-05, + "loss": 1.5727, + "step": 13310 + }, + { + "epoch": 0.1664041601040026, + "grad_norm": 0.8976267576217651, + "learning_rate": 1.9732576369839805e-05, + "loss": 0.082, + "step": 13312 + }, + { + "epoch": 0.16642916072901823, + "grad_norm": 5.8506975173950195, + "learning_rate": 1.973237586193366e-05, + "loss": 1.7171, + "step": 13314 + }, + { + "epoch": 0.16645416135403385, + "grad_norm": 1.9139753580093384, + "learning_rate": 1.973217527990711e-05, + "loss": 0.3999, + "step": 13316 + }, + { + "epoch": 0.16647916197904947, + "grad_norm": 2.7435104846954346, + "learning_rate": 1.9731974623761695e-05, + "loss": 1.0134, + "step": 13318 + }, + { + "epoch": 0.16650416260406511, + "grad_norm": 5.1440229415893555, + "learning_rate": 1.9731773893498934e-05, + "loss": 0.4316, + "step": 13320 + }, + { + "epoch": 0.16652916322908073, + "grad_norm": 3.6426680088043213, + "learning_rate": 1.973157308912036e-05, + "loss": 0.7147, + "step": 13322 + }, + { + "epoch": 0.16655416385409635, + "grad_norm": 1.1375172138214111, + "learning_rate": 1.9731372210627504e-05, + "loss": 0.2482, + "step": 13324 + }, + { + "epoch": 0.16657916447911197, + "grad_norm": 4.016312599182129, + "learning_rate": 1.973117125802189e-05, + "loss": 2.3191, + "step": 13326 + }, + { + "epoch": 0.1666041651041276, + "grad_norm": 3.246812105178833, + "learning_rate": 1.9730970231305056e-05, + "loss": 1.3974, + "step": 13328 + }, + { + "epoch": 0.16662916572914324, + "grad_norm": 4.693193435668945, + "learning_rate": 1.9730769130478527e-05, + "loss": 1.9268, + "step": 13330 + }, + { + "epoch": 0.16665416635415886, + "grad_norm": 2.644463062286377, + "learning_rate": 1.9730567955543834e-05, + "loss": 1.013, + "step": 13332 + }, + { + "epoch": 0.16667916697917448, + "grad_norm": 3.2130370140075684, + "learning_rate": 1.9730366706502514e-05, + "loss": 1.3229, + "step": 13334 + }, + { + "epoch": 0.1667041676041901, + "grad_norm": 2.854257345199585, + "learning_rate": 1.9730165383356098e-05, + "loss": 0.5122, + "step": 13336 + }, + { + "epoch": 0.16672916822920572, + "grad_norm": 5.4009108543396, + "learning_rate": 1.9729963986106116e-05, + "loss": 2.0621, + "step": 13338 + }, + { + "epoch": 0.16675416885422137, + "grad_norm": 1.2216638326644897, + "learning_rate": 1.9729762514754104e-05, + "loss": 0.0746, + "step": 13340 + }, + { + "epoch": 0.166779169479237, + "grad_norm": 1.7969425916671753, + "learning_rate": 1.9729560969301596e-05, + "loss": 1.5795, + "step": 13342 + }, + { + "epoch": 0.1668041701042526, + "grad_norm": 4.578139781951904, + "learning_rate": 1.9729359349750132e-05, + "loss": 1.0862, + "step": 13344 + }, + { + "epoch": 0.16682917072926823, + "grad_norm": 2.470945119857788, + "learning_rate": 1.9729157656101238e-05, + "loss": 0.2, + "step": 13346 + }, + { + "epoch": 0.16685417135428385, + "grad_norm": 4.885252475738525, + "learning_rate": 1.9728955888356456e-05, + "loss": 0.6413, + "step": 13348 + }, + { + "epoch": 0.1668791719792995, + "grad_norm": 2.506301164627075, + "learning_rate": 1.9728754046517326e-05, + "loss": 1.0056, + "step": 13350 + }, + { + "epoch": 0.16690417260431512, + "grad_norm": 6.2984700202941895, + "learning_rate": 1.9728552130585377e-05, + "loss": 0.4881, + "step": 13352 + }, + { + "epoch": 0.16692917322933074, + "grad_norm": 3.1430411338806152, + "learning_rate": 1.9728350140562152e-05, + "loss": 0.5926, + "step": 13354 + }, + { + "epoch": 0.16695417385434635, + "grad_norm": 2.2103617191314697, + "learning_rate": 1.972814807644919e-05, + "loss": 0.8166, + "step": 13356 + }, + { + "epoch": 0.16697917447936197, + "grad_norm": 2.890827178955078, + "learning_rate": 1.9727945938248022e-05, + "loss": 0.6926, + "step": 13358 + }, + { + "epoch": 0.16700417510437762, + "grad_norm": 7.944486618041992, + "learning_rate": 1.9727743725960198e-05, + "loss": 1.392, + "step": 13360 + }, + { + "epoch": 0.16702917572939324, + "grad_norm": 1.7550896406173706, + "learning_rate": 1.9727541439587253e-05, + "loss": 0.7401, + "step": 13362 + }, + { + "epoch": 0.16705417635440886, + "grad_norm": 0.00501609779894352, + "learning_rate": 1.9727339079130726e-05, + "loss": 1.3325, + "step": 13364 + }, + { + "epoch": 0.16707917697942448, + "grad_norm": 0.002245655283331871, + "learning_rate": 1.9727136644592162e-05, + "loss": 0.2522, + "step": 13366 + }, + { + "epoch": 0.1671041776044401, + "grad_norm": 4.719552516937256, + "learning_rate": 1.97269341359731e-05, + "loss": 0.4533, + "step": 13368 + }, + { + "epoch": 0.16712917822945575, + "grad_norm": 1.5877207517623901, + "learning_rate": 1.9726731553275084e-05, + "loss": 0.514, + "step": 13370 + }, + { + "epoch": 0.16715417885447137, + "grad_norm": 5.255157470703125, + "learning_rate": 1.9726528896499656e-05, + "loss": 1.4011, + "step": 13372 + }, + { + "epoch": 0.167179179479487, + "grad_norm": 2.0453102588653564, + "learning_rate": 1.9726326165648357e-05, + "loss": 1.0171, + "step": 13374 + }, + { + "epoch": 0.1672041801045026, + "grad_norm": 10.605979919433594, + "learning_rate": 1.9726123360722737e-05, + "loss": 1.5306, + "step": 13376 + }, + { + "epoch": 0.16722918072951823, + "grad_norm": 0.6617851257324219, + "learning_rate": 1.9725920481724336e-05, + "loss": 0.5081, + "step": 13378 + }, + { + "epoch": 0.16725418135453388, + "grad_norm": 3.227130889892578, + "learning_rate": 1.97257175286547e-05, + "loss": 1.1888, + "step": 13380 + }, + { + "epoch": 0.1672791819795495, + "grad_norm": 5.086513996124268, + "learning_rate": 1.972551450151537e-05, + "loss": 1.9614, + "step": 13382 + }, + { + "epoch": 0.16730418260456512, + "grad_norm": 2.7034201622009277, + "learning_rate": 1.9725311400307902e-05, + "loss": 1.1267, + "step": 13384 + }, + { + "epoch": 0.16732918322958074, + "grad_norm": 6.120874881744385, + "learning_rate": 1.9725108225033835e-05, + "loss": 0.8678, + "step": 13386 + }, + { + "epoch": 0.16735418385459636, + "grad_norm": 3.3985729217529297, + "learning_rate": 1.972490497569472e-05, + "loss": 1.2376, + "step": 13388 + }, + { + "epoch": 0.167379184479612, + "grad_norm": 3.2691996097564697, + "learning_rate": 1.9724701652292104e-05, + "loss": 0.1715, + "step": 13390 + }, + { + "epoch": 0.16740418510462762, + "grad_norm": 3.309872627258301, + "learning_rate": 1.9724498254827536e-05, + "loss": 1.1208, + "step": 13392 + }, + { + "epoch": 0.16742918572964324, + "grad_norm": 0.004624841269105673, + "learning_rate": 1.9724294783302565e-05, + "loss": 1.0782, + "step": 13394 + }, + { + "epoch": 0.16745418635465886, + "grad_norm": 3.3129584789276123, + "learning_rate": 1.9724091237718738e-05, + "loss": 0.27, + "step": 13396 + }, + { + "epoch": 0.16747918697967448, + "grad_norm": 4.406829833984375, + "learning_rate": 1.972388761807761e-05, + "loss": 1.2498, + "step": 13398 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 3.134213447570801, + "learning_rate": 1.972368392438073e-05, + "loss": 1.169, + "step": 13400 + }, + { + "epoch": 0.16752918822970575, + "grad_norm": 2.30588960647583, + "learning_rate": 1.9723480156629643e-05, + "loss": 0.4035, + "step": 13402 + }, + { + "epoch": 0.16755418885472137, + "grad_norm": 3.757838010787964, + "learning_rate": 1.9723276314825906e-05, + "loss": 1.569, + "step": 13404 + }, + { + "epoch": 0.167579189479737, + "grad_norm": 2.615785598754883, + "learning_rate": 1.9723072398971076e-05, + "loss": 0.5716, + "step": 13406 + }, + { + "epoch": 0.1676041901047526, + "grad_norm": 2.937753915786743, + "learning_rate": 1.97228684090667e-05, + "loss": 0.6931, + "step": 13408 + }, + { + "epoch": 0.16762919072976826, + "grad_norm": 2.6342763900756836, + "learning_rate": 1.9722664345114337e-05, + "loss": 1.9052, + "step": 13410 + }, + { + "epoch": 0.16765419135478388, + "grad_norm": 4.127047061920166, + "learning_rate": 1.972246020711553e-05, + "loss": 1.2886, + "step": 13412 + }, + { + "epoch": 0.1676791919797995, + "grad_norm": 1.8695640563964844, + "learning_rate": 1.9722255995071843e-05, + "loss": 0.8329, + "step": 13414 + }, + { + "epoch": 0.16770419260481512, + "grad_norm": 3.2562949657440186, + "learning_rate": 1.9722051708984834e-05, + "loss": 1.2688, + "step": 13416 + }, + { + "epoch": 0.16772919322983074, + "grad_norm": 3.5878396034240723, + "learning_rate": 1.972184734885605e-05, + "loss": 0.3708, + "step": 13418 + }, + { + "epoch": 0.16775419385484638, + "grad_norm": 2.541181802749634, + "learning_rate": 1.972164291468705e-05, + "loss": 0.997, + "step": 13420 + }, + { + "epoch": 0.167779194479862, + "grad_norm": 4.703075885772705, + "learning_rate": 1.9721438406479393e-05, + "loss": 0.9945, + "step": 13422 + }, + { + "epoch": 0.16780419510487762, + "grad_norm": 5.3242998123168945, + "learning_rate": 1.972123382423464e-05, + "loss": 1.431, + "step": 13424 + }, + { + "epoch": 0.16782919572989324, + "grad_norm": 3.173090934753418, + "learning_rate": 1.972102916795434e-05, + "loss": 1.1897, + "step": 13426 + }, + { + "epoch": 0.16785419635490886, + "grad_norm": 4.860556602478027, + "learning_rate": 1.9720824437640062e-05, + "loss": 0.9846, + "step": 13428 + }, + { + "epoch": 0.1678791969799245, + "grad_norm": 0.7094460129737854, + "learning_rate": 1.9720619633293352e-05, + "loss": 1.136, + "step": 13430 + }, + { + "epoch": 0.16790419760494013, + "grad_norm": 2.7476930618286133, + "learning_rate": 1.9720414754915783e-05, + "loss": 1.792, + "step": 13432 + }, + { + "epoch": 0.16792919822995575, + "grad_norm": 7.319428443908691, + "learning_rate": 1.972020980250891e-05, + "loss": 1.7952, + "step": 13434 + }, + { + "epoch": 0.16795419885497137, + "grad_norm": 5.077286243438721, + "learning_rate": 1.9720004776074292e-05, + "loss": 1.7032, + "step": 13436 + }, + { + "epoch": 0.167979199479987, + "grad_norm": 2.7101857662200928, + "learning_rate": 1.971979967561349e-05, + "loss": 0.6906, + "step": 13438 + }, + { + "epoch": 0.16800420010500264, + "grad_norm": 0.007907148450613022, + "learning_rate": 1.9719594501128073e-05, + "loss": 0.1006, + "step": 13440 + }, + { + "epoch": 0.16802920073001826, + "grad_norm": 3.796678066253662, + "learning_rate": 1.9719389252619594e-05, + "loss": 0.2635, + "step": 13442 + }, + { + "epoch": 0.16805420135503388, + "grad_norm": 1.404205560684204, + "learning_rate": 1.9719183930089626e-05, + "loss": 0.5256, + "step": 13444 + }, + { + "epoch": 0.1680792019800495, + "grad_norm": 4.357640266418457, + "learning_rate": 1.9718978533539723e-05, + "loss": 1.4614, + "step": 13446 + }, + { + "epoch": 0.16810420260506512, + "grad_norm": 3.462108612060547, + "learning_rate": 1.9718773062971456e-05, + "loss": 1.7257, + "step": 13448 + }, + { + "epoch": 0.16812920323008076, + "grad_norm": 2.8585898876190186, + "learning_rate": 1.9718567518386387e-05, + "loss": 0.8226, + "step": 13450 + }, + { + "epoch": 0.16815420385509638, + "grad_norm": 3.2312536239624023, + "learning_rate": 1.9718361899786082e-05, + "loss": 1.6137, + "step": 13452 + }, + { + "epoch": 0.168179204480112, + "grad_norm": 2.067203998565674, + "learning_rate": 1.971815620717211e-05, + "loss": 0.9245, + "step": 13454 + }, + { + "epoch": 0.16820420510512762, + "grad_norm": 3.111468553543091, + "learning_rate": 1.971795044054603e-05, + "loss": 0.603, + "step": 13456 + }, + { + "epoch": 0.16822920573014324, + "grad_norm": 3.5117783546447754, + "learning_rate": 1.9717744599909418e-05, + "loss": 1.5701, + "step": 13458 + }, + { + "epoch": 0.1682542063551589, + "grad_norm": 3.127042770385742, + "learning_rate": 1.9717538685263837e-05, + "loss": 0.8011, + "step": 13460 + }, + { + "epoch": 0.1682792069801745, + "grad_norm": 2.581930637359619, + "learning_rate": 1.9717332696610852e-05, + "loss": 2.0936, + "step": 13462 + }, + { + "epoch": 0.16830420760519013, + "grad_norm": 3.8122944831848145, + "learning_rate": 1.9717126633952034e-05, + "loss": 1.8175, + "step": 13464 + }, + { + "epoch": 0.16832920823020575, + "grad_norm": 3.1756277084350586, + "learning_rate": 1.971692049728896e-05, + "loss": 1.7264, + "step": 13466 + }, + { + "epoch": 0.16835420885522137, + "grad_norm": 3.090787649154663, + "learning_rate": 1.971671428662319e-05, + "loss": 0.599, + "step": 13468 + }, + { + "epoch": 0.16837920948023702, + "grad_norm": 0.5817862749099731, + "learning_rate": 1.97165080019563e-05, + "loss": 0.4595, + "step": 13470 + }, + { + "epoch": 0.16840421010525264, + "grad_norm": 1.6487393379211426, + "learning_rate": 1.9716301643289854e-05, + "loss": 0.1381, + "step": 13472 + }, + { + "epoch": 0.16842921073026826, + "grad_norm": 4.276501178741455, + "learning_rate": 1.9716095210625434e-05, + "loss": 1.5971, + "step": 13474 + }, + { + "epoch": 0.16845421135528388, + "grad_norm": 0.003901731688529253, + "learning_rate": 1.9715888703964606e-05, + "loss": 0.2747, + "step": 13476 + }, + { + "epoch": 0.1684792119802995, + "grad_norm": 2.7628540992736816, + "learning_rate": 1.9715682123308938e-05, + "loss": 1.0702, + "step": 13478 + }, + { + "epoch": 0.16850421260531515, + "grad_norm": 0.014777745120227337, + "learning_rate": 1.9715475468660016e-05, + "loss": 0.296, + "step": 13480 + }, + { + "epoch": 0.16852921323033077, + "grad_norm": 0.6893850564956665, + "learning_rate": 1.9715268740019404e-05, + "loss": 0.503, + "step": 13482 + }, + { + "epoch": 0.16855421385534639, + "grad_norm": 0.011087833903729916, + "learning_rate": 1.971506193738868e-05, + "loss": 0.0817, + "step": 13484 + }, + { + "epoch": 0.168579214480362, + "grad_norm": 3.8908443450927734, + "learning_rate": 1.9714855060769417e-05, + "loss": 1.8664, + "step": 13486 + }, + { + "epoch": 0.16860421510537763, + "grad_norm": 2.648319721221924, + "learning_rate": 1.971464811016319e-05, + "loss": 0.3448, + "step": 13488 + }, + { + "epoch": 0.16862921573039327, + "grad_norm": 3.436668634414673, + "learning_rate": 1.971444108557158e-05, + "loss": 0.3008, + "step": 13490 + }, + { + "epoch": 0.1686542163554089, + "grad_norm": 2.2938618659973145, + "learning_rate": 1.9714233986996162e-05, + "loss": 0.3913, + "step": 13492 + }, + { + "epoch": 0.1686792169804245, + "grad_norm": 0.21744515001773834, + "learning_rate": 1.971402681443851e-05, + "loss": 0.6359, + "step": 13494 + }, + { + "epoch": 0.16870421760544013, + "grad_norm": 4.154667377471924, + "learning_rate": 1.9713819567900198e-05, + "loss": 1.0177, + "step": 13496 + }, + { + "epoch": 0.16872921823045575, + "grad_norm": 3.156951904296875, + "learning_rate": 1.9713612247382816e-05, + "loss": 0.7688, + "step": 13498 + }, + { + "epoch": 0.1687542188554714, + "grad_norm": 4.327866077423096, + "learning_rate": 1.9713404852887935e-05, + "loss": 1.6551, + "step": 13500 + }, + { + "epoch": 0.16877921948048702, + "grad_norm": 0.007421849761158228, + "learning_rate": 1.9713197384417136e-05, + "loss": 0.7173, + "step": 13502 + }, + { + "epoch": 0.16880422010550264, + "grad_norm": 2.2013046741485596, + "learning_rate": 1.9712989841972002e-05, + "loss": 0.9912, + "step": 13504 + }, + { + "epoch": 0.16882922073051826, + "grad_norm": 0.011126822791993618, + "learning_rate": 1.9712782225554106e-05, + "loss": 0.6681, + "step": 13506 + }, + { + "epoch": 0.16885422135553388, + "grad_norm": 0.009458230808377266, + "learning_rate": 1.9712574535165035e-05, + "loss": 0.2, + "step": 13508 + }, + { + "epoch": 0.16887922198054953, + "grad_norm": 3.4632325172424316, + "learning_rate": 1.9712366770806374e-05, + "loss": 1.4269, + "step": 13510 + }, + { + "epoch": 0.16890422260556515, + "grad_norm": 2.5957515239715576, + "learning_rate": 1.97121589324797e-05, + "loss": 0.8899, + "step": 13512 + }, + { + "epoch": 0.16892922323058077, + "grad_norm": 4.938492774963379, + "learning_rate": 1.9711951020186594e-05, + "loss": 0.9429, + "step": 13514 + }, + { + "epoch": 0.16895422385559639, + "grad_norm": 3.32066011428833, + "learning_rate": 1.9711743033928644e-05, + "loss": 0.9018, + "step": 13516 + }, + { + "epoch": 0.168979224480612, + "grad_norm": 2.9054908752441406, + "learning_rate": 1.971153497370743e-05, + "loss": 0.3454, + "step": 13518 + }, + { + "epoch": 0.16900422510562765, + "grad_norm": 3.335430383682251, + "learning_rate": 1.9711326839524542e-05, + "loss": 0.8966, + "step": 13520 + }, + { + "epoch": 0.16902922573064327, + "grad_norm": 3.185012102127075, + "learning_rate": 1.971111863138156e-05, + "loss": 0.4819, + "step": 13522 + }, + { + "epoch": 0.1690542263556589, + "grad_norm": 3.4057936668395996, + "learning_rate": 1.9710910349280075e-05, + "loss": 1.6015, + "step": 13524 + }, + { + "epoch": 0.1690792269806745, + "grad_norm": 5.757046699523926, + "learning_rate": 1.971070199322167e-05, + "loss": 1.4181, + "step": 13526 + }, + { + "epoch": 0.16910422760569013, + "grad_norm": 3.4021310806274414, + "learning_rate": 1.9710493563207932e-05, + "loss": 1.2317, + "step": 13528 + }, + { + "epoch": 0.16912922823070578, + "grad_norm": 3.366118907928467, + "learning_rate": 1.9710285059240447e-05, + "loss": 0.6417, + "step": 13530 + }, + { + "epoch": 0.1691542288557214, + "grad_norm": 3.6745197772979736, + "learning_rate": 1.97100764813208e-05, + "loss": 0.782, + "step": 13532 + }, + { + "epoch": 0.16917922948073702, + "grad_norm": 0.7685943841934204, + "learning_rate": 1.970986782945059e-05, + "loss": 0.9954, + "step": 13534 + }, + { + "epoch": 0.16920423010575264, + "grad_norm": 0.06594044715166092, + "learning_rate": 1.9709659103631394e-05, + "loss": 0.0132, + "step": 13536 + }, + { + "epoch": 0.16922923073076826, + "grad_norm": 0.14174044132232666, + "learning_rate": 1.970945030386481e-05, + "loss": 0.4086, + "step": 13538 + }, + { + "epoch": 0.1692542313557839, + "grad_norm": 0.008921940810978413, + "learning_rate": 1.9709241430152428e-05, + "loss": 0.4443, + "step": 13540 + }, + { + "epoch": 0.16927923198079953, + "grad_norm": 4.055429458618164, + "learning_rate": 1.9709032482495833e-05, + "loss": 1.7883, + "step": 13542 + }, + { + "epoch": 0.16930423260581515, + "grad_norm": 3.194103717803955, + "learning_rate": 1.970882346089662e-05, + "loss": 1.0244, + "step": 13544 + }, + { + "epoch": 0.16932923323083077, + "grad_norm": 4.229072093963623, + "learning_rate": 1.970861436535638e-05, + "loss": 1.7566, + "step": 13546 + }, + { + "epoch": 0.1693542338558464, + "grad_norm": 0.0034252156037837267, + "learning_rate": 1.9708405195876707e-05, + "loss": 0.4351, + "step": 13548 + }, + { + "epoch": 0.16937923448086203, + "grad_norm": 2.940643787384033, + "learning_rate": 1.9708195952459196e-05, + "loss": 0.5483, + "step": 13550 + }, + { + "epoch": 0.16940423510587765, + "grad_norm": 2.5789852142333984, + "learning_rate": 1.9707986635105433e-05, + "loss": 1.0819, + "step": 13552 + }, + { + "epoch": 0.16942923573089327, + "grad_norm": 0.001761921914294362, + "learning_rate": 1.970777724381702e-05, + "loss": 1.2195, + "step": 13554 + }, + { + "epoch": 0.1694542363559089, + "grad_norm": 2.5930240154266357, + "learning_rate": 1.9707567778595545e-05, + "loss": 1.2026, + "step": 13556 + }, + { + "epoch": 0.1694792369809245, + "grad_norm": 3.110666513442993, + "learning_rate": 1.9707358239442606e-05, + "loss": 0.8702, + "step": 13558 + }, + { + "epoch": 0.16950423760594016, + "grad_norm": 4.257696151733398, + "learning_rate": 1.97071486263598e-05, + "loss": 1.1782, + "step": 13560 + }, + { + "epoch": 0.16952923823095578, + "grad_norm": 3.34714674949646, + "learning_rate": 1.9706938939348727e-05, + "loss": 1.1923, + "step": 13562 + }, + { + "epoch": 0.1695542388559714, + "grad_norm": 0.7564710378646851, + "learning_rate": 1.9706729178410975e-05, + "loss": 0.6097, + "step": 13564 + }, + { + "epoch": 0.16957923948098702, + "grad_norm": 1.0650315284729004, + "learning_rate": 1.970651934354815e-05, + "loss": 0.0446, + "step": 13566 + }, + { + "epoch": 0.16960424010600264, + "grad_norm": 15.648641586303711, + "learning_rate": 1.9706309434761845e-05, + "loss": 0.4023, + "step": 13568 + }, + { + "epoch": 0.1696292407310183, + "grad_norm": 3.8026981353759766, + "learning_rate": 1.970609945205366e-05, + "loss": 0.7841, + "step": 13570 + }, + { + "epoch": 0.1696542413560339, + "grad_norm": 4.06581449508667, + "learning_rate": 1.9705889395425192e-05, + "loss": 1.6826, + "step": 13572 + }, + { + "epoch": 0.16967924198104953, + "grad_norm": 2.148383855819702, + "learning_rate": 1.9705679264878044e-05, + "loss": 0.3225, + "step": 13574 + }, + { + "epoch": 0.16970424260606515, + "grad_norm": 4.081587314605713, + "learning_rate": 1.9705469060413817e-05, + "loss": 1.4638, + "step": 13576 + }, + { + "epoch": 0.16972924323108077, + "grad_norm": 2.654517650604248, + "learning_rate": 1.9705258782034106e-05, + "loss": 1.5906, + "step": 13578 + }, + { + "epoch": 0.16975424385609642, + "grad_norm": 4.076125621795654, + "learning_rate": 1.9705048429740522e-05, + "loss": 2.1502, + "step": 13580 + }, + { + "epoch": 0.16977924448111203, + "grad_norm": 0.00628284364938736, + "learning_rate": 1.9704838003534657e-05, + "loss": 0.0215, + "step": 13582 + }, + { + "epoch": 0.16980424510612765, + "grad_norm": 1.440614104270935, + "learning_rate": 1.970462750341812e-05, + "loss": 0.1127, + "step": 13584 + }, + { + "epoch": 0.16982924573114327, + "grad_norm": 6.049037456512451, + "learning_rate": 1.9704416929392513e-05, + "loss": 1.0747, + "step": 13586 + }, + { + "epoch": 0.1698542463561589, + "grad_norm": 3.4260709285736084, + "learning_rate": 1.9704206281459436e-05, + "loss": 1.5195, + "step": 13588 + }, + { + "epoch": 0.16987924698117454, + "grad_norm": 3.5727462768554688, + "learning_rate": 1.97039955596205e-05, + "loss": 1.1892, + "step": 13590 + }, + { + "epoch": 0.16990424760619016, + "grad_norm": 3.4052016735076904, + "learning_rate": 1.9703784763877306e-05, + "loss": 0.9646, + "step": 13592 + }, + { + "epoch": 0.16992924823120578, + "grad_norm": 3.1013824939727783, + "learning_rate": 1.9703573894231454e-05, + "loss": 0.3132, + "step": 13594 + }, + { + "epoch": 0.1699542488562214, + "grad_norm": 1.7780685424804688, + "learning_rate": 1.970336295068456e-05, + "loss": 0.2158, + "step": 13596 + }, + { + "epoch": 0.16997924948123702, + "grad_norm": 3.4516079425811768, + "learning_rate": 1.970315193323823e-05, + "loss": 0.9377, + "step": 13598 + }, + { + "epoch": 0.17000425010625267, + "grad_norm": 6.346052169799805, + "learning_rate": 1.970294084189406e-05, + "loss": 0.6013, + "step": 13600 + }, + { + "epoch": 0.1700292507312683, + "grad_norm": 4.5400872230529785, + "learning_rate": 1.9702729676653665e-05, + "loss": 1.0523, + "step": 13602 + }, + { + "epoch": 0.1700542513562839, + "grad_norm": 1.9164724349975586, + "learning_rate": 1.9702518437518656e-05, + "loss": 1.548, + "step": 13604 + }, + { + "epoch": 0.17007925198129953, + "grad_norm": 5.048062801361084, + "learning_rate": 1.970230712449064e-05, + "loss": 1.5521, + "step": 13606 + }, + { + "epoch": 0.17010425260631515, + "grad_norm": 2.8429291248321533, + "learning_rate": 1.970209573757122e-05, + "loss": 1.7285, + "step": 13608 + }, + { + "epoch": 0.1701292532313308, + "grad_norm": 2.064819812774658, + "learning_rate": 1.9701884276762017e-05, + "loss": 0.2704, + "step": 13610 + }, + { + "epoch": 0.17015425385634642, + "grad_norm": 5.328028202056885, + "learning_rate": 1.970167274206463e-05, + "loss": 0.9836, + "step": 13612 + }, + { + "epoch": 0.17017925448136204, + "grad_norm": 5.255513668060303, + "learning_rate": 1.9701461133480677e-05, + "loss": 0.4874, + "step": 13614 + }, + { + "epoch": 0.17020425510637766, + "grad_norm": 4.437098503112793, + "learning_rate": 1.9701249451011767e-05, + "loss": 1.231, + "step": 13616 + }, + { + "epoch": 0.17022925573139328, + "grad_norm": 3.4767911434173584, + "learning_rate": 1.9701037694659514e-05, + "loss": 0.686, + "step": 13618 + }, + { + "epoch": 0.17025425635640892, + "grad_norm": 4.118288993835449, + "learning_rate": 1.970082586442553e-05, + "loss": 0.6583, + "step": 13620 + }, + { + "epoch": 0.17027925698142454, + "grad_norm": 3.2286460399627686, + "learning_rate": 1.970061396031143e-05, + "loss": 0.9703, + "step": 13622 + }, + { + "epoch": 0.17030425760644016, + "grad_norm": 3.0948705673217773, + "learning_rate": 1.970040198231882e-05, + "loss": 1.6433, + "step": 13624 + }, + { + "epoch": 0.17032925823145578, + "grad_norm": 1.1560274362564087, + "learning_rate": 1.9700189930449326e-05, + "loss": 0.8414, + "step": 13626 + }, + { + "epoch": 0.1703542588564714, + "grad_norm": 0.005322488956153393, + "learning_rate": 1.9699977804704554e-05, + "loss": 0.822, + "step": 13628 + }, + { + "epoch": 0.17037925948148705, + "grad_norm": 3.8321053981781006, + "learning_rate": 1.9699765605086126e-05, + "loss": 0.6207, + "step": 13630 + }, + { + "epoch": 0.17040426010650267, + "grad_norm": 0.0030826570000499487, + "learning_rate": 1.969955333159565e-05, + "loss": 0.6788, + "step": 13632 + }, + { + "epoch": 0.1704292607315183, + "grad_norm": 3.098337173461914, + "learning_rate": 1.969934098423475e-05, + "loss": 1.6775, + "step": 13634 + }, + { + "epoch": 0.1704542613565339, + "grad_norm": 3.7025208473205566, + "learning_rate": 1.9699128563005042e-05, + "loss": 1.5189, + "step": 13636 + }, + { + "epoch": 0.17047926198154953, + "grad_norm": 5.243704319000244, + "learning_rate": 1.969891606790814e-05, + "loss": 0.3761, + "step": 13638 + }, + { + "epoch": 0.17050426260656518, + "grad_norm": 4.10504150390625, + "learning_rate": 1.969870349894567e-05, + "loss": 2.0503, + "step": 13640 + }, + { + "epoch": 0.1705292632315808, + "grad_norm": 5.312821388244629, + "learning_rate": 1.969849085611924e-05, + "loss": 0.9879, + "step": 13642 + }, + { + "epoch": 0.17055426385659642, + "grad_norm": 2.7487149238586426, + "learning_rate": 1.9698278139430477e-05, + "loss": 0.1339, + "step": 13644 + }, + { + "epoch": 0.17057926448161204, + "grad_norm": 0.3257158398628235, + "learning_rate": 1.9698065348881e-05, + "loss": 0.074, + "step": 13646 + }, + { + "epoch": 0.17060426510662766, + "grad_norm": 2.5022552013397217, + "learning_rate": 1.969785248447243e-05, + "loss": 0.4462, + "step": 13648 + }, + { + "epoch": 0.1706292657316433, + "grad_norm": 0.09891724586486816, + "learning_rate": 1.9697639546206383e-05, + "loss": 0.0054, + "step": 13650 + }, + { + "epoch": 0.17065426635665892, + "grad_norm": 2.2244017124176025, + "learning_rate": 1.9697426534084485e-05, + "loss": 0.307, + "step": 13652 + }, + { + "epoch": 0.17067926698167454, + "grad_norm": 1.4767659902572632, + "learning_rate": 1.9697213448108358e-05, + "loss": 0.8128, + "step": 13654 + }, + { + "epoch": 0.17070426760669016, + "grad_norm": 5.009930610656738, + "learning_rate": 1.9697000288279628e-05, + "loss": 1.4121, + "step": 13656 + }, + { + "epoch": 0.17072926823170578, + "grad_norm": 3.7092583179473877, + "learning_rate": 1.9696787054599912e-05, + "loss": 0.7859, + "step": 13658 + }, + { + "epoch": 0.17075426885672143, + "grad_norm": 0.0014343159273266792, + "learning_rate": 1.969657374707084e-05, + "loss": 0.4755, + "step": 13660 + }, + { + "epoch": 0.17077926948173705, + "grad_norm": 3.54649019241333, + "learning_rate": 1.969636036569403e-05, + "loss": 1.4115, + "step": 13662 + }, + { + "epoch": 0.17080427010675267, + "grad_norm": 10.379925727844238, + "learning_rate": 1.9696146910471112e-05, + "loss": 0.894, + "step": 13664 + }, + { + "epoch": 0.1708292707317683, + "grad_norm": 2.795198440551758, + "learning_rate": 1.9695933381403712e-05, + "loss": 0.841, + "step": 13666 + }, + { + "epoch": 0.1708542713567839, + "grad_norm": 1.3330644369125366, + "learning_rate": 1.969571977849345e-05, + "loss": 0.9105, + "step": 13668 + }, + { + "epoch": 0.17087927198179956, + "grad_norm": 4.227121829986572, + "learning_rate": 1.969550610174196e-05, + "loss": 0.3315, + "step": 13670 + }, + { + "epoch": 0.17090427260681518, + "grad_norm": 3.443685531616211, + "learning_rate": 1.9695292351150867e-05, + "loss": 1.2154, + "step": 13672 + }, + { + "epoch": 0.1709292732318308, + "grad_norm": 3.942075729370117, + "learning_rate": 1.96950785267218e-05, + "loss": 1.4815, + "step": 13674 + }, + { + "epoch": 0.17095427385684642, + "grad_norm": 0.26635998487472534, + "learning_rate": 1.9694864628456385e-05, + "loss": 1.0339, + "step": 13676 + }, + { + "epoch": 0.17097927448186204, + "grad_norm": 0.006010144017636776, + "learning_rate": 1.9694650656356252e-05, + "loss": 0.7681, + "step": 13678 + }, + { + "epoch": 0.17100427510687768, + "grad_norm": 3.511329412460327, + "learning_rate": 1.969443661042303e-05, + "loss": 0.8901, + "step": 13680 + }, + { + "epoch": 0.1710292757318933, + "grad_norm": 5.176652431488037, + "learning_rate": 1.9694222490658348e-05, + "loss": 1.5066, + "step": 13682 + }, + { + "epoch": 0.17105427635690892, + "grad_norm": 4.232545852661133, + "learning_rate": 1.969400829706384e-05, + "loss": 0.7505, + "step": 13684 + }, + { + "epoch": 0.17107927698192454, + "grad_norm": 2.1405608654022217, + "learning_rate": 1.9693794029641137e-05, + "loss": 1.0314, + "step": 13686 + }, + { + "epoch": 0.17110427760694016, + "grad_norm": 1.383185863494873, + "learning_rate": 1.9693579688391865e-05, + "loss": 0.7486, + "step": 13688 + }, + { + "epoch": 0.1711292782319558, + "grad_norm": 2.035769462585449, + "learning_rate": 1.9693365273317665e-05, + "loss": 0.3518, + "step": 13690 + }, + { + "epoch": 0.17115427885697143, + "grad_norm": 2.5322275161743164, + "learning_rate": 1.9693150784420166e-05, + "loss": 0.5358, + "step": 13692 + }, + { + "epoch": 0.17117927948198705, + "grad_norm": 1.8318157196044922, + "learning_rate": 1.9692936221701e-05, + "loss": 0.1487, + "step": 13694 + }, + { + "epoch": 0.17120428010700267, + "grad_norm": 3.156949520111084, + "learning_rate": 1.96927215851618e-05, + "loss": 1.2782, + "step": 13696 + }, + { + "epoch": 0.1712292807320183, + "grad_norm": 3.8234941959381104, + "learning_rate": 1.9692506874804208e-05, + "loss": 1.432, + "step": 13698 + }, + { + "epoch": 0.17125428135703394, + "grad_norm": 2.0419068336486816, + "learning_rate": 1.969229209062985e-05, + "loss": 0.6317, + "step": 13700 + }, + { + "epoch": 0.17127928198204956, + "grad_norm": 3.393023729324341, + "learning_rate": 1.9692077232640368e-05, + "loss": 1.3566, + "step": 13702 + }, + { + "epoch": 0.17130428260706518, + "grad_norm": 2.439770221710205, + "learning_rate": 1.9691862300837398e-05, + "loss": 1.0598, + "step": 13704 + }, + { + "epoch": 0.1713292832320808, + "grad_norm": 2.045900344848633, + "learning_rate": 1.9691647295222575e-05, + "loss": 0.5272, + "step": 13706 + }, + { + "epoch": 0.17135428385709642, + "grad_norm": 3.134639024734497, + "learning_rate": 1.9691432215797534e-05, + "loss": 0.8642, + "step": 13708 + }, + { + "epoch": 0.17137928448211207, + "grad_norm": 3.6353375911712646, + "learning_rate": 1.9691217062563915e-05, + "loss": 0.535, + "step": 13710 + }, + { + "epoch": 0.17140428510712769, + "grad_norm": 0.001546448445878923, + "learning_rate": 1.9691001835523363e-05, + "loss": 0.7059, + "step": 13712 + }, + { + "epoch": 0.1714292857321433, + "grad_norm": 4.768155574798584, + "learning_rate": 1.969078653467751e-05, + "loss": 2.2262, + "step": 13714 + }, + { + "epoch": 0.17145428635715892, + "grad_norm": 5.797440528869629, + "learning_rate": 1.969057116002799e-05, + "loss": 1.2909, + "step": 13716 + }, + { + "epoch": 0.17147928698217454, + "grad_norm": 0.0020071102771908045, + "learning_rate": 1.9690355711576455e-05, + "loss": 0.5522, + "step": 13718 + }, + { + "epoch": 0.1715042876071902, + "grad_norm": 1.1088093519210815, + "learning_rate": 1.9690140189324545e-05, + "loss": 0.028, + "step": 13720 + }, + { + "epoch": 0.1715292882322058, + "grad_norm": 2.7870137691497803, + "learning_rate": 1.9689924593273892e-05, + "loss": 0.7368, + "step": 13722 + }, + { + "epoch": 0.17155428885722143, + "grad_norm": 0.008097272366285324, + "learning_rate": 1.9689708923426148e-05, + "loss": 0.5864, + "step": 13724 + }, + { + "epoch": 0.17157928948223705, + "grad_norm": 0.2799019515514374, + "learning_rate": 1.968949317978295e-05, + "loss": 0.0361, + "step": 13726 + }, + { + "epoch": 0.17160429010725267, + "grad_norm": 2.4492526054382324, + "learning_rate": 1.968927736234594e-05, + "loss": 1.6657, + "step": 13728 + }, + { + "epoch": 0.17162929073226832, + "grad_norm": 3.875976800918579, + "learning_rate": 1.9689061471116765e-05, + "loss": 2.0374, + "step": 13730 + }, + { + "epoch": 0.17165429135728394, + "grad_norm": 2.787862539291382, + "learning_rate": 1.9688845506097067e-05, + "loss": 1.478, + "step": 13732 + }, + { + "epoch": 0.17167929198229956, + "grad_norm": 4.744863986968994, + "learning_rate": 1.9688629467288493e-05, + "loss": 3.0716, + "step": 13734 + }, + { + "epoch": 0.17170429260731518, + "grad_norm": 0.0013092707376927137, + "learning_rate": 1.9688413354692687e-05, + "loss": 0.4852, + "step": 13736 + }, + { + "epoch": 0.1717292932323308, + "grad_norm": 4.062909126281738, + "learning_rate": 1.9688197168311295e-05, + "loss": 1.5545, + "step": 13738 + }, + { + "epoch": 0.17175429385734645, + "grad_norm": 3.2566874027252197, + "learning_rate": 1.968798090814596e-05, + "loss": 0.9351, + "step": 13740 + }, + { + "epoch": 0.17177929448236207, + "grad_norm": 2.1268248558044434, + "learning_rate": 1.968776457419834e-05, + "loss": 1.2558, + "step": 13742 + }, + { + "epoch": 0.17180429510737769, + "grad_norm": 2.375840187072754, + "learning_rate": 1.9687548166470067e-05, + "loss": 0.2537, + "step": 13744 + }, + { + "epoch": 0.1718292957323933, + "grad_norm": 2.6391544342041016, + "learning_rate": 1.9687331684962803e-05, + "loss": 0.9311, + "step": 13746 + }, + { + "epoch": 0.17185429635740893, + "grad_norm": 2.494935989379883, + "learning_rate": 1.968711512967819e-05, + "loss": 0.4724, + "step": 13748 + }, + { + "epoch": 0.17187929698242457, + "grad_norm": 0.23413464426994324, + "learning_rate": 1.9686898500617875e-05, + "loss": 0.7458, + "step": 13750 + }, + { + "epoch": 0.1719042976074402, + "grad_norm": 2.074455738067627, + "learning_rate": 1.968668179778351e-05, + "loss": 1.1873, + "step": 13752 + }, + { + "epoch": 0.1719292982324558, + "grad_norm": 13.834395408630371, + "learning_rate": 1.968646502117675e-05, + "loss": 1.4079, + "step": 13754 + }, + { + "epoch": 0.17195429885747143, + "grad_norm": 2.790133237838745, + "learning_rate": 1.968624817079924e-05, + "loss": 0.5581, + "step": 13756 + }, + { + "epoch": 0.17197929948248705, + "grad_norm": 4.828736782073975, + "learning_rate": 1.9686031246652636e-05, + "loss": 0.8579, + "step": 13758 + }, + { + "epoch": 0.1720043001075027, + "grad_norm": 6.652040004730225, + "learning_rate": 1.9685814248738586e-05, + "loss": 0.1292, + "step": 13760 + }, + { + "epoch": 0.17202930073251832, + "grad_norm": 0.00254857842810452, + "learning_rate": 1.9685597177058745e-05, + "loss": 0.0579, + "step": 13762 + }, + { + "epoch": 0.17205430135753394, + "grad_norm": 0.37856394052505493, + "learning_rate": 1.9685380031614764e-05, + "loss": 0.512, + "step": 13764 + }, + { + "epoch": 0.17207930198254956, + "grad_norm": 8.666831970214844, + "learning_rate": 1.96851628124083e-05, + "loss": 0.9118, + "step": 13766 + }, + { + "epoch": 0.17210430260756518, + "grad_norm": 0.23004484176635742, + "learning_rate": 1.9684945519441005e-05, + "loss": 0.547, + "step": 13768 + }, + { + "epoch": 0.17212930323258083, + "grad_norm": 3.471992254257202, + "learning_rate": 1.9684728152714533e-05, + "loss": 1.304, + "step": 13770 + }, + { + "epoch": 0.17215430385759645, + "grad_norm": 2.1641886234283447, + "learning_rate": 1.9684510712230545e-05, + "loss": 0.472, + "step": 13772 + }, + { + "epoch": 0.17217930448261207, + "grad_norm": 3.4574663639068604, + "learning_rate": 1.968429319799069e-05, + "loss": 0.1998, + "step": 13774 + }, + { + "epoch": 0.1722043051076277, + "grad_norm": 2.099823474884033, + "learning_rate": 1.968407560999663e-05, + "loss": 0.1125, + "step": 13776 + }, + { + "epoch": 0.1722293057326433, + "grad_norm": 2.740487813949585, + "learning_rate": 1.9683857948250015e-05, + "loss": 1.2738, + "step": 13778 + }, + { + "epoch": 0.17225430635765895, + "grad_norm": 3.6923162937164307, + "learning_rate": 1.968364021275251e-05, + "loss": 1.2342, + "step": 13780 + }, + { + "epoch": 0.17227930698267457, + "grad_norm": 0.017164720222353935, + "learning_rate": 1.9683422403505773e-05, + "loss": 0.499, + "step": 13782 + }, + { + "epoch": 0.1723043076076902, + "grad_norm": 13.184992790222168, + "learning_rate": 1.9683204520511456e-05, + "loss": 0.1083, + "step": 13784 + }, + { + "epoch": 0.1723293082327058, + "grad_norm": 3.970876932144165, + "learning_rate": 1.9682986563771224e-05, + "loss": 1.0522, + "step": 13786 + }, + { + "epoch": 0.17235430885772143, + "grad_norm": 5.9867987632751465, + "learning_rate": 1.9682768533286736e-05, + "loss": 1.224, + "step": 13788 + }, + { + "epoch": 0.17237930948273708, + "grad_norm": 4.0938568115234375, + "learning_rate": 1.9682550429059654e-05, + "loss": 1.3719, + "step": 13790 + }, + { + "epoch": 0.1724043101077527, + "grad_norm": 2.8034119606018066, + "learning_rate": 1.9682332251091635e-05, + "loss": 1.8167, + "step": 13792 + }, + { + "epoch": 0.17242931073276832, + "grad_norm": 3.349118232727051, + "learning_rate": 1.9682113999384343e-05, + "loss": 0.602, + "step": 13794 + }, + { + "epoch": 0.17245431135778394, + "grad_norm": 1.4216914176940918, + "learning_rate": 1.968189567393944e-05, + "loss": 0.0774, + "step": 13796 + }, + { + "epoch": 0.17247931198279956, + "grad_norm": 4.440876483917236, + "learning_rate": 1.9681677274758584e-05, + "loss": 1.1801, + "step": 13798 + }, + { + "epoch": 0.1725043126078152, + "grad_norm": 1.6774829626083374, + "learning_rate": 1.968145880184345e-05, + "loss": 0.5701, + "step": 13800 + }, + { + "epoch": 0.17252931323283083, + "grad_norm": 4.132779121398926, + "learning_rate": 1.9681240255195692e-05, + "loss": 1.6162, + "step": 13802 + }, + { + "epoch": 0.17255431385784645, + "grad_norm": 0.0018346882425248623, + "learning_rate": 1.9681021634816978e-05, + "loss": 0.0001, + "step": 13804 + }, + { + "epoch": 0.17257931448286207, + "grad_norm": 3.4091060161590576, + "learning_rate": 1.9680802940708976e-05, + "loss": 0.7782, + "step": 13806 + }, + { + "epoch": 0.1726043151078777, + "grad_norm": 2.6922948360443115, + "learning_rate": 1.968058417287334e-05, + "loss": 0.2362, + "step": 13808 + }, + { + "epoch": 0.17262931573289333, + "grad_norm": 0.3835470974445343, + "learning_rate": 1.968036533131175e-05, + "loss": 0.7071, + "step": 13810 + }, + { + "epoch": 0.17265431635790895, + "grad_norm": 3.0421791076660156, + "learning_rate": 1.9680146416025864e-05, + "loss": 0.7921, + "step": 13812 + }, + { + "epoch": 0.17267931698292457, + "grad_norm": 4.522592544555664, + "learning_rate": 1.967992742701735e-05, + "loss": 2.1766, + "step": 13814 + }, + { + "epoch": 0.1727043176079402, + "grad_norm": 2.0303030014038086, + "learning_rate": 1.967970836428788e-05, + "loss": 1.2388, + "step": 13816 + }, + { + "epoch": 0.17272931823295581, + "grad_norm": 3.0534961223602295, + "learning_rate": 1.967948922783912e-05, + "loss": 1.1547, + "step": 13818 + }, + { + "epoch": 0.17275431885797146, + "grad_norm": 2.393758535385132, + "learning_rate": 1.967927001767274e-05, + "loss": 1.6592, + "step": 13820 + }, + { + "epoch": 0.17277931948298708, + "grad_norm": 4.659926414489746, + "learning_rate": 1.9679050733790408e-05, + "loss": 2.157, + "step": 13822 + }, + { + "epoch": 0.1728043201080027, + "grad_norm": 4.447721481323242, + "learning_rate": 1.9678831376193793e-05, + "loss": 1.2891, + "step": 13824 + }, + { + "epoch": 0.17282932073301832, + "grad_norm": 2.991485834121704, + "learning_rate": 1.9678611944884567e-05, + "loss": 0.1196, + "step": 13826 + }, + { + "epoch": 0.17285432135803394, + "grad_norm": 0.674543023109436, + "learning_rate": 1.9678392439864402e-05, + "loss": 0.9287, + "step": 13828 + }, + { + "epoch": 0.1728793219830496, + "grad_norm": 5.793308734893799, + "learning_rate": 1.967817286113497e-05, + "loss": 1.9643, + "step": 13830 + }, + { + "epoch": 0.1729043226080652, + "grad_norm": 4.242647171020508, + "learning_rate": 1.967795320869794e-05, + "loss": 1.7753, + "step": 13832 + }, + { + "epoch": 0.17292932323308083, + "grad_norm": 2.6494877338409424, + "learning_rate": 1.9677733482554985e-05, + "loss": 1.111, + "step": 13834 + }, + { + "epoch": 0.17295432385809645, + "grad_norm": 2.231208562850952, + "learning_rate": 1.9677513682707784e-05, + "loss": 1.3772, + "step": 13836 + }, + { + "epoch": 0.17297932448311207, + "grad_norm": 2.8345441818237305, + "learning_rate": 1.9677293809158e-05, + "loss": 1.1871, + "step": 13838 + }, + { + "epoch": 0.17300432510812772, + "grad_norm": 2.455592393875122, + "learning_rate": 1.9677073861907325e-05, + "loss": 1.1546, + "step": 13840 + }, + { + "epoch": 0.17302932573314334, + "grad_norm": 0.2717028856277466, + "learning_rate": 1.967685384095742e-05, + "loss": 0.4286, + "step": 13842 + }, + { + "epoch": 0.17305432635815896, + "grad_norm": 3.0176584720611572, + "learning_rate": 1.967663374630996e-05, + "loss": 1.6236, + "step": 13844 + }, + { + "epoch": 0.17307932698317458, + "grad_norm": 2.113306760787964, + "learning_rate": 1.9676413577966632e-05, + "loss": 0.6044, + "step": 13846 + }, + { + "epoch": 0.1731043276081902, + "grad_norm": 2.612675189971924, + "learning_rate": 1.9676193335929103e-05, + "loss": 1.0001, + "step": 13848 + }, + { + "epoch": 0.17312932823320584, + "grad_norm": 0.7014439702033997, + "learning_rate": 1.9675973020199058e-05, + "loss": 0.1238, + "step": 13850 + }, + { + "epoch": 0.17315432885822146, + "grad_norm": 3.1994175910949707, + "learning_rate": 1.9675752630778166e-05, + "loss": 0.3632, + "step": 13852 + }, + { + "epoch": 0.17317932948323708, + "grad_norm": 4.200078010559082, + "learning_rate": 1.9675532167668114e-05, + "loss": 1.6949, + "step": 13854 + }, + { + "epoch": 0.1732043301082527, + "grad_norm": 2.9918713569641113, + "learning_rate": 1.9675311630870574e-05, + "loss": 1.2147, + "step": 13856 + }, + { + "epoch": 0.17322933073326832, + "grad_norm": 6.359409332275391, + "learning_rate": 1.9675091020387232e-05, + "loss": 0.7592, + "step": 13858 + }, + { + "epoch": 0.17325433135828397, + "grad_norm": 1.7887475490570068, + "learning_rate": 1.9674870336219762e-05, + "loss": 0.2782, + "step": 13860 + }, + { + "epoch": 0.1732793319832996, + "grad_norm": 2.094909429550171, + "learning_rate": 1.967464957836985e-05, + "loss": 0.7493, + "step": 13862 + }, + { + "epoch": 0.1733043326083152, + "grad_norm": 0.010869845747947693, + "learning_rate": 1.9674428746839177e-05, + "loss": 0.4701, + "step": 13864 + }, + { + "epoch": 0.17332933323333083, + "grad_norm": 8.960504531860352, + "learning_rate": 1.9674207841629417e-05, + "loss": 1.7443, + "step": 13866 + }, + { + "epoch": 0.17335433385834645, + "grad_norm": 2.0813045501708984, + "learning_rate": 1.967398686274226e-05, + "loss": 0.2353, + "step": 13868 + }, + { + "epoch": 0.1733793344833621, + "grad_norm": 2.2683584690093994, + "learning_rate": 1.967376581017939e-05, + "loss": 0.4883, + "step": 13870 + }, + { + "epoch": 0.17340433510837772, + "grad_norm": 5.733895301818848, + "learning_rate": 1.9673544683942487e-05, + "loss": 1.29, + "step": 13872 + }, + { + "epoch": 0.17342933573339334, + "grad_norm": 4.02370023727417, + "learning_rate": 1.9673323484033234e-05, + "loss": 0.5074, + "step": 13874 + }, + { + "epoch": 0.17345433635840896, + "grad_norm": 0.06562389433383942, + "learning_rate": 1.9673102210453317e-05, + "loss": 0.41, + "step": 13876 + }, + { + "epoch": 0.17347933698342458, + "grad_norm": 4.855876445770264, + "learning_rate": 1.967288086320442e-05, + "loss": 1.0468, + "step": 13878 + }, + { + "epoch": 0.17350433760844022, + "grad_norm": 4.460604190826416, + "learning_rate": 1.9672659442288235e-05, + "loss": 0.9523, + "step": 13880 + }, + { + "epoch": 0.17352933823345584, + "grad_norm": 7.084589958190918, + "learning_rate": 1.967243794770644e-05, + "loss": 0.877, + "step": 13882 + }, + { + "epoch": 0.17355433885847146, + "grad_norm": 3.511136531829834, + "learning_rate": 1.9672216379460723e-05, + "loss": 0.6242, + "step": 13884 + }, + { + "epoch": 0.17357933948348708, + "grad_norm": 5.8896002769470215, + "learning_rate": 1.967199473755278e-05, + "loss": 2.552, + "step": 13886 + }, + { + "epoch": 0.1736043401085027, + "grad_norm": 6.7609453201293945, + "learning_rate": 1.967177302198429e-05, + "loss": 0.9009, + "step": 13888 + }, + { + "epoch": 0.17362934073351835, + "grad_norm": 4.378993034362793, + "learning_rate": 1.967155123275694e-05, + "loss": 0.4492, + "step": 13890 + }, + { + "epoch": 0.17365434135853397, + "grad_norm": 1.7505176067352295, + "learning_rate": 1.967132936987243e-05, + "loss": 0.8941, + "step": 13892 + }, + { + "epoch": 0.1736793419835496, + "grad_norm": 4.513197422027588, + "learning_rate": 1.9671107433332445e-05, + "loss": 1.0303, + "step": 13894 + }, + { + "epoch": 0.1737043426085652, + "grad_norm": 2.3167901039123535, + "learning_rate": 1.9670885423138667e-05, + "loss": 0.7552, + "step": 13896 + }, + { + "epoch": 0.17372934323358083, + "grad_norm": 3.828786849975586, + "learning_rate": 1.9670663339292796e-05, + "loss": 1.4381, + "step": 13898 + }, + { + "epoch": 0.17375434385859648, + "grad_norm": 3.7804813385009766, + "learning_rate": 1.967044118179652e-05, + "loss": 1.0374, + "step": 13900 + }, + { + "epoch": 0.1737793444836121, + "grad_norm": 2.340322494506836, + "learning_rate": 1.967021895065153e-05, + "loss": 0.279, + "step": 13902 + }, + { + "epoch": 0.17380434510862772, + "grad_norm": 5.462181568145752, + "learning_rate": 1.966999664585952e-05, + "loss": 2.0316, + "step": 13904 + }, + { + "epoch": 0.17382934573364334, + "grad_norm": 0.0011475072242319584, + "learning_rate": 1.9669774267422188e-05, + "loss": 0.5616, + "step": 13906 + }, + { + "epoch": 0.17385434635865896, + "grad_norm": 5.010406494140625, + "learning_rate": 1.9669551815341218e-05, + "loss": 1.0484, + "step": 13908 + }, + { + "epoch": 0.1738793469836746, + "grad_norm": 4.909757137298584, + "learning_rate": 1.966932928961831e-05, + "loss": 1.28, + "step": 13910 + }, + { + "epoch": 0.17390434760869022, + "grad_norm": 5.556811809539795, + "learning_rate": 1.9669106690255156e-05, + "loss": 1.0413, + "step": 13912 + }, + { + "epoch": 0.17392934823370584, + "grad_norm": 6.588428020477295, + "learning_rate": 1.9668884017253455e-05, + "loss": 0.3464, + "step": 13914 + }, + { + "epoch": 0.17395434885872146, + "grad_norm": 4.797134876251221, + "learning_rate": 1.96686612706149e-05, + "loss": 1.9344, + "step": 13916 + }, + { + "epoch": 0.17397934948373708, + "grad_norm": 1.4563549757003784, + "learning_rate": 1.9668438450341187e-05, + "loss": 0.7321, + "step": 13918 + }, + { + "epoch": 0.17400435010875273, + "grad_norm": 6.61008358001709, + "learning_rate": 1.9668215556434016e-05, + "loss": 1.5538, + "step": 13920 + }, + { + "epoch": 0.17402935073376835, + "grad_norm": 3.650904893875122, + "learning_rate": 1.9667992588895083e-05, + "loss": 1.323, + "step": 13922 + }, + { + "epoch": 0.17405435135878397, + "grad_norm": 3.3929426670074463, + "learning_rate": 1.966776954772608e-05, + "loss": 1.301, + "step": 13924 + }, + { + "epoch": 0.1740793519837996, + "grad_norm": 2.4406843185424805, + "learning_rate": 1.9667546432928717e-05, + "loss": 0.4603, + "step": 13926 + }, + { + "epoch": 0.1741043526088152, + "grad_norm": 3.0460400581359863, + "learning_rate": 1.9667323244504683e-05, + "loss": 1.2834, + "step": 13928 + }, + { + "epoch": 0.17412935323383086, + "grad_norm": 5.342087745666504, + "learning_rate": 1.9667099982455688e-05, + "loss": 0.5271, + "step": 13930 + }, + { + "epoch": 0.17415435385884648, + "grad_norm": 0.09086349606513977, + "learning_rate": 1.9666876646783422e-05, + "loss": 0.1116, + "step": 13932 + }, + { + "epoch": 0.1741793544838621, + "grad_norm": 0.0021080055739730597, + "learning_rate": 1.966665323748959e-05, + "loss": 0.2664, + "step": 13934 + }, + { + "epoch": 0.17420435510887772, + "grad_norm": 0.0006008241325616837, + "learning_rate": 1.9666429754575897e-05, + "loss": 1.1257, + "step": 13936 + }, + { + "epoch": 0.17422935573389334, + "grad_norm": 1.696940541267395, + "learning_rate": 1.9666206198044043e-05, + "loss": 1.287, + "step": 13938 + }, + { + "epoch": 0.17425435635890899, + "grad_norm": 2.6360888481140137, + "learning_rate": 1.9665982567895728e-05, + "loss": 0.445, + "step": 13940 + }, + { + "epoch": 0.1742793569839246, + "grad_norm": 4.938445568084717, + "learning_rate": 1.9665758864132654e-05, + "loss": 1.0531, + "step": 13942 + }, + { + "epoch": 0.17430435760894022, + "grad_norm": 3.137723922729492, + "learning_rate": 1.9665535086756532e-05, + "loss": 0.1026, + "step": 13944 + }, + { + "epoch": 0.17432935823395584, + "grad_norm": 5.6920623779296875, + "learning_rate": 1.966531123576906e-05, + "loss": 1.5974, + "step": 13946 + }, + { + "epoch": 0.17435435885897146, + "grad_norm": 2.605069637298584, + "learning_rate": 1.9665087311171943e-05, + "loss": 0.9004, + "step": 13948 + }, + { + "epoch": 0.1743793594839871, + "grad_norm": 0.6267382502555847, + "learning_rate": 1.9664863312966888e-05, + "loss": 0.0369, + "step": 13950 + }, + { + "epoch": 0.17440436010900273, + "grad_norm": 5.3662495613098145, + "learning_rate": 1.9664639241155598e-05, + "loss": 2.0418, + "step": 13952 + }, + { + "epoch": 0.17442936073401835, + "grad_norm": 2.514901876449585, + "learning_rate": 1.9664415095739787e-05, + "loss": 1.2634, + "step": 13954 + }, + { + "epoch": 0.17445436135903397, + "grad_norm": 7.401736736297607, + "learning_rate": 1.9664190876721157e-05, + "loss": 2.7346, + "step": 13956 + }, + { + "epoch": 0.1744793619840496, + "grad_norm": 3.1396567821502686, + "learning_rate": 1.9663966584101412e-05, + "loss": 1.2828, + "step": 13958 + }, + { + "epoch": 0.17450436260906524, + "grad_norm": 3.191673755645752, + "learning_rate": 1.9663742217882266e-05, + "loss": 0.2162, + "step": 13960 + }, + { + "epoch": 0.17452936323408086, + "grad_norm": 3.9266109466552734, + "learning_rate": 1.966351777806543e-05, + "loss": 1.3362, + "step": 13962 + }, + { + "epoch": 0.17455436385909648, + "grad_norm": 4.247039794921875, + "learning_rate": 1.9663293264652605e-05, + "loss": 0.6588, + "step": 13964 + }, + { + "epoch": 0.1745793644841121, + "grad_norm": 7.394929885864258, + "learning_rate": 1.9663068677645505e-05, + "loss": 0.5983, + "step": 13966 + }, + { + "epoch": 0.17460436510912772, + "grad_norm": 3.6877377033233643, + "learning_rate": 1.9662844017045838e-05, + "loss": 1.8997, + "step": 13968 + }, + { + "epoch": 0.17462936573414337, + "grad_norm": 0.003265228820964694, + "learning_rate": 1.9662619282855322e-05, + "loss": 0.7942, + "step": 13970 + }, + { + "epoch": 0.17465436635915899, + "grad_norm": 2.8314130306243896, + "learning_rate": 1.966239447507566e-05, + "loss": 0.4011, + "step": 13972 + }, + { + "epoch": 0.1746793669841746, + "grad_norm": 5.498602390289307, + "learning_rate": 1.9662169593708572e-05, + "loss": 1.6304, + "step": 13974 + }, + { + "epoch": 0.17470436760919023, + "grad_norm": 3.207118272781372, + "learning_rate": 1.9661944638755764e-05, + "loss": 1.0642, + "step": 13976 + }, + { + "epoch": 0.17472936823420585, + "grad_norm": 0.04945472255349159, + "learning_rate": 1.9661719610218958e-05, + "loss": 0.4575, + "step": 13978 + }, + { + "epoch": 0.1747543688592215, + "grad_norm": 0.20569591224193573, + "learning_rate": 1.9661494508099854e-05, + "loss": 0.8285, + "step": 13980 + }, + { + "epoch": 0.1747793694842371, + "grad_norm": 3.64902400970459, + "learning_rate": 1.9661269332400177e-05, + "loss": 0.39, + "step": 13982 + }, + { + "epoch": 0.17480437010925273, + "grad_norm": 0.0007698521367274225, + "learning_rate": 1.9661044083121638e-05, + "loss": 0.0189, + "step": 13984 + }, + { + "epoch": 0.17482937073426835, + "grad_norm": 2.746689796447754, + "learning_rate": 1.9660818760265957e-05, + "loss": 1.047, + "step": 13986 + }, + { + "epoch": 0.17485437135928397, + "grad_norm": 3.326741933822632, + "learning_rate": 1.9660593363834844e-05, + "loss": 1.1076, + "step": 13988 + }, + { + "epoch": 0.17487937198429962, + "grad_norm": 0.0005186835187487304, + "learning_rate": 1.9660367893830015e-05, + "loss": 1.0747, + "step": 13990 + }, + { + "epoch": 0.17490437260931524, + "grad_norm": 0.808350682258606, + "learning_rate": 1.9660142350253196e-05, + "loss": 0.7737, + "step": 13992 + }, + { + "epoch": 0.17492937323433086, + "grad_norm": 7.5833353996276855, + "learning_rate": 1.9659916733106096e-05, + "loss": 1.0132, + "step": 13994 + }, + { + "epoch": 0.17495437385934648, + "grad_norm": 3.4503180980682373, + "learning_rate": 1.9659691042390438e-05, + "loss": 0.5467, + "step": 13996 + }, + { + "epoch": 0.1749793744843621, + "grad_norm": 2.5624263286590576, + "learning_rate": 1.9659465278107935e-05, + "loss": 0.7852, + "step": 13998 + }, + { + "epoch": 0.17500437510937775, + "grad_norm": 2.6924593448638916, + "learning_rate": 1.9659239440260312e-05, + "loss": 0.783, + "step": 14000 + }, + { + "epoch": 0.17502937573439337, + "grad_norm": 4.3800950050354, + "learning_rate": 1.9659013528849288e-05, + "loss": 1.2885, + "step": 14002 + }, + { + "epoch": 0.175054376359409, + "grad_norm": 3.045111656188965, + "learning_rate": 1.9658787543876582e-05, + "loss": 1.3721, + "step": 14004 + }, + { + "epoch": 0.1750793769844246, + "grad_norm": 6.011955738067627, + "learning_rate": 1.9658561485343917e-05, + "loss": 2.0465, + "step": 14006 + }, + { + "epoch": 0.17510437760944023, + "grad_norm": 4.002774238586426, + "learning_rate": 1.9658335353253014e-05, + "loss": 0.9251, + "step": 14008 + }, + { + "epoch": 0.17512937823445587, + "grad_norm": 4.329987049102783, + "learning_rate": 1.9658109147605597e-05, + "loss": 1.7895, + "step": 14010 + }, + { + "epoch": 0.1751543788594715, + "grad_norm": 6.902409076690674, + "learning_rate": 1.9657882868403383e-05, + "loss": 0.7773, + "step": 14012 + }, + { + "epoch": 0.1751793794844871, + "grad_norm": 4.301056861877441, + "learning_rate": 1.9657656515648098e-05, + "loss": 0.4533, + "step": 14014 + }, + { + "epoch": 0.17520438010950273, + "grad_norm": 0.0011524972505867481, + "learning_rate": 1.965743008934147e-05, + "loss": 0.0159, + "step": 14016 + }, + { + "epoch": 0.17522938073451835, + "grad_norm": 4.893288612365723, + "learning_rate": 1.9657203589485218e-05, + "loss": 1.3175, + "step": 14018 + }, + { + "epoch": 0.175254381359534, + "grad_norm": 0.4208732545375824, + "learning_rate": 1.965697701608107e-05, + "loss": 0.5596, + "step": 14020 + }, + { + "epoch": 0.17527938198454962, + "grad_norm": 0.01733431965112686, + "learning_rate": 1.965675036913075e-05, + "loss": 1.1932, + "step": 14022 + }, + { + "epoch": 0.17530438260956524, + "grad_norm": 3.9296929836273193, + "learning_rate": 1.9656523648635985e-05, + "loss": 1.2017, + "step": 14024 + }, + { + "epoch": 0.17532938323458086, + "grad_norm": 2.2793052196502686, + "learning_rate": 1.96562968545985e-05, + "loss": 0.9809, + "step": 14026 + }, + { + "epoch": 0.17535438385959648, + "grad_norm": 4.076605319976807, + "learning_rate": 1.9656069987020024e-05, + "loss": 0.3478, + "step": 14028 + }, + { + "epoch": 0.17537938448461213, + "grad_norm": 4.528200626373291, + "learning_rate": 1.9655843045902288e-05, + "loss": 0.9072, + "step": 14030 + }, + { + "epoch": 0.17540438510962775, + "grad_norm": 5.923074722290039, + "learning_rate": 1.9655616031247014e-05, + "loss": 1.195, + "step": 14032 + }, + { + "epoch": 0.17542938573464337, + "grad_norm": 0.0006043986650183797, + "learning_rate": 1.965538894305593e-05, + "loss": 0.2873, + "step": 14034 + }, + { + "epoch": 0.175454386359659, + "grad_norm": 1.8903591632843018, + "learning_rate": 1.9655161781330778e-05, + "loss": 1.2718, + "step": 14036 + }, + { + "epoch": 0.1754793869846746, + "grad_norm": 0.0013835042482241988, + "learning_rate": 1.9654934546073275e-05, + "loss": 0.2179, + "step": 14038 + }, + { + "epoch": 0.17550438760969025, + "grad_norm": 3.540231704711914, + "learning_rate": 1.9654707237285153e-05, + "loss": 1.8116, + "step": 14040 + }, + { + "epoch": 0.17552938823470587, + "grad_norm": 0.5894123911857605, + "learning_rate": 1.965447985496815e-05, + "loss": 0.0449, + "step": 14042 + }, + { + "epoch": 0.1755543888597215, + "grad_norm": 3.851609945297241, + "learning_rate": 1.9654252399123993e-05, + "loss": 1.026, + "step": 14044 + }, + { + "epoch": 0.17557938948473711, + "grad_norm": 2.9419963359832764, + "learning_rate": 1.965402486975441e-05, + "loss": 1.5431, + "step": 14046 + }, + { + "epoch": 0.17560439010975273, + "grad_norm": 2.730760335922241, + "learning_rate": 1.9653797266861145e-05, + "loss": 1.0649, + "step": 14048 + }, + { + "epoch": 0.17562939073476838, + "grad_norm": 0.529731273651123, + "learning_rate": 1.965356959044592e-05, + "loss": 0.4428, + "step": 14050 + }, + { + "epoch": 0.175654391359784, + "grad_norm": 2.3233985900878906, + "learning_rate": 1.965334184051048e-05, + "loss": 0.0772, + "step": 14052 + }, + { + "epoch": 0.17567939198479962, + "grad_norm": 1.8902082443237305, + "learning_rate": 1.965311401705655e-05, + "loss": 0.4936, + "step": 14054 + }, + { + "epoch": 0.17570439260981524, + "grad_norm": 10.451683044433594, + "learning_rate": 1.965288612008587e-05, + "loss": 0.7034, + "step": 14056 + }, + { + "epoch": 0.17572939323483086, + "grad_norm": 4.935850620269775, + "learning_rate": 1.9652658149600175e-05, + "loss": 0.4963, + "step": 14058 + }, + { + "epoch": 0.1757543938598465, + "grad_norm": 2.8762238025665283, + "learning_rate": 1.96524301056012e-05, + "loss": 1.4943, + "step": 14060 + }, + { + "epoch": 0.17577939448486213, + "grad_norm": 2.7278239727020264, + "learning_rate": 1.965220198809068e-05, + "loss": 1.0419, + "step": 14062 + }, + { + "epoch": 0.17580439510987775, + "grad_norm": 0.0015415116213262081, + "learning_rate": 1.9651973797070357e-05, + "loss": 0.5483, + "step": 14064 + }, + { + "epoch": 0.17582939573489337, + "grad_norm": 0.0008638879517093301, + "learning_rate": 1.9651745532541967e-05, + "loss": 0.0765, + "step": 14066 + }, + { + "epoch": 0.175854396359909, + "grad_norm": 0.000816542305983603, + "learning_rate": 1.9651517194507245e-05, + "loss": 0.9203, + "step": 14068 + }, + { + "epoch": 0.17587939698492464, + "grad_norm": 3.7801685333251953, + "learning_rate": 1.9651288782967935e-05, + "loss": 1.3469, + "step": 14070 + }, + { + "epoch": 0.17590439760994026, + "grad_norm": 2.7579827308654785, + "learning_rate": 1.9651060297925773e-05, + "loss": 0.6699, + "step": 14072 + }, + { + "epoch": 0.17592939823495587, + "grad_norm": 0.008589206263422966, + "learning_rate": 1.9650831739382503e-05, + "loss": 0.1766, + "step": 14074 + }, + { + "epoch": 0.1759543988599715, + "grad_norm": 0.001550402957946062, + "learning_rate": 1.965060310733986e-05, + "loss": 0.0149, + "step": 14076 + }, + { + "epoch": 0.17597939948498711, + "grad_norm": 4.758456707000732, + "learning_rate": 1.9650374401799592e-05, + "loss": 0.3087, + "step": 14078 + }, + { + "epoch": 0.17600440011000276, + "grad_norm": 8.233919143676758, + "learning_rate": 1.9650145622763434e-05, + "loss": 1.0191, + "step": 14080 + }, + { + "epoch": 0.17602940073501838, + "grad_norm": 5.472476959228516, + "learning_rate": 1.9649916770233134e-05, + "loss": 0.941, + "step": 14082 + }, + { + "epoch": 0.176054401360034, + "grad_norm": 3.191730260848999, + "learning_rate": 1.964968784421043e-05, + "loss": 0.9365, + "step": 14084 + }, + { + "epoch": 0.17607940198504962, + "grad_norm": 3.305543899536133, + "learning_rate": 1.964945884469707e-05, + "loss": 0.7376, + "step": 14086 + }, + { + "epoch": 0.17610440261006524, + "grad_norm": 2.077714204788208, + "learning_rate": 1.96492297716948e-05, + "loss": 0.4025, + "step": 14088 + }, + { + "epoch": 0.1761294032350809, + "grad_norm": 3.0947024822235107, + "learning_rate": 1.9649000625205353e-05, + "loss": 0.8477, + "step": 14090 + }, + { + "epoch": 0.1761544038600965, + "grad_norm": 10.65878963470459, + "learning_rate": 1.9648771405230484e-05, + "loss": 1.282, + "step": 14092 + }, + { + "epoch": 0.17617940448511213, + "grad_norm": 4.388411998748779, + "learning_rate": 1.964854211177194e-05, + "loss": 1.5135, + "step": 14094 + }, + { + "epoch": 0.17620440511012775, + "grad_norm": 3.874180316925049, + "learning_rate": 1.964831274483146e-05, + "loss": 1.346, + "step": 14096 + }, + { + "epoch": 0.17622940573514337, + "grad_norm": 2.9098453521728516, + "learning_rate": 1.9648083304410796e-05, + "loss": 1.3688, + "step": 14098 + }, + { + "epoch": 0.17625440636015902, + "grad_norm": 0.0909925252199173, + "learning_rate": 1.9647853790511692e-05, + "loss": 0.516, + "step": 14100 + }, + { + "epoch": 0.17627940698517464, + "grad_norm": 3.8108131885528564, + "learning_rate": 1.96476242031359e-05, + "loss": 1.3682, + "step": 14102 + }, + { + "epoch": 0.17630440761019026, + "grad_norm": 0.20958569645881653, + "learning_rate": 1.9647394542285165e-05, + "loss": 0.7431, + "step": 14104 + }, + { + "epoch": 0.17632940823520588, + "grad_norm": 2.0809485912323, + "learning_rate": 1.9647164807961237e-05, + "loss": 1.1136, + "step": 14106 + }, + { + "epoch": 0.1763544088602215, + "grad_norm": 0.002779071219265461, + "learning_rate": 1.964693500016587e-05, + "loss": 0.209, + "step": 14108 + }, + { + "epoch": 0.17637940948523714, + "grad_norm": 7.874961853027344, + "learning_rate": 1.9646705118900806e-05, + "loss": 0.3416, + "step": 14110 + }, + { + "epoch": 0.17640441011025276, + "grad_norm": 4.460888862609863, + "learning_rate": 1.9646475164167803e-05, + "loss": 1.1929, + "step": 14112 + }, + { + "epoch": 0.17642941073526838, + "grad_norm": 1.5477168560028076, + "learning_rate": 1.9646245135968607e-05, + "loss": 0.8287, + "step": 14114 + }, + { + "epoch": 0.176454411360284, + "grad_norm": 3.287177324295044, + "learning_rate": 1.964601503430497e-05, + "loss": 0.6033, + "step": 14116 + }, + { + "epoch": 0.17647941198529962, + "grad_norm": 2.530565023422241, + "learning_rate": 1.964578485917865e-05, + "loss": 0.6994, + "step": 14118 + }, + { + "epoch": 0.17650441261031527, + "grad_norm": 4.828287124633789, + "learning_rate": 1.96455546105914e-05, + "loss": 1.484, + "step": 14120 + }, + { + "epoch": 0.1765294132353309, + "grad_norm": 3.1662306785583496, + "learning_rate": 1.9645324288544964e-05, + "loss": 0.6653, + "step": 14122 + }, + { + "epoch": 0.1765544138603465, + "grad_norm": 3.668012857437134, + "learning_rate": 1.9645093893041104e-05, + "loss": 1.2824, + "step": 14124 + }, + { + "epoch": 0.17657941448536213, + "grad_norm": 1.9465035200119019, + "learning_rate": 1.9644863424081574e-05, + "loss": 0.392, + "step": 14126 + }, + { + "epoch": 0.17660441511037775, + "grad_norm": 2.6786465644836426, + "learning_rate": 1.9644632881668127e-05, + "loss": 0.6052, + "step": 14128 + }, + { + "epoch": 0.1766294157353934, + "grad_norm": 1.1942988634109497, + "learning_rate": 1.9644402265802517e-05, + "loss": 1.0953, + "step": 14130 + }, + { + "epoch": 0.17665441636040902, + "grad_norm": 0.5769292712211609, + "learning_rate": 1.964417157648651e-05, + "loss": 1.2622, + "step": 14132 + }, + { + "epoch": 0.17667941698542464, + "grad_norm": 15.731136322021484, + "learning_rate": 1.964394081372185e-05, + "loss": 2.1952, + "step": 14134 + }, + { + "epoch": 0.17670441761044026, + "grad_norm": 1.7727539539337158, + "learning_rate": 1.9643709977510303e-05, + "loss": 1.2222, + "step": 14136 + }, + { + "epoch": 0.17672941823545588, + "grad_norm": 0.0007304716273210943, + "learning_rate": 1.9643479067853626e-05, + "loss": 0.1554, + "step": 14138 + }, + { + "epoch": 0.17675441886047152, + "grad_norm": 2.9414992332458496, + "learning_rate": 1.9643248084753574e-05, + "loss": 1.3252, + "step": 14140 + }, + { + "epoch": 0.17677941948548714, + "grad_norm": 2.3379714488983154, + "learning_rate": 1.9643017028211907e-05, + "loss": 0.9054, + "step": 14142 + }, + { + "epoch": 0.17680442011050276, + "grad_norm": 4.481883525848389, + "learning_rate": 1.9642785898230387e-05, + "loss": 1.785, + "step": 14144 + }, + { + "epoch": 0.17682942073551838, + "grad_norm": 1.6965010166168213, + "learning_rate": 1.9642554694810777e-05, + "loss": 0.5155, + "step": 14146 + }, + { + "epoch": 0.176854421360534, + "grad_norm": 13.864910125732422, + "learning_rate": 1.964232341795483e-05, + "loss": 2.1145, + "step": 14148 + }, + { + "epoch": 0.17687942198554965, + "grad_norm": 4.712080001831055, + "learning_rate": 1.964209206766431e-05, + "loss": 1.8243, + "step": 14150 + }, + { + "epoch": 0.17690442261056527, + "grad_norm": 7.739492416381836, + "learning_rate": 1.9641860643940985e-05, + "loss": 1.3835, + "step": 14152 + }, + { + "epoch": 0.1769294232355809, + "grad_norm": 2.666079044342041, + "learning_rate": 1.964162914678661e-05, + "loss": 0.3599, + "step": 14154 + }, + { + "epoch": 0.1769544238605965, + "grad_norm": 4.0145111083984375, + "learning_rate": 1.964139757620295e-05, + "loss": 0.2642, + "step": 14156 + }, + { + "epoch": 0.17697942448561213, + "grad_norm": 3.9239487648010254, + "learning_rate": 1.964116593219177e-05, + "loss": 0.8736, + "step": 14158 + }, + { + "epoch": 0.17700442511062778, + "grad_norm": 0.6895784735679626, + "learning_rate": 1.9640934214754836e-05, + "loss": 0.1964, + "step": 14160 + }, + { + "epoch": 0.1770294257356434, + "grad_norm": 6.981980800628662, + "learning_rate": 1.964070242389391e-05, + "loss": 0.9746, + "step": 14162 + }, + { + "epoch": 0.17705442636065902, + "grad_norm": 3.424107551574707, + "learning_rate": 1.9640470559610756e-05, + "loss": 0.3321, + "step": 14164 + }, + { + "epoch": 0.17707942698567464, + "grad_norm": 6.7232666015625, + "learning_rate": 1.964023862190714e-05, + "loss": 1.1862, + "step": 14166 + }, + { + "epoch": 0.17710442761069026, + "grad_norm": 3.765690803527832, + "learning_rate": 1.9640006610784835e-05, + "loss": 0.9246, + "step": 14168 + }, + { + "epoch": 0.1771294282357059, + "grad_norm": 5.794242858886719, + "learning_rate": 1.96397745262456e-05, + "loss": 0.5277, + "step": 14170 + }, + { + "epoch": 0.17715442886072152, + "grad_norm": 3.1862292289733887, + "learning_rate": 1.96395423682912e-05, + "loss": 1.6363, + "step": 14172 + }, + { + "epoch": 0.17717942948573714, + "grad_norm": 2.0870909690856934, + "learning_rate": 1.963931013692342e-05, + "loss": 0.6325, + "step": 14174 + }, + { + "epoch": 0.17720443011075276, + "grad_norm": 2.8587253093719482, + "learning_rate": 1.963907783214401e-05, + "loss": 0.6515, + "step": 14176 + }, + { + "epoch": 0.17722943073576838, + "grad_norm": 4.124249458312988, + "learning_rate": 1.9638845453954744e-05, + "loss": 1.2437, + "step": 14178 + }, + { + "epoch": 0.17725443136078403, + "grad_norm": 0.5688669085502625, + "learning_rate": 1.96386130023574e-05, + "loss": 0.1483, + "step": 14180 + }, + { + "epoch": 0.17727943198579965, + "grad_norm": 6.757004737854004, + "learning_rate": 1.963838047735374e-05, + "loss": 0.871, + "step": 14182 + }, + { + "epoch": 0.17730443261081527, + "grad_norm": 4.667675971984863, + "learning_rate": 1.9638147878945536e-05, + "loss": 1.7151, + "step": 14184 + }, + { + "epoch": 0.1773294332358309, + "grad_norm": 2.733738899230957, + "learning_rate": 1.963791520713456e-05, + "loss": 1.6124, + "step": 14186 + }, + { + "epoch": 0.1773544338608465, + "grad_norm": 1.5040591955184937, + "learning_rate": 1.9637682461922588e-05, + "loss": 0.3359, + "step": 14188 + }, + { + "epoch": 0.17737943448586216, + "grad_norm": 3.019916296005249, + "learning_rate": 1.9637449643311386e-05, + "loss": 1.8976, + "step": 14190 + }, + { + "epoch": 0.17740443511087778, + "grad_norm": 2.32999324798584, + "learning_rate": 1.9637216751302733e-05, + "loss": 1.1302, + "step": 14192 + }, + { + "epoch": 0.1774294357358934, + "grad_norm": 5.043391704559326, + "learning_rate": 1.9636983785898397e-05, + "loss": 1.1893, + "step": 14194 + }, + { + "epoch": 0.17745443636090902, + "grad_norm": 14.773965835571289, + "learning_rate": 1.963675074710016e-05, + "loss": 1.3854, + "step": 14196 + }, + { + "epoch": 0.17747943698592464, + "grad_norm": 4.9757561683654785, + "learning_rate": 1.9636517634909786e-05, + "loss": 0.915, + "step": 14198 + }, + { + "epoch": 0.17750443761094029, + "grad_norm": 3.576107978820801, + "learning_rate": 1.963628444932906e-05, + "loss": 2.8386, + "step": 14200 + }, + { + "epoch": 0.1775294382359559, + "grad_norm": 7.537342071533203, + "learning_rate": 1.9636051190359754e-05, + "loss": 0.9938, + "step": 14202 + }, + { + "epoch": 0.17755443886097153, + "grad_norm": 0.0036674782168120146, + "learning_rate": 1.9635817858003646e-05, + "loss": 0.6296, + "step": 14204 + }, + { + "epoch": 0.17757943948598715, + "grad_norm": 3.5641932487487793, + "learning_rate": 1.963558445226251e-05, + "loss": 1.6999, + "step": 14206 + }, + { + "epoch": 0.17760444011100276, + "grad_norm": 4.229938507080078, + "learning_rate": 1.9635350973138127e-05, + "loss": 1.2799, + "step": 14208 + }, + { + "epoch": 0.1776294407360184, + "grad_norm": 3.8015387058258057, + "learning_rate": 1.9635117420632273e-05, + "loss": 1.9585, + "step": 14210 + }, + { + "epoch": 0.17765444136103403, + "grad_norm": 2.949831008911133, + "learning_rate": 1.9634883794746726e-05, + "loss": 1.6589, + "step": 14212 + }, + { + "epoch": 0.17767944198604965, + "grad_norm": 3.0505828857421875, + "learning_rate": 1.9634650095483266e-05, + "loss": 1.2708, + "step": 14214 + }, + { + "epoch": 0.17770444261106527, + "grad_norm": 3.492487907409668, + "learning_rate": 1.9634416322843677e-05, + "loss": 0.8166, + "step": 14216 + }, + { + "epoch": 0.1777294432360809, + "grad_norm": 3.88785719871521, + "learning_rate": 1.9634182476829734e-05, + "loss": 0.8681, + "step": 14218 + }, + { + "epoch": 0.17775444386109654, + "grad_norm": 0.0023724003694951534, + "learning_rate": 1.9633948557443218e-05, + "loss": 0.5339, + "step": 14220 + }, + { + "epoch": 0.17777944448611216, + "grad_norm": 2.1839706897735596, + "learning_rate": 1.9633714564685915e-05, + "loss": 0.1023, + "step": 14222 + }, + { + "epoch": 0.17780444511112778, + "grad_norm": 4.307661056518555, + "learning_rate": 1.96334804985596e-05, + "loss": 2.7713, + "step": 14224 + }, + { + "epoch": 0.1778294457361434, + "grad_norm": 0.4455930292606354, + "learning_rate": 1.963324635906606e-05, + "loss": 0.5132, + "step": 14226 + }, + { + "epoch": 0.17785444636115902, + "grad_norm": 5.254903793334961, + "learning_rate": 1.9633012146207085e-05, + "loss": 0.6881, + "step": 14228 + }, + { + "epoch": 0.17787944698617467, + "grad_norm": 3.076138734817505, + "learning_rate": 1.9632777859984443e-05, + "loss": 1.2237, + "step": 14230 + }, + { + "epoch": 0.1779044476111903, + "grad_norm": 5.929948806762695, + "learning_rate": 1.9632543500399933e-05, + "loss": 2.4596, + "step": 14232 + }, + { + "epoch": 0.1779294482362059, + "grad_norm": 3.9622209072113037, + "learning_rate": 1.9632309067455332e-05, + "loss": 2.0264, + "step": 14234 + }, + { + "epoch": 0.17795444886122153, + "grad_norm": 3.0484118461608887, + "learning_rate": 1.9632074561152426e-05, + "loss": 1.2632, + "step": 14236 + }, + { + "epoch": 0.17797944948623715, + "grad_norm": 0.003703880589455366, + "learning_rate": 1.9631839981493004e-05, + "loss": 0.0, + "step": 14238 + }, + { + "epoch": 0.1780044501112528, + "grad_norm": 6.366408824920654, + "learning_rate": 1.9631605328478852e-05, + "loss": 2.0421, + "step": 14240 + }, + { + "epoch": 0.1780294507362684, + "grad_norm": 3.162278652191162, + "learning_rate": 1.9631370602111755e-05, + "loss": 0.7905, + "step": 14242 + }, + { + "epoch": 0.17805445136128403, + "grad_norm": 8.932808876037598, + "learning_rate": 1.96311358023935e-05, + "loss": 1.5735, + "step": 14244 + }, + { + "epoch": 0.17807945198629965, + "grad_norm": 3.358785390853882, + "learning_rate": 1.9630900929325876e-05, + "loss": 0.3949, + "step": 14246 + }, + { + "epoch": 0.17810445261131527, + "grad_norm": 3.2271294593811035, + "learning_rate": 1.9630665982910675e-05, + "loss": 0.9762, + "step": 14248 + }, + { + "epoch": 0.17812945323633092, + "grad_norm": 2.617868661880493, + "learning_rate": 1.963043096314968e-05, + "loss": 1.8383, + "step": 14250 + }, + { + "epoch": 0.17815445386134654, + "grad_norm": 3.829000949859619, + "learning_rate": 1.9630195870044688e-05, + "loss": 0.8061, + "step": 14252 + }, + { + "epoch": 0.17817945448636216, + "grad_norm": 4.833803653717041, + "learning_rate": 1.9629960703597483e-05, + "loss": 2.3984, + "step": 14254 + }, + { + "epoch": 0.17820445511137778, + "grad_norm": 2.5549709796905518, + "learning_rate": 1.9629725463809863e-05, + "loss": 0.7717, + "step": 14256 + }, + { + "epoch": 0.1782294557363934, + "grad_norm": 4.668399810791016, + "learning_rate": 1.9629490150683616e-05, + "loss": 0.6253, + "step": 14258 + }, + { + "epoch": 0.17825445636140905, + "grad_norm": 4.213015079498291, + "learning_rate": 1.9629254764220528e-05, + "loss": 1.7089, + "step": 14260 + }, + { + "epoch": 0.17827945698642467, + "grad_norm": 3.3129379749298096, + "learning_rate": 1.9629019304422404e-05, + "loss": 0.9832, + "step": 14262 + }, + { + "epoch": 0.1783044576114403, + "grad_norm": 5.018467903137207, + "learning_rate": 1.962878377129103e-05, + "loss": 2.5303, + "step": 14264 + }, + { + "epoch": 0.1783294582364559, + "grad_norm": 3.6462810039520264, + "learning_rate": 1.9628548164828195e-05, + "loss": 0.8053, + "step": 14266 + }, + { + "epoch": 0.17835445886147153, + "grad_norm": 0.5720509886741638, + "learning_rate": 1.96283124850357e-05, + "loss": 0.1091, + "step": 14268 + }, + { + "epoch": 0.17837945948648717, + "grad_norm": 5.942283630371094, + "learning_rate": 1.9628076731915342e-05, + "loss": 0.569, + "step": 14270 + }, + { + "epoch": 0.1784044601115028, + "grad_norm": 2.799854278564453, + "learning_rate": 1.962784090546891e-05, + "loss": 1.0732, + "step": 14272 + }, + { + "epoch": 0.1784294607365184, + "grad_norm": 2.8499088287353516, + "learning_rate": 1.9627605005698206e-05, + "loss": 0.8565, + "step": 14274 + }, + { + "epoch": 0.17845446136153403, + "grad_norm": 1.6018010377883911, + "learning_rate": 1.9627369032605025e-05, + "loss": 0.5434, + "step": 14276 + }, + { + "epoch": 0.17847946198654965, + "grad_norm": 2.8599045276641846, + "learning_rate": 1.9627132986191158e-05, + "loss": 1.082, + "step": 14278 + }, + { + "epoch": 0.1785044626115653, + "grad_norm": 1.9716500043869019, + "learning_rate": 1.962689686645841e-05, + "loss": 0.5529, + "step": 14280 + }, + { + "epoch": 0.17852946323658092, + "grad_norm": 0.0020251746755093336, + "learning_rate": 1.9626660673408578e-05, + "loss": 1.0572, + "step": 14282 + }, + { + "epoch": 0.17855446386159654, + "grad_norm": 4.72792387008667, + "learning_rate": 1.962642440704346e-05, + "loss": 2.1351, + "step": 14284 + }, + { + "epoch": 0.17857946448661216, + "grad_norm": 4.477542877197266, + "learning_rate": 1.9626188067364855e-05, + "loss": 0.9538, + "step": 14286 + }, + { + "epoch": 0.17860446511162778, + "grad_norm": 2.637538194656372, + "learning_rate": 1.962595165437456e-05, + "loss": 1.2986, + "step": 14288 + }, + { + "epoch": 0.17862946573664343, + "grad_norm": 4.942470550537109, + "learning_rate": 1.9625715168074378e-05, + "loss": 1.5929, + "step": 14290 + }, + { + "epoch": 0.17865446636165905, + "grad_norm": 5.002274990081787, + "learning_rate": 1.9625478608466114e-05, + "loss": 0.9593, + "step": 14292 + }, + { + "epoch": 0.17867946698667467, + "grad_norm": 2.9381260871887207, + "learning_rate": 1.9625241975551564e-05, + "loss": 0.15, + "step": 14294 + }, + { + "epoch": 0.1787044676116903, + "grad_norm": 5.419125080108643, + "learning_rate": 1.962500526933253e-05, + "loss": 1.0925, + "step": 14296 + }, + { + "epoch": 0.1787294682367059, + "grad_norm": 3.4227142333984375, + "learning_rate": 1.962476848981082e-05, + "loss": 0.282, + "step": 14298 + }, + { + "epoch": 0.17875446886172155, + "grad_norm": 1.4611574411392212, + "learning_rate": 1.9624531636988234e-05, + "loss": 0.5628, + "step": 14300 + }, + { + "epoch": 0.17877946948673717, + "grad_norm": 3.3008992671966553, + "learning_rate": 1.9624294710866578e-05, + "loss": 0.6036, + "step": 14302 + }, + { + "epoch": 0.1788044701117528, + "grad_norm": 4.8538007736206055, + "learning_rate": 1.962405771144765e-05, + "loss": 1.2874, + "step": 14304 + }, + { + "epoch": 0.17882947073676841, + "grad_norm": 2.8565237522125244, + "learning_rate": 1.9623820638733263e-05, + "loss": 0.1997, + "step": 14306 + }, + { + "epoch": 0.17885447136178403, + "grad_norm": 0.15227264165878296, + "learning_rate": 1.9623583492725218e-05, + "loss": 0.6666, + "step": 14308 + }, + { + "epoch": 0.17887947198679968, + "grad_norm": 0.5349077582359314, + "learning_rate": 1.9623346273425323e-05, + "loss": 0.6371, + "step": 14310 + }, + { + "epoch": 0.1789044726118153, + "grad_norm": 0.0023099316749721766, + "learning_rate": 1.9623108980835385e-05, + "loss": 1.5085, + "step": 14312 + }, + { + "epoch": 0.17892947323683092, + "grad_norm": 3.297201633453369, + "learning_rate": 1.9622871614957204e-05, + "loss": 0.2324, + "step": 14314 + }, + { + "epoch": 0.17895447386184654, + "grad_norm": 2.5434300899505615, + "learning_rate": 1.96226341757926e-05, + "loss": 0.5532, + "step": 14316 + }, + { + "epoch": 0.17897947448686216, + "grad_norm": 0.636792004108429, + "learning_rate": 1.962239666334337e-05, + "loss": 1.4097, + "step": 14318 + }, + { + "epoch": 0.1790044751118778, + "grad_norm": 7.718227863311768, + "learning_rate": 1.962215907761133e-05, + "loss": 0.9524, + "step": 14320 + }, + { + "epoch": 0.17902947573689343, + "grad_norm": 5.074490070343018, + "learning_rate": 1.9621921418598285e-05, + "loss": 1.276, + "step": 14322 + }, + { + "epoch": 0.17905447636190905, + "grad_norm": 2.9362568855285645, + "learning_rate": 1.962168368630605e-05, + "loss": 1.0308, + "step": 14324 + }, + { + "epoch": 0.17907947698692467, + "grad_norm": 1.6243102550506592, + "learning_rate": 1.962144588073643e-05, + "loss": 0.1481, + "step": 14326 + }, + { + "epoch": 0.1791044776119403, + "grad_norm": 5.602258682250977, + "learning_rate": 1.962120800189124e-05, + "loss": 0.9897, + "step": 14328 + }, + { + "epoch": 0.17912947823695594, + "grad_norm": 2.4933602809906006, + "learning_rate": 1.9620970049772295e-05, + "loss": 1.6738, + "step": 14330 + }, + { + "epoch": 0.17915447886197156, + "grad_norm": 0.022958502173423767, + "learning_rate": 1.9620732024381397e-05, + "loss": 0.3346, + "step": 14332 + }, + { + "epoch": 0.17917947948698718, + "grad_norm": 4.7552571296691895, + "learning_rate": 1.9620493925720364e-05, + "loss": 1.6671, + "step": 14334 + }, + { + "epoch": 0.1792044801120028, + "grad_norm": 1.4036370515823364, + "learning_rate": 1.9620255753791014e-05, + "loss": 1.4412, + "step": 14336 + }, + { + "epoch": 0.17922948073701842, + "grad_norm": 1.5517640113830566, + "learning_rate": 1.962001750859515e-05, + "loss": 0.9146, + "step": 14338 + }, + { + "epoch": 0.17925448136203406, + "grad_norm": 2.796226739883423, + "learning_rate": 1.96197791901346e-05, + "loss": 0.99, + "step": 14340 + }, + { + "epoch": 0.17927948198704968, + "grad_norm": 2.375060558319092, + "learning_rate": 1.961954079841117e-05, + "loss": 1.3876, + "step": 14342 + }, + { + "epoch": 0.1793044826120653, + "grad_norm": 0.8326281309127808, + "learning_rate": 1.961930233342668e-05, + "loss": 0.3808, + "step": 14344 + }, + { + "epoch": 0.17932948323708092, + "grad_norm": 3.5269737243652344, + "learning_rate": 1.961906379518294e-05, + "loss": 1.1552, + "step": 14346 + }, + { + "epoch": 0.17935448386209654, + "grad_norm": 3.9467453956604004, + "learning_rate": 1.9618825183681765e-05, + "loss": 1.0533, + "step": 14348 + }, + { + "epoch": 0.1793794844871122, + "grad_norm": 3.5226612091064453, + "learning_rate": 1.9618586498924985e-05, + "loss": 0.774, + "step": 14350 + }, + { + "epoch": 0.1794044851121278, + "grad_norm": 0.004037423990666866, + "learning_rate": 1.961834774091441e-05, + "loss": 0.5883, + "step": 14352 + }, + { + "epoch": 0.17942948573714343, + "grad_norm": 3.9997880458831787, + "learning_rate": 1.9618108909651857e-05, + "loss": 1.3507, + "step": 14354 + }, + { + "epoch": 0.17945448636215905, + "grad_norm": 3.262216567993164, + "learning_rate": 1.9617870005139147e-05, + "loss": 1.4298, + "step": 14356 + }, + { + "epoch": 0.17947948698717467, + "grad_norm": 3.2494142055511475, + "learning_rate": 1.9617631027378102e-05, + "loss": 1.0178, + "step": 14358 + }, + { + "epoch": 0.17950448761219032, + "grad_norm": 2.1812613010406494, + "learning_rate": 1.9617391976370536e-05, + "loss": 0.7591, + "step": 14360 + }, + { + "epoch": 0.17952948823720594, + "grad_norm": 0.013801071792840958, + "learning_rate": 1.9617152852118273e-05, + "loss": 0.6096, + "step": 14362 + }, + { + "epoch": 0.17955448886222156, + "grad_norm": 1.2862911224365234, + "learning_rate": 1.9616913654623136e-05, + "loss": 1.3553, + "step": 14364 + }, + { + "epoch": 0.17957948948723718, + "grad_norm": 8.825326919555664, + "learning_rate": 1.9616674383886943e-05, + "loss": 0.7407, + "step": 14366 + }, + { + "epoch": 0.1796044901122528, + "grad_norm": 0.0007216534577310085, + "learning_rate": 1.9616435039911518e-05, + "loss": 0.534, + "step": 14368 + }, + { + "epoch": 0.17962949073726844, + "grad_norm": 3.395932912826538, + "learning_rate": 1.961619562269868e-05, + "loss": 0.8223, + "step": 14370 + }, + { + "epoch": 0.17965449136228406, + "grad_norm": 3.233759641647339, + "learning_rate": 1.961595613225026e-05, + "loss": 0.7938, + "step": 14372 + }, + { + "epoch": 0.17967949198729968, + "grad_norm": 0.40138494968414307, + "learning_rate": 1.9615716568568077e-05, + "loss": 0.6738, + "step": 14374 + }, + { + "epoch": 0.1797044926123153, + "grad_norm": 2.0164377689361572, + "learning_rate": 1.9615476931653958e-05, + "loss": 0.6214, + "step": 14376 + }, + { + "epoch": 0.17972949323733092, + "grad_norm": 2.8904263973236084, + "learning_rate": 1.9615237221509723e-05, + "loss": 1.241, + "step": 14378 + }, + { + "epoch": 0.17975449386234657, + "grad_norm": 0.4804503321647644, + "learning_rate": 1.9614997438137204e-05, + "loss": 1.0592, + "step": 14380 + }, + { + "epoch": 0.1797794944873622, + "grad_norm": 5.001620769500732, + "learning_rate": 1.961475758153822e-05, + "loss": 1.3507, + "step": 14382 + }, + { + "epoch": 0.1798044951123778, + "grad_norm": 0.0011534132063388824, + "learning_rate": 1.961451765171461e-05, + "loss": 0.0, + "step": 14384 + }, + { + "epoch": 0.17982949573739343, + "grad_norm": 1.2384510040283203, + "learning_rate": 1.961427764866819e-05, + "loss": 0.965, + "step": 14386 + }, + { + "epoch": 0.17985449636240905, + "grad_norm": 3.5028936862945557, + "learning_rate": 1.9614037572400786e-05, + "loss": 1.3975, + "step": 14388 + }, + { + "epoch": 0.1798794969874247, + "grad_norm": 7.07072114944458, + "learning_rate": 1.9613797422914237e-05, + "loss": 0.6439, + "step": 14390 + }, + { + "epoch": 0.17990449761244032, + "grad_norm": 3.0918617248535156, + "learning_rate": 1.9613557200210363e-05, + "loss": 0.2058, + "step": 14392 + }, + { + "epoch": 0.17992949823745594, + "grad_norm": 3.2821638584136963, + "learning_rate": 1.9613316904290997e-05, + "loss": 0.2968, + "step": 14394 + }, + { + "epoch": 0.17995449886247156, + "grad_norm": 1.3336881399154663, + "learning_rate": 1.961307653515797e-05, + "loss": 1.0135, + "step": 14396 + }, + { + "epoch": 0.17997949948748718, + "grad_norm": 7.736968994140625, + "learning_rate": 1.961283609281311e-05, + "loss": 2.0142, + "step": 14398 + }, + { + "epoch": 0.18000450011250282, + "grad_norm": 0.44047078490257263, + "learning_rate": 1.9612595577258255e-05, + "loss": 0.015, + "step": 14400 + }, + { + "epoch": 0.18002950073751844, + "grad_norm": 0.0030668957624584436, + "learning_rate": 1.9612354988495225e-05, + "loss": 0.0685, + "step": 14402 + }, + { + "epoch": 0.18005450136253406, + "grad_norm": 0.0016083811642602086, + "learning_rate": 1.961211432652586e-05, + "loss": 0.4141, + "step": 14404 + }, + { + "epoch": 0.18007950198754968, + "grad_norm": 2.6524736881256104, + "learning_rate": 1.9611873591351994e-05, + "loss": 1.2277, + "step": 14406 + }, + { + "epoch": 0.1801045026125653, + "grad_norm": 3.5531527996063232, + "learning_rate": 1.9611632782975455e-05, + "loss": 1.3191, + "step": 14408 + }, + { + "epoch": 0.18012950323758095, + "grad_norm": 3.1785788536071777, + "learning_rate": 1.9611391901398082e-05, + "loss": 0.7238, + "step": 14410 + }, + { + "epoch": 0.18015450386259657, + "grad_norm": 4.20660924911499, + "learning_rate": 1.9611150946621707e-05, + "loss": 0.285, + "step": 14412 + }, + { + "epoch": 0.1801795044876122, + "grad_norm": 2.8240792751312256, + "learning_rate": 1.9610909918648167e-05, + "loss": 1.284, + "step": 14414 + }, + { + "epoch": 0.1802045051126278, + "grad_norm": 1.1252400875091553, + "learning_rate": 1.9610668817479294e-05, + "loss": 0.4851, + "step": 14416 + }, + { + "epoch": 0.18022950573764343, + "grad_norm": 4.963965892791748, + "learning_rate": 1.9610427643116926e-05, + "loss": 1.0733, + "step": 14418 + }, + { + "epoch": 0.18025450636265908, + "grad_norm": 2.4468603134155273, + "learning_rate": 1.9610186395562905e-05, + "loss": 1.1106, + "step": 14420 + }, + { + "epoch": 0.1802795069876747, + "grad_norm": 2.1765449047088623, + "learning_rate": 1.960994507481906e-05, + "loss": 1.2824, + "step": 14422 + }, + { + "epoch": 0.18030450761269032, + "grad_norm": 2.1608548164367676, + "learning_rate": 1.960970368088723e-05, + "loss": 0.1858, + "step": 14424 + }, + { + "epoch": 0.18032950823770594, + "grad_norm": 0.6181086301803589, + "learning_rate": 1.9609462213769258e-05, + "loss": 1.2563, + "step": 14426 + }, + { + "epoch": 0.18035450886272156, + "grad_norm": 4.272320747375488, + "learning_rate": 1.960922067346698e-05, + "loss": 1.839, + "step": 14428 + }, + { + "epoch": 0.1803795094877372, + "grad_norm": 1.0744199752807617, + "learning_rate": 1.960897905998224e-05, + "loss": 0.703, + "step": 14430 + }, + { + "epoch": 0.18040451011275283, + "grad_norm": 2.7128820419311523, + "learning_rate": 1.9608737373316868e-05, + "loss": 0.661, + "step": 14432 + }, + { + "epoch": 0.18042951073776844, + "grad_norm": 3.2886013984680176, + "learning_rate": 1.9608495613472713e-05, + "loss": 0.2431, + "step": 14434 + }, + { + "epoch": 0.18045451136278406, + "grad_norm": 6.148894309997559, + "learning_rate": 1.9608253780451616e-05, + "loss": 1.5227, + "step": 14436 + }, + { + "epoch": 0.18047951198779968, + "grad_norm": 5.56458044052124, + "learning_rate": 1.9608011874255413e-05, + "loss": 0.6895, + "step": 14438 + }, + { + "epoch": 0.18050451261281533, + "grad_norm": 1.3079966306686401, + "learning_rate": 1.9607769894885956e-05, + "loss": 0.7985, + "step": 14440 + }, + { + "epoch": 0.18052951323783095, + "grad_norm": 4.7812089920043945, + "learning_rate": 1.960752784234508e-05, + "loss": 2.0019, + "step": 14442 + }, + { + "epoch": 0.18055451386284657, + "grad_norm": 15.44162368774414, + "learning_rate": 1.960728571663463e-05, + "loss": 2.0404, + "step": 14444 + }, + { + "epoch": 0.1805795144878622, + "grad_norm": 2.5047109127044678, + "learning_rate": 1.960704351775645e-05, + "loss": 0.4778, + "step": 14446 + }, + { + "epoch": 0.1806045151128778, + "grad_norm": 10.000624656677246, + "learning_rate": 1.9606801245712385e-05, + "loss": 0.3645, + "step": 14448 + }, + { + "epoch": 0.18062951573789346, + "grad_norm": 1.7429507970809937, + "learning_rate": 1.9606558900504282e-05, + "loss": 0.1409, + "step": 14450 + }, + { + "epoch": 0.18065451636290908, + "grad_norm": 1.583320140838623, + "learning_rate": 1.9606316482133985e-05, + "loss": 1.0609, + "step": 14452 + }, + { + "epoch": 0.1806795169879247, + "grad_norm": 2.0067553520202637, + "learning_rate": 1.960607399060334e-05, + "loss": 0.4964, + "step": 14454 + }, + { + "epoch": 0.18070451761294032, + "grad_norm": 0.0035308427177369595, + "learning_rate": 1.960583142591419e-05, + "loss": 1.2445, + "step": 14456 + }, + { + "epoch": 0.18072951823795594, + "grad_norm": 3.4786183834075928, + "learning_rate": 1.960558878806839e-05, + "loss": 1.3839, + "step": 14458 + }, + { + "epoch": 0.18075451886297159, + "grad_norm": 3.200119733810425, + "learning_rate": 1.9605346077067783e-05, + "loss": 2.0693, + "step": 14460 + }, + { + "epoch": 0.1807795194879872, + "grad_norm": 3.2613115310668945, + "learning_rate": 1.960510329291422e-05, + "loss": 1.0906, + "step": 14462 + }, + { + "epoch": 0.18080452011300283, + "grad_norm": 8.484411239624023, + "learning_rate": 1.9604860435609546e-05, + "loss": 1.3929, + "step": 14464 + }, + { + "epoch": 0.18082952073801845, + "grad_norm": 2.80924129486084, + "learning_rate": 1.9604617505155615e-05, + "loss": 1.0638, + "step": 14466 + }, + { + "epoch": 0.18085452136303407, + "grad_norm": 1.8412084579467773, + "learning_rate": 1.9604374501554277e-05, + "loss": 1.2312, + "step": 14468 + }, + { + "epoch": 0.1808795219880497, + "grad_norm": 1.025118112564087, + "learning_rate": 1.9604131424807375e-05, + "loss": 1.1728, + "step": 14470 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 2.6207289695739746, + "learning_rate": 1.9603888274916772e-05, + "loss": 0.6423, + "step": 14472 + }, + { + "epoch": 0.18092952323808095, + "grad_norm": 0.5450817942619324, + "learning_rate": 1.960364505188431e-05, + "loss": 0.0166, + "step": 14474 + }, + { + "epoch": 0.18095452386309657, + "grad_norm": 2.786604881286621, + "learning_rate": 1.9603401755711847e-05, + "loss": 0.4487, + "step": 14476 + }, + { + "epoch": 0.1809795244881122, + "grad_norm": 2.7727644443511963, + "learning_rate": 1.960315838640124e-05, + "loss": 0.8987, + "step": 14478 + }, + { + "epoch": 0.18100452511312784, + "grad_norm": 2.610153913497925, + "learning_rate": 1.9602914943954326e-05, + "loss": 0.9125, + "step": 14480 + }, + { + "epoch": 0.18102952573814346, + "grad_norm": 2.6657588481903076, + "learning_rate": 1.9602671428372976e-05, + "loss": 1.5535, + "step": 14482 + }, + { + "epoch": 0.18105452636315908, + "grad_norm": 1.474133849143982, + "learning_rate": 1.960242783965904e-05, + "loss": 1.2183, + "step": 14484 + }, + { + "epoch": 0.1810795269881747, + "grad_norm": 2.543382167816162, + "learning_rate": 1.9602184177814367e-05, + "loss": 0.7711, + "step": 14486 + }, + { + "epoch": 0.18110452761319032, + "grad_norm": 2.507205009460449, + "learning_rate": 1.9601940442840818e-05, + "loss": 0.6301, + "step": 14488 + }, + { + "epoch": 0.18112952823820597, + "grad_norm": 5.434654712677002, + "learning_rate": 1.960169663474025e-05, + "loss": 1.5264, + "step": 14490 + }, + { + "epoch": 0.1811545288632216, + "grad_norm": 0.0012022501323372126, + "learning_rate": 1.9601452753514517e-05, + "loss": 0.7723, + "step": 14492 + }, + { + "epoch": 0.1811795294882372, + "grad_norm": 4.313812732696533, + "learning_rate": 1.9601208799165482e-05, + "loss": 1.2549, + "step": 14494 + }, + { + "epoch": 0.18120453011325283, + "grad_norm": 2.482224464416504, + "learning_rate": 1.9600964771694993e-05, + "loss": 1.0202, + "step": 14496 + }, + { + "epoch": 0.18122953073826845, + "grad_norm": 2.894005537033081, + "learning_rate": 1.9600720671104917e-05, + "loss": 1.1822, + "step": 14498 + }, + { + "epoch": 0.1812545313632841, + "grad_norm": 1.7500256299972534, + "learning_rate": 1.960047649739711e-05, + "loss": 0.2804, + "step": 14500 + }, + { + "epoch": 0.1812795319882997, + "grad_norm": 3.396440029144287, + "learning_rate": 1.960023225057343e-05, + "loss": 1.1516, + "step": 14502 + }, + { + "epoch": 0.18130453261331533, + "grad_norm": 3.509943962097168, + "learning_rate": 1.959998793063574e-05, + "loss": 1.0366, + "step": 14504 + }, + { + "epoch": 0.18132953323833095, + "grad_norm": 4.865563869476318, + "learning_rate": 1.9599743537585898e-05, + "loss": 0.9735, + "step": 14506 + }, + { + "epoch": 0.18135453386334657, + "grad_norm": 4.235531330108643, + "learning_rate": 1.9599499071425767e-05, + "loss": 1.1969, + "step": 14508 + }, + { + "epoch": 0.18137953448836222, + "grad_norm": 6.775665760040283, + "learning_rate": 1.959925453215721e-05, + "loss": 1.803, + "step": 14510 + }, + { + "epoch": 0.18140453511337784, + "grad_norm": 3.106531858444214, + "learning_rate": 1.9599009919782088e-05, + "loss": 1.1092, + "step": 14512 + }, + { + "epoch": 0.18142953573839346, + "grad_norm": 3.7825570106506348, + "learning_rate": 1.9598765234302263e-05, + "loss": 1.0192, + "step": 14514 + }, + { + "epoch": 0.18145453636340908, + "grad_norm": 2.7981131076812744, + "learning_rate": 1.95985204757196e-05, + "loss": 0.7705, + "step": 14516 + }, + { + "epoch": 0.1814795369884247, + "grad_norm": 2.17230486869812, + "learning_rate": 1.959827564403596e-05, + "loss": 0.9481, + "step": 14518 + }, + { + "epoch": 0.18150453761344035, + "grad_norm": 4.737555503845215, + "learning_rate": 1.9598030739253212e-05, + "loss": 1.2738, + "step": 14520 + }, + { + "epoch": 0.18152953823845597, + "grad_norm": 4.309680461883545, + "learning_rate": 1.959778576137322e-05, + "loss": 0.7925, + "step": 14522 + }, + { + "epoch": 0.1815545388634716, + "grad_norm": 3.6691551208496094, + "learning_rate": 1.9597540710397845e-05, + "loss": 1.5768, + "step": 14524 + }, + { + "epoch": 0.1815795394884872, + "grad_norm": 2.653733253479004, + "learning_rate": 1.959729558632896e-05, + "loss": 0.7897, + "step": 14526 + }, + { + "epoch": 0.18160454011350283, + "grad_norm": 1.2113227844238281, + "learning_rate": 1.959705038916843e-05, + "loss": 0.7088, + "step": 14528 + }, + { + "epoch": 0.18162954073851847, + "grad_norm": 2.429166793823242, + "learning_rate": 1.9596805118918122e-05, + "loss": 1.0701, + "step": 14530 + }, + { + "epoch": 0.1816545413635341, + "grad_norm": 1.716507077217102, + "learning_rate": 1.95965597755799e-05, + "loss": 0.7513, + "step": 14532 + }, + { + "epoch": 0.18167954198854971, + "grad_norm": 0.7336974740028381, + "learning_rate": 1.9596314359155636e-05, + "loss": 0.0178, + "step": 14534 + }, + { + "epoch": 0.18170454261356533, + "grad_norm": 0.0003858592244796455, + "learning_rate": 1.9596068869647205e-05, + "loss": 1.1094, + "step": 14536 + }, + { + "epoch": 0.18172954323858095, + "grad_norm": 3.0232856273651123, + "learning_rate": 1.9595823307056463e-05, + "loss": 0.7879, + "step": 14538 + }, + { + "epoch": 0.1817545438635966, + "grad_norm": 3.0169103145599365, + "learning_rate": 1.959557767138529e-05, + "loss": 3.093, + "step": 14540 + }, + { + "epoch": 0.18177954448861222, + "grad_norm": 3.1787314414978027, + "learning_rate": 1.9595331962635557e-05, + "loss": 0.8309, + "step": 14542 + }, + { + "epoch": 0.18180454511362784, + "grad_norm": 2.722743034362793, + "learning_rate": 1.959508618080913e-05, + "loss": 1.14, + "step": 14544 + }, + { + "epoch": 0.18182954573864346, + "grad_norm": 3.501987934112549, + "learning_rate": 1.9594840325907883e-05, + "loss": 0.8225, + "step": 14546 + }, + { + "epoch": 0.18185454636365908, + "grad_norm": 0.5412774682044983, + "learning_rate": 1.9594594397933694e-05, + "loss": 0.3959, + "step": 14548 + }, + { + "epoch": 0.18187954698867473, + "grad_norm": 3.1663219928741455, + "learning_rate": 1.9594348396888425e-05, + "loss": 1.6239, + "step": 14550 + }, + { + "epoch": 0.18190454761369035, + "grad_norm": 3.3179051876068115, + "learning_rate": 1.9594102322773958e-05, + "loss": 1.0399, + "step": 14552 + }, + { + "epoch": 0.18192954823870597, + "grad_norm": 2.2060766220092773, + "learning_rate": 1.9593856175592165e-05, + "loss": 0.7506, + "step": 14554 + }, + { + "epoch": 0.1819545488637216, + "grad_norm": 4.359488010406494, + "learning_rate": 1.9593609955344917e-05, + "loss": 1.3417, + "step": 14556 + }, + { + "epoch": 0.1819795494887372, + "grad_norm": 2.27890682220459, + "learning_rate": 1.9593363662034094e-05, + "loss": 0.8423, + "step": 14558 + }, + { + "epoch": 0.18200455011375286, + "grad_norm": 3.215942859649658, + "learning_rate": 1.959311729566157e-05, + "loss": 0.6459, + "step": 14560 + }, + { + "epoch": 0.18202955073876848, + "grad_norm": 7.772760391235352, + "learning_rate": 1.959287085622922e-05, + "loss": 0.8646, + "step": 14562 + }, + { + "epoch": 0.1820545513637841, + "grad_norm": 4.991543769836426, + "learning_rate": 1.9592624343738925e-05, + "loss": 0.616, + "step": 14564 + }, + { + "epoch": 0.18207955198879971, + "grad_norm": 3.1340222358703613, + "learning_rate": 1.9592377758192557e-05, + "loss": 1.5547, + "step": 14566 + }, + { + "epoch": 0.18210455261381533, + "grad_norm": 0.00032770659890957177, + "learning_rate": 1.9592131099592e-05, + "loss": 1.0576, + "step": 14568 + }, + { + "epoch": 0.18212955323883098, + "grad_norm": 0.07129815965890884, + "learning_rate": 1.9591884367939126e-05, + "loss": 0.8718, + "step": 14570 + }, + { + "epoch": 0.1821545538638466, + "grad_norm": 2.903001308441162, + "learning_rate": 1.9591637563235814e-05, + "loss": 1.9802, + "step": 14572 + }, + { + "epoch": 0.18217955448886222, + "grad_norm": 0.0004410151159390807, + "learning_rate": 1.959139068548395e-05, + "loss": 0.4389, + "step": 14574 + }, + { + "epoch": 0.18220455511387784, + "grad_norm": 5.142148017883301, + "learning_rate": 1.9591143734685415e-05, + "loss": 0.6848, + "step": 14576 + }, + { + "epoch": 0.18222955573889346, + "grad_norm": 2.1979787349700928, + "learning_rate": 1.959089671084208e-05, + "loss": 0.1127, + "step": 14578 + }, + { + "epoch": 0.1822545563639091, + "grad_norm": 2.2503652572631836, + "learning_rate": 1.9590649613955836e-05, + "loss": 0.6468, + "step": 14580 + }, + { + "epoch": 0.18227955698892473, + "grad_norm": 5.056207656860352, + "learning_rate": 1.959040244402856e-05, + "loss": 0.2874, + "step": 14582 + }, + { + "epoch": 0.18230455761394035, + "grad_norm": 2.6744160652160645, + "learning_rate": 1.959015520106213e-05, + "loss": 0.4705, + "step": 14584 + }, + { + "epoch": 0.18232955823895597, + "grad_norm": 3.398634910583496, + "learning_rate": 1.9589907885058438e-05, + "loss": 0.2213, + "step": 14586 + }, + { + "epoch": 0.1823545588639716, + "grad_norm": 3.1851279735565186, + "learning_rate": 1.9589660496019364e-05, + "loss": 2.0496, + "step": 14588 + }, + { + "epoch": 0.18237955948898724, + "grad_norm": 4.08469820022583, + "learning_rate": 1.958941303394679e-05, + "loss": 1.4222, + "step": 14590 + }, + { + "epoch": 0.18240456011400286, + "grad_norm": 3.1647822856903076, + "learning_rate": 1.9589165498842605e-05, + "loss": 0.9934, + "step": 14592 + }, + { + "epoch": 0.18242956073901848, + "grad_norm": 3.792929172515869, + "learning_rate": 1.9588917890708688e-05, + "loss": 1.6526, + "step": 14594 + }, + { + "epoch": 0.1824545613640341, + "grad_norm": 8.570874214172363, + "learning_rate": 1.958867020954693e-05, + "loss": 1.1725, + "step": 14596 + }, + { + "epoch": 0.18247956198904972, + "grad_norm": 2.1061911582946777, + "learning_rate": 1.9588422455359218e-05, + "loss": 0.2175, + "step": 14598 + }, + { + "epoch": 0.18250456261406536, + "grad_norm": 3.450291872024536, + "learning_rate": 1.9588174628147435e-05, + "loss": 1.1492, + "step": 14600 + }, + { + "epoch": 0.18252956323908098, + "grad_norm": 4.689064979553223, + "learning_rate": 1.9587926727913468e-05, + "loss": 1.4441, + "step": 14602 + }, + { + "epoch": 0.1825545638640966, + "grad_norm": 0.002236894564703107, + "learning_rate": 1.958767875465921e-05, + "loss": 0.0, + "step": 14604 + }, + { + "epoch": 0.18257956448911222, + "grad_norm": 3.458801746368408, + "learning_rate": 1.9587430708386543e-05, + "loss": 0.6087, + "step": 14606 + }, + { + "epoch": 0.18260456511412784, + "grad_norm": 4.344333648681641, + "learning_rate": 1.9587182589097363e-05, + "loss": 1.9595, + "step": 14608 + }, + { + "epoch": 0.1826295657391435, + "grad_norm": 4.1204304695129395, + "learning_rate": 1.9586934396793557e-05, + "loss": 1.2532, + "step": 14610 + }, + { + "epoch": 0.1826545663641591, + "grad_norm": 2.069169044494629, + "learning_rate": 1.958668613147701e-05, + "loss": 1.5593, + "step": 14612 + }, + { + "epoch": 0.18267956698917473, + "grad_norm": 5.040449142456055, + "learning_rate": 1.958643779314962e-05, + "loss": 0.371, + "step": 14614 + }, + { + "epoch": 0.18270456761419035, + "grad_norm": 2.161320686340332, + "learning_rate": 1.9586189381813273e-05, + "loss": 1.1997, + "step": 14616 + }, + { + "epoch": 0.18272956823920597, + "grad_norm": 4.248841762542725, + "learning_rate": 1.9585940897469867e-05, + "loss": 1.2489, + "step": 14618 + }, + { + "epoch": 0.18275456886422162, + "grad_norm": 0.0007466149982064962, + "learning_rate": 1.958569234012129e-05, + "loss": 0.0069, + "step": 14620 + }, + { + "epoch": 0.18277956948923724, + "grad_norm": 1.584708333015442, + "learning_rate": 1.958544370976943e-05, + "loss": 1.191, + "step": 14622 + }, + { + "epoch": 0.18280457011425286, + "grad_norm": 2.326591968536377, + "learning_rate": 1.958519500641619e-05, + "loss": 0.9808, + "step": 14624 + }, + { + "epoch": 0.18282957073926848, + "grad_norm": 3.48476505279541, + "learning_rate": 1.958494623006346e-05, + "loss": 1.8209, + "step": 14626 + }, + { + "epoch": 0.1828545713642841, + "grad_norm": 2.0457370281219482, + "learning_rate": 1.9584697380713135e-05, + "loss": 1.4327, + "step": 14628 + }, + { + "epoch": 0.18287957198929974, + "grad_norm": 2.5690078735351562, + "learning_rate": 1.9584448458367112e-05, + "loss": 0.4142, + "step": 14630 + }, + { + "epoch": 0.18290457261431536, + "grad_norm": 3.48093581199646, + "learning_rate": 1.9584199463027286e-05, + "loss": 1.351, + "step": 14632 + }, + { + "epoch": 0.18292957323933098, + "grad_norm": 3.498751640319824, + "learning_rate": 1.9583950394695548e-05, + "loss": 0.874, + "step": 14634 + }, + { + "epoch": 0.1829545738643466, + "grad_norm": 0.0015851814532652497, + "learning_rate": 1.9583701253373802e-05, + "loss": 0.8318, + "step": 14636 + }, + { + "epoch": 0.18297957448936222, + "grad_norm": 2.8664913177490234, + "learning_rate": 1.958345203906394e-05, + "loss": 1.2418, + "step": 14638 + }, + { + "epoch": 0.18300457511437787, + "grad_norm": 3.3226993083953857, + "learning_rate": 1.9583202751767864e-05, + "loss": 1.1075, + "step": 14640 + }, + { + "epoch": 0.1830295757393935, + "grad_norm": 3.5866854190826416, + "learning_rate": 1.958295339148747e-05, + "loss": 1.0487, + "step": 14642 + }, + { + "epoch": 0.1830545763644091, + "grad_norm": 0.0015343489358201623, + "learning_rate": 1.958270395822466e-05, + "loss": 1.7746, + "step": 14644 + }, + { + "epoch": 0.18307957698942473, + "grad_norm": 6.841219425201416, + "learning_rate": 1.9582454451981333e-05, + "loss": 1.4875, + "step": 14646 + }, + { + "epoch": 0.18310457761444035, + "grad_norm": 2.1139090061187744, + "learning_rate": 1.9582204872759386e-05, + "loss": 0.4543, + "step": 14648 + }, + { + "epoch": 0.183129578239456, + "grad_norm": 0.6923001408576965, + "learning_rate": 1.958195522056072e-05, + "loss": 0.8094, + "step": 14650 + }, + { + "epoch": 0.18315457886447162, + "grad_norm": 3.5213711261749268, + "learning_rate": 1.958170549538724e-05, + "loss": 1.122, + "step": 14652 + }, + { + "epoch": 0.18317957948948724, + "grad_norm": 8.008312225341797, + "learning_rate": 1.9581455697240845e-05, + "loss": 0.9055, + "step": 14654 + }, + { + "epoch": 0.18320458011450286, + "grad_norm": 3.185826063156128, + "learning_rate": 1.958120582612344e-05, + "loss": 1.674, + "step": 14656 + }, + { + "epoch": 0.18322958073951848, + "grad_norm": 2.2388315200805664, + "learning_rate": 1.9580955882036924e-05, + "loss": 0.6496, + "step": 14658 + }, + { + "epoch": 0.18325458136453412, + "grad_norm": 2.3769752979278564, + "learning_rate": 1.9580705864983206e-05, + "loss": 0.3185, + "step": 14660 + }, + { + "epoch": 0.18327958198954974, + "grad_norm": 4.1630024909973145, + "learning_rate": 1.9580455774964183e-05, + "loss": 0.7911, + "step": 14662 + }, + { + "epoch": 0.18330458261456536, + "grad_norm": 4.83203125, + "learning_rate": 1.9580205611981765e-05, + "loss": 2.0193, + "step": 14664 + }, + { + "epoch": 0.18332958323958098, + "grad_norm": 2.883624792098999, + "learning_rate": 1.9579955376037857e-05, + "loss": 1.4553, + "step": 14666 + }, + { + "epoch": 0.1833545838645966, + "grad_norm": 3.3239336013793945, + "learning_rate": 1.9579705067134363e-05, + "loss": 1.7164, + "step": 14668 + }, + { + "epoch": 0.18337958448961225, + "grad_norm": 2.6294801235198975, + "learning_rate": 1.957945468527319e-05, + "loss": 1.0393, + "step": 14670 + }, + { + "epoch": 0.18340458511462787, + "grad_norm": 1.4216995239257812, + "learning_rate": 1.9579204230456247e-05, + "loss": 0.9606, + "step": 14672 + }, + { + "epoch": 0.1834295857396435, + "grad_norm": 3.1094796657562256, + "learning_rate": 1.9578953702685436e-05, + "loss": 1.5676, + "step": 14674 + }, + { + "epoch": 0.1834545863646591, + "grad_norm": 0.0008256987784989178, + "learning_rate": 1.957870310196267e-05, + "loss": 0.6311, + "step": 14676 + }, + { + "epoch": 0.18347958698967473, + "grad_norm": 2.536808729171753, + "learning_rate": 1.9578452428289854e-05, + "loss": 0.4725, + "step": 14678 + }, + { + "epoch": 0.18350458761469038, + "grad_norm": 0.0005158795393072069, + "learning_rate": 1.9578201681668898e-05, + "loss": 0.8195, + "step": 14680 + }, + { + "epoch": 0.183529588239706, + "grad_norm": 4.32307243347168, + "learning_rate": 1.9577950862101715e-05, + "loss": 1.1867, + "step": 14682 + }, + { + "epoch": 0.18355458886472162, + "grad_norm": 0.9686886072158813, + "learning_rate": 1.957769996959021e-05, + "loss": 0.0355, + "step": 14684 + }, + { + "epoch": 0.18357958948973724, + "grad_norm": 2.1427254676818848, + "learning_rate": 1.95774490041363e-05, + "loss": 0.1993, + "step": 14686 + }, + { + "epoch": 0.18360459011475286, + "grad_norm": 0.0019060102058574557, + "learning_rate": 1.9577197965741892e-05, + "loss": 0.7307, + "step": 14688 + }, + { + "epoch": 0.1836295907397685, + "grad_norm": 3.5698156356811523, + "learning_rate": 1.9576946854408898e-05, + "loss": 0.5841, + "step": 14690 + }, + { + "epoch": 0.18365459136478413, + "grad_norm": 3.5040416717529297, + "learning_rate": 1.9576695670139232e-05, + "loss": 1.1843, + "step": 14692 + }, + { + "epoch": 0.18367959198979975, + "grad_norm": 5.147172927856445, + "learning_rate": 1.9576444412934804e-05, + "loss": 0.3457, + "step": 14694 + }, + { + "epoch": 0.18370459261481537, + "grad_norm": 4.55830192565918, + "learning_rate": 1.957619308279753e-05, + "loss": 1.0166, + "step": 14696 + }, + { + "epoch": 0.18372959323983099, + "grad_norm": 5.1034088134765625, + "learning_rate": 1.9575941679729326e-05, + "loss": 1.0728, + "step": 14698 + }, + { + "epoch": 0.18375459386484663, + "grad_norm": 0.24766112864017487, + "learning_rate": 1.9575690203732097e-05, + "loss": 0.0627, + "step": 14700 + }, + { + "epoch": 0.18377959448986225, + "grad_norm": 0.07246707379817963, + "learning_rate": 1.957543865480777e-05, + "loss": 0.5036, + "step": 14702 + }, + { + "epoch": 0.18380459511487787, + "grad_norm": 3.2715487480163574, + "learning_rate": 1.957518703295826e-05, + "loss": 1.3749, + "step": 14704 + }, + { + "epoch": 0.1838295957398935, + "grad_norm": 0.6390644907951355, + "learning_rate": 1.9574935338185477e-05, + "loss": 0.3597, + "step": 14706 + }, + { + "epoch": 0.1838545963649091, + "grad_norm": 0.00031465975916944444, + "learning_rate": 1.957468357049134e-05, + "loss": 0.9963, + "step": 14708 + }, + { + "epoch": 0.18387959698992476, + "grad_norm": 4.956110954284668, + "learning_rate": 1.957443172987777e-05, + "loss": 1.0668, + "step": 14710 + }, + { + "epoch": 0.18390459761494038, + "grad_norm": 2.6765103340148926, + "learning_rate": 1.957417981634668e-05, + "loss": 0.9136, + "step": 14712 + }, + { + "epoch": 0.183929598239956, + "grad_norm": 1.4717957973480225, + "learning_rate": 1.9573927829899988e-05, + "loss": 0.1416, + "step": 14714 + }, + { + "epoch": 0.18395459886497162, + "grad_norm": 2.6536359786987305, + "learning_rate": 1.9573675770539618e-05, + "loss": 0.5526, + "step": 14716 + }, + { + "epoch": 0.18397959948998724, + "grad_norm": 19.369571685791016, + "learning_rate": 1.9573423638267485e-05, + "loss": 2.4341, + "step": 14718 + }, + { + "epoch": 0.1840046001150029, + "grad_norm": 2.707200765609741, + "learning_rate": 1.957317143308551e-05, + "loss": 0.6089, + "step": 14720 + }, + { + "epoch": 0.1840296007400185, + "grad_norm": 6.548948287963867, + "learning_rate": 1.957291915499562e-05, + "loss": 2.2244, + "step": 14722 + }, + { + "epoch": 0.18405460136503413, + "grad_norm": 4.264769554138184, + "learning_rate": 1.957266680399973e-05, + "loss": 1.9581, + "step": 14724 + }, + { + "epoch": 0.18407960199004975, + "grad_norm": 3.7329609394073486, + "learning_rate": 1.957241438009976e-05, + "loss": 0.9112, + "step": 14726 + }, + { + "epoch": 0.18410460261506537, + "grad_norm": 0.7281776666641235, + "learning_rate": 1.9572161883297642e-05, + "loss": 1.5036, + "step": 14728 + }, + { + "epoch": 0.184129603240081, + "grad_norm": 0.00036927967448718846, + "learning_rate": 1.9571909313595286e-05, + "loss": 1.0027, + "step": 14730 + }, + { + "epoch": 0.18415460386509663, + "grad_norm": 6.053549766540527, + "learning_rate": 1.957165667099463e-05, + "loss": 1.4285, + "step": 14732 + }, + { + "epoch": 0.18417960449011225, + "grad_norm": 3.4410464763641357, + "learning_rate": 1.9571403955497585e-05, + "loss": 0.5468, + "step": 14734 + }, + { + "epoch": 0.18420460511512787, + "grad_norm": 2.6379897594451904, + "learning_rate": 1.957115116710608e-05, + "loss": 1.1409, + "step": 14736 + }, + { + "epoch": 0.1842296057401435, + "grad_norm": 2.4276938438415527, + "learning_rate": 1.9570898305822042e-05, + "loss": 0.7609, + "step": 14738 + }, + { + "epoch": 0.18425460636515914, + "grad_norm": 0.16318504512310028, + "learning_rate": 1.9570645371647393e-05, + "loss": 0.4522, + "step": 14740 + }, + { + "epoch": 0.18427960699017476, + "grad_norm": 0.016577523201704025, + "learning_rate": 1.9570392364584068e-05, + "loss": 0.7771, + "step": 14742 + }, + { + "epoch": 0.18430460761519038, + "grad_norm": 3.2072601318359375, + "learning_rate": 1.9570139284633988e-05, + "loss": 1.5283, + "step": 14744 + }, + { + "epoch": 0.184329608240206, + "grad_norm": 0.4465264678001404, + "learning_rate": 1.9569886131799078e-05, + "loss": 0.8037, + "step": 14746 + }, + { + "epoch": 0.18435460886522162, + "grad_norm": 2.3858439922332764, + "learning_rate": 1.9569632906081268e-05, + "loss": 0.7365, + "step": 14748 + }, + { + "epoch": 0.18437960949023727, + "grad_norm": 2.3265416622161865, + "learning_rate": 1.956937960748249e-05, + "loss": 0.4239, + "step": 14750 + }, + { + "epoch": 0.1844046101152529, + "grad_norm": 4.684553146362305, + "learning_rate": 1.9569126236004667e-05, + "loss": 1.4929, + "step": 14752 + }, + { + "epoch": 0.1844296107402685, + "grad_norm": 3.1578195095062256, + "learning_rate": 1.9568872791649734e-05, + "loss": 2.074, + "step": 14754 + }, + { + "epoch": 0.18445461136528413, + "grad_norm": 3.613401174545288, + "learning_rate": 1.9568619274419616e-05, + "loss": 0.7733, + "step": 14756 + }, + { + "epoch": 0.18447961199029975, + "grad_norm": 0.0009437055559828877, + "learning_rate": 1.956836568431625e-05, + "loss": 0.8525, + "step": 14758 + }, + { + "epoch": 0.1845046126153154, + "grad_norm": 3.1420059204101562, + "learning_rate": 1.9568112021341564e-05, + "loss": 1.8419, + "step": 14760 + }, + { + "epoch": 0.18452961324033101, + "grad_norm": 2.9671685695648193, + "learning_rate": 1.956785828549749e-05, + "loss": 1.1661, + "step": 14762 + }, + { + "epoch": 0.18455461386534663, + "grad_norm": 2.5480642318725586, + "learning_rate": 1.956760447678596e-05, + "loss": 1.3335, + "step": 14764 + }, + { + "epoch": 0.18457961449036225, + "grad_norm": 2.7197163105010986, + "learning_rate": 1.9567350595208902e-05, + "loss": 0.591, + "step": 14766 + }, + { + "epoch": 0.18460461511537787, + "grad_norm": 3.303903341293335, + "learning_rate": 1.956709664076826e-05, + "loss": 0.9693, + "step": 14768 + }, + { + "epoch": 0.18462961574039352, + "grad_norm": 4.0889739990234375, + "learning_rate": 1.9566842613465962e-05, + "loss": 0.6259, + "step": 14770 + }, + { + "epoch": 0.18465461636540914, + "grad_norm": 4.8935723304748535, + "learning_rate": 1.9566588513303942e-05, + "loss": 2.019, + "step": 14772 + }, + { + "epoch": 0.18467961699042476, + "grad_norm": 4.321831226348877, + "learning_rate": 1.9566334340284142e-05, + "loss": 1.2728, + "step": 14774 + }, + { + "epoch": 0.18470461761544038, + "grad_norm": 0.0015317992074415088, + "learning_rate": 1.9566080094408487e-05, + "loss": 0.3761, + "step": 14776 + }, + { + "epoch": 0.184729618240456, + "grad_norm": 3.627501964569092, + "learning_rate": 1.9565825775678922e-05, + "loss": 1.2902, + "step": 14778 + }, + { + "epoch": 0.18475461886547165, + "grad_norm": 1.0005650520324707, + "learning_rate": 1.9565571384097378e-05, + "loss": 1.1257, + "step": 14780 + }, + { + "epoch": 0.18477961949048727, + "grad_norm": 3.2219114303588867, + "learning_rate": 1.9565316919665796e-05, + "loss": 0.7993, + "step": 14782 + }, + { + "epoch": 0.1848046201155029, + "grad_norm": 0.5403105020523071, + "learning_rate": 1.9565062382386112e-05, + "loss": 0.6969, + "step": 14784 + }, + { + "epoch": 0.1848296207405185, + "grad_norm": 0.0007458117906935513, + "learning_rate": 1.956480777226027e-05, + "loss": 0.6872, + "step": 14786 + }, + { + "epoch": 0.18485462136553413, + "grad_norm": 0.023195872083306313, + "learning_rate": 1.95645530892902e-05, + "loss": 0.9275, + "step": 14788 + }, + { + "epoch": 0.18487962199054978, + "grad_norm": 1.7627888917922974, + "learning_rate": 1.9564298333477847e-05, + "loss": 0.0904, + "step": 14790 + }, + { + "epoch": 0.1849046226155654, + "grad_norm": 2.6751585006713867, + "learning_rate": 1.956404350482515e-05, + "loss": 0.8623, + "step": 14792 + }, + { + "epoch": 0.18492962324058101, + "grad_norm": 2.4274396896362305, + "learning_rate": 1.9563788603334055e-05, + "loss": 0.9683, + "step": 14794 + }, + { + "epoch": 0.18495462386559663, + "grad_norm": 4.005202293395996, + "learning_rate": 1.9563533629006494e-05, + "loss": 1.5331, + "step": 14796 + }, + { + "epoch": 0.18497962449061225, + "grad_norm": 0.6390326023101807, + "learning_rate": 1.9563278581844414e-05, + "loss": 0.4307, + "step": 14798 + }, + { + "epoch": 0.1850046251156279, + "grad_norm": 0.002887025708332658, + "learning_rate": 1.9563023461849754e-05, + "loss": 0.0, + "step": 14800 + }, + { + "epoch": 0.18502962574064352, + "grad_norm": 1.0802515745162964, + "learning_rate": 1.9562768269024465e-05, + "loss": 0.7301, + "step": 14802 + }, + { + "epoch": 0.18505462636565914, + "grad_norm": 3.0223352909088135, + "learning_rate": 1.9562513003370483e-05, + "loss": 0.7858, + "step": 14804 + }, + { + "epoch": 0.18507962699067476, + "grad_norm": 3.160220146179199, + "learning_rate": 1.956225766488975e-05, + "loss": 1.0813, + "step": 14806 + }, + { + "epoch": 0.18510462761569038, + "grad_norm": 4.557854175567627, + "learning_rate": 1.956200225358422e-05, + "loss": 0.3529, + "step": 14808 + }, + { + "epoch": 0.18512962824070603, + "grad_norm": 3.2351326942443848, + "learning_rate": 1.956174676945583e-05, + "loss": 1.2492, + "step": 14810 + }, + { + "epoch": 0.18515462886572165, + "grad_norm": 2.7332763671875, + "learning_rate": 1.956149121250653e-05, + "loss": 0.485, + "step": 14812 + }, + { + "epoch": 0.18517962949073727, + "grad_norm": 3.147498846054077, + "learning_rate": 1.956123558273827e-05, + "loss": 0.8071, + "step": 14814 + }, + { + "epoch": 0.1852046301157529, + "grad_norm": 3.7115986347198486, + "learning_rate": 1.9560979880152983e-05, + "loss": 1.3693, + "step": 14816 + }, + { + "epoch": 0.1852296307407685, + "grad_norm": 2.8243629932403564, + "learning_rate": 1.9560724104752632e-05, + "loss": 1.7306, + "step": 14818 + }, + { + "epoch": 0.18525463136578416, + "grad_norm": 2.3724029064178467, + "learning_rate": 1.9560468256539154e-05, + "loss": 1.063, + "step": 14820 + }, + { + "epoch": 0.18527963199079978, + "grad_norm": 4.009453296661377, + "learning_rate": 1.9560212335514503e-05, + "loss": 1.9653, + "step": 14822 + }, + { + "epoch": 0.1853046326158154, + "grad_norm": 0.007169996853917837, + "learning_rate": 1.9559956341680628e-05, + "loss": 0.4794, + "step": 14824 + }, + { + "epoch": 0.18532963324083102, + "grad_norm": 1.7824921607971191, + "learning_rate": 1.9559700275039474e-05, + "loss": 0.9213, + "step": 14826 + }, + { + "epoch": 0.18535463386584664, + "grad_norm": 0.0005919425748288631, + "learning_rate": 1.9559444135593e-05, + "loss": 0.0, + "step": 14828 + }, + { + "epoch": 0.18537963449086228, + "grad_norm": 2.5185348987579346, + "learning_rate": 1.9559187923343147e-05, + "loss": 1.4672, + "step": 14830 + }, + { + "epoch": 0.1854046351158779, + "grad_norm": 3.0138115882873535, + "learning_rate": 1.9558931638291874e-05, + "loss": 0.7329, + "step": 14832 + }, + { + "epoch": 0.18542963574089352, + "grad_norm": 2.832113027572632, + "learning_rate": 1.9558675280441128e-05, + "loss": 1.0708, + "step": 14834 + }, + { + "epoch": 0.18545463636590914, + "grad_norm": 1.9086594581604004, + "learning_rate": 1.955841884979286e-05, + "loss": 0.7636, + "step": 14836 + }, + { + "epoch": 0.18547963699092476, + "grad_norm": 1.969253420829773, + "learning_rate": 1.955816234634903e-05, + "loss": 0.6284, + "step": 14838 + }, + { + "epoch": 0.1855046376159404, + "grad_norm": 0.10244707018136978, + "learning_rate": 1.9557905770111586e-05, + "loss": 0.7249, + "step": 14840 + }, + { + "epoch": 0.18552963824095603, + "grad_norm": 4.2581353187561035, + "learning_rate": 1.955764912108248e-05, + "loss": 0.2761, + "step": 14842 + }, + { + "epoch": 0.18555463886597165, + "grad_norm": 4.512755870819092, + "learning_rate": 1.9557392399263672e-05, + "loss": 0.8996, + "step": 14844 + }, + { + "epoch": 0.18557963949098727, + "grad_norm": 3.7706902027130127, + "learning_rate": 1.9557135604657117e-05, + "loss": 1.437, + "step": 14846 + }, + { + "epoch": 0.1856046401160029, + "grad_norm": 2.8614213466644287, + "learning_rate": 1.9556878737264764e-05, + "loss": 1.0132, + "step": 14848 + }, + { + "epoch": 0.18562964074101854, + "grad_norm": 0.15888801217079163, + "learning_rate": 1.955662179708858e-05, + "loss": 0.9596, + "step": 14850 + }, + { + "epoch": 0.18565464136603416, + "grad_norm": 2.406553268432617, + "learning_rate": 1.955636478413051e-05, + "loss": 1.379, + "step": 14852 + }, + { + "epoch": 0.18567964199104978, + "grad_norm": 4.16954231262207, + "learning_rate": 1.9556107698392522e-05, + "loss": 1.3329, + "step": 14854 + }, + { + "epoch": 0.1857046426160654, + "grad_norm": 4.751332759857178, + "learning_rate": 1.9555850539876568e-05, + "loss": 0.728, + "step": 14856 + }, + { + "epoch": 0.18572964324108102, + "grad_norm": 0.000669636472593993, + "learning_rate": 1.9555593308584603e-05, + "loss": 0.0, + "step": 14858 + }, + { + "epoch": 0.18575464386609666, + "grad_norm": 4.171295166015625, + "learning_rate": 1.9555336004518596e-05, + "loss": 1.1203, + "step": 14860 + }, + { + "epoch": 0.18577964449111228, + "grad_norm": 1.1314563751220703, + "learning_rate": 1.95550786276805e-05, + "loss": 0.0653, + "step": 14862 + }, + { + "epoch": 0.1858046451161279, + "grad_norm": 0.0005162147572264075, + "learning_rate": 1.9554821178072277e-05, + "loss": 0.6636, + "step": 14864 + }, + { + "epoch": 0.18582964574114352, + "grad_norm": 3.027371644973755, + "learning_rate": 1.9554563655695885e-05, + "loss": 0.8573, + "step": 14866 + }, + { + "epoch": 0.18585464636615914, + "grad_norm": 1.4702595472335815, + "learning_rate": 1.955430606055329e-05, + "loss": 0.0385, + "step": 14868 + }, + { + "epoch": 0.1858796469911748, + "grad_norm": 0.0004497488553170115, + "learning_rate": 1.9554048392646447e-05, + "loss": 0.1292, + "step": 14870 + }, + { + "epoch": 0.1859046476161904, + "grad_norm": 2.457036256790161, + "learning_rate": 1.9553790651977328e-05, + "loss": 0.9604, + "step": 14872 + }, + { + "epoch": 0.18592964824120603, + "grad_norm": 3.150533437728882, + "learning_rate": 1.9553532838547885e-05, + "loss": 1.1742, + "step": 14874 + }, + { + "epoch": 0.18595464886622165, + "grad_norm": 4.138139247894287, + "learning_rate": 1.9553274952360092e-05, + "loss": 2.0414, + "step": 14876 + }, + { + "epoch": 0.18597964949123727, + "grad_norm": 1.8499833345413208, + "learning_rate": 1.9553016993415905e-05, + "loss": 0.8275, + "step": 14878 + }, + { + "epoch": 0.18600465011625292, + "grad_norm": 0.0004637909878510982, + "learning_rate": 1.955275896171729e-05, + "loss": 0.3909, + "step": 14880 + }, + { + "epoch": 0.18602965074126854, + "grad_norm": 2.847141742706299, + "learning_rate": 1.9552500857266216e-05, + "loss": 1.7912, + "step": 14882 + }, + { + "epoch": 0.18605465136628416, + "grad_norm": 3.7855241298675537, + "learning_rate": 1.9552242680064644e-05, + "loss": 0.5489, + "step": 14884 + }, + { + "epoch": 0.18607965199129978, + "grad_norm": 3.542926788330078, + "learning_rate": 1.9551984430114547e-05, + "loss": 1.2117, + "step": 14886 + }, + { + "epoch": 0.1861046526163154, + "grad_norm": 0.3854113817214966, + "learning_rate": 1.9551726107417885e-05, + "loss": 0.4357, + "step": 14888 + }, + { + "epoch": 0.18612965324133104, + "grad_norm": 3.228898525238037, + "learning_rate": 1.9551467711976628e-05, + "loss": 1.5839, + "step": 14890 + }, + { + "epoch": 0.18615465386634666, + "grad_norm": 2.9855146408081055, + "learning_rate": 1.9551209243792743e-05, + "loss": 0.4725, + "step": 14892 + }, + { + "epoch": 0.18617965449136228, + "grad_norm": 3.5632941722869873, + "learning_rate": 1.9550950702868197e-05, + "loss": 1.7706, + "step": 14894 + }, + { + "epoch": 0.1862046551163779, + "grad_norm": 14.411269187927246, + "learning_rate": 1.9550692089204964e-05, + "loss": 1.2268, + "step": 14896 + }, + { + "epoch": 0.18622965574139352, + "grad_norm": 7.031842231750488, + "learning_rate": 1.9550433402805012e-05, + "loss": 1.2118, + "step": 14898 + }, + { + "epoch": 0.18625465636640917, + "grad_norm": 2.8220396041870117, + "learning_rate": 1.9550174643670305e-05, + "loss": 0.8557, + "step": 14900 + }, + { + "epoch": 0.1862796569914248, + "grad_norm": 7.774138927459717, + "learning_rate": 1.954991581180282e-05, + "loss": 2.282, + "step": 14902 + }, + { + "epoch": 0.1863046576164404, + "grad_norm": 1.6656746864318848, + "learning_rate": 1.9549656907204528e-05, + "loss": 0.0928, + "step": 14904 + }, + { + "epoch": 0.18632965824145603, + "grad_norm": 3.774820566177368, + "learning_rate": 1.95493979298774e-05, + "loss": 1.0646, + "step": 14906 + }, + { + "epoch": 0.18635465886647165, + "grad_norm": 3.829921245574951, + "learning_rate": 1.9549138879823405e-05, + "loss": 0.7488, + "step": 14908 + }, + { + "epoch": 0.1863796594914873, + "grad_norm": 6.138329029083252, + "learning_rate": 1.9548879757044524e-05, + "loss": 1.1102, + "step": 14910 + }, + { + "epoch": 0.18640466011650292, + "grad_norm": 2.8632726669311523, + "learning_rate": 1.954862056154272e-05, + "loss": 0.5245, + "step": 14912 + }, + { + "epoch": 0.18642966074151854, + "grad_norm": 0.015774542465806007, + "learning_rate": 1.9548361293319972e-05, + "loss": 0.8556, + "step": 14914 + }, + { + "epoch": 0.18645466136653416, + "grad_norm": 3.573888063430786, + "learning_rate": 1.9548101952378258e-05, + "loss": 1.3552, + "step": 14916 + }, + { + "epoch": 0.18647966199154978, + "grad_norm": 3.5242373943328857, + "learning_rate": 1.954784253871955e-05, + "loss": 1.8957, + "step": 14918 + }, + { + "epoch": 0.18650466261656543, + "grad_norm": 2.4846065044403076, + "learning_rate": 1.954758305234582e-05, + "loss": 1.3043, + "step": 14920 + }, + { + "epoch": 0.18652966324158105, + "grad_norm": 3.134871006011963, + "learning_rate": 1.9547323493259047e-05, + "loss": 0.5379, + "step": 14922 + }, + { + "epoch": 0.18655466386659667, + "grad_norm": 3.016026020050049, + "learning_rate": 1.954706386146121e-05, + "loss": 1.1487, + "step": 14924 + }, + { + "epoch": 0.18657966449161228, + "grad_norm": 2.7526564598083496, + "learning_rate": 1.9546804156954285e-05, + "loss": 1.4121, + "step": 14926 + }, + { + "epoch": 0.1866046651166279, + "grad_norm": 2.5781824588775635, + "learning_rate": 1.9546544379740253e-05, + "loss": 0.9794, + "step": 14928 + }, + { + "epoch": 0.18662966574164355, + "grad_norm": 6.988504409790039, + "learning_rate": 1.9546284529821086e-05, + "loss": 1.483, + "step": 14930 + }, + { + "epoch": 0.18665466636665917, + "grad_norm": 2.636559009552002, + "learning_rate": 1.9546024607198764e-05, + "loss": 0.9007, + "step": 14932 + }, + { + "epoch": 0.1866796669916748, + "grad_norm": 3.474699020385742, + "learning_rate": 1.954576461187527e-05, + "loss": 0.7088, + "step": 14934 + }, + { + "epoch": 0.1867046676166904, + "grad_norm": 0.9003409743309021, + "learning_rate": 1.9545504543852583e-05, + "loss": 0.7487, + "step": 14936 + }, + { + "epoch": 0.18672966824170603, + "grad_norm": 3.4015698432922363, + "learning_rate": 1.9545244403132684e-05, + "loss": 0.6319, + "step": 14938 + }, + { + "epoch": 0.18675466886672168, + "grad_norm": 2.312880754470825, + "learning_rate": 1.954498418971755e-05, + "loss": 0.918, + "step": 14940 + }, + { + "epoch": 0.1867796694917373, + "grad_norm": 3.4996461868286133, + "learning_rate": 1.954472390360917e-05, + "loss": 0.7555, + "step": 14942 + }, + { + "epoch": 0.18680467011675292, + "grad_norm": 3.1984856128692627, + "learning_rate": 1.9544463544809523e-05, + "loss": 1.283, + "step": 14944 + }, + { + "epoch": 0.18682967074176854, + "grad_norm": 0.015197375789284706, + "learning_rate": 1.954420311332059e-05, + "loss": 0.9487, + "step": 14946 + }, + { + "epoch": 0.18685467136678416, + "grad_norm": 3.9737548828125, + "learning_rate": 1.9543942609144357e-05, + "loss": 1.5894, + "step": 14948 + }, + { + "epoch": 0.1868796719917998, + "grad_norm": 4.166337013244629, + "learning_rate": 1.9543682032282802e-05, + "loss": 1.4213, + "step": 14950 + }, + { + "epoch": 0.18690467261681543, + "grad_norm": 2.6890130043029785, + "learning_rate": 1.954342138273792e-05, + "loss": 1.3809, + "step": 14952 + }, + { + "epoch": 0.18692967324183105, + "grad_norm": 4.6067023277282715, + "learning_rate": 1.9543160660511688e-05, + "loss": 0.1643, + "step": 14954 + }, + { + "epoch": 0.18695467386684667, + "grad_norm": 5.247937202453613, + "learning_rate": 1.9542899865606092e-05, + "loss": 0.8075, + "step": 14956 + }, + { + "epoch": 0.18697967449186229, + "grad_norm": 0.7063453197479248, + "learning_rate": 1.954263899802312e-05, + "loss": 1.3976, + "step": 14958 + }, + { + "epoch": 0.18700467511687793, + "grad_norm": 2.776991605758667, + "learning_rate": 1.9542378057764765e-05, + "loss": 1.2404, + "step": 14960 + }, + { + "epoch": 0.18702967574189355, + "grad_norm": 6.7256364822387695, + "learning_rate": 1.9542117044833004e-05, + "loss": 1.6012, + "step": 14962 + }, + { + "epoch": 0.18705467636690917, + "grad_norm": 1.6870001554489136, + "learning_rate": 1.9541855959229824e-05, + "loss": 1.3083, + "step": 14964 + }, + { + "epoch": 0.1870796769919248, + "grad_norm": 7.529923439025879, + "learning_rate": 1.9541594800957228e-05, + "loss": 1.7511, + "step": 14966 + }, + { + "epoch": 0.1871046776169404, + "grad_norm": 5.027195453643799, + "learning_rate": 1.9541333570017187e-05, + "loss": 1.4122, + "step": 14968 + }, + { + "epoch": 0.18712967824195606, + "grad_norm": 0.9674218893051147, + "learning_rate": 1.95410722664117e-05, + "loss": 1.1269, + "step": 14970 + }, + { + "epoch": 0.18715467886697168, + "grad_norm": 6.023327350616455, + "learning_rate": 1.9540810890142762e-05, + "loss": 0.6033, + "step": 14972 + }, + { + "epoch": 0.1871796794919873, + "grad_norm": 5.115671634674072, + "learning_rate": 1.954054944121235e-05, + "loss": 1.2819, + "step": 14974 + }, + { + "epoch": 0.18720468011700292, + "grad_norm": 1.6129906177520752, + "learning_rate": 1.9540287919622466e-05, + "loss": 0.7789, + "step": 14976 + }, + { + "epoch": 0.18722968074201854, + "grad_norm": 3.865514039993286, + "learning_rate": 1.95400263253751e-05, + "loss": 1.7939, + "step": 14978 + }, + { + "epoch": 0.1872546813670342, + "grad_norm": 2.6691362857818604, + "learning_rate": 1.953976465847224e-05, + "loss": 1.0068, + "step": 14980 + }, + { + "epoch": 0.1872796819920498, + "grad_norm": 4.44624137878418, + "learning_rate": 1.9539502918915884e-05, + "loss": 1.3187, + "step": 14982 + }, + { + "epoch": 0.18730468261706543, + "grad_norm": 6.074986457824707, + "learning_rate": 1.953924110670802e-05, + "loss": 0.423, + "step": 14984 + }, + { + "epoch": 0.18732968324208105, + "grad_norm": 2.7310268878936768, + "learning_rate": 1.9538979221850648e-05, + "loss": 1.3027, + "step": 14986 + }, + { + "epoch": 0.18735468386709667, + "grad_norm": 9.341099739074707, + "learning_rate": 1.9538717264345755e-05, + "loss": 0.8844, + "step": 14988 + }, + { + "epoch": 0.18737968449211231, + "grad_norm": 2.2091076374053955, + "learning_rate": 1.9538455234195343e-05, + "loss": 0.9213, + "step": 14990 + }, + { + "epoch": 0.18740468511712793, + "grad_norm": 4.1139445304870605, + "learning_rate": 1.9538193131401403e-05, + "loss": 1.3487, + "step": 14992 + }, + { + "epoch": 0.18742968574214355, + "grad_norm": 5.088298797607422, + "learning_rate": 1.9537930955965935e-05, + "loss": 1.4638, + "step": 14994 + }, + { + "epoch": 0.18745468636715917, + "grad_norm": 3.1366870403289795, + "learning_rate": 1.9537668707890933e-05, + "loss": 0.887, + "step": 14996 + }, + { + "epoch": 0.1874796869921748, + "grad_norm": 3.236710548400879, + "learning_rate": 1.9537406387178394e-05, + "loss": 0.7631, + "step": 14998 + }, + { + "epoch": 0.18750468761719044, + "grad_norm": 0.00023628753842785954, + "learning_rate": 1.9537143993830317e-05, + "loss": 0.7639, + "step": 15000 + }, + { + "epoch": 0.18752968824220606, + "grad_norm": 2.395841121673584, + "learning_rate": 1.95368815278487e-05, + "loss": 1.2249, + "step": 15002 + }, + { + "epoch": 0.18755468886722168, + "grad_norm": 3.100954532623291, + "learning_rate": 1.9536618989235543e-05, + "loss": 1.351, + "step": 15004 + }, + { + "epoch": 0.1875796894922373, + "grad_norm": 4.379046440124512, + "learning_rate": 1.9536356377992845e-05, + "loss": 0.8051, + "step": 15006 + }, + { + "epoch": 0.18760469011725292, + "grad_norm": 4.550036907196045, + "learning_rate": 1.95360936941226e-05, + "loss": 0.9094, + "step": 15008 + }, + { + "epoch": 0.18762969074226857, + "grad_norm": 3.5383541584014893, + "learning_rate": 1.953583093762682e-05, + "loss": 2.5607, + "step": 15010 + }, + { + "epoch": 0.1876546913672842, + "grad_norm": 1.1982052326202393, + "learning_rate": 1.9535568108507498e-05, + "loss": 0.2915, + "step": 15012 + }, + { + "epoch": 0.1876796919922998, + "grad_norm": 3.197343111038208, + "learning_rate": 1.9535305206766637e-05, + "loss": 1.5502, + "step": 15014 + }, + { + "epoch": 0.18770469261731543, + "grad_norm": 4.046470642089844, + "learning_rate": 1.953504223240624e-05, + "loss": 1.2228, + "step": 15016 + }, + { + "epoch": 0.18772969324233105, + "grad_norm": 0.00020657037384808064, + "learning_rate": 1.9534779185428313e-05, + "loss": 0.0, + "step": 15018 + }, + { + "epoch": 0.1877546938673467, + "grad_norm": 3.8097407817840576, + "learning_rate": 1.953451606583485e-05, + "loss": 0.699, + "step": 15020 + }, + { + "epoch": 0.18777969449236231, + "grad_norm": 3.5668318271636963, + "learning_rate": 1.9534252873627866e-05, + "loss": 1.0901, + "step": 15022 + }, + { + "epoch": 0.18780469511737793, + "grad_norm": 2.558112621307373, + "learning_rate": 1.953398960880936e-05, + "loss": 0.4788, + "step": 15024 + }, + { + "epoch": 0.18782969574239355, + "grad_norm": 5.011739730834961, + "learning_rate": 1.9533726271381333e-05, + "loss": 1.661, + "step": 15026 + }, + { + "epoch": 0.18785469636740917, + "grad_norm": 2.39125919342041, + "learning_rate": 1.95334628613458e-05, + "loss": 1.3533, + "step": 15028 + }, + { + "epoch": 0.18787969699242482, + "grad_norm": 6.293559551239014, + "learning_rate": 1.953319937870476e-05, + "loss": 2.4393, + "step": 15030 + }, + { + "epoch": 0.18790469761744044, + "grad_norm": 2.8515849113464355, + "learning_rate": 1.9532935823460223e-05, + "loss": 0.8032, + "step": 15032 + }, + { + "epoch": 0.18792969824245606, + "grad_norm": 1.4680603742599487, + "learning_rate": 1.9532672195614192e-05, + "loss": 0.11, + "step": 15034 + }, + { + "epoch": 0.18795469886747168, + "grad_norm": 0.0002714702277444303, + "learning_rate": 1.9532408495168676e-05, + "loss": 0.0, + "step": 15036 + }, + { + "epoch": 0.1879796994924873, + "grad_norm": 3.5865845680236816, + "learning_rate": 1.953214472212569e-05, + "loss": 0.9064, + "step": 15038 + }, + { + "epoch": 0.18800470011750295, + "grad_norm": 4.181960582733154, + "learning_rate": 1.9531880876487234e-05, + "loss": 0.8149, + "step": 15040 + }, + { + "epoch": 0.18802970074251857, + "grad_norm": 3.0155346393585205, + "learning_rate": 1.9531616958255327e-05, + "loss": 0.7889, + "step": 15042 + }, + { + "epoch": 0.1880547013675342, + "grad_norm": 2.721217393875122, + "learning_rate": 1.9531352967431968e-05, + "loss": 1.2709, + "step": 15044 + }, + { + "epoch": 0.1880797019925498, + "grad_norm": 2.251054048538208, + "learning_rate": 1.9531088904019173e-05, + "loss": 0.5552, + "step": 15046 + }, + { + "epoch": 0.18810470261756543, + "grad_norm": 4.397651195526123, + "learning_rate": 1.953082476801895e-05, + "loss": 1.2397, + "step": 15048 + }, + { + "epoch": 0.18812970324258108, + "grad_norm": 3.518721342086792, + "learning_rate": 1.953056055943332e-05, + "loss": 1.6772, + "step": 15050 + }, + { + "epoch": 0.1881547038675967, + "grad_norm": 0.0003925897181034088, + "learning_rate": 1.9530296278264283e-05, + "loss": 0.9056, + "step": 15052 + }, + { + "epoch": 0.18817970449261232, + "grad_norm": 3.987520694732666, + "learning_rate": 1.9530031924513862e-05, + "loss": 0.6765, + "step": 15054 + }, + { + "epoch": 0.18820470511762794, + "grad_norm": 2.736412525177002, + "learning_rate": 1.952976749818406e-05, + "loss": 1.4683, + "step": 15056 + }, + { + "epoch": 0.18822970574264355, + "grad_norm": 2.46398663520813, + "learning_rate": 1.9529502999276903e-05, + "loss": 0.2189, + "step": 15058 + }, + { + "epoch": 0.1882547063676592, + "grad_norm": 2.406189203262329, + "learning_rate": 1.9529238427794393e-05, + "loss": 1.049, + "step": 15060 + }, + { + "epoch": 0.18827970699267482, + "grad_norm": 9.806272506713867, + "learning_rate": 1.9528973783738554e-05, + "loss": 1.681, + "step": 15062 + }, + { + "epoch": 0.18830470761769044, + "grad_norm": 3.050790548324585, + "learning_rate": 1.95287090671114e-05, + "loss": 1.0232, + "step": 15064 + }, + { + "epoch": 0.18832970824270606, + "grad_norm": 3.7973551750183105, + "learning_rate": 1.952844427791494e-05, + "loss": 1.9941, + "step": 15066 + }, + { + "epoch": 0.18835470886772168, + "grad_norm": 3.157073974609375, + "learning_rate": 1.95281794161512e-05, + "loss": 0.5371, + "step": 15068 + }, + { + "epoch": 0.18837970949273733, + "grad_norm": 5.381820201873779, + "learning_rate": 1.952791448182219e-05, + "loss": 1.088, + "step": 15070 + }, + { + "epoch": 0.18840471011775295, + "grad_norm": 2.72337007522583, + "learning_rate": 1.952764947492993e-05, + "loss": 1.068, + "step": 15072 + }, + { + "epoch": 0.18842971074276857, + "grad_norm": 0.85989910364151, + "learning_rate": 1.9527384395476442e-05, + "loss": 0.2555, + "step": 15074 + }, + { + "epoch": 0.1884547113677842, + "grad_norm": 2.472327947616577, + "learning_rate": 1.952711924346374e-05, + "loss": 1.3232, + "step": 15076 + }, + { + "epoch": 0.1884797119927998, + "grad_norm": 3.733684539794922, + "learning_rate": 1.9526854018893844e-05, + "loss": 1.3147, + "step": 15078 + }, + { + "epoch": 0.18850471261781546, + "grad_norm": 0.0003341476258356124, + "learning_rate": 1.952658872176878e-05, + "loss": 0.6188, + "step": 15080 + }, + { + "epoch": 0.18852971324283108, + "grad_norm": 4.911625385284424, + "learning_rate": 1.952632335209056e-05, + "loss": 1.6486, + "step": 15082 + }, + { + "epoch": 0.1885547138678467, + "grad_norm": 2.0493688583374023, + "learning_rate": 1.9526057909861204e-05, + "loss": 0.9496, + "step": 15084 + }, + { + "epoch": 0.18857971449286232, + "grad_norm": 3.498648166656494, + "learning_rate": 1.9525792395082743e-05, + "loss": 1.1412, + "step": 15086 + }, + { + "epoch": 0.18860471511787794, + "grad_norm": 0.4668789505958557, + "learning_rate": 1.9525526807757194e-05, + "loss": 0.4357, + "step": 15088 + }, + { + "epoch": 0.18862971574289358, + "grad_norm": 3.358823299407959, + "learning_rate": 1.952526114788658e-05, + "loss": 0.2778, + "step": 15090 + }, + { + "epoch": 0.1886547163679092, + "grad_norm": 6.240701198577881, + "learning_rate": 1.9524995415472923e-05, + "loss": 0.5006, + "step": 15092 + }, + { + "epoch": 0.18867971699292482, + "grad_norm": 9.712586402893066, + "learning_rate": 1.952472961051825e-05, + "loss": 1.964, + "step": 15094 + }, + { + "epoch": 0.18870471761794044, + "grad_norm": 2.6158664226531982, + "learning_rate": 1.952446373302458e-05, + "loss": 0.7347, + "step": 15096 + }, + { + "epoch": 0.18872971824295606, + "grad_norm": 2.6800692081451416, + "learning_rate": 1.9524197782993943e-05, + "loss": 0.7207, + "step": 15098 + }, + { + "epoch": 0.1887547188679717, + "grad_norm": 3.8677685260772705, + "learning_rate": 1.9523931760428364e-05, + "loss": 1.0274, + "step": 15100 + }, + { + "epoch": 0.18877971949298733, + "grad_norm": 0.0004366387438494712, + "learning_rate": 1.9523665665329864e-05, + "loss": 0.1349, + "step": 15102 + }, + { + "epoch": 0.18880472011800295, + "grad_norm": 0.00025295602972619236, + "learning_rate": 1.9523399497700477e-05, + "loss": 1.4246, + "step": 15104 + }, + { + "epoch": 0.18882972074301857, + "grad_norm": 3.882216215133667, + "learning_rate": 1.9523133257542227e-05, + "loss": 1.4021, + "step": 15106 + }, + { + "epoch": 0.1888547213680342, + "grad_norm": 4.3885650634765625, + "learning_rate": 1.9522866944857134e-05, + "loss": 1.5865, + "step": 15108 + }, + { + "epoch": 0.18887972199304984, + "grad_norm": 5.6366868019104, + "learning_rate": 1.952260055964724e-05, + "loss": 1.6053, + "step": 15110 + }, + { + "epoch": 0.18890472261806546, + "grad_norm": 3.071467876434326, + "learning_rate": 1.952233410191456e-05, + "loss": 1.0149, + "step": 15112 + }, + { + "epoch": 0.18892972324308108, + "grad_norm": 5.9738969802856445, + "learning_rate": 1.9522067571661137e-05, + "loss": 0.9951, + "step": 15114 + }, + { + "epoch": 0.1889547238680967, + "grad_norm": 5.57064962387085, + "learning_rate": 1.952180096888899e-05, + "loss": 1.8863, + "step": 15116 + }, + { + "epoch": 0.18897972449311232, + "grad_norm": 4.537623882293701, + "learning_rate": 1.952153429360015e-05, + "loss": 1.0538, + "step": 15118 + }, + { + "epoch": 0.18900472511812796, + "grad_norm": 4.488608360290527, + "learning_rate": 1.952126754579666e-05, + "loss": 0.2256, + "step": 15120 + }, + { + "epoch": 0.18902972574314358, + "grad_norm": 2.5673391819000244, + "learning_rate": 1.9521000725480536e-05, + "loss": 1.4627, + "step": 15122 + }, + { + "epoch": 0.1890547263681592, + "grad_norm": 3.0956199169158936, + "learning_rate": 1.952073383265382e-05, + "loss": 1.765, + "step": 15124 + }, + { + "epoch": 0.18907972699317482, + "grad_norm": 3.2889583110809326, + "learning_rate": 1.952046686731854e-05, + "loss": 0.6781, + "step": 15126 + }, + { + "epoch": 0.18910472761819044, + "grad_norm": 3.834639072418213, + "learning_rate": 1.9520199829476733e-05, + "loss": 1.1726, + "step": 15128 + }, + { + "epoch": 0.1891297282432061, + "grad_norm": 3.9132866859436035, + "learning_rate": 1.9519932719130427e-05, + "loss": 0.6016, + "step": 15130 + }, + { + "epoch": 0.1891547288682217, + "grad_norm": 1.3250197172164917, + "learning_rate": 1.951966553628166e-05, + "loss": 0.698, + "step": 15132 + }, + { + "epoch": 0.18917972949323733, + "grad_norm": 7.705612659454346, + "learning_rate": 1.951939828093247e-05, + "loss": 0.8235, + "step": 15134 + }, + { + "epoch": 0.18920473011825295, + "grad_norm": 3.961451292037964, + "learning_rate": 1.9519130953084887e-05, + "loss": 0.8929, + "step": 15136 + }, + { + "epoch": 0.18922973074326857, + "grad_norm": 3.1971750259399414, + "learning_rate": 1.9518863552740948e-05, + "loss": 1.9692, + "step": 15138 + }, + { + "epoch": 0.18925473136828422, + "grad_norm": 0.9925752878189087, + "learning_rate": 1.9518596079902687e-05, + "loss": 0.8056, + "step": 15140 + }, + { + "epoch": 0.18927973199329984, + "grad_norm": 3.8237242698669434, + "learning_rate": 1.9518328534572148e-05, + "loss": 1.4951, + "step": 15142 + }, + { + "epoch": 0.18930473261831546, + "grad_norm": 1.5255515575408936, + "learning_rate": 1.9518060916751368e-05, + "loss": 0.283, + "step": 15144 + }, + { + "epoch": 0.18932973324333108, + "grad_norm": 3.0539650917053223, + "learning_rate": 1.951779322644238e-05, + "loss": 1.7327, + "step": 15146 + }, + { + "epoch": 0.1893547338683467, + "grad_norm": 0.8772318959236145, + "learning_rate": 1.951752546364722e-05, + "loss": 0.1776, + "step": 15148 + }, + { + "epoch": 0.18937973449336234, + "grad_norm": 4.095871448516846, + "learning_rate": 1.9517257628367935e-05, + "loss": 0.7348, + "step": 15150 + }, + { + "epoch": 0.18940473511837796, + "grad_norm": 0.000521522480994463, + "learning_rate": 1.9516989720606564e-05, + "loss": 0.4176, + "step": 15152 + }, + { + "epoch": 0.18942973574339358, + "grad_norm": 0.9973003268241882, + "learning_rate": 1.951672174036514e-05, + "loss": 0.2801, + "step": 15154 + }, + { + "epoch": 0.1894547363684092, + "grad_norm": 2.795497179031372, + "learning_rate": 1.9516453687645714e-05, + "loss": 0.929, + "step": 15156 + }, + { + "epoch": 0.18947973699342482, + "grad_norm": 5.070882320404053, + "learning_rate": 1.951618556245032e-05, + "loss": 1.8118, + "step": 15158 + }, + { + "epoch": 0.18950473761844047, + "grad_norm": 5.486221790313721, + "learning_rate": 1.9515917364781004e-05, + "loss": 0.5302, + "step": 15160 + }, + { + "epoch": 0.1895297382434561, + "grad_norm": 2.9639248847961426, + "learning_rate": 1.9515649094639805e-05, + "loss": 0.9802, + "step": 15162 + }, + { + "epoch": 0.1895547388684717, + "grad_norm": 3.8219265937805176, + "learning_rate": 1.951538075202877e-05, + "loss": 0.7446, + "step": 15164 + }, + { + "epoch": 0.18957973949348733, + "grad_norm": 5.352784156799316, + "learning_rate": 1.9515112336949942e-05, + "loss": 1.6782, + "step": 15166 + }, + { + "epoch": 0.18960474011850295, + "grad_norm": 0.0008320026681758463, + "learning_rate": 1.9514843849405364e-05, + "loss": 0.0855, + "step": 15168 + }, + { + "epoch": 0.1896297407435186, + "grad_norm": 5.5823283195495605, + "learning_rate": 1.9514575289397076e-05, + "loss": 0.6043, + "step": 15170 + }, + { + "epoch": 0.18965474136853422, + "grad_norm": 2.1589581966400146, + "learning_rate": 1.9514306656927133e-05, + "loss": 0.7018, + "step": 15172 + }, + { + "epoch": 0.18967974199354984, + "grad_norm": 3.3112661838531494, + "learning_rate": 1.9514037951997573e-05, + "loss": 0.3723, + "step": 15174 + }, + { + "epoch": 0.18970474261856546, + "grad_norm": 3.6972317695617676, + "learning_rate": 1.9513769174610446e-05, + "loss": 1.838, + "step": 15176 + }, + { + "epoch": 0.18972974324358108, + "grad_norm": 4.293310165405273, + "learning_rate": 1.95135003247678e-05, + "loss": 1.0478, + "step": 15178 + }, + { + "epoch": 0.18975474386859673, + "grad_norm": 2.7614006996154785, + "learning_rate": 1.951323140247168e-05, + "loss": 1.5081, + "step": 15180 + }, + { + "epoch": 0.18977974449361235, + "grad_norm": 3.530916213989258, + "learning_rate": 1.9512962407724134e-05, + "loss": 0.596, + "step": 15182 + }, + { + "epoch": 0.18980474511862797, + "grad_norm": 8.499629974365234, + "learning_rate": 1.9512693340527212e-05, + "loss": 1.0795, + "step": 15184 + }, + { + "epoch": 0.18982974574364359, + "grad_norm": 4.551486492156982, + "learning_rate": 1.9512424200882963e-05, + "loss": 1.4589, + "step": 15186 + }, + { + "epoch": 0.1898547463686592, + "grad_norm": 3.4019064903259277, + "learning_rate": 1.9512154988793435e-05, + "loss": 0.5723, + "step": 15188 + }, + { + "epoch": 0.18987974699367485, + "grad_norm": 9.154369354248047, + "learning_rate": 1.951188570426068e-05, + "loss": 1.9571, + "step": 15190 + }, + { + "epoch": 0.18990474761869047, + "grad_norm": 2.846923828125, + "learning_rate": 1.951161634728675e-05, + "loss": 0.9968, + "step": 15192 + }, + { + "epoch": 0.1899297482437061, + "grad_norm": 3.3483009338378906, + "learning_rate": 1.9511346917873694e-05, + "loss": 1.2248, + "step": 15194 + }, + { + "epoch": 0.1899547488687217, + "grad_norm": 3.9913201332092285, + "learning_rate": 1.9511077416023567e-05, + "loss": 0.8677, + "step": 15196 + }, + { + "epoch": 0.18997974949373733, + "grad_norm": 3.623288154602051, + "learning_rate": 1.9510807841738417e-05, + "loss": 1.5834, + "step": 15198 + }, + { + "epoch": 0.19000475011875298, + "grad_norm": 3.45104718208313, + "learning_rate": 1.95105381950203e-05, + "loss": 0.943, + "step": 15200 + }, + { + "epoch": 0.1900297507437686, + "grad_norm": 4.6982855796813965, + "learning_rate": 1.951026847587127e-05, + "loss": 1.4375, + "step": 15202 + }, + { + "epoch": 0.19005475136878422, + "grad_norm": 5.113043308258057, + "learning_rate": 1.950999868429338e-05, + "loss": 1.035, + "step": 15204 + }, + { + "epoch": 0.19007975199379984, + "grad_norm": 1.0015535354614258, + "learning_rate": 1.9509728820288685e-05, + "loss": 0.1651, + "step": 15206 + }, + { + "epoch": 0.19010475261881546, + "grad_norm": 4.690918922424316, + "learning_rate": 1.9509458883859238e-05, + "loss": 2.2546, + "step": 15208 + }, + { + "epoch": 0.1901297532438311, + "grad_norm": 2.718440055847168, + "learning_rate": 1.9509188875007098e-05, + "loss": 0.1966, + "step": 15210 + }, + { + "epoch": 0.19015475386884673, + "grad_norm": 11.480283737182617, + "learning_rate": 1.950891879373432e-05, + "loss": 1.1381, + "step": 15212 + }, + { + "epoch": 0.19017975449386235, + "grad_norm": 0.012019720859825611, + "learning_rate": 1.9508648640042963e-05, + "loss": 0.0697, + "step": 15214 + }, + { + "epoch": 0.19020475511887797, + "grad_norm": 0.02661658637225628, + "learning_rate": 1.9508378413935082e-05, + "loss": 0.1754, + "step": 15216 + }, + { + "epoch": 0.19022975574389359, + "grad_norm": 5.668783187866211, + "learning_rate": 1.9508108115412734e-05, + "loss": 1.7481, + "step": 15218 + }, + { + "epoch": 0.19025475636890923, + "grad_norm": 2.0682315826416016, + "learning_rate": 1.950783774447798e-05, + "loss": 0.8665, + "step": 15220 + }, + { + "epoch": 0.19027975699392485, + "grad_norm": 3.571038007736206, + "learning_rate": 1.950756730113288e-05, + "loss": 0.8601, + "step": 15222 + }, + { + "epoch": 0.19030475761894047, + "grad_norm": 2.607980489730835, + "learning_rate": 1.9507296785379487e-05, + "loss": 0.5143, + "step": 15224 + }, + { + "epoch": 0.1903297582439561, + "grad_norm": 6.974945545196533, + "learning_rate": 1.950702619721987e-05, + "loss": 1.6309, + "step": 15226 + }, + { + "epoch": 0.1903547588689717, + "grad_norm": 0.007892695255577564, + "learning_rate": 1.9506755536656085e-05, + "loss": 0.0001, + "step": 15228 + }, + { + "epoch": 0.19037975949398736, + "grad_norm": 2.7994916439056396, + "learning_rate": 1.9506484803690197e-05, + "loss": 0.9106, + "step": 15230 + }, + { + "epoch": 0.19040476011900298, + "grad_norm": 6.1945953369140625, + "learning_rate": 1.950621399832426e-05, + "loss": 1.3377, + "step": 15232 + }, + { + "epoch": 0.1904297607440186, + "grad_norm": 0.0033443807624280453, + "learning_rate": 1.9505943120560344e-05, + "loss": 0.6516, + "step": 15234 + }, + { + "epoch": 0.19045476136903422, + "grad_norm": 7.652204513549805, + "learning_rate": 1.950567217040051e-05, + "loss": 1.4, + "step": 15236 + }, + { + "epoch": 0.19047976199404984, + "grad_norm": 4.7131242752075195, + "learning_rate": 1.950540114784682e-05, + "loss": 1.4847, + "step": 15238 + }, + { + "epoch": 0.1905047626190655, + "grad_norm": 3.0607218742370605, + "learning_rate": 1.950513005290134e-05, + "loss": 0.902, + "step": 15240 + }, + { + "epoch": 0.1905297632440811, + "grad_norm": 1.2674602270126343, + "learning_rate": 1.9504858885566135e-05, + "loss": 0.5782, + "step": 15242 + }, + { + "epoch": 0.19055476386909673, + "grad_norm": 1.765793800354004, + "learning_rate": 1.9504587645843263e-05, + "loss": 1.1787, + "step": 15244 + }, + { + "epoch": 0.19057976449411235, + "grad_norm": 3.6094038486480713, + "learning_rate": 1.9504316333734802e-05, + "loss": 0.5055, + "step": 15246 + }, + { + "epoch": 0.19060476511912797, + "grad_norm": 6.214005947113037, + "learning_rate": 1.9504044949242812e-05, + "loss": 1.1892, + "step": 15248 + }, + { + "epoch": 0.19062976574414361, + "grad_norm": 3.454000949859619, + "learning_rate": 1.9503773492369357e-05, + "loss": 0.7084, + "step": 15250 + }, + { + "epoch": 0.19065476636915923, + "grad_norm": 6.4313273429870605, + "learning_rate": 1.9503501963116505e-05, + "loss": 0.4882, + "step": 15252 + }, + { + "epoch": 0.19067976699417485, + "grad_norm": 0.005785051733255386, + "learning_rate": 1.950323036148633e-05, + "loss": 0.6015, + "step": 15254 + }, + { + "epoch": 0.19070476761919047, + "grad_norm": 4.252932071685791, + "learning_rate": 1.9502958687480893e-05, + "loss": 1.0372, + "step": 15256 + }, + { + "epoch": 0.1907297682442061, + "grad_norm": 2.221992254257202, + "learning_rate": 1.9502686941102265e-05, + "loss": 0.5946, + "step": 15258 + }, + { + "epoch": 0.19075476886922174, + "grad_norm": 2.9929914474487305, + "learning_rate": 1.9502415122352523e-05, + "loss": 0.7817, + "step": 15260 + }, + { + "epoch": 0.19077976949423736, + "grad_norm": 3.3708505630493164, + "learning_rate": 1.950214323123373e-05, + "loss": 0.6426, + "step": 15262 + }, + { + "epoch": 0.19080477011925298, + "grad_norm": 0.14490102231502533, + "learning_rate": 1.9501871267747953e-05, + "loss": 0.4013, + "step": 15264 + }, + { + "epoch": 0.1908297707442686, + "grad_norm": 3.0523335933685303, + "learning_rate": 1.9501599231897272e-05, + "loss": 1.3933, + "step": 15266 + }, + { + "epoch": 0.19085477136928422, + "grad_norm": 2.6741368770599365, + "learning_rate": 1.9501327123683754e-05, + "loss": 1.3118, + "step": 15268 + }, + { + "epoch": 0.19087977199429987, + "grad_norm": 3.160078287124634, + "learning_rate": 1.950105494310947e-05, + "loss": 0.509, + "step": 15270 + }, + { + "epoch": 0.1909047726193155, + "grad_norm": 1.8171849250793457, + "learning_rate": 1.95007826901765e-05, + "loss": 0.4976, + "step": 15272 + }, + { + "epoch": 0.1909297732443311, + "grad_norm": 3.325343608856201, + "learning_rate": 1.950051036488691e-05, + "loss": 1.2027, + "step": 15274 + }, + { + "epoch": 0.19095477386934673, + "grad_norm": 2.0444717407226562, + "learning_rate": 1.9500237967242777e-05, + "loss": 0.6226, + "step": 15276 + }, + { + "epoch": 0.19097977449436235, + "grad_norm": 1.6861612796783447, + "learning_rate": 1.9499965497246173e-05, + "loss": 0.5127, + "step": 15278 + }, + { + "epoch": 0.191004775119378, + "grad_norm": 1.971110463142395, + "learning_rate": 1.949969295489918e-05, + "loss": 0.118, + "step": 15280 + }, + { + "epoch": 0.19102977574439362, + "grad_norm": 2.1405982971191406, + "learning_rate": 1.9499420340203864e-05, + "loss": 2.0146, + "step": 15282 + }, + { + "epoch": 0.19105477636940923, + "grad_norm": 2.6119329929351807, + "learning_rate": 1.9499147653162308e-05, + "loss": 0.9727, + "step": 15284 + }, + { + "epoch": 0.19107977699442485, + "grad_norm": 3.6940255165100098, + "learning_rate": 1.9498874893776585e-05, + "loss": 1.559, + "step": 15286 + }, + { + "epoch": 0.19110477761944047, + "grad_norm": 2.3454184532165527, + "learning_rate": 1.9498602062048777e-05, + "loss": 0.8477, + "step": 15288 + }, + { + "epoch": 0.19112977824445612, + "grad_norm": 4.525667667388916, + "learning_rate": 1.9498329157980957e-05, + "loss": 0.8231, + "step": 15290 + }, + { + "epoch": 0.19115477886947174, + "grad_norm": 5.00078010559082, + "learning_rate": 1.9498056181575206e-05, + "loss": 1.5035, + "step": 15292 + }, + { + "epoch": 0.19117977949448736, + "grad_norm": 2.786754608154297, + "learning_rate": 1.9497783132833603e-05, + "loss": 0.804, + "step": 15294 + }, + { + "epoch": 0.19120478011950298, + "grad_norm": 2.2919247150421143, + "learning_rate": 1.9497510011758225e-05, + "loss": 1.2506, + "step": 15296 + }, + { + "epoch": 0.1912297807445186, + "grad_norm": 4.606576442718506, + "learning_rate": 1.9497236818351157e-05, + "loss": 0.9159, + "step": 15298 + }, + { + "epoch": 0.19125478136953425, + "grad_norm": 4.521635055541992, + "learning_rate": 1.949696355261447e-05, + "loss": 0.3042, + "step": 15300 + }, + { + "epoch": 0.19127978199454987, + "grad_norm": 2.9154441356658936, + "learning_rate": 1.949669021455026e-05, + "loss": 1.0324, + "step": 15302 + }, + { + "epoch": 0.1913047826195655, + "grad_norm": 0.1688559204339981, + "learning_rate": 1.9496416804160594e-05, + "loss": 0.7216, + "step": 15304 + }, + { + "epoch": 0.1913297832445811, + "grad_norm": 0.004574374761432409, + "learning_rate": 1.9496143321447562e-05, + "loss": 0.2702, + "step": 15306 + }, + { + "epoch": 0.19135478386959673, + "grad_norm": 0.7009148597717285, + "learning_rate": 1.9495869766413245e-05, + "loss": 0.3954, + "step": 15308 + }, + { + "epoch": 0.19137978449461238, + "grad_norm": 2.5969655513763428, + "learning_rate": 1.9495596139059726e-05, + "loss": 0.5511, + "step": 15310 + }, + { + "epoch": 0.191404785119628, + "grad_norm": 2.900204658508301, + "learning_rate": 1.949532243938909e-05, + "loss": 0.0994, + "step": 15312 + }, + { + "epoch": 0.19142978574464362, + "grad_norm": 4.242552280426025, + "learning_rate": 1.9495048667403426e-05, + "loss": 0.4984, + "step": 15314 + }, + { + "epoch": 0.19145478636965924, + "grad_norm": 0.05292050540447235, + "learning_rate": 1.949477482310481e-05, + "loss": 0.7372, + "step": 15316 + }, + { + "epoch": 0.19147978699467486, + "grad_norm": 2.8467206954956055, + "learning_rate": 1.9494500906495332e-05, + "loss": 2.4072, + "step": 15318 + }, + { + "epoch": 0.1915047876196905, + "grad_norm": 0.022677630186080933, + "learning_rate": 1.9494226917577078e-05, + "loss": 0.0002, + "step": 15320 + }, + { + "epoch": 0.19152978824470612, + "grad_norm": 3.6431572437286377, + "learning_rate": 1.9493952856352135e-05, + "loss": 1.0518, + "step": 15322 + }, + { + "epoch": 0.19155478886972174, + "grad_norm": 2.6873247623443604, + "learning_rate": 1.9493678722822585e-05, + "loss": 0.9417, + "step": 15324 + }, + { + "epoch": 0.19157978949473736, + "grad_norm": 6.554098129272461, + "learning_rate": 1.9493404516990527e-05, + "loss": 1.1989, + "step": 15326 + }, + { + "epoch": 0.19160479011975298, + "grad_norm": 3.7977681159973145, + "learning_rate": 1.9493130238858036e-05, + "loss": 0.6928, + "step": 15328 + }, + { + "epoch": 0.19162979074476863, + "grad_norm": 0.00877221580594778, + "learning_rate": 1.9492855888427215e-05, + "loss": 0.4793, + "step": 15330 + }, + { + "epoch": 0.19165479136978425, + "grad_norm": 3.36863374710083, + "learning_rate": 1.949258146570014e-05, + "loss": 0.7734, + "step": 15332 + }, + { + "epoch": 0.19167979199479987, + "grad_norm": 0.005376218818128109, + "learning_rate": 1.949230697067891e-05, + "loss": 0.0098, + "step": 15334 + }, + { + "epoch": 0.1917047926198155, + "grad_norm": 1.9345307350158691, + "learning_rate": 1.9492032403365614e-05, + "loss": 1.3143, + "step": 15336 + }, + { + "epoch": 0.1917297932448311, + "grad_norm": 2.7828927040100098, + "learning_rate": 1.949175776376234e-05, + "loss": 0.7821, + "step": 15338 + }, + { + "epoch": 0.19175479386984676, + "grad_norm": 3.073216199874878, + "learning_rate": 1.9491483051871178e-05, + "loss": 1.2077, + "step": 15340 + }, + { + "epoch": 0.19177979449486238, + "grad_norm": 2.3091132640838623, + "learning_rate": 1.9491208267694225e-05, + "loss": 0.3754, + "step": 15342 + }, + { + "epoch": 0.191804795119878, + "grad_norm": 0.0032086793798953295, + "learning_rate": 1.949093341123357e-05, + "loss": 0.3845, + "step": 15344 + }, + { + "epoch": 0.19182979574489362, + "grad_norm": 14.370248794555664, + "learning_rate": 1.9490658482491313e-05, + "loss": 2.0855, + "step": 15346 + }, + { + "epoch": 0.19185479636990924, + "grad_norm": 3.8172175884246826, + "learning_rate": 1.9490383481469542e-05, + "loss": 0.67, + "step": 15348 + }, + { + "epoch": 0.19187979699492488, + "grad_norm": 5.1745452880859375, + "learning_rate": 1.949010840817035e-05, + "loss": 1.2013, + "step": 15350 + }, + { + "epoch": 0.1919047976199405, + "grad_norm": 2.9104137420654297, + "learning_rate": 1.9489833262595838e-05, + "loss": 1.1986, + "step": 15352 + }, + { + "epoch": 0.19192979824495612, + "grad_norm": 2.1282944679260254, + "learning_rate": 1.9489558044748092e-05, + "loss": 0.7805, + "step": 15354 + }, + { + "epoch": 0.19195479886997174, + "grad_norm": 3.7242541313171387, + "learning_rate": 1.948928275462922e-05, + "loss": 1.8889, + "step": 15356 + }, + { + "epoch": 0.19197979949498736, + "grad_norm": 3.2968497276306152, + "learning_rate": 1.9489007392241312e-05, + "loss": 1.0597, + "step": 15358 + }, + { + "epoch": 0.192004800120003, + "grad_norm": 4.8106608390808105, + "learning_rate": 1.948873195758646e-05, + "loss": 1.1698, + "step": 15360 + }, + { + "epoch": 0.19202980074501863, + "grad_norm": 0.04929207265377045, + "learning_rate": 1.948845645066677e-05, + "loss": 0.8576, + "step": 15362 + }, + { + "epoch": 0.19205480137003425, + "grad_norm": 3.4183907508850098, + "learning_rate": 1.948818087148434e-05, + "loss": 0.5084, + "step": 15364 + }, + { + "epoch": 0.19207980199504987, + "grad_norm": 3.1272637844085693, + "learning_rate": 1.9487905220041264e-05, + "loss": 1.4042, + "step": 15366 + }, + { + "epoch": 0.1921048026200655, + "grad_norm": 2.816216468811035, + "learning_rate": 1.9487629496339646e-05, + "loss": 0.8204, + "step": 15368 + }, + { + "epoch": 0.19212980324508114, + "grad_norm": 3.3190016746520996, + "learning_rate": 1.948735370038158e-05, + "loss": 1.1008, + "step": 15370 + }, + { + "epoch": 0.19215480387009676, + "grad_norm": 3.251804828643799, + "learning_rate": 1.948707783216917e-05, + "loss": 0.9982, + "step": 15372 + }, + { + "epoch": 0.19217980449511238, + "grad_norm": 4.2961273193359375, + "learning_rate": 1.948680189170452e-05, + "loss": 1.1755, + "step": 15374 + }, + { + "epoch": 0.192204805120128, + "grad_norm": 2.3572328090667725, + "learning_rate": 1.9486525878989727e-05, + "loss": 2.029, + "step": 15376 + }, + { + "epoch": 0.19222980574514362, + "grad_norm": 2.1534385681152344, + "learning_rate": 1.9486249794026893e-05, + "loss": 1.4366, + "step": 15378 + }, + { + "epoch": 0.19225480637015926, + "grad_norm": 2.3334522247314453, + "learning_rate": 1.9485973636818122e-05, + "loss": 0.4563, + "step": 15380 + }, + { + "epoch": 0.19227980699517488, + "grad_norm": 2.261148452758789, + "learning_rate": 1.9485697407365518e-05, + "loss": 0.8624, + "step": 15382 + }, + { + "epoch": 0.1923048076201905, + "grad_norm": 2.963495969772339, + "learning_rate": 1.9485421105671184e-05, + "loss": 0.5722, + "step": 15384 + }, + { + "epoch": 0.19232980824520612, + "grad_norm": 1.1214957237243652, + "learning_rate": 1.9485144731737226e-05, + "loss": 0.4788, + "step": 15386 + }, + { + "epoch": 0.19235480887022174, + "grad_norm": 0.29438695311546326, + "learning_rate": 1.9484868285565746e-05, + "loss": 0.0208, + "step": 15388 + }, + { + "epoch": 0.1923798094952374, + "grad_norm": 3.5069756507873535, + "learning_rate": 1.948459176715885e-05, + "loss": 1.3849, + "step": 15390 + }, + { + "epoch": 0.192404810120253, + "grad_norm": 3.329827308654785, + "learning_rate": 1.948431517651864e-05, + "loss": 0.8357, + "step": 15392 + }, + { + "epoch": 0.19242981074526863, + "grad_norm": 3.327632188796997, + "learning_rate": 1.9484038513647233e-05, + "loss": 1.6396, + "step": 15394 + }, + { + "epoch": 0.19245481137028425, + "grad_norm": 4.735336780548096, + "learning_rate": 1.9483761778546726e-05, + "loss": 1.1889, + "step": 15396 + }, + { + "epoch": 0.19247981199529987, + "grad_norm": 0.01504423562437296, + "learning_rate": 1.9483484971219236e-05, + "loss": 0.5706, + "step": 15398 + }, + { + "epoch": 0.19250481262031552, + "grad_norm": 0.8343683481216431, + "learning_rate": 1.948320809166686e-05, + "loss": 1.001, + "step": 15400 + }, + { + "epoch": 0.19252981324533114, + "grad_norm": 2.561016321182251, + "learning_rate": 1.9482931139891714e-05, + "loss": 1.0845, + "step": 15402 + }, + { + "epoch": 0.19255481387034676, + "grad_norm": 5.612203121185303, + "learning_rate": 1.9482654115895906e-05, + "loss": 1.1901, + "step": 15404 + }, + { + "epoch": 0.19257981449536238, + "grad_norm": 4.1200971603393555, + "learning_rate": 1.9482377019681545e-05, + "loss": 1.7589, + "step": 15406 + }, + { + "epoch": 0.192604815120378, + "grad_norm": 3.7035114765167236, + "learning_rate": 1.948209985125074e-05, + "loss": 0.7457, + "step": 15408 + }, + { + "epoch": 0.19262981574539365, + "grad_norm": 3.656672477722168, + "learning_rate": 1.9481822610605605e-05, + "loss": 0.9558, + "step": 15410 + }, + { + "epoch": 0.19265481637040927, + "grad_norm": 4.1874308586120605, + "learning_rate": 1.948154529774825e-05, + "loss": 1.2721, + "step": 15412 + }, + { + "epoch": 0.19267981699542489, + "grad_norm": 6.425839900970459, + "learning_rate": 1.9481267912680785e-05, + "loss": 1.2667, + "step": 15414 + }, + { + "epoch": 0.1927048176204405, + "grad_norm": 5.466951370239258, + "learning_rate": 1.9480990455405327e-05, + "loss": 0.5866, + "step": 15416 + }, + { + "epoch": 0.19272981824545612, + "grad_norm": 2.895120620727539, + "learning_rate": 1.9480712925923987e-05, + "loss": 0.6664, + "step": 15418 + }, + { + "epoch": 0.19275481887047177, + "grad_norm": 3.2524449825286865, + "learning_rate": 1.9480435324238872e-05, + "loss": 0.872, + "step": 15420 + }, + { + "epoch": 0.1927798194954874, + "grad_norm": 0.6638497710227966, + "learning_rate": 1.9480157650352106e-05, + "loss": 0.6618, + "step": 15422 + }, + { + "epoch": 0.192804820120503, + "grad_norm": 4.982268810272217, + "learning_rate": 1.9479879904265804e-05, + "loss": 0.8234, + "step": 15424 + }, + { + "epoch": 0.19282982074551863, + "grad_norm": 2.6789469718933105, + "learning_rate": 1.947960208598207e-05, + "loss": 1.2048, + "step": 15426 + }, + { + "epoch": 0.19285482137053425, + "grad_norm": 2.3024513721466064, + "learning_rate": 1.9479324195503035e-05, + "loss": 0.5515, + "step": 15428 + }, + { + "epoch": 0.1928798219955499, + "grad_norm": 1.824635624885559, + "learning_rate": 1.94790462328308e-05, + "loss": 0.8782, + "step": 15430 + }, + { + "epoch": 0.19290482262056552, + "grad_norm": 1.9983510971069336, + "learning_rate": 1.9478768197967493e-05, + "loss": 0.7128, + "step": 15432 + }, + { + "epoch": 0.19292982324558114, + "grad_norm": 5.106515407562256, + "learning_rate": 1.9478490090915227e-05, + "loss": 1.3187, + "step": 15434 + }, + { + "epoch": 0.19295482387059676, + "grad_norm": 2.4357457160949707, + "learning_rate": 1.9478211911676123e-05, + "loss": 0.4823, + "step": 15436 + }, + { + "epoch": 0.19297982449561238, + "grad_norm": 0.7494646906852722, + "learning_rate": 1.947793366025229e-05, + "loss": 0.0444, + "step": 15438 + }, + { + "epoch": 0.19300482512062803, + "grad_norm": 3.923102617263794, + "learning_rate": 1.9477655336645864e-05, + "loss": 0.6473, + "step": 15440 + }, + { + "epoch": 0.19302982574564365, + "grad_norm": 0.10215801745653152, + "learning_rate": 1.947737694085895e-05, + "loss": 0.0436, + "step": 15442 + }, + { + "epoch": 0.19305482637065927, + "grad_norm": 0.3279931843280792, + "learning_rate": 1.9477098472893675e-05, + "loss": 0.0461, + "step": 15444 + }, + { + "epoch": 0.19307982699567489, + "grad_norm": 3.4232561588287354, + "learning_rate": 1.9476819932752158e-05, + "loss": 0.7097, + "step": 15446 + }, + { + "epoch": 0.1931048276206905, + "grad_norm": 2.5228161811828613, + "learning_rate": 1.9476541320436522e-05, + "loss": 0.7019, + "step": 15448 + }, + { + "epoch": 0.19312982824570615, + "grad_norm": 1.828261375427246, + "learning_rate": 1.9476262635948882e-05, + "loss": 0.575, + "step": 15450 + }, + { + "epoch": 0.19315482887072177, + "grad_norm": 4.806243419647217, + "learning_rate": 1.9475983879291372e-05, + "loss": 0.4364, + "step": 15452 + }, + { + "epoch": 0.1931798294957374, + "grad_norm": 4.290428161621094, + "learning_rate": 1.947570505046611e-05, + "loss": 0.9747, + "step": 15454 + }, + { + "epoch": 0.193204830120753, + "grad_norm": 2.791534900665283, + "learning_rate": 1.947542614947521e-05, + "loss": 0.3736, + "step": 15456 + }, + { + "epoch": 0.19322983074576863, + "grad_norm": 0.007393004838377237, + "learning_rate": 1.947514717632081e-05, + "loss": 0.7488, + "step": 15458 + }, + { + "epoch": 0.19325483137078428, + "grad_norm": 0.007235164288431406, + "learning_rate": 1.9474868131005027e-05, + "loss": 0.7237, + "step": 15460 + }, + { + "epoch": 0.1932798319957999, + "grad_norm": 2.127828598022461, + "learning_rate": 1.947458901352999e-05, + "loss": 0.3031, + "step": 15462 + }, + { + "epoch": 0.19330483262081552, + "grad_norm": 4.007054328918457, + "learning_rate": 1.9474309823897823e-05, + "loss": 1.7829, + "step": 15464 + }, + { + "epoch": 0.19332983324583114, + "grad_norm": 1.0798656940460205, + "learning_rate": 1.947403056211065e-05, + "loss": 0.8243, + "step": 15466 + }, + { + "epoch": 0.19335483387084676, + "grad_norm": 2.8949673175811768, + "learning_rate": 1.9473751228170604e-05, + "loss": 0.4912, + "step": 15468 + }, + { + "epoch": 0.1933798344958624, + "grad_norm": 4.042862415313721, + "learning_rate": 1.9473471822079807e-05, + "loss": 1.2434, + "step": 15470 + }, + { + "epoch": 0.19340483512087803, + "grad_norm": 2.177366018295288, + "learning_rate": 1.9473192343840384e-05, + "loss": 1.2934, + "step": 15472 + }, + { + "epoch": 0.19342983574589365, + "grad_norm": 3.0349369049072266, + "learning_rate": 1.9472912793454472e-05, + "loss": 1.173, + "step": 15474 + }, + { + "epoch": 0.19345483637090927, + "grad_norm": 3.6332554817199707, + "learning_rate": 1.9472633170924192e-05, + "loss": 0.8088, + "step": 15476 + }, + { + "epoch": 0.1934798369959249, + "grad_norm": 3.6316378116607666, + "learning_rate": 1.947235347625168e-05, + "loss": 1.8489, + "step": 15478 + }, + { + "epoch": 0.19350483762094053, + "grad_norm": 3.0059192180633545, + "learning_rate": 1.9472073709439064e-05, + "loss": 0.4624, + "step": 15480 + }, + { + "epoch": 0.19352983824595615, + "grad_norm": 2.874600887298584, + "learning_rate": 1.9471793870488475e-05, + "loss": 0.6461, + "step": 15482 + }, + { + "epoch": 0.19355483887097177, + "grad_norm": 1.5507066249847412, + "learning_rate": 1.9471513959402043e-05, + "loss": 0.9928, + "step": 15484 + }, + { + "epoch": 0.1935798394959874, + "grad_norm": 3.9432666301727295, + "learning_rate": 1.9471233976181895e-05, + "loss": 1.8052, + "step": 15486 + }, + { + "epoch": 0.193604840121003, + "grad_norm": 2.958035469055176, + "learning_rate": 1.9470953920830173e-05, + "loss": 0.3236, + "step": 15488 + }, + { + "epoch": 0.19362984074601866, + "grad_norm": 0.9630446434020996, + "learning_rate": 1.9470673793349008e-05, + "loss": 0.8224, + "step": 15490 + }, + { + "epoch": 0.19365484137103428, + "grad_norm": 1.537521481513977, + "learning_rate": 1.9470393593740524e-05, + "loss": 0.5862, + "step": 15492 + }, + { + "epoch": 0.1936798419960499, + "grad_norm": 3.4097719192504883, + "learning_rate": 1.9470113322006863e-05, + "loss": 0.782, + "step": 15494 + }, + { + "epoch": 0.19370484262106552, + "grad_norm": 2.795006513595581, + "learning_rate": 1.9469832978150163e-05, + "loss": 1.2509, + "step": 15496 + }, + { + "epoch": 0.19372984324608114, + "grad_norm": 5.956544399261475, + "learning_rate": 1.946955256217255e-05, + "loss": 1.4058, + "step": 15498 + }, + { + "epoch": 0.1937548438710968, + "grad_norm": 5.462111473083496, + "learning_rate": 1.9469272074076165e-05, + "loss": 1.324, + "step": 15500 + }, + { + "epoch": 0.1937798444961124, + "grad_norm": 3.7228589057922363, + "learning_rate": 1.9468991513863144e-05, + "loss": 0.6341, + "step": 15502 + }, + { + "epoch": 0.19380484512112803, + "grad_norm": 3.6208229064941406, + "learning_rate": 1.946871088153562e-05, + "loss": 0.7314, + "step": 15504 + }, + { + "epoch": 0.19382984574614365, + "grad_norm": 4.404168605804443, + "learning_rate": 1.9468430177095735e-05, + "loss": 0.8693, + "step": 15506 + }, + { + "epoch": 0.19385484637115927, + "grad_norm": 0.14816409349441528, + "learning_rate": 1.9468149400545623e-05, + "loss": 0.3056, + "step": 15508 + }, + { + "epoch": 0.19387984699617491, + "grad_norm": 3.5862369537353516, + "learning_rate": 1.9467868551887427e-05, + "loss": 0.6686, + "step": 15510 + }, + { + "epoch": 0.19390484762119053, + "grad_norm": 3.6912176609039307, + "learning_rate": 1.946758763112328e-05, + "loss": 0.8543, + "step": 15512 + }, + { + "epoch": 0.19392984824620615, + "grad_norm": 1.4192341566085815, + "learning_rate": 1.9467306638255327e-05, + "loss": 0.9561, + "step": 15514 + }, + { + "epoch": 0.19395484887122177, + "grad_norm": 2.7726473808288574, + "learning_rate": 1.9467025573285702e-05, + "loss": 0.7468, + "step": 15516 + }, + { + "epoch": 0.1939798494962374, + "grad_norm": 0.26574844121932983, + "learning_rate": 1.9466744436216552e-05, + "loss": 0.436, + "step": 15518 + }, + { + "epoch": 0.19400485012125304, + "grad_norm": 4.01760196685791, + "learning_rate": 1.9466463227050013e-05, + "loss": 0.744, + "step": 15520 + }, + { + "epoch": 0.19402985074626866, + "grad_norm": 5.749754428863525, + "learning_rate": 1.946618194578823e-05, + "loss": 1.2847, + "step": 15522 + }, + { + "epoch": 0.19405485137128428, + "grad_norm": 4.265073776245117, + "learning_rate": 1.9465900592433345e-05, + "loss": 0.7707, + "step": 15524 + }, + { + "epoch": 0.1940798519962999, + "grad_norm": 3.888749599456787, + "learning_rate": 1.9465619166987498e-05, + "loss": 1.7874, + "step": 15526 + }, + { + "epoch": 0.19410485262131552, + "grad_norm": 11.797565460205078, + "learning_rate": 1.9465337669452835e-05, + "loss": 0.8411, + "step": 15528 + }, + { + "epoch": 0.19412985324633117, + "grad_norm": 3.3867697715759277, + "learning_rate": 1.9465056099831497e-05, + "loss": 1.5742, + "step": 15530 + }, + { + "epoch": 0.1941548538713468, + "grad_norm": 5.056527137756348, + "learning_rate": 1.9464774458125633e-05, + "loss": 1.1704, + "step": 15532 + }, + { + "epoch": 0.1941798544963624, + "grad_norm": 3.112163543701172, + "learning_rate": 1.9464492744337383e-05, + "loss": 1.3559, + "step": 15534 + }, + { + "epoch": 0.19420485512137803, + "grad_norm": 5.312311172485352, + "learning_rate": 1.9464210958468896e-05, + "loss": 0.9946, + "step": 15536 + }, + { + "epoch": 0.19422985574639365, + "grad_norm": 6.955366611480713, + "learning_rate": 1.9463929100522314e-05, + "loss": 1.2383, + "step": 15538 + }, + { + "epoch": 0.1942548563714093, + "grad_norm": 1.9034287929534912, + "learning_rate": 1.946364717049979e-05, + "loss": 0.0836, + "step": 15540 + }, + { + "epoch": 0.19427985699642492, + "grad_norm": 1.6888738870620728, + "learning_rate": 1.9463365168403467e-05, + "loss": 0.4866, + "step": 15542 + }, + { + "epoch": 0.19430485762144054, + "grad_norm": 2.821985960006714, + "learning_rate": 1.946308309423549e-05, + "loss": 0.6688, + "step": 15544 + }, + { + "epoch": 0.19432985824645616, + "grad_norm": 6.830864906311035, + "learning_rate": 1.9462800947998014e-05, + "loss": 2.6573, + "step": 15546 + }, + { + "epoch": 0.19435485887147178, + "grad_norm": 0.0007018379401415586, + "learning_rate": 1.946251872969318e-05, + "loss": 0.4842, + "step": 15548 + }, + { + "epoch": 0.19437985949648742, + "grad_norm": 0.0007623512065038085, + "learning_rate": 1.9462236439323145e-05, + "loss": 0.3531, + "step": 15550 + }, + { + "epoch": 0.19440486012150304, + "grad_norm": 3.246185779571533, + "learning_rate": 1.946195407689005e-05, + "loss": 1.5128, + "step": 15552 + }, + { + "epoch": 0.19442986074651866, + "grad_norm": 3.255544424057007, + "learning_rate": 1.946167164239606e-05, + "loss": 0.5783, + "step": 15554 + }, + { + "epoch": 0.19445486137153428, + "grad_norm": 1.1046544313430786, + "learning_rate": 1.946138913584331e-05, + "loss": 0.0305, + "step": 15556 + }, + { + "epoch": 0.1944798619965499, + "grad_norm": 5.911269187927246, + "learning_rate": 1.9461106557233962e-05, + "loss": 1.1125, + "step": 15558 + }, + { + "epoch": 0.19450486262156555, + "grad_norm": 5.68609619140625, + "learning_rate": 1.9460823906570163e-05, + "loss": 0.8904, + "step": 15560 + }, + { + "epoch": 0.19452986324658117, + "grad_norm": 3.0660202503204346, + "learning_rate": 1.9460541183854064e-05, + "loss": 0.9024, + "step": 15562 + }, + { + "epoch": 0.1945548638715968, + "grad_norm": 3.8443870544433594, + "learning_rate": 1.9460258389087826e-05, + "loss": 0.909, + "step": 15564 + }, + { + "epoch": 0.1945798644966124, + "grad_norm": 2.7222723960876465, + "learning_rate": 1.945997552227359e-05, + "loss": 0.488, + "step": 15566 + }, + { + "epoch": 0.19460486512162803, + "grad_norm": 2.7001819610595703, + "learning_rate": 1.9459692583413527e-05, + "loss": 1.4303, + "step": 15568 + }, + { + "epoch": 0.19462986574664368, + "grad_norm": 2.3575329780578613, + "learning_rate": 1.945940957250978e-05, + "loss": 0.8571, + "step": 15570 + }, + { + "epoch": 0.1946548663716593, + "grad_norm": 3.227552890777588, + "learning_rate": 1.9459126489564507e-05, + "loss": 1.1085, + "step": 15572 + }, + { + "epoch": 0.19467986699667492, + "grad_norm": 4.213550567626953, + "learning_rate": 1.9458843334579866e-05, + "loss": 1.4164, + "step": 15574 + }, + { + "epoch": 0.19470486762169054, + "grad_norm": 3.913025140762329, + "learning_rate": 1.945856010755801e-05, + "loss": 0.8658, + "step": 15576 + }, + { + "epoch": 0.19472986824670616, + "grad_norm": 0.7190417051315308, + "learning_rate": 1.9458276808501097e-05, + "loss": 0.4913, + "step": 15578 + }, + { + "epoch": 0.1947548688717218, + "grad_norm": 2.3515777587890625, + "learning_rate": 1.9457993437411287e-05, + "loss": 0.2076, + "step": 15580 + }, + { + "epoch": 0.19477986949673742, + "grad_norm": 3.263988971710205, + "learning_rate": 1.9457709994290733e-05, + "loss": 1.0478, + "step": 15582 + }, + { + "epoch": 0.19480487012175304, + "grad_norm": 5.653043270111084, + "learning_rate": 1.9457426479141604e-05, + "loss": 0.9792, + "step": 15584 + }, + { + "epoch": 0.19482987074676866, + "grad_norm": 2.8921432495117188, + "learning_rate": 1.9457142891966048e-05, + "loss": 0.4499, + "step": 15586 + }, + { + "epoch": 0.19485487137178428, + "grad_norm": 4.106485843658447, + "learning_rate": 1.945685923276623e-05, + "loss": 0.896, + "step": 15588 + }, + { + "epoch": 0.19487987199679993, + "grad_norm": 2.339489459991455, + "learning_rate": 1.945657550154431e-05, + "loss": 0.9625, + "step": 15590 + }, + { + "epoch": 0.19490487262181555, + "grad_norm": 11.52040958404541, + "learning_rate": 1.9456291698302446e-05, + "loss": 0.1913, + "step": 15592 + }, + { + "epoch": 0.19492987324683117, + "grad_norm": 3.176969528198242, + "learning_rate": 1.94560078230428e-05, + "loss": 0.6045, + "step": 15594 + }, + { + "epoch": 0.1949548738718468, + "grad_norm": 4.10851526260376, + "learning_rate": 1.9455723875767537e-05, + "loss": 1.7702, + "step": 15596 + }, + { + "epoch": 0.1949798744968624, + "grad_norm": 1.8843183517456055, + "learning_rate": 1.945543985647882e-05, + "loss": 0.9374, + "step": 15598 + }, + { + "epoch": 0.19500487512187806, + "grad_norm": 7.827676296234131, + "learning_rate": 1.945515576517881e-05, + "loss": 2.0335, + "step": 15600 + }, + { + "epoch": 0.19502987574689368, + "grad_norm": 0.000976611627265811, + "learning_rate": 1.9454871601869672e-05, + "loss": 0.2145, + "step": 15602 + }, + { + "epoch": 0.1950548763719093, + "grad_norm": 4.467309951782227, + "learning_rate": 1.945458736655357e-05, + "loss": 1.3511, + "step": 15604 + }, + { + "epoch": 0.19507987699692492, + "grad_norm": 2.051804542541504, + "learning_rate": 1.945430305923266e-05, + "loss": 1.1906, + "step": 15606 + }, + { + "epoch": 0.19510487762194054, + "grad_norm": 1.4347690343856812, + "learning_rate": 1.945401867990912e-05, + "loss": 1.0816, + "step": 15608 + }, + { + "epoch": 0.19512987824695618, + "grad_norm": 4.504307270050049, + "learning_rate": 1.945373422858511e-05, + "loss": 1.5003, + "step": 15610 + }, + { + "epoch": 0.1951548788719718, + "grad_norm": 2.5965495109558105, + "learning_rate": 1.94534497052628e-05, + "loss": 0.4956, + "step": 15612 + }, + { + "epoch": 0.19517987949698742, + "grad_norm": 3.01245379447937, + "learning_rate": 1.945316510994435e-05, + "loss": 1.2103, + "step": 15614 + }, + { + "epoch": 0.19520488012200304, + "grad_norm": 1.161108136177063, + "learning_rate": 1.9452880442631934e-05, + "loss": 1.6145, + "step": 15616 + }, + { + "epoch": 0.19522988074701866, + "grad_norm": 0.8154830932617188, + "learning_rate": 1.9452595703327716e-05, + "loss": 0.5441, + "step": 15618 + }, + { + "epoch": 0.1952548813720343, + "grad_norm": 4.393746852874756, + "learning_rate": 1.9452310892033867e-05, + "loss": 0.2068, + "step": 15620 + }, + { + "epoch": 0.19527988199704993, + "grad_norm": 3.261575937271118, + "learning_rate": 1.945202600875255e-05, + "loss": 1.647, + "step": 15622 + }, + { + "epoch": 0.19530488262206555, + "grad_norm": 4.083011150360107, + "learning_rate": 1.9451741053485947e-05, + "loss": 1.5555, + "step": 15624 + }, + { + "epoch": 0.19532988324708117, + "grad_norm": 4.346163272857666, + "learning_rate": 1.9451456026236217e-05, + "loss": 0.576, + "step": 15626 + }, + { + "epoch": 0.1953548838720968, + "grad_norm": 3.9363205432891846, + "learning_rate": 1.9451170927005535e-05, + "loss": 1.8706, + "step": 15628 + }, + { + "epoch": 0.19537988449711244, + "grad_norm": 2.297098159790039, + "learning_rate": 1.945088575579607e-05, + "loss": 0.4557, + "step": 15630 + }, + { + "epoch": 0.19540488512212806, + "grad_norm": 2.3067691326141357, + "learning_rate": 1.9450600512609994e-05, + "loss": 0.4863, + "step": 15632 + }, + { + "epoch": 0.19542988574714368, + "grad_norm": 3.6669416427612305, + "learning_rate": 1.9450315197449486e-05, + "loss": 0.4984, + "step": 15634 + }, + { + "epoch": 0.1954548863721593, + "grad_norm": 4.493389129638672, + "learning_rate": 1.945002981031671e-05, + "loss": 1.6992, + "step": 15636 + }, + { + "epoch": 0.19547988699717492, + "grad_norm": 1.8924531936645508, + "learning_rate": 1.9449744351213848e-05, + "loss": 0.5211, + "step": 15638 + }, + { + "epoch": 0.19550488762219057, + "grad_norm": 5.557664394378662, + "learning_rate": 1.9449458820143062e-05, + "loss": 1.0637, + "step": 15640 + }, + { + "epoch": 0.19552988824720618, + "grad_norm": 3.5897598266601562, + "learning_rate": 1.944917321710654e-05, + "loss": 0.6113, + "step": 15642 + }, + { + "epoch": 0.1955548888722218, + "grad_norm": 3.5014636516571045, + "learning_rate": 1.944888754210645e-05, + "loss": 0.8321, + "step": 15644 + }, + { + "epoch": 0.19557988949723742, + "grad_norm": 0.000978462165221572, + "learning_rate": 1.9448601795144964e-05, + "loss": 0.5913, + "step": 15646 + }, + { + "epoch": 0.19560489012225304, + "grad_norm": 4.151119232177734, + "learning_rate": 1.944831597622427e-05, + "loss": 1.384, + "step": 15648 + }, + { + "epoch": 0.1956298907472687, + "grad_norm": 4.220527648925781, + "learning_rate": 1.9448030085346534e-05, + "loss": 1.0163, + "step": 15650 + }, + { + "epoch": 0.1956548913722843, + "grad_norm": 0.9130098223686218, + "learning_rate": 1.9447744122513938e-05, + "loss": 1.1702, + "step": 15652 + }, + { + "epoch": 0.19567989199729993, + "grad_norm": 2.343515157699585, + "learning_rate": 1.9447458087728657e-05, + "loss": 1.4026, + "step": 15654 + }, + { + "epoch": 0.19570489262231555, + "grad_norm": 3.464388132095337, + "learning_rate": 1.944717198099287e-05, + "loss": 0.8974, + "step": 15656 + }, + { + "epoch": 0.19572989324733117, + "grad_norm": 7.355956554412842, + "learning_rate": 1.944688580230876e-05, + "loss": 1.1208, + "step": 15658 + }, + { + "epoch": 0.19575489387234682, + "grad_norm": 2.3999850749969482, + "learning_rate": 1.9446599551678503e-05, + "loss": 0.5781, + "step": 15660 + }, + { + "epoch": 0.19577989449736244, + "grad_norm": 3.89593505859375, + "learning_rate": 1.9446313229104282e-05, + "loss": 0.7278, + "step": 15662 + }, + { + "epoch": 0.19580489512237806, + "grad_norm": 2.570105791091919, + "learning_rate": 1.9446026834588272e-05, + "loss": 0.7868, + "step": 15664 + }, + { + "epoch": 0.19582989574739368, + "grad_norm": 2.2210140228271484, + "learning_rate": 1.9445740368132657e-05, + "loss": 0.4164, + "step": 15666 + }, + { + "epoch": 0.1958548963724093, + "grad_norm": 2.610854387283325, + "learning_rate": 1.944545382973962e-05, + "loss": 1.1451, + "step": 15668 + }, + { + "epoch": 0.19587989699742495, + "grad_norm": 4.053581714630127, + "learning_rate": 1.9445167219411343e-05, + "loss": 0.9029, + "step": 15670 + }, + { + "epoch": 0.19590489762244057, + "grad_norm": 9.878178596496582, + "learning_rate": 1.9444880537150012e-05, + "loss": 1.4862, + "step": 15672 + }, + { + "epoch": 0.19592989824745619, + "grad_norm": 4.587179183959961, + "learning_rate": 1.94445937829578e-05, + "loss": 1.662, + "step": 15674 + }, + { + "epoch": 0.1959548988724718, + "grad_norm": 3.282439947128296, + "learning_rate": 1.94443069568369e-05, + "loss": 0.8369, + "step": 15676 + }, + { + "epoch": 0.19597989949748743, + "grad_norm": 5.054330348968506, + "learning_rate": 1.944402005878949e-05, + "loss": 0.5495, + "step": 15678 + }, + { + "epoch": 0.19600490012250307, + "grad_norm": 0.0009607461979612708, + "learning_rate": 1.9443733088817766e-05, + "loss": 0.0908, + "step": 15680 + }, + { + "epoch": 0.1960299007475187, + "grad_norm": 3.6249451637268066, + "learning_rate": 1.94434460469239e-05, + "loss": 0.7793, + "step": 15682 + }, + { + "epoch": 0.1960549013725343, + "grad_norm": 2.471219778060913, + "learning_rate": 1.9443158933110088e-05, + "loss": 1.5436, + "step": 15684 + }, + { + "epoch": 0.19607990199754993, + "grad_norm": 3.481555700302124, + "learning_rate": 1.9442871747378512e-05, + "loss": 0.5846, + "step": 15686 + }, + { + "epoch": 0.19610490262256555, + "grad_norm": 3.48612380027771, + "learning_rate": 1.9442584489731362e-05, + "loss": 1.6822, + "step": 15688 + }, + { + "epoch": 0.1961299032475812, + "grad_norm": 0.0014243751065805554, + "learning_rate": 1.9442297160170824e-05, + "loss": 0.1812, + "step": 15690 + }, + { + "epoch": 0.19615490387259682, + "grad_norm": 3.671076536178589, + "learning_rate": 1.9442009758699082e-05, + "loss": 2.2336, + "step": 15692 + }, + { + "epoch": 0.19617990449761244, + "grad_norm": 0.15921317040920258, + "learning_rate": 1.944172228531833e-05, + "loss": 0.5367, + "step": 15694 + }, + { + "epoch": 0.19620490512262806, + "grad_norm": 3.803293228149414, + "learning_rate": 1.9441434740030754e-05, + "loss": 1.5197, + "step": 15696 + }, + { + "epoch": 0.19622990574764368, + "grad_norm": 3.6294844150543213, + "learning_rate": 1.9441147122838552e-05, + "loss": 0.488, + "step": 15698 + }, + { + "epoch": 0.19625490637265933, + "grad_norm": 4.2459330558776855, + "learning_rate": 1.9440859433743905e-05, + "loss": 1.3897, + "step": 15700 + }, + { + "epoch": 0.19627990699767495, + "grad_norm": 3.5039045810699463, + "learning_rate": 1.9440571672749005e-05, + "loss": 1.0301, + "step": 15702 + }, + { + "epoch": 0.19630490762269057, + "grad_norm": 5.523213863372803, + "learning_rate": 1.9440283839856055e-05, + "loss": 2.4912, + "step": 15704 + }, + { + "epoch": 0.1963299082477062, + "grad_norm": 2.6678996086120605, + "learning_rate": 1.943999593506723e-05, + "loss": 1.4612, + "step": 15706 + }, + { + "epoch": 0.1963549088727218, + "grad_norm": 2.3667354583740234, + "learning_rate": 1.9439707958384734e-05, + "loss": 1.13, + "step": 15708 + }, + { + "epoch": 0.19637990949773745, + "grad_norm": 2.234626054763794, + "learning_rate": 1.9439419909810758e-05, + "loss": 1.4607, + "step": 15710 + }, + { + "epoch": 0.19640491012275307, + "grad_norm": 2.4813342094421387, + "learning_rate": 1.9439131789347493e-05, + "loss": 0.4385, + "step": 15712 + }, + { + "epoch": 0.1964299107477687, + "grad_norm": 5.85038948059082, + "learning_rate": 1.9438843596997134e-05, + "loss": 1.5134, + "step": 15714 + }, + { + "epoch": 0.1964549113727843, + "grad_norm": 0.00040483500924892724, + "learning_rate": 1.9438555332761878e-05, + "loss": 0.7086, + "step": 15716 + }, + { + "epoch": 0.19647991199779993, + "grad_norm": 0.0004449926782399416, + "learning_rate": 1.9438266996643923e-05, + "loss": 0.5954, + "step": 15718 + }, + { + "epoch": 0.19650491262281558, + "grad_norm": 1.5738635063171387, + "learning_rate": 1.9437978588645458e-05, + "loss": 0.5055, + "step": 15720 + }, + { + "epoch": 0.1965299132478312, + "grad_norm": 3.826979875564575, + "learning_rate": 1.943769010876868e-05, + "loss": 0.9352, + "step": 15722 + }, + { + "epoch": 0.19655491387284682, + "grad_norm": 3.4828507900238037, + "learning_rate": 1.9437401557015792e-05, + "loss": 1.5439, + "step": 15724 + }, + { + "epoch": 0.19657991449786244, + "grad_norm": 0.0008024289854802191, + "learning_rate": 1.943711293338899e-05, + "loss": 0.5892, + "step": 15726 + }, + { + "epoch": 0.19660491512287806, + "grad_norm": 0.0031827634666115046, + "learning_rate": 1.943682423789047e-05, + "loss": 1.0876, + "step": 15728 + }, + { + "epoch": 0.1966299157478937, + "grad_norm": 3.2905805110931396, + "learning_rate": 1.9436535470522427e-05, + "loss": 1.5615, + "step": 15730 + }, + { + "epoch": 0.19665491637290933, + "grad_norm": 3.7016441822052, + "learning_rate": 1.9436246631287067e-05, + "loss": 1.3209, + "step": 15732 + }, + { + "epoch": 0.19667991699792495, + "grad_norm": 5.465248107910156, + "learning_rate": 1.9435957720186586e-05, + "loss": 0.6454, + "step": 15734 + }, + { + "epoch": 0.19670491762294057, + "grad_norm": 6.0209174156188965, + "learning_rate": 1.943566873722319e-05, + "loss": 1.2461, + "step": 15736 + }, + { + "epoch": 0.1967299182479562, + "grad_norm": 0.08335497230291367, + "learning_rate": 1.943537968239907e-05, + "loss": 0.1661, + "step": 15738 + }, + { + "epoch": 0.19675491887297183, + "grad_norm": 3.2813355922698975, + "learning_rate": 1.943509055571643e-05, + "loss": 1.0872, + "step": 15740 + }, + { + "epoch": 0.19677991949798745, + "grad_norm": 0.0008309257100336254, + "learning_rate": 1.9434801357177482e-05, + "loss": 0.0854, + "step": 15742 + }, + { + "epoch": 0.19680492012300307, + "grad_norm": 3.072990894317627, + "learning_rate": 1.943451208678442e-05, + "loss": 1.1345, + "step": 15744 + }, + { + "epoch": 0.1968299207480187, + "grad_norm": 3.6648731231689453, + "learning_rate": 1.943422274453944e-05, + "loss": 1.5899, + "step": 15746 + }, + { + "epoch": 0.19685492137303431, + "grad_norm": 2.9165570735931396, + "learning_rate": 1.9433933330444762e-05, + "loss": 0.9705, + "step": 15748 + }, + { + "epoch": 0.19687992199804996, + "grad_norm": 3.8890280723571777, + "learning_rate": 1.943364384450258e-05, + "loss": 1.3065, + "step": 15750 + }, + { + "epoch": 0.19690492262306558, + "grad_norm": 4.53907585144043, + "learning_rate": 1.94333542867151e-05, + "loss": 2.0183, + "step": 15752 + }, + { + "epoch": 0.1969299232480812, + "grad_norm": 3.6139349937438965, + "learning_rate": 1.9433064657084525e-05, + "loss": 0.8692, + "step": 15754 + }, + { + "epoch": 0.19695492387309682, + "grad_norm": 0.5965185761451721, + "learning_rate": 1.943277495561307e-05, + "loss": 0.0152, + "step": 15756 + }, + { + "epoch": 0.19697992449811244, + "grad_norm": 4.0251030921936035, + "learning_rate": 1.943248518230293e-05, + "loss": 0.7249, + "step": 15758 + }, + { + "epoch": 0.1970049251231281, + "grad_norm": 1.881253719329834, + "learning_rate": 1.9432195337156318e-05, + "loss": 0.8373, + "step": 15760 + }, + { + "epoch": 0.1970299257481437, + "grad_norm": 0.001006655627861619, + "learning_rate": 1.943190542017544e-05, + "loss": 0.8498, + "step": 15762 + }, + { + "epoch": 0.19705492637315933, + "grad_norm": 3.6475625038146973, + "learning_rate": 1.9431615431362502e-05, + "loss": 1.2193, + "step": 15764 + }, + { + "epoch": 0.19707992699817495, + "grad_norm": 4.633810520172119, + "learning_rate": 1.9431325370719717e-05, + "loss": 1.4602, + "step": 15766 + }, + { + "epoch": 0.19710492762319057, + "grad_norm": 2.8782684803009033, + "learning_rate": 1.943103523824929e-05, + "loss": 1.8038, + "step": 15768 + }, + { + "epoch": 0.19712992824820622, + "grad_norm": 5.408377170562744, + "learning_rate": 1.9430745033953434e-05, + "loss": 1.7007, + "step": 15770 + }, + { + "epoch": 0.19715492887322184, + "grad_norm": 5.8074049949646, + "learning_rate": 1.9430454757834358e-05, + "loss": 2.4078, + "step": 15772 + }, + { + "epoch": 0.19717992949823746, + "grad_norm": 3.355024814605713, + "learning_rate": 1.943016440989427e-05, + "loss": 1.0539, + "step": 15774 + }, + { + "epoch": 0.19720493012325307, + "grad_norm": 2.8605947494506836, + "learning_rate": 1.9429873990135387e-05, + "loss": 0.4676, + "step": 15776 + }, + { + "epoch": 0.1972299307482687, + "grad_norm": 1.3452098369598389, + "learning_rate": 1.9429583498559915e-05, + "loss": 0.6662, + "step": 15778 + }, + { + "epoch": 0.19725493137328434, + "grad_norm": 3.622330665588379, + "learning_rate": 1.9429292935170068e-05, + "loss": 0.6582, + "step": 15780 + }, + { + "epoch": 0.19727993199829996, + "grad_norm": 4.062414646148682, + "learning_rate": 1.9429002299968057e-05, + "loss": 1.8558, + "step": 15782 + }, + { + "epoch": 0.19730493262331558, + "grad_norm": 2.6454286575317383, + "learning_rate": 1.9428711592956103e-05, + "loss": 0.5682, + "step": 15784 + }, + { + "epoch": 0.1973299332483312, + "grad_norm": 1.9808069467544556, + "learning_rate": 1.942842081413641e-05, + "loss": 0.6013, + "step": 15786 + }, + { + "epoch": 0.19735493387334682, + "grad_norm": 0.3458380699157715, + "learning_rate": 1.9428129963511203e-05, + "loss": 0.7722, + "step": 15788 + }, + { + "epoch": 0.19737993449836247, + "grad_norm": 2.4864206314086914, + "learning_rate": 1.9427839041082688e-05, + "loss": 0.6497, + "step": 15790 + }, + { + "epoch": 0.1974049351233781, + "grad_norm": 3.521329641342163, + "learning_rate": 1.9427548046853083e-05, + "loss": 2.1476, + "step": 15792 + }, + { + "epoch": 0.1974299357483937, + "grad_norm": 4.41181755065918, + "learning_rate": 1.9427256980824606e-05, + "loss": 2.0617, + "step": 15794 + }, + { + "epoch": 0.19745493637340933, + "grad_norm": 4.18837308883667, + "learning_rate": 1.9426965842999474e-05, + "loss": 1.6813, + "step": 15796 + }, + { + "epoch": 0.19747993699842495, + "grad_norm": 1.6774476766586304, + "learning_rate": 1.94266746333799e-05, + "loss": 1.0537, + "step": 15798 + }, + { + "epoch": 0.1975049376234406, + "grad_norm": 2.64452862739563, + "learning_rate": 1.942638335196811e-05, + "loss": 0.3533, + "step": 15800 + }, + { + "epoch": 0.19752993824845622, + "grad_norm": 0.05049968510866165, + "learning_rate": 1.9426091998766315e-05, + "loss": 0.0007, + "step": 15802 + }, + { + "epoch": 0.19755493887347184, + "grad_norm": 2.3216991424560547, + "learning_rate": 1.942580057377674e-05, + "loss": 1.3114, + "step": 15804 + }, + { + "epoch": 0.19757993949848746, + "grad_norm": 2.3714795112609863, + "learning_rate": 1.9425509077001594e-05, + "loss": 1.1047, + "step": 15806 + }, + { + "epoch": 0.19760494012350308, + "grad_norm": 3.5066564083099365, + "learning_rate": 1.942521750844311e-05, + "loss": 0.8695, + "step": 15808 + }, + { + "epoch": 0.19762994074851872, + "grad_norm": 2.3311266899108887, + "learning_rate": 1.9424925868103498e-05, + "loss": 0.8855, + "step": 15810 + }, + { + "epoch": 0.19765494137353434, + "grad_norm": 4.4788713455200195, + "learning_rate": 1.9424634155984986e-05, + "loss": 1.3948, + "step": 15812 + }, + { + "epoch": 0.19767994199854996, + "grad_norm": 1.3570743799209595, + "learning_rate": 1.942434237208979e-05, + "loss": 1.7264, + "step": 15814 + }, + { + "epoch": 0.19770494262356558, + "grad_norm": 2.8148365020751953, + "learning_rate": 1.9424050516420137e-05, + "loss": 0.9449, + "step": 15816 + }, + { + "epoch": 0.1977299432485812, + "grad_norm": 4.546467304229736, + "learning_rate": 1.9423758588978248e-05, + "loss": 1.279, + "step": 15818 + }, + { + "epoch": 0.19775494387359685, + "grad_norm": 0.0006061044405214489, + "learning_rate": 1.9423466589766346e-05, + "loss": 0.5692, + "step": 15820 + }, + { + "epoch": 0.19777994449861247, + "grad_norm": 5.472014427185059, + "learning_rate": 1.9423174518786656e-05, + "loss": 2.1129, + "step": 15822 + }, + { + "epoch": 0.1978049451236281, + "grad_norm": 0.3551781177520752, + "learning_rate": 1.9422882376041403e-05, + "loss": 0.1302, + "step": 15824 + }, + { + "epoch": 0.1978299457486437, + "grad_norm": 0.0007560992380604148, + "learning_rate": 1.9422590161532807e-05, + "loss": 0.5077, + "step": 15826 + }, + { + "epoch": 0.19785494637365933, + "grad_norm": 1.6944526433944702, + "learning_rate": 1.9422297875263096e-05, + "loss": 0.8148, + "step": 15828 + }, + { + "epoch": 0.19787994699867498, + "grad_norm": 0.0005704512586817145, + "learning_rate": 1.9422005517234498e-05, + "loss": 0.7516, + "step": 15830 + }, + { + "epoch": 0.1979049476236906, + "grad_norm": 4.61087703704834, + "learning_rate": 1.942171308744924e-05, + "loss": 1.3492, + "step": 15832 + }, + { + "epoch": 0.19792994824870622, + "grad_norm": 3.0656168460845947, + "learning_rate": 1.9421420585909544e-05, + "loss": 2.0126, + "step": 15834 + }, + { + "epoch": 0.19795494887372184, + "grad_norm": 6.993746757507324, + "learning_rate": 1.9421128012617646e-05, + "loss": 1.2959, + "step": 15836 + }, + { + "epoch": 0.19797994949873746, + "grad_norm": 0.00047750180237926543, + "learning_rate": 1.9420835367575766e-05, + "loss": 0.2799, + "step": 15838 + }, + { + "epoch": 0.1980049501237531, + "grad_norm": 2.7912113666534424, + "learning_rate": 1.942054265078614e-05, + "loss": 1.2263, + "step": 15840 + }, + { + "epoch": 0.19802995074876872, + "grad_norm": 0.030639564618468285, + "learning_rate": 1.9420249862250987e-05, + "loss": 0.5417, + "step": 15842 + }, + { + "epoch": 0.19805495137378434, + "grad_norm": 4.953604221343994, + "learning_rate": 1.941995700197255e-05, + "loss": 2.7663, + "step": 15844 + }, + { + "epoch": 0.19807995199879996, + "grad_norm": 1.6992442607879639, + "learning_rate": 1.9419664069953047e-05, + "loss": 0.1363, + "step": 15846 + }, + { + "epoch": 0.19810495262381558, + "grad_norm": 3.1944589614868164, + "learning_rate": 1.9419371066194717e-05, + "loss": 0.7575, + "step": 15848 + }, + { + "epoch": 0.19812995324883123, + "grad_norm": 3.708564043045044, + "learning_rate": 1.941907799069979e-05, + "loss": 2.2774, + "step": 15850 + }, + { + "epoch": 0.19815495387384685, + "grad_norm": 16.213640213012695, + "learning_rate": 1.9418784843470493e-05, + "loss": 1.2582, + "step": 15852 + }, + { + "epoch": 0.19817995449886247, + "grad_norm": 5.516010761260986, + "learning_rate": 1.9418491624509067e-05, + "loss": 1.3318, + "step": 15854 + }, + { + "epoch": 0.1982049551238781, + "grad_norm": 1.0530248880386353, + "learning_rate": 1.941819833381774e-05, + "loss": 0.7251, + "step": 15856 + }, + { + "epoch": 0.1982299557488937, + "grad_norm": 3.4126479625701904, + "learning_rate": 1.9417904971398743e-05, + "loss": 1.8101, + "step": 15858 + }, + { + "epoch": 0.19825495637390936, + "grad_norm": 2.6258318424224854, + "learning_rate": 1.9417611537254318e-05, + "loss": 1.1021, + "step": 15860 + }, + { + "epoch": 0.19827995699892498, + "grad_norm": 4.405264377593994, + "learning_rate": 1.9417318031386688e-05, + "loss": 1.906, + "step": 15862 + }, + { + "epoch": 0.1983049576239406, + "grad_norm": 4.119356632232666, + "learning_rate": 1.9417024453798104e-05, + "loss": 1.0431, + "step": 15864 + }, + { + "epoch": 0.19832995824895622, + "grad_norm": 2.6395225524902344, + "learning_rate": 1.9416730804490787e-05, + "loss": 1.3608, + "step": 15866 + }, + { + "epoch": 0.19835495887397184, + "grad_norm": 3.9106485843658447, + "learning_rate": 1.9416437083466986e-05, + "loss": 1.0123, + "step": 15868 + }, + { + "epoch": 0.19837995949898748, + "grad_norm": 3.218500852584839, + "learning_rate": 1.9416143290728927e-05, + "loss": 1.1764, + "step": 15870 + }, + { + "epoch": 0.1984049601240031, + "grad_norm": 2.172241449356079, + "learning_rate": 1.9415849426278853e-05, + "loss": 1.5164, + "step": 15872 + }, + { + "epoch": 0.19842996074901872, + "grad_norm": 6.302181720733643, + "learning_rate": 1.9415555490119004e-05, + "loss": 1.8742, + "step": 15874 + }, + { + "epoch": 0.19845496137403434, + "grad_norm": 3.219433069229126, + "learning_rate": 1.9415261482251615e-05, + "loss": 1.1024, + "step": 15876 + }, + { + "epoch": 0.19847996199904996, + "grad_norm": 2.8699047565460205, + "learning_rate": 1.9414967402678924e-05, + "loss": 0.4349, + "step": 15878 + }, + { + "epoch": 0.1985049626240656, + "grad_norm": 2.719735860824585, + "learning_rate": 1.9414673251403172e-05, + "loss": 1.2421, + "step": 15880 + }, + { + "epoch": 0.19852996324908123, + "grad_norm": 0.2686707079410553, + "learning_rate": 1.9414379028426603e-05, + "loss": 0.7717, + "step": 15882 + }, + { + "epoch": 0.19855496387409685, + "grad_norm": 27.955219268798828, + "learning_rate": 1.9414084733751456e-05, + "loss": 3.5492, + "step": 15884 + }, + { + "epoch": 0.19857996449911247, + "grad_norm": 5.825193405151367, + "learning_rate": 1.9413790367379968e-05, + "loss": 1.8587, + "step": 15886 + }, + { + "epoch": 0.1986049651241281, + "grad_norm": 17.29633903503418, + "learning_rate": 1.941349592931438e-05, + "loss": 1.2236, + "step": 15888 + }, + { + "epoch": 0.19862996574914374, + "grad_norm": 1.85598623752594, + "learning_rate": 1.9413201419556945e-05, + "loss": 0.0838, + "step": 15890 + }, + { + "epoch": 0.19865496637415936, + "grad_norm": 3.5734517574310303, + "learning_rate": 1.9412906838109897e-05, + "loss": 2.0844, + "step": 15892 + }, + { + "epoch": 0.19867996699917498, + "grad_norm": 2.875349998474121, + "learning_rate": 1.9412612184975483e-05, + "loss": 1.0355, + "step": 15894 + }, + { + "epoch": 0.1987049676241906, + "grad_norm": 1.9000256061553955, + "learning_rate": 1.941231746015594e-05, + "loss": 0.7256, + "step": 15896 + }, + { + "epoch": 0.19872996824920622, + "grad_norm": 6.420203685760498, + "learning_rate": 1.9412022663653523e-05, + "loss": 2.0011, + "step": 15898 + }, + { + "epoch": 0.19875496887422187, + "grad_norm": 4.221259593963623, + "learning_rate": 1.941172779547047e-05, + "loss": 1.3699, + "step": 15900 + }, + { + "epoch": 0.19877996949923749, + "grad_norm": 8.593576431274414, + "learning_rate": 1.941143285560903e-05, + "loss": 1.1587, + "step": 15902 + }, + { + "epoch": 0.1988049701242531, + "grad_norm": 1.3673256635665894, + "learning_rate": 1.9411137844071448e-05, + "loss": 0.207, + "step": 15904 + }, + { + "epoch": 0.19882997074926873, + "grad_norm": 2.4055395126342773, + "learning_rate": 1.9410842760859975e-05, + "loss": 0.3955, + "step": 15906 + }, + { + "epoch": 0.19885497137428434, + "grad_norm": 3.3088581562042236, + "learning_rate": 1.941054760597685e-05, + "loss": 0.8934, + "step": 15908 + }, + { + "epoch": 0.1988799719993, + "grad_norm": 3.395881175994873, + "learning_rate": 1.9410252379424324e-05, + "loss": 0.6595, + "step": 15910 + }, + { + "epoch": 0.1989049726243156, + "grad_norm": 3.634739637374878, + "learning_rate": 1.940995708120465e-05, + "loss": 1.3876, + "step": 15912 + }, + { + "epoch": 0.19892997324933123, + "grad_norm": 2.2930119037628174, + "learning_rate": 1.9409661711320073e-05, + "loss": 0.7784, + "step": 15914 + }, + { + "epoch": 0.19895497387434685, + "grad_norm": 1.9761313199996948, + "learning_rate": 1.9409366269772843e-05, + "loss": 0.6869, + "step": 15916 + }, + { + "epoch": 0.19897997449936247, + "grad_norm": 7.136513710021973, + "learning_rate": 1.940907075656521e-05, + "loss": 2.2933, + "step": 15918 + }, + { + "epoch": 0.19900497512437812, + "grad_norm": 3.455291271209717, + "learning_rate": 1.940877517169942e-05, + "loss": 0.7275, + "step": 15920 + }, + { + "epoch": 0.19902997574939374, + "grad_norm": 5.151914119720459, + "learning_rate": 1.9408479515177736e-05, + "loss": 1.3792, + "step": 15922 + }, + { + "epoch": 0.19905497637440936, + "grad_norm": 0.7542971968650818, + "learning_rate": 1.94081837870024e-05, + "loss": 0.1905, + "step": 15924 + }, + { + "epoch": 0.19907997699942498, + "grad_norm": 2.507322072982788, + "learning_rate": 1.9407887987175665e-05, + "loss": 1.5276, + "step": 15926 + }, + { + "epoch": 0.1991049776244406, + "grad_norm": 2.5475711822509766, + "learning_rate": 1.9407592115699786e-05, + "loss": 0.415, + "step": 15928 + }, + { + "epoch": 0.19912997824945625, + "grad_norm": 0.23933671414852142, + "learning_rate": 1.9407296172577016e-05, + "loss": 0.4974, + "step": 15930 + }, + { + "epoch": 0.19915497887447187, + "grad_norm": 0.5217235684394836, + "learning_rate": 1.940700015780961e-05, + "loss": 1.137, + "step": 15932 + }, + { + "epoch": 0.1991799794994875, + "grad_norm": 4.237585544586182, + "learning_rate": 1.940670407139982e-05, + "loss": 0.5322, + "step": 15934 + }, + { + "epoch": 0.1992049801245031, + "grad_norm": 6.223837375640869, + "learning_rate": 1.94064079133499e-05, + "loss": 0.6813, + "step": 15936 + }, + { + "epoch": 0.19922998074951873, + "grad_norm": 0.25227591395378113, + "learning_rate": 1.9406111683662115e-05, + "loss": 0.6265, + "step": 15938 + }, + { + "epoch": 0.19925498137453437, + "grad_norm": 3.0341975688934326, + "learning_rate": 1.9405815382338706e-05, + "loss": 1.3795, + "step": 15940 + }, + { + "epoch": 0.19927998199955, + "grad_norm": 3.545743703842163, + "learning_rate": 1.9405519009381938e-05, + "loss": 1.4694, + "step": 15942 + }, + { + "epoch": 0.1993049826245656, + "grad_norm": 1.028074860572815, + "learning_rate": 1.940522256479407e-05, + "loss": 0.7357, + "step": 15944 + }, + { + "epoch": 0.19932998324958123, + "grad_norm": 1.4746242761611938, + "learning_rate": 1.9404926048577355e-05, + "loss": 0.72, + "step": 15946 + }, + { + "epoch": 0.19935498387459685, + "grad_norm": 0.46104303002357483, + "learning_rate": 1.9404629460734054e-05, + "loss": 1.4157, + "step": 15948 + }, + { + "epoch": 0.1993799844996125, + "grad_norm": 2.8525593280792236, + "learning_rate": 1.9404332801266426e-05, + "loss": 0.846, + "step": 15950 + }, + { + "epoch": 0.19940498512462812, + "grad_norm": 0.2223324030637741, + "learning_rate": 1.9404036070176728e-05, + "loss": 0.0118, + "step": 15952 + }, + { + "epoch": 0.19942998574964374, + "grad_norm": 1.5447012186050415, + "learning_rate": 1.940373926746722e-05, + "loss": 0.4081, + "step": 15954 + }, + { + "epoch": 0.19945498637465936, + "grad_norm": 3.4675941467285156, + "learning_rate": 1.9403442393140167e-05, + "loss": 1.5346, + "step": 15956 + }, + { + "epoch": 0.19947998699967498, + "grad_norm": 0.1170429065823555, + "learning_rate": 1.940314544719782e-05, + "loss": 0.0103, + "step": 15958 + }, + { + "epoch": 0.19950498762469063, + "grad_norm": 3.4051642417907715, + "learning_rate": 1.9402848429642454e-05, + "loss": 0.4118, + "step": 15960 + }, + { + "epoch": 0.19952998824970625, + "grad_norm": 0.15130363404750824, + "learning_rate": 1.9402551340476324e-05, + "loss": 0.5626, + "step": 15962 + }, + { + "epoch": 0.19955498887472187, + "grad_norm": 3.0163986682891846, + "learning_rate": 1.940225417970169e-05, + "loss": 0.453, + "step": 15964 + }, + { + "epoch": 0.1995799894997375, + "grad_norm": 3.9548933506011963, + "learning_rate": 1.9401956947320816e-05, + "loss": 0.9727, + "step": 15966 + }, + { + "epoch": 0.1996049901247531, + "grad_norm": 6.699351787567139, + "learning_rate": 1.940165964333597e-05, + "loss": 1.0324, + "step": 15968 + }, + { + "epoch": 0.19962999074976875, + "grad_norm": 3.021341323852539, + "learning_rate": 1.940136226774941e-05, + "loss": 1.1765, + "step": 15970 + }, + { + "epoch": 0.19965499137478437, + "grad_norm": 1.299764633178711, + "learning_rate": 1.940106482056341e-05, + "loss": 0.7971, + "step": 15972 + }, + { + "epoch": 0.1996799919998, + "grad_norm": 2.1345174312591553, + "learning_rate": 1.9400767301780226e-05, + "loss": 0.5274, + "step": 15974 + }, + { + "epoch": 0.1997049926248156, + "grad_norm": 0.12540698051452637, + "learning_rate": 1.940046971140213e-05, + "loss": 0.0072, + "step": 15976 + }, + { + "epoch": 0.19972999324983123, + "grad_norm": 2.456390619277954, + "learning_rate": 1.9400172049431384e-05, + "loss": 1.0603, + "step": 15978 + }, + { + "epoch": 0.19975499387484688, + "grad_norm": 3.8871288299560547, + "learning_rate": 1.9399874315870257e-05, + "loss": 0.7199, + "step": 15980 + }, + { + "epoch": 0.1997799944998625, + "grad_norm": 0.11247794330120087, + "learning_rate": 1.939957651072102e-05, + "loss": 0.3782, + "step": 15982 + }, + { + "epoch": 0.19980499512487812, + "grad_norm": 4.2858357429504395, + "learning_rate": 1.9399278633985932e-05, + "loss": 1.9271, + "step": 15984 + }, + { + "epoch": 0.19982999574989374, + "grad_norm": 0.5771366953849792, + "learning_rate": 1.939898068566727e-05, + "loss": 0.2654, + "step": 15986 + }, + { + "epoch": 0.19985499637490936, + "grad_norm": 0.08291582018136978, + "learning_rate": 1.93986826657673e-05, + "loss": 0.0034, + "step": 15988 + }, + { + "epoch": 0.199879996999925, + "grad_norm": 3.6564745903015137, + "learning_rate": 1.939838457428829e-05, + "loss": 0.8231, + "step": 15990 + }, + { + "epoch": 0.19990499762494063, + "grad_norm": 2.0463757514953613, + "learning_rate": 1.939808641123252e-05, + "loss": 0.7073, + "step": 15992 + }, + { + "epoch": 0.19992999824995625, + "grad_norm": 1.5856324434280396, + "learning_rate": 1.9397788176602242e-05, + "loss": 0.3232, + "step": 15994 + }, + { + "epoch": 0.19995499887497187, + "grad_norm": 3.6682963371276855, + "learning_rate": 1.9397489870399745e-05, + "loss": 1.715, + "step": 15996 + }, + { + "epoch": 0.1999799994999875, + "grad_norm": 0.09693510830402374, + "learning_rate": 1.939719149262729e-05, + "loss": 1.0501, + "step": 15998 + }, + { + "epoch": 0.20000500012500314, + "grad_norm": 3.6998918056488037, + "learning_rate": 1.9396893043287156e-05, + "loss": 1.2068, + "step": 16000 + }, + { + "epoch": 0.20003000075001875, + "grad_norm": 2.3575727939605713, + "learning_rate": 1.9396594522381614e-05, + "loss": 0.8846, + "step": 16002 + }, + { + "epoch": 0.20005500137503437, + "grad_norm": 3.947153091430664, + "learning_rate": 1.9396295929912935e-05, + "loss": 0.7979, + "step": 16004 + }, + { + "epoch": 0.20008000200005, + "grad_norm": 0.08401574939489365, + "learning_rate": 1.93959972658834e-05, + "loss": 0.9924, + "step": 16006 + }, + { + "epoch": 0.20010500262506561, + "grad_norm": 4.183083534240723, + "learning_rate": 1.9395698530295272e-05, + "loss": 1.4794, + "step": 16008 + }, + { + "epoch": 0.20013000325008126, + "grad_norm": 2.295255661010742, + "learning_rate": 1.9395399723150836e-05, + "loss": 1.2999, + "step": 16010 + }, + { + "epoch": 0.20015500387509688, + "grad_norm": 2.7647860050201416, + "learning_rate": 1.939510084445236e-05, + "loss": 0.9845, + "step": 16012 + }, + { + "epoch": 0.2001800045001125, + "grad_norm": 4.335355758666992, + "learning_rate": 1.939480189420213e-05, + "loss": 1.1558, + "step": 16014 + }, + { + "epoch": 0.20020500512512812, + "grad_norm": 2.2820029258728027, + "learning_rate": 1.9394502872402415e-05, + "loss": 0.8827, + "step": 16016 + }, + { + "epoch": 0.20023000575014374, + "grad_norm": 1.7586886882781982, + "learning_rate": 1.9394203779055497e-05, + "loss": 0.747, + "step": 16018 + }, + { + "epoch": 0.2002550063751594, + "grad_norm": 2.762436628341675, + "learning_rate": 1.939390461416365e-05, + "loss": 0.4768, + "step": 16020 + }, + { + "epoch": 0.200280007000175, + "grad_norm": 2.145789384841919, + "learning_rate": 1.9393605377729153e-05, + "loss": 1.0246, + "step": 16022 + }, + { + "epoch": 0.20030500762519063, + "grad_norm": 1.9823273420333862, + "learning_rate": 1.9393306069754284e-05, + "loss": 0.3752, + "step": 16024 + }, + { + "epoch": 0.20033000825020625, + "grad_norm": 2.3413736820220947, + "learning_rate": 1.939300669024133e-05, + "loss": 1.4263, + "step": 16026 + }, + { + "epoch": 0.20035500887522187, + "grad_norm": 3.2998268604278564, + "learning_rate": 1.9392707239192557e-05, + "loss": 0.5732, + "step": 16028 + }, + { + "epoch": 0.20038000950023752, + "grad_norm": 2.464564561843872, + "learning_rate": 1.9392407716610263e-05, + "loss": 0.3787, + "step": 16030 + }, + { + "epoch": 0.20040501012525314, + "grad_norm": 4.115699291229248, + "learning_rate": 1.9392108122496714e-05, + "loss": 1.1078, + "step": 16032 + }, + { + "epoch": 0.20043001075026876, + "grad_norm": 0.07203079760074615, + "learning_rate": 1.93918084568542e-05, + "loss": 0.5926, + "step": 16034 + }, + { + "epoch": 0.20045501137528438, + "grad_norm": 5.1710429191589355, + "learning_rate": 1.9391508719685e-05, + "loss": 1.7664, + "step": 16036 + }, + { + "epoch": 0.2004800120003, + "grad_norm": 2.4638445377349854, + "learning_rate": 1.93912089109914e-05, + "loss": 0.2654, + "step": 16038 + }, + { + "epoch": 0.20050501262531564, + "grad_norm": 4.482306003570557, + "learning_rate": 1.9390909030775677e-05, + "loss": 0.4515, + "step": 16040 + }, + { + "epoch": 0.20053001325033126, + "grad_norm": 0.0799797922372818, + "learning_rate": 1.9390609079040122e-05, + "loss": 0.6342, + "step": 16042 + }, + { + "epoch": 0.20055501387534688, + "grad_norm": 1.8840091228485107, + "learning_rate": 1.9390309055787014e-05, + "loss": 0.4401, + "step": 16044 + }, + { + "epoch": 0.2005800145003625, + "grad_norm": 5.077608108520508, + "learning_rate": 1.939000896101864e-05, + "loss": 1.1325, + "step": 16046 + }, + { + "epoch": 0.20060501512537812, + "grad_norm": 2.1415627002716064, + "learning_rate": 1.938970879473729e-05, + "loss": 0.6891, + "step": 16048 + }, + { + "epoch": 0.20063001575039377, + "grad_norm": 3.228638172149658, + "learning_rate": 1.9389408556945243e-05, + "loss": 1.4789, + "step": 16050 + }, + { + "epoch": 0.2006550163754094, + "grad_norm": 3.5076427459716797, + "learning_rate": 1.9389108247644786e-05, + "loss": 0.939, + "step": 16052 + }, + { + "epoch": 0.200680017000425, + "grad_norm": 1.088663101196289, + "learning_rate": 1.938880786683821e-05, + "loss": 0.8238, + "step": 16054 + }, + { + "epoch": 0.20070501762544063, + "grad_norm": 4.972509384155273, + "learning_rate": 1.9388507414527802e-05, + "loss": 1.1599, + "step": 16056 + }, + { + "epoch": 0.20073001825045625, + "grad_norm": 0.4858766794204712, + "learning_rate": 1.9388206890715847e-05, + "loss": 0.6127, + "step": 16058 + }, + { + "epoch": 0.2007550188754719, + "grad_norm": 6.896998405456543, + "learning_rate": 1.938790629540464e-05, + "loss": 1.5577, + "step": 16060 + }, + { + "epoch": 0.20078001950048752, + "grad_norm": 1.9530680179595947, + "learning_rate": 1.938760562859646e-05, + "loss": 0.9419, + "step": 16062 + }, + { + "epoch": 0.20080502012550314, + "grad_norm": 10.316136360168457, + "learning_rate": 1.9387304890293607e-05, + "loss": 1.2309, + "step": 16064 + }, + { + "epoch": 0.20083002075051876, + "grad_norm": 5.3579511642456055, + "learning_rate": 1.9387004080498366e-05, + "loss": 1.6785, + "step": 16066 + }, + { + "epoch": 0.20085502137553438, + "grad_norm": 6.746122360229492, + "learning_rate": 1.9386703199213032e-05, + "loss": 2.0437, + "step": 16068 + }, + { + "epoch": 0.20088002200055002, + "grad_norm": 2.4398508071899414, + "learning_rate": 1.9386402246439892e-05, + "loss": 1.2005, + "step": 16070 + }, + { + "epoch": 0.20090502262556564, + "grad_norm": 0.057447224855422974, + "learning_rate": 1.9386101222181238e-05, + "loss": 0.0037, + "step": 16072 + }, + { + "epoch": 0.20093002325058126, + "grad_norm": 4.243691444396973, + "learning_rate": 1.9385800126439366e-05, + "loss": 0.0934, + "step": 16074 + }, + { + "epoch": 0.20095502387559688, + "grad_norm": 1.6258317232131958, + "learning_rate": 1.938549895921657e-05, + "loss": 0.4775, + "step": 16076 + }, + { + "epoch": 0.2009800245006125, + "grad_norm": 3.6681017875671387, + "learning_rate": 1.938519772051514e-05, + "loss": 1.2417, + "step": 16078 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 4.092211723327637, + "learning_rate": 1.938489641033737e-05, + "loss": 1.2671, + "step": 16080 + }, + { + "epoch": 0.20103002575064377, + "grad_norm": 3.061323404312134, + "learning_rate": 1.9384595028685557e-05, + "loss": 1.3348, + "step": 16082 + }, + { + "epoch": 0.2010550263756594, + "grad_norm": 0.052820246666669846, + "learning_rate": 1.9384293575561992e-05, + "loss": 0.5149, + "step": 16084 + }, + { + "epoch": 0.201080027000675, + "grad_norm": 0.7237803339958191, + "learning_rate": 1.938399205096898e-05, + "loss": 0.3922, + "step": 16086 + }, + { + "epoch": 0.20110502762569063, + "grad_norm": 1.5300120115280151, + "learning_rate": 1.9383690454908807e-05, + "loss": 0.4577, + "step": 16088 + }, + { + "epoch": 0.20113002825070628, + "grad_norm": 0.0732884630560875, + "learning_rate": 1.9383388787383778e-05, + "loss": 1.1646, + "step": 16090 + }, + { + "epoch": 0.2011550288757219, + "grad_norm": 4.035252571105957, + "learning_rate": 1.938308704839618e-05, + "loss": 1.4713, + "step": 16092 + }, + { + "epoch": 0.20118002950073752, + "grad_norm": 1.061232328414917, + "learning_rate": 1.9382785237948325e-05, + "loss": 0.0331, + "step": 16094 + }, + { + "epoch": 0.20120503012575314, + "grad_norm": 6.452089309692383, + "learning_rate": 1.9382483356042503e-05, + "loss": 1.2363, + "step": 16096 + }, + { + "epoch": 0.20123003075076876, + "grad_norm": 5.295407295227051, + "learning_rate": 1.9382181402681012e-05, + "loss": 2.0516, + "step": 16098 + }, + { + "epoch": 0.2012550313757844, + "grad_norm": 3.509091854095459, + "learning_rate": 1.938187937786616e-05, + "loss": 1.1914, + "step": 16100 + }, + { + "epoch": 0.20128003200080002, + "grad_norm": 2.7789459228515625, + "learning_rate": 1.9381577281600235e-05, + "loss": 0.7813, + "step": 16102 + }, + { + "epoch": 0.20130503262581564, + "grad_norm": 0.03674786537885666, + "learning_rate": 1.9381275113885547e-05, + "loss": 0.621, + "step": 16104 + }, + { + "epoch": 0.20133003325083126, + "grad_norm": 6.001125335693359, + "learning_rate": 1.9380972874724392e-05, + "loss": 2.0197, + "step": 16106 + }, + { + "epoch": 0.20135503387584688, + "grad_norm": 0.03437921032309532, + "learning_rate": 1.938067056411908e-05, + "loss": 0.1705, + "step": 16108 + }, + { + "epoch": 0.20138003450086253, + "grad_norm": 1.2699151039123535, + "learning_rate": 1.93803681820719e-05, + "loss": 0.4789, + "step": 16110 + }, + { + "epoch": 0.20140503512587815, + "grad_norm": 8.891321182250977, + "learning_rate": 1.938006572858517e-05, + "loss": 0.486, + "step": 16112 + }, + { + "epoch": 0.20143003575089377, + "grad_norm": 1.4173921346664429, + "learning_rate": 1.937976320366118e-05, + "loss": 1.257, + "step": 16114 + }, + { + "epoch": 0.2014550363759094, + "grad_norm": 4.47376012802124, + "learning_rate": 1.9379460607302244e-05, + "loss": 0.5413, + "step": 16116 + }, + { + "epoch": 0.201480037000925, + "grad_norm": 3.722255229949951, + "learning_rate": 1.9379157939510657e-05, + "loss": 1.1583, + "step": 16118 + }, + { + "epoch": 0.20150503762594066, + "grad_norm": 0.4226854145526886, + "learning_rate": 1.9378855200288732e-05, + "loss": 0.9534, + "step": 16120 + }, + { + "epoch": 0.20153003825095628, + "grad_norm": 3.5367064476013184, + "learning_rate": 1.9378552389638774e-05, + "loss": 1.0735, + "step": 16122 + }, + { + "epoch": 0.2015550388759719, + "grad_norm": 2.3586864471435547, + "learning_rate": 1.9378249507563085e-05, + "loss": 0.1396, + "step": 16124 + }, + { + "epoch": 0.20158003950098752, + "grad_norm": 4.516465663909912, + "learning_rate": 1.9377946554063975e-05, + "loss": 1.2797, + "step": 16126 + }, + { + "epoch": 0.20160504012600314, + "grad_norm": 0.13719508051872253, + "learning_rate": 1.9377643529143752e-05, + "loss": 0.6162, + "step": 16128 + }, + { + "epoch": 0.20163004075101879, + "grad_norm": 2.2766640186309814, + "learning_rate": 1.937734043280472e-05, + "loss": 0.0305, + "step": 16130 + }, + { + "epoch": 0.2016550413760344, + "grad_norm": 4.718758583068848, + "learning_rate": 1.937703726504919e-05, + "loss": 1.6471, + "step": 16132 + }, + { + "epoch": 0.20168004200105002, + "grad_norm": 3.070807695388794, + "learning_rate": 1.937673402587947e-05, + "loss": 1.2267, + "step": 16134 + }, + { + "epoch": 0.20170504262606564, + "grad_norm": 3.4291093349456787, + "learning_rate": 1.937643071529787e-05, + "loss": 0.6117, + "step": 16136 + }, + { + "epoch": 0.20173004325108126, + "grad_norm": 11.814725875854492, + "learning_rate": 1.93761273333067e-05, + "loss": 1.2513, + "step": 16138 + }, + { + "epoch": 0.2017550438760969, + "grad_norm": 3.276761293411255, + "learning_rate": 1.9375823879908266e-05, + "loss": 1.7601, + "step": 16140 + }, + { + "epoch": 0.20178004450111253, + "grad_norm": 4.288705825805664, + "learning_rate": 1.9375520355104888e-05, + "loss": 1.4846, + "step": 16142 + }, + { + "epoch": 0.20180504512612815, + "grad_norm": 3.5126850605010986, + "learning_rate": 1.937521675889887e-05, + "loss": 0.9103, + "step": 16144 + }, + { + "epoch": 0.20183004575114377, + "grad_norm": 1.6837269067764282, + "learning_rate": 1.9374913091292525e-05, + "loss": 1.5148, + "step": 16146 + }, + { + "epoch": 0.2018550463761594, + "grad_norm": 7.748693466186523, + "learning_rate": 1.9374609352288172e-05, + "loss": 1.7489, + "step": 16148 + }, + { + "epoch": 0.20188004700117504, + "grad_norm": 1.9061795473098755, + "learning_rate": 1.9374305541888117e-05, + "loss": 0.8634, + "step": 16150 + }, + { + "epoch": 0.20190504762619066, + "grad_norm": 3.4287309646606445, + "learning_rate": 1.937400166009468e-05, + "loss": 1.5248, + "step": 16152 + }, + { + "epoch": 0.20193004825120628, + "grad_norm": 4.017831325531006, + "learning_rate": 1.937369770691017e-05, + "loss": 0.4976, + "step": 16154 + }, + { + "epoch": 0.2019550488762219, + "grad_norm": 1.1842753887176514, + "learning_rate": 1.9373393682336905e-05, + "loss": 0.0281, + "step": 16156 + }, + { + "epoch": 0.20198004950123752, + "grad_norm": 2.6526341438293457, + "learning_rate": 1.93730895863772e-05, + "loss": 1.3959, + "step": 16158 + }, + { + "epoch": 0.20200505012625317, + "grad_norm": 8.889820098876953, + "learning_rate": 1.9372785419033366e-05, + "loss": 2.2842, + "step": 16160 + }, + { + "epoch": 0.20203005075126879, + "grad_norm": 2.3649792671203613, + "learning_rate": 1.9372481180307727e-05, + "loss": 0.4978, + "step": 16162 + }, + { + "epoch": 0.2020550513762844, + "grad_norm": 2.4495277404785156, + "learning_rate": 1.9372176870202593e-05, + "loss": 0.1137, + "step": 16164 + }, + { + "epoch": 0.20208005200130003, + "grad_norm": 3.391512870788574, + "learning_rate": 1.937187248872029e-05, + "loss": 1.1283, + "step": 16166 + }, + { + "epoch": 0.20210505262631565, + "grad_norm": 4.179758071899414, + "learning_rate": 1.937156803586313e-05, + "loss": 0.2955, + "step": 16168 + }, + { + "epoch": 0.2021300532513313, + "grad_norm": 3.596743106842041, + "learning_rate": 1.937126351163343e-05, + "loss": 1.575, + "step": 16170 + }, + { + "epoch": 0.2021550538763469, + "grad_norm": 2.498945951461792, + "learning_rate": 1.9370958916033518e-05, + "loss": 1.1401, + "step": 16172 + }, + { + "epoch": 0.20218005450136253, + "grad_norm": 5.695172309875488, + "learning_rate": 1.9370654249065706e-05, + "loss": 0.1955, + "step": 16174 + }, + { + "epoch": 0.20220505512637815, + "grad_norm": 3.620067834854126, + "learning_rate": 1.9370349510732313e-05, + "loss": 1.0716, + "step": 16176 + }, + { + "epoch": 0.20223005575139377, + "grad_norm": 2.4425127506256104, + "learning_rate": 1.9370044701035665e-05, + "loss": 0.5298, + "step": 16178 + }, + { + "epoch": 0.20225505637640942, + "grad_norm": 6.191170692443848, + "learning_rate": 1.9369739819978083e-05, + "loss": 0.4012, + "step": 16180 + }, + { + "epoch": 0.20228005700142504, + "grad_norm": 2.056246280670166, + "learning_rate": 1.9369434867561886e-05, + "loss": 0.9822, + "step": 16182 + }, + { + "epoch": 0.20230505762644066, + "grad_norm": 5.61297082901001, + "learning_rate": 1.93691298437894e-05, + "loss": 2.6068, + "step": 16184 + }, + { + "epoch": 0.20233005825145628, + "grad_norm": 6.527688503265381, + "learning_rate": 1.9368824748662944e-05, + "loss": 1.738, + "step": 16186 + }, + { + "epoch": 0.2023550588764719, + "grad_norm": 0.7595486044883728, + "learning_rate": 1.9368519582184845e-05, + "loss": 0.1567, + "step": 16188 + }, + { + "epoch": 0.20238005950148755, + "grad_norm": 6.437367916107178, + "learning_rate": 1.9368214344357427e-05, + "loss": 2.086, + "step": 16190 + }, + { + "epoch": 0.20240506012650317, + "grad_norm": 3.4716594219207764, + "learning_rate": 1.936790903518301e-05, + "loss": 1.5376, + "step": 16192 + }, + { + "epoch": 0.2024300607515188, + "grad_norm": 10.270779609680176, + "learning_rate": 1.9367603654663924e-05, + "loss": 1.0508, + "step": 16194 + }, + { + "epoch": 0.2024550613765344, + "grad_norm": 3.474334955215454, + "learning_rate": 1.9367298202802494e-05, + "loss": 0.8707, + "step": 16196 + }, + { + "epoch": 0.20248006200155003, + "grad_norm": 0.03583014756441116, + "learning_rate": 1.9366992679601043e-05, + "loss": 0.4915, + "step": 16198 + }, + { + "epoch": 0.20250506262656567, + "grad_norm": 0.03936063498258591, + "learning_rate": 1.9366687085061905e-05, + "loss": 0.7307, + "step": 16200 + }, + { + "epoch": 0.2025300632515813, + "grad_norm": 3.0613598823547363, + "learning_rate": 1.9366381419187396e-05, + "loss": 0.9902, + "step": 16202 + }, + { + "epoch": 0.2025550638765969, + "grad_norm": 8.970232963562012, + "learning_rate": 1.9366075681979855e-05, + "loss": 1.6033, + "step": 16204 + }, + { + "epoch": 0.20258006450161253, + "grad_norm": 3.9343814849853516, + "learning_rate": 1.9365769873441604e-05, + "loss": 0.9905, + "step": 16206 + }, + { + "epoch": 0.20260506512662815, + "grad_norm": 2.753696918487549, + "learning_rate": 1.936546399357498e-05, + "loss": 1.1442, + "step": 16208 + }, + { + "epoch": 0.2026300657516438, + "grad_norm": 4.627656936645508, + "learning_rate": 1.93651580423823e-05, + "loss": 0.6747, + "step": 16210 + }, + { + "epoch": 0.20265506637665942, + "grad_norm": 1.323525309562683, + "learning_rate": 1.9364852019865904e-05, + "loss": 0.4174, + "step": 16212 + }, + { + "epoch": 0.20268006700167504, + "grad_norm": 6.119852542877197, + "learning_rate": 1.9364545926028118e-05, + "loss": 0.8358, + "step": 16214 + }, + { + "epoch": 0.20270506762669066, + "grad_norm": 2.1441264152526855, + "learning_rate": 1.9364239760871272e-05, + "loss": 0.1058, + "step": 16216 + }, + { + "epoch": 0.20273006825170628, + "grad_norm": 2.9575135707855225, + "learning_rate": 1.9363933524397704e-05, + "loss": 0.6712, + "step": 16218 + }, + { + "epoch": 0.20275506887672193, + "grad_norm": 4.307199954986572, + "learning_rate": 1.9363627216609743e-05, + "loss": 1.4858, + "step": 16220 + }, + { + "epoch": 0.20278006950173755, + "grad_norm": 1.1545958518981934, + "learning_rate": 1.9363320837509718e-05, + "loss": 0.1243, + "step": 16222 + }, + { + "epoch": 0.20280507012675317, + "grad_norm": 2.3407273292541504, + "learning_rate": 1.9363014387099968e-05, + "loss": 0.5782, + "step": 16224 + }, + { + "epoch": 0.2028300707517688, + "grad_norm": 0.8964581489562988, + "learning_rate": 1.9362707865382824e-05, + "loss": 0.0252, + "step": 16226 + }, + { + "epoch": 0.2028550713767844, + "grad_norm": 2.310126781463623, + "learning_rate": 1.936240127236062e-05, + "loss": 0.4762, + "step": 16228 + }, + { + "epoch": 0.20288007200180005, + "grad_norm": 0.031562697142362595, + "learning_rate": 1.9362094608035692e-05, + "loss": 1.8485, + "step": 16230 + }, + { + "epoch": 0.20290507262681567, + "grad_norm": 1.786292552947998, + "learning_rate": 1.936178787241038e-05, + "loss": 1.3448, + "step": 16232 + }, + { + "epoch": 0.2029300732518313, + "grad_norm": 2.183889865875244, + "learning_rate": 1.936148106548701e-05, + "loss": 0.9411, + "step": 16234 + }, + { + "epoch": 0.20295507387684691, + "grad_norm": 4.424091815948486, + "learning_rate": 1.9361174187267925e-05, + "loss": 1.0875, + "step": 16236 + }, + { + "epoch": 0.20298007450186253, + "grad_norm": 2.8737633228302, + "learning_rate": 1.936086723775546e-05, + "loss": 0.7085, + "step": 16238 + }, + { + "epoch": 0.20300507512687818, + "grad_norm": 4.661364555358887, + "learning_rate": 1.9360560216951954e-05, + "loss": 0.69, + "step": 16240 + }, + { + "epoch": 0.2030300757518938, + "grad_norm": 4.112095832824707, + "learning_rate": 1.9360253124859747e-05, + "loss": 0.8976, + "step": 16242 + }, + { + "epoch": 0.20305507637690942, + "grad_norm": 0.2511855661869049, + "learning_rate": 1.9359945961481177e-05, + "loss": 0.664, + "step": 16244 + }, + { + "epoch": 0.20308007700192504, + "grad_norm": 0.9921243786811829, + "learning_rate": 1.9359638726818583e-05, + "loss": 0.6731, + "step": 16246 + }, + { + "epoch": 0.20310507762694066, + "grad_norm": 1.8096234798431396, + "learning_rate": 1.9359331420874298e-05, + "loss": 1.0419, + "step": 16248 + }, + { + "epoch": 0.2031300782519563, + "grad_norm": 3.847869396209717, + "learning_rate": 1.9359024043650675e-05, + "loss": 2.1179, + "step": 16250 + }, + { + "epoch": 0.20315507887697193, + "grad_norm": 2.943387031555176, + "learning_rate": 1.9358716595150043e-05, + "loss": 0.5861, + "step": 16252 + }, + { + "epoch": 0.20318007950198755, + "grad_norm": 3.2642152309417725, + "learning_rate": 1.935840907537475e-05, + "loss": 0.801, + "step": 16254 + }, + { + "epoch": 0.20320508012700317, + "grad_norm": 2.4737637042999268, + "learning_rate": 1.935810148432714e-05, + "loss": 1.072, + "step": 16256 + }, + { + "epoch": 0.2032300807520188, + "grad_norm": 1.6178430318832397, + "learning_rate": 1.9357793822009548e-05, + "loss": 0.6773, + "step": 16258 + }, + { + "epoch": 0.20325508137703444, + "grad_norm": 0.04341525211930275, + "learning_rate": 1.9357486088424325e-05, + "loss": 0.4362, + "step": 16260 + }, + { + "epoch": 0.20328008200205006, + "grad_norm": 3.498126983642578, + "learning_rate": 1.9357178283573808e-05, + "loss": 0.9251, + "step": 16262 + }, + { + "epoch": 0.20330508262706568, + "grad_norm": 3.072985887527466, + "learning_rate": 1.935687040746035e-05, + "loss": 0.8525, + "step": 16264 + }, + { + "epoch": 0.2033300832520813, + "grad_norm": 2.5039074420928955, + "learning_rate": 1.9356562460086284e-05, + "loss": 0.6232, + "step": 16266 + }, + { + "epoch": 0.20335508387709691, + "grad_norm": 3.074183464050293, + "learning_rate": 1.9356254441453965e-05, + "loss": 1.1373, + "step": 16268 + }, + { + "epoch": 0.20338008450211256, + "grad_norm": 2.2573904991149902, + "learning_rate": 1.935594635156574e-05, + "loss": 0.2317, + "step": 16270 + }, + { + "epoch": 0.20340508512712818, + "grad_norm": 2.819962978363037, + "learning_rate": 1.9355638190423942e-05, + "loss": 0.6499, + "step": 16272 + }, + { + "epoch": 0.2034300857521438, + "grad_norm": 2.3442304134368896, + "learning_rate": 1.935532995803093e-05, + "loss": 0.4464, + "step": 16274 + }, + { + "epoch": 0.20345508637715942, + "grad_norm": 0.03328723460435867, + "learning_rate": 1.9355021654389047e-05, + "loss": 0.1704, + "step": 16276 + }, + { + "epoch": 0.20348008700217504, + "grad_norm": 0.019745411351323128, + "learning_rate": 1.9354713279500644e-05, + "loss": 0.8709, + "step": 16278 + }, + { + "epoch": 0.2035050876271907, + "grad_norm": 2.376063823699951, + "learning_rate": 1.9354404833368066e-05, + "loss": 0.9271, + "step": 16280 + }, + { + "epoch": 0.2035300882522063, + "grad_norm": 3.202929735183716, + "learning_rate": 1.935409631599366e-05, + "loss": 1.8057, + "step": 16282 + }, + { + "epoch": 0.20355508887722193, + "grad_norm": 16.861095428466797, + "learning_rate": 1.9353787727379788e-05, + "loss": 0.6772, + "step": 16284 + }, + { + "epoch": 0.20358008950223755, + "grad_norm": 4.226761817932129, + "learning_rate": 1.9353479067528783e-05, + "loss": 0.565, + "step": 16286 + }, + { + "epoch": 0.20360509012725317, + "grad_norm": 4.369072437286377, + "learning_rate": 1.935317033644301e-05, + "loss": 1.0515, + "step": 16288 + }, + { + "epoch": 0.20363009075226882, + "grad_norm": 4.114746570587158, + "learning_rate": 1.9352861534124813e-05, + "loss": 1.8281, + "step": 16290 + }, + { + "epoch": 0.20365509137728444, + "grad_norm": 0.5147926211357117, + "learning_rate": 1.9352552660576544e-05, + "loss": 0.2612, + "step": 16292 + }, + { + "epoch": 0.20368009200230006, + "grad_norm": 1.7917778491973877, + "learning_rate": 1.9352243715800556e-05, + "loss": 0.1958, + "step": 16294 + }, + { + "epoch": 0.20370509262731568, + "grad_norm": 4.230902671813965, + "learning_rate": 1.9351934699799203e-05, + "loss": 1.1579, + "step": 16296 + }, + { + "epoch": 0.2037300932523313, + "grad_norm": 4.236385345458984, + "learning_rate": 1.935162561257484e-05, + "loss": 0.6504, + "step": 16298 + }, + { + "epoch": 0.20375509387734694, + "grad_norm": 4.8948798179626465, + "learning_rate": 1.9351316454129816e-05, + "loss": 0.3572, + "step": 16300 + }, + { + "epoch": 0.20378009450236256, + "grad_norm": 0.023576008155941963, + "learning_rate": 1.9351007224466486e-05, + "loss": 0.4733, + "step": 16302 + }, + { + "epoch": 0.20380509512737818, + "grad_norm": 0.8865755796432495, + "learning_rate": 1.9350697923587215e-05, + "loss": 0.5961, + "step": 16304 + }, + { + "epoch": 0.2038300957523938, + "grad_norm": 2.817354202270508, + "learning_rate": 1.9350388551494347e-05, + "loss": 0.7091, + "step": 16306 + }, + { + "epoch": 0.20385509637740942, + "grad_norm": 3.4514918327331543, + "learning_rate": 1.935007910819024e-05, + "loss": 1.448, + "step": 16308 + }, + { + "epoch": 0.20388009700242507, + "grad_norm": 4.072614669799805, + "learning_rate": 1.9349769593677252e-05, + "loss": 1.2525, + "step": 16310 + }, + { + "epoch": 0.2039050976274407, + "grad_norm": 3.301095962524414, + "learning_rate": 1.9349460007957745e-05, + "loss": 1.9291, + "step": 16312 + }, + { + "epoch": 0.2039300982524563, + "grad_norm": 5.398072719573975, + "learning_rate": 1.934915035103407e-05, + "loss": 0.4208, + "step": 16314 + }, + { + "epoch": 0.20395509887747193, + "grad_norm": 4.0506792068481445, + "learning_rate": 1.9348840622908588e-05, + "loss": 0.7853, + "step": 16316 + }, + { + "epoch": 0.20398009950248755, + "grad_norm": 2.827505350112915, + "learning_rate": 1.934853082358366e-05, + "loss": 1.968, + "step": 16318 + }, + { + "epoch": 0.2040051001275032, + "grad_norm": 2.989234447479248, + "learning_rate": 1.9348220953061642e-05, + "loss": 0.7518, + "step": 16320 + }, + { + "epoch": 0.20403010075251882, + "grad_norm": 8.18559455871582, + "learning_rate": 1.9347911011344897e-05, + "loss": 1.3549, + "step": 16322 + }, + { + "epoch": 0.20405510137753444, + "grad_norm": 4.494256496429443, + "learning_rate": 1.9347600998435778e-05, + "loss": 2.1977, + "step": 16324 + }, + { + "epoch": 0.20408010200255006, + "grad_norm": 0.016794655472040176, + "learning_rate": 1.9347290914336657e-05, + "loss": 0.5602, + "step": 16326 + }, + { + "epoch": 0.20410510262756568, + "grad_norm": 3.122405767440796, + "learning_rate": 1.934698075904989e-05, + "loss": 0.6463, + "step": 16328 + }, + { + "epoch": 0.20413010325258132, + "grad_norm": 2.459519624710083, + "learning_rate": 1.9346670532577837e-05, + "loss": 1.2573, + "step": 16330 + }, + { + "epoch": 0.20415510387759694, + "grad_norm": 2.388803243637085, + "learning_rate": 1.9346360234922862e-05, + "loss": 1.0545, + "step": 16332 + }, + { + "epoch": 0.20418010450261256, + "grad_norm": 11.741652488708496, + "learning_rate": 1.934604986608733e-05, + "loss": 0.8301, + "step": 16334 + }, + { + "epoch": 0.20420510512762818, + "grad_norm": 5.590234279632568, + "learning_rate": 1.9345739426073605e-05, + "loss": 2.1169, + "step": 16336 + }, + { + "epoch": 0.2042301057526438, + "grad_norm": 2.5604536533355713, + "learning_rate": 1.934542891488405e-05, + "loss": 0.6045, + "step": 16338 + }, + { + "epoch": 0.20425510637765945, + "grad_norm": 3.7947301864624023, + "learning_rate": 1.934511833252103e-05, + "loss": 1.555, + "step": 16340 + }, + { + "epoch": 0.20428010700267507, + "grad_norm": 1.896687626838684, + "learning_rate": 1.9344807678986914e-05, + "loss": 1.0663, + "step": 16342 + }, + { + "epoch": 0.2043051076276907, + "grad_norm": 7.340259552001953, + "learning_rate": 1.934449695428406e-05, + "loss": 1.304, + "step": 16344 + }, + { + "epoch": 0.2043301082527063, + "grad_norm": 2.5043129920959473, + "learning_rate": 1.9344186158414838e-05, + "loss": 0.4127, + "step": 16346 + }, + { + "epoch": 0.20435510887772193, + "grad_norm": 4.425262928009033, + "learning_rate": 1.9343875291381615e-05, + "loss": 0.3666, + "step": 16348 + }, + { + "epoch": 0.20438010950273758, + "grad_norm": 2.0605831146240234, + "learning_rate": 1.9343564353186763e-05, + "loss": 1.5885, + "step": 16350 + }, + { + "epoch": 0.2044051101277532, + "grad_norm": 5.911958694458008, + "learning_rate": 1.934325334383264e-05, + "loss": 1.325, + "step": 16352 + }, + { + "epoch": 0.20443011075276882, + "grad_norm": 3.269771099090576, + "learning_rate": 1.934294226332163e-05, + "loss": 1.3137, + "step": 16354 + }, + { + "epoch": 0.20445511137778444, + "grad_norm": 2.193608283996582, + "learning_rate": 1.9342631111656085e-05, + "loss": 0.5079, + "step": 16356 + }, + { + "epoch": 0.20448011200280006, + "grad_norm": 3.9370594024658203, + "learning_rate": 1.9342319888838385e-05, + "loss": 1.0271, + "step": 16358 + }, + { + "epoch": 0.2045051126278157, + "grad_norm": 3.637995719909668, + "learning_rate": 1.9342008594870897e-05, + "loss": 1.3773, + "step": 16360 + }, + { + "epoch": 0.20453011325283132, + "grad_norm": 4.313623905181885, + "learning_rate": 1.9341697229755996e-05, + "loss": 1.7069, + "step": 16362 + }, + { + "epoch": 0.20455511387784694, + "grad_norm": 1.8843785524368286, + "learning_rate": 1.9341385793496047e-05, + "loss": 0.6223, + "step": 16364 + }, + { + "epoch": 0.20458011450286256, + "grad_norm": 3.041025400161743, + "learning_rate": 1.9341074286093427e-05, + "loss": 1.8776, + "step": 16366 + }, + { + "epoch": 0.20460511512787818, + "grad_norm": 3.2984492778778076, + "learning_rate": 1.9340762707550503e-05, + "loss": 2.9609, + "step": 16368 + }, + { + "epoch": 0.20463011575289383, + "grad_norm": 0.03744538873434067, + "learning_rate": 1.9340451057869655e-05, + "loss": 0.3157, + "step": 16370 + }, + { + "epoch": 0.20465511637790945, + "grad_norm": 3.998243570327759, + "learning_rate": 1.934013933705325e-05, + "loss": 1.3694, + "step": 16372 + }, + { + "epoch": 0.20468011700292507, + "grad_norm": 3.8493664264678955, + "learning_rate": 1.9339827545103663e-05, + "loss": 0.91, + "step": 16374 + }, + { + "epoch": 0.2047051176279407, + "grad_norm": 1.7960370779037476, + "learning_rate": 1.9339515682023273e-05, + "loss": 0.1482, + "step": 16376 + }, + { + "epoch": 0.2047301182529563, + "grad_norm": 4.659246444702148, + "learning_rate": 1.9339203747814453e-05, + "loss": 1.9791, + "step": 16378 + }, + { + "epoch": 0.20475511887797196, + "grad_norm": 5.593038558959961, + "learning_rate": 1.9338891742479575e-05, + "loss": 2.1641, + "step": 16380 + }, + { + "epoch": 0.20478011950298758, + "grad_norm": 3.2076892852783203, + "learning_rate": 1.933857966602102e-05, + "loss": 0.6821, + "step": 16382 + }, + { + "epoch": 0.2048051201280032, + "grad_norm": 0.471113383769989, + "learning_rate": 1.9338267518441163e-05, + "loss": 0.5503, + "step": 16384 + }, + { + "epoch": 0.20483012075301882, + "grad_norm": 3.0731239318847656, + "learning_rate": 1.933795529974238e-05, + "loss": 0.9334, + "step": 16386 + }, + { + "epoch": 0.20485512137803444, + "grad_norm": 3.0731117725372314, + "learning_rate": 1.933764300992705e-05, + "loss": 0.9845, + "step": 16388 + }, + { + "epoch": 0.20488012200305009, + "grad_norm": 0.04870719835162163, + "learning_rate": 1.9337330648997553e-05, + "loss": 0.0443, + "step": 16390 + }, + { + "epoch": 0.2049051226280657, + "grad_norm": 5.482316970825195, + "learning_rate": 1.9337018216956265e-05, + "loss": 1.773, + "step": 16392 + }, + { + "epoch": 0.20493012325308133, + "grad_norm": 3.4558334350585938, + "learning_rate": 1.9336705713805565e-05, + "loss": 1.0706, + "step": 16394 + }, + { + "epoch": 0.20495512387809695, + "grad_norm": 5.509994029998779, + "learning_rate": 1.933639313954784e-05, + "loss": 2.0511, + "step": 16396 + }, + { + "epoch": 0.20498012450311257, + "grad_norm": 2.159050703048706, + "learning_rate": 1.9336080494185457e-05, + "loss": 1.2845, + "step": 16398 + }, + { + "epoch": 0.2050051251281282, + "grad_norm": 18.019466400146484, + "learning_rate": 1.9335767777720812e-05, + "loss": 2.0548, + "step": 16400 + }, + { + "epoch": 0.20503012575314383, + "grad_norm": 2.8277111053466797, + "learning_rate": 1.9335454990156276e-05, + "loss": 0.4211, + "step": 16402 + }, + { + "epoch": 0.20505512637815945, + "grad_norm": 0.20458349585533142, + "learning_rate": 1.9335142131494237e-05, + "loss": 1.1395, + "step": 16404 + }, + { + "epoch": 0.20508012700317507, + "grad_norm": 4.044463157653809, + "learning_rate": 1.9334829201737074e-05, + "loss": 0.6283, + "step": 16406 + }, + { + "epoch": 0.2051051276281907, + "grad_norm": 5.001949787139893, + "learning_rate": 1.9334516200887175e-05, + "loss": 1.532, + "step": 16408 + }, + { + "epoch": 0.20513012825320634, + "grad_norm": 1.9294692277908325, + "learning_rate": 1.933420312894692e-05, + "loss": 1.0393, + "step": 16410 + }, + { + "epoch": 0.20515512887822196, + "grad_norm": 2.6143505573272705, + "learning_rate": 1.933388998591869e-05, + "loss": 0.647, + "step": 16412 + }, + { + "epoch": 0.20518012950323758, + "grad_norm": 5.036990642547607, + "learning_rate": 1.933357677180488e-05, + "loss": 0.5991, + "step": 16414 + }, + { + "epoch": 0.2052051301282532, + "grad_norm": 2.9862911701202393, + "learning_rate": 1.9333263486607864e-05, + "loss": 0.9639, + "step": 16416 + }, + { + "epoch": 0.20523013075326882, + "grad_norm": 5.296509265899658, + "learning_rate": 1.933295013033003e-05, + "loss": 2.0275, + "step": 16418 + }, + { + "epoch": 0.20525513137828447, + "grad_norm": 3.351611852645874, + "learning_rate": 1.9332636702973775e-05, + "loss": 1.146, + "step": 16420 + }, + { + "epoch": 0.2052801320033001, + "grad_norm": 2.502923011779785, + "learning_rate": 1.9332323204541476e-05, + "loss": 0.2147, + "step": 16422 + }, + { + "epoch": 0.2053051326283157, + "grad_norm": 3.3414859771728516, + "learning_rate": 1.9332009635035524e-05, + "loss": 0.92, + "step": 16424 + }, + { + "epoch": 0.20533013325333133, + "grad_norm": 3.1733524799346924, + "learning_rate": 1.9331695994458303e-05, + "loss": 0.6833, + "step": 16426 + }, + { + "epoch": 0.20535513387834695, + "grad_norm": 4.012197017669678, + "learning_rate": 1.933138228281221e-05, + "loss": 1.1975, + "step": 16428 + }, + { + "epoch": 0.2053801345033626, + "grad_norm": 2.5342423915863037, + "learning_rate": 1.9331068500099627e-05, + "loss": 1.0669, + "step": 16430 + }, + { + "epoch": 0.2054051351283782, + "grad_norm": 2.478361129760742, + "learning_rate": 1.9330754646322943e-05, + "loss": 1.3285, + "step": 16432 + }, + { + "epoch": 0.20543013575339383, + "grad_norm": 7.344250679016113, + "learning_rate": 1.9330440721484555e-05, + "loss": 1.5307, + "step": 16434 + }, + { + "epoch": 0.20545513637840945, + "grad_norm": 2.840263605117798, + "learning_rate": 1.9330126725586845e-05, + "loss": 1.2302, + "step": 16436 + }, + { + "epoch": 0.20548013700342507, + "grad_norm": 4.520957946777344, + "learning_rate": 1.9329812658632215e-05, + "loss": 0.2751, + "step": 16438 + }, + { + "epoch": 0.20550513762844072, + "grad_norm": 4.454472541809082, + "learning_rate": 1.9329498520623048e-05, + "loss": 1.1986, + "step": 16440 + }, + { + "epoch": 0.20553013825345634, + "grad_norm": 5.506472110748291, + "learning_rate": 1.932918431156174e-05, + "loss": 1.2747, + "step": 16442 + }, + { + "epoch": 0.20555513887847196, + "grad_norm": 4.203255653381348, + "learning_rate": 1.9328870031450684e-05, + "loss": 1.0478, + "step": 16444 + }, + { + "epoch": 0.20558013950348758, + "grad_norm": 3.316240072250366, + "learning_rate": 1.9328555680292272e-05, + "loss": 1.4109, + "step": 16446 + }, + { + "epoch": 0.2056051401285032, + "grad_norm": 3.6821444034576416, + "learning_rate": 1.93282412580889e-05, + "loss": 0.9589, + "step": 16448 + }, + { + "epoch": 0.20563014075351885, + "grad_norm": 6.09835958480835, + "learning_rate": 1.932792676484296e-05, + "loss": 1.1239, + "step": 16450 + }, + { + "epoch": 0.20565514137853447, + "grad_norm": 4.42420768737793, + "learning_rate": 1.932761220055685e-05, + "loss": 1.1673, + "step": 16452 + }, + { + "epoch": 0.2056801420035501, + "grad_norm": 2.263615608215332, + "learning_rate": 1.932729756523297e-05, + "loss": 0.7131, + "step": 16454 + }, + { + "epoch": 0.2057051426285657, + "grad_norm": 4.6684184074401855, + "learning_rate": 1.9326982858873703e-05, + "loss": 1.3889, + "step": 16456 + }, + { + "epoch": 0.20573014325358133, + "grad_norm": 3.790792465209961, + "learning_rate": 1.9326668081481457e-05, + "loss": 1.6102, + "step": 16458 + }, + { + "epoch": 0.20575514387859697, + "grad_norm": 4.734850883483887, + "learning_rate": 1.9326353233058625e-05, + "loss": 1.2254, + "step": 16460 + }, + { + "epoch": 0.2057801445036126, + "grad_norm": 11.313741683959961, + "learning_rate": 1.9326038313607604e-05, + "loss": 2.5617, + "step": 16462 + }, + { + "epoch": 0.20580514512862821, + "grad_norm": 2.9200730323791504, + "learning_rate": 1.9325723323130797e-05, + "loss": 1.7627, + "step": 16464 + }, + { + "epoch": 0.20583014575364383, + "grad_norm": 1.3456676006317139, + "learning_rate": 1.93254082616306e-05, + "loss": 0.5401, + "step": 16466 + }, + { + "epoch": 0.20585514637865945, + "grad_norm": 4.521666049957275, + "learning_rate": 1.932509312910941e-05, + "loss": 1.9534, + "step": 16468 + }, + { + "epoch": 0.2058801470036751, + "grad_norm": 3.3518354892730713, + "learning_rate": 1.9324777925569627e-05, + "loss": 1.3597, + "step": 16470 + }, + { + "epoch": 0.20590514762869072, + "grad_norm": 2.332796573638916, + "learning_rate": 1.932446265101366e-05, + "loss": 1.2461, + "step": 16472 + }, + { + "epoch": 0.20593014825370634, + "grad_norm": 1.1985862255096436, + "learning_rate": 1.93241473054439e-05, + "loss": 0.7171, + "step": 16474 + }, + { + "epoch": 0.20595514887872196, + "grad_norm": 4.013763427734375, + "learning_rate": 1.9323831888862756e-05, + "loss": 0.6413, + "step": 16476 + }, + { + "epoch": 0.20598014950373758, + "grad_norm": 5.890068054199219, + "learning_rate": 1.9323516401272625e-05, + "loss": 0.6819, + "step": 16478 + }, + { + "epoch": 0.20600515012875323, + "grad_norm": 2.9372808933258057, + "learning_rate": 1.932320084267591e-05, + "loss": 1.3562, + "step": 16480 + }, + { + "epoch": 0.20603015075376885, + "grad_norm": 4.624879360198975, + "learning_rate": 1.932288521307502e-05, + "loss": 0.9955, + "step": 16482 + }, + { + "epoch": 0.20605515137878447, + "grad_norm": 5.288248538970947, + "learning_rate": 1.932256951247235e-05, + "loss": 0.8913, + "step": 16484 + }, + { + "epoch": 0.2060801520038001, + "grad_norm": 3.6783065795898438, + "learning_rate": 1.932225374087031e-05, + "loss": 1.5276, + "step": 16486 + }, + { + "epoch": 0.2061051526288157, + "grad_norm": 4.411109924316406, + "learning_rate": 1.9321937898271305e-05, + "loss": 1.2587, + "step": 16488 + }, + { + "epoch": 0.20613015325383136, + "grad_norm": 2.78015398979187, + "learning_rate": 1.9321621984677738e-05, + "loss": 0.8556, + "step": 16490 + }, + { + "epoch": 0.20615515387884698, + "grad_norm": 2.300790786743164, + "learning_rate": 1.932130600009202e-05, + "loss": 0.2144, + "step": 16492 + }, + { + "epoch": 0.2061801545038626, + "grad_norm": 2.5189592838287354, + "learning_rate": 1.932098994451655e-05, + "loss": 0.5099, + "step": 16494 + }, + { + "epoch": 0.20620515512887821, + "grad_norm": 6.639420986175537, + "learning_rate": 1.932067381795374e-05, + "loss": 1.5384, + "step": 16496 + }, + { + "epoch": 0.20623015575389383, + "grad_norm": 3.4388535022735596, + "learning_rate": 1.9320357620405994e-05, + "loss": 0.7184, + "step": 16498 + }, + { + "epoch": 0.20625515637890948, + "grad_norm": 2.2826507091522217, + "learning_rate": 1.9320041351875725e-05, + "loss": 0.5399, + "step": 16500 + }, + { + "epoch": 0.2062801570039251, + "grad_norm": 3.2201406955718994, + "learning_rate": 1.931972501236534e-05, + "loss": 1.2532, + "step": 16502 + }, + { + "epoch": 0.20630515762894072, + "grad_norm": 5.5740861892700195, + "learning_rate": 1.9319408601877243e-05, + "loss": 2.2617, + "step": 16504 + }, + { + "epoch": 0.20633015825395634, + "grad_norm": 2.8010141849517822, + "learning_rate": 1.9319092120413852e-05, + "loss": 0.4839, + "step": 16506 + }, + { + "epoch": 0.20635515887897196, + "grad_norm": 7.861551761627197, + "learning_rate": 1.931877556797757e-05, + "loss": 1.1754, + "step": 16508 + }, + { + "epoch": 0.2063801595039876, + "grad_norm": 3.772998332977295, + "learning_rate": 1.9318458944570814e-05, + "loss": 0.587, + "step": 16510 + }, + { + "epoch": 0.20640516012900323, + "grad_norm": 4.847893238067627, + "learning_rate": 1.9318142250195988e-05, + "loss": 2.1762, + "step": 16512 + }, + { + "epoch": 0.20643016075401885, + "grad_norm": 2.277730703353882, + "learning_rate": 1.931782548485551e-05, + "loss": 0.9687, + "step": 16514 + }, + { + "epoch": 0.20645516137903447, + "grad_norm": 3.0383479595184326, + "learning_rate": 1.9317508648551794e-05, + "loss": 0.5828, + "step": 16516 + }, + { + "epoch": 0.2064801620040501, + "grad_norm": 4.4286627769470215, + "learning_rate": 1.9317191741287247e-05, + "loss": 1.2122, + "step": 16518 + }, + { + "epoch": 0.20650516262906574, + "grad_norm": 4.824170112609863, + "learning_rate": 1.9316874763064288e-05, + "loss": 0.9795, + "step": 16520 + }, + { + "epoch": 0.20653016325408136, + "grad_norm": 1.8040317296981812, + "learning_rate": 1.9316557713885327e-05, + "loss": 0.8289, + "step": 16522 + }, + { + "epoch": 0.20655516387909698, + "grad_norm": 6.191468715667725, + "learning_rate": 1.931624059375278e-05, + "loss": 0.3889, + "step": 16524 + }, + { + "epoch": 0.2065801645041126, + "grad_norm": 2.9438068866729736, + "learning_rate": 1.931592340266906e-05, + "loss": 1.2906, + "step": 16526 + }, + { + "epoch": 0.20660516512912822, + "grad_norm": 4.497733116149902, + "learning_rate": 1.9315606140636585e-05, + "loss": 1.0617, + "step": 16528 + }, + { + "epoch": 0.20663016575414386, + "grad_norm": 5.104039669036865, + "learning_rate": 1.9315288807657772e-05, + "loss": 0.8647, + "step": 16530 + }, + { + "epoch": 0.20665516637915948, + "grad_norm": 3.639504909515381, + "learning_rate": 1.931497140373504e-05, + "loss": 0.8264, + "step": 16532 + }, + { + "epoch": 0.2066801670041751, + "grad_norm": 3.727499485015869, + "learning_rate": 1.9314653928870795e-05, + "loss": 1.1218, + "step": 16534 + }, + { + "epoch": 0.20670516762919072, + "grad_norm": 3.077237606048584, + "learning_rate": 1.931433638306747e-05, + "loss": 0.6925, + "step": 16536 + }, + { + "epoch": 0.20673016825420634, + "grad_norm": 4.043461322784424, + "learning_rate": 1.9314018766327476e-05, + "loss": 1.4447, + "step": 16538 + }, + { + "epoch": 0.206755168879222, + "grad_norm": 0.29965320229530334, + "learning_rate": 1.931370107865323e-05, + "loss": 0.9067, + "step": 16540 + }, + { + "epoch": 0.2067801695042376, + "grad_norm": 1.7639678716659546, + "learning_rate": 1.931338332004715e-05, + "loss": 0.7704, + "step": 16542 + }, + { + "epoch": 0.20680517012925323, + "grad_norm": 6.831397533416748, + "learning_rate": 1.9313065490511668e-05, + "loss": 0.9884, + "step": 16544 + }, + { + "epoch": 0.20683017075426885, + "grad_norm": 3.9665985107421875, + "learning_rate": 1.9312747590049193e-05, + "loss": 0.7583, + "step": 16546 + }, + { + "epoch": 0.20685517137928447, + "grad_norm": 3.4576663970947266, + "learning_rate": 1.9312429618662146e-05, + "loss": 1.3004, + "step": 16548 + }, + { + "epoch": 0.20688017200430012, + "grad_norm": 0.6690617203712463, + "learning_rate": 1.9312111576352955e-05, + "loss": 0.8416, + "step": 16550 + }, + { + "epoch": 0.20690517262931574, + "grad_norm": 2.5121023654937744, + "learning_rate": 1.9311793463124037e-05, + "loss": 0.1783, + "step": 16552 + }, + { + "epoch": 0.20693017325433136, + "grad_norm": 2.282076120376587, + "learning_rate": 1.9311475278977817e-05, + "loss": 1.2171, + "step": 16554 + }, + { + "epoch": 0.20695517387934698, + "grad_norm": 3.219137191772461, + "learning_rate": 1.931115702391672e-05, + "loss": 1.3865, + "step": 16556 + }, + { + "epoch": 0.2069801745043626, + "grad_norm": 5.542757987976074, + "learning_rate": 1.9310838697943167e-05, + "loss": 0.4189, + "step": 16558 + }, + { + "epoch": 0.20700517512937824, + "grad_norm": 0.05516421049833298, + "learning_rate": 1.9310520301059584e-05, + "loss": 0.2331, + "step": 16560 + }, + { + "epoch": 0.20703017575439386, + "grad_norm": 3.8405301570892334, + "learning_rate": 1.9310201833268394e-05, + "loss": 1.4374, + "step": 16562 + }, + { + "epoch": 0.20705517637940948, + "grad_norm": 4.772418022155762, + "learning_rate": 1.9309883294572024e-05, + "loss": 1.6953, + "step": 16564 + }, + { + "epoch": 0.2070801770044251, + "grad_norm": 0.030455728992819786, + "learning_rate": 1.9309564684972895e-05, + "loss": 0.111, + "step": 16566 + }, + { + "epoch": 0.20710517762944072, + "grad_norm": 0.019422423094511032, + "learning_rate": 1.9309246004473446e-05, + "loss": 0.5944, + "step": 16568 + }, + { + "epoch": 0.20713017825445637, + "grad_norm": 3.5541412830352783, + "learning_rate": 1.930892725307609e-05, + "loss": 0.7881, + "step": 16570 + }, + { + "epoch": 0.207155178879472, + "grad_norm": 5.542299270629883, + "learning_rate": 1.930860843078326e-05, + "loss": 1.1237, + "step": 16572 + }, + { + "epoch": 0.2071801795044876, + "grad_norm": 1.7304681539535522, + "learning_rate": 1.9308289537597388e-05, + "loss": 0.0436, + "step": 16574 + }, + { + "epoch": 0.20720518012950323, + "grad_norm": 3.954159736633301, + "learning_rate": 1.93079705735209e-05, + "loss": 1.2655, + "step": 16576 + }, + { + "epoch": 0.20723018075451885, + "grad_norm": 3.350813865661621, + "learning_rate": 1.9307651538556217e-05, + "loss": 0.2944, + "step": 16578 + }, + { + "epoch": 0.2072551813795345, + "grad_norm": 2.770599842071533, + "learning_rate": 1.9307332432705783e-05, + "loss": 0.3934, + "step": 16580 + }, + { + "epoch": 0.20728018200455012, + "grad_norm": 4.185618877410889, + "learning_rate": 1.930701325597202e-05, + "loss": 1.0677, + "step": 16582 + }, + { + "epoch": 0.20730518262956574, + "grad_norm": 3.73455548286438, + "learning_rate": 1.930669400835736e-05, + "loss": 1.482, + "step": 16584 + }, + { + "epoch": 0.20733018325458136, + "grad_norm": 2.430006980895996, + "learning_rate": 1.9306374689864237e-05, + "loss": 1.4946, + "step": 16586 + }, + { + "epoch": 0.20735518387959698, + "grad_norm": 1.5161306858062744, + "learning_rate": 1.9306055300495074e-05, + "loss": 0.7197, + "step": 16588 + }, + { + "epoch": 0.20738018450461262, + "grad_norm": 4.214519500732422, + "learning_rate": 1.9305735840252313e-05, + "loss": 1.968, + "step": 16590 + }, + { + "epoch": 0.20740518512962824, + "grad_norm": 5.261515140533447, + "learning_rate": 1.9305416309138386e-05, + "loss": 1.4612, + "step": 16592 + }, + { + "epoch": 0.20743018575464386, + "grad_norm": 2.0978891849517822, + "learning_rate": 1.930509670715572e-05, + "loss": 1.3324, + "step": 16594 + }, + { + "epoch": 0.20745518637965948, + "grad_norm": 1.9335519075393677, + "learning_rate": 1.9304777034306758e-05, + "loss": 1.9008, + "step": 16596 + }, + { + "epoch": 0.2074801870046751, + "grad_norm": 3.411552667617798, + "learning_rate": 1.9304457290593928e-05, + "loss": 1.7951, + "step": 16598 + }, + { + "epoch": 0.20750518762969075, + "grad_norm": 0.2018096148967743, + "learning_rate": 1.9304137476019667e-05, + "loss": 0.4586, + "step": 16600 + }, + { + "epoch": 0.20753018825470637, + "grad_norm": 1.9552196264266968, + "learning_rate": 1.930381759058641e-05, + "loss": 1.1201, + "step": 16602 + }, + { + "epoch": 0.207555188879722, + "grad_norm": 2.737076759338379, + "learning_rate": 1.930349763429659e-05, + "loss": 1.734, + "step": 16604 + }, + { + "epoch": 0.2075801895047376, + "grad_norm": 2.0097386837005615, + "learning_rate": 1.9303177607152656e-05, + "loss": 0.2205, + "step": 16606 + }, + { + "epoch": 0.20760519012975323, + "grad_norm": 4.126959800720215, + "learning_rate": 1.9302857509157034e-05, + "loss": 0.9573, + "step": 16608 + }, + { + "epoch": 0.20763019075476888, + "grad_norm": 3.7416725158691406, + "learning_rate": 1.9302537340312164e-05, + "loss": 0.8483, + "step": 16610 + }, + { + "epoch": 0.2076551913797845, + "grad_norm": 6.652001857757568, + "learning_rate": 1.9302217100620483e-05, + "loss": 0.211, + "step": 16612 + }, + { + "epoch": 0.20768019200480012, + "grad_norm": 9.092864036560059, + "learning_rate": 1.9301896790084437e-05, + "loss": 1.3596, + "step": 16614 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 4.161346912384033, + "learning_rate": 1.9301576408706454e-05, + "loss": 1.3935, + "step": 16616 + }, + { + "epoch": 0.20773019325483136, + "grad_norm": 2.43403959274292, + "learning_rate": 1.9301255956488985e-05, + "loss": 0.8921, + "step": 16618 + }, + { + "epoch": 0.207755193879847, + "grad_norm": 0.030448364093899727, + "learning_rate": 1.9300935433434463e-05, + "loss": 0.5636, + "step": 16620 + }, + { + "epoch": 0.20778019450486263, + "grad_norm": 5.133103847503662, + "learning_rate": 1.9300614839545336e-05, + "loss": 0.8569, + "step": 16622 + }, + { + "epoch": 0.20780519512987825, + "grad_norm": 2.439314126968384, + "learning_rate": 1.930029417482404e-05, + "loss": 0.9371, + "step": 16624 + }, + { + "epoch": 0.20783019575489386, + "grad_norm": 2.975980043411255, + "learning_rate": 1.9299973439273015e-05, + "loss": 0.6555, + "step": 16626 + }, + { + "epoch": 0.20785519637990948, + "grad_norm": 3.130481243133545, + "learning_rate": 1.9299652632894713e-05, + "loss": 0.6704, + "step": 16628 + }, + { + "epoch": 0.20788019700492513, + "grad_norm": 4.851682662963867, + "learning_rate": 1.929933175569157e-05, + "loss": 2.3287, + "step": 16630 + }, + { + "epoch": 0.20790519762994075, + "grad_norm": 3.1598246097564697, + "learning_rate": 1.929901080766603e-05, + "loss": 0.2654, + "step": 16632 + }, + { + "epoch": 0.20793019825495637, + "grad_norm": 2.0802133083343506, + "learning_rate": 1.929868978882054e-05, + "loss": 1.399, + "step": 16634 + }, + { + "epoch": 0.207955198879972, + "grad_norm": 4.03174352645874, + "learning_rate": 1.9298368699157543e-05, + "loss": 1.6937, + "step": 16636 + }, + { + "epoch": 0.2079801995049876, + "grad_norm": 3.4346630573272705, + "learning_rate": 1.9298047538679482e-05, + "loss": 0.8504, + "step": 16638 + }, + { + "epoch": 0.20800520013000326, + "grad_norm": 2.6579885482788086, + "learning_rate": 1.929772630738881e-05, + "loss": 1.4385, + "step": 16640 + }, + { + "epoch": 0.20803020075501888, + "grad_norm": 9.940733909606934, + "learning_rate": 1.929740500528797e-05, + "loss": 0.9224, + "step": 16642 + }, + { + "epoch": 0.2080552013800345, + "grad_norm": 2.689626693725586, + "learning_rate": 1.92970836323794e-05, + "loss": 1.2882, + "step": 16644 + }, + { + "epoch": 0.20808020200505012, + "grad_norm": 3.355466365814209, + "learning_rate": 1.9296762188665565e-05, + "loss": 0.4411, + "step": 16646 + }, + { + "epoch": 0.20810520263006574, + "grad_norm": 9.68507194519043, + "learning_rate": 1.92964406741489e-05, + "loss": 2.9882, + "step": 16648 + }, + { + "epoch": 0.2081302032550814, + "grad_norm": 0.02015337161719799, + "learning_rate": 1.929611908883186e-05, + "loss": 0.0316, + "step": 16650 + }, + { + "epoch": 0.208155203880097, + "grad_norm": 2.551311731338501, + "learning_rate": 1.929579743271689e-05, + "loss": 0.4397, + "step": 16652 + }, + { + "epoch": 0.20818020450511263, + "grad_norm": 0.5571892857551575, + "learning_rate": 1.9295475705806443e-05, + "loss": 1.0869, + "step": 16654 + }, + { + "epoch": 0.20820520513012825, + "grad_norm": 1.2553635835647583, + "learning_rate": 1.929515390810296e-05, + "loss": 0.7017, + "step": 16656 + }, + { + "epoch": 0.20823020575514387, + "grad_norm": 3.5359013080596924, + "learning_rate": 1.929483203960891e-05, + "loss": 1.2009, + "step": 16658 + }, + { + "epoch": 0.2082552063801595, + "grad_norm": 2.666121482849121, + "learning_rate": 1.9294510100326723e-05, + "loss": 1.5216, + "step": 16660 + }, + { + "epoch": 0.20828020700517513, + "grad_norm": 4.418664932250977, + "learning_rate": 1.9294188090258868e-05, + "loss": 0.6623, + "step": 16662 + }, + { + "epoch": 0.20830520763019075, + "grad_norm": 0.472517192363739, + "learning_rate": 1.929386600940779e-05, + "loss": 0.1036, + "step": 16664 + }, + { + "epoch": 0.20833020825520637, + "grad_norm": 0.5806611776351929, + "learning_rate": 1.9293543857775943e-05, + "loss": 0.3091, + "step": 16666 + }, + { + "epoch": 0.208355208880222, + "grad_norm": 3.235873222351074, + "learning_rate": 1.929322163536578e-05, + "loss": 1.8717, + "step": 16668 + }, + { + "epoch": 0.20838020950523764, + "grad_norm": 2.911848306655884, + "learning_rate": 1.9292899342179756e-05, + "loss": 1.5844, + "step": 16670 + }, + { + "epoch": 0.20840521013025326, + "grad_norm": 2.9898440837860107, + "learning_rate": 1.9292576978220322e-05, + "loss": 0.7742, + "step": 16672 + }, + { + "epoch": 0.20843021075526888, + "grad_norm": 3.5556373596191406, + "learning_rate": 1.9292254543489938e-05, + "loss": 1.4126, + "step": 16674 + }, + { + "epoch": 0.2084552113802845, + "grad_norm": 4.201451301574707, + "learning_rate": 1.9291932037991056e-05, + "loss": 1.0862, + "step": 16676 + }, + { + "epoch": 0.20848021200530012, + "grad_norm": 0.010458925738930702, + "learning_rate": 1.9291609461726135e-05, + "loss": 0.7482, + "step": 16678 + }, + { + "epoch": 0.20850521263031577, + "grad_norm": 5.723374366760254, + "learning_rate": 1.9291286814697628e-05, + "loss": 1.2971, + "step": 16680 + }, + { + "epoch": 0.2085302132553314, + "grad_norm": 3.9859459400177, + "learning_rate": 1.9290964096908e-05, + "loss": 1.3008, + "step": 16682 + }, + { + "epoch": 0.208555213880347, + "grad_norm": 0.3129290044307709, + "learning_rate": 1.9290641308359695e-05, + "loss": 0.0229, + "step": 16684 + }, + { + "epoch": 0.20858021450536263, + "grad_norm": 0.04318195953965187, + "learning_rate": 1.9290318449055188e-05, + "loss": 0.3556, + "step": 16686 + }, + { + "epoch": 0.20860521513037825, + "grad_norm": 1.6877235174179077, + "learning_rate": 1.9289995518996924e-05, + "loss": 0.6908, + "step": 16688 + }, + { + "epoch": 0.2086302157553939, + "grad_norm": 1.5242339372634888, + "learning_rate": 1.928967251818737e-05, + "loss": 0.7888, + "step": 16690 + }, + { + "epoch": 0.20865521638040951, + "grad_norm": 3.7527997493743896, + "learning_rate": 1.9289349446628983e-05, + "loss": 0.6385, + "step": 16692 + }, + { + "epoch": 0.20868021700542513, + "grad_norm": 3.484449625015259, + "learning_rate": 1.9289026304324222e-05, + "loss": 0.5542, + "step": 16694 + }, + { + "epoch": 0.20870521763044075, + "grad_norm": 1.9726659059524536, + "learning_rate": 1.9288703091275554e-05, + "loss": 0.0724, + "step": 16696 + }, + { + "epoch": 0.20873021825545637, + "grad_norm": 3.5697319507598877, + "learning_rate": 1.9288379807485435e-05, + "loss": 1.084, + "step": 16698 + }, + { + "epoch": 0.20875521888047202, + "grad_norm": 1.3526735305786133, + "learning_rate": 1.9288056452956332e-05, + "loss": 0.5758, + "step": 16700 + }, + { + "epoch": 0.20878021950548764, + "grad_norm": 4.0144476890563965, + "learning_rate": 1.92877330276907e-05, + "loss": 2.0397, + "step": 16702 + }, + { + "epoch": 0.20880522013050326, + "grad_norm": 5.528491020202637, + "learning_rate": 1.928740953169101e-05, + "loss": 1.4925, + "step": 16704 + }, + { + "epoch": 0.20883022075551888, + "grad_norm": 2.4836466312408447, + "learning_rate": 1.9287085964959723e-05, + "loss": 0.5203, + "step": 16706 + }, + { + "epoch": 0.2088552213805345, + "grad_norm": 4.329421043395996, + "learning_rate": 1.9286762327499298e-05, + "loss": 1.4971, + "step": 16708 + }, + { + "epoch": 0.20888022200555015, + "grad_norm": 1.1853617429733276, + "learning_rate": 1.9286438619312208e-05, + "loss": 0.1572, + "step": 16710 + }, + { + "epoch": 0.20890522263056577, + "grad_norm": 4.845726490020752, + "learning_rate": 1.9286114840400913e-05, + "loss": 2.0724, + "step": 16712 + }, + { + "epoch": 0.2089302232555814, + "grad_norm": 4.337953090667725, + "learning_rate": 1.928579099076788e-05, + "loss": 1.0591, + "step": 16714 + }, + { + "epoch": 0.208955223880597, + "grad_norm": 3.157764434814453, + "learning_rate": 1.9285467070415577e-05, + "loss": 0.652, + "step": 16716 + }, + { + "epoch": 0.20898022450561263, + "grad_norm": 2.127964973449707, + "learning_rate": 1.9285143079346472e-05, + "loss": 0.3699, + "step": 16718 + }, + { + "epoch": 0.20900522513062827, + "grad_norm": 4.231760025024414, + "learning_rate": 1.928481901756303e-05, + "loss": 1.38, + "step": 16720 + }, + { + "epoch": 0.2090302257556439, + "grad_norm": 1.0047746896743774, + "learning_rate": 1.9284494885067717e-05, + "loss": 0.0806, + "step": 16722 + }, + { + "epoch": 0.20905522638065951, + "grad_norm": 7.107082843780518, + "learning_rate": 1.9284170681863005e-05, + "loss": 0.7332, + "step": 16724 + }, + { + "epoch": 0.20908022700567513, + "grad_norm": 4.287851810455322, + "learning_rate": 1.9283846407951358e-05, + "loss": 1.6629, + "step": 16726 + }, + { + "epoch": 0.20910522763069075, + "grad_norm": 0.015676021575927734, + "learning_rate": 1.9283522063335255e-05, + "loss": 0.0069, + "step": 16728 + }, + { + "epoch": 0.2091302282557064, + "grad_norm": 3.222309112548828, + "learning_rate": 1.928319764801716e-05, + "loss": 0.9022, + "step": 16730 + }, + { + "epoch": 0.20915522888072202, + "grad_norm": 2.3595151901245117, + "learning_rate": 1.928287316199954e-05, + "loss": 0.8311, + "step": 16732 + }, + { + "epoch": 0.20918022950573764, + "grad_norm": 5.033219814300537, + "learning_rate": 1.9282548605284877e-05, + "loss": 1.5382, + "step": 16734 + }, + { + "epoch": 0.20920523013075326, + "grad_norm": 2.9656968116760254, + "learning_rate": 1.9282223977875634e-05, + "loss": 0.0786, + "step": 16736 + }, + { + "epoch": 0.20923023075576888, + "grad_norm": 0.3080686330795288, + "learning_rate": 1.9281899279774284e-05, + "loss": 0.4423, + "step": 16738 + }, + { + "epoch": 0.20925523138078453, + "grad_norm": 2.847562313079834, + "learning_rate": 1.9281574510983302e-05, + "loss": 1.1217, + "step": 16740 + }, + { + "epoch": 0.20928023200580015, + "grad_norm": 3.0510647296905518, + "learning_rate": 1.928124967150516e-05, + "loss": 0.965, + "step": 16742 + }, + { + "epoch": 0.20930523263081577, + "grad_norm": 4.050568580627441, + "learning_rate": 1.928092476134234e-05, + "loss": 1.0908, + "step": 16744 + }, + { + "epoch": 0.2093302332558314, + "grad_norm": 5.153210163116455, + "learning_rate": 1.9280599780497303e-05, + "loss": 0.9615, + "step": 16746 + }, + { + "epoch": 0.209355233880847, + "grad_norm": 3.8734164237976074, + "learning_rate": 1.928027472897253e-05, + "loss": 1.9962, + "step": 16748 + }, + { + "epoch": 0.20938023450586266, + "grad_norm": 2.3658714294433594, + "learning_rate": 1.92799496067705e-05, + "loss": 1.4473, + "step": 16750 + }, + { + "epoch": 0.20940523513087828, + "grad_norm": 2.815696954727173, + "learning_rate": 1.9279624413893688e-05, + "loss": 0.5327, + "step": 16752 + }, + { + "epoch": 0.2094302357558939, + "grad_norm": 5.425656318664551, + "learning_rate": 1.9279299150344564e-05, + "loss": 0.4469, + "step": 16754 + }, + { + "epoch": 0.20945523638090952, + "grad_norm": 1.5496701002120972, + "learning_rate": 1.9278973816125612e-05, + "loss": 0.7105, + "step": 16756 + }, + { + "epoch": 0.20948023700592514, + "grad_norm": 2.9034619331359863, + "learning_rate": 1.9278648411239307e-05, + "loss": 0.9599, + "step": 16758 + }, + { + "epoch": 0.20950523763094078, + "grad_norm": 3.0299875736236572, + "learning_rate": 1.927832293568813e-05, + "loss": 0.3958, + "step": 16760 + }, + { + "epoch": 0.2095302382559564, + "grad_norm": 0.020365426316857338, + "learning_rate": 1.9277997389474557e-05, + "loss": 0.001, + "step": 16762 + }, + { + "epoch": 0.20955523888097202, + "grad_norm": 4.865665912628174, + "learning_rate": 1.9277671772601066e-05, + "loss": 1.4297, + "step": 16764 + }, + { + "epoch": 0.20958023950598764, + "grad_norm": 0.021829111501574516, + "learning_rate": 1.927734608507014e-05, + "loss": 0.4888, + "step": 16766 + }, + { + "epoch": 0.20960524013100326, + "grad_norm": 3.4224679470062256, + "learning_rate": 1.9277020326884255e-05, + "loss": 0.1891, + "step": 16768 + }, + { + "epoch": 0.2096302407560189, + "grad_norm": 1.7875568866729736, + "learning_rate": 1.9276694498045898e-05, + "loss": 0.5872, + "step": 16770 + }, + { + "epoch": 0.20965524138103453, + "grad_norm": 5.4943413734436035, + "learning_rate": 1.927636859855755e-05, + "loss": 0.7489, + "step": 16772 + }, + { + "epoch": 0.20968024200605015, + "grad_norm": 4.067590713500977, + "learning_rate": 1.9276042628421687e-05, + "loss": 1.3224, + "step": 16774 + }, + { + "epoch": 0.20970524263106577, + "grad_norm": 2.9978151321411133, + "learning_rate": 1.9275716587640792e-05, + "loss": 0.4754, + "step": 16776 + }, + { + "epoch": 0.2097302432560814, + "grad_norm": 1.317478060722351, + "learning_rate": 1.927539047621736e-05, + "loss": 0.5346, + "step": 16778 + }, + { + "epoch": 0.20975524388109704, + "grad_norm": 7.469034194946289, + "learning_rate": 1.9275064294153857e-05, + "loss": 0.9588, + "step": 16780 + }, + { + "epoch": 0.20978024450611266, + "grad_norm": 3.4389359951019287, + "learning_rate": 1.927473804145278e-05, + "loss": 1.2818, + "step": 16782 + }, + { + "epoch": 0.20980524513112828, + "grad_norm": 0.27818968892097473, + "learning_rate": 1.927441171811661e-05, + "loss": 0.6972, + "step": 16784 + }, + { + "epoch": 0.2098302457561439, + "grad_norm": 1.9987995624542236, + "learning_rate": 1.9274085324147835e-05, + "loss": 0.5095, + "step": 16786 + }, + { + "epoch": 0.20985524638115952, + "grad_norm": 1.6821677684783936, + "learning_rate": 1.9273758859548933e-05, + "loss": 0.6503, + "step": 16788 + }, + { + "epoch": 0.20988024700617516, + "grad_norm": 2.2714779376983643, + "learning_rate": 1.9273432324322395e-05, + "loss": 0.8958, + "step": 16790 + }, + { + "epoch": 0.20990524763119078, + "grad_norm": 0.019566437229514122, + "learning_rate": 1.9273105718470707e-05, + "loss": 0.2517, + "step": 16792 + }, + { + "epoch": 0.2099302482562064, + "grad_norm": 3.799008846282959, + "learning_rate": 1.927277904199636e-05, + "loss": 0.8627, + "step": 16794 + }, + { + "epoch": 0.20995524888122202, + "grad_norm": 0.025982387363910675, + "learning_rate": 1.9272452294901836e-05, + "loss": 0.2281, + "step": 16796 + }, + { + "epoch": 0.20998024950623764, + "grad_norm": 2.319723129272461, + "learning_rate": 1.927212547718963e-05, + "loss": 0.1622, + "step": 16798 + }, + { + "epoch": 0.2100052501312533, + "grad_norm": 2.4644460678100586, + "learning_rate": 1.9271798588862226e-05, + "loss": 0.4177, + "step": 16800 + }, + { + "epoch": 0.2100302507562689, + "grad_norm": 3.313595771789551, + "learning_rate": 1.9271471629922117e-05, + "loss": 0.7081, + "step": 16802 + }, + { + "epoch": 0.21005525138128453, + "grad_norm": 3.7706806659698486, + "learning_rate": 1.9271144600371786e-05, + "loss": 1.0414, + "step": 16804 + }, + { + "epoch": 0.21008025200630015, + "grad_norm": 7.425693511962891, + "learning_rate": 1.9270817500213733e-05, + "loss": 0.8228, + "step": 16806 + }, + { + "epoch": 0.21010525263131577, + "grad_norm": 3.8443963527679443, + "learning_rate": 1.9270490329450443e-05, + "loss": 1.8193, + "step": 16808 + }, + { + "epoch": 0.21013025325633142, + "grad_norm": 4.6550188064575195, + "learning_rate": 1.9270163088084412e-05, + "loss": 1.7252, + "step": 16810 + }, + { + "epoch": 0.21015525388134704, + "grad_norm": 3.1983697414398193, + "learning_rate": 1.926983577611813e-05, + "loss": 2.1218, + "step": 16812 + }, + { + "epoch": 0.21018025450636266, + "grad_norm": 3.2935006618499756, + "learning_rate": 1.9269508393554087e-05, + "loss": 0.6399, + "step": 16814 + }, + { + "epoch": 0.21020525513137828, + "grad_norm": 2.598647356033325, + "learning_rate": 1.9269180940394783e-05, + "loss": 1.2671, + "step": 16816 + }, + { + "epoch": 0.2102302557563939, + "grad_norm": 3.9124560356140137, + "learning_rate": 1.9268853416642702e-05, + "loss": 1.4957, + "step": 16818 + }, + { + "epoch": 0.21025525638140954, + "grad_norm": 2.3946938514709473, + "learning_rate": 1.926852582230035e-05, + "loss": 1.4161, + "step": 16820 + }, + { + "epoch": 0.21028025700642516, + "grad_norm": 2.522717237472534, + "learning_rate": 1.926819815737021e-05, + "loss": 1.6576, + "step": 16822 + }, + { + "epoch": 0.21030525763144078, + "grad_norm": 2.6020724773406982, + "learning_rate": 1.926787042185479e-05, + "loss": 0.5533, + "step": 16824 + }, + { + "epoch": 0.2103302582564564, + "grad_norm": 0.9543086290359497, + "learning_rate": 1.9267542615756574e-05, + "loss": 0.2431, + "step": 16826 + }, + { + "epoch": 0.21035525888147202, + "grad_norm": 0.22882293164730072, + "learning_rate": 1.9267214739078065e-05, + "loss": 0.0024, + "step": 16828 + }, + { + "epoch": 0.21038025950648767, + "grad_norm": 0.01774434931576252, + "learning_rate": 1.926688679182176e-05, + "loss": 0.9068, + "step": 16830 + }, + { + "epoch": 0.2104052601315033, + "grad_norm": 2.922475576400757, + "learning_rate": 1.9266558773990157e-05, + "loss": 0.6272, + "step": 16832 + }, + { + "epoch": 0.2104302607565189, + "grad_norm": 4.822674751281738, + "learning_rate": 1.926623068558575e-05, + "loss": 1.6026, + "step": 16834 + }, + { + "epoch": 0.21045526138153453, + "grad_norm": 3.430365800857544, + "learning_rate": 1.9265902526611042e-05, + "loss": 0.9939, + "step": 16836 + }, + { + "epoch": 0.21048026200655015, + "grad_norm": 2.4088661670684814, + "learning_rate": 1.9265574297068532e-05, + "loss": 1.2256, + "step": 16838 + }, + { + "epoch": 0.2105052626315658, + "grad_norm": 3.2205419540405273, + "learning_rate": 1.9265245996960716e-05, + "loss": 0.6197, + "step": 16840 + }, + { + "epoch": 0.21053026325658142, + "grad_norm": 4.364763259887695, + "learning_rate": 1.92649176262901e-05, + "loss": 1.6447, + "step": 16842 + }, + { + "epoch": 0.21055526388159704, + "grad_norm": 3.503856897354126, + "learning_rate": 1.9264589185059177e-05, + "loss": 1.1372, + "step": 16844 + }, + { + "epoch": 0.21058026450661266, + "grad_norm": 3.699324607849121, + "learning_rate": 1.9264260673270455e-05, + "loss": 1.1144, + "step": 16846 + }, + { + "epoch": 0.21060526513162828, + "grad_norm": 3.7051620483398438, + "learning_rate": 1.9263932090926433e-05, + "loss": 1.0547, + "step": 16848 + }, + { + "epoch": 0.21063026575664393, + "grad_norm": 1.5408036708831787, + "learning_rate": 1.9263603438029615e-05, + "loss": 0.3912, + "step": 16850 + }, + { + "epoch": 0.21065526638165954, + "grad_norm": 1.1041591167449951, + "learning_rate": 1.9263274714582505e-05, + "loss": 0.3762, + "step": 16852 + }, + { + "epoch": 0.21068026700667516, + "grad_norm": 3.129547119140625, + "learning_rate": 1.9262945920587602e-05, + "loss": 1.0728, + "step": 16854 + }, + { + "epoch": 0.21070526763169078, + "grad_norm": 1.1632877588272095, + "learning_rate": 1.9262617056047415e-05, + "loss": 0.6834, + "step": 16856 + }, + { + "epoch": 0.2107302682567064, + "grad_norm": 2.2740790843963623, + "learning_rate": 1.9262288120964444e-05, + "loss": 0.3502, + "step": 16858 + }, + { + "epoch": 0.21075526888172205, + "grad_norm": 3.227757215499878, + "learning_rate": 1.92619591153412e-05, + "loss": 0.7939, + "step": 16860 + }, + { + "epoch": 0.21078026950673767, + "grad_norm": 0.3369571566581726, + "learning_rate": 1.926163003918018e-05, + "loss": 0.0089, + "step": 16862 + }, + { + "epoch": 0.2108052701317533, + "grad_norm": 2.7573864459991455, + "learning_rate": 1.92613008924839e-05, + "loss": 0.9035, + "step": 16864 + }, + { + "epoch": 0.2108302707567689, + "grad_norm": 0.012451072223484516, + "learning_rate": 1.926097167525486e-05, + "loss": 0.6046, + "step": 16866 + }, + { + "epoch": 0.21085527138178453, + "grad_norm": 0.398653507232666, + "learning_rate": 1.926064238749557e-05, + "loss": 1.0567, + "step": 16868 + }, + { + "epoch": 0.21088027200680018, + "grad_norm": 0.4189889132976532, + "learning_rate": 1.9260313029208533e-05, + "loss": 0.4615, + "step": 16870 + }, + { + "epoch": 0.2109052726318158, + "grad_norm": 0.00826784037053585, + "learning_rate": 1.9259983600396267e-05, + "loss": 0.5836, + "step": 16872 + }, + { + "epoch": 0.21093027325683142, + "grad_norm": 4.185727596282959, + "learning_rate": 1.9259654101061275e-05, + "loss": 1.6372, + "step": 16874 + }, + { + "epoch": 0.21095527388184704, + "grad_norm": 5.361883640289307, + "learning_rate": 1.9259324531206063e-05, + "loss": 1.341, + "step": 16876 + }, + { + "epoch": 0.21098027450686266, + "grad_norm": 5.929994583129883, + "learning_rate": 1.925899489083315e-05, + "loss": 1.6056, + "step": 16878 + }, + { + "epoch": 0.2110052751318783, + "grad_norm": 4.166484832763672, + "learning_rate": 1.9258665179945038e-05, + "loss": 1.2361, + "step": 16880 + }, + { + "epoch": 0.21103027575689393, + "grad_norm": 0.018021395429968834, + "learning_rate": 1.9258335398544242e-05, + "loss": 0.4116, + "step": 16882 + }, + { + "epoch": 0.21105527638190955, + "grad_norm": 3.74409556388855, + "learning_rate": 1.9258005546633273e-05, + "loss": 1.228, + "step": 16884 + }, + { + "epoch": 0.21108027700692517, + "grad_norm": 2.400137424468994, + "learning_rate": 1.925767562421464e-05, + "loss": 0.455, + "step": 16886 + }, + { + "epoch": 0.21110527763194079, + "grad_norm": 3.287611484527588, + "learning_rate": 1.925734563129086e-05, + "loss": 0.6514, + "step": 16888 + }, + { + "epoch": 0.21113027825695643, + "grad_norm": 2.7262661457061768, + "learning_rate": 1.9257015567864447e-05, + "loss": 0.5825, + "step": 16890 + }, + { + "epoch": 0.21115527888197205, + "grad_norm": 3.1310267448425293, + "learning_rate": 1.925668543393791e-05, + "loss": 1.3031, + "step": 16892 + }, + { + "epoch": 0.21118027950698767, + "grad_norm": 1.3660845756530762, + "learning_rate": 1.9256355229513766e-05, + "loss": 0.3726, + "step": 16894 + }, + { + "epoch": 0.2112052801320033, + "grad_norm": 8.552738189697266, + "learning_rate": 1.9256024954594533e-05, + "loss": 1.2131, + "step": 16896 + }, + { + "epoch": 0.2112302807570189, + "grad_norm": 0.013921793550252914, + "learning_rate": 1.925569460918272e-05, + "loss": 0.6852, + "step": 16898 + }, + { + "epoch": 0.21125528138203456, + "grad_norm": 4.039567470550537, + "learning_rate": 1.9255364193280847e-05, + "loss": 1.5526, + "step": 16900 + }, + { + "epoch": 0.21128028200705018, + "grad_norm": 5.102224826812744, + "learning_rate": 1.925503370689143e-05, + "loss": 1.2207, + "step": 16902 + }, + { + "epoch": 0.2113052826320658, + "grad_norm": 2.9372994899749756, + "learning_rate": 1.925470315001698e-05, + "loss": 1.6627, + "step": 16904 + }, + { + "epoch": 0.21133028325708142, + "grad_norm": 5.7535319328308105, + "learning_rate": 1.9254372522660027e-05, + "loss": 1.3837, + "step": 16906 + }, + { + "epoch": 0.21135528388209704, + "grad_norm": 2.7232794761657715, + "learning_rate": 1.9254041824823077e-05, + "loss": 1.3901, + "step": 16908 + }, + { + "epoch": 0.2113802845071127, + "grad_norm": 6.179012775421143, + "learning_rate": 1.9253711056508653e-05, + "loss": 1.8635, + "step": 16910 + }, + { + "epoch": 0.2114052851321283, + "grad_norm": 6.333560466766357, + "learning_rate": 1.9253380217719275e-05, + "loss": 0.2688, + "step": 16912 + }, + { + "epoch": 0.21143028575714393, + "grad_norm": 0.008729846216738224, + "learning_rate": 1.925304930845746e-05, + "loss": 0.3884, + "step": 16914 + }, + { + "epoch": 0.21145528638215955, + "grad_norm": 4.591287136077881, + "learning_rate": 1.9252718328725734e-05, + "loss": 0.7498, + "step": 16916 + }, + { + "epoch": 0.21148028700717517, + "grad_norm": 2.852884531021118, + "learning_rate": 1.925238727852661e-05, + "loss": 0.5476, + "step": 16918 + }, + { + "epoch": 0.2115052876321908, + "grad_norm": 2.1891720294952393, + "learning_rate": 1.9252056157862612e-05, + "loss": 1.1461, + "step": 16920 + }, + { + "epoch": 0.21153028825720643, + "grad_norm": 2.142735004425049, + "learning_rate": 1.9251724966736266e-05, + "loss": 0.1767, + "step": 16922 + }, + { + "epoch": 0.21155528888222205, + "grad_norm": 1.4454652070999146, + "learning_rate": 1.925139370515009e-05, + "loss": 0.0959, + "step": 16924 + }, + { + "epoch": 0.21158028950723767, + "grad_norm": 3.9867804050445557, + "learning_rate": 1.9251062373106608e-05, + "loss": 1.4826, + "step": 16926 + }, + { + "epoch": 0.2116052901322533, + "grad_norm": 4.023599624633789, + "learning_rate": 1.9250730970608343e-05, + "loss": 1.667, + "step": 16928 + }, + { + "epoch": 0.21163029075726894, + "grad_norm": 3.90594744682312, + "learning_rate": 1.9250399497657818e-05, + "loss": 1.3448, + "step": 16930 + }, + { + "epoch": 0.21165529138228456, + "grad_norm": 3.1408815383911133, + "learning_rate": 1.9250067954257557e-05, + "loss": 1.2738, + "step": 16932 + }, + { + "epoch": 0.21168029200730018, + "grad_norm": 3.114816427230835, + "learning_rate": 1.924973634041009e-05, + "loss": 1.337, + "step": 16934 + }, + { + "epoch": 0.2117052926323158, + "grad_norm": 3.548114776611328, + "learning_rate": 1.924940465611794e-05, + "loss": 1.6442, + "step": 16936 + }, + { + "epoch": 0.21173029325733142, + "grad_norm": 2.5640134811401367, + "learning_rate": 1.924907290138363e-05, + "loss": 0.6258, + "step": 16938 + }, + { + "epoch": 0.21175529388234707, + "grad_norm": 5.096097946166992, + "learning_rate": 1.924874107620969e-05, + "loss": 1.1634, + "step": 16940 + }, + { + "epoch": 0.2117802945073627, + "grad_norm": 1.8228338956832886, + "learning_rate": 1.9248409180598644e-05, + "loss": 0.911, + "step": 16942 + }, + { + "epoch": 0.2118052951323783, + "grad_norm": 0.5008156299591064, + "learning_rate": 1.9248077214553022e-05, + "loss": 0.6226, + "step": 16944 + }, + { + "epoch": 0.21183029575739393, + "grad_norm": 2.6828675270080566, + "learning_rate": 1.9247745178075353e-05, + "loss": 0.9726, + "step": 16946 + }, + { + "epoch": 0.21185529638240955, + "grad_norm": 2.5742149353027344, + "learning_rate": 1.9247413071168163e-05, + "loss": 0.8416, + "step": 16948 + }, + { + "epoch": 0.2118802970074252, + "grad_norm": 0.5138269066810608, + "learning_rate": 1.9247080893833984e-05, + "loss": 0.644, + "step": 16950 + }, + { + "epoch": 0.21190529763244081, + "grad_norm": 4.838881015777588, + "learning_rate": 1.924674864607534e-05, + "loss": 2.6368, + "step": 16952 + }, + { + "epoch": 0.21193029825745643, + "grad_norm": 2.243603229522705, + "learning_rate": 1.924641632789477e-05, + "loss": 0.183, + "step": 16954 + }, + { + "epoch": 0.21195529888247205, + "grad_norm": 4.910800933837891, + "learning_rate": 1.92460839392948e-05, + "loss": 0.8645, + "step": 16956 + }, + { + "epoch": 0.21198029950748767, + "grad_norm": 4.629169940948486, + "learning_rate": 1.9245751480277965e-05, + "loss": 0.8416, + "step": 16958 + }, + { + "epoch": 0.21200530013250332, + "grad_norm": 2.891692638397217, + "learning_rate": 1.9245418950846792e-05, + "loss": 0.5804, + "step": 16960 + }, + { + "epoch": 0.21203030075751894, + "grad_norm": 2.7307236194610596, + "learning_rate": 1.9245086351003817e-05, + "loss": 1.5648, + "step": 16962 + }, + { + "epoch": 0.21205530138253456, + "grad_norm": 0.34605178236961365, + "learning_rate": 1.9244753680751568e-05, + "loss": 1.07, + "step": 16964 + }, + { + "epoch": 0.21208030200755018, + "grad_norm": 0.5256044268608093, + "learning_rate": 1.9244420940092584e-05, + "loss": 0.0056, + "step": 16966 + }, + { + "epoch": 0.2121053026325658, + "grad_norm": 4.287721157073975, + "learning_rate": 1.9244088129029398e-05, + "loss": 1.3578, + "step": 16968 + }, + { + "epoch": 0.21213030325758145, + "grad_norm": 3.5784213542938232, + "learning_rate": 1.9243755247564546e-05, + "loss": 0.7525, + "step": 16970 + }, + { + "epoch": 0.21215530388259707, + "grad_norm": 3.1349880695343018, + "learning_rate": 1.9243422295700557e-05, + "loss": 0.9127, + "step": 16972 + }, + { + "epoch": 0.2121803045076127, + "grad_norm": 3.4083075523376465, + "learning_rate": 1.9243089273439978e-05, + "loss": 1.1121, + "step": 16974 + }, + { + "epoch": 0.2122053051326283, + "grad_norm": 0.04256806522607803, + "learning_rate": 1.9242756180785333e-05, + "loss": 1.0545, + "step": 16976 + }, + { + "epoch": 0.21223030575764393, + "grad_norm": 1.1082299947738647, + "learning_rate": 1.9242423017739164e-05, + "loss": 0.4509, + "step": 16978 + }, + { + "epoch": 0.21225530638265958, + "grad_norm": 3.234769344329834, + "learning_rate": 1.9242089784304005e-05, + "loss": 0.9002, + "step": 16980 + }, + { + "epoch": 0.2122803070076752, + "grad_norm": 1.9004364013671875, + "learning_rate": 1.92417564804824e-05, + "loss": 0.6541, + "step": 16982 + }, + { + "epoch": 0.21230530763269082, + "grad_norm": 0.0077029261738061905, + "learning_rate": 1.924142310627689e-05, + "loss": 0.3948, + "step": 16984 + }, + { + "epoch": 0.21233030825770643, + "grad_norm": 4.191351890563965, + "learning_rate": 1.9241089661690003e-05, + "loss": 0.6833, + "step": 16986 + }, + { + "epoch": 0.21235530888272205, + "grad_norm": 4.195827960968018, + "learning_rate": 1.9240756146724284e-05, + "loss": 1.8004, + "step": 16988 + }, + { + "epoch": 0.2123803095077377, + "grad_norm": 4.02821159362793, + "learning_rate": 1.9240422561382273e-05, + "loss": 1.3711, + "step": 16990 + }, + { + "epoch": 0.21240531013275332, + "grad_norm": 5.312283039093018, + "learning_rate": 1.924008890566651e-05, + "loss": 1.5788, + "step": 16992 + }, + { + "epoch": 0.21243031075776894, + "grad_norm": 2.4638073444366455, + "learning_rate": 1.923975517957954e-05, + "loss": 1.244, + "step": 16994 + }, + { + "epoch": 0.21245531138278456, + "grad_norm": 0.01044053677469492, + "learning_rate": 1.92394213831239e-05, + "loss": 0.2785, + "step": 16996 + }, + { + "epoch": 0.21248031200780018, + "grad_norm": 2.1954703330993652, + "learning_rate": 1.923908751630213e-05, + "loss": 1.2924, + "step": 16998 + }, + { + "epoch": 0.21250531263281583, + "grad_norm": 2.0735862255096436, + "learning_rate": 1.923875357911678e-05, + "loss": 0.9152, + "step": 17000 + }, + { + "epoch": 0.21253031325783145, + "grad_norm": 3.2478272914886475, + "learning_rate": 1.9238419571570386e-05, + "loss": 1.2864, + "step": 17002 + }, + { + "epoch": 0.21255531388284707, + "grad_norm": 3.494084358215332, + "learning_rate": 1.9238085493665496e-05, + "loss": 0.7844, + "step": 17004 + }, + { + "epoch": 0.2125803145078627, + "grad_norm": 1.9106929302215576, + "learning_rate": 1.9237751345404654e-05, + "loss": 0.2988, + "step": 17006 + }, + { + "epoch": 0.2126053151328783, + "grad_norm": 0.015851236879825592, + "learning_rate": 1.9237417126790407e-05, + "loss": 0.6381, + "step": 17008 + }, + { + "epoch": 0.21263031575789396, + "grad_norm": 2.4833314418792725, + "learning_rate": 1.9237082837825292e-05, + "loss": 1.1316, + "step": 17010 + }, + { + "epoch": 0.21265531638290958, + "grad_norm": 2.7802324295043945, + "learning_rate": 1.9236748478511863e-05, + "loss": 1.5941, + "step": 17012 + }, + { + "epoch": 0.2126803170079252, + "grad_norm": 5.210480690002441, + "learning_rate": 1.9236414048852662e-05, + "loss": 1.106, + "step": 17014 + }, + { + "epoch": 0.21270531763294082, + "grad_norm": 3.72121000289917, + "learning_rate": 1.923607954885024e-05, + "loss": 0.5788, + "step": 17016 + }, + { + "epoch": 0.21273031825795644, + "grad_norm": 0.3202141523361206, + "learning_rate": 1.923574497850714e-05, + "loss": 0.1254, + "step": 17018 + }, + { + "epoch": 0.21275531888297208, + "grad_norm": 2.409619092941284, + "learning_rate": 1.9235410337825914e-05, + "loss": 0.6255, + "step": 17020 + }, + { + "epoch": 0.2127803195079877, + "grad_norm": 3.9460930824279785, + "learning_rate": 1.9235075626809107e-05, + "loss": 0.7124, + "step": 17022 + }, + { + "epoch": 0.21280532013300332, + "grad_norm": 3.100761651992798, + "learning_rate": 1.9234740845459275e-05, + "loss": 1.5673, + "step": 17024 + }, + { + "epoch": 0.21283032075801894, + "grad_norm": 4.049243927001953, + "learning_rate": 1.9234405993778956e-05, + "loss": 0.9558, + "step": 17026 + }, + { + "epoch": 0.21285532138303456, + "grad_norm": 5.957705020904541, + "learning_rate": 1.923407107177071e-05, + "loss": 1.5194, + "step": 17028 + }, + { + "epoch": 0.2128803220080502, + "grad_norm": 3.389321804046631, + "learning_rate": 1.9233736079437088e-05, + "loss": 1.0473, + "step": 17030 + }, + { + "epoch": 0.21290532263306583, + "grad_norm": 0.007952542044222355, + "learning_rate": 1.923340101678063e-05, + "loss": 0.5341, + "step": 17032 + }, + { + "epoch": 0.21293032325808145, + "grad_norm": 2.75184965133667, + "learning_rate": 1.9233065883803903e-05, + "loss": 0.3289, + "step": 17034 + }, + { + "epoch": 0.21295532388309707, + "grad_norm": 3.7091546058654785, + "learning_rate": 1.923273068050945e-05, + "loss": 1.0817, + "step": 17036 + }, + { + "epoch": 0.2129803245081127, + "grad_norm": 4.931692600250244, + "learning_rate": 1.9232395406899824e-05, + "loss": 1.3565, + "step": 17038 + }, + { + "epoch": 0.21300532513312834, + "grad_norm": 0.4461209774017334, + "learning_rate": 1.923206006297758e-05, + "loss": 0.8536, + "step": 17040 + }, + { + "epoch": 0.21303032575814396, + "grad_norm": 4.617480278015137, + "learning_rate": 1.9231724648745272e-05, + "loss": 1.1819, + "step": 17042 + }, + { + "epoch": 0.21305532638315958, + "grad_norm": 3.5429611206054688, + "learning_rate": 1.9231389164205455e-05, + "loss": 0.9866, + "step": 17044 + }, + { + "epoch": 0.2130803270081752, + "grad_norm": 1.2498914003372192, + "learning_rate": 1.9231053609360684e-05, + "loss": 0.5542, + "step": 17046 + }, + { + "epoch": 0.21310532763319082, + "grad_norm": 4.210245132446289, + "learning_rate": 1.9230717984213516e-05, + "loss": 1.073, + "step": 17048 + }, + { + "epoch": 0.21313032825820646, + "grad_norm": 0.05494815483689308, + "learning_rate": 1.9230382288766504e-05, + "loss": 1.29, + "step": 17050 + }, + { + "epoch": 0.21315532888322208, + "grad_norm": 4.6763916015625, + "learning_rate": 1.9230046523022204e-05, + "loss": 0.9855, + "step": 17052 + }, + { + "epoch": 0.2131803295082377, + "grad_norm": 3.1479148864746094, + "learning_rate": 1.9229710686983176e-05, + "loss": 0.746, + "step": 17054 + }, + { + "epoch": 0.21320533013325332, + "grad_norm": 2.2836852073669434, + "learning_rate": 1.9229374780651976e-05, + "loss": 0.2438, + "step": 17056 + }, + { + "epoch": 0.21323033075826894, + "grad_norm": 1.58704674243927, + "learning_rate": 1.922903880403116e-05, + "loss": 0.7497, + "step": 17058 + }, + { + "epoch": 0.2132553313832846, + "grad_norm": 1.4527561664581299, + "learning_rate": 1.9228702757123297e-05, + "loss": 0.7029, + "step": 17060 + }, + { + "epoch": 0.2132803320083002, + "grad_norm": 2.745206356048584, + "learning_rate": 1.9228366639930933e-05, + "loss": 0.957, + "step": 17062 + }, + { + "epoch": 0.21330533263331583, + "grad_norm": 0.009712304919958115, + "learning_rate": 1.9228030452456637e-05, + "loss": 0.7594, + "step": 17064 + }, + { + "epoch": 0.21333033325833145, + "grad_norm": 3.2727630138397217, + "learning_rate": 1.9227694194702963e-05, + "loss": 0.3145, + "step": 17066 + }, + { + "epoch": 0.21335533388334707, + "grad_norm": 3.2623207569122314, + "learning_rate": 1.9227357866672477e-05, + "loss": 0.9377, + "step": 17068 + }, + { + "epoch": 0.21338033450836272, + "grad_norm": 4.153512001037598, + "learning_rate": 1.9227021468367737e-05, + "loss": 1.9987, + "step": 17070 + }, + { + "epoch": 0.21340533513337834, + "grad_norm": 6.612986087799072, + "learning_rate": 1.922668499979131e-05, + "loss": 1.1936, + "step": 17072 + }, + { + "epoch": 0.21343033575839396, + "grad_norm": 4.086963176727295, + "learning_rate": 1.922634846094575e-05, + "loss": 1.877, + "step": 17074 + }, + { + "epoch": 0.21345533638340958, + "grad_norm": 3.40181040763855, + "learning_rate": 1.9226011851833627e-05, + "loss": 0.6732, + "step": 17076 + }, + { + "epoch": 0.2134803370084252, + "grad_norm": 2.905816078186035, + "learning_rate": 1.9225675172457503e-05, + "loss": 1.4092, + "step": 17078 + }, + { + "epoch": 0.21350533763344084, + "grad_norm": 2.910778045654297, + "learning_rate": 1.922533842281994e-05, + "loss": 0.581, + "step": 17080 + }, + { + "epoch": 0.21353033825845646, + "grad_norm": 5.193574905395508, + "learning_rate": 1.9225001602923506e-05, + "loss": 1.1974, + "step": 17082 + }, + { + "epoch": 0.21355533888347208, + "grad_norm": 2.5497565269470215, + "learning_rate": 1.922466471277076e-05, + "loss": 1.0914, + "step": 17084 + }, + { + "epoch": 0.2135803395084877, + "grad_norm": 3.5192127227783203, + "learning_rate": 1.9224327752364278e-05, + "loss": 1.2937, + "step": 17086 + }, + { + "epoch": 0.21360534013350332, + "grad_norm": 0.013187641277909279, + "learning_rate": 1.9223990721706617e-05, + "loss": 1.8342, + "step": 17088 + }, + { + "epoch": 0.21363034075851897, + "grad_norm": 1.7855552434921265, + "learning_rate": 1.9223653620800346e-05, + "loss": 1.5665, + "step": 17090 + }, + { + "epoch": 0.2136553413835346, + "grad_norm": 0.7376680970191956, + "learning_rate": 1.9223316449648036e-05, + "loss": 0.1408, + "step": 17092 + }, + { + "epoch": 0.2136803420085502, + "grad_norm": 1.135969638824463, + "learning_rate": 1.922297920825225e-05, + "loss": 0.0486, + "step": 17094 + }, + { + "epoch": 0.21370534263356583, + "grad_norm": 3.393454074859619, + "learning_rate": 1.9222641896615562e-05, + "loss": 1.1686, + "step": 17096 + }, + { + "epoch": 0.21373034325858145, + "grad_norm": 0.650736927986145, + "learning_rate": 1.9222304514740534e-05, + "loss": 0.6647, + "step": 17098 + }, + { + "epoch": 0.2137553438835971, + "grad_norm": 2.4534237384796143, + "learning_rate": 1.922196706262974e-05, + "loss": 0.477, + "step": 17100 + }, + { + "epoch": 0.21378034450861272, + "grad_norm": 3.604264974594116, + "learning_rate": 1.9221629540285747e-05, + "loss": 1.0109, + "step": 17102 + }, + { + "epoch": 0.21380534513362834, + "grad_norm": 5.0535478591918945, + "learning_rate": 1.922129194771113e-05, + "loss": 0.4882, + "step": 17104 + }, + { + "epoch": 0.21383034575864396, + "grad_norm": 3.905618667602539, + "learning_rate": 1.9220954284908458e-05, + "loss": 1.2191, + "step": 17106 + }, + { + "epoch": 0.21385534638365958, + "grad_norm": 4.252909183502197, + "learning_rate": 1.9220616551880303e-05, + "loss": 1.0266, + "step": 17108 + }, + { + "epoch": 0.21388034700867523, + "grad_norm": 0.16959448158740997, + "learning_rate": 1.9220278748629236e-05, + "loss": 0.5321, + "step": 17110 + }, + { + "epoch": 0.21390534763369085, + "grad_norm": 0.007739747408777475, + "learning_rate": 1.9219940875157824e-05, + "loss": 0.6167, + "step": 17112 + }, + { + "epoch": 0.21393034825870647, + "grad_norm": 7.428134918212891, + "learning_rate": 1.9219602931468654e-05, + "loss": 0.2573, + "step": 17114 + }, + { + "epoch": 0.21395534888372209, + "grad_norm": 0.014616766013205051, + "learning_rate": 1.9219264917564285e-05, + "loss": 0.5914, + "step": 17116 + }, + { + "epoch": 0.2139803495087377, + "grad_norm": 5.663690090179443, + "learning_rate": 1.9218926833447306e-05, + "loss": 1.6558, + "step": 17118 + }, + { + "epoch": 0.21400535013375335, + "grad_norm": 1.9005519151687622, + "learning_rate": 1.9218588679120276e-05, + "loss": 0.3667, + "step": 17120 + }, + { + "epoch": 0.21403035075876897, + "grad_norm": 0.008882730267941952, + "learning_rate": 1.9218250454585783e-05, + "loss": 0.0005, + "step": 17122 + }, + { + "epoch": 0.2140553513837846, + "grad_norm": 0.6808550357818604, + "learning_rate": 1.9217912159846397e-05, + "loss": 0.9697, + "step": 17124 + }, + { + "epoch": 0.2140803520088002, + "grad_norm": 0.011367506347596645, + "learning_rate": 1.9217573794904693e-05, + "loss": 0.0634, + "step": 17126 + }, + { + "epoch": 0.21410535263381583, + "grad_norm": 0.007716040126979351, + "learning_rate": 1.9217235359763253e-05, + "loss": 0.0014, + "step": 17128 + }, + { + "epoch": 0.21413035325883148, + "grad_norm": 4.691359519958496, + "learning_rate": 1.9216896854424653e-05, + "loss": 1.2739, + "step": 17130 + }, + { + "epoch": 0.2141553538838471, + "grad_norm": 2.866955280303955, + "learning_rate": 1.921655827889147e-05, + "loss": 1.1182, + "step": 17132 + }, + { + "epoch": 0.21418035450886272, + "grad_norm": 0.11906374245882034, + "learning_rate": 1.921621963316628e-05, + "loss": 0.9606, + "step": 17134 + }, + { + "epoch": 0.21420535513387834, + "grad_norm": 7.029892921447754, + "learning_rate": 1.9215880917251666e-05, + "loss": 1.573, + "step": 17136 + }, + { + "epoch": 0.21423035575889396, + "grad_norm": 2.1890475749969482, + "learning_rate": 1.9215542131150205e-05, + "loss": 1.2181, + "step": 17138 + }, + { + "epoch": 0.2142553563839096, + "grad_norm": 2.3432412147521973, + "learning_rate": 1.921520327486448e-05, + "loss": 0.5894, + "step": 17140 + }, + { + "epoch": 0.21428035700892523, + "grad_norm": 0.006032585632055998, + "learning_rate": 1.9214864348397068e-05, + "loss": 0.0406, + "step": 17142 + }, + { + "epoch": 0.21430535763394085, + "grad_norm": 0.7952998280525208, + "learning_rate": 1.9214525351750556e-05, + "loss": 0.5267, + "step": 17144 + }, + { + "epoch": 0.21433035825895647, + "grad_norm": 3.7179276943206787, + "learning_rate": 1.921418628492752e-05, + "loss": 2.0156, + "step": 17146 + }, + { + "epoch": 0.21435535888397209, + "grad_norm": 2.7202162742614746, + "learning_rate": 1.921384714793054e-05, + "loss": 0.3941, + "step": 17148 + }, + { + "epoch": 0.21438035950898773, + "grad_norm": 1.7478654384613037, + "learning_rate": 1.921350794076221e-05, + "loss": 0.4103, + "step": 17150 + }, + { + "epoch": 0.21440536013400335, + "grad_norm": 0.011700310744345188, + "learning_rate": 1.9213168663425102e-05, + "loss": 0.6379, + "step": 17152 + }, + { + "epoch": 0.21443036075901897, + "grad_norm": 2.766407012939453, + "learning_rate": 1.9212829315921805e-05, + "loss": 0.6876, + "step": 17154 + }, + { + "epoch": 0.2144553613840346, + "grad_norm": 3.8253252506256104, + "learning_rate": 1.9212489898254906e-05, + "loss": 0.6332, + "step": 17156 + }, + { + "epoch": 0.2144803620090502, + "grad_norm": 0.006819684989750385, + "learning_rate": 1.921215041042698e-05, + "loss": 0.7392, + "step": 17158 + }, + { + "epoch": 0.21450536263406586, + "grad_norm": 2.423529624938965, + "learning_rate": 1.9211810852440624e-05, + "loss": 0.3856, + "step": 17160 + }, + { + "epoch": 0.21453036325908148, + "grad_norm": 0.7734041810035706, + "learning_rate": 1.921147122429842e-05, + "loss": 0.0179, + "step": 17162 + }, + { + "epoch": 0.2145553638840971, + "grad_norm": 0.008783121593296528, + "learning_rate": 1.9211131526002952e-05, + "loss": 0.9938, + "step": 17164 + }, + { + "epoch": 0.21458036450911272, + "grad_norm": 5.77821683883667, + "learning_rate": 1.9210791757556806e-05, + "loss": 2.4815, + "step": 17166 + }, + { + "epoch": 0.21460536513412834, + "grad_norm": 0.009296281263232231, + "learning_rate": 1.9210451918962576e-05, + "loss": 0.7425, + "step": 17168 + }, + { + "epoch": 0.214630365759144, + "grad_norm": 10.068684577941895, + "learning_rate": 1.9210112010222848e-05, + "loss": 0.6984, + "step": 17170 + }, + { + "epoch": 0.2146553663841596, + "grad_norm": 5.207919597625732, + "learning_rate": 1.9209772031340206e-05, + "loss": 0.2453, + "step": 17172 + }, + { + "epoch": 0.21468036700917523, + "grad_norm": 13.198558807373047, + "learning_rate": 1.9209431982317242e-05, + "loss": 2.0347, + "step": 17174 + }, + { + "epoch": 0.21470536763419085, + "grad_norm": 3.2370119094848633, + "learning_rate": 1.920909186315655e-05, + "loss": 0.681, + "step": 17176 + }, + { + "epoch": 0.21473036825920647, + "grad_norm": 3.030244827270508, + "learning_rate": 1.9208751673860714e-05, + "loss": 1.3452, + "step": 17178 + }, + { + "epoch": 0.21475536888422211, + "grad_norm": 2.735018730163574, + "learning_rate": 1.920841141443233e-05, + "loss": 1.9518, + "step": 17180 + }, + { + "epoch": 0.21478036950923773, + "grad_norm": 5.472278118133545, + "learning_rate": 1.9208071084873986e-05, + "loss": 0.6742, + "step": 17182 + }, + { + "epoch": 0.21480537013425335, + "grad_norm": 4.3484649658203125, + "learning_rate": 1.9207730685188276e-05, + "loss": 0.5484, + "step": 17184 + }, + { + "epoch": 0.21483037075926897, + "grad_norm": 7.287136077880859, + "learning_rate": 1.920739021537779e-05, + "loss": 1.7144, + "step": 17186 + }, + { + "epoch": 0.2148553713842846, + "grad_norm": 0.018907567486166954, + "learning_rate": 1.920704967544512e-05, + "loss": 0.6831, + "step": 17188 + }, + { + "epoch": 0.21488037200930024, + "grad_norm": 3.633772611618042, + "learning_rate": 1.9206709065392863e-05, + "loss": 0.6483, + "step": 17190 + }, + { + "epoch": 0.21490537263431586, + "grad_norm": 2.2339580059051514, + "learning_rate": 1.920636838522361e-05, + "loss": 0.7408, + "step": 17192 + }, + { + "epoch": 0.21493037325933148, + "grad_norm": 4.327044486999512, + "learning_rate": 1.9206027634939962e-05, + "loss": 1.2356, + "step": 17194 + }, + { + "epoch": 0.2149553738843471, + "grad_norm": 0.14912283420562744, + "learning_rate": 1.9205686814544507e-05, + "loss": 1.0043, + "step": 17196 + }, + { + "epoch": 0.21498037450936272, + "grad_norm": 1.0035873651504517, + "learning_rate": 1.9205345924039843e-05, + "loss": 0.6493, + "step": 17198 + }, + { + "epoch": 0.21500537513437837, + "grad_norm": 5.7717742919921875, + "learning_rate": 1.9205004963428563e-05, + "loss": 1.2872, + "step": 17200 + }, + { + "epoch": 0.215030375759394, + "grad_norm": 0.0113902622833848, + "learning_rate": 1.920466393271327e-05, + "loss": 0.7356, + "step": 17202 + }, + { + "epoch": 0.2150553763844096, + "grad_norm": 2.727553367614746, + "learning_rate": 1.920432283189656e-05, + "loss": 0.5707, + "step": 17204 + }, + { + "epoch": 0.21508037700942523, + "grad_norm": 2.534243106842041, + "learning_rate": 1.9203981660981024e-05, + "loss": 0.6711, + "step": 17206 + }, + { + "epoch": 0.21510537763444085, + "grad_norm": 7.828842639923096, + "learning_rate": 1.920364041996927e-05, + "loss": 0.4399, + "step": 17208 + }, + { + "epoch": 0.2151303782594565, + "grad_norm": 3.261793375015259, + "learning_rate": 1.920329910886389e-05, + "loss": 1.34, + "step": 17210 + }, + { + "epoch": 0.21515537888447211, + "grad_norm": 3.0692684650421143, + "learning_rate": 1.9202957727667483e-05, + "loss": 0.7061, + "step": 17212 + }, + { + "epoch": 0.21518037950948773, + "grad_norm": 5.735293865203857, + "learning_rate": 1.9202616276382655e-05, + "loss": 0.9133, + "step": 17214 + }, + { + "epoch": 0.21520538013450335, + "grad_norm": 2.876894235610962, + "learning_rate": 1.9202274755012002e-05, + "loss": 0.7859, + "step": 17216 + }, + { + "epoch": 0.21523038075951897, + "grad_norm": 3.9259755611419678, + "learning_rate": 1.9201933163558125e-05, + "loss": 0.8152, + "step": 17218 + }, + { + "epoch": 0.21525538138453462, + "grad_norm": 0.15345868468284607, + "learning_rate": 1.9201591502023625e-05, + "loss": 0.0043, + "step": 17220 + }, + { + "epoch": 0.21528038200955024, + "grad_norm": 5.094593048095703, + "learning_rate": 1.920124977041111e-05, + "loss": 0.484, + "step": 17222 + }, + { + "epoch": 0.21530538263456586, + "grad_norm": 3.120901584625244, + "learning_rate": 1.9200907968723176e-05, + "loss": 1.1019, + "step": 17224 + }, + { + "epoch": 0.21533038325958148, + "grad_norm": 4.9380035400390625, + "learning_rate": 1.9200566096962427e-05, + "loss": 0.9971, + "step": 17226 + }, + { + "epoch": 0.2153553838845971, + "grad_norm": 0.023374995216727257, + "learning_rate": 1.9200224155131467e-05, + "loss": 0.3513, + "step": 17228 + }, + { + "epoch": 0.21538038450961275, + "grad_norm": 3.916816473007202, + "learning_rate": 1.91998821432329e-05, + "loss": 1.755, + "step": 17230 + }, + { + "epoch": 0.21540538513462837, + "grad_norm": 0.011462061665952206, + "learning_rate": 1.9199540061269336e-05, + "loss": 0.0134, + "step": 17232 + }, + { + "epoch": 0.215430385759644, + "grad_norm": 7.80267333984375, + "learning_rate": 1.919919790924337e-05, + "loss": 0.4249, + "step": 17234 + }, + { + "epoch": 0.2154553863846596, + "grad_norm": 3.885035514831543, + "learning_rate": 1.9198855687157617e-05, + "loss": 1.1168, + "step": 17236 + }, + { + "epoch": 0.21548038700967523, + "grad_norm": 3.3640389442443848, + "learning_rate": 1.919851339501468e-05, + "loss": 1.4764, + "step": 17238 + }, + { + "epoch": 0.21550538763469088, + "grad_norm": 10.69012451171875, + "learning_rate": 1.9198171032817166e-05, + "loss": 2.5723, + "step": 17240 + }, + { + "epoch": 0.2155303882597065, + "grad_norm": 2.7388803958892822, + "learning_rate": 1.9197828600567682e-05, + "loss": 1.2461, + "step": 17242 + }, + { + "epoch": 0.21555538888472212, + "grad_norm": 1.989577293395996, + "learning_rate": 1.9197486098268834e-05, + "loss": 0.5896, + "step": 17244 + }, + { + "epoch": 0.21558038950973774, + "grad_norm": 7.937366962432861, + "learning_rate": 1.9197143525923235e-05, + "loss": 0.7338, + "step": 17246 + }, + { + "epoch": 0.21560539013475336, + "grad_norm": 5.215911388397217, + "learning_rate": 1.919680088353349e-05, + "loss": 0.1332, + "step": 17248 + }, + { + "epoch": 0.215630390759769, + "grad_norm": 2.75817608833313, + "learning_rate": 1.919645817110221e-05, + "loss": 0.299, + "step": 17250 + }, + { + "epoch": 0.21565539138478462, + "grad_norm": 2.4977102279663086, + "learning_rate": 1.9196115388632006e-05, + "loss": 0.5906, + "step": 17252 + }, + { + "epoch": 0.21568039200980024, + "grad_norm": 4.0315260887146, + "learning_rate": 1.919577253612549e-05, + "loss": 0.8535, + "step": 17254 + }, + { + "epoch": 0.21570539263481586, + "grad_norm": 2.1188721656799316, + "learning_rate": 1.9195429613585265e-05, + "loss": 1.1339, + "step": 17256 + }, + { + "epoch": 0.21573039325983148, + "grad_norm": 5.775993824005127, + "learning_rate": 1.9195086621013952e-05, + "loss": 1.0993, + "step": 17258 + }, + { + "epoch": 0.21575539388484713, + "grad_norm": 2.7735471725463867, + "learning_rate": 1.919474355841416e-05, + "loss": 0.4603, + "step": 17260 + }, + { + "epoch": 0.21578039450986275, + "grad_norm": 6.363295555114746, + "learning_rate": 1.91944004257885e-05, + "loss": 1.9565, + "step": 17262 + }, + { + "epoch": 0.21580539513487837, + "grad_norm": 4.806553363800049, + "learning_rate": 1.9194057223139587e-05, + "loss": 1.1242, + "step": 17264 + }, + { + "epoch": 0.215830395759894, + "grad_norm": 1.635159969329834, + "learning_rate": 1.9193713950470034e-05, + "loss": 1.566, + "step": 17266 + }, + { + "epoch": 0.2158553963849096, + "grad_norm": 0.42688363790512085, + "learning_rate": 1.9193370607782457e-05, + "loss": 0.0408, + "step": 17268 + }, + { + "epoch": 0.21588039700992526, + "grad_norm": 0.010900202207267284, + "learning_rate": 1.9193027195079468e-05, + "loss": 0.1665, + "step": 17270 + }, + { + "epoch": 0.21590539763494088, + "grad_norm": 0.008113277144730091, + "learning_rate": 1.9192683712363684e-05, + "loss": 0.026, + "step": 17272 + }, + { + "epoch": 0.2159303982599565, + "grad_norm": 3.592935562133789, + "learning_rate": 1.919234015963772e-05, + "loss": 0.8027, + "step": 17274 + }, + { + "epoch": 0.21595539888497212, + "grad_norm": 2.5712380409240723, + "learning_rate": 1.9191996536904194e-05, + "loss": 0.5543, + "step": 17276 + }, + { + "epoch": 0.21598039950998774, + "grad_norm": 2.70892071723938, + "learning_rate": 1.9191652844165724e-05, + "loss": 2.4102, + "step": 17278 + }, + { + "epoch": 0.21600540013500338, + "grad_norm": 3.264681816101074, + "learning_rate": 1.9191309081424923e-05, + "loss": 0.4252, + "step": 17280 + }, + { + "epoch": 0.216030400760019, + "grad_norm": 4.589734077453613, + "learning_rate": 1.9190965248684415e-05, + "loss": 1.8707, + "step": 17282 + }, + { + "epoch": 0.21605540138503462, + "grad_norm": 3.516087770462036, + "learning_rate": 1.9190621345946815e-05, + "loss": 1.1291, + "step": 17284 + }, + { + "epoch": 0.21608040201005024, + "grad_norm": 3.711496591567993, + "learning_rate": 1.919027737321474e-05, + "loss": 1.1202, + "step": 17286 + }, + { + "epoch": 0.21610540263506586, + "grad_norm": 2.981694221496582, + "learning_rate": 1.9189933330490814e-05, + "loss": 0.5622, + "step": 17288 + }, + { + "epoch": 0.2161304032600815, + "grad_norm": 3.071410894393921, + "learning_rate": 1.9189589217777653e-05, + "loss": 0.6537, + "step": 17290 + }, + { + "epoch": 0.21615540388509713, + "grad_norm": 0.006153027061372995, + "learning_rate": 1.9189245035077883e-05, + "loss": 0.0003, + "step": 17292 + }, + { + "epoch": 0.21618040451011275, + "grad_norm": 2.0346128940582275, + "learning_rate": 1.918890078239412e-05, + "loss": 0.1225, + "step": 17294 + }, + { + "epoch": 0.21620540513512837, + "grad_norm": 4.777986526489258, + "learning_rate": 1.918855645972899e-05, + "loss": 0.229, + "step": 17296 + }, + { + "epoch": 0.216230405760144, + "grad_norm": 1.3521510362625122, + "learning_rate": 1.918821206708511e-05, + "loss": 0.3018, + "step": 17298 + }, + { + "epoch": 0.21625540638515964, + "grad_norm": 0.011276901699602604, + "learning_rate": 1.9187867604465113e-05, + "loss": 0.0005, + "step": 17300 + }, + { + "epoch": 0.21628040701017526, + "grad_norm": 2.723071575164795, + "learning_rate": 1.9187523071871608e-05, + "loss": 1.2524, + "step": 17302 + }, + { + "epoch": 0.21630540763519088, + "grad_norm": 0.009272895753383636, + "learning_rate": 1.9187178469307232e-05, + "loss": 0.2786, + "step": 17304 + }, + { + "epoch": 0.2163304082602065, + "grad_norm": 2.4858105182647705, + "learning_rate": 1.91868337967746e-05, + "loss": 1.2834, + "step": 17306 + }, + { + "epoch": 0.21635540888522212, + "grad_norm": 0.006020347587764263, + "learning_rate": 1.9186489054276343e-05, + "loss": 0.3025, + "step": 17308 + }, + { + "epoch": 0.21638040951023776, + "grad_norm": 4.5319318771362305, + "learning_rate": 1.9186144241815082e-05, + "loss": 1.632, + "step": 17310 + }, + { + "epoch": 0.21640541013525338, + "grad_norm": 3.884376049041748, + "learning_rate": 1.918579935939345e-05, + "loss": 0.8673, + "step": 17312 + }, + { + "epoch": 0.216430410760269, + "grad_norm": 2.2688536643981934, + "learning_rate": 1.9185454407014065e-05, + "loss": 1.0703, + "step": 17314 + }, + { + "epoch": 0.21645541138528462, + "grad_norm": 1.309323787689209, + "learning_rate": 1.918510938467956e-05, + "loss": 0.0786, + "step": 17316 + }, + { + "epoch": 0.21648041201030024, + "grad_norm": 4.506923198699951, + "learning_rate": 1.9184764292392563e-05, + "loss": 2.2078, + "step": 17318 + }, + { + "epoch": 0.2165054126353159, + "grad_norm": 3.9522006511688232, + "learning_rate": 1.9184419130155697e-05, + "loss": 0.7397, + "step": 17320 + }, + { + "epoch": 0.2165304132603315, + "grad_norm": 4.255096435546875, + "learning_rate": 1.9184073897971592e-05, + "loss": 0.4021, + "step": 17322 + }, + { + "epoch": 0.21655541388534713, + "grad_norm": 4.131780624389648, + "learning_rate": 1.918372859584288e-05, + "loss": 0.9507, + "step": 17324 + }, + { + "epoch": 0.21658041451036275, + "grad_norm": 2.9563891887664795, + "learning_rate": 1.9183383223772193e-05, + "loss": 0.3261, + "step": 17326 + }, + { + "epoch": 0.21660541513537837, + "grad_norm": 15.142723083496094, + "learning_rate": 1.9183037781762154e-05, + "loss": 1.7052, + "step": 17328 + }, + { + "epoch": 0.21663041576039402, + "grad_norm": 1.953074336051941, + "learning_rate": 1.9182692269815403e-05, + "loss": 0.9399, + "step": 17330 + }, + { + "epoch": 0.21665541638540964, + "grad_norm": 7.695346832275391, + "learning_rate": 1.9182346687934563e-05, + "loss": 1.587, + "step": 17332 + }, + { + "epoch": 0.21668041701042526, + "grad_norm": 2.8937199115753174, + "learning_rate": 1.918200103612227e-05, + "loss": 1.0762, + "step": 17334 + }, + { + "epoch": 0.21670541763544088, + "grad_norm": 4.041536808013916, + "learning_rate": 1.9181655314381154e-05, + "loss": 1.2465, + "step": 17336 + }, + { + "epoch": 0.2167304182604565, + "grad_norm": 2.530264139175415, + "learning_rate": 1.918130952271385e-05, + "loss": 0.847, + "step": 17338 + }, + { + "epoch": 0.21675541888547215, + "grad_norm": 3.7261569499969482, + "learning_rate": 1.9180963661122992e-05, + "loss": 0.7624, + "step": 17340 + }, + { + "epoch": 0.21678041951048777, + "grad_norm": 0.5516719222068787, + "learning_rate": 1.918061772961121e-05, + "loss": 0.3659, + "step": 17342 + }, + { + "epoch": 0.21680542013550338, + "grad_norm": 0.11724771559238434, + "learning_rate": 1.9180271728181147e-05, + "loss": 1.1348, + "step": 17344 + }, + { + "epoch": 0.216830420760519, + "grad_norm": 3.768160104751587, + "learning_rate": 1.9179925656835427e-05, + "loss": 1.2743, + "step": 17346 + }, + { + "epoch": 0.21685542138553462, + "grad_norm": 3.769277811050415, + "learning_rate": 1.9179579515576698e-05, + "loss": 1.7122, + "step": 17348 + }, + { + "epoch": 0.21688042201055027, + "grad_norm": 3.166666269302368, + "learning_rate": 1.9179233304407583e-05, + "loss": 0.6596, + "step": 17350 + }, + { + "epoch": 0.2169054226355659, + "grad_norm": 3.396899461746216, + "learning_rate": 1.9178887023330728e-05, + "loss": 1.5299, + "step": 17352 + }, + { + "epoch": 0.2169304232605815, + "grad_norm": 2.4698898792266846, + "learning_rate": 1.9178540672348765e-05, + "loss": 0.139, + "step": 17354 + }, + { + "epoch": 0.21695542388559713, + "grad_norm": 3.997499704360962, + "learning_rate": 1.917819425146434e-05, + "loss": 0.9617, + "step": 17356 + }, + { + "epoch": 0.21698042451061275, + "grad_norm": 3.4954748153686523, + "learning_rate": 1.917784776068008e-05, + "loss": 1.0424, + "step": 17358 + }, + { + "epoch": 0.2170054251356284, + "grad_norm": 3.898754358291626, + "learning_rate": 1.917750119999863e-05, + "loss": 1.1615, + "step": 17360 + }, + { + "epoch": 0.21703042576064402, + "grad_norm": 3.278442859649658, + "learning_rate": 1.9177154569422633e-05, + "loss": 0.7674, + "step": 17362 + }, + { + "epoch": 0.21705542638565964, + "grad_norm": 0.446670264005661, + "learning_rate": 1.9176807868954717e-05, + "loss": 0.4625, + "step": 17364 + }, + { + "epoch": 0.21708042701067526, + "grad_norm": 3.0764575004577637, + "learning_rate": 1.9176461098597537e-05, + "loss": 0.5316, + "step": 17366 + }, + { + "epoch": 0.21710542763569088, + "grad_norm": 13.702893257141113, + "learning_rate": 1.9176114258353725e-05, + "loss": 2.4239, + "step": 17368 + }, + { + "epoch": 0.21713042826070653, + "grad_norm": 6.900668144226074, + "learning_rate": 1.9175767348225924e-05, + "loss": 0.5381, + "step": 17370 + }, + { + "epoch": 0.21715542888572215, + "grad_norm": 5.196295261383057, + "learning_rate": 1.9175420368216777e-05, + "loss": 1.3392, + "step": 17372 + }, + { + "epoch": 0.21718042951073777, + "grad_norm": 3.4829330444335938, + "learning_rate": 1.9175073318328925e-05, + "loss": 0.8572, + "step": 17374 + }, + { + "epoch": 0.21720543013575339, + "grad_norm": 4.386665344238281, + "learning_rate": 1.9174726198565013e-05, + "loss": 2.7363, + "step": 17376 + }, + { + "epoch": 0.217230430760769, + "grad_norm": 4.61818265914917, + "learning_rate": 1.9174379008927683e-05, + "loss": 1.0538, + "step": 17378 + }, + { + "epoch": 0.21725543138578465, + "grad_norm": 5.951943397521973, + "learning_rate": 1.917403174941958e-05, + "loss": 1.0999, + "step": 17380 + }, + { + "epoch": 0.21728043201080027, + "grad_norm": 5.735032081604004, + "learning_rate": 1.917368442004335e-05, + "loss": 1.3622, + "step": 17382 + }, + { + "epoch": 0.2173054326358159, + "grad_norm": 4.8477067947387695, + "learning_rate": 1.917333702080163e-05, + "loss": 1.2841, + "step": 17384 + }, + { + "epoch": 0.2173304332608315, + "grad_norm": 0.007128749042749405, + "learning_rate": 1.9172989551697083e-05, + "loss": 0.0147, + "step": 17386 + }, + { + "epoch": 0.21735543388584713, + "grad_norm": 3.9244847297668457, + "learning_rate": 1.9172642012732337e-05, + "loss": 2.0824, + "step": 17388 + }, + { + "epoch": 0.21738043451086278, + "grad_norm": 4.174149513244629, + "learning_rate": 1.917229440391005e-05, + "loss": 1.6743, + "step": 17390 + }, + { + "epoch": 0.2174054351358784, + "grad_norm": 4.189001083374023, + "learning_rate": 1.9171946725232863e-05, + "loss": 1.8098, + "step": 17392 + }, + { + "epoch": 0.21743043576089402, + "grad_norm": 3.5813546180725098, + "learning_rate": 1.9171598976703428e-05, + "loss": 0.5509, + "step": 17394 + }, + { + "epoch": 0.21745543638590964, + "grad_norm": 3.040806531906128, + "learning_rate": 1.9171251158324393e-05, + "loss": 0.7995, + "step": 17396 + }, + { + "epoch": 0.21748043701092526, + "grad_norm": 2.5789053440093994, + "learning_rate": 1.9170903270098408e-05, + "loss": 0.9386, + "step": 17398 + }, + { + "epoch": 0.2175054376359409, + "grad_norm": 1.9191886186599731, + "learning_rate": 1.9170555312028116e-05, + "loss": 1.0314, + "step": 17400 + }, + { + "epoch": 0.21753043826095653, + "grad_norm": 3.4614315032958984, + "learning_rate": 1.9170207284116176e-05, + "loss": 0.6944, + "step": 17402 + }, + { + "epoch": 0.21755543888597215, + "grad_norm": 2.1524016857147217, + "learning_rate": 1.916985918636523e-05, + "loss": 0.941, + "step": 17404 + }, + { + "epoch": 0.21758043951098777, + "grad_norm": 2.963338851928711, + "learning_rate": 1.9169511018777938e-05, + "loss": 1.3478, + "step": 17406 + }, + { + "epoch": 0.2176054401360034, + "grad_norm": 1.3552519083023071, + "learning_rate": 1.9169162781356942e-05, + "loss": 0.1419, + "step": 17408 + }, + { + "epoch": 0.21763044076101903, + "grad_norm": 2.682471513748169, + "learning_rate": 1.9168814474104903e-05, + "loss": 1.0135, + "step": 17410 + }, + { + "epoch": 0.21765544138603465, + "grad_norm": 0.6838375926017761, + "learning_rate": 1.9168466097024468e-05, + "loss": 0.1693, + "step": 17412 + }, + { + "epoch": 0.21768044201105027, + "grad_norm": 0.011714871041476727, + "learning_rate": 1.9168117650118292e-05, + "loss": 0.0005, + "step": 17414 + }, + { + "epoch": 0.2177054426360659, + "grad_norm": 3.4318599700927734, + "learning_rate": 1.9167769133389026e-05, + "loss": 1.0304, + "step": 17416 + }, + { + "epoch": 0.2177304432610815, + "grad_norm": 2.812380075454712, + "learning_rate": 1.9167420546839328e-05, + "loss": 0.5526, + "step": 17418 + }, + { + "epoch": 0.21775544388609716, + "grad_norm": 7.69746208190918, + "learning_rate": 1.9167071890471852e-05, + "loss": 1.1263, + "step": 17420 + }, + { + "epoch": 0.21778044451111278, + "grad_norm": 3.304595708847046, + "learning_rate": 1.9166723164289254e-05, + "loss": 0.8948, + "step": 17422 + }, + { + "epoch": 0.2178054451361284, + "grad_norm": 4.156594276428223, + "learning_rate": 1.9166374368294187e-05, + "loss": 1.4165, + "step": 17424 + }, + { + "epoch": 0.21783044576114402, + "grad_norm": 0.006718644872307777, + "learning_rate": 1.916602550248931e-05, + "loss": 0.6091, + "step": 17426 + }, + { + "epoch": 0.21785544638615964, + "grad_norm": 2.914837598800659, + "learning_rate": 1.9165676566877277e-05, + "loss": 0.9994, + "step": 17428 + }, + { + "epoch": 0.2178804470111753, + "grad_norm": 1.6745545864105225, + "learning_rate": 1.916532756146075e-05, + "loss": 0.4429, + "step": 17430 + }, + { + "epoch": 0.2179054476361909, + "grad_norm": 4.505791187286377, + "learning_rate": 1.9164978486242384e-05, + "loss": 1.4178, + "step": 17432 + }, + { + "epoch": 0.21793044826120653, + "grad_norm": 3.503223180770874, + "learning_rate": 1.9164629341224836e-05, + "loss": 2.0013, + "step": 17434 + }, + { + "epoch": 0.21795544888622215, + "grad_norm": 0.006668992806226015, + "learning_rate": 1.916428012641077e-05, + "loss": 0.1071, + "step": 17436 + }, + { + "epoch": 0.21798044951123777, + "grad_norm": 0.7493406534194946, + "learning_rate": 1.916393084180284e-05, + "loss": 0.5696, + "step": 17438 + }, + { + "epoch": 0.21800545013625341, + "grad_norm": 1.6559957265853882, + "learning_rate": 1.916358148740371e-05, + "loss": 0.601, + "step": 17440 + }, + { + "epoch": 0.21803045076126903, + "grad_norm": 0.008466952480375767, + "learning_rate": 1.9163232063216038e-05, + "loss": 0.6718, + "step": 17442 + }, + { + "epoch": 0.21805545138628465, + "grad_norm": 3.1424410343170166, + "learning_rate": 1.9162882569242488e-05, + "loss": 0.8245, + "step": 17444 + }, + { + "epoch": 0.21808045201130027, + "grad_norm": 0.90096116065979, + "learning_rate": 1.9162533005485716e-05, + "loss": 0.5557, + "step": 17446 + }, + { + "epoch": 0.2181054526363159, + "grad_norm": 4.209840774536133, + "learning_rate": 1.9162183371948394e-05, + "loss": 0.5036, + "step": 17448 + }, + { + "epoch": 0.21813045326133154, + "grad_norm": 2.127868175506592, + "learning_rate": 1.9161833668633176e-05, + "loss": 1.356, + "step": 17450 + }, + { + "epoch": 0.21815545388634716, + "grad_norm": 4.169093132019043, + "learning_rate": 1.9161483895542727e-05, + "loss": 1.7859, + "step": 17452 + }, + { + "epoch": 0.21818045451136278, + "grad_norm": 11.858972549438477, + "learning_rate": 1.9161134052679715e-05, + "loss": 1.2707, + "step": 17454 + }, + { + "epoch": 0.2182054551363784, + "grad_norm": 4.415783405303955, + "learning_rate": 1.9160784140046802e-05, + "loss": 0.9978, + "step": 17456 + }, + { + "epoch": 0.21823045576139402, + "grad_norm": 0.005716489162296057, + "learning_rate": 1.9160434157646652e-05, + "loss": 0.7713, + "step": 17458 + }, + { + "epoch": 0.21825545638640967, + "grad_norm": 4.68405818939209, + "learning_rate": 1.9160084105481927e-05, + "loss": 1.8733, + "step": 17460 + }, + { + "epoch": 0.2182804570114253, + "grad_norm": 0.00904938019812107, + "learning_rate": 1.91597339835553e-05, + "loss": 0.2711, + "step": 17462 + }, + { + "epoch": 0.2183054576364409, + "grad_norm": 3.6879758834838867, + "learning_rate": 1.9159383791869434e-05, + "loss": 0.9552, + "step": 17464 + }, + { + "epoch": 0.21833045826145653, + "grad_norm": 0.011286946944892406, + "learning_rate": 1.9159033530426998e-05, + "loss": 0.3636, + "step": 17466 + }, + { + "epoch": 0.21835545888647215, + "grad_norm": 3.4287004470825195, + "learning_rate": 1.9158683199230654e-05, + "loss": 1.3316, + "step": 17468 + }, + { + "epoch": 0.2183804595114878, + "grad_norm": 4.687029838562012, + "learning_rate": 1.9158332798283077e-05, + "loss": 1.2035, + "step": 17470 + }, + { + "epoch": 0.21840546013650342, + "grad_norm": 0.01642656698822975, + "learning_rate": 1.915798232758693e-05, + "loss": 0.0148, + "step": 17472 + }, + { + "epoch": 0.21843046076151904, + "grad_norm": 3.4499757289886475, + "learning_rate": 1.9157631787144888e-05, + "loss": 0.5511, + "step": 17474 + }, + { + "epoch": 0.21845546138653466, + "grad_norm": 2.6701104640960693, + "learning_rate": 1.9157281176959614e-05, + "loss": 1.0697, + "step": 17476 + }, + { + "epoch": 0.21848046201155027, + "grad_norm": 5.2216362953186035, + "learning_rate": 1.9156930497033785e-05, + "loss": 0.2576, + "step": 17478 + }, + { + "epoch": 0.21850546263656592, + "grad_norm": 5.499237537384033, + "learning_rate": 1.9156579747370066e-05, + "loss": 0.4886, + "step": 17480 + }, + { + "epoch": 0.21853046326158154, + "grad_norm": 4.524376392364502, + "learning_rate": 1.915622892797113e-05, + "loss": 0.6842, + "step": 17482 + }, + { + "epoch": 0.21855546388659716, + "grad_norm": 1.1753363609313965, + "learning_rate": 1.915587803883965e-05, + "loss": 0.0398, + "step": 17484 + }, + { + "epoch": 0.21858046451161278, + "grad_norm": 0.006395441945642233, + "learning_rate": 1.9155527079978298e-05, + "loss": 0.0002, + "step": 17486 + }, + { + "epoch": 0.2186054651366284, + "grad_norm": 0.011755523271858692, + "learning_rate": 1.9155176051389747e-05, + "loss": 0.8621, + "step": 17488 + }, + { + "epoch": 0.21863046576164405, + "grad_norm": 0.011592499911785126, + "learning_rate": 1.915482495307667e-05, + "loss": 1.0606, + "step": 17490 + }, + { + "epoch": 0.21865546638665967, + "grad_norm": 3.7910871505737305, + "learning_rate": 1.9154473785041737e-05, + "loss": 1.1823, + "step": 17492 + }, + { + "epoch": 0.2186804670116753, + "grad_norm": 2.8043570518493652, + "learning_rate": 1.915412254728763e-05, + "loss": 0.9931, + "step": 17494 + }, + { + "epoch": 0.2187054676366909, + "grad_norm": 5.045172214508057, + "learning_rate": 1.915377123981702e-05, + "loss": 1.0687, + "step": 17496 + }, + { + "epoch": 0.21873046826170653, + "grad_norm": 3.5088117122650146, + "learning_rate": 1.915341986263258e-05, + "loss": 1.1584, + "step": 17498 + }, + { + "epoch": 0.21875546888672218, + "grad_norm": 4.576797962188721, + "learning_rate": 1.915306841573699e-05, + "loss": 1.1024, + "step": 17500 + }, + { + "epoch": 0.2187804695117378, + "grad_norm": 2.8407773971557617, + "learning_rate": 1.9152716899132925e-05, + "loss": 1.5013, + "step": 17502 + }, + { + "epoch": 0.21880547013675342, + "grad_norm": 0.007400046102702618, + "learning_rate": 1.9152365312823065e-05, + "loss": 0.4831, + "step": 17504 + }, + { + "epoch": 0.21883047076176904, + "grad_norm": 0.09491265565156937, + "learning_rate": 1.9152013656810084e-05, + "loss": 0.419, + "step": 17506 + }, + { + "epoch": 0.21885547138678468, + "grad_norm": 4.115873336791992, + "learning_rate": 1.915166193109666e-05, + "loss": 1.4458, + "step": 17508 + }, + { + "epoch": 0.2188804720118003, + "grad_norm": 16.51447105407715, + "learning_rate": 1.9151310135685474e-05, + "loss": 0.8961, + "step": 17510 + }, + { + "epoch": 0.21890547263681592, + "grad_norm": 2.772977352142334, + "learning_rate": 1.91509582705792e-05, + "loss": 0.3417, + "step": 17512 + }, + { + "epoch": 0.21893047326183154, + "grad_norm": 0.2405204325914383, + "learning_rate": 1.915060633578053e-05, + "loss": 0.0121, + "step": 17514 + }, + { + "epoch": 0.21895547388684716, + "grad_norm": 5.515206336975098, + "learning_rate": 1.9150254331292132e-05, + "loss": 1.856, + "step": 17516 + }, + { + "epoch": 0.2189804745118628, + "grad_norm": 1.2171789407730103, + "learning_rate": 1.914990225711669e-05, + "loss": 0.0839, + "step": 17518 + }, + { + "epoch": 0.21900547513687843, + "grad_norm": 3.9752891063690186, + "learning_rate": 1.9149550113256884e-05, + "loss": 0.9514, + "step": 17520 + }, + { + "epoch": 0.21903047576189405, + "grad_norm": 3.563128709793091, + "learning_rate": 1.9149197899715403e-05, + "loss": 1.4408, + "step": 17522 + }, + { + "epoch": 0.21905547638690967, + "grad_norm": 3.4914004802703857, + "learning_rate": 1.914884561649492e-05, + "loss": 0.7524, + "step": 17524 + }, + { + "epoch": 0.2190804770119253, + "grad_norm": 1.6347233057022095, + "learning_rate": 1.914849326359813e-05, + "loss": 1.0286, + "step": 17526 + }, + { + "epoch": 0.21910547763694094, + "grad_norm": 3.0910961627960205, + "learning_rate": 1.91481408410277e-05, + "loss": 0.8717, + "step": 17528 + }, + { + "epoch": 0.21913047826195656, + "grad_norm": 3.7879586219787598, + "learning_rate": 1.9147788348786328e-05, + "loss": 0.7866, + "step": 17530 + }, + { + "epoch": 0.21915547888697218, + "grad_norm": 1.7018320560455322, + "learning_rate": 1.9147435786876692e-05, + "loss": 1.0864, + "step": 17532 + }, + { + "epoch": 0.2191804795119878, + "grad_norm": 4.720847129821777, + "learning_rate": 1.914708315530148e-05, + "loss": 0.658, + "step": 17534 + }, + { + "epoch": 0.21920548013700342, + "grad_norm": 3.0895726680755615, + "learning_rate": 1.9146730454063375e-05, + "loss": 1.9128, + "step": 17536 + }, + { + "epoch": 0.21923048076201906, + "grad_norm": 5.994684219360352, + "learning_rate": 1.9146377683165066e-05, + "loss": 2.1298, + "step": 17538 + }, + { + "epoch": 0.21925548138703468, + "grad_norm": 3.111171245574951, + "learning_rate": 1.9146024842609238e-05, + "loss": 1.3401, + "step": 17540 + }, + { + "epoch": 0.2192804820120503, + "grad_norm": 2.6887998580932617, + "learning_rate": 1.9145671932398577e-05, + "loss": 1.7075, + "step": 17542 + }, + { + "epoch": 0.21930548263706592, + "grad_norm": 2.379978895187378, + "learning_rate": 1.9145318952535773e-05, + "loss": 0.781, + "step": 17544 + }, + { + "epoch": 0.21933048326208154, + "grad_norm": 6.273845672607422, + "learning_rate": 1.914496590302351e-05, + "loss": 1.0962, + "step": 17546 + }, + { + "epoch": 0.2193554838870972, + "grad_norm": 1.8750231266021729, + "learning_rate": 1.9144612783864485e-05, + "loss": 1.0371, + "step": 17548 + }, + { + "epoch": 0.2193804845121128, + "grad_norm": 2.5580146312713623, + "learning_rate": 1.9144259595061378e-05, + "loss": 0.7807, + "step": 17550 + }, + { + "epoch": 0.21940548513712843, + "grad_norm": 2.6272151470184326, + "learning_rate": 1.9143906336616885e-05, + "loss": 0.6335, + "step": 17552 + }, + { + "epoch": 0.21943048576214405, + "grad_norm": 2.995384454727173, + "learning_rate": 1.914355300853369e-05, + "loss": 1.3721, + "step": 17554 + }, + { + "epoch": 0.21945548638715967, + "grad_norm": 5.246273994445801, + "learning_rate": 1.9143199610814497e-05, + "loss": 2.3136, + "step": 17556 + }, + { + "epoch": 0.21948048701217532, + "grad_norm": 4.4201979637146, + "learning_rate": 1.9142846143461982e-05, + "loss": 0.4818, + "step": 17558 + }, + { + "epoch": 0.21950548763719094, + "grad_norm": 3.8435802459716797, + "learning_rate": 1.9142492606478843e-05, + "loss": 1.1726, + "step": 17560 + }, + { + "epoch": 0.21953048826220656, + "grad_norm": 0.6668071746826172, + "learning_rate": 1.9142138999867776e-05, + "loss": 0.1813, + "step": 17562 + }, + { + "epoch": 0.21955548888722218, + "grad_norm": 4.202677249908447, + "learning_rate": 1.914178532363147e-05, + "loss": 1.4897, + "step": 17564 + }, + { + "epoch": 0.2195804895122378, + "grad_norm": 3.660646915435791, + "learning_rate": 1.9141431577772623e-05, + "loss": 1.3625, + "step": 17566 + }, + { + "epoch": 0.21960549013725345, + "grad_norm": 0.42740148305892944, + "learning_rate": 1.9141077762293924e-05, + "loss": 0.0966, + "step": 17568 + }, + { + "epoch": 0.21963049076226906, + "grad_norm": 2.529759407043457, + "learning_rate": 1.9140723877198065e-05, + "loss": 0.8987, + "step": 17570 + }, + { + "epoch": 0.21965549138728468, + "grad_norm": 2.390505075454712, + "learning_rate": 1.9140369922487753e-05, + "loss": 0.2421, + "step": 17572 + }, + { + "epoch": 0.2196804920123003, + "grad_norm": 10.22183609008789, + "learning_rate": 1.914001589816567e-05, + "loss": 1.0303, + "step": 17574 + }, + { + "epoch": 0.21970549263731592, + "grad_norm": 1.070652961730957, + "learning_rate": 1.9139661804234523e-05, + "loss": 0.07, + "step": 17576 + }, + { + "epoch": 0.21973049326233157, + "grad_norm": 0.16186441481113434, + "learning_rate": 1.9139307640697003e-05, + "loss": 0.84, + "step": 17578 + }, + { + "epoch": 0.2197554938873472, + "grad_norm": 2.815274953842163, + "learning_rate": 1.9138953407555804e-05, + "loss": 0.2114, + "step": 17580 + }, + { + "epoch": 0.2197804945123628, + "grad_norm": 0.17940451204776764, + "learning_rate": 1.9138599104813633e-05, + "loss": 0.358, + "step": 17582 + }, + { + "epoch": 0.21980549513737843, + "grad_norm": 4.567174434661865, + "learning_rate": 1.9138244732473184e-05, + "loss": 1.2832, + "step": 17584 + }, + { + "epoch": 0.21983049576239405, + "grad_norm": 2.994424819946289, + "learning_rate": 1.913789029053715e-05, + "loss": 1.5852, + "step": 17586 + }, + { + "epoch": 0.2198554963874097, + "grad_norm": 0.9301842451095581, + "learning_rate": 1.913753577900824e-05, + "loss": 0.3075, + "step": 17588 + }, + { + "epoch": 0.21988049701242532, + "grad_norm": 5.352240562438965, + "learning_rate": 1.913718119788915e-05, + "loss": 1.5224, + "step": 17590 + }, + { + "epoch": 0.21990549763744094, + "grad_norm": 1.7670507431030273, + "learning_rate": 1.913682654718258e-05, + "loss": 1.4538, + "step": 17592 + }, + { + "epoch": 0.21993049826245656, + "grad_norm": 5.823740482330322, + "learning_rate": 1.9136471826891232e-05, + "loss": 0.5784, + "step": 17594 + }, + { + "epoch": 0.21995549888747218, + "grad_norm": 4.093822479248047, + "learning_rate": 1.9136117037017804e-05, + "loss": 1.6986, + "step": 17596 + }, + { + "epoch": 0.21998049951248783, + "grad_norm": 3.3482096195220947, + "learning_rate": 1.9135762177565e-05, + "loss": 0.3332, + "step": 17598 + }, + { + "epoch": 0.22000550013750345, + "grad_norm": 4.612293243408203, + "learning_rate": 1.9135407248535525e-05, + "loss": 0.6894, + "step": 17600 + }, + { + "epoch": 0.22003050076251907, + "grad_norm": 0.10422469675540924, + "learning_rate": 1.9135052249932082e-05, + "loss": 0.029, + "step": 17602 + }, + { + "epoch": 0.22005550138753469, + "grad_norm": 3.5217037200927734, + "learning_rate": 1.913469718175737e-05, + "loss": 0.7726, + "step": 17604 + }, + { + "epoch": 0.2200805020125503, + "grad_norm": 4.280953407287598, + "learning_rate": 1.9134342044014098e-05, + "loss": 1.2552, + "step": 17606 + }, + { + "epoch": 0.22010550263756595, + "grad_norm": 1.9508293867111206, + "learning_rate": 1.9133986836704965e-05, + "loss": 1.1928, + "step": 17608 + }, + { + "epoch": 0.22013050326258157, + "grad_norm": 3.1513688564300537, + "learning_rate": 1.913363155983268e-05, + "loss": 0.1966, + "step": 17610 + }, + { + "epoch": 0.2201555038875972, + "grad_norm": 1.2978614568710327, + "learning_rate": 1.9133276213399952e-05, + "loss": 0.6327, + "step": 17612 + }, + { + "epoch": 0.2201805045126128, + "grad_norm": 3.9911837577819824, + "learning_rate": 1.9132920797409484e-05, + "loss": 1.6295, + "step": 17614 + }, + { + "epoch": 0.22020550513762843, + "grad_norm": 3.084383726119995, + "learning_rate": 1.9132565311863983e-05, + "loss": 0.5744, + "step": 17616 + }, + { + "epoch": 0.22023050576264408, + "grad_norm": 2.139986991882324, + "learning_rate": 1.9132209756766155e-05, + "loss": 0.1312, + "step": 17618 + }, + { + "epoch": 0.2202555063876597, + "grad_norm": 4.113583087921143, + "learning_rate": 1.913185413211871e-05, + "loss": 0.6961, + "step": 17620 + }, + { + "epoch": 0.22028050701267532, + "grad_norm": 2.7439873218536377, + "learning_rate": 1.9131498437924354e-05, + "loss": 1.2121, + "step": 17622 + }, + { + "epoch": 0.22030550763769094, + "grad_norm": 7.386373519897461, + "learning_rate": 1.9131142674185797e-05, + "loss": 1.0538, + "step": 17624 + }, + { + "epoch": 0.22033050826270656, + "grad_norm": 5.141300201416016, + "learning_rate": 1.9130786840905746e-05, + "loss": 0.9576, + "step": 17626 + }, + { + "epoch": 0.2203555088877222, + "grad_norm": 0.1045873761177063, + "learning_rate": 1.9130430938086917e-05, + "loss": 0.2266, + "step": 17628 + }, + { + "epoch": 0.22038050951273783, + "grad_norm": 2.096418619155884, + "learning_rate": 1.913007496573202e-05, + "loss": 1.1597, + "step": 17630 + }, + { + "epoch": 0.22040551013775345, + "grad_norm": 3.1653778553009033, + "learning_rate": 1.9129718923843755e-05, + "loss": 0.8924, + "step": 17632 + }, + { + "epoch": 0.22043051076276907, + "grad_norm": 0.6753683686256409, + "learning_rate": 1.9129362812424848e-05, + "loss": 0.0336, + "step": 17634 + }, + { + "epoch": 0.2204555113877847, + "grad_norm": 3.034970283508301, + "learning_rate": 1.9129006631478003e-05, + "loss": 0.9906, + "step": 17636 + }, + { + "epoch": 0.22048051201280033, + "grad_norm": 2.053208827972412, + "learning_rate": 1.9128650381005935e-05, + "loss": 1.4789, + "step": 17638 + }, + { + "epoch": 0.22050551263781595, + "grad_norm": 5.186792373657227, + "learning_rate": 1.9128294061011358e-05, + "loss": 1.577, + "step": 17640 + }, + { + "epoch": 0.22053051326283157, + "grad_norm": 0.30876609683036804, + "learning_rate": 1.9127937671496985e-05, + "loss": 0.1468, + "step": 17642 + }, + { + "epoch": 0.2205555138878472, + "grad_norm": 4.057744026184082, + "learning_rate": 1.9127581212465524e-05, + "loss": 1.2023, + "step": 17644 + }, + { + "epoch": 0.2205805145128628, + "grad_norm": 2.8417389392852783, + "learning_rate": 1.91272246839197e-05, + "loss": 1.3288, + "step": 17646 + }, + { + "epoch": 0.22060551513787846, + "grad_norm": 0.014323676936328411, + "learning_rate": 1.912686808586222e-05, + "loss": 0.4276, + "step": 17648 + }, + { + "epoch": 0.22063051576289408, + "grad_norm": 4.150639057159424, + "learning_rate": 1.9126511418295807e-05, + "loss": 2.0162, + "step": 17650 + }, + { + "epoch": 0.2206555163879097, + "grad_norm": 2.3747758865356445, + "learning_rate": 1.912615468122317e-05, + "loss": 0.9525, + "step": 17652 + }, + { + "epoch": 0.22068051701292532, + "grad_norm": 0.2636207938194275, + "learning_rate": 1.9125797874647032e-05, + "loss": 0.2577, + "step": 17654 + }, + { + "epoch": 0.22070551763794094, + "grad_norm": 3.8498828411102295, + "learning_rate": 1.9125440998570107e-05, + "loss": 0.383, + "step": 17656 + }, + { + "epoch": 0.2207305182629566, + "grad_norm": 0.19642066955566406, + "learning_rate": 1.912508405299511e-05, + "loss": 0.5314, + "step": 17658 + }, + { + "epoch": 0.2207555188879722, + "grad_norm": 0.28905725479125977, + "learning_rate": 1.912472703792477e-05, + "loss": 0.6682, + "step": 17660 + }, + { + "epoch": 0.22078051951298783, + "grad_norm": 1.406488060951233, + "learning_rate": 1.9124369953361797e-05, + "loss": 0.3302, + "step": 17662 + }, + { + "epoch": 0.22080552013800345, + "grad_norm": 3.0073182582855225, + "learning_rate": 1.9124012799308917e-05, + "loss": 0.5078, + "step": 17664 + }, + { + "epoch": 0.22083052076301907, + "grad_norm": 0.20678487420082092, + "learning_rate": 1.9123655575768838e-05, + "loss": 0.3918, + "step": 17666 + }, + { + "epoch": 0.22085552138803471, + "grad_norm": 0.2839130461215973, + "learning_rate": 1.9123298282744292e-05, + "loss": 0.4986, + "step": 17668 + }, + { + "epoch": 0.22088052201305033, + "grad_norm": 0.20281171798706055, + "learning_rate": 1.9122940920237996e-05, + "loss": 0.0074, + "step": 17670 + }, + { + "epoch": 0.22090552263806595, + "grad_norm": 4.922669410705566, + "learning_rate": 1.912258348825267e-05, + "loss": 1.2035, + "step": 17672 + }, + { + "epoch": 0.22093052326308157, + "grad_norm": 8.05523681640625, + "learning_rate": 1.9122225986791042e-05, + "loss": 1.6573, + "step": 17674 + }, + { + "epoch": 0.2209555238880972, + "grad_norm": 0.06362928450107574, + "learning_rate": 1.912186841585583e-05, + "loss": 0.705, + "step": 17676 + }, + { + "epoch": 0.22098052451311284, + "grad_norm": 4.835093021392822, + "learning_rate": 1.9121510775449756e-05, + "loss": 1.0841, + "step": 17678 + }, + { + "epoch": 0.22100552513812846, + "grad_norm": 4.8590826988220215, + "learning_rate": 1.9121153065575546e-05, + "loss": 2.6577, + "step": 17680 + }, + { + "epoch": 0.22103052576314408, + "grad_norm": 4.055215835571289, + "learning_rate": 1.9120795286235924e-05, + "loss": 0.5843, + "step": 17682 + }, + { + "epoch": 0.2210555263881597, + "grad_norm": 4.679352283477783, + "learning_rate": 1.912043743743362e-05, + "loss": 2.7457, + "step": 17684 + }, + { + "epoch": 0.22108052701317532, + "grad_norm": 2.6400411128997803, + "learning_rate": 1.9120079519171345e-05, + "loss": 0.5419, + "step": 17686 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 1.6107896566390991, + "learning_rate": 1.9119721531451842e-05, + "loss": 1.3438, + "step": 17688 + }, + { + "epoch": 0.2211305282632066, + "grad_norm": 0.5887475609779358, + "learning_rate": 1.9119363474277824e-05, + "loss": 0.5267, + "step": 17690 + }, + { + "epoch": 0.2211555288882222, + "grad_norm": 4.0625200271606445, + "learning_rate": 1.911900534765203e-05, + "loss": 1.1391, + "step": 17692 + }, + { + "epoch": 0.22118052951323783, + "grad_norm": 3.2205052375793457, + "learning_rate": 1.9118647151577175e-05, + "loss": 1.1072, + "step": 17694 + }, + { + "epoch": 0.22120553013825345, + "grad_norm": 3.0675365924835205, + "learning_rate": 1.9118288886055996e-05, + "loss": 0.9568, + "step": 17696 + }, + { + "epoch": 0.2212305307632691, + "grad_norm": 2.3951032161712646, + "learning_rate": 1.9117930551091217e-05, + "loss": 0.4203, + "step": 17698 + }, + { + "epoch": 0.22125553138828472, + "grad_norm": 7.205334186553955, + "learning_rate": 1.9117572146685566e-05, + "loss": 0.6778, + "step": 17700 + }, + { + "epoch": 0.22128053201330034, + "grad_norm": 0.5960238575935364, + "learning_rate": 1.911721367284178e-05, + "loss": 0.6632, + "step": 17702 + }, + { + "epoch": 0.22130553263831595, + "grad_norm": 1.1818424463272095, + "learning_rate": 1.911685512956258e-05, + "loss": 0.0996, + "step": 17704 + }, + { + "epoch": 0.22133053326333157, + "grad_norm": 7.041118144989014, + "learning_rate": 1.9116496516850703e-05, + "loss": 0.8646, + "step": 17706 + }, + { + "epoch": 0.22135553388834722, + "grad_norm": 2.3225791454315186, + "learning_rate": 1.9116137834708875e-05, + "loss": 1.4929, + "step": 17708 + }, + { + "epoch": 0.22138053451336284, + "grad_norm": 2.4701690673828125, + "learning_rate": 1.911577908313983e-05, + "loss": 0.6709, + "step": 17710 + }, + { + "epoch": 0.22140553513837846, + "grad_norm": 3.378779888153076, + "learning_rate": 1.9115420262146304e-05, + "loss": 0.5958, + "step": 17712 + }, + { + "epoch": 0.22143053576339408, + "grad_norm": 4.700928211212158, + "learning_rate": 1.9115061371731027e-05, + "loss": 1.9129, + "step": 17714 + }, + { + "epoch": 0.2214555363884097, + "grad_norm": 5.071202754974365, + "learning_rate": 1.9114702411896727e-05, + "loss": 1.2772, + "step": 17716 + }, + { + "epoch": 0.22148053701342535, + "grad_norm": 3.8512704372406006, + "learning_rate": 1.9114343382646147e-05, + "loss": 1.9253, + "step": 17718 + }, + { + "epoch": 0.22150553763844097, + "grad_norm": 2.7549455165863037, + "learning_rate": 1.9113984283982012e-05, + "loss": 0.4895, + "step": 17720 + }, + { + "epoch": 0.2215305382634566, + "grad_norm": 0.9948642253875732, + "learning_rate": 1.9113625115907065e-05, + "loss": 0.2628, + "step": 17722 + }, + { + "epoch": 0.2215555388884722, + "grad_norm": 6.249518871307373, + "learning_rate": 1.9113265878424035e-05, + "loss": 0.5784, + "step": 17724 + }, + { + "epoch": 0.22158053951348783, + "grad_norm": 0.37957215309143066, + "learning_rate": 1.9112906571535662e-05, + "loss": 0.8054, + "step": 17726 + }, + { + "epoch": 0.22160554013850348, + "grad_norm": 3.9607229232788086, + "learning_rate": 1.911254719524468e-05, + "loss": 1.5068, + "step": 17728 + }, + { + "epoch": 0.2216305407635191, + "grad_norm": 6.7569451332092285, + "learning_rate": 1.9112187749553832e-05, + "loss": 1.2757, + "step": 17730 + }, + { + "epoch": 0.22165554138853472, + "grad_norm": 0.12835681438446045, + "learning_rate": 1.9111828234465847e-05, + "loss": 0.1927, + "step": 17732 + }, + { + "epoch": 0.22168054201355034, + "grad_norm": 2.294922113418579, + "learning_rate": 1.911146864998347e-05, + "loss": 1.1459, + "step": 17734 + }, + { + "epoch": 0.22170554263856596, + "grad_norm": 3.314323902130127, + "learning_rate": 1.911110899610943e-05, + "loss": 1.4978, + "step": 17736 + }, + { + "epoch": 0.2217305432635816, + "grad_norm": 1.948129415512085, + "learning_rate": 1.9110749272846477e-05, + "loss": 0.0419, + "step": 17738 + }, + { + "epoch": 0.22175554388859722, + "grad_norm": 0.2528464198112488, + "learning_rate": 1.9110389480197346e-05, + "loss": 0.0557, + "step": 17740 + }, + { + "epoch": 0.22178054451361284, + "grad_norm": 4.9278106689453125, + "learning_rate": 1.9110029618164775e-05, + "loss": 1.2623, + "step": 17742 + }, + { + "epoch": 0.22180554513862846, + "grad_norm": 0.8799591660499573, + "learning_rate": 1.9109669686751508e-05, + "loss": 0.1247, + "step": 17744 + }, + { + "epoch": 0.22183054576364408, + "grad_norm": 4.222655773162842, + "learning_rate": 1.9109309685960286e-05, + "loss": 1.0288, + "step": 17746 + }, + { + "epoch": 0.22185554638865973, + "grad_norm": 3.6293745040893555, + "learning_rate": 1.9108949615793847e-05, + "loss": 0.8313, + "step": 17748 + }, + { + "epoch": 0.22188054701367535, + "grad_norm": 4.696975231170654, + "learning_rate": 1.9108589476254936e-05, + "loss": 1.397, + "step": 17750 + }, + { + "epoch": 0.22190554763869097, + "grad_norm": 0.16730768978595734, + "learning_rate": 1.9108229267346296e-05, + "loss": 0.8294, + "step": 17752 + }, + { + "epoch": 0.2219305482637066, + "grad_norm": 4.126861095428467, + "learning_rate": 1.9107868989070672e-05, + "loss": 0.701, + "step": 17754 + }, + { + "epoch": 0.2219555488887222, + "grad_norm": 3.9593465328216553, + "learning_rate": 1.9107508641430802e-05, + "loss": 1.213, + "step": 17756 + }, + { + "epoch": 0.22198054951373786, + "grad_norm": 2.9627914428710938, + "learning_rate": 1.9107148224429436e-05, + "loss": 0.9468, + "step": 17758 + }, + { + "epoch": 0.22200555013875348, + "grad_norm": 1.7011433839797974, + "learning_rate": 1.910678773806932e-05, + "loss": 0.5869, + "step": 17760 + }, + { + "epoch": 0.2220305507637691, + "grad_norm": 1.3046913146972656, + "learning_rate": 1.910642718235319e-05, + "loss": 0.4701, + "step": 17762 + }, + { + "epoch": 0.22205555138878472, + "grad_norm": 4.215795516967773, + "learning_rate": 1.9106066557283803e-05, + "loss": 1.9495, + "step": 17764 + }, + { + "epoch": 0.22208055201380034, + "grad_norm": 2.3859190940856934, + "learning_rate": 1.91057058628639e-05, + "loss": 1.0792, + "step": 17766 + }, + { + "epoch": 0.22210555263881598, + "grad_norm": 0.1362321674823761, + "learning_rate": 1.910534509909623e-05, + "loss": 0.478, + "step": 17768 + }, + { + "epoch": 0.2221305532638316, + "grad_norm": 0.006230425089597702, + "learning_rate": 1.9104984265983538e-05, + "loss": 0.057, + "step": 17770 + }, + { + "epoch": 0.22215555388884722, + "grad_norm": 3.339980125427246, + "learning_rate": 1.9104623363528573e-05, + "loss": 0.9618, + "step": 17772 + }, + { + "epoch": 0.22218055451386284, + "grad_norm": 3.1811506748199463, + "learning_rate": 1.9104262391734085e-05, + "loss": 2.188, + "step": 17774 + }, + { + "epoch": 0.22220555513887846, + "grad_norm": 5.968886852264404, + "learning_rate": 1.910390135060282e-05, + "loss": 1.9373, + "step": 17776 + }, + { + "epoch": 0.2222305557638941, + "grad_norm": 5.079289436340332, + "learning_rate": 1.9103540240137534e-05, + "loss": 1.6439, + "step": 17778 + }, + { + "epoch": 0.22225555638890973, + "grad_norm": 3.4912002086639404, + "learning_rate": 1.910317906034097e-05, + "loss": 0.9788, + "step": 17780 + }, + { + "epoch": 0.22228055701392535, + "grad_norm": 1.8424135446548462, + "learning_rate": 1.910281781121588e-05, + "loss": 1.0757, + "step": 17782 + }, + { + "epoch": 0.22230555763894097, + "grad_norm": 0.05746188387274742, + "learning_rate": 1.9102456492765017e-05, + "loss": 1.0284, + "step": 17784 + }, + { + "epoch": 0.2223305582639566, + "grad_norm": 2.7631006240844727, + "learning_rate": 1.9102095104991134e-05, + "loss": 0.7075, + "step": 17786 + }, + { + "epoch": 0.22235555888897224, + "grad_norm": 3.2150936126708984, + "learning_rate": 1.9101733647896983e-05, + "loss": 0.8731, + "step": 17788 + }, + { + "epoch": 0.22238055951398786, + "grad_norm": 5.398874759674072, + "learning_rate": 1.9101372121485314e-05, + "loss": 1.6557, + "step": 17790 + }, + { + "epoch": 0.22240556013900348, + "grad_norm": 7.120207786560059, + "learning_rate": 1.9101010525758883e-05, + "loss": 1.0616, + "step": 17792 + }, + { + "epoch": 0.2224305607640191, + "grad_norm": 2.4632906913757324, + "learning_rate": 1.910064886072044e-05, + "loss": 0.1768, + "step": 17794 + }, + { + "epoch": 0.22245556138903472, + "grad_norm": 2.2774770259857178, + "learning_rate": 1.9100287126372747e-05, + "loss": 0.5919, + "step": 17796 + }, + { + "epoch": 0.22248056201405036, + "grad_norm": 0.06513552367687225, + "learning_rate": 1.909992532271855e-05, + "loss": 0.5637, + "step": 17798 + }, + { + "epoch": 0.22250556263906598, + "grad_norm": 0.06382124125957489, + "learning_rate": 1.9099563449760612e-05, + "loss": 0.242, + "step": 17800 + }, + { + "epoch": 0.2225305632640816, + "grad_norm": 1.1240659952163696, + "learning_rate": 1.9099201507501682e-05, + "loss": 0.8094, + "step": 17802 + }, + { + "epoch": 0.22255556388909722, + "grad_norm": 1.5822046995162964, + "learning_rate": 1.9098839495944524e-05, + "loss": 0.052, + "step": 17804 + }, + { + "epoch": 0.22258056451411284, + "grad_norm": 2.4770374298095703, + "learning_rate": 1.9098477415091888e-05, + "loss": 1.1908, + "step": 17806 + }, + { + "epoch": 0.2226055651391285, + "grad_norm": 2.574681043624878, + "learning_rate": 1.9098115264946536e-05, + "loss": 0.5205, + "step": 17808 + }, + { + "epoch": 0.2226305657641441, + "grad_norm": 0.6751037836074829, + "learning_rate": 1.9097753045511226e-05, + "loss": 0.9135, + "step": 17810 + }, + { + "epoch": 0.22265556638915973, + "grad_norm": 2.3174712657928467, + "learning_rate": 1.9097390756788712e-05, + "loss": 0.4855, + "step": 17812 + }, + { + "epoch": 0.22268056701417535, + "grad_norm": 3.063109874725342, + "learning_rate": 1.909702839878176e-05, + "loss": 0.8059, + "step": 17814 + }, + { + "epoch": 0.22270556763919097, + "grad_norm": 2.274144172668457, + "learning_rate": 1.9096665971493123e-05, + "loss": 1.0395, + "step": 17816 + }, + { + "epoch": 0.22273056826420662, + "grad_norm": 2.0356712341308594, + "learning_rate": 1.9096303474925567e-05, + "loss": 1.2906, + "step": 17818 + }, + { + "epoch": 0.22275556888922224, + "grad_norm": 0.4014451205730438, + "learning_rate": 1.909594090908185e-05, + "loss": 0.0807, + "step": 17820 + }, + { + "epoch": 0.22278056951423786, + "grad_norm": 4.933962821960449, + "learning_rate": 1.9095578273964727e-05, + "loss": 1.4599, + "step": 17822 + }, + { + "epoch": 0.22280557013925348, + "grad_norm": 2.1450419425964355, + "learning_rate": 1.9095215569576972e-05, + "loss": 0.6646, + "step": 17824 + }, + { + "epoch": 0.2228305707642691, + "grad_norm": 5.824737071990967, + "learning_rate": 1.909485279592134e-05, + "loss": 1.8935, + "step": 17826 + }, + { + "epoch": 0.22285557138928475, + "grad_norm": 3.132087469100952, + "learning_rate": 1.9094489953000596e-05, + "loss": 1.1244, + "step": 17828 + }, + { + "epoch": 0.22288057201430037, + "grad_norm": 2.3930392265319824, + "learning_rate": 1.9094127040817506e-05, + "loss": 0.5116, + "step": 17830 + }, + { + "epoch": 0.22290557263931599, + "grad_norm": 2.970458745956421, + "learning_rate": 1.9093764059374822e-05, + "loss": 0.6516, + "step": 17832 + }, + { + "epoch": 0.2229305732643316, + "grad_norm": 3.090311288833618, + "learning_rate": 1.9093401008675324e-05, + "loss": 0.7149, + "step": 17834 + }, + { + "epoch": 0.22295557388934722, + "grad_norm": 3.0061535835266113, + "learning_rate": 1.9093037888721767e-05, + "loss": 1.4261, + "step": 17836 + }, + { + "epoch": 0.22298057451436287, + "grad_norm": 3.723846197128296, + "learning_rate": 1.909267469951692e-05, + "loss": 2.0542, + "step": 17838 + }, + { + "epoch": 0.2230055751393785, + "grad_norm": 2.8269238471984863, + "learning_rate": 1.9092311441063547e-05, + "loss": 1.2874, + "step": 17840 + }, + { + "epoch": 0.2230305757643941, + "grad_norm": 1.0038748979568481, + "learning_rate": 1.909194811336442e-05, + "loss": 0.8138, + "step": 17842 + }, + { + "epoch": 0.22305557638940973, + "grad_norm": 1.4719791412353516, + "learning_rate": 1.9091584716422298e-05, + "loss": 0.6866, + "step": 17844 + }, + { + "epoch": 0.22308057701442535, + "grad_norm": 0.003863678313791752, + "learning_rate": 1.9091221250239952e-05, + "loss": 0.3111, + "step": 17846 + }, + { + "epoch": 0.223105577639441, + "grad_norm": 5.105356693267822, + "learning_rate": 1.9090857714820153e-05, + "loss": 1.7131, + "step": 17848 + }, + { + "epoch": 0.22313057826445662, + "grad_norm": 3.7403969764709473, + "learning_rate": 1.9090494110165667e-05, + "loss": 1.4903, + "step": 17850 + }, + { + "epoch": 0.22315557888947224, + "grad_norm": 4.432741641998291, + "learning_rate": 1.9090130436279262e-05, + "loss": 1.6344, + "step": 17852 + }, + { + "epoch": 0.22318057951448786, + "grad_norm": 3.7024266719818115, + "learning_rate": 1.908976669316371e-05, + "loss": 0.6483, + "step": 17854 + }, + { + "epoch": 0.22320558013950348, + "grad_norm": 1.9809141159057617, + "learning_rate": 1.9089402880821782e-05, + "loss": 0.6602, + "step": 17856 + }, + { + "epoch": 0.22323058076451913, + "grad_norm": 3.073742628097534, + "learning_rate": 1.9089038999256244e-05, + "loss": 1.1093, + "step": 17858 + }, + { + "epoch": 0.22325558138953475, + "grad_norm": 4.587313175201416, + "learning_rate": 1.908867504846987e-05, + "loss": 1.2351, + "step": 17860 + }, + { + "epoch": 0.22328058201455037, + "grad_norm": 3.7960939407348633, + "learning_rate": 1.9088311028465435e-05, + "loss": 0.257, + "step": 17862 + }, + { + "epoch": 0.22330558263956599, + "grad_norm": 5.488715171813965, + "learning_rate": 1.9087946939245704e-05, + "loss": 1.8795, + "step": 17864 + }, + { + "epoch": 0.2233305832645816, + "grad_norm": 1.8735789060592651, + "learning_rate": 1.9087582780813457e-05, + "loss": 0.0782, + "step": 17866 + }, + { + "epoch": 0.22335558388959725, + "grad_norm": 3.9606375694274902, + "learning_rate": 1.9087218553171466e-05, + "loss": 2.2479, + "step": 17868 + }, + { + "epoch": 0.22338058451461287, + "grad_norm": 2.4523892402648926, + "learning_rate": 1.9086854256322503e-05, + "loss": 1.1297, + "step": 17870 + }, + { + "epoch": 0.2234055851396285, + "grad_norm": 2.9593958854675293, + "learning_rate": 1.9086489890269342e-05, + "loss": 1.0717, + "step": 17872 + }, + { + "epoch": 0.2234305857646441, + "grad_norm": 0.006900089792907238, + "learning_rate": 1.9086125455014758e-05, + "loss": 0.381, + "step": 17874 + }, + { + "epoch": 0.22345558638965973, + "grad_norm": 0.32916882634162903, + "learning_rate": 1.908576095056153e-05, + "loss": 0.0337, + "step": 17876 + }, + { + "epoch": 0.22348058701467538, + "grad_norm": 2.848590612411499, + "learning_rate": 1.9085396376912427e-05, + "loss": 0.7549, + "step": 17878 + }, + { + "epoch": 0.223505587639691, + "grad_norm": 4.58698844909668, + "learning_rate": 1.9085031734070234e-05, + "loss": 1.1492, + "step": 17880 + }, + { + "epoch": 0.22353058826470662, + "grad_norm": 4.968616485595703, + "learning_rate": 1.9084667022037724e-05, + "loss": 2.1717, + "step": 17882 + }, + { + "epoch": 0.22355558888972224, + "grad_norm": 4.16237211227417, + "learning_rate": 1.9084302240817672e-05, + "loss": 1.3526, + "step": 17884 + }, + { + "epoch": 0.22358058951473786, + "grad_norm": 0.9356205463409424, + "learning_rate": 1.908393739041286e-05, + "loss": 0.5265, + "step": 17886 + }, + { + "epoch": 0.2236055901397535, + "grad_norm": 2.362431287765503, + "learning_rate": 1.9083572470826067e-05, + "loss": 0.1553, + "step": 17888 + }, + { + "epoch": 0.22363059076476913, + "grad_norm": 4.301016330718994, + "learning_rate": 1.908320748206007e-05, + "loss": 1.7983, + "step": 17890 + }, + { + "epoch": 0.22365559138978475, + "grad_norm": 2.79032301902771, + "learning_rate": 1.9082842424117648e-05, + "loss": 0.835, + "step": 17892 + }, + { + "epoch": 0.22368059201480037, + "grad_norm": 2.934292793273926, + "learning_rate": 1.9082477297001586e-05, + "loss": 1.8193, + "step": 17894 + }, + { + "epoch": 0.223705592639816, + "grad_norm": 6.856455326080322, + "learning_rate": 1.9082112100714654e-05, + "loss": 0.4706, + "step": 17896 + }, + { + "epoch": 0.22373059326483163, + "grad_norm": 4.983741760253906, + "learning_rate": 1.9081746835259646e-05, + "loss": 1.1212, + "step": 17898 + }, + { + "epoch": 0.22375559388984725, + "grad_norm": 4.033047676086426, + "learning_rate": 1.908138150063934e-05, + "loss": 0.2127, + "step": 17900 + }, + { + "epoch": 0.22378059451486287, + "grad_norm": 3.6575489044189453, + "learning_rate": 1.9081016096856513e-05, + "loss": 1.3201, + "step": 17902 + }, + { + "epoch": 0.2238055951398785, + "grad_norm": 4.102745532989502, + "learning_rate": 1.908065062391395e-05, + "loss": 0.2483, + "step": 17904 + }, + { + "epoch": 0.2238305957648941, + "grad_norm": 4.051555633544922, + "learning_rate": 1.908028508181444e-05, + "loss": 1.7288, + "step": 17906 + }, + { + "epoch": 0.22385559638990976, + "grad_norm": 4.178441524505615, + "learning_rate": 1.9079919470560764e-05, + "loss": 1.6312, + "step": 17908 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 3.767415761947632, + "learning_rate": 1.90795537901557e-05, + "loss": 2.1719, + "step": 17910 + }, + { + "epoch": 0.223905597639941, + "grad_norm": 3.8240444660186768, + "learning_rate": 1.9079188040602042e-05, + "loss": 1.1392, + "step": 17912 + }, + { + "epoch": 0.22393059826495662, + "grad_norm": 2.3937504291534424, + "learning_rate": 1.9078822221902572e-05, + "loss": 0.679, + "step": 17914 + }, + { + "epoch": 0.22395559888997224, + "grad_norm": 3.18898868560791, + "learning_rate": 1.9078456334060074e-05, + "loss": 0.4739, + "step": 17916 + }, + { + "epoch": 0.2239805995149879, + "grad_norm": 2.579470157623291, + "learning_rate": 1.907809037707734e-05, + "loss": 0.5215, + "step": 17918 + }, + { + "epoch": 0.2240056001400035, + "grad_norm": 1.7747936248779297, + "learning_rate": 1.907772435095715e-05, + "loss": 1.2848, + "step": 17920 + }, + { + "epoch": 0.22403060076501913, + "grad_norm": 2.7937710285186768, + "learning_rate": 1.9077358255702296e-05, + "loss": 1.5581, + "step": 17922 + }, + { + "epoch": 0.22405560139003475, + "grad_norm": 15.271629333496094, + "learning_rate": 1.907699209131556e-05, + "loss": 1.5078, + "step": 17924 + }, + { + "epoch": 0.22408060201505037, + "grad_norm": 4.431042671203613, + "learning_rate": 1.9076625857799744e-05, + "loss": 2.5371, + "step": 17926 + }, + { + "epoch": 0.22410560264006602, + "grad_norm": 0.01604302041232586, + "learning_rate": 1.9076259555157625e-05, + "loss": 0.4456, + "step": 17928 + }, + { + "epoch": 0.22413060326508163, + "grad_norm": 2.9544479846954346, + "learning_rate": 1.9075893183391997e-05, + "loss": 1.2224, + "step": 17930 + }, + { + "epoch": 0.22415560389009725, + "grad_norm": 2.2468667030334473, + "learning_rate": 1.907552674250565e-05, + "loss": 0.4421, + "step": 17932 + }, + { + "epoch": 0.22418060451511287, + "grad_norm": 4.035822868347168, + "learning_rate": 1.9075160232501376e-05, + "loss": 1.7905, + "step": 17934 + }, + { + "epoch": 0.2242056051401285, + "grad_norm": 4.300164699554443, + "learning_rate": 1.9074793653381964e-05, + "loss": 1.1758, + "step": 17936 + }, + { + "epoch": 0.22423060576514414, + "grad_norm": 2.203930616378784, + "learning_rate": 1.907442700515021e-05, + "loss": 0.909, + "step": 17938 + }, + { + "epoch": 0.22425560639015976, + "grad_norm": 5.422328948974609, + "learning_rate": 1.90740602878089e-05, + "loss": 0.8967, + "step": 17940 + }, + { + "epoch": 0.22428060701517538, + "grad_norm": 1.944762110710144, + "learning_rate": 1.907369350136083e-05, + "loss": 0.8984, + "step": 17942 + }, + { + "epoch": 0.224305607640191, + "grad_norm": 0.05145464465022087, + "learning_rate": 1.9073326645808792e-05, + "loss": 0.5676, + "step": 17944 + }, + { + "epoch": 0.22433060826520662, + "grad_norm": 0.008527850732207298, + "learning_rate": 1.9072959721155584e-05, + "loss": 1.0923, + "step": 17946 + }, + { + "epoch": 0.22435560889022227, + "grad_norm": 0.03657565638422966, + "learning_rate": 1.9072592727403996e-05, + "loss": 0.399, + "step": 17948 + }, + { + "epoch": 0.2243806095152379, + "grad_norm": 4.707253456115723, + "learning_rate": 1.9072225664556824e-05, + "loss": 1.2286, + "step": 17950 + }, + { + "epoch": 0.2244056101402535, + "grad_norm": 3.2760932445526123, + "learning_rate": 1.9071858532616866e-05, + "loss": 1.2662, + "step": 17952 + }, + { + "epoch": 0.22443061076526913, + "grad_norm": 2.6921768188476562, + "learning_rate": 1.9071491331586918e-05, + "loss": 1.9023, + "step": 17954 + }, + { + "epoch": 0.22445561139028475, + "grad_norm": 2.543466567993164, + "learning_rate": 1.9071124061469774e-05, + "loss": 0.6882, + "step": 17956 + }, + { + "epoch": 0.2244806120153004, + "grad_norm": 2.6198277473449707, + "learning_rate": 1.907075672226823e-05, + "loss": 1.0696, + "step": 17958 + }, + { + "epoch": 0.22450561264031602, + "grad_norm": 0.35937777161598206, + "learning_rate": 1.9070389313985088e-05, + "loss": 1.1687, + "step": 17960 + }, + { + "epoch": 0.22453061326533164, + "grad_norm": 3.3808417320251465, + "learning_rate": 1.907002183662314e-05, + "loss": 0.883, + "step": 17962 + }, + { + "epoch": 0.22455561389034726, + "grad_norm": 2.2633328437805176, + "learning_rate": 1.9069654290185192e-05, + "loss": 0.6754, + "step": 17964 + }, + { + "epoch": 0.22458061451536288, + "grad_norm": 10.289257049560547, + "learning_rate": 1.9069286674674034e-05, + "loss": 0.5665, + "step": 17966 + }, + { + "epoch": 0.22460561514037852, + "grad_norm": 0.052203841507434845, + "learning_rate": 1.9068918990092474e-05, + "loss": 0.2599, + "step": 17968 + }, + { + "epoch": 0.22463061576539414, + "grad_norm": 3.4320757389068604, + "learning_rate": 1.9068551236443313e-05, + "loss": 1.6186, + "step": 17970 + }, + { + "epoch": 0.22465561639040976, + "grad_norm": 5.870650768280029, + "learning_rate": 1.9068183413729345e-05, + "loss": 0.9677, + "step": 17972 + }, + { + "epoch": 0.22468061701542538, + "grad_norm": 3.478410243988037, + "learning_rate": 1.9067815521953376e-05, + "loss": 0.8804, + "step": 17974 + }, + { + "epoch": 0.224705617640441, + "grad_norm": 5.806146144866943, + "learning_rate": 1.9067447561118207e-05, + "loss": 0.9964, + "step": 17976 + }, + { + "epoch": 0.22473061826545665, + "grad_norm": 1.1198433637619019, + "learning_rate": 1.9067079531226635e-05, + "loss": 0.8262, + "step": 17978 + }, + { + "epoch": 0.22475561889047227, + "grad_norm": 4.287750720977783, + "learning_rate": 1.9066711432281474e-05, + "loss": 1.5467, + "step": 17980 + }, + { + "epoch": 0.2247806195154879, + "grad_norm": 2.249211549758911, + "learning_rate": 1.9066343264285516e-05, + "loss": 0.7774, + "step": 17982 + }, + { + "epoch": 0.2248056201405035, + "grad_norm": 0.6586966514587402, + "learning_rate": 1.906597502724157e-05, + "loss": 0.5052, + "step": 17984 + }, + { + "epoch": 0.22483062076551913, + "grad_norm": 0.04245049133896828, + "learning_rate": 1.9065606721152442e-05, + "loss": 1.1574, + "step": 17986 + }, + { + "epoch": 0.22485562139053478, + "grad_norm": 0.038102637976408005, + "learning_rate": 1.9065238346020937e-05, + "loss": 0.586, + "step": 17988 + }, + { + "epoch": 0.2248806220155504, + "grad_norm": 0.03122873604297638, + "learning_rate": 1.9064869901849853e-05, + "loss": 0.6689, + "step": 17990 + }, + { + "epoch": 0.22490562264056602, + "grad_norm": 3.7095537185668945, + "learning_rate": 1.9064501388642006e-05, + "loss": 1.8718, + "step": 17992 + }, + { + "epoch": 0.22493062326558164, + "grad_norm": 3.0910656452178955, + "learning_rate": 1.90641328064002e-05, + "loss": 0.7713, + "step": 17994 + }, + { + "epoch": 0.22495562389059726, + "grad_norm": 3.8680906295776367, + "learning_rate": 1.9063764155127237e-05, + "loss": 0.7729, + "step": 17996 + }, + { + "epoch": 0.2249806245156129, + "grad_norm": 0.00834348052740097, + "learning_rate": 1.906339543482593e-05, + "loss": 0.6819, + "step": 17998 + }, + { + "epoch": 0.22500562514062852, + "grad_norm": 7.052984714508057, + "learning_rate": 1.906302664549908e-05, + "loss": 1.3503, + "step": 18000 + }, + { + "epoch": 0.22503062576564414, + "grad_norm": 5.686069965362549, + "learning_rate": 1.9062657787149505e-05, + "loss": 1.3779, + "step": 18002 + }, + { + "epoch": 0.22505562639065976, + "grad_norm": 4.541360855102539, + "learning_rate": 1.906228885978001e-05, + "loss": 1.3804, + "step": 18004 + }, + { + "epoch": 0.22508062701567538, + "grad_norm": 4.242555618286133, + "learning_rate": 1.9061919863393402e-05, + "loss": 0.7288, + "step": 18006 + }, + { + "epoch": 0.22510562764069103, + "grad_norm": 5.240377426147461, + "learning_rate": 1.9061550797992495e-05, + "loss": 0.4653, + "step": 18008 + }, + { + "epoch": 0.22513062826570665, + "grad_norm": 5.253702163696289, + "learning_rate": 1.9061181663580103e-05, + "loss": 1.1856, + "step": 18010 + }, + { + "epoch": 0.22515562889072227, + "grad_norm": 0.17309921979904175, + "learning_rate": 1.906081246015903e-05, + "loss": 1.5072, + "step": 18012 + }, + { + "epoch": 0.2251806295157379, + "grad_norm": 4.982430458068848, + "learning_rate": 1.906044318773209e-05, + "loss": 1.2778, + "step": 18014 + }, + { + "epoch": 0.2252056301407535, + "grad_norm": 3.343425750732422, + "learning_rate": 1.906007384630209e-05, + "loss": 0.7363, + "step": 18016 + }, + { + "epoch": 0.22523063076576916, + "grad_norm": 1.2413572072982788, + "learning_rate": 1.9059704435871856e-05, + "loss": 0.5789, + "step": 18018 + }, + { + "epoch": 0.22525563139078478, + "grad_norm": 3.105952739715576, + "learning_rate": 1.905933495644419e-05, + "loss": 0.8267, + "step": 18020 + }, + { + "epoch": 0.2252806320158004, + "grad_norm": 0.03501225635409355, + "learning_rate": 1.9058965408021916e-05, + "loss": 0.0019, + "step": 18022 + }, + { + "epoch": 0.22530563264081602, + "grad_norm": 2.847510814666748, + "learning_rate": 1.9058595790607838e-05, + "loss": 1.7695, + "step": 18024 + }, + { + "epoch": 0.22533063326583164, + "grad_norm": 5.780735969543457, + "learning_rate": 1.905822610420478e-05, + "loss": 1.3461, + "step": 18026 + }, + { + "epoch": 0.22535563389084728, + "grad_norm": 0.4126316010951996, + "learning_rate": 1.9057856348815546e-05, + "loss": 0.9975, + "step": 18028 + }, + { + "epoch": 0.2253806345158629, + "grad_norm": 2.5809545516967773, + "learning_rate": 1.9057486524442963e-05, + "loss": 0.7184, + "step": 18030 + }, + { + "epoch": 0.22540563514087852, + "grad_norm": 0.033696990460157394, + "learning_rate": 1.9057116631089844e-05, + "loss": 0.0006, + "step": 18032 + }, + { + "epoch": 0.22543063576589414, + "grad_norm": 1.6969283819198608, + "learning_rate": 1.9056746668759003e-05, + "loss": 0.7851, + "step": 18034 + }, + { + "epoch": 0.22545563639090976, + "grad_norm": 4.394291400909424, + "learning_rate": 1.9056376637453263e-05, + "loss": 1.4026, + "step": 18036 + }, + { + "epoch": 0.2254806370159254, + "grad_norm": 3.9771687984466553, + "learning_rate": 1.9056006537175435e-05, + "loss": 1.4751, + "step": 18038 + }, + { + "epoch": 0.22550563764094103, + "grad_norm": 2.6855430603027344, + "learning_rate": 1.9055636367928343e-05, + "loss": 0.9194, + "step": 18040 + }, + { + "epoch": 0.22553063826595665, + "grad_norm": 3.3286941051483154, + "learning_rate": 1.905526612971481e-05, + "loss": 0.7821, + "step": 18042 + }, + { + "epoch": 0.22555563889097227, + "grad_norm": 3.1408395767211914, + "learning_rate": 1.9054895822537648e-05, + "loss": 1.5045, + "step": 18044 + }, + { + "epoch": 0.2255806395159879, + "grad_norm": 3.633110523223877, + "learning_rate": 1.905452544639968e-05, + "loss": 1.6791, + "step": 18046 + }, + { + "epoch": 0.22560564014100354, + "grad_norm": 0.053437985479831696, + "learning_rate": 1.9054155001303724e-05, + "loss": 0.3986, + "step": 18048 + }, + { + "epoch": 0.22563064076601916, + "grad_norm": 2.5211167335510254, + "learning_rate": 1.9053784487252606e-05, + "loss": 1.3289, + "step": 18050 + }, + { + "epoch": 0.22565564139103478, + "grad_norm": 3.1000614166259766, + "learning_rate": 1.9053413904249144e-05, + "loss": 1.3383, + "step": 18052 + }, + { + "epoch": 0.2256806420160504, + "grad_norm": 5.641939163208008, + "learning_rate": 1.9053043252296163e-05, + "loss": 1.5905, + "step": 18054 + }, + { + "epoch": 0.22570564264106602, + "grad_norm": 3.5832972526550293, + "learning_rate": 1.905267253139649e-05, + "loss": 1.4929, + "step": 18056 + }, + { + "epoch": 0.22573064326608167, + "grad_norm": 0.32754844427108765, + "learning_rate": 1.9052301741552934e-05, + "loss": 0.615, + "step": 18058 + }, + { + "epoch": 0.22575564389109729, + "grad_norm": 4.086910724639893, + "learning_rate": 1.9051930882768334e-05, + "loss": 0.9356, + "step": 18060 + }, + { + "epoch": 0.2257806445161129, + "grad_norm": 2.4357006549835205, + "learning_rate": 1.9051559955045508e-05, + "loss": 0.4369, + "step": 18062 + }, + { + "epoch": 0.22580564514112852, + "grad_norm": 0.02032286301255226, + "learning_rate": 1.905118895838728e-05, + "loss": 0.3528, + "step": 18064 + }, + { + "epoch": 0.22583064576614414, + "grad_norm": 0.023609697818756104, + "learning_rate": 1.9050817892796474e-05, + "loss": 0.0717, + "step": 18066 + }, + { + "epoch": 0.2258556463911598, + "grad_norm": 6.408613681793213, + "learning_rate": 1.9050446758275925e-05, + "loss": 1.1791, + "step": 18068 + }, + { + "epoch": 0.2258806470161754, + "grad_norm": 3.974893093109131, + "learning_rate": 1.9050075554828448e-05, + "loss": 1.111, + "step": 18070 + }, + { + "epoch": 0.22590564764119103, + "grad_norm": 3.978647232055664, + "learning_rate": 1.9049704282456878e-05, + "loss": 0.1657, + "step": 18072 + }, + { + "epoch": 0.22593064826620665, + "grad_norm": 5.636742115020752, + "learning_rate": 1.904933294116404e-05, + "loss": 1.1025, + "step": 18074 + }, + { + "epoch": 0.22595564889122227, + "grad_norm": 3.1193056106567383, + "learning_rate": 1.904896153095276e-05, + "loss": 1.2225, + "step": 18076 + }, + { + "epoch": 0.22598064951623792, + "grad_norm": 3.6835546493530273, + "learning_rate": 1.9048590051825872e-05, + "loss": 1.3373, + "step": 18078 + }, + { + "epoch": 0.22600565014125354, + "grad_norm": 2.920319080352783, + "learning_rate": 1.90482185037862e-05, + "loss": 0.8293, + "step": 18080 + }, + { + "epoch": 0.22603065076626916, + "grad_norm": 5.668987274169922, + "learning_rate": 1.9047846886836574e-05, + "loss": 1.1393, + "step": 18082 + }, + { + "epoch": 0.22605565139128478, + "grad_norm": 4.585434436798096, + "learning_rate": 1.9047475200979826e-05, + "loss": 1.0056, + "step": 18084 + }, + { + "epoch": 0.2260806520163004, + "grad_norm": 5.283871173858643, + "learning_rate": 1.9047103446218787e-05, + "loss": 0.1291, + "step": 18086 + }, + { + "epoch": 0.22610565264131605, + "grad_norm": 3.5360751152038574, + "learning_rate": 1.904673162255629e-05, + "loss": 1.7936, + "step": 18088 + }, + { + "epoch": 0.22613065326633167, + "grad_norm": 0.2176230102777481, + "learning_rate": 1.904635972999516e-05, + "loss": 0.8074, + "step": 18090 + }, + { + "epoch": 0.22615565389134729, + "grad_norm": 2.468393087387085, + "learning_rate": 1.9045987768538236e-05, + "loss": 0.5084, + "step": 18092 + }, + { + "epoch": 0.2261806545163629, + "grad_norm": 5.426553726196289, + "learning_rate": 1.904561573818835e-05, + "loss": 1.3164, + "step": 18094 + }, + { + "epoch": 0.22620565514137853, + "grad_norm": 3.8211820125579834, + "learning_rate": 1.904524363894833e-05, + "loss": 0.3712, + "step": 18096 + }, + { + "epoch": 0.22623065576639417, + "grad_norm": 4.255768299102783, + "learning_rate": 1.9044871470821016e-05, + "loss": 1.5809, + "step": 18098 + }, + { + "epoch": 0.2262556563914098, + "grad_norm": 3.7630629539489746, + "learning_rate": 1.904449923380924e-05, + "loss": 0.5865, + "step": 18100 + }, + { + "epoch": 0.2262806570164254, + "grad_norm": 4.210865497589111, + "learning_rate": 1.9044126927915836e-05, + "loss": 1.7443, + "step": 18102 + }, + { + "epoch": 0.22630565764144103, + "grad_norm": 2.5946567058563232, + "learning_rate": 1.9043754553143638e-05, + "loss": 1.0472, + "step": 18104 + }, + { + "epoch": 0.22633065826645665, + "grad_norm": 2.5525946617126465, + "learning_rate": 1.9043382109495487e-05, + "loss": 1.4811, + "step": 18106 + }, + { + "epoch": 0.2263556588914723, + "grad_norm": 1.0240778923034668, + "learning_rate": 1.9043009596974217e-05, + "loss": 0.0515, + "step": 18108 + }, + { + "epoch": 0.22638065951648792, + "grad_norm": 0.8340214490890503, + "learning_rate": 1.9042637015582663e-05, + "loss": 0.0338, + "step": 18110 + }, + { + "epoch": 0.22640566014150354, + "grad_norm": 2.6682403087615967, + "learning_rate": 1.9042264365323664e-05, + "loss": 0.2429, + "step": 18112 + }, + { + "epoch": 0.22643066076651916, + "grad_norm": 3.0296711921691895, + "learning_rate": 1.904189164620006e-05, + "loss": 0.8717, + "step": 18114 + }, + { + "epoch": 0.22645566139153478, + "grad_norm": 3.773561477661133, + "learning_rate": 1.904151885821469e-05, + "loss": 1.1795, + "step": 18116 + }, + { + "epoch": 0.22648066201655043, + "grad_norm": 2.187197685241699, + "learning_rate": 1.9041146001370386e-05, + "loss": 0.2453, + "step": 18118 + }, + { + "epoch": 0.22650566264156605, + "grad_norm": 2.679408311843872, + "learning_rate": 1.9040773075669995e-05, + "loss": 0.7398, + "step": 18120 + }, + { + "epoch": 0.22653066326658167, + "grad_norm": 2.7723946571350098, + "learning_rate": 1.9040400081116356e-05, + "loss": 0.7972, + "step": 18122 + }, + { + "epoch": 0.2265556638915973, + "grad_norm": 0.21740642189979553, + "learning_rate": 1.9040027017712307e-05, + "loss": 0.7102, + "step": 18124 + }, + { + "epoch": 0.2265806645166129, + "grad_norm": 10.333670616149902, + "learning_rate": 1.903965388546069e-05, + "loss": 1.8315, + "step": 18126 + }, + { + "epoch": 0.22660566514162855, + "grad_norm": 3.051222562789917, + "learning_rate": 1.9039280684364347e-05, + "loss": 1.5564, + "step": 18128 + }, + { + "epoch": 0.22663066576664417, + "grad_norm": 5.086068630218506, + "learning_rate": 1.9038907414426124e-05, + "loss": 1.7914, + "step": 18130 + }, + { + "epoch": 0.2266556663916598, + "grad_norm": 5.0358662605285645, + "learning_rate": 1.9038534075648855e-05, + "loss": 1.0299, + "step": 18132 + }, + { + "epoch": 0.2266806670166754, + "grad_norm": 1.8086566925048828, + "learning_rate": 1.903816066803539e-05, + "loss": 0.5879, + "step": 18134 + }, + { + "epoch": 0.22670566764169103, + "grad_norm": 4.504037857055664, + "learning_rate": 1.9037787191588573e-05, + "loss": 1.5143, + "step": 18136 + }, + { + "epoch": 0.22673066826670668, + "grad_norm": 0.004824908450245857, + "learning_rate": 1.9037413646311245e-05, + "loss": 0.5528, + "step": 18138 + }, + { + "epoch": 0.2267556688917223, + "grad_norm": 4.093486785888672, + "learning_rate": 1.9037040032206256e-05, + "loss": 0.7628, + "step": 18140 + }, + { + "epoch": 0.22678066951673792, + "grad_norm": 0.016275832429528236, + "learning_rate": 1.903666634927645e-05, + "loss": 1.0054, + "step": 18142 + }, + { + "epoch": 0.22680567014175354, + "grad_norm": 5.47177267074585, + "learning_rate": 1.9036292597524665e-05, + "loss": 0.2497, + "step": 18144 + }, + { + "epoch": 0.22683067076676916, + "grad_norm": 5.332368850708008, + "learning_rate": 1.9035918776953755e-05, + "loss": 0.8898, + "step": 18146 + }, + { + "epoch": 0.2268556713917848, + "grad_norm": 0.013258250430226326, + "learning_rate": 1.903554488756657e-05, + "loss": 0.5582, + "step": 18148 + }, + { + "epoch": 0.22688067201680043, + "grad_norm": 2.669360876083374, + "learning_rate": 1.9035170929365947e-05, + "loss": 1.0984, + "step": 18150 + }, + { + "epoch": 0.22690567264181605, + "grad_norm": 0.014966858550906181, + "learning_rate": 1.903479690235474e-05, + "loss": 0.1316, + "step": 18152 + }, + { + "epoch": 0.22693067326683167, + "grad_norm": 5.0207672119140625, + "learning_rate": 1.9034422806535803e-05, + "loss": 1.0808, + "step": 18154 + }, + { + "epoch": 0.2269556738918473, + "grad_norm": 2.0480475425720215, + "learning_rate": 1.9034048641911975e-05, + "loss": 1.1046, + "step": 18156 + }, + { + "epoch": 0.22698067451686293, + "grad_norm": 0.026737134903669357, + "learning_rate": 1.903367440848611e-05, + "loss": 0.3028, + "step": 18158 + }, + { + "epoch": 0.22700567514187855, + "grad_norm": 2.722750425338745, + "learning_rate": 1.903330010626106e-05, + "loss": 0.4715, + "step": 18160 + }, + { + "epoch": 0.22703067576689417, + "grad_norm": 0.06408143043518066, + "learning_rate": 1.9032925735239672e-05, + "loss": 0.7226, + "step": 18162 + }, + { + "epoch": 0.2270556763919098, + "grad_norm": 1.1459599733352661, + "learning_rate": 1.90325512954248e-05, + "loss": 0.0863, + "step": 18164 + }, + { + "epoch": 0.22708067701692541, + "grad_norm": 0.22418750822544098, + "learning_rate": 1.9032176786819298e-05, + "loss": 0.355, + "step": 18166 + }, + { + "epoch": 0.22710567764194106, + "grad_norm": 3.7106573581695557, + "learning_rate": 1.903180220942601e-05, + "loss": 1.1346, + "step": 18168 + }, + { + "epoch": 0.22713067826695668, + "grad_norm": 1.878454566001892, + "learning_rate": 1.903142756324779e-05, + "loss": 0.425, + "step": 18170 + }, + { + "epoch": 0.2271556788919723, + "grad_norm": 13.65670108795166, + "learning_rate": 1.9031052848287505e-05, + "loss": 0.8794, + "step": 18172 + }, + { + "epoch": 0.22718067951698792, + "grad_norm": 5.335409164428711, + "learning_rate": 1.903067806454799e-05, + "loss": 1.121, + "step": 18174 + }, + { + "epoch": 0.22720568014200354, + "grad_norm": 3.7266364097595215, + "learning_rate": 1.9030303212032114e-05, + "loss": 0.7524, + "step": 18176 + }, + { + "epoch": 0.2272306807670192, + "grad_norm": 0.0484795942902565, + "learning_rate": 1.9029928290742723e-05, + "loss": 0.0022, + "step": 18178 + }, + { + "epoch": 0.2272556813920348, + "grad_norm": 3.8119025230407715, + "learning_rate": 1.9029553300682676e-05, + "loss": 1.1174, + "step": 18180 + }, + { + "epoch": 0.22728068201705043, + "grad_norm": 3.7323050498962402, + "learning_rate": 1.9029178241854826e-05, + "loss": 1.1351, + "step": 18182 + }, + { + "epoch": 0.22730568264206605, + "grad_norm": 0.015915486961603165, + "learning_rate": 1.9028803114262032e-05, + "loss": 0.6138, + "step": 18184 + }, + { + "epoch": 0.22733068326708167, + "grad_norm": 0.24602654576301575, + "learning_rate": 1.902842791790715e-05, + "loss": 0.5412, + "step": 18186 + }, + { + "epoch": 0.22735568389209732, + "grad_norm": 2.9724652767181396, + "learning_rate": 1.902805265279304e-05, + "loss": 1.2996, + "step": 18188 + }, + { + "epoch": 0.22738068451711294, + "grad_norm": 2.373429298400879, + "learning_rate": 1.9027677318922556e-05, + "loss": 0.8403, + "step": 18190 + }, + { + "epoch": 0.22740568514212856, + "grad_norm": 1.316459059715271, + "learning_rate": 1.902730191629856e-05, + "loss": 0.1196, + "step": 18192 + }, + { + "epoch": 0.22743068576714418, + "grad_norm": 1.9453679323196411, + "learning_rate": 1.9026926444923907e-05, + "loss": 1.4143, + "step": 18194 + }, + { + "epoch": 0.2274556863921598, + "grad_norm": 6.929661750793457, + "learning_rate": 1.902655090480146e-05, + "loss": 1.0143, + "step": 18196 + }, + { + "epoch": 0.22748068701717544, + "grad_norm": 0.13008874654769897, + "learning_rate": 1.9026175295934076e-05, + "loss": 0.6596, + "step": 18198 + }, + { + "epoch": 0.22750568764219106, + "grad_norm": 1.7763460874557495, + "learning_rate": 1.902579961832462e-05, + "loss": 1.0201, + "step": 18200 + }, + { + "epoch": 0.22753068826720668, + "grad_norm": 3.5438852310180664, + "learning_rate": 1.902542387197595e-05, + "loss": 1.0691, + "step": 18202 + }, + { + "epoch": 0.2275556888922223, + "grad_norm": 1.7395020723342896, + "learning_rate": 1.9025048056890923e-05, + "loss": 0.7447, + "step": 18204 + }, + { + "epoch": 0.22758068951723792, + "grad_norm": 0.7449834942817688, + "learning_rate": 1.9024672173072414e-05, + "loss": 0.5505, + "step": 18206 + }, + { + "epoch": 0.22760569014225357, + "grad_norm": 4.179600715637207, + "learning_rate": 1.9024296220523272e-05, + "loss": 0.9306, + "step": 18208 + }, + { + "epoch": 0.2276306907672692, + "grad_norm": 3.289525270462036, + "learning_rate": 1.902392019924637e-05, + "loss": 0.8597, + "step": 18210 + }, + { + "epoch": 0.2276556913922848, + "grad_norm": 1.1702684164047241, + "learning_rate": 1.902354410924457e-05, + "loss": 0.0486, + "step": 18212 + }, + { + "epoch": 0.22768069201730043, + "grad_norm": 2.7350003719329834, + "learning_rate": 1.9023167950520726e-05, + "loss": 0.4772, + "step": 18214 + }, + { + "epoch": 0.22770569264231605, + "grad_norm": 0.1682155430316925, + "learning_rate": 1.9022791723077714e-05, + "loss": 0.316, + "step": 18216 + }, + { + "epoch": 0.2277306932673317, + "grad_norm": 2.9747395515441895, + "learning_rate": 1.90224154269184e-05, + "loss": 0.729, + "step": 18218 + }, + { + "epoch": 0.22775569389234732, + "grad_norm": 2.7491979598999023, + "learning_rate": 1.902203906204564e-05, + "loss": 0.4864, + "step": 18220 + }, + { + "epoch": 0.22778069451736294, + "grad_norm": 3.8103415966033936, + "learning_rate": 1.9021662628462313e-05, + "loss": 0.6273, + "step": 18222 + }, + { + "epoch": 0.22780569514237856, + "grad_norm": 1.9052647352218628, + "learning_rate": 1.9021286126171275e-05, + "loss": 0.6047, + "step": 18224 + }, + { + "epoch": 0.22783069576739418, + "grad_norm": 2.7824323177337646, + "learning_rate": 1.9020909555175396e-05, + "loss": 1.6802, + "step": 18226 + }, + { + "epoch": 0.22785569639240982, + "grad_norm": 0.027255097404122353, + "learning_rate": 1.902053291547755e-05, + "loss": 0.0622, + "step": 18228 + }, + { + "epoch": 0.22788069701742544, + "grad_norm": 4.495183944702148, + "learning_rate": 1.9020156207080597e-05, + "loss": 1.4612, + "step": 18230 + }, + { + "epoch": 0.22790569764244106, + "grad_norm": 2.7721521854400635, + "learning_rate": 1.9019779429987415e-05, + "loss": 0.8847, + "step": 18232 + }, + { + "epoch": 0.22793069826745668, + "grad_norm": 1.1920969486236572, + "learning_rate": 1.9019402584200865e-05, + "loss": 0.737, + "step": 18234 + }, + { + "epoch": 0.2279556988924723, + "grad_norm": 3.390414237976074, + "learning_rate": 1.901902566972382e-05, + "loss": 1.5002, + "step": 18236 + }, + { + "epoch": 0.22798069951748795, + "grad_norm": 3.2020034790039062, + "learning_rate": 1.9018648686559153e-05, + "loss": 0.3388, + "step": 18238 + }, + { + "epoch": 0.22800570014250357, + "grad_norm": 2.477473735809326, + "learning_rate": 1.9018271634709727e-05, + "loss": 0.8578, + "step": 18240 + }, + { + "epoch": 0.2280307007675192, + "grad_norm": 0.7450408935546875, + "learning_rate": 1.9017894514178428e-05, + "loss": 0.4346, + "step": 18242 + }, + { + "epoch": 0.2280557013925348, + "grad_norm": 2.218045711517334, + "learning_rate": 1.9017517324968112e-05, + "loss": 0.3127, + "step": 18244 + }, + { + "epoch": 0.22808070201755043, + "grad_norm": 0.3044646382331848, + "learning_rate": 1.9017140067081666e-05, + "loss": 0.705, + "step": 18246 + }, + { + "epoch": 0.22810570264256608, + "grad_norm": 3.837545394897461, + "learning_rate": 1.9016762740521952e-05, + "loss": 1.4392, + "step": 18248 + }, + { + "epoch": 0.2281307032675817, + "grad_norm": 4.187751293182373, + "learning_rate": 1.901638534529185e-05, + "loss": 1.3285, + "step": 18250 + }, + { + "epoch": 0.22815570389259732, + "grad_norm": 1.7385298013687134, + "learning_rate": 1.9016007881394233e-05, + "loss": 0.4582, + "step": 18252 + }, + { + "epoch": 0.22818070451761294, + "grad_norm": 3.814697265625, + "learning_rate": 1.9015630348831973e-05, + "loss": 0.839, + "step": 18254 + }, + { + "epoch": 0.22820570514262856, + "grad_norm": 4.099475860595703, + "learning_rate": 1.9015252747607945e-05, + "loss": 1.8241, + "step": 18256 + }, + { + "epoch": 0.2282307057676442, + "grad_norm": 0.027267439290881157, + "learning_rate": 1.9014875077725032e-05, + "loss": 0.462, + "step": 18258 + }, + { + "epoch": 0.22825570639265982, + "grad_norm": 2.441179037094116, + "learning_rate": 1.90144973391861e-05, + "loss": 0.3578, + "step": 18260 + }, + { + "epoch": 0.22828070701767544, + "grad_norm": 0.019488006830215454, + "learning_rate": 1.9014119531994035e-05, + "loss": 1.3333, + "step": 18262 + }, + { + "epoch": 0.22830570764269106, + "grad_norm": 8.924689292907715, + "learning_rate": 1.901374165615171e-05, + "loss": 1.5867, + "step": 18264 + }, + { + "epoch": 0.22833070826770668, + "grad_norm": 0.945347011089325, + "learning_rate": 1.9013363711662004e-05, + "loss": 0.0291, + "step": 18266 + }, + { + "epoch": 0.22835570889272233, + "grad_norm": 2.161404848098755, + "learning_rate": 1.901298569852779e-05, + "loss": 0.2888, + "step": 18268 + }, + { + "epoch": 0.22838070951773795, + "grad_norm": 1.8771717548370361, + "learning_rate": 1.9012607616751957e-05, + "loss": 0.3879, + "step": 18270 + }, + { + "epoch": 0.22840571014275357, + "grad_norm": 3.874115467071533, + "learning_rate": 1.9012229466337374e-05, + "loss": 1.3385, + "step": 18272 + }, + { + "epoch": 0.2284307107677692, + "grad_norm": 4.029338836669922, + "learning_rate": 1.901185124728693e-05, + "loss": 1.457, + "step": 18274 + }, + { + "epoch": 0.2284557113927848, + "grad_norm": 0.9559961557388306, + "learning_rate": 1.90114729596035e-05, + "loss": 2.4379, + "step": 18276 + }, + { + "epoch": 0.22848071201780046, + "grad_norm": 3.3350720405578613, + "learning_rate": 1.9011094603289962e-05, + "loss": 1.3562, + "step": 18278 + }, + { + "epoch": 0.22850571264281608, + "grad_norm": 2.892920732498169, + "learning_rate": 1.901071617834921e-05, + "loss": 1.17, + "step": 18280 + }, + { + "epoch": 0.2285307132678317, + "grad_norm": 3.5102312564849854, + "learning_rate": 1.9010337684784113e-05, + "loss": 2.0216, + "step": 18282 + }, + { + "epoch": 0.22855571389284732, + "grad_norm": 4.266988277435303, + "learning_rate": 1.900995912259756e-05, + "loss": 0.6476, + "step": 18284 + }, + { + "epoch": 0.22858071451786294, + "grad_norm": 5.205684185028076, + "learning_rate": 1.900958049179243e-05, + "loss": 1.7573, + "step": 18286 + }, + { + "epoch": 0.22860571514287858, + "grad_norm": 2.715780258178711, + "learning_rate": 1.900920179237161e-05, + "loss": 0.8355, + "step": 18288 + }, + { + "epoch": 0.2286307157678942, + "grad_norm": 3.3328378200531006, + "learning_rate": 1.9008823024337984e-05, + "loss": 2.6184, + "step": 18290 + }, + { + "epoch": 0.22865571639290982, + "grad_norm": 2.6490485668182373, + "learning_rate": 1.9008444187694433e-05, + "loss": 1.2212, + "step": 18292 + }, + { + "epoch": 0.22868071701792544, + "grad_norm": 0.05169183760881424, + "learning_rate": 1.9008065282443847e-05, + "loss": 0.007, + "step": 18294 + }, + { + "epoch": 0.22870571764294106, + "grad_norm": 3.8118393421173096, + "learning_rate": 1.900768630858911e-05, + "loss": 1.9951, + "step": 18296 + }, + { + "epoch": 0.2287307182679567, + "grad_norm": 2.8440864086151123, + "learning_rate": 1.900730726613311e-05, + "loss": 0.9904, + "step": 18298 + }, + { + "epoch": 0.22875571889297233, + "grad_norm": 8.149754524230957, + "learning_rate": 1.9006928155078728e-05, + "loss": 1.1249, + "step": 18300 + }, + { + "epoch": 0.22878071951798795, + "grad_norm": 1.2005865573883057, + "learning_rate": 1.9006548975428857e-05, + "loss": 0.6429, + "step": 18302 + }, + { + "epoch": 0.22880572014300357, + "grad_norm": 1.7835513353347778, + "learning_rate": 1.900616972718638e-05, + "loss": 1.0324, + "step": 18304 + }, + { + "epoch": 0.2288307207680192, + "grad_norm": 0.5008477568626404, + "learning_rate": 1.900579041035419e-05, + "loss": 0.0164, + "step": 18306 + }, + { + "epoch": 0.22885572139303484, + "grad_norm": 5.107936859130859, + "learning_rate": 1.9005411024935176e-05, + "loss": 1.3936, + "step": 18308 + }, + { + "epoch": 0.22888072201805046, + "grad_norm": 4.515604019165039, + "learning_rate": 1.900503157093222e-05, + "loss": 1.8687, + "step": 18310 + }, + { + "epoch": 0.22890572264306608, + "grad_norm": 6.7371954917907715, + "learning_rate": 1.9004652048348224e-05, + "loss": 0.2461, + "step": 18312 + }, + { + "epoch": 0.2289307232680817, + "grad_norm": 2.742523193359375, + "learning_rate": 1.9004272457186064e-05, + "loss": 0.7359, + "step": 18314 + }, + { + "epoch": 0.22895572389309732, + "grad_norm": 2.712514877319336, + "learning_rate": 1.900389279744864e-05, + "loss": 1.0594, + "step": 18316 + }, + { + "epoch": 0.22898072451811297, + "grad_norm": 1.669000506401062, + "learning_rate": 1.9003513069138845e-05, + "loss": 1.0791, + "step": 18318 + }, + { + "epoch": 0.22900572514312859, + "grad_norm": 2.0644655227661133, + "learning_rate": 1.9003133272259565e-05, + "loss": 0.7641, + "step": 18320 + }, + { + "epoch": 0.2290307257681442, + "grad_norm": 5.072585582733154, + "learning_rate": 1.9002753406813697e-05, + "loss": 1.6331, + "step": 18322 + }, + { + "epoch": 0.22905572639315983, + "grad_norm": 1.7260500192642212, + "learning_rate": 1.900237347280413e-05, + "loss": 0.1701, + "step": 18324 + }, + { + "epoch": 0.22908072701817545, + "grad_norm": 3.821833610534668, + "learning_rate": 1.9001993470233762e-05, + "loss": 1.9374, + "step": 18326 + }, + { + "epoch": 0.2291057276431911, + "grad_norm": 1.1545562744140625, + "learning_rate": 1.9001613399105482e-05, + "loss": 0.3591, + "step": 18328 + }, + { + "epoch": 0.2291307282682067, + "grad_norm": 0.009603449143469334, + "learning_rate": 1.900123325942219e-05, + "loss": 0.2488, + "step": 18330 + }, + { + "epoch": 0.22915572889322233, + "grad_norm": 2.2177505493164062, + "learning_rate": 1.9000853051186775e-05, + "loss": 0.456, + "step": 18332 + }, + { + "epoch": 0.22918072951823795, + "grad_norm": 3.2086575031280518, + "learning_rate": 1.9000472774402138e-05, + "loss": 1.4657, + "step": 18334 + }, + { + "epoch": 0.22920573014325357, + "grad_norm": 0.2045535296201706, + "learning_rate": 1.9000092429071176e-05, + "loss": 0.468, + "step": 18336 + }, + { + "epoch": 0.22923073076826922, + "grad_norm": 2.5027425289154053, + "learning_rate": 1.8999712015196777e-05, + "loss": 0.7025, + "step": 18338 + }, + { + "epoch": 0.22925573139328484, + "grad_norm": 3.795642375946045, + "learning_rate": 1.8999331532781848e-05, + "loss": 0.8287, + "step": 18340 + }, + { + "epoch": 0.22928073201830046, + "grad_norm": 0.02360985055565834, + "learning_rate": 1.899895098182928e-05, + "loss": 0.6632, + "step": 18342 + }, + { + "epoch": 0.22930573264331608, + "grad_norm": 2.8293392658233643, + "learning_rate": 1.8998570362341978e-05, + "loss": 1.9533, + "step": 18344 + }, + { + "epoch": 0.2293307332683317, + "grad_norm": 4.265273571014404, + "learning_rate": 1.8998189674322835e-05, + "loss": 0.9059, + "step": 18346 + }, + { + "epoch": 0.22935573389334735, + "grad_norm": 3.6064352989196777, + "learning_rate": 1.899780891777475e-05, + "loss": 0.7486, + "step": 18348 + }, + { + "epoch": 0.22938073451836297, + "grad_norm": 3.5356786251068115, + "learning_rate": 1.8997428092700626e-05, + "loss": 1.6001, + "step": 18350 + }, + { + "epoch": 0.2294057351433786, + "grad_norm": 1.5432322025299072, + "learning_rate": 1.899704719910336e-05, + "loss": 1.0347, + "step": 18352 + }, + { + "epoch": 0.2294307357683942, + "grad_norm": 2.6941277980804443, + "learning_rate": 1.8996666236985857e-05, + "loss": 0.4331, + "step": 18354 + }, + { + "epoch": 0.22945573639340983, + "grad_norm": 3.117675542831421, + "learning_rate": 1.8996285206351018e-05, + "loss": 0.9484, + "step": 18356 + }, + { + "epoch": 0.22948073701842547, + "grad_norm": 0.01565909944474697, + "learning_rate": 1.899590410720174e-05, + "loss": 0.018, + "step": 18358 + }, + { + "epoch": 0.2295057376434411, + "grad_norm": 8.894926071166992, + "learning_rate": 1.8995522939540927e-05, + "loss": 1.6146, + "step": 18360 + }, + { + "epoch": 0.2295307382684567, + "grad_norm": 4.7801337242126465, + "learning_rate": 1.8995141703371486e-05, + "loss": 1.5026, + "step": 18362 + }, + { + "epoch": 0.22955573889347233, + "grad_norm": 0.018700234591960907, + "learning_rate": 1.8994760398696318e-05, + "loss": 0.0032, + "step": 18364 + }, + { + "epoch": 0.22958073951848795, + "grad_norm": 5.414190292358398, + "learning_rate": 1.899437902551833e-05, + "loss": 1.3669, + "step": 18366 + }, + { + "epoch": 0.2296057401435036, + "grad_norm": 2.2978603839874268, + "learning_rate": 1.8993997583840417e-05, + "loss": 0.734, + "step": 18368 + }, + { + "epoch": 0.22963074076851922, + "grad_norm": 2.5107624530792236, + "learning_rate": 1.899361607366549e-05, + "loss": 0.1635, + "step": 18370 + }, + { + "epoch": 0.22965574139353484, + "grad_norm": 2.1663753986358643, + "learning_rate": 1.899323449499646e-05, + "loss": 1.1474, + "step": 18372 + }, + { + "epoch": 0.22968074201855046, + "grad_norm": 2.3628180027008057, + "learning_rate": 1.8992852847836224e-05, + "loss": 1.3004, + "step": 18374 + }, + { + "epoch": 0.22970574264356608, + "grad_norm": 0.020473958924412727, + "learning_rate": 1.8992471132187696e-05, + "loss": 0.6085, + "step": 18376 + }, + { + "epoch": 0.22973074326858173, + "grad_norm": 2.0636346340179443, + "learning_rate": 1.8992089348053777e-05, + "loss": 1.1158, + "step": 18378 + }, + { + "epoch": 0.22975574389359735, + "grad_norm": 3.0287811756134033, + "learning_rate": 1.8991707495437376e-05, + "loss": 0.4473, + "step": 18380 + }, + { + "epoch": 0.22978074451861297, + "grad_norm": 3.062140703201294, + "learning_rate": 1.8991325574341407e-05, + "loss": 0.6538, + "step": 18382 + }, + { + "epoch": 0.2298057451436286, + "grad_norm": 1.7063114643096924, + "learning_rate": 1.8990943584768767e-05, + "loss": 0.9084, + "step": 18384 + }, + { + "epoch": 0.2298307457686442, + "grad_norm": 4.23002815246582, + "learning_rate": 1.8990561526722377e-05, + "loss": 0.9956, + "step": 18386 + }, + { + "epoch": 0.22985574639365985, + "grad_norm": 2.2224907875061035, + "learning_rate": 1.8990179400205143e-05, + "loss": 0.711, + "step": 18388 + }, + { + "epoch": 0.22988074701867547, + "grad_norm": 2.9824163913726807, + "learning_rate": 1.898979720521997e-05, + "loss": 0.4743, + "step": 18390 + }, + { + "epoch": 0.2299057476436911, + "grad_norm": 3.517073631286621, + "learning_rate": 1.8989414941769774e-05, + "loss": 1.3194, + "step": 18392 + }, + { + "epoch": 0.22993074826870671, + "grad_norm": 4.195837020874023, + "learning_rate": 1.8989032609857466e-05, + "loss": 1.3308, + "step": 18394 + }, + { + "epoch": 0.22995574889372233, + "grad_norm": 2.9643394947052, + "learning_rate": 1.8988650209485956e-05, + "loss": 1.5864, + "step": 18396 + }, + { + "epoch": 0.22998074951873798, + "grad_norm": 6.083552837371826, + "learning_rate": 1.8988267740658158e-05, + "loss": 0.8616, + "step": 18398 + }, + { + "epoch": 0.2300057501437536, + "grad_norm": 3.9361114501953125, + "learning_rate": 1.8987885203376982e-05, + "loss": 0.5943, + "step": 18400 + }, + { + "epoch": 0.23003075076876922, + "grad_norm": 2.561328887939453, + "learning_rate": 1.8987502597645346e-05, + "loss": 0.8898, + "step": 18402 + }, + { + "epoch": 0.23005575139378484, + "grad_norm": 4.236965656280518, + "learning_rate": 1.8987119923466156e-05, + "loss": 1.5931, + "step": 18404 + }, + { + "epoch": 0.23008075201880046, + "grad_norm": 2.6651415824890137, + "learning_rate": 1.8986737180842337e-05, + "loss": 1.0388, + "step": 18406 + }, + { + "epoch": 0.2301057526438161, + "grad_norm": 3.2643883228302, + "learning_rate": 1.8986354369776796e-05, + "loss": 1.0793, + "step": 18408 + }, + { + "epoch": 0.23013075326883173, + "grad_norm": 3.5993478298187256, + "learning_rate": 1.8985971490272453e-05, + "loss": 0.6218, + "step": 18410 + }, + { + "epoch": 0.23015575389384735, + "grad_norm": 5.15665864944458, + "learning_rate": 1.898558854233222e-05, + "loss": 0.6637, + "step": 18412 + }, + { + "epoch": 0.23018075451886297, + "grad_norm": 0.025988135486841202, + "learning_rate": 1.8985205525959016e-05, + "loss": 0.3892, + "step": 18414 + }, + { + "epoch": 0.2302057551438786, + "grad_norm": 3.9648900032043457, + "learning_rate": 1.8984822441155757e-05, + "loss": 1.2928, + "step": 18416 + }, + { + "epoch": 0.23023075576889424, + "grad_norm": 3.288621425628662, + "learning_rate": 1.898443928792536e-05, + "loss": 0.7031, + "step": 18418 + }, + { + "epoch": 0.23025575639390986, + "grad_norm": 2.901226282119751, + "learning_rate": 1.8984056066270746e-05, + "loss": 1.1918, + "step": 18420 + }, + { + "epoch": 0.23028075701892547, + "grad_norm": 3.1940901279449463, + "learning_rate": 1.898367277619483e-05, + "loss": 0.7951, + "step": 18422 + }, + { + "epoch": 0.2303057576439411, + "grad_norm": 3.1709675788879395, + "learning_rate": 1.8983289417700534e-05, + "loss": 1.251, + "step": 18424 + }, + { + "epoch": 0.23033075826895671, + "grad_norm": 4.1838178634643555, + "learning_rate": 1.8982905990790774e-05, + "loss": 1.5271, + "step": 18426 + }, + { + "epoch": 0.23035575889397236, + "grad_norm": 1.5481786727905273, + "learning_rate": 1.898252249546847e-05, + "loss": 0.4499, + "step": 18428 + }, + { + "epoch": 0.23038075951898798, + "grad_norm": 3.2462151050567627, + "learning_rate": 1.898213893173655e-05, + "loss": 1.0468, + "step": 18430 + }, + { + "epoch": 0.2304057601440036, + "grad_norm": 3.99900221824646, + "learning_rate": 1.8981755299597927e-05, + "loss": 1.4774, + "step": 18432 + }, + { + "epoch": 0.23043076076901922, + "grad_norm": 1.630207896232605, + "learning_rate": 1.8981371599055532e-05, + "loss": 0.8194, + "step": 18434 + }, + { + "epoch": 0.23045576139403484, + "grad_norm": 3.493753671646118, + "learning_rate": 1.8980987830112273e-05, + "loss": 0.9021, + "step": 18436 + }, + { + "epoch": 0.2304807620190505, + "grad_norm": 0.04228460416197777, + "learning_rate": 1.8980603992771086e-05, + "loss": 0.687, + "step": 18438 + }, + { + "epoch": 0.2305057626440661, + "grad_norm": 3.016964912414551, + "learning_rate": 1.8980220087034885e-05, + "loss": 0.26, + "step": 18440 + }, + { + "epoch": 0.23053076326908173, + "grad_norm": 3.5329020023345947, + "learning_rate": 1.89798361129066e-05, + "loss": 0.216, + "step": 18442 + }, + { + "epoch": 0.23055576389409735, + "grad_norm": 4.053007125854492, + "learning_rate": 1.897945207038915e-05, + "loss": 1.4079, + "step": 18444 + }, + { + "epoch": 0.23058076451911297, + "grad_norm": 5.7352166175842285, + "learning_rate": 1.8979067959485467e-05, + "loss": 2.0382, + "step": 18446 + }, + { + "epoch": 0.23060576514412862, + "grad_norm": 0.798366367816925, + "learning_rate": 1.8978683780198472e-05, + "loss": 0.1855, + "step": 18448 + }, + { + "epoch": 0.23063076576914424, + "grad_norm": 2.300678014755249, + "learning_rate": 1.897829953253109e-05, + "loss": 0.9055, + "step": 18450 + }, + { + "epoch": 0.23065576639415986, + "grad_norm": 7.638444900512695, + "learning_rate": 1.8977915216486247e-05, + "loss": 2.0717, + "step": 18452 + }, + { + "epoch": 0.23068076701917548, + "grad_norm": 2.4988160133361816, + "learning_rate": 1.8977530832066872e-05, + "loss": 0.6926, + "step": 18454 + }, + { + "epoch": 0.2307057676441911, + "grad_norm": 1.7773698568344116, + "learning_rate": 1.8977146379275893e-05, + "loss": 0.39, + "step": 18456 + }, + { + "epoch": 0.23073076826920674, + "grad_norm": 3.260394811630249, + "learning_rate": 1.8976761858116235e-05, + "loss": 1.2998, + "step": 18458 + }, + { + "epoch": 0.23075576889422236, + "grad_norm": 4.038083553314209, + "learning_rate": 1.8976377268590828e-05, + "loss": 1.043, + "step": 18460 + }, + { + "epoch": 0.23078076951923798, + "grad_norm": 0.6700119376182556, + "learning_rate": 1.8975992610702604e-05, + "loss": 0.0741, + "step": 18462 + }, + { + "epoch": 0.2308057701442536, + "grad_norm": 3.361823797225952, + "learning_rate": 1.8975607884454484e-05, + "loss": 0.6996, + "step": 18464 + }, + { + "epoch": 0.23083077076926922, + "grad_norm": 3.438368082046509, + "learning_rate": 1.897522308984941e-05, + "loss": 1.2785, + "step": 18466 + }, + { + "epoch": 0.23085577139428487, + "grad_norm": 3.143735647201538, + "learning_rate": 1.89748382268903e-05, + "loss": 1.3048, + "step": 18468 + }, + { + "epoch": 0.2308807720193005, + "grad_norm": 2.648712158203125, + "learning_rate": 1.8974453295580096e-05, + "loss": 1.2252, + "step": 18470 + }, + { + "epoch": 0.2309057726443161, + "grad_norm": 4.50628137588501, + "learning_rate": 1.8974068295921725e-05, + "loss": 1.8779, + "step": 18472 + }, + { + "epoch": 0.23093077326933173, + "grad_norm": 2.2336833477020264, + "learning_rate": 1.8973683227918115e-05, + "loss": 0.8259, + "step": 18474 + }, + { + "epoch": 0.23095577389434735, + "grad_norm": 3.0564863681793213, + "learning_rate": 1.8973298091572205e-05, + "loss": 1.4438, + "step": 18476 + }, + { + "epoch": 0.230980774519363, + "grad_norm": 3.3368165493011475, + "learning_rate": 1.8972912886886927e-05, + "loss": 0.7449, + "step": 18478 + }, + { + "epoch": 0.23100577514437862, + "grad_norm": 3.812657356262207, + "learning_rate": 1.897252761386521e-05, + "loss": 1.1566, + "step": 18480 + }, + { + "epoch": 0.23103077576939424, + "grad_norm": 0.00529854279011488, + "learning_rate": 1.8972142272509993e-05, + "loss": 0.7512, + "step": 18482 + }, + { + "epoch": 0.23105577639440986, + "grad_norm": 4.709118843078613, + "learning_rate": 1.897175686282421e-05, + "loss": 1.2696, + "step": 18484 + }, + { + "epoch": 0.23108077701942548, + "grad_norm": 4.1433210372924805, + "learning_rate": 1.8971371384810795e-05, + "loss": 1.0847, + "step": 18486 + }, + { + "epoch": 0.23110577764444112, + "grad_norm": 0.02733405865728855, + "learning_rate": 1.8970985838472684e-05, + "loss": 0.0667, + "step": 18488 + }, + { + "epoch": 0.23113077826945674, + "grad_norm": 8.354914665222168, + "learning_rate": 1.8970600223812814e-05, + "loss": 0.9446, + "step": 18490 + }, + { + "epoch": 0.23115577889447236, + "grad_norm": 3.202517509460449, + "learning_rate": 1.897021454083412e-05, + "loss": 0.2797, + "step": 18492 + }, + { + "epoch": 0.23118077951948798, + "grad_norm": 2.6701345443725586, + "learning_rate": 1.896982878953954e-05, + "loss": 0.9592, + "step": 18494 + }, + { + "epoch": 0.2312057801445036, + "grad_norm": 5.517007350921631, + "learning_rate": 1.8969442969932017e-05, + "loss": 1.8852, + "step": 18496 + }, + { + "epoch": 0.23123078076951925, + "grad_norm": 8.71818733215332, + "learning_rate": 1.896905708201448e-05, + "loss": 1.3488, + "step": 18498 + }, + { + "epoch": 0.23125578139453487, + "grad_norm": 2.9488768577575684, + "learning_rate": 1.8968671125789876e-05, + "loss": 0.7236, + "step": 18500 + }, + { + "epoch": 0.2312807820195505, + "grad_norm": 3.6654624938964844, + "learning_rate": 1.8968285101261137e-05, + "loss": 1.9214, + "step": 18502 + }, + { + "epoch": 0.2313057826445661, + "grad_norm": 0.11433552205562592, + "learning_rate": 1.896789900843121e-05, + "loss": 0.0947, + "step": 18504 + }, + { + "epoch": 0.23133078326958173, + "grad_norm": 10.640653610229492, + "learning_rate": 1.8967512847303033e-05, + "loss": 0.7162, + "step": 18506 + }, + { + "epoch": 0.23135578389459738, + "grad_norm": 4.497223377227783, + "learning_rate": 1.896712661787954e-05, + "loss": 1.0626, + "step": 18508 + }, + { + "epoch": 0.231380784519613, + "grad_norm": 1.938721776008606, + "learning_rate": 1.8966740320163687e-05, + "loss": 0.0668, + "step": 18510 + }, + { + "epoch": 0.23140578514462862, + "grad_norm": 1.438867449760437, + "learning_rate": 1.8966353954158403e-05, + "loss": 0.9652, + "step": 18512 + }, + { + "epoch": 0.23143078576964424, + "grad_norm": 2.035200834274292, + "learning_rate": 1.8965967519866635e-05, + "loss": 0.6273, + "step": 18514 + }, + { + "epoch": 0.23145578639465986, + "grad_norm": 1.8163341283798218, + "learning_rate": 1.896558101729133e-05, + "loss": 0.6027, + "step": 18516 + }, + { + "epoch": 0.2314807870196755, + "grad_norm": 3.3249032497406006, + "learning_rate": 1.8965194446435423e-05, + "loss": 0.4749, + "step": 18518 + }, + { + "epoch": 0.23150578764469112, + "grad_norm": 4.020758628845215, + "learning_rate": 1.8964807807301865e-05, + "loss": 0.6361, + "step": 18520 + }, + { + "epoch": 0.23153078826970674, + "grad_norm": 1.9070512056350708, + "learning_rate": 1.89644210998936e-05, + "loss": 0.1631, + "step": 18522 + }, + { + "epoch": 0.23155578889472236, + "grad_norm": 4.883615970611572, + "learning_rate": 1.896403432421357e-05, + "loss": 1.4922, + "step": 18524 + }, + { + "epoch": 0.23158078951973798, + "grad_norm": 1.1117531061172485, + "learning_rate": 1.896364748026472e-05, + "loss": 0.2295, + "step": 18526 + }, + { + "epoch": 0.23160579014475363, + "grad_norm": 8.280014991760254, + "learning_rate": 1.8963260568050007e-05, + "loss": 0.9893, + "step": 18528 + }, + { + "epoch": 0.23163079076976925, + "grad_norm": 3.3662428855895996, + "learning_rate": 1.896287358757236e-05, + "loss": 0.9147, + "step": 18530 + }, + { + "epoch": 0.23165579139478487, + "grad_norm": 5.95646333694458, + "learning_rate": 1.896248653883474e-05, + "loss": 2.5797, + "step": 18532 + }, + { + "epoch": 0.2316807920198005, + "grad_norm": 3.974640130996704, + "learning_rate": 1.8962099421840087e-05, + "loss": 1.6414, + "step": 18534 + }, + { + "epoch": 0.2317057926448161, + "grad_norm": 2.4409472942352295, + "learning_rate": 1.8961712236591354e-05, + "loss": 1.8865, + "step": 18536 + }, + { + "epoch": 0.23173079326983176, + "grad_norm": 0.1790550947189331, + "learning_rate": 1.8961324983091484e-05, + "loss": 0.394, + "step": 18538 + }, + { + "epoch": 0.23175579389484738, + "grad_norm": 2.1321136951446533, + "learning_rate": 1.8960937661343438e-05, + "loss": 0.5209, + "step": 18540 + }, + { + "epoch": 0.231780794519863, + "grad_norm": 2.605257987976074, + "learning_rate": 1.896055027135015e-05, + "loss": 1.5295, + "step": 18542 + }, + { + "epoch": 0.23180579514487862, + "grad_norm": 0.5665909647941589, + "learning_rate": 1.896016281311458e-05, + "loss": 0.6228, + "step": 18544 + }, + { + "epoch": 0.23183079576989424, + "grad_norm": 3.5372798442840576, + "learning_rate": 1.8959775286639682e-05, + "loss": 0.5631, + "step": 18546 + }, + { + "epoch": 0.23185579639490989, + "grad_norm": 0.02537567913532257, + "learning_rate": 1.89593876919284e-05, + "loss": 0.4345, + "step": 18548 + }, + { + "epoch": 0.2318807970199255, + "grad_norm": 3.4851818084716797, + "learning_rate": 1.8959000028983685e-05, + "loss": 1.1272, + "step": 18550 + }, + { + "epoch": 0.23190579764494113, + "grad_norm": 0.023330209776759148, + "learning_rate": 1.8958612297808495e-05, + "loss": 0.4546, + "step": 18552 + }, + { + "epoch": 0.23193079826995674, + "grad_norm": 3.382211208343506, + "learning_rate": 1.895822449840578e-05, + "loss": 0.8775, + "step": 18554 + }, + { + "epoch": 0.23195579889497236, + "grad_norm": 3.4510440826416016, + "learning_rate": 1.8957836630778495e-05, + "loss": 1.1271, + "step": 18556 + }, + { + "epoch": 0.231980799519988, + "grad_norm": 3.3472914695739746, + "learning_rate": 1.8957448694929595e-05, + "loss": 0.9427, + "step": 18558 + }, + { + "epoch": 0.23200580014500363, + "grad_norm": 3.172685384750366, + "learning_rate": 1.895706069086203e-05, + "loss": 0.8734, + "step": 18560 + }, + { + "epoch": 0.23203080077001925, + "grad_norm": 3.9704675674438477, + "learning_rate": 1.8956672618578757e-05, + "loss": 1.2882, + "step": 18562 + }, + { + "epoch": 0.23205580139503487, + "grad_norm": 2.6521987915039062, + "learning_rate": 1.8956284478082732e-05, + "loss": 1.3112, + "step": 18564 + }, + { + "epoch": 0.2320808020200505, + "grad_norm": 2.5440664291381836, + "learning_rate": 1.8955896269376913e-05, + "loss": 0.6478, + "step": 18566 + }, + { + "epoch": 0.23210580264506614, + "grad_norm": 3.131580114364624, + "learning_rate": 1.8955507992464253e-05, + "loss": 1.3276, + "step": 18568 + }, + { + "epoch": 0.23213080327008176, + "grad_norm": 2.0952541828155518, + "learning_rate": 1.8955119647347708e-05, + "loss": 1.4459, + "step": 18570 + }, + { + "epoch": 0.23215580389509738, + "grad_norm": 0.07083018869161606, + "learning_rate": 1.8954731234030245e-05, + "loss": 0.7241, + "step": 18572 + }, + { + "epoch": 0.232180804520113, + "grad_norm": 0.022993460297584534, + "learning_rate": 1.8954342752514808e-05, + "loss": 0.0009, + "step": 18574 + }, + { + "epoch": 0.23220580514512862, + "grad_norm": 1.7876198291778564, + "learning_rate": 1.8953954202804366e-05, + "loss": 1.7844, + "step": 18576 + }, + { + "epoch": 0.23223080577014427, + "grad_norm": 7.258236408233643, + "learning_rate": 1.8953565584901873e-05, + "loss": 2.7797, + "step": 18578 + }, + { + "epoch": 0.2322558063951599, + "grad_norm": 0.5486904978752136, + "learning_rate": 1.8953176898810294e-05, + "loss": 0.0116, + "step": 18580 + }, + { + "epoch": 0.2322808070201755, + "grad_norm": 0.026272272691130638, + "learning_rate": 1.8952788144532586e-05, + "loss": 0.8493, + "step": 18582 + }, + { + "epoch": 0.23230580764519113, + "grad_norm": 0.024708181619644165, + "learning_rate": 1.8952399322071707e-05, + "loss": 0.3173, + "step": 18584 + }, + { + "epoch": 0.23233080827020675, + "grad_norm": 4.291469097137451, + "learning_rate": 1.8952010431430622e-05, + "loss": 1.2569, + "step": 18586 + }, + { + "epoch": 0.2323558088952224, + "grad_norm": 0.015804948285222054, + "learning_rate": 1.895162147261229e-05, + "loss": 0.5601, + "step": 18588 + }, + { + "epoch": 0.232380809520238, + "grad_norm": 5.165743827819824, + "learning_rate": 1.8951232445619676e-05, + "loss": 0.9842, + "step": 18590 + }, + { + "epoch": 0.23240581014525363, + "grad_norm": 2.8122761249542236, + "learning_rate": 1.895084335045574e-05, + "loss": 1.1107, + "step": 18592 + }, + { + "epoch": 0.23243081077026925, + "grad_norm": 0.016924824565649033, + "learning_rate": 1.8950454187123447e-05, + "loss": 0.1562, + "step": 18594 + }, + { + "epoch": 0.23245581139528487, + "grad_norm": 1.2025984525680542, + "learning_rate": 1.8950064955625764e-05, + "loss": 0.1164, + "step": 18596 + }, + { + "epoch": 0.23248081202030052, + "grad_norm": 5.986151695251465, + "learning_rate": 1.894967565596565e-05, + "loss": 0.6703, + "step": 18598 + }, + { + "epoch": 0.23250581264531614, + "grad_norm": 3.714296817779541, + "learning_rate": 1.894928628814607e-05, + "loss": 1.034, + "step": 18600 + }, + { + "epoch": 0.23253081327033176, + "grad_norm": 3.8225769996643066, + "learning_rate": 1.8948896852169992e-05, + "loss": 1.785, + "step": 18602 + }, + { + "epoch": 0.23255581389534738, + "grad_norm": 3.852842330932617, + "learning_rate": 1.8948507348040382e-05, + "loss": 1.2166, + "step": 18604 + }, + { + "epoch": 0.232580814520363, + "grad_norm": 2.474165916442871, + "learning_rate": 1.8948117775760206e-05, + "loss": 0.4454, + "step": 18606 + }, + { + "epoch": 0.23260581514537865, + "grad_norm": 2.786038398742676, + "learning_rate": 1.8947728135332427e-05, + "loss": 0.7272, + "step": 18608 + }, + { + "epoch": 0.23263081577039427, + "grad_norm": 4.085448741912842, + "learning_rate": 1.894733842676002e-05, + "loss": 0.7467, + "step": 18610 + }, + { + "epoch": 0.2326558163954099, + "grad_norm": 0.02573258802294731, + "learning_rate": 1.8946948650045946e-05, + "loss": 1.1615, + "step": 18612 + }, + { + "epoch": 0.2326808170204255, + "grad_norm": 3.2158303260803223, + "learning_rate": 1.894655880519318e-05, + "loss": 0.6194, + "step": 18614 + }, + { + "epoch": 0.23270581764544113, + "grad_norm": 0.010296832770109177, + "learning_rate": 1.8946168892204683e-05, + "loss": 1.5821, + "step": 18616 + }, + { + "epoch": 0.23273081827045677, + "grad_norm": 0.6378875374794006, + "learning_rate": 1.894577891108343e-05, + "loss": 0.6377, + "step": 18618 + }, + { + "epoch": 0.2327558188954724, + "grad_norm": 3.1944174766540527, + "learning_rate": 1.8945388861832392e-05, + "loss": 1.157, + "step": 18620 + }, + { + "epoch": 0.232780819520488, + "grad_norm": 2.9606637954711914, + "learning_rate": 1.8944998744454537e-05, + "loss": 0.2027, + "step": 18622 + }, + { + "epoch": 0.23280582014550363, + "grad_norm": 2.422356605529785, + "learning_rate": 1.8944608558952834e-05, + "loss": 1.1933, + "step": 18624 + }, + { + "epoch": 0.23283082077051925, + "grad_norm": 2.510641098022461, + "learning_rate": 1.894421830533026e-05, + "loss": 0.7093, + "step": 18626 + }, + { + "epoch": 0.2328558213955349, + "grad_norm": 4.444851875305176, + "learning_rate": 1.8943827983589783e-05, + "loss": 0.6427, + "step": 18628 + }, + { + "epoch": 0.23288082202055052, + "grad_norm": 2.6495444774627686, + "learning_rate": 1.8943437593734375e-05, + "loss": 0.8851, + "step": 18630 + }, + { + "epoch": 0.23290582264556614, + "grad_norm": 4.686295032501221, + "learning_rate": 1.8943047135767015e-05, + "loss": 1.1312, + "step": 18632 + }, + { + "epoch": 0.23293082327058176, + "grad_norm": 3.8295094966888428, + "learning_rate": 1.8942656609690667e-05, + "loss": 0.8296, + "step": 18634 + }, + { + "epoch": 0.23295582389559738, + "grad_norm": 5.636688709259033, + "learning_rate": 1.8942266015508316e-05, + "loss": 0.7647, + "step": 18636 + }, + { + "epoch": 0.23298082452061303, + "grad_norm": 2.8819003105163574, + "learning_rate": 1.8941875353222928e-05, + "loss": 1.0988, + "step": 18638 + }, + { + "epoch": 0.23300582514562865, + "grad_norm": 3.8404359817504883, + "learning_rate": 1.8941484622837483e-05, + "loss": 0.4511, + "step": 18640 + }, + { + "epoch": 0.23303082577064427, + "grad_norm": 3.546358108520508, + "learning_rate": 1.8941093824354956e-05, + "loss": 1.1697, + "step": 18642 + }, + { + "epoch": 0.2330558263956599, + "grad_norm": 1.655881643295288, + "learning_rate": 1.8940702957778323e-05, + "loss": 0.6327, + "step": 18644 + }, + { + "epoch": 0.2330808270206755, + "grad_norm": 0.4361773431301117, + "learning_rate": 1.894031202311056e-05, + "loss": 0.0084, + "step": 18646 + }, + { + "epoch": 0.23310582764569115, + "grad_norm": 3.814911365509033, + "learning_rate": 1.8939921020354644e-05, + "loss": 1.2807, + "step": 18648 + }, + { + "epoch": 0.23313082827070677, + "grad_norm": 2.6725287437438965, + "learning_rate": 1.893952994951356e-05, + "loss": 0.9727, + "step": 18650 + }, + { + "epoch": 0.2331558288957224, + "grad_norm": 2.875047206878662, + "learning_rate": 1.8939138810590273e-05, + "loss": 0.2012, + "step": 18652 + }, + { + "epoch": 0.23318082952073801, + "grad_norm": 0.009080578573048115, + "learning_rate": 1.893874760358777e-05, + "loss": 0.8123, + "step": 18654 + }, + { + "epoch": 0.23320583014575363, + "grad_norm": 0.9195566177368164, + "learning_rate": 1.893835632850903e-05, + "loss": 1.5111, + "step": 18656 + }, + { + "epoch": 0.23323083077076928, + "grad_norm": 8.481976509094238, + "learning_rate": 1.8937964985357035e-05, + "loss": 1.3882, + "step": 18658 + }, + { + "epoch": 0.2332558313957849, + "grad_norm": 3.472907304763794, + "learning_rate": 1.8937573574134757e-05, + "loss": 1.4808, + "step": 18660 + }, + { + "epoch": 0.23328083202080052, + "grad_norm": 0.9523957967758179, + "learning_rate": 1.893718209484519e-05, + "loss": 0.9362, + "step": 18662 + }, + { + "epoch": 0.23330583264581614, + "grad_norm": 6.355681419372559, + "learning_rate": 1.8936790547491303e-05, + "loss": 1.2728, + "step": 18664 + }, + { + "epoch": 0.23333083327083176, + "grad_norm": 0.00841685850173235, + "learning_rate": 1.8936398932076083e-05, + "loss": 0.6913, + "step": 18666 + }, + { + "epoch": 0.2333558338958474, + "grad_norm": 1.0909138917922974, + "learning_rate": 1.8936007248602512e-05, + "loss": 0.6774, + "step": 18668 + }, + { + "epoch": 0.23338083452086303, + "grad_norm": 0.012569728307425976, + "learning_rate": 1.8935615497073577e-05, + "loss": 0.5529, + "step": 18670 + }, + { + "epoch": 0.23340583514587865, + "grad_norm": 3.9801931381225586, + "learning_rate": 1.8935223677492253e-05, + "loss": 0.3382, + "step": 18672 + }, + { + "epoch": 0.23343083577089427, + "grad_norm": 4.9206438064575195, + "learning_rate": 1.8934831789861532e-05, + "loss": 1.4523, + "step": 18674 + }, + { + "epoch": 0.2334558363959099, + "grad_norm": 2.5884106159210205, + "learning_rate": 1.8934439834184395e-05, + "loss": 2.0476, + "step": 18676 + }, + { + "epoch": 0.23348083702092554, + "grad_norm": 3.8861501216888428, + "learning_rate": 1.893404781046383e-05, + "loss": 1.982, + "step": 18678 + }, + { + "epoch": 0.23350583764594116, + "grad_norm": 6.654012203216553, + "learning_rate": 1.8933655718702816e-05, + "loss": 1.7108, + "step": 18680 + }, + { + "epoch": 0.23353083827095678, + "grad_norm": 1.8208574056625366, + "learning_rate": 1.8933263558904346e-05, + "loss": 0.4738, + "step": 18682 + }, + { + "epoch": 0.2335558388959724, + "grad_norm": 7.151173114776611, + "learning_rate": 1.8932871331071406e-05, + "loss": 0.8233, + "step": 18684 + }, + { + "epoch": 0.23358083952098802, + "grad_norm": 2.0048677921295166, + "learning_rate": 1.8932479035206977e-05, + "loss": 0.3161, + "step": 18686 + }, + { + "epoch": 0.23360584014600366, + "grad_norm": 3.800629138946533, + "learning_rate": 1.893208667131405e-05, + "loss": 0.7107, + "step": 18688 + }, + { + "epoch": 0.23363084077101928, + "grad_norm": 10.232025146484375, + "learning_rate": 1.8931694239395618e-05, + "loss": 1.8676, + "step": 18690 + }, + { + "epoch": 0.2336558413960349, + "grad_norm": 5.6110334396362305, + "learning_rate": 1.8931301739454663e-05, + "loss": 0.3967, + "step": 18692 + }, + { + "epoch": 0.23368084202105052, + "grad_norm": 1.9221110343933105, + "learning_rate": 1.893090917149418e-05, + "loss": 0.8495, + "step": 18694 + }, + { + "epoch": 0.23370584264606614, + "grad_norm": 3.746349334716797, + "learning_rate": 1.8930516535517155e-05, + "loss": 1.4141, + "step": 18696 + }, + { + "epoch": 0.2337308432710818, + "grad_norm": 0.013751902617514133, + "learning_rate": 1.8930123831526576e-05, + "loss": 0.4381, + "step": 18698 + }, + { + "epoch": 0.2337558438960974, + "grad_norm": 3.410621166229248, + "learning_rate": 1.8929731059525438e-05, + "loss": 1.3055, + "step": 18700 + }, + { + "epoch": 0.23378084452111303, + "grad_norm": 4.390133857727051, + "learning_rate": 1.8929338219516732e-05, + "loss": 0.6597, + "step": 18702 + }, + { + "epoch": 0.23380584514612865, + "grad_norm": 2.543344020843506, + "learning_rate": 1.892894531150345e-05, + "loss": 1.36, + "step": 18704 + }, + { + "epoch": 0.23383084577114427, + "grad_norm": 3.4765470027923584, + "learning_rate": 1.892855233548858e-05, + "loss": 1.8128, + "step": 18706 + }, + { + "epoch": 0.23385584639615992, + "grad_norm": 2.618479013442993, + "learning_rate": 1.892815929147512e-05, + "loss": 1.2915, + "step": 18708 + }, + { + "epoch": 0.23388084702117554, + "grad_norm": 8.442605018615723, + "learning_rate": 1.892776617946606e-05, + "loss": 1.0266, + "step": 18710 + }, + { + "epoch": 0.23390584764619116, + "grad_norm": 6.059683322906494, + "learning_rate": 1.8927372999464398e-05, + "loss": 2.0411, + "step": 18712 + }, + { + "epoch": 0.23393084827120678, + "grad_norm": 3.268740653991699, + "learning_rate": 1.8926979751473126e-05, + "loss": 0.5157, + "step": 18714 + }, + { + "epoch": 0.2339558488962224, + "grad_norm": 2.1265270709991455, + "learning_rate": 1.8926586435495235e-05, + "loss": 0.6953, + "step": 18716 + }, + { + "epoch": 0.23398084952123804, + "grad_norm": 5.326000213623047, + "learning_rate": 1.8926193051533724e-05, + "loss": 0.8465, + "step": 18718 + }, + { + "epoch": 0.23400585014625366, + "grad_norm": 4.20834493637085, + "learning_rate": 1.8925799599591592e-05, + "loss": 1.9896, + "step": 18720 + }, + { + "epoch": 0.23403085077126928, + "grad_norm": 5.046466827392578, + "learning_rate": 1.892540607967183e-05, + "loss": 1.1456, + "step": 18722 + }, + { + "epoch": 0.2340558513962849, + "grad_norm": 0.010568000376224518, + "learning_rate": 1.892501249177744e-05, + "loss": 0.221, + "step": 18724 + }, + { + "epoch": 0.23408085202130052, + "grad_norm": 0.2777460217475891, + "learning_rate": 1.8924618835911416e-05, + "loss": 0.3739, + "step": 18726 + }, + { + "epoch": 0.23410585264631617, + "grad_norm": 3.15914249420166, + "learning_rate": 1.8924225112076756e-05, + "loss": 2.1448, + "step": 18728 + }, + { + "epoch": 0.2341308532713318, + "grad_norm": 0.005534920375794172, + "learning_rate": 1.892383132027646e-05, + "loss": 0.0524, + "step": 18730 + }, + { + "epoch": 0.2341558538963474, + "grad_norm": 0.812991201877594, + "learning_rate": 1.8923437460513528e-05, + "loss": 0.2224, + "step": 18732 + }, + { + "epoch": 0.23418085452136303, + "grad_norm": 0.23627327382564545, + "learning_rate": 1.8923043532790957e-05, + "loss": 0.431, + "step": 18734 + }, + { + "epoch": 0.23420585514637865, + "grad_norm": 4.678743362426758, + "learning_rate": 1.892264953711175e-05, + "loss": 1.3185, + "step": 18736 + }, + { + "epoch": 0.2342308557713943, + "grad_norm": 0.12061230838298798, + "learning_rate": 1.8922255473478902e-05, + "loss": 0.1711, + "step": 18738 + }, + { + "epoch": 0.23425585639640992, + "grad_norm": 5.792442321777344, + "learning_rate": 1.892186134189542e-05, + "loss": 1.2953, + "step": 18740 + }, + { + "epoch": 0.23428085702142554, + "grad_norm": 1.8689117431640625, + "learning_rate": 1.8921467142364305e-05, + "loss": 0.2895, + "step": 18742 + }, + { + "epoch": 0.23430585764644116, + "grad_norm": 4.664459705352783, + "learning_rate": 1.8921072874888555e-05, + "loss": 1.6687, + "step": 18744 + }, + { + "epoch": 0.23433085827145678, + "grad_norm": 4.634392738342285, + "learning_rate": 1.8920678539471177e-05, + "loss": 0.8577, + "step": 18746 + }, + { + "epoch": 0.23435585889647242, + "grad_norm": 0.019844969734549522, + "learning_rate": 1.8920284136115174e-05, + "loss": 0.5002, + "step": 18748 + }, + { + "epoch": 0.23438085952148804, + "grad_norm": 2.7584967613220215, + "learning_rate": 1.8919889664823548e-05, + "loss": 0.8375, + "step": 18750 + }, + { + "epoch": 0.23440586014650366, + "grad_norm": 2.530102491378784, + "learning_rate": 1.89194951255993e-05, + "loss": 1.7155, + "step": 18752 + }, + { + "epoch": 0.23443086077151928, + "grad_norm": 0.8037441372871399, + "learning_rate": 1.8919100518445445e-05, + "loss": 0.3355, + "step": 18754 + }, + { + "epoch": 0.2344558613965349, + "grad_norm": 3.823704719543457, + "learning_rate": 1.891870584336498e-05, + "loss": 2.0561, + "step": 18756 + }, + { + "epoch": 0.23448086202155055, + "grad_norm": 3.344420909881592, + "learning_rate": 1.891831110036091e-05, + "loss": 1.4547, + "step": 18758 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.6195327043533325, + "learning_rate": 1.891791628943624e-05, + "loss": 0.0338, + "step": 18760 + }, + { + "epoch": 0.2345308632715818, + "grad_norm": 2.1463100910186768, + "learning_rate": 1.8917521410593987e-05, + "loss": 0.8193, + "step": 18762 + }, + { + "epoch": 0.2345558638965974, + "grad_norm": 1.7845643758773804, + "learning_rate": 1.8917126463837153e-05, + "loss": 1.9398, + "step": 18764 + }, + { + "epoch": 0.23458086452161303, + "grad_norm": 2.7152819633483887, + "learning_rate": 1.891673144916874e-05, + "loss": 1.8671, + "step": 18766 + }, + { + "epoch": 0.23460586514662868, + "grad_norm": 3.9961278438568115, + "learning_rate": 1.8916336366591762e-05, + "loss": 0.6619, + "step": 18768 + }, + { + "epoch": 0.2346308657716443, + "grad_norm": 7.8487348556518555, + "learning_rate": 1.8915941216109227e-05, + "loss": 1.0855, + "step": 18770 + }, + { + "epoch": 0.23465586639665992, + "grad_norm": 3.295722246170044, + "learning_rate": 1.8915545997724146e-05, + "loss": 0.2961, + "step": 18772 + }, + { + "epoch": 0.23468086702167554, + "grad_norm": 4.274766445159912, + "learning_rate": 1.891515071143953e-05, + "loss": 1.426, + "step": 18774 + }, + { + "epoch": 0.23470586764669116, + "grad_norm": 2.748072624206543, + "learning_rate": 1.8914755357258383e-05, + "loss": 0.8064, + "step": 18776 + }, + { + "epoch": 0.2347308682717068, + "grad_norm": 2.7859227657318115, + "learning_rate": 1.891435993518372e-05, + "loss": 0.6779, + "step": 18778 + }, + { + "epoch": 0.23475586889672242, + "grad_norm": 2.4360649585723877, + "learning_rate": 1.891396444521856e-05, + "loss": 1.3575, + "step": 18780 + }, + { + "epoch": 0.23478086952173804, + "grad_norm": 4.928075790405273, + "learning_rate": 1.89135688873659e-05, + "loss": 1.3879, + "step": 18782 + }, + { + "epoch": 0.23480587014675366, + "grad_norm": 3.5685622692108154, + "learning_rate": 1.891317326162876e-05, + "loss": 1.188, + "step": 18784 + }, + { + "epoch": 0.23483087077176928, + "grad_norm": 1.1831567287445068, + "learning_rate": 1.8912777568010154e-05, + "loss": 0.6963, + "step": 18786 + }, + { + "epoch": 0.23485587139678493, + "grad_norm": 5.106257915496826, + "learning_rate": 1.891238180651309e-05, + "loss": 1.0944, + "step": 18788 + }, + { + "epoch": 0.23488087202180055, + "grad_norm": 2.208766222000122, + "learning_rate": 1.8911985977140593e-05, + "loss": 0.8677, + "step": 18790 + }, + { + "epoch": 0.23490587264681617, + "grad_norm": 1.867799162864685, + "learning_rate": 1.8911590079895667e-05, + "loss": 0.4072, + "step": 18792 + }, + { + "epoch": 0.2349308732718318, + "grad_norm": 6.150773525238037, + "learning_rate": 1.8911194114781333e-05, + "loss": 1.3557, + "step": 18794 + }, + { + "epoch": 0.2349558738968474, + "grad_norm": 4.871220111846924, + "learning_rate": 1.8910798081800604e-05, + "loss": 0.7716, + "step": 18796 + }, + { + "epoch": 0.23498087452186306, + "grad_norm": 8.697028160095215, + "learning_rate": 1.89104019809565e-05, + "loss": 0.7558, + "step": 18798 + }, + { + "epoch": 0.23500587514687868, + "grad_norm": 0.019329603761434555, + "learning_rate": 1.891000581225203e-05, + "loss": 0.2668, + "step": 18800 + }, + { + "epoch": 0.2350308757718943, + "grad_norm": 2.9187753200531006, + "learning_rate": 1.890960957569022e-05, + "loss": 0.7685, + "step": 18802 + }, + { + "epoch": 0.23505587639690992, + "grad_norm": 4.602449417114258, + "learning_rate": 1.890921327127408e-05, + "loss": 2.043, + "step": 18804 + }, + { + "epoch": 0.23508087702192554, + "grad_norm": 1.541910171508789, + "learning_rate": 1.8908816899006633e-05, + "loss": 1.4054, + "step": 18806 + }, + { + "epoch": 0.23510587764694119, + "grad_norm": 0.01233743317425251, + "learning_rate": 1.8908420458890895e-05, + "loss": 0.3027, + "step": 18808 + }, + { + "epoch": 0.2351308782719568, + "grad_norm": 4.274483680725098, + "learning_rate": 1.890802395092989e-05, + "loss": 1.7412, + "step": 18810 + }, + { + "epoch": 0.23515587889697243, + "grad_norm": 6.1918206214904785, + "learning_rate": 1.890762737512663e-05, + "loss": 0.9999, + "step": 18812 + }, + { + "epoch": 0.23518087952198805, + "grad_norm": 3.327326536178589, + "learning_rate": 1.890723073148414e-05, + "loss": 0.4822, + "step": 18814 + }, + { + "epoch": 0.23520588014700367, + "grad_norm": 6.723355293273926, + "learning_rate": 1.8906834020005444e-05, + "loss": 0.6365, + "step": 18816 + }, + { + "epoch": 0.2352308807720193, + "grad_norm": 4.197898864746094, + "learning_rate": 1.8906437240693557e-05, + "loss": 1.5113, + "step": 18818 + }, + { + "epoch": 0.23525588139703493, + "grad_norm": 6.083738803863525, + "learning_rate": 1.8906040393551505e-05, + "loss": 1.5159, + "step": 18820 + }, + { + "epoch": 0.23528088202205055, + "grad_norm": 0.004759168718010187, + "learning_rate": 1.8905643478582307e-05, + "loss": 0.7034, + "step": 18822 + }, + { + "epoch": 0.23530588264706617, + "grad_norm": 2.9124038219451904, + "learning_rate": 1.890524649578899e-05, + "loss": 1.7, + "step": 18824 + }, + { + "epoch": 0.2353308832720818, + "grad_norm": 5.619978904724121, + "learning_rate": 1.890484944517457e-05, + "loss": 0.7572, + "step": 18826 + }, + { + "epoch": 0.23535588389709744, + "grad_norm": 4.6929216384887695, + "learning_rate": 1.890445232674208e-05, + "loss": 1.9083, + "step": 18828 + }, + { + "epoch": 0.23538088452211306, + "grad_norm": 0.5664227604866028, + "learning_rate": 1.890405514049454e-05, + "loss": 0.9532, + "step": 18830 + }, + { + "epoch": 0.23540588514712868, + "grad_norm": 0.007938547991216183, + "learning_rate": 1.8903657886434974e-05, + "loss": 0.0998, + "step": 18832 + }, + { + "epoch": 0.2354308857721443, + "grad_norm": 0.008921888656914234, + "learning_rate": 1.890326056456641e-05, + "loss": 0.0002, + "step": 18834 + }, + { + "epoch": 0.23545588639715992, + "grad_norm": 3.3156235218048096, + "learning_rate": 1.8902863174891874e-05, + "loss": 1.2457, + "step": 18836 + }, + { + "epoch": 0.23548088702217557, + "grad_norm": 7.112497329711914, + "learning_rate": 1.8902465717414387e-05, + "loss": 2.0856, + "step": 18838 + }, + { + "epoch": 0.2355058876471912, + "grad_norm": 4.144747257232666, + "learning_rate": 1.8902068192136982e-05, + "loss": 0.7603, + "step": 18840 + }, + { + "epoch": 0.2355308882722068, + "grad_norm": 7.6358184814453125, + "learning_rate": 1.8901670599062683e-05, + "loss": 0.0754, + "step": 18842 + }, + { + "epoch": 0.23555588889722243, + "grad_norm": 3.0813419818878174, + "learning_rate": 1.890127293819452e-05, + "loss": 1.5849, + "step": 18844 + }, + { + "epoch": 0.23558088952223805, + "grad_norm": 2.1288864612579346, + "learning_rate": 1.8900875209535523e-05, + "loss": 1.1014, + "step": 18846 + }, + { + "epoch": 0.2356058901472537, + "grad_norm": 3.9587340354919434, + "learning_rate": 1.890047741308872e-05, + "loss": 1.2783, + "step": 18848 + }, + { + "epoch": 0.2356308907722693, + "grad_norm": 0.01299529429525137, + "learning_rate": 1.8900079548857138e-05, + "loss": 0.1225, + "step": 18850 + }, + { + "epoch": 0.23565589139728493, + "grad_norm": 3.271174669265747, + "learning_rate": 1.8899681616843808e-05, + "loss": 1.4914, + "step": 18852 + }, + { + "epoch": 0.23568089202230055, + "grad_norm": 3.1504576206207275, + "learning_rate": 1.889928361705176e-05, + "loss": 0.354, + "step": 18854 + }, + { + "epoch": 0.23570589264731617, + "grad_norm": 5.299320220947266, + "learning_rate": 1.889888554948403e-05, + "loss": 0.564, + "step": 18856 + }, + { + "epoch": 0.23573089327233182, + "grad_norm": 8.90093994140625, + "learning_rate": 1.8898487414143644e-05, + "loss": 1.8144, + "step": 18858 + }, + { + "epoch": 0.23575589389734744, + "grad_norm": 3.5594513416290283, + "learning_rate": 1.8898089211033635e-05, + "loss": 0.6668, + "step": 18860 + }, + { + "epoch": 0.23578089452236306, + "grad_norm": 8.519227027893066, + "learning_rate": 1.889769094015704e-05, + "loss": 0.2164, + "step": 18862 + }, + { + "epoch": 0.23580589514737868, + "grad_norm": 0.011362089775502682, + "learning_rate": 1.8897292601516888e-05, + "loss": 0.6506, + "step": 18864 + }, + { + "epoch": 0.2358308957723943, + "grad_norm": 3.715770721435547, + "learning_rate": 1.8896894195116213e-05, + "loss": 0.9355, + "step": 18866 + }, + { + "epoch": 0.23585589639740995, + "grad_norm": 2.079930067062378, + "learning_rate": 1.889649572095805e-05, + "loss": 0.3792, + "step": 18868 + }, + { + "epoch": 0.23588089702242557, + "grad_norm": 3.617842197418213, + "learning_rate": 1.8896097179045433e-05, + "loss": 1.1739, + "step": 18870 + }, + { + "epoch": 0.2359058976474412, + "grad_norm": 0.9551558494567871, + "learning_rate": 1.88956985693814e-05, + "loss": 1.4991, + "step": 18872 + }, + { + "epoch": 0.2359308982724568, + "grad_norm": 0.004643397871404886, + "learning_rate": 1.8895299891968983e-05, + "loss": 1.0592, + "step": 18874 + }, + { + "epoch": 0.23595589889747243, + "grad_norm": 1.5738950967788696, + "learning_rate": 1.889490114681122e-05, + "loss": 0.1058, + "step": 18876 + }, + { + "epoch": 0.23598089952248807, + "grad_norm": 3.719895362854004, + "learning_rate": 1.8894502333911147e-05, + "loss": 1.6014, + "step": 18878 + }, + { + "epoch": 0.2360059001475037, + "grad_norm": 2.9751882553100586, + "learning_rate": 1.8894103453271807e-05, + "loss": 0.8258, + "step": 18880 + }, + { + "epoch": 0.23603090077251931, + "grad_norm": 5.911384582519531, + "learning_rate": 1.8893704504896227e-05, + "loss": 1.2921, + "step": 18882 + }, + { + "epoch": 0.23605590139753493, + "grad_norm": 2.9397099018096924, + "learning_rate": 1.8893305488787455e-05, + "loss": 0.7651, + "step": 18884 + }, + { + "epoch": 0.23608090202255055, + "grad_norm": 4.3372626304626465, + "learning_rate": 1.8892906404948522e-05, + "loss": 1.6212, + "step": 18886 + }, + { + "epoch": 0.2361059026475662, + "grad_norm": 3.595733165740967, + "learning_rate": 1.8892507253382474e-05, + "loss": 1.7542, + "step": 18888 + }, + { + "epoch": 0.23613090327258182, + "grad_norm": 4.157711505889893, + "learning_rate": 1.8892108034092348e-05, + "loss": 0.8311, + "step": 18890 + }, + { + "epoch": 0.23615590389759744, + "grad_norm": 2.503331422805786, + "learning_rate": 1.8891708747081185e-05, + "loss": 0.1977, + "step": 18892 + }, + { + "epoch": 0.23618090452261306, + "grad_norm": 1.688469409942627, + "learning_rate": 1.8891309392352025e-05, + "loss": 0.1213, + "step": 18894 + }, + { + "epoch": 0.23620590514762868, + "grad_norm": 4.237115383148193, + "learning_rate": 1.8890909969907907e-05, + "loss": 1.2123, + "step": 18896 + }, + { + "epoch": 0.23623090577264433, + "grad_norm": 3.1508686542510986, + "learning_rate": 1.889051047975188e-05, + "loss": 1.0372, + "step": 18898 + }, + { + "epoch": 0.23625590639765995, + "grad_norm": 2.7788991928100586, + "learning_rate": 1.889011092188698e-05, + "loss": 0.7031, + "step": 18900 + }, + { + "epoch": 0.23628090702267557, + "grad_norm": 3.6855719089508057, + "learning_rate": 1.888971129631626e-05, + "loss": 1.1813, + "step": 18902 + }, + { + "epoch": 0.2363059076476912, + "grad_norm": 3.843172550201416, + "learning_rate": 1.8889311603042748e-05, + "loss": 1.9482, + "step": 18904 + }, + { + "epoch": 0.2363309082727068, + "grad_norm": 0.24690139293670654, + "learning_rate": 1.8888911842069495e-05, + "loss": 0.5301, + "step": 18906 + }, + { + "epoch": 0.23635590889772246, + "grad_norm": 6.455967426300049, + "learning_rate": 1.888851201339955e-05, + "loss": 1.0948, + "step": 18908 + }, + { + "epoch": 0.23638090952273808, + "grad_norm": 0.010142171755433083, + "learning_rate": 1.8888112117035954e-05, + "loss": 0.4836, + "step": 18910 + }, + { + "epoch": 0.2364059101477537, + "grad_norm": 2.5215890407562256, + "learning_rate": 1.888771215298175e-05, + "loss": 1.3686, + "step": 18912 + }, + { + "epoch": 0.23643091077276931, + "grad_norm": 2.702103614807129, + "learning_rate": 1.8887312121239994e-05, + "loss": 0.8043, + "step": 18914 + }, + { + "epoch": 0.23645591139778493, + "grad_norm": 2.9822683334350586, + "learning_rate": 1.888691202181372e-05, + "loss": 1.0629, + "step": 18916 + }, + { + "epoch": 0.23648091202280058, + "grad_norm": 2.8490183353424072, + "learning_rate": 1.888651185470598e-05, + "loss": 0.8452, + "step": 18918 + }, + { + "epoch": 0.2365059126478162, + "grad_norm": 2.2783725261688232, + "learning_rate": 1.8886111619919828e-05, + "loss": 0.6506, + "step": 18920 + }, + { + "epoch": 0.23653091327283182, + "grad_norm": 2.1129322052001953, + "learning_rate": 1.88857113174583e-05, + "loss": 1.3583, + "step": 18922 + }, + { + "epoch": 0.23655591389784744, + "grad_norm": 2.7833173274993896, + "learning_rate": 1.8885310947324455e-05, + "loss": 0.5362, + "step": 18924 + }, + { + "epoch": 0.23658091452286306, + "grad_norm": 5.705460071563721, + "learning_rate": 1.888491050952134e-05, + "loss": 0.5298, + "step": 18926 + }, + { + "epoch": 0.2366059151478787, + "grad_norm": 7.1001482009887695, + "learning_rate": 1.8884510004051998e-05, + "loss": 0.6033, + "step": 18928 + }, + { + "epoch": 0.23663091577289433, + "grad_norm": 3.2569074630737305, + "learning_rate": 1.8884109430919486e-05, + "loss": 0.4754, + "step": 18930 + }, + { + "epoch": 0.23665591639790995, + "grad_norm": 2.093277931213379, + "learning_rate": 1.8883708790126857e-05, + "loss": 0.5813, + "step": 18932 + }, + { + "epoch": 0.23668091702292557, + "grad_norm": 4.613935470581055, + "learning_rate": 1.8883308081677152e-05, + "loss": 0.9393, + "step": 18934 + }, + { + "epoch": 0.2367059176479412, + "grad_norm": 4.910780429840088, + "learning_rate": 1.8882907305573432e-05, + "loss": 1.7538, + "step": 18936 + }, + { + "epoch": 0.23673091827295684, + "grad_norm": 5.017189979553223, + "learning_rate": 1.8882506461818745e-05, + "loss": 1.3024, + "step": 18938 + }, + { + "epoch": 0.23675591889797246, + "grad_norm": 5.43204927444458, + "learning_rate": 1.8882105550416145e-05, + "loss": 0.6956, + "step": 18940 + }, + { + "epoch": 0.23678091952298808, + "grad_norm": 9.520402908325195, + "learning_rate": 1.8881704571368687e-05, + "loss": 1.9882, + "step": 18942 + }, + { + "epoch": 0.2368059201480037, + "grad_norm": 4.930741310119629, + "learning_rate": 1.888130352467942e-05, + "loss": 1.7718, + "step": 18944 + }, + { + "epoch": 0.23683092077301932, + "grad_norm": 6.392001152038574, + "learning_rate": 1.8880902410351406e-05, + "loss": 1.4501, + "step": 18946 + }, + { + "epoch": 0.23685592139803496, + "grad_norm": 5.313492298126221, + "learning_rate": 1.888050122838769e-05, + "loss": 0.7759, + "step": 18948 + }, + { + "epoch": 0.23688092202305058, + "grad_norm": 3.7971084117889404, + "learning_rate": 1.8880099978791334e-05, + "loss": 1.4228, + "step": 18950 + }, + { + "epoch": 0.2369059226480662, + "grad_norm": 3.162368059158325, + "learning_rate": 1.8879698661565393e-05, + "loss": 1.2798, + "step": 18952 + }, + { + "epoch": 0.23693092327308182, + "grad_norm": 1.2528475522994995, + "learning_rate": 1.8879297276712924e-05, + "loss": 0.3523, + "step": 18954 + }, + { + "epoch": 0.23695592389809744, + "grad_norm": 3.404484748840332, + "learning_rate": 1.887889582423698e-05, + "loss": 0.0781, + "step": 18956 + }, + { + "epoch": 0.2369809245231131, + "grad_norm": 3.0100038051605225, + "learning_rate": 1.8878494304140623e-05, + "loss": 0.4718, + "step": 18958 + }, + { + "epoch": 0.2370059251481287, + "grad_norm": 4.253931999206543, + "learning_rate": 1.8878092716426907e-05, + "loss": 0.4772, + "step": 18960 + }, + { + "epoch": 0.23703092577314433, + "grad_norm": 6.226618766784668, + "learning_rate": 1.887769106109889e-05, + "loss": 1.3879, + "step": 18962 + }, + { + "epoch": 0.23705592639815995, + "grad_norm": 2.5192861557006836, + "learning_rate": 1.887728933815964e-05, + "loss": 0.4383, + "step": 18964 + }, + { + "epoch": 0.23708092702317557, + "grad_norm": 3.7166271209716797, + "learning_rate": 1.8876887547612204e-05, + "loss": 1.1197, + "step": 18966 + }, + { + "epoch": 0.23710592764819122, + "grad_norm": 3.682900905609131, + "learning_rate": 1.8876485689459652e-05, + "loss": 1.2138, + "step": 18968 + }, + { + "epoch": 0.23713092827320684, + "grad_norm": 2.7587437629699707, + "learning_rate": 1.887608376370504e-05, + "loss": 2.7995, + "step": 18970 + }, + { + "epoch": 0.23715592889822246, + "grad_norm": 3.0278642177581787, + "learning_rate": 1.8875681770351425e-05, + "loss": 1.1954, + "step": 18972 + }, + { + "epoch": 0.23718092952323808, + "grad_norm": 3.917499542236328, + "learning_rate": 1.887527970940188e-05, + "loss": 1.0515, + "step": 18974 + }, + { + "epoch": 0.2372059301482537, + "grad_norm": 0.622285008430481, + "learning_rate": 1.8874877580859452e-05, + "loss": 0.3353, + "step": 18976 + }, + { + "epoch": 0.23723093077326934, + "grad_norm": 4.124719142913818, + "learning_rate": 1.8874475384727215e-05, + "loss": 1.1281, + "step": 18978 + }, + { + "epoch": 0.23725593139828496, + "grad_norm": 2.9701170921325684, + "learning_rate": 1.8874073121008227e-05, + "loss": 1.1714, + "step": 18980 + }, + { + "epoch": 0.23728093202330058, + "grad_norm": 2.5628795623779297, + "learning_rate": 1.8873670789705554e-05, + "loss": 1.7268, + "step": 18982 + }, + { + "epoch": 0.2373059326483162, + "grad_norm": 0.7676742672920227, + "learning_rate": 1.887326839082226e-05, + "loss": 0.937, + "step": 18984 + }, + { + "epoch": 0.23733093327333182, + "grad_norm": 3.194347858428955, + "learning_rate": 1.887286592436141e-05, + "loss": 0.8977, + "step": 18986 + }, + { + "epoch": 0.23735593389834747, + "grad_norm": 0.33707404136657715, + "learning_rate": 1.8872463390326065e-05, + "loss": 1.704, + "step": 18988 + }, + { + "epoch": 0.2373809345233631, + "grad_norm": 0.015989962965250015, + "learning_rate": 1.887206078871929e-05, + "loss": 0.7043, + "step": 18990 + }, + { + "epoch": 0.2374059351483787, + "grad_norm": 4.070887088775635, + "learning_rate": 1.887165811954416e-05, + "loss": 2.0854, + "step": 18992 + }, + { + "epoch": 0.23743093577339433, + "grad_norm": 0.011728128418326378, + "learning_rate": 1.8871255382803737e-05, + "loss": 0.6107, + "step": 18994 + }, + { + "epoch": 0.23745593639840995, + "grad_norm": 7.714166641235352, + "learning_rate": 1.8870852578501085e-05, + "loss": 1.1385, + "step": 18996 + }, + { + "epoch": 0.2374809370234256, + "grad_norm": 1.2056697607040405, + "learning_rate": 1.8870449706639275e-05, + "loss": 0.8099, + "step": 18998 + }, + { + "epoch": 0.23750593764844122, + "grad_norm": 4.757809638977051, + "learning_rate": 1.8870046767221376e-05, + "loss": 1.0068, + "step": 19000 + }, + { + "epoch": 0.23753093827345684, + "grad_norm": 4.898269176483154, + "learning_rate": 1.8869643760250455e-05, + "loss": 0.7329, + "step": 19002 + }, + { + "epoch": 0.23755593889847246, + "grad_norm": 0.008922048844397068, + "learning_rate": 1.8869240685729578e-05, + "loss": 0.9745, + "step": 19004 + }, + { + "epoch": 0.23758093952348808, + "grad_norm": 3.29427170753479, + "learning_rate": 1.8868837543661818e-05, + "loss": 1.1615, + "step": 19006 + }, + { + "epoch": 0.23760594014850372, + "grad_norm": 2.6423604488372803, + "learning_rate": 1.886843433405025e-05, + "loss": 0.2088, + "step": 19008 + }, + { + "epoch": 0.23763094077351934, + "grad_norm": 5.653146266937256, + "learning_rate": 1.886803105689794e-05, + "loss": 0.5368, + "step": 19010 + }, + { + "epoch": 0.23765594139853496, + "grad_norm": 0.028470434248447418, + "learning_rate": 1.8867627712207954e-05, + "loss": 0.9065, + "step": 19012 + }, + { + "epoch": 0.23768094202355058, + "grad_norm": 2.958106756210327, + "learning_rate": 1.8867224299983373e-05, + "loss": 0.6992, + "step": 19014 + }, + { + "epoch": 0.2377059426485662, + "grad_norm": 0.005388505291193724, + "learning_rate": 1.8866820820227267e-05, + "loss": 0.329, + "step": 19016 + }, + { + "epoch": 0.23773094327358185, + "grad_norm": 2.0397543907165527, + "learning_rate": 1.8866417272942702e-05, + "loss": 1.1666, + "step": 19018 + }, + { + "epoch": 0.23775594389859747, + "grad_norm": 3.959345817565918, + "learning_rate": 1.8866013658132766e-05, + "loss": 1.8052, + "step": 19020 + }, + { + "epoch": 0.2377809445236131, + "grad_norm": 1.778710961341858, + "learning_rate": 1.8865609975800514e-05, + "loss": 0.3087, + "step": 19022 + }, + { + "epoch": 0.2378059451486287, + "grad_norm": 0.01012275367975235, + "learning_rate": 1.8865206225949037e-05, + "loss": 0.0042, + "step": 19024 + }, + { + "epoch": 0.23783094577364433, + "grad_norm": 2.958890676498413, + "learning_rate": 1.88648024085814e-05, + "loss": 0.6013, + "step": 19026 + }, + { + "epoch": 0.23785594639865998, + "grad_norm": 4.263233661651611, + "learning_rate": 1.886439852370068e-05, + "loss": 1.6748, + "step": 19028 + }, + { + "epoch": 0.2378809470236756, + "grad_norm": 4.166896343231201, + "learning_rate": 1.8863994571309958e-05, + "loss": 0.3501, + "step": 19030 + }, + { + "epoch": 0.23790594764869122, + "grad_norm": 3.3393869400024414, + "learning_rate": 1.8863590551412305e-05, + "loss": 0.789, + "step": 19032 + }, + { + "epoch": 0.23793094827370684, + "grad_norm": 6.460185527801514, + "learning_rate": 1.8863186464010802e-05, + "loss": 0.5994, + "step": 19034 + }, + { + "epoch": 0.23795594889872246, + "grad_norm": 0.13727226853370667, + "learning_rate": 1.8862782309108522e-05, + "loss": 0.1728, + "step": 19036 + }, + { + "epoch": 0.2379809495237381, + "grad_norm": 1.8468412160873413, + "learning_rate": 1.8862378086708544e-05, + "loss": 0.1721, + "step": 19038 + }, + { + "epoch": 0.23800595014875373, + "grad_norm": 0.010121544823050499, + "learning_rate": 1.886197379681395e-05, + "loss": 0.5334, + "step": 19040 + }, + { + "epoch": 0.23803095077376935, + "grad_norm": 7.183967590332031, + "learning_rate": 1.8861569439427816e-05, + "loss": 1.3967, + "step": 19042 + }, + { + "epoch": 0.23805595139878497, + "grad_norm": 5.305819034576416, + "learning_rate": 1.886116501455322e-05, + "loss": 0.5049, + "step": 19044 + }, + { + "epoch": 0.23808095202380058, + "grad_norm": 0.009750849567353725, + "learning_rate": 1.886076052219325e-05, + "loss": 0.8937, + "step": 19046 + }, + { + "epoch": 0.23810595264881623, + "grad_norm": 5.288569450378418, + "learning_rate": 1.8860355962350977e-05, + "loss": 1.0292, + "step": 19048 + }, + { + "epoch": 0.23813095327383185, + "grad_norm": 3.9959018230438232, + "learning_rate": 1.8859951335029488e-05, + "loss": 1.1872, + "step": 19050 + }, + { + "epoch": 0.23815595389884747, + "grad_norm": 2.3690054416656494, + "learning_rate": 1.885954664023186e-05, + "loss": 0.3956, + "step": 19052 + }, + { + "epoch": 0.2381809545238631, + "grad_norm": 3.901857614517212, + "learning_rate": 1.885914187796118e-05, + "loss": 1.1717, + "step": 19054 + }, + { + "epoch": 0.2382059551488787, + "grad_norm": 3.9707705974578857, + "learning_rate": 1.8858737048220526e-05, + "loss": 1.4535, + "step": 19056 + }, + { + "epoch": 0.23823095577389436, + "grad_norm": 5.071638584136963, + "learning_rate": 1.8858332151012984e-05, + "loss": 1.0982, + "step": 19058 + }, + { + "epoch": 0.23825595639890998, + "grad_norm": 4.237053871154785, + "learning_rate": 1.885792718634164e-05, + "loss": 1.3163, + "step": 19060 + }, + { + "epoch": 0.2382809570239256, + "grad_norm": 0.8388041257858276, + "learning_rate": 1.885752215420957e-05, + "loss": 0.0592, + "step": 19062 + }, + { + "epoch": 0.23830595764894122, + "grad_norm": 3.7393040657043457, + "learning_rate": 1.8857117054619865e-05, + "loss": 1.3932, + "step": 19064 + }, + { + "epoch": 0.23833095827395684, + "grad_norm": 5.984249591827393, + "learning_rate": 1.8856711887575612e-05, + "loss": 0.6432, + "step": 19066 + }, + { + "epoch": 0.2383559588989725, + "grad_norm": 0.009703798219561577, + "learning_rate": 1.8856306653079893e-05, + "loss": 0.3759, + "step": 19068 + }, + { + "epoch": 0.2383809595239881, + "grad_norm": 2.849499225616455, + "learning_rate": 1.8855901351135793e-05, + "loss": 1.4276, + "step": 19070 + }, + { + "epoch": 0.23840596014900373, + "grad_norm": 1.520416259765625, + "learning_rate": 1.88554959817464e-05, + "loss": 0.6238, + "step": 19072 + }, + { + "epoch": 0.23843096077401935, + "grad_norm": 1.4724829196929932, + "learning_rate": 1.8855090544914805e-05, + "loss": 0.1604, + "step": 19074 + }, + { + "epoch": 0.23845596139903497, + "grad_norm": 3.323824167251587, + "learning_rate": 1.885468504064409e-05, + "loss": 1.1253, + "step": 19076 + }, + { + "epoch": 0.2384809620240506, + "grad_norm": 3.7410471439361572, + "learning_rate": 1.8854279468937348e-05, + "loss": 0.606, + "step": 19078 + }, + { + "epoch": 0.23850596264906623, + "grad_norm": 2.5934112071990967, + "learning_rate": 1.8853873829797665e-05, + "loss": 0.7502, + "step": 19080 + }, + { + "epoch": 0.23853096327408185, + "grad_norm": 4.356198310852051, + "learning_rate": 1.885346812322813e-05, + "loss": 1.6094, + "step": 19082 + }, + { + "epoch": 0.23855596389909747, + "grad_norm": 3.2088918685913086, + "learning_rate": 1.8853062349231833e-05, + "loss": 1.5949, + "step": 19084 + }, + { + "epoch": 0.2385809645241131, + "grad_norm": 2.519768476486206, + "learning_rate": 1.8852656507811867e-05, + "loss": 1.2285, + "step": 19086 + }, + { + "epoch": 0.23860596514912874, + "grad_norm": 2.8373098373413086, + "learning_rate": 1.885225059897132e-05, + "loss": 0.5837, + "step": 19088 + }, + { + "epoch": 0.23863096577414436, + "grad_norm": 5.879086494445801, + "learning_rate": 1.8851844622713288e-05, + "loss": 0.886, + "step": 19090 + }, + { + "epoch": 0.23865596639915998, + "grad_norm": 2.8802857398986816, + "learning_rate": 1.8851438579040855e-05, + "loss": 0.6591, + "step": 19092 + }, + { + "epoch": 0.2386809670241756, + "grad_norm": 1.846949815750122, + "learning_rate": 1.8851032467957118e-05, + "loss": 0.3998, + "step": 19094 + }, + { + "epoch": 0.23870596764919122, + "grad_norm": 0.13260327279567719, + "learning_rate": 1.885062628946517e-05, + "loss": 0.2429, + "step": 19096 + }, + { + "epoch": 0.23873096827420687, + "grad_norm": 3.0629796981811523, + "learning_rate": 1.8850220043568104e-05, + "loss": 0.1886, + "step": 19098 + }, + { + "epoch": 0.2387559688992225, + "grad_norm": 3.171963691711426, + "learning_rate": 1.8849813730269014e-05, + "loss": 1.3772, + "step": 19100 + }, + { + "epoch": 0.2387809695242381, + "grad_norm": 0.7332066297531128, + "learning_rate": 1.884940734957099e-05, + "loss": 0.6372, + "step": 19102 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 0.006830192171037197, + "learning_rate": 1.8849000901477134e-05, + "loss": 0.8584, + "step": 19104 + }, + { + "epoch": 0.23883097077426935, + "grad_norm": 0.006517061498016119, + "learning_rate": 1.884859438599054e-05, + "loss": 0.1242, + "step": 19106 + }, + { + "epoch": 0.238855971399285, + "grad_norm": 4.1981635093688965, + "learning_rate": 1.8848187803114304e-05, + "loss": 1.7871, + "step": 19108 + }, + { + "epoch": 0.23888097202430061, + "grad_norm": 2.3364665508270264, + "learning_rate": 1.8847781152851518e-05, + "loss": 1.1288, + "step": 19110 + }, + { + "epoch": 0.23890597264931623, + "grad_norm": 4.600542068481445, + "learning_rate": 1.884737443520528e-05, + "loss": 1.2745, + "step": 19112 + }, + { + "epoch": 0.23893097327433185, + "grad_norm": 6.5575785636901855, + "learning_rate": 1.8846967650178693e-05, + "loss": 1.2768, + "step": 19114 + }, + { + "epoch": 0.23895597389934747, + "grad_norm": 3.676757574081421, + "learning_rate": 1.8846560797774848e-05, + "loss": 1.5152, + "step": 19116 + }, + { + "epoch": 0.23898097452436312, + "grad_norm": 0.006059241946786642, + "learning_rate": 1.884615387799685e-05, + "loss": 0.5481, + "step": 19118 + }, + { + "epoch": 0.23900597514937874, + "grad_norm": 7.036760330200195, + "learning_rate": 1.8845746890847794e-05, + "loss": 1.5586, + "step": 19120 + }, + { + "epoch": 0.23903097577439436, + "grad_norm": 2.556225061416626, + "learning_rate": 1.884533983633078e-05, + "loss": 0.576, + "step": 19122 + }, + { + "epoch": 0.23905597639940998, + "grad_norm": 4.201109409332275, + "learning_rate": 1.884493271444891e-05, + "loss": 1.5371, + "step": 19124 + }, + { + "epoch": 0.2390809770244256, + "grad_norm": 3.7914340496063232, + "learning_rate": 1.884452552520528e-05, + "loss": 1.1792, + "step": 19126 + }, + { + "epoch": 0.23910597764944125, + "grad_norm": 2.8808860778808594, + "learning_rate": 1.8844118268603e-05, + "loss": 0.6681, + "step": 19128 + }, + { + "epoch": 0.23913097827445687, + "grad_norm": 2.749288320541382, + "learning_rate": 1.884371094464516e-05, + "loss": 0.7174, + "step": 19130 + }, + { + "epoch": 0.2391559788994725, + "grad_norm": 1.618449091911316, + "learning_rate": 1.8843303553334875e-05, + "loss": 1.1185, + "step": 19132 + }, + { + "epoch": 0.2391809795244881, + "grad_norm": 0.02272951789200306, + "learning_rate": 1.8842896094675235e-05, + "loss": 0.3438, + "step": 19134 + }, + { + "epoch": 0.23920598014950373, + "grad_norm": 1.7152912616729736, + "learning_rate": 1.884248856866935e-05, + "loss": 0.9806, + "step": 19136 + }, + { + "epoch": 0.23923098077451938, + "grad_norm": 3.7625250816345215, + "learning_rate": 1.8842080975320325e-05, + "loss": 0.9958, + "step": 19138 + }, + { + "epoch": 0.239255981399535, + "grad_norm": 11.542091369628906, + "learning_rate": 1.8841673314631258e-05, + "loss": 0.4877, + "step": 19140 + }, + { + "epoch": 0.23928098202455061, + "grad_norm": 7.623800754547119, + "learning_rate": 1.884126558660526e-05, + "loss": 1.1135, + "step": 19142 + }, + { + "epoch": 0.23930598264956623, + "grad_norm": 0.03823436051607132, + "learning_rate": 1.884085779124543e-05, + "loss": 0.566, + "step": 19144 + }, + { + "epoch": 0.23933098327458185, + "grad_norm": 3.184577226638794, + "learning_rate": 1.8840449928554882e-05, + "loss": 0.3188, + "step": 19146 + }, + { + "epoch": 0.2393559838995975, + "grad_norm": 2.637355327606201, + "learning_rate": 1.8840041998536714e-05, + "loss": 0.6245, + "step": 19148 + }, + { + "epoch": 0.23938098452461312, + "grad_norm": 2.017932415008545, + "learning_rate": 1.8839634001194037e-05, + "loss": 0.7845, + "step": 19150 + }, + { + "epoch": 0.23940598514962874, + "grad_norm": 3.0094306468963623, + "learning_rate": 1.883922593652996e-05, + "loss": 0.4019, + "step": 19152 + }, + { + "epoch": 0.23943098577464436, + "grad_norm": 3.6632518768310547, + "learning_rate": 1.8838817804547584e-05, + "loss": 1.2447, + "step": 19154 + }, + { + "epoch": 0.23945598639965998, + "grad_norm": 4.160538196563721, + "learning_rate": 1.8838409605250025e-05, + "loss": 0.8915, + "step": 19156 + }, + { + "epoch": 0.23948098702467563, + "grad_norm": 7.230335712432861, + "learning_rate": 1.8838001338640385e-05, + "loss": 1.2181, + "step": 19158 + }, + { + "epoch": 0.23950598764969125, + "grad_norm": 4.325188159942627, + "learning_rate": 1.883759300472178e-05, + "loss": 2.1432, + "step": 19160 + }, + { + "epoch": 0.23953098827470687, + "grad_norm": 2.8873984813690186, + "learning_rate": 1.8837184603497317e-05, + "loss": 0.9399, + "step": 19162 + }, + { + "epoch": 0.2395559888997225, + "grad_norm": 1.5580475330352783, + "learning_rate": 1.8836776134970102e-05, + "loss": 0.9839, + "step": 19164 + }, + { + "epoch": 0.2395809895247381, + "grad_norm": 3.516719102859497, + "learning_rate": 1.8836367599143254e-05, + "loss": 0.4653, + "step": 19166 + }, + { + "epoch": 0.23960599014975376, + "grad_norm": 2.4822750091552734, + "learning_rate": 1.8835958996019875e-05, + "loss": 0.1547, + "step": 19168 + }, + { + "epoch": 0.23963099077476938, + "grad_norm": 1.9252641201019287, + "learning_rate": 1.8835550325603083e-05, + "loss": 1.7695, + "step": 19170 + }, + { + "epoch": 0.239655991399785, + "grad_norm": 4.259552955627441, + "learning_rate": 1.8835141587895994e-05, + "loss": 1.0987, + "step": 19172 + }, + { + "epoch": 0.23968099202480062, + "grad_norm": 3.954775094985962, + "learning_rate": 1.883473278290171e-05, + "loss": 0.9287, + "step": 19174 + }, + { + "epoch": 0.23970599264981624, + "grad_norm": 5.571338653564453, + "learning_rate": 1.8834323910623353e-05, + "loss": 1.3798, + "step": 19176 + }, + { + "epoch": 0.23973099327483188, + "grad_norm": 0.015010993927717209, + "learning_rate": 1.8833914971064034e-05, + "loss": 0.0612, + "step": 19178 + }, + { + "epoch": 0.2397559938998475, + "grad_norm": 5.656940937042236, + "learning_rate": 1.883350596422687e-05, + "loss": 1.3691, + "step": 19180 + }, + { + "epoch": 0.23978099452486312, + "grad_norm": 5.689105033874512, + "learning_rate": 1.883309689011497e-05, + "loss": 1.6891, + "step": 19182 + }, + { + "epoch": 0.23980599514987874, + "grad_norm": 2.451003313064575, + "learning_rate": 1.8832687748731455e-05, + "loss": 1.5243, + "step": 19184 + }, + { + "epoch": 0.23983099577489436, + "grad_norm": 4.101743698120117, + "learning_rate": 1.883227854007944e-05, + "loss": 1.3296, + "step": 19186 + }, + { + "epoch": 0.23985599639991, + "grad_norm": 4.546595573425293, + "learning_rate": 1.883186926416204e-05, + "loss": 2.0411, + "step": 19188 + }, + { + "epoch": 0.23988099702492563, + "grad_norm": 3.1678009033203125, + "learning_rate": 1.8831459920982375e-05, + "loss": 0.6775, + "step": 19190 + }, + { + "epoch": 0.23990599764994125, + "grad_norm": 5.951722621917725, + "learning_rate": 1.8831050510543556e-05, + "loss": 0.494, + "step": 19192 + }, + { + "epoch": 0.23993099827495687, + "grad_norm": 4.302640914916992, + "learning_rate": 1.8830641032848708e-05, + "loss": 1.634, + "step": 19194 + }, + { + "epoch": 0.2399559988999725, + "grad_norm": 3.0651421546936035, + "learning_rate": 1.883023148790094e-05, + "loss": 0.539, + "step": 19196 + }, + { + "epoch": 0.23998099952498814, + "grad_norm": 0.02166481502354145, + "learning_rate": 1.8829821875703388e-05, + "loss": 0.1067, + "step": 19198 + }, + { + "epoch": 0.24000600015000376, + "grad_norm": 3.110955238342285, + "learning_rate": 1.8829412196259156e-05, + "loss": 1.4218, + "step": 19200 + }, + { + "epoch": 0.24003100077501938, + "grad_norm": 5.265346527099609, + "learning_rate": 1.8829002449571367e-05, + "loss": 0.8238, + "step": 19202 + }, + { + "epoch": 0.240056001400035, + "grad_norm": 3.1583468914031982, + "learning_rate": 1.882859263564315e-05, + "loss": 1.6012, + "step": 19204 + }, + { + "epoch": 0.24008100202505062, + "grad_norm": 2.7664151191711426, + "learning_rate": 1.8828182754477614e-05, + "loss": 0.9603, + "step": 19206 + }, + { + "epoch": 0.24010600265006626, + "grad_norm": 3.059885263442993, + "learning_rate": 1.882777280607789e-05, + "loss": 1.2645, + "step": 19208 + }, + { + "epoch": 0.24013100327508188, + "grad_norm": 0.01434918213635683, + "learning_rate": 1.8827362790447096e-05, + "loss": 0.1017, + "step": 19210 + }, + { + "epoch": 0.2401560039000975, + "grad_norm": 1.2695602178573608, + "learning_rate": 1.8826952707588356e-05, + "loss": 0.0388, + "step": 19212 + }, + { + "epoch": 0.24018100452511312, + "grad_norm": 2.0329647064208984, + "learning_rate": 1.882654255750479e-05, + "loss": 0.7789, + "step": 19214 + }, + { + "epoch": 0.24020600515012874, + "grad_norm": 2.4911108016967773, + "learning_rate": 1.8826132340199525e-05, + "loss": 1.1556, + "step": 19216 + }, + { + "epoch": 0.2402310057751444, + "grad_norm": 3.1411654949188232, + "learning_rate": 1.8825722055675684e-05, + "loss": 0.3616, + "step": 19218 + }, + { + "epoch": 0.24025600640016, + "grad_norm": 0.016579056158661842, + "learning_rate": 1.8825311703936395e-05, + "loss": 0.6667, + "step": 19220 + }, + { + "epoch": 0.24028100702517563, + "grad_norm": 4.964548587799072, + "learning_rate": 1.8824901284984777e-05, + "loss": 1.6296, + "step": 19222 + }, + { + "epoch": 0.24030600765019125, + "grad_norm": 3.3035848140716553, + "learning_rate": 1.8824490798823957e-05, + "loss": 1.0441, + "step": 19224 + }, + { + "epoch": 0.24033100827520687, + "grad_norm": 4.189709186553955, + "learning_rate": 1.8824080245457067e-05, + "loss": 1.1277, + "step": 19226 + }, + { + "epoch": 0.24035600890022252, + "grad_norm": 2.5523178577423096, + "learning_rate": 1.882366962488723e-05, + "loss": 1.4227, + "step": 19228 + }, + { + "epoch": 0.24038100952523814, + "grad_norm": 2.647449254989624, + "learning_rate": 1.882325893711757e-05, + "loss": 0.7302, + "step": 19230 + }, + { + "epoch": 0.24040601015025376, + "grad_norm": 2.470229148864746, + "learning_rate": 1.882284818215122e-05, + "loss": 1.416, + "step": 19232 + }, + { + "epoch": 0.24043101077526938, + "grad_norm": 3.775617837905884, + "learning_rate": 1.8822437359991302e-05, + "loss": 0.9234, + "step": 19234 + }, + { + "epoch": 0.240456011400285, + "grad_norm": 4.1083245277404785, + "learning_rate": 1.8822026470640953e-05, + "loss": 1.4936, + "step": 19236 + }, + { + "epoch": 0.24048101202530064, + "grad_norm": 1.6792041063308716, + "learning_rate": 1.8821615514103294e-05, + "loss": 1.2042, + "step": 19238 + }, + { + "epoch": 0.24050601265031626, + "grad_norm": 3.011849880218506, + "learning_rate": 1.882120449038146e-05, + "loss": 0.8762, + "step": 19240 + }, + { + "epoch": 0.24053101327533188, + "grad_norm": 2.6701371669769287, + "learning_rate": 1.8820793399478582e-05, + "loss": 1.539, + "step": 19242 + }, + { + "epoch": 0.2405560139003475, + "grad_norm": 2.2029829025268555, + "learning_rate": 1.8820382241397788e-05, + "loss": 0.6305, + "step": 19244 + }, + { + "epoch": 0.24058101452536312, + "grad_norm": 2.038071870803833, + "learning_rate": 1.881997101614221e-05, + "loss": 0.1622, + "step": 19246 + }, + { + "epoch": 0.24060601515037877, + "grad_norm": 0.007530242670327425, + "learning_rate": 1.8819559723714982e-05, + "loss": 0.5873, + "step": 19248 + }, + { + "epoch": 0.2406310157753944, + "grad_norm": 0.009466834366321564, + "learning_rate": 1.8819148364119232e-05, + "loss": 0.0757, + "step": 19250 + }, + { + "epoch": 0.24065601640041, + "grad_norm": 2.616514205932617, + "learning_rate": 1.8818736937358096e-05, + "loss": 1.1334, + "step": 19252 + }, + { + "epoch": 0.24068101702542563, + "grad_norm": 2.4109160900115967, + "learning_rate": 1.8818325443434705e-05, + "loss": 0.6088, + "step": 19254 + }, + { + "epoch": 0.24070601765044125, + "grad_norm": 4.344179630279541, + "learning_rate": 1.8817913882352195e-05, + "loss": 1.2187, + "step": 19256 + }, + { + "epoch": 0.2407310182754569, + "grad_norm": 2.944643497467041, + "learning_rate": 1.88175022541137e-05, + "loss": 0.477, + "step": 19258 + }, + { + "epoch": 0.24075601890047252, + "grad_norm": 3.9832229614257812, + "learning_rate": 1.8817090558722355e-05, + "loss": 1.1381, + "step": 19260 + }, + { + "epoch": 0.24078101952548814, + "grad_norm": 3.120123863220215, + "learning_rate": 1.8816678796181296e-05, + "loss": 0.9033, + "step": 19262 + }, + { + "epoch": 0.24080602015050376, + "grad_norm": 1.857604742050171, + "learning_rate": 1.8816266966493658e-05, + "loss": 0.4948, + "step": 19264 + }, + { + "epoch": 0.24083102077551938, + "grad_norm": 7.184645175933838, + "learning_rate": 1.8815855069662576e-05, + "loss": 1.7342, + "step": 19266 + }, + { + "epoch": 0.24085602140053503, + "grad_norm": 0.4639808237552643, + "learning_rate": 1.881544310569119e-05, + "loss": 0.6533, + "step": 19268 + }, + { + "epoch": 0.24088102202555065, + "grad_norm": 0.009037374518811703, + "learning_rate": 1.8815031074582633e-05, + "loss": 0.001, + "step": 19270 + }, + { + "epoch": 0.24090602265056626, + "grad_norm": 4.646787166595459, + "learning_rate": 1.881461897634005e-05, + "loss": 1.0807, + "step": 19272 + }, + { + "epoch": 0.24093102327558188, + "grad_norm": 1.7685989141464233, + "learning_rate": 1.8814206810966572e-05, + "loss": 0.2567, + "step": 19274 + }, + { + "epoch": 0.2409560239005975, + "grad_norm": 3.0135691165924072, + "learning_rate": 1.8813794578465342e-05, + "loss": 2.0317, + "step": 19276 + }, + { + "epoch": 0.24098102452561315, + "grad_norm": 3.65449857711792, + "learning_rate": 1.88133822788395e-05, + "loss": 0.6908, + "step": 19278 + }, + { + "epoch": 0.24100602515062877, + "grad_norm": 2.7767724990844727, + "learning_rate": 1.881296991209219e-05, + "loss": 0.8155, + "step": 19280 + }, + { + "epoch": 0.2410310257756444, + "grad_norm": 5.264185905456543, + "learning_rate": 1.8812557478226537e-05, + "loss": 1.7756, + "step": 19282 + }, + { + "epoch": 0.24105602640066, + "grad_norm": 2.1423275470733643, + "learning_rate": 1.88121449772457e-05, + "loss": 0.0772, + "step": 19284 + }, + { + "epoch": 0.24108102702567563, + "grad_norm": 0.3535240888595581, + "learning_rate": 1.8811732409152806e-05, + "loss": 1.2341, + "step": 19286 + }, + { + "epoch": 0.24110602765069128, + "grad_norm": 3.606935739517212, + "learning_rate": 1.8811319773951015e-05, + "loss": 0.2523, + "step": 19288 + }, + { + "epoch": 0.2411310282757069, + "grad_norm": 4.6788201332092285, + "learning_rate": 1.8810907071643448e-05, + "loss": 0.8626, + "step": 19290 + }, + { + "epoch": 0.24115602890072252, + "grad_norm": 4.4613237380981445, + "learning_rate": 1.8810494302233264e-05, + "loss": 1.4478, + "step": 19292 + }, + { + "epoch": 0.24118102952573814, + "grad_norm": 4.159293174743652, + "learning_rate": 1.88100814657236e-05, + "loss": 0.3589, + "step": 19294 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 2.230494737625122, + "learning_rate": 1.8809668562117603e-05, + "loss": 1.3597, + "step": 19296 + }, + { + "epoch": 0.2412310307757694, + "grad_norm": 0.008812041953206062, + "learning_rate": 1.8809255591418414e-05, + "loss": 0.7896, + "step": 19298 + }, + { + "epoch": 0.24125603140078503, + "grad_norm": 4.7032270431518555, + "learning_rate": 1.880884255362918e-05, + "loss": 1.6664, + "step": 19300 + }, + { + "epoch": 0.24128103202580065, + "grad_norm": 4.575479984283447, + "learning_rate": 1.8808429448753048e-05, + "loss": 1.8195, + "step": 19302 + }, + { + "epoch": 0.24130603265081627, + "grad_norm": 5.166295051574707, + "learning_rate": 1.8808016276793163e-05, + "loss": 0.252, + "step": 19304 + }, + { + "epoch": 0.24133103327583189, + "grad_norm": 3.517611265182495, + "learning_rate": 1.880760303775267e-05, + "loss": 0.5309, + "step": 19306 + }, + { + "epoch": 0.24135603390084753, + "grad_norm": 0.01101623848080635, + "learning_rate": 1.8807189731634722e-05, + "loss": 0.1925, + "step": 19308 + }, + { + "epoch": 0.24138103452586315, + "grad_norm": 4.516998291015625, + "learning_rate": 1.8806776358442458e-05, + "loss": 0.8138, + "step": 19310 + }, + { + "epoch": 0.24140603515087877, + "grad_norm": 0.006722187623381615, + "learning_rate": 1.8806362918179032e-05, + "loss": 0.0607, + "step": 19312 + }, + { + "epoch": 0.2414310357758944, + "grad_norm": 5.2476701736450195, + "learning_rate": 1.880594941084759e-05, + "loss": 1.1569, + "step": 19314 + }, + { + "epoch": 0.24145603640091, + "grad_norm": 3.570228338241577, + "learning_rate": 1.8805535836451284e-05, + "loss": 0.8907, + "step": 19316 + }, + { + "epoch": 0.24148103702592566, + "grad_norm": 1.8769793510437012, + "learning_rate": 1.8805122194993263e-05, + "loss": 1.5828, + "step": 19318 + }, + { + "epoch": 0.24150603765094128, + "grad_norm": 2.575732946395874, + "learning_rate": 1.8804708486476678e-05, + "loss": 1.1127, + "step": 19320 + }, + { + "epoch": 0.2415310382759569, + "grad_norm": 2.9270267486572266, + "learning_rate": 1.8804294710904672e-05, + "loss": 0.651, + "step": 19322 + }, + { + "epoch": 0.24155603890097252, + "grad_norm": 0.011651002801954746, + "learning_rate": 1.880388086828041e-05, + "loss": 0.0003, + "step": 19324 + }, + { + "epoch": 0.24158103952598814, + "grad_norm": 3.6389498710632324, + "learning_rate": 1.880346695860703e-05, + "loss": 0.9604, + "step": 19326 + }, + { + "epoch": 0.2416060401510038, + "grad_norm": 2.974421501159668, + "learning_rate": 1.8803052981887696e-05, + "loss": 0.2396, + "step": 19328 + }, + { + "epoch": 0.2416310407760194, + "grad_norm": 4.184759616851807, + "learning_rate": 1.880263893812555e-05, + "loss": 1.8009, + "step": 19330 + }, + { + "epoch": 0.24165604140103503, + "grad_norm": 3.1004676818847656, + "learning_rate": 1.8802224827323754e-05, + "loss": 1.1788, + "step": 19332 + }, + { + "epoch": 0.24168104202605065, + "grad_norm": 2.1508359909057617, + "learning_rate": 1.8801810649485458e-05, + "loss": 0.914, + "step": 19334 + }, + { + "epoch": 0.24170604265106627, + "grad_norm": 16.215234756469727, + "learning_rate": 1.8801396404613817e-05, + "loss": 1.5591, + "step": 19336 + }, + { + "epoch": 0.24173104327608191, + "grad_norm": 1.6100538969039917, + "learning_rate": 1.8800982092711987e-05, + "loss": 0.8584, + "step": 19338 + }, + { + "epoch": 0.24175604390109753, + "grad_norm": 2.1485350131988525, + "learning_rate": 1.880056771378312e-05, + "loss": 1.5457, + "step": 19340 + }, + { + "epoch": 0.24178104452611315, + "grad_norm": 2.970867872238159, + "learning_rate": 1.8800153267830373e-05, + "loss": 0.5929, + "step": 19342 + }, + { + "epoch": 0.24180604515112877, + "grad_norm": 2.156784772872925, + "learning_rate": 1.8799738754856904e-05, + "loss": 1.1929, + "step": 19344 + }, + { + "epoch": 0.2418310457761444, + "grad_norm": 2.9604568481445312, + "learning_rate": 1.879932417486587e-05, + "loss": 1.6193, + "step": 19346 + }, + { + "epoch": 0.24185604640116004, + "grad_norm": 3.0741469860076904, + "learning_rate": 1.8798909527860428e-05, + "loss": 0.5194, + "step": 19348 + }, + { + "epoch": 0.24188104702617566, + "grad_norm": 4.130224227905273, + "learning_rate": 1.879849481384373e-05, + "loss": 1.4622, + "step": 19350 + }, + { + "epoch": 0.24190604765119128, + "grad_norm": 2.235081195831299, + "learning_rate": 1.8798080032818946e-05, + "loss": 0.4306, + "step": 19352 + }, + { + "epoch": 0.2419310482762069, + "grad_norm": 3.1670520305633545, + "learning_rate": 1.8797665184789225e-05, + "loss": 0.8378, + "step": 19354 + }, + { + "epoch": 0.24195604890122252, + "grad_norm": 4.390103340148926, + "learning_rate": 1.8797250269757734e-05, + "loss": 0.9401, + "step": 19356 + }, + { + "epoch": 0.24198104952623817, + "grad_norm": 1.9532362222671509, + "learning_rate": 1.8796835287727628e-05, + "loss": 1.875, + "step": 19358 + }, + { + "epoch": 0.2420060501512538, + "grad_norm": 5.905322551727295, + "learning_rate": 1.8796420238702063e-05, + "loss": 0.9585, + "step": 19360 + }, + { + "epoch": 0.2420310507762694, + "grad_norm": 5.187026023864746, + "learning_rate": 1.8796005122684213e-05, + "loss": 1.1726, + "step": 19362 + }, + { + "epoch": 0.24205605140128503, + "grad_norm": 0.2816031277179718, + "learning_rate": 1.8795589939677223e-05, + "loss": 0.8425, + "step": 19364 + }, + { + "epoch": 0.24208105202630065, + "grad_norm": 2.847151279449463, + "learning_rate": 1.879517468968427e-05, + "loss": 1.176, + "step": 19366 + }, + { + "epoch": 0.2421060526513163, + "grad_norm": 0.022089455276727676, + "learning_rate": 1.879475937270851e-05, + "loss": 0.035, + "step": 19368 + }, + { + "epoch": 0.24213105327633191, + "grad_norm": 1.3994882106781006, + "learning_rate": 1.8794343988753104e-05, + "loss": 0.5838, + "step": 19370 + }, + { + "epoch": 0.24215605390134753, + "grad_norm": 2.198615074157715, + "learning_rate": 1.879392853782122e-05, + "loss": 0.2124, + "step": 19372 + }, + { + "epoch": 0.24218105452636315, + "grad_norm": 2.719963550567627, + "learning_rate": 1.879351301991602e-05, + "loss": 1.1138, + "step": 19374 + }, + { + "epoch": 0.24220605515137877, + "grad_norm": 3.1452431678771973, + "learning_rate": 1.8793097435040665e-05, + "loss": 0.5862, + "step": 19376 + }, + { + "epoch": 0.24223105577639442, + "grad_norm": 3.0186948776245117, + "learning_rate": 1.8792681783198327e-05, + "loss": 1.3703, + "step": 19378 + }, + { + "epoch": 0.24225605640141004, + "grad_norm": 4.015457630157471, + "learning_rate": 1.879226606439217e-05, + "loss": 0.4946, + "step": 19380 + }, + { + "epoch": 0.24228105702642566, + "grad_norm": 4.024872779846191, + "learning_rate": 1.879185027862535e-05, + "loss": 1.0789, + "step": 19382 + }, + { + "epoch": 0.24230605765144128, + "grad_norm": 3.5230181217193604, + "learning_rate": 1.8791434425901048e-05, + "loss": 1.271, + "step": 19384 + }, + { + "epoch": 0.2423310582764569, + "grad_norm": 4.27770471572876, + "learning_rate": 1.8791018506222423e-05, + "loss": 0.8342, + "step": 19386 + }, + { + "epoch": 0.24235605890147255, + "grad_norm": 2.995270013809204, + "learning_rate": 1.8790602519592647e-05, + "loss": 0.5689, + "step": 19388 + }, + { + "epoch": 0.24238105952648817, + "grad_norm": 5.594159126281738, + "learning_rate": 1.8790186466014878e-05, + "loss": 1.334, + "step": 19390 + }, + { + "epoch": 0.2424060601515038, + "grad_norm": 0.007006392348557711, + "learning_rate": 1.87897703454923e-05, + "loss": 0.6424, + "step": 19392 + }, + { + "epoch": 0.2424310607765194, + "grad_norm": 3.7034800052642822, + "learning_rate": 1.8789354158028068e-05, + "loss": 0.8793, + "step": 19394 + }, + { + "epoch": 0.24245606140153503, + "grad_norm": 6.43422269821167, + "learning_rate": 1.878893790362536e-05, + "loss": 0.2907, + "step": 19396 + }, + { + "epoch": 0.24248106202655068, + "grad_norm": 2.0094547271728516, + "learning_rate": 1.8788521582287344e-05, + "loss": 0.3802, + "step": 19398 + }, + { + "epoch": 0.2425060626515663, + "grad_norm": 0.011334342882037163, + "learning_rate": 1.8788105194017194e-05, + "loss": 0.7006, + "step": 19400 + }, + { + "epoch": 0.24253106327658192, + "grad_norm": 10.01231575012207, + "learning_rate": 1.878768873881807e-05, + "loss": 0.9116, + "step": 19402 + }, + { + "epoch": 0.24255606390159754, + "grad_norm": 0.008065799251198769, + "learning_rate": 1.8787272216693158e-05, + "loss": 0.5646, + "step": 19404 + }, + { + "epoch": 0.24258106452661315, + "grad_norm": 15.104560852050781, + "learning_rate": 1.8786855627645622e-05, + "loss": 1.0974, + "step": 19406 + }, + { + "epoch": 0.2426060651516288, + "grad_norm": 1.9788415431976318, + "learning_rate": 1.8786438971678634e-05, + "loss": 0.3829, + "step": 19408 + }, + { + "epoch": 0.24263106577664442, + "grad_norm": 2.9669430255889893, + "learning_rate": 1.878602224879537e-05, + "loss": 0.207, + "step": 19410 + }, + { + "epoch": 0.24265606640166004, + "grad_norm": 4.07337760925293, + "learning_rate": 1.8785605458999005e-05, + "loss": 1.3835, + "step": 19412 + }, + { + "epoch": 0.24268106702667566, + "grad_norm": 3.18336820602417, + "learning_rate": 1.878518860229271e-05, + "loss": 0.9803, + "step": 19414 + }, + { + "epoch": 0.24270606765169128, + "grad_norm": 1.2880665063858032, + "learning_rate": 1.8784771678679658e-05, + "loss": 1.0562, + "step": 19416 + }, + { + "epoch": 0.24273106827670693, + "grad_norm": 4.337641716003418, + "learning_rate": 1.8784354688163028e-05, + "loss": 0.2276, + "step": 19418 + }, + { + "epoch": 0.24275606890172255, + "grad_norm": 3.1008501052856445, + "learning_rate": 1.8783937630745997e-05, + "loss": 0.189, + "step": 19420 + }, + { + "epoch": 0.24278106952673817, + "grad_norm": 8.105224609375, + "learning_rate": 1.8783520506431738e-05, + "loss": 1.2857, + "step": 19422 + }, + { + "epoch": 0.2428060701517538, + "grad_norm": 2.2326321601867676, + "learning_rate": 1.878310331522343e-05, + "loss": 1.4092, + "step": 19424 + }, + { + "epoch": 0.2428310707767694, + "grad_norm": 2.8566370010375977, + "learning_rate": 1.8782686057124248e-05, + "loss": 0.5837, + "step": 19426 + }, + { + "epoch": 0.24285607140178506, + "grad_norm": 5.086221218109131, + "learning_rate": 1.878226873213737e-05, + "loss": 1.3212, + "step": 19428 + }, + { + "epoch": 0.24288107202680068, + "grad_norm": 4.335855007171631, + "learning_rate": 1.8781851340265976e-05, + "loss": 0.607, + "step": 19430 + }, + { + "epoch": 0.2429060726518163, + "grad_norm": 3.9608218669891357, + "learning_rate": 1.8781433881513246e-05, + "loss": 1.4883, + "step": 19432 + }, + { + "epoch": 0.24293107327683192, + "grad_norm": 3.1157820224761963, + "learning_rate": 1.878101635588236e-05, + "loss": 0.692, + "step": 19434 + }, + { + "epoch": 0.24295607390184754, + "grad_norm": 15.562807083129883, + "learning_rate": 1.878059876337649e-05, + "loss": 1.4178, + "step": 19436 + }, + { + "epoch": 0.24298107452686318, + "grad_norm": 0.0061873490922153, + "learning_rate": 1.8780181103998822e-05, + "loss": 0.4348, + "step": 19438 + }, + { + "epoch": 0.2430060751518788, + "grad_norm": 1.6011183261871338, + "learning_rate": 1.8779763377752535e-05, + "loss": 0.6768, + "step": 19440 + }, + { + "epoch": 0.24303107577689442, + "grad_norm": 0.6155469417572021, + "learning_rate": 1.8779345584640817e-05, + "loss": 0.6334, + "step": 19442 + }, + { + "epoch": 0.24305607640191004, + "grad_norm": 5.032634735107422, + "learning_rate": 1.8778927724666842e-05, + "loss": 1.6935, + "step": 19444 + }, + { + "epoch": 0.24308107702692566, + "grad_norm": 0.007697327062487602, + "learning_rate": 1.8778509797833795e-05, + "loss": 0.1186, + "step": 19446 + }, + { + "epoch": 0.2431060776519413, + "grad_norm": 4.906222343444824, + "learning_rate": 1.8778091804144858e-05, + "loss": 1.6016, + "step": 19448 + }, + { + "epoch": 0.24313107827695693, + "grad_norm": 0.0163026861846447, + "learning_rate": 1.8777673743603215e-05, + "loss": 0.6334, + "step": 19450 + }, + { + "epoch": 0.24315607890197255, + "grad_norm": 0.021536152809858322, + "learning_rate": 1.877725561621205e-05, + "loss": 0.8205, + "step": 19452 + }, + { + "epoch": 0.24318107952698817, + "grad_norm": 3.6265146732330322, + "learning_rate": 1.877683742197455e-05, + "loss": 0.8359, + "step": 19454 + }, + { + "epoch": 0.2432060801520038, + "grad_norm": 0.0030224828515201807, + "learning_rate": 1.8776419160893892e-05, + "loss": 0.3373, + "step": 19456 + }, + { + "epoch": 0.24323108077701944, + "grad_norm": 0.008330285549163818, + "learning_rate": 1.8776000832973272e-05, + "loss": 1.5133, + "step": 19458 + }, + { + "epoch": 0.24325608140203506, + "grad_norm": 12.339111328125, + "learning_rate": 1.8775582438215868e-05, + "loss": 1.0559, + "step": 19460 + }, + { + "epoch": 0.24328108202705068, + "grad_norm": 4.033109188079834, + "learning_rate": 1.877516397662487e-05, + "loss": 1.4731, + "step": 19462 + }, + { + "epoch": 0.2433060826520663, + "grad_norm": 3.782649040222168, + "learning_rate": 1.877474544820346e-05, + "loss": 1.3512, + "step": 19464 + }, + { + "epoch": 0.24333108327708192, + "grad_norm": 0.6106801629066467, + "learning_rate": 1.8774326852954838e-05, + "loss": 0.0119, + "step": 19466 + }, + { + "epoch": 0.24335608390209756, + "grad_norm": 4.515509128570557, + "learning_rate": 1.8773908190882177e-05, + "loss": 1.3144, + "step": 19468 + }, + { + "epoch": 0.24338108452711318, + "grad_norm": 4.937326908111572, + "learning_rate": 1.8773489461988675e-05, + "loss": 1.4612, + "step": 19470 + }, + { + "epoch": 0.2434060851521288, + "grad_norm": 4.672992706298828, + "learning_rate": 1.8773070666277515e-05, + "loss": 0.5096, + "step": 19472 + }, + { + "epoch": 0.24343108577714442, + "grad_norm": 2.914647102355957, + "learning_rate": 1.877265180375189e-05, + "loss": 0.8254, + "step": 19474 + }, + { + "epoch": 0.24345608640216004, + "grad_norm": 2.923818588256836, + "learning_rate": 1.8772232874414992e-05, + "loss": 0.8633, + "step": 19476 + }, + { + "epoch": 0.2434810870271757, + "grad_norm": 4.312501430511475, + "learning_rate": 1.877181387827001e-05, + "loss": 1.1119, + "step": 19478 + }, + { + "epoch": 0.2435060876521913, + "grad_norm": 4.49246883392334, + "learning_rate": 1.877139481532013e-05, + "loss": 0.9662, + "step": 19480 + }, + { + "epoch": 0.24353108827720693, + "grad_norm": 1.5659816265106201, + "learning_rate": 1.8770975685568547e-05, + "loss": 0.2826, + "step": 19482 + }, + { + "epoch": 0.24355608890222255, + "grad_norm": 2.992400884628296, + "learning_rate": 1.8770556489018456e-05, + "loss": 0.4113, + "step": 19484 + }, + { + "epoch": 0.24358108952723817, + "grad_norm": 2.5918045043945312, + "learning_rate": 1.877013722567305e-05, + "loss": 0.3586, + "step": 19486 + }, + { + "epoch": 0.24360609015225382, + "grad_norm": 4.377160549163818, + "learning_rate": 1.876971789553551e-05, + "loss": 1.556, + "step": 19488 + }, + { + "epoch": 0.24363109077726944, + "grad_norm": 4.600455284118652, + "learning_rate": 1.876929849860905e-05, + "loss": 1.6232, + "step": 19490 + }, + { + "epoch": 0.24365609140228506, + "grad_norm": 2.255680799484253, + "learning_rate": 1.8768879034896844e-05, + "loss": 0.3988, + "step": 19492 + }, + { + "epoch": 0.24368109202730068, + "grad_norm": 3.148642063140869, + "learning_rate": 1.87684595044021e-05, + "loss": 0.903, + "step": 19494 + }, + { + "epoch": 0.2437060926523163, + "grad_norm": 4.94312858581543, + "learning_rate": 1.8768039907128008e-05, + "loss": 0.2648, + "step": 19496 + }, + { + "epoch": 0.24373109327733194, + "grad_norm": 3.9648871421813965, + "learning_rate": 1.8767620243077762e-05, + "loss": 1.0415, + "step": 19498 + }, + { + "epoch": 0.24375609390234756, + "grad_norm": 3.374746799468994, + "learning_rate": 1.876720051225456e-05, + "loss": 1.2182, + "step": 19500 + }, + { + "epoch": 0.24378109452736318, + "grad_norm": 2.421543836593628, + "learning_rate": 1.87667807146616e-05, + "loss": 0.4946, + "step": 19502 + }, + { + "epoch": 0.2438060951523788, + "grad_norm": 4.420958518981934, + "learning_rate": 1.8766360850302075e-05, + "loss": 2.5054, + "step": 19504 + }, + { + "epoch": 0.24383109577739442, + "grad_norm": 0.03252250328660011, + "learning_rate": 1.876594091917919e-05, + "loss": 0.3484, + "step": 19506 + }, + { + "epoch": 0.24385609640241007, + "grad_norm": 6.5545549392700195, + "learning_rate": 1.8765520921296137e-05, + "loss": 1.013, + "step": 19508 + }, + { + "epoch": 0.2438810970274257, + "grad_norm": 3.3693413734436035, + "learning_rate": 1.8765100856656113e-05, + "loss": 1.4186, + "step": 19510 + }, + { + "epoch": 0.2439060976524413, + "grad_norm": 3.6623575687408447, + "learning_rate": 1.8764680725262324e-05, + "loss": 0.9315, + "step": 19512 + }, + { + "epoch": 0.24393109827745693, + "grad_norm": 2.953821897506714, + "learning_rate": 1.8764260527117964e-05, + "loss": 1.3629, + "step": 19514 + }, + { + "epoch": 0.24395609890247255, + "grad_norm": 1.5712945461273193, + "learning_rate": 1.8763840262226236e-05, + "loss": 1.1534, + "step": 19516 + }, + { + "epoch": 0.2439810995274882, + "grad_norm": 3.31305193901062, + "learning_rate": 1.876341993059034e-05, + "loss": 0.7429, + "step": 19518 + }, + { + "epoch": 0.24400610015250382, + "grad_norm": 1.8684951066970825, + "learning_rate": 1.8762999532213478e-05, + "loss": 0.7122, + "step": 19520 + }, + { + "epoch": 0.24403110077751944, + "grad_norm": 7.887758255004883, + "learning_rate": 1.8762579067098848e-05, + "loss": 1.835, + "step": 19522 + }, + { + "epoch": 0.24405610140253506, + "grad_norm": 0.012776107527315617, + "learning_rate": 1.876215853524966e-05, + "loss": 0.3449, + "step": 19524 + }, + { + "epoch": 0.24408110202755068, + "grad_norm": 4.49656343460083, + "learning_rate": 1.8761737936669106e-05, + "loss": 0.9442, + "step": 19526 + }, + { + "epoch": 0.24410610265256633, + "grad_norm": 3.586811065673828, + "learning_rate": 1.87613172713604e-05, + "loss": 0.7733, + "step": 19528 + }, + { + "epoch": 0.24413110327758195, + "grad_norm": 2.2012298107147217, + "learning_rate": 1.876089653932674e-05, + "loss": 0.9684, + "step": 19530 + }, + { + "epoch": 0.24415610390259757, + "grad_norm": 3.5057430267333984, + "learning_rate": 1.8760475740571327e-05, + "loss": 1.6418, + "step": 19532 + }, + { + "epoch": 0.24418110452761319, + "grad_norm": 3.046071767807007, + "learning_rate": 1.8760054875097372e-05, + "loss": 0.7685, + "step": 19534 + }, + { + "epoch": 0.2442061051526288, + "grad_norm": 2.1337406635284424, + "learning_rate": 1.8759633942908074e-05, + "loss": 0.3373, + "step": 19536 + }, + { + "epoch": 0.24423110577764445, + "grad_norm": 0.022006887942552567, + "learning_rate": 1.8759212944006647e-05, + "loss": 0.156, + "step": 19538 + }, + { + "epoch": 0.24425610640266007, + "grad_norm": 4.850739479064941, + "learning_rate": 1.8758791878396293e-05, + "loss": 1.1037, + "step": 19540 + }, + { + "epoch": 0.2442811070276757, + "grad_norm": 0.038277555257081985, + "learning_rate": 1.8758370746080218e-05, + "loss": 0.3351, + "step": 19542 + }, + { + "epoch": 0.2443061076526913, + "grad_norm": 1.933232307434082, + "learning_rate": 1.8757949547061626e-05, + "loss": 0.3689, + "step": 19544 + }, + { + "epoch": 0.24433110827770693, + "grad_norm": 3.8235528469085693, + "learning_rate": 1.8757528281343736e-05, + "loss": 0.5841, + "step": 19546 + }, + { + "epoch": 0.24435610890272258, + "grad_norm": 2.628296375274658, + "learning_rate": 1.8757106948929744e-05, + "loss": 0.9399, + "step": 19548 + }, + { + "epoch": 0.2443811095277382, + "grad_norm": 1.6431578397750854, + "learning_rate": 1.8756685549822867e-05, + "loss": 1.3206, + "step": 19550 + }, + { + "epoch": 0.24440611015275382, + "grad_norm": 3.5960752964019775, + "learning_rate": 1.875626408402631e-05, + "loss": 1.2252, + "step": 19552 + }, + { + "epoch": 0.24443111077776944, + "grad_norm": 4.598973751068115, + "learning_rate": 1.875584255154328e-05, + "loss": 1.6321, + "step": 19554 + }, + { + "epoch": 0.24445611140278506, + "grad_norm": 3.0874996185302734, + "learning_rate": 1.8755420952376996e-05, + "loss": 0.5971, + "step": 19556 + }, + { + "epoch": 0.2444811120278007, + "grad_norm": 2.0200612545013428, + "learning_rate": 1.8754999286530663e-05, + "loss": 1.0178, + "step": 19558 + }, + { + "epoch": 0.24450611265281633, + "grad_norm": 12.572919845581055, + "learning_rate": 1.875457755400749e-05, + "loss": 2.9098, + "step": 19560 + }, + { + "epoch": 0.24453111327783195, + "grad_norm": 6.457676410675049, + "learning_rate": 1.8754155754810698e-05, + "loss": 1.0998, + "step": 19562 + }, + { + "epoch": 0.24455611390284757, + "grad_norm": 5.226576328277588, + "learning_rate": 1.875373388894349e-05, + "loss": 0.7625, + "step": 19564 + }, + { + "epoch": 0.24458111452786319, + "grad_norm": 0.01635589450597763, + "learning_rate": 1.8753311956409088e-05, + "loss": 0.0007, + "step": 19566 + }, + { + "epoch": 0.24460611515287883, + "grad_norm": 8.021167755126953, + "learning_rate": 1.875288995721069e-05, + "loss": 1.1999, + "step": 19568 + }, + { + "epoch": 0.24463111577789445, + "grad_norm": 2.6539294719696045, + "learning_rate": 1.8752467891351527e-05, + "loss": 1.6723, + "step": 19570 + }, + { + "epoch": 0.24465611640291007, + "grad_norm": 0.01029352005571127, + "learning_rate": 1.8752045758834807e-05, + "loss": 0.8546, + "step": 19572 + }, + { + "epoch": 0.2446811170279257, + "grad_norm": 0.18314048647880554, + "learning_rate": 1.875162355966374e-05, + "loss": 0.3923, + "step": 19574 + }, + { + "epoch": 0.2447061176529413, + "grad_norm": 3.276796817779541, + "learning_rate": 1.875120129384155e-05, + "loss": 0.7833, + "step": 19576 + }, + { + "epoch": 0.24473111827795696, + "grad_norm": 0.006170277018100023, + "learning_rate": 1.8750778961371445e-05, + "loss": 0.0021, + "step": 19578 + }, + { + "epoch": 0.24475611890297258, + "grad_norm": 2.206104278564453, + "learning_rate": 1.8750356562256647e-05, + "loss": 0.7409, + "step": 19580 + }, + { + "epoch": 0.2447811195279882, + "grad_norm": 3.2497541904449463, + "learning_rate": 1.8749934096500368e-05, + "loss": 0.9937, + "step": 19582 + }, + { + "epoch": 0.24480612015300382, + "grad_norm": 1.9906479120254517, + "learning_rate": 1.874951156410583e-05, + "loss": 0.4217, + "step": 19584 + }, + { + "epoch": 0.24483112077801944, + "grad_norm": 9.000633239746094, + "learning_rate": 1.8749088965076252e-05, + "loss": 1.6713, + "step": 19586 + }, + { + "epoch": 0.2448561214030351, + "grad_norm": 1.7786672115325928, + "learning_rate": 1.8748666299414843e-05, + "loss": 0.103, + "step": 19588 + }, + { + "epoch": 0.2448811220280507, + "grad_norm": 2.8190040588378906, + "learning_rate": 1.8748243567124834e-05, + "loss": 1.4061, + "step": 19590 + }, + { + "epoch": 0.24490612265306633, + "grad_norm": 2.3827879428863525, + "learning_rate": 1.874782076820944e-05, + "loss": 0.7389, + "step": 19592 + }, + { + "epoch": 0.24493112327808195, + "grad_norm": 2.977428913116455, + "learning_rate": 1.8747397902671876e-05, + "loss": 1.0791, + "step": 19594 + }, + { + "epoch": 0.24495612390309757, + "grad_norm": 0.01179635338485241, + "learning_rate": 1.874697497051537e-05, + "loss": 0.1843, + "step": 19596 + }, + { + "epoch": 0.24498112452811321, + "grad_norm": 2.04976487159729, + "learning_rate": 1.8746551971743136e-05, + "loss": 0.5921, + "step": 19598 + }, + { + "epoch": 0.24500612515312883, + "grad_norm": 0.009129004552960396, + "learning_rate": 1.8746128906358403e-05, + "loss": 0.0005, + "step": 19600 + }, + { + "epoch": 0.24503112577814445, + "grad_norm": 6.526822566986084, + "learning_rate": 1.8745705774364388e-05, + "loss": 2.3739, + "step": 19602 + }, + { + "epoch": 0.24505612640316007, + "grad_norm": 1.7026793956756592, + "learning_rate": 1.8745282575764312e-05, + "loss": 0.3389, + "step": 19604 + }, + { + "epoch": 0.2450811270281757, + "grad_norm": 4.48002815246582, + "learning_rate": 1.8744859310561403e-05, + "loss": 0.8949, + "step": 19606 + }, + { + "epoch": 0.24510612765319134, + "grad_norm": 1.9397345781326294, + "learning_rate": 1.874443597875888e-05, + "loss": 2.2745, + "step": 19608 + }, + { + "epoch": 0.24513112827820696, + "grad_norm": 3.449280023574829, + "learning_rate": 1.874401258035997e-05, + "loss": 1.5143, + "step": 19610 + }, + { + "epoch": 0.24515612890322258, + "grad_norm": 0.01625893823802471, + "learning_rate": 1.8743589115367898e-05, + "loss": 1.0212, + "step": 19612 + }, + { + "epoch": 0.2451811295282382, + "grad_norm": 6.051583290100098, + "learning_rate": 1.8743165583785886e-05, + "loss": 0.575, + "step": 19614 + }, + { + "epoch": 0.24520613015325382, + "grad_norm": 5.05009126663208, + "learning_rate": 1.8742741985617165e-05, + "loss": 1.289, + "step": 19616 + }, + { + "epoch": 0.24523113077826947, + "grad_norm": 0.9122740030288696, + "learning_rate": 1.8742318320864955e-05, + "loss": 0.6377, + "step": 19618 + }, + { + "epoch": 0.2452561314032851, + "grad_norm": 3.0673413276672363, + "learning_rate": 1.8741894589532483e-05, + "loss": 1.7212, + "step": 19620 + }, + { + "epoch": 0.2452811320283007, + "grad_norm": 3.189038038253784, + "learning_rate": 1.874147079162298e-05, + "loss": 1.0834, + "step": 19622 + }, + { + "epoch": 0.24530613265331633, + "grad_norm": 4.704362392425537, + "learning_rate": 1.8741046927139673e-05, + "loss": 1.7054, + "step": 19624 + }, + { + "epoch": 0.24533113327833195, + "grad_norm": 3.8481040000915527, + "learning_rate": 1.8740622996085784e-05, + "loss": 0.9624, + "step": 19626 + }, + { + "epoch": 0.2453561339033476, + "grad_norm": 2.881791114807129, + "learning_rate": 1.874019899846455e-05, + "loss": 1.7929, + "step": 19628 + }, + { + "epoch": 0.24538113452836322, + "grad_norm": 1.5810681581497192, + "learning_rate": 1.8739774934279196e-05, + "loss": 0.0666, + "step": 19630 + }, + { + "epoch": 0.24540613515337883, + "grad_norm": 4.1695146560668945, + "learning_rate": 1.8739350803532952e-05, + "loss": 2.0884, + "step": 19632 + }, + { + "epoch": 0.24543113577839445, + "grad_norm": 3.6367597579956055, + "learning_rate": 1.8738926606229045e-05, + "loss": 0.86, + "step": 19634 + }, + { + "epoch": 0.24545613640341007, + "grad_norm": 2.9368743896484375, + "learning_rate": 1.8738502342370713e-05, + "loss": 0.4708, + "step": 19636 + }, + { + "epoch": 0.24548113702842572, + "grad_norm": 1.8738576173782349, + "learning_rate": 1.8738078011961182e-05, + "loss": 0.8249, + "step": 19638 + }, + { + "epoch": 0.24550613765344134, + "grad_norm": 3.4877431392669678, + "learning_rate": 1.8737653615003687e-05, + "loss": 0.9489, + "step": 19640 + }, + { + "epoch": 0.24553113827845696, + "grad_norm": 1.6288648843765259, + "learning_rate": 1.873722915150145e-05, + "loss": 0.708, + "step": 19642 + }, + { + "epoch": 0.24555613890347258, + "grad_norm": 5.285561561584473, + "learning_rate": 1.8736804621457717e-05, + "loss": 0.5865, + "step": 19644 + }, + { + "epoch": 0.2455811395284882, + "grad_norm": 3.4450385570526123, + "learning_rate": 1.8736380024875715e-05, + "loss": 1.154, + "step": 19646 + }, + { + "epoch": 0.24560614015350385, + "grad_norm": 0.025665417313575745, + "learning_rate": 1.8735955361758676e-05, + "loss": 0.1893, + "step": 19648 + }, + { + "epoch": 0.24563114077851947, + "grad_norm": 9.714760780334473, + "learning_rate": 1.873553063210984e-05, + "loss": 1.0505, + "step": 19650 + }, + { + "epoch": 0.2456561414035351, + "grad_norm": 3.2659201622009277, + "learning_rate": 1.8735105835932438e-05, + "loss": 1.5418, + "step": 19652 + }, + { + "epoch": 0.2456811420285507, + "grad_norm": 0.14399121701717377, + "learning_rate": 1.8734680973229703e-05, + "loss": 1.0742, + "step": 19654 + }, + { + "epoch": 0.24570614265356633, + "grad_norm": 5.221679210662842, + "learning_rate": 1.8734256044004874e-05, + "loss": 0.5252, + "step": 19656 + }, + { + "epoch": 0.24573114327858198, + "grad_norm": 2.758943557739258, + "learning_rate": 1.873383104826118e-05, + "loss": 0.5743, + "step": 19658 + }, + { + "epoch": 0.2457561439035976, + "grad_norm": 5.363875865936279, + "learning_rate": 1.873340598600187e-05, + "loss": 1.0904, + "step": 19660 + }, + { + "epoch": 0.24578114452861322, + "grad_norm": 3.0575661659240723, + "learning_rate": 1.8732980857230177e-05, + "loss": 0.7784, + "step": 19662 + }, + { + "epoch": 0.24580614515362884, + "grad_norm": 3.6516547203063965, + "learning_rate": 1.873255566194933e-05, + "loss": 1.3566, + "step": 19664 + }, + { + "epoch": 0.24583114577864446, + "grad_norm": 3.769369602203369, + "learning_rate": 1.873213040016258e-05, + "loss": 0.0814, + "step": 19666 + }, + { + "epoch": 0.2458561464036601, + "grad_norm": 5.166285037994385, + "learning_rate": 1.8731705071873157e-05, + "loss": 1.1113, + "step": 19668 + }, + { + "epoch": 0.24588114702867572, + "grad_norm": 1.06743586063385, + "learning_rate": 1.8731279677084305e-05, + "loss": 0.1412, + "step": 19670 + }, + { + "epoch": 0.24590614765369134, + "grad_norm": 8.198134422302246, + "learning_rate": 1.873085421579926e-05, + "loss": 0.8858, + "step": 19672 + }, + { + "epoch": 0.24593114827870696, + "grad_norm": 5.305309295654297, + "learning_rate": 1.8730428688021264e-05, + "loss": 0.1557, + "step": 19674 + }, + { + "epoch": 0.24595614890372258, + "grad_norm": 3.8951785564422607, + "learning_rate": 1.873000309375356e-05, + "loss": 0.9729, + "step": 19676 + }, + { + "epoch": 0.24598114952873823, + "grad_norm": 3.613100051879883, + "learning_rate": 1.872957743299938e-05, + "loss": 1.4839, + "step": 19678 + }, + { + "epoch": 0.24600615015375385, + "grad_norm": 0.5212199091911316, + "learning_rate": 1.872915170576198e-05, + "loss": 0.805, + "step": 19680 + }, + { + "epoch": 0.24603115077876947, + "grad_norm": 4.982018947601318, + "learning_rate": 1.8728725912044594e-05, + "loss": 0.4285, + "step": 19682 + }, + { + "epoch": 0.2460561514037851, + "grad_norm": 3.4330525398254395, + "learning_rate": 1.8728300051850463e-05, + "loss": 0.6879, + "step": 19684 + }, + { + "epoch": 0.2460811520288007, + "grad_norm": 5.427041053771973, + "learning_rate": 1.8727874125182833e-05, + "loss": 0.5642, + "step": 19686 + }, + { + "epoch": 0.24610615265381636, + "grad_norm": 2.619959831237793, + "learning_rate": 1.872744813204495e-05, + "loss": 1.0846, + "step": 19688 + }, + { + "epoch": 0.24613115327883198, + "grad_norm": 2.831080436706543, + "learning_rate": 1.872702207244006e-05, + "loss": 1.0034, + "step": 19690 + }, + { + "epoch": 0.2461561539038476, + "grad_norm": 3.7903597354888916, + "learning_rate": 1.8726595946371397e-05, + "loss": 1.0331, + "step": 19692 + }, + { + "epoch": 0.24618115452886322, + "grad_norm": 5.920024871826172, + "learning_rate": 1.8726169753842215e-05, + "loss": 1.1018, + "step": 19694 + }, + { + "epoch": 0.24620615515387884, + "grad_norm": 0.015148411504924297, + "learning_rate": 1.872574349485576e-05, + "loss": 0.362, + "step": 19696 + }, + { + "epoch": 0.24623115577889448, + "grad_norm": 3.104344129562378, + "learning_rate": 1.8725317169415275e-05, + "loss": 0.5372, + "step": 19698 + }, + { + "epoch": 0.2462561564039101, + "grad_norm": 0.8659901022911072, + "learning_rate": 1.872489077752401e-05, + "loss": 0.5096, + "step": 19700 + }, + { + "epoch": 0.24628115702892572, + "grad_norm": 2.7081780433654785, + "learning_rate": 1.8724464319185208e-05, + "loss": 0.2344, + "step": 19702 + }, + { + "epoch": 0.24630615765394134, + "grad_norm": 11.552416801452637, + "learning_rate": 1.872403779440212e-05, + "loss": 2.1941, + "step": 19704 + }, + { + "epoch": 0.24633115827895696, + "grad_norm": 2.3252007961273193, + "learning_rate": 1.8723611203177995e-05, + "loss": 1.0041, + "step": 19706 + }, + { + "epoch": 0.2463561589039726, + "grad_norm": 0.8893625736236572, + "learning_rate": 1.8723184545516083e-05, + "loss": 0.8341, + "step": 19708 + }, + { + "epoch": 0.24638115952898823, + "grad_norm": 2.4732606410980225, + "learning_rate": 1.872275782141963e-05, + "loss": 1.3713, + "step": 19710 + }, + { + "epoch": 0.24640616015400385, + "grad_norm": 0.014041617512702942, + "learning_rate": 1.8722331030891885e-05, + "loss": 0.1359, + "step": 19712 + }, + { + "epoch": 0.24643116077901947, + "grad_norm": 4.403253555297852, + "learning_rate": 1.8721904173936097e-05, + "loss": 1.2993, + "step": 19714 + }, + { + "epoch": 0.2464561614040351, + "grad_norm": 1.9286037683486938, + "learning_rate": 1.8721477250555528e-05, + "loss": 0.3192, + "step": 19716 + }, + { + "epoch": 0.24648116202905074, + "grad_norm": 0.6096284985542297, + "learning_rate": 1.8721050260753414e-05, + "loss": 0.5617, + "step": 19718 + }, + { + "epoch": 0.24650616265406636, + "grad_norm": 0.007192973978817463, + "learning_rate": 1.872062320453302e-05, + "loss": 0.1459, + "step": 19720 + }, + { + "epoch": 0.24653116327908198, + "grad_norm": 0.017846275120973587, + "learning_rate": 1.872019608189759e-05, + "loss": 0.0449, + "step": 19722 + }, + { + "epoch": 0.2465561639040976, + "grad_norm": 3.5516719818115234, + "learning_rate": 1.871976889285038e-05, + "loss": 0.8298, + "step": 19724 + }, + { + "epoch": 0.24658116452911322, + "grad_norm": 3.1075122356414795, + "learning_rate": 1.8719341637394645e-05, + "loss": 1.1147, + "step": 19726 + }, + { + "epoch": 0.24660616515412886, + "grad_norm": 2.9046905040740967, + "learning_rate": 1.8718914315533637e-05, + "loss": 1.1465, + "step": 19728 + }, + { + "epoch": 0.24663116577914448, + "grad_norm": 3.1688544750213623, + "learning_rate": 1.871848692727061e-05, + "loss": 1.1422, + "step": 19730 + }, + { + "epoch": 0.2466561664041601, + "grad_norm": 5.042324066162109, + "learning_rate": 1.871805947260882e-05, + "loss": 0.1253, + "step": 19732 + }, + { + "epoch": 0.24668116702917572, + "grad_norm": 14.655481338500977, + "learning_rate": 1.871763195155152e-05, + "loss": 1.0412, + "step": 19734 + }, + { + "epoch": 0.24670616765419134, + "grad_norm": 2.7535433769226074, + "learning_rate": 1.871720436410197e-05, + "loss": 0.4221, + "step": 19736 + }, + { + "epoch": 0.246731168279207, + "grad_norm": 0.0181865394115448, + "learning_rate": 1.8716776710263423e-05, + "loss": 0.2677, + "step": 19738 + }, + { + "epoch": 0.2467561689042226, + "grad_norm": 0.01054561510682106, + "learning_rate": 1.8716348990039136e-05, + "loss": 0.6811, + "step": 19740 + }, + { + "epoch": 0.24678116952923823, + "grad_norm": 4.471351623535156, + "learning_rate": 1.871592120343237e-05, + "loss": 1.2698, + "step": 19742 + }, + { + "epoch": 0.24680617015425385, + "grad_norm": 1.1442545652389526, + "learning_rate": 1.871549335044638e-05, + "loss": 0.02, + "step": 19744 + }, + { + "epoch": 0.24683117077926947, + "grad_norm": 2.239961862564087, + "learning_rate": 1.8715065431084426e-05, + "loss": 1.2064, + "step": 19746 + }, + { + "epoch": 0.24685617140428512, + "grad_norm": 7.314047813415527, + "learning_rate": 1.8714637445349764e-05, + "loss": 1.2473, + "step": 19748 + }, + { + "epoch": 0.24688117202930074, + "grad_norm": 4.209654331207275, + "learning_rate": 1.8714209393245653e-05, + "loss": 1.8949, + "step": 19750 + }, + { + "epoch": 0.24690617265431636, + "grad_norm": 2.2202913761138916, + "learning_rate": 1.8713781274775362e-05, + "loss": 0.714, + "step": 19752 + }, + { + "epoch": 0.24693117327933198, + "grad_norm": 1.604878306388855, + "learning_rate": 1.871335308994214e-05, + "loss": 0.1122, + "step": 19754 + }, + { + "epoch": 0.2469561739043476, + "grad_norm": 6.4182209968566895, + "learning_rate": 1.8712924838749257e-05, + "loss": 0.4097, + "step": 19756 + }, + { + "epoch": 0.24698117452936325, + "grad_norm": 8.810954093933105, + "learning_rate": 1.871249652119997e-05, + "loss": 1.3433, + "step": 19758 + }, + { + "epoch": 0.24700617515437887, + "grad_norm": 4.00775146484375, + "learning_rate": 1.8712068137297538e-05, + "loss": 1.7378, + "step": 19760 + }, + { + "epoch": 0.24703117577939449, + "grad_norm": 5.031407356262207, + "learning_rate": 1.871163968704523e-05, + "loss": 1.8765, + "step": 19762 + }, + { + "epoch": 0.2470561764044101, + "grad_norm": 0.012168710120022297, + "learning_rate": 1.8711211170446307e-05, + "loss": 0.0347, + "step": 19764 + }, + { + "epoch": 0.24708117702942572, + "grad_norm": 1.306984543800354, + "learning_rate": 1.871078258750403e-05, + "loss": 1.04, + "step": 19766 + }, + { + "epoch": 0.24710617765444137, + "grad_norm": 4.320162773132324, + "learning_rate": 1.8710353938221666e-05, + "loss": 1.5947, + "step": 19768 + }, + { + "epoch": 0.247131178279457, + "grad_norm": 7.025442123413086, + "learning_rate": 1.8709925222602475e-05, + "loss": 0.9986, + "step": 19770 + }, + { + "epoch": 0.2471561789044726, + "grad_norm": 2.74462628364563, + "learning_rate": 1.8709496440649728e-05, + "loss": 1.6896, + "step": 19772 + }, + { + "epoch": 0.24718117952948823, + "grad_norm": 6.830306529998779, + "learning_rate": 1.8709067592366684e-05, + "loss": 1.9795, + "step": 19774 + }, + { + "epoch": 0.24720618015450385, + "grad_norm": 2.6345553398132324, + "learning_rate": 1.8708638677756618e-05, + "loss": 0.2951, + "step": 19776 + }, + { + "epoch": 0.2472311807795195, + "grad_norm": 4.05303955078125, + "learning_rate": 1.8708209696822787e-05, + "loss": 0.8039, + "step": 19778 + }, + { + "epoch": 0.24725618140453512, + "grad_norm": 4.559197425842285, + "learning_rate": 1.870778064956847e-05, + "loss": 1.8467, + "step": 19780 + }, + { + "epoch": 0.24728118202955074, + "grad_norm": 10.406386375427246, + "learning_rate": 1.8707351535996915e-05, + "loss": 1.4379, + "step": 19782 + }, + { + "epoch": 0.24730618265456636, + "grad_norm": 3.119006395339966, + "learning_rate": 1.870692235611141e-05, + "loss": 0.4737, + "step": 19784 + }, + { + "epoch": 0.24733118327958198, + "grad_norm": 3.944732904434204, + "learning_rate": 1.8706493109915212e-05, + "loss": 1.0999, + "step": 19786 + }, + { + "epoch": 0.24735618390459763, + "grad_norm": 0.5459148287773132, + "learning_rate": 1.8706063797411598e-05, + "loss": 0.0504, + "step": 19788 + }, + { + "epoch": 0.24738118452961325, + "grad_norm": 3.292360305786133, + "learning_rate": 1.870563441860383e-05, + "loss": 1.8308, + "step": 19790 + }, + { + "epoch": 0.24740618515462887, + "grad_norm": 0.016395801678299904, + "learning_rate": 1.8705204973495182e-05, + "loss": 0.4579, + "step": 19792 + }, + { + "epoch": 0.24743118577964449, + "grad_norm": 0.03994498774409294, + "learning_rate": 1.8704775462088927e-05, + "loss": 0.5346, + "step": 19794 + }, + { + "epoch": 0.2474561864046601, + "grad_norm": 1.6126506328582764, + "learning_rate": 1.870434588438833e-05, + "loss": 1.2731, + "step": 19796 + }, + { + "epoch": 0.24748118702967575, + "grad_norm": 1.7765557765960693, + "learning_rate": 1.8703916240396664e-05, + "loss": 1.1568, + "step": 19798 + }, + { + "epoch": 0.24750618765469137, + "grad_norm": 3.033712387084961, + "learning_rate": 1.8703486530117206e-05, + "loss": 1.3685, + "step": 19800 + }, + { + "epoch": 0.247531188279707, + "grad_norm": 0.12717004120349884, + "learning_rate": 1.8703056753553222e-05, + "loss": 0.758, + "step": 19802 + }, + { + "epoch": 0.2475561889047226, + "grad_norm": 6.0904011726379395, + "learning_rate": 1.8702626910707992e-05, + "loss": 1.3933, + "step": 19804 + }, + { + "epoch": 0.24758118952973823, + "grad_norm": 2.039820432662964, + "learning_rate": 1.8702197001584785e-05, + "loss": 0.5406, + "step": 19806 + }, + { + "epoch": 0.24760619015475388, + "grad_norm": 0.8587817549705505, + "learning_rate": 1.8701767026186875e-05, + "loss": 0.2825, + "step": 19808 + }, + { + "epoch": 0.2476311907797695, + "grad_norm": 4.866968154907227, + "learning_rate": 1.870133698451754e-05, + "loss": 0.6608, + "step": 19810 + }, + { + "epoch": 0.24765619140478512, + "grad_norm": 1.795621395111084, + "learning_rate": 1.870090687658005e-05, + "loss": 0.5576, + "step": 19812 + }, + { + "epoch": 0.24768119202980074, + "grad_norm": 0.11429794877767563, + "learning_rate": 1.8700476702377686e-05, + "loss": 0.4186, + "step": 19814 + }, + { + "epoch": 0.24770619265481636, + "grad_norm": 2.4944324493408203, + "learning_rate": 1.870004646191372e-05, + "loss": 0.5013, + "step": 19816 + }, + { + "epoch": 0.247731193279832, + "grad_norm": 2.9469451904296875, + "learning_rate": 1.8699616155191432e-05, + "loss": 0.3926, + "step": 19818 + }, + { + "epoch": 0.24775619390484763, + "grad_norm": 2.4547622203826904, + "learning_rate": 1.8699185782214098e-05, + "loss": 0.7387, + "step": 19820 + }, + { + "epoch": 0.24778119452986325, + "grad_norm": 2.814274311065674, + "learning_rate": 1.8698755342984994e-05, + "loss": 1.0779, + "step": 19822 + }, + { + "epoch": 0.24780619515487887, + "grad_norm": 1.3520512580871582, + "learning_rate": 1.86983248375074e-05, + "loss": 0.664, + "step": 19824 + }, + { + "epoch": 0.2478311957798945, + "grad_norm": 1.782645583152771, + "learning_rate": 1.8697894265784593e-05, + "loss": 0.6045, + "step": 19826 + }, + { + "epoch": 0.24785619640491013, + "grad_norm": 4.299716949462891, + "learning_rate": 1.8697463627819855e-05, + "loss": 0.5117, + "step": 19828 + }, + { + "epoch": 0.24788119702992575, + "grad_norm": 2.7349941730499268, + "learning_rate": 1.8697032923616464e-05, + "loss": 1.1973, + "step": 19830 + }, + { + "epoch": 0.24790619765494137, + "grad_norm": 3.645221710205078, + "learning_rate": 1.8696602153177698e-05, + "loss": 1.0349, + "step": 19832 + }, + { + "epoch": 0.247931198279957, + "grad_norm": 10.915017127990723, + "learning_rate": 1.869617131650684e-05, + "loss": 0.9073, + "step": 19834 + }, + { + "epoch": 0.2479561989049726, + "grad_norm": 1.7535183429718018, + "learning_rate": 1.8695740413607175e-05, + "loss": 1.2201, + "step": 19836 + }, + { + "epoch": 0.24798119952998826, + "grad_norm": 1.3558661937713623, + "learning_rate": 1.8695309444481977e-05, + "loss": 0.0172, + "step": 19838 + }, + { + "epoch": 0.24800620015500388, + "grad_norm": 2.599350690841675, + "learning_rate": 1.869487840913453e-05, + "loss": 0.5916, + "step": 19840 + }, + { + "epoch": 0.2480312007800195, + "grad_norm": 3.735828399658203, + "learning_rate": 1.8694447307568123e-05, + "loss": 0.4793, + "step": 19842 + }, + { + "epoch": 0.24805620140503512, + "grad_norm": 4.177779674530029, + "learning_rate": 1.8694016139786035e-05, + "loss": 0.7404, + "step": 19844 + }, + { + "epoch": 0.24808120203005074, + "grad_norm": 0.020858608186244965, + "learning_rate": 1.8693584905791548e-05, + "loss": 0.8323, + "step": 19846 + }, + { + "epoch": 0.2481062026550664, + "grad_norm": 2.831648588180542, + "learning_rate": 1.8693153605587948e-05, + "loss": 0.519, + "step": 19848 + }, + { + "epoch": 0.248131203280082, + "grad_norm": 4.168927192687988, + "learning_rate": 1.8692722239178518e-05, + "loss": 0.8587, + "step": 19850 + }, + { + "epoch": 0.24815620390509763, + "grad_norm": 0.010027589276432991, + "learning_rate": 1.8692290806566548e-05, + "loss": 0.9213, + "step": 19852 + }, + { + "epoch": 0.24818120453011325, + "grad_norm": 3.3685786724090576, + "learning_rate": 1.8691859307755314e-05, + "loss": 1.1461, + "step": 19854 + }, + { + "epoch": 0.24820620515512887, + "grad_norm": 1.9490482807159424, + "learning_rate": 1.8691427742748113e-05, + "loss": 1.5325, + "step": 19856 + }, + { + "epoch": 0.24823120578014451, + "grad_norm": 0.9108299016952515, + "learning_rate": 1.869099611154823e-05, + "loss": 0.0898, + "step": 19858 + }, + { + "epoch": 0.24825620640516013, + "grad_norm": 5.639979839324951, + "learning_rate": 1.8690564414158943e-05, + "loss": 1.5327, + "step": 19860 + }, + { + "epoch": 0.24828120703017575, + "grad_norm": 2.446586847305298, + "learning_rate": 1.8690132650583554e-05, + "loss": 1.4465, + "step": 19862 + }, + { + "epoch": 0.24830620765519137, + "grad_norm": 2.944164514541626, + "learning_rate": 1.868970082082534e-05, + "loss": 0.6251, + "step": 19864 + }, + { + "epoch": 0.248331208280207, + "grad_norm": 2.0216755867004395, + "learning_rate": 1.868926892488759e-05, + "loss": 0.377, + "step": 19866 + }, + { + "epoch": 0.24835620890522264, + "grad_norm": 2.6294102668762207, + "learning_rate": 1.86888369627736e-05, + "loss": 1.275, + "step": 19868 + }, + { + "epoch": 0.24838120953023826, + "grad_norm": 0.00914309173822403, + "learning_rate": 1.8688404934486656e-05, + "loss": 0.2244, + "step": 19870 + }, + { + "epoch": 0.24840621015525388, + "grad_norm": 2.1093242168426514, + "learning_rate": 1.868797284003005e-05, + "loss": 1.1791, + "step": 19872 + }, + { + "epoch": 0.2484312107802695, + "grad_norm": 7.188520431518555, + "learning_rate": 1.868754067940707e-05, + "loss": 1.6818, + "step": 19874 + }, + { + "epoch": 0.24845621140528512, + "grad_norm": 2.5678322315216064, + "learning_rate": 1.8687108452621008e-05, + "loss": 1.0816, + "step": 19876 + }, + { + "epoch": 0.24848121203030077, + "grad_norm": 2.1222167015075684, + "learning_rate": 1.8686676159675156e-05, + "loss": 0.3053, + "step": 19878 + }, + { + "epoch": 0.2485062126553164, + "grad_norm": 9.403414726257324, + "learning_rate": 1.868624380057281e-05, + "loss": 0.2902, + "step": 19880 + }, + { + "epoch": 0.248531213280332, + "grad_norm": 5.444758892059326, + "learning_rate": 1.8685811375317256e-05, + "loss": 0.244, + "step": 19882 + }, + { + "epoch": 0.24855621390534763, + "grad_norm": 2.763827085494995, + "learning_rate": 1.8685378883911795e-05, + "loss": 1.0123, + "step": 19884 + }, + { + "epoch": 0.24858121453036325, + "grad_norm": 2.5634868144989014, + "learning_rate": 1.8684946326359715e-05, + "loss": 0.2172, + "step": 19886 + }, + { + "epoch": 0.2486062151553789, + "grad_norm": 1.50376296043396, + "learning_rate": 1.868451370266431e-05, + "loss": 0.1989, + "step": 19888 + }, + { + "epoch": 0.24863121578039452, + "grad_norm": 2.602450370788574, + "learning_rate": 1.8684081012828877e-05, + "loss": 1.039, + "step": 19890 + }, + { + "epoch": 0.24865621640541014, + "grad_norm": 3.1176507472991943, + "learning_rate": 1.8683648256856714e-05, + "loss": 0.8493, + "step": 19892 + }, + { + "epoch": 0.24868121703042576, + "grad_norm": 3.3617303371429443, + "learning_rate": 1.868321543475111e-05, + "loss": 1.3186, + "step": 19894 + }, + { + "epoch": 0.24870621765544138, + "grad_norm": 2.3994319438934326, + "learning_rate": 1.8682782546515365e-05, + "loss": 0.3531, + "step": 19896 + }, + { + "epoch": 0.24873121828045702, + "grad_norm": 2.0926308631896973, + "learning_rate": 1.868234959215278e-05, + "loss": 0.5509, + "step": 19898 + }, + { + "epoch": 0.24875621890547264, + "grad_norm": 0.6999564170837402, + "learning_rate": 1.8681916571666648e-05, + "loss": 0.9478, + "step": 19900 + }, + { + "epoch": 0.24878121953048826, + "grad_norm": 3.1269428730010986, + "learning_rate": 1.8681483485060262e-05, + "loss": 0.8429, + "step": 19902 + }, + { + "epoch": 0.24880622015550388, + "grad_norm": 9.024492263793945, + "learning_rate": 1.868105033233693e-05, + "loss": 1.1579, + "step": 19904 + }, + { + "epoch": 0.2488312207805195, + "grad_norm": 3.6289703845977783, + "learning_rate": 1.8680617113499947e-05, + "loss": 1.0566, + "step": 19906 + }, + { + "epoch": 0.24885622140553515, + "grad_norm": 4.011654853820801, + "learning_rate": 1.8680183828552607e-05, + "loss": 1.1665, + "step": 19908 + }, + { + "epoch": 0.24888122203055077, + "grad_norm": 1.9319252967834473, + "learning_rate": 1.867975047749822e-05, + "loss": 0.7506, + "step": 19910 + }, + { + "epoch": 0.2489062226555664, + "grad_norm": 0.5031473636627197, + "learning_rate": 1.867931706034008e-05, + "loss": 0.0591, + "step": 19912 + }, + { + "epoch": 0.248931223280582, + "grad_norm": 3.3246521949768066, + "learning_rate": 1.8678883577081482e-05, + "loss": 0.4259, + "step": 19914 + }, + { + "epoch": 0.24895622390559763, + "grad_norm": 1.5431288480758667, + "learning_rate": 1.8678450027725737e-05, + "loss": 0.1168, + "step": 19916 + }, + { + "epoch": 0.24898122453061328, + "grad_norm": 3.1896417140960693, + "learning_rate": 1.8678016412276147e-05, + "loss": 1.7404, + "step": 19918 + }, + { + "epoch": 0.2490062251556289, + "grad_norm": 3.780470609664917, + "learning_rate": 1.867758273073601e-05, + "loss": 0.7652, + "step": 19920 + }, + { + "epoch": 0.24903122578064452, + "grad_norm": 2.092395782470703, + "learning_rate": 1.867714898310863e-05, + "loss": 0.4149, + "step": 19922 + }, + { + "epoch": 0.24905622640566014, + "grad_norm": 0.014293440617620945, + "learning_rate": 1.8676715169397314e-05, + "loss": 1.1319, + "step": 19924 + }, + { + "epoch": 0.24908122703067576, + "grad_norm": 4.0241804122924805, + "learning_rate": 1.8676281289605356e-05, + "loss": 0.5936, + "step": 19926 + }, + { + "epoch": 0.2491062276556914, + "grad_norm": 3.2808191776275635, + "learning_rate": 1.867584734373607e-05, + "loss": 1.9726, + "step": 19928 + }, + { + "epoch": 0.24913122828070702, + "grad_norm": 2.784018039703369, + "learning_rate": 1.8675413331792756e-05, + "loss": 1.432, + "step": 19930 + }, + { + "epoch": 0.24915622890572264, + "grad_norm": 2.628650188446045, + "learning_rate": 1.867497925377872e-05, + "loss": 0.5584, + "step": 19932 + }, + { + "epoch": 0.24918122953073826, + "grad_norm": 3.357691526412964, + "learning_rate": 1.867454510969727e-05, + "loss": 0.2859, + "step": 19934 + }, + { + "epoch": 0.24920623015575388, + "grad_norm": 4.811458587646484, + "learning_rate": 1.8674110899551713e-05, + "loss": 0.4173, + "step": 19936 + }, + { + "epoch": 0.24923123078076953, + "grad_norm": 3.4534761905670166, + "learning_rate": 1.867367662334535e-05, + "loss": 1.7213, + "step": 19938 + }, + { + "epoch": 0.24925623140578515, + "grad_norm": 11.462995529174805, + "learning_rate": 1.8673242281081494e-05, + "loss": 0.8699, + "step": 19940 + }, + { + "epoch": 0.24928123203080077, + "grad_norm": 2.6831812858581543, + "learning_rate": 1.867280787276345e-05, + "loss": 1.4313, + "step": 19942 + }, + { + "epoch": 0.2493062326558164, + "grad_norm": 3.0940499305725098, + "learning_rate": 1.8672373398394534e-05, + "loss": 1.0539, + "step": 19944 + }, + { + "epoch": 0.249331233280832, + "grad_norm": 3.272800922393799, + "learning_rate": 1.8671938857978043e-05, + "loss": 0.5598, + "step": 19946 + }, + { + "epoch": 0.24935623390584766, + "grad_norm": 1.6954156160354614, + "learning_rate": 1.8671504251517292e-05, + "loss": 0.2048, + "step": 19948 + }, + { + "epoch": 0.24938123453086328, + "grad_norm": 8.698304176330566, + "learning_rate": 1.8671069579015592e-05, + "loss": 1.112, + "step": 19950 + }, + { + "epoch": 0.2494062351558789, + "grad_norm": 3.591120958328247, + "learning_rate": 1.8670634840476255e-05, + "loss": 0.4874, + "step": 19952 + }, + { + "epoch": 0.24943123578089452, + "grad_norm": 0.024003474041819572, + "learning_rate": 1.8670200035902585e-05, + "loss": 0.5428, + "step": 19954 + }, + { + "epoch": 0.24945623640591014, + "grad_norm": 4.426325798034668, + "learning_rate": 1.86697651652979e-05, + "loss": 0.68, + "step": 19956 + }, + { + "epoch": 0.24948123703092578, + "grad_norm": 0.7710449695587158, + "learning_rate": 1.866933022866551e-05, + "loss": 1.1586, + "step": 19958 + }, + { + "epoch": 0.2495062376559414, + "grad_norm": 3.0382535457611084, + "learning_rate": 1.8668895226008726e-05, + "loss": 1.4912, + "step": 19960 + }, + { + "epoch": 0.24953123828095702, + "grad_norm": 4.065207481384277, + "learning_rate": 1.8668460157330857e-05, + "loss": 0.8254, + "step": 19962 + }, + { + "epoch": 0.24955623890597264, + "grad_norm": 2.1959519386291504, + "learning_rate": 1.8668025022635227e-05, + "loss": 0.9529, + "step": 19964 + }, + { + "epoch": 0.24958123953098826, + "grad_norm": 0.0032341897021979094, + "learning_rate": 1.8667589821925146e-05, + "loss": 0.058, + "step": 19966 + }, + { + "epoch": 0.2496062401560039, + "grad_norm": 6.067203998565674, + "learning_rate": 1.8667154555203924e-05, + "loss": 0.0902, + "step": 19968 + }, + { + "epoch": 0.24963124078101953, + "grad_norm": 2.2853546142578125, + "learning_rate": 1.8666719222474875e-05, + "loss": 1.5151, + "step": 19970 + }, + { + "epoch": 0.24965624140603515, + "grad_norm": 8.066407203674316, + "learning_rate": 1.866628382374132e-05, + "loss": 1.8055, + "step": 19972 + }, + { + "epoch": 0.24968124203105077, + "grad_norm": 3.268671751022339, + "learning_rate": 1.8665848359006578e-05, + "loss": 1.3307, + "step": 19974 + }, + { + "epoch": 0.2497062426560664, + "grad_norm": 5.929646015167236, + "learning_rate": 1.8665412828273958e-05, + "loss": 0.508, + "step": 19976 + }, + { + "epoch": 0.24973124328108204, + "grad_norm": 3.5933480262756348, + "learning_rate": 1.8664977231546775e-05, + "loss": 2.2798, + "step": 19978 + }, + { + "epoch": 0.24975624390609766, + "grad_norm": 3.3100197315216064, + "learning_rate": 1.8664541568828353e-05, + "loss": 0.8167, + "step": 19980 + }, + { + "epoch": 0.24978124453111328, + "grad_norm": 0.04063272103667259, + "learning_rate": 1.866410584012201e-05, + "loss": 0.1432, + "step": 19982 + }, + { + "epoch": 0.2498062451561289, + "grad_norm": 2.9209978580474854, + "learning_rate": 1.866367004543106e-05, + "loss": 0.9264, + "step": 19984 + }, + { + "epoch": 0.24983124578114452, + "grad_norm": 0.013804898597300053, + "learning_rate": 1.8663234184758822e-05, + "loss": 0.0184, + "step": 19986 + }, + { + "epoch": 0.24985624640616017, + "grad_norm": 5.232398509979248, + "learning_rate": 1.866279825810862e-05, + "loss": 1.3684, + "step": 19988 + }, + { + "epoch": 0.24988124703117578, + "grad_norm": 0.023665498942136765, + "learning_rate": 1.8662362265483774e-05, + "loss": 0.4855, + "step": 19990 + }, + { + "epoch": 0.2499062476561914, + "grad_norm": 2.607717275619507, + "learning_rate": 1.86619262068876e-05, + "loss": 0.2052, + "step": 19992 + }, + { + "epoch": 0.24993124828120702, + "grad_norm": 6.050573348999023, + "learning_rate": 1.8661490082323417e-05, + "loss": 0.4224, + "step": 19994 + }, + { + "epoch": 0.24995624890622264, + "grad_norm": 1.7851394414901733, + "learning_rate": 1.8661053891794557e-05, + "loss": 0.4147, + "step": 19996 + }, + { + "epoch": 0.2499812495312383, + "grad_norm": 3.471363067626953, + "learning_rate": 1.866061763530433e-05, + "loss": 0.96, + "step": 19998 + }, + { + "epoch": 0.2500062501562539, + "grad_norm": 3.3804876804351807, + "learning_rate": 1.8660181312856064e-05, + "loss": 0.6064, + "step": 20000 + }, + { + "epoch": 0.25003125078126953, + "grad_norm": 3.7775940895080566, + "learning_rate": 1.8659744924453087e-05, + "loss": 2.2788, + "step": 20002 + }, + { + "epoch": 0.2500562514062852, + "grad_norm": 0.017603931948542595, + "learning_rate": 1.8659308470098713e-05, + "loss": 0.8566, + "step": 20004 + }, + { + "epoch": 0.25008125203130077, + "grad_norm": 2.697004795074463, + "learning_rate": 1.865887194979627e-05, + "loss": 0.4816, + "step": 20006 + }, + { + "epoch": 0.2501062526563164, + "grad_norm": 4.445120811462402, + "learning_rate": 1.8658435363549085e-05, + "loss": 1.7842, + "step": 20008 + }, + { + "epoch": 0.250131253281332, + "grad_norm": 0.7590680718421936, + "learning_rate": 1.865799871136048e-05, + "loss": 0.2677, + "step": 20010 + }, + { + "epoch": 0.25015625390634766, + "grad_norm": 3.221987247467041, + "learning_rate": 1.865756199323378e-05, + "loss": 0.7316, + "step": 20012 + }, + { + "epoch": 0.2501812545313633, + "grad_norm": 1.2973359823226929, + "learning_rate": 1.865712520917231e-05, + "loss": 1.1526, + "step": 20014 + }, + { + "epoch": 0.2502062551563789, + "grad_norm": 5.850167751312256, + "learning_rate": 1.8656688359179403e-05, + "loss": 2.4935, + "step": 20016 + }, + { + "epoch": 0.25023125578139455, + "grad_norm": 2.7457988262176514, + "learning_rate": 1.865625144325838e-05, + "loss": 0.2274, + "step": 20018 + }, + { + "epoch": 0.25025625640641014, + "grad_norm": 3.1914610862731934, + "learning_rate": 1.8655814461412572e-05, + "loss": 1.3447, + "step": 20020 + }, + { + "epoch": 0.2502812570314258, + "grad_norm": 0.8026686310768127, + "learning_rate": 1.8655377413645302e-05, + "loss": 0.7169, + "step": 20022 + }, + { + "epoch": 0.25030625765644143, + "grad_norm": 0.015136360190808773, + "learning_rate": 1.8654940299959902e-05, + "loss": 0.5236, + "step": 20024 + }, + { + "epoch": 0.250331258281457, + "grad_norm": 4.926590919494629, + "learning_rate": 1.8654503120359703e-05, + "loss": 1.4271, + "step": 20026 + }, + { + "epoch": 0.2503562589064727, + "grad_norm": 3.986299514770508, + "learning_rate": 1.8654065874848028e-05, + "loss": 1.9228, + "step": 20028 + }, + { + "epoch": 0.25038125953148826, + "grad_norm": 2.911694049835205, + "learning_rate": 1.8653628563428214e-05, + "loss": 0.9192, + "step": 20030 + }, + { + "epoch": 0.2504062601565039, + "grad_norm": 4.729872703552246, + "learning_rate": 1.865319118610359e-05, + "loss": 1.1009, + "step": 20032 + }, + { + "epoch": 0.25043126078151956, + "grad_norm": 3.224290609359741, + "learning_rate": 1.8652753742877483e-05, + "loss": 0.1098, + "step": 20034 + }, + { + "epoch": 0.25045626140653515, + "grad_norm": 2.685915946960449, + "learning_rate": 1.8652316233753227e-05, + "loss": 1.3039, + "step": 20036 + }, + { + "epoch": 0.2504812620315508, + "grad_norm": 4.311151027679443, + "learning_rate": 1.8651878658734155e-05, + "loss": 0.7231, + "step": 20038 + }, + { + "epoch": 0.2505062626565664, + "grad_norm": 1.4561703205108643, + "learning_rate": 1.86514410178236e-05, + "loss": 0.8896, + "step": 20040 + }, + { + "epoch": 0.25053126328158204, + "grad_norm": 3.1463301181793213, + "learning_rate": 1.865100331102489e-05, + "loss": 1.4036, + "step": 20042 + }, + { + "epoch": 0.2505562639065977, + "grad_norm": 1.9436430931091309, + "learning_rate": 1.8650565538341366e-05, + "loss": 0.9098, + "step": 20044 + }, + { + "epoch": 0.2505812645316133, + "grad_norm": 5.680058002471924, + "learning_rate": 1.8650127699776358e-05, + "loss": 2.1735, + "step": 20046 + }, + { + "epoch": 0.2506062651566289, + "grad_norm": 1.2507199048995972, + "learning_rate": 1.86496897953332e-05, + "loss": 0.0553, + "step": 20048 + }, + { + "epoch": 0.2506312657816445, + "grad_norm": 3.493171215057373, + "learning_rate": 1.8649251825015224e-05, + "loss": 1.4251, + "step": 20050 + }, + { + "epoch": 0.25065626640666017, + "grad_norm": 1.9559314250946045, + "learning_rate": 1.8648813788825774e-05, + "loss": 0.904, + "step": 20052 + }, + { + "epoch": 0.2506812670316758, + "grad_norm": 0.019699566066265106, + "learning_rate": 1.864837568676818e-05, + "loss": 0.0015, + "step": 20054 + }, + { + "epoch": 0.2507062676566914, + "grad_norm": 2.9276578426361084, + "learning_rate": 1.864793751884578e-05, + "loss": 0.9235, + "step": 20056 + }, + { + "epoch": 0.25073126828170705, + "grad_norm": 4.156733512878418, + "learning_rate": 1.864749928506191e-05, + "loss": 1.5878, + "step": 20058 + }, + { + "epoch": 0.25075626890672265, + "grad_norm": 0.015080154873430729, + "learning_rate": 1.8647060985419906e-05, + "loss": 0.5976, + "step": 20060 + }, + { + "epoch": 0.2507812695317383, + "grad_norm": 0.018377557396888733, + "learning_rate": 1.864662261992311e-05, + "loss": 0.0136, + "step": 20062 + }, + { + "epoch": 0.25080627015675394, + "grad_norm": 2.806320905685425, + "learning_rate": 1.864618418857486e-05, + "loss": 1.2304, + "step": 20064 + }, + { + "epoch": 0.25083127078176953, + "grad_norm": 5.579169273376465, + "learning_rate": 1.8645745691378493e-05, + "loss": 1.0679, + "step": 20066 + }, + { + "epoch": 0.2508562714067852, + "grad_norm": 27.170799255371094, + "learning_rate": 1.864530712833735e-05, + "loss": 0.9392, + "step": 20068 + }, + { + "epoch": 0.2508812720318008, + "grad_norm": 4.046467304229736, + "learning_rate": 1.864486849945477e-05, + "loss": 0.6682, + "step": 20070 + }, + { + "epoch": 0.2509062726568164, + "grad_norm": 1.7089930772781372, + "learning_rate": 1.8644429804734097e-05, + "loss": 0.3648, + "step": 20072 + }, + { + "epoch": 0.25093127328183207, + "grad_norm": 0.8928830623626709, + "learning_rate": 1.8643991044178665e-05, + "loss": 0.5145, + "step": 20074 + }, + { + "epoch": 0.25095627390684766, + "grad_norm": 5.057262420654297, + "learning_rate": 1.864355221779182e-05, + "loss": 1.4624, + "step": 20076 + }, + { + "epoch": 0.2509812745318633, + "grad_norm": 2.7583401203155518, + "learning_rate": 1.8643113325576907e-05, + "loss": 0.711, + "step": 20078 + }, + { + "epoch": 0.2510062751568789, + "grad_norm": 1.9545470476150513, + "learning_rate": 1.864267436753726e-05, + "loss": 0.1065, + "step": 20080 + }, + { + "epoch": 0.25103127578189455, + "grad_norm": 3.3197154998779297, + "learning_rate": 1.864223534367623e-05, + "loss": 1.0643, + "step": 20082 + }, + { + "epoch": 0.2510562764069102, + "grad_norm": 6.58437442779541, + "learning_rate": 1.864179625399716e-05, + "loss": 1.2845, + "step": 20084 + }, + { + "epoch": 0.2510812770319258, + "grad_norm": 2.780294418334961, + "learning_rate": 1.864135709850339e-05, + "loss": 0.813, + "step": 20086 + }, + { + "epoch": 0.25110627765694143, + "grad_norm": 2.5597083568573, + "learning_rate": 1.8640917877198267e-05, + "loss": 0.9513, + "step": 20088 + }, + { + "epoch": 0.251131278281957, + "grad_norm": 5.420149326324463, + "learning_rate": 1.8640478590085132e-05, + "loss": 2.5618, + "step": 20090 + }, + { + "epoch": 0.2511562789069727, + "grad_norm": 0.5891119241714478, + "learning_rate": 1.8640039237167336e-05, + "loss": 0.2353, + "step": 20092 + }, + { + "epoch": 0.2511812795319883, + "grad_norm": 3.27528977394104, + "learning_rate": 1.8639599818448225e-05, + "loss": 0.219, + "step": 20094 + }, + { + "epoch": 0.2512062801570039, + "grad_norm": 3.684750556945801, + "learning_rate": 1.863916033393114e-05, + "loss": 0.3285, + "step": 20096 + }, + { + "epoch": 0.25123128078201956, + "grad_norm": 13.214899063110352, + "learning_rate": 1.8638720783619432e-05, + "loss": 1.8053, + "step": 20098 + }, + { + "epoch": 0.25125628140703515, + "grad_norm": 3.413447856903076, + "learning_rate": 1.863828116751645e-05, + "loss": 1.9461, + "step": 20100 + }, + { + "epoch": 0.2512812820320508, + "grad_norm": 3.461688995361328, + "learning_rate": 1.863784148562554e-05, + "loss": 1.5023, + "step": 20102 + }, + { + "epoch": 0.25130628265706645, + "grad_norm": 5.559988975524902, + "learning_rate": 1.863740173795005e-05, + "loss": 0.3577, + "step": 20104 + }, + { + "epoch": 0.25133128328208204, + "grad_norm": 5.411302089691162, + "learning_rate": 1.8636961924493332e-05, + "loss": 1.9644, + "step": 20106 + }, + { + "epoch": 0.2513562839070977, + "grad_norm": 4.109818935394287, + "learning_rate": 1.863652204525873e-05, + "loss": 1.625, + "step": 20108 + }, + { + "epoch": 0.2513812845321133, + "grad_norm": 0.6319835782051086, + "learning_rate": 1.86360821002496e-05, + "loss": 0.925, + "step": 20110 + }, + { + "epoch": 0.25140628515712893, + "grad_norm": 1.3537335395812988, + "learning_rate": 1.8635642089469285e-05, + "loss": 0.4337, + "step": 20112 + }, + { + "epoch": 0.2514312857821446, + "grad_norm": 4.15239143371582, + "learning_rate": 1.8635202012921145e-05, + "loss": 1.1666, + "step": 20114 + }, + { + "epoch": 0.25145628640716017, + "grad_norm": 0.6536705493927002, + "learning_rate": 1.863476187060853e-05, + "loss": 0.7737, + "step": 20116 + }, + { + "epoch": 0.2514812870321758, + "grad_norm": 3.536614418029785, + "learning_rate": 1.8634321662534788e-05, + "loss": 0.9706, + "step": 20118 + }, + { + "epoch": 0.2515062876571914, + "grad_norm": 3.8989362716674805, + "learning_rate": 1.863388138870327e-05, + "loss": 0.4136, + "step": 20120 + }, + { + "epoch": 0.25153128828220706, + "grad_norm": 1.6972979307174683, + "learning_rate": 1.8633441049117332e-05, + "loss": 0.5088, + "step": 20122 + }, + { + "epoch": 0.2515562889072227, + "grad_norm": 2.5540435314178467, + "learning_rate": 1.8633000643780333e-05, + "loss": 0.7818, + "step": 20124 + }, + { + "epoch": 0.2515812895322383, + "grad_norm": 6.834232807159424, + "learning_rate": 1.863256017269562e-05, + "loss": 0.7112, + "step": 20126 + }, + { + "epoch": 0.25160629015725394, + "grad_norm": 4.787838459014893, + "learning_rate": 1.863211963586655e-05, + "loss": 1.6451, + "step": 20128 + }, + { + "epoch": 0.25163129078226953, + "grad_norm": 6.306681156158447, + "learning_rate": 1.8631679033296475e-05, + "loss": 0.7461, + "step": 20130 + }, + { + "epoch": 0.2516562914072852, + "grad_norm": 3.570866107940674, + "learning_rate": 1.8631238364988756e-05, + "loss": 1.4326, + "step": 20132 + }, + { + "epoch": 0.25168129203230083, + "grad_norm": 4.774158477783203, + "learning_rate": 1.863079763094674e-05, + "loss": 1.9284, + "step": 20134 + }, + { + "epoch": 0.2517062926573164, + "grad_norm": 1.2776436805725098, + "learning_rate": 1.8630356831173797e-05, + "loss": 0.5508, + "step": 20136 + }, + { + "epoch": 0.25173129328233207, + "grad_norm": 5.118372917175293, + "learning_rate": 1.8629915965673275e-05, + "loss": 0.5914, + "step": 20138 + }, + { + "epoch": 0.25175629390734766, + "grad_norm": 2.135253429412842, + "learning_rate": 1.8629475034448532e-05, + "loss": 0.5644, + "step": 20140 + }, + { + "epoch": 0.2517812945323633, + "grad_norm": 1.7728936672210693, + "learning_rate": 1.862903403750293e-05, + "loss": 1.5042, + "step": 20142 + }, + { + "epoch": 0.25180629515737896, + "grad_norm": 2.001462459564209, + "learning_rate": 1.8628592974839822e-05, + "loss": 0.1944, + "step": 20144 + }, + { + "epoch": 0.25183129578239455, + "grad_norm": 0.18199655413627625, + "learning_rate": 1.862815184646257e-05, + "loss": 0.7696, + "step": 20146 + }, + { + "epoch": 0.2518562964074102, + "grad_norm": 0.2675926685333252, + "learning_rate": 1.862771065237454e-05, + "loss": 0.7431, + "step": 20148 + }, + { + "epoch": 0.2518812970324258, + "grad_norm": 2.6443240642547607, + "learning_rate": 1.862726939257908e-05, + "loss": 1.0229, + "step": 20150 + }, + { + "epoch": 0.25190629765744144, + "grad_norm": 1.6576085090637207, + "learning_rate": 1.8626828067079554e-05, + "loss": 0.7844, + "step": 20152 + }, + { + "epoch": 0.2519312982824571, + "grad_norm": 3.3799145221710205, + "learning_rate": 1.8626386675879325e-05, + "loss": 0.6934, + "step": 20154 + }, + { + "epoch": 0.2519562989074727, + "grad_norm": 2.8827898502349854, + "learning_rate": 1.862594521898176e-05, + "loss": 2.5855, + "step": 20156 + }, + { + "epoch": 0.2519812995324883, + "grad_norm": 11.890602111816406, + "learning_rate": 1.8625503696390216e-05, + "loss": 0.9464, + "step": 20158 + }, + { + "epoch": 0.2520063001575039, + "grad_norm": 3.292389154434204, + "learning_rate": 1.8625062108108053e-05, + "loss": 0.3437, + "step": 20160 + }, + { + "epoch": 0.25203130078251956, + "grad_norm": 1.8425183296203613, + "learning_rate": 1.862462045413864e-05, + "loss": 1.1889, + "step": 20162 + }, + { + "epoch": 0.2520563014075352, + "grad_norm": 3.616424798965454, + "learning_rate": 1.8624178734485335e-05, + "loss": 1.8226, + "step": 20164 + }, + { + "epoch": 0.2520813020325508, + "grad_norm": 3.175724983215332, + "learning_rate": 1.86237369491515e-05, + "loss": 2.2577, + "step": 20166 + }, + { + "epoch": 0.25210630265756645, + "grad_norm": 3.460066795349121, + "learning_rate": 1.862329509814051e-05, + "loss": 2.3832, + "step": 20168 + }, + { + "epoch": 0.25213130328258204, + "grad_norm": 2.6596157550811768, + "learning_rate": 1.8622853181455724e-05, + "loss": 1.4829, + "step": 20170 + }, + { + "epoch": 0.2521563039075977, + "grad_norm": 0.8428264260292053, + "learning_rate": 1.8622411199100505e-05, + "loss": 0.377, + "step": 20172 + }, + { + "epoch": 0.25218130453261334, + "grad_norm": 3.7314956188201904, + "learning_rate": 1.862196915107822e-05, + "loss": 0.7522, + "step": 20174 + }, + { + "epoch": 0.25220630515762893, + "grad_norm": 2.699352741241455, + "learning_rate": 1.862152703739224e-05, + "loss": 1.1734, + "step": 20176 + }, + { + "epoch": 0.2522313057826446, + "grad_norm": 3.3051843643188477, + "learning_rate": 1.862108485804593e-05, + "loss": 1.3306, + "step": 20178 + }, + { + "epoch": 0.25225630640766017, + "grad_norm": 0.42168644070625305, + "learning_rate": 1.8620642613042658e-05, + "loss": 0.0387, + "step": 20180 + }, + { + "epoch": 0.2522813070326758, + "grad_norm": 2.3300251960754395, + "learning_rate": 1.8620200302385787e-05, + "loss": 1.0562, + "step": 20182 + }, + { + "epoch": 0.25230630765769146, + "grad_norm": 0.15116320550441742, + "learning_rate": 1.861975792607869e-05, + "loss": 0.5399, + "step": 20184 + }, + { + "epoch": 0.25233130828270706, + "grad_norm": 4.110387325286865, + "learning_rate": 1.8619315484124735e-05, + "loss": 2.1532, + "step": 20186 + }, + { + "epoch": 0.2523563089077227, + "grad_norm": 0.17895729839801788, + "learning_rate": 1.8618872976527296e-05, + "loss": 0.6338, + "step": 20188 + }, + { + "epoch": 0.2523813095327383, + "grad_norm": 4.827414035797119, + "learning_rate": 1.8618430403289737e-05, + "loss": 1.0965, + "step": 20190 + }, + { + "epoch": 0.25240631015775394, + "grad_norm": 3.7700271606445312, + "learning_rate": 1.8617987764415432e-05, + "loss": 1.7248, + "step": 20192 + }, + { + "epoch": 0.2524313107827696, + "grad_norm": 5.964962005615234, + "learning_rate": 1.8617545059907746e-05, + "loss": 0.3165, + "step": 20194 + }, + { + "epoch": 0.2524563114077852, + "grad_norm": 3.720363140106201, + "learning_rate": 1.861710228977006e-05, + "loss": 0.5801, + "step": 20196 + }, + { + "epoch": 0.25248131203280083, + "grad_norm": 2.405888319015503, + "learning_rate": 1.861665945400574e-05, + "loss": 0.8369, + "step": 20198 + }, + { + "epoch": 0.2525063126578164, + "grad_norm": 2.661973476409912, + "learning_rate": 1.861621655261816e-05, + "loss": 1.1859, + "step": 20200 + }, + { + "epoch": 0.25253131328283207, + "grad_norm": 1.9210542440414429, + "learning_rate": 1.8615773585610694e-05, + "loss": 0.2982, + "step": 20202 + }, + { + "epoch": 0.2525563139078477, + "grad_norm": 5.016319274902344, + "learning_rate": 1.8615330552986716e-05, + "loss": 1.5344, + "step": 20204 + }, + { + "epoch": 0.2525813145328633, + "grad_norm": 1.1248067617416382, + "learning_rate": 1.86148874547496e-05, + "loss": 0.5742, + "step": 20206 + }, + { + "epoch": 0.25260631515787896, + "grad_norm": 5.5933027267456055, + "learning_rate": 1.8614444290902715e-05, + "loss": 2.5967, + "step": 20208 + }, + { + "epoch": 0.25263131578289455, + "grad_norm": 4.856967926025391, + "learning_rate": 1.8614001061449443e-05, + "loss": 0.9032, + "step": 20210 + }, + { + "epoch": 0.2526563164079102, + "grad_norm": 0.0623546838760376, + "learning_rate": 1.8613557766393154e-05, + "loss": 0.3984, + "step": 20212 + }, + { + "epoch": 0.25268131703292585, + "grad_norm": 3.227098226547241, + "learning_rate": 1.8613114405737233e-05, + "loss": 1.4959, + "step": 20214 + }, + { + "epoch": 0.25270631765794144, + "grad_norm": 2.7548446655273438, + "learning_rate": 1.8612670979485045e-05, + "loss": 1.2096, + "step": 20216 + }, + { + "epoch": 0.2527313182829571, + "grad_norm": 2.593677282333374, + "learning_rate": 1.8612227487639976e-05, + "loss": 1.3773, + "step": 20218 + }, + { + "epoch": 0.2527563189079727, + "grad_norm": 0.7781035304069519, + "learning_rate": 1.86117839302054e-05, + "loss": 0.6081, + "step": 20220 + }, + { + "epoch": 0.2527813195329883, + "grad_norm": 3.3921754360198975, + "learning_rate": 1.8611340307184693e-05, + "loss": 1.7191, + "step": 20222 + }, + { + "epoch": 0.25280632015800397, + "grad_norm": 4.448179244995117, + "learning_rate": 1.8610896618581237e-05, + "loss": 1.3144, + "step": 20224 + }, + { + "epoch": 0.25283132078301956, + "grad_norm": 0.0671488493680954, + "learning_rate": 1.8610452864398413e-05, + "loss": 0.7458, + "step": 20226 + }, + { + "epoch": 0.2528563214080352, + "grad_norm": 2.733868360519409, + "learning_rate": 1.8610009044639593e-05, + "loss": 0.3742, + "step": 20228 + }, + { + "epoch": 0.2528813220330508, + "grad_norm": 3.2900235652923584, + "learning_rate": 1.8609565159308164e-05, + "loss": 1.734, + "step": 20230 + }, + { + "epoch": 0.25290632265806645, + "grad_norm": 4.87379264831543, + "learning_rate": 1.8609121208407505e-05, + "loss": 0.8384, + "step": 20232 + }, + { + "epoch": 0.2529313232830821, + "grad_norm": 2.161653757095337, + "learning_rate": 1.8608677191940997e-05, + "loss": 0.3633, + "step": 20234 + }, + { + "epoch": 0.2529563239080977, + "grad_norm": 4.995064735412598, + "learning_rate": 1.8608233109912018e-05, + "loss": 1.8693, + "step": 20236 + }, + { + "epoch": 0.25298132453311334, + "grad_norm": 4.31893253326416, + "learning_rate": 1.8607788962323952e-05, + "loss": 0.9605, + "step": 20238 + }, + { + "epoch": 0.25300632515812893, + "grad_norm": 9.959479331970215, + "learning_rate": 1.860734474918019e-05, + "loss": 1.2416, + "step": 20240 + }, + { + "epoch": 0.2530313257831446, + "grad_norm": 3.38415789604187, + "learning_rate": 1.86069004704841e-05, + "loss": 1.4738, + "step": 20242 + }, + { + "epoch": 0.2530563264081602, + "grad_norm": 3.61659836769104, + "learning_rate": 1.8606456126239075e-05, + "loss": 0.9772, + "step": 20244 + }, + { + "epoch": 0.2530813270331758, + "grad_norm": 11.897416114807129, + "learning_rate": 1.86060117164485e-05, + "loss": 1.7529, + "step": 20246 + }, + { + "epoch": 0.25310632765819147, + "grad_norm": 2.8407046794891357, + "learning_rate": 1.8605567241115755e-05, + "loss": 1.2433, + "step": 20248 + }, + { + "epoch": 0.25313132828320706, + "grad_norm": 0.03551461920142174, + "learning_rate": 1.8605122700244224e-05, + "loss": 0.0012, + "step": 20250 + }, + { + "epoch": 0.2531563289082227, + "grad_norm": 0.9959635138511658, + "learning_rate": 1.86046780938373e-05, + "loss": 0.5415, + "step": 20252 + }, + { + "epoch": 0.25318132953323835, + "grad_norm": 4.976224422454834, + "learning_rate": 1.860423342189836e-05, + "loss": 1.4666, + "step": 20254 + }, + { + "epoch": 0.25320633015825394, + "grad_norm": 4.008146286010742, + "learning_rate": 1.86037886844308e-05, + "loss": 0.3846, + "step": 20256 + }, + { + "epoch": 0.2532313307832696, + "grad_norm": 3.3768744468688965, + "learning_rate": 1.8603343881438e-05, + "loss": 0.9594, + "step": 20258 + }, + { + "epoch": 0.2532563314082852, + "grad_norm": 4.007598400115967, + "learning_rate": 1.860289901292335e-05, + "loss": 0.9282, + "step": 20260 + }, + { + "epoch": 0.25328133203330083, + "grad_norm": 2.3658487796783447, + "learning_rate": 1.8602454078890234e-05, + "loss": 0.9435, + "step": 20262 + }, + { + "epoch": 0.2533063326583165, + "grad_norm": 3.0467658042907715, + "learning_rate": 1.8602009079342048e-05, + "loss": 1.0437, + "step": 20264 + }, + { + "epoch": 0.25333133328333207, + "grad_norm": 2.4972028732299805, + "learning_rate": 1.8601564014282175e-05, + "loss": 1.1428, + "step": 20266 + }, + { + "epoch": 0.2533563339083477, + "grad_norm": 1.0738922357559204, + "learning_rate": 1.860111888371401e-05, + "loss": 1.4235, + "step": 20268 + }, + { + "epoch": 0.2533813345333633, + "grad_norm": 0.07174059748649597, + "learning_rate": 1.8600673687640936e-05, + "loss": 0.3264, + "step": 20270 + }, + { + "epoch": 0.25340633515837896, + "grad_norm": 3.5693626403808594, + "learning_rate": 1.860022842606635e-05, + "loss": 1.5425, + "step": 20272 + }, + { + "epoch": 0.2534313357833946, + "grad_norm": 13.18042278289795, + "learning_rate": 1.859978309899364e-05, + "loss": 0.5291, + "step": 20274 + }, + { + "epoch": 0.2534563364084102, + "grad_norm": 5.180391311645508, + "learning_rate": 1.8599337706426197e-05, + "loss": 1.5316, + "step": 20276 + }, + { + "epoch": 0.25348133703342585, + "grad_norm": 0.028633125126361847, + "learning_rate": 1.8598892248367415e-05, + "loss": 0.9449, + "step": 20278 + }, + { + "epoch": 0.25350633765844144, + "grad_norm": 3.64935302734375, + "learning_rate": 1.8598446724820686e-05, + "loss": 0.763, + "step": 20280 + }, + { + "epoch": 0.2535313382834571, + "grad_norm": 2.804356575012207, + "learning_rate": 1.85980011357894e-05, + "loss": 1.0173, + "step": 20282 + }, + { + "epoch": 0.25355633890847273, + "grad_norm": 0.5070845484733582, + "learning_rate": 1.8597555481276958e-05, + "loss": 0.3903, + "step": 20284 + }, + { + "epoch": 0.2535813395334883, + "grad_norm": 0.9126644134521484, + "learning_rate": 1.8597109761286748e-05, + "loss": 0.5024, + "step": 20286 + }, + { + "epoch": 0.253606340158504, + "grad_norm": 5.799901008605957, + "learning_rate": 1.8596663975822162e-05, + "loss": 0.8556, + "step": 20288 + }, + { + "epoch": 0.25363134078351957, + "grad_norm": 3.3677804470062256, + "learning_rate": 1.85962181248866e-05, + "loss": 0.7921, + "step": 20290 + }, + { + "epoch": 0.2536563414085352, + "grad_norm": 0.43048080801963806, + "learning_rate": 1.859577220848346e-05, + "loss": 0.8258, + "step": 20292 + }, + { + "epoch": 0.25368134203355086, + "grad_norm": 3.434462547302246, + "learning_rate": 1.8595326226616132e-05, + "loss": 0.6195, + "step": 20294 + }, + { + "epoch": 0.25370634265856645, + "grad_norm": 3.4539413452148438, + "learning_rate": 1.8594880179288014e-05, + "loss": 1.1179, + "step": 20296 + }, + { + "epoch": 0.2537313432835821, + "grad_norm": 8.264786720275879, + "learning_rate": 1.8594434066502507e-05, + "loss": 1.1467, + "step": 20298 + }, + { + "epoch": 0.2537563439085977, + "grad_norm": 2.5998754501342773, + "learning_rate": 1.8593987888263004e-05, + "loss": 0.7148, + "step": 20300 + }, + { + "epoch": 0.25378134453361334, + "grad_norm": 0.7794394493103027, + "learning_rate": 1.8593541644572902e-05, + "loss": 0.029, + "step": 20302 + }, + { + "epoch": 0.253806345158629, + "grad_norm": 2.0927212238311768, + "learning_rate": 1.8593095335435607e-05, + "loss": 1.8453, + "step": 20304 + }, + { + "epoch": 0.2538313457836446, + "grad_norm": 10.268948554992676, + "learning_rate": 1.859264896085451e-05, + "loss": 1.173, + "step": 20306 + }, + { + "epoch": 0.2538563464086602, + "grad_norm": 0.9183616638183594, + "learning_rate": 1.8592202520833014e-05, + "loss": 0.6567, + "step": 20308 + }, + { + "epoch": 0.2538813470336758, + "grad_norm": 3.117448091506958, + "learning_rate": 1.859175601537452e-05, + "loss": 0.5368, + "step": 20310 + }, + { + "epoch": 0.25390634765869147, + "grad_norm": 3.1257781982421875, + "learning_rate": 1.8591309444482424e-05, + "loss": 1.6497, + "step": 20312 + }, + { + "epoch": 0.2539313482837071, + "grad_norm": 3.10964035987854, + "learning_rate": 1.8590862808160133e-05, + "loss": 1.0589, + "step": 20314 + }, + { + "epoch": 0.2539563489087227, + "grad_norm": 2.7694265842437744, + "learning_rate": 1.8590416106411043e-05, + "loss": 0.5802, + "step": 20316 + }, + { + "epoch": 0.25398134953373835, + "grad_norm": 0.022887781262397766, + "learning_rate": 1.858996933923856e-05, + "loss": 0.9305, + "step": 20318 + }, + { + "epoch": 0.25400635015875395, + "grad_norm": 0.0216422900557518, + "learning_rate": 1.858952250664609e-05, + "loss": 0.451, + "step": 20320 + }, + { + "epoch": 0.2540313507837696, + "grad_norm": 1.7516067028045654, + "learning_rate": 1.8589075608637025e-05, + "loss": 1.2771, + "step": 20322 + }, + { + "epoch": 0.25405635140878524, + "grad_norm": 2.1143219470977783, + "learning_rate": 1.8588628645214774e-05, + "loss": 0.4365, + "step": 20324 + }, + { + "epoch": 0.25408135203380083, + "grad_norm": 2.9156360626220703, + "learning_rate": 1.8588181616382746e-05, + "loss": 1.0107, + "step": 20326 + }, + { + "epoch": 0.2541063526588165, + "grad_norm": 2.865119457244873, + "learning_rate": 1.8587734522144337e-05, + "loss": 1.468, + "step": 20328 + }, + { + "epoch": 0.2541313532838321, + "grad_norm": 3.556931972503662, + "learning_rate": 1.858728736250296e-05, + "loss": 0.7281, + "step": 20330 + }, + { + "epoch": 0.2541563539088477, + "grad_norm": 0.22604110836982727, + "learning_rate": 1.8586840137462018e-05, + "loss": 0.0707, + "step": 20332 + }, + { + "epoch": 0.25418135453386337, + "grad_norm": 0.3558037579059601, + "learning_rate": 1.858639284702491e-05, + "loss": 1.0566, + "step": 20334 + }, + { + "epoch": 0.25420635515887896, + "grad_norm": 0.027364853769540787, + "learning_rate": 1.8585945491195057e-05, + "loss": 0.8829, + "step": 20336 + }, + { + "epoch": 0.2542313557838946, + "grad_norm": 2.055081367492676, + "learning_rate": 1.858549806997585e-05, + "loss": 0.3096, + "step": 20338 + }, + { + "epoch": 0.2542563564089102, + "grad_norm": 0.6327925324440002, + "learning_rate": 1.8585050583370706e-05, + "loss": 0.0298, + "step": 20340 + }, + { + "epoch": 0.25428135703392585, + "grad_norm": 4.385743618011475, + "learning_rate": 1.858460303138303e-05, + "loss": 1.1948, + "step": 20342 + }, + { + "epoch": 0.2543063576589415, + "grad_norm": 0.014623419381678104, + "learning_rate": 1.858415541401623e-05, + "loss": 0.1629, + "step": 20344 + }, + { + "epoch": 0.2543313582839571, + "grad_norm": 3.4861137866973877, + "learning_rate": 1.858370773127372e-05, + "loss": 1.0486, + "step": 20346 + }, + { + "epoch": 0.25435635890897274, + "grad_norm": 1.1131614446640015, + "learning_rate": 1.8583259983158904e-05, + "loss": 0.9222, + "step": 20348 + }, + { + "epoch": 0.2543813595339883, + "grad_norm": 2.530749797821045, + "learning_rate": 1.8582812169675194e-05, + "loss": 0.5409, + "step": 20350 + }, + { + "epoch": 0.254406360159004, + "grad_norm": 2.2649195194244385, + "learning_rate": 1.8582364290826e-05, + "loss": 1.2494, + "step": 20352 + }, + { + "epoch": 0.2544313607840196, + "grad_norm": 1.0715807676315308, + "learning_rate": 1.8581916346614735e-05, + "loss": 0.7148, + "step": 20354 + }, + { + "epoch": 0.2544563614090352, + "grad_norm": 3.579028606414795, + "learning_rate": 1.858146833704481e-05, + "loss": 1.6535, + "step": 20356 + }, + { + "epoch": 0.25448136203405086, + "grad_norm": 1.9722906351089478, + "learning_rate": 1.858102026211963e-05, + "loss": 0.7084, + "step": 20358 + }, + { + "epoch": 0.25450636265906645, + "grad_norm": 5.541613578796387, + "learning_rate": 1.8580572121842616e-05, + "loss": 1.514, + "step": 20360 + }, + { + "epoch": 0.2545313632840821, + "grad_norm": 2.6853208541870117, + "learning_rate": 1.858012391621718e-05, + "loss": 0.7701, + "step": 20362 + }, + { + "epoch": 0.25455636390909775, + "grad_norm": 5.3000807762146, + "learning_rate": 1.8579675645246727e-05, + "loss": 1.041, + "step": 20364 + }, + { + "epoch": 0.25458136453411334, + "grad_norm": 2.691540479660034, + "learning_rate": 1.8579227308934685e-05, + "loss": 0.3626, + "step": 20366 + }, + { + "epoch": 0.254606365159129, + "grad_norm": 0.16912764310836792, + "learning_rate": 1.857877890728446e-05, + "loss": 0.007, + "step": 20368 + }, + { + "epoch": 0.2546313657841446, + "grad_norm": 4.9110846519470215, + "learning_rate": 1.8578330440299463e-05, + "loss": 2.0741, + "step": 20370 + }, + { + "epoch": 0.25465636640916023, + "grad_norm": 1.425724744796753, + "learning_rate": 1.8577881907983116e-05, + "loss": 0.22, + "step": 20372 + }, + { + "epoch": 0.2546813670341759, + "grad_norm": 12.534245491027832, + "learning_rate": 1.8577433310338838e-05, + "loss": 1.5568, + "step": 20374 + }, + { + "epoch": 0.25470636765919147, + "grad_norm": 3.409733533859253, + "learning_rate": 1.8576984647370037e-05, + "loss": 0.1451, + "step": 20376 + }, + { + "epoch": 0.2547313682842071, + "grad_norm": 4.438136100769043, + "learning_rate": 1.8576535919080133e-05, + "loss": 0.8735, + "step": 20378 + }, + { + "epoch": 0.2547563689092227, + "grad_norm": 2.6100902557373047, + "learning_rate": 1.857608712547255e-05, + "loss": 0.8592, + "step": 20380 + }, + { + "epoch": 0.25478136953423836, + "grad_norm": 3.327516794204712, + "learning_rate": 1.8575638266550694e-05, + "loss": 0.6352, + "step": 20382 + }, + { + "epoch": 0.254806370159254, + "grad_norm": 2.7628791332244873, + "learning_rate": 1.8575189342317992e-05, + "loss": 0.4322, + "step": 20384 + }, + { + "epoch": 0.2548313707842696, + "grad_norm": 1.017306923866272, + "learning_rate": 1.8574740352777862e-05, + "loss": 0.486, + "step": 20386 + }, + { + "epoch": 0.25485637140928524, + "grad_norm": 7.617847442626953, + "learning_rate": 1.857429129793372e-05, + "loss": 1.1265, + "step": 20388 + }, + { + "epoch": 0.25488137203430083, + "grad_norm": 0.006748138461261988, + "learning_rate": 1.8573842177788987e-05, + "loss": 0.1817, + "step": 20390 + }, + { + "epoch": 0.2549063726593165, + "grad_norm": 3.1239705085754395, + "learning_rate": 1.857339299234709e-05, + "loss": 1.0842, + "step": 20392 + }, + { + "epoch": 0.25493137328433213, + "grad_norm": 3.874222755432129, + "learning_rate": 1.8572943741611437e-05, + "loss": 0.7622, + "step": 20394 + }, + { + "epoch": 0.2549563739093477, + "grad_norm": 3.5038270950317383, + "learning_rate": 1.8572494425585462e-05, + "loss": 1.0507, + "step": 20396 + }, + { + "epoch": 0.25498137453436337, + "grad_norm": 4.728578567504883, + "learning_rate": 1.8572045044272577e-05, + "loss": 1.592, + "step": 20398 + }, + { + "epoch": 0.25500637515937896, + "grad_norm": 2.1268067359924316, + "learning_rate": 1.8571595597676215e-05, + "loss": 1.3411, + "step": 20400 + }, + { + "epoch": 0.2550313757843946, + "grad_norm": 2.4103851318359375, + "learning_rate": 1.8571146085799787e-05, + "loss": 1.0456, + "step": 20402 + }, + { + "epoch": 0.25505637640941026, + "grad_norm": 6.4864068031311035, + "learning_rate": 1.8570696508646724e-05, + "loss": 1.2277, + "step": 20404 + }, + { + "epoch": 0.25508137703442585, + "grad_norm": 3.0171964168548584, + "learning_rate": 1.8570246866220452e-05, + "loss": 1.0476, + "step": 20406 + }, + { + "epoch": 0.2551063776594415, + "grad_norm": 3.695774555206299, + "learning_rate": 1.8569797158524393e-05, + "loss": 0.5965, + "step": 20408 + }, + { + "epoch": 0.2551313782844571, + "grad_norm": 2.823129653930664, + "learning_rate": 1.8569347385561968e-05, + "loss": 0.0872, + "step": 20410 + }, + { + "epoch": 0.25515637890947274, + "grad_norm": 1.8567382097244263, + "learning_rate": 1.8568897547336603e-05, + "loss": 0.8836, + "step": 20412 + }, + { + "epoch": 0.2551813795344884, + "grad_norm": 5.710148334503174, + "learning_rate": 1.8568447643851727e-05, + "loss": 1.3482, + "step": 20414 + }, + { + "epoch": 0.255206380159504, + "grad_norm": 2.3261940479278564, + "learning_rate": 1.8567997675110766e-05, + "loss": 0.0996, + "step": 20416 + }, + { + "epoch": 0.2552313807845196, + "grad_norm": 2.7933387756347656, + "learning_rate": 1.8567547641117144e-05, + "loss": 0.7896, + "step": 20418 + }, + { + "epoch": 0.2552563814095352, + "grad_norm": 3.1460459232330322, + "learning_rate": 1.8567097541874294e-05, + "loss": 1.3229, + "step": 20420 + }, + { + "epoch": 0.25528138203455086, + "grad_norm": 2.5722219944000244, + "learning_rate": 1.856664737738564e-05, + "loss": 1.2931, + "step": 20422 + }, + { + "epoch": 0.2553063826595665, + "grad_norm": 2.296367883682251, + "learning_rate": 1.856619714765461e-05, + "loss": 0.3403, + "step": 20424 + }, + { + "epoch": 0.2553313832845821, + "grad_norm": 0.012752458453178406, + "learning_rate": 1.8565746852684636e-05, + "loss": 0.4357, + "step": 20426 + }, + { + "epoch": 0.25535638390959775, + "grad_norm": 3.044545888900757, + "learning_rate": 1.8565296492479144e-05, + "loss": 2.7874, + "step": 20428 + }, + { + "epoch": 0.25538138453461334, + "grad_norm": 2.7438790798187256, + "learning_rate": 1.8564846067041565e-05, + "loss": 1.6044, + "step": 20430 + }, + { + "epoch": 0.255406385159629, + "grad_norm": 3.8752002716064453, + "learning_rate": 1.856439557637533e-05, + "loss": 1.7755, + "step": 20432 + }, + { + "epoch": 0.25543138578464464, + "grad_norm": 0.29431387782096863, + "learning_rate": 1.856394502048387e-05, + "loss": 1.1302, + "step": 20434 + }, + { + "epoch": 0.25545638640966023, + "grad_norm": 3.3584086894989014, + "learning_rate": 1.8563494399370613e-05, + "loss": 0.3377, + "step": 20436 + }, + { + "epoch": 0.2554813870346759, + "grad_norm": 2.9880433082580566, + "learning_rate": 1.8563043713038995e-05, + "loss": 1.4296, + "step": 20438 + }, + { + "epoch": 0.25550638765969147, + "grad_norm": 1.6345641613006592, + "learning_rate": 1.856259296149245e-05, + "loss": 0.5447, + "step": 20440 + }, + { + "epoch": 0.2555313882847071, + "grad_norm": 1.1641318798065186, + "learning_rate": 1.8562142144734405e-05, + "loss": 0.0337, + "step": 20442 + }, + { + "epoch": 0.25555638890972276, + "grad_norm": 4.319372653961182, + "learning_rate": 1.8561691262768297e-05, + "loss": 1.0215, + "step": 20444 + }, + { + "epoch": 0.25558138953473836, + "grad_norm": 2.4524190425872803, + "learning_rate": 1.8561240315597558e-05, + "loss": 0.9094, + "step": 20446 + }, + { + "epoch": 0.255606390159754, + "grad_norm": 3.9427475929260254, + "learning_rate": 1.8560789303225625e-05, + "loss": 1.5432, + "step": 20448 + }, + { + "epoch": 0.2556313907847696, + "grad_norm": 2.785284996032715, + "learning_rate": 1.8560338225655926e-05, + "loss": 0.6334, + "step": 20450 + }, + { + "epoch": 0.25565639140978524, + "grad_norm": 0.2686302959918976, + "learning_rate": 1.8559887082891907e-05, + "loss": 0.9869, + "step": 20452 + }, + { + "epoch": 0.2556813920348009, + "grad_norm": 0.005073191598057747, + "learning_rate": 1.8559435874937e-05, + "loss": 0.5678, + "step": 20454 + }, + { + "epoch": 0.2557063926598165, + "grad_norm": 1.6120774745941162, + "learning_rate": 1.8558984601794635e-05, + "loss": 0.825, + "step": 20456 + }, + { + "epoch": 0.25573139328483213, + "grad_norm": 3.502833604812622, + "learning_rate": 1.8558533263468255e-05, + "loss": 0.8837, + "step": 20458 + }, + { + "epoch": 0.2557563939098477, + "grad_norm": 0.12533842027187347, + "learning_rate": 1.8558081859961298e-05, + "loss": 0.2201, + "step": 20460 + }, + { + "epoch": 0.25578139453486337, + "grad_norm": 4.0491108894348145, + "learning_rate": 1.85576303912772e-05, + "loss": 1.728, + "step": 20462 + }, + { + "epoch": 0.255806395159879, + "grad_norm": 2.3164048194885254, + "learning_rate": 1.8557178857419396e-05, + "loss": 0.8543, + "step": 20464 + }, + { + "epoch": 0.2558313957848946, + "grad_norm": 0.11774211376905441, + "learning_rate": 1.855672725839133e-05, + "loss": 1.0347, + "step": 20466 + }, + { + "epoch": 0.25585639640991026, + "grad_norm": 1.5271432399749756, + "learning_rate": 1.8556275594196437e-05, + "loss": 0.2531, + "step": 20468 + }, + { + "epoch": 0.25588139703492585, + "grad_norm": 3.1489295959472656, + "learning_rate": 1.855582386483816e-05, + "loss": 0.7757, + "step": 20470 + }, + { + "epoch": 0.2559063976599415, + "grad_norm": 3.3391916751861572, + "learning_rate": 1.8555372070319942e-05, + "loss": 1.1361, + "step": 20472 + }, + { + "epoch": 0.25593139828495715, + "grad_norm": 1.8553473949432373, + "learning_rate": 1.8554920210645217e-05, + "loss": 1.0491, + "step": 20474 + }, + { + "epoch": 0.25595639890997274, + "grad_norm": 0.06712786853313446, + "learning_rate": 1.855446828581743e-05, + "loss": 0.0131, + "step": 20476 + }, + { + "epoch": 0.2559813995349884, + "grad_norm": 2.085329055786133, + "learning_rate": 1.8554016295840022e-05, + "loss": 1.182, + "step": 20478 + }, + { + "epoch": 0.256006400160004, + "grad_norm": 3.349663496017456, + "learning_rate": 1.8553564240716433e-05, + "loss": 0.3128, + "step": 20480 + }, + { + "epoch": 0.2560314007850196, + "grad_norm": 5.589386940002441, + "learning_rate": 1.8553112120450114e-05, + "loss": 0.6972, + "step": 20482 + }, + { + "epoch": 0.2560564014100353, + "grad_norm": 0.3641173839569092, + "learning_rate": 1.8552659935044496e-05, + "loss": 0.0839, + "step": 20484 + }, + { + "epoch": 0.25608140203505086, + "grad_norm": 0.004594489000737667, + "learning_rate": 1.8552207684503036e-05, + "loss": 0.0034, + "step": 20486 + }, + { + "epoch": 0.2561064026600665, + "grad_norm": 3.83117413520813, + "learning_rate": 1.8551755368829168e-05, + "loss": 1.1205, + "step": 20488 + }, + { + "epoch": 0.2561314032850821, + "grad_norm": 3.5306591987609863, + "learning_rate": 1.8551302988026337e-05, + "loss": 0.975, + "step": 20490 + }, + { + "epoch": 0.25615640391009775, + "grad_norm": 4.923953056335449, + "learning_rate": 1.8550850542098e-05, + "loss": 1.4616, + "step": 20492 + }, + { + "epoch": 0.2561814045351134, + "grad_norm": 0.7491998672485352, + "learning_rate": 1.855039803104759e-05, + "loss": 0.5318, + "step": 20494 + }, + { + "epoch": 0.256206405160129, + "grad_norm": 7.376331329345703, + "learning_rate": 1.8549945454878556e-05, + "loss": 1.7253, + "step": 20496 + }, + { + "epoch": 0.25623140578514464, + "grad_norm": 0.00437890412285924, + "learning_rate": 1.854949281359435e-05, + "loss": 0.8713, + "step": 20498 + }, + { + "epoch": 0.25625640641016023, + "grad_norm": 4.019556522369385, + "learning_rate": 1.8549040107198414e-05, + "loss": 1.793, + "step": 20500 + }, + { + "epoch": 0.2562814070351759, + "grad_norm": 0.012696648016571999, + "learning_rate": 1.85485873356942e-05, + "loss": 0.2578, + "step": 20502 + }, + { + "epoch": 0.2563064076601915, + "grad_norm": 0.0058528645895421505, + "learning_rate": 1.8548134499085147e-05, + "loss": 0.123, + "step": 20504 + }, + { + "epoch": 0.2563314082852071, + "grad_norm": 3.374666452407837, + "learning_rate": 1.8547681597374716e-05, + "loss": 0.6108, + "step": 20506 + }, + { + "epoch": 0.25635640891022277, + "grad_norm": 3.981318950653076, + "learning_rate": 1.854722863056635e-05, + "loss": 0.7409, + "step": 20508 + }, + { + "epoch": 0.25638140953523836, + "grad_norm": 0.11447606980800629, + "learning_rate": 1.85467755986635e-05, + "loss": 0.9206, + "step": 20510 + }, + { + "epoch": 0.256406410160254, + "grad_norm": 2.3312721252441406, + "learning_rate": 1.854632250166961e-05, + "loss": 0.9248, + "step": 20512 + }, + { + "epoch": 0.25643141078526965, + "grad_norm": 4.058536052703857, + "learning_rate": 1.8545869339588143e-05, + "loss": 1.9144, + "step": 20514 + }, + { + "epoch": 0.25645641141028525, + "grad_norm": 3.4328086376190186, + "learning_rate": 1.854541611242254e-05, + "loss": 1.1066, + "step": 20516 + }, + { + "epoch": 0.2564814120353009, + "grad_norm": 4.7924017906188965, + "learning_rate": 1.854496282017626e-05, + "loss": 1.5989, + "step": 20518 + }, + { + "epoch": 0.2565064126603165, + "grad_norm": 4.374124050140381, + "learning_rate": 1.854450946285275e-05, + "loss": 1.5611, + "step": 20520 + }, + { + "epoch": 0.25653141328533213, + "grad_norm": 1.8527592420578003, + "learning_rate": 1.8544056040455463e-05, + "loss": 0.1945, + "step": 20522 + }, + { + "epoch": 0.2565564139103478, + "grad_norm": 5.438563823699951, + "learning_rate": 1.8543602552987857e-05, + "loss": 2.0672, + "step": 20524 + }, + { + "epoch": 0.2565814145353634, + "grad_norm": 5.323781490325928, + "learning_rate": 1.8543149000453375e-05, + "loss": 0.731, + "step": 20526 + }, + { + "epoch": 0.256606415160379, + "grad_norm": 2.9536428451538086, + "learning_rate": 1.854269538285548e-05, + "loss": 1.0369, + "step": 20528 + }, + { + "epoch": 0.2566314157853946, + "grad_norm": 3.261629343032837, + "learning_rate": 1.8542241700197633e-05, + "loss": 0.8432, + "step": 20530 + }, + { + "epoch": 0.25665641641041026, + "grad_norm": 0.0093122823163867, + "learning_rate": 1.8541787952483274e-05, + "loss": 0.4069, + "step": 20532 + }, + { + "epoch": 0.2566814170354259, + "grad_norm": 3.943828821182251, + "learning_rate": 1.8541334139715867e-05, + "loss": 0.4076, + "step": 20534 + }, + { + "epoch": 0.2567064176604415, + "grad_norm": 7.4538254737854, + "learning_rate": 1.854088026189887e-05, + "loss": 1.3965, + "step": 20536 + }, + { + "epoch": 0.25673141828545715, + "grad_norm": 2.089327812194824, + "learning_rate": 1.8540426319035733e-05, + "loss": 0.707, + "step": 20538 + }, + { + "epoch": 0.25675641891047274, + "grad_norm": 0.44851788878440857, + "learning_rate": 1.853997231112992e-05, + "loss": 0.659, + "step": 20540 + }, + { + "epoch": 0.2567814195354884, + "grad_norm": 3.721514940261841, + "learning_rate": 1.8539518238184882e-05, + "loss": 1.3054, + "step": 20542 + }, + { + "epoch": 0.25680642016050403, + "grad_norm": 3.0875823497772217, + "learning_rate": 1.8539064100204084e-05, + "loss": 1.2832, + "step": 20544 + }, + { + "epoch": 0.2568314207855196, + "grad_norm": 2.3780505657196045, + "learning_rate": 1.853860989719098e-05, + "loss": 0.3503, + "step": 20546 + }, + { + "epoch": 0.2568564214105353, + "grad_norm": 4.3417158126831055, + "learning_rate": 1.8538155629149028e-05, + "loss": 1.5177, + "step": 20548 + }, + { + "epoch": 0.25688142203555087, + "grad_norm": 4.42333984375, + "learning_rate": 1.8537701296081695e-05, + "loss": 0.7486, + "step": 20550 + }, + { + "epoch": 0.2569064226605665, + "grad_norm": 2.545673370361328, + "learning_rate": 1.853724689799243e-05, + "loss": 0.6953, + "step": 20552 + }, + { + "epoch": 0.25693142328558216, + "grad_norm": 0.7109977602958679, + "learning_rate": 1.8536792434884706e-05, + "loss": 0.9611, + "step": 20554 + }, + { + "epoch": 0.25695642391059775, + "grad_norm": 6.106414794921875, + "learning_rate": 1.8536337906761976e-05, + "loss": 1.4401, + "step": 20556 + }, + { + "epoch": 0.2569814245356134, + "grad_norm": 3.01629900932312, + "learning_rate": 1.85358833136277e-05, + "loss": 1.8446, + "step": 20558 + }, + { + "epoch": 0.257006425160629, + "grad_norm": 2.0490381717681885, + "learning_rate": 1.853542865548535e-05, + "loss": 0.3448, + "step": 20560 + }, + { + "epoch": 0.25703142578564464, + "grad_norm": 2.2988736629486084, + "learning_rate": 1.8534973932338378e-05, + "loss": 1.1728, + "step": 20562 + }, + { + "epoch": 0.2570564264106603, + "grad_norm": 4.6618218421936035, + "learning_rate": 1.853451914419025e-05, + "loss": 0.5487, + "step": 20564 + }, + { + "epoch": 0.2570814270356759, + "grad_norm": 3.522526264190674, + "learning_rate": 1.8534064291044438e-05, + "loss": 1.4448, + "step": 20566 + }, + { + "epoch": 0.25710642766069153, + "grad_norm": 4.712881088256836, + "learning_rate": 1.8533609372904394e-05, + "loss": 1.3943, + "step": 20568 + }, + { + "epoch": 0.2571314282857071, + "grad_norm": 3.2664804458618164, + "learning_rate": 1.853315438977359e-05, + "loss": 1.0349, + "step": 20570 + }, + { + "epoch": 0.25715642891072277, + "grad_norm": 4.3290534019470215, + "learning_rate": 1.8532699341655486e-05, + "loss": 1.4094, + "step": 20572 + }, + { + "epoch": 0.2571814295357384, + "grad_norm": 1.703888177871704, + "learning_rate": 1.8532244228553554e-05, + "loss": 0.9559, + "step": 20574 + }, + { + "epoch": 0.257206430160754, + "grad_norm": 2.5986971855163574, + "learning_rate": 1.8531789050471255e-05, + "loss": 1.7167, + "step": 20576 + }, + { + "epoch": 0.25723143078576965, + "grad_norm": 4.274390697479248, + "learning_rate": 1.8531333807412056e-05, + "loss": 2.5648, + "step": 20578 + }, + { + "epoch": 0.25725643141078525, + "grad_norm": 5.583097457885742, + "learning_rate": 1.8530878499379427e-05, + "loss": 1.503, + "step": 20580 + }, + { + "epoch": 0.2572814320358009, + "grad_norm": 4.349621772766113, + "learning_rate": 1.8530423126376832e-05, + "loss": 0.9396, + "step": 20582 + }, + { + "epoch": 0.25730643266081654, + "grad_norm": 4.3730692863464355, + "learning_rate": 1.8529967688407742e-05, + "loss": 1.2121, + "step": 20584 + }, + { + "epoch": 0.25733143328583213, + "grad_norm": 3.1411664485931396, + "learning_rate": 1.8529512185475623e-05, + "loss": 1.0572, + "step": 20586 + }, + { + "epoch": 0.2573564339108478, + "grad_norm": 0.009892667643725872, + "learning_rate": 1.8529056617583948e-05, + "loss": 0.0032, + "step": 20588 + }, + { + "epoch": 0.2573814345358634, + "grad_norm": 0.006999169941991568, + "learning_rate": 1.8528600984736182e-05, + "loss": 0.7251, + "step": 20590 + }, + { + "epoch": 0.257406435160879, + "grad_norm": 7.238365650177002, + "learning_rate": 1.85281452869358e-05, + "loss": 1.035, + "step": 20592 + }, + { + "epoch": 0.25743143578589467, + "grad_norm": 2.992265224456787, + "learning_rate": 1.8527689524186264e-05, + "loss": 1.1419, + "step": 20594 + }, + { + "epoch": 0.25745643641091026, + "grad_norm": 3.04435658454895, + "learning_rate": 1.8527233696491053e-05, + "loss": 1.2559, + "step": 20596 + }, + { + "epoch": 0.2574814370359259, + "grad_norm": 3.3720204830169678, + "learning_rate": 1.8526777803853634e-05, + "loss": 1.0447, + "step": 20598 + }, + { + "epoch": 0.2575064376609415, + "grad_norm": 2.167587995529175, + "learning_rate": 1.8526321846277485e-05, + "loss": 0.4465, + "step": 20600 + }, + { + "epoch": 0.25753143828595715, + "grad_norm": 2.7842555046081543, + "learning_rate": 1.8525865823766073e-05, + "loss": 1.307, + "step": 20602 + }, + { + "epoch": 0.2575564389109728, + "grad_norm": 0.8809083700180054, + "learning_rate": 1.8525409736322873e-05, + "loss": 0.7445, + "step": 20604 + }, + { + "epoch": 0.2575814395359884, + "grad_norm": 2.7100207805633545, + "learning_rate": 1.8524953583951356e-05, + "loss": 1.1741, + "step": 20606 + }, + { + "epoch": 0.25760644016100404, + "grad_norm": 3.654067277908325, + "learning_rate": 1.8524497366655e-05, + "loss": 0.7478, + "step": 20608 + }, + { + "epoch": 0.2576314407860196, + "grad_norm": 3.565688371658325, + "learning_rate": 1.8524041084437275e-05, + "loss": 1.8353, + "step": 20610 + }, + { + "epoch": 0.2576564414110353, + "grad_norm": 2.1530277729034424, + "learning_rate": 1.852358473730166e-05, + "loss": 1.5109, + "step": 20612 + }, + { + "epoch": 0.2576814420360509, + "grad_norm": 1.3897777795791626, + "learning_rate": 1.852312832525163e-05, + "loss": 0.0889, + "step": 20614 + }, + { + "epoch": 0.2577064426610665, + "grad_norm": 2.4252779483795166, + "learning_rate": 1.852267184829066e-05, + "loss": 0.9942, + "step": 20616 + }, + { + "epoch": 0.25773144328608216, + "grad_norm": 2.203364849090576, + "learning_rate": 1.852221530642223e-05, + "loss": 1.5499, + "step": 20618 + }, + { + "epoch": 0.25775644391109775, + "grad_norm": 4.912839889526367, + "learning_rate": 1.8521758699649805e-05, + "loss": 0.8852, + "step": 20620 + }, + { + "epoch": 0.2577814445361134, + "grad_norm": 3.604841470718384, + "learning_rate": 1.8521302027976875e-05, + "loss": 0.719, + "step": 20622 + }, + { + "epoch": 0.25780644516112905, + "grad_norm": 2.063934087753296, + "learning_rate": 1.8520845291406914e-05, + "loss": 1.2474, + "step": 20624 + }, + { + "epoch": 0.25783144578614464, + "grad_norm": 0.0857388898730278, + "learning_rate": 1.8520388489943403e-05, + "loss": 0.156, + "step": 20626 + }, + { + "epoch": 0.2578564464111603, + "grad_norm": 5.0496344566345215, + "learning_rate": 1.8519931623589813e-05, + "loss": 0.4679, + "step": 20628 + }, + { + "epoch": 0.2578814470361759, + "grad_norm": 1.1468489170074463, + "learning_rate": 1.851947469234963e-05, + "loss": 0.4151, + "step": 20630 + }, + { + "epoch": 0.25790644766119153, + "grad_norm": 3.601412534713745, + "learning_rate": 1.8519017696226337e-05, + "loss": 0.7636, + "step": 20632 + }, + { + "epoch": 0.2579314482862072, + "grad_norm": 0.8450374007225037, + "learning_rate": 1.8518560635223404e-05, + "loss": 0.7492, + "step": 20634 + }, + { + "epoch": 0.25795644891122277, + "grad_norm": 1.645966649055481, + "learning_rate": 1.851810350934432e-05, + "loss": 1.8153, + "step": 20636 + }, + { + "epoch": 0.2579814495362384, + "grad_norm": 4.184247016906738, + "learning_rate": 1.8517646318592564e-05, + "loss": 1.283, + "step": 20638 + }, + { + "epoch": 0.258006450161254, + "grad_norm": 3.777475595474243, + "learning_rate": 1.851718906297162e-05, + "loss": 2.1174, + "step": 20640 + }, + { + "epoch": 0.25803145078626966, + "grad_norm": 4.367381572723389, + "learning_rate": 1.8516731742484965e-05, + "loss": 0.5749, + "step": 20642 + }, + { + "epoch": 0.2580564514112853, + "grad_norm": 0.008620780892670155, + "learning_rate": 1.8516274357136087e-05, + "loss": 0.5985, + "step": 20644 + }, + { + "epoch": 0.2580814520363009, + "grad_norm": 3.784425735473633, + "learning_rate": 1.851581690692847e-05, + "loss": 0.6176, + "step": 20646 + }, + { + "epoch": 0.25810645266131654, + "grad_norm": 0.004594023805111647, + "learning_rate": 1.851535939186559e-05, + "loss": 0.2207, + "step": 20648 + }, + { + "epoch": 0.25813145328633214, + "grad_norm": 2.4763343334198, + "learning_rate": 1.8514901811950944e-05, + "loss": 0.1842, + "step": 20650 + }, + { + "epoch": 0.2581564539113478, + "grad_norm": 0.005213284865021706, + "learning_rate": 1.8514444167188007e-05, + "loss": 0.0561, + "step": 20652 + }, + { + "epoch": 0.25818145453636343, + "grad_norm": 4.266303539276123, + "learning_rate": 1.8513986457580267e-05, + "loss": 1.5416, + "step": 20654 + }, + { + "epoch": 0.258206455161379, + "grad_norm": 3.717444896697998, + "learning_rate": 1.851352868313121e-05, + "loss": 1.0198, + "step": 20656 + }, + { + "epoch": 0.25823145578639467, + "grad_norm": 4.132247447967529, + "learning_rate": 1.8513070843844324e-05, + "loss": 1.1817, + "step": 20658 + }, + { + "epoch": 0.25825645641141026, + "grad_norm": 3.3994927406311035, + "learning_rate": 1.851261293972309e-05, + "loss": 1.8263, + "step": 20660 + }, + { + "epoch": 0.2582814570364259, + "grad_norm": 2.800084352493286, + "learning_rate": 1.8512154970771003e-05, + "loss": 1.3059, + "step": 20662 + }, + { + "epoch": 0.25830645766144156, + "grad_norm": 3.6909339427948, + "learning_rate": 1.8511696936991546e-05, + "loss": 1.8882, + "step": 20664 + }, + { + "epoch": 0.25833145828645715, + "grad_norm": 2.7889010906219482, + "learning_rate": 1.851123883838821e-05, + "loss": 1.4103, + "step": 20666 + }, + { + "epoch": 0.2583564589114728, + "grad_norm": 6.736733913421631, + "learning_rate": 1.8510780674964483e-05, + "loss": 1.3338, + "step": 20668 + }, + { + "epoch": 0.2583814595364884, + "grad_norm": 3.9170515537261963, + "learning_rate": 1.8510322446723853e-05, + "loss": 0.8309, + "step": 20670 + }, + { + "epoch": 0.25840646016150404, + "grad_norm": 0.05962488427758217, + "learning_rate": 1.8509864153669814e-05, + "loss": 0.6695, + "step": 20672 + }, + { + "epoch": 0.2584314607865197, + "grad_norm": 5.506434917449951, + "learning_rate": 1.8509405795805848e-05, + "loss": 1.4687, + "step": 20674 + }, + { + "epoch": 0.2584564614115353, + "grad_norm": 4.117928504943848, + "learning_rate": 1.8508947373135452e-05, + "loss": 1.3502, + "step": 20676 + }, + { + "epoch": 0.2584814620365509, + "grad_norm": 5.218518257141113, + "learning_rate": 1.850848888566212e-05, + "loss": 0.898, + "step": 20678 + }, + { + "epoch": 0.2585064626615665, + "grad_norm": 4.055530548095703, + "learning_rate": 1.8508030333389334e-05, + "loss": 0.8159, + "step": 20680 + }, + { + "epoch": 0.25853146328658216, + "grad_norm": 5.358179092407227, + "learning_rate": 1.8507571716320594e-05, + "loss": 1.8585, + "step": 20682 + }, + { + "epoch": 0.2585564639115978, + "grad_norm": 0.7875246405601501, + "learning_rate": 1.8507113034459397e-05, + "loss": 0.0234, + "step": 20684 + }, + { + "epoch": 0.2585814645366134, + "grad_norm": 0.44342848658561707, + "learning_rate": 1.8506654287809223e-05, + "loss": 0.024, + "step": 20686 + }, + { + "epoch": 0.25860646516162905, + "grad_norm": 2.383039951324463, + "learning_rate": 1.8506195476373575e-05, + "loss": 0.542, + "step": 20688 + }, + { + "epoch": 0.25863146578664464, + "grad_norm": 3.4946203231811523, + "learning_rate": 1.8505736600155945e-05, + "loss": 0.7606, + "step": 20690 + }, + { + "epoch": 0.2586564664116603, + "grad_norm": 3.9002976417541504, + "learning_rate": 1.8505277659159828e-05, + "loss": 1.1242, + "step": 20692 + }, + { + "epoch": 0.25868146703667594, + "grad_norm": 3.3718502521514893, + "learning_rate": 1.850481865338872e-05, + "loss": 1.3653, + "step": 20694 + }, + { + "epoch": 0.25870646766169153, + "grad_norm": 1.030151605606079, + "learning_rate": 1.8504359582846115e-05, + "loss": 0.7873, + "step": 20696 + }, + { + "epoch": 0.2587314682867072, + "grad_norm": 2.5197651386260986, + "learning_rate": 1.8503900447535507e-05, + "loss": 0.7862, + "step": 20698 + }, + { + "epoch": 0.25875646891172277, + "grad_norm": 0.9679605960845947, + "learning_rate": 1.85034412474604e-05, + "loss": 0.0688, + "step": 20700 + }, + { + "epoch": 0.2587814695367384, + "grad_norm": 3.58669376373291, + "learning_rate": 1.8502981982624284e-05, + "loss": 1.2464, + "step": 20702 + }, + { + "epoch": 0.25880647016175407, + "grad_norm": 0.8052295446395874, + "learning_rate": 1.8502522653030664e-05, + "loss": 0.0163, + "step": 20704 + }, + { + "epoch": 0.25883147078676966, + "grad_norm": 3.727632761001587, + "learning_rate": 1.850206325868303e-05, + "loss": 1.2041, + "step": 20706 + }, + { + "epoch": 0.2588564714117853, + "grad_norm": 3.543837070465088, + "learning_rate": 1.8501603799584886e-05, + "loss": 0.6924, + "step": 20708 + }, + { + "epoch": 0.2588814720368009, + "grad_norm": 7.670420169830322, + "learning_rate": 1.850114427573973e-05, + "loss": 0.8362, + "step": 20710 + }, + { + "epoch": 0.25890647266181654, + "grad_norm": 2.9058291912078857, + "learning_rate": 1.8500684687151057e-05, + "loss": 1.1783, + "step": 20712 + }, + { + "epoch": 0.2589314732868322, + "grad_norm": 0.00558212585747242, + "learning_rate": 1.850022503382238e-05, + "loss": 1.0116, + "step": 20714 + }, + { + "epoch": 0.2589564739118478, + "grad_norm": 0.6301460862159729, + "learning_rate": 1.8499765315757186e-05, + "loss": 0.8825, + "step": 20716 + }, + { + "epoch": 0.25898147453686343, + "grad_norm": 3.6348376274108887, + "learning_rate": 1.8499305532958976e-05, + "loss": 1.0331, + "step": 20718 + }, + { + "epoch": 0.259006475161879, + "grad_norm": 4.349358081817627, + "learning_rate": 1.8498845685431265e-05, + "loss": 1.1809, + "step": 20720 + }, + { + "epoch": 0.25903147578689467, + "grad_norm": 3.8608527183532715, + "learning_rate": 1.8498385773177547e-05, + "loss": 1.2343, + "step": 20722 + }, + { + "epoch": 0.2590564764119103, + "grad_norm": 2.490957498550415, + "learning_rate": 1.849792579620132e-05, + "loss": 0.5406, + "step": 20724 + }, + { + "epoch": 0.2590814770369259, + "grad_norm": 1.6643662452697754, + "learning_rate": 1.8497465754506095e-05, + "loss": 0.7114, + "step": 20726 + }, + { + "epoch": 0.25910647766194156, + "grad_norm": 2.3040268421173096, + "learning_rate": 1.8497005648095374e-05, + "loss": 0.0718, + "step": 20728 + }, + { + "epoch": 0.25913147828695715, + "grad_norm": 1.4021588563919067, + "learning_rate": 1.8496545476972656e-05, + "loss": 0.0291, + "step": 20730 + }, + { + "epoch": 0.2591564789119728, + "grad_norm": 2.0409443378448486, + "learning_rate": 1.849608524114145e-05, + "loss": 0.994, + "step": 20732 + }, + { + "epoch": 0.25918147953698845, + "grad_norm": 2.4768874645233154, + "learning_rate": 1.849562494060526e-05, + "loss": 1.0338, + "step": 20734 + }, + { + "epoch": 0.25920648016200404, + "grad_norm": 0.55203777551651, + "learning_rate": 1.8495164575367594e-05, + "loss": 0.8949, + "step": 20736 + }, + { + "epoch": 0.2592314807870197, + "grad_norm": 1.4941585063934326, + "learning_rate": 1.8494704145431954e-05, + "loss": 0.2938, + "step": 20738 + }, + { + "epoch": 0.2592564814120353, + "grad_norm": 4.593569755554199, + "learning_rate": 1.8494243650801852e-05, + "loss": 0.6939, + "step": 20740 + }, + { + "epoch": 0.2592814820370509, + "grad_norm": 7.944253921508789, + "learning_rate": 1.8493783091480787e-05, + "loss": 1.9455, + "step": 20742 + }, + { + "epoch": 0.2593064826620666, + "grad_norm": 0.003027888247743249, + "learning_rate": 1.849332246747227e-05, + "loss": 0.2353, + "step": 20744 + }, + { + "epoch": 0.25933148328708217, + "grad_norm": 6.236448287963867, + "learning_rate": 1.8492861778779815e-05, + "loss": 0.9894, + "step": 20746 + }, + { + "epoch": 0.2593564839120978, + "grad_norm": 0.6947116851806641, + "learning_rate": 1.8492401025406925e-05, + "loss": 0.1976, + "step": 20748 + }, + { + "epoch": 0.2593814845371134, + "grad_norm": 3.1658618450164795, + "learning_rate": 1.849194020735711e-05, + "loss": 1.559, + "step": 20750 + }, + { + "epoch": 0.25940648516212905, + "grad_norm": 2.3680572509765625, + "learning_rate": 1.8491479324633874e-05, + "loss": 0.5009, + "step": 20752 + }, + { + "epoch": 0.2594314857871447, + "grad_norm": 7.487617492675781, + "learning_rate": 1.849101837724074e-05, + "loss": 1.1552, + "step": 20754 + }, + { + "epoch": 0.2594564864121603, + "grad_norm": 1.6894172430038452, + "learning_rate": 1.8490557365181204e-05, + "loss": 0.3899, + "step": 20756 + }, + { + "epoch": 0.25948148703717594, + "grad_norm": 4.091275215148926, + "learning_rate": 1.8490096288458785e-05, + "loss": 1.8617, + "step": 20758 + }, + { + "epoch": 0.25950648766219153, + "grad_norm": 4.398712635040283, + "learning_rate": 1.8489635147076998e-05, + "loss": 0.7562, + "step": 20760 + }, + { + "epoch": 0.2595314882872072, + "grad_norm": 5.984988689422607, + "learning_rate": 1.8489173941039346e-05, + "loss": 1.5753, + "step": 20762 + }, + { + "epoch": 0.2595564889122228, + "grad_norm": 1.459525465965271, + "learning_rate": 1.8488712670349346e-05, + "loss": 0.8106, + "step": 20764 + }, + { + "epoch": 0.2595814895372384, + "grad_norm": 0.006451237015426159, + "learning_rate": 1.848825133501051e-05, + "loss": 0.0033, + "step": 20766 + }, + { + "epoch": 0.25960649016225407, + "grad_norm": 2.956885576248169, + "learning_rate": 1.8487789935026354e-05, + "loss": 1.8366, + "step": 20768 + }, + { + "epoch": 0.25963149078726966, + "grad_norm": 2.3877923488616943, + "learning_rate": 1.848732847040039e-05, + "loss": 1.0889, + "step": 20770 + }, + { + "epoch": 0.2596564914122853, + "grad_norm": 2.8372156620025635, + "learning_rate": 1.8486866941136134e-05, + "loss": 0.6989, + "step": 20772 + }, + { + "epoch": 0.25968149203730095, + "grad_norm": 4.36910343170166, + "learning_rate": 1.8486405347237094e-05, + "loss": 0.8561, + "step": 20774 + }, + { + "epoch": 0.25970649266231655, + "grad_norm": 2.281430959701538, + "learning_rate": 1.84859436887068e-05, + "loss": 1.4294, + "step": 20776 + }, + { + "epoch": 0.2597314932873322, + "grad_norm": 5.264828205108643, + "learning_rate": 1.848548196554875e-05, + "loss": 1.4986, + "step": 20778 + }, + { + "epoch": 0.2597564939123478, + "grad_norm": 2.47973370552063, + "learning_rate": 1.8485020177766474e-05, + "loss": 1.3286, + "step": 20780 + }, + { + "epoch": 0.25978149453736343, + "grad_norm": 4.8565802574157715, + "learning_rate": 1.8484558325363483e-05, + "loss": 1.1233, + "step": 20782 + }, + { + "epoch": 0.2598064951623791, + "grad_norm": 6.744158744812012, + "learning_rate": 1.8484096408343295e-05, + "loss": 1.9974, + "step": 20784 + }, + { + "epoch": 0.2598314957873947, + "grad_norm": 0.007978453300893307, + "learning_rate": 1.8483634426709432e-05, + "loss": 0.3938, + "step": 20786 + }, + { + "epoch": 0.2598564964124103, + "grad_norm": 4.423379421234131, + "learning_rate": 1.8483172380465404e-05, + "loss": 0.6558, + "step": 20788 + }, + { + "epoch": 0.2598814970374259, + "grad_norm": 3.8210995197296143, + "learning_rate": 1.8482710269614737e-05, + "loss": 1.1413, + "step": 20790 + }, + { + "epoch": 0.25990649766244156, + "grad_norm": 0.683174729347229, + "learning_rate": 1.8482248094160948e-05, + "loss": 0.2322, + "step": 20792 + }, + { + "epoch": 0.2599314982874572, + "grad_norm": 3.0542025566101074, + "learning_rate": 1.848178585410756e-05, + "loss": 0.7604, + "step": 20794 + }, + { + "epoch": 0.2599564989124728, + "grad_norm": 5.026374340057373, + "learning_rate": 1.8481323549458084e-05, + "loss": 1.0186, + "step": 20796 + }, + { + "epoch": 0.25998149953748845, + "grad_norm": 3.6497926712036133, + "learning_rate": 1.8480861180216052e-05, + "loss": 1.017, + "step": 20798 + }, + { + "epoch": 0.26000650016250404, + "grad_norm": 3.7439277172088623, + "learning_rate": 1.8480398746384977e-05, + "loss": 0.9596, + "step": 20800 + }, + { + "epoch": 0.2600315007875197, + "grad_norm": 0.0035517984069883823, + "learning_rate": 1.8479936247968388e-05, + "loss": 0.3715, + "step": 20802 + }, + { + "epoch": 0.26005650141253533, + "grad_norm": 3.0071017742156982, + "learning_rate": 1.8479473684969802e-05, + "loss": 0.9668, + "step": 20804 + }, + { + "epoch": 0.2600815020375509, + "grad_norm": 0.02423325553536415, + "learning_rate": 1.8479011057392744e-05, + "loss": 0.0007, + "step": 20806 + }, + { + "epoch": 0.2601065026625666, + "grad_norm": 3.1602346897125244, + "learning_rate": 1.8478548365240736e-05, + "loss": 0.6632, + "step": 20808 + }, + { + "epoch": 0.26013150328758217, + "grad_norm": 3.524338483810425, + "learning_rate": 1.8478085608517303e-05, + "loss": 0.5773, + "step": 20810 + }, + { + "epoch": 0.2601565039125978, + "grad_norm": 6.3813629150390625, + "learning_rate": 1.847762278722597e-05, + "loss": 0.9069, + "step": 20812 + }, + { + "epoch": 0.26018150453761346, + "grad_norm": 0.991256833076477, + "learning_rate": 1.847715990137026e-05, + "loss": 0.4718, + "step": 20814 + }, + { + "epoch": 0.26020650516262905, + "grad_norm": 2.9461607933044434, + "learning_rate": 1.84766969509537e-05, + "loss": 0.6818, + "step": 20816 + }, + { + "epoch": 0.2602315057876447, + "grad_norm": 2.0105223655700684, + "learning_rate": 1.8476233935979814e-05, + "loss": 0.635, + "step": 20818 + }, + { + "epoch": 0.2602565064126603, + "grad_norm": 0.7569397687911987, + "learning_rate": 1.8475770856452128e-05, + "loss": 0.675, + "step": 20820 + }, + { + "epoch": 0.26028150703767594, + "grad_norm": 5.383545875549316, + "learning_rate": 1.8475307712374172e-05, + "loss": 1.677, + "step": 20822 + }, + { + "epoch": 0.2603065076626916, + "grad_norm": 2.608046054840088, + "learning_rate": 1.847484450374947e-05, + "loss": 0.6178, + "step": 20824 + }, + { + "epoch": 0.2603315082877072, + "grad_norm": 0.0020368180703371763, + "learning_rate": 1.8474381230581547e-05, + "loss": 0.4409, + "step": 20826 + }, + { + "epoch": 0.26035650891272283, + "grad_norm": 3.3435797691345215, + "learning_rate": 1.847391789287394e-05, + "loss": 0.2724, + "step": 20828 + }, + { + "epoch": 0.2603815095377384, + "grad_norm": 8.833345413208008, + "learning_rate": 1.847345449063017e-05, + "loss": 0.4752, + "step": 20830 + }, + { + "epoch": 0.26040651016275407, + "grad_norm": 3.1192498207092285, + "learning_rate": 1.847299102385377e-05, + "loss": 0.5406, + "step": 20832 + }, + { + "epoch": 0.2604315107877697, + "grad_norm": 0.00412823585793376, + "learning_rate": 1.847252749254827e-05, + "loss": 0.0165, + "step": 20834 + }, + { + "epoch": 0.2604565114127853, + "grad_norm": 4.597357273101807, + "learning_rate": 1.8472063896717194e-05, + "loss": 1.4898, + "step": 20836 + }, + { + "epoch": 0.26048151203780096, + "grad_norm": 1.5921225547790527, + "learning_rate": 1.847160023636408e-05, + "loss": 0.448, + "step": 20838 + }, + { + "epoch": 0.26050651266281655, + "grad_norm": 3.573540449142456, + "learning_rate": 1.8471136511492458e-05, + "loss": 1.4429, + "step": 20840 + }, + { + "epoch": 0.2605315132878322, + "grad_norm": 3.1683738231658936, + "learning_rate": 1.847067272210586e-05, + "loss": 0.6813, + "step": 20842 + }, + { + "epoch": 0.26055651391284784, + "grad_norm": 3.3655037879943848, + "learning_rate": 1.8470208868207812e-05, + "loss": 1.1104, + "step": 20844 + }, + { + "epoch": 0.26058151453786343, + "grad_norm": 3.454545497894287, + "learning_rate": 1.846974494980185e-05, + "loss": 0.9419, + "step": 20846 + }, + { + "epoch": 0.2606065151628791, + "grad_norm": 7.875922679901123, + "learning_rate": 1.8469280966891512e-05, + "loss": 1.0518, + "step": 20848 + }, + { + "epoch": 0.2606315157878947, + "grad_norm": 2.4604079723358154, + "learning_rate": 1.8468816919480325e-05, + "loss": 1.1828, + "step": 20850 + }, + { + "epoch": 0.2606565164129103, + "grad_norm": 3.00716233253479, + "learning_rate": 1.846835280757183e-05, + "loss": 1.0763, + "step": 20852 + }, + { + "epoch": 0.26068151703792597, + "grad_norm": 2.8828775882720947, + "learning_rate": 1.8467888631169555e-05, + "loss": 0.849, + "step": 20854 + }, + { + "epoch": 0.26070651766294156, + "grad_norm": 0.11963877081871033, + "learning_rate": 1.8467424390277035e-05, + "loss": 0.0131, + "step": 20856 + }, + { + "epoch": 0.2607315182879572, + "grad_norm": 0.0035307544749230146, + "learning_rate": 1.8466960084897813e-05, + "loss": 1.2716, + "step": 20858 + }, + { + "epoch": 0.2607565189129728, + "grad_norm": 5.581565856933594, + "learning_rate": 1.846649571503542e-05, + "loss": 0.8616, + "step": 20860 + }, + { + "epoch": 0.26078151953798845, + "grad_norm": 3.3519129753112793, + "learning_rate": 1.8466031280693387e-05, + "loss": 0.9197, + "step": 20862 + }, + { + "epoch": 0.2608065201630041, + "grad_norm": 2.9164977073669434, + "learning_rate": 1.8465566781875265e-05, + "loss": 1.003, + "step": 20864 + }, + { + "epoch": 0.2608315207880197, + "grad_norm": 2.014763355255127, + "learning_rate": 1.846510221858458e-05, + "loss": 0.6636, + "step": 20866 + }, + { + "epoch": 0.26085652141303534, + "grad_norm": 1.3637264966964722, + "learning_rate": 1.846463759082487e-05, + "loss": 0.1699, + "step": 20868 + }, + { + "epoch": 0.26088152203805093, + "grad_norm": 2.6747658252716064, + "learning_rate": 1.846417289859968e-05, + "loss": 0.7467, + "step": 20870 + }, + { + "epoch": 0.2609065226630666, + "grad_norm": 2.8447203636169434, + "learning_rate": 1.846370814191255e-05, + "loss": 0.3981, + "step": 20872 + }, + { + "epoch": 0.2609315232880822, + "grad_norm": 2.221637010574341, + "learning_rate": 1.846324332076701e-05, + "loss": 1.2314, + "step": 20874 + }, + { + "epoch": 0.2609565239130978, + "grad_norm": 9.198205947875977, + "learning_rate": 1.8462778435166607e-05, + "loss": 1.9646, + "step": 20876 + }, + { + "epoch": 0.26098152453811346, + "grad_norm": 3.280381679534912, + "learning_rate": 1.846231348511488e-05, + "loss": 1.3402, + "step": 20878 + }, + { + "epoch": 0.26100652516312906, + "grad_norm": 5.562235355377197, + "learning_rate": 1.8461848470615373e-05, + "loss": 2.1261, + "step": 20880 + }, + { + "epoch": 0.2610315257881447, + "grad_norm": 3.5959174633026123, + "learning_rate": 1.8461383391671622e-05, + "loss": 1.4667, + "step": 20882 + }, + { + "epoch": 0.26105652641316035, + "grad_norm": 4.233811378479004, + "learning_rate": 1.8460918248287174e-05, + "loss": 0.8078, + "step": 20884 + }, + { + "epoch": 0.26108152703817594, + "grad_norm": 4.27008581161499, + "learning_rate": 1.8460453040465568e-05, + "loss": 3.3359, + "step": 20886 + }, + { + "epoch": 0.2611065276631916, + "grad_norm": 1.8415855169296265, + "learning_rate": 1.8459987768210345e-05, + "loss": 1.0196, + "step": 20888 + }, + { + "epoch": 0.2611315282882072, + "grad_norm": 2.838512659072876, + "learning_rate": 1.8459522431525057e-05, + "loss": 0.948, + "step": 20890 + }, + { + "epoch": 0.26115652891322283, + "grad_norm": 0.28620821237564087, + "learning_rate": 1.845905703041324e-05, + "loss": 0.1355, + "step": 20892 + }, + { + "epoch": 0.2611815295382385, + "grad_norm": 1.8439089059829712, + "learning_rate": 1.845859156487844e-05, + "loss": 1.6022, + "step": 20894 + }, + { + "epoch": 0.26120653016325407, + "grad_norm": 1.737701416015625, + "learning_rate": 1.84581260349242e-05, + "loss": 0.7872, + "step": 20896 + }, + { + "epoch": 0.2612315307882697, + "grad_norm": 3.532630205154419, + "learning_rate": 1.8457660440554076e-05, + "loss": 1.3607, + "step": 20898 + }, + { + "epoch": 0.2612565314132853, + "grad_norm": 3.238858461380005, + "learning_rate": 1.84571947817716e-05, + "loss": 1.276, + "step": 20900 + }, + { + "epoch": 0.26128153203830096, + "grad_norm": 4.140963077545166, + "learning_rate": 1.8456729058580326e-05, + "loss": 0.7773, + "step": 20902 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 2.0047199726104736, + "learning_rate": 1.8456263270983797e-05, + "loss": 0.5287, + "step": 20904 + }, + { + "epoch": 0.2613315332883322, + "grad_norm": 2.603482246398926, + "learning_rate": 1.8455797418985568e-05, + "loss": 0.1911, + "step": 20906 + }, + { + "epoch": 0.26135653391334784, + "grad_norm": 0.0036717746406793594, + "learning_rate": 1.845533150258918e-05, + "loss": 0.6278, + "step": 20908 + }, + { + "epoch": 0.26138153453836344, + "grad_norm": 6.985012054443359, + "learning_rate": 1.8454865521798184e-05, + "loss": 1.0363, + "step": 20910 + }, + { + "epoch": 0.2614065351633791, + "grad_norm": 1.1905224323272705, + "learning_rate": 1.8454399476616123e-05, + "loss": 0.0796, + "step": 20912 + }, + { + "epoch": 0.26143153578839473, + "grad_norm": 2.6499736309051514, + "learning_rate": 1.8453933367046556e-05, + "loss": 0.378, + "step": 20914 + }, + { + "epoch": 0.2614565364134103, + "grad_norm": 2.2546935081481934, + "learning_rate": 1.845346719309302e-05, + "loss": 0.0641, + "step": 20916 + }, + { + "epoch": 0.26148153703842597, + "grad_norm": 0.0032302404288202524, + "learning_rate": 1.8453000954759083e-05, + "loss": 0.0002, + "step": 20918 + }, + { + "epoch": 0.26150653766344156, + "grad_norm": 1.9756191968917847, + "learning_rate": 1.8452534652048283e-05, + "loss": 0.7112, + "step": 20920 + }, + { + "epoch": 0.2615315382884572, + "grad_norm": 2.0716958045959473, + "learning_rate": 1.8452068284964172e-05, + "loss": 1.0837, + "step": 20922 + }, + { + "epoch": 0.26155653891347286, + "grad_norm": 0.6601183414459229, + "learning_rate": 1.8451601853510307e-05, + "loss": 0.2274, + "step": 20924 + }, + { + "epoch": 0.26158153953848845, + "grad_norm": 8.92652702331543, + "learning_rate": 1.8451135357690236e-05, + "loss": 1.8361, + "step": 20926 + }, + { + "epoch": 0.2616065401635041, + "grad_norm": 3.956080436706543, + "learning_rate": 1.8450668797507512e-05, + "loss": 0.7325, + "step": 20928 + }, + { + "epoch": 0.2616315407885197, + "grad_norm": 0.4205591678619385, + "learning_rate": 1.8450202172965694e-05, + "loss": 0.412, + "step": 20930 + }, + { + "epoch": 0.26165654141353534, + "grad_norm": 2.7569403648376465, + "learning_rate": 1.8449735484068324e-05, + "loss": 1.4808, + "step": 20932 + }, + { + "epoch": 0.261681542038551, + "grad_norm": 0.85260009765625, + "learning_rate": 1.844926873081897e-05, + "loss": 0.099, + "step": 20934 + }, + { + "epoch": 0.2617065426635666, + "grad_norm": 0.008907555602490902, + "learning_rate": 1.8448801913221177e-05, + "loss": 0.1087, + "step": 20936 + }, + { + "epoch": 0.2617315432885822, + "grad_norm": 2.583394765853882, + "learning_rate": 1.8448335031278504e-05, + "loss": 0.8744, + "step": 20938 + }, + { + "epoch": 0.2617565439135978, + "grad_norm": 0.00343825644813478, + "learning_rate": 1.8447868084994507e-05, + "loss": 0.6559, + "step": 20940 + }, + { + "epoch": 0.26178154453861346, + "grad_norm": 2.515969753265381, + "learning_rate": 1.844740107437274e-05, + "loss": 0.6606, + "step": 20942 + }, + { + "epoch": 0.2618065451636291, + "grad_norm": 3.815372943878174, + "learning_rate": 1.844693399941676e-05, + "loss": 1.0355, + "step": 20944 + }, + { + "epoch": 0.2618315457886447, + "grad_norm": 4.536055564880371, + "learning_rate": 1.8446466860130127e-05, + "loss": 1.276, + "step": 20946 + }, + { + "epoch": 0.26185654641366035, + "grad_norm": 3.909010410308838, + "learning_rate": 1.84459996565164e-05, + "loss": 0.7278, + "step": 20948 + }, + { + "epoch": 0.26188154703867594, + "grad_norm": 3.9853715896606445, + "learning_rate": 1.8445532388579128e-05, + "loss": 1.4247, + "step": 20950 + }, + { + "epoch": 0.2619065476636916, + "grad_norm": 2.223839521408081, + "learning_rate": 1.844506505632188e-05, + "loss": 0.4746, + "step": 20952 + }, + { + "epoch": 0.26193154828870724, + "grad_norm": 1.9584726095199585, + "learning_rate": 1.8444597659748208e-05, + "loss": 0.6483, + "step": 20954 + }, + { + "epoch": 0.26195654891372283, + "grad_norm": 2.741780996322632, + "learning_rate": 1.8444130198861673e-05, + "loss": 0.896, + "step": 20956 + }, + { + "epoch": 0.2619815495387385, + "grad_norm": 6.531975746154785, + "learning_rate": 1.8443662673665842e-05, + "loss": 0.8774, + "step": 20958 + }, + { + "epoch": 0.26200655016375407, + "grad_norm": 3.998225688934326, + "learning_rate": 1.8443195084164266e-05, + "loss": 1.3126, + "step": 20960 + }, + { + "epoch": 0.2620315507887697, + "grad_norm": 3.291705846786499, + "learning_rate": 1.844272743036051e-05, + "loss": 0.5893, + "step": 20962 + }, + { + "epoch": 0.26205655141378537, + "grad_norm": 1.0133451223373413, + "learning_rate": 1.8442259712258136e-05, + "loss": 0.3882, + "step": 20964 + }, + { + "epoch": 0.26208155203880096, + "grad_norm": 5.2118306159973145, + "learning_rate": 1.844179192986071e-05, + "loss": 0.9283, + "step": 20966 + }, + { + "epoch": 0.2621065526638166, + "grad_norm": 3.4135899543762207, + "learning_rate": 1.8441324083171786e-05, + "loss": 1.2113, + "step": 20968 + }, + { + "epoch": 0.2621315532888322, + "grad_norm": 2.242176055908203, + "learning_rate": 1.844085617219493e-05, + "loss": 0.8919, + "step": 20970 + }, + { + "epoch": 0.26215655391384785, + "grad_norm": 3.0523643493652344, + "learning_rate": 1.844038819693371e-05, + "loss": 0.74, + "step": 20972 + }, + { + "epoch": 0.2621815545388635, + "grad_norm": 12.007596015930176, + "learning_rate": 1.843992015739169e-05, + "loss": 1.4213, + "step": 20974 + }, + { + "epoch": 0.2622065551638791, + "grad_norm": 3.5384361743927, + "learning_rate": 1.8439452053572426e-05, + "loss": 0.6405, + "step": 20976 + }, + { + "epoch": 0.26223155578889473, + "grad_norm": 4.338492393493652, + "learning_rate": 1.8438983885479493e-05, + "loss": 1.3582, + "step": 20978 + }, + { + "epoch": 0.2622565564139103, + "grad_norm": 2.72182559967041, + "learning_rate": 1.8438515653116447e-05, + "loss": 0.6543, + "step": 20980 + }, + { + "epoch": 0.26228155703892597, + "grad_norm": 2.9903078079223633, + "learning_rate": 1.8438047356486864e-05, + "loss": 1.2037, + "step": 20982 + }, + { + "epoch": 0.2623065576639416, + "grad_norm": 1.8186839818954468, + "learning_rate": 1.8437578995594304e-05, + "loss": 0.4937, + "step": 20984 + }, + { + "epoch": 0.2623315582889572, + "grad_norm": 3.4511775970458984, + "learning_rate": 1.8437110570442335e-05, + "loss": 1.8569, + "step": 20986 + }, + { + "epoch": 0.26235655891397286, + "grad_norm": 3.228041410446167, + "learning_rate": 1.8436642081034525e-05, + "loss": 0.7984, + "step": 20988 + }, + { + "epoch": 0.26238155953898845, + "grad_norm": 3.589641809463501, + "learning_rate": 1.8436173527374444e-05, + "loss": 1.2409, + "step": 20990 + }, + { + "epoch": 0.2624065601640041, + "grad_norm": 2.529844045639038, + "learning_rate": 1.8435704909465653e-05, + "loss": 1.1302, + "step": 20992 + }, + { + "epoch": 0.26243156078901975, + "grad_norm": 7.165794372558594, + "learning_rate": 1.843523622731173e-05, + "loss": 0.3364, + "step": 20994 + }, + { + "epoch": 0.26245656141403534, + "grad_norm": 0.0038565366994589567, + "learning_rate": 1.8434767480916242e-05, + "loss": 0.2021, + "step": 20996 + }, + { + "epoch": 0.262481562039051, + "grad_norm": 1.239985704421997, + "learning_rate": 1.843429867028276e-05, + "loss": 0.6346, + "step": 20998 + }, + { + "epoch": 0.2625065626640666, + "grad_norm": 3.4852020740509033, + "learning_rate": 1.8433829795414846e-05, + "loss": 1.1536, + "step": 21000 + }, + { + "epoch": 0.2625315632890822, + "grad_norm": 4.124420166015625, + "learning_rate": 1.8433360856316082e-05, + "loss": 0.9115, + "step": 21002 + }, + { + "epoch": 0.2625565639140979, + "grad_norm": 3.5992000102996826, + "learning_rate": 1.8432891852990028e-05, + "loss": 1.5552, + "step": 21004 + }, + { + "epoch": 0.26258156453911347, + "grad_norm": 2.1846776008605957, + "learning_rate": 1.8432422785440266e-05, + "loss": 0.9312, + "step": 21006 + }, + { + "epoch": 0.2626065651641291, + "grad_norm": 2.526333808898926, + "learning_rate": 1.8431953653670366e-05, + "loss": 0.3434, + "step": 21008 + }, + { + "epoch": 0.2626315657891447, + "grad_norm": 3.684783935546875, + "learning_rate": 1.8431484457683896e-05, + "loss": 0.1368, + "step": 21010 + }, + { + "epoch": 0.26265656641416035, + "grad_norm": 4.288950443267822, + "learning_rate": 1.8431015197484435e-05, + "loss": 1.3837, + "step": 21012 + }, + { + "epoch": 0.262681567039176, + "grad_norm": 5.841103553771973, + "learning_rate": 1.8430545873075557e-05, + "loss": 0.8038, + "step": 21014 + }, + { + "epoch": 0.2627065676641916, + "grad_norm": 2.936309576034546, + "learning_rate": 1.843007648446083e-05, + "loss": 1.0699, + "step": 21016 + }, + { + "epoch": 0.26273156828920724, + "grad_norm": 5.475618362426758, + "learning_rate": 1.8429607031643836e-05, + "loss": 1.5734, + "step": 21018 + }, + { + "epoch": 0.26275656891422283, + "grad_norm": 2.5717458724975586, + "learning_rate": 1.8429137514628142e-05, + "loss": 0.4277, + "step": 21020 + }, + { + "epoch": 0.2627815695392385, + "grad_norm": 2.016773223876953, + "learning_rate": 1.842866793341733e-05, + "loss": 0.3465, + "step": 21022 + }, + { + "epoch": 0.2628065701642541, + "grad_norm": 2.45682430267334, + "learning_rate": 1.842819828801498e-05, + "loss": 1.1695, + "step": 21024 + }, + { + "epoch": 0.2628315707892697, + "grad_norm": 0.05801411345601082, + "learning_rate": 1.8427728578424657e-05, + "loss": 0.002, + "step": 21026 + }, + { + "epoch": 0.26285657141428537, + "grad_norm": 4.023478984832764, + "learning_rate": 1.8427258804649947e-05, + "loss": 1.5286, + "step": 21028 + }, + { + "epoch": 0.26288157203930096, + "grad_norm": 2.2648303508758545, + "learning_rate": 1.8426788966694425e-05, + "loss": 0.6942, + "step": 21030 + }, + { + "epoch": 0.2629065726643166, + "grad_norm": 2.717717409133911, + "learning_rate": 1.842631906456167e-05, + "loss": 0.477, + "step": 21032 + }, + { + "epoch": 0.26293157328933225, + "grad_norm": 2.6368441581726074, + "learning_rate": 1.8425849098255263e-05, + "loss": 0.3792, + "step": 21034 + }, + { + "epoch": 0.26295657391434785, + "grad_norm": 2.009145975112915, + "learning_rate": 1.8425379067778778e-05, + "loss": 1.1784, + "step": 21036 + }, + { + "epoch": 0.2629815745393635, + "grad_norm": 4.231860637664795, + "learning_rate": 1.8424908973135803e-05, + "loss": 1.0396, + "step": 21038 + }, + { + "epoch": 0.2630065751643791, + "grad_norm": 5.561961650848389, + "learning_rate": 1.8424438814329906e-05, + "loss": 1.7348, + "step": 21040 + }, + { + "epoch": 0.26303157578939473, + "grad_norm": 3.229924440383911, + "learning_rate": 1.842396859136468e-05, + "loss": 1.2995, + "step": 21042 + }, + { + "epoch": 0.2630565764144104, + "grad_norm": 4.602447509765625, + "learning_rate": 1.8423498304243693e-05, + "loss": 1.905, + "step": 21044 + }, + { + "epoch": 0.263081577039426, + "grad_norm": 1.2862263917922974, + "learning_rate": 1.842302795297054e-05, + "loss": 0.0983, + "step": 21046 + }, + { + "epoch": 0.2631065776644416, + "grad_norm": 3.9421305656433105, + "learning_rate": 1.8422557537548797e-05, + "loss": 0.8968, + "step": 21048 + }, + { + "epoch": 0.2631315782894572, + "grad_norm": 0.8895215392112732, + "learning_rate": 1.8422087057982047e-05, + "loss": 0.6223, + "step": 21050 + }, + { + "epoch": 0.26315657891447286, + "grad_norm": 3.4357762336730957, + "learning_rate": 1.842161651427387e-05, + "loss": 0.4326, + "step": 21052 + }, + { + "epoch": 0.2631815795394885, + "grad_norm": 2.543018102645874, + "learning_rate": 1.8421145906427855e-05, + "loss": 0.439, + "step": 21054 + }, + { + "epoch": 0.2632065801645041, + "grad_norm": 3.4608852863311768, + "learning_rate": 1.842067523444758e-05, + "loss": 1.3184, + "step": 21056 + }, + { + "epoch": 0.26323158078951975, + "grad_norm": 0.0023115018848329782, + "learning_rate": 1.8420204498336637e-05, + "loss": 0.5139, + "step": 21058 + }, + { + "epoch": 0.26325658141453534, + "grad_norm": 3.617258071899414, + "learning_rate": 1.8419733698098605e-05, + "loss": 0.0206, + "step": 21060 + }, + { + "epoch": 0.263281582039551, + "grad_norm": 6.447940826416016, + "learning_rate": 1.8419262833737076e-05, + "loss": 0.906, + "step": 21062 + }, + { + "epoch": 0.26330658266456664, + "grad_norm": 0.5426695346832275, + "learning_rate": 1.8418791905255626e-05, + "loss": 0.876, + "step": 21064 + }, + { + "epoch": 0.2633315832895822, + "grad_norm": 4.5597028732299805, + "learning_rate": 1.8418320912657852e-05, + "loss": 1.1298, + "step": 21066 + }, + { + "epoch": 0.2633565839145979, + "grad_norm": 0.0026761421468108892, + "learning_rate": 1.8417849855947335e-05, + "loss": 0.8862, + "step": 21068 + }, + { + "epoch": 0.26338158453961347, + "grad_norm": 3.0741231441497803, + "learning_rate": 1.8417378735127664e-05, + "loss": 1.0963, + "step": 21070 + }, + { + "epoch": 0.2634065851646291, + "grad_norm": 0.004206355195492506, + "learning_rate": 1.8416907550202427e-05, + "loss": 0.0002, + "step": 21072 + }, + { + "epoch": 0.26343158578964476, + "grad_norm": 2.645333766937256, + "learning_rate": 1.841643630117521e-05, + "loss": 0.653, + "step": 21074 + }, + { + "epoch": 0.26345658641466035, + "grad_norm": 7.183883190155029, + "learning_rate": 1.8415964988049607e-05, + "loss": 1.6358, + "step": 21076 + }, + { + "epoch": 0.263481587039676, + "grad_norm": 3.1905136108398438, + "learning_rate": 1.84154936108292e-05, + "loss": 1.2574, + "step": 21078 + }, + { + "epoch": 0.2635065876646916, + "grad_norm": 3.2966372966766357, + "learning_rate": 1.841502216951759e-05, + "loss": 0.7472, + "step": 21080 + }, + { + "epoch": 0.26353158828970724, + "grad_norm": 3.212942600250244, + "learning_rate": 1.841455066411836e-05, + "loss": 1.369, + "step": 21082 + }, + { + "epoch": 0.2635565889147229, + "grad_norm": 1.606210470199585, + "learning_rate": 1.84140790946351e-05, + "loss": 0.1252, + "step": 21084 + }, + { + "epoch": 0.2635815895397385, + "grad_norm": 3.7933382987976074, + "learning_rate": 1.8413607461071406e-05, + "loss": 0.4456, + "step": 21086 + }, + { + "epoch": 0.26360659016475413, + "grad_norm": 5.628840923309326, + "learning_rate": 1.8413135763430864e-05, + "loss": 1.0991, + "step": 21088 + }, + { + "epoch": 0.2636315907897697, + "grad_norm": 2.818437099456787, + "learning_rate": 1.841266400171707e-05, + "loss": 1.6606, + "step": 21090 + }, + { + "epoch": 0.26365659141478537, + "grad_norm": 1.9309642314910889, + "learning_rate": 1.841219217593362e-05, + "loss": 0.2345, + "step": 21092 + }, + { + "epoch": 0.263681592039801, + "grad_norm": 0.00289534917101264, + "learning_rate": 1.8411720286084104e-05, + "loss": 0.2704, + "step": 21094 + }, + { + "epoch": 0.2637065926648166, + "grad_norm": 2.7642273902893066, + "learning_rate": 1.841124833217211e-05, + "loss": 0.9845, + "step": 21096 + }, + { + "epoch": 0.26373159328983226, + "grad_norm": 5.025702953338623, + "learning_rate": 1.8410776314201245e-05, + "loss": 0.9279, + "step": 21098 + }, + { + "epoch": 0.26375659391484785, + "grad_norm": 1.8858535289764404, + "learning_rate": 1.8410304232175092e-05, + "loss": 0.6256, + "step": 21100 + }, + { + "epoch": 0.2637815945398635, + "grad_norm": 0.0021669811103492975, + "learning_rate": 1.8409832086097256e-05, + "loss": 0.0252, + "step": 21102 + }, + { + "epoch": 0.26380659516487914, + "grad_norm": 4.797242164611816, + "learning_rate": 1.840935987597132e-05, + "loss": 1.6032, + "step": 21104 + }, + { + "epoch": 0.26383159578989474, + "grad_norm": 5.100276947021484, + "learning_rate": 1.8408887601800897e-05, + "loss": 1.1875, + "step": 21106 + }, + { + "epoch": 0.2638565964149104, + "grad_norm": 8.52053165435791, + "learning_rate": 1.840841526358957e-05, + "loss": 1.2313, + "step": 21108 + }, + { + "epoch": 0.263881597039926, + "grad_norm": 2.067626953125, + "learning_rate": 1.8407942861340945e-05, + "loss": 0.9107, + "step": 21110 + }, + { + "epoch": 0.2639065976649416, + "grad_norm": 5.510507583618164, + "learning_rate": 1.8407470395058614e-05, + "loss": 1.8659, + "step": 21112 + }, + { + "epoch": 0.26393159828995727, + "grad_norm": 3.0036959648132324, + "learning_rate": 1.840699786474618e-05, + "loss": 1.3737, + "step": 21114 + }, + { + "epoch": 0.26395659891497286, + "grad_norm": 3.4296956062316895, + "learning_rate": 1.8406525270407238e-05, + "loss": 0.8878, + "step": 21116 + }, + { + "epoch": 0.2639815995399885, + "grad_norm": 0.39427265524864197, + "learning_rate": 1.8406052612045386e-05, + "loss": 0.0039, + "step": 21118 + }, + { + "epoch": 0.2640066001650041, + "grad_norm": 5.050327777862549, + "learning_rate": 1.8405579889664225e-05, + "loss": 0.839, + "step": 21120 + }, + { + "epoch": 0.26403160079001975, + "grad_norm": 2.1213159561157227, + "learning_rate": 1.840510710326736e-05, + "loss": 0.8172, + "step": 21122 + }, + { + "epoch": 0.2640566014150354, + "grad_norm": 3.6500251293182373, + "learning_rate": 1.8404634252858386e-05, + "loss": 1.1039, + "step": 21124 + }, + { + "epoch": 0.264081602040051, + "grad_norm": 3.748779296875, + "learning_rate": 1.8404161338440907e-05, + "loss": 0.9686, + "step": 21126 + }, + { + "epoch": 0.26410660266506664, + "grad_norm": 0.002585270442068577, + "learning_rate": 1.8403688360018522e-05, + "loss": 0.7633, + "step": 21128 + }, + { + "epoch": 0.26413160329008223, + "grad_norm": 1.8186006546020508, + "learning_rate": 1.8403215317594835e-05, + "loss": 0.9338, + "step": 21130 + }, + { + "epoch": 0.2641566039150979, + "grad_norm": 0.0014839534414932132, + "learning_rate": 1.840274221117345e-05, + "loss": 0.6785, + "step": 21132 + }, + { + "epoch": 0.2641816045401135, + "grad_norm": 3.946655511856079, + "learning_rate": 1.840226904075797e-05, + "loss": 1.3464, + "step": 21134 + }, + { + "epoch": 0.2642066051651291, + "grad_norm": 1.4740614891052246, + "learning_rate": 1.8401795806351995e-05, + "loss": 0.6152, + "step": 21136 + }, + { + "epoch": 0.26423160579014476, + "grad_norm": 2.4667985439300537, + "learning_rate": 1.8401322507959133e-05, + "loss": 0.7339, + "step": 21138 + }, + { + "epoch": 0.26425660641516036, + "grad_norm": 3.164121389389038, + "learning_rate": 1.840084914558298e-05, + "loss": 1.0313, + "step": 21140 + }, + { + "epoch": 0.264281607040176, + "grad_norm": 2.140854597091675, + "learning_rate": 1.8400375719227157e-05, + "loss": 0.0712, + "step": 21142 + }, + { + "epoch": 0.26430660766519165, + "grad_norm": 3.8604979515075684, + "learning_rate": 1.8399902228895257e-05, + "loss": 1.3193, + "step": 21144 + }, + { + "epoch": 0.26433160829020724, + "grad_norm": 2.9104466438293457, + "learning_rate": 1.839942867459089e-05, + "loss": 1.1001, + "step": 21146 + }, + { + "epoch": 0.2643566089152229, + "grad_norm": 0.0027009020559489727, + "learning_rate": 1.839895505631766e-05, + "loss": 0.7236, + "step": 21148 + }, + { + "epoch": 0.2643816095402385, + "grad_norm": 5.471864700317383, + "learning_rate": 1.8398481374079177e-05, + "loss": 0.6019, + "step": 21150 + }, + { + "epoch": 0.26440661016525413, + "grad_norm": 3.633151054382324, + "learning_rate": 1.8398007627879047e-05, + "loss": 0.6387, + "step": 21152 + }, + { + "epoch": 0.2644316107902698, + "grad_norm": 1.7873347997665405, + "learning_rate": 1.839753381772088e-05, + "loss": 0.1085, + "step": 21154 + }, + { + "epoch": 0.26445661141528537, + "grad_norm": 3.9397542476654053, + "learning_rate": 1.839705994360828e-05, + "loss": 0.723, + "step": 21156 + }, + { + "epoch": 0.264481612040301, + "grad_norm": 1.9269204139709473, + "learning_rate": 1.8396586005544862e-05, + "loss": 1.0978, + "step": 21158 + }, + { + "epoch": 0.2645066126653166, + "grad_norm": 3.3477225303649902, + "learning_rate": 1.8396112003534233e-05, + "loss": 0.6743, + "step": 21160 + }, + { + "epoch": 0.26453161329033226, + "grad_norm": 7.526628017425537, + "learning_rate": 1.839563793758e-05, + "loss": 1.2416, + "step": 21162 + }, + { + "epoch": 0.2645566139153479, + "grad_norm": 3.432875871658325, + "learning_rate": 1.8395163807685775e-05, + "loss": 0.6989, + "step": 21164 + }, + { + "epoch": 0.2645816145403635, + "grad_norm": 0.0027472921647131443, + "learning_rate": 1.8394689613855174e-05, + "loss": 0.6881, + "step": 21166 + }, + { + "epoch": 0.26460661516537914, + "grad_norm": 0.09453659504652023, + "learning_rate": 1.83942153560918e-05, + "loss": 0.2767, + "step": 21168 + }, + { + "epoch": 0.26463161579039474, + "grad_norm": 4.183764934539795, + "learning_rate": 1.8393741034399267e-05, + "loss": 1.0825, + "step": 21170 + }, + { + "epoch": 0.2646566164154104, + "grad_norm": 0.012403902597725391, + "learning_rate": 1.8393266648781195e-05, + "loss": 0.0004, + "step": 21172 + }, + { + "epoch": 0.26468161704042603, + "grad_norm": 6.561824798583984, + "learning_rate": 1.839279219924119e-05, + "loss": 0.7361, + "step": 21174 + }, + { + "epoch": 0.2647066176654416, + "grad_norm": 2.9493906497955322, + "learning_rate": 1.8392317685782865e-05, + "loss": 0.736, + "step": 21176 + }, + { + "epoch": 0.26473161829045727, + "grad_norm": 3.3251101970672607, + "learning_rate": 1.8391843108409833e-05, + "loss": 0.2499, + "step": 21178 + }, + { + "epoch": 0.26475661891547286, + "grad_norm": 3.191457748413086, + "learning_rate": 1.839136846712571e-05, + "loss": 1.734, + "step": 21180 + }, + { + "epoch": 0.2647816195404885, + "grad_norm": 4.712528705596924, + "learning_rate": 1.8390893761934117e-05, + "loss": 1.0255, + "step": 21182 + }, + { + "epoch": 0.26480662016550416, + "grad_norm": 2.6173861026763916, + "learning_rate": 1.839041899283866e-05, + "loss": 1.5606, + "step": 21184 + }, + { + "epoch": 0.26483162079051975, + "grad_norm": 0.0018517518183216453, + "learning_rate": 1.838994415984296e-05, + "loss": 0.3844, + "step": 21186 + }, + { + "epoch": 0.2648566214155354, + "grad_norm": 0.5614980459213257, + "learning_rate": 1.838946926295063e-05, + "loss": 0.5213, + "step": 21188 + }, + { + "epoch": 0.264881622040551, + "grad_norm": 5.636438369750977, + "learning_rate": 1.8388994302165292e-05, + "loss": 0.9457, + "step": 21190 + }, + { + "epoch": 0.26490662266556664, + "grad_norm": 3.2293143272399902, + "learning_rate": 1.8388519277490555e-05, + "loss": 1.2801, + "step": 21192 + }, + { + "epoch": 0.2649316232905823, + "grad_norm": 3.976243019104004, + "learning_rate": 1.8388044188930046e-05, + "loss": 0.6221, + "step": 21194 + }, + { + "epoch": 0.2649566239155979, + "grad_norm": 6.127871513366699, + "learning_rate": 1.8387569036487372e-05, + "loss": 0.7189, + "step": 21196 + }, + { + "epoch": 0.2649816245406135, + "grad_norm": 0.001634459593333304, + "learning_rate": 1.8387093820166163e-05, + "loss": 0.4207, + "step": 21198 + }, + { + "epoch": 0.2650066251656291, + "grad_norm": 2.617138385772705, + "learning_rate": 1.8386618539970036e-05, + "loss": 0.6959, + "step": 21200 + }, + { + "epoch": 0.26503162579064476, + "grad_norm": 0.06320769339799881, + "learning_rate": 1.8386143195902604e-05, + "loss": 0.0017, + "step": 21202 + }, + { + "epoch": 0.2650566264156604, + "grad_norm": 3.3799989223480225, + "learning_rate": 1.8385667787967492e-05, + "loss": 1.0222, + "step": 21204 + }, + { + "epoch": 0.265081627040676, + "grad_norm": 2.4092774391174316, + "learning_rate": 1.8385192316168322e-05, + "loss": 0.9087, + "step": 21206 + }, + { + "epoch": 0.26510662766569165, + "grad_norm": 0.002543705515563488, + "learning_rate": 1.838471678050871e-05, + "loss": 0.016, + "step": 21208 + }, + { + "epoch": 0.26513162829070724, + "grad_norm": 6.333165168762207, + "learning_rate": 1.838424118099228e-05, + "loss": 2.5264, + "step": 21210 + }, + { + "epoch": 0.2651566289157229, + "grad_norm": 5.4576005935668945, + "learning_rate": 1.838376551762266e-05, + "loss": 1.3978, + "step": 21212 + }, + { + "epoch": 0.26518162954073854, + "grad_norm": 7.6134514808654785, + "learning_rate": 1.8383289790403466e-05, + "loss": 1.3409, + "step": 21214 + }, + { + "epoch": 0.26520663016575413, + "grad_norm": 3.449867010116577, + "learning_rate": 1.838281399933832e-05, + "loss": 1.6083, + "step": 21216 + }, + { + "epoch": 0.2652316307907698, + "grad_norm": 0.001572385779581964, + "learning_rate": 1.838233814443085e-05, + "loss": 1.0456, + "step": 21218 + }, + { + "epoch": 0.26525663141578537, + "grad_norm": 4.299172401428223, + "learning_rate": 1.8381862225684675e-05, + "loss": 0.9392, + "step": 21220 + }, + { + "epoch": 0.265281632040801, + "grad_norm": 4.582866668701172, + "learning_rate": 1.8381386243103424e-05, + "loss": 1.6909, + "step": 21222 + }, + { + "epoch": 0.26530663266581667, + "grad_norm": 0.002941576763987541, + "learning_rate": 1.8380910196690723e-05, + "loss": 0.3001, + "step": 21224 + }, + { + "epoch": 0.26533163329083226, + "grad_norm": 8.698057174682617, + "learning_rate": 1.8380434086450193e-05, + "loss": 0.6037, + "step": 21226 + }, + { + "epoch": 0.2653566339158479, + "grad_norm": 2.766207456588745, + "learning_rate": 1.8379957912385462e-05, + "loss": 0.2883, + "step": 21228 + }, + { + "epoch": 0.2653816345408635, + "grad_norm": 2.2110795974731445, + "learning_rate": 1.8379481674500156e-05, + "loss": 1.1441, + "step": 21230 + }, + { + "epoch": 0.26540663516587915, + "grad_norm": 3.7328896522521973, + "learning_rate": 1.8379005372797904e-05, + "loss": 0.502, + "step": 21232 + }, + { + "epoch": 0.2654316357908948, + "grad_norm": 0.002424628473818302, + "learning_rate": 1.837852900728233e-05, + "loss": 0.2278, + "step": 21234 + }, + { + "epoch": 0.2654566364159104, + "grad_norm": 4.584891319274902, + "learning_rate": 1.8378052577957066e-05, + "loss": 0.5905, + "step": 21236 + }, + { + "epoch": 0.26548163704092603, + "grad_norm": 1.3350796699523926, + "learning_rate": 1.837757608482574e-05, + "loss": 0.1599, + "step": 21238 + }, + { + "epoch": 0.2655066376659416, + "grad_norm": 3.5220413208007812, + "learning_rate": 1.8377099527891978e-05, + "loss": 1.3385, + "step": 21240 + }, + { + "epoch": 0.2655316382909573, + "grad_norm": 0.012576154433190823, + "learning_rate": 1.8376622907159408e-05, + "loss": 0.5439, + "step": 21242 + }, + { + "epoch": 0.2655566389159729, + "grad_norm": 0.09766247123479843, + "learning_rate": 1.8376146222631664e-05, + "loss": 0.4752, + "step": 21244 + }, + { + "epoch": 0.2655816395409885, + "grad_norm": 4.7150983810424805, + "learning_rate": 1.8375669474312375e-05, + "loss": 1.8465, + "step": 21246 + }, + { + "epoch": 0.26560664016600416, + "grad_norm": 3.327509641647339, + "learning_rate": 1.8375192662205172e-05, + "loss": 1.4484, + "step": 21248 + }, + { + "epoch": 0.26563164079101975, + "grad_norm": 1.3451141119003296, + "learning_rate": 1.8374715786313685e-05, + "loss": 2.1872, + "step": 21250 + }, + { + "epoch": 0.2656566414160354, + "grad_norm": 3.978407859802246, + "learning_rate": 1.837423884664155e-05, + "loss": 0.9675, + "step": 21252 + }, + { + "epoch": 0.26568164204105105, + "grad_norm": 1.713996410369873, + "learning_rate": 1.837376184319239e-05, + "loss": 0.7118, + "step": 21254 + }, + { + "epoch": 0.26570664266606664, + "grad_norm": 3.1111559867858887, + "learning_rate": 1.837328477596985e-05, + "loss": 1.3974, + "step": 21256 + }, + { + "epoch": 0.2657316432910823, + "grad_norm": 3.0829527378082275, + "learning_rate": 1.8372807644977555e-05, + "loss": 0.1555, + "step": 21258 + }, + { + "epoch": 0.2657566439160979, + "grad_norm": 3.2841196060180664, + "learning_rate": 1.8372330450219142e-05, + "loss": 0.3776, + "step": 21260 + }, + { + "epoch": 0.2657816445411135, + "grad_norm": 3.264230728149414, + "learning_rate": 1.8371853191698244e-05, + "loss": 1.5618, + "step": 21262 + }, + { + "epoch": 0.2658066451661292, + "grad_norm": 2.982574701309204, + "learning_rate": 1.8371375869418493e-05, + "loss": 1.3036, + "step": 21264 + }, + { + "epoch": 0.26583164579114477, + "grad_norm": 3.4506912231445312, + "learning_rate": 1.837089848338353e-05, + "loss": 1.8455, + "step": 21266 + }, + { + "epoch": 0.2658566464161604, + "grad_norm": 2.6546006202697754, + "learning_rate": 1.8370421033596986e-05, + "loss": 0.956, + "step": 21268 + }, + { + "epoch": 0.265881647041176, + "grad_norm": 5.537849426269531, + "learning_rate": 1.83699435200625e-05, + "loss": 0.7567, + "step": 21270 + }, + { + "epoch": 0.26590664766619165, + "grad_norm": 0.008659588173031807, + "learning_rate": 1.8369465942783708e-05, + "loss": 0.3012, + "step": 21272 + }, + { + "epoch": 0.2659316482912073, + "grad_norm": 3.7197933197021484, + "learning_rate": 1.836898830176425e-05, + "loss": 1.1531, + "step": 21274 + }, + { + "epoch": 0.2659566489162229, + "grad_norm": 3.5649282932281494, + "learning_rate": 1.8368510597007757e-05, + "loss": 1.0645, + "step": 21276 + }, + { + "epoch": 0.26598164954123854, + "grad_norm": 0.001827624742873013, + "learning_rate": 1.8368032828517873e-05, + "loss": 0.8412, + "step": 21278 + }, + { + "epoch": 0.26600665016625413, + "grad_norm": 2.5709781646728516, + "learning_rate": 1.8367554996298233e-05, + "loss": 1.4026, + "step": 21280 + }, + { + "epoch": 0.2660316507912698, + "grad_norm": 0.010934390127658844, + "learning_rate": 1.8367077100352475e-05, + "loss": 0.0778, + "step": 21282 + }, + { + "epoch": 0.26605665141628543, + "grad_norm": 3.2279739379882812, + "learning_rate": 1.8366599140684243e-05, + "loss": 0.7421, + "step": 21284 + }, + { + "epoch": 0.266081652041301, + "grad_norm": 0.0025531058199703693, + "learning_rate": 1.8366121117297174e-05, + "loss": 1.2061, + "step": 21286 + }, + { + "epoch": 0.26610665266631667, + "grad_norm": 1.942981243133545, + "learning_rate": 1.8365643030194912e-05, + "loss": 0.3585, + "step": 21288 + }, + { + "epoch": 0.26613165329133226, + "grad_norm": 1.8048288822174072, + "learning_rate": 1.8365164879381095e-05, + "loss": 0.7742, + "step": 21290 + }, + { + "epoch": 0.2661566539163479, + "grad_norm": 0.002969704568386078, + "learning_rate": 1.8364686664859364e-05, + "loss": 0.0001, + "step": 21292 + }, + { + "epoch": 0.26618165454136355, + "grad_norm": 3.3302252292633057, + "learning_rate": 1.836420838663336e-05, + "loss": 0.9065, + "step": 21294 + }, + { + "epoch": 0.26620665516637915, + "grad_norm": 2.543253183364868, + "learning_rate": 1.8363730044706735e-05, + "loss": 1.0297, + "step": 21296 + }, + { + "epoch": 0.2662316557913948, + "grad_norm": 5.665989875793457, + "learning_rate": 1.836325163908312e-05, + "loss": 0.5035, + "step": 21298 + }, + { + "epoch": 0.2662566564164104, + "grad_norm": 2.265645742416382, + "learning_rate": 1.8362773169766164e-05, + "loss": 0.8405, + "step": 21300 + }, + { + "epoch": 0.26628165704142603, + "grad_norm": 1.9401923418045044, + "learning_rate": 1.836229463675951e-05, + "loss": 0.307, + "step": 21302 + }, + { + "epoch": 0.2663066576664417, + "grad_norm": 7.844910621643066, + "learning_rate": 1.83618160400668e-05, + "loss": 1.0092, + "step": 21304 + }, + { + "epoch": 0.2663316582914573, + "grad_norm": 2.367825984954834, + "learning_rate": 1.8361337379691685e-05, + "loss": 2.223, + "step": 21306 + }, + { + "epoch": 0.2663566589164729, + "grad_norm": 1.0891071557998657, + "learning_rate": 1.8360858655637807e-05, + "loss": 0.6455, + "step": 21308 + }, + { + "epoch": 0.2663816595414885, + "grad_norm": 0.0021841370034962893, + "learning_rate": 1.836037986790881e-05, + "loss": 0.4591, + "step": 21310 + }, + { + "epoch": 0.26640666016650416, + "grad_norm": 1.3469120264053345, + "learning_rate": 1.8359901016508343e-05, + "loss": 0.0357, + "step": 21312 + }, + { + "epoch": 0.2664316607915198, + "grad_norm": 0.6816152930259705, + "learning_rate": 1.8359422101440052e-05, + "loss": 0.0179, + "step": 21314 + }, + { + "epoch": 0.2664566614165354, + "grad_norm": 2.07466721534729, + "learning_rate": 1.8358943122707585e-05, + "loss": 0.0899, + "step": 21316 + }, + { + "epoch": 0.26648166204155105, + "grad_norm": 5.163802146911621, + "learning_rate": 1.8358464080314585e-05, + "loss": 1.8697, + "step": 21318 + }, + { + "epoch": 0.26650666266656664, + "grad_norm": 0.004028677940368652, + "learning_rate": 1.835798497426471e-05, + "loss": 0.0002, + "step": 21320 + }, + { + "epoch": 0.2665316632915823, + "grad_norm": 3.276939630508423, + "learning_rate": 1.8357505804561603e-05, + "loss": 1.3184, + "step": 21322 + }, + { + "epoch": 0.26655666391659794, + "grad_norm": 3.760782480239868, + "learning_rate": 1.835702657120891e-05, + "loss": 0.9203, + "step": 21324 + }, + { + "epoch": 0.26658166454161353, + "grad_norm": 1.9658701419830322, + "learning_rate": 1.835654727421029e-05, + "loss": 0.9074, + "step": 21326 + }, + { + "epoch": 0.2666066651666292, + "grad_norm": 7.445523262023926, + "learning_rate": 1.8356067913569383e-05, + "loss": 0.2811, + "step": 21328 + }, + { + "epoch": 0.26663166579164477, + "grad_norm": 2.0524954795837402, + "learning_rate": 1.8355588489289844e-05, + "loss": 1.869, + "step": 21330 + }, + { + "epoch": 0.2666566664166604, + "grad_norm": 0.0041433945298194885, + "learning_rate": 1.8355109001375325e-05, + "loss": 0.0346, + "step": 21332 + }, + { + "epoch": 0.26668166704167606, + "grad_norm": 4.764466285705566, + "learning_rate": 1.835462944982948e-05, + "loss": 1.5903, + "step": 21334 + }, + { + "epoch": 0.26670666766669165, + "grad_norm": 4.3777899742126465, + "learning_rate": 1.8354149834655958e-05, + "loss": 1.0433, + "step": 21336 + }, + { + "epoch": 0.2667316682917073, + "grad_norm": 0.8517720103263855, + "learning_rate": 1.835367015585841e-05, + "loss": 0.4124, + "step": 21338 + }, + { + "epoch": 0.2667566689167229, + "grad_norm": 6.810955047607422, + "learning_rate": 1.8353190413440493e-05, + "loss": 1.6328, + "step": 21340 + }, + { + "epoch": 0.26678166954173854, + "grad_norm": 1.9655792713165283, + "learning_rate": 1.8352710607405858e-05, + "loss": 0.0826, + "step": 21342 + }, + { + "epoch": 0.2668066701667542, + "grad_norm": 6.729326248168945, + "learning_rate": 1.8352230737758164e-05, + "loss": 0.7126, + "step": 21344 + }, + { + "epoch": 0.2668316707917698, + "grad_norm": 0.0054574147798120975, + "learning_rate": 1.8351750804501057e-05, + "loss": 1.289, + "step": 21346 + }, + { + "epoch": 0.26685667141678543, + "grad_norm": 3.5569324493408203, + "learning_rate": 1.83512708076382e-05, + "loss": 2.167, + "step": 21348 + }, + { + "epoch": 0.266881672041801, + "grad_norm": 3.9544506072998047, + "learning_rate": 1.8350790747173245e-05, + "loss": 0.8958, + "step": 21350 + }, + { + "epoch": 0.26690667266681667, + "grad_norm": 0.3997102379798889, + "learning_rate": 1.8350310623109847e-05, + "loss": 0.8072, + "step": 21352 + }, + { + "epoch": 0.2669316732918323, + "grad_norm": 1.7544467449188232, + "learning_rate": 1.8349830435451666e-05, + "loss": 0.3878, + "step": 21354 + }, + { + "epoch": 0.2669566739168479, + "grad_norm": 3.4675021171569824, + "learning_rate": 1.8349350184202355e-05, + "loss": 1.5705, + "step": 21356 + }, + { + "epoch": 0.26698167454186356, + "grad_norm": 2.914132595062256, + "learning_rate": 1.8348869869365576e-05, + "loss": 0.4663, + "step": 21358 + }, + { + "epoch": 0.26700667516687915, + "grad_norm": 6.53049898147583, + "learning_rate": 1.834838949094498e-05, + "loss": 2.6134, + "step": 21360 + }, + { + "epoch": 0.2670316757918948, + "grad_norm": 9.476529121398926, + "learning_rate": 1.8347909048944236e-05, + "loss": 1.4903, + "step": 21362 + }, + { + "epoch": 0.26705667641691044, + "grad_norm": 2.676006317138672, + "learning_rate": 1.8347428543366997e-05, + "loss": 1.981, + "step": 21364 + }, + { + "epoch": 0.26708167704192604, + "grad_norm": 3.8623976707458496, + "learning_rate": 1.8346947974216917e-05, + "loss": 1.609, + "step": 21366 + }, + { + "epoch": 0.2671066776669417, + "grad_norm": 5.686813831329346, + "learning_rate": 1.8346467341497668e-05, + "loss": 0.4308, + "step": 21368 + }, + { + "epoch": 0.2671316782919573, + "grad_norm": 0.07021289318799973, + "learning_rate": 1.8345986645212903e-05, + "loss": 0.1135, + "step": 21370 + }, + { + "epoch": 0.2671566789169729, + "grad_norm": 1.2820792198181152, + "learning_rate": 1.834550588536628e-05, + "loss": 0.529, + "step": 21372 + }, + { + "epoch": 0.26718167954198857, + "grad_norm": 2.3793532848358154, + "learning_rate": 1.8345025061961466e-05, + "loss": 1.1213, + "step": 21374 + }, + { + "epoch": 0.26720668016700416, + "grad_norm": 2.361470937728882, + "learning_rate": 1.8344544175002124e-05, + "loss": 0.4253, + "step": 21376 + }, + { + "epoch": 0.2672316807920198, + "grad_norm": 2.238837242126465, + "learning_rate": 1.834406322449191e-05, + "loss": 0.903, + "step": 21378 + }, + { + "epoch": 0.2672566814170354, + "grad_norm": 3.850369453430176, + "learning_rate": 1.8343582210434493e-05, + "loss": 0.7767, + "step": 21380 + }, + { + "epoch": 0.26728168204205105, + "grad_norm": 0.0040670121088624, + "learning_rate": 1.834310113283353e-05, + "loss": 0.8523, + "step": 21382 + }, + { + "epoch": 0.2673066826670667, + "grad_norm": 4.4955925941467285, + "learning_rate": 1.834261999169269e-05, + "loss": 0.6556, + "step": 21384 + }, + { + "epoch": 0.2673316832920823, + "grad_norm": 3.648254871368408, + "learning_rate": 1.834213878701564e-05, + "loss": 1.1731, + "step": 21386 + }, + { + "epoch": 0.26735668391709794, + "grad_norm": 4.519613265991211, + "learning_rate": 1.8341657518806037e-05, + "loss": 0.6313, + "step": 21388 + }, + { + "epoch": 0.26738168454211353, + "grad_norm": 21.152565002441406, + "learning_rate": 1.8341176187067553e-05, + "loss": 0.8546, + "step": 21390 + }, + { + "epoch": 0.2674066851671292, + "grad_norm": 3.2106971740722656, + "learning_rate": 1.8340694791803847e-05, + "loss": 0.8645, + "step": 21392 + }, + { + "epoch": 0.2674316857921448, + "grad_norm": 1.8636951446533203, + "learning_rate": 1.834021333301859e-05, + "loss": 0.6183, + "step": 21394 + }, + { + "epoch": 0.2674566864171604, + "grad_norm": 4.458049297332764, + "learning_rate": 1.833973181071545e-05, + "loss": 2.7048, + "step": 21396 + }, + { + "epoch": 0.26748168704217606, + "grad_norm": 0.1137775182723999, + "learning_rate": 1.8339250224898085e-05, + "loss": 0.9237, + "step": 21398 + }, + { + "epoch": 0.26750668766719166, + "grad_norm": 1.077541470527649, + "learning_rate": 1.8338768575570177e-05, + "loss": 0.8586, + "step": 21400 + }, + { + "epoch": 0.2675316882922073, + "grad_norm": 3.5524961948394775, + "learning_rate": 1.8338286862735385e-05, + "loss": 0.7452, + "step": 21402 + }, + { + "epoch": 0.26755668891722295, + "grad_norm": 4.625899314880371, + "learning_rate": 1.8337805086397377e-05, + "loss": 0.7476, + "step": 21404 + }, + { + "epoch": 0.26758168954223854, + "grad_norm": 6.28200626373291, + "learning_rate": 1.8337323246559827e-05, + "loss": 1.7299, + "step": 21406 + }, + { + "epoch": 0.2676066901672542, + "grad_norm": 13.750629425048828, + "learning_rate": 1.8336841343226403e-05, + "loss": 1.629, + "step": 21408 + }, + { + "epoch": 0.2676316907922698, + "grad_norm": 2.1047439575195312, + "learning_rate": 1.8336359376400768e-05, + "loss": 0.4532, + "step": 21410 + }, + { + "epoch": 0.26765669141728543, + "grad_norm": 2.9534614086151123, + "learning_rate": 1.8335877346086606e-05, + "loss": 0.4229, + "step": 21412 + }, + { + "epoch": 0.2676816920423011, + "grad_norm": 3.893333673477173, + "learning_rate": 1.833539525228758e-05, + "loss": 0.9505, + "step": 21414 + }, + { + "epoch": 0.26770669266731667, + "grad_norm": 2.4421074390411377, + "learning_rate": 1.8334913095007357e-05, + "loss": 0.6202, + "step": 21416 + }, + { + "epoch": 0.2677316932923323, + "grad_norm": 2.044511318206787, + "learning_rate": 1.833443087424962e-05, + "loss": 0.921, + "step": 21418 + }, + { + "epoch": 0.2677566939173479, + "grad_norm": 3.583855152130127, + "learning_rate": 1.8333948590018034e-05, + "loss": 0.9618, + "step": 21420 + }, + { + "epoch": 0.26778169454236356, + "grad_norm": 0.8896406888961792, + "learning_rate": 1.8333466242316275e-05, + "loss": 1.3053, + "step": 21422 + }, + { + "epoch": 0.2678066951673792, + "grad_norm": 3.3164713382720947, + "learning_rate": 1.8332983831148016e-05, + "loss": 1.7751, + "step": 21424 + }, + { + "epoch": 0.2678316957923948, + "grad_norm": 2.5178098678588867, + "learning_rate": 1.833250135651693e-05, + "loss": 0.4943, + "step": 21426 + }, + { + "epoch": 0.26785669641741044, + "grad_norm": 2.9651458263397217, + "learning_rate": 1.8332018818426692e-05, + "loss": 1.626, + "step": 21428 + }, + { + "epoch": 0.26788169704242604, + "grad_norm": 4.249985218048096, + "learning_rate": 1.8331536216880974e-05, + "loss": 1.6577, + "step": 21430 + }, + { + "epoch": 0.2679066976674417, + "grad_norm": 4.749431133270264, + "learning_rate": 1.833105355188346e-05, + "loss": 1.7017, + "step": 21432 + }, + { + "epoch": 0.26793169829245733, + "grad_norm": 3.50238037109375, + "learning_rate": 1.8330570823437813e-05, + "loss": 1.1791, + "step": 21434 + }, + { + "epoch": 0.2679566989174729, + "grad_norm": 0.6056642532348633, + "learning_rate": 1.8330088031547725e-05, + "loss": 0.9649, + "step": 21436 + }, + { + "epoch": 0.26798169954248857, + "grad_norm": 1.1505414247512817, + "learning_rate": 1.832960517621686e-05, + "loss": 0.8929, + "step": 21438 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 3.5299179553985596, + "learning_rate": 1.83291222574489e-05, + "loss": 0.6605, + "step": 21440 + }, + { + "epoch": 0.2680317007925198, + "grad_norm": 2.75168776512146, + "learning_rate": 1.8328639275247523e-05, + "loss": 0.1547, + "step": 21442 + }, + { + "epoch": 0.26805670141753546, + "grad_norm": 0.6750131845474243, + "learning_rate": 1.8328156229616407e-05, + "loss": 0.9346, + "step": 21444 + }, + { + "epoch": 0.26808170204255105, + "grad_norm": 2.6702218055725098, + "learning_rate": 1.832767312055923e-05, + "loss": 0.5388, + "step": 21446 + }, + { + "epoch": 0.2681067026675667, + "grad_norm": 3.122213840484619, + "learning_rate": 1.8327189948079674e-05, + "loss": 1.6209, + "step": 21448 + }, + { + "epoch": 0.2681317032925823, + "grad_norm": 0.628381073474884, + "learning_rate": 1.8326706712181414e-05, + "loss": 1.0922, + "step": 21450 + }, + { + "epoch": 0.26815670391759794, + "grad_norm": 4.62929630279541, + "learning_rate": 1.8326223412868135e-05, + "loss": 1.8291, + "step": 21452 + }, + { + "epoch": 0.2681817045426136, + "grad_norm": 3.1049702167510986, + "learning_rate": 1.8325740050143516e-05, + "loss": 1.1491, + "step": 21454 + }, + { + "epoch": 0.2682067051676292, + "grad_norm": 4.44682502746582, + "learning_rate": 1.8325256624011237e-05, + "loss": 0.7264, + "step": 21456 + }, + { + "epoch": 0.2682317057926448, + "grad_norm": 4.443964958190918, + "learning_rate": 1.8324773134474985e-05, + "loss": 1.1878, + "step": 21458 + }, + { + "epoch": 0.2682567064176604, + "grad_norm": 4.604781150817871, + "learning_rate": 1.8324289581538435e-05, + "loss": 1.5302, + "step": 21460 + }, + { + "epoch": 0.26828170704267607, + "grad_norm": 2.1924619674682617, + "learning_rate": 1.832380596520527e-05, + "loss": 0.9727, + "step": 21462 + }, + { + "epoch": 0.2683067076676917, + "grad_norm": 5.930898189544678, + "learning_rate": 1.832332228547918e-05, + "loss": 0.1578, + "step": 21464 + }, + { + "epoch": 0.2683317082927073, + "grad_norm": 3.4424245357513428, + "learning_rate": 1.8322838542363843e-05, + "loss": 0.7714, + "step": 21466 + }, + { + "epoch": 0.26835670891772295, + "grad_norm": 6.025420665740967, + "learning_rate": 1.8322354735862946e-05, + "loss": 1.0715, + "step": 21468 + }, + { + "epoch": 0.26838170954273854, + "grad_norm": 0.003871573368087411, + "learning_rate": 1.8321870865980173e-05, + "loss": 0.6865, + "step": 21470 + }, + { + "epoch": 0.2684067101677542, + "grad_norm": 4.713444709777832, + "learning_rate": 1.8321386932719206e-05, + "loss": 1.6384, + "step": 21472 + }, + { + "epoch": 0.26843171079276984, + "grad_norm": 2.649028778076172, + "learning_rate": 1.832090293608373e-05, + "loss": 0.7738, + "step": 21474 + }, + { + "epoch": 0.26845671141778543, + "grad_norm": 0.07550941407680511, + "learning_rate": 1.8320418876077437e-05, + "loss": 0.7223, + "step": 21476 + }, + { + "epoch": 0.2684817120428011, + "grad_norm": 6.594125270843506, + "learning_rate": 1.831993475270401e-05, + "loss": 2.083, + "step": 21478 + }, + { + "epoch": 0.26850671266781667, + "grad_norm": 3.5144870281219482, + "learning_rate": 1.8319450565967137e-05, + "loss": 1.5716, + "step": 21480 + }, + { + "epoch": 0.2685317132928323, + "grad_norm": 2.422887086868286, + "learning_rate": 1.8318966315870508e-05, + "loss": 0.1146, + "step": 21482 + }, + { + "epoch": 0.26855671391784797, + "grad_norm": 0.004281402099877596, + "learning_rate": 1.8318482002417803e-05, + "loss": 0.0491, + "step": 21484 + }, + { + "epoch": 0.26858171454286356, + "grad_norm": 4.013017177581787, + "learning_rate": 1.831799762561272e-05, + "loss": 1.6001, + "step": 21486 + }, + { + "epoch": 0.2686067151678792, + "grad_norm": 2.2772655487060547, + "learning_rate": 1.831751318545894e-05, + "loss": 0.2982, + "step": 21488 + }, + { + "epoch": 0.2686317157928948, + "grad_norm": 4.7158284187316895, + "learning_rate": 1.8317028681960155e-05, + "loss": 1.0952, + "step": 21490 + }, + { + "epoch": 0.26865671641791045, + "grad_norm": 2.9854376316070557, + "learning_rate": 1.831654411512006e-05, + "loss": 0.9261, + "step": 21492 + }, + { + "epoch": 0.2686817170429261, + "grad_norm": 4.120916843414307, + "learning_rate": 1.8316059484942338e-05, + "loss": 0.3918, + "step": 21494 + }, + { + "epoch": 0.2687067176679417, + "grad_norm": 3.4464659690856934, + "learning_rate": 1.8315574791430683e-05, + "loss": 0.9996, + "step": 21496 + }, + { + "epoch": 0.26873171829295733, + "grad_norm": 1.1546045541763306, + "learning_rate": 1.8315090034588783e-05, + "loss": 0.6132, + "step": 21498 + }, + { + "epoch": 0.2687567189179729, + "grad_norm": 2.985617160797119, + "learning_rate": 1.831460521442034e-05, + "loss": 1.3811, + "step": 21500 + }, + { + "epoch": 0.2687817195429886, + "grad_norm": 2.3937830924987793, + "learning_rate": 1.8314120330929036e-05, + "loss": 0.6807, + "step": 21502 + }, + { + "epoch": 0.2688067201680042, + "grad_norm": 0.4284961521625519, + "learning_rate": 1.8313635384118565e-05, + "loss": 0.5953, + "step": 21504 + }, + { + "epoch": 0.2688317207930198, + "grad_norm": 0.006704732310026884, + "learning_rate": 1.8313150373992625e-05, + "loss": 0.0321, + "step": 21506 + }, + { + "epoch": 0.26885672141803546, + "grad_norm": 0.35903069376945496, + "learning_rate": 1.831266530055491e-05, + "loss": 0.9052, + "step": 21508 + }, + { + "epoch": 0.26888172204305105, + "grad_norm": 1.4582858085632324, + "learning_rate": 1.8312180163809107e-05, + "loss": 0.6595, + "step": 21510 + }, + { + "epoch": 0.2689067226680667, + "grad_norm": 12.997960090637207, + "learning_rate": 1.831169496375892e-05, + "loss": 2.2853, + "step": 21512 + }, + { + "epoch": 0.26893172329308235, + "grad_norm": 0.5843169093132019, + "learning_rate": 1.8311209700408034e-05, + "loss": 0.6882, + "step": 21514 + }, + { + "epoch": 0.26895672391809794, + "grad_norm": 1.938527226448059, + "learning_rate": 1.8310724373760152e-05, + "loss": 0.2029, + "step": 21516 + }, + { + "epoch": 0.2689817245431136, + "grad_norm": 4.306480407714844, + "learning_rate": 1.831023898381897e-05, + "loss": 2.4841, + "step": 21518 + }, + { + "epoch": 0.2690067251681292, + "grad_norm": 4.671163558959961, + "learning_rate": 1.8309753530588184e-05, + "loss": 0.94, + "step": 21520 + }, + { + "epoch": 0.2690317257931448, + "grad_norm": 2.5230553150177, + "learning_rate": 1.8309268014071487e-05, + "loss": 0.9027, + "step": 21522 + }, + { + "epoch": 0.2690567264181605, + "grad_norm": 3.6794540882110596, + "learning_rate": 1.8308782434272583e-05, + "loss": 0.7719, + "step": 21524 + }, + { + "epoch": 0.26908172704317607, + "grad_norm": 1.513599157333374, + "learning_rate": 1.8308296791195164e-05, + "loss": 0.3396, + "step": 21526 + }, + { + "epoch": 0.2691067276681917, + "grad_norm": 5.147559642791748, + "learning_rate": 1.830781108484293e-05, + "loss": 2.0611, + "step": 21528 + }, + { + "epoch": 0.2691317282932073, + "grad_norm": 5.143999099731445, + "learning_rate": 1.830732531521959e-05, + "loss": 0.168, + "step": 21530 + }, + { + "epoch": 0.26915672891822295, + "grad_norm": 3.5274147987365723, + "learning_rate": 1.830683948232883e-05, + "loss": 0.2617, + "step": 21532 + }, + { + "epoch": 0.2691817295432386, + "grad_norm": 2.8092074394226074, + "learning_rate": 1.8306353586174355e-05, + "loss": 0.845, + "step": 21534 + }, + { + "epoch": 0.2692067301682542, + "grad_norm": 3.6545097827911377, + "learning_rate": 1.8305867626759864e-05, + "loss": 0.9565, + "step": 21536 + }, + { + "epoch": 0.26923173079326984, + "grad_norm": 0.0053160483948886395, + "learning_rate": 1.8305381604089062e-05, + "loss": 0.7038, + "step": 21538 + }, + { + "epoch": 0.26925673141828543, + "grad_norm": 3.552482843399048, + "learning_rate": 1.8304895518165647e-05, + "loss": 0.9714, + "step": 21540 + }, + { + "epoch": 0.2692817320433011, + "grad_norm": 1.8294785022735596, + "learning_rate": 1.8304409368993324e-05, + "loss": 0.3647, + "step": 21542 + }, + { + "epoch": 0.26930673266831673, + "grad_norm": 0.0057667214423418045, + "learning_rate": 1.830392315657579e-05, + "loss": 0.2882, + "step": 21544 + }, + { + "epoch": 0.2693317332933323, + "grad_norm": 2.8615477085113525, + "learning_rate": 1.8303436880916756e-05, + "loss": 0.5067, + "step": 21546 + }, + { + "epoch": 0.26935673391834797, + "grad_norm": 0.0022327019833028316, + "learning_rate": 1.830295054201992e-05, + "loss": 0.9684, + "step": 21548 + }, + { + "epoch": 0.26938173454336356, + "grad_norm": 6.030083656311035, + "learning_rate": 1.8302464139888983e-05, + "loss": 2.226, + "step": 21550 + }, + { + "epoch": 0.2694067351683792, + "grad_norm": 1.5909267663955688, + "learning_rate": 1.8301977674527656e-05, + "loss": 0.9933, + "step": 21552 + }, + { + "epoch": 0.26943173579339486, + "grad_norm": 3.880159854888916, + "learning_rate": 1.8301491145939643e-05, + "loss": 1.3509, + "step": 21554 + }, + { + "epoch": 0.26945673641841045, + "grad_norm": 3.5072803497314453, + "learning_rate": 1.8301004554128643e-05, + "loss": 0.2431, + "step": 21556 + }, + { + "epoch": 0.2694817370434261, + "grad_norm": 3.100409746170044, + "learning_rate": 1.830051789909837e-05, + "loss": 1.0775, + "step": 21558 + }, + { + "epoch": 0.2695067376684417, + "grad_norm": 7.363621711730957, + "learning_rate": 1.8300031180852524e-05, + "loss": 1.0811, + "step": 21560 + }, + { + "epoch": 0.26953173829345733, + "grad_norm": 3.0139784812927246, + "learning_rate": 1.8299544399394815e-05, + "loss": 1.2102, + "step": 21562 + }, + { + "epoch": 0.269556738918473, + "grad_norm": 0.118843674659729, + "learning_rate": 1.8299057554728953e-05, + "loss": 0.4782, + "step": 21564 + }, + { + "epoch": 0.2695817395434886, + "grad_norm": 0.26969411969184875, + "learning_rate": 1.8298570646858634e-05, + "loss": 0.6269, + "step": 21566 + }, + { + "epoch": 0.2696067401685042, + "grad_norm": 4.988354206085205, + "learning_rate": 1.8298083675787582e-05, + "loss": 1.1852, + "step": 21568 + }, + { + "epoch": 0.2696317407935198, + "grad_norm": 3.4211857318878174, + "learning_rate": 1.8297596641519496e-05, + "loss": 1.3732, + "step": 21570 + }, + { + "epoch": 0.26965674141853546, + "grad_norm": 1.7010364532470703, + "learning_rate": 1.8297109544058087e-05, + "loss": 1.0185, + "step": 21572 + }, + { + "epoch": 0.2696817420435511, + "grad_norm": 3.3123278617858887, + "learning_rate": 1.8296622383407062e-05, + "loss": 0.573, + "step": 21574 + }, + { + "epoch": 0.2697067426685667, + "grad_norm": 2.0606725215911865, + "learning_rate": 1.829613515957014e-05, + "loss": 0.9666, + "step": 21576 + }, + { + "epoch": 0.26973174329358235, + "grad_norm": 3.0976269245147705, + "learning_rate": 1.829564787255102e-05, + "loss": 0.7328, + "step": 21578 + }, + { + "epoch": 0.26975674391859794, + "grad_norm": 2.7643418312072754, + "learning_rate": 1.8295160522353422e-05, + "loss": 0.1532, + "step": 21580 + }, + { + "epoch": 0.2697817445436136, + "grad_norm": 0.005264649633318186, + "learning_rate": 1.8294673108981057e-05, + "loss": 0.5961, + "step": 21582 + }, + { + "epoch": 0.26980674516862924, + "grad_norm": 9.222343444824219, + "learning_rate": 1.8294185632437632e-05, + "loss": 1.7974, + "step": 21584 + }, + { + "epoch": 0.26983174579364483, + "grad_norm": 0.2862796485424042, + "learning_rate": 1.829369809272686e-05, + "loss": 1.4617, + "step": 21586 + }, + { + "epoch": 0.2698567464186605, + "grad_norm": 3.0016098022460938, + "learning_rate": 1.829321048985246e-05, + "loss": 1.7528, + "step": 21588 + }, + { + "epoch": 0.26988174704367607, + "grad_norm": 0.009979424998164177, + "learning_rate": 1.8292722823818138e-05, + "loss": 0.6069, + "step": 21590 + }, + { + "epoch": 0.2699067476686917, + "grad_norm": 3.5371193885803223, + "learning_rate": 1.8292235094627614e-05, + "loss": 1.305, + "step": 21592 + }, + { + "epoch": 0.26993174829370736, + "grad_norm": 4.862240791320801, + "learning_rate": 1.82917473022846e-05, + "loss": 1.1997, + "step": 21594 + }, + { + "epoch": 0.26995674891872296, + "grad_norm": 2.4961163997650146, + "learning_rate": 1.829125944679281e-05, + "loss": 1.1484, + "step": 21596 + }, + { + "epoch": 0.2699817495437386, + "grad_norm": 3.2471718788146973, + "learning_rate": 1.829077152815596e-05, + "loss": 0.76, + "step": 21598 + }, + { + "epoch": 0.2700067501687542, + "grad_norm": 0.046372756361961365, + "learning_rate": 1.8290283546377773e-05, + "loss": 0.6482, + "step": 21600 + }, + { + "epoch": 0.27003175079376984, + "grad_norm": 5.8084492683410645, + "learning_rate": 1.8289795501461952e-05, + "loss": 0.9718, + "step": 21602 + }, + { + "epoch": 0.2700567514187855, + "grad_norm": 3.0334107875823975, + "learning_rate": 1.8289307393412222e-05, + "loss": 0.6909, + "step": 21604 + }, + { + "epoch": 0.2700817520438011, + "grad_norm": 9.774495124816895, + "learning_rate": 1.82888192222323e-05, + "loss": 1.0027, + "step": 21606 + }, + { + "epoch": 0.27010675266881673, + "grad_norm": 2.8066484928131104, + "learning_rate": 1.82883309879259e-05, + "loss": 1.2266, + "step": 21608 + }, + { + "epoch": 0.2701317532938323, + "grad_norm": 3.8001697063446045, + "learning_rate": 1.828784269049675e-05, + "loss": 0.3513, + "step": 21610 + }, + { + "epoch": 0.27015675391884797, + "grad_norm": 0.007617214694619179, + "learning_rate": 1.8287354329948557e-05, + "loss": 0.837, + "step": 21612 + }, + { + "epoch": 0.2701817545438636, + "grad_norm": 0.006545962765812874, + "learning_rate": 1.8286865906285044e-05, + "loss": 0.6918, + "step": 21614 + }, + { + "epoch": 0.2702067551688792, + "grad_norm": 3.4115982055664062, + "learning_rate": 1.8286377419509935e-05, + "loss": 0.7151, + "step": 21616 + }, + { + "epoch": 0.27023175579389486, + "grad_norm": 2.920926094055176, + "learning_rate": 1.828588886962695e-05, + "loss": 1.2153, + "step": 21618 + }, + { + "epoch": 0.27025675641891045, + "grad_norm": 0.8757997751235962, + "learning_rate": 1.82854002566398e-05, + "loss": 0.9567, + "step": 21620 + }, + { + "epoch": 0.2702817570439261, + "grad_norm": 4.5250935554504395, + "learning_rate": 1.8284911580552216e-05, + "loss": 2.2377, + "step": 21622 + }, + { + "epoch": 0.27030675766894174, + "grad_norm": 3.801889181137085, + "learning_rate": 1.828442284136792e-05, + "loss": 1.5598, + "step": 21624 + }, + { + "epoch": 0.27033175829395734, + "grad_norm": 2.9384098052978516, + "learning_rate": 1.828393403909063e-05, + "loss": 0.4537, + "step": 21626 + }, + { + "epoch": 0.270356758918973, + "grad_norm": 2.9442138671875, + "learning_rate": 1.8283445173724067e-05, + "loss": 0.3148, + "step": 21628 + }, + { + "epoch": 0.2703817595439886, + "grad_norm": 3.6836352348327637, + "learning_rate": 1.828295624527196e-05, + "loss": 1.0977, + "step": 21630 + }, + { + "epoch": 0.2704067601690042, + "grad_norm": 0.006417368073016405, + "learning_rate": 1.828246725373803e-05, + "loss": 0.2186, + "step": 21632 + }, + { + "epoch": 0.27043176079401987, + "grad_norm": 5.236886501312256, + "learning_rate": 1.8281978199126e-05, + "loss": 1.2278, + "step": 21634 + }, + { + "epoch": 0.27045676141903546, + "grad_norm": 1.7105286121368408, + "learning_rate": 1.8281489081439593e-05, + "loss": 0.0948, + "step": 21636 + }, + { + "epoch": 0.2704817620440511, + "grad_norm": 5.623603820800781, + "learning_rate": 1.8280999900682537e-05, + "loss": 1.5529, + "step": 21638 + }, + { + "epoch": 0.2705067626690667, + "grad_norm": 6.681788921356201, + "learning_rate": 1.828051065685856e-05, + "loss": 0.8841, + "step": 21640 + }, + { + "epoch": 0.27053176329408235, + "grad_norm": 0.028530577197670937, + "learning_rate": 1.828002134997138e-05, + "loss": 0.7596, + "step": 21642 + }, + { + "epoch": 0.270556763919098, + "grad_norm": 5.304638385772705, + "learning_rate": 1.827953198002473e-05, + "loss": 0.8995, + "step": 21644 + }, + { + "epoch": 0.2705817645441136, + "grad_norm": 0.5426416993141174, + "learning_rate": 1.8279042547022338e-05, + "loss": 0.2416, + "step": 21646 + }, + { + "epoch": 0.27060676516912924, + "grad_norm": 4.512139797210693, + "learning_rate": 1.8278553050967924e-05, + "loss": 0.8099, + "step": 21648 + }, + { + "epoch": 0.27063176579414483, + "grad_norm": 3.9092416763305664, + "learning_rate": 1.8278063491865225e-05, + "loss": 0.9578, + "step": 21650 + }, + { + "epoch": 0.2706567664191605, + "grad_norm": 1.8664904832839966, + "learning_rate": 1.8277573869717962e-05, + "loss": 1.4799, + "step": 21652 + }, + { + "epoch": 0.2706817670441761, + "grad_norm": 2.0727620124816895, + "learning_rate": 1.827708418452987e-05, + "loss": 0.7999, + "step": 21654 + }, + { + "epoch": 0.2707067676691917, + "grad_norm": 4.654934883117676, + "learning_rate": 1.8276594436304673e-05, + "loss": 0.8188, + "step": 21656 + }, + { + "epoch": 0.27073176829420736, + "grad_norm": 4.349542617797852, + "learning_rate": 1.8276104625046104e-05, + "loss": 1.0499, + "step": 21658 + }, + { + "epoch": 0.27075676891922296, + "grad_norm": 3.07997465133667, + "learning_rate": 1.827561475075789e-05, + "loss": 1.0129, + "step": 21660 + }, + { + "epoch": 0.2707817695442386, + "grad_norm": 5.785107135772705, + "learning_rate": 1.827512481344377e-05, + "loss": 0.5468, + "step": 21662 + }, + { + "epoch": 0.27080677016925425, + "grad_norm": 1.6219855546951294, + "learning_rate": 1.8274634813107464e-05, + "loss": 0.7463, + "step": 21664 + }, + { + "epoch": 0.27083177079426984, + "grad_norm": 4.818944454193115, + "learning_rate": 1.8274144749752713e-05, + "loss": 1.8593, + "step": 21666 + }, + { + "epoch": 0.2708567714192855, + "grad_norm": 3.4156129360198975, + "learning_rate": 1.8273654623383247e-05, + "loss": 2.2547, + "step": 21668 + }, + { + "epoch": 0.2708817720443011, + "grad_norm": 1.8383512496948242, + "learning_rate": 1.8273164434002794e-05, + "loss": 1.3225, + "step": 21670 + }, + { + "epoch": 0.27090677266931673, + "grad_norm": 2.7639646530151367, + "learning_rate": 1.827267418161509e-05, + "loss": 0.5402, + "step": 21672 + }, + { + "epoch": 0.2709317732943324, + "grad_norm": 14.661949157714844, + "learning_rate": 1.827218386622387e-05, + "loss": 2.2159, + "step": 21674 + }, + { + "epoch": 0.27095677391934797, + "grad_norm": 8.297962188720703, + "learning_rate": 1.827169348783287e-05, + "loss": 1.9693, + "step": 21676 + }, + { + "epoch": 0.2709817745443636, + "grad_norm": 2.561140537261963, + "learning_rate": 1.827120304644582e-05, + "loss": 1.3087, + "step": 21678 + }, + { + "epoch": 0.2710067751693792, + "grad_norm": 5.631614685058594, + "learning_rate": 1.8270712542066457e-05, + "loss": 2.4684, + "step": 21680 + }, + { + "epoch": 0.27103177579439486, + "grad_norm": 3.4807090759277344, + "learning_rate": 1.8270221974698516e-05, + "loss": 1.0506, + "step": 21682 + }, + { + "epoch": 0.2710567764194105, + "grad_norm": 0.1034388467669487, + "learning_rate": 1.8269731344345734e-05, + "loss": 0.0333, + "step": 21684 + }, + { + "epoch": 0.2710817770444261, + "grad_norm": 0.8650673627853394, + "learning_rate": 1.826924065101185e-05, + "loss": 0.0778, + "step": 21686 + }, + { + "epoch": 0.27110677766944175, + "grad_norm": 3.7550604343414307, + "learning_rate": 1.8268749894700598e-05, + "loss": 0.897, + "step": 21688 + }, + { + "epoch": 0.27113177829445734, + "grad_norm": 8.875335693359375, + "learning_rate": 1.8268259075415714e-05, + "loss": 0.9509, + "step": 21690 + }, + { + "epoch": 0.271156778919473, + "grad_norm": 2.5745882987976074, + "learning_rate": 1.826776819316094e-05, + "loss": 1.1551, + "step": 21692 + }, + { + "epoch": 0.27118177954448863, + "grad_norm": 6.095357894897461, + "learning_rate": 1.8267277247940012e-05, + "loss": 0.6867, + "step": 21694 + }, + { + "epoch": 0.2712067801695042, + "grad_norm": 0.007291901390999556, + "learning_rate": 1.826678623975667e-05, + "loss": 0.0892, + "step": 21696 + }, + { + "epoch": 0.27123178079451987, + "grad_norm": 3.99416446685791, + "learning_rate": 1.8266295168614656e-05, + "loss": 1.7159, + "step": 21698 + }, + { + "epoch": 0.27125678141953546, + "grad_norm": 3.469360828399658, + "learning_rate": 1.8265804034517702e-05, + "loss": 1.0204, + "step": 21700 + }, + { + "epoch": 0.2712817820445511, + "grad_norm": 4.376137733459473, + "learning_rate": 1.8265312837469555e-05, + "loss": 0.7416, + "step": 21702 + }, + { + "epoch": 0.27130678266956676, + "grad_norm": 3.4344332218170166, + "learning_rate": 1.8264821577473954e-05, + "loss": 1.4516, + "step": 21704 + }, + { + "epoch": 0.27133178329458235, + "grad_norm": 1.9271246194839478, + "learning_rate": 1.826433025453464e-05, + "loss": 0.1167, + "step": 21706 + }, + { + "epoch": 0.271356783919598, + "grad_norm": 4.435909271240234, + "learning_rate": 1.8263838868655357e-05, + "loss": 1.2832, + "step": 21708 + }, + { + "epoch": 0.2713817845446136, + "grad_norm": 5.031375885009766, + "learning_rate": 1.8263347419839846e-05, + "loss": 1.7964, + "step": 21710 + }, + { + "epoch": 0.27140678516962924, + "grad_norm": 6.290681838989258, + "learning_rate": 1.8262855908091847e-05, + "loss": 1.6253, + "step": 21712 + }, + { + "epoch": 0.2714317857946449, + "grad_norm": 2.703540563583374, + "learning_rate": 1.826236433341511e-05, + "loss": 1.019, + "step": 21714 + }, + { + "epoch": 0.2714567864196605, + "grad_norm": 3.087902307510376, + "learning_rate": 1.826187269581337e-05, + "loss": 0.4376, + "step": 21716 + }, + { + "epoch": 0.2714817870446761, + "grad_norm": 2.8497416973114014, + "learning_rate": 1.826138099529038e-05, + "loss": 0.6338, + "step": 21718 + }, + { + "epoch": 0.2715067876696917, + "grad_norm": 1.2073540687561035, + "learning_rate": 1.8260889231849882e-05, + "loss": 0.4604, + "step": 21720 + }, + { + "epoch": 0.27153178829470737, + "grad_norm": 1.3434804677963257, + "learning_rate": 1.8260397405495615e-05, + "loss": 0.0864, + "step": 21722 + }, + { + "epoch": 0.271556788919723, + "grad_norm": 3.121495485305786, + "learning_rate": 1.825990551623133e-05, + "loss": 0.5838, + "step": 21724 + }, + { + "epoch": 0.2715817895447386, + "grad_norm": 2.376833915710449, + "learning_rate": 1.8259413564060773e-05, + "loss": 0.0796, + "step": 21726 + }, + { + "epoch": 0.27160679016975425, + "grad_norm": 3.880938768386841, + "learning_rate": 1.825892154898769e-05, + "loss": 0.3991, + "step": 21728 + }, + { + "epoch": 0.27163179079476985, + "grad_norm": 1.7079484462738037, + "learning_rate": 1.8258429471015832e-05, + "loss": 0.8696, + "step": 21730 + }, + { + "epoch": 0.2716567914197855, + "grad_norm": 1.6136177778244019, + "learning_rate": 1.825793733014894e-05, + "loss": 1.6677, + "step": 21732 + }, + { + "epoch": 0.27168179204480114, + "grad_norm": 1.6235356330871582, + "learning_rate": 1.8257445126390765e-05, + "loss": 0.7161, + "step": 21734 + }, + { + "epoch": 0.27170679266981673, + "grad_norm": 2.7403016090393066, + "learning_rate": 1.8256952859745055e-05, + "loss": 0.9318, + "step": 21736 + }, + { + "epoch": 0.2717317932948324, + "grad_norm": 6.2524237632751465, + "learning_rate": 1.825646053021556e-05, + "loss": 1.7624, + "step": 21738 + }, + { + "epoch": 0.27175679391984797, + "grad_norm": 2.2559361457824707, + "learning_rate": 1.825596813780603e-05, + "loss": 0.2228, + "step": 21740 + }, + { + "epoch": 0.2717817945448636, + "grad_norm": 1.9483249187469482, + "learning_rate": 1.825547568252022e-05, + "loss": 0.6467, + "step": 21742 + }, + { + "epoch": 0.27180679516987927, + "grad_norm": 1.3011645078659058, + "learning_rate": 1.825498316436187e-05, + "loss": 0.5774, + "step": 21744 + }, + { + "epoch": 0.27183179579489486, + "grad_norm": 4.461515426635742, + "learning_rate": 1.8254490583334734e-05, + "loss": 1.0551, + "step": 21746 + }, + { + "epoch": 0.2718567964199105, + "grad_norm": 2.0309019088745117, + "learning_rate": 1.825399793944256e-05, + "loss": 0.9522, + "step": 21748 + }, + { + "epoch": 0.2718817970449261, + "grad_norm": 7.816713333129883, + "learning_rate": 1.8253505232689113e-05, + "loss": 1.2572, + "step": 21750 + }, + { + "epoch": 0.27190679766994175, + "grad_norm": 2.346029281616211, + "learning_rate": 1.8253012463078136e-05, + "loss": 0.5286, + "step": 21752 + }, + { + "epoch": 0.2719317982949574, + "grad_norm": 2.9265761375427246, + "learning_rate": 1.825251963061338e-05, + "loss": 1.4268, + "step": 21754 + }, + { + "epoch": 0.271956798919973, + "grad_norm": 2.835857391357422, + "learning_rate": 1.8252026735298604e-05, + "loss": 1.2942, + "step": 21756 + }, + { + "epoch": 0.27198179954498863, + "grad_norm": 4.395848751068115, + "learning_rate": 1.8251533777137557e-05, + "loss": 1.1444, + "step": 21758 + }, + { + "epoch": 0.2720068001700042, + "grad_norm": 3.824941635131836, + "learning_rate": 1.8251040756133998e-05, + "loss": 0.6664, + "step": 21760 + }, + { + "epoch": 0.2720318007950199, + "grad_norm": 0.7501939535140991, + "learning_rate": 1.825054767229168e-05, + "loss": 0.674, + "step": 21762 + }, + { + "epoch": 0.2720568014200355, + "grad_norm": 3.0317749977111816, + "learning_rate": 1.8250054525614356e-05, + "loss": 1.0947, + "step": 21764 + }, + { + "epoch": 0.2720818020450511, + "grad_norm": 2.578394651412964, + "learning_rate": 1.8249561316105783e-05, + "loss": 1.6001, + "step": 21766 + }, + { + "epoch": 0.27210680267006676, + "grad_norm": 2.9033327102661133, + "learning_rate": 1.8249068043769716e-05, + "loss": 0.9261, + "step": 21768 + }, + { + "epoch": 0.27213180329508235, + "grad_norm": 2.2498209476470947, + "learning_rate": 1.8248574708609917e-05, + "loss": 0.8773, + "step": 21770 + }, + { + "epoch": 0.272156803920098, + "grad_norm": 4.273099422454834, + "learning_rate": 1.824808131063014e-05, + "loss": 1.3922, + "step": 21772 + }, + { + "epoch": 0.27218180454511365, + "grad_norm": 2.9700767993927, + "learning_rate": 1.8247587849834142e-05, + "loss": 1.5675, + "step": 21774 + }, + { + "epoch": 0.27220680517012924, + "grad_norm": 3.754911184310913, + "learning_rate": 1.8247094326225677e-05, + "loss": 1.5722, + "step": 21776 + }, + { + "epoch": 0.2722318057951449, + "grad_norm": 1.8143800497055054, + "learning_rate": 1.8246600739808513e-05, + "loss": 0.3022, + "step": 21778 + }, + { + "epoch": 0.2722568064201605, + "grad_norm": 5.86183500289917, + "learning_rate": 1.8246107090586396e-05, + "loss": 1.5547, + "step": 21780 + }, + { + "epoch": 0.2722818070451761, + "grad_norm": 3.9093241691589355, + "learning_rate": 1.82456133785631e-05, + "loss": 0.5072, + "step": 21782 + }, + { + "epoch": 0.2723068076701918, + "grad_norm": 4.062649726867676, + "learning_rate": 1.824511960374238e-05, + "loss": 0.6751, + "step": 21784 + }, + { + "epoch": 0.27233180829520737, + "grad_norm": 3.825806140899658, + "learning_rate": 1.824462576612799e-05, + "loss": 0.4862, + "step": 21786 + }, + { + "epoch": 0.272356808920223, + "grad_norm": 3.974959373474121, + "learning_rate": 1.82441318657237e-05, + "loss": 1.4633, + "step": 21788 + }, + { + "epoch": 0.2723818095452386, + "grad_norm": 3.8275997638702393, + "learning_rate": 1.8243637902533264e-05, + "loss": 1.5272, + "step": 21790 + }, + { + "epoch": 0.27240681017025425, + "grad_norm": 1.831208348274231, + "learning_rate": 1.824314387656045e-05, + "loss": 1.163, + "step": 21792 + }, + { + "epoch": 0.2724318107952699, + "grad_norm": 0.011700164526700974, + "learning_rate": 1.8242649787809018e-05, + "loss": 0.8626, + "step": 21794 + }, + { + "epoch": 0.2724568114202855, + "grad_norm": 0.08674914389848709, + "learning_rate": 1.8242155636282734e-05, + "loss": 0.6998, + "step": 21796 + }, + { + "epoch": 0.27248181204530114, + "grad_norm": 2.521113872528076, + "learning_rate": 1.824166142198535e-05, + "loss": 1.4686, + "step": 21798 + }, + { + "epoch": 0.27250681267031673, + "grad_norm": 6.078054904937744, + "learning_rate": 1.8241167144920643e-05, + "loss": 1.3044, + "step": 21800 + }, + { + "epoch": 0.2725318132953324, + "grad_norm": 4.496287822723389, + "learning_rate": 1.824067280509237e-05, + "loss": 1.1665, + "step": 21802 + }, + { + "epoch": 0.27255681392034803, + "grad_norm": 0.00687398063018918, + "learning_rate": 1.8240178402504302e-05, + "loss": 0.2901, + "step": 21804 + }, + { + "epoch": 0.2725818145453636, + "grad_norm": 4.57294225692749, + "learning_rate": 1.8239683937160198e-05, + "loss": 1.2645, + "step": 21806 + }, + { + "epoch": 0.27260681517037927, + "grad_norm": 3.217867612838745, + "learning_rate": 1.8239189409063828e-05, + "loss": 0.2784, + "step": 21808 + }, + { + "epoch": 0.27263181579539486, + "grad_norm": 2.3523013591766357, + "learning_rate": 1.8238694818218955e-05, + "loss": 0.9445, + "step": 21810 + }, + { + "epoch": 0.2726568164204105, + "grad_norm": 4.733066558837891, + "learning_rate": 1.823820016462935e-05, + "loss": 1.809, + "step": 21812 + }, + { + "epoch": 0.27268181704542616, + "grad_norm": 1.973552942276001, + "learning_rate": 1.8237705448298774e-05, + "loss": 1.0297, + "step": 21814 + }, + { + "epoch": 0.27270681767044175, + "grad_norm": 3.234501838684082, + "learning_rate": 1.8237210669231e-05, + "loss": 0.5181, + "step": 21816 + }, + { + "epoch": 0.2727318182954574, + "grad_norm": 2.8291494846343994, + "learning_rate": 1.8236715827429795e-05, + "loss": 0.733, + "step": 21818 + }, + { + "epoch": 0.272756818920473, + "grad_norm": 2.3268320560455322, + "learning_rate": 1.8236220922898925e-05, + "loss": 0.3222, + "step": 21820 + }, + { + "epoch": 0.27278181954548864, + "grad_norm": 6.445098400115967, + "learning_rate": 1.8235725955642162e-05, + "loss": 1.3327, + "step": 21822 + }, + { + "epoch": 0.2728068201705043, + "grad_norm": 7.0146484375, + "learning_rate": 1.8235230925663273e-05, + "loss": 1.6571, + "step": 21824 + }, + { + "epoch": 0.2728318207955199, + "grad_norm": 0.007573212031275034, + "learning_rate": 1.8234735832966036e-05, + "loss": 0.2317, + "step": 21826 + }, + { + "epoch": 0.2728568214205355, + "grad_norm": 2.1255831718444824, + "learning_rate": 1.8234240677554213e-05, + "loss": 0.7494, + "step": 21828 + }, + { + "epoch": 0.2728818220455511, + "grad_norm": 2.7301418781280518, + "learning_rate": 1.8233745459431573e-05, + "loss": 2.2013, + "step": 21830 + }, + { + "epoch": 0.27290682267056676, + "grad_norm": 2.751605749130249, + "learning_rate": 1.8233250178601895e-05, + "loss": 1.1565, + "step": 21832 + }, + { + "epoch": 0.2729318232955824, + "grad_norm": 5.088311672210693, + "learning_rate": 1.823275483506895e-05, + "loss": 1.6325, + "step": 21834 + }, + { + "epoch": 0.272956823920598, + "grad_norm": 4.317236423492432, + "learning_rate": 1.8232259428836504e-05, + "loss": 1.6897, + "step": 21836 + }, + { + "epoch": 0.27298182454561365, + "grad_norm": 1.191833257675171, + "learning_rate": 1.8231763959908333e-05, + "loss": 0.113, + "step": 21838 + }, + { + "epoch": 0.27300682517062924, + "grad_norm": 6.822560787200928, + "learning_rate": 1.823126842828822e-05, + "loss": 1.8023, + "step": 21840 + }, + { + "epoch": 0.2730318257956449, + "grad_norm": 3.1913487911224365, + "learning_rate": 1.8230772833979924e-05, + "loss": 0.7025, + "step": 21842 + }, + { + "epoch": 0.27305682642066054, + "grad_norm": 3.591891050338745, + "learning_rate": 1.8230277176987227e-05, + "loss": 2.1571, + "step": 21844 + }, + { + "epoch": 0.27308182704567613, + "grad_norm": 5.801956653594971, + "learning_rate": 1.8229781457313906e-05, + "loss": 1.4149, + "step": 21846 + }, + { + "epoch": 0.2731068276706918, + "grad_norm": 5.80841064453125, + "learning_rate": 1.8229285674963727e-05, + "loss": 1.2916, + "step": 21848 + }, + { + "epoch": 0.27313182829570737, + "grad_norm": 0.28682368993759155, + "learning_rate": 1.822878982994048e-05, + "loss": 0.9354, + "step": 21850 + }, + { + "epoch": 0.273156828920723, + "grad_norm": 4.420774936676025, + "learning_rate": 1.822829392224793e-05, + "loss": 1.127, + "step": 21852 + }, + { + "epoch": 0.27318182954573866, + "grad_norm": 0.8107182383537292, + "learning_rate": 1.8227797951889853e-05, + "loss": 0.0846, + "step": 21854 + }, + { + "epoch": 0.27320683017075426, + "grad_norm": 3.486632823944092, + "learning_rate": 1.8227301918870032e-05, + "loss": 1.1637, + "step": 21856 + }, + { + "epoch": 0.2732318307957699, + "grad_norm": 3.902456283569336, + "learning_rate": 1.8226805823192246e-05, + "loss": 0.7229, + "step": 21858 + }, + { + "epoch": 0.2732568314207855, + "grad_norm": 2.389718532562256, + "learning_rate": 1.822630966486027e-05, + "loss": 1.1236, + "step": 21860 + }, + { + "epoch": 0.27328183204580114, + "grad_norm": 4.171392440795898, + "learning_rate": 1.822581344387788e-05, + "loss": 1.7981, + "step": 21862 + }, + { + "epoch": 0.2733068326708168, + "grad_norm": 3.0248312950134277, + "learning_rate": 1.8225317160248862e-05, + "loss": 1.247, + "step": 21864 + }, + { + "epoch": 0.2733318332958324, + "grad_norm": 2.7599282264709473, + "learning_rate": 1.822482081397699e-05, + "loss": 0.393, + "step": 21866 + }, + { + "epoch": 0.27335683392084803, + "grad_norm": 0.011312829330563545, + "learning_rate": 1.8224324405066043e-05, + "loss": 0.0652, + "step": 21868 + }, + { + "epoch": 0.2733818345458636, + "grad_norm": 3.264720916748047, + "learning_rate": 1.822382793351981e-05, + "loss": 1.609, + "step": 21870 + }, + { + "epoch": 0.27340683517087927, + "grad_norm": 2.859617233276367, + "learning_rate": 1.8223331399342062e-05, + "loss": 0.7716, + "step": 21872 + }, + { + "epoch": 0.2734318357958949, + "grad_norm": 0.009930586442351341, + "learning_rate": 1.8222834802536583e-05, + "loss": 0.5447, + "step": 21874 + }, + { + "epoch": 0.2734568364209105, + "grad_norm": 1.3609142303466797, + "learning_rate": 1.8222338143107162e-05, + "loss": 0.7339, + "step": 21876 + }, + { + "epoch": 0.27348183704592616, + "grad_norm": 2.8190276622772217, + "learning_rate": 1.8221841421057575e-05, + "loss": 0.8024, + "step": 21878 + }, + { + "epoch": 0.27350683767094175, + "grad_norm": 0.973734438419342, + "learning_rate": 1.8221344636391606e-05, + "loss": 0.0981, + "step": 21880 + }, + { + "epoch": 0.2735318382959574, + "grad_norm": 3.6298511028289795, + "learning_rate": 1.822084778911304e-05, + "loss": 1.418, + "step": 21882 + }, + { + "epoch": 0.27355683892097304, + "grad_norm": 0.30725741386413574, + "learning_rate": 1.8220350879225657e-05, + "loss": 0.0861, + "step": 21884 + }, + { + "epoch": 0.27358183954598864, + "grad_norm": 6.921279430389404, + "learning_rate": 1.8219853906733245e-05, + "loss": 2.5186, + "step": 21886 + }, + { + "epoch": 0.2736068401710043, + "grad_norm": 5.522582054138184, + "learning_rate": 1.821935687163959e-05, + "loss": 2.0337, + "step": 21888 + }, + { + "epoch": 0.2736318407960199, + "grad_norm": 1.9721742868423462, + "learning_rate": 1.8218859773948475e-05, + "loss": 1.0858, + "step": 21890 + }, + { + "epoch": 0.2736568414210355, + "grad_norm": 6.977244853973389, + "learning_rate": 1.8218362613663682e-05, + "loss": 1.5298, + "step": 21892 + }, + { + "epoch": 0.27368184204605117, + "grad_norm": 3.3271427154541016, + "learning_rate": 1.8217865390789006e-05, + "loss": 1.1873, + "step": 21894 + }, + { + "epoch": 0.27370684267106676, + "grad_norm": 2.619683265686035, + "learning_rate": 1.8217368105328227e-05, + "loss": 0.7905, + "step": 21896 + }, + { + "epoch": 0.2737318432960824, + "grad_norm": 1.533614993095398, + "learning_rate": 1.8216870757285137e-05, + "loss": 0.7283, + "step": 21898 + }, + { + "epoch": 0.273756843921098, + "grad_norm": 2.0208239555358887, + "learning_rate": 1.8216373346663518e-05, + "loss": 0.205, + "step": 21900 + }, + { + "epoch": 0.27378184454611365, + "grad_norm": 0.019828174263238907, + "learning_rate": 1.8215875873467163e-05, + "loss": 0.388, + "step": 21902 + }, + { + "epoch": 0.2738068451711293, + "grad_norm": 4.867998123168945, + "learning_rate": 1.8215378337699864e-05, + "loss": 0.7888, + "step": 21904 + }, + { + "epoch": 0.2738318457961449, + "grad_norm": 1.7892690896987915, + "learning_rate": 1.8214880739365396e-05, + "loss": 0.4049, + "step": 21906 + }, + { + "epoch": 0.27385684642116054, + "grad_norm": 4.239115238189697, + "learning_rate": 1.8214383078467563e-05, + "loss": 1.2713, + "step": 21908 + }, + { + "epoch": 0.27388184704617613, + "grad_norm": 2.793253183364868, + "learning_rate": 1.8213885355010154e-05, + "loss": 0.6249, + "step": 21910 + }, + { + "epoch": 0.2739068476711918, + "grad_norm": 2.269867181777954, + "learning_rate": 1.821338756899695e-05, + "loss": 0.2133, + "step": 21912 + }, + { + "epoch": 0.2739318482962074, + "grad_norm": 36.84553909301758, + "learning_rate": 1.821288972043175e-05, + "loss": 1.9421, + "step": 21914 + }, + { + "epoch": 0.273956848921223, + "grad_norm": 0.5109667778015137, + "learning_rate": 1.8212391809318342e-05, + "loss": 0.5836, + "step": 21916 + }, + { + "epoch": 0.27398184954623866, + "grad_norm": 4.136247158050537, + "learning_rate": 1.821189383566052e-05, + "loss": 0.9691, + "step": 21918 + }, + { + "epoch": 0.27400685017125426, + "grad_norm": 4.5433349609375, + "learning_rate": 1.8211395799462075e-05, + "loss": 1.0354, + "step": 21920 + }, + { + "epoch": 0.2740318507962699, + "grad_norm": 0.0073334393091499805, + "learning_rate": 1.82108977007268e-05, + "loss": 0.019, + "step": 21922 + }, + { + "epoch": 0.27405685142128555, + "grad_norm": 3.5372824668884277, + "learning_rate": 1.8210399539458494e-05, + "loss": 0.9669, + "step": 21924 + }, + { + "epoch": 0.27408185204630114, + "grad_norm": 3.1427485942840576, + "learning_rate": 1.8209901315660942e-05, + "loss": 0.3603, + "step": 21926 + }, + { + "epoch": 0.2741068526713168, + "grad_norm": 4.884337425231934, + "learning_rate": 1.8209403029337945e-05, + "loss": 1.9174, + "step": 21928 + }, + { + "epoch": 0.2741318532963324, + "grad_norm": 5.067074298858643, + "learning_rate": 1.8208904680493292e-05, + "loss": 1.7122, + "step": 21930 + }, + { + "epoch": 0.27415685392134803, + "grad_norm": 1.9082099199295044, + "learning_rate": 1.8208406269130783e-05, + "loss": 0.668, + "step": 21932 + }, + { + "epoch": 0.2741818545463637, + "grad_norm": 0.036491647362709045, + "learning_rate": 1.8207907795254214e-05, + "loss": 0.0014, + "step": 21934 + }, + { + "epoch": 0.27420685517137927, + "grad_norm": 4.585294723510742, + "learning_rate": 1.820740925886738e-05, + "loss": 1.9152, + "step": 21936 + }, + { + "epoch": 0.2742318557963949, + "grad_norm": 2.476949691772461, + "learning_rate": 1.8206910659974077e-05, + "loss": 0.9996, + "step": 21938 + }, + { + "epoch": 0.2742568564214105, + "grad_norm": 2.1930534839630127, + "learning_rate": 1.8206411998578105e-05, + "loss": 0.5738, + "step": 21940 + }, + { + "epoch": 0.27428185704642616, + "grad_norm": 0.8507153987884521, + "learning_rate": 1.8205913274683258e-05, + "loss": 0.3941, + "step": 21942 + }, + { + "epoch": 0.2743068576714418, + "grad_norm": 3.287116527557373, + "learning_rate": 1.8205414488293334e-05, + "loss": 2.2869, + "step": 21944 + }, + { + "epoch": 0.2743318582964574, + "grad_norm": 2.6022613048553467, + "learning_rate": 1.8204915639412135e-05, + "loss": 2.7639, + "step": 21946 + }, + { + "epoch": 0.27435685892147305, + "grad_norm": 2.319546699523926, + "learning_rate": 1.8204416728043458e-05, + "loss": 0.107, + "step": 21948 + }, + { + "epoch": 0.27438185954648864, + "grad_norm": 4.481103420257568, + "learning_rate": 1.8203917754191106e-05, + "loss": 1.9248, + "step": 21950 + }, + { + "epoch": 0.2744068601715043, + "grad_norm": 6.247864723205566, + "learning_rate": 1.8203418717858876e-05, + "loss": 1.3393, + "step": 21952 + }, + { + "epoch": 0.27443186079651993, + "grad_norm": 1.9091154336929321, + "learning_rate": 1.8202919619050567e-05, + "loss": 0.5602, + "step": 21954 + }, + { + "epoch": 0.2744568614215355, + "grad_norm": 1.6182810068130493, + "learning_rate": 1.8202420457769983e-05, + "loss": 0.7822, + "step": 21956 + }, + { + "epoch": 0.2744818620465512, + "grad_norm": 2.9816818237304688, + "learning_rate": 1.8201921234020926e-05, + "loss": 1.2968, + "step": 21958 + }, + { + "epoch": 0.27450686267156676, + "grad_norm": 1.7881325483322144, + "learning_rate": 1.8201421947807194e-05, + "loss": 0.2863, + "step": 21960 + }, + { + "epoch": 0.2745318632965824, + "grad_norm": 0.007157785817980766, + "learning_rate": 1.8200922599132595e-05, + "loss": 0.8076, + "step": 21962 + }, + { + "epoch": 0.27455686392159806, + "grad_norm": 4.497842788696289, + "learning_rate": 1.820042318800093e-05, + "loss": 1.3444, + "step": 21964 + }, + { + "epoch": 0.27458186454661365, + "grad_norm": 1.6452564001083374, + "learning_rate": 1.8199923714416e-05, + "loss": 0.652, + "step": 21966 + }, + { + "epoch": 0.2746068651716293, + "grad_norm": 0.043329011648893356, + "learning_rate": 1.8199424178381608e-05, + "loss": 0.4517, + "step": 21968 + }, + { + "epoch": 0.2746318657966449, + "grad_norm": 1.8199831247329712, + "learning_rate": 1.8198924579901563e-05, + "loss": 0.591, + "step": 21970 + }, + { + "epoch": 0.27465686642166054, + "grad_norm": 3.1379599571228027, + "learning_rate": 1.819842491897967e-05, + "loss": 0.4216, + "step": 21972 + }, + { + "epoch": 0.2746818670466762, + "grad_norm": 6.038350582122803, + "learning_rate": 1.819792519561973e-05, + "loss": 1.3609, + "step": 21974 + }, + { + "epoch": 0.2747068676716918, + "grad_norm": 4.227221965789795, + "learning_rate": 1.819742540982555e-05, + "loss": 1.1814, + "step": 21976 + }, + { + "epoch": 0.2747318682967074, + "grad_norm": 0.0062087164260447025, + "learning_rate": 1.819692556160094e-05, + "loss": 0.1977, + "step": 21978 + }, + { + "epoch": 0.274756868921723, + "grad_norm": 0.01288788951933384, + "learning_rate": 1.81964256509497e-05, + "loss": 0.1312, + "step": 21980 + }, + { + "epoch": 0.27478186954673867, + "grad_norm": 2.9393417835235596, + "learning_rate": 1.8195925677875648e-05, + "loss": 2.3301, + "step": 21982 + }, + { + "epoch": 0.2748068701717543, + "grad_norm": 2.1517274379730225, + "learning_rate": 1.8195425642382578e-05, + "loss": 0.3517, + "step": 21984 + }, + { + "epoch": 0.2748318707967699, + "grad_norm": 3.72560977935791, + "learning_rate": 1.8194925544474313e-05, + "loss": 2.0596, + "step": 21986 + }, + { + "epoch": 0.27485687142178555, + "grad_norm": 1.3450376987457275, + "learning_rate": 1.8194425384154647e-05, + "loss": 0.0247, + "step": 21988 + }, + { + "epoch": 0.27488187204680115, + "grad_norm": 3.9731218814849854, + "learning_rate": 1.81939251614274e-05, + "loss": 1.6605, + "step": 21990 + }, + { + "epoch": 0.2749068726718168, + "grad_norm": 0.03212955966591835, + "learning_rate": 1.8193424876296374e-05, + "loss": 0.0427, + "step": 21992 + }, + { + "epoch": 0.27493187329683244, + "grad_norm": 3.023832321166992, + "learning_rate": 1.8192924528765385e-05, + "loss": 0.2319, + "step": 21994 + }, + { + "epoch": 0.27495687392184803, + "grad_norm": 1.6652570962905884, + "learning_rate": 1.8192424118838243e-05, + "loss": 1.1126, + "step": 21996 + }, + { + "epoch": 0.2749818745468637, + "grad_norm": 3.7359824180603027, + "learning_rate": 1.8191923646518755e-05, + "loss": 0.9829, + "step": 21998 + }, + { + "epoch": 0.2750068751718793, + "grad_norm": 4.227418422698975, + "learning_rate": 1.8191423111810734e-05, + "loss": 1.5496, + "step": 22000 + }, + { + "epoch": 0.2750318757968949, + "grad_norm": 2.512866258621216, + "learning_rate": 1.8190922514717994e-05, + "loss": 0.7124, + "step": 22002 + }, + { + "epoch": 0.27505687642191057, + "grad_norm": 2.3740575313568115, + "learning_rate": 1.8190421855244348e-05, + "loss": 0.7777, + "step": 22004 + }, + { + "epoch": 0.27508187704692616, + "grad_norm": 3.154414415359497, + "learning_rate": 1.8189921133393606e-05, + "loss": 0.9138, + "step": 22006 + }, + { + "epoch": 0.2751068776719418, + "grad_norm": 2.5874247550964355, + "learning_rate": 1.8189420349169583e-05, + "loss": 0.5171, + "step": 22008 + }, + { + "epoch": 0.2751318782969574, + "grad_norm": 2.5388429164886475, + "learning_rate": 1.8188919502576096e-05, + "loss": 0.7511, + "step": 22010 + }, + { + "epoch": 0.27515687892197305, + "grad_norm": 3.830132007598877, + "learning_rate": 1.818841859361695e-05, + "loss": 1.4614, + "step": 22012 + }, + { + "epoch": 0.2751818795469887, + "grad_norm": 2.734811782836914, + "learning_rate": 1.818791762229597e-05, + "loss": 0.411, + "step": 22014 + }, + { + "epoch": 0.2752068801720043, + "grad_norm": 1.5915015935897827, + "learning_rate": 1.8187416588616966e-05, + "loss": 0.7018, + "step": 22016 + }, + { + "epoch": 0.27523188079701993, + "grad_norm": 1.1954678297042847, + "learning_rate": 1.8186915492583755e-05, + "loss": 0.1722, + "step": 22018 + }, + { + "epoch": 0.2752568814220355, + "grad_norm": 2.2292535305023193, + "learning_rate": 1.8186414334200154e-05, + "loss": 0.7077, + "step": 22020 + }, + { + "epoch": 0.2752818820470512, + "grad_norm": 0.02702856808900833, + "learning_rate": 1.8185913113469977e-05, + "loss": 0.001, + "step": 22022 + }, + { + "epoch": 0.2753068826720668, + "grad_norm": 2.3341450691223145, + "learning_rate": 1.8185411830397044e-05, + "loss": 1.049, + "step": 22024 + }, + { + "epoch": 0.2753318832970824, + "grad_norm": 3.889493465423584, + "learning_rate": 1.8184910484985174e-05, + "loss": 1.2428, + "step": 22026 + }, + { + "epoch": 0.27535688392209806, + "grad_norm": 1.8190594911575317, + "learning_rate": 1.8184409077238183e-05, + "loss": 0.6848, + "step": 22028 + }, + { + "epoch": 0.27538188454711365, + "grad_norm": 2.257143020629883, + "learning_rate": 1.818390760715989e-05, + "loss": 1.4219, + "step": 22030 + }, + { + "epoch": 0.2754068851721293, + "grad_norm": 7.006102085113525, + "learning_rate": 1.818340607475411e-05, + "loss": 1.7932, + "step": 22032 + }, + { + "epoch": 0.27543188579714495, + "grad_norm": 1.704142451286316, + "learning_rate": 1.8182904480024673e-05, + "loss": 0.3497, + "step": 22034 + }, + { + "epoch": 0.27545688642216054, + "grad_norm": 7.542949676513672, + "learning_rate": 1.8182402822975388e-05, + "loss": 1.0792, + "step": 22036 + }, + { + "epoch": 0.2754818870471762, + "grad_norm": 1.1403729915618896, + "learning_rate": 1.818190110361008e-05, + "loss": 0.2252, + "step": 22038 + }, + { + "epoch": 0.2755068876721918, + "grad_norm": 2.9271469116210938, + "learning_rate": 1.8181399321932574e-05, + "loss": 0.3903, + "step": 22040 + }, + { + "epoch": 0.27553188829720743, + "grad_norm": 3.039431571960449, + "learning_rate": 1.8180897477946685e-05, + "loss": 0.5728, + "step": 22042 + }, + { + "epoch": 0.2755568889222231, + "grad_norm": 2.7757816314697266, + "learning_rate": 1.8180395571656237e-05, + "loss": 1.5625, + "step": 22044 + }, + { + "epoch": 0.27558188954723867, + "grad_norm": 0.008762864395976067, + "learning_rate": 1.8179893603065055e-05, + "loss": 0.3083, + "step": 22046 + }, + { + "epoch": 0.2756068901722543, + "grad_norm": 2.3416638374328613, + "learning_rate": 1.817939157217696e-05, + "loss": 0.2775, + "step": 22048 + }, + { + "epoch": 0.2756318907972699, + "grad_norm": 2.5616016387939453, + "learning_rate": 1.8178889478995773e-05, + "loss": 1.0605, + "step": 22050 + }, + { + "epoch": 0.27565689142228555, + "grad_norm": 1.806735873222351, + "learning_rate": 1.817838732352532e-05, + "loss": 0.7537, + "step": 22052 + }, + { + "epoch": 0.2756818920473012, + "grad_norm": 4.537355422973633, + "learning_rate": 1.8177885105769434e-05, + "loss": 1.0588, + "step": 22054 + }, + { + "epoch": 0.2757068926723168, + "grad_norm": 0.9806625843048096, + "learning_rate": 1.8177382825731923e-05, + "loss": 0.3183, + "step": 22056 + }, + { + "epoch": 0.27573189329733244, + "grad_norm": 2.785602569580078, + "learning_rate": 1.8176880483416625e-05, + "loss": 0.6858, + "step": 22058 + }, + { + "epoch": 0.27575689392234803, + "grad_norm": 0.30671262741088867, + "learning_rate": 1.8176378078827357e-05, + "loss": 0.0308, + "step": 22060 + }, + { + "epoch": 0.2757818945473637, + "grad_norm": 5.949493885040283, + "learning_rate": 1.8175875611967953e-05, + "loss": 1.3012, + "step": 22062 + }, + { + "epoch": 0.27580689517237933, + "grad_norm": 2.3450543880462646, + "learning_rate": 1.8175373082842233e-05, + "loss": 0.6212, + "step": 22064 + }, + { + "epoch": 0.2758318957973949, + "grad_norm": 2.830479383468628, + "learning_rate": 1.8174870491454034e-05, + "loss": 0.9493, + "step": 22066 + }, + { + "epoch": 0.27585689642241057, + "grad_norm": 6.799006938934326, + "learning_rate": 1.8174367837807173e-05, + "loss": 1.6811, + "step": 22068 + }, + { + "epoch": 0.27588189704742616, + "grad_norm": 1.6896156072616577, + "learning_rate": 1.8173865121905483e-05, + "loss": 1.1217, + "step": 22070 + }, + { + "epoch": 0.2759068976724418, + "grad_norm": 3.024461269378662, + "learning_rate": 1.8173362343752794e-05, + "loss": 1.6306, + "step": 22072 + }, + { + "epoch": 0.27593189829745746, + "grad_norm": 5.977072238922119, + "learning_rate": 1.8172859503352932e-05, + "loss": 1.5652, + "step": 22074 + }, + { + "epoch": 0.27595689892247305, + "grad_norm": 3.4052438735961914, + "learning_rate": 1.8172356600709724e-05, + "loss": 0.9389, + "step": 22076 + }, + { + "epoch": 0.2759818995474887, + "grad_norm": 2.3751397132873535, + "learning_rate": 1.817185363582701e-05, + "loss": 0.5636, + "step": 22078 + }, + { + "epoch": 0.2760069001725043, + "grad_norm": 3.755781888961792, + "learning_rate": 1.817135060870861e-05, + "loss": 0.2156, + "step": 22080 + }, + { + "epoch": 0.27603190079751994, + "grad_norm": 3.0965805053710938, + "learning_rate": 1.8170847519358364e-05, + "loss": 0.4581, + "step": 22082 + }, + { + "epoch": 0.2760569014225356, + "grad_norm": 0.003893546760082245, + "learning_rate": 1.8170344367780094e-05, + "loss": 0.6209, + "step": 22084 + }, + { + "epoch": 0.2760819020475512, + "grad_norm": 3.024271011352539, + "learning_rate": 1.8169841153977636e-05, + "loss": 0.5972, + "step": 22086 + }, + { + "epoch": 0.2761069026725668, + "grad_norm": 3.9767720699310303, + "learning_rate": 1.816933787795483e-05, + "loss": 1.0498, + "step": 22088 + }, + { + "epoch": 0.2761319032975824, + "grad_norm": 4.058225631713867, + "learning_rate": 1.8168834539715497e-05, + "loss": 1.6654, + "step": 22090 + }, + { + "epoch": 0.27615690392259806, + "grad_norm": 5.239742755889893, + "learning_rate": 1.8168331139263477e-05, + "loss": 1.1963, + "step": 22092 + }, + { + "epoch": 0.2761819045476137, + "grad_norm": 1.427642583847046, + "learning_rate": 1.81678276766026e-05, + "loss": 0.4986, + "step": 22094 + }, + { + "epoch": 0.2762069051726293, + "grad_norm": 2.983276605606079, + "learning_rate": 1.8167324151736703e-05, + "loss": 0.9916, + "step": 22096 + }, + { + "epoch": 0.27623190579764495, + "grad_norm": 3.0605216026306152, + "learning_rate": 1.816682056466962e-05, + "loss": 0.5304, + "step": 22098 + }, + { + "epoch": 0.27625690642266054, + "grad_norm": 0.016070067882537842, + "learning_rate": 1.8166316915405185e-05, + "loss": 0.0526, + "step": 22100 + }, + { + "epoch": 0.2762819070476762, + "grad_norm": 3.4292070865631104, + "learning_rate": 1.8165813203947238e-05, + "loss": 0.5757, + "step": 22102 + }, + { + "epoch": 0.27630690767269184, + "grad_norm": 2.580726385116577, + "learning_rate": 1.816530943029961e-05, + "loss": 0.7576, + "step": 22104 + }, + { + "epoch": 0.27633190829770743, + "grad_norm": 1.8797507286071777, + "learning_rate": 1.8164805594466143e-05, + "loss": 1.2007, + "step": 22106 + }, + { + "epoch": 0.2763569089227231, + "grad_norm": 2.7004032135009766, + "learning_rate": 1.816430169645067e-05, + "loss": 0.7073, + "step": 22108 + }, + { + "epoch": 0.27638190954773867, + "grad_norm": 1.9303210973739624, + "learning_rate": 1.8163797736257032e-05, + "loss": 0.7144, + "step": 22110 + }, + { + "epoch": 0.2764069101727543, + "grad_norm": 0.014978885650634766, + "learning_rate": 1.816329371388906e-05, + "loss": 0.1795, + "step": 22112 + }, + { + "epoch": 0.27643191079776996, + "grad_norm": 4.048197269439697, + "learning_rate": 1.81627896293506e-05, + "loss": 0.6467, + "step": 22114 + }, + { + "epoch": 0.27645691142278556, + "grad_norm": 4.107072353363037, + "learning_rate": 1.816228548264549e-05, + "loss": 0.796, + "step": 22116 + }, + { + "epoch": 0.2764819120478012, + "grad_norm": 2.969149351119995, + "learning_rate": 1.8161781273777567e-05, + "loss": 0.7933, + "step": 22118 + }, + { + "epoch": 0.2765069126728168, + "grad_norm": 4.753088474273682, + "learning_rate": 1.8161277002750674e-05, + "loss": 0.9612, + "step": 22120 + }, + { + "epoch": 0.27653191329783244, + "grad_norm": 0.04559174180030823, + "learning_rate": 1.8160772669568646e-05, + "loss": 0.8152, + "step": 22122 + }, + { + "epoch": 0.2765569139228481, + "grad_norm": 3.237900495529175, + "learning_rate": 1.8160268274235328e-05, + "loss": 2.1186, + "step": 22124 + }, + { + "epoch": 0.2765819145478637, + "grad_norm": 2.4241325855255127, + "learning_rate": 1.815976381675456e-05, + "loss": 1.1156, + "step": 22126 + }, + { + "epoch": 0.27660691517287933, + "grad_norm": 0.15802700817584991, + "learning_rate": 1.8159259297130188e-05, + "loss": 0.3009, + "step": 22128 + }, + { + "epoch": 0.2766319157978949, + "grad_norm": 4.926868438720703, + "learning_rate": 1.8158754715366047e-05, + "loss": 0.3708, + "step": 22130 + }, + { + "epoch": 0.27665691642291057, + "grad_norm": 1.9245140552520752, + "learning_rate": 1.8158250071465984e-05, + "loss": 0.4051, + "step": 22132 + }, + { + "epoch": 0.2766819170479262, + "grad_norm": 0.14963461458683014, + "learning_rate": 1.8157745365433846e-05, + "loss": 0.643, + "step": 22134 + }, + { + "epoch": 0.2767069176729418, + "grad_norm": 7.142705917358398, + "learning_rate": 1.815724059727347e-05, + "loss": 1.3785, + "step": 22136 + }, + { + "epoch": 0.27673191829795746, + "grad_norm": 0.07465305179357529, + "learning_rate": 1.8156735766988704e-05, + "loss": 1.2512, + "step": 22138 + }, + { + "epoch": 0.27675691892297305, + "grad_norm": 0.10682721436023712, + "learning_rate": 1.8156230874583395e-05, + "loss": 1.094, + "step": 22140 + }, + { + "epoch": 0.2767819195479887, + "grad_norm": 3.485511064529419, + "learning_rate": 1.815572592006138e-05, + "loss": 1.5386, + "step": 22142 + }, + { + "epoch": 0.27680692017300434, + "grad_norm": 0.504654586315155, + "learning_rate": 1.815522090342651e-05, + "loss": 0.8375, + "step": 22144 + }, + { + "epoch": 0.27683192079801994, + "grad_norm": 0.009170379489660263, + "learning_rate": 1.8154715824682633e-05, + "loss": 0.7536, + "step": 22146 + }, + { + "epoch": 0.2768569214230356, + "grad_norm": 2.8439526557922363, + "learning_rate": 1.8154210683833592e-05, + "loss": 0.8984, + "step": 22148 + }, + { + "epoch": 0.2768819220480512, + "grad_norm": 2.504930019378662, + "learning_rate": 1.815370548088324e-05, + "loss": 0.9929, + "step": 22150 + }, + { + "epoch": 0.2769069226730668, + "grad_norm": 1.5266426801681519, + "learning_rate": 1.8153200215835417e-05, + "loss": 0.5795, + "step": 22152 + }, + { + "epoch": 0.27693192329808247, + "grad_norm": 3.658478021621704, + "learning_rate": 1.815269488869397e-05, + "loss": 1.7439, + "step": 22154 + }, + { + "epoch": 0.27695692392309806, + "grad_norm": 5.190890312194824, + "learning_rate": 1.8152189499462758e-05, + "loss": 0.2103, + "step": 22156 + }, + { + "epoch": 0.2769819245481137, + "grad_norm": 2.0892679691314697, + "learning_rate": 1.8151684048145618e-05, + "loss": 0.141, + "step": 22158 + }, + { + "epoch": 0.2770069251731293, + "grad_norm": 2.7250216007232666, + "learning_rate": 1.815117853474641e-05, + "loss": 1.175, + "step": 22160 + }, + { + "epoch": 0.27703192579814495, + "grad_norm": 4.308544158935547, + "learning_rate": 1.815067295926898e-05, + "loss": 0.7353, + "step": 22162 + }, + { + "epoch": 0.2770569264231606, + "grad_norm": 4.392152786254883, + "learning_rate": 1.815016732171717e-05, + "loss": 0.8465, + "step": 22164 + }, + { + "epoch": 0.2770819270481762, + "grad_norm": 0.005222716368734837, + "learning_rate": 1.8149661622094844e-05, + "loss": 0.147, + "step": 22166 + }, + { + "epoch": 0.27710692767319184, + "grad_norm": 0.22041180729866028, + "learning_rate": 1.814915586040585e-05, + "loss": 0.4802, + "step": 22168 + }, + { + "epoch": 0.27713192829820743, + "grad_norm": 3.7242794036865234, + "learning_rate": 1.814865003665403e-05, + "loss": 0.8355, + "step": 22170 + }, + { + "epoch": 0.2771569289232231, + "grad_norm": 7.976824760437012, + "learning_rate": 1.8148144150843246e-05, + "loss": 2.078, + "step": 22172 + }, + { + "epoch": 0.2771819295482387, + "grad_norm": 4.5728230476379395, + "learning_rate": 1.814763820297735e-05, + "loss": 1.4736, + "step": 22174 + }, + { + "epoch": 0.2772069301732543, + "grad_norm": 3.986790657043457, + "learning_rate": 1.8147132193060193e-05, + "loss": 1.8304, + "step": 22176 + }, + { + "epoch": 0.27723193079826997, + "grad_norm": 0.007415687199681997, + "learning_rate": 1.814662612109563e-05, + "loss": 0.2952, + "step": 22178 + }, + { + "epoch": 0.27725693142328556, + "grad_norm": 3.393937349319458, + "learning_rate": 1.814611998708751e-05, + "loss": 1.0756, + "step": 22180 + }, + { + "epoch": 0.2772819320483012, + "grad_norm": 2.862558126449585, + "learning_rate": 1.81456137910397e-05, + "loss": 0.1697, + "step": 22182 + }, + { + "epoch": 0.27730693267331685, + "grad_norm": 2.777846574783325, + "learning_rate": 1.814510753295604e-05, + "loss": 0.3228, + "step": 22184 + }, + { + "epoch": 0.27733193329833244, + "grad_norm": 3.6745386123657227, + "learning_rate": 1.8144601212840398e-05, + "loss": 1.32, + "step": 22186 + }, + { + "epoch": 0.2773569339233481, + "grad_norm": 0.07813695818185806, + "learning_rate": 1.8144094830696624e-05, + "loss": 1.4949, + "step": 22188 + }, + { + "epoch": 0.2773819345483637, + "grad_norm": 2.847507953643799, + "learning_rate": 1.8143588386528575e-05, + "loss": 0.9244, + "step": 22190 + }, + { + "epoch": 0.27740693517337933, + "grad_norm": 2.477062940597534, + "learning_rate": 1.8143081880340106e-05, + "loss": 0.4339, + "step": 22192 + }, + { + "epoch": 0.277431935798395, + "grad_norm": 2.592097043991089, + "learning_rate": 1.814257531213508e-05, + "loss": 0.8372, + "step": 22194 + }, + { + "epoch": 0.27745693642341057, + "grad_norm": 3.1013262271881104, + "learning_rate": 1.814206868191735e-05, + "loss": 0.4947, + "step": 22196 + }, + { + "epoch": 0.2774819370484262, + "grad_norm": 2.282989501953125, + "learning_rate": 1.8141561989690778e-05, + "loss": 1.2386, + "step": 22198 + }, + { + "epoch": 0.2775069376734418, + "grad_norm": 5.8767828941345215, + "learning_rate": 1.814105523545922e-05, + "loss": 1.3018, + "step": 22200 + }, + { + "epoch": 0.27753193829845746, + "grad_norm": 1.4101628065109253, + "learning_rate": 1.8140548419226535e-05, + "loss": 0.7118, + "step": 22202 + }, + { + "epoch": 0.2775569389234731, + "grad_norm": 4.828278064727783, + "learning_rate": 1.814004154099659e-05, + "loss": 1.574, + "step": 22204 + }, + { + "epoch": 0.2775819395484887, + "grad_norm": 0.026865648105740547, + "learning_rate": 1.8139534600773233e-05, + "loss": 0.0007, + "step": 22206 + }, + { + "epoch": 0.27760694017350435, + "grad_norm": 0.08251164108514786, + "learning_rate": 1.8139027598560336e-05, + "loss": 1.0422, + "step": 22208 + }, + { + "epoch": 0.27763194079851994, + "grad_norm": 0.006701998878270388, + "learning_rate": 1.8138520534361754e-05, + "loss": 0.7454, + "step": 22210 + }, + { + "epoch": 0.2776569414235356, + "grad_norm": 4.396459102630615, + "learning_rate": 1.813801340818135e-05, + "loss": 0.9392, + "step": 22212 + }, + { + "epoch": 0.27768194204855123, + "grad_norm": 3.2708096504211426, + "learning_rate": 1.8137506220022988e-05, + "loss": 0.4819, + "step": 22214 + }, + { + "epoch": 0.2777069426735668, + "grad_norm": 5.242308139801025, + "learning_rate": 1.813699896989053e-05, + "loss": 2.1879, + "step": 22216 + }, + { + "epoch": 0.2777319432985825, + "grad_norm": 2.162137269973755, + "learning_rate": 1.8136491657787837e-05, + "loss": 0.9879, + "step": 22218 + }, + { + "epoch": 0.27775694392359807, + "grad_norm": 2.853498935699463, + "learning_rate": 1.8135984283718775e-05, + "loss": 1.562, + "step": 22220 + }, + { + "epoch": 0.2777819445486137, + "grad_norm": 3.7360341548919678, + "learning_rate": 1.813547684768721e-05, + "loss": 0.8603, + "step": 22222 + }, + { + "epoch": 0.27780694517362936, + "grad_norm": 4.132920265197754, + "learning_rate": 1.8134969349696997e-05, + "loss": 1.1736, + "step": 22224 + }, + { + "epoch": 0.27783194579864495, + "grad_norm": 3.2063825130462646, + "learning_rate": 1.8134461789752013e-05, + "loss": 1.3079, + "step": 22226 + }, + { + "epoch": 0.2778569464236606, + "grad_norm": 3.735163927078247, + "learning_rate": 1.8133954167856113e-05, + "loss": 1.1027, + "step": 22228 + }, + { + "epoch": 0.2778819470486762, + "grad_norm": 4.226227283477783, + "learning_rate": 1.8133446484013177e-05, + "loss": 1.2762, + "step": 22230 + }, + { + "epoch": 0.27790694767369184, + "grad_norm": 1.2557412385940552, + "learning_rate": 1.8132938738227053e-05, + "loss": 0.4124, + "step": 22232 + }, + { + "epoch": 0.2779319482987075, + "grad_norm": 3.9321951866149902, + "learning_rate": 1.8132430930501622e-05, + "loss": 1.3216, + "step": 22234 + }, + { + "epoch": 0.2779569489237231, + "grad_norm": 0.019328903406858444, + "learning_rate": 1.813192306084075e-05, + "loss": 1.3216, + "step": 22236 + }, + { + "epoch": 0.2779819495487387, + "grad_norm": 1.7385069131851196, + "learning_rate": 1.8131415129248298e-05, + "loss": 1.2975, + "step": 22238 + }, + { + "epoch": 0.2780069501737543, + "grad_norm": 3.3371541500091553, + "learning_rate": 1.813090713572814e-05, + "loss": 0.6131, + "step": 22240 + }, + { + "epoch": 0.27803195079876997, + "grad_norm": 2.923759698867798, + "learning_rate": 1.8130399080284143e-05, + "loss": 1.644, + "step": 22242 + }, + { + "epoch": 0.2780569514237856, + "grad_norm": 3.1856820583343506, + "learning_rate": 1.8129890962920177e-05, + "loss": 1.4082, + "step": 22244 + }, + { + "epoch": 0.2780819520488012, + "grad_norm": 0.007950331084430218, + "learning_rate": 1.812938278364011e-05, + "loss": 0.6125, + "step": 22246 + }, + { + "epoch": 0.27810695267381685, + "grad_norm": 1.704097867012024, + "learning_rate": 1.8128874542447815e-05, + "loss": 0.1563, + "step": 22248 + }, + { + "epoch": 0.27813195329883245, + "grad_norm": 0.6723847985267639, + "learning_rate": 1.8128366239347158e-05, + "loss": 0.7434, + "step": 22250 + }, + { + "epoch": 0.2781569539238481, + "grad_norm": 0.04178125038743019, + "learning_rate": 1.8127857874342013e-05, + "loss": 0.6859, + "step": 22252 + }, + { + "epoch": 0.27818195454886374, + "grad_norm": 3.439464807510376, + "learning_rate": 1.8127349447436256e-05, + "loss": 1.9755, + "step": 22254 + }, + { + "epoch": 0.27820695517387933, + "grad_norm": 1.416151523590088, + "learning_rate": 1.812684095863375e-05, + "loss": 2.2466, + "step": 22256 + }, + { + "epoch": 0.278231955798895, + "grad_norm": 3.2998435497283936, + "learning_rate": 1.812633240793838e-05, + "loss": 1.8227, + "step": 22258 + }, + { + "epoch": 0.2782569564239106, + "grad_norm": 2.5859029293060303, + "learning_rate": 1.8125823795354005e-05, + "loss": 0.2382, + "step": 22260 + }, + { + "epoch": 0.2782819570489262, + "grad_norm": 3.040870189666748, + "learning_rate": 1.8125315120884505e-05, + "loss": 1.0606, + "step": 22262 + }, + { + "epoch": 0.27830695767394187, + "grad_norm": 3.321779251098633, + "learning_rate": 1.8124806384533754e-05, + "loss": 1.1604, + "step": 22264 + }, + { + "epoch": 0.27833195829895746, + "grad_norm": 4.0292487144470215, + "learning_rate": 1.812429758630563e-05, + "loss": 1.8419, + "step": 22266 + }, + { + "epoch": 0.2783569589239731, + "grad_norm": 3.0169665813446045, + "learning_rate": 1.8123788726204004e-05, + "loss": 0.9473, + "step": 22268 + }, + { + "epoch": 0.2783819595489887, + "grad_norm": 1.887647271156311, + "learning_rate": 1.812327980423275e-05, + "loss": 1.0491, + "step": 22270 + }, + { + "epoch": 0.27840696017400435, + "grad_norm": 4.268537521362305, + "learning_rate": 1.8122770820395746e-05, + "loss": 1.7554, + "step": 22272 + }, + { + "epoch": 0.27843196079902, + "grad_norm": 5.215592861175537, + "learning_rate": 1.8122261774696866e-05, + "loss": 1.297, + "step": 22274 + }, + { + "epoch": 0.2784569614240356, + "grad_norm": 4.944152355194092, + "learning_rate": 1.8121752667139994e-05, + "loss": 0.8366, + "step": 22276 + }, + { + "epoch": 0.27848196204905123, + "grad_norm": 7.417101860046387, + "learning_rate": 1.8121243497729e-05, + "loss": 1.5729, + "step": 22278 + }, + { + "epoch": 0.2785069626740668, + "grad_norm": 4.745904445648193, + "learning_rate": 1.812073426646776e-05, + "loss": 1.1155, + "step": 22280 + }, + { + "epoch": 0.2785319632990825, + "grad_norm": 2.4737086296081543, + "learning_rate": 1.8120224973360165e-05, + "loss": 0.2453, + "step": 22282 + }, + { + "epoch": 0.2785569639240981, + "grad_norm": 5.512209892272949, + "learning_rate": 1.8119715618410077e-05, + "loss": 0.7466, + "step": 22284 + }, + { + "epoch": 0.2785819645491137, + "grad_norm": 2.8202755451202393, + "learning_rate": 1.8119206201621384e-05, + "loss": 1.332, + "step": 22286 + }, + { + "epoch": 0.27860696517412936, + "grad_norm": 4.460962772369385, + "learning_rate": 1.8118696722997965e-05, + "loss": 1.856, + "step": 22288 + }, + { + "epoch": 0.27863196579914495, + "grad_norm": 1.8086278438568115, + "learning_rate": 1.8118187182543704e-05, + "loss": 0.7821, + "step": 22290 + }, + { + "epoch": 0.2786569664241606, + "grad_norm": 2.71840238571167, + "learning_rate": 1.8117677580262473e-05, + "loss": 0.9311, + "step": 22292 + }, + { + "epoch": 0.27868196704917625, + "grad_norm": 3.737499475479126, + "learning_rate": 1.811716791615816e-05, + "loss": 0.187, + "step": 22294 + }, + { + "epoch": 0.27870696767419184, + "grad_norm": 10.137548446655273, + "learning_rate": 1.8116658190234642e-05, + "loss": 0.3057, + "step": 22296 + }, + { + "epoch": 0.2787319682992075, + "grad_norm": 4.587704181671143, + "learning_rate": 1.8116148402495805e-05, + "loss": 0.1684, + "step": 22298 + }, + { + "epoch": 0.2787569689242231, + "grad_norm": 5.767232418060303, + "learning_rate": 1.8115638552945528e-05, + "loss": 2.0586, + "step": 22300 + }, + { + "epoch": 0.27878196954923873, + "grad_norm": 3.969024658203125, + "learning_rate": 1.8115128641587696e-05, + "loss": 1.0659, + "step": 22302 + }, + { + "epoch": 0.2788069701742544, + "grad_norm": 1.2018978595733643, + "learning_rate": 1.811461866842619e-05, + "loss": 0.0483, + "step": 22304 + }, + { + "epoch": 0.27883197079926997, + "grad_norm": 6.908449172973633, + "learning_rate": 1.81141086334649e-05, + "loss": 1.6603, + "step": 22306 + }, + { + "epoch": 0.2788569714242856, + "grad_norm": 2.5577943325042725, + "learning_rate": 1.8113598536707706e-05, + "loss": 1.1983, + "step": 22308 + }, + { + "epoch": 0.2788819720493012, + "grad_norm": 4.8626813888549805, + "learning_rate": 1.8113088378158488e-05, + "loss": 1.6703, + "step": 22310 + }, + { + "epoch": 0.27890697267431686, + "grad_norm": 0.0053259520791471004, + "learning_rate": 1.811257815782114e-05, + "loss": 0.5112, + "step": 22312 + }, + { + "epoch": 0.2789319732993325, + "grad_norm": 2.532519578933716, + "learning_rate": 1.8112067875699543e-05, + "loss": 0.3677, + "step": 22314 + }, + { + "epoch": 0.2789569739243481, + "grad_norm": 2.536203145980835, + "learning_rate": 1.8111557531797585e-05, + "loss": 0.7559, + "step": 22316 + }, + { + "epoch": 0.27898197454936374, + "grad_norm": 3.45021915435791, + "learning_rate": 1.811104712611915e-05, + "loss": 0.8935, + "step": 22318 + }, + { + "epoch": 0.27900697517437933, + "grad_norm": 3.7415826320648193, + "learning_rate": 1.8110536658668128e-05, + "loss": 1.5977, + "step": 22320 + }, + { + "epoch": 0.279031975799395, + "grad_norm": 2.8361928462982178, + "learning_rate": 1.8110026129448405e-05, + "loss": 0.7887, + "step": 22322 + }, + { + "epoch": 0.27905697642441063, + "grad_norm": 4.28811502456665, + "learning_rate": 1.810951553846387e-05, + "loss": 1.381, + "step": 22324 + }, + { + "epoch": 0.2790819770494262, + "grad_norm": 5.129857540130615, + "learning_rate": 1.8109004885718416e-05, + "loss": 0.4332, + "step": 22326 + }, + { + "epoch": 0.27910697767444187, + "grad_norm": 0.5564095973968506, + "learning_rate": 1.810849417121592e-05, + "loss": 0.0144, + "step": 22328 + }, + { + "epoch": 0.27913197829945746, + "grad_norm": 3.7735085487365723, + "learning_rate": 1.8107983394960286e-05, + "loss": 1.2681, + "step": 22330 + }, + { + "epoch": 0.2791569789244731, + "grad_norm": 8.634010314941406, + "learning_rate": 1.8107472556955394e-05, + "loss": 1.3974, + "step": 22332 + }, + { + "epoch": 0.27918197954948876, + "grad_norm": 4.6083526611328125, + "learning_rate": 1.8106961657205137e-05, + "loss": 1.4271, + "step": 22334 + }, + { + "epoch": 0.27920698017450435, + "grad_norm": 6.203512668609619, + "learning_rate": 1.8106450695713405e-05, + "loss": 1.4329, + "step": 22336 + }, + { + "epoch": 0.27923198079952, + "grad_norm": 2.4368011951446533, + "learning_rate": 1.8105939672484094e-05, + "loss": 0.6442, + "step": 22338 + }, + { + "epoch": 0.2792569814245356, + "grad_norm": 2.534170150756836, + "learning_rate": 1.810542858752109e-05, + "loss": 0.6353, + "step": 22340 + }, + { + "epoch": 0.27928198204955124, + "grad_norm": 2.6696572303771973, + "learning_rate": 1.810491744082829e-05, + "loss": 0.9021, + "step": 22342 + }, + { + "epoch": 0.2793069826745669, + "grad_norm": 3.4479031562805176, + "learning_rate": 1.8104406232409583e-05, + "loss": 0.9494, + "step": 22344 + }, + { + "epoch": 0.2793319832995825, + "grad_norm": 3.651827812194824, + "learning_rate": 1.8103894962268867e-05, + "loss": 2.6215, + "step": 22346 + }, + { + "epoch": 0.2793569839245981, + "grad_norm": 3.4075517654418945, + "learning_rate": 1.810338363041003e-05, + "loss": 1.5043, + "step": 22348 + }, + { + "epoch": 0.2793819845496137, + "grad_norm": 1.6244916915893555, + "learning_rate": 1.810287223683697e-05, + "loss": 0.1862, + "step": 22350 + }, + { + "epoch": 0.27940698517462936, + "grad_norm": 1.96490478515625, + "learning_rate": 1.8102360781553582e-05, + "loss": 1.3896, + "step": 22352 + }, + { + "epoch": 0.279431985799645, + "grad_norm": 1.6531528234481812, + "learning_rate": 1.810184926456376e-05, + "loss": 0.0693, + "step": 22354 + }, + { + "epoch": 0.2794569864246606, + "grad_norm": 5.414399147033691, + "learning_rate": 1.8101337685871397e-05, + "loss": 0.8593, + "step": 22356 + }, + { + "epoch": 0.27948198704967625, + "grad_norm": 3.0417118072509766, + "learning_rate": 1.8100826045480396e-05, + "loss": 1.3954, + "step": 22358 + }, + { + "epoch": 0.27950698767469184, + "grad_norm": 3.2612085342407227, + "learning_rate": 1.810031434339465e-05, + "loss": 0.638, + "step": 22360 + }, + { + "epoch": 0.2795319882997075, + "grad_norm": 2.1525399684906006, + "learning_rate": 1.809980257961805e-05, + "loss": 1.415, + "step": 22362 + }, + { + "epoch": 0.27955698892472314, + "grad_norm": 1.2750850915908813, + "learning_rate": 1.8099290754154505e-05, + "loss": 1.1817, + "step": 22364 + }, + { + "epoch": 0.27958198954973873, + "grad_norm": 0.0026650545187294483, + "learning_rate": 1.80987788670079e-05, + "loss": 0.5219, + "step": 22366 + }, + { + "epoch": 0.2796069901747544, + "grad_norm": 3.7464303970336914, + "learning_rate": 1.8098266918182146e-05, + "loss": 1.0832, + "step": 22368 + }, + { + "epoch": 0.27963199079976997, + "grad_norm": 0.005754820071160793, + "learning_rate": 1.8097754907681135e-05, + "loss": 0.0157, + "step": 22370 + }, + { + "epoch": 0.2796569914247856, + "grad_norm": 14.226799011230469, + "learning_rate": 1.8097242835508768e-05, + "loss": 1.0727, + "step": 22372 + }, + { + "epoch": 0.27968199204980126, + "grad_norm": 3.084461212158203, + "learning_rate": 1.8096730701668942e-05, + "loss": 0.5433, + "step": 22374 + }, + { + "epoch": 0.27970699267481686, + "grad_norm": 3.4833426475524902, + "learning_rate": 1.8096218506165565e-05, + "loss": 0.5053, + "step": 22376 + }, + { + "epoch": 0.2797319932998325, + "grad_norm": 2.2110278606414795, + "learning_rate": 1.8095706249002526e-05, + "loss": 0.6027, + "step": 22378 + }, + { + "epoch": 0.2797569939248481, + "grad_norm": 0.011505071073770523, + "learning_rate": 1.809519393018374e-05, + "loss": 0.1684, + "step": 22380 + }, + { + "epoch": 0.27978199454986374, + "grad_norm": 0.58389812707901, + "learning_rate": 1.8094681549713096e-05, + "loss": 0.0399, + "step": 22382 + }, + { + "epoch": 0.2798069951748794, + "grad_norm": 3.4736759662628174, + "learning_rate": 1.8094169107594505e-05, + "loss": 1.1212, + "step": 22384 + }, + { + "epoch": 0.279831995799895, + "grad_norm": 2.6716277599334717, + "learning_rate": 1.8093656603831867e-05, + "loss": 0.5472, + "step": 22386 + }, + { + "epoch": 0.27985699642491063, + "grad_norm": 3.0676567554473877, + "learning_rate": 1.8093144038429084e-05, + "loss": 0.2476, + "step": 22388 + }, + { + "epoch": 0.2798819970499262, + "grad_norm": 3.1941075325012207, + "learning_rate": 1.809263141139006e-05, + "loss": 0.3203, + "step": 22390 + }, + { + "epoch": 0.27990699767494187, + "grad_norm": 4.985957622528076, + "learning_rate": 1.80921187227187e-05, + "loss": 2.0838, + "step": 22392 + }, + { + "epoch": 0.2799319982999575, + "grad_norm": 5.406824588775635, + "learning_rate": 1.8091605972418908e-05, + "loss": 0.8078, + "step": 22394 + }, + { + "epoch": 0.2799569989249731, + "grad_norm": 2.140930652618408, + "learning_rate": 1.809109316049459e-05, + "loss": 1.212, + "step": 22396 + }, + { + "epoch": 0.27998199954998876, + "grad_norm": 2.8564090728759766, + "learning_rate": 1.809058028694965e-05, + "loss": 0.4602, + "step": 22398 + }, + { + "epoch": 0.28000700017500435, + "grad_norm": 2.2847180366516113, + "learning_rate": 1.8090067351787994e-05, + "loss": 0.1862, + "step": 22400 + }, + { + "epoch": 0.28003200080002, + "grad_norm": 0.6458494663238525, + "learning_rate": 1.808955435501353e-05, + "loss": 0.039, + "step": 22402 + }, + { + "epoch": 0.28005700142503565, + "grad_norm": 6.552790641784668, + "learning_rate": 1.8089041296630167e-05, + "loss": 1.6236, + "step": 22404 + }, + { + "epoch": 0.28008200205005124, + "grad_norm": 5.065819263458252, + "learning_rate": 1.8088528176641804e-05, + "loss": 1.0342, + "step": 22406 + }, + { + "epoch": 0.2801070026750669, + "grad_norm": 10.389139175415039, + "learning_rate": 1.8088014995052357e-05, + "loss": 2.4903, + "step": 22408 + }, + { + "epoch": 0.2801320033000825, + "grad_norm": 0.007825713604688644, + "learning_rate": 1.808750175186573e-05, + "loss": 0.8784, + "step": 22410 + }, + { + "epoch": 0.2801570039250981, + "grad_norm": 2.3029985427856445, + "learning_rate": 1.8086988447085835e-05, + "loss": 0.4331, + "step": 22412 + }, + { + "epoch": 0.2801820045501138, + "grad_norm": 0.3771038353443146, + "learning_rate": 1.808647508071658e-05, + "loss": 0.4849, + "step": 22414 + }, + { + "epoch": 0.28020700517512936, + "grad_norm": 0.011246937327086926, + "learning_rate": 1.808596165276187e-05, + "loss": 0.9249, + "step": 22416 + }, + { + "epoch": 0.280232005800145, + "grad_norm": 0.8620988130569458, + "learning_rate": 1.8085448163225627e-05, + "loss": 0.6834, + "step": 22418 + }, + { + "epoch": 0.2802570064251606, + "grad_norm": 2.0743327140808105, + "learning_rate": 1.8084934612111747e-05, + "loss": 0.5845, + "step": 22420 + }, + { + "epoch": 0.28028200705017625, + "grad_norm": 7.712711334228516, + "learning_rate": 1.8084420999424153e-05, + "loss": 1.6383, + "step": 22422 + }, + { + "epoch": 0.2803070076751919, + "grad_norm": 3.100604772567749, + "learning_rate": 1.808390732516675e-05, + "loss": 1.3675, + "step": 22424 + }, + { + "epoch": 0.2803320083002075, + "grad_norm": 3.047637462615967, + "learning_rate": 1.8083393589343452e-05, + "loss": 0.3107, + "step": 22426 + }, + { + "epoch": 0.28035700892522314, + "grad_norm": 2.3308255672454834, + "learning_rate": 1.808287979195817e-05, + "loss": 1.258, + "step": 22428 + }, + { + "epoch": 0.28038200955023873, + "grad_norm": 5.205602169036865, + "learning_rate": 1.8082365933014824e-05, + "loss": 0.9573, + "step": 22430 + }, + { + "epoch": 0.2804070101752544, + "grad_norm": 3.960265874862671, + "learning_rate": 1.8081852012517317e-05, + "loss": 1.2429, + "step": 22432 + }, + { + "epoch": 0.28043201080027, + "grad_norm": 0.4298723638057709, + "learning_rate": 1.8081338030469567e-05, + "loss": 0.6799, + "step": 22434 + }, + { + "epoch": 0.2804570114252856, + "grad_norm": 4.247316837310791, + "learning_rate": 1.8080823986875493e-05, + "loss": 0.94, + "step": 22436 + }, + { + "epoch": 0.28048201205030127, + "grad_norm": 1.2925852537155151, + "learning_rate": 1.8080309881739002e-05, + "loss": 0.668, + "step": 22438 + }, + { + "epoch": 0.28050701267531686, + "grad_norm": 0.011049228720366955, + "learning_rate": 1.807979571506402e-05, + "loss": 0.544, + "step": 22440 + }, + { + "epoch": 0.2805320133003325, + "grad_norm": 1.7056645154953003, + "learning_rate": 1.807928148685445e-05, + "loss": 0.7173, + "step": 22442 + }, + { + "epoch": 0.28055701392534815, + "grad_norm": 4.1333909034729, + "learning_rate": 1.807876719711422e-05, + "loss": 1.2224, + "step": 22444 + }, + { + "epoch": 0.28058201455036375, + "grad_norm": 0.7411629557609558, + "learning_rate": 1.8078252845847238e-05, + "loss": 0.4506, + "step": 22446 + }, + { + "epoch": 0.2806070151753794, + "grad_norm": 4.496321201324463, + "learning_rate": 1.8077738433057424e-05, + "loss": 1.9099, + "step": 22448 + }, + { + "epoch": 0.280632015800395, + "grad_norm": 3.7563164234161377, + "learning_rate": 1.80772239587487e-05, + "loss": 1.1523, + "step": 22450 + }, + { + "epoch": 0.28065701642541063, + "grad_norm": 2.4868152141571045, + "learning_rate": 1.8076709422924978e-05, + "loss": 0.6413, + "step": 22452 + }, + { + "epoch": 0.2806820170504263, + "grad_norm": 3.2037642002105713, + "learning_rate": 1.807619482559018e-05, + "loss": 0.9813, + "step": 22454 + }, + { + "epoch": 0.28070701767544187, + "grad_norm": 1.3692227602005005, + "learning_rate": 1.8075680166748223e-05, + "loss": 0.2322, + "step": 22456 + }, + { + "epoch": 0.2807320183004575, + "grad_norm": 2.4895944595336914, + "learning_rate": 1.807516544640303e-05, + "loss": 0.6235, + "step": 22458 + }, + { + "epoch": 0.2807570189254731, + "grad_norm": 0.049575645476579666, + "learning_rate": 1.8074650664558518e-05, + "loss": 0.0125, + "step": 22460 + }, + { + "epoch": 0.28078201955048876, + "grad_norm": 6.345440864562988, + "learning_rate": 1.8074135821218608e-05, + "loss": 1.6445, + "step": 22462 + }, + { + "epoch": 0.2808070201755044, + "grad_norm": 3.8424625396728516, + "learning_rate": 1.807362091638722e-05, + "loss": 1.2565, + "step": 22464 + }, + { + "epoch": 0.28083202080052, + "grad_norm": 4.038275241851807, + "learning_rate": 1.8073105950068283e-05, + "loss": 0.7585, + "step": 22466 + }, + { + "epoch": 0.28085702142553565, + "grad_norm": 2.93155574798584, + "learning_rate": 1.8072590922265705e-05, + "loss": 1.077, + "step": 22468 + }, + { + "epoch": 0.28088202205055124, + "grad_norm": 3.752326488494873, + "learning_rate": 1.807207583298342e-05, + "loss": 1.0613, + "step": 22470 + }, + { + "epoch": 0.2809070226755669, + "grad_norm": 3.215703248977661, + "learning_rate": 1.8071560682225346e-05, + "loss": 1.1422, + "step": 22472 + }, + { + "epoch": 0.28093202330058253, + "grad_norm": 6.5502166748046875, + "learning_rate": 1.8071045469995408e-05, + "loss": 1.0584, + "step": 22474 + }, + { + "epoch": 0.2809570239255981, + "grad_norm": 3.9444119930267334, + "learning_rate": 1.807053019629753e-05, + "loss": 1.2281, + "step": 22476 + }, + { + "epoch": 0.2809820245506138, + "grad_norm": 1.4602136611938477, + "learning_rate": 1.8070014861135632e-05, + "loss": 0.7611, + "step": 22478 + }, + { + "epoch": 0.28100702517562937, + "grad_norm": 3.333648443222046, + "learning_rate": 1.8069499464513645e-05, + "loss": 1.0741, + "step": 22480 + }, + { + "epoch": 0.281032025800645, + "grad_norm": 2.6899781227111816, + "learning_rate": 1.8068984006435488e-05, + "loss": 1.5306, + "step": 22482 + }, + { + "epoch": 0.28105702642566066, + "grad_norm": 2.659675359725952, + "learning_rate": 1.8068468486905093e-05, + "loss": 1.5654, + "step": 22484 + }, + { + "epoch": 0.28108202705067625, + "grad_norm": 0.27673017978668213, + "learning_rate": 1.806795290592638e-05, + "loss": 0.6734, + "step": 22486 + }, + { + "epoch": 0.2811070276756919, + "grad_norm": 0.04627593606710434, + "learning_rate": 1.806743726350328e-05, + "loss": 0.3659, + "step": 22488 + }, + { + "epoch": 0.2811320283007075, + "grad_norm": 1.4945379495620728, + "learning_rate": 1.8066921559639722e-05, + "loss": 0.4192, + "step": 22490 + }, + { + "epoch": 0.28115702892572314, + "grad_norm": 2.007688045501709, + "learning_rate": 1.8066405794339625e-05, + "loss": 0.0993, + "step": 22492 + }, + { + "epoch": 0.2811820295507388, + "grad_norm": 5.401791572570801, + "learning_rate": 1.8065889967606924e-05, + "loss": 0.4572, + "step": 22494 + }, + { + "epoch": 0.2812070301757544, + "grad_norm": 2.339076042175293, + "learning_rate": 1.8065374079445548e-05, + "loss": 0.1522, + "step": 22496 + }, + { + "epoch": 0.28123203080077, + "grad_norm": 0.11060594767332077, + "learning_rate": 1.8064858129859423e-05, + "loss": 0.4215, + "step": 22498 + }, + { + "epoch": 0.2812570314257856, + "grad_norm": 5.868184566497803, + "learning_rate": 1.8064342118852476e-05, + "loss": 0.8187, + "step": 22500 + }, + { + "epoch": 0.28128203205080127, + "grad_norm": 0.027463020756840706, + "learning_rate": 1.806382604642864e-05, + "loss": 0.8949, + "step": 22502 + }, + { + "epoch": 0.2813070326758169, + "grad_norm": 4.068359375, + "learning_rate": 1.8063309912591846e-05, + "loss": 1.4028, + "step": 22504 + }, + { + "epoch": 0.2813320333008325, + "grad_norm": 0.004840783309191465, + "learning_rate": 1.8062793717346027e-05, + "loss": 1.0037, + "step": 22506 + }, + { + "epoch": 0.28135703392584815, + "grad_norm": 0.007624255493283272, + "learning_rate": 1.8062277460695106e-05, + "loss": 0.2931, + "step": 22508 + }, + { + "epoch": 0.28138203455086375, + "grad_norm": 6.668421268463135, + "learning_rate": 1.8061761142643025e-05, + "loss": 2.2728, + "step": 22510 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 3.255150556564331, + "learning_rate": 1.8061244763193707e-05, + "loss": 0.721, + "step": 22512 + }, + { + "epoch": 0.28143203580089504, + "grad_norm": 4.877946853637695, + "learning_rate": 1.806072832235109e-05, + "loss": 0.4163, + "step": 22514 + }, + { + "epoch": 0.28145703642591063, + "grad_norm": 2.4448025226593018, + "learning_rate": 1.806021182011911e-05, + "loss": 0.8396, + "step": 22516 + }, + { + "epoch": 0.2814820370509263, + "grad_norm": 2.907005548477173, + "learning_rate": 1.805969525650169e-05, + "loss": 0.8931, + "step": 22518 + }, + { + "epoch": 0.2815070376759419, + "grad_norm": 1.4807757139205933, + "learning_rate": 1.8059178631502776e-05, + "loss": 0.5098, + "step": 22520 + }, + { + "epoch": 0.2815320383009575, + "grad_norm": 5.672238826751709, + "learning_rate": 1.8058661945126295e-05, + "loss": 1.2043, + "step": 22522 + }, + { + "epoch": 0.28155703892597317, + "grad_norm": 5.820294380187988, + "learning_rate": 1.8058145197376185e-05, + "loss": 1.6655, + "step": 22524 + }, + { + "epoch": 0.28158203955098876, + "grad_norm": 2.7004857063293457, + "learning_rate": 1.805762838825638e-05, + "loss": 0.1862, + "step": 22526 + }, + { + "epoch": 0.2816070401760044, + "grad_norm": 2.804824113845825, + "learning_rate": 1.8057111517770817e-05, + "loss": 0.7772, + "step": 22528 + }, + { + "epoch": 0.28163204080102, + "grad_norm": 3.315044403076172, + "learning_rate": 1.8056594585923433e-05, + "loss": 1.8441, + "step": 22530 + }, + { + "epoch": 0.28165704142603565, + "grad_norm": 5.45849084854126, + "learning_rate": 1.805607759271816e-05, + "loss": 1.4948, + "step": 22532 + }, + { + "epoch": 0.2816820420510513, + "grad_norm": 3.415188789367676, + "learning_rate": 1.805556053815894e-05, + "loss": 1.543, + "step": 22534 + }, + { + "epoch": 0.2817070426760669, + "grad_norm": 3.009883165359497, + "learning_rate": 1.8055043422249716e-05, + "loss": 0.7803, + "step": 22536 + }, + { + "epoch": 0.28173204330108254, + "grad_norm": 3.158080577850342, + "learning_rate": 1.8054526244994416e-05, + "loss": 0.8449, + "step": 22538 + }, + { + "epoch": 0.2817570439260981, + "grad_norm": 2.603726387023926, + "learning_rate": 1.8054009006396984e-05, + "loss": 0.6746, + "step": 22540 + }, + { + "epoch": 0.2817820445511138, + "grad_norm": 1.414620280265808, + "learning_rate": 1.8053491706461356e-05, + "loss": 2.1533, + "step": 22542 + }, + { + "epoch": 0.2818070451761294, + "grad_norm": 3.4766361713409424, + "learning_rate": 1.805297434519148e-05, + "loss": 1.4402, + "step": 22544 + }, + { + "epoch": 0.281832045801145, + "grad_norm": 4.6289849281311035, + "learning_rate": 1.8052456922591285e-05, + "loss": 1.2209, + "step": 22546 + }, + { + "epoch": 0.28185704642616066, + "grad_norm": 8.29099178314209, + "learning_rate": 1.805193943866472e-05, + "loss": 1.4283, + "step": 22548 + }, + { + "epoch": 0.28188204705117625, + "grad_norm": 3.584177255630493, + "learning_rate": 1.8051421893415718e-05, + "loss": 0.8254, + "step": 22550 + }, + { + "epoch": 0.2819070476761919, + "grad_norm": 0.015041586942970753, + "learning_rate": 1.8050904286848227e-05, + "loss": 1.0065, + "step": 22552 + }, + { + "epoch": 0.28193204830120755, + "grad_norm": 2.7432801723480225, + "learning_rate": 1.805038661896619e-05, + "loss": 1.1161, + "step": 22554 + }, + { + "epoch": 0.28195704892622314, + "grad_norm": 0.006265197414904833, + "learning_rate": 1.8049868889773546e-05, + "loss": 0.7968, + "step": 22556 + }, + { + "epoch": 0.2819820495512388, + "grad_norm": 3.1105306148529053, + "learning_rate": 1.8049351099274236e-05, + "loss": 0.2549, + "step": 22558 + }, + { + "epoch": 0.2820070501762544, + "grad_norm": 2.073570966720581, + "learning_rate": 1.804883324747221e-05, + "loss": 0.8262, + "step": 22560 + }, + { + "epoch": 0.28203205080127003, + "grad_norm": 1.417624592781067, + "learning_rate": 1.8048315334371408e-05, + "loss": 0.1117, + "step": 22562 + }, + { + "epoch": 0.2820570514262857, + "grad_norm": 4.240817070007324, + "learning_rate": 1.8047797359975774e-05, + "loss": 1.4675, + "step": 22564 + }, + { + "epoch": 0.28208205205130127, + "grad_norm": 6.87900972366333, + "learning_rate": 1.8047279324289252e-05, + "loss": 1.0659, + "step": 22566 + }, + { + "epoch": 0.2821070526763169, + "grad_norm": 5.758504390716553, + "learning_rate": 1.804676122731579e-05, + "loss": 1.6169, + "step": 22568 + }, + { + "epoch": 0.2821320533013325, + "grad_norm": 0.0064103570766747, + "learning_rate": 1.8046243069059333e-05, + "loss": 0.0115, + "step": 22570 + }, + { + "epoch": 0.28215705392634816, + "grad_norm": 1.3160643577575684, + "learning_rate": 1.8045724849523825e-05, + "loss": 0.519, + "step": 22572 + }, + { + "epoch": 0.2821820545513638, + "grad_norm": 4.300440311431885, + "learning_rate": 1.8045206568713216e-05, + "loss": 0.8396, + "step": 22574 + }, + { + "epoch": 0.2822070551763794, + "grad_norm": 2.664644241333008, + "learning_rate": 1.8044688226631452e-05, + "loss": 0.0739, + "step": 22576 + }, + { + "epoch": 0.28223205580139504, + "grad_norm": 2.2475945949554443, + "learning_rate": 1.8044169823282476e-05, + "loss": 0.2324, + "step": 22578 + }, + { + "epoch": 0.28225705642641064, + "grad_norm": 0.17698733508586884, + "learning_rate": 1.804365135867024e-05, + "loss": 0.1397, + "step": 22580 + }, + { + "epoch": 0.2822820570514263, + "grad_norm": 0.5665932297706604, + "learning_rate": 1.80431328327987e-05, + "loss": 0.7897, + "step": 22582 + }, + { + "epoch": 0.28230705767644193, + "grad_norm": 2.5672338008880615, + "learning_rate": 1.8042614245671793e-05, + "loss": 0.4255, + "step": 22584 + }, + { + "epoch": 0.2823320583014575, + "grad_norm": 3.9688947200775146, + "learning_rate": 1.8042095597293473e-05, + "loss": 1.4297, + "step": 22586 + }, + { + "epoch": 0.28235705892647317, + "grad_norm": 4.973527908325195, + "learning_rate": 1.804157688766769e-05, + "loss": 1.5596, + "step": 22588 + }, + { + "epoch": 0.28238205955148876, + "grad_norm": 5.175196170806885, + "learning_rate": 1.8041058116798397e-05, + "loss": 1.0872, + "step": 22590 + }, + { + "epoch": 0.2824070601765044, + "grad_norm": 2.7489967346191406, + "learning_rate": 1.804053928468954e-05, + "loss": 1.6967, + "step": 22592 + }, + { + "epoch": 0.28243206080152006, + "grad_norm": 2.3158817291259766, + "learning_rate": 1.8040020391345073e-05, + "loss": 0.4362, + "step": 22594 + }, + { + "epoch": 0.28245706142653565, + "grad_norm": 5.762673854827881, + "learning_rate": 1.803950143676895e-05, + "loss": 1.4915, + "step": 22596 + }, + { + "epoch": 0.2824820620515513, + "grad_norm": 15.209907531738281, + "learning_rate": 1.8038982420965118e-05, + "loss": 1.3601, + "step": 22598 + }, + { + "epoch": 0.2825070626765669, + "grad_norm": 1.6913974285125732, + "learning_rate": 1.8038463343937534e-05, + "loss": 1.2552, + "step": 22600 + }, + { + "epoch": 0.28253206330158254, + "grad_norm": 3.6669118404388428, + "learning_rate": 1.803794420569015e-05, + "loss": 0.6415, + "step": 22602 + }, + { + "epoch": 0.2825570639265982, + "grad_norm": 2.443254232406616, + "learning_rate": 1.8037425006226918e-05, + "loss": 0.4502, + "step": 22604 + }, + { + "epoch": 0.2825820645516138, + "grad_norm": 4.702226161956787, + "learning_rate": 1.8036905745551797e-05, + "loss": 1.7764, + "step": 22606 + }, + { + "epoch": 0.2826070651766294, + "grad_norm": 2.9404096603393555, + "learning_rate": 1.8036386423668734e-05, + "loss": 0.761, + "step": 22608 + }, + { + "epoch": 0.282632065801645, + "grad_norm": 3.3180184364318848, + "learning_rate": 1.8035867040581695e-05, + "loss": 0.7918, + "step": 22610 + }, + { + "epoch": 0.28265706642666066, + "grad_norm": 3.0501458644866943, + "learning_rate": 1.8035347596294622e-05, + "loss": 0.6796, + "step": 22612 + }, + { + "epoch": 0.2826820670516763, + "grad_norm": 2.6524932384490967, + "learning_rate": 1.8034828090811482e-05, + "loss": 0.9645, + "step": 22614 + }, + { + "epoch": 0.2827070676766919, + "grad_norm": 4.743135452270508, + "learning_rate": 1.8034308524136225e-05, + "loss": 1.5052, + "step": 22616 + }, + { + "epoch": 0.28273206830170755, + "grad_norm": 2.811819076538086, + "learning_rate": 1.8033788896272813e-05, + "loss": 0.6752, + "step": 22618 + }, + { + "epoch": 0.28275706892672314, + "grad_norm": 0.07437650114297867, + "learning_rate": 1.80332692072252e-05, + "loss": 0.0461, + "step": 22620 + }, + { + "epoch": 0.2827820695517388, + "grad_norm": 3.022099256515503, + "learning_rate": 1.8032749456997343e-05, + "loss": 0.5527, + "step": 22622 + }, + { + "epoch": 0.28280707017675444, + "grad_norm": 0.004970664158463478, + "learning_rate": 1.8032229645593204e-05, + "loss": 0.5684, + "step": 22624 + }, + { + "epoch": 0.28283207080177003, + "grad_norm": 1.888890027999878, + "learning_rate": 1.8031709773016742e-05, + "loss": 0.3467, + "step": 22626 + }, + { + "epoch": 0.2828570714267857, + "grad_norm": 1.4367051124572754, + "learning_rate": 1.803118983927191e-05, + "loss": 1.596, + "step": 22628 + }, + { + "epoch": 0.28288207205180127, + "grad_norm": 10.147915840148926, + "learning_rate": 1.803066984436267e-05, + "loss": 0.7258, + "step": 22630 + }, + { + "epoch": 0.2829070726768169, + "grad_norm": 8.039432525634766, + "learning_rate": 1.803014978829299e-05, + "loss": 0.7433, + "step": 22632 + }, + { + "epoch": 0.28293207330183257, + "grad_norm": 2.7234551906585693, + "learning_rate": 1.8029629671066823e-05, + "loss": 1.3961, + "step": 22634 + }, + { + "epoch": 0.28295707392684816, + "grad_norm": 1.3461685180664062, + "learning_rate": 1.8029109492688133e-05, + "loss": 0.9609, + "step": 22636 + }, + { + "epoch": 0.2829820745518638, + "grad_norm": 9.882031440734863, + "learning_rate": 1.8028589253160877e-05, + "loss": 0.9751, + "step": 22638 + }, + { + "epoch": 0.2830070751768794, + "grad_norm": 5.042984962463379, + "learning_rate": 1.802806895248902e-05, + "loss": 0.5442, + "step": 22640 + }, + { + "epoch": 0.28303207580189504, + "grad_norm": 2.8206779956817627, + "learning_rate": 1.802754859067653e-05, + "loss": 0.4278, + "step": 22642 + }, + { + "epoch": 0.2830570764269107, + "grad_norm": 5.244500637054443, + "learning_rate": 1.8027028167727362e-05, + "loss": 2.4588, + "step": 22644 + }, + { + "epoch": 0.2830820770519263, + "grad_norm": 5.221538066864014, + "learning_rate": 1.8026507683645486e-05, + "loss": 1.7993, + "step": 22646 + }, + { + "epoch": 0.28310707767694193, + "grad_norm": 7.667856216430664, + "learning_rate": 1.8025987138434858e-05, + "loss": 0.6936, + "step": 22648 + }, + { + "epoch": 0.2831320783019575, + "grad_norm": 2.860443115234375, + "learning_rate": 1.802546653209945e-05, + "loss": 0.4503, + "step": 22650 + }, + { + "epoch": 0.28315707892697317, + "grad_norm": 5.596779823303223, + "learning_rate": 1.802494586464322e-05, + "loss": 2.1718, + "step": 22652 + }, + { + "epoch": 0.2831820795519888, + "grad_norm": 4.548329830169678, + "learning_rate": 1.802442513607014e-05, + "loss": 0.1958, + "step": 22654 + }, + { + "epoch": 0.2832070801770044, + "grad_norm": 0.006668606773018837, + "learning_rate": 1.8023904346384172e-05, + "loss": 0.2526, + "step": 22656 + }, + { + "epoch": 0.28323208080202006, + "grad_norm": 2.797579765319824, + "learning_rate": 1.8023383495589285e-05, + "loss": 0.6996, + "step": 22658 + }, + { + "epoch": 0.28325708142703565, + "grad_norm": 1.5118285417556763, + "learning_rate": 1.802286258368944e-05, + "loss": 0.5883, + "step": 22660 + }, + { + "epoch": 0.2832820820520513, + "grad_norm": 2.984626531600952, + "learning_rate": 1.8022341610688614e-05, + "loss": 1.7465, + "step": 22662 + }, + { + "epoch": 0.28330708267706695, + "grad_norm": 2.704120635986328, + "learning_rate": 1.8021820576590766e-05, + "loss": 1.5606, + "step": 22664 + }, + { + "epoch": 0.28333208330208254, + "grad_norm": 3.9244213104248047, + "learning_rate": 1.8021299481399866e-05, + "loss": 1.1256, + "step": 22666 + }, + { + "epoch": 0.2833570839270982, + "grad_norm": 2.318753480911255, + "learning_rate": 1.8020778325119883e-05, + "loss": 0.6254, + "step": 22668 + }, + { + "epoch": 0.2833820845521138, + "grad_norm": 5.403872489929199, + "learning_rate": 1.8020257107754785e-05, + "loss": 1.3639, + "step": 22670 + }, + { + "epoch": 0.2834070851771294, + "grad_norm": 3.478811740875244, + "learning_rate": 1.8019735829308548e-05, + "loss": 0.6442, + "step": 22672 + }, + { + "epoch": 0.2834320858021451, + "grad_norm": 2.9772863388061523, + "learning_rate": 1.8019214489785133e-05, + "loss": 0.4184, + "step": 22674 + }, + { + "epoch": 0.28345708642716066, + "grad_norm": 3.3989737033843994, + "learning_rate": 1.8018693089188515e-05, + "loss": 0.1687, + "step": 22676 + }, + { + "epoch": 0.2834820870521763, + "grad_norm": 4.06696081161499, + "learning_rate": 1.801817162752267e-05, + "loss": 1.6503, + "step": 22678 + }, + { + "epoch": 0.2835070876771919, + "grad_norm": 1.5133873224258423, + "learning_rate": 1.8017650104791554e-05, + "loss": 1.1303, + "step": 22680 + }, + { + "epoch": 0.28353208830220755, + "grad_norm": 5.584849834442139, + "learning_rate": 1.8017128520999155e-05, + "loss": 1.2976, + "step": 22682 + }, + { + "epoch": 0.2835570889272232, + "grad_norm": 3.481398344039917, + "learning_rate": 1.801660687614944e-05, + "loss": 0.5759, + "step": 22684 + }, + { + "epoch": 0.2835820895522388, + "grad_norm": 3.9445712566375732, + "learning_rate": 1.801608517024638e-05, + "loss": 0.7113, + "step": 22686 + }, + { + "epoch": 0.28360709017725444, + "grad_norm": 2.4630508422851562, + "learning_rate": 1.8015563403293944e-05, + "loss": 0.5878, + "step": 22688 + }, + { + "epoch": 0.28363209080227003, + "grad_norm": 3.639876365661621, + "learning_rate": 1.8015041575296113e-05, + "loss": 0.952, + "step": 22690 + }, + { + "epoch": 0.2836570914272857, + "grad_norm": 4.613024711608887, + "learning_rate": 1.8014519686256858e-05, + "loss": 0.9251, + "step": 22692 + }, + { + "epoch": 0.2836820920523013, + "grad_norm": 4.680307865142822, + "learning_rate": 1.801399773618016e-05, + "loss": 1.4433, + "step": 22694 + }, + { + "epoch": 0.2837070926773169, + "grad_norm": 0.2994896173477173, + "learning_rate": 1.8013475725069982e-05, + "loss": 0.01, + "step": 22696 + }, + { + "epoch": 0.28373209330233257, + "grad_norm": 3.9100775718688965, + "learning_rate": 1.8012953652930306e-05, + "loss": 0.8594, + "step": 22698 + }, + { + "epoch": 0.28375709392734816, + "grad_norm": 2.468308687210083, + "learning_rate": 1.8012431519765114e-05, + "loss": 0.6295, + "step": 22700 + }, + { + "epoch": 0.2837820945523638, + "grad_norm": 0.005482542794197798, + "learning_rate": 1.8011909325578375e-05, + "loss": 0.4244, + "step": 22702 + }, + { + "epoch": 0.28380709517737945, + "grad_norm": 4.222771644592285, + "learning_rate": 1.8011387070374065e-05, + "loss": 1.2728, + "step": 22704 + }, + { + "epoch": 0.28383209580239505, + "grad_norm": 2.6608939170837402, + "learning_rate": 1.8010864754156166e-05, + "loss": 0.4012, + "step": 22706 + }, + { + "epoch": 0.2838570964274107, + "grad_norm": 6.405643939971924, + "learning_rate": 1.8010342376928653e-05, + "loss": 0.2832, + "step": 22708 + }, + { + "epoch": 0.2838820970524263, + "grad_norm": 2.4258577823638916, + "learning_rate": 1.8009819938695502e-05, + "loss": 0.5725, + "step": 22710 + }, + { + "epoch": 0.28390709767744193, + "grad_norm": 5.703607082366943, + "learning_rate": 1.80092974394607e-05, + "loss": 1.301, + "step": 22712 + }, + { + "epoch": 0.2839320983024576, + "grad_norm": 3.128369092941284, + "learning_rate": 1.8008774879228217e-05, + "loss": 0.6764, + "step": 22714 + }, + { + "epoch": 0.2839570989274732, + "grad_norm": 2.373439311981201, + "learning_rate": 1.800825225800204e-05, + "loss": 1.2485, + "step": 22716 + }, + { + "epoch": 0.2839820995524888, + "grad_norm": 3.5420384407043457, + "learning_rate": 1.8007729575786146e-05, + "loss": 1.1438, + "step": 22718 + }, + { + "epoch": 0.2840071001775044, + "grad_norm": 5.347560882568359, + "learning_rate": 1.8007206832584512e-05, + "loss": 1.2897, + "step": 22720 + }, + { + "epoch": 0.28403210080252006, + "grad_norm": 4.64316463470459, + "learning_rate": 1.8006684028401128e-05, + "loss": 1.1672, + "step": 22722 + }, + { + "epoch": 0.2840571014275357, + "grad_norm": 2.1392171382904053, + "learning_rate": 1.800616116323997e-05, + "loss": 0.4347, + "step": 22724 + }, + { + "epoch": 0.2840821020525513, + "grad_norm": 2.3363049030303955, + "learning_rate": 1.800563823710502e-05, + "loss": 1.4235, + "step": 22726 + }, + { + "epoch": 0.28410710267756695, + "grad_norm": 1.632436752319336, + "learning_rate": 1.8005115250000262e-05, + "loss": 0.2059, + "step": 22728 + }, + { + "epoch": 0.28413210330258254, + "grad_norm": 0.32729578018188477, + "learning_rate": 1.800459220192968e-05, + "loss": 0.062, + "step": 22730 + }, + { + "epoch": 0.2841571039275982, + "grad_norm": 5.526390552520752, + "learning_rate": 1.8004069092897252e-05, + "loss": 1.984, + "step": 22732 + }, + { + "epoch": 0.28418210455261383, + "grad_norm": 1.2628660202026367, + "learning_rate": 1.8003545922906965e-05, + "loss": 1.2854, + "step": 22734 + }, + { + "epoch": 0.2842071051776294, + "grad_norm": 4.921426296234131, + "learning_rate": 1.8003022691962808e-05, + "loss": 0.8989, + "step": 22736 + }, + { + "epoch": 0.2842321058026451, + "grad_norm": 0.0045179217122495174, + "learning_rate": 1.800249940006876e-05, + "loss": 0.513, + "step": 22738 + }, + { + "epoch": 0.28425710642766067, + "grad_norm": 4.177064895629883, + "learning_rate": 1.8001976047228808e-05, + "loss": 1.5924, + "step": 22740 + }, + { + "epoch": 0.2842821070526763, + "grad_norm": 1.8008428812026978, + "learning_rate": 1.8001452633446938e-05, + "loss": 0.8511, + "step": 22742 + }, + { + "epoch": 0.28430710767769196, + "grad_norm": 2.233739137649536, + "learning_rate": 1.8000929158727135e-05, + "loss": 1.3089, + "step": 22744 + }, + { + "epoch": 0.28433210830270755, + "grad_norm": 2.6794700622558594, + "learning_rate": 1.8000405623073392e-05, + "loss": 1.0698, + "step": 22746 + }, + { + "epoch": 0.2843571089277232, + "grad_norm": 2.898247480392456, + "learning_rate": 1.7999882026489686e-05, + "loss": 1.1755, + "step": 22748 + }, + { + "epoch": 0.2843821095527388, + "grad_norm": 0.027961434796452522, + "learning_rate": 1.799935836898001e-05, + "loss": 1.118, + "step": 22750 + }, + { + "epoch": 0.28440711017775444, + "grad_norm": 3.6810238361358643, + "learning_rate": 1.7998834650548355e-05, + "loss": 0.727, + "step": 22752 + }, + { + "epoch": 0.2844321108027701, + "grad_norm": 3.1237430572509766, + "learning_rate": 1.7998310871198703e-05, + "loss": 0.134, + "step": 22754 + }, + { + "epoch": 0.2844571114277857, + "grad_norm": 7.591385841369629, + "learning_rate": 1.799778703093505e-05, + "loss": 1.3393, + "step": 22756 + }, + { + "epoch": 0.28448211205280133, + "grad_norm": 0.49765288829803467, + "learning_rate": 1.799726312976138e-05, + "loss": 0.8364, + "step": 22758 + }, + { + "epoch": 0.2845071126778169, + "grad_norm": 3.8534913063049316, + "learning_rate": 1.7996739167681685e-05, + "loss": 1.2623, + "step": 22760 + }, + { + "epoch": 0.28453211330283257, + "grad_norm": 2.2889997959136963, + "learning_rate": 1.7996215144699956e-05, + "loss": 0.1073, + "step": 22762 + }, + { + "epoch": 0.2845571139278482, + "grad_norm": 4.025629997253418, + "learning_rate": 1.799569106082018e-05, + "loss": 1.0048, + "step": 22764 + }, + { + "epoch": 0.2845821145528638, + "grad_norm": 2.0402514934539795, + "learning_rate": 1.7995166916046356e-05, + "loss": 0.7673, + "step": 22766 + }, + { + "epoch": 0.28460711517787946, + "grad_norm": 6.340447425842285, + "learning_rate": 1.799464271038247e-05, + "loss": 1.8154, + "step": 22768 + }, + { + "epoch": 0.28463211580289505, + "grad_norm": 6.402575969696045, + "learning_rate": 1.7994118443832515e-05, + "loss": 1.8572, + "step": 22770 + }, + { + "epoch": 0.2846571164279107, + "grad_norm": 2.341599941253662, + "learning_rate": 1.7993594116400484e-05, + "loss": 1.0431, + "step": 22772 + }, + { + "epoch": 0.28468211705292634, + "grad_norm": 3.0541939735412598, + "learning_rate": 1.7993069728090373e-05, + "loss": 0.5984, + "step": 22774 + }, + { + "epoch": 0.28470711767794193, + "grad_norm": 2.3529856204986572, + "learning_rate": 1.799254527890617e-05, + "loss": 0.1506, + "step": 22776 + }, + { + "epoch": 0.2847321183029576, + "grad_norm": 4.819988250732422, + "learning_rate": 1.7992020768851875e-05, + "loss": 1.2747, + "step": 22778 + }, + { + "epoch": 0.2847571189279732, + "grad_norm": 2.075854539871216, + "learning_rate": 1.799149619793148e-05, + "loss": 0.8546, + "step": 22780 + }, + { + "epoch": 0.2847821195529888, + "grad_norm": 3.0806639194488525, + "learning_rate": 1.7990971566148977e-05, + "loss": 1.1172, + "step": 22782 + }, + { + "epoch": 0.28480712017800447, + "grad_norm": 1.9650299549102783, + "learning_rate": 1.7990446873508367e-05, + "loss": 0.7902, + "step": 22784 + }, + { + "epoch": 0.28483212080302006, + "grad_norm": 0.006601263768970966, + "learning_rate": 1.7989922120013644e-05, + "loss": 0.7882, + "step": 22786 + }, + { + "epoch": 0.2848571214280357, + "grad_norm": 0.9265008568763733, + "learning_rate": 1.7989397305668803e-05, + "loss": 0.7838, + "step": 22788 + }, + { + "epoch": 0.2848821220530513, + "grad_norm": 4.992796897888184, + "learning_rate": 1.798887243047784e-05, + "loss": 1.5805, + "step": 22790 + }, + { + "epoch": 0.28490712267806695, + "grad_norm": 2.229637861251831, + "learning_rate": 1.7988347494444757e-05, + "loss": 0.6998, + "step": 22792 + }, + { + "epoch": 0.2849321233030826, + "grad_norm": 2.8092470169067383, + "learning_rate": 1.7987822497573548e-05, + "loss": 0.8573, + "step": 22794 + }, + { + "epoch": 0.2849571239280982, + "grad_norm": 2.050896406173706, + "learning_rate": 1.7987297439868212e-05, + "loss": 0.7127, + "step": 22796 + }, + { + "epoch": 0.28498212455311384, + "grad_norm": 3.090104818344116, + "learning_rate": 1.7986772321332752e-05, + "loss": 0.9538, + "step": 22798 + }, + { + "epoch": 0.28500712517812943, + "grad_norm": 4.508167743682861, + "learning_rate": 1.7986247141971158e-05, + "loss": 1.6242, + "step": 22800 + }, + { + "epoch": 0.2850321258031451, + "grad_norm": 1.4207689762115479, + "learning_rate": 1.7985721901787438e-05, + "loss": 0.0588, + "step": 22802 + }, + { + "epoch": 0.2850571264281607, + "grad_norm": 0.23497526347637177, + "learning_rate": 1.798519660078559e-05, + "loss": 1.1108, + "step": 22804 + }, + { + "epoch": 0.2850821270531763, + "grad_norm": 6.5191545486450195, + "learning_rate": 1.7984671238969612e-05, + "loss": 1.4605, + "step": 22806 + }, + { + "epoch": 0.28510712767819196, + "grad_norm": 2.972616672515869, + "learning_rate": 1.7984145816343506e-05, + "loss": 0.2546, + "step": 22808 + }, + { + "epoch": 0.28513212830320755, + "grad_norm": 7.558426856994629, + "learning_rate": 1.7983620332911277e-05, + "loss": 1.0657, + "step": 22810 + }, + { + "epoch": 0.2851571289282232, + "grad_norm": 0.005120086017996073, + "learning_rate": 1.7983094788676925e-05, + "loss": 1.0249, + "step": 22812 + }, + { + "epoch": 0.28518212955323885, + "grad_norm": 2.0428969860076904, + "learning_rate": 1.7982569183644452e-05, + "loss": 0.8914, + "step": 22814 + }, + { + "epoch": 0.28520713017825444, + "grad_norm": 3.15604567527771, + "learning_rate": 1.7982043517817857e-05, + "loss": 1.2401, + "step": 22816 + }, + { + "epoch": 0.2852321308032701, + "grad_norm": 1.939404010772705, + "learning_rate": 1.798151779120115e-05, + "loss": 0.5328, + "step": 22818 + }, + { + "epoch": 0.2852571314282857, + "grad_norm": 4.20070219039917, + "learning_rate": 1.7980992003798334e-05, + "loss": 0.7823, + "step": 22820 + }, + { + "epoch": 0.28528213205330133, + "grad_norm": 3.567093849182129, + "learning_rate": 1.798046615561341e-05, + "loss": 0.815, + "step": 22822 + }, + { + "epoch": 0.285307132678317, + "grad_norm": 0.7393497824668884, + "learning_rate": 1.7979940246650383e-05, + "loss": 0.4356, + "step": 22824 + }, + { + "epoch": 0.28533213330333257, + "grad_norm": 0.6102579236030579, + "learning_rate": 1.7979414276913262e-05, + "loss": 0.5492, + "step": 22826 + }, + { + "epoch": 0.2853571339283482, + "grad_norm": 5.9574174880981445, + "learning_rate": 1.7978888246406047e-05, + "loss": 1.9592, + "step": 22828 + }, + { + "epoch": 0.2853821345533638, + "grad_norm": 3.2465622425079346, + "learning_rate": 1.7978362155132753e-05, + "loss": 1.4641, + "step": 22830 + }, + { + "epoch": 0.28540713517837946, + "grad_norm": 2.6009843349456787, + "learning_rate": 1.7977836003097377e-05, + "loss": 0.8102, + "step": 22832 + }, + { + "epoch": 0.2854321358033951, + "grad_norm": 4.746140956878662, + "learning_rate": 1.7977309790303928e-05, + "loss": 1.696, + "step": 22834 + }, + { + "epoch": 0.2854571364284107, + "grad_norm": 4.730316162109375, + "learning_rate": 1.797678351675642e-05, + "loss": 1.5015, + "step": 22836 + }, + { + "epoch": 0.28548213705342634, + "grad_norm": 4.445559501647949, + "learning_rate": 1.7976257182458857e-05, + "loss": 2.3477, + "step": 22838 + }, + { + "epoch": 0.28550713767844194, + "grad_norm": 1.9084755182266235, + "learning_rate": 1.7975730787415245e-05, + "loss": 0.6187, + "step": 22840 + }, + { + "epoch": 0.2855321383034576, + "grad_norm": 5.885852336883545, + "learning_rate": 1.7975204331629595e-05, + "loss": 1.4084, + "step": 22842 + }, + { + "epoch": 0.28555713892847323, + "grad_norm": 4.193026065826416, + "learning_rate": 1.7974677815105917e-05, + "loss": 1.4354, + "step": 22844 + }, + { + "epoch": 0.2855821395534888, + "grad_norm": 2.614882469177246, + "learning_rate": 1.797415123784822e-05, + "loss": 1.0306, + "step": 22846 + }, + { + "epoch": 0.28560714017850447, + "grad_norm": 4.771459579467773, + "learning_rate": 1.7973624599860516e-05, + "loss": 1.2163, + "step": 22848 + }, + { + "epoch": 0.28563214080352006, + "grad_norm": 2.864044427871704, + "learning_rate": 1.7973097901146814e-05, + "loss": 0.0669, + "step": 22850 + }, + { + "epoch": 0.2856571414285357, + "grad_norm": 4.159411907196045, + "learning_rate": 1.7972571141711125e-05, + "loss": 1.1318, + "step": 22852 + }, + { + "epoch": 0.28568214205355136, + "grad_norm": 2.771822690963745, + "learning_rate": 1.797204432155746e-05, + "loss": 0.5075, + "step": 22854 + }, + { + "epoch": 0.28570714267856695, + "grad_norm": 3.75762939453125, + "learning_rate": 1.797151744068984e-05, + "loss": 1.0102, + "step": 22856 + }, + { + "epoch": 0.2857321433035826, + "grad_norm": 2.718177556991577, + "learning_rate": 1.7970990499112264e-05, + "loss": 0.9376, + "step": 22858 + }, + { + "epoch": 0.2857571439285982, + "grad_norm": 5.915886402130127, + "learning_rate": 1.7970463496828753e-05, + "loss": 1.4532, + "step": 22860 + }, + { + "epoch": 0.28578214455361384, + "grad_norm": 2.1092209815979004, + "learning_rate": 1.7969936433843317e-05, + "loss": 0.9067, + "step": 22862 + }, + { + "epoch": 0.2858071451786295, + "grad_norm": 4.378939628601074, + "learning_rate": 1.7969409310159978e-05, + "loss": 1.1332, + "step": 22864 + }, + { + "epoch": 0.2858321458036451, + "grad_norm": 5.499042987823486, + "learning_rate": 1.796888212578274e-05, + "loss": 1.6849, + "step": 22866 + }, + { + "epoch": 0.2858571464286607, + "grad_norm": 3.066007137298584, + "learning_rate": 1.7968354880715626e-05, + "loss": 0.8457, + "step": 22868 + }, + { + "epoch": 0.2858821470536763, + "grad_norm": 7.286687850952148, + "learning_rate": 1.7967827574962646e-05, + "loss": 1.1656, + "step": 22870 + }, + { + "epoch": 0.28590714767869196, + "grad_norm": 2.332669973373413, + "learning_rate": 1.7967300208527816e-05, + "loss": 0.5767, + "step": 22872 + }, + { + "epoch": 0.2859321483037076, + "grad_norm": 2.967069625854492, + "learning_rate": 1.7966772781415157e-05, + "loss": 0.711, + "step": 22874 + }, + { + "epoch": 0.2859571489287232, + "grad_norm": 6.1956892013549805, + "learning_rate": 1.7966245293628685e-05, + "loss": 2.0461, + "step": 22876 + }, + { + "epoch": 0.28598214955373885, + "grad_norm": 3.3872244358062744, + "learning_rate": 1.796571774517241e-05, + "loss": 2.1104, + "step": 22878 + }, + { + "epoch": 0.28600715017875444, + "grad_norm": 3.0267832279205322, + "learning_rate": 1.796519013605036e-05, + "loss": 1.0132, + "step": 22880 + }, + { + "epoch": 0.2860321508037701, + "grad_norm": 1.2530728578567505, + "learning_rate": 1.7964662466266545e-05, + "loss": 1.169, + "step": 22882 + }, + { + "epoch": 0.28605715142878574, + "grad_norm": 0.634460985660553, + "learning_rate": 1.7964134735824988e-05, + "loss": 0.3931, + "step": 22884 + }, + { + "epoch": 0.28608215205380133, + "grad_norm": 3.8440022468566895, + "learning_rate": 1.796360694472971e-05, + "loss": 1.0056, + "step": 22886 + }, + { + "epoch": 0.286107152678817, + "grad_norm": 4.259836673736572, + "learning_rate": 1.7963079092984722e-05, + "loss": 0.8763, + "step": 22888 + }, + { + "epoch": 0.28613215330383257, + "grad_norm": 0.0032283395994454622, + "learning_rate": 1.796255118059405e-05, + "loss": 0.0001, + "step": 22890 + }, + { + "epoch": 0.2861571539288482, + "grad_norm": 0.9457482695579529, + "learning_rate": 1.796202320756172e-05, + "loss": 1.5904, + "step": 22892 + }, + { + "epoch": 0.28618215455386387, + "grad_norm": 1.6495836973190308, + "learning_rate": 1.7961495173891744e-05, + "loss": 1.2587, + "step": 22894 + }, + { + "epoch": 0.28620715517887946, + "grad_norm": 4.281296730041504, + "learning_rate": 1.7960967079588147e-05, + "loss": 1.263, + "step": 22896 + }, + { + "epoch": 0.2862321558038951, + "grad_norm": 2.2125329971313477, + "learning_rate": 1.7960438924654947e-05, + "loss": 0.6301, + "step": 22898 + }, + { + "epoch": 0.2862571564289107, + "grad_norm": 1.883294701576233, + "learning_rate": 1.7959910709096174e-05, + "loss": 0.5092, + "step": 22900 + }, + { + "epoch": 0.28628215705392634, + "grad_norm": 3.3794610500335693, + "learning_rate": 1.7959382432915846e-05, + "loss": 2.1151, + "step": 22902 + }, + { + "epoch": 0.286307157678942, + "grad_norm": 6.677228927612305, + "learning_rate": 1.7958854096117985e-05, + "loss": 0.6189, + "step": 22904 + }, + { + "epoch": 0.2863321583039576, + "grad_norm": 3.5490715503692627, + "learning_rate": 1.795832569870662e-05, + "loss": 1.3906, + "step": 22906 + }, + { + "epoch": 0.28635715892897323, + "grad_norm": 4.706233501434326, + "learning_rate": 1.795779724068577e-05, + "loss": 1.0001, + "step": 22908 + }, + { + "epoch": 0.2863821595539888, + "grad_norm": 2.6239335536956787, + "learning_rate": 1.795726872205946e-05, + "loss": 1.0022, + "step": 22910 + }, + { + "epoch": 0.28640716017900447, + "grad_norm": 0.0047211479395627975, + "learning_rate": 1.795674014283172e-05, + "loss": 0.5534, + "step": 22912 + }, + { + "epoch": 0.2864321608040201, + "grad_norm": 10.635916709899902, + "learning_rate": 1.795621150300657e-05, + "loss": 0.2633, + "step": 22914 + }, + { + "epoch": 0.2864571614290357, + "grad_norm": 3.090935707092285, + "learning_rate": 1.795568280258804e-05, + "loss": 1.4208, + "step": 22916 + }, + { + "epoch": 0.28648216205405136, + "grad_norm": 0.005233845207840204, + "learning_rate": 1.795515404158015e-05, + "loss": 0.0002, + "step": 22918 + }, + { + "epoch": 0.28650716267906695, + "grad_norm": 2.610353946685791, + "learning_rate": 1.795462521998694e-05, + "loss": 0.2326, + "step": 22920 + }, + { + "epoch": 0.2865321633040826, + "grad_norm": 2.4543631076812744, + "learning_rate": 1.7954096337812424e-05, + "loss": 1.1208, + "step": 22922 + }, + { + "epoch": 0.28655716392909825, + "grad_norm": 5.035472869873047, + "learning_rate": 1.7953567395060636e-05, + "loss": 4.2093, + "step": 22924 + }, + { + "epoch": 0.28658216455411384, + "grad_norm": 3.2449445724487305, + "learning_rate": 1.79530383917356e-05, + "loss": 1.6193, + "step": 22926 + }, + { + "epoch": 0.2866071651791295, + "grad_norm": 0.700965166091919, + "learning_rate": 1.7952509327841354e-05, + "loss": 0.2419, + "step": 22928 + }, + { + "epoch": 0.2866321658041451, + "grad_norm": 2.786512851715088, + "learning_rate": 1.7951980203381917e-05, + "loss": 0.956, + "step": 22930 + }, + { + "epoch": 0.2866571664291607, + "grad_norm": 3.016345500946045, + "learning_rate": 1.7951451018361328e-05, + "loss": 1.6287, + "step": 22932 + }, + { + "epoch": 0.2866821670541764, + "grad_norm": 33.65840148925781, + "learning_rate": 1.795092177278361e-05, + "loss": 0.9725, + "step": 22934 + }, + { + "epoch": 0.28670716767919197, + "grad_norm": 6.062676906585693, + "learning_rate": 1.7950392466652794e-05, + "loss": 2.1459, + "step": 22936 + }, + { + "epoch": 0.2867321683042076, + "grad_norm": 0.4151858389377594, + "learning_rate": 1.7949863099972918e-05, + "loss": 0.007, + "step": 22938 + }, + { + "epoch": 0.2867571689292232, + "grad_norm": 0.0037177882622927427, + "learning_rate": 1.7949333672748005e-05, + "loss": 0.0002, + "step": 22940 + }, + { + "epoch": 0.28678216955423885, + "grad_norm": 4.380813121795654, + "learning_rate": 1.7948804184982093e-05, + "loss": 0.8998, + "step": 22942 + }, + { + "epoch": 0.2868071701792545, + "grad_norm": 3.787536859512329, + "learning_rate": 1.7948274636679207e-05, + "loss": 1.5039, + "step": 22944 + }, + { + "epoch": 0.2868321708042701, + "grad_norm": 2.744328737258911, + "learning_rate": 1.7947745027843393e-05, + "loss": 1.3073, + "step": 22946 + }, + { + "epoch": 0.28685717142928574, + "grad_norm": 4.586367607116699, + "learning_rate": 1.7947215358478673e-05, + "loss": 0.9656, + "step": 22948 + }, + { + "epoch": 0.28688217205430133, + "grad_norm": 2.9802632331848145, + "learning_rate": 1.7946685628589087e-05, + "loss": 1.478, + "step": 22950 + }, + { + "epoch": 0.286907172679317, + "grad_norm": 4.944736480712891, + "learning_rate": 1.7946155838178666e-05, + "loss": 0.952, + "step": 22952 + }, + { + "epoch": 0.2869321733043326, + "grad_norm": 3.9571263790130615, + "learning_rate": 1.7945625987251444e-05, + "loss": 1.4763, + "step": 22954 + }, + { + "epoch": 0.2869571739293482, + "grad_norm": 3.657109498977661, + "learning_rate": 1.7945096075811456e-05, + "loss": 0.635, + "step": 22956 + }, + { + "epoch": 0.28698217455436387, + "grad_norm": 3.9859795570373535, + "learning_rate": 1.7944566103862744e-05, + "loss": 0.8373, + "step": 22958 + }, + { + "epoch": 0.28700717517937946, + "grad_norm": 3.882089376449585, + "learning_rate": 1.794403607140934e-05, + "loss": 1.1819, + "step": 22960 + }, + { + "epoch": 0.2870321758043951, + "grad_norm": 2.731233835220337, + "learning_rate": 1.794350597845528e-05, + "loss": 0.5286, + "step": 22962 + }, + { + "epoch": 0.28705717642941075, + "grad_norm": 15.714526176452637, + "learning_rate": 1.7942975825004604e-05, + "loss": 1.1218, + "step": 22964 + }, + { + "epoch": 0.28708217705442635, + "grad_norm": 4.693194389343262, + "learning_rate": 1.794244561106134e-05, + "loss": 1.9456, + "step": 22966 + }, + { + "epoch": 0.287107177679442, + "grad_norm": 2.888056755065918, + "learning_rate": 1.794191533662954e-05, + "loss": 0.5881, + "step": 22968 + }, + { + "epoch": 0.2871321783044576, + "grad_norm": 0.5792509913444519, + "learning_rate": 1.7941385001713236e-05, + "loss": 0.0608, + "step": 22970 + }, + { + "epoch": 0.28715717892947323, + "grad_norm": 6.473003387451172, + "learning_rate": 1.7940854606316464e-05, + "loss": 1.325, + "step": 22972 + }, + { + "epoch": 0.2871821795544889, + "grad_norm": 3.6474177837371826, + "learning_rate": 1.7940324150443267e-05, + "loss": 1.4779, + "step": 22974 + }, + { + "epoch": 0.2872071801795045, + "grad_norm": 3.0771894454956055, + "learning_rate": 1.7939793634097685e-05, + "loss": 2.099, + "step": 22976 + }, + { + "epoch": 0.2872321808045201, + "grad_norm": 3.5053844451904297, + "learning_rate": 1.7939263057283756e-05, + "loss": 1.6354, + "step": 22978 + }, + { + "epoch": 0.2872571814295357, + "grad_norm": 3.8881287574768066, + "learning_rate": 1.793873242000552e-05, + "loss": 1.3567, + "step": 22980 + }, + { + "epoch": 0.28728218205455136, + "grad_norm": 4.405828475952148, + "learning_rate": 1.7938201722267027e-05, + "loss": 2.1492, + "step": 22982 + }, + { + "epoch": 0.287307182679567, + "grad_norm": 3.759711265563965, + "learning_rate": 1.7937670964072308e-05, + "loss": 1.6175, + "step": 22984 + }, + { + "epoch": 0.2873321833045826, + "grad_norm": 0.0028187851421535015, + "learning_rate": 1.793714014542541e-05, + "loss": 0.5804, + "step": 22986 + }, + { + "epoch": 0.28735718392959825, + "grad_norm": 3.43794846534729, + "learning_rate": 1.7936609266330376e-05, + "loss": 0.4582, + "step": 22988 + }, + { + "epoch": 0.28738218455461384, + "grad_norm": 1.8098549842834473, + "learning_rate": 1.7936078326791246e-05, + "loss": 1.311, + "step": 22990 + }, + { + "epoch": 0.2874071851796295, + "grad_norm": 1.5534480810165405, + "learning_rate": 1.7935547326812067e-05, + "loss": 0.3154, + "step": 22992 + }, + { + "epoch": 0.28743218580464514, + "grad_norm": 0.005082160234451294, + "learning_rate": 1.7935016266396884e-05, + "loss": 0.456, + "step": 22994 + }, + { + "epoch": 0.2874571864296607, + "grad_norm": 0.04261770471930504, + "learning_rate": 1.7934485145549735e-05, + "loss": 0.6866, + "step": 22996 + }, + { + "epoch": 0.2874821870546764, + "grad_norm": 1.9784789085388184, + "learning_rate": 1.7933953964274673e-05, + "loss": 1.0324, + "step": 22998 + }, + { + "epoch": 0.28750718767969197, + "grad_norm": 0.004550321493297815, + "learning_rate": 1.7933422722575734e-05, + "loss": 1.0123, + "step": 23000 + }, + { + "epoch": 0.2875321883047076, + "grad_norm": 3.294971466064453, + "learning_rate": 1.7932891420456978e-05, + "loss": 1.0968, + "step": 23002 + }, + { + "epoch": 0.28755718892972326, + "grad_norm": 4.261204242706299, + "learning_rate": 1.7932360057922436e-05, + "loss": 1.479, + "step": 23004 + }, + { + "epoch": 0.28758218955473885, + "grad_norm": 7.577049732208252, + "learning_rate": 1.7931828634976162e-05, + "loss": 1.3486, + "step": 23006 + }, + { + "epoch": 0.2876071901797545, + "grad_norm": 2.181222915649414, + "learning_rate": 1.7931297151622205e-05, + "loss": 0.8134, + "step": 23008 + }, + { + "epoch": 0.2876321908047701, + "grad_norm": 2.498739719390869, + "learning_rate": 1.793076560786461e-05, + "loss": 0.1384, + "step": 23010 + }, + { + "epoch": 0.28765719142978574, + "grad_norm": 0.6837036609649658, + "learning_rate": 1.7930234003707425e-05, + "loss": 0.6079, + "step": 23012 + }, + { + "epoch": 0.2876821920548014, + "grad_norm": 4.479123115539551, + "learning_rate": 1.7929702339154704e-05, + "loss": 1.1249, + "step": 23014 + }, + { + "epoch": 0.287707192679817, + "grad_norm": 4.942234516143799, + "learning_rate": 1.7929170614210484e-05, + "loss": 2.0071, + "step": 23016 + }, + { + "epoch": 0.28773219330483263, + "grad_norm": 6.185738563537598, + "learning_rate": 1.7928638828878826e-05, + "loss": 2.3105, + "step": 23018 + }, + { + "epoch": 0.2877571939298482, + "grad_norm": 2.50002121925354, + "learning_rate": 1.7928106983163775e-05, + "loss": 0.9368, + "step": 23020 + }, + { + "epoch": 0.28778219455486387, + "grad_norm": 0.004773498512804508, + "learning_rate": 1.7927575077069382e-05, + "loss": 0.0002, + "step": 23022 + }, + { + "epoch": 0.2878071951798795, + "grad_norm": 3.260324716567993, + "learning_rate": 1.7927043110599698e-05, + "loss": 0.5445, + "step": 23024 + }, + { + "epoch": 0.2878321958048951, + "grad_norm": 7.107529163360596, + "learning_rate": 1.7926511083758774e-05, + "loss": 1.5416, + "step": 23026 + }, + { + "epoch": 0.28785719642991076, + "grad_norm": 3.338379144668579, + "learning_rate": 1.7925978996550665e-05, + "loss": 0.6408, + "step": 23028 + }, + { + "epoch": 0.28788219705492635, + "grad_norm": 6.400793075561523, + "learning_rate": 1.792544684897942e-05, + "loss": 1.1079, + "step": 23030 + }, + { + "epoch": 0.287907197679942, + "grad_norm": 6.448143482208252, + "learning_rate": 1.792491464104909e-05, + "loss": 1.8709, + "step": 23032 + }, + { + "epoch": 0.28793219830495764, + "grad_norm": 0.0032535234931856394, + "learning_rate": 1.7924382372763735e-05, + "loss": 0.2324, + "step": 23034 + }, + { + "epoch": 0.28795719892997323, + "grad_norm": 0.004436590243130922, + "learning_rate": 1.79238500441274e-05, + "loss": 0.1461, + "step": 23036 + }, + { + "epoch": 0.2879821995549889, + "grad_norm": 6.159657001495361, + "learning_rate": 1.7923317655144147e-05, + "loss": 0.6296, + "step": 23038 + }, + { + "epoch": 0.2880072001800045, + "grad_norm": 0.9793679714202881, + "learning_rate": 1.7922785205818028e-05, + "loss": 0.7739, + "step": 23040 + }, + { + "epoch": 0.2880322008050201, + "grad_norm": 3.2469851970672607, + "learning_rate": 1.7922252696153094e-05, + "loss": 0.4441, + "step": 23042 + }, + { + "epoch": 0.28805720143003577, + "grad_norm": 4.198575973510742, + "learning_rate": 1.7921720126153404e-05, + "loss": 1.5541, + "step": 23044 + }, + { + "epoch": 0.28808220205505136, + "grad_norm": 2.8855597972869873, + "learning_rate": 1.7921187495823016e-05, + "loss": 1.0254, + "step": 23046 + }, + { + "epoch": 0.288107202680067, + "grad_norm": 2.7891082763671875, + "learning_rate": 1.7920654805165983e-05, + "loss": 0.4245, + "step": 23048 + }, + { + "epoch": 0.2881322033050826, + "grad_norm": 2.9222545623779297, + "learning_rate": 1.7920122054186363e-05, + "loss": 0.8318, + "step": 23050 + }, + { + "epoch": 0.28815720393009825, + "grad_norm": 5.317674160003662, + "learning_rate": 1.7919589242888212e-05, + "loss": 2.1482, + "step": 23052 + }, + { + "epoch": 0.2881822045551139, + "grad_norm": 2.9415507316589355, + "learning_rate": 1.791905637127559e-05, + "loss": 0.1787, + "step": 23054 + }, + { + "epoch": 0.2882072051801295, + "grad_norm": 3.956395149230957, + "learning_rate": 1.7918523439352555e-05, + "loss": 1.3057, + "step": 23056 + }, + { + "epoch": 0.28823220580514514, + "grad_norm": 0.2180374562740326, + "learning_rate": 1.791799044712317e-05, + "loss": 0.2862, + "step": 23058 + }, + { + "epoch": 0.28825720643016073, + "grad_norm": 2.6181931495666504, + "learning_rate": 1.7917457394591486e-05, + "loss": 0.6245, + "step": 23060 + }, + { + "epoch": 0.2882822070551764, + "grad_norm": 3.5535457134246826, + "learning_rate": 1.7916924281761563e-05, + "loss": 0.7157, + "step": 23062 + }, + { + "epoch": 0.288307207680192, + "grad_norm": 4.609415531158447, + "learning_rate": 1.7916391108637466e-05, + "loss": 1.0392, + "step": 23064 + }, + { + "epoch": 0.2883322083052076, + "grad_norm": 2.2448816299438477, + "learning_rate": 1.7915857875223252e-05, + "loss": 0.4594, + "step": 23066 + }, + { + "epoch": 0.28835720893022326, + "grad_norm": 3.724072217941284, + "learning_rate": 1.7915324581522986e-05, + "loss": 0.7263, + "step": 23068 + }, + { + "epoch": 0.28838220955523886, + "grad_norm": 1.2629495859146118, + "learning_rate": 1.791479122754073e-05, + "loss": 1.0704, + "step": 23070 + }, + { + "epoch": 0.2884072101802545, + "grad_norm": 2.172950267791748, + "learning_rate": 1.791425781328054e-05, + "loss": 0.4508, + "step": 23072 + }, + { + "epoch": 0.28843221080527015, + "grad_norm": 3.789494037628174, + "learning_rate": 1.7913724338746483e-05, + "loss": 0.9323, + "step": 23074 + }, + { + "epoch": 0.28845721143028574, + "grad_norm": 3.762500286102295, + "learning_rate": 1.791319080394262e-05, + "loss": 0.9521, + "step": 23076 + }, + { + "epoch": 0.2884822120553014, + "grad_norm": 1.2651922702789307, + "learning_rate": 1.7912657208873013e-05, + "loss": 0.5439, + "step": 23078 + }, + { + "epoch": 0.288507212680317, + "grad_norm": 1.4100053310394287, + "learning_rate": 1.791212355354173e-05, + "loss": 0.402, + "step": 23080 + }, + { + "epoch": 0.28853221330533263, + "grad_norm": 4.081090450286865, + "learning_rate": 1.791158983795283e-05, + "loss": 0.9028, + "step": 23082 + }, + { + "epoch": 0.2885572139303483, + "grad_norm": 0.004493335727602243, + "learning_rate": 1.7911056062110383e-05, + "loss": 0.0534, + "step": 23084 + }, + { + "epoch": 0.28858221455536387, + "grad_norm": 0.0028514105360955, + "learning_rate": 1.791052222601845e-05, + "loss": 0.6098, + "step": 23086 + }, + { + "epoch": 0.2886072151803795, + "grad_norm": 2.529355764389038, + "learning_rate": 1.7909988329681103e-05, + "loss": 1.0211, + "step": 23088 + }, + { + "epoch": 0.2886322158053951, + "grad_norm": 0.004683008883148432, + "learning_rate": 1.79094543731024e-05, + "loss": 0.7206, + "step": 23090 + }, + { + "epoch": 0.28865721643041076, + "grad_norm": 3.306363344192505, + "learning_rate": 1.7908920356286414e-05, + "loss": 0.9055, + "step": 23092 + }, + { + "epoch": 0.2886822170554264, + "grad_norm": 2.686502695083618, + "learning_rate": 1.7908386279237202e-05, + "loss": 0.9676, + "step": 23094 + }, + { + "epoch": 0.288707217680442, + "grad_norm": 4.287817001342773, + "learning_rate": 1.7907852141958843e-05, + "loss": 1.3323, + "step": 23096 + }, + { + "epoch": 0.28873221830545764, + "grad_norm": 0.0019177315989509225, + "learning_rate": 1.79073179444554e-05, + "loss": 0.0002, + "step": 23098 + }, + { + "epoch": 0.28875721893047324, + "grad_norm": 0.0023847525008022785, + "learning_rate": 1.7906783686730942e-05, + "loss": 0.0001, + "step": 23100 + }, + { + "epoch": 0.2887822195554889, + "grad_norm": 4.357913017272949, + "learning_rate": 1.7906249368789538e-05, + "loss": 0.4007, + "step": 23102 + }, + { + "epoch": 0.28880722018050453, + "grad_norm": 2.48372745513916, + "learning_rate": 1.7905714990635256e-05, + "loss": 0.7934, + "step": 23104 + }, + { + "epoch": 0.2888322208055201, + "grad_norm": 0.08697925508022308, + "learning_rate": 1.7905180552272168e-05, + "loss": 0.5775, + "step": 23106 + }, + { + "epoch": 0.28885722143053577, + "grad_norm": 3.520078659057617, + "learning_rate": 1.7904646053704342e-05, + "loss": 0.2369, + "step": 23108 + }, + { + "epoch": 0.28888222205555136, + "grad_norm": 2.701260805130005, + "learning_rate": 1.790411149493585e-05, + "loss": 0.7803, + "step": 23110 + }, + { + "epoch": 0.288907222680567, + "grad_norm": 1.3640244007110596, + "learning_rate": 1.790357687597076e-05, + "loss": 0.824, + "step": 23112 + }, + { + "epoch": 0.28893222330558266, + "grad_norm": 0.607017993927002, + "learning_rate": 1.790304219681315e-05, + "loss": 1.4886, + "step": 23114 + }, + { + "epoch": 0.28895722393059825, + "grad_norm": 0.12134566158056259, + "learning_rate": 1.7902507457467085e-05, + "loss": 1.4426, + "step": 23116 + }, + { + "epoch": 0.2889822245556139, + "grad_norm": 1.8862074613571167, + "learning_rate": 1.7901972657936642e-05, + "loss": 0.6024, + "step": 23118 + }, + { + "epoch": 0.2890072251806295, + "grad_norm": 2.797212839126587, + "learning_rate": 1.7901437798225895e-05, + "loss": 0.3473, + "step": 23120 + }, + { + "epoch": 0.28903222580564514, + "grad_norm": 3.908033847808838, + "learning_rate": 1.790090287833891e-05, + "loss": 0.8716, + "step": 23122 + }, + { + "epoch": 0.2890572264306608, + "grad_norm": 0.0016007274389266968, + "learning_rate": 1.790036789827977e-05, + "loss": 0.6201, + "step": 23124 + }, + { + "epoch": 0.2890822270556764, + "grad_norm": 0.004283050075173378, + "learning_rate": 1.7899832858052543e-05, + "loss": 0.1254, + "step": 23126 + }, + { + "epoch": 0.289107227680692, + "grad_norm": 4.1797919273376465, + "learning_rate": 1.789929775766131e-05, + "loss": 1.8927, + "step": 23128 + }, + { + "epoch": 0.2891322283057076, + "grad_norm": 1.6367498636245728, + "learning_rate": 1.7898762597110138e-05, + "loss": 1.7164, + "step": 23130 + }, + { + "epoch": 0.28915722893072326, + "grad_norm": 4.4219746589660645, + "learning_rate": 1.7898227376403107e-05, + "loss": 1.8276, + "step": 23132 + }, + { + "epoch": 0.2891822295557389, + "grad_norm": 3.514263868331909, + "learning_rate": 1.7897692095544298e-05, + "loss": 1.0285, + "step": 23134 + }, + { + "epoch": 0.2892072301807545, + "grad_norm": 3.1724092960357666, + "learning_rate": 1.7897156754537778e-05, + "loss": 0.5853, + "step": 23136 + }, + { + "epoch": 0.28923223080577015, + "grad_norm": 0.0026032791938632727, + "learning_rate": 1.789662135338763e-05, + "loss": 0.7742, + "step": 23138 + }, + { + "epoch": 0.28925723143078574, + "grad_norm": 5.099331378936768, + "learning_rate": 1.789608589209793e-05, + "loss": 1.8158, + "step": 23140 + }, + { + "epoch": 0.2892822320558014, + "grad_norm": 5.144131660461426, + "learning_rate": 1.7895550370672756e-05, + "loss": 0.8471, + "step": 23142 + }, + { + "epoch": 0.28930723268081704, + "grad_norm": 4.2802910804748535, + "learning_rate": 1.789501478911619e-05, + "loss": 1.3239, + "step": 23144 + }, + { + "epoch": 0.28933223330583263, + "grad_norm": 4.944882392883301, + "learning_rate": 1.7894479147432307e-05, + "loss": 1.4747, + "step": 23146 + }, + { + "epoch": 0.2893572339308483, + "grad_norm": 1.4672309160232544, + "learning_rate": 1.7893943445625185e-05, + "loss": 0.4507, + "step": 23148 + }, + { + "epoch": 0.28938223455586387, + "grad_norm": 4.968398094177246, + "learning_rate": 1.7893407683698906e-05, + "loss": 1.043, + "step": 23150 + }, + { + "epoch": 0.2894072351808795, + "grad_norm": 2.0687830448150635, + "learning_rate": 1.7892871861657554e-05, + "loss": 0.9397, + "step": 23152 + }, + { + "epoch": 0.28943223580589517, + "grad_norm": 7.767861366271973, + "learning_rate": 1.78923359795052e-05, + "loss": 0.5471, + "step": 23154 + }, + { + "epoch": 0.28945723643091076, + "grad_norm": 6.187671661376953, + "learning_rate": 1.7891800037245936e-05, + "loss": 0.8759, + "step": 23156 + }, + { + "epoch": 0.2894822370559264, + "grad_norm": 2.094485282897949, + "learning_rate": 1.7891264034883836e-05, + "loss": 0.6308, + "step": 23158 + }, + { + "epoch": 0.289507237680942, + "grad_norm": 4.827674388885498, + "learning_rate": 1.789072797242299e-05, + "loss": 1.7747, + "step": 23160 + }, + { + "epoch": 0.28953223830595765, + "grad_norm": 1.3146872520446777, + "learning_rate": 1.789019184986747e-05, + "loss": 0.4729, + "step": 23162 + }, + { + "epoch": 0.2895572389309733, + "grad_norm": 2.646235227584839, + "learning_rate": 1.7889655667221367e-05, + "loss": 1.0492, + "step": 23164 + }, + { + "epoch": 0.2895822395559889, + "grad_norm": 4.449390888214111, + "learning_rate": 1.7889119424488766e-05, + "loss": 0.1447, + "step": 23166 + }, + { + "epoch": 0.28960724018100453, + "grad_norm": 0.002548397285863757, + "learning_rate": 1.7888583121673742e-05, + "loss": 0.4639, + "step": 23168 + }, + { + "epoch": 0.2896322408060201, + "grad_norm": 1.7154604196548462, + "learning_rate": 1.7888046758780383e-05, + "loss": 0.6056, + "step": 23170 + }, + { + "epoch": 0.2896572414310358, + "grad_norm": 3.8285129070281982, + "learning_rate": 1.788751033581278e-05, + "loss": 0.6201, + "step": 23172 + }, + { + "epoch": 0.2896822420560514, + "grad_norm": 3.9733006954193115, + "learning_rate": 1.788697385277501e-05, + "loss": 0.3262, + "step": 23174 + }, + { + "epoch": 0.289707242681067, + "grad_norm": 2.8056039810180664, + "learning_rate": 1.7886437309671166e-05, + "loss": 0.7106, + "step": 23176 + }, + { + "epoch": 0.28973224330608266, + "grad_norm": 4.647122383117676, + "learning_rate": 1.7885900706505327e-05, + "loss": 1.2295, + "step": 23178 + }, + { + "epoch": 0.28975724393109825, + "grad_norm": 6.9418721199035645, + "learning_rate": 1.788536404328159e-05, + "loss": 1.9739, + "step": 23180 + }, + { + "epoch": 0.2897822445561139, + "grad_norm": 2.2961111068725586, + "learning_rate": 1.788482732000403e-05, + "loss": 0.5021, + "step": 23182 + }, + { + "epoch": 0.28980724518112955, + "grad_norm": 5.422747611999512, + "learning_rate": 1.7884290536676738e-05, + "loss": 1.1589, + "step": 23184 + }, + { + "epoch": 0.28983224580614514, + "grad_norm": 2.516071081161499, + "learning_rate": 1.788375369330381e-05, + "loss": 0.5396, + "step": 23186 + }, + { + "epoch": 0.2898572464311608, + "grad_norm": 3.6171927452087402, + "learning_rate": 1.7883216789889325e-05, + "loss": 2.9158, + "step": 23188 + }, + { + "epoch": 0.2898822470561764, + "grad_norm": 3.31439208984375, + "learning_rate": 1.788267982643738e-05, + "loss": 1.0836, + "step": 23190 + }, + { + "epoch": 0.289907247681192, + "grad_norm": 3.461881637573242, + "learning_rate": 1.7882142802952053e-05, + "loss": 1.8084, + "step": 23192 + }, + { + "epoch": 0.2899322483062077, + "grad_norm": 4.223483085632324, + "learning_rate": 1.7881605719437444e-05, + "loss": 0.357, + "step": 23194 + }, + { + "epoch": 0.28995724893122327, + "grad_norm": 2.945796251296997, + "learning_rate": 1.7881068575897643e-05, + "loss": 0.2366, + "step": 23196 + }, + { + "epoch": 0.2899822495562389, + "grad_norm": 1.1102805137634277, + "learning_rate": 1.7880531372336735e-05, + "loss": 0.0856, + "step": 23198 + }, + { + "epoch": 0.2900072501812545, + "grad_norm": 9.93179988861084, + "learning_rate": 1.787999410875882e-05, + "loss": 1.4341, + "step": 23200 + }, + { + "epoch": 0.29003225080627015, + "grad_norm": 2.8843131065368652, + "learning_rate": 1.787945678516798e-05, + "loss": 1.3326, + "step": 23202 + }, + { + "epoch": 0.2900572514312858, + "grad_norm": 0.07923784106969833, + "learning_rate": 1.787891940156831e-05, + "loss": 0.622, + "step": 23204 + }, + { + "epoch": 0.2900822520563014, + "grad_norm": 5.604105472564697, + "learning_rate": 1.7878381957963905e-05, + "loss": 0.3304, + "step": 23206 + }, + { + "epoch": 0.29010725268131704, + "grad_norm": 1.3967368602752686, + "learning_rate": 1.7877844454358858e-05, + "loss": 1.4358, + "step": 23208 + }, + { + "epoch": 0.29013225330633263, + "grad_norm": 13.354632377624512, + "learning_rate": 1.787730689075726e-05, + "loss": 0.4889, + "step": 23210 + }, + { + "epoch": 0.2901572539313483, + "grad_norm": 3.294642210006714, + "learning_rate": 1.787676926716321e-05, + "loss": 0.3966, + "step": 23212 + }, + { + "epoch": 0.2901822545563639, + "grad_norm": 3.810001850128174, + "learning_rate": 1.7876231583580796e-05, + "loss": 0.8482, + "step": 23214 + }, + { + "epoch": 0.2902072551813795, + "grad_norm": 0.01990130916237831, + "learning_rate": 1.7875693840014117e-05, + "loss": 0.4539, + "step": 23216 + }, + { + "epoch": 0.29023225580639517, + "grad_norm": 3.0896637439727783, + "learning_rate": 1.787515603646727e-05, + "loss": 1.2724, + "step": 23218 + }, + { + "epoch": 0.29025725643141076, + "grad_norm": 1.8753553628921509, + "learning_rate": 1.7874618172944344e-05, + "loss": 0.4724, + "step": 23220 + }, + { + "epoch": 0.2902822570564264, + "grad_norm": 1.0619328022003174, + "learning_rate": 1.7874080249449442e-05, + "loss": 0.4939, + "step": 23222 + }, + { + "epoch": 0.29030725768144205, + "grad_norm": 4.021705150604248, + "learning_rate": 1.7873542265986657e-05, + "loss": 1.5889, + "step": 23224 + }, + { + "epoch": 0.29033225830645765, + "grad_norm": 0.4727688431739807, + "learning_rate": 1.787300422256009e-05, + "loss": 0.315, + "step": 23226 + }, + { + "epoch": 0.2903572589314733, + "grad_norm": 7.859408378601074, + "learning_rate": 1.787246611917383e-05, + "loss": 1.3311, + "step": 23228 + }, + { + "epoch": 0.2903822595564889, + "grad_norm": 7.866095066070557, + "learning_rate": 1.787192795583199e-05, + "loss": 1.0634, + "step": 23230 + }, + { + "epoch": 0.29040726018150453, + "grad_norm": 1.8882763385772705, + "learning_rate": 1.7871389732538653e-05, + "loss": 0.5494, + "step": 23232 + }, + { + "epoch": 0.2904322608065202, + "grad_norm": 4.060075283050537, + "learning_rate": 1.7870851449297932e-05, + "loss": 0.9094, + "step": 23234 + }, + { + "epoch": 0.2904572614315358, + "grad_norm": 0.9639532566070557, + "learning_rate": 1.7870313106113912e-05, + "loss": 0.385, + "step": 23236 + }, + { + "epoch": 0.2904822620565514, + "grad_norm": 4.830441474914551, + "learning_rate": 1.7869774702990704e-05, + "loss": 1.5524, + "step": 23238 + }, + { + "epoch": 0.290507262681567, + "grad_norm": 3.391136646270752, + "learning_rate": 1.7869236239932403e-05, + "loss": 0.1937, + "step": 23240 + }, + { + "epoch": 0.29053226330658266, + "grad_norm": 3.9174273014068604, + "learning_rate": 1.7868697716943116e-05, + "loss": 1.1724, + "step": 23242 + }, + { + "epoch": 0.2905572639315983, + "grad_norm": 4.3556718826293945, + "learning_rate": 1.7868159134026935e-05, + "loss": 2.2685, + "step": 23244 + }, + { + "epoch": 0.2905822645566139, + "grad_norm": 4.017693996429443, + "learning_rate": 1.7867620491187968e-05, + "loss": 1.5391, + "step": 23246 + }, + { + "epoch": 0.29060726518162955, + "grad_norm": 7.227538108825684, + "learning_rate": 1.7867081788430317e-05, + "loss": 0.8664, + "step": 23248 + }, + { + "epoch": 0.29063226580664514, + "grad_norm": 1.9367996454238892, + "learning_rate": 1.7866543025758082e-05, + "loss": 0.223, + "step": 23250 + }, + { + "epoch": 0.2906572664316608, + "grad_norm": 1.6012368202209473, + "learning_rate": 1.786600420317537e-05, + "loss": 0.7791, + "step": 23252 + }, + { + "epoch": 0.29068226705667644, + "grad_norm": 0.7060946226119995, + "learning_rate": 1.786546532068628e-05, + "loss": 0.2725, + "step": 23254 + }, + { + "epoch": 0.290707267681692, + "grad_norm": 1.3560993671417236, + "learning_rate": 1.786492637829492e-05, + "loss": 0.0325, + "step": 23256 + }, + { + "epoch": 0.2907322683067077, + "grad_norm": 4.344513416290283, + "learning_rate": 1.786438737600539e-05, + "loss": 1.0092, + "step": 23258 + }, + { + "epoch": 0.29075726893172327, + "grad_norm": 3.996899366378784, + "learning_rate": 1.7863848313821802e-05, + "loss": 0.9201, + "step": 23260 + }, + { + "epoch": 0.2907822695567389, + "grad_norm": 6.593382835388184, + "learning_rate": 1.7863309191748256e-05, + "loss": 2.4704, + "step": 23262 + }, + { + "epoch": 0.29080727018175456, + "grad_norm": 2.7452259063720703, + "learning_rate": 1.7862770009788857e-05, + "loss": 0.4773, + "step": 23264 + }, + { + "epoch": 0.29083227080677015, + "grad_norm": 3.276798725128174, + "learning_rate": 1.7862230767947716e-05, + "loss": 0.6734, + "step": 23266 + }, + { + "epoch": 0.2908572714317858, + "grad_norm": 5.612918853759766, + "learning_rate": 1.7861691466228938e-05, + "loss": 1.2352, + "step": 23268 + }, + { + "epoch": 0.2908822720568014, + "grad_norm": 0.0026893382892012596, + "learning_rate": 1.7861152104636626e-05, + "loss": 0.5096, + "step": 23270 + }, + { + "epoch": 0.29090727268181704, + "grad_norm": 2.8000354766845703, + "learning_rate": 1.7860612683174894e-05, + "loss": 1.5215, + "step": 23272 + }, + { + "epoch": 0.2909322733068327, + "grad_norm": 3.7414944171905518, + "learning_rate": 1.7860073201847848e-05, + "loss": 1.3597, + "step": 23274 + }, + { + "epoch": 0.2909572739318483, + "grad_norm": 9.912074089050293, + "learning_rate": 1.785953366065959e-05, + "loss": 1.9976, + "step": 23276 + }, + { + "epoch": 0.29098227455686393, + "grad_norm": 4.800819396972656, + "learning_rate": 1.7858994059614243e-05, + "loss": 0.8513, + "step": 23278 + }, + { + "epoch": 0.2910072751818795, + "grad_norm": 1.789441466331482, + "learning_rate": 1.7858454398715904e-05, + "loss": 1.5346, + "step": 23280 + }, + { + "epoch": 0.29103227580689517, + "grad_norm": 3.1359589099884033, + "learning_rate": 1.785791467796869e-05, + "loss": 0.5847, + "step": 23282 + }, + { + "epoch": 0.2910572764319108, + "grad_norm": 3.909912586212158, + "learning_rate": 1.7857374897376708e-05, + "loss": 1.4514, + "step": 23284 + }, + { + "epoch": 0.2910822770569264, + "grad_norm": 4.198321342468262, + "learning_rate": 1.7856835056944067e-05, + "loss": 1.1453, + "step": 23286 + }, + { + "epoch": 0.29110727768194206, + "grad_norm": 3.1318295001983643, + "learning_rate": 1.7856295156674885e-05, + "loss": 0.5528, + "step": 23288 + }, + { + "epoch": 0.29113227830695765, + "grad_norm": 2.8029167652130127, + "learning_rate": 1.785575519657327e-05, + "loss": 0.969, + "step": 23290 + }, + { + "epoch": 0.2911572789319733, + "grad_norm": 0.009893018752336502, + "learning_rate": 1.7855215176643333e-05, + "loss": 0.1599, + "step": 23292 + }, + { + "epoch": 0.29118227955698894, + "grad_norm": 1.542109489440918, + "learning_rate": 1.785467509688919e-05, + "loss": 0.0594, + "step": 23294 + }, + { + "epoch": 0.29120728018200454, + "grad_norm": 3.481818199157715, + "learning_rate": 1.785413495731495e-05, + "loss": 1.5037, + "step": 23296 + }, + { + "epoch": 0.2912322808070202, + "grad_norm": 2.1412291526794434, + "learning_rate": 1.7853594757924727e-05, + "loss": 0.6417, + "step": 23298 + }, + { + "epoch": 0.2912572814320358, + "grad_norm": 0.05184357985854149, + "learning_rate": 1.7853054498722643e-05, + "loss": 0.3998, + "step": 23300 + }, + { + "epoch": 0.2912822820570514, + "grad_norm": 3.7035257816314697, + "learning_rate": 1.7852514179712802e-05, + "loss": 1.4861, + "step": 23302 + }, + { + "epoch": 0.29130728268206707, + "grad_norm": 3.8668479919433594, + "learning_rate": 1.7851973800899322e-05, + "loss": 0.7933, + "step": 23304 + }, + { + "epoch": 0.29133228330708266, + "grad_norm": 2.090930223464966, + "learning_rate": 1.7851433362286322e-05, + "loss": 0.8365, + "step": 23306 + }, + { + "epoch": 0.2913572839320983, + "grad_norm": 2.4254825115203857, + "learning_rate": 1.785089286387792e-05, + "loss": 0.9352, + "step": 23308 + }, + { + "epoch": 0.2913822845571139, + "grad_norm": 0.002041054889559746, + "learning_rate": 1.7850352305678222e-05, + "loss": 0.957, + "step": 23310 + }, + { + "epoch": 0.29140728518212955, + "grad_norm": 3.12563157081604, + "learning_rate": 1.7849811687691355e-05, + "loss": 0.8403, + "step": 23312 + }, + { + "epoch": 0.2914322858071452, + "grad_norm": 2.9245266914367676, + "learning_rate": 1.784927100992143e-05, + "loss": 0.4522, + "step": 23314 + }, + { + "epoch": 0.2914572864321608, + "grad_norm": 9.119933128356934, + "learning_rate": 1.7848730272372568e-05, + "loss": 2.0687, + "step": 23316 + }, + { + "epoch": 0.29148228705717644, + "grad_norm": 3.34134840965271, + "learning_rate": 1.7848189475048884e-05, + "loss": 0.6481, + "step": 23318 + }, + { + "epoch": 0.29150728768219203, + "grad_norm": 3.97537899017334, + "learning_rate": 1.78476486179545e-05, + "loss": 1.1521, + "step": 23320 + }, + { + "epoch": 0.2915322883072077, + "grad_norm": 2.6903185844421387, + "learning_rate": 1.7847107701093534e-05, + "loss": 1.3478, + "step": 23322 + }, + { + "epoch": 0.2915572889322233, + "grad_norm": 0.002420404925942421, + "learning_rate": 1.7846566724470105e-05, + "loss": 0.761, + "step": 23324 + }, + { + "epoch": 0.2915822895572389, + "grad_norm": 3.0645697116851807, + "learning_rate": 1.7846025688088336e-05, + "loss": 0.5368, + "step": 23326 + }, + { + "epoch": 0.29160729018225456, + "grad_norm": 2.8188278675079346, + "learning_rate": 1.7845484591952343e-05, + "loss": 0.9721, + "step": 23328 + }, + { + "epoch": 0.29163229080727016, + "grad_norm": 1.7171730995178223, + "learning_rate": 1.784494343606625e-05, + "loss": 1.1495, + "step": 23330 + }, + { + "epoch": 0.2916572914322858, + "grad_norm": 3.804136037826538, + "learning_rate": 1.7844402220434175e-05, + "loss": 1.5204, + "step": 23332 + }, + { + "epoch": 0.29168229205730145, + "grad_norm": 5.179830074310303, + "learning_rate": 1.784386094506024e-05, + "loss": 1.0248, + "step": 23334 + }, + { + "epoch": 0.29170729268231704, + "grad_norm": 2.7004668712615967, + "learning_rate": 1.7843319609948575e-05, + "loss": 1.455, + "step": 23336 + }, + { + "epoch": 0.2917322933073327, + "grad_norm": 3.5737133026123047, + "learning_rate": 1.7842778215103293e-05, + "loss": 0.9656, + "step": 23338 + }, + { + "epoch": 0.2917572939323483, + "grad_norm": 6.305494785308838, + "learning_rate": 1.7842236760528524e-05, + "loss": 0.5827, + "step": 23340 + }, + { + "epoch": 0.29178229455736393, + "grad_norm": 2.9433884620666504, + "learning_rate": 1.7841695246228387e-05, + "loss": 0.5773, + "step": 23342 + }, + { + "epoch": 0.2918072951823796, + "grad_norm": 0.00226985034532845, + "learning_rate": 1.7841153672207006e-05, + "loss": 0.5381, + "step": 23344 + }, + { + "epoch": 0.29183229580739517, + "grad_norm": 4.811735153198242, + "learning_rate": 1.784061203846851e-05, + "loss": 1.8662, + "step": 23346 + }, + { + "epoch": 0.2918572964324108, + "grad_norm": 1.444809913635254, + "learning_rate": 1.784007034501702e-05, + "loss": 0.1031, + "step": 23348 + }, + { + "epoch": 0.2918822970574264, + "grad_norm": 3.9974470138549805, + "learning_rate": 1.7839528591856666e-05, + "loss": 0.2068, + "step": 23350 + }, + { + "epoch": 0.29190729768244206, + "grad_norm": 1.0283610820770264, + "learning_rate": 1.7838986778991568e-05, + "loss": 0.5835, + "step": 23352 + }, + { + "epoch": 0.2919322983074577, + "grad_norm": 0.045158322900533676, + "learning_rate": 1.7838444906425856e-05, + "loss": 0.188, + "step": 23354 + }, + { + "epoch": 0.2919572989324733, + "grad_norm": 0.0019285030430182815, + "learning_rate": 1.7837902974163658e-05, + "loss": 0.1306, + "step": 23356 + }, + { + "epoch": 0.29198229955748894, + "grad_norm": 3.889963150024414, + "learning_rate": 1.78373609822091e-05, + "loss": 1.7798, + "step": 23358 + }, + { + "epoch": 0.29200730018250454, + "grad_norm": 1.7325079441070557, + "learning_rate": 1.7836818930566304e-05, + "loss": 0.8025, + "step": 23360 + }, + { + "epoch": 0.2920323008075202, + "grad_norm": 0.002059993566945195, + "learning_rate": 1.7836276819239406e-05, + "loss": 0.5598, + "step": 23362 + }, + { + "epoch": 0.29205730143253583, + "grad_norm": 3.345505714416504, + "learning_rate": 1.783573464823253e-05, + "loss": 1.5967, + "step": 23364 + }, + { + "epoch": 0.2920823020575514, + "grad_norm": 3.0425496101379395, + "learning_rate": 1.783519241754981e-05, + "loss": 0.8751, + "step": 23366 + }, + { + "epoch": 0.29210730268256707, + "grad_norm": 1.968603253364563, + "learning_rate": 1.783465012719537e-05, + "loss": 0.6897, + "step": 23368 + }, + { + "epoch": 0.29213230330758266, + "grad_norm": 6.322403907775879, + "learning_rate": 1.7834107777173348e-05, + "loss": 1.6797, + "step": 23370 + }, + { + "epoch": 0.2921573039325983, + "grad_norm": 3.5443127155303955, + "learning_rate": 1.7833565367487862e-05, + "loss": 2.1024, + "step": 23372 + }, + { + "epoch": 0.29218230455761396, + "grad_norm": 2.4820258617401123, + "learning_rate": 1.7833022898143055e-05, + "loss": 1.1368, + "step": 23374 + }, + { + "epoch": 0.29220730518262955, + "grad_norm": 2.916503429412842, + "learning_rate": 1.783248036914305e-05, + "loss": 1.397, + "step": 23376 + }, + { + "epoch": 0.2922323058076452, + "grad_norm": 1.3418755531311035, + "learning_rate": 1.7831937780491983e-05, + "loss": 0.5545, + "step": 23378 + }, + { + "epoch": 0.2922573064326608, + "grad_norm": 4.581054210662842, + "learning_rate": 1.7831395132193986e-05, + "loss": 1.2337, + "step": 23380 + }, + { + "epoch": 0.29228230705767644, + "grad_norm": 3.0425868034362793, + "learning_rate": 1.783085242425319e-05, + "loss": 0.8506, + "step": 23382 + }, + { + "epoch": 0.2923073076826921, + "grad_norm": 2.3342678546905518, + "learning_rate": 1.783030965667373e-05, + "loss": 2.0848, + "step": 23384 + }, + { + "epoch": 0.2923323083077077, + "grad_norm": 1.3236006498336792, + "learning_rate": 1.7829766829459738e-05, + "loss": 0.6418, + "step": 23386 + }, + { + "epoch": 0.2923573089327233, + "grad_norm": 6.506448745727539, + "learning_rate": 1.7829223942615352e-05, + "loss": 0.6007, + "step": 23388 + }, + { + "epoch": 0.2923823095577389, + "grad_norm": 3.0816848278045654, + "learning_rate": 1.78286809961447e-05, + "loss": 1.1963, + "step": 23390 + }, + { + "epoch": 0.29240731018275457, + "grad_norm": 4.054937839508057, + "learning_rate": 1.7828137990051924e-05, + "loss": 1.0939, + "step": 23392 + }, + { + "epoch": 0.2924323108077702, + "grad_norm": 1.4773224592208862, + "learning_rate": 1.7827594924341155e-05, + "loss": 0.0233, + "step": 23394 + }, + { + "epoch": 0.2924573114327858, + "grad_norm": 7.25545072555542, + "learning_rate": 1.7827051799016527e-05, + "loss": 0.7184, + "step": 23396 + }, + { + "epoch": 0.29248231205780145, + "grad_norm": 0.24120791256427765, + "learning_rate": 1.782650861408218e-05, + "loss": 0.5411, + "step": 23398 + }, + { + "epoch": 0.29250731268281704, + "grad_norm": 6.658598899841309, + "learning_rate": 1.7825965369542253e-05, + "loss": 0.5538, + "step": 23400 + }, + { + "epoch": 0.2925323133078327, + "grad_norm": 0.9485190510749817, + "learning_rate": 1.782542206540088e-05, + "loss": 0.4852, + "step": 23402 + }, + { + "epoch": 0.29255731393284834, + "grad_norm": 4.580448150634766, + "learning_rate": 1.7824878701662198e-05, + "loss": 1.7722, + "step": 23404 + }, + { + "epoch": 0.29258231455786393, + "grad_norm": 2.5546793937683105, + "learning_rate": 1.7824335278330348e-05, + "loss": 0.4356, + "step": 23406 + }, + { + "epoch": 0.2926073151828796, + "grad_norm": 0.003728767391294241, + "learning_rate": 1.7823791795409463e-05, + "loss": 0.6671, + "step": 23408 + }, + { + "epoch": 0.29263231580789517, + "grad_norm": 2.0989019870758057, + "learning_rate": 1.7823248252903693e-05, + "loss": 0.3075, + "step": 23410 + }, + { + "epoch": 0.2926573164329108, + "grad_norm": 4.456882476806641, + "learning_rate": 1.7822704650817164e-05, + "loss": 1.2994, + "step": 23412 + }, + { + "epoch": 0.29268231705792647, + "grad_norm": 4.448308944702148, + "learning_rate": 1.7822160989154028e-05, + "loss": 1.7511, + "step": 23414 + }, + { + "epoch": 0.29270731768294206, + "grad_norm": 4.688806056976318, + "learning_rate": 1.7821617267918414e-05, + "loss": 2.0145, + "step": 23416 + }, + { + "epoch": 0.2927323183079577, + "grad_norm": 4.367803573608398, + "learning_rate": 1.7821073487114473e-05, + "loss": 1.0217, + "step": 23418 + }, + { + "epoch": 0.2927573189329733, + "grad_norm": 0.18292157351970673, + "learning_rate": 1.782052964674634e-05, + "loss": 0.5852, + "step": 23420 + }, + { + "epoch": 0.29278231955798895, + "grad_norm": 0.7637166976928711, + "learning_rate": 1.781998574681816e-05, + "loss": 0.4398, + "step": 23422 + }, + { + "epoch": 0.2928073201830046, + "grad_norm": 5.279405117034912, + "learning_rate": 1.7819441787334075e-05, + "loss": 0.7184, + "step": 23424 + }, + { + "epoch": 0.2928323208080202, + "grad_norm": 0.14513523876667023, + "learning_rate": 1.7818897768298228e-05, + "loss": 0.4314, + "step": 23426 + }, + { + "epoch": 0.29285732143303583, + "grad_norm": 4.662219524383545, + "learning_rate": 1.781835368971476e-05, + "loss": 0.9306, + "step": 23428 + }, + { + "epoch": 0.2928823220580514, + "grad_norm": 0.5973135232925415, + "learning_rate": 1.7817809551587813e-05, + "loss": 0.4251, + "step": 23430 + }, + { + "epoch": 0.2929073226830671, + "grad_norm": 0.0013330889632925391, + "learning_rate": 1.781726535392154e-05, + "loss": 1.1632, + "step": 23432 + }, + { + "epoch": 0.2929323233080827, + "grad_norm": 0.5118767619132996, + "learning_rate": 1.781672109672008e-05, + "loss": 0.2089, + "step": 23434 + }, + { + "epoch": 0.2929573239330983, + "grad_norm": 2.682711601257324, + "learning_rate": 1.781617677998757e-05, + "loss": 1.1708, + "step": 23436 + }, + { + "epoch": 0.29298232455811396, + "grad_norm": 3.6887295246124268, + "learning_rate": 1.7815632403728168e-05, + "loss": 0.8359, + "step": 23438 + }, + { + "epoch": 0.29300732518312955, + "grad_norm": 1.4742647409439087, + "learning_rate": 1.7815087967946016e-05, + "loss": 0.0744, + "step": 23440 + }, + { + "epoch": 0.2930323258081452, + "grad_norm": 7.3330488204956055, + "learning_rate": 1.7814543472645257e-05, + "loss": 1.1251, + "step": 23442 + }, + { + "epoch": 0.29305732643316085, + "grad_norm": 1.5626511573791504, + "learning_rate": 1.781399891783004e-05, + "loss": 0.3726, + "step": 23444 + }, + { + "epoch": 0.29308232705817644, + "grad_norm": 3.91393780708313, + "learning_rate": 1.781345430350451e-05, + "loss": 1.869, + "step": 23446 + }, + { + "epoch": 0.2931073276831921, + "grad_norm": 3.1494061946868896, + "learning_rate": 1.7812909629672822e-05, + "loss": 1.9862, + "step": 23448 + }, + { + "epoch": 0.2931323283082077, + "grad_norm": 2.524794816970825, + "learning_rate": 1.7812364896339115e-05, + "loss": 1.3244, + "step": 23450 + }, + { + "epoch": 0.2931573289332233, + "grad_norm": 4.4220757484436035, + "learning_rate": 1.7811820103507544e-05, + "loss": 1.5362, + "step": 23452 + }, + { + "epoch": 0.293182329558239, + "grad_norm": 3.526688575744629, + "learning_rate": 1.7811275251182255e-05, + "loss": 0.5662, + "step": 23454 + }, + { + "epoch": 0.29320733018325457, + "grad_norm": 7.073726177215576, + "learning_rate": 1.78107303393674e-05, + "loss": 1.5931, + "step": 23456 + }, + { + "epoch": 0.2932323308082702, + "grad_norm": 3.0800976753234863, + "learning_rate": 1.7810185368067125e-05, + "loss": 0.703, + "step": 23458 + }, + { + "epoch": 0.2932573314332858, + "grad_norm": 2.7101666927337646, + "learning_rate": 1.7809640337285585e-05, + "loss": 1.427, + "step": 23460 + }, + { + "epoch": 0.29328233205830145, + "grad_norm": 2.0754246711730957, + "learning_rate": 1.7809095247026925e-05, + "loss": 1.0198, + "step": 23462 + }, + { + "epoch": 0.2933073326833171, + "grad_norm": 3.4890427589416504, + "learning_rate": 1.78085500972953e-05, + "loss": 0.7796, + "step": 23464 + }, + { + "epoch": 0.2933323333083327, + "grad_norm": 3.7195611000061035, + "learning_rate": 1.7808004888094866e-05, + "loss": 0.8117, + "step": 23466 + }, + { + "epoch": 0.29335733393334834, + "grad_norm": 4.4028000831604, + "learning_rate": 1.7807459619429766e-05, + "loss": 0.6463, + "step": 23468 + }, + { + "epoch": 0.29338233455836393, + "grad_norm": 0.0033830441534519196, + "learning_rate": 1.7806914291304164e-05, + "loss": 0.0875, + "step": 23470 + }, + { + "epoch": 0.2934073351833796, + "grad_norm": 2.808934211730957, + "learning_rate": 1.78063689037222e-05, + "loss": 1.2197, + "step": 23472 + }, + { + "epoch": 0.29343233580839523, + "grad_norm": 2.763225555419922, + "learning_rate": 1.7805823456688037e-05, + "loss": 1.2038, + "step": 23474 + }, + { + "epoch": 0.2934573364334108, + "grad_norm": 3.8015332221984863, + "learning_rate": 1.7805277950205827e-05, + "loss": 0.9015, + "step": 23476 + }, + { + "epoch": 0.29348233705842647, + "grad_norm": 5.694789409637451, + "learning_rate": 1.7804732384279722e-05, + "loss": 1.7265, + "step": 23478 + }, + { + "epoch": 0.29350733768344206, + "grad_norm": 0.28506192564964294, + "learning_rate": 1.780418675891388e-05, + "loss": 0.4098, + "step": 23480 + }, + { + "epoch": 0.2935323383084577, + "grad_norm": 2.539668321609497, + "learning_rate": 1.7803641074112458e-05, + "loss": 0.7586, + "step": 23482 + }, + { + "epoch": 0.29355733893347336, + "grad_norm": 0.03752473369240761, + "learning_rate": 1.7803095329879606e-05, + "loss": 0.0006, + "step": 23484 + }, + { + "epoch": 0.29358233955848895, + "grad_norm": 2.9996259212493896, + "learning_rate": 1.7802549526219485e-05, + "loss": 1.2047, + "step": 23486 + }, + { + "epoch": 0.2936073401835046, + "grad_norm": 0.0010753386886790395, + "learning_rate": 1.7802003663136248e-05, + "loss": 0.1543, + "step": 23488 + }, + { + "epoch": 0.2936323408085202, + "grad_norm": 5.263902187347412, + "learning_rate": 1.7801457740634057e-05, + "loss": 0.8962, + "step": 23490 + }, + { + "epoch": 0.29365734143353583, + "grad_norm": 2.6366775035858154, + "learning_rate": 1.7800911758717065e-05, + "loss": 0.9463, + "step": 23492 + }, + { + "epoch": 0.2936823420585515, + "grad_norm": 3.7315049171447754, + "learning_rate": 1.7800365717389433e-05, + "loss": 0.5406, + "step": 23494 + }, + { + "epoch": 0.2937073426835671, + "grad_norm": 0.8842182755470276, + "learning_rate": 1.7799819616655317e-05, + "loss": 0.2729, + "step": 23496 + }, + { + "epoch": 0.2937323433085827, + "grad_norm": 0.8715896010398865, + "learning_rate": 1.779927345651888e-05, + "loss": 0.4566, + "step": 23498 + }, + { + "epoch": 0.2937573439335983, + "grad_norm": 2.8994462490081787, + "learning_rate": 1.7798727236984277e-05, + "loss": 0.9774, + "step": 23500 + }, + { + "epoch": 0.29378234455861396, + "grad_norm": 1.077879548072815, + "learning_rate": 1.7798180958055674e-05, + "loss": 0.0439, + "step": 23502 + }, + { + "epoch": 0.2938073451836296, + "grad_norm": 0.18285587430000305, + "learning_rate": 1.7797634619737226e-05, + "loss": 1.1207, + "step": 23504 + }, + { + "epoch": 0.2938323458086452, + "grad_norm": 2.7463550567626953, + "learning_rate": 1.7797088222033095e-05, + "loss": 1.0824, + "step": 23506 + }, + { + "epoch": 0.29385734643366085, + "grad_norm": 4.249354362487793, + "learning_rate": 1.7796541764947442e-05, + "loss": 3.597, + "step": 23508 + }, + { + "epoch": 0.29388234705867644, + "grad_norm": 4.8600616455078125, + "learning_rate": 1.7795995248484427e-05, + "loss": 1.6814, + "step": 23510 + }, + { + "epoch": 0.2939073476836921, + "grad_norm": 4.687312126159668, + "learning_rate": 1.779544867264822e-05, + "loss": 0.9513, + "step": 23512 + }, + { + "epoch": 0.29393234830870774, + "grad_norm": 4.761168956756592, + "learning_rate": 1.7794902037442976e-05, + "loss": 1.0166, + "step": 23514 + }, + { + "epoch": 0.29395734893372333, + "grad_norm": 1.8207393884658813, + "learning_rate": 1.779435534287286e-05, + "loss": 0.1371, + "step": 23516 + }, + { + "epoch": 0.293982349558739, + "grad_norm": 2.018207311630249, + "learning_rate": 1.7793808588942036e-05, + "loss": 0.3943, + "step": 23518 + }, + { + "epoch": 0.29400735018375457, + "grad_norm": 4.717404365539551, + "learning_rate": 1.7793261775654668e-05, + "loss": 0.6889, + "step": 23520 + }, + { + "epoch": 0.2940323508087702, + "grad_norm": 4.477569103240967, + "learning_rate": 1.779271490301492e-05, + "loss": 1.5152, + "step": 23522 + }, + { + "epoch": 0.29405735143378586, + "grad_norm": 0.0013228291645646095, + "learning_rate": 1.7792167971026958e-05, + "loss": 1.1248, + "step": 23524 + }, + { + "epoch": 0.29408235205880146, + "grad_norm": 0.013182909227907658, + "learning_rate": 1.7791620979694948e-05, + "loss": 0.8956, + "step": 23526 + }, + { + "epoch": 0.2941073526838171, + "grad_norm": 5.211976528167725, + "learning_rate": 1.7791073929023052e-05, + "loss": 1.6907, + "step": 23528 + }, + { + "epoch": 0.2941323533088327, + "grad_norm": 4.169424057006836, + "learning_rate": 1.779052681901544e-05, + "loss": 0.6897, + "step": 23530 + }, + { + "epoch": 0.29415735393384834, + "grad_norm": 2.928270101547241, + "learning_rate": 1.778997964967628e-05, + "loss": 0.248, + "step": 23532 + }, + { + "epoch": 0.294182354558864, + "grad_norm": 0.7948634028434753, + "learning_rate": 1.778943242100973e-05, + "loss": 0.2455, + "step": 23534 + }, + { + "epoch": 0.2942073551838796, + "grad_norm": 0.001255035400390625, + "learning_rate": 1.778888513301997e-05, + "loss": 0.0397, + "step": 23536 + }, + { + "epoch": 0.29423235580889523, + "grad_norm": 6.660451412200928, + "learning_rate": 1.7788337785711162e-05, + "loss": 0.4013, + "step": 23538 + }, + { + "epoch": 0.2942573564339108, + "grad_norm": 2.9832944869995117, + "learning_rate": 1.778779037908747e-05, + "loss": 0.6842, + "step": 23540 + }, + { + "epoch": 0.29428235705892647, + "grad_norm": 4.849851608276367, + "learning_rate": 1.778724291315307e-05, + "loss": 2.3776, + "step": 23542 + }, + { + "epoch": 0.2943073576839421, + "grad_norm": 2.4474408626556396, + "learning_rate": 1.778669538791213e-05, + "loss": 2.6261, + "step": 23544 + }, + { + "epoch": 0.2943323583089577, + "grad_norm": 3.7350239753723145, + "learning_rate": 1.778614780336882e-05, + "loss": 1.4896, + "step": 23546 + }, + { + "epoch": 0.29435735893397336, + "grad_norm": 3.1102139949798584, + "learning_rate": 1.778560015952731e-05, + "loss": 0.3176, + "step": 23548 + }, + { + "epoch": 0.29438235955898895, + "grad_norm": 3.0675344467163086, + "learning_rate": 1.7785052456391766e-05, + "loss": 1.3357, + "step": 23550 + }, + { + "epoch": 0.2944073601840046, + "grad_norm": 14.86589241027832, + "learning_rate": 1.778450469396637e-05, + "loss": 1.948, + "step": 23552 + }, + { + "epoch": 0.29443236080902024, + "grad_norm": 3.86594557762146, + "learning_rate": 1.7783956872255282e-05, + "loss": 1.286, + "step": 23554 + }, + { + "epoch": 0.29445736143403584, + "grad_norm": 0.5656824111938477, + "learning_rate": 1.7783408991262677e-05, + "loss": 0.6817, + "step": 23556 + }, + { + "epoch": 0.2944823620590515, + "grad_norm": 4.4933695793151855, + "learning_rate": 1.7782861050992737e-05, + "loss": 1.0024, + "step": 23558 + }, + { + "epoch": 0.2945073626840671, + "grad_norm": 2.4305877685546875, + "learning_rate": 1.778231305144962e-05, + "loss": 1.9161, + "step": 23560 + }, + { + "epoch": 0.2945323633090827, + "grad_norm": 3.4713058471679688, + "learning_rate": 1.7781764992637516e-05, + "loss": 1.1728, + "step": 23562 + }, + { + "epoch": 0.29455736393409837, + "grad_norm": 2.991917848587036, + "learning_rate": 1.7781216874560585e-05, + "loss": 0.5814, + "step": 23564 + }, + { + "epoch": 0.29458236455911396, + "grad_norm": 1.2172553539276123, + "learning_rate": 1.7780668697223004e-05, + "loss": 0.2644, + "step": 23566 + }, + { + "epoch": 0.2946073651841296, + "grad_norm": 2.631782054901123, + "learning_rate": 1.7780120460628955e-05, + "loss": 0.3078, + "step": 23568 + }, + { + "epoch": 0.2946323658091452, + "grad_norm": 0.008184462785720825, + "learning_rate": 1.7779572164782608e-05, + "loss": 0.0005, + "step": 23570 + }, + { + "epoch": 0.29465736643416085, + "grad_norm": 0.012212232686579227, + "learning_rate": 1.7779023809688142e-05, + "loss": 0.5692, + "step": 23572 + }, + { + "epoch": 0.2946823670591765, + "grad_norm": 2.35713791847229, + "learning_rate": 1.7778475395349722e-05, + "loss": 0.2175, + "step": 23574 + }, + { + "epoch": 0.2947073676841921, + "grad_norm": 0.0038214114028960466, + "learning_rate": 1.7777926921771542e-05, + "loss": 0.7461, + "step": 23576 + }, + { + "epoch": 0.29473236830920774, + "grad_norm": 4.043019771575928, + "learning_rate": 1.7777378388957767e-05, + "loss": 1.5494, + "step": 23578 + }, + { + "epoch": 0.29475736893422333, + "grad_norm": 3.922384023666382, + "learning_rate": 1.777682979691258e-05, + "loss": 1.663, + "step": 23580 + }, + { + "epoch": 0.294782369559239, + "grad_norm": 0.016921166330575943, + "learning_rate": 1.7776281145640152e-05, + "loss": 0.6979, + "step": 23582 + }, + { + "epoch": 0.2948073701842546, + "grad_norm": 2.290417194366455, + "learning_rate": 1.777573243514467e-05, + "loss": 0.4677, + "step": 23584 + }, + { + "epoch": 0.2948323708092702, + "grad_norm": 2.7101500034332275, + "learning_rate": 1.7775183665430312e-05, + "loss": 0.6663, + "step": 23586 + }, + { + "epoch": 0.29485737143428586, + "grad_norm": 3.463085412979126, + "learning_rate": 1.7774634836501252e-05, + "loss": 0.3502, + "step": 23588 + }, + { + "epoch": 0.29488237205930146, + "grad_norm": 3.968647003173828, + "learning_rate": 1.777408594836167e-05, + "loss": 0.2335, + "step": 23590 + }, + { + "epoch": 0.2949073726843171, + "grad_norm": 2.2140021324157715, + "learning_rate": 1.777353700101575e-05, + "loss": 1.2112, + "step": 23592 + }, + { + "epoch": 0.29493237330933275, + "grad_norm": 4.062304496765137, + "learning_rate": 1.7772987994467677e-05, + "loss": 1.3218, + "step": 23594 + }, + { + "epoch": 0.29495737393434834, + "grad_norm": 0.003268167609348893, + "learning_rate": 1.777243892872162e-05, + "loss": 0.6079, + "step": 23596 + }, + { + "epoch": 0.294982374559364, + "grad_norm": 1.1816600561141968, + "learning_rate": 1.7771889803781773e-05, + "loss": 0.7337, + "step": 23598 + }, + { + "epoch": 0.2950073751843796, + "grad_norm": 0.003233005292713642, + "learning_rate": 1.7771340619652306e-05, + "loss": 0.7121, + "step": 23600 + }, + { + "epoch": 0.29503237580939523, + "grad_norm": 0.003591322572901845, + "learning_rate": 1.7770791376337414e-05, + "loss": 0.6747, + "step": 23602 + }, + { + "epoch": 0.2950573764344109, + "grad_norm": 2.567453384399414, + "learning_rate": 1.777024207384127e-05, + "loss": 0.9299, + "step": 23604 + }, + { + "epoch": 0.29508237705942647, + "grad_norm": 3.77905535697937, + "learning_rate": 1.776969271216806e-05, + "loss": 1.4801, + "step": 23606 + }, + { + "epoch": 0.2951073776844421, + "grad_norm": 2.182547092437744, + "learning_rate": 1.7769143291321973e-05, + "loss": 0.4512, + "step": 23608 + }, + { + "epoch": 0.2951323783094577, + "grad_norm": 0.00378630543127656, + "learning_rate": 1.7768593811307187e-05, + "loss": 0.0002, + "step": 23610 + }, + { + "epoch": 0.29515737893447336, + "grad_norm": 3.7904696464538574, + "learning_rate": 1.7768044272127888e-05, + "loss": 2.2809, + "step": 23612 + }, + { + "epoch": 0.295182379559489, + "grad_norm": 3.7768847942352295, + "learning_rate": 1.7767494673788264e-05, + "loss": 0.8264, + "step": 23614 + }, + { + "epoch": 0.2952073801845046, + "grad_norm": 4.982472896575928, + "learning_rate": 1.77669450162925e-05, + "loss": 2.6286, + "step": 23616 + }, + { + "epoch": 0.29523238080952025, + "grad_norm": 3.662874698638916, + "learning_rate": 1.7766395299644776e-05, + "loss": 0.9989, + "step": 23618 + }, + { + "epoch": 0.29525738143453584, + "grad_norm": 0.5609293580055237, + "learning_rate": 1.776584552384929e-05, + "loss": 0.0133, + "step": 23620 + }, + { + "epoch": 0.2952823820595515, + "grad_norm": 0.11025764793157578, + "learning_rate": 1.776529568891022e-05, + "loss": 0.1209, + "step": 23622 + }, + { + "epoch": 0.29530738268456713, + "grad_norm": 3.0976603031158447, + "learning_rate": 1.7764745794831756e-05, + "loss": 0.5887, + "step": 23624 + }, + { + "epoch": 0.2953323833095827, + "grad_norm": 0.035427119582891464, + "learning_rate": 1.7764195841618087e-05, + "loss": 0.4978, + "step": 23626 + }, + { + "epoch": 0.29535738393459837, + "grad_norm": 3.042996883392334, + "learning_rate": 1.77636458292734e-05, + "loss": 0.9609, + "step": 23628 + }, + { + "epoch": 0.29538238455961396, + "grad_norm": 3.4386518001556396, + "learning_rate": 1.776309575780188e-05, + "loss": 1.5268, + "step": 23630 + }, + { + "epoch": 0.2954073851846296, + "grad_norm": 0.015030565671622753, + "learning_rate": 1.7762545627207726e-05, + "loss": 0.2879, + "step": 23632 + }, + { + "epoch": 0.29543238580964526, + "grad_norm": 13.215689659118652, + "learning_rate": 1.7761995437495123e-05, + "loss": 0.5695, + "step": 23634 + }, + { + "epoch": 0.29545738643466085, + "grad_norm": 1.9707310199737549, + "learning_rate": 1.7761445188668256e-05, + "loss": 0.4622, + "step": 23636 + }, + { + "epoch": 0.2954823870596765, + "grad_norm": 3.9194068908691406, + "learning_rate": 1.776089488073132e-05, + "loss": 2.4432, + "step": 23638 + }, + { + "epoch": 0.2955073876846921, + "grad_norm": 4.842195510864258, + "learning_rate": 1.776034451368851e-05, + "loss": 0.2349, + "step": 23640 + }, + { + "epoch": 0.29553238830970774, + "grad_norm": 3.5182607173919678, + "learning_rate": 1.7759794087544012e-05, + "loss": 0.4904, + "step": 23642 + }, + { + "epoch": 0.2955573889347234, + "grad_norm": 3.091703176498413, + "learning_rate": 1.775924360230202e-05, + "loss": 0.885, + "step": 23644 + }, + { + "epoch": 0.295582389559739, + "grad_norm": 2.7639782428741455, + "learning_rate": 1.7758693057966724e-05, + "loss": 1.2919, + "step": 23646 + }, + { + "epoch": 0.2956073901847546, + "grad_norm": 0.019691497087478638, + "learning_rate": 1.775814245454232e-05, + "loss": 1.2009, + "step": 23648 + }, + { + "epoch": 0.2956323908097702, + "grad_norm": 4.080623626708984, + "learning_rate": 1.7757591792033004e-05, + "loss": 1.4216, + "step": 23650 + }, + { + "epoch": 0.29565739143478587, + "grad_norm": 0.1401035189628601, + "learning_rate": 1.7757041070442958e-05, + "loss": 0.8012, + "step": 23652 + }, + { + "epoch": 0.2956823920598015, + "grad_norm": 4.621767520904541, + "learning_rate": 1.7756490289776393e-05, + "loss": 1.124, + "step": 23654 + }, + { + "epoch": 0.2957073926848171, + "grad_norm": 2.622913122177124, + "learning_rate": 1.7755939450037486e-05, + "loss": 0.4804, + "step": 23656 + }, + { + "epoch": 0.29573239330983275, + "grad_norm": 0.07315602153539658, + "learning_rate": 1.7755388551230445e-05, + "loss": 0.5261, + "step": 23658 + }, + { + "epoch": 0.29575739393484834, + "grad_norm": 5.334677696228027, + "learning_rate": 1.775483759335946e-05, + "loss": 1.2445, + "step": 23660 + }, + { + "epoch": 0.295782394559864, + "grad_norm": 2.8114829063415527, + "learning_rate": 1.7754286576428735e-05, + "loss": 1.7066, + "step": 23662 + }, + { + "epoch": 0.29580739518487964, + "grad_norm": 0.7128816843032837, + "learning_rate": 1.7753735500442452e-05, + "loss": 0.9924, + "step": 23664 + }, + { + "epoch": 0.29583239580989523, + "grad_norm": 0.4068928062915802, + "learning_rate": 1.775318436540482e-05, + "loss": 1.1508, + "step": 23666 + }, + { + "epoch": 0.2958573964349109, + "grad_norm": 2.5019185543060303, + "learning_rate": 1.7752633171320034e-05, + "loss": 0.4563, + "step": 23668 + }, + { + "epoch": 0.29588239705992647, + "grad_norm": 0.0386970192193985, + "learning_rate": 1.7752081918192284e-05, + "loss": 0.1534, + "step": 23670 + }, + { + "epoch": 0.2959073976849421, + "grad_norm": 3.0284831523895264, + "learning_rate": 1.7751530606025778e-05, + "loss": 0.8878, + "step": 23672 + }, + { + "epoch": 0.29593239830995777, + "grad_norm": 0.018135512247681618, + "learning_rate": 1.7750979234824712e-05, + "loss": 0.0216, + "step": 23674 + }, + { + "epoch": 0.29595739893497336, + "grad_norm": 3.2587759494781494, + "learning_rate": 1.7750427804593282e-05, + "loss": 1.4549, + "step": 23676 + }, + { + "epoch": 0.295982399559989, + "grad_norm": 0.03317119926214218, + "learning_rate": 1.774987631533569e-05, + "loss": 0.9444, + "step": 23678 + }, + { + "epoch": 0.2960074001850046, + "grad_norm": 4.242650508880615, + "learning_rate": 1.774932476705614e-05, + "loss": 1.2968, + "step": 23680 + }, + { + "epoch": 0.29603240081002025, + "grad_norm": 0.07792700827121735, + "learning_rate": 1.7748773159758825e-05, + "loss": 0.8173, + "step": 23682 + }, + { + "epoch": 0.2960574014350359, + "grad_norm": 1.8382655382156372, + "learning_rate": 1.774822149344795e-05, + "loss": 0.806, + "step": 23684 + }, + { + "epoch": 0.2960824020600515, + "grad_norm": 1.137004017829895, + "learning_rate": 1.7747669768127713e-05, + "loss": 1.1265, + "step": 23686 + }, + { + "epoch": 0.29610740268506713, + "grad_norm": 5.269449234008789, + "learning_rate": 1.7747117983802322e-05, + "loss": 1.7984, + "step": 23688 + }, + { + "epoch": 0.2961324033100827, + "grad_norm": 2.799443483352661, + "learning_rate": 1.7746566140475977e-05, + "loss": 1.1224, + "step": 23690 + }, + { + "epoch": 0.2961574039350984, + "grad_norm": 1.5207347869873047, + "learning_rate": 1.7746014238152875e-05, + "loss": 0.7798, + "step": 23692 + }, + { + "epoch": 0.296182404560114, + "grad_norm": 3.7444093227386475, + "learning_rate": 1.7745462276837226e-05, + "loss": 1.7472, + "step": 23694 + }, + { + "epoch": 0.2962074051851296, + "grad_norm": 6.29256010055542, + "learning_rate": 1.7744910256533233e-05, + "loss": 0.7292, + "step": 23696 + }, + { + "epoch": 0.29623240581014526, + "grad_norm": 5.678765773773193, + "learning_rate": 1.77443581772451e-05, + "loss": 2.0449, + "step": 23698 + }, + { + "epoch": 0.29625740643516085, + "grad_norm": 3.090435743331909, + "learning_rate": 1.7743806038977025e-05, + "loss": 0.6233, + "step": 23700 + }, + { + "epoch": 0.2962824070601765, + "grad_norm": 0.7148879766464233, + "learning_rate": 1.7743253841733223e-05, + "loss": 1.0919, + "step": 23702 + }, + { + "epoch": 0.29630740768519215, + "grad_norm": 2.756638765335083, + "learning_rate": 1.7742701585517893e-05, + "loss": 0.5615, + "step": 23704 + }, + { + "epoch": 0.29633240831020774, + "grad_norm": 2.0105981826782227, + "learning_rate": 1.7742149270335246e-05, + "loss": 0.3099, + "step": 23706 + }, + { + "epoch": 0.2963574089352234, + "grad_norm": 1.2972047328948975, + "learning_rate": 1.774159689618948e-05, + "loss": 0.6785, + "step": 23708 + }, + { + "epoch": 0.296382409560239, + "grad_norm": 3.1964917182922363, + "learning_rate": 1.774104446308481e-05, + "loss": 0.6991, + "step": 23710 + }, + { + "epoch": 0.2964074101852546, + "grad_norm": 4.511210918426514, + "learning_rate": 1.7740491971025438e-05, + "loss": 1.9138, + "step": 23712 + }, + { + "epoch": 0.2964324108102703, + "grad_norm": 1.9273254871368408, + "learning_rate": 1.7739939420015574e-05, + "loss": 0.3779, + "step": 23714 + }, + { + "epoch": 0.29645741143528587, + "grad_norm": 4.1403937339782715, + "learning_rate": 1.7739386810059427e-05, + "loss": 0.9804, + "step": 23716 + }, + { + "epoch": 0.2964824120603015, + "grad_norm": 0.8929314017295837, + "learning_rate": 1.7738834141161202e-05, + "loss": 0.0637, + "step": 23718 + }, + { + "epoch": 0.2965074126853171, + "grad_norm": 3.319322109222412, + "learning_rate": 1.7738281413325116e-05, + "loss": 0.906, + "step": 23720 + }, + { + "epoch": 0.29653241331033275, + "grad_norm": 3.9952914714813232, + "learning_rate": 1.7737728626555368e-05, + "loss": 0.7452, + "step": 23722 + }, + { + "epoch": 0.2965574139353484, + "grad_norm": 0.6231458187103271, + "learning_rate": 1.7737175780856176e-05, + "loss": 0.8403, + "step": 23724 + }, + { + "epoch": 0.296582414560364, + "grad_norm": 3.50648832321167, + "learning_rate": 1.7736622876231747e-05, + "loss": 1.1766, + "step": 23726 + }, + { + "epoch": 0.29660741518537964, + "grad_norm": 5.271650791168213, + "learning_rate": 1.7736069912686288e-05, + "loss": 1.5132, + "step": 23728 + }, + { + "epoch": 0.29663241581039523, + "grad_norm": 4.761834144592285, + "learning_rate": 1.7735516890224018e-05, + "loss": 1.1868, + "step": 23730 + }, + { + "epoch": 0.2966574164354109, + "grad_norm": 2.074615478515625, + "learning_rate": 1.7734963808849147e-05, + "loss": 0.5878, + "step": 23732 + }, + { + "epoch": 0.29668241706042653, + "grad_norm": 2.5318093299865723, + "learning_rate": 1.773441066856588e-05, + "loss": 0.7209, + "step": 23734 + }, + { + "epoch": 0.2967074176854421, + "grad_norm": 3.2867753505706787, + "learning_rate": 1.7733857469378438e-05, + "loss": 1.5781, + "step": 23736 + }, + { + "epoch": 0.29673241831045777, + "grad_norm": 0.026958972215652466, + "learning_rate": 1.7733304211291033e-05, + "loss": 0.4513, + "step": 23738 + }, + { + "epoch": 0.29675741893547336, + "grad_norm": 4.833669185638428, + "learning_rate": 1.7732750894307876e-05, + "loss": 1.1663, + "step": 23740 + }, + { + "epoch": 0.296782419560489, + "grad_norm": 3.1320879459381104, + "learning_rate": 1.7732197518433184e-05, + "loss": 1.471, + "step": 23742 + }, + { + "epoch": 0.29680742018550466, + "grad_norm": 0.015089846216142178, + "learning_rate": 1.7731644083671163e-05, + "loss": 0.0004, + "step": 23744 + }, + { + "epoch": 0.29683242081052025, + "grad_norm": 3.8623602390289307, + "learning_rate": 1.7731090590026037e-05, + "loss": 0.7822, + "step": 23746 + }, + { + "epoch": 0.2968574214355359, + "grad_norm": 2.801017999649048, + "learning_rate": 1.773053703750202e-05, + "loss": 0.1188, + "step": 23748 + }, + { + "epoch": 0.2968824220605515, + "grad_norm": 2.3735270500183105, + "learning_rate": 1.7729983426103324e-05, + "loss": 0.7264, + "step": 23750 + }, + { + "epoch": 0.29690742268556714, + "grad_norm": 5.509336471557617, + "learning_rate": 1.772942975583417e-05, + "loss": 1.0423, + "step": 23752 + }, + { + "epoch": 0.2969324233105828, + "grad_norm": 6.899447441101074, + "learning_rate": 1.772887602669877e-05, + "loss": 1.6035, + "step": 23754 + }, + { + "epoch": 0.2969574239355984, + "grad_norm": 4.473194599151611, + "learning_rate": 1.7728322238701343e-05, + "loss": 1.1258, + "step": 23756 + }, + { + "epoch": 0.296982424560614, + "grad_norm": 3.059086561203003, + "learning_rate": 1.7727768391846107e-05, + "loss": 1.4645, + "step": 23758 + }, + { + "epoch": 0.2970074251856296, + "grad_norm": 2.3400075435638428, + "learning_rate": 1.7727214486137277e-05, + "loss": 0.4417, + "step": 23760 + }, + { + "epoch": 0.29703242581064526, + "grad_norm": 0.011240005493164062, + "learning_rate": 1.772666052157908e-05, + "loss": 0.7359, + "step": 23762 + }, + { + "epoch": 0.2970574264356609, + "grad_norm": 2.744509696960449, + "learning_rate": 1.7726106498175726e-05, + "loss": 1.1829, + "step": 23764 + }, + { + "epoch": 0.2970824270606765, + "grad_norm": 2.2182512283325195, + "learning_rate": 1.7725552415931436e-05, + "loss": 0.8634, + "step": 23766 + }, + { + "epoch": 0.29710742768569215, + "grad_norm": 2.7349679470062256, + "learning_rate": 1.7724998274850436e-05, + "loss": 0.9233, + "step": 23768 + }, + { + "epoch": 0.29713242831070774, + "grad_norm": 3.875096321105957, + "learning_rate": 1.7724444074936937e-05, + "loss": 0.3703, + "step": 23770 + }, + { + "epoch": 0.2971574289357234, + "grad_norm": 2.526948928833008, + "learning_rate": 1.7723889816195162e-05, + "loss": 1.0511, + "step": 23772 + }, + { + "epoch": 0.29718242956073904, + "grad_norm": 3.5373353958129883, + "learning_rate": 1.7723335498629342e-05, + "loss": 0.5597, + "step": 23774 + }, + { + "epoch": 0.29720743018575463, + "grad_norm": 0.006477119401097298, + "learning_rate": 1.7722781122243687e-05, + "loss": 0.8076, + "step": 23776 + }, + { + "epoch": 0.2972324308107703, + "grad_norm": 6.4348530769348145, + "learning_rate": 1.7722226687042422e-05, + "loss": 2.9235, + "step": 23778 + }, + { + "epoch": 0.29725743143578587, + "grad_norm": 0.024594897404313087, + "learning_rate": 1.7721672193029773e-05, + "loss": 0.1219, + "step": 23780 + }, + { + "epoch": 0.2972824320608015, + "grad_norm": 0.7239705920219421, + "learning_rate": 1.7721117640209957e-05, + "loss": 0.8961, + "step": 23782 + }, + { + "epoch": 0.29730743268581716, + "grad_norm": 1.7616740465164185, + "learning_rate": 1.7720563028587206e-05, + "loss": 0.4777, + "step": 23784 + }, + { + "epoch": 0.29733243331083276, + "grad_norm": 2.4950015544891357, + "learning_rate": 1.7720008358165735e-05, + "loss": 1.2459, + "step": 23786 + }, + { + "epoch": 0.2973574339358484, + "grad_norm": 2.0034196376800537, + "learning_rate": 1.7719453628949773e-05, + "loss": 0.3576, + "step": 23788 + }, + { + "epoch": 0.297382434560864, + "grad_norm": 8.136738777160645, + "learning_rate": 1.7718898840943545e-05, + "loss": 1.2765, + "step": 23790 + }, + { + "epoch": 0.29740743518587964, + "grad_norm": 3.8241145610809326, + "learning_rate": 1.771834399415127e-05, + "loss": 1.6361, + "step": 23792 + }, + { + "epoch": 0.2974324358108953, + "grad_norm": 7.964831352233887, + "learning_rate": 1.7717789088577185e-05, + "loss": 1.1036, + "step": 23794 + }, + { + "epoch": 0.2974574364359109, + "grad_norm": 3.068455219268799, + "learning_rate": 1.7717234124225507e-05, + "loss": 1.6592, + "step": 23796 + }, + { + "epoch": 0.29748243706092653, + "grad_norm": 3.0332093238830566, + "learning_rate": 1.7716679101100465e-05, + "loss": 0.7213, + "step": 23798 + }, + { + "epoch": 0.2975074376859421, + "grad_norm": 4.722908973693848, + "learning_rate": 1.7716124019206286e-05, + "loss": 1.7789, + "step": 23800 + }, + { + "epoch": 0.29753243831095777, + "grad_norm": 3.5645108222961426, + "learning_rate": 1.7715568878547197e-05, + "loss": 1.458, + "step": 23802 + }, + { + "epoch": 0.2975574389359734, + "grad_norm": 1.1015920639038086, + "learning_rate": 1.771501367912743e-05, + "loss": 1.1139, + "step": 23804 + }, + { + "epoch": 0.297582439560989, + "grad_norm": 9.605985641479492, + "learning_rate": 1.7714458420951207e-05, + "loss": 1.5362, + "step": 23806 + }, + { + "epoch": 0.29760744018600466, + "grad_norm": 3.02018141746521, + "learning_rate": 1.771390310402276e-05, + "loss": 1.9752, + "step": 23808 + }, + { + "epoch": 0.29763244081102025, + "grad_norm": 2.8280463218688965, + "learning_rate": 1.771334772834632e-05, + "loss": 0.9274, + "step": 23810 + }, + { + "epoch": 0.2976574414360359, + "grad_norm": 4.272798538208008, + "learning_rate": 1.771279229392611e-05, + "loss": 1.5049, + "step": 23812 + }, + { + "epoch": 0.29768244206105154, + "grad_norm": 3.236189126968384, + "learning_rate": 1.7712236800766366e-05, + "loss": 0.5037, + "step": 23814 + }, + { + "epoch": 0.29770744268606714, + "grad_norm": 0.008996103890240192, + "learning_rate": 1.7711681248871317e-05, + "loss": 0.5268, + "step": 23816 + }, + { + "epoch": 0.2977324433110828, + "grad_norm": 2.762705087661743, + "learning_rate": 1.7711125638245196e-05, + "loss": 1.214, + "step": 23818 + }, + { + "epoch": 0.2977574439360984, + "grad_norm": 2.2608015537261963, + "learning_rate": 1.771056996889223e-05, + "loss": 1.355, + "step": 23820 + }, + { + "epoch": 0.297782444561114, + "grad_norm": 2.4243857860565186, + "learning_rate": 1.7710014240816657e-05, + "loss": 1.2148, + "step": 23822 + }, + { + "epoch": 0.29780744518612967, + "grad_norm": 4.803445816040039, + "learning_rate": 1.7709458454022703e-05, + "loss": 1.3057, + "step": 23824 + }, + { + "epoch": 0.29783244581114526, + "grad_norm": 3.3117363452911377, + "learning_rate": 1.7708902608514607e-05, + "loss": 1.0307, + "step": 23826 + }, + { + "epoch": 0.2978574464361609, + "grad_norm": 2.609600305557251, + "learning_rate": 1.7708346704296597e-05, + "loss": 0.6778, + "step": 23828 + }, + { + "epoch": 0.2978824470611765, + "grad_norm": 3.5040786266326904, + "learning_rate": 1.7707790741372908e-05, + "loss": 1.2626, + "step": 23830 + }, + { + "epoch": 0.29790744768619215, + "grad_norm": 4.012670516967773, + "learning_rate": 1.7707234719747773e-05, + "loss": 1.9749, + "step": 23832 + }, + { + "epoch": 0.2979324483112078, + "grad_norm": 2.531322956085205, + "learning_rate": 1.770667863942543e-05, + "loss": 0.7247, + "step": 23834 + }, + { + "epoch": 0.2979574489362234, + "grad_norm": 3.0020229816436768, + "learning_rate": 1.7706122500410114e-05, + "loss": 0.7514, + "step": 23836 + }, + { + "epoch": 0.29798244956123904, + "grad_norm": 5.477136611938477, + "learning_rate": 1.7705566302706057e-05, + "loss": 1.0394, + "step": 23838 + }, + { + "epoch": 0.29800745018625463, + "grad_norm": 0.0966091901063919, + "learning_rate": 1.7705010046317498e-05, + "loss": 0.9213, + "step": 23840 + }, + { + "epoch": 0.2980324508112703, + "grad_norm": 0.025219829753041267, + "learning_rate": 1.770445373124867e-05, + "loss": 0.5757, + "step": 23842 + }, + { + "epoch": 0.2980574514362859, + "grad_norm": 0.008306670002639294, + "learning_rate": 1.770389735750381e-05, + "loss": 0.2359, + "step": 23844 + }, + { + "epoch": 0.2980824520613015, + "grad_norm": 4.145644664764404, + "learning_rate": 1.770334092508716e-05, + "loss": 1.6511, + "step": 23846 + }, + { + "epoch": 0.29810745268631716, + "grad_norm": 2.222738265991211, + "learning_rate": 1.7702784434002954e-05, + "loss": 0.0859, + "step": 23848 + }, + { + "epoch": 0.29813245331133276, + "grad_norm": 2.2209341526031494, + "learning_rate": 1.7702227884255436e-05, + "loss": 1.1877, + "step": 23850 + }, + { + "epoch": 0.2981574539363484, + "grad_norm": 5.989334583282471, + "learning_rate": 1.7701671275848837e-05, + "loss": 1.8812, + "step": 23852 + }, + { + "epoch": 0.29818245456136405, + "grad_norm": 0.007458178326487541, + "learning_rate": 1.7701114608787394e-05, + "loss": 1.2855, + "step": 23854 + }, + { + "epoch": 0.29820745518637964, + "grad_norm": 0.025192059576511383, + "learning_rate": 1.7700557883075355e-05, + "loss": 0.0005, + "step": 23856 + }, + { + "epoch": 0.2982324558113953, + "grad_norm": 8.712952613830566, + "learning_rate": 1.7700001098716956e-05, + "loss": 0.8917, + "step": 23858 + }, + { + "epoch": 0.2982574564364109, + "grad_norm": 3.463256359100342, + "learning_rate": 1.7699444255716437e-05, + "loss": 1.292, + "step": 23860 + }, + { + "epoch": 0.29828245706142653, + "grad_norm": 1.1656643152236938, + "learning_rate": 1.769888735407804e-05, + "loss": 0.0801, + "step": 23862 + }, + { + "epoch": 0.2983074576864422, + "grad_norm": 1.276163101196289, + "learning_rate": 1.7698330393806005e-05, + "loss": 0.7627, + "step": 23864 + }, + { + "epoch": 0.29833245831145777, + "grad_norm": 4.013850688934326, + "learning_rate": 1.7697773374904573e-05, + "loss": 0.4089, + "step": 23866 + }, + { + "epoch": 0.2983574589364734, + "grad_norm": 51.509735107421875, + "learning_rate": 1.7697216297377992e-05, + "loss": 0.5846, + "step": 23868 + }, + { + "epoch": 0.298382459561489, + "grad_norm": 0.004989292938262224, + "learning_rate": 1.7696659161230493e-05, + "loss": 0.2016, + "step": 23870 + }, + { + "epoch": 0.29840746018650466, + "grad_norm": 5.220907211303711, + "learning_rate": 1.7696101966466332e-05, + "loss": 0.9852, + "step": 23872 + }, + { + "epoch": 0.2984324608115203, + "grad_norm": 3.153244733810425, + "learning_rate": 1.7695544713089744e-05, + "loss": 0.9707, + "step": 23874 + }, + { + "epoch": 0.2984574614365359, + "grad_norm": 1.2954442501068115, + "learning_rate": 1.7694987401104978e-05, + "loss": 0.4109, + "step": 23876 + }, + { + "epoch": 0.29848246206155155, + "grad_norm": 3.0442137718200684, + "learning_rate": 1.7694430030516273e-05, + "loss": 0.8553, + "step": 23878 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.9733307957649231, + "learning_rate": 1.769387260132788e-05, + "loss": 0.0376, + "step": 23880 + }, + { + "epoch": 0.2985324633115828, + "grad_norm": 3.504287004470825, + "learning_rate": 1.769331511354404e-05, + "loss": 1.0593, + "step": 23882 + }, + { + "epoch": 0.29855746393659843, + "grad_norm": 5.22900390625, + "learning_rate": 1.7692757567169e-05, + "loss": 1.3206, + "step": 23884 + }, + { + "epoch": 0.298582464561614, + "grad_norm": 6.9964189529418945, + "learning_rate": 1.7692199962207006e-05, + "loss": 1.1664, + "step": 23886 + }, + { + "epoch": 0.2986074651866297, + "grad_norm": 3.1972923278808594, + "learning_rate": 1.7691642298662306e-05, + "loss": 1.3707, + "step": 23888 + }, + { + "epoch": 0.29863246581164526, + "grad_norm": 2.354419708251953, + "learning_rate": 1.7691084576539142e-05, + "loss": 0.0788, + "step": 23890 + }, + { + "epoch": 0.2986574664366609, + "grad_norm": 2.7727389335632324, + "learning_rate": 1.7690526795841773e-05, + "loss": 0.9589, + "step": 23892 + }, + { + "epoch": 0.29868246706167656, + "grad_norm": 5.388273239135742, + "learning_rate": 1.7689968956574434e-05, + "loss": 1.1878, + "step": 23894 + }, + { + "epoch": 0.29870746768669215, + "grad_norm": 3.9191391468048096, + "learning_rate": 1.768941105874138e-05, + "loss": 1.0543, + "step": 23896 + }, + { + "epoch": 0.2987324683117078, + "grad_norm": 2.8113720417022705, + "learning_rate": 1.7688853102346856e-05, + "loss": 0.3295, + "step": 23898 + }, + { + "epoch": 0.2987574689367234, + "grad_norm": 3.3654861450195312, + "learning_rate": 1.768829508739512e-05, + "loss": 1.2305, + "step": 23900 + }, + { + "epoch": 0.29878246956173904, + "grad_norm": 0.7836727499961853, + "learning_rate": 1.768773701389041e-05, + "loss": 0.0863, + "step": 23902 + }, + { + "epoch": 0.2988074701867547, + "grad_norm": 3.6064674854278564, + "learning_rate": 1.7687178881836988e-05, + "loss": 1.4015, + "step": 23904 + }, + { + "epoch": 0.2988324708117703, + "grad_norm": 3.4965298175811768, + "learning_rate": 1.7686620691239095e-05, + "loss": 1.0435, + "step": 23906 + }, + { + "epoch": 0.2988574714367859, + "grad_norm": 4.521432399749756, + "learning_rate": 1.7686062442100986e-05, + "loss": 0.4628, + "step": 23908 + }, + { + "epoch": 0.2988824720618015, + "grad_norm": 2.3937060832977295, + "learning_rate": 1.7685504134426913e-05, + "loss": 0.6089, + "step": 23910 + }, + { + "epoch": 0.29890747268681717, + "grad_norm": 1.7910866737365723, + "learning_rate": 1.7684945768221123e-05, + "loss": 0.1091, + "step": 23912 + }, + { + "epoch": 0.2989324733118328, + "grad_norm": 3.587297201156616, + "learning_rate": 1.768438734348788e-05, + "loss": 0.5239, + "step": 23914 + }, + { + "epoch": 0.2989574739368484, + "grad_norm": 12.900592803955078, + "learning_rate": 1.7683828860231423e-05, + "loss": 2.4343, + "step": 23916 + }, + { + "epoch": 0.29898247456186405, + "grad_norm": 5.538506507873535, + "learning_rate": 1.7683270318456018e-05, + "loss": 0.5788, + "step": 23918 + }, + { + "epoch": 0.29900747518687965, + "grad_norm": 6.6080732345581055, + "learning_rate": 1.768271171816591e-05, + "loss": 2.0693, + "step": 23920 + }, + { + "epoch": 0.2990324758118953, + "grad_norm": 9.154728889465332, + "learning_rate": 1.7682153059365354e-05, + "loss": 1.1834, + "step": 23922 + }, + { + "epoch": 0.29905747643691094, + "grad_norm": 3.8321499824523926, + "learning_rate": 1.768159434205861e-05, + "loss": 1.0577, + "step": 23924 + }, + { + "epoch": 0.29908247706192653, + "grad_norm": 0.022763323038816452, + "learning_rate": 1.768103556624993e-05, + "loss": 0.4089, + "step": 23926 + }, + { + "epoch": 0.2991074776869422, + "grad_norm": 2.5873639583587646, + "learning_rate": 1.7680476731943564e-05, + "loss": 0.4078, + "step": 23928 + }, + { + "epoch": 0.2991324783119578, + "grad_norm": 1.6457855701446533, + "learning_rate": 1.767991783914378e-05, + "loss": 0.7464, + "step": 23930 + }, + { + "epoch": 0.2991574789369734, + "grad_norm": 6.26596212387085, + "learning_rate": 1.7679358887854827e-05, + "loss": 1.7947, + "step": 23932 + }, + { + "epoch": 0.29918247956198907, + "grad_norm": 17.317474365234375, + "learning_rate": 1.767879987808096e-05, + "loss": 0.3673, + "step": 23934 + }, + { + "epoch": 0.29920748018700466, + "grad_norm": 3.1242141723632812, + "learning_rate": 1.7678240809826438e-05, + "loss": 1.6941, + "step": 23936 + }, + { + "epoch": 0.2992324808120203, + "grad_norm": 0.12836967408657074, + "learning_rate": 1.767768168309552e-05, + "loss": 0.969, + "step": 23938 + }, + { + "epoch": 0.2992574814370359, + "grad_norm": 1.2182382345199585, + "learning_rate": 1.7677122497892466e-05, + "loss": 0.1084, + "step": 23940 + }, + { + "epoch": 0.29928248206205155, + "grad_norm": 2.357170581817627, + "learning_rate": 1.7676563254221535e-05, + "loss": 0.3589, + "step": 23942 + }, + { + "epoch": 0.2993074826870672, + "grad_norm": 0.036354608833789825, + "learning_rate": 1.767600395208698e-05, + "loss": 0.5036, + "step": 23944 + }, + { + "epoch": 0.2993324833120828, + "grad_norm": 1.888810634613037, + "learning_rate": 1.7675444591493065e-05, + "loss": 0.1801, + "step": 23946 + }, + { + "epoch": 0.29935748393709843, + "grad_norm": 3.5059900283813477, + "learning_rate": 1.767488517244405e-05, + "loss": 0.7781, + "step": 23948 + }, + { + "epoch": 0.299382484562114, + "grad_norm": 5.068413257598877, + "learning_rate": 1.7674325694944193e-05, + "loss": 1.3932, + "step": 23950 + }, + { + "epoch": 0.2994074851871297, + "grad_norm": 11.823339462280273, + "learning_rate": 1.767376615899776e-05, + "loss": 0.3503, + "step": 23952 + }, + { + "epoch": 0.2994324858121453, + "grad_norm": 1.997764229774475, + "learning_rate": 1.7673206564609007e-05, + "loss": 0.9524, + "step": 23954 + }, + { + "epoch": 0.2994574864371609, + "grad_norm": 3.887878894805908, + "learning_rate": 1.7672646911782202e-05, + "loss": 0.935, + "step": 23956 + }, + { + "epoch": 0.29948248706217656, + "grad_norm": 4.503264904022217, + "learning_rate": 1.7672087200521593e-05, + "loss": 1.5081, + "step": 23958 + }, + { + "epoch": 0.29950748768719215, + "grad_norm": 4.183433532714844, + "learning_rate": 1.7671527430831463e-05, + "loss": 1.2137, + "step": 23960 + }, + { + "epoch": 0.2995324883122078, + "grad_norm": 4.954734802246094, + "learning_rate": 1.7670967602716062e-05, + "loss": 1.1423, + "step": 23962 + }, + { + "epoch": 0.29955748893722345, + "grad_norm": 1.2221790552139282, + "learning_rate": 1.7670407716179654e-05, + "loss": 0.5541, + "step": 23964 + }, + { + "epoch": 0.29958248956223904, + "grad_norm": 3.7451419830322266, + "learning_rate": 1.7669847771226507e-05, + "loss": 0.8605, + "step": 23966 + }, + { + "epoch": 0.2996074901872547, + "grad_norm": 3.0200817584991455, + "learning_rate": 1.7669287767860884e-05, + "loss": 1.2171, + "step": 23968 + }, + { + "epoch": 0.2996324908122703, + "grad_norm": 3.682285785675049, + "learning_rate": 1.766872770608705e-05, + "loss": 1.0372, + "step": 23970 + }, + { + "epoch": 0.2996574914372859, + "grad_norm": 2.757838010787964, + "learning_rate": 1.766816758590927e-05, + "loss": 1.3956, + "step": 23972 + }, + { + "epoch": 0.2996824920623016, + "grad_norm": 4.910162925720215, + "learning_rate": 1.766760740733181e-05, + "loss": 0.2048, + "step": 23974 + }, + { + "epoch": 0.29970749268731717, + "grad_norm": 3.2211403846740723, + "learning_rate": 1.766704717035894e-05, + "loss": 1.4408, + "step": 23976 + }, + { + "epoch": 0.2997324933123328, + "grad_norm": 0.023736612871289253, + "learning_rate": 1.7666486874994917e-05, + "loss": 0.2817, + "step": 23978 + }, + { + "epoch": 0.2997574939373484, + "grad_norm": 0.00799609161913395, + "learning_rate": 1.766592652124402e-05, + "loss": 0.0219, + "step": 23980 + }, + { + "epoch": 0.29978249456236405, + "grad_norm": 2.6685798168182373, + "learning_rate": 1.7665366109110507e-05, + "loss": 0.7201, + "step": 23982 + }, + { + "epoch": 0.2998074951873797, + "grad_norm": 0.18869075179100037, + "learning_rate": 1.766480563859865e-05, + "loss": 0.0095, + "step": 23984 + }, + { + "epoch": 0.2998324958123953, + "grad_norm": 3.92657470703125, + "learning_rate": 1.7664245109712718e-05, + "loss": 0.6437, + "step": 23986 + }, + { + "epoch": 0.29985749643741094, + "grad_norm": 2.074366807937622, + "learning_rate": 1.7663684522456977e-05, + "loss": 0.7695, + "step": 23988 + }, + { + "epoch": 0.29988249706242653, + "grad_norm": 2.174804925918579, + "learning_rate": 1.76631238768357e-05, + "loss": 0.9543, + "step": 23990 + }, + { + "epoch": 0.2999074976874422, + "grad_norm": 1.488101840019226, + "learning_rate": 1.7662563172853154e-05, + "loss": 0.5168, + "step": 23992 + }, + { + "epoch": 0.29993249831245783, + "grad_norm": 5.495787143707275, + "learning_rate": 1.7662002410513613e-05, + "loss": 1.5272, + "step": 23994 + }, + { + "epoch": 0.2999574989374734, + "grad_norm": 3.144733428955078, + "learning_rate": 1.7661441589821346e-05, + "loss": 0.8536, + "step": 23996 + }, + { + "epoch": 0.29998249956248907, + "grad_norm": 3.8325417041778564, + "learning_rate": 1.7660880710780617e-05, + "loss": 0.7065, + "step": 23998 + }, + { + "epoch": 0.30000750018750466, + "grad_norm": 3.307996988296509, + "learning_rate": 1.7660319773395707e-05, + "loss": 0.8358, + "step": 24000 + }, + { + "epoch": 0.3000325008125203, + "grad_norm": 1.7302541732788086, + "learning_rate": 1.7659758777670888e-05, + "loss": 1.0351, + "step": 24002 + }, + { + "epoch": 0.30005750143753596, + "grad_norm": 0.00828101672232151, + "learning_rate": 1.7659197723610425e-05, + "loss": 0.1825, + "step": 24004 + }, + { + "epoch": 0.30008250206255155, + "grad_norm": 2.9734690189361572, + "learning_rate": 1.7658636611218598e-05, + "loss": 0.9311, + "step": 24006 + }, + { + "epoch": 0.3001075026875672, + "grad_norm": 6.793642997741699, + "learning_rate": 1.765807544049968e-05, + "loss": 0.3739, + "step": 24008 + }, + { + "epoch": 0.3001325033125828, + "grad_norm": 3.61523175239563, + "learning_rate": 1.7657514211457936e-05, + "loss": 0.9262, + "step": 24010 + }, + { + "epoch": 0.30015750393759844, + "grad_norm": 0.49394047260284424, + "learning_rate": 1.765695292409765e-05, + "loss": 0.5278, + "step": 24012 + }, + { + "epoch": 0.3001825045626141, + "grad_norm": 3.0111753940582275, + "learning_rate": 1.7656391578423094e-05, + "loss": 0.6944, + "step": 24014 + }, + { + "epoch": 0.3002075051876297, + "grad_norm": 4.045135021209717, + "learning_rate": 1.765583017443854e-05, + "loss": 0.8021, + "step": 24016 + }, + { + "epoch": 0.3002325058126453, + "grad_norm": 0.003175413468852639, + "learning_rate": 1.765526871214827e-05, + "loss": 0.8106, + "step": 24018 + }, + { + "epoch": 0.3002575064376609, + "grad_norm": 2.3667237758636475, + "learning_rate": 1.7654707191556553e-05, + "loss": 1.1977, + "step": 24020 + }, + { + "epoch": 0.30028250706267656, + "grad_norm": 2.407500982284546, + "learning_rate": 1.765414561266767e-05, + "loss": 0.2215, + "step": 24022 + }, + { + "epoch": 0.3003075076876922, + "grad_norm": 5.042681694030762, + "learning_rate": 1.7653583975485894e-05, + "loss": 0.7967, + "step": 24024 + }, + { + "epoch": 0.3003325083127078, + "grad_norm": 2.882148027420044, + "learning_rate": 1.7653022280015508e-05, + "loss": 0.4494, + "step": 24026 + }, + { + "epoch": 0.30035750893772345, + "grad_norm": 3.188136339187622, + "learning_rate": 1.7652460526260785e-05, + "loss": 1.5547, + "step": 24028 + }, + { + "epoch": 0.30038250956273904, + "grad_norm": 2.8596060276031494, + "learning_rate": 1.7651898714226005e-05, + "loss": 0.6565, + "step": 24030 + }, + { + "epoch": 0.3004075101877547, + "grad_norm": 3.4181249141693115, + "learning_rate": 1.7651336843915446e-05, + "loss": 2.0311, + "step": 24032 + }, + { + "epoch": 0.30043251081277034, + "grad_norm": 2.1049273014068604, + "learning_rate": 1.7650774915333387e-05, + "loss": 0.4715, + "step": 24034 + }, + { + "epoch": 0.30045751143778593, + "grad_norm": 4.887292861938477, + "learning_rate": 1.765021292848411e-05, + "loss": 0.8049, + "step": 24036 + }, + { + "epoch": 0.3004825120628016, + "grad_norm": 6.790212631225586, + "learning_rate": 1.764965088337189e-05, + "loss": 2.0254, + "step": 24038 + }, + { + "epoch": 0.30050751268781717, + "grad_norm": 4.142544746398926, + "learning_rate": 1.7649088780001014e-05, + "loss": 0.5352, + "step": 24040 + }, + { + "epoch": 0.3005325133128328, + "grad_norm": 0.2963968515396118, + "learning_rate": 1.764852661837576e-05, + "loss": 0.453, + "step": 24042 + }, + { + "epoch": 0.30055751393784846, + "grad_norm": 0.005902244243770838, + "learning_rate": 1.7647964398500405e-05, + "loss": 0.3244, + "step": 24044 + }, + { + "epoch": 0.30058251456286406, + "grad_norm": 5.599947452545166, + "learning_rate": 1.7647402120379237e-05, + "loss": 1.6899, + "step": 24046 + }, + { + "epoch": 0.3006075151878797, + "grad_norm": 0.09129834175109863, + "learning_rate": 1.7646839784016534e-05, + "loss": 0.2134, + "step": 24048 + }, + { + "epoch": 0.3006325158128953, + "grad_norm": 0.5335970520973206, + "learning_rate": 1.7646277389416582e-05, + "loss": 0.0978, + "step": 24050 + }, + { + "epoch": 0.30065751643791094, + "grad_norm": 1.984819769859314, + "learning_rate": 1.7645714936583664e-05, + "loss": 0.5933, + "step": 24052 + }, + { + "epoch": 0.3006825170629266, + "grad_norm": 2.2373199462890625, + "learning_rate": 1.7645152425522062e-05, + "loss": 0.6549, + "step": 24054 + }, + { + "epoch": 0.3007075176879422, + "grad_norm": 1.8820215463638306, + "learning_rate": 1.764458985623606e-05, + "loss": 0.3075, + "step": 24056 + }, + { + "epoch": 0.30073251831295783, + "grad_norm": 2.84647274017334, + "learning_rate": 1.764402722872994e-05, + "loss": 0.4149, + "step": 24058 + }, + { + "epoch": 0.3007575189379734, + "grad_norm": 3.9812982082366943, + "learning_rate": 1.7643464543007994e-05, + "loss": 1.4232, + "step": 24060 + }, + { + "epoch": 0.30078251956298907, + "grad_norm": 6.0448455810546875, + "learning_rate": 1.76429017990745e-05, + "loss": 0.9845, + "step": 24062 + }, + { + "epoch": 0.3008075201880047, + "grad_norm": 0.00510397320613265, + "learning_rate": 1.7642338996933746e-05, + "loss": 0.0006, + "step": 24064 + }, + { + "epoch": 0.3008325208130203, + "grad_norm": 3.3148558139801025, + "learning_rate": 1.764177613659002e-05, + "loss": 1.9688, + "step": 24066 + }, + { + "epoch": 0.30085752143803596, + "grad_norm": 3.2233242988586426, + "learning_rate": 1.764121321804761e-05, + "loss": 0.7481, + "step": 24068 + }, + { + "epoch": 0.30088252206305155, + "grad_norm": 0.05774286761879921, + "learning_rate": 1.7640650241310798e-05, + "loss": 0.0007, + "step": 24070 + }, + { + "epoch": 0.3009075226880672, + "grad_norm": 5.1473588943481445, + "learning_rate": 1.7640087206383872e-05, + "loss": 1.142, + "step": 24072 + }, + { + "epoch": 0.30093252331308284, + "grad_norm": 3.427612543106079, + "learning_rate": 1.7639524113271126e-05, + "loss": 1.0199, + "step": 24074 + }, + { + "epoch": 0.30095752393809844, + "grad_norm": 3.7598206996917725, + "learning_rate": 1.7638960961976845e-05, + "loss": 1.6388, + "step": 24076 + }, + { + "epoch": 0.3009825245631141, + "grad_norm": 6.011173248291016, + "learning_rate": 1.7638397752505316e-05, + "loss": 0.967, + "step": 24078 + }, + { + "epoch": 0.3010075251881297, + "grad_norm": 0.006849600933492184, + "learning_rate": 1.763783448486083e-05, + "loss": 0.6121, + "step": 24080 + }, + { + "epoch": 0.3010325258131453, + "grad_norm": 3.378248691558838, + "learning_rate": 1.7637271159047677e-05, + "loss": 0.5963, + "step": 24082 + }, + { + "epoch": 0.30105752643816097, + "grad_norm": 0.0058514149859547615, + "learning_rate": 1.763670777507015e-05, + "loss": 0.0251, + "step": 24084 + }, + { + "epoch": 0.30108252706317656, + "grad_norm": 9.133893013000488, + "learning_rate": 1.763614433293253e-05, + "loss": 1.0555, + "step": 24086 + }, + { + "epoch": 0.3011075276881922, + "grad_norm": 2.219128131866455, + "learning_rate": 1.7635580832639116e-05, + "loss": 0.7388, + "step": 24088 + }, + { + "epoch": 0.3011325283132078, + "grad_norm": 3.27056622505188, + "learning_rate": 1.76350172741942e-05, + "loss": 0.5992, + "step": 24090 + }, + { + "epoch": 0.30115752893822345, + "grad_norm": 2.1389148235321045, + "learning_rate": 1.7634453657602075e-05, + "loss": 0.8355, + "step": 24092 + }, + { + "epoch": 0.3011825295632391, + "grad_norm": 4.945845603942871, + "learning_rate": 1.7633889982867027e-05, + "loss": 0.9436, + "step": 24094 + }, + { + "epoch": 0.3012075301882547, + "grad_norm": 4.056905269622803, + "learning_rate": 1.7633326249993352e-05, + "loss": 0.1795, + "step": 24096 + }, + { + "epoch": 0.30123253081327034, + "grad_norm": 3.262908458709717, + "learning_rate": 1.7632762458985346e-05, + "loss": 0.6739, + "step": 24098 + }, + { + "epoch": 0.30125753143828593, + "grad_norm": 2.2276406288146973, + "learning_rate": 1.7632198609847298e-05, + "loss": 0.818, + "step": 24100 + }, + { + "epoch": 0.3012825320633016, + "grad_norm": 3.525949239730835, + "learning_rate": 1.7631634702583508e-05, + "loss": 0.7229, + "step": 24102 + }, + { + "epoch": 0.3013075326883172, + "grad_norm": 3.1088695526123047, + "learning_rate": 1.7631070737198262e-05, + "loss": 0.6839, + "step": 24104 + }, + { + "epoch": 0.3013325333133328, + "grad_norm": 4.030834674835205, + "learning_rate": 1.7630506713695866e-05, + "loss": 1.6217, + "step": 24106 + }, + { + "epoch": 0.30135753393834847, + "grad_norm": 0.03028922900557518, + "learning_rate": 1.7629942632080607e-05, + "loss": 0.0254, + "step": 24108 + }, + { + "epoch": 0.30138253456336406, + "grad_norm": 0.0027444029692560434, + "learning_rate": 1.7629378492356783e-05, + "loss": 0.0258, + "step": 24110 + }, + { + "epoch": 0.3014075351883797, + "grad_norm": 4.572049140930176, + "learning_rate": 1.7628814294528695e-05, + "loss": 1.5315, + "step": 24112 + }, + { + "epoch": 0.30143253581339535, + "grad_norm": 0.005930814892053604, + "learning_rate": 1.7628250038600635e-05, + "loss": 0.9475, + "step": 24114 + }, + { + "epoch": 0.30145753643841094, + "grad_norm": 4.573971271514893, + "learning_rate": 1.76276857245769e-05, + "loss": 1.276, + "step": 24116 + }, + { + "epoch": 0.3014825370634266, + "grad_norm": 2.975588083267212, + "learning_rate": 1.762712135246179e-05, + "loss": 0.1448, + "step": 24118 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 2.720953941345215, + "learning_rate": 1.7626556922259598e-05, + "loss": 1.1167, + "step": 24120 + }, + { + "epoch": 0.30153253831345783, + "grad_norm": 3.3931005001068115, + "learning_rate": 1.7625992433974635e-05, + "loss": 0.5106, + "step": 24122 + }, + { + "epoch": 0.3015575389384735, + "grad_norm": 2.3648054599761963, + "learning_rate": 1.7625427887611185e-05, + "loss": 0.4309, + "step": 24124 + }, + { + "epoch": 0.30158253956348907, + "grad_norm": 6.7800164222717285, + "learning_rate": 1.762486328317356e-05, + "loss": 0.916, + "step": 24126 + }, + { + "epoch": 0.3016075401885047, + "grad_norm": 2.0344631671905518, + "learning_rate": 1.762429862066605e-05, + "loss": 1.1295, + "step": 24128 + }, + { + "epoch": 0.3016325408135203, + "grad_norm": 4.216212272644043, + "learning_rate": 1.7623733900092964e-05, + "loss": 1.6649, + "step": 24130 + }, + { + "epoch": 0.30165754143853596, + "grad_norm": 4.145557880401611, + "learning_rate": 1.7623169121458595e-05, + "loss": 1.1112, + "step": 24132 + }, + { + "epoch": 0.3016825420635516, + "grad_norm": 4.017470836639404, + "learning_rate": 1.762260428476725e-05, + "loss": 0.6791, + "step": 24134 + }, + { + "epoch": 0.3017075426885672, + "grad_norm": 4.879366874694824, + "learning_rate": 1.7622039390023227e-05, + "loss": 0.9934, + "step": 24136 + }, + { + "epoch": 0.30173254331358285, + "grad_norm": 4.1341633796691895, + "learning_rate": 1.762147443723083e-05, + "loss": 2.3973, + "step": 24138 + }, + { + "epoch": 0.30175754393859844, + "grad_norm": 3.374323844909668, + "learning_rate": 1.7620909426394364e-05, + "loss": 1.4667, + "step": 24140 + }, + { + "epoch": 0.3017825445636141, + "grad_norm": 3.4973959922790527, + "learning_rate": 1.7620344357518128e-05, + "loss": 0.7116, + "step": 24142 + }, + { + "epoch": 0.30180754518862973, + "grad_norm": 4.144270896911621, + "learning_rate": 1.7619779230606427e-05, + "loss": 1.1483, + "step": 24144 + }, + { + "epoch": 0.3018325458136453, + "grad_norm": 2.3223166465759277, + "learning_rate": 1.7619214045663567e-05, + "loss": 0.8527, + "step": 24146 + }, + { + "epoch": 0.301857546438661, + "grad_norm": 6.538242816925049, + "learning_rate": 1.7618648802693848e-05, + "loss": 1.4335, + "step": 24148 + }, + { + "epoch": 0.30188254706367657, + "grad_norm": 1.9137563705444336, + "learning_rate": 1.7618083501701575e-05, + "loss": 0.9078, + "step": 24150 + }, + { + "epoch": 0.3019075476886922, + "grad_norm": 0.004017021041363478, + "learning_rate": 1.761751814269106e-05, + "loss": 0.7908, + "step": 24152 + }, + { + "epoch": 0.30193254831370786, + "grad_norm": 3.5412256717681885, + "learning_rate": 1.76169527256666e-05, + "loss": 2.0737, + "step": 24154 + }, + { + "epoch": 0.30195754893872345, + "grad_norm": 5.1492085456848145, + "learning_rate": 1.7616387250632506e-05, + "loss": 0.9721, + "step": 24156 + }, + { + "epoch": 0.3019825495637391, + "grad_norm": 2.272874593734741, + "learning_rate": 1.7615821717593085e-05, + "loss": 0.8958, + "step": 24158 + }, + { + "epoch": 0.3020075501887547, + "grad_norm": 0.6520674228668213, + "learning_rate": 1.761525612655264e-05, + "loss": 0.0132, + "step": 24160 + }, + { + "epoch": 0.30203255081377034, + "grad_norm": 4.52031135559082, + "learning_rate": 1.7614690477515487e-05, + "loss": 1.994, + "step": 24162 + }, + { + "epoch": 0.302057551438786, + "grad_norm": 3.2926411628723145, + "learning_rate": 1.7614124770485923e-05, + "loss": 1.0021, + "step": 24164 + }, + { + "epoch": 0.3020825520638016, + "grad_norm": 2.5804052352905273, + "learning_rate": 1.7613559005468263e-05, + "loss": 0.1066, + "step": 24166 + }, + { + "epoch": 0.3021075526888172, + "grad_norm": 2.00551176071167, + "learning_rate": 1.7612993182466814e-05, + "loss": 0.0344, + "step": 24168 + }, + { + "epoch": 0.3021325533138328, + "grad_norm": 5.670490264892578, + "learning_rate": 1.7612427301485885e-05, + "loss": 0.6599, + "step": 24170 + }, + { + "epoch": 0.30215755393884847, + "grad_norm": 3.5645618438720703, + "learning_rate": 1.761186136252979e-05, + "loss": 0.1518, + "step": 24172 + }, + { + "epoch": 0.3021825545638641, + "grad_norm": 4.936675548553467, + "learning_rate": 1.761129536560283e-05, + "loss": 1.2595, + "step": 24174 + }, + { + "epoch": 0.3022075551888797, + "grad_norm": 6.158871650695801, + "learning_rate": 1.7610729310709324e-05, + "loss": 1.5045, + "step": 24176 + }, + { + "epoch": 0.30223255581389535, + "grad_norm": 5.664249897003174, + "learning_rate": 1.761016319785358e-05, + "loss": 0.8599, + "step": 24178 + }, + { + "epoch": 0.30225755643891095, + "grad_norm": 1.7968703508377075, + "learning_rate": 1.7609597027039907e-05, + "loss": 0.3924, + "step": 24180 + }, + { + "epoch": 0.3022825570639266, + "grad_norm": 2.229905843734741, + "learning_rate": 1.760903079827262e-05, + "loss": 0.6492, + "step": 24182 + }, + { + "epoch": 0.30230755768894224, + "grad_norm": 5.344062805175781, + "learning_rate": 1.7608464511556035e-05, + "loss": 1.1293, + "step": 24184 + }, + { + "epoch": 0.30233255831395783, + "grad_norm": 3.7756762504577637, + "learning_rate": 1.7607898166894455e-05, + "loss": 0.7556, + "step": 24186 + }, + { + "epoch": 0.3023575589389735, + "grad_norm": 4.6494903564453125, + "learning_rate": 1.76073317642922e-05, + "loss": 0.9614, + "step": 24188 + }, + { + "epoch": 0.3023825595639891, + "grad_norm": 5.971339702606201, + "learning_rate": 1.760676530375358e-05, + "loss": 2.1005, + "step": 24190 + }, + { + "epoch": 0.3024075601890047, + "grad_norm": 3.643158435821533, + "learning_rate": 1.7606198785282913e-05, + "loss": 0.8464, + "step": 24192 + }, + { + "epoch": 0.30243256081402037, + "grad_norm": 3.683577299118042, + "learning_rate": 1.7605632208884512e-05, + "loss": 0.5793, + "step": 24194 + }, + { + "epoch": 0.30245756143903596, + "grad_norm": 0.007036255672574043, + "learning_rate": 1.760506557456269e-05, + "loss": 0.7263, + "step": 24196 + }, + { + "epoch": 0.3024825620640516, + "grad_norm": 0.002608025446534157, + "learning_rate": 1.7604498882321765e-05, + "loss": 0.6177, + "step": 24198 + }, + { + "epoch": 0.3025075626890672, + "grad_norm": 3.1757428646087646, + "learning_rate": 1.760393213216605e-05, + "loss": 1.2562, + "step": 24200 + }, + { + "epoch": 0.30253256331408285, + "grad_norm": 2.1410741806030273, + "learning_rate": 1.7603365324099867e-05, + "loss": 0.709, + "step": 24202 + }, + { + "epoch": 0.3025575639390985, + "grad_norm": 0.005608287174254656, + "learning_rate": 1.760279845812753e-05, + "loss": 0.0112, + "step": 24204 + }, + { + "epoch": 0.3025825645641141, + "grad_norm": 0.004019756335765123, + "learning_rate": 1.760223153425335e-05, + "loss": 0.5732, + "step": 24206 + }, + { + "epoch": 0.30260756518912973, + "grad_norm": 4.841452598571777, + "learning_rate": 1.760166455248165e-05, + "loss": 2.2793, + "step": 24208 + }, + { + "epoch": 0.3026325658141453, + "grad_norm": 3.45935320854187, + "learning_rate": 1.760109751281675e-05, + "loss": 0.7069, + "step": 24210 + }, + { + "epoch": 0.302657566439161, + "grad_norm": 1.194451928138733, + "learning_rate": 1.7600530415262965e-05, + "loss": 1.0732, + "step": 24212 + }, + { + "epoch": 0.3026825670641766, + "grad_norm": 8.07232666015625, + "learning_rate": 1.759996325982462e-05, + "loss": 0.5481, + "step": 24214 + }, + { + "epoch": 0.3027075676891922, + "grad_norm": 2.850651264190674, + "learning_rate": 1.7599396046506024e-05, + "loss": 0.7989, + "step": 24216 + }, + { + "epoch": 0.30273256831420786, + "grad_norm": 1.213615894317627, + "learning_rate": 1.75988287753115e-05, + "loss": 0.3917, + "step": 24218 + }, + { + "epoch": 0.30275756893922345, + "grad_norm": 9.948662757873535, + "learning_rate": 1.7598261446245377e-05, + "loss": 0.6618, + "step": 24220 + }, + { + "epoch": 0.3027825695642391, + "grad_norm": 1.8811553716659546, + "learning_rate": 1.7597694059311967e-05, + "loss": 0.6064, + "step": 24222 + }, + { + "epoch": 0.30280757018925475, + "grad_norm": 4.939521789550781, + "learning_rate": 1.7597126614515596e-05, + "loss": 0.5606, + "step": 24224 + }, + { + "epoch": 0.30283257081427034, + "grad_norm": 10.244725227355957, + "learning_rate": 1.759655911186058e-05, + "loss": 0.3076, + "step": 24226 + }, + { + "epoch": 0.302857571439286, + "grad_norm": 2.115696430206299, + "learning_rate": 1.7595991551351243e-05, + "loss": 0.488, + "step": 24228 + }, + { + "epoch": 0.3028825720643016, + "grad_norm": 3.529694080352783, + "learning_rate": 1.7595423932991908e-05, + "loss": 1.0614, + "step": 24230 + }, + { + "epoch": 0.30290757268931723, + "grad_norm": 5.250219345092773, + "learning_rate": 1.75948562567869e-05, + "loss": 1.0721, + "step": 24232 + }, + { + "epoch": 0.3029325733143329, + "grad_norm": 0.09415969997644424, + "learning_rate": 1.7594288522740545e-05, + "loss": 0.2315, + "step": 24234 + }, + { + "epoch": 0.30295757393934847, + "grad_norm": 3.5157880783081055, + "learning_rate": 1.759372073085716e-05, + "loss": 0.6536, + "step": 24236 + }, + { + "epoch": 0.3029825745643641, + "grad_norm": 5.2028326988220215, + "learning_rate": 1.7593152881141068e-05, + "loss": 0.9243, + "step": 24238 + }, + { + "epoch": 0.3030075751893797, + "grad_norm": 3.7493178844451904, + "learning_rate": 1.7592584973596598e-05, + "loss": 1.1751, + "step": 24240 + }, + { + "epoch": 0.30303257581439536, + "grad_norm": 2.307805299758911, + "learning_rate": 1.7592017008228078e-05, + "loss": 1.2548, + "step": 24242 + }, + { + "epoch": 0.303057576439411, + "grad_norm": 3.6343870162963867, + "learning_rate": 1.7591448985039826e-05, + "loss": 1.5963, + "step": 24244 + }, + { + "epoch": 0.3030825770644266, + "grad_norm": 3.0393528938293457, + "learning_rate": 1.7590880904036177e-05, + "loss": 0.7059, + "step": 24246 + }, + { + "epoch": 0.30310757768944224, + "grad_norm": 0.001769835245795548, + "learning_rate": 1.759031276522145e-05, + "loss": 1.0907, + "step": 24248 + }, + { + "epoch": 0.30313257831445783, + "grad_norm": 7.407936096191406, + "learning_rate": 1.7589744568599972e-05, + "loss": 2.1377, + "step": 24250 + }, + { + "epoch": 0.3031575789394735, + "grad_norm": 3.371737003326416, + "learning_rate": 1.7589176314176076e-05, + "loss": 0.4261, + "step": 24252 + }, + { + "epoch": 0.30318257956448913, + "grad_norm": 5.211552619934082, + "learning_rate": 1.7588608001954086e-05, + "loss": 2.2836, + "step": 24254 + }, + { + "epoch": 0.3032075801895047, + "grad_norm": 6.035679340362549, + "learning_rate": 1.758803963193833e-05, + "loss": 1.968, + "step": 24256 + }, + { + "epoch": 0.30323258081452037, + "grad_norm": 1.2047882080078125, + "learning_rate": 1.758747120413314e-05, + "loss": 0.0662, + "step": 24258 + }, + { + "epoch": 0.30325758143953596, + "grad_norm": 8.811266899108887, + "learning_rate": 1.7586902718542838e-05, + "loss": 1.0748, + "step": 24260 + }, + { + "epoch": 0.3032825820645516, + "grad_norm": 0.0019143434474244714, + "learning_rate": 1.7586334175171758e-05, + "loss": 0.0156, + "step": 24262 + }, + { + "epoch": 0.30330758268956726, + "grad_norm": 3.809784412384033, + "learning_rate": 1.7585765574024228e-05, + "loss": 1.3764, + "step": 24264 + }, + { + "epoch": 0.30333258331458285, + "grad_norm": 4.208999156951904, + "learning_rate": 1.7585196915104585e-05, + "loss": 1.0003, + "step": 24266 + }, + { + "epoch": 0.3033575839395985, + "grad_norm": 0.0019854723941534758, + "learning_rate": 1.7584628198417154e-05, + "loss": 0.1938, + "step": 24268 + }, + { + "epoch": 0.3033825845646141, + "grad_norm": 0.027631765231490135, + "learning_rate": 1.7584059423966263e-05, + "loss": 0.6913, + "step": 24270 + }, + { + "epoch": 0.30340758518962974, + "grad_norm": 8.881589889526367, + "learning_rate": 1.7583490591756253e-05, + "loss": 0.6682, + "step": 24272 + }, + { + "epoch": 0.3034325858146454, + "grad_norm": 1.1408048868179321, + "learning_rate": 1.758292170179145e-05, + "loss": 0.7608, + "step": 24274 + }, + { + "epoch": 0.303457586439661, + "grad_norm": 4.672890663146973, + "learning_rate": 1.7582352754076185e-05, + "loss": 1.1572, + "step": 24276 + }, + { + "epoch": 0.3034825870646766, + "grad_norm": 4.962440013885498, + "learning_rate": 1.7581783748614795e-05, + "loss": 1.7415, + "step": 24278 + }, + { + "epoch": 0.3035075876896922, + "grad_norm": 0.6681368947029114, + "learning_rate": 1.7581214685411612e-05, + "loss": 0.1228, + "step": 24280 + }, + { + "epoch": 0.30353258831470786, + "grad_norm": 2.4618120193481445, + "learning_rate": 1.7580645564470974e-05, + "loss": 0.7514, + "step": 24282 + }, + { + "epoch": 0.3035575889397235, + "grad_norm": 3.032729387283325, + "learning_rate": 1.7580076385797204e-05, + "loss": 0.7135, + "step": 24284 + }, + { + "epoch": 0.3035825895647391, + "grad_norm": 3.675811767578125, + "learning_rate": 1.7579507149394653e-05, + "loss": 0.5086, + "step": 24286 + }, + { + "epoch": 0.30360759018975475, + "grad_norm": 3.3799219131469727, + "learning_rate": 1.757893785526764e-05, + "loss": 1.2499, + "step": 24288 + }, + { + "epoch": 0.30363259081477034, + "grad_norm": 3.2840707302093506, + "learning_rate": 1.757836850342051e-05, + "loss": 1.355, + "step": 24290 + }, + { + "epoch": 0.303657591439786, + "grad_norm": 7.049571990966797, + "learning_rate": 1.75777990938576e-05, + "loss": 2.084, + "step": 24292 + }, + { + "epoch": 0.30368259206480164, + "grad_norm": 4.381526470184326, + "learning_rate": 1.757722962658324e-05, + "loss": 1.6224, + "step": 24294 + }, + { + "epoch": 0.30370759268981723, + "grad_norm": 3.051515579223633, + "learning_rate": 1.7576660101601773e-05, + "loss": 0.4949, + "step": 24296 + }, + { + "epoch": 0.3037325933148329, + "grad_norm": 2.4106783866882324, + "learning_rate": 1.7576090518917532e-05, + "loss": 1.7212, + "step": 24298 + }, + { + "epoch": 0.30375759393984847, + "grad_norm": 2.7994585037231445, + "learning_rate": 1.7575520878534858e-05, + "loss": 1.0863, + "step": 24300 + }, + { + "epoch": 0.3037825945648641, + "grad_norm": 3.384472131729126, + "learning_rate": 1.757495118045809e-05, + "loss": 1.4038, + "step": 24302 + }, + { + "epoch": 0.30380759518987976, + "grad_norm": 0.8355507254600525, + "learning_rate": 1.7574381424691563e-05, + "loss": 0.0422, + "step": 24304 + }, + { + "epoch": 0.30383259581489536, + "grad_norm": 2.5010874271392822, + "learning_rate": 1.7573811611239616e-05, + "loss": 0.5382, + "step": 24306 + }, + { + "epoch": 0.303857596439911, + "grad_norm": 0.004606295842677355, + "learning_rate": 1.7573241740106593e-05, + "loss": 0.449, + "step": 24308 + }, + { + "epoch": 0.3038825970649266, + "grad_norm": 2.8292794227600098, + "learning_rate": 1.7572671811296837e-05, + "loss": 0.8425, + "step": 24310 + }, + { + "epoch": 0.30390759768994224, + "grad_norm": 2.2580766677856445, + "learning_rate": 1.7572101824814677e-05, + "loss": 0.3651, + "step": 24312 + }, + { + "epoch": 0.3039325983149579, + "grad_norm": 3.2826480865478516, + "learning_rate": 1.757153178066446e-05, + "loss": 2.0771, + "step": 24314 + }, + { + "epoch": 0.3039575989399735, + "grad_norm": 5.545421600341797, + "learning_rate": 1.7570961678850532e-05, + "loss": 1.0393, + "step": 24316 + }, + { + "epoch": 0.30398259956498913, + "grad_norm": 0.0816812515258789, + "learning_rate": 1.7570391519377228e-05, + "loss": 0.123, + "step": 24318 + }, + { + "epoch": 0.3040076001900047, + "grad_norm": 3.7328453063964844, + "learning_rate": 1.756982130224889e-05, + "loss": 1.5549, + "step": 24320 + }, + { + "epoch": 0.30403260081502037, + "grad_norm": 4.112720489501953, + "learning_rate": 1.756925102746987e-05, + "loss": 0.3437, + "step": 24322 + }, + { + "epoch": 0.304057601440036, + "grad_norm": 6.184789657592773, + "learning_rate": 1.7568680695044498e-05, + "loss": 1.1421, + "step": 24324 + }, + { + "epoch": 0.3040826020650516, + "grad_norm": 2.5453526973724365, + "learning_rate": 1.756811030497713e-05, + "loss": 0.4577, + "step": 24326 + }, + { + "epoch": 0.30410760269006726, + "grad_norm": 3.7356646060943604, + "learning_rate": 1.75675398572721e-05, + "loss": 0.9643, + "step": 24328 + }, + { + "epoch": 0.30413260331508285, + "grad_norm": 3.934523820877075, + "learning_rate": 1.7566969351933755e-05, + "loss": 0.5191, + "step": 24330 + }, + { + "epoch": 0.3041576039400985, + "grad_norm": 3.800724983215332, + "learning_rate": 1.7566398788966446e-05, + "loss": 1.5193, + "step": 24332 + }, + { + "epoch": 0.30418260456511415, + "grad_norm": 4.067652702331543, + "learning_rate": 1.7565828168374514e-05, + "loss": 0.7228, + "step": 24334 + }, + { + "epoch": 0.30420760519012974, + "grad_norm": 0.004329352173954248, + "learning_rate": 1.7565257490162298e-05, + "loss": 0.9681, + "step": 24336 + }, + { + "epoch": 0.3042326058151454, + "grad_norm": 0.8911432027816772, + "learning_rate": 1.756468675433416e-05, + "loss": 0.0292, + "step": 24338 + }, + { + "epoch": 0.304257606440161, + "grad_norm": 2.1098666191101074, + "learning_rate": 1.756411596089443e-05, + "loss": 0.3978, + "step": 24340 + }, + { + "epoch": 0.3042826070651766, + "grad_norm": 1.9468287229537964, + "learning_rate": 1.7563545109847466e-05, + "loss": 0.43, + "step": 24342 + }, + { + "epoch": 0.30430760769019227, + "grad_norm": 2.497701406478882, + "learning_rate": 1.756297420119761e-05, + "loss": 0.5477, + "step": 24344 + }, + { + "epoch": 0.30433260831520786, + "grad_norm": 1.7760204076766968, + "learning_rate": 1.7562403234949214e-05, + "loss": 0.9164, + "step": 24346 + }, + { + "epoch": 0.3043576089402235, + "grad_norm": 5.8436360359191895, + "learning_rate": 1.7561832211106624e-05, + "loss": 1.0929, + "step": 24348 + }, + { + "epoch": 0.3043826095652391, + "grad_norm": 5.943318843841553, + "learning_rate": 1.756126112967419e-05, + "loss": 0.3702, + "step": 24350 + }, + { + "epoch": 0.30440761019025475, + "grad_norm": 0.002327051479369402, + "learning_rate": 1.756068999065626e-05, + "loss": 0.8529, + "step": 24352 + }, + { + "epoch": 0.3044326108152704, + "grad_norm": 1.9349143505096436, + "learning_rate": 1.7560118794057183e-05, + "loss": 0.0428, + "step": 24354 + }, + { + "epoch": 0.304457611440286, + "grad_norm": 2.4570860862731934, + "learning_rate": 1.755954753988131e-05, + "loss": 0.5342, + "step": 24356 + }, + { + "epoch": 0.30448261206530164, + "grad_norm": 3.944540500640869, + "learning_rate": 1.7558976228132993e-05, + "loss": 1.0883, + "step": 24358 + }, + { + "epoch": 0.30450761269031723, + "grad_norm": 2.9205546379089355, + "learning_rate": 1.7558404858816584e-05, + "loss": 1.6878, + "step": 24360 + }, + { + "epoch": 0.3045326133153329, + "grad_norm": 2.660259962081909, + "learning_rate": 1.755783343193643e-05, + "loss": 0.803, + "step": 24362 + }, + { + "epoch": 0.3045576139403485, + "grad_norm": 0.0025293375365436077, + "learning_rate": 1.7557261947496885e-05, + "loss": 0.1798, + "step": 24364 + }, + { + "epoch": 0.3045826145653641, + "grad_norm": 3.048571825027466, + "learning_rate": 1.7556690405502302e-05, + "loss": 0.7932, + "step": 24366 + }, + { + "epoch": 0.30460761519037977, + "grad_norm": 3.3675308227539062, + "learning_rate": 1.7556118805957032e-05, + "loss": 1.56, + "step": 24368 + }, + { + "epoch": 0.30463261581539536, + "grad_norm": 3.079679250717163, + "learning_rate": 1.7555547148865434e-05, + "loss": 0.6321, + "step": 24370 + }, + { + "epoch": 0.304657616440411, + "grad_norm": 3.093707323074341, + "learning_rate": 1.7554975434231852e-05, + "loss": 1.3102, + "step": 24372 + }, + { + "epoch": 0.30468261706542665, + "grad_norm": 1.8179428577423096, + "learning_rate": 1.755440366206065e-05, + "loss": 0.7981, + "step": 24374 + }, + { + "epoch": 0.30470761769044225, + "grad_norm": 6.176287651062012, + "learning_rate": 1.7553831832356176e-05, + "loss": 1.8689, + "step": 24376 + }, + { + "epoch": 0.3047326183154579, + "grad_norm": 3.0515663623809814, + "learning_rate": 1.755325994512279e-05, + "loss": 0.6451, + "step": 24378 + }, + { + "epoch": 0.3047576189404735, + "grad_norm": 3.4623265266418457, + "learning_rate": 1.755268800036484e-05, + "loss": 0.2976, + "step": 24380 + }, + { + "epoch": 0.30478261956548913, + "grad_norm": 1.384185552597046, + "learning_rate": 1.7552115998086686e-05, + "loss": 0.0468, + "step": 24382 + }, + { + "epoch": 0.3048076201905048, + "grad_norm": 3.437356472015381, + "learning_rate": 1.7551543938292688e-05, + "loss": 0.9948, + "step": 24384 + }, + { + "epoch": 0.30483262081552037, + "grad_norm": 3.698521375656128, + "learning_rate": 1.7550971820987198e-05, + "loss": 1.3527, + "step": 24386 + }, + { + "epoch": 0.304857621440536, + "grad_norm": 3.4658796787261963, + "learning_rate": 1.7550399646174576e-05, + "loss": 0.6847, + "step": 24388 + }, + { + "epoch": 0.3048826220655516, + "grad_norm": 3.245643138885498, + "learning_rate": 1.7549827413859176e-05, + "loss": 1.3167, + "step": 24390 + }, + { + "epoch": 0.30490762269056726, + "grad_norm": 1.3415982723236084, + "learning_rate": 1.7549255124045358e-05, + "loss": 0.0749, + "step": 24392 + }, + { + "epoch": 0.3049326233155829, + "grad_norm": 6.457678318023682, + "learning_rate": 1.754868277673748e-05, + "loss": 1.8922, + "step": 24394 + }, + { + "epoch": 0.3049576239405985, + "grad_norm": 1.45488703250885, + "learning_rate": 1.7548110371939902e-05, + "loss": 0.6526, + "step": 24396 + }, + { + "epoch": 0.30498262456561415, + "grad_norm": 3.2701845169067383, + "learning_rate": 1.7547537909656985e-05, + "loss": 1.3261, + "step": 24398 + }, + { + "epoch": 0.30500762519062974, + "grad_norm": 1.2815531492233276, + "learning_rate": 1.7546965389893085e-05, + "loss": 0.148, + "step": 24400 + }, + { + "epoch": 0.3050326258156454, + "grad_norm": 0.7035471796989441, + "learning_rate": 1.754639281265256e-05, + "loss": 0.5879, + "step": 24402 + }, + { + "epoch": 0.30505762644066103, + "grad_norm": 3.296053886413574, + "learning_rate": 1.754582017793978e-05, + "loss": 1.0891, + "step": 24404 + }, + { + "epoch": 0.3050826270656766, + "grad_norm": 4.137001991271973, + "learning_rate": 1.75452474857591e-05, + "loss": 1.2569, + "step": 24406 + }, + { + "epoch": 0.3051076276906923, + "grad_norm": 0.0018637783359736204, + "learning_rate": 1.754467473611488e-05, + "loss": 0.0001, + "step": 24408 + }, + { + "epoch": 0.30513262831570787, + "grad_norm": 1.8836396932601929, + "learning_rate": 1.7544101929011482e-05, + "loss": 0.9342, + "step": 24410 + }, + { + "epoch": 0.3051576289407235, + "grad_norm": 2.743135929107666, + "learning_rate": 1.7543529064453276e-05, + "loss": 1.4572, + "step": 24412 + }, + { + "epoch": 0.30518262956573916, + "grad_norm": 3.095386505126953, + "learning_rate": 1.7542956142444617e-05, + "loss": 0.5375, + "step": 24414 + }, + { + "epoch": 0.30520763019075475, + "grad_norm": 5.012991428375244, + "learning_rate": 1.7542383162989868e-05, + "loss": 1.6885, + "step": 24416 + }, + { + "epoch": 0.3052326308157704, + "grad_norm": 0.8658956289291382, + "learning_rate": 1.75418101260934e-05, + "loss": 1.0084, + "step": 24418 + }, + { + "epoch": 0.305257631440786, + "grad_norm": 0.5550372004508972, + "learning_rate": 1.754123703175957e-05, + "loss": 0.4849, + "step": 24420 + }, + { + "epoch": 0.30528263206580164, + "grad_norm": 3.5169429779052734, + "learning_rate": 1.7540663879992746e-05, + "loss": 0.4787, + "step": 24422 + }, + { + "epoch": 0.3053076326908173, + "grad_norm": 0.018577758222818375, + "learning_rate": 1.754009067079729e-05, + "loss": 0.6618, + "step": 24424 + }, + { + "epoch": 0.3053326333158329, + "grad_norm": 2.165311813354492, + "learning_rate": 1.7539517404177573e-05, + "loss": 0.1881, + "step": 24426 + }, + { + "epoch": 0.3053576339408485, + "grad_norm": 2.426562786102295, + "learning_rate": 1.7538944080137956e-05, + "loss": 0.7834, + "step": 24428 + }, + { + "epoch": 0.3053826345658641, + "grad_norm": 5.309070110321045, + "learning_rate": 1.7538370698682807e-05, + "loss": 1.7765, + "step": 24430 + }, + { + "epoch": 0.30540763519087977, + "grad_norm": 1.1957677602767944, + "learning_rate": 1.7537797259816493e-05, + "loss": 0.9331, + "step": 24432 + }, + { + "epoch": 0.3054326358158954, + "grad_norm": 3.993208646774292, + "learning_rate": 1.753722376354338e-05, + "loss": 1.5592, + "step": 24434 + }, + { + "epoch": 0.305457636440911, + "grad_norm": 1.6310889720916748, + "learning_rate": 1.753665020986784e-05, + "loss": 0.4758, + "step": 24436 + }, + { + "epoch": 0.30548263706592665, + "grad_norm": 4.212790012359619, + "learning_rate": 1.7536076598794235e-05, + "loss": 1.4576, + "step": 24438 + }, + { + "epoch": 0.30550763769094225, + "grad_norm": 0.004405179060995579, + "learning_rate": 1.7535502930326936e-05, + "loss": 0.0272, + "step": 24440 + }, + { + "epoch": 0.3055326383159579, + "grad_norm": 0.0015212680445984006, + "learning_rate": 1.7534929204470314e-05, + "loss": 1.1947, + "step": 24442 + }, + { + "epoch": 0.30555763894097354, + "grad_norm": 0.02082994394004345, + "learning_rate": 1.7534355421228735e-05, + "loss": 0.0223, + "step": 24444 + }, + { + "epoch": 0.30558263956598913, + "grad_norm": 3.8207125663757324, + "learning_rate": 1.7533781580606573e-05, + "loss": 1.1629, + "step": 24446 + }, + { + "epoch": 0.3056076401910048, + "grad_norm": 3.8315906524658203, + "learning_rate": 1.753320768260819e-05, + "loss": 0.2412, + "step": 24448 + }, + { + "epoch": 0.3056326408160204, + "grad_norm": 3.2971067428588867, + "learning_rate": 1.7532633727237968e-05, + "loss": 1.5497, + "step": 24450 + }, + { + "epoch": 0.305657641441036, + "grad_norm": 2.067619562149048, + "learning_rate": 1.7532059714500272e-05, + "loss": 0.7425, + "step": 24452 + }, + { + "epoch": 0.30568264206605167, + "grad_norm": 3.6790075302124023, + "learning_rate": 1.7531485644399477e-05, + "loss": 1.3145, + "step": 24454 + }, + { + "epoch": 0.30570764269106726, + "grad_norm": 4.776839256286621, + "learning_rate": 1.7530911516939947e-05, + "loss": 0.6688, + "step": 24456 + }, + { + "epoch": 0.3057326433160829, + "grad_norm": 0.21388396620750427, + "learning_rate": 1.7530337332126062e-05, + "loss": 0.0024, + "step": 24458 + }, + { + "epoch": 0.3057576439410985, + "grad_norm": 4.046051502227783, + "learning_rate": 1.7529763089962192e-05, + "loss": 0.4007, + "step": 24460 + }, + { + "epoch": 0.30578264456611415, + "grad_norm": 0.0038521571550518274, + "learning_rate": 1.7529188790452717e-05, + "loss": 0.0798, + "step": 24462 + }, + { + "epoch": 0.3058076451911298, + "grad_norm": 2.475059986114502, + "learning_rate": 1.7528614433601995e-05, + "loss": 0.402, + "step": 24464 + }, + { + "epoch": 0.3058326458161454, + "grad_norm": 2.3735783100128174, + "learning_rate": 1.7528040019414417e-05, + "loss": 1.3588, + "step": 24466 + }, + { + "epoch": 0.30585764644116104, + "grad_norm": 2.2748475074768066, + "learning_rate": 1.752746554789435e-05, + "loss": 0.4882, + "step": 24468 + }, + { + "epoch": 0.3058826470661766, + "grad_norm": 2.4136462211608887, + "learning_rate": 1.7526891019046168e-05, + "loss": 0.9843, + "step": 24470 + }, + { + "epoch": 0.3059076476911923, + "grad_norm": 0.0023375863675028086, + "learning_rate": 1.752631643287425e-05, + "loss": 0.9761, + "step": 24472 + }, + { + "epoch": 0.3059326483162079, + "grad_norm": 5.077093601226807, + "learning_rate": 1.752574178938297e-05, + "loss": 0.8927, + "step": 24474 + }, + { + "epoch": 0.3059576489412235, + "grad_norm": 2.180659532546997, + "learning_rate": 1.7525167088576704e-05, + "loss": 0.1325, + "step": 24476 + }, + { + "epoch": 0.30598264956623916, + "grad_norm": 0.002087546978145838, + "learning_rate": 1.7524592330459828e-05, + "loss": 0.0424, + "step": 24478 + }, + { + "epoch": 0.30600765019125475, + "grad_norm": 2.704025983810425, + "learning_rate": 1.7524017515036723e-05, + "loss": 2.0086, + "step": 24480 + }, + { + "epoch": 0.3060326508162704, + "grad_norm": 3.721794366836548, + "learning_rate": 1.7523442642311766e-05, + "loss": 1.3479, + "step": 24482 + }, + { + "epoch": 0.30605765144128605, + "grad_norm": 3.9030375480651855, + "learning_rate": 1.7522867712289332e-05, + "loss": 0.9208, + "step": 24484 + }, + { + "epoch": 0.30608265206630164, + "grad_norm": 2.451772928237915, + "learning_rate": 1.75222927249738e-05, + "loss": 0.7716, + "step": 24486 + }, + { + "epoch": 0.3061076526913173, + "grad_norm": 2.2636501789093018, + "learning_rate": 1.7521717680369554e-05, + "loss": 0.3845, + "step": 24488 + }, + { + "epoch": 0.3061326533163329, + "grad_norm": 3.020089864730835, + "learning_rate": 1.7521142578480964e-05, + "loss": 0.7307, + "step": 24490 + }, + { + "epoch": 0.30615765394134853, + "grad_norm": 2.8565468788146973, + "learning_rate": 1.752056741931242e-05, + "loss": 0.5547, + "step": 24492 + }, + { + "epoch": 0.3061826545663642, + "grad_norm": 3.8054184913635254, + "learning_rate": 1.75199922028683e-05, + "loss": 1.7437, + "step": 24494 + }, + { + "epoch": 0.30620765519137977, + "grad_norm": 4.538657188415527, + "learning_rate": 1.751941692915298e-05, + "loss": 1.4871, + "step": 24496 + }, + { + "epoch": 0.3062326558163954, + "grad_norm": 3.9957985877990723, + "learning_rate": 1.751884159817084e-05, + "loss": 1.4085, + "step": 24498 + }, + { + "epoch": 0.306257656441411, + "grad_norm": 4.037264347076416, + "learning_rate": 1.7518266209926272e-05, + "loss": 0.8927, + "step": 24500 + }, + { + "epoch": 0.30628265706642666, + "grad_norm": 3.741830348968506, + "learning_rate": 1.751769076442365e-05, + "loss": 0.3899, + "step": 24502 + }, + { + "epoch": 0.3063076576914423, + "grad_norm": 3.085949420928955, + "learning_rate": 1.751711526166736e-05, + "loss": 0.9034, + "step": 24504 + }, + { + "epoch": 0.3063326583164579, + "grad_norm": 3.4979989528656006, + "learning_rate": 1.751653970166178e-05, + "loss": 1.0136, + "step": 24506 + }, + { + "epoch": 0.30635765894147354, + "grad_norm": 0.6055760383605957, + "learning_rate": 1.7515964084411297e-05, + "loss": 0.9558, + "step": 24508 + }, + { + "epoch": 0.30638265956648914, + "grad_norm": 0.0036926683969795704, + "learning_rate": 1.7515388409920293e-05, + "loss": 0.7387, + "step": 24510 + }, + { + "epoch": 0.3064076601915048, + "grad_norm": 2.9467713832855225, + "learning_rate": 1.7514812678193155e-05, + "loss": 0.8045, + "step": 24512 + }, + { + "epoch": 0.30643266081652043, + "grad_norm": 3.6300172805786133, + "learning_rate": 1.7514236889234268e-05, + "loss": 0.8954, + "step": 24514 + }, + { + "epoch": 0.306457661441536, + "grad_norm": 3.8991668224334717, + "learning_rate": 1.7513661043048015e-05, + "loss": 0.8034, + "step": 24516 + }, + { + "epoch": 0.30648266206655167, + "grad_norm": 3.5447144508361816, + "learning_rate": 1.751308513963878e-05, + "loss": 0.8877, + "step": 24518 + }, + { + "epoch": 0.30650766269156726, + "grad_norm": 2.83284330368042, + "learning_rate": 1.7512509179010954e-05, + "loss": 0.9775, + "step": 24520 + }, + { + "epoch": 0.3065326633165829, + "grad_norm": 4.200242042541504, + "learning_rate": 1.7511933161168917e-05, + "loss": 1.062, + "step": 24522 + }, + { + "epoch": 0.30655766394159856, + "grad_norm": 3.0317232608795166, + "learning_rate": 1.7511357086117063e-05, + "loss": 0.2484, + "step": 24524 + }, + { + "epoch": 0.30658266456661415, + "grad_norm": 6.724824905395508, + "learning_rate": 1.751078095385977e-05, + "loss": 1.7965, + "step": 24526 + }, + { + "epoch": 0.3066076651916298, + "grad_norm": 7.014170169830322, + "learning_rate": 1.7510204764401436e-05, + "loss": 0.698, + "step": 24528 + }, + { + "epoch": 0.3066326658166454, + "grad_norm": 3.9924967288970947, + "learning_rate": 1.7509628517746445e-05, + "loss": 0.6304, + "step": 24530 + }, + { + "epoch": 0.30665766644166104, + "grad_norm": 2.676133394241333, + "learning_rate": 1.7509052213899184e-05, + "loss": 0.9774, + "step": 24532 + }, + { + "epoch": 0.3066826670666767, + "grad_norm": 2.9512572288513184, + "learning_rate": 1.7508475852864042e-05, + "loss": 1.2612, + "step": 24534 + }, + { + "epoch": 0.3067076676916923, + "grad_norm": 4.756293296813965, + "learning_rate": 1.7507899434645412e-05, + "loss": 0.754, + "step": 24536 + }, + { + "epoch": 0.3067326683167079, + "grad_norm": 3.376819372177124, + "learning_rate": 1.750732295924768e-05, + "loss": 1.7048, + "step": 24538 + }, + { + "epoch": 0.3067576689417235, + "grad_norm": 0.0022202087566256523, + "learning_rate": 1.7506746426675238e-05, + "loss": 0.3779, + "step": 24540 + }, + { + "epoch": 0.30678266956673916, + "grad_norm": 3.6558210849761963, + "learning_rate": 1.750616983693248e-05, + "loss": 1.2537, + "step": 24542 + }, + { + "epoch": 0.3068076701917548, + "grad_norm": 0.0010038753971457481, + "learning_rate": 1.750559319002379e-05, + "loss": 0.4775, + "step": 24544 + }, + { + "epoch": 0.3068326708167704, + "grad_norm": 0.7093806862831116, + "learning_rate": 1.7505016485953566e-05, + "loss": 0.1398, + "step": 24546 + }, + { + "epoch": 0.30685767144178605, + "grad_norm": 5.602453231811523, + "learning_rate": 1.7504439724726195e-05, + "loss": 0.2776, + "step": 24548 + }, + { + "epoch": 0.30688267206680164, + "grad_norm": 0.035267122089862823, + "learning_rate": 1.7503862906346073e-05, + "loss": 0.7195, + "step": 24550 + }, + { + "epoch": 0.3069076726918173, + "grad_norm": 1.9321048259735107, + "learning_rate": 1.7503286030817594e-05, + "loss": 0.0869, + "step": 24552 + }, + { + "epoch": 0.30693267331683294, + "grad_norm": 1.905699610710144, + "learning_rate": 1.750270909814515e-05, + "loss": 0.7368, + "step": 24554 + }, + { + "epoch": 0.30695767394184853, + "grad_norm": 1.8810527324676514, + "learning_rate": 1.7502132108333133e-05, + "loss": 0.6163, + "step": 24556 + }, + { + "epoch": 0.3069826745668642, + "grad_norm": 3.7818756103515625, + "learning_rate": 1.7501555061385937e-05, + "loss": 0.9704, + "step": 24558 + }, + { + "epoch": 0.30700767519187977, + "grad_norm": 2.756782293319702, + "learning_rate": 1.750097795730796e-05, + "loss": 0.8586, + "step": 24560 + }, + { + "epoch": 0.3070326758168954, + "grad_norm": 2.340985059738159, + "learning_rate": 1.7500400796103596e-05, + "loss": 0.7071, + "step": 24562 + }, + { + "epoch": 0.30705767644191106, + "grad_norm": 1.413710355758667, + "learning_rate": 1.749982357777724e-05, + "loss": 0.8463, + "step": 24564 + }, + { + "epoch": 0.30708267706692666, + "grad_norm": 0.013890925794839859, + "learning_rate": 1.7499246302333292e-05, + "loss": 0.7207, + "step": 24566 + }, + { + "epoch": 0.3071076776919423, + "grad_norm": 7.849658966064453, + "learning_rate": 1.749866896977614e-05, + "loss": 1.1517, + "step": 24568 + }, + { + "epoch": 0.3071326783169579, + "grad_norm": 0.05215340107679367, + "learning_rate": 1.7498091580110185e-05, + "loss": 0.4657, + "step": 24570 + }, + { + "epoch": 0.30715767894197354, + "grad_norm": 7.668541431427002, + "learning_rate": 1.7497514133339826e-05, + "loss": 1.5414, + "step": 24572 + }, + { + "epoch": 0.3071826795669892, + "grad_norm": 11.013336181640625, + "learning_rate": 1.7496936629469462e-05, + "loss": 1.7193, + "step": 24574 + }, + { + "epoch": 0.3072076801920048, + "grad_norm": 0.018173405900597572, + "learning_rate": 1.7496359068503487e-05, + "loss": 0.7426, + "step": 24576 + }, + { + "epoch": 0.30723268081702043, + "grad_norm": 2.880828380584717, + "learning_rate": 1.74957814504463e-05, + "loss": 0.2448, + "step": 24578 + }, + { + "epoch": 0.307257681442036, + "grad_norm": 2.10377836227417, + "learning_rate": 1.7495203775302304e-05, + "loss": 0.4163, + "step": 24580 + }, + { + "epoch": 0.30728268206705167, + "grad_norm": 2.8846235275268555, + "learning_rate": 1.7494626043075892e-05, + "loss": 1.6643, + "step": 24582 + }, + { + "epoch": 0.3073076826920673, + "grad_norm": 2.2163617610931396, + "learning_rate": 1.7494048253771474e-05, + "loss": 0.3038, + "step": 24584 + }, + { + "epoch": 0.3073326833170829, + "grad_norm": 2.5452096462249756, + "learning_rate": 1.7493470407393438e-05, + "loss": 0.4937, + "step": 24586 + }, + { + "epoch": 0.30735768394209856, + "grad_norm": 8.518051147460938, + "learning_rate": 1.7492892503946194e-05, + "loss": 1.1123, + "step": 24588 + }, + { + "epoch": 0.30738268456711415, + "grad_norm": 5.118730545043945, + "learning_rate": 1.749231454343414e-05, + "loss": 1.2775, + "step": 24590 + }, + { + "epoch": 0.3074076851921298, + "grad_norm": 2.6996781826019287, + "learning_rate": 1.749173652586168e-05, + "loss": 1.124, + "step": 24592 + }, + { + "epoch": 0.30743268581714545, + "grad_norm": 0.005900751333683729, + "learning_rate": 1.7491158451233212e-05, + "loss": 0.0125, + "step": 24594 + }, + { + "epoch": 0.30745768644216104, + "grad_norm": 1.9456160068511963, + "learning_rate": 1.749058031955314e-05, + "loss": 0.3044, + "step": 24596 + }, + { + "epoch": 0.3074826870671767, + "grad_norm": 2.9003841876983643, + "learning_rate": 1.749000213082587e-05, + "loss": 0.1805, + "step": 24598 + }, + { + "epoch": 0.3075076876921923, + "grad_norm": 3.720611810684204, + "learning_rate": 1.74894238850558e-05, + "loss": 0.4359, + "step": 24600 + }, + { + "epoch": 0.3075326883172079, + "grad_norm": 7.1806254386901855, + "learning_rate": 1.7488845582247342e-05, + "loss": 1.5204, + "step": 24602 + }, + { + "epoch": 0.3075576889422236, + "grad_norm": 4.228293418884277, + "learning_rate": 1.7488267222404885e-05, + "loss": 0.9366, + "step": 24604 + }, + { + "epoch": 0.30758268956723916, + "grad_norm": 3.9218666553497314, + "learning_rate": 1.7487688805532852e-05, + "loss": 0.3022, + "step": 24606 + }, + { + "epoch": 0.3076076901922548, + "grad_norm": 5.619804382324219, + "learning_rate": 1.7487110331635637e-05, + "loss": 1.2315, + "step": 24608 + }, + { + "epoch": 0.3076326908172704, + "grad_norm": 2.704928159713745, + "learning_rate": 1.748653180071765e-05, + "loss": 1.336, + "step": 24610 + }, + { + "epoch": 0.30765769144228605, + "grad_norm": 3.6864962577819824, + "learning_rate": 1.7485953212783295e-05, + "loss": 1.3801, + "step": 24612 + }, + { + "epoch": 0.3076826920673017, + "grad_norm": 0.004362952429801226, + "learning_rate": 1.748537456783698e-05, + "loss": 0.7138, + "step": 24614 + }, + { + "epoch": 0.3077076926923173, + "grad_norm": 6.154554843902588, + "learning_rate": 1.748479586588311e-05, + "loss": 1.9174, + "step": 24616 + }, + { + "epoch": 0.30773269331733294, + "grad_norm": 0.004033442120999098, + "learning_rate": 1.7484217106926093e-05, + "loss": 0.6875, + "step": 24618 + }, + { + "epoch": 0.30775769394234853, + "grad_norm": 6.04164457321167, + "learning_rate": 1.7483638290970336e-05, + "loss": 1.2902, + "step": 24620 + }, + { + "epoch": 0.3077826945673642, + "grad_norm": 3.2964305877685547, + "learning_rate": 1.7483059418020246e-05, + "loss": 0.9723, + "step": 24622 + }, + { + "epoch": 0.3078076951923798, + "grad_norm": 1.649945855140686, + "learning_rate": 1.7482480488080238e-05, + "loss": 0.736, + "step": 24624 + }, + { + "epoch": 0.3078326958173954, + "grad_norm": 3.7465837001800537, + "learning_rate": 1.7481901501154713e-05, + "loss": 1.0946, + "step": 24626 + }, + { + "epoch": 0.30785769644241107, + "grad_norm": 2.4312798976898193, + "learning_rate": 1.7481322457248085e-05, + "loss": 0.6476, + "step": 24628 + }, + { + "epoch": 0.30788269706742666, + "grad_norm": 4.618589401245117, + "learning_rate": 1.748074335636477e-05, + "loss": 2.4191, + "step": 24630 + }, + { + "epoch": 0.3079076976924423, + "grad_norm": 4.443359375, + "learning_rate": 1.748016419850916e-05, + "loss": 0.9036, + "step": 24632 + }, + { + "epoch": 0.30793269831745795, + "grad_norm": 3.6177589893341064, + "learning_rate": 1.7479584983685688e-05, + "loss": 1.1388, + "step": 24634 + }, + { + "epoch": 0.30795769894247355, + "grad_norm": 0.029463661834597588, + "learning_rate": 1.7479005711898748e-05, + "loss": 0.3226, + "step": 24636 + }, + { + "epoch": 0.3079826995674892, + "grad_norm": 3.4761619567871094, + "learning_rate": 1.747842638315276e-05, + "loss": 1.0667, + "step": 24638 + }, + { + "epoch": 0.3080077001925048, + "grad_norm": 0.01166000310331583, + "learning_rate": 1.7477846997452134e-05, + "loss": 0.7414, + "step": 24640 + }, + { + "epoch": 0.30803270081752043, + "grad_norm": 0.006808142643421888, + "learning_rate": 1.747726755480128e-05, + "loss": 0.7907, + "step": 24642 + }, + { + "epoch": 0.3080577014425361, + "grad_norm": 0.0035453177988529205, + "learning_rate": 1.747668805520462e-05, + "loss": 0.4322, + "step": 24644 + }, + { + "epoch": 0.3080827020675517, + "grad_norm": 1.4736744165420532, + "learning_rate": 1.7476108498666557e-05, + "loss": 0.0164, + "step": 24646 + }, + { + "epoch": 0.3081077026925673, + "grad_norm": 2.4616756439208984, + "learning_rate": 1.747552888519151e-05, + "loss": 0.9421, + "step": 24648 + }, + { + "epoch": 0.3081327033175829, + "grad_norm": 4.22205924987793, + "learning_rate": 1.747494921478389e-05, + "loss": 1.6259, + "step": 24650 + }, + { + "epoch": 0.30815770394259856, + "grad_norm": 0.004463935270905495, + "learning_rate": 1.7474369487448117e-05, + "loss": 0.5307, + "step": 24652 + }, + { + "epoch": 0.3081827045676142, + "grad_norm": 4.1222028732299805, + "learning_rate": 1.74737897031886e-05, + "loss": 1.3974, + "step": 24654 + }, + { + "epoch": 0.3082077051926298, + "grad_norm": 0.47362640500068665, + "learning_rate": 1.747320986200976e-05, + "loss": 0.4787, + "step": 24656 + }, + { + "epoch": 0.30823270581764545, + "grad_norm": 3.9084813594818115, + "learning_rate": 1.7472629963916005e-05, + "loss": 1.8211, + "step": 24658 + }, + { + "epoch": 0.30825770644266104, + "grad_norm": 3.37499737739563, + "learning_rate": 1.7472050008911764e-05, + "loss": 0.9718, + "step": 24660 + }, + { + "epoch": 0.3082827070676767, + "grad_norm": 1.3965373039245605, + "learning_rate": 1.747146999700144e-05, + "loss": 0.9537, + "step": 24662 + }, + { + "epoch": 0.30830770769269233, + "grad_norm": 3.577524185180664, + "learning_rate": 1.7470889928189467e-05, + "loss": 1.5807, + "step": 24664 + }, + { + "epoch": 0.3083327083177079, + "grad_norm": 0.011731976643204689, + "learning_rate": 1.7470309802480243e-05, + "loss": 0.8257, + "step": 24666 + }, + { + "epoch": 0.3083577089427236, + "grad_norm": 4.308596611022949, + "learning_rate": 1.7469729619878202e-05, + "loss": 1.1163, + "step": 24668 + }, + { + "epoch": 0.30838270956773917, + "grad_norm": 4.112128257751465, + "learning_rate": 1.746914938038775e-05, + "loss": 2.3884, + "step": 24670 + }, + { + "epoch": 0.3084077101927548, + "grad_norm": 0.19461749494075775, + "learning_rate": 1.7468569084013318e-05, + "loss": 0.5698, + "step": 24672 + }, + { + "epoch": 0.30843271081777046, + "grad_norm": 8.606801986694336, + "learning_rate": 1.746798873075932e-05, + "loss": 0.5609, + "step": 24674 + }, + { + "epoch": 0.30845771144278605, + "grad_norm": 9.938130378723145, + "learning_rate": 1.7467408320630173e-05, + "loss": 1.2892, + "step": 24676 + }, + { + "epoch": 0.3084827120678017, + "grad_norm": 2.7209019660949707, + "learning_rate": 1.7466827853630305e-05, + "loss": 0.7004, + "step": 24678 + }, + { + "epoch": 0.3085077126928173, + "grad_norm": 3.016791343688965, + "learning_rate": 1.7466247329764127e-05, + "loss": 0.5303, + "step": 24680 + }, + { + "epoch": 0.30853271331783294, + "grad_norm": 1.33864426612854, + "learning_rate": 1.7465666749036068e-05, + "loss": 0.204, + "step": 24682 + }, + { + "epoch": 0.3085577139428486, + "grad_norm": 0.039140745997428894, + "learning_rate": 1.7465086111450547e-05, + "loss": 0.843, + "step": 24684 + }, + { + "epoch": 0.3085827145678642, + "grad_norm": 1.1074916124343872, + "learning_rate": 1.746450541701198e-05, + "loss": 0.1036, + "step": 24686 + }, + { + "epoch": 0.30860771519287983, + "grad_norm": 1.8745747804641724, + "learning_rate": 1.7463924665724802e-05, + "loss": 0.4869, + "step": 24688 + }, + { + "epoch": 0.3086327158178954, + "grad_norm": 1.270039677619934, + "learning_rate": 1.746334385759343e-05, + "loss": 0.6869, + "step": 24690 + }, + { + "epoch": 0.30865771644291107, + "grad_norm": 4.959958553314209, + "learning_rate": 1.7462762992622282e-05, + "loss": 1.4117, + "step": 24692 + }, + { + "epoch": 0.3086827170679267, + "grad_norm": 2.671029567718506, + "learning_rate": 1.746218207081579e-05, + "loss": 1.8499, + "step": 24694 + }, + { + "epoch": 0.3087077176929423, + "grad_norm": 2.900498628616333, + "learning_rate": 1.7461601092178372e-05, + "loss": 1.0323, + "step": 24696 + }, + { + "epoch": 0.30873271831795795, + "grad_norm": 5.2135396003723145, + "learning_rate": 1.7461020056714455e-05, + "loss": 1.5815, + "step": 24698 + }, + { + "epoch": 0.30875771894297355, + "grad_norm": 3.727144956588745, + "learning_rate": 1.7460438964428465e-05, + "loss": 1.1185, + "step": 24700 + }, + { + "epoch": 0.3087827195679892, + "grad_norm": 2.753469467163086, + "learning_rate": 1.745985781532483e-05, + "loss": 0.5658, + "step": 24702 + }, + { + "epoch": 0.30880772019300484, + "grad_norm": 4.0054612159729, + "learning_rate": 1.745927660940797e-05, + "loss": 0.4206, + "step": 24704 + }, + { + "epoch": 0.30883272081802043, + "grad_norm": 3.619926691055298, + "learning_rate": 1.745869534668231e-05, + "loss": 0.816, + "step": 24706 + }, + { + "epoch": 0.3088577214430361, + "grad_norm": 2.571458101272583, + "learning_rate": 1.7458114027152288e-05, + "loss": 1.3551, + "step": 24708 + }, + { + "epoch": 0.3088827220680517, + "grad_norm": 1.385530710220337, + "learning_rate": 1.745753265082232e-05, + "loss": 0.7191, + "step": 24710 + }, + { + "epoch": 0.3089077226930673, + "grad_norm": 3.0135793685913086, + "learning_rate": 1.7456951217696837e-05, + "loss": 1.0485, + "step": 24712 + }, + { + "epoch": 0.30893272331808297, + "grad_norm": 3.7610225677490234, + "learning_rate": 1.7456369727780273e-05, + "loss": 0.867, + "step": 24714 + }, + { + "epoch": 0.30895772394309856, + "grad_norm": 12.7775297164917, + "learning_rate": 1.745578818107705e-05, + "loss": 0.9118, + "step": 24716 + }, + { + "epoch": 0.3089827245681142, + "grad_norm": 3.2254247665405273, + "learning_rate": 1.7455206577591596e-05, + "loss": 0.4281, + "step": 24718 + }, + { + "epoch": 0.3090077251931298, + "grad_norm": 2.7091598510742188, + "learning_rate": 1.7454624917328343e-05, + "loss": 0.5686, + "step": 24720 + }, + { + "epoch": 0.30903272581814545, + "grad_norm": 0.021132251247763634, + "learning_rate": 1.7454043200291725e-05, + "loss": 0.9131, + "step": 24722 + }, + { + "epoch": 0.3090577264431611, + "grad_norm": 5.074082374572754, + "learning_rate": 1.7453461426486166e-05, + "loss": 0.9956, + "step": 24724 + }, + { + "epoch": 0.3090827270681767, + "grad_norm": 1.3281675577163696, + "learning_rate": 1.7452879595916097e-05, + "loss": 0.2563, + "step": 24726 + }, + { + "epoch": 0.30910772769319234, + "grad_norm": 3.9197371006011963, + "learning_rate": 1.745229770858595e-05, + "loss": 1.4075, + "step": 24728 + }, + { + "epoch": 0.3091327283182079, + "grad_norm": 4.5993852615356445, + "learning_rate": 1.745171576450016e-05, + "loss": 1.2805, + "step": 24730 + }, + { + "epoch": 0.3091577289432236, + "grad_norm": 4.755119800567627, + "learning_rate": 1.7451133763663157e-05, + "loss": 1.8229, + "step": 24732 + }, + { + "epoch": 0.3091827295682392, + "grad_norm": 4.027383327484131, + "learning_rate": 1.745055170607937e-05, + "loss": 0.932, + "step": 24734 + }, + { + "epoch": 0.3092077301932548, + "grad_norm": 3.8318355083465576, + "learning_rate": 1.744996959175324e-05, + "loss": 0.2463, + "step": 24736 + }, + { + "epoch": 0.30923273081827046, + "grad_norm": 3.1525020599365234, + "learning_rate": 1.7449387420689193e-05, + "loss": 0.5808, + "step": 24738 + }, + { + "epoch": 0.30925773144328605, + "grad_norm": 6.076506614685059, + "learning_rate": 1.7448805192891664e-05, + "loss": 0.4539, + "step": 24740 + }, + { + "epoch": 0.3092827320683017, + "grad_norm": 5.8792724609375, + "learning_rate": 1.7448222908365085e-05, + "loss": 1.49, + "step": 24742 + }, + { + "epoch": 0.30930773269331735, + "grad_norm": 2.3332648277282715, + "learning_rate": 1.7447640567113896e-05, + "loss": 1.2603, + "step": 24744 + }, + { + "epoch": 0.30933273331833294, + "grad_norm": 2.8626439571380615, + "learning_rate": 1.744705816914253e-05, + "loss": 2.012, + "step": 24746 + }, + { + "epoch": 0.3093577339433486, + "grad_norm": 2.679321050643921, + "learning_rate": 1.7446475714455423e-05, + "loss": 2.1168, + "step": 24748 + }, + { + "epoch": 0.3093827345683642, + "grad_norm": 3.8983123302459717, + "learning_rate": 1.744589320305701e-05, + "loss": 1.1236, + "step": 24750 + }, + { + "epoch": 0.30940773519337983, + "grad_norm": 10.95604133605957, + "learning_rate": 1.7445310634951728e-05, + "loss": 1.6918, + "step": 24752 + }, + { + "epoch": 0.3094327358183955, + "grad_norm": 1.7805944681167603, + "learning_rate": 1.7444728010144008e-05, + "loss": 0.7144, + "step": 24754 + }, + { + "epoch": 0.30945773644341107, + "grad_norm": 0.009540130384266376, + "learning_rate": 1.7444145328638295e-05, + "loss": 0.2491, + "step": 24756 + }, + { + "epoch": 0.3094827370684267, + "grad_norm": 5.187952041625977, + "learning_rate": 1.7443562590439027e-05, + "loss": 1.6338, + "step": 24758 + }, + { + "epoch": 0.3095077376934423, + "grad_norm": 3.5805399417877197, + "learning_rate": 1.7442979795550635e-05, + "loss": 0.9972, + "step": 24760 + }, + { + "epoch": 0.30953273831845796, + "grad_norm": 2.938236951828003, + "learning_rate": 1.7442396943977565e-05, + "loss": 0.669, + "step": 24762 + }, + { + "epoch": 0.3095577389434736, + "grad_norm": 3.880053758621216, + "learning_rate": 1.744181403572425e-05, + "loss": 0.9447, + "step": 24764 + }, + { + "epoch": 0.3095827395684892, + "grad_norm": 1.677634596824646, + "learning_rate": 1.7441231070795132e-05, + "loss": 1.1401, + "step": 24766 + }, + { + "epoch": 0.30960774019350484, + "grad_norm": 1.471751093864441, + "learning_rate": 1.7440648049194652e-05, + "loss": 0.4774, + "step": 24768 + }, + { + "epoch": 0.30963274081852044, + "grad_norm": 4.472496032714844, + "learning_rate": 1.7440064970927248e-05, + "loss": 0.7661, + "step": 24770 + }, + { + "epoch": 0.3096577414435361, + "grad_norm": 0.4276988208293915, + "learning_rate": 1.7439481835997357e-05, + "loss": 0.0216, + "step": 24772 + }, + { + "epoch": 0.30968274206855173, + "grad_norm": 1.9085084199905396, + "learning_rate": 1.7438898644409427e-05, + "loss": 0.2813, + "step": 24774 + }, + { + "epoch": 0.3097077426935673, + "grad_norm": 6.098444938659668, + "learning_rate": 1.7438315396167897e-05, + "loss": 1.1595, + "step": 24776 + }, + { + "epoch": 0.30973274331858297, + "grad_norm": 3.5092928409576416, + "learning_rate": 1.743773209127721e-05, + "loss": 1.7397, + "step": 24778 + }, + { + "epoch": 0.30975774394359856, + "grad_norm": 1.101592779159546, + "learning_rate": 1.743714872974181e-05, + "loss": 0.6168, + "step": 24780 + }, + { + "epoch": 0.3097827445686142, + "grad_norm": 3.288580894470215, + "learning_rate": 1.743656531156613e-05, + "loss": 1.0932, + "step": 24782 + }, + { + "epoch": 0.30980774519362986, + "grad_norm": 3.683122396469116, + "learning_rate": 1.7435981836754624e-05, + "loss": 0.3942, + "step": 24784 + }, + { + "epoch": 0.30983274581864545, + "grad_norm": 3.543675422668457, + "learning_rate": 1.7435398305311735e-05, + "loss": 1.3989, + "step": 24786 + }, + { + "epoch": 0.3098577464436611, + "grad_norm": 5.177966117858887, + "learning_rate": 1.74348147172419e-05, + "loss": 1.3728, + "step": 24788 + }, + { + "epoch": 0.3098827470686767, + "grad_norm": 1.9389429092407227, + "learning_rate": 1.7434231072549566e-05, + "loss": 0.696, + "step": 24790 + }, + { + "epoch": 0.30990774769369234, + "grad_norm": 5.146157264709473, + "learning_rate": 1.7433647371239183e-05, + "loss": 0.8481, + "step": 24792 + }, + { + "epoch": 0.309932748318708, + "grad_norm": 1.9027518033981323, + "learning_rate": 1.743306361331519e-05, + "loss": 1.1184, + "step": 24794 + }, + { + "epoch": 0.3099577489437236, + "grad_norm": 5.478270530700684, + "learning_rate": 1.743247979878204e-05, + "loss": 0.5977, + "step": 24796 + }, + { + "epoch": 0.3099827495687392, + "grad_norm": 5.172145843505859, + "learning_rate": 1.743189592764417e-05, + "loss": 1.7668, + "step": 24798 + }, + { + "epoch": 0.3100077501937548, + "grad_norm": 0.1348402202129364, + "learning_rate": 1.7431311999906033e-05, + "loss": 0.0053, + "step": 24800 + }, + { + "epoch": 0.31003275081877046, + "grad_norm": 2.4399309158325195, + "learning_rate": 1.7430728015572076e-05, + "loss": 1.5457, + "step": 24802 + }, + { + "epoch": 0.3100577514437861, + "grad_norm": 3.538095712661743, + "learning_rate": 1.7430143974646743e-05, + "loss": 1.3508, + "step": 24804 + }, + { + "epoch": 0.3100827520688017, + "grad_norm": 2.188420057296753, + "learning_rate": 1.7429559877134487e-05, + "loss": 1.4806, + "step": 24806 + }, + { + "epoch": 0.31010775269381735, + "grad_norm": 5.262379169464111, + "learning_rate": 1.7428975723039752e-05, + "loss": 0.7825, + "step": 24808 + }, + { + "epoch": 0.31013275331883294, + "grad_norm": 3.99247670173645, + "learning_rate": 1.742839151236699e-05, + "loss": 1.1277, + "step": 24810 + }, + { + "epoch": 0.3101577539438486, + "grad_norm": 0.5707498788833618, + "learning_rate": 1.7427807245120644e-05, + "loss": 0.5427, + "step": 24812 + }, + { + "epoch": 0.31018275456886424, + "grad_norm": 2.8647522926330566, + "learning_rate": 1.7427222921305175e-05, + "loss": 0.9878, + "step": 24814 + }, + { + "epoch": 0.31020775519387983, + "grad_norm": 0.625601589679718, + "learning_rate": 1.7426638540925022e-05, + "loss": 0.0139, + "step": 24816 + }, + { + "epoch": 0.3102327558188955, + "grad_norm": 3.6835622787475586, + "learning_rate": 1.742605410398464e-05, + "loss": 0.5694, + "step": 24818 + }, + { + "epoch": 0.31025775644391107, + "grad_norm": 3.0460290908813477, + "learning_rate": 1.7425469610488483e-05, + "loss": 0.9809, + "step": 24820 + }, + { + "epoch": 0.3102827570689267, + "grad_norm": 3.2081944942474365, + "learning_rate": 1.7424885060441e-05, + "loss": 1.3079, + "step": 24822 + }, + { + "epoch": 0.31030775769394237, + "grad_norm": 7.240373134613037, + "learning_rate": 1.7424300453846638e-05, + "loss": 1.6522, + "step": 24824 + }, + { + "epoch": 0.31033275831895796, + "grad_norm": 0.5881249308586121, + "learning_rate": 1.7423715790709856e-05, + "loss": 0.0142, + "step": 24826 + }, + { + "epoch": 0.3103577589439736, + "grad_norm": 0.7517539262771606, + "learning_rate": 1.7423131071035104e-05, + "loss": 1.1298, + "step": 24828 + }, + { + "epoch": 0.3103827595689892, + "grad_norm": 0.07048332691192627, + "learning_rate": 1.742254629482684e-05, + "loss": 0.4164, + "step": 24830 + }, + { + "epoch": 0.31040776019400484, + "grad_norm": 3.448775053024292, + "learning_rate": 1.7421961462089507e-05, + "loss": 1.8322, + "step": 24832 + }, + { + "epoch": 0.3104327608190205, + "grad_norm": 4.120838165283203, + "learning_rate": 1.7421376572827565e-05, + "loss": 1.2772, + "step": 24834 + }, + { + "epoch": 0.3104577614440361, + "grad_norm": 4.412872791290283, + "learning_rate": 1.7420791627045473e-05, + "loss": 1.7047, + "step": 24836 + }, + { + "epoch": 0.31048276206905173, + "grad_norm": 6.519939422607422, + "learning_rate": 1.7420206624747676e-05, + "loss": 0.7477, + "step": 24838 + }, + { + "epoch": 0.3105077626940673, + "grad_norm": 5.658886909484863, + "learning_rate": 1.741962156593864e-05, + "loss": 1.3707, + "step": 24840 + }, + { + "epoch": 0.31053276331908297, + "grad_norm": 1.23649001121521, + "learning_rate": 1.7419036450622814e-05, + "loss": 0.1782, + "step": 24842 + }, + { + "epoch": 0.3105577639440986, + "grad_norm": 5.226031303405762, + "learning_rate": 1.7418451278804654e-05, + "loss": 1.5646, + "step": 24844 + }, + { + "epoch": 0.3105827645691142, + "grad_norm": 0.00823410414159298, + "learning_rate": 1.7417866050488617e-05, + "loss": 0.6017, + "step": 24846 + }, + { + "epoch": 0.31060776519412986, + "grad_norm": 5.378289699554443, + "learning_rate": 1.741728076567916e-05, + "loss": 1.254, + "step": 24848 + }, + { + "epoch": 0.31063276581914545, + "grad_norm": 6.615922927856445, + "learning_rate": 1.7416695424380747e-05, + "loss": 1.0213, + "step": 24850 + }, + { + "epoch": 0.3106577664441611, + "grad_norm": 4.948137283325195, + "learning_rate": 1.741611002659783e-05, + "loss": 0.5404, + "step": 24852 + }, + { + "epoch": 0.31068276706917675, + "grad_norm": 4.269230365753174, + "learning_rate": 1.7415524572334862e-05, + "loss": 1.6663, + "step": 24854 + }, + { + "epoch": 0.31070776769419234, + "grad_norm": 3.7002077102661133, + "learning_rate": 1.7414939061596312e-05, + "loss": 1.9568, + "step": 24856 + }, + { + "epoch": 0.310732768319208, + "grad_norm": 0.009430034086108208, + "learning_rate": 1.7414353494386635e-05, + "loss": 0.6604, + "step": 24858 + }, + { + "epoch": 0.3107577689442236, + "grad_norm": 10.567651748657227, + "learning_rate": 1.741376787071029e-05, + "loss": 2.3749, + "step": 24860 + }, + { + "epoch": 0.3107827695692392, + "grad_norm": 2.488600969314575, + "learning_rate": 1.7413182190571736e-05, + "loss": 1.0692, + "step": 24862 + }, + { + "epoch": 0.3108077701942549, + "grad_norm": 2.9643447399139404, + "learning_rate": 1.7412596453975435e-05, + "loss": 1.6431, + "step": 24864 + }, + { + "epoch": 0.31083277081927047, + "grad_norm": 2.307011842727661, + "learning_rate": 1.7412010660925846e-05, + "loss": 1.1854, + "step": 24866 + }, + { + "epoch": 0.3108577714442861, + "grad_norm": 4.3841233253479, + "learning_rate": 1.7411424811427436e-05, + "loss": 0.5864, + "step": 24868 + }, + { + "epoch": 0.3108827720693017, + "grad_norm": 6.3725972175598145, + "learning_rate": 1.7410838905484658e-05, + "loss": 1.1985, + "step": 24870 + }, + { + "epoch": 0.31090777269431735, + "grad_norm": 2.564518690109253, + "learning_rate": 1.741025294310198e-05, + "loss": 0.4431, + "step": 24872 + }, + { + "epoch": 0.310932773319333, + "grad_norm": 2.983551025390625, + "learning_rate": 1.7409666924283867e-05, + "loss": 1.5294, + "step": 24874 + }, + { + "epoch": 0.3109577739443486, + "grad_norm": 0.8497034907341003, + "learning_rate": 1.7409080849034775e-05, + "loss": 1.3695, + "step": 24876 + }, + { + "epoch": 0.31098277456936424, + "grad_norm": 4.310133934020996, + "learning_rate": 1.7408494717359174e-05, + "loss": 0.4409, + "step": 24878 + }, + { + "epoch": 0.31100777519437983, + "grad_norm": 3.7402408123016357, + "learning_rate": 1.7407908529261522e-05, + "loss": 0.7081, + "step": 24880 + }, + { + "epoch": 0.3110327758193955, + "grad_norm": 2.004539966583252, + "learning_rate": 1.7407322284746285e-05, + "loss": 0.3268, + "step": 24882 + }, + { + "epoch": 0.3110577764444111, + "grad_norm": 3.9382693767547607, + "learning_rate": 1.7406735983817933e-05, + "loss": 1.5013, + "step": 24884 + }, + { + "epoch": 0.3110827770694267, + "grad_norm": 3.6243982315063477, + "learning_rate": 1.740614962648093e-05, + "loss": 0.6034, + "step": 24886 + }, + { + "epoch": 0.31110777769444237, + "grad_norm": 0.00618675397709012, + "learning_rate": 1.740556321273973e-05, + "loss": 0.3963, + "step": 24888 + }, + { + "epoch": 0.31113277831945796, + "grad_norm": 3.8736069202423096, + "learning_rate": 1.7404976742598814e-05, + "loss": 0.3379, + "step": 24890 + }, + { + "epoch": 0.3111577789444736, + "grad_norm": 0.3390183746814728, + "learning_rate": 1.740439021606264e-05, + "loss": 0.0264, + "step": 24892 + }, + { + "epoch": 0.31118277956948925, + "grad_norm": 3.39593505859375, + "learning_rate": 1.7403803633135677e-05, + "loss": 0.6806, + "step": 24894 + }, + { + "epoch": 0.31120778019450485, + "grad_norm": 4.43414306640625, + "learning_rate": 1.7403216993822395e-05, + "loss": 0.68, + "step": 24896 + }, + { + "epoch": 0.3112327808195205, + "grad_norm": 4.314140319824219, + "learning_rate": 1.7402630298127253e-05, + "loss": 0.8613, + "step": 24898 + }, + { + "epoch": 0.3112577814445361, + "grad_norm": 0.35721641778945923, + "learning_rate": 1.7402043546054733e-05, + "loss": 1.0454, + "step": 24900 + }, + { + "epoch": 0.31128278206955173, + "grad_norm": 3.90120267868042, + "learning_rate": 1.7401456737609293e-05, + "loss": 1.6758, + "step": 24902 + }, + { + "epoch": 0.3113077826945674, + "grad_norm": 4.05564022064209, + "learning_rate": 1.7400869872795403e-05, + "loss": 1.0342, + "step": 24904 + }, + { + "epoch": 0.311332783319583, + "grad_norm": 1.5919402837753296, + "learning_rate": 1.740028295161754e-05, + "loss": 0.2894, + "step": 24906 + }, + { + "epoch": 0.3113577839445986, + "grad_norm": 3.847844362258911, + "learning_rate": 1.7399695974080163e-05, + "loss": 0.6827, + "step": 24908 + }, + { + "epoch": 0.3113827845696142, + "grad_norm": 2.118669271469116, + "learning_rate": 1.739910894018775e-05, + "loss": 1.0955, + "step": 24910 + }, + { + "epoch": 0.31140778519462986, + "grad_norm": 3.4054412841796875, + "learning_rate": 1.7398521849944772e-05, + "loss": 0.7137, + "step": 24912 + }, + { + "epoch": 0.3114327858196455, + "grad_norm": 1.3228678703308105, + "learning_rate": 1.7397934703355693e-05, + "loss": 0.469, + "step": 24914 + }, + { + "epoch": 0.3114577864446611, + "grad_norm": 3.473375082015991, + "learning_rate": 1.7397347500424994e-05, + "loss": 2.1961, + "step": 24916 + }, + { + "epoch": 0.31148278706967675, + "grad_norm": 0.005585675127804279, + "learning_rate": 1.739676024115714e-05, + "loss": 0.266, + "step": 24918 + }, + { + "epoch": 0.31150778769469234, + "grad_norm": 4.261851787567139, + "learning_rate": 1.7396172925556606e-05, + "loss": 1.1108, + "step": 24920 + }, + { + "epoch": 0.311532788319708, + "grad_norm": 1.382239580154419, + "learning_rate": 1.739558555362787e-05, + "loss": 0.1928, + "step": 24922 + }, + { + "epoch": 0.31155778894472363, + "grad_norm": 3.3764922618865967, + "learning_rate": 1.739499812537539e-05, + "loss": 0.837, + "step": 24924 + }, + { + "epoch": 0.3115827895697392, + "grad_norm": 5.305788040161133, + "learning_rate": 1.7394410640803655e-05, + "loss": 1.6655, + "step": 24926 + }, + { + "epoch": 0.3116077901947549, + "grad_norm": 3.1464555263519287, + "learning_rate": 1.7393823099917137e-05, + "loss": 1.402, + "step": 24928 + }, + { + "epoch": 0.31163279081977047, + "grad_norm": 0.006871512625366449, + "learning_rate": 1.7393235502720306e-05, + "loss": 0.784, + "step": 24930 + }, + { + "epoch": 0.3116577914447861, + "grad_norm": 1.0085065364837646, + "learning_rate": 1.7392647849217637e-05, + "loss": 0.1736, + "step": 24932 + }, + { + "epoch": 0.31168279206980176, + "grad_norm": 0.008590847253799438, + "learning_rate": 1.739206013941361e-05, + "loss": 0.0003, + "step": 24934 + }, + { + "epoch": 0.31170779269481735, + "grad_norm": 2.00567889213562, + "learning_rate": 1.73914723733127e-05, + "loss": 1.0052, + "step": 24936 + }, + { + "epoch": 0.311732793319833, + "grad_norm": 0.0031706816516816616, + "learning_rate": 1.739088455091938e-05, + "loss": 1.0146, + "step": 24938 + }, + { + "epoch": 0.3117577939448486, + "grad_norm": 6.248863220214844, + "learning_rate": 1.739029667223812e-05, + "loss": 0.754, + "step": 24940 + }, + { + "epoch": 0.31178279456986424, + "grad_norm": 5.6949639320373535, + "learning_rate": 1.7389708737273418e-05, + "loss": 0.2255, + "step": 24942 + }, + { + "epoch": 0.3118077951948799, + "grad_norm": 0.003995893988758326, + "learning_rate": 1.7389120746029734e-05, + "loss": 0.4164, + "step": 24944 + }, + { + "epoch": 0.3118327958198955, + "grad_norm": 3.6316065788269043, + "learning_rate": 1.738853269851155e-05, + "loss": 0.8183, + "step": 24946 + }, + { + "epoch": 0.31185779644491113, + "grad_norm": 0.00694018742069602, + "learning_rate": 1.738794459472335e-05, + "loss": 0.5253, + "step": 24948 + }, + { + "epoch": 0.3118827970699267, + "grad_norm": 3.8284592628479004, + "learning_rate": 1.7387356434669606e-05, + "loss": 1.5685, + "step": 24950 + }, + { + "epoch": 0.31190779769494237, + "grad_norm": 0.004616195801645517, + "learning_rate": 1.73867682183548e-05, + "loss": 0.5738, + "step": 24952 + }, + { + "epoch": 0.311932798319958, + "grad_norm": 3.749128818511963, + "learning_rate": 1.738617994578341e-05, + "loss": 1.8375, + "step": 24954 + }, + { + "epoch": 0.3119577989449736, + "grad_norm": 9.508438110351562, + "learning_rate": 1.7385591616959924e-05, + "loss": 0.3186, + "step": 24956 + }, + { + "epoch": 0.31198279956998926, + "grad_norm": 6.622244358062744, + "learning_rate": 1.738500323188881e-05, + "loss": 0.7195, + "step": 24958 + }, + { + "epoch": 0.31200780019500485, + "grad_norm": 4.388070583343506, + "learning_rate": 1.738441479057456e-05, + "loss": 1.0904, + "step": 24960 + }, + { + "epoch": 0.3120328008200205, + "grad_norm": 6.104118824005127, + "learning_rate": 1.738382629302165e-05, + "loss": 0.6002, + "step": 24962 + }, + { + "epoch": 0.31205780144503614, + "grad_norm": 3.5200493335723877, + "learning_rate": 1.7383237739234562e-05, + "loss": 1.3513, + "step": 24964 + }, + { + "epoch": 0.31208280207005173, + "grad_norm": 3.3286473751068115, + "learning_rate": 1.738264912921778e-05, + "loss": 2.2466, + "step": 24966 + }, + { + "epoch": 0.3121078026950674, + "grad_norm": 3.785137414932251, + "learning_rate": 1.7382060462975786e-05, + "loss": 1.0688, + "step": 24968 + }, + { + "epoch": 0.312132803320083, + "grad_norm": 0.0033211070112884045, + "learning_rate": 1.7381471740513066e-05, + "loss": 0.0012, + "step": 24970 + }, + { + "epoch": 0.3121578039450986, + "grad_norm": 2.7475452423095703, + "learning_rate": 1.7380882961834096e-05, + "loss": 1.8754, + "step": 24972 + }, + { + "epoch": 0.31218280457011427, + "grad_norm": 2.2649943828582764, + "learning_rate": 1.738029412694337e-05, + "loss": 0.7385, + "step": 24974 + }, + { + "epoch": 0.31220780519512986, + "grad_norm": 6.2333292961120605, + "learning_rate": 1.7379705235845366e-05, + "loss": 1.2324, + "step": 24976 + }, + { + "epoch": 0.3122328058201455, + "grad_norm": 4.979231834411621, + "learning_rate": 1.737911628854457e-05, + "loss": 0.5936, + "step": 24978 + }, + { + "epoch": 0.3122578064451611, + "grad_norm": 1.9685851335525513, + "learning_rate": 1.737852728504547e-05, + "loss": 0.6356, + "step": 24980 + }, + { + "epoch": 0.31228280707017675, + "grad_norm": 3.6204934120178223, + "learning_rate": 1.7377938225352545e-05, + "loss": 0.9566, + "step": 24982 + }, + { + "epoch": 0.3123078076951924, + "grad_norm": 5.764997959136963, + "learning_rate": 1.7377349109470292e-05, + "loss": 0.5916, + "step": 24984 + }, + { + "epoch": 0.312332808320208, + "grad_norm": 1.748085856437683, + "learning_rate": 1.7376759937403184e-05, + "loss": 0.5795, + "step": 24986 + }, + { + "epoch": 0.31235780894522364, + "grad_norm": 1.9017764329910278, + "learning_rate": 1.7376170709155722e-05, + "loss": 1.1861, + "step": 24988 + }, + { + "epoch": 0.31238280957023923, + "grad_norm": 3.273444175720215, + "learning_rate": 1.7375581424732384e-05, + "loss": 0.9749, + "step": 24990 + }, + { + "epoch": 0.3124078101952549, + "grad_norm": 4.796011447906494, + "learning_rate": 1.737499208413766e-05, + "loss": 0.36, + "step": 24992 + }, + { + "epoch": 0.3124328108202705, + "grad_norm": 0.4964343011379242, + "learning_rate": 1.7374402687376042e-05, + "loss": 0.0141, + "step": 24994 + }, + { + "epoch": 0.3124578114452861, + "grad_norm": 1.9591679573059082, + "learning_rate": 1.7373813234452017e-05, + "loss": 0.8697, + "step": 24996 + }, + { + "epoch": 0.31248281207030176, + "grad_norm": 4.448511600494385, + "learning_rate": 1.737322372537007e-05, + "loss": 0.8003, + "step": 24998 + }, + { + "epoch": 0.3125078126953174, + "grad_norm": 0.006009866949170828, + "learning_rate": 1.7372634160134692e-05, + "loss": 0.0608, + "step": 25000 + }, + { + "epoch": 0.312532813320333, + "grad_norm": 2.703174591064453, + "learning_rate": 1.7372044538750376e-05, + "loss": 1.0569, + "step": 25002 + }, + { + "epoch": 0.31255781394534865, + "grad_norm": 0.0031298429239541292, + "learning_rate": 1.7371454861221612e-05, + "loss": 1.4055, + "step": 25004 + }, + { + "epoch": 0.31258281457036424, + "grad_norm": 6.109205722808838, + "learning_rate": 1.7370865127552893e-05, + "loss": 0.724, + "step": 25006 + }, + { + "epoch": 0.3126078151953799, + "grad_norm": 0.0026832951698452234, + "learning_rate": 1.7370275337748706e-05, + "loss": 0.0714, + "step": 25008 + }, + { + "epoch": 0.31263281582039554, + "grad_norm": 3.8315982818603516, + "learning_rate": 1.7369685491813543e-05, + "loss": 1.3276, + "step": 25010 + }, + { + "epoch": 0.31265781644541113, + "grad_norm": 3.3466010093688965, + "learning_rate": 1.7369095589751896e-05, + "loss": 0.7285, + "step": 25012 + }, + { + "epoch": 0.3126828170704268, + "grad_norm": 3.696438789367676, + "learning_rate": 1.7368505631568262e-05, + "loss": 0.5877, + "step": 25014 + }, + { + "epoch": 0.31270781769544237, + "grad_norm": 4.353429317474365, + "learning_rate": 1.736791561726713e-05, + "loss": 0.298, + "step": 25016 + }, + { + "epoch": 0.312732818320458, + "grad_norm": 3.2040059566497803, + "learning_rate": 1.7367325546852996e-05, + "loss": 1.4742, + "step": 25018 + }, + { + "epoch": 0.31275781894547366, + "grad_norm": 5.234970569610596, + "learning_rate": 1.7366735420330352e-05, + "loss": 2.3054, + "step": 25020 + }, + { + "epoch": 0.31278281957048926, + "grad_norm": 4.521873950958252, + "learning_rate": 1.736614523770369e-05, + "loss": 1.066, + "step": 25022 + }, + { + "epoch": 0.3128078201955049, + "grad_norm": 2.939687728881836, + "learning_rate": 1.7365554998977508e-05, + "loss": 1.3193, + "step": 25024 + }, + { + "epoch": 0.3128328208205205, + "grad_norm": 0.046762123703956604, + "learning_rate": 1.7364964704156303e-05, + "loss": 0.0013, + "step": 25026 + }, + { + "epoch": 0.31285782144553614, + "grad_norm": 4.884642124176025, + "learning_rate": 1.736437435324457e-05, + "loss": 1.8774, + "step": 25028 + }, + { + "epoch": 0.3128828220705518, + "grad_norm": 7.763941764831543, + "learning_rate": 1.73637839462468e-05, + "loss": 1.0638, + "step": 25030 + }, + { + "epoch": 0.3129078226955674, + "grad_norm": 1.6852636337280273, + "learning_rate": 1.7363193483167493e-05, + "loss": 0.3021, + "step": 25032 + }, + { + "epoch": 0.31293282332058303, + "grad_norm": 3.1765894889831543, + "learning_rate": 1.7362602964011146e-05, + "loss": 1.5913, + "step": 25034 + }, + { + "epoch": 0.3129578239455986, + "grad_norm": 4.777217388153076, + "learning_rate": 1.7362012388782258e-05, + "loss": 0.1652, + "step": 25036 + }, + { + "epoch": 0.31298282457061427, + "grad_norm": 3.2888665199279785, + "learning_rate": 1.7361421757485322e-05, + "loss": 1.2658, + "step": 25038 + }, + { + "epoch": 0.3130078251956299, + "grad_norm": 4.524282455444336, + "learning_rate": 1.736083107012484e-05, + "loss": 2.0346, + "step": 25040 + }, + { + "epoch": 0.3130328258206455, + "grad_norm": 3.7655580043792725, + "learning_rate": 1.736024032670531e-05, + "loss": 1.0495, + "step": 25042 + }, + { + "epoch": 0.31305782644566116, + "grad_norm": 3.9000256061553955, + "learning_rate": 1.735964952723123e-05, + "loss": 1.7352, + "step": 25044 + }, + { + "epoch": 0.31308282707067675, + "grad_norm": 3.469438314437866, + "learning_rate": 1.7359058671707098e-05, + "loss": 0.765, + "step": 25046 + }, + { + "epoch": 0.3131078276956924, + "grad_norm": 2.7366557121276855, + "learning_rate": 1.735846776013742e-05, + "loss": 0.5527, + "step": 25048 + }, + { + "epoch": 0.31313282832070805, + "grad_norm": 7.629113674163818, + "learning_rate": 1.7357876792526692e-05, + "loss": 1.173, + "step": 25050 + }, + { + "epoch": 0.31315782894572364, + "grad_norm": 3.3805480003356934, + "learning_rate": 1.735728576887941e-05, + "loss": 0.4012, + "step": 25052 + }, + { + "epoch": 0.3131828295707393, + "grad_norm": 0.008097236044704914, + "learning_rate": 1.7356694689200083e-05, + "loss": 0.0554, + "step": 25054 + }, + { + "epoch": 0.3132078301957549, + "grad_norm": 5.263744831085205, + "learning_rate": 1.735610355349321e-05, + "loss": 1.3357, + "step": 25056 + }, + { + "epoch": 0.3132328308207705, + "grad_norm": 4.349169731140137, + "learning_rate": 1.7355512361763292e-05, + "loss": 1.5177, + "step": 25058 + }, + { + "epoch": 0.3132578314457862, + "grad_norm": 3.0554897785186768, + "learning_rate": 1.7354921114014832e-05, + "loss": 1.3853, + "step": 25060 + }, + { + "epoch": 0.31328283207080176, + "grad_norm": 3.4415507316589355, + "learning_rate": 1.7354329810252334e-05, + "loss": 1.0526, + "step": 25062 + }, + { + "epoch": 0.3133078326958174, + "grad_norm": 0.003706506686285138, + "learning_rate": 1.7353738450480297e-05, + "loss": 0.3951, + "step": 25064 + }, + { + "epoch": 0.313332833320833, + "grad_norm": 2.2639150619506836, + "learning_rate": 1.7353147034703228e-05, + "loss": 1.5323, + "step": 25066 + }, + { + "epoch": 0.31335783394584865, + "grad_norm": 1.9711229801177979, + "learning_rate": 1.7352555562925636e-05, + "loss": 0.4211, + "step": 25068 + }, + { + "epoch": 0.3133828345708643, + "grad_norm": 3.654682159423828, + "learning_rate": 1.7351964035152017e-05, + "loss": 1.5448, + "step": 25070 + }, + { + "epoch": 0.3134078351958799, + "grad_norm": 0.002408065367490053, + "learning_rate": 1.7351372451386878e-05, + "loss": 0.0001, + "step": 25072 + }, + { + "epoch": 0.31343283582089554, + "grad_norm": 0.06347613036632538, + "learning_rate": 1.7350780811634727e-05, + "loss": 0.0015, + "step": 25074 + }, + { + "epoch": 0.31345783644591113, + "grad_norm": 4.131280899047852, + "learning_rate": 1.7350189115900072e-05, + "loss": 0.7155, + "step": 25076 + }, + { + "epoch": 0.3134828370709268, + "grad_norm": 4.006975173950195, + "learning_rate": 1.7349597364187415e-05, + "loss": 1.1987, + "step": 25078 + }, + { + "epoch": 0.3135078376959424, + "grad_norm": 8.02575969696045, + "learning_rate": 1.7349005556501263e-05, + "loss": 1.0306, + "step": 25080 + }, + { + "epoch": 0.313532838320958, + "grad_norm": 3.42149019241333, + "learning_rate": 1.734841369284612e-05, + "loss": 1.764, + "step": 25082 + }, + { + "epoch": 0.31355783894597367, + "grad_norm": 4.211935043334961, + "learning_rate": 1.7347821773226502e-05, + "loss": 1.2282, + "step": 25084 + }, + { + "epoch": 0.31358283957098926, + "grad_norm": 3.2704789638519287, + "learning_rate": 1.7347229797646913e-05, + "loss": 0.6815, + "step": 25086 + }, + { + "epoch": 0.3136078401960049, + "grad_norm": 5.299287796020508, + "learning_rate": 1.734663776611186e-05, + "loss": 1.2445, + "step": 25088 + }, + { + "epoch": 0.31363284082102055, + "grad_norm": 0.5838820934295654, + "learning_rate": 1.7346045678625853e-05, + "loss": 0.2933, + "step": 25090 + }, + { + "epoch": 0.31365784144603615, + "grad_norm": 4.5665998458862305, + "learning_rate": 1.73454535351934e-05, + "loss": 1.5927, + "step": 25092 + }, + { + "epoch": 0.3136828420710518, + "grad_norm": 1.6165472269058228, + "learning_rate": 1.734486133581901e-05, + "loss": 1.3461, + "step": 25094 + }, + { + "epoch": 0.3137078426960674, + "grad_norm": 2.74613618850708, + "learning_rate": 1.7344269080507196e-05, + "loss": 0.1997, + "step": 25096 + }, + { + "epoch": 0.31373284332108303, + "grad_norm": 1.4928542375564575, + "learning_rate": 1.734367676926247e-05, + "loss": 0.4355, + "step": 25098 + }, + { + "epoch": 0.3137578439460987, + "grad_norm": 15.613968849182129, + "learning_rate": 1.7343084402089335e-05, + "loss": 0.4792, + "step": 25100 + }, + { + "epoch": 0.31378284457111427, + "grad_norm": 11.035283088684082, + "learning_rate": 1.734249197899231e-05, + "loss": 0.9404, + "step": 25102 + }, + { + "epoch": 0.3138078451961299, + "grad_norm": 2.4638097286224365, + "learning_rate": 1.7341899499975906e-05, + "loss": 0.9058, + "step": 25104 + }, + { + "epoch": 0.3138328458211455, + "grad_norm": 3.416483163833618, + "learning_rate": 1.7341306965044633e-05, + "loss": 0.8802, + "step": 25106 + }, + { + "epoch": 0.31385784644616116, + "grad_norm": 3.8490512371063232, + "learning_rate": 1.7340714374203e-05, + "loss": 1.6396, + "step": 25108 + }, + { + "epoch": 0.3138828470711768, + "grad_norm": 1.3960667848587036, + "learning_rate": 1.734012172745553e-05, + "loss": 1.5751, + "step": 25110 + }, + { + "epoch": 0.3139078476961924, + "grad_norm": 8.997833251953125, + "learning_rate": 1.733952902480673e-05, + "loss": 1.5813, + "step": 25112 + }, + { + "epoch": 0.31393284832120805, + "grad_norm": 2.507065773010254, + "learning_rate": 1.7338936266261113e-05, + "loss": 1.3101, + "step": 25114 + }, + { + "epoch": 0.31395784894622364, + "grad_norm": 2.793311834335327, + "learning_rate": 1.7338343451823197e-05, + "loss": 0.9826, + "step": 25116 + }, + { + "epoch": 0.3139828495712393, + "grad_norm": 4.31710147857666, + "learning_rate": 1.7337750581497495e-05, + "loss": 0.9733, + "step": 25118 + }, + { + "epoch": 0.31400785019625493, + "grad_norm": 4.118467330932617, + "learning_rate": 1.733715765528852e-05, + "loss": 0.7692, + "step": 25120 + }, + { + "epoch": 0.3140328508212705, + "grad_norm": 8.798096656799316, + "learning_rate": 1.7336564673200794e-05, + "loss": 0.8641, + "step": 25122 + }, + { + "epoch": 0.3140578514462862, + "grad_norm": 2.5735294818878174, + "learning_rate": 1.7335971635238825e-05, + "loss": 0.5282, + "step": 25124 + }, + { + "epoch": 0.31408285207130177, + "grad_norm": 2.046420097351074, + "learning_rate": 1.7335378541407136e-05, + "loss": 0.471, + "step": 25126 + }, + { + "epoch": 0.3141078526963174, + "grad_norm": 2.3665738105773926, + "learning_rate": 1.733478539171024e-05, + "loss": 1.1639, + "step": 25128 + }, + { + "epoch": 0.31413285332133306, + "grad_norm": 3.20139217376709, + "learning_rate": 1.7334192186152653e-05, + "loss": 0.8722, + "step": 25130 + }, + { + "epoch": 0.31415785394634865, + "grad_norm": 2.304929733276367, + "learning_rate": 1.73335989247389e-05, + "loss": 0.1418, + "step": 25132 + }, + { + "epoch": 0.3141828545713643, + "grad_norm": 4.509532451629639, + "learning_rate": 1.7333005607473492e-05, + "loss": 1.446, + "step": 25134 + }, + { + "epoch": 0.3142078551963799, + "grad_norm": 2.5936086177825928, + "learning_rate": 1.733241223436095e-05, + "loss": 0.7506, + "step": 25136 + }, + { + "epoch": 0.31423285582139554, + "grad_norm": 2.738870143890381, + "learning_rate": 1.7331818805405796e-05, + "loss": 0.4791, + "step": 25138 + }, + { + "epoch": 0.3142578564464112, + "grad_norm": 3.2237026691436768, + "learning_rate": 1.7331225320612544e-05, + "loss": 0.6367, + "step": 25140 + }, + { + "epoch": 0.3142828570714268, + "grad_norm": 3.035092830657959, + "learning_rate": 1.7330631779985717e-05, + "loss": 0.5189, + "step": 25142 + }, + { + "epoch": 0.3143078576964424, + "grad_norm": 3.7678773403167725, + "learning_rate": 1.733003818352984e-05, + "loss": 1.2777, + "step": 25144 + }, + { + "epoch": 0.314332858321458, + "grad_norm": 3.2792699337005615, + "learning_rate": 1.7329444531249423e-05, + "loss": 0.3645, + "step": 25146 + }, + { + "epoch": 0.31435785894647367, + "grad_norm": 0.008947344496846199, + "learning_rate": 1.7328850823148996e-05, + "loss": 1.2437, + "step": 25148 + }, + { + "epoch": 0.3143828595714893, + "grad_norm": 2.557040214538574, + "learning_rate": 1.7328257059233075e-05, + "loss": 0.7314, + "step": 25150 + }, + { + "epoch": 0.3144078601965049, + "grad_norm": 3.6678783893585205, + "learning_rate": 1.7327663239506185e-05, + "loss": 0.3575, + "step": 25152 + }, + { + "epoch": 0.31443286082152055, + "grad_norm": 4.315976142883301, + "learning_rate": 1.7327069363972847e-05, + "loss": 0.3675, + "step": 25154 + }, + { + "epoch": 0.31445786144653615, + "grad_norm": 3.286914110183716, + "learning_rate": 1.7326475432637588e-05, + "loss": 1.0411, + "step": 25156 + }, + { + "epoch": 0.3144828620715518, + "grad_norm": 2.280465602874756, + "learning_rate": 1.7325881445504925e-05, + "loss": 0.7092, + "step": 25158 + }, + { + "epoch": 0.31450786269656744, + "grad_norm": 2.538490056991577, + "learning_rate": 1.7325287402579387e-05, + "loss": 0.81, + "step": 25160 + }, + { + "epoch": 0.31453286332158303, + "grad_norm": 3.726986885070801, + "learning_rate": 1.7324693303865495e-05, + "loss": 0.9614, + "step": 25162 + }, + { + "epoch": 0.3145578639465987, + "grad_norm": 1.9273775815963745, + "learning_rate": 1.732409914936778e-05, + "loss": 0.1994, + "step": 25164 + }, + { + "epoch": 0.3145828645716143, + "grad_norm": 0.9518029093742371, + "learning_rate": 1.7323504939090753e-05, + "loss": 1.3554, + "step": 25166 + }, + { + "epoch": 0.3146078651966299, + "grad_norm": 0.13555116951465607, + "learning_rate": 1.732291067303895e-05, + "loss": 0.0027, + "step": 25168 + }, + { + "epoch": 0.31463286582164557, + "grad_norm": 4.346685886383057, + "learning_rate": 1.7322316351216898e-05, + "loss": 1.0123, + "step": 25170 + }, + { + "epoch": 0.31465786644666116, + "grad_norm": 8.25788688659668, + "learning_rate": 1.732172197362912e-05, + "loss": 0.9671, + "step": 25172 + }, + { + "epoch": 0.3146828670716768, + "grad_norm": 4.046724796295166, + "learning_rate": 1.732112754028014e-05, + "loss": 2.0236, + "step": 25174 + }, + { + "epoch": 0.3147078676966924, + "grad_norm": 0.0030101428274065256, + "learning_rate": 1.732053305117449e-05, + "loss": 0.8523, + "step": 25176 + }, + { + "epoch": 0.31473286832170805, + "grad_norm": 4.108839988708496, + "learning_rate": 1.7319938506316696e-05, + "loss": 0.769, + "step": 25178 + }, + { + "epoch": 0.3147578689467237, + "grad_norm": 9.761765480041504, + "learning_rate": 1.7319343905711283e-05, + "loss": 1.1508, + "step": 25180 + }, + { + "epoch": 0.3147828695717393, + "grad_norm": 1.635396957397461, + "learning_rate": 1.7318749249362784e-05, + "loss": 0.4547, + "step": 25182 + }, + { + "epoch": 0.31480787019675494, + "grad_norm": 5.661288738250732, + "learning_rate": 1.7318154537275726e-05, + "loss": 1.2384, + "step": 25184 + }, + { + "epoch": 0.3148328708217705, + "grad_norm": 2.138334274291992, + "learning_rate": 1.7317559769454636e-05, + "loss": 0.9718, + "step": 25186 + }, + { + "epoch": 0.3148578714467862, + "grad_norm": 0.5138885378837585, + "learning_rate": 1.7316964945904046e-05, + "loss": 0.594, + "step": 25188 + }, + { + "epoch": 0.3148828720718018, + "grad_norm": 4.101780891418457, + "learning_rate": 1.7316370066628486e-05, + "loss": 1.282, + "step": 25190 + }, + { + "epoch": 0.3149078726968174, + "grad_norm": 2.406825304031372, + "learning_rate": 1.7315775131632487e-05, + "loss": 0.9233, + "step": 25192 + }, + { + "epoch": 0.31493287332183306, + "grad_norm": 3.697725296020508, + "learning_rate": 1.731518014092058e-05, + "loss": 1.1507, + "step": 25194 + }, + { + "epoch": 0.31495787394684865, + "grad_norm": 1.8586013317108154, + "learning_rate": 1.7314585094497294e-05, + "loss": 0.2196, + "step": 25196 + }, + { + "epoch": 0.3149828745718643, + "grad_norm": 5.040292739868164, + "learning_rate": 1.7313989992367162e-05, + "loss": 1.2991, + "step": 25198 + }, + { + "epoch": 0.31500787519687995, + "grad_norm": 2.8702566623687744, + "learning_rate": 1.7313394834534718e-05, + "loss": 0.5475, + "step": 25200 + }, + { + "epoch": 0.31503287582189554, + "grad_norm": 0.9907547831535339, + "learning_rate": 1.731279962100449e-05, + "loss": 0.1371, + "step": 25202 + }, + { + "epoch": 0.3150578764469112, + "grad_norm": 8.422123908996582, + "learning_rate": 1.7312204351781017e-05, + "loss": 0.969, + "step": 25204 + }, + { + "epoch": 0.3150828770719268, + "grad_norm": 3.135122060775757, + "learning_rate": 1.731160902686883e-05, + "loss": 1.6411, + "step": 25206 + }, + { + "epoch": 0.31510787769694243, + "grad_norm": 0.8385627865791321, + "learning_rate": 1.731101364627246e-05, + "loss": 0.4058, + "step": 25208 + }, + { + "epoch": 0.3151328783219581, + "grad_norm": 3.7659223079681396, + "learning_rate": 1.7310418209996448e-05, + "loss": 1.3966, + "step": 25210 + }, + { + "epoch": 0.31515787894697367, + "grad_norm": 2.9177634716033936, + "learning_rate": 1.730982271804532e-05, + "loss": 0.9925, + "step": 25212 + }, + { + "epoch": 0.3151828795719893, + "grad_norm": 8.34302806854248, + "learning_rate": 1.730922717042362e-05, + "loss": 2.9645, + "step": 25214 + }, + { + "epoch": 0.3152078801970049, + "grad_norm": 2.323246479034424, + "learning_rate": 1.7308631567135877e-05, + "loss": 0.7499, + "step": 25216 + }, + { + "epoch": 0.31523288082202056, + "grad_norm": 3.691281318664551, + "learning_rate": 1.730803590818663e-05, + "loss": 1.016, + "step": 25218 + }, + { + "epoch": 0.3152578814470362, + "grad_norm": 3.4019031524658203, + "learning_rate": 1.7307440193580418e-05, + "loss": 0.7757, + "step": 25220 + }, + { + "epoch": 0.3152828820720518, + "grad_norm": 2.9874277114868164, + "learning_rate": 1.730684442332177e-05, + "loss": 0.5628, + "step": 25222 + }, + { + "epoch": 0.31530788269706744, + "grad_norm": 4.37983512878418, + "learning_rate": 1.7306248597415226e-05, + "loss": 1.3691, + "step": 25224 + }, + { + "epoch": 0.31533288332208304, + "grad_norm": 7.014648914337158, + "learning_rate": 1.7305652715865333e-05, + "loss": 0.9777, + "step": 25226 + }, + { + "epoch": 0.3153578839470987, + "grad_norm": 3.084641456604004, + "learning_rate": 1.730505677867662e-05, + "loss": 0.9333, + "step": 25228 + }, + { + "epoch": 0.31538288457211433, + "grad_norm": 3.0980916023254395, + "learning_rate": 1.7304460785853625e-05, + "loss": 1.3502, + "step": 25230 + }, + { + "epoch": 0.3154078851971299, + "grad_norm": 4.611331939697266, + "learning_rate": 1.7303864737400894e-05, + "loss": 0.5707, + "step": 25232 + }, + { + "epoch": 0.31543288582214557, + "grad_norm": 0.1725826859474182, + "learning_rate": 1.7303268633322957e-05, + "loss": 0.6575, + "step": 25234 + }, + { + "epoch": 0.31545788644716116, + "grad_norm": 0.8986076712608337, + "learning_rate": 1.730267247362436e-05, + "loss": 0.0984, + "step": 25236 + }, + { + "epoch": 0.3154828870721768, + "grad_norm": 2.774251699447632, + "learning_rate": 1.7302076258309646e-05, + "loss": 0.645, + "step": 25238 + }, + { + "epoch": 0.31550788769719246, + "grad_norm": 3.2069008350372314, + "learning_rate": 1.7301479987383348e-05, + "loss": 0.8726, + "step": 25240 + }, + { + "epoch": 0.31553288832220805, + "grad_norm": 2.755687713623047, + "learning_rate": 1.730088366085001e-05, + "loss": 0.597, + "step": 25242 + }, + { + "epoch": 0.3155578889472237, + "grad_norm": 3.1364972591400146, + "learning_rate": 1.7300287278714178e-05, + "loss": 0.8905, + "step": 25244 + }, + { + "epoch": 0.3155828895722393, + "grad_norm": 1.7957614660263062, + "learning_rate": 1.729969084098039e-05, + "loss": 0.3283, + "step": 25246 + }, + { + "epoch": 0.31560789019725494, + "grad_norm": 0.004950135946273804, + "learning_rate": 1.7299094347653186e-05, + "loss": 0.5721, + "step": 25248 + }, + { + "epoch": 0.3156328908222706, + "grad_norm": 0.10630592703819275, + "learning_rate": 1.7298497798737113e-05, + "loss": 0.3164, + "step": 25250 + }, + { + "epoch": 0.3156578914472862, + "grad_norm": 3.7396080493927, + "learning_rate": 1.7297901194236715e-05, + "loss": 1.4463, + "step": 25252 + }, + { + "epoch": 0.3156828920723018, + "grad_norm": 3.798142671585083, + "learning_rate": 1.7297304534156533e-05, + "loss": 1.5541, + "step": 25254 + }, + { + "epoch": 0.3157078926973174, + "grad_norm": 0.7781470417976379, + "learning_rate": 1.729670781850111e-05, + "loss": 0.2335, + "step": 25256 + }, + { + "epoch": 0.31573289332233306, + "grad_norm": 1.8216232061386108, + "learning_rate": 1.729611104727499e-05, + "loss": 0.0503, + "step": 25258 + }, + { + "epoch": 0.3157578939473487, + "grad_norm": 2.717158794403076, + "learning_rate": 1.7295514220482723e-05, + "loss": 0.0974, + "step": 25260 + }, + { + "epoch": 0.3157828945723643, + "grad_norm": 1.669703722000122, + "learning_rate": 1.729491733812885e-05, + "loss": 1.0556, + "step": 25262 + }, + { + "epoch": 0.31580789519737995, + "grad_norm": 3.374394655227661, + "learning_rate": 1.729432040021792e-05, + "loss": 1.9035, + "step": 25264 + }, + { + "epoch": 0.31583289582239554, + "grad_norm": 0.06629492342472076, + "learning_rate": 1.7293723406754477e-05, + "loss": 0.5396, + "step": 25266 + }, + { + "epoch": 0.3158578964474112, + "grad_norm": 1.1195075511932373, + "learning_rate": 1.7293126357743068e-05, + "loss": 1.0102, + "step": 25268 + }, + { + "epoch": 0.31588289707242684, + "grad_norm": 3.1360878944396973, + "learning_rate": 1.7292529253188238e-05, + "loss": 0.8793, + "step": 25270 + }, + { + "epoch": 0.31590789769744243, + "grad_norm": 5.335892200469971, + "learning_rate": 1.729193209309454e-05, + "loss": 0.7448, + "step": 25272 + }, + { + "epoch": 0.3159328983224581, + "grad_norm": 4.095048904418945, + "learning_rate": 1.7291334877466514e-05, + "loss": 1.5005, + "step": 25274 + }, + { + "epoch": 0.31595789894747367, + "grad_norm": 4.426077365875244, + "learning_rate": 1.7290737606308713e-05, + "loss": 0.8228, + "step": 25276 + }, + { + "epoch": 0.3159828995724893, + "grad_norm": 1.7430357933044434, + "learning_rate": 1.729014027962569e-05, + "loss": 0.2602, + "step": 25278 + }, + { + "epoch": 0.31600790019750497, + "grad_norm": 3.8383491039276123, + "learning_rate": 1.7289542897421984e-05, + "loss": 1.0054, + "step": 25280 + }, + { + "epoch": 0.31603290082252056, + "grad_norm": 4.926910400390625, + "learning_rate": 1.7288945459702154e-05, + "loss": 1.0591, + "step": 25282 + }, + { + "epoch": 0.3160579014475362, + "grad_norm": 0.04419584572315216, + "learning_rate": 1.7288347966470746e-05, + "loss": 0.094, + "step": 25284 + }, + { + "epoch": 0.3160829020725518, + "grad_norm": 0.00678833294659853, + "learning_rate": 1.728775041773231e-05, + "loss": 0.8043, + "step": 25286 + }, + { + "epoch": 0.31610790269756744, + "grad_norm": 5.806065082550049, + "learning_rate": 1.7287152813491396e-05, + "loss": 2.1568, + "step": 25288 + }, + { + "epoch": 0.3161329033225831, + "grad_norm": 1.847642421722412, + "learning_rate": 1.728655515375256e-05, + "loss": 0.3235, + "step": 25290 + }, + { + "epoch": 0.3161579039475987, + "grad_norm": 4.925067901611328, + "learning_rate": 1.7285957438520347e-05, + "loss": 1.5681, + "step": 25292 + }, + { + "epoch": 0.31618290457261433, + "grad_norm": 2.556565523147583, + "learning_rate": 1.7285359667799312e-05, + "loss": 0.7456, + "step": 25294 + }, + { + "epoch": 0.3162079051976299, + "grad_norm": 0.9450318813323975, + "learning_rate": 1.7284761841594007e-05, + "loss": 0.6373, + "step": 25296 + }, + { + "epoch": 0.31623290582264557, + "grad_norm": 3.7724075317382812, + "learning_rate": 1.728416395990899e-05, + "loss": 1.0584, + "step": 25298 + }, + { + "epoch": 0.3162579064476612, + "grad_norm": 3.300259590148926, + "learning_rate": 1.728356602274881e-05, + "loss": 0.8184, + "step": 25300 + }, + { + "epoch": 0.3162829070726768, + "grad_norm": 4.747298240661621, + "learning_rate": 1.7282968030118018e-05, + "loss": 0.8789, + "step": 25302 + }, + { + "epoch": 0.31630790769769246, + "grad_norm": 3.705134868621826, + "learning_rate": 1.7282369982021174e-05, + "loss": 1.317, + "step": 25304 + }, + { + "epoch": 0.31633290832270805, + "grad_norm": 2.2991750240325928, + "learning_rate": 1.7281771878462828e-05, + "loss": 0.4233, + "step": 25306 + }, + { + "epoch": 0.3163579089477237, + "grad_norm": 3.9943788051605225, + "learning_rate": 1.7281173719447538e-05, + "loss": 0.6941, + "step": 25308 + }, + { + "epoch": 0.31638290957273935, + "grad_norm": 1.1189175844192505, + "learning_rate": 1.728057550497986e-05, + "loss": 0.0155, + "step": 25310 + }, + { + "epoch": 0.31640791019775494, + "grad_norm": 0.10058029741048813, + "learning_rate": 1.727997723506435e-05, + "loss": 0.431, + "step": 25312 + }, + { + "epoch": 0.3164329108227706, + "grad_norm": 1.6933798789978027, + "learning_rate": 1.727937890970556e-05, + "loss": 1.0919, + "step": 25314 + }, + { + "epoch": 0.3164579114477862, + "grad_norm": 2.4498519897460938, + "learning_rate": 1.7278780528908053e-05, + "loss": 0.7449, + "step": 25316 + }, + { + "epoch": 0.3164829120728018, + "grad_norm": 3.338299512863159, + "learning_rate": 1.7278182092676376e-05, + "loss": 0.7875, + "step": 25318 + }, + { + "epoch": 0.3165079126978175, + "grad_norm": 0.0033140939194709063, + "learning_rate": 1.7277583601015096e-05, + "loss": 0.6616, + "step": 25320 + }, + { + "epoch": 0.31653291332283306, + "grad_norm": 0.0038579669781029224, + "learning_rate": 1.727698505392877e-05, + "loss": 0.8405, + "step": 25322 + }, + { + "epoch": 0.3165579139478487, + "grad_norm": 3.3401761054992676, + "learning_rate": 1.7276386451421955e-05, + "loss": 0.8822, + "step": 25324 + }, + { + "epoch": 0.3165829145728643, + "grad_norm": 4.220065593719482, + "learning_rate": 1.727578779349921e-05, + "loss": 0.938, + "step": 25326 + }, + { + "epoch": 0.31660791519787995, + "grad_norm": 3.0208773612976074, + "learning_rate": 1.7275189080165096e-05, + "loss": 0.5629, + "step": 25328 + }, + { + "epoch": 0.3166329158228956, + "grad_norm": 2.2773349285125732, + "learning_rate": 1.7274590311424167e-05, + "loss": 0.4807, + "step": 25330 + }, + { + "epoch": 0.3166579164479112, + "grad_norm": 4.549215793609619, + "learning_rate": 1.7273991487280988e-05, + "loss": 0.8854, + "step": 25332 + }, + { + "epoch": 0.31668291707292684, + "grad_norm": 6.148088455200195, + "learning_rate": 1.7273392607740117e-05, + "loss": 0.5662, + "step": 25334 + }, + { + "epoch": 0.31670791769794243, + "grad_norm": 4.193377494812012, + "learning_rate": 1.727279367280612e-05, + "loss": 0.5819, + "step": 25336 + }, + { + "epoch": 0.3167329183229581, + "grad_norm": 5.675304889678955, + "learning_rate": 1.727219468248355e-05, + "loss": 2.9719, + "step": 25338 + }, + { + "epoch": 0.3167579189479737, + "grad_norm": 1.3824464082717896, + "learning_rate": 1.727159563677698e-05, + "loss": 0.258, + "step": 25340 + }, + { + "epoch": 0.3167829195729893, + "grad_norm": 5.305593013763428, + "learning_rate": 1.727099653569096e-05, + "loss": 1.9258, + "step": 25342 + }, + { + "epoch": 0.31680792019800497, + "grad_norm": 1.7782994508743286, + "learning_rate": 1.7270397379230062e-05, + "loss": 0.9217, + "step": 25344 + }, + { + "epoch": 0.31683292082302056, + "grad_norm": 2.0268068313598633, + "learning_rate": 1.7269798167398847e-05, + "loss": 1.0163, + "step": 25346 + }, + { + "epoch": 0.3168579214480362, + "grad_norm": 3.9950754642486572, + "learning_rate": 1.7269198900201875e-05, + "loss": 1.5724, + "step": 25348 + }, + { + "epoch": 0.31688292207305185, + "grad_norm": 2.9640464782714844, + "learning_rate": 1.726859957764371e-05, + "loss": 1.3776, + "step": 25350 + }, + { + "epoch": 0.31690792269806745, + "grad_norm": 3.5931451320648193, + "learning_rate": 1.7268000199728924e-05, + "loss": 1.1427, + "step": 25352 + }, + { + "epoch": 0.3169329233230831, + "grad_norm": 1.918902039527893, + "learning_rate": 1.726740076646207e-05, + "loss": 0.9725, + "step": 25354 + }, + { + "epoch": 0.3169579239480987, + "grad_norm": 2.72446608543396, + "learning_rate": 1.7266801277847723e-05, + "loss": 1.0023, + "step": 25356 + }, + { + "epoch": 0.31698292457311433, + "grad_norm": 2.878863573074341, + "learning_rate": 1.7266201733890444e-05, + "loss": 0.5577, + "step": 25358 + }, + { + "epoch": 0.31700792519813, + "grad_norm": 0.0020611395593732595, + "learning_rate": 1.72656021345948e-05, + "loss": 0.9642, + "step": 25360 + }, + { + "epoch": 0.3170329258231456, + "grad_norm": 4.5161027908325195, + "learning_rate": 1.726500247996536e-05, + "loss": 1.7252, + "step": 25362 + }, + { + "epoch": 0.3170579264481612, + "grad_norm": 2.8497672080993652, + "learning_rate": 1.7264402770006684e-05, + "loss": 0.5596, + "step": 25364 + }, + { + "epoch": 0.3170829270731768, + "grad_norm": 6.468835830688477, + "learning_rate": 1.7263803004723348e-05, + "loss": 1.3782, + "step": 25366 + }, + { + "epoch": 0.31710792769819246, + "grad_norm": 4.266883850097656, + "learning_rate": 1.7263203184119913e-05, + "loss": 1.4568, + "step": 25368 + }, + { + "epoch": 0.3171329283232081, + "grad_norm": 6.157420635223389, + "learning_rate": 1.726260330820095e-05, + "loss": 0.8508, + "step": 25370 + }, + { + "epoch": 0.3171579289482237, + "grad_norm": 4.258925914764404, + "learning_rate": 1.7262003376971027e-05, + "loss": 1.325, + "step": 25372 + }, + { + "epoch": 0.31718292957323935, + "grad_norm": 4.611105918884277, + "learning_rate": 1.7261403390434716e-05, + "loss": 1.8646, + "step": 25374 + }, + { + "epoch": 0.31720793019825494, + "grad_norm": 2.6923139095306396, + "learning_rate": 1.726080334859658e-05, + "loss": 0.7927, + "step": 25376 + }, + { + "epoch": 0.3172329308232706, + "grad_norm": 3.650230884552002, + "learning_rate": 1.7260203251461195e-05, + "loss": 2.0116, + "step": 25378 + }, + { + "epoch": 0.31725793144828623, + "grad_norm": 1.0975440740585327, + "learning_rate": 1.725960309903313e-05, + "loss": 0.0467, + "step": 25380 + }, + { + "epoch": 0.3172829320733018, + "grad_norm": 2.7828989028930664, + "learning_rate": 1.725900289131695e-05, + "loss": 0.5937, + "step": 25382 + }, + { + "epoch": 0.3173079326983175, + "grad_norm": 5.2976179122924805, + "learning_rate": 1.7258402628317235e-05, + "loss": 1.363, + "step": 25384 + }, + { + "epoch": 0.31733293332333307, + "grad_norm": 3.204488515853882, + "learning_rate": 1.7257802310038546e-05, + "loss": 0.5537, + "step": 25386 + }, + { + "epoch": 0.3173579339483487, + "grad_norm": 8.3009614944458, + "learning_rate": 1.7257201936485465e-05, + "loss": 2.4889, + "step": 25388 + }, + { + "epoch": 0.31738293457336436, + "grad_norm": 1.4089932441711426, + "learning_rate": 1.725660150766256e-05, + "loss": 0.4543, + "step": 25390 + }, + { + "epoch": 0.31740793519837995, + "grad_norm": 1.9190236330032349, + "learning_rate": 1.7256001023574406e-05, + "loss": 0.5318, + "step": 25392 + }, + { + "epoch": 0.3174329358233956, + "grad_norm": 5.773799896240234, + "learning_rate": 1.7255400484225572e-05, + "loss": 1.5465, + "step": 25394 + }, + { + "epoch": 0.3174579364484112, + "grad_norm": 0.007027763407677412, + "learning_rate": 1.7254799889620636e-05, + "loss": 0.0006, + "step": 25396 + }, + { + "epoch": 0.31748293707342684, + "grad_norm": 7.128140449523926, + "learning_rate": 1.725419923976417e-05, + "loss": 1.2581, + "step": 25398 + }, + { + "epoch": 0.3175079376984425, + "grad_norm": 2.1456799507141113, + "learning_rate": 1.7253598534660744e-05, + "loss": 1.1826, + "step": 25400 + }, + { + "epoch": 0.3175329383234581, + "grad_norm": 4.026373863220215, + "learning_rate": 1.7252997774314943e-05, + "loss": 1.5532, + "step": 25402 + }, + { + "epoch": 0.31755793894847373, + "grad_norm": 3.8345947265625, + "learning_rate": 1.725239695873133e-05, + "loss": 1.4315, + "step": 25404 + }, + { + "epoch": 0.3175829395734893, + "grad_norm": 3.0899813175201416, + "learning_rate": 1.725179608791449e-05, + "loss": 0.6491, + "step": 25406 + }, + { + "epoch": 0.31760794019850497, + "grad_norm": 5.154409408569336, + "learning_rate": 1.7251195161868997e-05, + "loss": 1.6171, + "step": 25408 + }, + { + "epoch": 0.3176329408235206, + "grad_norm": 3.076180934906006, + "learning_rate": 1.725059418059943e-05, + "loss": 1.1305, + "step": 25410 + }, + { + "epoch": 0.3176579414485362, + "grad_norm": 4.079408645629883, + "learning_rate": 1.724999314411036e-05, + "loss": 1.8702, + "step": 25412 + }, + { + "epoch": 0.31768294207355185, + "grad_norm": 4.012138843536377, + "learning_rate": 1.7249392052406368e-05, + "loss": 0.3621, + "step": 25414 + }, + { + "epoch": 0.31770794269856745, + "grad_norm": 0.020364921540021896, + "learning_rate": 1.724879090549203e-05, + "loss": 0.4913, + "step": 25416 + }, + { + "epoch": 0.3177329433235831, + "grad_norm": 4.255956172943115, + "learning_rate": 1.724818970337193e-05, + "loss": 0.5521, + "step": 25418 + }, + { + "epoch": 0.31775794394859874, + "grad_norm": 2.115161657333374, + "learning_rate": 1.7247588446050638e-05, + "loss": 1.0591, + "step": 25420 + }, + { + "epoch": 0.31778294457361433, + "grad_norm": 2.844115734100342, + "learning_rate": 1.724698713353274e-05, + "loss": 0.5403, + "step": 25422 + }, + { + "epoch": 0.31780794519863, + "grad_norm": 4.911956787109375, + "learning_rate": 1.724638576582281e-05, + "loss": 1.2053, + "step": 25424 + }, + { + "epoch": 0.3178329458236456, + "grad_norm": 3.184983730316162, + "learning_rate": 1.7245784342925432e-05, + "loss": 1.1464, + "step": 25426 + }, + { + "epoch": 0.3178579464486612, + "grad_norm": 3.780764102935791, + "learning_rate": 1.7245182864845182e-05, + "loss": 0.6662, + "step": 25428 + }, + { + "epoch": 0.31788294707367687, + "grad_norm": 0.1931815892457962, + "learning_rate": 1.724458133158665e-05, + "loss": 0.0016, + "step": 25430 + }, + { + "epoch": 0.31790794769869246, + "grad_norm": 0.04386134445667267, + "learning_rate": 1.7243979743154408e-05, + "loss": 1.0972, + "step": 25432 + }, + { + "epoch": 0.3179329483237081, + "grad_norm": 4.2371134757995605, + "learning_rate": 1.724337809955304e-05, + "loss": 0.5871, + "step": 25434 + }, + { + "epoch": 0.3179579489487237, + "grad_norm": 0.019187072291970253, + "learning_rate": 1.724277640078713e-05, + "loss": 0.4429, + "step": 25436 + }, + { + "epoch": 0.31798294957373935, + "grad_norm": 2.3234636783599854, + "learning_rate": 1.7242174646861255e-05, + "loss": 0.6825, + "step": 25438 + }, + { + "epoch": 0.318007950198755, + "grad_norm": 4.2456817626953125, + "learning_rate": 1.7241572837780008e-05, + "loss": 1.0144, + "step": 25440 + }, + { + "epoch": 0.3180329508237706, + "grad_norm": 0.016681035980582237, + "learning_rate": 1.7240970973547964e-05, + "loss": 0.4889, + "step": 25442 + }, + { + "epoch": 0.31805795144878624, + "grad_norm": 1.49723482131958, + "learning_rate": 1.724036905416971e-05, + "loss": 0.5148, + "step": 25444 + }, + { + "epoch": 0.31808295207380183, + "grad_norm": 3.0182178020477295, + "learning_rate": 1.723976707964983e-05, + "loss": 0.3977, + "step": 25446 + }, + { + "epoch": 0.3181079526988175, + "grad_norm": 4.263789653778076, + "learning_rate": 1.72391650499929e-05, + "loss": 0.579, + "step": 25448 + }, + { + "epoch": 0.3181329533238331, + "grad_norm": 2.2030484676361084, + "learning_rate": 1.723856296520352e-05, + "loss": 0.4891, + "step": 25450 + }, + { + "epoch": 0.3181579539488487, + "grad_norm": 0.008553295396268368, + "learning_rate": 1.7237960825286265e-05, + "loss": 0.0224, + "step": 25452 + }, + { + "epoch": 0.31818295457386436, + "grad_norm": 3.125000238418579, + "learning_rate": 1.7237358630245727e-05, + "loss": 1.493, + "step": 25454 + }, + { + "epoch": 0.31820795519887995, + "grad_norm": 4.347564697265625, + "learning_rate": 1.723675638008649e-05, + "loss": 1.433, + "step": 25456 + }, + { + "epoch": 0.3182329558238956, + "grad_norm": 0.009930246509611607, + "learning_rate": 1.7236154074813135e-05, + "loss": 0.0329, + "step": 25458 + }, + { + "epoch": 0.31825795644891125, + "grad_norm": 2.1721954345703125, + "learning_rate": 1.7235551714430255e-05, + "loss": 0.6427, + "step": 25460 + }, + { + "epoch": 0.31828295707392684, + "grad_norm": 2.811850070953369, + "learning_rate": 1.7234949298942437e-05, + "loss": 0.8376, + "step": 25462 + }, + { + "epoch": 0.3183079576989425, + "grad_norm": 6.689491271972656, + "learning_rate": 1.7234346828354273e-05, + "loss": 1.2645, + "step": 25464 + }, + { + "epoch": 0.3183329583239581, + "grad_norm": 9.462681770324707, + "learning_rate": 1.723374430267034e-05, + "loss": 1.6754, + "step": 25466 + }, + { + "epoch": 0.31835795894897373, + "grad_norm": 2.9844164848327637, + "learning_rate": 1.7233141721895235e-05, + "loss": 0.6491, + "step": 25468 + }, + { + "epoch": 0.3183829595739894, + "grad_norm": 4.239630222320557, + "learning_rate": 1.7232539086033548e-05, + "loss": 0.9072, + "step": 25470 + }, + { + "epoch": 0.31840796019900497, + "grad_norm": 6.004729270935059, + "learning_rate": 1.723193639508986e-05, + "loss": 0.8405, + "step": 25472 + }, + { + "epoch": 0.3184329608240206, + "grad_norm": 0.05318477749824524, + "learning_rate": 1.7231333649068777e-05, + "loss": 0.3852, + "step": 25474 + }, + { + "epoch": 0.3184579614490362, + "grad_norm": 1.1799163818359375, + "learning_rate": 1.723073084797487e-05, + "loss": 0.6523, + "step": 25476 + }, + { + "epoch": 0.31848296207405186, + "grad_norm": 2.232128620147705, + "learning_rate": 1.7230127991812745e-05, + "loss": 0.8689, + "step": 25478 + }, + { + "epoch": 0.3185079626990675, + "grad_norm": 4.58129358291626, + "learning_rate": 1.7229525080586986e-05, + "loss": 1.1941, + "step": 25480 + }, + { + "epoch": 0.3185329633240831, + "grad_norm": 6.614104747772217, + "learning_rate": 1.7228922114302184e-05, + "loss": 0.9776, + "step": 25482 + }, + { + "epoch": 0.31855796394909874, + "grad_norm": 0.08024780452251434, + "learning_rate": 1.7228319092962938e-05, + "loss": 0.0717, + "step": 25484 + }, + { + "epoch": 0.31858296457411434, + "grad_norm": 2.203436851501465, + "learning_rate": 1.7227716016573834e-05, + "loss": 0.8597, + "step": 25486 + }, + { + "epoch": 0.31860796519913, + "grad_norm": 5.009816646575928, + "learning_rate": 1.7227112885139465e-05, + "loss": 2.3542, + "step": 25488 + }, + { + "epoch": 0.31863296582414563, + "grad_norm": 2.9305038452148438, + "learning_rate": 1.7226509698664427e-05, + "loss": 0.7637, + "step": 25490 + }, + { + "epoch": 0.3186579664491612, + "grad_norm": 3.6370348930358887, + "learning_rate": 1.7225906457153314e-05, + "loss": 0.557, + "step": 25492 + }, + { + "epoch": 0.31868296707417687, + "grad_norm": 4.034574031829834, + "learning_rate": 1.722530316061072e-05, + "loss": 1.4255, + "step": 25494 + }, + { + "epoch": 0.31870796769919246, + "grad_norm": 5.556608200073242, + "learning_rate": 1.7224699809041238e-05, + "loss": 0.4001, + "step": 25496 + }, + { + "epoch": 0.3187329683242081, + "grad_norm": 5.67066764831543, + "learning_rate": 1.7224096402449466e-05, + "loss": 1.5161, + "step": 25498 + }, + { + "epoch": 0.31875796894922376, + "grad_norm": 4.848753929138184, + "learning_rate": 1.7223492940839993e-05, + "loss": 1.7853, + "step": 25500 + }, + { + "epoch": 0.31878296957423935, + "grad_norm": 2.6477878093719482, + "learning_rate": 1.722288942421742e-05, + "loss": 0.7058, + "step": 25502 + }, + { + "epoch": 0.318807970199255, + "grad_norm": 2.18817138671875, + "learning_rate": 1.7222285852586344e-05, + "loss": 0.6192, + "step": 25504 + }, + { + "epoch": 0.3188329708242706, + "grad_norm": 4.77048921585083, + "learning_rate": 1.722168222595136e-05, + "loss": 0.3619, + "step": 25506 + }, + { + "epoch": 0.31885797144928624, + "grad_norm": 3.096644639968872, + "learning_rate": 1.7221078544317067e-05, + "loss": 0.4797, + "step": 25508 + }, + { + "epoch": 0.3188829720743019, + "grad_norm": 3.5022292137145996, + "learning_rate": 1.722047480768806e-05, + "loss": 2.313, + "step": 25510 + }, + { + "epoch": 0.3189079726993175, + "grad_norm": 0.03968961536884308, + "learning_rate": 1.7219871016068938e-05, + "loss": 1.1838, + "step": 25512 + }, + { + "epoch": 0.3189329733243331, + "grad_norm": 3.73126220703125, + "learning_rate": 1.72192671694643e-05, + "loss": 0.516, + "step": 25514 + }, + { + "epoch": 0.3189579739493487, + "grad_norm": 2.8091485500335693, + "learning_rate": 1.721866326787874e-05, + "loss": 1.3364, + "step": 25516 + }, + { + "epoch": 0.31898297457436436, + "grad_norm": 2.6153130531311035, + "learning_rate": 1.7218059311316868e-05, + "loss": 0.6646, + "step": 25518 + }, + { + "epoch": 0.31900797519938, + "grad_norm": 2.414618730545044, + "learning_rate": 1.7217455299783272e-05, + "loss": 0.5358, + "step": 25520 + }, + { + "epoch": 0.3190329758243956, + "grad_norm": 4.238138675689697, + "learning_rate": 1.721685123328256e-05, + "loss": 1.2913, + "step": 25522 + }, + { + "epoch": 0.31905797644941125, + "grad_norm": 2.184250593185425, + "learning_rate": 1.7216247111819328e-05, + "loss": 0.3407, + "step": 25524 + }, + { + "epoch": 0.31908297707442684, + "grad_norm": 2.0601019859313965, + "learning_rate": 1.721564293539818e-05, + "loss": 1.2709, + "step": 25526 + }, + { + "epoch": 0.3191079776994425, + "grad_norm": 3.1737308502197266, + "learning_rate": 1.7215038704023714e-05, + "loss": 0.3776, + "step": 25528 + }, + { + "epoch": 0.31913297832445814, + "grad_norm": 3.1986920833587646, + "learning_rate": 1.7214434417700534e-05, + "loss": 1.5327, + "step": 25530 + }, + { + "epoch": 0.31915797894947373, + "grad_norm": 4.5209455490112305, + "learning_rate": 1.721383007643324e-05, + "loss": 0.9445, + "step": 25532 + }, + { + "epoch": 0.3191829795744894, + "grad_norm": 0.8584669828414917, + "learning_rate": 1.721322568022644e-05, + "loss": 0.9889, + "step": 25534 + }, + { + "epoch": 0.31920798019950497, + "grad_norm": 3.802999496459961, + "learning_rate": 1.721262122908473e-05, + "loss": 0.9881, + "step": 25536 + }, + { + "epoch": 0.3192329808245206, + "grad_norm": 0.00493523757904768, + "learning_rate": 1.7212016723012715e-05, + "loss": 0.0809, + "step": 25538 + }, + { + "epoch": 0.31925798144953627, + "grad_norm": 4.654892444610596, + "learning_rate": 1.7211412162015006e-05, + "loss": 0.9519, + "step": 25540 + }, + { + "epoch": 0.31928298207455186, + "grad_norm": 3.030611515045166, + "learning_rate": 1.72108075460962e-05, + "loss": 1.0505, + "step": 25542 + }, + { + "epoch": 0.3193079826995675, + "grad_norm": 1.1666773557662964, + "learning_rate": 1.72102028752609e-05, + "loss": 0.9653, + "step": 25544 + }, + { + "epoch": 0.3193329833245831, + "grad_norm": 0.01567283645272255, + "learning_rate": 1.720959814951372e-05, + "loss": 0.2122, + "step": 25546 + }, + { + "epoch": 0.31935798394959874, + "grad_norm": 2.9393539428710938, + "learning_rate": 1.720899336885926e-05, + "loss": 0.4695, + "step": 25548 + }, + { + "epoch": 0.3193829845746144, + "grad_norm": 4.137389659881592, + "learning_rate": 1.7208388533302118e-05, + "loss": 0.5802, + "step": 25550 + }, + { + "epoch": 0.31940798519963, + "grad_norm": 3.0797975063323975, + "learning_rate": 1.7207783642846912e-05, + "loss": 1.5703, + "step": 25552 + }, + { + "epoch": 0.31943298582464563, + "grad_norm": 5.638492107391357, + "learning_rate": 1.720717869749825e-05, + "loss": 1.1891, + "step": 25554 + }, + { + "epoch": 0.3194579864496612, + "grad_norm": 5.636478424072266, + "learning_rate": 1.720657369726073e-05, + "loss": 1.7291, + "step": 25556 + }, + { + "epoch": 0.31948298707467687, + "grad_norm": 2.143345594406128, + "learning_rate": 1.720596864213896e-05, + "loss": 0.1535, + "step": 25558 + }, + { + "epoch": 0.3195079876996925, + "grad_norm": 0.01028808206319809, + "learning_rate": 1.7205363532137558e-05, + "loss": 0.5798, + "step": 25560 + }, + { + "epoch": 0.3195329883247081, + "grad_norm": 3.0124757289886475, + "learning_rate": 1.7204758367261126e-05, + "loss": 1.1477, + "step": 25562 + }, + { + "epoch": 0.31955798894972376, + "grad_norm": 3.433378219604492, + "learning_rate": 1.720415314751427e-05, + "loss": 0.2654, + "step": 25564 + }, + { + "epoch": 0.31958298957473935, + "grad_norm": 0.019403858110308647, + "learning_rate": 1.7203547872901605e-05, + "loss": 1.0201, + "step": 25566 + }, + { + "epoch": 0.319607990199755, + "grad_norm": 4.325021266937256, + "learning_rate": 1.7202942543427735e-05, + "loss": 1.5015, + "step": 25568 + }, + { + "epoch": 0.31963299082477065, + "grad_norm": 2.3254282474517822, + "learning_rate": 1.7202337159097278e-05, + "loss": 1.0768, + "step": 25570 + }, + { + "epoch": 0.31965799144978624, + "grad_norm": 7.44813346862793, + "learning_rate": 1.7201731719914837e-05, + "loss": 1.1442, + "step": 25572 + }, + { + "epoch": 0.3196829920748019, + "grad_norm": 4.8349714279174805, + "learning_rate": 1.7201126225885027e-05, + "loss": 0.6605, + "step": 25574 + }, + { + "epoch": 0.3197079926998175, + "grad_norm": 2.3796517848968506, + "learning_rate": 1.7200520677012456e-05, + "loss": 0.7392, + "step": 25576 + }, + { + "epoch": 0.3197329933248331, + "grad_norm": 3.6952152252197266, + "learning_rate": 1.719991507330174e-05, + "loss": 1.423, + "step": 25578 + }, + { + "epoch": 0.3197579939498488, + "grad_norm": 0.011256367899477482, + "learning_rate": 1.7199309414757483e-05, + "loss": 0.5932, + "step": 25580 + }, + { + "epoch": 0.31978299457486437, + "grad_norm": 3.4740421772003174, + "learning_rate": 1.719870370138431e-05, + "loss": 1.4475, + "step": 25582 + }, + { + "epoch": 0.31980799519988, + "grad_norm": 3.7655224800109863, + "learning_rate": 1.7198097933186826e-05, + "loss": 0.4298, + "step": 25584 + }, + { + "epoch": 0.3198329958248956, + "grad_norm": 1.3798774480819702, + "learning_rate": 1.7197492110169646e-05, + "loss": 0.06, + "step": 25586 + }, + { + "epoch": 0.31985799644991125, + "grad_norm": 0.008417278528213501, + "learning_rate": 1.7196886232337383e-05, + "loss": 0.2212, + "step": 25588 + }, + { + "epoch": 0.3198829970749269, + "grad_norm": 1.4650474786758423, + "learning_rate": 1.719628029969465e-05, + "loss": 1.5262, + "step": 25590 + }, + { + "epoch": 0.3199079976999425, + "grad_norm": 3.8577351570129395, + "learning_rate": 1.719567431224607e-05, + "loss": 1.1023, + "step": 25592 + }, + { + "epoch": 0.31993299832495814, + "grad_norm": 2.4696834087371826, + "learning_rate": 1.7195068269996248e-05, + "loss": 0.5282, + "step": 25594 + }, + { + "epoch": 0.31995799894997373, + "grad_norm": 2.017482280731201, + "learning_rate": 1.71944621729498e-05, + "loss": 0.3237, + "step": 25596 + }, + { + "epoch": 0.3199829995749894, + "grad_norm": 2.1695961952209473, + "learning_rate": 1.7193856021111352e-05, + "loss": 1.5428, + "step": 25598 + }, + { + "epoch": 0.320008000200005, + "grad_norm": 4.760039806365967, + "learning_rate": 1.7193249814485514e-05, + "loss": 0.957, + "step": 25600 + }, + { + "epoch": 0.3200330008250206, + "grad_norm": 3.415160894393921, + "learning_rate": 1.7192643553076898e-05, + "loss": 1.2599, + "step": 25602 + }, + { + "epoch": 0.32005800145003627, + "grad_norm": 5.159355640411377, + "learning_rate": 1.7192037236890126e-05, + "loss": 1.7538, + "step": 25604 + }, + { + "epoch": 0.32008300207505186, + "grad_norm": 4.3618245124816895, + "learning_rate": 1.719143086592982e-05, + "loss": 0.874, + "step": 25606 + }, + { + "epoch": 0.3201080027000675, + "grad_norm": 8.54335880279541, + "learning_rate": 1.7190824440200587e-05, + "loss": 1.723, + "step": 25608 + }, + { + "epoch": 0.32013300332508315, + "grad_norm": 2.9610753059387207, + "learning_rate": 1.719021795970706e-05, + "loss": 0.9862, + "step": 25610 + }, + { + "epoch": 0.32015800395009875, + "grad_norm": 2.4480717182159424, + "learning_rate": 1.7189611424453843e-05, + "loss": 0.7416, + "step": 25612 + }, + { + "epoch": 0.3201830045751144, + "grad_norm": 0.005978505127131939, + "learning_rate": 1.7189004834445565e-05, + "loss": 0.104, + "step": 25614 + }, + { + "epoch": 0.32020800520013, + "grad_norm": 2.186892509460449, + "learning_rate": 1.7188398189686843e-05, + "loss": 0.8194, + "step": 25616 + }, + { + "epoch": 0.32023300582514563, + "grad_norm": 1.9118247032165527, + "learning_rate": 1.7187791490182294e-05, + "loss": 0.7084, + "step": 25618 + }, + { + "epoch": 0.3202580064501613, + "grad_norm": 3.217134714126587, + "learning_rate": 1.7187184735936546e-05, + "loss": 1.1357, + "step": 25620 + }, + { + "epoch": 0.3202830070751769, + "grad_norm": 4.375315189361572, + "learning_rate": 1.7186577926954212e-05, + "loss": 0.2425, + "step": 25622 + }, + { + "epoch": 0.3203080077001925, + "grad_norm": 4.220244407653809, + "learning_rate": 1.718597106323992e-05, + "loss": 1.7084, + "step": 25624 + }, + { + "epoch": 0.3203330083252081, + "grad_norm": 3.754145860671997, + "learning_rate": 1.7185364144798288e-05, + "loss": 1.2301, + "step": 25626 + }, + { + "epoch": 0.32035800895022376, + "grad_norm": 2.0498721599578857, + "learning_rate": 1.7184757171633935e-05, + "loss": 0.8578, + "step": 25628 + }, + { + "epoch": 0.3203830095752394, + "grad_norm": 4.386748313903809, + "learning_rate": 1.718415014375149e-05, + "loss": 1.9522, + "step": 25630 + }, + { + "epoch": 0.320408010200255, + "grad_norm": 0.44047555327415466, + "learning_rate": 1.7183543061155572e-05, + "loss": 0.0074, + "step": 25632 + }, + { + "epoch": 0.32043301082527065, + "grad_norm": 4.974932670593262, + "learning_rate": 1.7182935923850806e-05, + "loss": 2.0824, + "step": 25634 + }, + { + "epoch": 0.32045801145028624, + "grad_norm": 4.131364822387695, + "learning_rate": 1.718232873184182e-05, + "loss": 0.886, + "step": 25636 + }, + { + "epoch": 0.3204830120753019, + "grad_norm": 5.785496234893799, + "learning_rate": 1.718172148513323e-05, + "loss": 0.748, + "step": 25638 + }, + { + "epoch": 0.32050801270031753, + "grad_norm": 3.1114861965179443, + "learning_rate": 1.7181114183729665e-05, + "loss": 1.0178, + "step": 25640 + }, + { + "epoch": 0.3205330133253331, + "grad_norm": 3.90082049369812, + "learning_rate": 1.718050682763575e-05, + "loss": 0.3069, + "step": 25642 + }, + { + "epoch": 0.3205580139503488, + "grad_norm": 4.457705974578857, + "learning_rate": 1.7179899416856113e-05, + "loss": 1.0589, + "step": 25644 + }, + { + "epoch": 0.32058301457536437, + "grad_norm": 1.9036788940429688, + "learning_rate": 1.7179291951395374e-05, + "loss": 0.7102, + "step": 25646 + }, + { + "epoch": 0.32060801520038, + "grad_norm": 3.7516236305236816, + "learning_rate": 1.7178684431258163e-05, + "loss": 0.5911, + "step": 25648 + }, + { + "epoch": 0.32063301582539566, + "grad_norm": 5.1929545402526855, + "learning_rate": 1.717807685644911e-05, + "loss": 1.1723, + "step": 25650 + }, + { + "epoch": 0.32065801645041125, + "grad_norm": 2.482177972793579, + "learning_rate": 1.7177469226972837e-05, + "loss": 0.0836, + "step": 25652 + }, + { + "epoch": 0.3206830170754269, + "grad_norm": 4.282087326049805, + "learning_rate": 1.7176861542833968e-05, + "loss": 0.4915, + "step": 25654 + }, + { + "epoch": 0.3207080177004425, + "grad_norm": 6.63112735748291, + "learning_rate": 1.7176253804037143e-05, + "loss": 2.1124, + "step": 25656 + }, + { + "epoch": 0.32073301832545814, + "grad_norm": 1.6707450151443481, + "learning_rate": 1.7175646010586985e-05, + "loss": 0.5629, + "step": 25658 + }, + { + "epoch": 0.3207580189504738, + "grad_norm": 3.1453349590301514, + "learning_rate": 1.7175038162488117e-05, + "loss": 0.8744, + "step": 25660 + }, + { + "epoch": 0.3207830195754894, + "grad_norm": 2.8718299865722656, + "learning_rate": 1.7174430259745176e-05, + "loss": 0.9327, + "step": 25662 + }, + { + "epoch": 0.32080802020050503, + "grad_norm": 10.666459083557129, + "learning_rate": 1.7173822302362785e-05, + "loss": 1.3181, + "step": 25664 + }, + { + "epoch": 0.3208330208255206, + "grad_norm": 1.8584725856781006, + "learning_rate": 1.717321429034558e-05, + "loss": 0.9832, + "step": 25666 + }, + { + "epoch": 0.32085802145053627, + "grad_norm": 1.6832818984985352, + "learning_rate": 1.7172606223698188e-05, + "loss": 0.4543, + "step": 25668 + }, + { + "epoch": 0.3208830220755519, + "grad_norm": 3.998579263687134, + "learning_rate": 1.7171998102425245e-05, + "loss": 0.9933, + "step": 25670 + }, + { + "epoch": 0.3209080227005675, + "grad_norm": 0.005098208785057068, + "learning_rate": 1.7171389926531373e-05, + "loss": 0.5346, + "step": 25672 + }, + { + "epoch": 0.32093302332558316, + "grad_norm": 0.13971532881259918, + "learning_rate": 1.7170781696021216e-05, + "loss": 0.0029, + "step": 25674 + }, + { + "epoch": 0.32095802395059875, + "grad_norm": 3.9039785861968994, + "learning_rate": 1.71701734108994e-05, + "loss": 0.8933, + "step": 25676 + }, + { + "epoch": 0.3209830245756144, + "grad_norm": 2.673089027404785, + "learning_rate": 1.7169565071170547e-05, + "loss": 0.7985, + "step": 25678 + }, + { + "epoch": 0.32100802520063004, + "grad_norm": 11.35559368133545, + "learning_rate": 1.716895667683931e-05, + "loss": 1.1823, + "step": 25680 + }, + { + "epoch": 0.32103302582564563, + "grad_norm": 21.899036407470703, + "learning_rate": 1.716834822791031e-05, + "loss": 0.891, + "step": 25682 + }, + { + "epoch": 0.3210580264506613, + "grad_norm": 0.8448024988174438, + "learning_rate": 1.7167739724388182e-05, + "loss": 0.1799, + "step": 25684 + }, + { + "epoch": 0.3210830270756769, + "grad_norm": 3.1114208698272705, + "learning_rate": 1.7167131166277563e-05, + "loss": 1.0127, + "step": 25686 + }, + { + "epoch": 0.3211080277006925, + "grad_norm": 5.918696880340576, + "learning_rate": 1.7166522553583088e-05, + "loss": 0.8688, + "step": 25688 + }, + { + "epoch": 0.32113302832570817, + "grad_norm": 1.646448016166687, + "learning_rate": 1.7165913886309387e-05, + "loss": 0.0868, + "step": 25690 + }, + { + "epoch": 0.32115802895072376, + "grad_norm": 3.222964286804199, + "learning_rate": 1.7165305164461106e-05, + "loss": 1.2134, + "step": 25692 + }, + { + "epoch": 0.3211830295757394, + "grad_norm": 0.012627501972019672, + "learning_rate": 1.716469638804287e-05, + "loss": 0.6201, + "step": 25694 + }, + { + "epoch": 0.321208030200755, + "grad_norm": 3.2926840782165527, + "learning_rate": 1.7164087557059317e-05, + "loss": 0.9153, + "step": 25696 + }, + { + "epoch": 0.32123303082577065, + "grad_norm": 4.9018874168396, + "learning_rate": 1.716347867151509e-05, + "loss": 0.7203, + "step": 25698 + }, + { + "epoch": 0.3212580314507863, + "grad_norm": 3.6266674995422363, + "learning_rate": 1.716286973141482e-05, + "loss": 1.6543, + "step": 25700 + }, + { + "epoch": 0.3212830320758019, + "grad_norm": 6.194333076477051, + "learning_rate": 1.7162260736763146e-05, + "loss": 1.393, + "step": 25702 + }, + { + "epoch": 0.32130803270081754, + "grad_norm": 2.817361831665039, + "learning_rate": 1.7161651687564712e-05, + "loss": 0.7211, + "step": 25704 + }, + { + "epoch": 0.32133303332583313, + "grad_norm": 0.004414936527609825, + "learning_rate": 1.716104258382415e-05, + "loss": 0.5937, + "step": 25706 + }, + { + "epoch": 0.3213580339508488, + "grad_norm": 3.457350730895996, + "learning_rate": 1.7160433425546096e-05, + "loss": 1.4474, + "step": 25708 + }, + { + "epoch": 0.3213830345758644, + "grad_norm": 1.8887598514556885, + "learning_rate": 1.7159824212735198e-05, + "loss": 1.1842, + "step": 25710 + }, + { + "epoch": 0.32140803520088, + "grad_norm": 3.232905864715576, + "learning_rate": 1.7159214945396086e-05, + "loss": 0.6115, + "step": 25712 + }, + { + "epoch": 0.32143303582589566, + "grad_norm": 0.013909194618463516, + "learning_rate": 1.715860562353341e-05, + "loss": 0.2951, + "step": 25714 + }, + { + "epoch": 0.32145803645091126, + "grad_norm": 3.1772119998931885, + "learning_rate": 1.7157996247151806e-05, + "loss": 0.6265, + "step": 25716 + }, + { + "epoch": 0.3214830370759269, + "grad_norm": 2.533104658126831, + "learning_rate": 1.7157386816255918e-05, + "loss": 1.7976, + "step": 25718 + }, + { + "epoch": 0.32150803770094255, + "grad_norm": 5.6661810874938965, + "learning_rate": 1.7156777330850377e-05, + "loss": 1.4057, + "step": 25720 + }, + { + "epoch": 0.32153303832595814, + "grad_norm": 4.032831192016602, + "learning_rate": 1.7156167790939833e-05, + "loss": 0.4947, + "step": 25722 + }, + { + "epoch": 0.3215580389509738, + "grad_norm": 3.480675458908081, + "learning_rate": 1.715555819652893e-05, + "loss": 0.2562, + "step": 25724 + }, + { + "epoch": 0.3215830395759894, + "grad_norm": 2.7812306880950928, + "learning_rate": 1.715494854762231e-05, + "loss": 0.1078, + "step": 25726 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 3.689014196395874, + "learning_rate": 1.7154338844224606e-05, + "loss": 2.4667, + "step": 25728 + }, + { + "epoch": 0.3216330408260207, + "grad_norm": 0.6829531192779541, + "learning_rate": 1.7153729086340475e-05, + "loss": 0.7165, + "step": 25730 + }, + { + "epoch": 0.32165804145103627, + "grad_norm": 1.434050440788269, + "learning_rate": 1.7153119273974554e-05, + "loss": 0.5435, + "step": 25732 + }, + { + "epoch": 0.3216830420760519, + "grad_norm": 4.35767126083374, + "learning_rate": 1.715250940713149e-05, + "loss": 0.7556, + "step": 25734 + }, + { + "epoch": 0.3217080427010675, + "grad_norm": 4.254301071166992, + "learning_rate": 1.715189948581592e-05, + "loss": 1.765, + "step": 25736 + }, + { + "epoch": 0.32173304332608316, + "grad_norm": 6.664880275726318, + "learning_rate": 1.71512895100325e-05, + "loss": 2.6519, + "step": 25738 + }, + { + "epoch": 0.3217580439510988, + "grad_norm": 4.697746276855469, + "learning_rate": 1.715067947978587e-05, + "loss": 1.1295, + "step": 25740 + }, + { + "epoch": 0.3217830445761144, + "grad_norm": 1.601096272468567, + "learning_rate": 1.7150069395080672e-05, + "loss": 0.448, + "step": 25742 + }, + { + "epoch": 0.32180804520113004, + "grad_norm": 0.6742838025093079, + "learning_rate": 1.7149459255921563e-05, + "loss": 0.5352, + "step": 25744 + }, + { + "epoch": 0.32183304582614564, + "grad_norm": 1.8079346418380737, + "learning_rate": 1.7148849062313176e-05, + "loss": 1.4853, + "step": 25746 + }, + { + "epoch": 0.3218580464511613, + "grad_norm": 3.6139700412750244, + "learning_rate": 1.7148238814260172e-05, + "loss": 1.6159, + "step": 25748 + }, + { + "epoch": 0.32188304707617693, + "grad_norm": 3.2259464263916016, + "learning_rate": 1.7147628511767188e-05, + "loss": 0.4249, + "step": 25750 + }, + { + "epoch": 0.3219080477011925, + "grad_norm": 3.244140148162842, + "learning_rate": 1.7147018154838876e-05, + "loss": 0.6996, + "step": 25752 + }, + { + "epoch": 0.32193304832620817, + "grad_norm": 0.004282653797417879, + "learning_rate": 1.7146407743479888e-05, + "loss": 0.131, + "step": 25754 + }, + { + "epoch": 0.32195804895122376, + "grad_norm": 1.605790138244629, + "learning_rate": 1.7145797277694866e-05, + "loss": 0.7455, + "step": 25756 + }, + { + "epoch": 0.3219830495762394, + "grad_norm": 3.6220200061798096, + "learning_rate": 1.7145186757488464e-05, + "loss": 1.2747, + "step": 25758 + }, + { + "epoch": 0.32200805020125506, + "grad_norm": 3.1158483028411865, + "learning_rate": 1.714457618286533e-05, + "loss": 0.8601, + "step": 25760 + }, + { + "epoch": 0.32203305082627065, + "grad_norm": 1.9739818572998047, + "learning_rate": 1.7143965553830112e-05, + "loss": 0.844, + "step": 25762 + }, + { + "epoch": 0.3220580514512863, + "grad_norm": 0.3207736313343048, + "learning_rate": 1.7143354870387465e-05, + "loss": 0.6575, + "step": 25764 + }, + { + "epoch": 0.3220830520763019, + "grad_norm": 2.822190284729004, + "learning_rate": 1.7142744132542036e-05, + "loss": 0.5572, + "step": 25766 + }, + { + "epoch": 0.32210805270131754, + "grad_norm": 3.520833730697632, + "learning_rate": 1.7142133340298478e-05, + "loss": 0.524, + "step": 25768 + }, + { + "epoch": 0.3221330533263332, + "grad_norm": 3.3307864665985107, + "learning_rate": 1.7141522493661446e-05, + "loss": 1.0509, + "step": 25770 + }, + { + "epoch": 0.3221580539513488, + "grad_norm": 0.004394253715872765, + "learning_rate": 1.7140911592635584e-05, + "loss": 0.2949, + "step": 25772 + }, + { + "epoch": 0.3221830545763644, + "grad_norm": 3.080186605453491, + "learning_rate": 1.7140300637225552e-05, + "loss": 0.9621, + "step": 25774 + }, + { + "epoch": 0.32220805520138, + "grad_norm": 2.50016713142395, + "learning_rate": 1.7139689627435994e-05, + "loss": 0.5461, + "step": 25776 + }, + { + "epoch": 0.32223305582639566, + "grad_norm": 3.740330696105957, + "learning_rate": 1.7139078563271577e-05, + "loss": 1.3764, + "step": 25778 + }, + { + "epoch": 0.3222580564514113, + "grad_norm": 2.7204606533050537, + "learning_rate": 1.7138467444736944e-05, + "loss": 0.9849, + "step": 25780 + }, + { + "epoch": 0.3222830570764269, + "grad_norm": 2.3476357460021973, + "learning_rate": 1.7137856271836753e-05, + "loss": 0.9204, + "step": 25782 + }, + { + "epoch": 0.32230805770144255, + "grad_norm": 5.487532615661621, + "learning_rate": 1.7137245044575657e-05, + "loss": 1.7059, + "step": 25784 + }, + { + "epoch": 0.32233305832645814, + "grad_norm": 3.2364792823791504, + "learning_rate": 1.7136633762958315e-05, + "loss": 0.424, + "step": 25786 + }, + { + "epoch": 0.3223580589514738, + "grad_norm": 1.5044690370559692, + "learning_rate": 1.7136022426989377e-05, + "loss": 0.3163, + "step": 25788 + }, + { + "epoch": 0.32238305957648944, + "grad_norm": 4.496176719665527, + "learning_rate": 1.7135411036673503e-05, + "loss": 1.1953, + "step": 25790 + }, + { + "epoch": 0.32240806020150503, + "grad_norm": 4.979851722717285, + "learning_rate": 1.7134799592015343e-05, + "loss": 0.6061, + "step": 25792 + }, + { + "epoch": 0.3224330608265207, + "grad_norm": 2.2025094032287598, + "learning_rate": 1.7134188093019562e-05, + "loss": 1.0703, + "step": 25794 + }, + { + "epoch": 0.32245806145153627, + "grad_norm": 0.0020982034038752317, + "learning_rate": 1.7133576539690816e-05, + "loss": 1.3143, + "step": 25796 + }, + { + "epoch": 0.3224830620765519, + "grad_norm": 3.343592643737793, + "learning_rate": 1.7132964932033752e-05, + "loss": 1.4651, + "step": 25798 + }, + { + "epoch": 0.32250806270156757, + "grad_norm": 0.9209179878234863, + "learning_rate": 1.7132353270053043e-05, + "loss": 0.2647, + "step": 25800 + }, + { + "epoch": 0.32253306332658316, + "grad_norm": 4.359158039093018, + "learning_rate": 1.7131741553753335e-05, + "loss": 0.9327, + "step": 25802 + }, + { + "epoch": 0.3225580639515988, + "grad_norm": 9.176262855529785, + "learning_rate": 1.7131129783139296e-05, + "loss": 0.2495, + "step": 25804 + }, + { + "epoch": 0.3225830645766144, + "grad_norm": 3.252774238586426, + "learning_rate": 1.713051795821558e-05, + "loss": 1.5244, + "step": 25806 + }, + { + "epoch": 0.32260806520163005, + "grad_norm": 0.8260684609413147, + "learning_rate": 1.7129906078986845e-05, + "loss": 0.838, + "step": 25808 + }, + { + "epoch": 0.3226330658266457, + "grad_norm": 5.252350330352783, + "learning_rate": 1.7129294145457755e-05, + "loss": 1.3423, + "step": 25810 + }, + { + "epoch": 0.3226580664516613, + "grad_norm": 5.774500370025635, + "learning_rate": 1.7128682157632972e-05, + "loss": 2.9445, + "step": 25812 + }, + { + "epoch": 0.32268306707667693, + "grad_norm": 1.4994187355041504, + "learning_rate": 1.712807011551715e-05, + "loss": 0.0276, + "step": 25814 + }, + { + "epoch": 0.3227080677016925, + "grad_norm": 2.3714988231658936, + "learning_rate": 1.7127458019114955e-05, + "loss": 0.5132, + "step": 25816 + }, + { + "epoch": 0.3227330683267082, + "grad_norm": 1.9583215713500977, + "learning_rate": 1.712684586843105e-05, + "loss": 1.5195, + "step": 25818 + }, + { + "epoch": 0.3227580689517238, + "grad_norm": 2.411649703979492, + "learning_rate": 1.7126233663470092e-05, + "loss": 0.535, + "step": 25820 + }, + { + "epoch": 0.3227830695767394, + "grad_norm": 4.520749568939209, + "learning_rate": 1.712562140423675e-05, + "loss": 1.4144, + "step": 25822 + }, + { + "epoch": 0.32280807020175506, + "grad_norm": 3.7355117797851562, + "learning_rate": 1.7125009090735674e-05, + "loss": 1.6547, + "step": 25824 + }, + { + "epoch": 0.32283307082677065, + "grad_norm": 0.003473531221970916, + "learning_rate": 1.7124396722971542e-05, + "loss": 0.2924, + "step": 25826 + }, + { + "epoch": 0.3228580714517863, + "grad_norm": 0.24340827763080597, + "learning_rate": 1.7123784300949014e-05, + "loss": 0.2104, + "step": 25828 + }, + { + "epoch": 0.32288307207680195, + "grad_norm": 0.4588046669960022, + "learning_rate": 1.712317182467275e-05, + "loss": 0.3223, + "step": 25830 + }, + { + "epoch": 0.32290807270181754, + "grad_norm": 2.4819910526275635, + "learning_rate": 1.712255929414742e-05, + "loss": 1.0477, + "step": 25832 + }, + { + "epoch": 0.3229330733268332, + "grad_norm": 2.5274274349212646, + "learning_rate": 1.7121946709377678e-05, + "loss": 1.7212, + "step": 25834 + }, + { + "epoch": 0.3229580739518488, + "grad_norm": 3.0264673233032227, + "learning_rate": 1.7121334070368205e-05, + "loss": 0.7764, + "step": 25836 + }, + { + "epoch": 0.3229830745768644, + "grad_norm": 3.0543575286865234, + "learning_rate": 1.7120721377123653e-05, + "loss": 0.9699, + "step": 25838 + }, + { + "epoch": 0.3230080752018801, + "grad_norm": 0.0051254816353321075, + "learning_rate": 1.7120108629648698e-05, + "loss": 0.5691, + "step": 25840 + }, + { + "epoch": 0.32303307582689567, + "grad_norm": 1.2175815105438232, + "learning_rate": 1.7119495827948e-05, + "loss": 0.2675, + "step": 25842 + }, + { + "epoch": 0.3230580764519113, + "grad_norm": 0.07971689850091934, + "learning_rate": 1.711888297202623e-05, + "loss": 0.7972, + "step": 25844 + }, + { + "epoch": 0.3230830770769269, + "grad_norm": 4.7004714012146, + "learning_rate": 1.7118270061888055e-05, + "loss": 1.6854, + "step": 25846 + }, + { + "epoch": 0.32310807770194255, + "grad_norm": 7.3783674240112305, + "learning_rate": 1.7117657097538137e-05, + "loss": 1.2562, + "step": 25848 + }, + { + "epoch": 0.3231330783269582, + "grad_norm": 4.567975044250488, + "learning_rate": 1.7117044078981156e-05, + "loss": 0.975, + "step": 25850 + }, + { + "epoch": 0.3231580789519738, + "grad_norm": 0.0021637880709022284, + "learning_rate": 1.711643100622177e-05, + "loss": 0.1435, + "step": 25852 + }, + { + "epoch": 0.32318307957698944, + "grad_norm": 2.2354512214660645, + "learning_rate": 1.7115817879264648e-05, + "loss": 1.4732, + "step": 25854 + }, + { + "epoch": 0.32320808020200503, + "grad_norm": 5.512148857116699, + "learning_rate": 1.7115204698114468e-05, + "loss": 0.6066, + "step": 25856 + }, + { + "epoch": 0.3232330808270207, + "grad_norm": 1.9512497186660767, + "learning_rate": 1.7114591462775898e-05, + "loss": 0.6563, + "step": 25858 + }, + { + "epoch": 0.3232580814520363, + "grad_norm": 8.8252592086792, + "learning_rate": 1.7113978173253602e-05, + "loss": 0.6883, + "step": 25860 + }, + { + "epoch": 0.3232830820770519, + "grad_norm": 3.0894546508789062, + "learning_rate": 1.7113364829552253e-05, + "loss": 0.6803, + "step": 25862 + }, + { + "epoch": 0.32330808270206757, + "grad_norm": 4.286130428314209, + "learning_rate": 1.711275143167652e-05, + "loss": 0.6389, + "step": 25864 + }, + { + "epoch": 0.32333308332708316, + "grad_norm": 4.282675266265869, + "learning_rate": 1.7112137979631086e-05, + "loss": 0.5766, + "step": 25866 + }, + { + "epoch": 0.3233580839520988, + "grad_norm": 3.5130298137664795, + "learning_rate": 1.7111524473420612e-05, + "loss": 0.2157, + "step": 25868 + }, + { + "epoch": 0.32338308457711445, + "grad_norm": 2.6446826457977295, + "learning_rate": 1.7110910913049773e-05, + "loss": 1.3089, + "step": 25870 + }, + { + "epoch": 0.32340808520213005, + "grad_norm": 3.419579029083252, + "learning_rate": 1.7110297298523243e-05, + "loss": 0.7891, + "step": 25872 + }, + { + "epoch": 0.3234330858271457, + "grad_norm": 0.0028843192849308252, + "learning_rate": 1.710968362984569e-05, + "loss": 1.3727, + "step": 25874 + }, + { + "epoch": 0.3234580864521613, + "grad_norm": 2.1910033226013184, + "learning_rate": 1.7109069907021796e-05, + "loss": 0.9806, + "step": 25876 + }, + { + "epoch": 0.32348308707717693, + "grad_norm": 4.787562370300293, + "learning_rate": 1.710845613005623e-05, + "loss": 1.033, + "step": 25878 + }, + { + "epoch": 0.3235080877021926, + "grad_norm": 3.59104323387146, + "learning_rate": 1.7107842298953666e-05, + "loss": 1.634, + "step": 25880 + }, + { + "epoch": 0.3235330883272082, + "grad_norm": 2.5624618530273438, + "learning_rate": 1.710722841371878e-05, + "loss": 0.7124, + "step": 25882 + }, + { + "epoch": 0.3235580889522238, + "grad_norm": 3.6255171298980713, + "learning_rate": 1.710661447435625e-05, + "loss": 0.9415, + "step": 25884 + }, + { + "epoch": 0.3235830895772394, + "grad_norm": 1.6891086101531982, + "learning_rate": 1.7106000480870747e-05, + "loss": 0.0915, + "step": 25886 + }, + { + "epoch": 0.32360809020225506, + "grad_norm": 4.877310752868652, + "learning_rate": 1.7105386433266953e-05, + "loss": 1.4199, + "step": 25888 + }, + { + "epoch": 0.3236330908272707, + "grad_norm": 0.005912060849368572, + "learning_rate": 1.710477233154954e-05, + "loss": 0.7581, + "step": 25890 + }, + { + "epoch": 0.3236580914522863, + "grad_norm": 3.8591830730438232, + "learning_rate": 1.710415817572318e-05, + "loss": 1.2684, + "step": 25892 + }, + { + "epoch": 0.32368309207730195, + "grad_norm": 3.73465633392334, + "learning_rate": 1.710354396579256e-05, + "loss": 1.474, + "step": 25894 + }, + { + "epoch": 0.32370809270231754, + "grad_norm": 2.5852503776550293, + "learning_rate": 1.7102929701762353e-05, + "loss": 1.1536, + "step": 25896 + }, + { + "epoch": 0.3237330933273332, + "grad_norm": 2.7791800498962402, + "learning_rate": 1.710231538363724e-05, + "loss": 0.1266, + "step": 25898 + }, + { + "epoch": 0.32375809395234884, + "grad_norm": 3.814974069595337, + "learning_rate": 1.7101701011421892e-05, + "loss": 0.867, + "step": 25900 + }, + { + "epoch": 0.3237830945773644, + "grad_norm": 3.7054121494293213, + "learning_rate": 1.7101086585120995e-05, + "loss": 0.703, + "step": 25902 + }, + { + "epoch": 0.3238080952023801, + "grad_norm": 3.5877444744110107, + "learning_rate": 1.7100472104739232e-05, + "loss": 0.9693, + "step": 25904 + }, + { + "epoch": 0.32383309582739567, + "grad_norm": 0.5211531519889832, + "learning_rate": 1.709985757028127e-05, + "loss": 0.4231, + "step": 25906 + }, + { + "epoch": 0.3238580964524113, + "grad_norm": 4.800047397613525, + "learning_rate": 1.70992429817518e-05, + "loss": 0.6083, + "step": 25908 + }, + { + "epoch": 0.32388309707742696, + "grad_norm": 3.2458627223968506, + "learning_rate": 1.70986283391555e-05, + "loss": 1.1588, + "step": 25910 + }, + { + "epoch": 0.32390809770244255, + "grad_norm": 0.2645914554595947, + "learning_rate": 1.7098013642497052e-05, + "loss": 0.2097, + "step": 25912 + }, + { + "epoch": 0.3239330983274582, + "grad_norm": 6.925661087036133, + "learning_rate": 1.7097398891781137e-05, + "loss": 2.1853, + "step": 25914 + }, + { + "epoch": 0.3239580989524738, + "grad_norm": 0.005017337389290333, + "learning_rate": 1.7096784087012434e-05, + "loss": 0.4621, + "step": 25916 + }, + { + "epoch": 0.32398309957748944, + "grad_norm": 2.162057399749756, + "learning_rate": 1.7096169228195622e-05, + "loss": 0.4497, + "step": 25918 + }, + { + "epoch": 0.3240081002025051, + "grad_norm": 2.636401891708374, + "learning_rate": 1.7095554315335398e-05, + "loss": 0.8984, + "step": 25920 + }, + { + "epoch": 0.3240331008275207, + "grad_norm": 0.005201774183660746, + "learning_rate": 1.7094939348436427e-05, + "loss": 0.932, + "step": 25922 + }, + { + "epoch": 0.32405810145253633, + "grad_norm": 1.3949273824691772, + "learning_rate": 1.7094324327503404e-05, + "loss": 0.0446, + "step": 25924 + }, + { + "epoch": 0.3240831020775519, + "grad_norm": 1.7760792970657349, + "learning_rate": 1.709370925254101e-05, + "loss": 0.3496, + "step": 25926 + }, + { + "epoch": 0.32410810270256757, + "grad_norm": 2.896225690841675, + "learning_rate": 1.7093094123553932e-05, + "loss": 1.8772, + "step": 25928 + }, + { + "epoch": 0.3241331033275832, + "grad_norm": 2.6993472576141357, + "learning_rate": 1.709247894054685e-05, + "loss": 1.2334, + "step": 25930 + }, + { + "epoch": 0.3241581039525988, + "grad_norm": 3.1852993965148926, + "learning_rate": 1.709186370352445e-05, + "loss": 1.7824, + "step": 25932 + }, + { + "epoch": 0.32418310457761446, + "grad_norm": 4.201244354248047, + "learning_rate": 1.7091248412491423e-05, + "loss": 1.3807, + "step": 25934 + }, + { + "epoch": 0.32420810520263005, + "grad_norm": 0.20196503400802612, + "learning_rate": 1.7090633067452448e-05, + "loss": 0.0021, + "step": 25936 + }, + { + "epoch": 0.3242331058276457, + "grad_norm": 3.5647130012512207, + "learning_rate": 1.7090017668412215e-05, + "loss": 1.5082, + "step": 25938 + }, + { + "epoch": 0.32425810645266134, + "grad_norm": 4.383909702301025, + "learning_rate": 1.708940221537541e-05, + "loss": 1.5, + "step": 25940 + }, + { + "epoch": 0.32428310707767694, + "grad_norm": 5.201476573944092, + "learning_rate": 1.708878670834672e-05, + "loss": 1.0282, + "step": 25942 + }, + { + "epoch": 0.3243081077026926, + "grad_norm": 2.4068431854248047, + "learning_rate": 1.7088171147330835e-05, + "loss": 0.1372, + "step": 25944 + }, + { + "epoch": 0.3243331083277082, + "grad_norm": 3.8178882598876953, + "learning_rate": 1.7087555532332437e-05, + "loss": 1.284, + "step": 25946 + }, + { + "epoch": 0.3243581089527238, + "grad_norm": 0.23576612770557404, + "learning_rate": 1.708693986335622e-05, + "loss": 1.1379, + "step": 25948 + }, + { + "epoch": 0.32438310957773947, + "grad_norm": 2.3374345302581787, + "learning_rate": 1.7086324140406873e-05, + "loss": 0.3272, + "step": 25950 + }, + { + "epoch": 0.32440811020275506, + "grad_norm": 11.041858673095703, + "learning_rate": 1.7085708363489078e-05, + "loss": 1.6621, + "step": 25952 + }, + { + "epoch": 0.3244331108277707, + "grad_norm": 0.004354840610176325, + "learning_rate": 1.7085092532607536e-05, + "loss": 0.0167, + "step": 25954 + }, + { + "epoch": 0.3244581114527863, + "grad_norm": 0.8758553266525269, + "learning_rate": 1.708447664776693e-05, + "loss": 0.7777, + "step": 25956 + }, + { + "epoch": 0.32448311207780195, + "grad_norm": 1.5059750080108643, + "learning_rate": 1.7083860708971952e-05, + "loss": 0.3613, + "step": 25958 + }, + { + "epoch": 0.3245081127028176, + "grad_norm": 3.9463772773742676, + "learning_rate": 1.7083244716227292e-05, + "loss": 1.503, + "step": 25960 + }, + { + "epoch": 0.3245331133278332, + "grad_norm": 2.1332004070281982, + "learning_rate": 1.708262866953764e-05, + "loss": 0.9869, + "step": 25962 + }, + { + "epoch": 0.32455811395284884, + "grad_norm": 6.639357566833496, + "learning_rate": 1.708201256890769e-05, + "loss": 1.1539, + "step": 25964 + }, + { + "epoch": 0.32458311457786443, + "grad_norm": 1.533440113067627, + "learning_rate": 1.708139641434214e-05, + "loss": 0.9226, + "step": 25966 + }, + { + "epoch": 0.3246081152028801, + "grad_norm": 3.6588854789733887, + "learning_rate": 1.708078020584567e-05, + "loss": 2.387, + "step": 25968 + }, + { + "epoch": 0.3246331158278957, + "grad_norm": 2.3046810626983643, + "learning_rate": 1.7080163943422977e-05, + "loss": 0.5903, + "step": 25970 + }, + { + "epoch": 0.3246581164529113, + "grad_norm": 3.904862880706787, + "learning_rate": 1.7079547627078762e-05, + "loss": 1.6754, + "step": 25972 + }, + { + "epoch": 0.32468311707792696, + "grad_norm": 2.901707887649536, + "learning_rate": 1.707893125681771e-05, + "loss": 1.3857, + "step": 25974 + }, + { + "epoch": 0.32470811770294256, + "grad_norm": 2.387568712234497, + "learning_rate": 1.707831483264452e-05, + "loss": 1.3486, + "step": 25976 + }, + { + "epoch": 0.3247331183279582, + "grad_norm": 2.0933477878570557, + "learning_rate": 1.7077698354563884e-05, + "loss": 1.3778, + "step": 25978 + }, + { + "epoch": 0.32475811895297385, + "grad_norm": 3.5640363693237305, + "learning_rate": 1.7077081822580503e-05, + "loss": 1.4266, + "step": 25980 + }, + { + "epoch": 0.32478311957798944, + "grad_norm": 15.458965301513672, + "learning_rate": 1.7076465236699063e-05, + "loss": 1.3741, + "step": 25982 + }, + { + "epoch": 0.3248081202030051, + "grad_norm": 4.7614617347717285, + "learning_rate": 1.7075848596924265e-05, + "loss": 1.3496, + "step": 25984 + }, + { + "epoch": 0.3248331208280207, + "grad_norm": 3.141231060028076, + "learning_rate": 1.7075231903260804e-05, + "loss": 0.7581, + "step": 25986 + }, + { + "epoch": 0.32485812145303633, + "grad_norm": 5.882469654083252, + "learning_rate": 1.707461515571338e-05, + "loss": 0.8253, + "step": 25988 + }, + { + "epoch": 0.324883122078052, + "grad_norm": 0.005536118522286415, + "learning_rate": 1.7073998354286686e-05, + "loss": 0.4373, + "step": 25990 + }, + { + "epoch": 0.32490812270306757, + "grad_norm": 2.1862940788269043, + "learning_rate": 1.707338149898542e-05, + "loss": 1.5912, + "step": 25992 + }, + { + "epoch": 0.3249331233280832, + "grad_norm": 3.059880256652832, + "learning_rate": 1.7072764589814285e-05, + "loss": 1.4489, + "step": 25994 + }, + { + "epoch": 0.3249581239530988, + "grad_norm": 7.180861949920654, + "learning_rate": 1.707214762677797e-05, + "loss": 0.6419, + "step": 25996 + }, + { + "epoch": 0.32498312457811446, + "grad_norm": 27.492557525634766, + "learning_rate": 1.707153060988118e-05, + "loss": 0.6202, + "step": 25998 + }, + { + "epoch": 0.3250081252031301, + "grad_norm": 1.7593977451324463, + "learning_rate": 1.7070913539128612e-05, + "loss": 0.0622, + "step": 26000 + }, + { + "epoch": 0.3250331258281457, + "grad_norm": 1.117924690246582, + "learning_rate": 1.707029641452497e-05, + "loss": 0.8798, + "step": 26002 + }, + { + "epoch": 0.32505812645316134, + "grad_norm": 4.028555870056152, + "learning_rate": 1.7069679236074946e-05, + "loss": 1.6296, + "step": 26004 + }, + { + "epoch": 0.32508312707817694, + "grad_norm": 1.1487239599227905, + "learning_rate": 1.7069062003783247e-05, + "loss": 0.1075, + "step": 26006 + }, + { + "epoch": 0.3251081277031926, + "grad_norm": 4.746738910675049, + "learning_rate": 1.7068444717654575e-05, + "loss": 0.9302, + "step": 26008 + }, + { + "epoch": 0.32513312832820823, + "grad_norm": 6.629581451416016, + "learning_rate": 1.706782737769362e-05, + "loss": 1.4923, + "step": 26010 + }, + { + "epoch": 0.3251581289532238, + "grad_norm": 3.3234009742736816, + "learning_rate": 1.7067209983905096e-05, + "loss": 0.8622, + "step": 26012 + }, + { + "epoch": 0.32518312957823947, + "grad_norm": 1.6941508054733276, + "learning_rate": 1.7066592536293698e-05, + "loss": 0.1104, + "step": 26014 + }, + { + "epoch": 0.32520813020325506, + "grad_norm": 1.4881834983825684, + "learning_rate": 1.7065975034864134e-05, + "loss": 0.7163, + "step": 26016 + }, + { + "epoch": 0.3252331308282707, + "grad_norm": 0.8604161143302917, + "learning_rate": 1.70653574796211e-05, + "loss": 0.033, + "step": 26018 + }, + { + "epoch": 0.32525813145328636, + "grad_norm": 4.124701023101807, + "learning_rate": 1.7064739870569306e-05, + "loss": 0.6234, + "step": 26020 + }, + { + "epoch": 0.32528313207830195, + "grad_norm": 2.646334409713745, + "learning_rate": 1.7064122207713446e-05, + "loss": 0.9392, + "step": 26022 + }, + { + "epoch": 0.3253081327033176, + "grad_norm": 2.008150815963745, + "learning_rate": 1.7063504491058235e-05, + "loss": 1.7945, + "step": 26024 + }, + { + "epoch": 0.3253331333283332, + "grad_norm": 2.7213313579559326, + "learning_rate": 1.706288672060837e-05, + "loss": 1.3289, + "step": 26026 + }, + { + "epoch": 0.32535813395334884, + "grad_norm": 4.629852294921875, + "learning_rate": 1.7062268896368564e-05, + "loss": 0.6662, + "step": 26028 + }, + { + "epoch": 0.3253831345783645, + "grad_norm": 3.686025381088257, + "learning_rate": 1.7061651018343514e-05, + "loss": 0.762, + "step": 26030 + }, + { + "epoch": 0.3254081352033801, + "grad_norm": 4.388864040374756, + "learning_rate": 1.7061033086537926e-05, + "loss": 1.1555, + "step": 26032 + }, + { + "epoch": 0.3254331358283957, + "grad_norm": 2.6844327449798584, + "learning_rate": 1.706041510095651e-05, + "loss": 0.4954, + "step": 26034 + }, + { + "epoch": 0.3254581364534113, + "grad_norm": 3.7250497341156006, + "learning_rate": 1.7059797061603972e-05, + "loss": 1.3302, + "step": 26036 + }, + { + "epoch": 0.32548313707842697, + "grad_norm": 0.0059311664663255215, + "learning_rate": 1.7059178968485014e-05, + "loss": 0.6049, + "step": 26038 + }, + { + "epoch": 0.3255081377034426, + "grad_norm": 3.951599597930908, + "learning_rate": 1.705856082160435e-05, + "loss": 1.3612, + "step": 26040 + }, + { + "epoch": 0.3255331383284582, + "grad_norm": 2.5822691917419434, + "learning_rate": 1.7057942620966684e-05, + "loss": 0.3417, + "step": 26042 + }, + { + "epoch": 0.32555813895347385, + "grad_norm": 2.199854850769043, + "learning_rate": 1.705732436657673e-05, + "loss": 2.1634, + "step": 26044 + }, + { + "epoch": 0.32558313957848944, + "grad_norm": 2.748286008834839, + "learning_rate": 1.7056706058439187e-05, + "loss": 0.7872, + "step": 26046 + }, + { + "epoch": 0.3256081402035051, + "grad_norm": 1.4422365427017212, + "learning_rate": 1.7056087696558766e-05, + "loss": 0.9417, + "step": 26048 + }, + { + "epoch": 0.32563314082852074, + "grad_norm": 3.807577610015869, + "learning_rate": 1.7055469280940183e-05, + "loss": 1.2086, + "step": 26050 + }, + { + "epoch": 0.32565814145353633, + "grad_norm": 2.6167147159576416, + "learning_rate": 1.7054850811588144e-05, + "loss": 1.407, + "step": 26052 + }, + { + "epoch": 0.325683142078552, + "grad_norm": 2.760146141052246, + "learning_rate": 1.705423228850736e-05, + "loss": 0.8977, + "step": 26054 + }, + { + "epoch": 0.32570814270356757, + "grad_norm": 1.02920401096344, + "learning_rate": 1.705361371170254e-05, + "loss": 0.7406, + "step": 26056 + }, + { + "epoch": 0.3257331433285832, + "grad_norm": 0.8032989501953125, + "learning_rate": 1.7052995081178388e-05, + "loss": 0.6353, + "step": 26058 + }, + { + "epoch": 0.32575814395359887, + "grad_norm": 4.31263542175293, + "learning_rate": 1.7052376396939633e-05, + "loss": 0.8821, + "step": 26060 + }, + { + "epoch": 0.32578314457861446, + "grad_norm": 3.5005478858947754, + "learning_rate": 1.705175765899097e-05, + "loss": 1.1637, + "step": 26062 + }, + { + "epoch": 0.3258081452036301, + "grad_norm": 2.5748109817504883, + "learning_rate": 1.705113886733712e-05, + "loss": 0.2062, + "step": 26064 + }, + { + "epoch": 0.3258331458286457, + "grad_norm": 10.127583503723145, + "learning_rate": 1.7050520021982796e-05, + "loss": 1.75, + "step": 26066 + }, + { + "epoch": 0.32585814645366135, + "grad_norm": 3.097057342529297, + "learning_rate": 1.7049901122932704e-05, + "loss": 0.7424, + "step": 26068 + }, + { + "epoch": 0.325883147078677, + "grad_norm": 0.9986922740936279, + "learning_rate": 1.7049282170191562e-05, + "loss": 0.1385, + "step": 26070 + }, + { + "epoch": 0.3259081477036926, + "grad_norm": 3.37034010887146, + "learning_rate": 1.704866316376409e-05, + "loss": 1.077, + "step": 26072 + }, + { + "epoch": 0.32593314832870823, + "grad_norm": 0.0055918339639902115, + "learning_rate": 1.7048044103654993e-05, + "loss": 0.5022, + "step": 26074 + }, + { + "epoch": 0.3259581489537238, + "grad_norm": 2.44917368888855, + "learning_rate": 1.7047424989868985e-05, + "loss": 0.7718, + "step": 26076 + }, + { + "epoch": 0.3259831495787395, + "grad_norm": 3.326326370239258, + "learning_rate": 1.7046805822410788e-05, + "loss": 1.5484, + "step": 26078 + }, + { + "epoch": 0.3260081502037551, + "grad_norm": 2.889415740966797, + "learning_rate": 1.7046186601285116e-05, + "loss": 1.4184, + "step": 26080 + }, + { + "epoch": 0.3260331508287707, + "grad_norm": 2.5599398612976074, + "learning_rate": 1.704556732649668e-05, + "loss": 0.7223, + "step": 26082 + }, + { + "epoch": 0.32605815145378636, + "grad_norm": 4.127289295196533, + "learning_rate": 1.70449479980502e-05, + "loss": 0.2094, + "step": 26084 + }, + { + "epoch": 0.32608315207880195, + "grad_norm": 3.670778512954712, + "learning_rate": 1.7044328615950398e-05, + "loss": 1.919, + "step": 26086 + }, + { + "epoch": 0.3261081527038176, + "grad_norm": 2.8522121906280518, + "learning_rate": 1.7043709180201976e-05, + "loss": 0.6623, + "step": 26088 + }, + { + "epoch": 0.32613315332883325, + "grad_norm": 6.157789707183838, + "learning_rate": 1.7043089690809666e-05, + "loss": 0.9594, + "step": 26090 + }, + { + "epoch": 0.32615815395384884, + "grad_norm": 3.5776994228363037, + "learning_rate": 1.7042470147778182e-05, + "loss": 0.6222, + "step": 26092 + }, + { + "epoch": 0.3261831545788645, + "grad_norm": 0.0032496238127350807, + "learning_rate": 1.704185055111224e-05, + "loss": 0.5442, + "step": 26094 + }, + { + "epoch": 0.3262081552038801, + "grad_norm": 4.319736003875732, + "learning_rate": 1.704123090081656e-05, + "loss": 1.1207, + "step": 26096 + }, + { + "epoch": 0.3262331558288957, + "grad_norm": 0.8553602695465088, + "learning_rate": 1.7040611196895862e-05, + "loss": 0.0175, + "step": 26098 + }, + { + "epoch": 0.3262581564539114, + "grad_norm": 0.4496035575866699, + "learning_rate": 1.7039991439354863e-05, + "loss": 0.0073, + "step": 26100 + }, + { + "epoch": 0.32628315707892697, + "grad_norm": 2.0878257751464844, + "learning_rate": 1.7039371628198284e-05, + "loss": 0.6344, + "step": 26102 + }, + { + "epoch": 0.3263081577039426, + "grad_norm": 5.051883697509766, + "learning_rate": 1.703875176343085e-05, + "loss": 0.7096, + "step": 26104 + }, + { + "epoch": 0.3263331583289582, + "grad_norm": 4.227267265319824, + "learning_rate": 1.7038131845057274e-05, + "loss": 1.8357, + "step": 26106 + }, + { + "epoch": 0.32635815895397385, + "grad_norm": 1.891803503036499, + "learning_rate": 1.703751187308228e-05, + "loss": 0.5038, + "step": 26108 + }, + { + "epoch": 0.3263831595789895, + "grad_norm": 2.949404716491699, + "learning_rate": 1.7036891847510597e-05, + "loss": 0.5376, + "step": 26110 + }, + { + "epoch": 0.3264081602040051, + "grad_norm": 2.0510175228118896, + "learning_rate": 1.7036271768346936e-05, + "loss": 0.959, + "step": 26112 + }, + { + "epoch": 0.32643316082902074, + "grad_norm": 3.71357798576355, + "learning_rate": 1.7035651635596027e-05, + "loss": 1.4857, + "step": 26114 + }, + { + "epoch": 0.32645816145403633, + "grad_norm": 0.3359908163547516, + "learning_rate": 1.7035031449262586e-05, + "loss": 0.0579, + "step": 26116 + }, + { + "epoch": 0.326483162079052, + "grad_norm": 0.0052776820957660675, + "learning_rate": 1.7034411209351346e-05, + "loss": 0.2749, + "step": 26118 + }, + { + "epoch": 0.32650816270406763, + "grad_norm": 2.7148663997650146, + "learning_rate": 1.703379091586702e-05, + "loss": 0.6717, + "step": 26120 + }, + { + "epoch": 0.3265331633290832, + "grad_norm": 0.01626475155353546, + "learning_rate": 1.703317056881434e-05, + "loss": 0.6716, + "step": 26122 + }, + { + "epoch": 0.32655816395409887, + "grad_norm": 2.941835641860962, + "learning_rate": 1.7032550168198026e-05, + "loss": 1.4043, + "step": 26124 + }, + { + "epoch": 0.32658316457911446, + "grad_norm": 4.763500690460205, + "learning_rate": 1.7031929714022806e-05, + "loss": 1.6111, + "step": 26126 + }, + { + "epoch": 0.3266081652041301, + "grad_norm": 3.167543888092041, + "learning_rate": 1.7031309206293403e-05, + "loss": 0.9954, + "step": 26128 + }, + { + "epoch": 0.32663316582914576, + "grad_norm": 0.004192139953374863, + "learning_rate": 1.7030688645014542e-05, + "loss": 0.834, + "step": 26130 + }, + { + "epoch": 0.32665816645416135, + "grad_norm": 3.5870485305786133, + "learning_rate": 1.7030068030190952e-05, + "loss": 0.5756, + "step": 26132 + }, + { + "epoch": 0.326683167079177, + "grad_norm": 1.4159058332443237, + "learning_rate": 1.7029447361827357e-05, + "loss": 0.9713, + "step": 26134 + }, + { + "epoch": 0.3267081677041926, + "grad_norm": 2.9132466316223145, + "learning_rate": 1.7028826639928485e-05, + "loss": 1.3061, + "step": 26136 + }, + { + "epoch": 0.32673316832920823, + "grad_norm": 3.775733232498169, + "learning_rate": 1.7028205864499066e-05, + "loss": 1.3492, + "step": 26138 + }, + { + "epoch": 0.3267581689542239, + "grad_norm": 4.095211982727051, + "learning_rate": 1.702758503554382e-05, + "loss": 1.4021, + "step": 26140 + }, + { + "epoch": 0.3267831695792395, + "grad_norm": 3.8421876430511475, + "learning_rate": 1.7026964153067483e-05, + "loss": 0.6639, + "step": 26142 + }, + { + "epoch": 0.3268081702042551, + "grad_norm": 0.003049250924959779, + "learning_rate": 1.7026343217074782e-05, + "loss": 0.5061, + "step": 26144 + }, + { + "epoch": 0.3268331708292707, + "grad_norm": 2.3633980751037598, + "learning_rate": 1.7025722227570443e-05, + "loss": 0.9138, + "step": 26146 + }, + { + "epoch": 0.32685817145428636, + "grad_norm": 3.8588180541992188, + "learning_rate": 1.7025101184559196e-05, + "loss": 1.2412, + "step": 26148 + }, + { + "epoch": 0.326883172079302, + "grad_norm": 9.891597747802734, + "learning_rate": 1.7024480088045776e-05, + "loss": 0.3388, + "step": 26150 + }, + { + "epoch": 0.3269081727043176, + "grad_norm": 3.8461830615997314, + "learning_rate": 1.7023858938034904e-05, + "loss": 0.7618, + "step": 26152 + }, + { + "epoch": 0.32693317332933325, + "grad_norm": 2.9879794120788574, + "learning_rate": 1.7023237734531316e-05, + "loss": 0.9076, + "step": 26154 + }, + { + "epoch": 0.32695817395434884, + "grad_norm": 5.649870872497559, + "learning_rate": 1.7022616477539746e-05, + "loss": 1.1757, + "step": 26156 + }, + { + "epoch": 0.3269831745793645, + "grad_norm": 1.4624361991882324, + "learning_rate": 1.7021995167064923e-05, + "loss": 0.2699, + "step": 26158 + }, + { + "epoch": 0.32700817520438014, + "grad_norm": 3.7013497352600098, + "learning_rate": 1.7021373803111574e-05, + "loss": 1.5966, + "step": 26160 + }, + { + "epoch": 0.32703317582939573, + "grad_norm": 6.2391252517700195, + "learning_rate": 1.7020752385684435e-05, + "loss": 1.5109, + "step": 26162 + }, + { + "epoch": 0.3270581764544114, + "grad_norm": 4.306336402893066, + "learning_rate": 1.7020130914788243e-05, + "loss": 0.7545, + "step": 26164 + }, + { + "epoch": 0.32708317707942697, + "grad_norm": 3.1783242225646973, + "learning_rate": 1.701950939042772e-05, + "loss": 0.5049, + "step": 26166 + }, + { + "epoch": 0.3271081777044426, + "grad_norm": 4.353357315063477, + "learning_rate": 1.701888781260761e-05, + "loss": 1.5043, + "step": 26168 + }, + { + "epoch": 0.32713317832945826, + "grad_norm": 2.4200003147125244, + "learning_rate": 1.7018266181332644e-05, + "loss": 1.2659, + "step": 26170 + }, + { + "epoch": 0.32715817895447385, + "grad_norm": 2.198160409927368, + "learning_rate": 1.701764449660755e-05, + "loss": 1.715, + "step": 26172 + }, + { + "epoch": 0.3271831795794895, + "grad_norm": 0.002887970069423318, + "learning_rate": 1.7017022758437075e-05, + "loss": 0.0001, + "step": 26174 + }, + { + "epoch": 0.3272081802045051, + "grad_norm": 1.6149286031723022, + "learning_rate": 1.7016400966825942e-05, + "loss": 0.241, + "step": 26176 + }, + { + "epoch": 0.32723318082952074, + "grad_norm": 1.1861029863357544, + "learning_rate": 1.7015779121778896e-05, + "loss": 0.059, + "step": 26178 + }, + { + "epoch": 0.3272581814545364, + "grad_norm": 2.916849136352539, + "learning_rate": 1.7015157223300665e-05, + "loss": 0.6998, + "step": 26180 + }, + { + "epoch": 0.327283182079552, + "grad_norm": 3.0863559246063232, + "learning_rate": 1.701453527139599e-05, + "loss": 0.304, + "step": 26182 + }, + { + "epoch": 0.32730818270456763, + "grad_norm": 2.5286858081817627, + "learning_rate": 1.7013913266069605e-05, + "loss": 0.1802, + "step": 26184 + }, + { + "epoch": 0.3273331833295832, + "grad_norm": 0.0028131597209721804, + "learning_rate": 1.701329120732625e-05, + "loss": 0.012, + "step": 26186 + }, + { + "epoch": 0.32735818395459887, + "grad_norm": 0.005637265741825104, + "learning_rate": 1.7012669095170656e-05, + "loss": 0.7481, + "step": 26188 + }, + { + "epoch": 0.3273831845796145, + "grad_norm": 7.100647449493408, + "learning_rate": 1.7012046929607572e-05, + "loss": 1.1033, + "step": 26190 + }, + { + "epoch": 0.3274081852046301, + "grad_norm": 2.8778367042541504, + "learning_rate": 1.7011424710641727e-05, + "loss": 1.0122, + "step": 26192 + }, + { + "epoch": 0.32743318582964576, + "grad_norm": 0.0030943690799176693, + "learning_rate": 1.7010802438277864e-05, + "loss": 0.8007, + "step": 26194 + }, + { + "epoch": 0.32745818645466135, + "grad_norm": 2.953334093093872, + "learning_rate": 1.701018011252072e-05, + "loss": 0.4566, + "step": 26196 + }, + { + "epoch": 0.327483187079677, + "grad_norm": 0.002656008815392852, + "learning_rate": 1.700955773337504e-05, + "loss": 0.6172, + "step": 26198 + }, + { + "epoch": 0.32750818770469264, + "grad_norm": 3.712552309036255, + "learning_rate": 1.7008935300845552e-05, + "loss": 1.7996, + "step": 26200 + }, + { + "epoch": 0.32753318832970824, + "grad_norm": 4.4423956871032715, + "learning_rate": 1.700831281493701e-05, + "loss": 0.4809, + "step": 26202 + }, + { + "epoch": 0.3275581889547239, + "grad_norm": 3.894334554672241, + "learning_rate": 1.7007690275654146e-05, + "loss": 1.2836, + "step": 26204 + }, + { + "epoch": 0.3275831895797395, + "grad_norm": 6.638822078704834, + "learning_rate": 1.7007067683001705e-05, + "loss": 2.2468, + "step": 26206 + }, + { + "epoch": 0.3276081902047551, + "grad_norm": 3.8698196411132812, + "learning_rate": 1.7006445036984427e-05, + "loss": 0.8357, + "step": 26208 + }, + { + "epoch": 0.32763319082977077, + "grad_norm": 3.0203888416290283, + "learning_rate": 1.7005822337607054e-05, + "loss": 0.7998, + "step": 26210 + }, + { + "epoch": 0.32765819145478636, + "grad_norm": 4.461034774780273, + "learning_rate": 1.700519958487433e-05, + "loss": 2.0882, + "step": 26212 + }, + { + "epoch": 0.327683192079802, + "grad_norm": 4.259728908538818, + "learning_rate": 1.7004576778790993e-05, + "loss": 1.509, + "step": 26214 + }, + { + "epoch": 0.3277081927048176, + "grad_norm": 2.582664966583252, + "learning_rate": 1.7003953919361795e-05, + "loss": 0.4255, + "step": 26216 + }, + { + "epoch": 0.32773319332983325, + "grad_norm": 2.945751905441284, + "learning_rate": 1.700333100659147e-05, + "loss": 1.1352, + "step": 26218 + }, + { + "epoch": 0.3277581939548489, + "grad_norm": 1.9005190134048462, + "learning_rate": 1.700270804048477e-05, + "loss": 0.2987, + "step": 26220 + }, + { + "epoch": 0.3277831945798645, + "grad_norm": 2.3512089252471924, + "learning_rate": 1.7002085021046434e-05, + "loss": 1.1967, + "step": 26222 + }, + { + "epoch": 0.32780819520488014, + "grad_norm": 2.8805272579193115, + "learning_rate": 1.7001461948281208e-05, + "loss": 1.3578, + "step": 26224 + }, + { + "epoch": 0.32783319582989573, + "grad_norm": 4.996901035308838, + "learning_rate": 1.7000838822193837e-05, + "loss": 0.2524, + "step": 26226 + }, + { + "epoch": 0.3278581964549114, + "grad_norm": 0.34544458985328674, + "learning_rate": 1.700021564278907e-05, + "loss": 0.1541, + "step": 26228 + }, + { + "epoch": 0.327883197079927, + "grad_norm": 5.569890975952148, + "learning_rate": 1.699959241007165e-05, + "loss": 0.5011, + "step": 26230 + }, + { + "epoch": 0.3279081977049426, + "grad_norm": 3.0883216857910156, + "learning_rate": 1.699896912404632e-05, + "loss": 0.7203, + "step": 26232 + }, + { + "epoch": 0.32793319832995826, + "grad_norm": 0.8271772265434265, + "learning_rate": 1.6998345784717837e-05, + "loss": 0.0235, + "step": 26234 + }, + { + "epoch": 0.32795819895497386, + "grad_norm": 2.8931710720062256, + "learning_rate": 1.6997722392090943e-05, + "loss": 1.4327, + "step": 26236 + }, + { + "epoch": 0.3279831995799895, + "grad_norm": 4.853851795196533, + "learning_rate": 1.699709894617038e-05, + "loss": 0.6803, + "step": 26238 + }, + { + "epoch": 0.32800820020500515, + "grad_norm": 1.720834732055664, + "learning_rate": 1.6996475446960903e-05, + "loss": 0.9184, + "step": 26240 + }, + { + "epoch": 0.32803320083002074, + "grad_norm": 3.6790735721588135, + "learning_rate": 1.699585189446726e-05, + "loss": 1.3486, + "step": 26242 + }, + { + "epoch": 0.3280582014550364, + "grad_norm": 4.87925386428833, + "learning_rate": 1.6995228288694195e-05, + "loss": 1.3185, + "step": 26244 + }, + { + "epoch": 0.328083202080052, + "grad_norm": 9.790279388427734, + "learning_rate": 1.699460462964646e-05, + "loss": 1.0252, + "step": 26246 + }, + { + "epoch": 0.32810820270506763, + "grad_norm": 5.348605632781982, + "learning_rate": 1.6993980917328807e-05, + "loss": 1.6016, + "step": 26248 + }, + { + "epoch": 0.3281332033300833, + "grad_norm": 1.8188163042068481, + "learning_rate": 1.6993357151745984e-05, + "loss": 0.967, + "step": 26250 + }, + { + "epoch": 0.32815820395509887, + "grad_norm": 0.00750908674672246, + "learning_rate": 1.6992733332902743e-05, + "loss": 0.4817, + "step": 26252 + }, + { + "epoch": 0.3281832045801145, + "grad_norm": 0.005291312467306852, + "learning_rate": 1.6992109460803834e-05, + "loss": 0.6384, + "step": 26254 + }, + { + "epoch": 0.3282082052051301, + "grad_norm": 4.4440741539001465, + "learning_rate": 1.6991485535454006e-05, + "loss": 1.7429, + "step": 26256 + }, + { + "epoch": 0.32823320583014576, + "grad_norm": 0.024770328775048256, + "learning_rate": 1.6990861556858013e-05, + "loss": 0.4371, + "step": 26258 + }, + { + "epoch": 0.3282582064551614, + "grad_norm": 0.0060096625238657, + "learning_rate": 1.699023752502061e-05, + "loss": 0.4031, + "step": 26260 + }, + { + "epoch": 0.328283207080177, + "grad_norm": 1.936419129371643, + "learning_rate": 1.6989613439946544e-05, + "loss": 1.6596, + "step": 26262 + }, + { + "epoch": 0.32830820770519265, + "grad_norm": 0.21588510274887085, + "learning_rate": 1.698898930164057e-05, + "loss": 0.0206, + "step": 26264 + }, + { + "epoch": 0.32833320833020824, + "grad_norm": 4.609457015991211, + "learning_rate": 1.698836511010744e-05, + "loss": 0.569, + "step": 26266 + }, + { + "epoch": 0.3283582089552239, + "grad_norm": 0.7075909972190857, + "learning_rate": 1.6987740865351915e-05, + "loss": 0.2384, + "step": 26268 + }, + { + "epoch": 0.32838320958023953, + "grad_norm": 3.6285533905029297, + "learning_rate": 1.6987116567378744e-05, + "loss": 0.9503, + "step": 26270 + }, + { + "epoch": 0.3284082102052551, + "grad_norm": 0.002417638199403882, + "learning_rate": 1.6986492216192677e-05, + "loss": 0.1819, + "step": 26272 + }, + { + "epoch": 0.32843321083027077, + "grad_norm": 5.860667705535889, + "learning_rate": 1.6985867811798473e-05, + "loss": 1.0715, + "step": 26274 + }, + { + "epoch": 0.32845821145528636, + "grad_norm": 3.2398486137390137, + "learning_rate": 1.698524335420089e-05, + "loss": 0.9517, + "step": 26276 + }, + { + "epoch": 0.328483212080302, + "grad_norm": 2.37362003326416, + "learning_rate": 1.698461884340468e-05, + "loss": 0.9575, + "step": 26278 + }, + { + "epoch": 0.32850821270531766, + "grad_norm": 3.9085562229156494, + "learning_rate": 1.6983994279414603e-05, + "loss": 1.2851, + "step": 26280 + }, + { + "epoch": 0.32853321333033325, + "grad_norm": 0.033615730702877045, + "learning_rate": 1.6983369662235413e-05, + "loss": 0.0508, + "step": 26282 + }, + { + "epoch": 0.3285582139553489, + "grad_norm": 2.9170145988464355, + "learning_rate": 1.6982744991871864e-05, + "loss": 1.4597, + "step": 26284 + }, + { + "epoch": 0.3285832145803645, + "grad_norm": 2.030776023864746, + "learning_rate": 1.6982120268328717e-05, + "loss": 0.0945, + "step": 26286 + }, + { + "epoch": 0.32860821520538014, + "grad_norm": 1.86750066280365, + "learning_rate": 1.6981495491610733e-05, + "loss": 1.1564, + "step": 26288 + }, + { + "epoch": 0.3286332158303958, + "grad_norm": 19.489940643310547, + "learning_rate": 1.698087066172266e-05, + "loss": 1.9517, + "step": 26290 + }, + { + "epoch": 0.3286582164554114, + "grad_norm": 4.375030517578125, + "learning_rate": 1.698024577866927e-05, + "loss": 1.8751, + "step": 26292 + }, + { + "epoch": 0.328683217080427, + "grad_norm": 4.888244152069092, + "learning_rate": 1.6979620842455314e-05, + "loss": 0.7848, + "step": 26294 + }, + { + "epoch": 0.3287082177054426, + "grad_norm": 2.5733678340911865, + "learning_rate": 1.697899585308555e-05, + "loss": 0.941, + "step": 26296 + }, + { + "epoch": 0.32873321833045827, + "grad_norm": 2.6266438961029053, + "learning_rate": 1.697837081056474e-05, + "loss": 0.6294, + "step": 26298 + }, + { + "epoch": 0.3287582189554739, + "grad_norm": 1.3343472480773926, + "learning_rate": 1.697774571489765e-05, + "loss": 0.9345, + "step": 26300 + }, + { + "epoch": 0.3287832195804895, + "grad_norm": 3.5791962146759033, + "learning_rate": 1.697712056608903e-05, + "loss": 0.8665, + "step": 26302 + }, + { + "epoch": 0.32880822020550515, + "grad_norm": 4.377987861633301, + "learning_rate": 1.6976495364143647e-05, + "loss": 2.0264, + "step": 26304 + }, + { + "epoch": 0.32883322083052074, + "grad_norm": 1.7492907047271729, + "learning_rate": 1.6975870109066262e-05, + "loss": 0.151, + "step": 26306 + }, + { + "epoch": 0.3288582214555364, + "grad_norm": 4.7517008781433105, + "learning_rate": 1.6975244800861642e-05, + "loss": 1.1578, + "step": 26308 + }, + { + "epoch": 0.32888322208055204, + "grad_norm": 2.8298866748809814, + "learning_rate": 1.6974619439534538e-05, + "loss": 0.6738, + "step": 26310 + }, + { + "epoch": 0.32890822270556763, + "grad_norm": 2.0005621910095215, + "learning_rate": 1.6973994025089723e-05, + "loss": 2.2741, + "step": 26312 + }, + { + "epoch": 0.3289332233305833, + "grad_norm": 3.2142257690429688, + "learning_rate": 1.6973368557531955e-05, + "loss": 1.426, + "step": 26314 + }, + { + "epoch": 0.32895822395559887, + "grad_norm": 5.024219512939453, + "learning_rate": 1.6972743036865993e-05, + "loss": 1.2377, + "step": 26316 + }, + { + "epoch": 0.3289832245806145, + "grad_norm": 2.4587223529815674, + "learning_rate": 1.6972117463096612e-05, + "loss": 0.4149, + "step": 26318 + }, + { + "epoch": 0.32900822520563017, + "grad_norm": 3.0042004585266113, + "learning_rate": 1.6971491836228568e-05, + "loss": 0.9204, + "step": 26320 + }, + { + "epoch": 0.32903322583064576, + "grad_norm": 2.963827133178711, + "learning_rate": 1.697086615626663e-05, + "loss": 1.0519, + "step": 26322 + }, + { + "epoch": 0.3290582264556614, + "grad_norm": 5.222509860992432, + "learning_rate": 1.6970240423215555e-05, + "loss": 1.9872, + "step": 26324 + }, + { + "epoch": 0.329083227080677, + "grad_norm": 5.3206467628479, + "learning_rate": 1.696961463708012e-05, + "loss": 1.158, + "step": 26326 + }, + { + "epoch": 0.32910822770569265, + "grad_norm": 5.496815204620361, + "learning_rate": 1.6968988797865084e-05, + "loss": 1.3049, + "step": 26328 + }, + { + "epoch": 0.3291332283307083, + "grad_norm": 6.518242359161377, + "learning_rate": 1.6968362905575214e-05, + "loss": 0.4514, + "step": 26330 + }, + { + "epoch": 0.3291582289557239, + "grad_norm": 0.006520363036543131, + "learning_rate": 1.696773696021528e-05, + "loss": 0.3759, + "step": 26332 + }, + { + "epoch": 0.32918322958073953, + "grad_norm": 2.6170854568481445, + "learning_rate": 1.6967110961790045e-05, + "loss": 0.8518, + "step": 26334 + }, + { + "epoch": 0.3292082302057551, + "grad_norm": 3.6971564292907715, + "learning_rate": 1.696648491030428e-05, + "loss": 0.5682, + "step": 26336 + }, + { + "epoch": 0.3292332308307708, + "grad_norm": 4.4602484703063965, + "learning_rate": 1.696585880576275e-05, + "loss": 0.4192, + "step": 26338 + }, + { + "epoch": 0.3292582314557864, + "grad_norm": 4.001972198486328, + "learning_rate": 1.6965232648170224e-05, + "loss": 1.0485, + "step": 26340 + }, + { + "epoch": 0.329283232080802, + "grad_norm": 0.002736492082476616, + "learning_rate": 1.696460643753147e-05, + "loss": 0.6035, + "step": 26342 + }, + { + "epoch": 0.32930823270581766, + "grad_norm": 7.244813919067383, + "learning_rate": 1.6963980173851257e-05, + "loss": 2.2581, + "step": 26344 + }, + { + "epoch": 0.32933323333083325, + "grad_norm": 2.485724687576294, + "learning_rate": 1.696335385713436e-05, + "loss": 0.661, + "step": 26346 + }, + { + "epoch": 0.3293582339558489, + "grad_norm": 5.577744483947754, + "learning_rate": 1.6962727487385543e-05, + "loss": 1.923, + "step": 26348 + }, + { + "epoch": 0.32938323458086455, + "grad_norm": 2.8588194847106934, + "learning_rate": 1.6962101064609577e-05, + "loss": 0.4255, + "step": 26350 + }, + { + "epoch": 0.32940823520588014, + "grad_norm": 3.4432406425476074, + "learning_rate": 1.696147458881124e-05, + "loss": 0.6795, + "step": 26352 + }, + { + "epoch": 0.3294332358308958, + "grad_norm": 2.3112878799438477, + "learning_rate": 1.6960848059995286e-05, + "loss": 0.1656, + "step": 26354 + }, + { + "epoch": 0.3294582364559114, + "grad_norm": 0.37532541155815125, + "learning_rate": 1.6960221478166502e-05, + "loss": 0.0086, + "step": 26356 + }, + { + "epoch": 0.329483237080927, + "grad_norm": 2.8357224464416504, + "learning_rate": 1.6959594843329658e-05, + "loss": 1.2213, + "step": 26358 + }, + { + "epoch": 0.3295082377059427, + "grad_norm": 0.03584400936961174, + "learning_rate": 1.6958968155489524e-05, + "loss": 0.381, + "step": 26360 + }, + { + "epoch": 0.32953323833095827, + "grad_norm": 0.32534098625183105, + "learning_rate": 1.695834141465087e-05, + "loss": 1.0481, + "step": 26362 + }, + { + "epoch": 0.3295582389559739, + "grad_norm": 3.4057235717773438, + "learning_rate": 1.6957714620818474e-05, + "loss": 1.3216, + "step": 26364 + }, + { + "epoch": 0.3295832395809895, + "grad_norm": 3.542518138885498, + "learning_rate": 1.6957087773997106e-05, + "loss": 1.6598, + "step": 26366 + }, + { + "epoch": 0.32960824020600515, + "grad_norm": 3.5185163021087646, + "learning_rate": 1.6956460874191544e-05, + "loss": 1.8979, + "step": 26368 + }, + { + "epoch": 0.3296332408310208, + "grad_norm": 3.1153922080993652, + "learning_rate": 1.6955833921406555e-05, + "loss": 1.6355, + "step": 26370 + }, + { + "epoch": 0.3296582414560364, + "grad_norm": 2.4762895107269287, + "learning_rate": 1.695520691564692e-05, + "loss": 1.4648, + "step": 26372 + }, + { + "epoch": 0.32968324208105204, + "grad_norm": 1.1878371238708496, + "learning_rate": 1.6954579856917416e-05, + "loss": 0.619, + "step": 26374 + }, + { + "epoch": 0.32970824270606763, + "grad_norm": 1.0234954357147217, + "learning_rate": 1.6953952745222812e-05, + "loss": 0.498, + "step": 26376 + }, + { + "epoch": 0.3297332433310833, + "grad_norm": 4.865983486175537, + "learning_rate": 1.695332558056789e-05, + "loss": 1.7107, + "step": 26378 + }, + { + "epoch": 0.32975824395609893, + "grad_norm": 2.5494961738586426, + "learning_rate": 1.695269836295742e-05, + "loss": 0.6412, + "step": 26380 + }, + { + "epoch": 0.3297832445811145, + "grad_norm": 4.132368087768555, + "learning_rate": 1.6952071092396184e-05, + "loss": 1.0751, + "step": 26382 + }, + { + "epoch": 0.32980824520613017, + "grad_norm": 3.0165672302246094, + "learning_rate": 1.6951443768888956e-05, + "loss": 1.503, + "step": 26384 + }, + { + "epoch": 0.32983324583114576, + "grad_norm": 2.796809673309326, + "learning_rate": 1.695081639244052e-05, + "loss": 1.404, + "step": 26386 + }, + { + "epoch": 0.3298582464561614, + "grad_norm": 2.6143078804016113, + "learning_rate": 1.6950188963055644e-05, + "loss": 0.7247, + "step": 26388 + }, + { + "epoch": 0.32988324708117706, + "grad_norm": 3.2070837020874023, + "learning_rate": 1.6949561480739114e-05, + "loss": 1.3057, + "step": 26390 + }, + { + "epoch": 0.32990824770619265, + "grad_norm": 3.3298420906066895, + "learning_rate": 1.694893394549571e-05, + "loss": 0.6595, + "step": 26392 + }, + { + "epoch": 0.3299332483312083, + "grad_norm": 5.658307075500488, + "learning_rate": 1.69483063573302e-05, + "loss": 1.5956, + "step": 26394 + }, + { + "epoch": 0.3299582489562239, + "grad_norm": 1.7879021167755127, + "learning_rate": 1.694767871624738e-05, + "loss": 0.3442, + "step": 26396 + }, + { + "epoch": 0.32998324958123953, + "grad_norm": 6.392433166503906, + "learning_rate": 1.6947051022252018e-05, + "loss": 0.3824, + "step": 26398 + }, + { + "epoch": 0.3300082502062552, + "grad_norm": 0.03317936882376671, + "learning_rate": 1.6946423275348897e-05, + "loss": 0.1095, + "step": 26400 + }, + { + "epoch": 0.3300332508312708, + "grad_norm": 3.4705755710601807, + "learning_rate": 1.6945795475542795e-05, + "loss": 0.937, + "step": 26402 + }, + { + "epoch": 0.3300582514562864, + "grad_norm": 3.457990884780884, + "learning_rate": 1.69451676228385e-05, + "loss": 1.3077, + "step": 26404 + }, + { + "epoch": 0.330083252081302, + "grad_norm": 4.812582969665527, + "learning_rate": 1.6944539717240795e-05, + "loss": 1.682, + "step": 26406 + }, + { + "epoch": 0.33010825270631766, + "grad_norm": 2.3103578090667725, + "learning_rate": 1.694391175875445e-05, + "loss": 0.243, + "step": 26408 + }, + { + "epoch": 0.3301332533313333, + "grad_norm": 0.02961559034883976, + "learning_rate": 1.6943283747384262e-05, + "loss": 0.8824, + "step": 26410 + }, + { + "epoch": 0.3301582539563489, + "grad_norm": 3.828784704208374, + "learning_rate": 1.6942655683135003e-05, + "loss": 1.6458, + "step": 26412 + }, + { + "epoch": 0.33018325458136455, + "grad_norm": 0.013031859882175922, + "learning_rate": 1.694202756601146e-05, + "loss": 0.7642, + "step": 26414 + }, + { + "epoch": 0.33020825520638014, + "grad_norm": 4.601908206939697, + "learning_rate": 1.6941399396018417e-05, + "loss": 1.5701, + "step": 26416 + }, + { + "epoch": 0.3302332558313958, + "grad_norm": 6.1993842124938965, + "learning_rate": 1.694077117316066e-05, + "loss": 0.8777, + "step": 26418 + }, + { + "epoch": 0.33025825645641144, + "grad_norm": 2.927133560180664, + "learning_rate": 1.6940142897442967e-05, + "loss": 0.9708, + "step": 26420 + }, + { + "epoch": 0.33028325708142703, + "grad_norm": 5.130112648010254, + "learning_rate": 1.693951456887013e-05, + "loss": 0.3566, + "step": 26422 + }, + { + "epoch": 0.3303082577064427, + "grad_norm": 1.577268123626709, + "learning_rate": 1.693888618744693e-05, + "loss": 0.085, + "step": 26424 + }, + { + "epoch": 0.33033325833145827, + "grad_norm": 2.8124520778656006, + "learning_rate": 1.6938257753178155e-05, + "loss": 0.7427, + "step": 26426 + }, + { + "epoch": 0.3303582589564739, + "grad_norm": 3.9835498332977295, + "learning_rate": 1.6937629266068588e-05, + "loss": 1.2485, + "step": 26428 + }, + { + "epoch": 0.33038325958148956, + "grad_norm": 2.3401706218719482, + "learning_rate": 1.6937000726123023e-05, + "loss": 1.2572, + "step": 26430 + }, + { + "epoch": 0.33040826020650516, + "grad_norm": 3.769794225692749, + "learning_rate": 1.6936372133346238e-05, + "loss": 0.7792, + "step": 26432 + }, + { + "epoch": 0.3304332608315208, + "grad_norm": 5.408169269561768, + "learning_rate": 1.6935743487743023e-05, + "loss": 0.8949, + "step": 26434 + }, + { + "epoch": 0.3304582614565364, + "grad_norm": 2.798137903213501, + "learning_rate": 1.6935114789318167e-05, + "loss": 0.6436, + "step": 26436 + }, + { + "epoch": 0.33048326208155204, + "grad_norm": 1.8798264265060425, + "learning_rate": 1.693448603807646e-05, + "loss": 0.4564, + "step": 26438 + }, + { + "epoch": 0.3305082627065677, + "grad_norm": 3.9492294788360596, + "learning_rate": 1.6933857234022684e-05, + "loss": 1.156, + "step": 26440 + }, + { + "epoch": 0.3305332633315833, + "grad_norm": 2.2570912837982178, + "learning_rate": 1.6933228377161632e-05, + "loss": 0.633, + "step": 26442 + }, + { + "epoch": 0.33055826395659893, + "grad_norm": 3.308346748352051, + "learning_rate": 1.6932599467498095e-05, + "loss": 1.9265, + "step": 26444 + }, + { + "epoch": 0.3305832645816145, + "grad_norm": 1.6193357706069946, + "learning_rate": 1.693197050503686e-05, + "loss": 0.1254, + "step": 26446 + }, + { + "epoch": 0.33060826520663017, + "grad_norm": 2.895055055618286, + "learning_rate": 1.6931341489782718e-05, + "loss": 0.2382, + "step": 26448 + }, + { + "epoch": 0.3306332658316458, + "grad_norm": 0.6638442873954773, + "learning_rate": 1.6930712421740462e-05, + "loss": 0.0607, + "step": 26450 + }, + { + "epoch": 0.3306582664566614, + "grad_norm": 0.02225273847579956, + "learning_rate": 1.6930083300914878e-05, + "loss": 0.0007, + "step": 26452 + }, + { + "epoch": 0.33068326708167706, + "grad_norm": 3.266265869140625, + "learning_rate": 1.6929454127310758e-05, + "loss": 1.283, + "step": 26454 + }, + { + "epoch": 0.33070826770669265, + "grad_norm": 4.1078877449035645, + "learning_rate": 1.6928824900932898e-05, + "loss": 0.4377, + "step": 26456 + }, + { + "epoch": 0.3307332683317083, + "grad_norm": 2.3513126373291016, + "learning_rate": 1.6928195621786085e-05, + "loss": 1.5175, + "step": 26458 + }, + { + "epoch": 0.33075826895672394, + "grad_norm": 2.6460981369018555, + "learning_rate": 1.6927566289875116e-05, + "loss": 0.4072, + "step": 26460 + }, + { + "epoch": 0.33078326958173954, + "grad_norm": 3.8981175422668457, + "learning_rate": 1.6926936905204782e-05, + "loss": 1.2898, + "step": 26462 + }, + { + "epoch": 0.3308082702067552, + "grad_norm": 3.659986972808838, + "learning_rate": 1.6926307467779874e-05, + "loss": 1.0299, + "step": 26464 + }, + { + "epoch": 0.3308332708317708, + "grad_norm": 0.0008948079193942249, + "learning_rate": 1.6925677977605187e-05, + "loss": 1.0644, + "step": 26466 + }, + { + "epoch": 0.3308582714567864, + "grad_norm": 5.489635944366455, + "learning_rate": 1.6925048434685517e-05, + "loss": 1.1239, + "step": 26468 + }, + { + "epoch": 0.33088327208180207, + "grad_norm": 3.164975881576538, + "learning_rate": 1.692441883902566e-05, + "loss": 0.2856, + "step": 26470 + }, + { + "epoch": 0.33090827270681766, + "grad_norm": 2.484079599380493, + "learning_rate": 1.6923789190630405e-05, + "loss": 0.7388, + "step": 26472 + }, + { + "epoch": 0.3309332733318333, + "grad_norm": 0.0020630608778446913, + "learning_rate": 1.6923159489504554e-05, + "loss": 0.4484, + "step": 26474 + }, + { + "epoch": 0.3309582739568489, + "grad_norm": 4.326606750488281, + "learning_rate": 1.6922529735652897e-05, + "loss": 0.9851, + "step": 26476 + }, + { + "epoch": 0.33098327458186455, + "grad_norm": 2.781236410140991, + "learning_rate": 1.6921899929080233e-05, + "loss": 0.5549, + "step": 26478 + }, + { + "epoch": 0.3310082752068802, + "grad_norm": 2.7246532440185547, + "learning_rate": 1.692127006979136e-05, + "loss": 0.6664, + "step": 26480 + }, + { + "epoch": 0.3310332758318958, + "grad_norm": 3.288374662399292, + "learning_rate": 1.692064015779107e-05, + "loss": 1.6518, + "step": 26482 + }, + { + "epoch": 0.33105827645691144, + "grad_norm": 0.0018658180488273501, + "learning_rate": 1.6920010193084164e-05, + "loss": 0.1028, + "step": 26484 + }, + { + "epoch": 0.33108327708192703, + "grad_norm": 4.880986213684082, + "learning_rate": 1.6919380175675438e-05, + "loss": 0.7671, + "step": 26486 + }, + { + "epoch": 0.3311082777069427, + "grad_norm": 7.549800395965576, + "learning_rate": 1.6918750105569692e-05, + "loss": 1.418, + "step": 26488 + }, + { + "epoch": 0.3311332783319583, + "grad_norm": 2.333693027496338, + "learning_rate": 1.6918119982771724e-05, + "loss": 0.8169, + "step": 26490 + }, + { + "epoch": 0.3311582789569739, + "grad_norm": 3.3888773918151855, + "learning_rate": 1.691748980728633e-05, + "loss": 1.3184, + "step": 26492 + }, + { + "epoch": 0.33118327958198956, + "grad_norm": 2.5495293140411377, + "learning_rate": 1.6916859579118315e-05, + "loss": 0.4821, + "step": 26494 + }, + { + "epoch": 0.33120828020700516, + "grad_norm": 3.6972436904907227, + "learning_rate": 1.6916229298272476e-05, + "loss": 1.1484, + "step": 26496 + }, + { + "epoch": 0.3312332808320208, + "grad_norm": 0.0017614230746403337, + "learning_rate": 1.691559896475361e-05, + "loss": 0.4156, + "step": 26498 + }, + { + "epoch": 0.33125828145703645, + "grad_norm": 6.367526054382324, + "learning_rate": 1.691496857856652e-05, + "loss": 1.6514, + "step": 26500 + }, + { + "epoch": 0.33128328208205204, + "grad_norm": 3.59849214553833, + "learning_rate": 1.691433813971601e-05, + "loss": 1.3346, + "step": 26502 + }, + { + "epoch": 0.3313082827070677, + "grad_norm": 4.162550449371338, + "learning_rate": 1.6913707648206877e-05, + "loss": 0.216, + "step": 26504 + }, + { + "epoch": 0.3313332833320833, + "grad_norm": 5.242199897766113, + "learning_rate": 1.6913077104043926e-05, + "loss": 1.2523, + "step": 26506 + }, + { + "epoch": 0.33135828395709893, + "grad_norm": 2.5116941928863525, + "learning_rate": 1.6912446507231952e-05, + "loss": 1.4697, + "step": 26508 + }, + { + "epoch": 0.3313832845821146, + "grad_norm": 3.401104688644409, + "learning_rate": 1.6911815857775772e-05, + "loss": 1.5564, + "step": 26510 + }, + { + "epoch": 0.33140828520713017, + "grad_norm": 4.431003570556641, + "learning_rate": 1.691118515568017e-05, + "loss": 1.3139, + "step": 26512 + }, + { + "epoch": 0.3314332858321458, + "grad_norm": 5.35401725769043, + "learning_rate": 1.6910554400949965e-05, + "loss": 1.501, + "step": 26514 + }, + { + "epoch": 0.3314582864571614, + "grad_norm": 2.942593812942505, + "learning_rate": 1.6909923593589954e-05, + "loss": 0.7137, + "step": 26516 + }, + { + "epoch": 0.33148328708217706, + "grad_norm": 3.63244891166687, + "learning_rate": 1.690929273360494e-05, + "loss": 0.9635, + "step": 26518 + }, + { + "epoch": 0.3315082877071927, + "grad_norm": 1.3395605087280273, + "learning_rate": 1.690866182099973e-05, + "loss": 1.2787, + "step": 26520 + }, + { + "epoch": 0.3315332883322083, + "grad_norm": 3.0481796264648438, + "learning_rate": 1.6908030855779132e-05, + "loss": 1.6967, + "step": 26522 + }, + { + "epoch": 0.33155828895722395, + "grad_norm": 8.316746711730957, + "learning_rate": 1.6907399837947944e-05, + "loss": 0.7026, + "step": 26524 + }, + { + "epoch": 0.33158328958223954, + "grad_norm": 3.230489730834961, + "learning_rate": 1.6906768767510978e-05, + "loss": 1.4782, + "step": 26526 + }, + { + "epoch": 0.3316082902072552, + "grad_norm": 4.23842716217041, + "learning_rate": 1.6906137644473036e-05, + "loss": 1.1218, + "step": 26528 + }, + { + "epoch": 0.33163329083227083, + "grad_norm": 3.069645643234253, + "learning_rate": 1.690550646883893e-05, + "loss": 0.4824, + "step": 26530 + }, + { + "epoch": 0.3316582914572864, + "grad_norm": 1.389526605606079, + "learning_rate": 1.690487524061346e-05, + "loss": 0.7356, + "step": 26532 + }, + { + "epoch": 0.3316832920823021, + "grad_norm": 5.321286678314209, + "learning_rate": 1.6904243959801436e-05, + "loss": 1.7019, + "step": 26534 + }, + { + "epoch": 0.33170829270731766, + "grad_norm": 0.00120014906860888, + "learning_rate": 1.690361262640767e-05, + "loss": 1.0474, + "step": 26536 + }, + { + "epoch": 0.3317332933323333, + "grad_norm": 2.326390504837036, + "learning_rate": 1.6902981240436962e-05, + "loss": 0.423, + "step": 26538 + }, + { + "epoch": 0.33175829395734896, + "grad_norm": 2.813246965408325, + "learning_rate": 1.6902349801894128e-05, + "loss": 1.3654, + "step": 26540 + }, + { + "epoch": 0.33178329458236455, + "grad_norm": 0.0015662669902667403, + "learning_rate": 1.6901718310783975e-05, + "loss": 0.1883, + "step": 26542 + }, + { + "epoch": 0.3318082952073802, + "grad_norm": 4.503045558929443, + "learning_rate": 1.6901086767111308e-05, + "loss": 1.1209, + "step": 26544 + }, + { + "epoch": 0.3318332958323958, + "grad_norm": 2.915287971496582, + "learning_rate": 1.6900455170880944e-05, + "loss": 0.6367, + "step": 26546 + }, + { + "epoch": 0.33185829645741144, + "grad_norm": 2.299407720565796, + "learning_rate": 1.6899823522097688e-05, + "loss": 0.7421, + "step": 26548 + }, + { + "epoch": 0.3318832970824271, + "grad_norm": 2.71852707862854, + "learning_rate": 1.6899191820766348e-05, + "loss": 0.7541, + "step": 26550 + }, + { + "epoch": 0.3319082977074427, + "grad_norm": 2.7072198390960693, + "learning_rate": 1.6898560066891742e-05, + "loss": 0.6144, + "step": 26552 + }, + { + "epoch": 0.3319332983324583, + "grad_norm": 4.698315143585205, + "learning_rate": 1.6897928260478677e-05, + "loss": 1.3653, + "step": 26554 + }, + { + "epoch": 0.3319582989574739, + "grad_norm": 3.427539348602295, + "learning_rate": 1.6897296401531965e-05, + "loss": 0.6664, + "step": 26556 + }, + { + "epoch": 0.33198329958248957, + "grad_norm": 3.7145402431488037, + "learning_rate": 1.689666449005642e-05, + "loss": 1.6338, + "step": 26558 + }, + { + "epoch": 0.3320083002075052, + "grad_norm": 3.821873188018799, + "learning_rate": 1.6896032526056852e-05, + "loss": 0.6966, + "step": 26560 + }, + { + "epoch": 0.3320333008325208, + "grad_norm": 0.001797069446183741, + "learning_rate": 1.689540050953808e-05, + "loss": 0.657, + "step": 26562 + }, + { + "epoch": 0.33205830145753645, + "grad_norm": 2.094714403152466, + "learning_rate": 1.689476844050491e-05, + "loss": 1.0302, + "step": 26564 + }, + { + "epoch": 0.33208330208255205, + "grad_norm": 2.7999329566955566, + "learning_rate": 1.689413631896216e-05, + "loss": 0.6602, + "step": 26566 + }, + { + "epoch": 0.3321083027075677, + "grad_norm": 4.544336795806885, + "learning_rate": 1.689350414491464e-05, + "loss": 1.1364, + "step": 26568 + }, + { + "epoch": 0.33213330333258334, + "grad_norm": 5.654669284820557, + "learning_rate": 1.6892871918367168e-05, + "loss": 1.5509, + "step": 26570 + }, + { + "epoch": 0.33215830395759893, + "grad_norm": 8.64430046081543, + "learning_rate": 1.689223963932456e-05, + "loss": 3.0888, + "step": 26572 + }, + { + "epoch": 0.3321833045826146, + "grad_norm": 0.7753102779388428, + "learning_rate": 1.689160730779163e-05, + "loss": 0.8085, + "step": 26574 + }, + { + "epoch": 0.3322083052076302, + "grad_norm": 3.4139788150787354, + "learning_rate": 1.689097492377319e-05, + "loss": 0.6972, + "step": 26576 + }, + { + "epoch": 0.3322333058326458, + "grad_norm": 3.0542960166931152, + "learning_rate": 1.6890342487274058e-05, + "loss": 1.2592, + "step": 26578 + }, + { + "epoch": 0.33225830645766147, + "grad_norm": 0.0067287287674844265, + "learning_rate": 1.6889709998299055e-05, + "loss": 0.358, + "step": 26580 + }, + { + "epoch": 0.33228330708267706, + "grad_norm": 4.368167877197266, + "learning_rate": 1.6889077456852998e-05, + "loss": 0.5996, + "step": 26582 + }, + { + "epoch": 0.3323083077076927, + "grad_norm": 2.6647374629974365, + "learning_rate": 1.6888444862940696e-05, + "loss": 0.719, + "step": 26584 + }, + { + "epoch": 0.3323333083327083, + "grad_norm": 3.7768847942352295, + "learning_rate": 1.6887812216566976e-05, + "loss": 0.827, + "step": 26586 + }, + { + "epoch": 0.33235830895772395, + "grad_norm": 3.6474971771240234, + "learning_rate": 1.688717951773665e-05, + "loss": 1.0753, + "step": 26588 + }, + { + "epoch": 0.3323833095827396, + "grad_norm": 3.679609775543213, + "learning_rate": 1.688654676645454e-05, + "loss": 0.9246, + "step": 26590 + }, + { + "epoch": 0.3324083102077552, + "grad_norm": 2.5368998050689697, + "learning_rate": 1.6885913962725463e-05, + "loss": 0.7907, + "step": 26592 + }, + { + "epoch": 0.33243331083277083, + "grad_norm": 0.11165780574083328, + "learning_rate": 1.6885281106554238e-05, + "loss": 0.1586, + "step": 26594 + }, + { + "epoch": 0.3324583114577864, + "grad_norm": 3.54795503616333, + "learning_rate": 1.6884648197945686e-05, + "loss": 0.9161, + "step": 26596 + }, + { + "epoch": 0.3324833120828021, + "grad_norm": 2.8442063331604004, + "learning_rate": 1.688401523690463e-05, + "loss": 0.7755, + "step": 26598 + }, + { + "epoch": 0.3325083127078177, + "grad_norm": 3.5129916667938232, + "learning_rate": 1.6883382223435886e-05, + "loss": 0.9545, + "step": 26600 + }, + { + "epoch": 0.3325333133328333, + "grad_norm": 0.0009545634966343641, + "learning_rate": 1.6882749157544273e-05, + "loss": 0.0, + "step": 26602 + }, + { + "epoch": 0.33255831395784896, + "grad_norm": 3.8344719409942627, + "learning_rate": 1.688211603923462e-05, + "loss": 0.5681, + "step": 26604 + }, + { + "epoch": 0.33258331458286455, + "grad_norm": 0.0219538826495409, + "learning_rate": 1.688148286851174e-05, + "loss": 0.0188, + "step": 26606 + }, + { + "epoch": 0.3326083152078802, + "grad_norm": 6.93481969833374, + "learning_rate": 1.6880849645380464e-05, + "loss": 1.5639, + "step": 26608 + }, + { + "epoch": 0.33263331583289585, + "grad_norm": 3.609778881072998, + "learning_rate": 1.688021636984561e-05, + "loss": 1.3157, + "step": 26610 + }, + { + "epoch": 0.33265831645791144, + "grad_norm": 2.45061993598938, + "learning_rate": 1.6879583041911997e-05, + "loss": 0.2213, + "step": 26612 + }, + { + "epoch": 0.3326833170829271, + "grad_norm": 1.6782894134521484, + "learning_rate": 1.6878949661584458e-05, + "loss": 0.3973, + "step": 26614 + }, + { + "epoch": 0.3327083177079427, + "grad_norm": 9.668856620788574, + "learning_rate": 1.6878316228867805e-05, + "loss": 2.2654, + "step": 26616 + }, + { + "epoch": 0.3327333183329583, + "grad_norm": 2.4334022998809814, + "learning_rate": 1.687768274376687e-05, + "loss": 1.1783, + "step": 26618 + }, + { + "epoch": 0.332758318957974, + "grad_norm": 2.5758919715881348, + "learning_rate": 1.687704920628648e-05, + "loss": 0.3366, + "step": 26620 + }, + { + "epoch": 0.33278331958298957, + "grad_norm": 0.26687124371528625, + "learning_rate": 1.687641561643145e-05, + "loss": 0.0055, + "step": 26622 + }, + { + "epoch": 0.3328083202080052, + "grad_norm": 4.007158279418945, + "learning_rate": 1.6875781974206616e-05, + "loss": 1.515, + "step": 26624 + }, + { + "epoch": 0.3328333208330208, + "grad_norm": 8.396470069885254, + "learning_rate": 1.68751482796168e-05, + "loss": 1.2987, + "step": 26626 + }, + { + "epoch": 0.33285832145803645, + "grad_norm": 3.099809408187866, + "learning_rate": 1.687451453266682e-05, + "loss": 1.1046, + "step": 26628 + }, + { + "epoch": 0.3328833220830521, + "grad_norm": 3.4460062980651855, + "learning_rate": 1.6873880733361515e-05, + "loss": 0.6185, + "step": 26630 + }, + { + "epoch": 0.3329083227080677, + "grad_norm": 6.671700954437256, + "learning_rate": 1.68732468817057e-05, + "loss": 1.1239, + "step": 26632 + }, + { + "epoch": 0.33293332333308334, + "grad_norm": 4.776125907897949, + "learning_rate": 1.6872612977704216e-05, + "loss": 0.8075, + "step": 26634 + }, + { + "epoch": 0.33295832395809893, + "grad_norm": 3.700690746307373, + "learning_rate": 1.687197902136188e-05, + "loss": 1.9292, + "step": 26636 + }, + { + "epoch": 0.3329833245831146, + "grad_norm": 4.704551696777344, + "learning_rate": 1.6871345012683528e-05, + "loss": 1.3022, + "step": 26638 + }, + { + "epoch": 0.33300832520813023, + "grad_norm": 2.869755983352661, + "learning_rate": 1.6870710951673976e-05, + "loss": 0.8661, + "step": 26640 + }, + { + "epoch": 0.3330333258331458, + "grad_norm": 6.515152454376221, + "learning_rate": 1.6870076838338067e-05, + "loss": 1.3664, + "step": 26642 + }, + { + "epoch": 0.33305832645816147, + "grad_norm": 6.724056720733643, + "learning_rate": 1.6869442672680624e-05, + "loss": 2.1455, + "step": 26644 + }, + { + "epoch": 0.33308332708317706, + "grad_norm": 4.785434722900391, + "learning_rate": 1.6868808454706472e-05, + "loss": 0.7577, + "step": 26646 + }, + { + "epoch": 0.3331083277081927, + "grad_norm": 2.488816738128662, + "learning_rate": 1.6868174184420453e-05, + "loss": 1.4512, + "step": 26648 + }, + { + "epoch": 0.33313332833320836, + "grad_norm": 1.9739060401916504, + "learning_rate": 1.6867539861827388e-05, + "loss": 0.9097, + "step": 26650 + }, + { + "epoch": 0.33315832895822395, + "grad_norm": 1.0442513227462769, + "learning_rate": 1.686690548693211e-05, + "loss": 0.9317, + "step": 26652 + }, + { + "epoch": 0.3331833295832396, + "grad_norm": 4.150050163269043, + "learning_rate": 1.686627105973945e-05, + "loss": 1.8955, + "step": 26654 + }, + { + "epoch": 0.3332083302082552, + "grad_norm": 4.252279281616211, + "learning_rate": 1.686563658025424e-05, + "loss": 1.2228, + "step": 26656 + }, + { + "epoch": 0.33323333083327084, + "grad_norm": 1.9739890098571777, + "learning_rate": 1.6865002048481317e-05, + "loss": 0.8743, + "step": 26658 + }, + { + "epoch": 0.3332583314582865, + "grad_norm": 4.1476149559021, + "learning_rate": 1.6864367464425506e-05, + "loss": 1.1304, + "step": 26660 + }, + { + "epoch": 0.3332833320833021, + "grad_norm": 6.784598350524902, + "learning_rate": 1.6863732828091644e-05, + "loss": 1.0333, + "step": 26662 + }, + { + "epoch": 0.3333083327083177, + "grad_norm": 6.868639945983887, + "learning_rate": 1.686309813948456e-05, + "loss": 1.8046, + "step": 26664 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 4.864518165588379, + "learning_rate": 1.6862463398609096e-05, + "loss": 1.6223, + "step": 26666 + }, + { + "epoch": 0.33335833395834896, + "grad_norm": 3.1872968673706055, + "learning_rate": 1.686182860547008e-05, + "loss": 0.2503, + "step": 26668 + }, + { + "epoch": 0.3333833345833646, + "grad_norm": 2.3848612308502197, + "learning_rate": 1.6861193760072345e-05, + "loss": 0.1206, + "step": 26670 + }, + { + "epoch": 0.3334083352083802, + "grad_norm": 3.3125064373016357, + "learning_rate": 1.686055886242073e-05, + "loss": 1.6237, + "step": 26672 + }, + { + "epoch": 0.33343333583339585, + "grad_norm": 3.3569412231445312, + "learning_rate": 1.685992391252007e-05, + "loss": 1.3278, + "step": 26674 + }, + { + "epoch": 0.33345833645841144, + "grad_norm": 0.0007755886181257665, + "learning_rate": 1.68592889103752e-05, + "loss": 0.9317, + "step": 26676 + }, + { + "epoch": 0.3334833370834271, + "grad_norm": 3.207778215408325, + "learning_rate": 1.685865385599095e-05, + "loss": 0.637, + "step": 26678 + }, + { + "epoch": 0.33350833770844274, + "grad_norm": 6.403885364532471, + "learning_rate": 1.6858018749372172e-05, + "loss": 0.6733, + "step": 26680 + }, + { + "epoch": 0.33353333833345833, + "grad_norm": 4.4052815437316895, + "learning_rate": 1.6857383590523685e-05, + "loss": 1.6312, + "step": 26682 + }, + { + "epoch": 0.333558338958474, + "grad_norm": 5.973640441894531, + "learning_rate": 1.6856748379450335e-05, + "loss": 0.6213, + "step": 26684 + }, + { + "epoch": 0.33358333958348957, + "grad_norm": 3.883652448654175, + "learning_rate": 1.6856113116156964e-05, + "loss": 0.249, + "step": 26686 + }, + { + "epoch": 0.3336083402085052, + "grad_norm": 0.12040767073631287, + "learning_rate": 1.6855477800648402e-05, + "loss": 0.006, + "step": 26688 + }, + { + "epoch": 0.33363334083352086, + "grad_norm": 3.178643226623535, + "learning_rate": 1.685484243292949e-05, + "loss": 1.4404, + "step": 26690 + }, + { + "epoch": 0.33365834145853646, + "grad_norm": 0.000979500007815659, + "learning_rate": 1.6854207013005067e-05, + "loss": 0.0002, + "step": 26692 + }, + { + "epoch": 0.3336833420835521, + "grad_norm": 2.217874050140381, + "learning_rate": 1.6853571540879974e-05, + "loss": 1.2296, + "step": 26694 + }, + { + "epoch": 0.3337083427085677, + "grad_norm": 2.7536075115203857, + "learning_rate": 1.6852936016559048e-05, + "loss": 1.1261, + "step": 26696 + }, + { + "epoch": 0.33373334333358334, + "grad_norm": 2.458383560180664, + "learning_rate": 1.6852300440047136e-05, + "loss": 0.848, + "step": 26698 + }, + { + "epoch": 0.333758343958599, + "grad_norm": 2.788775682449341, + "learning_rate": 1.6851664811349067e-05, + "loss": 0.4276, + "step": 26700 + }, + { + "epoch": 0.3337833445836146, + "grad_norm": 4.787105083465576, + "learning_rate": 1.685102913046969e-05, + "loss": 0.9118, + "step": 26702 + }, + { + "epoch": 0.33380834520863023, + "grad_norm": 4.435895919799805, + "learning_rate": 1.685039339741384e-05, + "loss": 1.2741, + "step": 26704 + }, + { + "epoch": 0.3338333458336458, + "grad_norm": 0.0008359836647287011, + "learning_rate": 1.684975761218637e-05, + "loss": 0.0, + "step": 26706 + }, + { + "epoch": 0.33385834645866147, + "grad_norm": 2.8006503582000732, + "learning_rate": 1.6849121774792104e-05, + "loss": 1.0788, + "step": 26708 + }, + { + "epoch": 0.3338833470836771, + "grad_norm": 4.253189563751221, + "learning_rate": 1.6848485885235904e-05, + "loss": 0.3337, + "step": 26710 + }, + { + "epoch": 0.3339083477086927, + "grad_norm": 0.635922372341156, + "learning_rate": 1.6847849943522603e-05, + "loss": 0.7499, + "step": 26712 + }, + { + "epoch": 0.33393334833370836, + "grad_norm": 3.5292165279388428, + "learning_rate": 1.684721394965704e-05, + "loss": 1.3949, + "step": 26714 + }, + { + "epoch": 0.33395834895872395, + "grad_norm": 0.0006908461800776422, + "learning_rate": 1.684657790364407e-05, + "loss": 0.5189, + "step": 26716 + }, + { + "epoch": 0.3339833495837396, + "grad_norm": 3.3548665046691895, + "learning_rate": 1.6845941805488525e-05, + "loss": 0.516, + "step": 26718 + }, + { + "epoch": 0.33400835020875524, + "grad_norm": 0.0006418628618121147, + "learning_rate": 1.684530565519526e-05, + "loss": 0.5643, + "step": 26720 + }, + { + "epoch": 0.33403335083377084, + "grad_norm": 2.496072769165039, + "learning_rate": 1.6844669452769112e-05, + "loss": 1.4511, + "step": 26722 + }, + { + "epoch": 0.3340583514587865, + "grad_norm": 3.1528830528259277, + "learning_rate": 1.6844033198214936e-05, + "loss": 1.496, + "step": 26724 + }, + { + "epoch": 0.3340833520838021, + "grad_norm": 3.7684483528137207, + "learning_rate": 1.6843396891537564e-05, + "loss": 0.5962, + "step": 26726 + }, + { + "epoch": 0.3341083527088177, + "grad_norm": 2.9374783039093018, + "learning_rate": 1.6842760532741848e-05, + "loss": 1.1864, + "step": 26728 + }, + { + "epoch": 0.33413335333383337, + "grad_norm": 2.6230218410491943, + "learning_rate": 1.684212412183264e-05, + "loss": 1.319, + "step": 26730 + }, + { + "epoch": 0.33415835395884896, + "grad_norm": 4.62359094619751, + "learning_rate": 1.684148765881478e-05, + "loss": 1.8969, + "step": 26732 + }, + { + "epoch": 0.3341833545838646, + "grad_norm": 2.7538280487060547, + "learning_rate": 1.684085114369312e-05, + "loss": 0.7709, + "step": 26734 + }, + { + "epoch": 0.3342083552088802, + "grad_norm": 4.246316432952881, + "learning_rate": 1.68402145764725e-05, + "loss": 1.0676, + "step": 26736 + }, + { + "epoch": 0.33423335583389585, + "grad_norm": 1.2563519477844238, + "learning_rate": 1.683957795715778e-05, + "loss": 0.7557, + "step": 26738 + }, + { + "epoch": 0.3342583564589115, + "grad_norm": 3.3743903636932373, + "learning_rate": 1.6838941285753796e-05, + "loss": 1.9643, + "step": 26740 + }, + { + "epoch": 0.3342833570839271, + "grad_norm": 0.983491837978363, + "learning_rate": 1.6838304562265406e-05, + "loss": 0.0452, + "step": 26742 + }, + { + "epoch": 0.33430835770894274, + "grad_norm": 4.787248611450195, + "learning_rate": 1.683766778669745e-05, + "loss": 1.3856, + "step": 26744 + }, + { + "epoch": 0.33433335833395833, + "grad_norm": 3.1423821449279785, + "learning_rate": 1.6837030959054786e-05, + "loss": 1.2226, + "step": 26746 + }, + { + "epoch": 0.334358358958974, + "grad_norm": 7.8436408042907715, + "learning_rate": 1.6836394079342264e-05, + "loss": 1.8146, + "step": 26748 + }, + { + "epoch": 0.3343833595839896, + "grad_norm": 2.020692825317383, + "learning_rate": 1.683575714756473e-05, + "loss": 0.5582, + "step": 26750 + }, + { + "epoch": 0.3344083602090052, + "grad_norm": 3.187520742416382, + "learning_rate": 1.6835120163727036e-05, + "loss": 1.363, + "step": 26752 + }, + { + "epoch": 0.33443336083402087, + "grad_norm": 3.9159531593322754, + "learning_rate": 1.6834483127834032e-05, + "loss": 0.8701, + "step": 26754 + }, + { + "epoch": 0.33445836145903646, + "grad_norm": 0.006104963831603527, + "learning_rate": 1.683384603989057e-05, + "loss": 0.9637, + "step": 26756 + }, + { + "epoch": 0.3344833620840521, + "grad_norm": 1.7623164653778076, + "learning_rate": 1.6833208899901507e-05, + "loss": 0.534, + "step": 26758 + }, + { + "epoch": 0.33450836270906775, + "grad_norm": 7.018033027648926, + "learning_rate": 1.6832571707871688e-05, + "loss": 1.8251, + "step": 26760 + }, + { + "epoch": 0.33453336333408334, + "grad_norm": 2.730518102645874, + "learning_rate": 1.683193446380597e-05, + "loss": 1.5244, + "step": 26762 + }, + { + "epoch": 0.334558363959099, + "grad_norm": 3.670457363128662, + "learning_rate": 1.6831297167709208e-05, + "loss": 1.8188, + "step": 26764 + }, + { + "epoch": 0.3345833645841146, + "grad_norm": 2.537616491317749, + "learning_rate": 1.6830659819586246e-05, + "loss": 1.0354, + "step": 26766 + }, + { + "epoch": 0.33460836520913023, + "grad_norm": 0.0011314558796584606, + "learning_rate": 1.683002241944195e-05, + "loss": 0.8485, + "step": 26768 + }, + { + "epoch": 0.3346333658341459, + "grad_norm": 4.356926918029785, + "learning_rate": 1.682938496728117e-05, + "loss": 0.9284, + "step": 26770 + }, + { + "epoch": 0.33465836645916147, + "grad_norm": 2.473331928253174, + "learning_rate": 1.6828747463108758e-05, + "loss": 0.6323, + "step": 26772 + }, + { + "epoch": 0.3346833670841771, + "grad_norm": 0.04071173071861267, + "learning_rate": 1.682810990692957e-05, + "loss": 0.9555, + "step": 26774 + }, + { + "epoch": 0.3347083677091927, + "grad_norm": 0.0010116511257365346, + "learning_rate": 1.6827472298748464e-05, + "loss": 0.6107, + "step": 26776 + }, + { + "epoch": 0.33473336833420836, + "grad_norm": 0.0013730465434491634, + "learning_rate": 1.6826834638570295e-05, + "loss": 0.468, + "step": 26778 + }, + { + "epoch": 0.334758368959224, + "grad_norm": 3.170128345489502, + "learning_rate": 1.6826196926399915e-05, + "loss": 1.1354, + "step": 26780 + }, + { + "epoch": 0.3347833695842396, + "grad_norm": 5.132997989654541, + "learning_rate": 1.6825559162242186e-05, + "loss": 1.2058, + "step": 26782 + }, + { + "epoch": 0.33480837020925525, + "grad_norm": 1.5164988040924072, + "learning_rate": 1.6824921346101968e-05, + "loss": 0.602, + "step": 26784 + }, + { + "epoch": 0.33483337083427084, + "grad_norm": 3.6590309143066406, + "learning_rate": 1.682428347798411e-05, + "loss": 0.3358, + "step": 26786 + }, + { + "epoch": 0.3348583714592865, + "grad_norm": 9.453947067260742, + "learning_rate": 1.6823645557893473e-05, + "loss": 2.4188, + "step": 26788 + }, + { + "epoch": 0.33488337208430213, + "grad_norm": 0.0016069997800514102, + "learning_rate": 1.682300758583492e-05, + "loss": 0.0249, + "step": 26790 + }, + { + "epoch": 0.3349083727093177, + "grad_norm": 5.251373767852783, + "learning_rate": 1.6822369561813303e-05, + "loss": 1.4898, + "step": 26792 + }, + { + "epoch": 0.3349333733343334, + "grad_norm": 1.9490437507629395, + "learning_rate": 1.6821731485833487e-05, + "loss": 0.1269, + "step": 26794 + }, + { + "epoch": 0.33495837395934897, + "grad_norm": 4.216481685638428, + "learning_rate": 1.682109335790033e-05, + "loss": 1.2118, + "step": 26796 + }, + { + "epoch": 0.3349833745843646, + "grad_norm": 2.260991096496582, + "learning_rate": 1.6820455178018688e-05, + "loss": 0.431, + "step": 26798 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 4.705681324005127, + "learning_rate": 1.6819816946193422e-05, + "loss": 0.7073, + "step": 26800 + }, + { + "epoch": 0.33503337583439585, + "grad_norm": 4.142817974090576, + "learning_rate": 1.6819178662429397e-05, + "loss": 1.2218, + "step": 26802 + }, + { + "epoch": 0.3350583764594115, + "grad_norm": 2.699263095855713, + "learning_rate": 1.6818540326731468e-05, + "loss": 0.772, + "step": 26804 + }, + { + "epoch": 0.3350833770844271, + "grad_norm": 10.755215644836426, + "learning_rate": 1.6817901939104503e-05, + "loss": 1.0877, + "step": 26806 + }, + { + "epoch": 0.33510837770944274, + "grad_norm": 2.960225820541382, + "learning_rate": 1.6817263499553362e-05, + "loss": 0.1088, + "step": 26808 + }, + { + "epoch": 0.3351333783344584, + "grad_norm": 3.5789263248443604, + "learning_rate": 1.6816625008082906e-05, + "loss": 0.661, + "step": 26810 + }, + { + "epoch": 0.335158378959474, + "grad_norm": 0.7683701515197754, + "learning_rate": 1.6815986464697997e-05, + "loss": 0.0592, + "step": 26812 + }, + { + "epoch": 0.3351833795844896, + "grad_norm": 2.109111785888672, + "learning_rate": 1.68153478694035e-05, + "loss": 0.7435, + "step": 26814 + }, + { + "epoch": 0.3352083802095052, + "grad_norm": 2.905315637588501, + "learning_rate": 1.681470922220428e-05, + "loss": 1.1948, + "step": 26816 + }, + { + "epoch": 0.33523338083452087, + "grad_norm": 3.2335033416748047, + "learning_rate": 1.681407052310519e-05, + "loss": 0.9264, + "step": 26818 + }, + { + "epoch": 0.3352583814595365, + "grad_norm": 5.636012554168701, + "learning_rate": 1.681343177211111e-05, + "loss": 1.2791, + "step": 26820 + }, + { + "epoch": 0.3352833820845521, + "grad_norm": 5.196803569793701, + "learning_rate": 1.6812792969226893e-05, + "loss": 1.1742, + "step": 26822 + }, + { + "epoch": 0.33530838270956775, + "grad_norm": 5.877933502197266, + "learning_rate": 1.681215411445741e-05, + "loss": 2.5796, + "step": 26824 + }, + { + "epoch": 0.33533338333458335, + "grad_norm": 2.228459358215332, + "learning_rate": 1.6811515207807524e-05, + "loss": 1.1009, + "step": 26826 + }, + { + "epoch": 0.335358383959599, + "grad_norm": 3.917541265487671, + "learning_rate": 1.68108762492821e-05, + "loss": 0.8175, + "step": 26828 + }, + { + "epoch": 0.33538338458461464, + "grad_norm": 0.38561272621154785, + "learning_rate": 1.681023723888601e-05, + "loss": 0.0631, + "step": 26830 + }, + { + "epoch": 0.33540838520963023, + "grad_norm": 6.379402160644531, + "learning_rate": 1.680959817662411e-05, + "loss": 1.333, + "step": 26832 + }, + { + "epoch": 0.3354333858346459, + "grad_norm": 2.421316146850586, + "learning_rate": 1.6808959062501276e-05, + "loss": 1.8193, + "step": 26834 + }, + { + "epoch": 0.3354583864596615, + "grad_norm": 2.4381377696990967, + "learning_rate": 1.6808319896522377e-05, + "loss": 0.6203, + "step": 26836 + }, + { + "epoch": 0.3354833870846771, + "grad_norm": 0.001158549333922565, + "learning_rate": 1.6807680678692275e-05, + "loss": 0.0409, + "step": 26838 + }, + { + "epoch": 0.33550838770969277, + "grad_norm": 3.890104055404663, + "learning_rate": 1.6807041409015834e-05, + "loss": 1.0558, + "step": 26840 + }, + { + "epoch": 0.33553338833470836, + "grad_norm": 0.001167629612609744, + "learning_rate": 1.6806402087497932e-05, + "loss": 0.4508, + "step": 26842 + }, + { + "epoch": 0.335558388959724, + "grad_norm": 2.7390267848968506, + "learning_rate": 1.6805762714143436e-05, + "loss": 0.8863, + "step": 26844 + }, + { + "epoch": 0.3355833895847396, + "grad_norm": 7.310038089752197, + "learning_rate": 1.6805123288957208e-05, + "loss": 1.7575, + "step": 26846 + }, + { + "epoch": 0.33560839020975525, + "grad_norm": 5.029323577880859, + "learning_rate": 1.6804483811944128e-05, + "loss": 2.0412, + "step": 26848 + }, + { + "epoch": 0.3356333908347709, + "grad_norm": 1.0455913543701172, + "learning_rate": 1.680384428310906e-05, + "loss": 0.136, + "step": 26850 + }, + { + "epoch": 0.3356583914597865, + "grad_norm": 2.775008201599121, + "learning_rate": 1.6803204702456878e-05, + "loss": 0.9238, + "step": 26852 + }, + { + "epoch": 0.33568339208480213, + "grad_norm": 2.456505060195923, + "learning_rate": 1.6802565069992448e-05, + "loss": 0.792, + "step": 26854 + }, + { + "epoch": 0.3357083927098177, + "grad_norm": 0.002257702639326453, + "learning_rate": 1.6801925385720648e-05, + "loss": 0.7204, + "step": 26856 + }, + { + "epoch": 0.3357333933348334, + "grad_norm": 4.1295623779296875, + "learning_rate": 1.6801285649646342e-05, + "loss": 1.1442, + "step": 26858 + }, + { + "epoch": 0.335758393959849, + "grad_norm": 6.073734760284424, + "learning_rate": 1.6800645861774407e-05, + "loss": 1.3409, + "step": 26860 + }, + { + "epoch": 0.3357833945848646, + "grad_norm": 0.002200842835009098, + "learning_rate": 1.680000602210972e-05, + "loss": 0.8212, + "step": 26862 + }, + { + "epoch": 0.33580839520988026, + "grad_norm": 2.3277666568756104, + "learning_rate": 1.6799366130657144e-05, + "loss": 0.9161, + "step": 26864 + }, + { + "epoch": 0.33583339583489585, + "grad_norm": 6.10488748550415, + "learning_rate": 1.6798726187421554e-05, + "loss": 2.4693, + "step": 26866 + }, + { + "epoch": 0.3358583964599115, + "grad_norm": 0.0014150369679555297, + "learning_rate": 1.6798086192407833e-05, + "loss": 1.3312, + "step": 26868 + }, + { + "epoch": 0.33588339708492715, + "grad_norm": 4.807626247406006, + "learning_rate": 1.679744614562085e-05, + "loss": 1.4542, + "step": 26870 + }, + { + "epoch": 0.33590839770994274, + "grad_norm": 0.018344314768910408, + "learning_rate": 1.6796806047065472e-05, + "loss": 0.0004, + "step": 26872 + }, + { + "epoch": 0.3359333983349584, + "grad_norm": 5.609313488006592, + "learning_rate": 1.6796165896746583e-05, + "loss": 0.7453, + "step": 26874 + }, + { + "epoch": 0.335958398959974, + "grad_norm": 7.203884601593018, + "learning_rate": 1.6795525694669058e-05, + "loss": 0.4788, + "step": 26876 + }, + { + "epoch": 0.33598339958498963, + "grad_norm": 0.0018833192298188806, + "learning_rate": 1.679488544083777e-05, + "loss": 0.0001, + "step": 26878 + }, + { + "epoch": 0.3360084002100053, + "grad_norm": 6.933934688568115, + "learning_rate": 1.679424513525759e-05, + "loss": 2.2827, + "step": 26880 + }, + { + "epoch": 0.33603340083502087, + "grad_norm": 2.8323144912719727, + "learning_rate": 1.6793604777933404e-05, + "loss": 0.5969, + "step": 26882 + }, + { + "epoch": 0.3360584014600365, + "grad_norm": 2.897830009460449, + "learning_rate": 1.6792964368870084e-05, + "loss": 0.6915, + "step": 26884 + }, + { + "epoch": 0.3360834020850521, + "grad_norm": 0.0014577490510419011, + "learning_rate": 1.6792323908072508e-05, + "loss": 0.0001, + "step": 26886 + }, + { + "epoch": 0.33610840271006776, + "grad_norm": 3.8434982299804688, + "learning_rate": 1.6791683395545556e-05, + "loss": 1.4102, + "step": 26888 + }, + { + "epoch": 0.3361334033350834, + "grad_norm": 6.052341938018799, + "learning_rate": 1.67910428312941e-05, + "loss": 1.0761, + "step": 26890 + }, + { + "epoch": 0.336158403960099, + "grad_norm": 4.519077777862549, + "learning_rate": 1.6790402215323022e-05, + "loss": 1.36, + "step": 26892 + }, + { + "epoch": 0.33618340458511464, + "grad_norm": 3.911280870437622, + "learning_rate": 1.6789761547637205e-05, + "loss": 1.8454, + "step": 26894 + }, + { + "epoch": 0.33620840521013023, + "grad_norm": 4.559387683868408, + "learning_rate": 1.678912082824152e-05, + "loss": 0.961, + "step": 26896 + }, + { + "epoch": 0.3362334058351459, + "grad_norm": 4.1559367179870605, + "learning_rate": 1.678848005714085e-05, + "loss": 1.2663, + "step": 26898 + }, + { + "epoch": 0.33625840646016153, + "grad_norm": 4.951423645019531, + "learning_rate": 1.6787839234340083e-05, + "loss": 0.9349, + "step": 26900 + }, + { + "epoch": 0.3362834070851771, + "grad_norm": 2.372957944869995, + "learning_rate": 1.6787198359844084e-05, + "loss": 0.9746, + "step": 26902 + }, + { + "epoch": 0.33630840771019277, + "grad_norm": 5.540175437927246, + "learning_rate": 1.6786557433657747e-05, + "loss": 1.566, + "step": 26904 + }, + { + "epoch": 0.33633340833520836, + "grad_norm": 1.7668687105178833, + "learning_rate": 1.6785916455785947e-05, + "loss": 0.6454, + "step": 26906 + }, + { + "epoch": 0.336358408960224, + "grad_norm": 2.649254322052002, + "learning_rate": 1.678527542623357e-05, + "loss": 1.0447, + "step": 26908 + }, + { + "epoch": 0.33638340958523966, + "grad_norm": 0.0019336321856826544, + "learning_rate": 1.678463434500549e-05, + "loss": 0.2657, + "step": 26910 + }, + { + "epoch": 0.33640841021025525, + "grad_norm": 1.9017804861068726, + "learning_rate": 1.6783993212106596e-05, + "loss": 0.117, + "step": 26912 + }, + { + "epoch": 0.3364334108352709, + "grad_norm": 3.33156156539917, + "learning_rate": 1.678335202754177e-05, + "loss": 0.6289, + "step": 26914 + }, + { + "epoch": 0.3364584114602865, + "grad_norm": 2.3827242851257324, + "learning_rate": 1.6782710791315894e-05, + "loss": 0.7893, + "step": 26916 + }, + { + "epoch": 0.33648341208530214, + "grad_norm": 1.803613305091858, + "learning_rate": 1.6782069503433848e-05, + "loss": 0.7722, + "step": 26918 + }, + { + "epoch": 0.3365084127103178, + "grad_norm": 3.1265065670013428, + "learning_rate": 1.6781428163900525e-05, + "loss": 0.9381, + "step": 26920 + }, + { + "epoch": 0.3365334133353334, + "grad_norm": 3.1028454303741455, + "learning_rate": 1.67807867727208e-05, + "loss": 1.2723, + "step": 26922 + }, + { + "epoch": 0.336558413960349, + "grad_norm": 4.31479549407959, + "learning_rate": 1.6780145329899565e-05, + "loss": 2.5228, + "step": 26924 + }, + { + "epoch": 0.3365834145853646, + "grad_norm": 5.471868991851807, + "learning_rate": 1.67795038354417e-05, + "loss": 0.7561, + "step": 26926 + }, + { + "epoch": 0.33660841521038026, + "grad_norm": 3.3012359142303467, + "learning_rate": 1.6778862289352095e-05, + "loss": 0.7804, + "step": 26928 + }, + { + "epoch": 0.3366334158353959, + "grad_norm": 3.928011655807495, + "learning_rate": 1.677822069163563e-05, + "loss": 1.5912, + "step": 26930 + }, + { + "epoch": 0.3366584164604115, + "grad_norm": 3.875393867492676, + "learning_rate": 1.67775790422972e-05, + "loss": 1.6766, + "step": 26932 + }, + { + "epoch": 0.33668341708542715, + "grad_norm": 2.7609713077545166, + "learning_rate": 1.677693734134168e-05, + "loss": 0.6253, + "step": 26934 + }, + { + "epoch": 0.33670841771044274, + "grad_norm": 3.8601624965667725, + "learning_rate": 1.6776295588773967e-05, + "loss": 1.3453, + "step": 26936 + }, + { + "epoch": 0.3367334183354584, + "grad_norm": 4.1120452880859375, + "learning_rate": 1.6775653784598946e-05, + "loss": 1.1286, + "step": 26938 + }, + { + "epoch": 0.33675841896047404, + "grad_norm": 0.3831344246864319, + "learning_rate": 1.6775011928821505e-05, + "loss": 0.0654, + "step": 26940 + }, + { + "epoch": 0.33678341958548963, + "grad_norm": 2.4300761222839355, + "learning_rate": 1.6774370021446523e-05, + "loss": 0.4035, + "step": 26942 + }, + { + "epoch": 0.3368084202105053, + "grad_norm": 5.425512790679932, + "learning_rate": 1.6773728062478903e-05, + "loss": 1.5677, + "step": 26944 + }, + { + "epoch": 0.33683342083552087, + "grad_norm": 3.5458083152770996, + "learning_rate": 1.6773086051923532e-05, + "loss": 1.4735, + "step": 26946 + }, + { + "epoch": 0.3368584214605365, + "grad_norm": 0.017606457695364952, + "learning_rate": 1.677244398978529e-05, + "loss": 0.0252, + "step": 26948 + }, + { + "epoch": 0.33688342208555216, + "grad_norm": 1.8240000009536743, + "learning_rate": 1.677180187606907e-05, + "loss": 1.1289, + "step": 26950 + }, + { + "epoch": 0.33690842271056776, + "grad_norm": 4.292431354522705, + "learning_rate": 1.677115971077977e-05, + "loss": 1.1098, + "step": 26952 + }, + { + "epoch": 0.3369334233355834, + "grad_norm": 2.1814041137695312, + "learning_rate": 1.677051749392227e-05, + "loss": 0.7756, + "step": 26954 + }, + { + "epoch": 0.336958423960599, + "grad_norm": 5.098667144775391, + "learning_rate": 1.6769875225501474e-05, + "loss": 0.8372, + "step": 26956 + }, + { + "epoch": 0.33698342458561464, + "grad_norm": 4.961780071258545, + "learning_rate": 1.676923290552226e-05, + "loss": 1.1776, + "step": 26958 + }, + { + "epoch": 0.3370084252106303, + "grad_norm": 2.962557554244995, + "learning_rate": 1.6768590533989523e-05, + "loss": 1.1855, + "step": 26960 + }, + { + "epoch": 0.3370334258356459, + "grad_norm": 3.7344682216644287, + "learning_rate": 1.6767948110908164e-05, + "loss": 1.1655, + "step": 26962 + }, + { + "epoch": 0.33705842646066153, + "grad_norm": 4.534800052642822, + "learning_rate": 1.6767305636283064e-05, + "loss": 1.2248, + "step": 26964 + }, + { + "epoch": 0.3370834270856771, + "grad_norm": 3.6911933422088623, + "learning_rate": 1.6766663110119123e-05, + "loss": 1.2377, + "step": 26966 + }, + { + "epoch": 0.33710842771069277, + "grad_norm": 3.357759952545166, + "learning_rate": 1.6766020532421234e-05, + "loss": 1.3647, + "step": 26968 + }, + { + "epoch": 0.3371334283357084, + "grad_norm": 4.45209264755249, + "learning_rate": 1.676537790319429e-05, + "loss": 1.3507, + "step": 26970 + }, + { + "epoch": 0.337158428960724, + "grad_norm": 4.24743127822876, + "learning_rate": 1.6764735222443183e-05, + "loss": 0.8269, + "step": 26972 + }, + { + "epoch": 0.33718342958573966, + "grad_norm": 3.2403063774108887, + "learning_rate": 1.676409249017281e-05, + "loss": 0.5977, + "step": 26974 + }, + { + "epoch": 0.33720843021075525, + "grad_norm": 1.3395072221755981, + "learning_rate": 1.6763449706388063e-05, + "loss": 0.1171, + "step": 26976 + }, + { + "epoch": 0.3372334308357709, + "grad_norm": 2.6118226051330566, + "learning_rate": 1.676280687109384e-05, + "loss": 2.2556, + "step": 26978 + }, + { + "epoch": 0.33725843146078655, + "grad_norm": 0.0008454936905764043, + "learning_rate": 1.676216398429504e-05, + "loss": 0.5623, + "step": 26980 + }, + { + "epoch": 0.33728343208580214, + "grad_norm": 5.3583292961120605, + "learning_rate": 1.6761521045996548e-05, + "loss": 0.5651, + "step": 26982 + }, + { + "epoch": 0.3373084327108178, + "grad_norm": 0.0015081496676430106, + "learning_rate": 1.6760878056203273e-05, + "loss": 0.0001, + "step": 26984 + }, + { + "epoch": 0.3373334333358334, + "grad_norm": 0.0016320667928084731, + "learning_rate": 1.676023501492011e-05, + "loss": 0.5723, + "step": 26986 + }, + { + "epoch": 0.337358433960849, + "grad_norm": 7.901464462280273, + "learning_rate": 1.675959192215195e-05, + "loss": 0.0576, + "step": 26988 + }, + { + "epoch": 0.33738343458586467, + "grad_norm": 6.1163530349731445, + "learning_rate": 1.675894877790369e-05, + "loss": 2.4178, + "step": 26990 + }, + { + "epoch": 0.33740843521088026, + "grad_norm": 2.6381618976593018, + "learning_rate": 1.6758305582180234e-05, + "loss": 0.9181, + "step": 26992 + }, + { + "epoch": 0.3374334358358959, + "grad_norm": 3.5237534046173096, + "learning_rate": 1.675766233498648e-05, + "loss": 1.1559, + "step": 26994 + }, + { + "epoch": 0.3374584364609115, + "grad_norm": 1.3654378652572632, + "learning_rate": 1.6757019036327323e-05, + "loss": 0.8148, + "step": 26996 + }, + { + "epoch": 0.33748343708592715, + "grad_norm": 4.057119369506836, + "learning_rate": 1.6756375686207668e-05, + "loss": 0.8589, + "step": 26998 + }, + { + "epoch": 0.3375084377109428, + "grad_norm": 2.1170527935028076, + "learning_rate": 1.675573228463241e-05, + "loss": 1.2034, + "step": 27000 + }, + { + "epoch": 0.3375334383359584, + "grad_norm": 8.785992622375488, + "learning_rate": 1.675508883160645e-05, + "loss": 1.4266, + "step": 27002 + }, + { + "epoch": 0.33755843896097404, + "grad_norm": 2.0291590690612793, + "learning_rate": 1.6754445327134688e-05, + "loss": 0.1914, + "step": 27004 + }, + { + "epoch": 0.33758343958598963, + "grad_norm": 5.054376125335693, + "learning_rate": 1.6753801771222026e-05, + "loss": 0.9336, + "step": 27006 + }, + { + "epoch": 0.3376084402110053, + "grad_norm": 1.778622031211853, + "learning_rate": 1.6753158163873363e-05, + "loss": 1.9439, + "step": 27008 + }, + { + "epoch": 0.3376334408360209, + "grad_norm": 2.0759689807891846, + "learning_rate": 1.6752514505093604e-05, + "loss": 0.2855, + "step": 27010 + }, + { + "epoch": 0.3376584414610365, + "grad_norm": 0.4280519187450409, + "learning_rate": 1.6751870794887653e-05, + "loss": 0.2205, + "step": 27012 + }, + { + "epoch": 0.33768344208605217, + "grad_norm": 2.3003194332122803, + "learning_rate": 1.6751227033260406e-05, + "loss": 1.5923, + "step": 27014 + }, + { + "epoch": 0.33770844271106776, + "grad_norm": 4.027867317199707, + "learning_rate": 1.6750583220216764e-05, + "loss": 0.8949, + "step": 27016 + }, + { + "epoch": 0.3377334433360834, + "grad_norm": 0.024169504642486572, + "learning_rate": 1.674993935576164e-05, + "loss": 0.011, + "step": 27018 + }, + { + "epoch": 0.33775844396109905, + "grad_norm": 3.7099897861480713, + "learning_rate": 1.6749295439899933e-05, + "loss": 1.3015, + "step": 27020 + }, + { + "epoch": 0.33778344458611465, + "grad_norm": 5.209362983703613, + "learning_rate": 1.6748651472636545e-05, + "loss": 1.2097, + "step": 27022 + }, + { + "epoch": 0.3378084452111303, + "grad_norm": 5.1014509201049805, + "learning_rate": 1.674800745397638e-05, + "loss": 1.9842, + "step": 27024 + }, + { + "epoch": 0.3378334458361459, + "grad_norm": 3.8215200901031494, + "learning_rate": 1.6747363383924345e-05, + "loss": 0.7701, + "step": 27026 + }, + { + "epoch": 0.33785844646116153, + "grad_norm": 4.585231781005859, + "learning_rate": 1.6746719262485345e-05, + "loss": 1.3265, + "step": 27028 + }, + { + "epoch": 0.3378834470861772, + "grad_norm": 3.8109278678894043, + "learning_rate": 1.6746075089664286e-05, + "loss": 0.2008, + "step": 27030 + }, + { + "epoch": 0.33790844771119277, + "grad_norm": 2.022578001022339, + "learning_rate": 1.674543086546607e-05, + "loss": 0.6043, + "step": 27032 + }, + { + "epoch": 0.3379334483362084, + "grad_norm": 6.547144412994385, + "learning_rate": 1.6744786589895613e-05, + "loss": 0.9655, + "step": 27034 + }, + { + "epoch": 0.337958448961224, + "grad_norm": 4.622925758361816, + "learning_rate": 1.674414226295781e-05, + "loss": 0.842, + "step": 27036 + }, + { + "epoch": 0.33798344958623966, + "grad_norm": 4.173982620239258, + "learning_rate": 1.6743497884657576e-05, + "loss": 1.6361, + "step": 27038 + }, + { + "epoch": 0.3380084502112553, + "grad_norm": 3.5223724842071533, + "learning_rate": 1.6742853454999814e-05, + "loss": 0.8999, + "step": 27040 + }, + { + "epoch": 0.3380334508362709, + "grad_norm": 3.598902702331543, + "learning_rate": 1.6742208973989433e-05, + "loss": 0.7521, + "step": 27042 + }, + { + "epoch": 0.33805845146128655, + "grad_norm": 5.056869983673096, + "learning_rate": 1.6741564441631344e-05, + "loss": 2.0467, + "step": 27044 + }, + { + "epoch": 0.33808345208630214, + "grad_norm": 3.624662399291992, + "learning_rate": 1.6740919857930453e-05, + "loss": 1.2507, + "step": 27046 + }, + { + "epoch": 0.3381084527113178, + "grad_norm": 4.065524101257324, + "learning_rate": 1.6740275222891666e-05, + "loss": 1.7702, + "step": 27048 + }, + { + "epoch": 0.33813345333633343, + "grad_norm": 3.891965389251709, + "learning_rate": 1.6739630536519898e-05, + "loss": 1.4554, + "step": 27050 + }, + { + "epoch": 0.338158453961349, + "grad_norm": 3.174922227859497, + "learning_rate": 1.673898579882006e-05, + "loss": 1.5636, + "step": 27052 + }, + { + "epoch": 0.3381834545863647, + "grad_norm": 2.9421756267547607, + "learning_rate": 1.673834100979706e-05, + "loss": 0.879, + "step": 27054 + }, + { + "epoch": 0.33820845521138027, + "grad_norm": 2.463520050048828, + "learning_rate": 1.6737696169455802e-05, + "loss": 1.0479, + "step": 27056 + }, + { + "epoch": 0.3382334558363959, + "grad_norm": 4.430275917053223, + "learning_rate": 1.6737051277801205e-05, + "loss": 2.1205, + "step": 27058 + }, + { + "epoch": 0.33825845646141156, + "grad_norm": 1.6844983100891113, + "learning_rate": 1.6736406334838176e-05, + "loss": 1.582, + "step": 27060 + }, + { + "epoch": 0.33828345708642715, + "grad_norm": 1.1215065717697144, + "learning_rate": 1.6735761340571636e-05, + "loss": 0.3808, + "step": 27062 + }, + { + "epoch": 0.3383084577114428, + "grad_norm": 2.8455398082733154, + "learning_rate": 1.6735116295006487e-05, + "loss": 1.0201, + "step": 27064 + }, + { + "epoch": 0.3383334583364584, + "grad_norm": 3.1025354862213135, + "learning_rate": 1.6734471198147638e-05, + "loss": 1.2876, + "step": 27066 + }, + { + "epoch": 0.33835845896147404, + "grad_norm": 2.78240966796875, + "learning_rate": 1.6733826050000016e-05, + "loss": 1.457, + "step": 27068 + }, + { + "epoch": 0.3383834595864897, + "grad_norm": 2.6812562942504883, + "learning_rate": 1.6733180850568527e-05, + "loss": 1.4594, + "step": 27070 + }, + { + "epoch": 0.3384084602115053, + "grad_norm": 3.8353161811828613, + "learning_rate": 1.6732535599858083e-05, + "loss": 1.5371, + "step": 27072 + }, + { + "epoch": 0.3384334608365209, + "grad_norm": 3.9622607231140137, + "learning_rate": 1.6731890297873597e-05, + "loss": 1.4404, + "step": 27074 + }, + { + "epoch": 0.3384584614615365, + "grad_norm": 3.5870726108551025, + "learning_rate": 1.673124494461999e-05, + "loss": 1.6696, + "step": 27076 + }, + { + "epoch": 0.33848346208655217, + "grad_norm": 4.190028667449951, + "learning_rate": 1.6730599540102176e-05, + "loss": 0.9638, + "step": 27078 + }, + { + "epoch": 0.3385084627115678, + "grad_norm": 3.4259352684020996, + "learning_rate": 1.6729954084325066e-05, + "loss": 0.3728, + "step": 27080 + }, + { + "epoch": 0.3385334633365834, + "grad_norm": 3.4880034923553467, + "learning_rate": 1.672930857729357e-05, + "loss": 0.9844, + "step": 27082 + }, + { + "epoch": 0.33855846396159905, + "grad_norm": 2.6336257457733154, + "learning_rate": 1.672866301901262e-05, + "loss": 0.7316, + "step": 27084 + }, + { + "epoch": 0.33858346458661465, + "grad_norm": 3.8251829147338867, + "learning_rate": 1.672801740948712e-05, + "loss": 1.5374, + "step": 27086 + }, + { + "epoch": 0.3386084652116303, + "grad_norm": 4.276794910430908, + "learning_rate": 1.6727371748721993e-05, + "loss": 2.0426, + "step": 27088 + }, + { + "epoch": 0.33863346583664594, + "grad_norm": 4.8074798583984375, + "learning_rate": 1.6726726036722153e-05, + "loss": 0.6679, + "step": 27090 + }, + { + "epoch": 0.33865846646166153, + "grad_norm": 3.895646572113037, + "learning_rate": 1.6726080273492523e-05, + "loss": 1.3301, + "step": 27092 + }, + { + "epoch": 0.3386834670866772, + "grad_norm": 0.8742340207099915, + "learning_rate": 1.6725434459038007e-05, + "loss": 1.1901, + "step": 27094 + }, + { + "epoch": 0.3387084677116928, + "grad_norm": 4.442474842071533, + "learning_rate": 1.6724788593363544e-05, + "loss": 0.9221, + "step": 27096 + }, + { + "epoch": 0.3387334683367084, + "grad_norm": 3.990562677383423, + "learning_rate": 1.6724142676474037e-05, + "loss": 1.1149, + "step": 27098 + }, + { + "epoch": 0.33875846896172407, + "grad_norm": 4.311432361602783, + "learning_rate": 1.672349670837441e-05, + "loss": 0.8261, + "step": 27100 + }, + { + "epoch": 0.33878346958673966, + "grad_norm": 4.614291667938232, + "learning_rate": 1.6722850689069582e-05, + "loss": 1.2673, + "step": 27102 + }, + { + "epoch": 0.3388084702117553, + "grad_norm": 4.982415676116943, + "learning_rate": 1.6722204618564477e-05, + "loss": 1.2852, + "step": 27104 + }, + { + "epoch": 0.3388334708367709, + "grad_norm": 1.4191862344741821, + "learning_rate": 1.672155849686401e-05, + "loss": 0.0975, + "step": 27106 + }, + { + "epoch": 0.33885847146178655, + "grad_norm": 5.329476356506348, + "learning_rate": 1.67209123239731e-05, + "loss": 1.3211, + "step": 27108 + }, + { + "epoch": 0.3388834720868022, + "grad_norm": 5.530961513519287, + "learning_rate": 1.672026609989668e-05, + "loss": 1.327, + "step": 27110 + }, + { + "epoch": 0.3389084727118178, + "grad_norm": 5.129233360290527, + "learning_rate": 1.671961982463966e-05, + "loss": 1.6755, + "step": 27112 + }, + { + "epoch": 0.33893347333683344, + "grad_norm": 1.3616589307785034, + "learning_rate": 1.6718973498206965e-05, + "loss": 1.2117, + "step": 27114 + }, + { + "epoch": 0.338958473961849, + "grad_norm": 4.817745208740234, + "learning_rate": 1.671832712060352e-05, + "loss": 1.5795, + "step": 27116 + }, + { + "epoch": 0.3389834745868647, + "grad_norm": 2.8539083003997803, + "learning_rate": 1.6717680691834244e-05, + "loss": 0.7836, + "step": 27118 + }, + { + "epoch": 0.3390084752118803, + "grad_norm": 4.130643367767334, + "learning_rate": 1.671703421190406e-05, + "loss": 0.4789, + "step": 27120 + }, + { + "epoch": 0.3390334758368959, + "grad_norm": 2.9040799140930176, + "learning_rate": 1.67163876808179e-05, + "loss": 1.5189, + "step": 27122 + }, + { + "epoch": 0.33905847646191156, + "grad_norm": 3.591825485229492, + "learning_rate": 1.6715741098580675e-05, + "loss": 0.7808, + "step": 27124 + }, + { + "epoch": 0.33908347708692715, + "grad_norm": 3.3278403282165527, + "learning_rate": 1.6715094465197317e-05, + "loss": 1.2958, + "step": 27126 + }, + { + "epoch": 0.3391084777119428, + "grad_norm": 1.5616577863693237, + "learning_rate": 1.671444778067275e-05, + "loss": 0.588, + "step": 27128 + }, + { + "epoch": 0.33913347833695845, + "grad_norm": 1.8638992309570312, + "learning_rate": 1.67138010450119e-05, + "loss": 0.4138, + "step": 27130 + }, + { + "epoch": 0.33915847896197404, + "grad_norm": 2.495452642440796, + "learning_rate": 1.6713154258219687e-05, + "loss": 1.6396, + "step": 27132 + }, + { + "epoch": 0.3391834795869897, + "grad_norm": 1.9015549421310425, + "learning_rate": 1.671250742030104e-05, + "loss": 0.2848, + "step": 27134 + }, + { + "epoch": 0.3392084802120053, + "grad_norm": 5.479132175445557, + "learning_rate": 1.671186053126089e-05, + "loss": 0.4238, + "step": 27136 + }, + { + "epoch": 0.33923348083702093, + "grad_norm": 3.881340265274048, + "learning_rate": 1.6711213591104157e-05, + "loss": 1.6319, + "step": 27138 + }, + { + "epoch": 0.3392584814620366, + "grad_norm": 2.847935438156128, + "learning_rate": 1.671056659983577e-05, + "loss": 1.2054, + "step": 27140 + }, + { + "epoch": 0.33928348208705217, + "grad_norm": 4.5075788497924805, + "learning_rate": 1.6709919557460656e-05, + "loss": 0.8851, + "step": 27142 + }, + { + "epoch": 0.3393084827120678, + "grad_norm": 1.915912389755249, + "learning_rate": 1.6709272463983747e-05, + "loss": 0.421, + "step": 27144 + }, + { + "epoch": 0.3393334833370834, + "grad_norm": 3.696152925491333, + "learning_rate": 1.6708625319409964e-05, + "loss": 1.1883, + "step": 27146 + }, + { + "epoch": 0.33935848396209906, + "grad_norm": 2.4265353679656982, + "learning_rate": 1.670797812374424e-05, + "loss": 1.4057, + "step": 27148 + }, + { + "epoch": 0.3393834845871147, + "grad_norm": 7.049619674682617, + "learning_rate": 1.67073308769915e-05, + "loss": 0.9373, + "step": 27150 + }, + { + "epoch": 0.3394084852121303, + "grad_norm": 2.0653343200683594, + "learning_rate": 1.6706683579156684e-05, + "loss": 0.4993, + "step": 27152 + }, + { + "epoch": 0.33943348583714594, + "grad_norm": 4.687279224395752, + "learning_rate": 1.6706036230244708e-05, + "loss": 1.5503, + "step": 27154 + }, + { + "epoch": 0.33945848646216153, + "grad_norm": 3.3715877532958984, + "learning_rate": 1.670538883026051e-05, + "loss": 0.4878, + "step": 27156 + }, + { + "epoch": 0.3394834870871772, + "grad_norm": 1.2562620639801025, + "learning_rate": 1.6704741379209022e-05, + "loss": 0.0841, + "step": 27158 + }, + { + "epoch": 0.33950848771219283, + "grad_norm": 11.899784088134766, + "learning_rate": 1.670409387709517e-05, + "loss": 1.2858, + "step": 27160 + }, + { + "epoch": 0.3395334883372084, + "grad_norm": 7.594020366668701, + "learning_rate": 1.6703446323923887e-05, + "loss": 1.5925, + "step": 27162 + }, + { + "epoch": 0.33955848896222407, + "grad_norm": 2.4500224590301514, + "learning_rate": 1.6702798719700104e-05, + "loss": 0.7325, + "step": 27164 + }, + { + "epoch": 0.33958348958723966, + "grad_norm": 3.115140676498413, + "learning_rate": 1.6702151064428755e-05, + "loss": 1.1889, + "step": 27166 + }, + { + "epoch": 0.3396084902122553, + "grad_norm": 3.846489191055298, + "learning_rate": 1.670150335811477e-05, + "loss": 1.2162, + "step": 27168 + }, + { + "epoch": 0.33963349083727096, + "grad_norm": 4.660043716430664, + "learning_rate": 1.6700855600763083e-05, + "loss": 1.6722, + "step": 27170 + }, + { + "epoch": 0.33965849146228655, + "grad_norm": 0.1992308646440506, + "learning_rate": 1.670020779237863e-05, + "loss": 0.1475, + "step": 27172 + }, + { + "epoch": 0.3396834920873022, + "grad_norm": 3.9755859375, + "learning_rate": 1.6699559932966337e-05, + "loss": 1.6933, + "step": 27174 + }, + { + "epoch": 0.3397084927123178, + "grad_norm": 2.403688907623291, + "learning_rate": 1.6698912022531147e-05, + "loss": 0.5184, + "step": 27176 + }, + { + "epoch": 0.33973349333733344, + "grad_norm": 2.4231457710266113, + "learning_rate": 1.6698264061077988e-05, + "loss": 0.3299, + "step": 27178 + }, + { + "epoch": 0.3397584939623491, + "grad_norm": 2.8058650493621826, + "learning_rate": 1.6697616048611798e-05, + "loss": 1.1882, + "step": 27180 + }, + { + "epoch": 0.3397834945873647, + "grad_norm": 2.5725622177124023, + "learning_rate": 1.6696967985137515e-05, + "loss": 1.3914, + "step": 27182 + }, + { + "epoch": 0.3398084952123803, + "grad_norm": 4.3705644607543945, + "learning_rate": 1.6696319870660067e-05, + "loss": 0.8618, + "step": 27184 + }, + { + "epoch": 0.3398334958373959, + "grad_norm": 0.0009366553858853877, + "learning_rate": 1.6695671705184393e-05, + "loss": 0.0001, + "step": 27186 + }, + { + "epoch": 0.33985849646241156, + "grad_norm": 5.800314903259277, + "learning_rate": 1.669502348871543e-05, + "loss": 0.7927, + "step": 27188 + }, + { + "epoch": 0.3398834970874272, + "grad_norm": 0.0006312512559816241, + "learning_rate": 1.669437522125812e-05, + "loss": 0.0, + "step": 27190 + }, + { + "epoch": 0.3399084977124428, + "grad_norm": 1.9037117958068848, + "learning_rate": 1.669372690281739e-05, + "loss": 0.7691, + "step": 27192 + }, + { + "epoch": 0.33993349833745845, + "grad_norm": 3.6438868045806885, + "learning_rate": 1.6693078533398183e-05, + "loss": 1.0801, + "step": 27194 + }, + { + "epoch": 0.33995849896247404, + "grad_norm": 4.389482498168945, + "learning_rate": 1.6692430113005435e-05, + "loss": 1.2672, + "step": 27196 + }, + { + "epoch": 0.3399834995874897, + "grad_norm": 3.608067512512207, + "learning_rate": 1.669178164164409e-05, + "loss": 0.4141, + "step": 27198 + }, + { + "epoch": 0.34000850021250534, + "grad_norm": 3.0600826740264893, + "learning_rate": 1.669113311931908e-05, + "loss": 0.9997, + "step": 27200 + }, + { + "epoch": 0.34003350083752093, + "grad_norm": 9.735257148742676, + "learning_rate": 1.6690484546035346e-05, + "loss": 2.1977, + "step": 27202 + }, + { + "epoch": 0.3400585014625366, + "grad_norm": 5.1956257820129395, + "learning_rate": 1.668983592179783e-05, + "loss": 0.0788, + "step": 27204 + }, + { + "epoch": 0.34008350208755217, + "grad_norm": 3.5278823375701904, + "learning_rate": 1.6689187246611468e-05, + "loss": 0.5947, + "step": 27206 + }, + { + "epoch": 0.3401085027125678, + "grad_norm": 3.8763082027435303, + "learning_rate": 1.6688538520481202e-05, + "loss": 1.8212, + "step": 27208 + }, + { + "epoch": 0.34013350333758346, + "grad_norm": 3.724128007888794, + "learning_rate": 1.668788974341197e-05, + "loss": 0.6784, + "step": 27210 + }, + { + "epoch": 0.34015850396259906, + "grad_norm": 1.630030870437622, + "learning_rate": 1.6687240915408717e-05, + "loss": 1.4099, + "step": 27212 + }, + { + "epoch": 0.3401835045876147, + "grad_norm": 5.36702299118042, + "learning_rate": 1.6686592036476385e-05, + "loss": 0.9024, + "step": 27214 + }, + { + "epoch": 0.3402085052126303, + "grad_norm": 3.2790725231170654, + "learning_rate": 1.6685943106619913e-05, + "loss": 0.5724, + "step": 27216 + }, + { + "epoch": 0.34023350583764594, + "grad_norm": 3.5688838958740234, + "learning_rate": 1.6685294125844242e-05, + "loss": 2.0031, + "step": 27218 + }, + { + "epoch": 0.3402585064626616, + "grad_norm": 8.663154602050781, + "learning_rate": 1.6684645094154318e-05, + "loss": 1.0263, + "step": 27220 + }, + { + "epoch": 0.3402835070876772, + "grad_norm": 4.506411552429199, + "learning_rate": 1.6683996011555082e-05, + "loss": 1.0591, + "step": 27222 + }, + { + "epoch": 0.34030850771269283, + "grad_norm": 2.923316240310669, + "learning_rate": 1.6683346878051476e-05, + "loss": 1.2785, + "step": 27224 + }, + { + "epoch": 0.3403335083377084, + "grad_norm": 9.279181480407715, + "learning_rate": 1.6682697693648447e-05, + "loss": 0.7886, + "step": 27226 + }, + { + "epoch": 0.34035850896272407, + "grad_norm": 5.127459526062012, + "learning_rate": 1.6682048458350936e-05, + "loss": 0.8538, + "step": 27228 + }, + { + "epoch": 0.3403835095877397, + "grad_norm": 0.2604948878288269, + "learning_rate": 1.668139917216389e-05, + "loss": 0.1965, + "step": 27230 + }, + { + "epoch": 0.3404085102127553, + "grad_norm": 3.8906774520874023, + "learning_rate": 1.668074983509225e-05, + "loss": 0.8281, + "step": 27232 + }, + { + "epoch": 0.34043351083777096, + "grad_norm": 0.06132880970835686, + "learning_rate": 1.668010044714097e-05, + "loss": 0.3926, + "step": 27234 + }, + { + "epoch": 0.34045851146278655, + "grad_norm": 2.815077781677246, + "learning_rate": 1.6679451008314985e-05, + "loss": 0.0726, + "step": 27236 + }, + { + "epoch": 0.3404835120878022, + "grad_norm": 5.185252666473389, + "learning_rate": 1.667880151861925e-05, + "loss": 1.3894, + "step": 27238 + }, + { + "epoch": 0.34050851271281785, + "grad_norm": 4.919657230377197, + "learning_rate": 1.66781519780587e-05, + "loss": 1.3212, + "step": 27240 + }, + { + "epoch": 0.34053351333783344, + "grad_norm": 0.7306010723114014, + "learning_rate": 1.6677502386638295e-05, + "loss": 0.3645, + "step": 27242 + }, + { + "epoch": 0.3405585139628491, + "grad_norm": 3.2696518898010254, + "learning_rate": 1.6676852744362974e-05, + "loss": 0.6989, + "step": 27244 + }, + { + "epoch": 0.3405835145878647, + "grad_norm": 3.301414966583252, + "learning_rate": 1.6676203051237683e-05, + "loss": 1.2995, + "step": 27246 + }, + { + "epoch": 0.3406085152128803, + "grad_norm": 4.6136369705200195, + "learning_rate": 1.667555330726738e-05, + "loss": 1.6059, + "step": 27248 + }, + { + "epoch": 0.340633515837896, + "grad_norm": 4.623034954071045, + "learning_rate": 1.6674903512457007e-05, + "loss": 1.2696, + "step": 27250 + }, + { + "epoch": 0.34065851646291156, + "grad_norm": 1.5972672700881958, + "learning_rate": 1.6674253666811508e-05, + "loss": 0.0985, + "step": 27252 + }, + { + "epoch": 0.3406835170879272, + "grad_norm": 2.586637258529663, + "learning_rate": 1.667360377033584e-05, + "loss": 0.5955, + "step": 27254 + }, + { + "epoch": 0.3407085177129428, + "grad_norm": 3.4755656719207764, + "learning_rate": 1.6672953823034945e-05, + "loss": 0.4666, + "step": 27256 + }, + { + "epoch": 0.34073351833795845, + "grad_norm": 3.7454912662506104, + "learning_rate": 1.6672303824913784e-05, + "loss": 1.1478, + "step": 27258 + }, + { + "epoch": 0.3407585189629741, + "grad_norm": 5.54393196105957, + "learning_rate": 1.6671653775977298e-05, + "loss": 1.1874, + "step": 27260 + }, + { + "epoch": 0.3407835195879897, + "grad_norm": 0.5865609049797058, + "learning_rate": 1.6671003676230436e-05, + "loss": 0.0255, + "step": 27262 + }, + { + "epoch": 0.34080852021300534, + "grad_norm": 6.104053020477295, + "learning_rate": 1.6670353525678158e-05, + "loss": 1.1117, + "step": 27264 + }, + { + "epoch": 0.34083352083802093, + "grad_norm": 1.547965168952942, + "learning_rate": 1.666970332432541e-05, + "loss": 0.7013, + "step": 27266 + }, + { + "epoch": 0.3408585214630366, + "grad_norm": 5.516486167907715, + "learning_rate": 1.6669053072177145e-05, + "loss": 1.3874, + "step": 27268 + }, + { + "epoch": 0.3408835220880522, + "grad_norm": 3.9978997707366943, + "learning_rate": 1.6668402769238314e-05, + "loss": 0.4007, + "step": 27270 + }, + { + "epoch": 0.3409085227130678, + "grad_norm": 0.0013712325599044561, + "learning_rate": 1.666775241551387e-05, + "loss": 0.8149, + "step": 27272 + }, + { + "epoch": 0.34093352333808347, + "grad_norm": 3.620818614959717, + "learning_rate": 1.666710201100877e-05, + "loss": 1.4402, + "step": 27274 + }, + { + "epoch": 0.34095852396309906, + "grad_norm": 0.0007846341468393803, + "learning_rate": 1.666645155572796e-05, + "loss": 0.0014, + "step": 27276 + }, + { + "epoch": 0.3409835245881147, + "grad_norm": 9.354205131530762, + "learning_rate": 1.6665801049676397e-05, + "loss": 1.8517, + "step": 27278 + }, + { + "epoch": 0.34100852521313035, + "grad_norm": 3.922544002532959, + "learning_rate": 1.666515049285904e-05, + "loss": 1.697, + "step": 27280 + }, + { + "epoch": 0.34103352583814595, + "grad_norm": 3.967600107192993, + "learning_rate": 1.6664499885280835e-05, + "loss": 1.1356, + "step": 27282 + }, + { + "epoch": 0.3410585264631616, + "grad_norm": 3.869156837463379, + "learning_rate": 1.6663849226946746e-05, + "loss": 1.3483, + "step": 27284 + }, + { + "epoch": 0.3410835270881772, + "grad_norm": 3.596993923187256, + "learning_rate": 1.6663198517861722e-05, + "loss": 0.6599, + "step": 27286 + }, + { + "epoch": 0.34110852771319283, + "grad_norm": 2.426731586456299, + "learning_rate": 1.6662547758030717e-05, + "loss": 1.3276, + "step": 27288 + }, + { + "epoch": 0.3411335283382085, + "grad_norm": 3.9899959564208984, + "learning_rate": 1.666189694745869e-05, + "loss": 1.1224, + "step": 27290 + }, + { + "epoch": 0.3411585289632241, + "grad_norm": 1.8992446660995483, + "learning_rate": 1.6661246086150608e-05, + "loss": 0.7661, + "step": 27292 + }, + { + "epoch": 0.3411835295882397, + "grad_norm": 3.34918212890625, + "learning_rate": 1.6660595174111408e-05, + "loss": 1.2271, + "step": 27294 + }, + { + "epoch": 0.3412085302132553, + "grad_norm": 3.8108749389648438, + "learning_rate": 1.6659944211346057e-05, + "loss": 0.9366, + "step": 27296 + }, + { + "epoch": 0.34123353083827096, + "grad_norm": 5.467159748077393, + "learning_rate": 1.665929319785952e-05, + "loss": 1.6227, + "step": 27298 + }, + { + "epoch": 0.3412585314632866, + "grad_norm": 2.4571444988250732, + "learning_rate": 1.6658642133656742e-05, + "loss": 0.9863, + "step": 27300 + }, + { + "epoch": 0.3412835320883022, + "grad_norm": 13.66543197631836, + "learning_rate": 1.6657991018742685e-05, + "loss": 1.4208, + "step": 27302 + }, + { + "epoch": 0.34130853271331785, + "grad_norm": 7.5149431228637695, + "learning_rate": 1.6657339853122315e-05, + "loss": 1.8003, + "step": 27304 + }, + { + "epoch": 0.34133353333833344, + "grad_norm": 4.992807388305664, + "learning_rate": 1.6656688636800586e-05, + "loss": 1.8268, + "step": 27306 + }, + { + "epoch": 0.3413585339633491, + "grad_norm": 2.238987684249878, + "learning_rate": 1.665603736978245e-05, + "loss": 1.21, + "step": 27308 + }, + { + "epoch": 0.34138353458836473, + "grad_norm": 3.3437225818634033, + "learning_rate": 1.6655386052072883e-05, + "loss": 0.7504, + "step": 27310 + }, + { + "epoch": 0.3414085352133803, + "grad_norm": 0.8925546407699585, + "learning_rate": 1.6654734683676837e-05, + "loss": 0.9852, + "step": 27312 + }, + { + "epoch": 0.341433535838396, + "grad_norm": 4.703874111175537, + "learning_rate": 1.665408326459927e-05, + "loss": 1.0211, + "step": 27314 + }, + { + "epoch": 0.34145853646341157, + "grad_norm": 0.7247998118400574, + "learning_rate": 1.6653431794845144e-05, + "loss": 0.0425, + "step": 27316 + }, + { + "epoch": 0.3414835370884272, + "grad_norm": 2.3685688972473145, + "learning_rate": 1.6652780274419426e-05, + "loss": 1.6766, + "step": 27318 + }, + { + "epoch": 0.34150853771344286, + "grad_norm": 3.211961269378662, + "learning_rate": 1.665212870332707e-05, + "loss": 1.1626, + "step": 27320 + }, + { + "epoch": 0.34153353833845845, + "grad_norm": 4.899713516235352, + "learning_rate": 1.6651477081573046e-05, + "loss": 0.9287, + "step": 27322 + }, + { + "epoch": 0.3415585389634741, + "grad_norm": 4.005167007446289, + "learning_rate": 1.6650825409162316e-05, + "loss": 1.1235, + "step": 27324 + }, + { + "epoch": 0.3415835395884897, + "grad_norm": 4.650477886199951, + "learning_rate": 1.6650173686099833e-05, + "loss": 0.2775, + "step": 27326 + }, + { + "epoch": 0.34160854021350534, + "grad_norm": 5.500977039337158, + "learning_rate": 1.6649521912390573e-05, + "loss": 1.8233, + "step": 27328 + }, + { + "epoch": 0.341633540838521, + "grad_norm": 3.664032459259033, + "learning_rate": 1.664887008803949e-05, + "loss": 0.8208, + "step": 27330 + }, + { + "epoch": 0.3416585414635366, + "grad_norm": 5.257888317108154, + "learning_rate": 1.6648218213051558e-05, + "loss": 0.9676, + "step": 27332 + }, + { + "epoch": 0.34168354208855223, + "grad_norm": 0.0013239237014204264, + "learning_rate": 1.6647566287431733e-05, + "loss": 0.7067, + "step": 27334 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 2.867218017578125, + "learning_rate": 1.6646914311184983e-05, + "loss": 1.3106, + "step": 27336 + }, + { + "epoch": 0.34173354333858347, + "grad_norm": 3.6406283378601074, + "learning_rate": 1.6646262284316275e-05, + "loss": 1.6829, + "step": 27338 + }, + { + "epoch": 0.3417585439635991, + "grad_norm": 2.9856371879577637, + "learning_rate": 1.664561020683057e-05, + "loss": 1.6981, + "step": 27340 + }, + { + "epoch": 0.3417835445886147, + "grad_norm": 3.116720199584961, + "learning_rate": 1.664495807873284e-05, + "loss": 1.6624, + "step": 27342 + }, + { + "epoch": 0.34180854521363035, + "grad_norm": 3.317439079284668, + "learning_rate": 1.664430590002805e-05, + "loss": 1.1071, + "step": 27344 + }, + { + "epoch": 0.34183354583864595, + "grad_norm": 6.617164134979248, + "learning_rate": 1.6643653670721165e-05, + "loss": 0.8585, + "step": 27346 + }, + { + "epoch": 0.3418585464636616, + "grad_norm": 2.0318474769592285, + "learning_rate": 1.6643001390817152e-05, + "loss": 0.5188, + "step": 27348 + }, + { + "epoch": 0.34188354708867724, + "grad_norm": 5.898661136627197, + "learning_rate": 1.6642349060320983e-05, + "loss": 2.0232, + "step": 27350 + }, + { + "epoch": 0.34190854771369283, + "grad_norm": 2.309203863143921, + "learning_rate": 1.664169667923762e-05, + "loss": 0.8675, + "step": 27352 + }, + { + "epoch": 0.3419335483387085, + "grad_norm": 4.951773643493652, + "learning_rate": 1.6641044247572032e-05, + "loss": 0.971, + "step": 27354 + }, + { + "epoch": 0.3419585489637241, + "grad_norm": 0.11512216925621033, + "learning_rate": 1.6640391765329192e-05, + "loss": 0.4091, + "step": 27356 + }, + { + "epoch": 0.3419835495887397, + "grad_norm": 2.9367382526397705, + "learning_rate": 1.663973923251407e-05, + "loss": 0.3682, + "step": 27358 + }, + { + "epoch": 0.34200855021375537, + "grad_norm": 0.10368678718805313, + "learning_rate": 1.6639086649131634e-05, + "loss": 0.2068, + "step": 27360 + }, + { + "epoch": 0.34203355083877096, + "grad_norm": 2.3563790321350098, + "learning_rate": 1.663843401518685e-05, + "loss": 1.049, + "step": 27362 + }, + { + "epoch": 0.3420585514637866, + "grad_norm": 0.0009810581104829907, + "learning_rate": 1.663778133068469e-05, + "loss": 1.0774, + "step": 27364 + }, + { + "epoch": 0.3420835520888022, + "grad_norm": 2.8466808795928955, + "learning_rate": 1.6637128595630126e-05, + "loss": 0.5782, + "step": 27366 + }, + { + "epoch": 0.34210855271381785, + "grad_norm": 4.1979217529296875, + "learning_rate": 1.663647581002813e-05, + "loss": 1.692, + "step": 27368 + }, + { + "epoch": 0.3421335533388335, + "grad_norm": 2.859903335571289, + "learning_rate": 1.6635822973883678e-05, + "loss": 0.4903, + "step": 27370 + }, + { + "epoch": 0.3421585539638491, + "grad_norm": 3.5242908000946045, + "learning_rate": 1.663517008720173e-05, + "loss": 0.6417, + "step": 27372 + }, + { + "epoch": 0.34218355458886474, + "grad_norm": 5.7976789474487305, + "learning_rate": 1.663451714998727e-05, + "loss": 1.0784, + "step": 27374 + }, + { + "epoch": 0.3422085552138803, + "grad_norm": 2.101045846939087, + "learning_rate": 1.663386416224526e-05, + "loss": 1.1753, + "step": 27376 + }, + { + "epoch": 0.342233555838896, + "grad_norm": 4.454558372497559, + "learning_rate": 1.6633211123980682e-05, + "loss": 1.0064, + "step": 27378 + }, + { + "epoch": 0.3422585564639116, + "grad_norm": 0.641855001449585, + "learning_rate": 1.6632558035198504e-05, + "loss": 0.027, + "step": 27380 + }, + { + "epoch": 0.3422835570889272, + "grad_norm": 4.034873962402344, + "learning_rate": 1.6631904895903702e-05, + "loss": 1.8422, + "step": 27382 + }, + { + "epoch": 0.34230855771394286, + "grad_norm": 2.136852264404297, + "learning_rate": 1.6631251706101255e-05, + "loss": 1.3025, + "step": 27384 + }, + { + "epoch": 0.34233355833895845, + "grad_norm": 4.7053446769714355, + "learning_rate": 1.6630598465796127e-05, + "loss": 1.9389, + "step": 27386 + }, + { + "epoch": 0.3423585589639741, + "grad_norm": 17.85112762451172, + "learning_rate": 1.66299451749933e-05, + "loss": 1.8378, + "step": 27388 + }, + { + "epoch": 0.34238355958898975, + "grad_norm": 1.1982519626617432, + "learning_rate": 1.662929183369775e-05, + "loss": 0.2325, + "step": 27390 + }, + { + "epoch": 0.34240856021400534, + "grad_norm": 7.983001708984375, + "learning_rate": 1.6628638441914452e-05, + "loss": 1.6932, + "step": 27392 + }, + { + "epoch": 0.342433560839021, + "grad_norm": 3.110201835632324, + "learning_rate": 1.662798499964838e-05, + "loss": 0.5824, + "step": 27394 + }, + { + "epoch": 0.3424585614640366, + "grad_norm": 0.0012711454182863235, + "learning_rate": 1.662733150690451e-05, + "loss": 0.0, + "step": 27396 + }, + { + "epoch": 0.34248356208905223, + "grad_norm": 5.057005405426025, + "learning_rate": 1.662667796368782e-05, + "loss": 0.891, + "step": 27398 + }, + { + "epoch": 0.3425085627140679, + "grad_norm": 7.573017120361328, + "learning_rate": 1.662602437000329e-05, + "loss": 1.8961, + "step": 27400 + }, + { + "epoch": 0.34253356333908347, + "grad_norm": 9.561912536621094, + "learning_rate": 1.6625370725855892e-05, + "loss": 0.5637, + "step": 27402 + }, + { + "epoch": 0.3425585639640991, + "grad_norm": 1.8431206941604614, + "learning_rate": 1.662471703125061e-05, + "loss": 1.1262, + "step": 27404 + }, + { + "epoch": 0.3425835645891147, + "grad_norm": 2.40500545501709, + "learning_rate": 1.662406328619242e-05, + "loss": 0.7656, + "step": 27406 + }, + { + "epoch": 0.34260856521413036, + "grad_norm": 3.752403736114502, + "learning_rate": 1.66234094906863e-05, + "loss": 0.7934, + "step": 27408 + }, + { + "epoch": 0.342633565839146, + "grad_norm": 0.000988817890174687, + "learning_rate": 1.662275564473723e-05, + "loss": 0.8375, + "step": 27410 + }, + { + "epoch": 0.3426585664641616, + "grad_norm": 1.3457129001617432, + "learning_rate": 1.662210174835019e-05, + "loss": 0.1004, + "step": 27412 + }, + { + "epoch": 0.34268356708917724, + "grad_norm": 4.205784320831299, + "learning_rate": 1.6621447801530156e-05, + "loss": 0.7884, + "step": 27414 + }, + { + "epoch": 0.34270856771419284, + "grad_norm": 0.9223338961601257, + "learning_rate": 1.662079380428212e-05, + "loss": 0.7996, + "step": 27416 + }, + { + "epoch": 0.3427335683392085, + "grad_norm": 3.5264272689819336, + "learning_rate": 1.662013975661105e-05, + "loss": 1.2173, + "step": 27418 + }, + { + "epoch": 0.34275856896422413, + "grad_norm": 2.200977087020874, + "learning_rate": 1.661948565852193e-05, + "loss": 1.5554, + "step": 27420 + }, + { + "epoch": 0.3427835695892397, + "grad_norm": 7.24582052230835, + "learning_rate": 1.6618831510019743e-05, + "loss": 1.4281, + "step": 27422 + }, + { + "epoch": 0.34280857021425537, + "grad_norm": 3.7845964431762695, + "learning_rate": 1.6618177311109476e-05, + "loss": 1.2875, + "step": 27424 + }, + { + "epoch": 0.34283357083927096, + "grad_norm": 1.2260936498641968, + "learning_rate": 1.6617523061796103e-05, + "loss": 1.2983, + "step": 27426 + }, + { + "epoch": 0.3428585714642866, + "grad_norm": 0.000952343107201159, + "learning_rate": 1.6616868762084612e-05, + "loss": 0.0459, + "step": 27428 + }, + { + "epoch": 0.34288357208930226, + "grad_norm": 2.290055513381958, + "learning_rate": 1.661621441197998e-05, + "loss": 0.3512, + "step": 27430 + }, + { + "epoch": 0.34290857271431785, + "grad_norm": 3.0605640411376953, + "learning_rate": 1.6615560011487198e-05, + "loss": 1.3643, + "step": 27432 + }, + { + "epoch": 0.3429335733393335, + "grad_norm": 5.634695529937744, + "learning_rate": 1.6614905560611246e-05, + "loss": 1.9666, + "step": 27434 + }, + { + "epoch": 0.3429585739643491, + "grad_norm": 3.41853666305542, + "learning_rate": 1.6614251059357108e-05, + "loss": 0.9483, + "step": 27436 + }, + { + "epoch": 0.34298357458936474, + "grad_norm": 4.527684688568115, + "learning_rate": 1.661359650772977e-05, + "loss": 0.6721, + "step": 27438 + }, + { + "epoch": 0.3430085752143804, + "grad_norm": 0.0009905584156513214, + "learning_rate": 1.6612941905734218e-05, + "loss": 0.0001, + "step": 27440 + }, + { + "epoch": 0.343033575839396, + "grad_norm": 0.008687181398272514, + "learning_rate": 1.6612287253375433e-05, + "loss": 0.0002, + "step": 27442 + }, + { + "epoch": 0.3430585764644116, + "grad_norm": 2.3213183879852295, + "learning_rate": 1.6611632550658402e-05, + "loss": 0.9176, + "step": 27444 + }, + { + "epoch": 0.3430835770894272, + "grad_norm": 0.0010364368790760636, + "learning_rate": 1.661097779758812e-05, + "loss": 1.1374, + "step": 27446 + }, + { + "epoch": 0.34310857771444286, + "grad_norm": 5.319198131561279, + "learning_rate": 1.6610322994169556e-05, + "loss": 1.562, + "step": 27448 + }, + { + "epoch": 0.3431335783394585, + "grad_norm": 3.046732187271118, + "learning_rate": 1.6609668140407712e-05, + "loss": 1.186, + "step": 27450 + }, + { + "epoch": 0.3431585789644741, + "grad_norm": 1.8637446165084839, + "learning_rate": 1.660901323630757e-05, + "loss": 1.0073, + "step": 27452 + }, + { + "epoch": 0.34318357958948975, + "grad_norm": 0.007112455554306507, + "learning_rate": 1.6608358281874117e-05, + "loss": 0.8102, + "step": 27454 + }, + { + "epoch": 0.34320858021450534, + "grad_norm": 1.038183331489563, + "learning_rate": 1.6607703277112343e-05, + "loss": 0.7129, + "step": 27456 + }, + { + "epoch": 0.343233580839521, + "grad_norm": 9.00684642791748, + "learning_rate": 1.6607048222027234e-05, + "loss": 0.8248, + "step": 27458 + }, + { + "epoch": 0.34325858146453664, + "grad_norm": 7.432424545288086, + "learning_rate": 1.660639311662378e-05, + "loss": 1.6988, + "step": 27460 + }, + { + "epoch": 0.34328358208955223, + "grad_norm": 4.910881996154785, + "learning_rate": 1.6605737960906973e-05, + "loss": 2.9786, + "step": 27462 + }, + { + "epoch": 0.3433085827145679, + "grad_norm": 0.6250400543212891, + "learning_rate": 1.6605082754881796e-05, + "loss": 0.21, + "step": 27464 + }, + { + "epoch": 0.34333358333958347, + "grad_norm": 0.9507775902748108, + "learning_rate": 1.6604427498553242e-05, + "loss": 0.0423, + "step": 27466 + }, + { + "epoch": 0.3433585839645991, + "grad_norm": 6.022039413452148, + "learning_rate": 1.6603772191926307e-05, + "loss": 1.1661, + "step": 27468 + }, + { + "epoch": 0.34338358458961477, + "grad_norm": 5.509811878204346, + "learning_rate": 1.6603116835005975e-05, + "loss": 1.4658, + "step": 27470 + }, + { + "epoch": 0.34340858521463036, + "grad_norm": 0.0038787168450653553, + "learning_rate": 1.6602461427797235e-05, + "loss": 0.3906, + "step": 27472 + }, + { + "epoch": 0.343433585839646, + "grad_norm": 4.1182122230529785, + "learning_rate": 1.6601805970305085e-05, + "loss": 0.1701, + "step": 27474 + }, + { + "epoch": 0.3434585864646616, + "grad_norm": 2.2660326957702637, + "learning_rate": 1.6601150462534517e-05, + "loss": 0.5331, + "step": 27476 + }, + { + "epoch": 0.34348358708967724, + "grad_norm": 1.6670541763305664, + "learning_rate": 1.660049490449052e-05, + "loss": 1.2827, + "step": 27478 + }, + { + "epoch": 0.3435085877146929, + "grad_norm": 4.535989761352539, + "learning_rate": 1.6599839296178087e-05, + "loss": 0.3072, + "step": 27480 + }, + { + "epoch": 0.3435335883397085, + "grad_norm": 4.972168922424316, + "learning_rate": 1.659918363760221e-05, + "loss": 0.9841, + "step": 27482 + }, + { + "epoch": 0.34355858896472413, + "grad_norm": 2.135117769241333, + "learning_rate": 1.6598527928767884e-05, + "loss": 1.9687, + "step": 27484 + }, + { + "epoch": 0.3435835895897397, + "grad_norm": 2.641615152359009, + "learning_rate": 1.65978721696801e-05, + "loss": 1.0762, + "step": 27486 + }, + { + "epoch": 0.34360859021475537, + "grad_norm": 6.488393306732178, + "learning_rate": 1.659721636034386e-05, + "loss": 2.2316, + "step": 27488 + }, + { + "epoch": 0.343633590839771, + "grad_norm": 5.0961737632751465, + "learning_rate": 1.6596560500764147e-05, + "loss": 1.5638, + "step": 27490 + }, + { + "epoch": 0.3436585914647866, + "grad_norm": 2.369736433029175, + "learning_rate": 1.6595904590945965e-05, + "loss": 0.7469, + "step": 27492 + }, + { + "epoch": 0.34368359208980226, + "grad_norm": 2.321723222732544, + "learning_rate": 1.6595248630894308e-05, + "loss": 0.5429, + "step": 27494 + }, + { + "epoch": 0.34370859271481785, + "grad_norm": 0.10813575237989426, + "learning_rate": 1.6594592620614166e-05, + "loss": 0.5359, + "step": 27496 + }, + { + "epoch": 0.3437335933398335, + "grad_norm": 4.777368545532227, + "learning_rate": 1.659393656011054e-05, + "loss": 1.0256, + "step": 27498 + }, + { + "epoch": 0.34375859396484915, + "grad_norm": 2.269495725631714, + "learning_rate": 1.659328044938843e-05, + "loss": 0.325, + "step": 27500 + }, + { + "epoch": 0.34378359458986474, + "grad_norm": 0.001136325066909194, + "learning_rate": 1.6592624288452825e-05, + "loss": 1.5155, + "step": 27502 + }, + { + "epoch": 0.3438085952148804, + "grad_norm": 2.100313425064087, + "learning_rate": 1.6591968077308726e-05, + "loss": 0.0146, + "step": 27504 + }, + { + "epoch": 0.343833595839896, + "grad_norm": 0.0008049689349718392, + "learning_rate": 1.6591311815961127e-05, + "loss": 0.0574, + "step": 27506 + }, + { + "epoch": 0.3438585964649116, + "grad_norm": 1.793283224105835, + "learning_rate": 1.6590655504415036e-05, + "loss": 1.5866, + "step": 27508 + }, + { + "epoch": 0.3438835970899273, + "grad_norm": 5.618873596191406, + "learning_rate": 1.6589999142675442e-05, + "loss": 0.1888, + "step": 27510 + }, + { + "epoch": 0.34390859771494287, + "grad_norm": 1.9311670064926147, + "learning_rate": 1.6589342730747343e-05, + "loss": 0.1846, + "step": 27512 + }, + { + "epoch": 0.3439335983399585, + "grad_norm": 3.824871778488159, + "learning_rate": 1.6588686268635748e-05, + "loss": 0.8544, + "step": 27514 + }, + { + "epoch": 0.3439585989649741, + "grad_norm": 7.748846530914307, + "learning_rate": 1.6588029756345646e-05, + "loss": 0.8354, + "step": 27516 + }, + { + "epoch": 0.34398359958998975, + "grad_norm": 3.1590914726257324, + "learning_rate": 1.6587373193882043e-05, + "loss": 0.8287, + "step": 27518 + }, + { + "epoch": 0.3440086002150054, + "grad_norm": 4.265861988067627, + "learning_rate": 1.6586716581249938e-05, + "loss": 1.8227, + "step": 27520 + }, + { + "epoch": 0.344033600840021, + "grad_norm": 0.9293432831764221, + "learning_rate": 1.6586059918454326e-05, + "loss": 1.0171, + "step": 27522 + }, + { + "epoch": 0.34405860146503664, + "grad_norm": 0.2701648771762848, + "learning_rate": 1.6585403205500215e-05, + "loss": 0.1235, + "step": 27524 + }, + { + "epoch": 0.34408360209005223, + "grad_norm": 0.1607355922460556, + "learning_rate": 1.6584746442392606e-05, + "loss": 0.0767, + "step": 27526 + }, + { + "epoch": 0.3441086027150679, + "grad_norm": 2.374483823776245, + "learning_rate": 1.65840896291365e-05, + "loss": 0.1277, + "step": 27528 + }, + { + "epoch": 0.3441336033400835, + "grad_norm": 4.901374816894531, + "learning_rate": 1.65834327657369e-05, + "loss": 2.3746, + "step": 27530 + }, + { + "epoch": 0.3441586039650991, + "grad_norm": 0.0005949255428276956, + "learning_rate": 1.65827758521988e-05, + "loss": 0.2174, + "step": 27532 + }, + { + "epoch": 0.34418360459011477, + "grad_norm": 4.120319366455078, + "learning_rate": 1.6582118888527215e-05, + "loss": 0.572, + "step": 27534 + }, + { + "epoch": 0.34420860521513036, + "grad_norm": 2.2395310401916504, + "learning_rate": 1.6581461874727144e-05, + "loss": 0.1348, + "step": 27536 + }, + { + "epoch": 0.344233605840146, + "grad_norm": 2.235760450363159, + "learning_rate": 1.658080481080359e-05, + "loss": 0.7221, + "step": 27538 + }, + { + "epoch": 0.34425860646516165, + "grad_norm": 5.8219313621521, + "learning_rate": 1.6580147696761556e-05, + "loss": 0.6051, + "step": 27540 + }, + { + "epoch": 0.34428360709017725, + "grad_norm": 3.882655382156372, + "learning_rate": 1.6579490532606046e-05, + "loss": 2.1767, + "step": 27542 + }, + { + "epoch": 0.3443086077151929, + "grad_norm": 4.7151384353637695, + "learning_rate": 1.6578833318342064e-05, + "loss": 0.7686, + "step": 27544 + }, + { + "epoch": 0.3443336083402085, + "grad_norm": 2.7703893184661865, + "learning_rate": 1.6578176053974628e-05, + "loss": 0.8195, + "step": 27546 + }, + { + "epoch": 0.34435860896522413, + "grad_norm": 1.6974424123764038, + "learning_rate": 1.6577518739508726e-05, + "loss": 0.0619, + "step": 27548 + }, + { + "epoch": 0.3443836095902398, + "grad_norm": 0.008500502444803715, + "learning_rate": 1.657686137494937e-05, + "loss": 0.2925, + "step": 27550 + }, + { + "epoch": 0.3444086102152554, + "grad_norm": 0.0005178612773306668, + "learning_rate": 1.6576203960301568e-05, + "loss": 0.0288, + "step": 27552 + }, + { + "epoch": 0.344433610840271, + "grad_norm": 3.7187018394470215, + "learning_rate": 1.657554649557033e-05, + "loss": 1.875, + "step": 27554 + }, + { + "epoch": 0.3444586114652866, + "grad_norm": 2.89034366607666, + "learning_rate": 1.657488898076066e-05, + "loss": 0.7835, + "step": 27556 + }, + { + "epoch": 0.34448361209030226, + "grad_norm": 0.12207618355751038, + "learning_rate": 1.6574231415877564e-05, + "loss": 0.272, + "step": 27558 + }, + { + "epoch": 0.3445086127153179, + "grad_norm": 4.089604377746582, + "learning_rate": 1.6573573800926053e-05, + "loss": 1.313, + "step": 27560 + }, + { + "epoch": 0.3445336133403335, + "grad_norm": 5.804590702056885, + "learning_rate": 1.657291613591113e-05, + "loss": 2.0004, + "step": 27562 + }, + { + "epoch": 0.34455861396534915, + "grad_norm": 2.4671125411987305, + "learning_rate": 1.657225842083781e-05, + "loss": 0.3108, + "step": 27564 + }, + { + "epoch": 0.34458361459036474, + "grad_norm": 4.84440803527832, + "learning_rate": 1.6571600655711097e-05, + "loss": 2.2124, + "step": 27566 + }, + { + "epoch": 0.3446086152153804, + "grad_norm": 3.957669496536255, + "learning_rate": 1.6570942840536004e-05, + "loss": 0.9137, + "step": 27568 + }, + { + "epoch": 0.34463361584039603, + "grad_norm": 1.7476003170013428, + "learning_rate": 1.657028497531754e-05, + "loss": 0.8263, + "step": 27570 + }, + { + "epoch": 0.3446586164654116, + "grad_norm": 4.361243724822998, + "learning_rate": 1.6569627060060713e-05, + "loss": 0.7901, + "step": 27572 + }, + { + "epoch": 0.3446836170904273, + "grad_norm": 2.844179391860962, + "learning_rate": 1.6568969094770537e-05, + "loss": 0.6639, + "step": 27574 + }, + { + "epoch": 0.34470861771544287, + "grad_norm": 0.0011183592723682523, + "learning_rate": 1.656831107945202e-05, + "loss": 1.283, + "step": 27576 + }, + { + "epoch": 0.3447336183404585, + "grad_norm": 4.019261360168457, + "learning_rate": 1.6567653014110175e-05, + "loss": 1.7768, + "step": 27578 + }, + { + "epoch": 0.34475861896547416, + "grad_norm": 4.92368745803833, + "learning_rate": 1.6566994898750017e-05, + "loss": 2.0619, + "step": 27580 + }, + { + "epoch": 0.34478361959048975, + "grad_norm": 5.461354732513428, + "learning_rate": 1.6566336733376548e-05, + "loss": 2.4261, + "step": 27582 + }, + { + "epoch": 0.3448086202155054, + "grad_norm": 0.1093178540468216, + "learning_rate": 1.6565678517994792e-05, + "loss": 0.0195, + "step": 27584 + }, + { + "epoch": 0.344833620840521, + "grad_norm": 4.07862663269043, + "learning_rate": 1.6565020252609754e-05, + "loss": 1.126, + "step": 27586 + }, + { + "epoch": 0.34485862146553664, + "grad_norm": 2.3277220726013184, + "learning_rate": 1.6564361937226456e-05, + "loss": 1.8945, + "step": 27588 + }, + { + "epoch": 0.3448836220905523, + "grad_norm": 0.001185734523460269, + "learning_rate": 1.65637035718499e-05, + "loss": 0.1302, + "step": 27590 + }, + { + "epoch": 0.3449086227155679, + "grad_norm": 3.780951976776123, + "learning_rate": 1.6563045156485105e-05, + "loss": 1.1605, + "step": 27592 + }, + { + "epoch": 0.34493362334058353, + "grad_norm": 6.369259834289551, + "learning_rate": 1.656238669113709e-05, + "loss": 1.805, + "step": 27594 + }, + { + "epoch": 0.3449586239655991, + "grad_norm": 2.3427894115448, + "learning_rate": 1.6561728175810864e-05, + "loss": 0.5937, + "step": 27596 + }, + { + "epoch": 0.34498362459061477, + "grad_norm": 0.0013373716501519084, + "learning_rate": 1.6561069610511443e-05, + "loss": 0.0001, + "step": 27598 + }, + { + "epoch": 0.3450086252156304, + "grad_norm": 3.615288734436035, + "learning_rate": 1.6560410995243844e-05, + "loss": 0.9395, + "step": 27600 + }, + { + "epoch": 0.345033625840646, + "grad_norm": 6.0260467529296875, + "learning_rate": 1.6559752330013085e-05, + "loss": 1.1373, + "step": 27602 + }, + { + "epoch": 0.34505862646566166, + "grad_norm": 3.4366841316223145, + "learning_rate": 1.6559093614824177e-05, + "loss": 0.9113, + "step": 27604 + }, + { + "epoch": 0.34508362709067725, + "grad_norm": 3.237333059310913, + "learning_rate": 1.6558434849682145e-05, + "loss": 0.5613, + "step": 27606 + }, + { + "epoch": 0.3451086277156929, + "grad_norm": 9.050920486450195, + "learning_rate": 1.6557776034591996e-05, + "loss": 0.7326, + "step": 27608 + }, + { + "epoch": 0.34513362834070854, + "grad_norm": 2.3890011310577393, + "learning_rate": 1.6557117169558748e-05, + "loss": 0.0919, + "step": 27610 + }, + { + "epoch": 0.34515862896572413, + "grad_norm": 3.018256902694702, + "learning_rate": 1.655645825458743e-05, + "loss": 0.5467, + "step": 27612 + }, + { + "epoch": 0.3451836295907398, + "grad_norm": 0.41686323285102844, + "learning_rate": 1.655579928968305e-05, + "loss": 0.1309, + "step": 27614 + }, + { + "epoch": 0.3452086302157554, + "grad_norm": 2.9132721424102783, + "learning_rate": 1.655514027485063e-05, + "loss": 0.6488, + "step": 27616 + }, + { + "epoch": 0.345233630840771, + "grad_norm": 0.0009172196732833982, + "learning_rate": 1.6554481210095193e-05, + "loss": 0.0178, + "step": 27618 + }, + { + "epoch": 0.34525863146578667, + "grad_norm": 2.255305528640747, + "learning_rate": 1.655382209542175e-05, + "loss": 1.2285, + "step": 27620 + }, + { + "epoch": 0.34528363209080226, + "grad_norm": 2.375643491744995, + "learning_rate": 1.6553162930835323e-05, + "loss": 1.6338, + "step": 27622 + }, + { + "epoch": 0.3453086327158179, + "grad_norm": 4.382068634033203, + "learning_rate": 1.6552503716340937e-05, + "loss": 1.4163, + "step": 27624 + }, + { + "epoch": 0.3453336333408335, + "grad_norm": 4.230109214782715, + "learning_rate": 1.6551844451943607e-05, + "loss": 1.5791, + "step": 27626 + }, + { + "epoch": 0.34535863396584915, + "grad_norm": 4.3139753341674805, + "learning_rate": 1.6551185137648357e-05, + "loss": 1.5552, + "step": 27628 + }, + { + "epoch": 0.3453836345908648, + "grad_norm": 0.023173458874225616, + "learning_rate": 1.6550525773460206e-05, + "loss": 1.0185, + "step": 27630 + }, + { + "epoch": 0.3454086352158804, + "grad_norm": 5.070705890655518, + "learning_rate": 1.654986635938418e-05, + "loss": 1.8117, + "step": 27632 + }, + { + "epoch": 0.34543363584089604, + "grad_norm": 0.004809493664652109, + "learning_rate": 1.6549206895425295e-05, + "loss": 0.2125, + "step": 27634 + }, + { + "epoch": 0.34545863646591163, + "grad_norm": 3.8946993350982666, + "learning_rate": 1.654854738158858e-05, + "loss": 1.5196, + "step": 27636 + }, + { + "epoch": 0.3454836370909273, + "grad_norm": 5.410979270935059, + "learning_rate": 1.654788781787905e-05, + "loss": 1.665, + "step": 27638 + }, + { + "epoch": 0.3455086377159429, + "grad_norm": 3.9604358673095703, + "learning_rate": 1.6547228204301738e-05, + "loss": 0.9781, + "step": 27640 + }, + { + "epoch": 0.3455336383409585, + "grad_norm": 3.6500163078308105, + "learning_rate": 1.6546568540861658e-05, + "loss": 0.9721, + "step": 27642 + }, + { + "epoch": 0.34555863896597416, + "grad_norm": 5.985200881958008, + "learning_rate": 1.6545908827563836e-05, + "loss": 1.4246, + "step": 27644 + }, + { + "epoch": 0.34558363959098976, + "grad_norm": 4.441377639770508, + "learning_rate": 1.65452490644133e-05, + "loss": 1.4011, + "step": 27646 + }, + { + "epoch": 0.3456086402160054, + "grad_norm": 3.8163528442382812, + "learning_rate": 1.6544589251415073e-05, + "loss": 1.102, + "step": 27648 + }, + { + "epoch": 0.34563364084102105, + "grad_norm": 0.8175216913223267, + "learning_rate": 1.6543929388574178e-05, + "loss": 0.1317, + "step": 27650 + }, + { + "epoch": 0.34565864146603664, + "grad_norm": 0.0012245289981365204, + "learning_rate": 1.6543269475895646e-05, + "loss": 0.7283, + "step": 27652 + }, + { + "epoch": 0.3456836420910523, + "grad_norm": 0.008143397979438305, + "learning_rate": 1.65426095133845e-05, + "loss": 0.0062, + "step": 27654 + }, + { + "epoch": 0.3457086427160679, + "grad_norm": 3.6242659091949463, + "learning_rate": 1.654194950104576e-05, + "loss": 0.8417, + "step": 27656 + }, + { + "epoch": 0.34573364334108353, + "grad_norm": 3.1048595905303955, + "learning_rate": 1.6541289438884457e-05, + "loss": 0.884, + "step": 27658 + }, + { + "epoch": 0.3457586439660992, + "grad_norm": 0.9081008434295654, + "learning_rate": 1.6540629326905625e-05, + "loss": 0.7045, + "step": 27660 + }, + { + "epoch": 0.34578364459111477, + "grad_norm": 4.735917568206787, + "learning_rate": 1.6539969165114276e-05, + "loss": 2.1394, + "step": 27662 + }, + { + "epoch": 0.3458086452161304, + "grad_norm": 2.67175555229187, + "learning_rate": 1.6539308953515453e-05, + "loss": 1.0595, + "step": 27664 + }, + { + "epoch": 0.345833645841146, + "grad_norm": 0.007414933759719133, + "learning_rate": 1.6538648692114178e-05, + "loss": 0.0004, + "step": 27666 + }, + { + "epoch": 0.34585864646616166, + "grad_norm": 2.743511199951172, + "learning_rate": 1.653798838091548e-05, + "loss": 1.0783, + "step": 27668 + }, + { + "epoch": 0.3458836470911773, + "grad_norm": 1.712149739265442, + "learning_rate": 1.6537328019924385e-05, + "loss": 0.9408, + "step": 27670 + }, + { + "epoch": 0.3459086477161929, + "grad_norm": 3.9260706901550293, + "learning_rate": 1.6536667609145923e-05, + "loss": 0.7867, + "step": 27672 + }, + { + "epoch": 0.34593364834120854, + "grad_norm": 4.170298099517822, + "learning_rate": 1.6536007148585125e-05, + "loss": 0.9455, + "step": 27674 + }, + { + "epoch": 0.34595864896622414, + "grad_norm": 4.492753505706787, + "learning_rate": 1.653534663824702e-05, + "loss": 0.4502, + "step": 27676 + }, + { + "epoch": 0.3459836495912398, + "grad_norm": 0.6393963098526001, + "learning_rate": 1.6534686078136642e-05, + "loss": 0.3352, + "step": 27678 + }, + { + "epoch": 0.34600865021625543, + "grad_norm": 3.0457077026367188, + "learning_rate": 1.6534025468259017e-05, + "loss": 1.3252, + "step": 27680 + }, + { + "epoch": 0.346033650841271, + "grad_norm": 8.292543411254883, + "learning_rate": 1.6533364808619178e-05, + "loss": 1.4279, + "step": 27682 + }, + { + "epoch": 0.34605865146628667, + "grad_norm": 5.764059543609619, + "learning_rate": 1.653270409922216e-05, + "loss": 0.9945, + "step": 27684 + }, + { + "epoch": 0.34608365209130226, + "grad_norm": 4.514287948608398, + "learning_rate": 1.653204334007299e-05, + "loss": 0.9727, + "step": 27686 + }, + { + "epoch": 0.3461086527163179, + "grad_norm": 4.591359615325928, + "learning_rate": 1.65313825311767e-05, + "loss": 1.0456, + "step": 27688 + }, + { + "epoch": 0.34613365334133356, + "grad_norm": 3.4643337726593018, + "learning_rate": 1.6530721672538327e-05, + "loss": 1.7085, + "step": 27690 + }, + { + "epoch": 0.34615865396634915, + "grad_norm": 0.001614426844753325, + "learning_rate": 1.65300607641629e-05, + "loss": 0.5399, + "step": 27692 + }, + { + "epoch": 0.3461836545913648, + "grad_norm": 1.2325384616851807, + "learning_rate": 1.6529399806055447e-05, + "loss": 1.4463, + "step": 27694 + }, + { + "epoch": 0.3462086552163804, + "grad_norm": 5.489204406738281, + "learning_rate": 1.6528738798221017e-05, + "loss": 0.7643, + "step": 27696 + }, + { + "epoch": 0.34623365584139604, + "grad_norm": 0.16124801337718964, + "learning_rate": 1.6528077740664632e-05, + "loss": 0.1215, + "step": 27698 + }, + { + "epoch": 0.3462586564664117, + "grad_norm": 3.4797916412353516, + "learning_rate": 1.652741663339133e-05, + "loss": 1.435, + "step": 27700 + }, + { + "epoch": 0.3462836570914273, + "grad_norm": 4.236467361450195, + "learning_rate": 1.6526755476406145e-05, + "loss": 0.8309, + "step": 27702 + }, + { + "epoch": 0.3463086577164429, + "grad_norm": 0.0011863213730975986, + "learning_rate": 1.652609426971412e-05, + "loss": 0.1395, + "step": 27704 + }, + { + "epoch": 0.3463336583414585, + "grad_norm": 4.045217037200928, + "learning_rate": 1.6525433013320277e-05, + "loss": 0.3898, + "step": 27706 + }, + { + "epoch": 0.34635865896647416, + "grad_norm": 11.340527534484863, + "learning_rate": 1.652477170722966e-05, + "loss": 1.956, + "step": 27708 + }, + { + "epoch": 0.3463836595914898, + "grad_norm": 2.3262417316436768, + "learning_rate": 1.6524110351447306e-05, + "loss": 0.7071, + "step": 27710 + }, + { + "epoch": 0.3464086602165054, + "grad_norm": 1.6409751176834106, + "learning_rate": 1.652344894597825e-05, + "loss": 0.4589, + "step": 27712 + }, + { + "epoch": 0.34643366084152105, + "grad_norm": 2.4564270973205566, + "learning_rate": 1.6522787490827527e-05, + "loss": 0.8501, + "step": 27714 + }, + { + "epoch": 0.34645866146653664, + "grad_norm": 1.8023415803909302, + "learning_rate": 1.6522125986000182e-05, + "loss": 0.5251, + "step": 27716 + }, + { + "epoch": 0.3464836620915523, + "grad_norm": 2.5445785522460938, + "learning_rate": 1.6521464431501246e-05, + "loss": 1.0306, + "step": 27718 + }, + { + "epoch": 0.34650866271656794, + "grad_norm": 5.771529197692871, + "learning_rate": 1.6520802827335757e-05, + "loss": 1.2424, + "step": 27720 + }, + { + "epoch": 0.34653366334158353, + "grad_norm": 3.8792662620544434, + "learning_rate": 1.6520141173508753e-05, + "loss": 0.4601, + "step": 27722 + }, + { + "epoch": 0.3465586639665992, + "grad_norm": 3.682345151901245, + "learning_rate": 1.651947947002528e-05, + "loss": 0.4991, + "step": 27724 + }, + { + "epoch": 0.34658366459161477, + "grad_norm": 6.432804107666016, + "learning_rate": 1.6518817716890372e-05, + "loss": 0.7671, + "step": 27726 + }, + { + "epoch": 0.3466086652166304, + "grad_norm": 5.6969523429870605, + "learning_rate": 1.651815591410907e-05, + "loss": 1.566, + "step": 27728 + }, + { + "epoch": 0.34663366584164607, + "grad_norm": 4.510130405426025, + "learning_rate": 1.6517494061686417e-05, + "loss": 1.569, + "step": 27730 + }, + { + "epoch": 0.34665866646666166, + "grad_norm": 3.71036434173584, + "learning_rate": 1.651683215962745e-05, + "loss": 1.5739, + "step": 27732 + }, + { + "epoch": 0.3466836670916773, + "grad_norm": 7.121776103973389, + "learning_rate": 1.6516170207937208e-05, + "loss": 0.9143, + "step": 27734 + }, + { + "epoch": 0.3467086677166929, + "grad_norm": 4.525111198425293, + "learning_rate": 1.6515508206620738e-05, + "loss": 0.5481, + "step": 27736 + }, + { + "epoch": 0.34673366834170855, + "grad_norm": 0.022230273112654686, + "learning_rate": 1.6514846155683076e-05, + "loss": 0.0415, + "step": 27738 + }, + { + "epoch": 0.3467586689667242, + "grad_norm": 3.2519962787628174, + "learning_rate": 1.651418405512927e-05, + "loss": 1.7472, + "step": 27740 + }, + { + "epoch": 0.3467836695917398, + "grad_norm": 3.3401005268096924, + "learning_rate": 1.6513521904964357e-05, + "loss": 1.502, + "step": 27742 + }, + { + "epoch": 0.34680867021675543, + "grad_norm": 3.1154677867889404, + "learning_rate": 1.6512859705193384e-05, + "loss": 1.1264, + "step": 27744 + }, + { + "epoch": 0.346833670841771, + "grad_norm": 9.940079689025879, + "learning_rate": 1.651219745582139e-05, + "loss": 2.0549, + "step": 27746 + }, + { + "epoch": 0.34685867146678667, + "grad_norm": 3.9957361221313477, + "learning_rate": 1.6511535156853424e-05, + "loss": 2.0362, + "step": 27748 + }, + { + "epoch": 0.3468836720918023, + "grad_norm": 2.511709213256836, + "learning_rate": 1.6510872808294527e-05, + "loss": 0.3905, + "step": 27750 + }, + { + "epoch": 0.3469086727168179, + "grad_norm": 3.2062110900878906, + "learning_rate": 1.6510210410149742e-05, + "loss": 1.0094, + "step": 27752 + }, + { + "epoch": 0.34693367334183356, + "grad_norm": 2.9011247158050537, + "learning_rate": 1.6509547962424114e-05, + "loss": 0.3286, + "step": 27754 + }, + { + "epoch": 0.34695867396684915, + "grad_norm": 4.743391513824463, + "learning_rate": 1.650888546512269e-05, + "loss": 1.4045, + "step": 27756 + }, + { + "epoch": 0.3469836745918648, + "grad_norm": 5.8952555656433105, + "learning_rate": 1.6508222918250518e-05, + "loss": 1.8942, + "step": 27758 + }, + { + "epoch": 0.34700867521688045, + "grad_norm": 2.2328686714172363, + "learning_rate": 1.6507560321812636e-05, + "loss": 1.8868, + "step": 27760 + }, + { + "epoch": 0.34703367584189604, + "grad_norm": 2.155200481414795, + "learning_rate": 1.6506897675814098e-05, + "loss": 0.3865, + "step": 27762 + }, + { + "epoch": 0.3470586764669117, + "grad_norm": 0.09264307469129562, + "learning_rate": 1.650623498025995e-05, + "loss": 0.3027, + "step": 27764 + }, + { + "epoch": 0.3470836770919273, + "grad_norm": 0.0013766129268333316, + "learning_rate": 1.6505572235155228e-05, + "loss": 0.6689, + "step": 27766 + }, + { + "epoch": 0.3471086777169429, + "grad_norm": 3.2979323863983154, + "learning_rate": 1.6504909440504993e-05, + "loss": 0.5332, + "step": 27768 + }, + { + "epoch": 0.3471336783419586, + "grad_norm": 2.0245308876037598, + "learning_rate": 1.6504246596314288e-05, + "loss": 0.5931, + "step": 27770 + }, + { + "epoch": 0.34715867896697417, + "grad_norm": 3.458706855773926, + "learning_rate": 1.650358370258816e-05, + "loss": 0.7691, + "step": 27772 + }, + { + "epoch": 0.3471836795919898, + "grad_norm": 0.3370482325553894, + "learning_rate": 1.650292075933166e-05, + "loss": 1.0425, + "step": 27774 + }, + { + "epoch": 0.3472086802170054, + "grad_norm": 9.644562721252441, + "learning_rate": 1.650225776654983e-05, + "loss": 1.5914, + "step": 27776 + }, + { + "epoch": 0.34723368084202105, + "grad_norm": 3.6116645336151123, + "learning_rate": 1.650159472424773e-05, + "loss": 1.0486, + "step": 27778 + }, + { + "epoch": 0.3472586814670367, + "grad_norm": 8.563125610351562, + "learning_rate": 1.6500931632430402e-05, + "loss": 1.5104, + "step": 27780 + }, + { + "epoch": 0.3472836820920523, + "grad_norm": 2.7971575260162354, + "learning_rate": 1.6500268491102897e-05, + "loss": 1.4854, + "step": 27782 + }, + { + "epoch": 0.34730868271706794, + "grad_norm": 2.4915709495544434, + "learning_rate": 1.6499605300270267e-05, + "loss": 1.1583, + "step": 27784 + }, + { + "epoch": 0.34733368334208353, + "grad_norm": 8.547185897827148, + "learning_rate": 1.6498942059937563e-05, + "loss": 1.649, + "step": 27786 + }, + { + "epoch": 0.3473586839670992, + "grad_norm": 3.369580030441284, + "learning_rate": 1.6498278770109834e-05, + "loss": 1.4057, + "step": 27788 + }, + { + "epoch": 0.3473836845921148, + "grad_norm": 3.2788383960723877, + "learning_rate": 1.6497615430792135e-05, + "loss": 1.116, + "step": 27790 + }, + { + "epoch": 0.3474086852171304, + "grad_norm": 3.1432173252105713, + "learning_rate": 1.649695204198951e-05, + "loss": 1.586, + "step": 27792 + }, + { + "epoch": 0.34743368584214607, + "grad_norm": 4.050424098968506, + "learning_rate": 1.6496288603707023e-05, + "loss": 1.1306, + "step": 27794 + }, + { + "epoch": 0.34745868646716166, + "grad_norm": 0.077205590903759, + "learning_rate": 1.6495625115949717e-05, + "loss": 0.0005, + "step": 27796 + }, + { + "epoch": 0.3474836870921773, + "grad_norm": 3.2340142726898193, + "learning_rate": 1.649496157872265e-05, + "loss": 1.5428, + "step": 27798 + }, + { + "epoch": 0.34750868771719295, + "grad_norm": 2.6768741607666016, + "learning_rate": 1.6494297992030877e-05, + "loss": 0.2056, + "step": 27800 + }, + { + "epoch": 0.34753368834220855, + "grad_norm": 4.321773052215576, + "learning_rate": 1.6493634355879447e-05, + "loss": 0.9163, + "step": 27802 + }, + { + "epoch": 0.3475586889672242, + "grad_norm": 6.843419075012207, + "learning_rate": 1.6492970670273415e-05, + "loss": 2.1807, + "step": 27804 + }, + { + "epoch": 0.3475836895922398, + "grad_norm": 0.3945225179195404, + "learning_rate": 1.6492306935217833e-05, + "loss": 1.0906, + "step": 27806 + }, + { + "epoch": 0.34760869021725543, + "grad_norm": 3.2480015754699707, + "learning_rate": 1.6491643150717767e-05, + "loss": 1.2201, + "step": 27808 + }, + { + "epoch": 0.3476336908422711, + "grad_norm": 2.5998892784118652, + "learning_rate": 1.6490979316778258e-05, + "loss": 0.6445, + "step": 27810 + }, + { + "epoch": 0.3476586914672867, + "grad_norm": 2.4628236293792725, + "learning_rate": 1.6490315433404375e-05, + "loss": 0.7648, + "step": 27812 + }, + { + "epoch": 0.3476836920923023, + "grad_norm": 2.5271530151367188, + "learning_rate": 1.6489651500601166e-05, + "loss": 0.9836, + "step": 27814 + }, + { + "epoch": 0.3477086927173179, + "grad_norm": 9.277474403381348, + "learning_rate": 1.6488987518373686e-05, + "loss": 0.2478, + "step": 27816 + }, + { + "epoch": 0.34773369334233356, + "grad_norm": 9.439448356628418, + "learning_rate": 1.6488323486726994e-05, + "loss": 1.7502, + "step": 27818 + }, + { + "epoch": 0.3477586939673492, + "grad_norm": 2.740567922592163, + "learning_rate": 1.648765940566615e-05, + "loss": 0.5941, + "step": 27820 + }, + { + "epoch": 0.3477836945923648, + "grad_norm": 5.1162004470825195, + "learning_rate": 1.6486995275196206e-05, + "loss": 0.9935, + "step": 27822 + }, + { + "epoch": 0.34780869521738045, + "grad_norm": 2.13911509513855, + "learning_rate": 1.6486331095322228e-05, + "loss": 0.8882, + "step": 27824 + }, + { + "epoch": 0.34783369584239604, + "grad_norm": 3.4959425926208496, + "learning_rate": 1.6485666866049268e-05, + "loss": 0.7517, + "step": 27826 + }, + { + "epoch": 0.3478586964674117, + "grad_norm": 3.0965592861175537, + "learning_rate": 1.6485002587382388e-05, + "loss": 1.13, + "step": 27828 + }, + { + "epoch": 0.34788369709242734, + "grad_norm": 2.0368804931640625, + "learning_rate": 1.6484338259326643e-05, + "loss": 1.1448, + "step": 27830 + }, + { + "epoch": 0.3479086977174429, + "grad_norm": 0.7836657166481018, + "learning_rate": 1.6483673881887095e-05, + "loss": 0.5264, + "step": 27832 + }, + { + "epoch": 0.3479336983424586, + "grad_norm": 3.920229911804199, + "learning_rate": 1.6483009455068802e-05, + "loss": 1.4538, + "step": 27834 + }, + { + "epoch": 0.34795869896747417, + "grad_norm": 0.9871447086334229, + "learning_rate": 1.6482344978876827e-05, + "loss": 0.5253, + "step": 27836 + }, + { + "epoch": 0.3479836995924898, + "grad_norm": 3.677138328552246, + "learning_rate": 1.648168045331623e-05, + "loss": 1.1197, + "step": 27838 + }, + { + "epoch": 0.34800870021750546, + "grad_norm": 7.764638423919678, + "learning_rate": 1.6481015878392073e-05, + "loss": 0.8392, + "step": 27840 + }, + { + "epoch": 0.34803370084252105, + "grad_norm": 3.112514019012451, + "learning_rate": 1.6480351254109415e-05, + "loss": 0.6529, + "step": 27842 + }, + { + "epoch": 0.3480587014675367, + "grad_norm": 3.133833646774292, + "learning_rate": 1.6479686580473314e-05, + "loss": 0.2434, + "step": 27844 + }, + { + "epoch": 0.3480837020925523, + "grad_norm": 0.0012345211580395699, + "learning_rate": 1.647902185748884e-05, + "loss": 1.1448, + "step": 27846 + }, + { + "epoch": 0.34810870271756794, + "grad_norm": 4.618617057800293, + "learning_rate": 1.6478357085161052e-05, + "loss": 1.304, + "step": 27848 + }, + { + "epoch": 0.3481337033425836, + "grad_norm": 0.0013309363275766373, + "learning_rate": 1.6477692263495014e-05, + "loss": 0.2012, + "step": 27850 + }, + { + "epoch": 0.3481587039675992, + "grad_norm": 2.311706066131592, + "learning_rate": 1.6477027392495785e-05, + "loss": 0.8613, + "step": 27852 + }, + { + "epoch": 0.34818370459261483, + "grad_norm": 3.800057888031006, + "learning_rate": 1.647636247216843e-05, + "loss": 1.1316, + "step": 27854 + }, + { + "epoch": 0.3482087052176304, + "grad_norm": 5.201521396636963, + "learning_rate": 1.6475697502518014e-05, + "loss": 1.7318, + "step": 27856 + }, + { + "epoch": 0.34823370584264607, + "grad_norm": 3.8922970294952393, + "learning_rate": 1.6475032483549607e-05, + "loss": 1.0583, + "step": 27858 + }, + { + "epoch": 0.3482587064676617, + "grad_norm": 4.3003153800964355, + "learning_rate": 1.6474367415268266e-05, + "loss": 0.8985, + "step": 27860 + }, + { + "epoch": 0.3482837070926773, + "grad_norm": 4.207132816314697, + "learning_rate": 1.6473702297679057e-05, + "loss": 0.33, + "step": 27862 + }, + { + "epoch": 0.34830870771769296, + "grad_norm": 0.03365686163306236, + "learning_rate": 1.6473037130787045e-05, + "loss": 0.0499, + "step": 27864 + }, + { + "epoch": 0.34833370834270855, + "grad_norm": 3.026071071624756, + "learning_rate": 1.6472371914597302e-05, + "loss": 1.0409, + "step": 27866 + }, + { + "epoch": 0.3483587089677242, + "grad_norm": 3.1770222187042236, + "learning_rate": 1.6471706649114883e-05, + "loss": 0.599, + "step": 27868 + }, + { + "epoch": 0.34838370959273984, + "grad_norm": 2.994515895843506, + "learning_rate": 1.6471041334344868e-05, + "loss": 1.3464, + "step": 27870 + }, + { + "epoch": 0.34840871021775544, + "grad_norm": 3.1395668983459473, + "learning_rate": 1.6470375970292313e-05, + "loss": 0.1891, + "step": 27872 + }, + { + "epoch": 0.3484337108427711, + "grad_norm": 2.6045994758605957, + "learning_rate": 1.6469710556962293e-05, + "loss": 0.5407, + "step": 27874 + }, + { + "epoch": 0.3484587114677867, + "grad_norm": 1.5331751108169556, + "learning_rate": 1.646904509435987e-05, + "loss": 1.1292, + "step": 27876 + }, + { + "epoch": 0.3484837120928023, + "grad_norm": 4.248866081237793, + "learning_rate": 1.6468379582490117e-05, + "loss": 0.8288, + "step": 27878 + }, + { + "epoch": 0.34850871271781797, + "grad_norm": 3.255329132080078, + "learning_rate": 1.6467714021358098e-05, + "loss": 0.6598, + "step": 27880 + }, + { + "epoch": 0.34853371334283356, + "grad_norm": 3.086530923843384, + "learning_rate": 1.6467048410968885e-05, + "loss": 0.6005, + "step": 27882 + }, + { + "epoch": 0.3485587139678492, + "grad_norm": 3.341233253479004, + "learning_rate": 1.646638275132754e-05, + "loss": 0.5448, + "step": 27884 + }, + { + "epoch": 0.3485837145928648, + "grad_norm": 3.2140846252441406, + "learning_rate": 1.6465717042439146e-05, + "loss": 0.7266, + "step": 27886 + }, + { + "epoch": 0.34860871521788045, + "grad_norm": 4.538894176483154, + "learning_rate": 1.6465051284308762e-05, + "loss": 1.6988, + "step": 27888 + }, + { + "epoch": 0.3486337158428961, + "grad_norm": 0.0020511506590992212, + "learning_rate": 1.646438547694146e-05, + "loss": 0.2905, + "step": 27890 + }, + { + "epoch": 0.3486587164679117, + "grad_norm": 2.996692180633545, + "learning_rate": 1.6463719620342314e-05, + "loss": 1.9913, + "step": 27892 + }, + { + "epoch": 0.34868371709292734, + "grad_norm": 2.3993449211120605, + "learning_rate": 1.6463053714516394e-05, + "loss": 1.6975, + "step": 27894 + }, + { + "epoch": 0.34870871771794293, + "grad_norm": 6.431376934051514, + "learning_rate": 1.646238775946877e-05, + "loss": 0.3916, + "step": 27896 + }, + { + "epoch": 0.3487337183429586, + "grad_norm": 3.0377249717712402, + "learning_rate": 1.6461721755204516e-05, + "loss": 0.9996, + "step": 27898 + }, + { + "epoch": 0.3487587189679742, + "grad_norm": 4.016563415527344, + "learning_rate": 1.64610557017287e-05, + "loss": 1.6337, + "step": 27900 + }, + { + "epoch": 0.3487837195929898, + "grad_norm": 3.167668342590332, + "learning_rate": 1.6460389599046402e-05, + "loss": 0.4741, + "step": 27902 + }, + { + "epoch": 0.34880872021800546, + "grad_norm": 2.369536876678467, + "learning_rate": 1.6459723447162686e-05, + "loss": 0.6077, + "step": 27904 + }, + { + "epoch": 0.34883372084302106, + "grad_norm": 13.054741859436035, + "learning_rate": 1.6459057246082633e-05, + "loss": 1.5525, + "step": 27906 + }, + { + "epoch": 0.3488587214680367, + "grad_norm": 4.497417449951172, + "learning_rate": 1.645839099581131e-05, + "loss": 1.416, + "step": 27908 + }, + { + "epoch": 0.34888372209305235, + "grad_norm": 0.002855560975149274, + "learning_rate": 1.6457724696353798e-05, + "loss": 0.0001, + "step": 27910 + }, + { + "epoch": 0.34890872271806794, + "grad_norm": 4.528566360473633, + "learning_rate": 1.645705834771517e-05, + "loss": 1.0901, + "step": 27912 + }, + { + "epoch": 0.3489337233430836, + "grad_norm": 3.5591237545013428, + "learning_rate": 1.6456391949900493e-05, + "loss": 1.6656, + "step": 27914 + }, + { + "epoch": 0.3489587239680992, + "grad_norm": 3.0037269592285156, + "learning_rate": 1.645572550291485e-05, + "loss": 0.3221, + "step": 27916 + }, + { + "epoch": 0.34898372459311483, + "grad_norm": 1.5299015045166016, + "learning_rate": 1.6455059006763313e-05, + "loss": 1.3025, + "step": 27918 + }, + { + "epoch": 0.3490087252181305, + "grad_norm": 7.622738838195801, + "learning_rate": 1.645439246145096e-05, + "loss": 1.8037, + "step": 27920 + }, + { + "epoch": 0.34903372584314607, + "grad_norm": 8.7064790725708, + "learning_rate": 1.645372586698287e-05, + "loss": 1.1742, + "step": 27922 + }, + { + "epoch": 0.3490587264681617, + "grad_norm": 1.177788257598877, + "learning_rate": 1.6453059223364114e-05, + "loss": 0.4542, + "step": 27924 + }, + { + "epoch": 0.3490837270931773, + "grad_norm": 3.739053249359131, + "learning_rate": 1.645239253059977e-05, + "loss": 0.7619, + "step": 27926 + }, + { + "epoch": 0.34910872771819296, + "grad_norm": 1.0172771215438843, + "learning_rate": 1.645172578869492e-05, + "loss": 0.5529, + "step": 27928 + }, + { + "epoch": 0.3491337283432086, + "grad_norm": 3.425645351409912, + "learning_rate": 1.6451058997654637e-05, + "loss": 0.6542, + "step": 27930 + }, + { + "epoch": 0.3491587289682242, + "grad_norm": 0.07046301662921906, + "learning_rate": 1.6450392157484003e-05, + "loss": 0.8786, + "step": 27932 + }, + { + "epoch": 0.34918372959323984, + "grad_norm": 3.706496000289917, + "learning_rate": 1.644972526818809e-05, + "loss": 0.4351, + "step": 27934 + }, + { + "epoch": 0.34920873021825544, + "grad_norm": 3.487415075302124, + "learning_rate": 1.6449058329771986e-05, + "loss": 1.537, + "step": 27936 + }, + { + "epoch": 0.3492337308432711, + "grad_norm": 2.1614086627960205, + "learning_rate": 1.6448391342240767e-05, + "loss": 1.2791, + "step": 27938 + }, + { + "epoch": 0.34925873146828673, + "grad_norm": 4.309901237487793, + "learning_rate": 1.6447724305599507e-05, + "loss": 1.0548, + "step": 27940 + }, + { + "epoch": 0.3492837320933023, + "grad_norm": 4.847990036010742, + "learning_rate": 1.644705721985329e-05, + "loss": 0.3464, + "step": 27942 + }, + { + "epoch": 0.34930873271831797, + "grad_norm": 4.371910572052002, + "learning_rate": 1.64463900850072e-05, + "loss": 1.2824, + "step": 27944 + }, + { + "epoch": 0.34933373334333356, + "grad_norm": 0.03282184153795242, + "learning_rate": 1.6445722901066315e-05, + "loss": 0.0582, + "step": 27946 + }, + { + "epoch": 0.3493587339683492, + "grad_norm": 5.935128211975098, + "learning_rate": 1.6445055668035712e-05, + "loss": 0.8062, + "step": 27948 + }, + { + "epoch": 0.34938373459336486, + "grad_norm": 4.309968948364258, + "learning_rate": 1.644438838592048e-05, + "loss": 1.8853, + "step": 27950 + }, + { + "epoch": 0.34940873521838045, + "grad_norm": 0.007427132688462734, + "learning_rate": 1.6443721054725695e-05, + "loss": 0.7337, + "step": 27952 + }, + { + "epoch": 0.3494337358433961, + "grad_norm": 4.344595909118652, + "learning_rate": 1.6443053674456445e-05, + "loss": 2.2219, + "step": 27954 + }, + { + "epoch": 0.3494587364684117, + "grad_norm": 2.7942159175872803, + "learning_rate": 1.6442386245117805e-05, + "loss": 0.2629, + "step": 27956 + }, + { + "epoch": 0.34948373709342734, + "grad_norm": 1.913996934890747, + "learning_rate": 1.6441718766714865e-05, + "loss": 0.8718, + "step": 27958 + }, + { + "epoch": 0.349508737718443, + "grad_norm": 0.0010750446235761046, + "learning_rate": 1.6441051239252706e-05, + "loss": 0.0003, + "step": 27960 + }, + { + "epoch": 0.3495337383434586, + "grad_norm": 3.599332332611084, + "learning_rate": 1.644038366273641e-05, + "loss": 0.9737, + "step": 27962 + }, + { + "epoch": 0.3495587389684742, + "grad_norm": 2.874082088470459, + "learning_rate": 1.6439716037171067e-05, + "loss": 0.7694, + "step": 27964 + }, + { + "epoch": 0.3495837395934898, + "grad_norm": 15.161107063293457, + "learning_rate": 1.643904836256175e-05, + "loss": 0.5692, + "step": 27966 + }, + { + "epoch": 0.34960874021850546, + "grad_norm": 5.234468460083008, + "learning_rate": 1.6438380638913558e-05, + "loss": 0.8927, + "step": 27968 + }, + { + "epoch": 0.3496337408435211, + "grad_norm": 0.029923535883426666, + "learning_rate": 1.6437712866231565e-05, + "loss": 0.8058, + "step": 27970 + }, + { + "epoch": 0.3496587414685367, + "grad_norm": 1.693270206451416, + "learning_rate": 1.6437045044520867e-05, + "loss": 0.5422, + "step": 27972 + }, + { + "epoch": 0.34968374209355235, + "grad_norm": 1.6084647178649902, + "learning_rate": 1.6436377173786537e-05, + "loss": 0.2931, + "step": 27974 + }, + { + "epoch": 0.34970874271856794, + "grad_norm": 5.648869514465332, + "learning_rate": 1.6435709254033672e-05, + "loss": 1.2371, + "step": 27976 + }, + { + "epoch": 0.3497337433435836, + "grad_norm": 2.9327306747436523, + "learning_rate": 1.6435041285267357e-05, + "loss": 0.9139, + "step": 27978 + }, + { + "epoch": 0.34975874396859924, + "grad_norm": 3.7667112350463867, + "learning_rate": 1.6434373267492675e-05, + "loss": 0.7553, + "step": 27980 + }, + { + "epoch": 0.34978374459361483, + "grad_norm": 2.582075595855713, + "learning_rate": 1.6433705200714715e-05, + "loss": 0.2649, + "step": 27982 + }, + { + "epoch": 0.3498087452186305, + "grad_norm": 2.6791319847106934, + "learning_rate": 1.643303708493857e-05, + "loss": 0.62, + "step": 27984 + }, + { + "epoch": 0.34983374584364607, + "grad_norm": 7.939249038696289, + "learning_rate": 1.643236892016932e-05, + "loss": 1.23, + "step": 27986 + }, + { + "epoch": 0.3498587464686617, + "grad_norm": 3.178100109100342, + "learning_rate": 1.643170070641206e-05, + "loss": 0.8965, + "step": 27988 + }, + { + "epoch": 0.34988374709367737, + "grad_norm": 3.364248514175415, + "learning_rate": 1.643103244367188e-05, + "loss": 1.5424, + "step": 27990 + }, + { + "epoch": 0.34990874771869296, + "grad_norm": 2.8320200443267822, + "learning_rate": 1.6430364131953865e-05, + "loss": 0.5154, + "step": 27992 + }, + { + "epoch": 0.3499337483437086, + "grad_norm": 0.5068999528884888, + "learning_rate": 1.6429695771263105e-05, + "loss": 0.762, + "step": 27994 + }, + { + "epoch": 0.3499587489687242, + "grad_norm": 3.583599805831909, + "learning_rate": 1.642902736160469e-05, + "loss": 1.5077, + "step": 27996 + }, + { + "epoch": 0.34998374959373985, + "grad_norm": 5.199024200439453, + "learning_rate": 1.642835890298371e-05, + "loss": 0.5789, + "step": 27998 + }, + { + "epoch": 0.3500087502187555, + "grad_norm": 4.845982074737549, + "learning_rate": 1.642769039540526e-05, + "loss": 0.398, + "step": 28000 + }, + { + "epoch": 0.3500337508437711, + "grad_norm": 0.001593056134879589, + "learning_rate": 1.6427021838874432e-05, + "loss": 0.0151, + "step": 28002 + }, + { + "epoch": 0.35005875146878673, + "grad_norm": 3.8596856594085693, + "learning_rate": 1.642635323339631e-05, + "loss": 1.4162, + "step": 28004 + }, + { + "epoch": 0.3500837520938023, + "grad_norm": 1.7178727388381958, + "learning_rate": 1.6425684578975992e-05, + "loss": 0.1969, + "step": 28006 + }, + { + "epoch": 0.350108752718818, + "grad_norm": 3.557372808456421, + "learning_rate": 1.6425015875618568e-05, + "loss": 0.2124, + "step": 28008 + }, + { + "epoch": 0.3501337533438336, + "grad_norm": 10.472808837890625, + "learning_rate": 1.6424347123329135e-05, + "loss": 1.6151, + "step": 28010 + }, + { + "epoch": 0.3501587539688492, + "grad_norm": 3.4842090606689453, + "learning_rate": 1.642367832211278e-05, + "loss": 0.7938, + "step": 28012 + }, + { + "epoch": 0.35018375459386486, + "grad_norm": 4.149575233459473, + "learning_rate": 1.6423009471974598e-05, + "loss": 0.7589, + "step": 28014 + }, + { + "epoch": 0.35020875521888045, + "grad_norm": 9.576383590698242, + "learning_rate": 1.642234057291969e-05, + "loss": 1.7215, + "step": 28016 + }, + { + "epoch": 0.3502337558438961, + "grad_norm": 3.082322835922241, + "learning_rate": 1.642167162495314e-05, + "loss": 0.9576, + "step": 28018 + }, + { + "epoch": 0.35025875646891175, + "grad_norm": 4.325602054595947, + "learning_rate": 1.6421002628080046e-05, + "loss": 1.0103, + "step": 28020 + }, + { + "epoch": 0.35028375709392734, + "grad_norm": 8.71461296081543, + "learning_rate": 1.6420333582305505e-05, + "loss": 2.218, + "step": 28022 + }, + { + "epoch": 0.350308757718943, + "grad_norm": 6.0361809730529785, + "learning_rate": 1.641966448763461e-05, + "loss": 0.7933, + "step": 28024 + }, + { + "epoch": 0.3503337583439586, + "grad_norm": 3.2592644691467285, + "learning_rate": 1.641899534407246e-05, + "loss": 0.5606, + "step": 28026 + }, + { + "epoch": 0.3503587589689742, + "grad_norm": 5.016489505767822, + "learning_rate": 1.6418326151624152e-05, + "loss": 1.3955, + "step": 28028 + }, + { + "epoch": 0.3503837595939899, + "grad_norm": 3.945678472518921, + "learning_rate": 1.641765691029478e-05, + "loss": 1.1652, + "step": 28030 + }, + { + "epoch": 0.35040876021900547, + "grad_norm": 0.029413459822535515, + "learning_rate": 1.6416987620089436e-05, + "loss": 0.261, + "step": 28032 + }, + { + "epoch": 0.3504337608440211, + "grad_norm": 1.161680817604065, + "learning_rate": 1.6416318281013223e-05, + "loss": 0.4027, + "step": 28034 + }, + { + "epoch": 0.3504587614690367, + "grad_norm": 0.44652533531188965, + "learning_rate": 1.6415648893071235e-05, + "loss": 0.2071, + "step": 28036 + }, + { + "epoch": 0.35048376209405235, + "grad_norm": 3.072650671005249, + "learning_rate": 1.6414979456268575e-05, + "loss": 1.4107, + "step": 28038 + }, + { + "epoch": 0.350508762719068, + "grad_norm": 0.001069171936251223, + "learning_rate": 1.6414309970610336e-05, + "loss": 0.5128, + "step": 28040 + }, + { + "epoch": 0.3505337633440836, + "grad_norm": 0.0007440081681124866, + "learning_rate": 1.6413640436101622e-05, + "loss": 0.1015, + "step": 28042 + }, + { + "epoch": 0.35055876396909924, + "grad_norm": 2.850825071334839, + "learning_rate": 1.641297085274753e-05, + "loss": 0.431, + "step": 28044 + }, + { + "epoch": 0.35058376459411483, + "grad_norm": 3.044403076171875, + "learning_rate": 1.641230122055316e-05, + "loss": 0.1063, + "step": 28046 + }, + { + "epoch": 0.3506087652191305, + "grad_norm": 5.685296535491943, + "learning_rate": 1.6411631539523604e-05, + "loss": 1.9888, + "step": 28048 + }, + { + "epoch": 0.35063376584414613, + "grad_norm": 3.1107373237609863, + "learning_rate": 1.6410961809663977e-05, + "loss": 1.452, + "step": 28050 + }, + { + "epoch": 0.3506587664691617, + "grad_norm": 7.86307430267334, + "learning_rate": 1.6410292030979365e-05, + "loss": 1.5947, + "step": 28052 + }, + { + "epoch": 0.35068376709417737, + "grad_norm": 0.383164644241333, + "learning_rate": 1.640962220347488e-05, + "loss": 0.0402, + "step": 28054 + }, + { + "epoch": 0.35070876771919296, + "grad_norm": 6.222992420196533, + "learning_rate": 1.6408952327155617e-05, + "loss": 0.8872, + "step": 28056 + }, + { + "epoch": 0.3507337683442086, + "grad_norm": 3.0557076930999756, + "learning_rate": 1.640828240202668e-05, + "loss": 1.2198, + "step": 28058 + }, + { + "epoch": 0.35075876896922425, + "grad_norm": 3.26011323928833, + "learning_rate": 1.640761242809317e-05, + "loss": 1.5539, + "step": 28060 + }, + { + "epoch": 0.35078376959423985, + "grad_norm": 3.2615747451782227, + "learning_rate": 1.640694240536019e-05, + "loss": 1.3041, + "step": 28062 + }, + { + "epoch": 0.3508087702192555, + "grad_norm": 4.413626194000244, + "learning_rate": 1.640627233383284e-05, + "loss": 0.8984, + "step": 28064 + }, + { + "epoch": 0.3508337708442711, + "grad_norm": 0.0016653670463711023, + "learning_rate": 1.640560221351623e-05, + "loss": 1.0523, + "step": 28066 + }, + { + "epoch": 0.35085877146928673, + "grad_norm": 3.4668378829956055, + "learning_rate": 1.6404932044415457e-05, + "loss": 0.9703, + "step": 28068 + }, + { + "epoch": 0.3508837720943024, + "grad_norm": 0.5175804495811462, + "learning_rate": 1.640426182653563e-05, + "loss": 0.4602, + "step": 28070 + }, + { + "epoch": 0.350908772719318, + "grad_norm": 0.8645546436309814, + "learning_rate": 1.6403591559881845e-05, + "loss": 0.6206, + "step": 28072 + }, + { + "epoch": 0.3509337733443336, + "grad_norm": 0.4417513906955719, + "learning_rate": 1.6402921244459217e-05, + "loss": 0.1706, + "step": 28074 + }, + { + "epoch": 0.3509587739693492, + "grad_norm": 3.1918582916259766, + "learning_rate": 1.6402250880272845e-05, + "loss": 1.5944, + "step": 28076 + }, + { + "epoch": 0.35098377459436486, + "grad_norm": 2.7291955947875977, + "learning_rate": 1.6401580467327834e-05, + "loss": 1.0483, + "step": 28078 + }, + { + "epoch": 0.3510087752193805, + "grad_norm": 5.281599998474121, + "learning_rate": 1.6400910005629294e-05, + "loss": 1.0248, + "step": 28080 + }, + { + "epoch": 0.3510337758443961, + "grad_norm": 2.975498676300049, + "learning_rate": 1.6400239495182328e-05, + "loss": 1.3155, + "step": 28082 + }, + { + "epoch": 0.35105877646941175, + "grad_norm": 2.8666300773620605, + "learning_rate": 1.6399568935992044e-05, + "loss": 1.1985, + "step": 28084 + }, + { + "epoch": 0.35108377709442734, + "grad_norm": 0.0015816113445907831, + "learning_rate": 1.6398898328063542e-05, + "loss": 0.2706, + "step": 28086 + }, + { + "epoch": 0.351108777719443, + "grad_norm": 0.001593542518094182, + "learning_rate": 1.6398227671401942e-05, + "loss": 0.0001, + "step": 28088 + }, + { + "epoch": 0.35113377834445864, + "grad_norm": 2.8733763694763184, + "learning_rate": 1.639755696601234e-05, + "loss": 0.9162, + "step": 28090 + }, + { + "epoch": 0.35115877896947423, + "grad_norm": 3.7720539569854736, + "learning_rate": 1.6396886211899854e-05, + "loss": 1.7211, + "step": 28092 + }, + { + "epoch": 0.3511837795944899, + "grad_norm": 3.591536283493042, + "learning_rate": 1.6396215409069583e-05, + "loss": 0.3422, + "step": 28094 + }, + { + "epoch": 0.35120878021950547, + "grad_norm": 4.937602519989014, + "learning_rate": 1.6395544557526647e-05, + "loss": 1.4322, + "step": 28096 + }, + { + "epoch": 0.3512337808445211, + "grad_norm": 1.0775494575500488, + "learning_rate": 1.639487365727614e-05, + "loss": 0.0588, + "step": 28098 + }, + { + "epoch": 0.35125878146953676, + "grad_norm": 1.7332340478897095, + "learning_rate": 1.6394202708323183e-05, + "loss": 0.3163, + "step": 28100 + }, + { + "epoch": 0.35128378209455235, + "grad_norm": 3.3770766258239746, + "learning_rate": 1.639353171067288e-05, + "loss": 0.4861, + "step": 28102 + }, + { + "epoch": 0.351308782719568, + "grad_norm": 3.2380473613739014, + "learning_rate": 1.6392860664330347e-05, + "loss": 0.7807, + "step": 28104 + }, + { + "epoch": 0.3513337833445836, + "grad_norm": 4.233567237854004, + "learning_rate": 1.639218956930069e-05, + "loss": 1.5798, + "step": 28106 + }, + { + "epoch": 0.35135878396959924, + "grad_norm": 3.8457844257354736, + "learning_rate": 1.639151842558902e-05, + "loss": 0.3147, + "step": 28108 + }, + { + "epoch": 0.3513837845946149, + "grad_norm": 4.360482692718506, + "learning_rate": 1.639084723320045e-05, + "loss": 1.0527, + "step": 28110 + }, + { + "epoch": 0.3514087852196305, + "grad_norm": 2.8426403999328613, + "learning_rate": 1.639017599214009e-05, + "loss": 0.6537, + "step": 28112 + }, + { + "epoch": 0.35143378584464613, + "grad_norm": 5.161013603210449, + "learning_rate": 1.6389504702413056e-05, + "loss": 0.7204, + "step": 28114 + }, + { + "epoch": 0.3514587864696617, + "grad_norm": 1.8203260898590088, + "learning_rate": 1.6388833364024456e-05, + "loss": 0.0896, + "step": 28116 + }, + { + "epoch": 0.35148378709467737, + "grad_norm": 3.056335210800171, + "learning_rate": 1.6388161976979406e-05, + "loss": 0.2751, + "step": 28118 + }, + { + "epoch": 0.351508787719693, + "grad_norm": 3.0208916664123535, + "learning_rate": 1.6387490541283013e-05, + "loss": 1.32, + "step": 28120 + }, + { + "epoch": 0.3515337883447086, + "grad_norm": 2.348428249359131, + "learning_rate": 1.6386819056940403e-05, + "loss": 0.783, + "step": 28122 + }, + { + "epoch": 0.35155878896972426, + "grad_norm": 2.8847718238830566, + "learning_rate": 1.6386147523956676e-05, + "loss": 0.6784, + "step": 28124 + }, + { + "epoch": 0.35158378959473985, + "grad_norm": 2.6250786781311035, + "learning_rate": 1.6385475942336953e-05, + "loss": 0.8225, + "step": 28126 + }, + { + "epoch": 0.3516087902197555, + "grad_norm": 0.0019653369672596455, + "learning_rate": 1.6384804312086352e-05, + "loss": 0.5989, + "step": 28128 + }, + { + "epoch": 0.35163379084477114, + "grad_norm": 2.5754802227020264, + "learning_rate": 1.6384132633209985e-05, + "loss": 1.2701, + "step": 28130 + }, + { + "epoch": 0.35165879146978674, + "grad_norm": 4.8356499671936035, + "learning_rate": 1.638346090571296e-05, + "loss": 2.1324, + "step": 28132 + }, + { + "epoch": 0.3516837920948024, + "grad_norm": 5.062501430511475, + "learning_rate": 1.6382789129600407e-05, + "loss": 1.7178, + "step": 28134 + }, + { + "epoch": 0.351708792719818, + "grad_norm": 3.1174967288970947, + "learning_rate": 1.638211730487743e-05, + "loss": 1.0001, + "step": 28136 + }, + { + "epoch": 0.3517337933448336, + "grad_norm": 2.559662342071533, + "learning_rate": 1.638144543154915e-05, + "loss": 0.5107, + "step": 28138 + }, + { + "epoch": 0.35175879396984927, + "grad_norm": 2.6519699096679688, + "learning_rate": 1.6380773509620687e-05, + "loss": 0.7922, + "step": 28140 + }, + { + "epoch": 0.35178379459486486, + "grad_norm": 6.187228679656982, + "learning_rate": 1.6380101539097154e-05, + "loss": 1.8207, + "step": 28142 + }, + { + "epoch": 0.3518087952198805, + "grad_norm": 0.5516307353973389, + "learning_rate": 1.6379429519983667e-05, + "loss": 0.6083, + "step": 28144 + }, + { + "epoch": 0.3518337958448961, + "grad_norm": 1.4603092670440674, + "learning_rate": 1.6378757452285347e-05, + "loss": 0.279, + "step": 28146 + }, + { + "epoch": 0.35185879646991175, + "grad_norm": 2.289232015609741, + "learning_rate": 1.6378085336007315e-05, + "loss": 0.6362, + "step": 28148 + }, + { + "epoch": 0.3518837970949274, + "grad_norm": 0.005220616701990366, + "learning_rate": 1.6377413171154688e-05, + "loss": 0.2107, + "step": 28150 + }, + { + "epoch": 0.351908797719943, + "grad_norm": 4.262112140655518, + "learning_rate": 1.6376740957732582e-05, + "loss": 0.7174, + "step": 28152 + }, + { + "epoch": 0.35193379834495864, + "grad_norm": 4.245232105255127, + "learning_rate": 1.637606869574612e-05, + "loss": 1.0898, + "step": 28154 + }, + { + "epoch": 0.35195879896997423, + "grad_norm": 3.7696728706359863, + "learning_rate": 1.6375396385200418e-05, + "loss": 1.0289, + "step": 28156 + }, + { + "epoch": 0.3519837995949899, + "grad_norm": 0.0023054825142025948, + "learning_rate": 1.6374724026100602e-05, + "loss": 0.8726, + "step": 28158 + }, + { + "epoch": 0.3520088002200055, + "grad_norm": 5.455779075622559, + "learning_rate": 1.6374051618451788e-05, + "loss": 2.1054, + "step": 28160 + }, + { + "epoch": 0.3520338008450211, + "grad_norm": 6.451002597808838, + "learning_rate": 1.63733791622591e-05, + "loss": 0.3593, + "step": 28162 + }, + { + "epoch": 0.35205880147003676, + "grad_norm": 9.942981719970703, + "learning_rate": 1.637270665752765e-05, + "loss": 1.6663, + "step": 28164 + }, + { + "epoch": 0.35208380209505236, + "grad_norm": 1.6648577451705933, + "learning_rate": 1.6372034104262574e-05, + "loss": 0.9793, + "step": 28166 + }, + { + "epoch": 0.352108802720068, + "grad_norm": 1.6937941312789917, + "learning_rate": 1.637136150246899e-05, + "loss": 0.4796, + "step": 28168 + }, + { + "epoch": 0.35213380334508365, + "grad_norm": 1.2234127521514893, + "learning_rate": 1.6370688852152014e-05, + "loss": 1.6873, + "step": 28170 + }, + { + "epoch": 0.35215880397009924, + "grad_norm": 3.423552989959717, + "learning_rate": 1.637001615331677e-05, + "loss": 1.2638, + "step": 28172 + }, + { + "epoch": 0.3521838045951149, + "grad_norm": 1.7024284601211548, + "learning_rate": 1.6369343405968387e-05, + "loss": 0.1409, + "step": 28174 + }, + { + "epoch": 0.3522088052201305, + "grad_norm": 2.5632646083831787, + "learning_rate": 1.6368670610111987e-05, + "loss": 0.3482, + "step": 28176 + }, + { + "epoch": 0.35223380584514613, + "grad_norm": 2.8288586139678955, + "learning_rate": 1.6367997765752687e-05, + "loss": 0.6931, + "step": 28178 + }, + { + "epoch": 0.3522588064701618, + "grad_norm": 1.6579633951187134, + "learning_rate": 1.6367324872895622e-05, + "loss": 0.9377, + "step": 28180 + }, + { + "epoch": 0.35228380709517737, + "grad_norm": 0.9129658341407776, + "learning_rate": 1.6366651931545907e-05, + "loss": 0.0735, + "step": 28182 + }, + { + "epoch": 0.352308807720193, + "grad_norm": 3.0182437896728516, + "learning_rate": 1.6365978941708675e-05, + "loss": 1.7821, + "step": 28184 + }, + { + "epoch": 0.3523338083452086, + "grad_norm": 7.3238725662231445, + "learning_rate": 1.6365305903389045e-05, + "loss": 0.8043, + "step": 28186 + }, + { + "epoch": 0.35235880897022426, + "grad_norm": 2.514915704727173, + "learning_rate": 1.636463281659214e-05, + "loss": 0.8804, + "step": 28188 + }, + { + "epoch": 0.3523838095952399, + "grad_norm": 0.0008599174325354397, + "learning_rate": 1.63639596813231e-05, + "loss": 0.0001, + "step": 28190 + }, + { + "epoch": 0.3524088102202555, + "grad_norm": 1.7921971082687378, + "learning_rate": 1.636328649758704e-05, + "loss": 0.301, + "step": 28192 + }, + { + "epoch": 0.35243381084527114, + "grad_norm": 0.0013730233768001199, + "learning_rate": 1.636261326538909e-05, + "loss": 0.6866, + "step": 28194 + }, + { + "epoch": 0.35245881147028674, + "grad_norm": 4.244307041168213, + "learning_rate": 1.636193998473438e-05, + "loss": 1.6459, + "step": 28196 + }, + { + "epoch": 0.3524838120953024, + "grad_norm": 6.4762115478515625, + "learning_rate": 1.6361266655628034e-05, + "loss": 1.1101, + "step": 28198 + }, + { + "epoch": 0.35250881272031803, + "grad_norm": 4.508643627166748, + "learning_rate": 1.6360593278075175e-05, + "loss": 1.1536, + "step": 28200 + }, + { + "epoch": 0.3525338133453336, + "grad_norm": 2.287017583847046, + "learning_rate": 1.635991985208094e-05, + "loss": 0.122, + "step": 28202 + }, + { + "epoch": 0.35255881397034927, + "grad_norm": 0.0012282629031687975, + "learning_rate": 1.6359246377650456e-05, + "loss": 0.6835, + "step": 28204 + }, + { + "epoch": 0.35258381459536486, + "grad_norm": 5.263204097747803, + "learning_rate": 1.635857285478885e-05, + "loss": 1.0576, + "step": 28206 + }, + { + "epoch": 0.3526088152203805, + "grad_norm": 6.459306716918945, + "learning_rate": 1.6357899283501255e-05, + "loss": 0.1397, + "step": 28208 + }, + { + "epoch": 0.35263381584539616, + "grad_norm": 2.827141761779785, + "learning_rate": 1.6357225663792793e-05, + "loss": 1.1345, + "step": 28210 + }, + { + "epoch": 0.35265881647041175, + "grad_norm": 3.355703830718994, + "learning_rate": 1.6356551995668606e-05, + "loss": 2.1403, + "step": 28212 + }, + { + "epoch": 0.3526838170954274, + "grad_norm": 2.7763686180114746, + "learning_rate": 1.6355878279133813e-05, + "loss": 0.6794, + "step": 28214 + }, + { + "epoch": 0.352708817720443, + "grad_norm": 0.0012322580441832542, + "learning_rate": 1.635520451419355e-05, + "loss": 0.2904, + "step": 28216 + }, + { + "epoch": 0.35273381834545864, + "grad_norm": 1.605046033859253, + "learning_rate": 1.635453070085295e-05, + "loss": 0.0736, + "step": 28218 + }, + { + "epoch": 0.3527588189704743, + "grad_norm": 2.658346176147461, + "learning_rate": 1.6353856839117142e-05, + "loss": 1.2464, + "step": 28220 + }, + { + "epoch": 0.3527838195954899, + "grad_norm": 2.449270009994507, + "learning_rate": 1.6353182928991258e-05, + "loss": 0.7798, + "step": 28222 + }, + { + "epoch": 0.3528088202205055, + "grad_norm": 2.6245436668395996, + "learning_rate": 1.6352508970480432e-05, + "loss": 0.6188, + "step": 28224 + }, + { + "epoch": 0.3528338208455211, + "grad_norm": 3.759768009185791, + "learning_rate": 1.6351834963589798e-05, + "loss": 1.2543, + "step": 28226 + }, + { + "epoch": 0.35285882147053677, + "grad_norm": 2.3594605922698975, + "learning_rate": 1.6351160908324483e-05, + "loss": 0.5318, + "step": 28228 + }, + { + "epoch": 0.3528838220955524, + "grad_norm": 0.001901090843603015, + "learning_rate": 1.635048680468963e-05, + "loss": 0.2397, + "step": 28230 + }, + { + "epoch": 0.352908822720568, + "grad_norm": 4.165462493896484, + "learning_rate": 1.6349812652690365e-05, + "loss": 0.8263, + "step": 28232 + }, + { + "epoch": 0.35293382334558365, + "grad_norm": 2.5238893032073975, + "learning_rate": 1.6349138452331823e-05, + "loss": 2.1193, + "step": 28234 + }, + { + "epoch": 0.35295882397059924, + "grad_norm": 0.0017146407626569271, + "learning_rate": 1.634846420361914e-05, + "loss": 0.3105, + "step": 28236 + }, + { + "epoch": 0.3529838245956149, + "grad_norm": 2.7674548625946045, + "learning_rate": 1.6347789906557455e-05, + "loss": 1.3967, + "step": 28238 + }, + { + "epoch": 0.35300882522063054, + "grad_norm": 2.3381218910217285, + "learning_rate": 1.63471155611519e-05, + "loss": 0.8627, + "step": 28240 + }, + { + "epoch": 0.35303382584564613, + "grad_norm": 4.000534534454346, + "learning_rate": 1.6346441167407604e-05, + "loss": 1.533, + "step": 28242 + }, + { + "epoch": 0.3530588264706618, + "grad_norm": 2.902966022491455, + "learning_rate": 1.6345766725329714e-05, + "loss": 0.2648, + "step": 28244 + }, + { + "epoch": 0.35308382709567737, + "grad_norm": 1.8022396564483643, + "learning_rate": 1.6345092234923363e-05, + "loss": 1.433, + "step": 28246 + }, + { + "epoch": 0.353108827720693, + "grad_norm": 0.0016372272511944175, + "learning_rate": 1.6344417696193685e-05, + "loss": 1.0411, + "step": 28248 + }, + { + "epoch": 0.35313382834570867, + "grad_norm": 2.6974332332611084, + "learning_rate": 1.634374310914582e-05, + "loss": 0.7613, + "step": 28250 + }, + { + "epoch": 0.35315882897072426, + "grad_norm": 0.022206395864486694, + "learning_rate": 1.6343068473784903e-05, + "loss": 0.7338, + "step": 28252 + }, + { + "epoch": 0.3531838295957399, + "grad_norm": 0.0017317095771431923, + "learning_rate": 1.6342393790116076e-05, + "loss": 0.9494, + "step": 28254 + }, + { + "epoch": 0.3532088302207555, + "grad_norm": 7.38131856918335, + "learning_rate": 1.6341719058144473e-05, + "loss": 0.9029, + "step": 28256 + }, + { + "epoch": 0.35323383084577115, + "grad_norm": 3.2117176055908203, + "learning_rate": 1.6341044277875235e-05, + "loss": 0.8041, + "step": 28258 + }, + { + "epoch": 0.3532588314707868, + "grad_norm": 3.8465094566345215, + "learning_rate": 1.6340369449313498e-05, + "loss": 0.8049, + "step": 28260 + }, + { + "epoch": 0.3532838320958024, + "grad_norm": 2.7241733074188232, + "learning_rate": 1.6339694572464408e-05, + "loss": 1.0003, + "step": 28262 + }, + { + "epoch": 0.35330883272081803, + "grad_norm": 2.1871585845947266, + "learning_rate": 1.6339019647333098e-05, + "loss": 0.8309, + "step": 28264 + }, + { + "epoch": 0.3533338333458336, + "grad_norm": 3.259669303894043, + "learning_rate": 1.633834467392471e-05, + "loss": 1.0443, + "step": 28266 + }, + { + "epoch": 0.3533588339708493, + "grad_norm": 3.7595784664154053, + "learning_rate": 1.633766965224439e-05, + "loss": 0.7337, + "step": 28268 + }, + { + "epoch": 0.3533838345958649, + "grad_norm": 1.567594289779663, + "learning_rate": 1.633699458229727e-05, + "loss": 0.2233, + "step": 28270 + }, + { + "epoch": 0.3534088352208805, + "grad_norm": 1.252350926399231, + "learning_rate": 1.6336319464088495e-05, + "loss": 1.1819, + "step": 28272 + }, + { + "epoch": 0.35343383584589616, + "grad_norm": 2.559271812438965, + "learning_rate": 1.6335644297623207e-05, + "loss": 0.5605, + "step": 28274 + }, + { + "epoch": 0.35345883647091175, + "grad_norm": 1.8397973775863647, + "learning_rate": 1.6334969082906552e-05, + "loss": 0.7452, + "step": 28276 + }, + { + "epoch": 0.3534838370959274, + "grad_norm": 2.5833377838134766, + "learning_rate": 1.6334293819943666e-05, + "loss": 0.7953, + "step": 28278 + }, + { + "epoch": 0.35350883772094305, + "grad_norm": 0.020409660413861275, + "learning_rate": 1.6333618508739696e-05, + "loss": 0.8153, + "step": 28280 + }, + { + "epoch": 0.35353383834595864, + "grad_norm": 4.623874187469482, + "learning_rate": 1.6332943149299777e-05, + "loss": 0.9734, + "step": 28282 + }, + { + "epoch": 0.3535588389709743, + "grad_norm": 2.034592628479004, + "learning_rate": 1.6332267741629064e-05, + "loss": 1.3861, + "step": 28284 + }, + { + "epoch": 0.3535838395959899, + "grad_norm": 3.0005667209625244, + "learning_rate": 1.6331592285732693e-05, + "loss": 1.0657, + "step": 28286 + }, + { + "epoch": 0.3536088402210055, + "grad_norm": 3.7638156414031982, + "learning_rate": 1.633091678161581e-05, + "loss": 0.6092, + "step": 28288 + }, + { + "epoch": 0.3536338408460212, + "grad_norm": 7.573443412780762, + "learning_rate": 1.6330241229283562e-05, + "loss": 1.3447, + "step": 28290 + }, + { + "epoch": 0.35365884147103677, + "grad_norm": 2.216175079345703, + "learning_rate": 1.6329565628741093e-05, + "loss": 1.0757, + "step": 28292 + }, + { + "epoch": 0.3536838420960524, + "grad_norm": 3.902421474456787, + "learning_rate": 1.6328889979993544e-05, + "loss": 0.8526, + "step": 28294 + }, + { + "epoch": 0.353708842721068, + "grad_norm": 4.60344934463501, + "learning_rate": 1.6328214283046063e-05, + "loss": 1.1415, + "step": 28296 + }, + { + "epoch": 0.35373384334608365, + "grad_norm": 5.441995620727539, + "learning_rate": 1.63275385379038e-05, + "loss": 1.0884, + "step": 28298 + }, + { + "epoch": 0.3537588439710993, + "grad_norm": 0.0006863918388262391, + "learning_rate": 1.63268627445719e-05, + "loss": 1.0609, + "step": 28300 + }, + { + "epoch": 0.3537838445961149, + "grad_norm": 2.676347255706787, + "learning_rate": 1.6326186903055504e-05, + "loss": 0.1874, + "step": 28302 + }, + { + "epoch": 0.35380884522113054, + "grad_norm": 2.6946537494659424, + "learning_rate": 1.6325511013359764e-05, + "loss": 2.1937, + "step": 28304 + }, + { + "epoch": 0.35383384584614613, + "grad_norm": 4.127182483673096, + "learning_rate": 1.632483507548983e-05, + "loss": 1.56, + "step": 28306 + }, + { + "epoch": 0.3538588464711618, + "grad_norm": 3.305605888366699, + "learning_rate": 1.6324159089450843e-05, + "loss": 1.1061, + "step": 28308 + }, + { + "epoch": 0.35388384709617743, + "grad_norm": 0.0010716876713559031, + "learning_rate": 1.632348305524796e-05, + "loss": 0.0637, + "step": 28310 + }, + { + "epoch": 0.353908847721193, + "grad_norm": 2.892742395401001, + "learning_rate": 1.6322806972886314e-05, + "loss": 1.0032, + "step": 28312 + }, + { + "epoch": 0.35393384834620867, + "grad_norm": 4.400352954864502, + "learning_rate": 1.632213084237107e-05, + "loss": 1.7924, + "step": 28314 + }, + { + "epoch": 0.35395884897122426, + "grad_norm": 3.7997725009918213, + "learning_rate": 1.6321454663707374e-05, + "loss": 1.9392, + "step": 28316 + }, + { + "epoch": 0.3539838495962399, + "grad_norm": 4.538825511932373, + "learning_rate": 1.632077843690037e-05, + "loss": 1.0846, + "step": 28318 + }, + { + "epoch": 0.35400885022125556, + "grad_norm": 1.4015666246414185, + "learning_rate": 1.6320102161955212e-05, + "loss": 0.2627, + "step": 28320 + }, + { + "epoch": 0.35403385084627115, + "grad_norm": 4.960493087768555, + "learning_rate": 1.631942583887705e-05, + "loss": 1.1481, + "step": 28322 + }, + { + "epoch": 0.3540588514712868, + "grad_norm": 1.5029141902923584, + "learning_rate": 1.6318749467671035e-05, + "loss": 0.088, + "step": 28324 + }, + { + "epoch": 0.3540838520963024, + "grad_norm": 2.6355786323547363, + "learning_rate": 1.6318073048342317e-05, + "loss": 0.564, + "step": 28326 + }, + { + "epoch": 0.35410885272131803, + "grad_norm": 5.018423557281494, + "learning_rate": 1.631739658089605e-05, + "loss": 0.9026, + "step": 28328 + }, + { + "epoch": 0.3541338533463337, + "grad_norm": 4.36644172668457, + "learning_rate": 1.6316720065337378e-05, + "loss": 0.5441, + "step": 28330 + }, + { + "epoch": 0.3541588539713493, + "grad_norm": 3.6473655700683594, + "learning_rate": 1.6316043501671464e-05, + "loss": 0.9497, + "step": 28332 + }, + { + "epoch": 0.3541838545963649, + "grad_norm": 4.82885217666626, + "learning_rate": 1.6315366889903453e-05, + "loss": 1.1796, + "step": 28334 + }, + { + "epoch": 0.3542088552213805, + "grad_norm": 7.186625957489014, + "learning_rate": 1.6314690230038506e-05, + "loss": 0.6964, + "step": 28336 + }, + { + "epoch": 0.35423385584639616, + "grad_norm": 1.3709754943847656, + "learning_rate": 1.6314013522081766e-05, + "loss": 0.7688, + "step": 28338 + }, + { + "epoch": 0.3542588564714118, + "grad_norm": 4.172861576080322, + "learning_rate": 1.6313336766038393e-05, + "loss": 1.6508, + "step": 28340 + }, + { + "epoch": 0.3542838570964274, + "grad_norm": 1.5821075439453125, + "learning_rate": 1.631265996191354e-05, + "loss": 0.5986, + "step": 28342 + }, + { + "epoch": 0.35430885772144305, + "grad_norm": 2.419886350631714, + "learning_rate": 1.6311983109712365e-05, + "loss": 1.0584, + "step": 28344 + }, + { + "epoch": 0.35433385834645864, + "grad_norm": 4.5289177894592285, + "learning_rate": 1.6311306209440017e-05, + "loss": 1.8937, + "step": 28346 + }, + { + "epoch": 0.3543588589714743, + "grad_norm": 2.8322489261627197, + "learning_rate": 1.6310629261101652e-05, + "loss": 1.464, + "step": 28348 + }, + { + "epoch": 0.35438385959648994, + "grad_norm": 3.6621363162994385, + "learning_rate": 1.6309952264702428e-05, + "loss": 0.5522, + "step": 28350 + }, + { + "epoch": 0.35440886022150553, + "grad_norm": 0.9166619777679443, + "learning_rate": 1.63092752202475e-05, + "loss": 0.2041, + "step": 28352 + }, + { + "epoch": 0.3544338608465212, + "grad_norm": 1.2918134927749634, + "learning_rate": 1.6308598127742024e-05, + "loss": 0.0769, + "step": 28354 + }, + { + "epoch": 0.35445886147153677, + "grad_norm": 3.7402727603912354, + "learning_rate": 1.630792098719116e-05, + "loss": 1.1224, + "step": 28356 + }, + { + "epoch": 0.3544838620965524, + "grad_norm": 2.773815631866455, + "learning_rate": 1.6307243798600057e-05, + "loss": 0.6742, + "step": 28358 + }, + { + "epoch": 0.35450886272156806, + "grad_norm": 2.328368902206421, + "learning_rate": 1.6306566561973876e-05, + "loss": 1.2372, + "step": 28360 + }, + { + "epoch": 0.35453386334658366, + "grad_norm": 4.406185150146484, + "learning_rate": 1.630588927731778e-05, + "loss": 1.153, + "step": 28362 + }, + { + "epoch": 0.3545588639715993, + "grad_norm": 6.774655342102051, + "learning_rate": 1.6305211944636924e-05, + "loss": 1.2305, + "step": 28364 + }, + { + "epoch": 0.3545838645966149, + "grad_norm": 4.264794826507568, + "learning_rate": 1.630453456393646e-05, + "loss": 1.5781, + "step": 28366 + }, + { + "epoch": 0.35460886522163054, + "grad_norm": 3.270206928253174, + "learning_rate": 1.6303857135221557e-05, + "loss": 1.291, + "step": 28368 + }, + { + "epoch": 0.3546338658466462, + "grad_norm": 0.025452349334955215, + "learning_rate": 1.6303179658497365e-05, + "loss": 0.9286, + "step": 28370 + }, + { + "epoch": 0.3546588664716618, + "grad_norm": 5.415526866912842, + "learning_rate": 1.6302502133769054e-05, + "loss": 1.6899, + "step": 28372 + }, + { + "epoch": 0.35468386709667743, + "grad_norm": 2.739572525024414, + "learning_rate": 1.6301824561041772e-05, + "loss": 1.1917, + "step": 28374 + }, + { + "epoch": 0.354708867721693, + "grad_norm": 2.4282186031341553, + "learning_rate": 1.6301146940320692e-05, + "loss": 1.3607, + "step": 28376 + }, + { + "epoch": 0.35473386834670867, + "grad_norm": 3.100961446762085, + "learning_rate": 1.630046927161096e-05, + "loss": 1.6881, + "step": 28378 + }, + { + "epoch": 0.3547588689717243, + "grad_norm": 2.172722101211548, + "learning_rate": 1.6299791554917753e-05, + "loss": 0.6222, + "step": 28380 + }, + { + "epoch": 0.3547838695967399, + "grad_norm": 0.08450216054916382, + "learning_rate": 1.629911379024622e-05, + "loss": 2.3099, + "step": 28382 + }, + { + "epoch": 0.35480887022175556, + "grad_norm": 3.2076773643493652, + "learning_rate": 1.6298435977601526e-05, + "loss": 1.2647, + "step": 28384 + }, + { + "epoch": 0.35483387084677115, + "grad_norm": 2.3570330142974854, + "learning_rate": 1.6297758116988837e-05, + "loss": 0.6093, + "step": 28386 + }, + { + "epoch": 0.3548588714717868, + "grad_norm": 2.8058183193206787, + "learning_rate": 1.6297080208413313e-05, + "loss": 1.0493, + "step": 28388 + }, + { + "epoch": 0.35488387209680244, + "grad_norm": 4.185992240905762, + "learning_rate": 1.629640225188011e-05, + "loss": 2.5206, + "step": 28390 + }, + { + "epoch": 0.35490887272181804, + "grad_norm": 5.980004787445068, + "learning_rate": 1.6295724247394407e-05, + "loss": 1.818, + "step": 28392 + }, + { + "epoch": 0.3549338733468337, + "grad_norm": 1.6498693227767944, + "learning_rate": 1.6295046194961355e-05, + "loss": 0.6802, + "step": 28394 + }, + { + "epoch": 0.3549588739718493, + "grad_norm": 5.988003730773926, + "learning_rate": 1.629436809458612e-05, + "loss": 0.9516, + "step": 28396 + }, + { + "epoch": 0.3549838745968649, + "grad_norm": 7.768298149108887, + "learning_rate": 1.6293689946273868e-05, + "loss": 0.8441, + "step": 28398 + }, + { + "epoch": 0.35500887522188057, + "grad_norm": 3.8257699012756348, + "learning_rate": 1.6293011750029767e-05, + "loss": 1.6607, + "step": 28400 + }, + { + "epoch": 0.35503387584689616, + "grad_norm": 5.926972389221191, + "learning_rate": 1.6292333505858976e-05, + "loss": 1.6234, + "step": 28402 + }, + { + "epoch": 0.3550588764719118, + "grad_norm": 1.5843409299850464, + "learning_rate": 1.629165521376666e-05, + "loss": 0.3808, + "step": 28404 + }, + { + "epoch": 0.3550838770969274, + "grad_norm": 2.301515579223633, + "learning_rate": 1.629097687375799e-05, + "loss": 1.2862, + "step": 28406 + }, + { + "epoch": 0.35510887772194305, + "grad_norm": 4.111472129821777, + "learning_rate": 1.6290298485838132e-05, + "loss": 1.343, + "step": 28408 + }, + { + "epoch": 0.3551338783469587, + "grad_norm": 0.029386183246970177, + "learning_rate": 1.6289620050012246e-05, + "loss": 0.3502, + "step": 28410 + }, + { + "epoch": 0.3551588789719743, + "grad_norm": 9.405861854553223, + "learning_rate": 1.6288941566285505e-05, + "loss": 0.8235, + "step": 28412 + }, + { + "epoch": 0.35518387959698994, + "grad_norm": 4.2123823165893555, + "learning_rate": 1.6288263034663075e-05, + "loss": 1.045, + "step": 28414 + }, + { + "epoch": 0.35520888022200553, + "grad_norm": 0.08067169040441513, + "learning_rate": 1.628758445515012e-05, + "loss": 1.4056, + "step": 28416 + }, + { + "epoch": 0.3552338808470212, + "grad_norm": 0.15993814170360565, + "learning_rate": 1.6286905827751816e-05, + "loss": 0.5507, + "step": 28418 + }, + { + "epoch": 0.3552588814720368, + "grad_norm": 2.5541889667510986, + "learning_rate": 1.628622715247332e-05, + "loss": 0.3663, + "step": 28420 + }, + { + "epoch": 0.3552838820970524, + "grad_norm": 0.2790912985801697, + "learning_rate": 1.6285548429319815e-05, + "loss": 0.8051, + "step": 28422 + }, + { + "epoch": 0.35530888272206806, + "grad_norm": 1.2185472249984741, + "learning_rate": 1.6284869658296455e-05, + "loss": 0.2801, + "step": 28424 + }, + { + "epoch": 0.35533388334708366, + "grad_norm": 5.477195739746094, + "learning_rate": 1.628419083940842e-05, + "loss": 0.4516, + "step": 28426 + }, + { + "epoch": 0.3553588839720993, + "grad_norm": 5.634146690368652, + "learning_rate": 1.6283511972660874e-05, + "loss": 1.4458, + "step": 28428 + }, + { + "epoch": 0.35538388459711495, + "grad_norm": 2.7504189014434814, + "learning_rate": 1.6282833058058992e-05, + "loss": 1.2434, + "step": 28430 + }, + { + "epoch": 0.35540888522213054, + "grad_norm": 0.758528470993042, + "learning_rate": 1.628215409560794e-05, + "loss": 0.1538, + "step": 28432 + }, + { + "epoch": 0.3554338858471462, + "grad_norm": 0.017175383865833282, + "learning_rate": 1.6281475085312896e-05, + "loss": 0.2296, + "step": 28434 + }, + { + "epoch": 0.3554588864721618, + "grad_norm": 1.8689395189285278, + "learning_rate": 1.6280796027179018e-05, + "loss": 0.1957, + "step": 28436 + }, + { + "epoch": 0.35548388709717743, + "grad_norm": 1.3302382230758667, + "learning_rate": 1.628011692121149e-05, + "loss": 0.18, + "step": 28438 + }, + { + "epoch": 0.3555088877221931, + "grad_norm": 4.095109939575195, + "learning_rate": 1.627943776741548e-05, + "loss": 1.2473, + "step": 28440 + }, + { + "epoch": 0.35553388834720867, + "grad_norm": 0.006181732285767794, + "learning_rate": 1.6278758565796158e-05, + "loss": 0.5608, + "step": 28442 + }, + { + "epoch": 0.3555588889722243, + "grad_norm": 4.412900447845459, + "learning_rate": 1.6278079316358696e-05, + "loss": 0.5887, + "step": 28444 + }, + { + "epoch": 0.3555838895972399, + "grad_norm": 4.040500640869141, + "learning_rate": 1.6277400019108275e-05, + "loss": 1.7594, + "step": 28446 + }, + { + "epoch": 0.35560889022225556, + "grad_norm": 4.34759521484375, + "learning_rate": 1.627672067405006e-05, + "loss": 1.6448, + "step": 28448 + }, + { + "epoch": 0.3556338908472712, + "grad_norm": 6.138049125671387, + "learning_rate": 1.627604128118923e-05, + "loss": 0.9441, + "step": 28450 + }, + { + "epoch": 0.3556588914722868, + "grad_norm": 3.0151379108428955, + "learning_rate": 1.6275361840530956e-05, + "loss": 1.3333, + "step": 28452 + }, + { + "epoch": 0.35568389209730245, + "grad_norm": 3.9911179542541504, + "learning_rate": 1.6274682352080414e-05, + "loss": 1.7104, + "step": 28454 + }, + { + "epoch": 0.35570889272231804, + "grad_norm": 2.807249069213867, + "learning_rate": 1.627400281584278e-05, + "loss": 0.6155, + "step": 28456 + }, + { + "epoch": 0.3557338933473337, + "grad_norm": 5.901269912719727, + "learning_rate": 1.6273323231823223e-05, + "loss": 1.3086, + "step": 28458 + }, + { + "epoch": 0.35575889397234933, + "grad_norm": 2.923379898071289, + "learning_rate": 1.627264360002693e-05, + "loss": 1.2118, + "step": 28460 + }, + { + "epoch": 0.3557838945973649, + "grad_norm": 2.8598904609680176, + "learning_rate": 1.6271963920459067e-05, + "loss": 0.6762, + "step": 28462 + }, + { + "epoch": 0.3558088952223806, + "grad_norm": 0.8914341330528259, + "learning_rate": 1.6271284193124814e-05, + "loss": 0.442, + "step": 28464 + }, + { + "epoch": 0.35583389584739616, + "grad_norm": 3.6436517238616943, + "learning_rate": 1.6270604418029344e-05, + "loss": 0.7777, + "step": 28466 + }, + { + "epoch": 0.3558588964724118, + "grad_norm": 2.7718453407287598, + "learning_rate": 1.626992459517784e-05, + "loss": 1.0045, + "step": 28468 + }, + { + "epoch": 0.35588389709742746, + "grad_norm": 4.087403297424316, + "learning_rate": 1.626924472457548e-05, + "loss": 1.2095, + "step": 28470 + }, + { + "epoch": 0.35590889772244305, + "grad_norm": 3.477893114089966, + "learning_rate": 1.6268564806227437e-05, + "loss": 0.8007, + "step": 28472 + }, + { + "epoch": 0.3559338983474587, + "grad_norm": 0.3399140536785126, + "learning_rate": 1.626788484013889e-05, + "loss": 0.1638, + "step": 28474 + }, + { + "epoch": 0.3559588989724743, + "grad_norm": 0.9433779120445251, + "learning_rate": 1.6267204826315016e-05, + "loss": 0.6408, + "step": 28476 + }, + { + "epoch": 0.35598389959748994, + "grad_norm": 5.311615467071533, + "learning_rate": 1.6266524764761e-05, + "loss": 1.2725, + "step": 28478 + }, + { + "epoch": 0.3560089002225056, + "grad_norm": 3.789191246032715, + "learning_rate": 1.6265844655482016e-05, + "loss": 0.3158, + "step": 28480 + }, + { + "epoch": 0.3560339008475212, + "grad_norm": 4.3775634765625, + "learning_rate": 1.6265164498483246e-05, + "loss": 1.3658, + "step": 28482 + }, + { + "epoch": 0.3560589014725368, + "grad_norm": 5.809157848358154, + "learning_rate": 1.626448429376987e-05, + "loss": 1.0997, + "step": 28484 + }, + { + "epoch": 0.3560839020975524, + "grad_norm": 2.1953608989715576, + "learning_rate": 1.6263804041347066e-05, + "loss": 1.0283, + "step": 28486 + }, + { + "epoch": 0.35610890272256807, + "grad_norm": 3.5242981910705566, + "learning_rate": 1.6263123741220016e-05, + "loss": 0.2971, + "step": 28488 + }, + { + "epoch": 0.3561339033475837, + "grad_norm": 3.557264566421509, + "learning_rate": 1.62624433933939e-05, + "loss": 1.3416, + "step": 28490 + }, + { + "epoch": 0.3561589039725993, + "grad_norm": 0.13408376276493073, + "learning_rate": 1.6261762997873903e-05, + "loss": 0.5252, + "step": 28492 + }, + { + "epoch": 0.35618390459761495, + "grad_norm": 2.480191469192505, + "learning_rate": 1.6261082554665204e-05, + "loss": 1.4648, + "step": 28494 + }, + { + "epoch": 0.35620890522263055, + "grad_norm": 10.201445579528809, + "learning_rate": 1.6260402063772985e-05, + "loss": 1.0517, + "step": 28496 + }, + { + "epoch": 0.3562339058476462, + "grad_norm": 2.745558023452759, + "learning_rate": 1.625972152520243e-05, + "loss": 0.6337, + "step": 28498 + }, + { + "epoch": 0.35625890647266184, + "grad_norm": 2.595209836959839, + "learning_rate": 1.6259040938958723e-05, + "loss": 0.5426, + "step": 28500 + }, + { + "epoch": 0.35628390709767743, + "grad_norm": 3.4185261726379395, + "learning_rate": 1.625836030504704e-05, + "loss": 1.416, + "step": 28502 + }, + { + "epoch": 0.3563089077226931, + "grad_norm": 4.008460521697998, + "learning_rate": 1.6257679623472575e-05, + "loss": 0.5411, + "step": 28504 + }, + { + "epoch": 0.35633390834770867, + "grad_norm": 0.2264113426208496, + "learning_rate": 1.6256998894240504e-05, + "loss": 0.1941, + "step": 28506 + }, + { + "epoch": 0.3563589089727243, + "grad_norm": 2.7158045768737793, + "learning_rate": 1.6256318117356016e-05, + "loss": 0.7589, + "step": 28508 + }, + { + "epoch": 0.35638390959773997, + "grad_norm": 2.642336368560791, + "learning_rate": 1.6255637292824293e-05, + "loss": 1.2995, + "step": 28510 + }, + { + "epoch": 0.35640891022275556, + "grad_norm": 7.19624662399292, + "learning_rate": 1.6254956420650523e-05, + "loss": 0.9136, + "step": 28512 + }, + { + "epoch": 0.3564339108477712, + "grad_norm": 5.856320858001709, + "learning_rate": 1.6254275500839888e-05, + "loss": 0.3862, + "step": 28514 + }, + { + "epoch": 0.3564589114727868, + "grad_norm": 3.8831660747528076, + "learning_rate": 1.6253594533397575e-05, + "loss": 1.2103, + "step": 28516 + }, + { + "epoch": 0.35648391209780245, + "grad_norm": 2.238842487335205, + "learning_rate": 1.6252913518328768e-05, + "loss": 0.8691, + "step": 28518 + }, + { + "epoch": 0.3565089127228181, + "grad_norm": 4.322253227233887, + "learning_rate": 1.6252232455638657e-05, + "loss": 0.7951, + "step": 28520 + }, + { + "epoch": 0.3565339133478337, + "grad_norm": 4.395102500915527, + "learning_rate": 1.625155134533243e-05, + "loss": 1.2248, + "step": 28522 + }, + { + "epoch": 0.35655891397284933, + "grad_norm": 2.35412335395813, + "learning_rate": 1.625087018741527e-05, + "loss": 1.4827, + "step": 28524 + }, + { + "epoch": 0.3565839145978649, + "grad_norm": 4.191435813903809, + "learning_rate": 1.625018898189237e-05, + "loss": 1.3718, + "step": 28526 + }, + { + "epoch": 0.3566089152228806, + "grad_norm": 0.005500511731952429, + "learning_rate": 1.6249507728768913e-05, + "loss": 0.0016, + "step": 28528 + }, + { + "epoch": 0.3566339158478962, + "grad_norm": 6.643413543701172, + "learning_rate": 1.6248826428050082e-05, + "loss": 0.611, + "step": 28530 + }, + { + "epoch": 0.3566589164729118, + "grad_norm": 6.291377067565918, + "learning_rate": 1.6248145079741085e-05, + "loss": 0.5529, + "step": 28532 + }, + { + "epoch": 0.35668391709792746, + "grad_norm": 0.6224892735481262, + "learning_rate": 1.624746368384709e-05, + "loss": 0.3525, + "step": 28534 + }, + { + "epoch": 0.35670891772294305, + "grad_norm": 3.9569687843322754, + "learning_rate": 1.6246782240373296e-05, + "loss": 1.1271, + "step": 28536 + }, + { + "epoch": 0.3567339183479587, + "grad_norm": 5.106566429138184, + "learning_rate": 1.6246100749324895e-05, + "loss": 1.5373, + "step": 28538 + }, + { + "epoch": 0.35675891897297435, + "grad_norm": 4.777742862701416, + "learning_rate": 1.6245419210707073e-05, + "loss": 0.3955, + "step": 28540 + }, + { + "epoch": 0.35678391959798994, + "grad_norm": 2.8216443061828613, + "learning_rate": 1.6244737624525017e-05, + "loss": 0.914, + "step": 28542 + }, + { + "epoch": 0.3568089202230056, + "grad_norm": 5.445387840270996, + "learning_rate": 1.6244055990783928e-05, + "loss": 1.2406, + "step": 28544 + }, + { + "epoch": 0.3568339208480212, + "grad_norm": 6.848621368408203, + "learning_rate": 1.624337430948899e-05, + "loss": 1.1799, + "step": 28546 + }, + { + "epoch": 0.3568589214730368, + "grad_norm": 3.2970380783081055, + "learning_rate": 1.6242692580645393e-05, + "loss": 1.222, + "step": 28548 + }, + { + "epoch": 0.3568839220980525, + "grad_norm": 0.3846873641014099, + "learning_rate": 1.6242010804258334e-05, + "loss": 1.4499, + "step": 28550 + }, + { + "epoch": 0.35690892272306807, + "grad_norm": 5.623931884765625, + "learning_rate": 1.6241328980333005e-05, + "loss": 1.1152, + "step": 28552 + }, + { + "epoch": 0.3569339233480837, + "grad_norm": 3.731677770614624, + "learning_rate": 1.6240647108874596e-05, + "loss": 0.5828, + "step": 28554 + }, + { + "epoch": 0.3569589239730993, + "grad_norm": 2.9389030933380127, + "learning_rate": 1.62399651898883e-05, + "loss": 1.4126, + "step": 28556 + }, + { + "epoch": 0.35698392459811495, + "grad_norm": 3.0637736320495605, + "learning_rate": 1.6239283223379314e-05, + "loss": 1.0963, + "step": 28558 + }, + { + "epoch": 0.3570089252231306, + "grad_norm": 0.05022182688117027, + "learning_rate": 1.623860120935283e-05, + "loss": 0.5976, + "step": 28560 + }, + { + "epoch": 0.3570339258481462, + "grad_norm": 6.430458068847656, + "learning_rate": 1.6237919147814037e-05, + "loss": 1.0752, + "step": 28562 + }, + { + "epoch": 0.35705892647316184, + "grad_norm": 0.006593166850507259, + "learning_rate": 1.6237237038768138e-05, + "loss": 0.2606, + "step": 28564 + }, + { + "epoch": 0.35708392709817743, + "grad_norm": 0.02260916493833065, + "learning_rate": 1.6236554882220323e-05, + "loss": 1.4995, + "step": 28566 + }, + { + "epoch": 0.3571089277231931, + "grad_norm": 0.08323509246110916, + "learning_rate": 1.6235872678175786e-05, + "loss": 0.8448, + "step": 28568 + }, + { + "epoch": 0.35713392834820873, + "grad_norm": 3.5875866413116455, + "learning_rate": 1.6235190426639724e-05, + "loss": 0.2548, + "step": 28570 + }, + { + "epoch": 0.3571589289732243, + "grad_norm": 4.045574188232422, + "learning_rate": 1.6234508127617335e-05, + "loss": 1.2161, + "step": 28572 + }, + { + "epoch": 0.35718392959823997, + "grad_norm": 5.4569172859191895, + "learning_rate": 1.6233825781113813e-05, + "loss": 1.4049, + "step": 28574 + }, + { + "epoch": 0.35720893022325556, + "grad_norm": 4.076409339904785, + "learning_rate": 1.623314338713436e-05, + "loss": 1.2942, + "step": 28576 + }, + { + "epoch": 0.3572339308482712, + "grad_norm": 5.696414470672607, + "learning_rate": 1.6232460945684162e-05, + "loss": 0.7202, + "step": 28578 + }, + { + "epoch": 0.35725893147328686, + "grad_norm": 0.347213476896286, + "learning_rate": 1.6231778456768425e-05, + "loss": 0.3487, + "step": 28580 + }, + { + "epoch": 0.35728393209830245, + "grad_norm": 3.580524206161499, + "learning_rate": 1.6231095920392347e-05, + "loss": 0.4014, + "step": 28582 + }, + { + "epoch": 0.3573089327233181, + "grad_norm": 4.039446830749512, + "learning_rate": 1.623041333656112e-05, + "loss": 1.5068, + "step": 28584 + }, + { + "epoch": 0.3573339333483337, + "grad_norm": 2.8701231479644775, + "learning_rate": 1.6229730705279947e-05, + "loss": 0.4039, + "step": 28586 + }, + { + "epoch": 0.35735893397334934, + "grad_norm": 4.416842937469482, + "learning_rate": 1.6229048026554027e-05, + "loss": 0.971, + "step": 28588 + }, + { + "epoch": 0.357383934598365, + "grad_norm": 0.02419707551598549, + "learning_rate": 1.622836530038856e-05, + "loss": 0.9429, + "step": 28590 + }, + { + "epoch": 0.3574089352233806, + "grad_norm": 2.3696014881134033, + "learning_rate": 1.622768252678874e-05, + "loss": 0.4289, + "step": 28592 + }, + { + "epoch": 0.3574339358483962, + "grad_norm": 0.7668911814689636, + "learning_rate": 1.622699970575977e-05, + "loss": 0.174, + "step": 28594 + }, + { + "epoch": 0.3574589364734118, + "grad_norm": 10.887603759765625, + "learning_rate": 1.6226316837306852e-05, + "loss": 0.8626, + "step": 28596 + }, + { + "epoch": 0.35748393709842746, + "grad_norm": 4.228444576263428, + "learning_rate": 1.622563392143519e-05, + "loss": 1.307, + "step": 28598 + }, + { + "epoch": 0.3575089377234431, + "grad_norm": 4.210491180419922, + "learning_rate": 1.6224950958149976e-05, + "loss": 0.8888, + "step": 28600 + }, + { + "epoch": 0.3575339383484587, + "grad_norm": 5.849670886993408, + "learning_rate": 1.6224267947456414e-05, + "loss": 1.0894, + "step": 28602 + }, + { + "epoch": 0.35755893897347435, + "grad_norm": 0.02740389294922352, + "learning_rate": 1.622358488935971e-05, + "loss": 0.573, + "step": 28604 + }, + { + "epoch": 0.35758393959848994, + "grad_norm": 5.6603522300720215, + "learning_rate": 1.6222901783865065e-05, + "loss": 0.6481, + "step": 28606 + }, + { + "epoch": 0.3576089402235056, + "grad_norm": 4.922563076019287, + "learning_rate": 1.6222218630977676e-05, + "loss": 1.0445, + "step": 28608 + }, + { + "epoch": 0.35763394084852124, + "grad_norm": 3.584833860397339, + "learning_rate": 1.6221535430702755e-05, + "loss": 1.349, + "step": 28610 + }, + { + "epoch": 0.35765894147353683, + "grad_norm": 2.092052459716797, + "learning_rate": 1.6220852183045498e-05, + "loss": 0.562, + "step": 28612 + }, + { + "epoch": 0.3576839420985525, + "grad_norm": 2.649404764175415, + "learning_rate": 1.6220168888011107e-05, + "loss": 0.7346, + "step": 28614 + }, + { + "epoch": 0.35770894272356807, + "grad_norm": 2.9432241916656494, + "learning_rate": 1.6219485545604792e-05, + "loss": 1.3086, + "step": 28616 + }, + { + "epoch": 0.3577339433485837, + "grad_norm": 2.7241084575653076, + "learning_rate": 1.6218802155831755e-05, + "loss": 0.6684, + "step": 28618 + }, + { + "epoch": 0.35775894397359936, + "grad_norm": 0.009147309698164463, + "learning_rate": 1.62181187186972e-05, + "loss": 1.646, + "step": 28620 + }, + { + "epoch": 0.35778394459861496, + "grad_norm": 5.572994709014893, + "learning_rate": 1.6217435234206332e-05, + "loss": 1.7625, + "step": 28622 + }, + { + "epoch": 0.3578089452236306, + "grad_norm": 4.609100818634033, + "learning_rate": 1.6216751702364358e-05, + "loss": 1.6161, + "step": 28624 + }, + { + "epoch": 0.3578339458486462, + "grad_norm": 8.338676452636719, + "learning_rate": 1.6216068123176478e-05, + "loss": 1.0426, + "step": 28626 + }, + { + "epoch": 0.35785894647366184, + "grad_norm": 2.8178648948669434, + "learning_rate": 1.6215384496647906e-05, + "loss": 0.7293, + "step": 28628 + }, + { + "epoch": 0.3578839470986775, + "grad_norm": 1.3636665344238281, + "learning_rate": 1.6214700822783843e-05, + "loss": 0.8829, + "step": 28630 + }, + { + "epoch": 0.3579089477236931, + "grad_norm": 3.515993595123291, + "learning_rate": 1.6214017101589497e-05, + "loss": 1.3288, + "step": 28632 + }, + { + "epoch": 0.35793394834870873, + "grad_norm": 3.4563350677490234, + "learning_rate": 1.621333333307008e-05, + "loss": 1.3701, + "step": 28634 + }, + { + "epoch": 0.3579589489737243, + "grad_norm": 0.02057523839175701, + "learning_rate": 1.6212649517230788e-05, + "loss": 0.8198, + "step": 28636 + }, + { + "epoch": 0.35798394959873997, + "grad_norm": 1.25254487991333, + "learning_rate": 1.621196565407684e-05, + "loss": 0.7633, + "step": 28638 + }, + { + "epoch": 0.3580089502237556, + "grad_norm": 2.8195977210998535, + "learning_rate": 1.621128174361344e-05, + "loss": 1.2232, + "step": 28640 + }, + { + "epoch": 0.3580339508487712, + "grad_norm": 3.0963075160980225, + "learning_rate": 1.6210597785845793e-05, + "loss": 1.3172, + "step": 28642 + }, + { + "epoch": 0.35805895147378686, + "grad_norm": 3.433262586593628, + "learning_rate": 1.6209913780779115e-05, + "loss": 1.7663, + "step": 28644 + }, + { + "epoch": 0.35808395209880245, + "grad_norm": 5.545505046844482, + "learning_rate": 1.620922972841861e-05, + "loss": 1.5641, + "step": 28646 + }, + { + "epoch": 0.3581089527238181, + "grad_norm": 2.498323440551758, + "learning_rate": 1.6208545628769493e-05, + "loss": 1.0659, + "step": 28648 + }, + { + "epoch": 0.35813395334883374, + "grad_norm": 5.770199775695801, + "learning_rate": 1.620786148183697e-05, + "loss": 2.1898, + "step": 28650 + }, + { + "epoch": 0.35815895397384934, + "grad_norm": 4.499716758728027, + "learning_rate": 1.620717728762625e-05, + "loss": 1.4958, + "step": 28652 + }, + { + "epoch": 0.358183954598865, + "grad_norm": 2.9566309452056885, + "learning_rate": 1.6206493046142542e-05, + "loss": 0.4814, + "step": 28654 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 3.0319900512695312, + "learning_rate": 1.6205808757391066e-05, + "loss": 1.2278, + "step": 28656 + }, + { + "epoch": 0.3582339558488962, + "grad_norm": 7.916463851928711, + "learning_rate": 1.6205124421377025e-05, + "loss": 0.2483, + "step": 28658 + }, + { + "epoch": 0.35825895647391187, + "grad_norm": 2.300778388977051, + "learning_rate": 1.6204440038105637e-05, + "loss": 0.9476, + "step": 28660 + }, + { + "epoch": 0.35828395709892746, + "grad_norm": 2.1816141605377197, + "learning_rate": 1.620375560758211e-05, + "loss": 0.3174, + "step": 28662 + }, + { + "epoch": 0.3583089577239431, + "grad_norm": 4.929166316986084, + "learning_rate": 1.6203071129811653e-05, + "loss": 1.844, + "step": 28664 + }, + { + "epoch": 0.3583339583489587, + "grad_norm": 2.048489570617676, + "learning_rate": 1.6202386604799486e-05, + "loss": 0.6462, + "step": 28666 + }, + { + "epoch": 0.35835895897397435, + "grad_norm": 3.099024534225464, + "learning_rate": 1.6201702032550822e-05, + "loss": 1.1161, + "step": 28668 + }, + { + "epoch": 0.35838395959899, + "grad_norm": 1.2895668745040894, + "learning_rate": 1.6201017413070873e-05, + "loss": 1.3676, + "step": 28670 + }, + { + "epoch": 0.3584089602240056, + "grad_norm": 1.2212491035461426, + "learning_rate": 1.620033274636485e-05, + "loss": 0.0811, + "step": 28672 + }, + { + "epoch": 0.35843396084902124, + "grad_norm": 2.9045138359069824, + "learning_rate": 1.6199648032437968e-05, + "loss": 0.6273, + "step": 28674 + }, + { + "epoch": 0.35845896147403683, + "grad_norm": 0.09596911072731018, + "learning_rate": 1.6198963271295448e-05, + "loss": 0.5597, + "step": 28676 + }, + { + "epoch": 0.3584839620990525, + "grad_norm": 0.5801238417625427, + "learning_rate": 1.6198278462942497e-05, + "loss": 0.0973, + "step": 28678 + }, + { + "epoch": 0.3585089627240681, + "grad_norm": 5.546363830566406, + "learning_rate": 1.619759360738433e-05, + "loss": 2.1509, + "step": 28680 + }, + { + "epoch": 0.3585339633490837, + "grad_norm": 3.3804714679718018, + "learning_rate": 1.6196908704626176e-05, + "loss": 0.0833, + "step": 28682 + }, + { + "epoch": 0.35855896397409937, + "grad_norm": 3.166010856628418, + "learning_rate": 1.6196223754673236e-05, + "loss": 1.3813, + "step": 28684 + }, + { + "epoch": 0.35858396459911496, + "grad_norm": 3.4208223819732666, + "learning_rate": 1.619553875753073e-05, + "loss": 0.9314, + "step": 28686 + }, + { + "epoch": 0.3586089652241306, + "grad_norm": 4.78906774520874, + "learning_rate": 1.619485371320388e-05, + "loss": 1.0366, + "step": 28688 + }, + { + "epoch": 0.35863396584914625, + "grad_norm": 0.20152755081653595, + "learning_rate": 1.61941686216979e-05, + "loss": 0.0035, + "step": 28690 + }, + { + "epoch": 0.35865896647416184, + "grad_norm": 3.566371440887451, + "learning_rate": 1.6193483483018007e-05, + "loss": 0.9581, + "step": 28692 + }, + { + "epoch": 0.3586839670991775, + "grad_norm": 0.06100558117032051, + "learning_rate": 1.619279829716942e-05, + "loss": 0.1255, + "step": 28694 + }, + { + "epoch": 0.3587089677241931, + "grad_norm": 3.5026564598083496, + "learning_rate": 1.619211306415736e-05, + "loss": 0.7594, + "step": 28696 + }, + { + "epoch": 0.35873396834920873, + "grad_norm": 7.914218902587891, + "learning_rate": 1.6191427783987038e-05, + "loss": 0.7519, + "step": 28698 + }, + { + "epoch": 0.3587589689742244, + "grad_norm": 4.954015254974365, + "learning_rate": 1.6190742456663677e-05, + "loss": 0.7738, + "step": 28700 + }, + { + "epoch": 0.35878396959923997, + "grad_norm": 6.370765209197998, + "learning_rate": 1.61900570821925e-05, + "loss": 1.2646, + "step": 28702 + }, + { + "epoch": 0.3588089702242556, + "grad_norm": 2.2292542457580566, + "learning_rate": 1.6189371660578724e-05, + "loss": 1.1997, + "step": 28704 + }, + { + "epoch": 0.3588339708492712, + "grad_norm": 0.00451117055490613, + "learning_rate": 1.6188686191827567e-05, + "loss": 0.0892, + "step": 28706 + }, + { + "epoch": 0.35885897147428686, + "grad_norm": 4.74111270904541, + "learning_rate": 1.618800067594425e-05, + "loss": 1.4575, + "step": 28708 + }, + { + "epoch": 0.3588839720993025, + "grad_norm": 1.9412353038787842, + "learning_rate": 1.6187315112933996e-05, + "loss": 0.7733, + "step": 28710 + }, + { + "epoch": 0.3589089727243181, + "grad_norm": 3.643723726272583, + "learning_rate": 1.6186629502802028e-05, + "loss": 1.9243, + "step": 28712 + }, + { + "epoch": 0.35893397334933375, + "grad_norm": 5.092223644256592, + "learning_rate": 1.618594384555356e-05, + "loss": 0.6843, + "step": 28714 + }, + { + "epoch": 0.35895897397434934, + "grad_norm": 2.181797742843628, + "learning_rate": 1.6185258141193824e-05, + "loss": 1.7933, + "step": 28716 + }, + { + "epoch": 0.358983974599365, + "grad_norm": 3.46618390083313, + "learning_rate": 1.618457238972803e-05, + "loss": 1.3549, + "step": 28718 + }, + { + "epoch": 0.35900897522438063, + "grad_norm": 2.4155895709991455, + "learning_rate": 1.618388659116141e-05, + "loss": 0.6106, + "step": 28720 + }, + { + "epoch": 0.3590339758493962, + "grad_norm": 0.027631472796201706, + "learning_rate": 1.6183200745499187e-05, + "loss": 0.6134, + "step": 28722 + }, + { + "epoch": 0.3590589764744119, + "grad_norm": 2.414940357208252, + "learning_rate": 1.6182514852746575e-05, + "loss": 0.948, + "step": 28724 + }, + { + "epoch": 0.35908397709942746, + "grad_norm": 3.3944618701934814, + "learning_rate": 1.618182891290881e-05, + "loss": 1.262, + "step": 28726 + }, + { + "epoch": 0.3591089777244431, + "grad_norm": 2.3075573444366455, + "learning_rate": 1.618114292599111e-05, + "loss": 1.1536, + "step": 28728 + }, + { + "epoch": 0.35913397834945876, + "grad_norm": 4.208751678466797, + "learning_rate": 1.6180456891998696e-05, + "loss": 0.7307, + "step": 28730 + }, + { + "epoch": 0.35915897897447435, + "grad_norm": 3.108344316482544, + "learning_rate": 1.6179770810936796e-05, + "loss": 0.3216, + "step": 28732 + }, + { + "epoch": 0.35918397959949, + "grad_norm": 2.465162754058838, + "learning_rate": 1.6179084682810642e-05, + "loss": 0.472, + "step": 28734 + }, + { + "epoch": 0.3592089802245056, + "grad_norm": 0.007739433087408543, + "learning_rate": 1.6178398507625445e-05, + "loss": 0.8234, + "step": 28736 + }, + { + "epoch": 0.35923398084952124, + "grad_norm": 4.808141231536865, + "learning_rate": 1.6177712285386446e-05, + "loss": 1.7725, + "step": 28738 + }, + { + "epoch": 0.3592589814745369, + "grad_norm": 2.9710092544555664, + "learning_rate": 1.6177026016098856e-05, + "loss": 0.912, + "step": 28740 + }, + { + "epoch": 0.3592839820995525, + "grad_norm": 0.7302516102790833, + "learning_rate": 1.6176339699767917e-05, + "loss": 0.0254, + "step": 28742 + }, + { + "epoch": 0.3593089827245681, + "grad_norm": 3.1588943004608154, + "learning_rate": 1.6175653336398843e-05, + "loss": 0.5533, + "step": 28744 + }, + { + "epoch": 0.3593339833495837, + "grad_norm": 4.798074245452881, + "learning_rate": 1.6174966925996866e-05, + "loss": 1.2702, + "step": 28746 + }, + { + "epoch": 0.35935898397459937, + "grad_norm": 4.380312919616699, + "learning_rate": 1.617428046856722e-05, + "loss": 1.7844, + "step": 28748 + }, + { + "epoch": 0.359383984599615, + "grad_norm": 4.064299583435059, + "learning_rate": 1.617359396411512e-05, + "loss": 1.602, + "step": 28750 + }, + { + "epoch": 0.3594089852246306, + "grad_norm": 4.922176361083984, + "learning_rate": 1.6172907412645807e-05, + "loss": 2.4974, + "step": 28752 + }, + { + "epoch": 0.35943398584964625, + "grad_norm": 1.393155574798584, + "learning_rate": 1.61722208141645e-05, + "loss": 0.4952, + "step": 28754 + }, + { + "epoch": 0.35945898647466185, + "grad_norm": 3.24395751953125, + "learning_rate": 1.6171534168676436e-05, + "loss": 1.2092, + "step": 28756 + }, + { + "epoch": 0.3594839870996775, + "grad_norm": 3.463041305541992, + "learning_rate": 1.617084747618684e-05, + "loss": 0.7234, + "step": 28758 + }, + { + "epoch": 0.35950898772469314, + "grad_norm": 3.0883843898773193, + "learning_rate": 1.6170160736700936e-05, + "loss": 1.5288, + "step": 28760 + }, + { + "epoch": 0.35953398834970873, + "grad_norm": 1.8097506761550903, + "learning_rate": 1.6169473950223967e-05, + "loss": 0.9056, + "step": 28762 + }, + { + "epoch": 0.3595589889747244, + "grad_norm": 2.930391311645508, + "learning_rate": 1.6168787116761152e-05, + "loss": 0.9606, + "step": 28764 + }, + { + "epoch": 0.35958398959974, + "grad_norm": 1.8910387754440308, + "learning_rate": 1.6168100236317732e-05, + "loss": 0.5872, + "step": 28766 + }, + { + "epoch": 0.3596089902247556, + "grad_norm": 3.4331395626068115, + "learning_rate": 1.6167413308898932e-05, + "loss": 1.8012, + "step": 28768 + }, + { + "epoch": 0.35963399084977127, + "grad_norm": 4.149116516113281, + "learning_rate": 1.616672633450998e-05, + "loss": 1.1193, + "step": 28770 + }, + { + "epoch": 0.35965899147478686, + "grad_norm": 3.552326202392578, + "learning_rate": 1.6166039313156118e-05, + "loss": 1.0149, + "step": 28772 + }, + { + "epoch": 0.3596839920998025, + "grad_norm": 11.369917869567871, + "learning_rate": 1.616535224484257e-05, + "loss": 0.0697, + "step": 28774 + }, + { + "epoch": 0.3597089927248181, + "grad_norm": 3.0306668281555176, + "learning_rate": 1.616466512957457e-05, + "loss": 0.5443, + "step": 28776 + }, + { + "epoch": 0.35973399334983375, + "grad_norm": 3.533576488494873, + "learning_rate": 1.6163977967357352e-05, + "loss": 1.217, + "step": 28778 + }, + { + "epoch": 0.3597589939748494, + "grad_norm": 4.74571418762207, + "learning_rate": 1.616329075819615e-05, + "loss": 1.4911, + "step": 28780 + }, + { + "epoch": 0.359783994599865, + "grad_norm": 3.8422675132751465, + "learning_rate": 1.6162603502096196e-05, + "loss": 0.9623, + "step": 28782 + }, + { + "epoch": 0.35980899522488063, + "grad_norm": 3.3863182067871094, + "learning_rate": 1.616191619906273e-05, + "loss": 0.7573, + "step": 28784 + }, + { + "epoch": 0.3598339958498962, + "grad_norm": 2.229782819747925, + "learning_rate": 1.6161228849100975e-05, + "loss": 0.9397, + "step": 28786 + }, + { + "epoch": 0.3598589964749119, + "grad_norm": 5.408048152923584, + "learning_rate": 1.6160541452216177e-05, + "loss": 1.497, + "step": 28788 + }, + { + "epoch": 0.3598839970999275, + "grad_norm": 3.607506036758423, + "learning_rate": 1.6159854008413563e-05, + "loss": 1.1432, + "step": 28790 + }, + { + "epoch": 0.3599089977249431, + "grad_norm": 4.222280502319336, + "learning_rate": 1.615916651769838e-05, + "loss": 0.8481, + "step": 28792 + }, + { + "epoch": 0.35993399834995876, + "grad_norm": 2.126131534576416, + "learning_rate": 1.6158478980075848e-05, + "loss": 0.0512, + "step": 28794 + }, + { + "epoch": 0.35995899897497435, + "grad_norm": 3.1480185985565186, + "learning_rate": 1.6157791395551212e-05, + "loss": 0.552, + "step": 28796 + }, + { + "epoch": 0.35998399959999, + "grad_norm": 1.5093833208084106, + "learning_rate": 1.6157103764129706e-05, + "loss": 0.4799, + "step": 28798 + }, + { + "epoch": 0.36000900022500565, + "grad_norm": 2.5064544677734375, + "learning_rate": 1.6156416085816574e-05, + "loss": 0.5139, + "step": 28800 + }, + { + "epoch": 0.36003400085002124, + "grad_norm": 2.351466655731201, + "learning_rate": 1.6155728360617045e-05, + "loss": 0.1589, + "step": 28802 + }, + { + "epoch": 0.3600590014750369, + "grad_norm": 2.588911533355713, + "learning_rate": 1.615504058853636e-05, + "loss": 0.4305, + "step": 28804 + }, + { + "epoch": 0.3600840021000525, + "grad_norm": 3.069087505340576, + "learning_rate": 1.6154352769579754e-05, + "loss": 2.3354, + "step": 28806 + }, + { + "epoch": 0.36010900272506813, + "grad_norm": 3.321106433868408, + "learning_rate": 1.6153664903752465e-05, + "loss": 1.6203, + "step": 28808 + }, + { + "epoch": 0.3601340033500838, + "grad_norm": 3.5403757095336914, + "learning_rate": 1.615297699105974e-05, + "loss": 1.585, + "step": 28810 + }, + { + "epoch": 0.36015900397509937, + "grad_norm": 3.397953987121582, + "learning_rate": 1.615228903150681e-05, + "loss": 1.8073, + "step": 28812 + }, + { + "epoch": 0.360184004600115, + "grad_norm": 4.338979244232178, + "learning_rate": 1.6151601025098917e-05, + "loss": 1.0985, + "step": 28814 + }, + { + "epoch": 0.3602090052251306, + "grad_norm": 2.866168737411499, + "learning_rate": 1.61509129718413e-05, + "loss": 0.5579, + "step": 28816 + }, + { + "epoch": 0.36023400585014625, + "grad_norm": 2.9507791996002197, + "learning_rate": 1.6150224871739203e-05, + "loss": 0.2863, + "step": 28818 + }, + { + "epoch": 0.3602590064751619, + "grad_norm": 3.3665292263031006, + "learning_rate": 1.614953672479786e-05, + "loss": 1.4669, + "step": 28820 + }, + { + "epoch": 0.3602840071001775, + "grad_norm": 2.074676752090454, + "learning_rate": 1.6148848531022517e-05, + "loss": 1.0606, + "step": 28822 + }, + { + "epoch": 0.36030900772519314, + "grad_norm": 2.5483977794647217, + "learning_rate": 1.614816029041841e-05, + "loss": 1.1042, + "step": 28824 + }, + { + "epoch": 0.36033400835020873, + "grad_norm": 3.31390118598938, + "learning_rate": 1.6147472002990786e-05, + "loss": 0.7277, + "step": 28826 + }, + { + "epoch": 0.3603590089752244, + "grad_norm": 3.9972565174102783, + "learning_rate": 1.6146783668744885e-05, + "loss": 0.5233, + "step": 28828 + }, + { + "epoch": 0.36038400960024003, + "grad_norm": 0.003101030830293894, + "learning_rate": 1.6146095287685946e-05, + "loss": 0.2654, + "step": 28830 + }, + { + "epoch": 0.3604090102252556, + "grad_norm": 3.2591259479522705, + "learning_rate": 1.6145406859819214e-05, + "loss": 1.157, + "step": 28832 + }, + { + "epoch": 0.36043401085027127, + "grad_norm": 3.2310497760772705, + "learning_rate": 1.6144718385149934e-05, + "loss": 0.9746, + "step": 28834 + }, + { + "epoch": 0.36045901147528686, + "grad_norm": 10.47032356262207, + "learning_rate": 1.6144029863683348e-05, + "loss": 1.3512, + "step": 28836 + }, + { + "epoch": 0.3604840121003025, + "grad_norm": 3.3018171787261963, + "learning_rate": 1.6143341295424703e-05, + "loss": 1.2021, + "step": 28838 + }, + { + "epoch": 0.36050901272531816, + "grad_norm": 3.6456735134124756, + "learning_rate": 1.614265268037923e-05, + "loss": 1.3832, + "step": 28840 + }, + { + "epoch": 0.36053401335033375, + "grad_norm": 3.816145658493042, + "learning_rate": 1.614196401855219e-05, + "loss": 1.7163, + "step": 28842 + }, + { + "epoch": 0.3605590139753494, + "grad_norm": 4.711363792419434, + "learning_rate": 1.614127530994882e-05, + "loss": 0.5962, + "step": 28844 + }, + { + "epoch": 0.360584014600365, + "grad_norm": 3.2934911251068115, + "learning_rate": 1.6140586554574366e-05, + "loss": 0.8598, + "step": 28846 + }, + { + "epoch": 0.36060901522538064, + "grad_norm": 2.1011924743652344, + "learning_rate": 1.613989775243407e-05, + "loss": 0.6717, + "step": 28848 + }, + { + "epoch": 0.3606340158503963, + "grad_norm": 6.195206165313721, + "learning_rate": 1.6139208903533187e-05, + "loss": 1.8174, + "step": 28850 + }, + { + "epoch": 0.3606590164754119, + "grad_norm": 0.2103242129087448, + "learning_rate": 1.613852000787695e-05, + "loss": 0.8248, + "step": 28852 + }, + { + "epoch": 0.3606840171004275, + "grad_norm": 0.26299360394477844, + "learning_rate": 1.6137831065470617e-05, + "loss": 0.2892, + "step": 28854 + }, + { + "epoch": 0.3607090177254431, + "grad_norm": 7.2656474113464355, + "learning_rate": 1.613714207631943e-05, + "loss": 1.4195, + "step": 28856 + }, + { + "epoch": 0.36073401835045876, + "grad_norm": 3.666234016418457, + "learning_rate": 1.6136453040428634e-05, + "loss": 1.2878, + "step": 28858 + }, + { + "epoch": 0.3607590189754744, + "grad_norm": 2.8003783226013184, + "learning_rate": 1.6135763957803482e-05, + "loss": 0.733, + "step": 28860 + }, + { + "epoch": 0.36078401960049, + "grad_norm": 0.045640747994184494, + "learning_rate": 1.6135074828449224e-05, + "loss": 0.0703, + "step": 28862 + }, + { + "epoch": 0.36080902022550565, + "grad_norm": 1.6143006086349487, + "learning_rate": 1.6134385652371093e-05, + "loss": 0.1348, + "step": 28864 + }, + { + "epoch": 0.36083402085052124, + "grad_norm": 3.6696267127990723, + "learning_rate": 1.613369642957436e-05, + "loss": 0.8584, + "step": 28866 + }, + { + "epoch": 0.3608590214755369, + "grad_norm": 0.036086149513721466, + "learning_rate": 1.6133007160064255e-05, + "loss": 0.7918, + "step": 28868 + }, + { + "epoch": 0.36088402210055254, + "grad_norm": 5.553411483764648, + "learning_rate": 1.613231784384603e-05, + "loss": 1.7908, + "step": 28870 + }, + { + "epoch": 0.36090902272556813, + "grad_norm": 4.030111789703369, + "learning_rate": 1.613162848092495e-05, + "loss": 1.5959, + "step": 28872 + }, + { + "epoch": 0.3609340233505838, + "grad_norm": 9.15664005279541, + "learning_rate": 1.613093907130625e-05, + "loss": 0.7222, + "step": 28874 + }, + { + "epoch": 0.36095902397559937, + "grad_norm": 1.9733808040618896, + "learning_rate": 1.6130249614995187e-05, + "loss": 0.3925, + "step": 28876 + }, + { + "epoch": 0.360984024600615, + "grad_norm": 2.767673969268799, + "learning_rate": 1.6129560111997006e-05, + "loss": 0.6649, + "step": 28878 + }, + { + "epoch": 0.36100902522563066, + "grad_norm": 3.7579710483551025, + "learning_rate": 1.6128870562316966e-05, + "loss": 1.557, + "step": 28880 + }, + { + "epoch": 0.36103402585064626, + "grad_norm": 8.13185977935791, + "learning_rate": 1.612818096596031e-05, + "loss": 1.7788, + "step": 28882 + }, + { + "epoch": 0.3610590264756619, + "grad_norm": 0.004495892208069563, + "learning_rate": 1.6127491322932297e-05, + "loss": 0.8302, + "step": 28884 + }, + { + "epoch": 0.3610840271006775, + "grad_norm": 10.521872520446777, + "learning_rate": 1.6126801633238178e-05, + "loss": 0.992, + "step": 28886 + }, + { + "epoch": 0.36110902772569314, + "grad_norm": 0.004594713449478149, + "learning_rate": 1.6126111896883203e-05, + "loss": 0.7534, + "step": 28888 + }, + { + "epoch": 0.3611340283507088, + "grad_norm": 4.730041980743408, + "learning_rate": 1.6125422113872623e-05, + "loss": 1.1287, + "step": 28890 + }, + { + "epoch": 0.3611590289757244, + "grad_norm": 3.0960261821746826, + "learning_rate": 1.6124732284211703e-05, + "loss": 0.9314, + "step": 28892 + }, + { + "epoch": 0.36118402960074003, + "grad_norm": 4.1271538734436035, + "learning_rate": 1.612404240790568e-05, + "loss": 1.5838, + "step": 28894 + }, + { + "epoch": 0.3612090302257556, + "grad_norm": 2.3577065467834473, + "learning_rate": 1.6123352484959814e-05, + "loss": 1.1613, + "step": 28896 + }, + { + "epoch": 0.36123403085077127, + "grad_norm": 3.4688684940338135, + "learning_rate": 1.6122662515379372e-05, + "loss": 0.3715, + "step": 28898 + }, + { + "epoch": 0.3612590314757869, + "grad_norm": 0.002906579989939928, + "learning_rate": 1.612197249916959e-05, + "loss": 0.1096, + "step": 28900 + }, + { + "epoch": 0.3612840321008025, + "grad_norm": 2.1174020767211914, + "learning_rate": 1.6121282436335733e-05, + "loss": 0.9252, + "step": 28902 + }, + { + "epoch": 0.36130903272581816, + "grad_norm": 4.763408184051514, + "learning_rate": 1.6120592326883056e-05, + "loss": 1.9015, + "step": 28904 + }, + { + "epoch": 0.36133403335083375, + "grad_norm": 2.610106945037842, + "learning_rate": 1.611990217081681e-05, + "loss": 0.4666, + "step": 28906 + }, + { + "epoch": 0.3613590339758494, + "grad_norm": 2.860687732696533, + "learning_rate": 1.6119211968142257e-05, + "loss": 1.4993, + "step": 28908 + }, + { + "epoch": 0.36138403460086505, + "grad_norm": 2.92073392868042, + "learning_rate": 1.611852171886465e-05, + "loss": 0.5273, + "step": 28910 + }, + { + "epoch": 0.36140903522588064, + "grad_norm": 5.161757469177246, + "learning_rate": 1.6117831422989248e-05, + "loss": 1.0303, + "step": 28912 + }, + { + "epoch": 0.3614340358508963, + "grad_norm": 2.553264856338501, + "learning_rate": 1.6117141080521307e-05, + "loss": 1.1821, + "step": 28914 + }, + { + "epoch": 0.3614590364759119, + "grad_norm": 1.7905436754226685, + "learning_rate": 1.611645069146608e-05, + "loss": 0.6143, + "step": 28916 + }, + { + "epoch": 0.3614840371009275, + "grad_norm": 3.838510751724243, + "learning_rate": 1.6115760255828836e-05, + "loss": 1.4626, + "step": 28918 + }, + { + "epoch": 0.36150903772594317, + "grad_norm": 3.732208251953125, + "learning_rate": 1.6115069773614824e-05, + "loss": 0.6842, + "step": 28920 + }, + { + "epoch": 0.36153403835095876, + "grad_norm": 4.866199970245361, + "learning_rate": 1.6114379244829302e-05, + "loss": 0.6939, + "step": 28922 + }, + { + "epoch": 0.3615590389759744, + "grad_norm": 11.453906059265137, + "learning_rate": 1.611368866947754e-05, + "loss": 0.3853, + "step": 28924 + }, + { + "epoch": 0.36158403960099, + "grad_norm": 2.6235697269439697, + "learning_rate": 1.6112998047564784e-05, + "loss": 1.2012, + "step": 28926 + }, + { + "epoch": 0.36160904022600565, + "grad_norm": 1.657292366027832, + "learning_rate": 1.61123073790963e-05, + "loss": 2.2833, + "step": 28928 + }, + { + "epoch": 0.3616340408510213, + "grad_norm": 3.483065128326416, + "learning_rate": 1.6111616664077348e-05, + "loss": 1.4297, + "step": 28930 + }, + { + "epoch": 0.3616590414760369, + "grad_norm": 3.567321300506592, + "learning_rate": 1.611092590251319e-05, + "loss": 0.4183, + "step": 28932 + }, + { + "epoch": 0.36168404210105254, + "grad_norm": 4.464304447174072, + "learning_rate": 1.6110235094409082e-05, + "loss": 1.1532, + "step": 28934 + }, + { + "epoch": 0.36170904272606813, + "grad_norm": 5.20028829574585, + "learning_rate": 1.6109544239770286e-05, + "loss": 1.4819, + "step": 28936 + }, + { + "epoch": 0.3617340433510838, + "grad_norm": 6.151208877563477, + "learning_rate": 1.610885333860207e-05, + "loss": 1.4416, + "step": 28938 + }, + { + "epoch": 0.3617590439760994, + "grad_norm": 1.8697154521942139, + "learning_rate": 1.610816239090969e-05, + "loss": 0.1472, + "step": 28940 + }, + { + "epoch": 0.361784044601115, + "grad_norm": 2.5518946647644043, + "learning_rate": 1.6107471396698403e-05, + "loss": 0.4711, + "step": 28942 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 3.2056586742401123, + "learning_rate": 1.6106780355973482e-05, + "loss": 1.0298, + "step": 28944 + }, + { + "epoch": 0.36183404585114626, + "grad_norm": 6.7672929763793945, + "learning_rate": 1.6106089268740184e-05, + "loss": 1.7704, + "step": 28946 + }, + { + "epoch": 0.3618590464761619, + "grad_norm": 13.01771354675293, + "learning_rate": 1.6105398135003775e-05, + "loss": 1.3674, + "step": 28948 + }, + { + "epoch": 0.36188404710117755, + "grad_norm": 3.580949068069458, + "learning_rate": 1.6104706954769516e-05, + "loss": 1.2203, + "step": 28950 + }, + { + "epoch": 0.36190904772619314, + "grad_norm": 4.940613269805908, + "learning_rate": 1.610401572804267e-05, + "loss": 1.4245, + "step": 28952 + }, + { + "epoch": 0.3619340483512088, + "grad_norm": 4.656303405761719, + "learning_rate": 1.6103324454828508e-05, + "loss": 1.8566, + "step": 28954 + }, + { + "epoch": 0.3619590489762244, + "grad_norm": 0.08245094865560532, + "learning_rate": 1.610263313513229e-05, + "loss": 0.796, + "step": 28956 + }, + { + "epoch": 0.36198404960124003, + "grad_norm": 5.384162425994873, + "learning_rate": 1.6101941768959276e-05, + "loss": 1.3501, + "step": 28958 + }, + { + "epoch": 0.3620090502262557, + "grad_norm": 1.5279016494750977, + "learning_rate": 1.610125035631474e-05, + "loss": 1.0318, + "step": 28960 + }, + { + "epoch": 0.36203405085127127, + "grad_norm": 4.97634220123291, + "learning_rate": 1.6100558897203947e-05, + "loss": 0.4354, + "step": 28962 + }, + { + "epoch": 0.3620590514762869, + "grad_norm": 4.300848484039307, + "learning_rate": 1.6099867391632152e-05, + "loss": 1.5027, + "step": 28964 + }, + { + "epoch": 0.3620840521013025, + "grad_norm": 1.4737889766693115, + "learning_rate": 1.6099175839604636e-05, + "loss": 1.0594, + "step": 28966 + }, + { + "epoch": 0.36210905272631816, + "grad_norm": 4.0113844871521, + "learning_rate": 1.609848424112666e-05, + "loss": 1.1363, + "step": 28968 + }, + { + "epoch": 0.3621340533513338, + "grad_norm": 2.0255420207977295, + "learning_rate": 1.6097792596203485e-05, + "loss": 0.9229, + "step": 28970 + }, + { + "epoch": 0.3621590539763494, + "grad_norm": 3.88834285736084, + "learning_rate": 1.6097100904840387e-05, + "loss": 1.0173, + "step": 28972 + }, + { + "epoch": 0.36218405460136505, + "grad_norm": 4.287393093109131, + "learning_rate": 1.609640916704263e-05, + "loss": 1.7693, + "step": 28974 + }, + { + "epoch": 0.36220905522638064, + "grad_norm": 3.1960489749908447, + "learning_rate": 1.6095717382815486e-05, + "loss": 0.2903, + "step": 28976 + }, + { + "epoch": 0.3622340558513963, + "grad_norm": 3.05533766746521, + "learning_rate": 1.609502555216422e-05, + "loss": 1.7956, + "step": 28978 + }, + { + "epoch": 0.36225905647641193, + "grad_norm": 3.0473456382751465, + "learning_rate": 1.6094333675094098e-05, + "loss": 1.6595, + "step": 28980 + }, + { + "epoch": 0.3622840571014275, + "grad_norm": 3.789478063583374, + "learning_rate": 1.6093641751610393e-05, + "loss": 1.3749, + "step": 28982 + }, + { + "epoch": 0.3623090577264432, + "grad_norm": 0.0023631779477000237, + "learning_rate": 1.6092949781718376e-05, + "loss": 0.0001, + "step": 28984 + }, + { + "epoch": 0.36233405835145877, + "grad_norm": 3.500290870666504, + "learning_rate": 1.6092257765423312e-05, + "loss": 0.9827, + "step": 28986 + }, + { + "epoch": 0.3623590589764744, + "grad_norm": 2.5646884441375732, + "learning_rate": 1.6091565702730478e-05, + "loss": 1.5454, + "step": 28988 + }, + { + "epoch": 0.36238405960149006, + "grad_norm": 4.7130537033081055, + "learning_rate": 1.609087359364514e-05, + "loss": 1.6344, + "step": 28990 + }, + { + "epoch": 0.36240906022650565, + "grad_norm": 0.002030817326158285, + "learning_rate": 1.6090181438172568e-05, + "loss": 0.8282, + "step": 28992 + }, + { + "epoch": 0.3624340608515213, + "grad_norm": 4.902876377105713, + "learning_rate": 1.6089489236318037e-05, + "loss": 0.8938, + "step": 28994 + }, + { + "epoch": 0.3624590614765369, + "grad_norm": 2.986661911010742, + "learning_rate": 1.6088796988086816e-05, + "loss": 0.7212, + "step": 28996 + }, + { + "epoch": 0.36248406210155254, + "grad_norm": 5.395079135894775, + "learning_rate": 1.608810469348418e-05, + "loss": 1.737, + "step": 28998 + }, + { + "epoch": 0.3625090627265682, + "grad_norm": 2.5629754066467285, + "learning_rate": 1.6087412352515398e-05, + "loss": 0.4093, + "step": 29000 + }, + { + "epoch": 0.3625340633515838, + "grad_norm": 0.9940172433853149, + "learning_rate": 1.6086719965185748e-05, + "loss": 0.4543, + "step": 29002 + }, + { + "epoch": 0.3625590639765994, + "grad_norm": 3.4599721431732178, + "learning_rate": 1.6086027531500495e-05, + "loss": 0.6925, + "step": 29004 + }, + { + "epoch": 0.362584064601615, + "grad_norm": 4.89987325668335, + "learning_rate": 1.6085335051464918e-05, + "loss": 1.6581, + "step": 29006 + }, + { + "epoch": 0.36260906522663067, + "grad_norm": 2.881443500518799, + "learning_rate": 1.608464252508429e-05, + "loss": 1.4836, + "step": 29008 + }, + { + "epoch": 0.3626340658516463, + "grad_norm": 1.437382698059082, + "learning_rate": 1.6083949952363887e-05, + "loss": 1.0574, + "step": 29010 + }, + { + "epoch": 0.3626590664766619, + "grad_norm": 8.963829040527344, + "learning_rate": 1.6083257333308977e-05, + "loss": 0.8346, + "step": 29012 + }, + { + "epoch": 0.36268406710167755, + "grad_norm": 2.9943177700042725, + "learning_rate": 1.608256466792484e-05, + "loss": 0.6433, + "step": 29014 + }, + { + "epoch": 0.36270906772669315, + "grad_norm": 1.3707125186920166, + "learning_rate": 1.6081871956216758e-05, + "loss": 0.0975, + "step": 29016 + }, + { + "epoch": 0.3627340683517088, + "grad_norm": 3.4524734020233154, + "learning_rate": 1.6081179198189992e-05, + "loss": 1.0006, + "step": 29018 + }, + { + "epoch": 0.36275906897672444, + "grad_norm": 0.008646929636597633, + "learning_rate": 1.6080486393849827e-05, + "loss": 0.0245, + "step": 29020 + }, + { + "epoch": 0.36278406960174003, + "grad_norm": 3.7620785236358643, + "learning_rate": 1.6079793543201537e-05, + "loss": 0.9701, + "step": 29022 + }, + { + "epoch": 0.3628090702267557, + "grad_norm": 3.04927921295166, + "learning_rate": 1.60791006462504e-05, + "loss": 1.0455, + "step": 29024 + }, + { + "epoch": 0.3628340708517713, + "grad_norm": 4.308865070343018, + "learning_rate": 1.6078407703001694e-05, + "loss": 1.0067, + "step": 29026 + }, + { + "epoch": 0.3628590714767869, + "grad_norm": 3.240056276321411, + "learning_rate": 1.607771471346069e-05, + "loss": 1.2409, + "step": 29028 + }, + { + "epoch": 0.36288407210180257, + "grad_norm": 2.3475425243377686, + "learning_rate": 1.6077021677632676e-05, + "loss": 0.4487, + "step": 29030 + }, + { + "epoch": 0.36290907272681816, + "grad_norm": 2.7702527046203613, + "learning_rate": 1.607632859552292e-05, + "loss": 1.2416, + "step": 29032 + }, + { + "epoch": 0.3629340733518338, + "grad_norm": 2.910712480545044, + "learning_rate": 1.6075635467136707e-05, + "loss": 0.8147, + "step": 29034 + }, + { + "epoch": 0.3629590739768494, + "grad_norm": 2.6950254440307617, + "learning_rate": 1.607494229247931e-05, + "loss": 0.466, + "step": 29036 + }, + { + "epoch": 0.36298407460186505, + "grad_norm": 1.3571683168411255, + "learning_rate": 1.6074249071556015e-05, + "loss": 0.7221, + "step": 29038 + }, + { + "epoch": 0.3630090752268807, + "grad_norm": 3.881540298461914, + "learning_rate": 1.6073555804372097e-05, + "loss": 1.1983, + "step": 29040 + }, + { + "epoch": 0.3630340758518963, + "grad_norm": 0.003779117949306965, + "learning_rate": 1.6072862490932838e-05, + "loss": 0.2655, + "step": 29042 + }, + { + "epoch": 0.36305907647691193, + "grad_norm": 3.6844890117645264, + "learning_rate": 1.6072169131243513e-05, + "loss": 0.6805, + "step": 29044 + }, + { + "epoch": 0.3630840771019275, + "grad_norm": 2.3768322467803955, + "learning_rate": 1.607147572530941e-05, + "loss": 0.8352, + "step": 29046 + }, + { + "epoch": 0.3631090777269432, + "grad_norm": 6.164806842803955, + "learning_rate": 1.6070782273135806e-05, + "loss": 0.2978, + "step": 29048 + }, + { + "epoch": 0.3631340783519588, + "grad_norm": 5.315096378326416, + "learning_rate": 1.607008877472798e-05, + "loss": 1.1308, + "step": 29050 + }, + { + "epoch": 0.3631590789769744, + "grad_norm": 4.000403881072998, + "learning_rate": 1.606939523009122e-05, + "loss": 0.8401, + "step": 29052 + }, + { + "epoch": 0.36318407960199006, + "grad_norm": 3.010051965713501, + "learning_rate": 1.6068701639230805e-05, + "loss": 1.2524, + "step": 29054 + }, + { + "epoch": 0.36320908022700565, + "grad_norm": 3.9479787349700928, + "learning_rate": 1.6068008002152014e-05, + "loss": 1.4845, + "step": 29056 + }, + { + "epoch": 0.3632340808520213, + "grad_norm": 0.0028285589069128036, + "learning_rate": 1.606731431886013e-05, + "loss": 0.2528, + "step": 29058 + }, + { + "epoch": 0.36325908147703695, + "grad_norm": 3.788156747817993, + "learning_rate": 1.6066620589360444e-05, + "loss": 1.0821, + "step": 29060 + }, + { + "epoch": 0.36328408210205254, + "grad_norm": 9.578390121459961, + "learning_rate": 1.606592681365823e-05, + "loss": 1.6463, + "step": 29062 + }, + { + "epoch": 0.3633090827270682, + "grad_norm": 3.2228479385375977, + "learning_rate": 1.6065232991758773e-05, + "loss": 0.8033, + "step": 29064 + }, + { + "epoch": 0.3633340833520838, + "grad_norm": 0.03223860636353493, + "learning_rate": 1.606453912366736e-05, + "loss": 0.7956, + "step": 29066 + }, + { + "epoch": 0.36335908397709943, + "grad_norm": 4.585761547088623, + "learning_rate": 1.606384520938928e-05, + "loss": 0.5273, + "step": 29068 + }, + { + "epoch": 0.3633840846021151, + "grad_norm": 1.405200481414795, + "learning_rate": 1.606315124892981e-05, + "loss": 0.8042, + "step": 29070 + }, + { + "epoch": 0.36340908522713067, + "grad_norm": 1.42315673828125, + "learning_rate": 1.6062457242294232e-05, + "loss": 0.0452, + "step": 29072 + }, + { + "epoch": 0.3634340858521463, + "grad_norm": 0.004502077121287584, + "learning_rate": 1.6061763189487845e-05, + "loss": 0.3961, + "step": 29074 + }, + { + "epoch": 0.3634590864771619, + "grad_norm": 2.3184382915496826, + "learning_rate": 1.606106909051592e-05, + "loss": 0.6173, + "step": 29076 + }, + { + "epoch": 0.36348408710217756, + "grad_norm": 4.613079071044922, + "learning_rate": 1.606037494538375e-05, + "loss": 1.4217, + "step": 29078 + }, + { + "epoch": 0.3635090877271932, + "grad_norm": 3.1593196392059326, + "learning_rate": 1.6059680754096628e-05, + "loss": 0.7384, + "step": 29080 + }, + { + "epoch": 0.3635340883522088, + "grad_norm": 5.457621097564697, + "learning_rate": 1.605898651665983e-05, + "loss": 1.9074, + "step": 29082 + }, + { + "epoch": 0.36355908897722444, + "grad_norm": 0.004308249801397324, + "learning_rate": 1.605829223307865e-05, + "loss": 0.5681, + "step": 29084 + }, + { + "epoch": 0.36358408960224003, + "grad_norm": 5.7088165283203125, + "learning_rate": 1.6057597903358367e-05, + "loss": 1.5173, + "step": 29086 + }, + { + "epoch": 0.3636090902272557, + "grad_norm": 0.004644345957785845, + "learning_rate": 1.6056903527504283e-05, + "loss": 0.6361, + "step": 29088 + }, + { + "epoch": 0.36363409085227133, + "grad_norm": 0.007918057031929493, + "learning_rate": 1.6056209105521672e-05, + "loss": 0.2291, + "step": 29090 + }, + { + "epoch": 0.3636590914772869, + "grad_norm": 1.864837646484375, + "learning_rate": 1.605551463741583e-05, + "loss": 0.8751, + "step": 29092 + }, + { + "epoch": 0.36368409210230257, + "grad_norm": 4.019345283508301, + "learning_rate": 1.6054820123192047e-05, + "loss": 0.8246, + "step": 29094 + }, + { + "epoch": 0.36370909272731816, + "grad_norm": 8.40980052947998, + "learning_rate": 1.605412556285561e-05, + "loss": 0.6761, + "step": 29096 + }, + { + "epoch": 0.3637340933523338, + "grad_norm": 2.6781606674194336, + "learning_rate": 1.605343095641181e-05, + "loss": 0.5366, + "step": 29098 + }, + { + "epoch": 0.36375909397734946, + "grad_norm": 3.4428305625915527, + "learning_rate": 1.6052736303865933e-05, + "loss": 1.4555, + "step": 29100 + }, + { + "epoch": 0.36378409460236505, + "grad_norm": 4.080727577209473, + "learning_rate": 1.6052041605223273e-05, + "loss": 1.1956, + "step": 29102 + }, + { + "epoch": 0.3638090952273807, + "grad_norm": 5.05393123626709, + "learning_rate": 1.605134686048912e-05, + "loss": 0.8471, + "step": 29104 + }, + { + "epoch": 0.3638340958523963, + "grad_norm": 0.09261934459209442, + "learning_rate": 1.6050652069668766e-05, + "loss": 0.0264, + "step": 29106 + }, + { + "epoch": 0.36385909647741194, + "grad_norm": 1.8957245349884033, + "learning_rate": 1.60499572327675e-05, + "loss": 0.6976, + "step": 29108 + }, + { + "epoch": 0.3638840971024276, + "grad_norm": 7.389183044433594, + "learning_rate": 1.6049262349790616e-05, + "loss": 0.5569, + "step": 29110 + }, + { + "epoch": 0.3639090977274432, + "grad_norm": 0.07493771612644196, + "learning_rate": 1.6048567420743407e-05, + "loss": 0.0044, + "step": 29112 + }, + { + "epoch": 0.3639340983524588, + "grad_norm": 2.33911395072937, + "learning_rate": 1.604787244563116e-05, + "loss": 0.96, + "step": 29114 + }, + { + "epoch": 0.3639590989774744, + "grad_norm": 1.0794578790664673, + "learning_rate": 1.6047177424459172e-05, + "loss": 0.8089, + "step": 29116 + }, + { + "epoch": 0.36398409960249006, + "grad_norm": 2.964754819869995, + "learning_rate": 1.604648235723274e-05, + "loss": 0.6502, + "step": 29118 + }, + { + "epoch": 0.3640091002275057, + "grad_norm": 0.4480469226837158, + "learning_rate": 1.604578724395715e-05, + "loss": 0.6858, + "step": 29120 + }, + { + "epoch": 0.3640341008525213, + "grad_norm": 0.048884883522987366, + "learning_rate": 1.6045092084637702e-05, + "loss": 0.5766, + "step": 29122 + }, + { + "epoch": 0.36405910147753695, + "grad_norm": 0.03611763194203377, + "learning_rate": 1.6044396879279685e-05, + "loss": 0.3375, + "step": 29124 + }, + { + "epoch": 0.36408410210255254, + "grad_norm": 3.7473032474517822, + "learning_rate": 1.6043701627888396e-05, + "loss": 0.4213, + "step": 29126 + }, + { + "epoch": 0.3641091027275682, + "grad_norm": 4.196491718292236, + "learning_rate": 1.604300633046913e-05, + "loss": 2.2483, + "step": 29128 + }, + { + "epoch": 0.36413410335258384, + "grad_norm": 5.033780097961426, + "learning_rate": 1.6042310987027186e-05, + "loss": 1.5368, + "step": 29130 + }, + { + "epoch": 0.36415910397759943, + "grad_norm": 7.226014614105225, + "learning_rate": 1.6041615597567853e-05, + "loss": 1.4472, + "step": 29132 + }, + { + "epoch": 0.3641841046026151, + "grad_norm": 1.8234319686889648, + "learning_rate": 1.6040920162096433e-05, + "loss": 0.9637, + "step": 29134 + }, + { + "epoch": 0.36420910522763067, + "grad_norm": 2.355320453643799, + "learning_rate": 1.6040224680618215e-05, + "loss": 1.0774, + "step": 29136 + }, + { + "epoch": 0.3642341058526463, + "grad_norm": 4.252763271331787, + "learning_rate": 1.6039529153138502e-05, + "loss": 1.3066, + "step": 29138 + }, + { + "epoch": 0.36425910647766196, + "grad_norm": 4.9688920974731445, + "learning_rate": 1.6038833579662586e-05, + "loss": 1.4365, + "step": 29140 + }, + { + "epoch": 0.36428410710267756, + "grad_norm": 2.9099225997924805, + "learning_rate": 1.6038137960195772e-05, + "loss": 0.6065, + "step": 29142 + }, + { + "epoch": 0.3643091077276932, + "grad_norm": 3.8898072242736816, + "learning_rate": 1.603744229474335e-05, + "loss": 1.5533, + "step": 29144 + }, + { + "epoch": 0.3643341083527088, + "grad_norm": 4.677189826965332, + "learning_rate": 1.6036746583310626e-05, + "loss": 1.468, + "step": 29146 + }, + { + "epoch": 0.36435910897772444, + "grad_norm": 0.017018428072333336, + "learning_rate": 1.6036050825902887e-05, + "loss": 0.5339, + "step": 29148 + }, + { + "epoch": 0.3643841096027401, + "grad_norm": 3.1052498817443848, + "learning_rate": 1.6035355022525445e-05, + "loss": 1.5598, + "step": 29150 + }, + { + "epoch": 0.3644091102277557, + "grad_norm": 3.3420774936676025, + "learning_rate": 1.603465917318359e-05, + "loss": 2.2831, + "step": 29152 + }, + { + "epoch": 0.36443411085277133, + "grad_norm": 3.3918306827545166, + "learning_rate": 1.6033963277882624e-05, + "loss": 2.4091, + "step": 29154 + }, + { + "epoch": 0.3644591114777869, + "grad_norm": 4.13126277923584, + "learning_rate": 1.6033267336627845e-05, + "loss": 0.5699, + "step": 29156 + }, + { + "epoch": 0.36448411210280257, + "grad_norm": 3.051316022872925, + "learning_rate": 1.6032571349424557e-05, + "loss": 0.4303, + "step": 29158 + }, + { + "epoch": 0.3645091127278182, + "grad_norm": 2.7825369834899902, + "learning_rate": 1.6031875316278057e-05, + "loss": 0.9273, + "step": 29160 + }, + { + "epoch": 0.3645341133528338, + "grad_norm": 1.4551877975463867, + "learning_rate": 1.603117923719365e-05, + "loss": 0.9778, + "step": 29162 + }, + { + "epoch": 0.36455911397784946, + "grad_norm": 0.008833288215100765, + "learning_rate": 1.6030483112176634e-05, + "loss": 0.4352, + "step": 29164 + }, + { + "epoch": 0.36458411460286505, + "grad_norm": 2.590813159942627, + "learning_rate": 1.6029786941232313e-05, + "loss": 0.9241, + "step": 29166 + }, + { + "epoch": 0.3646091152278807, + "grad_norm": 1.734621524810791, + "learning_rate": 1.6029090724365983e-05, + "loss": 1.1007, + "step": 29168 + }, + { + "epoch": 0.36463411585289635, + "grad_norm": 0.014321311376988888, + "learning_rate": 1.602839446158295e-05, + "loss": 0.4701, + "step": 29170 + }, + { + "epoch": 0.36465911647791194, + "grad_norm": 1.8006035089492798, + "learning_rate": 1.6027698152888526e-05, + "loss": 0.5048, + "step": 29172 + }, + { + "epoch": 0.3646841171029276, + "grad_norm": 0.014087690971791744, + "learning_rate": 1.6027001798287997e-05, + "loss": 0.0414, + "step": 29174 + }, + { + "epoch": 0.3647091177279432, + "grad_norm": 3.461965799331665, + "learning_rate": 1.6026305397786675e-05, + "loss": 1.2418, + "step": 29176 + }, + { + "epoch": 0.3647341183529588, + "grad_norm": 5.07861328125, + "learning_rate": 1.602560895138987e-05, + "loss": 0.6687, + "step": 29178 + }, + { + "epoch": 0.3647591189779745, + "grad_norm": 4.792228698730469, + "learning_rate": 1.6024912459102875e-05, + "loss": 1.7869, + "step": 29180 + }, + { + "epoch": 0.36478411960299006, + "grad_norm": 7.597724914550781, + "learning_rate": 1.6024215920930998e-05, + "loss": 1.1657, + "step": 29182 + }, + { + "epoch": 0.3648091202280057, + "grad_norm": 2.5896570682525635, + "learning_rate": 1.6023519336879544e-05, + "loss": 0.5563, + "step": 29184 + }, + { + "epoch": 0.3648341208530213, + "grad_norm": 0.14268112182617188, + "learning_rate": 1.602282270695382e-05, + "loss": 0.1604, + "step": 29186 + }, + { + "epoch": 0.36485912147803695, + "grad_norm": 5.119374752044678, + "learning_rate": 1.602212603115913e-05, + "loss": 0.6086, + "step": 29188 + }, + { + "epoch": 0.3648841221030526, + "grad_norm": 3.6446454524993896, + "learning_rate": 1.602142930950078e-05, + "loss": 0.79, + "step": 29190 + }, + { + "epoch": 0.3649091227280682, + "grad_norm": 9.808141708374023, + "learning_rate": 1.6020732541984075e-05, + "loss": 2.0272, + "step": 29192 + }, + { + "epoch": 0.36493412335308384, + "grad_norm": 2.7828149795532227, + "learning_rate": 1.6020035728614324e-05, + "loss": 0.8063, + "step": 29194 + }, + { + "epoch": 0.36495912397809943, + "grad_norm": 7.551031112670898, + "learning_rate": 1.601933886939683e-05, + "loss": 2.2791, + "step": 29196 + }, + { + "epoch": 0.3649841246031151, + "grad_norm": 3.780803680419922, + "learning_rate": 1.6018641964336902e-05, + "loss": 1.0492, + "step": 29198 + }, + { + "epoch": 0.3650091252281307, + "grad_norm": 6.961643695831299, + "learning_rate": 1.601794501343985e-05, + "loss": 2.0049, + "step": 29200 + }, + { + "epoch": 0.3650341258531463, + "grad_norm": 2.1499414443969727, + "learning_rate": 1.601724801671098e-05, + "loss": 0.8762, + "step": 29202 + }, + { + "epoch": 0.36505912647816197, + "grad_norm": 2.5684571266174316, + "learning_rate": 1.60165509741556e-05, + "loss": 1.1279, + "step": 29204 + }, + { + "epoch": 0.36508412710317756, + "grad_norm": 0.08636673539876938, + "learning_rate": 1.6015853885779014e-05, + "loss": 0.0014, + "step": 29206 + }, + { + "epoch": 0.3651091277281932, + "grad_norm": 3.4450674057006836, + "learning_rate": 1.601515675158654e-05, + "loss": 0.8422, + "step": 29208 + }, + { + "epoch": 0.36513412835320885, + "grad_norm": 0.4833666682243347, + "learning_rate": 1.6014459571583483e-05, + "loss": 1.2302, + "step": 29210 + }, + { + "epoch": 0.36515912897822445, + "grad_norm": 4.044803619384766, + "learning_rate": 1.601376234577515e-05, + "loss": 0.8393, + "step": 29212 + }, + { + "epoch": 0.3651841296032401, + "grad_norm": 0.5396847724914551, + "learning_rate": 1.6013065074166854e-05, + "loss": 0.7224, + "step": 29214 + }, + { + "epoch": 0.3652091302282557, + "grad_norm": 1.1070904731750488, + "learning_rate": 1.6012367756763903e-05, + "loss": 0.4574, + "step": 29216 + }, + { + "epoch": 0.36523413085327133, + "grad_norm": 3.207608938217163, + "learning_rate": 1.6011670393571615e-05, + "loss": 0.981, + "step": 29218 + }, + { + "epoch": 0.365259131478287, + "grad_norm": 3.3099379539489746, + "learning_rate": 1.6010972984595292e-05, + "loss": 0.7734, + "step": 29220 + }, + { + "epoch": 0.3652841321033026, + "grad_norm": 3.2261972427368164, + "learning_rate": 1.6010275529840247e-05, + "loss": 1.9425, + "step": 29222 + }, + { + "epoch": 0.3653091327283182, + "grad_norm": 0.014449574053287506, + "learning_rate": 1.6009578029311795e-05, + "loss": 0.2914, + "step": 29224 + }, + { + "epoch": 0.3653341333533338, + "grad_norm": 4.948655605316162, + "learning_rate": 1.600888048301525e-05, + "loss": 0.9476, + "step": 29226 + }, + { + "epoch": 0.36535913397834946, + "grad_norm": 0.007024707738310099, + "learning_rate": 1.6008182890955916e-05, + "loss": 0.0803, + "step": 29228 + }, + { + "epoch": 0.3653841346033651, + "grad_norm": 0.911193311214447, + "learning_rate": 1.6007485253139114e-05, + "loss": 0.0935, + "step": 29230 + }, + { + "epoch": 0.3654091352283807, + "grad_norm": 0.010458752512931824, + "learning_rate": 1.6006787569570153e-05, + "loss": 0.0003, + "step": 29232 + }, + { + "epoch": 0.36543413585339635, + "grad_norm": 2.9201672077178955, + "learning_rate": 1.6006089840254345e-05, + "loss": 1.3596, + "step": 29234 + }, + { + "epoch": 0.36545913647841194, + "grad_norm": 2.167858123779297, + "learning_rate": 1.600539206519701e-05, + "loss": 1.0622, + "step": 29236 + }, + { + "epoch": 0.3654841371034276, + "grad_norm": 3.7929418087005615, + "learning_rate": 1.6004694244403455e-05, + "loss": 1.0221, + "step": 29238 + }, + { + "epoch": 0.36550913772844323, + "grad_norm": 3.705775499343872, + "learning_rate": 1.6003996377879e-05, + "loss": 0.639, + "step": 29240 + }, + { + "epoch": 0.3655341383534588, + "grad_norm": 0.011726639233529568, + "learning_rate": 1.6003298465628957e-05, + "loss": 0.9932, + "step": 29242 + }, + { + "epoch": 0.3655591389784745, + "grad_norm": 7.586362361907959, + "learning_rate": 1.600260050765864e-05, + "loss": 1.2026, + "step": 29244 + }, + { + "epoch": 0.36558413960349007, + "grad_norm": 1.4639562368392944, + "learning_rate": 1.6001902503973367e-05, + "loss": 0.0954, + "step": 29246 + }, + { + "epoch": 0.3656091402285057, + "grad_norm": 7.203911781311035, + "learning_rate": 1.6001204454578455e-05, + "loss": 2.067, + "step": 29248 + }, + { + "epoch": 0.36563414085352136, + "grad_norm": 4.967589855194092, + "learning_rate": 1.600050635947922e-05, + "loss": 1.8777, + "step": 29250 + }, + { + "epoch": 0.36565914147853695, + "grad_norm": 5.506636619567871, + "learning_rate": 1.5999808218680973e-05, + "loss": 0.9612, + "step": 29252 + }, + { + "epoch": 0.3656841421035526, + "grad_norm": 1.4661370515823364, + "learning_rate": 1.5999110032189036e-05, + "loss": 0.3421, + "step": 29254 + }, + { + "epoch": 0.3657091427285682, + "grad_norm": 3.9510750770568848, + "learning_rate": 1.5998411800008727e-05, + "loss": 1.1834, + "step": 29256 + }, + { + "epoch": 0.36573414335358384, + "grad_norm": 2.6179358959198, + "learning_rate": 1.5997713522145363e-05, + "loss": 0.8068, + "step": 29258 + }, + { + "epoch": 0.3657591439785995, + "grad_norm": 4.332285404205322, + "learning_rate": 1.5997015198604256e-05, + "loss": 1.3164, + "step": 29260 + }, + { + "epoch": 0.3657841446036151, + "grad_norm": 0.693528950214386, + "learning_rate": 1.5996316829390733e-05, + "loss": 0.565, + "step": 29262 + }, + { + "epoch": 0.3658091452286307, + "grad_norm": 6.0103325843811035, + "learning_rate": 1.599561841451011e-05, + "loss": 0.9877, + "step": 29264 + }, + { + "epoch": 0.3658341458536463, + "grad_norm": 3.211667776107788, + "learning_rate": 1.5994919953967703e-05, + "loss": 1.2516, + "step": 29266 + }, + { + "epoch": 0.36585914647866197, + "grad_norm": 10.285841941833496, + "learning_rate": 1.599422144776883e-05, + "loss": 0.8258, + "step": 29268 + }, + { + "epoch": 0.3658841471036776, + "grad_norm": 4.118843078613281, + "learning_rate": 1.5993522895918822e-05, + "loss": 0.7314, + "step": 29270 + }, + { + "epoch": 0.3659091477286932, + "grad_norm": 0.6425942182540894, + "learning_rate": 1.5992824298422985e-05, + "loss": 1.0153, + "step": 29272 + }, + { + "epoch": 0.36593414835370885, + "grad_norm": 0.9014698266983032, + "learning_rate": 1.599212565528665e-05, + "loss": 0.9375, + "step": 29274 + }, + { + "epoch": 0.36595914897872445, + "grad_norm": 0.6460109353065491, + "learning_rate": 1.5991426966515132e-05, + "loss": 0.3587, + "step": 29276 + }, + { + "epoch": 0.3659841496037401, + "grad_norm": 2.007882833480835, + "learning_rate": 1.5990728232113752e-05, + "loss": 0.1065, + "step": 29278 + }, + { + "epoch": 0.36600915022875574, + "grad_norm": 2.0562870502471924, + "learning_rate": 1.599002945208783e-05, + "loss": 0.6293, + "step": 29280 + }, + { + "epoch": 0.36603415085377133, + "grad_norm": 0.00863604061305523, + "learning_rate": 1.5989330626442696e-05, + "loss": 0.0005, + "step": 29282 + }, + { + "epoch": 0.366059151478787, + "grad_norm": 2.515814781188965, + "learning_rate": 1.598863175518367e-05, + "loss": 0.1944, + "step": 29284 + }, + { + "epoch": 0.3660841521038026, + "grad_norm": 0.04578007385134697, + "learning_rate": 1.598793283831606e-05, + "loss": 0.3748, + "step": 29286 + }, + { + "epoch": 0.3661091527288182, + "grad_norm": 0.8951057195663452, + "learning_rate": 1.598723387584521e-05, + "loss": 0.1136, + "step": 29288 + }, + { + "epoch": 0.36613415335383387, + "grad_norm": 2.008394241333008, + "learning_rate": 1.598653486777643e-05, + "loss": 0.1383, + "step": 29290 + }, + { + "epoch": 0.36615915397884946, + "grad_norm": 4.800393104553223, + "learning_rate": 1.5985835814115046e-05, + "loss": 1.3009, + "step": 29292 + }, + { + "epoch": 0.3661841546038651, + "grad_norm": 0.6666750907897949, + "learning_rate": 1.5985136714866384e-05, + "loss": 0.706, + "step": 29294 + }, + { + "epoch": 0.3662091552288807, + "grad_norm": 0.023814402520656586, + "learning_rate": 1.598443757003577e-05, + "loss": 0.0005, + "step": 29296 + }, + { + "epoch": 0.36623415585389635, + "grad_norm": 2.668283224105835, + "learning_rate": 1.5983738379628523e-05, + "loss": 0.2114, + "step": 29298 + }, + { + "epoch": 0.366259156478912, + "grad_norm": 3.5869264602661133, + "learning_rate": 1.598303914364997e-05, + "loss": 0.5321, + "step": 29300 + }, + { + "epoch": 0.3662841571039276, + "grad_norm": 1.9342296123504639, + "learning_rate": 1.598233986210544e-05, + "loss": 0.5489, + "step": 29302 + }, + { + "epoch": 0.36630915772894324, + "grad_norm": 24.160512924194336, + "learning_rate": 1.5981640535000252e-05, + "loss": 0.9751, + "step": 29304 + }, + { + "epoch": 0.3663341583539588, + "grad_norm": 3.080655574798584, + "learning_rate": 1.5980941162339736e-05, + "loss": 0.8287, + "step": 29306 + }, + { + "epoch": 0.3663591589789745, + "grad_norm": 1.5323312282562256, + "learning_rate": 1.598024174412922e-05, + "loss": 1.0283, + "step": 29308 + }, + { + "epoch": 0.3663841596039901, + "grad_norm": 2.0934834480285645, + "learning_rate": 1.5979542280374028e-05, + "loss": 0.3263, + "step": 29310 + }, + { + "epoch": 0.3664091602290057, + "grad_norm": 8.823494911193848, + "learning_rate": 1.5978842771079484e-05, + "loss": 1.2166, + "step": 29312 + }, + { + "epoch": 0.36643416085402136, + "grad_norm": 1.8760210275650024, + "learning_rate": 1.5978143216250923e-05, + "loss": 0.9446, + "step": 29314 + }, + { + "epoch": 0.36645916147903695, + "grad_norm": 3.926023483276367, + "learning_rate": 1.597744361589367e-05, + "loss": 1.37, + "step": 29316 + }, + { + "epoch": 0.3664841621040526, + "grad_norm": 5.9294633865356445, + "learning_rate": 1.5976743970013048e-05, + "loss": 0.1277, + "step": 29318 + }, + { + "epoch": 0.36650916272906825, + "grad_norm": 0.024590572342276573, + "learning_rate": 1.597604427861439e-05, + "loss": 0.8306, + "step": 29320 + }, + { + "epoch": 0.36653416335408384, + "grad_norm": 3.9208266735076904, + "learning_rate": 1.5975344541703024e-05, + "loss": 0.8875, + "step": 29322 + }, + { + "epoch": 0.3665591639790995, + "grad_norm": 5.436573505401611, + "learning_rate": 1.597464475928428e-05, + "loss": 0.3246, + "step": 29324 + }, + { + "epoch": 0.3665841646041151, + "grad_norm": 1.3202277421951294, + "learning_rate": 1.5973944931363483e-05, + "loss": 0.0821, + "step": 29326 + }, + { + "epoch": 0.36660916522913073, + "grad_norm": 11.816706657409668, + "learning_rate": 1.5973245057945972e-05, + "loss": 0.5917, + "step": 29328 + }, + { + "epoch": 0.3666341658541464, + "grad_norm": 1.1239198446273804, + "learning_rate": 1.5972545139037066e-05, + "loss": 0.1528, + "step": 29330 + }, + { + "epoch": 0.36665916647916197, + "grad_norm": 0.4201180338859558, + "learning_rate": 1.5971845174642105e-05, + "loss": 0.7092, + "step": 29332 + }, + { + "epoch": 0.3666841671041776, + "grad_norm": 3.106462240219116, + "learning_rate": 1.5971145164766414e-05, + "loss": 1.3493, + "step": 29334 + }, + { + "epoch": 0.3667091677291932, + "grad_norm": 2.254387855529785, + "learning_rate": 1.5970445109415324e-05, + "loss": 0.0741, + "step": 29336 + }, + { + "epoch": 0.36673416835420886, + "grad_norm": 4.70330286026001, + "learning_rate": 1.596974500859417e-05, + "loss": 1.5943, + "step": 29338 + }, + { + "epoch": 0.3667591689792245, + "grad_norm": 3.0164568424224854, + "learning_rate": 1.596904486230828e-05, + "loss": 1.0444, + "step": 29340 + }, + { + "epoch": 0.3667841696042401, + "grad_norm": 0.26158446073532104, + "learning_rate": 1.5968344670562995e-05, + "loss": 0.419, + "step": 29342 + }, + { + "epoch": 0.36680917022925574, + "grad_norm": 2.0082218647003174, + "learning_rate": 1.5967644433363635e-05, + "loss": 0.8261, + "step": 29344 + }, + { + "epoch": 0.36683417085427134, + "grad_norm": 4.5203022956848145, + "learning_rate": 1.596694415071554e-05, + "loss": 1.202, + "step": 29346 + }, + { + "epoch": 0.366859171479287, + "grad_norm": 4.545638084411621, + "learning_rate": 1.5966243822624044e-05, + "loss": 2.4345, + "step": 29348 + }, + { + "epoch": 0.36688417210430263, + "grad_norm": 3.936439275741577, + "learning_rate": 1.5965543449094476e-05, + "loss": 0.7921, + "step": 29350 + }, + { + "epoch": 0.3669091727293182, + "grad_norm": 3.077751398086548, + "learning_rate": 1.5964843030132175e-05, + "loss": 0.6353, + "step": 29352 + }, + { + "epoch": 0.36693417335433387, + "grad_norm": 5.522324562072754, + "learning_rate": 1.596414256574247e-05, + "loss": 1.2348, + "step": 29354 + }, + { + "epoch": 0.36695917397934946, + "grad_norm": 3.680943012237549, + "learning_rate": 1.59634420559307e-05, + "loss": 0.6215, + "step": 29356 + }, + { + "epoch": 0.3669841746043651, + "grad_norm": 3.4666967391967773, + "learning_rate": 1.59627415007022e-05, + "loss": 1.4531, + "step": 29358 + }, + { + "epoch": 0.36700917522938076, + "grad_norm": 5.755508899688721, + "learning_rate": 1.5962040900062302e-05, + "loss": 0.6725, + "step": 29360 + }, + { + "epoch": 0.36703417585439635, + "grad_norm": 3.1031312942504883, + "learning_rate": 1.5961340254016343e-05, + "loss": 0.5528, + "step": 29362 + }, + { + "epoch": 0.367059176479412, + "grad_norm": 2.4011282920837402, + "learning_rate": 1.5960639562569666e-05, + "loss": 0.4899, + "step": 29364 + }, + { + "epoch": 0.3670841771044276, + "grad_norm": 3.384753704071045, + "learning_rate": 1.5959938825727595e-05, + "loss": 1.683, + "step": 29366 + }, + { + "epoch": 0.36710917772944324, + "grad_norm": 0.5887787938117981, + "learning_rate": 1.5959238043495472e-05, + "loss": 0.8765, + "step": 29368 + }, + { + "epoch": 0.3671341783544589, + "grad_norm": 0.14235875010490417, + "learning_rate": 1.5958537215878636e-05, + "loss": 1.2135, + "step": 29370 + }, + { + "epoch": 0.3671591789794745, + "grad_norm": 3.562666416168213, + "learning_rate": 1.5957836342882424e-05, + "loss": 0.7155, + "step": 29372 + }, + { + "epoch": 0.3671841796044901, + "grad_norm": 3.9782700538635254, + "learning_rate": 1.5957135424512177e-05, + "loss": 1.087, + "step": 29374 + }, + { + "epoch": 0.3672091802295057, + "grad_norm": 0.0068265339359641075, + "learning_rate": 1.5956434460773223e-05, + "loss": 0.5897, + "step": 29376 + }, + { + "epoch": 0.36723418085452136, + "grad_norm": 4.082326412200928, + "learning_rate": 1.595573345167091e-05, + "loss": 1.1408, + "step": 29378 + }, + { + "epoch": 0.367259181479537, + "grad_norm": 7.625278472900391, + "learning_rate": 1.595503239721057e-05, + "loss": 0.6614, + "step": 29380 + }, + { + "epoch": 0.3672841821045526, + "grad_norm": 0.5724866986274719, + "learning_rate": 1.5954331297397546e-05, + "loss": 0.8928, + "step": 29382 + }, + { + "epoch": 0.36730918272956825, + "grad_norm": 1.5174957513809204, + "learning_rate": 1.595363015223718e-05, + "loss": 0.0958, + "step": 29384 + }, + { + "epoch": 0.36733418335458384, + "grad_norm": 1.3928965330123901, + "learning_rate": 1.5952928961734806e-05, + "loss": 1.6382, + "step": 29386 + }, + { + "epoch": 0.3673591839795995, + "grad_norm": 3.303795576095581, + "learning_rate": 1.5952227725895767e-05, + "loss": 2.0666, + "step": 29388 + }, + { + "epoch": 0.36738418460461514, + "grad_norm": 3.2711963653564453, + "learning_rate": 1.5951526444725405e-05, + "loss": 0.3129, + "step": 29390 + }, + { + "epoch": 0.36740918522963073, + "grad_norm": 0.789703369140625, + "learning_rate": 1.5950825118229057e-05, + "loss": 0.5701, + "step": 29392 + }, + { + "epoch": 0.3674341858546464, + "grad_norm": 2.5886149406433105, + "learning_rate": 1.595012374641207e-05, + "loss": 0.7361, + "step": 29394 + }, + { + "epoch": 0.36745918647966197, + "grad_norm": 0.8492627739906311, + "learning_rate": 1.5949422329279776e-05, + "loss": 0.5674, + "step": 29396 + }, + { + "epoch": 0.3674841871046776, + "grad_norm": 10.655865669250488, + "learning_rate": 1.594872086683753e-05, + "loss": 1.7308, + "step": 29398 + }, + { + "epoch": 0.36750918772969327, + "grad_norm": 5.486154079437256, + "learning_rate": 1.5948019359090662e-05, + "loss": 0.7421, + "step": 29400 + }, + { + "epoch": 0.36753418835470886, + "grad_norm": 5.715700626373291, + "learning_rate": 1.594731780604452e-05, + "loss": 1.4492, + "step": 29402 + }, + { + "epoch": 0.3675591889797245, + "grad_norm": 2.370206832885742, + "learning_rate": 1.5946616207704447e-05, + "loss": 1.8144, + "step": 29404 + }, + { + "epoch": 0.3675841896047401, + "grad_norm": 6.906230449676514, + "learning_rate": 1.5945914564075788e-05, + "loss": 1.8188, + "step": 29406 + }, + { + "epoch": 0.36760919022975574, + "grad_norm": 5.684041976928711, + "learning_rate": 1.594521287516388e-05, + "loss": 1.1191, + "step": 29408 + }, + { + "epoch": 0.3676341908547714, + "grad_norm": 1.9481784105300903, + "learning_rate": 1.5944511140974075e-05, + "loss": 1.3778, + "step": 29410 + }, + { + "epoch": 0.367659191479787, + "grad_norm": 3.5463643074035645, + "learning_rate": 1.5943809361511716e-05, + "loss": 0.8814, + "step": 29412 + }, + { + "epoch": 0.36768419210480263, + "grad_norm": 0.006312818266451359, + "learning_rate": 1.5943107536782142e-05, + "loss": 0.0082, + "step": 29414 + }, + { + "epoch": 0.3677091927298182, + "grad_norm": 3.8054957389831543, + "learning_rate": 1.59424056667907e-05, + "loss": 1.526, + "step": 29416 + }, + { + "epoch": 0.36773419335483387, + "grad_norm": 1.4409488439559937, + "learning_rate": 1.594170375154274e-05, + "loss": 0.0208, + "step": 29418 + }, + { + "epoch": 0.3677591939798495, + "grad_norm": 4.807865142822266, + "learning_rate": 1.5941001791043603e-05, + "loss": 2.2098, + "step": 29420 + }, + { + "epoch": 0.3677841946048651, + "grad_norm": 5.810028076171875, + "learning_rate": 1.594029978529864e-05, + "loss": 1.2064, + "step": 29422 + }, + { + "epoch": 0.36780919522988076, + "grad_norm": 0.7066434025764465, + "learning_rate": 1.593959773431319e-05, + "loss": 0.038, + "step": 29424 + }, + { + "epoch": 0.36783419585489635, + "grad_norm": 2.8544270992279053, + "learning_rate": 1.5938895638092603e-05, + "loss": 1.8774, + "step": 29426 + }, + { + "epoch": 0.367859196479912, + "grad_norm": 3.587674140930176, + "learning_rate": 1.593819349664223e-05, + "loss": 1.3882, + "step": 29428 + }, + { + "epoch": 0.36788419710492765, + "grad_norm": 3.772179365158081, + "learning_rate": 1.593749130996741e-05, + "loss": 1.0489, + "step": 29430 + }, + { + "epoch": 0.36790919772994324, + "grad_norm": 4.027314186096191, + "learning_rate": 1.59367890780735e-05, + "loss": 1.8398, + "step": 29432 + }, + { + "epoch": 0.3679341983549589, + "grad_norm": 2.3995249271392822, + "learning_rate": 1.593608680096584e-05, + "loss": 0.4905, + "step": 29434 + }, + { + "epoch": 0.3679591989799745, + "grad_norm": 2.424272298812866, + "learning_rate": 1.593538447864979e-05, + "loss": 1.2667, + "step": 29436 + }, + { + "epoch": 0.3679841996049901, + "grad_norm": 0.028495702892541885, + "learning_rate": 1.5934682111130685e-05, + "loss": 0.5545, + "step": 29438 + }, + { + "epoch": 0.3680092002300058, + "grad_norm": 5.855260372161865, + "learning_rate": 1.593397969841388e-05, + "loss": 1.2207, + "step": 29440 + }, + { + "epoch": 0.36803420085502137, + "grad_norm": 2.025049924850464, + "learning_rate": 1.5933277240504728e-05, + "loss": 0.4994, + "step": 29442 + }, + { + "epoch": 0.368059201480037, + "grad_norm": 4.676986217498779, + "learning_rate": 1.5932574737408574e-05, + "loss": 1.2015, + "step": 29444 + }, + { + "epoch": 0.3680842021050526, + "grad_norm": 2.925706386566162, + "learning_rate": 1.593187218913077e-05, + "loss": 0.8228, + "step": 29446 + }, + { + "epoch": 0.36810920273006825, + "grad_norm": 3.002747058868408, + "learning_rate": 1.5931169595676665e-05, + "loss": 0.4573, + "step": 29448 + }, + { + "epoch": 0.3681342033550839, + "grad_norm": 3.0251657962799072, + "learning_rate": 1.5930466957051614e-05, + "loss": 1.0671, + "step": 29450 + }, + { + "epoch": 0.3681592039800995, + "grad_norm": 3.4613146781921387, + "learning_rate": 1.5929764273260963e-05, + "loss": 0.8636, + "step": 29452 + }, + { + "epoch": 0.36818420460511514, + "grad_norm": 3.5108704566955566, + "learning_rate": 1.5929061544310067e-05, + "loss": 0.9999, + "step": 29454 + }, + { + "epoch": 0.36820920523013073, + "grad_norm": 0.1940114051103592, + "learning_rate": 1.5928358770204274e-05, + "loss": 0.6953, + "step": 29456 + }, + { + "epoch": 0.3682342058551464, + "grad_norm": 5.4672651290893555, + "learning_rate": 1.592765595094894e-05, + "loss": 1.7141, + "step": 29458 + }, + { + "epoch": 0.368259206480162, + "grad_norm": 3.3811285495758057, + "learning_rate": 1.592695308654942e-05, + "loss": 1.9304, + "step": 29460 + }, + { + "epoch": 0.3682842071051776, + "grad_norm": 4.846039772033691, + "learning_rate": 1.5926250177011062e-05, + "loss": 1.0113, + "step": 29462 + }, + { + "epoch": 0.36830920773019327, + "grad_norm": 0.016465332359075546, + "learning_rate": 1.5925547222339214e-05, + "loss": 0.7263, + "step": 29464 + }, + { + "epoch": 0.36833420835520886, + "grad_norm": 1.7533689737319946, + "learning_rate": 1.5924844222539246e-05, + "loss": 1.3874, + "step": 29466 + }, + { + "epoch": 0.3683592089802245, + "grad_norm": 1.6435648202896118, + "learning_rate": 1.5924141177616498e-05, + "loss": 1.239, + "step": 29468 + }, + { + "epoch": 0.36838420960524015, + "grad_norm": 0.5832394957542419, + "learning_rate": 1.5923438087576327e-05, + "loss": 0.02, + "step": 29470 + }, + { + "epoch": 0.36840921023025575, + "grad_norm": 2.1768898963928223, + "learning_rate": 1.592273495242409e-05, + "loss": 0.4465, + "step": 29472 + }, + { + "epoch": 0.3684342108552714, + "grad_norm": 4.180432319641113, + "learning_rate": 1.5922031772165144e-05, + "loss": 1.5633, + "step": 29474 + }, + { + "epoch": 0.368459211480287, + "grad_norm": 4.55027961730957, + "learning_rate": 1.5921328546804836e-05, + "loss": 1.268, + "step": 29476 + }, + { + "epoch": 0.36848421210530263, + "grad_norm": 7.724940299987793, + "learning_rate": 1.5920625276348534e-05, + "loss": 1.3146, + "step": 29478 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.00648142397403717, + "learning_rate": 1.5919921960801582e-05, + "loss": 0.5315, + "step": 29480 + }, + { + "epoch": 0.3685342133553339, + "grad_norm": 0.5565630793571472, + "learning_rate": 1.5919218600169343e-05, + "loss": 0.8572, + "step": 29482 + }, + { + "epoch": 0.3685592139803495, + "grad_norm": 5.6761980056762695, + "learning_rate": 1.5918515194457173e-05, + "loss": 1.6528, + "step": 29484 + }, + { + "epoch": 0.3685842146053651, + "grad_norm": 3.199641227722168, + "learning_rate": 1.5917811743670427e-05, + "loss": 1.058, + "step": 29486 + }, + { + "epoch": 0.36860921523038076, + "grad_norm": 2.894172430038452, + "learning_rate": 1.5917108247814467e-05, + "loss": 0.6946, + "step": 29488 + }, + { + "epoch": 0.3686342158553964, + "grad_norm": 4.401655673980713, + "learning_rate": 1.5916404706894643e-05, + "loss": 0.931, + "step": 29490 + }, + { + "epoch": 0.368659216480412, + "grad_norm": 2.568938732147217, + "learning_rate": 1.5915701120916322e-05, + "loss": 1.1526, + "step": 29492 + }, + { + "epoch": 0.36868421710542765, + "grad_norm": 3.389446258544922, + "learning_rate": 1.5914997489884854e-05, + "loss": 0.7177, + "step": 29494 + }, + { + "epoch": 0.36870921773044324, + "grad_norm": 1.619000792503357, + "learning_rate": 1.59142938138056e-05, + "loss": 0.2353, + "step": 29496 + }, + { + "epoch": 0.3687342183554589, + "grad_norm": 2.4133191108703613, + "learning_rate": 1.5913590092683926e-05, + "loss": 1.2702, + "step": 29498 + }, + { + "epoch": 0.36875921898047453, + "grad_norm": 3.976853370666504, + "learning_rate": 1.591288632652518e-05, + "loss": 0.7533, + "step": 29500 + }, + { + "epoch": 0.3687842196054901, + "grad_norm": 4.388519763946533, + "learning_rate": 1.591218251533473e-05, + "loss": 0.9975, + "step": 29502 + }, + { + "epoch": 0.3688092202305058, + "grad_norm": 3.1586875915527344, + "learning_rate": 1.5911478659117937e-05, + "loss": 1.2481, + "step": 29504 + }, + { + "epoch": 0.36883422085552137, + "grad_norm": 5.367849826812744, + "learning_rate": 1.5910774757880157e-05, + "loss": 0.5563, + "step": 29506 + }, + { + "epoch": 0.368859221480537, + "grad_norm": 4.729092121124268, + "learning_rate": 1.5910070811626753e-05, + "loss": 1.2691, + "step": 29508 + }, + { + "epoch": 0.36888422210555266, + "grad_norm": 0.7842063903808594, + "learning_rate": 1.5909366820363082e-05, + "loss": 0.8746, + "step": 29510 + }, + { + "epoch": 0.36890922273056825, + "grad_norm": 1.7013200521469116, + "learning_rate": 1.590866278409451e-05, + "loss": 0.2454, + "step": 29512 + }, + { + "epoch": 0.3689342233555839, + "grad_norm": 5.39492130279541, + "learning_rate": 1.59079587028264e-05, + "loss": 0.6881, + "step": 29514 + }, + { + "epoch": 0.3689592239805995, + "grad_norm": 5.997808456420898, + "learning_rate": 1.590725457656411e-05, + "loss": 1.7414, + "step": 29516 + }, + { + "epoch": 0.36898422460561514, + "grad_norm": 2.456138849258423, + "learning_rate": 1.590655040531301e-05, + "loss": 1.6069, + "step": 29518 + }, + { + "epoch": 0.3690092252306308, + "grad_norm": 5.087534427642822, + "learning_rate": 1.590584618907845e-05, + "loss": 0.8331, + "step": 29520 + }, + { + "epoch": 0.3690342258556464, + "grad_norm": 2.2288901805877686, + "learning_rate": 1.5905141927865803e-05, + "loss": 0.8187, + "step": 29522 + }, + { + "epoch": 0.36905922648066203, + "grad_norm": 4.579920291900635, + "learning_rate": 1.5904437621680432e-05, + "loss": 1.3478, + "step": 29524 + }, + { + "epoch": 0.3690842271056776, + "grad_norm": 5.527646064758301, + "learning_rate": 1.59037332705277e-05, + "loss": 0.524, + "step": 29526 + }, + { + "epoch": 0.36910922773069327, + "grad_norm": 2.80073618888855, + "learning_rate": 1.5903028874412965e-05, + "loss": 0.8511, + "step": 29528 + }, + { + "epoch": 0.3691342283557089, + "grad_norm": 4.962551593780518, + "learning_rate": 1.59023244333416e-05, + "loss": 1.5237, + "step": 29530 + }, + { + "epoch": 0.3691592289807245, + "grad_norm": 0.0091569097712636, + "learning_rate": 1.5901619947318967e-05, + "loss": 0.0318, + "step": 29532 + }, + { + "epoch": 0.36918422960574016, + "grad_norm": 5.413727760314941, + "learning_rate": 1.5900915416350432e-05, + "loss": 1.2846, + "step": 29534 + }, + { + "epoch": 0.36920923023075575, + "grad_norm": 0.7564938068389893, + "learning_rate": 1.590021084044136e-05, + "loss": 0.1753, + "step": 29536 + }, + { + "epoch": 0.3692342308557714, + "grad_norm": 3.2067482471466064, + "learning_rate": 1.5899506219597112e-05, + "loss": 1.3828, + "step": 29538 + }, + { + "epoch": 0.36925923148078704, + "grad_norm": 2.444988489151001, + "learning_rate": 1.5898801553823064e-05, + "loss": 2.0321, + "step": 29540 + }, + { + "epoch": 0.36928423210580263, + "grad_norm": 2.339078664779663, + "learning_rate": 1.5898096843124573e-05, + "loss": 1.2864, + "step": 29542 + }, + { + "epoch": 0.3693092327308183, + "grad_norm": 2.6788573265075684, + "learning_rate": 1.5897392087507014e-05, + "loss": 0.4495, + "step": 29544 + }, + { + "epoch": 0.3693342333558339, + "grad_norm": 3.013563394546509, + "learning_rate": 1.589668728697575e-05, + "loss": 0.2033, + "step": 29546 + }, + { + "epoch": 0.3693592339808495, + "grad_norm": 4.0786824226379395, + "learning_rate": 1.589598244153615e-05, + "loss": 0.3601, + "step": 29548 + }, + { + "epoch": 0.36938423460586517, + "grad_norm": 8.464913368225098, + "learning_rate": 1.589527755119358e-05, + "loss": 1.9592, + "step": 29550 + }, + { + "epoch": 0.36940923523088076, + "grad_norm": 2.0723695755004883, + "learning_rate": 1.589457261595341e-05, + "loss": 0.452, + "step": 29552 + }, + { + "epoch": 0.3694342358558964, + "grad_norm": 4.763535022735596, + "learning_rate": 1.589386763582101e-05, + "loss": 0.8743, + "step": 29554 + }, + { + "epoch": 0.369459236480912, + "grad_norm": 6.812545299530029, + "learning_rate": 1.5893162610801748e-05, + "loss": 1.6207, + "step": 29556 + }, + { + "epoch": 0.36948423710592765, + "grad_norm": 5.089162826538086, + "learning_rate": 1.589245754090099e-05, + "loss": 1.0028, + "step": 29558 + }, + { + "epoch": 0.3695092377309433, + "grad_norm": 2.876169204711914, + "learning_rate": 1.5891752426124106e-05, + "loss": 0.7993, + "step": 29560 + }, + { + "epoch": 0.3695342383559589, + "grad_norm": 4.4536261558532715, + "learning_rate": 1.5891047266476476e-05, + "loss": 0.7587, + "step": 29562 + }, + { + "epoch": 0.36955923898097454, + "grad_norm": 4.0605788230896, + "learning_rate": 1.5890342061963463e-05, + "loss": 1.145, + "step": 29564 + }, + { + "epoch": 0.36958423960599013, + "grad_norm": 0.04667828604578972, + "learning_rate": 1.5889636812590432e-05, + "loss": 0.1243, + "step": 29566 + }, + { + "epoch": 0.3696092402310058, + "grad_norm": 2.932058334350586, + "learning_rate": 1.5888931518362762e-05, + "loss": 1.1347, + "step": 29568 + }, + { + "epoch": 0.3696342408560214, + "grad_norm": 3.04354190826416, + "learning_rate": 1.5888226179285825e-05, + "loss": 1.0988, + "step": 29570 + }, + { + "epoch": 0.369659241481037, + "grad_norm": 3.473334550857544, + "learning_rate": 1.588752079536499e-05, + "loss": 2.0333, + "step": 29572 + }, + { + "epoch": 0.36968424210605266, + "grad_norm": 0.013140414841473103, + "learning_rate": 1.5886815366605624e-05, + "loss": 0.1029, + "step": 29574 + }, + { + "epoch": 0.36970924273106825, + "grad_norm": 2.017326593399048, + "learning_rate": 1.588610989301311e-05, + "loss": 1.0282, + "step": 29576 + }, + { + "epoch": 0.3697342433560839, + "grad_norm": 6.43115758895874, + "learning_rate": 1.588540437459281e-05, + "loss": 1.5848, + "step": 29578 + }, + { + "epoch": 0.36975924398109955, + "grad_norm": 1.4747575521469116, + "learning_rate": 1.5884698811350107e-05, + "loss": 0.0921, + "step": 29580 + }, + { + "epoch": 0.36978424460611514, + "grad_norm": 2.3828799724578857, + "learning_rate": 1.588399320329037e-05, + "loss": 0.696, + "step": 29582 + }, + { + "epoch": 0.3698092452311308, + "grad_norm": 5.247182846069336, + "learning_rate": 1.5883287550418972e-05, + "loss": 1.0269, + "step": 29584 + }, + { + "epoch": 0.3698342458561464, + "grad_norm": 4.313385963439941, + "learning_rate": 1.588258185274129e-05, + "loss": 0.7783, + "step": 29586 + }, + { + "epoch": 0.36985924648116203, + "grad_norm": 4.236937999725342, + "learning_rate": 1.588187611026269e-05, + "loss": 1.0177, + "step": 29588 + }, + { + "epoch": 0.3698842471061777, + "grad_norm": 3.0520951747894287, + "learning_rate": 1.5881170322988558e-05, + "loss": 0.8199, + "step": 29590 + }, + { + "epoch": 0.36990924773119327, + "grad_norm": 4.956145763397217, + "learning_rate": 1.5880464490924263e-05, + "loss": 0.7004, + "step": 29592 + }, + { + "epoch": 0.3699342483562089, + "grad_norm": 5.145648002624512, + "learning_rate": 1.5879758614075185e-05, + "loss": 0.9249, + "step": 29594 + }, + { + "epoch": 0.3699592489812245, + "grad_norm": 3.628758668899536, + "learning_rate": 1.5879052692446694e-05, + "loss": 0.6583, + "step": 29596 + }, + { + "epoch": 0.36998424960624016, + "grad_norm": 3.0230422019958496, + "learning_rate": 1.5878346726044173e-05, + "loss": 0.6469, + "step": 29598 + }, + { + "epoch": 0.3700092502312558, + "grad_norm": 2.8115830421447754, + "learning_rate": 1.587764071487299e-05, + "loss": 0.8785, + "step": 29600 + }, + { + "epoch": 0.3700342508562714, + "grad_norm": 0.3454797565937042, + "learning_rate": 1.5876934658938527e-05, + "loss": 0.5074, + "step": 29602 + }, + { + "epoch": 0.37005925148128704, + "grad_norm": 12.264676094055176, + "learning_rate": 1.5876228558246163e-05, + "loss": 1.3563, + "step": 29604 + }, + { + "epoch": 0.37008425210630264, + "grad_norm": 7.209698677062988, + "learning_rate": 1.587552241280127e-05, + "loss": 1.6441, + "step": 29606 + }, + { + "epoch": 0.3701092527313183, + "grad_norm": 4.872492790222168, + "learning_rate": 1.5874816222609233e-05, + "loss": 2.1804, + "step": 29608 + }, + { + "epoch": 0.37013425335633393, + "grad_norm": 9.270752906799316, + "learning_rate": 1.5874109987675425e-05, + "loss": 1.8775, + "step": 29610 + }, + { + "epoch": 0.3701592539813495, + "grad_norm": 0.7818934917449951, + "learning_rate": 1.5873403708005226e-05, + "loss": 0.0672, + "step": 29612 + }, + { + "epoch": 0.37018425460636517, + "grad_norm": 2.2500369548797607, + "learning_rate": 1.5872697383604013e-05, + "loss": 0.9785, + "step": 29614 + }, + { + "epoch": 0.37020925523138076, + "grad_norm": 2.572767972946167, + "learning_rate": 1.587199101447717e-05, + "loss": 1.8979, + "step": 29616 + }, + { + "epoch": 0.3702342558563964, + "grad_norm": 0.62480229139328, + "learning_rate": 1.587128460063007e-05, + "loss": 0.1508, + "step": 29618 + }, + { + "epoch": 0.37025925648141206, + "grad_norm": 10.121915817260742, + "learning_rate": 1.5870578142068103e-05, + "loss": 0.8684, + "step": 29620 + }, + { + "epoch": 0.37028425710642765, + "grad_norm": 5.108806610107422, + "learning_rate": 1.586987163879664e-05, + "loss": 1.8292, + "step": 29622 + }, + { + "epoch": 0.3703092577314433, + "grad_norm": 3.313567876815796, + "learning_rate": 1.5869165090821063e-05, + "loss": 0.8537, + "step": 29624 + }, + { + "epoch": 0.3703342583564589, + "grad_norm": 1.2082951068878174, + "learning_rate": 1.5868458498146755e-05, + "loss": 0.7926, + "step": 29626 + }, + { + "epoch": 0.37035925898147454, + "grad_norm": 4.68818473815918, + "learning_rate": 1.58677518607791e-05, + "loss": 1.0831, + "step": 29628 + }, + { + "epoch": 0.3703842596064902, + "grad_norm": 0.004991587717086077, + "learning_rate": 1.5867045178723473e-05, + "loss": 0.1611, + "step": 29630 + }, + { + "epoch": 0.3704092602315058, + "grad_norm": 3.0648083686828613, + "learning_rate": 1.586633845198526e-05, + "loss": 1.7735, + "step": 29632 + }, + { + "epoch": 0.3704342608565214, + "grad_norm": 2.8717730045318604, + "learning_rate": 1.5865631680569846e-05, + "loss": 0.8642, + "step": 29634 + }, + { + "epoch": 0.370459261481537, + "grad_norm": 2.8420960903167725, + "learning_rate": 1.5864924864482605e-05, + "loss": 0.7354, + "step": 29636 + }, + { + "epoch": 0.37048426210655266, + "grad_norm": 4.280620098114014, + "learning_rate": 1.586421800372893e-05, + "loss": 1.3145, + "step": 29638 + }, + { + "epoch": 0.3705092627315683, + "grad_norm": 6.158063888549805, + "learning_rate": 1.58635110983142e-05, + "loss": 0.5043, + "step": 29640 + }, + { + "epoch": 0.3705342633565839, + "grad_norm": 1.1485693454742432, + "learning_rate": 1.5862804148243797e-05, + "loss": 0.2327, + "step": 29642 + }, + { + "epoch": 0.37055926398159955, + "grad_norm": 0.1844099760055542, + "learning_rate": 1.5862097153523106e-05, + "loss": 0.0033, + "step": 29644 + }, + { + "epoch": 0.37058426460661514, + "grad_norm": 0.009763951413333416, + "learning_rate": 1.5861390114157513e-05, + "loss": 0.4758, + "step": 29646 + }, + { + "epoch": 0.3706092652316308, + "grad_norm": 2.4788522720336914, + "learning_rate": 1.5860683030152403e-05, + "loss": 1.5481, + "step": 29648 + }, + { + "epoch": 0.37063426585664644, + "grad_norm": 5.1776604652404785, + "learning_rate": 1.5859975901513157e-05, + "loss": 1.1994, + "step": 29650 + }, + { + "epoch": 0.37065926648166203, + "grad_norm": 2.4826457500457764, + "learning_rate": 1.5859268728245167e-05, + "loss": 1.0209, + "step": 29652 + }, + { + "epoch": 0.3706842671066777, + "grad_norm": 0.007135419640690088, + "learning_rate": 1.5858561510353815e-05, + "loss": 0.0003, + "step": 29654 + }, + { + "epoch": 0.37070926773169327, + "grad_norm": 1.3903417587280273, + "learning_rate": 1.5857854247844483e-05, + "loss": 0.6532, + "step": 29656 + }, + { + "epoch": 0.3707342683567089, + "grad_norm": 2.642596483230591, + "learning_rate": 1.5857146940722564e-05, + "loss": 1.3412, + "step": 29658 + }, + { + "epoch": 0.37075926898172457, + "grad_norm": 1.3967421054840088, + "learning_rate": 1.5856439588993443e-05, + "loss": 0.2703, + "step": 29660 + }, + { + "epoch": 0.37078426960674016, + "grad_norm": 3.9549076557159424, + "learning_rate": 1.5855732192662505e-05, + "loss": 1.6849, + "step": 29662 + }, + { + "epoch": 0.3708092702317558, + "grad_norm": 6.143954753875732, + "learning_rate": 1.585502475173514e-05, + "loss": 0.4895, + "step": 29664 + }, + { + "epoch": 0.3708342708567714, + "grad_norm": 0.006737538613379002, + "learning_rate": 1.585431726621673e-05, + "loss": 0.056, + "step": 29666 + }, + { + "epoch": 0.37085927148178705, + "grad_norm": 4.677196025848389, + "learning_rate": 1.5853609736112673e-05, + "loss": 0.9441, + "step": 29668 + }, + { + "epoch": 0.3708842721068027, + "grad_norm": 2.6950998306274414, + "learning_rate": 1.585290216142835e-05, + "loss": 2.0636, + "step": 29670 + }, + { + "epoch": 0.3709092727318183, + "grad_norm": 3.754042625427246, + "learning_rate": 1.5852194542169148e-05, + "loss": 1.042, + "step": 29672 + }, + { + "epoch": 0.37093427335683393, + "grad_norm": 3.7735726833343506, + "learning_rate": 1.5851486878340465e-05, + "loss": 1.1202, + "step": 29674 + }, + { + "epoch": 0.3709592739818495, + "grad_norm": 3.4627816677093506, + "learning_rate": 1.5850779169947684e-05, + "loss": 0.5753, + "step": 29676 + }, + { + "epoch": 0.37098427460686517, + "grad_norm": 0.011608761735260487, + "learning_rate": 1.5850071416996197e-05, + "loss": 0.589, + "step": 29678 + }, + { + "epoch": 0.3710092752318808, + "grad_norm": 5.78651762008667, + "learning_rate": 1.5849363619491393e-05, + "loss": 2.3243, + "step": 29680 + }, + { + "epoch": 0.3710342758568964, + "grad_norm": 3.310384511947632, + "learning_rate": 1.584865577743866e-05, + "loss": 0.6966, + "step": 29682 + }, + { + "epoch": 0.37105927648191206, + "grad_norm": 4.511722087860107, + "learning_rate": 1.5847947890843395e-05, + "loss": 1.3043, + "step": 29684 + }, + { + "epoch": 0.37108427710692765, + "grad_norm": 2.844198226928711, + "learning_rate": 1.5847239959710985e-05, + "loss": 0.962, + "step": 29686 + }, + { + "epoch": 0.3711092777319433, + "grad_norm": 3.240715265274048, + "learning_rate": 1.5846531984046825e-05, + "loss": 1.3197, + "step": 29688 + }, + { + "epoch": 0.37113427835695895, + "grad_norm": 4.353682041168213, + "learning_rate": 1.58458239638563e-05, + "loss": 0.8179, + "step": 29690 + }, + { + "epoch": 0.37115927898197454, + "grad_norm": 7.44798469543457, + "learning_rate": 1.5845115899144806e-05, + "loss": 1.2926, + "step": 29692 + }, + { + "epoch": 0.3711842796069902, + "grad_norm": 0.053879182785749435, + "learning_rate": 1.584440778991774e-05, + "loss": 0.6656, + "step": 29694 + }, + { + "epoch": 0.3712092802320058, + "grad_norm": 2.975672960281372, + "learning_rate": 1.5843699636180486e-05, + "loss": 1.5324, + "step": 29696 + }, + { + "epoch": 0.3712342808570214, + "grad_norm": 3.635244846343994, + "learning_rate": 1.5842991437938446e-05, + "loss": 1.0262, + "step": 29698 + }, + { + "epoch": 0.3712592814820371, + "grad_norm": 0.42428097128868103, + "learning_rate": 1.5842283195197007e-05, + "loss": 0.6363, + "step": 29700 + }, + { + "epoch": 0.37128428210705267, + "grad_norm": 6.517059326171875, + "learning_rate": 1.5841574907961565e-05, + "loss": 1.7628, + "step": 29702 + }, + { + "epoch": 0.3713092827320683, + "grad_norm": 4.456569194793701, + "learning_rate": 1.5840866576237517e-05, + "loss": 1.03, + "step": 29704 + }, + { + "epoch": 0.3713342833570839, + "grad_norm": 3.248258590698242, + "learning_rate": 1.5840158200030256e-05, + "loss": 1.3066, + "step": 29706 + }, + { + "epoch": 0.37135928398209955, + "grad_norm": 1.152012825012207, + "learning_rate": 1.5839449779345172e-05, + "loss": 0.9925, + "step": 29708 + }, + { + "epoch": 0.3713842846071152, + "grad_norm": 3.9944159984588623, + "learning_rate": 1.5838741314187665e-05, + "loss": 0.905, + "step": 29710 + }, + { + "epoch": 0.3714092852321308, + "grad_norm": 0.08446012437343597, + "learning_rate": 1.583803280456313e-05, + "loss": 0.6024, + "step": 29712 + }, + { + "epoch": 0.37143428585714644, + "grad_norm": 5.201968669891357, + "learning_rate": 1.5837324250476967e-05, + "loss": 1.3977, + "step": 29714 + }, + { + "epoch": 0.37145928648216203, + "grad_norm": 0.04229126125574112, + "learning_rate": 1.5836615651934562e-05, + "loss": 0.0007, + "step": 29716 + }, + { + "epoch": 0.3714842871071777, + "grad_norm": 2.993866205215454, + "learning_rate": 1.583590700894132e-05, + "loss": 1.2993, + "step": 29718 + }, + { + "epoch": 0.3715092877321933, + "grad_norm": 4.920527458190918, + "learning_rate": 1.583519832150264e-05, + "loss": 1.4306, + "step": 29720 + }, + { + "epoch": 0.3715342883572089, + "grad_norm": 2.9459686279296875, + "learning_rate": 1.5834489589623906e-05, + "loss": 0.7558, + "step": 29722 + }, + { + "epoch": 0.37155928898222457, + "grad_norm": 5.618409633636475, + "learning_rate": 1.583378081331053e-05, + "loss": 0.9181, + "step": 29724 + }, + { + "epoch": 0.37158428960724016, + "grad_norm": 4.469542503356934, + "learning_rate": 1.5833071992567902e-05, + "loss": 1.5162, + "step": 29726 + }, + { + "epoch": 0.3716092902322558, + "grad_norm": 4.045436859130859, + "learning_rate": 1.5832363127401426e-05, + "loss": 1.1547, + "step": 29728 + }, + { + "epoch": 0.37163429085727145, + "grad_norm": 2.6367955207824707, + "learning_rate": 1.5831654217816495e-05, + "loss": 0.2968, + "step": 29730 + }, + { + "epoch": 0.37165929148228705, + "grad_norm": 4.9649553298950195, + "learning_rate": 1.5830945263818512e-05, + "loss": 0.5533, + "step": 29732 + }, + { + "epoch": 0.3716842921073027, + "grad_norm": 17.290260314941406, + "learning_rate": 1.583023626541287e-05, + "loss": 0.2247, + "step": 29734 + }, + { + "epoch": 0.3717092927323183, + "grad_norm": 8.130908966064453, + "learning_rate": 1.5829527222604973e-05, + "loss": 1.293, + "step": 29736 + }, + { + "epoch": 0.37173429335733393, + "grad_norm": 5.60189151763916, + "learning_rate": 1.5828818135400225e-05, + "loss": 0.6707, + "step": 29738 + }, + { + "epoch": 0.3717592939823496, + "grad_norm": 3.589336395263672, + "learning_rate": 1.582810900380402e-05, + "loss": 1.5362, + "step": 29740 + }, + { + "epoch": 0.3717842946073652, + "grad_norm": 11.431159973144531, + "learning_rate": 1.5827399827821763e-05, + "loss": 1.1744, + "step": 29742 + }, + { + "epoch": 0.3718092952323808, + "grad_norm": 1.9595601558685303, + "learning_rate": 1.582669060745885e-05, + "loss": 0.5209, + "step": 29744 + }, + { + "epoch": 0.3718342958573964, + "grad_norm": 0.4937020540237427, + "learning_rate": 1.582598134272069e-05, + "loss": 0.6586, + "step": 29746 + }, + { + "epoch": 0.37185929648241206, + "grad_norm": 4.536776065826416, + "learning_rate": 1.582527203361268e-05, + "loss": 1.8371, + "step": 29748 + }, + { + "epoch": 0.3718842971074277, + "grad_norm": 3.3088576793670654, + "learning_rate": 1.5824562680140216e-05, + "loss": 0.8361, + "step": 29750 + }, + { + "epoch": 0.3719092977324433, + "grad_norm": 4.379653453826904, + "learning_rate": 1.582385328230871e-05, + "loss": 1.7799, + "step": 29752 + }, + { + "epoch": 0.37193429835745895, + "grad_norm": 4.303706645965576, + "learning_rate": 1.582314384012356e-05, + "loss": 1.1228, + "step": 29754 + }, + { + "epoch": 0.37195929898247454, + "grad_norm": 6.94883394241333, + "learning_rate": 1.582243435359017e-05, + "loss": 0.3636, + "step": 29756 + }, + { + "epoch": 0.3719842996074902, + "grad_norm": 0.8710183501243591, + "learning_rate": 1.5821724822713944e-05, + "loss": 0.2382, + "step": 29758 + }, + { + "epoch": 0.37200930023250584, + "grad_norm": 4.179632186889648, + "learning_rate": 1.5821015247500287e-05, + "loss": 2.3894, + "step": 29760 + }, + { + "epoch": 0.3720343008575214, + "grad_norm": 3.7228479385375977, + "learning_rate": 1.5820305627954596e-05, + "loss": 1.8581, + "step": 29762 + }, + { + "epoch": 0.3720593014825371, + "grad_norm": 3.3897807598114014, + "learning_rate": 1.5819595964082285e-05, + "loss": 2.0537, + "step": 29764 + }, + { + "epoch": 0.37208430210755267, + "grad_norm": 6.730804920196533, + "learning_rate": 1.5818886255888752e-05, + "loss": 1.0837, + "step": 29766 + }, + { + "epoch": 0.3721093027325683, + "grad_norm": 4.278800964355469, + "learning_rate": 1.5818176503379407e-05, + "loss": 0.3948, + "step": 29768 + }, + { + "epoch": 0.37213430335758396, + "grad_norm": 3.114678144454956, + "learning_rate": 1.5817466706559647e-05, + "loss": 1.8363, + "step": 29770 + }, + { + "epoch": 0.37215930398259955, + "grad_norm": 3.1853716373443604, + "learning_rate": 1.5816756865434884e-05, + "loss": 0.7378, + "step": 29772 + }, + { + "epoch": 0.3721843046076152, + "grad_norm": 3.3680903911590576, + "learning_rate": 1.5816046980010525e-05, + "loss": 0.5838, + "step": 29774 + }, + { + "epoch": 0.3722093052326308, + "grad_norm": 3.9915482997894287, + "learning_rate": 1.5815337050291977e-05, + "loss": 0.7172, + "step": 29776 + }, + { + "epoch": 0.37223430585764644, + "grad_norm": 1.4193001985549927, + "learning_rate": 1.5814627076284644e-05, + "loss": 1.1565, + "step": 29778 + }, + { + "epoch": 0.3722593064826621, + "grad_norm": 3.943634510040283, + "learning_rate": 1.581391705799393e-05, + "loss": 1.3284, + "step": 29780 + }, + { + "epoch": 0.3722843071076777, + "grad_norm": 3.1227035522460938, + "learning_rate": 1.581320699542525e-05, + "loss": 1.8247, + "step": 29782 + }, + { + "epoch": 0.37230930773269333, + "grad_norm": 3.7871274948120117, + "learning_rate": 1.5812496888584003e-05, + "loss": 2.0192, + "step": 29784 + }, + { + "epoch": 0.3723343083577089, + "grad_norm": 2.8087198734283447, + "learning_rate": 1.5811786737475604e-05, + "loss": 1.6831, + "step": 29786 + }, + { + "epoch": 0.37235930898272457, + "grad_norm": 5.197474479675293, + "learning_rate": 1.581107654210546e-05, + "loss": 0.6795, + "step": 29788 + }, + { + "epoch": 0.3723843096077402, + "grad_norm": 4.260553359985352, + "learning_rate": 1.581036630247898e-05, + "loss": 1.3426, + "step": 29790 + }, + { + "epoch": 0.3724093102327558, + "grad_norm": 3.752868413925171, + "learning_rate": 1.580965601860157e-05, + "loss": 1.1874, + "step": 29792 + }, + { + "epoch": 0.37243431085777146, + "grad_norm": 0.024577155709266663, + "learning_rate": 1.580894569047864e-05, + "loss": 0.4816, + "step": 29794 + }, + { + "epoch": 0.37245931148278705, + "grad_norm": 6.9797468185424805, + "learning_rate": 1.5808235318115605e-05, + "loss": 1.9933, + "step": 29796 + }, + { + "epoch": 0.3724843121078027, + "grad_norm": 5.44827938079834, + "learning_rate": 1.5807524901517867e-05, + "loss": 1.3162, + "step": 29798 + }, + { + "epoch": 0.37250931273281834, + "grad_norm": 4.036038875579834, + "learning_rate": 1.580681444069084e-05, + "loss": 0.7687, + "step": 29800 + }, + { + "epoch": 0.37253431335783393, + "grad_norm": 3.3014135360717773, + "learning_rate": 1.5806103935639938e-05, + "loss": 0.4012, + "step": 29802 + }, + { + "epoch": 0.3725593139828496, + "grad_norm": 0.6534454226493835, + "learning_rate": 1.5805393386370573e-05, + "loss": 1.0923, + "step": 29804 + }, + { + "epoch": 0.3725843146078652, + "grad_norm": 2.2982089519500732, + "learning_rate": 1.5804682792888147e-05, + "loss": 0.3656, + "step": 29806 + }, + { + "epoch": 0.3726093152328808, + "grad_norm": 2.625328540802002, + "learning_rate": 1.5803972155198076e-05, + "loss": 1.0098, + "step": 29808 + }, + { + "epoch": 0.37263431585789647, + "grad_norm": 1.410197377204895, + "learning_rate": 1.580326147330578e-05, + "loss": 0.1426, + "step": 29810 + }, + { + "epoch": 0.37265931648291206, + "grad_norm": 2.9479002952575684, + "learning_rate": 1.580255074721666e-05, + "loss": 1.0129, + "step": 29812 + }, + { + "epoch": 0.3726843171079277, + "grad_norm": 1.3131765127182007, + "learning_rate": 1.580183997693614e-05, + "loss": 1.2638, + "step": 29814 + }, + { + "epoch": 0.3727093177329433, + "grad_norm": 1.6060184240341187, + "learning_rate": 1.5801129162469623e-05, + "loss": 0.5784, + "step": 29816 + }, + { + "epoch": 0.37273431835795895, + "grad_norm": 1.6425403356552124, + "learning_rate": 1.5800418303822527e-05, + "loss": 0.1036, + "step": 29818 + }, + { + "epoch": 0.3727593189829746, + "grad_norm": 3.790114164352417, + "learning_rate": 1.5799707401000267e-05, + "loss": 0.7336, + "step": 29820 + }, + { + "epoch": 0.3727843196079902, + "grad_norm": 2.9077322483062744, + "learning_rate": 1.5798996454008252e-05, + "loss": 0.4261, + "step": 29822 + }, + { + "epoch": 0.37280932023300584, + "grad_norm": 5.048074245452881, + "learning_rate": 1.5798285462851904e-05, + "loss": 0.7142, + "step": 29824 + }, + { + "epoch": 0.37283432085802143, + "grad_norm": 3.5626909732818604, + "learning_rate": 1.5797574427536632e-05, + "loss": 0.6649, + "step": 29826 + }, + { + "epoch": 0.3728593214830371, + "grad_norm": 2.0630455017089844, + "learning_rate": 1.5796863348067854e-05, + "loss": 0.7755, + "step": 29828 + }, + { + "epoch": 0.3728843221080527, + "grad_norm": 1.5984747409820557, + "learning_rate": 1.5796152224450983e-05, + "loss": 0.8575, + "step": 29830 + }, + { + "epoch": 0.3729093227330683, + "grad_norm": 2.7129316329956055, + "learning_rate": 1.5795441056691438e-05, + "loss": 1.2137, + "step": 29832 + }, + { + "epoch": 0.37293432335808396, + "grad_norm": 0.8021073341369629, + "learning_rate": 1.5794729844794635e-05, + "loss": 0.664, + "step": 29834 + }, + { + "epoch": 0.37295932398309956, + "grad_norm": 3.791635751724243, + "learning_rate": 1.5794018588765985e-05, + "loss": 0.5812, + "step": 29836 + }, + { + "epoch": 0.3729843246081152, + "grad_norm": 6.284912109375, + "learning_rate": 1.579330728861091e-05, + "loss": 1.3622, + "step": 29838 + }, + { + "epoch": 0.37300932523313085, + "grad_norm": 0.7482725977897644, + "learning_rate": 1.5792595944334825e-05, + "loss": 0.0253, + "step": 29840 + }, + { + "epoch": 0.37303432585814644, + "grad_norm": 2.9312169551849365, + "learning_rate": 1.579188455594315e-05, + "loss": 1.2135, + "step": 29842 + }, + { + "epoch": 0.3730593264831621, + "grad_norm": 3.094346523284912, + "learning_rate": 1.5791173123441305e-05, + "loss": 0.6338, + "step": 29844 + }, + { + "epoch": 0.3730843271081777, + "grad_norm": 5.187567710876465, + "learning_rate": 1.5790461646834698e-05, + "loss": 0.6304, + "step": 29846 + }, + { + "epoch": 0.37310932773319333, + "grad_norm": 0.7669540643692017, + "learning_rate": 1.5789750126128755e-05, + "loss": 0.3014, + "step": 29848 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.16451098024845123, + "learning_rate": 1.5789038561328898e-05, + "loss": 0.0031, + "step": 29850 + }, + { + "epoch": 0.37315932898322457, + "grad_norm": 2.256387710571289, + "learning_rate": 1.5788326952440536e-05, + "loss": 1.193, + "step": 29852 + }, + { + "epoch": 0.3731843296082402, + "grad_norm": 4.645895004272461, + "learning_rate": 1.57876152994691e-05, + "loss": 1.4451, + "step": 29854 + }, + { + "epoch": 0.3732093302332558, + "grad_norm": 3.10896635055542, + "learning_rate": 1.5786903602420004e-05, + "loss": 0.7076, + "step": 29856 + }, + { + "epoch": 0.37323433085827146, + "grad_norm": 3.6241257190704346, + "learning_rate": 1.5786191861298666e-05, + "loss": 0.8577, + "step": 29858 + }, + { + "epoch": 0.3732593314832871, + "grad_norm": 3.778731107711792, + "learning_rate": 1.578548007611051e-05, + "loss": 1.5638, + "step": 29860 + }, + { + "epoch": 0.3732843321083027, + "grad_norm": 2.9784860610961914, + "learning_rate": 1.5784768246860953e-05, + "loss": 0.3141, + "step": 29862 + }, + { + "epoch": 0.37330933273331834, + "grad_norm": 0.025146326050162315, + "learning_rate": 1.5784056373555424e-05, + "loss": 0.6129, + "step": 29864 + }, + { + "epoch": 0.37333433335833394, + "grad_norm": 2.5686702728271484, + "learning_rate": 1.5783344456199334e-05, + "loss": 0.7289, + "step": 29866 + }, + { + "epoch": 0.3733593339833496, + "grad_norm": 2.906802177429199, + "learning_rate": 1.5782632494798112e-05, + "loss": 0.5944, + "step": 29868 + }, + { + "epoch": 0.37338433460836523, + "grad_norm": 4.713976860046387, + "learning_rate": 1.578192048935718e-05, + "loss": 1.9999, + "step": 29870 + }, + { + "epoch": 0.3734093352333808, + "grad_norm": 4.848275184631348, + "learning_rate": 1.5781208439881962e-05, + "loss": 1.1346, + "step": 29872 + }, + { + "epoch": 0.37343433585839647, + "grad_norm": 2.9184560775756836, + "learning_rate": 1.5780496346377876e-05, + "loss": 0.9533, + "step": 29874 + }, + { + "epoch": 0.37345933648341206, + "grad_norm": 3.7627809047698975, + "learning_rate": 1.577978420885035e-05, + "loss": 0.7919, + "step": 29876 + }, + { + "epoch": 0.3734843371084277, + "grad_norm": 2.927647829055786, + "learning_rate": 1.5779072027304798e-05, + "loss": 1.0517, + "step": 29878 + }, + { + "epoch": 0.37350933773344336, + "grad_norm": 0.0996997058391571, + "learning_rate": 1.5778359801746655e-05, + "loss": 0.3778, + "step": 29880 + }, + { + "epoch": 0.37353433835845895, + "grad_norm": 3.2963650226593018, + "learning_rate": 1.5777647532181343e-05, + "loss": 1.383, + "step": 29882 + }, + { + "epoch": 0.3735593389834746, + "grad_norm": 2.9329962730407715, + "learning_rate": 1.577693521861428e-05, + "loss": 0.412, + "step": 29884 + }, + { + "epoch": 0.3735843396084902, + "grad_norm": 0.07024354487657547, + "learning_rate": 1.5776222861050895e-05, + "loss": 0.0472, + "step": 29886 + }, + { + "epoch": 0.37360934023350584, + "grad_norm": 0.026684025302529335, + "learning_rate": 1.5775510459496616e-05, + "loss": 0.1744, + "step": 29888 + }, + { + "epoch": 0.3736343408585215, + "grad_norm": 3.122279167175293, + "learning_rate": 1.5774798013956868e-05, + "loss": 0.837, + "step": 29890 + }, + { + "epoch": 0.3736593414835371, + "grad_norm": 3.6583046913146973, + "learning_rate": 1.577408552443707e-05, + "loss": 1.5803, + "step": 29892 + }, + { + "epoch": 0.3736843421085527, + "grad_norm": 2.081986665725708, + "learning_rate": 1.5773372990942656e-05, + "loss": 0.3498, + "step": 29894 + }, + { + "epoch": 0.3737093427335683, + "grad_norm": 0.0030503123998641968, + "learning_rate": 1.577266041347905e-05, + "loss": 0.0005, + "step": 29896 + }, + { + "epoch": 0.37373434335858396, + "grad_norm": 3.1377649307250977, + "learning_rate": 1.5771947792051675e-05, + "loss": 1.1304, + "step": 29898 + }, + { + "epoch": 0.3737593439835996, + "grad_norm": 2.982231378555298, + "learning_rate": 1.5771235126665966e-05, + "loss": 1.3278, + "step": 29900 + }, + { + "epoch": 0.3737843446086152, + "grad_norm": 0.003174874698743224, + "learning_rate": 1.5770522417327344e-05, + "loss": 0.0669, + "step": 29902 + }, + { + "epoch": 0.37380934523363085, + "grad_norm": 0.0023066482972353697, + "learning_rate": 1.576980966404124e-05, + "loss": 0.1093, + "step": 29904 + }, + { + "epoch": 0.37383434585864644, + "grad_norm": 1.7718701362609863, + "learning_rate": 1.5769096866813084e-05, + "loss": 1.7509, + "step": 29906 + }, + { + "epoch": 0.3738593464836621, + "grad_norm": 3.813072681427002, + "learning_rate": 1.57683840256483e-05, + "loss": 1.3338, + "step": 29908 + }, + { + "epoch": 0.37388434710867774, + "grad_norm": 5.5196075439453125, + "learning_rate": 1.576767114055232e-05, + "loss": 1.1632, + "step": 29910 + }, + { + "epoch": 0.37390934773369333, + "grad_norm": 3.8131203651428223, + "learning_rate": 1.5766958211530573e-05, + "loss": 2.1118, + "step": 29912 + }, + { + "epoch": 0.373934348358709, + "grad_norm": 0.006907958537340164, + "learning_rate": 1.576624523858848e-05, + "loss": 0.1084, + "step": 29914 + }, + { + "epoch": 0.37395934898372457, + "grad_norm": 3.691291332244873, + "learning_rate": 1.5765532221731488e-05, + "loss": 1.1105, + "step": 29916 + }, + { + "epoch": 0.3739843496087402, + "grad_norm": 1.8956283330917358, + "learning_rate": 1.5764819160965016e-05, + "loss": 0.4975, + "step": 29918 + }, + { + "epoch": 0.37400935023375587, + "grad_norm": 3.191007375717163, + "learning_rate": 1.5764106056294494e-05, + "loss": 1.0998, + "step": 29920 + }, + { + "epoch": 0.37403435085877146, + "grad_norm": 5.408885478973389, + "learning_rate": 1.576339290772536e-05, + "loss": 0.9468, + "step": 29922 + }, + { + "epoch": 0.3740593514837871, + "grad_norm": 4.361241817474365, + "learning_rate": 1.5762679715263033e-05, + "loss": 1.178, + "step": 29924 + }, + { + "epoch": 0.3740843521088027, + "grad_norm": 4.334438800811768, + "learning_rate": 1.5761966478912956e-05, + "loss": 1.4999, + "step": 29926 + }, + { + "epoch": 0.37410935273381835, + "grad_norm": 7.905309200286865, + "learning_rate": 1.576125319868056e-05, + "loss": 1.5415, + "step": 29928 + }, + { + "epoch": 0.374134353358834, + "grad_norm": 2.18566632270813, + "learning_rate": 1.5760539874571272e-05, + "loss": 0.0881, + "step": 29930 + }, + { + "epoch": 0.3741593539838496, + "grad_norm": 1.7369240522384644, + "learning_rate": 1.5759826506590526e-05, + "loss": 1.0579, + "step": 29932 + }, + { + "epoch": 0.37418435460886523, + "grad_norm": 2.1605584621429443, + "learning_rate": 1.5759113094743757e-05, + "loss": 0.8818, + "step": 29934 + }, + { + "epoch": 0.3742093552338808, + "grad_norm": 4.936219215393066, + "learning_rate": 1.5758399639036396e-05, + "loss": 0.2692, + "step": 29936 + }, + { + "epoch": 0.3742343558588965, + "grad_norm": 2.1636507511138916, + "learning_rate": 1.5757686139473877e-05, + "loss": 0.3905, + "step": 29938 + }, + { + "epoch": 0.3742593564839121, + "grad_norm": 0.20804014801979065, + "learning_rate": 1.5756972596061637e-05, + "loss": 0.5978, + "step": 29940 + }, + { + "epoch": 0.3742843571089277, + "grad_norm": 3.651085138320923, + "learning_rate": 1.575625900880511e-05, + "loss": 1.2553, + "step": 29942 + }, + { + "epoch": 0.37430935773394336, + "grad_norm": 1.9422739744186401, + "learning_rate": 1.5755545377709723e-05, + "loss": 0.9864, + "step": 29944 + }, + { + "epoch": 0.37433435835895895, + "grad_norm": 0.18308410048484802, + "learning_rate": 1.5754831702780917e-05, + "loss": 0.0024, + "step": 29946 + }, + { + "epoch": 0.3743593589839746, + "grad_norm": 5.431649208068848, + "learning_rate": 1.575411798402413e-05, + "loss": 1.3558, + "step": 29948 + }, + { + "epoch": 0.37438435960899025, + "grad_norm": 4.085818767547607, + "learning_rate": 1.5753404221444792e-05, + "loss": 0.9604, + "step": 29950 + }, + { + "epoch": 0.37440936023400584, + "grad_norm": 1.5574226379394531, + "learning_rate": 1.5752690415048338e-05, + "loss": 0.1863, + "step": 29952 + }, + { + "epoch": 0.3744343608590215, + "grad_norm": 2.155975580215454, + "learning_rate": 1.5751976564840213e-05, + "loss": 1.3785, + "step": 29954 + }, + { + "epoch": 0.3744593614840371, + "grad_norm": 3.8032870292663574, + "learning_rate": 1.5751262670825848e-05, + "loss": 1.0294, + "step": 29956 + }, + { + "epoch": 0.3744843621090527, + "grad_norm": 1.9184626340866089, + "learning_rate": 1.5750548733010673e-05, + "loss": 1.0601, + "step": 29958 + }, + { + "epoch": 0.3745093627340684, + "grad_norm": 5.036995887756348, + "learning_rate": 1.5749834751400138e-05, + "loss": 1.9508, + "step": 29960 + }, + { + "epoch": 0.37453436335908397, + "grad_norm": 3.7052650451660156, + "learning_rate": 1.5749120725999674e-05, + "loss": 1.5507, + "step": 29962 + }, + { + "epoch": 0.3745593639840996, + "grad_norm": 8.773737907409668, + "learning_rate": 1.5748406656814717e-05, + "loss": 2.4984, + "step": 29964 + }, + { + "epoch": 0.3745843646091152, + "grad_norm": 1.58514404296875, + "learning_rate": 1.5747692543850707e-05, + "loss": 0.4555, + "step": 29966 + }, + { + "epoch": 0.37460936523413085, + "grad_norm": 1.500217318534851, + "learning_rate": 1.5746978387113088e-05, + "loss": 1.1074, + "step": 29968 + }, + { + "epoch": 0.3746343658591465, + "grad_norm": 3.4939005374908447, + "learning_rate": 1.574626418660729e-05, + "loss": 1.3918, + "step": 29970 + }, + { + "epoch": 0.3746593664841621, + "grad_norm": 4.42173957824707, + "learning_rate": 1.574554994233876e-05, + "loss": 1.982, + "step": 29972 + }, + { + "epoch": 0.37468436710917774, + "grad_norm": 4.204009532928467, + "learning_rate": 1.5744835654312932e-05, + "loss": 1.8733, + "step": 29974 + }, + { + "epoch": 0.37470936773419333, + "grad_norm": 2.796989679336548, + "learning_rate": 1.574412132253525e-05, + "loss": 0.816, + "step": 29976 + }, + { + "epoch": 0.374734368359209, + "grad_norm": 5.549876689910889, + "learning_rate": 1.5743406947011152e-05, + "loss": 1.2229, + "step": 29978 + }, + { + "epoch": 0.37475936898422463, + "grad_norm": 3.92712664604187, + "learning_rate": 1.5742692527746076e-05, + "loss": 1.5226, + "step": 29980 + }, + { + "epoch": 0.3747843696092402, + "grad_norm": 0.02134023979306221, + "learning_rate": 1.5741978064745466e-05, + "loss": 0.1297, + "step": 29982 + }, + { + "epoch": 0.37480937023425587, + "grad_norm": 2.2590079307556152, + "learning_rate": 1.5741263558014765e-05, + "loss": 0.614, + "step": 29984 + }, + { + "epoch": 0.37483437085927146, + "grad_norm": 0.4178447723388672, + "learning_rate": 1.5740549007559413e-05, + "loss": 0.3587, + "step": 29986 + }, + { + "epoch": 0.3748593714842871, + "grad_norm": 2.9783880710601807, + "learning_rate": 1.5739834413384853e-05, + "loss": 0.3301, + "step": 29988 + }, + { + "epoch": 0.37488437210930275, + "grad_norm": 3.3776042461395264, + "learning_rate": 1.5739119775496524e-05, + "loss": 0.8446, + "step": 29990 + }, + { + "epoch": 0.37490937273431835, + "grad_norm": 0.004662925377488136, + "learning_rate": 1.573840509389987e-05, + "loss": 0.0474, + "step": 29992 + }, + { + "epoch": 0.374934373359334, + "grad_norm": 4.168696403503418, + "learning_rate": 1.5737690368600332e-05, + "loss": 1.6946, + "step": 29994 + }, + { + "epoch": 0.3749593739843496, + "grad_norm": 4.059000492095947, + "learning_rate": 1.5736975599603358e-05, + "loss": 0.1309, + "step": 29996 + }, + { + "epoch": 0.37498437460936523, + "grad_norm": 0.004758118186146021, + "learning_rate": 1.5736260786914392e-05, + "loss": 1.0482, + "step": 29998 + }, + { + "epoch": 0.3750093752343809, + "grad_norm": 2.355820894241333, + "learning_rate": 1.573554593053887e-05, + "loss": 0.3778, + "step": 30000 + }, + { + "epoch": 0.3750343758593965, + "grad_norm": 2.4000468254089355, + "learning_rate": 1.573483103048224e-05, + "loss": 1.2412, + "step": 30002 + }, + { + "epoch": 0.3750593764844121, + "grad_norm": 2.8943116664886475, + "learning_rate": 1.5734116086749956e-05, + "loss": 0.6538, + "step": 30004 + }, + { + "epoch": 0.3750843771094277, + "grad_norm": 2.8385860919952393, + "learning_rate": 1.573340109934745e-05, + "loss": 0.6064, + "step": 30006 + }, + { + "epoch": 0.37510937773444336, + "grad_norm": 3.012650728225708, + "learning_rate": 1.573268606828017e-05, + "loss": 0.2897, + "step": 30008 + }, + { + "epoch": 0.375134378359459, + "grad_norm": 0.7748319506645203, + "learning_rate": 1.5731970993553566e-05, + "loss": 0.3756, + "step": 30010 + }, + { + "epoch": 0.3751593789844746, + "grad_norm": 3.702003240585327, + "learning_rate": 1.573125587517308e-05, + "loss": 1.4098, + "step": 30012 + }, + { + "epoch": 0.37518437960949025, + "grad_norm": 4.6875810623168945, + "learning_rate": 1.573054071314416e-05, + "loss": 1.349, + "step": 30014 + }, + { + "epoch": 0.37520938023450584, + "grad_norm": 0.006518353708088398, + "learning_rate": 1.5729825507472253e-05, + "loss": 0.593, + "step": 30016 + }, + { + "epoch": 0.3752343808595215, + "grad_norm": 6.812193870544434, + "learning_rate": 1.5729110258162807e-05, + "loss": 1.2887, + "step": 30018 + }, + { + "epoch": 0.37525938148453714, + "grad_norm": 4.9952545166015625, + "learning_rate": 1.5728394965221264e-05, + "loss": 1.5047, + "step": 30020 + }, + { + "epoch": 0.3752843821095527, + "grad_norm": 2.9590041637420654, + "learning_rate": 1.572767962865308e-05, + "loss": 0.594, + "step": 30022 + }, + { + "epoch": 0.3753093827345684, + "grad_norm": 6.558527946472168, + "learning_rate": 1.5726964248463693e-05, + "loss": 1.6897, + "step": 30024 + }, + { + "epoch": 0.37533438335958397, + "grad_norm": 0.003545608837157488, + "learning_rate": 1.572624882465856e-05, + "loss": 0.4887, + "step": 30026 + }, + { + "epoch": 0.3753593839845996, + "grad_norm": 2.1318650245666504, + "learning_rate": 1.5725533357243122e-05, + "loss": 0.7091, + "step": 30028 + }, + { + "epoch": 0.37538438460961526, + "grad_norm": 3.1223275661468506, + "learning_rate": 1.572481784622283e-05, + "loss": 1.6841, + "step": 30030 + }, + { + "epoch": 0.37540938523463085, + "grad_norm": 0.05935109779238701, + "learning_rate": 1.572410229160314e-05, + "loss": 0.642, + "step": 30032 + }, + { + "epoch": 0.3754343858596465, + "grad_norm": 2.3476455211639404, + "learning_rate": 1.5723386693389496e-05, + "loss": 0.6243, + "step": 30034 + }, + { + "epoch": 0.3754593864846621, + "grad_norm": 0.005840592551976442, + "learning_rate": 1.5722671051587344e-05, + "loss": 0.5727, + "step": 30036 + }, + { + "epoch": 0.37548438710967774, + "grad_norm": 4.082401752471924, + "learning_rate": 1.572195536620214e-05, + "loss": 1.3193, + "step": 30038 + }, + { + "epoch": 0.3755093877346934, + "grad_norm": 0.9887843132019043, + "learning_rate": 1.5721239637239337e-05, + "loss": 0.5752, + "step": 30040 + }, + { + "epoch": 0.375534388359709, + "grad_norm": 1.3198163509368896, + "learning_rate": 1.5720523864704378e-05, + "loss": 0.2447, + "step": 30042 + }, + { + "epoch": 0.37555938898472463, + "grad_norm": 2.606559991836548, + "learning_rate": 1.5719808048602717e-05, + "loss": 1.5355, + "step": 30044 + }, + { + "epoch": 0.3755843896097402, + "grad_norm": 4.56749963760376, + "learning_rate": 1.571909218893981e-05, + "loss": 0.8589, + "step": 30046 + }, + { + "epoch": 0.37560939023475587, + "grad_norm": 6.143381595611572, + "learning_rate": 1.5718376285721106e-05, + "loss": 2.5014, + "step": 30048 + }, + { + "epoch": 0.3756343908597715, + "grad_norm": 1.2210729122161865, + "learning_rate": 1.5717660338952056e-05, + "loss": 0.0663, + "step": 30050 + }, + { + "epoch": 0.3756593914847871, + "grad_norm": 0.3427819013595581, + "learning_rate": 1.5716944348638113e-05, + "loss": 1.2754, + "step": 30052 + }, + { + "epoch": 0.37568439210980276, + "grad_norm": 3.446812152862549, + "learning_rate": 1.5716228314784728e-05, + "loss": 0.6558, + "step": 30054 + }, + { + "epoch": 0.37570939273481835, + "grad_norm": 2.5785410404205322, + "learning_rate": 1.571551223739736e-05, + "loss": 1.0396, + "step": 30056 + }, + { + "epoch": 0.375734393359834, + "grad_norm": 2.430664539337158, + "learning_rate": 1.5714796116481457e-05, + "loss": 0.1029, + "step": 30058 + }, + { + "epoch": 0.37575939398484964, + "grad_norm": 2.6565804481506348, + "learning_rate": 1.571407995204248e-05, + "loss": 0.9802, + "step": 30060 + }, + { + "epoch": 0.37578439460986524, + "grad_norm": 3.2259206771850586, + "learning_rate": 1.5713363744085874e-05, + "loss": 0.8411, + "step": 30062 + }, + { + "epoch": 0.3758093952348809, + "grad_norm": 4.03872537612915, + "learning_rate": 1.5712647492617094e-05, + "loss": 1.3351, + "step": 30064 + }, + { + "epoch": 0.3758343958598965, + "grad_norm": 1.6170508861541748, + "learning_rate": 1.5711931197641605e-05, + "loss": 0.3848, + "step": 30066 + }, + { + "epoch": 0.3758593964849121, + "grad_norm": 1.9094655513763428, + "learning_rate": 1.5711214859164852e-05, + "loss": 0.1268, + "step": 30068 + }, + { + "epoch": 0.37588439710992777, + "grad_norm": 2.9100277423858643, + "learning_rate": 1.5710498477192297e-05, + "loss": 1.2013, + "step": 30070 + }, + { + "epoch": 0.37590939773494336, + "grad_norm": 3.5098018646240234, + "learning_rate": 1.570978205172939e-05, + "loss": 0.719, + "step": 30072 + }, + { + "epoch": 0.375934398359959, + "grad_norm": 2.4034929275512695, + "learning_rate": 1.570906558278159e-05, + "loss": 1.7948, + "step": 30074 + }, + { + "epoch": 0.3759593989849746, + "grad_norm": 2.1364195346832275, + "learning_rate": 1.5708349070354357e-05, + "loss": 0.8643, + "step": 30076 + }, + { + "epoch": 0.37598439960999025, + "grad_norm": 0.004130099434405565, + "learning_rate": 1.5707632514453144e-05, + "loss": 1.2916, + "step": 30078 + }, + { + "epoch": 0.3760094002350059, + "grad_norm": 4.136708736419678, + "learning_rate": 1.5706915915083406e-05, + "loss": 1.41, + "step": 30080 + }, + { + "epoch": 0.3760344008600215, + "grad_norm": 3.8749091625213623, + "learning_rate": 1.5706199272250605e-05, + "loss": 0.8454, + "step": 30082 + }, + { + "epoch": 0.37605940148503714, + "grad_norm": 3.489165782928467, + "learning_rate": 1.57054825859602e-05, + "loss": 0.9988, + "step": 30084 + }, + { + "epoch": 0.37608440211005273, + "grad_norm": 1.678958535194397, + "learning_rate": 1.5704765856217644e-05, + "loss": 0.253, + "step": 30086 + }, + { + "epoch": 0.3761094027350684, + "grad_norm": 0.013079924508929253, + "learning_rate": 1.5704049083028397e-05, + "loss": 0.7009, + "step": 30088 + }, + { + "epoch": 0.376134403360084, + "grad_norm": 2.9355692863464355, + "learning_rate": 1.570333226639792e-05, + "loss": 0.8637, + "step": 30090 + }, + { + "epoch": 0.3761594039850996, + "grad_norm": 0.08701728284358978, + "learning_rate": 1.570261540633167e-05, + "loss": 0.9123, + "step": 30092 + }, + { + "epoch": 0.37618440461011526, + "grad_norm": 2.849698305130005, + "learning_rate": 1.570189850283511e-05, + "loss": 1.0219, + "step": 30094 + }, + { + "epoch": 0.37620940523513086, + "grad_norm": 1.9788377285003662, + "learning_rate": 1.5701181555913696e-05, + "loss": 0.6534, + "step": 30096 + }, + { + "epoch": 0.3762344058601465, + "grad_norm": 2.5845108032226562, + "learning_rate": 1.570046456557289e-05, + "loss": 0.8054, + "step": 30098 + }, + { + "epoch": 0.37625940648516215, + "grad_norm": 3.283243417739868, + "learning_rate": 1.569974753181815e-05, + "loss": 0.2322, + "step": 30100 + }, + { + "epoch": 0.37628440711017774, + "grad_norm": 3.5009827613830566, + "learning_rate": 1.5699030454654938e-05, + "loss": 1.1004, + "step": 30102 + }, + { + "epoch": 0.3763094077351934, + "grad_norm": 3.7964284420013428, + "learning_rate": 1.569831333408872e-05, + "loss": 1.2073, + "step": 30104 + }, + { + "epoch": 0.376334408360209, + "grad_norm": 1.6818153858184814, + "learning_rate": 1.5697596170124954e-05, + "loss": 0.2767, + "step": 30106 + }, + { + "epoch": 0.37635940898522463, + "grad_norm": 3.32841157913208, + "learning_rate": 1.56968789627691e-05, + "loss": 0.6829, + "step": 30108 + }, + { + "epoch": 0.3763844096102403, + "grad_norm": 2.3339264392852783, + "learning_rate": 1.5696161712026616e-05, + "loss": 0.1002, + "step": 30110 + }, + { + "epoch": 0.37640941023525587, + "grad_norm": 3.9847681522369385, + "learning_rate": 1.5695444417902975e-05, + "loss": 1.8767, + "step": 30112 + }, + { + "epoch": 0.3764344108602715, + "grad_norm": 3.839246988296509, + "learning_rate": 1.5694727080403635e-05, + "loss": 2.0871, + "step": 30114 + }, + { + "epoch": 0.3764594114852871, + "grad_norm": 1.0880272388458252, + "learning_rate": 1.5694009699534056e-05, + "loss": 0.623, + "step": 30116 + }, + { + "epoch": 0.37648441211030276, + "grad_norm": 2.3498880863189697, + "learning_rate": 1.5693292275299707e-05, + "loss": 0.7695, + "step": 30118 + }, + { + "epoch": 0.3765094127353184, + "grad_norm": 0.0037840171717107296, + "learning_rate": 1.5692574807706046e-05, + "loss": 0.6985, + "step": 30120 + }, + { + "epoch": 0.376534413360334, + "grad_norm": 0.022401485592126846, + "learning_rate": 1.5691857296758546e-05, + "loss": 0.0195, + "step": 30122 + }, + { + "epoch": 0.37655941398534964, + "grad_norm": 0.8602520823478699, + "learning_rate": 1.5691139742462664e-05, + "loss": 0.6847, + "step": 30124 + }, + { + "epoch": 0.37658441461036524, + "grad_norm": 4.507165431976318, + "learning_rate": 1.5690422144823862e-05, + "loss": 1.848, + "step": 30126 + }, + { + "epoch": 0.3766094152353809, + "grad_norm": 3.6480753421783447, + "learning_rate": 1.5689704503847612e-05, + "loss": 0.8396, + "step": 30128 + }, + { + "epoch": 0.37663441586039653, + "grad_norm": 0.07669135183095932, + "learning_rate": 1.568898681953938e-05, + "loss": 0.9776, + "step": 30130 + }, + { + "epoch": 0.3766594164854121, + "grad_norm": 4.993127346038818, + "learning_rate": 1.5688269091904625e-05, + "loss": 1.6035, + "step": 30132 + }, + { + "epoch": 0.37668441711042777, + "grad_norm": 0.7404353618621826, + "learning_rate": 1.568755132094882e-05, + "loss": 1.2976, + "step": 30134 + }, + { + "epoch": 0.37670941773544336, + "grad_norm": 1.0858532190322876, + "learning_rate": 1.5686833506677424e-05, + "loss": 0.55, + "step": 30136 + }, + { + "epoch": 0.376734418360459, + "grad_norm": 4.103649139404297, + "learning_rate": 1.5686115649095913e-05, + "loss": 1.638, + "step": 30138 + }, + { + "epoch": 0.37675941898547466, + "grad_norm": 2.6968321800231934, + "learning_rate": 1.5685397748209744e-05, + "loss": 1.2665, + "step": 30140 + }, + { + "epoch": 0.37678441961049025, + "grad_norm": 3.907358407974243, + "learning_rate": 1.5684679804024394e-05, + "loss": 0.4032, + "step": 30142 + }, + { + "epoch": 0.3768094202355059, + "grad_norm": 3.3944931030273438, + "learning_rate": 1.568396181654533e-05, + "loss": 0.4561, + "step": 30144 + }, + { + "epoch": 0.3768344208605215, + "grad_norm": 8.941263198852539, + "learning_rate": 1.5683243785778008e-05, + "loss": 2.0749, + "step": 30146 + }, + { + "epoch": 0.37685942148553714, + "grad_norm": 2.0437636375427246, + "learning_rate": 1.568252571172791e-05, + "loss": 0.7584, + "step": 30148 + }, + { + "epoch": 0.3768844221105528, + "grad_norm": 0.7551707625389099, + "learning_rate": 1.56818075944005e-05, + "loss": 0.9445, + "step": 30150 + }, + { + "epoch": 0.3769094227355684, + "grad_norm": 4.075037956237793, + "learning_rate": 1.5681089433801244e-05, + "loss": 1.6246, + "step": 30152 + }, + { + "epoch": 0.376934423360584, + "grad_norm": 1.8617414236068726, + "learning_rate": 1.5680371229935615e-05, + "loss": 0.8835, + "step": 30154 + }, + { + "epoch": 0.3769594239855996, + "grad_norm": 1.1489031314849854, + "learning_rate": 1.5679652982809087e-05, + "loss": 0.5972, + "step": 30156 + }, + { + "epoch": 0.37698442461061527, + "grad_norm": 1.4466768503189087, + "learning_rate": 1.5678934692427116e-05, + "loss": 0.2422, + "step": 30158 + }, + { + "epoch": 0.3770094252356309, + "grad_norm": 1.7563719749450684, + "learning_rate": 1.567821635879519e-05, + "loss": 0.3217, + "step": 30160 + }, + { + "epoch": 0.3770344258606465, + "grad_norm": 0.9575015902519226, + "learning_rate": 1.5677497981918765e-05, + "loss": 1.255, + "step": 30162 + }, + { + "epoch": 0.37705942648566215, + "grad_norm": 0.0029797616880387068, + "learning_rate": 1.567677956180332e-05, + "loss": 0.1318, + "step": 30164 + }, + { + "epoch": 0.37708442711067774, + "grad_norm": 1.9725611209869385, + "learning_rate": 1.567606109845433e-05, + "loss": 0.827, + "step": 30166 + }, + { + "epoch": 0.3771094277356934, + "grad_norm": 1.4955620765686035, + "learning_rate": 1.5675342591877256e-05, + "loss": 1.1615, + "step": 30168 + }, + { + "epoch": 0.37713442836070904, + "grad_norm": 0.0019542023073881865, + "learning_rate": 1.5674624042077574e-05, + "loss": 0.0001, + "step": 30170 + }, + { + "epoch": 0.37715942898572463, + "grad_norm": 3.1351394653320312, + "learning_rate": 1.567390544906076e-05, + "loss": 1.199, + "step": 30172 + }, + { + "epoch": 0.3771844296107403, + "grad_norm": 1.932847023010254, + "learning_rate": 1.5673186812832284e-05, + "loss": 0.4767, + "step": 30174 + }, + { + "epoch": 0.37720943023575587, + "grad_norm": 3.159101724624634, + "learning_rate": 1.567246813339762e-05, + "loss": 0.5579, + "step": 30176 + }, + { + "epoch": 0.3772344308607715, + "grad_norm": 0.003215892007574439, + "learning_rate": 1.5671749410762243e-05, + "loss": 0.1113, + "step": 30178 + }, + { + "epoch": 0.37725943148578717, + "grad_norm": 4.423730850219727, + "learning_rate": 1.5671030644931623e-05, + "loss": 2.4134, + "step": 30180 + }, + { + "epoch": 0.37728443211080276, + "grad_norm": 2.017984390258789, + "learning_rate": 1.5670311835911233e-05, + "loss": 0.6471, + "step": 30182 + }, + { + "epoch": 0.3773094327358184, + "grad_norm": 3.987037181854248, + "learning_rate": 1.5669592983706553e-05, + "loss": 0.4607, + "step": 30184 + }, + { + "epoch": 0.377334433360834, + "grad_norm": 0.002364946762099862, + "learning_rate": 1.5668874088323053e-05, + "loss": 0.4546, + "step": 30186 + }, + { + "epoch": 0.37735943398584965, + "grad_norm": 0.04651040583848953, + "learning_rate": 1.566815514976621e-05, + "loss": 0.9012, + "step": 30188 + }, + { + "epoch": 0.3773844346108653, + "grad_norm": 0.028420133516192436, + "learning_rate": 1.5667436168041498e-05, + "loss": 0.5214, + "step": 30190 + }, + { + "epoch": 0.3774094352358809, + "grad_norm": 2.6479694843292236, + "learning_rate": 1.5666717143154396e-05, + "loss": 0.8774, + "step": 30192 + }, + { + "epoch": 0.37743443586089653, + "grad_norm": 4.210768699645996, + "learning_rate": 1.5665998075110376e-05, + "loss": 0.5427, + "step": 30194 + }, + { + "epoch": 0.3774594364859121, + "grad_norm": 9.692893028259277, + "learning_rate": 1.5665278963914916e-05, + "loss": 1.0264, + "step": 30196 + }, + { + "epoch": 0.3774844371109278, + "grad_norm": 3.6187522411346436, + "learning_rate": 1.566455980957349e-05, + "loss": 1.4557, + "step": 30198 + }, + { + "epoch": 0.3775094377359434, + "grad_norm": 0.917295515537262, + "learning_rate": 1.5663840612091585e-05, + "loss": 0.1558, + "step": 30200 + }, + { + "epoch": 0.377534438360959, + "grad_norm": 3.78959584236145, + "learning_rate": 1.5663121371474664e-05, + "loss": 1.9252, + "step": 30202 + }, + { + "epoch": 0.37755943898597466, + "grad_norm": 2.537977457046509, + "learning_rate": 1.5662402087728215e-05, + "loss": 1.4628, + "step": 30204 + }, + { + "epoch": 0.37758443961099025, + "grad_norm": 2.8785107135772705, + "learning_rate": 1.566168276085771e-05, + "loss": 0.57, + "step": 30206 + }, + { + "epoch": 0.3776094402360059, + "grad_norm": 3.110352039337158, + "learning_rate": 1.5660963390868632e-05, + "loss": 1.4382, + "step": 30208 + }, + { + "epoch": 0.37763444086102155, + "grad_norm": 1.6876325607299805, + "learning_rate": 1.5660243977766452e-05, + "loss": 0.3946, + "step": 30210 + }, + { + "epoch": 0.37765944148603714, + "grad_norm": 0.6215665340423584, + "learning_rate": 1.565952452155666e-05, + "loss": 0.1583, + "step": 30212 + }, + { + "epoch": 0.3776844421110528, + "grad_norm": 0.03384111821651459, + "learning_rate": 1.565880502224473e-05, + "loss": 0.669, + "step": 30214 + }, + { + "epoch": 0.3777094427360684, + "grad_norm": 4.339939117431641, + "learning_rate": 1.565808547983614e-05, + "loss": 0.8812, + "step": 30216 + }, + { + "epoch": 0.377734443361084, + "grad_norm": 0.003515461925417185, + "learning_rate": 1.5657365894336367e-05, + "loss": 0.8141, + "step": 30218 + }, + { + "epoch": 0.3777594439860997, + "grad_norm": 2.9046590328216553, + "learning_rate": 1.56566462657509e-05, + "loss": 0.8107, + "step": 30220 + }, + { + "epoch": 0.37778444461111527, + "grad_norm": 0.009207959286868572, + "learning_rate": 1.5655926594085214e-05, + "loss": 0.7056, + "step": 30222 + }, + { + "epoch": 0.3778094452361309, + "grad_norm": 5.049124240875244, + "learning_rate": 1.565520687934479e-05, + "loss": 2.0017, + "step": 30224 + }, + { + "epoch": 0.3778344458611465, + "grad_norm": 2.991636276245117, + "learning_rate": 1.5654487121535108e-05, + "loss": 1.7278, + "step": 30226 + }, + { + "epoch": 0.37785944648616215, + "grad_norm": 6.008134841918945, + "learning_rate": 1.5653767320661658e-05, + "loss": 0.9191, + "step": 30228 + }, + { + "epoch": 0.3778844471111778, + "grad_norm": 2.0584022998809814, + "learning_rate": 1.5653047476729912e-05, + "loss": 1.1027, + "step": 30230 + }, + { + "epoch": 0.3779094477361934, + "grad_norm": 5.006610870361328, + "learning_rate": 1.5652327589745354e-05, + "loss": 0.9491, + "step": 30232 + }, + { + "epoch": 0.37793444836120904, + "grad_norm": 2.8691036701202393, + "learning_rate": 1.5651607659713468e-05, + "loss": 1.1243, + "step": 30234 + }, + { + "epoch": 0.37795944898622463, + "grad_norm": 13.539953231811523, + "learning_rate": 1.565088768663974e-05, + "loss": 0.6452, + "step": 30236 + }, + { + "epoch": 0.3779844496112403, + "grad_norm": 1.2906296253204346, + "learning_rate": 1.565016767052965e-05, + "loss": 0.8315, + "step": 30238 + }, + { + "epoch": 0.37800945023625593, + "grad_norm": 4.357545852661133, + "learning_rate": 1.564944761138868e-05, + "loss": 0.5068, + "step": 30240 + }, + { + "epoch": 0.3780344508612715, + "grad_norm": 3.453674793243408, + "learning_rate": 1.564872750922232e-05, + "loss": 0.5672, + "step": 30242 + }, + { + "epoch": 0.37805945148628717, + "grad_norm": 4.935675621032715, + "learning_rate": 1.5648007364036047e-05, + "loss": 1.0768, + "step": 30244 + }, + { + "epoch": 0.37808445211130276, + "grad_norm": 1.400139570236206, + "learning_rate": 1.5647287175835352e-05, + "loss": 0.0768, + "step": 30246 + }, + { + "epoch": 0.3781094527363184, + "grad_norm": 5.1470232009887695, + "learning_rate": 1.5646566944625712e-05, + "loss": 2.1298, + "step": 30248 + }, + { + "epoch": 0.37813445336133406, + "grad_norm": 1.9457874298095703, + "learning_rate": 1.564584667041262e-05, + "loss": 1.0206, + "step": 30250 + }, + { + "epoch": 0.37815945398634965, + "grad_norm": 0.0014074454084038734, + "learning_rate": 1.5645126353201553e-05, + "loss": 0.552, + "step": 30252 + }, + { + "epoch": 0.3781844546113653, + "grad_norm": 1.7864265441894531, + "learning_rate": 1.5644405992998007e-05, + "loss": 1.0273, + "step": 30254 + }, + { + "epoch": 0.3782094552363809, + "grad_norm": 3.0537219047546387, + "learning_rate": 1.5643685589807462e-05, + "loss": 0.7948, + "step": 30256 + }, + { + "epoch": 0.37823445586139653, + "grad_norm": 5.684289455413818, + "learning_rate": 1.5642965143635405e-05, + "loss": 1.1535, + "step": 30258 + }, + { + "epoch": 0.3782594564864122, + "grad_norm": 10.754509925842285, + "learning_rate": 1.5642244654487323e-05, + "loss": 2.6024, + "step": 30260 + }, + { + "epoch": 0.3782844571114278, + "grad_norm": 5.522238731384277, + "learning_rate": 1.5641524122368707e-05, + "loss": 0.8163, + "step": 30262 + }, + { + "epoch": 0.3783094577364434, + "grad_norm": 3.1803793907165527, + "learning_rate": 1.5640803547285036e-05, + "loss": 1.6795, + "step": 30264 + }, + { + "epoch": 0.378334458361459, + "grad_norm": 4.307185649871826, + "learning_rate": 1.5640082929241805e-05, + "loss": 0.89, + "step": 30266 + }, + { + "epoch": 0.37835945898647466, + "grad_norm": 3.426630735397339, + "learning_rate": 1.5639362268244503e-05, + "loss": 1.0386, + "step": 30268 + }, + { + "epoch": 0.3783844596114903, + "grad_norm": 0.457202672958374, + "learning_rate": 1.5638641564298613e-05, + "loss": 0.4682, + "step": 30270 + }, + { + "epoch": 0.3784094602365059, + "grad_norm": 0.0033702992368489504, + "learning_rate": 1.5637920817409623e-05, + "loss": 0.8846, + "step": 30272 + }, + { + "epoch": 0.37843446086152155, + "grad_norm": 2.6239986419677734, + "learning_rate": 1.563720002758303e-05, + "loss": 0.7104, + "step": 30274 + }, + { + "epoch": 0.37845946148653714, + "grad_norm": 1.1969410181045532, + "learning_rate": 1.5636479194824318e-05, + "loss": 0.618, + "step": 30276 + }, + { + "epoch": 0.3784844621115528, + "grad_norm": 0.003446402493864298, + "learning_rate": 1.5635758319138977e-05, + "loss": 0.8438, + "step": 30278 + }, + { + "epoch": 0.37850946273656844, + "grad_norm": 3.2389655113220215, + "learning_rate": 1.56350374005325e-05, + "loss": 0.635, + "step": 30280 + }, + { + "epoch": 0.37853446336158403, + "grad_norm": 0.021431926637887955, + "learning_rate": 1.563431643901037e-05, + "loss": 0.1758, + "step": 30282 + }, + { + "epoch": 0.3785594639865997, + "grad_norm": 2.666109800338745, + "learning_rate": 1.5633595434578085e-05, + "loss": 1.0391, + "step": 30284 + }, + { + "epoch": 0.37858446461161527, + "grad_norm": 2.204163074493408, + "learning_rate": 1.5632874387241132e-05, + "loss": 1.3129, + "step": 30286 + }, + { + "epoch": 0.3786094652366309, + "grad_norm": 0.006110445596277714, + "learning_rate": 1.563215329700501e-05, + "loss": 0.0314, + "step": 30288 + }, + { + "epoch": 0.37863446586164656, + "grad_norm": 0.5272877216339111, + "learning_rate": 1.56314321638752e-05, + "loss": 0.7141, + "step": 30290 + }, + { + "epoch": 0.37865946648666216, + "grad_norm": 0.0069760591723024845, + "learning_rate": 1.5630710987857196e-05, + "loss": 0.2669, + "step": 30292 + }, + { + "epoch": 0.3786844671116778, + "grad_norm": 2.6813063621520996, + "learning_rate": 1.56299897689565e-05, + "loss": 0.6417, + "step": 30294 + }, + { + "epoch": 0.3787094677366934, + "grad_norm": 4.577996730804443, + "learning_rate": 1.5629268507178594e-05, + "loss": 0.8251, + "step": 30296 + }, + { + "epoch": 0.37873446836170904, + "grad_norm": 1.4831849336624146, + "learning_rate": 1.5628547202528976e-05, + "loss": 1.054, + "step": 30298 + }, + { + "epoch": 0.3787594689867247, + "grad_norm": 3.4204814434051514, + "learning_rate": 1.562782585501314e-05, + "loss": 1.1327, + "step": 30300 + }, + { + "epoch": 0.3787844696117403, + "grad_norm": 1.920538067817688, + "learning_rate": 1.5627104464636578e-05, + "loss": 0.2231, + "step": 30302 + }, + { + "epoch": 0.37880947023675593, + "grad_norm": 3.3748528957366943, + "learning_rate": 1.5626383031404783e-05, + "loss": 1.5056, + "step": 30304 + }, + { + "epoch": 0.3788344708617715, + "grad_norm": 0.003974251449108124, + "learning_rate": 1.5625661555323252e-05, + "loss": 0.0667, + "step": 30306 + }, + { + "epoch": 0.37885947148678717, + "grad_norm": 5.407590389251709, + "learning_rate": 1.562494003639748e-05, + "loss": 1.758, + "step": 30308 + }, + { + "epoch": 0.3788844721118028, + "grad_norm": 2.995112657546997, + "learning_rate": 1.562421847463295e-05, + "loss": 0.6711, + "step": 30310 + }, + { + "epoch": 0.3789094727368184, + "grad_norm": 3.45449161529541, + "learning_rate": 1.5623496870035177e-05, + "loss": 2.0626, + "step": 30312 + }, + { + "epoch": 0.37893447336183406, + "grad_norm": 3.0986969470977783, + "learning_rate": 1.5622775222609648e-05, + "loss": 0.8068, + "step": 30314 + }, + { + "epoch": 0.37895947398684965, + "grad_norm": 1.193084955215454, + "learning_rate": 1.5622053532361853e-05, + "loss": 0.3637, + "step": 30316 + }, + { + "epoch": 0.3789844746118653, + "grad_norm": 2.9809482097625732, + "learning_rate": 1.562133179929729e-05, + "loss": 0.568, + "step": 30318 + }, + { + "epoch": 0.37900947523688094, + "grad_norm": 3.4037926197052, + "learning_rate": 1.562061002342147e-05, + "loss": 0.8602, + "step": 30320 + }, + { + "epoch": 0.37903447586189654, + "grad_norm": 0.0019136398332193494, + "learning_rate": 1.561988820473987e-05, + "loss": 0.2332, + "step": 30322 + }, + { + "epoch": 0.3790594764869122, + "grad_norm": 5.6287055015563965, + "learning_rate": 1.5619166343258e-05, + "loss": 1.8227, + "step": 30324 + }, + { + "epoch": 0.3790844771119278, + "grad_norm": 3.8021843433380127, + "learning_rate": 1.5618444438981354e-05, + "loss": 2.0877, + "step": 30326 + }, + { + "epoch": 0.3791094777369434, + "grad_norm": 4.5018720626831055, + "learning_rate": 1.5617722491915426e-05, + "loss": 1.246, + "step": 30328 + }, + { + "epoch": 0.37913447836195907, + "grad_norm": 0.17228859663009644, + "learning_rate": 1.561700050206572e-05, + "loss": 1.0069, + "step": 30330 + }, + { + "epoch": 0.37915947898697466, + "grad_norm": 1.5001697540283203, + "learning_rate": 1.5616278469437733e-05, + "loss": 1.239, + "step": 30332 + }, + { + "epoch": 0.3791844796119903, + "grad_norm": 2.139738082885742, + "learning_rate": 1.5615556394036962e-05, + "loss": 0.1615, + "step": 30334 + }, + { + "epoch": 0.3792094802370059, + "grad_norm": 1.9787238836288452, + "learning_rate": 1.561483427586891e-05, + "loss": 0.5687, + "step": 30336 + }, + { + "epoch": 0.37923448086202155, + "grad_norm": 2.3993842601776123, + "learning_rate": 1.5614112114939074e-05, + "loss": 0.6759, + "step": 30338 + }, + { + "epoch": 0.3792594814870372, + "grad_norm": 2.785216808319092, + "learning_rate": 1.561338991125295e-05, + "loss": 0.8447, + "step": 30340 + }, + { + "epoch": 0.3792844821120528, + "grad_norm": 5.036620140075684, + "learning_rate": 1.561266766481605e-05, + "loss": 0.9412, + "step": 30342 + }, + { + "epoch": 0.37930948273706844, + "grad_norm": 0.0025030304677784443, + "learning_rate": 1.561194537563386e-05, + "loss": 0.0002, + "step": 30344 + }, + { + "epoch": 0.37933448336208403, + "grad_norm": 2.229783058166504, + "learning_rate": 1.561122304371189e-05, + "loss": 0.7667, + "step": 30346 + }, + { + "epoch": 0.3793594839870997, + "grad_norm": 0.006351475138217211, + "learning_rate": 1.5610500669055637e-05, + "loss": 0.578, + "step": 30348 + }, + { + "epoch": 0.3793844846121153, + "grad_norm": 1.6709269285202026, + "learning_rate": 1.5609778251670603e-05, + "loss": 0.0561, + "step": 30350 + }, + { + "epoch": 0.3794094852371309, + "grad_norm": 6.483983993530273, + "learning_rate": 1.5609055791562297e-05, + "loss": 2.0093, + "step": 30352 + }, + { + "epoch": 0.37943448586214656, + "grad_norm": 4.535984039306641, + "learning_rate": 1.5608333288736213e-05, + "loss": 0.5735, + "step": 30354 + }, + { + "epoch": 0.37945948648716216, + "grad_norm": 4.611697196960449, + "learning_rate": 1.5607610743197855e-05, + "loss": 2.139, + "step": 30356 + }, + { + "epoch": 0.3794844871121778, + "grad_norm": 4.263829708099365, + "learning_rate": 1.5606888154952726e-05, + "loss": 1.4595, + "step": 30358 + }, + { + "epoch": 0.37950948773719345, + "grad_norm": 0.593062162399292, + "learning_rate": 1.560616552400633e-05, + "loss": 0.4429, + "step": 30360 + }, + { + "epoch": 0.37953448836220904, + "grad_norm": 1.960900068283081, + "learning_rate": 1.5605442850364168e-05, + "loss": 0.9905, + "step": 30362 + }, + { + "epoch": 0.3795594889872247, + "grad_norm": 0.005228148773312569, + "learning_rate": 1.560472013403175e-05, + "loss": 0.2518, + "step": 30364 + }, + { + "epoch": 0.3795844896122403, + "grad_norm": 3.5649213790893555, + "learning_rate": 1.5603997375014574e-05, + "loss": 1.6797, + "step": 30366 + }, + { + "epoch": 0.37960949023725593, + "grad_norm": 3.2763240337371826, + "learning_rate": 1.560327457331815e-05, + "loss": 1.3192, + "step": 30368 + }, + { + "epoch": 0.3796344908622716, + "grad_norm": 2.5512332916259766, + "learning_rate": 1.5602551728947974e-05, + "loss": 0.7676, + "step": 30370 + }, + { + "epoch": 0.37965949148728717, + "grad_norm": 2.1918632984161377, + "learning_rate": 1.560182884190956e-05, + "loss": 0.3858, + "step": 30372 + }, + { + "epoch": 0.3796844921123028, + "grad_norm": 3.1123743057250977, + "learning_rate": 1.5601105912208405e-05, + "loss": 0.5066, + "step": 30374 + }, + { + "epoch": 0.3797094927373184, + "grad_norm": 0.002112706657499075, + "learning_rate": 1.5600382939850023e-05, + "loss": 0.7173, + "step": 30376 + }, + { + "epoch": 0.37973449336233406, + "grad_norm": 1.8085726499557495, + "learning_rate": 1.5599659924839917e-05, + "loss": 0.1206, + "step": 30378 + }, + { + "epoch": 0.3797594939873497, + "grad_norm": 4.297982215881348, + "learning_rate": 1.559893686718359e-05, + "loss": 0.5011, + "step": 30380 + }, + { + "epoch": 0.3797844946123653, + "grad_norm": 3.5286624431610107, + "learning_rate": 1.5598213766886556e-05, + "loss": 0.9214, + "step": 30382 + }, + { + "epoch": 0.37980949523738095, + "grad_norm": 6.8975629806518555, + "learning_rate": 1.5597490623954312e-05, + "loss": 1.1969, + "step": 30384 + }, + { + "epoch": 0.37983449586239654, + "grad_norm": 3.243635416030884, + "learning_rate": 1.5596767438392374e-05, + "loss": 0.7983, + "step": 30386 + }, + { + "epoch": 0.3798594964874122, + "grad_norm": 0.24905182421207428, + "learning_rate": 1.559604421020625e-05, + "loss": 0.0083, + "step": 30388 + }, + { + "epoch": 0.37988449711242783, + "grad_norm": 3.282480239868164, + "learning_rate": 1.5595320939401437e-05, + "loss": 0.3597, + "step": 30390 + }, + { + "epoch": 0.3799094977374434, + "grad_norm": 0.03261006996035576, + "learning_rate": 1.559459762598346e-05, + "loss": 1.0875, + "step": 30392 + }, + { + "epoch": 0.37993449836245907, + "grad_norm": 2.2195262908935547, + "learning_rate": 1.5593874269957812e-05, + "loss": 0.9963, + "step": 30394 + }, + { + "epoch": 0.37995949898747466, + "grad_norm": 2.4964118003845215, + "learning_rate": 1.559315087133001e-05, + "loss": 1.4982, + "step": 30396 + }, + { + "epoch": 0.3799844996124903, + "grad_norm": 0.062103286385536194, + "learning_rate": 1.559242743010556e-05, + "loss": 0.0332, + "step": 30398 + }, + { + "epoch": 0.38000950023750596, + "grad_norm": 3.283769130706787, + "learning_rate": 1.559170394628998e-05, + "loss": 0.8008, + "step": 30400 + }, + { + "epoch": 0.38003450086252155, + "grad_norm": 3.741773843765259, + "learning_rate": 1.5590980419888766e-05, + "loss": 2.1266, + "step": 30402 + }, + { + "epoch": 0.3800595014875372, + "grad_norm": 4.782905578613281, + "learning_rate": 1.559025685090744e-05, + "loss": 1.1395, + "step": 30404 + }, + { + "epoch": 0.3800845021125528, + "grad_norm": 3.801312208175659, + "learning_rate": 1.5589533239351508e-05, + "loss": 1.8435, + "step": 30406 + }, + { + "epoch": 0.38010950273756844, + "grad_norm": 0.7724413871765137, + "learning_rate": 1.558880958522648e-05, + "loss": 0.475, + "step": 30408 + }, + { + "epoch": 0.3801345033625841, + "grad_norm": 2.385510206222534, + "learning_rate": 1.558808588853787e-05, + "loss": 1.3702, + "step": 30410 + }, + { + "epoch": 0.3801595039875997, + "grad_norm": 5.054802417755127, + "learning_rate": 1.558736214929119e-05, + "loss": 1.2538, + "step": 30412 + }, + { + "epoch": 0.3801845046126153, + "grad_norm": 0.621084451675415, + "learning_rate": 1.5586638367491947e-05, + "loss": 0.1807, + "step": 30414 + }, + { + "epoch": 0.3802095052376309, + "grad_norm": 2.905869722366333, + "learning_rate": 1.5585914543145655e-05, + "loss": 0.8179, + "step": 30416 + }, + { + "epoch": 0.38023450586264657, + "grad_norm": 2.982257604598999, + "learning_rate": 1.5585190676257827e-05, + "loss": 1.391, + "step": 30418 + }, + { + "epoch": 0.3802595064876622, + "grad_norm": 4.840310573577881, + "learning_rate": 1.558446676683398e-05, + "loss": 1.1776, + "step": 30420 + }, + { + "epoch": 0.3802845071126778, + "grad_norm": 3.780665397644043, + "learning_rate": 1.5583742814879623e-05, + "loss": 2.1785, + "step": 30422 + }, + { + "epoch": 0.38030950773769345, + "grad_norm": 3.7229256629943848, + "learning_rate": 1.5583018820400272e-05, + "loss": 0.7694, + "step": 30424 + }, + { + "epoch": 0.38033450836270905, + "grad_norm": 0.02361309714615345, + "learning_rate": 1.5582294783401435e-05, + "loss": 0.6514, + "step": 30426 + }, + { + "epoch": 0.3803595089877247, + "grad_norm": 0.4796699285507202, + "learning_rate": 1.5581570703888633e-05, + "loss": 0.0287, + "step": 30428 + }, + { + "epoch": 0.38038450961274034, + "grad_norm": 3.8246114253997803, + "learning_rate": 1.5580846581867376e-05, + "loss": 1.177, + "step": 30430 + }, + { + "epoch": 0.38040951023775593, + "grad_norm": 3.586496591567993, + "learning_rate": 1.5580122417343183e-05, + "loss": 1.198, + "step": 30432 + }, + { + "epoch": 0.3804345108627716, + "grad_norm": 0.8822323679924011, + "learning_rate": 1.5579398210321567e-05, + "loss": 0.4048, + "step": 30434 + }, + { + "epoch": 0.38045951148778717, + "grad_norm": 3.017054796218872, + "learning_rate": 1.5578673960808042e-05, + "loss": 0.1588, + "step": 30436 + }, + { + "epoch": 0.3804845121128028, + "grad_norm": 4.659743309020996, + "learning_rate": 1.5577949668808124e-05, + "loss": 1.0872, + "step": 30438 + }, + { + "epoch": 0.38050951273781847, + "grad_norm": 3.2967331409454346, + "learning_rate": 1.5577225334327332e-05, + "loss": 1.1275, + "step": 30440 + }, + { + "epoch": 0.38053451336283406, + "grad_norm": 0.07542051374912262, + "learning_rate": 1.557650095737118e-05, + "loss": 0.0985, + "step": 30442 + }, + { + "epoch": 0.3805595139878497, + "grad_norm": 2.7950730323791504, + "learning_rate": 1.5575776537945185e-05, + "loss": 0.623, + "step": 30444 + }, + { + "epoch": 0.3805845146128653, + "grad_norm": 3.960014820098877, + "learning_rate": 1.5575052076054866e-05, + "loss": 0.6857, + "step": 30446 + }, + { + "epoch": 0.38060951523788095, + "grad_norm": 1.2951486110687256, + "learning_rate": 1.5574327571705736e-05, + "loss": 0.0947, + "step": 30448 + }, + { + "epoch": 0.3806345158628966, + "grad_norm": 3.0766661167144775, + "learning_rate": 1.5573603024903315e-05, + "loss": 1.5625, + "step": 30450 + }, + { + "epoch": 0.3806595164879122, + "grad_norm": 3.066297769546509, + "learning_rate": 1.5572878435653125e-05, + "loss": 1.0412, + "step": 30452 + }, + { + "epoch": 0.38068451711292783, + "grad_norm": 5.5578389167785645, + "learning_rate": 1.557215380396068e-05, + "loss": 1.1461, + "step": 30454 + }, + { + "epoch": 0.3807095177379434, + "grad_norm": 2.4417598247528076, + "learning_rate": 1.5571429129831496e-05, + "loss": 0.74, + "step": 30456 + }, + { + "epoch": 0.3807345183629591, + "grad_norm": 2.9891464710235596, + "learning_rate": 1.55707044132711e-05, + "loss": 1.1769, + "step": 30458 + }, + { + "epoch": 0.3807595189879747, + "grad_norm": 2.6297836303710938, + "learning_rate": 1.5569979654285007e-05, + "loss": 0.6551, + "step": 30460 + }, + { + "epoch": 0.3807845196129903, + "grad_norm": 4.5774030685424805, + "learning_rate": 1.5569254852878733e-05, + "loss": 1.5394, + "step": 30462 + }, + { + "epoch": 0.38080952023800596, + "grad_norm": 3.4406375885009766, + "learning_rate": 1.5568530009057805e-05, + "loss": 1.8195, + "step": 30464 + }, + { + "epoch": 0.38083452086302155, + "grad_norm": 5.035178184509277, + "learning_rate": 1.556780512282774e-05, + "loss": 1.0592, + "step": 30466 + }, + { + "epoch": 0.3808595214880372, + "grad_norm": 9.120392799377441, + "learning_rate": 1.5567080194194054e-05, + "loss": 0.5264, + "step": 30468 + }, + { + "epoch": 0.38088452211305285, + "grad_norm": 3.4513514041900635, + "learning_rate": 1.5566355223162276e-05, + "loss": 1.0795, + "step": 30470 + }, + { + "epoch": 0.38090952273806844, + "grad_norm": 2.2440176010131836, + "learning_rate": 1.5565630209737922e-05, + "loss": 0.7284, + "step": 30472 + }, + { + "epoch": 0.3809345233630841, + "grad_norm": 4.674522399902344, + "learning_rate": 1.5564905153926518e-05, + "loss": 1.2211, + "step": 30474 + }, + { + "epoch": 0.3809595239880997, + "grad_norm": 3.696847677230835, + "learning_rate": 1.5564180055733583e-05, + "loss": 0.9401, + "step": 30476 + }, + { + "epoch": 0.3809845246131153, + "grad_norm": 2.952505111694336, + "learning_rate": 1.556345491516464e-05, + "loss": 1.3889, + "step": 30478 + }, + { + "epoch": 0.381009525238131, + "grad_norm": 4.417607307434082, + "learning_rate": 1.556272973222521e-05, + "loss": 0.5886, + "step": 30480 + }, + { + "epoch": 0.38103452586314657, + "grad_norm": 0.8666216731071472, + "learning_rate": 1.5562004506920815e-05, + "loss": 0.8306, + "step": 30482 + }, + { + "epoch": 0.3810595264881622, + "grad_norm": 6.342482089996338, + "learning_rate": 1.5561279239256984e-05, + "loss": 1.3602, + "step": 30484 + }, + { + "epoch": 0.3810845271131778, + "grad_norm": 1.8263497352600098, + "learning_rate": 1.5560553929239232e-05, + "loss": 0.3616, + "step": 30486 + }, + { + "epoch": 0.38110952773819345, + "grad_norm": 3.9828989505767822, + "learning_rate": 1.555982857687309e-05, + "loss": 0.4954, + "step": 30488 + }, + { + "epoch": 0.3811345283632091, + "grad_norm": 0.015814276412129402, + "learning_rate": 1.555910318216408e-05, + "loss": 0.3232, + "step": 30490 + }, + { + "epoch": 0.3811595289882247, + "grad_norm": 9.27421760559082, + "learning_rate": 1.5558377745117727e-05, + "loss": 1.5923, + "step": 30492 + }, + { + "epoch": 0.38118452961324034, + "grad_norm": 1.6238170862197876, + "learning_rate": 1.5557652265739552e-05, + "loss": 0.6345, + "step": 30494 + }, + { + "epoch": 0.38120953023825593, + "grad_norm": 2.977382183074951, + "learning_rate": 1.5556926744035087e-05, + "loss": 1.2827, + "step": 30496 + }, + { + "epoch": 0.3812345308632716, + "grad_norm": 2.6862621307373047, + "learning_rate": 1.5556201180009854e-05, + "loss": 0.6687, + "step": 30498 + }, + { + "epoch": 0.38125953148828723, + "grad_norm": 1.823943853378296, + "learning_rate": 1.5555475573669373e-05, + "loss": 1.1227, + "step": 30500 + }, + { + "epoch": 0.3812845321133028, + "grad_norm": 2.380714178085327, + "learning_rate": 1.5554749925019182e-05, + "loss": 0.7677, + "step": 30502 + }, + { + "epoch": 0.38130953273831847, + "grad_norm": 2.343120813369751, + "learning_rate": 1.5554024234064798e-05, + "loss": 0.4658, + "step": 30504 + }, + { + "epoch": 0.38133453336333406, + "grad_norm": 2.0080480575561523, + "learning_rate": 1.5553298500811754e-05, + "loss": 0.3586, + "step": 30506 + }, + { + "epoch": 0.3813595339883497, + "grad_norm": 0.3630448877811432, + "learning_rate": 1.555257272526557e-05, + "loss": 0.8836, + "step": 30508 + }, + { + "epoch": 0.38138453461336536, + "grad_norm": 4.995911121368408, + "learning_rate": 1.5551846907431782e-05, + "loss": 2.0074, + "step": 30510 + }, + { + "epoch": 0.38140953523838095, + "grad_norm": 5.010340690612793, + "learning_rate": 1.5551121047315906e-05, + "loss": 2.2241, + "step": 30512 + }, + { + "epoch": 0.3814345358633966, + "grad_norm": 3.288844585418701, + "learning_rate": 1.5550395144923484e-05, + "loss": 0.6202, + "step": 30514 + }, + { + "epoch": 0.3814595364884122, + "grad_norm": 1.2525967359542847, + "learning_rate": 1.5549669200260032e-05, + "loss": 0.1198, + "step": 30516 + }, + { + "epoch": 0.38148453711342784, + "grad_norm": 2.3304436206817627, + "learning_rate": 1.5548943213331086e-05, + "loss": 0.5096, + "step": 30518 + }, + { + "epoch": 0.3815095377384435, + "grad_norm": 3.2016584873199463, + "learning_rate": 1.5548217184142172e-05, + "loss": 0.8084, + "step": 30520 + }, + { + "epoch": 0.3815345383634591, + "grad_norm": 1.7923898696899414, + "learning_rate": 1.5547491112698826e-05, + "loss": 1.7838, + "step": 30522 + }, + { + "epoch": 0.3815595389884747, + "grad_norm": 2.4345052242279053, + "learning_rate": 1.554676499900657e-05, + "loss": 1.1676, + "step": 30524 + }, + { + "epoch": 0.3815845396134903, + "grad_norm": 2.031613826751709, + "learning_rate": 1.554603884307093e-05, + "loss": 1.6968, + "step": 30526 + }, + { + "epoch": 0.38160954023850596, + "grad_norm": 0.7432435154914856, + "learning_rate": 1.554531264489745e-05, + "loss": 0.2507, + "step": 30528 + }, + { + "epoch": 0.3816345408635216, + "grad_norm": 5.879244327545166, + "learning_rate": 1.5544586404491648e-05, + "loss": 1.7328, + "step": 30530 + }, + { + "epoch": 0.3816595414885372, + "grad_norm": 4.075808048248291, + "learning_rate": 1.554386012185906e-05, + "loss": 1.1333, + "step": 30532 + }, + { + "epoch": 0.38168454211355285, + "grad_norm": 9.712677955627441, + "learning_rate": 1.5543133797005224e-05, + "loss": 1.859, + "step": 30534 + }, + { + "epoch": 0.38170954273856844, + "grad_norm": 6.759766578674316, + "learning_rate": 1.554240742993566e-05, + "loss": 1.6922, + "step": 30536 + }, + { + "epoch": 0.3817345433635841, + "grad_norm": 2.914069414138794, + "learning_rate": 1.5541681020655903e-05, + "loss": 0.9358, + "step": 30538 + }, + { + "epoch": 0.38175954398859974, + "grad_norm": 5.559665679931641, + "learning_rate": 1.554095456917149e-05, + "loss": 1.2426, + "step": 30540 + }, + { + "epoch": 0.38178454461361533, + "grad_norm": 0.04288545623421669, + "learning_rate": 1.554022807548795e-05, + "loss": 0.2878, + "step": 30542 + }, + { + "epoch": 0.381809545238631, + "grad_norm": 6.4572343826293945, + "learning_rate": 1.5539501539610815e-05, + "loss": 1.6123, + "step": 30544 + }, + { + "epoch": 0.38183454586364657, + "grad_norm": 2.5340278148651123, + "learning_rate": 1.5538774961545618e-05, + "loss": 0.5248, + "step": 30546 + }, + { + "epoch": 0.3818595464886622, + "grad_norm": 2.706904411315918, + "learning_rate": 1.55380483412979e-05, + "loss": 0.8737, + "step": 30548 + }, + { + "epoch": 0.38188454711367786, + "grad_norm": 7.848823547363281, + "learning_rate": 1.5537321678873185e-05, + "loss": 1.2158, + "step": 30550 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 3.327831506729126, + "learning_rate": 1.553659497427701e-05, + "loss": 1.2138, + "step": 30552 + }, + { + "epoch": 0.3819345483637091, + "grad_norm": 6.102779865264893, + "learning_rate": 1.5535868227514917e-05, + "loss": 2.1462, + "step": 30554 + }, + { + "epoch": 0.3819595489887247, + "grad_norm": 4.469995498657227, + "learning_rate": 1.553514143859243e-05, + "loss": 0.875, + "step": 30556 + }, + { + "epoch": 0.38198454961374034, + "grad_norm": 0.07298153638839722, + "learning_rate": 1.5534414607515086e-05, + "loss": 0.0054, + "step": 30558 + }, + { + "epoch": 0.382009550238756, + "grad_norm": 0.5775812864303589, + "learning_rate": 1.5533687734288428e-05, + "loss": 0.2436, + "step": 30560 + }, + { + "epoch": 0.3820345508637716, + "grad_norm": 2.352931022644043, + "learning_rate": 1.5532960818917985e-05, + "loss": 0.8982, + "step": 30562 + }, + { + "epoch": 0.38205955148878723, + "grad_norm": 2.2256572246551514, + "learning_rate": 1.5532233861409292e-05, + "loss": 0.4322, + "step": 30564 + }, + { + "epoch": 0.3820845521138028, + "grad_norm": 2.821807384490967, + "learning_rate": 1.553150686176789e-05, + "loss": 0.3485, + "step": 30566 + }, + { + "epoch": 0.38210955273881847, + "grad_norm": 3.0997068881988525, + "learning_rate": 1.5530779819999314e-05, + "loss": 1.2017, + "step": 30568 + }, + { + "epoch": 0.3821345533638341, + "grad_norm": 2.6979427337646484, + "learning_rate": 1.55300527361091e-05, + "loss": 0.5817, + "step": 30570 + }, + { + "epoch": 0.3821595539888497, + "grad_norm": 2.6521120071411133, + "learning_rate": 1.552932561010279e-05, + "loss": 1.4942, + "step": 30572 + }, + { + "epoch": 0.38218455461386536, + "grad_norm": 3.7784337997436523, + "learning_rate": 1.5528598441985915e-05, + "loss": 1.399, + "step": 30574 + }, + { + "epoch": 0.38220955523888095, + "grad_norm": 3.3195807933807373, + "learning_rate": 1.5527871231764018e-05, + "loss": 0.8302, + "step": 30576 + }, + { + "epoch": 0.3822345558638966, + "grad_norm": 0.7137205004692078, + "learning_rate": 1.5527143979442635e-05, + "loss": 0.5421, + "step": 30578 + }, + { + "epoch": 0.38225955648891224, + "grad_norm": 0.8263937830924988, + "learning_rate": 1.5526416685027303e-05, + "loss": 0.1278, + "step": 30580 + }, + { + "epoch": 0.38228455711392784, + "grad_norm": 2.3709232807159424, + "learning_rate": 1.5525689348523564e-05, + "loss": 0.7508, + "step": 30582 + }, + { + "epoch": 0.3823095577389435, + "grad_norm": 5.661446571350098, + "learning_rate": 1.5524961969936958e-05, + "loss": 1.9042, + "step": 30584 + }, + { + "epoch": 0.3823345583639591, + "grad_norm": 8.638007164001465, + "learning_rate": 1.5524234549273022e-05, + "loss": 1.2675, + "step": 30586 + }, + { + "epoch": 0.3823595589889747, + "grad_norm": 3.2341246604919434, + "learning_rate": 1.5523507086537298e-05, + "loss": 0.9092, + "step": 30588 + }, + { + "epoch": 0.38238455961399037, + "grad_norm": 0.022355183959007263, + "learning_rate": 1.5522779581735322e-05, + "loss": 0.7143, + "step": 30590 + }, + { + "epoch": 0.38240956023900596, + "grad_norm": 2.006716012954712, + "learning_rate": 1.5522052034872638e-05, + "loss": 1.4571, + "step": 30592 + }, + { + "epoch": 0.3824345608640216, + "grad_norm": 2.1180646419525146, + "learning_rate": 1.5521324445954792e-05, + "loss": 1.0572, + "step": 30594 + }, + { + "epoch": 0.3824595614890372, + "grad_norm": 2.6358206272125244, + "learning_rate": 1.5520596814987313e-05, + "loss": 0.8229, + "step": 30596 + }, + { + "epoch": 0.38248456211405285, + "grad_norm": 0.8179665803909302, + "learning_rate": 1.551986914197575e-05, + "loss": 0.1103, + "step": 30598 + }, + { + "epoch": 0.3825095627390685, + "grad_norm": 4.248683929443359, + "learning_rate": 1.5519141426925646e-05, + "loss": 1.0227, + "step": 30600 + }, + { + "epoch": 0.3825345633640841, + "grad_norm": 3.843665599822998, + "learning_rate": 1.551841366984254e-05, + "loss": 1.8606, + "step": 30602 + }, + { + "epoch": 0.38255956398909974, + "grad_norm": 4.378719329833984, + "learning_rate": 1.551768587073198e-05, + "loss": 2.0172, + "step": 30604 + }, + { + "epoch": 0.38258456461411533, + "grad_norm": 0.27678316831588745, + "learning_rate": 1.55169580295995e-05, + "loss": 1.1779, + "step": 30606 + }, + { + "epoch": 0.382609565239131, + "grad_norm": 3.938765287399292, + "learning_rate": 1.5516230146450656e-05, + "loss": 0.6387, + "step": 30608 + }, + { + "epoch": 0.3826345658641466, + "grad_norm": 1.9210152626037598, + "learning_rate": 1.5515502221290976e-05, + "loss": 1.0547, + "step": 30610 + }, + { + "epoch": 0.3826595664891622, + "grad_norm": 1.63499116897583, + "learning_rate": 1.551477425412601e-05, + "loss": 0.3697, + "step": 30612 + }, + { + "epoch": 0.38268456711417786, + "grad_norm": 5.304969310760498, + "learning_rate": 1.551404624496131e-05, + "loss": 0.8652, + "step": 30614 + }, + { + "epoch": 0.38270956773919346, + "grad_norm": 3.532921552658081, + "learning_rate": 1.5513318193802404e-05, + "loss": 0.7642, + "step": 30616 + }, + { + "epoch": 0.3827345683642091, + "grad_norm": 0.029429204761981964, + "learning_rate": 1.5512590100654853e-05, + "loss": 0.291, + "step": 30618 + }, + { + "epoch": 0.38275956898922475, + "grad_norm": 3.759953737258911, + "learning_rate": 1.5511861965524193e-05, + "loss": 0.766, + "step": 30620 + }, + { + "epoch": 0.38278456961424034, + "grad_norm": 1.0901858806610107, + "learning_rate": 1.5511133788415972e-05, + "loss": 0.066, + "step": 30622 + }, + { + "epoch": 0.382809570239256, + "grad_norm": 0.030033597722649574, + "learning_rate": 1.5510405569335737e-05, + "loss": 0.3566, + "step": 30624 + }, + { + "epoch": 0.3828345708642716, + "grad_norm": 2.2189438343048096, + "learning_rate": 1.5509677308289034e-05, + "loss": 0.7376, + "step": 30626 + }, + { + "epoch": 0.38285957148928723, + "grad_norm": 0.38396552205085754, + "learning_rate": 1.5508949005281403e-05, + "loss": 0.9413, + "step": 30628 + }, + { + "epoch": 0.3828845721143029, + "grad_norm": 3.107097864151001, + "learning_rate": 1.5508220660318395e-05, + "loss": 1.0059, + "step": 30630 + }, + { + "epoch": 0.38290957273931847, + "grad_norm": 2.4989378452301025, + "learning_rate": 1.550749227340556e-05, + "loss": 1.2121, + "step": 30632 + }, + { + "epoch": 0.3829345733643341, + "grad_norm": 0.014176270924508572, + "learning_rate": 1.550676384454844e-05, + "loss": 1.3313, + "step": 30634 + }, + { + "epoch": 0.3829595739893497, + "grad_norm": 3.637284994125366, + "learning_rate": 1.5506035373752593e-05, + "loss": 1.1307, + "step": 30636 + }, + { + "epoch": 0.38298457461436536, + "grad_norm": 3.699939489364624, + "learning_rate": 1.550530686102355e-05, + "loss": 1.221, + "step": 30638 + }, + { + "epoch": 0.383009575239381, + "grad_norm": 1.9414199590682983, + "learning_rate": 1.5504578306366874e-05, + "loss": 1.3493, + "step": 30640 + }, + { + "epoch": 0.3830345758643966, + "grad_norm": 0.7422562837600708, + "learning_rate": 1.5503849709788107e-05, + "loss": 0.0973, + "step": 30642 + }, + { + "epoch": 0.38305957648941225, + "grad_norm": 2.158836603164673, + "learning_rate": 1.5503121071292796e-05, + "loss": 0.388, + "step": 30644 + }, + { + "epoch": 0.38308457711442784, + "grad_norm": 2.0514392852783203, + "learning_rate": 1.55023923908865e-05, + "loss": 0.7133, + "step": 30646 + }, + { + "epoch": 0.3831095777394435, + "grad_norm": 0.9519199132919312, + "learning_rate": 1.5501663668574757e-05, + "loss": 0.4774, + "step": 30648 + }, + { + "epoch": 0.38313457836445913, + "grad_norm": 3.485093832015991, + "learning_rate": 1.550093490436312e-05, + "loss": 0.755, + "step": 30650 + }, + { + "epoch": 0.3831595789894747, + "grad_norm": 5.817553520202637, + "learning_rate": 1.550020609825714e-05, + "loss": 2.1201, + "step": 30652 + }, + { + "epoch": 0.3831845796144904, + "grad_norm": 8.280889511108398, + "learning_rate": 1.5499477250262373e-05, + "loss": 3.45, + "step": 30654 + }, + { + "epoch": 0.38320958023950596, + "grad_norm": 2.9186489582061768, + "learning_rate": 1.5498748360384363e-05, + "loss": 0.8388, + "step": 30656 + }, + { + "epoch": 0.3832345808645216, + "grad_norm": 2.0275087356567383, + "learning_rate": 1.549801942862866e-05, + "loss": 0.2389, + "step": 30658 + }, + { + "epoch": 0.38325958148953726, + "grad_norm": 4.860206127166748, + "learning_rate": 1.549729045500082e-05, + "loss": 0.7279, + "step": 30660 + }, + { + "epoch": 0.38328458211455285, + "grad_norm": 4.500059604644775, + "learning_rate": 1.5496561439506396e-05, + "loss": 1.1164, + "step": 30662 + }, + { + "epoch": 0.3833095827395685, + "grad_norm": 6.237593173980713, + "learning_rate": 1.5495832382150938e-05, + "loss": 1.467, + "step": 30664 + }, + { + "epoch": 0.3833345833645841, + "grad_norm": 2.9692912101745605, + "learning_rate": 1.5495103282939994e-05, + "loss": 0.8684, + "step": 30666 + }, + { + "epoch": 0.38335958398959974, + "grad_norm": 5.928379058837891, + "learning_rate": 1.5494374141879124e-05, + "loss": 0.5271, + "step": 30668 + }, + { + "epoch": 0.3833845846146154, + "grad_norm": 4.118968963623047, + "learning_rate": 1.549364495897387e-05, + "loss": 1.8104, + "step": 30670 + }, + { + "epoch": 0.383409585239631, + "grad_norm": 6.077016830444336, + "learning_rate": 1.5492915734229802e-05, + "loss": 1.7181, + "step": 30672 + }, + { + "epoch": 0.3834345858646466, + "grad_norm": 4.778030872344971, + "learning_rate": 1.5492186467652463e-05, + "loss": 1.298, + "step": 30674 + }, + { + "epoch": 0.3834595864896622, + "grad_norm": 0.2530514895915985, + "learning_rate": 1.5491457159247403e-05, + "loss": 0.7243, + "step": 30676 + }, + { + "epoch": 0.38348458711467787, + "grad_norm": 2.4794228076934814, + "learning_rate": 1.5490727809020187e-05, + "loss": 0.2349, + "step": 30678 + }, + { + "epoch": 0.3835095877396935, + "grad_norm": 2.5980148315429688, + "learning_rate": 1.5489998416976363e-05, + "loss": 1.1414, + "step": 30680 + }, + { + "epoch": 0.3835345883647091, + "grad_norm": 2.0016257762908936, + "learning_rate": 1.5489268983121486e-05, + "loss": 0.8863, + "step": 30682 + }, + { + "epoch": 0.38355958898972475, + "grad_norm": 3.3508243560791016, + "learning_rate": 1.548853950746111e-05, + "loss": 0.4616, + "step": 30684 + }, + { + "epoch": 0.38358458961474035, + "grad_norm": 0.012997927144169807, + "learning_rate": 1.54878099900008e-05, + "loss": 0.1838, + "step": 30686 + }, + { + "epoch": 0.383609590239756, + "grad_norm": 3.6027395725250244, + "learning_rate": 1.54870804307461e-05, + "loss": 1.5036, + "step": 30688 + }, + { + "epoch": 0.38363459086477164, + "grad_norm": 3.2108285427093506, + "learning_rate": 1.5486350829702573e-05, + "loss": 0.7164, + "step": 30690 + }, + { + "epoch": 0.38365959148978723, + "grad_norm": 5.744697093963623, + "learning_rate": 1.548562118687578e-05, + "loss": 1.667, + "step": 30692 + }, + { + "epoch": 0.3836845921148029, + "grad_norm": 5.095251560211182, + "learning_rate": 1.5484891502271263e-05, + "loss": 2.4397, + "step": 30694 + }, + { + "epoch": 0.3837095927398185, + "grad_norm": 6.599201679229736, + "learning_rate": 1.5484161775894594e-05, + "loss": 1.8993, + "step": 30696 + }, + { + "epoch": 0.3837345933648341, + "grad_norm": 2.372845411300659, + "learning_rate": 1.5483432007751323e-05, + "loss": 1.0536, + "step": 30698 + }, + { + "epoch": 0.38375959398984977, + "grad_norm": 4.111481666564941, + "learning_rate": 1.5482702197847007e-05, + "loss": 1.4317, + "step": 30700 + }, + { + "epoch": 0.38378459461486536, + "grad_norm": 1.7127676010131836, + "learning_rate": 1.5481972346187206e-05, + "loss": 0.1107, + "step": 30702 + }, + { + "epoch": 0.383809595239881, + "grad_norm": 5.739137649536133, + "learning_rate": 1.548124245277748e-05, + "loss": 0.7801, + "step": 30704 + }, + { + "epoch": 0.3838345958648966, + "grad_norm": 1.5137704610824585, + "learning_rate": 1.5480512517623387e-05, + "loss": 0.4894, + "step": 30706 + }, + { + "epoch": 0.38385959648991225, + "grad_norm": 1.05022132396698, + "learning_rate": 1.5479782540730486e-05, + "loss": 0.2682, + "step": 30708 + }, + { + "epoch": 0.3838845971149279, + "grad_norm": 2.651956796646118, + "learning_rate": 1.5479052522104333e-05, + "loss": 0.4738, + "step": 30710 + }, + { + "epoch": 0.3839095977399435, + "grad_norm": 0.9427334666252136, + "learning_rate": 1.5478322461750495e-05, + "loss": 0.6221, + "step": 30712 + }, + { + "epoch": 0.38393459836495913, + "grad_norm": 0.0398935005068779, + "learning_rate": 1.5477592359674523e-05, + "loss": 0.4882, + "step": 30714 + }, + { + "epoch": 0.3839595989899747, + "grad_norm": 3.3471107482910156, + "learning_rate": 1.5476862215881986e-05, + "loss": 0.5932, + "step": 30716 + }, + { + "epoch": 0.3839845996149904, + "grad_norm": 4.051546096801758, + "learning_rate": 1.5476132030378444e-05, + "loss": 2.6883, + "step": 30718 + }, + { + "epoch": 0.384009600240006, + "grad_norm": 4.1103620529174805, + "learning_rate": 1.5475401803169448e-05, + "loss": 0.4566, + "step": 30720 + }, + { + "epoch": 0.3840346008650216, + "grad_norm": 2.224839687347412, + "learning_rate": 1.547467153426057e-05, + "loss": 0.9446, + "step": 30722 + }, + { + "epoch": 0.38405960149003726, + "grad_norm": 2.90390944480896, + "learning_rate": 1.547394122365737e-05, + "loss": 0.6803, + "step": 30724 + }, + { + "epoch": 0.38408460211505285, + "grad_norm": 3.8852977752685547, + "learning_rate": 1.5473210871365402e-05, + "loss": 0.611, + "step": 30726 + }, + { + "epoch": 0.3841096027400685, + "grad_norm": 4.797177791595459, + "learning_rate": 1.5472480477390236e-05, + "loss": 1.485, + "step": 30728 + }, + { + "epoch": 0.38413460336508415, + "grad_norm": 3.852231979370117, + "learning_rate": 1.5471750041737433e-05, + "loss": 1.2079, + "step": 30730 + }, + { + "epoch": 0.38415960399009974, + "grad_norm": 1.1168015003204346, + "learning_rate": 1.5471019564412558e-05, + "loss": 0.6104, + "step": 30732 + }, + { + "epoch": 0.3841846046151154, + "grad_norm": 4.174166679382324, + "learning_rate": 1.5470289045421166e-05, + "loss": 0.9912, + "step": 30734 + }, + { + "epoch": 0.384209605240131, + "grad_norm": 2.5950896739959717, + "learning_rate": 1.546955848476883e-05, + "loss": 1.8867, + "step": 30736 + }, + { + "epoch": 0.38423460586514663, + "grad_norm": 2.854433298110962, + "learning_rate": 1.5468827882461114e-05, + "loss": 1.0737, + "step": 30738 + }, + { + "epoch": 0.3842596064901623, + "grad_norm": 4.843914985656738, + "learning_rate": 1.5468097238503573e-05, + "loss": 0.4174, + "step": 30740 + }, + { + "epoch": 0.38428460711517787, + "grad_norm": 0.004723533522337675, + "learning_rate": 1.5467366552901773e-05, + "loss": 0.0002, + "step": 30742 + }, + { + "epoch": 0.3843096077401935, + "grad_norm": 4.4691386222839355, + "learning_rate": 1.546663582566129e-05, + "loss": 0.3444, + "step": 30744 + }, + { + "epoch": 0.3843346083652091, + "grad_norm": 4.89204740524292, + "learning_rate": 1.5465905056787678e-05, + "loss": 0.4994, + "step": 30746 + }, + { + "epoch": 0.38435960899022475, + "grad_norm": 6.326858997344971, + "learning_rate": 1.5465174246286506e-05, + "loss": 0.6897, + "step": 30748 + }, + { + "epoch": 0.3843846096152404, + "grad_norm": 7.267762184143066, + "learning_rate": 1.5464443394163345e-05, + "loss": 1.3763, + "step": 30750 + }, + { + "epoch": 0.384409610240256, + "grad_norm": 3.323974847793579, + "learning_rate": 1.5463712500423747e-05, + "loss": 1.344, + "step": 30752 + }, + { + "epoch": 0.38443461086527164, + "grad_norm": 2.758821964263916, + "learning_rate": 1.5462981565073293e-05, + "loss": 0.055, + "step": 30754 + }, + { + "epoch": 0.38445961149028723, + "grad_norm": 1.0113531351089478, + "learning_rate": 1.546225058811754e-05, + "loss": 0.1442, + "step": 30756 + }, + { + "epoch": 0.3844846121153029, + "grad_norm": 2.7135252952575684, + "learning_rate": 1.5461519569562063e-05, + "loss": 0.2551, + "step": 30758 + }, + { + "epoch": 0.38450961274031853, + "grad_norm": 3.089383840560913, + "learning_rate": 1.5460788509412425e-05, + "loss": 1.9453, + "step": 30760 + }, + { + "epoch": 0.3845346133653341, + "grad_norm": 1.3536723852157593, + "learning_rate": 1.5460057407674193e-05, + "loss": 0.1222, + "step": 30762 + }, + { + "epoch": 0.38455961399034977, + "grad_norm": 0.01065079402178526, + "learning_rate": 1.5459326264352936e-05, + "loss": 0.0002, + "step": 30764 + }, + { + "epoch": 0.38458461461536536, + "grad_norm": 3.87490177154541, + "learning_rate": 1.545859507945422e-05, + "loss": 1.1504, + "step": 30766 + }, + { + "epoch": 0.384609615240381, + "grad_norm": 2.7339510917663574, + "learning_rate": 1.5457863852983617e-05, + "loss": 1.6966, + "step": 30768 + }, + { + "epoch": 0.38463461586539666, + "grad_norm": 5.086695194244385, + "learning_rate": 1.5457132584946695e-05, + "loss": 1.6286, + "step": 30770 + }, + { + "epoch": 0.38465961649041225, + "grad_norm": 5.615190029144287, + "learning_rate": 1.5456401275349025e-05, + "loss": 0.5164, + "step": 30772 + }, + { + "epoch": 0.3846846171154279, + "grad_norm": 3.6616032123565674, + "learning_rate": 1.5455669924196172e-05, + "loss": 1.1515, + "step": 30774 + }, + { + "epoch": 0.3847096177404435, + "grad_norm": 0.002435379894450307, + "learning_rate": 1.5454938531493707e-05, + "loss": 0.8273, + "step": 30776 + }, + { + "epoch": 0.38473461836545914, + "grad_norm": 0.8839031457901001, + "learning_rate": 1.5454207097247203e-05, + "loss": 0.297, + "step": 30778 + }, + { + "epoch": 0.3847596189904748, + "grad_norm": 0.008259108290076256, + "learning_rate": 1.545347562146223e-05, + "loss": 0.0207, + "step": 30780 + }, + { + "epoch": 0.3847846196154904, + "grad_norm": 4.213405132293701, + "learning_rate": 1.545274410414436e-05, + "loss": 0.8756, + "step": 30782 + }, + { + "epoch": 0.384809620240506, + "grad_norm": 2.872231960296631, + "learning_rate": 1.545201254529916e-05, + "loss": 0.6856, + "step": 30784 + }, + { + "epoch": 0.3848346208655216, + "grad_norm": 2.729113817214966, + "learning_rate": 1.54512809449322e-05, + "loss": 1.3001, + "step": 30786 + }, + { + "epoch": 0.38485962149053726, + "grad_norm": 5.089852809906006, + "learning_rate": 1.545054930304906e-05, + "loss": 1.0815, + "step": 30788 + }, + { + "epoch": 0.3848846221155529, + "grad_norm": 2.2884631156921387, + "learning_rate": 1.5449817619655307e-05, + "loss": 1.2064, + "step": 30790 + }, + { + "epoch": 0.3849096227405685, + "grad_norm": 6.878547191619873, + "learning_rate": 1.544908589475651e-05, + "loss": 0.9596, + "step": 30792 + }, + { + "epoch": 0.38493462336558415, + "grad_norm": 3.6460201740264893, + "learning_rate": 1.5448354128358248e-05, + "loss": 0.7253, + "step": 30794 + }, + { + "epoch": 0.38495962399059974, + "grad_norm": 0.5193122029304504, + "learning_rate": 1.544762232046609e-05, + "loss": 0.0214, + "step": 30796 + }, + { + "epoch": 0.3849846246156154, + "grad_norm": 4.065112590789795, + "learning_rate": 1.5446890471085612e-05, + "loss": 0.9993, + "step": 30798 + }, + { + "epoch": 0.38500962524063104, + "grad_norm": 1.704607367515564, + "learning_rate": 1.5446158580222387e-05, + "loss": 0.0403, + "step": 30800 + }, + { + "epoch": 0.38503462586564663, + "grad_norm": 2.890660047531128, + "learning_rate": 1.5445426647881987e-05, + "loss": 1.2194, + "step": 30802 + }, + { + "epoch": 0.3850596264906623, + "grad_norm": 4.299436569213867, + "learning_rate": 1.544469467406999e-05, + "loss": 1.8703, + "step": 30804 + }, + { + "epoch": 0.38508462711567787, + "grad_norm": 5.615850925445557, + "learning_rate": 1.5443962658791963e-05, + "loss": 0.5958, + "step": 30806 + }, + { + "epoch": 0.3851096277406935, + "grad_norm": 1.098024845123291, + "learning_rate": 1.5443230602053492e-05, + "loss": 1.1325, + "step": 30808 + }, + { + "epoch": 0.38513462836570916, + "grad_norm": 0.08870072662830353, + "learning_rate": 1.5442498503860145e-05, + "loss": 0.5215, + "step": 30810 + }, + { + "epoch": 0.38515962899072476, + "grad_norm": 0.0031865190248936415, + "learning_rate": 1.5441766364217498e-05, + "loss": 0.4202, + "step": 30812 + }, + { + "epoch": 0.3851846296157404, + "grad_norm": 0.241040900349617, + "learning_rate": 1.544103418313113e-05, + "loss": 0.2214, + "step": 30814 + }, + { + "epoch": 0.385209630240756, + "grad_norm": 2.848335027694702, + "learning_rate": 1.544030196060661e-05, + "loss": 0.0908, + "step": 30816 + }, + { + "epoch": 0.38523463086577164, + "grad_norm": 2.0152852535247803, + "learning_rate": 1.5439569696649524e-05, + "loss": 0.3803, + "step": 30818 + }, + { + "epoch": 0.3852596314907873, + "grad_norm": 1.0829925537109375, + "learning_rate": 1.5438837391265443e-05, + "loss": 0.4719, + "step": 30820 + }, + { + "epoch": 0.3852846321158029, + "grad_norm": 3.1560781002044678, + "learning_rate": 1.5438105044459947e-05, + "loss": 0.3919, + "step": 30822 + }, + { + "epoch": 0.38530963274081853, + "grad_norm": 0.012265213765203953, + "learning_rate": 1.543737265623861e-05, + "loss": 0.7443, + "step": 30824 + }, + { + "epoch": 0.3853346333658341, + "grad_norm": 3.7577033042907715, + "learning_rate": 1.5436640226607012e-05, + "loss": 0.4952, + "step": 30826 + }, + { + "epoch": 0.38535963399084977, + "grad_norm": 0.06103761866688728, + "learning_rate": 1.543590775557073e-05, + "loss": 0.8016, + "step": 30828 + }, + { + "epoch": 0.3853846346158654, + "grad_norm": 5.905666351318359, + "learning_rate": 1.5435175243135345e-05, + "loss": 1.8217, + "step": 30830 + }, + { + "epoch": 0.385409635240881, + "grad_norm": 2.235238552093506, + "learning_rate": 1.5434442689306435e-05, + "loss": 0.443, + "step": 30832 + }, + { + "epoch": 0.38543463586589666, + "grad_norm": 3.795109510421753, + "learning_rate": 1.5433710094089577e-05, + "loss": 0.7572, + "step": 30834 + }, + { + "epoch": 0.38545963649091225, + "grad_norm": 5.876345634460449, + "learning_rate": 1.5432977457490352e-05, + "loss": 1.3767, + "step": 30836 + }, + { + "epoch": 0.3854846371159279, + "grad_norm": 6.919496059417725, + "learning_rate": 1.5432244779514336e-05, + "loss": 1.189, + "step": 30838 + }, + { + "epoch": 0.38550963774094354, + "grad_norm": 4.705082416534424, + "learning_rate": 1.5431512060167114e-05, + "loss": 2.237, + "step": 30840 + }, + { + "epoch": 0.38553463836595914, + "grad_norm": 3.709367036819458, + "learning_rate": 1.5430779299454262e-05, + "loss": 0.4017, + "step": 30842 + }, + { + "epoch": 0.3855596389909748, + "grad_norm": 5.0954179763793945, + "learning_rate": 1.5430046497381367e-05, + "loss": 1.5129, + "step": 30844 + }, + { + "epoch": 0.3855846396159904, + "grad_norm": 0.0014043187256902456, + "learning_rate": 1.5429313653954003e-05, + "loss": 0.7089, + "step": 30846 + }, + { + "epoch": 0.385609640241006, + "grad_norm": 4.143510818481445, + "learning_rate": 1.5428580769177757e-05, + "loss": 1.6173, + "step": 30848 + }, + { + "epoch": 0.38563464086602167, + "grad_norm": 2.6379902362823486, + "learning_rate": 1.5427847843058205e-05, + "loss": 0.5384, + "step": 30850 + }, + { + "epoch": 0.38565964149103726, + "grad_norm": 2.586515426635742, + "learning_rate": 1.542711487560093e-05, + "loss": 0.783, + "step": 30852 + }, + { + "epoch": 0.3856846421160529, + "grad_norm": 8.17601490020752, + "learning_rate": 1.542638186681152e-05, + "loss": 2.2653, + "step": 30854 + }, + { + "epoch": 0.3857096427410685, + "grad_norm": 3.054628849029541, + "learning_rate": 1.542564881669555e-05, + "loss": 1.1621, + "step": 30856 + }, + { + "epoch": 0.38573464336608415, + "grad_norm": 0.006490832194685936, + "learning_rate": 1.54249157252586e-05, + "loss": 1.3808, + "step": 30858 + }, + { + "epoch": 0.3857596439910998, + "grad_norm": 4.995968341827393, + "learning_rate": 1.5424182592506267e-05, + "loss": 1.1161, + "step": 30860 + }, + { + "epoch": 0.3857846446161154, + "grad_norm": 2.3632683753967285, + "learning_rate": 1.5423449418444128e-05, + "loss": 1.0671, + "step": 30862 + }, + { + "epoch": 0.38580964524113104, + "grad_norm": 3.12726092338562, + "learning_rate": 1.542271620307776e-05, + "loss": 1.404, + "step": 30864 + }, + { + "epoch": 0.38583464586614663, + "grad_norm": 2.442962646484375, + "learning_rate": 1.542198294641275e-05, + "loss": 0.3037, + "step": 30866 + }, + { + "epoch": 0.3858596464911623, + "grad_norm": 3.021192789077759, + "learning_rate": 1.5421249648454694e-05, + "loss": 2.0606, + "step": 30868 + }, + { + "epoch": 0.3858846471161779, + "grad_norm": 0.008412591181695461, + "learning_rate": 1.542051630920916e-05, + "loss": 0.1794, + "step": 30870 + }, + { + "epoch": 0.3859096477411935, + "grad_norm": 3.1411473751068115, + "learning_rate": 1.541978292868174e-05, + "loss": 0.5534, + "step": 30872 + }, + { + "epoch": 0.38593464836620917, + "grad_norm": 0.002505666809156537, + "learning_rate": 1.5419049506878024e-05, + "loss": 0.2695, + "step": 30874 + }, + { + "epoch": 0.38595964899122476, + "grad_norm": 3.724914789199829, + "learning_rate": 1.5418316043803592e-05, + "loss": 1.2791, + "step": 30876 + }, + { + "epoch": 0.3859846496162404, + "grad_norm": 0.0026823580265045166, + "learning_rate": 1.541758253946403e-05, + "loss": 0.2089, + "step": 30878 + }, + { + "epoch": 0.38600965024125605, + "grad_norm": 4.42105770111084, + "learning_rate": 1.5416848993864925e-05, + "loss": 1.0217, + "step": 30880 + }, + { + "epoch": 0.38603465086627164, + "grad_norm": 3.0907790660858154, + "learning_rate": 1.5416115407011868e-05, + "loss": 0.6952, + "step": 30882 + }, + { + "epoch": 0.3860596514912873, + "grad_norm": 6.557451248168945, + "learning_rate": 1.5415381778910437e-05, + "loss": 1.0511, + "step": 30884 + }, + { + "epoch": 0.3860846521163029, + "grad_norm": 2.8890743255615234, + "learning_rate": 1.5414648109566223e-05, + "loss": 1.3293, + "step": 30886 + }, + { + "epoch": 0.38610965274131853, + "grad_norm": 0.002117402385920286, + "learning_rate": 1.541391439898482e-05, + "loss": 0.0002, + "step": 30888 + }, + { + "epoch": 0.3861346533663342, + "grad_norm": 1.8415049314498901, + "learning_rate": 1.541318064717181e-05, + "loss": 1.0644, + "step": 30890 + }, + { + "epoch": 0.38615965399134977, + "grad_norm": 2.6666693687438965, + "learning_rate": 1.5412446854132775e-05, + "loss": 1.6473, + "step": 30892 + }, + { + "epoch": 0.3861846546163654, + "grad_norm": 0.8561863899230957, + "learning_rate": 1.541171301987332e-05, + "loss": 0.7835, + "step": 30894 + }, + { + "epoch": 0.386209655241381, + "grad_norm": 2.7320799827575684, + "learning_rate": 1.5410979144399017e-05, + "loss": 0.3282, + "step": 30896 + }, + { + "epoch": 0.38623465586639666, + "grad_norm": 1.2200753688812256, + "learning_rate": 1.5410245227715463e-05, + "loss": 0.2613, + "step": 30898 + }, + { + "epoch": 0.3862596564914123, + "grad_norm": 1.8118274211883545, + "learning_rate": 1.540951126982825e-05, + "loss": 0.3957, + "step": 30900 + }, + { + "epoch": 0.3862846571164279, + "grad_norm": 3.4267210960388184, + "learning_rate": 1.540877727074296e-05, + "loss": 1.0623, + "step": 30902 + }, + { + "epoch": 0.38630965774144355, + "grad_norm": 3.554197072982788, + "learning_rate": 1.540804323046519e-05, + "loss": 1.4605, + "step": 30904 + }, + { + "epoch": 0.38633465836645914, + "grad_norm": 0.0018153996206820011, + "learning_rate": 1.5407309149000524e-05, + "loss": 1.0777, + "step": 30906 + }, + { + "epoch": 0.3863596589914748, + "grad_norm": 0.07765967398881912, + "learning_rate": 1.540657502635456e-05, + "loss": 0.5986, + "step": 30908 + }, + { + "epoch": 0.38638465961649043, + "grad_norm": 4.265078544616699, + "learning_rate": 1.540584086253288e-05, + "loss": 0.6565, + "step": 30910 + }, + { + "epoch": 0.386409660241506, + "grad_norm": 0.038727227598428726, + "learning_rate": 1.5405106657541088e-05, + "loss": 0.0005, + "step": 30912 + }, + { + "epoch": 0.3864346608665217, + "grad_norm": 3.5664002895355225, + "learning_rate": 1.5404372411384764e-05, + "loss": 0.983, + "step": 30914 + }, + { + "epoch": 0.38645966149153727, + "grad_norm": 1.3591684103012085, + "learning_rate": 1.5403638124069503e-05, + "loss": 0.0416, + "step": 30916 + }, + { + "epoch": 0.3864846621165529, + "grad_norm": 0.5145912170410156, + "learning_rate": 1.54029037956009e-05, + "loss": 0.7213, + "step": 30918 + }, + { + "epoch": 0.38650966274156856, + "grad_norm": 2.0631496906280518, + "learning_rate": 1.5402169425984546e-05, + "loss": 1.0264, + "step": 30920 + }, + { + "epoch": 0.38653466336658415, + "grad_norm": 2.6533854007720947, + "learning_rate": 1.540143501522603e-05, + "loss": 1.053, + "step": 30922 + }, + { + "epoch": 0.3865596639915998, + "grad_norm": 0.004762996919453144, + "learning_rate": 1.5400700563330954e-05, + "loss": 0.1185, + "step": 30924 + }, + { + "epoch": 0.3865846646166154, + "grad_norm": 3.9394023418426514, + "learning_rate": 1.5399966070304902e-05, + "loss": 0.8712, + "step": 30926 + }, + { + "epoch": 0.38660966524163104, + "grad_norm": 3.841893434524536, + "learning_rate": 1.5399231536153476e-05, + "loss": 0.6235, + "step": 30928 + }, + { + "epoch": 0.3866346658666467, + "grad_norm": 5.022141456604004, + "learning_rate": 1.5398496960882264e-05, + "loss": 2.3806, + "step": 30930 + }, + { + "epoch": 0.3866596664916623, + "grad_norm": 7.761609077453613, + "learning_rate": 1.539776234449686e-05, + "loss": 0.6794, + "step": 30932 + }, + { + "epoch": 0.3866846671166779, + "grad_norm": 3.1887354850769043, + "learning_rate": 1.5397027687002862e-05, + "loss": 0.9343, + "step": 30934 + }, + { + "epoch": 0.3867096677416935, + "grad_norm": 2.262439250946045, + "learning_rate": 1.539629298840587e-05, + "loss": 1.1928, + "step": 30936 + }, + { + "epoch": 0.38673466836670917, + "grad_norm": 2.5116028785705566, + "learning_rate": 1.5395558248711466e-05, + "loss": 0.7182, + "step": 30938 + }, + { + "epoch": 0.3867596689917248, + "grad_norm": 0.9623367786407471, + "learning_rate": 1.539482346792526e-05, + "loss": 0.752, + "step": 30940 + }, + { + "epoch": 0.3867846696167404, + "grad_norm": 2.7451162338256836, + "learning_rate": 1.5394088646052838e-05, + "loss": 0.8674, + "step": 30942 + }, + { + "epoch": 0.38680967024175605, + "grad_norm": 1.1432170867919922, + "learning_rate": 1.53933537830998e-05, + "loss": 0.1145, + "step": 30944 + }, + { + "epoch": 0.38683467086677165, + "grad_norm": 4.398810863494873, + "learning_rate": 1.5392618879071743e-05, + "loss": 0.7953, + "step": 30946 + }, + { + "epoch": 0.3868596714917873, + "grad_norm": 3.8941192626953125, + "learning_rate": 1.539188393397426e-05, + "loss": 0.947, + "step": 30948 + }, + { + "epoch": 0.38688467211680294, + "grad_norm": 0.0012506424682214856, + "learning_rate": 1.5391148947812955e-05, + "loss": 0.286, + "step": 30950 + }, + { + "epoch": 0.38690967274181853, + "grad_norm": 3.0566821098327637, + "learning_rate": 1.5390413920593422e-05, + "loss": 0.789, + "step": 30952 + }, + { + "epoch": 0.3869346733668342, + "grad_norm": 0.34669962525367737, + "learning_rate": 1.5389678852321258e-05, + "loss": 0.9348, + "step": 30954 + }, + { + "epoch": 0.3869596739918498, + "grad_norm": 7.0295820236206055, + "learning_rate": 1.538894374300206e-05, + "loss": 2.557, + "step": 30956 + }, + { + "epoch": 0.3869846746168654, + "grad_norm": 3.331909418106079, + "learning_rate": 1.538820859264143e-05, + "loss": 1.3187, + "step": 30958 + }, + { + "epoch": 0.38700967524188107, + "grad_norm": 2.7977993488311768, + "learning_rate": 1.5387473401244966e-05, + "loss": 1.0965, + "step": 30960 + }, + { + "epoch": 0.38703467586689666, + "grad_norm": 3.0695061683654785, + "learning_rate": 1.5386738168818268e-05, + "loss": 0.9237, + "step": 30962 + }, + { + "epoch": 0.3870596764919123, + "grad_norm": 3.0037102699279785, + "learning_rate": 1.538600289536693e-05, + "loss": 2.5093, + "step": 30964 + }, + { + "epoch": 0.3870846771169279, + "grad_norm": 2.934934139251709, + "learning_rate": 1.538526758089656e-05, + "loss": 1.8629, + "step": 30966 + }, + { + "epoch": 0.38710967774194355, + "grad_norm": 4.329530239105225, + "learning_rate": 1.5384532225412753e-05, + "loss": 2.0245, + "step": 30968 + }, + { + "epoch": 0.3871346783669592, + "grad_norm": 2.9215643405914307, + "learning_rate": 1.5383796828921105e-05, + "loss": 1.6957, + "step": 30970 + }, + { + "epoch": 0.3871596789919748, + "grad_norm": 4.966194152832031, + "learning_rate": 1.538306139142723e-05, + "loss": 1.1905, + "step": 30972 + }, + { + "epoch": 0.38718467961699043, + "grad_norm": 2.455803155899048, + "learning_rate": 1.538232591293671e-05, + "loss": 0.7316, + "step": 30974 + }, + { + "epoch": 0.387209680242006, + "grad_norm": 3.3986408710479736, + "learning_rate": 1.5381590393455164e-05, + "loss": 1.8316, + "step": 30976 + }, + { + "epoch": 0.3872346808670217, + "grad_norm": 3.244638204574585, + "learning_rate": 1.5380854832988187e-05, + "loss": 0.7925, + "step": 30978 + }, + { + "epoch": 0.3872596814920373, + "grad_norm": 3.492905855178833, + "learning_rate": 1.5380119231541376e-05, + "loss": 1.3461, + "step": 30980 + }, + { + "epoch": 0.3872846821170529, + "grad_norm": 2.6782853603363037, + "learning_rate": 1.5379383589120343e-05, + "loss": 0.7153, + "step": 30982 + }, + { + "epoch": 0.38730968274206856, + "grad_norm": 3.1765084266662598, + "learning_rate": 1.5378647905730678e-05, + "loss": 1.1734, + "step": 30984 + }, + { + "epoch": 0.38733468336708415, + "grad_norm": 2.9098241329193115, + "learning_rate": 1.5377912181377998e-05, + "loss": 0.3288, + "step": 30986 + }, + { + "epoch": 0.3873596839920998, + "grad_norm": 2.9970319271087646, + "learning_rate": 1.53771764160679e-05, + "loss": 1.9008, + "step": 30988 + }, + { + "epoch": 0.38738468461711545, + "grad_norm": 3.470066547393799, + "learning_rate": 1.537644060980598e-05, + "loss": 1.2263, + "step": 30990 + }, + { + "epoch": 0.38740968524213104, + "grad_norm": 2.5295000076293945, + "learning_rate": 1.5375704762597857e-05, + "loss": 0.332, + "step": 30992 + }, + { + "epoch": 0.3874346858671467, + "grad_norm": 0.6158449649810791, + "learning_rate": 1.5374968874449124e-05, + "loss": 0.8406, + "step": 30994 + }, + { + "epoch": 0.3874596864921623, + "grad_norm": 3.093970775604248, + "learning_rate": 1.5374232945365382e-05, + "loss": 1.3456, + "step": 30996 + }, + { + "epoch": 0.38748468711717793, + "grad_norm": 1.6118558645248413, + "learning_rate": 1.5373496975352245e-05, + "loss": 0.1183, + "step": 30998 + }, + { + "epoch": 0.3875096877421936, + "grad_norm": 2.49513578414917, + "learning_rate": 1.537276096441532e-05, + "loss": 0.4277, + "step": 31000 + }, + { + "epoch": 0.38753468836720917, + "grad_norm": 3.8257946968078613, + "learning_rate": 1.5372024912560202e-05, + "loss": 1.6035, + "step": 31002 + }, + { + "epoch": 0.3875596889922248, + "grad_norm": 1.79994797706604, + "learning_rate": 1.5371288819792503e-05, + "loss": 0.8379, + "step": 31004 + }, + { + "epoch": 0.3875846896172404, + "grad_norm": 2.8626747131347656, + "learning_rate": 1.5370552686117833e-05, + "loss": 0.3192, + "step": 31006 + }, + { + "epoch": 0.38760969024225606, + "grad_norm": 2.996356248855591, + "learning_rate": 1.5369816511541784e-05, + "loss": 1.4619, + "step": 31008 + }, + { + "epoch": 0.3876346908672717, + "grad_norm": 8.577194213867188, + "learning_rate": 1.536908029606998e-05, + "loss": 2.34, + "step": 31010 + }, + { + "epoch": 0.3876596914922873, + "grad_norm": 2.1019346714019775, + "learning_rate": 1.5368344039708014e-05, + "loss": 0.5954, + "step": 31012 + }, + { + "epoch": 0.38768469211730294, + "grad_norm": 4.902923583984375, + "learning_rate": 1.53676077424615e-05, + "loss": 2.4252, + "step": 31014 + }, + { + "epoch": 0.38770969274231853, + "grad_norm": 0.4932330250740051, + "learning_rate": 1.5366871404336047e-05, + "loss": 0.2474, + "step": 31016 + }, + { + "epoch": 0.3877346933673342, + "grad_norm": 5.05720329284668, + "learning_rate": 1.536613502533726e-05, + "loss": 1.2958, + "step": 31018 + }, + { + "epoch": 0.38775969399234983, + "grad_norm": 0.020185226574540138, + "learning_rate": 1.5365398605470746e-05, + "loss": 1.3284, + "step": 31020 + }, + { + "epoch": 0.3877846946173654, + "grad_norm": 3.6075515747070312, + "learning_rate": 1.5364662144742113e-05, + "loss": 0.2156, + "step": 31022 + }, + { + "epoch": 0.38780969524238107, + "grad_norm": 3.49560809135437, + "learning_rate": 1.5363925643156978e-05, + "loss": 0.5346, + "step": 31024 + }, + { + "epoch": 0.38783469586739666, + "grad_norm": 4.714436054229736, + "learning_rate": 1.5363189100720937e-05, + "loss": 1.3155, + "step": 31026 + }, + { + "epoch": 0.3878596964924123, + "grad_norm": 2.3686716556549072, + "learning_rate": 1.536245251743961e-05, + "loss": 1.4553, + "step": 31028 + }, + { + "epoch": 0.38788469711742796, + "grad_norm": 2.589585542678833, + "learning_rate": 1.5361715893318602e-05, + "loss": 0.4736, + "step": 31030 + }, + { + "epoch": 0.38790969774244355, + "grad_norm": 0.005600477568805218, + "learning_rate": 1.5360979228363525e-05, + "loss": 0.0002, + "step": 31032 + }, + { + "epoch": 0.3879346983674592, + "grad_norm": 5.945973873138428, + "learning_rate": 1.5360242522579986e-05, + "loss": 1.2576, + "step": 31034 + }, + { + "epoch": 0.3879596989924748, + "grad_norm": 6.026157379150391, + "learning_rate": 1.53595057759736e-05, + "loss": 0.7636, + "step": 31036 + }, + { + "epoch": 0.38798469961749044, + "grad_norm": 0.0023529408499598503, + "learning_rate": 1.5358768988549976e-05, + "loss": 1.0389, + "step": 31038 + }, + { + "epoch": 0.3880097002425061, + "grad_norm": 4.442980766296387, + "learning_rate": 1.5358032160314723e-05, + "loss": 1.2292, + "step": 31040 + }, + { + "epoch": 0.3880347008675217, + "grad_norm": 0.11058889329433441, + "learning_rate": 1.5357295291273457e-05, + "loss": 0.0231, + "step": 31042 + }, + { + "epoch": 0.3880597014925373, + "grad_norm": 1.9203499555587769, + "learning_rate": 1.5356558381431787e-05, + "loss": 1.4003, + "step": 31044 + }, + { + "epoch": 0.3880847021175529, + "grad_norm": 2.2387402057647705, + "learning_rate": 1.5355821430795326e-05, + "loss": 0.5958, + "step": 31046 + }, + { + "epoch": 0.38810970274256856, + "grad_norm": 4.9878058433532715, + "learning_rate": 1.5355084439369686e-05, + "loss": 1.3022, + "step": 31048 + }, + { + "epoch": 0.3881347033675842, + "grad_norm": 3.345402240753174, + "learning_rate": 1.5354347407160482e-05, + "loss": 0.8271, + "step": 31050 + }, + { + "epoch": 0.3881597039925998, + "grad_norm": 5.048244953155518, + "learning_rate": 1.535361033417332e-05, + "loss": 0.9104, + "step": 31052 + }, + { + "epoch": 0.38818470461761545, + "grad_norm": 4.183107376098633, + "learning_rate": 1.5352873220413825e-05, + "loss": 1.6106, + "step": 31054 + }, + { + "epoch": 0.38820970524263104, + "grad_norm": 0.7453113198280334, + "learning_rate": 1.5352136065887603e-05, + "loss": 0.0319, + "step": 31056 + }, + { + "epoch": 0.3882347058676467, + "grad_norm": 2.064415216445923, + "learning_rate": 1.5351398870600268e-05, + "loss": 0.972, + "step": 31058 + }, + { + "epoch": 0.38825970649266234, + "grad_norm": 5.599050998687744, + "learning_rate": 1.5350661634557435e-05, + "loss": 0.9508, + "step": 31060 + }, + { + "epoch": 0.38828470711767793, + "grad_norm": 2.5052921772003174, + "learning_rate": 1.534992435776472e-05, + "loss": 0.5292, + "step": 31062 + }, + { + "epoch": 0.3883097077426936, + "grad_norm": 3.213944673538208, + "learning_rate": 1.5349187040227743e-05, + "loss": 1.285, + "step": 31064 + }, + { + "epoch": 0.38833470836770917, + "grad_norm": 1.5534400939941406, + "learning_rate": 1.534844968195211e-05, + "loss": 0.2369, + "step": 31066 + }, + { + "epoch": 0.3883597089927248, + "grad_norm": 9.45281982421875, + "learning_rate": 1.5347712282943436e-05, + "loss": 2.6797, + "step": 31068 + }, + { + "epoch": 0.38838470961774046, + "grad_norm": 3.2874996662139893, + "learning_rate": 1.5346974843207346e-05, + "loss": 0.7194, + "step": 31070 + }, + { + "epoch": 0.38840971024275606, + "grad_norm": 2.4417290687561035, + "learning_rate": 1.5346237362749453e-05, + "loss": 1.5783, + "step": 31072 + }, + { + "epoch": 0.3884347108677717, + "grad_norm": 0.0018937084823846817, + "learning_rate": 1.534549984157537e-05, + "loss": 0.5426, + "step": 31074 + }, + { + "epoch": 0.3884597114927873, + "grad_norm": 2.541409730911255, + "learning_rate": 1.5344762279690715e-05, + "loss": 0.5515, + "step": 31076 + }, + { + "epoch": 0.38848471211780294, + "grad_norm": 1.481553077697754, + "learning_rate": 1.534402467710111e-05, + "loss": 0.0582, + "step": 31078 + }, + { + "epoch": 0.3885097127428186, + "grad_norm": 3.564951181411743, + "learning_rate": 1.534328703381217e-05, + "loss": 0.877, + "step": 31080 + }, + { + "epoch": 0.3885347133678342, + "grad_norm": 3.2821898460388184, + "learning_rate": 1.5342549349829505e-05, + "loss": 0.4489, + "step": 31082 + }, + { + "epoch": 0.38855971399284983, + "grad_norm": 3.7441012859344482, + "learning_rate": 1.5341811625158744e-05, + "loss": 0.8366, + "step": 31084 + }, + { + "epoch": 0.3885847146178654, + "grad_norm": 3.896854877471924, + "learning_rate": 1.53410738598055e-05, + "loss": 2.0984, + "step": 31086 + }, + { + "epoch": 0.38860971524288107, + "grad_norm": 2.5664782524108887, + "learning_rate": 1.5340336053775392e-05, + "loss": 0.3851, + "step": 31088 + }, + { + "epoch": 0.3886347158678967, + "grad_norm": 2.7441415786743164, + "learning_rate": 1.5339598207074042e-05, + "loss": 1.0161, + "step": 31090 + }, + { + "epoch": 0.3886597164929123, + "grad_norm": 2.4673876762390137, + "learning_rate": 1.5338860319707064e-05, + "loss": 1.1077, + "step": 31092 + }, + { + "epoch": 0.38868471711792796, + "grad_norm": 0.0009704080293886364, + "learning_rate": 1.5338122391680085e-05, + "loss": 0.5216, + "step": 31094 + }, + { + "epoch": 0.38870971774294355, + "grad_norm": 0.6539963483810425, + "learning_rate": 1.533738442299872e-05, + "loss": 0.1709, + "step": 31096 + }, + { + "epoch": 0.3887347183679592, + "grad_norm": 2.284575939178467, + "learning_rate": 1.5336646413668585e-05, + "loss": 1.1159, + "step": 31098 + }, + { + "epoch": 0.38875971899297485, + "grad_norm": 2.3415000438690186, + "learning_rate": 1.533590836369531e-05, + "loss": 0.9429, + "step": 31100 + }, + { + "epoch": 0.38878471961799044, + "grad_norm": 4.331859111785889, + "learning_rate": 1.533517027308451e-05, + "loss": 1.3223, + "step": 31102 + }, + { + "epoch": 0.3888097202430061, + "grad_norm": 3.5587987899780273, + "learning_rate": 1.5334432141841807e-05, + "loss": 0.9237, + "step": 31104 + }, + { + "epoch": 0.3888347208680217, + "grad_norm": 2.0719430446624756, + "learning_rate": 1.5333693969972827e-05, + "loss": 0.7501, + "step": 31106 + }, + { + "epoch": 0.3888597214930373, + "grad_norm": 1.576102614402771, + "learning_rate": 1.5332955757483185e-05, + "loss": 0.525, + "step": 31108 + }, + { + "epoch": 0.388884722118053, + "grad_norm": 3.3806982040405273, + "learning_rate": 1.5332217504378506e-05, + "loss": 0.9194, + "step": 31110 + }, + { + "epoch": 0.38890972274306856, + "grad_norm": 2.000445604324341, + "learning_rate": 1.5331479210664412e-05, + "loss": 0.7964, + "step": 31112 + }, + { + "epoch": 0.3889347233680842, + "grad_norm": 0.2737843096256256, + "learning_rate": 1.533074087634653e-05, + "loss": 0.8677, + "step": 31114 + }, + { + "epoch": 0.3889597239930998, + "grad_norm": 4.3611741065979, + "learning_rate": 1.5330002501430477e-05, + "loss": 1.4821, + "step": 31116 + }, + { + "epoch": 0.38898472461811545, + "grad_norm": 3.735053777694702, + "learning_rate": 1.5329264085921874e-05, + "loss": 1.0861, + "step": 31118 + }, + { + "epoch": 0.3890097252431311, + "grad_norm": 9.579827308654785, + "learning_rate": 1.5328525629826356e-05, + "loss": 1.6177, + "step": 31120 + }, + { + "epoch": 0.3890347258681467, + "grad_norm": 4.294154167175293, + "learning_rate": 1.5327787133149538e-05, + "loss": 0.3569, + "step": 31122 + }, + { + "epoch": 0.38905972649316234, + "grad_norm": 2.059199094772339, + "learning_rate": 1.532704859589705e-05, + "loss": 0.9051, + "step": 31124 + }, + { + "epoch": 0.38908472711817793, + "grad_norm": 1.2234630584716797, + "learning_rate": 1.5326310018074507e-05, + "loss": 0.2465, + "step": 31126 + }, + { + "epoch": 0.3891097277431936, + "grad_norm": 3.318544626235962, + "learning_rate": 1.532557139968754e-05, + "loss": 1.3625, + "step": 31128 + }, + { + "epoch": 0.3891347283682092, + "grad_norm": 3.565258741378784, + "learning_rate": 1.5324832740741778e-05, + "loss": 1.6875, + "step": 31130 + }, + { + "epoch": 0.3891597289932248, + "grad_norm": 5.8124237060546875, + "learning_rate": 1.5324094041242842e-05, + "loss": 0.6612, + "step": 31132 + }, + { + "epoch": 0.38918472961824047, + "grad_norm": 3.6487011909484863, + "learning_rate": 1.532335530119636e-05, + "loss": 2.4714, + "step": 31134 + }, + { + "epoch": 0.38920973024325606, + "grad_norm": 3.755615234375, + "learning_rate": 1.5322616520607952e-05, + "loss": 1.3136, + "step": 31136 + }, + { + "epoch": 0.3892347308682717, + "grad_norm": 4.3731255531311035, + "learning_rate": 1.5321877699483257e-05, + "loss": 1.7176, + "step": 31138 + }, + { + "epoch": 0.38925973149328735, + "grad_norm": 5.558335304260254, + "learning_rate": 1.5321138837827887e-05, + "loss": 2.4181, + "step": 31140 + }, + { + "epoch": 0.38928473211830295, + "grad_norm": 3.9070241451263428, + "learning_rate": 1.532039993564748e-05, + "loss": 1.3987, + "step": 31142 + }, + { + "epoch": 0.3893097327433186, + "grad_norm": 2.4564402103424072, + "learning_rate": 1.531966099294766e-05, + "loss": 0.1644, + "step": 31144 + }, + { + "epoch": 0.3893347333683342, + "grad_norm": 0.0027154707349836826, + "learning_rate": 1.5318922009734052e-05, + "loss": 1.5146, + "step": 31146 + }, + { + "epoch": 0.38935973399334983, + "grad_norm": 0.3759269714355469, + "learning_rate": 1.5318182986012286e-05, + "loss": 0.5607, + "step": 31148 + }, + { + "epoch": 0.3893847346183655, + "grad_norm": 0.03130126744508743, + "learning_rate": 1.531744392178799e-05, + "loss": 0.0373, + "step": 31150 + }, + { + "epoch": 0.38940973524338107, + "grad_norm": 2.4400885105133057, + "learning_rate": 1.5316704817066798e-05, + "loss": 0.8588, + "step": 31152 + }, + { + "epoch": 0.3894347358683967, + "grad_norm": 0.0029084498528391123, + "learning_rate": 1.5315965671854328e-05, + "loss": 0.685, + "step": 31154 + }, + { + "epoch": 0.3894597364934123, + "grad_norm": 3.843538761138916, + "learning_rate": 1.5315226486156217e-05, + "loss": 0.786, + "step": 31156 + }, + { + "epoch": 0.38948473711842796, + "grad_norm": 2.4588983058929443, + "learning_rate": 1.5314487259978093e-05, + "loss": 0.482, + "step": 31158 + }, + { + "epoch": 0.3895097377434436, + "grad_norm": 4.974063396453857, + "learning_rate": 1.5313747993325584e-05, + "loss": 1.9438, + "step": 31160 + }, + { + "epoch": 0.3895347383684592, + "grad_norm": 8.59165096282959, + "learning_rate": 1.5313008686204324e-05, + "loss": 1.1124, + "step": 31162 + }, + { + "epoch": 0.38955973899347485, + "grad_norm": 0.0013790795346722007, + "learning_rate": 1.531226933861994e-05, + "loss": 1.076, + "step": 31164 + }, + { + "epoch": 0.38958473961849044, + "grad_norm": 0.2768314778804779, + "learning_rate": 1.5311529950578066e-05, + "loss": 0.6022, + "step": 31166 + }, + { + "epoch": 0.3896097402435061, + "grad_norm": 0.003153339261189103, + "learning_rate": 1.5310790522084327e-05, + "loss": 0.716, + "step": 31168 + }, + { + "epoch": 0.38963474086852173, + "grad_norm": 3.666795015335083, + "learning_rate": 1.5310051053144362e-05, + "loss": 1.287, + "step": 31170 + }, + { + "epoch": 0.3896597414935373, + "grad_norm": 3.6136257648468018, + "learning_rate": 1.5309311543763798e-05, + "loss": 0.6975, + "step": 31172 + }, + { + "epoch": 0.389684742118553, + "grad_norm": 3.9996609687805176, + "learning_rate": 1.5308571993948266e-05, + "loss": 1.1954, + "step": 31174 + }, + { + "epoch": 0.38970974274356857, + "grad_norm": 5.558594226837158, + "learning_rate": 1.5307832403703404e-05, + "loss": 1.9962, + "step": 31176 + }, + { + "epoch": 0.3897347433685842, + "grad_norm": 3.883068323135376, + "learning_rate": 1.530709277303484e-05, + "loss": 1.2252, + "step": 31178 + }, + { + "epoch": 0.38975974399359986, + "grad_norm": 2.8098978996276855, + "learning_rate": 1.530635310194821e-05, + "loss": 0.5135, + "step": 31180 + }, + { + "epoch": 0.38978474461861545, + "grad_norm": 4.190893173217773, + "learning_rate": 1.530561339044914e-05, + "loss": 0.8517, + "step": 31182 + }, + { + "epoch": 0.3898097452436311, + "grad_norm": 2.316401481628418, + "learning_rate": 1.530487363854327e-05, + "loss": 0.8846, + "step": 31184 + }, + { + "epoch": 0.3898347458686467, + "grad_norm": 0.002235057298094034, + "learning_rate": 1.5304133846236233e-05, + "loss": 0.538, + "step": 31186 + }, + { + "epoch": 0.38985974649366234, + "grad_norm": 1.6187435388565063, + "learning_rate": 1.5303394013533666e-05, + "loss": 0.158, + "step": 31188 + }, + { + "epoch": 0.389884747118678, + "grad_norm": 2.495840311050415, + "learning_rate": 1.5302654140441196e-05, + "loss": 1.2026, + "step": 31190 + }, + { + "epoch": 0.3899097477436936, + "grad_norm": 2.1421220302581787, + "learning_rate": 1.5301914226964462e-05, + "loss": 1.048, + "step": 31192 + }, + { + "epoch": 0.3899347483687092, + "grad_norm": 0.9332618713378906, + "learning_rate": 1.5301174273109105e-05, + "loss": 0.1377, + "step": 31194 + }, + { + "epoch": 0.3899597489937248, + "grad_norm": 4.263777732849121, + "learning_rate": 1.530043427888075e-05, + "loss": 1.6763, + "step": 31196 + }, + { + "epoch": 0.38998474961874047, + "grad_norm": 5.182203769683838, + "learning_rate": 1.5299694244285033e-05, + "loss": 1.4007, + "step": 31198 + }, + { + "epoch": 0.3900097502437561, + "grad_norm": 3.377835988998413, + "learning_rate": 1.5298954169327596e-05, + "loss": 1.8733, + "step": 31200 + }, + { + "epoch": 0.3900347508687717, + "grad_norm": 0.4780643582344055, + "learning_rate": 1.529821405401408e-05, + "loss": 0.5766, + "step": 31202 + }, + { + "epoch": 0.39005975149378735, + "grad_norm": 4.596667766571045, + "learning_rate": 1.529747389835011e-05, + "loss": 0.2985, + "step": 31204 + }, + { + "epoch": 0.39008475211880295, + "grad_norm": 2.363231658935547, + "learning_rate": 1.5296733702341328e-05, + "loss": 1.144, + "step": 31206 + }, + { + "epoch": 0.3901097527438186, + "grad_norm": 2.952004909515381, + "learning_rate": 1.529599346599337e-05, + "loss": 1.1603, + "step": 31208 + }, + { + "epoch": 0.39013475336883424, + "grad_norm": 5.300638198852539, + "learning_rate": 1.5295253189311874e-05, + "loss": 1.403, + "step": 31210 + }, + { + "epoch": 0.39015975399384983, + "grad_norm": 1.7255277633666992, + "learning_rate": 1.529451287230248e-05, + "loss": 0.7967, + "step": 31212 + }, + { + "epoch": 0.3901847546188655, + "grad_norm": 0.6075528860092163, + "learning_rate": 1.529377251497083e-05, + "loss": 0.0976, + "step": 31214 + }, + { + "epoch": 0.3902097552438811, + "grad_norm": 6.310999393463135, + "learning_rate": 1.529303211732255e-05, + "loss": 0.9911, + "step": 31216 + }, + { + "epoch": 0.3902347558688967, + "grad_norm": 0.00217766803689301, + "learning_rate": 1.5292291679363288e-05, + "loss": 0.781, + "step": 31218 + }, + { + "epoch": 0.39025975649391237, + "grad_norm": 0.579507052898407, + "learning_rate": 1.5291551201098687e-05, + "loss": 1.8495, + "step": 31220 + }, + { + "epoch": 0.39028475711892796, + "grad_norm": 3.321791410446167, + "learning_rate": 1.5290810682534368e-05, + "loss": 0.9688, + "step": 31222 + }, + { + "epoch": 0.3903097577439436, + "grad_norm": 0.20378713309764862, + "learning_rate": 1.529007012367599e-05, + "loss": 0.3429, + "step": 31224 + }, + { + "epoch": 0.3903347583689592, + "grad_norm": 0.002764345146715641, + "learning_rate": 1.5289329524529187e-05, + "loss": 0.7753, + "step": 31226 + }, + { + "epoch": 0.39035975899397485, + "grad_norm": 3.8949222564697266, + "learning_rate": 1.52885888850996e-05, + "loss": 1.1255, + "step": 31228 + }, + { + "epoch": 0.3903847596189905, + "grad_norm": 7.822868824005127, + "learning_rate": 1.5287848205392863e-05, + "loss": 0.5198, + "step": 31230 + }, + { + "epoch": 0.3904097602440061, + "grad_norm": 4.509541034698486, + "learning_rate": 1.5287107485414627e-05, + "loss": 1.119, + "step": 31232 + }, + { + "epoch": 0.39043476086902174, + "grad_norm": 3.3793153762817383, + "learning_rate": 1.5286366725170524e-05, + "loss": 1.1334, + "step": 31234 + }, + { + "epoch": 0.3904597614940373, + "grad_norm": 2.0054540634155273, + "learning_rate": 1.52856259246662e-05, + "loss": 0.62, + "step": 31236 + }, + { + "epoch": 0.390484762119053, + "grad_norm": 6.548942565917969, + "learning_rate": 1.5284885083907296e-05, + "loss": 0.729, + "step": 31238 + }, + { + "epoch": 0.3905097627440686, + "grad_norm": 0.376539945602417, + "learning_rate": 1.5284144202899457e-05, + "loss": 0.0358, + "step": 31240 + }, + { + "epoch": 0.3905347633690842, + "grad_norm": 4.535933971405029, + "learning_rate": 1.528340328164832e-05, + "loss": 1.2942, + "step": 31242 + }, + { + "epoch": 0.39055976399409986, + "grad_norm": 5.316640377044678, + "learning_rate": 1.5282662320159533e-05, + "loss": 0.3098, + "step": 31244 + }, + { + "epoch": 0.39058476461911545, + "grad_norm": 6.154221057891846, + "learning_rate": 1.5281921318438737e-05, + "loss": 1.8794, + "step": 31246 + }, + { + "epoch": 0.3906097652441311, + "grad_norm": 2.993894338607788, + "learning_rate": 1.528118027649157e-05, + "loss": 1.1991, + "step": 31248 + }, + { + "epoch": 0.39063476586914675, + "grad_norm": 3.6916801929473877, + "learning_rate": 1.5280439194323687e-05, + "loss": 1.1993, + "step": 31250 + }, + { + "epoch": 0.39065976649416234, + "grad_norm": 0.3018098473548889, + "learning_rate": 1.5279698071940724e-05, + "loss": 0.6291, + "step": 31252 + }, + { + "epoch": 0.390684767119178, + "grad_norm": 6.021514892578125, + "learning_rate": 1.5278956909348328e-05, + "loss": 0.5509, + "step": 31254 + }, + { + "epoch": 0.3907097677441936, + "grad_norm": 4.643314361572266, + "learning_rate": 1.5278215706552143e-05, + "loss": 1.3562, + "step": 31256 + }, + { + "epoch": 0.39073476836920923, + "grad_norm": 5.308900356292725, + "learning_rate": 1.527747446355781e-05, + "loss": 0.9677, + "step": 31258 + }, + { + "epoch": 0.3907597689942249, + "grad_norm": 0.0010455292649567127, + "learning_rate": 1.527673318037098e-05, + "loss": 0.9814, + "step": 31260 + }, + { + "epoch": 0.39078476961924047, + "grad_norm": 1.852628231048584, + "learning_rate": 1.5275991856997293e-05, + "loss": 0.5451, + "step": 31262 + }, + { + "epoch": 0.3908097702442561, + "grad_norm": 2.148469924926758, + "learning_rate": 1.5275250493442404e-05, + "loss": 1.1883, + "step": 31264 + }, + { + "epoch": 0.3908347708692717, + "grad_norm": 4.207574367523193, + "learning_rate": 1.5274509089711952e-05, + "loss": 1.9536, + "step": 31266 + }, + { + "epoch": 0.39085977149428736, + "grad_norm": 0.004505499731749296, + "learning_rate": 1.5273767645811583e-05, + "loss": 1.403, + "step": 31268 + }, + { + "epoch": 0.390884772119303, + "grad_norm": 1.714237093925476, + "learning_rate": 1.5273026161746947e-05, + "loss": 0.057, + "step": 31270 + }, + { + "epoch": 0.3909097727443186, + "grad_norm": 2.170320749282837, + "learning_rate": 1.5272284637523687e-05, + "loss": 0.4698, + "step": 31272 + }, + { + "epoch": 0.39093477336933424, + "grad_norm": 0.0016638936940580606, + "learning_rate": 1.5271543073147453e-05, + "loss": 0.4978, + "step": 31274 + }, + { + "epoch": 0.39095977399434984, + "grad_norm": 4.840392112731934, + "learning_rate": 1.5270801468623894e-05, + "loss": 1.2472, + "step": 31276 + }, + { + "epoch": 0.3909847746193655, + "grad_norm": 0.22726860642433167, + "learning_rate": 1.5270059823958658e-05, + "loss": 0.0491, + "step": 31278 + }, + { + "epoch": 0.39100977524438113, + "grad_norm": 3.026869535446167, + "learning_rate": 1.526931813915739e-05, + "loss": 1.1638, + "step": 31280 + }, + { + "epoch": 0.3910347758693967, + "grad_norm": 7.304043292999268, + "learning_rate": 1.5268576414225734e-05, + "loss": 1.1172, + "step": 31282 + }, + { + "epoch": 0.39105977649441237, + "grad_norm": 3.450226306915283, + "learning_rate": 1.5267834649169354e-05, + "loss": 1.0048, + "step": 31284 + }, + { + "epoch": 0.39108477711942796, + "grad_norm": 3.586310863494873, + "learning_rate": 1.5267092843993884e-05, + "loss": 2.037, + "step": 31286 + }, + { + "epoch": 0.3911097777444436, + "grad_norm": 7.989938735961914, + "learning_rate": 1.5266350998704984e-05, + "loss": 1.3539, + "step": 31288 + }, + { + "epoch": 0.39113477836945926, + "grad_norm": 5.661449432373047, + "learning_rate": 1.52656091133083e-05, + "loss": 0.5176, + "step": 31290 + }, + { + "epoch": 0.39115977899447485, + "grad_norm": 0.5380175113677979, + "learning_rate": 1.526486718780948e-05, + "loss": 0.1137, + "step": 31292 + }, + { + "epoch": 0.3911847796194905, + "grad_norm": 0.0023001725785434246, + "learning_rate": 1.5264125222214174e-05, + "loss": 0.9357, + "step": 31294 + }, + { + "epoch": 0.3912097802445061, + "grad_norm": 1.42387056350708, + "learning_rate": 1.5263383216528036e-05, + "loss": 0.7613, + "step": 31296 + }, + { + "epoch": 0.39123478086952174, + "grad_norm": 0.0019675251096487045, + "learning_rate": 1.5262641170756717e-05, + "loss": 0.8281, + "step": 31298 + }, + { + "epoch": 0.3912597814945374, + "grad_norm": 2.516233444213867, + "learning_rate": 1.5261899084905865e-05, + "loss": 1.3215, + "step": 31300 + }, + { + "epoch": 0.391284782119553, + "grad_norm": 3.735743284225464, + "learning_rate": 1.5261156958981133e-05, + "loss": 1.1652, + "step": 31302 + }, + { + "epoch": 0.3913097827445686, + "grad_norm": 2.248354196548462, + "learning_rate": 1.5260414792988178e-05, + "loss": 0.6929, + "step": 31304 + }, + { + "epoch": 0.3913347833695842, + "grad_norm": 3.7524473667144775, + "learning_rate": 1.525967258693265e-05, + "loss": 1.4149, + "step": 31306 + }, + { + "epoch": 0.39135978399459986, + "grad_norm": 0.0011135210515931249, + "learning_rate": 1.525893034082019e-05, + "loss": 0.5618, + "step": 31308 + }, + { + "epoch": 0.3913847846196155, + "grad_norm": 0.8382257223129272, + "learning_rate": 1.5258188054656464e-05, + "loss": 0.2155, + "step": 31310 + }, + { + "epoch": 0.3914097852446311, + "grad_norm": 4.193097114562988, + "learning_rate": 1.5257445728447122e-05, + "loss": 0.7815, + "step": 31312 + }, + { + "epoch": 0.39143478586964675, + "grad_norm": 2.858571767807007, + "learning_rate": 1.5256703362197818e-05, + "loss": 0.445, + "step": 31314 + }, + { + "epoch": 0.39145978649466234, + "grad_norm": 2.5891053676605225, + "learning_rate": 1.5255960955914204e-05, + "loss": 1.1478, + "step": 31316 + }, + { + "epoch": 0.391484787119678, + "grad_norm": 3.2702152729034424, + "learning_rate": 1.5255218509601935e-05, + "loss": 1.9781, + "step": 31318 + }, + { + "epoch": 0.39150978774469364, + "grad_norm": 2.6852877140045166, + "learning_rate": 1.5254476023266664e-05, + "loss": 2.0943, + "step": 31320 + }, + { + "epoch": 0.39153478836970923, + "grad_norm": 2.8770065307617188, + "learning_rate": 1.5253733496914047e-05, + "loss": 1.2712, + "step": 31322 + }, + { + "epoch": 0.3915597889947249, + "grad_norm": 0.7826773524284363, + "learning_rate": 1.5252990930549737e-05, + "loss": 0.5605, + "step": 31324 + }, + { + "epoch": 0.39158478961974047, + "grad_norm": 2.1516735553741455, + "learning_rate": 1.5252248324179393e-05, + "loss": 0.6192, + "step": 31326 + }, + { + "epoch": 0.3916097902447561, + "grad_norm": 5.02403450012207, + "learning_rate": 1.525150567780867e-05, + "loss": 1.5352, + "step": 31328 + }, + { + "epoch": 0.39163479086977177, + "grad_norm": 3.5573790073394775, + "learning_rate": 1.5250762991443217e-05, + "loss": 1.3632, + "step": 31330 + }, + { + "epoch": 0.39165979149478736, + "grad_norm": 3.781390428543091, + "learning_rate": 1.5250020265088701e-05, + "loss": 1.2729, + "step": 31332 + }, + { + "epoch": 0.391684792119803, + "grad_norm": 0.020426752045750618, + "learning_rate": 1.5249277498750772e-05, + "loss": 0.5709, + "step": 31334 + }, + { + "epoch": 0.3917097927448186, + "grad_norm": 3.1512746810913086, + "learning_rate": 1.5248534692435083e-05, + "loss": 0.4698, + "step": 31336 + }, + { + "epoch": 0.39173479336983424, + "grad_norm": 2.806966781616211, + "learning_rate": 1.5247791846147302e-05, + "loss": 0.8327, + "step": 31338 + }, + { + "epoch": 0.3917597939948499, + "grad_norm": 3.3746562004089355, + "learning_rate": 1.5247048959893079e-05, + "loss": 0.9542, + "step": 31340 + }, + { + "epoch": 0.3917847946198655, + "grad_norm": 4.115454196929932, + "learning_rate": 1.524630603367807e-05, + "loss": 0.498, + "step": 31342 + }, + { + "epoch": 0.39180979524488113, + "grad_norm": 1.927931547164917, + "learning_rate": 1.5245563067507942e-05, + "loss": 0.7422, + "step": 31344 + }, + { + "epoch": 0.3918347958698967, + "grad_norm": 2.2111270427703857, + "learning_rate": 1.5244820061388343e-05, + "loss": 0.7977, + "step": 31346 + }, + { + "epoch": 0.39185979649491237, + "grad_norm": 5.578969955444336, + "learning_rate": 1.5244077015324936e-05, + "loss": 1.0317, + "step": 31348 + }, + { + "epoch": 0.391884797119928, + "grad_norm": 1.7474385499954224, + "learning_rate": 1.5243333929323378e-05, + "loss": 0.864, + "step": 31350 + }, + { + "epoch": 0.3919097977449436, + "grad_norm": 4.450174331665039, + "learning_rate": 1.5242590803389337e-05, + "loss": 0.9589, + "step": 31352 + }, + { + "epoch": 0.39193479836995926, + "grad_norm": 2.0659749507904053, + "learning_rate": 1.5241847637528462e-05, + "loss": 1.6467, + "step": 31354 + }, + { + "epoch": 0.39195979899497485, + "grad_norm": 7.075169086456299, + "learning_rate": 1.5241104431746417e-05, + "loss": 0.8682, + "step": 31356 + }, + { + "epoch": 0.3919847996199905, + "grad_norm": 3.413764715194702, + "learning_rate": 1.5240361186048861e-05, + "loss": 0.9567, + "step": 31358 + }, + { + "epoch": 0.39200980024500615, + "grad_norm": 2.8896312713623047, + "learning_rate": 1.5239617900441457e-05, + "loss": 0.4731, + "step": 31360 + }, + { + "epoch": 0.39203480087002174, + "grad_norm": 4.5244598388671875, + "learning_rate": 1.5238874574929862e-05, + "loss": 0.8875, + "step": 31362 + }, + { + "epoch": 0.3920598014950374, + "grad_norm": 4.076484203338623, + "learning_rate": 1.523813120951974e-05, + "loss": 0.7493, + "step": 31364 + }, + { + "epoch": 0.392084802120053, + "grad_norm": 4.621640205383301, + "learning_rate": 1.5237387804216752e-05, + "loss": 1.5982, + "step": 31366 + }, + { + "epoch": 0.3921098027450686, + "grad_norm": 2.668916940689087, + "learning_rate": 1.5236644359026557e-05, + "loss": 0.7233, + "step": 31368 + }, + { + "epoch": 0.3921348033700843, + "grad_norm": 0.0013145928969606757, + "learning_rate": 1.5235900873954823e-05, + "loss": 0.2067, + "step": 31370 + }, + { + "epoch": 0.39215980399509986, + "grad_norm": 1.9413509368896484, + "learning_rate": 1.5235157349007205e-05, + "loss": 0.5794, + "step": 31372 + }, + { + "epoch": 0.3921848046201155, + "grad_norm": 2.4820432662963867, + "learning_rate": 1.523441378418937e-05, + "loss": 0.1527, + "step": 31374 + }, + { + "epoch": 0.3922098052451311, + "grad_norm": 1.3834060430526733, + "learning_rate": 1.5233670179506979e-05, + "loss": 0.6997, + "step": 31376 + }, + { + "epoch": 0.39223480587014675, + "grad_norm": 44.42110061645508, + "learning_rate": 1.5232926534965698e-05, + "loss": 0.5973, + "step": 31378 + }, + { + "epoch": 0.3922598064951624, + "grad_norm": 1.108095407485962, + "learning_rate": 1.5232182850571187e-05, + "loss": 0.7413, + "step": 31380 + }, + { + "epoch": 0.392284807120178, + "grad_norm": 10.206426620483398, + "learning_rate": 1.523143912632911e-05, + "loss": 1.6217, + "step": 31382 + }, + { + "epoch": 0.39230980774519364, + "grad_norm": 2.149661064147949, + "learning_rate": 1.5230695362245139e-05, + "loss": 0.3622, + "step": 31384 + }, + { + "epoch": 0.39233480837020923, + "grad_norm": 0.8663884401321411, + "learning_rate": 1.5229951558324922e-05, + "loss": 0.0212, + "step": 31386 + }, + { + "epoch": 0.3923598089952249, + "grad_norm": 2.7671616077423096, + "learning_rate": 1.5229207714574137e-05, + "loss": 1.3171, + "step": 31388 + }, + { + "epoch": 0.3923848096202405, + "grad_norm": 2.874394178390503, + "learning_rate": 1.522846383099845e-05, + "loss": 1.1791, + "step": 31390 + }, + { + "epoch": 0.3924098102452561, + "grad_norm": 4.884511947631836, + "learning_rate": 1.5227719907603518e-05, + "loss": 2.0239, + "step": 31392 + }, + { + "epoch": 0.39243481087027177, + "grad_norm": 0.3756600320339203, + "learning_rate": 1.5226975944395012e-05, + "loss": 0.6888, + "step": 31394 + }, + { + "epoch": 0.39245981149528736, + "grad_norm": 2.7480690479278564, + "learning_rate": 1.5226231941378596e-05, + "loss": 1.0394, + "step": 31396 + }, + { + "epoch": 0.392484812120303, + "grad_norm": 4.671177387237549, + "learning_rate": 1.5225487898559936e-05, + "loss": 1.0169, + "step": 31398 + }, + { + "epoch": 0.39250981274531865, + "grad_norm": 2.8391315937042236, + "learning_rate": 1.5224743815944698e-05, + "loss": 0.1974, + "step": 31400 + }, + { + "epoch": 0.39253481337033425, + "grad_norm": 2.6779444217681885, + "learning_rate": 1.5223999693538552e-05, + "loss": 0.9956, + "step": 31402 + }, + { + "epoch": 0.3925598139953499, + "grad_norm": 5.054786682128906, + "learning_rate": 1.5223255531347163e-05, + "loss": 1.7448, + "step": 31404 + }, + { + "epoch": 0.3925848146203655, + "grad_norm": 0.002420492935925722, + "learning_rate": 1.5222511329376196e-05, + "loss": 0.3069, + "step": 31406 + }, + { + "epoch": 0.39260981524538113, + "grad_norm": 0.0013822016771882772, + "learning_rate": 1.5221767087631324e-05, + "loss": 0.2827, + "step": 31408 + }, + { + "epoch": 0.3926348158703968, + "grad_norm": 3.465580463409424, + "learning_rate": 1.5221022806118212e-05, + "loss": 0.9523, + "step": 31410 + }, + { + "epoch": 0.3926598164954124, + "grad_norm": 2.508046865463257, + "learning_rate": 1.5220278484842525e-05, + "loss": 1.0102, + "step": 31412 + }, + { + "epoch": 0.392684817120428, + "grad_norm": 2.5919573307037354, + "learning_rate": 1.5219534123809939e-05, + "loss": 0.3992, + "step": 31414 + }, + { + "epoch": 0.3927098177454436, + "grad_norm": 3.2087202072143555, + "learning_rate": 1.521878972302612e-05, + "loss": 0.8236, + "step": 31416 + }, + { + "epoch": 0.39273481837045926, + "grad_norm": 3.6729671955108643, + "learning_rate": 1.5218045282496734e-05, + "loss": 0.8648, + "step": 31418 + }, + { + "epoch": 0.3927598189954749, + "grad_norm": 1.6828728914260864, + "learning_rate": 1.5217300802227453e-05, + "loss": 0.9145, + "step": 31420 + }, + { + "epoch": 0.3927848196204905, + "grad_norm": 3.484890937805176, + "learning_rate": 1.521655628222395e-05, + "loss": 0.8257, + "step": 31422 + }, + { + "epoch": 0.39280982024550615, + "grad_norm": 2.177583932876587, + "learning_rate": 1.5215811722491888e-05, + "loss": 0.4235, + "step": 31424 + }, + { + "epoch": 0.39283482087052174, + "grad_norm": 2.018416404724121, + "learning_rate": 1.5215067123036944e-05, + "loss": 0.5622, + "step": 31426 + }, + { + "epoch": 0.3928598214955374, + "grad_norm": 0.2488701343536377, + "learning_rate": 1.5214322483864786e-05, + "loss": 0.8485, + "step": 31428 + }, + { + "epoch": 0.39288482212055303, + "grad_norm": 0.333593487739563, + "learning_rate": 1.5213577804981085e-05, + "loss": 0.7596, + "step": 31430 + }, + { + "epoch": 0.3929098227455686, + "grad_norm": 3.657740592956543, + "learning_rate": 1.5212833086391512e-05, + "loss": 1.4466, + "step": 31432 + }, + { + "epoch": 0.3929348233705843, + "grad_norm": 2.315887212753296, + "learning_rate": 1.521208832810174e-05, + "loss": 1.808, + "step": 31434 + }, + { + "epoch": 0.39295982399559987, + "grad_norm": 1.2725499868392944, + "learning_rate": 1.5211343530117444e-05, + "loss": 0.5512, + "step": 31436 + }, + { + "epoch": 0.3929848246206155, + "grad_norm": 3.9672131538391113, + "learning_rate": 1.5210598692444285e-05, + "loss": 1.0334, + "step": 31438 + }, + { + "epoch": 0.39300982524563116, + "grad_norm": 7.668076992034912, + "learning_rate": 1.5209853815087946e-05, + "loss": 1.1171, + "step": 31440 + }, + { + "epoch": 0.39303482587064675, + "grad_norm": 3.875335931777954, + "learning_rate": 1.52091088980541e-05, + "loss": 0.7953, + "step": 31442 + }, + { + "epoch": 0.3930598264956624, + "grad_norm": 0.0009660110808908939, + "learning_rate": 1.5208363941348416e-05, + "loss": 1.0089, + "step": 31444 + }, + { + "epoch": 0.393084827120678, + "grad_norm": 3.251757860183716, + "learning_rate": 1.5207618944976567e-05, + "loss": 1.5931, + "step": 31446 + }, + { + "epoch": 0.39310982774569364, + "grad_norm": 7.069347858428955, + "learning_rate": 1.5206873908944233e-05, + "loss": 1.7768, + "step": 31448 + }, + { + "epoch": 0.3931348283707093, + "grad_norm": 2.812706470489502, + "learning_rate": 1.520612883325708e-05, + "loss": 0.2527, + "step": 31450 + }, + { + "epoch": 0.3931598289957249, + "grad_norm": 2.7811739444732666, + "learning_rate": 1.5205383717920783e-05, + "loss": 0.52, + "step": 31452 + }, + { + "epoch": 0.39318482962074053, + "grad_norm": 1.9742405414581299, + "learning_rate": 1.5204638562941023e-05, + "loss": 1.0243, + "step": 31454 + }, + { + "epoch": 0.3932098302457561, + "grad_norm": 3.468602180480957, + "learning_rate": 1.5203893368323474e-05, + "loss": 1.1488, + "step": 31456 + }, + { + "epoch": 0.39323483087077177, + "grad_norm": 1.6857494115829468, + "learning_rate": 1.5203148134073806e-05, + "loss": 0.1411, + "step": 31458 + }, + { + "epoch": 0.3932598314957874, + "grad_norm": 3.6117477416992188, + "learning_rate": 1.52024028601977e-05, + "loss": 0.664, + "step": 31460 + }, + { + "epoch": 0.393284832120803, + "grad_norm": 4.272672176361084, + "learning_rate": 1.5201657546700827e-05, + "loss": 1.3654, + "step": 31462 + }, + { + "epoch": 0.39330983274581865, + "grad_norm": 3.843130111694336, + "learning_rate": 1.5200912193588867e-05, + "loss": 1.1067, + "step": 31464 + }, + { + "epoch": 0.39333483337083425, + "grad_norm": 1.3756190538406372, + "learning_rate": 1.5200166800867496e-05, + "loss": 0.7665, + "step": 31466 + }, + { + "epoch": 0.3933598339958499, + "grad_norm": 3.250680923461914, + "learning_rate": 1.519942136854239e-05, + "loss": 1.5459, + "step": 31468 + }, + { + "epoch": 0.39338483462086554, + "grad_norm": 0.0011436621425673366, + "learning_rate": 1.5198675896619226e-05, + "loss": 0.6032, + "step": 31470 + }, + { + "epoch": 0.39340983524588113, + "grad_norm": 4.108147621154785, + "learning_rate": 1.5197930385103678e-05, + "loss": 1.3271, + "step": 31472 + }, + { + "epoch": 0.3934348358708968, + "grad_norm": 2.1332767009735107, + "learning_rate": 1.519718483400143e-05, + "loss": 0.9019, + "step": 31474 + }, + { + "epoch": 0.3934598364959124, + "grad_norm": 4.202947616577148, + "learning_rate": 1.5196439243318157e-05, + "loss": 1.0249, + "step": 31476 + }, + { + "epoch": 0.393484837120928, + "grad_norm": 4.045922756195068, + "learning_rate": 1.5195693613059538e-05, + "loss": 0.7174, + "step": 31478 + }, + { + "epoch": 0.39350983774594367, + "grad_norm": 2.1819469928741455, + "learning_rate": 1.5194947943231251e-05, + "loss": 0.443, + "step": 31480 + }, + { + "epoch": 0.39353483837095926, + "grad_norm": 2.585127592086792, + "learning_rate": 1.5194202233838974e-05, + "loss": 0.8562, + "step": 31482 + }, + { + "epoch": 0.3935598389959749, + "grad_norm": 4.586875915527344, + "learning_rate": 1.5193456484888389e-05, + "loss": 0.9248, + "step": 31484 + }, + { + "epoch": 0.3935848396209905, + "grad_norm": 1.8247085809707642, + "learning_rate": 1.5192710696385173e-05, + "loss": 1.1058, + "step": 31486 + }, + { + "epoch": 0.39360984024600615, + "grad_norm": 2.554187297821045, + "learning_rate": 1.5191964868335007e-05, + "loss": 0.1734, + "step": 31488 + }, + { + "epoch": 0.3936348408710218, + "grad_norm": 1.9235613346099854, + "learning_rate": 1.5191219000743571e-05, + "loss": 0.1341, + "step": 31490 + }, + { + "epoch": 0.3936598414960374, + "grad_norm": 4.308427333831787, + "learning_rate": 1.5190473093616542e-05, + "loss": 1.3788, + "step": 31492 + }, + { + "epoch": 0.39368484212105304, + "grad_norm": 0.14514344930648804, + "learning_rate": 1.518972714695961e-05, + "loss": 0.9541, + "step": 31494 + }, + { + "epoch": 0.39370984274606863, + "grad_norm": 3.0458626747131348, + "learning_rate": 1.5188981160778447e-05, + "loss": 1.0414, + "step": 31496 + }, + { + "epoch": 0.3937348433710843, + "grad_norm": 8.42011547088623, + "learning_rate": 1.5188235135078735e-05, + "loss": 2.3205, + "step": 31498 + }, + { + "epoch": 0.3937598439960999, + "grad_norm": 4.032759666442871, + "learning_rate": 1.5187489069866162e-05, + "loss": 1.2934, + "step": 31500 + }, + { + "epoch": 0.3937848446211155, + "grad_norm": 9.586838722229004, + "learning_rate": 1.5186742965146401e-05, + "loss": 0.9386, + "step": 31502 + }, + { + "epoch": 0.39380984524613116, + "grad_norm": 2.700439691543579, + "learning_rate": 1.5185996820925141e-05, + "loss": 1.6226, + "step": 31504 + }, + { + "epoch": 0.39383484587114675, + "grad_norm": 7.478762149810791, + "learning_rate": 1.5185250637208065e-05, + "loss": 0.6166, + "step": 31506 + }, + { + "epoch": 0.3938598464961624, + "grad_norm": 3.7154347896575928, + "learning_rate": 1.518450441400085e-05, + "loss": 1.6722, + "step": 31508 + }, + { + "epoch": 0.39388484712117805, + "grad_norm": 2.4366698265075684, + "learning_rate": 1.5183758151309187e-05, + "loss": 1.3956, + "step": 31510 + }, + { + "epoch": 0.39390984774619364, + "grad_norm": 6.519592761993408, + "learning_rate": 1.5183011849138751e-05, + "loss": 0.5145, + "step": 31512 + }, + { + "epoch": 0.3939348483712093, + "grad_norm": 3.0251734256744385, + "learning_rate": 1.5182265507495231e-05, + "loss": 0.7525, + "step": 31514 + }, + { + "epoch": 0.3939598489962249, + "grad_norm": 0.0639670267701149, + "learning_rate": 1.5181519126384308e-05, + "loss": 0.0349, + "step": 31516 + }, + { + "epoch": 0.39398484962124053, + "grad_norm": 0.00101481971796602, + "learning_rate": 1.5180772705811671e-05, + "loss": 0.8227, + "step": 31518 + }, + { + "epoch": 0.3940098502462562, + "grad_norm": 3.1186678409576416, + "learning_rate": 1.5180026245783e-05, + "loss": 0.876, + "step": 31520 + }, + { + "epoch": 0.39403485087127177, + "grad_norm": 4.369289398193359, + "learning_rate": 1.517927974630398e-05, + "loss": 0.8131, + "step": 31522 + }, + { + "epoch": 0.3940598514962874, + "grad_norm": 2.2846508026123047, + "learning_rate": 1.5178533207380302e-05, + "loss": 1.181, + "step": 31524 + }, + { + "epoch": 0.394084852121303, + "grad_norm": 4.720566272735596, + "learning_rate": 1.5177786629017645e-05, + "loss": 0.2961, + "step": 31526 + }, + { + "epoch": 0.39410985274631866, + "grad_norm": 4.1426472663879395, + "learning_rate": 1.5177040011221697e-05, + "loss": 1.3203, + "step": 31528 + }, + { + "epoch": 0.3941348533713343, + "grad_norm": 5.049602031707764, + "learning_rate": 1.5176293353998147e-05, + "loss": 1.763, + "step": 31530 + }, + { + "epoch": 0.3941598539963499, + "grad_norm": 3.0542118549346924, + "learning_rate": 1.5175546657352675e-05, + "loss": 0.8563, + "step": 31532 + }, + { + "epoch": 0.39418485462136554, + "grad_norm": 2.902029514312744, + "learning_rate": 1.5174799921290975e-05, + "loss": 1.0999, + "step": 31534 + }, + { + "epoch": 0.39420985524638114, + "grad_norm": 0.0027372848708182573, + "learning_rate": 1.5174053145818728e-05, + "loss": 1.9599, + "step": 31536 + }, + { + "epoch": 0.3942348558713968, + "grad_norm": 0.3487669825553894, + "learning_rate": 1.5173306330941624e-05, + "loss": 0.7755, + "step": 31538 + }, + { + "epoch": 0.39425985649641243, + "grad_norm": 2.7698607444763184, + "learning_rate": 1.5172559476665352e-05, + "loss": 1.2236, + "step": 31540 + }, + { + "epoch": 0.394284857121428, + "grad_norm": 4.229750156402588, + "learning_rate": 1.5171812582995595e-05, + "loss": 0.6142, + "step": 31542 + }, + { + "epoch": 0.39430985774644367, + "grad_norm": 3.264683961868286, + "learning_rate": 1.5171065649938049e-05, + "loss": 0.6299, + "step": 31544 + }, + { + "epoch": 0.39433485837145926, + "grad_norm": 0.21652787923812866, + "learning_rate": 1.5170318677498399e-05, + "loss": 0.3882, + "step": 31546 + }, + { + "epoch": 0.3943598589964749, + "grad_norm": 0.5486330986022949, + "learning_rate": 1.5169571665682328e-05, + "loss": 0.5654, + "step": 31548 + }, + { + "epoch": 0.39438485962149056, + "grad_norm": 4.0704851150512695, + "learning_rate": 1.5168824614495534e-05, + "loss": 1.3671, + "step": 31550 + }, + { + "epoch": 0.39440986024650615, + "grad_norm": 2.295473575592041, + "learning_rate": 1.51680775239437e-05, + "loss": 1.1928, + "step": 31552 + }, + { + "epoch": 0.3944348608715218, + "grad_norm": 3.4363396167755127, + "learning_rate": 1.5167330394032522e-05, + "loss": 1.919, + "step": 31554 + }, + { + "epoch": 0.3944598614965374, + "grad_norm": 2.8945400714874268, + "learning_rate": 1.5166583224767684e-05, + "loss": 0.9863, + "step": 31556 + }, + { + "epoch": 0.39448486212155304, + "grad_norm": 2.943686008453369, + "learning_rate": 1.516583601615488e-05, + "loss": 0.7763, + "step": 31558 + }, + { + "epoch": 0.3945098627465687, + "grad_norm": 3.6226425170898438, + "learning_rate": 1.5165088768199798e-05, + "loss": 0.308, + "step": 31560 + }, + { + "epoch": 0.3945348633715843, + "grad_norm": 0.0013562373351305723, + "learning_rate": 1.5164341480908133e-05, + "loss": 0.086, + "step": 31562 + }, + { + "epoch": 0.3945598639965999, + "grad_norm": 3.1507904529571533, + "learning_rate": 1.5163594154285571e-05, + "loss": 0.1993, + "step": 31564 + }, + { + "epoch": 0.3945848646216155, + "grad_norm": 5.515274524688721, + "learning_rate": 1.5162846788337807e-05, + "loss": 1.0383, + "step": 31566 + }, + { + "epoch": 0.39460986524663116, + "grad_norm": 1.166382908821106, + "learning_rate": 1.5162099383070529e-05, + "loss": 0.0639, + "step": 31568 + }, + { + "epoch": 0.3946348658716468, + "grad_norm": 3.6479995250701904, + "learning_rate": 1.5161351938489435e-05, + "loss": 1.1237, + "step": 31570 + }, + { + "epoch": 0.3946598664966624, + "grad_norm": 2.6416728496551514, + "learning_rate": 1.5160604454600219e-05, + "loss": 0.1108, + "step": 31572 + }, + { + "epoch": 0.39468486712167805, + "grad_norm": 1.1817587614059448, + "learning_rate": 1.5159856931408564e-05, + "loss": 1.2931, + "step": 31574 + }, + { + "epoch": 0.39470986774669364, + "grad_norm": 0.02159573882818222, + "learning_rate": 1.5159109368920169e-05, + "loss": 0.0003, + "step": 31576 + }, + { + "epoch": 0.3947348683717093, + "grad_norm": 5.738411903381348, + "learning_rate": 1.515836176714072e-05, + "loss": 0.8177, + "step": 31578 + }, + { + "epoch": 0.39475986899672494, + "grad_norm": 3.567716360092163, + "learning_rate": 1.5157614126075928e-05, + "loss": 1.0788, + "step": 31580 + }, + { + "epoch": 0.39478486962174053, + "grad_norm": 4.6964921951293945, + "learning_rate": 1.515686644573147e-05, + "loss": 0.1506, + "step": 31582 + }, + { + "epoch": 0.3948098702467562, + "grad_norm": 2.541445255279541, + "learning_rate": 1.515611872611305e-05, + "loss": 0.5274, + "step": 31584 + }, + { + "epoch": 0.39483487087177177, + "grad_norm": 3.2782630920410156, + "learning_rate": 1.5155370967226359e-05, + "loss": 0.1874, + "step": 31586 + }, + { + "epoch": 0.3948598714967874, + "grad_norm": 5.734517574310303, + "learning_rate": 1.5154623169077089e-05, + "loss": 1.9521, + "step": 31588 + }, + { + "epoch": 0.39488487212180307, + "grad_norm": 3.8540735244750977, + "learning_rate": 1.5153875331670937e-05, + "loss": 0.8655, + "step": 31590 + }, + { + "epoch": 0.39490987274681866, + "grad_norm": 8.187209129333496, + "learning_rate": 1.5153127455013603e-05, + "loss": 1.5081, + "step": 31592 + }, + { + "epoch": 0.3949348733718343, + "grad_norm": 1.2116793394088745, + "learning_rate": 1.5152379539110777e-05, + "loss": 0.6467, + "step": 31594 + }, + { + "epoch": 0.3949598739968499, + "grad_norm": 5.214512348175049, + "learning_rate": 1.5151631583968157e-05, + "loss": 0.2673, + "step": 31596 + }, + { + "epoch": 0.39498487462186554, + "grad_norm": 0.004214842338114977, + "learning_rate": 1.5150883589591438e-05, + "loss": 0.0062, + "step": 31598 + }, + { + "epoch": 0.3950098752468812, + "grad_norm": 0.5941234230995178, + "learning_rate": 1.5150135555986323e-05, + "loss": 0.4722, + "step": 31600 + }, + { + "epoch": 0.3950348758718968, + "grad_norm": 8.511209487915039, + "learning_rate": 1.5149387483158498e-05, + "loss": 1.202, + "step": 31602 + }, + { + "epoch": 0.39505987649691243, + "grad_norm": 0.056812532246112823, + "learning_rate": 1.514863937111367e-05, + "loss": 0.9072, + "step": 31604 + }, + { + "epoch": 0.395084877121928, + "grad_norm": 2.0704922676086426, + "learning_rate": 1.5147891219857532e-05, + "loss": 0.893, + "step": 31606 + }, + { + "epoch": 0.39510987774694367, + "grad_norm": 3.4674079418182373, + "learning_rate": 1.514714302939578e-05, + "loss": 1.3504, + "step": 31608 + }, + { + "epoch": 0.3951348783719593, + "grad_norm": 3.591524600982666, + "learning_rate": 1.5146394799734117e-05, + "loss": 0.3543, + "step": 31610 + }, + { + "epoch": 0.3951598789969749, + "grad_norm": 0.3601765036582947, + "learning_rate": 1.514564653087824e-05, + "loss": 0.1367, + "step": 31612 + }, + { + "epoch": 0.39518487962199056, + "grad_norm": 2.4014525413513184, + "learning_rate": 1.5144898222833844e-05, + "loss": 0.7966, + "step": 31614 + }, + { + "epoch": 0.39520988024700615, + "grad_norm": 6.184160232543945, + "learning_rate": 1.514414987560663e-05, + "loss": 0.7865, + "step": 31616 + }, + { + "epoch": 0.3952348808720218, + "grad_norm": 0.4541633129119873, + "learning_rate": 1.51434014892023e-05, + "loss": 0.9204, + "step": 31618 + }, + { + "epoch": 0.39525988149703745, + "grad_norm": 2.3877618312835693, + "learning_rate": 1.5142653063626552e-05, + "loss": 0.0739, + "step": 31620 + }, + { + "epoch": 0.39528488212205304, + "grad_norm": 0.0037461237516254187, + "learning_rate": 1.5141904598885085e-05, + "loss": 0.9753, + "step": 31622 + }, + { + "epoch": 0.3953098827470687, + "grad_norm": 5.059910774230957, + "learning_rate": 1.5141156094983598e-05, + "loss": 1.6695, + "step": 31624 + }, + { + "epoch": 0.3953348833720843, + "grad_norm": 5.958943843841553, + "learning_rate": 1.5140407551927796e-05, + "loss": 1.2766, + "step": 31626 + }, + { + "epoch": 0.3953598839970999, + "grad_norm": 10.651625633239746, + "learning_rate": 1.5139658969723374e-05, + "loss": 0.7439, + "step": 31628 + }, + { + "epoch": 0.3953848846221156, + "grad_norm": 3.8061227798461914, + "learning_rate": 1.5138910348376038e-05, + "loss": 1.4665, + "step": 31630 + }, + { + "epoch": 0.39540988524713117, + "grad_norm": 3.6553235054016113, + "learning_rate": 1.5138161687891488e-05, + "loss": 0.9031, + "step": 31632 + }, + { + "epoch": 0.3954348858721468, + "grad_norm": 3.589301347732544, + "learning_rate": 1.5137412988275423e-05, + "loss": 1.7322, + "step": 31634 + }, + { + "epoch": 0.3954598864971624, + "grad_norm": 2.2641937732696533, + "learning_rate": 1.513666424953355e-05, + "loss": 1.074, + "step": 31636 + }, + { + "epoch": 0.39548488712217805, + "grad_norm": 3.742000102996826, + "learning_rate": 1.513591547167157e-05, + "loss": 0.8645, + "step": 31638 + }, + { + "epoch": 0.3955098877471937, + "grad_norm": 4.145226001739502, + "learning_rate": 1.5135166654695178e-05, + "loss": 0.7487, + "step": 31640 + }, + { + "epoch": 0.3955348883722093, + "grad_norm": 3.1310813426971436, + "learning_rate": 1.5134417798610086e-05, + "loss": 0.3504, + "step": 31642 + }, + { + "epoch": 0.39555988899722494, + "grad_norm": 1.110155701637268, + "learning_rate": 1.5133668903421994e-05, + "loss": 0.8151, + "step": 31644 + }, + { + "epoch": 0.39558488962224053, + "grad_norm": 4.488398551940918, + "learning_rate": 1.5132919969136606e-05, + "loss": 0.7276, + "step": 31646 + }, + { + "epoch": 0.3956098902472562, + "grad_norm": 0.4070165455341339, + "learning_rate": 1.5132170995759626e-05, + "loss": 0.3277, + "step": 31648 + }, + { + "epoch": 0.3956348908722718, + "grad_norm": 4.449004173278809, + "learning_rate": 1.5131421983296757e-05, + "loss": 2.4279, + "step": 31650 + }, + { + "epoch": 0.3956598914972874, + "grad_norm": 2.5651915073394775, + "learning_rate": 1.5130672931753703e-05, + "loss": 0.5172, + "step": 31652 + }, + { + "epoch": 0.39568489212230307, + "grad_norm": 3.9741101264953613, + "learning_rate": 1.5129923841136167e-05, + "loss": 1.5711, + "step": 31654 + }, + { + "epoch": 0.39570989274731866, + "grad_norm": 4.135925769805908, + "learning_rate": 1.512917471144986e-05, + "loss": 1.3852, + "step": 31656 + }, + { + "epoch": 0.3957348933723343, + "grad_norm": 9.005663871765137, + "learning_rate": 1.5128425542700485e-05, + "loss": 0.3808, + "step": 31658 + }, + { + "epoch": 0.39575989399734995, + "grad_norm": 4.263436794281006, + "learning_rate": 1.5127676334893744e-05, + "loss": 1.8915, + "step": 31660 + }, + { + "epoch": 0.39578489462236555, + "grad_norm": 5.0928144454956055, + "learning_rate": 1.5126927088035346e-05, + "loss": 1.1711, + "step": 31662 + }, + { + "epoch": 0.3958098952473812, + "grad_norm": 4.884415149688721, + "learning_rate": 1.5126177802130995e-05, + "loss": 0.3479, + "step": 31664 + }, + { + "epoch": 0.3958348958723968, + "grad_norm": 0.0012117139995098114, + "learning_rate": 1.51254284771864e-05, + "loss": 0.7933, + "step": 31666 + }, + { + "epoch": 0.39585989649741243, + "grad_norm": 9.837736129760742, + "learning_rate": 1.5124679113207265e-05, + "loss": 1.9479, + "step": 31668 + }, + { + "epoch": 0.3958848971224281, + "grad_norm": 3.8855321407318115, + "learning_rate": 1.5123929710199301e-05, + "loss": 0.9737, + "step": 31670 + }, + { + "epoch": 0.3959098977474437, + "grad_norm": 0.017549799755215645, + "learning_rate": 1.5123180268168212e-05, + "loss": 0.8445, + "step": 31672 + }, + { + "epoch": 0.3959348983724593, + "grad_norm": 3.6771950721740723, + "learning_rate": 1.5122430787119706e-05, + "loss": 2.7691, + "step": 31674 + }, + { + "epoch": 0.3959598989974749, + "grad_norm": 3.3852715492248535, + "learning_rate": 1.5121681267059489e-05, + "loss": 0.8341, + "step": 31676 + }, + { + "epoch": 0.39598489962249056, + "grad_norm": 6.76141357421875, + "learning_rate": 1.5120931707993276e-05, + "loss": 1.7398, + "step": 31678 + }, + { + "epoch": 0.3960099002475062, + "grad_norm": 3.6157262325286865, + "learning_rate": 1.512018210992677e-05, + "loss": 1.4854, + "step": 31680 + }, + { + "epoch": 0.3960349008725218, + "grad_norm": 0.0021102677565068007, + "learning_rate": 1.5119432472865678e-05, + "loss": 0.0001, + "step": 31682 + }, + { + "epoch": 0.39605990149753745, + "grad_norm": 3.8857035636901855, + "learning_rate": 1.511868279681572e-05, + "loss": 0.7744, + "step": 31684 + }, + { + "epoch": 0.39608490212255304, + "grad_norm": 8.428062438964844, + "learning_rate": 1.511793308178259e-05, + "loss": 1.854, + "step": 31686 + }, + { + "epoch": 0.3961099027475687, + "grad_norm": 2.6398305892944336, + "learning_rate": 1.511718332777201e-05, + "loss": 1.3077, + "step": 31688 + }, + { + "epoch": 0.39613490337258433, + "grad_norm": 3.4577322006225586, + "learning_rate": 1.5116433534789683e-05, + "loss": 2.1804, + "step": 31690 + }, + { + "epoch": 0.3961599039975999, + "grad_norm": 5.646445274353027, + "learning_rate": 1.511568370284132e-05, + "loss": 1.2141, + "step": 31692 + }, + { + "epoch": 0.3961849046226156, + "grad_norm": 3.0001261234283447, + "learning_rate": 1.5114933831932637e-05, + "loss": 0.558, + "step": 31694 + }, + { + "epoch": 0.39620990524763117, + "grad_norm": 3.7072150707244873, + "learning_rate": 1.5114183922069341e-05, + "loss": 1.6576, + "step": 31696 + }, + { + "epoch": 0.3962349058726468, + "grad_norm": 0.22500360012054443, + "learning_rate": 1.5113433973257143e-05, + "loss": 0.182, + "step": 31698 + }, + { + "epoch": 0.39625990649766246, + "grad_norm": 6.183840274810791, + "learning_rate": 1.5112683985501752e-05, + "loss": 1.4821, + "step": 31700 + }, + { + "epoch": 0.39628490712267805, + "grad_norm": 0.8779661655426025, + "learning_rate": 1.5111933958808886e-05, + "loss": 0.2654, + "step": 31702 + }, + { + "epoch": 0.3963099077476937, + "grad_norm": 2.091761589050293, + "learning_rate": 1.5111183893184254e-05, + "loss": 0.854, + "step": 31704 + }, + { + "epoch": 0.3963349083727093, + "grad_norm": 2.058593988418579, + "learning_rate": 1.5110433788633565e-05, + "loss": 0.1754, + "step": 31706 + }, + { + "epoch": 0.39635990899772494, + "grad_norm": 3.5973472595214844, + "learning_rate": 1.5109683645162537e-05, + "loss": 1.4974, + "step": 31708 + }, + { + "epoch": 0.3963849096227406, + "grad_norm": 5.82743501663208, + "learning_rate": 1.5108933462776882e-05, + "loss": 1.8564, + "step": 31710 + }, + { + "epoch": 0.3964099102477562, + "grad_norm": 5.033740997314453, + "learning_rate": 1.5108183241482311e-05, + "loss": 1.6046, + "step": 31712 + }, + { + "epoch": 0.39643491087277183, + "grad_norm": 3.906158924102783, + "learning_rate": 1.5107432981284538e-05, + "loss": 1.8638, + "step": 31714 + }, + { + "epoch": 0.3964599114977874, + "grad_norm": 4.837580680847168, + "learning_rate": 1.510668268218928e-05, + "loss": 1.5902, + "step": 31716 + }, + { + "epoch": 0.39648491212280307, + "grad_norm": 5.719967365264893, + "learning_rate": 1.5105932344202248e-05, + "loss": 1.6703, + "step": 31718 + }, + { + "epoch": 0.3965099127478187, + "grad_norm": 3.5120131969451904, + "learning_rate": 1.5105181967329156e-05, + "loss": 0.0651, + "step": 31720 + }, + { + "epoch": 0.3965349133728343, + "grad_norm": 0.0026863105595111847, + "learning_rate": 1.510443155157572e-05, + "loss": 0.0002, + "step": 31722 + }, + { + "epoch": 0.39655991399784996, + "grad_norm": 3.155050754547119, + "learning_rate": 1.5103681096947658e-05, + "loss": 1.6137, + "step": 31724 + }, + { + "epoch": 0.39658491462286555, + "grad_norm": 0.29516568779945374, + "learning_rate": 1.510293060345068e-05, + "loss": 0.7265, + "step": 31726 + }, + { + "epoch": 0.3966099152478812, + "grad_norm": 5.276038646697998, + "learning_rate": 1.5102180071090504e-05, + "loss": 1.0886, + "step": 31728 + }, + { + "epoch": 0.39663491587289684, + "grad_norm": 2.0765786170959473, + "learning_rate": 1.5101429499872847e-05, + "loss": 0.8338, + "step": 31730 + }, + { + "epoch": 0.39665991649791243, + "grad_norm": 0.002408035797998309, + "learning_rate": 1.5100678889803424e-05, + "loss": 1.5442, + "step": 31732 + }, + { + "epoch": 0.3966849171229281, + "grad_norm": 0.06713058054447174, + "learning_rate": 1.5099928240887951e-05, + "loss": 0.7229, + "step": 31734 + }, + { + "epoch": 0.3967099177479437, + "grad_norm": 6.293313026428223, + "learning_rate": 1.5099177553132149e-05, + "loss": 0.7864, + "step": 31736 + }, + { + "epoch": 0.3967349183729593, + "grad_norm": 2.0796732902526855, + "learning_rate": 1.5098426826541728e-05, + "loss": 0.0623, + "step": 31738 + }, + { + "epoch": 0.39675991899797497, + "grad_norm": 11.705398559570312, + "learning_rate": 1.5097676061122411e-05, + "loss": 1.5627, + "step": 31740 + }, + { + "epoch": 0.39678491962299056, + "grad_norm": 5.021740436553955, + "learning_rate": 1.5096925256879913e-05, + "loss": 1.125, + "step": 31742 + }, + { + "epoch": 0.3968099202480062, + "grad_norm": 0.002261303598061204, + "learning_rate": 1.5096174413819956e-05, + "loss": 1.1039, + "step": 31744 + }, + { + "epoch": 0.3968349208730218, + "grad_norm": 3.2741012573242188, + "learning_rate": 1.5095423531948252e-05, + "loss": 0.5575, + "step": 31746 + }, + { + "epoch": 0.39685992149803745, + "grad_norm": 2.9972288608551025, + "learning_rate": 1.5094672611270525e-05, + "loss": 1.3139, + "step": 31748 + }, + { + "epoch": 0.3968849221230531, + "grad_norm": 0.0015965558122843504, + "learning_rate": 1.5093921651792492e-05, + "loss": 0.0001, + "step": 31750 + }, + { + "epoch": 0.3969099227480687, + "grad_norm": 2.6838016510009766, + "learning_rate": 1.5093170653519869e-05, + "loss": 0.4942, + "step": 31752 + }, + { + "epoch": 0.39693492337308434, + "grad_norm": 0.028415149077773094, + "learning_rate": 1.509241961645838e-05, + "loss": 0.4348, + "step": 31754 + }, + { + "epoch": 0.39695992399809993, + "grad_norm": 0.8898085355758667, + "learning_rate": 1.5091668540613744e-05, + "loss": 0.3822, + "step": 31756 + }, + { + "epoch": 0.3969849246231156, + "grad_norm": 2.9822041988372803, + "learning_rate": 1.509091742599168e-05, + "loss": 0.6583, + "step": 31758 + }, + { + "epoch": 0.3970099252481312, + "grad_norm": 3.364435911178589, + "learning_rate": 1.5090166272597908e-05, + "loss": 1.6087, + "step": 31760 + }, + { + "epoch": 0.3970349258731468, + "grad_norm": 0.001197735546156764, + "learning_rate": 1.5089415080438153e-05, + "loss": 0.0, + "step": 31762 + }, + { + "epoch": 0.39705992649816246, + "grad_norm": 0.0029581852722913027, + "learning_rate": 1.508866384951813e-05, + "loss": 0.1602, + "step": 31764 + }, + { + "epoch": 0.39708492712317806, + "grad_norm": 0.8040655851364136, + "learning_rate": 1.5087912579843563e-05, + "loss": 0.5487, + "step": 31766 + }, + { + "epoch": 0.3971099277481937, + "grad_norm": 3.8213367462158203, + "learning_rate": 1.5087161271420171e-05, + "loss": 1.405, + "step": 31768 + }, + { + "epoch": 0.39713492837320935, + "grad_norm": 2.615344762802124, + "learning_rate": 1.5086409924253681e-05, + "loss": 0.7555, + "step": 31770 + }, + { + "epoch": 0.39715992899822494, + "grad_norm": 3.1931540966033936, + "learning_rate": 1.5085658538349811e-05, + "loss": 1.2335, + "step": 31772 + }, + { + "epoch": 0.3971849296232406, + "grad_norm": 0.7973222732543945, + "learning_rate": 1.5084907113714286e-05, + "loss": 0.0177, + "step": 31774 + }, + { + "epoch": 0.3972099302482562, + "grad_norm": 4.045102596282959, + "learning_rate": 1.508415565035283e-05, + "loss": 0.6833, + "step": 31776 + }, + { + "epoch": 0.39723493087327183, + "grad_norm": 8.454730033874512, + "learning_rate": 1.5083404148271159e-05, + "loss": 2.7192, + "step": 31778 + }, + { + "epoch": 0.3972599314982875, + "grad_norm": 0.0014468074077740312, + "learning_rate": 1.5082652607474998e-05, + "loss": 0.497, + "step": 31780 + }, + { + "epoch": 0.39728493212330307, + "grad_norm": 4.208024024963379, + "learning_rate": 1.5081901027970081e-05, + "loss": 1.1436, + "step": 31782 + }, + { + "epoch": 0.3973099327483187, + "grad_norm": 0.0016315391985699534, + "learning_rate": 1.5081149409762122e-05, + "loss": 0.0001, + "step": 31784 + }, + { + "epoch": 0.3973349333733343, + "grad_norm": 4.691291332244873, + "learning_rate": 1.5080397752856843e-05, + "loss": 1.4421, + "step": 31786 + }, + { + "epoch": 0.39735993399834996, + "grad_norm": 1.5561342239379883, + "learning_rate": 1.507964605725998e-05, + "loss": 0.9394, + "step": 31788 + }, + { + "epoch": 0.3973849346233656, + "grad_norm": 0.0011552406940609217, + "learning_rate": 1.5078894322977248e-05, + "loss": 1.2817, + "step": 31790 + }, + { + "epoch": 0.3974099352483812, + "grad_norm": 5.356195449829102, + "learning_rate": 1.5078142550014372e-05, + "loss": 1.0196, + "step": 31792 + }, + { + "epoch": 0.39743493587339684, + "grad_norm": 0.0015210096025839448, + "learning_rate": 1.5077390738377084e-05, + "loss": 0.6913, + "step": 31794 + }, + { + "epoch": 0.39745993649841244, + "grad_norm": 0.3465626537799835, + "learning_rate": 1.5076638888071107e-05, + "loss": 0.0104, + "step": 31796 + }, + { + "epoch": 0.3974849371234281, + "grad_norm": 0.6262550354003906, + "learning_rate": 1.5075886999102162e-05, + "loss": 0.0123, + "step": 31798 + }, + { + "epoch": 0.39750993774844373, + "grad_norm": 5.892071723937988, + "learning_rate": 1.5075135071475984e-05, + "loss": 1.8427, + "step": 31800 + }, + { + "epoch": 0.3975349383734593, + "grad_norm": 3.8670754432678223, + "learning_rate": 1.5074383105198293e-05, + "loss": 1.386, + "step": 31802 + }, + { + "epoch": 0.39755993899847497, + "grad_norm": 2.1804349422454834, + "learning_rate": 1.5073631100274817e-05, + "loss": 0.5306, + "step": 31804 + }, + { + "epoch": 0.39758493962349056, + "grad_norm": 0.7854501008987427, + "learning_rate": 1.5072879056711282e-05, + "loss": 0.5735, + "step": 31806 + }, + { + "epoch": 0.3976099402485062, + "grad_norm": 6.729971885681152, + "learning_rate": 1.5072126974513424e-05, + "loss": 1.3486, + "step": 31808 + }, + { + "epoch": 0.39763494087352186, + "grad_norm": 3.5155699253082275, + "learning_rate": 1.5071374853686962e-05, + "loss": 1.8365, + "step": 31810 + }, + { + "epoch": 0.39765994149853745, + "grad_norm": 2.9586029052734375, + "learning_rate": 1.5070622694237624e-05, + "loss": 0.685, + "step": 31812 + }, + { + "epoch": 0.3976849421235531, + "grad_norm": 0.9497914910316467, + "learning_rate": 1.5069870496171142e-05, + "loss": 0.0331, + "step": 31814 + }, + { + "epoch": 0.3977099427485687, + "grad_norm": 7.357766628265381, + "learning_rate": 1.5069118259493241e-05, + "loss": 1.2082, + "step": 31816 + }, + { + "epoch": 0.39773494337358434, + "grad_norm": 0.7958975434303284, + "learning_rate": 1.5068365984209652e-05, + "loss": 0.0143, + "step": 31818 + }, + { + "epoch": 0.3977599439986, + "grad_norm": 5.256061553955078, + "learning_rate": 1.506761367032611e-05, + "loss": 1.326, + "step": 31820 + }, + { + "epoch": 0.3977849446236156, + "grad_norm": 1.769444227218628, + "learning_rate": 1.5066861317848334e-05, + "loss": 0.7121, + "step": 31822 + }, + { + "epoch": 0.3978099452486312, + "grad_norm": 0.0011873847106471658, + "learning_rate": 1.506610892678206e-05, + "loss": 0.3161, + "step": 31824 + }, + { + "epoch": 0.3978349458736468, + "grad_norm": 5.306028366088867, + "learning_rate": 1.5065356497133015e-05, + "loss": 1.5644, + "step": 31826 + }, + { + "epoch": 0.39785994649866246, + "grad_norm": 3.180077314376831, + "learning_rate": 1.5064604028906935e-05, + "loss": 0.4996, + "step": 31828 + }, + { + "epoch": 0.3978849471236781, + "grad_norm": 0.6662534475326538, + "learning_rate": 1.5063851522109543e-05, + "loss": 0.1003, + "step": 31830 + }, + { + "epoch": 0.3979099477486937, + "grad_norm": 2.954376459121704, + "learning_rate": 1.5063098976746574e-05, + "loss": 0.7505, + "step": 31832 + }, + { + "epoch": 0.39793494837370935, + "grad_norm": 0.006421675439924002, + "learning_rate": 1.5062346392823763e-05, + "loss": 0.5218, + "step": 31834 + }, + { + "epoch": 0.39795994899872494, + "grad_norm": 0.0018332810141146183, + "learning_rate": 1.5061593770346835e-05, + "loss": 0.3232, + "step": 31836 + }, + { + "epoch": 0.3979849496237406, + "grad_norm": 4.871630668640137, + "learning_rate": 1.5060841109321527e-05, + "loss": 1.4987, + "step": 31838 + }, + { + "epoch": 0.39800995024875624, + "grad_norm": 0.7258780598640442, + "learning_rate": 1.5060088409753568e-05, + "loss": 0.0459, + "step": 31840 + }, + { + "epoch": 0.39803495087377183, + "grad_norm": 2.8979458808898926, + "learning_rate": 1.5059335671648688e-05, + "loss": 1.9482, + "step": 31842 + }, + { + "epoch": 0.3980599514987875, + "grad_norm": 0.754555881023407, + "learning_rate": 1.5058582895012626e-05, + "loss": 0.0332, + "step": 31844 + }, + { + "epoch": 0.39808495212380307, + "grad_norm": 2.912518262863159, + "learning_rate": 1.5057830079851113e-05, + "loss": 0.5636, + "step": 31846 + }, + { + "epoch": 0.3981099527488187, + "grad_norm": 2.5174219608306885, + "learning_rate": 1.5057077226169878e-05, + "loss": 0.727, + "step": 31848 + }, + { + "epoch": 0.39813495337383437, + "grad_norm": 2.0133087635040283, + "learning_rate": 1.5056324333974658e-05, + "loss": 0.3949, + "step": 31850 + }, + { + "epoch": 0.39815995399884996, + "grad_norm": 3.2244465351104736, + "learning_rate": 1.505557140327119e-05, + "loss": 0.4565, + "step": 31852 + }, + { + "epoch": 0.3981849546238656, + "grad_norm": 1.9417978525161743, + "learning_rate": 1.5054818434065203e-05, + "loss": 1.1299, + "step": 31854 + }, + { + "epoch": 0.3982099552488812, + "grad_norm": 3.82560133934021, + "learning_rate": 1.5054065426362435e-05, + "loss": 0.8071, + "step": 31856 + }, + { + "epoch": 0.39823495587389685, + "grad_norm": 3.8045084476470947, + "learning_rate": 1.5053312380168619e-05, + "loss": 1.3591, + "step": 31858 + }, + { + "epoch": 0.3982599564989125, + "grad_norm": 3.731229066848755, + "learning_rate": 1.505255929548949e-05, + "loss": 0.824, + "step": 31860 + }, + { + "epoch": 0.3982849571239281, + "grad_norm": 3.7843291759490967, + "learning_rate": 1.5051806172330783e-05, + "loss": 0.5905, + "step": 31862 + }, + { + "epoch": 0.39830995774894373, + "grad_norm": 3.073003053665161, + "learning_rate": 1.5051053010698238e-05, + "loss": 0.2076, + "step": 31864 + }, + { + "epoch": 0.3983349583739593, + "grad_norm": 5.211507797241211, + "learning_rate": 1.5050299810597584e-05, + "loss": 1.6089, + "step": 31866 + }, + { + "epoch": 0.398359958998975, + "grad_norm": 5.887419700622559, + "learning_rate": 1.5049546572034561e-05, + "loss": 0.4261, + "step": 31868 + }, + { + "epoch": 0.3983849596239906, + "grad_norm": 3.6451618671417236, + "learning_rate": 1.5048793295014906e-05, + "loss": 1.0316, + "step": 31870 + }, + { + "epoch": 0.3984099602490062, + "grad_norm": 9.195359230041504, + "learning_rate": 1.5048039979544356e-05, + "loss": 1.8575, + "step": 31872 + }, + { + "epoch": 0.39843496087402186, + "grad_norm": 2.687373161315918, + "learning_rate": 1.5047286625628648e-05, + "loss": 2.1132, + "step": 31874 + }, + { + "epoch": 0.39845996149903745, + "grad_norm": 4.453884124755859, + "learning_rate": 1.5046533233273518e-05, + "loss": 0.2541, + "step": 31876 + }, + { + "epoch": 0.3984849621240531, + "grad_norm": 2.9117109775543213, + "learning_rate": 1.5045779802484703e-05, + "loss": 0.6907, + "step": 31878 + }, + { + "epoch": 0.39850996274906875, + "grad_norm": 4.512272357940674, + "learning_rate": 1.5045026333267944e-05, + "loss": 1.2951, + "step": 31880 + }, + { + "epoch": 0.39853496337408434, + "grad_norm": 4.238558769226074, + "learning_rate": 1.5044272825628978e-05, + "loss": 1.5002, + "step": 31882 + }, + { + "epoch": 0.3985599639991, + "grad_norm": 0.0007350968080572784, + "learning_rate": 1.5043519279573544e-05, + "loss": 1.2592, + "step": 31884 + }, + { + "epoch": 0.3985849646241156, + "grad_norm": 2.6041419506073, + "learning_rate": 1.5042765695107382e-05, + "loss": 0.1871, + "step": 31886 + }, + { + "epoch": 0.3986099652491312, + "grad_norm": 5.4579758644104, + "learning_rate": 1.5042012072236226e-05, + "loss": 1.7368, + "step": 31888 + }, + { + "epoch": 0.3986349658741469, + "grad_norm": 2.674410343170166, + "learning_rate": 1.504125841096582e-05, + "loss": 0.7265, + "step": 31890 + }, + { + "epoch": 0.39865996649916247, + "grad_norm": 9.988726615905762, + "learning_rate": 1.5040504711301905e-05, + "loss": 0.8475, + "step": 31892 + }, + { + "epoch": 0.3986849671241781, + "grad_norm": 0.15589120984077454, + "learning_rate": 1.5039750973250216e-05, + "loss": 0.4003, + "step": 31894 + }, + { + "epoch": 0.3987099677491937, + "grad_norm": 2.828023672103882, + "learning_rate": 1.5038997196816496e-05, + "loss": 1.1778, + "step": 31896 + }, + { + "epoch": 0.39873496837420935, + "grad_norm": 2.992670774459839, + "learning_rate": 1.5038243382006488e-05, + "loss": 1.0798, + "step": 31898 + }, + { + "epoch": 0.398759968999225, + "grad_norm": 0.011675309389829636, + "learning_rate": 1.503748952882593e-05, + "loss": 0.0002, + "step": 31900 + }, + { + "epoch": 0.3987849696242406, + "grad_norm": 3.4210293292999268, + "learning_rate": 1.5036735637280564e-05, + "loss": 1.4343, + "step": 31902 + }, + { + "epoch": 0.39880997024925624, + "grad_norm": 2.214665174484253, + "learning_rate": 1.5035981707376133e-05, + "loss": 0.999, + "step": 31904 + }, + { + "epoch": 0.39883497087427183, + "grad_norm": 1.6055606603622437, + "learning_rate": 1.5035227739118377e-05, + "loss": 1.3869, + "step": 31906 + }, + { + "epoch": 0.3988599714992875, + "grad_norm": 3.783754348754883, + "learning_rate": 1.5034473732513037e-05, + "loss": 1.0537, + "step": 31908 + }, + { + "epoch": 0.3988849721243031, + "grad_norm": 3.1845905780792236, + "learning_rate": 1.5033719687565858e-05, + "loss": 1.1296, + "step": 31910 + }, + { + "epoch": 0.3989099727493187, + "grad_norm": 0.001089605619199574, + "learning_rate": 1.5032965604282584e-05, + "loss": 0.774, + "step": 31912 + }, + { + "epoch": 0.39893497337433437, + "grad_norm": 5.1779680252075195, + "learning_rate": 1.5032211482668952e-05, + "loss": 0.8627, + "step": 31914 + }, + { + "epoch": 0.39895997399934996, + "grad_norm": 0.0009865107713267207, + "learning_rate": 1.5031457322730709e-05, + "loss": 0.1844, + "step": 31916 + }, + { + "epoch": 0.3989849746243656, + "grad_norm": 3.283560276031494, + "learning_rate": 1.50307031244736e-05, + "loss": 0.6377, + "step": 31918 + }, + { + "epoch": 0.39900997524938125, + "grad_norm": 1.7904750108718872, + "learning_rate": 1.5029948887903367e-05, + "loss": 1.212, + "step": 31920 + }, + { + "epoch": 0.39903497587439685, + "grad_norm": 2.610769271850586, + "learning_rate": 1.5029194613025756e-05, + "loss": 1.1838, + "step": 31922 + }, + { + "epoch": 0.3990599764994125, + "grad_norm": 3.9397075176239014, + "learning_rate": 1.5028440299846507e-05, + "loss": 0.9732, + "step": 31924 + }, + { + "epoch": 0.3990849771244281, + "grad_norm": 4.358706951141357, + "learning_rate": 1.5027685948371372e-05, + "loss": 0.6776, + "step": 31926 + }, + { + "epoch": 0.39910997774944373, + "grad_norm": 0.0035527320578694344, + "learning_rate": 1.5026931558606089e-05, + "loss": 0.7397, + "step": 31928 + }, + { + "epoch": 0.3991349783744594, + "grad_norm": 0.0006637428305111825, + "learning_rate": 1.5026177130556406e-05, + "loss": 0.0, + "step": 31930 + }, + { + "epoch": 0.399159978999475, + "grad_norm": 5.301365852355957, + "learning_rate": 1.5025422664228069e-05, + "loss": 1.0073, + "step": 31932 + }, + { + "epoch": 0.3991849796244906, + "grad_norm": 3.4679927825927734, + "learning_rate": 1.5024668159626823e-05, + "loss": 1.9962, + "step": 31934 + }, + { + "epoch": 0.3992099802495062, + "grad_norm": 5.870053291320801, + "learning_rate": 1.5023913616758417e-05, + "loss": 1.8066, + "step": 31936 + }, + { + "epoch": 0.39923498087452186, + "grad_norm": 2.9101200103759766, + "learning_rate": 1.5023159035628596e-05, + "loss": 0.6412, + "step": 31938 + }, + { + "epoch": 0.3992599814995375, + "grad_norm": 3.00931715965271, + "learning_rate": 1.5022404416243101e-05, + "loss": 0.33, + "step": 31940 + }, + { + "epoch": 0.3992849821245531, + "grad_norm": 0.003453470068052411, + "learning_rate": 1.5021649758607687e-05, + "loss": 0.8752, + "step": 31942 + }, + { + "epoch": 0.39930998274956875, + "grad_norm": 0.3736729919910431, + "learning_rate": 1.50208950627281e-05, + "loss": 1.1476, + "step": 31944 + }, + { + "epoch": 0.39933498337458434, + "grad_norm": 1.090531587600708, + "learning_rate": 1.5020140328610086e-05, + "loss": 0.072, + "step": 31946 + }, + { + "epoch": 0.3993599839996, + "grad_norm": 2.6704823970794678, + "learning_rate": 1.5019385556259392e-05, + "loss": 0.963, + "step": 31948 + }, + { + "epoch": 0.39938498462461564, + "grad_norm": 4.30007266998291, + "learning_rate": 1.5018630745681768e-05, + "loss": 1.5917, + "step": 31950 + }, + { + "epoch": 0.3994099852496312, + "grad_norm": 2.6866989135742188, + "learning_rate": 1.5017875896882963e-05, + "loss": 1.2334, + "step": 31952 + }, + { + "epoch": 0.3994349858746469, + "grad_norm": 0.9248254895210266, + "learning_rate": 1.5017121009868723e-05, + "loss": 0.1125, + "step": 31954 + }, + { + "epoch": 0.39945998649966247, + "grad_norm": 0.0013873501447960734, + "learning_rate": 1.5016366084644798e-05, + "loss": 1.6704, + "step": 31956 + }, + { + "epoch": 0.3994849871246781, + "grad_norm": 4.266753673553467, + "learning_rate": 1.501561112121694e-05, + "loss": 0.8316, + "step": 31958 + }, + { + "epoch": 0.39950998774969376, + "grad_norm": 1.4240667819976807, + "learning_rate": 1.5014856119590895e-05, + "loss": 0.7998, + "step": 31960 + }, + { + "epoch": 0.39953498837470935, + "grad_norm": 2.5448713302612305, + "learning_rate": 1.5014101079772418e-05, + "loss": 0.8012, + "step": 31962 + }, + { + "epoch": 0.399559988999725, + "grad_norm": 2.832451581954956, + "learning_rate": 1.5013346001767258e-05, + "loss": 1.1712, + "step": 31964 + }, + { + "epoch": 0.3995849896247406, + "grad_norm": 3.6043758392333984, + "learning_rate": 1.5012590885581158e-05, + "loss": 1.5265, + "step": 31966 + }, + { + "epoch": 0.39960999024975624, + "grad_norm": 2.4081039428710938, + "learning_rate": 1.5011835731219879e-05, + "loss": 1.0687, + "step": 31968 + }, + { + "epoch": 0.3996349908747719, + "grad_norm": 0.0010103628737851977, + "learning_rate": 1.5011080538689164e-05, + "loss": 0.0001, + "step": 31970 + }, + { + "epoch": 0.3996599914997875, + "grad_norm": 3.9451043605804443, + "learning_rate": 1.501032530799477e-05, + "loss": 1.396, + "step": 31972 + }, + { + "epoch": 0.39968499212480313, + "grad_norm": 3.9626657962799072, + "learning_rate": 1.5009570039142447e-05, + "loss": 0.6319, + "step": 31974 + }, + { + "epoch": 0.3997099927498187, + "grad_norm": 2.771003484725952, + "learning_rate": 1.5008814732137947e-05, + "loss": 0.569, + "step": 31976 + }, + { + "epoch": 0.39973499337483437, + "grad_norm": 0.0012431705836206675, + "learning_rate": 1.5008059386987027e-05, + "loss": 0.643, + "step": 31978 + }, + { + "epoch": 0.39975999399985, + "grad_norm": 4.340770721435547, + "learning_rate": 1.5007304003695429e-05, + "loss": 1.65, + "step": 31980 + }, + { + "epoch": 0.3997849946248656, + "grad_norm": 6.291293144226074, + "learning_rate": 1.5006548582268909e-05, + "loss": 1.525, + "step": 31982 + }, + { + "epoch": 0.39980999524988126, + "grad_norm": 1.26524817943573, + "learning_rate": 1.500579312271323e-05, + "loss": 0.0763, + "step": 31984 + }, + { + "epoch": 0.39983499587489685, + "grad_norm": 2.8737494945526123, + "learning_rate": 1.5005037625034135e-05, + "loss": 0.2566, + "step": 31986 + }, + { + "epoch": 0.3998599964999125, + "grad_norm": 2.718712329864502, + "learning_rate": 1.5004282089237383e-05, + "loss": 1.3371, + "step": 31988 + }, + { + "epoch": 0.39988499712492814, + "grad_norm": 0.0012097973376512527, + "learning_rate": 1.5003526515328725e-05, + "loss": 0.12, + "step": 31990 + }, + { + "epoch": 0.39990999774994374, + "grad_norm": 2.8088834285736084, + "learning_rate": 1.5002770903313919e-05, + "loss": 0.7232, + "step": 31992 + }, + { + "epoch": 0.3999349983749594, + "grad_norm": 3.8924245834350586, + "learning_rate": 1.5002015253198711e-05, + "loss": 0.6819, + "step": 31994 + }, + { + "epoch": 0.399959998999975, + "grad_norm": 1.8657615184783936, + "learning_rate": 1.5001259564988866e-05, + "loss": 0.0673, + "step": 31996 + }, + { + "epoch": 0.3999849996249906, + "grad_norm": 3.3772590160369873, + "learning_rate": 1.500050383869014e-05, + "loss": 1.477, + "step": 31998 + }, + { + "epoch": 0.40001000025000627, + "grad_norm": 1.425752878189087, + "learning_rate": 1.4999748074308279e-05, + "loss": 0.076, + "step": 32000 + }, + { + "epoch": 0.40003500087502186, + "grad_norm": 4.681122303009033, + "learning_rate": 1.4998992271849042e-05, + "loss": 1.4284, + "step": 32002 + }, + { + "epoch": 0.4000600015000375, + "grad_norm": 1.5581047534942627, + "learning_rate": 1.499823643131819e-05, + "loss": 0.0616, + "step": 32004 + }, + { + "epoch": 0.4000850021250531, + "grad_norm": 5.034612655639648, + "learning_rate": 1.4997480552721475e-05, + "loss": 1.3151, + "step": 32006 + }, + { + "epoch": 0.40011000275006875, + "grad_norm": 0.0036548462230712175, + "learning_rate": 1.4996724636064654e-05, + "loss": 0.484, + "step": 32008 + }, + { + "epoch": 0.4001350033750844, + "grad_norm": 5.533017158508301, + "learning_rate": 1.4995968681353488e-05, + "loss": 0.5687, + "step": 32010 + }, + { + "epoch": 0.4001600040001, + "grad_norm": 2.445333957672119, + "learning_rate": 1.4995212688593728e-05, + "loss": 0.7824, + "step": 32012 + }, + { + "epoch": 0.40018500462511564, + "grad_norm": 2.4561352729797363, + "learning_rate": 1.4994456657791135e-05, + "loss": 0.6182, + "step": 32014 + }, + { + "epoch": 0.40021000525013123, + "grad_norm": 3.04988694190979, + "learning_rate": 1.499370058895147e-05, + "loss": 0.8555, + "step": 32016 + }, + { + "epoch": 0.4002350058751469, + "grad_norm": 1.1650571823120117, + "learning_rate": 1.4992944482080481e-05, + "loss": 0.3753, + "step": 32018 + }, + { + "epoch": 0.4002600065001625, + "grad_norm": 3.0888149738311768, + "learning_rate": 1.4992188337183935e-05, + "loss": 1.5395, + "step": 32020 + }, + { + "epoch": 0.4002850071251781, + "grad_norm": 0.43432366847991943, + "learning_rate": 1.4991432154267591e-05, + "loss": 0.711, + "step": 32022 + }, + { + "epoch": 0.40031000775019376, + "grad_norm": 7.554622173309326, + "learning_rate": 1.4990675933337206e-05, + "loss": 1.9787, + "step": 32024 + }, + { + "epoch": 0.40033500837520936, + "grad_norm": 5.037791728973389, + "learning_rate": 1.4989919674398536e-05, + "loss": 1.0783, + "step": 32026 + }, + { + "epoch": 0.400360009000225, + "grad_norm": 3.7553863525390625, + "learning_rate": 1.4989163377457345e-05, + "loss": 0.6864, + "step": 32028 + }, + { + "epoch": 0.40038500962524065, + "grad_norm": 0.7719022631645203, + "learning_rate": 1.4988407042519393e-05, + "loss": 0.8271, + "step": 32030 + }, + { + "epoch": 0.40041001025025624, + "grad_norm": 1.2519798278808594, + "learning_rate": 1.4987650669590435e-05, + "loss": 0.7182, + "step": 32032 + }, + { + "epoch": 0.4004350108752719, + "grad_norm": 3.879399299621582, + "learning_rate": 1.4986894258676234e-05, + "loss": 0.9126, + "step": 32034 + }, + { + "epoch": 0.4004600115002875, + "grad_norm": 4.6325860023498535, + "learning_rate": 1.4986137809782556e-05, + "loss": 1.393, + "step": 32036 + }, + { + "epoch": 0.40048501212530313, + "grad_norm": 4.16285514831543, + "learning_rate": 1.4985381322915153e-05, + "loss": 0.7268, + "step": 32038 + }, + { + "epoch": 0.4005100127503188, + "grad_norm": 0.0012474870309233665, + "learning_rate": 1.4984624798079794e-05, + "loss": 0.0354, + "step": 32040 + }, + { + "epoch": 0.40053501337533437, + "grad_norm": 3.1168298721313477, + "learning_rate": 1.4983868235282237e-05, + "loss": 0.5426, + "step": 32042 + }, + { + "epoch": 0.40056001400035, + "grad_norm": 5.929989337921143, + "learning_rate": 1.4983111634528243e-05, + "loss": 0.9584, + "step": 32044 + }, + { + "epoch": 0.4005850146253656, + "grad_norm": 4.604206562042236, + "learning_rate": 1.4982354995823576e-05, + "loss": 0.9571, + "step": 32046 + }, + { + "epoch": 0.40061001525038126, + "grad_norm": 2.967775821685791, + "learning_rate": 1.4981598319173999e-05, + "loss": 0.5073, + "step": 32048 + }, + { + "epoch": 0.4006350158753969, + "grad_norm": 2.8431289196014404, + "learning_rate": 1.4980841604585273e-05, + "loss": 0.7313, + "step": 32050 + }, + { + "epoch": 0.4006600165004125, + "grad_norm": 0.0008688094094395638, + "learning_rate": 1.4980084852063161e-05, + "loss": 0.5642, + "step": 32052 + }, + { + "epoch": 0.40068501712542814, + "grad_norm": 0.002475764835253358, + "learning_rate": 1.4979328061613428e-05, + "loss": 0.6293, + "step": 32054 + }, + { + "epoch": 0.40071001775044374, + "grad_norm": 3.060940980911255, + "learning_rate": 1.4978571233241835e-05, + "loss": 0.1766, + "step": 32056 + }, + { + "epoch": 0.4007350183754594, + "grad_norm": 2.6566121578216553, + "learning_rate": 1.4977814366954149e-05, + "loss": 1.6313, + "step": 32058 + }, + { + "epoch": 0.40076001900047503, + "grad_norm": 4.388610363006592, + "learning_rate": 1.497705746275613e-05, + "loss": 1.4093, + "step": 32060 + }, + { + "epoch": 0.4007850196254906, + "grad_norm": 1.699800968170166, + "learning_rate": 1.4976300520653549e-05, + "loss": 1.4624, + "step": 32062 + }, + { + "epoch": 0.40081002025050627, + "grad_norm": 0.0009386041201651096, + "learning_rate": 1.4975543540652167e-05, + "loss": 0.2087, + "step": 32064 + }, + { + "epoch": 0.40083502087552186, + "grad_norm": 3.9332661628723145, + "learning_rate": 1.4974786522757745e-05, + "loss": 1.1743, + "step": 32066 + }, + { + "epoch": 0.4008600215005375, + "grad_norm": 3.0242931842803955, + "learning_rate": 1.4974029466976054e-05, + "loss": 0.2824, + "step": 32068 + }, + { + "epoch": 0.40088502212555316, + "grad_norm": 3.6858160495758057, + "learning_rate": 1.4973272373312859e-05, + "loss": 1.1055, + "step": 32070 + }, + { + "epoch": 0.40091002275056875, + "grad_norm": 0.0012262456584721804, + "learning_rate": 1.4972515241773925e-05, + "loss": 0.3509, + "step": 32072 + }, + { + "epoch": 0.4009350233755844, + "grad_norm": 1.1271580457687378, + "learning_rate": 1.4971758072365016e-05, + "loss": 1.1814, + "step": 32074 + }, + { + "epoch": 0.4009600240006, + "grad_norm": 4.672534942626953, + "learning_rate": 1.4971000865091904e-05, + "loss": 0.4011, + "step": 32076 + }, + { + "epoch": 0.40098502462561564, + "grad_norm": 2.126310110092163, + "learning_rate": 1.4970243619960352e-05, + "loss": 1.1648, + "step": 32078 + }, + { + "epoch": 0.4010100252506313, + "grad_norm": 4.182097911834717, + "learning_rate": 1.4969486336976125e-05, + "loss": 1.6223, + "step": 32080 + }, + { + "epoch": 0.4010350258756469, + "grad_norm": 8.776619911193848, + "learning_rate": 1.4968729016144994e-05, + "loss": 0.825, + "step": 32082 + }, + { + "epoch": 0.4010600265006625, + "grad_norm": 5.662533760070801, + "learning_rate": 1.4967971657472727e-05, + "loss": 0.9945, + "step": 32084 + }, + { + "epoch": 0.4010850271256781, + "grad_norm": 1.9522536993026733, + "learning_rate": 1.4967214260965089e-05, + "loss": 0.7694, + "step": 32086 + }, + { + "epoch": 0.40111002775069377, + "grad_norm": 1.859866976737976, + "learning_rate": 1.4966456826627848e-05, + "loss": 0.7724, + "step": 32088 + }, + { + "epoch": 0.4011350283757094, + "grad_norm": 3.245910167694092, + "learning_rate": 1.4965699354466777e-05, + "loss": 0.8475, + "step": 32090 + }, + { + "epoch": 0.401160029000725, + "grad_norm": 0.007450541947036982, + "learning_rate": 1.4964941844487639e-05, + "loss": 1.0379, + "step": 32092 + }, + { + "epoch": 0.40118502962574065, + "grad_norm": 0.0004821992770303041, + "learning_rate": 1.4964184296696206e-05, + "loss": 0.0197, + "step": 32094 + }, + { + "epoch": 0.40121003025075624, + "grad_norm": 2.89650821685791, + "learning_rate": 1.496342671109825e-05, + "loss": 0.4345, + "step": 32096 + }, + { + "epoch": 0.4012350308757719, + "grad_norm": 2.229212999343872, + "learning_rate": 1.4962669087699537e-05, + "loss": 0.228, + "step": 32098 + }, + { + "epoch": 0.40126003150078754, + "grad_norm": 3.060100555419922, + "learning_rate": 1.4961911426505837e-05, + "loss": 0.8235, + "step": 32100 + }, + { + "epoch": 0.40128503212580313, + "grad_norm": 2.8184850215911865, + "learning_rate": 1.4961153727522924e-05, + "loss": 1.0366, + "step": 32102 + }, + { + "epoch": 0.4013100327508188, + "grad_norm": 4.854284286499023, + "learning_rate": 1.4960395990756564e-05, + "loss": 1.2452, + "step": 32104 + }, + { + "epoch": 0.40133503337583437, + "grad_norm": 4.645298957824707, + "learning_rate": 1.495963821621253e-05, + "loss": 1.8538, + "step": 32106 + }, + { + "epoch": 0.40136003400085, + "grad_norm": 2.0095126628875732, + "learning_rate": 1.4958880403896592e-05, + "loss": 0.0941, + "step": 32108 + }, + { + "epoch": 0.40138503462586567, + "grad_norm": 3.0495662689208984, + "learning_rate": 1.495812255381452e-05, + "loss": 0.8781, + "step": 32110 + }, + { + "epoch": 0.40141003525088126, + "grad_norm": 3.2495126724243164, + "learning_rate": 1.4957364665972092e-05, + "loss": 0.7934, + "step": 32112 + }, + { + "epoch": 0.4014350358758969, + "grad_norm": 5.074234962463379, + "learning_rate": 1.4956606740375075e-05, + "loss": 0.3265, + "step": 32114 + }, + { + "epoch": 0.4014600365009125, + "grad_norm": 4.672545433044434, + "learning_rate": 1.495584877702924e-05, + "loss": 1.1998, + "step": 32116 + }, + { + "epoch": 0.40148503712592815, + "grad_norm": 2.83286190032959, + "learning_rate": 1.4955090775940362e-05, + "loss": 0.8002, + "step": 32118 + }, + { + "epoch": 0.4015100377509438, + "grad_norm": 0.0011659219162538648, + "learning_rate": 1.4954332737114215e-05, + "loss": 0.5981, + "step": 32120 + }, + { + "epoch": 0.4015350383759594, + "grad_norm": 4.449558258056641, + "learning_rate": 1.4953574660556566e-05, + "loss": 1.0933, + "step": 32122 + }, + { + "epoch": 0.40156003900097503, + "grad_norm": 5.312067985534668, + "learning_rate": 1.4952816546273199e-05, + "loss": 1.7935, + "step": 32124 + }, + { + "epoch": 0.4015850396259906, + "grad_norm": 2.5486128330230713, + "learning_rate": 1.4952058394269877e-05, + "loss": 0.7262, + "step": 32126 + }, + { + "epoch": 0.4016100402510063, + "grad_norm": 4.018766403198242, + "learning_rate": 1.4951300204552382e-05, + "loss": 0.9942, + "step": 32128 + }, + { + "epoch": 0.4016350408760219, + "grad_norm": 7.320562839508057, + "learning_rate": 1.4950541977126482e-05, + "loss": 1.0585, + "step": 32130 + }, + { + "epoch": 0.4016600415010375, + "grad_norm": 2.8889660835266113, + "learning_rate": 1.4949783711997956e-05, + "loss": 1.7206, + "step": 32132 + }, + { + "epoch": 0.40168504212605316, + "grad_norm": 3.718883991241455, + "learning_rate": 1.4949025409172578e-05, + "loss": 1.3025, + "step": 32134 + }, + { + "epoch": 0.40171004275106875, + "grad_norm": 2.975184679031372, + "learning_rate": 1.4948267068656119e-05, + "loss": 1.9608, + "step": 32136 + }, + { + "epoch": 0.4017350433760844, + "grad_norm": 0.3545181155204773, + "learning_rate": 1.4947508690454359e-05, + "loss": 0.5972, + "step": 32138 + }, + { + "epoch": 0.40176004400110005, + "grad_norm": 2.784442901611328, + "learning_rate": 1.4946750274573076e-05, + "loss": 0.6269, + "step": 32140 + }, + { + "epoch": 0.40178504462611564, + "grad_norm": 2.0627129077911377, + "learning_rate": 1.494599182101804e-05, + "loss": 0.3131, + "step": 32142 + }, + { + "epoch": 0.4018100452511313, + "grad_norm": 1.8231827020645142, + "learning_rate": 1.4945233329795028e-05, + "loss": 1.1517, + "step": 32144 + }, + { + "epoch": 0.4018350458761469, + "grad_norm": 2.1610970497131348, + "learning_rate": 1.4944474800909819e-05, + "loss": 0.5932, + "step": 32146 + }, + { + "epoch": 0.4018600465011625, + "grad_norm": 3.792781114578247, + "learning_rate": 1.494371623436819e-05, + "loss": 1.5408, + "step": 32148 + }, + { + "epoch": 0.4018850471261782, + "grad_norm": 3.2632851600646973, + "learning_rate": 1.4942957630175916e-05, + "loss": 1.0884, + "step": 32150 + }, + { + "epoch": 0.40191004775119377, + "grad_norm": 0.0005259005120024085, + "learning_rate": 1.4942198988338778e-05, + "loss": 0.0001, + "step": 32152 + }, + { + "epoch": 0.4019350483762094, + "grad_norm": 4.1790595054626465, + "learning_rate": 1.4941440308862551e-05, + "loss": 0.1385, + "step": 32154 + }, + { + "epoch": 0.401960049001225, + "grad_norm": 3.079169511795044, + "learning_rate": 1.4940681591753013e-05, + "loss": 1.7461, + "step": 32156 + }, + { + "epoch": 0.40198504962624065, + "grad_norm": 2.787273645401001, + "learning_rate": 1.4939922837015942e-05, + "loss": 1.1565, + "step": 32158 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 5.088714599609375, + "learning_rate": 1.4939164044657118e-05, + "loss": 1.4359, + "step": 32160 + }, + { + "epoch": 0.4020350508762719, + "grad_norm": 1.009458303451538, + "learning_rate": 1.4938405214682316e-05, + "loss": 0.0352, + "step": 32162 + }, + { + "epoch": 0.40206005150128754, + "grad_norm": 3.491616725921631, + "learning_rate": 1.493764634709732e-05, + "loss": 0.7553, + "step": 32164 + }, + { + "epoch": 0.40208505212630313, + "grad_norm": 0.000966491992585361, + "learning_rate": 1.493688744190791e-05, + "loss": 0.5341, + "step": 32166 + }, + { + "epoch": 0.4021100527513188, + "grad_norm": 3.791752815246582, + "learning_rate": 1.4936128499119862e-05, + "loss": 1.2011, + "step": 32168 + }, + { + "epoch": 0.40213505337633443, + "grad_norm": 5.488115310668945, + "learning_rate": 1.4935369518738958e-05, + "loss": 1.6503, + "step": 32170 + }, + { + "epoch": 0.40216005400135, + "grad_norm": 5.6840009689331055, + "learning_rate": 1.4934610500770974e-05, + "loss": 0.9884, + "step": 32172 + }, + { + "epoch": 0.40218505462636567, + "grad_norm": 2.125887632369995, + "learning_rate": 1.4933851445221698e-05, + "loss": 1.2369, + "step": 32174 + }, + { + "epoch": 0.40221005525138126, + "grad_norm": 4.286860942840576, + "learning_rate": 1.4933092352096908e-05, + "loss": 0.7562, + "step": 32176 + }, + { + "epoch": 0.4022350558763969, + "grad_norm": 6.094538688659668, + "learning_rate": 1.4932333221402382e-05, + "loss": 0.6242, + "step": 32178 + }, + { + "epoch": 0.40226005650141256, + "grad_norm": 3.3159754276275635, + "learning_rate": 1.4931574053143904e-05, + "loss": 1.5282, + "step": 32180 + }, + { + "epoch": 0.40228505712642815, + "grad_norm": 0.0006630112184211612, + "learning_rate": 1.4930814847327257e-05, + "loss": 0.7862, + "step": 32182 + }, + { + "epoch": 0.4023100577514438, + "grad_norm": 0.003717929357662797, + "learning_rate": 1.4930055603958215e-05, + "loss": 0.7691, + "step": 32184 + }, + { + "epoch": 0.4023350583764594, + "grad_norm": 5.851372241973877, + "learning_rate": 1.4929296323042574e-05, + "loss": 1.0483, + "step": 32186 + }, + { + "epoch": 0.40236005900147503, + "grad_norm": 0.4947908818721771, + "learning_rate": 1.4928537004586108e-05, + "loss": 1.0084, + "step": 32188 + }, + { + "epoch": 0.4023850596264907, + "grad_norm": 4.901225566864014, + "learning_rate": 1.4927777648594598e-05, + "loss": 1.2574, + "step": 32190 + }, + { + "epoch": 0.4024100602515063, + "grad_norm": 3.151202917098999, + "learning_rate": 1.4927018255073835e-05, + "loss": 0.9753, + "step": 32192 + }, + { + "epoch": 0.4024350608765219, + "grad_norm": 3.837709426879883, + "learning_rate": 1.4926258824029596e-05, + "loss": 0.5678, + "step": 32194 + }, + { + "epoch": 0.4024600615015375, + "grad_norm": 3.332108736038208, + "learning_rate": 1.4925499355467663e-05, + "loss": 1.6629, + "step": 32196 + }, + { + "epoch": 0.40248506212655316, + "grad_norm": 0.004297167528420687, + "learning_rate": 1.4924739849393826e-05, + "loss": 1.3205, + "step": 32198 + }, + { + "epoch": 0.4025100627515688, + "grad_norm": 5.573537826538086, + "learning_rate": 1.4923980305813868e-05, + "loss": 0.5835, + "step": 32200 + }, + { + "epoch": 0.4025350633765844, + "grad_norm": 6.289117813110352, + "learning_rate": 1.492322072473357e-05, + "loss": 1.7929, + "step": 32202 + }, + { + "epoch": 0.40256006400160005, + "grad_norm": 0.002389125060290098, + "learning_rate": 1.4922461106158719e-05, + "loss": 0.463, + "step": 32204 + }, + { + "epoch": 0.40258506462661564, + "grad_norm": 2.8424296379089355, + "learning_rate": 1.4921701450095105e-05, + "loss": 0.6347, + "step": 32206 + }, + { + "epoch": 0.4026100652516313, + "grad_norm": 0.003432800993323326, + "learning_rate": 1.4920941756548502e-05, + "loss": 0.4529, + "step": 32208 + }, + { + "epoch": 0.40263506587664694, + "grad_norm": 9.939257621765137, + "learning_rate": 1.4920182025524705e-05, + "loss": 0.8589, + "step": 32210 + }, + { + "epoch": 0.40266006650166253, + "grad_norm": 2.9186034202575684, + "learning_rate": 1.4919422257029502e-05, + "loss": 1.7879, + "step": 32212 + }, + { + "epoch": 0.4026850671266782, + "grad_norm": 0.739677906036377, + "learning_rate": 1.491866245106867e-05, + "loss": 0.123, + "step": 32214 + }, + { + "epoch": 0.40271006775169377, + "grad_norm": 3.7666916847229004, + "learning_rate": 1.4917902607648001e-05, + "loss": 1.5962, + "step": 32216 + }, + { + "epoch": 0.4027350683767094, + "grad_norm": 1.4158073663711548, + "learning_rate": 1.4917142726773281e-05, + "loss": 0.8818, + "step": 32218 + }, + { + "epoch": 0.40276006900172506, + "grad_norm": 5.406414031982422, + "learning_rate": 1.4916382808450297e-05, + "loss": 1.2454, + "step": 32220 + }, + { + "epoch": 0.40278506962674065, + "grad_norm": 5.57509183883667, + "learning_rate": 1.4915622852684837e-05, + "loss": 1.6873, + "step": 32222 + }, + { + "epoch": 0.4028100702517563, + "grad_norm": 3.6766741275787354, + "learning_rate": 1.4914862859482688e-05, + "loss": 1.2125, + "step": 32224 + }, + { + "epoch": 0.4028350708767719, + "grad_norm": 1.5903658866882324, + "learning_rate": 1.491410282884964e-05, + "loss": 0.7918, + "step": 32226 + }, + { + "epoch": 0.40286007150178754, + "grad_norm": 2.092519998550415, + "learning_rate": 1.4913342760791481e-05, + "loss": 0.8717, + "step": 32228 + }, + { + "epoch": 0.4028850721268032, + "grad_norm": 3.7629101276397705, + "learning_rate": 1.4912582655313995e-05, + "loss": 0.6538, + "step": 32230 + }, + { + "epoch": 0.4029100727518188, + "grad_norm": 2.2377634048461914, + "learning_rate": 1.4911822512422978e-05, + "loss": 0.9365, + "step": 32232 + }, + { + "epoch": 0.40293507337683443, + "grad_norm": 1.7446733713150024, + "learning_rate": 1.491106233212421e-05, + "loss": 0.7815, + "step": 32234 + }, + { + "epoch": 0.40296007400185, + "grad_norm": 4.794820308685303, + "learning_rate": 1.491030211442349e-05, + "loss": 1.2064, + "step": 32236 + }, + { + "epoch": 0.40298507462686567, + "grad_norm": 3.151773452758789, + "learning_rate": 1.4909541859326605e-05, + "loss": 0.7117, + "step": 32238 + }, + { + "epoch": 0.4030100752518813, + "grad_norm": 2.6451218128204346, + "learning_rate": 1.4908781566839339e-05, + "loss": 0.6815, + "step": 32240 + }, + { + "epoch": 0.4030350758768969, + "grad_norm": 4.588404178619385, + "learning_rate": 1.4908021236967488e-05, + "loss": 0.6043, + "step": 32242 + }, + { + "epoch": 0.40306007650191256, + "grad_norm": 2.477278470993042, + "learning_rate": 1.4907260869716841e-05, + "loss": 1.436, + "step": 32244 + }, + { + "epoch": 0.40308507712692815, + "grad_norm": 1.2535878419876099, + "learning_rate": 1.490650046509319e-05, + "loss": 0.0527, + "step": 32246 + }, + { + "epoch": 0.4031100777519438, + "grad_norm": 11.039352416992188, + "learning_rate": 1.4905740023102327e-05, + "loss": 0.1976, + "step": 32248 + }, + { + "epoch": 0.40313507837695944, + "grad_norm": 1.639387845993042, + "learning_rate": 1.4904979543750037e-05, + "loss": 0.494, + "step": 32250 + }, + { + "epoch": 0.40316007900197504, + "grad_norm": 7.2927656173706055, + "learning_rate": 1.4904219027042124e-05, + "loss": 1.5409, + "step": 32252 + }, + { + "epoch": 0.4031850796269907, + "grad_norm": 5.03896951675415, + "learning_rate": 1.4903458472984367e-05, + "loss": 0.7184, + "step": 32254 + }, + { + "epoch": 0.4032100802520063, + "grad_norm": 2.505218982696533, + "learning_rate": 1.4902697881582565e-05, + "loss": 0.6374, + "step": 32256 + }, + { + "epoch": 0.4032350808770219, + "grad_norm": 0.001204945147037506, + "learning_rate": 1.4901937252842506e-05, + "loss": 0.5936, + "step": 32258 + }, + { + "epoch": 0.40326008150203757, + "grad_norm": 3.6500720977783203, + "learning_rate": 1.490117658676999e-05, + "loss": 1.6242, + "step": 32260 + }, + { + "epoch": 0.40328508212705316, + "grad_norm": 4.838953971862793, + "learning_rate": 1.4900415883370805e-05, + "loss": 1.0689, + "step": 32262 + }, + { + "epoch": 0.4033100827520688, + "grad_norm": 0.0009993467247113585, + "learning_rate": 1.4899655142650746e-05, + "loss": 0.1345, + "step": 32264 + }, + { + "epoch": 0.4033350833770844, + "grad_norm": 2.6061668395996094, + "learning_rate": 1.4898894364615606e-05, + "loss": 0.5966, + "step": 32266 + }, + { + "epoch": 0.40336008400210005, + "grad_norm": 3.8947222232818604, + "learning_rate": 1.4898133549271179e-05, + "loss": 0.8495, + "step": 32268 + }, + { + "epoch": 0.4033850846271157, + "grad_norm": 2.431267499923706, + "learning_rate": 1.489737269662326e-05, + "loss": 0.5516, + "step": 32270 + }, + { + "epoch": 0.4034100852521313, + "grad_norm": 1.208366870880127, + "learning_rate": 1.4896611806677644e-05, + "loss": 0.3502, + "step": 32272 + }, + { + "epoch": 0.40343508587714694, + "grad_norm": 6.772473335266113, + "learning_rate": 1.4895850879440122e-05, + "loss": 1.6373, + "step": 32274 + }, + { + "epoch": 0.40346008650216253, + "grad_norm": 3.08450984954834, + "learning_rate": 1.4895089914916494e-05, + "loss": 1.0562, + "step": 32276 + }, + { + "epoch": 0.4034850871271782, + "grad_norm": 3.2427189350128174, + "learning_rate": 1.4894328913112555e-05, + "loss": 1.1622, + "step": 32278 + }, + { + "epoch": 0.4035100877521938, + "grad_norm": 5.165832996368408, + "learning_rate": 1.4893567874034097e-05, + "loss": 1.8865, + "step": 32280 + }, + { + "epoch": 0.4035350883772094, + "grad_norm": 0.9878228306770325, + "learning_rate": 1.4892806797686919e-05, + "loss": 0.8762, + "step": 32282 + }, + { + "epoch": 0.40356008900222506, + "grad_norm": 2.7683897018432617, + "learning_rate": 1.4892045684076814e-05, + "loss": 0.786, + "step": 32284 + }, + { + "epoch": 0.40358508962724066, + "grad_norm": 3.514500856399536, + "learning_rate": 1.4891284533209585e-05, + "loss": 1.5144, + "step": 32286 + }, + { + "epoch": 0.4036100902522563, + "grad_norm": 4.096690654754639, + "learning_rate": 1.4890523345091022e-05, + "loss": 1.2122, + "step": 32288 + }, + { + "epoch": 0.40363509087727195, + "grad_norm": 2.5027287006378174, + "learning_rate": 1.4889762119726925e-05, + "loss": 1.6808, + "step": 32290 + }, + { + "epoch": 0.40366009150228754, + "grad_norm": 3.221916913986206, + "learning_rate": 1.4889000857123093e-05, + "loss": 0.8398, + "step": 32292 + }, + { + "epoch": 0.4036850921273032, + "grad_norm": 0.30519136786460876, + "learning_rate": 1.488823955728532e-05, + "loss": 0.7094, + "step": 32294 + }, + { + "epoch": 0.4037100927523188, + "grad_norm": 4.473215579986572, + "learning_rate": 1.4887478220219404e-05, + "loss": 1.3473, + "step": 32296 + }, + { + "epoch": 0.40373509337733443, + "grad_norm": 0.36437636613845825, + "learning_rate": 1.4886716845931149e-05, + "loss": 0.4485, + "step": 32298 + }, + { + "epoch": 0.4037600940023501, + "grad_norm": 2.173839807510376, + "learning_rate": 1.4885955434426347e-05, + "loss": 0.9206, + "step": 32300 + }, + { + "epoch": 0.40378509462736567, + "grad_norm": 6.130508899688721, + "learning_rate": 1.48851939857108e-05, + "loss": 2.1305, + "step": 32302 + }, + { + "epoch": 0.4038100952523813, + "grad_norm": 10.556923866271973, + "learning_rate": 1.4884432499790307e-05, + "loss": 0.6438, + "step": 32304 + }, + { + "epoch": 0.4038350958773969, + "grad_norm": 2.7584571838378906, + "learning_rate": 1.4883670976670666e-05, + "loss": 1.3168, + "step": 32306 + }, + { + "epoch": 0.40386009650241256, + "grad_norm": 3.715752363204956, + "learning_rate": 1.4882909416357679e-05, + "loss": 0.9115, + "step": 32308 + }, + { + "epoch": 0.4038850971274282, + "grad_norm": 9.557931900024414, + "learning_rate": 1.488214781885714e-05, + "loss": 0.7792, + "step": 32310 + }, + { + "epoch": 0.4039100977524438, + "grad_norm": 2.964855432510376, + "learning_rate": 1.4881386184174857e-05, + "loss": 1.6458, + "step": 32312 + }, + { + "epoch": 0.40393509837745945, + "grad_norm": 3.642131805419922, + "learning_rate": 1.4880624512316626e-05, + "loss": 0.9149, + "step": 32314 + }, + { + "epoch": 0.40396009900247504, + "grad_norm": 2.7868549823760986, + "learning_rate": 1.4879862803288252e-05, + "loss": 0.9052, + "step": 32316 + }, + { + "epoch": 0.4039850996274907, + "grad_norm": 1.21700918674469, + "learning_rate": 1.487910105709553e-05, + "loss": 0.3799, + "step": 32318 + }, + { + "epoch": 0.40401010025250633, + "grad_norm": 3.4696767330169678, + "learning_rate": 1.4878339273744265e-05, + "loss": 1.2777, + "step": 32320 + }, + { + "epoch": 0.4040351008775219, + "grad_norm": 1.014249324798584, + "learning_rate": 1.4877577453240257e-05, + "loss": 0.5241, + "step": 32322 + }, + { + "epoch": 0.40406010150253757, + "grad_norm": 8.688404083251953, + "learning_rate": 1.4876815595589312e-05, + "loss": 2.1246, + "step": 32324 + }, + { + "epoch": 0.40408510212755316, + "grad_norm": 4.350696563720703, + "learning_rate": 1.4876053700797223e-05, + "loss": 0.8285, + "step": 32326 + }, + { + "epoch": 0.4041101027525688, + "grad_norm": 4.4740729331970215, + "learning_rate": 1.4875291768869804e-05, + "loss": 1.6846, + "step": 32328 + }, + { + "epoch": 0.40413510337758446, + "grad_norm": 9.700328826904297, + "learning_rate": 1.4874529799812852e-05, + "loss": 0.2797, + "step": 32330 + }, + { + "epoch": 0.40416010400260005, + "grad_norm": 4.439016819000244, + "learning_rate": 1.487376779363217e-05, + "loss": 1.3895, + "step": 32332 + }, + { + "epoch": 0.4041851046276157, + "grad_norm": 3.4696192741394043, + "learning_rate": 1.487300575033356e-05, + "loss": 0.7054, + "step": 32334 + }, + { + "epoch": 0.4042101052526313, + "grad_norm": 6.534938335418701, + "learning_rate": 1.4872243669922828e-05, + "loss": 2.7527, + "step": 32336 + }, + { + "epoch": 0.40423510587764694, + "grad_norm": 6.043363094329834, + "learning_rate": 1.4871481552405775e-05, + "loss": 1.3444, + "step": 32338 + }, + { + "epoch": 0.4042601065026626, + "grad_norm": 3.1404929161071777, + "learning_rate": 1.4870719397788209e-05, + "loss": 0.6239, + "step": 32340 + }, + { + "epoch": 0.4042851071276782, + "grad_norm": 9.030830383300781, + "learning_rate": 1.4869957206075936e-05, + "loss": 0.4036, + "step": 32342 + }, + { + "epoch": 0.4043101077526938, + "grad_norm": 1.2894943952560425, + "learning_rate": 1.4869194977274756e-05, + "loss": 0.037, + "step": 32344 + }, + { + "epoch": 0.4043351083777094, + "grad_norm": 0.6466899514198303, + "learning_rate": 1.4868432711390474e-05, + "loss": 0.4164, + "step": 32346 + }, + { + "epoch": 0.40436010900272507, + "grad_norm": 3.6199638843536377, + "learning_rate": 1.4867670408428899e-05, + "loss": 0.7507, + "step": 32348 + }, + { + "epoch": 0.4043851096277407, + "grad_norm": 0.0030797626823186874, + "learning_rate": 1.486690806839583e-05, + "loss": 0.0296, + "step": 32350 + }, + { + "epoch": 0.4044101102527563, + "grad_norm": 8.656644821166992, + "learning_rate": 1.4866145691297083e-05, + "loss": 0.6764, + "step": 32352 + }, + { + "epoch": 0.40443511087777195, + "grad_norm": 5.263188362121582, + "learning_rate": 1.4865383277138454e-05, + "loss": 0.4123, + "step": 32354 + }, + { + "epoch": 0.40446011150278754, + "grad_norm": 0.0042254128493368626, + "learning_rate": 1.4864620825925757e-05, + "loss": 0.283, + "step": 32356 + }, + { + "epoch": 0.4044851121278032, + "grad_norm": 4.248513221740723, + "learning_rate": 1.4863858337664796e-05, + "loss": 0.9295, + "step": 32358 + }, + { + "epoch": 0.40451011275281884, + "grad_norm": 0.05765335634350777, + "learning_rate": 1.4863095812361376e-05, + "loss": 0.0033, + "step": 32360 + }, + { + "epoch": 0.40453511337783443, + "grad_norm": 3.7563374042510986, + "learning_rate": 1.4862333250021306e-05, + "loss": 1.9808, + "step": 32362 + }, + { + "epoch": 0.4045601140028501, + "grad_norm": 1.6627813577651978, + "learning_rate": 1.4861570650650394e-05, + "loss": 1.3614, + "step": 32364 + }, + { + "epoch": 0.40458511462786567, + "grad_norm": 2.2274441719055176, + "learning_rate": 1.4860808014254447e-05, + "loss": 0.2168, + "step": 32366 + }, + { + "epoch": 0.4046101152528813, + "grad_norm": 4.477396488189697, + "learning_rate": 1.4860045340839273e-05, + "loss": 0.9942, + "step": 32368 + }, + { + "epoch": 0.40463511587789697, + "grad_norm": 0.001796914846636355, + "learning_rate": 1.4859282630410686e-05, + "loss": 0.3648, + "step": 32370 + }, + { + "epoch": 0.40466011650291256, + "grad_norm": 0.157936230301857, + "learning_rate": 1.4858519882974485e-05, + "loss": 0.2404, + "step": 32372 + }, + { + "epoch": 0.4046851171279282, + "grad_norm": 2.2846434116363525, + "learning_rate": 1.485775709853648e-05, + "loss": 1.1388, + "step": 32374 + }, + { + "epoch": 0.4047101177529438, + "grad_norm": 0.06433695554733276, + "learning_rate": 1.485699427710249e-05, + "loss": 0.592, + "step": 32376 + }, + { + "epoch": 0.40473511837795945, + "grad_norm": 3.5054073333740234, + "learning_rate": 1.4856231418678318e-05, + "loss": 0.9734, + "step": 32378 + }, + { + "epoch": 0.4047601190029751, + "grad_norm": 0.01722409762442112, + "learning_rate": 1.4855468523269771e-05, + "loss": 0.0599, + "step": 32380 + }, + { + "epoch": 0.4047851196279907, + "grad_norm": 2.142371654510498, + "learning_rate": 1.4854705590882668e-05, + "loss": 0.3642, + "step": 32382 + }, + { + "epoch": 0.40481012025300633, + "grad_norm": 0.007045557256788015, + "learning_rate": 1.4853942621522807e-05, + "loss": 0.8594, + "step": 32384 + }, + { + "epoch": 0.4048351208780219, + "grad_norm": 4.905920505523682, + "learning_rate": 1.4853179615196007e-05, + "loss": 0.9456, + "step": 32386 + }, + { + "epoch": 0.4048601215030376, + "grad_norm": 2.3695316314697266, + "learning_rate": 1.4852416571908082e-05, + "loss": 0.74, + "step": 32388 + }, + { + "epoch": 0.4048851221280532, + "grad_norm": 0.0015704669058322906, + "learning_rate": 1.4851653491664833e-05, + "loss": 0.9138, + "step": 32390 + }, + { + "epoch": 0.4049101227530688, + "grad_norm": 6.363004684448242, + "learning_rate": 1.485089037447208e-05, + "loss": 1.9042, + "step": 32392 + }, + { + "epoch": 0.40493512337808446, + "grad_norm": 8.15991497039795, + "learning_rate": 1.485012722033563e-05, + "loss": 0.4597, + "step": 32394 + }, + { + "epoch": 0.40496012400310005, + "grad_norm": 4.602705955505371, + "learning_rate": 1.4849364029261302e-05, + "loss": 1.0991, + "step": 32396 + }, + { + "epoch": 0.4049851246281157, + "grad_norm": 3.509298086166382, + "learning_rate": 1.4848600801254896e-05, + "loss": 0.6516, + "step": 32398 + }, + { + "epoch": 0.40501012525313135, + "grad_norm": 4.687982559204102, + "learning_rate": 1.4847837536322236e-05, + "loss": 1.9547, + "step": 32400 + }, + { + "epoch": 0.40503512587814694, + "grad_norm": 3.1648738384246826, + "learning_rate": 1.484707423446913e-05, + "loss": 0.9413, + "step": 32402 + }, + { + "epoch": 0.4050601265031626, + "grad_norm": 4.9319844245910645, + "learning_rate": 1.4846310895701393e-05, + "loss": 0.6812, + "step": 32404 + }, + { + "epoch": 0.4050851271281782, + "grad_norm": 3.6072988510131836, + "learning_rate": 1.4845547520024837e-05, + "loss": 0.733, + "step": 32406 + }, + { + "epoch": 0.4051101277531938, + "grad_norm": 3.11934232711792, + "learning_rate": 1.4844784107445277e-05, + "loss": 1.8436, + "step": 32408 + }, + { + "epoch": 0.4051351283782095, + "grad_norm": 1.0944849252700806, + "learning_rate": 1.4844020657968522e-05, + "loss": 0.7939, + "step": 32410 + }, + { + "epoch": 0.40516012900322507, + "grad_norm": 0.003950705751776695, + "learning_rate": 1.4843257171600396e-05, + "loss": 1.179, + "step": 32412 + }, + { + "epoch": 0.4051851296282407, + "grad_norm": 3.5877819061279297, + "learning_rate": 1.4842493648346707e-05, + "loss": 1.2524, + "step": 32414 + }, + { + "epoch": 0.4052101302532563, + "grad_norm": 2.8514440059661865, + "learning_rate": 1.4841730088213271e-05, + "loss": 0.5979, + "step": 32416 + }, + { + "epoch": 0.40523513087827195, + "grad_norm": 3.8291873931884766, + "learning_rate": 1.4840966491205903e-05, + "loss": 1.3822, + "step": 32418 + }, + { + "epoch": 0.4052601315032876, + "grad_norm": 4.057247161865234, + "learning_rate": 1.484020285733042e-05, + "loss": 1.9165, + "step": 32420 + }, + { + "epoch": 0.4052851321283032, + "grad_norm": 0.22568656504154205, + "learning_rate": 1.4839439186592636e-05, + "loss": 0.2934, + "step": 32422 + }, + { + "epoch": 0.40531013275331884, + "grad_norm": 3.2528984546661377, + "learning_rate": 1.4838675478998366e-05, + "loss": 1.9204, + "step": 32424 + }, + { + "epoch": 0.40533513337833443, + "grad_norm": 3.4162330627441406, + "learning_rate": 1.4837911734553427e-05, + "loss": 1.5762, + "step": 32426 + }, + { + "epoch": 0.4053601340033501, + "grad_norm": 2.4203996658325195, + "learning_rate": 1.483714795326364e-05, + "loss": 0.8682, + "step": 32428 + }, + { + "epoch": 0.40538513462836573, + "grad_norm": 4.206857681274414, + "learning_rate": 1.4836384135134815e-05, + "loss": 0.3742, + "step": 32430 + }, + { + "epoch": 0.4054101352533813, + "grad_norm": 5.205116271972656, + "learning_rate": 1.4835620280172775e-05, + "loss": 1.3104, + "step": 32432 + }, + { + "epoch": 0.40543513587839697, + "grad_norm": 2.9849812984466553, + "learning_rate": 1.4834856388383335e-05, + "loss": 0.6026, + "step": 32434 + }, + { + "epoch": 0.40546013650341256, + "grad_norm": 3.0495173931121826, + "learning_rate": 1.4834092459772308e-05, + "loss": 1.1246, + "step": 32436 + }, + { + "epoch": 0.4054851371284282, + "grad_norm": 6.0381059646606445, + "learning_rate": 1.483332849434552e-05, + "loss": 1.2526, + "step": 32438 + }, + { + "epoch": 0.40551013775344386, + "grad_norm": 1.9385498762130737, + "learning_rate": 1.4832564492108786e-05, + "loss": 1.907, + "step": 32440 + }, + { + "epoch": 0.40553513837845945, + "grad_norm": 0.002100842073559761, + "learning_rate": 1.4831800453067921e-05, + "loss": 0.8662, + "step": 32442 + }, + { + "epoch": 0.4055601390034751, + "grad_norm": 2.956144094467163, + "learning_rate": 1.4831036377228752e-05, + "loss": 0.4422, + "step": 32444 + }, + { + "epoch": 0.4055851396284907, + "grad_norm": 2.568542003631592, + "learning_rate": 1.4830272264597088e-05, + "loss": 1.5825, + "step": 32446 + }, + { + "epoch": 0.40561014025350633, + "grad_norm": 0.3146844208240509, + "learning_rate": 1.4829508115178757e-05, + "loss": 0.6657, + "step": 32448 + }, + { + "epoch": 0.405635140878522, + "grad_norm": 3.6053481101989746, + "learning_rate": 1.4828743928979572e-05, + "loss": 0.4278, + "step": 32450 + }, + { + "epoch": 0.4056601415035376, + "grad_norm": 3.365201234817505, + "learning_rate": 1.4827979706005358e-05, + "loss": 1.0175, + "step": 32452 + }, + { + "epoch": 0.4056851421285532, + "grad_norm": 4.495724678039551, + "learning_rate": 1.4827215446261936e-05, + "loss": 1.1391, + "step": 32454 + }, + { + "epoch": 0.4057101427535688, + "grad_norm": 0.001806559506803751, + "learning_rate": 1.482645114975512e-05, + "loss": 1.3396, + "step": 32456 + }, + { + "epoch": 0.40573514337858446, + "grad_norm": 3.0836410522460938, + "learning_rate": 1.4825686816490736e-05, + "loss": 0.758, + "step": 32458 + }, + { + "epoch": 0.4057601440036001, + "grad_norm": 3.2943549156188965, + "learning_rate": 1.4824922446474603e-05, + "loss": 0.1382, + "step": 32460 + }, + { + "epoch": 0.4057851446286157, + "grad_norm": 3.3085618019104004, + "learning_rate": 1.4824158039712541e-05, + "loss": 0.6242, + "step": 32462 + }, + { + "epoch": 0.40581014525363135, + "grad_norm": 3.2308907508850098, + "learning_rate": 1.4823393596210378e-05, + "loss": 0.7893, + "step": 32464 + }, + { + "epoch": 0.40583514587864694, + "grad_norm": 5.59928560256958, + "learning_rate": 1.482262911597393e-05, + "loss": 0.574, + "step": 32466 + }, + { + "epoch": 0.4058601465036626, + "grad_norm": 4.254708290100098, + "learning_rate": 1.482186459900902e-05, + "loss": 1.6752, + "step": 32468 + }, + { + "epoch": 0.40588514712867824, + "grad_norm": 4.375111103057861, + "learning_rate": 1.482110004532147e-05, + "loss": 1.4949, + "step": 32470 + }, + { + "epoch": 0.40591014775369383, + "grad_norm": 4.583226203918457, + "learning_rate": 1.4820335454917105e-05, + "loss": 1.9295, + "step": 32472 + }, + { + "epoch": 0.4059351483787095, + "grad_norm": 3.135586738586426, + "learning_rate": 1.4819570827801748e-05, + "loss": 1.305, + "step": 32474 + }, + { + "epoch": 0.40596014900372507, + "grad_norm": 1.5314714908599854, + "learning_rate": 1.4818806163981219e-05, + "loss": 0.4301, + "step": 32476 + }, + { + "epoch": 0.4059851496287407, + "grad_norm": 3.035428285598755, + "learning_rate": 1.4818041463461343e-05, + "loss": 0.5987, + "step": 32478 + }, + { + "epoch": 0.40601015025375636, + "grad_norm": 0.0034717970993369818, + "learning_rate": 1.4817276726247949e-05, + "loss": 0.241, + "step": 32480 + }, + { + "epoch": 0.40603515087877196, + "grad_norm": 3.814521551132202, + "learning_rate": 1.4816511952346852e-05, + "loss": 1.4104, + "step": 32482 + }, + { + "epoch": 0.4060601515037876, + "grad_norm": 2.135897636413574, + "learning_rate": 1.4815747141763882e-05, + "loss": 0.7399, + "step": 32484 + }, + { + "epoch": 0.4060851521288032, + "grad_norm": 6.937652587890625, + "learning_rate": 1.4814982294504865e-05, + "loss": 0.8521, + "step": 32486 + }, + { + "epoch": 0.40611015275381884, + "grad_norm": 2.2204082012176514, + "learning_rate": 1.4814217410575622e-05, + "loss": 1.3229, + "step": 32488 + }, + { + "epoch": 0.4061351533788345, + "grad_norm": 3.9981977939605713, + "learning_rate": 1.4813452489981978e-05, + "loss": 0.2755, + "step": 32490 + }, + { + "epoch": 0.4061601540038501, + "grad_norm": 0.7554070949554443, + "learning_rate": 1.4812687532729765e-05, + "loss": 0.5805, + "step": 32492 + }, + { + "epoch": 0.40618515462886573, + "grad_norm": 0.9795701503753662, + "learning_rate": 1.4811922538824803e-05, + "loss": 0.1261, + "step": 32494 + }, + { + "epoch": 0.4062101552538813, + "grad_norm": 4.20576810836792, + "learning_rate": 1.481115750827292e-05, + "loss": 0.6631, + "step": 32496 + }, + { + "epoch": 0.40623515587889697, + "grad_norm": 0.4269576072692871, + "learning_rate": 1.481039244107994e-05, + "loss": 0.5569, + "step": 32498 + }, + { + "epoch": 0.4062601565039126, + "grad_norm": 0.42986056208610535, + "learning_rate": 1.480962733725169e-05, + "loss": 0.5043, + "step": 32500 + }, + { + "epoch": 0.4062851571289282, + "grad_norm": 1.6574501991271973, + "learning_rate": 1.4808862196794e-05, + "loss": 1.6659, + "step": 32502 + }, + { + "epoch": 0.40631015775394386, + "grad_norm": 3.5202267169952393, + "learning_rate": 1.4808097019712694e-05, + "loss": 1.8907, + "step": 32504 + }, + { + "epoch": 0.40633515837895945, + "grad_norm": 1.2312520742416382, + "learning_rate": 1.4807331806013604e-05, + "loss": 0.7503, + "step": 32506 + }, + { + "epoch": 0.4063601590039751, + "grad_norm": 2.2430756092071533, + "learning_rate": 1.4806566555702555e-05, + "loss": 1.1248, + "step": 32508 + }, + { + "epoch": 0.40638515962899074, + "grad_norm": 4.214864730834961, + "learning_rate": 1.4805801268785373e-05, + "loss": 1.5058, + "step": 32510 + }, + { + "epoch": 0.40641016025400634, + "grad_norm": 2.7029528617858887, + "learning_rate": 1.4805035945267889e-05, + "loss": 0.6641, + "step": 32512 + }, + { + "epoch": 0.406435160879022, + "grad_norm": 3.1814849376678467, + "learning_rate": 1.4804270585155928e-05, + "loss": 1.0385, + "step": 32514 + }, + { + "epoch": 0.4064601615040376, + "grad_norm": 0.5516479015350342, + "learning_rate": 1.4803505188455326e-05, + "loss": 0.6023, + "step": 32516 + }, + { + "epoch": 0.4064851621290532, + "grad_norm": 3.2311434745788574, + "learning_rate": 1.4802739755171904e-05, + "loss": 1.7116, + "step": 32518 + }, + { + "epoch": 0.40651016275406887, + "grad_norm": 4.527254581451416, + "learning_rate": 1.4801974285311499e-05, + "loss": 1.5399, + "step": 32520 + }, + { + "epoch": 0.40653516337908446, + "grad_norm": 2.716062307357788, + "learning_rate": 1.4801208778879934e-05, + "loss": 1.2567, + "step": 32522 + }, + { + "epoch": 0.4065601640041001, + "grad_norm": 1.3753786087036133, + "learning_rate": 1.4800443235883042e-05, + "loss": 0.3447, + "step": 32524 + }, + { + "epoch": 0.4065851646291157, + "grad_norm": 1.7089223861694336, + "learning_rate": 1.4799677656326653e-05, + "loss": 0.5671, + "step": 32526 + }, + { + "epoch": 0.40661016525413135, + "grad_norm": 0.666650652885437, + "learning_rate": 1.4798912040216597e-05, + "loss": 0.3284, + "step": 32528 + }, + { + "epoch": 0.406635165879147, + "grad_norm": 3.1614885330200195, + "learning_rate": 1.4798146387558708e-05, + "loss": 0.9457, + "step": 32530 + }, + { + "epoch": 0.4066601665041626, + "grad_norm": 5.529342174530029, + "learning_rate": 1.4797380698358814e-05, + "loss": 0.2745, + "step": 32532 + }, + { + "epoch": 0.40668516712917824, + "grad_norm": 3.1234424114227295, + "learning_rate": 1.4796614972622748e-05, + "loss": 1.0599, + "step": 32534 + }, + { + "epoch": 0.40671016775419383, + "grad_norm": 5.5551347732543945, + "learning_rate": 1.4795849210356339e-05, + "loss": 1.9517, + "step": 32536 + }, + { + "epoch": 0.4067351683792095, + "grad_norm": 5.274242877960205, + "learning_rate": 1.4795083411565418e-05, + "loss": 1.1256, + "step": 32538 + }, + { + "epoch": 0.4067601690042251, + "grad_norm": 2.800200939178467, + "learning_rate": 1.4794317576255823e-05, + "loss": 1.3163, + "step": 32540 + }, + { + "epoch": 0.4067851696292407, + "grad_norm": 1.4435261487960815, + "learning_rate": 1.4793551704433385e-05, + "loss": 0.0535, + "step": 32542 + }, + { + "epoch": 0.40681017025425636, + "grad_norm": 2.6022326946258545, + "learning_rate": 1.4792785796103932e-05, + "loss": 0.5303, + "step": 32544 + }, + { + "epoch": 0.40683517087927196, + "grad_norm": 1.9725756645202637, + "learning_rate": 1.4792019851273303e-05, + "loss": 1.3867, + "step": 32546 + }, + { + "epoch": 0.4068601715042876, + "grad_norm": 0.7977269887924194, + "learning_rate": 1.4791253869947326e-05, + "loss": 0.6349, + "step": 32548 + }, + { + "epoch": 0.40688517212930325, + "grad_norm": 2.6460611820220947, + "learning_rate": 1.4790487852131836e-05, + "loss": 0.6332, + "step": 32550 + }, + { + "epoch": 0.40691017275431884, + "grad_norm": 1.2671239376068115, + "learning_rate": 1.478972179783267e-05, + "loss": 1.4827, + "step": 32552 + }, + { + "epoch": 0.4069351733793345, + "grad_norm": 3.4099762439727783, + "learning_rate": 1.478895570705566e-05, + "loss": 1.0524, + "step": 32554 + }, + { + "epoch": 0.4069601740043501, + "grad_norm": 2.842242956161499, + "learning_rate": 1.478818957980664e-05, + "loss": 1.4741, + "step": 32556 + }, + { + "epoch": 0.40698517462936573, + "grad_norm": 3.827279806137085, + "learning_rate": 1.4787423416091447e-05, + "loss": 0.8671, + "step": 32558 + }, + { + "epoch": 0.4070101752543814, + "grad_norm": 3.0917158126831055, + "learning_rate": 1.4786657215915914e-05, + "loss": 0.4018, + "step": 32560 + }, + { + "epoch": 0.40703517587939697, + "grad_norm": 0.7097262144088745, + "learning_rate": 1.4785890979285876e-05, + "loss": 1.322, + "step": 32562 + }, + { + "epoch": 0.4070601765044126, + "grad_norm": 3.1616334915161133, + "learning_rate": 1.4785124706207166e-05, + "loss": 0.7827, + "step": 32564 + }, + { + "epoch": 0.4070851771294282, + "grad_norm": 0.001064899261109531, + "learning_rate": 1.4784358396685625e-05, + "loss": 0.4216, + "step": 32566 + }, + { + "epoch": 0.40711017775444386, + "grad_norm": 5.20591926574707, + "learning_rate": 1.4783592050727089e-05, + "loss": 0.6836, + "step": 32568 + }, + { + "epoch": 0.4071351783794595, + "grad_norm": 2.518347978591919, + "learning_rate": 1.478282566833739e-05, + "loss": 0.939, + "step": 32570 + }, + { + "epoch": 0.4071601790044751, + "grad_norm": 0.005232298281043768, + "learning_rate": 1.4782059249522369e-05, + "loss": 0.0012, + "step": 32572 + }, + { + "epoch": 0.40718517962949075, + "grad_norm": 1.4747647047042847, + "learning_rate": 1.478129279428786e-05, + "loss": 0.2097, + "step": 32574 + }, + { + "epoch": 0.40721018025450634, + "grad_norm": 2.8589484691619873, + "learning_rate": 1.47805263026397e-05, + "loss": 0.1821, + "step": 32576 + }, + { + "epoch": 0.407235180879522, + "grad_norm": 4.625043869018555, + "learning_rate": 1.4779759774583729e-05, + "loss": 0.9047, + "step": 32578 + }, + { + "epoch": 0.40726018150453763, + "grad_norm": 3.924847364425659, + "learning_rate": 1.4778993210125782e-05, + "loss": 2.2294, + "step": 32580 + }, + { + "epoch": 0.4072851821295532, + "grad_norm": 4.457533836364746, + "learning_rate": 1.4778226609271702e-05, + "loss": 1.3288, + "step": 32582 + }, + { + "epoch": 0.4073101827545689, + "grad_norm": 3.5286388397216797, + "learning_rate": 1.4777459972027322e-05, + "loss": 0.5036, + "step": 32584 + }, + { + "epoch": 0.40733518337958446, + "grad_norm": 3.5388989448547363, + "learning_rate": 1.4776693298398482e-05, + "loss": 1.7409, + "step": 32586 + }, + { + "epoch": 0.4073601840046001, + "grad_norm": 5.293076515197754, + "learning_rate": 1.4775926588391021e-05, + "loss": 1.7904, + "step": 32588 + }, + { + "epoch": 0.40738518462961576, + "grad_norm": 3.2110188007354736, + "learning_rate": 1.477515984201078e-05, + "loss": 0.6729, + "step": 32590 + }, + { + "epoch": 0.40741018525463135, + "grad_norm": 5.562241554260254, + "learning_rate": 1.4774393059263595e-05, + "loss": 1.5402, + "step": 32592 + }, + { + "epoch": 0.407435185879647, + "grad_norm": 3.2210493087768555, + "learning_rate": 1.4773626240155308e-05, + "loss": 0.3091, + "step": 32594 + }, + { + "epoch": 0.4074601865046626, + "grad_norm": 0.002059526275843382, + "learning_rate": 1.477285938469176e-05, + "loss": 0.8407, + "step": 32596 + }, + { + "epoch": 0.40748518712967824, + "grad_norm": 0.08298271149396896, + "learning_rate": 1.4772092492878789e-05, + "loss": 0.5802, + "step": 32598 + }, + { + "epoch": 0.4075101877546939, + "grad_norm": 4.461044788360596, + "learning_rate": 1.4771325564722235e-05, + "loss": 0.5495, + "step": 32600 + }, + { + "epoch": 0.4075351883797095, + "grad_norm": 4.309205055236816, + "learning_rate": 1.477055860022794e-05, + "loss": 1.0718, + "step": 32602 + }, + { + "epoch": 0.4075601890047251, + "grad_norm": 0.018262263387441635, + "learning_rate": 1.4769791599401751e-05, + "loss": 0.5571, + "step": 32604 + }, + { + "epoch": 0.4075851896297407, + "grad_norm": 3.898533344268799, + "learning_rate": 1.4769024562249499e-05, + "loss": 1.4218, + "step": 32606 + }, + { + "epoch": 0.40761019025475637, + "grad_norm": 0.8715682625770569, + "learning_rate": 1.476825748877703e-05, + "loss": 0.4912, + "step": 32608 + }, + { + "epoch": 0.407635190879772, + "grad_norm": 5.137513637542725, + "learning_rate": 1.4767490378990188e-05, + "loss": 2.0025, + "step": 32610 + }, + { + "epoch": 0.4076601915047876, + "grad_norm": 4.0537848472595215, + "learning_rate": 1.476672323289481e-05, + "loss": 1.048, + "step": 32612 + }, + { + "epoch": 0.40768519212980325, + "grad_norm": 4.950601100921631, + "learning_rate": 1.4765956050496745e-05, + "loss": 0.5222, + "step": 32614 + }, + { + "epoch": 0.40771019275481885, + "grad_norm": 2.925271511077881, + "learning_rate": 1.4765188831801834e-05, + "loss": 1.0603, + "step": 32616 + }, + { + "epoch": 0.4077351933798345, + "grad_norm": 4.259236812591553, + "learning_rate": 1.4764421576815916e-05, + "loss": 0.6767, + "step": 32618 + }, + { + "epoch": 0.40776019400485014, + "grad_norm": 0.08280697464942932, + "learning_rate": 1.476365428554484e-05, + "loss": 0.0082, + "step": 32620 + }, + { + "epoch": 0.40778519462986573, + "grad_norm": 2.3633954524993896, + "learning_rate": 1.4762886957994442e-05, + "loss": 0.7852, + "step": 32622 + }, + { + "epoch": 0.4078101952548814, + "grad_norm": 3.995943307876587, + "learning_rate": 1.4762119594170576e-05, + "loss": 1.527, + "step": 32624 + }, + { + "epoch": 0.407835195879897, + "grad_norm": 2.041383743286133, + "learning_rate": 1.4761352194079075e-05, + "loss": 0.7771, + "step": 32626 + }, + { + "epoch": 0.4078601965049126, + "grad_norm": 4.735668659210205, + "learning_rate": 1.4760584757725791e-05, + "loss": 1.6219, + "step": 32628 + }, + { + "epoch": 0.40788519712992827, + "grad_norm": 5.587068557739258, + "learning_rate": 1.4759817285116569e-05, + "loss": 1.8029, + "step": 32630 + }, + { + "epoch": 0.40791019775494386, + "grad_norm": 4.103881359100342, + "learning_rate": 1.475904977625725e-05, + "loss": 0.8173, + "step": 32632 + }, + { + "epoch": 0.4079351983799595, + "grad_norm": 5.866015911102295, + "learning_rate": 1.475828223115368e-05, + "loss": 1.5658, + "step": 32634 + }, + { + "epoch": 0.4079601990049751, + "grad_norm": 6.422228813171387, + "learning_rate": 1.4757514649811709e-05, + "loss": 0.6469, + "step": 32636 + }, + { + "epoch": 0.40798519962999075, + "grad_norm": 2.8337948322296143, + "learning_rate": 1.4756747032237173e-05, + "loss": 0.6229, + "step": 32638 + }, + { + "epoch": 0.4080102002550064, + "grad_norm": 2.5784263610839844, + "learning_rate": 1.475597937843593e-05, + "loss": 0.933, + "step": 32640 + }, + { + "epoch": 0.408035200880022, + "grad_norm": 6.827404975891113, + "learning_rate": 1.4755211688413818e-05, + "loss": 0.9631, + "step": 32642 + }, + { + "epoch": 0.40806020150503763, + "grad_norm": 3.8450217247009277, + "learning_rate": 1.4754443962176688e-05, + "loss": 1.5894, + "step": 32644 + }, + { + "epoch": 0.4080852021300532, + "grad_norm": 4.091899871826172, + "learning_rate": 1.4753676199730383e-05, + "loss": 0.4473, + "step": 32646 + }, + { + "epoch": 0.4081102027550689, + "grad_norm": 4.622716426849365, + "learning_rate": 1.4752908401080752e-05, + "loss": 1.4882, + "step": 32648 + }, + { + "epoch": 0.4081352033800845, + "grad_norm": 2.312460422515869, + "learning_rate": 1.4752140566233644e-05, + "loss": 0.1662, + "step": 32650 + }, + { + "epoch": 0.4081602040051001, + "grad_norm": 2.376847267150879, + "learning_rate": 1.4751372695194906e-05, + "loss": 0.3524, + "step": 32652 + }, + { + "epoch": 0.40818520463011576, + "grad_norm": 2.7551205158233643, + "learning_rate": 1.4750604787970383e-05, + "loss": 1.1169, + "step": 32654 + }, + { + "epoch": 0.40821020525513135, + "grad_norm": 3.6507456302642822, + "learning_rate": 1.474983684456593e-05, + "loss": 1.1689, + "step": 32656 + }, + { + "epoch": 0.408235205880147, + "grad_norm": 5.475161552429199, + "learning_rate": 1.4749068864987388e-05, + "loss": 1.0014, + "step": 32658 + }, + { + "epoch": 0.40826020650516265, + "grad_norm": 3.351374387741089, + "learning_rate": 1.4748300849240609e-05, + "loss": 1.3119, + "step": 32660 + }, + { + "epoch": 0.40828520713017824, + "grad_norm": 3.302096128463745, + "learning_rate": 1.4747532797331442e-05, + "loss": 1.3092, + "step": 32662 + }, + { + "epoch": 0.4083102077551939, + "grad_norm": 5.480557918548584, + "learning_rate": 1.474676470926574e-05, + "loss": 1.4489, + "step": 32664 + }, + { + "epoch": 0.4083352083802095, + "grad_norm": 8.81348991394043, + "learning_rate": 1.4745996585049344e-05, + "loss": 0.8412, + "step": 32666 + }, + { + "epoch": 0.4083602090052251, + "grad_norm": 4.147520065307617, + "learning_rate": 1.4745228424688112e-05, + "loss": 0.8489, + "step": 32668 + }, + { + "epoch": 0.4083852096302408, + "grad_norm": 2.2807137966156006, + "learning_rate": 1.4744460228187894e-05, + "loss": 1.0498, + "step": 32670 + }, + { + "epoch": 0.40841021025525637, + "grad_norm": 0.08244190365076065, + "learning_rate": 1.4743691995554535e-05, + "loss": 0.5868, + "step": 32672 + }, + { + "epoch": 0.408435210880272, + "grad_norm": 2.8885605335235596, + "learning_rate": 1.4742923726793888e-05, + "loss": 2.1989, + "step": 32674 + }, + { + "epoch": 0.4084602115052876, + "grad_norm": 0.001780590508133173, + "learning_rate": 1.4742155421911806e-05, + "loss": 0.0038, + "step": 32676 + }, + { + "epoch": 0.40848521213030325, + "grad_norm": 6.001827716827393, + "learning_rate": 1.4741387080914137e-05, + "loss": 1.5525, + "step": 32678 + }, + { + "epoch": 0.4085102127553189, + "grad_norm": 0.5319190621376038, + "learning_rate": 1.4740618703806735e-05, + "loss": 0.0206, + "step": 32680 + }, + { + "epoch": 0.4085352133803345, + "grad_norm": 3.802316904067993, + "learning_rate": 1.4739850290595455e-05, + "loss": 0.8194, + "step": 32682 + }, + { + "epoch": 0.40856021400535014, + "grad_norm": 4.626619338989258, + "learning_rate": 1.4739081841286143e-05, + "loss": 1.6133, + "step": 32684 + }, + { + "epoch": 0.40858521463036573, + "grad_norm": 3.520038366317749, + "learning_rate": 1.4738313355884653e-05, + "loss": 0.766, + "step": 32686 + }, + { + "epoch": 0.4086102152553814, + "grad_norm": 2.2115397453308105, + "learning_rate": 1.4737544834396838e-05, + "loss": 0.6096, + "step": 32688 + }, + { + "epoch": 0.40863521588039703, + "grad_norm": 2.876741409301758, + "learning_rate": 1.473677627682855e-05, + "loss": 0.5717, + "step": 32690 + }, + { + "epoch": 0.4086602165054126, + "grad_norm": 2.3481640815734863, + "learning_rate": 1.4736007683185647e-05, + "loss": 0.6189, + "step": 32692 + }, + { + "epoch": 0.40868521713042827, + "grad_norm": 0.8345170617103577, + "learning_rate": 1.4735239053473977e-05, + "loss": 0.0562, + "step": 32694 + }, + { + "epoch": 0.40871021775544386, + "grad_norm": 3.8276634216308594, + "learning_rate": 1.47344703876994e-05, + "loss": 1.9013, + "step": 32696 + }, + { + "epoch": 0.4087352183804595, + "grad_norm": 0.009984941221773624, + "learning_rate": 1.4733701685867763e-05, + "loss": 0.9169, + "step": 32698 + }, + { + "epoch": 0.40876021900547516, + "grad_norm": 0.006036167964339256, + "learning_rate": 1.4732932947984924e-05, + "loss": 0.6995, + "step": 32700 + }, + { + "epoch": 0.40878521963049075, + "grad_norm": 3.9465138912200928, + "learning_rate": 1.4732164174056735e-05, + "loss": 1.1569, + "step": 32702 + }, + { + "epoch": 0.4088102202555064, + "grad_norm": 3.423996925354004, + "learning_rate": 1.4731395364089056e-05, + "loss": 0.4821, + "step": 32704 + }, + { + "epoch": 0.408835220880522, + "grad_norm": 4.7299933433532715, + "learning_rate": 1.4730626518087738e-05, + "loss": 0.8054, + "step": 32706 + }, + { + "epoch": 0.40886022150553764, + "grad_norm": 0.014218511991202831, + "learning_rate": 1.4729857636058638e-05, + "loss": 0.3123, + "step": 32708 + }, + { + "epoch": 0.4088852221305533, + "grad_norm": 3.1090731620788574, + "learning_rate": 1.472908871800761e-05, + "loss": 1.121, + "step": 32710 + }, + { + "epoch": 0.4089102227555689, + "grad_norm": 0.0020707917865365744, + "learning_rate": 1.472831976394051e-05, + "loss": 0.7165, + "step": 32712 + }, + { + "epoch": 0.4089352233805845, + "grad_norm": 3.615257978439331, + "learning_rate": 1.4727550773863198e-05, + "loss": 1.2975, + "step": 32714 + }, + { + "epoch": 0.4089602240056001, + "grad_norm": 5.1917009353637695, + "learning_rate": 1.4726781747781525e-05, + "loss": 0.5718, + "step": 32716 + }, + { + "epoch": 0.40898522463061576, + "grad_norm": 2.6290032863616943, + "learning_rate": 1.4726012685701352e-05, + "loss": 0.9001, + "step": 32718 + }, + { + "epoch": 0.4090102252556314, + "grad_norm": 4.033831596374512, + "learning_rate": 1.4725243587628533e-05, + "loss": 1.2834, + "step": 32720 + }, + { + "epoch": 0.409035225880647, + "grad_norm": 3.978477954864502, + "learning_rate": 1.4724474453568931e-05, + "loss": 0.9585, + "step": 32722 + }, + { + "epoch": 0.40906022650566265, + "grad_norm": 4.952554225921631, + "learning_rate": 1.4723705283528399e-05, + "loss": 1.0801, + "step": 32724 + }, + { + "epoch": 0.40908522713067824, + "grad_norm": 2.6657140254974365, + "learning_rate": 1.4722936077512792e-05, + "loss": 1.4387, + "step": 32726 + }, + { + "epoch": 0.4091102277556939, + "grad_norm": 3.3373494148254395, + "learning_rate": 1.4722166835527973e-05, + "loss": 1.6305, + "step": 32728 + }, + { + "epoch": 0.40913522838070954, + "grad_norm": 2.484917640686035, + "learning_rate": 1.47213975575798e-05, + "loss": 0.8905, + "step": 32730 + }, + { + "epoch": 0.40916022900572513, + "grad_norm": 3.7085108757019043, + "learning_rate": 1.4720628243674131e-05, + "loss": 1.1056, + "step": 32732 + }, + { + "epoch": 0.4091852296307408, + "grad_norm": 3.962094783782959, + "learning_rate": 1.4719858893816825e-05, + "loss": 1.3394, + "step": 32734 + }, + { + "epoch": 0.40921023025575637, + "grad_norm": 2.813831090927124, + "learning_rate": 1.4719089508013742e-05, + "loss": 0.7603, + "step": 32736 + }, + { + "epoch": 0.409235230880772, + "grad_norm": 2.917567253112793, + "learning_rate": 1.4718320086270738e-05, + "loss": 0.6806, + "step": 32738 + }, + { + "epoch": 0.40926023150578766, + "grad_norm": 2.095752716064453, + "learning_rate": 1.4717550628593675e-05, + "loss": 1.253, + "step": 32740 + }, + { + "epoch": 0.40928523213080326, + "grad_norm": 6.452491760253906, + "learning_rate": 1.4716781134988416e-05, + "loss": 0.6726, + "step": 32742 + }, + { + "epoch": 0.4093102327558189, + "grad_norm": 4.594001770019531, + "learning_rate": 1.4716011605460819e-05, + "loss": 1.2012, + "step": 32744 + }, + { + "epoch": 0.4093352333808345, + "grad_norm": 3.535151243209839, + "learning_rate": 1.4715242040016744e-05, + "loss": 0.8451, + "step": 32746 + }, + { + "epoch": 0.40936023400585014, + "grad_norm": 0.0012122181942686439, + "learning_rate": 1.4714472438662052e-05, + "loss": 0.0045, + "step": 32748 + }, + { + "epoch": 0.4093852346308658, + "grad_norm": 3.707794666290283, + "learning_rate": 1.4713702801402605e-05, + "loss": 0.1531, + "step": 32750 + }, + { + "epoch": 0.4094102352558814, + "grad_norm": 0.0115516884252429, + "learning_rate": 1.4712933128244262e-05, + "loss": 0.7116, + "step": 32752 + }, + { + "epoch": 0.40943523588089703, + "grad_norm": 3.7580175399780273, + "learning_rate": 1.471216341919289e-05, + "loss": 0.869, + "step": 32754 + }, + { + "epoch": 0.4094602365059126, + "grad_norm": 5.4201178550720215, + "learning_rate": 1.4711393674254345e-05, + "loss": 0.9055, + "step": 32756 + }, + { + "epoch": 0.40948523713092827, + "grad_norm": 7.635748386383057, + "learning_rate": 1.4710623893434495e-05, + "loss": 2.388, + "step": 32758 + }, + { + "epoch": 0.4095102377559439, + "grad_norm": 2.0190727710723877, + "learning_rate": 1.4709854076739198e-05, + "loss": 0.838, + "step": 32760 + }, + { + "epoch": 0.4095352383809595, + "grad_norm": 5.047201156616211, + "learning_rate": 1.470908422417432e-05, + "loss": 1.2208, + "step": 32762 + }, + { + "epoch": 0.40956023900597516, + "grad_norm": 2.057612180709839, + "learning_rate": 1.470831433574572e-05, + "loss": 0.305, + "step": 32764 + }, + { + "epoch": 0.40958523963099075, + "grad_norm": 3.079195976257324, + "learning_rate": 1.4707544411459261e-05, + "loss": 0.7733, + "step": 32766 + }, + { + "epoch": 0.4096102402560064, + "grad_norm": 3.8632164001464844, + "learning_rate": 1.4706774451320813e-05, + "loss": 1.1372, + "step": 32768 + }, + { + "epoch": 0.40963524088102204, + "grad_norm": 3.5320687294006348, + "learning_rate": 1.4706004455336237e-05, + "loss": 1.7938, + "step": 32770 + }, + { + "epoch": 0.40966024150603764, + "grad_norm": 2.0632874965667725, + "learning_rate": 1.4705234423511396e-05, + "loss": 1.2358, + "step": 32772 + }, + { + "epoch": 0.4096852421310533, + "grad_norm": 0.8901981711387634, + "learning_rate": 1.4704464355852157e-05, + "loss": 0.4794, + "step": 32774 + }, + { + "epoch": 0.4097102427560689, + "grad_norm": 3.38397479057312, + "learning_rate": 1.470369425236438e-05, + "loss": 0.4981, + "step": 32776 + }, + { + "epoch": 0.4097352433810845, + "grad_norm": 3.3317935466766357, + "learning_rate": 1.4702924113053931e-05, + "loss": 1.3529, + "step": 32778 + }, + { + "epoch": 0.40976024400610017, + "grad_norm": 3.842865228652954, + "learning_rate": 1.4702153937926682e-05, + "loss": 0.8465, + "step": 32780 + }, + { + "epoch": 0.40978524463111576, + "grad_norm": 4.8477349281311035, + "learning_rate": 1.470138372698849e-05, + "loss": 0.6372, + "step": 32782 + }, + { + "epoch": 0.4098102452561314, + "grad_norm": 3.653233528137207, + "learning_rate": 1.4700613480245225e-05, + "loss": 1.882, + "step": 32784 + }, + { + "epoch": 0.409835245881147, + "grad_norm": 2.0960330963134766, + "learning_rate": 1.4699843197702754e-05, + "loss": 0.6784, + "step": 32786 + }, + { + "epoch": 0.40986024650616265, + "grad_norm": 3.1471827030181885, + "learning_rate": 1.4699072879366942e-05, + "loss": 1.0122, + "step": 32788 + }, + { + "epoch": 0.4098852471311783, + "grad_norm": 5.0818190574646, + "learning_rate": 1.469830252524365e-05, + "loss": 0.8792, + "step": 32790 + }, + { + "epoch": 0.4099102477561939, + "grad_norm": 0.000928803812712431, + "learning_rate": 1.4697532135338755e-05, + "loss": 0.3314, + "step": 32792 + }, + { + "epoch": 0.40993524838120954, + "grad_norm": 3.392427444458008, + "learning_rate": 1.4696761709658121e-05, + "loss": 1.478, + "step": 32794 + }, + { + "epoch": 0.40996024900622513, + "grad_norm": 4.992003917694092, + "learning_rate": 1.4695991248207611e-05, + "loss": 1.8084, + "step": 32796 + }, + { + "epoch": 0.4099852496312408, + "grad_norm": 2.631782054901123, + "learning_rate": 1.4695220750993098e-05, + "loss": 0.0908, + "step": 32798 + }, + { + "epoch": 0.4100102502562564, + "grad_norm": 3.5631844997406006, + "learning_rate": 1.4694450218020446e-05, + "loss": 0.2963, + "step": 32800 + }, + { + "epoch": 0.410035250881272, + "grad_norm": 2.795347213745117, + "learning_rate": 1.4693679649295525e-05, + "loss": 1.2219, + "step": 32802 + }, + { + "epoch": 0.41006025150628767, + "grad_norm": 0.001291517517529428, + "learning_rate": 1.4692909044824203e-05, + "loss": 0.2718, + "step": 32804 + }, + { + "epoch": 0.41008525213130326, + "grad_norm": 6.235169410705566, + "learning_rate": 1.4692138404612353e-05, + "loss": 0.7103, + "step": 32806 + }, + { + "epoch": 0.4101102527563189, + "grad_norm": 5.175805568695068, + "learning_rate": 1.4691367728665837e-05, + "loss": 1.4107, + "step": 32808 + }, + { + "epoch": 0.41013525338133455, + "grad_norm": 4.122079849243164, + "learning_rate": 1.4690597016990529e-05, + "loss": 1.5562, + "step": 32810 + }, + { + "epoch": 0.41016025400635014, + "grad_norm": 4.872461318969727, + "learning_rate": 1.4689826269592297e-05, + "loss": 2.9901, + "step": 32812 + }, + { + "epoch": 0.4101852546313658, + "grad_norm": 5.38543176651001, + "learning_rate": 1.468905548647701e-05, + "loss": 3.3544, + "step": 32814 + }, + { + "epoch": 0.4102102552563814, + "grad_norm": 5.684724807739258, + "learning_rate": 1.4688284667650542e-05, + "loss": 0.8203, + "step": 32816 + }, + { + "epoch": 0.41023525588139703, + "grad_norm": 3.2996039390563965, + "learning_rate": 1.468751381311876e-05, + "loss": 0.8721, + "step": 32818 + }, + { + "epoch": 0.4102602565064127, + "grad_norm": 6.404928207397461, + "learning_rate": 1.4686742922887536e-05, + "loss": 2.1484, + "step": 32820 + }, + { + "epoch": 0.41028525713142827, + "grad_norm": 0.0013086211401969194, + "learning_rate": 1.4685971996962738e-05, + "loss": 0.0002, + "step": 32822 + }, + { + "epoch": 0.4103102577564439, + "grad_norm": 5.028751373291016, + "learning_rate": 1.4685201035350243e-05, + "loss": 0.9046, + "step": 32824 + }, + { + "epoch": 0.4103352583814595, + "grad_norm": 2.443424940109253, + "learning_rate": 1.4684430038055921e-05, + "loss": 1.3286, + "step": 32826 + }, + { + "epoch": 0.41036025900647516, + "grad_norm": 3.870751142501831, + "learning_rate": 1.4683659005085638e-05, + "loss": 0.6526, + "step": 32828 + }, + { + "epoch": 0.4103852596314908, + "grad_norm": 1.9640648365020752, + "learning_rate": 1.4682887936445272e-05, + "loss": 0.4273, + "step": 32830 + }, + { + "epoch": 0.4104102602565064, + "grad_norm": 3.805858612060547, + "learning_rate": 1.4682116832140694e-05, + "loss": 1.7097, + "step": 32832 + }, + { + "epoch": 0.41043526088152205, + "grad_norm": 0.0010871222475543618, + "learning_rate": 1.4681345692177775e-05, + "loss": 0.2076, + "step": 32834 + }, + { + "epoch": 0.41046026150653764, + "grad_norm": 3.1085827350616455, + "learning_rate": 1.4680574516562391e-05, + "loss": 1.0603, + "step": 32836 + }, + { + "epoch": 0.4104852621315533, + "grad_norm": 2.2423956394195557, + "learning_rate": 1.4679803305300415e-05, + "loss": 0.8254, + "step": 32838 + }, + { + "epoch": 0.41051026275656893, + "grad_norm": 0.00616171071305871, + "learning_rate": 1.4679032058397713e-05, + "loss": 0.3594, + "step": 32840 + }, + { + "epoch": 0.4105352633815845, + "grad_norm": 2.9630258083343506, + "learning_rate": 1.4678260775860169e-05, + "loss": 0.7574, + "step": 32842 + }, + { + "epoch": 0.4105602640066002, + "grad_norm": 6.119542121887207, + "learning_rate": 1.467748945769365e-05, + "loss": 1.1283, + "step": 32844 + }, + { + "epoch": 0.41058526463161577, + "grad_norm": 3.001946210861206, + "learning_rate": 1.4676718103904038e-05, + "loss": 1.5936, + "step": 32846 + }, + { + "epoch": 0.4106102652566314, + "grad_norm": 4.446712970733643, + "learning_rate": 1.4675946714497198e-05, + "loss": 1.5057, + "step": 32848 + }, + { + "epoch": 0.41063526588164706, + "grad_norm": 0.16551432013511658, + "learning_rate": 1.4675175289479008e-05, + "loss": 0.873, + "step": 32850 + }, + { + "epoch": 0.41066026650666265, + "grad_norm": 0.6855801939964294, + "learning_rate": 1.4674403828855345e-05, + "loss": 0.0345, + "step": 32852 + }, + { + "epoch": 0.4106852671316783, + "grad_norm": 1.4435495138168335, + "learning_rate": 1.4673632332632087e-05, + "loss": 0.0762, + "step": 32854 + }, + { + "epoch": 0.4107102677566939, + "grad_norm": 4.1248779296875, + "learning_rate": 1.4672860800815101e-05, + "loss": 0.329, + "step": 32856 + }, + { + "epoch": 0.41073526838170954, + "grad_norm": 1.9137042760849, + "learning_rate": 1.4672089233410272e-05, + "loss": 0.8077, + "step": 32858 + }, + { + "epoch": 0.4107602690067252, + "grad_norm": 2.411623954772949, + "learning_rate": 1.467131763042347e-05, + "loss": 0.6683, + "step": 32860 + }, + { + "epoch": 0.4107852696317408, + "grad_norm": 3.792297124862671, + "learning_rate": 1.4670545991860575e-05, + "loss": 0.48, + "step": 32862 + }, + { + "epoch": 0.4108102702567564, + "grad_norm": 3.2725865840911865, + "learning_rate": 1.4669774317727461e-05, + "loss": 0.7823, + "step": 32864 + }, + { + "epoch": 0.410835270881772, + "grad_norm": 2.643556594848633, + "learning_rate": 1.4669002608030006e-05, + "loss": 0.4441, + "step": 32866 + }, + { + "epoch": 0.41086027150678767, + "grad_norm": 3.1003880500793457, + "learning_rate": 1.4668230862774087e-05, + "loss": 1.1048, + "step": 32868 + }, + { + "epoch": 0.4108852721318033, + "grad_norm": 5.727199077606201, + "learning_rate": 1.466745908196558e-05, + "loss": 0.811, + "step": 32870 + }, + { + "epoch": 0.4109102727568189, + "grad_norm": 5.177457809448242, + "learning_rate": 1.4666687265610371e-05, + "loss": 1.1301, + "step": 32872 + }, + { + "epoch": 0.41093527338183455, + "grad_norm": 0.33251795172691345, + "learning_rate": 1.4665915413714326e-05, + "loss": 1.2869, + "step": 32874 + }, + { + "epoch": 0.41096027400685015, + "grad_norm": 0.0029389297123998404, + "learning_rate": 1.4665143526283333e-05, + "loss": 0.5789, + "step": 32876 + }, + { + "epoch": 0.4109852746318658, + "grad_norm": 0.0011858859797939658, + "learning_rate": 1.4664371603323263e-05, + "loss": 0.6668, + "step": 32878 + }, + { + "epoch": 0.41101027525688144, + "grad_norm": 0.0009544392814859748, + "learning_rate": 1.4663599644839998e-05, + "loss": 0.6288, + "step": 32880 + }, + { + "epoch": 0.41103527588189703, + "grad_norm": 0.004173943307250738, + "learning_rate": 1.4662827650839422e-05, + "loss": 0.1332, + "step": 32882 + }, + { + "epoch": 0.4110602765069127, + "grad_norm": 6.203036308288574, + "learning_rate": 1.4662055621327408e-05, + "loss": 1.3862, + "step": 32884 + }, + { + "epoch": 0.4110852771319283, + "grad_norm": 3.5594520568847656, + "learning_rate": 1.4661283556309838e-05, + "loss": 1.1865, + "step": 32886 + }, + { + "epoch": 0.4111102777569439, + "grad_norm": 1.814942479133606, + "learning_rate": 1.466051145579259e-05, + "loss": 0.3965, + "step": 32888 + }, + { + "epoch": 0.41113527838195957, + "grad_norm": 0.9476045370101929, + "learning_rate": 1.4659739319781548e-05, + "loss": 0.4861, + "step": 32890 + }, + { + "epoch": 0.41116027900697516, + "grad_norm": 2.86061954498291, + "learning_rate": 1.4658967148282588e-05, + "loss": 0.3143, + "step": 32892 + }, + { + "epoch": 0.4111852796319908, + "grad_norm": 5.424784183502197, + "learning_rate": 1.4658194941301594e-05, + "loss": 1.7777, + "step": 32894 + }, + { + "epoch": 0.4112102802570064, + "grad_norm": 3.347649097442627, + "learning_rate": 1.4657422698844443e-05, + "loss": 0.7917, + "step": 32896 + }, + { + "epoch": 0.41123528088202205, + "grad_norm": 3.416069746017456, + "learning_rate": 1.4656650420917024e-05, + "loss": 0.8938, + "step": 32898 + }, + { + "epoch": 0.4112602815070377, + "grad_norm": 2.5534214973449707, + "learning_rate": 1.4655878107525215e-05, + "loss": 1.0464, + "step": 32900 + }, + { + "epoch": 0.4112852821320533, + "grad_norm": 3.060157537460327, + "learning_rate": 1.4655105758674892e-05, + "loss": 1.0286, + "step": 32902 + }, + { + "epoch": 0.41131028275706893, + "grad_norm": 3.429183006286621, + "learning_rate": 1.4654333374371942e-05, + "loss": 1.2612, + "step": 32904 + }, + { + "epoch": 0.4113352833820845, + "grad_norm": 3.300063371658325, + "learning_rate": 1.4653560954622251e-05, + "loss": 0.5348, + "step": 32906 + }, + { + "epoch": 0.4113602840071002, + "grad_norm": 5.786940574645996, + "learning_rate": 1.4652788499431693e-05, + "loss": 0.2311, + "step": 32908 + }, + { + "epoch": 0.4113852846321158, + "grad_norm": 6.668020725250244, + "learning_rate": 1.4652016008806163e-05, + "loss": 2.0304, + "step": 32910 + }, + { + "epoch": 0.4114102852571314, + "grad_norm": 4.550999641418457, + "learning_rate": 1.4651243482751531e-05, + "loss": 1.3749, + "step": 32912 + }, + { + "epoch": 0.41143528588214706, + "grad_norm": 2.4981703758239746, + "learning_rate": 1.4650470921273687e-05, + "loss": 0.8623, + "step": 32914 + }, + { + "epoch": 0.41146028650716265, + "grad_norm": 6.784116744995117, + "learning_rate": 1.4649698324378517e-05, + "loss": 0.6864, + "step": 32916 + }, + { + "epoch": 0.4114852871321783, + "grad_norm": 2.576265811920166, + "learning_rate": 1.4648925692071898e-05, + "loss": 1.1108, + "step": 32918 + }, + { + "epoch": 0.41151028775719395, + "grad_norm": 5.566657066345215, + "learning_rate": 1.4648153024359722e-05, + "loss": 0.7211, + "step": 32920 + }, + { + "epoch": 0.41153528838220954, + "grad_norm": 5.277470111846924, + "learning_rate": 1.4647380321247869e-05, + "loss": 3.3198, + "step": 32922 + }, + { + "epoch": 0.4115602890072252, + "grad_norm": 4.0383195877075195, + "learning_rate": 1.4646607582742226e-05, + "loss": 0.9042, + "step": 32924 + }, + { + "epoch": 0.4115852896322408, + "grad_norm": 0.7453575730323792, + "learning_rate": 1.4645834808848674e-05, + "loss": 0.447, + "step": 32926 + }, + { + "epoch": 0.41161029025725643, + "grad_norm": 0.9567068219184875, + "learning_rate": 1.46450619995731e-05, + "loss": 0.7045, + "step": 32928 + }, + { + "epoch": 0.4116352908822721, + "grad_norm": 6.380173206329346, + "learning_rate": 1.4644289154921394e-05, + "loss": 1.2527, + "step": 32930 + }, + { + "epoch": 0.41166029150728767, + "grad_norm": 2.8921332359313965, + "learning_rate": 1.4643516274899437e-05, + "loss": 1.4152, + "step": 32932 + }, + { + "epoch": 0.4116852921323033, + "grad_norm": 4.44711446762085, + "learning_rate": 1.4642743359513115e-05, + "loss": 1.7879, + "step": 32934 + }, + { + "epoch": 0.4117102927573189, + "grad_norm": 4.670811653137207, + "learning_rate": 1.464197040876832e-05, + "loss": 1.4785, + "step": 32936 + }, + { + "epoch": 0.41173529338233456, + "grad_norm": 3.201033115386963, + "learning_rate": 1.4641197422670931e-05, + "loss": 0.984, + "step": 32938 + }, + { + "epoch": 0.4117602940073502, + "grad_norm": 6.243851661682129, + "learning_rate": 1.4640424401226843e-05, + "loss": 1.1566, + "step": 32940 + }, + { + "epoch": 0.4117852946323658, + "grad_norm": 3.7500908374786377, + "learning_rate": 1.4639651344441934e-05, + "loss": 1.276, + "step": 32942 + }, + { + "epoch": 0.41181029525738144, + "grad_norm": 5.221036911010742, + "learning_rate": 1.46388782523221e-05, + "loss": 2.0135, + "step": 32944 + }, + { + "epoch": 0.41183529588239703, + "grad_norm": 2.4178950786590576, + "learning_rate": 1.4638105124873222e-05, + "loss": 0.5783, + "step": 32946 + }, + { + "epoch": 0.4118602965074127, + "grad_norm": 0.010679456405341625, + "learning_rate": 1.4637331962101193e-05, + "loss": 0.5466, + "step": 32948 + }, + { + "epoch": 0.41188529713242833, + "grad_norm": 3.1763951778411865, + "learning_rate": 1.4636558764011904e-05, + "loss": 1.1172, + "step": 32950 + }, + { + "epoch": 0.4119102977574439, + "grad_norm": 2.1463751792907715, + "learning_rate": 1.4635785530611232e-05, + "loss": 0.4504, + "step": 32952 + }, + { + "epoch": 0.41193529838245957, + "grad_norm": 7.152368545532227, + "learning_rate": 1.4635012261905077e-05, + "loss": 1.0827, + "step": 32954 + }, + { + "epoch": 0.41196029900747516, + "grad_norm": 0.4782591164112091, + "learning_rate": 1.4634238957899323e-05, + "loss": 0.1746, + "step": 32956 + }, + { + "epoch": 0.4119852996324908, + "grad_norm": 5.697539329528809, + "learning_rate": 1.463346561859986e-05, + "loss": 0.8831, + "step": 32958 + }, + { + "epoch": 0.41201030025750646, + "grad_norm": 5.149011135101318, + "learning_rate": 1.4632692244012578e-05, + "loss": 0.5022, + "step": 32960 + }, + { + "epoch": 0.41203530088252205, + "grad_norm": 6.242699146270752, + "learning_rate": 1.463191883414337e-05, + "loss": 0.4794, + "step": 32962 + }, + { + "epoch": 0.4120603015075377, + "grad_norm": 2.5764548778533936, + "learning_rate": 1.463114538899812e-05, + "loss": 1.6346, + "step": 32964 + }, + { + "epoch": 0.4120853021325533, + "grad_norm": 0.002521309070289135, + "learning_rate": 1.4630371908582724e-05, + "loss": 0.2028, + "step": 32966 + }, + { + "epoch": 0.41211030275756894, + "grad_norm": 3.4184486865997314, + "learning_rate": 1.4629598392903066e-05, + "loss": 0.9431, + "step": 32968 + }, + { + "epoch": 0.4121353033825846, + "grad_norm": 2.66518235206604, + "learning_rate": 1.4628824841965046e-05, + "loss": 1.3726, + "step": 32970 + }, + { + "epoch": 0.4121603040076002, + "grad_norm": 3.887023687362671, + "learning_rate": 1.4628051255774547e-05, + "loss": 0.5751, + "step": 32972 + }, + { + "epoch": 0.4121853046326158, + "grad_norm": 3.099395275115967, + "learning_rate": 1.4627277634337467e-05, + "loss": 0.7518, + "step": 32974 + }, + { + "epoch": 0.4122103052576314, + "grad_norm": 0.003298897063359618, + "learning_rate": 1.4626503977659695e-05, + "loss": 0.8687, + "step": 32976 + }, + { + "epoch": 0.41223530588264706, + "grad_norm": 2.9485690593719482, + "learning_rate": 1.4625730285747119e-05, + "loss": 1.023, + "step": 32978 + }, + { + "epoch": 0.4122603065076627, + "grad_norm": 4.902707099914551, + "learning_rate": 1.4624956558605638e-05, + "loss": 1.5854, + "step": 32980 + }, + { + "epoch": 0.4122853071326783, + "grad_norm": 3.4098565578460693, + "learning_rate": 1.4624182796241142e-05, + "loss": 1.1478, + "step": 32982 + }, + { + "epoch": 0.41231030775769395, + "grad_norm": 1.9382272958755493, + "learning_rate": 1.4623408998659522e-05, + "loss": 1.1666, + "step": 32984 + }, + { + "epoch": 0.41233530838270954, + "grad_norm": 2.289003849029541, + "learning_rate": 1.4622635165866676e-05, + "loss": 0.2243, + "step": 32986 + }, + { + "epoch": 0.4123603090077252, + "grad_norm": 4.396193981170654, + "learning_rate": 1.4621861297868494e-05, + "loss": 1.4221, + "step": 32988 + }, + { + "epoch": 0.41238530963274084, + "grad_norm": 2.7804086208343506, + "learning_rate": 1.4621087394670869e-05, + "loss": 0.9152, + "step": 32990 + }, + { + "epoch": 0.41241031025775643, + "grad_norm": 2.984013080596924, + "learning_rate": 1.4620313456279693e-05, + "loss": 1.1615, + "step": 32992 + }, + { + "epoch": 0.4124353108827721, + "grad_norm": 2.4750959873199463, + "learning_rate": 1.4619539482700866e-05, + "loss": 0.3232, + "step": 32994 + }, + { + "epoch": 0.41246031150778767, + "grad_norm": 1.423449158668518, + "learning_rate": 1.461876547394028e-05, + "loss": 0.3062, + "step": 32996 + }, + { + "epoch": 0.4124853121328033, + "grad_norm": 2.504359245300293, + "learning_rate": 1.4617991430003828e-05, + "loss": 1.4775, + "step": 32998 + }, + { + "epoch": 0.41251031275781896, + "grad_norm": 3.8665196895599365, + "learning_rate": 1.4617217350897406e-05, + "loss": 1.7365, + "step": 33000 + }, + { + "epoch": 0.41253531338283456, + "grad_norm": 4.343739986419678, + "learning_rate": 1.4616443236626915e-05, + "loss": 1.5302, + "step": 33002 + }, + { + "epoch": 0.4125603140078502, + "grad_norm": 7.88193416595459, + "learning_rate": 1.4615669087198237e-05, + "loss": 1.7081, + "step": 33004 + }, + { + "epoch": 0.4125853146328658, + "grad_norm": 3.007716655731201, + "learning_rate": 1.4614894902617279e-05, + "loss": 0.9616, + "step": 33006 + }, + { + "epoch": 0.41261031525788144, + "grad_norm": 1.6272246837615967, + "learning_rate": 1.4614120682889937e-05, + "loss": 0.0898, + "step": 33008 + }, + { + "epoch": 0.4126353158828971, + "grad_norm": 2.8673906326293945, + "learning_rate": 1.46133464280221e-05, + "loss": 0.4934, + "step": 33010 + }, + { + "epoch": 0.4126603165079127, + "grad_norm": 0.005151126999408007, + "learning_rate": 1.4612572138019672e-05, + "loss": 0.745, + "step": 33012 + }, + { + "epoch": 0.41268531713292833, + "grad_norm": 1.1255912780761719, + "learning_rate": 1.4611797812888546e-05, + "loss": 0.6638, + "step": 33014 + }, + { + "epoch": 0.4127103177579439, + "grad_norm": 3.451228618621826, + "learning_rate": 1.4611023452634618e-05, + "loss": 0.8166, + "step": 33016 + }, + { + "epoch": 0.41273531838295957, + "grad_norm": 5.555343151092529, + "learning_rate": 1.4610249057263788e-05, + "loss": 1.8002, + "step": 33018 + }, + { + "epoch": 0.4127603190079752, + "grad_norm": 0.9800499677658081, + "learning_rate": 1.4609474626781955e-05, + "loss": 0.7325, + "step": 33020 + }, + { + "epoch": 0.4127853196329908, + "grad_norm": 2.1577682495117188, + "learning_rate": 1.4608700161195015e-05, + "loss": 1.227, + "step": 33022 + }, + { + "epoch": 0.41281032025800646, + "grad_norm": 7.796626091003418, + "learning_rate": 1.4607925660508867e-05, + "loss": 0.4903, + "step": 33024 + }, + { + "epoch": 0.41283532088302205, + "grad_norm": 3.230356454849243, + "learning_rate": 1.4607151124729407e-05, + "loss": 2.0267, + "step": 33026 + }, + { + "epoch": 0.4128603215080377, + "grad_norm": 3.4809563159942627, + "learning_rate": 1.4606376553862536e-05, + "loss": 1.3733, + "step": 33028 + }, + { + "epoch": 0.41288532213305335, + "grad_norm": 2.219996690750122, + "learning_rate": 1.4605601947914152e-05, + "loss": 1.3757, + "step": 33030 + }, + { + "epoch": 0.41291032275806894, + "grad_norm": 3.4381868839263916, + "learning_rate": 1.4604827306890155e-05, + "loss": 0.7326, + "step": 33032 + }, + { + "epoch": 0.4129353233830846, + "grad_norm": 3.8152225017547607, + "learning_rate": 1.4604052630796446e-05, + "loss": 1.8477, + "step": 33034 + }, + { + "epoch": 0.4129603240081002, + "grad_norm": 4.021383762359619, + "learning_rate": 1.460327791963892e-05, + "loss": 1.3156, + "step": 33036 + }, + { + "epoch": 0.4129853246331158, + "grad_norm": 5.537136077880859, + "learning_rate": 1.4602503173423483e-05, + "loss": 1.3673, + "step": 33038 + }, + { + "epoch": 0.41301032525813147, + "grad_norm": 2.394880771636963, + "learning_rate": 1.4601728392156034e-05, + "loss": 0.8421, + "step": 33040 + }, + { + "epoch": 0.41303532588314706, + "grad_norm": 3.6917953491210938, + "learning_rate": 1.4600953575842469e-05, + "loss": 0.9025, + "step": 33042 + }, + { + "epoch": 0.4130603265081627, + "grad_norm": 2.5140819549560547, + "learning_rate": 1.4600178724488694e-05, + "loss": 1.1289, + "step": 33044 + }, + { + "epoch": 0.4130853271331783, + "grad_norm": 2.1331851482391357, + "learning_rate": 1.4599403838100608e-05, + "loss": 0.3681, + "step": 33046 + }, + { + "epoch": 0.41311032775819395, + "grad_norm": 0.003392610466107726, + "learning_rate": 1.4598628916684112e-05, + "loss": 0.3343, + "step": 33048 + }, + { + "epoch": 0.4131353283832096, + "grad_norm": 2.387108564376831, + "learning_rate": 1.459785396024511e-05, + "loss": 0.8922, + "step": 33050 + }, + { + "epoch": 0.4131603290082252, + "grad_norm": 3.8847720623016357, + "learning_rate": 1.4597078968789503e-05, + "loss": 0.3073, + "step": 33052 + }, + { + "epoch": 0.41318532963324084, + "grad_norm": 0.029213441535830498, + "learning_rate": 1.459630394232319e-05, + "loss": 0.0006, + "step": 33054 + }, + { + "epoch": 0.41321033025825643, + "grad_norm": 4.679352283477783, + "learning_rate": 1.4595528880852077e-05, + "loss": 0.4377, + "step": 33056 + }, + { + "epoch": 0.4132353308832721, + "grad_norm": 4.041204452514648, + "learning_rate": 1.4594753784382065e-05, + "loss": 1.0829, + "step": 33058 + }, + { + "epoch": 0.4132603315082877, + "grad_norm": 1.438536524772644, + "learning_rate": 1.459397865291906e-05, + "loss": 0.6713, + "step": 33060 + }, + { + "epoch": 0.4132853321333033, + "grad_norm": 1.6472910642623901, + "learning_rate": 1.4593203486468963e-05, + "loss": 0.1909, + "step": 33062 + }, + { + "epoch": 0.41331033275831897, + "grad_norm": 3.6795194149017334, + "learning_rate": 1.4592428285037677e-05, + "loss": 0.995, + "step": 33064 + }, + { + "epoch": 0.41333533338333456, + "grad_norm": 4.691558837890625, + "learning_rate": 1.4591653048631107e-05, + "loss": 1.7217, + "step": 33066 + }, + { + "epoch": 0.4133603340083502, + "grad_norm": 3.026271343231201, + "learning_rate": 1.4590877777255157e-05, + "loss": 0.4534, + "step": 33068 + }, + { + "epoch": 0.41338533463336585, + "grad_norm": 7.514465808868408, + "learning_rate": 1.4590102470915731e-05, + "loss": 0.5077, + "step": 33070 + }, + { + "epoch": 0.41341033525838145, + "grad_norm": 5.760039806365967, + "learning_rate": 1.4589327129618731e-05, + "loss": 1.4851, + "step": 33072 + }, + { + "epoch": 0.4134353358833971, + "grad_norm": 3.5500991344451904, + "learning_rate": 1.458855175337007e-05, + "loss": 1.1148, + "step": 33074 + }, + { + "epoch": 0.4134603365084127, + "grad_norm": 2.767781972885132, + "learning_rate": 1.4587776342175643e-05, + "loss": 1.5008, + "step": 33076 + }, + { + "epoch": 0.41348533713342833, + "grad_norm": 2.6204497814178467, + "learning_rate": 1.4587000896041363e-05, + "loss": 0.9873, + "step": 33078 + }, + { + "epoch": 0.413510337758444, + "grad_norm": 3.1263954639434814, + "learning_rate": 1.4586225414973133e-05, + "loss": 0.425, + "step": 33080 + }, + { + "epoch": 0.41353533838345957, + "grad_norm": 0.23216286301612854, + "learning_rate": 1.4585449898976855e-05, + "loss": 0.3702, + "step": 33082 + }, + { + "epoch": 0.4135603390084752, + "grad_norm": 3.394378423690796, + "learning_rate": 1.458467434805844e-05, + "loss": 0.9462, + "step": 33084 + }, + { + "epoch": 0.4135853396334908, + "grad_norm": 4.948674201965332, + "learning_rate": 1.4583898762223798e-05, + "loss": 1.6373, + "step": 33086 + }, + { + "epoch": 0.41361034025850646, + "grad_norm": 4.625911712646484, + "learning_rate": 1.4583123141478828e-05, + "loss": 1.136, + "step": 33088 + }, + { + "epoch": 0.4136353408835221, + "grad_norm": 2.2656285762786865, + "learning_rate": 1.458234748582944e-05, + "loss": 1.202, + "step": 33090 + }, + { + "epoch": 0.4136603415085377, + "grad_norm": 4.255441188812256, + "learning_rate": 1.4581571795281542e-05, + "loss": 1.7226, + "step": 33092 + }, + { + "epoch": 0.41368534213355335, + "grad_norm": 2.7692806720733643, + "learning_rate": 1.4580796069841041e-05, + "loss": 0.7061, + "step": 33094 + }, + { + "epoch": 0.41371034275856894, + "grad_norm": 3.981422185897827, + "learning_rate": 1.4580020309513844e-05, + "loss": 1.5486, + "step": 33096 + }, + { + "epoch": 0.4137353433835846, + "grad_norm": 4.185293674468994, + "learning_rate": 1.457924451430586e-05, + "loss": 1.3239, + "step": 33098 + }, + { + "epoch": 0.41376034400860023, + "grad_norm": 0.7813282012939453, + "learning_rate": 1.4578468684223001e-05, + "loss": 0.0145, + "step": 33100 + }, + { + "epoch": 0.4137853446336158, + "grad_norm": 0.0069812508299946785, + "learning_rate": 1.457769281927117e-05, + "loss": 0.3293, + "step": 33102 + }, + { + "epoch": 0.4138103452586315, + "grad_norm": 0.02176373451948166, + "learning_rate": 1.4576916919456277e-05, + "loss": 1.1752, + "step": 33104 + }, + { + "epoch": 0.41383534588364707, + "grad_norm": 4.622780799865723, + "learning_rate": 1.457614098478423e-05, + "loss": 1.4351, + "step": 33106 + }, + { + "epoch": 0.4138603465086627, + "grad_norm": 1.4149237871170044, + "learning_rate": 1.4575365015260942e-05, + "loss": 0.3564, + "step": 33108 + }, + { + "epoch": 0.41388534713367836, + "grad_norm": 4.589529514312744, + "learning_rate": 1.4574589010892321e-05, + "loss": 0.8653, + "step": 33110 + }, + { + "epoch": 0.41391034775869395, + "grad_norm": 3.9017767906188965, + "learning_rate": 1.457381297168428e-05, + "loss": 0.8378, + "step": 33112 + }, + { + "epoch": 0.4139353483837096, + "grad_norm": 3.713852643966675, + "learning_rate": 1.4573036897642725e-05, + "loss": 1.3604, + "step": 33114 + }, + { + "epoch": 0.4139603490087252, + "grad_norm": 2.5035929679870605, + "learning_rate": 1.4572260788773564e-05, + "loss": 1.7006, + "step": 33116 + }, + { + "epoch": 0.41398534963374084, + "grad_norm": 0.0028312676586210728, + "learning_rate": 1.4571484645082714e-05, + "loss": 0.6513, + "step": 33118 + }, + { + "epoch": 0.4140103502587565, + "grad_norm": 4.812022686004639, + "learning_rate": 1.4570708466576084e-05, + "loss": 2.3802, + "step": 33120 + }, + { + "epoch": 0.4140353508837721, + "grad_norm": 1.795314908027649, + "learning_rate": 1.4569932253259582e-05, + "loss": 0.7434, + "step": 33122 + }, + { + "epoch": 0.4140603515087877, + "grad_norm": 2.51686429977417, + "learning_rate": 1.4569156005139124e-05, + "loss": 1.2049, + "step": 33124 + }, + { + "epoch": 0.4140853521338033, + "grad_norm": 9.433416366577148, + "learning_rate": 1.4568379722220622e-05, + "loss": 0.5213, + "step": 33126 + }, + { + "epoch": 0.41411035275881897, + "grad_norm": 8.809565544128418, + "learning_rate": 1.4567603404509985e-05, + "loss": 0.8346, + "step": 33128 + }, + { + "epoch": 0.4141353533838346, + "grad_norm": 1.3843674659729004, + "learning_rate": 1.4566827052013127e-05, + "loss": 0.9061, + "step": 33130 + }, + { + "epoch": 0.4141603540088502, + "grad_norm": 0.2333461046218872, + "learning_rate": 1.4566050664735957e-05, + "loss": 0.9982, + "step": 33132 + }, + { + "epoch": 0.41418535463386585, + "grad_norm": 4.263688087463379, + "learning_rate": 1.456527424268439e-05, + "loss": 1.4747, + "step": 33134 + }, + { + "epoch": 0.41421035525888145, + "grad_norm": 13.724165916442871, + "learning_rate": 1.4564497785864344e-05, + "loss": 0.9569, + "step": 33136 + }, + { + "epoch": 0.4142353558838971, + "grad_norm": 7.297137260437012, + "learning_rate": 1.4563721294281728e-05, + "loss": 0.7788, + "step": 33138 + }, + { + "epoch": 0.41426035650891274, + "grad_norm": 0.006182557437568903, + "learning_rate": 1.4562944767942455e-05, + "loss": 1.0313, + "step": 33140 + }, + { + "epoch": 0.41428535713392833, + "grad_norm": 0.031521815806627274, + "learning_rate": 1.4562168206852439e-05, + "loss": 0.0005, + "step": 33142 + }, + { + "epoch": 0.414310357758944, + "grad_norm": 4.364489555358887, + "learning_rate": 1.4561391611017594e-05, + "loss": 0.5284, + "step": 33144 + }, + { + "epoch": 0.4143353583839596, + "grad_norm": 4.373971462249756, + "learning_rate": 1.4560614980443839e-05, + "loss": 0.628, + "step": 33146 + }, + { + "epoch": 0.4143603590089752, + "grad_norm": 3.759913206100464, + "learning_rate": 1.4559838315137083e-05, + "loss": 1.0647, + "step": 33148 + }, + { + "epoch": 0.41438535963399087, + "grad_norm": 3.3906702995300293, + "learning_rate": 1.4559061615103243e-05, + "loss": 1.0543, + "step": 33150 + }, + { + "epoch": 0.41441036025900646, + "grad_norm": 5.155141830444336, + "learning_rate": 1.4558284880348237e-05, + "loss": 1.8847, + "step": 33152 + }, + { + "epoch": 0.4144353608840221, + "grad_norm": 1.4077975749969482, + "learning_rate": 1.4557508110877976e-05, + "loss": 0.1585, + "step": 33154 + }, + { + "epoch": 0.4144603615090377, + "grad_norm": 4.314700126647949, + "learning_rate": 1.4556731306698378e-05, + "loss": 0.8476, + "step": 33156 + }, + { + "epoch": 0.41448536213405335, + "grad_norm": 4.004863739013672, + "learning_rate": 1.4555954467815357e-05, + "loss": 2.1767, + "step": 33158 + }, + { + "epoch": 0.414510362759069, + "grad_norm": 3.8025386333465576, + "learning_rate": 1.4555177594234832e-05, + "loss": 0.684, + "step": 33160 + }, + { + "epoch": 0.4145353633840846, + "grad_norm": 0.9033361673355103, + "learning_rate": 1.455440068596272e-05, + "loss": 0.2221, + "step": 33162 + }, + { + "epoch": 0.41456036400910024, + "grad_norm": 2.7806661128997803, + "learning_rate": 1.4553623743004935e-05, + "loss": 1.0986, + "step": 33164 + }, + { + "epoch": 0.4145853646341158, + "grad_norm": 1.0923582315444946, + "learning_rate": 1.4552846765367399e-05, + "loss": 0.6279, + "step": 33166 + }, + { + "epoch": 0.4146103652591315, + "grad_norm": 2.1938045024871826, + "learning_rate": 1.4552069753056022e-05, + "loss": 1.2772, + "step": 33168 + }, + { + "epoch": 0.4146353658841471, + "grad_norm": 3.305417537689209, + "learning_rate": 1.4551292706076723e-05, + "loss": 1.2776, + "step": 33170 + }, + { + "epoch": 0.4146603665091627, + "grad_norm": 1.8102675676345825, + "learning_rate": 1.4550515624435428e-05, + "loss": 0.5252, + "step": 33172 + }, + { + "epoch": 0.41468536713417836, + "grad_norm": 7.692842483520508, + "learning_rate": 1.4549738508138048e-05, + "loss": 0.9682, + "step": 33174 + }, + { + "epoch": 0.41471036775919395, + "grad_norm": 4.419485569000244, + "learning_rate": 1.4548961357190501e-05, + "loss": 0.4519, + "step": 33176 + }, + { + "epoch": 0.4147353683842096, + "grad_norm": 1.9626431465148926, + "learning_rate": 1.4548184171598711e-05, + "loss": 0.2434, + "step": 33178 + }, + { + "epoch": 0.41476036900922525, + "grad_norm": 0.004783771466463804, + "learning_rate": 1.454740695136859e-05, + "loss": 0.8984, + "step": 33180 + }, + { + "epoch": 0.41478536963424084, + "grad_norm": 1.7339826822280884, + "learning_rate": 1.454662969650606e-05, + "loss": 0.4104, + "step": 33182 + }, + { + "epoch": 0.4148103702592565, + "grad_norm": 2.738844156265259, + "learning_rate": 1.4545852407017045e-05, + "loss": 0.4236, + "step": 33184 + }, + { + "epoch": 0.4148353708842721, + "grad_norm": 5.110296726226807, + "learning_rate": 1.454507508290746e-05, + "loss": 0.4728, + "step": 33186 + }, + { + "epoch": 0.41486037150928773, + "grad_norm": 4.97648811340332, + "learning_rate": 1.4544297724183225e-05, + "loss": 1.1764, + "step": 33188 + }, + { + "epoch": 0.4148853721343034, + "grad_norm": 0.00461330683901906, + "learning_rate": 1.454352033085026e-05, + "loss": 0.2685, + "step": 33190 + }, + { + "epoch": 0.41491037275931897, + "grad_norm": 2.4234559535980225, + "learning_rate": 1.4542742902914491e-05, + "loss": 1.5184, + "step": 33192 + }, + { + "epoch": 0.4149353733843346, + "grad_norm": 2.7315776348114014, + "learning_rate": 1.4541965440381826e-05, + "loss": 1.1267, + "step": 33194 + }, + { + "epoch": 0.4149603740093502, + "grad_norm": 0.005308553110808134, + "learning_rate": 1.45411879432582e-05, + "loss": 0.7512, + "step": 33196 + }, + { + "epoch": 0.41498537463436586, + "grad_norm": 2.325305700302124, + "learning_rate": 1.454041041154953e-05, + "loss": 0.7612, + "step": 33198 + }, + { + "epoch": 0.4150103752593815, + "grad_norm": 2.208590030670166, + "learning_rate": 1.4539632845261737e-05, + "loss": 0.8756, + "step": 33200 + }, + { + "epoch": 0.4150353758843971, + "grad_norm": 8.170988082885742, + "learning_rate": 1.4538855244400738e-05, + "loss": 0.7613, + "step": 33202 + }, + { + "epoch": 0.41506037650941274, + "grad_norm": 3.357020378112793, + "learning_rate": 1.4538077608972465e-05, + "loss": 0.4095, + "step": 33204 + }, + { + "epoch": 0.41508537713442833, + "grad_norm": 3.6462762355804443, + "learning_rate": 1.453729993898283e-05, + "loss": 1.062, + "step": 33206 + }, + { + "epoch": 0.415110377759444, + "grad_norm": 0.015671364963054657, + "learning_rate": 1.453652223443776e-05, + "loss": 0.8716, + "step": 33208 + }, + { + "epoch": 0.41513537838445963, + "grad_norm": 2.612079620361328, + "learning_rate": 1.4535744495343181e-05, + "loss": 0.9914, + "step": 33210 + }, + { + "epoch": 0.4151603790094752, + "grad_norm": 4.322582244873047, + "learning_rate": 1.4534966721705013e-05, + "loss": 1.7616, + "step": 33212 + }, + { + "epoch": 0.41518537963449087, + "grad_norm": 1.463751196861267, + "learning_rate": 1.4534188913529177e-05, + "loss": 0.062, + "step": 33214 + }, + { + "epoch": 0.41521038025950646, + "grad_norm": 3.9364125728607178, + "learning_rate": 1.4533411070821601e-05, + "loss": 1.0023, + "step": 33216 + }, + { + "epoch": 0.4152353808845221, + "grad_norm": 0.01221516914665699, + "learning_rate": 1.453263319358821e-05, + "loss": 0.7684, + "step": 33218 + }, + { + "epoch": 0.41526038150953776, + "grad_norm": 5.3617167472839355, + "learning_rate": 1.4531855281834922e-05, + "loss": 1.253, + "step": 33220 + }, + { + "epoch": 0.41528538213455335, + "grad_norm": 5.0086565017700195, + "learning_rate": 1.4531077335567665e-05, + "loss": 1.6215, + "step": 33222 + }, + { + "epoch": 0.415310382759569, + "grad_norm": 0.6028192043304443, + "learning_rate": 1.4530299354792365e-05, + "loss": 0.0985, + "step": 33224 + }, + { + "epoch": 0.4153353833845846, + "grad_norm": 2.6981539726257324, + "learning_rate": 1.4529521339514947e-05, + "loss": 0.628, + "step": 33226 + }, + { + "epoch": 0.41536038400960024, + "grad_norm": 3.06489634513855, + "learning_rate": 1.4528743289741334e-05, + "loss": 0.137, + "step": 33228 + }, + { + "epoch": 0.4153853846346159, + "grad_norm": 0.03264880180358887, + "learning_rate": 1.4527965205477452e-05, + "loss": 0.3773, + "step": 33230 + }, + { + "epoch": 0.4154103852596315, + "grad_norm": 1.5097395181655884, + "learning_rate": 1.4527187086729226e-05, + "loss": 1.0259, + "step": 33232 + }, + { + "epoch": 0.4154353858846471, + "grad_norm": 6.191817283630371, + "learning_rate": 1.4526408933502584e-05, + "loss": 1.2691, + "step": 33234 + }, + { + "epoch": 0.4154603865096627, + "grad_norm": 5.2636847496032715, + "learning_rate": 1.4525630745803456e-05, + "loss": 0.5004, + "step": 33236 + }, + { + "epoch": 0.41548538713467836, + "grad_norm": 4.244625091552734, + "learning_rate": 1.4524852523637761e-05, + "loss": 0.7633, + "step": 33238 + }, + { + "epoch": 0.415510387759694, + "grad_norm": 1.7234665155410767, + "learning_rate": 1.4524074267011427e-05, + "loss": 0.5552, + "step": 33240 + }, + { + "epoch": 0.4155353883847096, + "grad_norm": 4.457982540130615, + "learning_rate": 1.4523295975930384e-05, + "loss": 0.898, + "step": 33242 + }, + { + "epoch": 0.41556038900972525, + "grad_norm": 3.6463348865509033, + "learning_rate": 1.452251765040056e-05, + "loss": 1.1308, + "step": 33244 + }, + { + "epoch": 0.41558538963474084, + "grad_norm": 0.00996578112244606, + "learning_rate": 1.452173929042788e-05, + "loss": 0.8855, + "step": 33246 + }, + { + "epoch": 0.4156103902597565, + "grad_norm": 5.045853137969971, + "learning_rate": 1.4520960896018275e-05, + "loss": 1.6502, + "step": 33248 + }, + { + "epoch": 0.41563539088477214, + "grad_norm": 3.6669676303863525, + "learning_rate": 1.452018246717767e-05, + "loss": 0.6309, + "step": 33250 + }, + { + "epoch": 0.41566039150978773, + "grad_norm": 0.001781684230081737, + "learning_rate": 1.4519404003911994e-05, + "loss": 1.0161, + "step": 33252 + }, + { + "epoch": 0.4156853921348034, + "grad_norm": 4.143650054931641, + "learning_rate": 1.4518625506227174e-05, + "loss": 0.9268, + "step": 33254 + }, + { + "epoch": 0.41571039275981897, + "grad_norm": 5.7544026374816895, + "learning_rate": 1.4517846974129146e-05, + "loss": 0.4924, + "step": 33256 + }, + { + "epoch": 0.4157353933848346, + "grad_norm": 3.593031167984009, + "learning_rate": 1.4517068407623831e-05, + "loss": 1.6872, + "step": 33258 + }, + { + "epoch": 0.41576039400985026, + "grad_norm": 3.4758107662200928, + "learning_rate": 1.4516289806717162e-05, + "loss": 0.692, + "step": 33260 + }, + { + "epoch": 0.41578539463486586, + "grad_norm": 4.275262355804443, + "learning_rate": 1.4515511171415071e-05, + "loss": 0.7865, + "step": 33262 + }, + { + "epoch": 0.4158103952598815, + "grad_norm": 2.918161630630493, + "learning_rate": 1.4514732501723482e-05, + "loss": 1.0473, + "step": 33264 + }, + { + "epoch": 0.4158353958848971, + "grad_norm": 0.015902338549494743, + "learning_rate": 1.4513953797648333e-05, + "loss": 0.7567, + "step": 33266 + }, + { + "epoch": 0.41586039650991274, + "grad_norm": 3.594552755355835, + "learning_rate": 1.4513175059195548e-05, + "loss": 0.9752, + "step": 33268 + }, + { + "epoch": 0.4158853971349284, + "grad_norm": 2.1109466552734375, + "learning_rate": 1.4512396286371058e-05, + "loss": 0.3698, + "step": 33270 + }, + { + "epoch": 0.415910397759944, + "grad_norm": 0.6730708479881287, + "learning_rate": 1.4511617479180797e-05, + "loss": 0.323, + "step": 33272 + }, + { + "epoch": 0.41593539838495963, + "grad_norm": 2.18746280670166, + "learning_rate": 1.4510838637630696e-05, + "loss": 0.488, + "step": 33274 + }, + { + "epoch": 0.4159603990099752, + "grad_norm": 6.627955436706543, + "learning_rate": 1.4510059761726687e-05, + "loss": 0.8017, + "step": 33276 + }, + { + "epoch": 0.41598539963499087, + "grad_norm": 10.540291786193848, + "learning_rate": 1.4509280851474699e-05, + "loss": 1.9194, + "step": 33278 + }, + { + "epoch": 0.4160104002600065, + "grad_norm": 2.7247314453125, + "learning_rate": 1.4508501906880666e-05, + "loss": 1.2103, + "step": 33280 + }, + { + "epoch": 0.4160354008850221, + "grad_norm": 4.238698959350586, + "learning_rate": 1.450772292795052e-05, + "loss": 0.793, + "step": 33282 + }, + { + "epoch": 0.41606040151003776, + "grad_norm": 0.8438801169395447, + "learning_rate": 1.4506943914690193e-05, + "loss": 0.7545, + "step": 33284 + }, + { + "epoch": 0.41608540213505335, + "grad_norm": 3.9019381999969482, + "learning_rate": 1.4506164867105619e-05, + "loss": 0.7854, + "step": 33286 + }, + { + "epoch": 0.416110402760069, + "grad_norm": 0.006035544443875551, + "learning_rate": 1.4505385785202731e-05, + "loss": 0.6833, + "step": 33288 + }, + { + "epoch": 0.41613540338508465, + "grad_norm": 2.1095163822174072, + "learning_rate": 1.4504606668987462e-05, + "loss": 0.1619, + "step": 33290 + }, + { + "epoch": 0.41616040401010024, + "grad_norm": 3.242082118988037, + "learning_rate": 1.4503827518465745e-05, + "loss": 0.953, + "step": 33292 + }, + { + "epoch": 0.4161854046351159, + "grad_norm": 3.9958043098449707, + "learning_rate": 1.4503048333643514e-05, + "loss": 0.7569, + "step": 33294 + }, + { + "epoch": 0.4162104052601315, + "grad_norm": 1.335479736328125, + "learning_rate": 1.4502269114526705e-05, + "loss": 0.2124, + "step": 33296 + }, + { + "epoch": 0.4162354058851471, + "grad_norm": 2.9705984592437744, + "learning_rate": 1.450148986112125e-05, + "loss": 1.5845, + "step": 33298 + }, + { + "epoch": 0.4162604065101628, + "grad_norm": 0.014305725693702698, + "learning_rate": 1.4500710573433086e-05, + "loss": 1.1571, + "step": 33300 + }, + { + "epoch": 0.41628540713517836, + "grad_norm": 2.977010488510132, + "learning_rate": 1.4499931251468149e-05, + "loss": 1.0282, + "step": 33302 + }, + { + "epoch": 0.416310407760194, + "grad_norm": 3.9954490661621094, + "learning_rate": 1.4499151895232369e-05, + "loss": 0.3756, + "step": 33304 + }, + { + "epoch": 0.4163354083852096, + "grad_norm": 5.56569242477417, + "learning_rate": 1.4498372504731683e-05, + "loss": 1.4351, + "step": 33306 + }, + { + "epoch": 0.41636040901022525, + "grad_norm": 2.869917869567871, + "learning_rate": 1.449759307997203e-05, + "loss": 0.5057, + "step": 33308 + }, + { + "epoch": 0.4163854096352409, + "grad_norm": 4.274308204650879, + "learning_rate": 1.4496813620959342e-05, + "loss": 1.7223, + "step": 33310 + }, + { + "epoch": 0.4164104102602565, + "grad_norm": 6.002233982086182, + "learning_rate": 1.4496034127699559e-05, + "loss": 2.6335, + "step": 33312 + }, + { + "epoch": 0.41643541088527214, + "grad_norm": 1.9204949140548706, + "learning_rate": 1.4495254600198616e-05, + "loss": 0.7307, + "step": 33314 + }, + { + "epoch": 0.41646041151028773, + "grad_norm": 2.1227900981903076, + "learning_rate": 1.4494475038462451e-05, + "loss": 0.3091, + "step": 33316 + }, + { + "epoch": 0.4164854121353034, + "grad_norm": 3.9615352153778076, + "learning_rate": 1.4493695442497e-05, + "loss": 0.8867, + "step": 33318 + }, + { + "epoch": 0.416510412760319, + "grad_norm": 3.3251824378967285, + "learning_rate": 1.4492915812308197e-05, + "loss": 0.8015, + "step": 33320 + }, + { + "epoch": 0.4165354133853346, + "grad_norm": 4.449217796325684, + "learning_rate": 1.4492136147901983e-05, + "loss": 0.324, + "step": 33322 + }, + { + "epoch": 0.41656041401035027, + "grad_norm": 0.005112682469189167, + "learning_rate": 1.4491356449284294e-05, + "loss": 0.0007, + "step": 33324 + }, + { + "epoch": 0.41658541463536586, + "grad_norm": 1.7468127012252808, + "learning_rate": 1.4490576716461073e-05, + "loss": 0.5211, + "step": 33326 + }, + { + "epoch": 0.4166104152603815, + "grad_norm": 2.055335283279419, + "learning_rate": 1.4489796949438255e-05, + "loss": 1.4615, + "step": 33328 + }, + { + "epoch": 0.41663541588539715, + "grad_norm": 0.5554137229919434, + "learning_rate": 1.4489017148221778e-05, + "loss": 0.655, + "step": 33330 + }, + { + "epoch": 0.41666041651041275, + "grad_norm": 0.6414194107055664, + "learning_rate": 1.448823731281758e-05, + "loss": 0.4583, + "step": 33332 + }, + { + "epoch": 0.4166854171354284, + "grad_norm": 8.790836334228516, + "learning_rate": 1.4487457443231598e-05, + "loss": 1.4205, + "step": 33334 + }, + { + "epoch": 0.416710417760444, + "grad_norm": 4.4169464111328125, + "learning_rate": 1.448667753946978e-05, + "loss": 0.4903, + "step": 33336 + }, + { + "epoch": 0.41673541838545963, + "grad_norm": 5.104755878448486, + "learning_rate": 1.4485897601538057e-05, + "loss": 1.8629, + "step": 33338 + }, + { + "epoch": 0.4167604190104753, + "grad_norm": 3.5318796634674072, + "learning_rate": 1.4485117629442373e-05, + "loss": 1.488, + "step": 33340 + }, + { + "epoch": 0.4167854196354909, + "grad_norm": 0.008625893853604794, + "learning_rate": 1.4484337623188669e-05, + "loss": 0.0002, + "step": 33342 + }, + { + "epoch": 0.4168104202605065, + "grad_norm": 4.901268482208252, + "learning_rate": 1.4483557582782883e-05, + "loss": 1.0917, + "step": 33344 + }, + { + "epoch": 0.4168354208855221, + "grad_norm": 1.9840582609176636, + "learning_rate": 1.4482777508230956e-05, + "loss": 0.3901, + "step": 33346 + }, + { + "epoch": 0.41686042151053776, + "grad_norm": 2.3274714946746826, + "learning_rate": 1.4481997399538829e-05, + "loss": 0.1629, + "step": 33348 + }, + { + "epoch": 0.4168854221355534, + "grad_norm": 2.8146090507507324, + "learning_rate": 1.4481217256712445e-05, + "loss": 0.6481, + "step": 33350 + }, + { + "epoch": 0.416910422760569, + "grad_norm": 0.004284955095499754, + "learning_rate": 1.4480437079757743e-05, + "loss": 0.0001, + "step": 33352 + }, + { + "epoch": 0.41693542338558465, + "grad_norm": 2.936096668243408, + "learning_rate": 1.4479656868680667e-05, + "loss": 1.1387, + "step": 33354 + }, + { + "epoch": 0.41696042401060024, + "grad_norm": 4.387167453765869, + "learning_rate": 1.4478876623487157e-05, + "loss": 0.5984, + "step": 33356 + }, + { + "epoch": 0.4169854246356159, + "grad_norm": 8.924456596374512, + "learning_rate": 1.4478096344183156e-05, + "loss": 1.848, + "step": 33358 + }, + { + "epoch": 0.41701042526063153, + "grad_norm": 2.3576273918151855, + "learning_rate": 1.4477316030774606e-05, + "loss": 1.0847, + "step": 33360 + }, + { + "epoch": 0.4170354258856471, + "grad_norm": 0.003566279774531722, + "learning_rate": 1.447653568326745e-05, + "loss": 0.0001, + "step": 33362 + }, + { + "epoch": 0.4170604265106628, + "grad_norm": 4.383301258087158, + "learning_rate": 1.4475755301667631e-05, + "loss": 1.8249, + "step": 33364 + }, + { + "epoch": 0.41708542713567837, + "grad_norm": 0.004008009098470211, + "learning_rate": 1.4474974885981094e-05, + "loss": 0.0919, + "step": 33366 + }, + { + "epoch": 0.417110427760694, + "grad_norm": 3.0980517864227295, + "learning_rate": 1.4474194436213782e-05, + "loss": 1.1407, + "step": 33368 + }, + { + "epoch": 0.41713542838570966, + "grad_norm": 2.3022007942199707, + "learning_rate": 1.4473413952371635e-05, + "loss": 0.5873, + "step": 33370 + }, + { + "epoch": 0.41716042901072525, + "grad_norm": 2.8890388011932373, + "learning_rate": 1.4472633434460601e-05, + "loss": 1.3103, + "step": 33372 + }, + { + "epoch": 0.4171854296357409, + "grad_norm": 1.3034796714782715, + "learning_rate": 1.4471852882486626e-05, + "loss": 0.053, + "step": 33374 + }, + { + "epoch": 0.4172104302607565, + "grad_norm": 3.242403745651245, + "learning_rate": 1.4471072296455648e-05, + "loss": 1.8703, + "step": 33376 + }, + { + "epoch": 0.41723543088577214, + "grad_norm": 1.521238923072815, + "learning_rate": 1.4470291676373616e-05, + "loss": 0.6018, + "step": 33378 + }, + { + "epoch": 0.4172604315107878, + "grad_norm": 2.9661006927490234, + "learning_rate": 1.4469511022246478e-05, + "loss": 0.3251, + "step": 33380 + }, + { + "epoch": 0.4172854321358034, + "grad_norm": 1.1375986337661743, + "learning_rate": 1.4468730334080174e-05, + "loss": 0.6805, + "step": 33382 + }, + { + "epoch": 0.41731043276081903, + "grad_norm": 4.757513046264648, + "learning_rate": 1.446794961188065e-05, + "loss": 0.8467, + "step": 33384 + }, + { + "epoch": 0.4173354333858346, + "grad_norm": 0.013189499266445637, + "learning_rate": 1.4467168855653856e-05, + "loss": 0.0024, + "step": 33386 + }, + { + "epoch": 0.41736043401085027, + "grad_norm": 3.5785202980041504, + "learning_rate": 1.4466388065405734e-05, + "loss": 1.3724, + "step": 33388 + }, + { + "epoch": 0.4173854346358659, + "grad_norm": 2.4942867755889893, + "learning_rate": 1.4465607241142232e-05, + "loss": 2.3519, + "step": 33390 + }, + { + "epoch": 0.4174104352608815, + "grad_norm": 3.5647313594818115, + "learning_rate": 1.4464826382869299e-05, + "loss": 0.8825, + "step": 33392 + }, + { + "epoch": 0.41743543588589715, + "grad_norm": 2.238751173019409, + "learning_rate": 1.446404549059288e-05, + "loss": 0.801, + "step": 33394 + }, + { + "epoch": 0.41746043651091275, + "grad_norm": 4.342466354370117, + "learning_rate": 1.4463264564318917e-05, + "loss": 0.98, + "step": 33396 + }, + { + "epoch": 0.4174854371359284, + "grad_norm": 0.008130267262458801, + "learning_rate": 1.4462483604053365e-05, + "loss": 0.1239, + "step": 33398 + }, + { + "epoch": 0.41751043776094404, + "grad_norm": 10.37772274017334, + "learning_rate": 1.4461702609802173e-05, + "loss": 2.897, + "step": 33400 + }, + { + "epoch": 0.41753543838595963, + "grad_norm": 2.639780282974243, + "learning_rate": 1.4460921581571282e-05, + "loss": 0.2272, + "step": 33402 + }, + { + "epoch": 0.4175604390109753, + "grad_norm": 3.3649957180023193, + "learning_rate": 1.4460140519366639e-05, + "loss": 1.0241, + "step": 33404 + }, + { + "epoch": 0.4175854396359909, + "grad_norm": 2.8519694805145264, + "learning_rate": 1.4459359423194203e-05, + "loss": 1.2666, + "step": 33406 + }, + { + "epoch": 0.4176104402610065, + "grad_norm": 5.244527816772461, + "learning_rate": 1.4458578293059912e-05, + "loss": 1.7171, + "step": 33408 + }, + { + "epoch": 0.41763544088602217, + "grad_norm": 2.3970065116882324, + "learning_rate": 1.445779712896972e-05, + "loss": 1.6441, + "step": 33410 + }, + { + "epoch": 0.41766044151103776, + "grad_norm": 0.006432530004531145, + "learning_rate": 1.4457015930929574e-05, + "loss": 1.6354, + "step": 33412 + }, + { + "epoch": 0.4176854421360534, + "grad_norm": 4.946732044219971, + "learning_rate": 1.4456234698945429e-05, + "loss": 1.7484, + "step": 33414 + }, + { + "epoch": 0.417710442761069, + "grad_norm": 0.003290025983005762, + "learning_rate": 1.4455453433023228e-05, + "loss": 0.1602, + "step": 33416 + }, + { + "epoch": 0.41773544338608465, + "grad_norm": 4.9913201332092285, + "learning_rate": 1.4454672133168923e-05, + "loss": 0.4976, + "step": 33418 + }, + { + "epoch": 0.4177604440111003, + "grad_norm": 4.347131729125977, + "learning_rate": 1.4453890799388469e-05, + "loss": 0.3192, + "step": 33420 + }, + { + "epoch": 0.4177854446361159, + "grad_norm": 1.8810912370681763, + "learning_rate": 1.4453109431687806e-05, + "loss": 0.2679, + "step": 33422 + }, + { + "epoch": 0.41781044526113154, + "grad_norm": 3.3040950298309326, + "learning_rate": 1.4452328030072896e-05, + "loss": 1.8521, + "step": 33424 + }, + { + "epoch": 0.4178354458861471, + "grad_norm": 2.516295909881592, + "learning_rate": 1.4451546594549686e-05, + "loss": 0.7917, + "step": 33426 + }, + { + "epoch": 0.4178604465111628, + "grad_norm": 2.921502113342285, + "learning_rate": 1.4450765125124124e-05, + "loss": 0.2858, + "step": 33428 + }, + { + "epoch": 0.4178854471361784, + "grad_norm": 4.383696556091309, + "learning_rate": 1.4449983621802166e-05, + "loss": 0.8342, + "step": 33430 + }, + { + "epoch": 0.417910447761194, + "grad_norm": 3.133904218673706, + "learning_rate": 1.4449202084589761e-05, + "loss": 1.3766, + "step": 33432 + }, + { + "epoch": 0.41793544838620966, + "grad_norm": 3.6850006580352783, + "learning_rate": 1.444842051349286e-05, + "loss": 1.0084, + "step": 33434 + }, + { + "epoch": 0.41796044901122525, + "grad_norm": 4.0044403076171875, + "learning_rate": 1.444763890851742e-05, + "loss": 0.7594, + "step": 33436 + }, + { + "epoch": 0.4179854496362409, + "grad_norm": 3.331852436065674, + "learning_rate": 1.444685726966939e-05, + "loss": 1.8673, + "step": 33438 + }, + { + "epoch": 0.41801045026125655, + "grad_norm": 3.2133266925811768, + "learning_rate": 1.4446075596954727e-05, + "loss": 0.2859, + "step": 33440 + }, + { + "epoch": 0.41803545088627214, + "grad_norm": 3.25038743019104, + "learning_rate": 1.4445293890379377e-05, + "loss": 1.9955, + "step": 33442 + }, + { + "epoch": 0.4180604515112878, + "grad_norm": 0.008376036770641804, + "learning_rate": 1.4444512149949299e-05, + "loss": 0.0002, + "step": 33444 + }, + { + "epoch": 0.4180854521363034, + "grad_norm": 0.008930152282118797, + "learning_rate": 1.4443730375670444e-05, + "loss": 1.0045, + "step": 33446 + }, + { + "epoch": 0.41811045276131903, + "grad_norm": 3.5231151580810547, + "learning_rate": 1.444294856754877e-05, + "loss": 0.886, + "step": 33448 + }, + { + "epoch": 0.4181354533863347, + "grad_norm": 0.006251825951039791, + "learning_rate": 1.4442166725590223e-05, + "loss": 0.5659, + "step": 33450 + }, + { + "epoch": 0.41816045401135027, + "grad_norm": 4.9648308753967285, + "learning_rate": 1.4441384849800768e-05, + "loss": 0.418, + "step": 33452 + }, + { + "epoch": 0.4181854546363659, + "grad_norm": 3.474614381790161, + "learning_rate": 1.444060294018635e-05, + "loss": 1.616, + "step": 33454 + }, + { + "epoch": 0.4182104552613815, + "grad_norm": 0.23901337385177612, + "learning_rate": 1.4439820996752929e-05, + "loss": 0.0333, + "step": 33456 + }, + { + "epoch": 0.41823545588639716, + "grad_norm": 0.30298393964767456, + "learning_rate": 1.443903901950646e-05, + "loss": 0.5268, + "step": 33458 + }, + { + "epoch": 0.4182604565114128, + "grad_norm": 2.811629295349121, + "learning_rate": 1.4438257008452897e-05, + "loss": 0.2412, + "step": 33460 + }, + { + "epoch": 0.4182854571364284, + "grad_norm": 0.800353467464447, + "learning_rate": 1.4437474963598195e-05, + "loss": 0.0993, + "step": 33462 + }, + { + "epoch": 0.41831045776144404, + "grad_norm": 5.288976192474365, + "learning_rate": 1.4436692884948312e-05, + "loss": 0.9358, + "step": 33464 + }, + { + "epoch": 0.41833545838645964, + "grad_norm": 4.356545925140381, + "learning_rate": 1.4435910772509206e-05, + "loss": 1.8211, + "step": 33466 + }, + { + "epoch": 0.4183604590114753, + "grad_norm": 8.303153038024902, + "learning_rate": 1.4435128626286827e-05, + "loss": 0.7586, + "step": 33468 + }, + { + "epoch": 0.41838545963649093, + "grad_norm": 6.0525360107421875, + "learning_rate": 1.4434346446287138e-05, + "loss": 1.8598, + "step": 33470 + }, + { + "epoch": 0.4184104602615065, + "grad_norm": 3.7524940967559814, + "learning_rate": 1.443356423251609e-05, + "loss": 1.9508, + "step": 33472 + }, + { + "epoch": 0.41843546088652217, + "grad_norm": 3.4612739086151123, + "learning_rate": 1.4432781984979647e-05, + "loss": 0.6062, + "step": 33474 + }, + { + "epoch": 0.41846046151153776, + "grad_norm": 6.456111431121826, + "learning_rate": 1.4431999703683763e-05, + "loss": 0.7321, + "step": 33476 + }, + { + "epoch": 0.4184854621365534, + "grad_norm": 1.6258625984191895, + "learning_rate": 1.4431217388634398e-05, + "loss": 0.2642, + "step": 33478 + }, + { + "epoch": 0.41851046276156906, + "grad_norm": 2.573133707046509, + "learning_rate": 1.4430435039837505e-05, + "loss": 1.4468, + "step": 33480 + }, + { + "epoch": 0.41853546338658465, + "grad_norm": 5.3059401512146, + "learning_rate": 1.4429652657299045e-05, + "loss": 0.7644, + "step": 33482 + }, + { + "epoch": 0.4185604640116003, + "grad_norm": 1.1642777919769287, + "learning_rate": 1.4428870241024976e-05, + "loss": 0.0663, + "step": 33484 + }, + { + "epoch": 0.4185854646366159, + "grad_norm": 2.4439666271209717, + "learning_rate": 1.4428087791021262e-05, + "loss": 1.2796, + "step": 33486 + }, + { + "epoch": 0.41861046526163154, + "grad_norm": 2.902353525161743, + "learning_rate": 1.4427305307293853e-05, + "loss": 1.0999, + "step": 33488 + }, + { + "epoch": 0.4186354658866472, + "grad_norm": 3.0676474571228027, + "learning_rate": 1.4426522789848716e-05, + "loss": 0.9346, + "step": 33490 + }, + { + "epoch": 0.4186604665116628, + "grad_norm": 3.4854538440704346, + "learning_rate": 1.4425740238691805e-05, + "loss": 1.4715, + "step": 33492 + }, + { + "epoch": 0.4186854671366784, + "grad_norm": 7.2542805671691895, + "learning_rate": 1.4424957653829083e-05, + "loss": 0.1979, + "step": 33494 + }, + { + "epoch": 0.418710467761694, + "grad_norm": 3.324326753616333, + "learning_rate": 1.442417503526651e-05, + "loss": 1.2678, + "step": 33496 + }, + { + "epoch": 0.41873546838670966, + "grad_norm": 2.394929885864258, + "learning_rate": 1.4423392383010046e-05, + "loss": 0.392, + "step": 33498 + }, + { + "epoch": 0.4187604690117253, + "grad_norm": 3.7861645221710205, + "learning_rate": 1.442260969706565e-05, + "loss": 0.6123, + "step": 33500 + }, + { + "epoch": 0.4187854696367409, + "grad_norm": 4.646618843078613, + "learning_rate": 1.4421826977439284e-05, + "loss": 1.5851, + "step": 33502 + }, + { + "epoch": 0.41881047026175655, + "grad_norm": 9.144857406616211, + "learning_rate": 1.442104422413691e-05, + "loss": 2.0897, + "step": 33504 + }, + { + "epoch": 0.41883547088677214, + "grad_norm": 7.570871353149414, + "learning_rate": 1.4420261437164488e-05, + "loss": 1.872, + "step": 33506 + }, + { + "epoch": 0.4188604715117878, + "grad_norm": 0.004516165237873793, + "learning_rate": 1.441947861652798e-05, + "loss": 1.075, + "step": 33508 + }, + { + "epoch": 0.41888547213680344, + "grad_norm": 2.1815264225006104, + "learning_rate": 1.4418695762233346e-05, + "loss": 0.2143, + "step": 33510 + }, + { + "epoch": 0.41891047276181903, + "grad_norm": 9.781235694885254, + "learning_rate": 1.441791287428655e-05, + "loss": 1.611, + "step": 33512 + }, + { + "epoch": 0.4189354733868347, + "grad_norm": 4.33171272277832, + "learning_rate": 1.4417129952693555e-05, + "loss": 0.7788, + "step": 33514 + }, + { + "epoch": 0.41896047401185027, + "grad_norm": 0.005145803559571505, + "learning_rate": 1.4416346997460324e-05, + "loss": 0.6441, + "step": 33516 + }, + { + "epoch": 0.4189854746368659, + "grad_norm": 10.086992263793945, + "learning_rate": 1.4415564008592822e-05, + "loss": 1.769, + "step": 33518 + }, + { + "epoch": 0.41901047526188157, + "grad_norm": 4.090620517730713, + "learning_rate": 1.4414780986097004e-05, + "loss": 1.392, + "step": 33520 + }, + { + "epoch": 0.41903547588689716, + "grad_norm": 4.009347915649414, + "learning_rate": 1.441399792997884e-05, + "loss": 0.901, + "step": 33522 + }, + { + "epoch": 0.4190604765119128, + "grad_norm": 6.244196891784668, + "learning_rate": 1.441321484024429e-05, + "loss": 0.7068, + "step": 33524 + }, + { + "epoch": 0.4190854771369284, + "grad_norm": 3.259742021560669, + "learning_rate": 1.4412431716899323e-05, + "loss": 0.7743, + "step": 33526 + }, + { + "epoch": 0.41911047776194404, + "grad_norm": 0.21169748902320862, + "learning_rate": 1.4411648559949897e-05, + "loss": 1.2706, + "step": 33528 + }, + { + "epoch": 0.4191354783869597, + "grad_norm": 0.0036499232519418, + "learning_rate": 1.4410865369401984e-05, + "loss": 0.0855, + "step": 33530 + }, + { + "epoch": 0.4191604790119753, + "grad_norm": 0.8187989592552185, + "learning_rate": 1.441008214526154e-05, + "loss": 0.0475, + "step": 33532 + }, + { + "epoch": 0.41918547963699093, + "grad_norm": 2.1811296939849854, + "learning_rate": 1.4409298887534533e-05, + "loss": 0.2378, + "step": 33534 + }, + { + "epoch": 0.4192104802620065, + "grad_norm": 1.9678212404251099, + "learning_rate": 1.440851559622693e-05, + "loss": 0.1, + "step": 33536 + }, + { + "epoch": 0.41923548088702217, + "grad_norm": 3.457305431365967, + "learning_rate": 1.44077322713447e-05, + "loss": 0.6663, + "step": 33538 + }, + { + "epoch": 0.4192604815120378, + "grad_norm": 1.7815245389938354, + "learning_rate": 1.4406948912893798e-05, + "loss": 0.0821, + "step": 33540 + }, + { + "epoch": 0.4192854821370534, + "grad_norm": 0.7087584137916565, + "learning_rate": 1.4406165520880199e-05, + "loss": 0.0144, + "step": 33542 + }, + { + "epoch": 0.41931048276206906, + "grad_norm": 0.002401139587163925, + "learning_rate": 1.440538209530987e-05, + "loss": 0.3508, + "step": 33544 + }, + { + "epoch": 0.41933548338708465, + "grad_norm": 2.4740593433380127, + "learning_rate": 1.4404598636188771e-05, + "loss": 0.3819, + "step": 33546 + }, + { + "epoch": 0.4193604840121003, + "grad_norm": 0.12771092355251312, + "learning_rate": 1.440381514352287e-05, + "loss": 0.2231, + "step": 33548 + }, + { + "epoch": 0.41938548463711595, + "grad_norm": 4.311071395874023, + "learning_rate": 1.4403031617318138e-05, + "loss": 0.915, + "step": 33550 + }, + { + "epoch": 0.41941048526213154, + "grad_norm": 6.012479305267334, + "learning_rate": 1.440224805758054e-05, + "loss": 1.4773, + "step": 33552 + }, + { + "epoch": 0.4194354858871472, + "grad_norm": 0.002105466788634658, + "learning_rate": 1.4401464464316042e-05, + "loss": 0.0001, + "step": 33554 + }, + { + "epoch": 0.4194604865121628, + "grad_norm": 1.6771705150604248, + "learning_rate": 1.4400680837530614e-05, + "loss": 0.6387, + "step": 33556 + }, + { + "epoch": 0.4194854871371784, + "grad_norm": 0.4787898063659668, + "learning_rate": 1.4399897177230222e-05, + "loss": 0.0808, + "step": 33558 + }, + { + "epoch": 0.4195104877621941, + "grad_norm": 0.003317511873319745, + "learning_rate": 1.4399113483420835e-05, + "loss": 0.0329, + "step": 33560 + }, + { + "epoch": 0.41953548838720967, + "grad_norm": 3.9209766387939453, + "learning_rate": 1.4398329756108421e-05, + "loss": 1.29, + "step": 33562 + }, + { + "epoch": 0.4195604890122253, + "grad_norm": 2.3794689178466797, + "learning_rate": 1.4397545995298952e-05, + "loss": 0.2995, + "step": 33564 + }, + { + "epoch": 0.4195854896372409, + "grad_norm": 2.6012072563171387, + "learning_rate": 1.4396762200998393e-05, + "loss": 2.0495, + "step": 33566 + }, + { + "epoch": 0.41961049026225655, + "grad_norm": 2.538081645965576, + "learning_rate": 1.4395978373212715e-05, + "loss": 1.5073, + "step": 33568 + }, + { + "epoch": 0.4196354908872722, + "grad_norm": 1.9326531887054443, + "learning_rate": 1.439519451194789e-05, + "loss": 0.9361, + "step": 33570 + }, + { + "epoch": 0.4196604915122878, + "grad_norm": 3.3330774307250977, + "learning_rate": 1.4394410617209884e-05, + "loss": 0.617, + "step": 33572 + }, + { + "epoch": 0.41968549213730344, + "grad_norm": 0.003138502361252904, + "learning_rate": 1.4393626689004663e-05, + "loss": 0.8458, + "step": 33574 + }, + { + "epoch": 0.41971049276231903, + "grad_norm": 4.72194766998291, + "learning_rate": 1.439284272733821e-05, + "loss": 1.007, + "step": 33576 + }, + { + "epoch": 0.4197354933873347, + "grad_norm": 2.485299587249756, + "learning_rate": 1.4392058732216483e-05, + "loss": 1.2928, + "step": 33578 + }, + { + "epoch": 0.4197604940123503, + "grad_norm": 3.5115396976470947, + "learning_rate": 1.439127470364546e-05, + "loss": 0.6655, + "step": 33580 + }, + { + "epoch": 0.4197854946373659, + "grad_norm": 2.7270617485046387, + "learning_rate": 1.4390490641631111e-05, + "loss": 1.1782, + "step": 33582 + }, + { + "epoch": 0.41981049526238157, + "grad_norm": 2.056684732437134, + "learning_rate": 1.4389706546179403e-05, + "loss": 0.503, + "step": 33584 + }, + { + "epoch": 0.41983549588739716, + "grad_norm": 2.2381484508514404, + "learning_rate": 1.4388922417296311e-05, + "loss": 1.003, + "step": 33586 + }, + { + "epoch": 0.4198604965124128, + "grad_norm": 2.07275390625, + "learning_rate": 1.4388138254987806e-05, + "loss": 0.8581, + "step": 33588 + }, + { + "epoch": 0.41988549713742845, + "grad_norm": 5.218395233154297, + "learning_rate": 1.4387354059259866e-05, + "loss": 0.4781, + "step": 33590 + }, + { + "epoch": 0.41991049776244405, + "grad_norm": 1.3974934816360474, + "learning_rate": 1.4386569830118454e-05, + "loss": 0.0717, + "step": 33592 + }, + { + "epoch": 0.4199354983874597, + "grad_norm": 2.0375330448150635, + "learning_rate": 1.4385785567569545e-05, + "loss": 0.7962, + "step": 33594 + }, + { + "epoch": 0.4199604990124753, + "grad_norm": 3.9296765327453613, + "learning_rate": 1.4385001271619117e-05, + "loss": 0.7059, + "step": 33596 + }, + { + "epoch": 0.41998549963749093, + "grad_norm": 3.4522500038146973, + "learning_rate": 1.4384216942273138e-05, + "loss": 1.0209, + "step": 33598 + }, + { + "epoch": 0.4200105002625066, + "grad_norm": 5.045698642730713, + "learning_rate": 1.4383432579537577e-05, + "loss": 0.7998, + "step": 33600 + }, + { + "epoch": 0.4200355008875222, + "grad_norm": 3.254748821258545, + "learning_rate": 1.4382648183418422e-05, + "loss": 1.0403, + "step": 33602 + }, + { + "epoch": 0.4200605015125378, + "grad_norm": 2.8931446075439453, + "learning_rate": 1.4381863753921635e-05, + "loss": 0.2186, + "step": 33604 + }, + { + "epoch": 0.4200855021375534, + "grad_norm": 2.4196605682373047, + "learning_rate": 1.438107929105319e-05, + "loss": 0.7379, + "step": 33606 + }, + { + "epoch": 0.42011050276256906, + "grad_norm": 5.229172229766846, + "learning_rate": 1.438029479481907e-05, + "loss": 1.6692, + "step": 33608 + }, + { + "epoch": 0.4201355033875847, + "grad_norm": 3.3526298999786377, + "learning_rate": 1.4379510265225243e-05, + "loss": 0.9176, + "step": 33610 + }, + { + "epoch": 0.4201605040126003, + "grad_norm": 0.3404153883457184, + "learning_rate": 1.437872570227768e-05, + "loss": 0.547, + "step": 33612 + }, + { + "epoch": 0.42018550463761595, + "grad_norm": 3.413783311843872, + "learning_rate": 1.4377941105982367e-05, + "loss": 1.9407, + "step": 33614 + }, + { + "epoch": 0.42021050526263154, + "grad_norm": 1.0563015937805176, + "learning_rate": 1.4377156476345273e-05, + "loss": 0.9647, + "step": 33616 + }, + { + "epoch": 0.4202355058876472, + "grad_norm": 5.821876049041748, + "learning_rate": 1.4376371813372372e-05, + "loss": 1.8442, + "step": 33618 + }, + { + "epoch": 0.42026050651266283, + "grad_norm": 0.6153280138969421, + "learning_rate": 1.4375587117069643e-05, + "loss": 0.8901, + "step": 33620 + }, + { + "epoch": 0.4202855071376784, + "grad_norm": 6.971651554107666, + "learning_rate": 1.4374802387443064e-05, + "loss": 1.7332, + "step": 33622 + }, + { + "epoch": 0.4203105077626941, + "grad_norm": 0.00296970852650702, + "learning_rate": 1.4374017624498605e-05, + "loss": 0.1849, + "step": 33624 + }, + { + "epoch": 0.42033550838770967, + "grad_norm": 4.888379096984863, + "learning_rate": 1.4373232828242245e-05, + "loss": 0.732, + "step": 33626 + }, + { + "epoch": 0.4203605090127253, + "grad_norm": 0.01792803965508938, + "learning_rate": 1.4372447998679968e-05, + "loss": 0.6138, + "step": 33628 + }, + { + "epoch": 0.42038550963774096, + "grad_norm": 2.6115331649780273, + "learning_rate": 1.4371663135817742e-05, + "loss": 1.2547, + "step": 33630 + }, + { + "epoch": 0.42041051026275655, + "grad_norm": 1.4648759365081787, + "learning_rate": 1.4370878239661548e-05, + "loss": 0.6588, + "step": 33632 + }, + { + "epoch": 0.4204355108877722, + "grad_norm": 3.693084478378296, + "learning_rate": 1.4370093310217364e-05, + "loss": 0.4872, + "step": 33634 + }, + { + "epoch": 0.4204605115127878, + "grad_norm": 4.687939643859863, + "learning_rate": 1.4369308347491164e-05, + "loss": 1.3542, + "step": 33636 + }, + { + "epoch": 0.42048551213780344, + "grad_norm": 2.131683826446533, + "learning_rate": 1.4368523351488931e-05, + "loss": 1.093, + "step": 33638 + }, + { + "epoch": 0.4205105127628191, + "grad_norm": 3.587684392929077, + "learning_rate": 1.4367738322216644e-05, + "loss": 0.8898, + "step": 33640 + }, + { + "epoch": 0.4205355133878347, + "grad_norm": 1.815023422241211, + "learning_rate": 1.436695325968028e-05, + "loss": 0.2924, + "step": 33642 + }, + { + "epoch": 0.42056051401285033, + "grad_norm": 3.5454273223876953, + "learning_rate": 1.4366168163885816e-05, + "loss": 1.867, + "step": 33644 + }, + { + "epoch": 0.4205855146378659, + "grad_norm": 4.027634620666504, + "learning_rate": 1.4365383034839232e-05, + "loss": 0.7373, + "step": 33646 + }, + { + "epoch": 0.42061051526288157, + "grad_norm": 4.196798324584961, + "learning_rate": 1.4364597872546511e-05, + "loss": 1.4988, + "step": 33648 + }, + { + "epoch": 0.4206355158878972, + "grad_norm": 4.033961772918701, + "learning_rate": 1.4363812677013627e-05, + "loss": 2.106, + "step": 33650 + }, + { + "epoch": 0.4206605165129128, + "grad_norm": 3.2986254692077637, + "learning_rate": 1.4363027448246563e-05, + "loss": 1.9514, + "step": 33652 + }, + { + "epoch": 0.42068551713792846, + "grad_norm": 3.4860963821411133, + "learning_rate": 1.4362242186251301e-05, + "loss": 1.0184, + "step": 33654 + }, + { + "epoch": 0.42071051776294405, + "grad_norm": 1.1379269361495972, + "learning_rate": 1.4361456891033818e-05, + "loss": 0.5847, + "step": 33656 + }, + { + "epoch": 0.4207355183879597, + "grad_norm": 4.431931018829346, + "learning_rate": 1.4360671562600095e-05, + "loss": 1.7863, + "step": 33658 + }, + { + "epoch": 0.42076051901297534, + "grad_norm": 0.24939769506454468, + "learning_rate": 1.4359886200956115e-05, + "loss": 0.5307, + "step": 33660 + }, + { + "epoch": 0.42078551963799093, + "grad_norm": 2.1578640937805176, + "learning_rate": 1.4359100806107857e-05, + "loss": 0.5286, + "step": 33662 + }, + { + "epoch": 0.4208105202630066, + "grad_norm": 5.212804317474365, + "learning_rate": 1.4358315378061305e-05, + "loss": 0.9818, + "step": 33664 + }, + { + "epoch": 0.4208355208880222, + "grad_norm": 4.184256076812744, + "learning_rate": 1.4357529916822438e-05, + "loss": 0.7914, + "step": 33666 + }, + { + "epoch": 0.4208605215130378, + "grad_norm": 0.4479660093784332, + "learning_rate": 1.4356744422397245e-05, + "loss": 0.314, + "step": 33668 + }, + { + "epoch": 0.42088552213805347, + "grad_norm": 3.7209436893463135, + "learning_rate": 1.4355958894791697e-05, + "loss": 1.2097, + "step": 33670 + }, + { + "epoch": 0.42091052276306906, + "grad_norm": 4.3483567237854, + "learning_rate": 1.4355173334011783e-05, + "loss": 1.1273, + "step": 33672 + }, + { + "epoch": 0.4209355233880847, + "grad_norm": 5.549270153045654, + "learning_rate": 1.4354387740063484e-05, + "loss": 1.408, + "step": 33674 + }, + { + "epoch": 0.4209605240131003, + "grad_norm": 2.334463357925415, + "learning_rate": 1.4353602112952785e-05, + "loss": 0.5476, + "step": 33676 + }, + { + "epoch": 0.42098552463811595, + "grad_norm": 3.8486146926879883, + "learning_rate": 1.4352816452685667e-05, + "loss": 0.5635, + "step": 33678 + }, + { + "epoch": 0.4210105252631316, + "grad_norm": 3.756361722946167, + "learning_rate": 1.4352030759268116e-05, + "loss": 1.9335, + "step": 33680 + }, + { + "epoch": 0.4210355258881472, + "grad_norm": 5.648494720458984, + "learning_rate": 1.4351245032706116e-05, + "loss": 1.4303, + "step": 33682 + }, + { + "epoch": 0.42106052651316284, + "grad_norm": 5.211223125457764, + "learning_rate": 1.4350459273005646e-05, + "loss": 1.3587, + "step": 33684 + }, + { + "epoch": 0.42108552713817843, + "grad_norm": 4.4910454750061035, + "learning_rate": 1.4349673480172692e-05, + "loss": 0.8574, + "step": 33686 + }, + { + "epoch": 0.4211105277631941, + "grad_norm": 0.0017424608813598752, + "learning_rate": 1.434888765421324e-05, + "loss": 0.1557, + "step": 33688 + }, + { + "epoch": 0.4211355283882097, + "grad_norm": 2.3877031803131104, + "learning_rate": 1.4348101795133279e-05, + "loss": 0.3094, + "step": 33690 + }, + { + "epoch": 0.4211605290132253, + "grad_norm": 3.6561405658721924, + "learning_rate": 1.4347315902938785e-05, + "loss": 0.7014, + "step": 33692 + }, + { + "epoch": 0.42118552963824096, + "grad_norm": 3.067841053009033, + "learning_rate": 1.4346529977635753e-05, + "loss": 1.7805, + "step": 33694 + }, + { + "epoch": 0.42121053026325656, + "grad_norm": 3.9695160388946533, + "learning_rate": 1.434574401923016e-05, + "loss": 1.0787, + "step": 33696 + }, + { + "epoch": 0.4212355308882722, + "grad_norm": 1.130725383758545, + "learning_rate": 1.4344958027727995e-05, + "loss": 0.1446, + "step": 33698 + }, + { + "epoch": 0.42126053151328785, + "grad_norm": 13.165431022644043, + "learning_rate": 1.4344172003135244e-05, + "loss": 1.2942, + "step": 33700 + }, + { + "epoch": 0.42128553213830344, + "grad_norm": 1.2493860721588135, + "learning_rate": 1.4343385945457892e-05, + "loss": 0.4421, + "step": 33702 + }, + { + "epoch": 0.4213105327633191, + "grad_norm": 3.464020013809204, + "learning_rate": 1.434259985470193e-05, + "loss": 1.3414, + "step": 33704 + }, + { + "epoch": 0.4213355333883347, + "grad_norm": 3.386018991470337, + "learning_rate": 1.4341813730873342e-05, + "loss": 1.659, + "step": 33706 + }, + { + "epoch": 0.42136053401335033, + "grad_norm": 3.9572296142578125, + "learning_rate": 1.4341027573978112e-05, + "loss": 0.9486, + "step": 33708 + }, + { + "epoch": 0.421385534638366, + "grad_norm": 3.1158852577209473, + "learning_rate": 1.4340241384022232e-05, + "loss": 0.869, + "step": 33710 + }, + { + "epoch": 0.42141053526338157, + "grad_norm": 5.4874587059021, + "learning_rate": 1.4339455161011684e-05, + "loss": 1.0627, + "step": 33712 + }, + { + "epoch": 0.4214355358883972, + "grad_norm": 6.273834705352783, + "learning_rate": 1.4338668904952462e-05, + "loss": 2.1271, + "step": 33714 + }, + { + "epoch": 0.4214605365134128, + "grad_norm": 0.0030468315817415714, + "learning_rate": 1.4337882615850549e-05, + "loss": 0.233, + "step": 33716 + }, + { + "epoch": 0.42148553713842846, + "grad_norm": 6.70435905456543, + "learning_rate": 1.4337096293711938e-05, + "loss": 0.694, + "step": 33718 + }, + { + "epoch": 0.4215105377634441, + "grad_norm": 2.0348095893859863, + "learning_rate": 1.4336309938542614e-05, + "loss": 1.3793, + "step": 33720 + }, + { + "epoch": 0.4215355383884597, + "grad_norm": 2.1714820861816406, + "learning_rate": 1.4335523550348566e-05, + "loss": 1.0186, + "step": 33722 + }, + { + "epoch": 0.42156053901347534, + "grad_norm": 6.627256870269775, + "learning_rate": 1.4334737129135785e-05, + "loss": 1.0782, + "step": 33724 + }, + { + "epoch": 0.42158553963849094, + "grad_norm": 1.7444686889648438, + "learning_rate": 1.4333950674910257e-05, + "loss": 0.095, + "step": 33726 + }, + { + "epoch": 0.4216105402635066, + "grad_norm": 0.7271323204040527, + "learning_rate": 1.4333164187677975e-05, + "loss": 0.8579, + "step": 33728 + }, + { + "epoch": 0.42163554088852223, + "grad_norm": 1.9969216585159302, + "learning_rate": 1.4332377667444928e-05, + "loss": 1.0363, + "step": 33730 + }, + { + "epoch": 0.4216605415135378, + "grad_norm": 2.526853084564209, + "learning_rate": 1.4331591114217105e-05, + "loss": 0.8554, + "step": 33732 + }, + { + "epoch": 0.42168554213855347, + "grad_norm": 4.423764228820801, + "learning_rate": 1.4330804528000498e-05, + "loss": 1.0058, + "step": 33734 + }, + { + "epoch": 0.42171054276356906, + "grad_norm": 1.0496273040771484, + "learning_rate": 1.4330017908801093e-05, + "loss": 1.0212, + "step": 33736 + }, + { + "epoch": 0.4217355433885847, + "grad_norm": 2.26070237159729, + "learning_rate": 1.4329231256624884e-05, + "loss": 1.4851, + "step": 33738 + }, + { + "epoch": 0.42176054401360036, + "grad_norm": 0.002913418458774686, + "learning_rate": 1.4328444571477861e-05, + "loss": 0.0844, + "step": 33740 + }, + { + "epoch": 0.42178554463861595, + "grad_norm": 3.185830593109131, + "learning_rate": 1.4327657853366019e-05, + "loss": 0.4368, + "step": 33742 + }, + { + "epoch": 0.4218105452636316, + "grad_norm": 2.7318320274353027, + "learning_rate": 1.4326871102295345e-05, + "loss": 1.0266, + "step": 33744 + }, + { + "epoch": 0.4218355458886472, + "grad_norm": 2.5030460357666016, + "learning_rate": 1.4326084318271838e-05, + "loss": 0.2727, + "step": 33746 + }, + { + "epoch": 0.42186054651366284, + "grad_norm": 6.206751823425293, + "learning_rate": 1.4325297501301478e-05, + "loss": 1.4711, + "step": 33748 + }, + { + "epoch": 0.4218855471386785, + "grad_norm": 5.030669212341309, + "learning_rate": 1.4324510651390267e-05, + "loss": 1.5804, + "step": 33750 + }, + { + "epoch": 0.4219105477636941, + "grad_norm": 2.11926531791687, + "learning_rate": 1.4323723768544194e-05, + "loss": 0.1041, + "step": 33752 + }, + { + "epoch": 0.4219355483887097, + "grad_norm": 3.5188844203948975, + "learning_rate": 1.432293685276925e-05, + "loss": 1.4928, + "step": 33754 + }, + { + "epoch": 0.4219605490137253, + "grad_norm": 2.4320197105407715, + "learning_rate": 1.4322149904071431e-05, + "loss": 1.1562, + "step": 33756 + }, + { + "epoch": 0.42198554963874096, + "grad_norm": 0.0014744417276233435, + "learning_rate": 1.4321362922456734e-05, + "loss": 0.7117, + "step": 33758 + }, + { + "epoch": 0.4220105502637566, + "grad_norm": 12.540582656860352, + "learning_rate": 1.4320575907931145e-05, + "loss": 1.8814, + "step": 33760 + }, + { + "epoch": 0.4220355508887722, + "grad_norm": 1.4865301847457886, + "learning_rate": 1.4319788860500658e-05, + "loss": 0.679, + "step": 33762 + }, + { + "epoch": 0.42206055151378785, + "grad_norm": 2.6721742153167725, + "learning_rate": 1.431900178017127e-05, + "loss": 0.5249, + "step": 33764 + }, + { + "epoch": 0.42208555213880344, + "grad_norm": 4.0026044845581055, + "learning_rate": 1.4318214666948982e-05, + "loss": 0.8438, + "step": 33766 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 2.169188976287842, + "learning_rate": 1.4317427520839777e-05, + "loss": 0.3394, + "step": 33768 + }, + { + "epoch": 0.42213555338883474, + "grad_norm": 3.8243608474731445, + "learning_rate": 1.4316640341849652e-05, + "loss": 1.0344, + "step": 33770 + }, + { + "epoch": 0.42216055401385033, + "grad_norm": 5.254875183105469, + "learning_rate": 1.4315853129984612e-05, + "loss": 1.8566, + "step": 33772 + }, + { + "epoch": 0.422185554638866, + "grad_norm": 14.16125202178955, + "learning_rate": 1.4315065885250639e-05, + "loss": 1.4273, + "step": 33774 + }, + { + "epoch": 0.42221055526388157, + "grad_norm": 0.003222984028980136, + "learning_rate": 1.4314278607653733e-05, + "loss": 0.8667, + "step": 33776 + }, + { + "epoch": 0.4222355558888972, + "grad_norm": 0.003963852301239967, + "learning_rate": 1.4313491297199895e-05, + "loss": 0.0263, + "step": 33778 + }, + { + "epoch": 0.42226055651391287, + "grad_norm": 5.307844638824463, + "learning_rate": 1.4312703953895118e-05, + "loss": 1.2973, + "step": 33780 + }, + { + "epoch": 0.42228555713892846, + "grad_norm": 3.5134687423706055, + "learning_rate": 1.4311916577745395e-05, + "loss": 1.0632, + "step": 33782 + }, + { + "epoch": 0.4223105577639441, + "grad_norm": 3.5090818405151367, + "learning_rate": 1.4311129168756727e-05, + "loss": 1.5164, + "step": 33784 + }, + { + "epoch": 0.4223355583889597, + "grad_norm": 4.711369514465332, + "learning_rate": 1.4310341726935108e-05, + "loss": 1.09, + "step": 33786 + }, + { + "epoch": 0.42236055901397535, + "grad_norm": 1.0236114263534546, + "learning_rate": 1.4309554252286532e-05, + "loss": 0.0391, + "step": 33788 + }, + { + "epoch": 0.422385559638991, + "grad_norm": 5.123362064361572, + "learning_rate": 1.4308766744817004e-05, + "loss": 1.6651, + "step": 33790 + }, + { + "epoch": 0.4224105602640066, + "grad_norm": 2.544257164001465, + "learning_rate": 1.430797920453252e-05, + "loss": 1.0754, + "step": 33792 + }, + { + "epoch": 0.42243556088902223, + "grad_norm": 3.8262600898742676, + "learning_rate": 1.4307191631439072e-05, + "loss": 1.6942, + "step": 33794 + }, + { + "epoch": 0.4224605615140378, + "grad_norm": 3.099424362182617, + "learning_rate": 1.4306404025542663e-05, + "loss": 0.7757, + "step": 33796 + }, + { + "epoch": 0.42248556213905347, + "grad_norm": 5.402597904205322, + "learning_rate": 1.4305616386849293e-05, + "loss": 0.9659, + "step": 33798 + }, + { + "epoch": 0.4225105627640691, + "grad_norm": 3.0701217651367188, + "learning_rate": 1.4304828715364956e-05, + "loss": 0.7369, + "step": 33800 + }, + { + "epoch": 0.4225355633890847, + "grad_norm": 3.699897050857544, + "learning_rate": 1.4304041011095647e-05, + "loss": 1.737, + "step": 33802 + }, + { + "epoch": 0.42256056401410036, + "grad_norm": 0.0010497192852199078, + "learning_rate": 1.4303253274047375e-05, + "loss": 0.3338, + "step": 33804 + }, + { + "epoch": 0.42258556463911595, + "grad_norm": 3.369842052459717, + "learning_rate": 1.4302465504226133e-05, + "loss": 0.9477, + "step": 33806 + }, + { + "epoch": 0.4226105652641316, + "grad_norm": 5.5322675704956055, + "learning_rate": 1.4301677701637924e-05, + "loss": 1.4582, + "step": 33808 + }, + { + "epoch": 0.42263556588914725, + "grad_norm": 3.4937469959259033, + "learning_rate": 1.4300889866288743e-05, + "loss": 0.8915, + "step": 33810 + }, + { + "epoch": 0.42266056651416284, + "grad_norm": 2.9730331897735596, + "learning_rate": 1.4300101998184596e-05, + "loss": 1.5036, + "step": 33812 + }, + { + "epoch": 0.4226855671391785, + "grad_norm": 0.10565770417451859, + "learning_rate": 1.4299314097331479e-05, + "loss": 0.9855, + "step": 33814 + }, + { + "epoch": 0.4227105677641941, + "grad_norm": 2.2533669471740723, + "learning_rate": 1.429852616373539e-05, + "loss": 1.6472, + "step": 33816 + }, + { + "epoch": 0.4227355683892097, + "grad_norm": 4.012732028961182, + "learning_rate": 1.429773819740234e-05, + "loss": 1.2035, + "step": 33818 + }, + { + "epoch": 0.4227605690142254, + "grad_norm": 4.435841083526611, + "learning_rate": 1.4296950198338322e-05, + "loss": 0.7225, + "step": 33820 + }, + { + "epoch": 0.42278556963924097, + "grad_norm": 2.9836907386779785, + "learning_rate": 1.4296162166549337e-05, + "loss": 0.8189, + "step": 33822 + }, + { + "epoch": 0.4228105702642566, + "grad_norm": 5.169360637664795, + "learning_rate": 1.4295374102041391e-05, + "loss": 1.3737, + "step": 33824 + }, + { + "epoch": 0.4228355708892722, + "grad_norm": 5.5767035484313965, + "learning_rate": 1.4294586004820482e-05, + "loss": 1.0572, + "step": 33826 + }, + { + "epoch": 0.42286057151428785, + "grad_norm": 0.0013821589527651668, + "learning_rate": 1.429379787489261e-05, + "loss": 0.406, + "step": 33828 + }, + { + "epoch": 0.4228855721393035, + "grad_norm": 1.6247663497924805, + "learning_rate": 1.4293009712263786e-05, + "loss": 0.3577, + "step": 33830 + }, + { + "epoch": 0.4229105727643191, + "grad_norm": 4.373203277587891, + "learning_rate": 1.4292221516940004e-05, + "loss": 1.1259, + "step": 33832 + }, + { + "epoch": 0.42293557338933474, + "grad_norm": 3.601233720779419, + "learning_rate": 1.429143328892727e-05, + "loss": 1.5776, + "step": 33834 + }, + { + "epoch": 0.42296057401435033, + "grad_norm": 5.174103260040283, + "learning_rate": 1.4290645028231586e-05, + "loss": 0.8954, + "step": 33836 + }, + { + "epoch": 0.422985574639366, + "grad_norm": 0.0013942754594609141, + "learning_rate": 1.428985673485896e-05, + "loss": 0.0002, + "step": 33838 + }, + { + "epoch": 0.4230105752643816, + "grad_norm": 3.0645229816436768, + "learning_rate": 1.4289068408815388e-05, + "loss": 1.4893, + "step": 33840 + }, + { + "epoch": 0.4230355758893972, + "grad_norm": 3.9575836658477783, + "learning_rate": 1.4288280050106878e-05, + "loss": 0.7422, + "step": 33842 + }, + { + "epoch": 0.42306057651441287, + "grad_norm": 1.2156364917755127, + "learning_rate": 1.4287491658739436e-05, + "loss": 0.1002, + "step": 33844 + }, + { + "epoch": 0.42308557713942846, + "grad_norm": 6.328212738037109, + "learning_rate": 1.4286703234719064e-05, + "loss": 1.3238, + "step": 33846 + }, + { + "epoch": 0.4231105777644441, + "grad_norm": 4.156554222106934, + "learning_rate": 1.4285914778051763e-05, + "loss": 1.3124, + "step": 33848 + }, + { + "epoch": 0.42313557838945975, + "grad_norm": 10.1044340133667, + "learning_rate": 1.4285126288743543e-05, + "loss": 1.6339, + "step": 33850 + }, + { + "epoch": 0.42316057901447535, + "grad_norm": 6.038040637969971, + "learning_rate": 1.4284337766800407e-05, + "loss": 1.0938, + "step": 33852 + }, + { + "epoch": 0.423185579639491, + "grad_norm": 3.718344211578369, + "learning_rate": 1.428354921222836e-05, + "loss": 1.4843, + "step": 33854 + }, + { + "epoch": 0.4232105802645066, + "grad_norm": 1.0288209915161133, + "learning_rate": 1.428276062503341e-05, + "loss": 0.5233, + "step": 33856 + }, + { + "epoch": 0.42323558088952223, + "grad_norm": 3.28664231300354, + "learning_rate": 1.4281972005221558e-05, + "loss": 1.2459, + "step": 33858 + }, + { + "epoch": 0.4232605815145379, + "grad_norm": 4.559327602386475, + "learning_rate": 1.4281183352798813e-05, + "loss": 0.6843, + "step": 33860 + }, + { + "epoch": 0.4232855821395535, + "grad_norm": 2.5564982891082764, + "learning_rate": 1.4280394667771182e-05, + "loss": 0.5643, + "step": 33862 + }, + { + "epoch": 0.4233105827645691, + "grad_norm": 3.214080572128296, + "learning_rate": 1.427960595014467e-05, + "loss": 0.8689, + "step": 33864 + }, + { + "epoch": 0.4233355833895847, + "grad_norm": 1.4751288890838623, + "learning_rate": 1.4278817199925286e-05, + "loss": 0.4527, + "step": 33866 + }, + { + "epoch": 0.42336058401460036, + "grad_norm": 3.909179210662842, + "learning_rate": 1.4278028417119031e-05, + "loss": 1.5211, + "step": 33868 + }, + { + "epoch": 0.423385584639616, + "grad_norm": 4.889486312866211, + "learning_rate": 1.4277239601731922e-05, + "loss": 1.1846, + "step": 33870 + }, + { + "epoch": 0.4234105852646316, + "grad_norm": 6.178357124328613, + "learning_rate": 1.4276450753769959e-05, + "loss": 1.485, + "step": 33872 + }, + { + "epoch": 0.42343558588964725, + "grad_norm": 3.1828370094299316, + "learning_rate": 1.427566187323915e-05, + "loss": 0.3802, + "step": 33874 + }, + { + "epoch": 0.42346058651466284, + "grad_norm": 3.3457136154174805, + "learning_rate": 1.4274872960145506e-05, + "loss": 1.5363, + "step": 33876 + }, + { + "epoch": 0.4234855871396785, + "grad_norm": 3.614194631576538, + "learning_rate": 1.4274084014495035e-05, + "loss": 1.0098, + "step": 33878 + }, + { + "epoch": 0.42351058776469414, + "grad_norm": 3.699319839477539, + "learning_rate": 1.4273295036293745e-05, + "loss": 1.7779, + "step": 33880 + }, + { + "epoch": 0.4235355883897097, + "grad_norm": 3.962094306945801, + "learning_rate": 1.4272506025547643e-05, + "loss": 2.0332, + "step": 33882 + }, + { + "epoch": 0.4235605890147254, + "grad_norm": 3.316167116165161, + "learning_rate": 1.427171698226274e-05, + "loss": 1.0183, + "step": 33884 + }, + { + "epoch": 0.42358558963974097, + "grad_norm": 5.1418023109436035, + "learning_rate": 1.4270927906445042e-05, + "loss": 1.4121, + "step": 33886 + }, + { + "epoch": 0.4236105902647566, + "grad_norm": 2.5694918632507324, + "learning_rate": 1.4270138798100563e-05, + "loss": 0.1047, + "step": 33888 + }, + { + "epoch": 0.42363559088977226, + "grad_norm": 0.8780694603919983, + "learning_rate": 1.4269349657235312e-05, + "loss": 0.6018, + "step": 33890 + }, + { + "epoch": 0.42366059151478785, + "grad_norm": 2.054192304611206, + "learning_rate": 1.4268560483855294e-05, + "loss": 0.3114, + "step": 33892 + }, + { + "epoch": 0.4236855921398035, + "grad_norm": 4.888576030731201, + "learning_rate": 1.4267771277966526e-05, + "loss": 1.1194, + "step": 33894 + }, + { + "epoch": 0.4237105927648191, + "grad_norm": 3.5081236362457275, + "learning_rate": 1.4266982039575017e-05, + "loss": 1.1858, + "step": 33896 + }, + { + "epoch": 0.42373559338983474, + "grad_norm": 3.1996355056762695, + "learning_rate": 1.4266192768686773e-05, + "loss": 1.2772, + "step": 33898 + }, + { + "epoch": 0.4237605940148504, + "grad_norm": 3.036597728729248, + "learning_rate": 1.426540346530781e-05, + "loss": 0.5014, + "step": 33900 + }, + { + "epoch": 0.423785594639866, + "grad_norm": 4.284143924713135, + "learning_rate": 1.4264614129444136e-05, + "loss": 1.4277, + "step": 33902 + }, + { + "epoch": 0.42381059526488163, + "grad_norm": 0.353422075510025, + "learning_rate": 1.4263824761101763e-05, + "loss": 0.0446, + "step": 33904 + }, + { + "epoch": 0.4238355958898972, + "grad_norm": 1.2625439167022705, + "learning_rate": 1.4263035360286705e-05, + "loss": 0.6118, + "step": 33906 + }, + { + "epoch": 0.42386059651491287, + "grad_norm": 0.006838102824985981, + "learning_rate": 1.4262245927004974e-05, + "loss": 0.4638, + "step": 33908 + }, + { + "epoch": 0.4238855971399285, + "grad_norm": 5.951265811920166, + "learning_rate": 1.426145646126258e-05, + "loss": 1.4971, + "step": 33910 + }, + { + "epoch": 0.4239105977649441, + "grad_norm": 3.9224095344543457, + "learning_rate": 1.4260666963065535e-05, + "loss": 1.3552, + "step": 33912 + }, + { + "epoch": 0.42393559838995976, + "grad_norm": 3.472482681274414, + "learning_rate": 1.4259877432419853e-05, + "loss": 0.7632, + "step": 33914 + }, + { + "epoch": 0.42396059901497535, + "grad_norm": 3.7193198204040527, + "learning_rate": 1.4259087869331547e-05, + "loss": 1.1226, + "step": 33916 + }, + { + "epoch": 0.423985599639991, + "grad_norm": 3.9492719173431396, + "learning_rate": 1.425829827380663e-05, + "loss": 1.1791, + "step": 33918 + }, + { + "epoch": 0.42401060026500664, + "grad_norm": 1.5450807809829712, + "learning_rate": 1.4257508645851117e-05, + "loss": 0.7613, + "step": 33920 + }, + { + "epoch": 0.42403560089002224, + "grad_norm": 0.0009910522494465113, + "learning_rate": 1.425671898547102e-05, + "loss": 1.638, + "step": 33922 + }, + { + "epoch": 0.4240606015150379, + "grad_norm": 0.0030093847308307886, + "learning_rate": 1.4255929292672353e-05, + "loss": 0.5268, + "step": 33924 + }, + { + "epoch": 0.4240856021400535, + "grad_norm": 4.259138584136963, + "learning_rate": 1.4255139567461129e-05, + "loss": 1.1997, + "step": 33926 + }, + { + "epoch": 0.4241106027650691, + "grad_norm": 3.7165706157684326, + "learning_rate": 1.4254349809843363e-05, + "loss": 1.1271, + "step": 33928 + }, + { + "epoch": 0.42413560339008477, + "grad_norm": 5.863954067230225, + "learning_rate": 1.4253560019825073e-05, + "loss": 2.5361, + "step": 33930 + }, + { + "epoch": 0.42416060401510036, + "grad_norm": 4.669577598571777, + "learning_rate": 1.4252770197412268e-05, + "loss": 1.45, + "step": 33932 + }, + { + "epoch": 0.424185604640116, + "grad_norm": 1.735987663269043, + "learning_rate": 1.425198034261097e-05, + "loss": 0.3883, + "step": 33934 + }, + { + "epoch": 0.4242106052651316, + "grad_norm": 3.5059287548065186, + "learning_rate": 1.425119045542719e-05, + "loss": 1.0823, + "step": 33936 + }, + { + "epoch": 0.42423560589014725, + "grad_norm": 1.7476422786712646, + "learning_rate": 1.4250400535866946e-05, + "loss": 0.7233, + "step": 33938 + }, + { + "epoch": 0.4242606065151629, + "grad_norm": 2.9533534049987793, + "learning_rate": 1.424961058393625e-05, + "loss": 1.1353, + "step": 33940 + }, + { + "epoch": 0.4242856071401785, + "grad_norm": 2.996352195739746, + "learning_rate": 1.4248820599641121e-05, + "loss": 1.2201, + "step": 33942 + }, + { + "epoch": 0.42431060776519414, + "grad_norm": 1.928126573562622, + "learning_rate": 1.4248030582987577e-05, + "loss": 0.9437, + "step": 33944 + }, + { + "epoch": 0.42433560839020973, + "grad_norm": 0.42564278841018677, + "learning_rate": 1.4247240533981633e-05, + "loss": 0.0123, + "step": 33946 + }, + { + "epoch": 0.4243606090152254, + "grad_norm": 3.8032026290893555, + "learning_rate": 1.4246450452629306e-05, + "loss": 1.2635, + "step": 33948 + }, + { + "epoch": 0.424385609640241, + "grad_norm": 3.3263728618621826, + "learning_rate": 1.4245660338936609e-05, + "loss": 1.0698, + "step": 33950 + }, + { + "epoch": 0.4244106102652566, + "grad_norm": 5.933575630187988, + "learning_rate": 1.4244870192909566e-05, + "loss": 0.6451, + "step": 33952 + }, + { + "epoch": 0.42443561089027226, + "grad_norm": 3.6745991706848145, + "learning_rate": 1.4244080014554191e-05, + "loss": 0.6186, + "step": 33954 + }, + { + "epoch": 0.42446061151528786, + "grad_norm": 5.316084861755371, + "learning_rate": 1.4243289803876503e-05, + "loss": 2.2336, + "step": 33956 + }, + { + "epoch": 0.4244856121403035, + "grad_norm": 4.840404987335205, + "learning_rate": 1.424249956088252e-05, + "loss": 1.2043, + "step": 33958 + }, + { + "epoch": 0.42451061276531915, + "grad_norm": 2.9807558059692383, + "learning_rate": 1.424170928557826e-05, + "loss": 1.0828, + "step": 33960 + }, + { + "epoch": 0.42453561339033474, + "grad_norm": 7.65562105178833, + "learning_rate": 1.4240918977969745e-05, + "loss": 0.2086, + "step": 33962 + }, + { + "epoch": 0.4245606140153504, + "grad_norm": 1.154577612876892, + "learning_rate": 1.4240128638062988e-05, + "loss": 0.4447, + "step": 33964 + }, + { + "epoch": 0.424585614640366, + "grad_norm": 0.011192705482244492, + "learning_rate": 1.423933826586401e-05, + "loss": 0.038, + "step": 33966 + }, + { + "epoch": 0.42461061526538163, + "grad_norm": 5.805307865142822, + "learning_rate": 1.4238547861378834e-05, + "loss": 1.0015, + "step": 33968 + }, + { + "epoch": 0.4246356158903973, + "grad_norm": 2.364800453186035, + "learning_rate": 1.4237757424613475e-05, + "loss": 0.6809, + "step": 33970 + }, + { + "epoch": 0.42466061651541287, + "grad_norm": 3.0299007892608643, + "learning_rate": 1.4236966955573955e-05, + "loss": 0.167, + "step": 33972 + }, + { + "epoch": 0.4246856171404285, + "grad_norm": 9.711200714111328, + "learning_rate": 1.4236176454266292e-05, + "loss": 1.7799, + "step": 33974 + }, + { + "epoch": 0.4247106177654441, + "grad_norm": 3.041588068008423, + "learning_rate": 1.4235385920696513e-05, + "loss": 0.3053, + "step": 33976 + }, + { + "epoch": 0.42473561839045976, + "grad_norm": 3.463524580001831, + "learning_rate": 1.4234595354870627e-05, + "loss": 1.4367, + "step": 33978 + }, + { + "epoch": 0.4247606190154754, + "grad_norm": 1.3018875122070312, + "learning_rate": 1.4233804756794667e-05, + "loss": 0.1999, + "step": 33980 + }, + { + "epoch": 0.424785619640491, + "grad_norm": 2.366238832473755, + "learning_rate": 1.4233014126474647e-05, + "loss": 1.178, + "step": 33982 + }, + { + "epoch": 0.42481062026550664, + "grad_norm": 2.9083926677703857, + "learning_rate": 1.423222346391659e-05, + "loss": 0.5179, + "step": 33984 + }, + { + "epoch": 0.42483562089052224, + "grad_norm": 6.3805670738220215, + "learning_rate": 1.4231432769126516e-05, + "loss": 0.4584, + "step": 33986 + }, + { + "epoch": 0.4248606215155379, + "grad_norm": 10.960522651672363, + "learning_rate": 1.4230642042110452e-05, + "loss": 1.0611, + "step": 33988 + }, + { + "epoch": 0.42488562214055353, + "grad_norm": 0.0051331412978470325, + "learning_rate": 1.4229851282874413e-05, + "loss": 1.6491, + "step": 33990 + }, + { + "epoch": 0.4249106227655691, + "grad_norm": 4.88139533996582, + "learning_rate": 1.4229060491424426e-05, + "loss": 1.0906, + "step": 33992 + }, + { + "epoch": 0.42493562339058477, + "grad_norm": 2.189277410507202, + "learning_rate": 1.4228269667766514e-05, + "loss": 1.6344, + "step": 33994 + }, + { + "epoch": 0.42496062401560036, + "grad_norm": 5.399969577789307, + "learning_rate": 1.4227478811906696e-05, + "loss": 1.2126, + "step": 33996 + }, + { + "epoch": 0.424985624640616, + "grad_norm": 3.6183736324310303, + "learning_rate": 1.4226687923850999e-05, + "loss": 1.2279, + "step": 33998 + }, + { + "epoch": 0.42501062526563166, + "grad_norm": 3.7314233779907227, + "learning_rate": 1.4225897003605444e-05, + "loss": 1.3997, + "step": 34000 + }, + { + "epoch": 0.42503562589064725, + "grad_norm": 4.979247570037842, + "learning_rate": 1.4225106051176054e-05, + "loss": 1.1011, + "step": 34002 + }, + { + "epoch": 0.4250606265156629, + "grad_norm": 2.841613531112671, + "learning_rate": 1.4224315066568854e-05, + "loss": 1.6234, + "step": 34004 + }, + { + "epoch": 0.4250856271406785, + "grad_norm": 1.8754085302352905, + "learning_rate": 1.422352404978987e-05, + "loss": 0.6207, + "step": 34006 + }, + { + "epoch": 0.42511062776569414, + "grad_norm": 1.8154122829437256, + "learning_rate": 1.422273300084512e-05, + "loss": 1.2084, + "step": 34008 + }, + { + "epoch": 0.4251356283907098, + "grad_norm": 6.920751571655273, + "learning_rate": 1.4221941919740637e-05, + "loss": 2.0517, + "step": 34010 + }, + { + "epoch": 0.4251606290157254, + "grad_norm": 4.532846450805664, + "learning_rate": 1.4221150806482438e-05, + "loss": 0.1876, + "step": 34012 + }, + { + "epoch": 0.425185629640741, + "grad_norm": 2.87007474899292, + "learning_rate": 1.4220359661076555e-05, + "loss": 0.5265, + "step": 34014 + }, + { + "epoch": 0.4252106302657566, + "grad_norm": 4.84425687789917, + "learning_rate": 1.4219568483529003e-05, + "loss": 0.1429, + "step": 34016 + }, + { + "epoch": 0.42523563089077226, + "grad_norm": 3.8421549797058105, + "learning_rate": 1.4218777273845818e-05, + "loss": 0.4535, + "step": 34018 + }, + { + "epoch": 0.4252606315157879, + "grad_norm": 1.8967782258987427, + "learning_rate": 1.4217986032033024e-05, + "loss": 0.8806, + "step": 34020 + }, + { + "epoch": 0.4252856321408035, + "grad_norm": 8.105917930603027, + "learning_rate": 1.4217194758096645e-05, + "loss": 1.1455, + "step": 34022 + }, + { + "epoch": 0.42531063276581915, + "grad_norm": 4.03717041015625, + "learning_rate": 1.4216403452042702e-05, + "loss": 1.0579, + "step": 34024 + }, + { + "epoch": 0.42533563339083474, + "grad_norm": 3.105977773666382, + "learning_rate": 1.4215612113877233e-05, + "loss": 1.4208, + "step": 34026 + }, + { + "epoch": 0.4253606340158504, + "grad_norm": 3.8116073608398438, + "learning_rate": 1.4214820743606253e-05, + "loss": 2.0597, + "step": 34028 + }, + { + "epoch": 0.42538563464086604, + "grad_norm": 0.00394076481461525, + "learning_rate": 1.4214029341235796e-05, + "loss": 0.0035, + "step": 34030 + }, + { + "epoch": 0.42541063526588163, + "grad_norm": 0.009120245464146137, + "learning_rate": 1.4213237906771888e-05, + "loss": 1.3206, + "step": 34032 + }, + { + "epoch": 0.4254356358908973, + "grad_norm": 3.1447806358337402, + "learning_rate": 1.4212446440220555e-05, + "loss": 0.8343, + "step": 34034 + }, + { + "epoch": 0.42546063651591287, + "grad_norm": 6.558899402618408, + "learning_rate": 1.4211654941587827e-05, + "loss": 1.3576, + "step": 34036 + }, + { + "epoch": 0.4254856371409285, + "grad_norm": 3.4487011432647705, + "learning_rate": 1.421086341087973e-05, + "loss": 0.7308, + "step": 34038 + }, + { + "epoch": 0.42551063776594417, + "grad_norm": 3.1775171756744385, + "learning_rate": 1.4210071848102291e-05, + "loss": 1.2456, + "step": 34040 + }, + { + "epoch": 0.42553563839095976, + "grad_norm": 2.5137534141540527, + "learning_rate": 1.4209280253261542e-05, + "loss": 1.0864, + "step": 34042 + }, + { + "epoch": 0.4255606390159754, + "grad_norm": 3.0827221870422363, + "learning_rate": 1.4208488626363507e-05, + "loss": 0.7295, + "step": 34044 + }, + { + "epoch": 0.425585639640991, + "grad_norm": 3.8330941200256348, + "learning_rate": 1.4207696967414224e-05, + "loss": 1.5296, + "step": 34046 + }, + { + "epoch": 0.42561064026600665, + "grad_norm": 0.6928947567939758, + "learning_rate": 1.4206905276419712e-05, + "loss": 0.7164, + "step": 34048 + }, + { + "epoch": 0.4256356408910223, + "grad_norm": 2.402750015258789, + "learning_rate": 1.4206113553386005e-05, + "loss": 1.387, + "step": 34050 + }, + { + "epoch": 0.4256606415160379, + "grad_norm": 3.323533296585083, + "learning_rate": 1.4205321798319132e-05, + "loss": 0.5559, + "step": 34052 + }, + { + "epoch": 0.42568564214105353, + "grad_norm": 5.820252418518066, + "learning_rate": 1.4204530011225121e-05, + "loss": 0.9972, + "step": 34054 + }, + { + "epoch": 0.4257106427660691, + "grad_norm": 3.104923725128174, + "learning_rate": 1.4203738192110006e-05, + "loss": 0.8583, + "step": 34056 + }, + { + "epoch": 0.4257356433910848, + "grad_norm": 3.46968412399292, + "learning_rate": 1.4202946340979813e-05, + "loss": 0.8899, + "step": 34058 + }, + { + "epoch": 0.4257606440161004, + "grad_norm": 2.6382970809936523, + "learning_rate": 1.4202154457840578e-05, + "loss": 0.6822, + "step": 34060 + }, + { + "epoch": 0.425785644641116, + "grad_norm": 6.2326531410217285, + "learning_rate": 1.4201362542698328e-05, + "loss": 0.7359, + "step": 34062 + }, + { + "epoch": 0.42581064526613166, + "grad_norm": 0.6082273721694946, + "learning_rate": 1.4200570595559096e-05, + "loss": 0.463, + "step": 34064 + }, + { + "epoch": 0.42583564589114725, + "grad_norm": 4.415215015411377, + "learning_rate": 1.4199778616428907e-05, + "loss": 0.6739, + "step": 34066 + }, + { + "epoch": 0.4258606465161629, + "grad_norm": 6.447166442871094, + "learning_rate": 1.4198986605313802e-05, + "loss": 0.7985, + "step": 34068 + }, + { + "epoch": 0.42588564714117855, + "grad_norm": 2.7567408084869385, + "learning_rate": 1.4198194562219812e-05, + "loss": 0.4838, + "step": 34070 + }, + { + "epoch": 0.42591064776619414, + "grad_norm": 3.784850597381592, + "learning_rate": 1.4197402487152962e-05, + "loss": 0.1001, + "step": 34072 + }, + { + "epoch": 0.4259356483912098, + "grad_norm": 6.443667888641357, + "learning_rate": 1.4196610380119289e-05, + "loss": 0.3418, + "step": 34074 + }, + { + "epoch": 0.4259606490162254, + "grad_norm": 4.618422031402588, + "learning_rate": 1.4195818241124823e-05, + "loss": 1.1383, + "step": 34076 + }, + { + "epoch": 0.425985649641241, + "grad_norm": 4.731720447540283, + "learning_rate": 1.4195026070175602e-05, + "loss": 1.0317, + "step": 34078 + }, + { + "epoch": 0.4260106502662567, + "grad_norm": 0.03839384391903877, + "learning_rate": 1.4194233867277654e-05, + "loss": 0.1462, + "step": 34080 + }, + { + "epoch": 0.42603565089127227, + "grad_norm": 0.7987863421440125, + "learning_rate": 1.4193441632437012e-05, + "loss": 0.4416, + "step": 34082 + }, + { + "epoch": 0.4260606515162879, + "grad_norm": 0.07374654710292816, + "learning_rate": 1.4192649365659715e-05, + "loss": 0.7699, + "step": 34084 + }, + { + "epoch": 0.4260856521413035, + "grad_norm": 0.01386029552668333, + "learning_rate": 1.4191857066951793e-05, + "loss": 0.0877, + "step": 34086 + }, + { + "epoch": 0.42611065276631915, + "grad_norm": 8.234038352966309, + "learning_rate": 1.4191064736319278e-05, + "loss": 0.4634, + "step": 34088 + }, + { + "epoch": 0.4261356533913348, + "grad_norm": 1.8843446969985962, + "learning_rate": 1.4190272373768208e-05, + "loss": 0.7664, + "step": 34090 + }, + { + "epoch": 0.4261606540163504, + "grad_norm": 2.9955615997314453, + "learning_rate": 1.4189479979304617e-05, + "loss": 0.6276, + "step": 34092 + }, + { + "epoch": 0.42618565464136604, + "grad_norm": 5.521753311157227, + "learning_rate": 1.4188687552934537e-05, + "loss": 0.4215, + "step": 34094 + }, + { + "epoch": 0.42621065526638163, + "grad_norm": 0.32581308484077454, + "learning_rate": 1.4187895094664006e-05, + "loss": 0.7045, + "step": 34096 + }, + { + "epoch": 0.4262356558913973, + "grad_norm": 1.8972597122192383, + "learning_rate": 1.418710260449906e-05, + "loss": 0.3615, + "step": 34098 + }, + { + "epoch": 0.42626065651641293, + "grad_norm": 5.805881977081299, + "learning_rate": 1.4186310082445729e-05, + "loss": 0.5116, + "step": 34100 + }, + { + "epoch": 0.4262856571414285, + "grad_norm": 2.4342963695526123, + "learning_rate": 1.4185517528510053e-05, + "loss": 0.5157, + "step": 34102 + }, + { + "epoch": 0.42631065776644417, + "grad_norm": 2.4000308513641357, + "learning_rate": 1.4184724942698069e-05, + "loss": 1.5635, + "step": 34104 + }, + { + "epoch": 0.42633565839145976, + "grad_norm": 0.04285382851958275, + "learning_rate": 1.418393232501581e-05, + "loss": 0.8799, + "step": 34106 + }, + { + "epoch": 0.4263606590164754, + "grad_norm": 0.523817777633667, + "learning_rate": 1.4183139675469315e-05, + "loss": 0.0229, + "step": 34108 + }, + { + "epoch": 0.42638565964149105, + "grad_norm": 2.7650885581970215, + "learning_rate": 1.418234699406462e-05, + "loss": 0.644, + "step": 34110 + }, + { + "epoch": 0.42641066026650665, + "grad_norm": 2.7888312339782715, + "learning_rate": 1.4181554280807764e-05, + "loss": 0.9904, + "step": 34112 + }, + { + "epoch": 0.4264356608915223, + "grad_norm": 3.2959680557250977, + "learning_rate": 1.418076153570478e-05, + "loss": 1.2084, + "step": 34114 + }, + { + "epoch": 0.4264606615165379, + "grad_norm": 0.01018353458493948, + "learning_rate": 1.4179968758761706e-05, + "loss": 0.6933, + "step": 34116 + }, + { + "epoch": 0.42648566214155353, + "grad_norm": 3.1896748542785645, + "learning_rate": 1.4179175949984582e-05, + "loss": 0.7795, + "step": 34118 + }, + { + "epoch": 0.4265106627665692, + "grad_norm": 2.240468740463257, + "learning_rate": 1.4178383109379446e-05, + "loss": 0.8956, + "step": 34120 + }, + { + "epoch": 0.4265356633915848, + "grad_norm": 2.532101631164551, + "learning_rate": 1.4177590236952333e-05, + "loss": 0.7561, + "step": 34122 + }, + { + "epoch": 0.4265606640166004, + "grad_norm": 4.979738235473633, + "learning_rate": 1.4176797332709285e-05, + "loss": 0.5601, + "step": 34124 + }, + { + "epoch": 0.426585664641616, + "grad_norm": 4.745916843414307, + "learning_rate": 1.417600439665634e-05, + "loss": 1.137, + "step": 34126 + }, + { + "epoch": 0.42661066526663166, + "grad_norm": 2.567241907119751, + "learning_rate": 1.4175211428799533e-05, + "loss": 0.6003, + "step": 34128 + }, + { + "epoch": 0.4266356658916473, + "grad_norm": 3.905240297317505, + "learning_rate": 1.4174418429144908e-05, + "loss": 0.7436, + "step": 34130 + }, + { + "epoch": 0.4266606665166629, + "grad_norm": 2.77274489402771, + "learning_rate": 1.4173625397698502e-05, + "loss": 1.2215, + "step": 34132 + }, + { + "epoch": 0.42668566714167855, + "grad_norm": 3.8211607933044434, + "learning_rate": 1.4172832334466355e-05, + "loss": 1.9799, + "step": 34134 + }, + { + "epoch": 0.42671066776669414, + "grad_norm": 0.12959572672843933, + "learning_rate": 1.4172039239454507e-05, + "loss": 0.729, + "step": 34136 + }, + { + "epoch": 0.4267356683917098, + "grad_norm": 0.6042704582214355, + "learning_rate": 1.4171246112669002e-05, + "loss": 0.9193, + "step": 34138 + }, + { + "epoch": 0.42676066901672544, + "grad_norm": 0.9917078018188477, + "learning_rate": 1.4170452954115872e-05, + "loss": 1.6939, + "step": 34140 + }, + { + "epoch": 0.42678566964174103, + "grad_norm": 2.527390718460083, + "learning_rate": 1.4169659763801164e-05, + "loss": 0.6453, + "step": 34142 + }, + { + "epoch": 0.4268106702667567, + "grad_norm": 3.9641273021698, + "learning_rate": 1.4168866541730917e-05, + "loss": 1.8601, + "step": 34144 + }, + { + "epoch": 0.42683567089177227, + "grad_norm": 4.404911518096924, + "learning_rate": 1.4168073287911171e-05, + "loss": 0.848, + "step": 34146 + }, + { + "epoch": 0.4268606715167879, + "grad_norm": 4.034006595611572, + "learning_rate": 1.4167280002347968e-05, + "loss": 1.0615, + "step": 34148 + }, + { + "epoch": 0.42688567214180356, + "grad_norm": 2.2177581787109375, + "learning_rate": 1.4166486685047351e-05, + "loss": 0.6192, + "step": 34150 + }, + { + "epoch": 0.42691067276681915, + "grad_norm": 2.731398105621338, + "learning_rate": 1.4165693336015361e-05, + "loss": 0.4314, + "step": 34152 + }, + { + "epoch": 0.4269356733918348, + "grad_norm": 4.477541446685791, + "learning_rate": 1.4164899955258038e-05, + "loss": 1.2634, + "step": 34154 + }, + { + "epoch": 0.4269606740168504, + "grad_norm": 0.006274614483118057, + "learning_rate": 1.4164106542781427e-05, + "loss": 0.7757, + "step": 34156 + }, + { + "epoch": 0.42698567464186604, + "grad_norm": 3.4375524520874023, + "learning_rate": 1.4163313098591566e-05, + "loss": 0.6296, + "step": 34158 + }, + { + "epoch": 0.4270106752668817, + "grad_norm": 4.775267124176025, + "learning_rate": 1.4162519622694503e-05, + "loss": 0.4878, + "step": 34160 + }, + { + "epoch": 0.4270356758918973, + "grad_norm": 3.2920584678649902, + "learning_rate": 1.4161726115096282e-05, + "loss": 1.0711, + "step": 34162 + }, + { + "epoch": 0.42706067651691293, + "grad_norm": 1.0142914056777954, + "learning_rate": 1.4160932575802944e-05, + "loss": 0.5915, + "step": 34164 + }, + { + "epoch": 0.4270856771419285, + "grad_norm": 1.352612018585205, + "learning_rate": 1.4160139004820527e-05, + "loss": 0.7173, + "step": 34166 + }, + { + "epoch": 0.42711067776694417, + "grad_norm": 1.371407151222229, + "learning_rate": 1.415934540215508e-05, + "loss": 0.7943, + "step": 34168 + }, + { + "epoch": 0.4271356783919598, + "grad_norm": 0.007130302954465151, + "learning_rate": 1.4158551767812652e-05, + "loss": 0.6685, + "step": 34170 + }, + { + "epoch": 0.4271606790169754, + "grad_norm": 0.0018052186351269484, + "learning_rate": 1.4157758101799278e-05, + "loss": 0.2654, + "step": 34172 + }, + { + "epoch": 0.42718567964199106, + "grad_norm": 3.439767360687256, + "learning_rate": 1.4156964404121008e-05, + "loss": 0.8691, + "step": 34174 + }, + { + "epoch": 0.42721068026700665, + "grad_norm": 3.8570969104766846, + "learning_rate": 1.4156170674783884e-05, + "loss": 0.8774, + "step": 34176 + }, + { + "epoch": 0.4272356808920223, + "grad_norm": 0.007242698688060045, + "learning_rate": 1.4155376913793952e-05, + "loss": 0.0337, + "step": 34178 + }, + { + "epoch": 0.42726068151703794, + "grad_norm": 4.853372097015381, + "learning_rate": 1.4154583121157255e-05, + "loss": 0.3877, + "step": 34180 + }, + { + "epoch": 0.42728568214205354, + "grad_norm": 3.287505865097046, + "learning_rate": 1.4153789296879844e-05, + "loss": 1.9412, + "step": 34182 + }, + { + "epoch": 0.4273106827670692, + "grad_norm": 1.9839686155319214, + "learning_rate": 1.4152995440967759e-05, + "loss": 0.9292, + "step": 34184 + }, + { + "epoch": 0.4273356833920848, + "grad_norm": 2.18040132522583, + "learning_rate": 1.4152201553427048e-05, + "loss": 1.2624, + "step": 34186 + }, + { + "epoch": 0.4273606840171004, + "grad_norm": 7.5457868576049805, + "learning_rate": 1.4151407634263757e-05, + "loss": 0.9401, + "step": 34188 + }, + { + "epoch": 0.42738568464211607, + "grad_norm": 3.8298792839050293, + "learning_rate": 1.4150613683483933e-05, + "loss": 1.6729, + "step": 34190 + }, + { + "epoch": 0.42741068526713166, + "grad_norm": 3.7595221996307373, + "learning_rate": 1.4149819701093624e-05, + "loss": 0.7744, + "step": 34192 + }, + { + "epoch": 0.4274356858921473, + "grad_norm": 4.8021135330200195, + "learning_rate": 1.414902568709887e-05, + "loss": 1.8159, + "step": 34194 + }, + { + "epoch": 0.4274606865171629, + "grad_norm": 4.3171844482421875, + "learning_rate": 1.414823164150573e-05, + "loss": 1.0437, + "step": 34196 + }, + { + "epoch": 0.42748568714217855, + "grad_norm": 2.3164374828338623, + "learning_rate": 1.4147437564320238e-05, + "loss": 0.6534, + "step": 34198 + }, + { + "epoch": 0.4275106877671942, + "grad_norm": 2.1373963356018066, + "learning_rate": 1.414664345554845e-05, + "loss": 0.3759, + "step": 34200 + }, + { + "epoch": 0.4275356883922098, + "grad_norm": 2.8230161666870117, + "learning_rate": 1.4145849315196416e-05, + "loss": 1.4193, + "step": 34202 + }, + { + "epoch": 0.42756068901722544, + "grad_norm": 1.17643141746521, + "learning_rate": 1.4145055143270176e-05, + "loss": 0.6124, + "step": 34204 + }, + { + "epoch": 0.42758568964224103, + "grad_norm": 3.5792198181152344, + "learning_rate": 1.414426093977578e-05, + "loss": 1.2406, + "step": 34206 + }, + { + "epoch": 0.4276106902672567, + "grad_norm": 2.886549472808838, + "learning_rate": 1.4143466704719279e-05, + "loss": 1.2992, + "step": 34208 + }, + { + "epoch": 0.4276356908922723, + "grad_norm": 3.70886492729187, + "learning_rate": 1.4142672438106729e-05, + "loss": 0.7559, + "step": 34210 + }, + { + "epoch": 0.4276606915172879, + "grad_norm": 2.6107027530670166, + "learning_rate": 1.4141878139944165e-05, + "loss": 0.431, + "step": 34212 + }, + { + "epoch": 0.42768569214230356, + "grad_norm": 3.152677536010742, + "learning_rate": 1.4141083810237644e-05, + "loss": 1.295, + "step": 34214 + }, + { + "epoch": 0.42771069276731916, + "grad_norm": 7.874953269958496, + "learning_rate": 1.4140289448993216e-05, + "loss": 1.0695, + "step": 34216 + }, + { + "epoch": 0.4277356933923348, + "grad_norm": 0.007478397339582443, + "learning_rate": 1.4139495056216925e-05, + "loss": 0.9649, + "step": 34218 + }, + { + "epoch": 0.42776069401735045, + "grad_norm": 0.004123257007449865, + "learning_rate": 1.4138700631914828e-05, + "loss": 0.2254, + "step": 34220 + }, + { + "epoch": 0.42778569464236604, + "grad_norm": 3.4716594219207764, + "learning_rate": 1.4137906176092973e-05, + "loss": 1.2593, + "step": 34222 + }, + { + "epoch": 0.4278106952673817, + "grad_norm": 2.4563207626342773, + "learning_rate": 1.4137111688757408e-05, + "loss": 0.0855, + "step": 34224 + }, + { + "epoch": 0.4278356958923973, + "grad_norm": 2.016855478286743, + "learning_rate": 1.4136317169914186e-05, + "loss": 0.3199, + "step": 34226 + }, + { + "epoch": 0.42786069651741293, + "grad_norm": 7.292469024658203, + "learning_rate": 1.413552261956936e-05, + "loss": 1.4871, + "step": 34228 + }, + { + "epoch": 0.4278856971424286, + "grad_norm": 3.0761029720306396, + "learning_rate": 1.4134728037728974e-05, + "loss": 0.9809, + "step": 34230 + }, + { + "epoch": 0.42791069776744417, + "grad_norm": 3.1842398643493652, + "learning_rate": 1.4133933424399084e-05, + "loss": 1.4884, + "step": 34232 + }, + { + "epoch": 0.4279356983924598, + "grad_norm": 0.00721989618614316, + "learning_rate": 1.4133138779585744e-05, + "loss": 0.0002, + "step": 34234 + }, + { + "epoch": 0.4279606990174754, + "grad_norm": 1.5111398696899414, + "learning_rate": 1.4132344103295004e-05, + "loss": 0.3502, + "step": 34236 + }, + { + "epoch": 0.42798569964249106, + "grad_norm": 8.470687866210938, + "learning_rate": 1.4131549395532913e-05, + "loss": 0.4321, + "step": 34238 + }, + { + "epoch": 0.4280107002675067, + "grad_norm": 0.3857095539569855, + "learning_rate": 1.413075465630553e-05, + "loss": 0.1224, + "step": 34240 + }, + { + "epoch": 0.4280357008925223, + "grad_norm": 3.020416259765625, + "learning_rate": 1.4129959885618901e-05, + "loss": 1.1604, + "step": 34242 + }, + { + "epoch": 0.42806070151753794, + "grad_norm": 2.8234903812408447, + "learning_rate": 1.4129165083479079e-05, + "loss": 1.0895, + "step": 34244 + }, + { + "epoch": 0.42808570214255354, + "grad_norm": 4.172037124633789, + "learning_rate": 1.4128370249892122e-05, + "loss": 1.1726, + "step": 34246 + }, + { + "epoch": 0.4281107027675692, + "grad_norm": 4.861802577972412, + "learning_rate": 1.4127575384864085e-05, + "loss": 1.0777, + "step": 34248 + }, + { + "epoch": 0.42813570339258483, + "grad_norm": 5.601099014282227, + "learning_rate": 1.4126780488401015e-05, + "loss": 1.0476, + "step": 34250 + }, + { + "epoch": 0.4281607040176004, + "grad_norm": 0.02091805823147297, + "learning_rate": 1.4125985560508966e-05, + "loss": 0.1718, + "step": 34252 + }, + { + "epoch": 0.42818570464261607, + "grad_norm": 2.993366003036499, + "learning_rate": 1.4125190601193997e-05, + "loss": 1.2842, + "step": 34254 + }, + { + "epoch": 0.42821070526763166, + "grad_norm": 0.6822180151939392, + "learning_rate": 1.4124395610462159e-05, + "loss": 0.0359, + "step": 34256 + }, + { + "epoch": 0.4282357058926473, + "grad_norm": 2.9835262298583984, + "learning_rate": 1.4123600588319507e-05, + "loss": 1.073, + "step": 34258 + }, + { + "epoch": 0.42826070651766296, + "grad_norm": 3.7178151607513428, + "learning_rate": 1.4122805534772095e-05, + "loss": 1.0837, + "step": 34260 + }, + { + "epoch": 0.42828570714267855, + "grad_norm": 2.5487682819366455, + "learning_rate": 1.412201044982598e-05, + "loss": 1.0126, + "step": 34262 + }, + { + "epoch": 0.4283107077676942, + "grad_norm": 2.7049736976623535, + "learning_rate": 1.4121215333487218e-05, + "loss": 1.1052, + "step": 34264 + }, + { + "epoch": 0.4283357083927098, + "grad_norm": 3.456523895263672, + "learning_rate": 1.412042018576186e-05, + "loss": 1.299, + "step": 34266 + }, + { + "epoch": 0.42836070901772544, + "grad_norm": 3.4017040729522705, + "learning_rate": 1.4119625006655966e-05, + "loss": 0.2474, + "step": 34268 + }, + { + "epoch": 0.4283857096427411, + "grad_norm": 2.83591890335083, + "learning_rate": 1.411882979617559e-05, + "loss": 1.3664, + "step": 34270 + }, + { + "epoch": 0.4284107102677567, + "grad_norm": 2.554745674133301, + "learning_rate": 1.4118034554326788e-05, + "loss": 1.5946, + "step": 34272 + }, + { + "epoch": 0.4284357108927723, + "grad_norm": 3.505147695541382, + "learning_rate": 1.411723928111562e-05, + "loss": 1.4674, + "step": 34274 + }, + { + "epoch": 0.4284607115177879, + "grad_norm": 3.2990617752075195, + "learning_rate": 1.4116443976548137e-05, + "loss": 0.6601, + "step": 34276 + }, + { + "epoch": 0.42848571214280357, + "grad_norm": 1.1244312524795532, + "learning_rate": 1.4115648640630399e-05, + "loss": 0.7047, + "step": 34278 + }, + { + "epoch": 0.4285107127678192, + "grad_norm": 2.446056604385376, + "learning_rate": 1.4114853273368465e-05, + "loss": 0.2587, + "step": 34280 + }, + { + "epoch": 0.4285357133928348, + "grad_norm": 3.5974948406219482, + "learning_rate": 1.4114057874768387e-05, + "loss": 0.3106, + "step": 34282 + }, + { + "epoch": 0.42856071401785045, + "grad_norm": 1.9467438459396362, + "learning_rate": 1.4113262444836229e-05, + "loss": 1.0849, + "step": 34284 + }, + { + "epoch": 0.42858571464286604, + "grad_norm": 2.937438488006592, + "learning_rate": 1.4112466983578046e-05, + "loss": 1.3906, + "step": 34286 + }, + { + "epoch": 0.4286107152678817, + "grad_norm": 7.367518901824951, + "learning_rate": 1.4111671490999897e-05, + "loss": 1.8809, + "step": 34288 + }, + { + "epoch": 0.42863571589289734, + "grad_norm": 0.0015716295456513762, + "learning_rate": 1.4110875967107835e-05, + "loss": 0.1913, + "step": 34290 + }, + { + "epoch": 0.42866071651791293, + "grad_norm": 2.395212411880493, + "learning_rate": 1.4110080411907925e-05, + "loss": 1.1728, + "step": 34292 + }, + { + "epoch": 0.4286857171429286, + "grad_norm": 0.0033535738475620747, + "learning_rate": 1.4109284825406224e-05, + "loss": 0.8069, + "step": 34294 + }, + { + "epoch": 0.42871071776794417, + "grad_norm": 2.5431766510009766, + "learning_rate": 1.4108489207608794e-05, + "loss": 1.1043, + "step": 34296 + }, + { + "epoch": 0.4287357183929598, + "grad_norm": 2.9215333461761475, + "learning_rate": 1.4107693558521685e-05, + "loss": 0.6069, + "step": 34298 + }, + { + "epoch": 0.42876071901797547, + "grad_norm": 3.3838298320770264, + "learning_rate": 1.410689787815097e-05, + "loss": 1.3768, + "step": 34300 + }, + { + "epoch": 0.42878571964299106, + "grad_norm": 0.30439621210098267, + "learning_rate": 1.4106102166502695e-05, + "loss": 0.4718, + "step": 34302 + }, + { + "epoch": 0.4288107202680067, + "grad_norm": 2.8602874279022217, + "learning_rate": 1.410530642358293e-05, + "loss": 0.9171, + "step": 34304 + }, + { + "epoch": 0.4288357208930223, + "grad_norm": 7.496429443359375, + "learning_rate": 1.4104510649397729e-05, + "loss": 2.0094, + "step": 34306 + }, + { + "epoch": 0.42886072151803795, + "grad_norm": 4.225896835327148, + "learning_rate": 1.4103714843953158e-05, + "loss": 1.8452, + "step": 34308 + }, + { + "epoch": 0.4288857221430536, + "grad_norm": 2.3508424758911133, + "learning_rate": 1.4102919007255273e-05, + "loss": 0.7261, + "step": 34310 + }, + { + "epoch": 0.4289107227680692, + "grad_norm": 0.7873705625534058, + "learning_rate": 1.410212313931014e-05, + "loss": 0.7722, + "step": 34312 + }, + { + "epoch": 0.42893572339308483, + "grad_norm": 0.002817910397425294, + "learning_rate": 1.4101327240123817e-05, + "loss": 0.7796, + "step": 34314 + }, + { + "epoch": 0.4289607240181004, + "grad_norm": 4.658419132232666, + "learning_rate": 1.4100531309702364e-05, + "loss": 2.3709, + "step": 34316 + }, + { + "epoch": 0.4289857246431161, + "grad_norm": 4.191395282745361, + "learning_rate": 1.4099735348051844e-05, + "loss": 1.0832, + "step": 34318 + }, + { + "epoch": 0.4290107252681317, + "grad_norm": 2.2487940788269043, + "learning_rate": 1.4098939355178321e-05, + "loss": 0.2833, + "step": 34320 + }, + { + "epoch": 0.4290357258931473, + "grad_norm": 2.4667844772338867, + "learning_rate": 1.4098143331087853e-05, + "loss": 1.0703, + "step": 34322 + }, + { + "epoch": 0.42906072651816296, + "grad_norm": 5.563642501831055, + "learning_rate": 1.4097347275786506e-05, + "loss": 2.0125, + "step": 34324 + }, + { + "epoch": 0.42908572714317855, + "grad_norm": 6.92611026763916, + "learning_rate": 1.4096551189280345e-05, + "loss": 1.4724, + "step": 34326 + }, + { + "epoch": 0.4291107277681942, + "grad_norm": 3.7756404876708984, + "learning_rate": 1.4095755071575426e-05, + "loss": 1.2002, + "step": 34328 + }, + { + "epoch": 0.42913572839320985, + "grad_norm": 3.609269380569458, + "learning_rate": 1.4094958922677816e-05, + "loss": 1.3969, + "step": 34330 + }, + { + "epoch": 0.42916072901822544, + "grad_norm": 3.6630163192749023, + "learning_rate": 1.4094162742593577e-05, + "loss": 1.1279, + "step": 34332 + }, + { + "epoch": 0.4291857296432411, + "grad_norm": 0.8822126388549805, + "learning_rate": 1.4093366531328774e-05, + "loss": 0.2591, + "step": 34334 + }, + { + "epoch": 0.4292107302682567, + "grad_norm": 3.2187469005584717, + "learning_rate": 1.4092570288889471e-05, + "loss": 1.0839, + "step": 34336 + }, + { + "epoch": 0.4292357308932723, + "grad_norm": 2.669158458709717, + "learning_rate": 1.4091774015281731e-05, + "loss": 0.7427, + "step": 34338 + }, + { + "epoch": 0.429260731518288, + "grad_norm": 5.944567680358887, + "learning_rate": 1.4090977710511622e-05, + "loss": 1.2191, + "step": 34340 + }, + { + "epoch": 0.42928573214330357, + "grad_norm": 3.2408719062805176, + "learning_rate": 1.40901813745852e-05, + "loss": 0.8386, + "step": 34342 + }, + { + "epoch": 0.4293107327683192, + "grad_norm": 2.801649570465088, + "learning_rate": 1.4089385007508539e-05, + "loss": 0.738, + "step": 34344 + }, + { + "epoch": 0.4293357333933348, + "grad_norm": 3.423887252807617, + "learning_rate": 1.4088588609287697e-05, + "loss": 0.9949, + "step": 34346 + }, + { + "epoch": 0.42936073401835045, + "grad_norm": 0.00261078798212111, + "learning_rate": 1.4087792179928743e-05, + "loss": 0.4668, + "step": 34348 + }, + { + "epoch": 0.4293857346433661, + "grad_norm": 3.2266530990600586, + "learning_rate": 1.4086995719437741e-05, + "loss": 0.5129, + "step": 34350 + }, + { + "epoch": 0.4294107352683817, + "grad_norm": 12.147953033447266, + "learning_rate": 1.4086199227820763e-05, + "loss": 0.7418, + "step": 34352 + }, + { + "epoch": 0.42943573589339734, + "grad_norm": 7.888031482696533, + "learning_rate": 1.4085402705083864e-05, + "loss": 1.0706, + "step": 34354 + }, + { + "epoch": 0.42946073651841293, + "grad_norm": 2.808807611465454, + "learning_rate": 1.4084606151233118e-05, + "loss": 0.1344, + "step": 34356 + }, + { + "epoch": 0.4294857371434286, + "grad_norm": 7.595006942749023, + "learning_rate": 1.4083809566274587e-05, + "loss": 1.6578, + "step": 34358 + }, + { + "epoch": 0.42951073776844423, + "grad_norm": 8.274250030517578, + "learning_rate": 1.4083012950214344e-05, + "loss": 2.2977, + "step": 34360 + }, + { + "epoch": 0.4295357383934598, + "grad_norm": 2.5214767456054688, + "learning_rate": 1.4082216303058447e-05, + "loss": 0.4792, + "step": 34362 + }, + { + "epoch": 0.42956073901847547, + "grad_norm": 3.2475197315216064, + "learning_rate": 1.4081419624812972e-05, + "loss": 1.5371, + "step": 34364 + }, + { + "epoch": 0.42958573964349106, + "grad_norm": 2.3209068775177, + "learning_rate": 1.408062291548398e-05, + "loss": 1.3462, + "step": 34366 + }, + { + "epoch": 0.4296107402685067, + "grad_norm": 0.3887075185775757, + "learning_rate": 1.407982617507754e-05, + "loss": 0.3125, + "step": 34368 + }, + { + "epoch": 0.42963574089352236, + "grad_norm": 4.407620906829834, + "learning_rate": 1.4079029403599722e-05, + "loss": 1.39, + "step": 34370 + }, + { + "epoch": 0.42966074151853795, + "grad_norm": 4.9552001953125, + "learning_rate": 1.4078232601056595e-05, + "loss": 1.0205, + "step": 34372 + }, + { + "epoch": 0.4296857421435536, + "grad_norm": 3.6996443271636963, + "learning_rate": 1.4077435767454221e-05, + "loss": 1.1519, + "step": 34374 + }, + { + "epoch": 0.4297107427685692, + "grad_norm": 3.3725643157958984, + "learning_rate": 1.4076638902798677e-05, + "loss": 0.0839, + "step": 34376 + }, + { + "epoch": 0.42973574339358483, + "grad_norm": 2.3119516372680664, + "learning_rate": 1.4075842007096025e-05, + "loss": 1.2319, + "step": 34378 + }, + { + "epoch": 0.4297607440186005, + "grad_norm": 2.927694082260132, + "learning_rate": 1.4075045080352339e-05, + "loss": 0.8296, + "step": 34380 + }, + { + "epoch": 0.4297857446436161, + "grad_norm": 0.24070653319358826, + "learning_rate": 1.4074248122573682e-05, + "loss": 0.1223, + "step": 34382 + }, + { + "epoch": 0.4298107452686317, + "grad_norm": 2.2897589206695557, + "learning_rate": 1.4073451133766129e-05, + "loss": 0.6515, + "step": 34384 + }, + { + "epoch": 0.4298357458936473, + "grad_norm": 0.00455186003819108, + "learning_rate": 1.4072654113935751e-05, + "loss": 0.6415, + "step": 34386 + }, + { + "epoch": 0.42986074651866296, + "grad_norm": 3.0155482292175293, + "learning_rate": 1.4071857063088612e-05, + "loss": 1.1383, + "step": 34388 + }, + { + "epoch": 0.4298857471436786, + "grad_norm": 3.366010904312134, + "learning_rate": 1.4071059981230785e-05, + "loss": 0.7171, + "step": 34390 + }, + { + "epoch": 0.4299107477686942, + "grad_norm": 0.00259949779137969, + "learning_rate": 1.4070262868368344e-05, + "loss": 1.2699, + "step": 34392 + }, + { + "epoch": 0.42993574839370985, + "grad_norm": 3.4350268840789795, + "learning_rate": 1.4069465724507354e-05, + "loss": 1.0169, + "step": 34394 + }, + { + "epoch": 0.42996074901872544, + "grad_norm": 3.7244131565093994, + "learning_rate": 1.4068668549653888e-05, + "loss": 0.3002, + "step": 34396 + }, + { + "epoch": 0.4299857496437411, + "grad_norm": 4.13460111618042, + "learning_rate": 1.4067871343814021e-05, + "loss": 0.8721, + "step": 34398 + }, + { + "epoch": 0.43001075026875674, + "grad_norm": 4.07058048248291, + "learning_rate": 1.4067074106993819e-05, + "loss": 1.1609, + "step": 34400 + }, + { + "epoch": 0.43003575089377233, + "grad_norm": 4.87245512008667, + "learning_rate": 1.4066276839199355e-05, + "loss": 1.5649, + "step": 34402 + }, + { + "epoch": 0.430060751518788, + "grad_norm": 1.7566055059432983, + "learning_rate": 1.4065479540436706e-05, + "loss": 1.1775, + "step": 34404 + }, + { + "epoch": 0.43008575214380357, + "grad_norm": 0.000940710015129298, + "learning_rate": 1.4064682210711934e-05, + "loss": 0.0002, + "step": 34406 + }, + { + "epoch": 0.4301107527688192, + "grad_norm": 3.221085786819458, + "learning_rate": 1.4063884850031117e-05, + "loss": 1.0477, + "step": 34408 + }, + { + "epoch": 0.43013575339383486, + "grad_norm": 5.9881062507629395, + "learning_rate": 1.406308745840033e-05, + "loss": 1.4403, + "step": 34410 + }, + { + "epoch": 0.43016075401885046, + "grad_norm": 2.3006551265716553, + "learning_rate": 1.4062290035825648e-05, + "loss": 0.7454, + "step": 34412 + }, + { + "epoch": 0.4301857546438661, + "grad_norm": 2.2865357398986816, + "learning_rate": 1.4061492582313134e-05, + "loss": 0.6384, + "step": 34414 + }, + { + "epoch": 0.4302107552688817, + "grad_norm": 0.17982041835784912, + "learning_rate": 1.4060695097868865e-05, + "loss": 0.5346, + "step": 34416 + }, + { + "epoch": 0.43023575589389734, + "grad_norm": 1.518017053604126, + "learning_rate": 1.4059897582498922e-05, + "loss": 0.6491, + "step": 34418 + }, + { + "epoch": 0.430260756518913, + "grad_norm": 5.714238166809082, + "learning_rate": 1.405910003620937e-05, + "loss": 2.1417, + "step": 34420 + }, + { + "epoch": 0.4302857571439286, + "grad_norm": 3.6254754066467285, + "learning_rate": 1.4058302459006286e-05, + "loss": 0.1775, + "step": 34422 + }, + { + "epoch": 0.43031075776894423, + "grad_norm": 15.159963607788086, + "learning_rate": 1.4057504850895748e-05, + "loss": 1.067, + "step": 34424 + }, + { + "epoch": 0.4303357583939598, + "grad_norm": 2.8450517654418945, + "learning_rate": 1.4056707211883823e-05, + "loss": 0.1264, + "step": 34426 + }, + { + "epoch": 0.43036075901897547, + "grad_norm": 4.342449188232422, + "learning_rate": 1.405590954197659e-05, + "loss": 1.6476, + "step": 34428 + }, + { + "epoch": 0.4303857596439911, + "grad_norm": 4.786613464355469, + "learning_rate": 1.4055111841180124e-05, + "loss": 1.7615, + "step": 34430 + }, + { + "epoch": 0.4304107602690067, + "grad_norm": 3.480475425720215, + "learning_rate": 1.4054314109500498e-05, + "loss": 1.552, + "step": 34432 + }, + { + "epoch": 0.43043576089402236, + "grad_norm": 5.122242450714111, + "learning_rate": 1.405351634694379e-05, + "loss": 0.9629, + "step": 34434 + }, + { + "epoch": 0.43046076151903795, + "grad_norm": 7.9796624183654785, + "learning_rate": 1.4052718553516074e-05, + "loss": 2.3572, + "step": 34436 + }, + { + "epoch": 0.4304857621440536, + "grad_norm": 5.608307838439941, + "learning_rate": 1.4051920729223429e-05, + "loss": 0.4572, + "step": 34438 + }, + { + "epoch": 0.43051076276906924, + "grad_norm": 3.3865039348602295, + "learning_rate": 1.4051122874071928e-05, + "loss": 0.6145, + "step": 34440 + }, + { + "epoch": 0.43053576339408484, + "grad_norm": 5.601773262023926, + "learning_rate": 1.4050324988067647e-05, + "loss": 1.8371, + "step": 34442 + }, + { + "epoch": 0.4305607640191005, + "grad_norm": 4.589918613433838, + "learning_rate": 1.4049527071216664e-05, + "loss": 1.7055, + "step": 34444 + }, + { + "epoch": 0.4305857646441161, + "grad_norm": 8.745182991027832, + "learning_rate": 1.4048729123525059e-05, + "loss": 1.0639, + "step": 34446 + }, + { + "epoch": 0.4306107652691317, + "grad_norm": 1.738324761390686, + "learning_rate": 1.40479311449989e-05, + "loss": 0.2147, + "step": 34448 + }, + { + "epoch": 0.43063576589414737, + "grad_norm": 3.158247470855713, + "learning_rate": 1.4047133135644273e-05, + "loss": 0.9062, + "step": 34450 + }, + { + "epoch": 0.43066076651916296, + "grad_norm": 0.0010487900581210852, + "learning_rate": 1.4046335095467251e-05, + "loss": 0.4145, + "step": 34452 + }, + { + "epoch": 0.4306857671441786, + "grad_norm": 3.016291618347168, + "learning_rate": 1.4045537024473915e-05, + "loss": 0.6452, + "step": 34454 + }, + { + "epoch": 0.4307107677691942, + "grad_norm": 1.436320424079895, + "learning_rate": 1.4044738922670338e-05, + "loss": 0.2892, + "step": 34456 + }, + { + "epoch": 0.43073576839420985, + "grad_norm": 4.240312099456787, + "learning_rate": 1.4043940790062603e-05, + "loss": 0.8389, + "step": 34458 + }, + { + "epoch": 0.4307607690192255, + "grad_norm": 2.3844974040985107, + "learning_rate": 1.4043142626656786e-05, + "loss": 0.8344, + "step": 34460 + }, + { + "epoch": 0.4307857696442411, + "grad_norm": 3.7636399269104004, + "learning_rate": 1.4042344432458968e-05, + "loss": 1.8187, + "step": 34462 + }, + { + "epoch": 0.43081077026925674, + "grad_norm": 0.003101196838542819, + "learning_rate": 1.4041546207475227e-05, + "loss": 0.668, + "step": 34464 + }, + { + "epoch": 0.43083577089427233, + "grad_norm": 3.318593978881836, + "learning_rate": 1.4040747951711642e-05, + "loss": 0.8802, + "step": 34466 + }, + { + "epoch": 0.430860771519288, + "grad_norm": 0.0010701582068577409, + "learning_rate": 1.4039949665174289e-05, + "loss": 0.0075, + "step": 34468 + }, + { + "epoch": 0.4308857721443036, + "grad_norm": 0.02802114188671112, + "learning_rate": 1.4039151347869252e-05, + "loss": 0.5735, + "step": 34470 + }, + { + "epoch": 0.4309107727693192, + "grad_norm": 3.2706215381622314, + "learning_rate": 1.403835299980261e-05, + "loss": 1.3325, + "step": 34472 + }, + { + "epoch": 0.43093577339433486, + "grad_norm": 6.997259140014648, + "learning_rate": 1.4037554620980442e-05, + "loss": 0.8557, + "step": 34474 + }, + { + "epoch": 0.43096077401935046, + "grad_norm": 3.2396068572998047, + "learning_rate": 1.4036756211408831e-05, + "loss": 0.8468, + "step": 34476 + }, + { + "epoch": 0.4309857746443661, + "grad_norm": 7.462541580200195, + "learning_rate": 1.4035957771093853e-05, + "loss": 1.3147, + "step": 34478 + }, + { + "epoch": 0.43101077526938175, + "grad_norm": 3.2461466789245605, + "learning_rate": 1.4035159300041593e-05, + "loss": 1.4056, + "step": 34480 + }, + { + "epoch": 0.43103577589439734, + "grad_norm": 1.287329912185669, + "learning_rate": 1.4034360798258128e-05, + "loss": 0.0494, + "step": 34482 + }, + { + "epoch": 0.431060776519413, + "grad_norm": 4.32685661315918, + "learning_rate": 1.4033562265749543e-05, + "loss": 0.851, + "step": 34484 + }, + { + "epoch": 0.4310857771444286, + "grad_norm": 0.0035544070415198803, + "learning_rate": 1.4032763702521916e-05, + "loss": 0.4014, + "step": 34486 + }, + { + "epoch": 0.43111077776944423, + "grad_norm": 5.488037586212158, + "learning_rate": 1.4031965108581336e-05, + "loss": 1.3903, + "step": 34488 + }, + { + "epoch": 0.4311357783944599, + "grad_norm": 4.993816375732422, + "learning_rate": 1.4031166483933878e-05, + "loss": 1.0359, + "step": 34490 + }, + { + "epoch": 0.43116077901947547, + "grad_norm": 2.884608745574951, + "learning_rate": 1.4030367828585625e-05, + "loss": 0.532, + "step": 34492 + }, + { + "epoch": 0.4311857796444911, + "grad_norm": 3.5287232398986816, + "learning_rate": 1.402956914254266e-05, + "loss": 0.2831, + "step": 34494 + }, + { + "epoch": 0.4312107802695067, + "grad_norm": 3.570239305496216, + "learning_rate": 1.4028770425811067e-05, + "loss": 0.7047, + "step": 34496 + }, + { + "epoch": 0.43123578089452236, + "grad_norm": 0.0008528149919584394, + "learning_rate": 1.4027971678396928e-05, + "loss": 0.0271, + "step": 34498 + }, + { + "epoch": 0.431260781519538, + "grad_norm": 3.2174904346466064, + "learning_rate": 1.4027172900306327e-05, + "loss": 0.4944, + "step": 34500 + }, + { + "epoch": 0.4312857821445536, + "grad_norm": 0.0009062862955033779, + "learning_rate": 1.4026374091545348e-05, + "loss": 0.6448, + "step": 34502 + }, + { + "epoch": 0.43131078276956925, + "grad_norm": 0.03495658189058304, + "learning_rate": 1.4025575252120072e-05, + "loss": 0.4808, + "step": 34504 + }, + { + "epoch": 0.43133578339458484, + "grad_norm": 2.480302333831787, + "learning_rate": 1.4024776382036582e-05, + "loss": 1.8103, + "step": 34506 + }, + { + "epoch": 0.4313607840196005, + "grad_norm": 4.641647815704346, + "learning_rate": 1.4023977481300965e-05, + "loss": 0.864, + "step": 34508 + }, + { + "epoch": 0.43138578464461613, + "grad_norm": 1.7957534790039062, + "learning_rate": 1.4023178549919307e-05, + "loss": 0.2529, + "step": 34510 + }, + { + "epoch": 0.4314107852696317, + "grad_norm": 4.762257099151611, + "learning_rate": 1.4022379587897688e-05, + "loss": 1.4238, + "step": 34512 + }, + { + "epoch": 0.4314357858946474, + "grad_norm": 3.519500970840454, + "learning_rate": 1.4021580595242195e-05, + "loss": 0.8531, + "step": 34514 + }, + { + "epoch": 0.43146078651966296, + "grad_norm": 1.8037444353103638, + "learning_rate": 1.4020781571958914e-05, + "loss": 0.8684, + "step": 34516 + }, + { + "epoch": 0.4314857871446786, + "grad_norm": 0.3030858635902405, + "learning_rate": 1.401998251805393e-05, + "loss": 0.0514, + "step": 34518 + }, + { + "epoch": 0.43151078776969426, + "grad_norm": 4.799895763397217, + "learning_rate": 1.4019183433533326e-05, + "loss": 1.6362, + "step": 34520 + }, + { + "epoch": 0.43153578839470985, + "grad_norm": 1.6822774410247803, + "learning_rate": 1.4018384318403189e-05, + "loss": 0.8886, + "step": 34522 + }, + { + "epoch": 0.4315607890197255, + "grad_norm": 1.9615378379821777, + "learning_rate": 1.4017585172669606e-05, + "loss": 0.1163, + "step": 34524 + }, + { + "epoch": 0.4315857896447411, + "grad_norm": 2.732501745223999, + "learning_rate": 1.401678599633866e-05, + "loss": 0.9623, + "step": 34526 + }, + { + "epoch": 0.43161079026975674, + "grad_norm": 3.07527494430542, + "learning_rate": 1.4015986789416444e-05, + "loss": 0.9981, + "step": 34528 + }, + { + "epoch": 0.4316357908947724, + "grad_norm": 5.758460521697998, + "learning_rate": 1.401518755190904e-05, + "loss": 1.2382, + "step": 34530 + }, + { + "epoch": 0.431660791519788, + "grad_norm": 0.0018556304275989532, + "learning_rate": 1.4014388283822532e-05, + "loss": 0.9722, + "step": 34532 + }, + { + "epoch": 0.4316857921448036, + "grad_norm": 3.397956132888794, + "learning_rate": 1.401358898516301e-05, + "loss": 1.411, + "step": 34534 + }, + { + "epoch": 0.4317107927698192, + "grad_norm": 0.7323449850082397, + "learning_rate": 1.4012789655936567e-05, + "loss": 0.5431, + "step": 34536 + }, + { + "epoch": 0.43173579339483487, + "grad_norm": 2.033128261566162, + "learning_rate": 1.4011990296149283e-05, + "loss": 0.8813, + "step": 34538 + }, + { + "epoch": 0.4317607940198505, + "grad_norm": 3.4011805057525635, + "learning_rate": 1.4011190905807249e-05, + "loss": 0.281, + "step": 34540 + }, + { + "epoch": 0.4317857946448661, + "grad_norm": 0.0013090656138956547, + "learning_rate": 1.401039148491655e-05, + "loss": 0.4823, + "step": 34542 + }, + { + "epoch": 0.43181079526988175, + "grad_norm": 2.084333658218384, + "learning_rate": 1.4009592033483279e-05, + "loss": 0.6069, + "step": 34544 + }, + { + "epoch": 0.43183579589489735, + "grad_norm": 1.4443918466567993, + "learning_rate": 1.400879255151352e-05, + "loss": 0.5531, + "step": 34546 + }, + { + "epoch": 0.431860796519913, + "grad_norm": 3.320244550704956, + "learning_rate": 1.4007993039013366e-05, + "loss": 1.7469, + "step": 34548 + }, + { + "epoch": 0.43188579714492864, + "grad_norm": 0.010579532012343407, + "learning_rate": 1.4007193495988902e-05, + "loss": 0.1815, + "step": 34550 + }, + { + "epoch": 0.43191079776994423, + "grad_norm": 4.218935489654541, + "learning_rate": 1.4006393922446218e-05, + "loss": 1.9214, + "step": 34552 + }, + { + "epoch": 0.4319357983949599, + "grad_norm": 0.0015453306259587407, + "learning_rate": 1.4005594318391408e-05, + "loss": 0.4227, + "step": 34554 + }, + { + "epoch": 0.43196079901997547, + "grad_norm": 4.579588890075684, + "learning_rate": 1.4004794683830556e-05, + "loss": 1.662, + "step": 34556 + }, + { + "epoch": 0.4319857996449911, + "grad_norm": 1.9967983961105347, + "learning_rate": 1.4003995018769753e-05, + "loss": 1.7664, + "step": 34558 + }, + { + "epoch": 0.43201080027000677, + "grad_norm": 5.847751617431641, + "learning_rate": 1.400319532321509e-05, + "loss": 1.8151, + "step": 34560 + }, + { + "epoch": 0.43203580089502236, + "grad_norm": 0.7691918611526489, + "learning_rate": 1.400239559717266e-05, + "loss": 0.2322, + "step": 34562 + }, + { + "epoch": 0.432060801520038, + "grad_norm": 3.7056000232696533, + "learning_rate": 1.4001595840648551e-05, + "loss": 1.2178, + "step": 34564 + }, + { + "epoch": 0.4320858021450536, + "grad_norm": 0.6912513375282288, + "learning_rate": 1.4000796053648853e-05, + "loss": 0.1093, + "step": 34566 + }, + { + "epoch": 0.43211080277006925, + "grad_norm": 0.0015659043565392494, + "learning_rate": 1.3999996236179661e-05, + "loss": 0.0752, + "step": 34568 + }, + { + "epoch": 0.4321358033950849, + "grad_norm": 2.2959210872650146, + "learning_rate": 1.3999196388247059e-05, + "loss": 0.191, + "step": 34570 + }, + { + "epoch": 0.4321608040201005, + "grad_norm": 2.6142685413360596, + "learning_rate": 1.3998396509857141e-05, + "loss": 0.7739, + "step": 34572 + }, + { + "epoch": 0.43218580464511613, + "grad_norm": 4.0300092697143555, + "learning_rate": 1.3997596601016006e-05, + "loss": 0.9496, + "step": 34574 + }, + { + "epoch": 0.4322108052701317, + "grad_norm": 3.3050713539123535, + "learning_rate": 1.3996796661729737e-05, + "loss": 0.7573, + "step": 34576 + }, + { + "epoch": 0.4322358058951474, + "grad_norm": 2.3559868335723877, + "learning_rate": 1.3995996692004432e-05, + "loss": 0.098, + "step": 34578 + }, + { + "epoch": 0.432260806520163, + "grad_norm": 0.24762149155139923, + "learning_rate": 1.399519669184618e-05, + "loss": 0.633, + "step": 34580 + }, + { + "epoch": 0.4322858071451786, + "grad_norm": 5.69561767578125, + "learning_rate": 1.3994396661261078e-05, + "loss": 1.5487, + "step": 34582 + }, + { + "epoch": 0.43231080777019426, + "grad_norm": 2.9820683002471924, + "learning_rate": 1.3993596600255212e-05, + "loss": 1.046, + "step": 34584 + }, + { + "epoch": 0.43233580839520985, + "grad_norm": 0.0010321536101400852, + "learning_rate": 1.3992796508834679e-05, + "loss": 0.033, + "step": 34586 + }, + { + "epoch": 0.4323608090202255, + "grad_norm": 4.346480846405029, + "learning_rate": 1.3991996387005572e-05, + "loss": 1.6862, + "step": 34588 + }, + { + "epoch": 0.43238580964524115, + "grad_norm": 3.726641893386841, + "learning_rate": 1.3991196234773989e-05, + "loss": 1.2836, + "step": 34590 + }, + { + "epoch": 0.43241081027025674, + "grad_norm": 4.740652561187744, + "learning_rate": 1.3990396052146018e-05, + "loss": 1.3784, + "step": 34592 + }, + { + "epoch": 0.4324358108952724, + "grad_norm": 7.801730632781982, + "learning_rate": 1.3989595839127757e-05, + "loss": 0.803, + "step": 34594 + }, + { + "epoch": 0.432460811520288, + "grad_norm": 3.6568965911865234, + "learning_rate": 1.3988795595725292e-05, + "loss": 1.2608, + "step": 34596 + }, + { + "epoch": 0.4324858121453036, + "grad_norm": 0.0011518846731632948, + "learning_rate": 1.3987995321944728e-05, + "loss": 0.0001, + "step": 34598 + }, + { + "epoch": 0.4325108127703193, + "grad_norm": 0.000890517549123615, + "learning_rate": 1.3987195017792156e-05, + "loss": 0.7516, + "step": 34600 + }, + { + "epoch": 0.43253581339533487, + "grad_norm": 4.2149200439453125, + "learning_rate": 1.398639468327367e-05, + "loss": 1.574, + "step": 34602 + }, + { + "epoch": 0.4325608140203505, + "grad_norm": 9.130012512207031, + "learning_rate": 1.3985594318395366e-05, + "loss": 0.8269, + "step": 34604 + }, + { + "epoch": 0.4325858146453661, + "grad_norm": 2.9648327827453613, + "learning_rate": 1.3984793923163338e-05, + "loss": 0.8944, + "step": 34606 + }, + { + "epoch": 0.43261081527038175, + "grad_norm": 4.423454284667969, + "learning_rate": 1.3983993497583685e-05, + "loss": 1.6169, + "step": 34608 + }, + { + "epoch": 0.4326358158953974, + "grad_norm": 0.0042732772417366505, + "learning_rate": 1.3983193041662499e-05, + "loss": 0.2223, + "step": 34610 + }, + { + "epoch": 0.432660816520413, + "grad_norm": 3.5511436462402344, + "learning_rate": 1.3982392555405877e-05, + "loss": 0.5046, + "step": 34612 + }, + { + "epoch": 0.43268581714542864, + "grad_norm": 3.537731647491455, + "learning_rate": 1.3981592038819923e-05, + "loss": 1.2439, + "step": 34614 + }, + { + "epoch": 0.43271081777044423, + "grad_norm": 0.016099248081445694, + "learning_rate": 1.3980791491910722e-05, + "loss": 0.776, + "step": 34616 + }, + { + "epoch": 0.4327358183954599, + "grad_norm": 0.5620770454406738, + "learning_rate": 1.3979990914684374e-05, + "loss": 0.6586, + "step": 34618 + }, + { + "epoch": 0.43276081902047553, + "grad_norm": 0.0025834403932094574, + "learning_rate": 1.3979190307146984e-05, + "loss": 0.6123, + "step": 34620 + }, + { + "epoch": 0.4327858196454911, + "grad_norm": 0.783109724521637, + "learning_rate": 1.3978389669304637e-05, + "loss": 0.4066, + "step": 34622 + }, + { + "epoch": 0.43281082027050677, + "grad_norm": 3.919900894165039, + "learning_rate": 1.3977589001163438e-05, + "loss": 1.2838, + "step": 34624 + }, + { + "epoch": 0.43283582089552236, + "grad_norm": 5.898822784423828, + "learning_rate": 1.3976788302729488e-05, + "loss": 2.0717, + "step": 34626 + }, + { + "epoch": 0.432860821520538, + "grad_norm": 2.5021822452545166, + "learning_rate": 1.3975987574008877e-05, + "loss": 0.1926, + "step": 34628 + }, + { + "epoch": 0.43288582214555366, + "grad_norm": 3.188084363937378, + "learning_rate": 1.3975186815007709e-05, + "loss": 1.0994, + "step": 34630 + }, + { + "epoch": 0.43291082277056925, + "grad_norm": 3.3651280403137207, + "learning_rate": 1.3974386025732078e-05, + "loss": 0.5731, + "step": 34632 + }, + { + "epoch": 0.4329358233955849, + "grad_norm": 2.9169466495513916, + "learning_rate": 1.3973585206188087e-05, + "loss": 1.062, + "step": 34634 + }, + { + "epoch": 0.4329608240206005, + "grad_norm": 3.198643684387207, + "learning_rate": 1.397278435638183e-05, + "loss": 0.6811, + "step": 34636 + }, + { + "epoch": 0.43298582464561614, + "grad_norm": 0.5414516925811768, + "learning_rate": 1.3971983476319413e-05, + "loss": 0.4938, + "step": 34638 + }, + { + "epoch": 0.4330108252706318, + "grad_norm": 3.459984540939331, + "learning_rate": 1.3971182566006931e-05, + "loss": 1.5522, + "step": 34640 + }, + { + "epoch": 0.4330358258956474, + "grad_norm": 5.422060966491699, + "learning_rate": 1.3970381625450482e-05, + "loss": 1.4637, + "step": 34642 + }, + { + "epoch": 0.433060826520663, + "grad_norm": 2.187788248062134, + "learning_rate": 1.3969580654656169e-05, + "loss": 0.5181, + "step": 34644 + }, + { + "epoch": 0.4330858271456786, + "grad_norm": 1.8257272243499756, + "learning_rate": 1.396877965363009e-05, + "loss": 0.4251, + "step": 34646 + }, + { + "epoch": 0.43311082777069426, + "grad_norm": 3.512075662612915, + "learning_rate": 1.3967978622378348e-05, + "loss": 1.9764, + "step": 34648 + }, + { + "epoch": 0.4331358283957099, + "grad_norm": 8.454010963439941, + "learning_rate": 1.3967177560907042e-05, + "loss": 1.3695, + "step": 34650 + }, + { + "epoch": 0.4331608290207255, + "grad_norm": 3.486116409301758, + "learning_rate": 1.3966376469222273e-05, + "loss": 0.1616, + "step": 34652 + }, + { + "epoch": 0.43318582964574115, + "grad_norm": 4.596710681915283, + "learning_rate": 1.396557534733014e-05, + "loss": 1.3526, + "step": 34654 + }, + { + "epoch": 0.43321083027075674, + "grad_norm": 10.484625816345215, + "learning_rate": 1.3964774195236748e-05, + "loss": 0.8682, + "step": 34656 + }, + { + "epoch": 0.4332358308957724, + "grad_norm": 1.76276433467865, + "learning_rate": 1.3963973012948196e-05, + "loss": 0.9003, + "step": 34658 + }, + { + "epoch": 0.43326083152078804, + "grad_norm": 5.127924919128418, + "learning_rate": 1.3963171800470583e-05, + "loss": 1.4028, + "step": 34660 + }, + { + "epoch": 0.43328583214580363, + "grad_norm": 1.6943459510803223, + "learning_rate": 1.3962370557810017e-05, + "loss": 1.0224, + "step": 34662 + }, + { + "epoch": 0.4333108327708193, + "grad_norm": 0.03415776044130325, + "learning_rate": 1.3961569284972596e-05, + "loss": 0.0247, + "step": 34664 + }, + { + "epoch": 0.43333583339583487, + "grad_norm": 1.0071395635604858, + "learning_rate": 1.3960767981964427e-05, + "loss": 0.0377, + "step": 34666 + }, + { + "epoch": 0.4333608340208505, + "grad_norm": 10.070242881774902, + "learning_rate": 1.3959966648791605e-05, + "loss": 1.6018, + "step": 34668 + }, + { + "epoch": 0.43338583464586616, + "grad_norm": 5.174814701080322, + "learning_rate": 1.3959165285460236e-05, + "loss": 1.2431, + "step": 34670 + }, + { + "epoch": 0.43341083527088176, + "grad_norm": 0.0020243977196514606, + "learning_rate": 1.3958363891976428e-05, + "loss": 0.0005, + "step": 34672 + }, + { + "epoch": 0.4334358358958974, + "grad_norm": 4.399600028991699, + "learning_rate": 1.3957562468346278e-05, + "loss": 0.2379, + "step": 34674 + }, + { + "epoch": 0.433460836520913, + "grad_norm": 3.4053661823272705, + "learning_rate": 1.3956761014575892e-05, + "loss": 1.0178, + "step": 34676 + }, + { + "epoch": 0.43348583714592864, + "grad_norm": 3.8483386039733887, + "learning_rate": 1.3955959530671375e-05, + "loss": 1.0916, + "step": 34678 + }, + { + "epoch": 0.4335108377709443, + "grad_norm": 2.1822612285614014, + "learning_rate": 1.395515801663883e-05, + "loss": 0.7235, + "step": 34680 + }, + { + "epoch": 0.4335358383959599, + "grad_norm": 3.3919925689697266, + "learning_rate": 1.3954356472484359e-05, + "loss": 0.5896, + "step": 34682 + }, + { + "epoch": 0.43356083902097553, + "grad_norm": 5.458566665649414, + "learning_rate": 1.3953554898214068e-05, + "loss": 0.5357, + "step": 34684 + }, + { + "epoch": 0.4335858396459911, + "grad_norm": 2.007565975189209, + "learning_rate": 1.3952753293834062e-05, + "loss": 0.8074, + "step": 34686 + }, + { + "epoch": 0.43361084027100677, + "grad_norm": 0.7877283692359924, + "learning_rate": 1.3951951659350447e-05, + "loss": 0.357, + "step": 34688 + }, + { + "epoch": 0.4336358408960224, + "grad_norm": 5.747721195220947, + "learning_rate": 1.3951149994769327e-05, + "loss": 1.3358, + "step": 34690 + }, + { + "epoch": 0.433660841521038, + "grad_norm": 4.036652088165283, + "learning_rate": 1.395034830009681e-05, + "loss": 1.6645, + "step": 34692 + }, + { + "epoch": 0.43368584214605366, + "grad_norm": 0.0016121286898851395, + "learning_rate": 1.3949546575338995e-05, + "loss": 0.0001, + "step": 34694 + }, + { + "epoch": 0.43371084277106925, + "grad_norm": 0.7785755395889282, + "learning_rate": 1.3948744820501991e-05, + "loss": 0.1329, + "step": 34696 + }, + { + "epoch": 0.4337358433960849, + "grad_norm": 4.917250633239746, + "learning_rate": 1.394794303559191e-05, + "loss": 0.9244, + "step": 34698 + }, + { + "epoch": 0.43376084402110054, + "grad_norm": 1.5189975500106812, + "learning_rate": 1.3947141220614848e-05, + "loss": 0.219, + "step": 34700 + }, + { + "epoch": 0.43378584464611614, + "grad_norm": 0.02942340262234211, + "learning_rate": 1.3946339375576918e-05, + "loss": 0.4276, + "step": 34702 + }, + { + "epoch": 0.4338108452711318, + "grad_norm": 6.733505725860596, + "learning_rate": 1.3945537500484228e-05, + "loss": 1.4506, + "step": 34704 + }, + { + "epoch": 0.4338358458961474, + "grad_norm": 3.2119295597076416, + "learning_rate": 1.394473559534288e-05, + "loss": 0.6225, + "step": 34706 + }, + { + "epoch": 0.433860846521163, + "grad_norm": 3.032191038131714, + "learning_rate": 1.3943933660158984e-05, + "loss": 0.1546, + "step": 34708 + }, + { + "epoch": 0.43388584714617867, + "grad_norm": 3.319380044937134, + "learning_rate": 1.3943131694938648e-05, + "loss": 0.8354, + "step": 34710 + }, + { + "epoch": 0.43391084777119426, + "grad_norm": 5.198832035064697, + "learning_rate": 1.3942329699687976e-05, + "loss": 0.7686, + "step": 34712 + }, + { + "epoch": 0.4339358483962099, + "grad_norm": 2.3532216548919678, + "learning_rate": 1.394152767441308e-05, + "loss": 1.3546, + "step": 34714 + }, + { + "epoch": 0.4339608490212255, + "grad_norm": 1.9585157632827759, + "learning_rate": 1.3940725619120066e-05, + "loss": 0.6737, + "step": 34716 + }, + { + "epoch": 0.43398584964624115, + "grad_norm": 2.992598056793213, + "learning_rate": 1.3939923533815045e-05, + "loss": 1.3972, + "step": 34718 + }, + { + "epoch": 0.4340108502712568, + "grad_norm": 0.28044000267982483, + "learning_rate": 1.3939121418504123e-05, + "loss": 0.8317, + "step": 34720 + }, + { + "epoch": 0.4340358508962724, + "grad_norm": 3.556097984313965, + "learning_rate": 1.3938319273193406e-05, + "loss": 0.8141, + "step": 34722 + }, + { + "epoch": 0.43406085152128804, + "grad_norm": 2.5620527267456055, + "learning_rate": 1.3937517097889009e-05, + "loss": 0.48, + "step": 34724 + }, + { + "epoch": 0.43408585214630363, + "grad_norm": 2.928715229034424, + "learning_rate": 1.3936714892597038e-05, + "loss": 0.7453, + "step": 34726 + }, + { + "epoch": 0.4341108527713193, + "grad_norm": 0.0004886328824795783, + "learning_rate": 1.3935912657323602e-05, + "loss": 0.0, + "step": 34728 + }, + { + "epoch": 0.4341358533963349, + "grad_norm": 3.225893497467041, + "learning_rate": 1.3935110392074813e-05, + "loss": 0.9944, + "step": 34730 + }, + { + "epoch": 0.4341608540213505, + "grad_norm": 2.1293067932128906, + "learning_rate": 1.3934308096856781e-05, + "loss": 1.1341, + "step": 34732 + }, + { + "epoch": 0.43418585464636616, + "grad_norm": 4.306427955627441, + "learning_rate": 1.3933505771675614e-05, + "loss": 1.3471, + "step": 34734 + }, + { + "epoch": 0.43421085527138176, + "grad_norm": 5.185788154602051, + "learning_rate": 1.3932703416537423e-05, + "loss": 1.4986, + "step": 34736 + }, + { + "epoch": 0.4342358558963974, + "grad_norm": 6.4634690284729, + "learning_rate": 1.393190103144832e-05, + "loss": 1.9043, + "step": 34738 + }, + { + "epoch": 0.43426085652141305, + "grad_norm": 2.206125259399414, + "learning_rate": 1.3931098616414413e-05, + "loss": 1.1918, + "step": 34740 + }, + { + "epoch": 0.43428585714642864, + "grad_norm": 3.5275938510894775, + "learning_rate": 1.3930296171441815e-05, + "loss": 0.6507, + "step": 34742 + }, + { + "epoch": 0.4343108577714443, + "grad_norm": 0.0055896081030368805, + "learning_rate": 1.392949369653664e-05, + "loss": 0.4375, + "step": 34744 + }, + { + "epoch": 0.4343358583964599, + "grad_norm": 3.7801740169525146, + "learning_rate": 1.3928691191704996e-05, + "loss": 0.9222, + "step": 34746 + }, + { + "epoch": 0.43436085902147553, + "grad_norm": 0.07709185034036636, + "learning_rate": 1.392788865695299e-05, + "loss": 0.0013, + "step": 34748 + }, + { + "epoch": 0.4343858596464912, + "grad_norm": 2.812350273132324, + "learning_rate": 1.3927086092286746e-05, + "loss": 0.8091, + "step": 34750 + }, + { + "epoch": 0.43441086027150677, + "grad_norm": 2.1871657371520996, + "learning_rate": 1.3926283497712365e-05, + "loss": 0.8306, + "step": 34752 + }, + { + "epoch": 0.4344358608965224, + "grad_norm": 0.002487841760739684, + "learning_rate": 1.3925480873235965e-05, + "loss": 0.5146, + "step": 34754 + }, + { + "epoch": 0.434460861521538, + "grad_norm": 0.0020372429862618446, + "learning_rate": 1.3924678218863661e-05, + "loss": 0.9046, + "step": 34756 + }, + { + "epoch": 0.43448586214655366, + "grad_norm": 5.3128509521484375, + "learning_rate": 1.392387553460156e-05, + "loss": 1.3435, + "step": 34758 + }, + { + "epoch": 0.4345108627715693, + "grad_norm": 3.2992846965789795, + "learning_rate": 1.3923072820455778e-05, + "loss": 0.8475, + "step": 34760 + }, + { + "epoch": 0.4345358633965849, + "grad_norm": 2.360105514526367, + "learning_rate": 1.3922270076432426e-05, + "loss": 0.8752, + "step": 34762 + }, + { + "epoch": 0.43456086402160055, + "grad_norm": 4.008269786834717, + "learning_rate": 1.3921467302537626e-05, + "loss": 1.9224, + "step": 34764 + }, + { + "epoch": 0.43458586464661614, + "grad_norm": 10.080678939819336, + "learning_rate": 1.392066449877748e-05, + "loss": 2.8671, + "step": 34766 + }, + { + "epoch": 0.4346108652716318, + "grad_norm": 3.338733196258545, + "learning_rate": 1.391986166515811e-05, + "loss": 0.8341, + "step": 34768 + }, + { + "epoch": 0.43463586589664743, + "grad_norm": 3.2434310913085938, + "learning_rate": 1.391905880168563e-05, + "loss": 0.6773, + "step": 34770 + }, + { + "epoch": 0.434660866521663, + "grad_norm": 3.206089496612549, + "learning_rate": 1.391825590836615e-05, + "loss": 1.7079, + "step": 34772 + }, + { + "epoch": 0.4346858671466787, + "grad_norm": 4.310932636260986, + "learning_rate": 1.3917452985205783e-05, + "loss": 1.3555, + "step": 34774 + }, + { + "epoch": 0.43471086777169426, + "grad_norm": 9.167256355285645, + "learning_rate": 1.3916650032210654e-05, + "loss": 0.7082, + "step": 34776 + }, + { + "epoch": 0.4347358683967099, + "grad_norm": 2.2718653678894043, + "learning_rate": 1.3915847049386871e-05, + "loss": 1.0866, + "step": 34778 + }, + { + "epoch": 0.43476086902172556, + "grad_norm": 3.594878911972046, + "learning_rate": 1.3915044036740549e-05, + "loss": 0.8901, + "step": 34780 + }, + { + "epoch": 0.43478586964674115, + "grad_norm": 0.005316618364304304, + "learning_rate": 1.3914240994277806e-05, + "loss": 0.1365, + "step": 34782 + }, + { + "epoch": 0.4348108702717568, + "grad_norm": 3.8012423515319824, + "learning_rate": 1.3913437922004758e-05, + "loss": 0.4824, + "step": 34784 + }, + { + "epoch": 0.4348358708967724, + "grad_norm": 4.110631942749023, + "learning_rate": 1.391263481992752e-05, + "loss": 2.0502, + "step": 34786 + }, + { + "epoch": 0.43486087152178804, + "grad_norm": 3.8492038249969482, + "learning_rate": 1.3911831688052204e-05, + "loss": 1.1586, + "step": 34788 + }, + { + "epoch": 0.4348858721468037, + "grad_norm": 1.7121268510818481, + "learning_rate": 1.3911028526384938e-05, + "loss": 1.0734, + "step": 34790 + }, + { + "epoch": 0.4349108727718193, + "grad_norm": 3.3127527236938477, + "learning_rate": 1.3910225334931828e-05, + "loss": 1.0636, + "step": 34792 + }, + { + "epoch": 0.4349358733968349, + "grad_norm": 5.325226783752441, + "learning_rate": 1.3909422113698994e-05, + "loss": 1.7062, + "step": 34794 + }, + { + "epoch": 0.4349608740218505, + "grad_norm": 2.2265114784240723, + "learning_rate": 1.3908618862692557e-05, + "loss": 0.1245, + "step": 34796 + }, + { + "epoch": 0.43498587464686617, + "grad_norm": 2.9213309288024902, + "learning_rate": 1.3907815581918632e-05, + "loss": 0.5785, + "step": 34798 + }, + { + "epoch": 0.4350108752718818, + "grad_norm": 7.408941745758057, + "learning_rate": 1.390701227138333e-05, + "loss": 0.3471, + "step": 34800 + }, + { + "epoch": 0.4350358758968974, + "grad_norm": 6.845752716064453, + "learning_rate": 1.3906208931092781e-05, + "loss": 0.9822, + "step": 34802 + }, + { + "epoch": 0.43506087652191305, + "grad_norm": 2.473975658416748, + "learning_rate": 1.3905405561053095e-05, + "loss": 0.6397, + "step": 34804 + }, + { + "epoch": 0.43508587714692865, + "grad_norm": 4.747694969177246, + "learning_rate": 1.3904602161270394e-05, + "loss": 1.4584, + "step": 34806 + }, + { + "epoch": 0.4351108777719443, + "grad_norm": 4.546311378479004, + "learning_rate": 1.3903798731750793e-05, + "loss": 1.3229, + "step": 34808 + }, + { + "epoch": 0.43513587839695994, + "grad_norm": 1.8801074028015137, + "learning_rate": 1.3902995272500415e-05, + "loss": 0.3472, + "step": 34810 + }, + { + "epoch": 0.43516087902197553, + "grad_norm": 3.8026347160339355, + "learning_rate": 1.3902191783525374e-05, + "loss": 1.6614, + "step": 34812 + }, + { + "epoch": 0.4351858796469912, + "grad_norm": 2.2986929416656494, + "learning_rate": 1.3901388264831792e-05, + "loss": 0.8509, + "step": 34814 + }, + { + "epoch": 0.4352108802720068, + "grad_norm": 0.020041391253471375, + "learning_rate": 1.3900584716425793e-05, + "loss": 0.0791, + "step": 34816 + }, + { + "epoch": 0.4352358808970224, + "grad_norm": 2.8975462913513184, + "learning_rate": 1.389978113831349e-05, + "loss": 0.324, + "step": 34818 + }, + { + "epoch": 0.43526088152203807, + "grad_norm": 4.390117168426514, + "learning_rate": 1.3898977530501006e-05, + "loss": 1.3416, + "step": 34820 + }, + { + "epoch": 0.43528588214705366, + "grad_norm": 0.0026586251333355904, + "learning_rate": 1.3898173892994462e-05, + "loss": 0.0141, + "step": 34822 + }, + { + "epoch": 0.4353108827720693, + "grad_norm": 2.8384487628936768, + "learning_rate": 1.3897370225799973e-05, + "loss": 1.161, + "step": 34824 + }, + { + "epoch": 0.4353358833970849, + "grad_norm": 3.054126262664795, + "learning_rate": 1.3896566528923665e-05, + "loss": 1.319, + "step": 34826 + }, + { + "epoch": 0.43536088402210055, + "grad_norm": 5.606430530548096, + "learning_rate": 1.3895762802371659e-05, + "loss": 1.4783, + "step": 34828 + }, + { + "epoch": 0.4353858846471162, + "grad_norm": 0.0034286193549633026, + "learning_rate": 1.3894959046150071e-05, + "loss": 0.5048, + "step": 34830 + }, + { + "epoch": 0.4354108852721318, + "grad_norm": 2.939788579940796, + "learning_rate": 1.3894155260265027e-05, + "loss": 0.7286, + "step": 34832 + }, + { + "epoch": 0.43543588589714743, + "grad_norm": 3.8652994632720947, + "learning_rate": 1.3893351444722649e-05, + "loss": 1.9223, + "step": 34834 + }, + { + "epoch": 0.435460886522163, + "grad_norm": 2.364232063293457, + "learning_rate": 1.3892547599529056e-05, + "loss": 0.58, + "step": 34836 + }, + { + "epoch": 0.4354858871471787, + "grad_norm": 0.7924213409423828, + "learning_rate": 1.389174372469037e-05, + "loss": 0.7695, + "step": 34838 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 3.798989772796631, + "learning_rate": 1.3890939820212714e-05, + "loss": 0.9964, + "step": 34840 + }, + { + "epoch": 0.4355358883972099, + "grad_norm": 4.051905632019043, + "learning_rate": 1.3890135886102213e-05, + "loss": 1.4167, + "step": 34842 + }, + { + "epoch": 0.43556088902222556, + "grad_norm": 2.54910945892334, + "learning_rate": 1.3889331922364985e-05, + "loss": 0.3499, + "step": 34844 + }, + { + "epoch": 0.43558588964724115, + "grad_norm": 2.483612060546875, + "learning_rate": 1.388852792900716e-05, + "loss": 1.0721, + "step": 34846 + }, + { + "epoch": 0.4356108902722568, + "grad_norm": 4.193725109100342, + "learning_rate": 1.388772390603485e-05, + "loss": 0.6833, + "step": 34848 + }, + { + "epoch": 0.43563589089727245, + "grad_norm": 0.0008603695314377546, + "learning_rate": 1.3886919853454183e-05, + "loss": 0.0001, + "step": 34850 + }, + { + "epoch": 0.43566089152228804, + "grad_norm": 2.3825066089630127, + "learning_rate": 1.388611577127129e-05, + "loss": 1.0458, + "step": 34852 + }, + { + "epoch": 0.4356858921473037, + "grad_norm": 4.452683448791504, + "learning_rate": 1.3885311659492286e-05, + "loss": 1.8752, + "step": 34854 + }, + { + "epoch": 0.4357108927723193, + "grad_norm": 13.791706085205078, + "learning_rate": 1.3884507518123301e-05, + "loss": 0.4071, + "step": 34856 + }, + { + "epoch": 0.43573589339733493, + "grad_norm": 2.8082854747772217, + "learning_rate": 1.3883703347170452e-05, + "loss": 1.1164, + "step": 34858 + }, + { + "epoch": 0.4357608940223506, + "grad_norm": 9.324535369873047, + "learning_rate": 1.3882899146639868e-05, + "loss": 1.1835, + "step": 34860 + }, + { + "epoch": 0.43578589464736617, + "grad_norm": 0.012951191514730453, + "learning_rate": 1.3882094916537675e-05, + "loss": 0.2003, + "step": 34862 + }, + { + "epoch": 0.4358108952723818, + "grad_norm": 4.3182148933410645, + "learning_rate": 1.3881290656869993e-05, + "loss": 0.8123, + "step": 34864 + }, + { + "epoch": 0.4358358958973974, + "grad_norm": 4.898362159729004, + "learning_rate": 1.3880486367642952e-05, + "loss": 1.8802, + "step": 34866 + }, + { + "epoch": 0.43586089652241305, + "grad_norm": 4.969032287597656, + "learning_rate": 1.3879682048862679e-05, + "loss": 1.2928, + "step": 34868 + }, + { + "epoch": 0.4358858971474287, + "grad_norm": 1.6773775815963745, + "learning_rate": 1.3878877700535291e-05, + "loss": 0.4283, + "step": 34870 + }, + { + "epoch": 0.4359108977724443, + "grad_norm": 2.915550947189331, + "learning_rate": 1.3878073322666921e-05, + "loss": 0.9016, + "step": 34872 + }, + { + "epoch": 0.43593589839745994, + "grad_norm": 0.0017060894751921296, + "learning_rate": 1.3877268915263693e-05, + "loss": 0.2164, + "step": 34874 + }, + { + "epoch": 0.43596089902247553, + "grad_norm": 0.08061697334051132, + "learning_rate": 1.3876464478331733e-05, + "loss": 0.0052, + "step": 34876 + }, + { + "epoch": 0.4359858996474912, + "grad_norm": 4.901693820953369, + "learning_rate": 1.3875660011877168e-05, + "loss": 0.7255, + "step": 34878 + }, + { + "epoch": 0.43601090027250683, + "grad_norm": 1.632619857788086, + "learning_rate": 1.3874855515906123e-05, + "loss": 0.5895, + "step": 34880 + }, + { + "epoch": 0.4360359008975224, + "grad_norm": 3.0665135383605957, + "learning_rate": 1.387405099042473e-05, + "loss": 0.7483, + "step": 34882 + }, + { + "epoch": 0.43606090152253807, + "grad_norm": 4.964548587799072, + "learning_rate": 1.387324643543911e-05, + "loss": 0.9649, + "step": 34884 + }, + { + "epoch": 0.43608590214755366, + "grad_norm": 4.241169452667236, + "learning_rate": 1.3872441850955394e-05, + "loss": 1.4643, + "step": 34886 + }, + { + "epoch": 0.4361109027725693, + "grad_norm": 2.887265682220459, + "learning_rate": 1.3871637236979707e-05, + "loss": 0.8579, + "step": 34888 + }, + { + "epoch": 0.43613590339758496, + "grad_norm": 0.004788892809301615, + "learning_rate": 1.3870832593518177e-05, + "loss": 0.8531, + "step": 34890 + }, + { + "epoch": 0.43616090402260055, + "grad_norm": 4.995781421661377, + "learning_rate": 1.3870027920576937e-05, + "loss": 0.9584, + "step": 34892 + }, + { + "epoch": 0.4361859046476162, + "grad_norm": 3.86041522026062, + "learning_rate": 1.386922321816211e-05, + "loss": 0.9994, + "step": 34894 + }, + { + "epoch": 0.4362109052726318, + "grad_norm": 2.618058919906616, + "learning_rate": 1.3868418486279826e-05, + "loss": 1.7613, + "step": 34896 + }, + { + "epoch": 0.43623590589764744, + "grad_norm": 3.2743141651153564, + "learning_rate": 1.3867613724936214e-05, + "loss": 0.6384, + "step": 34898 + }, + { + "epoch": 0.4362609065226631, + "grad_norm": 4.234291076660156, + "learning_rate": 1.3866808934137402e-05, + "loss": 1.5659, + "step": 34900 + }, + { + "epoch": 0.4362859071476787, + "grad_norm": 3.2267022132873535, + "learning_rate": 1.3866004113889519e-05, + "loss": 1.4835, + "step": 34902 + }, + { + "epoch": 0.4363109077726943, + "grad_norm": 6.918112277984619, + "learning_rate": 1.3865199264198697e-05, + "loss": 1.064, + "step": 34904 + }, + { + "epoch": 0.4363359083977099, + "grad_norm": 4.5203070640563965, + "learning_rate": 1.3864394385071065e-05, + "loss": 0.8495, + "step": 34906 + }, + { + "epoch": 0.43636090902272556, + "grad_norm": 3.0262935161590576, + "learning_rate": 1.386358947651275e-05, + "loss": 0.0804, + "step": 34908 + }, + { + "epoch": 0.4363859096477412, + "grad_norm": 3.375903844833374, + "learning_rate": 1.3862784538529885e-05, + "loss": 0.9328, + "step": 34910 + }, + { + "epoch": 0.4364109102727568, + "grad_norm": 3.6899003982543945, + "learning_rate": 1.3861979571128598e-05, + "loss": 1.0839, + "step": 34912 + }, + { + "epoch": 0.43643591089777245, + "grad_norm": 1.6742057800292969, + "learning_rate": 1.3861174574315022e-05, + "loss": 0.847, + "step": 34914 + }, + { + "epoch": 0.43646091152278804, + "grad_norm": 3.501249074935913, + "learning_rate": 1.3860369548095287e-05, + "loss": 0.6766, + "step": 34916 + }, + { + "epoch": 0.4364859121478037, + "grad_norm": 12.240201950073242, + "learning_rate": 1.3859564492475522e-05, + "loss": 1.0042, + "step": 34918 + }, + { + "epoch": 0.43651091277281934, + "grad_norm": 3.0984678268432617, + "learning_rate": 1.3858759407461862e-05, + "loss": 0.5757, + "step": 34920 + }, + { + "epoch": 0.43653591339783493, + "grad_norm": 3.9717369079589844, + "learning_rate": 1.3857954293060434e-05, + "loss": 0.7144, + "step": 34922 + }, + { + "epoch": 0.4365609140228506, + "grad_norm": 2.831186056137085, + "learning_rate": 1.3857149149277372e-05, + "loss": 0.7083, + "step": 34924 + }, + { + "epoch": 0.43658591464786617, + "grad_norm": 2.689938545227051, + "learning_rate": 1.3856343976118807e-05, + "loss": 1.9125, + "step": 34926 + }, + { + "epoch": 0.4366109152728818, + "grad_norm": 3.101799964904785, + "learning_rate": 1.3855538773590871e-05, + "loss": 0.9443, + "step": 34928 + }, + { + "epoch": 0.43663591589789746, + "grad_norm": 2.2153427600860596, + "learning_rate": 1.3854733541699696e-05, + "loss": 1.3747, + "step": 34930 + }, + { + "epoch": 0.43666091652291306, + "grad_norm": 3.74930739402771, + "learning_rate": 1.3853928280451419e-05, + "loss": 1.3056, + "step": 34932 + }, + { + "epoch": 0.4366859171479287, + "grad_norm": 1.9561909437179565, + "learning_rate": 1.385312298985217e-05, + "loss": 0.6124, + "step": 34934 + }, + { + "epoch": 0.4367109177729443, + "grad_norm": 4.236959457397461, + "learning_rate": 1.3852317669908078e-05, + "loss": 1.6345, + "step": 34936 + }, + { + "epoch": 0.43673591839795994, + "grad_norm": 1.6364039182662964, + "learning_rate": 1.385151232062528e-05, + "loss": 0.6779, + "step": 34938 + }, + { + "epoch": 0.4367609190229756, + "grad_norm": 2.684382915496826, + "learning_rate": 1.3850706942009909e-05, + "loss": 0.4604, + "step": 34940 + }, + { + "epoch": 0.4367859196479912, + "grad_norm": 9.86386775970459, + "learning_rate": 1.3849901534068097e-05, + "loss": 0.926, + "step": 34942 + }, + { + "epoch": 0.43681092027300683, + "grad_norm": 3.8147075176239014, + "learning_rate": 1.3849096096805981e-05, + "loss": 0.9065, + "step": 34944 + }, + { + "epoch": 0.4368359208980224, + "grad_norm": 0.05155714601278305, + "learning_rate": 1.3848290630229694e-05, + "loss": 0.6444, + "step": 34946 + }, + { + "epoch": 0.43686092152303807, + "grad_norm": 2.5976414680480957, + "learning_rate": 1.3847485134345368e-05, + "loss": 0.2328, + "step": 34948 + }, + { + "epoch": 0.4368859221480537, + "grad_norm": 4.152181625366211, + "learning_rate": 1.3846679609159141e-05, + "loss": 1.381, + "step": 34950 + }, + { + "epoch": 0.4369109227730693, + "grad_norm": 2.944277763366699, + "learning_rate": 1.3845874054677144e-05, + "loss": 1.727, + "step": 34952 + }, + { + "epoch": 0.43693592339808496, + "grad_norm": 3.2746267318725586, + "learning_rate": 1.3845068470905515e-05, + "loss": 1.4908, + "step": 34954 + }, + { + "epoch": 0.43696092402310055, + "grad_norm": 3.413222074508667, + "learning_rate": 1.384426285785039e-05, + "loss": 1.8229, + "step": 34956 + }, + { + "epoch": 0.4369859246481162, + "grad_norm": 8.291021347045898, + "learning_rate": 1.3843457215517902e-05, + "loss": 1.3858, + "step": 34958 + }, + { + "epoch": 0.43701092527313184, + "grad_norm": 1.8042678833007812, + "learning_rate": 1.3842651543914187e-05, + "loss": 0.8202, + "step": 34960 + }, + { + "epoch": 0.43703592589814744, + "grad_norm": 3.8135316371917725, + "learning_rate": 1.384184584304538e-05, + "loss": 1.2608, + "step": 34962 + }, + { + "epoch": 0.4370609265231631, + "grad_norm": 3.8887221813201904, + "learning_rate": 1.3841040112917616e-05, + "loss": 1.6513, + "step": 34964 + }, + { + "epoch": 0.4370859271481787, + "grad_norm": 1.032538652420044, + "learning_rate": 1.384023435353704e-05, + "loss": 0.456, + "step": 34966 + }, + { + "epoch": 0.4371109277731943, + "grad_norm": 6.033564567565918, + "learning_rate": 1.3839428564909781e-05, + "loss": 2.9253, + "step": 34968 + }, + { + "epoch": 0.43713592839820997, + "grad_norm": 0.0029964616987854242, + "learning_rate": 1.3838622747041975e-05, + "loss": 0.3035, + "step": 34970 + }, + { + "epoch": 0.43716092902322556, + "grad_norm": 9.080918312072754, + "learning_rate": 1.3837816899939762e-05, + "loss": 1.0457, + "step": 34972 + }, + { + "epoch": 0.4371859296482412, + "grad_norm": 8.11938190460205, + "learning_rate": 1.3837011023609276e-05, + "loss": 2.2516, + "step": 34974 + }, + { + "epoch": 0.4372109302732568, + "grad_norm": 3.081238031387329, + "learning_rate": 1.3836205118056659e-05, + "loss": 0.5334, + "step": 34976 + }, + { + "epoch": 0.43723593089827245, + "grad_norm": 16.135128021240234, + "learning_rate": 1.383539918328804e-05, + "loss": 1.7517, + "step": 34978 + }, + { + "epoch": 0.4372609315232881, + "grad_norm": 5.544651508331299, + "learning_rate": 1.3834593219309573e-05, + "loss": 1.3715, + "step": 34980 + }, + { + "epoch": 0.4372859321483037, + "grad_norm": 3.245215892791748, + "learning_rate": 1.3833787226127379e-05, + "loss": 1.4196, + "step": 34982 + }, + { + "epoch": 0.43731093277331934, + "grad_norm": 3.4091594219207764, + "learning_rate": 1.3832981203747608e-05, + "loss": 1.5478, + "step": 34984 + }, + { + "epoch": 0.43733593339833493, + "grad_norm": 9.296104431152344, + "learning_rate": 1.3832175152176395e-05, + "loss": 1.1409, + "step": 34986 + }, + { + "epoch": 0.4373609340233506, + "grad_norm": 5.254392147064209, + "learning_rate": 1.3831369071419873e-05, + "loss": 1.4183, + "step": 34988 + }, + { + "epoch": 0.4373859346483662, + "grad_norm": 2.6867220401763916, + "learning_rate": 1.3830562961484187e-05, + "loss": 0.6684, + "step": 34990 + }, + { + "epoch": 0.4374109352733818, + "grad_norm": 3.6779849529266357, + "learning_rate": 1.382975682237548e-05, + "loss": 1.6208, + "step": 34992 + }, + { + "epoch": 0.43743593589839747, + "grad_norm": 2.5727274417877197, + "learning_rate": 1.3828950654099881e-05, + "loss": 0.4508, + "step": 34994 + }, + { + "epoch": 0.43746093652341306, + "grad_norm": 0.13529875874519348, + "learning_rate": 1.3828144456663539e-05, + "loss": 0.3494, + "step": 34996 + }, + { + "epoch": 0.4374859371484287, + "grad_norm": 0.002377106575295329, + "learning_rate": 1.382733823007259e-05, + "loss": 0.1469, + "step": 34998 + }, + { + "epoch": 0.43751093777344435, + "grad_norm": 4.185694694519043, + "learning_rate": 1.3826531974333173e-05, + "loss": 1.2063, + "step": 35000 + }, + { + "epoch": 0.43753593839845994, + "grad_norm": 5.146781921386719, + "learning_rate": 1.3825725689451428e-05, + "loss": 0.8079, + "step": 35002 + }, + { + "epoch": 0.4375609390234756, + "grad_norm": 2.860032558441162, + "learning_rate": 1.3824919375433497e-05, + "loss": 0.6338, + "step": 35004 + }, + { + "epoch": 0.43758593964849124, + "grad_norm": 4.896488666534424, + "learning_rate": 1.3824113032285524e-05, + "loss": 1.0855, + "step": 35006 + }, + { + "epoch": 0.43761094027350683, + "grad_norm": 1.7310937643051147, + "learning_rate": 1.3823306660013646e-05, + "loss": 1.6298, + "step": 35008 + }, + { + "epoch": 0.4376359408985225, + "grad_norm": 2.524906873703003, + "learning_rate": 1.3822500258624006e-05, + "loss": 0.5591, + "step": 35010 + }, + { + "epoch": 0.43766094152353807, + "grad_norm": 0.03081091307103634, + "learning_rate": 1.3821693828122745e-05, + "loss": 0.5002, + "step": 35012 + }, + { + "epoch": 0.4376859421485537, + "grad_norm": 7.850461959838867, + "learning_rate": 1.3820887368516e-05, + "loss": 2.9208, + "step": 35014 + }, + { + "epoch": 0.43771094277356937, + "grad_norm": 3.713528871536255, + "learning_rate": 1.382008087980992e-05, + "loss": 1.1795, + "step": 35016 + }, + { + "epoch": 0.43773594339858496, + "grad_norm": 0.0022701923735439777, + "learning_rate": 1.3819274362010644e-05, + "loss": 0.3832, + "step": 35018 + }, + { + "epoch": 0.4377609440236006, + "grad_norm": 2.24794864654541, + "learning_rate": 1.3818467815124315e-05, + "loss": 0.2032, + "step": 35020 + }, + { + "epoch": 0.4377859446486162, + "grad_norm": 3.9601316452026367, + "learning_rate": 1.3817661239157074e-05, + "loss": 1.4262, + "step": 35022 + }, + { + "epoch": 0.43781094527363185, + "grad_norm": 0.6626203656196594, + "learning_rate": 1.3816854634115067e-05, + "loss": 0.4439, + "step": 35024 + }, + { + "epoch": 0.4378359458986475, + "grad_norm": 4.5063958168029785, + "learning_rate": 1.3816048000004433e-05, + "loss": 1.3779, + "step": 35026 + }, + { + "epoch": 0.4378609465236631, + "grad_norm": 0.898736834526062, + "learning_rate": 1.3815241336831315e-05, + "loss": 0.6056, + "step": 35028 + }, + { + "epoch": 0.43788594714867873, + "grad_norm": 4.7241902351379395, + "learning_rate": 1.3814434644601862e-05, + "loss": 2.5416, + "step": 35030 + }, + { + "epoch": 0.4379109477736943, + "grad_norm": 12.024002075195312, + "learning_rate": 1.3813627923322215e-05, + "loss": 0.8835, + "step": 35032 + }, + { + "epoch": 0.43793594839871, + "grad_norm": 0.004900166764855385, + "learning_rate": 1.3812821172998514e-05, + "loss": 0.4263, + "step": 35034 + }, + { + "epoch": 0.4379609490237256, + "grad_norm": 4.811800003051758, + "learning_rate": 1.3812014393636908e-05, + "loss": 1.3221, + "step": 35036 + }, + { + "epoch": 0.4379859496487412, + "grad_norm": 1.1321063041687012, + "learning_rate": 1.3811207585243537e-05, + "loss": 0.0993, + "step": 35038 + }, + { + "epoch": 0.43801095027375686, + "grad_norm": 3.0553138256073, + "learning_rate": 1.3810400747824551e-05, + "loss": 1.3472, + "step": 35040 + }, + { + "epoch": 0.43803595089877245, + "grad_norm": 3.013784170150757, + "learning_rate": 1.380959388138609e-05, + "loss": 1.462, + "step": 35042 + }, + { + "epoch": 0.4380609515237881, + "grad_norm": 1.0366796255111694, + "learning_rate": 1.3808786985934303e-05, + "loss": 0.2344, + "step": 35044 + }, + { + "epoch": 0.43808595214880375, + "grad_norm": 9.28918170928955, + "learning_rate": 1.380798006147533e-05, + "loss": 2.5225, + "step": 35046 + }, + { + "epoch": 0.43811095277381934, + "grad_norm": 1.0416409969329834, + "learning_rate": 1.3807173108015319e-05, + "loss": 1.4755, + "step": 35048 + }, + { + "epoch": 0.438135953398835, + "grad_norm": 1.3575594425201416, + "learning_rate": 1.380636612556042e-05, + "loss": 0.1157, + "step": 35050 + }, + { + "epoch": 0.4381609540238506, + "grad_norm": 3.6233327388763428, + "learning_rate": 1.3805559114116771e-05, + "loss": 1.6984, + "step": 35052 + }, + { + "epoch": 0.4381859546488662, + "grad_norm": 3.574071168899536, + "learning_rate": 1.3804752073690524e-05, + "loss": 0.5399, + "step": 35054 + }, + { + "epoch": 0.4382109552738819, + "grad_norm": 0.0021701387595385313, + "learning_rate": 1.380394500428782e-05, + "loss": 0.4912, + "step": 35056 + }, + { + "epoch": 0.43823595589889747, + "grad_norm": 5.943939685821533, + "learning_rate": 1.3803137905914815e-05, + "loss": 2.0756, + "step": 35058 + }, + { + "epoch": 0.4382609565239131, + "grad_norm": 4.682657241821289, + "learning_rate": 1.3802330778577644e-05, + "loss": 2.0865, + "step": 35060 + }, + { + "epoch": 0.4382859571489287, + "grad_norm": 3.6732664108276367, + "learning_rate": 1.380152362228246e-05, + "loss": 0.7547, + "step": 35062 + }, + { + "epoch": 0.43831095777394435, + "grad_norm": 3.8910622596740723, + "learning_rate": 1.3800716437035413e-05, + "loss": 1.5389, + "step": 35064 + }, + { + "epoch": 0.43833595839896, + "grad_norm": 4.367227077484131, + "learning_rate": 1.3799909222842645e-05, + "loss": 0.7727, + "step": 35066 + }, + { + "epoch": 0.4383609590239756, + "grad_norm": 4.284573078155518, + "learning_rate": 1.3799101979710303e-05, + "loss": 0.9972, + "step": 35068 + }, + { + "epoch": 0.43838595964899124, + "grad_norm": 4.32543420791626, + "learning_rate": 1.3798294707644541e-05, + "loss": 1.243, + "step": 35070 + }, + { + "epoch": 0.43841096027400683, + "grad_norm": 7.308209419250488, + "learning_rate": 1.3797487406651505e-05, + "loss": 1.3553, + "step": 35072 + }, + { + "epoch": 0.4384359608990225, + "grad_norm": 5.2765326499938965, + "learning_rate": 1.379668007673734e-05, + "loss": 1.3367, + "step": 35074 + }, + { + "epoch": 0.43846096152403813, + "grad_norm": 3.4144482612609863, + "learning_rate": 1.3795872717908197e-05, + "loss": 0.5883, + "step": 35076 + }, + { + "epoch": 0.4384859621490537, + "grad_norm": 2.439274549484253, + "learning_rate": 1.3795065330170221e-05, + "loss": 0.4027, + "step": 35078 + }, + { + "epoch": 0.43851096277406937, + "grad_norm": 3.6861133575439453, + "learning_rate": 1.3794257913529567e-05, + "loss": 0.4562, + "step": 35080 + }, + { + "epoch": 0.43853596339908496, + "grad_norm": 2.816620349884033, + "learning_rate": 1.379345046799238e-05, + "loss": 1.3746, + "step": 35082 + }, + { + "epoch": 0.4385609640241006, + "grad_norm": 5.893645763397217, + "learning_rate": 1.3792642993564813e-05, + "loss": 0.8606, + "step": 35084 + }, + { + "epoch": 0.43858596464911626, + "grad_norm": 2.924100637435913, + "learning_rate": 1.3791835490253012e-05, + "loss": 0.5495, + "step": 35086 + }, + { + "epoch": 0.43861096527413185, + "grad_norm": 2.937544107437134, + "learning_rate": 1.3791027958063128e-05, + "loss": 0.7632, + "step": 35088 + }, + { + "epoch": 0.4386359658991475, + "grad_norm": 6.099675178527832, + "learning_rate": 1.379022039700131e-05, + "loss": 1.0116, + "step": 35090 + }, + { + "epoch": 0.4386609665241631, + "grad_norm": 1.9627678394317627, + "learning_rate": 1.378941280707371e-05, + "loss": 0.2891, + "step": 35092 + }, + { + "epoch": 0.43868596714917873, + "grad_norm": 3.3899552822113037, + "learning_rate": 1.3788605188286479e-05, + "loss": 1.1606, + "step": 35094 + }, + { + "epoch": 0.4387109677741944, + "grad_norm": 9.158976554870605, + "learning_rate": 1.3787797540645769e-05, + "loss": 1.0737, + "step": 35096 + }, + { + "epoch": 0.43873596839921, + "grad_norm": 5.460868835449219, + "learning_rate": 1.3786989864157724e-05, + "loss": 2.4524, + "step": 35098 + }, + { + "epoch": 0.4387609690242256, + "grad_norm": 0.01959763467311859, + "learning_rate": 1.3786182158828503e-05, + "loss": 0.0011, + "step": 35100 + }, + { + "epoch": 0.4387859696492412, + "grad_norm": 1.9325124025344849, + "learning_rate": 1.3785374424664251e-05, + "loss": 0.5549, + "step": 35102 + }, + { + "epoch": 0.43881097027425686, + "grad_norm": 3.934532880783081, + "learning_rate": 1.3784566661671124e-05, + "loss": 0.6979, + "step": 35104 + }, + { + "epoch": 0.4388359708992725, + "grad_norm": 5.321405410766602, + "learning_rate": 1.3783758869855273e-05, + "loss": 1.7409, + "step": 35106 + }, + { + "epoch": 0.4388609715242881, + "grad_norm": 3.303124189376831, + "learning_rate": 1.3782951049222847e-05, + "loss": 0.4318, + "step": 35108 + }, + { + "epoch": 0.43888597214930375, + "grad_norm": 0.0014659571461379528, + "learning_rate": 1.3782143199780004e-05, + "loss": 0.2902, + "step": 35110 + }, + { + "epoch": 0.43891097277431934, + "grad_norm": 2.8843729496002197, + "learning_rate": 1.3781335321532894e-05, + "loss": 0.7218, + "step": 35112 + }, + { + "epoch": 0.438935973399335, + "grad_norm": 12.598807334899902, + "learning_rate": 1.3780527414487668e-05, + "loss": 1.4409, + "step": 35114 + }, + { + "epoch": 0.43896097402435064, + "grad_norm": 5.603188991546631, + "learning_rate": 1.3779719478650477e-05, + "loss": 1.0391, + "step": 35116 + }, + { + "epoch": 0.43898597464936623, + "grad_norm": 2.4346790313720703, + "learning_rate": 1.377891151402748e-05, + "loss": 0.9491, + "step": 35118 + }, + { + "epoch": 0.4390109752743819, + "grad_norm": 3.907836437225342, + "learning_rate": 1.3778103520624826e-05, + "loss": 1.4234, + "step": 35120 + }, + { + "epoch": 0.43903597589939747, + "grad_norm": 2.4656713008880615, + "learning_rate": 1.3777295498448671e-05, + "loss": 1.2632, + "step": 35122 + }, + { + "epoch": 0.4390609765244131, + "grad_norm": 4.7444233894348145, + "learning_rate": 1.3776487447505167e-05, + "loss": 1.9074, + "step": 35124 + }, + { + "epoch": 0.43908597714942876, + "grad_norm": 0.0025376162957400084, + "learning_rate": 1.3775679367800468e-05, + "loss": 0.796, + "step": 35126 + }, + { + "epoch": 0.43911097777444436, + "grad_norm": 3.4664556980133057, + "learning_rate": 1.377487125934073e-05, + "loss": 1.7262, + "step": 35128 + }, + { + "epoch": 0.43913597839946, + "grad_norm": 3.0746617317199707, + "learning_rate": 1.3774063122132105e-05, + "loss": 0.6604, + "step": 35130 + }, + { + "epoch": 0.4391609790244756, + "grad_norm": 6.8333868980407715, + "learning_rate": 1.3773254956180749e-05, + "loss": 2.1805, + "step": 35132 + }, + { + "epoch": 0.43918597964949124, + "grad_norm": 0.0007161159301176667, + "learning_rate": 1.3772446761492817e-05, + "loss": 0.6969, + "step": 35134 + }, + { + "epoch": 0.4392109802745069, + "grad_norm": 2.4950034618377686, + "learning_rate": 1.3771638538074465e-05, + "loss": 1.439, + "step": 35136 + }, + { + "epoch": 0.4392359808995225, + "grad_norm": 5.562235355377197, + "learning_rate": 1.3770830285931847e-05, + "loss": 1.275, + "step": 35138 + }, + { + "epoch": 0.43926098152453813, + "grad_norm": 2.2732369899749756, + "learning_rate": 1.3770022005071117e-05, + "loss": 0.0762, + "step": 35140 + }, + { + "epoch": 0.4392859821495537, + "grad_norm": 10.858701705932617, + "learning_rate": 1.3769213695498434e-05, + "loss": 1.1503, + "step": 35142 + }, + { + "epoch": 0.43931098277456937, + "grad_norm": 4.11664342880249, + "learning_rate": 1.3768405357219952e-05, + "loss": 1.3255, + "step": 35144 + }, + { + "epoch": 0.439335983399585, + "grad_norm": 7.424078464508057, + "learning_rate": 1.3767596990241827e-05, + "loss": 0.9873, + "step": 35146 + }, + { + "epoch": 0.4393609840246006, + "grad_norm": 3.1004185676574707, + "learning_rate": 1.376678859457022e-05, + "loss": 1.3828, + "step": 35148 + }, + { + "epoch": 0.43938598464961626, + "grad_norm": 2.1047725677490234, + "learning_rate": 1.376598017021128e-05, + "loss": 1.6391, + "step": 35150 + }, + { + "epoch": 0.43941098527463185, + "grad_norm": 3.2686612606048584, + "learning_rate": 1.3765171717171169e-05, + "loss": 1.3398, + "step": 35152 + }, + { + "epoch": 0.4394359858996475, + "grad_norm": 3.406592845916748, + "learning_rate": 1.376436323545604e-05, + "loss": 0.5858, + "step": 35154 + }, + { + "epoch": 0.43946098652466314, + "grad_norm": 3.2923669815063477, + "learning_rate": 1.3763554725072057e-05, + "loss": 1.6809, + "step": 35156 + }, + { + "epoch": 0.43948598714967874, + "grad_norm": 4.530613422393799, + "learning_rate": 1.3762746186025371e-05, + "loss": 0.9081, + "step": 35158 + }, + { + "epoch": 0.4395109877746944, + "grad_norm": 4.466556549072266, + "learning_rate": 1.3761937618322141e-05, + "loss": 1.8433, + "step": 35160 + }, + { + "epoch": 0.43953598839971, + "grad_norm": 2.204314947128296, + "learning_rate": 1.3761129021968528e-05, + "loss": 0.4208, + "step": 35162 + }, + { + "epoch": 0.4395609890247256, + "grad_norm": 3.122218370437622, + "learning_rate": 1.376032039697069e-05, + "loss": 0.4649, + "step": 35164 + }, + { + "epoch": 0.43958598964974127, + "grad_norm": 4.086790561676025, + "learning_rate": 1.3759511743334779e-05, + "loss": 0.7084, + "step": 35166 + }, + { + "epoch": 0.43961099027475686, + "grad_norm": 2.9270904064178467, + "learning_rate": 1.3758703061066963e-05, + "loss": 0.942, + "step": 35168 + }, + { + "epoch": 0.4396359908997725, + "grad_norm": 0.0015401440905407071, + "learning_rate": 1.3757894350173393e-05, + "loss": 0.7337, + "step": 35170 + }, + { + "epoch": 0.4396609915247881, + "grad_norm": 3.845954656600952, + "learning_rate": 1.3757085610660231e-05, + "loss": 0.7784, + "step": 35172 + }, + { + "epoch": 0.43968599214980375, + "grad_norm": 2.4823012351989746, + "learning_rate": 1.375627684253364e-05, + "loss": 0.9399, + "step": 35174 + }, + { + "epoch": 0.4397109927748194, + "grad_norm": 2.1232597827911377, + "learning_rate": 1.3755468045799771e-05, + "loss": 0.4925, + "step": 35176 + }, + { + "epoch": 0.439735993399835, + "grad_norm": 2.9885592460632324, + "learning_rate": 1.375465922046479e-05, + "loss": 0.6607, + "step": 35178 + }, + { + "epoch": 0.43976099402485064, + "grad_norm": 3.9743940830230713, + "learning_rate": 1.3753850366534853e-05, + "loss": 1.6398, + "step": 35180 + }, + { + "epoch": 0.43978599464986623, + "grad_norm": 3.948007583618164, + "learning_rate": 1.3753041484016128e-05, + "loss": 1.42, + "step": 35182 + }, + { + "epoch": 0.4398109952748819, + "grad_norm": 4.362400531768799, + "learning_rate": 1.3752232572914767e-05, + "loss": 1.0135, + "step": 35184 + }, + { + "epoch": 0.4398359958998975, + "grad_norm": 4.355003356933594, + "learning_rate": 1.3751423633236934e-05, + "loss": 1.0396, + "step": 35186 + }, + { + "epoch": 0.4398609965249131, + "grad_norm": 2.615912675857544, + "learning_rate": 1.3750614664988789e-05, + "loss": 1.0023, + "step": 35188 + }, + { + "epoch": 0.43988599714992876, + "grad_norm": 4.627819538116455, + "learning_rate": 1.3749805668176493e-05, + "loss": 0.8687, + "step": 35190 + }, + { + "epoch": 0.43991099777494436, + "grad_norm": 3.40380859375, + "learning_rate": 1.3748996642806203e-05, + "loss": 0.6583, + "step": 35192 + }, + { + "epoch": 0.43993599839996, + "grad_norm": 0.009301550686359406, + "learning_rate": 1.374818758888409e-05, + "loss": 0.0005, + "step": 35194 + }, + { + "epoch": 0.43996099902497565, + "grad_norm": 4.211658954620361, + "learning_rate": 1.374737850641631e-05, + "loss": 0.8746, + "step": 35196 + }, + { + "epoch": 0.43998599964999124, + "grad_norm": 2.9966626167297363, + "learning_rate": 1.3746569395409024e-05, + "loss": 1.2133, + "step": 35198 + }, + { + "epoch": 0.4400110002750069, + "grad_norm": 2.082125425338745, + "learning_rate": 1.3745760255868396e-05, + "loss": 0.5577, + "step": 35200 + }, + { + "epoch": 0.4400360009000225, + "grad_norm": 6.892858982086182, + "learning_rate": 1.3744951087800587e-05, + "loss": 1.6545, + "step": 35202 + }, + { + "epoch": 0.44006100152503813, + "grad_norm": 3.7847867012023926, + "learning_rate": 1.3744141891211758e-05, + "loss": 0.2895, + "step": 35204 + }, + { + "epoch": 0.4400860021500538, + "grad_norm": 2.35707426071167, + "learning_rate": 1.3743332666108074e-05, + "loss": 1.2421, + "step": 35206 + }, + { + "epoch": 0.44011100277506937, + "grad_norm": 3.555264472961426, + "learning_rate": 1.3742523412495703e-05, + "loss": 1.0734, + "step": 35208 + }, + { + "epoch": 0.440136003400085, + "grad_norm": 1.7110801935195923, + "learning_rate": 1.37417141303808e-05, + "loss": 0.0879, + "step": 35210 + }, + { + "epoch": 0.4401610040251006, + "grad_norm": 2.7049906253814697, + "learning_rate": 1.374090481976953e-05, + "loss": 0.7511, + "step": 35212 + }, + { + "epoch": 0.44018600465011626, + "grad_norm": 3.6334543228149414, + "learning_rate": 1.3740095480668059e-05, + "loss": 0.5568, + "step": 35214 + }, + { + "epoch": 0.4402110052751319, + "grad_norm": 5.525014400482178, + "learning_rate": 1.3739286113082548e-05, + "loss": 2.2071, + "step": 35216 + }, + { + "epoch": 0.4402360059001475, + "grad_norm": 6.37106990814209, + "learning_rate": 1.3738476717019165e-05, + "loss": 0.4138, + "step": 35218 + }, + { + "epoch": 0.44026100652516315, + "grad_norm": 2.9902801513671875, + "learning_rate": 1.3737667292484072e-05, + "loss": 1.2867, + "step": 35220 + }, + { + "epoch": 0.44028600715017874, + "grad_norm": 3.264732837677002, + "learning_rate": 1.3736857839483434e-05, + "loss": 1.2171, + "step": 35222 + }, + { + "epoch": 0.4403110077751944, + "grad_norm": 2.9346730709075928, + "learning_rate": 1.373604835802341e-05, + "loss": 0.3467, + "step": 35224 + }, + { + "epoch": 0.44033600840021003, + "grad_norm": 1.0047422647476196, + "learning_rate": 1.3735238848110178e-05, + "loss": 0.5326, + "step": 35226 + }, + { + "epoch": 0.4403610090252256, + "grad_norm": 0.6330782771110535, + "learning_rate": 1.3734429309749889e-05, + "loss": 0.7345, + "step": 35228 + }, + { + "epoch": 0.4403860096502413, + "grad_norm": 2.9214024543762207, + "learning_rate": 1.3733619742948716e-05, + "loss": 1.0825, + "step": 35230 + }, + { + "epoch": 0.44041101027525686, + "grad_norm": 2.1249046325683594, + "learning_rate": 1.3732810147712823e-05, + "loss": 0.2782, + "step": 35232 + }, + { + "epoch": 0.4404360109002725, + "grad_norm": 2.7440290451049805, + "learning_rate": 1.3732000524048376e-05, + "loss": 1.8064, + "step": 35234 + }, + { + "epoch": 0.44046101152528816, + "grad_norm": 4.029813289642334, + "learning_rate": 1.3731190871961541e-05, + "loss": 0.884, + "step": 35236 + }, + { + "epoch": 0.44048601215030375, + "grad_norm": 2.6896021366119385, + "learning_rate": 1.3730381191458484e-05, + "loss": 0.9552, + "step": 35238 + }, + { + "epoch": 0.4405110127753194, + "grad_norm": 1.4589812755584717, + "learning_rate": 1.3729571482545372e-05, + "loss": 0.618, + "step": 35240 + }, + { + "epoch": 0.440536013400335, + "grad_norm": 0.7824167609214783, + "learning_rate": 1.3728761745228369e-05, + "loss": 0.7868, + "step": 35242 + }, + { + "epoch": 0.44056101402535064, + "grad_norm": 4.768139839172363, + "learning_rate": 1.3727951979513646e-05, + "loss": 1.3326, + "step": 35244 + }, + { + "epoch": 0.4405860146503663, + "grad_norm": 5.345799922943115, + "learning_rate": 1.372714218540737e-05, + "loss": 1.687, + "step": 35246 + }, + { + "epoch": 0.4406110152753819, + "grad_norm": 0.5733428597450256, + "learning_rate": 1.3726332362915702e-05, + "loss": 0.2169, + "step": 35248 + }, + { + "epoch": 0.4406360159003975, + "grad_norm": 0.0012233075685799122, + "learning_rate": 1.3725522512044813e-05, + "loss": 0.0003, + "step": 35250 + }, + { + "epoch": 0.4406610165254131, + "grad_norm": 2.3997461795806885, + "learning_rate": 1.3724712632800873e-05, + "loss": 0.9442, + "step": 35252 + }, + { + "epoch": 0.44068601715042877, + "grad_norm": 4.185387134552002, + "learning_rate": 1.3723902725190051e-05, + "loss": 1.2738, + "step": 35254 + }, + { + "epoch": 0.4407110177754444, + "grad_norm": 3.0987942218780518, + "learning_rate": 1.3723092789218509e-05, + "loss": 0.5114, + "step": 35256 + }, + { + "epoch": 0.44073601840046, + "grad_norm": 2.387860059738159, + "learning_rate": 1.3722282824892419e-05, + "loss": 1.7536, + "step": 35258 + }, + { + "epoch": 0.44076101902547565, + "grad_norm": 2.9886107444763184, + "learning_rate": 1.3721472832217952e-05, + "loss": 0.7862, + "step": 35260 + }, + { + "epoch": 0.44078601965049125, + "grad_norm": 3.989455223083496, + "learning_rate": 1.372066281120127e-05, + "loss": 1.7923, + "step": 35262 + }, + { + "epoch": 0.4408110202755069, + "grad_norm": 1.9441457986831665, + "learning_rate": 1.371985276184855e-05, + "loss": 0.4738, + "step": 35264 + }, + { + "epoch": 0.44083602090052254, + "grad_norm": 1.9167022705078125, + "learning_rate": 1.3719042684165955e-05, + "loss": 1.3215, + "step": 35266 + }, + { + "epoch": 0.44086102152553813, + "grad_norm": 0.0019515200983732939, + "learning_rate": 1.3718232578159658e-05, + "loss": 0.0003, + "step": 35268 + }, + { + "epoch": 0.4408860221505538, + "grad_norm": 2.224165201187134, + "learning_rate": 1.3717422443835829e-05, + "loss": 0.3505, + "step": 35270 + }, + { + "epoch": 0.4409110227755694, + "grad_norm": 0.0012085281778126955, + "learning_rate": 1.3716612281200633e-05, + "loss": 0.4445, + "step": 35272 + }, + { + "epoch": 0.440936023400585, + "grad_norm": 4.360203742980957, + "learning_rate": 1.3715802090260248e-05, + "loss": 1.0357, + "step": 35274 + }, + { + "epoch": 0.44096102402560067, + "grad_norm": 4.55793571472168, + "learning_rate": 1.3714991871020836e-05, + "loss": 1.2228, + "step": 35276 + }, + { + "epoch": 0.44098602465061626, + "grad_norm": 4.190767288208008, + "learning_rate": 1.3714181623488572e-05, + "loss": 1.2777, + "step": 35278 + }, + { + "epoch": 0.4410110252756319, + "grad_norm": 0.005308507010340691, + "learning_rate": 1.3713371347669625e-05, + "loss": 0.8143, + "step": 35280 + }, + { + "epoch": 0.4410360259006475, + "grad_norm": 2.3188834190368652, + "learning_rate": 1.3712561043570167e-05, + "loss": 1.6776, + "step": 35282 + }, + { + "epoch": 0.44106102652566315, + "grad_norm": 6.035617828369141, + "learning_rate": 1.3711750711196372e-05, + "loss": 1.4625, + "step": 35284 + }, + { + "epoch": 0.4410860271506788, + "grad_norm": 0.4031377136707306, + "learning_rate": 1.3710940350554406e-05, + "loss": 0.1171, + "step": 35286 + }, + { + "epoch": 0.4411110277756944, + "grad_norm": 8.337530136108398, + "learning_rate": 1.3710129961650444e-05, + "loss": 1.5055, + "step": 35288 + }, + { + "epoch": 0.44113602840071003, + "grad_norm": 5.364744186401367, + "learning_rate": 1.3709319544490657e-05, + "loss": 1.63, + "step": 35290 + }, + { + "epoch": 0.4411610290257256, + "grad_norm": 0.0011918330565094948, + "learning_rate": 1.3708509099081217e-05, + "loss": 0.2386, + "step": 35292 + }, + { + "epoch": 0.4411860296507413, + "grad_norm": 4.81903076171875, + "learning_rate": 1.3707698625428293e-05, + "loss": 0.9866, + "step": 35294 + }, + { + "epoch": 0.4412110302757569, + "grad_norm": 1.9059323072433472, + "learning_rate": 1.3706888123538064e-05, + "loss": 0.6919, + "step": 35296 + }, + { + "epoch": 0.4412360309007725, + "grad_norm": 2.8348076343536377, + "learning_rate": 1.3706077593416702e-05, + "loss": 0.5714, + "step": 35298 + }, + { + "epoch": 0.44126103152578816, + "grad_norm": 0.0009421090944670141, + "learning_rate": 1.3705267035070373e-05, + "loss": 0.0421, + "step": 35300 + }, + { + "epoch": 0.44128603215080375, + "grad_norm": 1.685758352279663, + "learning_rate": 1.3704456448505254e-05, + "loss": 0.1476, + "step": 35302 + }, + { + "epoch": 0.4413110327758194, + "grad_norm": 4.079225063323975, + "learning_rate": 1.3703645833727518e-05, + "loss": 0.5111, + "step": 35304 + }, + { + "epoch": 0.44133603340083505, + "grad_norm": 3.2152345180511475, + "learning_rate": 1.3702835190743342e-05, + "loss": 1.5207, + "step": 35306 + }, + { + "epoch": 0.44136103402585064, + "grad_norm": 15.610401153564453, + "learning_rate": 1.3702024519558894e-05, + "loss": 1.2373, + "step": 35308 + }, + { + "epoch": 0.4413860346508663, + "grad_norm": 4.923377513885498, + "learning_rate": 1.3701213820180352e-05, + "loss": 1.6891, + "step": 35310 + }, + { + "epoch": 0.4414110352758819, + "grad_norm": 3.7508811950683594, + "learning_rate": 1.370040309261389e-05, + "loss": 0.7651, + "step": 35312 + }, + { + "epoch": 0.4414360359008975, + "grad_norm": 3.221977949142456, + "learning_rate": 1.369959233686568e-05, + "loss": 0.5425, + "step": 35314 + }, + { + "epoch": 0.4414610365259132, + "grad_norm": 4.330747127532959, + "learning_rate": 1.36987815529419e-05, + "loss": 0.5733, + "step": 35316 + }, + { + "epoch": 0.44148603715092877, + "grad_norm": 4.776243686676025, + "learning_rate": 1.3697970740848719e-05, + "loss": 1.1968, + "step": 35318 + }, + { + "epoch": 0.4415110377759444, + "grad_norm": 1.4015758037567139, + "learning_rate": 1.3697159900592319e-05, + "loss": 0.1814, + "step": 35320 + }, + { + "epoch": 0.44153603840096, + "grad_norm": 2.3293535709381104, + "learning_rate": 1.3696349032178868e-05, + "loss": 1.3815, + "step": 35322 + }, + { + "epoch": 0.44156103902597565, + "grad_norm": 3.2524962425231934, + "learning_rate": 1.3695538135614552e-05, + "loss": 0.8265, + "step": 35324 + }, + { + "epoch": 0.4415860396509913, + "grad_norm": 1.980147361755371, + "learning_rate": 1.3694727210905536e-05, + "loss": 0.8999, + "step": 35326 + }, + { + "epoch": 0.4416110402760069, + "grad_norm": 0.12013310194015503, + "learning_rate": 1.3693916258058e-05, + "loss": 0.1424, + "step": 35328 + }, + { + "epoch": 0.44163604090102254, + "grad_norm": 1.7137924432754517, + "learning_rate": 1.369310527707812e-05, + "loss": 0.457, + "step": 35330 + }, + { + "epoch": 0.44166104152603813, + "grad_norm": 2.7732229232788086, + "learning_rate": 1.3692294267972073e-05, + "loss": 0.6928, + "step": 35332 + }, + { + "epoch": 0.4416860421510538, + "grad_norm": 2.038297176361084, + "learning_rate": 1.3691483230746035e-05, + "loss": 0.3277, + "step": 35334 + }, + { + "epoch": 0.44171104277606943, + "grad_norm": 1.9348293542861938, + "learning_rate": 1.3690672165406183e-05, + "loss": 0.8836, + "step": 35336 + }, + { + "epoch": 0.441736043401085, + "grad_norm": 0.01641261577606201, + "learning_rate": 1.3689861071958694e-05, + "loss": 0.8884, + "step": 35338 + }, + { + "epoch": 0.44176104402610067, + "grad_norm": 3.0662882328033447, + "learning_rate": 1.3689049950409745e-05, + "loss": 0.3042, + "step": 35340 + }, + { + "epoch": 0.44178604465111626, + "grad_norm": 0.15876778960227966, + "learning_rate": 1.3688238800765512e-05, + "loss": 0.0305, + "step": 35342 + }, + { + "epoch": 0.4418110452761319, + "grad_norm": 0.5522041320800781, + "learning_rate": 1.3687427623032175e-05, + "loss": 0.0272, + "step": 35344 + }, + { + "epoch": 0.44183604590114756, + "grad_norm": 6.658947944641113, + "learning_rate": 1.368661641721591e-05, + "loss": 1.8738, + "step": 35346 + }, + { + "epoch": 0.44186104652616315, + "grad_norm": 2.819279432296753, + "learning_rate": 1.3685805183322896e-05, + "loss": 0.2856, + "step": 35348 + }, + { + "epoch": 0.4418860471511788, + "grad_norm": 6.074252128601074, + "learning_rate": 1.368499392135931e-05, + "loss": 0.9886, + "step": 35350 + }, + { + "epoch": 0.4419110477761944, + "grad_norm": 2.9986214637756348, + "learning_rate": 1.3684182631331333e-05, + "loss": 0.6796, + "step": 35352 + }, + { + "epoch": 0.44193604840121004, + "grad_norm": 2.1155645847320557, + "learning_rate": 1.3683371313245141e-05, + "loss": 0.7945, + "step": 35354 + }, + { + "epoch": 0.4419610490262257, + "grad_norm": 0.7573038935661316, + "learning_rate": 1.3682559967106913e-05, + "loss": 0.0143, + "step": 35356 + }, + { + "epoch": 0.4419860496512413, + "grad_norm": 8.34955883026123, + "learning_rate": 1.368174859292283e-05, + "loss": 0.6936, + "step": 35358 + }, + { + "epoch": 0.4420110502762569, + "grad_norm": 4.752922534942627, + "learning_rate": 1.3680937190699073e-05, + "loss": 1.4433, + "step": 35360 + }, + { + "epoch": 0.4420360509012725, + "grad_norm": 2.331779718399048, + "learning_rate": 1.3680125760441816e-05, + "loss": 0.969, + "step": 35362 + }, + { + "epoch": 0.44206105152628816, + "grad_norm": 0.0006074980483390391, + "learning_rate": 1.3679314302157243e-05, + "loss": 0.0413, + "step": 35364 + }, + { + "epoch": 0.4420860521513038, + "grad_norm": 3.4828028678894043, + "learning_rate": 1.3678502815851532e-05, + "loss": 1.1029, + "step": 35366 + }, + { + "epoch": 0.4421110527763194, + "grad_norm": 2.516740560531616, + "learning_rate": 1.3677691301530863e-05, + "loss": 0.1737, + "step": 35368 + }, + { + "epoch": 0.44213605340133505, + "grad_norm": 2.500722885131836, + "learning_rate": 1.3676879759201418e-05, + "loss": 0.5785, + "step": 35370 + }, + { + "epoch": 0.44216105402635064, + "grad_norm": 4.501828193664551, + "learning_rate": 1.3676068188869374e-05, + "loss": 1.1023, + "step": 35372 + }, + { + "epoch": 0.4421860546513663, + "grad_norm": 3.7026922702789307, + "learning_rate": 1.3675256590540917e-05, + "loss": 0.8278, + "step": 35374 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 2.9328484535217285, + "learning_rate": 1.3674444964222225e-05, + "loss": 0.2591, + "step": 35376 + }, + { + "epoch": 0.44223605590139753, + "grad_norm": 0.9512312412261963, + "learning_rate": 1.3673633309919483e-05, + "loss": 0.2516, + "step": 35378 + }, + { + "epoch": 0.4422610565264132, + "grad_norm": 2.1947860717773438, + "learning_rate": 1.3672821627638866e-05, + "loss": 0.1061, + "step": 35380 + }, + { + "epoch": 0.44228605715142877, + "grad_norm": 2.729367733001709, + "learning_rate": 1.3672009917386557e-05, + "loss": 0.065, + "step": 35382 + }, + { + "epoch": 0.4423110577764444, + "grad_norm": 2.805349588394165, + "learning_rate": 1.3671198179168742e-05, + "loss": 1.8857, + "step": 35384 + }, + { + "epoch": 0.44233605840146006, + "grad_norm": 9.638001441955566, + "learning_rate": 1.36703864129916e-05, + "loss": 1.0315, + "step": 35386 + }, + { + "epoch": 0.44236105902647566, + "grad_norm": 4.768123149871826, + "learning_rate": 1.3669574618861317e-05, + "loss": 1.681, + "step": 35388 + }, + { + "epoch": 0.4423860596514913, + "grad_norm": 5.057637691497803, + "learning_rate": 1.3668762796784073e-05, + "loss": 1.4845, + "step": 35390 + }, + { + "epoch": 0.4424110602765069, + "grad_norm": 2.584937810897827, + "learning_rate": 1.3667950946766047e-05, + "loss": 0.9971, + "step": 35392 + }, + { + "epoch": 0.44243606090152254, + "grad_norm": 3.5328681468963623, + "learning_rate": 1.3667139068813422e-05, + "loss": 1.7328, + "step": 35394 + }, + { + "epoch": 0.4424610615265382, + "grad_norm": 4.205072402954102, + "learning_rate": 1.3666327162932392e-05, + "loss": 1.9128, + "step": 35396 + }, + { + "epoch": 0.4424860621515538, + "grad_norm": 6.236845016479492, + "learning_rate": 1.366551522912913e-05, + "loss": 0.8161, + "step": 35398 + }, + { + "epoch": 0.44251106277656943, + "grad_norm": 2.5417795181274414, + "learning_rate": 1.366470326740982e-05, + "loss": 0.8903, + "step": 35400 + }, + { + "epoch": 0.442536063401585, + "grad_norm": 6.610139846801758, + "learning_rate": 1.366389127778065e-05, + "loss": 1.2168, + "step": 35402 + }, + { + "epoch": 0.44256106402660067, + "grad_norm": 5.613675594329834, + "learning_rate": 1.3663079260247802e-05, + "loss": 0.7881, + "step": 35404 + }, + { + "epoch": 0.4425860646516163, + "grad_norm": 3.9256820678710938, + "learning_rate": 1.366226721481746e-05, + "loss": 1.4465, + "step": 35406 + }, + { + "epoch": 0.4426110652766319, + "grad_norm": 6.995680809020996, + "learning_rate": 1.3661455141495807e-05, + "loss": 1.4436, + "step": 35408 + }, + { + "epoch": 0.44263606590164756, + "grad_norm": 2.627957820892334, + "learning_rate": 1.3660643040289034e-05, + "loss": 1.681, + "step": 35410 + }, + { + "epoch": 0.44266106652666315, + "grad_norm": 4.444146633148193, + "learning_rate": 1.3659830911203318e-05, + "loss": 1.6247, + "step": 35412 + }, + { + "epoch": 0.4426860671516788, + "grad_norm": 1.150293231010437, + "learning_rate": 1.3659018754244848e-05, + "loss": 1.1623, + "step": 35414 + }, + { + "epoch": 0.44271106777669444, + "grad_norm": 4.087453365325928, + "learning_rate": 1.3658206569419812e-05, + "loss": 1.339, + "step": 35416 + }, + { + "epoch": 0.44273606840171004, + "grad_norm": 3.712653875350952, + "learning_rate": 1.3657394356734386e-05, + "loss": 0.9298, + "step": 35418 + }, + { + "epoch": 0.4427610690267257, + "grad_norm": 0.8646188378334045, + "learning_rate": 1.3656582116194765e-05, + "loss": 0.0184, + "step": 35420 + }, + { + "epoch": 0.4427860696517413, + "grad_norm": 3.1674134731292725, + "learning_rate": 1.3655769847807133e-05, + "loss": 1.237, + "step": 35422 + }, + { + "epoch": 0.4428110702767569, + "grad_norm": 2.1380958557128906, + "learning_rate": 1.3654957551577672e-05, + "loss": 1.283, + "step": 35424 + }, + { + "epoch": 0.44283607090177257, + "grad_norm": 2.991725206375122, + "learning_rate": 1.3654145227512572e-05, + "loss": 0.9974, + "step": 35426 + }, + { + "epoch": 0.44286107152678816, + "grad_norm": 2.540071487426758, + "learning_rate": 1.365333287561802e-05, + "loss": 0.3509, + "step": 35428 + }, + { + "epoch": 0.4428860721518038, + "grad_norm": 5.136253833770752, + "learning_rate": 1.3652520495900199e-05, + "loss": 0.6698, + "step": 35430 + }, + { + "epoch": 0.4429110727768194, + "grad_norm": 2.046032428741455, + "learning_rate": 1.3651708088365298e-05, + "loss": 1.0097, + "step": 35432 + }, + { + "epoch": 0.44293607340183505, + "grad_norm": 2.5737075805664062, + "learning_rate": 1.365089565301951e-05, + "loss": 0.962, + "step": 35434 + }, + { + "epoch": 0.4429610740268507, + "grad_norm": 3.2626187801361084, + "learning_rate": 1.3650083189869012e-05, + "loss": 1.1217, + "step": 35436 + }, + { + "epoch": 0.4429860746518663, + "grad_norm": 1.1185660362243652, + "learning_rate": 1.3649270698919998e-05, + "loss": 0.2778, + "step": 35438 + }, + { + "epoch": 0.44301107527688194, + "grad_norm": 11.543243408203125, + "learning_rate": 1.3648458180178655e-05, + "loss": 2.0975, + "step": 35440 + }, + { + "epoch": 0.44303607590189753, + "grad_norm": 5.614781856536865, + "learning_rate": 1.3647645633651171e-05, + "loss": 1.7949, + "step": 35442 + }, + { + "epoch": 0.4430610765269132, + "grad_norm": 2.777163028717041, + "learning_rate": 1.3646833059343731e-05, + "loss": 0.9286, + "step": 35444 + }, + { + "epoch": 0.4430860771519288, + "grad_norm": 1.207945704460144, + "learning_rate": 1.3646020457262528e-05, + "loss": 0.0175, + "step": 35446 + }, + { + "epoch": 0.4431110777769444, + "grad_norm": 1.8476601839065552, + "learning_rate": 1.3645207827413748e-05, + "loss": 0.8666, + "step": 35448 + }, + { + "epoch": 0.44313607840196007, + "grad_norm": 1.579514503479004, + "learning_rate": 1.3644395169803581e-05, + "loss": 0.0741, + "step": 35450 + }, + { + "epoch": 0.44316107902697566, + "grad_norm": 3.9036993980407715, + "learning_rate": 1.3643582484438217e-05, + "loss": 1.0972, + "step": 35452 + }, + { + "epoch": 0.4431860796519913, + "grad_norm": 4.4850263595581055, + "learning_rate": 1.3642769771323842e-05, + "loss": 1.5854, + "step": 35454 + }, + { + "epoch": 0.44321108027700695, + "grad_norm": 1.5442464351654053, + "learning_rate": 1.3641957030466648e-05, + "loss": 0.1066, + "step": 35456 + }, + { + "epoch": 0.44323608090202254, + "grad_norm": 3.953296422958374, + "learning_rate": 1.3641144261872825e-05, + "loss": 1.2484, + "step": 35458 + }, + { + "epoch": 0.4432610815270382, + "grad_norm": 2.863090753555298, + "learning_rate": 1.3640331465548561e-05, + "loss": 0.8458, + "step": 35460 + }, + { + "epoch": 0.4432860821520538, + "grad_norm": 2.7519283294677734, + "learning_rate": 1.3639518641500052e-05, + "loss": 0.2507, + "step": 35462 + }, + { + "epoch": 0.44331108277706943, + "grad_norm": 4.748169422149658, + "learning_rate": 1.363870578973348e-05, + "loss": 2.3835, + "step": 35464 + }, + { + "epoch": 0.4433360834020851, + "grad_norm": 2.1201372146606445, + "learning_rate": 1.3637892910255038e-05, + "loss": 0.0756, + "step": 35466 + }, + { + "epoch": 0.44336108402710067, + "grad_norm": 0.0011944640427827835, + "learning_rate": 1.363708000307092e-05, + "loss": 0.7916, + "step": 35468 + }, + { + "epoch": 0.4433860846521163, + "grad_norm": 0.0007480988861061633, + "learning_rate": 1.3636267068187313e-05, + "loss": 0.0004, + "step": 35470 + }, + { + "epoch": 0.4434110852771319, + "grad_norm": 0.0011956623056903481, + "learning_rate": 1.363545410561041e-05, + "loss": 0.267, + "step": 35472 + }, + { + "epoch": 0.44343608590214756, + "grad_norm": 3.2028000354766846, + "learning_rate": 1.3634641115346408e-05, + "loss": 1.3483, + "step": 35474 + }, + { + "epoch": 0.4434610865271632, + "grad_norm": 1.7687742710113525, + "learning_rate": 1.3633828097401489e-05, + "loss": 0.3285, + "step": 35476 + }, + { + "epoch": 0.4434860871521788, + "grad_norm": 5.4843974113464355, + "learning_rate": 1.3633015051781849e-05, + "loss": 2.4231, + "step": 35478 + }, + { + "epoch": 0.44351108777719445, + "grad_norm": 2.190356731414795, + "learning_rate": 1.363220197849368e-05, + "loss": 0.1542, + "step": 35480 + }, + { + "epoch": 0.44353608840221004, + "grad_norm": 0.6013939380645752, + "learning_rate": 1.3631388877543174e-05, + "loss": 0.6526, + "step": 35482 + }, + { + "epoch": 0.4435610890272257, + "grad_norm": 0.24001629650592804, + "learning_rate": 1.3630575748936525e-05, + "loss": 0.8643, + "step": 35484 + }, + { + "epoch": 0.44358608965224133, + "grad_norm": 0.0027281707152724266, + "learning_rate": 1.3629762592679925e-05, + "loss": 0.5201, + "step": 35486 + }, + { + "epoch": 0.4436110902772569, + "grad_norm": 2.4606664180755615, + "learning_rate": 1.3628949408779567e-05, + "loss": 1.156, + "step": 35488 + }, + { + "epoch": 0.4436360909022726, + "grad_norm": 1.7963266372680664, + "learning_rate": 1.3628136197241641e-05, + "loss": 0.2261, + "step": 35490 + }, + { + "epoch": 0.44366109152728817, + "grad_norm": 0.001942806993611157, + "learning_rate": 1.3627322958072344e-05, + "loss": 0.7692, + "step": 35492 + }, + { + "epoch": 0.4436860921523038, + "grad_norm": 2.9596669673919678, + "learning_rate": 1.362650969127787e-05, + "loss": 0.9543, + "step": 35494 + }, + { + "epoch": 0.44371109277731946, + "grad_norm": 2.8748340606689453, + "learning_rate": 1.3625696396864409e-05, + "loss": 1.7148, + "step": 35496 + }, + { + "epoch": 0.44373609340233505, + "grad_norm": 2.80328106880188, + "learning_rate": 1.3624883074838155e-05, + "loss": 0.6712, + "step": 35498 + }, + { + "epoch": 0.4437610940273507, + "grad_norm": 3.5938150882720947, + "learning_rate": 1.3624069725205307e-05, + "loss": 0.7555, + "step": 35500 + }, + { + "epoch": 0.4437860946523663, + "grad_norm": 4.477220058441162, + "learning_rate": 1.3623256347972058e-05, + "loss": 0.8836, + "step": 35502 + }, + { + "epoch": 0.44381109527738194, + "grad_norm": 3.414332151412964, + "learning_rate": 1.3622442943144599e-05, + "loss": 0.9816, + "step": 35504 + }, + { + "epoch": 0.4438360959023976, + "grad_norm": 1.505658507347107, + "learning_rate": 1.3621629510729127e-05, + "loss": 0.6586, + "step": 35506 + }, + { + "epoch": 0.4438610965274132, + "grad_norm": 0.10741057246923447, + "learning_rate": 1.3620816050731836e-05, + "loss": 0.2958, + "step": 35508 + }, + { + "epoch": 0.4438860971524288, + "grad_norm": 0.7825071215629578, + "learning_rate": 1.3620002563158922e-05, + "loss": 0.0194, + "step": 35510 + }, + { + "epoch": 0.4439110977774444, + "grad_norm": 3.874859094619751, + "learning_rate": 1.3619189048016578e-05, + "loss": 1.7062, + "step": 35512 + }, + { + "epoch": 0.44393609840246007, + "grad_norm": 2.105314254760742, + "learning_rate": 1.3618375505311008e-05, + "loss": 0.3342, + "step": 35514 + }, + { + "epoch": 0.4439610990274757, + "grad_norm": 3.0900254249572754, + "learning_rate": 1.36175619350484e-05, + "loss": 0.5866, + "step": 35516 + }, + { + "epoch": 0.4439860996524913, + "grad_norm": 2.6458866596221924, + "learning_rate": 1.361674833723495e-05, + "loss": 0.6325, + "step": 35518 + }, + { + "epoch": 0.44401110027750695, + "grad_norm": 3.1724085807800293, + "learning_rate": 1.3615934711876858e-05, + "loss": 1.195, + "step": 35520 + }, + { + "epoch": 0.44403610090252255, + "grad_norm": 2.8509514331817627, + "learning_rate": 1.3615121058980316e-05, + "loss": 0.5789, + "step": 35522 + }, + { + "epoch": 0.4440611015275382, + "grad_norm": 6.877335071563721, + "learning_rate": 1.3614307378551523e-05, + "loss": 0.6862, + "step": 35524 + }, + { + "epoch": 0.44408610215255384, + "grad_norm": 0.032888613641262054, + "learning_rate": 1.3613493670596677e-05, + "loss": 0.6427, + "step": 35526 + }, + { + "epoch": 0.44411110277756943, + "grad_norm": 2.4451866149902344, + "learning_rate": 1.3612679935121977e-05, + "loss": 0.9913, + "step": 35528 + }, + { + "epoch": 0.4441361034025851, + "grad_norm": 2.11051344871521, + "learning_rate": 1.3611866172133617e-05, + "loss": 1.4083, + "step": 35530 + }, + { + "epoch": 0.4441611040276007, + "grad_norm": 2.5040431022644043, + "learning_rate": 1.3611052381637792e-05, + "loss": 1.232, + "step": 35532 + }, + { + "epoch": 0.4441861046526163, + "grad_norm": 0.002949861343950033, + "learning_rate": 1.3610238563640703e-05, + "loss": 0.4264, + "step": 35534 + }, + { + "epoch": 0.44421110527763197, + "grad_norm": 4.780704498291016, + "learning_rate": 1.3609424718148549e-05, + "loss": 0.7835, + "step": 35536 + }, + { + "epoch": 0.44423610590264756, + "grad_norm": 3.8495471477508545, + "learning_rate": 1.3608610845167528e-05, + "loss": 1.6413, + "step": 35538 + }, + { + "epoch": 0.4442611065276632, + "grad_norm": 0.7222747802734375, + "learning_rate": 1.3607796944703835e-05, + "loss": 0.4228, + "step": 35540 + }, + { + "epoch": 0.4442861071526788, + "grad_norm": 6.421191692352295, + "learning_rate": 1.360698301676367e-05, + "loss": 2.1324, + "step": 35542 + }, + { + "epoch": 0.44431110777769445, + "grad_norm": 3.5847647190093994, + "learning_rate": 1.3606169061353236e-05, + "loss": 1.0851, + "step": 35544 + }, + { + "epoch": 0.4443361084027101, + "grad_norm": 2.8123903274536133, + "learning_rate": 1.3605355078478723e-05, + "loss": 0.7946, + "step": 35546 + }, + { + "epoch": 0.4443611090277257, + "grad_norm": 4.2269439697265625, + "learning_rate": 1.360454106814634e-05, + "loss": 1.4765, + "step": 35548 + }, + { + "epoch": 0.44438610965274133, + "grad_norm": 3.035743236541748, + "learning_rate": 1.360372703036228e-05, + "loss": 0.8415, + "step": 35550 + }, + { + "epoch": 0.4444111102777569, + "grad_norm": 2.084848403930664, + "learning_rate": 1.3602912965132746e-05, + "loss": 1.2628, + "step": 35552 + }, + { + "epoch": 0.4444361109027726, + "grad_norm": 4.289822101593018, + "learning_rate": 1.3602098872463935e-05, + "loss": 1.1244, + "step": 35554 + }, + { + "epoch": 0.4444611115277882, + "grad_norm": 2.2585670948028564, + "learning_rate": 1.360128475236205e-05, + "loss": 0.8164, + "step": 35556 + }, + { + "epoch": 0.4444861121528038, + "grad_norm": 4.877645015716553, + "learning_rate": 1.3600470604833284e-05, + "loss": 0.7549, + "step": 35558 + }, + { + "epoch": 0.44451111277781946, + "grad_norm": 3.9079952239990234, + "learning_rate": 1.3599656429883852e-05, + "loss": 0.8154, + "step": 35560 + }, + { + "epoch": 0.44453611340283505, + "grad_norm": 5.214251518249512, + "learning_rate": 1.359884222751994e-05, + "loss": 1.7532, + "step": 35562 + }, + { + "epoch": 0.4445611140278507, + "grad_norm": 2.9633591175079346, + "learning_rate": 1.3598027997747755e-05, + "loss": 0.4397, + "step": 35564 + }, + { + "epoch": 0.44458611465286635, + "grad_norm": 3.7310030460357666, + "learning_rate": 1.3597213740573501e-05, + "loss": 1.2371, + "step": 35566 + }, + { + "epoch": 0.44461111527788194, + "grad_norm": 0.0037448222283273935, + "learning_rate": 1.3596399456003373e-05, + "loss": 0.0351, + "step": 35568 + }, + { + "epoch": 0.4446361159028976, + "grad_norm": 6.654795169830322, + "learning_rate": 1.3595585144043575e-05, + "loss": 0.2019, + "step": 35570 + }, + { + "epoch": 0.4446611165279132, + "grad_norm": 0.0020134993828833103, + "learning_rate": 1.3594770804700308e-05, + "loss": 0.4168, + "step": 35572 + }, + { + "epoch": 0.44468611715292883, + "grad_norm": 3.656313180923462, + "learning_rate": 1.3593956437979777e-05, + "loss": 1.4959, + "step": 35574 + }, + { + "epoch": 0.4447111177779445, + "grad_norm": 4.0176100730896, + "learning_rate": 1.3593142043888179e-05, + "loss": 2.1883, + "step": 35576 + }, + { + "epoch": 0.44473611840296007, + "grad_norm": 0.4731861352920532, + "learning_rate": 1.3592327622431723e-05, + "loss": 1.4969, + "step": 35578 + }, + { + "epoch": 0.4447611190279757, + "grad_norm": 2.7150869369506836, + "learning_rate": 1.3591513173616608e-05, + "loss": 0.9346, + "step": 35580 + }, + { + "epoch": 0.4447861196529913, + "grad_norm": 0.8372124433517456, + "learning_rate": 1.3590698697449034e-05, + "loss": 1.0408, + "step": 35582 + }, + { + "epoch": 0.44481112027800696, + "grad_norm": 3.2768402099609375, + "learning_rate": 1.3589884193935206e-05, + "loss": 1.3106, + "step": 35584 + }, + { + "epoch": 0.4448361209030226, + "grad_norm": 2.9335885047912598, + "learning_rate": 1.3589069663081329e-05, + "loss": 0.9072, + "step": 35586 + }, + { + "epoch": 0.4448611215280382, + "grad_norm": 2.584022283554077, + "learning_rate": 1.3588255104893606e-05, + "loss": 0.9076, + "step": 35588 + }, + { + "epoch": 0.44488612215305384, + "grad_norm": 0.6200221180915833, + "learning_rate": 1.3587440519378239e-05, + "loss": 0.8485, + "step": 35590 + }, + { + "epoch": 0.44491112277806943, + "grad_norm": 5.852635860443115, + "learning_rate": 1.3586625906541433e-05, + "loss": 1.9203, + "step": 35592 + }, + { + "epoch": 0.4449361234030851, + "grad_norm": 3.7959580421447754, + "learning_rate": 1.358581126638939e-05, + "loss": 0.1912, + "step": 35594 + }, + { + "epoch": 0.44496112402810073, + "grad_norm": 2.2373387813568115, + "learning_rate": 1.3584996598928313e-05, + "loss": 1.1919, + "step": 35596 + }, + { + "epoch": 0.4449861246531163, + "grad_norm": 0.10003320127725601, + "learning_rate": 1.3584181904164412e-05, + "loss": 0.2184, + "step": 35598 + }, + { + "epoch": 0.44501112527813197, + "grad_norm": 3.025794267654419, + "learning_rate": 1.358336718210389e-05, + "loss": 0.7021, + "step": 35600 + }, + { + "epoch": 0.44503612590314756, + "grad_norm": 5.270620346069336, + "learning_rate": 1.358255243275295e-05, + "loss": 1.0566, + "step": 35602 + }, + { + "epoch": 0.4450611265281632, + "grad_norm": 0.6234560012817383, + "learning_rate": 1.3581737656117793e-05, + "loss": 0.6208, + "step": 35604 + }, + { + "epoch": 0.44508612715317886, + "grad_norm": 6.507279872894287, + "learning_rate": 1.3580922852204635e-05, + "loss": 1.2838, + "step": 35606 + }, + { + "epoch": 0.44511112777819445, + "grad_norm": 0.0013977685011923313, + "learning_rate": 1.3580108021019667e-05, + "loss": 0.9808, + "step": 35608 + }, + { + "epoch": 0.4451361284032101, + "grad_norm": 4.059821128845215, + "learning_rate": 1.3579293162569105e-05, + "loss": 1.1718, + "step": 35610 + }, + { + "epoch": 0.4451611290282257, + "grad_norm": 3.651588201522827, + "learning_rate": 1.3578478276859157e-05, + "loss": 0.545, + "step": 35612 + }, + { + "epoch": 0.44518612965324134, + "grad_norm": 2.352771759033203, + "learning_rate": 1.3577663363896021e-05, + "loss": 0.8813, + "step": 35614 + }, + { + "epoch": 0.445211130278257, + "grad_norm": 4.007618427276611, + "learning_rate": 1.3576848423685907e-05, + "loss": 1.4183, + "step": 35616 + }, + { + "epoch": 0.4452361309032726, + "grad_norm": 0.009082823991775513, + "learning_rate": 1.3576033456235024e-05, + "loss": 0.0002, + "step": 35618 + }, + { + "epoch": 0.4452611315282882, + "grad_norm": 1.5357356071472168, + "learning_rate": 1.357521846154957e-05, + "loss": 0.0459, + "step": 35620 + }, + { + "epoch": 0.4452861321533038, + "grad_norm": 2.1940157413482666, + "learning_rate": 1.3574403439635761e-05, + "loss": 0.6384, + "step": 35622 + }, + { + "epoch": 0.44531113277831946, + "grad_norm": 2.137319803237915, + "learning_rate": 1.35735883904998e-05, + "loss": 0.1751, + "step": 35624 + }, + { + "epoch": 0.4453361334033351, + "grad_norm": 4.92578125, + "learning_rate": 1.35727733141479e-05, + "loss": 2.3971, + "step": 35626 + }, + { + "epoch": 0.4453611340283507, + "grad_norm": 0.0017216973938047886, + "learning_rate": 1.357195821058626e-05, + "loss": 0.6459, + "step": 35628 + }, + { + "epoch": 0.44538613465336635, + "grad_norm": 5.488449573516846, + "learning_rate": 1.357114307982109e-05, + "loss": 0.4744, + "step": 35630 + }, + { + "epoch": 0.44541113527838194, + "grad_norm": 1.9856317043304443, + "learning_rate": 1.35703279218586e-05, + "loss": 0.7889, + "step": 35632 + }, + { + "epoch": 0.4454361359033976, + "grad_norm": 1.895578145980835, + "learning_rate": 1.3569512736704997e-05, + "loss": 0.6854, + "step": 35634 + }, + { + "epoch": 0.44546113652841324, + "grad_norm": 6.072107315063477, + "learning_rate": 1.3568697524366493e-05, + "loss": 1.0817, + "step": 35636 + }, + { + "epoch": 0.44548613715342883, + "grad_norm": 4.166665554046631, + "learning_rate": 1.3567882284849293e-05, + "loss": 0.7849, + "step": 35638 + }, + { + "epoch": 0.4455111377784445, + "grad_norm": 1.6343313455581665, + "learning_rate": 1.3567067018159603e-05, + "loss": 0.8537, + "step": 35640 + }, + { + "epoch": 0.44553613840346007, + "grad_norm": 1.0883350372314453, + "learning_rate": 1.3566251724303637e-05, + "loss": 0.4268, + "step": 35642 + }, + { + "epoch": 0.4455611390284757, + "grad_norm": 3.1127917766571045, + "learning_rate": 1.3565436403287601e-05, + "loss": 0.7683, + "step": 35644 + }, + { + "epoch": 0.44558613965349136, + "grad_norm": 0.0010680643608793616, + "learning_rate": 1.3564621055117704e-05, + "loss": 0.604, + "step": 35646 + }, + { + "epoch": 0.44561114027850696, + "grad_norm": 4.019306182861328, + "learning_rate": 1.3563805679800159e-05, + "loss": 1.0371, + "step": 35648 + }, + { + "epoch": 0.4456361409035226, + "grad_norm": 2.6250953674316406, + "learning_rate": 1.3562990277341172e-05, + "loss": 1.2155, + "step": 35650 + }, + { + "epoch": 0.4456611415285382, + "grad_norm": 2.8553736209869385, + "learning_rate": 1.3562174847746961e-05, + "loss": 0.5776, + "step": 35652 + }, + { + "epoch": 0.44568614215355384, + "grad_norm": 3.8426525592803955, + "learning_rate": 1.3561359391023725e-05, + "loss": 0.8557, + "step": 35654 + }, + { + "epoch": 0.4457111427785695, + "grad_norm": 3.918736696243286, + "learning_rate": 1.3560543907177681e-05, + "loss": 1.2928, + "step": 35656 + }, + { + "epoch": 0.4457361434035851, + "grad_norm": 4.465418815612793, + "learning_rate": 1.3559728396215038e-05, + "loss": 2.6166, + "step": 35658 + }, + { + "epoch": 0.44576114402860073, + "grad_norm": 3.153184175491333, + "learning_rate": 1.3558912858142006e-05, + "loss": 1.2403, + "step": 35660 + }, + { + "epoch": 0.4457861446536163, + "grad_norm": 0.9733941555023193, + "learning_rate": 1.3558097292964797e-05, + "loss": 0.0915, + "step": 35662 + }, + { + "epoch": 0.44581114527863197, + "grad_norm": 2.7482826709747314, + "learning_rate": 1.3557281700689624e-05, + "loss": 0.6095, + "step": 35664 + }, + { + "epoch": 0.4458361459036476, + "grad_norm": 3.6351161003112793, + "learning_rate": 1.3556466081322695e-05, + "loss": 1.5639, + "step": 35666 + }, + { + "epoch": 0.4458611465286632, + "grad_norm": 2.2016758918762207, + "learning_rate": 1.3555650434870223e-05, + "loss": 0.4025, + "step": 35668 + }, + { + "epoch": 0.44588614715367886, + "grad_norm": 3.4692156314849854, + "learning_rate": 1.355483476133842e-05, + "loss": 1.2631, + "step": 35670 + }, + { + "epoch": 0.44591114777869445, + "grad_norm": 2.3896374702453613, + "learning_rate": 1.3554019060733499e-05, + "loss": 1.4584, + "step": 35672 + }, + { + "epoch": 0.4459361484037101, + "grad_norm": 3.170964241027832, + "learning_rate": 1.355320333306167e-05, + "loss": 0.62, + "step": 35674 + }, + { + "epoch": 0.44596114902872575, + "grad_norm": 2.3159024715423584, + "learning_rate": 1.355238757832915e-05, + "loss": 0.0798, + "step": 35676 + }, + { + "epoch": 0.44598614965374134, + "grad_norm": 4.049436569213867, + "learning_rate": 1.3551571796542147e-05, + "loss": 2.0425, + "step": 35678 + }, + { + "epoch": 0.446011150278757, + "grad_norm": 0.0008237407892011106, + "learning_rate": 1.3550755987706874e-05, + "loss": 0.0229, + "step": 35680 + }, + { + "epoch": 0.4460361509037726, + "grad_norm": 4.3107686042785645, + "learning_rate": 1.3549940151829546e-05, + "loss": 0.7796, + "step": 35682 + }, + { + "epoch": 0.4460611515287882, + "grad_norm": 0.000568824412766844, + "learning_rate": 1.3549124288916378e-05, + "loss": 0.098, + "step": 35684 + }, + { + "epoch": 0.44608615215380387, + "grad_norm": 3.377807140350342, + "learning_rate": 1.3548308398973578e-05, + "loss": 0.783, + "step": 35686 + }, + { + "epoch": 0.44611115277881946, + "grad_norm": 3.8484177589416504, + "learning_rate": 1.3547492482007365e-05, + "loss": 0.7555, + "step": 35688 + }, + { + "epoch": 0.4461361534038351, + "grad_norm": 5.0751776695251465, + "learning_rate": 1.3546676538023954e-05, + "loss": 1.4137, + "step": 35690 + }, + { + "epoch": 0.4461611540288507, + "grad_norm": 4.58221960067749, + "learning_rate": 1.3545860567029551e-05, + "loss": 0.5886, + "step": 35692 + }, + { + "epoch": 0.44618615465386635, + "grad_norm": 1.7879770994186401, + "learning_rate": 1.3545044569030376e-05, + "loss": 0.8105, + "step": 35694 + }, + { + "epoch": 0.446211155278882, + "grad_norm": 3.4172332286834717, + "learning_rate": 1.3544228544032645e-05, + "loss": 0.8207, + "step": 35696 + }, + { + "epoch": 0.4462361559038976, + "grad_norm": 2.97810697555542, + "learning_rate": 1.354341249204257e-05, + "loss": 0.6889, + "step": 35698 + }, + { + "epoch": 0.44626115652891324, + "grad_norm": 3.3902130126953125, + "learning_rate": 1.3542596413066364e-05, + "loss": 1.4636, + "step": 35700 + }, + { + "epoch": 0.44628615715392883, + "grad_norm": 0.0014007677091285586, + "learning_rate": 1.3541780307110247e-05, + "loss": 0.2307, + "step": 35702 + }, + { + "epoch": 0.4463111577789445, + "grad_norm": 2.6561930179595947, + "learning_rate": 1.3540964174180433e-05, + "loss": 0.9602, + "step": 35704 + }, + { + "epoch": 0.4463361584039601, + "grad_norm": 1.4992353916168213, + "learning_rate": 1.3540148014283134e-05, + "loss": 0.9301, + "step": 35706 + }, + { + "epoch": 0.4463611590289757, + "grad_norm": 1.9871618747711182, + "learning_rate": 1.353933182742457e-05, + "loss": 0.6988, + "step": 35708 + }, + { + "epoch": 0.44638615965399137, + "grad_norm": 5.251945972442627, + "learning_rate": 1.3538515613610957e-05, + "loss": 0.984, + "step": 35710 + }, + { + "epoch": 0.44641116027900696, + "grad_norm": 8.631775856018066, + "learning_rate": 1.3537699372848506e-05, + "loss": 2.4235, + "step": 35712 + }, + { + "epoch": 0.4464361609040226, + "grad_norm": 1.4999566078186035, + "learning_rate": 1.3536883105143437e-05, + "loss": 0.1002, + "step": 35714 + }, + { + "epoch": 0.44646116152903825, + "grad_norm": 6.375189304351807, + "learning_rate": 1.353606681050197e-05, + "loss": 0.3595, + "step": 35716 + }, + { + "epoch": 0.44648616215405384, + "grad_norm": 3.873727798461914, + "learning_rate": 1.3535250488930316e-05, + "loss": 1.848, + "step": 35718 + }, + { + "epoch": 0.4465111627790695, + "grad_norm": 1.1192643642425537, + "learning_rate": 1.3534434140434693e-05, + "loss": 0.4962, + "step": 35720 + }, + { + "epoch": 0.4465361634040851, + "grad_norm": 0.3690653145313263, + "learning_rate": 1.353361776502132e-05, + "loss": 0.7412, + "step": 35722 + }, + { + "epoch": 0.44656116402910073, + "grad_norm": 2.674016237258911, + "learning_rate": 1.3532801362696414e-05, + "loss": 0.43, + "step": 35724 + }, + { + "epoch": 0.4465861646541164, + "grad_norm": 3.0732226371765137, + "learning_rate": 1.3531984933466193e-05, + "loss": 0.5867, + "step": 35726 + }, + { + "epoch": 0.44661116527913197, + "grad_norm": 4.417179584503174, + "learning_rate": 1.3531168477336874e-05, + "loss": 0.9206, + "step": 35728 + }, + { + "epoch": 0.4466361659041476, + "grad_norm": 2.1703898906707764, + "learning_rate": 1.3530351994314676e-05, + "loss": 0.5223, + "step": 35730 + }, + { + "epoch": 0.4466611665291632, + "grad_norm": 0.0009647269616834819, + "learning_rate": 1.3529535484405819e-05, + "loss": 0.0, + "step": 35732 + }, + { + "epoch": 0.44668616715417886, + "grad_norm": 0.0011021244572475553, + "learning_rate": 1.3528718947616513e-05, + "loss": 0.4301, + "step": 35734 + }, + { + "epoch": 0.4467111677791945, + "grad_norm": 4.824124813079834, + "learning_rate": 1.3527902383952984e-05, + "loss": 1.7908, + "step": 35736 + }, + { + "epoch": 0.4467361684042101, + "grad_norm": 3.344773530960083, + "learning_rate": 1.352708579342145e-05, + "loss": 1.1383, + "step": 35738 + }, + { + "epoch": 0.44676116902922575, + "grad_norm": 2.2353274822235107, + "learning_rate": 1.352626917602813e-05, + "loss": 1.1334, + "step": 35740 + }, + { + "epoch": 0.44678616965424134, + "grad_norm": 1.8890732526779175, + "learning_rate": 1.3525452531779244e-05, + "loss": 0.3039, + "step": 35742 + }, + { + "epoch": 0.446811170279257, + "grad_norm": 3.857964038848877, + "learning_rate": 1.352463586068101e-05, + "loss": 0.831, + "step": 35744 + }, + { + "epoch": 0.44683617090427263, + "grad_norm": 4.98272705078125, + "learning_rate": 1.3523819162739643e-05, + "loss": 1.4129, + "step": 35746 + }, + { + "epoch": 0.4468611715292882, + "grad_norm": 2.325498342514038, + "learning_rate": 1.352300243796137e-05, + "loss": 1.4127, + "step": 35748 + }, + { + "epoch": 0.4468861721543039, + "grad_norm": 6.248244762420654, + "learning_rate": 1.3522185686352411e-05, + "loss": 0.6841, + "step": 35750 + }, + { + "epoch": 0.44691117277931947, + "grad_norm": 2.7043938636779785, + "learning_rate": 1.3521368907918983e-05, + "loss": 0.8715, + "step": 35752 + }, + { + "epoch": 0.4469361734043351, + "grad_norm": 6.683599472045898, + "learning_rate": 1.3520552102667306e-05, + "loss": 0.1881, + "step": 35754 + }, + { + "epoch": 0.44696117402935076, + "grad_norm": 5.3548583984375, + "learning_rate": 1.3519735270603604e-05, + "loss": 1.6503, + "step": 35756 + }, + { + "epoch": 0.44698617465436635, + "grad_norm": 5.433345794677734, + "learning_rate": 1.3518918411734094e-05, + "loss": 1.726, + "step": 35758 + }, + { + "epoch": 0.447011175279382, + "grad_norm": 4.2754364013671875, + "learning_rate": 1.3518101526064999e-05, + "loss": 1.9215, + "step": 35760 + }, + { + "epoch": 0.4470361759043976, + "grad_norm": 2.1073265075683594, + "learning_rate": 1.3517284613602542e-05, + "loss": 0.9069, + "step": 35762 + }, + { + "epoch": 0.44706117652941324, + "grad_norm": 1.7526735067367554, + "learning_rate": 1.351646767435294e-05, + "loss": 1.0663, + "step": 35764 + }, + { + "epoch": 0.4470861771544289, + "grad_norm": 4.346839427947998, + "learning_rate": 1.3515650708322418e-05, + "loss": 2.023, + "step": 35766 + }, + { + "epoch": 0.4471111777794445, + "grad_norm": 3.6581711769104004, + "learning_rate": 1.3514833715517201e-05, + "loss": 1.066, + "step": 35768 + }, + { + "epoch": 0.4471361784044601, + "grad_norm": 5.47685432434082, + "learning_rate": 1.3514016695943506e-05, + "loss": 1.0759, + "step": 35770 + }, + { + "epoch": 0.4471611790294757, + "grad_norm": 0.3379668891429901, + "learning_rate": 1.3513199649607555e-05, + "loss": 0.7294, + "step": 35772 + }, + { + "epoch": 0.44718617965449137, + "grad_norm": 2.6840672492980957, + "learning_rate": 1.3512382576515571e-05, + "loss": 1.3801, + "step": 35774 + }, + { + "epoch": 0.447211180279507, + "grad_norm": 0.0010978335049003363, + "learning_rate": 1.351156547667378e-05, + "loss": 0.3714, + "step": 35776 + }, + { + "epoch": 0.4472361809045226, + "grad_norm": 2.549156665802002, + "learning_rate": 1.3510748350088403e-05, + "loss": 1.2678, + "step": 35778 + }, + { + "epoch": 0.44726118152953825, + "grad_norm": 4.338800430297852, + "learning_rate": 1.3509931196765665e-05, + "loss": 0.4192, + "step": 35780 + }, + { + "epoch": 0.44728618215455385, + "grad_norm": 2.8121016025543213, + "learning_rate": 1.3509114016711786e-05, + "loss": 1.6064, + "step": 35782 + }, + { + "epoch": 0.4473111827795695, + "grad_norm": 0.4469984769821167, + "learning_rate": 1.3508296809932989e-05, + "loss": 0.0799, + "step": 35784 + }, + { + "epoch": 0.44733618340458514, + "grad_norm": 2.445657730102539, + "learning_rate": 1.3507479576435499e-05, + "loss": 1.0323, + "step": 35786 + }, + { + "epoch": 0.44736118402960073, + "grad_norm": 4.0733232498168945, + "learning_rate": 1.3506662316225544e-05, + "loss": 1.8254, + "step": 35788 + }, + { + "epoch": 0.4473861846546164, + "grad_norm": 4.424252986907959, + "learning_rate": 1.3505845029309344e-05, + "loss": 0.486, + "step": 35790 + }, + { + "epoch": 0.447411185279632, + "grad_norm": 2.6400818824768066, + "learning_rate": 1.3505027715693122e-05, + "loss": 1.5177, + "step": 35792 + }, + { + "epoch": 0.4474361859046476, + "grad_norm": 2.165755033493042, + "learning_rate": 1.3504210375383107e-05, + "loss": 0.3268, + "step": 35794 + }, + { + "epoch": 0.44746118652966327, + "grad_norm": 5.041914463043213, + "learning_rate": 1.3503393008385519e-05, + "loss": 0.6789, + "step": 35796 + }, + { + "epoch": 0.44748618715467886, + "grad_norm": 2.780505418777466, + "learning_rate": 1.3502575614706584e-05, + "loss": 0.7248, + "step": 35798 + }, + { + "epoch": 0.4475111877796945, + "grad_norm": 5.968677997589111, + "learning_rate": 1.350175819435253e-05, + "loss": 0.4715, + "step": 35800 + }, + { + "epoch": 0.4475361884047101, + "grad_norm": 1.5841792821884155, + "learning_rate": 1.3500940747329583e-05, + "loss": 0.317, + "step": 35802 + }, + { + "epoch": 0.44756118902972575, + "grad_norm": 8.971946716308594, + "learning_rate": 1.3500123273643966e-05, + "loss": 0.7855, + "step": 35804 + }, + { + "epoch": 0.4475861896547414, + "grad_norm": 0.00121134368237108, + "learning_rate": 1.3499305773301902e-05, + "loss": 0.6447, + "step": 35806 + }, + { + "epoch": 0.447611190279757, + "grad_norm": 2.894648790359497, + "learning_rate": 1.3498488246309624e-05, + "loss": 0.917, + "step": 35808 + }, + { + "epoch": 0.44763619090477264, + "grad_norm": 4.667994976043701, + "learning_rate": 1.349767069267335e-05, + "loss": 1.1772, + "step": 35810 + }, + { + "epoch": 0.4476611915297882, + "grad_norm": 4.556054592132568, + "learning_rate": 1.3496853112399313e-05, + "loss": 1.1803, + "step": 35812 + }, + { + "epoch": 0.4476861921548039, + "grad_norm": 3.282982587814331, + "learning_rate": 1.3496035505493738e-05, + "loss": 0.8726, + "step": 35814 + }, + { + "epoch": 0.4477111927798195, + "grad_norm": 3.746016263961792, + "learning_rate": 1.3495217871962849e-05, + "loss": 0.6227, + "step": 35816 + }, + { + "epoch": 0.4477361934048351, + "grad_norm": 4.35187292098999, + "learning_rate": 1.3494400211812876e-05, + "loss": 0.976, + "step": 35818 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 1.4937622547149658, + "learning_rate": 1.3493582525050047e-05, + "loss": 1.0731, + "step": 35820 + }, + { + "epoch": 0.44778619465486635, + "grad_norm": 3.2862260341644287, + "learning_rate": 1.3492764811680583e-05, + "loss": 0.8784, + "step": 35822 + }, + { + "epoch": 0.447811195279882, + "grad_norm": 3.686823606491089, + "learning_rate": 1.3491947071710718e-05, + "loss": 0.2787, + "step": 35824 + }, + { + "epoch": 0.44783619590489765, + "grad_norm": 4.226833343505859, + "learning_rate": 1.349112930514668e-05, + "loss": 0.1932, + "step": 35826 + }, + { + "epoch": 0.44786119652991324, + "grad_norm": 6.003742694854736, + "learning_rate": 1.3490311511994696e-05, + "loss": 0.7723, + "step": 35828 + }, + { + "epoch": 0.4478861971549289, + "grad_norm": 3.4880387783050537, + "learning_rate": 1.348949369226099e-05, + "loss": 1.5011, + "step": 35830 + }, + { + "epoch": 0.4479111977799445, + "grad_norm": 3.006528377532959, + "learning_rate": 1.3488675845951793e-05, + "loss": 1.75, + "step": 35832 + }, + { + "epoch": 0.44793619840496013, + "grad_norm": 2.985197067260742, + "learning_rate": 1.3487857973073338e-05, + "loss": 0.6448, + "step": 35834 + }, + { + "epoch": 0.4479611990299758, + "grad_norm": 2.504432439804077, + "learning_rate": 1.3487040073631843e-05, + "loss": 0.4795, + "step": 35836 + }, + { + "epoch": 0.44798619965499137, + "grad_norm": 2.6442410945892334, + "learning_rate": 1.348622214763355e-05, + "loss": 1.1992, + "step": 35838 + }, + { + "epoch": 0.448011200280007, + "grad_norm": 3.6728408336639404, + "learning_rate": 1.3485404195084678e-05, + "loss": 0.8941, + "step": 35840 + }, + { + "epoch": 0.4480362009050226, + "grad_norm": 3.4575257301330566, + "learning_rate": 1.3484586215991465e-05, + "loss": 1.5597, + "step": 35842 + }, + { + "epoch": 0.44806120153003826, + "grad_norm": 0.0010703591397032142, + "learning_rate": 1.3483768210360133e-05, + "loss": 0.6934, + "step": 35844 + }, + { + "epoch": 0.4480862021550539, + "grad_norm": 4.661465167999268, + "learning_rate": 1.3482950178196919e-05, + "loss": 1.299, + "step": 35846 + }, + { + "epoch": 0.4481112027800695, + "grad_norm": 3.9219088554382324, + "learning_rate": 1.3482132119508043e-05, + "loss": 0.367, + "step": 35848 + }, + { + "epoch": 0.44813620340508514, + "grad_norm": 4.102813720703125, + "learning_rate": 1.3481314034299745e-05, + "loss": 1.4001, + "step": 35850 + }, + { + "epoch": 0.44816120403010073, + "grad_norm": 2.7193193435668945, + "learning_rate": 1.3480495922578248e-05, + "loss": 0.7451, + "step": 35852 + }, + { + "epoch": 0.4481862046551164, + "grad_norm": 0.0011351692955940962, + "learning_rate": 1.3479677784349791e-05, + "loss": 0.0009, + "step": 35854 + }, + { + "epoch": 0.44821120528013203, + "grad_norm": 3.8168764114379883, + "learning_rate": 1.3478859619620596e-05, + "loss": 0.3913, + "step": 35856 + }, + { + "epoch": 0.4482362059051476, + "grad_norm": 1.887445330619812, + "learning_rate": 1.3478041428396902e-05, + "loss": 1.3738, + "step": 35858 + }, + { + "epoch": 0.44826120653016327, + "grad_norm": 4.599379539489746, + "learning_rate": 1.3477223210684932e-05, + "loss": 1.7818, + "step": 35860 + }, + { + "epoch": 0.44828620715517886, + "grad_norm": 2.6248013973236084, + "learning_rate": 1.3476404966490927e-05, + "loss": 1.8635, + "step": 35862 + }, + { + "epoch": 0.4483112077801945, + "grad_norm": 0.02249562181532383, + "learning_rate": 1.3475586695821109e-05, + "loss": 0.0191, + "step": 35864 + }, + { + "epoch": 0.44833620840521016, + "grad_norm": 2.005678653717041, + "learning_rate": 1.3474768398681718e-05, + "loss": 0.6634, + "step": 35866 + }, + { + "epoch": 0.44836120903022575, + "grad_norm": 3.9025707244873047, + "learning_rate": 1.347395007507898e-05, + "loss": 1.1239, + "step": 35868 + }, + { + "epoch": 0.4483862096552414, + "grad_norm": 1.8835833072662354, + "learning_rate": 1.3473131725019126e-05, + "loss": 0.2719, + "step": 35870 + }, + { + "epoch": 0.448411210280257, + "grad_norm": 2.4123008251190186, + "learning_rate": 1.3472313348508396e-05, + "loss": 1.5588, + "step": 35872 + }, + { + "epoch": 0.44843621090527264, + "grad_norm": 2.1669721603393555, + "learning_rate": 1.3471494945553018e-05, + "loss": 0.302, + "step": 35874 + }, + { + "epoch": 0.4484612115302883, + "grad_norm": 2.8401248455047607, + "learning_rate": 1.3470676516159224e-05, + "loss": 0.6755, + "step": 35876 + }, + { + "epoch": 0.4484862121553039, + "grad_norm": 2.9660916328430176, + "learning_rate": 1.346985806033325e-05, + "loss": 0.9485, + "step": 35878 + }, + { + "epoch": 0.4485112127803195, + "grad_norm": 5.000145435333252, + "learning_rate": 1.3469039578081329e-05, + "loss": 1.1492, + "step": 35880 + }, + { + "epoch": 0.4485362134053351, + "grad_norm": 4.294728755950928, + "learning_rate": 1.3468221069409693e-05, + "loss": 1.3852, + "step": 35882 + }, + { + "epoch": 0.44856121403035076, + "grad_norm": 3.2513198852539062, + "learning_rate": 1.3467402534324575e-05, + "loss": 1.5222, + "step": 35884 + }, + { + "epoch": 0.4485862146553664, + "grad_norm": 2.7598204612731934, + "learning_rate": 1.3466583972832208e-05, + "loss": 0.823, + "step": 35886 + }, + { + "epoch": 0.448611215280382, + "grad_norm": 3.595219612121582, + "learning_rate": 1.3465765384938827e-05, + "loss": 0.8523, + "step": 35888 + }, + { + "epoch": 0.44863621590539765, + "grad_norm": 0.0008791686268523335, + "learning_rate": 1.3464946770650668e-05, + "loss": 0.4443, + "step": 35890 + }, + { + "epoch": 0.44866121653041324, + "grad_norm": 5.707756996154785, + "learning_rate": 1.3464128129973969e-05, + "loss": 0.7341, + "step": 35892 + }, + { + "epoch": 0.4486862171554289, + "grad_norm": 4.527388095855713, + "learning_rate": 1.3463309462914956e-05, + "loss": 1.0091, + "step": 35894 + }, + { + "epoch": 0.44871121778044454, + "grad_norm": 4.609324932098389, + "learning_rate": 1.3462490769479865e-05, + "loss": 0.761, + "step": 35896 + }, + { + "epoch": 0.44873621840546013, + "grad_norm": 1.3032536506652832, + "learning_rate": 1.3461672049674937e-05, + "loss": 0.6974, + "step": 35898 + }, + { + "epoch": 0.4487612190304758, + "grad_norm": 9.63546371459961, + "learning_rate": 1.3460853303506403e-05, + "loss": 0.8454, + "step": 35900 + }, + { + "epoch": 0.44878621965549137, + "grad_norm": 3.6919071674346924, + "learning_rate": 1.3460034530980502e-05, + "loss": 1.1707, + "step": 35902 + }, + { + "epoch": 0.448811220280507, + "grad_norm": 0.9915325045585632, + "learning_rate": 1.3459215732103466e-05, + "loss": 0.314, + "step": 35904 + }, + { + "epoch": 0.44883622090552266, + "grad_norm": 4.596720218658447, + "learning_rate": 1.3458396906881533e-05, + "loss": 2.1899, + "step": 35906 + }, + { + "epoch": 0.44886122153053826, + "grad_norm": 1.5454933643341064, + "learning_rate": 1.3457578055320936e-05, + "loss": 1.2471, + "step": 35908 + }, + { + "epoch": 0.4488862221555539, + "grad_norm": 3.307003974914551, + "learning_rate": 1.3456759177427916e-05, + "loss": 0.1971, + "step": 35910 + }, + { + "epoch": 0.4489112227805695, + "grad_norm": 5.739518165588379, + "learning_rate": 1.3455940273208706e-05, + "loss": 1.1537, + "step": 35912 + }, + { + "epoch": 0.44893622340558514, + "grad_norm": 5.124398708343506, + "learning_rate": 1.3455121342669542e-05, + "loss": 1.0461, + "step": 35914 + }, + { + "epoch": 0.4489612240306008, + "grad_norm": 3.4500534534454346, + "learning_rate": 1.3454302385816665e-05, + "loss": 1.3068, + "step": 35916 + }, + { + "epoch": 0.4489862246556164, + "grad_norm": 3.3269293308258057, + "learning_rate": 1.3453483402656308e-05, + "loss": 0.6393, + "step": 35918 + }, + { + "epoch": 0.44901122528063203, + "grad_norm": 5.917181015014648, + "learning_rate": 1.3452664393194712e-05, + "loss": 1.686, + "step": 35920 + }, + { + "epoch": 0.4490362259056476, + "grad_norm": 3.9086179733276367, + "learning_rate": 1.3451845357438109e-05, + "loss": 1.19, + "step": 35922 + }, + { + "epoch": 0.44906122653066327, + "grad_norm": 2.436161756515503, + "learning_rate": 1.345102629539274e-05, + "loss": 0.7136, + "step": 35924 + }, + { + "epoch": 0.4490862271556789, + "grad_norm": 4.488478660583496, + "learning_rate": 1.3450207207064845e-05, + "loss": 1.0371, + "step": 35926 + }, + { + "epoch": 0.4491112277806945, + "grad_norm": 1.2277231216430664, + "learning_rate": 1.3449388092460657e-05, + "loss": 0.0237, + "step": 35928 + }, + { + "epoch": 0.44913622840571016, + "grad_norm": 2.44673752784729, + "learning_rate": 1.3448568951586421e-05, + "loss": 0.4706, + "step": 35930 + }, + { + "epoch": 0.44916122903072575, + "grad_norm": 0.09991008043289185, + "learning_rate": 1.344774978444837e-05, + "loss": 1.8988, + "step": 35932 + }, + { + "epoch": 0.4491862296557414, + "grad_norm": 3.893907308578491, + "learning_rate": 1.3446930591052742e-05, + "loss": 0.7051, + "step": 35934 + }, + { + "epoch": 0.44921123028075705, + "grad_norm": 4.566971778869629, + "learning_rate": 1.3446111371405782e-05, + "loss": 1.5417, + "step": 35936 + }, + { + "epoch": 0.44923623090577264, + "grad_norm": 2.1126770973205566, + "learning_rate": 1.344529212551372e-05, + "loss": 0.8103, + "step": 35938 + }, + { + "epoch": 0.4492612315307883, + "grad_norm": 2.7326748371124268, + "learning_rate": 1.3444472853382804e-05, + "loss": 1.3606, + "step": 35940 + }, + { + "epoch": 0.4492862321558039, + "grad_norm": 3.9993515014648438, + "learning_rate": 1.344365355501927e-05, + "loss": 1.2015, + "step": 35942 + }, + { + "epoch": 0.4493112327808195, + "grad_norm": 2.0868935585021973, + "learning_rate": 1.3442834230429358e-05, + "loss": 0.5847, + "step": 35944 + }, + { + "epoch": 0.4493362334058352, + "grad_norm": 3.674156427383423, + "learning_rate": 1.3442014879619305e-05, + "loss": 0.7104, + "step": 35946 + }, + { + "epoch": 0.44936123403085076, + "grad_norm": 0.0009681438677944243, + "learning_rate": 1.3441195502595355e-05, + "loss": 0.5412, + "step": 35948 + }, + { + "epoch": 0.4493862346558664, + "grad_norm": 3.678738832473755, + "learning_rate": 1.3440376099363741e-05, + "loss": 1.3049, + "step": 35950 + }, + { + "epoch": 0.449411235280882, + "grad_norm": 0.5162949562072754, + "learning_rate": 1.3439556669930718e-05, + "loss": 0.1869, + "step": 35952 + }, + { + "epoch": 0.44943623590589765, + "grad_norm": 5.882934093475342, + "learning_rate": 1.3438737214302512e-05, + "loss": 1.672, + "step": 35954 + }, + { + "epoch": 0.4494612365309133, + "grad_norm": 2.2233073711395264, + "learning_rate": 1.3437917732485372e-05, + "loss": 0.5283, + "step": 35956 + }, + { + "epoch": 0.4494862371559289, + "grad_norm": 3.1931872367858887, + "learning_rate": 1.3437098224485539e-05, + "loss": 0.9054, + "step": 35958 + }, + { + "epoch": 0.44951123778094454, + "grad_norm": 5.309523105621338, + "learning_rate": 1.3436278690309248e-05, + "loss": 1.1124, + "step": 35960 + }, + { + "epoch": 0.44953623840596013, + "grad_norm": 3.7283308506011963, + "learning_rate": 1.3435459129962745e-05, + "loss": 0.602, + "step": 35962 + }, + { + "epoch": 0.4495612390309758, + "grad_norm": 2.105661630630493, + "learning_rate": 1.3434639543452273e-05, + "loss": 1.2559, + "step": 35964 + }, + { + "epoch": 0.4495862396559914, + "grad_norm": 0.0011797084007412195, + "learning_rate": 1.3433819930784069e-05, + "loss": 0.0626, + "step": 35966 + }, + { + "epoch": 0.449611240281007, + "grad_norm": 3.520146131515503, + "learning_rate": 1.343300029196438e-05, + "loss": 0.2871, + "step": 35968 + }, + { + "epoch": 0.44963624090602267, + "grad_norm": 1.926448941230774, + "learning_rate": 1.3432180626999448e-05, + "loss": 1.4879, + "step": 35970 + }, + { + "epoch": 0.44966124153103826, + "grad_norm": 4.023420333862305, + "learning_rate": 1.3431360935895513e-05, + "loss": 1.2735, + "step": 35972 + }, + { + "epoch": 0.4496862421560539, + "grad_norm": 2.2283740043640137, + "learning_rate": 1.3430541218658814e-05, + "loss": 1.002, + "step": 35974 + }, + { + "epoch": 0.44971124278106955, + "grad_norm": 6.195583343505859, + "learning_rate": 1.3429721475295598e-05, + "loss": 1.9463, + "step": 35976 + }, + { + "epoch": 0.44973624340608515, + "grad_norm": 3.6197428703308105, + "learning_rate": 1.3428901705812117e-05, + "loss": 1.607, + "step": 35978 + }, + { + "epoch": 0.4497612440311008, + "grad_norm": 5.208782196044922, + "learning_rate": 1.34280819102146e-05, + "loss": 1.5785, + "step": 35980 + }, + { + "epoch": 0.4497862446561164, + "grad_norm": 2.171855926513672, + "learning_rate": 1.3427262088509294e-05, + "loss": 0.3319, + "step": 35982 + }, + { + "epoch": 0.44981124528113203, + "grad_norm": 4.383617877960205, + "learning_rate": 1.3426442240702446e-05, + "loss": 1.8428, + "step": 35984 + }, + { + "epoch": 0.4498362459061477, + "grad_norm": 3.354315757751465, + "learning_rate": 1.3425622366800299e-05, + "loss": 0.6484, + "step": 35986 + }, + { + "epoch": 0.4498612465311633, + "grad_norm": 0.0008014451595954597, + "learning_rate": 1.3424802466809094e-05, + "loss": 0.3976, + "step": 35988 + }, + { + "epoch": 0.4498862471561789, + "grad_norm": 2.792407274246216, + "learning_rate": 1.3423982540735083e-05, + "loss": 1.0901, + "step": 35990 + }, + { + "epoch": 0.4499112477811945, + "grad_norm": 5.643129825592041, + "learning_rate": 1.3423162588584503e-05, + "loss": 0.707, + "step": 35992 + }, + { + "epoch": 0.44993624840621016, + "grad_norm": 2.29764723777771, + "learning_rate": 1.3422342610363599e-05, + "loss": 0.1984, + "step": 35994 + }, + { + "epoch": 0.4499612490312258, + "grad_norm": 0.19323939085006714, + "learning_rate": 1.342152260607862e-05, + "loss": 0.0855, + "step": 35996 + }, + { + "epoch": 0.4499862496562414, + "grad_norm": 0.5231638550758362, + "learning_rate": 1.342070257573581e-05, + "loss": 0.1593, + "step": 35998 + }, + { + "epoch": 0.45001125028125705, + "grad_norm": 2.7005622386932373, + "learning_rate": 1.3419882519341405e-05, + "loss": 0.4724, + "step": 36000 + }, + { + "epoch": 0.45003625090627264, + "grad_norm": 3.4787709712982178, + "learning_rate": 1.3419062436901664e-05, + "loss": 0.3822, + "step": 36002 + }, + { + "epoch": 0.4500612515312883, + "grad_norm": 0.0011200136505067348, + "learning_rate": 1.3418242328422829e-05, + "loss": 0.8673, + "step": 36004 + }, + { + "epoch": 0.45008625215630393, + "grad_norm": 2.43742036819458, + "learning_rate": 1.341742219391114e-05, + "loss": 0.1459, + "step": 36006 + }, + { + "epoch": 0.4501112527813195, + "grad_norm": 2.2319135665893555, + "learning_rate": 1.341660203337285e-05, + "loss": 0.7823, + "step": 36008 + }, + { + "epoch": 0.4501362534063352, + "grad_norm": 2.278317928314209, + "learning_rate": 1.3415781846814202e-05, + "loss": 0.9902, + "step": 36010 + }, + { + "epoch": 0.45016125403135077, + "grad_norm": 1.077907919883728, + "learning_rate": 1.341496163424144e-05, + "loss": 0.6446, + "step": 36012 + }, + { + "epoch": 0.4501862546563664, + "grad_norm": 3.736910343170166, + "learning_rate": 1.341414139566081e-05, + "loss": 1.3309, + "step": 36014 + }, + { + "epoch": 0.45021125528138206, + "grad_norm": 1.4532923698425293, + "learning_rate": 1.3413321131078569e-05, + "loss": 0.1342, + "step": 36016 + }, + { + "epoch": 0.45023625590639765, + "grad_norm": 3.7309110164642334, + "learning_rate": 1.3412500840500955e-05, + "loss": 1.084, + "step": 36018 + }, + { + "epoch": 0.4502612565314133, + "grad_norm": 1.2290167808532715, + "learning_rate": 1.3411680523934214e-05, + "loss": 0.8025, + "step": 36020 + }, + { + "epoch": 0.4502862571564289, + "grad_norm": 0.9523539543151855, + "learning_rate": 1.3410860181384599e-05, + "loss": 0.6386, + "step": 36022 + }, + { + "epoch": 0.45031125778144454, + "grad_norm": 2.9812633991241455, + "learning_rate": 1.3410039812858355e-05, + "loss": 1.2622, + "step": 36024 + }, + { + "epoch": 0.4503362584064602, + "grad_norm": 0.7050113677978516, + "learning_rate": 1.340921941836173e-05, + "loss": 0.2583, + "step": 36026 + }, + { + "epoch": 0.4503612590314758, + "grad_norm": 5.276010990142822, + "learning_rate": 1.340839899790097e-05, + "loss": 1.392, + "step": 36028 + }, + { + "epoch": 0.45038625965649143, + "grad_norm": 3.41342830657959, + "learning_rate": 1.340757855148233e-05, + "loss": 0.394, + "step": 36030 + }, + { + "epoch": 0.450411260281507, + "grad_norm": 2.681105613708496, + "learning_rate": 1.340675807911205e-05, + "loss": 0.8921, + "step": 36032 + }, + { + "epoch": 0.45043626090652267, + "grad_norm": 4.2165207862854, + "learning_rate": 1.3405937580796381e-05, + "loss": 1.5476, + "step": 36034 + }, + { + "epoch": 0.4504612615315383, + "grad_norm": 4.280266761779785, + "learning_rate": 1.3405117056541578e-05, + "loss": 2.0225, + "step": 36036 + }, + { + "epoch": 0.4504862621565539, + "grad_norm": 0.0038163720164448023, + "learning_rate": 1.340429650635388e-05, + "loss": 0.5772, + "step": 36038 + }, + { + "epoch": 0.45051126278156955, + "grad_norm": 2.958859920501709, + "learning_rate": 1.3403475930239543e-05, + "loss": 0.6837, + "step": 36040 + }, + { + "epoch": 0.45053626340658515, + "grad_norm": 0.0014714046847075224, + "learning_rate": 1.3402655328204816e-05, + "loss": 0.0405, + "step": 36042 + }, + { + "epoch": 0.4505612640316008, + "grad_norm": 2.733513355255127, + "learning_rate": 1.3401834700255947e-05, + "loss": 0.1403, + "step": 36044 + }, + { + "epoch": 0.45058626465661644, + "grad_norm": 1.7506306171417236, + "learning_rate": 1.3401014046399183e-05, + "loss": 0.6903, + "step": 36046 + }, + { + "epoch": 0.45061126528163203, + "grad_norm": 3.9652442932128906, + "learning_rate": 1.340019336664078e-05, + "loss": 1.3654, + "step": 36048 + }, + { + "epoch": 0.4506362659066477, + "grad_norm": 4.048565864562988, + "learning_rate": 1.3399372660986983e-05, + "loss": 0.3462, + "step": 36050 + }, + { + "epoch": 0.4506612665316633, + "grad_norm": 1.6855454444885254, + "learning_rate": 1.3398551929444046e-05, + "loss": 0.644, + "step": 36052 + }, + { + "epoch": 0.4506862671566789, + "grad_norm": 5.836450099945068, + "learning_rate": 1.3397731172018218e-05, + "loss": 0.5076, + "step": 36054 + }, + { + "epoch": 0.45071126778169457, + "grad_norm": 4.424631595611572, + "learning_rate": 1.3396910388715752e-05, + "loss": 1.0988, + "step": 36056 + }, + { + "epoch": 0.45073626840671016, + "grad_norm": 2.2734508514404297, + "learning_rate": 1.3396089579542892e-05, + "loss": 1.1219, + "step": 36058 + }, + { + "epoch": 0.4507612690317258, + "grad_norm": 1.8716950416564941, + "learning_rate": 1.3395268744505897e-05, + "loss": 0.1699, + "step": 36060 + }, + { + "epoch": 0.4507862696567414, + "grad_norm": 6.230964183807373, + "learning_rate": 1.3394447883611015e-05, + "loss": 0.8097, + "step": 36062 + }, + { + "epoch": 0.45081127028175705, + "grad_norm": 6.016506195068359, + "learning_rate": 1.3393626996864498e-05, + "loss": 1.2255, + "step": 36064 + }, + { + "epoch": 0.4508362709067727, + "grad_norm": 3.675330400466919, + "learning_rate": 1.3392806084272597e-05, + "loss": 1.1693, + "step": 36066 + }, + { + "epoch": 0.4508612715317883, + "grad_norm": 3.0206401348114014, + "learning_rate": 1.3391985145841566e-05, + "loss": 1.5954, + "step": 36068 + }, + { + "epoch": 0.45088627215680394, + "grad_norm": 5.27079963684082, + "learning_rate": 1.3391164181577656e-05, + "loss": 0.4765, + "step": 36070 + }, + { + "epoch": 0.4509112727818195, + "grad_norm": 4.654643535614014, + "learning_rate": 1.3390343191487116e-05, + "loss": 1.6564, + "step": 36072 + }, + { + "epoch": 0.4509362734068352, + "grad_norm": 6.2484965324401855, + "learning_rate": 1.3389522175576204e-05, + "loss": 2.0203, + "step": 36074 + }, + { + "epoch": 0.4509612740318508, + "grad_norm": 4.585244178771973, + "learning_rate": 1.3388701133851169e-05, + "loss": 1.4529, + "step": 36076 + }, + { + "epoch": 0.4509862746568664, + "grad_norm": 0.0011424868134781718, + "learning_rate": 1.3387880066318268e-05, + "loss": 0.5027, + "step": 36078 + }, + { + "epoch": 0.45101127528188206, + "grad_norm": 2.8101658821105957, + "learning_rate": 1.3387058972983749e-05, + "loss": 0.6253, + "step": 36080 + }, + { + "epoch": 0.45103627590689765, + "grad_norm": 4.472098350524902, + "learning_rate": 1.3386237853853872e-05, + "loss": 2.2902, + "step": 36082 + }, + { + "epoch": 0.4510612765319133, + "grad_norm": 3.20923113822937, + "learning_rate": 1.3385416708934882e-05, + "loss": 1.596, + "step": 36084 + }, + { + "epoch": 0.45108627715692895, + "grad_norm": 2.583244562149048, + "learning_rate": 1.3384595538233038e-05, + "loss": 0.4556, + "step": 36086 + }, + { + "epoch": 0.45111127778194454, + "grad_norm": 1.9370434284210205, + "learning_rate": 1.3383774341754593e-05, + "loss": 1.5175, + "step": 36088 + }, + { + "epoch": 0.4511362784069602, + "grad_norm": 2.9757938385009766, + "learning_rate": 1.3382953119505802e-05, + "loss": 0.6994, + "step": 36090 + }, + { + "epoch": 0.4511612790319758, + "grad_norm": 2.6701560020446777, + "learning_rate": 1.3382131871492918e-05, + "loss": 1.3978, + "step": 36092 + }, + { + "epoch": 0.45118627965699143, + "grad_norm": 0.00362979038618505, + "learning_rate": 1.3381310597722197e-05, + "loss": 0.7121, + "step": 36094 + }, + { + "epoch": 0.4512112802820071, + "grad_norm": 2.513594388961792, + "learning_rate": 1.3380489298199892e-05, + "loss": 2.3931, + "step": 36096 + }, + { + "epoch": 0.45123628090702267, + "grad_norm": 3.850039482116699, + "learning_rate": 1.3379667972932258e-05, + "loss": 0.5446, + "step": 36098 + }, + { + "epoch": 0.4512612815320383, + "grad_norm": 1.158107042312622, + "learning_rate": 1.337884662192555e-05, + "loss": 1.0021, + "step": 36100 + }, + { + "epoch": 0.4512862821570539, + "grad_norm": 3.86726713180542, + "learning_rate": 1.3378025245186024e-05, + "loss": 1.1381, + "step": 36102 + }, + { + "epoch": 0.45131128278206956, + "grad_norm": 7.967104434967041, + "learning_rate": 1.3377203842719935e-05, + "loss": 1.1465, + "step": 36104 + }, + { + "epoch": 0.4513362834070852, + "grad_norm": 2.011671781539917, + "learning_rate": 1.337638241453354e-05, + "loss": 0.8972, + "step": 36106 + }, + { + "epoch": 0.4513612840321008, + "grad_norm": 2.4515225887298584, + "learning_rate": 1.3375560960633096e-05, + "loss": 0.7286, + "step": 36108 + }, + { + "epoch": 0.45138628465711644, + "grad_norm": 6.781315326690674, + "learning_rate": 1.3374739481024857e-05, + "loss": 1.0415, + "step": 36110 + }, + { + "epoch": 0.45141128528213204, + "grad_norm": 4.577553749084473, + "learning_rate": 1.3373917975715076e-05, + "loss": 0.6141, + "step": 36112 + }, + { + "epoch": 0.4514362859071477, + "grad_norm": 22.9893856048584, + "learning_rate": 1.3373096444710014e-05, + "loss": 2.0087, + "step": 36114 + }, + { + "epoch": 0.45146128653216333, + "grad_norm": 2.4222805500030518, + "learning_rate": 1.3372274888015926e-05, + "loss": 0.8664, + "step": 36116 + }, + { + "epoch": 0.4514862871571789, + "grad_norm": 3.970147132873535, + "learning_rate": 1.3371453305639071e-05, + "loss": 2.4165, + "step": 36118 + }, + { + "epoch": 0.45151128778219457, + "grad_norm": 1.833937406539917, + "learning_rate": 1.3370631697585705e-05, + "loss": 0.8659, + "step": 36120 + }, + { + "epoch": 0.45153628840721016, + "grad_norm": 4.7109222412109375, + "learning_rate": 1.3369810063862084e-05, + "loss": 0.6659, + "step": 36122 + }, + { + "epoch": 0.4515612890322258, + "grad_norm": 2.3324925899505615, + "learning_rate": 1.3368988404474464e-05, + "loss": 0.1749, + "step": 36124 + }, + { + "epoch": 0.45158628965724146, + "grad_norm": 5.722578048706055, + "learning_rate": 1.3368166719429105e-05, + "loss": 1.9268, + "step": 36126 + }, + { + "epoch": 0.45161129028225705, + "grad_norm": 4.2822675704956055, + "learning_rate": 1.3367345008732267e-05, + "loss": 2.011, + "step": 36128 + }, + { + "epoch": 0.4516362909072727, + "grad_norm": 0.4314844608306885, + "learning_rate": 1.3366523272390203e-05, + "loss": 0.7379, + "step": 36130 + }, + { + "epoch": 0.4516612915322883, + "grad_norm": 1.6327965259552002, + "learning_rate": 1.3365701510409175e-05, + "loss": 0.5567, + "step": 36132 + }, + { + "epoch": 0.45168629215730394, + "grad_norm": 3.586531162261963, + "learning_rate": 1.336487972279544e-05, + "loss": 1.1868, + "step": 36134 + }, + { + "epoch": 0.4517112927823196, + "grad_norm": 0.27371951937675476, + "learning_rate": 1.336405790955526e-05, + "loss": 0.396, + "step": 36136 + }, + { + "epoch": 0.4517362934073352, + "grad_norm": 2.930595874786377, + "learning_rate": 1.3363236070694887e-05, + "loss": 1.0464, + "step": 36138 + }, + { + "epoch": 0.4517612940323508, + "grad_norm": 2.5663387775421143, + "learning_rate": 1.3362414206220584e-05, + "loss": 0.9598, + "step": 36140 + }, + { + "epoch": 0.4517862946573664, + "grad_norm": 3.193391799926758, + "learning_rate": 1.3361592316138609e-05, + "loss": 1.1323, + "step": 36142 + }, + { + "epoch": 0.45181129528238206, + "grad_norm": 2.0716753005981445, + "learning_rate": 1.3360770400455225e-05, + "loss": 0.4033, + "step": 36144 + }, + { + "epoch": 0.4518362959073977, + "grad_norm": 3.6667535305023193, + "learning_rate": 1.3359948459176688e-05, + "loss": 1.501, + "step": 36146 + }, + { + "epoch": 0.4518612965324133, + "grad_norm": 2.909355401992798, + "learning_rate": 1.335912649230926e-05, + "loss": 0.757, + "step": 36148 + }, + { + "epoch": 0.45188629715742895, + "grad_norm": 12.383309364318848, + "learning_rate": 1.3358304499859195e-05, + "loss": 1.3301, + "step": 36150 + }, + { + "epoch": 0.45191129778244454, + "grad_norm": 2.1506736278533936, + "learning_rate": 1.335748248183276e-05, + "loss": 1.2566, + "step": 36152 + }, + { + "epoch": 0.4519362984074602, + "grad_norm": 6.398323059082031, + "learning_rate": 1.3356660438236215e-05, + "loss": 2.3288, + "step": 36154 + }, + { + "epoch": 0.45196129903247584, + "grad_norm": 0.9154337644577026, + "learning_rate": 1.3355838369075817e-05, + "loss": 0.0853, + "step": 36156 + }, + { + "epoch": 0.45198629965749143, + "grad_norm": 0.019902952015399933, + "learning_rate": 1.3355016274357831e-05, + "loss": 0.9844, + "step": 36158 + }, + { + "epoch": 0.4520113002825071, + "grad_norm": 3.5376195907592773, + "learning_rate": 1.3354194154088516e-05, + "loss": 0.8425, + "step": 36160 + }, + { + "epoch": 0.45203630090752267, + "grad_norm": 0.6836887001991272, + "learning_rate": 1.335337200827413e-05, + "loss": 0.7616, + "step": 36162 + }, + { + "epoch": 0.4520613015325383, + "grad_norm": 4.149667263031006, + "learning_rate": 1.3352549836920936e-05, + "loss": 0.4381, + "step": 36164 + }, + { + "epoch": 0.45208630215755397, + "grad_norm": 3.5537619590759277, + "learning_rate": 1.3351727640035198e-05, + "loss": 0.3479, + "step": 36166 + }, + { + "epoch": 0.45211130278256956, + "grad_norm": 2.2662110328674316, + "learning_rate": 1.3350905417623177e-05, + "loss": 0.7308, + "step": 36168 + }, + { + "epoch": 0.4521363034075852, + "grad_norm": 0.9200073480606079, + "learning_rate": 1.3350083169691133e-05, + "loss": 0.6976, + "step": 36170 + }, + { + "epoch": 0.4521613040326008, + "grad_norm": 2.332617998123169, + "learning_rate": 1.3349260896245328e-05, + "loss": 0.8297, + "step": 36172 + }, + { + "epoch": 0.45218630465761644, + "grad_norm": 4.282430648803711, + "learning_rate": 1.334843859729203e-05, + "loss": 1.4574, + "step": 36174 + }, + { + "epoch": 0.4522113052826321, + "grad_norm": 0.006607812363654375, + "learning_rate": 1.3347616272837494e-05, + "loss": 0.462, + "step": 36176 + }, + { + "epoch": 0.4522363059076477, + "grad_norm": 0.52911376953125, + "learning_rate": 1.3346793922887983e-05, + "loss": 0.0143, + "step": 36178 + }, + { + "epoch": 0.45226130653266333, + "grad_norm": 3.447263717651367, + "learning_rate": 1.334597154744977e-05, + "loss": 0.7861, + "step": 36180 + }, + { + "epoch": 0.4522863071576789, + "grad_norm": 1.8946459293365479, + "learning_rate": 1.3345149146529106e-05, + "loss": 0.8761, + "step": 36182 + }, + { + "epoch": 0.45231130778269457, + "grad_norm": 6.337350845336914, + "learning_rate": 1.3344326720132261e-05, + "loss": 1.0344, + "step": 36184 + }, + { + "epoch": 0.4523363084077102, + "grad_norm": 3.053269863128662, + "learning_rate": 1.3343504268265497e-05, + "loss": 0.4137, + "step": 36186 + }, + { + "epoch": 0.4523613090327258, + "grad_norm": 3.746476888656616, + "learning_rate": 1.3342681790935076e-05, + "loss": 0.578, + "step": 36188 + }, + { + "epoch": 0.45238630965774146, + "grad_norm": 1.2850255966186523, + "learning_rate": 1.334185928814726e-05, + "loss": 0.5656, + "step": 36190 + }, + { + "epoch": 0.45241131028275705, + "grad_norm": 2.906010866165161, + "learning_rate": 1.334103675990832e-05, + "loss": 0.4993, + "step": 36192 + }, + { + "epoch": 0.4524363109077727, + "grad_norm": 6.164986610412598, + "learning_rate": 1.3340214206224518e-05, + "loss": 1.1045, + "step": 36194 + }, + { + "epoch": 0.45246131153278835, + "grad_norm": 2.625584363937378, + "learning_rate": 1.3339391627102117e-05, + "loss": 0.6939, + "step": 36196 + }, + { + "epoch": 0.45248631215780394, + "grad_norm": 3.3169312477111816, + "learning_rate": 1.3338569022547377e-05, + "loss": 0.9705, + "step": 36198 + }, + { + "epoch": 0.4525113127828196, + "grad_norm": 3.7472121715545654, + "learning_rate": 1.3337746392566573e-05, + "loss": 1.3147, + "step": 36200 + }, + { + "epoch": 0.4525363134078352, + "grad_norm": 3.0643367767333984, + "learning_rate": 1.333692373716596e-05, + "loss": 0.8235, + "step": 36202 + }, + { + "epoch": 0.4525613140328508, + "grad_norm": 1.7688708305358887, + "learning_rate": 1.333610105635181e-05, + "loss": 0.2162, + "step": 36204 + }, + { + "epoch": 0.4525863146578665, + "grad_norm": 2.5651402473449707, + "learning_rate": 1.3335278350130385e-05, + "loss": 1.2775, + "step": 36206 + }, + { + "epoch": 0.45261131528288207, + "grad_norm": 0.7250610589981079, + "learning_rate": 1.3334455618507952e-05, + "loss": 0.1001, + "step": 36208 + }, + { + "epoch": 0.4526363159078977, + "grad_norm": 0.003642120398581028, + "learning_rate": 1.3333632861490778e-05, + "loss": 0.4079, + "step": 36210 + }, + { + "epoch": 0.4526613165329133, + "grad_norm": 5.020812034606934, + "learning_rate": 1.3332810079085126e-05, + "loss": 1.3258, + "step": 36212 + }, + { + "epoch": 0.45268631715792895, + "grad_norm": 0.0030447207391262054, + "learning_rate": 1.3331987271297262e-05, + "loss": 0.4671, + "step": 36214 + }, + { + "epoch": 0.4527113177829446, + "grad_norm": 0.07332679629325867, + "learning_rate": 1.3331164438133455e-05, + "loss": 1.4513, + "step": 36216 + }, + { + "epoch": 0.4527363184079602, + "grad_norm": 3.3858726024627686, + "learning_rate": 1.3330341579599974e-05, + "loss": 1.4491, + "step": 36218 + }, + { + "epoch": 0.45276131903297584, + "grad_norm": 5.249691009521484, + "learning_rate": 1.332951869570308e-05, + "loss": 1.9843, + "step": 36220 + }, + { + "epoch": 0.45278631965799143, + "grad_norm": 0.432243674993515, + "learning_rate": 1.3328695786449042e-05, + "loss": 0.4394, + "step": 36222 + }, + { + "epoch": 0.4528113202830071, + "grad_norm": 3.564419746398926, + "learning_rate": 1.3327872851844128e-05, + "loss": 1.086, + "step": 36224 + }, + { + "epoch": 0.4528363209080227, + "grad_norm": 3.198268175125122, + "learning_rate": 1.3327049891894608e-05, + "loss": 0.2277, + "step": 36226 + }, + { + "epoch": 0.4528613215330383, + "grad_norm": 0.004903450608253479, + "learning_rate": 1.332622690660674e-05, + "loss": 0.5139, + "step": 36228 + }, + { + "epoch": 0.45288632215805397, + "grad_norm": 3.106009006500244, + "learning_rate": 1.3325403895986804e-05, + "loss": 0.9351, + "step": 36230 + }, + { + "epoch": 0.45291132278306956, + "grad_norm": 2.6841721534729004, + "learning_rate": 1.332458086004106e-05, + "loss": 1.271, + "step": 36232 + }, + { + "epoch": 0.4529363234080852, + "grad_norm": 0.0011233891127631068, + "learning_rate": 1.3323757798775781e-05, + "loss": 0.525, + "step": 36234 + }, + { + "epoch": 0.45296132403310085, + "grad_norm": 1.4911861419677734, + "learning_rate": 1.3322934712197229e-05, + "loss": 0.5768, + "step": 36236 + }, + { + "epoch": 0.45298632465811645, + "grad_norm": 5.77140998840332, + "learning_rate": 1.3322111600311678e-05, + "loss": 1.3866, + "step": 36238 + }, + { + "epoch": 0.4530113252831321, + "grad_norm": 2.6127688884735107, + "learning_rate": 1.3321288463125392e-05, + "loss": 0.228, + "step": 36240 + }, + { + "epoch": 0.4530363259081477, + "grad_norm": 3.8702445030212402, + "learning_rate": 1.3320465300644644e-05, + "loss": 0.9068, + "step": 36242 + }, + { + "epoch": 0.45306132653316333, + "grad_norm": 4.154241561889648, + "learning_rate": 1.3319642112875706e-05, + "loss": 0.89, + "step": 36244 + }, + { + "epoch": 0.453086327158179, + "grad_norm": 2.3003060817718506, + "learning_rate": 1.331881889982484e-05, + "loss": 1.1946, + "step": 36246 + }, + { + "epoch": 0.4531113277831946, + "grad_norm": 7.372343063354492, + "learning_rate": 1.3317995661498317e-05, + "loss": 0.4396, + "step": 36248 + }, + { + "epoch": 0.4531363284082102, + "grad_norm": 5.125921249389648, + "learning_rate": 1.331717239790241e-05, + "loss": 1.436, + "step": 36250 + }, + { + "epoch": 0.4531613290332258, + "grad_norm": 0.0014281226322054863, + "learning_rate": 1.3316349109043385e-05, + "loss": 0.3736, + "step": 36252 + }, + { + "epoch": 0.45318632965824146, + "grad_norm": 0.0021709073334932327, + "learning_rate": 1.3315525794927515e-05, + "loss": 0.5283, + "step": 36254 + }, + { + "epoch": 0.4532113302832571, + "grad_norm": 2.1371853351593018, + "learning_rate": 1.331470245556107e-05, + "loss": 0.6286, + "step": 36256 + }, + { + "epoch": 0.4532363309082727, + "grad_norm": 3.3349156379699707, + "learning_rate": 1.3313879090950324e-05, + "loss": 0.6079, + "step": 36258 + }, + { + "epoch": 0.45326133153328835, + "grad_norm": 0.7408897876739502, + "learning_rate": 1.3313055701101539e-05, + "loss": 0.403, + "step": 36260 + }, + { + "epoch": 0.45328633215830394, + "grad_norm": 0.004678808618336916, + "learning_rate": 1.331223228602099e-05, + "loss": 0.3773, + "step": 36262 + }, + { + "epoch": 0.4533113327833196, + "grad_norm": 1.6151756048202515, + "learning_rate": 1.3311408845714948e-05, + "loss": 0.2405, + "step": 36264 + }, + { + "epoch": 0.45333633340833523, + "grad_norm": 1.9208372831344604, + "learning_rate": 1.3310585380189686e-05, + "loss": 0.5198, + "step": 36266 + }, + { + "epoch": 0.4533613340333508, + "grad_norm": 2.6987974643707275, + "learning_rate": 1.3309761889451474e-05, + "loss": 0.5539, + "step": 36268 + }, + { + "epoch": 0.4533863346583665, + "grad_norm": 8.63597297668457, + "learning_rate": 1.3308938373506584e-05, + "loss": 2.0353, + "step": 36270 + }, + { + "epoch": 0.45341133528338207, + "grad_norm": 4.101227283477783, + "learning_rate": 1.3308114832361287e-05, + "loss": 1.1471, + "step": 36272 + }, + { + "epoch": 0.4534363359083977, + "grad_norm": 4.527500152587891, + "learning_rate": 1.3307291266021855e-05, + "loss": 0.9864, + "step": 36274 + }, + { + "epoch": 0.45346133653341336, + "grad_norm": 5.722165107727051, + "learning_rate": 1.3306467674494559e-05, + "loss": 1.4921, + "step": 36276 + }, + { + "epoch": 0.45348633715842895, + "grad_norm": 4.530544757843018, + "learning_rate": 1.3305644057785674e-05, + "loss": 1.1689, + "step": 36278 + }, + { + "epoch": 0.4535113377834446, + "grad_norm": 2.1841177940368652, + "learning_rate": 1.3304820415901471e-05, + "loss": 0.3514, + "step": 36280 + }, + { + "epoch": 0.4535363384084602, + "grad_norm": 1.7287780046463013, + "learning_rate": 1.3303996748848223e-05, + "loss": 0.9713, + "step": 36282 + }, + { + "epoch": 0.45356133903347584, + "grad_norm": 0.1203431636095047, + "learning_rate": 1.3303173056632206e-05, + "loss": 0.0763, + "step": 36284 + }, + { + "epoch": 0.4535863396584915, + "grad_norm": 3.335095167160034, + "learning_rate": 1.3302349339259686e-05, + "loss": 1.2957, + "step": 36286 + }, + { + "epoch": 0.4536113402835071, + "grad_norm": 7.36084508895874, + "learning_rate": 1.3301525596736942e-05, + "loss": 2.0061, + "step": 36288 + }, + { + "epoch": 0.45363634090852273, + "grad_norm": 6.2496747970581055, + "learning_rate": 1.3300701829070244e-05, + "loss": 0.1009, + "step": 36290 + }, + { + "epoch": 0.4536613415335383, + "grad_norm": 1.347822666168213, + "learning_rate": 1.3299878036265869e-05, + "loss": 1.1572, + "step": 36292 + }, + { + "epoch": 0.45368634215855397, + "grad_norm": 4.630075931549072, + "learning_rate": 1.329905421833009e-05, + "loss": 1.1384, + "step": 36294 + }, + { + "epoch": 0.4537113427835696, + "grad_norm": 0.8870375752449036, + "learning_rate": 1.3298230375269178e-05, + "loss": 0.05, + "step": 36296 + }, + { + "epoch": 0.4537363434085852, + "grad_norm": 2.694573163986206, + "learning_rate": 1.3297406507089417e-05, + "loss": 0.8676, + "step": 36298 + }, + { + "epoch": 0.45376134403360086, + "grad_norm": 3.1590383052825928, + "learning_rate": 1.3296582613797068e-05, + "loss": 1.944, + "step": 36300 + }, + { + "epoch": 0.45378634465861645, + "grad_norm": 1.7541652917861938, + "learning_rate": 1.3295758695398413e-05, + "loss": 1.084, + "step": 36302 + }, + { + "epoch": 0.4538113452836321, + "grad_norm": 4.257191181182861, + "learning_rate": 1.3294934751899726e-05, + "loss": 1.1385, + "step": 36304 + }, + { + "epoch": 0.45383634590864774, + "grad_norm": 5.201686859130859, + "learning_rate": 1.3294110783307283e-05, + "loss": 1.6882, + "step": 36306 + }, + { + "epoch": 0.45386134653366333, + "grad_norm": 3.0815460681915283, + "learning_rate": 1.3293286789627358e-05, + "loss": 0.8599, + "step": 36308 + }, + { + "epoch": 0.453886347158679, + "grad_norm": 3.193920135498047, + "learning_rate": 1.3292462770866228e-05, + "loss": 0.8692, + "step": 36310 + }, + { + "epoch": 0.4539113477836946, + "grad_norm": 2.932345390319824, + "learning_rate": 1.3291638727030165e-05, + "loss": 0.3661, + "step": 36312 + }, + { + "epoch": 0.4539363484087102, + "grad_norm": 3.080817699432373, + "learning_rate": 1.3290814658125447e-05, + "loss": 1.2153, + "step": 36314 + }, + { + "epoch": 0.45396134903372587, + "grad_norm": 1.7722301483154297, + "learning_rate": 1.3289990564158348e-05, + "loss": 0.6832, + "step": 36316 + }, + { + "epoch": 0.45398634965874146, + "grad_norm": 0.0007655068184249103, + "learning_rate": 1.3289166445135148e-05, + "loss": 1.5583, + "step": 36318 + }, + { + "epoch": 0.4540113502837571, + "grad_norm": 0.001372936530970037, + "learning_rate": 1.3288342301062121e-05, + "loss": 0.0, + "step": 36320 + }, + { + "epoch": 0.4540363509087727, + "grad_norm": 2.51290225982666, + "learning_rate": 1.3287518131945545e-05, + "loss": 1.1702, + "step": 36322 + }, + { + "epoch": 0.45406135153378835, + "grad_norm": 8.938426971435547, + "learning_rate": 1.3286693937791698e-05, + "loss": 0.8185, + "step": 36324 + }, + { + "epoch": 0.454086352158804, + "grad_norm": 6.510619163513184, + "learning_rate": 1.328586971860685e-05, + "loss": 1.1381, + "step": 36326 + }, + { + "epoch": 0.4541113527838196, + "grad_norm": 1.2119148969650269, + "learning_rate": 1.3285045474397284e-05, + "loss": 0.9098, + "step": 36328 + }, + { + "epoch": 0.45413635340883524, + "grad_norm": 1.6238651275634766, + "learning_rate": 1.3284221205169279e-05, + "loss": 0.0782, + "step": 36330 + }, + { + "epoch": 0.45416135403385083, + "grad_norm": 2.9618358612060547, + "learning_rate": 1.3283396910929108e-05, + "loss": 0.2284, + "step": 36332 + }, + { + "epoch": 0.4541863546588665, + "grad_norm": 2.6421706676483154, + "learning_rate": 1.328257259168305e-05, + "loss": 0.9185, + "step": 36334 + }, + { + "epoch": 0.4542113552838821, + "grad_norm": 2.22831130027771, + "learning_rate": 1.3281748247437385e-05, + "loss": 0.481, + "step": 36336 + }, + { + "epoch": 0.4542363559088977, + "grad_norm": 3.259082317352295, + "learning_rate": 1.328092387819839e-05, + "loss": 1.1423, + "step": 36338 + }, + { + "epoch": 0.45426135653391336, + "grad_norm": 4.853209495544434, + "learning_rate": 1.328009948397234e-05, + "loss": 1.5985, + "step": 36340 + }, + { + "epoch": 0.45428635715892896, + "grad_norm": 3.2474122047424316, + "learning_rate": 1.3279275064765515e-05, + "loss": 1.2209, + "step": 36342 + }, + { + "epoch": 0.4543113577839446, + "grad_norm": 3.565298318862915, + "learning_rate": 1.3278450620584198e-05, + "loss": 1.0247, + "step": 36344 + }, + { + "epoch": 0.45433635840896025, + "grad_norm": 2.618598699569702, + "learning_rate": 1.3277626151434662e-05, + "loss": 0.7501, + "step": 36346 + }, + { + "epoch": 0.45436135903397584, + "grad_norm": 4.563591003417969, + "learning_rate": 1.327680165732319e-05, + "loss": 0.2952, + "step": 36348 + }, + { + "epoch": 0.4543863596589915, + "grad_norm": 4.071142673492432, + "learning_rate": 1.3275977138256062e-05, + "loss": 0.8642, + "step": 36350 + }, + { + "epoch": 0.4544113602840071, + "grad_norm": 5.328665733337402, + "learning_rate": 1.3275152594239554e-05, + "loss": 1.934, + "step": 36352 + }, + { + "epoch": 0.45443636090902273, + "grad_norm": 1.788603663444519, + "learning_rate": 1.3274328025279945e-05, + "loss": 0.3457, + "step": 36354 + }, + { + "epoch": 0.4544613615340384, + "grad_norm": 4.824636459350586, + "learning_rate": 1.3273503431383518e-05, + "loss": 0.3974, + "step": 36356 + }, + { + "epoch": 0.45448636215905397, + "grad_norm": 4.268202781677246, + "learning_rate": 1.3272678812556552e-05, + "loss": 1.2172, + "step": 36358 + }, + { + "epoch": 0.4545113627840696, + "grad_norm": 2.4568052291870117, + "learning_rate": 1.327185416880533e-05, + "loss": 1.1247, + "step": 36360 + }, + { + "epoch": 0.4545363634090852, + "grad_norm": 7.908929824829102, + "learning_rate": 1.3271029500136128e-05, + "loss": 0.732, + "step": 36362 + }, + { + "epoch": 0.45456136403410086, + "grad_norm": 3.469531297683716, + "learning_rate": 1.3270204806555224e-05, + "loss": 1.7721, + "step": 36364 + }, + { + "epoch": 0.4545863646591165, + "grad_norm": 2.4283573627471924, + "learning_rate": 1.3269380088068906e-05, + "loss": 0.721, + "step": 36366 + }, + { + "epoch": 0.4546113652841321, + "grad_norm": 3.040565252304077, + "learning_rate": 1.326855534468345e-05, + "loss": 0.8595, + "step": 36368 + }, + { + "epoch": 0.45463636590914774, + "grad_norm": 4.809183120727539, + "learning_rate": 1.326773057640514e-05, + "loss": 1.1638, + "step": 36370 + }, + { + "epoch": 0.45466136653416334, + "grad_norm": 3.7502942085266113, + "learning_rate": 1.3266905783240254e-05, + "loss": 1.6874, + "step": 36372 + }, + { + "epoch": 0.454686367159179, + "grad_norm": 0.12885726988315582, + "learning_rate": 1.3266080965195077e-05, + "loss": 0.7495, + "step": 36374 + }, + { + "epoch": 0.45471136778419463, + "grad_norm": 1.6682932376861572, + "learning_rate": 1.3265256122275892e-05, + "loss": 0.881, + "step": 36376 + }, + { + "epoch": 0.4547363684092102, + "grad_norm": 3.469480037689209, + "learning_rate": 1.3264431254488975e-05, + "loss": 0.9519, + "step": 36378 + }, + { + "epoch": 0.45476136903422587, + "grad_norm": 4.707235336303711, + "learning_rate": 1.3263606361840609e-05, + "loss": 2.0978, + "step": 36380 + }, + { + "epoch": 0.45478636965924146, + "grad_norm": 3.625619411468506, + "learning_rate": 1.3262781444337081e-05, + "loss": 0.4716, + "step": 36382 + }, + { + "epoch": 0.4548113702842571, + "grad_norm": 2.346611261367798, + "learning_rate": 1.3261956501984671e-05, + "loss": 0.3799, + "step": 36384 + }, + { + "epoch": 0.45483637090927276, + "grad_norm": 0.019063159823417664, + "learning_rate": 1.3261131534789661e-05, + "loss": 0.4697, + "step": 36386 + }, + { + "epoch": 0.45486137153428835, + "grad_norm": 6.105023384094238, + "learning_rate": 1.3260306542758337e-05, + "loss": 1.8007, + "step": 36388 + }, + { + "epoch": 0.454886372159304, + "grad_norm": 4.361415863037109, + "learning_rate": 1.3259481525896977e-05, + "loss": 0.9599, + "step": 36390 + }, + { + "epoch": 0.4549113727843196, + "grad_norm": 5.619009017944336, + "learning_rate": 1.3258656484211865e-05, + "loss": 0.8774, + "step": 36392 + }, + { + "epoch": 0.45493637340933524, + "grad_norm": 5.222806930541992, + "learning_rate": 1.3257831417709289e-05, + "loss": 0.8674, + "step": 36394 + }, + { + "epoch": 0.4549613740343509, + "grad_norm": 3.7174551486968994, + "learning_rate": 1.3257006326395527e-05, + "loss": 0.7595, + "step": 36396 + }, + { + "epoch": 0.4549863746593665, + "grad_norm": 4.413179397583008, + "learning_rate": 1.3256181210276868e-05, + "loss": 0.3338, + "step": 36398 + }, + { + "epoch": 0.4550113752843821, + "grad_norm": 3.356461763381958, + "learning_rate": 1.3255356069359593e-05, + "loss": 1.2561, + "step": 36400 + }, + { + "epoch": 0.4550363759093977, + "grad_norm": 0.0005056713707745075, + "learning_rate": 1.3254530903649987e-05, + "loss": 0.7145, + "step": 36402 + }, + { + "epoch": 0.45506137653441336, + "grad_norm": 0.0010446656960994005, + "learning_rate": 1.325370571315433e-05, + "loss": 0.8217, + "step": 36404 + }, + { + "epoch": 0.455086377159429, + "grad_norm": 1.0434650182724, + "learning_rate": 1.3252880497878912e-05, + "loss": 0.0322, + "step": 36406 + }, + { + "epoch": 0.4551113777844446, + "grad_norm": 5.503875732421875, + "learning_rate": 1.325205525783002e-05, + "loss": 1.5109, + "step": 36408 + }, + { + "epoch": 0.45513637840946025, + "grad_norm": 4.53319787979126, + "learning_rate": 1.3251229993013931e-05, + "loss": 1.6975, + "step": 36410 + }, + { + "epoch": 0.45516137903447584, + "grad_norm": 2.425055503845215, + "learning_rate": 1.3250404703436936e-05, + "loss": 0.5606, + "step": 36412 + }, + { + "epoch": 0.4551863796594915, + "grad_norm": 4.60908842086792, + "learning_rate": 1.324957938910532e-05, + "loss": 0.6838, + "step": 36414 + }, + { + "epoch": 0.45521138028450714, + "grad_norm": 0.8618867993354797, + "learning_rate": 1.3248754050025361e-05, + "loss": 0.0248, + "step": 36416 + }, + { + "epoch": 0.45523638090952273, + "grad_norm": 1.9852244853973389, + "learning_rate": 1.3247928686203355e-05, + "loss": 0.7292, + "step": 36418 + }, + { + "epoch": 0.4552613815345384, + "grad_norm": 0.4377184808254242, + "learning_rate": 1.3247103297645582e-05, + "loss": 0.055, + "step": 36420 + }, + { + "epoch": 0.45528638215955397, + "grad_norm": 4.910645484924316, + "learning_rate": 1.3246277884358331e-05, + "loss": 2.1563, + "step": 36422 + }, + { + "epoch": 0.4553113827845696, + "grad_norm": 4.077829837799072, + "learning_rate": 1.3245452446347887e-05, + "loss": 1.7622, + "step": 36424 + }, + { + "epoch": 0.45533638340958527, + "grad_norm": 2.588391065597534, + "learning_rate": 1.3244626983620535e-05, + "loss": 0.2305, + "step": 36426 + }, + { + "epoch": 0.45536138403460086, + "grad_norm": 4.061802864074707, + "learning_rate": 1.3243801496182564e-05, + "loss": 0.4545, + "step": 36428 + }, + { + "epoch": 0.4553863846596165, + "grad_norm": 4.318996906280518, + "learning_rate": 1.3242975984040255e-05, + "loss": 0.5064, + "step": 36430 + }, + { + "epoch": 0.4554113852846321, + "grad_norm": 2.0222129821777344, + "learning_rate": 1.3242150447199904e-05, + "loss": 0.8667, + "step": 36432 + }, + { + "epoch": 0.45543638590964775, + "grad_norm": 1.1121474504470825, + "learning_rate": 1.3241324885667794e-05, + "loss": 0.3406, + "step": 36434 + }, + { + "epoch": 0.4554613865346634, + "grad_norm": 4.193066120147705, + "learning_rate": 1.3240499299450211e-05, + "loss": 1.1411, + "step": 36436 + }, + { + "epoch": 0.455486387159679, + "grad_norm": 2.9407639503479004, + "learning_rate": 1.3239673688553443e-05, + "loss": 1.6926, + "step": 36438 + }, + { + "epoch": 0.45551138778469463, + "grad_norm": 0.00961809791624546, + "learning_rate": 1.323884805298378e-05, + "loss": 0.0089, + "step": 36440 + }, + { + "epoch": 0.4555363884097102, + "grad_norm": 3.2697994709014893, + "learning_rate": 1.3238022392747503e-05, + "loss": 0.7623, + "step": 36442 + }, + { + "epoch": 0.45556138903472587, + "grad_norm": 2.5691895484924316, + "learning_rate": 1.3237196707850907e-05, + "loss": 1.1373, + "step": 36444 + }, + { + "epoch": 0.4555863896597415, + "grad_norm": 1.870824933052063, + "learning_rate": 1.3236370998300283e-05, + "loss": 0.8947, + "step": 36446 + }, + { + "epoch": 0.4556113902847571, + "grad_norm": 0.07651903480291367, + "learning_rate": 1.3235545264101914e-05, + "loss": 0.0007, + "step": 36448 + }, + { + "epoch": 0.45563639090977276, + "grad_norm": 5.631601333618164, + "learning_rate": 1.3234719505262088e-05, + "loss": 1.2993, + "step": 36450 + }, + { + "epoch": 0.45566139153478835, + "grad_norm": 3.5566272735595703, + "learning_rate": 1.3233893721787096e-05, + "loss": 0.9631, + "step": 36452 + }, + { + "epoch": 0.455686392159804, + "grad_norm": 0.7561838626861572, + "learning_rate": 1.3233067913683226e-05, + "loss": 0.6799, + "step": 36454 + }, + { + "epoch": 0.45571139278481965, + "grad_norm": 2.6612629890441895, + "learning_rate": 1.323224208095677e-05, + "loss": 0.9243, + "step": 36456 + }, + { + "epoch": 0.45573639340983524, + "grad_norm": 2.2002742290496826, + "learning_rate": 1.3231416223614013e-05, + "loss": 1.1277, + "step": 36458 + }, + { + "epoch": 0.4557613940348509, + "grad_norm": 0.000870252784807235, + "learning_rate": 1.3230590341661252e-05, + "loss": 0.0001, + "step": 36460 + }, + { + "epoch": 0.4557863946598665, + "grad_norm": 3.2896668910980225, + "learning_rate": 1.3229764435104767e-05, + "loss": 0.6728, + "step": 36462 + }, + { + "epoch": 0.4558113952848821, + "grad_norm": 3.598958730697632, + "learning_rate": 1.3228938503950853e-05, + "loss": 1.527, + "step": 36464 + }, + { + "epoch": 0.4558363959098978, + "grad_norm": 1.3777735233306885, + "learning_rate": 1.3228112548205801e-05, + "loss": 0.2321, + "step": 36466 + }, + { + "epoch": 0.45586139653491337, + "grad_norm": 2.145095109939575, + "learning_rate": 1.3227286567875899e-05, + "loss": 0.9892, + "step": 36468 + }, + { + "epoch": 0.455886397159929, + "grad_norm": 4.762001991271973, + "learning_rate": 1.322646056296744e-05, + "loss": 1.5148, + "step": 36470 + }, + { + "epoch": 0.4559113977849446, + "grad_norm": 2.9976682662963867, + "learning_rate": 1.3225634533486713e-05, + "loss": 0.9352, + "step": 36472 + }, + { + "epoch": 0.45593639840996025, + "grad_norm": 0.001554165268316865, + "learning_rate": 1.3224808479440014e-05, + "loss": 0.9915, + "step": 36474 + }, + { + "epoch": 0.4559613990349759, + "grad_norm": 3.0543904304504395, + "learning_rate": 1.3223982400833624e-05, + "loss": 0.9426, + "step": 36476 + }, + { + "epoch": 0.4559863996599915, + "grad_norm": 0.8548787236213684, + "learning_rate": 1.3223156297673843e-05, + "loss": 0.6009, + "step": 36478 + }, + { + "epoch": 0.45601140028500714, + "grad_norm": 2.2931344509124756, + "learning_rate": 1.3222330169966957e-05, + "loss": 0.7233, + "step": 36480 + }, + { + "epoch": 0.45603640091002273, + "grad_norm": 5.0677571296691895, + "learning_rate": 1.3221504017719261e-05, + "loss": 1.7889, + "step": 36482 + }, + { + "epoch": 0.4560614015350384, + "grad_norm": 9.771156311035156, + "learning_rate": 1.3220677840937047e-05, + "loss": 0.9181, + "step": 36484 + }, + { + "epoch": 0.456086402160054, + "grad_norm": 3.2973456382751465, + "learning_rate": 1.3219851639626606e-05, + "loss": 0.7189, + "step": 36486 + }, + { + "epoch": 0.4561114027850696, + "grad_norm": 0.025285914540290833, + "learning_rate": 1.3219025413794231e-05, + "loss": 0.5015, + "step": 36488 + }, + { + "epoch": 0.45613640341008527, + "grad_norm": 3.771080493927002, + "learning_rate": 1.3218199163446212e-05, + "loss": 1.0524, + "step": 36490 + }, + { + "epoch": 0.45616140403510086, + "grad_norm": 0.0011234512785449624, + "learning_rate": 1.3217372888588843e-05, + "loss": 0.0004, + "step": 36492 + }, + { + "epoch": 0.4561864046601165, + "grad_norm": 1.7103019952774048, + "learning_rate": 1.3216546589228418e-05, + "loss": 0.1292, + "step": 36494 + }, + { + "epoch": 0.45621140528513215, + "grad_norm": 4.02601957321167, + "learning_rate": 1.3215720265371229e-05, + "loss": 1.0585, + "step": 36496 + }, + { + "epoch": 0.45623640591014775, + "grad_norm": 3.0342116355895996, + "learning_rate": 1.3214893917023566e-05, + "loss": 0.9673, + "step": 36498 + }, + { + "epoch": 0.4562614065351634, + "grad_norm": 0.000791654281783849, + "learning_rate": 1.321406754419173e-05, + "loss": 0.9205, + "step": 36500 + }, + { + "epoch": 0.456286407160179, + "grad_norm": 1.1082711219787598, + "learning_rate": 1.321324114688201e-05, + "loss": 0.0391, + "step": 36502 + }, + { + "epoch": 0.45631140778519463, + "grad_norm": 2.291837215423584, + "learning_rate": 1.3212414725100697e-05, + "loss": 0.7392, + "step": 36504 + }, + { + "epoch": 0.4563364084102103, + "grad_norm": 1.426201343536377, + "learning_rate": 1.3211588278854088e-05, + "loss": 0.2203, + "step": 36506 + }, + { + "epoch": 0.4563614090352259, + "grad_norm": 3.4957146644592285, + "learning_rate": 1.3210761808148477e-05, + "loss": 1.6751, + "step": 36508 + }, + { + "epoch": 0.4563864096602415, + "grad_norm": 0.006131904665380716, + "learning_rate": 1.3209935312990161e-05, + "loss": 0.3481, + "step": 36510 + }, + { + "epoch": 0.4564114102852571, + "grad_norm": 3.811596632003784, + "learning_rate": 1.320910879338543e-05, + "loss": 0.6832, + "step": 36512 + }, + { + "epoch": 0.45643641091027276, + "grad_norm": 5.024469375610352, + "learning_rate": 1.320828224934058e-05, + "loss": 1.2494, + "step": 36514 + }, + { + "epoch": 0.4564614115352884, + "grad_norm": 2.7100062370300293, + "learning_rate": 1.3207455680861907e-05, + "loss": 0.7032, + "step": 36516 + }, + { + "epoch": 0.456486412160304, + "grad_norm": 0.007966111414134502, + "learning_rate": 1.3206629087955702e-05, + "loss": 0.8017, + "step": 36518 + }, + { + "epoch": 0.45651141278531965, + "grad_norm": 0.10102719068527222, + "learning_rate": 1.3205802470628267e-05, + "loss": 0.7639, + "step": 36520 + }, + { + "epoch": 0.45653641341033524, + "grad_norm": 4.399290084838867, + "learning_rate": 1.3204975828885891e-05, + "loss": 0.175, + "step": 36522 + }, + { + "epoch": 0.4565614140353509, + "grad_norm": 2.808537006378174, + "learning_rate": 1.3204149162734874e-05, + "loss": 0.4881, + "step": 36524 + }, + { + "epoch": 0.45658641466036654, + "grad_norm": 4.218562126159668, + "learning_rate": 1.320332247218151e-05, + "loss": 1.8834, + "step": 36526 + }, + { + "epoch": 0.4566114152853821, + "grad_norm": 6.530623912811279, + "learning_rate": 1.3202495757232094e-05, + "loss": 0.6748, + "step": 36528 + }, + { + "epoch": 0.4566364159103978, + "grad_norm": 3.6023590564727783, + "learning_rate": 1.3201669017892923e-05, + "loss": 0.9604, + "step": 36530 + }, + { + "epoch": 0.45666141653541337, + "grad_norm": 0.7868509292602539, + "learning_rate": 1.3200842254170293e-05, + "loss": 0.0432, + "step": 36532 + }, + { + "epoch": 0.456686417160429, + "grad_norm": 0.7206645607948303, + "learning_rate": 1.3200015466070504e-05, + "loss": 0.5204, + "step": 36534 + }, + { + "epoch": 0.45671141778544466, + "grad_norm": 2.8713250160217285, + "learning_rate": 1.3199188653599846e-05, + "loss": 0.4119, + "step": 36536 + }, + { + "epoch": 0.45673641841046025, + "grad_norm": 4.538843154907227, + "learning_rate": 1.3198361816764624e-05, + "loss": 1.3579, + "step": 36538 + }, + { + "epoch": 0.4567614190354759, + "grad_norm": 4.130945682525635, + "learning_rate": 1.3197534955571127e-05, + "loss": 0.3622, + "step": 36540 + }, + { + "epoch": 0.4567864196604915, + "grad_norm": 5.344725131988525, + "learning_rate": 1.319670807002566e-05, + "loss": 1.5887, + "step": 36542 + }, + { + "epoch": 0.45681142028550714, + "grad_norm": 4.390019416809082, + "learning_rate": 1.319588116013451e-05, + "loss": 1.0213, + "step": 36544 + }, + { + "epoch": 0.4568364209105228, + "grad_norm": 14.707324981689453, + "learning_rate": 1.3195054225903988e-05, + "loss": 1.068, + "step": 36546 + }, + { + "epoch": 0.4568614215355384, + "grad_norm": 4.316130638122559, + "learning_rate": 1.3194227267340378e-05, + "loss": 0.8686, + "step": 36548 + }, + { + "epoch": 0.45688642216055403, + "grad_norm": 2.8627500534057617, + "learning_rate": 1.319340028444999e-05, + "loss": 0.9371, + "step": 36550 + }, + { + "epoch": 0.4569114227855696, + "grad_norm": 2.828038454055786, + "learning_rate": 1.3192573277239116e-05, + "loss": 1.1463, + "step": 36552 + }, + { + "epoch": 0.45693642341058527, + "grad_norm": 2.092461585998535, + "learning_rate": 1.3191746245714055e-05, + "loss": 0.3659, + "step": 36554 + }, + { + "epoch": 0.4569614240356009, + "grad_norm": 4.560265064239502, + "learning_rate": 1.3190919189881103e-05, + "loss": 0.8531, + "step": 36556 + }, + { + "epoch": 0.4569864246606165, + "grad_norm": 3.218012571334839, + "learning_rate": 1.3190092109746567e-05, + "loss": 0.9708, + "step": 36558 + }, + { + "epoch": 0.45701142528563216, + "grad_norm": 3.0867199897766113, + "learning_rate": 1.3189265005316738e-05, + "loss": 1.3676, + "step": 36560 + }, + { + "epoch": 0.45703642591064775, + "grad_norm": 0.0005460705724544823, + "learning_rate": 1.3188437876597918e-05, + "loss": 0.0002, + "step": 36562 + }, + { + "epoch": 0.4570614265356634, + "grad_norm": 4.960878849029541, + "learning_rate": 1.3187610723596404e-05, + "loss": 0.8185, + "step": 36564 + }, + { + "epoch": 0.45708642716067904, + "grad_norm": 4.7342939376831055, + "learning_rate": 1.31867835463185e-05, + "loss": 1.7099, + "step": 36566 + }, + { + "epoch": 0.45711142778569464, + "grad_norm": 4.334840774536133, + "learning_rate": 1.3185956344770502e-05, + "loss": 0.8041, + "step": 36568 + }, + { + "epoch": 0.4571364284107103, + "grad_norm": 5.491659641265869, + "learning_rate": 1.3185129118958711e-05, + "loss": 0.2067, + "step": 36570 + }, + { + "epoch": 0.4571614290357259, + "grad_norm": 1.4638582468032837, + "learning_rate": 1.318430186888943e-05, + "loss": 0.8303, + "step": 36572 + }, + { + "epoch": 0.4571864296607415, + "grad_norm": 2.7274489402770996, + "learning_rate": 1.318347459456895e-05, + "loss": 0.8581, + "step": 36574 + }, + { + "epoch": 0.45721143028575717, + "grad_norm": 2.4771835803985596, + "learning_rate": 1.3182647296003582e-05, + "loss": 0.5385, + "step": 36576 + }, + { + "epoch": 0.45723643091077276, + "grad_norm": 4.955549240112305, + "learning_rate": 1.3181819973199623e-05, + "loss": 0.5077, + "step": 36578 + }, + { + "epoch": 0.4572614315357884, + "grad_norm": 4.4720234870910645, + "learning_rate": 1.3180992626163373e-05, + "loss": 1.6569, + "step": 36580 + }, + { + "epoch": 0.457286432160804, + "grad_norm": 5.958954334259033, + "learning_rate": 1.3180165254901129e-05, + "loss": 1.1851, + "step": 36582 + }, + { + "epoch": 0.45731143278581965, + "grad_norm": 2.7009949684143066, + "learning_rate": 1.31793378594192e-05, + "loss": 0.2814, + "step": 36584 + }, + { + "epoch": 0.4573364334108353, + "grad_norm": 4.82789945602417, + "learning_rate": 1.3178510439723882e-05, + "loss": 1.8897, + "step": 36586 + }, + { + "epoch": 0.4573614340358509, + "grad_norm": 4.912539005279541, + "learning_rate": 1.3177682995821477e-05, + "loss": 1.3721, + "step": 36588 + }, + { + "epoch": 0.45738643466086654, + "grad_norm": 3.2223012447357178, + "learning_rate": 1.3176855527718293e-05, + "loss": 0.3483, + "step": 36590 + }, + { + "epoch": 0.45741143528588213, + "grad_norm": 9.091645240783691, + "learning_rate": 1.3176028035420621e-05, + "loss": 0.831, + "step": 36592 + }, + { + "epoch": 0.4574364359108978, + "grad_norm": 2.54136323928833, + "learning_rate": 1.3175200518934768e-05, + "loss": 0.6869, + "step": 36594 + }, + { + "epoch": 0.4574614365359134, + "grad_norm": 4.339852333068848, + "learning_rate": 1.3174372978267039e-05, + "loss": 1.0354, + "step": 36596 + }, + { + "epoch": 0.457486437160929, + "grad_norm": 1.7221084833145142, + "learning_rate": 1.3173545413423735e-05, + "loss": 0.7558, + "step": 36598 + }, + { + "epoch": 0.45751143778594466, + "grad_norm": 4.752964019775391, + "learning_rate": 1.3172717824411157e-05, + "loss": 1.4567, + "step": 36600 + }, + { + "epoch": 0.45753643841096026, + "grad_norm": 3.7099246978759766, + "learning_rate": 1.317189021123561e-05, + "loss": 0.8134, + "step": 36602 + }, + { + "epoch": 0.4575614390359759, + "grad_norm": 0.6761020421981812, + "learning_rate": 1.3171062573903395e-05, + "loss": 0.0544, + "step": 36604 + }, + { + "epoch": 0.45758643966099155, + "grad_norm": 2.3934993743896484, + "learning_rate": 1.3170234912420814e-05, + "loss": 0.9349, + "step": 36606 + }, + { + "epoch": 0.45761144028600714, + "grad_norm": 3.7852389812469482, + "learning_rate": 1.3169407226794173e-05, + "loss": 1.9125, + "step": 36608 + }, + { + "epoch": 0.4576364409110228, + "grad_norm": 2.8493189811706543, + "learning_rate": 1.3168579517029775e-05, + "loss": 0.9419, + "step": 36610 + }, + { + "epoch": 0.4576614415360384, + "grad_norm": 2.675847053527832, + "learning_rate": 1.3167751783133924e-05, + "loss": 1.4074, + "step": 36612 + }, + { + "epoch": 0.45768644216105403, + "grad_norm": 5.214183330535889, + "learning_rate": 1.3166924025112924e-05, + "loss": 1.6099, + "step": 36614 + }, + { + "epoch": 0.4577114427860697, + "grad_norm": 8.639286994934082, + "learning_rate": 1.3166096242973078e-05, + "loss": 0.5928, + "step": 36616 + }, + { + "epoch": 0.45773644341108527, + "grad_norm": 6.468244552612305, + "learning_rate": 1.3165268436720692e-05, + "loss": 0.2347, + "step": 36618 + }, + { + "epoch": 0.4577614440361009, + "grad_norm": 4.757630348205566, + "learning_rate": 1.3164440606362066e-05, + "loss": 0.9977, + "step": 36620 + }, + { + "epoch": 0.4577864446611165, + "grad_norm": 4.93613862991333, + "learning_rate": 1.3163612751903507e-05, + "loss": 1.216, + "step": 36622 + }, + { + "epoch": 0.45781144528613216, + "grad_norm": 2.13489031791687, + "learning_rate": 1.3162784873351326e-05, + "loss": 1.2335, + "step": 36624 + }, + { + "epoch": 0.4578364459111478, + "grad_norm": 9.050084114074707, + "learning_rate": 1.316195697071182e-05, + "loss": 1.2759, + "step": 36626 + }, + { + "epoch": 0.4578614465361634, + "grad_norm": 4.155766487121582, + "learning_rate": 1.3161129043991296e-05, + "loss": 1.8344, + "step": 36628 + }, + { + "epoch": 0.45788644716117904, + "grad_norm": 3.1979899406433105, + "learning_rate": 1.3160301093196063e-05, + "loss": 0.9455, + "step": 36630 + }, + { + "epoch": 0.45791144778619464, + "grad_norm": 1.086629033088684, + "learning_rate": 1.315947311833242e-05, + "loss": 0.2372, + "step": 36632 + }, + { + "epoch": 0.4579364484112103, + "grad_norm": 1.887086033821106, + "learning_rate": 1.3158645119406676e-05, + "loss": 0.9613, + "step": 36634 + }, + { + "epoch": 0.45796144903622593, + "grad_norm": 4.0011067390441895, + "learning_rate": 1.3157817096425143e-05, + "loss": 1.3421, + "step": 36636 + }, + { + "epoch": 0.4579864496612415, + "grad_norm": 1.4969456195831299, + "learning_rate": 1.3156989049394118e-05, + "loss": 0.9, + "step": 36638 + }, + { + "epoch": 0.45801145028625717, + "grad_norm": 3.6921322345733643, + "learning_rate": 1.315616097831991e-05, + "loss": 1.7965, + "step": 36640 + }, + { + "epoch": 0.45803645091127276, + "grad_norm": 2.3406789302825928, + "learning_rate": 1.3155332883208828e-05, + "loss": 0.2501, + "step": 36642 + }, + { + "epoch": 0.4580614515362884, + "grad_norm": 5.228623390197754, + "learning_rate": 1.3154504764067175e-05, + "loss": 0.6812, + "step": 36644 + }, + { + "epoch": 0.45808645216130406, + "grad_norm": 3.1861467361450195, + "learning_rate": 1.3153676620901261e-05, + "loss": 0.6985, + "step": 36646 + }, + { + "epoch": 0.45811145278631965, + "grad_norm": 0.0011611253721639514, + "learning_rate": 1.3152848453717391e-05, + "loss": 0.4112, + "step": 36648 + }, + { + "epoch": 0.4581364534113353, + "grad_norm": 3.2280664443969727, + "learning_rate": 1.3152020262521877e-05, + "loss": 1.5796, + "step": 36650 + }, + { + "epoch": 0.4581614540363509, + "grad_norm": 4.126480579376221, + "learning_rate": 1.3151192047321017e-05, + "loss": 1.9553, + "step": 36652 + }, + { + "epoch": 0.45818645466136654, + "grad_norm": 3.0923452377319336, + "learning_rate": 1.3150363808121127e-05, + "loss": 0.5547, + "step": 36654 + }, + { + "epoch": 0.4582114552863822, + "grad_norm": 5.154902935028076, + "learning_rate": 1.3149535544928512e-05, + "loss": 1.5731, + "step": 36656 + }, + { + "epoch": 0.4582364559113978, + "grad_norm": 4.231876373291016, + "learning_rate": 1.3148707257749477e-05, + "loss": 0.5492, + "step": 36658 + }, + { + "epoch": 0.4582614565364134, + "grad_norm": 3.797529697418213, + "learning_rate": 1.3147878946590335e-05, + "loss": 1.1288, + "step": 36660 + }, + { + "epoch": 0.458286457161429, + "grad_norm": 0.0013699865667149425, + "learning_rate": 1.3147050611457392e-05, + "loss": 0.9072, + "step": 36662 + }, + { + "epoch": 0.45831145778644466, + "grad_norm": 0.015589159913361073, + "learning_rate": 1.3146222252356958e-05, + "loss": 0.2417, + "step": 36664 + }, + { + "epoch": 0.4583364584114603, + "grad_norm": 4.926712989807129, + "learning_rate": 1.3145393869295337e-05, + "loss": 0.7613, + "step": 36666 + }, + { + "epoch": 0.4583614590364759, + "grad_norm": 3.336059093475342, + "learning_rate": 1.3144565462278841e-05, + "loss": 0.4354, + "step": 36668 + }, + { + "epoch": 0.45838645966149155, + "grad_norm": 0.0009903920581564307, + "learning_rate": 1.3143737031313782e-05, + "loss": 0.1734, + "step": 36670 + }, + { + "epoch": 0.45841146028650714, + "grad_norm": 2.3115346431732178, + "learning_rate": 1.3142908576406464e-05, + "loss": 1.0265, + "step": 36672 + }, + { + "epoch": 0.4584364609115228, + "grad_norm": 5.6057353019714355, + "learning_rate": 1.31420800975632e-05, + "loss": 1.6279, + "step": 36674 + }, + { + "epoch": 0.45846146153653844, + "grad_norm": 2.570333480834961, + "learning_rate": 1.3141251594790301e-05, + "loss": 0.5675, + "step": 36676 + }, + { + "epoch": 0.45848646216155403, + "grad_norm": 3.534478187561035, + "learning_rate": 1.314042306809407e-05, + "loss": 0.4996, + "step": 36678 + }, + { + "epoch": 0.4585114627865697, + "grad_norm": 0.03295513615012169, + "learning_rate": 1.3139594517480822e-05, + "loss": 0.0005, + "step": 36680 + }, + { + "epoch": 0.45853646341158527, + "grad_norm": 0.9470028281211853, + "learning_rate": 1.3138765942956867e-05, + "loss": 0.7147, + "step": 36682 + }, + { + "epoch": 0.4585614640366009, + "grad_norm": 0.002947063883766532, + "learning_rate": 1.3137937344528511e-05, + "loss": 0.5007, + "step": 36684 + }, + { + "epoch": 0.45858646466161657, + "grad_norm": 6.043843746185303, + "learning_rate": 1.3137108722202072e-05, + "loss": 1.1385, + "step": 36686 + }, + { + "epoch": 0.45861146528663216, + "grad_norm": 8.294949531555176, + "learning_rate": 1.3136280075983856e-05, + "loss": 1.8473, + "step": 36688 + }, + { + "epoch": 0.4586364659116478, + "grad_norm": 2.2568886280059814, + "learning_rate": 1.3135451405880173e-05, + "loss": 0.1124, + "step": 36690 + }, + { + "epoch": 0.4586614665366634, + "grad_norm": 10.599921226501465, + "learning_rate": 1.3134622711897338e-05, + "loss": 1.6091, + "step": 36692 + }, + { + "epoch": 0.45868646716167905, + "grad_norm": 1.7191529273986816, + "learning_rate": 1.3133793994041657e-05, + "loss": 2.1242, + "step": 36694 + }, + { + "epoch": 0.4587114677866947, + "grad_norm": 3.104987382888794, + "learning_rate": 1.3132965252319445e-05, + "loss": 0.7855, + "step": 36696 + }, + { + "epoch": 0.4587364684117103, + "grad_norm": 2.7121543884277344, + "learning_rate": 1.313213648673701e-05, + "loss": 0.6086, + "step": 36698 + }, + { + "epoch": 0.45876146903672593, + "grad_norm": 0.000748376885894686, + "learning_rate": 1.313130769730067e-05, + "loss": 1.9268, + "step": 36700 + }, + { + "epoch": 0.4587864696617415, + "grad_norm": 3.1462528705596924, + "learning_rate": 1.3130478884016732e-05, + "loss": 0.6688, + "step": 36702 + }, + { + "epoch": 0.4588114702867572, + "grad_norm": 2.818761110305786, + "learning_rate": 1.312965004689151e-05, + "loss": 1.6952, + "step": 36704 + }, + { + "epoch": 0.4588364709117728, + "grad_norm": 2.2281687259674072, + "learning_rate": 1.3128821185931317e-05, + "loss": 0.5562, + "step": 36706 + }, + { + "epoch": 0.4588614715367884, + "grad_norm": 4.228421688079834, + "learning_rate": 1.3127992301142463e-05, + "loss": 0.5809, + "step": 36708 + }, + { + "epoch": 0.45888647216180406, + "grad_norm": 0.40917059779167175, + "learning_rate": 1.3127163392531262e-05, + "loss": 0.5849, + "step": 36710 + }, + { + "epoch": 0.45891147278681965, + "grad_norm": 3.8308167457580566, + "learning_rate": 1.3126334460104026e-05, + "loss": 0.6418, + "step": 36712 + }, + { + "epoch": 0.4589364734118353, + "grad_norm": 4.793402194976807, + "learning_rate": 1.3125505503867073e-05, + "loss": 1.8376, + "step": 36714 + }, + { + "epoch": 0.45896147403685095, + "grad_norm": 5.221897602081299, + "learning_rate": 1.3124676523826707e-05, + "loss": 1.0418, + "step": 36716 + }, + { + "epoch": 0.45898647466186654, + "grad_norm": 4.856689929962158, + "learning_rate": 1.3123847519989249e-05, + "loss": 0.9365, + "step": 36718 + }, + { + "epoch": 0.4590114752868822, + "grad_norm": 0.0009393022628501058, + "learning_rate": 1.3123018492361008e-05, + "loss": 0.1984, + "step": 36720 + }, + { + "epoch": 0.4590364759118978, + "grad_norm": 1.8200269937515259, + "learning_rate": 1.3122189440948301e-05, + "loss": 1.1135, + "step": 36722 + }, + { + "epoch": 0.4590614765369134, + "grad_norm": 2.669717311859131, + "learning_rate": 1.3121360365757441e-05, + "loss": 1.1571, + "step": 36724 + }, + { + "epoch": 0.4590864771619291, + "grad_norm": 5.232283115386963, + "learning_rate": 1.3120531266794742e-05, + "loss": 1.1166, + "step": 36726 + }, + { + "epoch": 0.45911147778694467, + "grad_norm": 2.151113748550415, + "learning_rate": 1.311970214406652e-05, + "loss": 0.0766, + "step": 36728 + }, + { + "epoch": 0.4591364784119603, + "grad_norm": 4.763543128967285, + "learning_rate": 1.3118872997579083e-05, + "loss": 0.9764, + "step": 36730 + }, + { + "epoch": 0.4591614790369759, + "grad_norm": 2.520886182785034, + "learning_rate": 1.3118043827338753e-05, + "loss": 1.5002, + "step": 36732 + }, + { + "epoch": 0.45918647966199155, + "grad_norm": 3.2235312461853027, + "learning_rate": 1.3117214633351843e-05, + "loss": 2.642, + "step": 36734 + }, + { + "epoch": 0.4592114802870072, + "grad_norm": 3.8392109870910645, + "learning_rate": 1.3116385415624664e-05, + "loss": 0.889, + "step": 36736 + }, + { + "epoch": 0.4592364809120228, + "grad_norm": 0.0022618744987994432, + "learning_rate": 1.3115556174163537e-05, + "loss": 0.7545, + "step": 36738 + }, + { + "epoch": 0.45926148153703844, + "grad_norm": 9.14130973815918, + "learning_rate": 1.3114726908974777e-05, + "loss": 1.7339, + "step": 36740 + }, + { + "epoch": 0.45928648216205403, + "grad_norm": 2.870199203491211, + "learning_rate": 1.3113897620064692e-05, + "loss": 0.8182, + "step": 36742 + }, + { + "epoch": 0.4593114827870697, + "grad_norm": 1.844636082649231, + "learning_rate": 1.3113068307439608e-05, + "loss": 0.8867, + "step": 36744 + }, + { + "epoch": 0.45933648341208533, + "grad_norm": 6.180295944213867, + "learning_rate": 1.3112238971105832e-05, + "loss": 1.4697, + "step": 36746 + }, + { + "epoch": 0.4593614840371009, + "grad_norm": 4.79237699508667, + "learning_rate": 1.3111409611069686e-05, + "loss": 1.3797, + "step": 36748 + }, + { + "epoch": 0.45938648466211657, + "grad_norm": 6.001489639282227, + "learning_rate": 1.3110580227337483e-05, + "loss": 1.3104, + "step": 36750 + }, + { + "epoch": 0.45941148528713216, + "grad_norm": 4.20797061920166, + "learning_rate": 1.3109750819915542e-05, + "loss": 0.9923, + "step": 36752 + }, + { + "epoch": 0.4594364859121478, + "grad_norm": 0.6129425168037415, + "learning_rate": 1.3108921388810181e-05, + "loss": 0.2209, + "step": 36754 + }, + { + "epoch": 0.45946148653716345, + "grad_norm": 4.710355281829834, + "learning_rate": 1.3108091934027711e-05, + "loss": 1.4918, + "step": 36756 + }, + { + "epoch": 0.45948648716217905, + "grad_norm": 4.091335296630859, + "learning_rate": 1.3107262455574455e-05, + "loss": 0.8518, + "step": 36758 + }, + { + "epoch": 0.4595114877871947, + "grad_norm": 0.2294197827577591, + "learning_rate": 1.3106432953456723e-05, + "loss": 0.9688, + "step": 36760 + }, + { + "epoch": 0.4595364884122103, + "grad_norm": 2.9365787506103516, + "learning_rate": 1.310560342768084e-05, + "loss": 1.4008, + "step": 36762 + }, + { + "epoch": 0.45956148903722593, + "grad_norm": 2.229743480682373, + "learning_rate": 1.3104773878253121e-05, + "loss": 0.7332, + "step": 36764 + }, + { + "epoch": 0.4595864896622416, + "grad_norm": 2.717649459838867, + "learning_rate": 1.3103944305179883e-05, + "loss": 1.2644, + "step": 36766 + }, + { + "epoch": 0.4596114902872572, + "grad_norm": 3.608733654022217, + "learning_rate": 1.3103114708467444e-05, + "loss": 1.2964, + "step": 36768 + }, + { + "epoch": 0.4596364909122728, + "grad_norm": 12.597665786743164, + "learning_rate": 1.310228508812212e-05, + "loss": 2.359, + "step": 36770 + }, + { + "epoch": 0.4596614915372884, + "grad_norm": 4.049016952514648, + "learning_rate": 1.310145544415023e-05, + "loss": 1.3347, + "step": 36772 + }, + { + "epoch": 0.45968649216230406, + "grad_norm": 1.8064106702804565, + "learning_rate": 1.31006257765581e-05, + "loss": 1.2848, + "step": 36774 + }, + { + "epoch": 0.4597114927873197, + "grad_norm": 2.250166893005371, + "learning_rate": 1.309979608535204e-05, + "loss": 1.4667, + "step": 36776 + }, + { + "epoch": 0.4597364934123353, + "grad_norm": 3.362356424331665, + "learning_rate": 1.309896637053837e-05, + "loss": 1.3706, + "step": 36778 + }, + { + "epoch": 0.45976149403735095, + "grad_norm": 4.875405788421631, + "learning_rate": 1.3098136632123413e-05, + "loss": 0.7759, + "step": 36780 + }, + { + "epoch": 0.45978649466236654, + "grad_norm": 2.561600685119629, + "learning_rate": 1.3097306870113484e-05, + "loss": 1.8002, + "step": 36782 + }, + { + "epoch": 0.4598114952873822, + "grad_norm": 2.67846417427063, + "learning_rate": 1.30964770845149e-05, + "loss": 0.9334, + "step": 36784 + }, + { + "epoch": 0.45983649591239784, + "grad_norm": 1.712883710861206, + "learning_rate": 1.3095647275333991e-05, + "loss": 0.3472, + "step": 36786 + }, + { + "epoch": 0.45986149653741343, + "grad_norm": 3.764516830444336, + "learning_rate": 1.3094817442577067e-05, + "loss": 0.8079, + "step": 36788 + }, + { + "epoch": 0.4598864971624291, + "grad_norm": 6.55782413482666, + "learning_rate": 1.3093987586250449e-05, + "loss": 1.1778, + "step": 36790 + }, + { + "epoch": 0.45991149778744467, + "grad_norm": 3.059966564178467, + "learning_rate": 1.3093157706360462e-05, + "loss": 1.3557, + "step": 36792 + }, + { + "epoch": 0.4599364984124603, + "grad_norm": 3.9032840728759766, + "learning_rate": 1.3092327802913424e-05, + "loss": 2.6896, + "step": 36794 + }, + { + "epoch": 0.45996149903747596, + "grad_norm": 6.755425930023193, + "learning_rate": 1.3091497875915653e-05, + "loss": 1.9835, + "step": 36796 + }, + { + "epoch": 0.45998649966249155, + "grad_norm": 0.0008277114247903228, + "learning_rate": 1.309066792537347e-05, + "loss": 0.4395, + "step": 36798 + }, + { + "epoch": 0.4600115002875072, + "grad_norm": 3.2706708908081055, + "learning_rate": 1.30898379512932e-05, + "loss": 0.4737, + "step": 36800 + }, + { + "epoch": 0.4600365009125228, + "grad_norm": 4.606137275695801, + "learning_rate": 1.3089007953681162e-05, + "loss": 1.1984, + "step": 36802 + }, + { + "epoch": 0.46006150153753844, + "grad_norm": 5.947535514831543, + "learning_rate": 1.3088177932543673e-05, + "loss": 1.6124, + "step": 36804 + }, + { + "epoch": 0.4600865021625541, + "grad_norm": 3.0025713443756104, + "learning_rate": 1.308734788788706e-05, + "loss": 0.9255, + "step": 36806 + }, + { + "epoch": 0.4601115027875697, + "grad_norm": 0.24139933288097382, + "learning_rate": 1.308651781971764e-05, + "loss": 0.2515, + "step": 36808 + }, + { + "epoch": 0.46013650341258533, + "grad_norm": 2.396235466003418, + "learning_rate": 1.3085687728041735e-05, + "loss": 0.5916, + "step": 36810 + }, + { + "epoch": 0.4601615040376009, + "grad_norm": 3.1314210891723633, + "learning_rate": 1.3084857612865674e-05, + "loss": 0.9499, + "step": 36812 + }, + { + "epoch": 0.46018650466261657, + "grad_norm": 2.896921157836914, + "learning_rate": 1.308402747419577e-05, + "loss": 1.1691, + "step": 36814 + }, + { + "epoch": 0.4602115052876322, + "grad_norm": 5.0998616218566895, + "learning_rate": 1.308319731203835e-05, + "loss": 2.2508, + "step": 36816 + }, + { + "epoch": 0.4602365059126478, + "grad_norm": 1.0066144466400146, + "learning_rate": 1.3082367126399735e-05, + "loss": 0.4963, + "step": 36818 + }, + { + "epoch": 0.46026150653766346, + "grad_norm": 3.196910858154297, + "learning_rate": 1.308153691728625e-05, + "loss": 1.1295, + "step": 36820 + }, + { + "epoch": 0.46028650716267905, + "grad_norm": 0.000860324245877564, + "learning_rate": 1.3080706684704212e-05, + "loss": 0.0325, + "step": 36822 + }, + { + "epoch": 0.4603115077876947, + "grad_norm": 2.818040370941162, + "learning_rate": 1.3079876428659948e-05, + "loss": 0.6649, + "step": 36824 + }, + { + "epoch": 0.46033650841271034, + "grad_norm": 5.579938888549805, + "learning_rate": 1.3079046149159784e-05, + "loss": 0.6994, + "step": 36826 + }, + { + "epoch": 0.46036150903772594, + "grad_norm": 2.6642889976501465, + "learning_rate": 1.3078215846210037e-05, + "loss": 1.2851, + "step": 36828 + }, + { + "epoch": 0.4603865096627416, + "grad_norm": 0.03231192007660866, + "learning_rate": 1.3077385519817034e-05, + "loss": 0.0508, + "step": 36830 + }, + { + "epoch": 0.4604115102877572, + "grad_norm": 1.9655224084854126, + "learning_rate": 1.30765551699871e-05, + "loss": 0.5831, + "step": 36832 + }, + { + "epoch": 0.4604365109127728, + "grad_norm": 4.303737640380859, + "learning_rate": 1.3075724796726552e-05, + "loss": 0.8326, + "step": 36834 + }, + { + "epoch": 0.46046151153778847, + "grad_norm": 2.4701313972473145, + "learning_rate": 1.3074894400041722e-05, + "loss": 0.9294, + "step": 36836 + }, + { + "epoch": 0.46048651216280406, + "grad_norm": 2.7546842098236084, + "learning_rate": 1.3074063979938933e-05, + "loss": 1.1302, + "step": 36838 + }, + { + "epoch": 0.4605115127878197, + "grad_norm": 3.1836750507354736, + "learning_rate": 1.3073233536424506e-05, + "loss": 0.5629, + "step": 36840 + }, + { + "epoch": 0.4605365134128353, + "grad_norm": 3.4941463470458984, + "learning_rate": 1.3072403069504764e-05, + "loss": 1.0638, + "step": 36842 + }, + { + "epoch": 0.46056151403785095, + "grad_norm": 2.892686128616333, + "learning_rate": 1.3071572579186039e-05, + "loss": 0.6299, + "step": 36844 + }, + { + "epoch": 0.4605865146628666, + "grad_norm": 3.657660722732544, + "learning_rate": 1.3070742065474649e-05, + "loss": 0.7802, + "step": 36846 + }, + { + "epoch": 0.4606115152878822, + "grad_norm": 0.000960528792347759, + "learning_rate": 1.3069911528376922e-05, + "loss": 0.5044, + "step": 36848 + }, + { + "epoch": 0.46063651591289784, + "grad_norm": 2.578001022338867, + "learning_rate": 1.3069080967899184e-05, + "loss": 1.1652, + "step": 36850 + }, + { + "epoch": 0.46066151653791343, + "grad_norm": 4.115052700042725, + "learning_rate": 1.3068250384047759e-05, + "loss": 0.5543, + "step": 36852 + }, + { + "epoch": 0.4606865171629291, + "grad_norm": 4.735701084136963, + "learning_rate": 1.3067419776828971e-05, + "loss": 1.294, + "step": 36854 + }, + { + "epoch": 0.4607115177879447, + "grad_norm": 6.283249855041504, + "learning_rate": 1.3066589146249152e-05, + "loss": 1.1892, + "step": 36856 + }, + { + "epoch": 0.4607365184129603, + "grad_norm": 7.986799716949463, + "learning_rate": 1.306575849231462e-05, + "loss": 1.3087, + "step": 36858 + }, + { + "epoch": 0.46076151903797596, + "grad_norm": 3.746220588684082, + "learning_rate": 1.3064927815031705e-05, + "loss": 1.1833, + "step": 36860 + }, + { + "epoch": 0.46078651966299156, + "grad_norm": 3.967053174972534, + "learning_rate": 1.3064097114406734e-05, + "loss": 0.7728, + "step": 36862 + }, + { + "epoch": 0.4608115202880072, + "grad_norm": 0.9781149625778198, + "learning_rate": 1.3063266390446035e-05, + "loss": 0.2366, + "step": 36864 + }, + { + "epoch": 0.46083652091302285, + "grad_norm": 0.0009818852413445711, + "learning_rate": 1.3062435643155928e-05, + "loss": 0.5239, + "step": 36866 + }, + { + "epoch": 0.46086152153803844, + "grad_norm": 3.6087193489074707, + "learning_rate": 1.3061604872542749e-05, + "loss": 1.0666, + "step": 36868 + }, + { + "epoch": 0.4608865221630541, + "grad_norm": 1.2767508029937744, + "learning_rate": 1.3060774078612815e-05, + "loss": 0.1402, + "step": 36870 + }, + { + "epoch": 0.4609115227880697, + "grad_norm": 3.2206273078918457, + "learning_rate": 1.3059943261372461e-05, + "loss": 1.0125, + "step": 36872 + }, + { + "epoch": 0.46093652341308533, + "grad_norm": 0.33282724022865295, + "learning_rate": 1.3059112420828011e-05, + "loss": 0.5871, + "step": 36874 + }, + { + "epoch": 0.460961524038101, + "grad_norm": 5.953980445861816, + "learning_rate": 1.3058281556985794e-05, + "loss": 1.1378, + "step": 36876 + }, + { + "epoch": 0.46098652466311657, + "grad_norm": 3.719630241394043, + "learning_rate": 1.305745066985214e-05, + "loss": 0.1587, + "step": 36878 + }, + { + "epoch": 0.4610115252881322, + "grad_norm": 3.110783338546753, + "learning_rate": 1.305661975943337e-05, + "loss": 0.3614, + "step": 36880 + }, + { + "epoch": 0.4610365259131478, + "grad_norm": 0.0006706336862407625, + "learning_rate": 1.305578882573582e-05, + "loss": 0.0121, + "step": 36882 + }, + { + "epoch": 0.46106152653816346, + "grad_norm": 4.517900466918945, + "learning_rate": 1.3054957868765809e-05, + "loss": 1.7079, + "step": 36884 + }, + { + "epoch": 0.4610865271631791, + "grad_norm": 1.358060598373413, + "learning_rate": 1.3054126888529674e-05, + "loss": 0.5748, + "step": 36886 + }, + { + "epoch": 0.4611115277881947, + "grad_norm": 3.6304142475128174, + "learning_rate": 1.3053295885033741e-05, + "loss": 1.3777, + "step": 36888 + }, + { + "epoch": 0.46113652841321034, + "grad_norm": 0.37309256196022034, + "learning_rate": 1.3052464858284339e-05, + "loss": 0.417, + "step": 36890 + }, + { + "epoch": 0.46116152903822594, + "grad_norm": 1.5433740615844727, + "learning_rate": 1.3051633808287796e-05, + "loss": 0.7278, + "step": 36892 + }, + { + "epoch": 0.4611865296632416, + "grad_norm": 3.4165685176849365, + "learning_rate": 1.305080273505044e-05, + "loss": 1.8641, + "step": 36894 + }, + { + "epoch": 0.46121153028825723, + "grad_norm": 4.894824981689453, + "learning_rate": 1.3049971638578603e-05, + "loss": 1.6938, + "step": 36896 + }, + { + "epoch": 0.4612365309132728, + "grad_norm": 0.015254650264978409, + "learning_rate": 1.3049140518878612e-05, + "loss": 0.2846, + "step": 36898 + }, + { + "epoch": 0.46126153153828847, + "grad_norm": 3.9268035888671875, + "learning_rate": 1.3048309375956798e-05, + "loss": 1.281, + "step": 36900 + }, + { + "epoch": 0.46128653216330406, + "grad_norm": 3.71638822555542, + "learning_rate": 1.304747820981949e-05, + "loss": 1.645, + "step": 36902 + }, + { + "epoch": 0.4613115327883197, + "grad_norm": 3.6620686054229736, + "learning_rate": 1.3046647020473025e-05, + "loss": 0.3066, + "step": 36904 + }, + { + "epoch": 0.46133653341333536, + "grad_norm": 4.899271488189697, + "learning_rate": 1.304581580792372e-05, + "loss": 1.1265, + "step": 36906 + }, + { + "epoch": 0.46136153403835095, + "grad_norm": 3.786405324935913, + "learning_rate": 1.3044984572177914e-05, + "loss": 1.6211, + "step": 36908 + }, + { + "epoch": 0.4613865346633666, + "grad_norm": 3.0107202529907227, + "learning_rate": 1.3044153313241938e-05, + "loss": 1.1557, + "step": 36910 + }, + { + "epoch": 0.4614115352883822, + "grad_norm": 2.4503819942474365, + "learning_rate": 1.3043322031122121e-05, + "loss": 0.8689, + "step": 36912 + }, + { + "epoch": 0.46143653591339784, + "grad_norm": 1.7572674751281738, + "learning_rate": 1.3042490725824793e-05, + "loss": 0.6724, + "step": 36914 + }, + { + "epoch": 0.4614615365384135, + "grad_norm": 2.9283053874969482, + "learning_rate": 1.3041659397356285e-05, + "loss": 1.0588, + "step": 36916 + }, + { + "epoch": 0.4614865371634291, + "grad_norm": 4.1943039894104, + "learning_rate": 1.304082804572293e-05, + "loss": 1.3556, + "step": 36918 + }, + { + "epoch": 0.4615115377884447, + "grad_norm": 0.012077144347131252, + "learning_rate": 1.3039996670931058e-05, + "loss": 0.2458, + "step": 36920 + }, + { + "epoch": 0.4615365384134603, + "grad_norm": 0.06156444549560547, + "learning_rate": 1.3039165272987002e-05, + "loss": 0.2781, + "step": 36922 + }, + { + "epoch": 0.46156153903847597, + "grad_norm": 1.7992324829101562, + "learning_rate": 1.3038333851897094e-05, + "loss": 0.1036, + "step": 36924 + }, + { + "epoch": 0.4615865396634916, + "grad_norm": 1.5137856006622314, + "learning_rate": 1.3037502407667661e-05, + "loss": 0.1092, + "step": 36926 + }, + { + "epoch": 0.4616115402885072, + "grad_norm": 5.135875225067139, + "learning_rate": 1.3036670940305042e-05, + "loss": 0.6904, + "step": 36928 + }, + { + "epoch": 0.46163654091352285, + "grad_norm": 4.2681803703308105, + "learning_rate": 1.3035839449815569e-05, + "loss": 1.4481, + "step": 36930 + }, + { + "epoch": 0.46166154153853844, + "grad_norm": 3.5158259868621826, + "learning_rate": 1.3035007936205571e-05, + "loss": 1.0617, + "step": 36932 + }, + { + "epoch": 0.4616865421635541, + "grad_norm": 3.424250364303589, + "learning_rate": 1.3034176399481377e-05, + "loss": 0.4773, + "step": 36934 + }, + { + "epoch": 0.46171154278856974, + "grad_norm": 2.0204758644104004, + "learning_rate": 1.3033344839649329e-05, + "loss": 0.3692, + "step": 36936 + }, + { + "epoch": 0.46173654341358533, + "grad_norm": 4.079245567321777, + "learning_rate": 1.3032513256715756e-05, + "loss": 0.9539, + "step": 36938 + }, + { + "epoch": 0.461761544038601, + "grad_norm": 2.0792577266693115, + "learning_rate": 1.3031681650686987e-05, + "loss": 0.2618, + "step": 36940 + }, + { + "epoch": 0.46178654466361657, + "grad_norm": 3.207568645477295, + "learning_rate": 1.3030850021569363e-05, + "loss": 1.3885, + "step": 36942 + }, + { + "epoch": 0.4618115452886322, + "grad_norm": 8.595462799072266, + "learning_rate": 1.3030018369369213e-05, + "loss": 1.4801, + "step": 36944 + }, + { + "epoch": 0.46183654591364787, + "grad_norm": 0.0028974490705877542, + "learning_rate": 1.3029186694092872e-05, + "loss": 0.6577, + "step": 36946 + }, + { + "epoch": 0.46186154653866346, + "grad_norm": 2.9045488834381104, + "learning_rate": 1.3028354995746672e-05, + "loss": 0.5288, + "step": 36948 + }, + { + "epoch": 0.4618865471636791, + "grad_norm": 0.6922857165336609, + "learning_rate": 1.3027523274336948e-05, + "loss": 1.4679, + "step": 36950 + }, + { + "epoch": 0.4619115477886947, + "grad_norm": 2.108792543411255, + "learning_rate": 1.3026691529870038e-05, + "loss": 0.9299, + "step": 36952 + }, + { + "epoch": 0.46193654841371035, + "grad_norm": 2.818962812423706, + "learning_rate": 1.3025859762352271e-05, + "loss": 1.0002, + "step": 36954 + }, + { + "epoch": 0.461961549038726, + "grad_norm": 2.8335134983062744, + "learning_rate": 1.3025027971789986e-05, + "loss": 1.3233, + "step": 36956 + }, + { + "epoch": 0.4619865496637416, + "grad_norm": 1.412438988685608, + "learning_rate": 1.3024196158189512e-05, + "loss": 0.7677, + "step": 36958 + }, + { + "epoch": 0.46201155028875723, + "grad_norm": 2.0921432971954346, + "learning_rate": 1.3023364321557191e-05, + "loss": 0.8398, + "step": 36960 + }, + { + "epoch": 0.4620365509137728, + "grad_norm": 3.3525612354278564, + "learning_rate": 1.3022532461899353e-05, + "loss": 1.1098, + "step": 36962 + }, + { + "epoch": 0.4620615515387885, + "grad_norm": 2.3750293254852295, + "learning_rate": 1.3021700579222337e-05, + "loss": 0.4213, + "step": 36964 + }, + { + "epoch": 0.4620865521638041, + "grad_norm": 1.4859994649887085, + "learning_rate": 1.3020868673532475e-05, + "loss": 0.7325, + "step": 36966 + }, + { + "epoch": 0.4621115527888197, + "grad_norm": 1.8228551149368286, + "learning_rate": 1.3020036744836104e-05, + "loss": 1.2491, + "step": 36968 + }, + { + "epoch": 0.46213655341383536, + "grad_norm": 6.040914058685303, + "learning_rate": 1.3019204793139564e-05, + "loss": 1.6242, + "step": 36970 + }, + { + "epoch": 0.46216155403885095, + "grad_norm": 1.106325387954712, + "learning_rate": 1.3018372818449182e-05, + "loss": 0.0686, + "step": 36972 + }, + { + "epoch": 0.4621865546638666, + "grad_norm": 3.189157009124756, + "learning_rate": 1.3017540820771302e-05, + "loss": 1.5605, + "step": 36974 + }, + { + "epoch": 0.46221155528888225, + "grad_norm": 3.400089740753174, + "learning_rate": 1.3016708800112259e-05, + "loss": 1.4957, + "step": 36976 + }, + { + "epoch": 0.46223655591389784, + "grad_norm": 2.795848846435547, + "learning_rate": 1.3015876756478387e-05, + "loss": 0.5755, + "step": 36978 + }, + { + "epoch": 0.4622615565389135, + "grad_norm": 0.9470583200454712, + "learning_rate": 1.3015044689876025e-05, + "loss": 0.0637, + "step": 36980 + }, + { + "epoch": 0.4622865571639291, + "grad_norm": 0.6539632678031921, + "learning_rate": 1.3014212600311509e-05, + "loss": 0.6219, + "step": 36982 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 7.230886459350586, + "learning_rate": 1.3013380487791173e-05, + "loss": 1.6138, + "step": 36984 + }, + { + "epoch": 0.4623365584139604, + "grad_norm": 2.5374319553375244, + "learning_rate": 1.301254835232136e-05, + "loss": 1.1282, + "step": 36986 + }, + { + "epoch": 0.46236155903897597, + "grad_norm": 8.65894603729248, + "learning_rate": 1.3011716193908406e-05, + "loss": 1.5081, + "step": 36988 + }, + { + "epoch": 0.4623865596639916, + "grad_norm": 4.456752300262451, + "learning_rate": 1.3010884012558645e-05, + "loss": 1.652, + "step": 36990 + }, + { + "epoch": 0.4624115602890072, + "grad_norm": 1.5039435625076294, + "learning_rate": 1.3010051808278417e-05, + "loss": 0.0639, + "step": 36992 + }, + { + "epoch": 0.46243656091402285, + "grad_norm": 0.0006361969863064587, + "learning_rate": 1.3009219581074063e-05, + "loss": 0.6476, + "step": 36994 + }, + { + "epoch": 0.4624615615390385, + "grad_norm": 2.1948671340942383, + "learning_rate": 1.3008387330951915e-05, + "loss": 1.1436, + "step": 36996 + }, + { + "epoch": 0.4624865621640541, + "grad_norm": 4.643956184387207, + "learning_rate": 1.3007555057918315e-05, + "loss": 1.0446, + "step": 36998 + }, + { + "epoch": 0.46251156278906974, + "grad_norm": 2.5270352363586426, + "learning_rate": 1.3006722761979601e-05, + "loss": 0.4073, + "step": 37000 + }, + { + "epoch": 0.46253656341408533, + "grad_norm": 3.88039493560791, + "learning_rate": 1.3005890443142115e-05, + "loss": 1.2808, + "step": 37002 + }, + { + "epoch": 0.462561564039101, + "grad_norm": 2.7040727138519287, + "learning_rate": 1.3005058101412189e-05, + "loss": 0.5581, + "step": 37004 + }, + { + "epoch": 0.46258656466411663, + "grad_norm": 3.7142276763916016, + "learning_rate": 1.3004225736796166e-05, + "loss": 1.1127, + "step": 37006 + }, + { + "epoch": 0.4626115652891322, + "grad_norm": 9.687759399414062, + "learning_rate": 1.3003393349300386e-05, + "loss": 1.5637, + "step": 37008 + }, + { + "epoch": 0.46263656591414787, + "grad_norm": 6.129101276397705, + "learning_rate": 1.3002560938931184e-05, + "loss": 0.8504, + "step": 37010 + }, + { + "epoch": 0.46266156653916346, + "grad_norm": 1.7753245830535889, + "learning_rate": 1.3001728505694903e-05, + "loss": 0.6739, + "step": 37012 + }, + { + "epoch": 0.4626865671641791, + "grad_norm": 4.932182788848877, + "learning_rate": 1.3000896049597886e-05, + "loss": 0.8635, + "step": 37014 + }, + { + "epoch": 0.46271156778919476, + "grad_norm": 2.8584980964660645, + "learning_rate": 1.3000063570646466e-05, + "loss": 1.2762, + "step": 37016 + }, + { + "epoch": 0.46273656841421035, + "grad_norm": 4.963167190551758, + "learning_rate": 1.2999231068846988e-05, + "loss": 1.1503, + "step": 37018 + }, + { + "epoch": 0.462761569039226, + "grad_norm": 7.33347749710083, + "learning_rate": 1.2998398544205787e-05, + "loss": 0.462, + "step": 37020 + }, + { + "epoch": 0.4627865696642416, + "grad_norm": 5.33666467666626, + "learning_rate": 1.299756599672921e-05, + "loss": 1.9337, + "step": 37022 + }, + { + "epoch": 0.46281157028925723, + "grad_norm": 3.028304100036621, + "learning_rate": 1.2996733426423591e-05, + "loss": 0.89, + "step": 37024 + }, + { + "epoch": 0.4628365709142729, + "grad_norm": 3.4508183002471924, + "learning_rate": 1.2995900833295276e-05, + "loss": 0.5562, + "step": 37026 + }, + { + "epoch": 0.4628615715392885, + "grad_norm": 4.732057094573975, + "learning_rate": 1.2995068217350605e-05, + "loss": 1.1117, + "step": 37028 + }, + { + "epoch": 0.4628865721643041, + "grad_norm": 2.52547287940979, + "learning_rate": 1.2994235578595917e-05, + "loss": 0.5727, + "step": 37030 + }, + { + "epoch": 0.4629115727893197, + "grad_norm": 3.4012067317962646, + "learning_rate": 1.2993402917037554e-05, + "loss": 1.0641, + "step": 37032 + }, + { + "epoch": 0.46293657341433536, + "grad_norm": 4.461841583251953, + "learning_rate": 1.2992570232681861e-05, + "loss": 0.9189, + "step": 37034 + }, + { + "epoch": 0.462961574039351, + "grad_norm": 2.0865705013275146, + "learning_rate": 1.299173752553517e-05, + "loss": 1.2346, + "step": 37036 + }, + { + "epoch": 0.4629865746643666, + "grad_norm": 5.260610580444336, + "learning_rate": 1.2990904795603831e-05, + "loss": 1.653, + "step": 37038 + }, + { + "epoch": 0.46301157528938225, + "grad_norm": 3.0646414756774902, + "learning_rate": 1.2990072042894188e-05, + "loss": 1.2631, + "step": 37040 + }, + { + "epoch": 0.46303657591439784, + "grad_norm": 1.7894742488861084, + "learning_rate": 1.2989239267412577e-05, + "loss": 0.2889, + "step": 37042 + }, + { + "epoch": 0.4630615765394135, + "grad_norm": 2.3742141723632812, + "learning_rate": 1.298840646916534e-05, + "loss": 0.1633, + "step": 37044 + }, + { + "epoch": 0.46308657716442914, + "grad_norm": 5.450088024139404, + "learning_rate": 1.2987573648158824e-05, + "loss": 1.574, + "step": 37046 + }, + { + "epoch": 0.46311157778944473, + "grad_norm": 2.7080790996551514, + "learning_rate": 1.298674080439937e-05, + "loss": 1.3404, + "step": 37048 + }, + { + "epoch": 0.4631365784144604, + "grad_norm": 5.001122951507568, + "learning_rate": 1.2985907937893319e-05, + "loss": 1.51, + "step": 37050 + }, + { + "epoch": 0.46316157903947597, + "grad_norm": 3.663557767868042, + "learning_rate": 1.2985075048647015e-05, + "loss": 0.4238, + "step": 37052 + }, + { + "epoch": 0.4631865796644916, + "grad_norm": 0.0009607324609532952, + "learning_rate": 1.2984242136666804e-05, + "loss": 0.0715, + "step": 37054 + }, + { + "epoch": 0.46321158028950726, + "grad_norm": 4.047000408172607, + "learning_rate": 1.2983409201959028e-05, + "loss": 1.2485, + "step": 37056 + }, + { + "epoch": 0.46323658091452286, + "grad_norm": 3.400449514389038, + "learning_rate": 1.2982576244530026e-05, + "loss": 1.1435, + "step": 37058 + }, + { + "epoch": 0.4632615815395385, + "grad_norm": 2.1783556938171387, + "learning_rate": 1.2981743264386145e-05, + "loss": 0.5171, + "step": 37060 + }, + { + "epoch": 0.4632865821645541, + "grad_norm": 5.79604434967041, + "learning_rate": 1.298091026153373e-05, + "loss": 0.9309, + "step": 37062 + }, + { + "epoch": 0.46331158278956974, + "grad_norm": 2.5283613204956055, + "learning_rate": 1.2980077235979122e-05, + "loss": 0.2416, + "step": 37064 + }, + { + "epoch": 0.4633365834145854, + "grad_norm": 0.031345125287771225, + "learning_rate": 1.297924418772867e-05, + "loss": 0.3704, + "step": 37066 + }, + { + "epoch": 0.463361584039601, + "grad_norm": 6.932055950164795, + "learning_rate": 1.2978411116788716e-05, + "loss": 0.9931, + "step": 37068 + }, + { + "epoch": 0.46338658466461663, + "grad_norm": 0.7225167155265808, + "learning_rate": 1.2977578023165603e-05, + "loss": 0.0156, + "step": 37070 + }, + { + "epoch": 0.4634115852896322, + "grad_norm": 2.468132495880127, + "learning_rate": 1.2976744906865677e-05, + "loss": 0.9945, + "step": 37072 + }, + { + "epoch": 0.46343658591464787, + "grad_norm": 1.2387168407440186, + "learning_rate": 1.2975911767895281e-05, + "loss": 0.6099, + "step": 37074 + }, + { + "epoch": 0.4634615865396635, + "grad_norm": 2.5586776733398438, + "learning_rate": 1.2975078606260764e-05, + "loss": 1.3165, + "step": 37076 + }, + { + "epoch": 0.4634865871646791, + "grad_norm": 3.5494651794433594, + "learning_rate": 1.2974245421968468e-05, + "loss": 0.8854, + "step": 37078 + }, + { + "epoch": 0.46351158778969476, + "grad_norm": 6.222827434539795, + "learning_rate": 1.2973412215024741e-05, + "loss": 0.3638, + "step": 37080 + }, + { + "epoch": 0.46353658841471035, + "grad_norm": 2.632033348083496, + "learning_rate": 1.2972578985435927e-05, + "loss": 0.2756, + "step": 37082 + }, + { + "epoch": 0.463561589039726, + "grad_norm": 1.3179525136947632, + "learning_rate": 1.297174573320837e-05, + "loss": 0.6121, + "step": 37084 + }, + { + "epoch": 0.46358658966474164, + "grad_norm": 1.785904884338379, + "learning_rate": 1.297091245834842e-05, + "loss": 0.7969, + "step": 37086 + }, + { + "epoch": 0.46361159028975724, + "grad_norm": 0.0013457629829645157, + "learning_rate": 1.2970079160862416e-05, + "loss": 0.022, + "step": 37088 + }, + { + "epoch": 0.4636365909147729, + "grad_norm": 3.315911054611206, + "learning_rate": 1.2969245840756712e-05, + "loss": 1.55, + "step": 37090 + }, + { + "epoch": 0.4636615915397885, + "grad_norm": 3.4926719665527344, + "learning_rate": 1.2968412498037652e-05, + "loss": 0.9781, + "step": 37092 + }, + { + "epoch": 0.4636865921648041, + "grad_norm": 2.6830544471740723, + "learning_rate": 1.2967579132711586e-05, + "loss": 0.6079, + "step": 37094 + }, + { + "epoch": 0.46371159278981977, + "grad_norm": 3.5900275707244873, + "learning_rate": 1.2966745744784853e-05, + "loss": 0.5958, + "step": 37096 + }, + { + "epoch": 0.46373659341483536, + "grad_norm": 2.7405385971069336, + "learning_rate": 1.2965912334263802e-05, + "loss": 0.508, + "step": 37098 + }, + { + "epoch": 0.463761594039851, + "grad_norm": 3.6276934146881104, + "learning_rate": 1.2965078901154784e-05, + "loss": 1.6172, + "step": 37100 + }, + { + "epoch": 0.4637865946648666, + "grad_norm": 2.062154769897461, + "learning_rate": 1.2964245445464143e-05, + "loss": 0.8788, + "step": 37102 + }, + { + "epoch": 0.46381159528988225, + "grad_norm": 10.955938339233398, + "learning_rate": 1.2963411967198232e-05, + "loss": 1.66, + "step": 37104 + }, + { + "epoch": 0.4638365959148979, + "grad_norm": 5.0625176429748535, + "learning_rate": 1.2962578466363391e-05, + "loss": 1.5119, + "step": 37106 + }, + { + "epoch": 0.4638615965399135, + "grad_norm": 8.643315315246582, + "learning_rate": 1.2961744942965973e-05, + "loss": 0.4061, + "step": 37108 + }, + { + "epoch": 0.46388659716492914, + "grad_norm": 3.8794538974761963, + "learning_rate": 1.2960911397012323e-05, + "loss": 0.9489, + "step": 37110 + }, + { + "epoch": 0.46391159778994473, + "grad_norm": 4.192409515380859, + "learning_rate": 1.2960077828508793e-05, + "loss": 2.1965, + "step": 37112 + }, + { + "epoch": 0.4639365984149604, + "grad_norm": 3.205702543258667, + "learning_rate": 1.2959244237461726e-05, + "loss": 1.2362, + "step": 37114 + }, + { + "epoch": 0.463961599039976, + "grad_norm": 3.2857656478881836, + "learning_rate": 1.2958410623877472e-05, + "loss": 0.4697, + "step": 37116 + }, + { + "epoch": 0.4639865996649916, + "grad_norm": 1.3261789083480835, + "learning_rate": 1.2957576987762383e-05, + "loss": 0.3916, + "step": 37118 + }, + { + "epoch": 0.46401160029000726, + "grad_norm": 6.6831746101379395, + "learning_rate": 1.2956743329122809e-05, + "loss": 0.3394, + "step": 37120 + }, + { + "epoch": 0.46403660091502286, + "grad_norm": 4.309635639190674, + "learning_rate": 1.2955909647965092e-05, + "loss": 1.0988, + "step": 37122 + }, + { + "epoch": 0.4640616015400385, + "grad_norm": 5.884255886077881, + "learning_rate": 1.2955075944295588e-05, + "loss": 0.9671, + "step": 37124 + }, + { + "epoch": 0.46408660216505415, + "grad_norm": 0.23712125420570374, + "learning_rate": 1.295424221812064e-05, + "loss": 0.0055, + "step": 37126 + }, + { + "epoch": 0.46411160279006974, + "grad_norm": 0.8030833601951599, + "learning_rate": 1.2953408469446602e-05, + "loss": 0.4679, + "step": 37128 + }, + { + "epoch": 0.4641366034150854, + "grad_norm": 3.125778913497925, + "learning_rate": 1.2952574698279826e-05, + "loss": 0.3855, + "step": 37130 + }, + { + "epoch": 0.464161604040101, + "grad_norm": 5.095675945281982, + "learning_rate": 1.2951740904626657e-05, + "loss": 1.16, + "step": 37132 + }, + { + "epoch": 0.46418660466511663, + "grad_norm": 5.197037696838379, + "learning_rate": 1.2950907088493444e-05, + "loss": 0.8213, + "step": 37134 + }, + { + "epoch": 0.4642116052901323, + "grad_norm": 4.0282979011535645, + "learning_rate": 1.2950073249886543e-05, + "loss": 1.8245, + "step": 37136 + }, + { + "epoch": 0.46423660591514787, + "grad_norm": 3.3024938106536865, + "learning_rate": 1.29492393888123e-05, + "loss": 1.5953, + "step": 37138 + }, + { + "epoch": 0.4642616065401635, + "grad_norm": 6.401187896728516, + "learning_rate": 1.2948405505277066e-05, + "loss": 1.6052, + "step": 37140 + }, + { + "epoch": 0.4642866071651791, + "grad_norm": 1.7740504741668701, + "learning_rate": 1.2947571599287194e-05, + "loss": 1.3055, + "step": 37142 + }, + { + "epoch": 0.46431160779019476, + "grad_norm": 10.522658348083496, + "learning_rate": 1.2946737670849032e-05, + "loss": 1.823, + "step": 37144 + }, + { + "epoch": 0.4643366084152104, + "grad_norm": 7.404801845550537, + "learning_rate": 1.2945903719968936e-05, + "loss": 0.2165, + "step": 37146 + }, + { + "epoch": 0.464361609040226, + "grad_norm": 0.9393108487129211, + "learning_rate": 1.2945069746653252e-05, + "loss": 0.3975, + "step": 37148 + }, + { + "epoch": 0.46438660966524165, + "grad_norm": 1.4115946292877197, + "learning_rate": 1.2944235750908331e-05, + "loss": 1.1309, + "step": 37150 + }, + { + "epoch": 0.46441161029025724, + "grad_norm": 6.378432750701904, + "learning_rate": 1.294340173274053e-05, + "loss": 1.6516, + "step": 37152 + }, + { + "epoch": 0.4644366109152729, + "grad_norm": 0.0012382434215396643, + "learning_rate": 1.2942567692156197e-05, + "loss": 0.0, + "step": 37154 + }, + { + "epoch": 0.46446161154028853, + "grad_norm": 4.72382116317749, + "learning_rate": 1.294173362916168e-05, + "loss": 1.0661, + "step": 37156 + }, + { + "epoch": 0.4644866121653041, + "grad_norm": 4.952181816101074, + "learning_rate": 1.2940899543763343e-05, + "loss": 1.1813, + "step": 37158 + }, + { + "epoch": 0.4645116127903198, + "grad_norm": 2.839632034301758, + "learning_rate": 1.2940065435967529e-05, + "loss": 0.1259, + "step": 37160 + }, + { + "epoch": 0.46453661341533536, + "grad_norm": 6.344763278961182, + "learning_rate": 1.2939231305780591e-05, + "loss": 2.3131, + "step": 37162 + }, + { + "epoch": 0.464561614040351, + "grad_norm": 0.00047322906902991235, + "learning_rate": 1.2938397153208882e-05, + "loss": 0.1393, + "step": 37164 + }, + { + "epoch": 0.46458661466536666, + "grad_norm": 3.063288688659668, + "learning_rate": 1.2937562978258755e-05, + "loss": 1.9965, + "step": 37166 + }, + { + "epoch": 0.46461161529038225, + "grad_norm": 1.6186565160751343, + "learning_rate": 1.2936728780936565e-05, + "loss": 1.4491, + "step": 37168 + }, + { + "epoch": 0.4646366159153979, + "grad_norm": 3.2486886978149414, + "learning_rate": 1.2935894561248665e-05, + "loss": 1.0955, + "step": 37170 + }, + { + "epoch": 0.4646616165404135, + "grad_norm": 0.8519202470779419, + "learning_rate": 1.293506031920141e-05, + "loss": 0.2501, + "step": 37172 + }, + { + "epoch": 0.46468661716542914, + "grad_norm": 2.2930123805999756, + "learning_rate": 1.2934226054801146e-05, + "loss": 0.1844, + "step": 37174 + }, + { + "epoch": 0.4647116177904448, + "grad_norm": 3.2379276752471924, + "learning_rate": 1.293339176805423e-05, + "loss": 1.4729, + "step": 37176 + }, + { + "epoch": 0.4647366184154604, + "grad_norm": 2.624587059020996, + "learning_rate": 1.2932557458967021e-05, + "loss": 0.9463, + "step": 37178 + }, + { + "epoch": 0.464761619040476, + "grad_norm": 4.307660102844238, + "learning_rate": 1.293172312754587e-05, + "loss": 1.7116, + "step": 37180 + }, + { + "epoch": 0.4647866196654916, + "grad_norm": 0.9940335750579834, + "learning_rate": 1.2930888773797126e-05, + "loss": 0.6412, + "step": 37182 + }, + { + "epoch": 0.46481162029050727, + "grad_norm": 0.31001970171928406, + "learning_rate": 1.2930054397727152e-05, + "loss": 0.7013, + "step": 37184 + }, + { + "epoch": 0.4648366209155229, + "grad_norm": 1.4461945295333862, + "learning_rate": 1.2929219999342295e-05, + "loss": 0.1011, + "step": 37186 + }, + { + "epoch": 0.4648616215405385, + "grad_norm": 5.213118553161621, + "learning_rate": 1.2928385578648912e-05, + "loss": 1.2001, + "step": 37188 + }, + { + "epoch": 0.46488662216555415, + "grad_norm": 0.1542942076921463, + "learning_rate": 1.292755113565336e-05, + "loss": 0.3834, + "step": 37190 + }, + { + "epoch": 0.46491162279056975, + "grad_norm": 2.6110219955444336, + "learning_rate": 1.2926716670361995e-05, + "loss": 1.0361, + "step": 37192 + }, + { + "epoch": 0.4649366234155854, + "grad_norm": 0.649228572845459, + "learning_rate": 1.2925882182781164e-05, + "loss": 0.2574, + "step": 37194 + }, + { + "epoch": 0.46496162404060104, + "grad_norm": 5.547506809234619, + "learning_rate": 1.2925047672917232e-05, + "loss": 1.8438, + "step": 37196 + }, + { + "epoch": 0.46498662466561663, + "grad_norm": 3.167520523071289, + "learning_rate": 1.2924213140776552e-05, + "loss": 0.8931, + "step": 37198 + }, + { + "epoch": 0.4650116252906323, + "grad_norm": 3.3551740646362305, + "learning_rate": 1.2923378586365473e-05, + "loss": 0.4931, + "step": 37200 + }, + { + "epoch": 0.46503662591564787, + "grad_norm": 0.0034442448522895575, + "learning_rate": 1.2922544009690361e-05, + "loss": 0.0404, + "step": 37202 + }, + { + "epoch": 0.4650616265406635, + "grad_norm": 4.101374626159668, + "learning_rate": 1.2921709410757565e-05, + "loss": 2.1213, + "step": 37204 + }, + { + "epoch": 0.46508662716567917, + "grad_norm": 3.8585522174835205, + "learning_rate": 1.2920874789573443e-05, + "loss": 1.9717, + "step": 37206 + }, + { + "epoch": 0.46511162779069476, + "grad_norm": 4.100684642791748, + "learning_rate": 1.2920040146144352e-05, + "loss": 0.9697, + "step": 37208 + }, + { + "epoch": 0.4651366284157104, + "grad_norm": 0.6449434161186218, + "learning_rate": 1.2919205480476651e-05, + "loss": 0.6084, + "step": 37210 + }, + { + "epoch": 0.465161629040726, + "grad_norm": 1.4573918581008911, + "learning_rate": 1.291837079257669e-05, + "loss": 1.3777, + "step": 37212 + }, + { + "epoch": 0.46518662966574165, + "grad_norm": 1.9802840948104858, + "learning_rate": 1.291753608245083e-05, + "loss": 0.2441, + "step": 37214 + }, + { + "epoch": 0.4652116302907573, + "grad_norm": 3.5601940155029297, + "learning_rate": 1.2916701350105428e-05, + "loss": 1.0729, + "step": 37216 + }, + { + "epoch": 0.4652366309157729, + "grad_norm": 4.922303199768066, + "learning_rate": 1.2915866595546844e-05, + "loss": 1.7544, + "step": 37218 + }, + { + "epoch": 0.46526163154078853, + "grad_norm": 1.3853631019592285, + "learning_rate": 1.291503181878143e-05, + "loss": 0.5273, + "step": 37220 + }, + { + "epoch": 0.4652866321658041, + "grad_norm": 2.154937267303467, + "learning_rate": 1.2914197019815546e-05, + "loss": 0.5174, + "step": 37222 + }, + { + "epoch": 0.4653116327908198, + "grad_norm": 3.4783551692962646, + "learning_rate": 1.291336219865555e-05, + "loss": 0.9818, + "step": 37224 + }, + { + "epoch": 0.4653366334158354, + "grad_norm": 2.8873777389526367, + "learning_rate": 1.2912527355307798e-05, + "loss": 1.0547, + "step": 37226 + }, + { + "epoch": 0.465361634040851, + "grad_norm": 4.888721466064453, + "learning_rate": 1.2911692489778652e-05, + "loss": 1.5693, + "step": 37228 + }, + { + "epoch": 0.46538663466586666, + "grad_norm": 2.579484224319458, + "learning_rate": 1.2910857602074467e-05, + "loss": 1.1519, + "step": 37230 + }, + { + "epoch": 0.46541163529088225, + "grad_norm": 3.426729202270508, + "learning_rate": 1.29100226922016e-05, + "loss": 1.6037, + "step": 37232 + }, + { + "epoch": 0.4654366359158979, + "grad_norm": 11.042386054992676, + "learning_rate": 1.2909187760166413e-05, + "loss": 2.4572, + "step": 37234 + }, + { + "epoch": 0.46546163654091355, + "grad_norm": 3.852290153503418, + "learning_rate": 1.2908352805975267e-05, + "loss": 1.3905, + "step": 37236 + }, + { + "epoch": 0.46548663716592914, + "grad_norm": 2.89979887008667, + "learning_rate": 1.290751782963451e-05, + "loss": 0.4494, + "step": 37238 + }, + { + "epoch": 0.4655116377909448, + "grad_norm": 6.997592926025391, + "learning_rate": 1.2906682831150512e-05, + "loss": 0.6687, + "step": 37240 + }, + { + "epoch": 0.4655366384159604, + "grad_norm": 4.305520534515381, + "learning_rate": 1.2905847810529628e-05, + "loss": 0.5821, + "step": 37242 + }, + { + "epoch": 0.465561639040976, + "grad_norm": 2.386776924133301, + "learning_rate": 1.2905012767778223e-05, + "loss": 1.632, + "step": 37244 + }, + { + "epoch": 0.4655866396659917, + "grad_norm": 1.105806827545166, + "learning_rate": 1.2904177702902647e-05, + "loss": 0.1428, + "step": 37246 + }, + { + "epoch": 0.46561164029100727, + "grad_norm": 5.458094596862793, + "learning_rate": 1.2903342615909264e-05, + "loss": 1.2674, + "step": 37248 + }, + { + "epoch": 0.4656366409160229, + "grad_norm": 2.7169206142425537, + "learning_rate": 1.2902507506804434e-05, + "loss": 1.4512, + "step": 37250 + }, + { + "epoch": 0.4656616415410385, + "grad_norm": 3.4083898067474365, + "learning_rate": 1.2901672375594518e-05, + "loss": 0.4415, + "step": 37252 + }, + { + "epoch": 0.46568664216605415, + "grad_norm": 10.793522834777832, + "learning_rate": 1.2900837222285875e-05, + "loss": 0.8831, + "step": 37254 + }, + { + "epoch": 0.4657116427910698, + "grad_norm": 6.587237358093262, + "learning_rate": 1.290000204688487e-05, + "loss": 1.0937, + "step": 37256 + }, + { + "epoch": 0.4657366434160854, + "grad_norm": 2.437830686569214, + "learning_rate": 1.2899166849397855e-05, + "loss": 1.8851, + "step": 37258 + }, + { + "epoch": 0.46576164404110104, + "grad_norm": 1.317347764968872, + "learning_rate": 1.2898331629831195e-05, + "loss": 0.7836, + "step": 37260 + }, + { + "epoch": 0.46578664466611663, + "grad_norm": 4.259649276733398, + "learning_rate": 1.2897496388191253e-05, + "loss": 1.5421, + "step": 37262 + }, + { + "epoch": 0.4658116452911323, + "grad_norm": 0.61185222864151, + "learning_rate": 1.2896661124484388e-05, + "loss": 0.1612, + "step": 37264 + }, + { + "epoch": 0.46583664591614793, + "grad_norm": 1.378061294555664, + "learning_rate": 1.2895825838716962e-05, + "loss": 0.7555, + "step": 37266 + }, + { + "epoch": 0.4658616465411635, + "grad_norm": 2.8613152503967285, + "learning_rate": 1.2894990530895335e-05, + "loss": 1.2228, + "step": 37268 + }, + { + "epoch": 0.46588664716617917, + "grad_norm": 4.168942451477051, + "learning_rate": 1.2894155201025872e-05, + "loss": 1.5286, + "step": 37270 + }, + { + "epoch": 0.46591164779119476, + "grad_norm": 6.073257923126221, + "learning_rate": 1.289331984911493e-05, + "loss": 1.2981, + "step": 37272 + }, + { + "epoch": 0.4659366484162104, + "grad_norm": 2.8259518146514893, + "learning_rate": 1.2892484475168872e-05, + "loss": 0.9975, + "step": 37274 + }, + { + "epoch": 0.46596164904122606, + "grad_norm": 2.6927597522735596, + "learning_rate": 1.2891649079194061e-05, + "loss": 0.8509, + "step": 37276 + }, + { + "epoch": 0.46598664966624165, + "grad_norm": 10.505797386169434, + "learning_rate": 1.2890813661196862e-05, + "loss": 1.6974, + "step": 37278 + }, + { + "epoch": 0.4660116502912573, + "grad_norm": 6.483328342437744, + "learning_rate": 1.2889978221183635e-05, + "loss": 1.2391, + "step": 37280 + }, + { + "epoch": 0.4660366509162729, + "grad_norm": 0.0006467548082582653, + "learning_rate": 1.2889142759160743e-05, + "loss": 0.7475, + "step": 37282 + }, + { + "epoch": 0.46606165154128854, + "grad_norm": 4.87339973449707, + "learning_rate": 1.2888307275134544e-05, + "loss": 1.1478, + "step": 37284 + }, + { + "epoch": 0.4660866521663042, + "grad_norm": 0.17845061421394348, + "learning_rate": 1.2887471769111409e-05, + "loss": 0.1639, + "step": 37286 + }, + { + "epoch": 0.4661116527913198, + "grad_norm": 0.0007145137060433626, + "learning_rate": 1.2886636241097696e-05, + "loss": 0.8386, + "step": 37288 + }, + { + "epoch": 0.4661366534163354, + "grad_norm": 16.175682067871094, + "learning_rate": 1.288580069109977e-05, + "loss": 2.0652, + "step": 37290 + }, + { + "epoch": 0.466161654041351, + "grad_norm": 1.904990792274475, + "learning_rate": 1.2884965119123993e-05, + "loss": 0.6309, + "step": 37292 + }, + { + "epoch": 0.46618665466636666, + "grad_norm": 6.123648166656494, + "learning_rate": 1.2884129525176731e-05, + "loss": 0.6457, + "step": 37294 + }, + { + "epoch": 0.4662116552913823, + "grad_norm": 5.158976078033447, + "learning_rate": 1.288329390926435e-05, + "loss": 0.9777, + "step": 37296 + }, + { + "epoch": 0.4662366559163979, + "grad_norm": 3.1429848670959473, + "learning_rate": 1.2882458271393203e-05, + "loss": 1.3781, + "step": 37298 + }, + { + "epoch": 0.46626165654141355, + "grad_norm": 1.8887953758239746, + "learning_rate": 1.2881622611569666e-05, + "loss": 0.7653, + "step": 37300 + }, + { + "epoch": 0.46628665716642914, + "grad_norm": 5.197340965270996, + "learning_rate": 1.2880786929800098e-05, + "loss": 0.152, + "step": 37302 + }, + { + "epoch": 0.4663116577914448, + "grad_norm": 3.001146078109741, + "learning_rate": 1.287995122609086e-05, + "loss": 0.7026, + "step": 37304 + }, + { + "epoch": 0.46633665841646044, + "grad_norm": 7.3635029792785645, + "learning_rate": 1.2879115500448327e-05, + "loss": 0.4873, + "step": 37306 + }, + { + "epoch": 0.46636165904147603, + "grad_norm": 2.8506956100463867, + "learning_rate": 1.2878279752878858e-05, + "loss": 0.4019, + "step": 37308 + }, + { + "epoch": 0.4663866596664917, + "grad_norm": 0.0005601161974482238, + "learning_rate": 1.2877443983388812e-05, + "loss": 0.3906, + "step": 37310 + }, + { + "epoch": 0.46641166029150727, + "grad_norm": 0.00052942632464692, + "learning_rate": 1.2876608191984561e-05, + "loss": 0.472, + "step": 37312 + }, + { + "epoch": 0.4664366609165229, + "grad_norm": 4.588576793670654, + "learning_rate": 1.287577237867247e-05, + "loss": 2.121, + "step": 37314 + }, + { + "epoch": 0.46646166154153856, + "grad_norm": 2.7673075199127197, + "learning_rate": 1.2874936543458903e-05, + "loss": 1.1044, + "step": 37316 + }, + { + "epoch": 0.46648666216655416, + "grad_norm": 4.008786678314209, + "learning_rate": 1.2874100686350226e-05, + "loss": 0.275, + "step": 37318 + }, + { + "epoch": 0.4665116627915698, + "grad_norm": 0.0007860860787332058, + "learning_rate": 1.2873264807352804e-05, + "loss": 0.4624, + "step": 37320 + }, + { + "epoch": 0.4665366634165854, + "grad_norm": 2.1905770301818848, + "learning_rate": 1.2872428906473004e-05, + "loss": 0.3811, + "step": 37322 + }, + { + "epoch": 0.46656166404160104, + "grad_norm": 7.3128814697265625, + "learning_rate": 1.2871592983717192e-05, + "loss": 1.2999, + "step": 37324 + }, + { + "epoch": 0.4665866646666167, + "grad_norm": 0.0007568757864646614, + "learning_rate": 1.2870757039091732e-05, + "loss": 0.0, + "step": 37326 + }, + { + "epoch": 0.4666116652916323, + "grad_norm": 2.3424718379974365, + "learning_rate": 1.2869921072602993e-05, + "loss": 0.2495, + "step": 37328 + }, + { + "epoch": 0.46663666591664793, + "grad_norm": 2.2564475536346436, + "learning_rate": 1.2869085084257343e-05, + "loss": 0.6602, + "step": 37330 + }, + { + "epoch": 0.4666616665416635, + "grad_norm": 2.72292160987854, + "learning_rate": 1.2868249074061146e-05, + "loss": 1.5196, + "step": 37332 + }, + { + "epoch": 0.46668666716667917, + "grad_norm": 0.11841493844985962, + "learning_rate": 1.286741304202077e-05, + "loss": 0.3977, + "step": 37334 + }, + { + "epoch": 0.4667116677916948, + "grad_norm": 2.1530401706695557, + "learning_rate": 1.286657698814258e-05, + "loss": 0.3035, + "step": 37336 + }, + { + "epoch": 0.4667366684167104, + "grad_norm": 1.9245398044586182, + "learning_rate": 1.2865740912432946e-05, + "loss": 0.2634, + "step": 37338 + }, + { + "epoch": 0.46676166904172606, + "grad_norm": 3.8824100494384766, + "learning_rate": 1.2864904814898234e-05, + "loss": 0.7563, + "step": 37340 + }, + { + "epoch": 0.46678666966674165, + "grad_norm": 4.295708179473877, + "learning_rate": 1.2864068695544811e-05, + "loss": 1.587, + "step": 37342 + }, + { + "epoch": 0.4668116702917573, + "grad_norm": 2.7721495628356934, + "learning_rate": 1.2863232554379047e-05, + "loss": 0.9283, + "step": 37344 + }, + { + "epoch": 0.46683667091677294, + "grad_norm": 3.475522518157959, + "learning_rate": 1.2862396391407308e-05, + "loss": 1.4002, + "step": 37346 + }, + { + "epoch": 0.46686167154178854, + "grad_norm": 6.658603668212891, + "learning_rate": 1.2861560206635967e-05, + "loss": 0.4075, + "step": 37348 + }, + { + "epoch": 0.4668866721668042, + "grad_norm": 4.593158721923828, + "learning_rate": 1.2860724000071386e-05, + "loss": 1.247, + "step": 37350 + }, + { + "epoch": 0.4669116727918198, + "grad_norm": 0.46633586287498474, + "learning_rate": 1.2859887771719932e-05, + "loss": 0.3965, + "step": 37352 + }, + { + "epoch": 0.4669366734168354, + "grad_norm": 2.744368076324463, + "learning_rate": 1.2859051521587979e-05, + "loss": 1.2911, + "step": 37354 + }, + { + "epoch": 0.46696167404185107, + "grad_norm": 4.231015205383301, + "learning_rate": 1.2858215249681895e-05, + "loss": 1.3494, + "step": 37356 + }, + { + "epoch": 0.46698667466686666, + "grad_norm": 4.423974514007568, + "learning_rate": 1.2857378956008049e-05, + "loss": 1.1259, + "step": 37358 + }, + { + "epoch": 0.4670116752918823, + "grad_norm": 0.5671226382255554, + "learning_rate": 1.2856542640572806e-05, + "loss": 0.0588, + "step": 37360 + }, + { + "epoch": 0.4670366759168979, + "grad_norm": 12.316880226135254, + "learning_rate": 1.285570630338254e-05, + "loss": 0.5479, + "step": 37362 + }, + { + "epoch": 0.46706167654191355, + "grad_norm": 3.5345442295074463, + "learning_rate": 1.285486994444362e-05, + "loss": 1.1355, + "step": 37364 + }, + { + "epoch": 0.4670866771669292, + "grad_norm": 5.2937493324279785, + "learning_rate": 1.2854033563762408e-05, + "loss": 0.4251, + "step": 37366 + }, + { + "epoch": 0.4671116777919448, + "grad_norm": 6.176816940307617, + "learning_rate": 1.2853197161345288e-05, + "loss": 2.4164, + "step": 37368 + }, + { + "epoch": 0.46713667841696044, + "grad_norm": 3.210747003555298, + "learning_rate": 1.2852360737198617e-05, + "loss": 0.6714, + "step": 37370 + }, + { + "epoch": 0.46716167904197603, + "grad_norm": 6.048074245452881, + "learning_rate": 1.2851524291328772e-05, + "loss": 1.1769, + "step": 37372 + }, + { + "epoch": 0.4671866796669917, + "grad_norm": 0.7293726205825806, + "learning_rate": 1.2850687823742121e-05, + "loss": 0.3518, + "step": 37374 + }, + { + "epoch": 0.4672116802920073, + "grad_norm": 3.258451223373413, + "learning_rate": 1.2849851334445033e-05, + "loss": 2.3821, + "step": 37376 + }, + { + "epoch": 0.4672366809170229, + "grad_norm": 4.397103786468506, + "learning_rate": 1.2849014823443881e-05, + "loss": 1.9451, + "step": 37378 + }, + { + "epoch": 0.46726168154203856, + "grad_norm": 0.000813343096524477, + "learning_rate": 1.2848178290745037e-05, + "loss": 0.0, + "step": 37380 + }, + { + "epoch": 0.46728668216705416, + "grad_norm": 4.150391101837158, + "learning_rate": 1.2847341736354867e-05, + "loss": 0.5256, + "step": 37382 + }, + { + "epoch": 0.4673116827920698, + "grad_norm": 0.0070860120467841625, + "learning_rate": 1.2846505160279747e-05, + "loss": 0.6056, + "step": 37384 + }, + { + "epoch": 0.46733668341708545, + "grad_norm": 13.689114570617676, + "learning_rate": 1.2845668562526048e-05, + "loss": 0.6676, + "step": 37386 + }, + { + "epoch": 0.46736168404210104, + "grad_norm": 3.9310669898986816, + "learning_rate": 1.2844831943100139e-05, + "loss": 0.8253, + "step": 37388 + }, + { + "epoch": 0.4673866846671167, + "grad_norm": 7.08610200881958, + "learning_rate": 1.2843995302008385e-05, + "loss": 1.2624, + "step": 37390 + }, + { + "epoch": 0.4674116852921323, + "grad_norm": 7.914872646331787, + "learning_rate": 1.2843158639257172e-05, + "loss": 1.0947, + "step": 37392 + }, + { + "epoch": 0.46743668591714793, + "grad_norm": 2.061861038208008, + "learning_rate": 1.2842321954852864e-05, + "loss": 0.3156, + "step": 37394 + }, + { + "epoch": 0.4674616865421636, + "grad_norm": 3.237617015838623, + "learning_rate": 1.2841485248801833e-05, + "loss": 0.9977, + "step": 37396 + }, + { + "epoch": 0.46748668716717917, + "grad_norm": 4.475522041320801, + "learning_rate": 1.2840648521110453e-05, + "loss": 1.2717, + "step": 37398 + }, + { + "epoch": 0.4675116877921948, + "grad_norm": 0.26104050874710083, + "learning_rate": 1.2839811771785095e-05, + "loss": 0.0103, + "step": 37400 + }, + { + "epoch": 0.4675366884172104, + "grad_norm": 10.096619606018066, + "learning_rate": 1.283897500083213e-05, + "loss": 1.4967, + "step": 37402 + }, + { + "epoch": 0.46756168904222606, + "grad_norm": 2.4471302032470703, + "learning_rate": 1.2838138208257935e-05, + "loss": 1.3828, + "step": 37404 + }, + { + "epoch": 0.4675866896672417, + "grad_norm": 2.423947811126709, + "learning_rate": 1.2837301394068881e-05, + "loss": 0.7528, + "step": 37406 + }, + { + "epoch": 0.4676116902922573, + "grad_norm": 1.0627720355987549, + "learning_rate": 1.283646455827134e-05, + "loss": 0.2162, + "step": 37408 + }, + { + "epoch": 0.46763669091727295, + "grad_norm": 2.316835880279541, + "learning_rate": 1.2835627700871686e-05, + "loss": 0.6524, + "step": 37410 + }, + { + "epoch": 0.46766169154228854, + "grad_norm": 2.3372042179107666, + "learning_rate": 1.2834790821876295e-05, + "loss": 0.8731, + "step": 37412 + }, + { + "epoch": 0.4676866921673042, + "grad_norm": 3.633026599884033, + "learning_rate": 1.2833953921291537e-05, + "loss": 0.5165, + "step": 37414 + }, + { + "epoch": 0.46771169279231983, + "grad_norm": 3.833310127258301, + "learning_rate": 1.2833116999123782e-05, + "loss": 0.451, + "step": 37416 + }, + { + "epoch": 0.4677366934173354, + "grad_norm": 4.061777591705322, + "learning_rate": 1.2832280055379412e-05, + "loss": 0.9775, + "step": 37418 + }, + { + "epoch": 0.4677616940423511, + "grad_norm": 1.2091789245605469, + "learning_rate": 1.2831443090064801e-05, + "loss": 0.0197, + "step": 37420 + }, + { + "epoch": 0.46778669466736666, + "grad_norm": 0.0012429028283804655, + "learning_rate": 1.2830606103186316e-05, + "loss": 0.4299, + "step": 37422 + }, + { + "epoch": 0.4678116952923823, + "grad_norm": 3.075993776321411, + "learning_rate": 1.2829769094750334e-05, + "loss": 1.0849, + "step": 37424 + }, + { + "epoch": 0.46783669591739796, + "grad_norm": 5.406191825866699, + "learning_rate": 1.2828932064763236e-05, + "loss": 1.2409, + "step": 37426 + }, + { + "epoch": 0.46786169654241355, + "grad_norm": 4.266424655914307, + "learning_rate": 1.2828095013231384e-05, + "loss": 0.3832, + "step": 37428 + }, + { + "epoch": 0.4678866971674292, + "grad_norm": 0.11598973721265793, + "learning_rate": 1.2827257940161166e-05, + "loss": 0.2467, + "step": 37430 + }, + { + "epoch": 0.4679116977924448, + "grad_norm": 2.4816641807556152, + "learning_rate": 1.2826420845558952e-05, + "loss": 0.5995, + "step": 37432 + }, + { + "epoch": 0.46793669841746044, + "grad_norm": 3.7932071685791016, + "learning_rate": 1.2825583729431113e-05, + "loss": 1.5203, + "step": 37434 + }, + { + "epoch": 0.4679616990424761, + "grad_norm": 3.4433562755584717, + "learning_rate": 1.2824746591784028e-05, + "loss": 2.0153, + "step": 37436 + }, + { + "epoch": 0.4679866996674917, + "grad_norm": 3.6434712409973145, + "learning_rate": 1.2823909432624072e-05, + "loss": 0.8174, + "step": 37438 + }, + { + "epoch": 0.4680117002925073, + "grad_norm": 7.43391752243042, + "learning_rate": 1.2823072251957625e-05, + "loss": 0.9722, + "step": 37440 + }, + { + "epoch": 0.4680367009175229, + "grad_norm": 1.2727502584457397, + "learning_rate": 1.2822235049791054e-05, + "loss": 0.0635, + "step": 37442 + }, + { + "epoch": 0.46806170154253857, + "grad_norm": 1.0312753915786743, + "learning_rate": 1.2821397826130743e-05, + "loss": 0.6108, + "step": 37444 + }, + { + "epoch": 0.4680867021675542, + "grad_norm": 2.3521459102630615, + "learning_rate": 1.2820560580983066e-05, + "loss": 1.2836, + "step": 37446 + }, + { + "epoch": 0.4681117027925698, + "grad_norm": 3.222832679748535, + "learning_rate": 1.2819723314354398e-05, + "loss": 0.2961, + "step": 37448 + }, + { + "epoch": 0.46813670341758545, + "grad_norm": 3.862746477127075, + "learning_rate": 1.2818886026251113e-05, + "loss": 0.6663, + "step": 37450 + }, + { + "epoch": 0.46816170404260105, + "grad_norm": 3.8506217002868652, + "learning_rate": 1.2818048716679594e-05, + "loss": 1.008, + "step": 37452 + }, + { + "epoch": 0.4681867046676167, + "grad_norm": 3.4745547771453857, + "learning_rate": 1.2817211385646213e-05, + "loss": 0.7516, + "step": 37454 + }, + { + "epoch": 0.46821170529263234, + "grad_norm": 4.133323669433594, + "learning_rate": 1.2816374033157349e-05, + "loss": 1.8391, + "step": 37456 + }, + { + "epoch": 0.46823670591764793, + "grad_norm": 0.5439958572387695, + "learning_rate": 1.2815536659219378e-05, + "loss": 0.4691, + "step": 37458 + }, + { + "epoch": 0.4682617065426636, + "grad_norm": 5.60244607925415, + "learning_rate": 1.281469926383868e-05, + "loss": 2.2152, + "step": 37460 + }, + { + "epoch": 0.4682867071676792, + "grad_norm": 0.8487020134925842, + "learning_rate": 1.2813861847021627e-05, + "loss": 0.0406, + "step": 37462 + }, + { + "epoch": 0.4683117077926948, + "grad_norm": 5.280159950256348, + "learning_rate": 1.2813024408774602e-05, + "loss": 0.5421, + "step": 37464 + }, + { + "epoch": 0.46833670841771047, + "grad_norm": 0.0024866489693522453, + "learning_rate": 1.2812186949103979e-05, + "loss": 0.7436, + "step": 37466 + }, + { + "epoch": 0.46836170904272606, + "grad_norm": 4.223067760467529, + "learning_rate": 1.281134946801614e-05, + "loss": 1.9305, + "step": 37468 + }, + { + "epoch": 0.4683867096677417, + "grad_norm": 2.8726441860198975, + "learning_rate": 1.281051196551746e-05, + "loss": 0.7801, + "step": 37470 + }, + { + "epoch": 0.4684117102927573, + "grad_norm": 7.377798080444336, + "learning_rate": 1.2809674441614321e-05, + "loss": 1.1772, + "step": 37472 + }, + { + "epoch": 0.46843671091777295, + "grad_norm": 2.40720272064209, + "learning_rate": 1.2808836896313097e-05, + "loss": 1.1911, + "step": 37474 + }, + { + "epoch": 0.4684617115427886, + "grad_norm": 3.4272403717041016, + "learning_rate": 1.2807999329620168e-05, + "loss": 0.5741, + "step": 37476 + }, + { + "epoch": 0.4684867121678042, + "grad_norm": 2.8567161560058594, + "learning_rate": 1.2807161741541914e-05, + "loss": 1.25, + "step": 37478 + }, + { + "epoch": 0.46851171279281983, + "grad_norm": 4.117981433868408, + "learning_rate": 1.280632413208471e-05, + "loss": 1.5512, + "step": 37480 + }, + { + "epoch": 0.4685367134178354, + "grad_norm": 3.085139751434326, + "learning_rate": 1.2805486501254943e-05, + "loss": 1.4358, + "step": 37482 + }, + { + "epoch": 0.4685617140428511, + "grad_norm": 4.782902240753174, + "learning_rate": 1.2804648849058985e-05, + "loss": 0.7518, + "step": 37484 + }, + { + "epoch": 0.4685867146678667, + "grad_norm": 3.1314728260040283, + "learning_rate": 1.2803811175503219e-05, + "loss": 0.5747, + "step": 37486 + }, + { + "epoch": 0.4686117152928823, + "grad_norm": 1.151362657546997, + "learning_rate": 1.2802973480594023e-05, + "loss": 0.5167, + "step": 37488 + }, + { + "epoch": 0.46863671591789796, + "grad_norm": 3.6115002632141113, + "learning_rate": 1.2802135764337776e-05, + "loss": 1.9856, + "step": 37490 + }, + { + "epoch": 0.46866171654291355, + "grad_norm": 3.150350332260132, + "learning_rate": 1.280129802674086e-05, + "loss": 1.6236, + "step": 37492 + }, + { + "epoch": 0.4686867171679292, + "grad_norm": 0.003528429428115487, + "learning_rate": 1.2800460267809656e-05, + "loss": 0.3349, + "step": 37494 + }, + { + "epoch": 0.46871171779294485, + "grad_norm": 2.572355270385742, + "learning_rate": 1.279962248755054e-05, + "loss": 0.4111, + "step": 37496 + }, + { + "epoch": 0.46873671841796044, + "grad_norm": 0.6610356569290161, + "learning_rate": 1.2798784685969898e-05, + "loss": 1.3593, + "step": 37498 + }, + { + "epoch": 0.4687617190429761, + "grad_norm": 4.289309501647949, + "learning_rate": 1.2797946863074104e-05, + "loss": 1.1479, + "step": 37500 + }, + { + "epoch": 0.4687867196679917, + "grad_norm": 4.984355926513672, + "learning_rate": 1.2797109018869547e-05, + "loss": 0.6032, + "step": 37502 + }, + { + "epoch": 0.46881172029300733, + "grad_norm": 2.8511857986450195, + "learning_rate": 1.2796271153362598e-05, + "loss": 1.3311, + "step": 37504 + }, + { + "epoch": 0.468836720918023, + "grad_norm": 4.858156681060791, + "learning_rate": 1.2795433266559647e-05, + "loss": 0.8089, + "step": 37506 + }, + { + "epoch": 0.46886172154303857, + "grad_norm": 3.1351544857025146, + "learning_rate": 1.279459535846707e-05, + "loss": 2.0311, + "step": 37508 + }, + { + "epoch": 0.4688867221680542, + "grad_norm": 4.472691059112549, + "learning_rate": 1.2793757429091253e-05, + "loss": 1.2472, + "step": 37510 + }, + { + "epoch": 0.4689117227930698, + "grad_norm": 2.7372350692749023, + "learning_rate": 1.279291947843857e-05, + "loss": 1.1182, + "step": 37512 + }, + { + "epoch": 0.46893672341808545, + "grad_norm": 5.310741901397705, + "learning_rate": 1.279208150651541e-05, + "loss": 1.3254, + "step": 37514 + }, + { + "epoch": 0.4689617240431011, + "grad_norm": 4.9100565910339355, + "learning_rate": 1.2791243513328151e-05, + "loss": 0.578, + "step": 37516 + }, + { + "epoch": 0.4689867246681167, + "grad_norm": 2.7446939945220947, + "learning_rate": 1.2790405498883176e-05, + "loss": 1.4117, + "step": 37518 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 6.4184889793396, + "learning_rate": 1.2789567463186867e-05, + "loss": 2.0199, + "step": 37520 + }, + { + "epoch": 0.46903672591814793, + "grad_norm": 4.690478801727295, + "learning_rate": 1.2788729406245608e-05, + "loss": 0.7994, + "step": 37522 + }, + { + "epoch": 0.4690617265431636, + "grad_norm": 6.625486373901367, + "learning_rate": 1.278789132806578e-05, + "loss": 0.502, + "step": 37524 + }, + { + "epoch": 0.46908672716817923, + "grad_norm": 2.8364474773406982, + "learning_rate": 1.2787053228653767e-05, + "loss": 0.8563, + "step": 37526 + }, + { + "epoch": 0.4691117277931948, + "grad_norm": 0.33294254541397095, + "learning_rate": 1.2786215108015946e-05, + "loss": 0.2738, + "step": 37528 + }, + { + "epoch": 0.46913672841821047, + "grad_norm": 3.9075145721435547, + "learning_rate": 1.278537696615871e-05, + "loss": 0.8331, + "step": 37530 + }, + { + "epoch": 0.46916172904322606, + "grad_norm": 7.3033857345581055, + "learning_rate": 1.2784538803088436e-05, + "loss": 2.1713, + "step": 37532 + }, + { + "epoch": 0.4691867296682417, + "grad_norm": 0.544220507144928, + "learning_rate": 1.2783700618811505e-05, + "loss": 0.6928, + "step": 37534 + }, + { + "epoch": 0.46921173029325736, + "grad_norm": 0.5880292057991028, + "learning_rate": 1.2782862413334308e-05, + "loss": 0.3695, + "step": 37536 + }, + { + "epoch": 0.46923673091827295, + "grad_norm": 2.627135992050171, + "learning_rate": 1.2782024186663222e-05, + "loss": 0.6967, + "step": 37538 + }, + { + "epoch": 0.4692617315432886, + "grad_norm": 7.9963274002075195, + "learning_rate": 1.2781185938804631e-05, + "loss": 0.9933, + "step": 37540 + }, + { + "epoch": 0.4692867321683042, + "grad_norm": 2.7049615383148193, + "learning_rate": 1.2780347669764925e-05, + "loss": 1.5495, + "step": 37542 + }, + { + "epoch": 0.46931173279331984, + "grad_norm": 4.809319972991943, + "learning_rate": 1.2779509379550483e-05, + "loss": 0.5969, + "step": 37544 + }, + { + "epoch": 0.4693367334183355, + "grad_norm": 3.319821834564209, + "learning_rate": 1.277867106816769e-05, + "loss": 0.9175, + "step": 37546 + }, + { + "epoch": 0.4693617340433511, + "grad_norm": 5.743946552276611, + "learning_rate": 1.2777832735622933e-05, + "loss": 1.7674, + "step": 37548 + }, + { + "epoch": 0.4693867346683667, + "grad_norm": 2.3927457332611084, + "learning_rate": 1.2776994381922594e-05, + "loss": 0.9458, + "step": 37550 + }, + { + "epoch": 0.4694117352933823, + "grad_norm": 2.969376564025879, + "learning_rate": 1.2776156007073057e-05, + "loss": 0.5896, + "step": 37552 + }, + { + "epoch": 0.46943673591839796, + "grad_norm": 2.693387746810913, + "learning_rate": 1.2775317611080711e-05, + "loss": 0.1959, + "step": 37554 + }, + { + "epoch": 0.4694617365434136, + "grad_norm": 0.9107211232185364, + "learning_rate": 1.2774479193951934e-05, + "loss": 0.2638, + "step": 37556 + }, + { + "epoch": 0.4694867371684292, + "grad_norm": 5.144345760345459, + "learning_rate": 1.2773640755693119e-05, + "loss": 1.4347, + "step": 37558 + }, + { + "epoch": 0.46951173779344485, + "grad_norm": 2.916759729385376, + "learning_rate": 1.2772802296310647e-05, + "loss": 1.1275, + "step": 37560 + }, + { + "epoch": 0.46953673841846044, + "grad_norm": 3.587742805480957, + "learning_rate": 1.2771963815810909e-05, + "loss": 0.5503, + "step": 37562 + }, + { + "epoch": 0.4695617390434761, + "grad_norm": 0.83304363489151, + "learning_rate": 1.2771125314200284e-05, + "loss": 0.6712, + "step": 37564 + }, + { + "epoch": 0.46958673966849174, + "grad_norm": 7.057071685791016, + "learning_rate": 1.2770286791485159e-05, + "loss": 1.5084, + "step": 37566 + }, + { + "epoch": 0.46961174029350733, + "grad_norm": 4.484407424926758, + "learning_rate": 1.2769448247671918e-05, + "loss": 0.9046, + "step": 37568 + }, + { + "epoch": 0.469636740918523, + "grad_norm": 6.590177536010742, + "learning_rate": 1.276860968276696e-05, + "loss": 1.8354, + "step": 37570 + }, + { + "epoch": 0.46966174154353857, + "grad_norm": 4.27825403213501, + "learning_rate": 1.2767771096776656e-05, + "loss": 0.5495, + "step": 37572 + }, + { + "epoch": 0.4696867421685542, + "grad_norm": 3.5028183460235596, + "learning_rate": 1.27669324897074e-05, + "loss": 1.1452, + "step": 37574 + }, + { + "epoch": 0.46971174279356986, + "grad_norm": 2.732320547103882, + "learning_rate": 1.2766093861565578e-05, + "loss": 1.2858, + "step": 37576 + }, + { + "epoch": 0.46973674341858546, + "grad_norm": 2.960845947265625, + "learning_rate": 1.2765255212357575e-05, + "loss": 1.2576, + "step": 37578 + }, + { + "epoch": 0.4697617440436011, + "grad_norm": 2.6605560779571533, + "learning_rate": 1.276441654208978e-05, + "loss": 1.4057, + "step": 37580 + }, + { + "epoch": 0.4697867446686167, + "grad_norm": 0.4127174913883209, + "learning_rate": 1.2763577850768577e-05, + "loss": 0.6023, + "step": 37582 + }, + { + "epoch": 0.46981174529363234, + "grad_norm": 0.004933782387524843, + "learning_rate": 1.276273913840036e-05, + "loss": 0.5711, + "step": 37584 + }, + { + "epoch": 0.469836745918648, + "grad_norm": 3.669759511947632, + "learning_rate": 1.2761900404991511e-05, + "loss": 0.9441, + "step": 37586 + }, + { + "epoch": 0.4698617465436636, + "grad_norm": 6.134186267852783, + "learning_rate": 1.276106165054842e-05, + "loss": 0.9784, + "step": 37588 + }, + { + "epoch": 0.46988674716867923, + "grad_norm": 3.330845355987549, + "learning_rate": 1.2760222875077473e-05, + "loss": 0.917, + "step": 37590 + }, + { + "epoch": 0.4699117477936948, + "grad_norm": 4.810242176055908, + "learning_rate": 1.2759384078585057e-05, + "loss": 1.9504, + "step": 37592 + }, + { + "epoch": 0.46993674841871047, + "grad_norm": 1.9288853406906128, + "learning_rate": 1.2758545261077563e-05, + "loss": 0.6163, + "step": 37594 + }, + { + "epoch": 0.4699617490437261, + "grad_norm": 1.8804796934127808, + "learning_rate": 1.2757706422561383e-05, + "loss": 0.736, + "step": 37596 + }, + { + "epoch": 0.4699867496687417, + "grad_norm": 4.684364318847656, + "learning_rate": 1.2756867563042896e-05, + "loss": 0.5783, + "step": 37598 + }, + { + "epoch": 0.47001175029375736, + "grad_norm": 2.1309115886688232, + "learning_rate": 1.2756028682528496e-05, + "loss": 0.3398, + "step": 37600 + }, + { + "epoch": 0.47003675091877295, + "grad_norm": 3.336916446685791, + "learning_rate": 1.2755189781024574e-05, + "loss": 0.0712, + "step": 37602 + }, + { + "epoch": 0.4700617515437886, + "grad_norm": 0.0005245715728960931, + "learning_rate": 1.2754350858537514e-05, + "loss": 0.6289, + "step": 37604 + }, + { + "epoch": 0.47008675216880424, + "grad_norm": 3.187746524810791, + "learning_rate": 1.2753511915073704e-05, + "loss": 1.8027, + "step": 37606 + }, + { + "epoch": 0.47011175279381984, + "grad_norm": 9.135101318359375, + "learning_rate": 1.2752672950639541e-05, + "loss": 1.3349, + "step": 37608 + }, + { + "epoch": 0.4701367534188355, + "grad_norm": 3.046137809753418, + "learning_rate": 1.2751833965241412e-05, + "loss": 1.0942, + "step": 37610 + }, + { + "epoch": 0.4701617540438511, + "grad_norm": 3.6769731044769287, + "learning_rate": 1.27509949588857e-05, + "loss": 1.5331, + "step": 37612 + }, + { + "epoch": 0.4701867546688667, + "grad_norm": 3.561328887939453, + "learning_rate": 1.2750155931578801e-05, + "loss": 0.8098, + "step": 37614 + }, + { + "epoch": 0.47021175529388237, + "grad_norm": 1.8695424795150757, + "learning_rate": 1.2749316883327105e-05, + "loss": 0.6019, + "step": 37616 + }, + { + "epoch": 0.47023675591889796, + "grad_norm": 2.9142005443573, + "learning_rate": 1.2748477814136997e-05, + "loss": 1.2965, + "step": 37618 + }, + { + "epoch": 0.4702617565439136, + "grad_norm": 0.000539953529369086, + "learning_rate": 1.2747638724014873e-05, + "loss": 0.5881, + "step": 37620 + }, + { + "epoch": 0.4702867571689292, + "grad_norm": 3.8476243019104004, + "learning_rate": 1.2746799612967122e-05, + "loss": 1.0138, + "step": 37622 + }, + { + "epoch": 0.47031175779394485, + "grad_norm": 2.335869789123535, + "learning_rate": 1.2745960481000133e-05, + "loss": 0.7952, + "step": 37624 + }, + { + "epoch": 0.4703367584189605, + "grad_norm": 4.866481304168701, + "learning_rate": 1.2745121328120296e-05, + "loss": 1.3584, + "step": 37626 + }, + { + "epoch": 0.4703617590439761, + "grad_norm": 1.2490636110305786, + "learning_rate": 1.2744282154334004e-05, + "loss": 0.285, + "step": 37628 + }, + { + "epoch": 0.47038675966899174, + "grad_norm": 0.062434110790491104, + "learning_rate": 1.2743442959647645e-05, + "loss": 0.8471, + "step": 37630 + }, + { + "epoch": 0.47041176029400733, + "grad_norm": 4.197657108306885, + "learning_rate": 1.2742603744067616e-05, + "loss": 0.7694, + "step": 37632 + }, + { + "epoch": 0.470436760919023, + "grad_norm": 2.4813883304595947, + "learning_rate": 1.2741764507600305e-05, + "loss": 1.2399, + "step": 37634 + }, + { + "epoch": 0.4704617615440386, + "grad_norm": 2.3269741535186768, + "learning_rate": 1.27409252502521e-05, + "loss": 0.7813, + "step": 37636 + }, + { + "epoch": 0.4704867621690542, + "grad_norm": 3.8980515003204346, + "learning_rate": 1.2740085972029397e-05, + "loss": 1.1832, + "step": 37638 + }, + { + "epoch": 0.47051176279406987, + "grad_norm": 5.835520267486572, + "learning_rate": 1.2739246672938585e-05, + "loss": 0.9572, + "step": 37640 + }, + { + "epoch": 0.47053676341908546, + "grad_norm": 1.331368327140808, + "learning_rate": 1.2738407352986062e-05, + "loss": 0.04, + "step": 37642 + }, + { + "epoch": 0.4705617640441011, + "grad_norm": 1.9657953977584839, + "learning_rate": 1.2737568012178212e-05, + "loss": 1.0016, + "step": 37644 + }, + { + "epoch": 0.47058676466911675, + "grad_norm": 7.1194167137146, + "learning_rate": 1.2736728650521433e-05, + "loss": 0.167, + "step": 37646 + }, + { + "epoch": 0.47061176529413234, + "grad_norm": 0.0009851927170529962, + "learning_rate": 1.2735889268022117e-05, + "loss": 0.1408, + "step": 37648 + }, + { + "epoch": 0.470636765919148, + "grad_norm": 5.091829299926758, + "learning_rate": 1.2735049864686652e-05, + "loss": 1.6161, + "step": 37650 + }, + { + "epoch": 0.4706617665441636, + "grad_norm": 2.1018128395080566, + "learning_rate": 1.2734210440521437e-05, + "loss": 0.2186, + "step": 37652 + }, + { + "epoch": 0.47068676716917923, + "grad_norm": 5.816386699676514, + "learning_rate": 1.2733370995532858e-05, + "loss": 2.2704, + "step": 37654 + }, + { + "epoch": 0.4707117677941949, + "grad_norm": 2.741257429122925, + "learning_rate": 1.2732531529727314e-05, + "loss": 0.7486, + "step": 37656 + }, + { + "epoch": 0.47073676841921047, + "grad_norm": 4.555946350097656, + "learning_rate": 1.2731692043111197e-05, + "loss": 1.9006, + "step": 37658 + }, + { + "epoch": 0.4707617690442261, + "grad_norm": 3.1375014781951904, + "learning_rate": 1.2730852535690903e-05, + "loss": 1.3718, + "step": 37660 + }, + { + "epoch": 0.4707867696692417, + "grad_norm": 5.016190052032471, + "learning_rate": 1.2730013007472817e-05, + "loss": 1.0655, + "step": 37662 + }, + { + "epoch": 0.47081177029425736, + "grad_norm": 3.026870012283325, + "learning_rate": 1.272917345846334e-05, + "loss": 0.7466, + "step": 37664 + }, + { + "epoch": 0.470836770919273, + "grad_norm": 4.713681697845459, + "learning_rate": 1.2728333888668862e-05, + "loss": 0.8846, + "step": 37666 + }, + { + "epoch": 0.4708617715442886, + "grad_norm": 1.590704083442688, + "learning_rate": 1.2727494298095782e-05, + "loss": 0.6869, + "step": 37668 + }, + { + "epoch": 0.47088677216930425, + "grad_norm": 0.9890564680099487, + "learning_rate": 1.2726654686750489e-05, + "loss": 0.4677, + "step": 37670 + }, + { + "epoch": 0.47091177279431984, + "grad_norm": 2.8643720149993896, + "learning_rate": 1.2725815054639379e-05, + "loss": 0.5877, + "step": 37672 + }, + { + "epoch": 0.4709367734193355, + "grad_norm": 6.536144733428955, + "learning_rate": 1.2724975401768851e-05, + "loss": 1.197, + "step": 37674 + }, + { + "epoch": 0.47096177404435113, + "grad_norm": 1.3064601421356201, + "learning_rate": 1.272413572814529e-05, + "loss": 0.0759, + "step": 37676 + }, + { + "epoch": 0.4709867746693667, + "grad_norm": 4.4604082107543945, + "learning_rate": 1.2723296033775101e-05, + "loss": 1.1584, + "step": 37678 + }, + { + "epoch": 0.4710117752943824, + "grad_norm": 3.876981019973755, + "learning_rate": 1.2722456318664671e-05, + "loss": 1.0352, + "step": 37680 + }, + { + "epoch": 0.47103677591939797, + "grad_norm": 5.023538589477539, + "learning_rate": 1.2721616582820402e-05, + "loss": 0.8654, + "step": 37682 + }, + { + "epoch": 0.4710617765444136, + "grad_norm": 7.867284774780273, + "learning_rate": 1.2720776826248683e-05, + "loss": 1.8947, + "step": 37684 + }, + { + "epoch": 0.47108677716942926, + "grad_norm": 6.8650803565979, + "learning_rate": 1.2719937048955915e-05, + "loss": 0.8844, + "step": 37686 + }, + { + "epoch": 0.47111177779444485, + "grad_norm": 5.335639953613281, + "learning_rate": 1.2719097250948491e-05, + "loss": 0.4454, + "step": 37688 + }, + { + "epoch": 0.4711367784194605, + "grad_norm": 3.626584053039551, + "learning_rate": 1.2718257432232806e-05, + "loss": 0.5903, + "step": 37690 + }, + { + "epoch": 0.4711617790444761, + "grad_norm": 4.2221503257751465, + "learning_rate": 1.2717417592815256e-05, + "loss": 1.4679, + "step": 37692 + }, + { + "epoch": 0.47118677966949174, + "grad_norm": 1.699170470237732, + "learning_rate": 1.2716577732702237e-05, + "loss": 0.0517, + "step": 37694 + }, + { + "epoch": 0.4712117802945074, + "grad_norm": 1.8342686891555786, + "learning_rate": 1.2715737851900148e-05, + "loss": 0.0482, + "step": 37696 + }, + { + "epoch": 0.471236780919523, + "grad_norm": 2.5314934253692627, + "learning_rate": 1.2714897950415383e-05, + "loss": 0.4999, + "step": 37698 + }, + { + "epoch": 0.4712617815445386, + "grad_norm": 4.624905586242676, + "learning_rate": 1.2714058028254342e-05, + "loss": 1.1565, + "step": 37700 + }, + { + "epoch": 0.4712867821695542, + "grad_norm": 5.110109806060791, + "learning_rate": 1.2713218085423415e-05, + "loss": 0.8432, + "step": 37702 + }, + { + "epoch": 0.47131178279456987, + "grad_norm": 1.0234752893447876, + "learning_rate": 1.2712378121929004e-05, + "loss": 0.5488, + "step": 37704 + }, + { + "epoch": 0.4713367834195855, + "grad_norm": 6.581681251525879, + "learning_rate": 1.2711538137777504e-05, + "loss": 1.2447, + "step": 37706 + }, + { + "epoch": 0.4713617840446011, + "grad_norm": 0.5217154026031494, + "learning_rate": 1.2710698132975314e-05, + "loss": 0.4807, + "step": 37708 + }, + { + "epoch": 0.47138678466961675, + "grad_norm": 4.856429100036621, + "learning_rate": 1.270985810752883e-05, + "loss": 2.742, + "step": 37710 + }, + { + "epoch": 0.47141178529463235, + "grad_norm": 6.137046813964844, + "learning_rate": 1.2709018061444448e-05, + "loss": 0.6851, + "step": 37712 + }, + { + "epoch": 0.471436785919648, + "grad_norm": 5.814624786376953, + "learning_rate": 1.2708177994728573e-05, + "loss": 1.2104, + "step": 37714 + }, + { + "epoch": 0.47146178654466364, + "grad_norm": 1.109174370765686, + "learning_rate": 1.2707337907387593e-05, + "loss": 0.0385, + "step": 37716 + }, + { + "epoch": 0.47148678716967923, + "grad_norm": 0.0011732646962627769, + "learning_rate": 1.270649779942791e-05, + "loss": 0.8318, + "step": 37718 + }, + { + "epoch": 0.4715117877946949, + "grad_norm": 1.437896966934204, + "learning_rate": 1.2705657670855924e-05, + "loss": 0.3219, + "step": 37720 + }, + { + "epoch": 0.4715367884197105, + "grad_norm": 2.3830602169036865, + "learning_rate": 1.270481752167803e-05, + "loss": 0.7966, + "step": 37722 + }, + { + "epoch": 0.4715617890447261, + "grad_norm": 0.9103780388832092, + "learning_rate": 1.2703977351900632e-05, + "loss": 0.9976, + "step": 37724 + }, + { + "epoch": 0.47158678966974177, + "grad_norm": 2.796149969100952, + "learning_rate": 1.2703137161530123e-05, + "loss": 0.9223, + "step": 37726 + }, + { + "epoch": 0.47161179029475736, + "grad_norm": 2.408074378967285, + "learning_rate": 1.2702296950572904e-05, + "loss": 1.129, + "step": 37728 + }, + { + "epoch": 0.471636790919773, + "grad_norm": 5.997138500213623, + "learning_rate": 1.2701456719035373e-05, + "loss": 1.106, + "step": 37730 + }, + { + "epoch": 0.4716617915447886, + "grad_norm": 2.9968795776367188, + "learning_rate": 1.2700616466923928e-05, + "loss": 1.4528, + "step": 37732 + }, + { + "epoch": 0.47168679216980425, + "grad_norm": 1.8845974206924438, + "learning_rate": 1.2699776194244971e-05, + "loss": 1.0001, + "step": 37734 + }, + { + "epoch": 0.4717117927948199, + "grad_norm": 0.0008438454242423177, + "learning_rate": 1.2698935901004901e-05, + "loss": 0.7785, + "step": 37736 + }, + { + "epoch": 0.4717367934198355, + "grad_norm": 2.931497812271118, + "learning_rate": 1.2698095587210115e-05, + "loss": 0.5975, + "step": 37738 + }, + { + "epoch": 0.47176179404485113, + "grad_norm": 2.398712635040283, + "learning_rate": 1.2697255252867017e-05, + "loss": 0.736, + "step": 37740 + }, + { + "epoch": 0.4717867946698667, + "grad_norm": 5.6547746658325195, + "learning_rate": 1.2696414897982004e-05, + "loss": 1.1026, + "step": 37742 + }, + { + "epoch": 0.4718117952948824, + "grad_norm": 4.016934394836426, + "learning_rate": 1.2695574522561476e-05, + "loss": 1.7297, + "step": 37744 + }, + { + "epoch": 0.471836795919898, + "grad_norm": 1.063449740409851, + "learning_rate": 1.2694734126611832e-05, + "loss": 0.9, + "step": 37746 + }, + { + "epoch": 0.4718617965449136, + "grad_norm": 5.277735710144043, + "learning_rate": 1.2693893710139475e-05, + "loss": 1.2083, + "step": 37748 + }, + { + "epoch": 0.47188679716992926, + "grad_norm": 2.3455374240875244, + "learning_rate": 1.2693053273150806e-05, + "loss": 0.4204, + "step": 37750 + }, + { + "epoch": 0.47191179779494485, + "grad_norm": 8.00742244720459, + "learning_rate": 1.2692212815652224e-05, + "loss": 0.3231, + "step": 37752 + }, + { + "epoch": 0.4719367984199605, + "grad_norm": 5.865240097045898, + "learning_rate": 1.269137233765013e-05, + "loss": 1.1014, + "step": 37754 + }, + { + "epoch": 0.47196179904497615, + "grad_norm": 0.4029271900653839, + "learning_rate": 1.269053183915092e-05, + "loss": 1.0374, + "step": 37756 + }, + { + "epoch": 0.47198679966999174, + "grad_norm": 2.8123972415924072, + "learning_rate": 1.2689691320161005e-05, + "loss": 1.1485, + "step": 37758 + }, + { + "epoch": 0.4720118002950074, + "grad_norm": 2.3898656368255615, + "learning_rate": 1.268885078068678e-05, + "loss": 0.6101, + "step": 37760 + }, + { + "epoch": 0.472036800920023, + "grad_norm": 3.4510622024536133, + "learning_rate": 1.2688010220734648e-05, + "loss": 1.3767, + "step": 37762 + }, + { + "epoch": 0.47206180154503863, + "grad_norm": 2.739555835723877, + "learning_rate": 1.268716964031101e-05, + "loss": 0.9254, + "step": 37764 + }, + { + "epoch": 0.4720868021700543, + "grad_norm": 10.634767532348633, + "learning_rate": 1.2686329039422269e-05, + "loss": 1.4925, + "step": 37766 + }, + { + "epoch": 0.47211180279506987, + "grad_norm": 0.001732856617309153, + "learning_rate": 1.2685488418074825e-05, + "loss": 0.7082, + "step": 37768 + }, + { + "epoch": 0.4721368034200855, + "grad_norm": 7.421109199523926, + "learning_rate": 1.2684647776275078e-05, + "loss": 1.0827, + "step": 37770 + }, + { + "epoch": 0.4721618040451011, + "grad_norm": 4.598298072814941, + "learning_rate": 1.2683807114029438e-05, + "loss": 2.2022, + "step": 37772 + }, + { + "epoch": 0.47218680467011676, + "grad_norm": 2.4370627403259277, + "learning_rate": 1.26829664313443e-05, + "loss": 1.108, + "step": 37774 + }, + { + "epoch": 0.4722118052951324, + "grad_norm": 3.210479974746704, + "learning_rate": 1.2682125728226072e-05, + "loss": 0.9513, + "step": 37776 + }, + { + "epoch": 0.472236805920148, + "grad_norm": 0.3786783516407013, + "learning_rate": 1.2681285004681152e-05, + "loss": 0.568, + "step": 37778 + }, + { + "epoch": 0.47226180654516364, + "grad_norm": 4.112549781799316, + "learning_rate": 1.2680444260715943e-05, + "loss": 0.9107, + "step": 37780 + }, + { + "epoch": 0.47228680717017923, + "grad_norm": 1.2566938400268555, + "learning_rate": 1.2679603496336848e-05, + "loss": 0.6557, + "step": 37782 + }, + { + "epoch": 0.4723118077951949, + "grad_norm": 0.007892392575740814, + "learning_rate": 1.2678762711550277e-05, + "loss": 0.2164, + "step": 37784 + }, + { + "epoch": 0.47233680842021053, + "grad_norm": 0.0006462172605097294, + "learning_rate": 1.2677921906362625e-05, + "loss": 0.8508, + "step": 37786 + }, + { + "epoch": 0.4723618090452261, + "grad_norm": 0.9458528161048889, + "learning_rate": 1.2677081080780298e-05, + "loss": 2.0772, + "step": 37788 + }, + { + "epoch": 0.47238680967024177, + "grad_norm": 0.4745278060436249, + "learning_rate": 1.26762402348097e-05, + "loss": 0.0055, + "step": 37790 + }, + { + "epoch": 0.47241181029525736, + "grad_norm": 4.04725456237793, + "learning_rate": 1.267539936845724e-05, + "loss": 0.4712, + "step": 37792 + }, + { + "epoch": 0.472436810920273, + "grad_norm": 7.503284931182861, + "learning_rate": 1.2674558481729308e-05, + "loss": 1.8976, + "step": 37794 + }, + { + "epoch": 0.47246181154528866, + "grad_norm": 0.00046916809515096247, + "learning_rate": 1.267371757463232e-05, + "loss": 0.0, + "step": 37796 + }, + { + "epoch": 0.47248681217030425, + "grad_norm": 4.326360702514648, + "learning_rate": 1.267287664717268e-05, + "loss": 0.8265, + "step": 37798 + }, + { + "epoch": 0.4725118127953199, + "grad_norm": 2.0817971229553223, + "learning_rate": 1.2672035699356789e-05, + "loss": 0.7385, + "step": 37800 + }, + { + "epoch": 0.4725368134203355, + "grad_norm": 3.4075849056243896, + "learning_rate": 1.267119473119105e-05, + "loss": 1.3619, + "step": 37802 + }, + { + "epoch": 0.47256181404535114, + "grad_norm": 3.2093982696533203, + "learning_rate": 1.267035374268187e-05, + "loss": 1.2842, + "step": 37804 + }, + { + "epoch": 0.4725868146703668, + "grad_norm": 2.783379316329956, + "learning_rate": 1.2669512733835656e-05, + "loss": 0.4277, + "step": 37806 + }, + { + "epoch": 0.4726118152953824, + "grad_norm": 1.7051500082015991, + "learning_rate": 1.2668671704658806e-05, + "loss": 1.2108, + "step": 37808 + }, + { + "epoch": 0.472636815920398, + "grad_norm": 0.007318372372537851, + "learning_rate": 1.2667830655157733e-05, + "loss": 0.2612, + "step": 37810 + }, + { + "epoch": 0.4726618165454136, + "grad_norm": 2.9768519401550293, + "learning_rate": 1.2666989585338838e-05, + "loss": 1.589, + "step": 37812 + }, + { + "epoch": 0.47268681717042926, + "grad_norm": 4.837436199188232, + "learning_rate": 1.2666148495208528e-05, + "loss": 0.7911, + "step": 37814 + }, + { + "epoch": 0.4727118177954449, + "grad_norm": 6.022818565368652, + "learning_rate": 1.2665307384773207e-05, + "loss": 1.2324, + "step": 37816 + }, + { + "epoch": 0.4727368184204605, + "grad_norm": 2.1446778774261475, + "learning_rate": 1.2664466254039285e-05, + "loss": 0.9157, + "step": 37818 + }, + { + "epoch": 0.47276181904547615, + "grad_norm": 0.01580032892525196, + "learning_rate": 1.2663625103013159e-05, + "loss": 0.0002, + "step": 37820 + }, + { + "epoch": 0.47278681967049174, + "grad_norm": 0.0004974310868419707, + "learning_rate": 1.2662783931701243e-05, + "loss": 1.0921, + "step": 37822 + }, + { + "epoch": 0.4728118202955074, + "grad_norm": 4.491685390472412, + "learning_rate": 1.2661942740109945e-05, + "loss": 1.0979, + "step": 37824 + }, + { + "epoch": 0.47283682092052304, + "grad_norm": 3.872080087661743, + "learning_rate": 1.2661101528245663e-05, + "loss": 0.9526, + "step": 37826 + }, + { + "epoch": 0.47286182154553863, + "grad_norm": 3.7690539360046387, + "learning_rate": 1.266026029611481e-05, + "loss": 0.649, + "step": 37828 + }, + { + "epoch": 0.4728868221705543, + "grad_norm": 3.36082124710083, + "learning_rate": 1.2659419043723793e-05, + "loss": 1.2897, + "step": 37830 + }, + { + "epoch": 0.47291182279556987, + "grad_norm": 11.002522468566895, + "learning_rate": 1.2658577771079013e-05, + "loss": 1.6456, + "step": 37832 + }, + { + "epoch": 0.4729368234205855, + "grad_norm": 1.7165905237197876, + "learning_rate": 1.265773647818688e-05, + "loss": 1.3381, + "step": 37834 + }, + { + "epoch": 0.47296182404560116, + "grad_norm": 5.2242231369018555, + "learning_rate": 1.2656895165053804e-05, + "loss": 0.5389, + "step": 37836 + }, + { + "epoch": 0.47298682467061676, + "grad_norm": 2.575589418411255, + "learning_rate": 1.2656053831686193e-05, + "loss": 0.1621, + "step": 37838 + }, + { + "epoch": 0.4730118252956324, + "grad_norm": 0.06551409512758255, + "learning_rate": 1.2655212478090445e-05, + "loss": 0.0892, + "step": 37840 + }, + { + "epoch": 0.473036825920648, + "grad_norm": 2.464421272277832, + "learning_rate": 1.265437110427298e-05, + "loss": 1.0037, + "step": 37842 + }, + { + "epoch": 0.47306182654566364, + "grad_norm": 4.094751834869385, + "learning_rate": 1.2653529710240197e-05, + "loss": 0.4152, + "step": 37844 + }, + { + "epoch": 0.4730868271706793, + "grad_norm": 4.341224670410156, + "learning_rate": 1.2652688295998508e-05, + "loss": 1.8552, + "step": 37846 + }, + { + "epoch": 0.4731118277956949, + "grad_norm": 2.7533392906188965, + "learning_rate": 1.2651846861554318e-05, + "loss": 1.3823, + "step": 37848 + }, + { + "epoch": 0.47313682842071053, + "grad_norm": 5.825502872467041, + "learning_rate": 1.2651005406914042e-05, + "loss": 1.5006, + "step": 37850 + }, + { + "epoch": 0.4731618290457261, + "grad_norm": 5.121581077575684, + "learning_rate": 1.2650163932084081e-05, + "loss": 1.0059, + "step": 37852 + }, + { + "epoch": 0.47318682967074177, + "grad_norm": 4.7257771492004395, + "learning_rate": 1.2649322437070845e-05, + "loss": 1.2863, + "step": 37854 + }, + { + "epoch": 0.4732118302957574, + "grad_norm": 0.6623039245605469, + "learning_rate": 1.2648480921880746e-05, + "loss": 0.1029, + "step": 37856 + }, + { + "epoch": 0.473236830920773, + "grad_norm": 2.707615613937378, + "learning_rate": 1.2647639386520188e-05, + "loss": 0.1606, + "step": 37858 + }, + { + "epoch": 0.47326183154578866, + "grad_norm": 9.0728178024292, + "learning_rate": 1.2646797830995585e-05, + "loss": 1.1354, + "step": 37860 + }, + { + "epoch": 0.47328683217080425, + "grad_norm": 2.226552963256836, + "learning_rate": 1.2645956255313344e-05, + "loss": 0.9543, + "step": 37862 + }, + { + "epoch": 0.4733118327958199, + "grad_norm": 2.390052080154419, + "learning_rate": 1.2645114659479877e-05, + "loss": 0.4061, + "step": 37864 + }, + { + "epoch": 0.47333683342083555, + "grad_norm": 2.405717372894287, + "learning_rate": 1.2644273043501584e-05, + "loss": 0.9617, + "step": 37866 + }, + { + "epoch": 0.47336183404585114, + "grad_norm": 3.7075247764587402, + "learning_rate": 1.2643431407384887e-05, + "loss": 1.1424, + "step": 37868 + }, + { + "epoch": 0.4733868346708668, + "grad_norm": 3.019029378890991, + "learning_rate": 1.2642589751136186e-05, + "loss": 1.2495, + "step": 37870 + }, + { + "epoch": 0.4734118352958824, + "grad_norm": 0.936037540435791, + "learning_rate": 1.2641748074761899e-05, + "loss": 0.4748, + "step": 37872 + }, + { + "epoch": 0.473436835920898, + "grad_norm": 2.1253554821014404, + "learning_rate": 1.2640906378268432e-05, + "loss": 1.0699, + "step": 37874 + }, + { + "epoch": 0.4734618365459137, + "grad_norm": 1.1698275804519653, + "learning_rate": 1.2640064661662197e-05, + "loss": 0.8471, + "step": 37876 + }, + { + "epoch": 0.47348683717092926, + "grad_norm": 4.669074535369873, + "learning_rate": 1.2639222924949598e-05, + "loss": 0.2097, + "step": 37878 + }, + { + "epoch": 0.4735118377959449, + "grad_norm": 4.783570289611816, + "learning_rate": 1.2638381168137052e-05, + "loss": 2.1785, + "step": 37880 + }, + { + "epoch": 0.4735368384209605, + "grad_norm": 2.004378080368042, + "learning_rate": 1.2637539391230971e-05, + "loss": 1.0107, + "step": 37882 + }, + { + "epoch": 0.47356183904597615, + "grad_norm": 0.8152493834495544, + "learning_rate": 1.2636697594237759e-05, + "loss": 1.1643, + "step": 37884 + }, + { + "epoch": 0.4735868396709918, + "grad_norm": 4.392230987548828, + "learning_rate": 1.2635855777163833e-05, + "loss": 1.4124, + "step": 37886 + }, + { + "epoch": 0.4736118402960074, + "grad_norm": 1.9183887243270874, + "learning_rate": 1.2635013940015602e-05, + "loss": 1.3295, + "step": 37888 + }, + { + "epoch": 0.47363684092102304, + "grad_norm": 3.5238912105560303, + "learning_rate": 1.2634172082799478e-05, + "loss": 0.3658, + "step": 37890 + }, + { + "epoch": 0.47366184154603863, + "grad_norm": 3.9769625663757324, + "learning_rate": 1.2633330205521872e-05, + "loss": 0.9321, + "step": 37892 + }, + { + "epoch": 0.4736868421710543, + "grad_norm": 2.470964193344116, + "learning_rate": 1.2632488308189194e-05, + "loss": 0.7149, + "step": 37894 + }, + { + "epoch": 0.4737118427960699, + "grad_norm": 5.0085015296936035, + "learning_rate": 1.263164639080786e-05, + "loss": 2.242, + "step": 37896 + }, + { + "epoch": 0.4737368434210855, + "grad_norm": 0.0005573495873250067, + "learning_rate": 1.2630804453384275e-05, + "loss": 1.1893, + "step": 37898 + }, + { + "epoch": 0.47376184404610117, + "grad_norm": 1.5292713642120361, + "learning_rate": 1.2629962495924857e-05, + "loss": 0.9749, + "step": 37900 + }, + { + "epoch": 0.47378684467111676, + "grad_norm": 4.1559224128723145, + "learning_rate": 1.262912051843602e-05, + "loss": 1.228, + "step": 37902 + }, + { + "epoch": 0.4738118452961324, + "grad_norm": 4.956327438354492, + "learning_rate": 1.2628278520924169e-05, + "loss": 0.9505, + "step": 37904 + }, + { + "epoch": 0.47383684592114805, + "grad_norm": 2.1595280170440674, + "learning_rate": 1.2627436503395723e-05, + "loss": 0.6411, + "step": 37906 + }, + { + "epoch": 0.47386184654616365, + "grad_norm": 2.659940242767334, + "learning_rate": 1.262659446585709e-05, + "loss": 0.9113, + "step": 37908 + }, + { + "epoch": 0.4738868471711793, + "grad_norm": 4.02888822555542, + "learning_rate": 1.2625752408314684e-05, + "loss": 0.791, + "step": 37910 + }, + { + "epoch": 0.4739118477961949, + "grad_norm": 0.17808681726455688, + "learning_rate": 1.2624910330774922e-05, + "loss": 0.7221, + "step": 37912 + }, + { + "epoch": 0.47393684842121053, + "grad_norm": 2.1404943466186523, + "learning_rate": 1.2624068233244211e-05, + "loss": 0.5559, + "step": 37914 + }, + { + "epoch": 0.4739618490462262, + "grad_norm": 21.3858585357666, + "learning_rate": 1.2623226115728972e-05, + "loss": 1.6869, + "step": 37916 + }, + { + "epoch": 0.4739868496712418, + "grad_norm": 2.283405065536499, + "learning_rate": 1.2622383978235611e-05, + "loss": 0.7845, + "step": 37918 + }, + { + "epoch": 0.4740118502962574, + "grad_norm": 2.228853702545166, + "learning_rate": 1.2621541820770545e-05, + "loss": 0.5722, + "step": 37920 + }, + { + "epoch": 0.474036850921273, + "grad_norm": 11.332529067993164, + "learning_rate": 1.2620699643340187e-05, + "loss": 2.7678, + "step": 37922 + }, + { + "epoch": 0.47406185154628866, + "grad_norm": 2.4317188262939453, + "learning_rate": 1.261985744595095e-05, + "loss": 0.351, + "step": 37924 + }, + { + "epoch": 0.4740868521713043, + "grad_norm": 2.3809778690338135, + "learning_rate": 1.2619015228609247e-05, + "loss": 0.7623, + "step": 37926 + }, + { + "epoch": 0.4741118527963199, + "grad_norm": 2.5935444831848145, + "learning_rate": 1.26181729913215e-05, + "loss": 0.3627, + "step": 37928 + }, + { + "epoch": 0.47413685342133555, + "grad_norm": 4.24884557723999, + "learning_rate": 1.2617330734094115e-05, + "loss": 1.1591, + "step": 37930 + }, + { + "epoch": 0.47416185404635114, + "grad_norm": 0.0021995599381625652, + "learning_rate": 1.2616488456933511e-05, + "loss": 1.1075, + "step": 37932 + }, + { + "epoch": 0.4741868546713668, + "grad_norm": 4.786074161529541, + "learning_rate": 1.2615646159846097e-05, + "loss": 1.3701, + "step": 37934 + }, + { + "epoch": 0.47421185529638243, + "grad_norm": 4.166800022125244, + "learning_rate": 1.2614803842838294e-05, + "loss": 1.3531, + "step": 37936 + }, + { + "epoch": 0.474236855921398, + "grad_norm": 1.6878516674041748, + "learning_rate": 1.2613961505916516e-05, + "loss": 1.0465, + "step": 37938 + }, + { + "epoch": 0.4742618565464137, + "grad_norm": 4.084817409515381, + "learning_rate": 1.2613119149087174e-05, + "loss": 1.1146, + "step": 37940 + }, + { + "epoch": 0.47428685717142927, + "grad_norm": 3.8655526638031006, + "learning_rate": 1.261227677235669e-05, + "loss": 1.6735, + "step": 37942 + }, + { + "epoch": 0.4743118577964449, + "grad_norm": 3.310542345046997, + "learning_rate": 1.2611434375731473e-05, + "loss": 1.6672, + "step": 37944 + }, + { + "epoch": 0.47433685842146056, + "grad_norm": 2.0352697372436523, + "learning_rate": 1.2610591959217941e-05, + "loss": 0.1127, + "step": 37946 + }, + { + "epoch": 0.47436185904647615, + "grad_norm": 3.4605906009674072, + "learning_rate": 1.260974952282251e-05, + "loss": 0.3981, + "step": 37948 + }, + { + "epoch": 0.4743868596714918, + "grad_norm": 4.837399959564209, + "learning_rate": 1.2608907066551597e-05, + "loss": 0.7805, + "step": 37950 + }, + { + "epoch": 0.4744118602965074, + "grad_norm": 6.37346076965332, + "learning_rate": 1.2608064590411616e-05, + "loss": 0.4951, + "step": 37952 + }, + { + "epoch": 0.47443686092152304, + "grad_norm": 2.409217119216919, + "learning_rate": 1.2607222094408985e-05, + "loss": 0.8044, + "step": 37954 + }, + { + "epoch": 0.4744618615465387, + "grad_norm": 4.68395471572876, + "learning_rate": 1.2606379578550116e-05, + "loss": 0.5709, + "step": 37956 + }, + { + "epoch": 0.4744868621715543, + "grad_norm": 4.683382034301758, + "learning_rate": 1.260553704284143e-05, + "loss": 1.0019, + "step": 37958 + }, + { + "epoch": 0.4745118627965699, + "grad_norm": 5.6604084968566895, + "learning_rate": 1.260469448728934e-05, + "loss": 1.4237, + "step": 37960 + }, + { + "epoch": 0.4745368634215855, + "grad_norm": 3.8427934646606445, + "learning_rate": 1.260385191190027e-05, + "loss": 0.9664, + "step": 37962 + }, + { + "epoch": 0.47456186404660117, + "grad_norm": 0.7689913511276245, + "learning_rate": 1.260300931668063e-05, + "loss": 0.0671, + "step": 37964 + }, + { + "epoch": 0.4745868646716168, + "grad_norm": 0.8710233569145203, + "learning_rate": 1.2602166701636838e-05, + "loss": 0.8051, + "step": 37966 + }, + { + "epoch": 0.4746118652966324, + "grad_norm": 2.6212692260742188, + "learning_rate": 1.2601324066775315e-05, + "loss": 0.6315, + "step": 37968 + }, + { + "epoch": 0.47463686592164805, + "grad_norm": 3.3979907035827637, + "learning_rate": 1.2600481412102475e-05, + "loss": 1.4731, + "step": 37970 + }, + { + "epoch": 0.47466186654666365, + "grad_norm": 0.0025930588599294424, + "learning_rate": 1.2599638737624732e-05, + "loss": 0.7594, + "step": 37972 + }, + { + "epoch": 0.4746868671716793, + "grad_norm": 0.0007138507207855582, + "learning_rate": 1.2598796043348513e-05, + "loss": 0.7048, + "step": 37974 + }, + { + "epoch": 0.47471186779669494, + "grad_norm": 6.974182605743408, + "learning_rate": 1.259795332928023e-05, + "loss": 1.1481, + "step": 37976 + }, + { + "epoch": 0.47473686842171053, + "grad_norm": 13.312907218933105, + "learning_rate": 1.25971105954263e-05, + "loss": 2.0865, + "step": 37978 + }, + { + "epoch": 0.4747618690467262, + "grad_norm": 3.9048521518707275, + "learning_rate": 1.2596267841793145e-05, + "loss": 0.7717, + "step": 37980 + }, + { + "epoch": 0.4747868696717418, + "grad_norm": 6.768165588378906, + "learning_rate": 1.2595425068387182e-05, + "loss": 1.6366, + "step": 37982 + }, + { + "epoch": 0.4748118702967574, + "grad_norm": 2.3745131492614746, + "learning_rate": 1.2594582275214823e-05, + "loss": 0.8062, + "step": 37984 + }, + { + "epoch": 0.47483687092177307, + "grad_norm": 3.343276262283325, + "learning_rate": 1.2593739462282498e-05, + "loss": 0.7042, + "step": 37986 + }, + { + "epoch": 0.47486187154678866, + "grad_norm": 2.6443517208099365, + "learning_rate": 1.2592896629596618e-05, + "loss": 1.1992, + "step": 37988 + }, + { + "epoch": 0.4748868721718043, + "grad_norm": 3.4985010623931885, + "learning_rate": 1.2592053777163606e-05, + "loss": 1.2892, + "step": 37990 + }, + { + "epoch": 0.4749118727968199, + "grad_norm": 4.944515228271484, + "learning_rate": 1.2591210904989877e-05, + "loss": 0.9918, + "step": 37992 + }, + { + "epoch": 0.47493687342183555, + "grad_norm": 0.0013604359701275826, + "learning_rate": 1.2590368013081853e-05, + "loss": 1.0353, + "step": 37994 + }, + { + "epoch": 0.4749618740468512, + "grad_norm": 5.330575466156006, + "learning_rate": 1.2589525101445952e-05, + "loss": 1.2534, + "step": 37996 + }, + { + "epoch": 0.4749868746718668, + "grad_norm": 7.073069095611572, + "learning_rate": 1.2588682170088591e-05, + "loss": 0.4933, + "step": 37998 + }, + { + "epoch": 0.47501187529688244, + "grad_norm": 1.128326654434204, + "learning_rate": 1.2587839219016198e-05, + "loss": 0.4573, + "step": 38000 + }, + { + "epoch": 0.475036875921898, + "grad_norm": 4.933067321777344, + "learning_rate": 1.2586996248235186e-05, + "loss": 2.1748, + "step": 38002 + }, + { + "epoch": 0.4750618765469137, + "grad_norm": 2.296895742416382, + "learning_rate": 1.2586153257751975e-05, + "loss": 0.8334, + "step": 38004 + }, + { + "epoch": 0.4750868771719293, + "grad_norm": 2.776726245880127, + "learning_rate": 1.2585310247572989e-05, + "loss": 0.3393, + "step": 38006 + }, + { + "epoch": 0.4751118777969449, + "grad_norm": 4.766984462738037, + "learning_rate": 1.2584467217704643e-05, + "loss": 0.9103, + "step": 38008 + }, + { + "epoch": 0.47513687842196056, + "grad_norm": 0.47471895813941956, + "learning_rate": 1.2583624168153358e-05, + "loss": 0.5354, + "step": 38010 + }, + { + "epoch": 0.47516187904697615, + "grad_norm": 0.946908712387085, + "learning_rate": 1.2582781098925561e-05, + "loss": 0.5777, + "step": 38012 + }, + { + "epoch": 0.4751868796719918, + "grad_norm": 14.835805892944336, + "learning_rate": 1.2581938010027669e-05, + "loss": 1.1667, + "step": 38014 + }, + { + "epoch": 0.47521188029700745, + "grad_norm": 2.0628201961517334, + "learning_rate": 1.2581094901466099e-05, + "loss": 1.4947, + "step": 38016 + }, + { + "epoch": 0.47523688092202304, + "grad_norm": 3.677166700363159, + "learning_rate": 1.2580251773247277e-05, + "loss": 0.3385, + "step": 38018 + }, + { + "epoch": 0.4752618815470387, + "grad_norm": 0.003098492743447423, + "learning_rate": 1.2579408625377622e-05, + "loss": 0.5945, + "step": 38020 + }, + { + "epoch": 0.4752868821720543, + "grad_norm": 2.930790424346924, + "learning_rate": 1.2578565457863554e-05, + "loss": 1.4793, + "step": 38022 + }, + { + "epoch": 0.47531188279706993, + "grad_norm": 3.8481228351593018, + "learning_rate": 1.2577722270711496e-05, + "loss": 0.6752, + "step": 38024 + }, + { + "epoch": 0.4753368834220856, + "grad_norm": 9.796368598937988, + "learning_rate": 1.2576879063927874e-05, + "loss": 1.3107, + "step": 38026 + }, + { + "epoch": 0.47536188404710117, + "grad_norm": 3.495349884033203, + "learning_rate": 1.2576035837519102e-05, + "loss": 0.8359, + "step": 38028 + }, + { + "epoch": 0.4753868846721168, + "grad_norm": 3.047905445098877, + "learning_rate": 1.2575192591491607e-05, + "loss": 1.4512, + "step": 38030 + }, + { + "epoch": 0.4754118852971324, + "grad_norm": 0.0006182030774652958, + "learning_rate": 1.2574349325851811e-05, + "loss": 0.939, + "step": 38032 + }, + { + "epoch": 0.47543688592214806, + "grad_norm": 4.151993274688721, + "learning_rate": 1.257350604060613e-05, + "loss": 1.0519, + "step": 38034 + }, + { + "epoch": 0.4754618865471637, + "grad_norm": 2.985651731491089, + "learning_rate": 1.2572662735760992e-05, + "loss": 1.58, + "step": 38036 + }, + { + "epoch": 0.4754868871721793, + "grad_norm": 0.0006932355463504791, + "learning_rate": 1.2571819411322819e-05, + "loss": 0.1214, + "step": 38038 + }, + { + "epoch": 0.47551188779719494, + "grad_norm": 3.025280714035034, + "learning_rate": 1.2570976067298035e-05, + "loss": 1.5452, + "step": 38040 + }, + { + "epoch": 0.47553688842221054, + "grad_norm": 0.0007738447748124599, + "learning_rate": 1.257013270369306e-05, + "loss": 0.0192, + "step": 38042 + }, + { + "epoch": 0.4755618890472262, + "grad_norm": 5.84833288192749, + "learning_rate": 1.2569289320514318e-05, + "loss": 1.5559, + "step": 38044 + }, + { + "epoch": 0.47558688967224183, + "grad_norm": 1.5044764280319214, + "learning_rate": 1.2568445917768228e-05, + "loss": 0.2282, + "step": 38046 + }, + { + "epoch": 0.4756118902972574, + "grad_norm": 3.207603693008423, + "learning_rate": 1.2567602495461221e-05, + "loss": 1.0956, + "step": 38048 + }, + { + "epoch": 0.47563689092227307, + "grad_norm": 3.83699107170105, + "learning_rate": 1.2566759053599716e-05, + "loss": 1.7245, + "step": 38050 + }, + { + "epoch": 0.47566189154728866, + "grad_norm": 0.4849100708961487, + "learning_rate": 1.256591559219014e-05, + "loss": 0.8948, + "step": 38052 + }, + { + "epoch": 0.4756868921723043, + "grad_norm": 0.0006287156138569117, + "learning_rate": 1.256507211123891e-05, + "loss": 0.7394, + "step": 38054 + }, + { + "epoch": 0.47571189279731996, + "grad_norm": 0.0035801834892481565, + "learning_rate": 1.2564228610752452e-05, + "loss": 0.1752, + "step": 38056 + }, + { + "epoch": 0.47573689342233555, + "grad_norm": 0.7187353372573853, + "learning_rate": 1.2563385090737193e-05, + "loss": 0.7528, + "step": 38058 + }, + { + "epoch": 0.4757618940473512, + "grad_norm": 6.839519023895264, + "learning_rate": 1.2562541551199556e-05, + "loss": 0.6052, + "step": 38060 + }, + { + "epoch": 0.4757868946723668, + "grad_norm": 3.0155603885650635, + "learning_rate": 1.2561697992145965e-05, + "loss": 0.649, + "step": 38062 + }, + { + "epoch": 0.47581189529738244, + "grad_norm": 0.0007116817869246006, + "learning_rate": 1.2560854413582843e-05, + "loss": 0.6337, + "step": 38064 + }, + { + "epoch": 0.4758368959223981, + "grad_norm": 2.536682605743408, + "learning_rate": 1.2560010815516619e-05, + "loss": 1.2174, + "step": 38066 + }, + { + "epoch": 0.4758618965474137, + "grad_norm": 3.706453323364258, + "learning_rate": 1.2559167197953714e-05, + "loss": 0.8565, + "step": 38068 + }, + { + "epoch": 0.4758868971724293, + "grad_norm": 3.998913526535034, + "learning_rate": 1.2558323560900551e-05, + "loss": 0.6511, + "step": 38070 + }, + { + "epoch": 0.4759118977974449, + "grad_norm": 3.2506818771362305, + "learning_rate": 1.2557479904363558e-05, + "loss": 0.4996, + "step": 38072 + }, + { + "epoch": 0.47593689842246056, + "grad_norm": 1.8679413795471191, + "learning_rate": 1.2556636228349162e-05, + "loss": 0.5229, + "step": 38074 + }, + { + "epoch": 0.4759618990474762, + "grad_norm": 0.12565386295318604, + "learning_rate": 1.2555792532863784e-05, + "loss": 0.4949, + "step": 38076 + }, + { + "epoch": 0.4759868996724918, + "grad_norm": 2.8009469509124756, + "learning_rate": 1.2554948817913854e-05, + "loss": 1.2445, + "step": 38078 + }, + { + "epoch": 0.47601190029750745, + "grad_norm": 4.219555854797363, + "learning_rate": 1.2554105083505793e-05, + "loss": 1.2271, + "step": 38080 + }, + { + "epoch": 0.47603690092252304, + "grad_norm": 1.9262932538986206, + "learning_rate": 1.2553261329646026e-05, + "loss": 1.088, + "step": 38082 + }, + { + "epoch": 0.4760619015475387, + "grad_norm": 3.257664203643799, + "learning_rate": 1.2552417556340988e-05, + "loss": 0.6402, + "step": 38084 + }, + { + "epoch": 0.47608690217255434, + "grad_norm": 3.8485612869262695, + "learning_rate": 1.2551573763597093e-05, + "loss": 1.2176, + "step": 38086 + }, + { + "epoch": 0.47611190279756993, + "grad_norm": 19.841615676879883, + "learning_rate": 1.2550729951420777e-05, + "loss": 1.1678, + "step": 38088 + }, + { + "epoch": 0.4761369034225856, + "grad_norm": 5.71876859664917, + "learning_rate": 1.2549886119818462e-05, + "loss": 0.5209, + "step": 38090 + }, + { + "epoch": 0.47616190404760117, + "grad_norm": 3.6691315174102783, + "learning_rate": 1.2549042268796574e-05, + "loss": 0.7163, + "step": 38092 + }, + { + "epoch": 0.4761869046726168, + "grad_norm": 5.00123929977417, + "learning_rate": 1.2548198398361542e-05, + "loss": 1.5096, + "step": 38094 + }, + { + "epoch": 0.47621190529763247, + "grad_norm": 2.0735902786254883, + "learning_rate": 1.254735450851979e-05, + "loss": 0.7635, + "step": 38096 + }, + { + "epoch": 0.47623690592264806, + "grad_norm": 2.758955240249634, + "learning_rate": 1.2546510599277743e-05, + "loss": 1.1664, + "step": 38098 + }, + { + "epoch": 0.4762619065476637, + "grad_norm": 4.16330099105835, + "learning_rate": 1.2545666670641835e-05, + "loss": 0.2616, + "step": 38100 + }, + { + "epoch": 0.4762869071726793, + "grad_norm": 2.223970413208008, + "learning_rate": 1.254482272261849e-05, + "loss": 1.0167, + "step": 38102 + }, + { + "epoch": 0.47631190779769494, + "grad_norm": 2.3873257637023926, + "learning_rate": 1.2543978755214137e-05, + "loss": 1.3597, + "step": 38104 + }, + { + "epoch": 0.4763369084227106, + "grad_norm": 6.679619789123535, + "learning_rate": 1.2543134768435196e-05, + "loss": 0.7605, + "step": 38106 + }, + { + "epoch": 0.4763619090477262, + "grad_norm": 3.759777069091797, + "learning_rate": 1.2542290762288105e-05, + "loss": 1.7063, + "step": 38108 + }, + { + "epoch": 0.47638690967274183, + "grad_norm": 0.12497880309820175, + "learning_rate": 1.2541446736779284e-05, + "loss": 0.0029, + "step": 38110 + }, + { + "epoch": 0.4764119102977574, + "grad_norm": 2.030268430709839, + "learning_rate": 1.2540602691915165e-05, + "loss": 0.5265, + "step": 38112 + }, + { + "epoch": 0.47643691092277307, + "grad_norm": 0.9580949544906616, + "learning_rate": 1.2539758627702176e-05, + "loss": 0.428, + "step": 38114 + }, + { + "epoch": 0.4764619115477887, + "grad_norm": 2.9961211681365967, + "learning_rate": 1.2538914544146743e-05, + "loss": 1.2159, + "step": 38116 + }, + { + "epoch": 0.4764869121728043, + "grad_norm": 2.209843158721924, + "learning_rate": 1.25380704412553e-05, + "loss": 0.6897, + "step": 38118 + }, + { + "epoch": 0.47651191279781996, + "grad_norm": 3.34397554397583, + "learning_rate": 1.2537226319034265e-05, + "loss": 0.8194, + "step": 38120 + }, + { + "epoch": 0.47653691342283555, + "grad_norm": 5.2745771408081055, + "learning_rate": 1.2536382177490079e-05, + "loss": 1.2458, + "step": 38122 + }, + { + "epoch": 0.4765619140478512, + "grad_norm": 2.9733777046203613, + "learning_rate": 1.253553801662916e-05, + "loss": 0.5511, + "step": 38124 + }, + { + "epoch": 0.47658691467286685, + "grad_norm": 6.92733907699585, + "learning_rate": 1.2534693836457946e-05, + "loss": 0.4638, + "step": 38126 + }, + { + "epoch": 0.47661191529788244, + "grad_norm": 0.0006003024755045772, + "learning_rate": 1.2533849636982858e-05, + "loss": 0.7889, + "step": 38128 + }, + { + "epoch": 0.4766369159228981, + "grad_norm": 1.7283003330230713, + "learning_rate": 1.2533005418210333e-05, + "loss": 0.2847, + "step": 38130 + }, + { + "epoch": 0.4766619165479137, + "grad_norm": 5.925468921661377, + "learning_rate": 1.2532161180146796e-05, + "loss": 1.434, + "step": 38132 + }, + { + "epoch": 0.4766869171729293, + "grad_norm": 4.525571823120117, + "learning_rate": 1.2531316922798677e-05, + "loss": 0.9534, + "step": 38134 + }, + { + "epoch": 0.476711917797945, + "grad_norm": 3.3407907485961914, + "learning_rate": 1.2530472646172407e-05, + "loss": 1.5593, + "step": 38136 + }, + { + "epoch": 0.47673691842296056, + "grad_norm": 4.491698265075684, + "learning_rate": 1.2529628350274417e-05, + "loss": 1.5275, + "step": 38138 + }, + { + "epoch": 0.4767619190479762, + "grad_norm": 2.241302251815796, + "learning_rate": 1.252878403511113e-05, + "loss": 0.9066, + "step": 38140 + }, + { + "epoch": 0.4767869196729918, + "grad_norm": 3.1323459148406982, + "learning_rate": 1.2527939700688985e-05, + "loss": 0.3308, + "step": 38142 + }, + { + "epoch": 0.47681192029800745, + "grad_norm": 26.52869987487793, + "learning_rate": 1.2527095347014411e-05, + "loss": 0.6744, + "step": 38144 + }, + { + "epoch": 0.4768369209230231, + "grad_norm": 2.650040626525879, + "learning_rate": 1.2526250974093833e-05, + "loss": 0.7641, + "step": 38146 + }, + { + "epoch": 0.4768619215480387, + "grad_norm": 4.488851547241211, + "learning_rate": 1.2525406581933683e-05, + "loss": 1.6195, + "step": 38148 + }, + { + "epoch": 0.47688692217305434, + "grad_norm": 3.3182406425476074, + "learning_rate": 1.2524562170540397e-05, + "loss": 0.5681, + "step": 38150 + }, + { + "epoch": 0.47691192279806993, + "grad_norm": 4.721751689910889, + "learning_rate": 1.25237177399204e-05, + "loss": 1.1728, + "step": 38152 + }, + { + "epoch": 0.4769369234230856, + "grad_norm": 3.7024178504943848, + "learning_rate": 1.2522873290080129e-05, + "loss": 0.7718, + "step": 38154 + }, + { + "epoch": 0.4769619240481012, + "grad_norm": 1.0446406602859497, + "learning_rate": 1.2522028821026008e-05, + "loss": 0.1151, + "step": 38156 + }, + { + "epoch": 0.4769869246731168, + "grad_norm": 2.707735300064087, + "learning_rate": 1.2521184332764475e-05, + "loss": 0.1855, + "step": 38158 + }, + { + "epoch": 0.47701192529813247, + "grad_norm": 3.5870728492736816, + "learning_rate": 1.2520339825301957e-05, + "loss": 1.5127, + "step": 38160 + }, + { + "epoch": 0.47703692592314806, + "grad_norm": 3.374541997909546, + "learning_rate": 1.2519495298644887e-05, + "loss": 0.6196, + "step": 38162 + }, + { + "epoch": 0.4770619265481637, + "grad_norm": 4.3865885734558105, + "learning_rate": 1.25186507527997e-05, + "loss": 0.9141, + "step": 38164 + }, + { + "epoch": 0.47708692717317935, + "grad_norm": 0.0038648874033242464, + "learning_rate": 1.251780618777282e-05, + "loss": 0.1942, + "step": 38166 + }, + { + "epoch": 0.47711192779819495, + "grad_norm": 5.846180438995361, + "learning_rate": 1.2516961603570688e-05, + "loss": 1.4579, + "step": 38168 + }, + { + "epoch": 0.4771369284232106, + "grad_norm": 2.203143835067749, + "learning_rate": 1.2516117000199732e-05, + "loss": 0.423, + "step": 38170 + }, + { + "epoch": 0.4771619290482262, + "grad_norm": 6.637715816497803, + "learning_rate": 1.2515272377666386e-05, + "loss": 0.7728, + "step": 38172 + }, + { + "epoch": 0.47718692967324183, + "grad_norm": 2.8208694458007812, + "learning_rate": 1.2514427735977077e-05, + "loss": 1.3824, + "step": 38174 + }, + { + "epoch": 0.4772119302982575, + "grad_norm": 0.0008693209965713322, + "learning_rate": 1.2513583075138245e-05, + "loss": 0.0, + "step": 38176 + }, + { + "epoch": 0.4772369309232731, + "grad_norm": 0.019637910649180412, + "learning_rate": 1.2512738395156318e-05, + "loss": 0.3679, + "step": 38178 + }, + { + "epoch": 0.4772619315482887, + "grad_norm": 5.549799919128418, + "learning_rate": 1.2511893696037734e-05, + "loss": 0.8149, + "step": 38180 + }, + { + "epoch": 0.4772869321733043, + "grad_norm": 0.0007743394235149026, + "learning_rate": 1.2511048977788923e-05, + "loss": 0.4273, + "step": 38182 + }, + { + "epoch": 0.47731193279831996, + "grad_norm": 3.464284896850586, + "learning_rate": 1.2510204240416313e-05, + "loss": 0.93, + "step": 38184 + }, + { + "epoch": 0.4773369334233356, + "grad_norm": 0.21811489760875702, + "learning_rate": 1.2509359483926344e-05, + "loss": 0.0954, + "step": 38186 + }, + { + "epoch": 0.4773619340483512, + "grad_norm": 2.868457317352295, + "learning_rate": 1.2508514708325448e-05, + "loss": 0.4813, + "step": 38188 + }, + { + "epoch": 0.47738693467336685, + "grad_norm": 2.840629816055298, + "learning_rate": 1.2507669913620061e-05, + "loss": 1.282, + "step": 38190 + }, + { + "epoch": 0.47741193529838244, + "grad_norm": 4.097174644470215, + "learning_rate": 1.2506825099816616e-05, + "loss": 1.7052, + "step": 38192 + }, + { + "epoch": 0.4774369359233981, + "grad_norm": 1.7402838468551636, + "learning_rate": 1.2505980266921541e-05, + "loss": 1.2112, + "step": 38194 + }, + { + "epoch": 0.47746193654841373, + "grad_norm": 2.314750909805298, + "learning_rate": 1.2505135414941279e-05, + "loss": 0.6991, + "step": 38196 + }, + { + "epoch": 0.4774869371734293, + "grad_norm": 1.4232546091079712, + "learning_rate": 1.2504290543882257e-05, + "loss": 0.8756, + "step": 38198 + }, + { + "epoch": 0.477511937798445, + "grad_norm": 5.911690711975098, + "learning_rate": 1.2503445653750909e-05, + "loss": 1.8924, + "step": 38200 + }, + { + "epoch": 0.47753693842346057, + "grad_norm": 3.670306921005249, + "learning_rate": 1.2502600744553681e-05, + "loss": 0.9115, + "step": 38202 + }, + { + "epoch": 0.4775619390484762, + "grad_norm": 0.0006568074459210038, + "learning_rate": 1.2501755816296994e-05, + "loss": 1.175, + "step": 38204 + }, + { + "epoch": 0.47758693967349186, + "grad_norm": 2.0937767028808594, + "learning_rate": 1.250091086898729e-05, + "loss": 0.2363, + "step": 38206 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 3.493274211883545, + "learning_rate": 1.2500065902631004e-05, + "loss": 0.146, + "step": 38208 + }, + { + "epoch": 0.4776369409235231, + "grad_norm": 3.0447239875793457, + "learning_rate": 1.2499220917234569e-05, + "loss": 1.7745, + "step": 38210 + }, + { + "epoch": 0.4776619415485387, + "grad_norm": 2.9882895946502686, + "learning_rate": 1.2498375912804417e-05, + "loss": 1.2802, + "step": 38212 + }, + { + "epoch": 0.47768694217355434, + "grad_norm": 4.604159355163574, + "learning_rate": 1.2497530889346989e-05, + "loss": 1.0678, + "step": 38214 + }, + { + "epoch": 0.47771194279857, + "grad_norm": 4.7749128341674805, + "learning_rate": 1.2496685846868724e-05, + "loss": 2.2019, + "step": 38216 + }, + { + "epoch": 0.4777369434235856, + "grad_norm": 4.366673946380615, + "learning_rate": 1.2495840785376048e-05, + "loss": 1.7041, + "step": 38218 + }, + { + "epoch": 0.47776194404860123, + "grad_norm": 2.546956777572632, + "learning_rate": 1.24949957048754e-05, + "loss": 0.4564, + "step": 38220 + }, + { + "epoch": 0.4777869446736168, + "grad_norm": 5.465414047241211, + "learning_rate": 1.2494150605373224e-05, + "loss": 1.1477, + "step": 38222 + }, + { + "epoch": 0.47781194529863247, + "grad_norm": 3.9832048416137695, + "learning_rate": 1.2493305486875944e-05, + "loss": 0.8413, + "step": 38224 + }, + { + "epoch": 0.4778369459236481, + "grad_norm": 5.869096279144287, + "learning_rate": 1.2492460349390003e-05, + "loss": 0.4741, + "step": 38226 + }, + { + "epoch": 0.4778619465486637, + "grad_norm": 5.326428413391113, + "learning_rate": 1.249161519292184e-05, + "loss": 1.8406, + "step": 38228 + }, + { + "epoch": 0.47788694717367936, + "grad_norm": 5.095834732055664, + "learning_rate": 1.2490770017477885e-05, + "loss": 0.8816, + "step": 38230 + }, + { + "epoch": 0.47791194779869495, + "grad_norm": 6.165434837341309, + "learning_rate": 1.2489924823064576e-05, + "loss": 2.0571, + "step": 38232 + }, + { + "epoch": 0.4779369484237106, + "grad_norm": 10.645547866821289, + "learning_rate": 1.2489079609688355e-05, + "loss": 0.941, + "step": 38234 + }, + { + "epoch": 0.47796194904872624, + "grad_norm": 3.9381043910980225, + "learning_rate": 1.2488234377355653e-05, + "loss": 0.317, + "step": 38236 + }, + { + "epoch": 0.47798694967374183, + "grad_norm": 1.7401151657104492, + "learning_rate": 1.248738912607291e-05, + "loss": 0.5216, + "step": 38238 + }, + { + "epoch": 0.4780119502987575, + "grad_norm": 3.5138888359069824, + "learning_rate": 1.2486543855846565e-05, + "loss": 0.7968, + "step": 38240 + }, + { + "epoch": 0.4780369509237731, + "grad_norm": 0.003625263925641775, + "learning_rate": 1.2485698566683056e-05, + "loss": 0.0967, + "step": 38242 + }, + { + "epoch": 0.4780619515487887, + "grad_norm": 2.8576667308807373, + "learning_rate": 1.2484853258588814e-05, + "loss": 0.1844, + "step": 38244 + }, + { + "epoch": 0.47808695217380437, + "grad_norm": 3.792638063430786, + "learning_rate": 1.2484007931570281e-05, + "loss": 1.455, + "step": 38246 + }, + { + "epoch": 0.47811195279881996, + "grad_norm": 4.241647720336914, + "learning_rate": 1.2483162585633896e-05, + "loss": 0.5789, + "step": 38248 + }, + { + "epoch": 0.4781369534238356, + "grad_norm": 6.762905120849609, + "learning_rate": 1.2482317220786096e-05, + "loss": 1.9207, + "step": 38250 + }, + { + "epoch": 0.4781619540488512, + "grad_norm": 3.5654754638671875, + "learning_rate": 1.2481471837033318e-05, + "loss": 1.0244, + "step": 38252 + }, + { + "epoch": 0.47818695467386685, + "grad_norm": 4.334292411804199, + "learning_rate": 1.2480626434382005e-05, + "loss": 1.0385, + "step": 38254 + }, + { + "epoch": 0.4782119552988825, + "grad_norm": 2.8879282474517822, + "learning_rate": 1.247978101283859e-05, + "loss": 1.2666, + "step": 38256 + }, + { + "epoch": 0.4782369559238981, + "grad_norm": 6.852921009063721, + "learning_rate": 1.2478935572409512e-05, + "loss": 0.9061, + "step": 38258 + }, + { + "epoch": 0.47826195654891374, + "grad_norm": 3.114283323287964, + "learning_rate": 1.247809011310121e-05, + "loss": 0.3523, + "step": 38260 + }, + { + "epoch": 0.47828695717392933, + "grad_norm": 2.3381595611572266, + "learning_rate": 1.2477244634920127e-05, + "loss": 0.1099, + "step": 38262 + }, + { + "epoch": 0.478311957798945, + "grad_norm": 2.5248095989227295, + "learning_rate": 1.2476399137872698e-05, + "loss": 0.626, + "step": 38264 + }, + { + "epoch": 0.4783369584239606, + "grad_norm": 0.0022564649116247892, + "learning_rate": 1.2475553621965364e-05, + "loss": 0.0534, + "step": 38266 + }, + { + "epoch": 0.4783619590489762, + "grad_norm": 4.959877967834473, + "learning_rate": 1.2474708087204564e-05, + "loss": 1.0048, + "step": 38268 + }, + { + "epoch": 0.47838695967399186, + "grad_norm": 0.7821082472801208, + "learning_rate": 1.2473862533596737e-05, + "loss": 0.3427, + "step": 38270 + }, + { + "epoch": 0.47841196029900745, + "grad_norm": 3.3768889904022217, + "learning_rate": 1.247301696114832e-05, + "loss": 0.5067, + "step": 38272 + }, + { + "epoch": 0.4784369609240231, + "grad_norm": 3.58050537109375, + "learning_rate": 1.247217136986576e-05, + "loss": 1.7589, + "step": 38274 + }, + { + "epoch": 0.47846196154903875, + "grad_norm": 5.39924430847168, + "learning_rate": 1.2471325759755488e-05, + "loss": 1.1746, + "step": 38276 + }, + { + "epoch": 0.47848696217405434, + "grad_norm": 2.2750093936920166, + "learning_rate": 1.2470480130823951e-05, + "loss": 1.0923, + "step": 38278 + }, + { + "epoch": 0.47851196279907, + "grad_norm": 4.75717306137085, + "learning_rate": 1.2469634483077589e-05, + "loss": 1.8377, + "step": 38280 + }, + { + "epoch": 0.4785369634240856, + "grad_norm": 2.1746058464050293, + "learning_rate": 1.2468788816522836e-05, + "loss": 0.9709, + "step": 38282 + }, + { + "epoch": 0.47856196404910123, + "grad_norm": 0.9804593324661255, + "learning_rate": 1.2467943131166137e-05, + "loss": 0.0339, + "step": 38284 + }, + { + "epoch": 0.4785869646741169, + "grad_norm": 3.8888003826141357, + "learning_rate": 1.2467097427013935e-05, + "loss": 0.3022, + "step": 38286 + }, + { + "epoch": 0.47861196529913247, + "grad_norm": 2.2886598110198975, + "learning_rate": 1.2466251704072665e-05, + "loss": 0.5514, + "step": 38288 + }, + { + "epoch": 0.4786369659241481, + "grad_norm": 4.476321697235107, + "learning_rate": 1.2465405962348771e-05, + "loss": 0.1916, + "step": 38290 + }, + { + "epoch": 0.4786619665491637, + "grad_norm": 3.5977423191070557, + "learning_rate": 1.2464560201848695e-05, + "loss": 1.2334, + "step": 38292 + }, + { + "epoch": 0.47868696717417936, + "grad_norm": 7.716445446014404, + "learning_rate": 1.2463714422578878e-05, + "loss": 0.9078, + "step": 38294 + }, + { + "epoch": 0.478711967799195, + "grad_norm": 3.157076835632324, + "learning_rate": 1.2462868624545759e-05, + "loss": 1.7457, + "step": 38296 + }, + { + "epoch": 0.4787369684242106, + "grad_norm": 3.491506576538086, + "learning_rate": 1.2462022807755781e-05, + "loss": 1.3031, + "step": 38298 + }, + { + "epoch": 0.47876196904922624, + "grad_norm": 9.788898468017578, + "learning_rate": 1.2461176972215384e-05, + "loss": 1.0026, + "step": 38300 + }, + { + "epoch": 0.47878696967424184, + "grad_norm": 1.079921841621399, + "learning_rate": 1.2460331117931013e-05, + "loss": 1.0615, + "step": 38302 + }, + { + "epoch": 0.4788119702992575, + "grad_norm": 0.011906307190656662, + "learning_rate": 1.2459485244909106e-05, + "loss": 0.9858, + "step": 38304 + }, + { + "epoch": 0.47883697092427313, + "grad_norm": 4.691319465637207, + "learning_rate": 1.2458639353156113e-05, + "loss": 0.7704, + "step": 38306 + }, + { + "epoch": 0.4788619715492887, + "grad_norm": 3.10378098487854, + "learning_rate": 1.2457793442678463e-05, + "loss": 0.6596, + "step": 38308 + }, + { + "epoch": 0.47888697217430437, + "grad_norm": 1.4123811721801758, + "learning_rate": 1.2456947513482609e-05, + "loss": 0.21, + "step": 38310 + }, + { + "epoch": 0.47891197279931996, + "grad_norm": 2.940823793411255, + "learning_rate": 1.245610156557499e-05, + "loss": 0.2935, + "step": 38312 + }, + { + "epoch": 0.4789369734243356, + "grad_norm": 1.7854119539260864, + "learning_rate": 1.2455255598962048e-05, + "loss": 1.0126, + "step": 38314 + }, + { + "epoch": 0.47896197404935126, + "grad_norm": 3.085838794708252, + "learning_rate": 1.2454409613650227e-05, + "loss": 0.4752, + "step": 38316 + }, + { + "epoch": 0.47898697467436685, + "grad_norm": 2.3773672580718994, + "learning_rate": 1.2453563609645971e-05, + "loss": 0.7786, + "step": 38318 + }, + { + "epoch": 0.4790119752993825, + "grad_norm": 11.970732688903809, + "learning_rate": 1.2452717586955721e-05, + "loss": 2.1919, + "step": 38320 + }, + { + "epoch": 0.4790369759243981, + "grad_norm": 3.9438648223876953, + "learning_rate": 1.2451871545585921e-05, + "loss": 1.4636, + "step": 38322 + }, + { + "epoch": 0.47906197654941374, + "grad_norm": 4.730101585388184, + "learning_rate": 1.245102548554301e-05, + "loss": 1.4679, + "step": 38324 + }, + { + "epoch": 0.4790869771744294, + "grad_norm": 4.376184463500977, + "learning_rate": 1.245017940683344e-05, + "loss": 1.0944, + "step": 38326 + }, + { + "epoch": 0.479111977799445, + "grad_norm": 3.53483247756958, + "learning_rate": 1.244933330946365e-05, + "loss": 1.6894, + "step": 38328 + }, + { + "epoch": 0.4791369784244606, + "grad_norm": 2.7973663806915283, + "learning_rate": 1.2448487193440083e-05, + "loss": 1.3389, + "step": 38330 + }, + { + "epoch": 0.4791619790494762, + "grad_norm": 4.9394755363464355, + "learning_rate": 1.2447641058769184e-05, + "loss": 1.0001, + "step": 38332 + }, + { + "epoch": 0.47918697967449186, + "grad_norm": 3.6381642818450928, + "learning_rate": 1.2446794905457395e-05, + "loss": 0.8384, + "step": 38334 + }, + { + "epoch": 0.4792119802995075, + "grad_norm": 3.134472370147705, + "learning_rate": 1.2445948733511164e-05, + "loss": 1.2808, + "step": 38336 + }, + { + "epoch": 0.4792369809245231, + "grad_norm": 2.386200189590454, + "learning_rate": 1.2445102542936931e-05, + "loss": 1.1848, + "step": 38338 + }, + { + "epoch": 0.47926198154953875, + "grad_norm": 4.609679222106934, + "learning_rate": 1.2444256333741144e-05, + "loss": 1.4158, + "step": 38340 + }, + { + "epoch": 0.47928698217455434, + "grad_norm": 4.727839946746826, + "learning_rate": 1.2443410105930246e-05, + "loss": 0.2597, + "step": 38342 + }, + { + "epoch": 0.47931198279957, + "grad_norm": 3.841245412826538, + "learning_rate": 1.2442563859510683e-05, + "loss": 1.3877, + "step": 38344 + }, + { + "epoch": 0.47933698342458564, + "grad_norm": 3.9793193340301514, + "learning_rate": 1.2441717594488903e-05, + "loss": 0.335, + "step": 38346 + }, + { + "epoch": 0.47936198404960123, + "grad_norm": 0.0014786168467253447, + "learning_rate": 1.2440871310871341e-05, + "loss": 0.9188, + "step": 38348 + }, + { + "epoch": 0.4793869846746169, + "grad_norm": 0.0007144180126488209, + "learning_rate": 1.2440025008664449e-05, + "loss": 0.0096, + "step": 38350 + }, + { + "epoch": 0.47941198529963247, + "grad_norm": 1.62134850025177, + "learning_rate": 1.2439178687874673e-05, + "loss": 1.703, + "step": 38352 + }, + { + "epoch": 0.4794369859246481, + "grad_norm": 0.0011330406414344907, + "learning_rate": 1.2438332348508457e-05, + "loss": 0.0947, + "step": 38354 + }, + { + "epoch": 0.47946198654966377, + "grad_norm": 2.546856164932251, + "learning_rate": 1.2437485990572246e-05, + "loss": 0.927, + "step": 38356 + }, + { + "epoch": 0.47948698717467936, + "grad_norm": 0.025131570175290108, + "learning_rate": 1.2436639614072487e-05, + "loss": 1.2105, + "step": 38358 + }, + { + "epoch": 0.479511987799695, + "grad_norm": 3.5363378524780273, + "learning_rate": 1.2435793219015628e-05, + "loss": 1.1065, + "step": 38360 + }, + { + "epoch": 0.4795369884247106, + "grad_norm": 2.9731476306915283, + "learning_rate": 1.243494680540811e-05, + "loss": 1.4179, + "step": 38362 + }, + { + "epoch": 0.47956198904972624, + "grad_norm": 5.430442810058594, + "learning_rate": 1.2434100373256376e-05, + "loss": 2.2123, + "step": 38364 + }, + { + "epoch": 0.4795869896747419, + "grad_norm": 1.4627959728240967, + "learning_rate": 1.2433253922566887e-05, + "loss": 0.7605, + "step": 38366 + }, + { + "epoch": 0.4796119902997575, + "grad_norm": 3.3477911949157715, + "learning_rate": 1.2432407453346076e-05, + "loss": 0.8133, + "step": 38368 + }, + { + "epoch": 0.47963699092477313, + "grad_norm": 2.48740816116333, + "learning_rate": 1.2431560965600392e-05, + "loss": 1.3121, + "step": 38370 + }, + { + "epoch": 0.4796619915497887, + "grad_norm": 3.9881138801574707, + "learning_rate": 1.2430714459336288e-05, + "loss": 1.3921, + "step": 38372 + }, + { + "epoch": 0.47968699217480437, + "grad_norm": 3.804670810699463, + "learning_rate": 1.2429867934560203e-05, + "loss": 1.1735, + "step": 38374 + }, + { + "epoch": 0.47971199279982, + "grad_norm": 1.9916353225708008, + "learning_rate": 1.2429021391278586e-05, + "loss": 0.4597, + "step": 38376 + }, + { + "epoch": 0.4797369934248356, + "grad_norm": 3.669015884399414, + "learning_rate": 1.242817482949789e-05, + "loss": 0.6996, + "step": 38378 + }, + { + "epoch": 0.47976199404985126, + "grad_norm": 2.598898410797119, + "learning_rate": 1.2427328249224554e-05, + "loss": 0.6277, + "step": 38380 + }, + { + "epoch": 0.47978699467486685, + "grad_norm": 0.0009207144030369818, + "learning_rate": 1.2426481650465032e-05, + "loss": 0.3976, + "step": 38382 + }, + { + "epoch": 0.4798119952998825, + "grad_norm": 0.39852991700172424, + "learning_rate": 1.2425635033225768e-05, + "loss": 0.0249, + "step": 38384 + }, + { + "epoch": 0.47983699592489815, + "grad_norm": 2.0108749866485596, + "learning_rate": 1.2424788397513212e-05, + "loss": 1.7164, + "step": 38386 + }, + { + "epoch": 0.47986199654991374, + "grad_norm": 0.0028704607393592596, + "learning_rate": 1.2423941743333807e-05, + "loss": 0.055, + "step": 38388 + }, + { + "epoch": 0.4798869971749294, + "grad_norm": 0.0039650434628129005, + "learning_rate": 1.2423095070694004e-05, + "loss": 0.0014, + "step": 38390 + }, + { + "epoch": 0.479911997799945, + "grad_norm": 6.124033451080322, + "learning_rate": 1.2422248379600257e-05, + "loss": 0.1658, + "step": 38392 + }, + { + "epoch": 0.4799369984249606, + "grad_norm": 1.7382889986038208, + "learning_rate": 1.2421401670059005e-05, + "loss": 0.04, + "step": 38394 + }, + { + "epoch": 0.4799619990499763, + "grad_norm": 2.2378172874450684, + "learning_rate": 1.2420554942076702e-05, + "loss": 0.9191, + "step": 38396 + }, + { + "epoch": 0.47998699967499187, + "grad_norm": 4.4269561767578125, + "learning_rate": 1.2419708195659794e-05, + "loss": 0.8436, + "step": 38398 + }, + { + "epoch": 0.4800120003000075, + "grad_norm": 1.7246521711349487, + "learning_rate": 1.241886143081473e-05, + "loss": 1.0546, + "step": 38400 + }, + { + "epoch": 0.4800370009250231, + "grad_norm": 1.8465819358825684, + "learning_rate": 1.2418014647547957e-05, + "loss": 0.7677, + "step": 38402 + }, + { + "epoch": 0.48006200155003875, + "grad_norm": 7.191723823547363, + "learning_rate": 1.2417167845865934e-05, + "loss": 1.8426, + "step": 38404 + }, + { + "epoch": 0.4800870021750544, + "grad_norm": 2.8198001384735107, + "learning_rate": 1.2416321025775098e-05, + "loss": 0.8285, + "step": 38406 + }, + { + "epoch": 0.48011200280007, + "grad_norm": 3.8043253421783447, + "learning_rate": 1.2415474187281902e-05, + "loss": 1.3973, + "step": 38408 + }, + { + "epoch": 0.48013700342508564, + "grad_norm": 0.0053267702460289, + "learning_rate": 1.2414627330392794e-05, + "loss": 0.0001, + "step": 38410 + }, + { + "epoch": 0.48016200405010123, + "grad_norm": 3.272334337234497, + "learning_rate": 1.2413780455114233e-05, + "loss": 0.8986, + "step": 38412 + }, + { + "epoch": 0.4801870046751169, + "grad_norm": 2.915031909942627, + "learning_rate": 1.2412933561452653e-05, + "loss": 0.8149, + "step": 38414 + }, + { + "epoch": 0.4802120053001325, + "grad_norm": 5.406810283660889, + "learning_rate": 1.2412086649414517e-05, + "loss": 1.2376, + "step": 38416 + }, + { + "epoch": 0.4802370059251481, + "grad_norm": 5.0141801834106445, + "learning_rate": 1.241123971900627e-05, + "loss": 1.1209, + "step": 38418 + }, + { + "epoch": 0.48026200655016377, + "grad_norm": 3.586026191711426, + "learning_rate": 1.2410392770234361e-05, + "loss": 0.633, + "step": 38420 + }, + { + "epoch": 0.48028700717517936, + "grad_norm": 1.791496753692627, + "learning_rate": 1.2409545803105243e-05, + "loss": 1.2823, + "step": 38422 + }, + { + "epoch": 0.480312007800195, + "grad_norm": 3.5323476791381836, + "learning_rate": 1.2408698817625364e-05, + "loss": 1.8898, + "step": 38424 + }, + { + "epoch": 0.48033700842521065, + "grad_norm": 8.981853485107422, + "learning_rate": 1.2407851813801174e-05, + "loss": 0.3527, + "step": 38426 + }, + { + "epoch": 0.48036200905022625, + "grad_norm": 4.371600151062012, + "learning_rate": 1.2407004791639126e-05, + "loss": 1.3066, + "step": 38428 + }, + { + "epoch": 0.4803870096752419, + "grad_norm": 7.854085922241211, + "learning_rate": 1.2406157751145674e-05, + "loss": 1.681, + "step": 38430 + }, + { + "epoch": 0.4804120103002575, + "grad_norm": 3.1485486030578613, + "learning_rate": 1.2405310692327259e-05, + "loss": 0.5369, + "step": 38432 + }, + { + "epoch": 0.48043701092527313, + "grad_norm": 3.3596746921539307, + "learning_rate": 1.2404463615190341e-05, + "loss": 1.5138, + "step": 38434 + }, + { + "epoch": 0.4804620115502888, + "grad_norm": 7.233487606048584, + "learning_rate": 1.2403616519741366e-05, + "loss": 0.4346, + "step": 38436 + }, + { + "epoch": 0.4804870121753044, + "grad_norm": 2.198430299758911, + "learning_rate": 1.2402769405986787e-05, + "loss": 0.7118, + "step": 38438 + }, + { + "epoch": 0.48051201280032, + "grad_norm": 0.0005128337070345879, + "learning_rate": 1.240192227393306e-05, + "loss": 0.0, + "step": 38440 + }, + { + "epoch": 0.4805370134253356, + "grad_norm": 0.01222472358494997, + "learning_rate": 1.240107512358663e-05, + "loss": 0.0002, + "step": 38442 + }, + { + "epoch": 0.48056201405035126, + "grad_norm": 12.613787651062012, + "learning_rate": 1.2400227954953951e-05, + "loss": 3.3439, + "step": 38444 + }, + { + "epoch": 0.4805870146753669, + "grad_norm": 1.0622504949569702, + "learning_rate": 1.2399380768041477e-05, + "loss": 0.0428, + "step": 38446 + }, + { + "epoch": 0.4806120153003825, + "grad_norm": 0.0016532234149053693, + "learning_rate": 1.2398533562855657e-05, + "loss": 0.532, + "step": 38448 + }, + { + "epoch": 0.48063701592539815, + "grad_norm": 2.624846935272217, + "learning_rate": 1.2397686339402946e-05, + "loss": 0.4215, + "step": 38450 + }, + { + "epoch": 0.48066201655041374, + "grad_norm": 4.626823425292969, + "learning_rate": 1.2396839097689792e-05, + "loss": 0.7777, + "step": 38452 + }, + { + "epoch": 0.4806870171754294, + "grad_norm": 6.8257856369018555, + "learning_rate": 1.2395991837722654e-05, + "loss": 1.3404, + "step": 38454 + }, + { + "epoch": 0.48071201780044504, + "grad_norm": 3.964232921600342, + "learning_rate": 1.239514455950798e-05, + "loss": 0.6743, + "step": 38456 + }, + { + "epoch": 0.4807370184254606, + "grad_norm": 3.216568946838379, + "learning_rate": 1.2394297263052222e-05, + "loss": 0.7798, + "step": 38458 + }, + { + "epoch": 0.4807620190504763, + "grad_norm": 1.9548603296279907, + "learning_rate": 1.2393449948361835e-05, + "loss": 0.1747, + "step": 38460 + }, + { + "epoch": 0.48078701967549187, + "grad_norm": 1.7630494832992554, + "learning_rate": 1.2392602615443275e-05, + "loss": 0.2967, + "step": 38462 + }, + { + "epoch": 0.4808120203005075, + "grad_norm": 0.0006579715409316123, + "learning_rate": 1.2391755264302986e-05, + "loss": 0.0, + "step": 38464 + }, + { + "epoch": 0.48083702092552316, + "grad_norm": 3.139652729034424, + "learning_rate": 1.239090789494743e-05, + "loss": 1.2516, + "step": 38466 + }, + { + "epoch": 0.48086202155053875, + "grad_norm": 0.0008657341822981834, + "learning_rate": 1.239006050738306e-05, + "loss": 0.3101, + "step": 38468 + }, + { + "epoch": 0.4808870221755544, + "grad_norm": 0.9802165627479553, + "learning_rate": 1.2389213101616326e-05, + "loss": 0.5831, + "step": 38470 + }, + { + "epoch": 0.48091202280057, + "grad_norm": 7.254119873046875, + "learning_rate": 1.2388365677653682e-05, + "loss": 1.5247, + "step": 38472 + }, + { + "epoch": 0.48093702342558564, + "grad_norm": 1.4947422742843628, + "learning_rate": 1.238751823550158e-05, + "loss": 0.2051, + "step": 38474 + }, + { + "epoch": 0.4809620240506013, + "grad_norm": 1.6109733581542969, + "learning_rate": 1.238667077516648e-05, + "loss": 0.0989, + "step": 38476 + }, + { + "epoch": 0.4809870246756169, + "grad_norm": 3.926982879638672, + "learning_rate": 1.2385823296654833e-05, + "loss": 0.771, + "step": 38478 + }, + { + "epoch": 0.48101202530063253, + "grad_norm": 5.468361854553223, + "learning_rate": 1.2384975799973092e-05, + "loss": 1.8498, + "step": 38480 + }, + { + "epoch": 0.4810370259256481, + "grad_norm": 3.379262924194336, + "learning_rate": 1.2384128285127713e-05, + "loss": 0.6119, + "step": 38482 + }, + { + "epoch": 0.48106202655066377, + "grad_norm": 2.896657943725586, + "learning_rate": 1.2383280752125151e-05, + "loss": 0.5627, + "step": 38484 + }, + { + "epoch": 0.4810870271756794, + "grad_norm": 0.0005797014455311, + "learning_rate": 1.2382433200971859e-05, + "loss": 0.879, + "step": 38486 + }, + { + "epoch": 0.481112027800695, + "grad_norm": 4.312134265899658, + "learning_rate": 1.2381585631674291e-05, + "loss": 1.1141, + "step": 38488 + }, + { + "epoch": 0.48113702842571066, + "grad_norm": 0.0005178253049962223, + "learning_rate": 1.2380738044238907e-05, + "loss": 0.584, + "step": 38490 + }, + { + "epoch": 0.48116202905072625, + "grad_norm": 1.3063205480575562, + "learning_rate": 1.2379890438672155e-05, + "loss": 0.0586, + "step": 38492 + }, + { + "epoch": 0.4811870296757419, + "grad_norm": 2.0052475929260254, + "learning_rate": 1.2379042814980498e-05, + "loss": 0.8077, + "step": 38494 + }, + { + "epoch": 0.48121203030075754, + "grad_norm": 3.0068023204803467, + "learning_rate": 1.2378195173170386e-05, + "loss": 0.6219, + "step": 38496 + }, + { + "epoch": 0.48123703092577313, + "grad_norm": 4.362217426300049, + "learning_rate": 1.2377347513248274e-05, + "loss": 1.2182, + "step": 38498 + }, + { + "epoch": 0.4812620315507888, + "grad_norm": 8.177752494812012, + "learning_rate": 1.2376499835220622e-05, + "loss": 1.1547, + "step": 38500 + }, + { + "epoch": 0.4812870321758044, + "grad_norm": 1.2006608247756958, + "learning_rate": 1.2375652139093882e-05, + "loss": 0.392, + "step": 38502 + }, + { + "epoch": 0.48131203280082, + "grad_norm": 9.553483009338379, + "learning_rate": 1.237480442487451e-05, + "loss": 1.5451, + "step": 38504 + }, + { + "epoch": 0.48133703342583567, + "grad_norm": 3.8062474727630615, + "learning_rate": 1.2373956692568963e-05, + "loss": 2.1046, + "step": 38506 + }, + { + "epoch": 0.48136203405085126, + "grad_norm": 0.0008678306476213038, + "learning_rate": 1.2373108942183701e-05, + "loss": 0.2396, + "step": 38508 + }, + { + "epoch": 0.4813870346758669, + "grad_norm": 3.0803751945495605, + "learning_rate": 1.2372261173725175e-05, + "loss": 0.2443, + "step": 38510 + }, + { + "epoch": 0.4814120353008825, + "grad_norm": 0.09901231527328491, + "learning_rate": 1.2371413387199845e-05, + "loss": 0.0035, + "step": 38512 + }, + { + "epoch": 0.48143703592589815, + "grad_norm": 0.6195866465568542, + "learning_rate": 1.2370565582614163e-05, + "loss": 0.2714, + "step": 38514 + }, + { + "epoch": 0.4814620365509138, + "grad_norm": 3.099635601043701, + "learning_rate": 1.2369717759974589e-05, + "loss": 0.7884, + "step": 38516 + }, + { + "epoch": 0.4814870371759294, + "grad_norm": 0.25981828570365906, + "learning_rate": 1.2368869919287581e-05, + "loss": 0.6504, + "step": 38518 + }, + { + "epoch": 0.48151203780094504, + "grad_norm": 5.380058765411377, + "learning_rate": 1.2368022060559594e-05, + "loss": 1.9171, + "step": 38520 + }, + { + "epoch": 0.48153703842596063, + "grad_norm": 0.6437296867370605, + "learning_rate": 1.2367174183797088e-05, + "loss": 0.0696, + "step": 38522 + }, + { + "epoch": 0.4815620390509763, + "grad_norm": 6.154825210571289, + "learning_rate": 1.2366326289006517e-05, + "loss": 0.5464, + "step": 38524 + }, + { + "epoch": 0.4815870396759919, + "grad_norm": 5.077880382537842, + "learning_rate": 1.2365478376194337e-05, + "loss": 0.1509, + "step": 38526 + }, + { + "epoch": 0.4816120403010075, + "grad_norm": 9.291412353515625, + "learning_rate": 1.236463044536701e-05, + "loss": 0.5859, + "step": 38528 + }, + { + "epoch": 0.48163704092602316, + "grad_norm": 2.6123969554901123, + "learning_rate": 1.2363782496530992e-05, + "loss": 0.9571, + "step": 38530 + }, + { + "epoch": 0.48166204155103876, + "grad_norm": 2.4442267417907715, + "learning_rate": 1.2362934529692741e-05, + "loss": 1.2441, + "step": 38532 + }, + { + "epoch": 0.4816870421760544, + "grad_norm": 3.023571491241455, + "learning_rate": 1.2362086544858714e-05, + "loss": 0.1884, + "step": 38534 + }, + { + "epoch": 0.48171204280107005, + "grad_norm": 0.00030991286621429026, + "learning_rate": 1.2361238542035372e-05, + "loss": 0.0215, + "step": 38536 + }, + { + "epoch": 0.48173704342608564, + "grad_norm": 2.3135647773742676, + "learning_rate": 1.2360390521229172e-05, + "loss": 0.3137, + "step": 38538 + }, + { + "epoch": 0.4817620440511013, + "grad_norm": 4.493278980255127, + "learning_rate": 1.2359542482446567e-05, + "loss": 1.6335, + "step": 38540 + }, + { + "epoch": 0.4817870446761169, + "grad_norm": 5.9416913986206055, + "learning_rate": 1.2358694425694022e-05, + "loss": 1.646, + "step": 38542 + }, + { + "epoch": 0.48181204530113253, + "grad_norm": 2.186558485031128, + "learning_rate": 1.2357846350977996e-05, + "loss": 1.1172, + "step": 38544 + }, + { + "epoch": 0.4818370459261482, + "grad_norm": 4.388662338256836, + "learning_rate": 1.2356998258304942e-05, + "loss": 0.8605, + "step": 38546 + }, + { + "epoch": 0.48186204655116377, + "grad_norm": 3.076875925064087, + "learning_rate": 1.2356150147681328e-05, + "loss": 0.2339, + "step": 38548 + }, + { + "epoch": 0.4818870471761794, + "grad_norm": 4.741358280181885, + "learning_rate": 1.2355302019113602e-05, + "loss": 0.9597, + "step": 38550 + }, + { + "epoch": 0.481912047801195, + "grad_norm": 0.0005889471503905952, + "learning_rate": 1.235445387260823e-05, + "loss": 0.0, + "step": 38552 + }, + { + "epoch": 0.48193704842621066, + "grad_norm": 3.0466766357421875, + "learning_rate": 1.2353605708171672e-05, + "loss": 0.6681, + "step": 38554 + }, + { + "epoch": 0.4819620490512263, + "grad_norm": 6.052707672119141, + "learning_rate": 1.2352757525810383e-05, + "loss": 1.1641, + "step": 38556 + }, + { + "epoch": 0.4819870496762419, + "grad_norm": 2.2910938262939453, + "learning_rate": 1.2351909325530829e-05, + "loss": 0.6127, + "step": 38558 + }, + { + "epoch": 0.48201205030125754, + "grad_norm": 0.0005123642622493207, + "learning_rate": 1.2351061107339464e-05, + "loss": 0.6348, + "step": 38560 + }, + { + "epoch": 0.48203705092627314, + "grad_norm": 5.909537315368652, + "learning_rate": 1.2350212871242753e-05, + "loss": 1.2169, + "step": 38562 + }, + { + "epoch": 0.4820620515512888, + "grad_norm": 5.627435207366943, + "learning_rate": 1.2349364617247151e-05, + "loss": 1.5468, + "step": 38564 + }, + { + "epoch": 0.48208705217630443, + "grad_norm": 3.1207308769226074, + "learning_rate": 1.2348516345359116e-05, + "loss": 0.7407, + "step": 38566 + }, + { + "epoch": 0.48211205280132, + "grad_norm": 0.48069220781326294, + "learning_rate": 1.234766805558512e-05, + "loss": 0.2755, + "step": 38568 + }, + { + "epoch": 0.48213705342633567, + "grad_norm": 0.0006747801671735942, + "learning_rate": 1.234681974793161e-05, + "loss": 0.0082, + "step": 38570 + }, + { + "epoch": 0.48216205405135126, + "grad_norm": 3.307586669921875, + "learning_rate": 1.2345971422405057e-05, + "loss": 0.8844, + "step": 38572 + }, + { + "epoch": 0.4821870546763669, + "grad_norm": 2.9569389820098877, + "learning_rate": 1.2345123079011919e-05, + "loss": 0.6315, + "step": 38574 + }, + { + "epoch": 0.48221205530138256, + "grad_norm": 5.412867069244385, + "learning_rate": 1.234427471775865e-05, + "loss": 1.1356, + "step": 38576 + }, + { + "epoch": 0.48223705592639815, + "grad_norm": 4.734350681304932, + "learning_rate": 1.2343426338651716e-05, + "loss": 0.7057, + "step": 38578 + }, + { + "epoch": 0.4822620565514138, + "grad_norm": 2.63734769821167, + "learning_rate": 1.2342577941697585e-05, + "loss": 0.2593, + "step": 38580 + }, + { + "epoch": 0.4822870571764294, + "grad_norm": 3.4249227046966553, + "learning_rate": 1.2341729526902706e-05, + "loss": 1.2695, + "step": 38582 + }, + { + "epoch": 0.48231205780144504, + "grad_norm": 0.0040348367765545845, + "learning_rate": 1.2340881094273546e-05, + "loss": 0.0001, + "step": 38584 + }, + { + "epoch": 0.4823370584264607, + "grad_norm": 0.006622389890253544, + "learning_rate": 1.2340032643816568e-05, + "loss": 1.2808, + "step": 38586 + }, + { + "epoch": 0.4823620590514763, + "grad_norm": 3.494317054748535, + "learning_rate": 1.2339184175538235e-05, + "loss": 1.0454, + "step": 38588 + }, + { + "epoch": 0.4823870596764919, + "grad_norm": 3.3791022300720215, + "learning_rate": 1.2338335689445003e-05, + "loss": 0.731, + "step": 38590 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 5.7731852531433105, + "learning_rate": 1.2337487185543336e-05, + "loss": 0.3078, + "step": 38592 + }, + { + "epoch": 0.48243706092652316, + "grad_norm": 4.67320442199707, + "learning_rate": 1.23366386638397e-05, + "loss": 1.6586, + "step": 38594 + }, + { + "epoch": 0.4824620615515388, + "grad_norm": 2.64823317527771, + "learning_rate": 1.2335790124340551e-05, + "loss": 0.5603, + "step": 38596 + }, + { + "epoch": 0.4824870621765544, + "grad_norm": 2.3434951305389404, + "learning_rate": 1.233494156705236e-05, + "loss": 1.6711, + "step": 38598 + }, + { + "epoch": 0.48251206280157005, + "grad_norm": 6.793612480163574, + "learning_rate": 1.2334092991981581e-05, + "loss": 1.1954, + "step": 38600 + }, + { + "epoch": 0.48253706342658564, + "grad_norm": 4.376518249511719, + "learning_rate": 1.233324439913468e-05, + "loss": 0.7087, + "step": 38602 + }, + { + "epoch": 0.4825620640516013, + "grad_norm": 1.6959614753723145, + "learning_rate": 1.2332395788518116e-05, + "loss": 0.3229, + "step": 38604 + }, + { + "epoch": 0.48258706467661694, + "grad_norm": 1.9911571741104126, + "learning_rate": 1.2331547160138359e-05, + "loss": 0.4288, + "step": 38606 + }, + { + "epoch": 0.48261206530163253, + "grad_norm": 3.5653133392333984, + "learning_rate": 1.2330698514001871e-05, + "loss": 0.9392, + "step": 38608 + }, + { + "epoch": 0.4826370659266482, + "grad_norm": 2.202956199645996, + "learning_rate": 1.232984985011511e-05, + "loss": 0.6217, + "step": 38610 + }, + { + "epoch": 0.48266206655166377, + "grad_norm": 3.726270914077759, + "learning_rate": 1.2329001168484543e-05, + "loss": 0.4466, + "step": 38612 + }, + { + "epoch": 0.4826870671766794, + "grad_norm": 3.5057249069213867, + "learning_rate": 1.2328152469116632e-05, + "loss": 1.4216, + "step": 38614 + }, + { + "epoch": 0.48271206780169507, + "grad_norm": 4.284947872161865, + "learning_rate": 1.2327303752017839e-05, + "loss": 0.9984, + "step": 38616 + }, + { + "epoch": 0.48273706842671066, + "grad_norm": 5.668996810913086, + "learning_rate": 1.2326455017194631e-05, + "loss": 0.4698, + "step": 38618 + }, + { + "epoch": 0.4827620690517263, + "grad_norm": 2.2152037620544434, + "learning_rate": 1.2325606264653473e-05, + "loss": 0.1701, + "step": 38620 + }, + { + "epoch": 0.4827870696767419, + "grad_norm": 4.869150638580322, + "learning_rate": 1.2324757494400826e-05, + "loss": 1.3093, + "step": 38622 + }, + { + "epoch": 0.48281207030175755, + "grad_norm": 3.8318543434143066, + "learning_rate": 1.2323908706443151e-05, + "loss": 0.7151, + "step": 38624 + }, + { + "epoch": 0.4828370709267732, + "grad_norm": 3.4279801845550537, + "learning_rate": 1.232305990078692e-05, + "loss": 0.6911, + "step": 38626 + }, + { + "epoch": 0.4828620715517888, + "grad_norm": 3.2043983936309814, + "learning_rate": 1.2322211077438588e-05, + "loss": 0.227, + "step": 38628 + }, + { + "epoch": 0.48288707217680443, + "grad_norm": 4.189776420593262, + "learning_rate": 1.232136223640463e-05, + "loss": 0.6654, + "step": 38630 + }, + { + "epoch": 0.48291207280182, + "grad_norm": 0.0008302719215862453, + "learning_rate": 1.2320513377691504e-05, + "loss": 0.0213, + "step": 38632 + }, + { + "epoch": 0.4829370734268357, + "grad_norm": 6.309299468994141, + "learning_rate": 1.2319664501305677e-05, + "loss": 0.448, + "step": 38634 + }, + { + "epoch": 0.4829620740518513, + "grad_norm": 5.7035017013549805, + "learning_rate": 1.2318815607253611e-05, + "loss": 1.7044, + "step": 38636 + }, + { + "epoch": 0.4829870746768669, + "grad_norm": 2.824551582336426, + "learning_rate": 1.2317966695541774e-05, + "loss": 0.7222, + "step": 38638 + }, + { + "epoch": 0.48301207530188256, + "grad_norm": 2.231602907180786, + "learning_rate": 1.2317117766176631e-05, + "loss": 1.0298, + "step": 38640 + }, + { + "epoch": 0.48303707592689815, + "grad_norm": 2.1197328567504883, + "learning_rate": 1.2316268819164647e-05, + "loss": 0.8146, + "step": 38642 + }, + { + "epoch": 0.4830620765519138, + "grad_norm": 2.641021966934204, + "learning_rate": 1.2315419854512286e-05, + "loss": 0.7293, + "step": 38644 + }, + { + "epoch": 0.48308707717692945, + "grad_norm": 8.051114082336426, + "learning_rate": 1.231457087222602e-05, + "loss": 1.2191, + "step": 38646 + }, + { + "epoch": 0.48311207780194504, + "grad_norm": 3.8081886768341064, + "learning_rate": 1.2313721872312303e-05, + "loss": 1.6575, + "step": 38648 + }, + { + "epoch": 0.4831370784269607, + "grad_norm": 3.889982223510742, + "learning_rate": 1.231287285477761e-05, + "loss": 0.7834, + "step": 38650 + }, + { + "epoch": 0.4831620790519763, + "grad_norm": 2.8649067878723145, + "learning_rate": 1.2312023819628405e-05, + "loss": 1.1458, + "step": 38652 + }, + { + "epoch": 0.4831870796769919, + "grad_norm": 2.1207191944122314, + "learning_rate": 1.231117476687115e-05, + "loss": 0.1523, + "step": 38654 + }, + { + "epoch": 0.4832120803020076, + "grad_norm": 2.6752214431762695, + "learning_rate": 1.2310325696512319e-05, + "loss": 0.4281, + "step": 38656 + }, + { + "epoch": 0.48323708092702317, + "grad_norm": 0.0006827629404142499, + "learning_rate": 1.2309476608558372e-05, + "loss": 0.4761, + "step": 38658 + }, + { + "epoch": 0.4832620815520388, + "grad_norm": 0.000444192934082821, + "learning_rate": 1.2308627503015779e-05, + "loss": 0.9576, + "step": 38660 + }, + { + "epoch": 0.4832870821770544, + "grad_norm": 0.0008283460047096014, + "learning_rate": 1.2307778379891006e-05, + "loss": 0.2184, + "step": 38662 + }, + { + "epoch": 0.48331208280207005, + "grad_norm": 3.110973358154297, + "learning_rate": 1.2306929239190517e-05, + "loss": 0.7159, + "step": 38664 + }, + { + "epoch": 0.4833370834270857, + "grad_norm": 3.566394805908203, + "learning_rate": 1.2306080080920782e-05, + "loss": 1.0095, + "step": 38666 + }, + { + "epoch": 0.4833620840521013, + "grad_norm": 3.369401216506958, + "learning_rate": 1.2305230905088268e-05, + "loss": 1.3742, + "step": 38668 + }, + { + "epoch": 0.48338708467711694, + "grad_norm": 2.0692644119262695, + "learning_rate": 1.230438171169944e-05, + "loss": 1.3431, + "step": 38670 + }, + { + "epoch": 0.48341208530213253, + "grad_norm": 3.6144299507141113, + "learning_rate": 1.2303532500760768e-05, + "loss": 0.4474, + "step": 38672 + }, + { + "epoch": 0.4834370859271482, + "grad_norm": 6.1006364822387695, + "learning_rate": 1.2302683272278719e-05, + "loss": 2.0413, + "step": 38674 + }, + { + "epoch": 0.48346208655216383, + "grad_norm": 3.9849870204925537, + "learning_rate": 1.2301834026259757e-05, + "loss": 1.3347, + "step": 38676 + }, + { + "epoch": 0.4834870871771794, + "grad_norm": 5.362797737121582, + "learning_rate": 1.2300984762710356e-05, + "loss": 2.061, + "step": 38678 + }, + { + "epoch": 0.48351208780219507, + "grad_norm": 5.777114391326904, + "learning_rate": 1.2300135481636976e-05, + "loss": 1.2278, + "step": 38680 + }, + { + "epoch": 0.48353708842721066, + "grad_norm": 0.0007987513672560453, + "learning_rate": 1.2299286183046093e-05, + "loss": 0.5802, + "step": 38682 + }, + { + "epoch": 0.4835620890522263, + "grad_norm": 2.4575624465942383, + "learning_rate": 1.229843686694417e-05, + "loss": 0.6988, + "step": 38684 + }, + { + "epoch": 0.48358708967724195, + "grad_norm": 2.672813653945923, + "learning_rate": 1.229758753333768e-05, + "loss": 0.6278, + "step": 38686 + }, + { + "epoch": 0.48361209030225755, + "grad_norm": 5.089178085327148, + "learning_rate": 1.2296738182233085e-05, + "loss": 0.188, + "step": 38688 + }, + { + "epoch": 0.4836370909272732, + "grad_norm": 3.5871710777282715, + "learning_rate": 1.2295888813636858e-05, + "loss": 1.0653, + "step": 38690 + }, + { + "epoch": 0.4836620915522888, + "grad_norm": 0.7947502136230469, + "learning_rate": 1.2295039427555464e-05, + "loss": 0.625, + "step": 38692 + }, + { + "epoch": 0.48368709217730443, + "grad_norm": 5.324491024017334, + "learning_rate": 1.2294190023995379e-05, + "loss": 1.1317, + "step": 38694 + }, + { + "epoch": 0.4837120928023201, + "grad_norm": 2.880880832672119, + "learning_rate": 1.2293340602963062e-05, + "loss": 0.6713, + "step": 38696 + }, + { + "epoch": 0.4837370934273357, + "grad_norm": 7.080091953277588, + "learning_rate": 1.2292491164464992e-05, + "loss": 1.3705, + "step": 38698 + }, + { + "epoch": 0.4837620940523513, + "grad_norm": 0.3105092942714691, + "learning_rate": 1.229164170850763e-05, + "loss": 0.5766, + "step": 38700 + }, + { + "epoch": 0.4837870946773669, + "grad_norm": 5.031198978424072, + "learning_rate": 1.2290792235097451e-05, + "loss": 1.6207, + "step": 38702 + }, + { + "epoch": 0.48381209530238256, + "grad_norm": 0.0009249792783521116, + "learning_rate": 1.228994274424092e-05, + "loss": 0.8151, + "step": 38704 + }, + { + "epoch": 0.4838370959273982, + "grad_norm": 0.16101181507110596, + "learning_rate": 1.2289093235944512e-05, + "loss": 0.3356, + "step": 38706 + }, + { + "epoch": 0.4838620965524138, + "grad_norm": 0.0013410784304141998, + "learning_rate": 1.2288243710214691e-05, + "loss": 0.1, + "step": 38708 + }, + { + "epoch": 0.48388709717742945, + "grad_norm": 1.8213591575622559, + "learning_rate": 1.2287394167057929e-05, + "loss": 0.4943, + "step": 38710 + }, + { + "epoch": 0.48391209780244504, + "grad_norm": 8.568350791931152, + "learning_rate": 1.22865446064807e-05, + "loss": 1.5491, + "step": 38712 + }, + { + "epoch": 0.4839370984274607, + "grad_norm": 2.2915537357330322, + "learning_rate": 1.2285695028489468e-05, + "loss": 0.1527, + "step": 38714 + }, + { + "epoch": 0.48396209905247634, + "grad_norm": 1.9288935661315918, + "learning_rate": 1.2284845433090707e-05, + "loss": 1.1018, + "step": 38716 + }, + { + "epoch": 0.4839870996774919, + "grad_norm": 1.7264704704284668, + "learning_rate": 1.2283995820290883e-05, + "loss": 0.6262, + "step": 38718 + }, + { + "epoch": 0.4840121003025076, + "grad_norm": 1.637776494026184, + "learning_rate": 1.2283146190096474e-05, + "loss": 0.1266, + "step": 38720 + }, + { + "epoch": 0.48403710092752317, + "grad_norm": 2.5121166706085205, + "learning_rate": 1.2282296542513945e-05, + "loss": 0.5882, + "step": 38722 + }, + { + "epoch": 0.4840621015525388, + "grad_norm": 2.992114543914795, + "learning_rate": 1.228144687754977e-05, + "loss": 1.2728, + "step": 38724 + }, + { + "epoch": 0.48408710217755446, + "grad_norm": 0.7425459027290344, + "learning_rate": 1.2280597195210416e-05, + "loss": 0.3156, + "step": 38726 + }, + { + "epoch": 0.48411210280257005, + "grad_norm": 2.0157508850097656, + "learning_rate": 1.2279747495502357e-05, + "loss": 0.0861, + "step": 38728 + }, + { + "epoch": 0.4841371034275857, + "grad_norm": 6.024392604827881, + "learning_rate": 1.2278897778432061e-05, + "loss": 1.9126, + "step": 38730 + }, + { + "epoch": 0.4841621040526013, + "grad_norm": 2.572554588317871, + "learning_rate": 1.2278048044006004e-05, + "loss": 1.1629, + "step": 38732 + }, + { + "epoch": 0.48418710467761694, + "grad_norm": 2.8813982009887695, + "learning_rate": 1.2277198292230654e-05, + "loss": 1.0084, + "step": 38734 + }, + { + "epoch": 0.4842121053026326, + "grad_norm": 7.916965484619141, + "learning_rate": 1.2276348523112484e-05, + "loss": 1.9229, + "step": 38736 + }, + { + "epoch": 0.4842371059276482, + "grad_norm": 3.6217238903045654, + "learning_rate": 1.2275498736657969e-05, + "loss": 2.2915, + "step": 38738 + }, + { + "epoch": 0.48426210655266383, + "grad_norm": 1.896032691001892, + "learning_rate": 1.2274648932873576e-05, + "loss": 0.8367, + "step": 38740 + }, + { + "epoch": 0.4842871071776794, + "grad_norm": 5.016892433166504, + "learning_rate": 1.2273799111765777e-05, + "loss": 0.9759, + "step": 38742 + }, + { + "epoch": 0.48431210780269507, + "grad_norm": 4.534250259399414, + "learning_rate": 1.2272949273341045e-05, + "loss": 0.5918, + "step": 38744 + }, + { + "epoch": 0.4843371084277107, + "grad_norm": 12.979485511779785, + "learning_rate": 1.2272099417605854e-05, + "loss": 3.3945, + "step": 38746 + }, + { + "epoch": 0.4843621090527263, + "grad_norm": 1.3464093208312988, + "learning_rate": 1.2271249544566675e-05, + "loss": 0.7324, + "step": 38748 + }, + { + "epoch": 0.48438710967774196, + "grad_norm": 0.9305438995361328, + "learning_rate": 1.227039965422998e-05, + "loss": 0.6256, + "step": 38750 + }, + { + "epoch": 0.48441211030275755, + "grad_norm": 6.357110977172852, + "learning_rate": 1.2269549746602244e-05, + "loss": 1.6432, + "step": 38752 + }, + { + "epoch": 0.4844371109277732, + "grad_norm": 2.9935719966888428, + "learning_rate": 1.2268699821689935e-05, + "loss": 0.6958, + "step": 38754 + }, + { + "epoch": 0.48446211155278884, + "grad_norm": 2.951479196548462, + "learning_rate": 1.2267849879499529e-05, + "loss": 0.5732, + "step": 38756 + }, + { + "epoch": 0.48448711217780444, + "grad_norm": 2.4620859622955322, + "learning_rate": 1.2266999920037503e-05, + "loss": 1.0212, + "step": 38758 + }, + { + "epoch": 0.4845121128028201, + "grad_norm": 6.411454200744629, + "learning_rate": 1.2266149943310324e-05, + "loss": 1.5199, + "step": 38760 + }, + { + "epoch": 0.4845371134278357, + "grad_norm": 3.518165111541748, + "learning_rate": 1.2265299949324468e-05, + "loss": 1.4117, + "step": 38762 + }, + { + "epoch": 0.4845621140528513, + "grad_norm": 7.816111087799072, + "learning_rate": 1.2264449938086409e-05, + "loss": 1.8575, + "step": 38764 + }, + { + "epoch": 0.48458711467786697, + "grad_norm": 4.015373229980469, + "learning_rate": 1.2263599909602617e-05, + "loss": 0.8047, + "step": 38766 + }, + { + "epoch": 0.48461211530288256, + "grad_norm": 1.382094144821167, + "learning_rate": 1.2262749863879566e-05, + "loss": 0.7342, + "step": 38768 + }, + { + "epoch": 0.4846371159278982, + "grad_norm": 2.722160577774048, + "learning_rate": 1.2261899800923737e-05, + "loss": 0.7608, + "step": 38770 + }, + { + "epoch": 0.4846621165529138, + "grad_norm": 2.731354236602783, + "learning_rate": 1.2261049720741598e-05, + "loss": 1.9041, + "step": 38772 + }, + { + "epoch": 0.48468711717792945, + "grad_norm": 0.24634303152561188, + "learning_rate": 1.2260199623339622e-05, + "loss": 1.1712, + "step": 38774 + }, + { + "epoch": 0.4847121178029451, + "grad_norm": 2.7474446296691895, + "learning_rate": 1.2259349508724287e-05, + "loss": 0.6045, + "step": 38776 + }, + { + "epoch": 0.4847371184279607, + "grad_norm": 3.8429362773895264, + "learning_rate": 1.2258499376902067e-05, + "loss": 0.1566, + "step": 38778 + }, + { + "epoch": 0.48476211905297634, + "grad_norm": 2.2350614070892334, + "learning_rate": 1.225764922787943e-05, + "loss": 0.6702, + "step": 38780 + }, + { + "epoch": 0.48478711967799193, + "grad_norm": 6.284664630889893, + "learning_rate": 1.2256799061662858e-05, + "loss": 1.7665, + "step": 38782 + }, + { + "epoch": 0.4848121203030076, + "grad_norm": 5.458268642425537, + "learning_rate": 1.2255948878258827e-05, + "loss": 1.2215, + "step": 38784 + }, + { + "epoch": 0.4848371209280232, + "grad_norm": 0.9409512281417847, + "learning_rate": 1.2255098677673805e-05, + "loss": 0.5548, + "step": 38786 + }, + { + "epoch": 0.4848621215530388, + "grad_norm": 0.00643676845356822, + "learning_rate": 1.225424845991427e-05, + "loss": 0.6538, + "step": 38788 + }, + { + "epoch": 0.48488712217805446, + "grad_norm": 1.1430555582046509, + "learning_rate": 1.22533982249867e-05, + "loss": 0.8941, + "step": 38790 + }, + { + "epoch": 0.48491212280307006, + "grad_norm": 5.946310043334961, + "learning_rate": 1.2252547972897568e-05, + "loss": 1.2349, + "step": 38792 + }, + { + "epoch": 0.4849371234280857, + "grad_norm": 2.9565420150756836, + "learning_rate": 1.2251697703653344e-05, + "loss": 1.0628, + "step": 38794 + }, + { + "epoch": 0.48496212405310135, + "grad_norm": 7.227141380310059, + "learning_rate": 1.2250847417260513e-05, + "loss": 1.256, + "step": 38796 + }, + { + "epoch": 0.48498712467811694, + "grad_norm": 3.0712873935699463, + "learning_rate": 1.2249997113725547e-05, + "loss": 0.5954, + "step": 38798 + }, + { + "epoch": 0.4850121253031326, + "grad_norm": 2.4262726306915283, + "learning_rate": 1.224914679305492e-05, + "loss": 0.5458, + "step": 38800 + }, + { + "epoch": 0.4850371259281482, + "grad_norm": 9.984963417053223, + "learning_rate": 1.2248296455255112e-05, + "loss": 0.5329, + "step": 38802 + }, + { + "epoch": 0.48506212655316383, + "grad_norm": 0.0006191418506205082, + "learning_rate": 1.2247446100332592e-05, + "loss": 0.0, + "step": 38804 + }, + { + "epoch": 0.4850871271781795, + "grad_norm": 1.9887735843658447, + "learning_rate": 1.224659572829384e-05, + "loss": 1.1145, + "step": 38806 + }, + { + "epoch": 0.48511212780319507, + "grad_norm": 1.2443443536758423, + "learning_rate": 1.2245745339145331e-05, + "loss": 0.9913, + "step": 38808 + }, + { + "epoch": 0.4851371284282107, + "grad_norm": 5.570785999298096, + "learning_rate": 1.224489493289355e-05, + "loss": 1.1101, + "step": 38810 + }, + { + "epoch": 0.4851621290532263, + "grad_norm": 6.228426933288574, + "learning_rate": 1.2244044509544962e-05, + "loss": 0.244, + "step": 38812 + }, + { + "epoch": 0.48518712967824196, + "grad_norm": 0.3639448583126068, + "learning_rate": 1.2243194069106047e-05, + "loss": 0.1882, + "step": 38814 + }, + { + "epoch": 0.4852121303032576, + "grad_norm": 0.36746230721473694, + "learning_rate": 1.224234361158329e-05, + "loss": 1.0511, + "step": 38816 + }, + { + "epoch": 0.4852371309282732, + "grad_norm": 2.141921043395996, + "learning_rate": 1.2241493136983151e-05, + "loss": 0.7236, + "step": 38818 + }, + { + "epoch": 0.48526213155328884, + "grad_norm": 3.1439669132232666, + "learning_rate": 1.2240642645312125e-05, + "loss": 1.7778, + "step": 38820 + }, + { + "epoch": 0.48528713217830444, + "grad_norm": 0.0008374465396627784, + "learning_rate": 1.223979213657668e-05, + "loss": 1.2676, + "step": 38822 + }, + { + "epoch": 0.4853121328033201, + "grad_norm": 2.6948201656341553, + "learning_rate": 1.2238941610783294e-05, + "loss": 0.535, + "step": 38824 + }, + { + "epoch": 0.48533713342833573, + "grad_norm": 4.328764915466309, + "learning_rate": 1.2238091067938445e-05, + "loss": 1.8235, + "step": 38826 + }, + { + "epoch": 0.4853621340533513, + "grad_norm": 7.344858169555664, + "learning_rate": 1.2237240508048613e-05, + "loss": 0.9997, + "step": 38828 + }, + { + "epoch": 0.48538713467836697, + "grad_norm": 0.000689778127707541, + "learning_rate": 1.223638993112027e-05, + "loss": 0.0103, + "step": 38830 + }, + { + "epoch": 0.48541213530338256, + "grad_norm": 4.768600940704346, + "learning_rate": 1.2235539337159898e-05, + "loss": 0.9436, + "step": 38832 + }, + { + "epoch": 0.4854371359283982, + "grad_norm": 1.5156946182250977, + "learning_rate": 1.2234688726173975e-05, + "loss": 0.6783, + "step": 38834 + }, + { + "epoch": 0.48546213655341386, + "grad_norm": 4.735301494598389, + "learning_rate": 1.2233838098168983e-05, + "loss": 1.7546, + "step": 38836 + }, + { + "epoch": 0.48548713717842945, + "grad_norm": 3.884974718093872, + "learning_rate": 1.2232987453151392e-05, + "loss": 0.8208, + "step": 38838 + }, + { + "epoch": 0.4855121378034451, + "grad_norm": 0.4157455265522003, + "learning_rate": 1.2232136791127685e-05, + "loss": 0.679, + "step": 38840 + }, + { + "epoch": 0.4855371384284607, + "grad_norm": 2.1143622398376465, + "learning_rate": 1.2231286112104339e-05, + "loss": 0.2657, + "step": 38842 + }, + { + "epoch": 0.48556213905347634, + "grad_norm": 5.135213375091553, + "learning_rate": 1.2230435416087833e-05, + "loss": 1.7846, + "step": 38844 + }, + { + "epoch": 0.485587139678492, + "grad_norm": 3.009904623031616, + "learning_rate": 1.2229584703084646e-05, + "loss": 0.6905, + "step": 38846 + }, + { + "epoch": 0.4856121403035076, + "grad_norm": 4.583606719970703, + "learning_rate": 1.2228733973101261e-05, + "loss": 0.3588, + "step": 38848 + }, + { + "epoch": 0.4856371409285232, + "grad_norm": 0.20299319922924042, + "learning_rate": 1.2227883226144153e-05, + "loss": 0.0032, + "step": 38850 + }, + { + "epoch": 0.4856621415535388, + "grad_norm": 2.249997854232788, + "learning_rate": 1.2227032462219798e-05, + "loss": 0.7303, + "step": 38852 + }, + { + "epoch": 0.48568714217855447, + "grad_norm": 2.72662353515625, + "learning_rate": 1.2226181681334681e-05, + "loss": 1.002, + "step": 38854 + }, + { + "epoch": 0.4857121428035701, + "grad_norm": 9.278313636779785, + "learning_rate": 1.2225330883495276e-05, + "loss": 1.0932, + "step": 38856 + }, + { + "epoch": 0.4857371434285857, + "grad_norm": 3.1873788833618164, + "learning_rate": 1.2224480068708068e-05, + "loss": 1.0152, + "step": 38858 + }, + { + "epoch": 0.48576214405360135, + "grad_norm": 3.024675130844116, + "learning_rate": 1.2223629236979533e-05, + "loss": 0.672, + "step": 38860 + }, + { + "epoch": 0.48578714467861694, + "grad_norm": 1.4712668657302856, + "learning_rate": 1.2222778388316157e-05, + "loss": 1.7763, + "step": 38862 + }, + { + "epoch": 0.4858121453036326, + "grad_norm": 0.0007188313757069409, + "learning_rate": 1.2221927522724411e-05, + "loss": 0.0122, + "step": 38864 + }, + { + "epoch": 0.48583714592864824, + "grad_norm": 0.21504533290863037, + "learning_rate": 1.2221076640210778e-05, + "loss": 0.5764, + "step": 38866 + }, + { + "epoch": 0.48586214655366383, + "grad_norm": 3.5130202770233154, + "learning_rate": 1.2220225740781741e-05, + "loss": 1.0765, + "step": 38868 + }, + { + "epoch": 0.4858871471786795, + "grad_norm": 0.0012735564960166812, + "learning_rate": 1.2219374824443779e-05, + "loss": 1.0157, + "step": 38870 + }, + { + "epoch": 0.48591214780369507, + "grad_norm": 0.002527271630242467, + "learning_rate": 1.221852389120337e-05, + "loss": 0.3477, + "step": 38872 + }, + { + "epoch": 0.4859371484287107, + "grad_norm": 3.0652928352355957, + "learning_rate": 1.2217672941067003e-05, + "loss": 2.1175, + "step": 38874 + }, + { + "epoch": 0.48596214905372637, + "grad_norm": 16.88896369934082, + "learning_rate": 1.2216821974041149e-05, + "loss": 1.0957, + "step": 38876 + }, + { + "epoch": 0.48598714967874196, + "grad_norm": 0.01005540695041418, + "learning_rate": 1.2215970990132292e-05, + "loss": 0.6224, + "step": 38878 + }, + { + "epoch": 0.4860121503037576, + "grad_norm": 3.6235249042510986, + "learning_rate": 1.2215119989346911e-05, + "loss": 0.7115, + "step": 38880 + }, + { + "epoch": 0.4860371509287732, + "grad_norm": 7.35382080078125, + "learning_rate": 1.221426897169149e-05, + "loss": 1.9122, + "step": 38882 + }, + { + "epoch": 0.48606215155378885, + "grad_norm": 1.820999264717102, + "learning_rate": 1.2213417937172512e-05, + "loss": 0.74, + "step": 38884 + }, + { + "epoch": 0.4860871521788045, + "grad_norm": 2.6868607997894287, + "learning_rate": 1.2212566885796452e-05, + "loss": 0.8627, + "step": 38886 + }, + { + "epoch": 0.4861121528038201, + "grad_norm": 13.16156005859375, + "learning_rate": 1.2211715817569802e-05, + "loss": 1.0669, + "step": 38888 + }, + { + "epoch": 0.48613715342883573, + "grad_norm": 3.8328561782836914, + "learning_rate": 1.2210864732499033e-05, + "loss": 0.139, + "step": 38890 + }, + { + "epoch": 0.4861621540538513, + "grad_norm": 4.684243679046631, + "learning_rate": 1.221001363059063e-05, + "loss": 1.2038, + "step": 38892 + }, + { + "epoch": 0.486187154678867, + "grad_norm": 3.9433717727661133, + "learning_rate": 1.2209162511851078e-05, + "loss": 0.8986, + "step": 38894 + }, + { + "epoch": 0.4862121553038826, + "grad_norm": 3.1270320415496826, + "learning_rate": 1.2208311376286856e-05, + "loss": 0.7004, + "step": 38896 + }, + { + "epoch": 0.4862371559288982, + "grad_norm": 0.5325407981872559, + "learning_rate": 1.2207460223904446e-05, + "loss": 0.6341, + "step": 38898 + }, + { + "epoch": 0.48626215655391386, + "grad_norm": 1.748358964920044, + "learning_rate": 1.2206609054710335e-05, + "loss": 1.1036, + "step": 38900 + }, + { + "epoch": 0.48628715717892945, + "grad_norm": 4.148420333862305, + "learning_rate": 1.2205757868710997e-05, + "loss": 1.0508, + "step": 38902 + }, + { + "epoch": 0.4863121578039451, + "grad_norm": 0.9973394274711609, + "learning_rate": 1.2204906665912919e-05, + "loss": 0.6864, + "step": 38904 + }, + { + "epoch": 0.48633715842896075, + "grad_norm": 3.3669652938842773, + "learning_rate": 1.2204055446322584e-05, + "loss": 0.9482, + "step": 38906 + }, + { + "epoch": 0.48636215905397634, + "grad_norm": 4.1983795166015625, + "learning_rate": 1.2203204209946477e-05, + "loss": 1.2223, + "step": 38908 + }, + { + "epoch": 0.486387159678992, + "grad_norm": 1.1897404193878174, + "learning_rate": 1.2202352956791074e-05, + "loss": 0.3484, + "step": 38910 + }, + { + "epoch": 0.4864121603040076, + "grad_norm": 3.3380372524261475, + "learning_rate": 1.2201501686862867e-05, + "loss": 1.8894, + "step": 38912 + }, + { + "epoch": 0.4864371609290232, + "grad_norm": 0.0007331567467190325, + "learning_rate": 1.2200650400168334e-05, + "loss": 0.007, + "step": 38914 + }, + { + "epoch": 0.4864621615540389, + "grad_norm": 5.467441558837891, + "learning_rate": 1.2199799096713956e-05, + "loss": 1.094, + "step": 38916 + }, + { + "epoch": 0.48648716217905447, + "grad_norm": 4.708314895629883, + "learning_rate": 1.219894777650622e-05, + "loss": 1.5238, + "step": 38918 + }, + { + "epoch": 0.4865121628040701, + "grad_norm": 0.0005938450922258198, + "learning_rate": 1.219809643955161e-05, + "loss": 0.0, + "step": 38920 + }, + { + "epoch": 0.4865371634290857, + "grad_norm": 2.874039649963379, + "learning_rate": 1.2197245085856607e-05, + "loss": 1.2468, + "step": 38922 + }, + { + "epoch": 0.48656216405410135, + "grad_norm": 0.8178669214248657, + "learning_rate": 1.2196393715427695e-05, + "loss": 1.0284, + "step": 38924 + }, + { + "epoch": 0.486587164679117, + "grad_norm": 3.5315513610839844, + "learning_rate": 1.2195542328271363e-05, + "loss": 1.2076, + "step": 38926 + }, + { + "epoch": 0.4866121653041326, + "grad_norm": 1.233175277709961, + "learning_rate": 1.2194690924394089e-05, + "loss": 0.4269, + "step": 38928 + }, + { + "epoch": 0.48663716592914824, + "grad_norm": 4.250738143920898, + "learning_rate": 1.2193839503802358e-05, + "loss": 0.8788, + "step": 38930 + }, + { + "epoch": 0.48666216655416383, + "grad_norm": 1.3670305013656616, + "learning_rate": 1.2192988066502656e-05, + "loss": 1.1035, + "step": 38932 + }, + { + "epoch": 0.4866871671791795, + "grad_norm": 2.8525357246398926, + "learning_rate": 1.2192136612501465e-05, + "loss": 0.8826, + "step": 38934 + }, + { + "epoch": 0.48671216780419513, + "grad_norm": 2.8015329837799072, + "learning_rate": 1.2191285141805276e-05, + "loss": 0.8253, + "step": 38936 + }, + { + "epoch": 0.4867371684292107, + "grad_norm": 5.470884323120117, + "learning_rate": 1.2190433654420565e-05, + "loss": 1.9521, + "step": 38938 + }, + { + "epoch": 0.48676216905422637, + "grad_norm": 3.9790894985198975, + "learning_rate": 1.2189582150353827e-05, + "loss": 1.1993, + "step": 38940 + }, + { + "epoch": 0.48678716967924196, + "grad_norm": 4.938882350921631, + "learning_rate": 1.2188730629611536e-05, + "loss": 2.0442, + "step": 38942 + }, + { + "epoch": 0.4868121703042576, + "grad_norm": 3.0883750915527344, + "learning_rate": 1.2187879092200183e-05, + "loss": 0.2383, + "step": 38944 + }, + { + "epoch": 0.48683717092927326, + "grad_norm": 1.896897315979004, + "learning_rate": 1.2187027538126253e-05, + "loss": 1.4369, + "step": 38946 + }, + { + "epoch": 0.48686217155428885, + "grad_norm": 4.252267837524414, + "learning_rate": 1.218617596739623e-05, + "loss": 1.6164, + "step": 38948 + }, + { + "epoch": 0.4868871721793045, + "grad_norm": 1.1962049007415771, + "learning_rate": 1.21853243800166e-05, + "loss": 0.8885, + "step": 38950 + }, + { + "epoch": 0.4869121728043201, + "grad_norm": 4.848703861236572, + "learning_rate": 1.218447277599385e-05, + "loss": 1.1907, + "step": 38952 + }, + { + "epoch": 0.48693717342933573, + "grad_norm": 5.504390716552734, + "learning_rate": 1.2183621155334462e-05, + "loss": 1.6693, + "step": 38954 + }, + { + "epoch": 0.4869621740543514, + "grad_norm": 2.1566781997680664, + "learning_rate": 1.2182769518044925e-05, + "loss": 0.7631, + "step": 38956 + }, + { + "epoch": 0.486987174679367, + "grad_norm": 5.13570499420166, + "learning_rate": 1.2181917864131723e-05, + "loss": 0.8314, + "step": 38958 + }, + { + "epoch": 0.4870121753043826, + "grad_norm": 3.6848912239074707, + "learning_rate": 1.2181066193601345e-05, + "loss": 0.959, + "step": 38960 + }, + { + "epoch": 0.4870371759293982, + "grad_norm": 2.5901026725769043, + "learning_rate": 1.2180214506460275e-05, + "loss": 0.4638, + "step": 38962 + }, + { + "epoch": 0.48706217655441386, + "grad_norm": 4.18335485458374, + "learning_rate": 1.2179362802714998e-05, + "loss": 1.0681, + "step": 38964 + }, + { + "epoch": 0.4870871771794295, + "grad_norm": 3.505079984664917, + "learning_rate": 1.2178511082372005e-05, + "loss": 0.9184, + "step": 38966 + }, + { + "epoch": 0.4871121778044451, + "grad_norm": 7.986201286315918, + "learning_rate": 1.2177659345437778e-05, + "loss": 0.7492, + "step": 38968 + }, + { + "epoch": 0.48713717842946075, + "grad_norm": 2.14923357963562, + "learning_rate": 1.2176807591918803e-05, + "loss": 1.2952, + "step": 38970 + }, + { + "epoch": 0.48716217905447634, + "grad_norm": 3.7369847297668457, + "learning_rate": 1.2175955821821573e-05, + "loss": 0.8788, + "step": 38972 + }, + { + "epoch": 0.487187179679492, + "grad_norm": 3.6215291023254395, + "learning_rate": 1.2175104035152568e-05, + "loss": 1.07, + "step": 38974 + }, + { + "epoch": 0.48721218030450764, + "grad_norm": 1.675173044204712, + "learning_rate": 1.217425223191828e-05, + "loss": 0.9569, + "step": 38976 + }, + { + "epoch": 0.48723718092952323, + "grad_norm": 0.3529589772224426, + "learning_rate": 1.2173400412125196e-05, + "loss": 0.1205, + "step": 38978 + }, + { + "epoch": 0.4872621815545389, + "grad_norm": 0.000664413208141923, + "learning_rate": 1.21725485757798e-05, + "loss": 0.7697, + "step": 38980 + }, + { + "epoch": 0.48728718217955447, + "grad_norm": 3.2625961303710938, + "learning_rate": 1.2171696722888581e-05, + "loss": 0.2375, + "step": 38982 + }, + { + "epoch": 0.4873121828045701, + "grad_norm": 3.095336437225342, + "learning_rate": 1.2170844853458023e-05, + "loss": 1.2413, + "step": 38984 + }, + { + "epoch": 0.48733718342958576, + "grad_norm": 2.348724842071533, + "learning_rate": 1.2169992967494626e-05, + "loss": 0.0753, + "step": 38986 + }, + { + "epoch": 0.48736218405460136, + "grad_norm": 4.933621406555176, + "learning_rate": 1.2169141065004863e-05, + "loss": 1.2577, + "step": 38988 + }, + { + "epoch": 0.487387184679617, + "grad_norm": 2.0889365673065186, + "learning_rate": 1.216828914599523e-05, + "loss": 0.8766, + "step": 38990 + }, + { + "epoch": 0.4874121853046326, + "grad_norm": 3.8797943592071533, + "learning_rate": 1.2167437210472216e-05, + "loss": 1.9796, + "step": 38992 + }, + { + "epoch": 0.48743718592964824, + "grad_norm": 11.415633201599121, + "learning_rate": 1.2166585258442306e-05, + "loss": 0.2738, + "step": 38994 + }, + { + "epoch": 0.4874621865546639, + "grad_norm": 4.181601524353027, + "learning_rate": 1.2165733289911984e-05, + "loss": 0.5838, + "step": 38996 + }, + { + "epoch": 0.4874871871796795, + "grad_norm": 3.277103900909424, + "learning_rate": 1.216488130488775e-05, + "loss": 0.575, + "step": 38998 + }, + { + "epoch": 0.48751218780469513, + "grad_norm": 5.130817413330078, + "learning_rate": 1.2164029303376083e-05, + "loss": 2.8958, + "step": 39000 + }, + { + "epoch": 0.4875371884297107, + "grad_norm": 0.0005377261550165713, + "learning_rate": 1.2163177285383477e-05, + "loss": 0.6101, + "step": 39002 + }, + { + "epoch": 0.48756218905472637, + "grad_norm": 3.5735960006713867, + "learning_rate": 1.2162325250916416e-05, + "loss": 0.2507, + "step": 39004 + }, + { + "epoch": 0.487587189679742, + "grad_norm": 3.0883429050445557, + "learning_rate": 1.2161473199981395e-05, + "loss": 1.3745, + "step": 39006 + }, + { + "epoch": 0.4876121903047576, + "grad_norm": 2.077418804168701, + "learning_rate": 1.2160621132584894e-05, + "loss": 0.5302, + "step": 39008 + }, + { + "epoch": 0.48763719092977326, + "grad_norm": 0.0006749124149791896, + "learning_rate": 1.2159769048733412e-05, + "loss": 0.3383, + "step": 39010 + }, + { + "epoch": 0.48766219155478885, + "grad_norm": 0.17000915110111237, + "learning_rate": 1.2158916948433435e-05, + "loss": 0.0021, + "step": 39012 + }, + { + "epoch": 0.4876871921798045, + "grad_norm": 5.571370601654053, + "learning_rate": 1.2158064831691449e-05, + "loss": 1.5564, + "step": 39014 + }, + { + "epoch": 0.48771219280482014, + "grad_norm": 5.5144219398498535, + "learning_rate": 1.2157212698513949e-05, + "loss": 0.9171, + "step": 39016 + }, + { + "epoch": 0.48773719342983574, + "grad_norm": 0.004149450920522213, + "learning_rate": 1.2156360548907423e-05, + "loss": 0.8839, + "step": 39018 + }, + { + "epoch": 0.4877621940548514, + "grad_norm": 2.4901416301727295, + "learning_rate": 1.2155508382878356e-05, + "loss": 0.8405, + "step": 39020 + }, + { + "epoch": 0.487787194679867, + "grad_norm": 3.6043922901153564, + "learning_rate": 1.2154656200433244e-05, + "loss": 0.2233, + "step": 39022 + }, + { + "epoch": 0.4878121953048826, + "grad_norm": 0.0004560702363960445, + "learning_rate": 1.2153804001578578e-05, + "loss": 0.3692, + "step": 39024 + }, + { + "epoch": 0.48783719592989827, + "grad_norm": 3.8807475566864014, + "learning_rate": 1.215295178632084e-05, + "loss": 0.7487, + "step": 39026 + }, + { + "epoch": 0.48786219655491386, + "grad_norm": 0.31948983669281006, + "learning_rate": 1.2152099554666528e-05, + "loss": 0.5893, + "step": 39028 + }, + { + "epoch": 0.4878871971799295, + "grad_norm": 2.648189067840576, + "learning_rate": 1.2151247306622132e-05, + "loss": 0.1871, + "step": 39030 + }, + { + "epoch": 0.4879121978049451, + "grad_norm": 2.368208169937134, + "learning_rate": 1.215039504219414e-05, + "loss": 0.3659, + "step": 39032 + }, + { + "epoch": 0.48793719842996075, + "grad_norm": 2.063704252243042, + "learning_rate": 1.2149542761389039e-05, + "loss": 0.4982, + "step": 39034 + }, + { + "epoch": 0.4879621990549764, + "grad_norm": 7.610097885131836, + "learning_rate": 1.2148690464213329e-05, + "loss": 1.2184, + "step": 39036 + }, + { + "epoch": 0.487987199679992, + "grad_norm": 3.074248790740967, + "learning_rate": 1.2147838150673497e-05, + "loss": 1.1941, + "step": 39038 + }, + { + "epoch": 0.48801220030500764, + "grad_norm": 3.353062152862549, + "learning_rate": 1.214698582077603e-05, + "loss": 0.7543, + "step": 39040 + }, + { + "epoch": 0.48803720093002323, + "grad_norm": 7.536311626434326, + "learning_rate": 1.2146133474527424e-05, + "loss": 0.9602, + "step": 39042 + }, + { + "epoch": 0.4880622015550389, + "grad_norm": 4.740661144256592, + "learning_rate": 1.2145281111934169e-05, + "loss": 1.4691, + "step": 39044 + }, + { + "epoch": 0.4880872021800545, + "grad_norm": 4.525247573852539, + "learning_rate": 1.2144428733002756e-05, + "loss": 1.8534, + "step": 39046 + }, + { + "epoch": 0.4881122028050701, + "grad_norm": 3.8699655532836914, + "learning_rate": 1.2143576337739678e-05, + "loss": 1.7744, + "step": 39048 + }, + { + "epoch": 0.48813720343008576, + "grad_norm": 2.5866901874542236, + "learning_rate": 1.2142723926151424e-05, + "loss": 1.0806, + "step": 39050 + }, + { + "epoch": 0.48816220405510136, + "grad_norm": 9.533464431762695, + "learning_rate": 1.214187149824449e-05, + "loss": 1.0375, + "step": 39052 + }, + { + "epoch": 0.488187204680117, + "grad_norm": 1.5558096170425415, + "learning_rate": 1.2141019054025365e-05, + "loss": 0.3501, + "step": 39054 + }, + { + "epoch": 0.48821220530513265, + "grad_norm": 3.1137311458587646, + "learning_rate": 1.214016659350054e-05, + "loss": 0.7546, + "step": 39056 + }, + { + "epoch": 0.48823720593014824, + "grad_norm": 2.3069751262664795, + "learning_rate": 1.213931411667651e-05, + "loss": 0.4429, + "step": 39058 + }, + { + "epoch": 0.4882622065551639, + "grad_norm": 2.21090030670166, + "learning_rate": 1.2138461623559764e-05, + "loss": 1.3648, + "step": 39060 + }, + { + "epoch": 0.4882872071801795, + "grad_norm": 3.6247668266296387, + "learning_rate": 1.21376091141568e-05, + "loss": 1.074, + "step": 39062 + }, + { + "epoch": 0.48831220780519513, + "grad_norm": 4.038668632507324, + "learning_rate": 1.2136756588474107e-05, + "loss": 1.9068, + "step": 39064 + }, + { + "epoch": 0.4883372084302108, + "grad_norm": 3.087592363357544, + "learning_rate": 1.2135904046518178e-05, + "loss": 1.3728, + "step": 39066 + }, + { + "epoch": 0.48836220905522637, + "grad_norm": 4.111034393310547, + "learning_rate": 1.2135051488295505e-05, + "loss": 0.7075, + "step": 39068 + }, + { + "epoch": 0.488387209680242, + "grad_norm": 4.46456241607666, + "learning_rate": 1.213419891381258e-05, + "loss": 0.8844, + "step": 39070 + }, + { + "epoch": 0.4884122103052576, + "grad_norm": 2.8103599548339844, + "learning_rate": 1.21333463230759e-05, + "loss": 1.6442, + "step": 39072 + }, + { + "epoch": 0.48843721093027326, + "grad_norm": 0.216396763920784, + "learning_rate": 1.2132493716091956e-05, + "loss": 0.0018, + "step": 39074 + }, + { + "epoch": 0.4884622115552889, + "grad_norm": 3.738513946533203, + "learning_rate": 1.2131641092867242e-05, + "loss": 1.0247, + "step": 39076 + }, + { + "epoch": 0.4884872121803045, + "grad_norm": 0.0008665291243232787, + "learning_rate": 1.2130788453408249e-05, + "loss": 0.7448, + "step": 39078 + }, + { + "epoch": 0.48851221280532015, + "grad_norm": 4.7140092849731445, + "learning_rate": 1.2129935797721472e-05, + "loss": 1.8955, + "step": 39080 + }, + { + "epoch": 0.48853721343033574, + "grad_norm": 2.9672763347625732, + "learning_rate": 1.2129083125813408e-05, + "loss": 0.3626, + "step": 39082 + }, + { + "epoch": 0.4885622140553514, + "grad_norm": 3.2198867797851562, + "learning_rate": 1.2128230437690547e-05, + "loss": 1.2254, + "step": 39084 + }, + { + "epoch": 0.48858721468036703, + "grad_norm": 3.456463098526001, + "learning_rate": 1.2127377733359383e-05, + "loss": 0.7597, + "step": 39086 + }, + { + "epoch": 0.4886122153053826, + "grad_norm": 5.820477485656738, + "learning_rate": 1.2126525012826411e-05, + "loss": 1.8332, + "step": 39088 + }, + { + "epoch": 0.48863721593039827, + "grad_norm": 2.6461682319641113, + "learning_rate": 1.2125672276098126e-05, + "loss": 0.4096, + "step": 39090 + }, + { + "epoch": 0.48866221655541386, + "grad_norm": 3.211662769317627, + "learning_rate": 1.2124819523181021e-05, + "loss": 0.7591, + "step": 39092 + }, + { + "epoch": 0.4886872171804295, + "grad_norm": 5.860072135925293, + "learning_rate": 1.2123966754081591e-05, + "loss": 1.6756, + "step": 39094 + }, + { + "epoch": 0.48871221780544516, + "grad_norm": 0.007605365011841059, + "learning_rate": 1.212311396880633e-05, + "loss": 0.8941, + "step": 39096 + }, + { + "epoch": 0.48873721843046075, + "grad_norm": 2.2996902465820312, + "learning_rate": 1.2122261167361736e-05, + "loss": 0.8641, + "step": 39098 + }, + { + "epoch": 0.4887622190554764, + "grad_norm": 2.2515692710876465, + "learning_rate": 1.2121408349754297e-05, + "loss": 0.3048, + "step": 39100 + }, + { + "epoch": 0.488787219680492, + "grad_norm": 0.2550736367702484, + "learning_rate": 1.2120555515990517e-05, + "loss": 0.011, + "step": 39102 + }, + { + "epoch": 0.48881222030550764, + "grad_norm": 2.6299526691436768, + "learning_rate": 1.2119702666076883e-05, + "loss": 0.8906, + "step": 39104 + }, + { + "epoch": 0.4888372209305233, + "grad_norm": 2.4565553665161133, + "learning_rate": 1.2118849800019892e-05, + "loss": 0.3902, + "step": 39106 + }, + { + "epoch": 0.4888622215555389, + "grad_norm": 3.4435131549835205, + "learning_rate": 1.2117996917826042e-05, + "loss": 1.7732, + "step": 39108 + }, + { + "epoch": 0.4888872221805545, + "grad_norm": 3.951422929763794, + "learning_rate": 1.2117144019501827e-05, + "loss": 0.7093, + "step": 39110 + }, + { + "epoch": 0.4889122228055701, + "grad_norm": 0.8714689612388611, + "learning_rate": 1.211629110505374e-05, + "loss": 0.5088, + "step": 39112 + }, + { + "epoch": 0.48893722343058577, + "grad_norm": 6.301384449005127, + "learning_rate": 1.2115438174488284e-05, + "loss": 2.028, + "step": 39114 + }, + { + "epoch": 0.4889622240556014, + "grad_norm": 0.3457268476486206, + "learning_rate": 1.2114585227811949e-05, + "loss": 1.0679, + "step": 39116 + }, + { + "epoch": 0.488987224680617, + "grad_norm": 3.9947824478149414, + "learning_rate": 1.211373226503123e-05, + "loss": 0.8459, + "step": 39118 + }, + { + "epoch": 0.48901222530563265, + "grad_norm": 8.04660415649414, + "learning_rate": 1.2112879286152625e-05, + "loss": 1.5772, + "step": 39120 + }, + { + "epoch": 0.48903722593064824, + "grad_norm": 5.346249103546143, + "learning_rate": 1.211202629118263e-05, + "loss": 1.0062, + "step": 39122 + }, + { + "epoch": 0.4890622265556639, + "grad_norm": 1.0875256061553955, + "learning_rate": 1.2111173280127744e-05, + "loss": 0.6486, + "step": 39124 + }, + { + "epoch": 0.48908722718067954, + "grad_norm": 6.7345380783081055, + "learning_rate": 1.2110320252994458e-05, + "loss": 2.4603, + "step": 39126 + }, + { + "epoch": 0.48911222780569513, + "grad_norm": 3.7863993644714355, + "learning_rate": 1.2109467209789271e-05, + "loss": 1.6576, + "step": 39128 + }, + { + "epoch": 0.4891372284307108, + "grad_norm": 2.9177355766296387, + "learning_rate": 1.2108614150518684e-05, + "loss": 1.3938, + "step": 39130 + }, + { + "epoch": 0.48916222905572637, + "grad_norm": 3.371136426925659, + "learning_rate": 1.2107761075189188e-05, + "loss": 1.7964, + "step": 39132 + }, + { + "epoch": 0.489187229680742, + "grad_norm": 0.0006722485413774848, + "learning_rate": 1.2106907983807279e-05, + "loss": 0.535, + "step": 39134 + }, + { + "epoch": 0.48921223030575767, + "grad_norm": 2.9531400203704834, + "learning_rate": 1.2106054876379459e-05, + "loss": 1.2618, + "step": 39136 + }, + { + "epoch": 0.48923723093077326, + "grad_norm": 3.3233096599578857, + "learning_rate": 1.2105201752912223e-05, + "loss": 1.4009, + "step": 39138 + }, + { + "epoch": 0.4892622315557889, + "grad_norm": 2.9596190452575684, + "learning_rate": 1.2104348613412067e-05, + "loss": 1.5242, + "step": 39140 + }, + { + "epoch": 0.4892872321808045, + "grad_norm": 3.1938562393188477, + "learning_rate": 1.2103495457885493e-05, + "loss": 0.7679, + "step": 39142 + }, + { + "epoch": 0.48931223280582015, + "grad_norm": 3.1273200511932373, + "learning_rate": 1.210264228633899e-05, + "loss": 0.6193, + "step": 39144 + }, + { + "epoch": 0.4893372334308358, + "grad_norm": 0.000911738898139447, + "learning_rate": 1.2101789098779064e-05, + "loss": 0.2259, + "step": 39146 + }, + { + "epoch": 0.4893622340558514, + "grad_norm": 2.53791880607605, + "learning_rate": 1.2100935895212209e-05, + "loss": 0.8954, + "step": 39148 + }, + { + "epoch": 0.48938723468086703, + "grad_norm": 4.400355339050293, + "learning_rate": 1.2100082675644921e-05, + "loss": 0.8948, + "step": 39150 + }, + { + "epoch": 0.4894122353058826, + "grad_norm": 1.293670415878296, + "learning_rate": 1.2099229440083701e-05, + "loss": 0.6373, + "step": 39152 + }, + { + "epoch": 0.4894372359308983, + "grad_norm": 2.7886455059051514, + "learning_rate": 1.2098376188535048e-05, + "loss": 1.4786, + "step": 39154 + }, + { + "epoch": 0.4894622365559139, + "grad_norm": 2.90071439743042, + "learning_rate": 1.209752292100546e-05, + "loss": 0.866, + "step": 39156 + }, + { + "epoch": 0.4894872371809295, + "grad_norm": 7.9163055419921875, + "learning_rate": 1.2096669637501432e-05, + "loss": 1.2021, + "step": 39158 + }, + { + "epoch": 0.48951223780594516, + "grad_norm": 2.817502498626709, + "learning_rate": 1.2095816338029462e-05, + "loss": 0.9583, + "step": 39160 + }, + { + "epoch": 0.48953723843096075, + "grad_norm": 0.0006042994209565222, + "learning_rate": 1.2094963022596058e-05, + "loss": 0.4838, + "step": 39162 + }, + { + "epoch": 0.4895622390559764, + "grad_norm": 4.497681140899658, + "learning_rate": 1.2094109691207707e-05, + "loss": 1.2059, + "step": 39164 + }, + { + "epoch": 0.48958723968099205, + "grad_norm": 0.2817995250225067, + "learning_rate": 1.2093256343870914e-05, + "loss": 1.2266, + "step": 39166 + }, + { + "epoch": 0.48961224030600764, + "grad_norm": 1.1992158889770508, + "learning_rate": 1.2092402980592178e-05, + "loss": 0.7931, + "step": 39168 + }, + { + "epoch": 0.4896372409310233, + "grad_norm": 3.709909677505493, + "learning_rate": 1.2091549601377994e-05, + "loss": 0.8119, + "step": 39170 + }, + { + "epoch": 0.4896622415560389, + "grad_norm": 3.4422428607940674, + "learning_rate": 1.2090696206234862e-05, + "loss": 1.0872, + "step": 39172 + }, + { + "epoch": 0.4896872421810545, + "grad_norm": 0.0006647545960731804, + "learning_rate": 1.2089842795169288e-05, + "loss": 0.0, + "step": 39174 + }, + { + "epoch": 0.4897122428060702, + "grad_norm": 3.2794694900512695, + "learning_rate": 1.2088989368187767e-05, + "loss": 0.8915, + "step": 39176 + }, + { + "epoch": 0.48973724343108577, + "grad_norm": 4.049009799957275, + "learning_rate": 1.2088135925296795e-05, + "loss": 0.8718, + "step": 39178 + }, + { + "epoch": 0.4897622440561014, + "grad_norm": 3.216653347015381, + "learning_rate": 1.2087282466502877e-05, + "loss": 1.2603, + "step": 39180 + }, + { + "epoch": 0.489787244681117, + "grad_norm": 1.2695447206497192, + "learning_rate": 1.2086428991812511e-05, + "loss": 0.122, + "step": 39182 + }, + { + "epoch": 0.48981224530613265, + "grad_norm": 1.8232611417770386, + "learning_rate": 1.2085575501232199e-05, + "loss": 0.8251, + "step": 39184 + }, + { + "epoch": 0.4898372459311483, + "grad_norm": 6.701232433319092, + "learning_rate": 1.2084721994768431e-05, + "loss": 1.4804, + "step": 39186 + }, + { + "epoch": 0.4898622465561639, + "grad_norm": 3.2202227115631104, + "learning_rate": 1.2083868472427724e-05, + "loss": 1.8532, + "step": 39188 + }, + { + "epoch": 0.48988724718117954, + "grad_norm": 2.897562265396118, + "learning_rate": 1.2083014934216566e-05, + "loss": 0.8756, + "step": 39190 + }, + { + "epoch": 0.48991224780619513, + "grad_norm": 0.014386745169758797, + "learning_rate": 1.2082161380141459e-05, + "loss": 0.589, + "step": 39192 + }, + { + "epoch": 0.4899372484312108, + "grad_norm": 0.9880372881889343, + "learning_rate": 1.2081307810208908e-05, + "loss": 0.5579, + "step": 39194 + }, + { + "epoch": 0.48996224905622643, + "grad_norm": 2.1704020500183105, + "learning_rate": 1.2080454224425411e-05, + "loss": 0.2541, + "step": 39196 + }, + { + "epoch": 0.489987249681242, + "grad_norm": 5.966630935668945, + "learning_rate": 1.2079600622797465e-05, + "loss": 0.163, + "step": 39198 + }, + { + "epoch": 0.49001225030625767, + "grad_norm": 1.4529979228973389, + "learning_rate": 1.207874700533158e-05, + "loss": 0.717, + "step": 39200 + }, + { + "epoch": 0.49003725093127326, + "grad_norm": 2.8553619384765625, + "learning_rate": 1.2077893372034249e-05, + "loss": 0.7795, + "step": 39202 + }, + { + "epoch": 0.4900622515562889, + "grad_norm": 0.20330111682415009, + "learning_rate": 1.2077039722911976e-05, + "loss": 1.3016, + "step": 39204 + }, + { + "epoch": 0.49008725218130456, + "grad_norm": 0.0006314108613878489, + "learning_rate": 1.2076186057971262e-05, + "loss": 0.3968, + "step": 39206 + }, + { + "epoch": 0.49011225280632015, + "grad_norm": 5.497964382171631, + "learning_rate": 1.2075332377218612e-05, + "loss": 1.3019, + "step": 39208 + }, + { + "epoch": 0.4901372534313358, + "grad_norm": 3.0975849628448486, + "learning_rate": 1.2074478680660517e-05, + "loss": 0.9296, + "step": 39210 + }, + { + "epoch": 0.4901622540563514, + "grad_norm": 3.578979969024658, + "learning_rate": 1.2073624968303491e-05, + "loss": 0.7841, + "step": 39212 + }, + { + "epoch": 0.49018725468136704, + "grad_norm": 2.285463809967041, + "learning_rate": 1.207277124015403e-05, + "loss": 1.2994, + "step": 39214 + }, + { + "epoch": 0.4902122553063827, + "grad_norm": 2.5321643352508545, + "learning_rate": 1.2071917496218636e-05, + "loss": 0.6236, + "step": 39216 + }, + { + "epoch": 0.4902372559313983, + "grad_norm": 4.055169105529785, + "learning_rate": 1.2071063736503812e-05, + "loss": 0.6572, + "step": 39218 + }, + { + "epoch": 0.4902622565564139, + "grad_norm": 3.298383951187134, + "learning_rate": 1.207020996101606e-05, + "loss": 0.6525, + "step": 39220 + }, + { + "epoch": 0.4902872571814295, + "grad_norm": 0.0006243037641979754, + "learning_rate": 1.2069356169761877e-05, + "loss": 0.0003, + "step": 39222 + }, + { + "epoch": 0.49031225780644516, + "grad_norm": 2.431692123413086, + "learning_rate": 1.2068502362747774e-05, + "loss": 0.1066, + "step": 39224 + }, + { + "epoch": 0.4903372584314608, + "grad_norm": 2.136676788330078, + "learning_rate": 1.206764853998025e-05, + "loss": 0.7544, + "step": 39226 + }, + { + "epoch": 0.4903622590564764, + "grad_norm": 4.80412483215332, + "learning_rate": 1.2066794701465807e-05, + "loss": 0.951, + "step": 39228 + }, + { + "epoch": 0.49038725968149205, + "grad_norm": 4.485151767730713, + "learning_rate": 1.2065940847210946e-05, + "loss": 0.6016, + "step": 39230 + }, + { + "epoch": 0.49041226030650764, + "grad_norm": 0.0007507723639719188, + "learning_rate": 1.2065086977222171e-05, + "loss": 0.6265, + "step": 39232 + }, + { + "epoch": 0.4904372609315233, + "grad_norm": 2.4864346981048584, + "learning_rate": 1.2064233091505988e-05, + "loss": 0.6799, + "step": 39234 + }, + { + "epoch": 0.49046226155653894, + "grad_norm": 2.7057976722717285, + "learning_rate": 1.2063379190068896e-05, + "loss": 0.8399, + "step": 39236 + }, + { + "epoch": 0.49048726218155453, + "grad_norm": 0.0006317406077869236, + "learning_rate": 1.20625252729174e-05, + "loss": 0.522, + "step": 39238 + }, + { + "epoch": 0.4905122628065702, + "grad_norm": 5.997284412384033, + "learning_rate": 1.2061671340058005e-05, + "loss": 0.9638, + "step": 39240 + }, + { + "epoch": 0.49053726343158577, + "grad_norm": 0.0007311950321309268, + "learning_rate": 1.2060817391497213e-05, + "loss": 0.4821, + "step": 39242 + }, + { + "epoch": 0.4905622640566014, + "grad_norm": 1.2789654731750488, + "learning_rate": 1.2059963427241523e-05, + "loss": 1.0057, + "step": 39244 + }, + { + "epoch": 0.49058726468161706, + "grad_norm": 4.038971424102783, + "learning_rate": 1.2059109447297448e-05, + "loss": 1.1221, + "step": 39246 + }, + { + "epoch": 0.49061226530663266, + "grad_norm": 4.683907508850098, + "learning_rate": 1.205825545167148e-05, + "loss": 1.2826, + "step": 39248 + }, + { + "epoch": 0.4906372659316483, + "grad_norm": 4.819521903991699, + "learning_rate": 1.2057401440370136e-05, + "loss": 0.5895, + "step": 39250 + }, + { + "epoch": 0.4906622665566639, + "grad_norm": 2.1650474071502686, + "learning_rate": 1.2056547413399913e-05, + "loss": 0.1418, + "step": 39252 + }, + { + "epoch": 0.49068726718167954, + "grad_norm": 1.608264684677124, + "learning_rate": 1.2055693370767312e-05, + "loss": 1.1372, + "step": 39254 + }, + { + "epoch": 0.4907122678066952, + "grad_norm": 6.2879509925842285, + "learning_rate": 1.2054839312478843e-05, + "loss": 1.6397, + "step": 39256 + }, + { + "epoch": 0.4907372684317108, + "grad_norm": 2.238368272781372, + "learning_rate": 1.2053985238541008e-05, + "loss": 1.2257, + "step": 39258 + }, + { + "epoch": 0.49076226905672643, + "grad_norm": 3.571617603302002, + "learning_rate": 1.2053131148960313e-05, + "loss": 1.4709, + "step": 39260 + }, + { + "epoch": 0.490787269681742, + "grad_norm": 2.267667531967163, + "learning_rate": 1.2052277043743258e-05, + "loss": 0.9075, + "step": 39262 + }, + { + "epoch": 0.49081227030675767, + "grad_norm": 3.1468892097473145, + "learning_rate": 1.2051422922896357e-05, + "loss": 1.2708, + "step": 39264 + }, + { + "epoch": 0.4908372709317733, + "grad_norm": 5.23546838760376, + "learning_rate": 1.2050568786426107e-05, + "loss": 2.3615, + "step": 39266 + }, + { + "epoch": 0.4908622715567889, + "grad_norm": 3.3219194412231445, + "learning_rate": 1.2049714634339014e-05, + "loss": 1.0226, + "step": 39268 + }, + { + "epoch": 0.49088727218180456, + "grad_norm": 1.8375046253204346, + "learning_rate": 1.2048860466641584e-05, + "loss": 0.6585, + "step": 39270 + }, + { + "epoch": 0.49091227280682015, + "grad_norm": 4.491366863250732, + "learning_rate": 1.204800628334032e-05, + "loss": 0.7929, + "step": 39272 + }, + { + "epoch": 0.4909372734318358, + "grad_norm": 2.7353999614715576, + "learning_rate": 1.2047152084441733e-05, + "loss": 1.3232, + "step": 39274 + }, + { + "epoch": 0.49096227405685144, + "grad_norm": 0.028144191950559616, + "learning_rate": 1.2046297869952326e-05, + "loss": 0.7307, + "step": 39276 + }, + { + "epoch": 0.49098727468186704, + "grad_norm": 3.0633952617645264, + "learning_rate": 1.2045443639878603e-05, + "loss": 1.2419, + "step": 39278 + }, + { + "epoch": 0.4910122753068827, + "grad_norm": 3.4903194904327393, + "learning_rate": 1.204458939422707e-05, + "loss": 0.8429, + "step": 39280 + }, + { + "epoch": 0.4910372759318983, + "grad_norm": 3.802290678024292, + "learning_rate": 1.2043735133004232e-05, + "loss": 1.9114, + "step": 39282 + }, + { + "epoch": 0.4910622765569139, + "grad_norm": 4.078960418701172, + "learning_rate": 1.2042880856216599e-05, + "loss": 1.2986, + "step": 39284 + }, + { + "epoch": 0.49108727718192957, + "grad_norm": 5.954273223876953, + "learning_rate": 1.2042026563870672e-05, + "loss": 1.4013, + "step": 39286 + }, + { + "epoch": 0.49111227780694516, + "grad_norm": 1.156648874282837, + "learning_rate": 1.204117225597296e-05, + "loss": 0.97, + "step": 39288 + }, + { + "epoch": 0.4911372784319608, + "grad_norm": 2.2825963497161865, + "learning_rate": 1.2040317932529968e-05, + "loss": 0.6122, + "step": 39290 + }, + { + "epoch": 0.4911622790569764, + "grad_norm": 3.3107261657714844, + "learning_rate": 1.2039463593548207e-05, + "loss": 1.0382, + "step": 39292 + }, + { + "epoch": 0.49118727968199205, + "grad_norm": 1.5904629230499268, + "learning_rate": 1.2038609239034176e-05, + "loss": 0.9903, + "step": 39294 + }, + { + "epoch": 0.4912122803070077, + "grad_norm": 3.4895496368408203, + "learning_rate": 1.2037754868994387e-05, + "loss": 1.5286, + "step": 39296 + }, + { + "epoch": 0.4912372809320233, + "grad_norm": 3.0088634490966797, + "learning_rate": 1.2036900483435344e-05, + "loss": 1.2115, + "step": 39298 + }, + { + "epoch": 0.49126228155703894, + "grad_norm": 1.8640568256378174, + "learning_rate": 1.2036046082363555e-05, + "loss": 0.7457, + "step": 39300 + }, + { + "epoch": 0.49128728218205453, + "grad_norm": 0.0009751206962391734, + "learning_rate": 1.2035191665785527e-05, + "loss": 1.8219, + "step": 39302 + }, + { + "epoch": 0.4913122828070702, + "grad_norm": 1.1102054119110107, + "learning_rate": 1.2034337233707766e-05, + "loss": 0.6141, + "step": 39304 + }, + { + "epoch": 0.4913372834320858, + "grad_norm": 3.648968458175659, + "learning_rate": 1.2033482786136782e-05, + "loss": 0.5425, + "step": 39306 + }, + { + "epoch": 0.4913622840571014, + "grad_norm": 2.830260992050171, + "learning_rate": 1.203262832307908e-05, + "loss": 0.5263, + "step": 39308 + }, + { + "epoch": 0.49138728468211706, + "grad_norm": 3.081512451171875, + "learning_rate": 1.203177384454117e-05, + "loss": 1.8643, + "step": 39310 + }, + { + "epoch": 0.49141228530713266, + "grad_norm": 6.301189422607422, + "learning_rate": 1.2030919350529554e-05, + "loss": 0.6129, + "step": 39312 + }, + { + "epoch": 0.4914372859321483, + "grad_norm": 2.6138949394226074, + "learning_rate": 1.2030064841050747e-05, + "loss": 1.2128, + "step": 39314 + }, + { + "epoch": 0.49146228655716395, + "grad_norm": 7.3349504470825195, + "learning_rate": 1.202921031611125e-05, + "loss": 0.9339, + "step": 39316 + }, + { + "epoch": 0.49148728718217954, + "grad_norm": 5.309731960296631, + "learning_rate": 1.202835577571758e-05, + "loss": 1.2917, + "step": 39318 + }, + { + "epoch": 0.4915122878071952, + "grad_norm": 2.5066049098968506, + "learning_rate": 1.2027501219876233e-05, + "loss": 0.9393, + "step": 39320 + }, + { + "epoch": 0.4915372884322108, + "grad_norm": 0.0007061894284561276, + "learning_rate": 1.2026646648593724e-05, + "loss": 0.0, + "step": 39322 + }, + { + "epoch": 0.49156228905722643, + "grad_norm": 3.6411945819854736, + "learning_rate": 1.2025792061876564e-05, + "loss": 0.9077, + "step": 39324 + }, + { + "epoch": 0.4915872896822421, + "grad_norm": 2.7411463260650635, + "learning_rate": 1.2024937459731256e-05, + "loss": 0.6174, + "step": 39326 + }, + { + "epoch": 0.49161229030725767, + "grad_norm": 4.334627628326416, + "learning_rate": 1.202408284216431e-05, + "loss": 1.1212, + "step": 39328 + }, + { + "epoch": 0.4916372909322733, + "grad_norm": 0.07694835215806961, + "learning_rate": 1.2023228209182239e-05, + "loss": 1.1081, + "step": 39330 + }, + { + "epoch": 0.4916622915572889, + "grad_norm": 7.499912738800049, + "learning_rate": 1.2022373560791546e-05, + "loss": 2.4892, + "step": 39332 + }, + { + "epoch": 0.49168729218230456, + "grad_norm": 3.0200822353363037, + "learning_rate": 1.202151889699874e-05, + "loss": 0.4319, + "step": 39334 + }, + { + "epoch": 0.4917122928073202, + "grad_norm": 2.8836586475372314, + "learning_rate": 1.2020664217810334e-05, + "loss": 1.5521, + "step": 39336 + }, + { + "epoch": 0.4917372934323358, + "grad_norm": 6.07029914855957, + "learning_rate": 1.2019809523232835e-05, + "loss": 0.569, + "step": 39338 + }, + { + "epoch": 0.49176229405735145, + "grad_norm": 1.751466989517212, + "learning_rate": 1.2018954813272749e-05, + "loss": 0.6543, + "step": 39340 + }, + { + "epoch": 0.49178729468236704, + "grad_norm": 4.098711967468262, + "learning_rate": 1.2018100087936593e-05, + "loss": 1.2026, + "step": 39342 + }, + { + "epoch": 0.4918122953073827, + "grad_norm": 5.168069839477539, + "learning_rate": 1.2017245347230873e-05, + "loss": 1.7182, + "step": 39344 + }, + { + "epoch": 0.49183729593239833, + "grad_norm": 4.626949787139893, + "learning_rate": 1.2016390591162095e-05, + "loss": 0.5209, + "step": 39346 + }, + { + "epoch": 0.4918622965574139, + "grad_norm": 1.1459609270095825, + "learning_rate": 1.201553581973677e-05, + "loss": 0.7, + "step": 39348 + }, + { + "epoch": 0.4918872971824296, + "grad_norm": 2.530339241027832, + "learning_rate": 1.2014681032961411e-05, + "loss": 0.538, + "step": 39350 + }, + { + "epoch": 0.49191229780744516, + "grad_norm": 1.0034459829330444, + "learning_rate": 1.2013826230842524e-05, + "loss": 0.1057, + "step": 39352 + }, + { + "epoch": 0.4919372984324608, + "grad_norm": 2.217639684677124, + "learning_rate": 1.2012971413386624e-05, + "loss": 0.3657, + "step": 39354 + }, + { + "epoch": 0.49196229905747646, + "grad_norm": 3.304743528366089, + "learning_rate": 1.2012116580600218e-05, + "loss": 1.3415, + "step": 39356 + }, + { + "epoch": 0.49198729968249205, + "grad_norm": 4.083927631378174, + "learning_rate": 1.2011261732489818e-05, + "loss": 1.2995, + "step": 39358 + }, + { + "epoch": 0.4920123003075077, + "grad_norm": 3.4600162506103516, + "learning_rate": 1.201040686906193e-05, + "loss": 1.7826, + "step": 39360 + }, + { + "epoch": 0.4920373009325233, + "grad_norm": 4.412591457366943, + "learning_rate": 1.2009551990323067e-05, + "loss": 1.7769, + "step": 39362 + }, + { + "epoch": 0.49206230155753894, + "grad_norm": 3.3925414085388184, + "learning_rate": 1.2008697096279744e-05, + "loss": 0.9689, + "step": 39364 + }, + { + "epoch": 0.4920873021825546, + "grad_norm": 0.025278408080339432, + "learning_rate": 1.2007842186938466e-05, + "loss": 0.1788, + "step": 39366 + }, + { + "epoch": 0.4921123028075702, + "grad_norm": 7.1801862716674805, + "learning_rate": 1.2006987262305744e-05, + "loss": 0.4661, + "step": 39368 + }, + { + "epoch": 0.4921373034325858, + "grad_norm": 5.085987567901611, + "learning_rate": 1.2006132322388095e-05, + "loss": 1.2101, + "step": 39370 + }, + { + "epoch": 0.4921623040576014, + "grad_norm": 0.028478851541876793, + "learning_rate": 1.2005277367192026e-05, + "loss": 0.3334, + "step": 39372 + }, + { + "epoch": 0.49218730468261707, + "grad_norm": 6.021790027618408, + "learning_rate": 1.2004422396724044e-05, + "loss": 0.4549, + "step": 39374 + }, + { + "epoch": 0.4922123053076327, + "grad_norm": 2.489102840423584, + "learning_rate": 1.2003567410990667e-05, + "loss": 0.6979, + "step": 39376 + }, + { + "epoch": 0.4922373059326483, + "grad_norm": 4.176074504852295, + "learning_rate": 1.2002712409998403e-05, + "loss": 1.0086, + "step": 39378 + }, + { + "epoch": 0.49226230655766395, + "grad_norm": 7.886068820953369, + "learning_rate": 1.2001857393753764e-05, + "loss": 1.95, + "step": 39380 + }, + { + "epoch": 0.49228730718267955, + "grad_norm": 7.077792644500732, + "learning_rate": 1.200100236226326e-05, + "loss": 0.1563, + "step": 39382 + }, + { + "epoch": 0.4923123078076952, + "grad_norm": 3.131115436553955, + "learning_rate": 1.2000147315533413e-05, + "loss": 1.5255, + "step": 39384 + }, + { + "epoch": 0.49233730843271084, + "grad_norm": 3.4771888256073, + "learning_rate": 1.199929225357072e-05, + "loss": 0.7149, + "step": 39386 + }, + { + "epoch": 0.49236230905772643, + "grad_norm": 4.211706161499023, + "learning_rate": 1.1998437176381699e-05, + "loss": 1.2387, + "step": 39388 + }, + { + "epoch": 0.4923873096827421, + "grad_norm": 0.5549717545509338, + "learning_rate": 1.1997582083972867e-05, + "loss": 0.8082, + "step": 39390 + }, + { + "epoch": 0.4924123103077577, + "grad_norm": 0.32302480936050415, + "learning_rate": 1.1996726976350732e-05, + "loss": 0.4455, + "step": 39392 + }, + { + "epoch": 0.4924373109327733, + "grad_norm": 7.150963306427002, + "learning_rate": 1.1995871853521806e-05, + "loss": 0.7633, + "step": 39394 + }, + { + "epoch": 0.49246231155778897, + "grad_norm": 3.169111728668213, + "learning_rate": 1.1995016715492605e-05, + "loss": 0.8847, + "step": 39396 + }, + { + "epoch": 0.49248731218280456, + "grad_norm": 3.183270215988159, + "learning_rate": 1.1994161562269633e-05, + "loss": 0.536, + "step": 39398 + }, + { + "epoch": 0.4925123128078202, + "grad_norm": 3.9543957710266113, + "learning_rate": 1.199330639385941e-05, + "loss": 0.8484, + "step": 39400 + }, + { + "epoch": 0.4925373134328358, + "grad_norm": 3.6641509532928467, + "learning_rate": 1.1992451210268448e-05, + "loss": 0.662, + "step": 39402 + }, + { + "epoch": 0.49256231405785145, + "grad_norm": 4.870047569274902, + "learning_rate": 1.1991596011503261e-05, + "loss": 0.8611, + "step": 39404 + }, + { + "epoch": 0.4925873146828671, + "grad_norm": 0.003833193564787507, + "learning_rate": 1.199074079757036e-05, + "loss": 0.0241, + "step": 39406 + }, + { + "epoch": 0.4926123153078827, + "grad_norm": 3.5163707733154297, + "learning_rate": 1.1989885568476258e-05, + "loss": 1.7468, + "step": 39408 + }, + { + "epoch": 0.49263731593289833, + "grad_norm": 0.24113672971725464, + "learning_rate": 1.198903032422747e-05, + "loss": 0.0054, + "step": 39410 + }, + { + "epoch": 0.4926623165579139, + "grad_norm": 2.3086249828338623, + "learning_rate": 1.1988175064830506e-05, + "loss": 0.4583, + "step": 39412 + }, + { + "epoch": 0.4926873171829296, + "grad_norm": 4.465157508850098, + "learning_rate": 1.1987319790291882e-05, + "loss": 2.5346, + "step": 39414 + }, + { + "epoch": 0.4927123178079452, + "grad_norm": 2.800748825073242, + "learning_rate": 1.1986464500618116e-05, + "loss": 0.9506, + "step": 39416 + }, + { + "epoch": 0.4927373184329608, + "grad_norm": 8.700977325439453, + "learning_rate": 1.1985609195815714e-05, + "loss": 1.3074, + "step": 39418 + }, + { + "epoch": 0.49276231905797646, + "grad_norm": 4.790125846862793, + "learning_rate": 1.1984753875891191e-05, + "loss": 2.0834, + "step": 39420 + }, + { + "epoch": 0.49278731968299205, + "grad_norm": 2.8236942291259766, + "learning_rate": 1.1983898540851069e-05, + "loss": 1.1496, + "step": 39422 + }, + { + "epoch": 0.4928123203080077, + "grad_norm": 1.9229857921600342, + "learning_rate": 1.1983043190701848e-05, + "loss": 1.6821, + "step": 39424 + }, + { + "epoch": 0.49283732093302335, + "grad_norm": 3.954064130783081, + "learning_rate": 1.1982187825450056e-05, + "loss": 1.3178, + "step": 39426 + }, + { + "epoch": 0.49286232155803894, + "grad_norm": 0.0005966630997136235, + "learning_rate": 1.1981332445102201e-05, + "loss": 0.0064, + "step": 39428 + }, + { + "epoch": 0.4928873221830546, + "grad_norm": 5.115583896636963, + "learning_rate": 1.1980477049664798e-05, + "loss": 2.4716, + "step": 39430 + }, + { + "epoch": 0.4929123228080702, + "grad_norm": 0.7506197690963745, + "learning_rate": 1.1979621639144364e-05, + "loss": 0.5126, + "step": 39432 + }, + { + "epoch": 0.49293732343308583, + "grad_norm": 4.34896993637085, + "learning_rate": 1.1978766213547407e-05, + "loss": 0.7207, + "step": 39434 + }, + { + "epoch": 0.4929623240581015, + "grad_norm": 3.056157350540161, + "learning_rate": 1.197791077288045e-05, + "loss": 1.3061, + "step": 39436 + }, + { + "epoch": 0.49298732468311707, + "grad_norm": 3.07084321975708, + "learning_rate": 1.197705531715e-05, + "loss": 1.4, + "step": 39438 + }, + { + "epoch": 0.4930123253081327, + "grad_norm": 0.14508111774921417, + "learning_rate": 1.1976199846362581e-05, + "loss": 0.0214, + "step": 39440 + }, + { + "epoch": 0.4930373259331483, + "grad_norm": 2.145752191543579, + "learning_rate": 1.1975344360524703e-05, + "loss": 0.9283, + "step": 39442 + }, + { + "epoch": 0.49306232655816395, + "grad_norm": 2.015162944793701, + "learning_rate": 1.1974488859642882e-05, + "loss": 1.0773, + "step": 39444 + }, + { + "epoch": 0.4930873271831796, + "grad_norm": 3.3383896350860596, + "learning_rate": 1.1973633343723632e-05, + "loss": 0.2262, + "step": 39446 + }, + { + "epoch": 0.4931123278081952, + "grad_norm": 3.918437957763672, + "learning_rate": 1.197277781277347e-05, + "loss": 1.2713, + "step": 39448 + }, + { + "epoch": 0.49313732843321084, + "grad_norm": 0.2516123056411743, + "learning_rate": 1.1971922266798907e-05, + "loss": 0.3134, + "step": 39450 + }, + { + "epoch": 0.49316232905822643, + "grad_norm": 0.6655045747756958, + "learning_rate": 1.1971066705806466e-05, + "loss": 0.0291, + "step": 39452 + }, + { + "epoch": 0.4931873296832421, + "grad_norm": 3.610006332397461, + "learning_rate": 1.197021112980266e-05, + "loss": 0.82, + "step": 39454 + }, + { + "epoch": 0.49321233030825773, + "grad_norm": 2.9694983959198, + "learning_rate": 1.1969355538794008e-05, + "loss": 0.7897, + "step": 39456 + }, + { + "epoch": 0.4932373309332733, + "grad_norm": 3.194439172744751, + "learning_rate": 1.1968499932787016e-05, + "loss": 0.5509, + "step": 39458 + }, + { + "epoch": 0.49326233155828897, + "grad_norm": 2.2847824096679688, + "learning_rate": 1.196764431178821e-05, + "loss": 1.0529, + "step": 39460 + }, + { + "epoch": 0.49328733218330456, + "grad_norm": 3.50911545753479, + "learning_rate": 1.1966788675804105e-05, + "loss": 1.1444, + "step": 39462 + }, + { + "epoch": 0.4933123328083202, + "grad_norm": 15.98917007446289, + "learning_rate": 1.1965933024841212e-05, + "loss": 0.3596, + "step": 39464 + }, + { + "epoch": 0.49333733343333586, + "grad_norm": 1.0318152904510498, + "learning_rate": 1.1965077358906054e-05, + "loss": 0.1873, + "step": 39466 + }, + { + "epoch": 0.49336233405835145, + "grad_norm": 5.310940742492676, + "learning_rate": 1.1964221678005146e-05, + "loss": 1.2234, + "step": 39468 + }, + { + "epoch": 0.4933873346833671, + "grad_norm": 2.154754877090454, + "learning_rate": 1.1963365982145e-05, + "loss": 1.2063, + "step": 39470 + }, + { + "epoch": 0.4934123353083827, + "grad_norm": 5.361429691314697, + "learning_rate": 1.1962510271332138e-05, + "loss": 1.0282, + "step": 39472 + }, + { + "epoch": 0.49343733593339834, + "grad_norm": 0.9334723949432373, + "learning_rate": 1.1961654545573074e-05, + "loss": 0.0491, + "step": 39474 + }, + { + "epoch": 0.493462336558414, + "grad_norm": 0.06785523891448975, + "learning_rate": 1.1960798804874327e-05, + "loss": 0.0263, + "step": 39476 + }, + { + "epoch": 0.4934873371834296, + "grad_norm": 1.4788962602615356, + "learning_rate": 1.1959943049242413e-05, + "loss": 0.9221, + "step": 39478 + }, + { + "epoch": 0.4935123378084452, + "grad_norm": 0.5913851261138916, + "learning_rate": 1.195908727868385e-05, + "loss": 0.3487, + "step": 39480 + }, + { + "epoch": 0.4935373384334608, + "grad_norm": 4.38496208190918, + "learning_rate": 1.1958231493205159e-05, + "loss": 0.3603, + "step": 39482 + }, + { + "epoch": 0.49356233905847646, + "grad_norm": 0.0007776117417961359, + "learning_rate": 1.1957375692812849e-05, + "loss": 1.3802, + "step": 39484 + }, + { + "epoch": 0.4935873396834921, + "grad_norm": 1.8307799100875854, + "learning_rate": 1.1956519877513443e-05, + "loss": 1.2482, + "step": 39486 + }, + { + "epoch": 0.4936123403085077, + "grad_norm": 2.225400924682617, + "learning_rate": 1.195566404731346e-05, + "loss": 0.4641, + "step": 39488 + }, + { + "epoch": 0.49363734093352335, + "grad_norm": 5.480473041534424, + "learning_rate": 1.1954808202219417e-05, + "loss": 2.6337, + "step": 39490 + }, + { + "epoch": 0.49366234155853894, + "grad_norm": 3.2369744777679443, + "learning_rate": 1.195395234223783e-05, + "loss": 0.9494, + "step": 39492 + }, + { + "epoch": 0.4936873421835546, + "grad_norm": 0.0008521088166162372, + "learning_rate": 1.1953096467375218e-05, + "loss": 0.6768, + "step": 39494 + }, + { + "epoch": 0.49371234280857024, + "grad_norm": 2.2381093502044678, + "learning_rate": 1.1952240577638101e-05, + "loss": 0.7941, + "step": 39496 + }, + { + "epoch": 0.49373734343358583, + "grad_norm": 0.10615655779838562, + "learning_rate": 1.1951384673032993e-05, + "loss": 0.4609, + "step": 39498 + }, + { + "epoch": 0.4937623440586015, + "grad_norm": 3.7981691360473633, + "learning_rate": 1.1950528753566417e-05, + "loss": 0.4628, + "step": 39500 + }, + { + "epoch": 0.49378734468361707, + "grad_norm": 1.2639306783676147, + "learning_rate": 1.1949672819244892e-05, + "loss": 0.0767, + "step": 39502 + }, + { + "epoch": 0.4938123453086327, + "grad_norm": 2.999267101287842, + "learning_rate": 1.194881687007493e-05, + "loss": 0.3058, + "step": 39504 + }, + { + "epoch": 0.49383734593364836, + "grad_norm": 6.501527786254883, + "learning_rate": 1.1947960906063057e-05, + "loss": 1.5285, + "step": 39506 + }, + { + "epoch": 0.49386234655866396, + "grad_norm": 1.8778835535049438, + "learning_rate": 1.1947104927215789e-05, + "loss": 0.7681, + "step": 39508 + }, + { + "epoch": 0.4938873471836796, + "grad_norm": 3.9222846031188965, + "learning_rate": 1.1946248933539645e-05, + "loss": 0.548, + "step": 39510 + }, + { + "epoch": 0.4939123478086952, + "grad_norm": 0.0009233380551449955, + "learning_rate": 1.1945392925041143e-05, + "loss": 0.7977, + "step": 39512 + }, + { + "epoch": 0.49393734843371084, + "grad_norm": 4.172131538391113, + "learning_rate": 1.1944536901726807e-05, + "loss": 0.7004, + "step": 39514 + }, + { + "epoch": 0.4939623490587265, + "grad_norm": 6.81453800201416, + "learning_rate": 1.1943680863603149e-05, + "loss": 2.0002, + "step": 39516 + }, + { + "epoch": 0.4939873496837421, + "grad_norm": 2.144796848297119, + "learning_rate": 1.1942824810676695e-05, + "loss": 1.0813, + "step": 39518 + }, + { + "epoch": 0.49401235030875773, + "grad_norm": 2.229396104812622, + "learning_rate": 1.1941968742953963e-05, + "loss": 1.1282, + "step": 39520 + }, + { + "epoch": 0.4940373509337733, + "grad_norm": 3.3417046070098877, + "learning_rate": 1.194111266044147e-05, + "loss": 0.6223, + "step": 39522 + }, + { + "epoch": 0.49406235155878897, + "grad_norm": 4.309230804443359, + "learning_rate": 1.1940256563145736e-05, + "loss": 1.2194, + "step": 39524 + }, + { + "epoch": 0.4940873521838046, + "grad_norm": 2.3515734672546387, + "learning_rate": 1.1939400451073284e-05, + "loss": 1.6528, + "step": 39526 + }, + { + "epoch": 0.4941123528088202, + "grad_norm": 0.21738532185554504, + "learning_rate": 1.1938544324230632e-05, + "loss": 0.3978, + "step": 39528 + }, + { + "epoch": 0.49413735343383586, + "grad_norm": 1.7479889392852783, + "learning_rate": 1.1937688182624298e-05, + "loss": 0.9065, + "step": 39530 + }, + { + "epoch": 0.49416235405885145, + "grad_norm": 0.0010631964541971684, + "learning_rate": 1.193683202626081e-05, + "loss": 0.0098, + "step": 39532 + }, + { + "epoch": 0.4941873546838671, + "grad_norm": 0.5600419640541077, + "learning_rate": 1.1935975855146681e-05, + "loss": 0.0303, + "step": 39534 + }, + { + "epoch": 0.49421235530888274, + "grad_norm": 0.4728289246559143, + "learning_rate": 1.1935119669288434e-05, + "loss": 0.0286, + "step": 39536 + }, + { + "epoch": 0.49423735593389834, + "grad_norm": 3.5225186347961426, + "learning_rate": 1.1934263468692589e-05, + "loss": 0.7179, + "step": 39538 + }, + { + "epoch": 0.494262356558914, + "grad_norm": 0.5200410485267639, + "learning_rate": 1.1933407253365665e-05, + "loss": 0.5909, + "step": 39540 + }, + { + "epoch": 0.4942873571839296, + "grad_norm": 0.6777925491333008, + "learning_rate": 1.1932551023314188e-05, + "loss": 0.8729, + "step": 39542 + }, + { + "epoch": 0.4943123578089452, + "grad_norm": 3.4965124130249023, + "learning_rate": 1.1931694778544672e-05, + "loss": 0.6148, + "step": 39544 + }, + { + "epoch": 0.49433735843396087, + "grad_norm": 4.26306676864624, + "learning_rate": 1.1930838519063645e-05, + "loss": 0.8462, + "step": 39546 + }, + { + "epoch": 0.49436235905897646, + "grad_norm": 4.259152412414551, + "learning_rate": 1.1929982244877625e-05, + "loss": 0.1004, + "step": 39548 + }, + { + "epoch": 0.4943873596839921, + "grad_norm": 3.5732932090759277, + "learning_rate": 1.192912595599313e-05, + "loss": 1.6053, + "step": 39550 + }, + { + "epoch": 0.4944123603090077, + "grad_norm": 4.3269758224487305, + "learning_rate": 1.1928269652416685e-05, + "loss": 1.0166, + "step": 39552 + }, + { + "epoch": 0.49443736093402335, + "grad_norm": 3.7135725021362305, + "learning_rate": 1.1927413334154814e-05, + "loss": 0.2139, + "step": 39554 + }, + { + "epoch": 0.494462361559039, + "grad_norm": 0.0010519300121814013, + "learning_rate": 1.1926557001214035e-05, + "loss": 0.1617, + "step": 39556 + }, + { + "epoch": 0.4944873621840546, + "grad_norm": 3.752671003341675, + "learning_rate": 1.1925700653600866e-05, + "loss": 1.2151, + "step": 39558 + }, + { + "epoch": 0.49451236280907024, + "grad_norm": 3.7768523693084717, + "learning_rate": 1.1924844291321839e-05, + "loss": 1.0978, + "step": 39560 + }, + { + "epoch": 0.49453736343408583, + "grad_norm": 5.776876449584961, + "learning_rate": 1.1923987914383468e-05, + "loss": 0.8524, + "step": 39562 + }, + { + "epoch": 0.4945623640591015, + "grad_norm": 5.030548572540283, + "learning_rate": 1.1923131522792275e-05, + "loss": 1.4423, + "step": 39564 + }, + { + "epoch": 0.4945873646841171, + "grad_norm": 4.883152484893799, + "learning_rate": 1.1922275116554787e-05, + "loss": 1.8933, + "step": 39566 + }, + { + "epoch": 0.4946123653091327, + "grad_norm": 3.598386526107788, + "learning_rate": 1.1921418695677523e-05, + "loss": 0.6798, + "step": 39568 + }, + { + "epoch": 0.49463736593414837, + "grad_norm": 4.070552349090576, + "learning_rate": 1.1920562260167003e-05, + "loss": 1.6237, + "step": 39570 + }, + { + "epoch": 0.49466236655916396, + "grad_norm": 2.141249179840088, + "learning_rate": 1.1919705810029758e-05, + "loss": 0.8282, + "step": 39572 + }, + { + "epoch": 0.4946873671841796, + "grad_norm": 4.119268417358398, + "learning_rate": 1.19188493452723e-05, + "loss": 0.6444, + "step": 39574 + }, + { + "epoch": 0.49471236780919525, + "grad_norm": 5.929835319519043, + "learning_rate": 1.191799286590116e-05, + "loss": 1.5292, + "step": 39576 + }, + { + "epoch": 0.49473736843421084, + "grad_norm": 3.9232051372528076, + "learning_rate": 1.1917136371922853e-05, + "loss": 0.6743, + "step": 39578 + }, + { + "epoch": 0.4947623690592265, + "grad_norm": 3.103391408920288, + "learning_rate": 1.191627986334391e-05, + "loss": 1.7199, + "step": 39580 + }, + { + "epoch": 0.4947873696842421, + "grad_norm": 0.0016725322930142283, + "learning_rate": 1.1915423340170851e-05, + "loss": 0.819, + "step": 39582 + }, + { + "epoch": 0.49481237030925773, + "grad_norm": 3.3577075004577637, + "learning_rate": 1.1914566802410198e-05, + "loss": 1.2233, + "step": 39584 + }, + { + "epoch": 0.4948373709342734, + "grad_norm": 4.814577102661133, + "learning_rate": 1.1913710250068475e-05, + "loss": 1.2989, + "step": 39586 + }, + { + "epoch": 0.49486237155928897, + "grad_norm": 2.6758952140808105, + "learning_rate": 1.1912853683152206e-05, + "loss": 0.9676, + "step": 39588 + }, + { + "epoch": 0.4948873721843046, + "grad_norm": 2.799391508102417, + "learning_rate": 1.1911997101667912e-05, + "loss": 0.1603, + "step": 39590 + }, + { + "epoch": 0.4949123728093202, + "grad_norm": 2.1572322845458984, + "learning_rate": 1.1911140505622121e-05, + "loss": 1.094, + "step": 39592 + }, + { + "epoch": 0.49493737343433586, + "grad_norm": 3.801703691482544, + "learning_rate": 1.191028389502135e-05, + "loss": 0.1525, + "step": 39594 + }, + { + "epoch": 0.4949623740593515, + "grad_norm": 6.261645793914795, + "learning_rate": 1.1909427269872131e-05, + "loss": 1.5389, + "step": 39596 + }, + { + "epoch": 0.4949873746843671, + "grad_norm": 2.664289712905884, + "learning_rate": 1.1908570630180984e-05, + "loss": 1.4163, + "step": 39598 + }, + { + "epoch": 0.49501237530938275, + "grad_norm": 5.093279838562012, + "learning_rate": 1.1907713975954432e-05, + "loss": 0.7739, + "step": 39600 + }, + { + "epoch": 0.49503737593439834, + "grad_norm": 2.408277988433838, + "learning_rate": 1.1906857307198996e-05, + "loss": 0.3508, + "step": 39602 + }, + { + "epoch": 0.495062376559414, + "grad_norm": 3.811575412750244, + "learning_rate": 1.1906000623921207e-05, + "loss": 0.5918, + "step": 39604 + }, + { + "epoch": 0.49508737718442963, + "grad_norm": 5.971356391906738, + "learning_rate": 1.1905143926127591e-05, + "loss": 1.956, + "step": 39606 + }, + { + "epoch": 0.4951123778094452, + "grad_norm": 4.128951072692871, + "learning_rate": 1.1904287213824665e-05, + "loss": 0.8899, + "step": 39608 + }, + { + "epoch": 0.4951373784344609, + "grad_norm": 3.042466402053833, + "learning_rate": 1.1903430487018956e-05, + "loss": 0.9398, + "step": 39610 + }, + { + "epoch": 0.49516237905947647, + "grad_norm": 2.7854366302490234, + "learning_rate": 1.1902573745716995e-05, + "loss": 0.8924, + "step": 39612 + }, + { + "epoch": 0.4951873796844921, + "grad_norm": 3.050276756286621, + "learning_rate": 1.1901716989925293e-05, + "loss": 1.1992, + "step": 39614 + }, + { + "epoch": 0.49521238030950776, + "grad_norm": 2.542595624923706, + "learning_rate": 1.1900860219650387e-05, + "loss": 0.9815, + "step": 39616 + }, + { + "epoch": 0.49523738093452335, + "grad_norm": 2.2242276668548584, + "learning_rate": 1.1900003434898802e-05, + "loss": 0.2211, + "step": 39618 + }, + { + "epoch": 0.495262381559539, + "grad_norm": 8.838839530944824, + "learning_rate": 1.1899146635677055e-05, + "loss": 1.6266, + "step": 39620 + }, + { + "epoch": 0.4952873821845546, + "grad_norm": 3.839054584503174, + "learning_rate": 1.1898289821991676e-05, + "loss": 1.3251, + "step": 39622 + }, + { + "epoch": 0.49531238280957024, + "grad_norm": 7.179720401763916, + "learning_rate": 1.1897432993849193e-05, + "loss": 1.9627, + "step": 39624 + }, + { + "epoch": 0.4953373834345859, + "grad_norm": 5.204295635223389, + "learning_rate": 1.1896576151256125e-05, + "loss": 1.2423, + "step": 39626 + }, + { + "epoch": 0.4953623840596015, + "grad_norm": 3.1689488887786865, + "learning_rate": 1.1895719294219e-05, + "loss": 0.9137, + "step": 39628 + }, + { + "epoch": 0.4953873846846171, + "grad_norm": 0.6106671690940857, + "learning_rate": 1.1894862422744349e-05, + "loss": 0.0161, + "step": 39630 + }, + { + "epoch": 0.4954123853096327, + "grad_norm": 0.0006579423206858337, + "learning_rate": 1.1894005536838694e-05, + "loss": 0.5391, + "step": 39632 + }, + { + "epoch": 0.49543738593464837, + "grad_norm": 0.0010293069062754512, + "learning_rate": 1.189314863650856e-05, + "loss": 0.86, + "step": 39634 + }, + { + "epoch": 0.495462386559664, + "grad_norm": 4.1183624267578125, + "learning_rate": 1.1892291721760474e-05, + "loss": 1.9877, + "step": 39636 + }, + { + "epoch": 0.4954873871846796, + "grad_norm": 3.45546817779541, + "learning_rate": 1.1891434792600962e-05, + "loss": 0.758, + "step": 39638 + }, + { + "epoch": 0.49551238780969525, + "grad_norm": 9.79915714263916, + "learning_rate": 1.1890577849036546e-05, + "loss": 1.7188, + "step": 39640 + }, + { + "epoch": 0.49553738843471085, + "grad_norm": 4.53098726272583, + "learning_rate": 1.188972089107376e-05, + "loss": 1.6015, + "step": 39642 + }, + { + "epoch": 0.4955623890597265, + "grad_norm": 2.7419769763946533, + "learning_rate": 1.1888863918719129e-05, + "loss": 1.4002, + "step": 39644 + }, + { + "epoch": 0.49558738968474214, + "grad_norm": 0.000983203062787652, + "learning_rate": 1.1888006931979175e-05, + "loss": 0.5344, + "step": 39646 + }, + { + "epoch": 0.49561239030975773, + "grad_norm": 2.05983304977417, + "learning_rate": 1.1887149930860426e-05, + "loss": 0.8507, + "step": 39648 + }, + { + "epoch": 0.4956373909347734, + "grad_norm": 3.8463094234466553, + "learning_rate": 1.188629291536941e-05, + "loss": 1.0224, + "step": 39650 + }, + { + "epoch": 0.495662391559789, + "grad_norm": 0.0008207070641219616, + "learning_rate": 1.1885435885512655e-05, + "loss": 0.0019, + "step": 39652 + }, + { + "epoch": 0.4956873921848046, + "grad_norm": 3.302417516708374, + "learning_rate": 1.1884578841296687e-05, + "loss": 1.6484, + "step": 39654 + }, + { + "epoch": 0.49571239280982027, + "grad_norm": 3.2645347118377686, + "learning_rate": 1.1883721782728032e-05, + "loss": 1.3217, + "step": 39656 + }, + { + "epoch": 0.49573739343483586, + "grad_norm": 9.070124626159668, + "learning_rate": 1.1882864709813223e-05, + "loss": 1.6056, + "step": 39658 + }, + { + "epoch": 0.4957623940598515, + "grad_norm": 4.632514953613281, + "learning_rate": 1.1882007622558777e-05, + "loss": 0.5298, + "step": 39660 + }, + { + "epoch": 0.4957873946848671, + "grad_norm": 4.336508750915527, + "learning_rate": 1.188115052097123e-05, + "loss": 2.0262, + "step": 39662 + }, + { + "epoch": 0.49581239530988275, + "grad_norm": 5.542562007904053, + "learning_rate": 1.1880293405057105e-05, + "loss": 1.6263, + "step": 39664 + }, + { + "epoch": 0.4958373959348984, + "grad_norm": 3.4492149353027344, + "learning_rate": 1.187943627482293e-05, + "loss": 1.3734, + "step": 39666 + }, + { + "epoch": 0.495862396559914, + "grad_norm": 3.6762855052948, + "learning_rate": 1.1878579130275237e-05, + "loss": 1.1121, + "step": 39668 + }, + { + "epoch": 0.49588739718492963, + "grad_norm": 4.22918701171875, + "learning_rate": 1.1877721971420551e-05, + "loss": 1.9655, + "step": 39670 + }, + { + "epoch": 0.4959123978099452, + "grad_norm": 4.871767520904541, + "learning_rate": 1.18768647982654e-05, + "loss": 1.4022, + "step": 39672 + }, + { + "epoch": 0.4959373984349609, + "grad_norm": 2.068603038787842, + "learning_rate": 1.1876007610816309e-05, + "loss": 0.2979, + "step": 39674 + }, + { + "epoch": 0.4959623990599765, + "grad_norm": 3.180241823196411, + "learning_rate": 1.1875150409079813e-05, + "loss": 1.0592, + "step": 39676 + }, + { + "epoch": 0.4959873996849921, + "grad_norm": 0.06918182969093323, + "learning_rate": 1.1874293193062434e-05, + "loss": 0.3991, + "step": 39678 + }, + { + "epoch": 0.49601240031000776, + "grad_norm": 2.651293992996216, + "learning_rate": 1.1873435962770704e-05, + "loss": 0.9634, + "step": 39680 + }, + { + "epoch": 0.49603740093502335, + "grad_norm": 3.237880229949951, + "learning_rate": 1.1872578718211152e-05, + "loss": 0.6214, + "step": 39682 + }, + { + "epoch": 0.496062401560039, + "grad_norm": 3.9921419620513916, + "learning_rate": 1.1871721459390304e-05, + "loss": 3.1525, + "step": 39684 + }, + { + "epoch": 0.49608740218505465, + "grad_norm": 2.212137222290039, + "learning_rate": 1.187086418631469e-05, + "loss": 0.2967, + "step": 39686 + }, + { + "epoch": 0.49611240281007024, + "grad_norm": 2.123575448989868, + "learning_rate": 1.187000689899084e-05, + "loss": 1.965, + "step": 39688 + }, + { + "epoch": 0.4961374034350859, + "grad_norm": 1.873184084892273, + "learning_rate": 1.186914959742528e-05, + "loss": 1.2547, + "step": 39690 + }, + { + "epoch": 0.4961624040601015, + "grad_norm": 0.0013425331562757492, + "learning_rate": 1.1868292281624543e-05, + "loss": 0.4603, + "step": 39692 + }, + { + "epoch": 0.49618740468511713, + "grad_norm": 2.5617244243621826, + "learning_rate": 1.1867434951595154e-05, + "loss": 0.8875, + "step": 39694 + }, + { + "epoch": 0.4962124053101328, + "grad_norm": 5.659836292266846, + "learning_rate": 1.1866577607343648e-05, + "loss": 0.8844, + "step": 39696 + }, + { + "epoch": 0.49623740593514837, + "grad_norm": 5.883666515350342, + "learning_rate": 1.1865720248876547e-05, + "loss": 0.7697, + "step": 39698 + }, + { + "epoch": 0.496262406560164, + "grad_norm": 3.0109777450561523, + "learning_rate": 1.1864862876200388e-05, + "loss": 0.836, + "step": 39700 + }, + { + "epoch": 0.4962874071851796, + "grad_norm": 3.1460001468658447, + "learning_rate": 1.1864005489321693e-05, + "loss": 1.8521, + "step": 39702 + }, + { + "epoch": 0.49631240781019526, + "grad_norm": 1.7849153280258179, + "learning_rate": 1.1863148088246999e-05, + "loss": 0.7363, + "step": 39704 + }, + { + "epoch": 0.4963374084352109, + "grad_norm": 0.9991170167922974, + "learning_rate": 1.1862290672982831e-05, + "loss": 0.0269, + "step": 39706 + }, + { + "epoch": 0.4963624090602265, + "grad_norm": 5.518525123596191, + "learning_rate": 1.1861433243535721e-05, + "loss": 1.3014, + "step": 39708 + }, + { + "epoch": 0.49638740968524214, + "grad_norm": 0.007320761680603027, + "learning_rate": 1.1860575799912199e-05, + "loss": 1.5309, + "step": 39710 + }, + { + "epoch": 0.49641241031025773, + "grad_norm": 0.0008546325843781233, + "learning_rate": 1.1859718342118796e-05, + "loss": 0.8481, + "step": 39712 + }, + { + "epoch": 0.4964374109352734, + "grad_norm": 2.4186441898345947, + "learning_rate": 1.1858860870162037e-05, + "loss": 0.5182, + "step": 39714 + }, + { + "epoch": 0.49646241156028903, + "grad_norm": 5.058565616607666, + "learning_rate": 1.185800338404846e-05, + "loss": 0.3619, + "step": 39716 + }, + { + "epoch": 0.4964874121853046, + "grad_norm": 0.0011311067501083016, + "learning_rate": 1.185714588378459e-05, + "loss": 0.5173, + "step": 39718 + }, + { + "epoch": 0.49651241281032027, + "grad_norm": 3.806001663208008, + "learning_rate": 1.1856288369376961e-05, + "loss": 1.4075, + "step": 39720 + }, + { + "epoch": 0.49653741343533586, + "grad_norm": 1.3303459882736206, + "learning_rate": 1.1855430840832102e-05, + "loss": 0.2254, + "step": 39722 + }, + { + "epoch": 0.4965624140603515, + "grad_norm": 4.015680313110352, + "learning_rate": 1.1854573298156544e-05, + "loss": 1.8804, + "step": 39724 + }, + { + "epoch": 0.49658741468536716, + "grad_norm": 3.1391966342926025, + "learning_rate": 1.185371574135682e-05, + "loss": 0.9418, + "step": 39726 + }, + { + "epoch": 0.49661241531038275, + "grad_norm": 2.8510420322418213, + "learning_rate": 1.1852858170439456e-05, + "loss": 0.8819, + "step": 39728 + }, + { + "epoch": 0.4966374159353984, + "grad_norm": 1.8147441148757935, + "learning_rate": 1.1852000585410988e-05, + "loss": 1.2597, + "step": 39730 + }, + { + "epoch": 0.496662416560414, + "grad_norm": 4.3328938484191895, + "learning_rate": 1.1851142986277944e-05, + "loss": 0.4832, + "step": 39732 + }, + { + "epoch": 0.49668741718542964, + "grad_norm": 2.7788965702056885, + "learning_rate": 1.1850285373046857e-05, + "loss": 1.6894, + "step": 39734 + }, + { + "epoch": 0.4967124178104453, + "grad_norm": 4.252071380615234, + "learning_rate": 1.184942774572426e-05, + "loss": 1.2241, + "step": 39736 + }, + { + "epoch": 0.4967374184354609, + "grad_norm": 1.101167917251587, + "learning_rate": 1.1848570104316681e-05, + "loss": 0.0894, + "step": 39738 + }, + { + "epoch": 0.4967624190604765, + "grad_norm": 6.365849494934082, + "learning_rate": 1.1847712448830654e-05, + "loss": 2.1473, + "step": 39740 + }, + { + "epoch": 0.4967874196854921, + "grad_norm": 3.6759629249572754, + "learning_rate": 1.184685477927271e-05, + "loss": 1.101, + "step": 39742 + }, + { + "epoch": 0.49681242031050776, + "grad_norm": 4.851124286651611, + "learning_rate": 1.1845997095649381e-05, + "loss": 0.3566, + "step": 39744 + }, + { + "epoch": 0.4968374209355234, + "grad_norm": 3.58919620513916, + "learning_rate": 1.18451393979672e-05, + "loss": 2.0079, + "step": 39746 + }, + { + "epoch": 0.496862421560539, + "grad_norm": 3.4237723350524902, + "learning_rate": 1.18442816862327e-05, + "loss": 0.89, + "step": 39748 + }, + { + "epoch": 0.49688742218555465, + "grad_norm": 2.609790325164795, + "learning_rate": 1.184342396045241e-05, + "loss": 1.3493, + "step": 39750 + }, + { + "epoch": 0.49691242281057024, + "grad_norm": 4.156854629516602, + "learning_rate": 1.1842566220632863e-05, + "loss": 0.8509, + "step": 39752 + }, + { + "epoch": 0.4969374234355859, + "grad_norm": 0.006162095349282026, + "learning_rate": 1.1841708466780588e-05, + "loss": 0.9659, + "step": 39754 + }, + { + "epoch": 0.49696242406060154, + "grad_norm": 6.78322696685791, + "learning_rate": 1.1840850698902128e-05, + "loss": 1.448, + "step": 39756 + }, + { + "epoch": 0.49698742468561713, + "grad_norm": 3.328547239303589, + "learning_rate": 1.1839992917004008e-05, + "loss": 1.3303, + "step": 39758 + }, + { + "epoch": 0.4970124253106328, + "grad_norm": 1.9367481470108032, + "learning_rate": 1.1839135121092762e-05, + "loss": 0.8877, + "step": 39760 + }, + { + "epoch": 0.49703742593564837, + "grad_norm": 0.0015702091623097658, + "learning_rate": 1.1838277311174924e-05, + "loss": 0.663, + "step": 39762 + }, + { + "epoch": 0.497062426560664, + "grad_norm": 0.37915533781051636, + "learning_rate": 1.1837419487257024e-05, + "loss": 0.0543, + "step": 39764 + }, + { + "epoch": 0.49708742718567966, + "grad_norm": 2.4277851581573486, + "learning_rate": 1.18365616493456e-05, + "loss": 0.7509, + "step": 39766 + }, + { + "epoch": 0.49711242781069526, + "grad_norm": 1.3728188276290894, + "learning_rate": 1.1835703797447178e-05, + "loss": 0.0513, + "step": 39768 + }, + { + "epoch": 0.4971374284357109, + "grad_norm": 2.350160598754883, + "learning_rate": 1.1834845931568298e-05, + "loss": 0.1031, + "step": 39770 + }, + { + "epoch": 0.4971624290607265, + "grad_norm": 2.4002878665924072, + "learning_rate": 1.1833988051715491e-05, + "loss": 1.1845, + "step": 39772 + }, + { + "epoch": 0.49718742968574214, + "grad_norm": 3.053724527359009, + "learning_rate": 1.1833130157895289e-05, + "loss": 1.2297, + "step": 39774 + }, + { + "epoch": 0.4972124303107578, + "grad_norm": 0.0014479542151093483, + "learning_rate": 1.1832272250114227e-05, + "loss": 0.0295, + "step": 39776 + }, + { + "epoch": 0.4972374309357734, + "grad_norm": 3.5988097190856934, + "learning_rate": 1.183141432837884e-05, + "loss": 1.9095, + "step": 39778 + }, + { + "epoch": 0.49726243156078903, + "grad_norm": 4.715244293212891, + "learning_rate": 1.1830556392695658e-05, + "loss": 1.2578, + "step": 39780 + }, + { + "epoch": 0.4972874321858046, + "grad_norm": 0.02969651110470295, + "learning_rate": 1.1829698443071221e-05, + "loss": 0.072, + "step": 39782 + }, + { + "epoch": 0.49731243281082027, + "grad_norm": 0.0037774424999952316, + "learning_rate": 1.1828840479512055e-05, + "loss": 0.0577, + "step": 39784 + }, + { + "epoch": 0.4973374334358359, + "grad_norm": 2.564138412475586, + "learning_rate": 1.1827982502024702e-05, + "loss": 0.3894, + "step": 39786 + }, + { + "epoch": 0.4973624340608515, + "grad_norm": 3.694211006164551, + "learning_rate": 1.1827124510615692e-05, + "loss": 0.6297, + "step": 39788 + }, + { + "epoch": 0.49738743468586716, + "grad_norm": 1.9019542932510376, + "learning_rate": 1.1826266505291561e-05, + "loss": 0.7839, + "step": 39790 + }, + { + "epoch": 0.49741243531088275, + "grad_norm": 2.048341751098633, + "learning_rate": 1.1825408486058837e-05, + "loss": 1.0926, + "step": 39792 + }, + { + "epoch": 0.4974374359358984, + "grad_norm": 5.983281135559082, + "learning_rate": 1.1824550452924069e-05, + "loss": 1.5205, + "step": 39794 + }, + { + "epoch": 0.49746243656091405, + "grad_norm": 1.9435182809829712, + "learning_rate": 1.1823692405893778e-05, + "loss": 0.0775, + "step": 39796 + }, + { + "epoch": 0.49748743718592964, + "grad_norm": 2.6584882736206055, + "learning_rate": 1.1822834344974503e-05, + "loss": 2.0624, + "step": 39798 + }, + { + "epoch": 0.4975124378109453, + "grad_norm": 3.3986034393310547, + "learning_rate": 1.1821976270172779e-05, + "loss": 1.874, + "step": 39800 + }, + { + "epoch": 0.4975374384359609, + "grad_norm": 6.469125270843506, + "learning_rate": 1.1821118181495146e-05, + "loss": 1.6271, + "step": 39802 + }, + { + "epoch": 0.4975624390609765, + "grad_norm": 0.09972377866506577, + "learning_rate": 1.1820260078948128e-05, + "loss": 0.3349, + "step": 39804 + }, + { + "epoch": 0.49758743968599217, + "grad_norm": 2.8727777004241943, + "learning_rate": 1.181940196253827e-05, + "loss": 1.0941, + "step": 39806 + }, + { + "epoch": 0.49761244031100776, + "grad_norm": 2.863227367401123, + "learning_rate": 1.1818543832272105e-05, + "loss": 0.7989, + "step": 39808 + }, + { + "epoch": 0.4976374409360234, + "grad_norm": 3.543992757797241, + "learning_rate": 1.1817685688156166e-05, + "loss": 1.5447, + "step": 39810 + }, + { + "epoch": 0.497662441561039, + "grad_norm": 1.8221323490142822, + "learning_rate": 1.1816827530196987e-05, + "loss": 0.0854, + "step": 39812 + }, + { + "epoch": 0.49768744218605465, + "grad_norm": 3.18025541305542, + "learning_rate": 1.1815969358401114e-05, + "loss": 0.9175, + "step": 39814 + }, + { + "epoch": 0.4977124428110703, + "grad_norm": 3.390636444091797, + "learning_rate": 1.1815111172775067e-05, + "loss": 0.4898, + "step": 39816 + }, + { + "epoch": 0.4977374434360859, + "grad_norm": 15.563796997070312, + "learning_rate": 1.1814252973325394e-05, + "loss": 1.1216, + "step": 39818 + }, + { + "epoch": 0.49776244406110154, + "grad_norm": 1.2120317220687866, + "learning_rate": 1.1813394760058628e-05, + "loss": 0.3666, + "step": 39820 + }, + { + "epoch": 0.49778744468611713, + "grad_norm": 3.479452133178711, + "learning_rate": 1.1812536532981302e-05, + "loss": 1.46, + "step": 39822 + }, + { + "epoch": 0.4978124453111328, + "grad_norm": 2.5547921657562256, + "learning_rate": 1.1811678292099953e-05, + "loss": 1.1079, + "step": 39824 + }, + { + "epoch": 0.4978374459361484, + "grad_norm": 0.15587252378463745, + "learning_rate": 1.181082003742112e-05, + "loss": 0.689, + "step": 39826 + }, + { + "epoch": 0.497862446561164, + "grad_norm": 3.6685385704040527, + "learning_rate": 1.1809961768951336e-05, + "loss": 0.8011, + "step": 39828 + }, + { + "epoch": 0.49788744718617967, + "grad_norm": 0.07332322001457214, + "learning_rate": 1.1809103486697142e-05, + "loss": 0.8783, + "step": 39830 + }, + { + "epoch": 0.49791244781119526, + "grad_norm": 2.2896976470947266, + "learning_rate": 1.1808245190665068e-05, + "loss": 0.6784, + "step": 39832 + }, + { + "epoch": 0.4979374484362109, + "grad_norm": 3.408660888671875, + "learning_rate": 1.180738688086166e-05, + "loss": 0.7854, + "step": 39834 + }, + { + "epoch": 0.49796244906122655, + "grad_norm": 0.0006980904727242887, + "learning_rate": 1.1806528557293443e-05, + "loss": 0.0419, + "step": 39836 + }, + { + "epoch": 0.49798744968624215, + "grad_norm": 5.093539714813232, + "learning_rate": 1.1805670219966964e-05, + "loss": 1.4968, + "step": 39838 + }, + { + "epoch": 0.4980124503112578, + "grad_norm": 0.5343241095542908, + "learning_rate": 1.1804811868888756e-05, + "loss": 0.1391, + "step": 39840 + }, + { + "epoch": 0.4980374509362734, + "grad_norm": 2.3174803256988525, + "learning_rate": 1.1803953504065354e-05, + "loss": 0.9299, + "step": 39842 + }, + { + "epoch": 0.49806245156128903, + "grad_norm": 3.8588643074035645, + "learning_rate": 1.1803095125503297e-05, + "loss": 2.478, + "step": 39844 + }, + { + "epoch": 0.4980874521863047, + "grad_norm": 1.7579116821289062, + "learning_rate": 1.1802236733209124e-05, + "loss": 1.901, + "step": 39846 + }, + { + "epoch": 0.49811245281132027, + "grad_norm": 2.198106288909912, + "learning_rate": 1.180137832718937e-05, + "loss": 0.5761, + "step": 39848 + }, + { + "epoch": 0.4981374534363359, + "grad_norm": 4.522950172424316, + "learning_rate": 1.1800519907450574e-05, + "loss": 1.9617, + "step": 39850 + }, + { + "epoch": 0.4981624540613515, + "grad_norm": 1.615281343460083, + "learning_rate": 1.1799661473999271e-05, + "loss": 0.2049, + "step": 39852 + }, + { + "epoch": 0.49818745468636716, + "grad_norm": 1.3691898584365845, + "learning_rate": 1.1798803026842003e-05, + "loss": 0.7801, + "step": 39854 + }, + { + "epoch": 0.4982124553113828, + "grad_norm": 3.2048635482788086, + "learning_rate": 1.1797944565985304e-05, + "loss": 0.4418, + "step": 39856 + }, + { + "epoch": 0.4982374559363984, + "grad_norm": 2.744739294052124, + "learning_rate": 1.1797086091435715e-05, + "loss": 1.7989, + "step": 39858 + }, + { + "epoch": 0.49826245656141405, + "grad_norm": 0.0006434365641325712, + "learning_rate": 1.1796227603199773e-05, + "loss": 0.0, + "step": 39860 + }, + { + "epoch": 0.49828745718642964, + "grad_norm": 2.340277671813965, + "learning_rate": 1.1795369101284013e-05, + "loss": 1.243, + "step": 39862 + }, + { + "epoch": 0.4983124578114453, + "grad_norm": 4.077486515045166, + "learning_rate": 1.1794510585694978e-05, + "loss": 1.5449, + "step": 39864 + }, + { + "epoch": 0.49833745843646093, + "grad_norm": 3.179030656814575, + "learning_rate": 1.1793652056439202e-05, + "loss": 1.1114, + "step": 39866 + }, + { + "epoch": 0.4983624590614765, + "grad_norm": 0.47092190384864807, + "learning_rate": 1.1792793513523227e-05, + "loss": 0.1965, + "step": 39868 + }, + { + "epoch": 0.4983874596864922, + "grad_norm": 2.7922823429107666, + "learning_rate": 1.179193495695359e-05, + "loss": 1.7029, + "step": 39870 + }, + { + "epoch": 0.49841246031150777, + "grad_norm": 0.0014030019519850612, + "learning_rate": 1.179107638673683e-05, + "loss": 0.4013, + "step": 39872 + }, + { + "epoch": 0.4984374609365234, + "grad_norm": 4.529566287994385, + "learning_rate": 1.1790217802879487e-05, + "loss": 0.7334, + "step": 39874 + }, + { + "epoch": 0.49846246156153906, + "grad_norm": 0.17859730124473572, + "learning_rate": 1.1789359205388096e-05, + "loss": 0.4023, + "step": 39876 + }, + { + "epoch": 0.49848746218655465, + "grad_norm": 6.365379333496094, + "learning_rate": 1.1788500594269197e-05, + "loss": 1.5741, + "step": 39878 + }, + { + "epoch": 0.4985124628115703, + "grad_norm": 2.6736249923706055, + "learning_rate": 1.1787641969529333e-05, + "loss": 0.6182, + "step": 39880 + }, + { + "epoch": 0.4985374634365859, + "grad_norm": 16.08959197998047, + "learning_rate": 1.178678333117504e-05, + "loss": 0.4701, + "step": 39882 + }, + { + "epoch": 0.49856246406160154, + "grad_norm": 3.8682525157928467, + "learning_rate": 1.1785924679212856e-05, + "loss": 1.0612, + "step": 39884 + }, + { + "epoch": 0.4985874646866172, + "grad_norm": 2.498810291290283, + "learning_rate": 1.1785066013649326e-05, + "loss": 0.6113, + "step": 39886 + }, + { + "epoch": 0.4986124653116328, + "grad_norm": 3.3471317291259766, + "learning_rate": 1.1784207334490984e-05, + "loss": 1.1073, + "step": 39888 + }, + { + "epoch": 0.4986374659366484, + "grad_norm": 4.095739364624023, + "learning_rate": 1.178334864174437e-05, + "loss": 1.8375, + "step": 39890 + }, + { + "epoch": 0.498662466561664, + "grad_norm": 0.36044371128082275, + "learning_rate": 1.1782489935416027e-05, + "loss": 0.0072, + "step": 39892 + }, + { + "epoch": 0.49868746718667967, + "grad_norm": 4.1805195808410645, + "learning_rate": 1.178163121551249e-05, + "loss": 1.1293, + "step": 39894 + }, + { + "epoch": 0.4987124678116953, + "grad_norm": 0.5479249954223633, + "learning_rate": 1.1780772482040303e-05, + "loss": 1.0404, + "step": 39896 + }, + { + "epoch": 0.4987374684367109, + "grad_norm": 6.736414432525635, + "learning_rate": 1.1779913735006005e-05, + "loss": 1.5962, + "step": 39898 + }, + { + "epoch": 0.49876246906172655, + "grad_norm": 0.0006508238147944212, + "learning_rate": 1.1779054974416138e-05, + "loss": 0.0, + "step": 39900 + }, + { + "epoch": 0.49878746968674215, + "grad_norm": 4.300708293914795, + "learning_rate": 1.1778196200277235e-05, + "loss": 0.2071, + "step": 39902 + }, + { + "epoch": 0.4988124703117578, + "grad_norm": 0.0008109824266284704, + "learning_rate": 1.1777337412595843e-05, + "loss": 0.3148, + "step": 39904 + }, + { + "epoch": 0.49883747093677344, + "grad_norm": 2.5659902095794678, + "learning_rate": 1.1776478611378501e-05, + "loss": 1.0715, + "step": 39906 + }, + { + "epoch": 0.49886247156178903, + "grad_norm": 2.976834297180176, + "learning_rate": 1.1775619796631749e-05, + "loss": 0.6392, + "step": 39908 + }, + { + "epoch": 0.4988874721868047, + "grad_norm": 0.8388218283653259, + "learning_rate": 1.177476096836213e-05, + "loss": 0.8473, + "step": 39910 + }, + { + "epoch": 0.4989124728118203, + "grad_norm": 2.3672168254852295, + "learning_rate": 1.177390212657618e-05, + "loss": 1.1851, + "step": 39912 + }, + { + "epoch": 0.4989374734368359, + "grad_norm": 3.8744635581970215, + "learning_rate": 1.1773043271280441e-05, + "loss": 1.3287, + "step": 39914 + }, + { + "epoch": 0.49896247406185157, + "grad_norm": 3.63313364982605, + "learning_rate": 1.1772184402481459e-05, + "loss": 0.745, + "step": 39916 + }, + { + "epoch": 0.49898747468686716, + "grad_norm": 2.0384819507598877, + "learning_rate": 1.1771325520185768e-05, + "loss": 0.1541, + "step": 39918 + }, + { + "epoch": 0.4990124753118828, + "grad_norm": 2.483572006225586, + "learning_rate": 1.1770466624399912e-05, + "loss": 1.1303, + "step": 39920 + }, + { + "epoch": 0.4990374759368984, + "grad_norm": 1.8628178834915161, + "learning_rate": 1.1769607715130433e-05, + "loss": 1.2153, + "step": 39922 + }, + { + "epoch": 0.49906247656191405, + "grad_norm": 2.2925777435302734, + "learning_rate": 1.1768748792383873e-05, + "loss": 0.7509, + "step": 39924 + }, + { + "epoch": 0.4990874771869297, + "grad_norm": 2.5059573650360107, + "learning_rate": 1.1767889856166774e-05, + "loss": 0.3226, + "step": 39926 + }, + { + "epoch": 0.4991124778119453, + "grad_norm": 0.0005546223255805671, + "learning_rate": 1.1767030906485675e-05, + "loss": 0.9971, + "step": 39928 + }, + { + "epoch": 0.49913747843696094, + "grad_norm": 4.536005020141602, + "learning_rate": 1.1766171943347115e-05, + "loss": 1.1859, + "step": 39930 + }, + { + "epoch": 0.4991624790619765, + "grad_norm": 4.285913944244385, + "learning_rate": 1.1765312966757643e-05, + "loss": 1.4423, + "step": 39932 + }, + { + "epoch": 0.4991874796869922, + "grad_norm": 4.099671840667725, + "learning_rate": 1.1764453976723795e-05, + "loss": 0.731, + "step": 39934 + }, + { + "epoch": 0.4992124803120078, + "grad_norm": 0.0031683784909546375, + "learning_rate": 1.1763594973252114e-05, + "loss": 0.4549, + "step": 39936 + }, + { + "epoch": 0.4992374809370234, + "grad_norm": 2.5961852073669434, + "learning_rate": 1.176273595634915e-05, + "loss": 0.1271, + "step": 39938 + }, + { + "epoch": 0.49926248156203906, + "grad_norm": 4.788088321685791, + "learning_rate": 1.1761876926021432e-05, + "loss": 1.6952, + "step": 39940 + }, + { + "epoch": 0.49928748218705465, + "grad_norm": 14.406900405883789, + "learning_rate": 1.176101788227551e-05, + "loss": 0.9636, + "step": 39942 + }, + { + "epoch": 0.4993124828120703, + "grad_norm": 3.8692080974578857, + "learning_rate": 1.1760158825117923e-05, + "loss": 0.8422, + "step": 39944 + }, + { + "epoch": 0.49933748343708595, + "grad_norm": 2.2377758026123047, + "learning_rate": 1.1759299754555217e-05, + "loss": 0.2427, + "step": 39946 + }, + { + "epoch": 0.49936248406210154, + "grad_norm": 0.572400689125061, + "learning_rate": 1.1758440670593931e-05, + "loss": 0.5778, + "step": 39948 + }, + { + "epoch": 0.4993874846871172, + "grad_norm": 4.58272647857666, + "learning_rate": 1.1757581573240611e-05, + "loss": 1.3427, + "step": 39950 + }, + { + "epoch": 0.4994124853121328, + "grad_norm": 2.3605921268463135, + "learning_rate": 1.17567224625018e-05, + "loss": 0.4459, + "step": 39952 + }, + { + "epoch": 0.49943748593714843, + "grad_norm": 2.360292673110962, + "learning_rate": 1.1755863338384037e-05, + "loss": 0.7685, + "step": 39954 + }, + { + "epoch": 0.4994624865621641, + "grad_norm": 0.08345088362693787, + "learning_rate": 1.1755004200893865e-05, + "loss": 1.0872, + "step": 39956 + }, + { + "epoch": 0.49948748718717967, + "grad_norm": 3.045053243637085, + "learning_rate": 1.1754145050037836e-05, + "loss": 0.5615, + "step": 39958 + }, + { + "epoch": 0.4995124878121953, + "grad_norm": 4.473968982696533, + "learning_rate": 1.1753285885822481e-05, + "loss": 0.9272, + "step": 39960 + }, + { + "epoch": 0.4995374884372109, + "grad_norm": 0.0006182098295539618, + "learning_rate": 1.175242670825435e-05, + "loss": 0.3142, + "step": 39962 + }, + { + "epoch": 0.49956248906222656, + "grad_norm": 3.2974469661712646, + "learning_rate": 1.1751567517339986e-05, + "loss": 1.5492, + "step": 39964 + }, + { + "epoch": 0.4995874896872422, + "grad_norm": 0.0006708634318783879, + "learning_rate": 1.175070831308593e-05, + "loss": 0.0, + "step": 39966 + }, + { + "epoch": 0.4996124903122578, + "grad_norm": 0.0009850545320659876, + "learning_rate": 1.1749849095498726e-05, + "loss": 0.5031, + "step": 39968 + }, + { + "epoch": 0.49963749093727344, + "grad_norm": 2.9333605766296387, + "learning_rate": 1.174898986458492e-05, + "loss": 0.4661, + "step": 39970 + }, + { + "epoch": 0.49966249156228904, + "grad_norm": 4.84544038772583, + "learning_rate": 1.1748130620351054e-05, + "loss": 1.6797, + "step": 39972 + }, + { + "epoch": 0.4996874921873047, + "grad_norm": 4.348859786987305, + "learning_rate": 1.1747271362803674e-05, + "loss": 1.5157, + "step": 39974 + }, + { + "epoch": 0.49971249281232033, + "grad_norm": 4.599412441253662, + "learning_rate": 1.174641209194932e-05, + "loss": 0.7717, + "step": 39976 + }, + { + "epoch": 0.4997374934373359, + "grad_norm": 3.430736780166626, + "learning_rate": 1.1745552807794543e-05, + "loss": 0.2599, + "step": 39978 + }, + { + "epoch": 0.49976249406235157, + "grad_norm": 2.7592239379882812, + "learning_rate": 1.1744693510345879e-05, + "loss": 1.0852, + "step": 39980 + }, + { + "epoch": 0.49978749468736716, + "grad_norm": 2.798441171646118, + "learning_rate": 1.1743834199609874e-05, + "loss": 0.3585, + "step": 39982 + }, + { + "epoch": 0.4998124953123828, + "grad_norm": 1.3599790334701538, + "learning_rate": 1.1742974875593079e-05, + "loss": 0.952, + "step": 39984 + }, + { + "epoch": 0.49983749593739846, + "grad_norm": 0.003051051637157798, + "learning_rate": 1.174211553830203e-05, + "loss": 0.8725, + "step": 39986 + }, + { + "epoch": 0.49986249656241405, + "grad_norm": 2.832648277282715, + "learning_rate": 1.1741256187743277e-05, + "loss": 0.7285, + "step": 39988 + }, + { + "epoch": 0.4998874971874297, + "grad_norm": 2.8141837120056152, + "learning_rate": 1.1740396823923365e-05, + "loss": 0.2022, + "step": 39990 + }, + { + "epoch": 0.4999124978124453, + "grad_norm": 4.014220714569092, + "learning_rate": 1.1739537446848834e-05, + "loss": 0.9755, + "step": 39992 + }, + { + "epoch": 0.49993749843746094, + "grad_norm": 4.020103454589844, + "learning_rate": 1.1738678056526232e-05, + "loss": 1.2119, + "step": 39994 + }, + { + "epoch": 0.4999624990624766, + "grad_norm": 0.0009294369956478477, + "learning_rate": 1.1737818652962106e-05, + "loss": 0.4368, + "step": 39996 + }, + { + "epoch": 0.4999874996874922, + "grad_norm": 6.926454067230225, + "learning_rate": 1.1736959236162997e-05, + "loss": 1.3954, + "step": 39998 + }, + { + "epoch": 0.5000125003125078, + "grad_norm": 2.3588883876800537, + "learning_rate": 1.1736099806135452e-05, + "loss": 0.3733, + "step": 40000 + }, + { + "epoch": 0.5000375009375234, + "grad_norm": 0.00034347292967140675, + "learning_rate": 1.1735240362886018e-05, + "loss": 0.0338, + "step": 40002 + }, + { + "epoch": 0.5000625015625391, + "grad_norm": 3.088214635848999, + "learning_rate": 1.1734380906421239e-05, + "loss": 1.0983, + "step": 40004 + }, + { + "epoch": 0.5000875021875547, + "grad_norm": 2.7789194583892822, + "learning_rate": 1.1733521436747656e-05, + "loss": 1.5876, + "step": 40006 + }, + { + "epoch": 0.5001125028125704, + "grad_norm": 4.015364646911621, + "learning_rate": 1.1732661953871822e-05, + "loss": 2.0771, + "step": 40008 + }, + { + "epoch": 0.5001375034375859, + "grad_norm": 4.057159900665283, + "learning_rate": 1.1731802457800283e-05, + "loss": 0.8935, + "step": 40010 + }, + { + "epoch": 0.5001625040626015, + "grad_norm": 3.7978525161743164, + "learning_rate": 1.1730942948539576e-05, + "loss": 1.0105, + "step": 40012 + }, + { + "epoch": 0.5001875046876172, + "grad_norm": 3.3252854347229004, + "learning_rate": 1.1730083426096255e-05, + "loss": 1.015, + "step": 40014 + }, + { + "epoch": 0.5002125053126328, + "grad_norm": 2.649523973464966, + "learning_rate": 1.1729223890476865e-05, + "loss": 1.1267, + "step": 40016 + }, + { + "epoch": 0.5002375059376485, + "grad_norm": 4.507501125335693, + "learning_rate": 1.1728364341687944e-05, + "loss": 2.0488, + "step": 40018 + }, + { + "epoch": 0.500262506562664, + "grad_norm": 2.3772833347320557, + "learning_rate": 1.1727504779736048e-05, + "loss": 1.139, + "step": 40020 + }, + { + "epoch": 0.5002875071876797, + "grad_norm": 4.808604717254639, + "learning_rate": 1.1726645204627722e-05, + "loss": 1.4842, + "step": 40022 + }, + { + "epoch": 0.5003125078126953, + "grad_norm": 2.8057570457458496, + "learning_rate": 1.1725785616369509e-05, + "loss": 1.1249, + "step": 40024 + }, + { + "epoch": 0.500337508437711, + "grad_norm": 0.007635224610567093, + "learning_rate": 1.1724926014967956e-05, + "loss": 0.7075, + "step": 40026 + }, + { + "epoch": 0.5003625090627266, + "grad_norm": 6.792028427124023, + "learning_rate": 1.172406640042961e-05, + "loss": 1.7604, + "step": 40028 + }, + { + "epoch": 0.5003875096877421, + "grad_norm": 0.9862025380134583, + "learning_rate": 1.172320677276102e-05, + "loss": 1.3955, + "step": 40030 + }, + { + "epoch": 0.5004125103127578, + "grad_norm": 6.118257999420166, + "learning_rate": 1.1722347131968727e-05, + "loss": 0.4708, + "step": 40032 + }, + { + "epoch": 0.5004375109377734, + "grad_norm": 5.325932025909424, + "learning_rate": 1.1721487478059286e-05, + "loss": 1.783, + "step": 40034 + }, + { + "epoch": 0.5004625115627891, + "grad_norm": 5.5179009437561035, + "learning_rate": 1.172062781103924e-05, + "loss": 1.5611, + "step": 40036 + }, + { + "epoch": 0.5004875121878047, + "grad_norm": 5.421127796173096, + "learning_rate": 1.1719768130915134e-05, + "loss": 0.9238, + "step": 40038 + }, + { + "epoch": 0.5005125128128203, + "grad_norm": 6.4098429679870605, + "learning_rate": 1.1718908437693517e-05, + "loss": 0.8878, + "step": 40040 + }, + { + "epoch": 0.5005375134378359, + "grad_norm": 5.677743911743164, + "learning_rate": 1.171804873138094e-05, + "loss": 1.0123, + "step": 40042 + }, + { + "epoch": 0.5005625140628516, + "grad_norm": 3.8768997192382812, + "learning_rate": 1.1717189011983941e-05, + "loss": 1.1505, + "step": 40044 + }, + { + "epoch": 0.5005875146878672, + "grad_norm": 2.6171083450317383, + "learning_rate": 1.1716329279509076e-05, + "loss": 0.6901, + "step": 40046 + }, + { + "epoch": 0.5006125153128829, + "grad_norm": 3.165682554244995, + "learning_rate": 1.1715469533962889e-05, + "loss": 1.654, + "step": 40048 + }, + { + "epoch": 0.5006375159378984, + "grad_norm": 8.79563045501709, + "learning_rate": 1.1714609775351931e-05, + "loss": 1.6866, + "step": 40050 + }, + { + "epoch": 0.500662516562914, + "grad_norm": 2.4807822704315186, + "learning_rate": 1.1713750003682746e-05, + "loss": 0.2167, + "step": 40052 + }, + { + "epoch": 0.5006875171879297, + "grad_norm": 0.0011854879558086395, + "learning_rate": 1.1712890218961883e-05, + "loss": 0.2781, + "step": 40054 + }, + { + "epoch": 0.5007125178129453, + "grad_norm": 0.011429781094193459, + "learning_rate": 1.171203042119589e-05, + "loss": 0.5317, + "step": 40056 + }, + { + "epoch": 0.500737518437961, + "grad_norm": 3.264073610305786, + "learning_rate": 1.1711170610391315e-05, + "loss": 0.5697, + "step": 40058 + }, + { + "epoch": 0.5007625190629765, + "grad_norm": 4.452981472015381, + "learning_rate": 1.1710310786554708e-05, + "loss": 1.4663, + "step": 40060 + }, + { + "epoch": 0.5007875196879922, + "grad_norm": 3.076831340789795, + "learning_rate": 1.1709450949692617e-05, + "loss": 0.7731, + "step": 40062 + }, + { + "epoch": 0.5008125203130078, + "grad_norm": 4.320501327514648, + "learning_rate": 1.1708591099811588e-05, + "loss": 1.0418, + "step": 40064 + }, + { + "epoch": 0.5008375209380235, + "grad_norm": 1.7870879173278809, + "learning_rate": 1.1707731236918171e-05, + "loss": 1.0652, + "step": 40066 + }, + { + "epoch": 0.5008625215630391, + "grad_norm": 0.0007932294975034893, + "learning_rate": 1.1706871361018914e-05, + "loss": 0.0101, + "step": 40068 + }, + { + "epoch": 0.5008875221880547, + "grad_norm": 4.361469268798828, + "learning_rate": 1.1706011472120364e-05, + "loss": 0.2085, + "step": 40070 + }, + { + "epoch": 0.5009125228130703, + "grad_norm": 2.3025357723236084, + "learning_rate": 1.1705151570229077e-05, + "loss": 0.9484, + "step": 40072 + }, + { + "epoch": 0.500937523438086, + "grad_norm": 3.518372058868408, + "learning_rate": 1.1704291655351591e-05, + "loss": 0.7379, + "step": 40074 + }, + { + "epoch": 0.5009625240631016, + "grad_norm": 9.2046480178833, + "learning_rate": 1.1703431727494466e-05, + "loss": 1.0759, + "step": 40076 + }, + { + "epoch": 0.5009875246881172, + "grad_norm": 2.118189811706543, + "learning_rate": 1.1702571786664243e-05, + "loss": 0.8355, + "step": 40078 + }, + { + "epoch": 0.5010125253131328, + "grad_norm": 1.8786755800247192, + "learning_rate": 1.1701711832867475e-05, + "loss": 0.6647, + "step": 40080 + }, + { + "epoch": 0.5010375259381484, + "grad_norm": 1.6729073524475098, + "learning_rate": 1.170085186611071e-05, + "loss": 1.1598, + "step": 40082 + }, + { + "epoch": 0.5010625265631641, + "grad_norm": 2.288191080093384, + "learning_rate": 1.1699991886400498e-05, + "loss": 1.0587, + "step": 40084 + }, + { + "epoch": 0.5010875271881797, + "grad_norm": 0.0007177889929153025, + "learning_rate": 1.1699131893743385e-05, + "loss": 1.0969, + "step": 40086 + }, + { + "epoch": 0.5011125278131954, + "grad_norm": 2.173333168029785, + "learning_rate": 1.169827188814593e-05, + "loss": 0.4596, + "step": 40088 + }, + { + "epoch": 0.5011375284382109, + "grad_norm": 3.063323974609375, + "learning_rate": 1.1697411869614672e-05, + "loss": 0.3342, + "step": 40090 + }, + { + "epoch": 0.5011625290632266, + "grad_norm": 3.369910717010498, + "learning_rate": 1.1696551838156166e-05, + "loss": 0.425, + "step": 40092 + }, + { + "epoch": 0.5011875296882422, + "grad_norm": 3.239542007446289, + "learning_rate": 1.169569179377696e-05, + "loss": 1.2435, + "step": 40094 + }, + { + "epoch": 0.5012125303132579, + "grad_norm": 3.0212512016296387, + "learning_rate": 1.1694831736483608e-05, + "loss": 1.6901, + "step": 40096 + }, + { + "epoch": 0.5012375309382735, + "grad_norm": 2.7344253063201904, + "learning_rate": 1.1693971666282653e-05, + "loss": 1.1325, + "step": 40098 + }, + { + "epoch": 0.501262531563289, + "grad_norm": 0.020802108570933342, + "learning_rate": 1.1693111583180651e-05, + "loss": 0.3689, + "step": 40100 + }, + { + "epoch": 0.5012875321883047, + "grad_norm": 1.6909525394439697, + "learning_rate": 1.1692251487184155e-05, + "loss": 0.6205, + "step": 40102 + }, + { + "epoch": 0.5013125328133203, + "grad_norm": 3.536858081817627, + "learning_rate": 1.1691391378299705e-05, + "loss": 0.7164, + "step": 40104 + }, + { + "epoch": 0.501337533438336, + "grad_norm": 2.594607353210449, + "learning_rate": 1.1690531256533861e-05, + "loss": 0.1625, + "step": 40106 + }, + { + "epoch": 0.5013625340633516, + "grad_norm": 4.77531099319458, + "learning_rate": 1.1689671121893169e-05, + "loss": 1.2203, + "step": 40108 + }, + { + "epoch": 0.5013875346883672, + "grad_norm": 3.6735355854034424, + "learning_rate": 1.168881097438418e-05, + "loss": 1.7178, + "step": 40110 + }, + { + "epoch": 0.5014125353133828, + "grad_norm": 0.8855820298194885, + "learning_rate": 1.1687950814013445e-05, + "loss": 1.2613, + "step": 40112 + }, + { + "epoch": 0.5014375359383985, + "grad_norm": 4.269371032714844, + "learning_rate": 1.1687090640787519e-05, + "loss": 1.5147, + "step": 40114 + }, + { + "epoch": 0.5014625365634141, + "grad_norm": 9.801359176635742, + "learning_rate": 1.1686230454712947e-05, + "loss": 1.8697, + "step": 40116 + }, + { + "epoch": 0.5014875371884298, + "grad_norm": 3.079207420349121, + "learning_rate": 1.168537025579628e-05, + "loss": 1.098, + "step": 40118 + }, + { + "epoch": 0.5015125378134453, + "grad_norm": 3.0859146118164062, + "learning_rate": 1.1684510044044072e-05, + "loss": 1.5674, + "step": 40120 + }, + { + "epoch": 0.5015375384384609, + "grad_norm": 9.692927360534668, + "learning_rate": 1.1683649819462877e-05, + "loss": 1.336, + "step": 40122 + }, + { + "epoch": 0.5015625390634766, + "grad_norm": 0.23052851855754852, + "learning_rate": 1.168278958205924e-05, + "loss": 1.1065, + "step": 40124 + }, + { + "epoch": 0.5015875396884922, + "grad_norm": 4.464259147644043, + "learning_rate": 1.1681929331839716e-05, + "loss": 1.1016, + "step": 40126 + }, + { + "epoch": 0.5016125403135079, + "grad_norm": 3.5298843383789062, + "learning_rate": 1.168106906881086e-05, + "loss": 1.3296, + "step": 40128 + }, + { + "epoch": 0.5016375409385234, + "grad_norm": 0.2475314736366272, + "learning_rate": 1.1680208792979215e-05, + "loss": 0.418, + "step": 40130 + }, + { + "epoch": 0.5016625415635391, + "grad_norm": 4.347334384918213, + "learning_rate": 1.1679348504351338e-05, + "loss": 1.0133, + "step": 40132 + }, + { + "epoch": 0.5016875421885547, + "grad_norm": 2.65814208984375, + "learning_rate": 1.1678488202933782e-05, + "loss": 0.7193, + "step": 40134 + }, + { + "epoch": 0.5017125428135704, + "grad_norm": 2.800123691558838, + "learning_rate": 1.1677627888733096e-05, + "loss": 1.079, + "step": 40136 + }, + { + "epoch": 0.501737543438586, + "grad_norm": 2.866206169128418, + "learning_rate": 1.1676767561755833e-05, + "loss": 1.6882, + "step": 40138 + }, + { + "epoch": 0.5017625440636015, + "grad_norm": 5.309467315673828, + "learning_rate": 1.1675907222008547e-05, + "loss": 0.9618, + "step": 40140 + }, + { + "epoch": 0.5017875446886172, + "grad_norm": 6.027647018432617, + "learning_rate": 1.1675046869497787e-05, + "loss": 0.9569, + "step": 40142 + }, + { + "epoch": 0.5018125453136328, + "grad_norm": 3.1539716720581055, + "learning_rate": 1.1674186504230107e-05, + "loss": 0.7876, + "step": 40144 + }, + { + "epoch": 0.5018375459386485, + "grad_norm": 3.922560453414917, + "learning_rate": 1.1673326126212061e-05, + "loss": 1.7345, + "step": 40146 + }, + { + "epoch": 0.5018625465636641, + "grad_norm": 3.293281316757202, + "learning_rate": 1.1672465735450198e-05, + "loss": 1.3848, + "step": 40148 + }, + { + "epoch": 0.5018875471886797, + "grad_norm": 4.378936290740967, + "learning_rate": 1.1671605331951072e-05, + "loss": 1.0136, + "step": 40150 + }, + { + "epoch": 0.5019125478136953, + "grad_norm": 2.47735333442688, + "learning_rate": 1.1670744915721238e-05, + "loss": 0.1666, + "step": 40152 + }, + { + "epoch": 0.501937548438711, + "grad_norm": 2.0973384380340576, + "learning_rate": 1.1669884486767247e-05, + "loss": 0.0835, + "step": 40154 + }, + { + "epoch": 0.5019625490637266, + "grad_norm": 4.3640875816345215, + "learning_rate": 1.1669024045095651e-05, + "loss": 0.4543, + "step": 40156 + }, + { + "epoch": 0.5019875496887423, + "grad_norm": 0.20999827980995178, + "learning_rate": 1.1668163590713005e-05, + "loss": 0.008, + "step": 40158 + }, + { + "epoch": 0.5020125503137578, + "grad_norm": 6.590524673461914, + "learning_rate": 1.1667303123625859e-05, + "loss": 1.8175, + "step": 40160 + }, + { + "epoch": 0.5020375509387734, + "grad_norm": 2.665102958679199, + "learning_rate": 1.1666442643840767e-05, + "loss": 1.3585, + "step": 40162 + }, + { + "epoch": 0.5020625515637891, + "grad_norm": 2.2958033084869385, + "learning_rate": 1.1665582151364288e-05, + "loss": 1.1196, + "step": 40164 + }, + { + "epoch": 0.5020875521888047, + "grad_norm": 3.4310312271118164, + "learning_rate": 1.166472164620297e-05, + "loss": 1.3964, + "step": 40166 + }, + { + "epoch": 0.5021125528138204, + "grad_norm": 0.0006956508732400835, + "learning_rate": 1.1663861128363366e-05, + "loss": 0.0613, + "step": 40168 + }, + { + "epoch": 0.5021375534388359, + "grad_norm": 4.295356273651123, + "learning_rate": 1.166300059785203e-05, + "loss": 0.4862, + "step": 40170 + }, + { + "epoch": 0.5021625540638516, + "grad_norm": 5.0437703132629395, + "learning_rate": 1.1662140054675515e-05, + "loss": 2.113, + "step": 40172 + }, + { + "epoch": 0.5021875546888672, + "grad_norm": 3.4123637676239014, + "learning_rate": 1.1661279498840382e-05, + "loss": 0.938, + "step": 40174 + }, + { + "epoch": 0.5022125553138829, + "grad_norm": 2.6693105697631836, + "learning_rate": 1.1660418930353175e-05, + "loss": 1.3685, + "step": 40176 + }, + { + "epoch": 0.5022375559388985, + "grad_norm": 2.324627637863159, + "learning_rate": 1.1659558349220452e-05, + "loss": 1.0578, + "step": 40178 + }, + { + "epoch": 0.502262556563914, + "grad_norm": 4.634690284729004, + "learning_rate": 1.1658697755448771e-05, + "loss": 1.3358, + "step": 40180 + }, + { + "epoch": 0.5022875571889297, + "grad_norm": 2.853590488433838, + "learning_rate": 1.165783714904468e-05, + "loss": 0.5172, + "step": 40182 + }, + { + "epoch": 0.5023125578139453, + "grad_norm": 4.141264915466309, + "learning_rate": 1.1656976530014732e-05, + "loss": 0.8461, + "step": 40184 + }, + { + "epoch": 0.502337558438961, + "grad_norm": 5.839903831481934, + "learning_rate": 1.1656115898365491e-05, + "loss": 1.6434, + "step": 40186 + }, + { + "epoch": 0.5023625590639766, + "grad_norm": 5.263358116149902, + "learning_rate": 1.1655255254103503e-05, + "loss": 1.26, + "step": 40188 + }, + { + "epoch": 0.5023875596889922, + "grad_norm": 3.2863881587982178, + "learning_rate": 1.1654394597235326e-05, + "loss": 1.7479, + "step": 40190 + }, + { + "epoch": 0.5024125603140078, + "grad_norm": 1.7870155572891235, + "learning_rate": 1.1653533927767515e-05, + "loss": 1.3571, + "step": 40192 + }, + { + "epoch": 0.5024375609390235, + "grad_norm": 0.0005150751676410437, + "learning_rate": 1.165267324570662e-05, + "loss": 0.7641, + "step": 40194 + }, + { + "epoch": 0.5024625615640391, + "grad_norm": 0.41807013750076294, + "learning_rate": 1.1651812551059199e-05, + "loss": 0.9921, + "step": 40196 + }, + { + "epoch": 0.5024875621890548, + "grad_norm": 4.631465911865234, + "learning_rate": 1.1650951843831808e-05, + "loss": 0.7386, + "step": 40198 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 3.8115360736846924, + "learning_rate": 1.1650091124031005e-05, + "loss": 0.682, + "step": 40200 + }, + { + "epoch": 0.502537563439086, + "grad_norm": 5.25970458984375, + "learning_rate": 1.1649230391663338e-05, + "loss": 0.4594, + "step": 40202 + }, + { + "epoch": 0.5025625640641016, + "grad_norm": 3.5037693977355957, + "learning_rate": 1.1648369646735366e-05, + "loss": 1.3032, + "step": 40204 + }, + { + "epoch": 0.5025875646891172, + "grad_norm": 2.2547597885131836, + "learning_rate": 1.1647508889253647e-05, + "loss": 0.2078, + "step": 40206 + }, + { + "epoch": 0.5026125653141329, + "grad_norm": 3.882280111312866, + "learning_rate": 1.1646648119224726e-05, + "loss": 1.5721, + "step": 40208 + }, + { + "epoch": 0.5026375659391484, + "grad_norm": 0.3508424162864685, + "learning_rate": 1.164578733665517e-05, + "loss": 0.647, + "step": 40210 + }, + { + "epoch": 0.5026625665641641, + "grad_norm": 2.903738260269165, + "learning_rate": 1.1644926541551533e-05, + "loss": 0.8054, + "step": 40212 + }, + { + "epoch": 0.5026875671891797, + "grad_norm": 5.413090705871582, + "learning_rate": 1.1644065733920366e-05, + "loss": 0.4122, + "step": 40214 + }, + { + "epoch": 0.5027125678141954, + "grad_norm": 4.49486780166626, + "learning_rate": 1.1643204913768225e-05, + "loss": 1.9638, + "step": 40216 + }, + { + "epoch": 0.502737568439211, + "grad_norm": 3.82199764251709, + "learning_rate": 1.164234408110167e-05, + "loss": 1.579, + "step": 40218 + }, + { + "epoch": 0.5027625690642266, + "grad_norm": 2.8952395915985107, + "learning_rate": 1.1641483235927253e-05, + "loss": 0.9398, + "step": 40220 + }, + { + "epoch": 0.5027875696892422, + "grad_norm": 3.8392550945281982, + "learning_rate": 1.1640622378251535e-05, + "loss": 1.9984, + "step": 40222 + }, + { + "epoch": 0.5028125703142579, + "grad_norm": 5.267747402191162, + "learning_rate": 1.1639761508081065e-05, + "loss": 1.3335, + "step": 40224 + }, + { + "epoch": 0.5028375709392735, + "grad_norm": 4.330230236053467, + "learning_rate": 1.1638900625422407e-05, + "loss": 1.8781, + "step": 40226 + }, + { + "epoch": 0.5028625715642892, + "grad_norm": 0.0007578355725854635, + "learning_rate": 1.163803973028211e-05, + "loss": 0.7447, + "step": 40228 + }, + { + "epoch": 0.5028875721893047, + "grad_norm": 1.5725688934326172, + "learning_rate": 1.1637178822666734e-05, + "loss": 0.1108, + "step": 40230 + }, + { + "epoch": 0.5029125728143203, + "grad_norm": 5.009381294250488, + "learning_rate": 1.163631790258284e-05, + "loss": 1.377, + "step": 40232 + }, + { + "epoch": 0.502937573439336, + "grad_norm": 0.8439466953277588, + "learning_rate": 1.1635456970036975e-05, + "loss": 0.9674, + "step": 40234 + }, + { + "epoch": 0.5029625740643516, + "grad_norm": 3.941911220550537, + "learning_rate": 1.1634596025035703e-05, + "loss": 1.2382, + "step": 40236 + }, + { + "epoch": 0.5029875746893673, + "grad_norm": 4.542349338531494, + "learning_rate": 1.1633735067585581e-05, + "loss": 1.3295, + "step": 40238 + }, + { + "epoch": 0.5030125753143828, + "grad_norm": 0.018791330978274345, + "learning_rate": 1.1632874097693164e-05, + "loss": 0.2089, + "step": 40240 + }, + { + "epoch": 0.5030375759393985, + "grad_norm": 2.976280927658081, + "learning_rate": 1.1632013115365003e-05, + "loss": 1.1038, + "step": 40242 + }, + { + "epoch": 0.5030625765644141, + "grad_norm": 2.682359457015991, + "learning_rate": 1.1631152120607668e-05, + "loss": 1.1193, + "step": 40244 + }, + { + "epoch": 0.5030875771894298, + "grad_norm": 0.0008123747538775206, + "learning_rate": 1.1630291113427703e-05, + "loss": 0.8855, + "step": 40246 + }, + { + "epoch": 0.5031125778144454, + "grad_norm": 3.3599226474761963, + "learning_rate": 1.1629430093831675e-05, + "loss": 0.6806, + "step": 40248 + }, + { + "epoch": 0.5031375784394609, + "grad_norm": 5.101449489593506, + "learning_rate": 1.1628569061826135e-05, + "loss": 1.9236, + "step": 40250 + }, + { + "epoch": 0.5031625790644766, + "grad_norm": 8.963658332824707, + "learning_rate": 1.1627708017417648e-05, + "loss": 1.2223, + "step": 40252 + }, + { + "epoch": 0.5031875796894922, + "grad_norm": 5.888651371002197, + "learning_rate": 1.1626846960612763e-05, + "loss": 1.1455, + "step": 40254 + }, + { + "epoch": 0.5032125803145079, + "grad_norm": 1.8131203651428223, + "learning_rate": 1.1625985891418044e-05, + "loss": 0.9463, + "step": 40256 + }, + { + "epoch": 0.5032375809395235, + "grad_norm": 3.313568353652954, + "learning_rate": 1.1625124809840044e-05, + "loss": 0.9043, + "step": 40258 + }, + { + "epoch": 0.5032625815645391, + "grad_norm": 3.658280372619629, + "learning_rate": 1.1624263715885325e-05, + "loss": 1.7888, + "step": 40260 + }, + { + "epoch": 0.5032875821895547, + "grad_norm": 4.889994144439697, + "learning_rate": 1.1623402609560444e-05, + "loss": 2.1733, + "step": 40262 + }, + { + "epoch": 0.5033125828145704, + "grad_norm": 1.3217772245407104, + "learning_rate": 1.1622541490871957e-05, + "loss": 0.5644, + "step": 40264 + }, + { + "epoch": 0.503337583439586, + "grad_norm": 2.656431198120117, + "learning_rate": 1.1621680359826424e-05, + "loss": 1.1305, + "step": 40266 + }, + { + "epoch": 0.5033625840646017, + "grad_norm": 3.940438985824585, + "learning_rate": 1.1620819216430402e-05, + "loss": 2.0076, + "step": 40268 + }, + { + "epoch": 0.5033875846896172, + "grad_norm": 2.5345559120178223, + "learning_rate": 1.161995806069045e-05, + "loss": 0.3355, + "step": 40270 + }, + { + "epoch": 0.5034125853146328, + "grad_norm": 8.537503242492676, + "learning_rate": 1.1619096892613126e-05, + "loss": 0.6901, + "step": 40272 + }, + { + "epoch": 0.5034375859396485, + "grad_norm": 2.5484402179718018, + "learning_rate": 1.1618235712204991e-05, + "loss": 0.2692, + "step": 40274 + }, + { + "epoch": 0.5034625865646641, + "grad_norm": 5.6239399909973145, + "learning_rate": 1.16173745194726e-05, + "loss": 1.1395, + "step": 40276 + }, + { + "epoch": 0.5034875871896798, + "grad_norm": 1.81412672996521, + "learning_rate": 1.1616513314422515e-05, + "loss": 0.3673, + "step": 40278 + }, + { + "epoch": 0.5035125878146953, + "grad_norm": 3.541182041168213, + "learning_rate": 1.1615652097061292e-05, + "loss": 0.5972, + "step": 40280 + }, + { + "epoch": 0.503537588439711, + "grad_norm": 0.00676394160836935, + "learning_rate": 1.161479086739549e-05, + "loss": 0.5758, + "step": 40282 + }, + { + "epoch": 0.5035625890647266, + "grad_norm": 5.396970272064209, + "learning_rate": 1.1613929625431671e-05, + "loss": 1.1768, + "step": 40284 + }, + { + "epoch": 0.5035875896897423, + "grad_norm": 4.379105567932129, + "learning_rate": 1.161306837117639e-05, + "loss": 0.9438, + "step": 40286 + }, + { + "epoch": 0.5036125903147579, + "grad_norm": 1.9175786972045898, + "learning_rate": 1.161220710463621e-05, + "loss": 0.3837, + "step": 40288 + }, + { + "epoch": 0.5036375909397734, + "grad_norm": 2.801297664642334, + "learning_rate": 1.161134582581769e-05, + "loss": 1.0536, + "step": 40290 + }, + { + "epoch": 0.5036625915647891, + "grad_norm": 4.57533597946167, + "learning_rate": 1.1610484534727386e-05, + "loss": 1.5835, + "step": 40292 + }, + { + "epoch": 0.5036875921898047, + "grad_norm": 2.724452257156372, + "learning_rate": 1.160962323137186e-05, + "loss": 0.6969, + "step": 40294 + }, + { + "epoch": 0.5037125928148204, + "grad_norm": 7.183146953582764, + "learning_rate": 1.1608761915757672e-05, + "loss": 1.2578, + "step": 40296 + }, + { + "epoch": 0.503737593439836, + "grad_norm": 4.417898654937744, + "learning_rate": 1.1607900587891378e-05, + "loss": 0.6936, + "step": 40298 + }, + { + "epoch": 0.5037625940648516, + "grad_norm": 2.2500107288360596, + "learning_rate": 1.1607039247779542e-05, + "loss": 0.5051, + "step": 40300 + }, + { + "epoch": 0.5037875946898672, + "grad_norm": 4.672919750213623, + "learning_rate": 1.1606177895428723e-05, + "loss": 0.1638, + "step": 40302 + }, + { + "epoch": 0.5038125953148829, + "grad_norm": 4.582896709442139, + "learning_rate": 1.1605316530845481e-05, + "loss": 0.9787, + "step": 40304 + }, + { + "epoch": 0.5038375959398985, + "grad_norm": 4.7195000648498535, + "learning_rate": 1.1604455154036374e-05, + "loss": 1.2767, + "step": 40306 + }, + { + "epoch": 0.5038625965649142, + "grad_norm": 4.089399337768555, + "learning_rate": 1.1603593765007965e-05, + "loss": 0.6806, + "step": 40308 + }, + { + "epoch": 0.5038875971899297, + "grad_norm": 8.239082336425781, + "learning_rate": 1.1602732363766811e-05, + "loss": 0.8343, + "step": 40310 + }, + { + "epoch": 0.5039125978149454, + "grad_norm": 4.006728172302246, + "learning_rate": 1.1601870950319473e-05, + "loss": 0.8805, + "step": 40312 + }, + { + "epoch": 0.503937598439961, + "grad_norm": 1.8407951593399048, + "learning_rate": 1.1601009524672513e-05, + "loss": 0.1947, + "step": 40314 + }, + { + "epoch": 0.5039625990649766, + "grad_norm": 2.582456588745117, + "learning_rate": 1.1600148086832494e-05, + "loss": 1.7998, + "step": 40316 + }, + { + "epoch": 0.5039875996899923, + "grad_norm": 4.178628921508789, + "learning_rate": 1.159928663680597e-05, + "loss": 1.3419, + "step": 40318 + }, + { + "epoch": 0.5040126003150078, + "grad_norm": 0.0007400474278256297, + "learning_rate": 1.1598425174599509e-05, + "loss": 1.0756, + "step": 40320 + }, + { + "epoch": 0.5040376009400235, + "grad_norm": 5.559337615966797, + "learning_rate": 1.1597563700219663e-05, + "loss": 0.1264, + "step": 40322 + }, + { + "epoch": 0.5040626015650391, + "grad_norm": 2.192885398864746, + "learning_rate": 1.1596702213673e-05, + "loss": 1.1072, + "step": 40324 + }, + { + "epoch": 0.5040876021900548, + "grad_norm": 3.9776318073272705, + "learning_rate": 1.1595840714966078e-05, + "loss": 1.4675, + "step": 40326 + }, + { + "epoch": 0.5041126028150704, + "grad_norm": 5.2568254470825195, + "learning_rate": 1.159497920410546e-05, + "loss": 1.481, + "step": 40328 + }, + { + "epoch": 0.504137603440086, + "grad_norm": 3.875844717025757, + "learning_rate": 1.1594117681097707e-05, + "loss": 1.5017, + "step": 40330 + }, + { + "epoch": 0.5041626040651016, + "grad_norm": 3.000230550765991, + "learning_rate": 1.1593256145949377e-05, + "loss": 0.7218, + "step": 40332 + }, + { + "epoch": 0.5041876046901173, + "grad_norm": 1.0768991708755493, + "learning_rate": 1.1592394598667034e-05, + "loss": 0.0337, + "step": 40334 + }, + { + "epoch": 0.5042126053151329, + "grad_norm": 1.8438817262649536, + "learning_rate": 1.1591533039257239e-05, + "loss": 0.2264, + "step": 40336 + }, + { + "epoch": 0.5042376059401485, + "grad_norm": 0.0004591583274304867, + "learning_rate": 1.1590671467726553e-05, + "loss": 0.1549, + "step": 40338 + }, + { + "epoch": 0.5042626065651641, + "grad_norm": 4.708000183105469, + "learning_rate": 1.1589809884081538e-05, + "loss": 0.8037, + "step": 40340 + }, + { + "epoch": 0.5042876071901797, + "grad_norm": 0.12962868809700012, + "learning_rate": 1.1588948288328758e-05, + "loss": 1.032, + "step": 40342 + }, + { + "epoch": 0.5043126078151954, + "grad_norm": 0.014161741361021996, + "learning_rate": 1.158808668047477e-05, + "loss": 0.2663, + "step": 40344 + }, + { + "epoch": 0.504337608440211, + "grad_norm": 3.827364444732666, + "learning_rate": 1.158722506052614e-05, + "loss": 0.931, + "step": 40346 + }, + { + "epoch": 0.5043626090652267, + "grad_norm": 3.6288864612579346, + "learning_rate": 1.1586363428489425e-05, + "loss": 1.4159, + "step": 40348 + }, + { + "epoch": 0.5043876096902422, + "grad_norm": 0.0006949867238290608, + "learning_rate": 1.1585501784371194e-05, + "loss": 0.3149, + "step": 40350 + }, + { + "epoch": 0.5044126103152579, + "grad_norm": 5.9185967445373535, + "learning_rate": 1.1584640128178005e-05, + "loss": 2.5366, + "step": 40352 + }, + { + "epoch": 0.5044376109402735, + "grad_norm": 1.1603304147720337, + "learning_rate": 1.158377845991642e-05, + "loss": 0.6636, + "step": 40354 + }, + { + "epoch": 0.5044626115652892, + "grad_norm": 0.04143368452787399, + "learning_rate": 1.1582916779593003e-05, + "loss": 0.3236, + "step": 40356 + }, + { + "epoch": 0.5044876121903048, + "grad_norm": 1.7866913080215454, + "learning_rate": 1.1582055087214316e-05, + "loss": 0.0135, + "step": 40358 + }, + { + "epoch": 0.5045126128153203, + "grad_norm": 3.742621660232544, + "learning_rate": 1.1581193382786919e-05, + "loss": 1.1007, + "step": 40360 + }, + { + "epoch": 0.504537613440336, + "grad_norm": 1.7789359092712402, + "learning_rate": 1.158033166631738e-05, + "loss": 1.0655, + "step": 40362 + }, + { + "epoch": 0.5045626140653516, + "grad_norm": 3.8261616230010986, + "learning_rate": 1.1579469937812255e-05, + "loss": 0.5439, + "step": 40364 + }, + { + "epoch": 0.5045876146903673, + "grad_norm": 3.1179957389831543, + "learning_rate": 1.1578608197278112e-05, + "loss": 0.3169, + "step": 40366 + }, + { + "epoch": 0.5046126153153829, + "grad_norm": 2.7582685947418213, + "learning_rate": 1.1577746444721513e-05, + "loss": 0.1995, + "step": 40368 + }, + { + "epoch": 0.5046376159403985, + "grad_norm": 6.761320114135742, + "learning_rate": 1.157688468014902e-05, + "loss": 1.0197, + "step": 40370 + }, + { + "epoch": 0.5046626165654141, + "grad_norm": 5.136983394622803, + "learning_rate": 1.1576022903567197e-05, + "loss": 0.8455, + "step": 40372 + }, + { + "epoch": 0.5046876171904298, + "grad_norm": 4.23921012878418, + "learning_rate": 1.1575161114982602e-05, + "loss": 0.7847, + "step": 40374 + }, + { + "epoch": 0.5047126178154454, + "grad_norm": 0.0007787721115164459, + "learning_rate": 1.1574299314401807e-05, + "loss": 0.006, + "step": 40376 + }, + { + "epoch": 0.5047376184404611, + "grad_norm": 3.598792791366577, + "learning_rate": 1.1573437501831371e-05, + "loss": 1.0881, + "step": 40378 + }, + { + "epoch": 0.5047626190654766, + "grad_norm": 3.192643642425537, + "learning_rate": 1.1572575677277854e-05, + "loss": 0.8856, + "step": 40380 + }, + { + "epoch": 0.5047876196904922, + "grad_norm": 0.9162582755088806, + "learning_rate": 1.157171384074783e-05, + "loss": 0.0037, + "step": 40382 + }, + { + "epoch": 0.5048126203155079, + "grad_norm": 4.590310573577881, + "learning_rate": 1.1570851992247851e-05, + "loss": 0.9748, + "step": 40384 + }, + { + "epoch": 0.5048376209405235, + "grad_norm": 4.975644588470459, + "learning_rate": 1.1569990131784481e-05, + "loss": 1.2569, + "step": 40386 + }, + { + "epoch": 0.5048626215655392, + "grad_norm": 4.348571300506592, + "learning_rate": 1.1569128259364295e-05, + "loss": 1.458, + "step": 40388 + }, + { + "epoch": 0.5048876221905547, + "grad_norm": 0.6138961315155029, + "learning_rate": 1.1568266374993848e-05, + "loss": 0.029, + "step": 40390 + }, + { + "epoch": 0.5049126228155704, + "grad_norm": 2.808015823364258, + "learning_rate": 1.1567404478679704e-05, + "loss": 0.8739, + "step": 40392 + }, + { + "epoch": 0.504937623440586, + "grad_norm": 4.0880126953125, + "learning_rate": 1.1566542570428435e-05, + "loss": 0.7806, + "step": 40394 + }, + { + "epoch": 0.5049626240656017, + "grad_norm": 1.5405058860778809, + "learning_rate": 1.1565680650246594e-05, + "loss": 0.514, + "step": 40396 + }, + { + "epoch": 0.5049876246906173, + "grad_norm": 1.9012194871902466, + "learning_rate": 1.1564818718140749e-05, + "loss": 0.2405, + "step": 40398 + }, + { + "epoch": 0.5050126253156328, + "grad_norm": 0.02184348553419113, + "learning_rate": 1.1563956774117469e-05, + "loss": 0.8044, + "step": 40400 + }, + { + "epoch": 0.5050376259406485, + "grad_norm": 3.3525028228759766, + "learning_rate": 1.1563094818183318e-05, + "loss": 0.9632, + "step": 40402 + }, + { + "epoch": 0.5050626265656641, + "grad_norm": 1.4898271560668945, + "learning_rate": 1.1562232850344855e-05, + "loss": 0.4395, + "step": 40404 + }, + { + "epoch": 0.5050876271906798, + "grad_norm": 2.3187472820281982, + "learning_rate": 1.1561370870608646e-05, + "loss": 0.7011, + "step": 40406 + }, + { + "epoch": 0.5051126278156954, + "grad_norm": 5.948641777038574, + "learning_rate": 1.156050887898126e-05, + "loss": 1.9931, + "step": 40408 + }, + { + "epoch": 0.505137628440711, + "grad_norm": 0.5975732803344727, + "learning_rate": 1.1559646875469257e-05, + "loss": 0.6161, + "step": 40410 + }, + { + "epoch": 0.5051626290657266, + "grad_norm": 2.681165933609009, + "learning_rate": 1.1558784860079204e-05, + "loss": 1.4085, + "step": 40412 + }, + { + "epoch": 0.5051876296907423, + "grad_norm": 3.57464599609375, + "learning_rate": 1.155792283281767e-05, + "loss": 1.1876, + "step": 40414 + }, + { + "epoch": 0.5052126303157579, + "grad_norm": 6.312716007232666, + "learning_rate": 1.155706079369121e-05, + "loss": 1.1982, + "step": 40416 + }, + { + "epoch": 0.5052376309407736, + "grad_norm": 4.594329357147217, + "learning_rate": 1.1556198742706399e-05, + "loss": 0.6597, + "step": 40418 + }, + { + "epoch": 0.5052626315657891, + "grad_norm": 2.1451146602630615, + "learning_rate": 1.15553366798698e-05, + "loss": 0.9483, + "step": 40420 + }, + { + "epoch": 0.5052876321908047, + "grad_norm": 0.024916118010878563, + "learning_rate": 1.1554474605187971e-05, + "loss": 0.2222, + "step": 40422 + }, + { + "epoch": 0.5053126328158204, + "grad_norm": 0.4666590094566345, + "learning_rate": 1.1553612518667485e-05, + "loss": 0.8538, + "step": 40424 + }, + { + "epoch": 0.505337633440836, + "grad_norm": 2.119318962097168, + "learning_rate": 1.1552750420314908e-05, + "loss": 0.3396, + "step": 40426 + }, + { + "epoch": 0.5053626340658517, + "grad_norm": 3.4902069568634033, + "learning_rate": 1.1551888310136805e-05, + "loss": 1.9588, + "step": 40428 + }, + { + "epoch": 0.5053876346908672, + "grad_norm": 4.222763538360596, + "learning_rate": 1.1551026188139737e-05, + "loss": 1.1623, + "step": 40430 + }, + { + "epoch": 0.5054126353158829, + "grad_norm": 0.37757739424705505, + "learning_rate": 1.1550164054330274e-05, + "loss": 0.7444, + "step": 40432 + }, + { + "epoch": 0.5054376359408985, + "grad_norm": 4.364377498626709, + "learning_rate": 1.154930190871498e-05, + "loss": 1.6533, + "step": 40434 + }, + { + "epoch": 0.5054626365659142, + "grad_norm": 2.784700870513916, + "learning_rate": 1.1548439751300419e-05, + "loss": 1.4136, + "step": 40436 + }, + { + "epoch": 0.5054876371909298, + "grad_norm": 3.528094530105591, + "learning_rate": 1.1547577582093164e-05, + "loss": 1.3828, + "step": 40438 + }, + { + "epoch": 0.5055126378159454, + "grad_norm": 0.678118884563446, + "learning_rate": 1.1546715401099777e-05, + "loss": 0.6614, + "step": 40440 + }, + { + "epoch": 0.505537638440961, + "grad_norm": 3.2198894023895264, + "learning_rate": 1.1545853208326822e-05, + "loss": 0.6133, + "step": 40442 + }, + { + "epoch": 0.5055626390659766, + "grad_norm": 3.6771864891052246, + "learning_rate": 1.154499100378087e-05, + "loss": 0.751, + "step": 40444 + }, + { + "epoch": 0.5055876396909923, + "grad_norm": 3.0515902042388916, + "learning_rate": 1.1544128787468484e-05, + "loss": 0.399, + "step": 40446 + }, + { + "epoch": 0.5056126403160079, + "grad_norm": 3.0685904026031494, + "learning_rate": 1.154326655939623e-05, + "loss": 0.2319, + "step": 40448 + }, + { + "epoch": 0.5056376409410235, + "grad_norm": 3.6066079139709473, + "learning_rate": 1.1542404319570676e-05, + "loss": 1.8938, + "step": 40450 + }, + { + "epoch": 0.5056626415660391, + "grad_norm": 2.730367660522461, + "learning_rate": 1.1541542067998393e-05, + "loss": 0.4097, + "step": 40452 + }, + { + "epoch": 0.5056876421910548, + "grad_norm": 1.9386961460113525, + "learning_rate": 1.1540679804685942e-05, + "loss": 0.8234, + "step": 40454 + }, + { + "epoch": 0.5057126428160704, + "grad_norm": 0.7713955640792847, + "learning_rate": 1.1539817529639889e-05, + "loss": 0.751, + "step": 40456 + }, + { + "epoch": 0.5057376434410861, + "grad_norm": 5.053066730499268, + "learning_rate": 1.1538955242866805e-05, + "loss": 0.9539, + "step": 40458 + }, + { + "epoch": 0.5057626440661016, + "grad_norm": 5.697830677032471, + "learning_rate": 1.1538092944373254e-05, + "loss": 2.0373, + "step": 40460 + }, + { + "epoch": 0.5057876446911173, + "grad_norm": 10.866661071777344, + "learning_rate": 1.1537230634165806e-05, + "loss": 0.3703, + "step": 40462 + }, + { + "epoch": 0.5058126453161329, + "grad_norm": 3.837636947631836, + "learning_rate": 1.1536368312251025e-05, + "loss": 0.9153, + "step": 40464 + }, + { + "epoch": 0.5058376459411486, + "grad_norm": 5.729399681091309, + "learning_rate": 1.1535505978635484e-05, + "loss": 0.7546, + "step": 40466 + }, + { + "epoch": 0.5058626465661642, + "grad_norm": 4.486293315887451, + "learning_rate": 1.1534643633325744e-05, + "loss": 0.3808, + "step": 40468 + }, + { + "epoch": 0.5058876471911797, + "grad_norm": 3.9396588802337646, + "learning_rate": 1.1533781276328378e-05, + "loss": 0.8398, + "step": 40470 + }, + { + "epoch": 0.5059126478161954, + "grad_norm": 4.177111625671387, + "learning_rate": 1.1532918907649946e-05, + "loss": 2.2439, + "step": 40472 + }, + { + "epoch": 0.505937648441211, + "grad_norm": 5.077319145202637, + "learning_rate": 1.1532056527297023e-05, + "loss": 2.217, + "step": 40474 + }, + { + "epoch": 0.5059626490662267, + "grad_norm": 4.10941219329834, + "learning_rate": 1.1531194135276174e-05, + "loss": 1.5023, + "step": 40476 + }, + { + "epoch": 0.5059876496912423, + "grad_norm": 0.000728414102923125, + "learning_rate": 1.1530331731593964e-05, + "loss": 0.5302, + "step": 40478 + }, + { + "epoch": 0.5060126503162579, + "grad_norm": 3.3554306030273438, + "learning_rate": 1.152946931625697e-05, + "loss": 0.0986, + "step": 40480 + }, + { + "epoch": 0.5060376509412735, + "grad_norm": 5.146566390991211, + "learning_rate": 1.152860688927175e-05, + "loss": 1.5248, + "step": 40482 + }, + { + "epoch": 0.5060626515662892, + "grad_norm": 0.022875608876347542, + "learning_rate": 1.1527744450644877e-05, + "loss": 0.0251, + "step": 40484 + }, + { + "epoch": 0.5060876521913048, + "grad_norm": 4.607710838317871, + "learning_rate": 1.1526882000382917e-05, + "loss": 1.3242, + "step": 40486 + }, + { + "epoch": 0.5061126528163205, + "grad_norm": 5.909493446350098, + "learning_rate": 1.1526019538492441e-05, + "loss": 0.9312, + "step": 40488 + }, + { + "epoch": 0.506137653441336, + "grad_norm": 8.144364356994629, + "learning_rate": 1.1525157064980014e-05, + "loss": 2.7581, + "step": 40490 + }, + { + "epoch": 0.5061626540663516, + "grad_norm": 1.9404470920562744, + "learning_rate": 1.152429457985221e-05, + "loss": 0.0718, + "step": 40492 + }, + { + "epoch": 0.5061876546913673, + "grad_norm": 2.1285274028778076, + "learning_rate": 1.1523432083115594e-05, + "loss": 1.1164, + "step": 40494 + }, + { + "epoch": 0.5062126553163829, + "grad_norm": 5.09286642074585, + "learning_rate": 1.1522569574776731e-05, + "loss": 1.5198, + "step": 40496 + }, + { + "epoch": 0.5062376559413986, + "grad_norm": 4.442248344421387, + "learning_rate": 1.1521707054842194e-05, + "loss": 1.2077, + "step": 40498 + }, + { + "epoch": 0.5062626565664141, + "grad_norm": 5.1551713943481445, + "learning_rate": 1.1520844523318554e-05, + "loss": 0.9689, + "step": 40500 + }, + { + "epoch": 0.5062876571914298, + "grad_norm": 4.6941304206848145, + "learning_rate": 1.1519981980212376e-05, + "loss": 0.6619, + "step": 40502 + }, + { + "epoch": 0.5063126578164454, + "grad_norm": 0.0024631626438349485, + "learning_rate": 1.151911942553023e-05, + "loss": 0.4829, + "step": 40504 + }, + { + "epoch": 0.5063376584414611, + "grad_norm": 8.379179954528809, + "learning_rate": 1.1518256859278686e-05, + "loss": 1.8609, + "step": 40506 + }, + { + "epoch": 0.5063626590664767, + "grad_norm": 2.340282678604126, + "learning_rate": 1.1517394281464311e-05, + "loss": 1.0195, + "step": 40508 + }, + { + "epoch": 0.5063876596914922, + "grad_norm": 4.05522346496582, + "learning_rate": 1.1516531692093678e-05, + "loss": 1.1964, + "step": 40510 + }, + { + "epoch": 0.5064126603165079, + "grad_norm": 3.103849411010742, + "learning_rate": 1.1515669091173352e-05, + "loss": 0.9689, + "step": 40512 + }, + { + "epoch": 0.5064376609415235, + "grad_norm": 11.928998947143555, + "learning_rate": 1.1514806478709903e-05, + "loss": 2.8412, + "step": 40514 + }, + { + "epoch": 0.5064626615665392, + "grad_norm": 3.9155642986297607, + "learning_rate": 1.1513943854709906e-05, + "loss": 0.2271, + "step": 40516 + }, + { + "epoch": 0.5064876621915548, + "grad_norm": 0.0005210040835663676, + "learning_rate": 1.1513081219179924e-05, + "loss": 0.0, + "step": 40518 + }, + { + "epoch": 0.5065126628165704, + "grad_norm": 7.049016952514648, + "learning_rate": 1.1512218572126533e-05, + "loss": 2.9249, + "step": 40520 + }, + { + "epoch": 0.506537663441586, + "grad_norm": 0.026106607168912888, + "learning_rate": 1.1511355913556294e-05, + "loss": 0.0003, + "step": 40522 + }, + { + "epoch": 0.5065626640666017, + "grad_norm": 0.3539479970932007, + "learning_rate": 1.1510493243475784e-05, + "loss": 0.0487, + "step": 40524 + }, + { + "epoch": 0.5065876646916173, + "grad_norm": 0.32065728306770325, + "learning_rate": 1.1509630561891572e-05, + "loss": 1.1803, + "step": 40526 + }, + { + "epoch": 0.506612665316633, + "grad_norm": 2.776670455932617, + "learning_rate": 1.1508767868810227e-05, + "loss": 0.4042, + "step": 40528 + }, + { + "epoch": 0.5066376659416485, + "grad_norm": 2.2250618934631348, + "learning_rate": 1.1507905164238317e-05, + "loss": 1.2083, + "step": 40530 + }, + { + "epoch": 0.5066626665666641, + "grad_norm": 5.094323635101318, + "learning_rate": 1.1507042448182419e-05, + "loss": 1.6502, + "step": 40532 + }, + { + "epoch": 0.5066876671916798, + "grad_norm": 0.019873034209012985, + "learning_rate": 1.1506179720649095e-05, + "loss": 0.013, + "step": 40534 + }, + { + "epoch": 0.5067126678166954, + "grad_norm": 2.036837339401245, + "learning_rate": 1.1505316981644921e-05, + "loss": 1.4224, + "step": 40536 + }, + { + "epoch": 0.5067376684417111, + "grad_norm": 3.8485445976257324, + "learning_rate": 1.1504454231176464e-05, + "loss": 0.6049, + "step": 40538 + }, + { + "epoch": 0.5067626690667266, + "grad_norm": 4.48690128326416, + "learning_rate": 1.15035914692503e-05, + "loss": 2.0266, + "step": 40540 + }, + { + "epoch": 0.5067876696917423, + "grad_norm": 3.6548349857330322, + "learning_rate": 1.1502728695872993e-05, + "loss": 0.4045, + "step": 40542 + }, + { + "epoch": 0.5068126703167579, + "grad_norm": 5.848970413208008, + "learning_rate": 1.1501865911051118e-05, + "loss": 1.5932, + "step": 40544 + }, + { + "epoch": 0.5068376709417736, + "grad_norm": 3.158252000808716, + "learning_rate": 1.1501003114791245e-05, + "loss": 0.2469, + "step": 40546 + }, + { + "epoch": 0.5068626715667892, + "grad_norm": 0.0008876414503902197, + "learning_rate": 1.1500140307099943e-05, + "loss": 0.7977, + "step": 40548 + }, + { + "epoch": 0.5068876721918048, + "grad_norm": 0.0008610238437540829, + "learning_rate": 1.1499277487983784e-05, + "loss": 0.0, + "step": 40550 + }, + { + "epoch": 0.5069126728168204, + "grad_norm": 5.3483476638793945, + "learning_rate": 1.1498414657449342e-05, + "loss": 0.7059, + "step": 40552 + }, + { + "epoch": 0.506937673441836, + "grad_norm": 0.000820808345451951, + "learning_rate": 1.1497551815503183e-05, + "loss": 0.0247, + "step": 40554 + }, + { + "epoch": 0.5069626740668517, + "grad_norm": 4.526083946228027, + "learning_rate": 1.1496688962151882e-05, + "loss": 1.7896, + "step": 40556 + }, + { + "epoch": 0.5069876746918673, + "grad_norm": 1.082962155342102, + "learning_rate": 1.1495826097402011e-05, + "loss": 0.0413, + "step": 40558 + }, + { + "epoch": 0.5070126753168829, + "grad_norm": 1.3531153202056885, + "learning_rate": 1.149496322126014e-05, + "loss": 0.0601, + "step": 40560 + }, + { + "epoch": 0.5070376759418985, + "grad_norm": 5.03355073928833, + "learning_rate": 1.149410033373284e-05, + "loss": 1.7371, + "step": 40562 + }, + { + "epoch": 0.5070626765669142, + "grad_norm": 4.739262104034424, + "learning_rate": 1.1493237434826683e-05, + "loss": 0.8654, + "step": 40564 + }, + { + "epoch": 0.5070876771919298, + "grad_norm": 0.0007263842271640897, + "learning_rate": 1.149237452454824e-05, + "loss": 0.4081, + "step": 40566 + }, + { + "epoch": 0.5071126778169455, + "grad_norm": 0.0006864850292913616, + "learning_rate": 1.1491511602904084e-05, + "loss": 1.1126, + "step": 40568 + }, + { + "epoch": 0.507137678441961, + "grad_norm": 3.488783359527588, + "learning_rate": 1.1490648669900786e-05, + "loss": 1.2473, + "step": 40570 + }, + { + "epoch": 0.5071626790669767, + "grad_norm": 3.1438865661621094, + "learning_rate": 1.1489785725544921e-05, + "loss": 1.0183, + "step": 40572 + }, + { + "epoch": 0.5071876796919923, + "grad_norm": 4.1284050941467285, + "learning_rate": 1.1488922769843058e-05, + "loss": 1.3717, + "step": 40574 + }, + { + "epoch": 0.507212680317008, + "grad_norm": 2.207489252090454, + "learning_rate": 1.1488059802801765e-05, + "loss": 0.8744, + "step": 40576 + }, + { + "epoch": 0.5072376809420236, + "grad_norm": 2.961245059967041, + "learning_rate": 1.1487196824427627e-05, + "loss": 0.9138, + "step": 40578 + }, + { + "epoch": 0.5072626815670391, + "grad_norm": 2.6674208641052246, + "learning_rate": 1.14863338347272e-05, + "loss": 1.5443, + "step": 40580 + }, + { + "epoch": 0.5072876821920548, + "grad_norm": 3.3428637981414795, + "learning_rate": 1.1485470833707071e-05, + "loss": 1.4984, + "step": 40582 + }, + { + "epoch": 0.5073126828170704, + "grad_norm": 5.081838130950928, + "learning_rate": 1.1484607821373806e-05, + "loss": 1.1032, + "step": 40584 + }, + { + "epoch": 0.5073376834420861, + "grad_norm": 3.215604782104492, + "learning_rate": 1.1483744797733975e-05, + "loss": 0.8997, + "step": 40586 + }, + { + "epoch": 0.5073626840671017, + "grad_norm": 2.0096378326416016, + "learning_rate": 1.1482881762794151e-05, + "loss": 0.3701, + "step": 40588 + }, + { + "epoch": 0.5073876846921173, + "grad_norm": 1.1944302320480347, + "learning_rate": 1.1482018716560914e-05, + "loss": 0.3991, + "step": 40590 + }, + { + "epoch": 0.5074126853171329, + "grad_norm": 3.6465001106262207, + "learning_rate": 1.148115565904083e-05, + "loss": 1.796, + "step": 40592 + }, + { + "epoch": 0.5074376859421486, + "grad_norm": 2.1166317462921143, + "learning_rate": 1.1480292590240476e-05, + "loss": 0.5804, + "step": 40594 + }, + { + "epoch": 0.5074626865671642, + "grad_norm": 8.290165901184082, + "learning_rate": 1.147942951016642e-05, + "loss": 1.2966, + "step": 40596 + }, + { + "epoch": 0.5074876871921798, + "grad_norm": 0.004010336473584175, + "learning_rate": 1.1478566418825242e-05, + "loss": 0.3864, + "step": 40598 + }, + { + "epoch": 0.5075126878171954, + "grad_norm": 7.056210041046143, + "learning_rate": 1.1477703316223505e-05, + "loss": 1.2023, + "step": 40600 + }, + { + "epoch": 0.507537688442211, + "grad_norm": 2.9177637100219727, + "learning_rate": 1.1476840202367792e-05, + "loss": 0.7861, + "step": 40602 + }, + { + "epoch": 0.5075626890672267, + "grad_norm": 5.490476608276367, + "learning_rate": 1.1475977077264675e-05, + "loss": 0.7832, + "step": 40604 + }, + { + "epoch": 0.5075876896922423, + "grad_norm": 2.968301296234131, + "learning_rate": 1.1475113940920724e-05, + "loss": 0.3464, + "step": 40606 + }, + { + "epoch": 0.507612690317258, + "grad_norm": 2.780792713165283, + "learning_rate": 1.1474250793342511e-05, + "loss": 1.7716, + "step": 40608 + }, + { + "epoch": 0.5076376909422735, + "grad_norm": 3.0442135334014893, + "learning_rate": 1.1473387634536616e-05, + "loss": 0.2378, + "step": 40610 + }, + { + "epoch": 0.5076626915672892, + "grad_norm": 0.005412413273006678, + "learning_rate": 1.1472524464509607e-05, + "loss": 0.3034, + "step": 40612 + }, + { + "epoch": 0.5076876921923048, + "grad_norm": 9.858197212219238, + "learning_rate": 1.147166128326806e-05, + "loss": 1.691, + "step": 40614 + }, + { + "epoch": 0.5077126928173205, + "grad_norm": 5.015024662017822, + "learning_rate": 1.147079809081855e-05, + "loss": 0.94, + "step": 40616 + }, + { + "epoch": 0.5077376934423361, + "grad_norm": 4.6524858474731445, + "learning_rate": 1.1469934887167652e-05, + "loss": 1.8502, + "step": 40618 + }, + { + "epoch": 0.5077626940673516, + "grad_norm": 0.13739728927612305, + "learning_rate": 1.1469071672321932e-05, + "loss": 0.9715, + "step": 40620 + }, + { + "epoch": 0.5077876946923673, + "grad_norm": 4.146824836730957, + "learning_rate": 1.1468208446287974e-05, + "loss": 1.2465, + "step": 40622 + }, + { + "epoch": 0.5078126953173829, + "grad_norm": 0.0009966916404664516, + "learning_rate": 1.1467345209072347e-05, + "loss": 0.3976, + "step": 40624 + }, + { + "epoch": 0.5078376959423986, + "grad_norm": 4.183228492736816, + "learning_rate": 1.1466481960681625e-05, + "loss": 0.9903, + "step": 40626 + }, + { + "epoch": 0.5078626965674142, + "grad_norm": 0.8834559321403503, + "learning_rate": 1.1465618701122384e-05, + "loss": 0.7202, + "step": 40628 + }, + { + "epoch": 0.5078876971924298, + "grad_norm": 6.219298362731934, + "learning_rate": 1.1464755430401202e-05, + "loss": 0.5958, + "step": 40630 + }, + { + "epoch": 0.5079126978174454, + "grad_norm": 2.245065450668335, + "learning_rate": 1.1463892148524647e-05, + "loss": 1.1471, + "step": 40632 + }, + { + "epoch": 0.5079376984424611, + "grad_norm": 7.406485557556152, + "learning_rate": 1.1463028855499294e-05, + "loss": 0.9268, + "step": 40634 + }, + { + "epoch": 0.5079626990674767, + "grad_norm": 4.467386245727539, + "learning_rate": 1.1462165551331725e-05, + "loss": 1.5582, + "step": 40636 + }, + { + "epoch": 0.5079876996924924, + "grad_norm": 0.07237239927053452, + "learning_rate": 1.1461302236028506e-05, + "loss": 0.0014, + "step": 40638 + }, + { + "epoch": 0.5080127003175079, + "grad_norm": 3.78849720954895, + "learning_rate": 1.1460438909596217e-05, + "loss": 0.6426, + "step": 40640 + }, + { + "epoch": 0.5080377009425235, + "grad_norm": 0.0007763655157759786, + "learning_rate": 1.1459575572041435e-05, + "loss": 0.6794, + "step": 40642 + }, + { + "epoch": 0.5080627015675392, + "grad_norm": 5.248575687408447, + "learning_rate": 1.1458712223370727e-05, + "loss": 1.026, + "step": 40644 + }, + { + "epoch": 0.5080877021925548, + "grad_norm": 3.777945041656494, + "learning_rate": 1.1457848863590675e-05, + "loss": 0.701, + "step": 40646 + }, + { + "epoch": 0.5081127028175705, + "grad_norm": 12.417680740356445, + "learning_rate": 1.1456985492707852e-05, + "loss": 2.6102, + "step": 40648 + }, + { + "epoch": 0.508137703442586, + "grad_norm": 0.4220917522907257, + "learning_rate": 1.1456122110728833e-05, + "loss": 0.011, + "step": 40650 + }, + { + "epoch": 0.5081627040676017, + "grad_norm": 0.011590133421123028, + "learning_rate": 1.1455258717660193e-05, + "loss": 0.2458, + "step": 40652 + }, + { + "epoch": 0.5081877046926173, + "grad_norm": 4.218973636627197, + "learning_rate": 1.1454395313508511e-05, + "loss": 1.6131, + "step": 40654 + }, + { + "epoch": 0.508212705317633, + "grad_norm": 10.032363891601562, + "learning_rate": 1.1453531898280361e-05, + "loss": 1.2232, + "step": 40656 + }, + { + "epoch": 0.5082377059426486, + "grad_norm": 4.7270612716674805, + "learning_rate": 1.1452668471982315e-05, + "loss": 0.9542, + "step": 40658 + }, + { + "epoch": 0.5082627065676641, + "grad_norm": 3.1356828212738037, + "learning_rate": 1.145180503462095e-05, + "loss": 1.0966, + "step": 40660 + }, + { + "epoch": 0.5082877071926798, + "grad_norm": 7.149028778076172, + "learning_rate": 1.1450941586202846e-05, + "loss": 0.9605, + "step": 40662 + }, + { + "epoch": 0.5083127078176954, + "grad_norm": 4.344152927398682, + "learning_rate": 1.1450078126734576e-05, + "loss": 0.545, + "step": 40664 + }, + { + "epoch": 0.5083377084427111, + "grad_norm": 2.779947280883789, + "learning_rate": 1.1449214656222713e-05, + "loss": 1.5441, + "step": 40666 + }, + { + "epoch": 0.5083627090677267, + "grad_norm": 3.972785472869873, + "learning_rate": 1.1448351174673841e-05, + "loss": 0.6789, + "step": 40668 + }, + { + "epoch": 0.5083877096927423, + "grad_norm": 4.381388187408447, + "learning_rate": 1.1447487682094529e-05, + "loss": 0.6167, + "step": 40670 + }, + { + "epoch": 0.5084127103177579, + "grad_norm": 3.2690303325653076, + "learning_rate": 1.1446624178491354e-05, + "loss": 0.8466, + "step": 40672 + }, + { + "epoch": 0.5084377109427736, + "grad_norm": 3.281034469604492, + "learning_rate": 1.1445760663870895e-05, + "loss": 1.1674, + "step": 40674 + }, + { + "epoch": 0.5084627115677892, + "grad_norm": 0.02870633453130722, + "learning_rate": 1.1444897138239727e-05, + "loss": 0.195, + "step": 40676 + }, + { + "epoch": 0.5084877121928049, + "grad_norm": 4.609160423278809, + "learning_rate": 1.1444033601604427e-05, + "loss": 1.6364, + "step": 40678 + }, + { + "epoch": 0.5085127128178204, + "grad_norm": 0.05042817443609238, + "learning_rate": 1.1443170053971571e-05, + "loss": 0.5957, + "step": 40680 + }, + { + "epoch": 0.508537713442836, + "grad_norm": 3.690239191055298, + "learning_rate": 1.1442306495347738e-05, + "loss": 1.6037, + "step": 40682 + }, + { + "epoch": 0.5085627140678517, + "grad_norm": 1.943961262702942, + "learning_rate": 1.14414429257395e-05, + "loss": 0.3721, + "step": 40684 + }, + { + "epoch": 0.5085877146928673, + "grad_norm": 0.7072872519493103, + "learning_rate": 1.1440579345153437e-05, + "loss": 0.31, + "step": 40686 + }, + { + "epoch": 0.508612715317883, + "grad_norm": 2.813295602798462, + "learning_rate": 1.1439715753596125e-05, + "loss": 0.6839, + "step": 40688 + }, + { + "epoch": 0.5086377159428985, + "grad_norm": 0.0007204762659966946, + "learning_rate": 1.1438852151074142e-05, + "loss": 0.4555, + "step": 40690 + }, + { + "epoch": 0.5086627165679142, + "grad_norm": 5.877297878265381, + "learning_rate": 1.1437988537594065e-05, + "loss": 1.0056, + "step": 40692 + }, + { + "epoch": 0.5086877171929298, + "grad_norm": 3.2557339668273926, + "learning_rate": 1.143712491316247e-05, + "loss": 1.3926, + "step": 40694 + }, + { + "epoch": 0.5087127178179455, + "grad_norm": 3.1683554649353027, + "learning_rate": 1.1436261277785937e-05, + "loss": 0.5173, + "step": 40696 + }, + { + "epoch": 0.5087377184429611, + "grad_norm": 0.0006429032073356211, + "learning_rate": 1.1435397631471039e-05, + "loss": 0.0001, + "step": 40698 + }, + { + "epoch": 0.5087627190679767, + "grad_norm": 2.406752347946167, + "learning_rate": 1.1434533974224355e-05, + "loss": 0.145, + "step": 40700 + }, + { + "epoch": 0.5087877196929923, + "grad_norm": 3.7008888721466064, + "learning_rate": 1.1433670306052465e-05, + "loss": 0.9143, + "step": 40702 + }, + { + "epoch": 0.508812720318008, + "grad_norm": 2.998556613922119, + "learning_rate": 1.1432806626961942e-05, + "loss": 0.1459, + "step": 40704 + }, + { + "epoch": 0.5088377209430236, + "grad_norm": 11.446463584899902, + "learning_rate": 1.1431942936959369e-05, + "loss": 0.841, + "step": 40706 + }, + { + "epoch": 0.5088627215680392, + "grad_norm": 2.992790460586548, + "learning_rate": 1.1431079236051322e-05, + "loss": 0.813, + "step": 40708 + }, + { + "epoch": 0.5088877221930548, + "grad_norm": 2.0000052452087402, + "learning_rate": 1.1430215524244375e-05, + "loss": 0.2782, + "step": 40710 + }, + { + "epoch": 0.5089127228180704, + "grad_norm": 4.746365547180176, + "learning_rate": 1.142935180154511e-05, + "loss": 1.0244, + "step": 40712 + }, + { + "epoch": 0.5089377234430861, + "grad_norm": 1.5325907468795776, + "learning_rate": 1.1428488067960104e-05, + "loss": 0.3306, + "step": 40714 + }, + { + "epoch": 0.5089627240681017, + "grad_norm": 2.659353017807007, + "learning_rate": 1.1427624323495934e-05, + "loss": 1.1851, + "step": 40716 + }, + { + "epoch": 0.5089877246931174, + "grad_norm": 3.264890670776367, + "learning_rate": 1.1426760568159178e-05, + "loss": 0.9481, + "step": 40718 + }, + { + "epoch": 0.5090127253181329, + "grad_norm": 3.0186471939086914, + "learning_rate": 1.1425896801956418e-05, + "loss": 1.2226, + "step": 40720 + }, + { + "epoch": 0.5090377259431486, + "grad_norm": 2.657717227935791, + "learning_rate": 1.1425033024894229e-05, + "loss": 0.6669, + "step": 40722 + }, + { + "epoch": 0.5090627265681642, + "grad_norm": 2.328856945037842, + "learning_rate": 1.142416923697919e-05, + "loss": 0.583, + "step": 40724 + }, + { + "epoch": 0.5090877271931799, + "grad_norm": 3.443450689315796, + "learning_rate": 1.142330543821788e-05, + "loss": 2.0131, + "step": 40726 + }, + { + "epoch": 0.5091127278181955, + "grad_norm": 5.115182399749756, + "learning_rate": 1.1422441628616875e-05, + "loss": 0.9411, + "step": 40728 + }, + { + "epoch": 0.509137728443211, + "grad_norm": 0.00040704107959754765, + "learning_rate": 1.1421577808182756e-05, + "loss": 0.2779, + "step": 40730 + }, + { + "epoch": 0.5091627290682267, + "grad_norm": 1.8547570705413818, + "learning_rate": 1.1420713976922102e-05, + "loss": 1.1738, + "step": 40732 + }, + { + "epoch": 0.5091877296932423, + "grad_norm": 0.000774783780798316, + "learning_rate": 1.1419850134841494e-05, + "loss": 0.818, + "step": 40734 + }, + { + "epoch": 0.509212730318258, + "grad_norm": 2.679910182952881, + "learning_rate": 1.1418986281947505e-05, + "loss": 1.3088, + "step": 40736 + }, + { + "epoch": 0.5092377309432736, + "grad_norm": 1.3873828649520874, + "learning_rate": 1.1418122418246718e-05, + "loss": 0.019, + "step": 40738 + }, + { + "epoch": 0.5092627315682892, + "grad_norm": 6.428995609283447, + "learning_rate": 1.1417258543745711e-05, + "loss": 0.8043, + "step": 40740 + }, + { + "epoch": 0.5092877321933048, + "grad_norm": 3.213331937789917, + "learning_rate": 1.1416394658451062e-05, + "loss": 0.8411, + "step": 40742 + }, + { + "epoch": 0.5093127328183205, + "grad_norm": 4.447099208831787, + "learning_rate": 1.1415530762369353e-05, + "loss": 1.3836, + "step": 40744 + }, + { + "epoch": 0.5093377334433361, + "grad_norm": 5.28937292098999, + "learning_rate": 1.1414666855507161e-05, + "loss": 1.7476, + "step": 40746 + }, + { + "epoch": 0.5093627340683518, + "grad_norm": 0.0003321593103464693, + "learning_rate": 1.141380293787107e-05, + "loss": 1.2715, + "step": 40748 + }, + { + "epoch": 0.5093877346933673, + "grad_norm": 3.5420913696289062, + "learning_rate": 1.1412939009467652e-05, + "loss": 1.2394, + "step": 40750 + }, + { + "epoch": 0.5094127353183829, + "grad_norm": 2.552300214767456, + "learning_rate": 1.1412075070303493e-05, + "loss": 1.0259, + "step": 40752 + }, + { + "epoch": 0.5094377359433986, + "grad_norm": 11.911069869995117, + "learning_rate": 1.1411211120385166e-05, + "loss": 1.43, + "step": 40754 + }, + { + "epoch": 0.5094627365684142, + "grad_norm": 0.7532284259796143, + "learning_rate": 1.1410347159719257e-05, + "loss": 0.0143, + "step": 40756 + }, + { + "epoch": 0.5094877371934299, + "grad_norm": 0.0007114968611858785, + "learning_rate": 1.1409483188312343e-05, + "loss": 0.5741, + "step": 40758 + }, + { + "epoch": 0.5095127378184454, + "grad_norm": 2.7787208557128906, + "learning_rate": 1.1408619206171006e-05, + "loss": 1.5716, + "step": 40760 + }, + { + "epoch": 0.5095377384434611, + "grad_norm": 5.484953880310059, + "learning_rate": 1.1407755213301824e-05, + "loss": 1.3474, + "step": 40762 + }, + { + "epoch": 0.5095627390684767, + "grad_norm": 8.794147491455078, + "learning_rate": 1.1406891209711374e-05, + "loss": 0.6315, + "step": 40764 + }, + { + "epoch": 0.5095877396934924, + "grad_norm": 6.624298572540283, + "learning_rate": 1.1406027195406241e-05, + "loss": 0.2138, + "step": 40766 + }, + { + "epoch": 0.509612740318508, + "grad_norm": 0.0006020857254043221, + "learning_rate": 1.1405163170393006e-05, + "loss": 1.4254, + "step": 40768 + }, + { + "epoch": 0.5096377409435235, + "grad_norm": 3.2671804428100586, + "learning_rate": 1.1404299134678245e-05, + "loss": 1.8602, + "step": 40770 + }, + { + "epoch": 0.5096627415685392, + "grad_norm": 1.6978899240493774, + "learning_rate": 1.140343508826854e-05, + "loss": 0.3193, + "step": 40772 + }, + { + "epoch": 0.5096877421935548, + "grad_norm": 3.2478742599487305, + "learning_rate": 1.1402571031170476e-05, + "loss": 0.7299, + "step": 40774 + }, + { + "epoch": 0.5097127428185705, + "grad_norm": 3.416335344314575, + "learning_rate": 1.1401706963390625e-05, + "loss": 0.7782, + "step": 40776 + }, + { + "epoch": 0.5097377434435861, + "grad_norm": 4.1019673347473145, + "learning_rate": 1.140084288493557e-05, + "loss": 0.9715, + "step": 40778 + }, + { + "epoch": 0.5097627440686017, + "grad_norm": 3.5202159881591797, + "learning_rate": 1.1399978795811898e-05, + "loss": 1.4369, + "step": 40780 + }, + { + "epoch": 0.5097877446936173, + "grad_norm": 5.227280139923096, + "learning_rate": 1.1399114696026186e-05, + "loss": 1.2713, + "step": 40782 + }, + { + "epoch": 0.509812745318633, + "grad_norm": 8.295499801635742, + "learning_rate": 1.139825058558501e-05, + "loss": 1.1976, + "step": 40784 + }, + { + "epoch": 0.5098377459436486, + "grad_norm": 2.0132620334625244, + "learning_rate": 1.1397386464494962e-05, + "loss": 0.8331, + "step": 40786 + }, + { + "epoch": 0.5098627465686643, + "grad_norm": 0.7210425138473511, + "learning_rate": 1.1396522332762611e-05, + "loss": 1.2051, + "step": 40788 + }, + { + "epoch": 0.5098877471936798, + "grad_norm": 5.235047817230225, + "learning_rate": 1.1395658190394543e-05, + "loss": 1.3508, + "step": 40790 + }, + { + "epoch": 0.5099127478186954, + "grad_norm": 4.117257118225098, + "learning_rate": 1.1394794037397343e-05, + "loss": 1.422, + "step": 40792 + }, + { + "epoch": 0.5099377484437111, + "grad_norm": 2.566877841949463, + "learning_rate": 1.1393929873777584e-05, + "loss": 0.8127, + "step": 40794 + }, + { + "epoch": 0.5099627490687267, + "grad_norm": 2.186958074569702, + "learning_rate": 1.1393065699541857e-05, + "loss": 0.3738, + "step": 40796 + }, + { + "epoch": 0.5099877496937424, + "grad_norm": 4.3654985427856445, + "learning_rate": 1.1392201514696736e-05, + "loss": 1.5826, + "step": 40798 + }, + { + "epoch": 0.5100127503187579, + "grad_norm": 9.51797103881836, + "learning_rate": 1.1391337319248807e-05, + "loss": 0.8817, + "step": 40800 + }, + { + "epoch": 0.5100377509437736, + "grad_norm": 0.0007501388899981976, + "learning_rate": 1.1390473113204647e-05, + "loss": 0.7707, + "step": 40802 + }, + { + "epoch": 0.5100627515687892, + "grad_norm": 4.161233901977539, + "learning_rate": 1.1389608896570841e-05, + "loss": 1.6006, + "step": 40804 + }, + { + "epoch": 0.5100877521938049, + "grad_norm": 3.3838303089141846, + "learning_rate": 1.1388744669353975e-05, + "loss": 0.5796, + "step": 40806 + }, + { + "epoch": 0.5101127528188205, + "grad_norm": 0.44312500953674316, + "learning_rate": 1.138788043156062e-05, + "loss": 0.9771, + "step": 40808 + }, + { + "epoch": 0.510137753443836, + "grad_norm": 4.07821798324585, + "learning_rate": 1.1387016183197365e-05, + "loss": 0.9303, + "step": 40810 + }, + { + "epoch": 0.5101627540688517, + "grad_norm": 3.5240745544433594, + "learning_rate": 1.1386151924270796e-05, + "loss": 1.7072, + "step": 40812 + }, + { + "epoch": 0.5101877546938673, + "grad_norm": 1.7626951932907104, + "learning_rate": 1.1385287654787482e-05, + "loss": 1.126, + "step": 40814 + }, + { + "epoch": 0.510212755318883, + "grad_norm": 0.03303120285272598, + "learning_rate": 1.1384423374754019e-05, + "loss": 0.6373, + "step": 40816 + }, + { + "epoch": 0.5102377559438986, + "grad_norm": 3.9612622261047363, + "learning_rate": 1.138355908417698e-05, + "loss": 1.5314, + "step": 40818 + }, + { + "epoch": 0.5102627565689142, + "grad_norm": 3.8464393615722656, + "learning_rate": 1.1382694783062953e-05, + "loss": 0.5192, + "step": 40820 + }, + { + "epoch": 0.5102877571939298, + "grad_norm": 4.1210808753967285, + "learning_rate": 1.1381830471418518e-05, + "loss": 0.9224, + "step": 40822 + }, + { + "epoch": 0.5103127578189455, + "grad_norm": 0.0022111458238214254, + "learning_rate": 1.1380966149250257e-05, + "loss": 0.0011, + "step": 40824 + }, + { + "epoch": 0.5103377584439611, + "grad_norm": 0.001027149148285389, + "learning_rate": 1.1380101816564754e-05, + "loss": 0.4011, + "step": 40826 + }, + { + "epoch": 0.5103627590689768, + "grad_norm": 4.374035835266113, + "learning_rate": 1.1379237473368587e-05, + "loss": 1.3737, + "step": 40828 + }, + { + "epoch": 0.5103877596939923, + "grad_norm": 2.6712117195129395, + "learning_rate": 1.1378373119668344e-05, + "loss": 0.9936, + "step": 40830 + }, + { + "epoch": 0.510412760319008, + "grad_norm": 5.0939178466796875, + "learning_rate": 1.137750875547061e-05, + "loss": 0.5899, + "step": 40832 + }, + { + "epoch": 0.5104377609440236, + "grad_norm": 2.873481512069702, + "learning_rate": 1.137664438078196e-05, + "loss": 0.5592, + "step": 40834 + }, + { + "epoch": 0.5104627615690392, + "grad_norm": 0.8344748616218567, + "learning_rate": 1.1375779995608982e-05, + "loss": 0.6489, + "step": 40836 + }, + { + "epoch": 0.5104877621940549, + "grad_norm": 2.714850664138794, + "learning_rate": 1.137491559995826e-05, + "loss": 1.2321, + "step": 40838 + }, + { + "epoch": 0.5105127628190704, + "grad_norm": 0.0007642558193765581, + "learning_rate": 1.1374051193836373e-05, + "loss": 0.7911, + "step": 40840 + }, + { + "epoch": 0.5105377634440861, + "grad_norm": 4.759941101074219, + "learning_rate": 1.1373186777249907e-05, + "loss": 0.6967, + "step": 40842 + }, + { + "epoch": 0.5105627640691017, + "grad_norm": 3.4839048385620117, + "learning_rate": 1.1372322350205445e-05, + "loss": 1.2815, + "step": 40844 + }, + { + "epoch": 0.5105877646941174, + "grad_norm": 1.822827696800232, + "learning_rate": 1.1371457912709571e-05, + "loss": 0.8655, + "step": 40846 + }, + { + "epoch": 0.510612765319133, + "grad_norm": 3.1807608604431152, + "learning_rate": 1.1370593464768867e-05, + "loss": 1.1903, + "step": 40848 + }, + { + "epoch": 0.5106377659441486, + "grad_norm": 1.271881341934204, + "learning_rate": 1.1369729006389917e-05, + "loss": 0.2252, + "step": 40850 + }, + { + "epoch": 0.5106627665691642, + "grad_norm": 0.0009111305698752403, + "learning_rate": 1.1368864537579301e-05, + "loss": 0.9021, + "step": 40852 + }, + { + "epoch": 0.5106877671941799, + "grad_norm": 0.0008654703269712627, + "learning_rate": 1.136800005834361e-05, + "loss": 0.0, + "step": 40854 + }, + { + "epoch": 0.5107127678191955, + "grad_norm": 3.3247933387756348, + "learning_rate": 1.1367135568689422e-05, + "loss": 0.628, + "step": 40856 + }, + { + "epoch": 0.5107377684442111, + "grad_norm": 0.0008406714769080281, + "learning_rate": 1.1366271068623327e-05, + "loss": 0.6775, + "step": 40858 + }, + { + "epoch": 0.5107627690692267, + "grad_norm": 6.185091495513916, + "learning_rate": 1.1365406558151901e-05, + "loss": 1.8796, + "step": 40860 + }, + { + "epoch": 0.5107877696942423, + "grad_norm": 1.466148018836975, + "learning_rate": 1.1364542037281731e-05, + "loss": 0.0457, + "step": 40862 + }, + { + "epoch": 0.510812770319258, + "grad_norm": 0.006698978133499622, + "learning_rate": 1.1363677506019403e-05, + "loss": 0.1605, + "step": 40864 + }, + { + "epoch": 0.5108377709442736, + "grad_norm": 2.731339693069458, + "learning_rate": 1.1362812964371502e-05, + "loss": 1.0428, + "step": 40866 + }, + { + "epoch": 0.5108627715692893, + "grad_norm": 2.8779397010803223, + "learning_rate": 1.1361948412344607e-05, + "loss": 0.2241, + "step": 40868 + }, + { + "epoch": 0.5108877721943048, + "grad_norm": 0.0007466225069947541, + "learning_rate": 1.1361083849945306e-05, + "loss": 0.9622, + "step": 40870 + }, + { + "epoch": 0.5109127728193205, + "grad_norm": 1.0817956924438477, + "learning_rate": 1.1360219277180187e-05, + "loss": 0.03, + "step": 40872 + }, + { + "epoch": 0.5109377734443361, + "grad_norm": 4.6089043617248535, + "learning_rate": 1.1359354694055826e-05, + "loss": 0.7692, + "step": 40874 + }, + { + "epoch": 0.5109627740693518, + "grad_norm": 4.004670143127441, + "learning_rate": 1.1358490100578812e-05, + "loss": 2.1105, + "step": 40876 + }, + { + "epoch": 0.5109877746943674, + "grad_norm": 2.453390598297119, + "learning_rate": 1.1357625496755731e-05, + "loss": 0.4191, + "step": 40878 + }, + { + "epoch": 0.5110127753193829, + "grad_norm": 6.451602935791016, + "learning_rate": 1.1356760882593166e-05, + "loss": 2.0182, + "step": 40880 + }, + { + "epoch": 0.5110377759443986, + "grad_norm": 4.584702491760254, + "learning_rate": 1.13558962580977e-05, + "loss": 1.0775, + "step": 40882 + }, + { + "epoch": 0.5110627765694142, + "grad_norm": 3.1967129707336426, + "learning_rate": 1.1355031623275925e-05, + "loss": 1.1487, + "step": 40884 + }, + { + "epoch": 0.5110877771944299, + "grad_norm": 0.9275351166725159, + "learning_rate": 1.1354166978134416e-05, + "loss": 0.1911, + "step": 40886 + }, + { + "epoch": 0.5111127778194455, + "grad_norm": 6.0396409034729, + "learning_rate": 1.1353302322679765e-05, + "loss": 1.0157, + "step": 40888 + }, + { + "epoch": 0.5111377784444611, + "grad_norm": 5.33284854888916, + "learning_rate": 1.1352437656918553e-05, + "loss": 0.6681, + "step": 40890 + }, + { + "epoch": 0.5111627790694767, + "grad_norm": 5.072930812835693, + "learning_rate": 1.1351572980857368e-05, + "loss": 1.5021, + "step": 40892 + }, + { + "epoch": 0.5111877796944924, + "grad_norm": 0.8160592317581177, + "learning_rate": 1.1350708294502795e-05, + "loss": 0.1616, + "step": 40894 + }, + { + "epoch": 0.511212780319508, + "grad_norm": 5.390807151794434, + "learning_rate": 1.1349843597861416e-05, + "loss": 2.0527, + "step": 40896 + }, + { + "epoch": 0.5112377809445237, + "grad_norm": 3.1916749477386475, + "learning_rate": 1.1348978890939824e-05, + "loss": 2.5237, + "step": 40898 + }, + { + "epoch": 0.5112627815695392, + "grad_norm": 4.394272327423096, + "learning_rate": 1.1348114173744595e-05, + "loss": 1.0783, + "step": 40900 + }, + { + "epoch": 0.5112877821945548, + "grad_norm": 6.837753772735596, + "learning_rate": 1.1347249446282322e-05, + "loss": 1.501, + "step": 40902 + }, + { + "epoch": 0.5113127828195705, + "grad_norm": 0.0010308605851605535, + "learning_rate": 1.1346384708559586e-05, + "loss": 0.1125, + "step": 40904 + }, + { + "epoch": 0.5113377834445861, + "grad_norm": 3.834789991378784, + "learning_rate": 1.1345519960582974e-05, + "loss": 1.0691, + "step": 40906 + }, + { + "epoch": 0.5113627840696018, + "grad_norm": 3.0442025661468506, + "learning_rate": 1.1344655202359073e-05, + "loss": 0.5893, + "step": 40908 + }, + { + "epoch": 0.5113877846946173, + "grad_norm": 0.003422715002670884, + "learning_rate": 1.1343790433894471e-05, + "loss": 0.0412, + "step": 40910 + }, + { + "epoch": 0.511412785319633, + "grad_norm": 0.005219494458287954, + "learning_rate": 1.1342925655195747e-05, + "loss": 0.7271, + "step": 40912 + }, + { + "epoch": 0.5114377859446486, + "grad_norm": 2.4728920459747314, + "learning_rate": 1.1342060866269492e-05, + "loss": 1.7448, + "step": 40914 + }, + { + "epoch": 0.5114627865696643, + "grad_norm": 0.0009977736044675112, + "learning_rate": 1.134119606712229e-05, + "loss": 0.3077, + "step": 40916 + }, + { + "epoch": 0.5114877871946799, + "grad_norm": 5.893148899078369, + "learning_rate": 1.1340331257760732e-05, + "loss": 1.0298, + "step": 40918 + }, + { + "epoch": 0.5115127878196954, + "grad_norm": 3.986182928085327, + "learning_rate": 1.1339466438191398e-05, + "loss": 1.7713, + "step": 40920 + }, + { + "epoch": 0.5115377884447111, + "grad_norm": 2.9256415367126465, + "learning_rate": 1.1338601608420879e-05, + "loss": 0.1731, + "step": 40922 + }, + { + "epoch": 0.5115627890697267, + "grad_norm": 0.616551399230957, + "learning_rate": 1.1337736768455759e-05, + "loss": 0.5695, + "step": 40924 + }, + { + "epoch": 0.5115877896947424, + "grad_norm": 11.035072326660156, + "learning_rate": 1.1336871918302624e-05, + "loss": 1.5129, + "step": 40926 + }, + { + "epoch": 0.511612790319758, + "grad_norm": 6.570460319519043, + "learning_rate": 1.1336007057968062e-05, + "loss": 1.0297, + "step": 40928 + }, + { + "epoch": 0.5116377909447736, + "grad_norm": 0.000874807417858392, + "learning_rate": 1.1335142187458658e-05, + "loss": 0.0542, + "step": 40930 + }, + { + "epoch": 0.5116627915697892, + "grad_norm": 1.7656680345535278, + "learning_rate": 1.1334277306781001e-05, + "loss": 0.0758, + "step": 40932 + }, + { + "epoch": 0.5116877921948049, + "grad_norm": 4.894719123840332, + "learning_rate": 1.1333412415941677e-05, + "loss": 0.8377, + "step": 40934 + }, + { + "epoch": 0.5117127928198205, + "grad_norm": 2.5709187984466553, + "learning_rate": 1.1332547514947272e-05, + "loss": 0.2284, + "step": 40936 + }, + { + "epoch": 0.5117377934448362, + "grad_norm": 7.245412826538086, + "learning_rate": 1.1331682603804375e-05, + "loss": 0.926, + "step": 40938 + }, + { + "epoch": 0.5117627940698517, + "grad_norm": 3.820967197418213, + "learning_rate": 1.1330817682519569e-05, + "loss": 1.2184, + "step": 40940 + }, + { + "epoch": 0.5117877946948673, + "grad_norm": 2.5487499237060547, + "learning_rate": 1.1329952751099444e-05, + "loss": 0.7596, + "step": 40942 + }, + { + "epoch": 0.511812795319883, + "grad_norm": 4.730111122131348, + "learning_rate": 1.1329087809550588e-05, + "loss": 1.3274, + "step": 40944 + }, + { + "epoch": 0.5118377959448986, + "grad_norm": 3.256866455078125, + "learning_rate": 1.1328222857879587e-05, + "loss": 1.1268, + "step": 40946 + }, + { + "epoch": 0.5118627965699143, + "grad_norm": 5.023716449737549, + "learning_rate": 1.1327357896093028e-05, + "loss": 1.4416, + "step": 40948 + }, + { + "epoch": 0.5118877971949298, + "grad_norm": 1.0923148393630981, + "learning_rate": 1.13264929241975e-05, + "loss": 0.4505, + "step": 40950 + }, + { + "epoch": 0.5119127978199455, + "grad_norm": 4.12661075592041, + "learning_rate": 1.132562794219959e-05, + "loss": 0.9451, + "step": 40952 + }, + { + "epoch": 0.5119377984449611, + "grad_norm": 6.074955463409424, + "learning_rate": 1.1324762950105886e-05, + "loss": 0.0869, + "step": 40954 + }, + { + "epoch": 0.5119627990699768, + "grad_norm": 0.0007075035246089101, + "learning_rate": 1.132389794792297e-05, + "loss": 0.6218, + "step": 40956 + }, + { + "epoch": 0.5119877996949924, + "grad_norm": 2.544034719467163, + "learning_rate": 1.132303293565744e-05, + "loss": 1.5896, + "step": 40958 + }, + { + "epoch": 0.512012800320008, + "grad_norm": 3.219519853591919, + "learning_rate": 1.1322167913315874e-05, + "loss": 0.5989, + "step": 40960 + }, + { + "epoch": 0.5120378009450236, + "grad_norm": 4.2688446044921875, + "learning_rate": 1.1321302880904868e-05, + "loss": 1.3313, + "step": 40962 + }, + { + "epoch": 0.5120628015700392, + "grad_norm": 1.4407238960266113, + "learning_rate": 1.1320437838431003e-05, + "loss": 0.0832, + "step": 40964 + }, + { + "epoch": 0.5120878021950549, + "grad_norm": 2.0776145458221436, + "learning_rate": 1.131957278590087e-05, + "loss": 0.7838, + "step": 40966 + }, + { + "epoch": 0.5121128028200705, + "grad_norm": 0.7994773983955383, + "learning_rate": 1.1318707723321056e-05, + "loss": 0.7419, + "step": 40968 + }, + { + "epoch": 0.5121378034450861, + "grad_norm": 0.000794722349382937, + "learning_rate": 1.1317842650698154e-05, + "loss": 0.0003, + "step": 40970 + }, + { + "epoch": 0.5121628040701017, + "grad_norm": 3.429107904434204, + "learning_rate": 1.1316977568038747e-05, + "loss": 0.6676, + "step": 40972 + }, + { + "epoch": 0.5121878046951174, + "grad_norm": 2.7449233531951904, + "learning_rate": 1.1316112475349426e-05, + "loss": 0.5209, + "step": 40974 + }, + { + "epoch": 0.512212805320133, + "grad_norm": 3.8298745155334473, + "learning_rate": 1.131524737263678e-05, + "loss": 1.6421, + "step": 40976 + }, + { + "epoch": 0.5122378059451487, + "grad_norm": 2.5031914710998535, + "learning_rate": 1.1314382259907393e-05, + "loss": 0.449, + "step": 40978 + }, + { + "epoch": 0.5122628065701642, + "grad_norm": 2.5825302600860596, + "learning_rate": 1.1313517137167856e-05, + "loss": 0.9277, + "step": 40980 + }, + { + "epoch": 0.5122878071951799, + "grad_norm": 0.12928315997123718, + "learning_rate": 1.1312652004424763e-05, + "loss": 1.087, + "step": 40982 + }, + { + "epoch": 0.5123128078201955, + "grad_norm": 1.9324672222137451, + "learning_rate": 1.1311786861684692e-05, + "loss": 0.9647, + "step": 40984 + }, + { + "epoch": 0.5123378084452112, + "grad_norm": 2.7364656925201416, + "learning_rate": 1.1310921708954242e-05, + "loss": 0.5023, + "step": 40986 + }, + { + "epoch": 0.5123628090702268, + "grad_norm": 0.020634541288018227, + "learning_rate": 1.1310056546239998e-05, + "loss": 0.8019, + "step": 40988 + }, + { + "epoch": 0.5123878096952423, + "grad_norm": 0.7863529920578003, + "learning_rate": 1.1309191373548547e-05, + "loss": 0.0112, + "step": 40990 + }, + { + "epoch": 0.512412810320258, + "grad_norm": 3.501368761062622, + "learning_rate": 1.1308326190886477e-05, + "loss": 0.7357, + "step": 40992 + }, + { + "epoch": 0.5124378109452736, + "grad_norm": 2.7467525005340576, + "learning_rate": 1.130746099826038e-05, + "loss": 0.9163, + "step": 40994 + }, + { + "epoch": 0.5124628115702893, + "grad_norm": 5.597585201263428, + "learning_rate": 1.130659579567685e-05, + "loss": 0.9119, + "step": 40996 + }, + { + "epoch": 0.5124878121953049, + "grad_norm": 3.6761436462402344, + "learning_rate": 1.130573058314247e-05, + "loss": 1.5434, + "step": 40998 + }, + { + "epoch": 0.5125128128203205, + "grad_norm": 2.213397979736328, + "learning_rate": 1.1304865360663825e-05, + "loss": 0.4457, + "step": 41000 + }, + { + "epoch": 0.5125378134453361, + "grad_norm": 2.9859390258789062, + "learning_rate": 1.1304000128247516e-05, + "loss": 1.5393, + "step": 41002 + }, + { + "epoch": 0.5125628140703518, + "grad_norm": 5.828729629516602, + "learning_rate": 1.1303134885900123e-05, + "loss": 1.8882, + "step": 41004 + }, + { + "epoch": 0.5125878146953674, + "grad_norm": 0.0006819910486228764, + "learning_rate": 1.130226963362824e-05, + "loss": 0.1196, + "step": 41006 + }, + { + "epoch": 0.512612815320383, + "grad_norm": 7.000441551208496, + "learning_rate": 1.1301404371438458e-05, + "loss": 0.9768, + "step": 41008 + }, + { + "epoch": 0.5126378159453986, + "grad_norm": 2.0840628147125244, + "learning_rate": 1.1300539099337362e-05, + "loss": 0.8527, + "step": 41010 + }, + { + "epoch": 0.5126628165704142, + "grad_norm": 0.6192066669464111, + "learning_rate": 1.1299673817331543e-05, + "loss": 0.0194, + "step": 41012 + }, + { + "epoch": 0.5126878171954299, + "grad_norm": 0.00043894010013900697, + "learning_rate": 1.1298808525427596e-05, + "loss": 0.0923, + "step": 41014 + }, + { + "epoch": 0.5127128178204455, + "grad_norm": 2.3433055877685547, + "learning_rate": 1.1297943223632104e-05, + "loss": 0.6288, + "step": 41016 + }, + { + "epoch": 0.5127378184454612, + "grad_norm": 3.6009223461151123, + "learning_rate": 1.1297077911951657e-05, + "loss": 0.2732, + "step": 41018 + }, + { + "epoch": 0.5127628190704767, + "grad_norm": 0.0007577959913760424, + "learning_rate": 1.129621259039285e-05, + "loss": 0.0008, + "step": 41020 + }, + { + "epoch": 0.5127878196954924, + "grad_norm": 0.979278028011322, + "learning_rate": 1.1295347258962275e-05, + "loss": 1.1315, + "step": 41022 + }, + { + "epoch": 0.512812820320508, + "grad_norm": 2.8842806816101074, + "learning_rate": 1.1294481917666516e-05, + "loss": 0.1567, + "step": 41024 + }, + { + "epoch": 0.5128378209455237, + "grad_norm": 4.447488784790039, + "learning_rate": 1.1293616566512164e-05, + "loss": 0.7576, + "step": 41026 + }, + { + "epoch": 0.5128628215705393, + "grad_norm": 2.4503791332244873, + "learning_rate": 1.1292751205505816e-05, + "loss": 1.2996, + "step": 41028 + }, + { + "epoch": 0.5128878221955548, + "grad_norm": 3.4452996253967285, + "learning_rate": 1.1291885834654052e-05, + "loss": 0.6496, + "step": 41030 + }, + { + "epoch": 0.5129128228205705, + "grad_norm": 3.0047574043273926, + "learning_rate": 1.129102045396347e-05, + "loss": 2.4387, + "step": 41032 + }, + { + "epoch": 0.5129378234455861, + "grad_norm": 2.9586243629455566, + "learning_rate": 1.1290155063440661e-05, + "loss": 0.6841, + "step": 41034 + }, + { + "epoch": 0.5129628240706018, + "grad_norm": 4.5925445556640625, + "learning_rate": 1.1289289663092213e-05, + "loss": 0.6365, + "step": 41036 + }, + { + "epoch": 0.5129878246956174, + "grad_norm": 8.400313377380371, + "learning_rate": 1.1288424252924715e-05, + "loss": 0.6394, + "step": 41038 + }, + { + "epoch": 0.513012825320633, + "grad_norm": 5.748590469360352, + "learning_rate": 1.1287558832944763e-05, + "loss": 1.3315, + "step": 41040 + }, + { + "epoch": 0.5130378259456486, + "grad_norm": 5.116616249084473, + "learning_rate": 1.1286693403158942e-05, + "loss": 0.899, + "step": 41042 + }, + { + "epoch": 0.5130628265706643, + "grad_norm": 8.41268539428711, + "learning_rate": 1.1285827963573846e-05, + "loss": 1.6813, + "step": 41044 + }, + { + "epoch": 0.5130878271956799, + "grad_norm": 0.0009841484716162086, + "learning_rate": 1.1284962514196068e-05, + "loss": 0.3578, + "step": 41046 + }, + { + "epoch": 0.5131128278206956, + "grad_norm": 4.847071647644043, + "learning_rate": 1.1284097055032198e-05, + "loss": 1.3856, + "step": 41048 + }, + { + "epoch": 0.5131378284457111, + "grad_norm": 3.620256185531616, + "learning_rate": 1.1283231586088826e-05, + "loss": 1.1114, + "step": 41050 + }, + { + "epoch": 0.5131628290707267, + "grad_norm": 2.8797037601470947, + "learning_rate": 1.128236610737254e-05, + "loss": 0.6578, + "step": 41052 + }, + { + "epoch": 0.5131878296957424, + "grad_norm": 0.00073544419137761, + "learning_rate": 1.1281500618889938e-05, + "loss": 0.4471, + "step": 41054 + }, + { + "epoch": 0.513212830320758, + "grad_norm": 0.0006472523091360927, + "learning_rate": 1.1280635120647608e-05, + "loss": 0.6259, + "step": 41056 + }, + { + "epoch": 0.5132378309457737, + "grad_norm": 2.600693464279175, + "learning_rate": 1.1279769612652141e-05, + "loss": 0.5267, + "step": 41058 + }, + { + "epoch": 0.5132628315707892, + "grad_norm": 0.0007656464586034417, + "learning_rate": 1.1278904094910132e-05, + "loss": 0.6421, + "step": 41060 + }, + { + "epoch": 0.5132878321958049, + "grad_norm": 2.000319004058838, + "learning_rate": 1.127803856742817e-05, + "loss": 1.0598, + "step": 41062 + }, + { + "epoch": 0.5133128328208205, + "grad_norm": 3.673412799835205, + "learning_rate": 1.1277173030212848e-05, + "loss": 0.7101, + "step": 41064 + }, + { + "epoch": 0.5133378334458362, + "grad_norm": 5.979178428649902, + "learning_rate": 1.1276307483270752e-05, + "loss": 1.3971, + "step": 41066 + }, + { + "epoch": 0.5133628340708518, + "grad_norm": 3.8257782459259033, + "learning_rate": 1.1275441926608482e-05, + "loss": 0.6396, + "step": 41068 + }, + { + "epoch": 0.5133878346958674, + "grad_norm": 0.40705984830856323, + "learning_rate": 1.1274576360232626e-05, + "loss": 0.8398, + "step": 41070 + }, + { + "epoch": 0.513412835320883, + "grad_norm": 3.130330801010132, + "learning_rate": 1.1273710784149776e-05, + "loss": 1.4578, + "step": 41072 + }, + { + "epoch": 0.5134378359458986, + "grad_norm": 4.8402533531188965, + "learning_rate": 1.1272845198366529e-05, + "loss": 1.1679, + "step": 41074 + }, + { + "epoch": 0.5134628365709143, + "grad_norm": 4.092730522155762, + "learning_rate": 1.1271979602889469e-05, + "loss": 1.9791, + "step": 41076 + }, + { + "epoch": 0.5134878371959299, + "grad_norm": 0.7456533312797546, + "learning_rate": 1.1271113997725191e-05, + "loss": 0.082, + "step": 41078 + }, + { + "epoch": 0.5135128378209455, + "grad_norm": 3.6839864253997803, + "learning_rate": 1.127024838288029e-05, + "loss": 1.1545, + "step": 41080 + }, + { + "epoch": 0.5135378384459611, + "grad_norm": 52.65633010864258, + "learning_rate": 1.1269382758361357e-05, + "loss": 0.2952, + "step": 41082 + }, + { + "epoch": 0.5135628390709768, + "grad_norm": 2.017094850540161, + "learning_rate": 1.1268517124174986e-05, + "loss": 1.0344, + "step": 41084 + }, + { + "epoch": 0.5135878396959924, + "grad_norm": 0.43083375692367554, + "learning_rate": 1.1267651480327768e-05, + "loss": 0.3217, + "step": 41086 + }, + { + "epoch": 0.5136128403210081, + "grad_norm": 2.97383975982666, + "learning_rate": 1.1266785826826295e-05, + "loss": 1.0991, + "step": 41088 + }, + { + "epoch": 0.5136378409460236, + "grad_norm": 0.04521339014172554, + "learning_rate": 1.126592016367716e-05, + "loss": 0.4778, + "step": 41090 + }, + { + "epoch": 0.5136628415710393, + "grad_norm": 4.205217361450195, + "learning_rate": 1.1265054490886956e-05, + "loss": 0.7656, + "step": 41092 + }, + { + "epoch": 0.5136878421960549, + "grad_norm": 3.734722137451172, + "learning_rate": 1.1264188808462272e-05, + "loss": 0.8461, + "step": 41094 + }, + { + "epoch": 0.5137128428210705, + "grad_norm": 3.6992745399475098, + "learning_rate": 1.126332311640971e-05, + "loss": 0.573, + "step": 41096 + }, + { + "epoch": 0.5137378434460862, + "grad_norm": 2.4616711139678955, + "learning_rate": 1.1262457414735857e-05, + "loss": 0.1821, + "step": 41098 + }, + { + "epoch": 0.5137628440711017, + "grad_norm": 2.9637186527252197, + "learning_rate": 1.1261591703447308e-05, + "loss": 2.222, + "step": 41100 + }, + { + "epoch": 0.5137878446961174, + "grad_norm": 2.94144344329834, + "learning_rate": 1.126072598255065e-05, + "loss": 0.2045, + "step": 41102 + }, + { + "epoch": 0.513812845321133, + "grad_norm": 3.111217737197876, + "learning_rate": 1.1259860252052486e-05, + "loss": 0.2036, + "step": 41104 + }, + { + "epoch": 0.5138378459461487, + "grad_norm": 1.7919402122497559, + "learning_rate": 1.1258994511959402e-05, + "loss": 1.0755, + "step": 41106 + }, + { + "epoch": 0.5138628465711643, + "grad_norm": 6.791876316070557, + "learning_rate": 1.1258128762277993e-05, + "loss": 1.4906, + "step": 41108 + }, + { + "epoch": 0.5138878471961799, + "grad_norm": 3.1315724849700928, + "learning_rate": 1.1257263003014855e-05, + "loss": 0.8059, + "step": 41110 + }, + { + "epoch": 0.5139128478211955, + "grad_norm": 2.6583125591278076, + "learning_rate": 1.1256397234176581e-05, + "loss": 0.3615, + "step": 41112 + }, + { + "epoch": 0.5139378484462112, + "grad_norm": 4.086602210998535, + "learning_rate": 1.1255531455769764e-05, + "loss": 0.4994, + "step": 41114 + }, + { + "epoch": 0.5139628490712268, + "grad_norm": 2.1290924549102783, + "learning_rate": 1.1254665667800993e-05, + "loss": 0.9908, + "step": 41116 + }, + { + "epoch": 0.5139878496962424, + "grad_norm": 0.0011549955233931541, + "learning_rate": 1.1253799870276867e-05, + "loss": 0.727, + "step": 41118 + }, + { + "epoch": 0.514012850321258, + "grad_norm": 0.0006854751263745129, + "learning_rate": 1.1252934063203979e-05, + "loss": 0.5921, + "step": 41120 + }, + { + "epoch": 0.5140378509462736, + "grad_norm": 3.0848755836486816, + "learning_rate": 1.1252068246588924e-05, + "loss": 1.3618, + "step": 41122 + }, + { + "epoch": 0.5140628515712893, + "grad_norm": 0.0008513560169376433, + "learning_rate": 1.1251202420438292e-05, + "loss": 0.6945, + "step": 41124 + }, + { + "epoch": 0.5140878521963049, + "grad_norm": 3.3393142223358154, + "learning_rate": 1.1250336584758682e-05, + "loss": 0.8321, + "step": 41126 + }, + { + "epoch": 0.5141128528213206, + "grad_norm": 2.253911256790161, + "learning_rate": 1.1249470739556683e-05, + "loss": 0.585, + "step": 41128 + }, + { + "epoch": 0.5141378534463361, + "grad_norm": 2.4875998497009277, + "learning_rate": 1.124860488483889e-05, + "loss": 0.0721, + "step": 41130 + }, + { + "epoch": 0.5141628540713518, + "grad_norm": 2.362290859222412, + "learning_rate": 1.12477390206119e-05, + "loss": 1.0018, + "step": 41132 + }, + { + "epoch": 0.5141878546963674, + "grad_norm": 0.0008845367701724172, + "learning_rate": 1.1246873146882306e-05, + "loss": 0.2846, + "step": 41134 + }, + { + "epoch": 0.5142128553213831, + "grad_norm": 2.3519773483276367, + "learning_rate": 1.1246007263656704e-05, + "loss": 1.8073, + "step": 41136 + }, + { + "epoch": 0.5142378559463987, + "grad_norm": 3.6772162914276123, + "learning_rate": 1.1245141370941688e-05, + "loss": 0.9682, + "step": 41138 + }, + { + "epoch": 0.5142628565714142, + "grad_norm": 4.194272994995117, + "learning_rate": 1.124427546874385e-05, + "loss": 1.2962, + "step": 41140 + }, + { + "epoch": 0.5142878571964299, + "grad_norm": 5.858510971069336, + "learning_rate": 1.1243409557069784e-05, + "loss": 0.5243, + "step": 41142 + }, + { + "epoch": 0.5143128578214455, + "grad_norm": 3.2650654315948486, + "learning_rate": 1.124254363592609e-05, + "loss": 1.4651, + "step": 41144 + }, + { + "epoch": 0.5143378584464612, + "grad_norm": 0.0062059261836111546, + "learning_rate": 1.1241677705319356e-05, + "loss": 0.9022, + "step": 41146 + }, + { + "epoch": 0.5143628590714768, + "grad_norm": 1.6512504816055298, + "learning_rate": 1.1240811765256181e-05, + "loss": 0.0592, + "step": 41148 + }, + { + "epoch": 0.5143878596964924, + "grad_norm": 1.4733048677444458, + "learning_rate": 1.1239945815743161e-05, + "loss": 0.1167, + "step": 41150 + }, + { + "epoch": 0.514412860321508, + "grad_norm": 1.4920951128005981, + "learning_rate": 1.123907985678689e-05, + "loss": 0.7063, + "step": 41152 + }, + { + "epoch": 0.5144378609465237, + "grad_norm": 0.0064299809746444225, + "learning_rate": 1.1238213888393958e-05, + "loss": 0.882, + "step": 41154 + }, + { + "epoch": 0.5144628615715393, + "grad_norm": 5.065276145935059, + "learning_rate": 1.1237347910570966e-05, + "loss": 2.3036, + "step": 41156 + }, + { + "epoch": 0.514487862196555, + "grad_norm": 8.406710624694824, + "learning_rate": 1.1236481923324506e-05, + "loss": 1.1675, + "step": 41158 + }, + { + "epoch": 0.5145128628215705, + "grad_norm": 2.4200048446655273, + "learning_rate": 1.1235615926661177e-05, + "loss": 0.907, + "step": 41160 + }, + { + "epoch": 0.5145378634465861, + "grad_norm": 4.375838756561279, + "learning_rate": 1.1234749920587569e-05, + "loss": 1.4852, + "step": 41162 + }, + { + "epoch": 0.5145628640716018, + "grad_norm": 1.0918786525726318, + "learning_rate": 1.1233883905110283e-05, + "loss": 0.7802, + "step": 41164 + }, + { + "epoch": 0.5145878646966174, + "grad_norm": 0.0008570741047151387, + "learning_rate": 1.1233017880235913e-05, + "loss": 0.3226, + "step": 41166 + }, + { + "epoch": 0.5146128653216331, + "grad_norm": 3.2415075302124023, + "learning_rate": 1.1232151845971049e-05, + "loss": 1.4497, + "step": 41168 + }, + { + "epoch": 0.5146378659466486, + "grad_norm": 6.111270904541016, + "learning_rate": 1.1231285802322289e-05, + "loss": 1.2529, + "step": 41170 + }, + { + "epoch": 0.5146628665716643, + "grad_norm": 3.1247336864471436, + "learning_rate": 1.1230419749296237e-05, + "loss": 0.4751, + "step": 41172 + }, + { + "epoch": 0.5146878671966799, + "grad_norm": 6.630110740661621, + "learning_rate": 1.122955368689948e-05, + "loss": 2.5161, + "step": 41174 + }, + { + "epoch": 0.5147128678216956, + "grad_norm": 3.6351449489593506, + "learning_rate": 1.1228687615138614e-05, + "loss": 2.3957, + "step": 41176 + }, + { + "epoch": 0.5147378684467112, + "grad_norm": 4.765071392059326, + "learning_rate": 1.122782153402024e-05, + "loss": 0.6995, + "step": 41178 + }, + { + "epoch": 0.5147628690717267, + "grad_norm": 1.843782663345337, + "learning_rate": 1.1226955443550947e-05, + "loss": 0.9799, + "step": 41180 + }, + { + "epoch": 0.5147878696967424, + "grad_norm": 1.4347580671310425, + "learning_rate": 1.1226089343737335e-05, + "loss": 0.9153, + "step": 41182 + }, + { + "epoch": 0.514812870321758, + "grad_norm": 4.619332313537598, + "learning_rate": 1.1225223234586002e-05, + "loss": 1.373, + "step": 41184 + }, + { + "epoch": 0.5148378709467737, + "grad_norm": 2.050853967666626, + "learning_rate": 1.122435711610354e-05, + "loss": 0.3384, + "step": 41186 + }, + { + "epoch": 0.5148628715717893, + "grad_norm": 0.42093372344970703, + "learning_rate": 1.122349098829655e-05, + "loss": 0.6356, + "step": 41188 + }, + { + "epoch": 0.5148878721968049, + "grad_norm": 3.6382694244384766, + "learning_rate": 1.1222624851171629e-05, + "loss": 0.9932, + "step": 41190 + }, + { + "epoch": 0.5149128728218205, + "grad_norm": 3.65466046333313, + "learning_rate": 1.1221758704735364e-05, + "loss": 1.0024, + "step": 41192 + }, + { + "epoch": 0.5149378734468362, + "grad_norm": 3.052288770675659, + "learning_rate": 1.1220892548994358e-05, + "loss": 1.3417, + "step": 41194 + }, + { + "epoch": 0.5149628740718518, + "grad_norm": 0.0007519409991800785, + "learning_rate": 1.1220026383955207e-05, + "loss": 1.3321, + "step": 41196 + }, + { + "epoch": 0.5149878746968675, + "grad_norm": 1.5827486515045166, + "learning_rate": 1.1219160209624512e-05, + "loss": 0.9679, + "step": 41198 + }, + { + "epoch": 0.515012875321883, + "grad_norm": 1.490437388420105, + "learning_rate": 1.121829402600886e-05, + "loss": 0.0737, + "step": 41200 + }, + { + "epoch": 0.5150378759468986, + "grad_norm": 14.359716415405273, + "learning_rate": 1.1217427833114855e-05, + "loss": 1.8572, + "step": 41202 + }, + { + "epoch": 0.5150628765719143, + "grad_norm": 4.280669689178467, + "learning_rate": 1.1216561630949095e-05, + "loss": 0.6724, + "step": 41204 + }, + { + "epoch": 0.5150878771969299, + "grad_norm": 2.0949628353118896, + "learning_rate": 1.1215695419518166e-05, + "loss": 0.6455, + "step": 41206 + }, + { + "epoch": 0.5151128778219456, + "grad_norm": 0.024458087980747223, + "learning_rate": 1.1214829198828676e-05, + "loss": 1.0508, + "step": 41208 + }, + { + "epoch": 0.5151378784469611, + "grad_norm": 5.5916972160339355, + "learning_rate": 1.1213962968887221e-05, + "loss": 0.4209, + "step": 41210 + }, + { + "epoch": 0.5151628790719768, + "grad_norm": 1.9748526811599731, + "learning_rate": 1.1213096729700397e-05, + "loss": 0.558, + "step": 41212 + }, + { + "epoch": 0.5151878796969924, + "grad_norm": 3.7867722511291504, + "learning_rate": 1.1212230481274794e-05, + "loss": 0.7771, + "step": 41214 + }, + { + "epoch": 0.5152128803220081, + "grad_norm": 4.66817569732666, + "learning_rate": 1.121136422361702e-05, + "loss": 0.6632, + "step": 41216 + }, + { + "epoch": 0.5152378809470237, + "grad_norm": 3.0273702144622803, + "learning_rate": 1.1210497956733667e-05, + "loss": 0.5603, + "step": 41218 + }, + { + "epoch": 0.5152628815720393, + "grad_norm": 6.3303542137146, + "learning_rate": 1.120963168063133e-05, + "loss": 1.0695, + "step": 41220 + }, + { + "epoch": 0.5152878821970549, + "grad_norm": 2.7861592769622803, + "learning_rate": 1.120876539531661e-05, + "loss": 0.575, + "step": 41222 + }, + { + "epoch": 0.5153128828220706, + "grad_norm": 0.0008464690763503313, + "learning_rate": 1.1207899100796105e-05, + "loss": 0.3229, + "step": 41224 + }, + { + "epoch": 0.5153378834470862, + "grad_norm": 0.029587117955088615, + "learning_rate": 1.1207032797076412e-05, + "loss": 1.0439, + "step": 41226 + }, + { + "epoch": 0.5153628840721018, + "grad_norm": 0.2801695466041565, + "learning_rate": 1.1206166484164126e-05, + "loss": 0.0504, + "step": 41228 + }, + { + "epoch": 0.5153878846971174, + "grad_norm": 2.7699084281921387, + "learning_rate": 1.1205300162065849e-05, + "loss": 0.1089, + "step": 41230 + }, + { + "epoch": 0.515412885322133, + "grad_norm": 5.919309616088867, + "learning_rate": 1.1204433830788171e-05, + "loss": 0.8389, + "step": 41232 + }, + { + "epoch": 0.5154378859471487, + "grad_norm": 5.589211940765381, + "learning_rate": 1.1203567490337699e-05, + "loss": 1.2717, + "step": 41234 + }, + { + "epoch": 0.5154628865721643, + "grad_norm": 4.947680950164795, + "learning_rate": 1.1202701140721029e-05, + "loss": 0.6961, + "step": 41236 + }, + { + "epoch": 0.51548788719718, + "grad_norm": 4.8186845779418945, + "learning_rate": 1.1201834781944753e-05, + "loss": 0.2275, + "step": 41238 + }, + { + "epoch": 0.5155128878221955, + "grad_norm": 0.0011670355452224612, + "learning_rate": 1.1200968414015477e-05, + "loss": 0.9608, + "step": 41240 + }, + { + "epoch": 0.5155378884472112, + "grad_norm": 3.5558583736419678, + "learning_rate": 1.1200102036939793e-05, + "loss": 1.1881, + "step": 41242 + }, + { + "epoch": 0.5155628890722268, + "grad_norm": 4.129101753234863, + "learning_rate": 1.11992356507243e-05, + "loss": 1.5935, + "step": 41244 + }, + { + "epoch": 0.5155878896972425, + "grad_norm": 4.912830352783203, + "learning_rate": 1.11983692553756e-05, + "loss": 2.2481, + "step": 41246 + }, + { + "epoch": 0.5156128903222581, + "grad_norm": 0.000639550038613379, + "learning_rate": 1.1197502850900291e-05, + "loss": 1.0301, + "step": 41248 + }, + { + "epoch": 0.5156378909472736, + "grad_norm": 3.3595657348632812, + "learning_rate": 1.1196636437304968e-05, + "loss": 1.5734, + "step": 41250 + }, + { + "epoch": 0.5156628915722893, + "grad_norm": 7.915643692016602, + "learning_rate": 1.1195770014596233e-05, + "loss": 0.397, + "step": 41252 + }, + { + "epoch": 0.5156878921973049, + "grad_norm": 0.0007284855237230659, + "learning_rate": 1.119490358278068e-05, + "loss": 0.0009, + "step": 41254 + }, + { + "epoch": 0.5157128928223206, + "grad_norm": 7.6307268142700195, + "learning_rate": 1.1194037141864911e-05, + "loss": 0.74, + "step": 41256 + }, + { + "epoch": 0.5157378934473362, + "grad_norm": 1.1347451210021973, + "learning_rate": 1.1193170691855523e-05, + "loss": 0.4593, + "step": 41258 + }, + { + "epoch": 0.5157628940723518, + "grad_norm": 1.7931842803955078, + "learning_rate": 1.1192304232759117e-05, + "loss": 0.8556, + "step": 41260 + }, + { + "epoch": 0.5157878946973674, + "grad_norm": 4.012033462524414, + "learning_rate": 1.1191437764582291e-05, + "loss": 1.3807, + "step": 41262 + }, + { + "epoch": 0.5158128953223831, + "grad_norm": 4.7269134521484375, + "learning_rate": 1.1190571287331645e-05, + "loss": 0.8495, + "step": 41264 + }, + { + "epoch": 0.5158378959473987, + "grad_norm": 4.0263543128967285, + "learning_rate": 1.1189704801013772e-05, + "loss": 1.4749, + "step": 41266 + }, + { + "epoch": 0.5158628965724144, + "grad_norm": 2.905602216720581, + "learning_rate": 1.1188838305635279e-05, + "loss": 0.348, + "step": 41268 + }, + { + "epoch": 0.5158878971974299, + "grad_norm": 4.589589595794678, + "learning_rate": 1.118797180120276e-05, + "loss": 1.1345, + "step": 41270 + }, + { + "epoch": 0.5159128978224455, + "grad_norm": 1.7075016498565674, + "learning_rate": 1.1187105287722815e-05, + "loss": 0.0779, + "step": 41272 + }, + { + "epoch": 0.5159378984474612, + "grad_norm": 0.00070732063613832, + "learning_rate": 1.1186238765202045e-05, + "loss": 0.0025, + "step": 41274 + }, + { + "epoch": 0.5159628990724768, + "grad_norm": 2.1519722938537598, + "learning_rate": 1.1185372233647052e-05, + "loss": 0.6931, + "step": 41276 + }, + { + "epoch": 0.5159878996974925, + "grad_norm": 1.9183547496795654, + "learning_rate": 1.1184505693064427e-05, + "loss": 0.308, + "step": 41278 + }, + { + "epoch": 0.516012900322508, + "grad_norm": 2.434882879257202, + "learning_rate": 1.1183639143460773e-05, + "loss": 0.8294, + "step": 41280 + }, + { + "epoch": 0.5160379009475237, + "grad_norm": 4.538847923278809, + "learning_rate": 1.1182772584842693e-05, + "loss": 0.8604, + "step": 41282 + }, + { + "epoch": 0.5160629015725393, + "grad_norm": 2.3534555435180664, + "learning_rate": 1.1181906017216785e-05, + "loss": 0.714, + "step": 41284 + }, + { + "epoch": 0.516087902197555, + "grad_norm": 1.3990132808685303, + "learning_rate": 1.1181039440589648e-05, + "loss": 0.3037, + "step": 41286 + }, + { + "epoch": 0.5161129028225706, + "grad_norm": 4.309834957122803, + "learning_rate": 1.1180172854967881e-05, + "loss": 1.6355, + "step": 41288 + }, + { + "epoch": 0.5161379034475861, + "grad_norm": 4.856093883514404, + "learning_rate": 1.1179306260358085e-05, + "loss": 1.7181, + "step": 41290 + }, + { + "epoch": 0.5161629040726018, + "grad_norm": 4.177796363830566, + "learning_rate": 1.1178439656766858e-05, + "loss": 1.5563, + "step": 41292 + }, + { + "epoch": 0.5161879046976174, + "grad_norm": 0.6059793829917908, + "learning_rate": 1.11775730442008e-05, + "loss": 0.3583, + "step": 41294 + }, + { + "epoch": 0.5162129053226331, + "grad_norm": 0.15848945081233978, + "learning_rate": 1.1176706422666514e-05, + "loss": 1.0111, + "step": 41296 + }, + { + "epoch": 0.5162379059476487, + "grad_norm": 3.4882965087890625, + "learning_rate": 1.11758397921706e-05, + "loss": 0.46, + "step": 41298 + }, + { + "epoch": 0.5162629065726643, + "grad_norm": 1.59634530544281, + "learning_rate": 1.1174973152719652e-05, + "loss": 0.3378, + "step": 41300 + }, + { + "epoch": 0.5162879071976799, + "grad_norm": 9.132725715637207, + "learning_rate": 1.117410650432028e-05, + "loss": 0.7893, + "step": 41302 + }, + { + "epoch": 0.5163129078226956, + "grad_norm": 3.18919038772583, + "learning_rate": 1.1173239846979076e-05, + "loss": 0.901, + "step": 41304 + }, + { + "epoch": 0.5163379084477112, + "grad_norm": 5.740179061889648, + "learning_rate": 1.1172373180702642e-05, + "loss": 1.055, + "step": 41306 + }, + { + "epoch": 0.5163629090727269, + "grad_norm": 3.1211466789245605, + "learning_rate": 1.1171506505497582e-05, + "loss": 0.955, + "step": 41308 + }, + { + "epoch": 0.5163879096977424, + "grad_norm": 0.1402621865272522, + "learning_rate": 1.1170639821370493e-05, + "loss": 0.1799, + "step": 41310 + }, + { + "epoch": 0.516412910322758, + "grad_norm": 4.264436721801758, + "learning_rate": 1.1169773128327977e-05, + "loss": 1.0941, + "step": 41312 + }, + { + "epoch": 0.5164379109477737, + "grad_norm": 0.006186990533024073, + "learning_rate": 1.1168906426376635e-05, + "loss": 0.6641, + "step": 41314 + }, + { + "epoch": 0.5164629115727893, + "grad_norm": 3.574648141860962, + "learning_rate": 1.1168039715523066e-05, + "loss": 1.8566, + "step": 41316 + }, + { + "epoch": 0.516487912197805, + "grad_norm": 1.9041692018508911, + "learning_rate": 1.1167172995773871e-05, + "loss": 0.2027, + "step": 41318 + }, + { + "epoch": 0.5165129128228205, + "grad_norm": 4.072607040405273, + "learning_rate": 1.1166306267135652e-05, + "loss": 0.7811, + "step": 41320 + }, + { + "epoch": 0.5165379134478362, + "grad_norm": 4.895508289337158, + "learning_rate": 1.1165439529615009e-05, + "loss": 0.8072, + "step": 41322 + }, + { + "epoch": 0.5165629140728518, + "grad_norm": 0.02406439371407032, + "learning_rate": 1.1164572783218544e-05, + "loss": 0.1538, + "step": 41324 + }, + { + "epoch": 0.5165879146978675, + "grad_norm": 1.731882095336914, + "learning_rate": 1.1163706027952856e-05, + "loss": 0.3201, + "step": 41326 + }, + { + "epoch": 0.5166129153228831, + "grad_norm": 0.01830293983221054, + "learning_rate": 1.1162839263824552e-05, + "loss": 0.7549, + "step": 41328 + }, + { + "epoch": 0.5166379159478987, + "grad_norm": 4.6296820640563965, + "learning_rate": 1.1161972490840225e-05, + "loss": 0.8067, + "step": 41330 + }, + { + "epoch": 0.5166629165729143, + "grad_norm": 5.935329914093018, + "learning_rate": 1.1161105709006482e-05, + "loss": 1.1961, + "step": 41332 + }, + { + "epoch": 0.51668791719793, + "grad_norm": 3.761758804321289, + "learning_rate": 1.1160238918329917e-05, + "loss": 1.492, + "step": 41334 + }, + { + "epoch": 0.5167129178229456, + "grad_norm": 3.2885968685150146, + "learning_rate": 1.1159372118817142e-05, + "loss": 0.7588, + "step": 41336 + }, + { + "epoch": 0.5167379184479612, + "grad_norm": 2.3262710571289062, + "learning_rate": 1.115850531047475e-05, + "loss": 1.1911, + "step": 41338 + }, + { + "epoch": 0.5167629190729768, + "grad_norm": 2.7207508087158203, + "learning_rate": 1.1157638493309346e-05, + "loss": 1.0055, + "step": 41340 + }, + { + "epoch": 0.5167879196979924, + "grad_norm": 0.0009446691838093102, + "learning_rate": 1.1156771667327534e-05, + "loss": 0.1901, + "step": 41342 + }, + { + "epoch": 0.5168129203230081, + "grad_norm": 3.563222646713257, + "learning_rate": 1.1155904832535906e-05, + "loss": 0.5342, + "step": 41344 + }, + { + "epoch": 0.5168379209480237, + "grad_norm": 3.5202462673187256, + "learning_rate": 1.1155037988941075e-05, + "loss": 1.5573, + "step": 41346 + }, + { + "epoch": 0.5168629215730394, + "grad_norm": 7.152061462402344, + "learning_rate": 1.1154171136549636e-05, + "loss": 1.5813, + "step": 41348 + }, + { + "epoch": 0.5168879221980549, + "grad_norm": 0.09769617766141891, + "learning_rate": 1.1153304275368195e-05, + "loss": 0.3229, + "step": 41350 + }, + { + "epoch": 0.5169129228230706, + "grad_norm": 4.4804863929748535, + "learning_rate": 1.115243740540335e-05, + "loss": 1.3367, + "step": 41352 + }, + { + "epoch": 0.5169379234480862, + "grad_norm": 2.9282798767089844, + "learning_rate": 1.1151570526661706e-05, + "loss": 1.1574, + "step": 41354 + }, + { + "epoch": 0.5169629240731018, + "grad_norm": 4.115234851837158, + "learning_rate": 1.1150703639149862e-05, + "loss": 1.4461, + "step": 41356 + }, + { + "epoch": 0.5169879246981175, + "grad_norm": 2.169203996658325, + "learning_rate": 1.1149836742874422e-05, + "loss": 0.9334, + "step": 41358 + }, + { + "epoch": 0.517012925323133, + "grad_norm": 3.4415318965911865, + "learning_rate": 1.1148969837841989e-05, + "loss": 0.7646, + "step": 41360 + }, + { + "epoch": 0.5170379259481487, + "grad_norm": 5.920722961425781, + "learning_rate": 1.1148102924059165e-05, + "loss": 0.7821, + "step": 41362 + }, + { + "epoch": 0.5170629265731643, + "grad_norm": 4.522392272949219, + "learning_rate": 1.114723600153255e-05, + "loss": 1.1494, + "step": 41364 + }, + { + "epoch": 0.51708792719818, + "grad_norm": 2.2081713676452637, + "learning_rate": 1.1146369070268748e-05, + "loss": 0.8494, + "step": 41366 + }, + { + "epoch": 0.5171129278231956, + "grad_norm": 2.674900770187378, + "learning_rate": 1.1145502130274364e-05, + "loss": 0.6099, + "step": 41368 + }, + { + "epoch": 0.5171379284482112, + "grad_norm": 2.2590086460113525, + "learning_rate": 1.1144635181555994e-05, + "loss": 0.375, + "step": 41370 + }, + { + "epoch": 0.5171629290732268, + "grad_norm": 5.161069869995117, + "learning_rate": 1.1143768224120245e-05, + "loss": 0.6587, + "step": 41372 + }, + { + "epoch": 0.5171879296982425, + "grad_norm": 3.375042676925659, + "learning_rate": 1.1142901257973719e-05, + "loss": 1.0834, + "step": 41374 + }, + { + "epoch": 0.5172129303232581, + "grad_norm": 6.07909631729126, + "learning_rate": 1.114203428312302e-05, + "loss": 1.25, + "step": 41376 + }, + { + "epoch": 0.5172379309482737, + "grad_norm": 1.9259752035140991, + "learning_rate": 1.1141167299574746e-05, + "loss": 0.7123, + "step": 41378 + }, + { + "epoch": 0.5172629315732893, + "grad_norm": 5.021289825439453, + "learning_rate": 1.114030030733551e-05, + "loss": 1.6572, + "step": 41380 + }, + { + "epoch": 0.5172879321983049, + "grad_norm": 4.076225757598877, + "learning_rate": 1.1139433306411903e-05, + "loss": 1.4565, + "step": 41382 + }, + { + "epoch": 0.5173129328233206, + "grad_norm": 8.979449272155762, + "learning_rate": 1.1138566296810532e-05, + "loss": 0.9968, + "step": 41384 + }, + { + "epoch": 0.5173379334483362, + "grad_norm": 0.0007056462345644832, + "learning_rate": 1.1137699278538003e-05, + "loss": 1.2839, + "step": 41386 + }, + { + "epoch": 0.5173629340733519, + "grad_norm": 5.59699010848999, + "learning_rate": 1.1136832251600917e-05, + "loss": 1.6933, + "step": 41388 + }, + { + "epoch": 0.5173879346983674, + "grad_norm": 0.001166767324320972, + "learning_rate": 1.1135965216005877e-05, + "loss": 1.2581, + "step": 41390 + }, + { + "epoch": 0.5174129353233831, + "grad_norm": 3.1979150772094727, + "learning_rate": 1.1135098171759488e-05, + "loss": 0.7071, + "step": 41392 + }, + { + "epoch": 0.5174379359483987, + "grad_norm": 2.545653820037842, + "learning_rate": 1.1134231118868352e-05, + "loss": 0.3961, + "step": 41394 + }, + { + "epoch": 0.5174629365734144, + "grad_norm": 0.000565522990655154, + "learning_rate": 1.113336405733907e-05, + "loss": 0.0882, + "step": 41396 + }, + { + "epoch": 0.51748793719843, + "grad_norm": 3.5154898166656494, + "learning_rate": 1.1132496987178248e-05, + "loss": 1.832, + "step": 41398 + }, + { + "epoch": 0.5175129378234455, + "grad_norm": 3.9110183715820312, + "learning_rate": 1.1131629908392491e-05, + "loss": 1.5271, + "step": 41400 + }, + { + "epoch": 0.5175379384484612, + "grad_norm": 1.5574710369110107, + "learning_rate": 1.1130762820988398e-05, + "loss": 0.0962, + "step": 41402 + }, + { + "epoch": 0.5175629390734768, + "grad_norm": 1.2881929874420166, + "learning_rate": 1.1129895724972579e-05, + "loss": 1.0268, + "step": 41404 + }, + { + "epoch": 0.5175879396984925, + "grad_norm": 3.7476508617401123, + "learning_rate": 1.1129028620351631e-05, + "loss": 0.4406, + "step": 41406 + }, + { + "epoch": 0.5176129403235081, + "grad_norm": 1.8761879205703735, + "learning_rate": 1.1128161507132162e-05, + "loss": 0.8183, + "step": 41408 + }, + { + "epoch": 0.5176379409485237, + "grad_norm": 5.311307907104492, + "learning_rate": 1.1127294385320773e-05, + "loss": 0.7467, + "step": 41410 + }, + { + "epoch": 0.5176629415735393, + "grad_norm": 6.033237457275391, + "learning_rate": 1.112642725492407e-05, + "loss": 0.4177, + "step": 41412 + }, + { + "epoch": 0.517687942198555, + "grad_norm": 3.6333749294281006, + "learning_rate": 1.1125560115948658e-05, + "loss": 0.8951, + "step": 41414 + }, + { + "epoch": 0.5177129428235706, + "grad_norm": 0.08472706377506256, + "learning_rate": 1.1124692968401136e-05, + "loss": 0.8929, + "step": 41416 + }, + { + "epoch": 0.5177379434485863, + "grad_norm": 2.7115063667297363, + "learning_rate": 1.1123825812288113e-05, + "loss": 0.6921, + "step": 41418 + }, + { + "epoch": 0.5177629440736018, + "grad_norm": 5.9539408683776855, + "learning_rate": 1.1122958647616196e-05, + "loss": 1.846, + "step": 41420 + }, + { + "epoch": 0.5177879446986174, + "grad_norm": 2.1670079231262207, + "learning_rate": 1.1122091474391976e-05, + "loss": 0.1084, + "step": 41422 + }, + { + "epoch": 0.5178129453236331, + "grad_norm": 3.7267062664031982, + "learning_rate": 1.112122429262207e-05, + "loss": 1.4835, + "step": 41424 + }, + { + "epoch": 0.5178379459486487, + "grad_norm": 3.5689697265625, + "learning_rate": 1.1120357102313081e-05, + "loss": 0.6591, + "step": 41426 + }, + { + "epoch": 0.5178629465736644, + "grad_norm": 0.000610529154073447, + "learning_rate": 1.111948990347161e-05, + "loss": 0.0006, + "step": 41428 + }, + { + "epoch": 0.5178879471986799, + "grad_norm": 1.6825635433197021, + "learning_rate": 1.111862269610426e-05, + "loss": 0.1822, + "step": 41430 + }, + { + "epoch": 0.5179129478236956, + "grad_norm": 2.9617326259613037, + "learning_rate": 1.111775548021764e-05, + "loss": 0.5956, + "step": 41432 + }, + { + "epoch": 0.5179379484487112, + "grad_norm": 4.824795722961426, + "learning_rate": 1.1116888255818347e-05, + "loss": 1.6625, + "step": 41434 + }, + { + "epoch": 0.5179629490737269, + "grad_norm": 0.2703319489955902, + "learning_rate": 1.1116021022912998e-05, + "loss": 0.6679, + "step": 41436 + }, + { + "epoch": 0.5179879496987425, + "grad_norm": 3.3110313415527344, + "learning_rate": 1.1115153781508187e-05, + "loss": 0.6706, + "step": 41438 + }, + { + "epoch": 0.518012950323758, + "grad_norm": 6.021280765533447, + "learning_rate": 1.1114286531610524e-05, + "loss": 1.8077, + "step": 41440 + }, + { + "epoch": 0.5180379509487737, + "grad_norm": 5.769113540649414, + "learning_rate": 1.1113419273226611e-05, + "loss": 1.2388, + "step": 41442 + }, + { + "epoch": 0.5180629515737893, + "grad_norm": 3.679482936859131, + "learning_rate": 1.1112552006363053e-05, + "loss": 1.4081, + "step": 41444 + }, + { + "epoch": 0.518087952198805, + "grad_norm": 4.424749374389648, + "learning_rate": 1.111168473102646e-05, + "loss": 1.6939, + "step": 41446 + }, + { + "epoch": 0.5181129528238206, + "grad_norm": 3.330505132675171, + "learning_rate": 1.111081744722343e-05, + "loss": 1.0208, + "step": 41448 + }, + { + "epoch": 0.5181379534488362, + "grad_norm": 2.2133896350860596, + "learning_rate": 1.1109950154960572e-05, + "loss": 0.4955, + "step": 41450 + }, + { + "epoch": 0.5181629540738518, + "grad_norm": 6.572685718536377, + "learning_rate": 1.1109082854244493e-05, + "loss": 0.9886, + "step": 41452 + }, + { + "epoch": 0.5181879546988675, + "grad_norm": 2.6290805339813232, + "learning_rate": 1.1108215545081796e-05, + "loss": 1.1173, + "step": 41454 + }, + { + "epoch": 0.5182129553238831, + "grad_norm": 2.8557610511779785, + "learning_rate": 1.1107348227479083e-05, + "loss": 0.5488, + "step": 41456 + }, + { + "epoch": 0.5182379559488988, + "grad_norm": 0.0008215915295295417, + "learning_rate": 1.1106480901442962e-05, + "loss": 0.8922, + "step": 41458 + }, + { + "epoch": 0.5182629565739143, + "grad_norm": 3.878046989440918, + "learning_rate": 1.110561356698004e-05, + "loss": 1.2886, + "step": 41460 + }, + { + "epoch": 0.51828795719893, + "grad_norm": 2.7564034461975098, + "learning_rate": 1.1104746224096923e-05, + "loss": 0.793, + "step": 41462 + }, + { + "epoch": 0.5183129578239456, + "grad_norm": 8.64204216003418, + "learning_rate": 1.1103878872800215e-05, + "loss": 1.8804, + "step": 41464 + }, + { + "epoch": 0.5183379584489612, + "grad_norm": 5.828169345855713, + "learning_rate": 1.110301151309652e-05, + "loss": 1.3008, + "step": 41466 + }, + { + "epoch": 0.5183629590739769, + "grad_norm": 2.143009901046753, + "learning_rate": 1.1102144144992446e-05, + "loss": 0.708, + "step": 41468 + }, + { + "epoch": 0.5183879596989924, + "grad_norm": 4.01800012588501, + "learning_rate": 1.1101276768494598e-05, + "loss": 1.8205, + "step": 41470 + }, + { + "epoch": 0.5184129603240081, + "grad_norm": 3.445519208908081, + "learning_rate": 1.1100409383609583e-05, + "loss": 1.3076, + "step": 41472 + }, + { + "epoch": 0.5184379609490237, + "grad_norm": 2.815253496170044, + "learning_rate": 1.1099541990344003e-05, + "loss": 0.5032, + "step": 41474 + }, + { + "epoch": 0.5184629615740394, + "grad_norm": 3.7158825397491455, + "learning_rate": 1.1098674588704468e-05, + "loss": 1.1605, + "step": 41476 + }, + { + "epoch": 0.518487962199055, + "grad_norm": 4.49934196472168, + "learning_rate": 1.1097807178697585e-05, + "loss": 1.8439, + "step": 41478 + }, + { + "epoch": 0.5185129628240706, + "grad_norm": 2.836048126220703, + "learning_rate": 1.1096939760329954e-05, + "loss": 0.7824, + "step": 41480 + }, + { + "epoch": 0.5185379634490862, + "grad_norm": 1.1391528844833374, + "learning_rate": 1.1096072333608187e-05, + "loss": 1.2828, + "step": 41482 + }, + { + "epoch": 0.5185629640741019, + "grad_norm": 3.8704023361206055, + "learning_rate": 1.1095204898538889e-05, + "loss": 1.1758, + "step": 41484 + }, + { + "epoch": 0.5185879646991175, + "grad_norm": 1.3083351850509644, + "learning_rate": 1.1094337455128662e-05, + "loss": 0.2568, + "step": 41486 + }, + { + "epoch": 0.5186129653241331, + "grad_norm": 3.9700241088867188, + "learning_rate": 1.109347000338412e-05, + "loss": 0.6914, + "step": 41488 + }, + { + "epoch": 0.5186379659491487, + "grad_norm": 6.472109794616699, + "learning_rate": 1.109260254331186e-05, + "loss": 0.8226, + "step": 41490 + }, + { + "epoch": 0.5186629665741643, + "grad_norm": 4.7518134117126465, + "learning_rate": 1.1091735074918499e-05, + "loss": 0.2158, + "step": 41492 + }, + { + "epoch": 0.51868796719918, + "grad_norm": 3.9693524837493896, + "learning_rate": 1.1090867598210635e-05, + "loss": 1.4151, + "step": 41494 + }, + { + "epoch": 0.5187129678241956, + "grad_norm": 2.618075132369995, + "learning_rate": 1.1090000113194878e-05, + "loss": 1.2305, + "step": 41496 + }, + { + "epoch": 0.5187379684492113, + "grad_norm": 3.7270421981811523, + "learning_rate": 1.1089132619877835e-05, + "loss": 0.9887, + "step": 41498 + }, + { + "epoch": 0.5187629690742268, + "grad_norm": 3.795738458633423, + "learning_rate": 1.108826511826611e-05, + "loss": 0.4738, + "step": 41500 + }, + { + "epoch": 0.5187879696992425, + "grad_norm": 8.967445373535156, + "learning_rate": 1.1087397608366315e-05, + "loss": 0.9188, + "step": 41502 + }, + { + "epoch": 0.5188129703242581, + "grad_norm": 3.9861037731170654, + "learning_rate": 1.1086530090185053e-05, + "loss": 1.6754, + "step": 41504 + }, + { + "epoch": 0.5188379709492738, + "grad_norm": 3.43753981590271, + "learning_rate": 1.108566256372893e-05, + "loss": 1.5275, + "step": 41506 + }, + { + "epoch": 0.5188629715742894, + "grad_norm": 0.001026161015033722, + "learning_rate": 1.1084795029004553e-05, + "loss": 0.0, + "step": 41508 + }, + { + "epoch": 0.5188879721993049, + "grad_norm": 2.3036513328552246, + "learning_rate": 1.1083927486018533e-05, + "loss": 1.4166, + "step": 41510 + }, + { + "epoch": 0.5189129728243206, + "grad_norm": 0.26699575781822205, + "learning_rate": 1.1083059934777473e-05, + "loss": 0.4186, + "step": 41512 + }, + { + "epoch": 0.5189379734493362, + "grad_norm": 0.023443294689059258, + "learning_rate": 1.1082192375287981e-05, + "loss": 0.1165, + "step": 41514 + }, + { + "epoch": 0.5189629740743519, + "grad_norm": 6.01442289352417, + "learning_rate": 1.1081324807556666e-05, + "loss": 2.0984, + "step": 41516 + }, + { + "epoch": 0.5189879746993675, + "grad_norm": 3.8196969032287598, + "learning_rate": 1.1080457231590138e-05, + "loss": 1.6675, + "step": 41518 + }, + { + "epoch": 0.5190129753243831, + "grad_norm": 0.0009856866672635078, + "learning_rate": 1.1079589647394994e-05, + "loss": 0.0, + "step": 41520 + }, + { + "epoch": 0.5190379759493987, + "grad_norm": 2.900263547897339, + "learning_rate": 1.1078722054977852e-05, + "loss": 0.7769, + "step": 41522 + }, + { + "epoch": 0.5190629765744144, + "grad_norm": 9.985272407531738, + "learning_rate": 1.1077854454345314e-05, + "loss": 0.308, + "step": 41524 + }, + { + "epoch": 0.51908797719943, + "grad_norm": 5.002849102020264, + "learning_rate": 1.107698684550399e-05, + "loss": 1.6945, + "step": 41526 + }, + { + "epoch": 0.5191129778244457, + "grad_norm": 4.257190227508545, + "learning_rate": 1.1076119228460484e-05, + "loss": 1.8206, + "step": 41528 + }, + { + "epoch": 0.5191379784494612, + "grad_norm": 5.588281154632568, + "learning_rate": 1.1075251603221411e-05, + "loss": 0.5479, + "step": 41530 + }, + { + "epoch": 0.5191629790744768, + "grad_norm": 2.5305824279785156, + "learning_rate": 1.107438396979337e-05, + "loss": 0.6892, + "step": 41532 + }, + { + "epoch": 0.5191879796994925, + "grad_norm": 9.561551094055176, + "learning_rate": 1.1073516328182973e-05, + "loss": 1.835, + "step": 41534 + }, + { + "epoch": 0.5192129803245081, + "grad_norm": 3.226969003677368, + "learning_rate": 1.1072648678396827e-05, + "loss": 0.9139, + "step": 41536 + }, + { + "epoch": 0.5192379809495238, + "grad_norm": 4.673713684082031, + "learning_rate": 1.107178102044154e-05, + "loss": 1.2495, + "step": 41538 + }, + { + "epoch": 0.5192629815745393, + "grad_norm": 5.22969913482666, + "learning_rate": 1.1070913354323723e-05, + "loss": 0.9625, + "step": 41540 + }, + { + "epoch": 0.519287982199555, + "grad_norm": 6.4564056396484375, + "learning_rate": 1.1070045680049979e-05, + "loss": 1.1817, + "step": 41542 + }, + { + "epoch": 0.5193129828245706, + "grad_norm": 0.1020413413643837, + "learning_rate": 1.106917799762692e-05, + "loss": 0.0151, + "step": 41544 + }, + { + "epoch": 0.5193379834495863, + "grad_norm": 3.4805712699890137, + "learning_rate": 1.1068310307061152e-05, + "loss": 0.6268, + "step": 41546 + }, + { + "epoch": 0.5193629840746019, + "grad_norm": 3.3350753784179688, + "learning_rate": 1.1067442608359283e-05, + "loss": 1.129, + "step": 41548 + }, + { + "epoch": 0.5193879846996174, + "grad_norm": 4.714946269989014, + "learning_rate": 1.1066574901527922e-05, + "loss": 1.0537, + "step": 41550 + }, + { + "epoch": 0.5194129853246331, + "grad_norm": 1.9354841709136963, + "learning_rate": 1.1065707186573678e-05, + "loss": 0.964, + "step": 41552 + }, + { + "epoch": 0.5194379859496487, + "grad_norm": 3.0872139930725098, + "learning_rate": 1.1064839463503159e-05, + "loss": 1.2357, + "step": 41554 + }, + { + "epoch": 0.5194629865746644, + "grad_norm": 2.24739933013916, + "learning_rate": 1.1063971732322973e-05, + "loss": 0.1442, + "step": 41556 + }, + { + "epoch": 0.51948798719968, + "grad_norm": 4.126218795776367, + "learning_rate": 1.106310399303973e-05, + "loss": 0.6901, + "step": 41558 + }, + { + "epoch": 0.5195129878246956, + "grad_norm": 2.3546557426452637, + "learning_rate": 1.1062236245660033e-05, + "loss": 1.2176, + "step": 41560 + }, + { + "epoch": 0.5195379884497112, + "grad_norm": 10.480853080749512, + "learning_rate": 1.1061368490190499e-05, + "loss": 1.5981, + "step": 41562 + }, + { + "epoch": 0.5195629890747269, + "grad_norm": 2.047851085662842, + "learning_rate": 1.106050072663773e-05, + "loss": 0.7062, + "step": 41564 + }, + { + "epoch": 0.5195879896997425, + "grad_norm": 3.2551991939544678, + "learning_rate": 1.1059632955008341e-05, + "loss": 0.9555, + "step": 41566 + }, + { + "epoch": 0.5196129903247582, + "grad_norm": 3.0699617862701416, + "learning_rate": 1.1058765175308934e-05, + "loss": 0.6606, + "step": 41568 + }, + { + "epoch": 0.5196379909497737, + "grad_norm": 0.23852407932281494, + "learning_rate": 1.1057897387546123e-05, + "loss": 0.0081, + "step": 41570 + }, + { + "epoch": 0.5196629915747893, + "grad_norm": 2.957916498184204, + "learning_rate": 1.1057029591726516e-05, + "loss": 1.9388, + "step": 41572 + }, + { + "epoch": 0.519687992199805, + "grad_norm": 1.5223169326782227, + "learning_rate": 1.1056161787856716e-05, + "loss": 0.4316, + "step": 41574 + }, + { + "epoch": 0.5197129928248206, + "grad_norm": 2.688776969909668, + "learning_rate": 1.1055293975943344e-05, + "loss": 0.9058, + "step": 41576 + }, + { + "epoch": 0.5197379934498363, + "grad_norm": 2.590174913406372, + "learning_rate": 1.1054426155993e-05, + "loss": 0.6005, + "step": 41578 + }, + { + "epoch": 0.5197629940748518, + "grad_norm": 3.1043341159820557, + "learning_rate": 1.1053558328012293e-05, + "loss": 0.182, + "step": 41580 + }, + { + "epoch": 0.5197879946998675, + "grad_norm": 4.1617112159729, + "learning_rate": 1.1052690492007838e-05, + "loss": 0.4163, + "step": 41582 + }, + { + "epoch": 0.5198129953248831, + "grad_norm": 3.7828333377838135, + "learning_rate": 1.1051822647986239e-05, + "loss": 1.5027, + "step": 41584 + }, + { + "epoch": 0.5198379959498988, + "grad_norm": 2.3076775074005127, + "learning_rate": 1.1050954795954105e-05, + "loss": 0.828, + "step": 41586 + }, + { + "epoch": 0.5198629965749144, + "grad_norm": 4.920904159545898, + "learning_rate": 1.1050086935918053e-05, + "loss": 1.5988, + "step": 41588 + }, + { + "epoch": 0.51988799719993, + "grad_norm": 4.260356426239014, + "learning_rate": 1.1049219067884684e-05, + "loss": 1.0746, + "step": 41590 + }, + { + "epoch": 0.5199129978249456, + "grad_norm": 5.152005195617676, + "learning_rate": 1.1048351191860613e-05, + "loss": 0.0465, + "step": 41592 + }, + { + "epoch": 0.5199379984499612, + "grad_norm": 0.002774524036794901, + "learning_rate": 1.1047483307852447e-05, + "loss": 0.0001, + "step": 41594 + }, + { + "epoch": 0.5199629990749769, + "grad_norm": 5.182002067565918, + "learning_rate": 1.1046615415866797e-05, + "loss": 1.0402, + "step": 41596 + }, + { + "epoch": 0.5199879996999925, + "grad_norm": 0.05066744238138199, + "learning_rate": 1.1045747515910269e-05, + "loss": 0.0236, + "step": 41598 + }, + { + "epoch": 0.5200130003250081, + "grad_norm": 3.3811562061309814, + "learning_rate": 1.1044879607989478e-05, + "loss": 1.3651, + "step": 41600 + }, + { + "epoch": 0.5200380009500237, + "grad_norm": 3.681610584259033, + "learning_rate": 1.1044011692111033e-05, + "loss": 0.6342, + "step": 41602 + }, + { + "epoch": 0.5200630015750394, + "grad_norm": 7.351370334625244, + "learning_rate": 1.1043143768281542e-05, + "loss": 2.0241, + "step": 41604 + }, + { + "epoch": 0.520088002200055, + "grad_norm": 0.010343382135033607, + "learning_rate": 1.1042275836507612e-05, + "loss": 0.0163, + "step": 41606 + }, + { + "epoch": 0.5201130028250707, + "grad_norm": 0.09353338181972504, + "learning_rate": 1.1041407896795862e-05, + "loss": 0.6108, + "step": 41608 + }, + { + "epoch": 0.5201380034500862, + "grad_norm": 1.9900891780853271, + "learning_rate": 1.1040539949152893e-05, + "loss": 0.6695, + "step": 41610 + }, + { + "epoch": 0.5201630040751019, + "grad_norm": 7.807685375213623, + "learning_rate": 1.1039671993585317e-05, + "loss": 0.5323, + "step": 41612 + }, + { + "epoch": 0.5201880047001175, + "grad_norm": 0.0014732948038727045, + "learning_rate": 1.103880403009975e-05, + "loss": 0.1924, + "step": 41614 + }, + { + "epoch": 0.5202130053251331, + "grad_norm": 4.028494834899902, + "learning_rate": 1.1037936058702798e-05, + "loss": 1.2787, + "step": 41616 + }, + { + "epoch": 0.5202380059501488, + "grad_norm": 4.278817653656006, + "learning_rate": 1.1037068079401072e-05, + "loss": 0.3765, + "step": 41618 + }, + { + "epoch": 0.5202630065751643, + "grad_norm": 0.024569952860474586, + "learning_rate": 1.103620009220118e-05, + "loss": 0.9554, + "step": 41620 + }, + { + "epoch": 0.52028800720018, + "grad_norm": 2.4139974117279053, + "learning_rate": 1.1035332097109739e-05, + "loss": 0.0569, + "step": 41622 + }, + { + "epoch": 0.5203130078251956, + "grad_norm": 0.001326894387602806, + "learning_rate": 1.1034464094133349e-05, + "loss": 1.009, + "step": 41624 + }, + { + "epoch": 0.5203380084502113, + "grad_norm": 0.0276937335729599, + "learning_rate": 1.103359608327863e-05, + "loss": 0.6598, + "step": 41626 + }, + { + "epoch": 0.5203630090752269, + "grad_norm": 9.198343276977539, + "learning_rate": 1.1032728064552191e-05, + "loss": 2.2509, + "step": 41628 + }, + { + "epoch": 0.5203880097002425, + "grad_norm": 5.080960750579834, + "learning_rate": 1.1031860037960638e-05, + "loss": 1.3438, + "step": 41630 + }, + { + "epoch": 0.5204130103252581, + "grad_norm": 2.826197624206543, + "learning_rate": 1.1030992003510585e-05, + "loss": 1.2723, + "step": 41632 + }, + { + "epoch": 0.5204380109502738, + "grad_norm": 2.760709524154663, + "learning_rate": 1.1030123961208648e-05, + "loss": 1.9201, + "step": 41634 + }, + { + "epoch": 0.5204630115752894, + "grad_norm": 1.9353554248809814, + "learning_rate": 1.1029255911061425e-05, + "loss": 0.8429, + "step": 41636 + }, + { + "epoch": 0.520488012200305, + "grad_norm": 2.9061830043792725, + "learning_rate": 1.1028387853075537e-05, + "loss": 0.6794, + "step": 41638 + }, + { + "epoch": 0.5205130128253206, + "grad_norm": 1.2241979837417603, + "learning_rate": 1.1027519787257594e-05, + "loss": 0.302, + "step": 41640 + }, + { + "epoch": 0.5205380134503362, + "grad_norm": 3.8147881031036377, + "learning_rate": 1.1026651713614208e-05, + "loss": 1.7663, + "step": 41642 + }, + { + "epoch": 0.5205630140753519, + "grad_norm": 3.5703253746032715, + "learning_rate": 1.1025783632151983e-05, + "loss": 0.3939, + "step": 41644 + }, + { + "epoch": 0.5205880147003675, + "grad_norm": 5.407223224639893, + "learning_rate": 1.1024915542877537e-05, + "loss": 1.9561, + "step": 41646 + }, + { + "epoch": 0.5206130153253832, + "grad_norm": 2.0000572204589844, + "learning_rate": 1.1024047445797479e-05, + "loss": 0.9997, + "step": 41648 + }, + { + "epoch": 0.5206380159503987, + "grad_norm": 4.598398208618164, + "learning_rate": 1.1023179340918419e-05, + "loss": 1.4974, + "step": 41650 + }, + { + "epoch": 0.5206630165754144, + "grad_norm": 5.055205821990967, + "learning_rate": 1.1022311228246971e-05, + "loss": 1.1653, + "step": 41652 + }, + { + "epoch": 0.52068801720043, + "grad_norm": 3.895606517791748, + "learning_rate": 1.1021443107789749e-05, + "loss": 1.832, + "step": 41654 + }, + { + "epoch": 0.5207130178254457, + "grad_norm": 4.820596218109131, + "learning_rate": 1.1020574979553356e-05, + "loss": 1.0849, + "step": 41656 + }, + { + "epoch": 0.5207380184504613, + "grad_norm": 3.3460569381713867, + "learning_rate": 1.1019706843544408e-05, + "loss": 0.6865, + "step": 41658 + }, + { + "epoch": 0.5207630190754768, + "grad_norm": 5.696576118469238, + "learning_rate": 1.1018838699769518e-05, + "loss": 1.4419, + "step": 41660 + }, + { + "epoch": 0.5207880197004925, + "grad_norm": 3.6522529125213623, + "learning_rate": 1.1017970548235297e-05, + "loss": 1.1562, + "step": 41662 + }, + { + "epoch": 0.5208130203255081, + "grad_norm": 4.839389324188232, + "learning_rate": 1.1017102388948358e-05, + "loss": 0.4915, + "step": 41664 + }, + { + "epoch": 0.5208380209505238, + "grad_norm": 15.60747241973877, + "learning_rate": 1.1016234221915308e-05, + "loss": 1.8107, + "step": 41666 + }, + { + "epoch": 0.5208630215755394, + "grad_norm": 5.011492729187012, + "learning_rate": 1.1015366047142765e-05, + "loss": 0.5425, + "step": 41668 + }, + { + "epoch": 0.520888022200555, + "grad_norm": 3.0554356575012207, + "learning_rate": 1.1014497864637334e-05, + "loss": 0.8616, + "step": 41670 + }, + { + "epoch": 0.5209130228255706, + "grad_norm": 4.396804332733154, + "learning_rate": 1.1013629674405635e-05, + "loss": 1.1447, + "step": 41672 + }, + { + "epoch": 0.5209380234505863, + "grad_norm": 0.001384331495501101, + "learning_rate": 1.1012761476454272e-05, + "loss": 0.3912, + "step": 41674 + }, + { + "epoch": 0.5209630240756019, + "grad_norm": 2.3379924297332764, + "learning_rate": 1.1011893270789862e-05, + "loss": 1.2199, + "step": 41676 + }, + { + "epoch": 0.5209880247006176, + "grad_norm": 2.6794610023498535, + "learning_rate": 1.1011025057419015e-05, + "loss": 1.1392, + "step": 41678 + }, + { + "epoch": 0.5210130253256331, + "grad_norm": 0.001502771396189928, + "learning_rate": 1.1010156836348346e-05, + "loss": 0.7505, + "step": 41680 + }, + { + "epoch": 0.5210380259506487, + "grad_norm": 1.4869060516357422, + "learning_rate": 1.1009288607584465e-05, + "loss": 0.5155, + "step": 41682 + }, + { + "epoch": 0.5210630265756644, + "grad_norm": 0.0022135439794510603, + "learning_rate": 1.1008420371133985e-05, + "loss": 0.6853, + "step": 41684 + }, + { + "epoch": 0.52108802720068, + "grad_norm": 1.1591531038284302, + "learning_rate": 1.1007552127003519e-05, + "loss": 1.1854, + "step": 41686 + }, + { + "epoch": 0.5211130278256957, + "grad_norm": 6.423802375793457, + "learning_rate": 1.1006683875199675e-05, + "loss": 1.7218, + "step": 41688 + }, + { + "epoch": 0.5211380284507112, + "grad_norm": 5.894760608673096, + "learning_rate": 1.100581561572907e-05, + "loss": 1.0201, + "step": 41690 + }, + { + "epoch": 0.5211630290757269, + "grad_norm": 5.021176338195801, + "learning_rate": 1.1004947348598316e-05, + "loss": 1.0249, + "step": 41692 + }, + { + "epoch": 0.5211880297007425, + "grad_norm": 4.6062445640563965, + "learning_rate": 1.1004079073814027e-05, + "loss": 0.9785, + "step": 41694 + }, + { + "epoch": 0.5212130303257582, + "grad_norm": 3.208562135696411, + "learning_rate": 1.1003210791382812e-05, + "loss": 1.6069, + "step": 41696 + }, + { + "epoch": 0.5212380309507738, + "grad_norm": 2.000998020172119, + "learning_rate": 1.1002342501311284e-05, + "loss": 0.1908, + "step": 41698 + }, + { + "epoch": 0.5212630315757893, + "grad_norm": 1.5872143507003784, + "learning_rate": 1.100147420360606e-05, + "loss": 0.1228, + "step": 41700 + }, + { + "epoch": 0.521288032200805, + "grad_norm": 6.415144920349121, + "learning_rate": 1.1000605898273748e-05, + "loss": 1.2413, + "step": 41702 + }, + { + "epoch": 0.5213130328258206, + "grad_norm": 3.465776205062866, + "learning_rate": 1.0999737585320965e-05, + "loss": 0.6929, + "step": 41704 + }, + { + "epoch": 0.5213380334508363, + "grad_norm": 4.074665069580078, + "learning_rate": 1.0998869264754321e-05, + "loss": 0.7731, + "step": 41706 + }, + { + "epoch": 0.5213630340758519, + "grad_norm": 3.841113567352295, + "learning_rate": 1.099800093658043e-05, + "loss": 1.3743, + "step": 41708 + }, + { + "epoch": 0.5213880347008675, + "grad_norm": 3.8150954246520996, + "learning_rate": 1.0997132600805907e-05, + "loss": 1.6887, + "step": 41710 + }, + { + "epoch": 0.5214130353258831, + "grad_norm": 4.938525199890137, + "learning_rate": 1.099626425743736e-05, + "loss": 1.7578, + "step": 41712 + }, + { + "epoch": 0.5214380359508988, + "grad_norm": 4.299689292907715, + "learning_rate": 1.0995395906481405e-05, + "loss": 0.6181, + "step": 41714 + }, + { + "epoch": 0.5214630365759144, + "grad_norm": 0.0015136667061597109, + "learning_rate": 1.0994527547944655e-05, + "loss": 0.1197, + "step": 41716 + }, + { + "epoch": 0.5214880372009301, + "grad_norm": 3.1739721298217773, + "learning_rate": 1.0993659181833727e-05, + "loss": 0.6245, + "step": 41718 + }, + { + "epoch": 0.5215130378259456, + "grad_norm": 3.8068628311157227, + "learning_rate": 1.0992790808155231e-05, + "loss": 0.5631, + "step": 41720 + }, + { + "epoch": 0.5215380384509612, + "grad_norm": 2.3081088066101074, + "learning_rate": 1.099192242691578e-05, + "loss": 0.8118, + "step": 41722 + }, + { + "epoch": 0.5215630390759769, + "grad_norm": 2.437318801879883, + "learning_rate": 1.0991054038121986e-05, + "loss": 0.4096, + "step": 41724 + }, + { + "epoch": 0.5215880397009925, + "grad_norm": 1.6641714572906494, + "learning_rate": 1.0990185641780466e-05, + "loss": 0.1323, + "step": 41726 + }, + { + "epoch": 0.5216130403260082, + "grad_norm": 0.0031597092747688293, + "learning_rate": 1.0989317237897833e-05, + "loss": 0.326, + "step": 41728 + }, + { + "epoch": 0.5216380409510237, + "grad_norm": 4.447670936584473, + "learning_rate": 1.0988448826480698e-05, + "loss": 1.4716, + "step": 41730 + }, + { + "epoch": 0.5216630415760394, + "grad_norm": 4.665535926818848, + "learning_rate": 1.098758040753568e-05, + "loss": 1.1505, + "step": 41732 + }, + { + "epoch": 0.521688042201055, + "grad_norm": 4.566941738128662, + "learning_rate": 1.0986711981069387e-05, + "loss": 1.5077, + "step": 41734 + }, + { + "epoch": 0.5217130428260707, + "grad_norm": 4.452383995056152, + "learning_rate": 1.0985843547088435e-05, + "loss": 1.5581, + "step": 41736 + }, + { + "epoch": 0.5217380434510863, + "grad_norm": 3.419182300567627, + "learning_rate": 1.0984975105599436e-05, + "loss": 0.7962, + "step": 41738 + }, + { + "epoch": 0.5217630440761019, + "grad_norm": 1.8319799900054932, + "learning_rate": 1.0984106656609007e-05, + "loss": 0.3763, + "step": 41740 + }, + { + "epoch": 0.5217880447011175, + "grad_norm": 3.154487133026123, + "learning_rate": 1.0983238200123762e-05, + "loss": 0.3593, + "step": 41742 + }, + { + "epoch": 0.5218130453261332, + "grad_norm": 5.824884414672852, + "learning_rate": 1.0982369736150314e-05, + "loss": 0.4672, + "step": 41744 + }, + { + "epoch": 0.5218380459511488, + "grad_norm": 2.786522150039673, + "learning_rate": 1.0981501264695278e-05, + "loss": 0.3927, + "step": 41746 + }, + { + "epoch": 0.5218630465761644, + "grad_norm": 3.030055284500122, + "learning_rate": 1.0980632785765266e-05, + "loss": 0.5972, + "step": 41748 + }, + { + "epoch": 0.52188804720118, + "grad_norm": 0.19357101619243622, + "learning_rate": 1.0979764299366893e-05, + "loss": 0.5233, + "step": 41750 + }, + { + "epoch": 0.5219130478261956, + "grad_norm": 2.7428243160247803, + "learning_rate": 1.0978895805506773e-05, + "loss": 0.7483, + "step": 41752 + }, + { + "epoch": 0.5219380484512113, + "grad_norm": 4.72699499130249, + "learning_rate": 1.0978027304191521e-05, + "loss": 0.6679, + "step": 41754 + }, + { + "epoch": 0.5219630490762269, + "grad_norm": 1.9369003772735596, + "learning_rate": 1.0977158795427752e-05, + "loss": 0.4144, + "step": 41756 + }, + { + "epoch": 0.5219880497012426, + "grad_norm": 3.4763426780700684, + "learning_rate": 1.0976290279222081e-05, + "loss": 0.6922, + "step": 41758 + }, + { + "epoch": 0.5220130503262581, + "grad_norm": 2.949866771697998, + "learning_rate": 1.0975421755581122e-05, + "loss": 0.6576, + "step": 41760 + }, + { + "epoch": 0.5220380509512738, + "grad_norm": 7.623414993286133, + "learning_rate": 1.0974553224511486e-05, + "loss": 1.2014, + "step": 41762 + }, + { + "epoch": 0.5220630515762894, + "grad_norm": 4.812924385070801, + "learning_rate": 1.0973684686019788e-05, + "loss": 0.8434, + "step": 41764 + }, + { + "epoch": 0.522088052201305, + "grad_norm": 4.222620010375977, + "learning_rate": 1.0972816140112653e-05, + "loss": 1.2466, + "step": 41766 + }, + { + "epoch": 0.5221130528263207, + "grad_norm": 3.633105754852295, + "learning_rate": 1.0971947586796681e-05, + "loss": 0.6805, + "step": 41768 + }, + { + "epoch": 0.5221380534513362, + "grad_norm": 5.389729022979736, + "learning_rate": 1.0971079026078498e-05, + "loss": 1.2117, + "step": 41770 + }, + { + "epoch": 0.5221630540763519, + "grad_norm": 2.6830193996429443, + "learning_rate": 1.0970210457964712e-05, + "loss": 0.8287, + "step": 41772 + }, + { + "epoch": 0.5221880547013675, + "grad_norm": 2.761617422103882, + "learning_rate": 1.096934188246194e-05, + "loss": 2.2434, + "step": 41774 + }, + { + "epoch": 0.5222130553263832, + "grad_norm": 0.10888978838920593, + "learning_rate": 1.0968473299576796e-05, + "loss": 0.6689, + "step": 41776 + }, + { + "epoch": 0.5222380559513988, + "grad_norm": 1.5432281494140625, + "learning_rate": 1.0967604709315902e-05, + "loss": 0.1102, + "step": 41778 + }, + { + "epoch": 0.5222630565764144, + "grad_norm": 2.615574836730957, + "learning_rate": 1.0966736111685863e-05, + "loss": 1.208, + "step": 41780 + }, + { + "epoch": 0.52228805720143, + "grad_norm": 2.3423912525177, + "learning_rate": 1.0965867506693299e-05, + "loss": 0.2022, + "step": 41782 + }, + { + "epoch": 0.5223130578264457, + "grad_norm": 4.303787708282471, + "learning_rate": 1.0964998894344827e-05, + "loss": 1.4785, + "step": 41784 + }, + { + "epoch": 0.5223380584514613, + "grad_norm": 0.9082997441291809, + "learning_rate": 1.0964130274647058e-05, + "loss": 0.7226, + "step": 41786 + }, + { + "epoch": 0.522363059076477, + "grad_norm": 3.8381996154785156, + "learning_rate": 1.0963261647606606e-05, + "loss": 1.1889, + "step": 41788 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 3.137320041656494, + "learning_rate": 1.0962393013230093e-05, + "loss": 0.992, + "step": 41790 + }, + { + "epoch": 0.5224130603265081, + "grad_norm": 3.6918039321899414, + "learning_rate": 1.096152437152413e-05, + "loss": 0.7147, + "step": 41792 + }, + { + "epoch": 0.5224380609515238, + "grad_norm": 5.222500324249268, + "learning_rate": 1.0960655722495335e-05, + "loss": 1.5886, + "step": 41794 + }, + { + "epoch": 0.5224630615765394, + "grad_norm": 0.7354726791381836, + "learning_rate": 1.095978706615032e-05, + "loss": 0.7609, + "step": 41796 + }, + { + "epoch": 0.5224880622015551, + "grad_norm": 5.045327186584473, + "learning_rate": 1.0958918402495706e-05, + "loss": 1.148, + "step": 41798 + }, + { + "epoch": 0.5225130628265706, + "grad_norm": 1.8355305194854736, + "learning_rate": 1.0958049731538099e-05, + "loss": 0.8525, + "step": 41800 + }, + { + "epoch": 0.5225380634515863, + "grad_norm": 5.241974353790283, + "learning_rate": 1.0957181053284122e-05, + "loss": 2.0036, + "step": 41802 + }, + { + "epoch": 0.5225630640766019, + "grad_norm": 5.620728969573975, + "learning_rate": 1.0956312367740396e-05, + "loss": 1.7604, + "step": 41804 + }, + { + "epoch": 0.5225880647016176, + "grad_norm": 4.941931247711182, + "learning_rate": 1.0955443674913524e-05, + "loss": 1.9241, + "step": 41806 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 0.00323321670293808, + "learning_rate": 1.095457497481013e-05, + "loss": 1.0004, + "step": 41808 + }, + { + "epoch": 0.5226380659516487, + "grad_norm": 3.870039224624634, + "learning_rate": 1.095370626743683e-05, + "loss": 0.7984, + "step": 41810 + }, + { + "epoch": 0.5226630665766644, + "grad_norm": 3.179499626159668, + "learning_rate": 1.0952837552800235e-05, + "loss": 0.9441, + "step": 41812 + }, + { + "epoch": 0.52268806720168, + "grad_norm": 5.306732654571533, + "learning_rate": 1.095196883090696e-05, + "loss": 2.0897, + "step": 41814 + }, + { + "epoch": 0.5227130678266957, + "grad_norm": 2.8374617099761963, + "learning_rate": 1.095110010176363e-05, + "loss": 1.2109, + "step": 41816 + }, + { + "epoch": 0.5227380684517113, + "grad_norm": 6.4320292472839355, + "learning_rate": 1.0950231365376858e-05, + "loss": 2.1707, + "step": 41818 + }, + { + "epoch": 0.5227630690767269, + "grad_norm": 2.650899887084961, + "learning_rate": 1.0949362621753257e-05, + "loss": 0.7611, + "step": 41820 + }, + { + "epoch": 0.5227880697017425, + "grad_norm": 3.24267840385437, + "learning_rate": 1.0948493870899442e-05, + "loss": 1.128, + "step": 41822 + }, + { + "epoch": 0.5228130703267582, + "grad_norm": 5.0363240242004395, + "learning_rate": 1.0947625112822036e-05, + "loss": 0.6755, + "step": 41824 + }, + { + "epoch": 0.5228380709517738, + "grad_norm": 6.074904918670654, + "learning_rate": 1.0946756347527645e-05, + "loss": 1.2839, + "step": 41826 + }, + { + "epoch": 0.5228630715767895, + "grad_norm": 1.5810354948043823, + "learning_rate": 1.0945887575022896e-05, + "loss": 0.2828, + "step": 41828 + }, + { + "epoch": 0.522888072201805, + "grad_norm": 4.222354412078857, + "learning_rate": 1.0945018795314402e-05, + "loss": 0.8337, + "step": 41830 + }, + { + "epoch": 0.5229130728268206, + "grad_norm": 2.1774449348449707, + "learning_rate": 1.0944150008408777e-05, + "loss": 0.6957, + "step": 41832 + }, + { + "epoch": 0.5229380734518363, + "grad_norm": 9.418947219848633, + "learning_rate": 1.0943281214312636e-05, + "loss": 0.6943, + "step": 41834 + }, + { + "epoch": 0.5229630740768519, + "grad_norm": 0.004948167596012354, + "learning_rate": 1.0942412413032604e-05, + "loss": 0.0937, + "step": 41836 + }, + { + "epoch": 0.5229880747018676, + "grad_norm": 0.002772354753687978, + "learning_rate": 1.0941543604575287e-05, + "loss": 0.9476, + "step": 41838 + }, + { + "epoch": 0.5230130753268831, + "grad_norm": 4.8603668212890625, + "learning_rate": 1.0940674788947308e-05, + "loss": 1.2558, + "step": 41840 + }, + { + "epoch": 0.5230380759518988, + "grad_norm": 3.2356338500976562, + "learning_rate": 1.0939805966155284e-05, + "loss": 0.7631, + "step": 41842 + }, + { + "epoch": 0.5230630765769144, + "grad_norm": 2.795487403869629, + "learning_rate": 1.0938937136205835e-05, + "loss": 0.6754, + "step": 41844 + }, + { + "epoch": 0.5230880772019301, + "grad_norm": 2.60719895362854, + "learning_rate": 1.0938068299105569e-05, + "loss": 0.7494, + "step": 41846 + }, + { + "epoch": 0.5231130778269457, + "grad_norm": 4.18751859664917, + "learning_rate": 1.0937199454861107e-05, + "loss": 1.9004, + "step": 41848 + }, + { + "epoch": 0.5231380784519613, + "grad_norm": 0.0059557161293923855, + "learning_rate": 1.0936330603479067e-05, + "loss": 0.001, + "step": 41850 + }, + { + "epoch": 0.5231630790769769, + "grad_norm": 1.8380693197250366, + "learning_rate": 1.0935461744966066e-05, + "loss": 1.5175, + "step": 41852 + }, + { + "epoch": 0.5231880797019925, + "grad_norm": 2.9118781089782715, + "learning_rate": 1.0934592879328721e-05, + "loss": 0.3212, + "step": 41854 + }, + { + "epoch": 0.5232130803270082, + "grad_norm": 2.6311721801757812, + "learning_rate": 1.0933724006573649e-05, + "loss": 1.0504, + "step": 41856 + }, + { + "epoch": 0.5232380809520238, + "grad_norm": 6.002140522003174, + "learning_rate": 1.0932855126707467e-05, + "loss": 1.3155, + "step": 41858 + }, + { + "epoch": 0.5232630815770394, + "grad_norm": 1.370173454284668, + "learning_rate": 1.0931986239736792e-05, + "loss": 0.0704, + "step": 41860 + }, + { + "epoch": 0.523288082202055, + "grad_norm": 4.112255573272705, + "learning_rate": 1.0931117345668241e-05, + "loss": 1.7337, + "step": 41862 + }, + { + "epoch": 0.5233130828270707, + "grad_norm": 2.9996540546417236, + "learning_rate": 1.093024844450843e-05, + "loss": 1.4683, + "step": 41864 + }, + { + "epoch": 0.5233380834520863, + "grad_norm": 2.9023702144622803, + "learning_rate": 1.0929379536263982e-05, + "loss": 0.0876, + "step": 41866 + }, + { + "epoch": 0.523363084077102, + "grad_norm": 2.2860493659973145, + "learning_rate": 1.0928510620941508e-05, + "loss": 0.6307, + "step": 41868 + }, + { + "epoch": 0.5233880847021175, + "grad_norm": 2.0851824283599854, + "learning_rate": 1.0927641698547634e-05, + "loss": 0.3614, + "step": 41870 + }, + { + "epoch": 0.5234130853271332, + "grad_norm": 2.8604462146759033, + "learning_rate": 1.0926772769088967e-05, + "loss": 0.9407, + "step": 41872 + }, + { + "epoch": 0.5234380859521488, + "grad_norm": 5.401340484619141, + "learning_rate": 1.0925903832572131e-05, + "loss": 1.2978, + "step": 41874 + }, + { + "epoch": 0.5234630865771644, + "grad_norm": 1.7269366979599, + "learning_rate": 1.092503488900374e-05, + "loss": 0.2232, + "step": 41876 + }, + { + "epoch": 0.5234880872021801, + "grad_norm": 3.722104549407959, + "learning_rate": 1.0924165938390416e-05, + "loss": 1.353, + "step": 41878 + }, + { + "epoch": 0.5235130878271956, + "grad_norm": 0.4074215590953827, + "learning_rate": 1.0923296980738775e-05, + "loss": 0.0117, + "step": 41880 + }, + { + "epoch": 0.5235380884522113, + "grad_norm": 2.8887898921966553, + "learning_rate": 1.0922428016055437e-05, + "loss": 1.3693, + "step": 41882 + }, + { + "epoch": 0.5235630890772269, + "grad_norm": 4.166430473327637, + "learning_rate": 1.0921559044347017e-05, + "loss": 1.2836, + "step": 41884 + }, + { + "epoch": 0.5235880897022426, + "grad_norm": 5.523393154144287, + "learning_rate": 1.092069006562013e-05, + "loss": 1.0261, + "step": 41886 + }, + { + "epoch": 0.5236130903272582, + "grad_norm": 4.452111721038818, + "learning_rate": 1.0919821079881401e-05, + "loss": 1.267, + "step": 41888 + }, + { + "epoch": 0.5236380909522738, + "grad_norm": 3.6269872188568115, + "learning_rate": 1.0918952087137442e-05, + "loss": 1.1643, + "step": 41890 + }, + { + "epoch": 0.5236630915772894, + "grad_norm": 3.552969217300415, + "learning_rate": 1.0918083087394876e-05, + "loss": 0.8395, + "step": 41892 + }, + { + "epoch": 0.5236880922023051, + "grad_norm": 4.862204551696777, + "learning_rate": 1.0917214080660316e-05, + "loss": 1.4253, + "step": 41894 + }, + { + "epoch": 0.5237130928273207, + "grad_norm": 3.5781373977661133, + "learning_rate": 1.0916345066940388e-05, + "loss": 1.5429, + "step": 41896 + }, + { + "epoch": 0.5237380934523364, + "grad_norm": 0.21494439244270325, + "learning_rate": 1.0915476046241704e-05, + "loss": 0.0966, + "step": 41898 + }, + { + "epoch": 0.5237630940773519, + "grad_norm": 5.089397430419922, + "learning_rate": 1.0914607018570882e-05, + "loss": 0.6099, + "step": 41900 + }, + { + "epoch": 0.5237880947023675, + "grad_norm": 0.02405439503490925, + "learning_rate": 1.0913737983934545e-05, + "loss": 0.3556, + "step": 41902 + }, + { + "epoch": 0.5238130953273832, + "grad_norm": 1.8769323825836182, + "learning_rate": 1.0912868942339305e-05, + "loss": 1.0256, + "step": 41904 + }, + { + "epoch": 0.5238380959523988, + "grad_norm": 0.5402197241783142, + "learning_rate": 1.0911999893791787e-05, + "loss": 0.3447, + "step": 41906 + }, + { + "epoch": 0.5238630965774145, + "grad_norm": 0.0015021818690001965, + "learning_rate": 1.0911130838298608e-05, + "loss": 0.6789, + "step": 41908 + }, + { + "epoch": 0.52388809720243, + "grad_norm": 3.9403951168060303, + "learning_rate": 1.0910261775866382e-05, + "loss": 0.6026, + "step": 41910 + }, + { + "epoch": 0.5239130978274457, + "grad_norm": 1.6970998048782349, + "learning_rate": 1.0909392706501734e-05, + "loss": 0.0997, + "step": 41912 + }, + { + "epoch": 0.5239380984524613, + "grad_norm": 5.074650764465332, + "learning_rate": 1.0908523630211278e-05, + "loss": 1.2093, + "step": 41914 + }, + { + "epoch": 0.523963099077477, + "grad_norm": 0.1853998750448227, + "learning_rate": 1.0907654547001634e-05, + "loss": 0.0061, + "step": 41916 + }, + { + "epoch": 0.5239880997024926, + "grad_norm": 5.54794979095459, + "learning_rate": 1.090678545687942e-05, + "loss": 1.3644, + "step": 41918 + }, + { + "epoch": 0.5240131003275081, + "grad_norm": 0.0026477498468011618, + "learning_rate": 1.090591635985126e-05, + "loss": 0.561, + "step": 41920 + }, + { + "epoch": 0.5240381009525238, + "grad_norm": 0.003013865789398551, + "learning_rate": 1.0905047255923768e-05, + "loss": 1.0407, + "step": 41922 + }, + { + "epoch": 0.5240631015775394, + "grad_norm": 4.591494560241699, + "learning_rate": 1.0904178145103565e-05, + "loss": 0.9515, + "step": 41924 + }, + { + "epoch": 0.5240881022025551, + "grad_norm": 0.5735729932785034, + "learning_rate": 1.0903309027397266e-05, + "loss": 0.3035, + "step": 41926 + }, + { + "epoch": 0.5241131028275707, + "grad_norm": 1.2059264183044434, + "learning_rate": 1.0902439902811494e-05, + "loss": 0.0246, + "step": 41928 + }, + { + "epoch": 0.5241381034525863, + "grad_norm": 0.0015715367626398802, + "learning_rate": 1.090157077135287e-05, + "loss": 0.0001, + "step": 41930 + }, + { + "epoch": 0.5241631040776019, + "grad_norm": 0.0016481614438816905, + "learning_rate": 1.0900701633028008e-05, + "loss": 0.747, + "step": 41932 + }, + { + "epoch": 0.5241881047026176, + "grad_norm": 2.6016998291015625, + "learning_rate": 1.0899832487843531e-05, + "loss": 0.8371, + "step": 41934 + }, + { + "epoch": 0.5242131053276332, + "grad_norm": 4.7790141105651855, + "learning_rate": 1.0898963335806057e-05, + "loss": 0.9136, + "step": 41936 + }, + { + "epoch": 0.5242381059526489, + "grad_norm": 5.740937232971191, + "learning_rate": 1.0898094176922205e-05, + "loss": 0.5324, + "step": 41938 + }, + { + "epoch": 0.5242631065776644, + "grad_norm": 4.006649971008301, + "learning_rate": 1.0897225011198593e-05, + "loss": 1.1461, + "step": 41940 + }, + { + "epoch": 0.52428810720268, + "grad_norm": 3.7690491676330566, + "learning_rate": 1.0896355838641844e-05, + "loss": 0.5866, + "step": 41942 + }, + { + "epoch": 0.5243131078276957, + "grad_norm": 4.20495080947876, + "learning_rate": 1.0895486659258575e-05, + "loss": 1.0775, + "step": 41944 + }, + { + "epoch": 0.5243381084527113, + "grad_norm": 0.0018963573966175318, + "learning_rate": 1.0894617473055406e-05, + "loss": 0.0829, + "step": 41946 + }, + { + "epoch": 0.524363109077727, + "grad_norm": 4.789679527282715, + "learning_rate": 1.089374828003896e-05, + "loss": 1.9536, + "step": 41948 + }, + { + "epoch": 0.5243881097027425, + "grad_norm": 3.9796273708343506, + "learning_rate": 1.0892879080215851e-05, + "loss": 0.1679, + "step": 41950 + }, + { + "epoch": 0.5244131103277582, + "grad_norm": 1.6036498546600342, + "learning_rate": 1.08920098735927e-05, + "loss": 0.9595, + "step": 41952 + }, + { + "epoch": 0.5244381109527738, + "grad_norm": 3.7057347297668457, + "learning_rate": 1.089114066017613e-05, + "loss": 1.0844, + "step": 41954 + }, + { + "epoch": 0.5244631115777895, + "grad_norm": 4.918507099151611, + "learning_rate": 1.0890271439972757e-05, + "loss": 0.1492, + "step": 41956 + }, + { + "epoch": 0.5244881122028051, + "grad_norm": 2.363750457763672, + "learning_rate": 1.0889402212989203e-05, + "loss": 0.4772, + "step": 41958 + }, + { + "epoch": 0.5245131128278206, + "grad_norm": 3.470531702041626, + "learning_rate": 1.088853297923209e-05, + "loss": 0.1902, + "step": 41960 + }, + { + "epoch": 0.5245381134528363, + "grad_norm": 2.322737455368042, + "learning_rate": 1.0887663738708033e-05, + "loss": 0.9242, + "step": 41962 + }, + { + "epoch": 0.5245631140778519, + "grad_norm": 0.0012121410109102726, + "learning_rate": 1.0886794491423653e-05, + "loss": 0.8071, + "step": 41964 + }, + { + "epoch": 0.5245881147028676, + "grad_norm": 3.195564031600952, + "learning_rate": 1.0885925237385573e-05, + "loss": 1.1304, + "step": 41966 + }, + { + "epoch": 0.5246131153278832, + "grad_norm": 5.914672374725342, + "learning_rate": 1.0885055976600413e-05, + "loss": 0.3048, + "step": 41968 + }, + { + "epoch": 0.5246381159528988, + "grad_norm": 2.875750780105591, + "learning_rate": 1.088418670907479e-05, + "loss": 1.2583, + "step": 41970 + }, + { + "epoch": 0.5246631165779144, + "grad_norm": 4.122902870178223, + "learning_rate": 1.0883317434815328e-05, + "loss": 0.3285, + "step": 41972 + }, + { + "epoch": 0.5246881172029301, + "grad_norm": 2.6609630584716797, + "learning_rate": 1.0882448153828646e-05, + "loss": 0.7107, + "step": 41974 + }, + { + "epoch": 0.5247131178279457, + "grad_norm": 6.652135848999023, + "learning_rate": 1.088157886612136e-05, + "loss": 1.9952, + "step": 41976 + }, + { + "epoch": 0.5247381184529614, + "grad_norm": 3.3870179653167725, + "learning_rate": 1.0880709571700094e-05, + "loss": 1.0719, + "step": 41978 + }, + { + "epoch": 0.5247631190779769, + "grad_norm": 0.014477107673883438, + "learning_rate": 1.0879840270571473e-05, + "loss": 0.7696, + "step": 41980 + }, + { + "epoch": 0.5247881197029926, + "grad_norm": 0.006875952705740929, + "learning_rate": 1.087897096274211e-05, + "loss": 0.0504, + "step": 41982 + }, + { + "epoch": 0.5248131203280082, + "grad_norm": 6.761506080627441, + "learning_rate": 1.0878101648218629e-05, + "loss": 2.1265, + "step": 41984 + }, + { + "epoch": 0.5248381209530238, + "grad_norm": 4.299836158752441, + "learning_rate": 1.087723232700765e-05, + "loss": 0.6236, + "step": 41986 + }, + { + "epoch": 0.5248631215780395, + "grad_norm": 3.1090803146362305, + "learning_rate": 1.0876362999115794e-05, + "loss": 2.9763, + "step": 41988 + }, + { + "epoch": 0.524888122203055, + "grad_norm": 2.570404052734375, + "learning_rate": 1.0875493664549679e-05, + "loss": 0.919, + "step": 41990 + }, + { + "epoch": 0.5249131228280707, + "grad_norm": 5.773207664489746, + "learning_rate": 1.087462432331593e-05, + "loss": 0.4115, + "step": 41992 + }, + { + "epoch": 0.5249381234530863, + "grad_norm": 4.4976043701171875, + "learning_rate": 1.0873754975421167e-05, + "loss": 1.517, + "step": 41994 + }, + { + "epoch": 0.524963124078102, + "grad_norm": 2.589954137802124, + "learning_rate": 1.0872885620872009e-05, + "loss": 0.3869, + "step": 41996 + }, + { + "epoch": 0.5249881247031176, + "grad_norm": 0.43219709396362305, + "learning_rate": 1.0872016259675078e-05, + "loss": 0.017, + "step": 41998 + }, + { + "epoch": 0.5250131253281332, + "grad_norm": 3.408782958984375, + "learning_rate": 1.0871146891836995e-05, + "loss": 0.4082, + "step": 42000 + }, + { + "epoch": 0.5250381259531488, + "grad_norm": 3.3583061695098877, + "learning_rate": 1.0870277517364379e-05, + "loss": 0.512, + "step": 42002 + }, + { + "epoch": 0.5250631265781645, + "grad_norm": 3.836165428161621, + "learning_rate": 1.0869408136263849e-05, + "loss": 1.3663, + "step": 42004 + }, + { + "epoch": 0.5250881272031801, + "grad_norm": 5.8974151611328125, + "learning_rate": 1.0868538748542035e-05, + "loss": 0.521, + "step": 42006 + }, + { + "epoch": 0.5251131278281957, + "grad_norm": 1.870931625366211, + "learning_rate": 1.086766935420555e-05, + "loss": 0.1788, + "step": 42008 + }, + { + "epoch": 0.5251381284532113, + "grad_norm": 4.165159225463867, + "learning_rate": 1.0866799953261017e-05, + "loss": 0.5045, + "step": 42010 + }, + { + "epoch": 0.5251631290782269, + "grad_norm": 2.441223621368408, + "learning_rate": 1.086593054571506e-05, + "loss": 0.3577, + "step": 42012 + }, + { + "epoch": 0.5251881297032426, + "grad_norm": 0.004411993082612753, + "learning_rate": 1.0865061131574302e-05, + "loss": 0.8376, + "step": 42014 + }, + { + "epoch": 0.5252131303282582, + "grad_norm": 0.0031129310373216867, + "learning_rate": 1.0864191710845354e-05, + "loss": 0.5359, + "step": 42016 + }, + { + "epoch": 0.5252381309532739, + "grad_norm": 2.055360794067383, + "learning_rate": 1.0863322283534845e-05, + "loss": 0.1808, + "step": 42018 + }, + { + "epoch": 0.5252631315782894, + "grad_norm": 3.829859972000122, + "learning_rate": 1.0862452849649402e-05, + "loss": 1.1548, + "step": 42020 + }, + { + "epoch": 0.5252881322033051, + "grad_norm": 2.9087021350860596, + "learning_rate": 1.0861583409195633e-05, + "loss": 1.0917, + "step": 42022 + }, + { + "epoch": 0.5253131328283207, + "grad_norm": 3.1539130210876465, + "learning_rate": 1.086071396218017e-05, + "loss": 0.4287, + "step": 42024 + }, + { + "epoch": 0.5253381334533364, + "grad_norm": 2.3313138484954834, + "learning_rate": 1.0859844508609631e-05, + "loss": 1.5695, + "step": 42026 + }, + { + "epoch": 0.525363134078352, + "grad_norm": 2.3690617084503174, + "learning_rate": 1.0858975048490636e-05, + "loss": 0.305, + "step": 42028 + }, + { + "epoch": 0.5253881347033675, + "grad_norm": 3.094360113143921, + "learning_rate": 1.085810558182981e-05, + "loss": 0.2195, + "step": 42030 + }, + { + "epoch": 0.5254131353283832, + "grad_norm": 3.2205684185028076, + "learning_rate": 1.0857236108633775e-05, + "loss": 0.4402, + "step": 42032 + }, + { + "epoch": 0.5254381359533988, + "grad_norm": 1.9650198221206665, + "learning_rate": 1.0856366628909148e-05, + "loss": 0.383, + "step": 42034 + }, + { + "epoch": 0.5254631365784145, + "grad_norm": 3.068929433822632, + "learning_rate": 1.0855497142662554e-05, + "loss": 0.2388, + "step": 42036 + }, + { + "epoch": 0.5254881372034301, + "grad_norm": 6.968886852264404, + "learning_rate": 1.0854627649900615e-05, + "loss": 0.5594, + "step": 42038 + }, + { + "epoch": 0.5255131378284457, + "grad_norm": 1.0424513816833496, + "learning_rate": 1.0853758150629954e-05, + "loss": 0.0311, + "step": 42040 + }, + { + "epoch": 0.5255381384534613, + "grad_norm": 3.529021739959717, + "learning_rate": 1.0852888644857192e-05, + "loss": 1.1075, + "step": 42042 + }, + { + "epoch": 0.525563139078477, + "grad_norm": 4.488929271697998, + "learning_rate": 1.085201913258895e-05, + "loss": 1.1816, + "step": 42044 + }, + { + "epoch": 0.5255881397034926, + "grad_norm": 2.3456594944000244, + "learning_rate": 1.085114961383185e-05, + "loss": 1.0547, + "step": 42046 + }, + { + "epoch": 0.5256131403285083, + "grad_norm": 4.970624923706055, + "learning_rate": 1.0850280088592515e-05, + "loss": 1.0636, + "step": 42048 + }, + { + "epoch": 0.5256381409535238, + "grad_norm": 0.041766051203012466, + "learning_rate": 1.0849410556877566e-05, + "loss": 0.6811, + "step": 42050 + }, + { + "epoch": 0.5256631415785394, + "grad_norm": 3.514580249786377, + "learning_rate": 1.0848541018693628e-05, + "loss": 1.6335, + "step": 42052 + }, + { + "epoch": 0.5256881422035551, + "grad_norm": 2.7519726753234863, + "learning_rate": 1.084767147404732e-05, + "loss": 1.0723, + "step": 42054 + }, + { + "epoch": 0.5257131428285707, + "grad_norm": 0.013016919605433941, + "learning_rate": 1.0846801922945268e-05, + "loss": 0.0365, + "step": 42056 + }, + { + "epoch": 0.5257381434535864, + "grad_norm": 5.506282329559326, + "learning_rate": 1.0845932365394094e-05, + "loss": 2.2117, + "step": 42058 + }, + { + "epoch": 0.5257631440786019, + "grad_norm": 5.431835651397705, + "learning_rate": 1.0845062801400416e-05, + "loss": 1.828, + "step": 42060 + }, + { + "epoch": 0.5257881447036176, + "grad_norm": 2.0760340690612793, + "learning_rate": 1.0844193230970858e-05, + "loss": 0.4301, + "step": 42062 + }, + { + "epoch": 0.5258131453286332, + "grad_norm": 6.372207164764404, + "learning_rate": 1.0843323654112045e-05, + "loss": 1.0111, + "step": 42064 + }, + { + "epoch": 0.5258381459536489, + "grad_norm": 6.0013628005981445, + "learning_rate": 1.0842454070830598e-05, + "loss": 1.664, + "step": 42066 + }, + { + "epoch": 0.5258631465786645, + "grad_norm": 2.456798791885376, + "learning_rate": 1.0841584481133143e-05, + "loss": 0.6219, + "step": 42068 + }, + { + "epoch": 0.52588814720368, + "grad_norm": 2.716524124145508, + "learning_rate": 1.0840714885026296e-05, + "loss": 1.6565, + "step": 42070 + }, + { + "epoch": 0.5259131478286957, + "grad_norm": 0.08221980929374695, + "learning_rate": 1.0839845282516686e-05, + "loss": 0.7239, + "step": 42072 + }, + { + "epoch": 0.5259381484537113, + "grad_norm": 1.0827937126159668, + "learning_rate": 1.0838975673610933e-05, + "loss": 0.0285, + "step": 42074 + }, + { + "epoch": 0.525963149078727, + "grad_norm": 2.22206974029541, + "learning_rate": 1.0838106058315656e-05, + "loss": 1.4868, + "step": 42076 + }, + { + "epoch": 0.5259881497037426, + "grad_norm": 9.689962387084961, + "learning_rate": 1.0837236436637484e-05, + "loss": 1.6459, + "step": 42078 + }, + { + "epoch": 0.5260131503287582, + "grad_norm": 2.7782578468322754, + "learning_rate": 1.0836366808583039e-05, + "loss": 1.1943, + "step": 42080 + }, + { + "epoch": 0.5260381509537738, + "grad_norm": 4.9648003578186035, + "learning_rate": 1.083549717415894e-05, + "loss": 1.1997, + "step": 42082 + }, + { + "epoch": 0.5260631515787895, + "grad_norm": 7.859218597412109, + "learning_rate": 1.0834627533371816e-05, + "loss": 1.3189, + "step": 42084 + }, + { + "epoch": 0.5260881522038051, + "grad_norm": 1.0840984582901, + "learning_rate": 1.0833757886228284e-05, + "loss": 0.3292, + "step": 42086 + }, + { + "epoch": 0.5261131528288208, + "grad_norm": 0.0018464862369000912, + "learning_rate": 1.083288823273497e-05, + "loss": 0.1412, + "step": 42088 + }, + { + "epoch": 0.5261381534538363, + "grad_norm": 1.7698441743850708, + "learning_rate": 1.08320185728985e-05, + "loss": 0.1178, + "step": 42090 + }, + { + "epoch": 0.526163154078852, + "grad_norm": 1.2288990020751953, + "learning_rate": 1.083114890672549e-05, + "loss": 0.3787, + "step": 42092 + }, + { + "epoch": 0.5261881547038676, + "grad_norm": 4.165122985839844, + "learning_rate": 1.083027923422257e-05, + "loss": 0.8946, + "step": 42094 + }, + { + "epoch": 0.5262131553288832, + "grad_norm": 2.3751277923583984, + "learning_rate": 1.0829409555396359e-05, + "loss": 1.0239, + "step": 42096 + }, + { + "epoch": 0.5262381559538989, + "grad_norm": 3.598348617553711, + "learning_rate": 1.0828539870253485e-05, + "loss": 0.7904, + "step": 42098 + }, + { + "epoch": 0.5262631565789144, + "grad_norm": 0.0008532083593308926, + "learning_rate": 1.0827670178800567e-05, + "loss": 0.5527, + "step": 42100 + }, + { + "epoch": 0.5262881572039301, + "grad_norm": 0.001491406699642539, + "learning_rate": 1.0826800481044228e-05, + "loss": 0.9114, + "step": 42102 + }, + { + "epoch": 0.5263131578289457, + "grad_norm": 2.306687116622925, + "learning_rate": 1.0825930776991096e-05, + "loss": 0.182, + "step": 42104 + }, + { + "epoch": 0.5263381584539614, + "grad_norm": 3.1488215923309326, + "learning_rate": 1.082506106664779e-05, + "loss": 1.6922, + "step": 42106 + }, + { + "epoch": 0.526363159078977, + "grad_norm": 10.316508293151855, + "learning_rate": 1.0824191350020938e-05, + "loss": 1.9975, + "step": 42108 + }, + { + "epoch": 0.5263881597039926, + "grad_norm": 2.7845394611358643, + "learning_rate": 1.0823321627117162e-05, + "loss": 0.6315, + "step": 42110 + }, + { + "epoch": 0.5264131603290082, + "grad_norm": 3.03751540184021, + "learning_rate": 1.0822451897943083e-05, + "loss": 0.6823, + "step": 42112 + }, + { + "epoch": 0.5264381609540238, + "grad_norm": 1.951979398727417, + "learning_rate": 1.0821582162505326e-05, + "loss": 0.6081, + "step": 42114 + }, + { + "epoch": 0.5264631615790395, + "grad_norm": 2.756934642791748, + "learning_rate": 1.0820712420810516e-05, + "loss": 0.6258, + "step": 42116 + }, + { + "epoch": 0.5264881622040551, + "grad_norm": 0.0011496596271172166, + "learning_rate": 1.0819842672865275e-05, + "loss": 0.2948, + "step": 42118 + }, + { + "epoch": 0.5265131628290707, + "grad_norm": 4.41667366027832, + "learning_rate": 1.081897291867623e-05, + "loss": 1.3393, + "step": 42120 + }, + { + "epoch": 0.5265381634540863, + "grad_norm": 3.4650750160217285, + "learning_rate": 1.0818103158250002e-05, + "loss": 1.4449, + "step": 42122 + }, + { + "epoch": 0.526563164079102, + "grad_norm": 3.023709774017334, + "learning_rate": 1.0817233391593219e-05, + "loss": 0.7323, + "step": 42124 + }, + { + "epoch": 0.5265881647041176, + "grad_norm": 1.0738263130187988, + "learning_rate": 1.08163636187125e-05, + "loss": 0.4709, + "step": 42126 + }, + { + "epoch": 0.5266131653291333, + "grad_norm": 0.2247592806816101, + "learning_rate": 1.0815493839614471e-05, + "loss": 0.5879, + "step": 42128 + }, + { + "epoch": 0.5266381659541488, + "grad_norm": 1.403154969215393, + "learning_rate": 1.0814624054305753e-05, + "loss": 0.1824, + "step": 42130 + }, + { + "epoch": 0.5266631665791645, + "grad_norm": 1.9111016988754272, + "learning_rate": 1.0813754262792977e-05, + "loss": 1.0346, + "step": 42132 + }, + { + "epoch": 0.5266881672041801, + "grad_norm": 5.279770374298096, + "learning_rate": 1.0812884465082763e-05, + "loss": 1.2187, + "step": 42134 + }, + { + "epoch": 0.5267131678291957, + "grad_norm": 0.06773865967988968, + "learning_rate": 1.0812014661181736e-05, + "loss": 0.0223, + "step": 42136 + }, + { + "epoch": 0.5267381684542114, + "grad_norm": 2.498199462890625, + "learning_rate": 1.0811144851096523e-05, + "loss": 1.2726, + "step": 42138 + }, + { + "epoch": 0.5267631690792269, + "grad_norm": 2.462676525115967, + "learning_rate": 1.081027503483374e-05, + "loss": 1.8168, + "step": 42140 + }, + { + "epoch": 0.5267881697042426, + "grad_norm": 0.005048310849815607, + "learning_rate": 1.0809405212400019e-05, + "loss": 0.0193, + "step": 42142 + }, + { + "epoch": 0.5268131703292582, + "grad_norm": 2.6827118396759033, + "learning_rate": 1.0808535383801983e-05, + "loss": 0.6787, + "step": 42144 + }, + { + "epoch": 0.5268381709542739, + "grad_norm": 8.816794395446777, + "learning_rate": 1.0807665549046256e-05, + "loss": 2.089, + "step": 42146 + }, + { + "epoch": 0.5268631715792895, + "grad_norm": 2.2698233127593994, + "learning_rate": 1.0806795708139462e-05, + "loss": 1.6843, + "step": 42148 + }, + { + "epoch": 0.5268881722043051, + "grad_norm": 5.434141159057617, + "learning_rate": 1.0805925861088227e-05, + "loss": 1.1548, + "step": 42150 + }, + { + "epoch": 0.5269131728293207, + "grad_norm": 4.528879642486572, + "learning_rate": 1.0805056007899172e-05, + "loss": 0.4737, + "step": 42152 + }, + { + "epoch": 0.5269381734543364, + "grad_norm": 7.045990467071533, + "learning_rate": 1.0804186148578928e-05, + "loss": 0.3974, + "step": 42154 + }, + { + "epoch": 0.526963174079352, + "grad_norm": 5.138097286224365, + "learning_rate": 1.0803316283134113e-05, + "loss": 1.5504, + "step": 42156 + }, + { + "epoch": 0.5269881747043677, + "grad_norm": 5.18304967880249, + "learning_rate": 1.0802446411571355e-05, + "loss": 1.9673, + "step": 42158 + }, + { + "epoch": 0.5270131753293832, + "grad_norm": 2.9361536502838135, + "learning_rate": 1.080157653389728e-05, + "loss": 1.0437, + "step": 42160 + }, + { + "epoch": 0.5270381759543988, + "grad_norm": 2.3125176429748535, + "learning_rate": 1.0800706650118512e-05, + "loss": 0.3439, + "step": 42162 + }, + { + "epoch": 0.5270631765794145, + "grad_norm": 5.457840919494629, + "learning_rate": 1.0799836760241675e-05, + "loss": 1.0949, + "step": 42164 + }, + { + "epoch": 0.5270881772044301, + "grad_norm": 7.2491607666015625, + "learning_rate": 1.0798966864273393e-05, + "loss": 1.1503, + "step": 42166 + }, + { + "epoch": 0.5271131778294458, + "grad_norm": 5.899621963500977, + "learning_rate": 1.079809696222029e-05, + "loss": 0.52, + "step": 42168 + }, + { + "epoch": 0.5271381784544613, + "grad_norm": 2.4856934547424316, + "learning_rate": 1.0797227054089e-05, + "loss": 0.6926, + "step": 42170 + }, + { + "epoch": 0.527163179079477, + "grad_norm": 4.3924174308776855, + "learning_rate": 1.0796357139886136e-05, + "loss": 0.2261, + "step": 42172 + }, + { + "epoch": 0.5271881797044926, + "grad_norm": 6.428500652313232, + "learning_rate": 1.079548721961833e-05, + "loss": 2.2075, + "step": 42174 + }, + { + "epoch": 0.5272131803295083, + "grad_norm": 3.324749231338501, + "learning_rate": 1.0794617293292208e-05, + "loss": 1.7715, + "step": 42176 + }, + { + "epoch": 0.5272381809545239, + "grad_norm": 3.686154365539551, + "learning_rate": 1.0793747360914391e-05, + "loss": 0.2086, + "step": 42178 + }, + { + "epoch": 0.5272631815795394, + "grad_norm": 2.3004181385040283, + "learning_rate": 1.0792877422491503e-05, + "loss": 0.9672, + "step": 42180 + }, + { + "epoch": 0.5272881822045551, + "grad_norm": 0.0020122642163187265, + "learning_rate": 1.079200747803018e-05, + "loss": 0.5487, + "step": 42182 + }, + { + "epoch": 0.5273131828295707, + "grad_norm": 4.735190391540527, + "learning_rate": 1.0791137527537034e-05, + "loss": 0.9331, + "step": 42184 + }, + { + "epoch": 0.5273381834545864, + "grad_norm": 2.4943175315856934, + "learning_rate": 1.07902675710187e-05, + "loss": 0.6499, + "step": 42186 + }, + { + "epoch": 0.527363184079602, + "grad_norm": 3.5714328289031982, + "learning_rate": 1.0789397608481797e-05, + "loss": 1.8761, + "step": 42188 + }, + { + "epoch": 0.5273881847046176, + "grad_norm": 3.990817070007324, + "learning_rate": 1.078852763993296e-05, + "loss": 0.8708, + "step": 42190 + }, + { + "epoch": 0.5274131853296332, + "grad_norm": 6.08361291885376, + "learning_rate": 1.07876576653788e-05, + "loss": 1.0077, + "step": 42192 + }, + { + "epoch": 0.5274381859546489, + "grad_norm": 5.61079740524292, + "learning_rate": 1.0786787684825952e-05, + "loss": 0.5268, + "step": 42194 + }, + { + "epoch": 0.5274631865796645, + "grad_norm": 3.2022411823272705, + "learning_rate": 1.0785917698281046e-05, + "loss": 0.6647, + "step": 42196 + }, + { + "epoch": 0.5274881872046802, + "grad_norm": 1.4621468782424927, + "learning_rate": 1.0785047705750697e-05, + "loss": 0.5399, + "step": 42198 + }, + { + "epoch": 0.5275131878296957, + "grad_norm": 3.6453635692596436, + "learning_rate": 1.0784177707241536e-05, + "loss": 1.3123, + "step": 42200 + }, + { + "epoch": 0.5275381884547113, + "grad_norm": 0.05362976714968681, + "learning_rate": 1.078330770276019e-05, + "loss": 0.0736, + "step": 42202 + }, + { + "epoch": 0.527563189079727, + "grad_norm": 3.001093864440918, + "learning_rate": 1.0782437692313284e-05, + "loss": 0.6062, + "step": 42204 + }, + { + "epoch": 0.5275881897047426, + "grad_norm": 3.52543044090271, + "learning_rate": 1.0781567675907438e-05, + "loss": 0.6709, + "step": 42206 + }, + { + "epoch": 0.5276131903297583, + "grad_norm": 3.671231746673584, + "learning_rate": 1.078069765354929e-05, + "loss": 1.0382, + "step": 42208 + }, + { + "epoch": 0.5276381909547738, + "grad_norm": 3.0449562072753906, + "learning_rate": 1.0779827625245456e-05, + "loss": 1.1518, + "step": 42210 + }, + { + "epoch": 0.5276631915797895, + "grad_norm": 2.031278371810913, + "learning_rate": 1.0778957591002564e-05, + "loss": 0.7013, + "step": 42212 + }, + { + "epoch": 0.5276881922048051, + "grad_norm": 4.313855171203613, + "learning_rate": 1.077808755082724e-05, + "loss": 1.4041, + "step": 42214 + }, + { + "epoch": 0.5277131928298208, + "grad_norm": 5.437399387359619, + "learning_rate": 1.0777217504726115e-05, + "loss": 1.7806, + "step": 42216 + }, + { + "epoch": 0.5277381934548364, + "grad_norm": 4.4129958152771, + "learning_rate": 1.0776347452705809e-05, + "loss": 1.4539, + "step": 42218 + }, + { + "epoch": 0.527763194079852, + "grad_norm": 2.6826155185699463, + "learning_rate": 1.077547739477295e-05, + "loss": 0.9209, + "step": 42220 + }, + { + "epoch": 0.5277881947048676, + "grad_norm": 2.7066967487335205, + "learning_rate": 1.0774607330934167e-05, + "loss": 0.313, + "step": 42222 + }, + { + "epoch": 0.5278131953298832, + "grad_norm": 0.0011532793287187815, + "learning_rate": 1.0773737261196085e-05, + "loss": 1.3047, + "step": 42224 + }, + { + "epoch": 0.5278381959548989, + "grad_norm": 2.9011192321777344, + "learning_rate": 1.0772867185565328e-05, + "loss": 0.9773, + "step": 42226 + }, + { + "epoch": 0.5278631965799145, + "grad_norm": 4.118508815765381, + "learning_rate": 1.0771997104048526e-05, + "loss": 0.176, + "step": 42228 + }, + { + "epoch": 0.5278881972049301, + "grad_norm": 0.09360300749540329, + "learning_rate": 1.0771127016652297e-05, + "loss": 0.7089, + "step": 42230 + }, + { + "epoch": 0.5279131978299457, + "grad_norm": 3.168975591659546, + "learning_rate": 1.0770256923383278e-05, + "loss": 0.5333, + "step": 42232 + }, + { + "epoch": 0.5279381984549614, + "grad_norm": 2.8774709701538086, + "learning_rate": 1.0769386824248092e-05, + "loss": 0.6928, + "step": 42234 + }, + { + "epoch": 0.527963199079977, + "grad_norm": 0.006832157261669636, + "learning_rate": 1.0768516719253364e-05, + "loss": 0.9879, + "step": 42236 + }, + { + "epoch": 0.5279881997049927, + "grad_norm": 2.341884136199951, + "learning_rate": 1.076764660840572e-05, + "loss": 0.5178, + "step": 42238 + }, + { + "epoch": 0.5280132003300082, + "grad_norm": 2.251542329788208, + "learning_rate": 1.0766776491711788e-05, + "loss": 0.4671, + "step": 42240 + }, + { + "epoch": 0.5280382009550239, + "grad_norm": 1.1097313165664673, + "learning_rate": 1.0765906369178195e-05, + "loss": 0.2764, + "step": 42242 + }, + { + "epoch": 0.5280632015800395, + "grad_norm": 1.8206632137298584, + "learning_rate": 1.0765036240811566e-05, + "loss": 0.5508, + "step": 42244 + }, + { + "epoch": 0.5280882022050551, + "grad_norm": 0.7510795593261719, + "learning_rate": 1.0764166106618532e-05, + "loss": 0.0337, + "step": 42246 + }, + { + "epoch": 0.5281132028300708, + "grad_norm": 5.259782314300537, + "learning_rate": 1.0763295966605717e-05, + "loss": 1.4668, + "step": 42248 + }, + { + "epoch": 0.5281382034550863, + "grad_norm": 4.421560764312744, + "learning_rate": 1.0762425820779748e-05, + "loss": 0.8352, + "step": 42250 + }, + { + "epoch": 0.528163204080102, + "grad_norm": 3.6584980487823486, + "learning_rate": 1.076155566914725e-05, + "loss": 1.3826, + "step": 42252 + }, + { + "epoch": 0.5281882047051176, + "grad_norm": 3.57835054397583, + "learning_rate": 1.0760685511714852e-05, + "loss": 0.2143, + "step": 42254 + }, + { + "epoch": 0.5282132053301333, + "grad_norm": 3.4212985038757324, + "learning_rate": 1.075981534848918e-05, + "loss": 0.8745, + "step": 42256 + }, + { + "epoch": 0.5282382059551489, + "grad_norm": 2.880275249481201, + "learning_rate": 1.0758945179476864e-05, + "loss": 1.1565, + "step": 42258 + }, + { + "epoch": 0.5282632065801645, + "grad_norm": 2.5781335830688477, + "learning_rate": 1.0758075004684525e-05, + "loss": 1.0425, + "step": 42260 + }, + { + "epoch": 0.5282882072051801, + "grad_norm": 5.510824680328369, + "learning_rate": 1.0757204824118799e-05, + "loss": 1.1595, + "step": 42262 + }, + { + "epoch": 0.5283132078301958, + "grad_norm": 4.184436321258545, + "learning_rate": 1.0756334637786306e-05, + "loss": 1.232, + "step": 42264 + }, + { + "epoch": 0.5283382084552114, + "grad_norm": 6.099407196044922, + "learning_rate": 1.0755464445693675e-05, + "loss": 0.9552, + "step": 42266 + }, + { + "epoch": 0.528363209080227, + "grad_norm": 2.7817986011505127, + "learning_rate": 1.0754594247847531e-05, + "loss": 1.0724, + "step": 42268 + }, + { + "epoch": 0.5283882097052426, + "grad_norm": 1.1288589239120483, + "learning_rate": 1.0753724044254508e-05, + "loss": 0.0209, + "step": 42270 + }, + { + "epoch": 0.5284132103302582, + "grad_norm": 4.161681652069092, + "learning_rate": 1.0752853834921226e-05, + "loss": 0.5183, + "step": 42272 + }, + { + "epoch": 0.5284382109552739, + "grad_norm": 3.504157543182373, + "learning_rate": 1.075198361985432e-05, + "loss": 1.1717, + "step": 42274 + }, + { + "epoch": 0.5284632115802895, + "grad_norm": 4.196875095367432, + "learning_rate": 1.0751113399060412e-05, + "loss": 1.1507, + "step": 42276 + }, + { + "epoch": 0.5284882122053052, + "grad_norm": 0.2821425795555115, + "learning_rate": 1.0750243172546129e-05, + "loss": 0.0571, + "step": 42278 + }, + { + "epoch": 0.5285132128303207, + "grad_norm": 0.004672509152442217, + "learning_rate": 1.07493729403181e-05, + "loss": 0.0713, + "step": 42280 + }, + { + "epoch": 0.5285382134553364, + "grad_norm": 4.892718315124512, + "learning_rate": 1.074850270238295e-05, + "loss": 0.3885, + "step": 42282 + }, + { + "epoch": 0.528563214080352, + "grad_norm": 3.9760990142822266, + "learning_rate": 1.0747632458747313e-05, + "loss": 1.2085, + "step": 42284 + }, + { + "epoch": 0.5285882147053677, + "grad_norm": 4.296544551849365, + "learning_rate": 1.0746762209417811e-05, + "loss": 1.3498, + "step": 42286 + }, + { + "epoch": 0.5286132153303833, + "grad_norm": 0.01288547832518816, + "learning_rate": 1.0745891954401077e-05, + "loss": 0.1788, + "step": 42288 + }, + { + "epoch": 0.5286382159553988, + "grad_norm": 3.615452766418457, + "learning_rate": 1.0745021693703735e-05, + "loss": 0.8056, + "step": 42290 + }, + { + "epoch": 0.5286632165804145, + "grad_norm": 3.8300485610961914, + "learning_rate": 1.074415142733241e-05, + "loss": 1.5826, + "step": 42292 + }, + { + "epoch": 0.5286882172054301, + "grad_norm": 2.1329994201660156, + "learning_rate": 1.0743281155293732e-05, + "loss": 0.6335, + "step": 42294 + }, + { + "epoch": 0.5287132178304458, + "grad_norm": 0.002994499634951353, + "learning_rate": 1.0742410877594334e-05, + "loss": 1.0972, + "step": 42296 + }, + { + "epoch": 0.5287382184554614, + "grad_norm": 0.760219156742096, + "learning_rate": 1.0741540594240836e-05, + "loss": 0.0258, + "step": 42298 + }, + { + "epoch": 0.528763219080477, + "grad_norm": 5.6331915855407715, + "learning_rate": 1.0740670305239873e-05, + "loss": 1.5658, + "step": 42300 + }, + { + "epoch": 0.5287882197054926, + "grad_norm": 2.4858341217041016, + "learning_rate": 1.073980001059807e-05, + "loss": 1.0601, + "step": 42302 + }, + { + "epoch": 0.5288132203305083, + "grad_norm": 0.0010572123574092984, + "learning_rate": 1.073892971032205e-05, + "loss": 0.38, + "step": 42304 + }, + { + "epoch": 0.5288382209555239, + "grad_norm": 3.9178307056427, + "learning_rate": 1.0738059404418448e-05, + "loss": 1.4952, + "step": 42306 + }, + { + "epoch": 0.5288632215805396, + "grad_norm": 4.356890678405762, + "learning_rate": 1.0737189092893891e-05, + "loss": 0.5757, + "step": 42308 + }, + { + "epoch": 0.5288882222055551, + "grad_norm": 2.1329448223114014, + "learning_rate": 1.0736318775755006e-05, + "loss": 0.6728, + "step": 42310 + }, + { + "epoch": 0.5289132228305707, + "grad_norm": 0.33783990144729614, + "learning_rate": 1.0735448453008421e-05, + "loss": 0.2579, + "step": 42312 + }, + { + "epoch": 0.5289382234555864, + "grad_norm": 3.26802134513855, + "learning_rate": 1.0734578124660767e-05, + "loss": 0.3435, + "step": 42314 + }, + { + "epoch": 0.528963224080602, + "grad_norm": 5.052645206451416, + "learning_rate": 1.0733707790718666e-05, + "loss": 1.8101, + "step": 42316 + }, + { + "epoch": 0.5289882247056177, + "grad_norm": 2.900421142578125, + "learning_rate": 1.0732837451188752e-05, + "loss": 1.9934, + "step": 42318 + }, + { + "epoch": 0.5290132253306332, + "grad_norm": 3.6870408058166504, + "learning_rate": 1.0731967106077652e-05, + "loss": 1.1935, + "step": 42320 + }, + { + "epoch": 0.5290382259556489, + "grad_norm": 0.060881488025188446, + "learning_rate": 1.0731096755391992e-05, + "loss": 0.2028, + "step": 42322 + }, + { + "epoch": 0.5290632265806645, + "grad_norm": 3.024351119995117, + "learning_rate": 1.0730226399138403e-05, + "loss": 1.1128, + "step": 42324 + }, + { + "epoch": 0.5290882272056802, + "grad_norm": 3.649559259414673, + "learning_rate": 1.0729356037323518e-05, + "loss": 0.9629, + "step": 42326 + }, + { + "epoch": 0.5291132278306958, + "grad_norm": 2.5762948989868164, + "learning_rate": 1.0728485669953953e-05, + "loss": 1.3021, + "step": 42328 + }, + { + "epoch": 0.5291382284557113, + "grad_norm": 2.4003419876098633, + "learning_rate": 1.0727615297036348e-05, + "loss": 0.3983, + "step": 42330 + }, + { + "epoch": 0.529163229080727, + "grad_norm": 0.24055726826190948, + "learning_rate": 1.0726744918577328e-05, + "loss": 0.3621, + "step": 42332 + }, + { + "epoch": 0.5291882297057426, + "grad_norm": 2.336047887802124, + "learning_rate": 1.072587453458352e-05, + "loss": 1.0894, + "step": 42334 + }, + { + "epoch": 0.5292132303307583, + "grad_norm": 9.15353775024414, + "learning_rate": 1.0725004145061555e-05, + "loss": 1.3645, + "step": 42336 + }, + { + "epoch": 0.5292382309557739, + "grad_norm": 5.2832818031311035, + "learning_rate": 1.0724133750018062e-05, + "loss": 1.9789, + "step": 42338 + }, + { + "epoch": 0.5292632315807895, + "grad_norm": 0.00138758250977844, + "learning_rate": 1.072326334945967e-05, + "loss": 0.6502, + "step": 42340 + }, + { + "epoch": 0.5292882322058051, + "grad_norm": 0.4925289452075958, + "learning_rate": 1.0722392943393003e-05, + "loss": 0.4, + "step": 42342 + }, + { + "epoch": 0.5293132328308208, + "grad_norm": 2.8874683380126953, + "learning_rate": 1.0721522531824695e-05, + "loss": 0.7492, + "step": 42344 + }, + { + "epoch": 0.5293382334558364, + "grad_norm": 2.642420768737793, + "learning_rate": 1.0720652114761371e-05, + "loss": 0.9403, + "step": 42346 + }, + { + "epoch": 0.5293632340808521, + "grad_norm": 6.1073808670043945, + "learning_rate": 1.0719781692209665e-05, + "loss": 0.637, + "step": 42348 + }, + { + "epoch": 0.5293882347058676, + "grad_norm": 3.0061421394348145, + "learning_rate": 1.0718911264176203e-05, + "loss": 1.2857, + "step": 42350 + }, + { + "epoch": 0.5294132353308832, + "grad_norm": 2.8090810775756836, + "learning_rate": 1.0718040830667618e-05, + "loss": 0.6894, + "step": 42352 + }, + { + "epoch": 0.5294382359558989, + "grad_norm": 0.16606077551841736, + "learning_rate": 1.071717039169053e-05, + "loss": 0.5798, + "step": 42354 + }, + { + "epoch": 0.5294632365809145, + "grad_norm": 2.5704331398010254, + "learning_rate": 1.0716299947251577e-05, + "loss": 0.4734, + "step": 42356 + }, + { + "epoch": 0.5294882372059302, + "grad_norm": 2.8782589435577393, + "learning_rate": 1.0715429497357384e-05, + "loss": 1.1857, + "step": 42358 + }, + { + "epoch": 0.5295132378309457, + "grad_norm": 4.48987340927124, + "learning_rate": 1.0714559042014581e-05, + "loss": 1.2913, + "step": 42360 + }, + { + "epoch": 0.5295382384559614, + "grad_norm": 5.40187406539917, + "learning_rate": 1.0713688581229795e-05, + "loss": 1.6128, + "step": 42362 + }, + { + "epoch": 0.529563239080977, + "grad_norm": 2.050110340118408, + "learning_rate": 1.071281811500966e-05, + "loss": 0.1417, + "step": 42364 + }, + { + "epoch": 0.5295882397059927, + "grad_norm": 7.237631797790527, + "learning_rate": 1.0711947643360806e-05, + "loss": 1.4712, + "step": 42366 + }, + { + "epoch": 0.5296132403310083, + "grad_norm": 3.253582000732422, + "learning_rate": 1.0711077166289856e-05, + "loss": 0.6927, + "step": 42368 + }, + { + "epoch": 0.5296382409560239, + "grad_norm": 1.376237154006958, + "learning_rate": 1.0710206683803441e-05, + "loss": 0.501, + "step": 42370 + }, + { + "epoch": 0.5296632415810395, + "grad_norm": 11.021944046020508, + "learning_rate": 1.0709336195908195e-05, + "loss": 2.7635, + "step": 42372 + }, + { + "epoch": 0.5296882422060551, + "grad_norm": 8.767126083374023, + "learning_rate": 1.0708465702610743e-05, + "loss": 1.8296, + "step": 42374 + }, + { + "epoch": 0.5297132428310708, + "grad_norm": 2.2708899974823, + "learning_rate": 1.070759520391772e-05, + "loss": 0.4464, + "step": 42376 + }, + { + "epoch": 0.5297382434560864, + "grad_norm": 0.041598040610551834, + "learning_rate": 1.070672469983575e-05, + "loss": 0.2988, + "step": 42378 + }, + { + "epoch": 0.529763244081102, + "grad_norm": 4.98547887802124, + "learning_rate": 1.0705854190371466e-05, + "loss": 0.7782, + "step": 42380 + }, + { + "epoch": 0.5297882447061176, + "grad_norm": 2.437246322631836, + "learning_rate": 1.0704983675531493e-05, + "loss": 0.7174, + "step": 42382 + }, + { + "epoch": 0.5298132453311333, + "grad_norm": 3.2567203044891357, + "learning_rate": 1.0704113155322465e-05, + "loss": 0.6067, + "step": 42384 + }, + { + "epoch": 0.5298382459561489, + "grad_norm": 0.0007736569386906922, + "learning_rate": 1.0703242629751013e-05, + "loss": 0.7539, + "step": 42386 + }, + { + "epoch": 0.5298632465811646, + "grad_norm": 1.9450278282165527, + "learning_rate": 1.0702372098823764e-05, + "loss": 0.9023, + "step": 42388 + }, + { + "epoch": 0.5298882472061801, + "grad_norm": 2.226362943649292, + "learning_rate": 1.0701501562547347e-05, + "loss": 0.6827, + "step": 42390 + }, + { + "epoch": 0.5299132478311958, + "grad_norm": 0.3245130777359009, + "learning_rate": 1.0700631020928396e-05, + "loss": 0.9778, + "step": 42392 + }, + { + "epoch": 0.5299382484562114, + "grad_norm": 0.0010116720804944634, + "learning_rate": 1.0699760473973534e-05, + "loss": 1.9332, + "step": 42394 + }, + { + "epoch": 0.529963249081227, + "grad_norm": 5.228931427001953, + "learning_rate": 1.0698889921689395e-05, + "loss": 0.7566, + "step": 42396 + }, + { + "epoch": 0.5299882497062427, + "grad_norm": 2.850990056991577, + "learning_rate": 1.0698019364082615e-05, + "loss": 1.3383, + "step": 42398 + }, + { + "epoch": 0.5300132503312582, + "grad_norm": 1.9288758039474487, + "learning_rate": 1.0697148801159814e-05, + "loss": 0.5886, + "step": 42400 + }, + { + "epoch": 0.5300382509562739, + "grad_norm": 5.304136753082275, + "learning_rate": 1.0696278232927626e-05, + "loss": 0.4524, + "step": 42402 + }, + { + "epoch": 0.5300632515812895, + "grad_norm": 3.153909921646118, + "learning_rate": 1.0695407659392685e-05, + "loss": 0.7909, + "step": 42404 + }, + { + "epoch": 0.5300882522063052, + "grad_norm": 5.238150596618652, + "learning_rate": 1.0694537080561616e-05, + "loss": 0.9595, + "step": 42406 + }, + { + "epoch": 0.5301132528313208, + "grad_norm": 0.9949836134910583, + "learning_rate": 1.0693666496441046e-05, + "loss": 0.5155, + "step": 42408 + }, + { + "epoch": 0.5301382534563364, + "grad_norm": 0.4766888916492462, + "learning_rate": 1.0692795907037613e-05, + "loss": 0.2342, + "step": 42410 + }, + { + "epoch": 0.530163254081352, + "grad_norm": 3.6217281818389893, + "learning_rate": 1.0691925312357946e-05, + "loss": 1.5753, + "step": 42412 + }, + { + "epoch": 0.5301882547063677, + "grad_norm": 4.421515464782715, + "learning_rate": 1.0691054712408676e-05, + "loss": 1.1307, + "step": 42414 + }, + { + "epoch": 0.5302132553313833, + "grad_norm": 3.7202093601226807, + "learning_rate": 1.0690184107196427e-05, + "loss": 1.0823, + "step": 42416 + }, + { + "epoch": 0.530238255956399, + "grad_norm": 3.421320676803589, + "learning_rate": 1.0689313496727835e-05, + "loss": 1.4763, + "step": 42418 + }, + { + "epoch": 0.5302632565814145, + "grad_norm": 2.499493360519409, + "learning_rate": 1.0688442881009527e-05, + "loss": 0.8351, + "step": 42420 + }, + { + "epoch": 0.5302882572064301, + "grad_norm": 3.976486921310425, + "learning_rate": 1.0687572260048137e-05, + "loss": 0.866, + "step": 42422 + }, + { + "epoch": 0.5303132578314458, + "grad_norm": 11.799408912658691, + "learning_rate": 1.0686701633850294e-05, + "loss": 0.8556, + "step": 42424 + }, + { + "epoch": 0.5303382584564614, + "grad_norm": 0.0009657763293944299, + "learning_rate": 1.0685831002422628e-05, + "loss": 0.5894, + "step": 42426 + }, + { + "epoch": 0.5303632590814771, + "grad_norm": 0.0012336622457951307, + "learning_rate": 1.068496036577177e-05, + "loss": 0.6817, + "step": 42428 + }, + { + "epoch": 0.5303882597064926, + "grad_norm": 0.001243511214852333, + "learning_rate": 1.0684089723904352e-05, + "loss": 0.8998, + "step": 42430 + }, + { + "epoch": 0.5304132603315083, + "grad_norm": 6.2018141746521, + "learning_rate": 1.0683219076827001e-05, + "loss": 1.9391, + "step": 42432 + }, + { + "epoch": 0.5304382609565239, + "grad_norm": 4.210742473602295, + "learning_rate": 1.068234842454635e-05, + "loss": 1.5168, + "step": 42434 + }, + { + "epoch": 0.5304632615815396, + "grad_norm": 3.993791341781616, + "learning_rate": 1.068147776706903e-05, + "loss": 1.1162, + "step": 42436 + }, + { + "epoch": 0.5304882622065552, + "grad_norm": 2.3302550315856934, + "learning_rate": 1.0680607104401678e-05, + "loss": 0.3605, + "step": 42438 + }, + { + "epoch": 0.5305132628315707, + "grad_norm": 3.0870134830474854, + "learning_rate": 1.0679736436550912e-05, + "loss": 1.0407, + "step": 42440 + }, + { + "epoch": 0.5305382634565864, + "grad_norm": 4.26093864440918, + "learning_rate": 1.0678865763523369e-05, + "loss": 0.4256, + "step": 42442 + }, + { + "epoch": 0.530563264081602, + "grad_norm": 1.0401599407196045, + "learning_rate": 1.0677995085325682e-05, + "loss": 0.6463, + "step": 42444 + }, + { + "epoch": 0.5305882647066177, + "grad_norm": 2.1125316619873047, + "learning_rate": 1.067712440196448e-05, + "loss": 0.0965, + "step": 42446 + }, + { + "epoch": 0.5306132653316333, + "grad_norm": 4.125248908996582, + "learning_rate": 1.0676253713446396e-05, + "loss": 0.9211, + "step": 42448 + }, + { + "epoch": 0.5306382659566489, + "grad_norm": 0.47578901052474976, + "learning_rate": 1.0675383019778061e-05, + "loss": 0.0183, + "step": 42450 + }, + { + "epoch": 0.5306632665816645, + "grad_norm": 2.5381760597229004, + "learning_rate": 1.06745123209661e-05, + "loss": 0.2276, + "step": 42452 + }, + { + "epoch": 0.5306882672066802, + "grad_norm": 4.2031636238098145, + "learning_rate": 1.067364161701715e-05, + "loss": 1.4416, + "step": 42454 + }, + { + "epoch": 0.5307132678316958, + "grad_norm": 1.972724437713623, + "learning_rate": 1.0672770907937842e-05, + "loss": 0.1113, + "step": 42456 + }, + { + "epoch": 0.5307382684567115, + "grad_norm": 0.8865488767623901, + "learning_rate": 1.0671900193734804e-05, + "loss": 0.7346, + "step": 42458 + }, + { + "epoch": 0.530763269081727, + "grad_norm": 2.5543618202209473, + "learning_rate": 1.067102947441467e-05, + "loss": 0.5504, + "step": 42460 + }, + { + "epoch": 0.5307882697067426, + "grad_norm": 0.0009028909844346344, + "learning_rate": 1.067015874998407e-05, + "loss": 0.4957, + "step": 42462 + }, + { + "epoch": 0.5308132703317583, + "grad_norm": 3.5098228454589844, + "learning_rate": 1.0669288020449637e-05, + "loss": 0.9459, + "step": 42464 + }, + { + "epoch": 0.5308382709567739, + "grad_norm": 2.8744094371795654, + "learning_rate": 1.0668417285818001e-05, + "loss": 0.4986, + "step": 42466 + }, + { + "epoch": 0.5308632715817896, + "grad_norm": 2.511136054992676, + "learning_rate": 1.066754654609579e-05, + "loss": 0.4905, + "step": 42468 + }, + { + "epoch": 0.5308882722068051, + "grad_norm": 3.0703964233398438, + "learning_rate": 1.0666675801289643e-05, + "loss": 0.3162, + "step": 42470 + }, + { + "epoch": 0.5309132728318208, + "grad_norm": 4.386178016662598, + "learning_rate": 1.0665805051406186e-05, + "loss": 1.4575, + "step": 42472 + }, + { + "epoch": 0.5309382734568364, + "grad_norm": 5.009059429168701, + "learning_rate": 1.0664934296452051e-05, + "loss": 0.6795, + "step": 42474 + }, + { + "epoch": 0.5309632740818521, + "grad_norm": 2.703965663909912, + "learning_rate": 1.0664063536433872e-05, + "loss": 0.9722, + "step": 42476 + }, + { + "epoch": 0.5309882747068677, + "grad_norm": 2.0107662677764893, + "learning_rate": 1.066319277135828e-05, + "loss": 0.9441, + "step": 42478 + }, + { + "epoch": 0.5310132753318832, + "grad_norm": 4.260746002197266, + "learning_rate": 1.0662322001231902e-05, + "loss": 0.9812, + "step": 42480 + }, + { + "epoch": 0.5310382759568989, + "grad_norm": 0.010468455031514168, + "learning_rate": 1.0661451226061375e-05, + "loss": 0.0584, + "step": 42482 + }, + { + "epoch": 0.5310632765819145, + "grad_norm": 0.0009688070276752114, + "learning_rate": 1.066058044585333e-05, + "loss": 0.5765, + "step": 42484 + }, + { + "epoch": 0.5310882772069302, + "grad_norm": 4.734538555145264, + "learning_rate": 1.0659709660614398e-05, + "loss": 1.0152, + "step": 42486 + }, + { + "epoch": 0.5311132778319458, + "grad_norm": 3.674391031265259, + "learning_rate": 1.0658838870351206e-05, + "loss": 0.5857, + "step": 42488 + }, + { + "epoch": 0.5311382784569614, + "grad_norm": 3.969552516937256, + "learning_rate": 1.0657968075070397e-05, + "loss": 1.3652, + "step": 42490 + }, + { + "epoch": 0.531163279081977, + "grad_norm": 2.172253131866455, + "learning_rate": 1.0657097274778591e-05, + "loss": 0.4511, + "step": 42492 + }, + { + "epoch": 0.5311882797069927, + "grad_norm": 3.544304847717285, + "learning_rate": 1.0656226469482428e-05, + "loss": 1.0318, + "step": 42494 + }, + { + "epoch": 0.5312132803320083, + "grad_norm": 3.8624932765960693, + "learning_rate": 1.0655355659188536e-05, + "loss": 0.322, + "step": 42496 + }, + { + "epoch": 0.531238280957024, + "grad_norm": 0.0047834147699177265, + "learning_rate": 1.0654484843903547e-05, + "loss": 0.0175, + "step": 42498 + }, + { + "epoch": 0.5312632815820395, + "grad_norm": 3.6798932552337646, + "learning_rate": 1.0653614023634095e-05, + "loss": 1.4057, + "step": 42500 + }, + { + "epoch": 0.5312882822070552, + "grad_norm": 0.001119612017646432, + "learning_rate": 1.0652743198386812e-05, + "loss": 0.479, + "step": 42502 + }, + { + "epoch": 0.5313132828320708, + "grad_norm": 4.522812366485596, + "learning_rate": 1.065187236816833e-05, + "loss": 1.0391, + "step": 42504 + }, + { + "epoch": 0.5313382834570864, + "grad_norm": 4.328415870666504, + "learning_rate": 1.0651001532985277e-05, + "loss": 0.9845, + "step": 42506 + }, + { + "epoch": 0.5313632840821021, + "grad_norm": 5.753663063049316, + "learning_rate": 1.065013069284429e-05, + "loss": 0.7783, + "step": 42508 + }, + { + "epoch": 0.5313882847071176, + "grad_norm": 3.485292911529541, + "learning_rate": 1.0649259847751998e-05, + "loss": 0.6958, + "step": 42510 + }, + { + "epoch": 0.5314132853321333, + "grad_norm": 3.6445603370666504, + "learning_rate": 1.0648388997715035e-05, + "loss": 1.2288, + "step": 42512 + }, + { + "epoch": 0.5314382859571489, + "grad_norm": 4.311127662658691, + "learning_rate": 1.0647518142740036e-05, + "loss": 1.6996, + "step": 42514 + }, + { + "epoch": 0.5314632865821646, + "grad_norm": 2.9856696128845215, + "learning_rate": 1.0646647282833631e-05, + "loss": 1.4738, + "step": 42516 + }, + { + "epoch": 0.5314882872071802, + "grad_norm": 1.3025492429733276, + "learning_rate": 1.064577641800245e-05, + "loss": 0.6338, + "step": 42518 + }, + { + "epoch": 0.5315132878321958, + "grad_norm": 0.9455564618110657, + "learning_rate": 1.0644905548253125e-05, + "loss": 0.8619, + "step": 42520 + }, + { + "epoch": 0.5315382884572114, + "grad_norm": 0.006072289776057005, + "learning_rate": 1.0644034673592292e-05, + "loss": 0.1025, + "step": 42522 + }, + { + "epoch": 0.531563289082227, + "grad_norm": 5.60984992980957, + "learning_rate": 1.0643163794026581e-05, + "loss": 2.3893, + "step": 42524 + }, + { + "epoch": 0.5315882897072427, + "grad_norm": 5.801130771636963, + "learning_rate": 1.0642292909562627e-05, + "loss": 1.0314, + "step": 42526 + }, + { + "epoch": 0.5316132903322583, + "grad_norm": 6.193251132965088, + "learning_rate": 1.0641422020207062e-05, + "loss": 0.9168, + "step": 42528 + }, + { + "epoch": 0.5316382909572739, + "grad_norm": 0.0007702387520112097, + "learning_rate": 1.0640551125966517e-05, + "loss": 0.7393, + "step": 42530 + }, + { + "epoch": 0.5316632915822895, + "grad_norm": 3.2549095153808594, + "learning_rate": 1.0639680226847622e-05, + "loss": 1.5174, + "step": 42532 + }, + { + "epoch": 0.5316882922073052, + "grad_norm": 4.634430408477783, + "learning_rate": 1.0638809322857016e-05, + "loss": 2.3806, + "step": 42534 + }, + { + "epoch": 0.5317132928323208, + "grad_norm": 2.981829881668091, + "learning_rate": 1.0637938414001328e-05, + "loss": 0.8974, + "step": 42536 + }, + { + "epoch": 0.5317382934573365, + "grad_norm": 2.6570022106170654, + "learning_rate": 1.0637067500287191e-05, + "loss": 0.4898, + "step": 42538 + }, + { + "epoch": 0.531763294082352, + "grad_norm": 2.7734384536743164, + "learning_rate": 1.0636196581721236e-05, + "loss": 1.3668, + "step": 42540 + }, + { + "epoch": 0.5317882947073677, + "grad_norm": 2.9790053367614746, + "learning_rate": 1.0635325658310103e-05, + "loss": 0.5485, + "step": 42542 + }, + { + "epoch": 0.5318132953323833, + "grad_norm": 3.725985288619995, + "learning_rate": 1.0634454730060415e-05, + "loss": 0.9371, + "step": 42544 + }, + { + "epoch": 0.531838295957399, + "grad_norm": 3.077681541442871, + "learning_rate": 1.063358379697881e-05, + "loss": 1.2587, + "step": 42546 + }, + { + "epoch": 0.5318632965824146, + "grad_norm": 2.579850673675537, + "learning_rate": 1.0632712859071922e-05, + "loss": 0.5813, + "step": 42548 + }, + { + "epoch": 0.5318882972074301, + "grad_norm": 3.0169548988342285, + "learning_rate": 1.0631841916346381e-05, + "loss": 0.3445, + "step": 42550 + }, + { + "epoch": 0.5319132978324458, + "grad_norm": 2.422125816345215, + "learning_rate": 1.0630970968808823e-05, + "loss": 0.8629, + "step": 42552 + }, + { + "epoch": 0.5319382984574614, + "grad_norm": 4.447720527648926, + "learning_rate": 1.0630100016465878e-05, + "loss": 1.2264, + "step": 42554 + }, + { + "epoch": 0.5319632990824771, + "grad_norm": 0.11374286562204361, + "learning_rate": 1.0629229059324183e-05, + "loss": 0.5265, + "step": 42556 + }, + { + "epoch": 0.5319882997074927, + "grad_norm": 0.030353480949997902, + "learning_rate": 1.0628358097390362e-05, + "loss": 0.1546, + "step": 42558 + }, + { + "epoch": 0.5320133003325083, + "grad_norm": 1.4807647466659546, + "learning_rate": 1.0627487130671058e-05, + "loss": 0.871, + "step": 42560 + }, + { + "epoch": 0.5320383009575239, + "grad_norm": 3.9667606353759766, + "learning_rate": 1.0626616159172905e-05, + "loss": 1.182, + "step": 42562 + }, + { + "epoch": 0.5320633015825396, + "grad_norm": 3.7678022384643555, + "learning_rate": 1.0625745182902527e-05, + "loss": 1.7743, + "step": 42564 + }, + { + "epoch": 0.5320883022075552, + "grad_norm": 4.324053764343262, + "learning_rate": 1.0624874201866563e-05, + "loss": 1.5601, + "step": 42566 + }, + { + "epoch": 0.5321133028325709, + "grad_norm": 2.8833627700805664, + "learning_rate": 1.0624003216071647e-05, + "loss": 1.3254, + "step": 42568 + }, + { + "epoch": 0.5321383034575864, + "grad_norm": 5.1123270988464355, + "learning_rate": 1.062313222552441e-05, + "loss": 1.8523, + "step": 42570 + }, + { + "epoch": 0.532163304082602, + "grad_norm": 6.092258453369141, + "learning_rate": 1.0622261230231485e-05, + "loss": 1.7483, + "step": 42572 + }, + { + "epoch": 0.5321883047076177, + "grad_norm": 6.9445648193359375, + "learning_rate": 1.0621390230199508e-05, + "loss": 1.0359, + "step": 42574 + }, + { + "epoch": 0.5322133053326333, + "grad_norm": 0.0009174748556688428, + "learning_rate": 1.0620519225435111e-05, + "loss": 0.7032, + "step": 42576 + }, + { + "epoch": 0.532238305957649, + "grad_norm": 1.0697492361068726, + "learning_rate": 1.0619648215944926e-05, + "loss": 0.6993, + "step": 42578 + }, + { + "epoch": 0.5322633065826645, + "grad_norm": 4.327566623687744, + "learning_rate": 1.061877720173559e-05, + "loss": 1.2575, + "step": 42580 + }, + { + "epoch": 0.5322883072076802, + "grad_norm": 0.0008727514068596065, + "learning_rate": 1.0617906182813734e-05, + "loss": 0.0, + "step": 42582 + }, + { + "epoch": 0.5323133078326958, + "grad_norm": 2.477407693862915, + "learning_rate": 1.0617035159185986e-05, + "loss": 1.4381, + "step": 42584 + }, + { + "epoch": 0.5323383084577115, + "grad_norm": 2.5462512969970703, + "learning_rate": 1.061616413085899e-05, + "loss": 0.6221, + "step": 42586 + }, + { + "epoch": 0.5323633090827271, + "grad_norm": 2.7649688720703125, + "learning_rate": 1.0615293097839377e-05, + "loss": 0.736, + "step": 42588 + }, + { + "epoch": 0.5323883097077426, + "grad_norm": 3.0530176162719727, + "learning_rate": 1.0614422060133777e-05, + "loss": 0.8025, + "step": 42590 + }, + { + "epoch": 0.5324133103327583, + "grad_norm": 0.6604642271995544, + "learning_rate": 1.0613551017748824e-05, + "loss": 0.5404, + "step": 42592 + }, + { + "epoch": 0.5324383109577739, + "grad_norm": 4.803621768951416, + "learning_rate": 1.0612679970691157e-05, + "loss": 1.4793, + "step": 42594 + }, + { + "epoch": 0.5324633115827896, + "grad_norm": 0.0007313781534321606, + "learning_rate": 1.0611808918967404e-05, + "loss": 0.66, + "step": 42596 + }, + { + "epoch": 0.5324883122078052, + "grad_norm": 5.804902076721191, + "learning_rate": 1.06109378625842e-05, + "loss": 1.3554, + "step": 42598 + }, + { + "epoch": 0.5325133128328208, + "grad_norm": 0.35781678557395935, + "learning_rate": 1.061006680154818e-05, + "loss": 0.347, + "step": 42600 + }, + { + "epoch": 0.5325383134578364, + "grad_norm": 2.450594902038574, + "learning_rate": 1.0609195735865978e-05, + "loss": 1.3691, + "step": 42602 + }, + { + "epoch": 0.5325633140828521, + "grad_norm": 3.98138689994812, + "learning_rate": 1.0608324665544224e-05, + "loss": 1.0131, + "step": 42604 + }, + { + "epoch": 0.5325883147078677, + "grad_norm": 3.026348829269409, + "learning_rate": 1.0607453590589563e-05, + "loss": 0.6922, + "step": 42606 + }, + { + "epoch": 0.5326133153328834, + "grad_norm": 5.987293243408203, + "learning_rate": 1.0606582511008616e-05, + "loss": 1.8105, + "step": 42608 + }, + { + "epoch": 0.5326383159578989, + "grad_norm": 2.8382647037506104, + "learning_rate": 1.060571142680802e-05, + "loss": 1.488, + "step": 42610 + }, + { + "epoch": 0.5326633165829145, + "grad_norm": 2.0206656455993652, + "learning_rate": 1.0604840337994415e-05, + "loss": 0.0118, + "step": 42612 + }, + { + "epoch": 0.5326883172079302, + "grad_norm": 4.0263237953186035, + "learning_rate": 1.0603969244574432e-05, + "loss": 1.2643, + "step": 42614 + }, + { + "epoch": 0.5327133178329458, + "grad_norm": 11.481701850891113, + "learning_rate": 1.0603098146554704e-05, + "loss": 1.3872, + "step": 42616 + }, + { + "epoch": 0.5327383184579615, + "grad_norm": 3.7424240112304688, + "learning_rate": 1.0602227043941866e-05, + "loss": 1.0431, + "step": 42618 + }, + { + "epoch": 0.532763319082977, + "grad_norm": 7.102461814880371, + "learning_rate": 1.0601355936742552e-05, + "loss": 2.0403, + "step": 42620 + }, + { + "epoch": 0.5327883197079927, + "grad_norm": 3.0164501667022705, + "learning_rate": 1.0600484824963395e-05, + "loss": 0.178, + "step": 42622 + }, + { + "epoch": 0.5328133203330083, + "grad_norm": 2.174391746520996, + "learning_rate": 1.059961370861103e-05, + "loss": 0.2702, + "step": 42624 + }, + { + "epoch": 0.532838320958024, + "grad_norm": 2.6975152492523193, + "learning_rate": 1.0598742587692095e-05, + "loss": 1.0503, + "step": 42626 + }, + { + "epoch": 0.5328633215830396, + "grad_norm": 0.30378085374832153, + "learning_rate": 1.059787146221322e-05, + "loss": 0.4195, + "step": 42628 + }, + { + "epoch": 0.5328883222080552, + "grad_norm": 3.7714638710021973, + "learning_rate": 1.0597000332181038e-05, + "loss": 1.1487, + "step": 42630 + }, + { + "epoch": 0.5329133228330708, + "grad_norm": 2.1684165000915527, + "learning_rate": 1.059612919760219e-05, + "loss": 1.1321, + "step": 42632 + }, + { + "epoch": 0.5329383234580864, + "grad_norm": 0.005965851247310638, + "learning_rate": 1.05952580584833e-05, + "loss": 0.8325, + "step": 42634 + }, + { + "epoch": 0.5329633240831021, + "grad_norm": 0.0018395994557067752, + "learning_rate": 1.0594386914831014e-05, + "loss": 0.5401, + "step": 42636 + }, + { + "epoch": 0.5329883247081177, + "grad_norm": 4.592299461364746, + "learning_rate": 1.0593515766651959e-05, + "loss": 0.8404, + "step": 42638 + }, + { + "epoch": 0.5330133253331333, + "grad_norm": 7.329028606414795, + "learning_rate": 1.0592644613952773e-05, + "loss": 1.9788, + "step": 42640 + }, + { + "epoch": 0.5330383259581489, + "grad_norm": 6.190298557281494, + "learning_rate": 1.0591773456740091e-05, + "loss": 0.4641, + "step": 42642 + }, + { + "epoch": 0.5330633265831646, + "grad_norm": 2.183683156967163, + "learning_rate": 1.0590902295020545e-05, + "loss": 0.9312, + "step": 42644 + }, + { + "epoch": 0.5330883272081802, + "grad_norm": 3.784353017807007, + "learning_rate": 1.0590031128800769e-05, + "loss": 1.2489, + "step": 42646 + }, + { + "epoch": 0.5331133278331959, + "grad_norm": 4.545995712280273, + "learning_rate": 1.0589159958087398e-05, + "loss": 0.922, + "step": 42648 + }, + { + "epoch": 0.5331383284582114, + "grad_norm": 4.289804458618164, + "learning_rate": 1.0588288782887071e-05, + "loss": 1.7372, + "step": 42650 + }, + { + "epoch": 0.5331633290832271, + "grad_norm": 4.075716495513916, + "learning_rate": 1.058741760320642e-05, + "loss": 1.1242, + "step": 42652 + }, + { + "epoch": 0.5331883297082427, + "grad_norm": 3.5465991497039795, + "learning_rate": 1.058654641905208e-05, + "loss": 1.1973, + "step": 42654 + }, + { + "epoch": 0.5332133303332584, + "grad_norm": 2.2057716846466064, + "learning_rate": 1.0585675230430683e-05, + "loss": 1.2516, + "step": 42656 + }, + { + "epoch": 0.533238330958274, + "grad_norm": 10.475456237792969, + "learning_rate": 1.0584804037348867e-05, + "loss": 1.5258, + "step": 42658 + }, + { + "epoch": 0.5332633315832895, + "grad_norm": 4.2327985763549805, + "learning_rate": 1.0583932839813267e-05, + "loss": 1.1462, + "step": 42660 + }, + { + "epoch": 0.5332883322083052, + "grad_norm": 2.1184873580932617, + "learning_rate": 1.0583061637830516e-05, + "loss": 0.4708, + "step": 42662 + }, + { + "epoch": 0.5333133328333208, + "grad_norm": 2.4758241176605225, + "learning_rate": 1.058219043140725e-05, + "loss": 0.5597, + "step": 42664 + }, + { + "epoch": 0.5333383334583365, + "grad_norm": 1.4006283283233643, + "learning_rate": 1.0581319220550109e-05, + "loss": 0.6848, + "step": 42666 + }, + { + "epoch": 0.5333633340833521, + "grad_norm": 6.383723735809326, + "learning_rate": 1.0580448005265718e-05, + "loss": 0.8232, + "step": 42668 + }, + { + "epoch": 0.5333883347083677, + "grad_norm": 2.5572938919067383, + "learning_rate": 1.0579576785560717e-05, + "loss": 0.3848, + "step": 42670 + }, + { + "epoch": 0.5334133353333833, + "grad_norm": 6.302521228790283, + "learning_rate": 1.0578705561441741e-05, + "loss": 2.2164, + "step": 42672 + }, + { + "epoch": 0.533438335958399, + "grad_norm": 2.310777425765991, + "learning_rate": 1.0577834332915426e-05, + "loss": 0.0659, + "step": 42674 + }, + { + "epoch": 0.5334633365834146, + "grad_norm": 3.6647047996520996, + "learning_rate": 1.0576963099988407e-05, + "loss": 1.0212, + "step": 42676 + }, + { + "epoch": 0.5334883372084303, + "grad_norm": 6.211670875549316, + "learning_rate": 1.057609186266732e-05, + "loss": 2.4063, + "step": 42678 + }, + { + "epoch": 0.5335133378334458, + "grad_norm": 0.037513718008995056, + "learning_rate": 1.0575220620958798e-05, + "loss": 0.1287, + "step": 42680 + }, + { + "epoch": 0.5335383384584614, + "grad_norm": 2.9004361629486084, + "learning_rate": 1.0574349374869475e-05, + "loss": 0.8444, + "step": 42682 + }, + { + "epoch": 0.5335633390834771, + "grad_norm": 1.728094220161438, + "learning_rate": 1.057347812440599e-05, + "loss": 0.9374, + "step": 42684 + }, + { + "epoch": 0.5335883397084927, + "grad_norm": 4.212061405181885, + "learning_rate": 1.0572606869574975e-05, + "loss": 1.6813, + "step": 42686 + }, + { + "epoch": 0.5336133403335084, + "grad_norm": 5.453810214996338, + "learning_rate": 1.057173561038307e-05, + "loss": 1.337, + "step": 42688 + }, + { + "epoch": 0.5336383409585239, + "grad_norm": 4.084934234619141, + "learning_rate": 1.0570864346836904e-05, + "loss": 2.0145, + "step": 42690 + }, + { + "epoch": 0.5336633415835396, + "grad_norm": 6.279155731201172, + "learning_rate": 1.056999307894312e-05, + "loss": 1.2258, + "step": 42692 + }, + { + "epoch": 0.5336883422085552, + "grad_norm": 2.4901559352874756, + "learning_rate": 1.0569121806708347e-05, + "loss": 0.385, + "step": 42694 + }, + { + "epoch": 0.5337133428335709, + "grad_norm": 1.8404101133346558, + "learning_rate": 1.0568250530139221e-05, + "loss": 0.4518, + "step": 42696 + }, + { + "epoch": 0.5337383434585865, + "grad_norm": 2.407883644104004, + "learning_rate": 1.0567379249242383e-05, + "loss": 0.4391, + "step": 42698 + }, + { + "epoch": 0.533763344083602, + "grad_norm": 5.150905609130859, + "learning_rate": 1.0566507964024462e-05, + "loss": 0.9996, + "step": 42700 + }, + { + "epoch": 0.5337883447086177, + "grad_norm": 2.650183916091919, + "learning_rate": 1.0565636674492096e-05, + "loss": 0.8689, + "step": 42702 + }, + { + "epoch": 0.5338133453336333, + "grad_norm": 0.002865562913939357, + "learning_rate": 1.0564765380651924e-05, + "loss": 0.1282, + "step": 42704 + }, + { + "epoch": 0.533838345958649, + "grad_norm": 5.378782749176025, + "learning_rate": 1.0563894082510578e-05, + "loss": 0.9281, + "step": 42706 + }, + { + "epoch": 0.5338633465836646, + "grad_norm": 2.926276922225952, + "learning_rate": 1.0563022780074693e-05, + "loss": 0.7627, + "step": 42708 + }, + { + "epoch": 0.5338883472086802, + "grad_norm": 0.6799731254577637, + "learning_rate": 1.0562151473350908e-05, + "loss": 0.6449, + "step": 42710 + }, + { + "epoch": 0.5339133478336958, + "grad_norm": 2.8783421516418457, + "learning_rate": 1.0561280162345855e-05, + "loss": 0.353, + "step": 42712 + }, + { + "epoch": 0.5339383484587115, + "grad_norm": 2.786289691925049, + "learning_rate": 1.0560408847066173e-05, + "loss": 0.4228, + "step": 42714 + }, + { + "epoch": 0.5339633490837271, + "grad_norm": 0.4274500906467438, + "learning_rate": 1.0559537527518493e-05, + "loss": 0.6996, + "step": 42716 + }, + { + "epoch": 0.5339883497087428, + "grad_norm": 3.1179494857788086, + "learning_rate": 1.055866620370946e-05, + "loss": 0.5869, + "step": 42718 + }, + { + "epoch": 0.5340133503337583, + "grad_norm": 3.4660465717315674, + "learning_rate": 1.0557794875645701e-05, + "loss": 1.4208, + "step": 42720 + }, + { + "epoch": 0.5340383509587739, + "grad_norm": 0.003304209327325225, + "learning_rate": 1.0556923543333855e-05, + "loss": 0.4344, + "step": 42722 + }, + { + "epoch": 0.5340633515837896, + "grad_norm": 3.33657169342041, + "learning_rate": 1.0556052206780559e-05, + "loss": 0.7173, + "step": 42724 + }, + { + "epoch": 0.5340883522088052, + "grad_norm": 0.0767143964767456, + "learning_rate": 1.0555180865992447e-05, + "loss": 0.938, + "step": 42726 + }, + { + "epoch": 0.5341133528338209, + "grad_norm": 3.4611611366271973, + "learning_rate": 1.0554309520976155e-05, + "loss": 1.3784, + "step": 42728 + }, + { + "epoch": 0.5341383534588364, + "grad_norm": 0.0027689666021615267, + "learning_rate": 1.0553438171738324e-05, + "loss": 0.8179, + "step": 42730 + }, + { + "epoch": 0.5341633540838521, + "grad_norm": 0.6232449412345886, + "learning_rate": 1.0552566818285584e-05, + "loss": 0.0331, + "step": 42732 + }, + { + "epoch": 0.5341883547088677, + "grad_norm": 0.9171342253684998, + "learning_rate": 1.0551695460624574e-05, + "loss": 0.1701, + "step": 42734 + }, + { + "epoch": 0.5342133553338834, + "grad_norm": 3.3393607139587402, + "learning_rate": 1.0550824098761927e-05, + "loss": 2.171, + "step": 42736 + }, + { + "epoch": 0.534238355958899, + "grad_norm": 3.470343589782715, + "learning_rate": 1.0549952732704283e-05, + "loss": 1.4059, + "step": 42738 + }, + { + "epoch": 0.5342633565839146, + "grad_norm": 4.433072566986084, + "learning_rate": 1.0549081362458275e-05, + "loss": 1.1024, + "step": 42740 + }, + { + "epoch": 0.5342883572089302, + "grad_norm": 3.322782516479492, + "learning_rate": 1.0548209988030543e-05, + "loss": 1.3929, + "step": 42742 + }, + { + "epoch": 0.5343133578339458, + "grad_norm": 1.862650752067566, + "learning_rate": 1.0547338609427722e-05, + "loss": 0.2784, + "step": 42744 + }, + { + "epoch": 0.5343383584589615, + "grad_norm": 1.0735554695129395, + "learning_rate": 1.0546467226656446e-05, + "loss": 0.1147, + "step": 42746 + }, + { + "epoch": 0.5343633590839771, + "grad_norm": 5.021129608154297, + "learning_rate": 1.0545595839723355e-05, + "loss": 1.8059, + "step": 42748 + }, + { + "epoch": 0.5343883597089927, + "grad_norm": 0.6838604211807251, + "learning_rate": 1.054472444863508e-05, + "loss": 0.8818, + "step": 42750 + }, + { + "epoch": 0.5344133603340083, + "grad_norm": 0.0019835294224321842, + "learning_rate": 1.0543853053398264e-05, + "loss": 0.8144, + "step": 42752 + }, + { + "epoch": 0.534438360959024, + "grad_norm": 5.027790069580078, + "learning_rate": 1.0542981654019538e-05, + "loss": 0.9904, + "step": 42754 + }, + { + "epoch": 0.5344633615840396, + "grad_norm": 3.161940336227417, + "learning_rate": 1.0542110250505541e-05, + "loss": 1.3702, + "step": 42756 + }, + { + "epoch": 0.5344883622090553, + "grad_norm": 3.7519121170043945, + "learning_rate": 1.0541238842862908e-05, + "loss": 2.0315, + "step": 42758 + }, + { + "epoch": 0.5345133628340708, + "grad_norm": 5.631285667419434, + "learning_rate": 1.0540367431098277e-05, + "loss": 2.1484, + "step": 42760 + }, + { + "epoch": 0.5345383634590865, + "grad_norm": 3.9221267700195312, + "learning_rate": 1.0539496015218282e-05, + "loss": 0.5387, + "step": 42762 + }, + { + "epoch": 0.5345633640841021, + "grad_norm": 1.7055083513259888, + "learning_rate": 1.0538624595229568e-05, + "loss": 0.8055, + "step": 42764 + }, + { + "epoch": 0.5345883647091177, + "grad_norm": 7.325830936431885, + "learning_rate": 1.053775317113876e-05, + "loss": 2.0611, + "step": 42766 + }, + { + "epoch": 0.5346133653341334, + "grad_norm": 3.3706533908843994, + "learning_rate": 1.05368817429525e-05, + "loss": 1.3478, + "step": 42768 + }, + { + "epoch": 0.5346383659591489, + "grad_norm": 0.18553249537944794, + "learning_rate": 1.0536010310677429e-05, + "loss": 0.2895, + "step": 42770 + }, + { + "epoch": 0.5346633665841646, + "grad_norm": 1.7716383934020996, + "learning_rate": 1.0535138874320174e-05, + "loss": 0.7612, + "step": 42772 + }, + { + "epoch": 0.5346883672091802, + "grad_norm": 2.7728328704833984, + "learning_rate": 1.0534267433887377e-05, + "loss": 0.446, + "step": 42774 + }, + { + "epoch": 0.5347133678341959, + "grad_norm": 1.6766990423202515, + "learning_rate": 1.053339598938568e-05, + "loss": 0.2484, + "step": 42776 + }, + { + "epoch": 0.5347383684592115, + "grad_norm": 4.040594577789307, + "learning_rate": 1.053252454082171e-05, + "loss": 2.1662, + "step": 42778 + }, + { + "epoch": 0.5347633690842271, + "grad_norm": 0.0015989434905350208, + "learning_rate": 1.053165308820211e-05, + "loss": 0.1082, + "step": 42780 + }, + { + "epoch": 0.5347883697092427, + "grad_norm": 3.883603811264038, + "learning_rate": 1.0530781631533516e-05, + "loss": 1.5899, + "step": 42782 + }, + { + "epoch": 0.5348133703342584, + "grad_norm": 0.5037327408790588, + "learning_rate": 1.0529910170822564e-05, + "loss": 0.3575, + "step": 42784 + }, + { + "epoch": 0.534838370959274, + "grad_norm": 3.2797634601593018, + "learning_rate": 1.052903870607589e-05, + "loss": 0.6327, + "step": 42786 + }, + { + "epoch": 0.5348633715842896, + "grad_norm": 3.3918983936309814, + "learning_rate": 1.0528167237300129e-05, + "loss": 1.3668, + "step": 42788 + }, + { + "epoch": 0.5348883722093052, + "grad_norm": 1.963475227355957, + "learning_rate": 1.0527295764501926e-05, + "loss": 0.6675, + "step": 42790 + }, + { + "epoch": 0.5349133728343208, + "grad_norm": 0.24376071989536285, + "learning_rate": 1.052642428768791e-05, + "loss": 0.0953, + "step": 42792 + }, + { + "epoch": 0.5349383734593365, + "grad_norm": 3.7536022663116455, + "learning_rate": 1.0525552806864722e-05, + "loss": 1.325, + "step": 42794 + }, + { + "epoch": 0.5349633740843521, + "grad_norm": 1.6793859004974365, + "learning_rate": 1.0524681322039002e-05, + "loss": 0.5031, + "step": 42796 + }, + { + "epoch": 0.5349883747093678, + "grad_norm": 12.070199966430664, + "learning_rate": 1.052380983321738e-05, + "loss": 1.5008, + "step": 42798 + }, + { + "epoch": 0.5350133753343833, + "grad_norm": 4.187093257904053, + "learning_rate": 1.0522938340406491e-05, + "loss": 0.7516, + "step": 42800 + }, + { + "epoch": 0.535038375959399, + "grad_norm": 2.4405124187469482, + "learning_rate": 1.0522066843612986e-05, + "loss": 1.7205, + "step": 42802 + }, + { + "epoch": 0.5350633765844146, + "grad_norm": 6.059432506561279, + "learning_rate": 1.052119534284349e-05, + "loss": 2.0626, + "step": 42804 + }, + { + "epoch": 0.5350883772094303, + "grad_norm": 2.1867785453796387, + "learning_rate": 1.0520323838104645e-05, + "loss": 0.7132, + "step": 42806 + }, + { + "epoch": 0.5351133778344459, + "grad_norm": 0.0015822192654013634, + "learning_rate": 1.0519452329403085e-05, + "loss": 0.5762, + "step": 42808 + }, + { + "epoch": 0.5351383784594614, + "grad_norm": 2.156581401824951, + "learning_rate": 1.0518580816745454e-05, + "loss": 1.121, + "step": 42810 + }, + { + "epoch": 0.5351633790844771, + "grad_norm": 11.635347366333008, + "learning_rate": 1.0517709300138382e-05, + "loss": 1.1565, + "step": 42812 + }, + { + "epoch": 0.5351883797094927, + "grad_norm": 5.375754356384277, + "learning_rate": 1.0516837779588506e-05, + "loss": 2.0564, + "step": 42814 + }, + { + "epoch": 0.5352133803345084, + "grad_norm": 2.7764534950256348, + "learning_rate": 1.0515966255102472e-05, + "loss": 1.3491, + "step": 42816 + }, + { + "epoch": 0.535238380959524, + "grad_norm": 0.0018771521281450987, + "learning_rate": 1.051509472668691e-05, + "loss": 0.4142, + "step": 42818 + }, + { + "epoch": 0.5352633815845396, + "grad_norm": 4.686281681060791, + "learning_rate": 1.0514223194348458e-05, + "loss": 0.7855, + "step": 42820 + }, + { + "epoch": 0.5352883822095552, + "grad_norm": 0.0025654102209955454, + "learning_rate": 1.0513351658093757e-05, + "loss": 0.7648, + "step": 42822 + }, + { + "epoch": 0.5353133828345709, + "grad_norm": 2.745032787322998, + "learning_rate": 1.051248011792944e-05, + "loss": 0.6248, + "step": 42824 + }, + { + "epoch": 0.5353383834595865, + "grad_norm": 0.30743151903152466, + "learning_rate": 1.0511608573862148e-05, + "loss": 0.0069, + "step": 42826 + }, + { + "epoch": 0.5353633840846022, + "grad_norm": 0.0014309067046269774, + "learning_rate": 1.051073702589852e-05, + "loss": 1.176, + "step": 42828 + }, + { + "epoch": 0.5353883847096177, + "grad_norm": 0.0039892946369946, + "learning_rate": 1.0509865474045189e-05, + "loss": 0.0001, + "step": 42830 + }, + { + "epoch": 0.5354133853346333, + "grad_norm": 5.553815841674805, + "learning_rate": 1.0508993918308794e-05, + "loss": 1.0162, + "step": 42832 + }, + { + "epoch": 0.535438385959649, + "grad_norm": 0.21579600870609283, + "learning_rate": 1.0508122358695973e-05, + "loss": 0.9682, + "step": 42834 + }, + { + "epoch": 0.5354633865846646, + "grad_norm": 4.07489013671875, + "learning_rate": 1.0507250795213364e-05, + "loss": 0.4396, + "step": 42836 + }, + { + "epoch": 0.5354883872096803, + "grad_norm": 3.1458213329315186, + "learning_rate": 1.0506379227867605e-05, + "loss": 1.6922, + "step": 42838 + }, + { + "epoch": 0.5355133878346958, + "grad_norm": 0.5097916126251221, + "learning_rate": 1.0505507656665335e-05, + "loss": 0.0053, + "step": 42840 + }, + { + "epoch": 0.5355383884597115, + "grad_norm": 2.7534003257751465, + "learning_rate": 1.0504636081613191e-05, + "loss": 0.5336, + "step": 42842 + }, + { + "epoch": 0.5355633890847271, + "grad_norm": 0.28548872470855713, + "learning_rate": 1.0503764502717806e-05, + "loss": 0.4966, + "step": 42844 + }, + { + "epoch": 0.5355883897097428, + "grad_norm": 6.138446807861328, + "learning_rate": 1.0502892919985824e-05, + "loss": 0.2654, + "step": 42846 + }, + { + "epoch": 0.5356133903347584, + "grad_norm": 2.463064193725586, + "learning_rate": 1.050202133342388e-05, + "loss": 0.3265, + "step": 42848 + }, + { + "epoch": 0.535638390959774, + "grad_norm": 1.8308583498001099, + "learning_rate": 1.0501149743038612e-05, + "loss": 0.925, + "step": 42850 + }, + { + "epoch": 0.5356633915847896, + "grad_norm": 3.144176483154297, + "learning_rate": 1.050027814883666e-05, + "loss": 0.824, + "step": 42852 + }, + { + "epoch": 0.5356883922098052, + "grad_norm": 0.0029920402448624372, + "learning_rate": 1.049940655082466e-05, + "loss": 0.9311, + "step": 42854 + }, + { + "epoch": 0.5357133928348209, + "grad_norm": 2.4962732791900635, + "learning_rate": 1.0498534949009252e-05, + "loss": 0.5512, + "step": 42856 + }, + { + "epoch": 0.5357383934598365, + "grad_norm": 2.4819936752319336, + "learning_rate": 1.0497663343397069e-05, + "loss": 0.908, + "step": 42858 + }, + { + "epoch": 0.5357633940848521, + "grad_norm": 3.1447598934173584, + "learning_rate": 1.0496791733994753e-05, + "loss": 0.7554, + "step": 42860 + }, + { + "epoch": 0.5357883947098677, + "grad_norm": 0.30153921246528625, + "learning_rate": 1.0495920120808942e-05, + "loss": 0.4173, + "step": 42862 + }, + { + "epoch": 0.5358133953348834, + "grad_norm": 0.05870700627565384, + "learning_rate": 1.0495048503846273e-05, + "loss": 0.5256, + "step": 42864 + }, + { + "epoch": 0.535838395959899, + "grad_norm": 3.5370774269104004, + "learning_rate": 1.0494176883113384e-05, + "loss": 1.332, + "step": 42866 + }, + { + "epoch": 0.5358633965849147, + "grad_norm": 0.003418679116293788, + "learning_rate": 1.0493305258616916e-05, + "loss": 0.9943, + "step": 42868 + }, + { + "epoch": 0.5358883972099302, + "grad_norm": 2.3590643405914307, + "learning_rate": 1.0492433630363505e-05, + "loss": 0.7116, + "step": 42870 + }, + { + "epoch": 0.5359133978349458, + "grad_norm": 3.6074378490448, + "learning_rate": 1.0491561998359787e-05, + "loss": 1.5262, + "step": 42872 + }, + { + "epoch": 0.5359383984599615, + "grad_norm": 4.928585529327393, + "learning_rate": 1.0490690362612403e-05, + "loss": 0.8661, + "step": 42874 + }, + { + "epoch": 0.5359633990849771, + "grad_norm": 6.09382963180542, + "learning_rate": 1.0489818723127992e-05, + "loss": 1.2348, + "step": 42876 + }, + { + "epoch": 0.5359883997099928, + "grad_norm": 0.001807855092920363, + "learning_rate": 1.0488947079913187e-05, + "loss": 0.1711, + "step": 42878 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 4.709144592285156, + "learning_rate": 1.0488075432974635e-05, + "loss": 0.7038, + "step": 42880 + }, + { + "epoch": 0.536038400960024, + "grad_norm": 2.9129843711853027, + "learning_rate": 1.048720378231897e-05, + "loss": 0.9956, + "step": 42882 + }, + { + "epoch": 0.5360634015850396, + "grad_norm": 5.177168369293213, + "learning_rate": 1.0486332127952825e-05, + "loss": 2.0727, + "step": 42884 + }, + { + "epoch": 0.5360884022100553, + "grad_norm": 2.844656467437744, + "learning_rate": 1.0485460469882844e-05, + "loss": 0.3683, + "step": 42886 + }, + { + "epoch": 0.5361134028350709, + "grad_norm": 3.4171054363250732, + "learning_rate": 1.0484588808115668e-05, + "loss": 0.8522, + "step": 42888 + }, + { + "epoch": 0.5361384034600865, + "grad_norm": 0.0010813352419063449, + "learning_rate": 1.048371714265793e-05, + "loss": 0.6034, + "step": 42890 + }, + { + "epoch": 0.5361634040851021, + "grad_norm": 4.803134918212891, + "learning_rate": 1.048284547351627e-05, + "loss": 1.3208, + "step": 42892 + }, + { + "epoch": 0.5361884047101177, + "grad_norm": 2.8221452236175537, + "learning_rate": 1.048197380069733e-05, + "loss": 0.9101, + "step": 42894 + }, + { + "epoch": 0.5362134053351334, + "grad_norm": 2.7074480056762695, + "learning_rate": 1.0481102124207743e-05, + "loss": 1.1986, + "step": 42896 + }, + { + "epoch": 0.536238405960149, + "grad_norm": 0.7938486337661743, + "learning_rate": 1.0480230444054152e-05, + "loss": 0.0088, + "step": 42898 + }, + { + "epoch": 0.5362634065851646, + "grad_norm": 3.2960891723632812, + "learning_rate": 1.0479358760243192e-05, + "loss": 1.2831, + "step": 42900 + }, + { + "epoch": 0.5362884072101802, + "grad_norm": 4.0266804695129395, + "learning_rate": 1.0478487072781505e-05, + "loss": 0.8668, + "step": 42902 + }, + { + "epoch": 0.5363134078351959, + "grad_norm": 2.8033664226531982, + "learning_rate": 1.0477615381675728e-05, + "loss": 1.6706, + "step": 42904 + }, + { + "epoch": 0.5363384084602115, + "grad_norm": 0.019395360723137856, + "learning_rate": 1.0476743686932497e-05, + "loss": 0.0003, + "step": 42906 + }, + { + "epoch": 0.5363634090852272, + "grad_norm": 0.00768662104383111, + "learning_rate": 1.047587198855846e-05, + "loss": 0.2283, + "step": 42908 + }, + { + "epoch": 0.5363884097102427, + "grad_norm": 1.534250259399414, + "learning_rate": 1.0475000286560244e-05, + "loss": 0.6472, + "step": 42910 + }, + { + "epoch": 0.5364134103352584, + "grad_norm": 5.103336811065674, + "learning_rate": 1.0474128580944494e-05, + "loss": 1.1787, + "step": 42912 + }, + { + "epoch": 0.536438410960274, + "grad_norm": 3.464934825897217, + "learning_rate": 1.0473256871717847e-05, + "loss": 1.2058, + "step": 42914 + }, + { + "epoch": 0.5364634115852897, + "grad_norm": 1.7967795133590698, + "learning_rate": 1.0472385158886944e-05, + "loss": 0.2639, + "step": 42916 + }, + { + "epoch": 0.5364884122103053, + "grad_norm": 6.876976490020752, + "learning_rate": 1.0471513442458421e-05, + "loss": 1.643, + "step": 42918 + }, + { + "epoch": 0.5365134128353208, + "grad_norm": 3.0383903980255127, + "learning_rate": 1.0470641722438922e-05, + "loss": 1.2834, + "step": 42920 + }, + { + "epoch": 0.5365384134603365, + "grad_norm": 3.6411869525909424, + "learning_rate": 1.0469769998835078e-05, + "loss": 0.9634, + "step": 42922 + }, + { + "epoch": 0.5365634140853521, + "grad_norm": 2.9342877864837646, + "learning_rate": 1.0468898271653534e-05, + "loss": 1.5107, + "step": 42924 + }, + { + "epoch": 0.5365884147103678, + "grad_norm": 3.743197202682495, + "learning_rate": 1.0468026540900927e-05, + "loss": 0.6842, + "step": 42926 + }, + { + "epoch": 0.5366134153353834, + "grad_norm": 3.3298208713531494, + "learning_rate": 1.0467154806583895e-05, + "loss": 1.3196, + "step": 42928 + }, + { + "epoch": 0.536638415960399, + "grad_norm": 3.4478373527526855, + "learning_rate": 1.0466283068709078e-05, + "loss": 1.535, + "step": 42930 + }, + { + "epoch": 0.5366634165854146, + "grad_norm": 3.600647211074829, + "learning_rate": 1.0465411327283116e-05, + "loss": 0.8042, + "step": 42932 + }, + { + "epoch": 0.5366884172104303, + "grad_norm": 3.6331775188446045, + "learning_rate": 1.0464539582312647e-05, + "loss": 1.2532, + "step": 42934 + }, + { + "epoch": 0.5367134178354459, + "grad_norm": 2.7670340538024902, + "learning_rate": 1.046366783380431e-05, + "loss": 1.1141, + "step": 42936 + }, + { + "epoch": 0.5367384184604616, + "grad_norm": 4.4058518409729, + "learning_rate": 1.0462796081764746e-05, + "loss": 0.5319, + "step": 42938 + }, + { + "epoch": 0.5367634190854771, + "grad_norm": 2.911363124847412, + "learning_rate": 1.046192432620059e-05, + "loss": 0.5488, + "step": 42940 + }, + { + "epoch": 0.5367884197104927, + "grad_norm": 3.073251485824585, + "learning_rate": 1.0461052567118483e-05, + "loss": 1.4807, + "step": 42942 + }, + { + "epoch": 0.5368134203355084, + "grad_norm": 4.230746269226074, + "learning_rate": 1.0460180804525067e-05, + "loss": 1.2752, + "step": 42944 + }, + { + "epoch": 0.536838420960524, + "grad_norm": 3.0463500022888184, + "learning_rate": 1.045930903842698e-05, + "loss": 1.1536, + "step": 42946 + }, + { + "epoch": 0.5368634215855397, + "grad_norm": 0.25936514139175415, + "learning_rate": 1.0458437268830859e-05, + "loss": 0.5693, + "step": 42948 + }, + { + "epoch": 0.5368884222105552, + "grad_norm": 3.3062140941619873, + "learning_rate": 1.0457565495743345e-05, + "loss": 1.2285, + "step": 42950 + }, + { + "epoch": 0.5369134228355709, + "grad_norm": 3.2731130123138428, + "learning_rate": 1.0456693719171076e-05, + "loss": 1.2118, + "step": 42952 + }, + { + "epoch": 0.5369384234605865, + "grad_norm": 5.613951683044434, + "learning_rate": 1.0455821939120692e-05, + "loss": 1.5883, + "step": 42954 + }, + { + "epoch": 0.5369634240856022, + "grad_norm": 3.686100721359253, + "learning_rate": 1.0454950155598833e-05, + "loss": 0.6424, + "step": 42956 + }, + { + "epoch": 0.5369884247106178, + "grad_norm": 0.9451441168785095, + "learning_rate": 1.0454078368612137e-05, + "loss": 0.042, + "step": 42958 + }, + { + "epoch": 0.5370134253356333, + "grad_norm": 1.993855595588684, + "learning_rate": 1.0453206578167248e-05, + "loss": 1.0597, + "step": 42960 + }, + { + "epoch": 0.537038425960649, + "grad_norm": 20.400056838989258, + "learning_rate": 1.0452334784270799e-05, + "loss": 0.5564, + "step": 42962 + }, + { + "epoch": 0.5370634265856646, + "grad_norm": 4.538066864013672, + "learning_rate": 1.045146298692943e-05, + "loss": 1.3286, + "step": 42964 + }, + { + "epoch": 0.5370884272106803, + "grad_norm": 2.025033950805664, + "learning_rate": 1.0450591186149788e-05, + "loss": 1.0547, + "step": 42966 + }, + { + "epoch": 0.5371134278356959, + "grad_norm": 6.323965549468994, + "learning_rate": 1.0449719381938504e-05, + "loss": 1.2747, + "step": 42968 + }, + { + "epoch": 0.5371384284607115, + "grad_norm": 2.6382858753204346, + "learning_rate": 1.044884757430222e-05, + "loss": 1.2782, + "step": 42970 + }, + { + "epoch": 0.5371634290857271, + "grad_norm": 3.287637710571289, + "learning_rate": 1.044797576324758e-05, + "loss": 1.1145, + "step": 42972 + }, + { + "epoch": 0.5371884297107428, + "grad_norm": 1.4905405044555664, + "learning_rate": 1.0447103948781217e-05, + "loss": 0.3428, + "step": 42974 + }, + { + "epoch": 0.5372134303357584, + "grad_norm": 4.374811172485352, + "learning_rate": 1.0446232130909773e-05, + "loss": 1.2268, + "step": 42976 + }, + { + "epoch": 0.5372384309607741, + "grad_norm": 3.2137513160705566, + "learning_rate": 1.044536030963989e-05, + "loss": 0.5807, + "step": 42978 + }, + { + "epoch": 0.5372634315857896, + "grad_norm": 2.162189483642578, + "learning_rate": 1.0444488484978204e-05, + "loss": 1.0884, + "step": 42980 + }, + { + "epoch": 0.5372884322108052, + "grad_norm": 4.260162830352783, + "learning_rate": 1.0443616656931359e-05, + "loss": 1.1705, + "step": 42982 + }, + { + "epoch": 0.5373134328358209, + "grad_norm": 0.002480885712429881, + "learning_rate": 1.044274482550599e-05, + "loss": 0.7109, + "step": 42984 + }, + { + "epoch": 0.5373384334608365, + "grad_norm": 2.6035208702087402, + "learning_rate": 1.0441872990708742e-05, + "loss": 1.1578, + "step": 42986 + }, + { + "epoch": 0.5373634340858522, + "grad_norm": 3.6280362606048584, + "learning_rate": 1.0441001152546248e-05, + "loss": 1.4216, + "step": 42988 + }, + { + "epoch": 0.5373884347108677, + "grad_norm": 3.0462794303894043, + "learning_rate": 1.0440129311025151e-05, + "loss": 1.0207, + "step": 42990 + }, + { + "epoch": 0.5374134353358834, + "grad_norm": 0.001447803690098226, + "learning_rate": 1.0439257466152098e-05, + "loss": 0.0263, + "step": 42992 + }, + { + "epoch": 0.537438435960899, + "grad_norm": 1.797793984413147, + "learning_rate": 1.0438385617933717e-05, + "loss": 0.8208, + "step": 42994 + }, + { + "epoch": 0.5374634365859147, + "grad_norm": 3.5779948234558105, + "learning_rate": 1.0437513766376652e-05, + "loss": 0.2315, + "step": 42996 + }, + { + "epoch": 0.5374884372109303, + "grad_norm": 0.004429287277162075, + "learning_rate": 1.0436641911487548e-05, + "loss": 0.444, + "step": 42998 + }, + { + "epoch": 0.5375134378359459, + "grad_norm": 1.6762467622756958, + "learning_rate": 1.043577005327304e-05, + "loss": 0.2451, + "step": 43000 + }, + { + "epoch": 0.5375384384609615, + "grad_norm": 2.070568323135376, + "learning_rate": 1.0434898191739767e-05, + "loss": 0.7014, + "step": 43002 + }, + { + "epoch": 0.5375634390859771, + "grad_norm": 0.011409858241677284, + "learning_rate": 1.0434026326894373e-05, + "loss": 0.6302, + "step": 43004 + }, + { + "epoch": 0.5375884397109928, + "grad_norm": 0.0430738627910614, + "learning_rate": 1.0433154458743496e-05, + "loss": 0.0005, + "step": 43006 + }, + { + "epoch": 0.5376134403360084, + "grad_norm": 3.4654088020324707, + "learning_rate": 1.0432282587293771e-05, + "loss": 1.5478, + "step": 43008 + }, + { + "epoch": 0.537638440961024, + "grad_norm": 0.0010740126017481089, + "learning_rate": 1.0431410712551848e-05, + "loss": 0.0275, + "step": 43010 + }, + { + "epoch": 0.5376634415860396, + "grad_norm": 1.6512494087219238, + "learning_rate": 1.0430538834524361e-05, + "loss": 0.5715, + "step": 43012 + }, + { + "epoch": 0.5376884422110553, + "grad_norm": 0.0007607892621308565, + "learning_rate": 1.042966695321795e-05, + "loss": 0.0, + "step": 43014 + }, + { + "epoch": 0.5377134428360709, + "grad_norm": 3.2846145629882812, + "learning_rate": 1.0428795068639258e-05, + "loss": 1.2002, + "step": 43016 + }, + { + "epoch": 0.5377384434610866, + "grad_norm": 14.096699714660645, + "learning_rate": 1.0427923180794924e-05, + "loss": 1.521, + "step": 43018 + }, + { + "epoch": 0.5377634440861021, + "grad_norm": 6.005882740020752, + "learning_rate": 1.0427051289691585e-05, + "loss": 0.778, + "step": 43020 + }, + { + "epoch": 0.5377884447111178, + "grad_norm": 3.6943602561950684, + "learning_rate": 1.0426179395335885e-05, + "loss": 0.7826, + "step": 43022 + }, + { + "epoch": 0.5378134453361334, + "grad_norm": 3.119999408721924, + "learning_rate": 1.0425307497734462e-05, + "loss": 1.34, + "step": 43024 + }, + { + "epoch": 0.537838445961149, + "grad_norm": 1.53912353515625, + "learning_rate": 1.0424435596893957e-05, + "loss": 0.3969, + "step": 43026 + }, + { + "epoch": 0.5378634465861647, + "grad_norm": 5.336930274963379, + "learning_rate": 1.0423563692821011e-05, + "loss": 1.3435, + "step": 43028 + }, + { + "epoch": 0.5378884472111802, + "grad_norm": 1.9582033157348633, + "learning_rate": 1.0422691785522263e-05, + "loss": 0.3387, + "step": 43030 + }, + { + "epoch": 0.5379134478361959, + "grad_norm": 3.478347063064575, + "learning_rate": 1.0421819875004358e-05, + "loss": 1.4618, + "step": 43032 + }, + { + "epoch": 0.5379384484612115, + "grad_norm": 5.129927635192871, + "learning_rate": 1.042094796127393e-05, + "loss": 0.5016, + "step": 43034 + }, + { + "epoch": 0.5379634490862272, + "grad_norm": 2.592397451400757, + "learning_rate": 1.0420076044337621e-05, + "loss": 1.4739, + "step": 43036 + }, + { + "epoch": 0.5379884497112428, + "grad_norm": 0.24632839858531952, + "learning_rate": 1.0419204124202074e-05, + "loss": 0.5124, + "step": 43038 + }, + { + "epoch": 0.5380134503362584, + "grad_norm": 5.586838245391846, + "learning_rate": 1.0418332200873927e-05, + "loss": 1.0164, + "step": 43040 + }, + { + "epoch": 0.538038450961274, + "grad_norm": 7.158051013946533, + "learning_rate": 1.0417460274359818e-05, + "loss": 2.1346, + "step": 43042 + }, + { + "epoch": 0.5380634515862897, + "grad_norm": 2.461334228515625, + "learning_rate": 1.0416588344666397e-05, + "loss": 1.2897, + "step": 43044 + }, + { + "epoch": 0.5380884522113053, + "grad_norm": 3.0396924018859863, + "learning_rate": 1.0415716411800293e-05, + "loss": 1.6391, + "step": 43046 + }, + { + "epoch": 0.538113452836321, + "grad_norm": 1.963258147239685, + "learning_rate": 1.0414844475768153e-05, + "loss": 0.5677, + "step": 43048 + }, + { + "epoch": 0.5381384534613365, + "grad_norm": 5.146728992462158, + "learning_rate": 1.0413972536576618e-05, + "loss": 1.2962, + "step": 43050 + }, + { + "epoch": 0.5381634540863521, + "grad_norm": 0.003250245936214924, + "learning_rate": 1.0413100594232324e-05, + "loss": 0.5395, + "step": 43052 + }, + { + "epoch": 0.5381884547113678, + "grad_norm": 0.00989717710763216, + "learning_rate": 1.0412228648741913e-05, + "loss": 0.0001, + "step": 43054 + }, + { + "epoch": 0.5382134553363834, + "grad_norm": 0.18943087756633759, + "learning_rate": 1.041135670011203e-05, + "loss": 0.2127, + "step": 43056 + }, + { + "epoch": 0.5382384559613991, + "grad_norm": 3.2568066120147705, + "learning_rate": 1.0410484748349314e-05, + "loss": 0.8587, + "step": 43058 + }, + { + "epoch": 0.5382634565864146, + "grad_norm": 4.914600849151611, + "learning_rate": 1.0409612793460403e-05, + "loss": 1.937, + "step": 43060 + }, + { + "epoch": 0.5382884572114303, + "grad_norm": 5.123954772949219, + "learning_rate": 1.0408740835451935e-05, + "loss": 1.81, + "step": 43062 + }, + { + "epoch": 0.5383134578364459, + "grad_norm": 0.002398607786744833, + "learning_rate": 1.0407868874330557e-05, + "loss": 1.2468, + "step": 43064 + }, + { + "epoch": 0.5383384584614616, + "grad_norm": 3.9848787784576416, + "learning_rate": 1.0406996910102908e-05, + "loss": 1.759, + "step": 43066 + }, + { + "epoch": 0.5383634590864772, + "grad_norm": 3.1851699352264404, + "learning_rate": 1.0406124942775628e-05, + "loss": 1.4851, + "step": 43068 + }, + { + "epoch": 0.5383884597114927, + "grad_norm": 0.0006330095347948372, + "learning_rate": 1.040525297235536e-05, + "loss": 0.2894, + "step": 43070 + }, + { + "epoch": 0.5384134603365084, + "grad_norm": 3.0742945671081543, + "learning_rate": 1.040438099884874e-05, + "loss": 1.3322, + "step": 43072 + }, + { + "epoch": 0.538438460961524, + "grad_norm": 0.00294725364074111, + "learning_rate": 1.040350902226241e-05, + "loss": 0.4222, + "step": 43074 + }, + { + "epoch": 0.5384634615865397, + "grad_norm": 2.6678736209869385, + "learning_rate": 1.0402637042603015e-05, + "loss": 0.3065, + "step": 43076 + }, + { + "epoch": 0.5384884622115553, + "grad_norm": 2.8458783626556396, + "learning_rate": 1.0401765059877191e-05, + "loss": 0.7684, + "step": 43078 + }, + { + "epoch": 0.5385134628365709, + "grad_norm": 7.244773864746094, + "learning_rate": 1.0400893074091583e-05, + "loss": 1.1968, + "step": 43080 + }, + { + "epoch": 0.5385384634615865, + "grad_norm": 2.343928098678589, + "learning_rate": 1.0400021085252829e-05, + "loss": 0.8692, + "step": 43082 + }, + { + "epoch": 0.5385634640866022, + "grad_norm": 0.3183223605155945, + "learning_rate": 1.0399149093367574e-05, + "loss": 0.0035, + "step": 43084 + }, + { + "epoch": 0.5385884647116178, + "grad_norm": 2.0047857761383057, + "learning_rate": 1.0398277098442452e-05, + "loss": 0.5373, + "step": 43086 + }, + { + "epoch": 0.5386134653366335, + "grad_norm": 5.506577968597412, + "learning_rate": 1.039740510048411e-05, + "loss": 0.3928, + "step": 43088 + }, + { + "epoch": 0.538638465961649, + "grad_norm": 0.003875754540786147, + "learning_rate": 1.0396533099499186e-05, + "loss": 0.4453, + "step": 43090 + }, + { + "epoch": 0.5386634665866646, + "grad_norm": 2.5169503688812256, + "learning_rate": 1.0395661095494321e-05, + "loss": 1.5829, + "step": 43092 + }, + { + "epoch": 0.5386884672116803, + "grad_norm": 5.002411842346191, + "learning_rate": 1.0394789088476158e-05, + "loss": 1.6422, + "step": 43094 + }, + { + "epoch": 0.5387134678366959, + "grad_norm": 1.3296122550964355, + "learning_rate": 1.0393917078451342e-05, + "loss": 0.956, + "step": 43096 + }, + { + "epoch": 0.5387384684617116, + "grad_norm": 3.0373804569244385, + "learning_rate": 1.0393045065426502e-05, + "loss": 1.5697, + "step": 43098 + }, + { + "epoch": 0.5387634690867271, + "grad_norm": 0.001424314803443849, + "learning_rate": 1.0392173049408289e-05, + "loss": 0.2771, + "step": 43100 + }, + { + "epoch": 0.5387884697117428, + "grad_norm": 1.6067224740982056, + "learning_rate": 1.0391301030403342e-05, + "loss": 0.8586, + "step": 43102 + }, + { + "epoch": 0.5388134703367584, + "grad_norm": 2.75050687789917, + "learning_rate": 1.0390429008418302e-05, + "loss": 0.5817, + "step": 43104 + }, + { + "epoch": 0.5388384709617741, + "grad_norm": 2.8398337364196777, + "learning_rate": 1.0389556983459808e-05, + "loss": 1.0634, + "step": 43106 + }, + { + "epoch": 0.5388634715867897, + "grad_norm": 0.0010603736154735088, + "learning_rate": 1.0388684955534502e-05, + "loss": 1.2143, + "step": 43108 + }, + { + "epoch": 0.5388884722118052, + "grad_norm": 3.8421263694763184, + "learning_rate": 1.0387812924649033e-05, + "loss": 1.6795, + "step": 43110 + }, + { + "epoch": 0.5389134728368209, + "grad_norm": 13.4544038772583, + "learning_rate": 1.038694089081003e-05, + "loss": 1.925, + "step": 43112 + }, + { + "epoch": 0.5389384734618365, + "grad_norm": 5.517475605010986, + "learning_rate": 1.038606885402414e-05, + "loss": 1.5634, + "step": 43114 + }, + { + "epoch": 0.5389634740868522, + "grad_norm": 3.1417436599731445, + "learning_rate": 1.0385196814298006e-05, + "loss": 0.9383, + "step": 43116 + }, + { + "epoch": 0.5389884747118678, + "grad_norm": 3.5293033123016357, + "learning_rate": 1.0384324771638266e-05, + "loss": 0.7012, + "step": 43118 + }, + { + "epoch": 0.5390134753368834, + "grad_norm": 0.007047614082694054, + "learning_rate": 1.0383452726051564e-05, + "loss": 0.0061, + "step": 43120 + }, + { + "epoch": 0.539038475961899, + "grad_norm": 1.614093542098999, + "learning_rate": 1.0382580677544543e-05, + "loss": 0.9556, + "step": 43122 + }, + { + "epoch": 0.5390634765869147, + "grad_norm": 5.005822658538818, + "learning_rate": 1.0381708626123837e-05, + "loss": 2.681, + "step": 43124 + }, + { + "epoch": 0.5390884772119303, + "grad_norm": 3.8421101570129395, + "learning_rate": 1.0380836571796094e-05, + "loss": 1.2149, + "step": 43126 + }, + { + "epoch": 0.539113477836946, + "grad_norm": 2.1807634830474854, + "learning_rate": 1.0379964514567952e-05, + "loss": 1.103, + "step": 43128 + }, + { + "epoch": 0.5391384784619615, + "grad_norm": 4.227362155914307, + "learning_rate": 1.0379092454446056e-05, + "loss": 0.5169, + "step": 43130 + }, + { + "epoch": 0.5391634790869771, + "grad_norm": 4.516197681427002, + "learning_rate": 1.0378220391437044e-05, + "loss": 0.8841, + "step": 43132 + }, + { + "epoch": 0.5391884797119928, + "grad_norm": 3.2120542526245117, + "learning_rate": 1.037734832554756e-05, + "loss": 0.7222, + "step": 43134 + }, + { + "epoch": 0.5392134803370084, + "grad_norm": 0.0036042050924152136, + "learning_rate": 1.0376476256784247e-05, + "loss": 0.7425, + "step": 43136 + }, + { + "epoch": 0.5392384809620241, + "grad_norm": 2.2581238746643066, + "learning_rate": 1.037560418515374e-05, + "loss": 1.1839, + "step": 43138 + }, + { + "epoch": 0.5392634815870396, + "grad_norm": 17.559537887573242, + "learning_rate": 1.0374732110662686e-05, + "loss": 0.6813, + "step": 43140 + }, + { + "epoch": 0.5392884822120553, + "grad_norm": 5.7378764152526855, + "learning_rate": 1.0373860033317724e-05, + "loss": 1.9119, + "step": 43142 + }, + { + "epoch": 0.5393134828370709, + "grad_norm": 1.037759780883789, + "learning_rate": 1.0372987953125498e-05, + "loss": 0.7432, + "step": 43144 + }, + { + "epoch": 0.5393384834620866, + "grad_norm": 3.0601351261138916, + "learning_rate": 1.037211587009265e-05, + "loss": 0.7586, + "step": 43146 + }, + { + "epoch": 0.5393634840871022, + "grad_norm": 2.270529270172119, + "learning_rate": 1.037124378422582e-05, + "loss": 0.3988, + "step": 43148 + }, + { + "epoch": 0.5393884847121178, + "grad_norm": 3.4991750717163086, + "learning_rate": 1.0370371695531647e-05, + "loss": 0.8547, + "step": 43150 + }, + { + "epoch": 0.5394134853371334, + "grad_norm": 4.201278209686279, + "learning_rate": 1.0369499604016777e-05, + "loss": 1.4029, + "step": 43152 + }, + { + "epoch": 0.539438485962149, + "grad_norm": 5.5814948081970215, + "learning_rate": 1.0368627509687849e-05, + "loss": 1.3083, + "step": 43154 + }, + { + "epoch": 0.5394634865871647, + "grad_norm": 4.155534267425537, + "learning_rate": 1.0367755412551508e-05, + "loss": 1.0192, + "step": 43156 + }, + { + "epoch": 0.5394884872121803, + "grad_norm": 1.1529918909072876, + "learning_rate": 1.0366883312614393e-05, + "loss": 0.8674, + "step": 43158 + }, + { + "epoch": 0.5395134878371959, + "grad_norm": 5.5242180824279785, + "learning_rate": 1.0366011209883147e-05, + "loss": 0.9439, + "step": 43160 + }, + { + "epoch": 0.5395384884622115, + "grad_norm": 0.5476251840591431, + "learning_rate": 1.0365139104364412e-05, + "loss": 1.068, + "step": 43162 + }, + { + "epoch": 0.5395634890872272, + "grad_norm": 0.003755598096176982, + "learning_rate": 1.0364266996064829e-05, + "loss": 0.4597, + "step": 43164 + }, + { + "epoch": 0.5395884897122428, + "grad_norm": 8.986529350280762, + "learning_rate": 1.0363394884991036e-05, + "loss": 2.3997, + "step": 43166 + }, + { + "epoch": 0.5396134903372585, + "grad_norm": 2.707557201385498, + "learning_rate": 1.0362522771149686e-05, + "loss": 0.6976, + "step": 43168 + }, + { + "epoch": 0.539638490962274, + "grad_norm": 0.032500579953193665, + "learning_rate": 1.0361650654547408e-05, + "loss": 0.9374, + "step": 43170 + }, + { + "epoch": 0.5396634915872897, + "grad_norm": 3.172910690307617, + "learning_rate": 1.0360778535190851e-05, + "loss": 0.965, + "step": 43172 + }, + { + "epoch": 0.5396884922123053, + "grad_norm": 2.5515809059143066, + "learning_rate": 1.035990641308666e-05, + "loss": 1.3786, + "step": 43174 + }, + { + "epoch": 0.539713492837321, + "grad_norm": 5.999431133270264, + "learning_rate": 1.035903428824147e-05, + "loss": 1.1423, + "step": 43176 + }, + { + "epoch": 0.5397384934623366, + "grad_norm": 4.688346862792969, + "learning_rate": 1.0358162160661924e-05, + "loss": 0.9688, + "step": 43178 + }, + { + "epoch": 0.5397634940873521, + "grad_norm": 4.646968364715576, + "learning_rate": 1.0357290030354663e-05, + "loss": 0.7734, + "step": 43180 + }, + { + "epoch": 0.5397884947123678, + "grad_norm": 7.065082550048828, + "learning_rate": 1.0356417897326338e-05, + "loss": 1.877, + "step": 43182 + }, + { + "epoch": 0.5398134953373834, + "grad_norm": 1.2885065078735352, + "learning_rate": 1.0355545761583582e-05, + "loss": 0.2437, + "step": 43184 + }, + { + "epoch": 0.5398384959623991, + "grad_norm": 4.395711421966553, + "learning_rate": 1.0354673623133037e-05, + "loss": 1.9457, + "step": 43186 + }, + { + "epoch": 0.5398634965874147, + "grad_norm": 0.3811674118041992, + "learning_rate": 1.0353801481981354e-05, + "loss": 0.0282, + "step": 43188 + }, + { + "epoch": 0.5398884972124303, + "grad_norm": 3.186366081237793, + "learning_rate": 1.0352929338135164e-05, + "loss": 1.5865, + "step": 43190 + }, + { + "epoch": 0.5399134978374459, + "grad_norm": 1.3680676221847534, + "learning_rate": 1.0352057191601115e-05, + "loss": 0.1939, + "step": 43192 + }, + { + "epoch": 0.5399384984624616, + "grad_norm": 6.929533958435059, + "learning_rate": 1.035118504238585e-05, + "loss": 0.5869, + "step": 43194 + }, + { + "epoch": 0.5399634990874772, + "grad_norm": 0.002247779630124569, + "learning_rate": 1.0350312890496008e-05, + "loss": 0.2078, + "step": 43196 + }, + { + "epoch": 0.5399884997124929, + "grad_norm": 2.1655006408691406, + "learning_rate": 1.0349440735938233e-05, + "loss": 0.8726, + "step": 43198 + }, + { + "epoch": 0.5400135003375084, + "grad_norm": 0.5866329669952393, + "learning_rate": 1.0348568578719166e-05, + "loss": 1.3345, + "step": 43200 + }, + { + "epoch": 0.540038500962524, + "grad_norm": 3.3297033309936523, + "learning_rate": 1.0347696418845453e-05, + "loss": 1.3546, + "step": 43202 + }, + { + "epoch": 0.5400635015875397, + "grad_norm": 5.035799026489258, + "learning_rate": 1.0346824256323726e-05, + "loss": 0.5912, + "step": 43204 + }, + { + "epoch": 0.5400885022125553, + "grad_norm": 5.132823467254639, + "learning_rate": 1.0345952091160641e-05, + "loss": 0.9337, + "step": 43206 + }, + { + "epoch": 0.540113502837571, + "grad_norm": 1.4850677251815796, + "learning_rate": 1.0345079923362833e-05, + "loss": 0.0642, + "step": 43208 + }, + { + "epoch": 0.5401385034625865, + "grad_norm": 2.461704730987549, + "learning_rate": 1.0344207752936944e-05, + "loss": 0.6271, + "step": 43210 + }, + { + "epoch": 0.5401635040876022, + "grad_norm": 9.297874450683594, + "learning_rate": 1.0343335579889617e-05, + "loss": 0.6054, + "step": 43212 + }, + { + "epoch": 0.5401885047126178, + "grad_norm": 3.711327314376831, + "learning_rate": 1.0342463404227496e-05, + "loss": 1.3674, + "step": 43214 + }, + { + "epoch": 0.5402135053376335, + "grad_norm": 4.50025749206543, + "learning_rate": 1.034159122595722e-05, + "loss": 1.2939, + "step": 43216 + }, + { + "epoch": 0.5402385059626491, + "grad_norm": 4.114932537078857, + "learning_rate": 1.0340719045085434e-05, + "loss": 0.8268, + "step": 43218 + }, + { + "epoch": 0.5402635065876646, + "grad_norm": 6.088768005371094, + "learning_rate": 1.0339846861618783e-05, + "loss": 1.4911, + "step": 43220 + }, + { + "epoch": 0.5402885072126803, + "grad_norm": 0.0010715165408328176, + "learning_rate": 1.0338974675563905e-05, + "loss": 0.0006, + "step": 43222 + }, + { + "epoch": 0.5403135078376959, + "grad_norm": 2.718385934829712, + "learning_rate": 1.0338102486927441e-05, + "loss": 0.6911, + "step": 43224 + }, + { + "epoch": 0.5403385084627116, + "grad_norm": 1.5266553163528442, + "learning_rate": 1.0337230295716041e-05, + "loss": 0.1405, + "step": 43226 + }, + { + "epoch": 0.5403635090877272, + "grad_norm": 4.923636436462402, + "learning_rate": 1.033635810193634e-05, + "loss": 0.5358, + "step": 43228 + }, + { + "epoch": 0.5403885097127428, + "grad_norm": 4.4057841300964355, + "learning_rate": 1.0335485905594983e-05, + "loss": 1.9594, + "step": 43230 + }, + { + "epoch": 0.5404135103377584, + "grad_norm": 3.5641918182373047, + "learning_rate": 1.033461370669861e-05, + "loss": 1.835, + "step": 43232 + }, + { + "epoch": 0.5404385109627741, + "grad_norm": 4.642385005950928, + "learning_rate": 1.0333741505253874e-05, + "loss": 0.1071, + "step": 43234 + }, + { + "epoch": 0.5404635115877897, + "grad_norm": 0.5767835378646851, + "learning_rate": 1.0332869301267402e-05, + "loss": 0.8467, + "step": 43236 + }, + { + "epoch": 0.5404885122128054, + "grad_norm": 4.55694055557251, + "learning_rate": 1.033199709474585e-05, + "loss": 1.3763, + "step": 43238 + }, + { + "epoch": 0.5405135128378209, + "grad_norm": 1.6516321897506714, + "learning_rate": 1.0331124885695852e-05, + "loss": 1.0293, + "step": 43240 + }, + { + "epoch": 0.5405385134628365, + "grad_norm": 3.9524214267730713, + "learning_rate": 1.0330252674124055e-05, + "loss": 0.904, + "step": 43242 + }, + { + "epoch": 0.5405635140878522, + "grad_norm": 9.346524238586426, + "learning_rate": 1.0329380460037099e-05, + "loss": 1.6745, + "step": 43244 + }, + { + "epoch": 0.5405885147128678, + "grad_norm": 1.7714306116104126, + "learning_rate": 1.0328508243441631e-05, + "loss": 0.8669, + "step": 43246 + }, + { + "epoch": 0.5406135153378835, + "grad_norm": 0.7948061227798462, + "learning_rate": 1.0327636024344287e-05, + "loss": 0.0131, + "step": 43248 + }, + { + "epoch": 0.540638515962899, + "grad_norm": 2.7954814434051514, + "learning_rate": 1.0326763802751716e-05, + "loss": 1.2844, + "step": 43250 + }, + { + "epoch": 0.5406635165879147, + "grad_norm": 4.076493740081787, + "learning_rate": 1.0325891578670559e-05, + "loss": 1.1605, + "step": 43252 + }, + { + "epoch": 0.5406885172129303, + "grad_norm": 0.0019542109221220016, + "learning_rate": 1.0325019352107453e-05, + "loss": 0.9593, + "step": 43254 + }, + { + "epoch": 0.540713517837946, + "grad_norm": 3.8414418697357178, + "learning_rate": 1.0324147123069047e-05, + "loss": 1.0932, + "step": 43256 + }, + { + "epoch": 0.5407385184629616, + "grad_norm": 0.003902094205841422, + "learning_rate": 1.0323274891561985e-05, + "loss": 0.5421, + "step": 43258 + }, + { + "epoch": 0.5407635190879772, + "grad_norm": 5.834115982055664, + "learning_rate": 1.0322402657592907e-05, + "loss": 0.9682, + "step": 43260 + }, + { + "epoch": 0.5407885197129928, + "grad_norm": 0.007839365862309933, + "learning_rate": 1.0321530421168456e-05, + "loss": 0.4828, + "step": 43262 + }, + { + "epoch": 0.5408135203380084, + "grad_norm": 2.467041492462158, + "learning_rate": 1.0320658182295273e-05, + "loss": 1.4699, + "step": 43264 + }, + { + "epoch": 0.5408385209630241, + "grad_norm": 3.09505558013916, + "learning_rate": 1.0319785940980002e-05, + "loss": 0.9262, + "step": 43266 + }, + { + "epoch": 0.5408635215880397, + "grad_norm": 2.1223480701446533, + "learning_rate": 1.031891369722929e-05, + "loss": 1.2667, + "step": 43268 + }, + { + "epoch": 0.5408885222130553, + "grad_norm": 16.785215377807617, + "learning_rate": 1.0318041451049771e-05, + "loss": 2.2783, + "step": 43270 + }, + { + "epoch": 0.5409135228380709, + "grad_norm": 3.441452741622925, + "learning_rate": 1.03171692024481e-05, + "loss": 2.3785, + "step": 43272 + }, + { + "epoch": 0.5409385234630866, + "grad_norm": 0.0009463176247663796, + "learning_rate": 1.0316296951430909e-05, + "loss": 0.364, + "step": 43274 + }, + { + "epoch": 0.5409635240881022, + "grad_norm": 4.280930042266846, + "learning_rate": 1.0315424698004846e-05, + "loss": 1.1422, + "step": 43276 + }, + { + "epoch": 0.5409885247131179, + "grad_norm": 3.7170214653015137, + "learning_rate": 1.0314552442176553e-05, + "loss": 1.2572, + "step": 43278 + }, + { + "epoch": 0.5410135253381334, + "grad_norm": 1.445103406906128, + "learning_rate": 1.0313680183952672e-05, + "loss": 0.7025, + "step": 43280 + }, + { + "epoch": 0.541038525963149, + "grad_norm": 2.2904441356658936, + "learning_rate": 1.0312807923339846e-05, + "loss": 1.5716, + "step": 43282 + }, + { + "epoch": 0.5410635265881647, + "grad_norm": 2.8634555339813232, + "learning_rate": 1.0311935660344723e-05, + "loss": 1.4701, + "step": 43284 + }, + { + "epoch": 0.5410885272131803, + "grad_norm": 3.5462522506713867, + "learning_rate": 1.031106339497394e-05, + "loss": 1.1443, + "step": 43286 + }, + { + "epoch": 0.541113527838196, + "grad_norm": 3.658871650695801, + "learning_rate": 1.0310191127234144e-05, + "loss": 1.1213, + "step": 43288 + }, + { + "epoch": 0.5411385284632115, + "grad_norm": 3.8305795192718506, + "learning_rate": 1.0309318857131974e-05, + "loss": 1.3047, + "step": 43290 + }, + { + "epoch": 0.5411635290882272, + "grad_norm": 2.426657199859619, + "learning_rate": 1.0308446584674077e-05, + "loss": 0.6072, + "step": 43292 + }, + { + "epoch": 0.5411885297132428, + "grad_norm": 3.875218629837036, + "learning_rate": 1.0307574309867092e-05, + "loss": 1.2016, + "step": 43294 + }, + { + "epoch": 0.5412135303382585, + "grad_norm": 3.5859148502349854, + "learning_rate": 1.0306702032717665e-05, + "loss": 0.3313, + "step": 43296 + }, + { + "epoch": 0.5412385309632741, + "grad_norm": 1.4656963348388672, + "learning_rate": 1.0305829753232444e-05, + "loss": 1.0265, + "step": 43298 + }, + { + "epoch": 0.5412635315882897, + "grad_norm": 4.461472511291504, + "learning_rate": 1.0304957471418062e-05, + "loss": 0.3497, + "step": 43300 + }, + { + "epoch": 0.5412885322133053, + "grad_norm": 4.5077409744262695, + "learning_rate": 1.0304085187281166e-05, + "loss": 0.9516, + "step": 43302 + }, + { + "epoch": 0.541313532838321, + "grad_norm": 0.8191695809364319, + "learning_rate": 1.03032129008284e-05, + "loss": 0.106, + "step": 43304 + }, + { + "epoch": 0.5413385334633366, + "grad_norm": 2.518651247024536, + "learning_rate": 1.030234061206641e-05, + "loss": 1.1156, + "step": 43306 + }, + { + "epoch": 0.5413635340883522, + "grad_norm": 3.4396040439605713, + "learning_rate": 1.0301468321001838e-05, + "loss": 0.7444, + "step": 43308 + }, + { + "epoch": 0.5413885347133678, + "grad_norm": 3.1641721725463867, + "learning_rate": 1.0300596027641323e-05, + "loss": 1.0829, + "step": 43310 + }, + { + "epoch": 0.5414135353383834, + "grad_norm": 2.329090118408203, + "learning_rate": 1.0299723731991513e-05, + "loss": 1.7468, + "step": 43312 + }, + { + "epoch": 0.5414385359633991, + "grad_norm": 5.95973539352417, + "learning_rate": 1.0298851434059047e-05, + "loss": 0.3986, + "step": 43314 + }, + { + "epoch": 0.5414635365884147, + "grad_norm": 5.128706932067871, + "learning_rate": 1.0297979133850573e-05, + "loss": 1.6077, + "step": 43316 + }, + { + "epoch": 0.5414885372134304, + "grad_norm": 4.823365688323975, + "learning_rate": 1.0297106831372733e-05, + "loss": 0.8148, + "step": 43318 + }, + { + "epoch": 0.5415135378384459, + "grad_norm": 2.8626108169555664, + "learning_rate": 1.0296234526632165e-05, + "loss": 0.3127, + "step": 43320 + }, + { + "epoch": 0.5415385384634616, + "grad_norm": 1.3867000341415405, + "learning_rate": 1.0295362219635519e-05, + "loss": 1.0811, + "step": 43322 + }, + { + "epoch": 0.5415635390884772, + "grad_norm": 1.8332592248916626, + "learning_rate": 1.0294489910389437e-05, + "loss": 0.8029, + "step": 43324 + }, + { + "epoch": 0.5415885397134929, + "grad_norm": 3.4934959411621094, + "learning_rate": 1.0293617598900563e-05, + "loss": 1.6809, + "step": 43326 + }, + { + "epoch": 0.5416135403385085, + "grad_norm": 1.0729620456695557, + "learning_rate": 1.0292745285175536e-05, + "loss": 0.0965, + "step": 43328 + }, + { + "epoch": 0.541638540963524, + "grad_norm": 1.2676488161087036, + "learning_rate": 1.0291872969221001e-05, + "loss": 0.1692, + "step": 43330 + }, + { + "epoch": 0.5416635415885397, + "grad_norm": 4.563035488128662, + "learning_rate": 1.0291000651043606e-05, + "loss": 2.3766, + "step": 43332 + }, + { + "epoch": 0.5416885422135553, + "grad_norm": 0.0022992142476141453, + "learning_rate": 1.0290128330649989e-05, + "loss": 0.9396, + "step": 43334 + }, + { + "epoch": 0.541713542838571, + "grad_norm": 2.9170804023742676, + "learning_rate": 1.0289256008046797e-05, + "loss": 1.1454, + "step": 43336 + }, + { + "epoch": 0.5417385434635866, + "grad_norm": 1.1777186393737793, + "learning_rate": 1.0288383683240675e-05, + "loss": 0.0383, + "step": 43338 + }, + { + "epoch": 0.5417635440886022, + "grad_norm": 0.001982094021514058, + "learning_rate": 1.0287511356238257e-05, + "loss": 0.0, + "step": 43340 + }, + { + "epoch": 0.5417885447136178, + "grad_norm": 2.997526168823242, + "learning_rate": 1.02866390270462e-05, + "loss": 1.0971, + "step": 43342 + }, + { + "epoch": 0.5418135453386335, + "grad_norm": 0.7076337337493896, + "learning_rate": 1.0285766695671135e-05, + "loss": 0.116, + "step": 43344 + }, + { + "epoch": 0.5418385459636491, + "grad_norm": 5.3731608390808105, + "learning_rate": 1.0284894362119713e-05, + "loss": 1.9502, + "step": 43346 + }, + { + "epoch": 0.5418635465886648, + "grad_norm": 4.473869323730469, + "learning_rate": 1.0284022026398577e-05, + "loss": 1.1733, + "step": 43348 + }, + { + "epoch": 0.5418885472136803, + "grad_norm": 0.41242215037345886, + "learning_rate": 1.028314968851437e-05, + "loss": 0.7742, + "step": 43350 + }, + { + "epoch": 0.5419135478386959, + "grad_norm": 2.703526735305786, + "learning_rate": 1.0282277348473736e-05, + "loss": 0.5866, + "step": 43352 + }, + { + "epoch": 0.5419385484637116, + "grad_norm": 4.7637505531311035, + "learning_rate": 1.0281405006283314e-05, + "loss": 0.1823, + "step": 43354 + }, + { + "epoch": 0.5419635490887272, + "grad_norm": 1.6115128993988037, + "learning_rate": 1.028053266194975e-05, + "loss": 0.1073, + "step": 43356 + }, + { + "epoch": 0.5419885497137429, + "grad_norm": 4.27191162109375, + "learning_rate": 1.0279660315479694e-05, + "loss": 0.8532, + "step": 43358 + }, + { + "epoch": 0.5420135503387584, + "grad_norm": 4.961433410644531, + "learning_rate": 1.0278787966879781e-05, + "loss": 1.6341, + "step": 43360 + }, + { + "epoch": 0.5420385509637741, + "grad_norm": 3.6197502613067627, + "learning_rate": 1.027791561615666e-05, + "loss": 1.0191, + "step": 43362 + }, + { + "epoch": 0.5420635515887897, + "grad_norm": 1.482080101966858, + "learning_rate": 1.0277043263316974e-05, + "loss": 0.727, + "step": 43364 + }, + { + "epoch": 0.5420885522138054, + "grad_norm": 3.316697597503662, + "learning_rate": 1.0276170908367365e-05, + "loss": 1.3903, + "step": 43366 + }, + { + "epoch": 0.542113552838821, + "grad_norm": 3.6904773712158203, + "learning_rate": 1.0275298551314473e-05, + "loss": 0.7902, + "step": 43368 + }, + { + "epoch": 0.5421385534638365, + "grad_norm": 3.5157806873321533, + "learning_rate": 1.0274426192164952e-05, + "loss": 1.4849, + "step": 43370 + }, + { + "epoch": 0.5421635540888522, + "grad_norm": 0.7095217704772949, + "learning_rate": 1.0273553830925437e-05, + "loss": 0.8919, + "step": 43372 + }, + { + "epoch": 0.5421885547138678, + "grad_norm": 5.534646987915039, + "learning_rate": 1.0272681467602574e-05, + "loss": 0.7543, + "step": 43374 + }, + { + "epoch": 0.5422135553388835, + "grad_norm": 2.766075849533081, + "learning_rate": 1.0271809102203012e-05, + "loss": 0.5855, + "step": 43376 + }, + { + "epoch": 0.5422385559638991, + "grad_norm": 1.0852391719818115, + "learning_rate": 1.0270936734733387e-05, + "loss": 0.3971, + "step": 43378 + }, + { + "epoch": 0.5422635565889147, + "grad_norm": 5.75166654586792, + "learning_rate": 1.0270064365200346e-05, + "loss": 2.1671, + "step": 43380 + }, + { + "epoch": 0.5422885572139303, + "grad_norm": 3.159921884536743, + "learning_rate": 1.026919199361053e-05, + "loss": 1.0553, + "step": 43382 + }, + { + "epoch": 0.542313557838946, + "grad_norm": 0.0013273602817207575, + "learning_rate": 1.0268319619970592e-05, + "loss": 0.4077, + "step": 43384 + }, + { + "epoch": 0.5423385584639616, + "grad_norm": 0.05081808194518089, + "learning_rate": 1.0267447244287165e-05, + "loss": 0.6996, + "step": 43386 + }, + { + "epoch": 0.5423635590889773, + "grad_norm": 2.571673631668091, + "learning_rate": 1.02665748665669e-05, + "loss": 1.7864, + "step": 43388 + }, + { + "epoch": 0.5423885597139928, + "grad_norm": 2.4242348670959473, + "learning_rate": 1.026570248681644e-05, + "loss": 1.3251, + "step": 43390 + }, + { + "epoch": 0.5424135603390084, + "grad_norm": 2.941053628921509, + "learning_rate": 1.0264830105042425e-05, + "loss": 0.7255, + "step": 43392 + }, + { + "epoch": 0.5424385609640241, + "grad_norm": 2.356788158416748, + "learning_rate": 1.02639577212515e-05, + "loss": 0.7814, + "step": 43394 + }, + { + "epoch": 0.5424635615890397, + "grad_norm": 3.3724236488342285, + "learning_rate": 1.0263085335450313e-05, + "loss": 1.5288, + "step": 43396 + }, + { + "epoch": 0.5424885622140554, + "grad_norm": 4.561798572540283, + "learning_rate": 1.0262212947645504e-05, + "loss": 1.4299, + "step": 43398 + }, + { + "epoch": 0.5425135628390709, + "grad_norm": 3.5747323036193848, + "learning_rate": 1.026134055784372e-05, + "loss": 1.6765, + "step": 43400 + }, + { + "epoch": 0.5425385634640866, + "grad_norm": 3.8236472606658936, + "learning_rate": 1.0260468166051602e-05, + "loss": 1.3221, + "step": 43402 + }, + { + "epoch": 0.5425635640891022, + "grad_norm": 2.4942221641540527, + "learning_rate": 1.0259595772275797e-05, + "loss": 0.3404, + "step": 43404 + }, + { + "epoch": 0.5425885647141179, + "grad_norm": 0.2950063645839691, + "learning_rate": 1.0258723376522944e-05, + "loss": 0.4051, + "step": 43406 + }, + { + "epoch": 0.5426135653391335, + "grad_norm": 4.730149745941162, + "learning_rate": 1.025785097879969e-05, + "loss": 0.6927, + "step": 43408 + }, + { + "epoch": 0.5426385659641491, + "grad_norm": 4.448894023895264, + "learning_rate": 1.0256978579112684e-05, + "loss": 1.9676, + "step": 43410 + }, + { + "epoch": 0.5426635665891647, + "grad_norm": 0.0021956812124699354, + "learning_rate": 1.025610617746856e-05, + "loss": 1.2449, + "step": 43412 + }, + { + "epoch": 0.5426885672141804, + "grad_norm": 4.187556743621826, + "learning_rate": 1.0255233773873971e-05, + "loss": 0.4869, + "step": 43414 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 0.5576848983764648, + "learning_rate": 1.025436136833556e-05, + "loss": 0.7894, + "step": 43416 + }, + { + "epoch": 0.5427385684642116, + "grad_norm": 4.470184803009033, + "learning_rate": 1.0253488960859964e-05, + "loss": 1.305, + "step": 43418 + }, + { + "epoch": 0.5427635690892272, + "grad_norm": 3.2335355281829834, + "learning_rate": 1.0252616551453834e-05, + "loss": 0.9497, + "step": 43420 + }, + { + "epoch": 0.5427885697142428, + "grad_norm": 4.463446617126465, + "learning_rate": 1.0251744140123811e-05, + "loss": 1.5967, + "step": 43422 + }, + { + "epoch": 0.5428135703392585, + "grad_norm": 7.2077178955078125, + "learning_rate": 1.0250871726876544e-05, + "loss": 2.678, + "step": 43424 + }, + { + "epoch": 0.5428385709642741, + "grad_norm": 1.815747857093811, + "learning_rate": 1.0249999311718667e-05, + "loss": 0.6062, + "step": 43426 + }, + { + "epoch": 0.5428635715892898, + "grad_norm": 3.813371419906616, + "learning_rate": 1.0249126894656835e-05, + "loss": 0.584, + "step": 43428 + }, + { + "epoch": 0.5428885722143053, + "grad_norm": 3.326270341873169, + "learning_rate": 1.0248254475697688e-05, + "loss": 0.6333, + "step": 43430 + }, + { + "epoch": 0.542913572839321, + "grad_norm": 3.6668026447296143, + "learning_rate": 1.0247382054847867e-05, + "loss": 0.3342, + "step": 43432 + }, + { + "epoch": 0.5429385734643366, + "grad_norm": 1.049540638923645, + "learning_rate": 1.0246509632114021e-05, + "loss": 0.4813, + "step": 43434 + }, + { + "epoch": 0.5429635740893523, + "grad_norm": 4.711830139160156, + "learning_rate": 1.0245637207502794e-05, + "loss": 0.5736, + "step": 43436 + }, + { + "epoch": 0.5429885747143679, + "grad_norm": 2.338252067565918, + "learning_rate": 1.024476478102083e-05, + "loss": 1.4264, + "step": 43438 + }, + { + "epoch": 0.5430135753393834, + "grad_norm": 3.739969253540039, + "learning_rate": 1.0243892352674766e-05, + "loss": 0.478, + "step": 43440 + }, + { + "epoch": 0.5430385759643991, + "grad_norm": 3.097505807876587, + "learning_rate": 1.0243019922471255e-05, + "loss": 0.1293, + "step": 43442 + }, + { + "epoch": 0.5430635765894147, + "grad_norm": 1.4852172136306763, + "learning_rate": 1.024214749041694e-05, + "loss": 0.2489, + "step": 43444 + }, + { + "epoch": 0.5430885772144304, + "grad_norm": 3.738778829574585, + "learning_rate": 1.0241275056518464e-05, + "loss": 1.3197, + "step": 43446 + }, + { + "epoch": 0.543113577839446, + "grad_norm": 3.191632032394409, + "learning_rate": 1.0240402620782473e-05, + "loss": 0.8295, + "step": 43448 + }, + { + "epoch": 0.5431385784644616, + "grad_norm": 5.1796345710754395, + "learning_rate": 1.0239530183215606e-05, + "loss": 2.18, + "step": 43450 + }, + { + "epoch": 0.5431635790894772, + "grad_norm": 2.6813788414001465, + "learning_rate": 1.0238657743824512e-05, + "loss": 0.223, + "step": 43452 + }, + { + "epoch": 0.5431885797144929, + "grad_norm": 0.4823264181613922, + "learning_rate": 1.0237785302615835e-05, + "loss": 0.0062, + "step": 43454 + }, + { + "epoch": 0.5432135803395085, + "grad_norm": 0.0007929130224511027, + "learning_rate": 1.0236912859596218e-05, + "loss": 0.9785, + "step": 43456 + }, + { + "epoch": 0.5432385809645242, + "grad_norm": 5.119254112243652, + "learning_rate": 1.0236040414772306e-05, + "loss": 1.107, + "step": 43458 + }, + { + "epoch": 0.5432635815895397, + "grad_norm": 4.652550220489502, + "learning_rate": 1.0235167968150744e-05, + "loss": 1.1071, + "step": 43460 + }, + { + "epoch": 0.5432885822145553, + "grad_norm": 3.086913585662842, + "learning_rate": 1.0234295519738177e-05, + "loss": 0.6657, + "step": 43462 + }, + { + "epoch": 0.543313582839571, + "grad_norm": 2.4856808185577393, + "learning_rate": 1.0233423069541248e-05, + "loss": 0.6624, + "step": 43464 + }, + { + "epoch": 0.5433385834645866, + "grad_norm": 3.6329197883605957, + "learning_rate": 1.0232550617566602e-05, + "loss": 0.6409, + "step": 43466 + }, + { + "epoch": 0.5433635840896023, + "grad_norm": 2.669771671295166, + "learning_rate": 1.0231678163820882e-05, + "loss": 0.5376, + "step": 43468 + }, + { + "epoch": 0.5433885847146178, + "grad_norm": 5.223621368408203, + "learning_rate": 1.0230805708310735e-05, + "loss": 1.3593, + "step": 43470 + }, + { + "epoch": 0.5434135853396335, + "grad_norm": 0.6816956400871277, + "learning_rate": 1.0229933251042804e-05, + "loss": 1.084, + "step": 43472 + }, + { + "epoch": 0.5434385859646491, + "grad_norm": 4.330381393432617, + "learning_rate": 1.0229060792023735e-05, + "loss": 1.204, + "step": 43474 + }, + { + "epoch": 0.5434635865896648, + "grad_norm": 3.2761595249176025, + "learning_rate": 1.0228188331260171e-05, + "loss": 0.4107, + "step": 43476 + }, + { + "epoch": 0.5434885872146804, + "grad_norm": 4.789122581481934, + "learning_rate": 1.0227315868758757e-05, + "loss": 0.6778, + "step": 43478 + }, + { + "epoch": 0.5435135878396959, + "grad_norm": 3.598663091659546, + "learning_rate": 1.0226443404526136e-05, + "loss": 0.8247, + "step": 43480 + }, + { + "epoch": 0.5435385884647116, + "grad_norm": 2.997636556625366, + "learning_rate": 1.0225570938568954e-05, + "loss": 0.6969, + "step": 43482 + }, + { + "epoch": 0.5435635890897272, + "grad_norm": 4.211601257324219, + "learning_rate": 1.0224698470893857e-05, + "loss": 0.3763, + "step": 43484 + }, + { + "epoch": 0.5435885897147429, + "grad_norm": 3.7492763996124268, + "learning_rate": 1.0223826001507486e-05, + "loss": 0.6903, + "step": 43486 + }, + { + "epoch": 0.5436135903397585, + "grad_norm": 6.108741283416748, + "learning_rate": 1.0222953530416492e-05, + "loss": 1.3512, + "step": 43488 + }, + { + "epoch": 0.5436385909647741, + "grad_norm": 1.283374309539795, + "learning_rate": 1.0222081057627513e-05, + "loss": 0.0734, + "step": 43490 + }, + { + "epoch": 0.5436635915897897, + "grad_norm": 3.1076180934906006, + "learning_rate": 1.0221208583147195e-05, + "loss": 0.5106, + "step": 43492 + }, + { + "epoch": 0.5436885922148054, + "grad_norm": 4.659538745880127, + "learning_rate": 1.0220336106982185e-05, + "loss": 1.0962, + "step": 43494 + }, + { + "epoch": 0.543713592839821, + "grad_norm": 5.196680068969727, + "learning_rate": 1.0219463629139125e-05, + "loss": 0.8516, + "step": 43496 + }, + { + "epoch": 0.5437385934648367, + "grad_norm": 2.7470688819885254, + "learning_rate": 1.0218591149624663e-05, + "loss": 1.3941, + "step": 43498 + }, + { + "epoch": 0.5437635940898522, + "grad_norm": 0.30013740062713623, + "learning_rate": 1.0217718668445442e-05, + "loss": 0.115, + "step": 43500 + }, + { + "epoch": 0.5437885947148678, + "grad_norm": 1.347477674484253, + "learning_rate": 1.0216846185608106e-05, + "loss": 0.1803, + "step": 43502 + }, + { + "epoch": 0.5438135953398835, + "grad_norm": 4.627899169921875, + "learning_rate": 1.0215973701119299e-05, + "loss": 0.6599, + "step": 43504 + }, + { + "epoch": 0.5438385959648991, + "grad_norm": 3.437678337097168, + "learning_rate": 1.0215101214985667e-05, + "loss": 0.3963, + "step": 43506 + }, + { + "epoch": 0.5438635965899148, + "grad_norm": 6.208085060119629, + "learning_rate": 1.0214228727213855e-05, + "loss": 0.9585, + "step": 43508 + }, + { + "epoch": 0.5438885972149303, + "grad_norm": 2.754817485809326, + "learning_rate": 1.0213356237810506e-05, + "loss": 1.1292, + "step": 43510 + }, + { + "epoch": 0.543913597839946, + "grad_norm": 3.0601396560668945, + "learning_rate": 1.0212483746782267e-05, + "loss": 1.7027, + "step": 43512 + }, + { + "epoch": 0.5439385984649616, + "grad_norm": 0.008570910431444645, + "learning_rate": 1.0211611254135785e-05, + "loss": 0.0002, + "step": 43514 + }, + { + "epoch": 0.5439635990899773, + "grad_norm": 4.705532550811768, + "learning_rate": 1.0210738759877698e-05, + "loss": 0.9042, + "step": 43516 + }, + { + "epoch": 0.5439885997149929, + "grad_norm": 3.665009021759033, + "learning_rate": 1.0209866264014654e-05, + "loss": 1.4311, + "step": 43518 + }, + { + "epoch": 0.5440136003400085, + "grad_norm": 3.38834810256958, + "learning_rate": 1.0208993766553298e-05, + "loss": 0.7401, + "step": 43520 + }, + { + "epoch": 0.5440386009650241, + "grad_norm": 4.37093448638916, + "learning_rate": 1.0208121267500278e-05, + "loss": 0.3195, + "step": 43522 + }, + { + "epoch": 0.5440636015900397, + "grad_norm": 3.9139649868011475, + "learning_rate": 1.0207248766862235e-05, + "loss": 1.3817, + "step": 43524 + }, + { + "epoch": 0.5440886022150554, + "grad_norm": 3.3704984188079834, + "learning_rate": 1.0206376264645814e-05, + "loss": 0.7835, + "step": 43526 + }, + { + "epoch": 0.544113602840071, + "grad_norm": 3.6130213737487793, + "learning_rate": 1.020550376085766e-05, + "loss": 0.4851, + "step": 43528 + }, + { + "epoch": 0.5441386034650866, + "grad_norm": 2.0421876907348633, + "learning_rate": 1.0204631255504418e-05, + "loss": 1.5314, + "step": 43530 + }, + { + "epoch": 0.5441636040901022, + "grad_norm": 5.8846282958984375, + "learning_rate": 1.0203758748592735e-05, + "loss": 0.5995, + "step": 43532 + }, + { + "epoch": 0.5441886047151179, + "grad_norm": 1.520699143409729, + "learning_rate": 1.0202886240129253e-05, + "loss": 0.6426, + "step": 43534 + }, + { + "epoch": 0.5442136053401335, + "grad_norm": 0.001098352950066328, + "learning_rate": 1.0202013730120619e-05, + "loss": 0.5491, + "step": 43536 + }, + { + "epoch": 0.5442386059651492, + "grad_norm": 0.0008215115522034466, + "learning_rate": 1.0201141218573476e-05, + "loss": 0.7186, + "step": 43538 + }, + { + "epoch": 0.5442636065901647, + "grad_norm": 4.637412071228027, + "learning_rate": 1.0200268705494472e-05, + "loss": 1.7534, + "step": 43540 + }, + { + "epoch": 0.5442886072151804, + "grad_norm": 4.708106994628906, + "learning_rate": 1.0199396190890247e-05, + "loss": 1.7069, + "step": 43542 + }, + { + "epoch": 0.544313607840196, + "grad_norm": 5.165928840637207, + "learning_rate": 1.019852367476745e-05, + "loss": 1.4904, + "step": 43544 + }, + { + "epoch": 0.5443386084652116, + "grad_norm": 0.000594232464209199, + "learning_rate": 1.0197651157132725e-05, + "loss": 0.6551, + "step": 43546 + }, + { + "epoch": 0.5443636090902273, + "grad_norm": 0.0009567639790475368, + "learning_rate": 1.0196778637992714e-05, + "loss": 0.2863, + "step": 43548 + }, + { + "epoch": 0.5443886097152428, + "grad_norm": 3.7446093559265137, + "learning_rate": 1.0195906117354068e-05, + "loss": 0.5489, + "step": 43550 + }, + { + "epoch": 0.5444136103402585, + "grad_norm": 6.999474048614502, + "learning_rate": 1.0195033595223428e-05, + "loss": 0.908, + "step": 43552 + }, + { + "epoch": 0.5444386109652741, + "grad_norm": 1.0683987140655518, + "learning_rate": 1.0194161071607442e-05, + "loss": 1.0305, + "step": 43554 + }, + { + "epoch": 0.5444636115902898, + "grad_norm": 3.545116901397705, + "learning_rate": 1.0193288546512749e-05, + "loss": 1.2424, + "step": 43556 + }, + { + "epoch": 0.5444886122153054, + "grad_norm": 0.00040610614814795554, + "learning_rate": 1.0192416019945996e-05, + "loss": 0.8984, + "step": 43558 + }, + { + "epoch": 0.544513612840321, + "grad_norm": 2.1973154544830322, + "learning_rate": 1.0191543491913835e-05, + "loss": 0.1072, + "step": 43560 + }, + { + "epoch": 0.5445386134653366, + "grad_norm": 3.3871140480041504, + "learning_rate": 1.0190670962422904e-05, + "loss": 0.1644, + "step": 43562 + }, + { + "epoch": 0.5445636140903523, + "grad_norm": 2.0035908222198486, + "learning_rate": 1.0189798431479852e-05, + "loss": 0.2174, + "step": 43564 + }, + { + "epoch": 0.5445886147153679, + "grad_norm": 6.184626579284668, + "learning_rate": 1.018892589909132e-05, + "loss": 1.9772, + "step": 43566 + }, + { + "epoch": 0.5446136153403836, + "grad_norm": 3.415134906768799, + "learning_rate": 1.0188053365263954e-05, + "loss": 0.6299, + "step": 43568 + }, + { + "epoch": 0.5446386159653991, + "grad_norm": 2.399644374847412, + "learning_rate": 1.0187180830004402e-05, + "loss": 0.7724, + "step": 43570 + }, + { + "epoch": 0.5446636165904147, + "grad_norm": 2.8386037349700928, + "learning_rate": 1.0186308293319305e-05, + "loss": 1.1604, + "step": 43572 + }, + { + "epoch": 0.5446886172154304, + "grad_norm": 4.033851146697998, + "learning_rate": 1.018543575521531e-05, + "loss": 1.5432, + "step": 43574 + }, + { + "epoch": 0.544713617840446, + "grad_norm": 3.212909698486328, + "learning_rate": 1.0184563215699066e-05, + "loss": 0.7506, + "step": 43576 + }, + { + "epoch": 0.5447386184654617, + "grad_norm": 7.864120006561279, + "learning_rate": 1.0183690674777211e-05, + "loss": 1.0791, + "step": 43578 + }, + { + "epoch": 0.5447636190904772, + "grad_norm": 4.003653526306152, + "learning_rate": 1.0182818132456398e-05, + "loss": 1.2823, + "step": 43580 + }, + { + "epoch": 0.5447886197154929, + "grad_norm": 3.2105231285095215, + "learning_rate": 1.0181945588743265e-05, + "loss": 1.2229, + "step": 43582 + }, + { + "epoch": 0.5448136203405085, + "grad_norm": 0.0010246215388178825, + "learning_rate": 1.0181073043644458e-05, + "loss": 0.0, + "step": 43584 + }, + { + "epoch": 0.5448386209655242, + "grad_norm": 4.210725784301758, + "learning_rate": 1.018020049716663e-05, + "loss": 1.172, + "step": 43586 + }, + { + "epoch": 0.5448636215905398, + "grad_norm": 4.139963626861572, + "learning_rate": 1.0179327949316416e-05, + "loss": 0.489, + "step": 43588 + }, + { + "epoch": 0.5448886222155553, + "grad_norm": 2.8329217433929443, + "learning_rate": 1.0178455400100466e-05, + "loss": 1.0522, + "step": 43590 + }, + { + "epoch": 0.544913622840571, + "grad_norm": 6.294987201690674, + "learning_rate": 1.017758284952543e-05, + "loss": 0.7034, + "step": 43592 + }, + { + "epoch": 0.5449386234655866, + "grad_norm": 4.826291084289551, + "learning_rate": 1.0176710297597942e-05, + "loss": 0.7503, + "step": 43594 + }, + { + "epoch": 0.5449636240906023, + "grad_norm": 1.1619231700897217, + "learning_rate": 1.0175837744324654e-05, + "loss": 0.4941, + "step": 43596 + }, + { + "epoch": 0.5449886247156179, + "grad_norm": 5.512567043304443, + "learning_rate": 1.0174965189712213e-05, + "loss": 0.8404, + "step": 43598 + }, + { + "epoch": 0.5450136253406335, + "grad_norm": 0.985165536403656, + "learning_rate": 1.017409263376726e-05, + "loss": 0.193, + "step": 43600 + }, + { + "epoch": 0.5450386259656491, + "grad_norm": 1.0114957094192505, + "learning_rate": 1.0173220076496444e-05, + "loss": 0.1386, + "step": 43602 + }, + { + "epoch": 0.5450636265906648, + "grad_norm": 3.8194587230682373, + "learning_rate": 1.0172347517906406e-05, + "loss": 0.4874, + "step": 43604 + }, + { + "epoch": 0.5450886272156804, + "grad_norm": 2.7693393230438232, + "learning_rate": 1.0171474958003799e-05, + "loss": 0.793, + "step": 43606 + }, + { + "epoch": 0.5451136278406961, + "grad_norm": 4.087007522583008, + "learning_rate": 1.0170602396795257e-05, + "loss": 1.0844, + "step": 43608 + }, + { + "epoch": 0.5451386284657116, + "grad_norm": 4.562782287597656, + "learning_rate": 1.0169729834287432e-05, + "loss": 1.8643, + "step": 43610 + }, + { + "epoch": 0.5451636290907272, + "grad_norm": 2.7563869953155518, + "learning_rate": 1.0168857270486974e-05, + "loss": 1.2177, + "step": 43612 + }, + { + "epoch": 0.5451886297157429, + "grad_norm": 2.9705774784088135, + "learning_rate": 1.0167984705400518e-05, + "loss": 1.5005, + "step": 43614 + }, + { + "epoch": 0.5452136303407585, + "grad_norm": 2.059826612472534, + "learning_rate": 1.0167112139034715e-05, + "loss": 1.1657, + "step": 43616 + }, + { + "epoch": 0.5452386309657742, + "grad_norm": 0.7151658535003662, + "learning_rate": 1.016623957139621e-05, + "loss": 0.0356, + "step": 43618 + }, + { + "epoch": 0.5452636315907897, + "grad_norm": 6.320305347442627, + "learning_rate": 1.0165367002491646e-05, + "loss": 1.7808, + "step": 43620 + }, + { + "epoch": 0.5452886322158054, + "grad_norm": 0.000603593303821981, + "learning_rate": 1.0164494432327672e-05, + "loss": 0.4602, + "step": 43622 + }, + { + "epoch": 0.545313632840821, + "grad_norm": 4.003577709197998, + "learning_rate": 1.0163621860910933e-05, + "loss": 0.8496, + "step": 43624 + }, + { + "epoch": 0.5453386334658367, + "grad_norm": 6.921877384185791, + "learning_rate": 1.016274928824807e-05, + "loss": 1.7072, + "step": 43626 + }, + { + "epoch": 0.5453636340908523, + "grad_norm": 1.6393535137176514, + "learning_rate": 1.0161876714345735e-05, + "loss": 0.3559, + "step": 43628 + }, + { + "epoch": 0.5453886347158678, + "grad_norm": 4.433106899261475, + "learning_rate": 1.0161004139210567e-05, + "loss": 0.9067, + "step": 43630 + }, + { + "epoch": 0.5454136353408835, + "grad_norm": 3.8810479640960693, + "learning_rate": 1.0160131562849215e-05, + "loss": 1.9742, + "step": 43632 + }, + { + "epoch": 0.5454386359658991, + "grad_norm": 3.6376054286956787, + "learning_rate": 1.0159258985268324e-05, + "loss": 0.8875, + "step": 43634 + }, + { + "epoch": 0.5454636365909148, + "grad_norm": 1.074447512626648, + "learning_rate": 1.015838640647454e-05, + "loss": 0.9336, + "step": 43636 + }, + { + "epoch": 0.5454886372159304, + "grad_norm": 0.00045494080404751003, + "learning_rate": 1.0157513826474505e-05, + "loss": 0.3111, + "step": 43638 + }, + { + "epoch": 0.545513637840946, + "grad_norm": 0.001116504892706871, + "learning_rate": 1.015664124527487e-05, + "loss": 0.006, + "step": 43640 + }, + { + "epoch": 0.5455386384659616, + "grad_norm": 2.1715357303619385, + "learning_rate": 1.0155768662882275e-05, + "loss": 0.8966, + "step": 43642 + }, + { + "epoch": 0.5455636390909773, + "grad_norm": 2.988323211669922, + "learning_rate": 1.015489607930337e-05, + "loss": 1.1312, + "step": 43644 + }, + { + "epoch": 0.5455886397159929, + "grad_norm": 0.2789607644081116, + "learning_rate": 1.0154023494544795e-05, + "loss": 0.1031, + "step": 43646 + }, + { + "epoch": 0.5456136403410086, + "grad_norm": 2.9042630195617676, + "learning_rate": 1.0153150908613201e-05, + "loss": 0.6883, + "step": 43648 + }, + { + "epoch": 0.5456386409660241, + "grad_norm": 2.600882053375244, + "learning_rate": 1.0152278321515234e-05, + "loss": 1.3832, + "step": 43650 + }, + { + "epoch": 0.5456636415910397, + "grad_norm": 0.8956068158149719, + "learning_rate": 1.0151405733257533e-05, + "loss": 0.1795, + "step": 43652 + }, + { + "epoch": 0.5456886422160554, + "grad_norm": 0.29878365993499756, + "learning_rate": 1.0150533143846748e-05, + "loss": 0.4441, + "step": 43654 + }, + { + "epoch": 0.545713642841071, + "grad_norm": 4.619248867034912, + "learning_rate": 1.0149660553289523e-05, + "loss": 0.7933, + "step": 43656 + }, + { + "epoch": 0.5457386434660867, + "grad_norm": 0.0051751406863331795, + "learning_rate": 1.0148787961592508e-05, + "loss": 0.0001, + "step": 43658 + }, + { + "epoch": 0.5457636440911022, + "grad_norm": 2.0151379108428955, + "learning_rate": 1.0147915368762342e-05, + "loss": 1.2992, + "step": 43660 + }, + { + "epoch": 0.5457886447161179, + "grad_norm": 4.022125720977783, + "learning_rate": 1.0147042774805673e-05, + "loss": 1.5696, + "step": 43662 + }, + { + "epoch": 0.5458136453411335, + "grad_norm": 6.4227824211120605, + "learning_rate": 1.014617017972915e-05, + "loss": 1.398, + "step": 43664 + }, + { + "epoch": 0.5458386459661492, + "grad_norm": 8.397071838378906, + "learning_rate": 1.0145297583539413e-05, + "loss": 2.0336, + "step": 43666 + }, + { + "epoch": 0.5458636465911648, + "grad_norm": 0.3261880576610565, + "learning_rate": 1.0144424986243112e-05, + "loss": 0.3432, + "step": 43668 + }, + { + "epoch": 0.5458886472161804, + "grad_norm": 3.345017910003662, + "learning_rate": 1.0143552387846889e-05, + "loss": 0.1631, + "step": 43670 + }, + { + "epoch": 0.545913647841196, + "grad_norm": 0.7306689023971558, + "learning_rate": 1.014267978835739e-05, + "loss": 0.4065, + "step": 43672 + }, + { + "epoch": 0.5459386484662117, + "grad_norm": 1.8154033422470093, + "learning_rate": 1.0141807187781266e-05, + "loss": 0.1636, + "step": 43674 + }, + { + "epoch": 0.5459636490912273, + "grad_norm": 3.3349316120147705, + "learning_rate": 1.0140934586125156e-05, + "loss": 0.3096, + "step": 43676 + }, + { + "epoch": 0.545988649716243, + "grad_norm": 2.614297389984131, + "learning_rate": 1.0140061983395709e-05, + "loss": 1.1692, + "step": 43678 + }, + { + "epoch": 0.5460136503412585, + "grad_norm": 6.733905792236328, + "learning_rate": 1.013918937959957e-05, + "loss": 1.6935, + "step": 43680 + }, + { + "epoch": 0.5460386509662741, + "grad_norm": 2.032320737838745, + "learning_rate": 1.0138316774743382e-05, + "loss": 0.4569, + "step": 43682 + }, + { + "epoch": 0.5460636515912898, + "grad_norm": 4.29327917098999, + "learning_rate": 1.0137444168833795e-05, + "loss": 0.3684, + "step": 43684 + }, + { + "epoch": 0.5460886522163054, + "grad_norm": 1.6888477802276611, + "learning_rate": 1.013657156187745e-05, + "loss": 0.068, + "step": 43686 + }, + { + "epoch": 0.5461136528413211, + "grad_norm": 9.013087272644043, + "learning_rate": 1.0135698953881e-05, + "loss": 1.3588, + "step": 43688 + }, + { + "epoch": 0.5461386534663366, + "grad_norm": 3.1514408588409424, + "learning_rate": 1.0134826344851085e-05, + "loss": 1.2398, + "step": 43690 + }, + { + "epoch": 0.5461636540913523, + "grad_norm": 0.4984254837036133, + "learning_rate": 1.013395373479435e-05, + "loss": 0.843, + "step": 43692 + }, + { + "epoch": 0.5461886547163679, + "grad_norm": 0.0008747889660298824, + "learning_rate": 1.013308112371744e-05, + "loss": 0.4529, + "step": 43694 + }, + { + "epoch": 0.5462136553413836, + "grad_norm": 2.6194849014282227, + "learning_rate": 1.0132208511627007e-05, + "loss": 1.3726, + "step": 43696 + }, + { + "epoch": 0.5462386559663992, + "grad_norm": 2.7608261108398438, + "learning_rate": 1.013133589852969e-05, + "loss": 0.228, + "step": 43698 + }, + { + "epoch": 0.5462636565914147, + "grad_norm": 0.14282985031604767, + "learning_rate": 1.0130463284432137e-05, + "loss": 0.0045, + "step": 43700 + }, + { + "epoch": 0.5462886572164304, + "grad_norm": 4.714839935302734, + "learning_rate": 1.0129590669340996e-05, + "loss": 0.312, + "step": 43702 + }, + { + "epoch": 0.546313657841446, + "grad_norm": 3.2655086517333984, + "learning_rate": 1.0128718053262913e-05, + "loss": 0.9727, + "step": 43704 + }, + { + "epoch": 0.5463386584664617, + "grad_norm": 3.678805112838745, + "learning_rate": 1.0127845436204529e-05, + "loss": 0.8047, + "step": 43706 + }, + { + "epoch": 0.5463636590914773, + "grad_norm": 2.486560106277466, + "learning_rate": 1.0126972818172492e-05, + "loss": 0.8526, + "step": 43708 + }, + { + "epoch": 0.5463886597164929, + "grad_norm": 2.5128068923950195, + "learning_rate": 1.0126100199173448e-05, + "loss": 0.7698, + "step": 43710 + }, + { + "epoch": 0.5464136603415085, + "grad_norm": 3.6871657371520996, + "learning_rate": 1.0125227579214041e-05, + "loss": 0.9053, + "step": 43712 + }, + { + "epoch": 0.5464386609665242, + "grad_norm": 9.660983085632324, + "learning_rate": 1.012435495830092e-05, + "loss": 2.2516, + "step": 43714 + }, + { + "epoch": 0.5464636615915398, + "grad_norm": 2.37300443649292, + "learning_rate": 1.012348233644073e-05, + "loss": 0.995, + "step": 43716 + }, + { + "epoch": 0.5464886622165555, + "grad_norm": 2.229994535446167, + "learning_rate": 1.0122609713640117e-05, + "loss": 0.7175, + "step": 43718 + }, + { + "epoch": 0.546513662841571, + "grad_norm": 3.3808541297912598, + "learning_rate": 1.0121737089905723e-05, + "loss": 1.6584, + "step": 43720 + }, + { + "epoch": 0.5465386634665866, + "grad_norm": 6.233678817749023, + "learning_rate": 1.0120864465244198e-05, + "loss": 1.4347, + "step": 43722 + }, + { + "epoch": 0.5465636640916023, + "grad_norm": 2.6801483631134033, + "learning_rate": 1.0119991839662185e-05, + "loss": 0.4816, + "step": 43724 + }, + { + "epoch": 0.5465886647166179, + "grad_norm": 2.3311808109283447, + "learning_rate": 1.0119119213166332e-05, + "loss": 1.2002, + "step": 43726 + }, + { + "epoch": 0.5466136653416336, + "grad_norm": 0.002531353384256363, + "learning_rate": 1.0118246585763285e-05, + "loss": 0.1083, + "step": 43728 + }, + { + "epoch": 0.5466386659666491, + "grad_norm": 3.745633602142334, + "learning_rate": 1.011737395745969e-05, + "loss": 1.8243, + "step": 43730 + }, + { + "epoch": 0.5466636665916648, + "grad_norm": 4.7686262130737305, + "learning_rate": 1.0116501328262187e-05, + "loss": 0.2329, + "step": 43732 + }, + { + "epoch": 0.5466886672166804, + "grad_norm": 6.229034423828125, + "learning_rate": 1.0115628698177429e-05, + "loss": 0.7467, + "step": 43734 + }, + { + "epoch": 0.5467136678416961, + "grad_norm": 5.485824108123779, + "learning_rate": 1.0114756067212059e-05, + "loss": 0.9672, + "step": 43736 + }, + { + "epoch": 0.5467386684667117, + "grad_norm": 0.0009901192970573902, + "learning_rate": 1.0113883435372722e-05, + "loss": 0.5444, + "step": 43738 + }, + { + "epoch": 0.5467636690917272, + "grad_norm": 2.7504591941833496, + "learning_rate": 1.0113010802666065e-05, + "loss": 1.076, + "step": 43740 + }, + { + "epoch": 0.5467886697167429, + "grad_norm": 0.0010759856086224318, + "learning_rate": 1.0112138169098737e-05, + "loss": 0.751, + "step": 43742 + }, + { + "epoch": 0.5468136703417585, + "grad_norm": 3.502086877822876, + "learning_rate": 1.0111265534677376e-05, + "loss": 1.095, + "step": 43744 + }, + { + "epoch": 0.5468386709667742, + "grad_norm": 6.038725852966309, + "learning_rate": 1.0110392899408637e-05, + "loss": 2.2031, + "step": 43746 + }, + { + "epoch": 0.5468636715917898, + "grad_norm": 1.1862108707427979, + "learning_rate": 1.0109520263299157e-05, + "loss": 0.527, + "step": 43748 + }, + { + "epoch": 0.5468886722168054, + "grad_norm": 2.0215327739715576, + "learning_rate": 1.0108647626355588e-05, + "loss": 0.0979, + "step": 43750 + }, + { + "epoch": 0.546913672841821, + "grad_norm": 4.563474178314209, + "learning_rate": 1.0107774988584573e-05, + "loss": 1.1678, + "step": 43752 + }, + { + "epoch": 0.5469386734668367, + "grad_norm": 4.570418357849121, + "learning_rate": 1.010690234999276e-05, + "loss": 1.4545, + "step": 43754 + }, + { + "epoch": 0.5469636740918523, + "grad_norm": 6.7011919021606445, + "learning_rate": 1.0106029710586796e-05, + "loss": 1.0338, + "step": 43756 + }, + { + "epoch": 0.546988674716868, + "grad_norm": 3.641923189163208, + "learning_rate": 1.0105157070373322e-05, + "loss": 1.1625, + "step": 43758 + }, + { + "epoch": 0.5470136753418835, + "grad_norm": 2.48683762550354, + "learning_rate": 1.0104284429358984e-05, + "loss": 0.7608, + "step": 43760 + }, + { + "epoch": 0.5470386759668991, + "grad_norm": 6.308766841888428, + "learning_rate": 1.0103411787550437e-05, + "loss": 0.7413, + "step": 43762 + }, + { + "epoch": 0.5470636765919148, + "grad_norm": 1.66007399559021, + "learning_rate": 1.0102539144954316e-05, + "loss": 0.5229, + "step": 43764 + }, + { + "epoch": 0.5470886772169304, + "grad_norm": 3.9612245559692383, + "learning_rate": 1.0101666501577272e-05, + "loss": 0.8455, + "step": 43766 + }, + { + "epoch": 0.5471136778419461, + "grad_norm": 1.6854314804077148, + "learning_rate": 1.0100793857425953e-05, + "loss": 0.1464, + "step": 43768 + }, + { + "epoch": 0.5471386784669616, + "grad_norm": 2.266697883605957, + "learning_rate": 1.0099921212506997e-05, + "loss": 0.9345, + "step": 43770 + }, + { + "epoch": 0.5471636790919773, + "grad_norm": 3.7563085556030273, + "learning_rate": 1.009904856682706e-05, + "loss": 0.7633, + "step": 43772 + }, + { + "epoch": 0.5471886797169929, + "grad_norm": 0.006692993920296431, + "learning_rate": 1.0098175920392781e-05, + "loss": 0.4797, + "step": 43774 + }, + { + "epoch": 0.5472136803420086, + "grad_norm": 2.77628493309021, + "learning_rate": 1.0097303273210808e-05, + "loss": 0.5595, + "step": 43776 + }, + { + "epoch": 0.5472386809670242, + "grad_norm": 7.81703519821167, + "learning_rate": 1.0096430625287788e-05, + "loss": 0.4699, + "step": 43778 + }, + { + "epoch": 0.5472636815920398, + "grad_norm": 4.664419651031494, + "learning_rate": 1.0095557976630363e-05, + "loss": 2.1288, + "step": 43780 + }, + { + "epoch": 0.5472886822170554, + "grad_norm": 7.156835079193115, + "learning_rate": 1.0094685327245186e-05, + "loss": 0.7923, + "step": 43782 + }, + { + "epoch": 0.547313682842071, + "grad_norm": 0.004184038378298283, + "learning_rate": 1.0093812677138896e-05, + "loss": 1.1379, + "step": 43784 + }, + { + "epoch": 0.5473386834670867, + "grad_norm": 2.2126128673553467, + "learning_rate": 1.0092940026318143e-05, + "loss": 0.8162, + "step": 43786 + }, + { + "epoch": 0.5473636840921023, + "grad_norm": 3.368764638900757, + "learning_rate": 1.0092067374789574e-05, + "loss": 1.8444, + "step": 43788 + }, + { + "epoch": 0.5473886847171179, + "grad_norm": 4.714674472808838, + "learning_rate": 1.0091194722559829e-05, + "loss": 1.5981, + "step": 43790 + }, + { + "epoch": 0.5474136853421335, + "grad_norm": 1.4815115928649902, + "learning_rate": 1.009032206963556e-05, + "loss": 0.1219, + "step": 43792 + }, + { + "epoch": 0.5474386859671492, + "grad_norm": 4.066650390625, + "learning_rate": 1.0089449416023413e-05, + "loss": 0.7316, + "step": 43794 + }, + { + "epoch": 0.5474636865921648, + "grad_norm": 3.911226511001587, + "learning_rate": 1.0088576761730029e-05, + "loss": 1.7878, + "step": 43796 + }, + { + "epoch": 0.5474886872171805, + "grad_norm": 8.288189888000488, + "learning_rate": 1.0087704106762055e-05, + "loss": 0.5921, + "step": 43798 + }, + { + "epoch": 0.547513687842196, + "grad_norm": 4.241810321807861, + "learning_rate": 1.0086831451126144e-05, + "loss": 1.5769, + "step": 43800 + }, + { + "epoch": 0.5475386884672117, + "grad_norm": 4.629834175109863, + "learning_rate": 1.0085958794828934e-05, + "loss": 0.7756, + "step": 43802 + }, + { + "epoch": 0.5475636890922273, + "grad_norm": 1.9441841840744019, + "learning_rate": 1.0085086137877073e-05, + "loss": 0.6994, + "step": 43804 + }, + { + "epoch": 0.547588689717243, + "grad_norm": 0.000733585620764643, + "learning_rate": 1.008421348027721e-05, + "loss": 0.0002, + "step": 43806 + }, + { + "epoch": 0.5476136903422586, + "grad_norm": 2.0861964225769043, + "learning_rate": 1.008334082203599e-05, + "loss": 0.6398, + "step": 43808 + }, + { + "epoch": 0.5476386909672741, + "grad_norm": 3.1325016021728516, + "learning_rate": 1.0082468163160055e-05, + "loss": 0.7363, + "step": 43810 + }, + { + "epoch": 0.5476636915922898, + "grad_norm": 4.065870761871338, + "learning_rate": 1.0081595503656055e-05, + "loss": 0.9898, + "step": 43812 + }, + { + "epoch": 0.5476886922173054, + "grad_norm": 1.6932857036590576, + "learning_rate": 1.0080722843530637e-05, + "loss": 1.2085, + "step": 43814 + }, + { + "epoch": 0.5477136928423211, + "grad_norm": 2.2295963764190674, + "learning_rate": 1.0079850182790445e-05, + "loss": 0.4703, + "step": 43816 + }, + { + "epoch": 0.5477386934673367, + "grad_norm": 2.999194622039795, + "learning_rate": 1.0078977521442124e-05, + "loss": 1.5278, + "step": 43818 + }, + { + "epoch": 0.5477636940923523, + "grad_norm": 4.075721263885498, + "learning_rate": 1.0078104859492324e-05, + "loss": 1.0896, + "step": 43820 + }, + { + "epoch": 0.5477886947173679, + "grad_norm": 0.0006178173935040832, + "learning_rate": 1.0077232196947681e-05, + "loss": 0.9915, + "step": 43822 + }, + { + "epoch": 0.5478136953423836, + "grad_norm": 3.071225881576538, + "learning_rate": 1.0076359533814854e-05, + "loss": 1.1981, + "step": 43824 + }, + { + "epoch": 0.5478386959673992, + "grad_norm": 6.815176963806152, + "learning_rate": 1.0075486870100484e-05, + "loss": 1.0033, + "step": 43826 + }, + { + "epoch": 0.5478636965924149, + "grad_norm": 1.254244089126587, + "learning_rate": 1.0074614205811217e-05, + "loss": 0.0294, + "step": 43828 + }, + { + "epoch": 0.5478886972174304, + "grad_norm": 0.0008940246771089733, + "learning_rate": 1.0073741540953699e-05, + "loss": 0.0, + "step": 43830 + }, + { + "epoch": 0.547913697842446, + "grad_norm": 14.074328422546387, + "learning_rate": 1.0072868875534573e-05, + "loss": 0.4297, + "step": 43832 + }, + { + "epoch": 0.5479386984674617, + "grad_norm": 0.7301169633865356, + "learning_rate": 1.0071996209560488e-05, + "loss": 0.4597, + "step": 43834 + }, + { + "epoch": 0.5479636990924773, + "grad_norm": 2.3900306224823, + "learning_rate": 1.0071123543038093e-05, + "loss": 1.1597, + "step": 43836 + }, + { + "epoch": 0.547988699717493, + "grad_norm": 0.933010458946228, + "learning_rate": 1.0070250875974027e-05, + "loss": 0.6771, + "step": 43838 + }, + { + "epoch": 0.5480137003425085, + "grad_norm": 4.201489448547363, + "learning_rate": 1.0069378208374947e-05, + "loss": 1.6876, + "step": 43840 + }, + { + "epoch": 0.5480387009675242, + "grad_norm": 4.665738582611084, + "learning_rate": 1.0068505540247489e-05, + "loss": 0.3078, + "step": 43842 + }, + { + "epoch": 0.5480637015925398, + "grad_norm": 1.3539499044418335, + "learning_rate": 1.0067632871598301e-05, + "loss": 0.7264, + "step": 43844 + }, + { + "epoch": 0.5480887022175555, + "grad_norm": 0.0016409317031502724, + "learning_rate": 1.0066760202434031e-05, + "loss": 0.1139, + "step": 43846 + }, + { + "epoch": 0.5481137028425711, + "grad_norm": 3.135622501373291, + "learning_rate": 1.0065887532761326e-05, + "loss": 0.5512, + "step": 43848 + }, + { + "epoch": 0.5481387034675866, + "grad_norm": 3.059042453765869, + "learning_rate": 1.0065014862586828e-05, + "loss": 0.4664, + "step": 43850 + }, + { + "epoch": 0.5481637040926023, + "grad_norm": 1.7663776874542236, + "learning_rate": 1.0064142191917187e-05, + "loss": 0.0899, + "step": 43852 + }, + { + "epoch": 0.5481887047176179, + "grad_norm": 3.070181369781494, + "learning_rate": 1.0063269520759052e-05, + "loss": 1.19, + "step": 43854 + }, + { + "epoch": 0.5482137053426336, + "grad_norm": 3.2595064640045166, + "learning_rate": 1.0062396849119064e-05, + "loss": 1.6043, + "step": 43856 + }, + { + "epoch": 0.5482387059676492, + "grad_norm": 0.035609953105449677, + "learning_rate": 1.0061524177003868e-05, + "loss": 0.2327, + "step": 43858 + }, + { + "epoch": 0.5482637065926648, + "grad_norm": 5.089846134185791, + "learning_rate": 1.0060651504420112e-05, + "loss": 1.5234, + "step": 43860 + }, + { + "epoch": 0.5482887072176804, + "grad_norm": 4.1170477867126465, + "learning_rate": 1.0059778831374443e-05, + "loss": 1.0111, + "step": 43862 + }, + { + "epoch": 0.5483137078426961, + "grad_norm": 3.6890461444854736, + "learning_rate": 1.0058906157873508e-05, + "loss": 0.0984, + "step": 43864 + }, + { + "epoch": 0.5483387084677117, + "grad_norm": 0.09233997017145157, + "learning_rate": 1.0058033483923954e-05, + "loss": 0.0243, + "step": 43866 + }, + { + "epoch": 0.5483637090927274, + "grad_norm": 1.5235885381698608, + "learning_rate": 1.0057160809532422e-05, + "loss": 0.7991, + "step": 43868 + }, + { + "epoch": 0.5483887097177429, + "grad_norm": 2.404022693634033, + "learning_rate": 1.0056288134705563e-05, + "loss": 0.5675, + "step": 43870 + }, + { + "epoch": 0.5484137103427585, + "grad_norm": 2.801147699356079, + "learning_rate": 1.0055415459450022e-05, + "loss": 1.2578, + "step": 43872 + }, + { + "epoch": 0.5484387109677742, + "grad_norm": 2.8684310913085938, + "learning_rate": 1.0054542783772442e-05, + "loss": 0.5856, + "step": 43874 + }, + { + "epoch": 0.5484637115927898, + "grad_norm": 5.135143280029297, + "learning_rate": 1.0053670107679475e-05, + "loss": 1.3541, + "step": 43876 + }, + { + "epoch": 0.5484887122178055, + "grad_norm": 4.397434234619141, + "learning_rate": 1.0052797431177761e-05, + "loss": 2.0261, + "step": 43878 + }, + { + "epoch": 0.548513712842821, + "grad_norm": 8.4989013671875, + "learning_rate": 1.0051924754273954e-05, + "loss": 2.101, + "step": 43880 + }, + { + "epoch": 0.5485387134678367, + "grad_norm": 3.980466842651367, + "learning_rate": 1.0051052076974691e-05, + "loss": 1.0076, + "step": 43882 + }, + { + "epoch": 0.5485637140928523, + "grad_norm": 3.6165664196014404, + "learning_rate": 1.0050179399286624e-05, + "loss": 0.1485, + "step": 43884 + }, + { + "epoch": 0.548588714717868, + "grad_norm": 0.5124138593673706, + "learning_rate": 1.0049306721216398e-05, + "loss": 0.0839, + "step": 43886 + }, + { + "epoch": 0.5486137153428836, + "grad_norm": 2.7620887756347656, + "learning_rate": 1.004843404277066e-05, + "loss": 0.7191, + "step": 43888 + }, + { + "epoch": 0.5486387159678991, + "grad_norm": 1.072569489479065, + "learning_rate": 1.0047561363956051e-05, + "loss": 0.1183, + "step": 43890 + }, + { + "epoch": 0.5486637165929148, + "grad_norm": 3.5663938522338867, + "learning_rate": 1.0046688684779226e-05, + "loss": 0.7387, + "step": 43892 + }, + { + "epoch": 0.5486887172179304, + "grad_norm": 7.087403774261475, + "learning_rate": 1.0045816005246827e-05, + "loss": 1.9046, + "step": 43894 + }, + { + "epoch": 0.5487137178429461, + "grad_norm": 6.727963447570801, + "learning_rate": 1.0044943325365498e-05, + "loss": 1.1923, + "step": 43896 + }, + { + "epoch": 0.5487387184679617, + "grad_norm": 10.978946685791016, + "learning_rate": 1.0044070645141884e-05, + "loss": 1.8004, + "step": 43898 + }, + { + "epoch": 0.5487637190929773, + "grad_norm": 4.441436290740967, + "learning_rate": 1.0043197964582635e-05, + "loss": 0.8877, + "step": 43900 + }, + { + "epoch": 0.5487887197179929, + "grad_norm": 2.0450363159179688, + "learning_rate": 1.0042325283694401e-05, + "loss": 1.1343, + "step": 43902 + }, + { + "epoch": 0.5488137203430086, + "grad_norm": 2.6324644088745117, + "learning_rate": 1.004145260248382e-05, + "loss": 0.2922, + "step": 43904 + }, + { + "epoch": 0.5488387209680242, + "grad_norm": 8.152981758117676, + "learning_rate": 1.0040579920957545e-05, + "loss": 2.278, + "step": 43906 + }, + { + "epoch": 0.5488637215930399, + "grad_norm": 3.744370222091675, + "learning_rate": 1.0039707239122216e-05, + "loss": 1.3822, + "step": 43908 + }, + { + "epoch": 0.5488887222180554, + "grad_norm": 4.826137542724609, + "learning_rate": 1.0038834556984483e-05, + "loss": 0.8877, + "step": 43910 + }, + { + "epoch": 0.548913722843071, + "grad_norm": 2.897368907928467, + "learning_rate": 1.0037961874550992e-05, + "loss": 0.7893, + "step": 43912 + }, + { + "epoch": 0.5489387234680867, + "grad_norm": 3.3772406578063965, + "learning_rate": 1.0037089191828386e-05, + "loss": 0.6603, + "step": 43914 + }, + { + "epoch": 0.5489637240931023, + "grad_norm": 0.08477964252233505, + "learning_rate": 1.0036216508823318e-05, + "loss": 0.8847, + "step": 43916 + }, + { + "epoch": 0.548988724718118, + "grad_norm": 0.11064016819000244, + "learning_rate": 1.003534382554243e-05, + "loss": 0.6781, + "step": 43918 + }, + { + "epoch": 0.5490137253431335, + "grad_norm": 5.978853702545166, + "learning_rate": 1.0034471141992366e-05, + "loss": 1.3092, + "step": 43920 + }, + { + "epoch": 0.5490387259681492, + "grad_norm": 3.5946943759918213, + "learning_rate": 1.0033598458179777e-05, + "loss": 1.6229, + "step": 43922 + }, + { + "epoch": 0.5490637265931648, + "grad_norm": 0.6411059498786926, + "learning_rate": 1.0032725774111303e-05, + "loss": 0.0487, + "step": 43924 + }, + { + "epoch": 0.5490887272181805, + "grad_norm": 0.0010994584299623966, + "learning_rate": 1.0031853089793598e-05, + "loss": 0.0, + "step": 43926 + }, + { + "epoch": 0.5491137278431961, + "grad_norm": 3.5829896926879883, + "learning_rate": 1.0030980405233301e-05, + "loss": 1.208, + "step": 43928 + }, + { + "epoch": 0.5491387284682117, + "grad_norm": 2.6956443786621094, + "learning_rate": 1.0030107720437063e-05, + "loss": 0.1746, + "step": 43930 + }, + { + "epoch": 0.5491637290932273, + "grad_norm": 2.585922956466675, + "learning_rate": 1.0029235035411533e-05, + "loss": 1.2347, + "step": 43932 + }, + { + "epoch": 0.549188729718243, + "grad_norm": 0.0020469590090215206, + "learning_rate": 1.0028362350163347e-05, + "loss": 1.0032, + "step": 43934 + }, + { + "epoch": 0.5492137303432586, + "grad_norm": 6.022834300994873, + "learning_rate": 1.0027489664699161e-05, + "loss": 1.068, + "step": 43936 + }, + { + "epoch": 0.5492387309682742, + "grad_norm": 3.4246926307678223, + "learning_rate": 1.0026616979025616e-05, + "loss": 1.1022, + "step": 43938 + }, + { + "epoch": 0.5492637315932898, + "grad_norm": 5.156805992126465, + "learning_rate": 1.002574429314936e-05, + "loss": 1.6956, + "step": 43940 + }, + { + "epoch": 0.5492887322183054, + "grad_norm": 4.267909049987793, + "learning_rate": 1.002487160707704e-05, + "loss": 0.7708, + "step": 43942 + }, + { + "epoch": 0.5493137328433211, + "grad_norm": 5.285261631011963, + "learning_rate": 1.0023998920815303e-05, + "loss": 1.0325, + "step": 43944 + }, + { + "epoch": 0.5493387334683367, + "grad_norm": 3.5626485347747803, + "learning_rate": 1.002312623437079e-05, + "loss": 1.6883, + "step": 43946 + }, + { + "epoch": 0.5493637340933524, + "grad_norm": 3.1108181476593018, + "learning_rate": 1.0022253547750151e-05, + "loss": 1.3248, + "step": 43948 + }, + { + "epoch": 0.5493887347183679, + "grad_norm": 2.5945417881011963, + "learning_rate": 1.0021380860960033e-05, + "loss": 1.3835, + "step": 43950 + }, + { + "epoch": 0.5494137353433836, + "grad_norm": 5.563601016998291, + "learning_rate": 1.0020508174007081e-05, + "loss": 0.8029, + "step": 43952 + }, + { + "epoch": 0.5494387359683992, + "grad_norm": 1.060548186302185, + "learning_rate": 1.0019635486897945e-05, + "loss": 0.0495, + "step": 43954 + }, + { + "epoch": 0.5494637365934149, + "grad_norm": 1.6212377548217773, + "learning_rate": 1.0018762799639264e-05, + "loss": 0.1002, + "step": 43956 + }, + { + "epoch": 0.5494887372184305, + "grad_norm": 1.44969642162323, + "learning_rate": 1.0017890112237692e-05, + "loss": 0.7478, + "step": 43958 + }, + { + "epoch": 0.549513737843446, + "grad_norm": 3.3342361450195312, + "learning_rate": 1.0017017424699869e-05, + "loss": 0.8397, + "step": 43960 + }, + { + "epoch": 0.5495387384684617, + "grad_norm": 3.5068931579589844, + "learning_rate": 1.001614473703244e-05, + "loss": 1.7266, + "step": 43962 + }, + { + "epoch": 0.5495637390934773, + "grad_norm": 3.6984379291534424, + "learning_rate": 1.0015272049242061e-05, + "loss": 0.9259, + "step": 43964 + }, + { + "epoch": 0.549588739718493, + "grad_norm": 0.645072877407074, + "learning_rate": 1.0014399361335369e-05, + "loss": 0.3583, + "step": 43966 + }, + { + "epoch": 0.5496137403435086, + "grad_norm": 2.7792162895202637, + "learning_rate": 1.0013526673319013e-05, + "loss": 1.1993, + "step": 43968 + }, + { + "epoch": 0.5496387409685242, + "grad_norm": 4.43312406539917, + "learning_rate": 1.0012653985199645e-05, + "loss": 1.0044, + "step": 43970 + }, + { + "epoch": 0.5496637415935398, + "grad_norm": 5.751445293426514, + "learning_rate": 1.0011781296983903e-05, + "loss": 0.7218, + "step": 43972 + }, + { + "epoch": 0.5496887422185555, + "grad_norm": 3.4327874183654785, + "learning_rate": 1.0010908608678433e-05, + "loss": 1.4303, + "step": 43974 + }, + { + "epoch": 0.5497137428435711, + "grad_norm": 3.829805850982666, + "learning_rate": 1.0010035920289884e-05, + "loss": 0.8065, + "step": 43976 + }, + { + "epoch": 0.5497387434685868, + "grad_norm": 4.082584857940674, + "learning_rate": 1.000916323182491e-05, + "loss": 0.5534, + "step": 43978 + }, + { + "epoch": 0.5497637440936023, + "grad_norm": 2.4929847717285156, + "learning_rate": 1.0008290543290144e-05, + "loss": 0.3796, + "step": 43980 + }, + { + "epoch": 0.5497887447186179, + "grad_norm": 0.001158130238763988, + "learning_rate": 1.0007417854692241e-05, + "loss": 0.9856, + "step": 43982 + }, + { + "epoch": 0.5498137453436336, + "grad_norm": 4.48045015335083, + "learning_rate": 1.0006545166037846e-05, + "loss": 1.6004, + "step": 43984 + }, + { + "epoch": 0.5498387459686492, + "grad_norm": 0.35683056712150574, + "learning_rate": 1.0005672477333601e-05, + "loss": 1.0096, + "step": 43986 + }, + { + "epoch": 0.5498637465936649, + "grad_norm": 0.002102222526445985, + "learning_rate": 1.0004799788586154e-05, + "loss": 0.9971, + "step": 43988 + }, + { + "epoch": 0.5498887472186804, + "grad_norm": 2.438718318939209, + "learning_rate": 1.0003927099802158e-05, + "loss": 0.3716, + "step": 43990 + }, + { + "epoch": 0.5499137478436961, + "grad_norm": 4.684816837310791, + "learning_rate": 1.0003054410988248e-05, + "loss": 1.3573, + "step": 43992 + }, + { + "epoch": 0.5499387484687117, + "grad_norm": 7.542220592498779, + "learning_rate": 1.0002181722151079e-05, + "loss": 2.0752, + "step": 43994 + }, + { + "epoch": 0.5499637490937274, + "grad_norm": 3.164720058441162, + "learning_rate": 1.0001309033297297e-05, + "loss": 1.0415, + "step": 43996 + }, + { + "epoch": 0.549988749718743, + "grad_norm": 1.994507074356079, + "learning_rate": 1.0000436344433543e-05, + "loss": 0.2436, + "step": 43998 + }, + { + "epoch": 0.5500137503437585, + "grad_norm": 1.16489577293396, + "learning_rate": 9.999563655566463e-06, + "loss": 0.0511, + "step": 44000 + }, + { + "epoch": 0.5500387509687742, + "grad_norm": 6.245860576629639, + "learning_rate": 9.998690966702708e-06, + "loss": 0.8481, + "step": 44002 + }, + { + "epoch": 0.5500637515937898, + "grad_norm": 2.132523775100708, + "learning_rate": 9.997818277848923e-06, + "loss": 0.5529, + "step": 44004 + }, + { + "epoch": 0.5500887522188055, + "grad_norm": 3.322741985321045, + "learning_rate": 9.996945589011753e-06, + "loss": 0.7388, + "step": 44006 + }, + { + "epoch": 0.5501137528438211, + "grad_norm": 2.0089688301086426, + "learning_rate": 9.996072900197845e-06, + "loss": 0.2307, + "step": 44008 + }, + { + "epoch": 0.5501387534688367, + "grad_norm": 3.9801313877105713, + "learning_rate": 9.995200211413846e-06, + "loss": 0.902, + "step": 44010 + }, + { + "epoch": 0.5501637540938523, + "grad_norm": 4.437711238861084, + "learning_rate": 9.994327522666402e-06, + "loss": 1.0512, + "step": 44012 + }, + { + "epoch": 0.550188754718868, + "grad_norm": 0.0011599576100707054, + "learning_rate": 9.99345483396216e-06, + "loss": 0.6194, + "step": 44014 + }, + { + "epoch": 0.5502137553438836, + "grad_norm": 0.4260721206665039, + "learning_rate": 9.992582145307762e-06, + "loss": 0.0181, + "step": 44016 + }, + { + "epoch": 0.5502387559688993, + "grad_norm": 8.944369316101074, + "learning_rate": 9.991709456709858e-06, + "loss": 2.0947, + "step": 44018 + }, + { + "epoch": 0.5502637565939148, + "grad_norm": 9.537036895751953, + "learning_rate": 9.990836768175094e-06, + "loss": 1.3544, + "step": 44020 + }, + { + "epoch": 0.5502887572189304, + "grad_norm": 5.350013732910156, + "learning_rate": 9.989964079710114e-06, + "loss": 0.8757, + "step": 44022 + }, + { + "epoch": 0.5503137578439461, + "grad_norm": 3.498995065689087, + "learning_rate": 9.989091391321569e-06, + "loss": 1.3354, + "step": 44024 + }, + { + "epoch": 0.5503387584689617, + "grad_norm": 5.055516719818115, + "learning_rate": 9.988218703016104e-06, + "loss": 1.0227, + "step": 44026 + }, + { + "epoch": 0.5503637590939774, + "grad_norm": 3.4807050228118896, + "learning_rate": 9.987346014800362e-06, + "loss": 0.6809, + "step": 44028 + }, + { + "epoch": 0.5503887597189929, + "grad_norm": 0.22062911093235016, + "learning_rate": 9.986473326680988e-06, + "loss": 0.0221, + "step": 44030 + }, + { + "epoch": 0.5504137603440086, + "grad_norm": 3.5043914318084717, + "learning_rate": 9.985600638664633e-06, + "loss": 1.5307, + "step": 44032 + }, + { + "epoch": 0.5504387609690242, + "grad_norm": 3.8719029426574707, + "learning_rate": 9.984727950757942e-06, + "loss": 1.2394, + "step": 44034 + }, + { + "epoch": 0.5504637615940399, + "grad_norm": 0.000993223744444549, + "learning_rate": 9.98385526296756e-06, + "loss": 0.0, + "step": 44036 + }, + { + "epoch": 0.5504887622190555, + "grad_norm": 2.7690200805664062, + "learning_rate": 9.982982575300134e-06, + "loss": 0.6399, + "step": 44038 + }, + { + "epoch": 0.550513762844071, + "grad_norm": 0.0006831132341176271, + "learning_rate": 9.982109887762313e-06, + "loss": 0.0015, + "step": 44040 + }, + { + "epoch": 0.5505387634690867, + "grad_norm": 2.5083868503570557, + "learning_rate": 9.981237200360737e-06, + "loss": 0.7362, + "step": 44042 + }, + { + "epoch": 0.5505637640941023, + "grad_norm": 0.03505905717611313, + "learning_rate": 9.98036451310206e-06, + "loss": 0.7169, + "step": 44044 + }, + { + "epoch": 0.550588764719118, + "grad_norm": 0.04468471556901932, + "learning_rate": 9.97949182599292e-06, + "loss": 0.0019, + "step": 44046 + }, + { + "epoch": 0.5506137653441336, + "grad_norm": 6.179903507232666, + "learning_rate": 9.978619139039969e-06, + "loss": 1.3662, + "step": 44048 + }, + { + "epoch": 0.5506387659691492, + "grad_norm": 8.255155563354492, + "learning_rate": 9.97774645224985e-06, + "loss": 1.1866, + "step": 44050 + }, + { + "epoch": 0.5506637665941648, + "grad_norm": 4.134150505065918, + "learning_rate": 9.976873765629214e-06, + "loss": 1.443, + "step": 44052 + }, + { + "epoch": 0.5506887672191805, + "grad_norm": 4.480398178100586, + "learning_rate": 9.976001079184702e-06, + "loss": 0.7876, + "step": 44054 + }, + { + "epoch": 0.5507137678441961, + "grad_norm": 3.4098713397979736, + "learning_rate": 9.975128392922964e-06, + "loss": 1.9642, + "step": 44056 + }, + { + "epoch": 0.5507387684692118, + "grad_norm": 0.000545640243217349, + "learning_rate": 9.974255706850643e-06, + "loss": 0.0052, + "step": 44058 + }, + { + "epoch": 0.5507637690942273, + "grad_norm": 10.526080131530762, + "learning_rate": 9.973383020974386e-06, + "loss": 1.4026, + "step": 44060 + }, + { + "epoch": 0.550788769719243, + "grad_norm": 5.066817760467529, + "learning_rate": 9.972510335300842e-06, + "loss": 2.3417, + "step": 44062 + }, + { + "epoch": 0.5508137703442586, + "grad_norm": 2.535794734954834, + "learning_rate": 9.971637649836654e-06, + "loss": 1.379, + "step": 44064 + }, + { + "epoch": 0.5508387709692742, + "grad_norm": 4.510377883911133, + "learning_rate": 9.970764964588473e-06, + "loss": 1.7723, + "step": 44066 + }, + { + "epoch": 0.5508637715942899, + "grad_norm": 6.13994836807251, + "learning_rate": 9.969892279562939e-06, + "loss": 1.4114, + "step": 44068 + }, + { + "epoch": 0.5508887722193054, + "grad_norm": 2.918482780456543, + "learning_rate": 9.9690195947667e-06, + "loss": 0.8811, + "step": 44070 + }, + { + "epoch": 0.5509137728443211, + "grad_norm": 2.365581512451172, + "learning_rate": 9.968146910206405e-06, + "loss": 1.592, + "step": 44072 + }, + { + "epoch": 0.5509387734693367, + "grad_norm": 2.5985543727874756, + "learning_rate": 9.967274225888699e-06, + "loss": 1.2537, + "step": 44074 + }, + { + "epoch": 0.5509637740943524, + "grad_norm": 1.3986701965332031, + "learning_rate": 9.966401541820228e-06, + "loss": 1.2209, + "step": 44076 + }, + { + "epoch": 0.550988774719368, + "grad_norm": 5.593341827392578, + "learning_rate": 9.96552885800764e-06, + "loss": 1.3071, + "step": 44078 + }, + { + "epoch": 0.5510137753443836, + "grad_norm": 5.98781156539917, + "learning_rate": 9.964656174457575e-06, + "loss": 1.8121, + "step": 44080 + }, + { + "epoch": 0.5510387759693992, + "grad_norm": 4.6284403800964355, + "learning_rate": 9.963783491176686e-06, + "loss": 1.2454, + "step": 44082 + }, + { + "epoch": 0.5510637765944149, + "grad_norm": 2.3868157863616943, + "learning_rate": 9.962910808171616e-06, + "loss": 1.1284, + "step": 44084 + }, + { + "epoch": 0.5510887772194305, + "grad_norm": 2.340611219406128, + "learning_rate": 9.962038125449011e-06, + "loss": 1.9571, + "step": 44086 + }, + { + "epoch": 0.5511137778444462, + "grad_norm": 3.786929130554199, + "learning_rate": 9.96116544301552e-06, + "loss": 1.1722, + "step": 44088 + }, + { + "epoch": 0.5511387784694617, + "grad_norm": 4.142889022827148, + "learning_rate": 9.960292760877787e-06, + "loss": 1.1975, + "step": 44090 + }, + { + "epoch": 0.5511637790944773, + "grad_norm": 2.9571645259857178, + "learning_rate": 9.959420079042461e-06, + "loss": 0.8614, + "step": 44092 + }, + { + "epoch": 0.551188779719493, + "grad_norm": 2.099531650543213, + "learning_rate": 9.958547397516182e-06, + "loss": 1.6046, + "step": 44094 + }, + { + "epoch": 0.5512137803445086, + "grad_norm": 2.945878028869629, + "learning_rate": 9.957674716305602e-06, + "loss": 1.012, + "step": 44096 + }, + { + "epoch": 0.5512387809695243, + "grad_norm": 2.361239433288574, + "learning_rate": 9.956802035417366e-06, + "loss": 2.0541, + "step": 44098 + }, + { + "epoch": 0.5512637815945398, + "grad_norm": 1.5600237846374512, + "learning_rate": 9.955929354858117e-06, + "loss": 0.8544, + "step": 44100 + }, + { + "epoch": 0.5512887822195555, + "grad_norm": 1.9980822801589966, + "learning_rate": 9.955056674634507e-06, + "loss": 0.6556, + "step": 44102 + }, + { + "epoch": 0.5513137828445711, + "grad_norm": 0.0756322517991066, + "learning_rate": 9.95418399475318e-06, + "loss": 0.7055, + "step": 44104 + }, + { + "epoch": 0.5513387834695868, + "grad_norm": 0.0034015921410173178, + "learning_rate": 9.953311315220779e-06, + "loss": 0.4828, + "step": 44106 + }, + { + "epoch": 0.5513637840946024, + "grad_norm": 2.549738883972168, + "learning_rate": 9.95243863604395e-06, + "loss": 0.4009, + "step": 44108 + }, + { + "epoch": 0.5513887847196179, + "grad_norm": 2.982029676437378, + "learning_rate": 9.951565957229344e-06, + "loss": 1.3176, + "step": 44110 + }, + { + "epoch": 0.5514137853446336, + "grad_norm": 12.133543014526367, + "learning_rate": 9.950693278783605e-06, + "loss": 0.252, + "step": 44112 + }, + { + "epoch": 0.5514387859696492, + "grad_norm": 4.088210582733154, + "learning_rate": 9.94982060071338e-06, + "loss": 1.6568, + "step": 44114 + }, + { + "epoch": 0.5514637865946649, + "grad_norm": 4.706923484802246, + "learning_rate": 9.94894792302531e-06, + "loss": 1.4818, + "step": 44116 + }, + { + "epoch": 0.5514887872196805, + "grad_norm": 2.743797540664673, + "learning_rate": 9.948075245726052e-06, + "loss": 0.5088, + "step": 44118 + }, + { + "epoch": 0.5515137878446961, + "grad_norm": 4.537893772125244, + "learning_rate": 9.947202568822242e-06, + "loss": 1.303, + "step": 44120 + }, + { + "epoch": 0.5515387884697117, + "grad_norm": 0.025700220838189125, + "learning_rate": 9.946329892320529e-06, + "loss": 0.5914, + "step": 44122 + }, + { + "epoch": 0.5515637890947274, + "grad_norm": 2.7370762825012207, + "learning_rate": 9.94545721622756e-06, + "loss": 0.5118, + "step": 44124 + }, + { + "epoch": 0.551588789719743, + "grad_norm": 1.0545645952224731, + "learning_rate": 9.944584540549982e-06, + "loss": 0.0412, + "step": 44126 + }, + { + "epoch": 0.5516137903447587, + "grad_norm": 1.4742521047592163, + "learning_rate": 9.943711865294438e-06, + "loss": 0.5211, + "step": 44128 + }, + { + "epoch": 0.5516387909697742, + "grad_norm": 1.1768903732299805, + "learning_rate": 9.942839190467583e-06, + "loss": 0.5257, + "step": 44130 + }, + { + "epoch": 0.5516637915947898, + "grad_norm": 2.6140034198760986, + "learning_rate": 9.94196651607605e-06, + "loss": 0.9372, + "step": 44132 + }, + { + "epoch": 0.5516887922198055, + "grad_norm": 2.030763864517212, + "learning_rate": 9.941093842126495e-06, + "loss": 0.2371, + "step": 44134 + }, + { + "epoch": 0.5517137928448211, + "grad_norm": 3.9640932083129883, + "learning_rate": 9.940221168625558e-06, + "loss": 0.8767, + "step": 44136 + }, + { + "epoch": 0.5517387934698368, + "grad_norm": 2.9735772609710693, + "learning_rate": 9.939348495579892e-06, + "loss": 0.3256, + "step": 44138 + }, + { + "epoch": 0.5517637940948523, + "grad_norm": 0.002127317013218999, + "learning_rate": 9.938475822996134e-06, + "loss": 0.0012, + "step": 44140 + }, + { + "epoch": 0.551788794719868, + "grad_norm": 5.814737319946289, + "learning_rate": 9.937603150880938e-06, + "loss": 1.4696, + "step": 44142 + }, + { + "epoch": 0.5518137953448836, + "grad_norm": 0.0017465458950027823, + "learning_rate": 9.936730479240953e-06, + "loss": 0.0246, + "step": 44144 + }, + { + "epoch": 0.5518387959698993, + "grad_norm": 3.2235326766967773, + "learning_rate": 9.935857808082815e-06, + "loss": 1.7849, + "step": 44146 + }, + { + "epoch": 0.5518637965949149, + "grad_norm": 0.016007887199521065, + "learning_rate": 9.934985137413174e-06, + "loss": 0.0003, + "step": 44148 + }, + { + "epoch": 0.5518887972199304, + "grad_norm": 6.556221961975098, + "learning_rate": 9.93411246723868e-06, + "loss": 0.7768, + "step": 44150 + }, + { + "epoch": 0.5519137978449461, + "grad_norm": 4.203087329864502, + "learning_rate": 9.93323979756597e-06, + "loss": 1.4495, + "step": 44152 + }, + { + "epoch": 0.5519387984699617, + "grad_norm": 3.344066858291626, + "learning_rate": 9.9323671284017e-06, + "loss": 1.6814, + "step": 44154 + }, + { + "epoch": 0.5519637990949774, + "grad_norm": 4.862980842590332, + "learning_rate": 9.931494459752518e-06, + "loss": 0.8252, + "step": 44156 + }, + { + "epoch": 0.551988799719993, + "grad_norm": 4.087234973907471, + "learning_rate": 9.930621791625058e-06, + "loss": 1.6702, + "step": 44158 + }, + { + "epoch": 0.5520138003450086, + "grad_norm": 3.694429397583008, + "learning_rate": 9.929749124025974e-06, + "loss": 0.8893, + "step": 44160 + }, + { + "epoch": 0.5520388009700242, + "grad_norm": 3.4579646587371826, + "learning_rate": 9.92887645696191e-06, + "loss": 1.2599, + "step": 44162 + }, + { + "epoch": 0.5520638015950399, + "grad_norm": 0.6797775626182556, + "learning_rate": 9.928003790439514e-06, + "loss": 1.0677, + "step": 44164 + }, + { + "epoch": 0.5520888022200555, + "grad_norm": 0.027537677437067032, + "learning_rate": 9.927131124465429e-06, + "loss": 0.8204, + "step": 44166 + }, + { + "epoch": 0.5521138028450712, + "grad_norm": 5.9793901443481445, + "learning_rate": 9.926258459046303e-06, + "loss": 1.5171, + "step": 44168 + }, + { + "epoch": 0.5521388034700867, + "grad_norm": 5.157558441162109, + "learning_rate": 9.925385794188788e-06, + "loss": 0.9327, + "step": 44170 + }, + { + "epoch": 0.5521638040951024, + "grad_norm": 0.0007169722230173647, + "learning_rate": 9.92451312989952e-06, + "loss": 0.6934, + "step": 44172 + }, + { + "epoch": 0.552188804720118, + "grad_norm": 0.3164816200733185, + "learning_rate": 9.923640466185149e-06, + "loss": 0.9823, + "step": 44174 + }, + { + "epoch": 0.5522138053451336, + "grad_norm": 3.685755491256714, + "learning_rate": 9.92276780305232e-06, + "loss": 0.3877, + "step": 44176 + }, + { + "epoch": 0.5522388059701493, + "grad_norm": 3.6434578895568848, + "learning_rate": 9.92189514050768e-06, + "loss": 0.919, + "step": 44178 + }, + { + "epoch": 0.5522638065951648, + "grad_norm": 1.0020569562911987, + "learning_rate": 9.921022478557878e-06, + "loss": 0.406, + "step": 44180 + }, + { + "epoch": 0.5522888072201805, + "grad_norm": 3.3359017372131348, + "learning_rate": 9.920149817209562e-06, + "loss": 1.0436, + "step": 44182 + }, + { + "epoch": 0.5523138078451961, + "grad_norm": 1.5437815189361572, + "learning_rate": 9.919277156469368e-06, + "loss": 0.0776, + "step": 44184 + }, + { + "epoch": 0.5523388084702118, + "grad_norm": 2.4787497520446777, + "learning_rate": 9.918404496343949e-06, + "loss": 0.1291, + "step": 44186 + }, + { + "epoch": 0.5523638090952274, + "grad_norm": 0.8519238829612732, + "learning_rate": 9.917531836839949e-06, + "loss": 0.3589, + "step": 44188 + }, + { + "epoch": 0.552388809720243, + "grad_norm": 4.519127368927002, + "learning_rate": 9.916659177964013e-06, + "loss": 1.8744, + "step": 44190 + }, + { + "epoch": 0.5524138103452586, + "grad_norm": 7.585314750671387, + "learning_rate": 9.915786519722792e-06, + "loss": 0.5674, + "step": 44192 + }, + { + "epoch": 0.5524388109702743, + "grad_norm": 0.0008498240495100617, + "learning_rate": 9.914913862122927e-06, + "loss": 1.168, + "step": 44194 + }, + { + "epoch": 0.5524638115952899, + "grad_norm": 0.007210689131170511, + "learning_rate": 9.91404120517107e-06, + "loss": 0.0002, + "step": 44196 + }, + { + "epoch": 0.5524888122203055, + "grad_norm": 3.7102534770965576, + "learning_rate": 9.913168548873861e-06, + "loss": 1.36, + "step": 44198 + }, + { + "epoch": 0.5525138128453211, + "grad_norm": 4.071348190307617, + "learning_rate": 9.912295893237947e-06, + "loss": 1.1543, + "step": 44200 + }, + { + "epoch": 0.5525388134703367, + "grad_norm": 3.2293834686279297, + "learning_rate": 9.911423238269974e-06, + "loss": 1.5099, + "step": 44202 + }, + { + "epoch": 0.5525638140953524, + "grad_norm": 4.251216411590576, + "learning_rate": 9.91055058397659e-06, + "loss": 2.1764, + "step": 44204 + }, + { + "epoch": 0.552588814720368, + "grad_norm": 0.0009017070406116545, + "learning_rate": 9.909677930364441e-06, + "loss": 0.0007, + "step": 44206 + }, + { + "epoch": 0.5526138153453837, + "grad_norm": 3.9537322521209717, + "learning_rate": 9.908805277440176e-06, + "loss": 0.7082, + "step": 44208 + }, + { + "epoch": 0.5526388159703992, + "grad_norm": 5.283222198486328, + "learning_rate": 9.907932625210432e-06, + "loss": 1.4357, + "step": 44210 + }, + { + "epoch": 0.5526638165954149, + "grad_norm": 5.239121913909912, + "learning_rate": 9.90705997368186e-06, + "loss": 1.7205, + "step": 44212 + }, + { + "epoch": 0.5526888172204305, + "grad_norm": 4.162136077880859, + "learning_rate": 9.906187322861105e-06, + "loss": 1.1514, + "step": 44214 + }, + { + "epoch": 0.5527138178454462, + "grad_norm": 4.553075790405273, + "learning_rate": 9.905314672754817e-06, + "loss": 1.954, + "step": 44216 + }, + { + "epoch": 0.5527388184704618, + "grad_norm": 3.195310354232788, + "learning_rate": 9.904442023369637e-06, + "loss": 1.2332, + "step": 44218 + }, + { + "epoch": 0.5527638190954773, + "grad_norm": 6.150961399078369, + "learning_rate": 9.903569374712214e-06, + "loss": 1.9627, + "step": 44220 + }, + { + "epoch": 0.552788819720493, + "grad_norm": 1.8819079399108887, + "learning_rate": 9.902696726789197e-06, + "loss": 0.1874, + "step": 44222 + }, + { + "epoch": 0.5528138203455086, + "grad_norm": 3.309772491455078, + "learning_rate": 9.901824079607224e-06, + "loss": 0.7965, + "step": 44224 + }, + { + "epoch": 0.5528388209705243, + "grad_norm": 2.6067187786102295, + "learning_rate": 9.900951433172942e-06, + "loss": 0.464, + "step": 44226 + }, + { + "epoch": 0.5528638215955399, + "grad_norm": 1.8424094915390015, + "learning_rate": 9.900078787493004e-06, + "loss": 0.203, + "step": 44228 + }, + { + "epoch": 0.5528888222205555, + "grad_norm": 0.9854472279548645, + "learning_rate": 9.89920614257405e-06, + "loss": 0.173, + "step": 44230 + }, + { + "epoch": 0.5529138228455711, + "grad_norm": 4.747757911682129, + "learning_rate": 9.898333498422728e-06, + "loss": 1.4268, + "step": 44232 + }, + { + "epoch": 0.5529388234705868, + "grad_norm": 2.7698452472686768, + "learning_rate": 9.89746085504569e-06, + "loss": 1.0857, + "step": 44234 + }, + { + "epoch": 0.5529638240956024, + "grad_norm": 0.09372737258672714, + "learning_rate": 9.89658821244957e-06, + "loss": 0.4633, + "step": 44236 + }, + { + "epoch": 0.552988824720618, + "grad_norm": 3.623660087585449, + "learning_rate": 9.89571557064102e-06, + "loss": 1.7661, + "step": 44238 + }, + { + "epoch": 0.5530138253456336, + "grad_norm": 4.127478122711182, + "learning_rate": 9.894842929626681e-06, + "loss": 0.8958, + "step": 44240 + }, + { + "epoch": 0.5530388259706492, + "grad_norm": 0.0013267425820231438, + "learning_rate": 9.893970289413208e-06, + "loss": 0.3893, + "step": 44242 + }, + { + "epoch": 0.5530638265956649, + "grad_norm": 3.7092130184173584, + "learning_rate": 9.893097650007241e-06, + "loss": 1.4677, + "step": 44244 + }, + { + "epoch": 0.5530888272206805, + "grad_norm": 4.934807300567627, + "learning_rate": 9.892225011415427e-06, + "loss": 1.0072, + "step": 44246 + }, + { + "epoch": 0.5531138278456962, + "grad_norm": 1.677810549736023, + "learning_rate": 9.891352373644417e-06, + "loss": 0.0388, + "step": 44248 + }, + { + "epoch": 0.5531388284707117, + "grad_norm": 16.760616302490234, + "learning_rate": 9.890479736700848e-06, + "loss": 2.6997, + "step": 44250 + }, + { + "epoch": 0.5531638290957274, + "grad_norm": 3.447528123855591, + "learning_rate": 9.889607100591368e-06, + "loss": 1.3912, + "step": 44252 + }, + { + "epoch": 0.553188829720743, + "grad_norm": 3.151043176651001, + "learning_rate": 9.888734465322625e-06, + "loss": 1.3105, + "step": 44254 + }, + { + "epoch": 0.5532138303457587, + "grad_norm": 6.731167316436768, + "learning_rate": 9.887861830901266e-06, + "loss": 1.7911, + "step": 44256 + }, + { + "epoch": 0.5532388309707743, + "grad_norm": 5.320699691772461, + "learning_rate": 9.886989197333935e-06, + "loss": 1.1769, + "step": 44258 + }, + { + "epoch": 0.5532638315957898, + "grad_norm": 2.7533812522888184, + "learning_rate": 9.886116564627277e-06, + "loss": 1.5199, + "step": 44260 + }, + { + "epoch": 0.5532888322208055, + "grad_norm": 0.9253056049346924, + "learning_rate": 9.885243932787946e-06, + "loss": 0.4152, + "step": 44262 + }, + { + "epoch": 0.5533138328458211, + "grad_norm": 6.98151159286499, + "learning_rate": 9.884371301822575e-06, + "loss": 1.0925, + "step": 44264 + }, + { + "epoch": 0.5533388334708368, + "grad_norm": 0.19456249475479126, + "learning_rate": 9.883498671737816e-06, + "loss": 0.9166, + "step": 44266 + }, + { + "epoch": 0.5533638340958524, + "grad_norm": 7.023538589477539, + "learning_rate": 9.882626042540314e-06, + "loss": 0.8048, + "step": 44268 + }, + { + "epoch": 0.553388834720868, + "grad_norm": 2.503964900970459, + "learning_rate": 9.881753414236717e-06, + "loss": 0.5153, + "step": 44270 + }, + { + "epoch": 0.5534138353458836, + "grad_norm": 0.023287907242774963, + "learning_rate": 9.88088078683367e-06, + "loss": 0.2086, + "step": 44272 + }, + { + "epoch": 0.5534388359708993, + "grad_norm": 0.0007029918488115072, + "learning_rate": 9.88000816033782e-06, + "loss": 0.1605, + "step": 44274 + }, + { + "epoch": 0.5534638365959149, + "grad_norm": 1.1280163526535034, + "learning_rate": 9.879135534755805e-06, + "loss": 1.2392, + "step": 44276 + }, + { + "epoch": 0.5534888372209306, + "grad_norm": 0.0503096729516983, + "learning_rate": 9.87826291009428e-06, + "loss": 0.0009, + "step": 44278 + }, + { + "epoch": 0.5535138378459461, + "grad_norm": 2.662393808364868, + "learning_rate": 9.877390286359887e-06, + "loss": 1.0826, + "step": 44280 + }, + { + "epoch": 0.5535388384709617, + "grad_norm": 5.1161909103393555, + "learning_rate": 9.876517663559273e-06, + "loss": 1.7863, + "step": 44282 + }, + { + "epoch": 0.5535638390959774, + "grad_norm": 1.3853318691253662, + "learning_rate": 9.875645041699082e-06, + "loss": 0.4193, + "step": 44284 + }, + { + "epoch": 0.553588839720993, + "grad_norm": 2.2917733192443848, + "learning_rate": 9.87477242078596e-06, + "loss": 0.6294, + "step": 44286 + }, + { + "epoch": 0.5536138403460087, + "grad_norm": 0.04957178980112076, + "learning_rate": 9.873899800826556e-06, + "loss": 0.4878, + "step": 44288 + }, + { + "epoch": 0.5536388409710242, + "grad_norm": 2.2238752841949463, + "learning_rate": 9.873027181827512e-06, + "loss": 1.2561, + "step": 44290 + }, + { + "epoch": 0.5536638415960399, + "grad_norm": 2.664621591567993, + "learning_rate": 9.872154563795475e-06, + "loss": 0.6975, + "step": 44292 + }, + { + "epoch": 0.5536888422210555, + "grad_norm": 1.0054254531860352, + "learning_rate": 9.87128194673709e-06, + "loss": 0.6783, + "step": 44294 + }, + { + "epoch": 0.5537138428460712, + "grad_norm": 2.832244634628296, + "learning_rate": 9.870409330659004e-06, + "loss": 1.2462, + "step": 44296 + }, + { + "epoch": 0.5537388434710868, + "grad_norm": 7.554788112640381, + "learning_rate": 9.869536715567861e-06, + "loss": 1.6683, + "step": 44298 + }, + { + "epoch": 0.5537638440961024, + "grad_norm": 3.159376621246338, + "learning_rate": 9.868664101470313e-06, + "loss": 1.1082, + "step": 44300 + }, + { + "epoch": 0.553788844721118, + "grad_norm": 0.012021898292005062, + "learning_rate": 9.867791488372996e-06, + "loss": 0.0002, + "step": 44302 + }, + { + "epoch": 0.5538138453461336, + "grad_norm": 4.339484214782715, + "learning_rate": 9.866918876282562e-06, + "loss": 1.3341, + "step": 44304 + }, + { + "epoch": 0.5538388459711493, + "grad_norm": 0.29586294293403625, + "learning_rate": 9.866046265205654e-06, + "loss": 0.0437, + "step": 44306 + }, + { + "epoch": 0.5538638465961649, + "grad_norm": 0.022298576310276985, + "learning_rate": 9.865173655148918e-06, + "loss": 0.5385, + "step": 44308 + }, + { + "epoch": 0.5538888472211805, + "grad_norm": 4.914283752441406, + "learning_rate": 9.864301046119002e-06, + "loss": 0.9598, + "step": 44310 + }, + { + "epoch": 0.5539138478461961, + "grad_norm": 5.84483003616333, + "learning_rate": 9.863428438122548e-06, + "loss": 1.2025, + "step": 44312 + }, + { + "epoch": 0.5539388484712118, + "grad_norm": 3.0553250312805176, + "learning_rate": 9.862555831166209e-06, + "loss": 1.0894, + "step": 44314 + }, + { + "epoch": 0.5539638490962274, + "grad_norm": 0.6804731488227844, + "learning_rate": 9.861683225256621e-06, + "loss": 0.2346, + "step": 44316 + }, + { + "epoch": 0.5539888497212431, + "grad_norm": 0.0006427220068871975, + "learning_rate": 9.860810620400434e-06, + "loss": 0.0641, + "step": 44318 + }, + { + "epoch": 0.5540138503462586, + "grad_norm": 3.6784300804138184, + "learning_rate": 9.859938016604293e-06, + "loss": 1.335, + "step": 44320 + }, + { + "epoch": 0.5540388509712743, + "grad_norm": 2.5259101390838623, + "learning_rate": 9.859065413874846e-06, + "loss": 1.5325, + "step": 44322 + }, + { + "epoch": 0.5540638515962899, + "grad_norm": 5.113340854644775, + "learning_rate": 9.858192812218736e-06, + "loss": 2.1878, + "step": 44324 + }, + { + "epoch": 0.5540888522213056, + "grad_norm": 3.8414735794067383, + "learning_rate": 9.857320211642611e-06, + "loss": 0.5025, + "step": 44326 + }, + { + "epoch": 0.5541138528463212, + "grad_norm": 4.217856407165527, + "learning_rate": 9.856447612153114e-06, + "loss": 0.8596, + "step": 44328 + }, + { + "epoch": 0.5541388534713367, + "grad_norm": 1.2060343027114868, + "learning_rate": 9.855575013756892e-06, + "loss": 0.7628, + "step": 44330 + }, + { + "epoch": 0.5541638540963524, + "grad_norm": 0.8279228210449219, + "learning_rate": 9.854702416460589e-06, + "loss": 0.4223, + "step": 44332 + }, + { + "epoch": 0.554188854721368, + "grad_norm": 1.9776068925857544, + "learning_rate": 9.853829820270853e-06, + "loss": 0.8232, + "step": 44334 + }, + { + "epoch": 0.5542138553463837, + "grad_norm": 2.4956612586975098, + "learning_rate": 9.852957225194327e-06, + "loss": 1.7001, + "step": 44336 + }, + { + "epoch": 0.5542388559713993, + "grad_norm": 5.015460014343262, + "learning_rate": 9.85208463123766e-06, + "loss": 1.5327, + "step": 44338 + }, + { + "epoch": 0.5542638565964149, + "grad_norm": 0.0007752808160148561, + "learning_rate": 9.851212038407495e-06, + "loss": 0.1708, + "step": 44340 + }, + { + "epoch": 0.5542888572214305, + "grad_norm": 2.2468347549438477, + "learning_rate": 9.850339446710478e-06, + "loss": 0.6016, + "step": 44342 + }, + { + "epoch": 0.5543138578464462, + "grad_norm": 0.0008642856846563518, + "learning_rate": 9.849466856153254e-06, + "loss": 0.534, + "step": 44344 + }, + { + "epoch": 0.5543388584714618, + "grad_norm": 4.436891555786133, + "learning_rate": 9.848594266742469e-06, + "loss": 1.2859, + "step": 44346 + }, + { + "epoch": 0.5543638590964775, + "grad_norm": 0.5460945963859558, + "learning_rate": 9.84772167848477e-06, + "loss": 0.911, + "step": 44348 + }, + { + "epoch": 0.554388859721493, + "grad_norm": 1.5352208614349365, + "learning_rate": 9.846849091386799e-06, + "loss": 0.1516, + "step": 44350 + }, + { + "epoch": 0.5544138603465086, + "grad_norm": 4.602029323577881, + "learning_rate": 9.845976505455207e-06, + "loss": 0.4374, + "step": 44352 + }, + { + "epoch": 0.5544388609715243, + "grad_norm": 5.146188259124756, + "learning_rate": 9.845103920696634e-06, + "loss": 1.8117, + "step": 44354 + }, + { + "epoch": 0.5544638615965399, + "grad_norm": 1.9790855646133423, + "learning_rate": 9.844231337117728e-06, + "loss": 0.4093, + "step": 44356 + }, + { + "epoch": 0.5544888622215556, + "grad_norm": 2.4109318256378174, + "learning_rate": 9.843358754725133e-06, + "loss": 1.3054, + "step": 44358 + }, + { + "epoch": 0.5545138628465711, + "grad_norm": 5.367937088012695, + "learning_rate": 9.842486173525497e-06, + "loss": 1.7833, + "step": 44360 + }, + { + "epoch": 0.5545388634715868, + "grad_norm": 3.221508026123047, + "learning_rate": 9.841613593525462e-06, + "loss": 1.7167, + "step": 44362 + }, + { + "epoch": 0.5545638640966024, + "grad_norm": 4.326646327972412, + "learning_rate": 9.840741014731678e-06, + "loss": 1.2594, + "step": 44364 + }, + { + "epoch": 0.5545888647216181, + "grad_norm": 4.733490943908691, + "learning_rate": 9.839868437150788e-06, + "loss": 1.7496, + "step": 44366 + }, + { + "epoch": 0.5546138653466337, + "grad_norm": 2.7277562618255615, + "learning_rate": 9.838995860789435e-06, + "loss": 0.998, + "step": 44368 + }, + { + "epoch": 0.5546388659716492, + "grad_norm": 2.7406797409057617, + "learning_rate": 9.838123285654268e-06, + "loss": 0.0324, + "step": 44370 + }, + { + "epoch": 0.5546638665966649, + "grad_norm": 0.001970674842596054, + "learning_rate": 9.837250711751931e-06, + "loss": 1.1966, + "step": 44372 + }, + { + "epoch": 0.5546888672216805, + "grad_norm": 0.0009642253280617297, + "learning_rate": 9.83637813908907e-06, + "loss": 1.3636, + "step": 44374 + }, + { + "epoch": 0.5547138678466962, + "grad_norm": 4.352071285247803, + "learning_rate": 9.83550556767233e-06, + "loss": 1.2366, + "step": 44376 + }, + { + "epoch": 0.5547388684717118, + "grad_norm": 5.617002964019775, + "learning_rate": 9.834632997508357e-06, + "loss": 1.1841, + "step": 44378 + }, + { + "epoch": 0.5547638690967274, + "grad_norm": 3.149007558822632, + "learning_rate": 9.833760428603795e-06, + "loss": 1.4654, + "step": 44380 + }, + { + "epoch": 0.554788869721743, + "grad_norm": 3.802324056625366, + "learning_rate": 9.832887860965289e-06, + "loss": 0.6614, + "step": 44382 + }, + { + "epoch": 0.5548138703467587, + "grad_norm": 4.000087738037109, + "learning_rate": 9.832015294599485e-06, + "loss": 1.5257, + "step": 44384 + }, + { + "epoch": 0.5548388709717743, + "grad_norm": 2.4412479400634766, + "learning_rate": 9.831142729513031e-06, + "loss": 0.824, + "step": 44386 + }, + { + "epoch": 0.55486387159679, + "grad_norm": 8.991660118103027, + "learning_rate": 9.830270165712568e-06, + "loss": 2.24, + "step": 44388 + }, + { + "epoch": 0.5548888722218055, + "grad_norm": 4.701828479766846, + "learning_rate": 9.829397603204745e-06, + "loss": 1.3378, + "step": 44390 + }, + { + "epoch": 0.5549138728468211, + "grad_norm": 2.658109188079834, + "learning_rate": 9.828525041996206e-06, + "loss": 0.713, + "step": 44392 + }, + { + "epoch": 0.5549388734718368, + "grad_norm": 1.0381829738616943, + "learning_rate": 9.827652482093596e-06, + "loss": 0.4932, + "step": 44394 + }, + { + "epoch": 0.5549638740968524, + "grad_norm": 0.00109258689917624, + "learning_rate": 9.826779923503557e-06, + "loss": 0.4332, + "step": 44396 + }, + { + "epoch": 0.5549888747218681, + "grad_norm": 1.7527621984481812, + "learning_rate": 9.825907366232741e-06, + "loss": 0.0794, + "step": 44398 + }, + { + "epoch": 0.5550138753468836, + "grad_norm": 8.037742614746094, + "learning_rate": 9.825034810287789e-06, + "loss": 0.9652, + "step": 44400 + }, + { + "epoch": 0.5550388759718993, + "grad_norm": 2.6594340801239014, + "learning_rate": 9.824162255675348e-06, + "loss": 0.7075, + "step": 44402 + }, + { + "epoch": 0.5550638765969149, + "grad_norm": 14.640095710754395, + "learning_rate": 9.823289702402063e-06, + "loss": 1.1879, + "step": 44404 + }, + { + "epoch": 0.5550888772219306, + "grad_norm": 4.492846965789795, + "learning_rate": 9.822417150474576e-06, + "loss": 0.7266, + "step": 44406 + }, + { + "epoch": 0.5551138778469462, + "grad_norm": 1.6512744426727295, + "learning_rate": 9.821544599899536e-06, + "loss": 0.4745, + "step": 44408 + }, + { + "epoch": 0.5551388784719617, + "grad_norm": 2.664806842803955, + "learning_rate": 9.820672050683586e-06, + "loss": 1.1951, + "step": 44410 + }, + { + "epoch": 0.5551638790969774, + "grad_norm": 3.057474374771118, + "learning_rate": 9.819799502833373e-06, + "loss": 0.9483, + "step": 44412 + }, + { + "epoch": 0.555188879721993, + "grad_norm": 2.525862693786621, + "learning_rate": 9.818926956355542e-06, + "loss": 0.1751, + "step": 44414 + }, + { + "epoch": 0.5552138803470087, + "grad_norm": 0.12577614188194275, + "learning_rate": 9.818054411256738e-06, + "loss": 1.0155, + "step": 44416 + }, + { + "epoch": 0.5552388809720243, + "grad_norm": 0.3060789108276367, + "learning_rate": 9.817181867543607e-06, + "loss": 0.6687, + "step": 44418 + }, + { + "epoch": 0.5552638815970399, + "grad_norm": 4.263937950134277, + "learning_rate": 9.816309325222792e-06, + "loss": 0.716, + "step": 44420 + }, + { + "epoch": 0.5552888822220555, + "grad_norm": 10.109881401062012, + "learning_rate": 9.815436784300938e-06, + "loss": 1.7187, + "step": 44422 + }, + { + "epoch": 0.5553138828470712, + "grad_norm": 3.4782674312591553, + "learning_rate": 9.81456424478469e-06, + "loss": 1.2542, + "step": 44424 + }, + { + "epoch": 0.5553388834720868, + "grad_norm": 0.9154694080352783, + "learning_rate": 9.813691706680698e-06, + "loss": 0.7099, + "step": 44426 + }, + { + "epoch": 0.5553638840971025, + "grad_norm": 6.549140453338623, + "learning_rate": 9.812819169995602e-06, + "loss": 1.186, + "step": 44428 + }, + { + "epoch": 0.555388884722118, + "grad_norm": 2.1497623920440674, + "learning_rate": 9.811946634736051e-06, + "loss": 1.4525, + "step": 44430 + }, + { + "epoch": 0.5554138853471337, + "grad_norm": 2.667480707168579, + "learning_rate": 9.811074100908686e-06, + "loss": 0.383, + "step": 44432 + }, + { + "epoch": 0.5554388859721493, + "grad_norm": 3.471925973892212, + "learning_rate": 9.810201568520153e-06, + "loss": 0.9093, + "step": 44434 + }, + { + "epoch": 0.555463886597165, + "grad_norm": 2.7542099952697754, + "learning_rate": 9.809329037577098e-06, + "loss": 1.0968, + "step": 44436 + }, + { + "epoch": 0.5554888872221806, + "grad_norm": 4.226868629455566, + "learning_rate": 9.808456508086167e-06, + "loss": 0.5585, + "step": 44438 + }, + { + "epoch": 0.5555138878471961, + "grad_norm": 1.1929728984832764, + "learning_rate": 9.807583980054002e-06, + "loss": 0.7951, + "step": 44440 + }, + { + "epoch": 0.5555388884722118, + "grad_norm": 12.270530700683594, + "learning_rate": 9.806711453487253e-06, + "loss": 2.0166, + "step": 44442 + }, + { + "epoch": 0.5555638890972274, + "grad_norm": 2.202519655227661, + "learning_rate": 9.805838928392563e-06, + "loss": 1.373, + "step": 44444 + }, + { + "epoch": 0.5555888897222431, + "grad_norm": 5.473848819732666, + "learning_rate": 9.804966404776575e-06, + "loss": 0.2393, + "step": 44446 + }, + { + "epoch": 0.5556138903472587, + "grad_norm": 0.12382671236991882, + "learning_rate": 9.804093882645935e-06, + "loss": 1.1853, + "step": 44448 + }, + { + "epoch": 0.5556388909722743, + "grad_norm": 2.624382495880127, + "learning_rate": 9.803221362007288e-06, + "loss": 1.9806, + "step": 44450 + }, + { + "epoch": 0.5556638915972899, + "grad_norm": 4.363320350646973, + "learning_rate": 9.802348842867279e-06, + "loss": 1.0574, + "step": 44452 + }, + { + "epoch": 0.5556888922223056, + "grad_norm": 3.394558906555176, + "learning_rate": 9.801476325232553e-06, + "loss": 1.6726, + "step": 44454 + }, + { + "epoch": 0.5557138928473212, + "grad_norm": 6.228392124176025, + "learning_rate": 9.800603809109758e-06, + "loss": 1.7785, + "step": 44456 + }, + { + "epoch": 0.5557388934723368, + "grad_norm": 2.3371191024780273, + "learning_rate": 9.799731294505533e-06, + "loss": 0.3742, + "step": 44458 + }, + { + "epoch": 0.5557638940973524, + "grad_norm": 3.532125473022461, + "learning_rate": 9.798858781426527e-06, + "loss": 0.8333, + "step": 44460 + }, + { + "epoch": 0.555788894722368, + "grad_norm": 7.071726322174072, + "learning_rate": 9.797986269879385e-06, + "loss": 1.7475, + "step": 44462 + }, + { + "epoch": 0.5558138953473837, + "grad_norm": 0.21125705540180206, + "learning_rate": 9.79711375987075e-06, + "loss": 0.1711, + "step": 44464 + }, + { + "epoch": 0.5558388959723993, + "grad_norm": 0.16325032711029053, + "learning_rate": 9.796241251407268e-06, + "loss": 0.6937, + "step": 44466 + }, + { + "epoch": 0.555863896597415, + "grad_norm": 6.152847766876221, + "learning_rate": 9.795368744495584e-06, + "loss": 1.6059, + "step": 44468 + }, + { + "epoch": 0.5558888972224305, + "grad_norm": 4.097702980041504, + "learning_rate": 9.794496239142344e-06, + "loss": 0.8689, + "step": 44470 + }, + { + "epoch": 0.5559138978474462, + "grad_norm": 1.2605695724487305, + "learning_rate": 9.793623735354191e-06, + "loss": 0.0643, + "step": 44472 + }, + { + "epoch": 0.5559388984724618, + "grad_norm": 6.985633373260498, + "learning_rate": 9.79275123313777e-06, + "loss": 1.5788, + "step": 44474 + }, + { + "epoch": 0.5559638990974775, + "grad_norm": 0.03278333693742752, + "learning_rate": 9.791878732499725e-06, + "loss": 0.0146, + "step": 44476 + }, + { + "epoch": 0.5559888997224931, + "grad_norm": 3.4165053367614746, + "learning_rate": 9.791006233446703e-06, + "loss": 0.9252, + "step": 44478 + }, + { + "epoch": 0.5560139003475086, + "grad_norm": 5.835819721221924, + "learning_rate": 9.790133735985347e-06, + "loss": 1.5896, + "step": 44480 + }, + { + "epoch": 0.5560389009725243, + "grad_norm": 0.0005301543860696256, + "learning_rate": 9.789261240122307e-06, + "loss": 0.8567, + "step": 44482 + }, + { + "epoch": 0.5560639015975399, + "grad_norm": 1.9657096862792969, + "learning_rate": 9.78838874586422e-06, + "loss": 0.2851, + "step": 44484 + }, + { + "epoch": 0.5560889022225556, + "grad_norm": 3.1144275665283203, + "learning_rate": 9.787516253217734e-06, + "loss": 0.8227, + "step": 44486 + }, + { + "epoch": 0.5561139028475712, + "grad_norm": 4.299482822418213, + "learning_rate": 9.786643762189495e-06, + "loss": 0.8135, + "step": 44488 + }, + { + "epoch": 0.5561389034725868, + "grad_norm": 4.352060317993164, + "learning_rate": 9.785771272786147e-06, + "loss": 1.734, + "step": 44490 + }, + { + "epoch": 0.5561639040976024, + "grad_norm": 2.7522435188293457, + "learning_rate": 9.784898785014336e-06, + "loss": 0.8262, + "step": 44492 + }, + { + "epoch": 0.5561889047226181, + "grad_norm": 4.6093525886535645, + "learning_rate": 9.784026298880703e-06, + "loss": 0.8928, + "step": 44494 + }, + { + "epoch": 0.5562139053476337, + "grad_norm": 3.222665786743164, + "learning_rate": 9.783153814391899e-06, + "loss": 1.3332, + "step": 44496 + }, + { + "epoch": 0.5562389059726494, + "grad_norm": 0.011805473826825619, + "learning_rate": 9.782281331554561e-06, + "loss": 0.3903, + "step": 44498 + }, + { + "epoch": 0.5562639065976649, + "grad_norm": 2.93753981590271, + "learning_rate": 9.78140885037534e-06, + "loss": 0.4936, + "step": 44500 + }, + { + "epoch": 0.5562889072226805, + "grad_norm": 2.980274200439453, + "learning_rate": 9.780536370860876e-06, + "loss": 0.5573, + "step": 44502 + }, + { + "epoch": 0.5563139078476962, + "grad_norm": 0.001995065249502659, + "learning_rate": 9.779663893017818e-06, + "loss": 0.0737, + "step": 44504 + }, + { + "epoch": 0.5563389084727118, + "grad_norm": 3.5863468647003174, + "learning_rate": 9.778791416852807e-06, + "loss": 0.4081, + "step": 44506 + }, + { + "epoch": 0.5563639090977275, + "grad_norm": 2.23661732673645, + "learning_rate": 9.777918942372492e-06, + "loss": 0.496, + "step": 44508 + }, + { + "epoch": 0.556388909722743, + "grad_norm": 0.3056373596191406, + "learning_rate": 9.777046469583513e-06, + "loss": 0.0059, + "step": 44510 + }, + { + "epoch": 0.5564139103477587, + "grad_norm": 2.830491542816162, + "learning_rate": 9.776173998492517e-06, + "loss": 1.5863, + "step": 44512 + }, + { + "epoch": 0.5564389109727743, + "grad_norm": 4.09003210067749, + "learning_rate": 9.775301529106148e-06, + "loss": 1.7344, + "step": 44514 + }, + { + "epoch": 0.55646391159779, + "grad_norm": 0.09836137294769287, + "learning_rate": 9.77442906143105e-06, + "loss": 0.5913, + "step": 44516 + }, + { + "epoch": 0.5564889122228056, + "grad_norm": 2.7161221504211426, + "learning_rate": 9.773556595473868e-06, + "loss": 0.644, + "step": 44518 + }, + { + "epoch": 0.5565139128478211, + "grad_norm": 3.107548952102661, + "learning_rate": 9.772684131241245e-06, + "loss": 1.731, + "step": 44520 + }, + { + "epoch": 0.5565389134728368, + "grad_norm": 2.692122220993042, + "learning_rate": 9.771811668739834e-06, + "loss": 1.4913, + "step": 44522 + }, + { + "epoch": 0.5565639140978524, + "grad_norm": 1.5307248830795288, + "learning_rate": 9.770939207976269e-06, + "loss": 0.0341, + "step": 44524 + }, + { + "epoch": 0.5565889147228681, + "grad_norm": 4.028563976287842, + "learning_rate": 9.770066748957198e-06, + "loss": 2.2604, + "step": 44526 + }, + { + "epoch": 0.5566139153478837, + "grad_norm": 44.85866928100586, + "learning_rate": 9.769194291689268e-06, + "loss": 0.9149, + "step": 44528 + }, + { + "epoch": 0.5566389159728993, + "grad_norm": 0.27415111660957336, + "learning_rate": 9.76832183617912e-06, + "loss": 0.4634, + "step": 44530 + }, + { + "epoch": 0.5566639165979149, + "grad_norm": 4.835346221923828, + "learning_rate": 9.7674493824334e-06, + "loss": 1.1075, + "step": 44532 + }, + { + "epoch": 0.5566889172229306, + "grad_norm": 5.1234331130981445, + "learning_rate": 9.766576930458752e-06, + "loss": 1.1544, + "step": 44534 + }, + { + "epoch": 0.5567139178479462, + "grad_norm": 2.275820732116699, + "learning_rate": 9.765704480261828e-06, + "loss": 1.1882, + "step": 44536 + }, + { + "epoch": 0.5567389184729619, + "grad_norm": 3.6000986099243164, + "learning_rate": 9.764832031849258e-06, + "loss": 1.525, + "step": 44538 + }, + { + "epoch": 0.5567639190979774, + "grad_norm": 5.216695308685303, + "learning_rate": 9.763959585227697e-06, + "loss": 0.8313, + "step": 44540 + }, + { + "epoch": 0.556788919722993, + "grad_norm": 4.187224388122559, + "learning_rate": 9.763087140403786e-06, + "loss": 2.0861, + "step": 44542 + }, + { + "epoch": 0.5568139203480087, + "grad_norm": 3.2200634479522705, + "learning_rate": 9.762214697384166e-06, + "loss": 1.4832, + "step": 44544 + }, + { + "epoch": 0.5568389209730243, + "grad_norm": 3.3081424236297607, + "learning_rate": 9.761342256175488e-06, + "loss": 1.0487, + "step": 44546 + }, + { + "epoch": 0.55686392159804, + "grad_norm": 0.46704232692718506, + "learning_rate": 9.760469816784399e-06, + "loss": 0.4516, + "step": 44548 + }, + { + "epoch": 0.5568889222230555, + "grad_norm": 4.453973293304443, + "learning_rate": 9.759597379217532e-06, + "loss": 1.1941, + "step": 44550 + }, + { + "epoch": 0.5569139228480712, + "grad_norm": 4.253767013549805, + "learning_rate": 9.758724943481539e-06, + "loss": 1.2178, + "step": 44552 + }, + { + "epoch": 0.5569389234730868, + "grad_norm": 0.013508941046893597, + "learning_rate": 9.757852509583062e-06, + "loss": 1.0477, + "step": 44554 + }, + { + "epoch": 0.5569639240981025, + "grad_norm": 4.7290520668029785, + "learning_rate": 9.756980077528746e-06, + "loss": 2.3142, + "step": 44556 + }, + { + "epoch": 0.5569889247231181, + "grad_norm": 7.162873268127441, + "learning_rate": 9.756107647325234e-06, + "loss": 0.9743, + "step": 44558 + }, + { + "epoch": 0.5570139253481337, + "grad_norm": 3.367854356765747, + "learning_rate": 9.755235218979172e-06, + "loss": 0.811, + "step": 44560 + }, + { + "epoch": 0.5570389259731493, + "grad_norm": 2.214130163192749, + "learning_rate": 9.75436279249721e-06, + "loss": 0.2727, + "step": 44562 + }, + { + "epoch": 0.557063926598165, + "grad_norm": 2.9089951515197754, + "learning_rate": 9.753490367885982e-06, + "loss": 1.0309, + "step": 44564 + }, + { + "epoch": 0.5570889272231806, + "grad_norm": 3.2894463539123535, + "learning_rate": 9.752617945152135e-06, + "loss": 1.0417, + "step": 44566 + }, + { + "epoch": 0.5571139278481962, + "grad_norm": 3.4995052814483643, + "learning_rate": 9.751745524302317e-06, + "loss": 0.5832, + "step": 44568 + }, + { + "epoch": 0.5571389284732118, + "grad_norm": 0.055058788508176804, + "learning_rate": 9.750873105343165e-06, + "loss": 0.941, + "step": 44570 + }, + { + "epoch": 0.5571639290982274, + "grad_norm": 0.0015784867573529482, + "learning_rate": 9.750000688281332e-06, + "loss": 0.5468, + "step": 44572 + }, + { + "epoch": 0.5571889297232431, + "grad_norm": 4.745910167694092, + "learning_rate": 9.749128273123463e-06, + "loss": 1.4569, + "step": 44574 + }, + { + "epoch": 0.5572139303482587, + "grad_norm": 0.9145157337188721, + "learning_rate": 9.748255859876192e-06, + "loss": 0.7961, + "step": 44576 + }, + { + "epoch": 0.5572389309732744, + "grad_norm": 3.304927110671997, + "learning_rate": 9.747383448546171e-06, + "loss": 1.3792, + "step": 44578 + }, + { + "epoch": 0.5572639315982899, + "grad_norm": 2.4642133712768555, + "learning_rate": 9.74651103914004e-06, + "loss": 1.2552, + "step": 44580 + }, + { + "epoch": 0.5572889322233056, + "grad_norm": 6.233972549438477, + "learning_rate": 9.745638631664442e-06, + "loss": 1.4944, + "step": 44582 + }, + { + "epoch": 0.5573139328483212, + "grad_norm": 2.7912192344665527, + "learning_rate": 9.744766226126029e-06, + "loss": 0.928, + "step": 44584 + }, + { + "epoch": 0.5573389334733369, + "grad_norm": 3.438354730606079, + "learning_rate": 9.743893822531438e-06, + "loss": 0.6209, + "step": 44586 + }, + { + "epoch": 0.5573639340983525, + "grad_norm": 0.0213128924369812, + "learning_rate": 9.743021420887321e-06, + "loss": 0.0004, + "step": 44588 + }, + { + "epoch": 0.557388934723368, + "grad_norm": 2.865424156188965, + "learning_rate": 9.742149021200312e-06, + "loss": 0.7883, + "step": 44590 + }, + { + "epoch": 0.5574139353483837, + "grad_norm": 2.3828938007354736, + "learning_rate": 9.74127662347706e-06, + "loss": 1.0703, + "step": 44592 + }, + { + "epoch": 0.5574389359733993, + "grad_norm": 2.0326671600341797, + "learning_rate": 9.740404227724207e-06, + "loss": 0.6367, + "step": 44594 + }, + { + "epoch": 0.557463936598415, + "grad_norm": 6.1747307777404785, + "learning_rate": 9.7395318339484e-06, + "loss": 1.2697, + "step": 44596 + }, + { + "epoch": 0.5574889372234306, + "grad_norm": 0.029940331354737282, + "learning_rate": 9.73865944215628e-06, + "loss": 0.7358, + "step": 44598 + }, + { + "epoch": 0.5575139378484462, + "grad_norm": 3.19692325592041, + "learning_rate": 9.7377870523545e-06, + "loss": 0.8568, + "step": 44600 + }, + { + "epoch": 0.5575389384734618, + "grad_norm": 0.0010106887202709913, + "learning_rate": 9.736914664549692e-06, + "loss": 0.0, + "step": 44602 + }, + { + "epoch": 0.5575639390984775, + "grad_norm": 2.6739134788513184, + "learning_rate": 9.736042278748503e-06, + "loss": 1.5579, + "step": 44604 + }, + { + "epoch": 0.5575889397234931, + "grad_norm": 1.8799049854278564, + "learning_rate": 9.735169894957577e-06, + "loss": 0.5448, + "step": 44606 + }, + { + "epoch": 0.5576139403485088, + "grad_norm": 3.030649423599243, + "learning_rate": 9.734297513183562e-06, + "loss": 0.8321, + "step": 44608 + }, + { + "epoch": 0.5576389409735243, + "grad_norm": 0.0028099410701543093, + "learning_rate": 9.7334251334331e-06, + "loss": 0.7818, + "step": 44610 + }, + { + "epoch": 0.5576639415985399, + "grad_norm": 5.3839006423950195, + "learning_rate": 9.732552755712834e-06, + "loss": 1.3122, + "step": 44612 + }, + { + "epoch": 0.5576889422235556, + "grad_norm": 3.7332141399383545, + "learning_rate": 9.731680380029413e-06, + "loss": 2.0214, + "step": 44614 + }, + { + "epoch": 0.5577139428485712, + "grad_norm": 3.393232583999634, + "learning_rate": 9.730808006389473e-06, + "loss": 0.6883, + "step": 44616 + }, + { + "epoch": 0.5577389434735869, + "grad_norm": 4.951847076416016, + "learning_rate": 9.729935634799656e-06, + "loss": 1.757, + "step": 44618 + }, + { + "epoch": 0.5577639440986024, + "grad_norm": 3.7737255096435547, + "learning_rate": 9.729063265266616e-06, + "loss": 1.2954, + "step": 44620 + }, + { + "epoch": 0.5577889447236181, + "grad_norm": 4.48084831237793, + "learning_rate": 9.728190897796991e-06, + "loss": 1.2675, + "step": 44622 + }, + { + "epoch": 0.5578139453486337, + "grad_norm": 0.008290509693324566, + "learning_rate": 9.727318532397426e-06, + "loss": 0.3513, + "step": 44624 + }, + { + "epoch": 0.5578389459736494, + "grad_norm": 0.09951046854257584, + "learning_rate": 9.726446169074568e-06, + "loss": 0.0137, + "step": 44626 + }, + { + "epoch": 0.557863946598665, + "grad_norm": 2.4108121395111084, + "learning_rate": 9.725573807835054e-06, + "loss": 0.2564, + "step": 44628 + }, + { + "epoch": 0.5578889472236805, + "grad_norm": 0.002073742914944887, + "learning_rate": 9.724701448685529e-06, + "loss": 0.0936, + "step": 44630 + }, + { + "epoch": 0.5579139478486962, + "grad_norm": 0.6540606021881104, + "learning_rate": 9.72382909163264e-06, + "loss": 0.0981, + "step": 44632 + }, + { + "epoch": 0.5579389484737118, + "grad_norm": 4.019223690032959, + "learning_rate": 9.72295673668303e-06, + "loss": 0.6903, + "step": 44634 + }, + { + "epoch": 0.5579639490987275, + "grad_norm": 5.9640069007873535, + "learning_rate": 9.72208438384334e-06, + "loss": 0.9975, + "step": 44636 + }, + { + "epoch": 0.5579889497237431, + "grad_norm": 3.20223069190979, + "learning_rate": 9.72121203312022e-06, + "loss": 0.7964, + "step": 44638 + }, + { + "epoch": 0.5580139503487587, + "grad_norm": 4.931706428527832, + "learning_rate": 9.720339684520312e-06, + "loss": 0.8421, + "step": 44640 + }, + { + "epoch": 0.5580389509737743, + "grad_norm": 2.822995662689209, + "learning_rate": 9.719467338050253e-06, + "loss": 0.4926, + "step": 44642 + }, + { + "epoch": 0.55806395159879, + "grad_norm": 3.6292080879211426, + "learning_rate": 9.71859499371669e-06, + "loss": 0.6762, + "step": 44644 + }, + { + "epoch": 0.5580889522238056, + "grad_norm": 0.958646297454834, + "learning_rate": 9.71772265152627e-06, + "loss": 0.7662, + "step": 44646 + }, + { + "epoch": 0.5581139528488213, + "grad_norm": 0.008296270854771137, + "learning_rate": 9.716850311485633e-06, + "loss": 0.421, + "step": 44648 + }, + { + "epoch": 0.5581389534738368, + "grad_norm": 0.0012157459277659655, + "learning_rate": 9.715977973601423e-06, + "loss": 0.8829, + "step": 44650 + }, + { + "epoch": 0.5581639540988524, + "grad_norm": 2.4888815879821777, + "learning_rate": 9.71510563788029e-06, + "loss": 1.5654, + "step": 44652 + }, + { + "epoch": 0.5581889547238681, + "grad_norm": 8.241665840148926, + "learning_rate": 9.71423330432887e-06, + "loss": 1.8675, + "step": 44654 + }, + { + "epoch": 0.5582139553488837, + "grad_norm": 0.869914174079895, + "learning_rate": 9.713360972953805e-06, + "loss": 0.5789, + "step": 44656 + }, + { + "epoch": 0.5582389559738994, + "grad_norm": 1.9799692630767822, + "learning_rate": 9.712488643761745e-06, + "loss": 0.1726, + "step": 44658 + }, + { + "epoch": 0.5582639565989149, + "grad_norm": 3.4148895740509033, + "learning_rate": 9.711616316759328e-06, + "loss": 1.8416, + "step": 44660 + }, + { + "epoch": 0.5582889572239306, + "grad_norm": 4.232595920562744, + "learning_rate": 9.710743991953204e-06, + "loss": 0.2056, + "step": 44662 + }, + { + "epoch": 0.5583139578489462, + "grad_norm": 2.738293409347534, + "learning_rate": 9.709871669350011e-06, + "loss": 0.6614, + "step": 44664 + }, + { + "epoch": 0.5583389584739619, + "grad_norm": 1.8530910015106201, + "learning_rate": 9.708999348956399e-06, + "loss": 0.3275, + "step": 44666 + }, + { + "epoch": 0.5583639590989775, + "grad_norm": 4.068665981292725, + "learning_rate": 9.708127030779e-06, + "loss": 1.2059, + "step": 44668 + }, + { + "epoch": 0.558388959723993, + "grad_norm": 3.5107953548431396, + "learning_rate": 9.707254714824467e-06, + "loss": 1.4178, + "step": 44670 + }, + { + "epoch": 0.5584139603490087, + "grad_norm": 0.361972838640213, + "learning_rate": 9.706382401099442e-06, + "loss": 0.0074, + "step": 44672 + }, + { + "epoch": 0.5584389609740243, + "grad_norm": 5.192419052124023, + "learning_rate": 9.705510089610564e-06, + "loss": 0.3785, + "step": 44674 + }, + { + "epoch": 0.55846396159904, + "grad_norm": 0.0015686068218201399, + "learning_rate": 9.704637780364481e-06, + "loss": 0.7032, + "step": 44676 + }, + { + "epoch": 0.5584889622240556, + "grad_norm": 0.0007612999179400504, + "learning_rate": 9.70376547336784e-06, + "loss": 0.017, + "step": 44678 + }, + { + "epoch": 0.5585139628490712, + "grad_norm": 3.8197081089019775, + "learning_rate": 9.702893168627272e-06, + "loss": 1.0769, + "step": 44680 + }, + { + "epoch": 0.5585389634740868, + "grad_norm": 1.513904333114624, + "learning_rate": 9.70202086614943e-06, + "loss": 0.188, + "step": 44682 + }, + { + "epoch": 0.5585639640991025, + "grad_norm": 2.971842050552368, + "learning_rate": 9.701148565940955e-06, + "loss": 0.7752, + "step": 44684 + }, + { + "epoch": 0.5585889647241181, + "grad_norm": 7.311211585998535, + "learning_rate": 9.70027626800849e-06, + "loss": 0.9703, + "step": 44686 + }, + { + "epoch": 0.5586139653491338, + "grad_norm": 3.603609085083008, + "learning_rate": 9.699403972358679e-06, + "loss": 0.5924, + "step": 44688 + }, + { + "epoch": 0.5586389659741493, + "grad_norm": 3.0353825092315674, + "learning_rate": 9.698531678998163e-06, + "loss": 1.7012, + "step": 44690 + }, + { + "epoch": 0.558663966599165, + "grad_norm": 0.7116823792457581, + "learning_rate": 9.697659387933592e-06, + "loss": 1.6728, + "step": 44692 + }, + { + "epoch": 0.5586889672241806, + "grad_norm": 0.0012716423952952027, + "learning_rate": 9.696787099171601e-06, + "loss": 0.0, + "step": 44694 + }, + { + "epoch": 0.5587139678491962, + "grad_norm": 2.672905445098877, + "learning_rate": 9.695914812718836e-06, + "loss": 1.1171, + "step": 44696 + }, + { + "epoch": 0.5587389684742119, + "grad_norm": 4.1010637283325195, + "learning_rate": 9.695042528581942e-06, + "loss": 0.8143, + "step": 44698 + }, + { + "epoch": 0.5587639690992274, + "grad_norm": 0.4173366129398346, + "learning_rate": 9.694170246767561e-06, + "loss": 0.2703, + "step": 44700 + }, + { + "epoch": 0.5587889697242431, + "grad_norm": 8.77225112915039, + "learning_rate": 9.693297967282335e-06, + "loss": 0.8911, + "step": 44702 + }, + { + "epoch": 0.5588139703492587, + "grad_norm": 2.3092968463897705, + "learning_rate": 9.692425690132913e-06, + "loss": 0.8197, + "step": 44704 + }, + { + "epoch": 0.5588389709742744, + "grad_norm": 3.4728024005889893, + "learning_rate": 9.691553415325926e-06, + "loss": 1.172, + "step": 44706 + }, + { + "epoch": 0.55886397159929, + "grad_norm": 3.575584888458252, + "learning_rate": 9.690681142868027e-06, + "loss": 1.4625, + "step": 44708 + }, + { + "epoch": 0.5588889722243056, + "grad_norm": 0.0016186012653633952, + "learning_rate": 9.68980887276586e-06, + "loss": 0.4653, + "step": 44710 + }, + { + "epoch": 0.5589139728493212, + "grad_norm": 4.344623565673828, + "learning_rate": 9.688936605026061e-06, + "loss": 1.5397, + "step": 44712 + }, + { + "epoch": 0.5589389734743369, + "grad_norm": 4.807344436645508, + "learning_rate": 9.688064339655278e-06, + "loss": 1.3959, + "step": 44714 + }, + { + "epoch": 0.5589639740993525, + "grad_norm": 1.166979432106018, + "learning_rate": 9.687192076660152e-06, + "loss": 0.1056, + "step": 44716 + }, + { + "epoch": 0.5589889747243681, + "grad_norm": 3.1891086101531982, + "learning_rate": 9.686319816047331e-06, + "loss": 1.7517, + "step": 44718 + }, + { + "epoch": 0.5590139753493837, + "grad_norm": 0.157780721783638, + "learning_rate": 9.685447557823452e-06, + "loss": 0.1721, + "step": 44720 + }, + { + "epoch": 0.5590389759743993, + "grad_norm": 0.6690744161605835, + "learning_rate": 9.684575301995157e-06, + "loss": 1.6243, + "step": 44722 + }, + { + "epoch": 0.559063976599415, + "grad_norm": 4.176554203033447, + "learning_rate": 9.683703048569095e-06, + "loss": 0.4169, + "step": 44724 + }, + { + "epoch": 0.5590889772244306, + "grad_norm": 3.3318216800689697, + "learning_rate": 9.682830797551903e-06, + "loss": 1.6109, + "step": 44726 + }, + { + "epoch": 0.5591139778494463, + "grad_norm": 1.9839177131652832, + "learning_rate": 9.681958548950228e-06, + "loss": 0.2623, + "step": 44728 + }, + { + "epoch": 0.5591389784744618, + "grad_norm": 1.8904153108596802, + "learning_rate": 9.681086302770715e-06, + "loss": 0.8194, + "step": 44730 + }, + { + "epoch": 0.5591639790994775, + "grad_norm": 2.676673650741577, + "learning_rate": 9.68021405902e-06, + "loss": 1.0322, + "step": 44732 + }, + { + "epoch": 0.5591889797244931, + "grad_norm": 2.8943023681640625, + "learning_rate": 9.67934181770473e-06, + "loss": 0.5498, + "step": 44734 + }, + { + "epoch": 0.5592139803495088, + "grad_norm": 0.8795281052589417, + "learning_rate": 9.678469578831548e-06, + "loss": 0.6468, + "step": 44736 + }, + { + "epoch": 0.5592389809745244, + "grad_norm": 3.794614791870117, + "learning_rate": 9.677597342407094e-06, + "loss": 1.313, + "step": 44738 + }, + { + "epoch": 0.5592639815995399, + "grad_norm": 7.239361763000488, + "learning_rate": 9.676725108438015e-06, + "loss": 2.1571, + "step": 44740 + }, + { + "epoch": 0.5592889822245556, + "grad_norm": 3.8155548572540283, + "learning_rate": 9.675852876930952e-06, + "loss": 1.7347, + "step": 44742 + }, + { + "epoch": 0.5593139828495712, + "grad_norm": 0.0049482546746730804, + "learning_rate": 9.67498064789255e-06, + "loss": 0.8227, + "step": 44744 + }, + { + "epoch": 0.5593389834745869, + "grad_norm": 4.561078071594238, + "learning_rate": 9.674108421329446e-06, + "loss": 0.9654, + "step": 44746 + }, + { + "epoch": 0.5593639840996025, + "grad_norm": 4.9671311378479, + "learning_rate": 9.673236197248287e-06, + "loss": 2.5034, + "step": 44748 + }, + { + "epoch": 0.5593889847246181, + "grad_norm": 5.035983085632324, + "learning_rate": 9.672363975655715e-06, + "loss": 1.305, + "step": 44750 + }, + { + "epoch": 0.5594139853496337, + "grad_norm": 2.6167197227478027, + "learning_rate": 9.671491756558372e-06, + "loss": 0.2765, + "step": 44752 + }, + { + "epoch": 0.5594389859746494, + "grad_norm": 0.9423018097877502, + "learning_rate": 9.670619539962903e-06, + "loss": 0.456, + "step": 44754 + }, + { + "epoch": 0.559463986599665, + "grad_norm": 0.0010384765919297934, + "learning_rate": 9.669747325875949e-06, + "loss": 0.0709, + "step": 44756 + }, + { + "epoch": 0.5594889872246807, + "grad_norm": 5.542527198791504, + "learning_rate": 9.668875114304152e-06, + "loss": 2.1172, + "step": 44758 + }, + { + "epoch": 0.5595139878496962, + "grad_norm": 0.001877744565717876, + "learning_rate": 9.668002905254153e-06, + "loss": 0.5557, + "step": 44760 + }, + { + "epoch": 0.5595389884747118, + "grad_norm": 0.002335162367671728, + "learning_rate": 9.667130698732599e-06, + "loss": 0.6339, + "step": 44762 + }, + { + "epoch": 0.5595639890997275, + "grad_norm": 0.462179571390152, + "learning_rate": 9.66625849474613e-06, + "loss": 0.0201, + "step": 44764 + }, + { + "epoch": 0.5595889897247431, + "grad_norm": 2.318500280380249, + "learning_rate": 9.66538629330139e-06, + "loss": 0.4554, + "step": 44766 + }, + { + "epoch": 0.5596139903497588, + "grad_norm": 3.15743088722229, + "learning_rate": 9.664514094405019e-06, + "loss": 0.9706, + "step": 44768 + }, + { + "epoch": 0.5596389909747743, + "grad_norm": 0.002929891925305128, + "learning_rate": 9.663641898063664e-06, + "loss": 0.5982, + "step": 44770 + }, + { + "epoch": 0.55966399159979, + "grad_norm": 0.5676836967468262, + "learning_rate": 9.662769704283964e-06, + "loss": 0.0218, + "step": 44772 + }, + { + "epoch": 0.5596889922248056, + "grad_norm": 1.0193065404891968, + "learning_rate": 9.66189751307256e-06, + "loss": 0.9276, + "step": 44774 + }, + { + "epoch": 0.5597139928498213, + "grad_norm": 4.010354995727539, + "learning_rate": 9.661025324436099e-06, + "loss": 1.4775, + "step": 44776 + }, + { + "epoch": 0.5597389934748369, + "grad_norm": 3.4853110313415527, + "learning_rate": 9.660153138381219e-06, + "loss": 0.1547, + "step": 44778 + }, + { + "epoch": 0.5597639940998524, + "grad_norm": 3.068758964538574, + "learning_rate": 9.659280954914566e-06, + "loss": 0.903, + "step": 44780 + }, + { + "epoch": 0.5597889947248681, + "grad_norm": 3.013075351715088, + "learning_rate": 9.65840877404278e-06, + "loss": 1.12, + "step": 44782 + }, + { + "epoch": 0.5598139953498837, + "grad_norm": 2.914856433868408, + "learning_rate": 9.657536595772508e-06, + "loss": 1.2111, + "step": 44784 + }, + { + "epoch": 0.5598389959748994, + "grad_norm": 2.547194004058838, + "learning_rate": 9.656664420110385e-06, + "loss": 0.9624, + "step": 44786 + }, + { + "epoch": 0.559863996599915, + "grad_norm": 4.22373104095459, + "learning_rate": 9.65579224706306e-06, + "loss": 0.9994, + "step": 44788 + }, + { + "epoch": 0.5598889972249306, + "grad_norm": 2.8098981380462646, + "learning_rate": 9.65492007663717e-06, + "loss": 0.8037, + "step": 44790 + }, + { + "epoch": 0.5599139978499462, + "grad_norm": 7.754700660705566, + "learning_rate": 9.65404790883936e-06, + "loss": 1.0677, + "step": 44792 + }, + { + "epoch": 0.5599389984749619, + "grad_norm": 3.8803648948669434, + "learning_rate": 9.653175743676274e-06, + "loss": 1.4685, + "step": 44794 + }, + { + "epoch": 0.5599639990999775, + "grad_norm": 3.624373435974121, + "learning_rate": 9.652303581154552e-06, + "loss": 1.5537, + "step": 44796 + }, + { + "epoch": 0.5599889997249932, + "grad_norm": 3.4974558353424072, + "learning_rate": 9.651431421280837e-06, + "loss": 1.2331, + "step": 44798 + }, + { + "epoch": 0.5600140003500087, + "grad_norm": 4.796306133270264, + "learning_rate": 9.65055926406177e-06, + "loss": 0.993, + "step": 44800 + }, + { + "epoch": 0.5600390009750243, + "grad_norm": 4.59212064743042, + "learning_rate": 9.649687109503994e-06, + "loss": 1.0988, + "step": 44802 + }, + { + "epoch": 0.56006400160004, + "grad_norm": 0.0015764225972816348, + "learning_rate": 9.648814957614152e-06, + "loss": 0.5731, + "step": 44804 + }, + { + "epoch": 0.5600890022250556, + "grad_norm": 6.749928951263428, + "learning_rate": 9.647942808398886e-06, + "loss": 0.8327, + "step": 44806 + }, + { + "epoch": 0.5601140028500713, + "grad_norm": 0.002612257609143853, + "learning_rate": 9.647070661864837e-06, + "loss": 1.315, + "step": 44808 + }, + { + "epoch": 0.5601390034750868, + "grad_norm": 2.6330885887145996, + "learning_rate": 9.64619851801865e-06, + "loss": 1.3576, + "step": 44810 + }, + { + "epoch": 0.5601640041001025, + "grad_norm": 4.109436511993408, + "learning_rate": 9.645326376866964e-06, + "loss": 1.228, + "step": 44812 + }, + { + "epoch": 0.5601890047251181, + "grad_norm": 1.2663863897323608, + "learning_rate": 9.64445423841642e-06, + "loss": 0.8552, + "step": 44814 + }, + { + "epoch": 0.5602140053501338, + "grad_norm": 4.838630676269531, + "learning_rate": 9.643582102673666e-06, + "loss": 0.7305, + "step": 44816 + }, + { + "epoch": 0.5602390059751494, + "grad_norm": 3.191835403442383, + "learning_rate": 9.642709969645337e-06, + "loss": 1.3265, + "step": 44818 + }, + { + "epoch": 0.560264006600165, + "grad_norm": 7.957157135009766, + "learning_rate": 9.641837839338079e-06, + "loss": 1.7864, + "step": 44820 + }, + { + "epoch": 0.5602890072251806, + "grad_norm": 4.164422035217285, + "learning_rate": 9.640965711758537e-06, + "loss": 0.4942, + "step": 44822 + }, + { + "epoch": 0.5603140078501962, + "grad_norm": 0.001226893742568791, + "learning_rate": 9.640093586913344e-06, + "loss": 0.6168, + "step": 44824 + }, + { + "epoch": 0.5603390084752119, + "grad_norm": 2.824403762817383, + "learning_rate": 9.63922146480915e-06, + "loss": 1.9741, + "step": 44826 + }, + { + "epoch": 0.5603640091002275, + "grad_norm": 4.286923408508301, + "learning_rate": 9.638349345452594e-06, + "loss": 0.399, + "step": 44828 + }, + { + "epoch": 0.5603890097252431, + "grad_norm": 2.5812578201293945, + "learning_rate": 9.637477228850319e-06, + "loss": 0.4095, + "step": 44830 + }, + { + "epoch": 0.5604140103502587, + "grad_norm": 0.0008695365395396948, + "learning_rate": 9.636605115008964e-06, + "loss": 0.4476, + "step": 44832 + }, + { + "epoch": 0.5604390109752744, + "grad_norm": 2.626147985458374, + "learning_rate": 9.635733003935173e-06, + "loss": 1.205, + "step": 44834 + }, + { + "epoch": 0.56046401160029, + "grad_norm": 5.202793121337891, + "learning_rate": 9.634860895635593e-06, + "loss": 0.9716, + "step": 44836 + }, + { + "epoch": 0.5604890122253057, + "grad_norm": 0.18046215176582336, + "learning_rate": 9.633988790116856e-06, + "loss": 0.0534, + "step": 44838 + }, + { + "epoch": 0.5605140128503212, + "grad_norm": 3.4774560928344727, + "learning_rate": 9.63311668738561e-06, + "loss": 0.3955, + "step": 44840 + }, + { + "epoch": 0.5605390134753369, + "grad_norm": 0.6721336841583252, + "learning_rate": 9.632244587448495e-06, + "loss": 0.099, + "step": 44842 + }, + { + "epoch": 0.5605640141003525, + "grad_norm": 2.4482367038726807, + "learning_rate": 9.631372490312153e-06, + "loss": 0.6442, + "step": 44844 + }, + { + "epoch": 0.5605890147253682, + "grad_norm": 3.4115238189697266, + "learning_rate": 9.630500395983226e-06, + "loss": 1.4133, + "step": 44846 + }, + { + "epoch": 0.5606140153503838, + "grad_norm": 0.7484058737754822, + "learning_rate": 9.629628304468358e-06, + "loss": 0.1004, + "step": 44848 + }, + { + "epoch": 0.5606390159753993, + "grad_norm": 0.001440912950783968, + "learning_rate": 9.628756215774186e-06, + "loss": 0.036, + "step": 44850 + }, + { + "epoch": 0.560664016600415, + "grad_norm": 2.075540542602539, + "learning_rate": 9.627884129907354e-06, + "loss": 0.3562, + "step": 44852 + }, + { + "epoch": 0.5606890172254306, + "grad_norm": 4.725403785705566, + "learning_rate": 9.627012046874504e-06, + "loss": 0.4725, + "step": 44854 + }, + { + "epoch": 0.5607140178504463, + "grad_norm": 0.0013116402551531792, + "learning_rate": 9.626139966682278e-06, + "loss": 1.1632, + "step": 44856 + }, + { + "epoch": 0.5607390184754619, + "grad_norm": 4.414370059967041, + "learning_rate": 9.625267889337317e-06, + "loss": 1.264, + "step": 44858 + }, + { + "epoch": 0.5607640191004775, + "grad_norm": 3.202167510986328, + "learning_rate": 9.624395814846263e-06, + "loss": 1.436, + "step": 44860 + }, + { + "epoch": 0.5607890197254931, + "grad_norm": 4.883016586303711, + "learning_rate": 9.62352374321576e-06, + "loss": 1.9223, + "step": 44862 + }, + { + "epoch": 0.5608140203505088, + "grad_norm": 4.919074535369873, + "learning_rate": 9.622651674452444e-06, + "loss": 0.8982, + "step": 44864 + }, + { + "epoch": 0.5608390209755244, + "grad_norm": 2.762437582015991, + "learning_rate": 9.621779608562958e-06, + "loss": 0.2313, + "step": 44866 + }, + { + "epoch": 0.56086402160054, + "grad_norm": 4.685489177703857, + "learning_rate": 9.620907545553947e-06, + "loss": 1.1789, + "step": 44868 + }, + { + "epoch": 0.5608890222255556, + "grad_norm": 1.2318437099456787, + "learning_rate": 9.62003548543205e-06, + "loss": 1.0519, + "step": 44870 + }, + { + "epoch": 0.5609140228505712, + "grad_norm": 4.284984588623047, + "learning_rate": 9.61916342820391e-06, + "loss": 0.8361, + "step": 44872 + }, + { + "epoch": 0.5609390234755869, + "grad_norm": 2.200598955154419, + "learning_rate": 9.618291373876168e-06, + "loss": 0.8035, + "step": 44874 + }, + { + "epoch": 0.5609640241006025, + "grad_norm": 3.3569111824035645, + "learning_rate": 9.617419322455464e-06, + "loss": 1.7461, + "step": 44876 + }, + { + "epoch": 0.5609890247256182, + "grad_norm": 6.834997653961182, + "learning_rate": 9.616547273948438e-06, + "loss": 0.5913, + "step": 44878 + }, + { + "epoch": 0.5610140253506337, + "grad_norm": 2.9506123065948486, + "learning_rate": 9.615675228361736e-06, + "loss": 0.5686, + "step": 44880 + }, + { + "epoch": 0.5610390259756494, + "grad_norm": 8.348294258117676, + "learning_rate": 9.614803185701997e-06, + "loss": 1.7026, + "step": 44882 + }, + { + "epoch": 0.561064026600665, + "grad_norm": 0.0017652068054303527, + "learning_rate": 9.613931145975862e-06, + "loss": 1.0945, + "step": 44884 + }, + { + "epoch": 0.5610890272256807, + "grad_norm": 10.694575309753418, + "learning_rate": 9.613059109189973e-06, + "loss": 1.1789, + "step": 44886 + }, + { + "epoch": 0.5611140278506963, + "grad_norm": 2.824155569076538, + "learning_rate": 9.612187075350972e-06, + "loss": 0.3886, + "step": 44888 + }, + { + "epoch": 0.5611390284757118, + "grad_norm": 1.8459564447402954, + "learning_rate": 9.6113150444655e-06, + "loss": 1.2161, + "step": 44890 + }, + { + "epoch": 0.5611640291007275, + "grad_norm": 2.037367105484009, + "learning_rate": 9.610443016540195e-06, + "loss": 0.3931, + "step": 44892 + }, + { + "epoch": 0.5611890297257431, + "grad_norm": 1.3274433612823486, + "learning_rate": 9.609570991581703e-06, + "loss": 0.7821, + "step": 44894 + }, + { + "epoch": 0.5612140303507588, + "grad_norm": 5.692092418670654, + "learning_rate": 9.608698969596661e-06, + "loss": 1.0933, + "step": 44896 + }, + { + "epoch": 0.5612390309757744, + "grad_norm": 1.748899221420288, + "learning_rate": 9.607826950591713e-06, + "loss": 0.7493, + "step": 44898 + }, + { + "epoch": 0.56126403160079, + "grad_norm": 5.026348114013672, + "learning_rate": 9.606954934573503e-06, + "loss": 1.0282, + "step": 44900 + }, + { + "epoch": 0.5612890322258056, + "grad_norm": 3.060429334640503, + "learning_rate": 9.606082921548665e-06, + "loss": 1.1013, + "step": 44902 + }, + { + "epoch": 0.5613140328508213, + "grad_norm": 0.9645171165466309, + "learning_rate": 9.605210911523845e-06, + "loss": 0.8526, + "step": 44904 + }, + { + "epoch": 0.5613390334758369, + "grad_norm": 5.069449424743652, + "learning_rate": 9.60433890450568e-06, + "loss": 1.0661, + "step": 44906 + }, + { + "epoch": 0.5613640341008526, + "grad_norm": 0.921440601348877, + "learning_rate": 9.603466900500817e-06, + "loss": 0.0436, + "step": 44908 + }, + { + "epoch": 0.5613890347258681, + "grad_norm": 5.293368339538574, + "learning_rate": 9.602594899515893e-06, + "loss": 0.6255, + "step": 44910 + }, + { + "epoch": 0.5614140353508837, + "grad_norm": 2.437150239944458, + "learning_rate": 9.601722901557549e-06, + "loss": 1.2314, + "step": 44912 + }, + { + "epoch": 0.5614390359758994, + "grad_norm": 3.6956982612609863, + "learning_rate": 9.600850906632431e-06, + "loss": 0.4919, + "step": 44914 + }, + { + "epoch": 0.561464036600915, + "grad_norm": 2.6787002086639404, + "learning_rate": 9.599978914747174e-06, + "loss": 1.1591, + "step": 44916 + }, + { + "epoch": 0.5614890372259307, + "grad_norm": 3.936316967010498, + "learning_rate": 9.59910692590842e-06, + "loss": 1.394, + "step": 44918 + }, + { + "epoch": 0.5615140378509462, + "grad_norm": 4.811101913452148, + "learning_rate": 9.598234940122812e-06, + "loss": 0.6436, + "step": 44920 + }, + { + "epoch": 0.5615390384759619, + "grad_norm": 5.882452964782715, + "learning_rate": 9.597362957396988e-06, + "loss": 1.5057, + "step": 44922 + }, + { + "epoch": 0.5615640391009775, + "grad_norm": 3.3165385723114014, + "learning_rate": 9.59649097773759e-06, + "loss": 1.1872, + "step": 44924 + }, + { + "epoch": 0.5615890397259932, + "grad_norm": 3.7738351821899414, + "learning_rate": 9.595619001151266e-06, + "loss": 0.8075, + "step": 44926 + }, + { + "epoch": 0.5616140403510088, + "grad_norm": 4.574287414550781, + "learning_rate": 9.594747027644647e-06, + "loss": 0.899, + "step": 44928 + }, + { + "epoch": 0.5616390409760244, + "grad_norm": 0.809904932975769, + "learning_rate": 9.593875057224375e-06, + "loss": 0.5784, + "step": 44930 + }, + { + "epoch": 0.56166404160104, + "grad_norm": 0.03078385442495346, + "learning_rate": 9.593003089897095e-06, + "loss": 0.9411, + "step": 44932 + }, + { + "epoch": 0.5616890422260556, + "grad_norm": 0.033046528697013855, + "learning_rate": 9.592131125669444e-06, + "loss": 0.0382, + "step": 44934 + }, + { + "epoch": 0.5617140428510713, + "grad_norm": 0.582435667514801, + "learning_rate": 9.591259164548066e-06, + "loss": 0.0874, + "step": 44936 + }, + { + "epoch": 0.5617390434760869, + "grad_norm": 4.536318778991699, + "learning_rate": 9.590387206539599e-06, + "loss": 0.9298, + "step": 44938 + }, + { + "epoch": 0.5617640441011025, + "grad_norm": 2.432835340499878, + "learning_rate": 9.589515251650691e-06, + "loss": 0.6561, + "step": 44940 + }, + { + "epoch": 0.5617890447261181, + "grad_norm": 2.5397236347198486, + "learning_rate": 9.588643299887974e-06, + "loss": 1.1584, + "step": 44942 + }, + { + "epoch": 0.5618140453511338, + "grad_norm": 0.0038612186908721924, + "learning_rate": 9.587771351258089e-06, + "loss": 0.0001, + "step": 44944 + }, + { + "epoch": 0.5618390459761494, + "grad_norm": 7.085373878479004, + "learning_rate": 9.58689940576768e-06, + "loss": 0.9431, + "step": 44946 + }, + { + "epoch": 0.5618640466011651, + "grad_norm": 2.906416177749634, + "learning_rate": 9.586027463423386e-06, + "loss": 0.569, + "step": 44948 + }, + { + "epoch": 0.5618890472261806, + "grad_norm": 3.4136452674865723, + "learning_rate": 9.585155524231847e-06, + "loss": 0.8106, + "step": 44950 + }, + { + "epoch": 0.5619140478511963, + "grad_norm": 2.1742758750915527, + "learning_rate": 9.584283588199712e-06, + "loss": 0.1252, + "step": 44952 + }, + { + "epoch": 0.5619390484762119, + "grad_norm": 0.002841123379766941, + "learning_rate": 9.583411655333608e-06, + "loss": 0.958, + "step": 44954 + }, + { + "epoch": 0.5619640491012275, + "grad_norm": 3.511059045791626, + "learning_rate": 9.582539725640183e-06, + "loss": 1.3196, + "step": 44956 + }, + { + "epoch": 0.5619890497262432, + "grad_norm": 3.1971042156219482, + "learning_rate": 9.581667799126078e-06, + "loss": 0.7816, + "step": 44958 + }, + { + "epoch": 0.5620140503512587, + "grad_norm": 3.096526861190796, + "learning_rate": 9.58079587579793e-06, + "loss": 1.9106, + "step": 44960 + }, + { + "epoch": 0.5620390509762744, + "grad_norm": 3.249276876449585, + "learning_rate": 9.579923955662379e-06, + "loss": 1.3002, + "step": 44962 + }, + { + "epoch": 0.56206405160129, + "grad_norm": 4.272286891937256, + "learning_rate": 9.579052038726072e-06, + "loss": 1.6269, + "step": 44964 + }, + { + "epoch": 0.5620890522263057, + "grad_norm": 2.458435297012329, + "learning_rate": 9.578180124995647e-06, + "loss": 0.9152, + "step": 44966 + }, + { + "epoch": 0.5621140528513213, + "grad_norm": 2.504484176635742, + "learning_rate": 9.577308214477739e-06, + "loss": 1.3716, + "step": 44968 + }, + { + "epoch": 0.5621390534763369, + "grad_norm": 0.544343888759613, + "learning_rate": 9.57643630717899e-06, + "loss": 0.882, + "step": 44970 + }, + { + "epoch": 0.5621640541013525, + "grad_norm": 0.1845995932817459, + "learning_rate": 9.575564403106046e-06, + "loss": 0.3884, + "step": 44972 + }, + { + "epoch": 0.5621890547263682, + "grad_norm": 0.0012960792519152164, + "learning_rate": 9.57469250226554e-06, + "loss": 1.5354, + "step": 44974 + }, + { + "epoch": 0.5622140553513838, + "grad_norm": 5.6837992668151855, + "learning_rate": 9.573820604664117e-06, + "loss": 0.7978, + "step": 44976 + }, + { + "epoch": 0.5622390559763994, + "grad_norm": 4.347768783569336, + "learning_rate": 9.57294871030842e-06, + "loss": 1.3085, + "step": 44978 + }, + { + "epoch": 0.562264056601415, + "grad_norm": 0.0009072839166037738, + "learning_rate": 9.572076819205083e-06, + "loss": 0.2546, + "step": 44980 + }, + { + "epoch": 0.5622890572264306, + "grad_norm": 5.565946102142334, + "learning_rate": 9.571204931360747e-06, + "loss": 1.8912, + "step": 44982 + }, + { + "epoch": 0.5623140578514463, + "grad_norm": 4.234257698059082, + "learning_rate": 9.570333046782054e-06, + "loss": 0.9264, + "step": 44984 + }, + { + "epoch": 0.5623390584764619, + "grad_norm": 3.3584320545196533, + "learning_rate": 9.56946116547564e-06, + "loss": 0.8395, + "step": 44986 + }, + { + "epoch": 0.5623640591014776, + "grad_norm": 0.001357397180981934, + "learning_rate": 9.568589287448152e-06, + "loss": 0.7299, + "step": 44988 + }, + { + "epoch": 0.5623890597264931, + "grad_norm": 1.0375139713287354, + "learning_rate": 9.567717412706227e-06, + "loss": 0.6469, + "step": 44990 + }, + { + "epoch": 0.5624140603515088, + "grad_norm": 3.698796510696411, + "learning_rate": 9.56684554125651e-06, + "loss": 0.8273, + "step": 44992 + }, + { + "epoch": 0.5624390609765244, + "grad_norm": 0.9416622519493103, + "learning_rate": 9.565973673105634e-06, + "loss": 0.8501, + "step": 44994 + }, + { + "epoch": 0.56246406160154, + "grad_norm": 0.4186151325702667, + "learning_rate": 9.565101808260238e-06, + "loss": 0.2307, + "step": 44996 + }, + { + "epoch": 0.5624890622265557, + "grad_norm": 12.29067611694336, + "learning_rate": 9.564229946726963e-06, + "loss": 1.0343, + "step": 44998 + }, + { + "epoch": 0.5625140628515712, + "grad_norm": 5.0176496505737305, + "learning_rate": 9.563358088512453e-06, + "loss": 1.6473, + "step": 45000 + }, + { + "epoch": 0.5625390634765869, + "grad_norm": 4.742405414581299, + "learning_rate": 9.562486233623347e-06, + "loss": 1.4646, + "step": 45002 + }, + { + "epoch": 0.5625640641016025, + "grad_norm": 2.9461419582366943, + "learning_rate": 9.56161438206629e-06, + "loss": 1.2363, + "step": 45004 + }, + { + "epoch": 0.5625890647266182, + "grad_norm": 4.246316909790039, + "learning_rate": 9.560742533847909e-06, + "loss": 0.275, + "step": 45006 + }, + { + "epoch": 0.5626140653516338, + "grad_norm": 4.010782718658447, + "learning_rate": 9.55987068897485e-06, + "loss": 1.2611, + "step": 45008 + }, + { + "epoch": 0.5626390659766494, + "grad_norm": 1.5728740692138672, + "learning_rate": 9.558998847453754e-06, + "loss": 0.2736, + "step": 45010 + }, + { + "epoch": 0.562664066601665, + "grad_norm": 2.9893240928649902, + "learning_rate": 9.558127009291262e-06, + "loss": 0.7517, + "step": 45012 + }, + { + "epoch": 0.5626890672266807, + "grad_norm": 5.686451435089111, + "learning_rate": 9.557255174494011e-06, + "loss": 1.4824, + "step": 45014 + }, + { + "epoch": 0.5627140678516963, + "grad_norm": 0.0008077883394435048, + "learning_rate": 9.556383343068643e-06, + "loss": 0.2048, + "step": 45016 + }, + { + "epoch": 0.562739068476712, + "grad_norm": 5.931024074554443, + "learning_rate": 9.5555115150218e-06, + "loss": 2.2925, + "step": 45018 + }, + { + "epoch": 0.5627640691017275, + "grad_norm": 0.0005725168157368898, + "learning_rate": 9.554639690360116e-06, + "loss": 0.0, + "step": 45020 + }, + { + "epoch": 0.5627890697267431, + "grad_norm": 7.3028388023376465, + "learning_rate": 9.553767869090232e-06, + "loss": 1.3871, + "step": 45022 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 4.960126876831055, + "learning_rate": 9.552896051218787e-06, + "loss": 1.4062, + "step": 45024 + }, + { + "epoch": 0.5628390709767744, + "grad_norm": 1.4775338172912598, + "learning_rate": 9.552024236752422e-06, + "loss": 0.4253, + "step": 45026 + }, + { + "epoch": 0.5628640716017901, + "grad_norm": 3.1231555938720703, + "learning_rate": 9.551152425697781e-06, + "loss": 1.1584, + "step": 45028 + }, + { + "epoch": 0.5628890722268056, + "grad_norm": 9.823740005493164, + "learning_rate": 9.550280618061501e-06, + "loss": 1.2952, + "step": 45030 + }, + { + "epoch": 0.5629140728518213, + "grad_norm": 0.2143132984638214, + "learning_rate": 9.549408813850219e-06, + "loss": 0.1074, + "step": 45032 + }, + { + "epoch": 0.5629390734768369, + "grad_norm": 0.567868173122406, + "learning_rate": 9.548537013070574e-06, + "loss": 1.5374, + "step": 45034 + }, + { + "epoch": 0.5629640741018526, + "grad_norm": 0.33563292026519775, + "learning_rate": 9.547665215729205e-06, + "loss": 0.2055, + "step": 45036 + }, + { + "epoch": 0.5629890747268682, + "grad_norm": 5.344232082366943, + "learning_rate": 9.546793421832756e-06, + "loss": 2.1252, + "step": 45038 + }, + { + "epoch": 0.5630140753518837, + "grad_norm": 4.95075798034668, + "learning_rate": 9.545921631387863e-06, + "loss": 1.9421, + "step": 45040 + }, + { + "epoch": 0.5630390759768994, + "grad_norm": 1.30292546749115, + "learning_rate": 9.545049844401168e-06, + "loss": 0.583, + "step": 45042 + }, + { + "epoch": 0.563064076601915, + "grad_norm": 4.842273712158203, + "learning_rate": 9.544178060879313e-06, + "loss": 1.652, + "step": 45044 + }, + { + "epoch": 0.5630890772269307, + "grad_norm": 3.0659284591674805, + "learning_rate": 9.54330628082893e-06, + "loss": 0.2037, + "step": 45046 + }, + { + "epoch": 0.5631140778519463, + "grad_norm": 3.818011522293091, + "learning_rate": 9.542434504256659e-06, + "loss": 1.7418, + "step": 45048 + }, + { + "epoch": 0.5631390784769619, + "grad_norm": 3.199899911880493, + "learning_rate": 9.541562731169145e-06, + "loss": 0.9022, + "step": 45050 + }, + { + "epoch": 0.5631640791019775, + "grad_norm": 1.282193899154663, + "learning_rate": 9.540690961573023e-06, + "loss": 0.0627, + "step": 45052 + }, + { + "epoch": 0.5631890797269932, + "grad_norm": 0.5513014197349548, + "learning_rate": 9.539819195474933e-06, + "loss": 0.2265, + "step": 45054 + }, + { + "epoch": 0.5632140803520088, + "grad_norm": 2.6897010803222656, + "learning_rate": 9.538947432881517e-06, + "loss": 0.3214, + "step": 45056 + }, + { + "epoch": 0.5632390809770245, + "grad_norm": 5.987569808959961, + "learning_rate": 9.538075673799415e-06, + "loss": 0.8254, + "step": 45058 + }, + { + "epoch": 0.56326408160204, + "grad_norm": 3.0922160148620605, + "learning_rate": 9.537203918235257e-06, + "loss": 1.0609, + "step": 45060 + }, + { + "epoch": 0.5632890822270556, + "grad_norm": 0.00116969074588269, + "learning_rate": 9.536332166195692e-06, + "loss": 0.9272, + "step": 45062 + }, + { + "epoch": 0.5633140828520713, + "grad_norm": 2.1615560054779053, + "learning_rate": 9.535460417687355e-06, + "loss": 0.319, + "step": 45064 + }, + { + "epoch": 0.5633390834770869, + "grad_norm": 1.447239637374878, + "learning_rate": 9.534588672716886e-06, + "loss": 0.0769, + "step": 45066 + }, + { + "epoch": 0.5633640841021026, + "grad_norm": 2.9858875274658203, + "learning_rate": 9.533716931290923e-06, + "loss": 0.6018, + "step": 45068 + }, + { + "epoch": 0.5633890847271181, + "grad_norm": 0.0007081695948727429, + "learning_rate": 9.532845193416112e-06, + "loss": 0.0, + "step": 45070 + }, + { + "epoch": 0.5634140853521338, + "grad_norm": 2.4856977462768555, + "learning_rate": 9.531973459099078e-06, + "loss": 0.5093, + "step": 45072 + }, + { + "epoch": 0.5634390859771494, + "grad_norm": 3.7899467945098877, + "learning_rate": 9.531101728346469e-06, + "loss": 1.2211, + "step": 45074 + }, + { + "epoch": 0.5634640866021651, + "grad_norm": 1.4092165231704712, + "learning_rate": 9.530230001164925e-06, + "loss": 0.7927, + "step": 45076 + }, + { + "epoch": 0.5634890872271807, + "grad_norm": 1.3929723501205444, + "learning_rate": 9.529358277561081e-06, + "loss": 0.3214, + "step": 45078 + }, + { + "epoch": 0.5635140878521963, + "grad_norm": 3.734525680541992, + "learning_rate": 9.528486557541579e-06, + "loss": 1.4226, + "step": 45080 + }, + { + "epoch": 0.5635390884772119, + "grad_norm": 2.982877254486084, + "learning_rate": 9.527614841113058e-06, + "loss": 0.991, + "step": 45082 + }, + { + "epoch": 0.5635640891022276, + "grad_norm": 0.8903031349182129, + "learning_rate": 9.526743128282155e-06, + "loss": 0.2298, + "step": 45084 + }, + { + "epoch": 0.5635890897272432, + "grad_norm": 5.384654521942139, + "learning_rate": 9.525871419055509e-06, + "loss": 1.8324, + "step": 45086 + }, + { + "epoch": 0.5636140903522588, + "grad_norm": 6.079070568084717, + "learning_rate": 9.524999713439758e-06, + "loss": 0.2204, + "step": 45088 + }, + { + "epoch": 0.5636390909772744, + "grad_norm": 0.0011817477643489838, + "learning_rate": 9.524128011441544e-06, + "loss": 0.4529, + "step": 45090 + }, + { + "epoch": 0.56366409160229, + "grad_norm": 1.7771477699279785, + "learning_rate": 9.523256313067503e-06, + "loss": 0.3421, + "step": 45092 + }, + { + "epoch": 0.5636890922273057, + "grad_norm": 2.1728556156158447, + "learning_rate": 9.522384618324273e-06, + "loss": 1.1494, + "step": 45094 + }, + { + "epoch": 0.5637140928523213, + "grad_norm": 5.251938819885254, + "learning_rate": 9.5215129272185e-06, + "loss": 1.4712, + "step": 45096 + }, + { + "epoch": 0.563739093477337, + "grad_norm": 1.978861689567566, + "learning_rate": 9.52064123975681e-06, + "loss": 0.7883, + "step": 45098 + }, + { + "epoch": 0.5637640941023525, + "grad_norm": 2.495046854019165, + "learning_rate": 9.519769555945852e-06, + "loss": 0.8259, + "step": 45100 + }, + { + "epoch": 0.5637890947273682, + "grad_norm": 4.542039394378662, + "learning_rate": 9.518897875792258e-06, + "loss": 1.2782, + "step": 45102 + }, + { + "epoch": 0.5638140953523838, + "grad_norm": 2.9009647369384766, + "learning_rate": 9.518026199302672e-06, + "loss": 0.4624, + "step": 45104 + }, + { + "epoch": 0.5638390959773995, + "grad_norm": 7.6258015632629395, + "learning_rate": 9.51715452648373e-06, + "loss": 1.4086, + "step": 45106 + }, + { + "epoch": 0.5638640966024151, + "grad_norm": 3.6792616844177246, + "learning_rate": 9.516282857342072e-06, + "loss": 1.1792, + "step": 45108 + }, + { + "epoch": 0.5638890972274306, + "grad_norm": 4.546351432800293, + "learning_rate": 9.515411191884335e-06, + "loss": 0.9859, + "step": 45110 + }, + { + "epoch": 0.5639140978524463, + "grad_norm": 3.9178595542907715, + "learning_rate": 9.514539530117158e-06, + "loss": 1.3388, + "step": 45112 + }, + { + "epoch": 0.5639390984774619, + "grad_norm": 0.0007616743096150458, + "learning_rate": 9.513667872047177e-06, + "loss": 0.6389, + "step": 45114 + }, + { + "epoch": 0.5639640991024776, + "grad_norm": 3.3454959392547607, + "learning_rate": 9.512796217681036e-06, + "loss": 1.1219, + "step": 45116 + }, + { + "epoch": 0.5639890997274932, + "grad_norm": 0.00099768559448421, + "learning_rate": 9.511924567025366e-06, + "loss": 0.0, + "step": 45118 + }, + { + "epoch": 0.5640141003525088, + "grad_norm": 6.588040828704834, + "learning_rate": 9.511052920086813e-06, + "loss": 0.4463, + "step": 45120 + }, + { + "epoch": 0.5640391009775244, + "grad_norm": 2.4153125286102295, + "learning_rate": 9.510181276872013e-06, + "loss": 0.8015, + "step": 45122 + }, + { + "epoch": 0.5640641016025401, + "grad_norm": 0.25787997245788574, + "learning_rate": 9.509309637387599e-06, + "loss": 0.6987, + "step": 45124 + }, + { + "epoch": 0.5640891022275557, + "grad_norm": 4.881200790405273, + "learning_rate": 9.508438001640216e-06, + "loss": 1.891, + "step": 45126 + }, + { + "epoch": 0.5641141028525714, + "grad_norm": 2.476801633834839, + "learning_rate": 9.507566369636498e-06, + "loss": 1.4413, + "step": 45128 + }, + { + "epoch": 0.5641391034775869, + "grad_norm": 0.24048247933387756, + "learning_rate": 9.506694741383085e-06, + "loss": 0.4441, + "step": 45130 + }, + { + "epoch": 0.5641641041026025, + "grad_norm": 4.260169982910156, + "learning_rate": 9.505823116886617e-06, + "loss": 0.7559, + "step": 45132 + }, + { + "epoch": 0.5641891047276182, + "grad_norm": 0.0018695004982873797, + "learning_rate": 9.504951496153729e-06, + "loss": 0.5589, + "step": 45134 + }, + { + "epoch": 0.5642141053526338, + "grad_norm": 3.6001675128936768, + "learning_rate": 9.504079879191062e-06, + "loss": 1.299, + "step": 45136 + }, + { + "epoch": 0.5642391059776495, + "grad_norm": 2.963831663131714, + "learning_rate": 9.50320826600525e-06, + "loss": 1.163, + "step": 45138 + }, + { + "epoch": 0.564264106602665, + "grad_norm": 2.611985921859741, + "learning_rate": 9.502336656602935e-06, + "loss": 0.0987, + "step": 45140 + }, + { + "epoch": 0.5642891072276807, + "grad_norm": 2.299846887588501, + "learning_rate": 9.501465050990753e-06, + "loss": 1.2892, + "step": 45142 + }, + { + "epoch": 0.5643141078526963, + "grad_norm": 4.100188732147217, + "learning_rate": 9.500593449175342e-06, + "loss": 1.4147, + "step": 45144 + }, + { + "epoch": 0.564339108477712, + "grad_norm": 4.438938140869141, + "learning_rate": 9.499721851163341e-06, + "loss": 1.005, + "step": 45146 + }, + { + "epoch": 0.5643641091027276, + "grad_norm": 0.6305831074714661, + "learning_rate": 9.49885025696139e-06, + "loss": 0.7804, + "step": 45148 + }, + { + "epoch": 0.5643891097277431, + "grad_norm": 1.1191250085830688, + "learning_rate": 9.497978666576123e-06, + "loss": 1.2814, + "step": 45150 + }, + { + "epoch": 0.5644141103527588, + "grad_norm": 0.5250236988067627, + "learning_rate": 9.497107080014178e-06, + "loss": 0.9445, + "step": 45152 + }, + { + "epoch": 0.5644391109777744, + "grad_norm": 2.1024069786071777, + "learning_rate": 9.496235497282198e-06, + "loss": 1.0428, + "step": 45154 + }, + { + "epoch": 0.5644641116027901, + "grad_norm": 0.0009568653185851872, + "learning_rate": 9.495363918386812e-06, + "loss": 0.9224, + "step": 45156 + }, + { + "epoch": 0.5644891122278057, + "grad_norm": 2.628099203109741, + "learning_rate": 9.494492343334666e-06, + "loss": 1.952, + "step": 45158 + }, + { + "epoch": 0.5645141128528213, + "grad_norm": 2.090925931930542, + "learning_rate": 9.493620772132395e-06, + "loss": 0.877, + "step": 45160 + }, + { + "epoch": 0.5645391134778369, + "grad_norm": 4.373915672302246, + "learning_rate": 9.49274920478664e-06, + "loss": 0.5752, + "step": 45162 + }, + { + "epoch": 0.5645641141028526, + "grad_norm": 1.9353225231170654, + "learning_rate": 9.49187764130403e-06, + "loss": 0.8584, + "step": 45164 + }, + { + "epoch": 0.5645891147278682, + "grad_norm": 2.9670703411102295, + "learning_rate": 9.49100608169121e-06, + "loss": 1.1389, + "step": 45166 + }, + { + "epoch": 0.5646141153528839, + "grad_norm": 4.52786922454834, + "learning_rate": 9.490134525954814e-06, + "loss": 1.4429, + "step": 45168 + }, + { + "epoch": 0.5646391159778994, + "grad_norm": 3.6695492267608643, + "learning_rate": 9.489262974101483e-06, + "loss": 0.1047, + "step": 45170 + }, + { + "epoch": 0.564664116602915, + "grad_norm": 2.9225335121154785, + "learning_rate": 9.488391426137852e-06, + "loss": 0.2886, + "step": 45172 + }, + { + "epoch": 0.5646891172279307, + "grad_norm": 0.004227946046739817, + "learning_rate": 9.487519882070562e-06, + "loss": 0.8175, + "step": 45174 + }, + { + "epoch": 0.5647141178529463, + "grad_norm": 2.805797815322876, + "learning_rate": 9.486648341906246e-06, + "loss": 0.9465, + "step": 45176 + }, + { + "epoch": 0.564739118477962, + "grad_norm": 4.975986003875732, + "learning_rate": 9.485776805651545e-06, + "loss": 1.9005, + "step": 45178 + }, + { + "epoch": 0.5647641191029775, + "grad_norm": 1.1297656297683716, + "learning_rate": 9.484905273313093e-06, + "loss": 0.2576, + "step": 45180 + }, + { + "epoch": 0.5647891197279932, + "grad_norm": 1.8323190212249756, + "learning_rate": 9.484033744897531e-06, + "loss": 0.638, + "step": 45182 + }, + { + "epoch": 0.5648141203530088, + "grad_norm": 0.3752215504646301, + "learning_rate": 9.483162220411494e-06, + "loss": 0.0106, + "step": 45184 + }, + { + "epoch": 0.5648391209780245, + "grad_norm": 3.4088926315307617, + "learning_rate": 9.48229069986162e-06, + "loss": 0.709, + "step": 45186 + }, + { + "epoch": 0.5648641216030401, + "grad_norm": 6.254817008972168, + "learning_rate": 9.48141918325455e-06, + "loss": 1.0176, + "step": 45188 + }, + { + "epoch": 0.5648891222280557, + "grad_norm": 2.837275505065918, + "learning_rate": 9.480547670596916e-06, + "loss": 0.8096, + "step": 45190 + }, + { + "epoch": 0.5649141228530713, + "grad_norm": 3.0135693550109863, + "learning_rate": 9.479676161895358e-06, + "loss": 0.054, + "step": 45192 + }, + { + "epoch": 0.564939123478087, + "grad_norm": 4.1162109375, + "learning_rate": 9.478804657156511e-06, + "loss": 0.2585, + "step": 45194 + }, + { + "epoch": 0.5649641241031026, + "grad_norm": 0.0005112475482746959, + "learning_rate": 9.477933156387015e-06, + "loss": 0.5349, + "step": 45196 + }, + { + "epoch": 0.5649891247281182, + "grad_norm": 8.83470344543457, + "learning_rate": 9.477061659593507e-06, + "loss": 1.414, + "step": 45198 + }, + { + "epoch": 0.5650141253531338, + "grad_norm": 4.39054012298584, + "learning_rate": 9.476190166782626e-06, + "loss": 1.4261, + "step": 45200 + }, + { + "epoch": 0.5650391259781494, + "grad_norm": 1.8198570013046265, + "learning_rate": 9.475318677961003e-06, + "loss": 0.4452, + "step": 45202 + }, + { + "epoch": 0.5650641266031651, + "grad_norm": 0.7680779695510864, + "learning_rate": 9.47444719313528e-06, + "loss": 0.9432, + "step": 45204 + }, + { + "epoch": 0.5650891272281807, + "grad_norm": 3.458339214324951, + "learning_rate": 9.473575712312091e-06, + "loss": 1.929, + "step": 45206 + }, + { + "epoch": 0.5651141278531964, + "grad_norm": 1.746229887008667, + "learning_rate": 9.472704235498076e-06, + "loss": 0.3243, + "step": 45208 + }, + { + "epoch": 0.5651391284782119, + "grad_norm": 0.0011451838072389364, + "learning_rate": 9.471832762699873e-06, + "loss": 0.0635, + "step": 45210 + }, + { + "epoch": 0.5651641291032276, + "grad_norm": 3.5846590995788574, + "learning_rate": 9.470961293924113e-06, + "loss": 0.1194, + "step": 45212 + }, + { + "epoch": 0.5651891297282432, + "grad_norm": 3.6551315784454346, + "learning_rate": 9.470089829177441e-06, + "loss": 1.1869, + "step": 45214 + }, + { + "epoch": 0.5652141303532588, + "grad_norm": 1.3334637880325317, + "learning_rate": 9.469218368466488e-06, + "loss": 1.0507, + "step": 45216 + }, + { + "epoch": 0.5652391309782745, + "grad_norm": 4.0215959548950195, + "learning_rate": 9.468346911797892e-06, + "loss": 1.1611, + "step": 45218 + }, + { + "epoch": 0.56526413160329, + "grad_norm": 0.493317186832428, + "learning_rate": 9.467475459178292e-06, + "loss": 0.5423, + "step": 45220 + }, + { + "epoch": 0.5652891322283057, + "grad_norm": 3.6569478511810303, + "learning_rate": 9.466604010614323e-06, + "loss": 0.9222, + "step": 45222 + }, + { + "epoch": 0.5653141328533213, + "grad_norm": 4.195709705352783, + "learning_rate": 9.465732566112623e-06, + "loss": 0.2093, + "step": 45224 + }, + { + "epoch": 0.565339133478337, + "grad_norm": 0.00045694501022808254, + "learning_rate": 9.464861125679831e-06, + "loss": 0.8966, + "step": 45226 + }, + { + "epoch": 0.5653641341033526, + "grad_norm": 1.8484688997268677, + "learning_rate": 9.463989689322576e-06, + "loss": 0.4038, + "step": 45228 + }, + { + "epoch": 0.5653891347283682, + "grad_norm": 3.460925340652466, + "learning_rate": 9.463118257047503e-06, + "loss": 1.3751, + "step": 45230 + }, + { + "epoch": 0.5654141353533838, + "grad_norm": 5.39837646484375, + "learning_rate": 9.462246828861244e-06, + "loss": 0.9855, + "step": 45232 + }, + { + "epoch": 0.5654391359783995, + "grad_norm": 0.000953134149312973, + "learning_rate": 9.461375404770436e-06, + "loss": 0.0979, + "step": 45234 + }, + { + "epoch": 0.5654641366034151, + "grad_norm": 2.545074462890625, + "learning_rate": 9.460503984781718e-06, + "loss": 0.5301, + "step": 45236 + }, + { + "epoch": 0.5654891372284307, + "grad_norm": 3.8062167167663574, + "learning_rate": 9.459632568901725e-06, + "loss": 0.9413, + "step": 45238 + }, + { + "epoch": 0.5655141378534463, + "grad_norm": 1.3717541694641113, + "learning_rate": 9.458761157137097e-06, + "loss": 1.0744, + "step": 45240 + }, + { + "epoch": 0.5655391384784619, + "grad_norm": 3.8997998237609863, + "learning_rate": 9.457889749494464e-06, + "loss": 1.1347, + "step": 45242 + }, + { + "epoch": 0.5655641391034776, + "grad_norm": 4.139707565307617, + "learning_rate": 9.457018345980467e-06, + "loss": 1.8126, + "step": 45244 + }, + { + "epoch": 0.5655891397284932, + "grad_norm": 6.964162349700928, + "learning_rate": 9.456146946601741e-06, + "loss": 0.8481, + "step": 45246 + }, + { + "epoch": 0.5656141403535089, + "grad_norm": 2.007213830947876, + "learning_rate": 9.455275551364922e-06, + "loss": 1.6502, + "step": 45248 + }, + { + "epoch": 0.5656391409785244, + "grad_norm": 3.213209867477417, + "learning_rate": 9.45440416027665e-06, + "loss": 0.8568, + "step": 45250 + }, + { + "epoch": 0.5656641416035401, + "grad_norm": 5.134977340698242, + "learning_rate": 9.453532773343557e-06, + "loss": 1.8254, + "step": 45252 + }, + { + "epoch": 0.5656891422285557, + "grad_norm": 0.0006470257649198174, + "learning_rate": 9.452661390572283e-06, + "loss": 0.9039, + "step": 45254 + }, + { + "epoch": 0.5657141428535714, + "grad_norm": 0.0027911337092518806, + "learning_rate": 9.451790011969459e-06, + "loss": 0.8389, + "step": 45256 + }, + { + "epoch": 0.565739143478587, + "grad_norm": 5.3187761306762695, + "learning_rate": 9.450918637541727e-06, + "loss": 1.6177, + "step": 45258 + }, + { + "epoch": 0.5657641441036025, + "grad_norm": 3.6290059089660645, + "learning_rate": 9.450047267295722e-06, + "loss": 0.6642, + "step": 45260 + }, + { + "epoch": 0.5657891447286182, + "grad_norm": 2.4618024826049805, + "learning_rate": 9.449175901238076e-06, + "loss": 1.5797, + "step": 45262 + }, + { + "epoch": 0.5658141453536338, + "grad_norm": 0.000579987361561507, + "learning_rate": 9.44830453937543e-06, + "loss": 0.0, + "step": 45264 + }, + { + "epoch": 0.5658391459786495, + "grad_norm": 0.07123282551765442, + "learning_rate": 9.447433181714421e-06, + "loss": 0.4784, + "step": 45266 + }, + { + "epoch": 0.5658641466036651, + "grad_norm": 1.729297161102295, + "learning_rate": 9.446561828261681e-06, + "loss": 0.1861, + "step": 45268 + }, + { + "epoch": 0.5658891472286807, + "grad_norm": 2.1316981315612793, + "learning_rate": 9.445690479023846e-06, + "loss": 0.7686, + "step": 45270 + }, + { + "epoch": 0.5659141478536963, + "grad_norm": 4.165405750274658, + "learning_rate": 9.444819134007556e-06, + "loss": 0.8366, + "step": 45272 + }, + { + "epoch": 0.565939148478712, + "grad_norm": 0.0005790030700154603, + "learning_rate": 9.443947793219445e-06, + "loss": 0.0888, + "step": 45274 + }, + { + "epoch": 0.5659641491037276, + "grad_norm": 0.765019953250885, + "learning_rate": 9.443076456666149e-06, + "loss": 0.1059, + "step": 45276 + }, + { + "epoch": 0.5659891497287433, + "grad_norm": 4.636776924133301, + "learning_rate": 9.442205124354304e-06, + "loss": 1.5624, + "step": 45278 + }, + { + "epoch": 0.5660141503537588, + "grad_norm": 0.8342725038528442, + "learning_rate": 9.441333796290545e-06, + "loss": 0.4352, + "step": 45280 + }, + { + "epoch": 0.5660391509787744, + "grad_norm": 2.9440746307373047, + "learning_rate": 9.440462472481508e-06, + "loss": 0.7637, + "step": 45282 + }, + { + "epoch": 0.5660641516037901, + "grad_norm": 3.7617695331573486, + "learning_rate": 9.439591152933832e-06, + "loss": 0.871, + "step": 45284 + }, + { + "epoch": 0.5660891522288057, + "grad_norm": 4.222152233123779, + "learning_rate": 9.438719837654148e-06, + "loss": 0.997, + "step": 45286 + }, + { + "epoch": 0.5661141528538214, + "grad_norm": 4.555727005004883, + "learning_rate": 9.437848526649095e-06, + "loss": 1.7139, + "step": 45288 + }, + { + "epoch": 0.5661391534788369, + "grad_norm": 4.764618396759033, + "learning_rate": 9.43697721992531e-06, + "loss": 1.6691, + "step": 45290 + }, + { + "epoch": 0.5661641541038526, + "grad_norm": 2.5010781288146973, + "learning_rate": 9.436105917489427e-06, + "loss": 0.8953, + "step": 45292 + }, + { + "epoch": 0.5661891547288682, + "grad_norm": 2.676693916320801, + "learning_rate": 9.43523461934808e-06, + "loss": 1.4063, + "step": 45294 + }, + { + "epoch": 0.5662141553538839, + "grad_norm": 4.547077655792236, + "learning_rate": 9.434363325507905e-06, + "loss": 1.6979, + "step": 45296 + }, + { + "epoch": 0.5662391559788995, + "grad_norm": 6.7676239013671875, + "learning_rate": 9.433492035975541e-06, + "loss": 1.412, + "step": 45298 + }, + { + "epoch": 0.566264156603915, + "grad_norm": 4.972382068634033, + "learning_rate": 9.43262075075762e-06, + "loss": 0.897, + "step": 45300 + }, + { + "epoch": 0.5662891572289307, + "grad_norm": 7.778543949127197, + "learning_rate": 9.43174946986078e-06, + "loss": 1.457, + "step": 45302 + }, + { + "epoch": 0.5663141578539463, + "grad_norm": 0.0005788501584902406, + "learning_rate": 9.430878193291654e-06, + "loss": 0.5322, + "step": 45304 + }, + { + "epoch": 0.566339158478962, + "grad_norm": 1.1202151775360107, + "learning_rate": 9.430006921056885e-06, + "loss": 0.7306, + "step": 45306 + }, + { + "epoch": 0.5663641591039776, + "grad_norm": 7.141280651092529, + "learning_rate": 9.429135653163098e-06, + "loss": 1.3186, + "step": 45308 + }, + { + "epoch": 0.5663891597289932, + "grad_norm": 1.2144631147384644, + "learning_rate": 9.428264389616935e-06, + "loss": 0.5304, + "step": 45310 + }, + { + "epoch": 0.5664141603540088, + "grad_norm": 2.619145393371582, + "learning_rate": 9.427393130425028e-06, + "loss": 1.0991, + "step": 45312 + }, + { + "epoch": 0.5664391609790245, + "grad_norm": 2.626272201538086, + "learning_rate": 9.426521875594013e-06, + "loss": 1.7228, + "step": 45314 + }, + { + "epoch": 0.5664641616040401, + "grad_norm": 2.774550437927246, + "learning_rate": 9.425650625130525e-06, + "loss": 1.035, + "step": 45316 + }, + { + "epoch": 0.5664891622290558, + "grad_norm": 3.4161012172698975, + "learning_rate": 9.424779379041207e-06, + "loss": 0.4826, + "step": 45318 + }, + { + "epoch": 0.5665141628540713, + "grad_norm": 2.390958070755005, + "learning_rate": 9.423908137332684e-06, + "loss": 0.3672, + "step": 45320 + }, + { + "epoch": 0.566539163479087, + "grad_norm": 1.9833636283874512, + "learning_rate": 9.423036900011597e-06, + "loss": 0.3935, + "step": 45322 + }, + { + "epoch": 0.5665641641041026, + "grad_norm": 1.9207005500793457, + "learning_rate": 9.422165667084576e-06, + "loss": 0.3941, + "step": 45324 + }, + { + "epoch": 0.5665891647291182, + "grad_norm": 3.08982253074646, + "learning_rate": 9.421294438558262e-06, + "loss": 1.0149, + "step": 45326 + }, + { + "epoch": 0.5666141653541339, + "grad_norm": 5.994816780090332, + "learning_rate": 9.420423214439284e-06, + "loss": 1.448, + "step": 45328 + }, + { + "epoch": 0.5666391659791494, + "grad_norm": 0.023511411622166634, + "learning_rate": 9.419551994734283e-06, + "loss": 0.0033, + "step": 45330 + }, + { + "epoch": 0.5666641666041651, + "grad_norm": 4.494080543518066, + "learning_rate": 9.418680779449896e-06, + "loss": 0.8496, + "step": 45332 + }, + { + "epoch": 0.5666891672291807, + "grad_norm": 7.21460485458374, + "learning_rate": 9.417809568592751e-06, + "loss": 1.029, + "step": 45334 + }, + { + "epoch": 0.5667141678541964, + "grad_norm": 3.62026309967041, + "learning_rate": 9.416938362169486e-06, + "loss": 0.5833, + "step": 45336 + }, + { + "epoch": 0.566739168479212, + "grad_norm": 6.312134742736816, + "learning_rate": 9.416067160186737e-06, + "loss": 1.6441, + "step": 45338 + }, + { + "epoch": 0.5667641691042276, + "grad_norm": 4.893598556518555, + "learning_rate": 9.415195962651134e-06, + "loss": 1.053, + "step": 45340 + }, + { + "epoch": 0.5667891697292432, + "grad_norm": 3.3297019004821777, + "learning_rate": 9.414324769569317e-06, + "loss": 0.9513, + "step": 45342 + }, + { + "epoch": 0.5668141703542589, + "grad_norm": 1.5653398036956787, + "learning_rate": 9.413453580947925e-06, + "loss": 0.4692, + "step": 45344 + }, + { + "epoch": 0.5668391709792745, + "grad_norm": 6.836367607116699, + "learning_rate": 9.412582396793583e-06, + "loss": 1.3366, + "step": 45346 + }, + { + "epoch": 0.5668641716042901, + "grad_norm": 3.2303380966186523, + "learning_rate": 9.411711217112932e-06, + "loss": 0.831, + "step": 45348 + }, + { + "epoch": 0.5668891722293057, + "grad_norm": 4.820319652557373, + "learning_rate": 9.410840041912604e-06, + "loss": 1.2292, + "step": 45350 + }, + { + "epoch": 0.5669141728543213, + "grad_norm": 3.042367935180664, + "learning_rate": 9.409968871199235e-06, + "loss": 1.3125, + "step": 45352 + }, + { + "epoch": 0.566939173479337, + "grad_norm": 2.1679868698120117, + "learning_rate": 9.409097704979457e-06, + "loss": 0.9272, + "step": 45354 + }, + { + "epoch": 0.5669641741043526, + "grad_norm": 7.088164329528809, + "learning_rate": 9.40822654325991e-06, + "loss": 0.9659, + "step": 45356 + }, + { + "epoch": 0.5669891747293683, + "grad_norm": 2.251920223236084, + "learning_rate": 9.407355386047232e-06, + "loss": 0.4008, + "step": 45358 + }, + { + "epoch": 0.5670141753543838, + "grad_norm": 2.0616068840026855, + "learning_rate": 9.406484233348045e-06, + "loss": 1.0421, + "step": 45360 + }, + { + "epoch": 0.5670391759793995, + "grad_norm": 4.387168884277344, + "learning_rate": 9.405613085168989e-06, + "loss": 0.8926, + "step": 45362 + }, + { + "epoch": 0.5670641766044151, + "grad_norm": 0.0008023454574868083, + "learning_rate": 9.404741941516701e-06, + "loss": 1.1986, + "step": 45364 + }, + { + "epoch": 0.5670891772294308, + "grad_norm": 0.00043332946370355785, + "learning_rate": 9.403870802397814e-06, + "loss": 1.2996, + "step": 45366 + }, + { + "epoch": 0.5671141778544464, + "grad_norm": 3.7969298362731934, + "learning_rate": 9.402999667818963e-06, + "loss": 2.5736, + "step": 45368 + }, + { + "epoch": 0.5671391784794619, + "grad_norm": 9.402859687805176, + "learning_rate": 9.402128537786786e-06, + "loss": 0.6554, + "step": 45370 + }, + { + "epoch": 0.5671641791044776, + "grad_norm": 2.3792665004730225, + "learning_rate": 9.40125741230791e-06, + "loss": 0.6601, + "step": 45372 + }, + { + "epoch": 0.5671891797294932, + "grad_norm": 6.2282562255859375, + "learning_rate": 9.400386291388974e-06, + "loss": 1.4456, + "step": 45374 + }, + { + "epoch": 0.5672141803545089, + "grad_norm": 4.780490398406982, + "learning_rate": 9.399515175036609e-06, + "loss": 1.1489, + "step": 45376 + }, + { + "epoch": 0.5672391809795245, + "grad_norm": 2.3617329597473145, + "learning_rate": 9.398644063257451e-06, + "loss": 0.3986, + "step": 45378 + }, + { + "epoch": 0.5672641816045401, + "grad_norm": 2.796962022781372, + "learning_rate": 9.397772956058136e-06, + "loss": 0.58, + "step": 45380 + }, + { + "epoch": 0.5672891822295557, + "grad_norm": 2.410885810852051, + "learning_rate": 9.396901853445296e-06, + "loss": 0.8561, + "step": 45382 + }, + { + "epoch": 0.5673141828545714, + "grad_norm": 4.377414703369141, + "learning_rate": 9.396030755425573e-06, + "loss": 1.3114, + "step": 45384 + }, + { + "epoch": 0.567339183479587, + "grad_norm": 1.8940744400024414, + "learning_rate": 9.395159662005589e-06, + "loss": 1.1445, + "step": 45386 + }, + { + "epoch": 0.5673641841046027, + "grad_norm": 3.8981947898864746, + "learning_rate": 9.394288573191982e-06, + "loss": 0.6105, + "step": 45388 + }, + { + "epoch": 0.5673891847296182, + "grad_norm": 3.0498292446136475, + "learning_rate": 9.393417488991387e-06, + "loss": 0.8253, + "step": 45390 + }, + { + "epoch": 0.5674141853546338, + "grad_norm": 2.2332828044891357, + "learning_rate": 9.39254640941044e-06, + "loss": 0.4574, + "step": 45392 + }, + { + "epoch": 0.5674391859796495, + "grad_norm": 2.9128026962280273, + "learning_rate": 9.391675334455774e-06, + "loss": 0.8864, + "step": 45394 + }, + { + "epoch": 0.5674641866046651, + "grad_norm": 0.6635257005691528, + "learning_rate": 9.390804264134029e-06, + "loss": 1.2262, + "step": 45396 + }, + { + "epoch": 0.5674891872296808, + "grad_norm": 2.462834358215332, + "learning_rate": 9.389933198451825e-06, + "loss": 1.0974, + "step": 45398 + }, + { + "epoch": 0.5675141878546963, + "grad_norm": 9.426126480102539, + "learning_rate": 9.389062137415804e-06, + "loss": 0.8617, + "step": 45400 + }, + { + "epoch": 0.567539188479712, + "grad_norm": 2.4536612033843994, + "learning_rate": 9.388191081032598e-06, + "loss": 0.5348, + "step": 45402 + }, + { + "epoch": 0.5675641891047276, + "grad_norm": 3.6518614292144775, + "learning_rate": 9.387320029308845e-06, + "loss": 1.4887, + "step": 45404 + }, + { + "epoch": 0.5675891897297433, + "grad_norm": 0.23429875075817108, + "learning_rate": 9.386448982251175e-06, + "loss": 0.1784, + "step": 45406 + }, + { + "epoch": 0.5676141903547589, + "grad_norm": 2.5342624187469482, + "learning_rate": 9.385577939866224e-06, + "loss": 1.5995, + "step": 45408 + }, + { + "epoch": 0.5676391909797744, + "grad_norm": 4.322473049163818, + "learning_rate": 9.384706902160628e-06, + "loss": 1.5543, + "step": 45410 + }, + { + "epoch": 0.5676641916047901, + "grad_norm": 0.0006082886247895658, + "learning_rate": 9.383835869141013e-06, + "loss": 0.3758, + "step": 45412 + }, + { + "epoch": 0.5676891922298057, + "grad_norm": 6.811263084411621, + "learning_rate": 9.382964840814016e-06, + "loss": 2.0376, + "step": 45414 + }, + { + "epoch": 0.5677141928548214, + "grad_norm": 3.6150431632995605, + "learning_rate": 9.382093817186271e-06, + "loss": 1.3846, + "step": 45416 + }, + { + "epoch": 0.567739193479837, + "grad_norm": 3.3904507160186768, + "learning_rate": 9.381222798264413e-06, + "loss": 1.4116, + "step": 45418 + }, + { + "epoch": 0.5677641941048526, + "grad_norm": 3.1476991176605225, + "learning_rate": 9.380351784055075e-06, + "loss": 1.0629, + "step": 45420 + }, + { + "epoch": 0.5677891947298682, + "grad_norm": 4.678417205810547, + "learning_rate": 9.379480774564894e-06, + "loss": 0.6296, + "step": 45422 + }, + { + "epoch": 0.5678141953548839, + "grad_norm": 3.4824767112731934, + "learning_rate": 9.378609769800497e-06, + "loss": 0.6676, + "step": 45424 + }, + { + "epoch": 0.5678391959798995, + "grad_norm": 3.4402503967285156, + "learning_rate": 9.377738769768518e-06, + "loss": 0.6001, + "step": 45426 + }, + { + "epoch": 0.5678641966049152, + "grad_norm": 5.087311267852783, + "learning_rate": 9.376867774475591e-06, + "loss": 2.127, + "step": 45428 + }, + { + "epoch": 0.5678891972299307, + "grad_norm": 9.116327285766602, + "learning_rate": 9.375996783928354e-06, + "loss": 1.0992, + "step": 45430 + }, + { + "epoch": 0.5679141978549463, + "grad_norm": 6.84814453125, + "learning_rate": 9.375125798133438e-06, + "loss": 0.9489, + "step": 45432 + }, + { + "epoch": 0.567939198479962, + "grad_norm": 4.038649559020996, + "learning_rate": 9.374254817097473e-06, + "loss": 0.4261, + "step": 45434 + }, + { + "epoch": 0.5679641991049776, + "grad_norm": 0.40996772050857544, + "learning_rate": 9.373383840827102e-06, + "loss": 0.2296, + "step": 45436 + }, + { + "epoch": 0.5679891997299933, + "grad_norm": 3.1549389362335205, + "learning_rate": 9.372512869328945e-06, + "loss": 1.5444, + "step": 45438 + }, + { + "epoch": 0.5680142003550088, + "grad_norm": 3.9146692752838135, + "learning_rate": 9.37164190260964e-06, + "loss": 2.4147, + "step": 45440 + }, + { + "epoch": 0.5680392009800245, + "grad_norm": 14.121867179870605, + "learning_rate": 9.370770940675822e-06, + "loss": 2.0143, + "step": 45442 + }, + { + "epoch": 0.5680642016050401, + "grad_norm": 0.8879891633987427, + "learning_rate": 9.369899983534125e-06, + "loss": 0.5057, + "step": 45444 + }, + { + "epoch": 0.5680892022300558, + "grad_norm": 0.049207717180252075, + "learning_rate": 9.369029031191179e-06, + "loss": 0.8213, + "step": 45446 + }, + { + "epoch": 0.5681142028550714, + "grad_norm": 5.045225620269775, + "learning_rate": 9.368158083653624e-06, + "loss": 1.9465, + "step": 45448 + }, + { + "epoch": 0.568139203480087, + "grad_norm": 0.018843790516257286, + "learning_rate": 9.367287140928083e-06, + "loss": 0.1588, + "step": 45450 + }, + { + "epoch": 0.5681642041051026, + "grad_norm": 3.1859421730041504, + "learning_rate": 9.366416203021193e-06, + "loss": 1.1793, + "step": 45452 + }, + { + "epoch": 0.5681892047301182, + "grad_norm": 2.5640759468078613, + "learning_rate": 9.365545269939588e-06, + "loss": 0.9801, + "step": 45454 + }, + { + "epoch": 0.5682142053551339, + "grad_norm": 3.4374778270721436, + "learning_rate": 9.364674341689902e-06, + "loss": 1.6947, + "step": 45456 + }, + { + "epoch": 0.5682392059801495, + "grad_norm": 0.2752630114555359, + "learning_rate": 9.363803418278764e-06, + "loss": 0.3516, + "step": 45458 + }, + { + "epoch": 0.5682642066051651, + "grad_norm": 2.358187437057495, + "learning_rate": 9.36293249971281e-06, + "loss": 0.1503, + "step": 45460 + }, + { + "epoch": 0.5682892072301807, + "grad_norm": 4.839885234832764, + "learning_rate": 9.362061585998679e-06, + "loss": 0.7006, + "step": 45462 + }, + { + "epoch": 0.5683142078551964, + "grad_norm": 2.393420457839966, + "learning_rate": 9.361190677142987e-06, + "loss": 0.3523, + "step": 45464 + }, + { + "epoch": 0.568339208480212, + "grad_norm": 2.572659969329834, + "learning_rate": 9.36031977315238e-06, + "loss": 0.4715, + "step": 45466 + }, + { + "epoch": 0.5683642091052277, + "grad_norm": 1.9128199815750122, + "learning_rate": 9.359448874033488e-06, + "loss": 0.1026, + "step": 45468 + }, + { + "epoch": 0.5683892097302432, + "grad_norm": 14.70226764678955, + "learning_rate": 9.358577979792942e-06, + "loss": 0.7044, + "step": 45470 + }, + { + "epoch": 0.5684142103552589, + "grad_norm": 1.2889548540115356, + "learning_rate": 9.357707090437375e-06, + "loss": 1.4447, + "step": 45472 + }, + { + "epoch": 0.5684392109802745, + "grad_norm": 6.5705132484436035, + "learning_rate": 9.356836205973424e-06, + "loss": 0.8744, + "step": 45474 + }, + { + "epoch": 0.5684642116052901, + "grad_norm": 3.2658274173736572, + "learning_rate": 9.355965326407711e-06, + "loss": 0.5633, + "step": 45476 + }, + { + "epoch": 0.5684892122303058, + "grad_norm": 0.0008149552741087973, + "learning_rate": 9.355094451746879e-06, + "loss": 1.5069, + "step": 45478 + }, + { + "epoch": 0.5685142128553213, + "grad_norm": 3.463366985321045, + "learning_rate": 9.354223581997555e-06, + "loss": 1.2716, + "step": 45480 + }, + { + "epoch": 0.568539213480337, + "grad_norm": 0.4694118797779083, + "learning_rate": 9.353352717166374e-06, + "loss": 1.1504, + "step": 45482 + }, + { + "epoch": 0.5685642141053526, + "grad_norm": 2.685326099395752, + "learning_rate": 9.352481857259966e-06, + "loss": 0.79, + "step": 45484 + }, + { + "epoch": 0.5685892147303683, + "grad_norm": 1.3220710754394531, + "learning_rate": 9.351611002284963e-06, + "loss": 1.1397, + "step": 45486 + }, + { + "epoch": 0.5686142153553839, + "grad_norm": 5.676471710205078, + "learning_rate": 9.350740152248005e-06, + "loss": 1.4109, + "step": 45488 + }, + { + "epoch": 0.5686392159803995, + "grad_norm": 2.8916878700256348, + "learning_rate": 9.349869307155714e-06, + "loss": 1.3357, + "step": 45490 + }, + { + "epoch": 0.5686642166054151, + "grad_norm": 2.3060379028320312, + "learning_rate": 9.348998467014727e-06, + "loss": 1.417, + "step": 45492 + }, + { + "epoch": 0.5686892172304308, + "grad_norm": 2.315154552459717, + "learning_rate": 9.348127631831674e-06, + "loss": 1.0588, + "step": 45494 + }, + { + "epoch": 0.5687142178554464, + "grad_norm": 0.10694582015275955, + "learning_rate": 9.34725680161319e-06, + "loss": 0.8702, + "step": 45496 + }, + { + "epoch": 0.568739218480462, + "grad_norm": 3.203803539276123, + "learning_rate": 9.346385976365905e-06, + "loss": 1.176, + "step": 45498 + }, + { + "epoch": 0.5687642191054776, + "grad_norm": 4.961576461791992, + "learning_rate": 9.345515156096458e-06, + "loss": 1.6561, + "step": 45500 + }, + { + "epoch": 0.5687892197304932, + "grad_norm": 3.0089025497436523, + "learning_rate": 9.344644340811467e-06, + "loss": 0.9361, + "step": 45502 + }, + { + "epoch": 0.5688142203555089, + "grad_norm": 5.107158184051514, + "learning_rate": 9.343773530517575e-06, + "loss": 2.4901, + "step": 45504 + }, + { + "epoch": 0.5688392209805245, + "grad_norm": 3.112464427947998, + "learning_rate": 9.34290272522141e-06, + "loss": 1.3721, + "step": 45506 + }, + { + "epoch": 0.5688642216055402, + "grad_norm": 3.6581199169158936, + "learning_rate": 9.342031924929607e-06, + "loss": 0.4366, + "step": 45508 + }, + { + "epoch": 0.5688892222305557, + "grad_norm": 1.8554781675338745, + "learning_rate": 9.341161129648794e-06, + "loss": 1.1598, + "step": 45510 + }, + { + "epoch": 0.5689142228555714, + "grad_norm": 2.830090045928955, + "learning_rate": 9.340290339385604e-06, + "loss": 1.2966, + "step": 45512 + }, + { + "epoch": 0.568939223480587, + "grad_norm": 3.622783899307251, + "learning_rate": 9.339419554146673e-06, + "loss": 1.0183, + "step": 45514 + }, + { + "epoch": 0.5689642241056027, + "grad_norm": 2.8283066749572754, + "learning_rate": 9.338548773938627e-06, + "loss": 0.8874, + "step": 45516 + }, + { + "epoch": 0.5689892247306183, + "grad_norm": 6.604499816894531, + "learning_rate": 9.3376779987681e-06, + "loss": 0.8334, + "step": 45518 + }, + { + "epoch": 0.5690142253556338, + "grad_norm": 3.98909592628479, + "learning_rate": 9.336807228641724e-06, + "loss": 1.0284, + "step": 45520 + }, + { + "epoch": 0.5690392259806495, + "grad_norm": 2.5901169776916504, + "learning_rate": 9.33593646356613e-06, + "loss": 0.4587, + "step": 45522 + }, + { + "epoch": 0.5690642266056651, + "grad_norm": 2.805788278579712, + "learning_rate": 9.33506570354795e-06, + "loss": 1.4471, + "step": 45524 + }, + { + "epoch": 0.5690892272306808, + "grad_norm": 0.6488988399505615, + "learning_rate": 9.334194948593817e-06, + "loss": 0.7489, + "step": 45526 + }, + { + "epoch": 0.5691142278556964, + "grad_norm": 1.879173755645752, + "learning_rate": 9.33332419871036e-06, + "loss": 0.4397, + "step": 45528 + }, + { + "epoch": 0.569139228480712, + "grad_norm": 0.5959612131118774, + "learning_rate": 9.332453453904212e-06, + "loss": 0.0158, + "step": 45530 + }, + { + "epoch": 0.5691642291057276, + "grad_norm": 15.184776306152344, + "learning_rate": 9.331582714182002e-06, + "loss": 0.5574, + "step": 45532 + }, + { + "epoch": 0.5691892297307433, + "grad_norm": 0.3577817380428314, + "learning_rate": 9.330711979550365e-06, + "loss": 0.4422, + "step": 45534 + }, + { + "epoch": 0.5692142303557589, + "grad_norm": 1.4931262731552124, + "learning_rate": 9.329841250015932e-06, + "loss": 1.029, + "step": 45536 + }, + { + "epoch": 0.5692392309807746, + "grad_norm": 3.095349073410034, + "learning_rate": 9.32897052558533e-06, + "loss": 0.3202, + "step": 45538 + }, + { + "epoch": 0.5692642316057901, + "grad_norm": 3.978213310241699, + "learning_rate": 9.3280998062652e-06, + "loss": 2.0493, + "step": 45540 + }, + { + "epoch": 0.5692892322308057, + "grad_norm": 2.567436695098877, + "learning_rate": 9.327229092062162e-06, + "loss": 0.5926, + "step": 45542 + }, + { + "epoch": 0.5693142328558214, + "grad_norm": 3.807705879211426, + "learning_rate": 9.326358382982853e-06, + "loss": 1.3495, + "step": 45544 + }, + { + "epoch": 0.569339233480837, + "grad_norm": 8.763529777526855, + "learning_rate": 9.325487679033901e-06, + "loss": 1.1166, + "step": 45546 + }, + { + "epoch": 0.5693642341058527, + "grad_norm": 2.5070815086364746, + "learning_rate": 9.324616980221944e-06, + "loss": 0.5329, + "step": 45548 + }, + { + "epoch": 0.5693892347308682, + "grad_norm": 4.5952839851379395, + "learning_rate": 9.323746286553605e-06, + "loss": 2.5159, + "step": 45550 + }, + { + "epoch": 0.5694142353558839, + "grad_norm": 4.306451797485352, + "learning_rate": 9.322875598035522e-06, + "loss": 1.0838, + "step": 45552 + }, + { + "epoch": 0.5694392359808995, + "grad_norm": 7.9039626121521, + "learning_rate": 9.32200491467432e-06, + "loss": 0.3276, + "step": 45554 + }, + { + "epoch": 0.5694642366059152, + "grad_norm": 2.3865997791290283, + "learning_rate": 9.321134236476633e-06, + "loss": 0.5034, + "step": 45556 + }, + { + "epoch": 0.5694892372309308, + "grad_norm": 5.094672203063965, + "learning_rate": 9.320263563449091e-06, + "loss": 1.0665, + "step": 45558 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 6.405282497406006, + "learning_rate": 9.319392895598327e-06, + "loss": 1.4151, + "step": 45560 + }, + { + "epoch": 0.569539238480962, + "grad_norm": 2.7315948009490967, + "learning_rate": 9.31852223293097e-06, + "loss": 1.6147, + "step": 45562 + }, + { + "epoch": 0.5695642391059776, + "grad_norm": 3.069096088409424, + "learning_rate": 9.31765157545365e-06, + "loss": 0.3547, + "step": 45564 + }, + { + "epoch": 0.5695892397309933, + "grad_norm": 0.7270247340202332, + "learning_rate": 9.316780923173004e-06, + "loss": 0.4303, + "step": 45566 + }, + { + "epoch": 0.5696142403560089, + "grad_norm": 2.8700406551361084, + "learning_rate": 9.315910276095652e-06, + "loss": 1.3226, + "step": 45568 + }, + { + "epoch": 0.5696392409810245, + "grad_norm": 3.0888748168945312, + "learning_rate": 9.315039634228233e-06, + "loss": 0.7991, + "step": 45570 + }, + { + "epoch": 0.5696642416060401, + "grad_norm": 0.1746455579996109, + "learning_rate": 9.314168997577376e-06, + "loss": 0.4359, + "step": 45572 + }, + { + "epoch": 0.5696892422310558, + "grad_norm": 1.4986872673034668, + "learning_rate": 9.31329836614971e-06, + "loss": 0.8186, + "step": 45574 + }, + { + "epoch": 0.5697142428560714, + "grad_norm": 2.651693820953369, + "learning_rate": 9.312427739951865e-06, + "loss": 1.0533, + "step": 45576 + }, + { + "epoch": 0.5697392434810871, + "grad_norm": 4.939410209655762, + "learning_rate": 9.311557118990473e-06, + "loss": 1.1401, + "step": 45578 + }, + { + "epoch": 0.5697642441061026, + "grad_norm": 5.221944808959961, + "learning_rate": 9.31068650327217e-06, + "loss": 1.4219, + "step": 45580 + }, + { + "epoch": 0.5697892447311182, + "grad_norm": 1.9586049318313599, + "learning_rate": 9.309815892803577e-06, + "loss": 0.9336, + "step": 45582 + }, + { + "epoch": 0.5698142453561339, + "grad_norm": 3.581660509109497, + "learning_rate": 9.30894528759133e-06, + "loss": 0.908, + "step": 45584 + }, + { + "epoch": 0.5698392459811495, + "grad_norm": 0.12766452133655548, + "learning_rate": 9.308074687642055e-06, + "loss": 0.0014, + "step": 45586 + }, + { + "epoch": 0.5698642466061652, + "grad_norm": 7.811588287353516, + "learning_rate": 9.307204092962388e-06, + "loss": 1.2165, + "step": 45588 + }, + { + "epoch": 0.5698892472311807, + "grad_norm": 2.3489675521850586, + "learning_rate": 9.306333503558956e-06, + "loss": 0.5738, + "step": 45590 + }, + { + "epoch": 0.5699142478561964, + "grad_norm": 3.104771614074707, + "learning_rate": 9.305462919438391e-06, + "loss": 0.8011, + "step": 45592 + }, + { + "epoch": 0.569939248481212, + "grad_norm": 7.055609226226807, + "learning_rate": 9.30459234060732e-06, + "loss": 0.6478, + "step": 45594 + }, + { + "epoch": 0.5699642491062277, + "grad_norm": 3.7924015522003174, + "learning_rate": 9.303721767072378e-06, + "loss": 1.3027, + "step": 45596 + }, + { + "epoch": 0.5699892497312433, + "grad_norm": 3.0288655757904053, + "learning_rate": 9.30285119884019e-06, + "loss": 1.4501, + "step": 45598 + }, + { + "epoch": 0.5700142503562589, + "grad_norm": 5.0834455490112305, + "learning_rate": 9.301980635917389e-06, + "loss": 1.2487, + "step": 45600 + }, + { + "epoch": 0.5700392509812745, + "grad_norm": 2.3420145511627197, + "learning_rate": 9.301110078310604e-06, + "loss": 0.8971, + "step": 45602 + }, + { + "epoch": 0.5700642516062902, + "grad_norm": 4.622331142425537, + "learning_rate": 9.300239526026467e-06, + "loss": 1.4403, + "step": 45604 + }, + { + "epoch": 0.5700892522313058, + "grad_norm": 2.3586344718933105, + "learning_rate": 9.29936897907161e-06, + "loss": 0.3867, + "step": 45606 + }, + { + "epoch": 0.5701142528563214, + "grad_norm": 3.010143518447876, + "learning_rate": 9.298498437452656e-06, + "loss": 0.7545, + "step": 45608 + }, + { + "epoch": 0.570139253481337, + "grad_norm": 2.5345239639282227, + "learning_rate": 9.297627901176241e-06, + "loss": 1.5351, + "step": 45610 + }, + { + "epoch": 0.5701642541063526, + "grad_norm": 10.60861587524414, + "learning_rate": 9.29675737024899e-06, + "loss": 1.2102, + "step": 45612 + }, + { + "epoch": 0.5701892547313683, + "grad_norm": 3.0199387073516846, + "learning_rate": 9.295886844677537e-06, + "loss": 0.8502, + "step": 45614 + }, + { + "epoch": 0.5702142553563839, + "grad_norm": 2.7267284393310547, + "learning_rate": 9.295016324468509e-06, + "loss": 1.1342, + "step": 45616 + }, + { + "epoch": 0.5702392559813996, + "grad_norm": 3.8151183128356934, + "learning_rate": 9.29414580962854e-06, + "loss": 0.892, + "step": 45618 + }, + { + "epoch": 0.5702642566064151, + "grad_norm": 0.502029299736023, + "learning_rate": 9.293275300164254e-06, + "loss": 0.5279, + "step": 45620 + }, + { + "epoch": 0.5702892572314308, + "grad_norm": 2.4153530597686768, + "learning_rate": 9.292404796082283e-06, + "loss": 0.7579, + "step": 45622 + }, + { + "epoch": 0.5703142578564464, + "grad_norm": 2.719660758972168, + "learning_rate": 9.291534297389258e-06, + "loss": 1.6472, + "step": 45624 + }, + { + "epoch": 0.570339258481462, + "grad_norm": 5.226475238800049, + "learning_rate": 9.290663804091807e-06, + "loss": 2.2047, + "step": 45626 + }, + { + "epoch": 0.5703642591064777, + "grad_norm": 3.2405145168304443, + "learning_rate": 9.28979331619656e-06, + "loss": 0.7093, + "step": 45628 + }, + { + "epoch": 0.5703892597314932, + "grad_norm": 2.1071722507476807, + "learning_rate": 9.288922833710146e-06, + "loss": 0.7084, + "step": 45630 + }, + { + "epoch": 0.5704142603565089, + "grad_norm": 3.2712161540985107, + "learning_rate": 9.2880523566392e-06, + "loss": 0.8674, + "step": 45632 + }, + { + "epoch": 0.5704392609815245, + "grad_norm": 3.8831441402435303, + "learning_rate": 9.287181884990341e-06, + "loss": 1.1297, + "step": 45634 + }, + { + "epoch": 0.5704642616065402, + "grad_norm": 0.0004918955382890999, + "learning_rate": 9.286311418770207e-06, + "loss": 0.5938, + "step": 45636 + }, + { + "epoch": 0.5704892622315558, + "grad_norm": 0.0007326296763494611, + "learning_rate": 9.285440957985424e-06, + "loss": 1.3565, + "step": 45638 + }, + { + "epoch": 0.5705142628565714, + "grad_norm": 3.599937915802002, + "learning_rate": 9.28457050264262e-06, + "loss": 0.8219, + "step": 45640 + }, + { + "epoch": 0.570539263481587, + "grad_norm": 3.202225923538208, + "learning_rate": 9.283700052748425e-06, + "loss": 0.809, + "step": 45642 + }, + { + "epoch": 0.5705642641066027, + "grad_norm": 4.027810573577881, + "learning_rate": 9.282829608309474e-06, + "loss": 1.201, + "step": 45644 + }, + { + "epoch": 0.5705892647316183, + "grad_norm": 0.0013077303301542997, + "learning_rate": 9.281959169332387e-06, + "loss": 0.0, + "step": 45646 + }, + { + "epoch": 0.570614265356634, + "grad_norm": 3.8326218128204346, + "learning_rate": 9.281088735823798e-06, + "loss": 2.4022, + "step": 45648 + }, + { + "epoch": 0.5706392659816495, + "grad_norm": 0.001161133055575192, + "learning_rate": 9.280218307790337e-06, + "loss": 0.0, + "step": 45650 + }, + { + "epoch": 0.5706642666066651, + "grad_norm": 2.567117691040039, + "learning_rate": 9.27934788523863e-06, + "loss": 1.5625, + "step": 45652 + }, + { + "epoch": 0.5706892672316808, + "grad_norm": 1.9423351287841797, + "learning_rate": 9.278477468175308e-06, + "loss": 1.3973, + "step": 45654 + }, + { + "epoch": 0.5707142678566964, + "grad_norm": 5.529319763183594, + "learning_rate": 9.277607056607e-06, + "loss": 0.7499, + "step": 45656 + }, + { + "epoch": 0.5707392684817121, + "grad_norm": 2.985520124435425, + "learning_rate": 9.276736650540337e-06, + "loss": 0.8971, + "step": 45658 + }, + { + "epoch": 0.5707642691067276, + "grad_norm": 2.97152042388916, + "learning_rate": 9.275866249981941e-06, + "loss": 1.3682, + "step": 45660 + }, + { + "epoch": 0.5707892697317433, + "grad_norm": 1.935356855392456, + "learning_rate": 9.274995854938448e-06, + "loss": 0.7184, + "step": 45662 + }, + { + "epoch": 0.5708142703567589, + "grad_norm": 2.108909845352173, + "learning_rate": 9.274125465416483e-06, + "loss": 0.6032, + "step": 45664 + }, + { + "epoch": 0.5708392709817746, + "grad_norm": 2.560296058654785, + "learning_rate": 9.273255081422675e-06, + "loss": 0.2028, + "step": 45666 + }, + { + "epoch": 0.5708642716067902, + "grad_norm": 1.995287299156189, + "learning_rate": 9.272384702963653e-06, + "loss": 0.4435, + "step": 45668 + }, + { + "epoch": 0.5708892722318057, + "grad_norm": 3.4894518852233887, + "learning_rate": 9.27151433004605e-06, + "loss": 1.9, + "step": 45670 + }, + { + "epoch": 0.5709142728568214, + "grad_norm": 2.9130642414093018, + "learning_rate": 9.270643962676489e-06, + "loss": 1.0476, + "step": 45672 + }, + { + "epoch": 0.570939273481837, + "grad_norm": 3.0545549392700195, + "learning_rate": 9.269773600861598e-06, + "loss": 0.9568, + "step": 45674 + }, + { + "epoch": 0.5709642741068527, + "grad_norm": 4.478982925415039, + "learning_rate": 9.268903244608011e-06, + "loss": 1.691, + "step": 45676 + }, + { + "epoch": 0.5709892747318683, + "grad_norm": 2.9734015464782715, + "learning_rate": 9.268032893922353e-06, + "loss": 0.2805, + "step": 45678 + }, + { + "epoch": 0.5710142753568839, + "grad_norm": 3.292268753051758, + "learning_rate": 9.26716254881125e-06, + "loss": 0.433, + "step": 45680 + }, + { + "epoch": 0.5710392759818995, + "grad_norm": 2.388918161392212, + "learning_rate": 9.266292209281336e-06, + "loss": 0.5938, + "step": 45682 + }, + { + "epoch": 0.5710642766069152, + "grad_norm": 5.307463645935059, + "learning_rate": 9.265421875339238e-06, + "loss": 1.1355, + "step": 45684 + }, + { + "epoch": 0.5710892772319308, + "grad_norm": 2.625697612762451, + "learning_rate": 9.264551546991582e-06, + "loss": 0.4983, + "step": 45686 + }, + { + "epoch": 0.5711142778569465, + "grad_norm": 2.138763904571533, + "learning_rate": 9.263681224244997e-06, + "loss": 1.0055, + "step": 45688 + }, + { + "epoch": 0.571139278481962, + "grad_norm": 0.5777183175086975, + "learning_rate": 9.26281090710611e-06, + "loss": 0.8621, + "step": 45690 + }, + { + "epoch": 0.5711642791069776, + "grad_norm": 3.551234245300293, + "learning_rate": 9.261940595581554e-06, + "loss": 1.1739, + "step": 45692 + }, + { + "epoch": 0.5711892797319933, + "grad_norm": 3.5087528228759766, + "learning_rate": 9.261070289677951e-06, + "loss": 0.8469, + "step": 45694 + }, + { + "epoch": 0.5712142803570089, + "grad_norm": 1.3190847635269165, + "learning_rate": 9.260199989401937e-06, + "loss": 0.4861, + "step": 45696 + }, + { + "epoch": 0.5712392809820246, + "grad_norm": 3.784691095352173, + "learning_rate": 9.259329694760132e-06, + "loss": 1.0049, + "step": 45698 + }, + { + "epoch": 0.5712642816070401, + "grad_norm": 4.44071102142334, + "learning_rate": 9.258459405759167e-06, + "loss": 0.6786, + "step": 45700 + }, + { + "epoch": 0.5712892822320558, + "grad_norm": 2.373568534851074, + "learning_rate": 9.25758912240567e-06, + "loss": 0.3904, + "step": 45702 + }, + { + "epoch": 0.5713142828570714, + "grad_norm": 1.6291017532348633, + "learning_rate": 9.25671884470627e-06, + "loss": 0.8691, + "step": 45704 + }, + { + "epoch": 0.5713392834820871, + "grad_norm": 7.748600482940674, + "learning_rate": 9.255848572667593e-06, + "loss": 0.6375, + "step": 45706 + }, + { + "epoch": 0.5713642841071027, + "grad_norm": 3.2374775409698486, + "learning_rate": 9.254978306296267e-06, + "loss": 1.0144, + "step": 45708 + }, + { + "epoch": 0.5713892847321183, + "grad_norm": 0.0009910649387165904, + "learning_rate": 9.254108045598928e-06, + "loss": 0.0, + "step": 45710 + }, + { + "epoch": 0.5714142853571339, + "grad_norm": 3.3564329147338867, + "learning_rate": 9.25323779058219e-06, + "loss": 0.8239, + "step": 45712 + }, + { + "epoch": 0.5714392859821495, + "grad_norm": 3.378084182739258, + "learning_rate": 9.252367541252689e-06, + "loss": 0.2331, + "step": 45714 + }, + { + "epoch": 0.5714642866071652, + "grad_norm": 5.6262688636779785, + "learning_rate": 9.251497297617052e-06, + "loss": 2.1316, + "step": 45716 + }, + { + "epoch": 0.5714892872321808, + "grad_norm": 2.384335994720459, + "learning_rate": 9.250627059681904e-06, + "loss": 0.4896, + "step": 45718 + }, + { + "epoch": 0.5715142878571964, + "grad_norm": 0.38865044713020325, + "learning_rate": 9.249756827453873e-06, + "loss": 0.0805, + "step": 45720 + }, + { + "epoch": 0.571539288482212, + "grad_norm": 2.996445417404175, + "learning_rate": 9.248886600939593e-06, + "loss": 0.2519, + "step": 45722 + }, + { + "epoch": 0.5715642891072277, + "grad_norm": 2.4207136631011963, + "learning_rate": 9.248016380145685e-06, + "loss": 0.9337, + "step": 45724 + }, + { + "epoch": 0.5715892897322433, + "grad_norm": 3.4801149368286133, + "learning_rate": 9.247146165078775e-06, + "loss": 0.9888, + "step": 45726 + }, + { + "epoch": 0.571614290357259, + "grad_norm": 3.926851987838745, + "learning_rate": 9.246275955745496e-06, + "loss": 1.4287, + "step": 45728 + }, + { + "epoch": 0.5716392909822745, + "grad_norm": 5.78959846496582, + "learning_rate": 9.24540575215247e-06, + "loss": 0.6957, + "step": 45730 + }, + { + "epoch": 0.5716642916072902, + "grad_norm": 6.9118242263793945, + "learning_rate": 9.244535554306326e-06, + "loss": 1.2071, + "step": 45732 + }, + { + "epoch": 0.5716892922323058, + "grad_norm": 3.2788279056549072, + "learning_rate": 9.243665362213696e-06, + "loss": 1.3187, + "step": 45734 + }, + { + "epoch": 0.5717142928573214, + "grad_norm": 1.6770986318588257, + "learning_rate": 9.242795175881206e-06, + "loss": 0.4956, + "step": 45736 + }, + { + "epoch": 0.5717392934823371, + "grad_norm": 4.677144527435303, + "learning_rate": 9.241924995315476e-06, + "loss": 1.1723, + "step": 45738 + }, + { + "epoch": 0.5717642941073526, + "grad_norm": 1.2228091955184937, + "learning_rate": 9.241054820523141e-06, + "loss": 0.3398, + "step": 45740 + }, + { + "epoch": 0.5717892947323683, + "grad_norm": 4.05891752243042, + "learning_rate": 9.240184651510822e-06, + "loss": 1.6046, + "step": 45742 + }, + { + "epoch": 0.5718142953573839, + "grad_norm": 2.1309871673583984, + "learning_rate": 9.239314488285151e-06, + "loss": 0.5955, + "step": 45744 + }, + { + "epoch": 0.5718392959823996, + "grad_norm": 4.026447772979736, + "learning_rate": 9.238444330852751e-06, + "loss": 0.647, + "step": 45746 + }, + { + "epoch": 0.5718642966074152, + "grad_norm": 4.710116386413574, + "learning_rate": 9.237574179220257e-06, + "loss": 1.9177, + "step": 45748 + }, + { + "epoch": 0.5718892972324308, + "grad_norm": 0.017617689445614815, + "learning_rate": 9.236704033394288e-06, + "loss": 0.188, + "step": 45750 + }, + { + "epoch": 0.5719142978574464, + "grad_norm": 0.0006087595247663558, + "learning_rate": 9.235833893381471e-06, + "loss": 0.1533, + "step": 45752 + }, + { + "epoch": 0.571939298482462, + "grad_norm": 0.16488151252269745, + "learning_rate": 9.234963759188435e-06, + "loss": 0.0033, + "step": 45754 + }, + { + "epoch": 0.5719642991074777, + "grad_norm": 4.328823089599609, + "learning_rate": 9.234093630821807e-06, + "loss": 1.2757, + "step": 45756 + }, + { + "epoch": 0.5719892997324934, + "grad_norm": 0.0006292397738434374, + "learning_rate": 9.233223508288213e-06, + "loss": 0.0585, + "step": 45758 + }, + { + "epoch": 0.5720143003575089, + "grad_norm": 3.820951223373413, + "learning_rate": 9.232353391594282e-06, + "loss": 0.7872, + "step": 45760 + }, + { + "epoch": 0.5720393009825245, + "grad_norm": 5.140398025512695, + "learning_rate": 9.231483280746643e-06, + "loss": 1.3392, + "step": 45762 + }, + { + "epoch": 0.5720643016075402, + "grad_norm": 3.466703414916992, + "learning_rate": 9.230613175751913e-06, + "loss": 1.0139, + "step": 45764 + }, + { + "epoch": 0.5720893022325558, + "grad_norm": 3.0654056072235107, + "learning_rate": 9.229743076616725e-06, + "loss": 0.3122, + "step": 45766 + }, + { + "epoch": 0.5721143028575715, + "grad_norm": 0.0008740671328268945, + "learning_rate": 9.228872983347705e-06, + "loss": 1.4455, + "step": 45768 + }, + { + "epoch": 0.572139303482587, + "grad_norm": 5.715202331542969, + "learning_rate": 9.228002895951478e-06, + "loss": 1.7795, + "step": 45770 + }, + { + "epoch": 0.5721643041076027, + "grad_norm": 3.5513107776641846, + "learning_rate": 9.227132814434674e-06, + "loss": 1.0967, + "step": 45772 + }, + { + "epoch": 0.5721893047326183, + "grad_norm": 2.9676034450531006, + "learning_rate": 9.22626273880392e-06, + "loss": 1.4387, + "step": 45774 + }, + { + "epoch": 0.572214305357634, + "grad_norm": 2.694061279296875, + "learning_rate": 9.225392669065836e-06, + "loss": 0.6355, + "step": 45776 + }, + { + "epoch": 0.5722393059826496, + "grad_norm": 4.439696788787842, + "learning_rate": 9.224522605227052e-06, + "loss": 1.3346, + "step": 45778 + }, + { + "epoch": 0.5722643066076651, + "grad_norm": 2.5973010063171387, + "learning_rate": 9.223652547294194e-06, + "loss": 0.5486, + "step": 45780 + }, + { + "epoch": 0.5722893072326808, + "grad_norm": 3.790194034576416, + "learning_rate": 9.222782495273886e-06, + "loss": 1.7541, + "step": 45782 + }, + { + "epoch": 0.5723143078576964, + "grad_norm": 0.3919656276702881, + "learning_rate": 9.221912449172758e-06, + "loss": 0.0425, + "step": 45784 + }, + { + "epoch": 0.5723393084827121, + "grad_norm": 5.12286901473999, + "learning_rate": 9.221042408997436e-06, + "loss": 1.2853, + "step": 45786 + }, + { + "epoch": 0.5723643091077277, + "grad_norm": 5.0643744468688965, + "learning_rate": 9.22017237475455e-06, + "loss": 1.2632, + "step": 45788 + }, + { + "epoch": 0.5723893097327433, + "grad_norm": 3.09375262260437, + "learning_rate": 9.219302346450716e-06, + "loss": 1.808, + "step": 45790 + }, + { + "epoch": 0.5724143103577589, + "grad_norm": 5.733551979064941, + "learning_rate": 9.218432324092565e-06, + "loss": 1.0322, + "step": 45792 + }, + { + "epoch": 0.5724393109827746, + "grad_norm": 1.7850871086120605, + "learning_rate": 9.21756230768672e-06, + "loss": 0.5231, + "step": 45794 + }, + { + "epoch": 0.5724643116077902, + "grad_norm": 5.355424880981445, + "learning_rate": 9.216692297239812e-06, + "loss": 1.8825, + "step": 45796 + }, + { + "epoch": 0.5724893122328059, + "grad_norm": 4.558154106140137, + "learning_rate": 9.215822292758464e-06, + "loss": 1.6271, + "step": 45798 + }, + { + "epoch": 0.5725143128578214, + "grad_norm": 2.9118010997772217, + "learning_rate": 9.214952294249308e-06, + "loss": 0.8673, + "step": 45800 + }, + { + "epoch": 0.572539313482837, + "grad_norm": 1.8757846355438232, + "learning_rate": 9.21408230171896e-06, + "loss": 0.9713, + "step": 45802 + }, + { + "epoch": 0.5725643141078527, + "grad_norm": 3.8828794956207275, + "learning_rate": 9.21321231517405e-06, + "loss": 0.1991, + "step": 45804 + }, + { + "epoch": 0.5725893147328683, + "grad_norm": 7.6924262046813965, + "learning_rate": 9.212342334621204e-06, + "loss": 1.6065, + "step": 45806 + }, + { + "epoch": 0.572614315357884, + "grad_norm": 2.50299334526062, + "learning_rate": 9.211472360067045e-06, + "loss": 0.476, + "step": 45808 + }, + { + "epoch": 0.5726393159828995, + "grad_norm": 0.5408657789230347, + "learning_rate": 9.210602391518201e-06, + "loss": 1.3009, + "step": 45810 + }, + { + "epoch": 0.5726643166079152, + "grad_norm": 4.162564754486084, + "learning_rate": 9.2097324289813e-06, + "loss": 0.9152, + "step": 45812 + }, + { + "epoch": 0.5726893172329308, + "grad_norm": 0.0811023861169815, + "learning_rate": 9.208862472462969e-06, + "loss": 1.0008, + "step": 45814 + }, + { + "epoch": 0.5727143178579465, + "grad_norm": 4.52084493637085, + "learning_rate": 9.207992521969826e-06, + "loss": 1.0928, + "step": 45816 + }, + { + "epoch": 0.5727393184829621, + "grad_norm": 4.515217304229736, + "learning_rate": 9.207122577508499e-06, + "loss": 0.3919, + "step": 45818 + }, + { + "epoch": 0.5727643191079776, + "grad_norm": 0.0007583058904856443, + "learning_rate": 9.206252639085612e-06, + "loss": 0.0001, + "step": 45820 + }, + { + "epoch": 0.5727893197329933, + "grad_norm": 5.149075508117676, + "learning_rate": 9.205382706707795e-06, + "loss": 1.3547, + "step": 45822 + }, + { + "epoch": 0.5728143203580089, + "grad_norm": 2.910688638687134, + "learning_rate": 9.204512780381672e-06, + "loss": 0.7312, + "step": 45824 + }, + { + "epoch": 0.5728393209830246, + "grad_norm": 0.0006982547929510474, + "learning_rate": 9.203642860113864e-06, + "loss": 0.0, + "step": 45826 + }, + { + "epoch": 0.5728643216080402, + "grad_norm": 3.055534601211548, + "learning_rate": 9.202772945911007e-06, + "loss": 0.5306, + "step": 45828 + }, + { + "epoch": 0.5728893222330558, + "grad_norm": 2.8793463706970215, + "learning_rate": 9.201903037779713e-06, + "loss": 0.8298, + "step": 45830 + }, + { + "epoch": 0.5729143228580714, + "grad_norm": 0.0011678390437737107, + "learning_rate": 9.20103313572661e-06, + "loss": 1.6392, + "step": 45832 + }, + { + "epoch": 0.5729393234830871, + "grad_norm": 2.902247190475464, + "learning_rate": 9.200163239758328e-06, + "loss": 1.046, + "step": 45834 + }, + { + "epoch": 0.5729643241081027, + "grad_norm": 4.082268714904785, + "learning_rate": 9.19929334988149e-06, + "loss": 0.7497, + "step": 45836 + }, + { + "epoch": 0.5729893247331184, + "grad_norm": 4.2187347412109375, + "learning_rate": 9.19842346610272e-06, + "loss": 0.9069, + "step": 45838 + }, + { + "epoch": 0.5730143253581339, + "grad_norm": 2.7209277153015137, + "learning_rate": 9.19755358842865e-06, + "loss": 0.7476, + "step": 45840 + }, + { + "epoch": 0.5730393259831496, + "grad_norm": 4.289315700531006, + "learning_rate": 9.196683716865892e-06, + "loss": 1.2705, + "step": 45842 + }, + { + "epoch": 0.5730643266081652, + "grad_norm": 4.763566970825195, + "learning_rate": 9.195813851421076e-06, + "loss": 1.0291, + "step": 45844 + }, + { + "epoch": 0.5730893272331808, + "grad_norm": 4.946986198425293, + "learning_rate": 9.19494399210083e-06, + "loss": 0.4702, + "step": 45846 + }, + { + "epoch": 0.5731143278581965, + "grad_norm": 1.6845933198928833, + "learning_rate": 9.194074138911775e-06, + "loss": 0.0409, + "step": 45848 + }, + { + "epoch": 0.573139328483212, + "grad_norm": 1.718023419380188, + "learning_rate": 9.19320429186054e-06, + "loss": 1.0068, + "step": 45850 + }, + { + "epoch": 0.5731643291082277, + "grad_norm": 5.024692535400391, + "learning_rate": 9.192334450953744e-06, + "loss": 1.568, + "step": 45852 + }, + { + "epoch": 0.5731893297332433, + "grad_norm": 2.7450380325317383, + "learning_rate": 9.19146461619802e-06, + "loss": 1.0865, + "step": 45854 + }, + { + "epoch": 0.573214330358259, + "grad_norm": 4.5674591064453125, + "learning_rate": 9.190594787599983e-06, + "loss": 0.7574, + "step": 45856 + }, + { + "epoch": 0.5732393309832746, + "grad_norm": 3.531928062438965, + "learning_rate": 9.189724965166263e-06, + "loss": 0.9966, + "step": 45858 + }, + { + "epoch": 0.5732643316082902, + "grad_norm": 4.926522731781006, + "learning_rate": 9.188855148903482e-06, + "loss": 0.5048, + "step": 45860 + }, + { + "epoch": 0.5732893322333058, + "grad_norm": 3.2219018936157227, + "learning_rate": 9.187985338818264e-06, + "loss": 0.9799, + "step": 45862 + }, + { + "epoch": 0.5733143328583215, + "grad_norm": 0.0007930250721983612, + "learning_rate": 9.187115534917239e-06, + "loss": 0.3209, + "step": 45864 + }, + { + "epoch": 0.5733393334833371, + "grad_norm": 1.7516344785690308, + "learning_rate": 9.186245737207028e-06, + "loss": 0.2692, + "step": 45866 + }, + { + "epoch": 0.5733643341083527, + "grad_norm": 3.203800678253174, + "learning_rate": 9.185375945694249e-06, + "loss": 0.9626, + "step": 45868 + }, + { + "epoch": 0.5733893347333683, + "grad_norm": 6.081878185272217, + "learning_rate": 9.184506160385534e-06, + "loss": 0.7519, + "step": 45870 + }, + { + "epoch": 0.5734143353583839, + "grad_norm": 0.2894630432128906, + "learning_rate": 9.183636381287504e-06, + "loss": 0.2221, + "step": 45872 + }, + { + "epoch": 0.5734393359833996, + "grad_norm": 3.828756093978882, + "learning_rate": 9.182766608406785e-06, + "loss": 0.6509, + "step": 45874 + }, + { + "epoch": 0.5734643366084152, + "grad_norm": 2.581037759780884, + "learning_rate": 9.18189684175e-06, + "loss": 0.8739, + "step": 45876 + }, + { + "epoch": 0.5734893372334309, + "grad_norm": 1.7626630067825317, + "learning_rate": 9.181027081323772e-06, + "loss": 1.0921, + "step": 45878 + }, + { + "epoch": 0.5735143378584464, + "grad_norm": 3.6991076469421387, + "learning_rate": 9.180157327134729e-06, + "loss": 0.7375, + "step": 45880 + }, + { + "epoch": 0.5735393384834621, + "grad_norm": 1.2604979276657104, + "learning_rate": 9.179287579189487e-06, + "loss": 0.2309, + "step": 45882 + }, + { + "epoch": 0.5735643391084777, + "grad_norm": 2.7130348682403564, + "learning_rate": 9.178417837494677e-06, + "loss": 1.2845, + "step": 45884 + }, + { + "epoch": 0.5735893397334934, + "grad_norm": 4.066656112670898, + "learning_rate": 9.17754810205692e-06, + "loss": 0.7308, + "step": 45886 + }, + { + "epoch": 0.573614340358509, + "grad_norm": 3.024080991744995, + "learning_rate": 9.176678372882841e-06, + "loss": 1.6733, + "step": 45888 + }, + { + "epoch": 0.5736393409835245, + "grad_norm": 2.955289602279663, + "learning_rate": 9.175808649979063e-06, + "loss": 1.176, + "step": 45890 + }, + { + "epoch": 0.5736643416085402, + "grad_norm": 4.6259613037109375, + "learning_rate": 9.174938933352213e-06, + "loss": 1.9553, + "step": 45892 + }, + { + "epoch": 0.5736893422335558, + "grad_norm": 0.00030762405367568135, + "learning_rate": 9.174069223008907e-06, + "loss": 0.4616, + "step": 45894 + }, + { + "epoch": 0.5737143428585715, + "grad_norm": 3.3654069900512695, + "learning_rate": 9.173199518955773e-06, + "loss": 1.2124, + "step": 45896 + }, + { + "epoch": 0.5737393434835871, + "grad_norm": 3.6433255672454834, + "learning_rate": 9.172329821199436e-06, + "loss": 1.5008, + "step": 45898 + }, + { + "epoch": 0.5737643441086027, + "grad_norm": 13.561140060424805, + "learning_rate": 9.171460129746517e-06, + "loss": 1.5748, + "step": 45900 + }, + { + "epoch": 0.5737893447336183, + "grad_norm": 3.515793561935425, + "learning_rate": 9.170590444603643e-06, + "loss": 1.5045, + "step": 45902 + }, + { + "epoch": 0.573814345358634, + "grad_norm": 4.707620143890381, + "learning_rate": 9.169720765777432e-06, + "loss": 0.8085, + "step": 45904 + }, + { + "epoch": 0.5738393459836496, + "grad_norm": 0.39038366079330444, + "learning_rate": 9.168851093274511e-06, + "loss": 0.8238, + "step": 45906 + }, + { + "epoch": 0.5738643466086653, + "grad_norm": 1.8035132884979248, + "learning_rate": 9.167981427101504e-06, + "loss": 0.9276, + "step": 45908 + }, + { + "epoch": 0.5738893472336808, + "grad_norm": 2.984436273574829, + "learning_rate": 9.167111767265033e-06, + "loss": 2.0368, + "step": 45910 + }, + { + "epoch": 0.5739143478586964, + "grad_norm": 5.10531759262085, + "learning_rate": 9.166242113771718e-06, + "loss": 1.3564, + "step": 45912 + }, + { + "epoch": 0.5739393484837121, + "grad_norm": 8.089200973510742, + "learning_rate": 9.165372466628188e-06, + "loss": 0.5161, + "step": 45914 + }, + { + "epoch": 0.5739643491087277, + "grad_norm": 5.620959281921387, + "learning_rate": 9.164502825841061e-06, + "loss": 0.6852, + "step": 45916 + }, + { + "epoch": 0.5739893497337434, + "grad_norm": 5.47632360458374, + "learning_rate": 9.163633191416966e-06, + "loss": 1.912, + "step": 45918 + }, + { + "epoch": 0.5740143503587589, + "grad_norm": 0.995298445224762, + "learning_rate": 9.16276356336252e-06, + "loss": 0.8593, + "step": 45920 + }, + { + "epoch": 0.5740393509837746, + "grad_norm": 0.0009547367808409035, + "learning_rate": 9.161893941684346e-06, + "loss": 0.182, + "step": 45922 + }, + { + "epoch": 0.5740643516087902, + "grad_norm": 3.121284008026123, + "learning_rate": 9.161024326389072e-06, + "loss": 1.0905, + "step": 45924 + }, + { + "epoch": 0.5740893522338059, + "grad_norm": 2.2613468170166016, + "learning_rate": 9.160154717483317e-06, + "loss": 0.4371, + "step": 45926 + }, + { + "epoch": 0.5741143528588215, + "grad_norm": 3.5295474529266357, + "learning_rate": 9.159285114973706e-06, + "loss": 0.7401, + "step": 45928 + }, + { + "epoch": 0.574139353483837, + "grad_norm": 10.374417304992676, + "learning_rate": 9.158415518866859e-06, + "loss": 0.8473, + "step": 45930 + }, + { + "epoch": 0.5741643541088527, + "grad_norm": 3.8821732997894287, + "learning_rate": 9.157545929169403e-06, + "loss": 1.0596, + "step": 45932 + }, + { + "epoch": 0.5741893547338683, + "grad_norm": 4.321761608123779, + "learning_rate": 9.156676345887956e-06, + "loss": 1.2539, + "step": 45934 + }, + { + "epoch": 0.574214355358884, + "grad_norm": 1.4274144172668457, + "learning_rate": 9.155806769029144e-06, + "loss": 0.6224, + "step": 45936 + }, + { + "epoch": 0.5742393559838996, + "grad_norm": 3.7006468772888184, + "learning_rate": 9.154937198599586e-06, + "loss": 1.1172, + "step": 45938 + }, + { + "epoch": 0.5742643566089152, + "grad_norm": 1.998971700668335, + "learning_rate": 9.15406763460591e-06, + "loss": 0.8124, + "step": 45940 + }, + { + "epoch": 0.5742893572339308, + "grad_norm": 5.4073405265808105, + "learning_rate": 9.153198077054731e-06, + "loss": 1.9033, + "step": 45942 + }, + { + "epoch": 0.5743143578589465, + "grad_norm": 4.11429500579834, + "learning_rate": 9.152328525952681e-06, + "loss": 1.9859, + "step": 45944 + }, + { + "epoch": 0.5743393584839621, + "grad_norm": 3.088884115219116, + "learning_rate": 9.151458981306374e-06, + "loss": 0.7339, + "step": 45946 + }, + { + "epoch": 0.5743643591089778, + "grad_norm": 6.4557037353515625, + "learning_rate": 9.150589443122435e-06, + "loss": 0.9922, + "step": 45948 + }, + { + "epoch": 0.5743893597339933, + "grad_norm": 3.711094379425049, + "learning_rate": 9.149719911407487e-06, + "loss": 0.3059, + "step": 45950 + }, + { + "epoch": 0.574414360359009, + "grad_norm": 0.0007021881174296141, + "learning_rate": 9.148850386168151e-06, + "loss": 0.7198, + "step": 45952 + }, + { + "epoch": 0.5744393609840246, + "grad_norm": 1.3799176216125488, + "learning_rate": 9.147980867411052e-06, + "loss": 0.6158, + "step": 45954 + }, + { + "epoch": 0.5744643616090402, + "grad_norm": 3.346971273422241, + "learning_rate": 9.14711135514281e-06, + "loss": 0.9696, + "step": 45956 + }, + { + "epoch": 0.5744893622340559, + "grad_norm": 12.91950798034668, + "learning_rate": 9.14624184937005e-06, + "loss": 1.9689, + "step": 45958 + }, + { + "epoch": 0.5745143628590714, + "grad_norm": 2.715090751647949, + "learning_rate": 9.145372350099387e-06, + "loss": 0.1281, + "step": 45960 + }, + { + "epoch": 0.5745393634840871, + "grad_norm": 0.7680964469909668, + "learning_rate": 9.144502857337448e-06, + "loss": 1.0061, + "step": 45962 + }, + { + "epoch": 0.5745643641091027, + "grad_norm": 0.0007388510857708752, + "learning_rate": 9.143633371090854e-06, + "loss": 0.0172, + "step": 45964 + }, + { + "epoch": 0.5745893647341184, + "grad_norm": 2.1199593544006348, + "learning_rate": 9.142763891366229e-06, + "loss": 0.5332, + "step": 45966 + }, + { + "epoch": 0.574614365359134, + "grad_norm": 2.0129597187042236, + "learning_rate": 9.141894418170191e-06, + "loss": 1.7278, + "step": 45968 + }, + { + "epoch": 0.5746393659841496, + "grad_norm": 0.0004965925472788513, + "learning_rate": 9.141024951509367e-06, + "loss": 0.6233, + "step": 45970 + }, + { + "epoch": 0.5746643666091652, + "grad_norm": 1.6475096940994263, + "learning_rate": 9.140155491390372e-06, + "loss": 0.5935, + "step": 45972 + }, + { + "epoch": 0.5746893672341808, + "grad_norm": 4.97035026550293, + "learning_rate": 9.139286037819834e-06, + "loss": 1.323, + "step": 45974 + }, + { + "epoch": 0.5747143678591965, + "grad_norm": 3.4916248321533203, + "learning_rate": 9.138416590804369e-06, + "loss": 0.4849, + "step": 45976 + }, + { + "epoch": 0.5747393684842121, + "grad_norm": 5.95675802230835, + "learning_rate": 9.137547150350602e-06, + "loss": 2.4934, + "step": 45978 + }, + { + "epoch": 0.5747643691092277, + "grad_norm": 2.4617581367492676, + "learning_rate": 9.136677716465155e-06, + "loss": 0.7835, + "step": 45980 + }, + { + "epoch": 0.5747893697342433, + "grad_norm": 0.0007836467120796442, + "learning_rate": 9.135808289154648e-06, + "loss": 0.9224, + "step": 45982 + }, + { + "epoch": 0.574814370359259, + "grad_norm": 0.16929791867733002, + "learning_rate": 9.134938868425705e-06, + "loss": 1.0079, + "step": 45984 + }, + { + "epoch": 0.5748393709842746, + "grad_norm": 3.0680930614471436, + "learning_rate": 9.134069454284942e-06, + "loss": 1.6445, + "step": 45986 + }, + { + "epoch": 0.5748643716092903, + "grad_norm": 3.7281689643859863, + "learning_rate": 9.133200046738984e-06, + "loss": 1.0431, + "step": 45988 + }, + { + "epoch": 0.5748893722343058, + "grad_norm": 1.623836636543274, + "learning_rate": 9.132330645794452e-06, + "loss": 0.546, + "step": 45990 + }, + { + "epoch": 0.5749143728593215, + "grad_norm": 3.8202402591705322, + "learning_rate": 9.131461251457969e-06, + "loss": 0.4569, + "step": 45992 + }, + { + "epoch": 0.5749393734843371, + "grad_norm": 0.021323829889297485, + "learning_rate": 9.130591863736153e-06, + "loss": 0.8415, + "step": 45994 + }, + { + "epoch": 0.5749643741093527, + "grad_norm": 3.5777316093444824, + "learning_rate": 9.129722482635628e-06, + "loss": 1.3782, + "step": 45996 + }, + { + "epoch": 0.5749893747343684, + "grad_norm": 0.005323279649019241, + "learning_rate": 9.12885310816301e-06, + "loss": 0.1809, + "step": 45998 + }, + { + "epoch": 0.5750143753593839, + "grad_norm": 0.0003110007382929325, + "learning_rate": 9.127983740324926e-06, + "loss": 0.0, + "step": 46000 + }, + { + "epoch": 0.5750393759843996, + "grad_norm": 3.764676809310913, + "learning_rate": 9.127114379127994e-06, + "loss": 0.5714, + "step": 46002 + }, + { + "epoch": 0.5750643766094152, + "grad_norm": 0.9022624492645264, + "learning_rate": 9.126245024578835e-06, + "loss": 0.4729, + "step": 46004 + }, + { + "epoch": 0.5750893772344309, + "grad_norm": 4.732720851898193, + "learning_rate": 9.12537567668407e-06, + "loss": 0.8633, + "step": 46006 + }, + { + "epoch": 0.5751143778594465, + "grad_norm": 2.263556480407715, + "learning_rate": 9.124506335450321e-06, + "loss": 1.1663, + "step": 46008 + }, + { + "epoch": 0.5751393784844621, + "grad_norm": 3.468903064727783, + "learning_rate": 9.12363700088421e-06, + "loss": 0.9045, + "step": 46010 + }, + { + "epoch": 0.5751643791094777, + "grad_norm": 2.9969379901885986, + "learning_rate": 9.122767672992354e-06, + "loss": 0.9238, + "step": 46012 + }, + { + "epoch": 0.5751893797344934, + "grad_norm": 5.425451278686523, + "learning_rate": 9.121898351781373e-06, + "loss": 2.2358, + "step": 46014 + }, + { + "epoch": 0.575214380359509, + "grad_norm": 3.182957649230957, + "learning_rate": 9.121029037257892e-06, + "loss": 0.6603, + "step": 46016 + }, + { + "epoch": 0.5752393809845247, + "grad_norm": 1.5118399858474731, + "learning_rate": 9.120159729428529e-06, + "loss": 0.6971, + "step": 46018 + }, + { + "epoch": 0.5752643816095402, + "grad_norm": 0.00048165780026465654, + "learning_rate": 9.119290428299904e-06, + "loss": 0.727, + "step": 46020 + }, + { + "epoch": 0.5752893822345558, + "grad_norm": 1.9274400472640991, + "learning_rate": 9.118421133878643e-06, + "loss": 0.5704, + "step": 46022 + }, + { + "epoch": 0.5753143828595715, + "grad_norm": 0.0005055288202129304, + "learning_rate": 9.11755184617136e-06, + "loss": 0.0, + "step": 46024 + }, + { + "epoch": 0.5753393834845871, + "grad_norm": 4.821473598480225, + "learning_rate": 9.116682565184677e-06, + "loss": 1.8375, + "step": 46026 + }, + { + "epoch": 0.5753643841096028, + "grad_norm": 3.4543912410736084, + "learning_rate": 9.115813290925211e-06, + "loss": 0.9095, + "step": 46028 + }, + { + "epoch": 0.5753893847346183, + "grad_norm": 4.17938232421875, + "learning_rate": 9.11494402339959e-06, + "loss": 1.46, + "step": 46030 + }, + { + "epoch": 0.575414385359634, + "grad_norm": 6.085442066192627, + "learning_rate": 9.114074762614429e-06, + "loss": 0.6468, + "step": 46032 + }, + { + "epoch": 0.5754393859846496, + "grad_norm": 0.18293462693691254, + "learning_rate": 9.113205508576349e-06, + "loss": 0.1403, + "step": 46034 + }, + { + "epoch": 0.5754643866096653, + "grad_norm": 5.95395040512085, + "learning_rate": 9.112336261291972e-06, + "loss": 1.2654, + "step": 46036 + }, + { + "epoch": 0.5754893872346809, + "grad_norm": 0.924598753452301, + "learning_rate": 9.111467020767915e-06, + "loss": 0.3097, + "step": 46038 + }, + { + "epoch": 0.5755143878596964, + "grad_norm": 4.154184341430664, + "learning_rate": 9.110597787010799e-06, + "loss": 0.2255, + "step": 46040 + }, + { + "epoch": 0.5755393884847121, + "grad_norm": 2.9689083099365234, + "learning_rate": 9.109728560027246e-06, + "loss": 1.0476, + "step": 46042 + }, + { + "epoch": 0.5755643891097277, + "grad_norm": 12.038630485534668, + "learning_rate": 9.108859339823875e-06, + "loss": 0.6163, + "step": 46044 + }, + { + "epoch": 0.5755893897347434, + "grad_norm": 3.692809820175171, + "learning_rate": 9.107990126407302e-06, + "loss": 1.5426, + "step": 46046 + }, + { + "epoch": 0.575614390359759, + "grad_norm": 7.718082904815674, + "learning_rate": 9.107120919784154e-06, + "loss": 1.413, + "step": 46048 + }, + { + "epoch": 0.5756393909847746, + "grad_norm": 0.0119176609441638, + "learning_rate": 9.106251719961046e-06, + "loss": 0.4324, + "step": 46050 + }, + { + "epoch": 0.5756643916097902, + "grad_norm": 2.560241937637329, + "learning_rate": 9.105382526944597e-06, + "loss": 0.5118, + "step": 46052 + }, + { + "epoch": 0.5756893922348059, + "grad_norm": 3.4112343788146973, + "learning_rate": 9.104513340741427e-06, + "loss": 0.897, + "step": 46054 + }, + { + "epoch": 0.5757143928598215, + "grad_norm": 0.048960961401462555, + "learning_rate": 9.10364416135816e-06, + "loss": 0.5973, + "step": 46056 + }, + { + "epoch": 0.5757393934848372, + "grad_norm": 5.668576240539551, + "learning_rate": 9.10277498880141e-06, + "loss": 1.1692, + "step": 46058 + }, + { + "epoch": 0.5757643941098527, + "grad_norm": 8.344095230102539, + "learning_rate": 9.101905823077799e-06, + "loss": 0.673, + "step": 46060 + }, + { + "epoch": 0.5757893947348683, + "grad_norm": 1.5880932807922363, + "learning_rate": 9.101036664193948e-06, + "loss": 0.062, + "step": 46062 + }, + { + "epoch": 0.575814395359884, + "grad_norm": 2.8780455589294434, + "learning_rate": 9.100167512156474e-06, + "loss": 0.8954, + "step": 46064 + }, + { + "epoch": 0.5758393959848996, + "grad_norm": 3.117772340774536, + "learning_rate": 9.099298366971995e-06, + "loss": 0.2653, + "step": 46066 + }, + { + "epoch": 0.5758643966099153, + "grad_norm": 5.513684272766113, + "learning_rate": 9.098429228647134e-06, + "loss": 1.4631, + "step": 46068 + }, + { + "epoch": 0.5758893972349308, + "grad_norm": 5.694369792938232, + "learning_rate": 9.097560097188508e-06, + "loss": 1.2475, + "step": 46070 + }, + { + "epoch": 0.5759143978599465, + "grad_norm": 6.033134460449219, + "learning_rate": 9.096690972602737e-06, + "loss": 1.4681, + "step": 46072 + }, + { + "epoch": 0.5759393984849621, + "grad_norm": 0.0004888419061899185, + "learning_rate": 9.095821854896441e-06, + "loss": 0.324, + "step": 46074 + }, + { + "epoch": 0.5759643991099778, + "grad_norm": 7.636178493499756, + "learning_rate": 9.094952744076237e-06, + "loss": 2.9126, + "step": 46076 + }, + { + "epoch": 0.5759893997349934, + "grad_norm": 3.503401517868042, + "learning_rate": 9.094083640148743e-06, + "loss": 0.4625, + "step": 46078 + }, + { + "epoch": 0.576014400360009, + "grad_norm": 5.744427680969238, + "learning_rate": 9.093214543120581e-06, + "loss": 0.9823, + "step": 46080 + }, + { + "epoch": 0.5760394009850246, + "grad_norm": 0.000540505803655833, + "learning_rate": 9.09234545299837e-06, + "loss": 0.339, + "step": 46082 + }, + { + "epoch": 0.5760644016100402, + "grad_norm": 9.715885162353516, + "learning_rate": 9.091476369788725e-06, + "loss": 1.9791, + "step": 46084 + }, + { + "epoch": 0.5760894022350559, + "grad_norm": 1.6933231353759766, + "learning_rate": 9.09060729349827e-06, + "loss": 0.5264, + "step": 46086 + }, + { + "epoch": 0.5761144028600715, + "grad_norm": 5.110794544219971, + "learning_rate": 9.089738224133621e-06, + "loss": 1.8608, + "step": 46088 + }, + { + "epoch": 0.5761394034850871, + "grad_norm": 4.632943153381348, + "learning_rate": 9.088869161701397e-06, + "loss": 0.6852, + "step": 46090 + }, + { + "epoch": 0.5761644041101027, + "grad_norm": 15.57904052734375, + "learning_rate": 9.088000106208216e-06, + "loss": 1.2789, + "step": 46092 + }, + { + "epoch": 0.5761894047351184, + "grad_norm": 2.50361704826355, + "learning_rate": 9.087131057660697e-06, + "loss": 0.6978, + "step": 46094 + }, + { + "epoch": 0.576214405360134, + "grad_norm": 3.7951836585998535, + "learning_rate": 9.08626201606546e-06, + "loss": 1.0085, + "step": 46096 + }, + { + "epoch": 0.5762394059851497, + "grad_norm": 2.659409761428833, + "learning_rate": 9.08539298142912e-06, + "loss": 0.9653, + "step": 46098 + }, + { + "epoch": 0.5762644066101652, + "grad_norm": 3.7209460735321045, + "learning_rate": 9.084523953758296e-06, + "loss": 0.9503, + "step": 46100 + }, + { + "epoch": 0.5762894072351809, + "grad_norm": 2.945377826690674, + "learning_rate": 9.083654933059616e-06, + "loss": 1.1101, + "step": 46102 + }, + { + "epoch": 0.5763144078601965, + "grad_norm": 0.0006373003125190735, + "learning_rate": 9.082785919339686e-06, + "loss": 0.6618, + "step": 46104 + }, + { + "epoch": 0.5763394084852121, + "grad_norm": 2.92944073677063, + "learning_rate": 9.081916912605128e-06, + "loss": 0.9881, + "step": 46106 + }, + { + "epoch": 0.5763644091102278, + "grad_norm": 2.594097137451172, + "learning_rate": 9.081047912862561e-06, + "loss": 0.5902, + "step": 46108 + }, + { + "epoch": 0.5763894097352433, + "grad_norm": 2.7624778747558594, + "learning_rate": 9.080178920118604e-06, + "loss": 0.2389, + "step": 46110 + }, + { + "epoch": 0.576414410360259, + "grad_norm": 9.017291069030762, + "learning_rate": 9.07930993437987e-06, + "loss": 2.0472, + "step": 46112 + }, + { + "epoch": 0.5764394109852746, + "grad_norm": 1.9181687831878662, + "learning_rate": 9.07844095565299e-06, + "loss": 0.65, + "step": 46114 + }, + { + "epoch": 0.5764644116102903, + "grad_norm": 2.9738378524780273, + "learning_rate": 9.077571983944568e-06, + "loss": 1.5988, + "step": 46116 + }, + { + "epoch": 0.5764894122353059, + "grad_norm": 0.04335782304406166, + "learning_rate": 9.076703019261227e-06, + "loss": 0.1923, + "step": 46118 + }, + { + "epoch": 0.5765144128603215, + "grad_norm": 7.690958499908447, + "learning_rate": 9.075834061609585e-06, + "loss": 0.3592, + "step": 46120 + }, + { + "epoch": 0.5765394134853371, + "grad_norm": 3.5320045948028564, + "learning_rate": 9.074965110996261e-06, + "loss": 0.5629, + "step": 46122 + }, + { + "epoch": 0.5765644141103528, + "grad_norm": 9.782646179199219, + "learning_rate": 9.07409616742787e-06, + "loss": 0.9831, + "step": 46124 + }, + { + "epoch": 0.5765894147353684, + "grad_norm": 4.09043550491333, + "learning_rate": 9.073227230911034e-06, + "loss": 1.7307, + "step": 46126 + }, + { + "epoch": 0.576614415360384, + "grad_norm": 7.550250053405762, + "learning_rate": 9.072358301452371e-06, + "loss": 0.4192, + "step": 46128 + }, + { + "epoch": 0.5766394159853996, + "grad_norm": 0.008061959408223629, + "learning_rate": 9.071489379058494e-06, + "loss": 0.0034, + "step": 46130 + }, + { + "epoch": 0.5766644166104152, + "grad_norm": 6.6656622886657715, + "learning_rate": 9.070620463736021e-06, + "loss": 1.1079, + "step": 46132 + }, + { + "epoch": 0.5766894172354309, + "grad_norm": 1.4144113063812256, + "learning_rate": 9.069751555491571e-06, + "loss": 0.7113, + "step": 46134 + }, + { + "epoch": 0.5767144178604465, + "grad_norm": 0.010341256856918335, + "learning_rate": 9.068882654331762e-06, + "loss": 0.2332, + "step": 46136 + }, + { + "epoch": 0.5767394184854622, + "grad_norm": 3.395155906677246, + "learning_rate": 9.06801376026321e-06, + "loss": 1.1618, + "step": 46138 + }, + { + "epoch": 0.5767644191104777, + "grad_norm": 2.2973451614379883, + "learning_rate": 9.067144873292538e-06, + "loss": 0.2505, + "step": 46140 + }, + { + "epoch": 0.5767894197354934, + "grad_norm": 2.5545547008514404, + "learning_rate": 9.066275993426356e-06, + "loss": 0.3903, + "step": 46142 + }, + { + "epoch": 0.576814420360509, + "grad_norm": 3.522753953933716, + "learning_rate": 9.065407120671282e-06, + "loss": 0.5316, + "step": 46144 + }, + { + "epoch": 0.5768394209855247, + "grad_norm": 2.4611527919769287, + "learning_rate": 9.064538255033936e-06, + "loss": 1.9384, + "step": 46146 + }, + { + "epoch": 0.5768644216105403, + "grad_norm": 0.004799129441380501, + "learning_rate": 9.063669396520935e-06, + "loss": 0.2005, + "step": 46148 + }, + { + "epoch": 0.5768894222355558, + "grad_norm": 0.5459058284759521, + "learning_rate": 9.062800545138894e-06, + "loss": 0.7632, + "step": 46150 + }, + { + "epoch": 0.5769144228605715, + "grad_norm": 6.3418378829956055, + "learning_rate": 9.061931700894433e-06, + "loss": 1.4336, + "step": 46152 + }, + { + "epoch": 0.5769394234855871, + "grad_norm": 3.106822967529297, + "learning_rate": 9.06106286379417e-06, + "loss": 1.383, + "step": 46154 + }, + { + "epoch": 0.5769644241106028, + "grad_norm": 1.8664566278457642, + "learning_rate": 9.060194033844718e-06, + "loss": 0.7732, + "step": 46156 + }, + { + "epoch": 0.5769894247356184, + "grad_norm": 3.609147548675537, + "learning_rate": 9.059325211052694e-06, + "loss": 0.6279, + "step": 46158 + }, + { + "epoch": 0.577014425360634, + "grad_norm": 2.283151865005493, + "learning_rate": 9.058456395424717e-06, + "loss": 0.841, + "step": 46160 + }, + { + "epoch": 0.5770394259856496, + "grad_norm": 3.7228636741638184, + "learning_rate": 9.0575875869674e-06, + "loss": 1.5655, + "step": 46162 + }, + { + "epoch": 0.5770644266106653, + "grad_norm": 3.065528631210327, + "learning_rate": 9.056718785687364e-06, + "loss": 1.6904, + "step": 46164 + }, + { + "epoch": 0.5770894272356809, + "grad_norm": 4.380735874176025, + "learning_rate": 9.05584999159123e-06, + "loss": 0.4821, + "step": 46166 + }, + { + "epoch": 0.5771144278606966, + "grad_norm": 4.9642767906188965, + "learning_rate": 9.054981204685605e-06, + "loss": 1.6635, + "step": 46168 + }, + { + "epoch": 0.5771394284857121, + "grad_norm": 0.006355591118335724, + "learning_rate": 9.054112424977109e-06, + "loss": 0.0001, + "step": 46170 + }, + { + "epoch": 0.5771644291107277, + "grad_norm": 6.131551742553711, + "learning_rate": 9.053243652472358e-06, + "loss": 1.2723, + "step": 46172 + }, + { + "epoch": 0.5771894297357434, + "grad_norm": 2.683133840560913, + "learning_rate": 9.052374887177968e-06, + "loss": 0.4281, + "step": 46174 + }, + { + "epoch": 0.577214430360759, + "grad_norm": 2.116184711456299, + "learning_rate": 9.051506129100558e-06, + "loss": 0.6211, + "step": 46176 + }, + { + "epoch": 0.5772394309857747, + "grad_norm": 4.150813579559326, + "learning_rate": 9.050637378246744e-06, + "loss": 1.0678, + "step": 46178 + }, + { + "epoch": 0.5772644316107902, + "grad_norm": 4.108298301696777, + "learning_rate": 9.049768634623147e-06, + "loss": 1.5713, + "step": 46180 + }, + { + "epoch": 0.5772894322358059, + "grad_norm": 2.2522969245910645, + "learning_rate": 9.048899898236371e-06, + "loss": 0.4631, + "step": 46182 + }, + { + "epoch": 0.5773144328608215, + "grad_norm": 0.011816078796982765, + "learning_rate": 9.048031169093042e-06, + "loss": 0.5539, + "step": 46184 + }, + { + "epoch": 0.5773394334858372, + "grad_norm": 3.2273519039154053, + "learning_rate": 9.047162447199769e-06, + "loss": 1.4979, + "step": 46186 + }, + { + "epoch": 0.5773644341108528, + "grad_norm": 0.006605022586882114, + "learning_rate": 9.046293732563174e-06, + "loss": 0.8041, + "step": 46188 + }, + { + "epoch": 0.5773894347358683, + "grad_norm": 2.7793333530426025, + "learning_rate": 9.045425025189872e-06, + "loss": 0.6275, + "step": 46190 + }, + { + "epoch": 0.577414435360884, + "grad_norm": 0.01087573729455471, + "learning_rate": 9.04455632508648e-06, + "loss": 1.0565, + "step": 46192 + }, + { + "epoch": 0.5774394359858996, + "grad_norm": 2.5979247093200684, + "learning_rate": 9.043687632259611e-06, + "loss": 0.2436, + "step": 46194 + }, + { + "epoch": 0.5774644366109153, + "grad_norm": 4.082379341125488, + "learning_rate": 9.04281894671588e-06, + "loss": 0.8836, + "step": 46196 + }, + { + "epoch": 0.5774894372359309, + "grad_norm": 0.022392839193344116, + "learning_rate": 9.041950268461903e-06, + "loss": 0.4225, + "step": 46198 + }, + { + "epoch": 0.5775144378609465, + "grad_norm": 0.012419315986335278, + "learning_rate": 9.041081597504297e-06, + "loss": 0.0001, + "step": 46200 + }, + { + "epoch": 0.5775394384859621, + "grad_norm": 5.445501327514648, + "learning_rate": 9.04021293384968e-06, + "loss": 0.674, + "step": 46202 + }, + { + "epoch": 0.5775644391109778, + "grad_norm": 2.9055562019348145, + "learning_rate": 9.039344277504665e-06, + "loss": 1.6955, + "step": 46204 + }, + { + "epoch": 0.5775894397359934, + "grad_norm": 2.027531147003174, + "learning_rate": 9.038475628475873e-06, + "loss": 0.6876, + "step": 46206 + }, + { + "epoch": 0.5776144403610091, + "grad_norm": 5.948375701904297, + "learning_rate": 9.037606986769912e-06, + "loss": 1.5099, + "step": 46208 + }, + { + "epoch": 0.5776394409860246, + "grad_norm": 4.393352508544922, + "learning_rate": 9.036738352393397e-06, + "loss": 1.2121, + "step": 46210 + }, + { + "epoch": 0.5776644416110402, + "grad_norm": 0.00584762915968895, + "learning_rate": 9.035869725352945e-06, + "loss": 0.7839, + "step": 46212 + }, + { + "epoch": 0.5776894422360559, + "grad_norm": 14.725744247436523, + "learning_rate": 9.035001105655177e-06, + "loss": 2.4131, + "step": 46214 + }, + { + "epoch": 0.5777144428610715, + "grad_norm": 1.5281039476394653, + "learning_rate": 9.034132493306703e-06, + "loss": 0.0393, + "step": 46216 + }, + { + "epoch": 0.5777394434860872, + "grad_norm": 4.500452041625977, + "learning_rate": 9.033263888314142e-06, + "loss": 0.2518, + "step": 46218 + }, + { + "epoch": 0.5777644441111027, + "grad_norm": 3.880934000015259, + "learning_rate": 9.032395290684104e-06, + "loss": 0.9097, + "step": 46220 + }, + { + "epoch": 0.5777894447361184, + "grad_norm": 0.6556220650672913, + "learning_rate": 9.031526700423205e-06, + "loss": 0.3805, + "step": 46222 + }, + { + "epoch": 0.577814445361134, + "grad_norm": 0.0036923158913850784, + "learning_rate": 9.030658117538061e-06, + "loss": 0.1886, + "step": 46224 + }, + { + "epoch": 0.5778394459861497, + "grad_norm": 0.004410149063915014, + "learning_rate": 9.02978954203529e-06, + "loss": 0.5636, + "step": 46226 + }, + { + "epoch": 0.5778644466111653, + "grad_norm": 1.701521396636963, + "learning_rate": 9.028920973921504e-06, + "loss": 0.7616, + "step": 46228 + }, + { + "epoch": 0.5778894472361809, + "grad_norm": 5.693814277648926, + "learning_rate": 9.028052413203318e-06, + "loss": 1.4949, + "step": 46230 + }, + { + "epoch": 0.5779144478611965, + "grad_norm": 1.09416925907135, + "learning_rate": 9.027183859887354e-06, + "loss": 0.6964, + "step": 46232 + }, + { + "epoch": 0.5779394484862121, + "grad_norm": 2.6530961990356445, + "learning_rate": 9.026315313980214e-06, + "loss": 1.8665, + "step": 46234 + }, + { + "epoch": 0.5779644491112278, + "grad_norm": 3.1311824321746826, + "learning_rate": 9.025446775488519e-06, + "loss": 0.472, + "step": 46236 + }, + { + "epoch": 0.5779894497362434, + "grad_norm": 0.002768628066405654, + "learning_rate": 9.024578244418882e-06, + "loss": 0.1802, + "step": 46238 + }, + { + "epoch": 0.578014450361259, + "grad_norm": 2.088164806365967, + "learning_rate": 9.02370972077792e-06, + "loss": 1.326, + "step": 46240 + }, + { + "epoch": 0.5780394509862746, + "grad_norm": 4.403123378753662, + "learning_rate": 9.022841204572248e-06, + "loss": 1.5237, + "step": 46242 + }, + { + "epoch": 0.5780644516112903, + "grad_norm": 3.9302890300750732, + "learning_rate": 9.021972695808484e-06, + "loss": 1.129, + "step": 46244 + }, + { + "epoch": 0.5780894522363059, + "grad_norm": 2.7455177307128906, + "learning_rate": 9.021104194493232e-06, + "loss": 0.1781, + "step": 46246 + }, + { + "epoch": 0.5781144528613216, + "grad_norm": 4.131152153015137, + "learning_rate": 9.02023570063311e-06, + "loss": 0.5851, + "step": 46248 + }, + { + "epoch": 0.5781394534863371, + "grad_norm": 6.12150239944458, + "learning_rate": 9.019367214234736e-06, + "loss": 0.9634, + "step": 46250 + }, + { + "epoch": 0.5781644541113528, + "grad_norm": 2.1149420738220215, + "learning_rate": 9.018498735304725e-06, + "loss": 0.9834, + "step": 46252 + }, + { + "epoch": 0.5781894547363684, + "grad_norm": 0.020483989268541336, + "learning_rate": 9.017630263849688e-06, + "loss": 0.1569, + "step": 46254 + }, + { + "epoch": 0.578214455361384, + "grad_norm": 2.315617084503174, + "learning_rate": 9.016761799876238e-06, + "loss": 1.1566, + "step": 46256 + }, + { + "epoch": 0.5782394559863997, + "grad_norm": 3.6798901557922363, + "learning_rate": 9.015893343390996e-06, + "loss": 0.9515, + "step": 46258 + }, + { + "epoch": 0.5782644566114152, + "grad_norm": 1.6620386838912964, + "learning_rate": 9.015024894400566e-06, + "loss": 0.2576, + "step": 46260 + }, + { + "epoch": 0.5782894572364309, + "grad_norm": 4.456686973571777, + "learning_rate": 9.014156452911568e-06, + "loss": 1.0687, + "step": 46262 + }, + { + "epoch": 0.5783144578614465, + "grad_norm": 0.006229347549378872, + "learning_rate": 9.013288018930617e-06, + "loss": 1.0703, + "step": 46264 + }, + { + "epoch": 0.5783394584864622, + "grad_norm": 1.563629388809204, + "learning_rate": 9.012419592464323e-06, + "loss": 0.1199, + "step": 46266 + }, + { + "epoch": 0.5783644591114778, + "grad_norm": 1.9509340524673462, + "learning_rate": 9.011551173519302e-06, + "loss": 0.916, + "step": 46268 + }, + { + "epoch": 0.5783894597364934, + "grad_norm": 4.265501022338867, + "learning_rate": 9.010682762102172e-06, + "loss": 1.6827, + "step": 46270 + }, + { + "epoch": 0.578414460361509, + "grad_norm": 7.301548480987549, + "learning_rate": 9.009814358219539e-06, + "loss": 0.8829, + "step": 46272 + }, + { + "epoch": 0.5784394609865247, + "grad_norm": 4.602953910827637, + "learning_rate": 9.008945961878017e-06, + "loss": 0.8858, + "step": 46274 + }, + { + "epoch": 0.5784644616115403, + "grad_norm": 3.931786060333252, + "learning_rate": 9.008077573084223e-06, + "loss": 1.3546, + "step": 46276 + }, + { + "epoch": 0.578489462236556, + "grad_norm": 4.269046783447266, + "learning_rate": 9.00720919184477e-06, + "loss": 0.7203, + "step": 46278 + }, + { + "epoch": 0.5785144628615715, + "grad_norm": 3.9595000743865967, + "learning_rate": 9.006340818166274e-06, + "loss": 1.1648, + "step": 46280 + }, + { + "epoch": 0.5785394634865871, + "grad_norm": 3.517866849899292, + "learning_rate": 9.005472452055343e-06, + "loss": 0.9092, + "step": 46282 + }, + { + "epoch": 0.5785644641116028, + "grad_norm": 5.520146369934082, + "learning_rate": 9.0046040935186e-06, + "loss": 0.2527, + "step": 46284 + }, + { + "epoch": 0.5785894647366184, + "grad_norm": 3.1772098541259766, + "learning_rate": 9.003735742562643e-06, + "loss": 0.4918, + "step": 46286 + }, + { + "epoch": 0.5786144653616341, + "grad_norm": 0.6834800839424133, + "learning_rate": 9.002867399194097e-06, + "loss": 0.6888, + "step": 46288 + }, + { + "epoch": 0.5786394659866496, + "grad_norm": 4.389501094818115, + "learning_rate": 9.001999063419573e-06, + "loss": 2.0683, + "step": 46290 + }, + { + "epoch": 0.5786644666116653, + "grad_norm": 1.8985944986343384, + "learning_rate": 9.00113073524568e-06, + "loss": 0.9186, + "step": 46292 + }, + { + "epoch": 0.5786894672366809, + "grad_norm": 4.1425018310546875, + "learning_rate": 9.000262414679037e-06, + "loss": 0.8608, + "step": 46294 + }, + { + "epoch": 0.5787144678616966, + "grad_norm": 4.291344165802002, + "learning_rate": 8.999394101726256e-06, + "loss": 1.4351, + "step": 46296 + }, + { + "epoch": 0.5787394684867122, + "grad_norm": 3.3024072647094727, + "learning_rate": 8.998525796393942e-06, + "loss": 1.6132, + "step": 46298 + }, + { + "epoch": 0.5787644691117277, + "grad_norm": 0.7887956500053406, + "learning_rate": 8.997657498688717e-06, + "loss": 0.3526, + "step": 46300 + }, + { + "epoch": 0.5787894697367434, + "grad_norm": 3.350151538848877, + "learning_rate": 8.996789208617191e-06, + "loss": 0.4412, + "step": 46302 + }, + { + "epoch": 0.578814470361759, + "grad_norm": 4.085829257965088, + "learning_rate": 8.995920926185975e-06, + "loss": 1.5723, + "step": 46304 + }, + { + "epoch": 0.5788394709867747, + "grad_norm": 0.007081371732056141, + "learning_rate": 8.995052651401684e-06, + "loss": 0.7373, + "step": 46306 + }, + { + "epoch": 0.5788644716117903, + "grad_norm": 3.532893419265747, + "learning_rate": 8.99418438427093e-06, + "loss": 0.7065, + "step": 46308 + }, + { + "epoch": 0.5788894722368059, + "grad_norm": 5.9567975997924805, + "learning_rate": 8.993316124800328e-06, + "loss": 1.0924, + "step": 46310 + }, + { + "epoch": 0.5789144728618215, + "grad_norm": 3.287304639816284, + "learning_rate": 8.992447872996486e-06, + "loss": 0.4829, + "step": 46312 + }, + { + "epoch": 0.5789394734868372, + "grad_norm": 1.2512232065200806, + "learning_rate": 8.991579628866018e-06, + "loss": 0.4214, + "step": 46314 + }, + { + "epoch": 0.5789644741118528, + "grad_norm": 4.107693672180176, + "learning_rate": 8.990711392415537e-06, + "loss": 1.3544, + "step": 46316 + }, + { + "epoch": 0.5789894747368685, + "grad_norm": 5.671206951141357, + "learning_rate": 8.989843163651655e-06, + "loss": 1.5932, + "step": 46318 + }, + { + "epoch": 0.579014475361884, + "grad_norm": 3.7627978324890137, + "learning_rate": 8.988974942580985e-06, + "loss": 0.0907, + "step": 46320 + }, + { + "epoch": 0.5790394759868996, + "grad_norm": 0.007994459010660648, + "learning_rate": 8.98810672921014e-06, + "loss": 0.5946, + "step": 46322 + }, + { + "epoch": 0.5790644766119153, + "grad_norm": 3.170011043548584, + "learning_rate": 8.98723852354573e-06, + "loss": 0.5948, + "step": 46324 + }, + { + "epoch": 0.5790894772369309, + "grad_norm": 0.21849171817302704, + "learning_rate": 8.98637032559437e-06, + "loss": 0.6459, + "step": 46326 + }, + { + "epoch": 0.5791144778619466, + "grad_norm": 6.737165451049805, + "learning_rate": 8.985502135362668e-06, + "loss": 2.3609, + "step": 46328 + }, + { + "epoch": 0.5791394784869621, + "grad_norm": 3.8213155269622803, + "learning_rate": 8.984633952857238e-06, + "loss": 1.0939, + "step": 46330 + }, + { + "epoch": 0.5791644791119778, + "grad_norm": 5.910691261291504, + "learning_rate": 8.983765778084692e-06, + "loss": 1.0494, + "step": 46332 + }, + { + "epoch": 0.5791894797369934, + "grad_norm": 3.2695248126983643, + "learning_rate": 8.982897611051644e-06, + "loss": 0.746, + "step": 46334 + }, + { + "epoch": 0.5792144803620091, + "grad_norm": 2.2200427055358887, + "learning_rate": 8.982029451764705e-06, + "loss": 0.564, + "step": 46336 + }, + { + "epoch": 0.5792394809870247, + "grad_norm": 2.846856117248535, + "learning_rate": 8.981161300230484e-06, + "loss": 0.7931, + "step": 46338 + }, + { + "epoch": 0.5792644816120402, + "grad_norm": 7.002364635467529, + "learning_rate": 8.980293156455593e-06, + "loss": 1.6818, + "step": 46340 + }, + { + "epoch": 0.5792894822370559, + "grad_norm": 3.162458896636963, + "learning_rate": 8.979425020446646e-06, + "loss": 0.6102, + "step": 46342 + }, + { + "epoch": 0.5793144828620715, + "grad_norm": 4.607024192810059, + "learning_rate": 8.978556892210255e-06, + "loss": 0.9601, + "step": 46344 + }, + { + "epoch": 0.5793394834870872, + "grad_norm": 8.34295654296875, + "learning_rate": 8.977688771753029e-06, + "loss": 0.3474, + "step": 46346 + }, + { + "epoch": 0.5793644841121028, + "grad_norm": 3.945173740386963, + "learning_rate": 8.976820659081581e-06, + "loss": 1.0771, + "step": 46348 + }, + { + "epoch": 0.5793894847371184, + "grad_norm": 3.854374885559082, + "learning_rate": 8.975952554202524e-06, + "loss": 1.1716, + "step": 46350 + }, + { + "epoch": 0.579414485362134, + "grad_norm": 5.688755035400391, + "learning_rate": 8.975084457122466e-06, + "loss": 2.2856, + "step": 46352 + }, + { + "epoch": 0.5794394859871497, + "grad_norm": 3.937723159790039, + "learning_rate": 8.974216367848018e-06, + "loss": 0.7521, + "step": 46354 + }, + { + "epoch": 0.5794644866121653, + "grad_norm": 2.891183853149414, + "learning_rate": 8.973348286385796e-06, + "loss": 1.071, + "step": 46356 + }, + { + "epoch": 0.579489487237181, + "grad_norm": 3.6122488975524902, + "learning_rate": 8.972480212742406e-06, + "loss": 1.2658, + "step": 46358 + }, + { + "epoch": 0.5795144878621965, + "grad_norm": 7.872437477111816, + "learning_rate": 8.971612146924462e-06, + "loss": 0.7581, + "step": 46360 + }, + { + "epoch": 0.5795394884872122, + "grad_norm": 2.8671481609344482, + "learning_rate": 8.970744088938577e-06, + "loss": 0.9265, + "step": 46362 + }, + { + "epoch": 0.5795644891122278, + "grad_norm": 2.9616246223449707, + "learning_rate": 8.969876038791357e-06, + "loss": 1.5072, + "step": 46364 + }, + { + "epoch": 0.5795894897372434, + "grad_norm": 3.6392664909362793, + "learning_rate": 8.969007996489416e-06, + "loss": 0.9087, + "step": 46366 + }, + { + "epoch": 0.5796144903622591, + "grad_norm": 2.3298227787017822, + "learning_rate": 8.968139962039364e-06, + "loss": 0.1674, + "step": 46368 + }, + { + "epoch": 0.5796394909872746, + "grad_norm": 2.2992820739746094, + "learning_rate": 8.967271935447812e-06, + "loss": 1.2507, + "step": 46370 + }, + { + "epoch": 0.5796644916122903, + "grad_norm": 4.231780529022217, + "learning_rate": 8.966403916721371e-06, + "loss": 1.002, + "step": 46372 + }, + { + "epoch": 0.5796894922373059, + "grad_norm": 3.4261744022369385, + "learning_rate": 8.965535905866651e-06, + "loss": 1.1368, + "step": 46374 + }, + { + "epoch": 0.5797144928623216, + "grad_norm": 1.0746724605560303, + "learning_rate": 8.964667902890268e-06, + "loss": 0.2933, + "step": 46376 + }, + { + "epoch": 0.5797394934873372, + "grad_norm": 5.8471174240112305, + "learning_rate": 8.963799907798823e-06, + "loss": 1.5229, + "step": 46378 + }, + { + "epoch": 0.5797644941123528, + "grad_norm": 3.723598003387451, + "learning_rate": 8.962931920598931e-06, + "loss": 0.9679, + "step": 46380 + }, + { + "epoch": 0.5797894947373684, + "grad_norm": 4.3511643409729, + "learning_rate": 8.962063941297205e-06, + "loss": 1.1113, + "step": 46382 + }, + { + "epoch": 0.579814495362384, + "grad_norm": 0.01515852753072977, + "learning_rate": 8.961195969900251e-06, + "loss": 0.0002, + "step": 46384 + }, + { + "epoch": 0.5798394959873997, + "grad_norm": 14.083331108093262, + "learning_rate": 8.960328006414682e-06, + "loss": 2.6978, + "step": 46386 + }, + { + "epoch": 0.5798644966124153, + "grad_norm": 2.133810043334961, + "learning_rate": 8.959460050847112e-06, + "loss": 0.8678, + "step": 46388 + }, + { + "epoch": 0.5798894972374309, + "grad_norm": 5.04534912109375, + "learning_rate": 8.958592103204143e-06, + "loss": 1.3156, + "step": 46390 + }, + { + "epoch": 0.5799144978624465, + "grad_norm": 2.90209698677063, + "learning_rate": 8.95772416349239e-06, + "loss": 0.587, + "step": 46392 + }, + { + "epoch": 0.5799394984874622, + "grad_norm": 0.5714671015739441, + "learning_rate": 8.956856231718463e-06, + "loss": 0.5183, + "step": 46394 + }, + { + "epoch": 0.5799644991124778, + "grad_norm": 3.244688034057617, + "learning_rate": 8.955988307888969e-06, + "loss": 1.5756, + "step": 46396 + }, + { + "epoch": 0.5799894997374935, + "grad_norm": 0.6138489842414856, + "learning_rate": 8.955120392010524e-06, + "loss": 0.5183, + "step": 46398 + }, + { + "epoch": 0.580014500362509, + "grad_norm": 3.48513126373291, + "learning_rate": 8.954252484089731e-06, + "loss": 0.5862, + "step": 46400 + }, + { + "epoch": 0.5800395009875247, + "grad_norm": 6.799125671386719, + "learning_rate": 8.953384584133208e-06, + "loss": 2.3329, + "step": 46402 + }, + { + "epoch": 0.5800645016125403, + "grad_norm": 3.324449300765991, + "learning_rate": 8.952516692147555e-06, + "loss": 0.9497, + "step": 46404 + }, + { + "epoch": 0.580089502237556, + "grad_norm": 4.566936016082764, + "learning_rate": 8.951648808139389e-06, + "loss": 1.1915, + "step": 46406 + }, + { + "epoch": 0.5801145028625716, + "grad_norm": 3.579125165939331, + "learning_rate": 8.950780932115317e-06, + "loss": 1.042, + "step": 46408 + }, + { + "epoch": 0.5801395034875871, + "grad_norm": 5.235481262207031, + "learning_rate": 8.94991306408195e-06, + "loss": 2.3854, + "step": 46410 + }, + { + "epoch": 0.5801645041126028, + "grad_norm": 3.1195480823516846, + "learning_rate": 8.949045204045895e-06, + "loss": 2.3336, + "step": 46412 + }, + { + "epoch": 0.5801895047376184, + "grad_norm": 10.231674194335938, + "learning_rate": 8.948177352013766e-06, + "loss": 1.7747, + "step": 46414 + }, + { + "epoch": 0.5802145053626341, + "grad_norm": 3.6680097579956055, + "learning_rate": 8.947309507992167e-06, + "loss": 1.2004, + "step": 46416 + }, + { + "epoch": 0.5802395059876497, + "grad_norm": 3.477461814880371, + "learning_rate": 8.94644167198771e-06, + "loss": 0.7617, + "step": 46418 + }, + { + "epoch": 0.5802645066126653, + "grad_norm": 2.467651605606079, + "learning_rate": 8.945573844007004e-06, + "loss": 0.5131, + "step": 46420 + }, + { + "epoch": 0.5802895072376809, + "grad_norm": 0.8287407159805298, + "learning_rate": 8.94470602405666e-06, + "loss": 0.504, + "step": 46422 + }, + { + "epoch": 0.5803145078626966, + "grad_norm": 3.3412280082702637, + "learning_rate": 8.943838212143282e-06, + "loss": 0.9357, + "step": 46424 + }, + { + "epoch": 0.5803395084877122, + "grad_norm": 2.5949254035949707, + "learning_rate": 8.942970408273487e-06, + "loss": 1.2045, + "step": 46426 + }, + { + "epoch": 0.5803645091127279, + "grad_norm": 5.626855373382568, + "learning_rate": 8.94210261245388e-06, + "loss": 1.3795, + "step": 46428 + }, + { + "epoch": 0.5803895097377434, + "grad_norm": 2.955350399017334, + "learning_rate": 8.941234824691069e-06, + "loss": 1.0496, + "step": 46430 + }, + { + "epoch": 0.580414510362759, + "grad_norm": 3.8133468627929688, + "learning_rate": 8.940367044991662e-06, + "loss": 1.4499, + "step": 46432 + }, + { + "epoch": 0.5804395109877747, + "grad_norm": 4.835333824157715, + "learning_rate": 8.93949927336227e-06, + "loss": 1.0145, + "step": 46434 + }, + { + "epoch": 0.5804645116127903, + "grad_norm": 5.779551982879639, + "learning_rate": 8.938631509809503e-06, + "loss": 0.7182, + "step": 46436 + }, + { + "epoch": 0.580489512237806, + "grad_norm": 3.5538878440856934, + "learning_rate": 8.937763754339968e-06, + "loss": 0.8105, + "step": 46438 + }, + { + "epoch": 0.5805145128628215, + "grad_norm": 9.923426628112793, + "learning_rate": 8.936896006960276e-06, + "loss": 0.9858, + "step": 46440 + }, + { + "epoch": 0.5805395134878372, + "grad_norm": 0.004588708281517029, + "learning_rate": 8.936028267677032e-06, + "loss": 0.2258, + "step": 46442 + }, + { + "epoch": 0.5805645141128528, + "grad_norm": 4.9778547286987305, + "learning_rate": 8.935160536496845e-06, + "loss": 0.6118, + "step": 46444 + }, + { + "epoch": 0.5805895147378685, + "grad_norm": 1.6562668085098267, + "learning_rate": 8.934292813426325e-06, + "loss": 0.7192, + "step": 46446 + }, + { + "epoch": 0.5806145153628841, + "grad_norm": 4.288814067840576, + "learning_rate": 8.933425098472081e-06, + "loss": 0.7532, + "step": 46448 + }, + { + "epoch": 0.5806395159878996, + "grad_norm": 4.451732158660889, + "learning_rate": 8.93255739164072e-06, + "loss": 0.9821, + "step": 46450 + }, + { + "epoch": 0.5806645166129153, + "grad_norm": 7.952417373657227, + "learning_rate": 8.931689692938851e-06, + "loss": 1.3844, + "step": 46452 + }, + { + "epoch": 0.5806895172379309, + "grad_norm": 4.853048324584961, + "learning_rate": 8.930822002373086e-06, + "loss": 1.4707, + "step": 46454 + }, + { + "epoch": 0.5807145178629466, + "grad_norm": 3.082474708557129, + "learning_rate": 8.929954319950025e-06, + "loss": 1.3882, + "step": 46456 + }, + { + "epoch": 0.5807395184879622, + "grad_norm": 4.477558135986328, + "learning_rate": 8.92908664567628e-06, + "loss": 1.6055, + "step": 46458 + }, + { + "epoch": 0.5807645191129778, + "grad_norm": 4.431539058685303, + "learning_rate": 8.928218979558461e-06, + "loss": 0.4943, + "step": 46460 + }, + { + "epoch": 0.5807895197379934, + "grad_norm": 2.8132739067077637, + "learning_rate": 8.927351321603177e-06, + "loss": 0.225, + "step": 46462 + }, + { + "epoch": 0.5808145203630091, + "grad_norm": 2.610408306121826, + "learning_rate": 8.92648367181703e-06, + "loss": 0.6865, + "step": 46464 + }, + { + "epoch": 0.5808395209880247, + "grad_norm": 2.8804171085357666, + "learning_rate": 8.925616030206635e-06, + "loss": 0.6158, + "step": 46466 + }, + { + "epoch": 0.5808645216130404, + "grad_norm": 0.007109237834811211, + "learning_rate": 8.924748396778594e-06, + "loss": 0.4111, + "step": 46468 + }, + { + "epoch": 0.5808895222380559, + "grad_norm": 3.213918924331665, + "learning_rate": 8.923880771539518e-06, + "loss": 0.1769, + "step": 46470 + }, + { + "epoch": 0.5809145228630715, + "grad_norm": 4.08953332901001, + "learning_rate": 8.923013154496012e-06, + "loss": 0.9703, + "step": 46472 + }, + { + "epoch": 0.5809395234880872, + "grad_norm": 4.479250907897949, + "learning_rate": 8.922145545654688e-06, + "loss": 0.4692, + "step": 46474 + }, + { + "epoch": 0.5809645241131028, + "grad_norm": 3.643904209136963, + "learning_rate": 8.92127794502215e-06, + "loss": 1.2233, + "step": 46476 + }, + { + "epoch": 0.5809895247381185, + "grad_norm": 2.8584773540496826, + "learning_rate": 8.920410352605007e-06, + "loss": 0.2035, + "step": 46478 + }, + { + "epoch": 0.581014525363134, + "grad_norm": 4.29431676864624, + "learning_rate": 8.919542768409868e-06, + "loss": 0.7303, + "step": 46480 + }, + { + "epoch": 0.5810395259881497, + "grad_norm": 2.97898006439209, + "learning_rate": 8.918675192443336e-06, + "loss": 0.86, + "step": 46482 + }, + { + "epoch": 0.5810645266131653, + "grad_norm": 3.7377824783325195, + "learning_rate": 8.91780762471202e-06, + "loss": 1.5401, + "step": 46484 + }, + { + "epoch": 0.581089527238181, + "grad_norm": 1.2122138738632202, + "learning_rate": 8.91694006522253e-06, + "loss": 0.6197, + "step": 46486 + }, + { + "epoch": 0.5811145278631966, + "grad_norm": 2.5469532012939453, + "learning_rate": 8.91607251398147e-06, + "loss": 0.408, + "step": 46488 + }, + { + "epoch": 0.5811395284882122, + "grad_norm": 4.112919330596924, + "learning_rate": 8.91520497099545e-06, + "loss": 1.1072, + "step": 46490 + }, + { + "epoch": 0.5811645291132278, + "grad_norm": 0.009995043277740479, + "learning_rate": 8.914337436271075e-06, + "loss": 0.6044, + "step": 46492 + }, + { + "epoch": 0.5811895297382434, + "grad_norm": 3.410498857498169, + "learning_rate": 8.913469909814952e-06, + "loss": 1.5845, + "step": 46494 + }, + { + "epoch": 0.5812145303632591, + "grad_norm": 0.004377649165689945, + "learning_rate": 8.91260239163369e-06, + "loss": 0.0642, + "step": 46496 + }, + { + "epoch": 0.5812395309882747, + "grad_norm": 1.5573772192001343, + "learning_rate": 8.911734881733893e-06, + "loss": 0.5301, + "step": 46498 + }, + { + "epoch": 0.5812645316132903, + "grad_norm": 0.005360265262424946, + "learning_rate": 8.910867380122167e-06, + "loss": 0.0002, + "step": 46500 + }, + { + "epoch": 0.5812895322383059, + "grad_norm": 3.0472218990325928, + "learning_rate": 8.909999886805123e-06, + "loss": 0.4047, + "step": 46502 + }, + { + "epoch": 0.5813145328633216, + "grad_norm": 0.6512437462806702, + "learning_rate": 8.909132401789364e-06, + "loss": 0.3988, + "step": 46504 + }, + { + "epoch": 0.5813395334883372, + "grad_norm": 2.7220208644866943, + "learning_rate": 8.908264925081506e-06, + "loss": 1.5912, + "step": 46506 + }, + { + "epoch": 0.5813645341133529, + "grad_norm": 5.940162658691406, + "learning_rate": 8.907397456688141e-06, + "loss": 0.9572, + "step": 46508 + }, + { + "epoch": 0.5813895347383684, + "grad_norm": 3.690946340560913, + "learning_rate": 8.906529996615885e-06, + "loss": 0.8448, + "step": 46510 + }, + { + "epoch": 0.581414535363384, + "grad_norm": 4.988209247589111, + "learning_rate": 8.90566254487134e-06, + "loss": 1.6529, + "step": 46512 + }, + { + "epoch": 0.5814395359883997, + "grad_norm": 2.694282054901123, + "learning_rate": 8.904795101461114e-06, + "loss": 0.2016, + "step": 46514 + }, + { + "epoch": 0.5814645366134154, + "grad_norm": 4.3450117111206055, + "learning_rate": 8.903927666391813e-06, + "loss": 0.9559, + "step": 46516 + }, + { + "epoch": 0.581489537238431, + "grad_norm": 2.63797664642334, + "learning_rate": 8.90306023967005e-06, + "loss": 1.1286, + "step": 46518 + }, + { + "epoch": 0.5815145378634465, + "grad_norm": 10.226539611816406, + "learning_rate": 8.90219282130242e-06, + "loss": 0.8483, + "step": 46520 + }, + { + "epoch": 0.5815395384884622, + "grad_norm": 3.9617919921875, + "learning_rate": 8.901325411295535e-06, + "loss": 1.0315, + "step": 46522 + }, + { + "epoch": 0.5815645391134778, + "grad_norm": 2.763206958770752, + "learning_rate": 8.900458009656e-06, + "loss": 0.7715, + "step": 46524 + }, + { + "epoch": 0.5815895397384935, + "grad_norm": 3.783698797225952, + "learning_rate": 8.89959061639042e-06, + "loss": 1.3356, + "step": 46526 + }, + { + "epoch": 0.5816145403635091, + "grad_norm": 6.491002559661865, + "learning_rate": 8.898723231505405e-06, + "loss": 2.3838, + "step": 46528 + }, + { + "epoch": 0.5816395409885247, + "grad_norm": 13.065228462219238, + "learning_rate": 8.897855855007554e-06, + "loss": 2.2929, + "step": 46530 + }, + { + "epoch": 0.5816645416135403, + "grad_norm": 5.5579609870910645, + "learning_rate": 8.896988486903483e-06, + "loss": 1.1508, + "step": 46532 + }, + { + "epoch": 0.581689542238556, + "grad_norm": 5.404433250427246, + "learning_rate": 8.89612112719979e-06, + "loss": 1.6114, + "step": 46534 + }, + { + "epoch": 0.5817145428635716, + "grad_norm": 2.818631410598755, + "learning_rate": 8.89525377590308e-06, + "loss": 1.0946, + "step": 46536 + }, + { + "epoch": 0.5817395434885873, + "grad_norm": 3.4107210636138916, + "learning_rate": 8.894386433019962e-06, + "loss": 1.6212, + "step": 46538 + }, + { + "epoch": 0.5817645441136028, + "grad_norm": 2.155855894088745, + "learning_rate": 8.89351909855704e-06, + "loss": 0.8262, + "step": 46540 + }, + { + "epoch": 0.5817895447386184, + "grad_norm": 0.013842555694282055, + "learning_rate": 8.892651772520919e-06, + "loss": 0.0026, + "step": 46542 + }, + { + "epoch": 0.5818145453636341, + "grad_norm": 0.2853069305419922, + "learning_rate": 8.89178445491821e-06, + "loss": 0.6112, + "step": 46544 + }, + { + "epoch": 0.5818395459886497, + "grad_norm": 3.491197347640991, + "learning_rate": 8.890917145755512e-06, + "loss": 0.4833, + "step": 46546 + }, + { + "epoch": 0.5818645466136654, + "grad_norm": 5.957622051239014, + "learning_rate": 8.89004984503943e-06, + "loss": 1.5928, + "step": 46548 + }, + { + "epoch": 0.5818895472386809, + "grad_norm": 4.91623592376709, + "learning_rate": 8.889182552776573e-06, + "loss": 2.1, + "step": 46550 + }, + { + "epoch": 0.5819145478636966, + "grad_norm": 4.655966758728027, + "learning_rate": 8.888315268973544e-06, + "loss": 1.2404, + "step": 46552 + }, + { + "epoch": 0.5819395484887122, + "grad_norm": 2.340001106262207, + "learning_rate": 8.887447993636946e-06, + "loss": 0.2165, + "step": 46554 + }, + { + "epoch": 0.5819645491137279, + "grad_norm": 7.373654842376709, + "learning_rate": 8.886580726773389e-06, + "loss": 1.1724, + "step": 46556 + }, + { + "epoch": 0.5819895497387435, + "grad_norm": 2.8124818801879883, + "learning_rate": 8.885713468389481e-06, + "loss": 1.1103, + "step": 46558 + }, + { + "epoch": 0.582014550363759, + "grad_norm": 2.84374737739563, + "learning_rate": 8.884846218491818e-06, + "loss": 0.792, + "step": 46560 + }, + { + "epoch": 0.5820395509887747, + "grad_norm": 5.349370956420898, + "learning_rate": 8.883978977087006e-06, + "loss": 1.6235, + "step": 46562 + }, + { + "epoch": 0.5820645516137903, + "grad_norm": 1.9278552532196045, + "learning_rate": 8.883111744181655e-06, + "loss": 1.1861, + "step": 46564 + }, + { + "epoch": 0.582089552238806, + "grad_norm": 5.043496608734131, + "learning_rate": 8.882244519782362e-06, + "loss": 1.2334, + "step": 46566 + }, + { + "epoch": 0.5821145528638216, + "grad_norm": 2.1846530437469482, + "learning_rate": 8.881377303895741e-06, + "loss": 0.1454, + "step": 46568 + }, + { + "epoch": 0.5821395534888372, + "grad_norm": 2.502384662628174, + "learning_rate": 8.880510096528396e-06, + "loss": 0.5089, + "step": 46570 + }, + { + "epoch": 0.5821645541138528, + "grad_norm": 3.09736967086792, + "learning_rate": 8.879642897686922e-06, + "loss": 1.2503, + "step": 46572 + }, + { + "epoch": 0.5821895547388685, + "grad_norm": 4.176715850830078, + "learning_rate": 8.878775707377931e-06, + "loss": 1.4381, + "step": 46574 + }, + { + "epoch": 0.5822145553638841, + "grad_norm": 0.004222430754452944, + "learning_rate": 8.877908525608025e-06, + "loss": 0.1595, + "step": 46576 + }, + { + "epoch": 0.5822395559888998, + "grad_norm": 3.721038579940796, + "learning_rate": 8.877041352383808e-06, + "loss": 1.2217, + "step": 46578 + }, + { + "epoch": 0.5822645566139153, + "grad_norm": 3.2355735301971436, + "learning_rate": 8.876174187711887e-06, + "loss": 1.3288, + "step": 46580 + }, + { + "epoch": 0.5822895572389309, + "grad_norm": 1.4364367723464966, + "learning_rate": 8.875307031598863e-06, + "loss": 0.1941, + "step": 46582 + }, + { + "epoch": 0.5823145578639466, + "grad_norm": 0.5807341933250427, + "learning_rate": 8.874439884051348e-06, + "loss": 0.9838, + "step": 46584 + }, + { + "epoch": 0.5823395584889622, + "grad_norm": 3.9313645362854004, + "learning_rate": 8.873572745075933e-06, + "loss": 0.147, + "step": 46586 + }, + { + "epoch": 0.5823645591139779, + "grad_norm": 3.2401723861694336, + "learning_rate": 8.87270561467923e-06, + "loss": 1.6285, + "step": 46588 + }, + { + "epoch": 0.5823895597389934, + "grad_norm": 4.580056667327881, + "learning_rate": 8.871838492867841e-06, + "loss": 0.9815, + "step": 46590 + }, + { + "epoch": 0.5824145603640091, + "grad_norm": 0.6456534266471863, + "learning_rate": 8.87097137964837e-06, + "loss": 0.0264, + "step": 46592 + }, + { + "epoch": 0.5824395609890247, + "grad_norm": 0.6632141470909119, + "learning_rate": 8.870104275027423e-06, + "loss": 0.6347, + "step": 46594 + }, + { + "epoch": 0.5824645616140404, + "grad_norm": 2.4701342582702637, + "learning_rate": 8.869237179011605e-06, + "loss": 0.8149, + "step": 46596 + }, + { + "epoch": 0.582489562239056, + "grad_norm": 4.803278923034668, + "learning_rate": 8.868370091607514e-06, + "loss": 0.7769, + "step": 46598 + }, + { + "epoch": 0.5825145628640716, + "grad_norm": 0.007803201209753752, + "learning_rate": 8.867503012821756e-06, + "loss": 0.0967, + "step": 46600 + }, + { + "epoch": 0.5825395634890872, + "grad_norm": 2.4477856159210205, + "learning_rate": 8.866635942660934e-06, + "loss": 0.7673, + "step": 46602 + }, + { + "epoch": 0.5825645641141028, + "grad_norm": 0.003987081814557314, + "learning_rate": 8.86576888113165e-06, + "loss": 0.3819, + "step": 46604 + }, + { + "epoch": 0.5825895647391185, + "grad_norm": 8.617767333984375, + "learning_rate": 8.864901828240513e-06, + "loss": 1.2013, + "step": 46606 + }, + { + "epoch": 0.5826145653641341, + "grad_norm": 3.5779972076416016, + "learning_rate": 8.864034783994123e-06, + "loss": 0.837, + "step": 46608 + }, + { + "epoch": 0.5826395659891497, + "grad_norm": 2.943509340286255, + "learning_rate": 8.863167748399088e-06, + "loss": 1.3623, + "step": 46610 + }, + { + "epoch": 0.5826645666141653, + "grad_norm": 4.256078720092773, + "learning_rate": 8.862300721462002e-06, + "loss": 1.206, + "step": 46612 + }, + { + "epoch": 0.582689567239181, + "grad_norm": 2.015594720840454, + "learning_rate": 8.861433703189473e-06, + "loss": 0.6323, + "step": 46614 + }, + { + "epoch": 0.5827145678641966, + "grad_norm": 0.0017592329531908035, + "learning_rate": 8.8605666935881e-06, + "loss": 2.4231, + "step": 46616 + }, + { + "epoch": 0.5827395684892123, + "grad_norm": 0.003241759492084384, + "learning_rate": 8.859699692664494e-06, + "loss": 1.0178, + "step": 46618 + }, + { + "epoch": 0.5827645691142278, + "grad_norm": 4.419447422027588, + "learning_rate": 8.858832700425254e-06, + "loss": 1.2652, + "step": 46620 + }, + { + "epoch": 0.5827895697392435, + "grad_norm": 3.347339153289795, + "learning_rate": 8.857965716876982e-06, + "loss": 1.5368, + "step": 46622 + }, + { + "epoch": 0.5828145703642591, + "grad_norm": 1.3740897178649902, + "learning_rate": 8.857098742026286e-06, + "loss": 0.0335, + "step": 46624 + }, + { + "epoch": 0.5828395709892747, + "grad_norm": 3.6149301528930664, + "learning_rate": 8.85623177587976e-06, + "loss": 1.3346, + "step": 46626 + }, + { + "epoch": 0.5828645716142904, + "grad_norm": 2.586749792098999, + "learning_rate": 8.85536481844401e-06, + "loss": 0.8375, + "step": 46628 + }, + { + "epoch": 0.5828895722393059, + "grad_norm": 2.3345468044281006, + "learning_rate": 8.85449786972564e-06, + "loss": 0.395, + "step": 46630 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 3.832132577896118, + "learning_rate": 8.853630929731252e-06, + "loss": 0.6153, + "step": 46632 + }, + { + "epoch": 0.5829395734893372, + "grad_norm": 0.011025551706552505, + "learning_rate": 8.85276399846745e-06, + "loss": 0.2854, + "step": 46634 + }, + { + "epoch": 0.5829645741143529, + "grad_norm": 0.005761456210166216, + "learning_rate": 8.851897075940841e-06, + "loss": 0.029, + "step": 46636 + }, + { + "epoch": 0.5829895747393685, + "grad_norm": 5.427237510681152, + "learning_rate": 8.851030162158016e-06, + "loss": 0.9654, + "step": 46638 + }, + { + "epoch": 0.5830145753643841, + "grad_norm": 0.014474321156740189, + "learning_rate": 8.85016325712558e-06, + "loss": 0.5728, + "step": 46640 + }, + { + "epoch": 0.5830395759893997, + "grad_norm": 3.9634647369384766, + "learning_rate": 8.84929636085014e-06, + "loss": 0.3443, + "step": 46642 + }, + { + "epoch": 0.5830645766144154, + "grad_norm": 2.709487199783325, + "learning_rate": 8.848429473338295e-06, + "loss": 0.1922, + "step": 46644 + }, + { + "epoch": 0.583089577239431, + "grad_norm": 4.091139793395996, + "learning_rate": 8.847562594596652e-06, + "loss": 1.1647, + "step": 46646 + }, + { + "epoch": 0.5831145778644466, + "grad_norm": 2.438987970352173, + "learning_rate": 8.846695724631807e-06, + "loss": 1.0072, + "step": 46648 + }, + { + "epoch": 0.5831395784894622, + "grad_norm": 4.024623870849609, + "learning_rate": 8.84582886345037e-06, + "loss": 0.8275, + "step": 46650 + }, + { + "epoch": 0.5831645791144778, + "grad_norm": 2.6696293354034424, + "learning_rate": 8.844962011058927e-06, + "loss": 0.119, + "step": 46652 + }, + { + "epoch": 0.5831895797394935, + "grad_norm": 0.9150251150131226, + "learning_rate": 8.844095167464096e-06, + "loss": 1.0101, + "step": 46654 + }, + { + "epoch": 0.5832145803645091, + "grad_norm": 2.3581950664520264, + "learning_rate": 8.843228332672471e-06, + "loss": 1.0946, + "step": 46656 + }, + { + "epoch": 0.5832395809895248, + "grad_norm": 3.375491142272949, + "learning_rate": 8.842361506690656e-06, + "loss": 0.9875, + "step": 46658 + }, + { + "epoch": 0.5832645816145403, + "grad_norm": 0.7817122936248779, + "learning_rate": 8.84149468952525e-06, + "loss": 0.0375, + "step": 46660 + }, + { + "epoch": 0.583289582239556, + "grad_norm": 2.733044385910034, + "learning_rate": 8.840627881182865e-06, + "loss": 1.608, + "step": 46662 + }, + { + "epoch": 0.5833145828645716, + "grad_norm": 4.635396480560303, + "learning_rate": 8.839761081670086e-06, + "loss": 1.7086, + "step": 46664 + }, + { + "epoch": 0.5833395834895873, + "grad_norm": 0.005467490758746862, + "learning_rate": 8.838894290993524e-06, + "loss": 0.4273, + "step": 46666 + }, + { + "epoch": 0.5833645841146029, + "grad_norm": 0.006670729722827673, + "learning_rate": 8.838027509159778e-06, + "loss": 0.2852, + "step": 46668 + }, + { + "epoch": 0.5833895847396184, + "grad_norm": 5.19458532333374, + "learning_rate": 8.837160736175452e-06, + "loss": 1.3447, + "step": 46670 + }, + { + "epoch": 0.5834145853646341, + "grad_norm": 5.801872730255127, + "learning_rate": 8.836293972047143e-06, + "loss": 1.0106, + "step": 46672 + }, + { + "epoch": 0.5834395859896497, + "grad_norm": 3.4514787197113037, + "learning_rate": 8.835427216781456e-06, + "loss": 1.1946, + "step": 46674 + }, + { + "epoch": 0.5834645866146654, + "grad_norm": 4.295386791229248, + "learning_rate": 8.834560470384996e-06, + "loss": 1.1557, + "step": 46676 + }, + { + "epoch": 0.583489587239681, + "grad_norm": 1.4185067415237427, + "learning_rate": 8.83369373286435e-06, + "loss": 0.3029, + "step": 46678 + }, + { + "epoch": 0.5835145878646966, + "grad_norm": 5.249072074890137, + "learning_rate": 8.83282700422613e-06, + "loss": 1.0154, + "step": 46680 + }, + { + "epoch": 0.5835395884897122, + "grad_norm": 3.1277542114257812, + "learning_rate": 8.831960284476937e-06, + "loss": 0.8928, + "step": 46682 + }, + { + "epoch": 0.5835645891147279, + "grad_norm": 3.2347872257232666, + "learning_rate": 8.831093573623367e-06, + "loss": 0.4969, + "step": 46684 + }, + { + "epoch": 0.5835895897397435, + "grad_norm": 2.639132261276245, + "learning_rate": 8.830226871672024e-06, + "loss": 1.5345, + "step": 46686 + }, + { + "epoch": 0.5836145903647592, + "grad_norm": 4.03333044052124, + "learning_rate": 8.829360178629512e-06, + "loss": 1.2335, + "step": 46688 + }, + { + "epoch": 0.5836395909897747, + "grad_norm": 3.3552489280700684, + "learning_rate": 8.828493494502422e-06, + "loss": 0.8711, + "step": 46690 + }, + { + "epoch": 0.5836645916147903, + "grad_norm": 4.605016231536865, + "learning_rate": 8.827626819297361e-06, + "loss": 1.0227, + "step": 46692 + }, + { + "epoch": 0.583689592239806, + "grad_norm": 0.0037514378782361746, + "learning_rate": 8.826760153020927e-06, + "loss": 1.1235, + "step": 46694 + }, + { + "epoch": 0.5837145928648216, + "grad_norm": 2.8816945552825928, + "learning_rate": 8.825893495679722e-06, + "loss": 1.4611, + "step": 46696 + }, + { + "epoch": 0.5837395934898373, + "grad_norm": 4.136258125305176, + "learning_rate": 8.825026847280347e-06, + "loss": 0.7555, + "step": 46698 + }, + { + "epoch": 0.5837645941148528, + "grad_norm": 3.6535980701446533, + "learning_rate": 8.824160207829401e-06, + "loss": 1.3144, + "step": 46700 + }, + { + "epoch": 0.5837895947398685, + "grad_norm": 7.56344747543335, + "learning_rate": 8.823293577333488e-06, + "loss": 0.4455, + "step": 46702 + }, + { + "epoch": 0.5838145953648841, + "grad_norm": 6.109554290771484, + "learning_rate": 8.822426955799201e-06, + "loss": 1.6287, + "step": 46704 + }, + { + "epoch": 0.5838395959898998, + "grad_norm": 6.107600688934326, + "learning_rate": 8.821560343233145e-06, + "loss": 1.0083, + "step": 46706 + }, + { + "epoch": 0.5838645966149154, + "grad_norm": 2.847259759902954, + "learning_rate": 8.820693739641918e-06, + "loss": 1.1922, + "step": 46708 + }, + { + "epoch": 0.583889597239931, + "grad_norm": 3.246001958847046, + "learning_rate": 8.819827145032122e-06, + "loss": 2.0584, + "step": 46710 + }, + { + "epoch": 0.5839145978649466, + "grad_norm": 0.0017909426242113113, + "learning_rate": 8.818960559410354e-06, + "loss": 0.6416, + "step": 46712 + }, + { + "epoch": 0.5839395984899622, + "grad_norm": 1.0893579721450806, + "learning_rate": 8.818093982783218e-06, + "loss": 0.9965, + "step": 46714 + }, + { + "epoch": 0.5839645991149779, + "grad_norm": 1.215453028678894, + "learning_rate": 8.817227415157308e-06, + "loss": 0.1199, + "step": 46716 + }, + { + "epoch": 0.5839895997399935, + "grad_norm": 0.002473197178915143, + "learning_rate": 8.816360856539228e-06, + "loss": 1.6052, + "step": 46718 + }, + { + "epoch": 0.5840146003650091, + "grad_norm": 0.04705530032515526, + "learning_rate": 8.815494306935576e-06, + "loss": 0.4813, + "step": 46720 + }, + { + "epoch": 0.5840396009900247, + "grad_norm": 1.523545265197754, + "learning_rate": 8.814627766352951e-06, + "loss": 0.2196, + "step": 46722 + }, + { + "epoch": 0.5840646016150404, + "grad_norm": 5.916467666625977, + "learning_rate": 8.813761234797955e-06, + "loss": 1.9521, + "step": 46724 + }, + { + "epoch": 0.584089602240056, + "grad_norm": 4.606235504150391, + "learning_rate": 8.812894712277185e-06, + "loss": 0.7188, + "step": 46726 + }, + { + "epoch": 0.5841146028650717, + "grad_norm": 1.0183913707733154, + "learning_rate": 8.812028198797243e-06, + "loss": 0.1215, + "step": 46728 + }, + { + "epoch": 0.5841396034900872, + "grad_norm": 4.283484935760498, + "learning_rate": 8.811161694364725e-06, + "loss": 1.5161, + "step": 46730 + }, + { + "epoch": 0.5841646041151028, + "grad_norm": 3.5450592041015625, + "learning_rate": 8.81029519898623e-06, + "loss": 0.5536, + "step": 46732 + }, + { + "epoch": 0.5841896047401185, + "grad_norm": 1.7452082633972168, + "learning_rate": 8.809428712668359e-06, + "loss": 0.924, + "step": 46734 + }, + { + "epoch": 0.5842146053651341, + "grad_norm": 2.616543769836426, + "learning_rate": 8.80856223541771e-06, + "loss": 0.7126, + "step": 46736 + }, + { + "epoch": 0.5842396059901498, + "grad_norm": 0.008000340312719345, + "learning_rate": 8.807695767240883e-06, + "loss": 0.6942, + "step": 46738 + }, + { + "epoch": 0.5842646066151653, + "grad_norm": 6.005368709564209, + "learning_rate": 8.806829308144479e-06, + "loss": 0.9653, + "step": 46740 + }, + { + "epoch": 0.584289607240181, + "grad_norm": 4.237912654876709, + "learning_rate": 8.805962858135092e-06, + "loss": 0.2912, + "step": 46742 + }, + { + "epoch": 0.5843146078651966, + "grad_norm": 4.452037811279297, + "learning_rate": 8.805096417219323e-06, + "loss": 0.9686, + "step": 46744 + }, + { + "epoch": 0.5843396084902123, + "grad_norm": 3.647533655166626, + "learning_rate": 8.80422998540377e-06, + "loss": 1.476, + "step": 46746 + }, + { + "epoch": 0.5843646091152279, + "grad_norm": 1.8522437810897827, + "learning_rate": 8.803363562695034e-06, + "loss": 1.1982, + "step": 46748 + }, + { + "epoch": 0.5843896097402435, + "grad_norm": 2.038336753845215, + "learning_rate": 8.80249714909971e-06, + "loss": 1.9543, + "step": 46750 + }, + { + "epoch": 0.5844146103652591, + "grad_norm": 3.912151336669922, + "learning_rate": 8.801630744624398e-06, + "loss": 1.3147, + "step": 46752 + }, + { + "epoch": 0.5844396109902747, + "grad_norm": 3.5207927227020264, + "learning_rate": 8.800764349275701e-06, + "loss": 1.3932, + "step": 46754 + }, + { + "epoch": 0.5844646116152904, + "grad_norm": 3.368746757507324, + "learning_rate": 8.79989796306021e-06, + "loss": 0.8072, + "step": 46756 + }, + { + "epoch": 0.584489612240306, + "grad_norm": 2.9261717796325684, + "learning_rate": 8.799031585984526e-06, + "loss": 0.9676, + "step": 46758 + }, + { + "epoch": 0.5845146128653216, + "grad_norm": 2.2566628456115723, + "learning_rate": 8.798165218055248e-06, + "loss": 0.6562, + "step": 46760 + }, + { + "epoch": 0.5845396134903372, + "grad_norm": 1.123020887374878, + "learning_rate": 8.797298859278976e-06, + "loss": 1.2061, + "step": 46762 + }, + { + "epoch": 0.5845646141153529, + "grad_norm": 4.177440166473389, + "learning_rate": 8.796432509662303e-06, + "loss": 0.9775, + "step": 46764 + }, + { + "epoch": 0.5845896147403685, + "grad_norm": 6.184908390045166, + "learning_rate": 8.795566169211832e-06, + "loss": 0.9847, + "step": 46766 + }, + { + "epoch": 0.5846146153653842, + "grad_norm": 2.887009382247925, + "learning_rate": 8.794699837934158e-06, + "loss": 0.7011, + "step": 46768 + }, + { + "epoch": 0.5846396159903997, + "grad_norm": 4.699990749359131, + "learning_rate": 8.793833515835878e-06, + "loss": 1.4071, + "step": 46770 + }, + { + "epoch": 0.5846646166154154, + "grad_norm": 4.045702934265137, + "learning_rate": 8.792967202923592e-06, + "loss": 1.655, + "step": 46772 + }, + { + "epoch": 0.584689617240431, + "grad_norm": 0.8705365061759949, + "learning_rate": 8.792100899203897e-06, + "loss": 0.37, + "step": 46774 + }, + { + "epoch": 0.5847146178654467, + "grad_norm": 9.177190780639648, + "learning_rate": 8.791234604683392e-06, + "loss": 0.8089, + "step": 46776 + }, + { + "epoch": 0.5847396184904623, + "grad_norm": 2.396272659301758, + "learning_rate": 8.790368319368672e-06, + "loss": 0.9078, + "step": 46778 + }, + { + "epoch": 0.5847646191154778, + "grad_norm": 2.9399983882904053, + "learning_rate": 8.789502043266338e-06, + "loss": 1.0107, + "step": 46780 + }, + { + "epoch": 0.5847896197404935, + "grad_norm": 5.151240348815918, + "learning_rate": 8.788635776382983e-06, + "loss": 1.2223, + "step": 46782 + }, + { + "epoch": 0.5848146203655091, + "grad_norm": 4.815273761749268, + "learning_rate": 8.787769518725207e-06, + "loss": 1.7189, + "step": 46784 + }, + { + "epoch": 0.5848396209905248, + "grad_norm": 6.280696868896484, + "learning_rate": 8.786903270299606e-06, + "loss": 2.2787, + "step": 46786 + }, + { + "epoch": 0.5848646216155404, + "grad_norm": 2.1758158206939697, + "learning_rate": 8.78603703111278e-06, + "loss": 0.4934, + "step": 46788 + }, + { + "epoch": 0.584889622240556, + "grad_norm": 2.745755195617676, + "learning_rate": 8.785170801171324e-06, + "loss": 0.5311, + "step": 46790 + }, + { + "epoch": 0.5849146228655716, + "grad_norm": 4.946323871612549, + "learning_rate": 8.784304580481836e-06, + "loss": 1.9468, + "step": 46792 + }, + { + "epoch": 0.5849396234905873, + "grad_norm": 10.464335441589355, + "learning_rate": 8.783438369050912e-06, + "loss": 1.0962, + "step": 46794 + }, + { + "epoch": 0.5849646241156029, + "grad_norm": 1.580051064491272, + "learning_rate": 8.782572166885148e-06, + "loss": 0.8577, + "step": 46796 + }, + { + "epoch": 0.5849896247406186, + "grad_norm": 3.399792432785034, + "learning_rate": 8.781705973991143e-06, + "loss": 1.1858, + "step": 46798 + }, + { + "epoch": 0.5850146253656341, + "grad_norm": 4.56272029876709, + "learning_rate": 8.780839790375494e-06, + "loss": 1.0937, + "step": 46800 + }, + { + "epoch": 0.5850396259906497, + "grad_norm": 4.054412364959717, + "learning_rate": 8.779973616044794e-06, + "loss": 0.7586, + "step": 46802 + }, + { + "epoch": 0.5850646266156654, + "grad_norm": 2.302164077758789, + "learning_rate": 8.779107451005643e-06, + "loss": 0.4616, + "step": 46804 + }, + { + "epoch": 0.585089627240681, + "grad_norm": 4.093067646026611, + "learning_rate": 8.77824129526464e-06, + "loss": 1.1922, + "step": 46806 + }, + { + "epoch": 0.5851146278656967, + "grad_norm": 4.486977577209473, + "learning_rate": 8.777375148828378e-06, + "loss": 0.6972, + "step": 46808 + }, + { + "epoch": 0.5851396284907122, + "grad_norm": 4.493397235870361, + "learning_rate": 8.776509011703452e-06, + "loss": 1.0456, + "step": 46810 + }, + { + "epoch": 0.5851646291157279, + "grad_norm": 1.2728770971298218, + "learning_rate": 8.77564288389646e-06, + "loss": 0.7709, + "step": 46812 + }, + { + "epoch": 0.5851896297407435, + "grad_norm": 2.1215007305145264, + "learning_rate": 8.774776765414e-06, + "loss": 1.6632, + "step": 46814 + }, + { + "epoch": 0.5852146303657592, + "grad_norm": 5.412642002105713, + "learning_rate": 8.773910656262667e-06, + "loss": 1.8395, + "step": 46816 + }, + { + "epoch": 0.5852396309907748, + "grad_norm": 5.746790885925293, + "learning_rate": 8.773044556449058e-06, + "loss": 2.6574, + "step": 46818 + }, + { + "epoch": 0.5852646316157903, + "grad_norm": 1.4522511959075928, + "learning_rate": 8.772178465979765e-06, + "loss": 1.2359, + "step": 46820 + }, + { + "epoch": 0.585289632240806, + "grad_norm": 3.8002259731292725, + "learning_rate": 8.77131238486139e-06, + "loss": 2.2239, + "step": 46822 + }, + { + "epoch": 0.5853146328658216, + "grad_norm": 3.0383856296539307, + "learning_rate": 8.770446313100524e-06, + "loss": 0.6647, + "step": 46824 + }, + { + "epoch": 0.5853396334908373, + "grad_norm": 0.8187783360481262, + "learning_rate": 8.769580250703767e-06, + "loss": 0.6995, + "step": 46826 + }, + { + "epoch": 0.5853646341158529, + "grad_norm": 0.7040791511535645, + "learning_rate": 8.768714197677711e-06, + "loss": 1.1858, + "step": 46828 + }, + { + "epoch": 0.5853896347408685, + "grad_norm": 1.840313196182251, + "learning_rate": 8.767848154028954e-06, + "loss": 0.6943, + "step": 46830 + }, + { + "epoch": 0.5854146353658841, + "grad_norm": 0.7148016095161438, + "learning_rate": 8.766982119764094e-06, + "loss": 0.1101, + "step": 46832 + }, + { + "epoch": 0.5854396359908998, + "grad_norm": 1.9358490705490112, + "learning_rate": 8.76611609488972e-06, + "loss": 0.7547, + "step": 46834 + }, + { + "epoch": 0.5854646366159154, + "grad_norm": 2.0667190551757812, + "learning_rate": 8.765250079412433e-06, + "loss": 1.7153, + "step": 46836 + }, + { + "epoch": 0.5854896372409311, + "grad_norm": 3.173712968826294, + "learning_rate": 8.764384073338826e-06, + "loss": 0.5628, + "step": 46838 + }, + { + "epoch": 0.5855146378659466, + "grad_norm": 4.579962253570557, + "learning_rate": 8.763518076675495e-06, + "loss": 1.9422, + "step": 46840 + }, + { + "epoch": 0.5855396384909622, + "grad_norm": 5.2405595779418945, + "learning_rate": 8.762652089429038e-06, + "loss": 1.1945, + "step": 46842 + }, + { + "epoch": 0.5855646391159779, + "grad_norm": 2.6229379177093506, + "learning_rate": 8.761786111606047e-06, + "loss": 1.4179, + "step": 46844 + }, + { + "epoch": 0.5855896397409935, + "grad_norm": 0.006901385262608528, + "learning_rate": 8.760920143213116e-06, + "loss": 0.8205, + "step": 46846 + }, + { + "epoch": 0.5856146403660092, + "grad_norm": 5.594252109527588, + "learning_rate": 8.760054184256842e-06, + "loss": 0.4604, + "step": 46848 + }, + { + "epoch": 0.5856396409910247, + "grad_norm": 2.5507354736328125, + "learning_rate": 8.759188234743822e-06, + "loss": 0.6887, + "step": 46850 + }, + { + "epoch": 0.5856646416160404, + "grad_norm": 2.3026177883148193, + "learning_rate": 8.758322294680647e-06, + "loss": 1.1459, + "step": 46852 + }, + { + "epoch": 0.585689642241056, + "grad_norm": 6.317038536071777, + "learning_rate": 8.757456364073914e-06, + "loss": 1.9457, + "step": 46854 + }, + { + "epoch": 0.5857146428660717, + "grad_norm": 0.8415732383728027, + "learning_rate": 8.756590442930218e-06, + "loss": 0.5531, + "step": 46856 + }, + { + "epoch": 0.5857396434910873, + "grad_norm": 0.3343745172023773, + "learning_rate": 8.755724531256156e-06, + "loss": 0.1389, + "step": 46858 + }, + { + "epoch": 0.5857646441161029, + "grad_norm": 0.14458104968070984, + "learning_rate": 8.754858629058315e-06, + "loss": 0.003, + "step": 46860 + }, + { + "epoch": 0.5857896447411185, + "grad_norm": 0.001241275924257934, + "learning_rate": 8.7539927363433e-06, + "loss": 0.8142, + "step": 46862 + }, + { + "epoch": 0.5858146453661341, + "grad_norm": 2.889730215072632, + "learning_rate": 8.753126853117695e-06, + "loss": 0.2904, + "step": 46864 + }, + { + "epoch": 0.5858396459911498, + "grad_norm": 3.628018617630005, + "learning_rate": 8.752260979388101e-06, + "loss": 1.0999, + "step": 46866 + }, + { + "epoch": 0.5858646466161654, + "grad_norm": 4.357878684997559, + "learning_rate": 8.751395115161112e-06, + "loss": 1.167, + "step": 46868 + }, + { + "epoch": 0.585889647241181, + "grad_norm": 2.2360222339630127, + "learning_rate": 8.750529260443322e-06, + "loss": 0.3616, + "step": 46870 + }, + { + "epoch": 0.5859146478661966, + "grad_norm": 4.574349880218506, + "learning_rate": 8.749663415241323e-06, + "loss": 0.8549, + "step": 46872 + }, + { + "epoch": 0.5859396484912123, + "grad_norm": 4.638269901275635, + "learning_rate": 8.748797579561711e-06, + "loss": 0.6749, + "step": 46874 + }, + { + "epoch": 0.5859646491162279, + "grad_norm": 1.9809314012527466, + "learning_rate": 8.747931753411081e-06, + "loss": 1.0259, + "step": 46876 + }, + { + "epoch": 0.5859896497412436, + "grad_norm": 3.5773367881774902, + "learning_rate": 8.747065936796023e-06, + "loss": 1.7355, + "step": 46878 + }, + { + "epoch": 0.5860146503662591, + "grad_norm": 1.23887038230896, + "learning_rate": 8.746200129723134e-06, + "loss": 0.7991, + "step": 46880 + }, + { + "epoch": 0.5860396509912748, + "grad_norm": 1.9045411348342896, + "learning_rate": 8.745334332199009e-06, + "loss": 1.0282, + "step": 46882 + }, + { + "epoch": 0.5860646516162904, + "grad_norm": 0.4862031936645508, + "learning_rate": 8.744468544230242e-06, + "loss": 0.5258, + "step": 46884 + }, + { + "epoch": 0.586089652241306, + "grad_norm": 2.4615907669067383, + "learning_rate": 8.743602765823422e-06, + "loss": 1.0877, + "step": 46886 + }, + { + "epoch": 0.5861146528663217, + "grad_norm": 0.0027984706684947014, + "learning_rate": 8.742736996985147e-06, + "loss": 1.0769, + "step": 46888 + }, + { + "epoch": 0.5861396534913372, + "grad_norm": 2.858363389968872, + "learning_rate": 8.741871237722008e-06, + "loss": 0.7388, + "step": 46890 + }, + { + "epoch": 0.5861646541163529, + "grad_norm": 2.694193124771118, + "learning_rate": 8.741005488040601e-06, + "loss": 0.2672, + "step": 46892 + }, + { + "epoch": 0.5861896547413685, + "grad_norm": 2.7365195751190186, + "learning_rate": 8.740139747947519e-06, + "loss": 0.5058, + "step": 46894 + }, + { + "epoch": 0.5862146553663842, + "grad_norm": 8.437912940979004, + "learning_rate": 8.73927401744935e-06, + "loss": 1.3743, + "step": 46896 + }, + { + "epoch": 0.5862396559913998, + "grad_norm": 4.540430068969727, + "learning_rate": 8.738408296552699e-06, + "loss": 1.4473, + "step": 46898 + }, + { + "epoch": 0.5862646566164154, + "grad_norm": 3.6337170600891113, + "learning_rate": 8.737542585264148e-06, + "loss": 0.9048, + "step": 46900 + }, + { + "epoch": 0.586289657241431, + "grad_norm": 1.6917831897735596, + "learning_rate": 8.736676883590293e-06, + "loss": 0.1153, + "step": 46902 + }, + { + "epoch": 0.5863146578664467, + "grad_norm": 4.0518951416015625, + "learning_rate": 8.73581119153773e-06, + "loss": 1.9269, + "step": 46904 + }, + { + "epoch": 0.5863396584914623, + "grad_norm": 3.1273889541625977, + "learning_rate": 8.734945509113049e-06, + "loss": 0.5242, + "step": 46906 + }, + { + "epoch": 0.586364659116478, + "grad_norm": 2.9012317657470703, + "learning_rate": 8.734079836322842e-06, + "loss": 1.0861, + "step": 46908 + }, + { + "epoch": 0.5863896597414935, + "grad_norm": 5.206058025360107, + "learning_rate": 8.73321417317371e-06, + "loss": 2.3725, + "step": 46910 + }, + { + "epoch": 0.5864146603665091, + "grad_norm": 4.786653518676758, + "learning_rate": 8.732348519672237e-06, + "loss": 1.2637, + "step": 46912 + }, + { + "epoch": 0.5864396609915248, + "grad_norm": 0.012323359027504921, + "learning_rate": 8.731482875825017e-06, + "loss": 0.0003, + "step": 46914 + }, + { + "epoch": 0.5864646616165404, + "grad_norm": 0.0030511573422700167, + "learning_rate": 8.730617241638644e-06, + "loss": 0.0206, + "step": 46916 + }, + { + "epoch": 0.5864896622415561, + "grad_norm": 2.958871364593506, + "learning_rate": 8.729751617119711e-06, + "loss": 0.6121, + "step": 46918 + }, + { + "epoch": 0.5865146628665716, + "grad_norm": 3.084777593612671, + "learning_rate": 8.72888600227481e-06, + "loss": 1.2725, + "step": 46920 + }, + { + "epoch": 0.5865396634915873, + "grad_norm": 7.388222694396973, + "learning_rate": 8.728020397110533e-06, + "loss": 1.4542, + "step": 46922 + }, + { + "epoch": 0.5865646641166029, + "grad_norm": 7.466170787811279, + "learning_rate": 8.727154801633478e-06, + "loss": 1.9838, + "step": 46924 + }, + { + "epoch": 0.5865896647416186, + "grad_norm": 2.882413625717163, + "learning_rate": 8.726289215850227e-06, + "loss": 0.8244, + "step": 46926 + }, + { + "epoch": 0.5866146653666342, + "grad_norm": 4.8410820960998535, + "learning_rate": 8.725423639767376e-06, + "loss": 1.03, + "step": 46928 + }, + { + "epoch": 0.5866396659916497, + "grad_norm": 0.0013549883151426911, + "learning_rate": 8.72455807339152e-06, + "loss": 0.2411, + "step": 46930 + }, + { + "epoch": 0.5866646666166654, + "grad_norm": 5.571873188018799, + "learning_rate": 8.72369251672925e-06, + "loss": 1.0302, + "step": 46932 + }, + { + "epoch": 0.586689667241681, + "grad_norm": 1.1461621522903442, + "learning_rate": 8.722826969787154e-06, + "loss": 0.1307, + "step": 46934 + }, + { + "epoch": 0.5867146678666967, + "grad_norm": 0.00133082817774266, + "learning_rate": 8.721961432571836e-06, + "loss": 0.535, + "step": 46936 + }, + { + "epoch": 0.5867396684917123, + "grad_norm": 0.0016378709115087986, + "learning_rate": 8.721095905089871e-06, + "loss": 0.942, + "step": 46938 + }, + { + "epoch": 0.5867646691167279, + "grad_norm": 0.3562173843383789, + "learning_rate": 8.72023038734786e-06, + "loss": 0.6976, + "step": 46940 + }, + { + "epoch": 0.5867896697417435, + "grad_norm": 5.3657307624816895, + "learning_rate": 8.719364879352395e-06, + "loss": 1.4559, + "step": 46942 + }, + { + "epoch": 0.5868146703667592, + "grad_norm": 3.1137728691101074, + "learning_rate": 8.718499381110064e-06, + "loss": 3.1358, + "step": 46944 + }, + { + "epoch": 0.5868396709917748, + "grad_norm": 2.6956005096435547, + "learning_rate": 8.71763389262746e-06, + "loss": 0.4101, + "step": 46946 + }, + { + "epoch": 0.5868646716167905, + "grad_norm": 4.545173645019531, + "learning_rate": 8.716768413911176e-06, + "loss": 0.6938, + "step": 46948 + }, + { + "epoch": 0.586889672241806, + "grad_norm": 0.09542389959096909, + "learning_rate": 8.715902944967808e-06, + "loss": 0.5436, + "step": 46950 + }, + { + "epoch": 0.5869146728668216, + "grad_norm": 3.812638759613037, + "learning_rate": 8.715037485803936e-06, + "loss": 1.2159, + "step": 46952 + }, + { + "epoch": 0.5869396734918373, + "grad_norm": 2.8313210010528564, + "learning_rate": 8.714172036426156e-06, + "loss": 0.9135, + "step": 46954 + }, + { + "epoch": 0.5869646741168529, + "grad_norm": 2.4182677268981934, + "learning_rate": 8.713306596841063e-06, + "loss": 0.6591, + "step": 46956 + }, + { + "epoch": 0.5869896747418686, + "grad_norm": 8.921876907348633, + "learning_rate": 8.712441167055238e-06, + "loss": 0.9754, + "step": 46958 + }, + { + "epoch": 0.5870146753668841, + "grad_norm": 3.9786078929901123, + "learning_rate": 8.711575747075285e-06, + "loss": 0.7716, + "step": 46960 + }, + { + "epoch": 0.5870396759918998, + "grad_norm": 1.8900395631790161, + "learning_rate": 8.710710336907792e-06, + "loss": 0.9981, + "step": 46962 + }, + { + "epoch": 0.5870646766169154, + "grad_norm": 0.8242027759552002, + "learning_rate": 8.709844936559344e-06, + "loss": 0.6745, + "step": 46964 + }, + { + "epoch": 0.5870896772419311, + "grad_norm": 5.8658342361450195, + "learning_rate": 8.708979546036532e-06, + "loss": 0.8522, + "step": 46966 + }, + { + "epoch": 0.5871146778669467, + "grad_norm": 3.7284796237945557, + "learning_rate": 8.708114165345951e-06, + "loss": 0.6705, + "step": 46968 + }, + { + "epoch": 0.5871396784919622, + "grad_norm": 5.260082244873047, + "learning_rate": 8.707248794494187e-06, + "loss": 0.5243, + "step": 46970 + }, + { + "epoch": 0.5871646791169779, + "grad_norm": 2.167144775390625, + "learning_rate": 8.706383433487836e-06, + "loss": 0.6483, + "step": 46972 + }, + { + "epoch": 0.5871896797419935, + "grad_norm": 0.0029883699025958776, + "learning_rate": 8.705518082333484e-06, + "loss": 0.7268, + "step": 46974 + }, + { + "epoch": 0.5872146803670092, + "grad_norm": 5.5857319831848145, + "learning_rate": 8.70465274103773e-06, + "loss": 1.2863, + "step": 46976 + }, + { + "epoch": 0.5872396809920248, + "grad_norm": 3.4823453426361084, + "learning_rate": 8.703787409607151e-06, + "loss": 0.6824, + "step": 46978 + }, + { + "epoch": 0.5872646816170404, + "grad_norm": 2.49784517288208, + "learning_rate": 8.702922088048346e-06, + "loss": 0.6774, + "step": 46980 + }, + { + "epoch": 0.587289682242056, + "grad_norm": 5.4085235595703125, + "learning_rate": 8.7020567763679e-06, + "loss": 1.713, + "step": 46982 + }, + { + "epoch": 0.5873146828670717, + "grad_norm": 2.722884178161621, + "learning_rate": 8.701191474572409e-06, + "loss": 1.1409, + "step": 46984 + }, + { + "epoch": 0.5873396834920873, + "grad_norm": 2.4025020599365234, + "learning_rate": 8.700326182668457e-06, + "loss": 0.8349, + "step": 46986 + }, + { + "epoch": 0.587364684117103, + "grad_norm": 0.0021710474975407124, + "learning_rate": 8.699460900662644e-06, + "loss": 0.7237, + "step": 46988 + }, + { + "epoch": 0.5873896847421185, + "grad_norm": 3.1275007724761963, + "learning_rate": 8.698595628561549e-06, + "loss": 1.5094, + "step": 46990 + }, + { + "epoch": 0.5874146853671341, + "grad_norm": 1.9158871173858643, + "learning_rate": 8.697730366371762e-06, + "loss": 0.4225, + "step": 46992 + }, + { + "epoch": 0.5874396859921498, + "grad_norm": 6.7176194190979, + "learning_rate": 8.696865114099879e-06, + "loss": 1.2251, + "step": 46994 + }, + { + "epoch": 0.5874646866171654, + "grad_norm": 2.5938832759857178, + "learning_rate": 8.695999871752486e-06, + "loss": 0.4069, + "step": 46996 + }, + { + "epoch": 0.5874896872421811, + "grad_norm": 12.610838890075684, + "learning_rate": 8.695134639336173e-06, + "loss": 0.3971, + "step": 46998 + }, + { + "epoch": 0.5875146878671966, + "grad_norm": 3.30249285697937, + "learning_rate": 8.694269416857532e-06, + "loss": 1.0653, + "step": 47000 + }, + { + "epoch": 0.5875396884922123, + "grad_norm": 0.6491366624832153, + "learning_rate": 8.693404204323154e-06, + "loss": 0.788, + "step": 47002 + }, + { + "epoch": 0.5875646891172279, + "grad_norm": 2.4433352947235107, + "learning_rate": 8.692539001739621e-06, + "loss": 1.4526, + "step": 47004 + }, + { + "epoch": 0.5875896897422436, + "grad_norm": 2.3980047702789307, + "learning_rate": 8.691673809113526e-06, + "loss": 0.469, + "step": 47006 + }, + { + "epoch": 0.5876146903672592, + "grad_norm": 3.0983474254608154, + "learning_rate": 8.690808626451456e-06, + "loss": 0.9734, + "step": 47008 + }, + { + "epoch": 0.5876396909922748, + "grad_norm": 1.9909114837646484, + "learning_rate": 8.689943453760006e-06, + "loss": 0.1095, + "step": 47010 + }, + { + "epoch": 0.5876646916172904, + "grad_norm": 2.250030517578125, + "learning_rate": 8.68907829104576e-06, + "loss": 0.7494, + "step": 47012 + }, + { + "epoch": 0.587689692242306, + "grad_norm": 4.222037315368652, + "learning_rate": 8.688213138315312e-06, + "loss": 0.9326, + "step": 47014 + }, + { + "epoch": 0.5877146928673217, + "grad_norm": 8.357660293579102, + "learning_rate": 8.687347995575244e-06, + "loss": 0.502, + "step": 47016 + }, + { + "epoch": 0.5877396934923373, + "grad_norm": 3.213923454284668, + "learning_rate": 8.686482862832146e-06, + "loss": 0.631, + "step": 47018 + }, + { + "epoch": 0.5877646941173529, + "grad_norm": 4.414656639099121, + "learning_rate": 8.685617740092609e-06, + "loss": 2.5963, + "step": 47020 + }, + { + "epoch": 0.5877896947423685, + "grad_norm": 8.062884330749512, + "learning_rate": 8.684752627363222e-06, + "loss": 0.8114, + "step": 47022 + }, + { + "epoch": 0.5878146953673842, + "grad_norm": 3.383216381072998, + "learning_rate": 8.683887524650574e-06, + "loss": 0.4543, + "step": 47024 + }, + { + "epoch": 0.5878396959923998, + "grad_norm": 3.313127040863037, + "learning_rate": 8.683022431961251e-06, + "loss": 0.4711, + "step": 47026 + }, + { + "epoch": 0.5878646966174155, + "grad_norm": 3.547640800476074, + "learning_rate": 8.68215734930185e-06, + "loss": 1.411, + "step": 47028 + }, + { + "epoch": 0.587889697242431, + "grad_norm": 0.805268406867981, + "learning_rate": 8.681292276678946e-06, + "loss": 1.2559, + "step": 47030 + }, + { + "epoch": 0.5879146978674467, + "grad_norm": 2.8081490993499756, + "learning_rate": 8.680427214099132e-06, + "loss": 1.3808, + "step": 47032 + }, + { + "epoch": 0.5879396984924623, + "grad_norm": 4.103538513183594, + "learning_rate": 8.679562161568999e-06, + "loss": 1.0732, + "step": 47034 + }, + { + "epoch": 0.587964699117478, + "grad_norm": 4.025442600250244, + "learning_rate": 8.678697119095135e-06, + "loss": 1.7223, + "step": 47036 + }, + { + "epoch": 0.5879896997424936, + "grad_norm": 1.0438493490219116, + "learning_rate": 8.677832086684126e-06, + "loss": 0.8882, + "step": 47038 + }, + { + "epoch": 0.5880147003675091, + "grad_norm": 2.5936872959136963, + "learning_rate": 8.676967064342566e-06, + "loss": 1.1812, + "step": 47040 + }, + { + "epoch": 0.5880397009925248, + "grad_norm": 3.602652072906494, + "learning_rate": 8.676102052077033e-06, + "loss": 0.7815, + "step": 47042 + }, + { + "epoch": 0.5880647016175404, + "grad_norm": 0.3110484182834625, + "learning_rate": 8.675237049894119e-06, + "loss": 0.1033, + "step": 47044 + }, + { + "epoch": 0.5880897022425561, + "grad_norm": 0.002333044307306409, + "learning_rate": 8.674372057800411e-06, + "loss": 0.0001, + "step": 47046 + }, + { + "epoch": 0.5881147028675717, + "grad_norm": 2.772921085357666, + "learning_rate": 8.673507075802501e-06, + "loss": 0.7011, + "step": 47048 + }, + { + "epoch": 0.5881397034925873, + "grad_norm": 3.314608573913574, + "learning_rate": 8.672642103906972e-06, + "loss": 1.7679, + "step": 47050 + }, + { + "epoch": 0.5881647041176029, + "grad_norm": 2.6352288722991943, + "learning_rate": 8.671777142120413e-06, + "loss": 0.8962, + "step": 47052 + }, + { + "epoch": 0.5881897047426186, + "grad_norm": 2.5569396018981934, + "learning_rate": 8.670912190449417e-06, + "loss": 0.1754, + "step": 47054 + }, + { + "epoch": 0.5882147053676342, + "grad_norm": 3.624617099761963, + "learning_rate": 8.67004724890056e-06, + "loss": 1.1021, + "step": 47056 + }, + { + "epoch": 0.5882397059926499, + "grad_norm": 2.761626958847046, + "learning_rate": 8.669182317480433e-06, + "loss": 1.5596, + "step": 47058 + }, + { + "epoch": 0.5882647066176654, + "grad_norm": 1.6675957441329956, + "learning_rate": 8.668317396195629e-06, + "loss": 0.1895, + "step": 47060 + }, + { + "epoch": 0.588289707242681, + "grad_norm": 3.9733357429504395, + "learning_rate": 8.66745248505273e-06, + "loss": 1.3211, + "step": 47062 + }, + { + "epoch": 0.5883147078676967, + "grad_norm": 2.383741855621338, + "learning_rate": 8.666587584058325e-06, + "loss": 1.1977, + "step": 47064 + }, + { + "epoch": 0.5883397084927123, + "grad_norm": 3.5717618465423584, + "learning_rate": 8.665722693219005e-06, + "loss": 0.8529, + "step": 47066 + }, + { + "epoch": 0.588364709117728, + "grad_norm": 4.573848247528076, + "learning_rate": 8.664857812541347e-06, + "loss": 0.8844, + "step": 47068 + }, + { + "epoch": 0.5883897097427435, + "grad_norm": 2.8371737003326416, + "learning_rate": 8.663992942031943e-06, + "loss": 1.0062, + "step": 47070 + }, + { + "epoch": 0.5884147103677592, + "grad_norm": 4.2120890617370605, + "learning_rate": 8.66312808169738e-06, + "loss": 1.4412, + "step": 47072 + }, + { + "epoch": 0.5884397109927748, + "grad_norm": 0.9584562182426453, + "learning_rate": 8.662263231544243e-06, + "loss": 0.184, + "step": 47074 + }, + { + "epoch": 0.5884647116177905, + "grad_norm": 2.6684417724609375, + "learning_rate": 8.661398391579123e-06, + "loss": 0.6082, + "step": 47076 + }, + { + "epoch": 0.5884897122428061, + "grad_norm": 3.7450220584869385, + "learning_rate": 8.660533561808603e-06, + "loss": 1.763, + "step": 47078 + }, + { + "epoch": 0.5885147128678216, + "grad_norm": 4.808479309082031, + "learning_rate": 8.659668742239273e-06, + "loss": 0.608, + "step": 47080 + }, + { + "epoch": 0.5885397134928373, + "grad_norm": 4.255490779876709, + "learning_rate": 8.658803932877711e-06, + "loss": 0.9978, + "step": 47082 + }, + { + "epoch": 0.5885647141178529, + "grad_norm": 3.100947141647339, + "learning_rate": 8.657939133730511e-06, + "loss": 1.3185, + "step": 47084 + }, + { + "epoch": 0.5885897147428686, + "grad_norm": 7.875432968139648, + "learning_rate": 8.657074344804256e-06, + "loss": 1.8261, + "step": 47086 + }, + { + "epoch": 0.5886147153678842, + "grad_norm": 4.824685096740723, + "learning_rate": 8.656209566105532e-06, + "loss": 0.8337, + "step": 47088 + }, + { + "epoch": 0.5886397159928998, + "grad_norm": 9.273116111755371, + "learning_rate": 8.655344797640928e-06, + "loss": 1.3314, + "step": 47090 + }, + { + "epoch": 0.5886647166179154, + "grad_norm": 3.1806981563568115, + "learning_rate": 8.65448003941703e-06, + "loss": 1.3673, + "step": 47092 + }, + { + "epoch": 0.5886897172429311, + "grad_norm": 2.6674702167510986, + "learning_rate": 8.653615291440417e-06, + "loss": 1.2519, + "step": 47094 + }, + { + "epoch": 0.5887147178679467, + "grad_norm": 6.4071831703186035, + "learning_rate": 8.652750553717682e-06, + "loss": 1.4441, + "step": 47096 + }, + { + "epoch": 0.5887397184929624, + "grad_norm": 2.151162624359131, + "learning_rate": 8.651885826255407e-06, + "loss": 0.4712, + "step": 47098 + }, + { + "epoch": 0.5887647191179779, + "grad_norm": 2.2430195808410645, + "learning_rate": 8.65102110906018e-06, + "loss": 0.2502, + "step": 47100 + }, + { + "epoch": 0.5887897197429935, + "grad_norm": 4.703644752502441, + "learning_rate": 8.650156402138584e-06, + "loss": 1.1581, + "step": 47102 + }, + { + "epoch": 0.5888147203680092, + "grad_norm": 0.3683857023715973, + "learning_rate": 8.649291705497207e-06, + "loss": 0.7046, + "step": 47104 + }, + { + "epoch": 0.5888397209930248, + "grad_norm": 3.6891708374023438, + "learning_rate": 8.648427019142635e-06, + "loss": 2.0641, + "step": 47106 + }, + { + "epoch": 0.5888647216180405, + "grad_norm": 2.8575267791748047, + "learning_rate": 8.647562343081448e-06, + "loss": 1.3563, + "step": 47108 + }, + { + "epoch": 0.588889722243056, + "grad_norm": 0.009136593900620937, + "learning_rate": 8.646697677320238e-06, + "loss": 0.788, + "step": 47110 + }, + { + "epoch": 0.5889147228680717, + "grad_norm": 3.4800000190734863, + "learning_rate": 8.645833021865586e-06, + "loss": 1.4434, + "step": 47112 + }, + { + "epoch": 0.5889397234930873, + "grad_norm": 1.9878426790237427, + "learning_rate": 8.644968376724078e-06, + "loss": 1.0642, + "step": 47114 + }, + { + "epoch": 0.588964724118103, + "grad_norm": 5.777011871337891, + "learning_rate": 8.6441037419023e-06, + "loss": 2.0118, + "step": 47116 + }, + { + "epoch": 0.5889897247431186, + "grad_norm": 3.106684684753418, + "learning_rate": 8.643239117406838e-06, + "loss": 1.0184, + "step": 47118 + }, + { + "epoch": 0.5890147253681342, + "grad_norm": 3.421872138977051, + "learning_rate": 8.642374503244272e-06, + "loss": 0.769, + "step": 47120 + }, + { + "epoch": 0.5890397259931498, + "grad_norm": 2.8564770221710205, + "learning_rate": 8.64150989942119e-06, + "loss": 0.9177, + "step": 47122 + }, + { + "epoch": 0.5890647266181654, + "grad_norm": 7.480384826660156, + "learning_rate": 8.640645305944177e-06, + "loss": 1.299, + "step": 47124 + }, + { + "epoch": 0.5890897272431811, + "grad_norm": 3.3178298473358154, + "learning_rate": 8.639780722819818e-06, + "loss": 0.881, + "step": 47126 + }, + { + "epoch": 0.5891147278681967, + "grad_norm": 2.9094078540802, + "learning_rate": 8.638916150054694e-06, + "loss": 1.7397, + "step": 47128 + }, + { + "epoch": 0.5891397284932123, + "grad_norm": 6.31366491317749, + "learning_rate": 8.638051587655394e-06, + "loss": 0.6713, + "step": 47130 + }, + { + "epoch": 0.5891647291182279, + "grad_norm": 0.0021959131117910147, + "learning_rate": 8.637187035628503e-06, + "loss": 0.381, + "step": 47132 + }, + { + "epoch": 0.5891897297432436, + "grad_norm": 4.617746829986572, + "learning_rate": 8.636322493980598e-06, + "loss": 1.8105, + "step": 47134 + }, + { + "epoch": 0.5892147303682592, + "grad_norm": 4.830608367919922, + "learning_rate": 8.63545796271827e-06, + "loss": 1.2842, + "step": 47136 + }, + { + "epoch": 0.5892397309932749, + "grad_norm": 3.1577887535095215, + "learning_rate": 8.634593441848102e-06, + "loss": 1.4623, + "step": 47138 + }, + { + "epoch": 0.5892647316182904, + "grad_norm": 2.203953266143799, + "learning_rate": 8.633728931376676e-06, + "loss": 2.258, + "step": 47140 + }, + { + "epoch": 0.589289732243306, + "grad_norm": 15.531360626220703, + "learning_rate": 8.63286443131058e-06, + "loss": 1.1191, + "step": 47142 + }, + { + "epoch": 0.5893147328683217, + "grad_norm": 3.161242961883545, + "learning_rate": 8.631999941656392e-06, + "loss": 1.1213, + "step": 47144 + }, + { + "epoch": 0.5893397334933373, + "grad_norm": 1.9743504524230957, + "learning_rate": 8.6311354624207e-06, + "loss": 0.5247, + "step": 47146 + }, + { + "epoch": 0.589364734118353, + "grad_norm": 5.572348117828369, + "learning_rate": 8.630270993610089e-06, + "loss": 1.2607, + "step": 47148 + }, + { + "epoch": 0.5893897347433685, + "grad_norm": 2.615935802459717, + "learning_rate": 8.629406535231138e-06, + "loss": 0.7329, + "step": 47150 + }, + { + "epoch": 0.5894147353683842, + "grad_norm": 1.0208739042282104, + "learning_rate": 8.628542087290432e-06, + "loss": 0.413, + "step": 47152 + }, + { + "epoch": 0.5894397359933998, + "grad_norm": 5.012424468994141, + "learning_rate": 8.627677649794557e-06, + "loss": 1.4299, + "step": 47154 + }, + { + "epoch": 0.5894647366184155, + "grad_norm": 3.8669896125793457, + "learning_rate": 8.626813222750094e-06, + "loss": 2.2431, + "step": 47156 + }, + { + "epoch": 0.5894897372434311, + "grad_norm": 0.8480882048606873, + "learning_rate": 8.62594880616363e-06, + "loss": 0.607, + "step": 47158 + }, + { + "epoch": 0.5895147378684467, + "grad_norm": 3.9989776611328125, + "learning_rate": 8.625084400041743e-06, + "loss": 1.0901, + "step": 47160 + }, + { + "epoch": 0.5895397384934623, + "grad_norm": 4.924104690551758, + "learning_rate": 8.62422000439102e-06, + "loss": 1.0317, + "step": 47162 + }, + { + "epoch": 0.589564739118478, + "grad_norm": 2.7553799152374268, + "learning_rate": 8.623355619218042e-06, + "loss": 0.8772, + "step": 47164 + }, + { + "epoch": 0.5895897397434936, + "grad_norm": 0.0014809623826295137, + "learning_rate": 8.622491244529392e-06, + "loss": 0.0293, + "step": 47166 + }, + { + "epoch": 0.5896147403685092, + "grad_norm": 3.5087945461273193, + "learning_rate": 8.621626880331656e-06, + "loss": 1.0554, + "step": 47168 + }, + { + "epoch": 0.5896397409935248, + "grad_norm": 3.1534411907196045, + "learning_rate": 8.620762526631415e-06, + "loss": 1.511, + "step": 47170 + }, + { + "epoch": 0.5896647416185404, + "grad_norm": 0.0016865506768226624, + "learning_rate": 8.61989818343525e-06, + "loss": 0.4774, + "step": 47172 + }, + { + "epoch": 0.5896897422435561, + "grad_norm": 2.4861013889312744, + "learning_rate": 8.619033850749747e-06, + "loss": 0.459, + "step": 47174 + }, + { + "epoch": 0.5897147428685717, + "grad_norm": 2.235053539276123, + "learning_rate": 8.618169528581485e-06, + "loss": 0.3526, + "step": 47176 + }, + { + "epoch": 0.5897397434935874, + "grad_norm": 3.124741315841675, + "learning_rate": 8.617305216937049e-06, + "loss": 0.7439, + "step": 47178 + }, + { + "epoch": 0.5897647441186029, + "grad_norm": 2.0272576808929443, + "learning_rate": 8.616440915823021e-06, + "loss": 1.071, + "step": 47180 + }, + { + "epoch": 0.5897897447436186, + "grad_norm": 4.982057094573975, + "learning_rate": 8.615576625245983e-06, + "loss": 1.1286, + "step": 47182 + }, + { + "epoch": 0.5898147453686342, + "grad_norm": 6.489689826965332, + "learning_rate": 8.61471234521252e-06, + "loss": 1.0178, + "step": 47184 + }, + { + "epoch": 0.5898397459936499, + "grad_norm": 0.0055844527669250965, + "learning_rate": 8.61384807572921e-06, + "loss": 0.4293, + "step": 47186 + }, + { + "epoch": 0.5898647466186655, + "grad_norm": 2.8718929290771484, + "learning_rate": 8.612983816802637e-06, + "loss": 0.6435, + "step": 47188 + }, + { + "epoch": 0.589889747243681, + "grad_norm": 5.618518829345703, + "learning_rate": 8.612119568439381e-06, + "loss": 0.9878, + "step": 47190 + }, + { + "epoch": 0.5899147478686967, + "grad_norm": 3.056741237640381, + "learning_rate": 8.611255330646028e-06, + "loss": 0.7186, + "step": 47192 + }, + { + "epoch": 0.5899397484937123, + "grad_norm": 1.076669454574585, + "learning_rate": 8.610391103429159e-06, + "loss": 0.5441, + "step": 47194 + }, + { + "epoch": 0.589964749118728, + "grad_norm": 0.0032711918465793133, + "learning_rate": 8.609526886795354e-06, + "loss": 0.0147, + "step": 47196 + }, + { + "epoch": 0.5899897497437436, + "grad_norm": 4.193239212036133, + "learning_rate": 8.608662680751197e-06, + "loss": 0.6257, + "step": 47198 + }, + { + "epoch": 0.5900147503687592, + "grad_norm": 3.613800048828125, + "learning_rate": 8.607798485303267e-06, + "loss": 0.9749, + "step": 47200 + }, + { + "epoch": 0.5900397509937748, + "grad_norm": 3.082893133163452, + "learning_rate": 8.606934300458147e-06, + "loss": 0.9516, + "step": 47202 + }, + { + "epoch": 0.5900647516187905, + "grad_norm": 4.4221906661987305, + "learning_rate": 8.606070126222417e-06, + "loss": 1.2128, + "step": 47204 + }, + { + "epoch": 0.5900897522438061, + "grad_norm": 3.50720477104187, + "learning_rate": 8.60520596260266e-06, + "loss": 1.7261, + "step": 47206 + }, + { + "epoch": 0.5901147528688218, + "grad_norm": 2.2062783241271973, + "learning_rate": 8.604341809605459e-06, + "loss": 0.3181, + "step": 47208 + }, + { + "epoch": 0.5901397534938373, + "grad_norm": 2.310014247894287, + "learning_rate": 8.603477667237394e-06, + "loss": 0.2758, + "step": 47210 + }, + { + "epoch": 0.5901647541188529, + "grad_norm": 2.6755807399749756, + "learning_rate": 8.602613535505044e-06, + "loss": 1.3576, + "step": 47212 + }, + { + "epoch": 0.5901897547438686, + "grad_norm": 1.848533034324646, + "learning_rate": 8.601749414414991e-06, + "loss": 0.9088, + "step": 47214 + }, + { + "epoch": 0.5902147553688842, + "grad_norm": 1.038521409034729, + "learning_rate": 8.600885303973818e-06, + "loss": 0.9592, + "step": 47216 + }, + { + "epoch": 0.5902397559938999, + "grad_norm": 0.001271526562049985, + "learning_rate": 8.600021204188103e-06, + "loss": 0.4286, + "step": 47218 + }, + { + "epoch": 0.5902647566189154, + "grad_norm": 10.556626319885254, + "learning_rate": 8.59915711506443e-06, + "loss": 0.3127, + "step": 47220 + }, + { + "epoch": 0.5902897572439311, + "grad_norm": 0.0006659234059043229, + "learning_rate": 8.598293036609377e-06, + "loss": 0.912, + "step": 47222 + }, + { + "epoch": 0.5903147578689467, + "grad_norm": 3.380889654159546, + "learning_rate": 8.597428968829529e-06, + "loss": 0.6836, + "step": 47224 + }, + { + "epoch": 0.5903397584939624, + "grad_norm": 1.9704275131225586, + "learning_rate": 8.596564911731462e-06, + "loss": 1.5409, + "step": 47226 + }, + { + "epoch": 0.590364759118978, + "grad_norm": 1.9902667999267578, + "learning_rate": 8.595700865321757e-06, + "loss": 0.5056, + "step": 47228 + }, + { + "epoch": 0.5903897597439935, + "grad_norm": 1.684273600578308, + "learning_rate": 8.594836829606997e-06, + "loss": 1.6847, + "step": 47230 + }, + { + "epoch": 0.5904147603690092, + "grad_norm": 0.0038770409300923347, + "learning_rate": 8.59397280459376e-06, + "loss": 0.0074, + "step": 47232 + }, + { + "epoch": 0.5904397609940248, + "grad_norm": 5.0850982666015625, + "learning_rate": 8.593108790288627e-06, + "loss": 0.9231, + "step": 47234 + }, + { + "epoch": 0.5904647616190405, + "grad_norm": 2.162144184112549, + "learning_rate": 8.592244786698183e-06, + "loss": 0.0874, + "step": 47236 + }, + { + "epoch": 0.5904897622440561, + "grad_norm": 2.7090752124786377, + "learning_rate": 8.591380793828998e-06, + "loss": 1.1354, + "step": 47238 + }, + { + "epoch": 0.5905147628690717, + "grad_norm": 2.5788631439208984, + "learning_rate": 8.59051681168766e-06, + "loss": 0.9078, + "step": 47240 + }, + { + "epoch": 0.5905397634940873, + "grad_norm": 2.4185144901275635, + "learning_rate": 8.589652840280747e-06, + "loss": 0.6014, + "step": 47242 + }, + { + "epoch": 0.590564764119103, + "grad_norm": 2.682469606399536, + "learning_rate": 8.588788879614837e-06, + "loss": 0.1999, + "step": 47244 + }, + { + "epoch": 0.5905897647441186, + "grad_norm": 3.9974565505981445, + "learning_rate": 8.587924929696512e-06, + "loss": 2.3418, + "step": 47246 + }, + { + "epoch": 0.5906147653691343, + "grad_norm": 2.4566404819488525, + "learning_rate": 8.58706099053235e-06, + "loss": 0.6586, + "step": 47248 + }, + { + "epoch": 0.5906397659941498, + "grad_norm": 0.00119927863124758, + "learning_rate": 8.586197062128934e-06, + "loss": 0.3464, + "step": 47250 + }, + { + "epoch": 0.5906647666191654, + "grad_norm": 3.560587167739868, + "learning_rate": 8.58533314449284e-06, + "loss": 0.6581, + "step": 47252 + }, + { + "epoch": 0.5906897672441811, + "grad_norm": 0.0032427154947072268, + "learning_rate": 8.584469237630649e-06, + "loss": 0.4661, + "step": 47254 + }, + { + "epoch": 0.5907147678691967, + "grad_norm": 5.181772232055664, + "learning_rate": 8.58360534154894e-06, + "loss": 1.2494, + "step": 47256 + }, + { + "epoch": 0.5907397684942124, + "grad_norm": 2.920132637023926, + "learning_rate": 8.582741456254292e-06, + "loss": 1.2104, + "step": 47258 + }, + { + "epoch": 0.5907647691192279, + "grad_norm": 6.15516996383667, + "learning_rate": 8.581877581753286e-06, + "loss": 0.9702, + "step": 47260 + }, + { + "epoch": 0.5907897697442436, + "grad_norm": 0.16716191172599792, + "learning_rate": 8.5810137180525e-06, + "loss": 0.0031, + "step": 47262 + }, + { + "epoch": 0.5908147703692592, + "grad_norm": 2.4593162536621094, + "learning_rate": 8.58014986515851e-06, + "loss": 0.4923, + "step": 47264 + }, + { + "epoch": 0.5908397709942749, + "grad_norm": 3.702465295791626, + "learning_rate": 8.5792860230779e-06, + "loss": 0.8687, + "step": 47266 + }, + { + "epoch": 0.5908647716192905, + "grad_norm": 1.9995132684707642, + "learning_rate": 8.578422191817246e-06, + "loss": 0.4706, + "step": 47268 + }, + { + "epoch": 0.590889772244306, + "grad_norm": 2.144761562347412, + "learning_rate": 8.577558371383128e-06, + "loss": 0.4899, + "step": 47270 + }, + { + "epoch": 0.5909147728693217, + "grad_norm": 4.10566520690918, + "learning_rate": 8.576694561782124e-06, + "loss": 1.5597, + "step": 47272 + }, + { + "epoch": 0.5909397734943374, + "grad_norm": 1.8940811157226562, + "learning_rate": 8.575830763020813e-06, + "loss": 0.6795, + "step": 47274 + }, + { + "epoch": 0.590964774119353, + "grad_norm": 2.878227710723877, + "learning_rate": 8.574966975105774e-06, + "loss": 0.494, + "step": 47276 + }, + { + "epoch": 0.5909897747443686, + "grad_norm": 5.232554912567139, + "learning_rate": 8.574103198043586e-06, + "loss": 1.7505, + "step": 47278 + }, + { + "epoch": 0.5910147753693842, + "grad_norm": 8.264715194702148, + "learning_rate": 8.573239431840823e-06, + "loss": 1.6386, + "step": 47280 + }, + { + "epoch": 0.5910397759943998, + "grad_norm": 4.384021282196045, + "learning_rate": 8.572375676504069e-06, + "loss": 1.6055, + "step": 47282 + }, + { + "epoch": 0.5910647766194155, + "grad_norm": 3.453695297241211, + "learning_rate": 8.5715119320399e-06, + "loss": 0.6728, + "step": 47284 + }, + { + "epoch": 0.5910897772444311, + "grad_norm": 3.6964094638824463, + "learning_rate": 8.570648198454893e-06, + "loss": 0.4199, + "step": 47286 + }, + { + "epoch": 0.5911147778694468, + "grad_norm": 1.8326047658920288, + "learning_rate": 8.56978447575563e-06, + "loss": 1.2322, + "step": 47288 + }, + { + "epoch": 0.5911397784944623, + "grad_norm": 2.6212379932403564, + "learning_rate": 8.568920763948683e-06, + "loss": 1.2215, + "step": 47290 + }, + { + "epoch": 0.591164779119478, + "grad_norm": 4.086060523986816, + "learning_rate": 8.568057063040635e-06, + "loss": 1.3355, + "step": 47292 + }, + { + "epoch": 0.5911897797444936, + "grad_norm": 3.9640471935272217, + "learning_rate": 8.567193373038061e-06, + "loss": 1.4163, + "step": 47294 + }, + { + "epoch": 0.5912147803695093, + "grad_norm": 2.7345428466796875, + "learning_rate": 8.56632969394754e-06, + "loss": 1.3895, + "step": 47296 + }, + { + "epoch": 0.5912397809945249, + "grad_norm": 3.8090789318084717, + "learning_rate": 8.565466025775648e-06, + "loss": 0.9075, + "step": 47298 + }, + { + "epoch": 0.5912647816195404, + "grad_norm": 3.870882272720337, + "learning_rate": 8.564602368528963e-06, + "loss": 1.3065, + "step": 47300 + }, + { + "epoch": 0.5912897822445561, + "grad_norm": 3.882728099822998, + "learning_rate": 8.563738722214068e-06, + "loss": 1.2412, + "step": 47302 + }, + { + "epoch": 0.5913147828695717, + "grad_norm": 3.424717903137207, + "learning_rate": 8.562875086837534e-06, + "loss": 0.3658, + "step": 47304 + }, + { + "epoch": 0.5913397834945874, + "grad_norm": 0.0043956199660897255, + "learning_rate": 8.562011462405939e-06, + "loss": 0.0001, + "step": 47306 + }, + { + "epoch": 0.591364784119603, + "grad_norm": 3.0821006298065186, + "learning_rate": 8.56114784892586e-06, + "loss": 1.4342, + "step": 47308 + }, + { + "epoch": 0.5913897847446186, + "grad_norm": 4.67930269241333, + "learning_rate": 8.560284246403878e-06, + "loss": 1.1146, + "step": 47310 + }, + { + "epoch": 0.5914147853696342, + "grad_norm": 3.880385160446167, + "learning_rate": 8.559420654846567e-06, + "loss": 0.3322, + "step": 47312 + }, + { + "epoch": 0.5914397859946499, + "grad_norm": 1.9623053073883057, + "learning_rate": 8.558557074260505e-06, + "loss": 1.1445, + "step": 47314 + }, + { + "epoch": 0.5914647866196655, + "grad_norm": 4.293607234954834, + "learning_rate": 8.557693504652267e-06, + "loss": 0.8738, + "step": 47316 + }, + { + "epoch": 0.5914897872446812, + "grad_norm": 0.0012665835674852133, + "learning_rate": 8.556829946028432e-06, + "loss": 0.1294, + "step": 47318 + }, + { + "epoch": 0.5915147878696967, + "grad_norm": 3.989720582962036, + "learning_rate": 8.555966398395576e-06, + "loss": 1.2897, + "step": 47320 + }, + { + "epoch": 0.5915397884947123, + "grad_norm": 2.840904474258423, + "learning_rate": 8.555102861760276e-06, + "loss": 1.4221, + "step": 47322 + }, + { + "epoch": 0.591564789119728, + "grad_norm": 3.654905319213867, + "learning_rate": 8.554239336129108e-06, + "loss": 0.7559, + "step": 47324 + }, + { + "epoch": 0.5915897897447436, + "grad_norm": 3.9176785945892334, + "learning_rate": 8.553375821508646e-06, + "loss": 1.1691, + "step": 47326 + }, + { + "epoch": 0.5916147903697593, + "grad_norm": 0.0007698422996327281, + "learning_rate": 8.552512317905476e-06, + "loss": 0.4298, + "step": 47328 + }, + { + "epoch": 0.5916397909947748, + "grad_norm": 4.146057605743408, + "learning_rate": 8.551648825326164e-06, + "loss": 0.5925, + "step": 47330 + }, + { + "epoch": 0.5916647916197905, + "grad_norm": 7.4047770500183105, + "learning_rate": 8.550785343777288e-06, + "loss": 0.7417, + "step": 47332 + }, + { + "epoch": 0.5916897922448061, + "grad_norm": 2.0966038703918457, + "learning_rate": 8.549921873265427e-06, + "loss": 1.0154, + "step": 47334 + }, + { + "epoch": 0.5917147928698218, + "grad_norm": 4.010589122772217, + "learning_rate": 8.549058413797156e-06, + "loss": 0.9425, + "step": 47336 + }, + { + "epoch": 0.5917397934948374, + "grad_norm": 3.1667778491973877, + "learning_rate": 8.548194965379048e-06, + "loss": 1.2402, + "step": 47338 + }, + { + "epoch": 0.5917647941198529, + "grad_norm": 3.807462453842163, + "learning_rate": 8.54733152801769e-06, + "loss": 1.3902, + "step": 47340 + }, + { + "epoch": 0.5917897947448686, + "grad_norm": 0.025874124839901924, + "learning_rate": 8.546468101719644e-06, + "loss": 0.8369, + "step": 47342 + }, + { + "epoch": 0.5918147953698842, + "grad_norm": 4.425856113433838, + "learning_rate": 8.54560468649149e-06, + "loss": 1.5995, + "step": 47344 + }, + { + "epoch": 0.5918397959948999, + "grad_norm": 1.9371559619903564, + "learning_rate": 8.544741282339809e-06, + "loss": 0.1755, + "step": 47346 + }, + { + "epoch": 0.5918647966199155, + "grad_norm": 3.5069949626922607, + "learning_rate": 8.54387788927117e-06, + "loss": 2.143, + "step": 47348 + }, + { + "epoch": 0.5918897972449311, + "grad_norm": 0.0011439998634159565, + "learning_rate": 8.54301450729215e-06, + "loss": 0.1201, + "step": 47350 + }, + { + "epoch": 0.5919147978699467, + "grad_norm": 1.87595796585083, + "learning_rate": 8.542151136409325e-06, + "loss": 0.6057, + "step": 47352 + }, + { + "epoch": 0.5919397984949624, + "grad_norm": 3.0434768199920654, + "learning_rate": 8.541287776629278e-06, + "loss": 0.8904, + "step": 47354 + }, + { + "epoch": 0.591964799119978, + "grad_norm": 3.8230128288269043, + "learning_rate": 8.540424427958571e-06, + "loss": 0.4116, + "step": 47356 + }, + { + "epoch": 0.5919897997449937, + "grad_norm": 4.290591239929199, + "learning_rate": 8.539561090403786e-06, + "loss": 1.2417, + "step": 47358 + }, + { + "epoch": 0.5920148003700092, + "grad_norm": 0.003582020988687873, + "learning_rate": 8.538697763971497e-06, + "loss": 0.0001, + "step": 47360 + }, + { + "epoch": 0.5920398009950248, + "grad_norm": 8.919278144836426, + "learning_rate": 8.537834448668278e-06, + "loss": 1.4297, + "step": 47362 + }, + { + "epoch": 0.5920648016200405, + "grad_norm": 7.458706855773926, + "learning_rate": 8.536971144500706e-06, + "loss": 1.0187, + "step": 47364 + }, + { + "epoch": 0.5920898022450561, + "grad_norm": 2.5498220920562744, + "learning_rate": 8.53610785147536e-06, + "loss": 0.4279, + "step": 47366 + }, + { + "epoch": 0.5921148028700718, + "grad_norm": 2.0413997173309326, + "learning_rate": 8.535244569598804e-06, + "loss": 0.1911, + "step": 47368 + }, + { + "epoch": 0.5921398034950873, + "grad_norm": 4.067343235015869, + "learning_rate": 8.534381298877619e-06, + "loss": 0.3699, + "step": 47370 + }, + { + "epoch": 0.592164804120103, + "grad_norm": 4.709250450134277, + "learning_rate": 8.533518039318378e-06, + "loss": 1.0744, + "step": 47372 + }, + { + "epoch": 0.5921898047451186, + "grad_norm": 3.237269163131714, + "learning_rate": 8.532654790927655e-06, + "loss": 0.5732, + "step": 47374 + }, + { + "epoch": 0.5922148053701343, + "grad_norm": 0.11190136522054672, + "learning_rate": 8.531791553712028e-06, + "loss": 0.0028, + "step": 47376 + }, + { + "epoch": 0.5922398059951499, + "grad_norm": 5.095001220703125, + "learning_rate": 8.530928327678067e-06, + "loss": 1.8245, + "step": 47378 + }, + { + "epoch": 0.5922648066201655, + "grad_norm": 3.663844347000122, + "learning_rate": 8.530065112832355e-06, + "loss": 1.2116, + "step": 47380 + }, + { + "epoch": 0.5922898072451811, + "grad_norm": 3.1350290775299072, + "learning_rate": 8.529201909181454e-06, + "loss": 1.0191, + "step": 47382 + }, + { + "epoch": 0.5923148078701967, + "grad_norm": 2.9551727771759033, + "learning_rate": 8.528338716731943e-06, + "loss": 0.2813, + "step": 47384 + }, + { + "epoch": 0.5923398084952124, + "grad_norm": 8.197587966918945, + "learning_rate": 8.527475535490396e-06, + "loss": 0.8047, + "step": 47386 + }, + { + "epoch": 0.592364809120228, + "grad_norm": 6.0554914474487305, + "learning_rate": 8.526612365463385e-06, + "loss": 1.3883, + "step": 47388 + }, + { + "epoch": 0.5923898097452436, + "grad_norm": 4.551660537719727, + "learning_rate": 8.525749206657489e-06, + "loss": 1.8437, + "step": 47390 + }, + { + "epoch": 0.5924148103702592, + "grad_norm": 3.0868029594421387, + "learning_rate": 8.524886059079278e-06, + "loss": 1.3789, + "step": 47392 + }, + { + "epoch": 0.5924398109952749, + "grad_norm": 1.8536230325698853, + "learning_rate": 8.52402292273533e-06, + "loss": 0.8376, + "step": 47394 + }, + { + "epoch": 0.5924648116202905, + "grad_norm": 2.8323941230773926, + "learning_rate": 8.52315979763221e-06, + "loss": 0.798, + "step": 47396 + }, + { + "epoch": 0.5924898122453062, + "grad_norm": 3.3169798851013184, + "learning_rate": 8.522296683776497e-06, + "loss": 1.2119, + "step": 47398 + }, + { + "epoch": 0.5925148128703217, + "grad_norm": 5.241332530975342, + "learning_rate": 8.521433581174762e-06, + "loss": 2.3018, + "step": 47400 + }, + { + "epoch": 0.5925398134953374, + "grad_norm": 2.1469156742095947, + "learning_rate": 8.52057048983358e-06, + "loss": 0.39, + "step": 47402 + }, + { + "epoch": 0.592564814120353, + "grad_norm": 4.820064544677734, + "learning_rate": 8.519707409759525e-06, + "loss": 0.3936, + "step": 47404 + }, + { + "epoch": 0.5925898147453686, + "grad_norm": 3.485015869140625, + "learning_rate": 8.518844340959173e-06, + "loss": 0.5852, + "step": 47406 + }, + { + "epoch": 0.5926148153703843, + "grad_norm": 1.0793367624282837, + "learning_rate": 8.51798128343909e-06, + "loss": 0.5935, + "step": 47408 + }, + { + "epoch": 0.5926398159953998, + "grad_norm": 4.137951850891113, + "learning_rate": 8.517118237205852e-06, + "loss": 1.2806, + "step": 47410 + }, + { + "epoch": 0.5926648166204155, + "grad_norm": 1.8042752742767334, + "learning_rate": 8.516255202266028e-06, + "loss": 0.8854, + "step": 47412 + }, + { + "epoch": 0.5926898172454311, + "grad_norm": 0.5142949819564819, + "learning_rate": 8.515392178626197e-06, + "loss": 0.1124, + "step": 47414 + }, + { + "epoch": 0.5927148178704468, + "grad_norm": 0.003330195089802146, + "learning_rate": 8.514529166292929e-06, + "loss": 1.3181, + "step": 47416 + }, + { + "epoch": 0.5927398184954624, + "grad_norm": 2.697805404663086, + "learning_rate": 8.513666165272797e-06, + "loss": 0.8275, + "step": 47418 + }, + { + "epoch": 0.592764819120478, + "grad_norm": 1.767045259475708, + "learning_rate": 8.51280317557238e-06, + "loss": 0.2282, + "step": 47420 + }, + { + "epoch": 0.5927898197454936, + "grad_norm": 3.707907199859619, + "learning_rate": 8.511940197198237e-06, + "loss": 1.3483, + "step": 47422 + }, + { + "epoch": 0.5928148203705093, + "grad_norm": 3.65726900100708, + "learning_rate": 8.511077230156945e-06, + "loss": 0.8037, + "step": 47424 + }, + { + "epoch": 0.5928398209955249, + "grad_norm": 4.507229804992676, + "learning_rate": 8.51021427445508e-06, + "loss": 0.4923, + "step": 47426 + }, + { + "epoch": 0.5928648216205405, + "grad_norm": 3.2708816528320312, + "learning_rate": 8.509351330099213e-06, + "loss": 0.6164, + "step": 47428 + }, + { + "epoch": 0.5928898222455561, + "grad_norm": 3.1268463134765625, + "learning_rate": 8.508488397095917e-06, + "loss": 0.6115, + "step": 47430 + }, + { + "epoch": 0.5929148228705717, + "grad_norm": 5.801562309265137, + "learning_rate": 8.507625475451765e-06, + "loss": 0.8409, + "step": 47432 + }, + { + "epoch": 0.5929398234955874, + "grad_norm": 1.7933591604232788, + "learning_rate": 8.506762565173322e-06, + "loss": 0.028, + "step": 47434 + }, + { + "epoch": 0.592964824120603, + "grad_norm": 2.3482489585876465, + "learning_rate": 8.505899666267163e-06, + "loss": 0.8457, + "step": 47436 + }, + { + "epoch": 0.5929898247456187, + "grad_norm": 3.673288583755493, + "learning_rate": 8.505036778739863e-06, + "loss": 1.1854, + "step": 47438 + }, + { + "epoch": 0.5930148253706342, + "grad_norm": 0.8558868169784546, + "learning_rate": 8.50417390259799e-06, + "loss": 0.3276, + "step": 47440 + }, + { + "epoch": 0.5930398259956499, + "grad_norm": 2.3802545070648193, + "learning_rate": 8.50331103784812e-06, + "loss": 0.8083, + "step": 47442 + }, + { + "epoch": 0.5930648266206655, + "grad_norm": 1.9091694355010986, + "learning_rate": 8.502448184496818e-06, + "loss": 0.6873, + "step": 47444 + }, + { + "epoch": 0.5930898272456812, + "grad_norm": 3.761066436767578, + "learning_rate": 8.501585342550665e-06, + "loss": 1.041, + "step": 47446 + }, + { + "epoch": 0.5931148278706968, + "grad_norm": 2.5893466472625732, + "learning_rate": 8.500722512016219e-06, + "loss": 0.6118, + "step": 47448 + }, + { + "epoch": 0.5931398284957123, + "grad_norm": 2.1747171878814697, + "learning_rate": 8.49985969290006e-06, + "loss": 0.4062, + "step": 47450 + }, + { + "epoch": 0.593164829120728, + "grad_norm": 3.796910285949707, + "learning_rate": 8.49899688520876e-06, + "loss": 1.2833, + "step": 47452 + }, + { + "epoch": 0.5931898297457436, + "grad_norm": 3.7279696464538574, + "learning_rate": 8.498134088948884e-06, + "loss": 1.4995, + "step": 47454 + }, + { + "epoch": 0.5932148303707593, + "grad_norm": 4.665502071380615, + "learning_rate": 8.497271304127008e-06, + "loss": 1.0604, + "step": 47456 + }, + { + "epoch": 0.5932398309957749, + "grad_norm": 3.271470546722412, + "learning_rate": 8.496408530749705e-06, + "loss": 0.6627, + "step": 47458 + }, + { + "epoch": 0.5932648316207905, + "grad_norm": 2.877697467803955, + "learning_rate": 8.495545768823539e-06, + "loss": 1.3438, + "step": 47460 + }, + { + "epoch": 0.5932898322458061, + "grad_norm": 3.7261340618133545, + "learning_rate": 8.49468301835508e-06, + "loss": 1.9682, + "step": 47462 + }, + { + "epoch": 0.5933148328708218, + "grad_norm": 2.1201376914978027, + "learning_rate": 8.493820279350907e-06, + "loss": 0.9135, + "step": 47464 + }, + { + "epoch": 0.5933398334958374, + "grad_norm": 0.018217163160443306, + "learning_rate": 8.492957551817583e-06, + "loss": 0.0445, + "step": 47466 + }, + { + "epoch": 0.5933648341208531, + "grad_norm": 0.00265818671323359, + "learning_rate": 8.492094835761683e-06, + "loss": 0.7733, + "step": 47468 + }, + { + "epoch": 0.5933898347458686, + "grad_norm": 4.267189979553223, + "learning_rate": 8.491232131189774e-06, + "loss": 0.7204, + "step": 47470 + }, + { + "epoch": 0.5934148353708842, + "grad_norm": 3.24220609664917, + "learning_rate": 8.490369438108432e-06, + "loss": 1.2439, + "step": 47472 + }, + { + "epoch": 0.5934398359958999, + "grad_norm": 0.5679329037666321, + "learning_rate": 8.489506756524218e-06, + "loss": 0.117, + "step": 47474 + }, + { + "epoch": 0.5934648366209155, + "grad_norm": 2.9987716674804688, + "learning_rate": 8.488644086443708e-06, + "loss": 1.2888, + "step": 47476 + }, + { + "epoch": 0.5934898372459312, + "grad_norm": 4.613471031188965, + "learning_rate": 8.487781427873472e-06, + "loss": 0.6571, + "step": 47478 + }, + { + "epoch": 0.5935148378709467, + "grad_norm": 3.145031213760376, + "learning_rate": 8.486918780820078e-06, + "loss": 0.924, + "step": 47480 + }, + { + "epoch": 0.5935398384959624, + "grad_norm": 2.098414421081543, + "learning_rate": 8.486056145290096e-06, + "loss": 0.2215, + "step": 47482 + }, + { + "epoch": 0.593564839120978, + "grad_norm": 0.00732562318444252, + "learning_rate": 8.4851935212901e-06, + "loss": 0.2378, + "step": 47484 + }, + { + "epoch": 0.5935898397459937, + "grad_norm": 0.9435462951660156, + "learning_rate": 8.484330908826652e-06, + "loss": 0.6223, + "step": 47486 + }, + { + "epoch": 0.5936148403710093, + "grad_norm": 3.0066580772399902, + "learning_rate": 8.483468307906326e-06, + "loss": 1.1661, + "step": 47488 + }, + { + "epoch": 0.5936398409960248, + "grad_norm": 0.008482242934405804, + "learning_rate": 8.48260571853569e-06, + "loss": 0.2648, + "step": 47490 + }, + { + "epoch": 0.5936648416210405, + "grad_norm": 3.922107458114624, + "learning_rate": 8.481743140721315e-06, + "loss": 0.2613, + "step": 47492 + }, + { + "epoch": 0.5936898422460561, + "grad_norm": 3.23583984375, + "learning_rate": 8.480880574469772e-06, + "loss": 0.8721, + "step": 47494 + }, + { + "epoch": 0.5937148428710718, + "grad_norm": 0.0019693062640726566, + "learning_rate": 8.480018019787624e-06, + "loss": 0.8137, + "step": 47496 + }, + { + "epoch": 0.5937398434960874, + "grad_norm": 1.690203070640564, + "learning_rate": 8.479155476681449e-06, + "loss": 0.5651, + "step": 47498 + }, + { + "epoch": 0.593764844121103, + "grad_norm": 2.362058162689209, + "learning_rate": 8.478292945157807e-06, + "loss": 1.7239, + "step": 47500 + }, + { + "epoch": 0.5937898447461186, + "grad_norm": 3.477771520614624, + "learning_rate": 8.47743042522327e-06, + "loss": 1.671, + "step": 47502 + }, + { + "epoch": 0.5938148453711343, + "grad_norm": 2.835671901702881, + "learning_rate": 8.47656791688441e-06, + "loss": 0.2, + "step": 47504 + }, + { + "epoch": 0.5938398459961499, + "grad_norm": 3.3049561977386475, + "learning_rate": 8.475705420147791e-06, + "loss": 0.3658, + "step": 47506 + }, + { + "epoch": 0.5938648466211656, + "grad_norm": 3.913978099822998, + "learning_rate": 8.474842935019984e-06, + "loss": 0.5246, + "step": 47508 + }, + { + "epoch": 0.5938898472461811, + "grad_norm": 1.060983657836914, + "learning_rate": 8.473980461507562e-06, + "loss": 0.0597, + "step": 47510 + }, + { + "epoch": 0.5939148478711967, + "grad_norm": 1.2472914457321167, + "learning_rate": 8.473117999617085e-06, + "loss": 0.6456, + "step": 47512 + }, + { + "epoch": 0.5939398484962124, + "grad_norm": 4.12636661529541, + "learning_rate": 8.472255549355126e-06, + "loss": 1.2503, + "step": 47514 + }, + { + "epoch": 0.593964849121228, + "grad_norm": 1.2255557775497437, + "learning_rate": 8.471393110728252e-06, + "loss": 0.2962, + "step": 47516 + }, + { + "epoch": 0.5939898497462437, + "grad_norm": 3.8319549560546875, + "learning_rate": 8.470530683743032e-06, + "loss": 0.6412, + "step": 47518 + }, + { + "epoch": 0.5940148503712592, + "grad_norm": 5.665881633758545, + "learning_rate": 8.469668268406035e-06, + "loss": 1.6676, + "step": 47520 + }, + { + "epoch": 0.5940398509962749, + "grad_norm": 2.410871744155884, + "learning_rate": 8.468805864723828e-06, + "loss": 0.1906, + "step": 47522 + }, + { + "epoch": 0.5940648516212905, + "grad_norm": 1.1913729906082153, + "learning_rate": 8.467943472702979e-06, + "loss": 0.0094, + "step": 47524 + }, + { + "epoch": 0.5940898522463062, + "grad_norm": 0.0041665066964924335, + "learning_rate": 8.467081092350056e-06, + "loss": 0.9611, + "step": 47526 + }, + { + "epoch": 0.5941148528713218, + "grad_norm": 3.3223400115966797, + "learning_rate": 8.466218723671626e-06, + "loss": 0.3916, + "step": 47528 + }, + { + "epoch": 0.5941398534963374, + "grad_norm": 3.501138210296631, + "learning_rate": 8.465356366674257e-06, + "loss": 1.8277, + "step": 47530 + }, + { + "epoch": 0.594164854121353, + "grad_norm": 3.910233736038208, + "learning_rate": 8.464494021364517e-06, + "loss": 0.6144, + "step": 47532 + }, + { + "epoch": 0.5941898547463687, + "grad_norm": 5.030078411102295, + "learning_rate": 8.463631687748974e-06, + "loss": 1.1191, + "step": 47534 + }, + { + "epoch": 0.5942148553713843, + "grad_norm": 2.845486879348755, + "learning_rate": 8.462769365834197e-06, + "loss": 0.0653, + "step": 47536 + }, + { + "epoch": 0.5942398559964, + "grad_norm": 2.460099935531616, + "learning_rate": 8.46190705562675e-06, + "loss": 1.49, + "step": 47538 + }, + { + "epoch": 0.5942648566214155, + "grad_norm": 3.7048134803771973, + "learning_rate": 8.461044757133198e-06, + "loss": 0.7282, + "step": 47540 + }, + { + "epoch": 0.5942898572464311, + "grad_norm": 4.449499130249023, + "learning_rate": 8.460182470360114e-06, + "loss": 1.5213, + "step": 47542 + }, + { + "epoch": 0.5943148578714468, + "grad_norm": 0.004981089383363724, + "learning_rate": 8.459320195314063e-06, + "loss": 0.7215, + "step": 47544 + }, + { + "epoch": 0.5943398584964624, + "grad_norm": 10.375019073486328, + "learning_rate": 8.458457932001608e-06, + "loss": 0.9753, + "step": 47546 + }, + { + "epoch": 0.5943648591214781, + "grad_norm": 4.60299015045166, + "learning_rate": 8.457595680429322e-06, + "loss": 0.5237, + "step": 47548 + }, + { + "epoch": 0.5943898597464936, + "grad_norm": 3.4988455772399902, + "learning_rate": 8.456733440603773e-06, + "loss": 0.4968, + "step": 47550 + }, + { + "epoch": 0.5944148603715093, + "grad_norm": 5.680421352386475, + "learning_rate": 8.455871212531519e-06, + "loss": 0.858, + "step": 47552 + }, + { + "epoch": 0.5944398609965249, + "grad_norm": 1.1837655305862427, + "learning_rate": 8.455008996219132e-06, + "loss": 1.06, + "step": 47554 + }, + { + "epoch": 0.5944648616215406, + "grad_norm": 3.5862691402435303, + "learning_rate": 8.45414679167318e-06, + "loss": 1.0237, + "step": 47556 + }, + { + "epoch": 0.5944898622465562, + "grad_norm": 2.5382003784179688, + "learning_rate": 8.453284598900224e-06, + "loss": 0.5985, + "step": 47558 + }, + { + "epoch": 0.5945148628715717, + "grad_norm": 3.3408379554748535, + "learning_rate": 8.452422417906836e-06, + "loss": 2.2634, + "step": 47560 + }, + { + "epoch": 0.5945398634965874, + "grad_norm": 2.9588353633880615, + "learning_rate": 8.451560248699583e-06, + "loss": 1.1482, + "step": 47562 + }, + { + "epoch": 0.594564864121603, + "grad_norm": 3.472317934036255, + "learning_rate": 8.450698091285023e-06, + "loss": 0.4947, + "step": 47564 + }, + { + "epoch": 0.5945898647466187, + "grad_norm": 12.107619285583496, + "learning_rate": 8.44983594566973e-06, + "loss": 1.1713, + "step": 47566 + }, + { + "epoch": 0.5946148653716343, + "grad_norm": 4.301192283630371, + "learning_rate": 8.448973811860266e-06, + "loss": 0.8547, + "step": 47568 + }, + { + "epoch": 0.5946398659966499, + "grad_norm": 1.1277706623077393, + "learning_rate": 8.448111689863199e-06, + "loss": 0.7466, + "step": 47570 + }, + { + "epoch": 0.5946648666216655, + "grad_norm": 3.762073516845703, + "learning_rate": 8.447249579685094e-06, + "loss": 0.8756, + "step": 47572 + }, + { + "epoch": 0.5946898672466812, + "grad_norm": 2.7607786655426025, + "learning_rate": 8.446387481332514e-06, + "loss": 0.6347, + "step": 47574 + }, + { + "epoch": 0.5947148678716968, + "grad_norm": 1.6208261251449585, + "learning_rate": 8.445525394812032e-06, + "loss": 0.7253, + "step": 47576 + }, + { + "epoch": 0.5947398684967125, + "grad_norm": 0.5673118233680725, + "learning_rate": 8.444663320130206e-06, + "loss": 0.6331, + "step": 47578 + }, + { + "epoch": 0.594764869121728, + "grad_norm": 1.342686653137207, + "learning_rate": 8.443801257293605e-06, + "loss": 0.7713, + "step": 47580 + }, + { + "epoch": 0.5947898697467436, + "grad_norm": 0.0010133213363587856, + "learning_rate": 8.442939206308791e-06, + "loss": 0.6497, + "step": 47582 + }, + { + "epoch": 0.5948148703717593, + "grad_norm": 3.136582612991333, + "learning_rate": 8.442077167182334e-06, + "loss": 0.2144, + "step": 47584 + }, + { + "epoch": 0.5948398709967749, + "grad_norm": 0.888627827167511, + "learning_rate": 8.441215139920796e-06, + "loss": 0.7517, + "step": 47586 + }, + { + "epoch": 0.5948648716217906, + "grad_norm": 0.00410893838852644, + "learning_rate": 8.440353124530746e-06, + "loss": 1.1816, + "step": 47588 + }, + { + "epoch": 0.5948898722468061, + "grad_norm": 3.4291555881500244, + "learning_rate": 8.439491121018743e-06, + "loss": 0.7273, + "step": 47590 + }, + { + "epoch": 0.5949148728718218, + "grad_norm": 2.71785831451416, + "learning_rate": 8.438629129391356e-06, + "loss": 0.8477, + "step": 47592 + }, + { + "epoch": 0.5949398734968374, + "grad_norm": 2.5081875324249268, + "learning_rate": 8.437767149655149e-06, + "loss": 0.6541, + "step": 47594 + }, + { + "epoch": 0.5949648741218531, + "grad_norm": 4.056326389312744, + "learning_rate": 8.436905181816685e-06, + "loss": 1.5964, + "step": 47596 + }, + { + "epoch": 0.5949898747468687, + "grad_norm": 6.521424770355225, + "learning_rate": 8.436043225882531e-06, + "loss": 0.901, + "step": 47598 + }, + { + "epoch": 0.5950148753718842, + "grad_norm": 6.66534948348999, + "learning_rate": 8.435181281859251e-06, + "loss": 1.5986, + "step": 47600 + }, + { + "epoch": 0.5950398759968999, + "grad_norm": 7.640644073486328, + "learning_rate": 8.434319349753411e-06, + "loss": 1.9942, + "step": 47602 + }, + { + "epoch": 0.5950648766219155, + "grad_norm": 6.052887916564941, + "learning_rate": 8.43345742957157e-06, + "loss": 1.4402, + "step": 47604 + }, + { + "epoch": 0.5950898772469312, + "grad_norm": 2.509279489517212, + "learning_rate": 8.432595521320298e-06, + "loss": 0.9742, + "step": 47606 + }, + { + "epoch": 0.5951148778719468, + "grad_norm": 2.322585105895996, + "learning_rate": 8.431733625006155e-06, + "loss": 0.2597, + "step": 47608 + }, + { + "epoch": 0.5951398784969624, + "grad_norm": 6.905354976654053, + "learning_rate": 8.430871740635707e-06, + "loss": 1.5856, + "step": 47610 + }, + { + "epoch": 0.595164879121978, + "grad_norm": 2.711615562438965, + "learning_rate": 8.430009868215519e-06, + "loss": 0.8673, + "step": 47612 + }, + { + "epoch": 0.5951898797469937, + "grad_norm": 3.0351598262786865, + "learning_rate": 8.429148007752156e-06, + "loss": 0.9497, + "step": 47614 + }, + { + "epoch": 0.5952148803720093, + "grad_norm": 3.147343158721924, + "learning_rate": 8.428286159252176e-06, + "loss": 1.2271, + "step": 47616 + }, + { + "epoch": 0.595239880997025, + "grad_norm": 0.009384600445628166, + "learning_rate": 8.427424322722147e-06, + "loss": 0.3392, + "step": 47618 + }, + { + "epoch": 0.5952648816220405, + "grad_norm": 4.795798301696777, + "learning_rate": 8.426562498168632e-06, + "loss": 1.0094, + "step": 47620 + }, + { + "epoch": 0.5952898822470561, + "grad_norm": 4.109121799468994, + "learning_rate": 8.425700685598194e-06, + "loss": 1.484, + "step": 47622 + }, + { + "epoch": 0.5953148828720718, + "grad_norm": 3.851533889770508, + "learning_rate": 8.4248388850174e-06, + "loss": 1.7007, + "step": 47624 + }, + { + "epoch": 0.5953398834970874, + "grad_norm": 2.708261728286743, + "learning_rate": 8.423977096432807e-06, + "loss": 0.1972, + "step": 47626 + }, + { + "epoch": 0.5953648841221031, + "grad_norm": 2.978123188018799, + "learning_rate": 8.423115319850984e-06, + "loss": 1.3382, + "step": 47628 + }, + { + "epoch": 0.5953898847471186, + "grad_norm": 0.0011633193353191018, + "learning_rate": 8.42225355527849e-06, + "loss": 0.7281, + "step": 47630 + }, + { + "epoch": 0.5954148853721343, + "grad_norm": 2.230841875076294, + "learning_rate": 8.42139180272189e-06, + "loss": 0.3991, + "step": 47632 + }, + { + "epoch": 0.5954398859971499, + "grad_norm": 3.2644128799438477, + "learning_rate": 8.420530062187747e-06, + "loss": 0.4002, + "step": 47634 + }, + { + "epoch": 0.5954648866221656, + "grad_norm": 3.9905197620391846, + "learning_rate": 8.419668333682623e-06, + "loss": 1.4151, + "step": 47636 + }, + { + "epoch": 0.5954898872471812, + "grad_norm": 3.3449506759643555, + "learning_rate": 8.418806617213083e-06, + "loss": 0.7865, + "step": 47638 + }, + { + "epoch": 0.5955148878721968, + "grad_norm": 5.410299777984619, + "learning_rate": 8.417944912785687e-06, + "loss": 3.4471, + "step": 47640 + }, + { + "epoch": 0.5955398884972124, + "grad_norm": 3.9812328815460205, + "learning_rate": 8.417083220407e-06, + "loss": 0.8482, + "step": 47642 + }, + { + "epoch": 0.595564889122228, + "grad_norm": 4.254965782165527, + "learning_rate": 8.416221540083584e-06, + "loss": 0.924, + "step": 47644 + }, + { + "epoch": 0.5955898897472437, + "grad_norm": 4.965426921844482, + "learning_rate": 8.415359871821999e-06, + "loss": 2.1045, + "step": 47646 + }, + { + "epoch": 0.5956148903722593, + "grad_norm": 0.00300826714374125, + "learning_rate": 8.41449821562881e-06, + "loss": 0.1882, + "step": 47648 + }, + { + "epoch": 0.5956398909972749, + "grad_norm": 4.156161785125732, + "learning_rate": 8.413636571510577e-06, + "loss": 1.8771, + "step": 47650 + }, + { + "epoch": 0.5956648916222905, + "grad_norm": 0.41223642230033875, + "learning_rate": 8.412774939473865e-06, + "loss": 0.1884, + "step": 47652 + }, + { + "epoch": 0.5956898922473062, + "grad_norm": 0.8655205965042114, + "learning_rate": 8.411913319525236e-06, + "loss": 0.015, + "step": 47654 + }, + { + "epoch": 0.5957148928723218, + "grad_norm": 2.859203338623047, + "learning_rate": 8.411051711671247e-06, + "loss": 0.6813, + "step": 47656 + }, + { + "epoch": 0.5957398934973375, + "grad_norm": 4.692699909210205, + "learning_rate": 8.410190115918465e-06, + "loss": 2.0445, + "step": 47658 + }, + { + "epoch": 0.595764894122353, + "grad_norm": 1.6639662981033325, + "learning_rate": 8.40932853227345e-06, + "loss": 0.1255, + "step": 47660 + }, + { + "epoch": 0.5957898947473687, + "grad_norm": 2.63173508644104, + "learning_rate": 8.408466960742765e-06, + "loss": 0.7721, + "step": 47662 + }, + { + "epoch": 0.5958148953723843, + "grad_norm": 4.9616265296936035, + "learning_rate": 8.40760540133297e-06, + "loss": 0.8932, + "step": 47664 + }, + { + "epoch": 0.5958398959974, + "grad_norm": 2.2633426189422607, + "learning_rate": 8.406743854050627e-06, + "loss": 0.5748, + "step": 47666 + }, + { + "epoch": 0.5958648966224156, + "grad_norm": 10.733400344848633, + "learning_rate": 8.405882318902298e-06, + "loss": 1.172, + "step": 47668 + }, + { + "epoch": 0.5958898972474311, + "grad_norm": 1.4537155628204346, + "learning_rate": 8.405020795894542e-06, + "loss": 0.4006, + "step": 47670 + }, + { + "epoch": 0.5959148978724468, + "grad_norm": 1.335782766342163, + "learning_rate": 8.404159285033925e-06, + "loss": 0.4955, + "step": 47672 + }, + { + "epoch": 0.5959398984974624, + "grad_norm": 0.004610566422343254, + "learning_rate": 8.403297786327002e-06, + "loss": 0.2329, + "step": 47674 + }, + { + "epoch": 0.5959648991224781, + "grad_norm": 2.2245874404907227, + "learning_rate": 8.40243629978034e-06, + "loss": 0.9254, + "step": 47676 + }, + { + "epoch": 0.5959898997474937, + "grad_norm": 0.004011936020106077, + "learning_rate": 8.401574825400496e-06, + "loss": 0.0579, + "step": 47678 + }, + { + "epoch": 0.5960149003725093, + "grad_norm": 1.7123279571533203, + "learning_rate": 8.400713363194033e-06, + "loss": 0.4161, + "step": 47680 + }, + { + "epoch": 0.5960399009975249, + "grad_norm": 0.616001307964325, + "learning_rate": 8.399851913167511e-06, + "loss": 0.4617, + "step": 47682 + }, + { + "epoch": 0.5960649016225406, + "grad_norm": 6.215956687927246, + "learning_rate": 8.398990475327488e-06, + "loss": 0.7537, + "step": 47684 + }, + { + "epoch": 0.5960899022475562, + "grad_norm": 6.248382568359375, + "learning_rate": 8.398129049680528e-06, + "loss": 1.9875, + "step": 47686 + }, + { + "epoch": 0.5961149028725719, + "grad_norm": 3.447770833969116, + "learning_rate": 8.397267636233192e-06, + "loss": 1.1218, + "step": 47688 + }, + { + "epoch": 0.5961399034975874, + "grad_norm": 3.03847599029541, + "learning_rate": 8.396406234992038e-06, + "loss": 1.3194, + "step": 47690 + }, + { + "epoch": 0.596164904122603, + "grad_norm": 3.2797389030456543, + "learning_rate": 8.395544845963626e-06, + "loss": 0.4901, + "step": 47692 + }, + { + "epoch": 0.5961899047476187, + "grad_norm": 4.470404148101807, + "learning_rate": 8.394683469154524e-06, + "loss": 0.9417, + "step": 47694 + }, + { + "epoch": 0.5962149053726343, + "grad_norm": 4.38192892074585, + "learning_rate": 8.39382210457128e-06, + "loss": 0.8145, + "step": 47696 + }, + { + "epoch": 0.59623990599765, + "grad_norm": 4.97644567489624, + "learning_rate": 8.39296075222046e-06, + "loss": 0.6957, + "step": 47698 + }, + { + "epoch": 0.5962649066226655, + "grad_norm": 3.830737352371216, + "learning_rate": 8.392099412108625e-06, + "loss": 0.8736, + "step": 47700 + }, + { + "epoch": 0.5962899072476812, + "grad_norm": 2.2086141109466553, + "learning_rate": 8.391238084242332e-06, + "loss": 0.0237, + "step": 47702 + }, + { + "epoch": 0.5963149078726968, + "grad_norm": 4.1567769050598145, + "learning_rate": 8.390376768628143e-06, + "loss": 0.9638, + "step": 47704 + }, + { + "epoch": 0.5963399084977125, + "grad_norm": 1.0488908290863037, + "learning_rate": 8.389515465272619e-06, + "loss": 0.6426, + "step": 47706 + }, + { + "epoch": 0.5963649091227281, + "grad_norm": 2.9286551475524902, + "learning_rate": 8.388654174182316e-06, + "loss": 0.6938, + "step": 47708 + }, + { + "epoch": 0.5963899097477436, + "grad_norm": 0.003558514406904578, + "learning_rate": 8.387792895363793e-06, + "loss": 0.686, + "step": 47710 + }, + { + "epoch": 0.5964149103727593, + "grad_norm": 3.8356733322143555, + "learning_rate": 8.386931628823613e-06, + "loss": 1.4658, + "step": 47712 + }, + { + "epoch": 0.5964399109977749, + "grad_norm": 2.730175733566284, + "learning_rate": 8.386070374568334e-06, + "loss": 1.3886, + "step": 47714 + }, + { + "epoch": 0.5964649116227906, + "grad_norm": 0.20095424354076385, + "learning_rate": 8.385209132604512e-06, + "loss": 0.1833, + "step": 47716 + }, + { + "epoch": 0.5964899122478062, + "grad_norm": 3.9161057472229004, + "learning_rate": 8.38434790293871e-06, + "loss": 1.1234, + "step": 47718 + }, + { + "epoch": 0.5965149128728218, + "grad_norm": 4.0450968742370605, + "learning_rate": 8.38348668557749e-06, + "loss": 1.3106, + "step": 47720 + }, + { + "epoch": 0.5965399134978374, + "grad_norm": 1.9642082452774048, + "learning_rate": 8.382625480527403e-06, + "loss": 0.6971, + "step": 47722 + }, + { + "epoch": 0.5965649141228531, + "grad_norm": 3.653282403945923, + "learning_rate": 8.381764287795014e-06, + "loss": 2.1426, + "step": 47724 + }, + { + "epoch": 0.5965899147478687, + "grad_norm": 2.6064345836639404, + "learning_rate": 8.380903107386877e-06, + "loss": 0.3087, + "step": 47726 + }, + { + "epoch": 0.5966149153728844, + "grad_norm": 1.58620285987854, + "learning_rate": 8.380041939309553e-06, + "loss": 0.2564, + "step": 47728 + }, + { + "epoch": 0.5966399159978999, + "grad_norm": 4.123347759246826, + "learning_rate": 8.3791807835696e-06, + "loss": 1.5537, + "step": 47730 + }, + { + "epoch": 0.5966649166229155, + "grad_norm": 11.150726318359375, + "learning_rate": 8.37831964017358e-06, + "loss": 0.8917, + "step": 47732 + }, + { + "epoch": 0.5966899172479312, + "grad_norm": 4.721108913421631, + "learning_rate": 8.377458509128048e-06, + "loss": 0.3496, + "step": 47734 + }, + { + "epoch": 0.5967149178729468, + "grad_norm": 0.0008587987977080047, + "learning_rate": 8.37659739043956e-06, + "loss": 1.2467, + "step": 47736 + }, + { + "epoch": 0.5967399184979625, + "grad_norm": 8.22646427154541, + "learning_rate": 8.375736284114676e-06, + "loss": 1.0428, + "step": 47738 + }, + { + "epoch": 0.596764919122978, + "grad_norm": 3.5540196895599365, + "learning_rate": 8.374875190159957e-06, + "loss": 1.0863, + "step": 47740 + }, + { + "epoch": 0.5967899197479937, + "grad_norm": 0.14723001420497894, + "learning_rate": 8.374014108581956e-06, + "loss": 0.5009, + "step": 47742 + }, + { + "epoch": 0.5968149203730093, + "grad_norm": 5.2079339027404785, + "learning_rate": 8.373153039387237e-06, + "loss": 1.6527, + "step": 47744 + }, + { + "epoch": 0.596839920998025, + "grad_norm": 5.297325134277344, + "learning_rate": 8.372291982582357e-06, + "loss": 1.7647, + "step": 47746 + }, + { + "epoch": 0.5968649216230406, + "grad_norm": 0.0012024883180856705, + "learning_rate": 8.371430938173866e-06, + "loss": 0.7675, + "step": 47748 + }, + { + "epoch": 0.5968899222480561, + "grad_norm": 3.051769495010376, + "learning_rate": 8.370569906168328e-06, + "loss": 1.3727, + "step": 47750 + }, + { + "epoch": 0.5969149228730718, + "grad_norm": 3.9969234466552734, + "learning_rate": 8.3697088865723e-06, + "loss": 1.1467, + "step": 47752 + }, + { + "epoch": 0.5969399234980874, + "grad_norm": 0.0024419608525931835, + "learning_rate": 8.368847879392334e-06, + "loss": 0.2264, + "step": 47754 + }, + { + "epoch": 0.5969649241231031, + "grad_norm": 5.025859355926514, + "learning_rate": 8.367986884634996e-06, + "loss": 0.7662, + "step": 47756 + }, + { + "epoch": 0.5969899247481187, + "grad_norm": 3.230670928955078, + "learning_rate": 8.367125902306843e-06, + "loss": 1.4511, + "step": 47758 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 18.028593063354492, + "learning_rate": 8.366264932414424e-06, + "loss": 2.4523, + "step": 47760 + }, + { + "epoch": 0.5970399259981499, + "grad_norm": 3.1841771602630615, + "learning_rate": 8.365403974964299e-06, + "loss": 1.0394, + "step": 47762 + }, + { + "epoch": 0.5970649266231656, + "grad_norm": 2.130908966064453, + "learning_rate": 8.364543029963028e-06, + "loss": 1.8302, + "step": 47764 + }, + { + "epoch": 0.5970899272481812, + "grad_norm": 4.288265228271484, + "learning_rate": 8.363682097417163e-06, + "loss": 1.4836, + "step": 47766 + }, + { + "epoch": 0.5971149278731969, + "grad_norm": 3.92903995513916, + "learning_rate": 8.362821177333266e-06, + "loss": 0.7627, + "step": 47768 + }, + { + "epoch": 0.5971399284982124, + "grad_norm": 5.78338098526001, + "learning_rate": 8.36196026971789e-06, + "loss": 0.974, + "step": 47770 + }, + { + "epoch": 0.597164929123228, + "grad_norm": 4.29806661605835, + "learning_rate": 8.3610993745776e-06, + "loss": 0.666, + "step": 47772 + }, + { + "epoch": 0.5971899297482437, + "grad_norm": 4.489459037780762, + "learning_rate": 8.36023849191894e-06, + "loss": 0.8984, + "step": 47774 + }, + { + "epoch": 0.5972149303732593, + "grad_norm": 2.2792131900787354, + "learning_rate": 8.35937762174847e-06, + "loss": 1.2359, + "step": 47776 + }, + { + "epoch": 0.597239930998275, + "grad_norm": 3.4134509563446045, + "learning_rate": 8.35851676407275e-06, + "loss": 1.6941, + "step": 47778 + }, + { + "epoch": 0.5972649316232905, + "grad_norm": 0.004255666863173246, + "learning_rate": 8.357655918898331e-06, + "loss": 0.6346, + "step": 47780 + }, + { + "epoch": 0.5972899322483062, + "grad_norm": 5.646005153656006, + "learning_rate": 8.356795086231775e-06, + "loss": 1.0771, + "step": 47782 + }, + { + "epoch": 0.5973149328733218, + "grad_norm": 5.766673564910889, + "learning_rate": 8.35593426607964e-06, + "loss": 1.6015, + "step": 47784 + }, + { + "epoch": 0.5973399334983375, + "grad_norm": 2.3035731315612793, + "learning_rate": 8.355073458448472e-06, + "loss": 0.6457, + "step": 47786 + }, + { + "epoch": 0.5973649341233531, + "grad_norm": 2.4343385696411133, + "learning_rate": 8.354212663344832e-06, + "loss": 0.3392, + "step": 47788 + }, + { + "epoch": 0.5973899347483687, + "grad_norm": 7.094142913818359, + "learning_rate": 8.353351880775275e-06, + "loss": 0.9175, + "step": 47790 + }, + { + "epoch": 0.5974149353733843, + "grad_norm": 0.38595762848854065, + "learning_rate": 8.352491110746357e-06, + "loss": 0.0075, + "step": 47792 + }, + { + "epoch": 0.5974399359984, + "grad_norm": 2.992572069168091, + "learning_rate": 8.351630353264635e-06, + "loss": 1.2309, + "step": 47794 + }, + { + "epoch": 0.5974649366234156, + "grad_norm": 0.011361906304955482, + "learning_rate": 8.350769608336662e-06, + "loss": 1.0161, + "step": 47796 + }, + { + "epoch": 0.5974899372484312, + "grad_norm": 3.4093501567840576, + "learning_rate": 8.349908875969e-06, + "loss": 1.7591, + "step": 47798 + }, + { + "epoch": 0.5975149378734468, + "grad_norm": 8.663514137268066, + "learning_rate": 8.349048156168193e-06, + "loss": 1.5575, + "step": 47800 + }, + { + "epoch": 0.5975399384984624, + "grad_norm": 2.155783176422119, + "learning_rate": 8.348187448940803e-06, + "loss": 0.103, + "step": 47802 + }, + { + "epoch": 0.5975649391234781, + "grad_norm": 4.151357650756836, + "learning_rate": 8.347326754293382e-06, + "loss": 1.017, + "step": 47804 + }, + { + "epoch": 0.5975899397484937, + "grad_norm": 3.470116138458252, + "learning_rate": 8.346466072232489e-06, + "loss": 0.7652, + "step": 47806 + }, + { + "epoch": 0.5976149403735094, + "grad_norm": 0.005832435563206673, + "learning_rate": 8.345605402764675e-06, + "loss": 0.0002, + "step": 47808 + }, + { + "epoch": 0.5976399409985249, + "grad_norm": 2.9083595275878906, + "learning_rate": 8.344744745896502e-06, + "loss": 0.416, + "step": 47810 + }, + { + "epoch": 0.5976649416235406, + "grad_norm": 1.2555012702941895, + "learning_rate": 8.343884101634514e-06, + "loss": 0.1826, + "step": 47812 + }, + { + "epoch": 0.5976899422485562, + "grad_norm": 2.770177125930786, + "learning_rate": 8.34302346998527e-06, + "loss": 1.6298, + "step": 47814 + }, + { + "epoch": 0.5977149428735719, + "grad_norm": 1.0575261116027832, + "learning_rate": 8.342162850955324e-06, + "loss": 0.3683, + "step": 47816 + }, + { + "epoch": 0.5977399434985875, + "grad_norm": 2.6406214237213135, + "learning_rate": 8.341302244551232e-06, + "loss": 0.8369, + "step": 47818 + }, + { + "epoch": 0.597764944123603, + "grad_norm": 3.800658702850342, + "learning_rate": 8.340441650779548e-06, + "loss": 2.247, + "step": 47820 + }, + { + "epoch": 0.5977899447486187, + "grad_norm": 0.0028638436924666166, + "learning_rate": 8.339581069646827e-06, + "loss": 0.0001, + "step": 47822 + }, + { + "epoch": 0.5978149453736343, + "grad_norm": 0.006926780100911856, + "learning_rate": 8.338720501159625e-06, + "loss": 0.9915, + "step": 47824 + }, + { + "epoch": 0.59783994599865, + "grad_norm": 3.259528160095215, + "learning_rate": 8.337859945324488e-06, + "loss": 1.3166, + "step": 47826 + }, + { + "epoch": 0.5978649466236656, + "grad_norm": 2.419189214706421, + "learning_rate": 8.336999402147974e-06, + "loss": 0.6716, + "step": 47828 + }, + { + "epoch": 0.5978899472486812, + "grad_norm": 1.5024206638336182, + "learning_rate": 8.336138871636637e-06, + "loss": 2.0338, + "step": 47830 + }, + { + "epoch": 0.5979149478736968, + "grad_norm": 1.431490182876587, + "learning_rate": 8.335278353797034e-06, + "loss": 1.392, + "step": 47832 + }, + { + "epoch": 0.5979399484987125, + "grad_norm": 2.170170307159424, + "learning_rate": 8.334417848635714e-06, + "loss": 0.4997, + "step": 47834 + }, + { + "epoch": 0.5979649491237281, + "grad_norm": 5.674100875854492, + "learning_rate": 8.333557356159236e-06, + "loss": 1.1063, + "step": 47836 + }, + { + "epoch": 0.5979899497487438, + "grad_norm": 2.6953868865966797, + "learning_rate": 8.332696876374146e-06, + "loss": 0.547, + "step": 47838 + }, + { + "epoch": 0.5980149503737593, + "grad_norm": 0.729199230670929, + "learning_rate": 8.331836409286998e-06, + "loss": 0.8172, + "step": 47840 + }, + { + "epoch": 0.5980399509987749, + "grad_norm": 0.8316742181777954, + "learning_rate": 8.330975954904352e-06, + "loss": 0.3928, + "step": 47842 + }, + { + "epoch": 0.5980649516237906, + "grad_norm": 3.194275140762329, + "learning_rate": 8.330115513232754e-06, + "loss": 0.5772, + "step": 47844 + }, + { + "epoch": 0.5980899522488062, + "grad_norm": 3.3595690727233887, + "learning_rate": 8.329255084278764e-06, + "loss": 0.987, + "step": 47846 + }, + { + "epoch": 0.5981149528738219, + "grad_norm": 4.929065227508545, + "learning_rate": 8.328394668048927e-06, + "loss": 0.8781, + "step": 47848 + }, + { + "epoch": 0.5981399534988374, + "grad_norm": 4.02938985824585, + "learning_rate": 8.327534264549808e-06, + "loss": 0.925, + "step": 47850 + }, + { + "epoch": 0.5981649541238531, + "grad_norm": 7.36995792388916, + "learning_rate": 8.326673873787945e-06, + "loss": 0.5055, + "step": 47852 + }, + { + "epoch": 0.5981899547488687, + "grad_norm": 3.8338217735290527, + "learning_rate": 8.325813495769895e-06, + "loss": 0.4699, + "step": 47854 + }, + { + "epoch": 0.5982149553738844, + "grad_norm": 3.0086262226104736, + "learning_rate": 8.324953130502216e-06, + "loss": 1.2543, + "step": 47856 + }, + { + "epoch": 0.5982399559989, + "grad_norm": 3.9752655029296875, + "learning_rate": 8.324092777991455e-06, + "loss": 1.5791, + "step": 47858 + }, + { + "epoch": 0.5982649566239155, + "grad_norm": 3.17018723487854, + "learning_rate": 8.323232438244167e-06, + "loss": 0.7871, + "step": 47860 + }, + { + "epoch": 0.5982899572489312, + "grad_norm": 3.5749921798706055, + "learning_rate": 8.32237211126691e-06, + "loss": 0.8971, + "step": 47862 + }, + { + "epoch": 0.5983149578739468, + "grad_norm": 1.9569145441055298, + "learning_rate": 8.321511797066225e-06, + "loss": 0.6686, + "step": 47864 + }, + { + "epoch": 0.5983399584989625, + "grad_norm": 3.3280515670776367, + "learning_rate": 8.320651495648663e-06, + "loss": 2.0987, + "step": 47866 + }, + { + "epoch": 0.5983649591239781, + "grad_norm": 4.715017318725586, + "learning_rate": 8.319791207020788e-06, + "loss": 1.5073, + "step": 47868 + }, + { + "epoch": 0.5983899597489937, + "grad_norm": 1.029658317565918, + "learning_rate": 8.318930931189145e-06, + "loss": 0.8863, + "step": 47870 + }, + { + "epoch": 0.5984149603740093, + "grad_norm": 2.6581592559814453, + "learning_rate": 8.318070668160284e-06, + "loss": 0.7207, + "step": 47872 + }, + { + "epoch": 0.598439960999025, + "grad_norm": 3.7453906536102295, + "learning_rate": 8.31721041794076e-06, + "loss": 0.1, + "step": 47874 + }, + { + "epoch": 0.5984649616240406, + "grad_norm": 2.262378215789795, + "learning_rate": 8.316350180537128e-06, + "loss": 1.0403, + "step": 47876 + }, + { + "epoch": 0.5984899622490563, + "grad_norm": 0.002895581303164363, + "learning_rate": 8.31548995595593e-06, + "loss": 0.4093, + "step": 47878 + }, + { + "epoch": 0.5985149628740718, + "grad_norm": 1.6232572793960571, + "learning_rate": 8.314629744203721e-06, + "loss": 0.0466, + "step": 47880 + }, + { + "epoch": 0.5985399634990874, + "grad_norm": 2.5907399654388428, + "learning_rate": 8.313769545287057e-06, + "loss": 1.0127, + "step": 47882 + }, + { + "epoch": 0.5985649641241031, + "grad_norm": 0.12418250739574432, + "learning_rate": 8.312909359212484e-06, + "loss": 0.8985, + "step": 47884 + }, + { + "epoch": 0.5985899647491187, + "grad_norm": 3.1591873168945312, + "learning_rate": 8.312049185986555e-06, + "loss": 0.6999, + "step": 47886 + }, + { + "epoch": 0.5986149653741344, + "grad_norm": 2.8841404914855957, + "learning_rate": 8.311189025615824e-06, + "loss": 0.9685, + "step": 47888 + }, + { + "epoch": 0.5986399659991499, + "grad_norm": 2.7044215202331543, + "learning_rate": 8.310328878106834e-06, + "loss": 0.7174, + "step": 47890 + }, + { + "epoch": 0.5986649666241656, + "grad_norm": 2.2841272354125977, + "learning_rate": 8.309468743466142e-06, + "loss": 0.3692, + "step": 47892 + }, + { + "epoch": 0.5986899672491812, + "grad_norm": 0.02168463170528412, + "learning_rate": 8.308608621700297e-06, + "loss": 0.0332, + "step": 47894 + }, + { + "epoch": 0.5987149678741969, + "grad_norm": 0.2543964684009552, + "learning_rate": 8.307748512815848e-06, + "loss": 0.5177, + "step": 47896 + }, + { + "epoch": 0.5987399684992125, + "grad_norm": 5.5529632568359375, + "learning_rate": 8.306888416819349e-06, + "loss": 1.8824, + "step": 47898 + }, + { + "epoch": 0.598764969124228, + "grad_norm": 3.536527395248413, + "learning_rate": 8.306028333717347e-06, + "loss": 0.9737, + "step": 47900 + }, + { + "epoch": 0.5987899697492437, + "grad_norm": 2.903812885284424, + "learning_rate": 8.305168263516397e-06, + "loss": 0.8969, + "step": 47902 + }, + { + "epoch": 0.5988149703742593, + "grad_norm": 0.6846624612808228, + "learning_rate": 8.304308206223042e-06, + "loss": 0.2119, + "step": 47904 + }, + { + "epoch": 0.598839970999275, + "grad_norm": 3.22229266166687, + "learning_rate": 8.303448161843838e-06, + "loss": 0.581, + "step": 47906 + }, + { + "epoch": 0.5988649716242906, + "grad_norm": 4.984726905822754, + "learning_rate": 8.302588130385331e-06, + "loss": 0.7549, + "step": 47908 + }, + { + "epoch": 0.5988899722493062, + "grad_norm": 5.948586940765381, + "learning_rate": 8.301728111854074e-06, + "loss": 1.3514, + "step": 47910 + }, + { + "epoch": 0.5989149728743218, + "grad_norm": 0.0031171680893749, + "learning_rate": 8.300868106256615e-06, + "loss": 0.4431, + "step": 47912 + }, + { + "epoch": 0.5989399734993375, + "grad_norm": 2.98547625541687, + "learning_rate": 8.300008113599504e-06, + "loss": 1.3405, + "step": 47914 + }, + { + "epoch": 0.5989649741243531, + "grad_norm": 3.209022045135498, + "learning_rate": 8.299148133889295e-06, + "loss": 0.4822, + "step": 47916 + }, + { + "epoch": 0.5989899747493688, + "grad_norm": 3.3422086238861084, + "learning_rate": 8.298288167132528e-06, + "loss": 0.771, + "step": 47918 + }, + { + "epoch": 0.5990149753743843, + "grad_norm": 1.948728322982788, + "learning_rate": 8.29742821333576e-06, + "loss": 0.5618, + "step": 47920 + }, + { + "epoch": 0.5990399759994, + "grad_norm": 2.3084628582000732, + "learning_rate": 8.296568272505536e-06, + "loss": 0.6903, + "step": 47922 + }, + { + "epoch": 0.5990649766244156, + "grad_norm": 1.7651618719100952, + "learning_rate": 8.295708344648409e-06, + "loss": 0.9297, + "step": 47924 + }, + { + "epoch": 0.5990899772494312, + "grad_norm": 1.7521677017211914, + "learning_rate": 8.294848429770924e-06, + "loss": 0.064, + "step": 47926 + }, + { + "epoch": 0.5991149778744469, + "grad_norm": 4.815563678741455, + "learning_rate": 8.293988527879638e-06, + "loss": 1.3302, + "step": 47928 + }, + { + "epoch": 0.5991399784994624, + "grad_norm": 3.484435558319092, + "learning_rate": 8.29312863898109e-06, + "loss": 1.6603, + "step": 47930 + }, + { + "epoch": 0.5991649791244781, + "grad_norm": 0.079791359603405, + "learning_rate": 8.292268763081832e-06, + "loss": 0.622, + "step": 47932 + }, + { + "epoch": 0.5991899797494937, + "grad_norm": 3.7175025939941406, + "learning_rate": 8.291408900188415e-06, + "loss": 1.0284, + "step": 47934 + }, + { + "epoch": 0.5992149803745094, + "grad_norm": 0.0025611610617488623, + "learning_rate": 8.290549050307384e-06, + "loss": 0.6378, + "step": 47936 + }, + { + "epoch": 0.599239980999525, + "grad_norm": 4.513824462890625, + "learning_rate": 8.289689213445292e-06, + "loss": 0.886, + "step": 47938 + }, + { + "epoch": 0.5992649816245406, + "grad_norm": 4.689242362976074, + "learning_rate": 8.288829389608685e-06, + "loss": 1.8531, + "step": 47940 + }, + { + "epoch": 0.5992899822495562, + "grad_norm": 0.06265386939048767, + "learning_rate": 8.287969578804113e-06, + "loss": 0.465, + "step": 47942 + }, + { + "epoch": 0.5993149828745719, + "grad_norm": 4.092440128326416, + "learning_rate": 8.28710978103812e-06, + "loss": 1.4797, + "step": 47944 + }, + { + "epoch": 0.5993399834995875, + "grad_norm": 2.2163336277008057, + "learning_rate": 8.286249996317257e-06, + "loss": 1.417, + "step": 47946 + }, + { + "epoch": 0.5993649841246032, + "grad_norm": 0.2681281268596649, + "learning_rate": 8.28539022464807e-06, + "loss": 0.9488, + "step": 47948 + }, + { + "epoch": 0.5993899847496187, + "grad_norm": 4.510804176330566, + "learning_rate": 8.284530466037111e-06, + "loss": 1.4701, + "step": 47950 + }, + { + "epoch": 0.5994149853746343, + "grad_norm": 3.4371349811553955, + "learning_rate": 8.283670720490926e-06, + "loss": 1.6786, + "step": 47952 + }, + { + "epoch": 0.59943998599965, + "grad_norm": 0.17842979729175568, + "learning_rate": 8.282810988016062e-06, + "loss": 0.0503, + "step": 47954 + }, + { + "epoch": 0.5994649866246656, + "grad_norm": 0.003986883442848921, + "learning_rate": 8.281951268619066e-06, + "loss": 0.0049, + "step": 47956 + }, + { + "epoch": 0.5994899872496813, + "grad_norm": 2.78115177154541, + "learning_rate": 8.281091562306484e-06, + "loss": 1.0603, + "step": 47958 + }, + { + "epoch": 0.5995149878746968, + "grad_norm": 5.416353702545166, + "learning_rate": 8.280231869084868e-06, + "loss": 0.4023, + "step": 47960 + }, + { + "epoch": 0.5995399884997125, + "grad_norm": 0.4263004660606384, + "learning_rate": 8.279372188960763e-06, + "loss": 0.1059, + "step": 47962 + }, + { + "epoch": 0.5995649891247281, + "grad_norm": 1.8545023202896118, + "learning_rate": 8.278512521940714e-06, + "loss": 0.7893, + "step": 47964 + }, + { + "epoch": 0.5995899897497438, + "grad_norm": 2.8109891414642334, + "learning_rate": 8.277652868031273e-06, + "loss": 0.4978, + "step": 47966 + }, + { + "epoch": 0.5996149903747594, + "grad_norm": 6.398804664611816, + "learning_rate": 8.276793227238985e-06, + "loss": 1.9311, + "step": 47968 + }, + { + "epoch": 0.5996399909997749, + "grad_norm": 2.276492118835449, + "learning_rate": 8.275933599570391e-06, + "loss": 0.4255, + "step": 47970 + }, + { + "epoch": 0.5996649916247906, + "grad_norm": 2.207498550415039, + "learning_rate": 8.275073985032047e-06, + "loss": 1.3104, + "step": 47972 + }, + { + "epoch": 0.5996899922498062, + "grad_norm": 4.498824596405029, + "learning_rate": 8.274214383630495e-06, + "loss": 0.3791, + "step": 47974 + }, + { + "epoch": 0.5997149928748219, + "grad_norm": 0.333036869764328, + "learning_rate": 8.273354795372281e-06, + "loss": 0.7374, + "step": 47976 + }, + { + "epoch": 0.5997399934998375, + "grad_norm": 4.6953277587890625, + "learning_rate": 8.272495220263952e-06, + "loss": 1.0075, + "step": 47978 + }, + { + "epoch": 0.5997649941248531, + "grad_norm": 1.7592518329620361, + "learning_rate": 8.271635658312058e-06, + "loss": 0.0653, + "step": 47980 + }, + { + "epoch": 0.5997899947498687, + "grad_norm": 2.8607535362243652, + "learning_rate": 8.27077610952314e-06, + "loss": 0.9376, + "step": 47982 + }, + { + "epoch": 0.5998149953748844, + "grad_norm": 3.3148481845855713, + "learning_rate": 8.269916573903749e-06, + "loss": 0.9458, + "step": 47984 + }, + { + "epoch": 0.5998399959999, + "grad_norm": 0.020120004191994667, + "learning_rate": 8.269057051460425e-06, + "loss": 0.2725, + "step": 47986 + }, + { + "epoch": 0.5998649966249157, + "grad_norm": 4.628477096557617, + "learning_rate": 8.268197542199722e-06, + "loss": 1.0233, + "step": 47988 + }, + { + "epoch": 0.5998899972499312, + "grad_norm": 1.4520012140274048, + "learning_rate": 8.267338046128178e-06, + "loss": 0.1351, + "step": 47990 + }, + { + "epoch": 0.5999149978749468, + "grad_norm": 3.351499557495117, + "learning_rate": 8.266478563252344e-06, + "loss": 1.0908, + "step": 47992 + }, + { + "epoch": 0.5999399984999625, + "grad_norm": 3.977086305618286, + "learning_rate": 8.265619093578766e-06, + "loss": 1.8145, + "step": 47994 + }, + { + "epoch": 0.5999649991249781, + "grad_norm": 2.0538136959075928, + "learning_rate": 8.264759637113985e-06, + "loss": 0.8614, + "step": 47996 + }, + { + "epoch": 0.5999899997499938, + "grad_norm": 2.7373321056365967, + "learning_rate": 8.26390019386455e-06, + "loss": 1.0909, + "step": 47998 + }, + { + "epoch": 0.6000150003750093, + "grad_norm": 6.260149002075195, + "learning_rate": 8.263040763837005e-06, + "loss": 1.7503, + "step": 48000 + }, + { + "epoch": 0.600040001000025, + "grad_norm": 0.05059182271361351, + "learning_rate": 8.262181347037896e-06, + "loss": 0.4769, + "step": 48002 + }, + { + "epoch": 0.6000650016250406, + "grad_norm": 0.002710068365558982, + "learning_rate": 8.26132194347377e-06, + "loss": 0.0871, + "step": 48004 + }, + { + "epoch": 0.6000900022500563, + "grad_norm": 2.1707427501678467, + "learning_rate": 8.260462553151169e-06, + "loss": 0.6224, + "step": 48006 + }, + { + "epoch": 0.6001150028750719, + "grad_norm": 6.615588188171387, + "learning_rate": 8.25960317607664e-06, + "loss": 1.5921, + "step": 48008 + }, + { + "epoch": 0.6001400035000874, + "grad_norm": 1.7158515453338623, + "learning_rate": 8.258743812256726e-06, + "loss": 0.6448, + "step": 48010 + }, + { + "epoch": 0.6001650041251031, + "grad_norm": 2.0192770957946777, + "learning_rate": 8.257884461697972e-06, + "loss": 0.3112, + "step": 48012 + }, + { + "epoch": 0.6001900047501187, + "grad_norm": 7.156093597412109, + "learning_rate": 8.257025124406925e-06, + "loss": 0.7453, + "step": 48014 + }, + { + "epoch": 0.6002150053751344, + "grad_norm": 0.0012267738347873092, + "learning_rate": 8.256165800390128e-06, + "loss": 0.5895, + "step": 48016 + }, + { + "epoch": 0.60024000600015, + "grad_norm": 4.6409525871276855, + "learning_rate": 8.255306489654124e-06, + "loss": 0.3914, + "step": 48018 + }, + { + "epoch": 0.6002650066251656, + "grad_norm": 1.7836073637008667, + "learning_rate": 8.254447192205462e-06, + "loss": 0.0843, + "step": 48020 + }, + { + "epoch": 0.6002900072501812, + "grad_norm": 2.1528522968292236, + "learning_rate": 8.253587908050681e-06, + "loss": 0.8517, + "step": 48022 + }, + { + "epoch": 0.6003150078751969, + "grad_norm": 4.048864841461182, + "learning_rate": 8.25272863719633e-06, + "loss": 0.7552, + "step": 48024 + }, + { + "epoch": 0.6003400085002125, + "grad_norm": 2.058443069458008, + "learning_rate": 8.251869379648947e-06, + "loss": 1.0048, + "step": 48026 + }, + { + "epoch": 0.6003650091252282, + "grad_norm": 2.7202351093292236, + "learning_rate": 8.251010135415083e-06, + "loss": 0.7761, + "step": 48028 + }, + { + "epoch": 0.6003900097502437, + "grad_norm": 0.7197694778442383, + "learning_rate": 8.250150904501276e-06, + "loss": 0.3878, + "step": 48030 + }, + { + "epoch": 0.6004150103752594, + "grad_norm": 0.000944924249779433, + "learning_rate": 8.249291686914075e-06, + "loss": 0.2067, + "step": 48032 + }, + { + "epoch": 0.600440011000275, + "grad_norm": 1.448067307472229, + "learning_rate": 8.248432482660019e-06, + "loss": 0.2619, + "step": 48034 + }, + { + "epoch": 0.6004650116252906, + "grad_norm": 0.7077910900115967, + "learning_rate": 8.247573291745654e-06, + "loss": 0.3872, + "step": 48036 + }, + { + "epoch": 0.6004900122503063, + "grad_norm": 0.0028712020721286535, + "learning_rate": 8.246714114177522e-06, + "loss": 0.5624, + "step": 48038 + }, + { + "epoch": 0.6005150128753218, + "grad_norm": 3.491896152496338, + "learning_rate": 8.24585494996217e-06, + "loss": 0.7693, + "step": 48040 + }, + { + "epoch": 0.6005400135003375, + "grad_norm": 4.0719895362854, + "learning_rate": 8.244995799106135e-06, + "loss": 1.6716, + "step": 48042 + }, + { + "epoch": 0.6005650141253531, + "grad_norm": 0.0034397391136735678, + "learning_rate": 8.244136661615964e-06, + "loss": 0.5442, + "step": 48044 + }, + { + "epoch": 0.6005900147503688, + "grad_norm": 6.148671627044678, + "learning_rate": 8.243277537498204e-06, + "loss": 1.4017, + "step": 48046 + }, + { + "epoch": 0.6006150153753844, + "grad_norm": 8.613445281982422, + "learning_rate": 8.242418426759392e-06, + "loss": 0.823, + "step": 48048 + }, + { + "epoch": 0.6006400160004, + "grad_norm": 2.072016477584839, + "learning_rate": 8.241559329406072e-06, + "loss": 0.1534, + "step": 48050 + }, + { + "epoch": 0.6006650166254156, + "grad_norm": 7.797093391418457, + "learning_rate": 8.240700245444787e-06, + "loss": 2.1556, + "step": 48052 + }, + { + "epoch": 0.6006900172504313, + "grad_norm": 2.097893714904785, + "learning_rate": 8.23984117488208e-06, + "loss": 0.3632, + "step": 48054 + }, + { + "epoch": 0.6007150178754469, + "grad_norm": 2.732107400894165, + "learning_rate": 8.238982117724494e-06, + "loss": 0.1678, + "step": 48056 + }, + { + "epoch": 0.6007400185004625, + "grad_norm": 2.5396182537078857, + "learning_rate": 8.238123073978573e-06, + "loss": 0.3517, + "step": 48058 + }, + { + "epoch": 0.6007650191254781, + "grad_norm": 2.465916872024536, + "learning_rate": 8.237264043650857e-06, + "loss": 1.3384, + "step": 48060 + }, + { + "epoch": 0.6007900197504937, + "grad_norm": 3.384636878967285, + "learning_rate": 8.236405026747888e-06, + "loss": 0.9704, + "step": 48062 + }, + { + "epoch": 0.6008150203755094, + "grad_norm": 3.0383801460266113, + "learning_rate": 8.235546023276208e-06, + "loss": 0.3732, + "step": 48064 + }, + { + "epoch": 0.600840021000525, + "grad_norm": 1.8367640972137451, + "learning_rate": 8.23468703324236e-06, + "loss": 0.6541, + "step": 48066 + }, + { + "epoch": 0.6008650216255407, + "grad_norm": 1.3429532051086426, + "learning_rate": 8.233828056652886e-06, + "loss": 0.8694, + "step": 48068 + }, + { + "epoch": 0.6008900222505562, + "grad_norm": 0.9336885809898376, + "learning_rate": 8.232969093514329e-06, + "loss": 0.4737, + "step": 48070 + }, + { + "epoch": 0.6009150228755719, + "grad_norm": 2.2042009830474854, + "learning_rate": 8.232110143833231e-06, + "loss": 0.1979, + "step": 48072 + }, + { + "epoch": 0.6009400235005875, + "grad_norm": 5.4975056648254395, + "learning_rate": 8.231251207616128e-06, + "loss": 0.9251, + "step": 48074 + }, + { + "epoch": 0.6009650241256032, + "grad_norm": 5.976530075073242, + "learning_rate": 8.230392284869569e-06, + "loss": 1.7149, + "step": 48076 + }, + { + "epoch": 0.6009900247506188, + "grad_norm": 2.0459835529327393, + "learning_rate": 8.229533375600091e-06, + "loss": 0.3994, + "step": 48078 + }, + { + "epoch": 0.6010150253756343, + "grad_norm": 2.637800455093384, + "learning_rate": 8.228674479814234e-06, + "loss": 1.2474, + "step": 48080 + }, + { + "epoch": 0.60104002600065, + "grad_norm": 4.976782321929932, + "learning_rate": 8.227815597518544e-06, + "loss": 1.1828, + "step": 48082 + }, + { + "epoch": 0.6010650266256656, + "grad_norm": 3.7962844371795654, + "learning_rate": 8.226956728719562e-06, + "loss": 1.9768, + "step": 48084 + }, + { + "epoch": 0.6010900272506813, + "grad_norm": 2.7579004764556885, + "learning_rate": 8.226097873423825e-06, + "loss": 1.1916, + "step": 48086 + }, + { + "epoch": 0.6011150278756969, + "grad_norm": 2.728107452392578, + "learning_rate": 8.225239031637875e-06, + "loss": 1.8491, + "step": 48088 + }, + { + "epoch": 0.6011400285007125, + "grad_norm": 0.7602834701538086, + "learning_rate": 8.224380203368254e-06, + "loss": 1.0828, + "step": 48090 + }, + { + "epoch": 0.6011650291257281, + "grad_norm": 5.452685356140137, + "learning_rate": 8.223521388621502e-06, + "loss": 1.8014, + "step": 48092 + }, + { + "epoch": 0.6011900297507438, + "grad_norm": 4.096703052520752, + "learning_rate": 8.222662587404158e-06, + "loss": 0.7991, + "step": 48094 + }, + { + "epoch": 0.6012150303757594, + "grad_norm": 7.098211288452148, + "learning_rate": 8.221803799722767e-06, + "loss": 2.5142, + "step": 48096 + }, + { + "epoch": 0.601240031000775, + "grad_norm": 3.460087776184082, + "learning_rate": 8.220945025583869e-06, + "loss": 1.2066, + "step": 48098 + }, + { + "epoch": 0.6012650316257906, + "grad_norm": 1.9969953298568726, + "learning_rate": 8.220086264993998e-06, + "loss": 1.7048, + "step": 48100 + }, + { + "epoch": 0.6012900322508062, + "grad_norm": 0.40895500779151917, + "learning_rate": 8.2192275179597e-06, + "loss": 0.004, + "step": 48102 + }, + { + "epoch": 0.6013150328758219, + "grad_norm": 4.729621887207031, + "learning_rate": 8.218368784487513e-06, + "loss": 2.1519, + "step": 48104 + }, + { + "epoch": 0.6013400335008375, + "grad_norm": 0.4529576897621155, + "learning_rate": 8.217510064583976e-06, + "loss": 0.023, + "step": 48106 + }, + { + "epoch": 0.6013650341258532, + "grad_norm": 5.920713424682617, + "learning_rate": 8.216651358255632e-06, + "loss": 1.8664, + "step": 48108 + }, + { + "epoch": 0.6013900347508687, + "grad_norm": 24.06836700439453, + "learning_rate": 8.215792665509021e-06, + "loss": 2.2912, + "step": 48110 + }, + { + "epoch": 0.6014150353758844, + "grad_norm": 3.0569324493408203, + "learning_rate": 8.214933986350679e-06, + "loss": 0.6715, + "step": 48112 + }, + { + "epoch": 0.6014400360009, + "grad_norm": 0.002214600332081318, + "learning_rate": 8.214075320787145e-06, + "loss": 0.0571, + "step": 48114 + }, + { + "epoch": 0.6014650366259157, + "grad_norm": 3.92941951751709, + "learning_rate": 8.213216668824964e-06, + "loss": 1.2843, + "step": 48116 + }, + { + "epoch": 0.6014900372509313, + "grad_norm": 3.5785341262817383, + "learning_rate": 8.21235803047067e-06, + "loss": 1.219, + "step": 48118 + }, + { + "epoch": 0.6015150378759468, + "grad_norm": 2.4194023609161377, + "learning_rate": 8.211499405730804e-06, + "loss": 1.1706, + "step": 48120 + }, + { + "epoch": 0.6015400385009625, + "grad_norm": 0.7641168832778931, + "learning_rate": 8.210640794611906e-06, + "loss": 0.403, + "step": 48122 + }, + { + "epoch": 0.6015650391259781, + "grad_norm": 4.0163044929504395, + "learning_rate": 8.20978219712052e-06, + "loss": 1.0359, + "step": 48124 + }, + { + "epoch": 0.6015900397509938, + "grad_norm": 2.046198844909668, + "learning_rate": 8.208923613263175e-06, + "loss": 0.7052, + "step": 48126 + }, + { + "epoch": 0.6016150403760094, + "grad_norm": 4.043252468109131, + "learning_rate": 8.208065043046414e-06, + "loss": 1.048, + "step": 48128 + }, + { + "epoch": 0.601640041001025, + "grad_norm": 2.2523353099823, + "learning_rate": 8.207206486476776e-06, + "loss": 0.3074, + "step": 48130 + }, + { + "epoch": 0.6016650416260406, + "grad_norm": 3.6968612670898438, + "learning_rate": 8.206347943560802e-06, + "loss": 0.9613, + "step": 48132 + }, + { + "epoch": 0.6016900422510563, + "grad_norm": 0.0015148930251598358, + "learning_rate": 8.205489414305024e-06, + "loss": 0.4414, + "step": 48134 + }, + { + "epoch": 0.6017150428760719, + "grad_norm": 2.634220838546753, + "learning_rate": 8.204630898715992e-06, + "loss": 0.4824, + "step": 48136 + }, + { + "epoch": 0.6017400435010876, + "grad_norm": 4.2211408615112305, + "learning_rate": 8.203772396800232e-06, + "loss": 1.6658, + "step": 48138 + }, + { + "epoch": 0.6017650441261031, + "grad_norm": 3.650815963745117, + "learning_rate": 8.20291390856429e-06, + "loss": 1.3417, + "step": 48140 + }, + { + "epoch": 0.6017900447511187, + "grad_norm": 3.030470132827759, + "learning_rate": 8.202055434014699e-06, + "loss": 0.8667, + "step": 48142 + }, + { + "epoch": 0.6018150453761344, + "grad_norm": 2.967463254928589, + "learning_rate": 8.201196973158e-06, + "loss": 0.5772, + "step": 48144 + }, + { + "epoch": 0.60184004600115, + "grad_norm": 16.095792770385742, + "learning_rate": 8.200338526000729e-06, + "loss": 0.4047, + "step": 48146 + }, + { + "epoch": 0.6018650466261657, + "grad_norm": 0.08328235894441605, + "learning_rate": 8.199480092549427e-06, + "loss": 0.5289, + "step": 48148 + }, + { + "epoch": 0.6018900472511812, + "grad_norm": 0.004099763929843903, + "learning_rate": 8.198621672810636e-06, + "loss": 0.0981, + "step": 48150 + }, + { + "epoch": 0.6019150478761969, + "grad_norm": 3.9436392784118652, + "learning_rate": 8.19776326679088e-06, + "loss": 1.1078, + "step": 48152 + }, + { + "epoch": 0.6019400485012125, + "grad_norm": 4.1574578285217285, + "learning_rate": 8.196904874496707e-06, + "loss": 0.8929, + "step": 48154 + }, + { + "epoch": 0.6019650491262282, + "grad_norm": 2.9225032329559326, + "learning_rate": 8.196046495934651e-06, + "loss": 0.6133, + "step": 48156 + }, + { + "epoch": 0.6019900497512438, + "grad_norm": 1.5171316862106323, + "learning_rate": 8.195188131111247e-06, + "loss": 0.6924, + "step": 48158 + }, + { + "epoch": 0.6020150503762594, + "grad_norm": 0.0024795704521238804, + "learning_rate": 8.194329780033037e-06, + "loss": 0.6727, + "step": 48160 + }, + { + "epoch": 0.602040051001275, + "grad_norm": 6.195246696472168, + "learning_rate": 8.193471442706555e-06, + "loss": 1.0228, + "step": 48162 + }, + { + "epoch": 0.6020650516262906, + "grad_norm": 4.132184028625488, + "learning_rate": 8.192613119138346e-06, + "loss": 1.4478, + "step": 48164 + }, + { + "epoch": 0.6020900522513063, + "grad_norm": 0.00265025207772851, + "learning_rate": 8.191754809334934e-06, + "loss": 0.0002, + "step": 48166 + }, + { + "epoch": 0.6021150528763219, + "grad_norm": 4.0181450843811035, + "learning_rate": 8.190896513302861e-06, + "loss": 2.1643, + "step": 48168 + }, + { + "epoch": 0.6021400535013375, + "grad_norm": 3.3201210498809814, + "learning_rate": 8.190038231048665e-06, + "loss": 0.8358, + "step": 48170 + }, + { + "epoch": 0.6021650541263531, + "grad_norm": 3.426668405532837, + "learning_rate": 8.18917996257888e-06, + "loss": 1.3187, + "step": 48172 + }, + { + "epoch": 0.6021900547513688, + "grad_norm": 4.756139755249023, + "learning_rate": 8.188321707900047e-06, + "loss": 1.2922, + "step": 48174 + }, + { + "epoch": 0.6022150553763844, + "grad_norm": 0.02539033256471157, + "learning_rate": 8.187463467018703e-06, + "loss": 0.5076, + "step": 48176 + }, + { + "epoch": 0.6022400560014001, + "grad_norm": 3.2489750385284424, + "learning_rate": 8.186605239941377e-06, + "loss": 1.524, + "step": 48178 + }, + { + "epoch": 0.6022650566264156, + "grad_norm": 4.410072326660156, + "learning_rate": 8.18574702667461e-06, + "loss": 1.0344, + "step": 48180 + }, + { + "epoch": 0.6022900572514313, + "grad_norm": 3.2908058166503906, + "learning_rate": 8.184888827224937e-06, + "loss": 1.1025, + "step": 48182 + }, + { + "epoch": 0.6023150578764469, + "grad_norm": 1.581837773323059, + "learning_rate": 8.18403064159889e-06, + "loss": 0.964, + "step": 48184 + }, + { + "epoch": 0.6023400585014625, + "grad_norm": 2.9891278743743896, + "learning_rate": 8.18317246980301e-06, + "loss": 0.6439, + "step": 48186 + }, + { + "epoch": 0.6023650591264782, + "grad_norm": 3.7043392658233643, + "learning_rate": 8.182314311843836e-06, + "loss": 0.7214, + "step": 48188 + }, + { + "epoch": 0.6023900597514937, + "grad_norm": 3.861208438873291, + "learning_rate": 8.1814561677279e-06, + "loss": 0.4968, + "step": 48190 + }, + { + "epoch": 0.6024150603765094, + "grad_norm": 4.673394203186035, + "learning_rate": 8.180598037461733e-06, + "loss": 1.1818, + "step": 48192 + }, + { + "epoch": 0.602440061001525, + "grad_norm": 1.593461275100708, + "learning_rate": 8.179739921051874e-06, + "loss": 0.4521, + "step": 48194 + }, + { + "epoch": 0.6024650616265407, + "grad_norm": 3.488162040710449, + "learning_rate": 8.178881818504858e-06, + "loss": 1.3371, + "step": 48196 + }, + { + "epoch": 0.6024900622515563, + "grad_norm": 0.003008641069754958, + "learning_rate": 8.17802372982722e-06, + "loss": 0.9081, + "step": 48198 + }, + { + "epoch": 0.6025150628765719, + "grad_norm": 3.8291709423065186, + "learning_rate": 8.177165655025497e-06, + "loss": 0.7695, + "step": 48200 + }, + { + "epoch": 0.6025400635015875, + "grad_norm": 3.567329168319702, + "learning_rate": 8.176307594106228e-06, + "loss": 2.1703, + "step": 48202 + }, + { + "epoch": 0.6025650641266032, + "grad_norm": 0.0012080759042873979, + "learning_rate": 8.175449547075936e-06, + "loss": 0.1454, + "step": 48204 + }, + { + "epoch": 0.6025900647516188, + "grad_norm": 2.5534589290618896, + "learning_rate": 8.174591513941164e-06, + "loss": 0.2419, + "step": 48206 + }, + { + "epoch": 0.6026150653766345, + "grad_norm": 3.238804340362549, + "learning_rate": 8.173733494708442e-06, + "loss": 0.74, + "step": 48208 + }, + { + "epoch": 0.60264006600165, + "grad_norm": 1.6738938093185425, + "learning_rate": 8.17287548938431e-06, + "loss": 1.0043, + "step": 48210 + }, + { + "epoch": 0.6026650666266656, + "grad_norm": 2.899573802947998, + "learning_rate": 8.172017497975298e-06, + "loss": 0.7616, + "step": 48212 + }, + { + "epoch": 0.6026900672516813, + "grad_norm": 4.5719523429870605, + "learning_rate": 8.171159520487945e-06, + "loss": 1.0094, + "step": 48214 + }, + { + "epoch": 0.6027150678766969, + "grad_norm": 1.9137485027313232, + "learning_rate": 8.170301556928784e-06, + "loss": 0.126, + "step": 48216 + }, + { + "epoch": 0.6027400685017126, + "grad_norm": 3.3890411853790283, + "learning_rate": 8.169443607304345e-06, + "loss": 0.8737, + "step": 48218 + }, + { + "epoch": 0.6027650691267281, + "grad_norm": 3.086799144744873, + "learning_rate": 8.168585671621163e-06, + "loss": 1.0574, + "step": 48220 + }, + { + "epoch": 0.6027900697517438, + "grad_norm": 4.7929277420043945, + "learning_rate": 8.167727749885775e-06, + "loss": 2.3294, + "step": 48222 + }, + { + "epoch": 0.6028150703767594, + "grad_norm": 3.4742653369903564, + "learning_rate": 8.166869842104713e-06, + "loss": 1.0872, + "step": 48224 + }, + { + "epoch": 0.6028400710017751, + "grad_norm": 1.9362852573394775, + "learning_rate": 8.16601194828451e-06, + "loss": 0.7253, + "step": 48226 + }, + { + "epoch": 0.6028650716267907, + "grad_norm": 0.00300482171587646, + "learning_rate": 8.165154068431707e-06, + "loss": 0.2009, + "step": 48228 + }, + { + "epoch": 0.6028900722518062, + "grad_norm": 1.2615535259246826, + "learning_rate": 8.164296202552827e-06, + "loss": 0.4511, + "step": 48230 + }, + { + "epoch": 0.6029150728768219, + "grad_norm": 8.642057418823242, + "learning_rate": 8.163438350654404e-06, + "loss": 1.9624, + "step": 48232 + }, + { + "epoch": 0.6029400735018375, + "grad_norm": 1.3357881307601929, + "learning_rate": 8.162580512742978e-06, + "loss": 0.2728, + "step": 48234 + }, + { + "epoch": 0.6029650741268532, + "grad_norm": 5.275697708129883, + "learning_rate": 8.161722688825077e-06, + "loss": 2.1805, + "step": 48236 + }, + { + "epoch": 0.6029900747518688, + "grad_norm": 2.9260778427124023, + "learning_rate": 8.160864878907239e-06, + "loss": 1.2284, + "step": 48238 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 1.9794608354568481, + "learning_rate": 8.160007082995992e-06, + "loss": 0.3561, + "step": 48240 + }, + { + "epoch": 0.6030400760019, + "grad_norm": 0.07341168075799942, + "learning_rate": 8.159149301097877e-06, + "loss": 0.5548, + "step": 48242 + }, + { + "epoch": 0.6030650766269157, + "grad_norm": 4.70941162109375, + "learning_rate": 8.158291533219414e-06, + "loss": 1.6562, + "step": 48244 + }, + { + "epoch": 0.6030900772519313, + "grad_norm": 3.83953857421875, + "learning_rate": 8.15743377936714e-06, + "loss": 0.5118, + "step": 48246 + }, + { + "epoch": 0.603115077876947, + "grad_norm": 4.688626289367676, + "learning_rate": 8.156576039547594e-06, + "loss": 1.9287, + "step": 48248 + }, + { + "epoch": 0.6031400785019625, + "grad_norm": 2.860525369644165, + "learning_rate": 8.155718313767302e-06, + "loss": 0.2053, + "step": 48250 + }, + { + "epoch": 0.6031650791269781, + "grad_norm": 6.995983600616455, + "learning_rate": 8.154860602032802e-06, + "loss": 1.2512, + "step": 48252 + }, + { + "epoch": 0.6031900797519938, + "grad_norm": 7.189907550811768, + "learning_rate": 8.154002904350624e-06, + "loss": 0.5746, + "step": 48254 + }, + { + "epoch": 0.6032150803770094, + "grad_norm": 3.3895323276519775, + "learning_rate": 8.153145220727294e-06, + "loss": 0.9602, + "step": 48256 + }, + { + "epoch": 0.6032400810020251, + "grad_norm": 0.9642294049263, + "learning_rate": 8.152287551169349e-06, + "loss": 0.2012, + "step": 48258 + }, + { + "epoch": 0.6032650816270406, + "grad_norm": 2.2650468349456787, + "learning_rate": 8.151429895683322e-06, + "loss": 0.5108, + "step": 48260 + }, + { + "epoch": 0.6032900822520563, + "grad_norm": 3.669739007949829, + "learning_rate": 8.150572254275743e-06, + "loss": 1.0099, + "step": 48262 + }, + { + "epoch": 0.6033150828770719, + "grad_norm": 0.0010592443868517876, + "learning_rate": 8.149714626953145e-06, + "loss": 0.4887, + "step": 48264 + }, + { + "epoch": 0.6033400835020876, + "grad_norm": 4.130573749542236, + "learning_rate": 8.148857013722057e-06, + "loss": 1.2331, + "step": 48266 + }, + { + "epoch": 0.6033650841271032, + "grad_norm": 3.8155410289764404, + "learning_rate": 8.147999414589019e-06, + "loss": 1.9911, + "step": 48268 + }, + { + "epoch": 0.6033900847521187, + "grad_norm": 0.007847589440643787, + "learning_rate": 8.147141829560547e-06, + "loss": 0.2114, + "step": 48270 + }, + { + "epoch": 0.6034150853771344, + "grad_norm": 6.202256679534912, + "learning_rate": 8.146284258643184e-06, + "loss": 0.4936, + "step": 48272 + }, + { + "epoch": 0.60344008600215, + "grad_norm": 2.968759536743164, + "learning_rate": 8.145426701843458e-06, + "loss": 0.1712, + "step": 48274 + }, + { + "epoch": 0.6034650866271657, + "grad_norm": 4.218911647796631, + "learning_rate": 8.1445691591679e-06, + "loss": 0.8633, + "step": 48276 + }, + { + "epoch": 0.6034900872521813, + "grad_norm": 2.358077049255371, + "learning_rate": 8.14371163062304e-06, + "loss": 0.8063, + "step": 48278 + }, + { + "epoch": 0.6035150878771969, + "grad_norm": 4.551360607147217, + "learning_rate": 8.142854116215414e-06, + "loss": 1.0456, + "step": 48280 + }, + { + "epoch": 0.6035400885022125, + "grad_norm": 1.5386775732040405, + "learning_rate": 8.141996615951544e-06, + "loss": 0.7928, + "step": 48282 + }, + { + "epoch": 0.6035650891272282, + "grad_norm": 4.150986194610596, + "learning_rate": 8.141139129837964e-06, + "loss": 0.2888, + "step": 48284 + }, + { + "epoch": 0.6035900897522438, + "grad_norm": 0.0008126929169520736, + "learning_rate": 8.140281657881209e-06, + "loss": 0.0451, + "step": 48286 + }, + { + "epoch": 0.6036150903772595, + "grad_norm": 3.0964419841766357, + "learning_rate": 8.139424200087804e-06, + "loss": 0.9923, + "step": 48288 + }, + { + "epoch": 0.603640091002275, + "grad_norm": 4.447278022766113, + "learning_rate": 8.13856675646428e-06, + "loss": 0.8946, + "step": 48290 + }, + { + "epoch": 0.6036650916272907, + "grad_norm": 5.663362979888916, + "learning_rate": 8.13770932701717e-06, + "loss": 0.8267, + "step": 48292 + }, + { + "epoch": 0.6036900922523063, + "grad_norm": 1.0185710191726685, + "learning_rate": 8.136851911753006e-06, + "loss": 0.0672, + "step": 48294 + }, + { + "epoch": 0.603715092877322, + "grad_norm": 6.0955915451049805, + "learning_rate": 8.13599451067831e-06, + "loss": 0.817, + "step": 48296 + }, + { + "epoch": 0.6037400935023376, + "grad_norm": 3.088717460632324, + "learning_rate": 8.135137123799617e-06, + "loss": 1.0576, + "step": 48298 + }, + { + "epoch": 0.6037650941273531, + "grad_norm": 4.150399684906006, + "learning_rate": 8.134279751123454e-06, + "loss": 1.2704, + "step": 48300 + }, + { + "epoch": 0.6037900947523688, + "grad_norm": 2.660456657409668, + "learning_rate": 8.133422392656355e-06, + "loss": 1.1651, + "step": 48302 + }, + { + "epoch": 0.6038150953773844, + "grad_norm": 0.0199327040463686, + "learning_rate": 8.132565048404846e-06, + "loss": 0.0004, + "step": 48304 + }, + { + "epoch": 0.6038400960024001, + "grad_norm": 2.524322509765625, + "learning_rate": 8.13170771837546e-06, + "loss": 0.716, + "step": 48306 + }, + { + "epoch": 0.6038650966274157, + "grad_norm": 1.613287329673767, + "learning_rate": 8.130850402574723e-06, + "loss": 0.1413, + "step": 48308 + }, + { + "epoch": 0.6038900972524313, + "grad_norm": 1.4853606224060059, + "learning_rate": 8.129993101009164e-06, + "loss": 0.5483, + "step": 48310 + }, + { + "epoch": 0.6039150978774469, + "grad_norm": 4.286666393280029, + "learning_rate": 8.129135813685313e-06, + "loss": 1.042, + "step": 48312 + }, + { + "epoch": 0.6039400985024626, + "grad_norm": 3.706190824508667, + "learning_rate": 8.128278540609698e-06, + "loss": 1.1693, + "step": 48314 + }, + { + "epoch": 0.6039650991274782, + "grad_norm": 4.32719087600708, + "learning_rate": 8.12742128178885e-06, + "loss": 1.1572, + "step": 48316 + }, + { + "epoch": 0.6039900997524938, + "grad_norm": 3.79272198677063, + "learning_rate": 8.126564037229296e-06, + "loss": 0.7415, + "step": 48318 + }, + { + "epoch": 0.6040151003775094, + "grad_norm": 2.449049949645996, + "learning_rate": 8.12570680693757e-06, + "loss": 0.4951, + "step": 48320 + }, + { + "epoch": 0.604040101002525, + "grad_norm": 2.5986316204071045, + "learning_rate": 8.12484959092019e-06, + "loss": 1.2577, + "step": 48322 + }, + { + "epoch": 0.6040651016275407, + "grad_norm": 2.064772844314575, + "learning_rate": 8.123992389183693e-06, + "loss": 0.7591, + "step": 48324 + }, + { + "epoch": 0.6040901022525563, + "grad_norm": 5.390254020690918, + "learning_rate": 8.123135201734604e-06, + "loss": 1.8795, + "step": 48326 + }, + { + "epoch": 0.604115102877572, + "grad_norm": 2.3723723888397217, + "learning_rate": 8.12227802857945e-06, + "loss": 0.9524, + "step": 48328 + }, + { + "epoch": 0.6041401035025875, + "grad_norm": 2.2171061038970947, + "learning_rate": 8.121420869724765e-06, + "loss": 0.3558, + "step": 48330 + }, + { + "epoch": 0.6041651041276032, + "grad_norm": 4.223928928375244, + "learning_rate": 8.120563725177072e-06, + "loss": 0.9241, + "step": 48332 + }, + { + "epoch": 0.6041901047526188, + "grad_norm": 2.459359884262085, + "learning_rate": 8.119706594942899e-06, + "loss": 0.2525, + "step": 48334 + }, + { + "epoch": 0.6042151053776345, + "grad_norm": 3.8560025691986084, + "learning_rate": 8.118849479028773e-06, + "loss": 0.3639, + "step": 48336 + }, + { + "epoch": 0.6042401060026501, + "grad_norm": 5.125353813171387, + "learning_rate": 8.117992377441226e-06, + "loss": 1.9858, + "step": 48338 + }, + { + "epoch": 0.6042651066276656, + "grad_norm": 5.064950466156006, + "learning_rate": 8.117135290186782e-06, + "loss": 0.5725, + "step": 48340 + }, + { + "epoch": 0.6042901072526813, + "grad_norm": 0.0013609497109428048, + "learning_rate": 8.116278217271968e-06, + "loss": 0.3468, + "step": 48342 + }, + { + "epoch": 0.6043151078776969, + "grad_norm": 4.012272834777832, + "learning_rate": 8.115421158703314e-06, + "loss": 1.2629, + "step": 48344 + }, + { + "epoch": 0.6043401085027126, + "grad_norm": 2.9505631923675537, + "learning_rate": 8.114564114487348e-06, + "loss": 0.3924, + "step": 48346 + }, + { + "epoch": 0.6043651091277282, + "grad_norm": 3.2749955654144287, + "learning_rate": 8.113707084630591e-06, + "loss": 1.2002, + "step": 48348 + }, + { + "epoch": 0.6043901097527438, + "grad_norm": 4.218690395355225, + "learning_rate": 8.112850069139577e-06, + "loss": 1.346, + "step": 48350 + }, + { + "epoch": 0.6044151103777594, + "grad_norm": 4.364165782928467, + "learning_rate": 8.111993068020829e-06, + "loss": 0.9655, + "step": 48352 + }, + { + "epoch": 0.6044401110027751, + "grad_norm": 0.03455941006541252, + "learning_rate": 8.111136081280874e-06, + "loss": 0.0007, + "step": 48354 + }, + { + "epoch": 0.6044651116277907, + "grad_norm": 3.216097593307495, + "learning_rate": 8.11027910892624e-06, + "loss": 0.5525, + "step": 48356 + }, + { + "epoch": 0.6044901122528064, + "grad_norm": 0.004617452155798674, + "learning_rate": 8.109422150963456e-06, + "loss": 0.6755, + "step": 48358 + }, + { + "epoch": 0.6045151128778219, + "grad_norm": 2.597679853439331, + "learning_rate": 8.108565207399043e-06, + "loss": 1.2182, + "step": 48360 + }, + { + "epoch": 0.6045401135028375, + "grad_norm": 4.073182582855225, + "learning_rate": 8.10770827823953e-06, + "loss": 0.4002, + "step": 48362 + }, + { + "epoch": 0.6045651141278532, + "grad_norm": 2.0001158714294434, + "learning_rate": 8.106851363491443e-06, + "loss": 0.4835, + "step": 48364 + }, + { + "epoch": 0.6045901147528688, + "grad_norm": 0.5552763342857361, + "learning_rate": 8.105994463161307e-06, + "loss": 0.0282, + "step": 48366 + }, + { + "epoch": 0.6046151153778845, + "grad_norm": 0.9276931881904602, + "learning_rate": 8.105137577255651e-06, + "loss": 0.6004, + "step": 48368 + }, + { + "epoch": 0.6046401160029, + "grad_norm": 4.354063987731934, + "learning_rate": 8.104280705780998e-06, + "loss": 1.2145, + "step": 48370 + }, + { + "epoch": 0.6046651166279157, + "grad_norm": 4.238426208496094, + "learning_rate": 8.103423848743878e-06, + "loss": 1.3407, + "step": 48372 + }, + { + "epoch": 0.6046901172529313, + "grad_norm": 5.914035320281982, + "learning_rate": 8.102567006150812e-06, + "loss": 1.427, + "step": 48374 + }, + { + "epoch": 0.604715117877947, + "grad_norm": 4.154957294464111, + "learning_rate": 8.101710178008326e-06, + "loss": 0.8437, + "step": 48376 + }, + { + "epoch": 0.6047401185029626, + "grad_norm": 0.0015648656990379095, + "learning_rate": 8.100853364322949e-06, + "loss": 0.0, + "step": 48378 + }, + { + "epoch": 0.6047651191279781, + "grad_norm": 0.002898798556998372, + "learning_rate": 8.099996565101203e-06, + "loss": 0.0, + "step": 48380 + }, + { + "epoch": 0.6047901197529938, + "grad_norm": 3.350247859954834, + "learning_rate": 8.099139780349615e-06, + "loss": 0.1525, + "step": 48382 + }, + { + "epoch": 0.6048151203780094, + "grad_norm": 5.814767360687256, + "learning_rate": 8.09828301007471e-06, + "loss": 1.0415, + "step": 48384 + }, + { + "epoch": 0.6048401210030251, + "grad_norm": 1.5385984182357788, + "learning_rate": 8.097426254283012e-06, + "loss": 1.3206, + "step": 48386 + }, + { + "epoch": 0.6048651216280407, + "grad_norm": 3.3139398097991943, + "learning_rate": 8.096569512981045e-06, + "loss": 1.6271, + "step": 48388 + }, + { + "epoch": 0.6048901222530563, + "grad_norm": 4.224145889282227, + "learning_rate": 8.095712786175338e-06, + "loss": 0.6746, + "step": 48390 + }, + { + "epoch": 0.6049151228780719, + "grad_norm": 0.0013173826737329364, + "learning_rate": 8.094856073872412e-06, + "loss": 0.0, + "step": 48392 + }, + { + "epoch": 0.6049401235030876, + "grad_norm": 5.8315629959106445, + "learning_rate": 8.093999376078791e-06, + "loss": 1.0361, + "step": 48394 + }, + { + "epoch": 0.6049651241281032, + "grad_norm": 3.389616012573242, + "learning_rate": 8.093142692801004e-06, + "loss": 1.2319, + "step": 48396 + }, + { + "epoch": 0.6049901247531189, + "grad_norm": 3.9251537322998047, + "learning_rate": 8.092286024045573e-06, + "loss": 1.2799, + "step": 48398 + }, + { + "epoch": 0.6050151253781344, + "grad_norm": 3.3601818084716797, + "learning_rate": 8.091429369819021e-06, + "loss": 1.622, + "step": 48400 + }, + { + "epoch": 0.60504012600315, + "grad_norm": 3.431195020675659, + "learning_rate": 8.090572730127872e-06, + "loss": 0.2449, + "step": 48402 + }, + { + "epoch": 0.6050651266281657, + "grad_norm": 2.627936840057373, + "learning_rate": 8.089716104978651e-06, + "loss": 0.662, + "step": 48404 + }, + { + "epoch": 0.6050901272531813, + "grad_norm": 3.194742202758789, + "learning_rate": 8.088859494377884e-06, + "loss": 0.6107, + "step": 48406 + }, + { + "epoch": 0.605115127878197, + "grad_norm": 0.0011450445745140314, + "learning_rate": 8.08800289833209e-06, + "loss": 0.8799, + "step": 48408 + }, + { + "epoch": 0.6051401285032125, + "grad_norm": 1.6700319051742554, + "learning_rate": 8.087146316847799e-06, + "loss": 0.483, + "step": 48410 + }, + { + "epoch": 0.6051651291282282, + "grad_norm": 3.0666332244873047, + "learning_rate": 8.086289749931529e-06, + "loss": 0.7651, + "step": 48412 + }, + { + "epoch": 0.6051901297532438, + "grad_norm": 1.0711617469787598, + "learning_rate": 8.085433197589804e-06, + "loss": 0.4469, + "step": 48414 + }, + { + "epoch": 0.6052151303782595, + "grad_norm": 3.9900686740875244, + "learning_rate": 8.084576659829152e-06, + "loss": 0.8005, + "step": 48416 + }, + { + "epoch": 0.6052401310032751, + "grad_norm": 0.0623924657702446, + "learning_rate": 8.083720136656091e-06, + "loss": 0.844, + "step": 48418 + }, + { + "epoch": 0.6052651316282907, + "grad_norm": 1.141016960144043, + "learning_rate": 8.082863628077149e-06, + "loss": 0.1382, + "step": 48420 + }, + { + "epoch": 0.6052901322533063, + "grad_norm": 3.623530864715576, + "learning_rate": 8.082007134098843e-06, + "loss": 1.3432, + "step": 48422 + }, + { + "epoch": 0.605315132878322, + "grad_norm": 0.010626464150846004, + "learning_rate": 8.081150654727704e-06, + "loss": 0.0002, + "step": 48424 + }, + { + "epoch": 0.6053401335033376, + "grad_norm": 5.2069411277771, + "learning_rate": 8.080294189970249e-06, + "loss": 1.7362, + "step": 48426 + }, + { + "epoch": 0.6053651341283532, + "grad_norm": 2.5997605323791504, + "learning_rate": 8.079437739833e-06, + "loss": 0.655, + "step": 48428 + }, + { + "epoch": 0.6053901347533688, + "grad_norm": 3.314554214477539, + "learning_rate": 8.07858130432248e-06, + "loss": 0.6248, + "step": 48430 + }, + { + "epoch": 0.6054151353783844, + "grad_norm": 2.834895610809326, + "learning_rate": 8.077724883445216e-06, + "loss": 0.3203, + "step": 48432 + }, + { + "epoch": 0.6054401360034001, + "grad_norm": 4.6577982902526855, + "learning_rate": 8.076868477207727e-06, + "loss": 0.0932, + "step": 48434 + }, + { + "epoch": 0.6054651366284157, + "grad_norm": 4.422093868255615, + "learning_rate": 8.076012085616535e-06, + "loss": 1.6371, + "step": 48436 + }, + { + "epoch": 0.6054901372534314, + "grad_norm": 2.101062059402466, + "learning_rate": 8.075155708678164e-06, + "loss": 1.0066, + "step": 48438 + }, + { + "epoch": 0.6055151378784469, + "grad_norm": 0.00048277564928866923, + "learning_rate": 8.074299346399135e-06, + "loss": 1.5983, + "step": 48440 + }, + { + "epoch": 0.6055401385034626, + "grad_norm": 6.029578685760498, + "learning_rate": 8.07344299878597e-06, + "loss": 0.4631, + "step": 48442 + }, + { + "epoch": 0.6055651391284782, + "grad_norm": 0.4339554011821747, + "learning_rate": 8.07258666584519e-06, + "loss": 0.0104, + "step": 48444 + }, + { + "epoch": 0.6055901397534939, + "grad_norm": 3.0257070064544678, + "learning_rate": 8.071730347583317e-06, + "loss": 0.988, + "step": 48446 + }, + { + "epoch": 0.6056151403785095, + "grad_norm": 1.4552427530288696, + "learning_rate": 8.070874044006872e-06, + "loss": 0.8163, + "step": 48448 + }, + { + "epoch": 0.605640141003525, + "grad_norm": 0.0027909402269870043, + "learning_rate": 8.070017755122382e-06, + "loss": 0.483, + "step": 48450 + }, + { + "epoch": 0.6056651416285407, + "grad_norm": 2.952470541000366, + "learning_rate": 8.06916148093636e-06, + "loss": 0.6668, + "step": 48452 + }, + { + "epoch": 0.6056901422535563, + "grad_norm": 2.4878270626068115, + "learning_rate": 8.06830522145533e-06, + "loss": 0.8605, + "step": 48454 + }, + { + "epoch": 0.605715142878572, + "grad_norm": 2.9215922355651855, + "learning_rate": 8.067448976685817e-06, + "loss": 0.8503, + "step": 48456 + }, + { + "epoch": 0.6057401435035876, + "grad_norm": 4.738234519958496, + "learning_rate": 8.066592746634337e-06, + "loss": 1.7143, + "step": 48458 + }, + { + "epoch": 0.6057651441286032, + "grad_norm": 0.8835908770561218, + "learning_rate": 8.065736531307415e-06, + "loss": 0.5762, + "step": 48460 + }, + { + "epoch": 0.6057901447536188, + "grad_norm": 6.197004795074463, + "learning_rate": 8.064880330711568e-06, + "loss": 1.0119, + "step": 48462 + }, + { + "epoch": 0.6058151453786345, + "grad_norm": 7.064082622528076, + "learning_rate": 8.064024144853324e-06, + "loss": 1.1311, + "step": 48464 + }, + { + "epoch": 0.6058401460036501, + "grad_norm": 2.8119072914123535, + "learning_rate": 8.063167973739194e-06, + "loss": 1.3355, + "step": 48466 + }, + { + "epoch": 0.6058651466286658, + "grad_norm": 2.121638774871826, + "learning_rate": 8.062311817375704e-06, + "loss": 1.7591, + "step": 48468 + }, + { + "epoch": 0.6058901472536813, + "grad_norm": 2.6571147441864014, + "learning_rate": 8.061455675769371e-06, + "loss": 1.1348, + "step": 48470 + }, + { + "epoch": 0.6059151478786969, + "grad_norm": 3.317460775375366, + "learning_rate": 8.06059954892672e-06, + "loss": 1.6492, + "step": 48472 + }, + { + "epoch": 0.6059401485037126, + "grad_norm": 4.0336995124816895, + "learning_rate": 8.059743436854267e-06, + "loss": 0.9262, + "step": 48474 + }, + { + "epoch": 0.6059651491287282, + "grad_norm": 0.38781243562698364, + "learning_rate": 8.058887339558537e-06, + "loss": 0.5839, + "step": 48476 + }, + { + "epoch": 0.6059901497537439, + "grad_norm": 4.558588027954102, + "learning_rate": 8.058031257046042e-06, + "loss": 1.9105, + "step": 48478 + }, + { + "epoch": 0.6060151503787594, + "grad_norm": 4.846762657165527, + "learning_rate": 8.057175189323308e-06, + "loss": 1.3771, + "step": 48480 + }, + { + "epoch": 0.6060401510037751, + "grad_norm": 5.534842014312744, + "learning_rate": 8.056319136396853e-06, + "loss": 1.4478, + "step": 48482 + }, + { + "epoch": 0.6060651516287907, + "grad_norm": 9.481222152709961, + "learning_rate": 8.055463098273198e-06, + "loss": 1.038, + "step": 48484 + }, + { + "epoch": 0.6060901522538064, + "grad_norm": 4.819306373596191, + "learning_rate": 8.054607074958858e-06, + "loss": 1.9152, + "step": 48486 + }, + { + "epoch": 0.606115152878822, + "grad_norm": 3.805591106414795, + "learning_rate": 8.053751066460358e-06, + "loss": 1.044, + "step": 48488 + }, + { + "epoch": 0.6061401535038375, + "grad_norm": 3.8460428714752197, + "learning_rate": 8.052895072784215e-06, + "loss": 1.1698, + "step": 48490 + }, + { + "epoch": 0.6061651541288532, + "grad_norm": 0.004880198277533054, + "learning_rate": 8.052039093936947e-06, + "loss": 1.4757, + "step": 48492 + }, + { + "epoch": 0.6061901547538688, + "grad_norm": 0.023961862549185753, + "learning_rate": 8.051183129925073e-06, + "loss": 0.0325, + "step": 48494 + }, + { + "epoch": 0.6062151553788845, + "grad_norm": 4.193896293640137, + "learning_rate": 8.050327180755114e-06, + "loss": 0.1746, + "step": 48496 + }, + { + "epoch": 0.6062401560039001, + "grad_norm": 4.30588436126709, + "learning_rate": 8.049471246433586e-06, + "loss": 0.9874, + "step": 48498 + }, + { + "epoch": 0.6062651566289157, + "grad_norm": 2.7442095279693604, + "learning_rate": 8.048615326967009e-06, + "loss": 0.772, + "step": 48500 + }, + { + "epoch": 0.6062901572539313, + "grad_norm": 3.0624194145202637, + "learning_rate": 8.047759422361906e-06, + "loss": 0.7308, + "step": 48502 + }, + { + "epoch": 0.606315157878947, + "grad_norm": 3.498842239379883, + "learning_rate": 8.046903532624786e-06, + "loss": 1.2044, + "step": 48504 + }, + { + "epoch": 0.6063401585039626, + "grad_norm": 5.8338236808776855, + "learning_rate": 8.046047657762174e-06, + "loss": 0.7027, + "step": 48506 + }, + { + "epoch": 0.6063651591289783, + "grad_norm": 3.158756971359253, + "learning_rate": 8.045191797780588e-06, + "loss": 1.0792, + "step": 48508 + }, + { + "epoch": 0.6063901597539938, + "grad_norm": 3.877702474594116, + "learning_rate": 8.044335952686542e-06, + "loss": 0.821, + "step": 48510 + }, + { + "epoch": 0.6064151603790094, + "grad_norm": 2.0072572231292725, + "learning_rate": 8.043480122486558e-06, + "loss": 0.8101, + "step": 48512 + }, + { + "epoch": 0.6064401610040251, + "grad_norm": 2.314119338989258, + "learning_rate": 8.042624307187151e-06, + "loss": 0.5035, + "step": 48514 + }, + { + "epoch": 0.6064651616290407, + "grad_norm": 3.180485248565674, + "learning_rate": 8.041768506794848e-06, + "loss": 1.2817, + "step": 48516 + }, + { + "epoch": 0.6064901622540564, + "grad_norm": 3.7228424549102783, + "learning_rate": 8.040912721316152e-06, + "loss": 0.6345, + "step": 48518 + }, + { + "epoch": 0.6065151628790719, + "grad_norm": 2.673393487930298, + "learning_rate": 8.040056950757589e-06, + "loss": 0.6264, + "step": 48520 + }, + { + "epoch": 0.6065401635040876, + "grad_norm": 3.5717597007751465, + "learning_rate": 8.039201195125676e-06, + "loss": 1.4097, + "step": 48522 + }, + { + "epoch": 0.6065651641291032, + "grad_norm": 3.858657121658325, + "learning_rate": 8.038345454426929e-06, + "loss": 1.1932, + "step": 48524 + }, + { + "epoch": 0.6065901647541189, + "grad_norm": 6.460424900054932, + "learning_rate": 8.037489728667864e-06, + "loss": 1.1617, + "step": 48526 + }, + { + "epoch": 0.6066151653791345, + "grad_norm": 3.7554233074188232, + "learning_rate": 8.036634017855006e-06, + "loss": 1.2704, + "step": 48528 + }, + { + "epoch": 0.60664016600415, + "grad_norm": 5.599824905395508, + "learning_rate": 8.03577832199486e-06, + "loss": 2.2122, + "step": 48530 + }, + { + "epoch": 0.6066651666291657, + "grad_norm": 3.3835859298706055, + "learning_rate": 8.034922641093949e-06, + "loss": 1.179, + "step": 48532 + }, + { + "epoch": 0.6066901672541813, + "grad_norm": 2.987579107284546, + "learning_rate": 8.03406697515879e-06, + "loss": 1.6802, + "step": 48534 + }, + { + "epoch": 0.606715167879197, + "grad_norm": 4.729719161987305, + "learning_rate": 8.033211324195899e-06, + "loss": 1.1995, + "step": 48536 + }, + { + "epoch": 0.6067401685042126, + "grad_norm": 3.6766769886016846, + "learning_rate": 8.03235568821179e-06, + "loss": 0.7131, + "step": 48538 + }, + { + "epoch": 0.6067651691292282, + "grad_norm": 1.992311716079712, + "learning_rate": 8.031500067212983e-06, + "loss": 0.5967, + "step": 48540 + }, + { + "epoch": 0.6067901697542438, + "grad_norm": 3.7664058208465576, + "learning_rate": 8.030644461205998e-06, + "loss": 0.1452, + "step": 48542 + }, + { + "epoch": 0.6068151703792595, + "grad_norm": 3.4197442531585693, + "learning_rate": 8.029788870197343e-06, + "loss": 0.9145, + "step": 48544 + }, + { + "epoch": 0.6068401710042751, + "grad_norm": 3.284862518310547, + "learning_rate": 8.028933294193535e-06, + "loss": 0.0221, + "step": 48546 + }, + { + "epoch": 0.6068651716292908, + "grad_norm": 1.9834867715835571, + "learning_rate": 8.028077733201095e-06, + "loss": 0.7276, + "step": 48548 + }, + { + "epoch": 0.6068901722543063, + "grad_norm": 4.768051624298096, + "learning_rate": 8.027222187226533e-06, + "loss": 0.8207, + "step": 48550 + }, + { + "epoch": 0.606915172879322, + "grad_norm": 5.494269371032715, + "learning_rate": 8.02636665627637e-06, + "loss": 1.9592, + "step": 48552 + }, + { + "epoch": 0.6069401735043376, + "grad_norm": 2.476015567779541, + "learning_rate": 8.025511140357123e-06, + "loss": 0.7433, + "step": 48554 + }, + { + "epoch": 0.6069651741293532, + "grad_norm": 5.187497138977051, + "learning_rate": 8.0246556394753e-06, + "loss": 1.1376, + "step": 48556 + }, + { + "epoch": 0.6069901747543689, + "grad_norm": 1.2323086261749268, + "learning_rate": 8.02380015363742e-06, + "loss": 0.1333, + "step": 48558 + }, + { + "epoch": 0.6070151753793844, + "grad_norm": 0.13108834624290466, + "learning_rate": 8.022944682850001e-06, + "loss": 0.6672, + "step": 48560 + }, + { + "epoch": 0.6070401760044001, + "grad_norm": 3.654160261154175, + "learning_rate": 8.022089227119553e-06, + "loss": 1.3066, + "step": 48562 + }, + { + "epoch": 0.6070651766294157, + "grad_norm": 1.513418436050415, + "learning_rate": 8.021233786452592e-06, + "loss": 0.5084, + "step": 48564 + }, + { + "epoch": 0.6070901772544314, + "grad_norm": 0.01638904958963394, + "learning_rate": 8.020378360855638e-06, + "loss": 0.4478, + "step": 48566 + }, + { + "epoch": 0.607115177879447, + "grad_norm": 1.7168875932693481, + "learning_rate": 8.019522950335205e-06, + "loss": 0.0851, + "step": 48568 + }, + { + "epoch": 0.6071401785044626, + "grad_norm": 3.890803813934326, + "learning_rate": 8.018667554897804e-06, + "loss": 0.9652, + "step": 48570 + }, + { + "epoch": 0.6071651791294782, + "grad_norm": 2.18265700340271, + "learning_rate": 8.017812174549947e-06, + "loss": 0.5502, + "step": 48572 + }, + { + "epoch": 0.6071901797544939, + "grad_norm": 0.0009523332701064646, + "learning_rate": 8.016956809298154e-06, + "loss": 0.5917, + "step": 48574 + }, + { + "epoch": 0.6072151803795095, + "grad_norm": 5.1826395988464355, + "learning_rate": 8.016101459148934e-06, + "loss": 1.6481, + "step": 48576 + }, + { + "epoch": 0.6072401810045251, + "grad_norm": 4.498344898223877, + "learning_rate": 8.015246124108809e-06, + "loss": 0.9427, + "step": 48578 + }, + { + "epoch": 0.6072651816295407, + "grad_norm": 0.5191982984542847, + "learning_rate": 8.014390804184291e-06, + "loss": 0.6453, + "step": 48580 + }, + { + "epoch": 0.6072901822545563, + "grad_norm": 6.833876609802246, + "learning_rate": 8.01353549938189e-06, + "loss": 1.6517, + "step": 48582 + }, + { + "epoch": 0.607315182879572, + "grad_norm": 4.103822708129883, + "learning_rate": 8.01268020970812e-06, + "loss": 1.4216, + "step": 48584 + }, + { + "epoch": 0.6073401835045876, + "grad_norm": 1.226838231086731, + "learning_rate": 8.011824935169496e-06, + "loss": 0.9053, + "step": 48586 + }, + { + "epoch": 0.6073651841296033, + "grad_norm": 1.197540521621704, + "learning_rate": 8.010969675772532e-06, + "loss": 0.1214, + "step": 48588 + }, + { + "epoch": 0.6073901847546188, + "grad_norm": 4.664125442504883, + "learning_rate": 8.010114431523742e-06, + "loss": 0.982, + "step": 48590 + }, + { + "epoch": 0.6074151853796345, + "grad_norm": 2.409654140472412, + "learning_rate": 8.00925920242964e-06, + "loss": 0.6773, + "step": 48592 + }, + { + "epoch": 0.6074401860046501, + "grad_norm": 2.6383872032165527, + "learning_rate": 8.008403988496742e-06, + "loss": 1.4067, + "step": 48594 + }, + { + "epoch": 0.6074651866296658, + "grad_norm": 5.109045505523682, + "learning_rate": 8.007548789731555e-06, + "loss": 1.9874, + "step": 48596 + }, + { + "epoch": 0.6074901872546814, + "grad_norm": 4.73509407043457, + "learning_rate": 8.006693606140594e-06, + "loss": 1.7535, + "step": 48598 + }, + { + "epoch": 0.6075151878796969, + "grad_norm": 3.3588197231292725, + "learning_rate": 8.005838437730369e-06, + "loss": 0.6112, + "step": 48600 + }, + { + "epoch": 0.6075401885047126, + "grad_norm": 5.660191535949707, + "learning_rate": 8.0049832845074e-06, + "loss": 0.811, + "step": 48602 + }, + { + "epoch": 0.6075651891297282, + "grad_norm": 0.00041959836380556226, + "learning_rate": 8.004128146478195e-06, + "loss": 0.5601, + "step": 48604 + }, + { + "epoch": 0.6075901897547439, + "grad_norm": 4.610557556152344, + "learning_rate": 8.003273023649273e-06, + "loss": 0.4648, + "step": 48606 + }, + { + "epoch": 0.6076151903797595, + "grad_norm": 3.161586284637451, + "learning_rate": 8.002417916027136e-06, + "loss": 0.3695, + "step": 48608 + }, + { + "epoch": 0.6076401910047751, + "grad_norm": 1.9083679914474487, + "learning_rate": 8.001562823618303e-06, + "loss": 1.2769, + "step": 48610 + }, + { + "epoch": 0.6076651916297907, + "grad_norm": 4.723910808563232, + "learning_rate": 8.000707746429282e-06, + "loss": 1.4054, + "step": 48612 + }, + { + "epoch": 0.6076901922548064, + "grad_norm": 13.062265396118164, + "learning_rate": 7.99985268446659e-06, + "loss": 0.7842, + "step": 48614 + }, + { + "epoch": 0.607715192879822, + "grad_norm": 5.964928150177002, + "learning_rate": 7.998997637736738e-06, + "loss": 0.1961, + "step": 48616 + }, + { + "epoch": 0.6077401935048377, + "grad_norm": 3.2621028423309326, + "learning_rate": 7.998142606246238e-06, + "loss": 1.1816, + "step": 48618 + }, + { + "epoch": 0.6077651941298532, + "grad_norm": 5.396901607513428, + "learning_rate": 7.997287590001604e-06, + "loss": 0.9105, + "step": 48620 + }, + { + "epoch": 0.6077901947548688, + "grad_norm": 5.174231052398682, + "learning_rate": 7.996432589009338e-06, + "loss": 1.3479, + "step": 48622 + }, + { + "epoch": 0.6078151953798845, + "grad_norm": 2.893720865249634, + "learning_rate": 7.99557760327596e-06, + "loss": 1.1244, + "step": 48624 + }, + { + "epoch": 0.6078401960049001, + "grad_norm": 3.2914109230041504, + "learning_rate": 7.994722632807978e-06, + "loss": 0.5794, + "step": 48626 + }, + { + "epoch": 0.6078651966299158, + "grad_norm": 2.2284209728240967, + "learning_rate": 7.993867677611907e-06, + "loss": 0.9483, + "step": 48628 + }, + { + "epoch": 0.6078901972549313, + "grad_norm": 4.620088577270508, + "learning_rate": 7.993012737694254e-06, + "loss": 0.5128, + "step": 48630 + }, + { + "epoch": 0.607915197879947, + "grad_norm": 3.977307081222534, + "learning_rate": 7.992157813061539e-06, + "loss": 0.772, + "step": 48632 + }, + { + "epoch": 0.6079401985049626, + "grad_norm": 2.73791241645813, + "learning_rate": 7.991302903720261e-06, + "loss": 1.5382, + "step": 48634 + }, + { + "epoch": 0.6079651991299783, + "grad_norm": 0.051521800458431244, + "learning_rate": 7.990448009676935e-06, + "loss": 0.139, + "step": 48636 + }, + { + "epoch": 0.6079901997549939, + "grad_norm": 2.5464537143707275, + "learning_rate": 7.989593130938071e-06, + "loss": 0.3656, + "step": 48638 + }, + { + "epoch": 0.6080152003800094, + "grad_norm": 2.7028493881225586, + "learning_rate": 7.988738267510185e-06, + "loss": 0.8068, + "step": 48640 + }, + { + "epoch": 0.6080402010050251, + "grad_norm": 4.006098747253418, + "learning_rate": 7.987883419399782e-06, + "loss": 1.9796, + "step": 48642 + }, + { + "epoch": 0.6080652016300407, + "grad_norm": 2.395031452178955, + "learning_rate": 7.987028586613377e-06, + "loss": 0.5493, + "step": 48644 + }, + { + "epoch": 0.6080902022550564, + "grad_norm": 3.1565771102905273, + "learning_rate": 7.98617376915748e-06, + "loss": 1.7375, + "step": 48646 + }, + { + "epoch": 0.608115202880072, + "grad_norm": 0.0013669920153915882, + "learning_rate": 7.985318967038594e-06, + "loss": 1.0185, + "step": 48648 + }, + { + "epoch": 0.6081402035050876, + "grad_norm": 0.0014489025343209505, + "learning_rate": 7.984464180263232e-06, + "loss": 1.194, + "step": 48650 + }, + { + "epoch": 0.6081652041301032, + "grad_norm": 7.749063014984131, + "learning_rate": 7.983609408837908e-06, + "loss": 1.9712, + "step": 48652 + }, + { + "epoch": 0.6081902047551189, + "grad_norm": 2.958692789077759, + "learning_rate": 7.98275465276913e-06, + "loss": 0.7193, + "step": 48654 + }, + { + "epoch": 0.6082152053801345, + "grad_norm": 4.747422695159912, + "learning_rate": 7.981899912063408e-06, + "loss": 0.9166, + "step": 48656 + }, + { + "epoch": 0.6082402060051502, + "grad_norm": 4.7545599937438965, + "learning_rate": 7.981045186727255e-06, + "loss": 1.3878, + "step": 48658 + }, + { + "epoch": 0.6082652066301657, + "grad_norm": 2.886667490005493, + "learning_rate": 7.980190476767171e-06, + "loss": 1.9461, + "step": 48660 + }, + { + "epoch": 0.6082902072551813, + "grad_norm": 7.559450626373291, + "learning_rate": 7.979335782189669e-06, + "loss": 0.4864, + "step": 48662 + }, + { + "epoch": 0.608315207880197, + "grad_norm": 0.0023996136151254177, + "learning_rate": 7.978481103001263e-06, + "loss": 0.3471, + "step": 48664 + }, + { + "epoch": 0.6083402085052126, + "grad_norm": 2.4081506729125977, + "learning_rate": 7.977626439208456e-06, + "loss": 0.9634, + "step": 48666 + }, + { + "epoch": 0.6083652091302283, + "grad_norm": 4.828892230987549, + "learning_rate": 7.976771790817763e-06, + "loss": 1.1337, + "step": 48668 + }, + { + "epoch": 0.6083902097552438, + "grad_norm": 0.7714526057243347, + "learning_rate": 7.97591715783569e-06, + "loss": 0.0724, + "step": 48670 + }, + { + "epoch": 0.6084152103802595, + "grad_norm": 5.780153751373291, + "learning_rate": 7.97506254026875e-06, + "loss": 2.3465, + "step": 48672 + }, + { + "epoch": 0.6084402110052751, + "grad_norm": 4.2654290199279785, + "learning_rate": 7.97420793812344e-06, + "loss": 0.7383, + "step": 48674 + }, + { + "epoch": 0.6084652116302908, + "grad_norm": 0.751335620880127, + "learning_rate": 7.973353351406277e-06, + "loss": 0.0372, + "step": 48676 + }, + { + "epoch": 0.6084902122553064, + "grad_norm": 6.085970878601074, + "learning_rate": 7.97249878012377e-06, + "loss": 0.8766, + "step": 48678 + }, + { + "epoch": 0.608515212880322, + "grad_norm": 5.394011497497559, + "learning_rate": 7.971644224282424e-06, + "loss": 1.3212, + "step": 48680 + }, + { + "epoch": 0.6085402135053376, + "grad_norm": 4.144909858703613, + "learning_rate": 7.97078968388875e-06, + "loss": 1.5605, + "step": 48682 + }, + { + "epoch": 0.6085652141303532, + "grad_norm": 0.9291056394577026, + "learning_rate": 7.969935158949253e-06, + "loss": 0.8052, + "step": 48684 + }, + { + "epoch": 0.6085902147553689, + "grad_norm": 2.802518367767334, + "learning_rate": 7.969080649470447e-06, + "loss": 0.3753, + "step": 48686 + }, + { + "epoch": 0.6086152153803845, + "grad_norm": 2.218186378479004, + "learning_rate": 7.968226155458834e-06, + "loss": 0.7084, + "step": 48688 + }, + { + "epoch": 0.6086402160054001, + "grad_norm": 2.4266037940979004, + "learning_rate": 7.967371676920922e-06, + "loss": 0.341, + "step": 48690 + }, + { + "epoch": 0.6086652166304157, + "grad_norm": 5.013915061950684, + "learning_rate": 7.96651721386322e-06, + "loss": 0.9212, + "step": 48692 + }, + { + "epoch": 0.6086902172554314, + "grad_norm": 6.332291126251221, + "learning_rate": 7.965662766292235e-06, + "loss": 0.7613, + "step": 48694 + }, + { + "epoch": 0.608715217880447, + "grad_norm": 7.2914299964904785, + "learning_rate": 7.964808334214475e-06, + "loss": 2.4384, + "step": 48696 + }, + { + "epoch": 0.6087402185054627, + "grad_norm": 3.1120364665985107, + "learning_rate": 7.963953917636449e-06, + "loss": 0.6801, + "step": 48698 + }, + { + "epoch": 0.6087652191304782, + "grad_norm": 3.458470106124878, + "learning_rate": 7.96309951656466e-06, + "loss": 0.551, + "step": 48700 + }, + { + "epoch": 0.6087902197554939, + "grad_norm": 2.8389194011688232, + "learning_rate": 7.962245131005617e-06, + "loss": 0.9654, + "step": 48702 + }, + { + "epoch": 0.6088152203805095, + "grad_norm": 2.6385109424591064, + "learning_rate": 7.961390760965825e-06, + "loss": 1.0736, + "step": 48704 + }, + { + "epoch": 0.6088402210055252, + "grad_norm": 6.668403625488281, + "learning_rate": 7.960536406451795e-06, + "loss": 0.9516, + "step": 48706 + }, + { + "epoch": 0.6088652216305408, + "grad_norm": 4.373481273651123, + "learning_rate": 7.959682067470032e-06, + "loss": 1.3227, + "step": 48708 + }, + { + "epoch": 0.6088902222555563, + "grad_norm": 1.1266932487487793, + "learning_rate": 7.95882774402704e-06, + "loss": 0.8259, + "step": 48710 + }, + { + "epoch": 0.608915222880572, + "grad_norm": 8.958934783935547, + "learning_rate": 7.957973436129331e-06, + "loss": 1.0989, + "step": 48712 + }, + { + "epoch": 0.6089402235055876, + "grad_norm": 4.439000606536865, + "learning_rate": 7.957119143783405e-06, + "loss": 1.1048, + "step": 48714 + }, + { + "epoch": 0.6089652241306033, + "grad_norm": 3.566394567489624, + "learning_rate": 7.95626486699577e-06, + "loss": 0.7833, + "step": 48716 + }, + { + "epoch": 0.6089902247556189, + "grad_norm": 1.157902479171753, + "learning_rate": 7.955410605772933e-06, + "loss": 1.3707, + "step": 48718 + }, + { + "epoch": 0.6090152253806345, + "grad_norm": 3.176255464553833, + "learning_rate": 7.954556360121399e-06, + "loss": 1.3149, + "step": 48720 + }, + { + "epoch": 0.6090402260056501, + "grad_norm": 2.940472364425659, + "learning_rate": 7.953702130047676e-06, + "loss": 0.5837, + "step": 48722 + }, + { + "epoch": 0.6090652266306658, + "grad_norm": 1.9576802253723145, + "learning_rate": 7.95284791555827e-06, + "loss": 1.0402, + "step": 48724 + }, + { + "epoch": 0.6090902272556814, + "grad_norm": 3.334409236907959, + "learning_rate": 7.951993716659681e-06, + "loss": 1.3287, + "step": 48726 + }, + { + "epoch": 0.609115227880697, + "grad_norm": 3.54526424407959, + "learning_rate": 7.95113953335842e-06, + "loss": 1.4435, + "step": 48728 + }, + { + "epoch": 0.6091402285057126, + "grad_norm": 3.063685178756714, + "learning_rate": 7.95028536566099e-06, + "loss": 0.4912, + "step": 48730 + }, + { + "epoch": 0.6091652291307282, + "grad_norm": 5.120991230010986, + "learning_rate": 7.949431213573898e-06, + "loss": 0.9111, + "step": 48732 + }, + { + "epoch": 0.6091902297557439, + "grad_norm": 1.9346771240234375, + "learning_rate": 7.948577077103645e-06, + "loss": 0.2831, + "step": 48734 + }, + { + "epoch": 0.6092152303807595, + "grad_norm": 0.8454941511154175, + "learning_rate": 7.94772295625674e-06, + "loss": 0.8393, + "step": 48736 + }, + { + "epoch": 0.6092402310057752, + "grad_norm": 6.447100639343262, + "learning_rate": 7.94686885103969e-06, + "loss": 0.4237, + "step": 48738 + }, + { + "epoch": 0.6092652316307907, + "grad_norm": 0.08352403342723846, + "learning_rate": 7.946014761458994e-06, + "loss": 0.2064, + "step": 48740 + }, + { + "epoch": 0.6092902322558064, + "grad_norm": 3.6721127033233643, + "learning_rate": 7.94516068752116e-06, + "loss": 0.6328, + "step": 48742 + }, + { + "epoch": 0.609315232880822, + "grad_norm": 2.6132493019104004, + "learning_rate": 7.94430662923269e-06, + "loss": 1.6692, + "step": 48744 + }, + { + "epoch": 0.6093402335058377, + "grad_norm": 2.625225782394409, + "learning_rate": 7.94345258660009e-06, + "loss": 1.1882, + "step": 48746 + }, + { + "epoch": 0.6093652341308533, + "grad_norm": 1.8616989850997925, + "learning_rate": 7.942598559629864e-06, + "loss": 0.8867, + "step": 48748 + }, + { + "epoch": 0.6093902347558688, + "grad_norm": 6.23043966293335, + "learning_rate": 7.94174454832852e-06, + "loss": 1.7628, + "step": 48750 + }, + { + "epoch": 0.6094152353808845, + "grad_norm": 4.220569133758545, + "learning_rate": 7.940890552702555e-06, + "loss": 0.6106, + "step": 48752 + }, + { + "epoch": 0.6094402360059001, + "grad_norm": 2.7037432193756104, + "learning_rate": 7.940036572758479e-06, + "loss": 0.4392, + "step": 48754 + }, + { + "epoch": 0.6094652366309158, + "grad_norm": 4.498518943786621, + "learning_rate": 7.939182608502792e-06, + "loss": 1.3142, + "step": 48756 + }, + { + "epoch": 0.6094902372559314, + "grad_norm": 0.0034152292646467686, + "learning_rate": 7.938328659941997e-06, + "loss": 0.2981, + "step": 48758 + }, + { + "epoch": 0.609515237880947, + "grad_norm": 2.8870277404785156, + "learning_rate": 7.9374747270826e-06, + "loss": 1.2622, + "step": 48760 + }, + { + "epoch": 0.6095402385059626, + "grad_norm": 3.129814624786377, + "learning_rate": 7.936620809931104e-06, + "loss": 0.6351, + "step": 48762 + }, + { + "epoch": 0.6095652391309783, + "grad_norm": 3.1316545009613037, + "learning_rate": 7.935766908494015e-06, + "loss": 0.578, + "step": 48764 + }, + { + "epoch": 0.6095902397559939, + "grad_norm": 7.128087520599365, + "learning_rate": 7.934913022777832e-06, + "loss": 1.3005, + "step": 48766 + }, + { + "epoch": 0.6096152403810096, + "grad_norm": 6.591156959533691, + "learning_rate": 7.934059152789057e-06, + "loss": 0.8373, + "step": 48768 + }, + { + "epoch": 0.6096402410060251, + "grad_norm": 2.6662025451660156, + "learning_rate": 7.933205298534197e-06, + "loss": 0.2418, + "step": 48770 + }, + { + "epoch": 0.6096652416310407, + "grad_norm": 4.675924301147461, + "learning_rate": 7.932351460019753e-06, + "loss": 1.4429, + "step": 48772 + }, + { + "epoch": 0.6096902422560564, + "grad_norm": 2.1786341667175293, + "learning_rate": 7.931497637252226e-06, + "loss": 0.137, + "step": 48774 + }, + { + "epoch": 0.609715242881072, + "grad_norm": 4.323419570922852, + "learning_rate": 7.930643830238125e-06, + "loss": 1.4043, + "step": 48776 + }, + { + "epoch": 0.6097402435060877, + "grad_norm": 2.956040382385254, + "learning_rate": 7.929790038983945e-06, + "loss": 1.5137, + "step": 48778 + }, + { + "epoch": 0.6097652441311032, + "grad_norm": 6.058319568634033, + "learning_rate": 7.928936263496191e-06, + "loss": 1.0022, + "step": 48780 + }, + { + "epoch": 0.6097902447561189, + "grad_norm": 5.724618434906006, + "learning_rate": 7.928082503781365e-06, + "loss": 0.7428, + "step": 48782 + }, + { + "epoch": 0.6098152453811345, + "grad_norm": 3.8825459480285645, + "learning_rate": 7.927228759845972e-06, + "loss": 1.0524, + "step": 48784 + }, + { + "epoch": 0.6098402460061502, + "grad_norm": 1.0005004405975342, + "learning_rate": 7.926375031696509e-06, + "loss": 1.2192, + "step": 48786 + }, + { + "epoch": 0.6098652466311658, + "grad_norm": 0.000689513108227402, + "learning_rate": 7.925521319339481e-06, + "loss": 0.2566, + "step": 48788 + }, + { + "epoch": 0.6098902472561814, + "grad_norm": 1.8860112428665161, + "learning_rate": 7.924667622781393e-06, + "loss": 0.3882, + "step": 48790 + }, + { + "epoch": 0.609915247881197, + "grad_norm": 3.8046579360961914, + "learning_rate": 7.92381394202874e-06, + "loss": 1.5068, + "step": 48792 + }, + { + "epoch": 0.6099402485062126, + "grad_norm": 0.0009614626760594547, + "learning_rate": 7.922960277088027e-06, + "loss": 0.357, + "step": 48794 + }, + { + "epoch": 0.6099652491312283, + "grad_norm": 4.864282608032227, + "learning_rate": 7.922106627965753e-06, + "loss": 0.5605, + "step": 48796 + }, + { + "epoch": 0.6099902497562439, + "grad_norm": 0.004535297397524118, + "learning_rate": 7.921252994668422e-06, + "loss": 0.9676, + "step": 48798 + }, + { + "epoch": 0.6100152503812595, + "grad_norm": 3.116283655166626, + "learning_rate": 7.920399377202535e-06, + "loss": 0.5203, + "step": 48800 + }, + { + "epoch": 0.6100402510062751, + "grad_norm": 2.4307987689971924, + "learning_rate": 7.919545775574594e-06, + "loss": 0.1628, + "step": 48802 + }, + { + "epoch": 0.6100652516312908, + "grad_norm": 2.1155519485473633, + "learning_rate": 7.918692189791095e-06, + "loss": 0.4909, + "step": 48804 + }, + { + "epoch": 0.6100902522563064, + "grad_norm": 0.4521171748638153, + "learning_rate": 7.917838619858543e-06, + "loss": 0.0333, + "step": 48806 + }, + { + "epoch": 0.6101152528813221, + "grad_norm": 0.00040196688496507704, + "learning_rate": 7.916985065783438e-06, + "loss": 0.3918, + "step": 48808 + }, + { + "epoch": 0.6101402535063376, + "grad_norm": 1.3963682651519775, + "learning_rate": 7.91613152757228e-06, + "loss": 0.0577, + "step": 48810 + }, + { + "epoch": 0.6101652541313533, + "grad_norm": 3.4818437099456787, + "learning_rate": 7.915278005231569e-06, + "loss": 1.4539, + "step": 48812 + }, + { + "epoch": 0.6101902547563689, + "grad_norm": 0.0009028074564412236, + "learning_rate": 7.914424498767805e-06, + "loss": 0.5299, + "step": 48814 + }, + { + "epoch": 0.6102152553813845, + "grad_norm": 3.694347381591797, + "learning_rate": 7.913571008187492e-06, + "loss": 0.6836, + "step": 48816 + }, + { + "epoch": 0.6102402560064002, + "grad_norm": 0.7069395184516907, + "learning_rate": 7.912717533497127e-06, + "loss": 0.3253, + "step": 48818 + }, + { + "epoch": 0.6102652566314157, + "grad_norm": 4.090511798858643, + "learning_rate": 7.911864074703208e-06, + "loss": 1.4327, + "step": 48820 + }, + { + "epoch": 0.6102902572564314, + "grad_norm": 3.465698003768921, + "learning_rate": 7.911010631812236e-06, + "loss": 0.8608, + "step": 48822 + }, + { + "epoch": 0.610315257881447, + "grad_norm": 5.437581539154053, + "learning_rate": 7.910157204830713e-06, + "loss": 0.9303, + "step": 48824 + }, + { + "epoch": 0.6103402585064627, + "grad_norm": 0.00036033388460054994, + "learning_rate": 7.909303793765138e-06, + "loss": 0.2157, + "step": 48826 + }, + { + "epoch": 0.6103652591314783, + "grad_norm": 2.4438860416412354, + "learning_rate": 7.908450398622011e-06, + "loss": 1.0138, + "step": 48828 + }, + { + "epoch": 0.6103902597564939, + "grad_norm": 0.03855254128575325, + "learning_rate": 7.907597019407828e-06, + "loss": 0.9137, + "step": 48830 + }, + { + "epoch": 0.6104152603815095, + "grad_norm": 4.425548076629639, + "learning_rate": 7.90674365612909e-06, + "loss": 0.7331, + "step": 48832 + }, + { + "epoch": 0.6104402610065252, + "grad_norm": 2.2553045749664307, + "learning_rate": 7.905890308792296e-06, + "loss": 0.898, + "step": 48834 + }, + { + "epoch": 0.6104652616315408, + "grad_norm": 3.4834482669830322, + "learning_rate": 7.905036977403945e-06, + "loss": 1.3397, + "step": 48836 + }, + { + "epoch": 0.6104902622565564, + "grad_norm": 3.2431840896606445, + "learning_rate": 7.904183661970538e-06, + "loss": 1.2187, + "step": 48838 + }, + { + "epoch": 0.610515262881572, + "grad_norm": 5.831538200378418, + "learning_rate": 7.90333036249857e-06, + "loss": 1.9093, + "step": 48840 + }, + { + "epoch": 0.6105402635065876, + "grad_norm": 3.499891757965088, + "learning_rate": 7.902477078994543e-06, + "loss": 1.6744, + "step": 48842 + }, + { + "epoch": 0.6105652641316033, + "grad_norm": 4.832442283630371, + "learning_rate": 7.901623811464953e-06, + "loss": 1.1852, + "step": 48844 + }, + { + "epoch": 0.6105902647566189, + "grad_norm": 6.126278877258301, + "learning_rate": 7.9007705599163e-06, + "loss": 0.8746, + "step": 48846 + }, + { + "epoch": 0.6106152653816346, + "grad_norm": 3.0160937309265137, + "learning_rate": 7.89991732435508e-06, + "loss": 0.2173, + "step": 48848 + }, + { + "epoch": 0.6106402660066501, + "grad_norm": 0.002972367685288191, + "learning_rate": 7.899064104787795e-06, + "loss": 0.2342, + "step": 48850 + }, + { + "epoch": 0.6106652666316658, + "grad_norm": 4.11774206161499, + "learning_rate": 7.898210901220939e-06, + "loss": 1.2235, + "step": 48852 + }, + { + "epoch": 0.6106902672566814, + "grad_norm": 3.3371756076812744, + "learning_rate": 7.897357713661014e-06, + "loss": 0.6901, + "step": 48854 + }, + { + "epoch": 0.610715267881697, + "grad_norm": 3.495567798614502, + "learning_rate": 7.896504542114512e-06, + "loss": 1.6451, + "step": 48856 + }, + { + "epoch": 0.6107402685067127, + "grad_norm": 2.5826284885406494, + "learning_rate": 7.895651386587935e-06, + "loss": 0.9475, + "step": 48858 + }, + { + "epoch": 0.6107652691317282, + "grad_norm": 6.723219394683838, + "learning_rate": 7.89479824708778e-06, + "loss": 2.5712, + "step": 48860 + }, + { + "epoch": 0.6107902697567439, + "grad_norm": 1.8823225498199463, + "learning_rate": 7.893945123620543e-06, + "loss": 1.6026, + "step": 48862 + }, + { + "epoch": 0.6108152703817595, + "grad_norm": 1.525983214378357, + "learning_rate": 7.893092016192723e-06, + "loss": 0.1148, + "step": 48864 + }, + { + "epoch": 0.6108402710067752, + "grad_norm": 2.0883822441101074, + "learning_rate": 7.892238924810815e-06, + "loss": 0.9378, + "step": 48866 + }, + { + "epoch": 0.6108652716317908, + "grad_norm": 0.0015633715083822608, + "learning_rate": 7.89138584948132e-06, + "loss": 0.0442, + "step": 48868 + }, + { + "epoch": 0.6108902722568064, + "grad_norm": 3.6056630611419678, + "learning_rate": 7.89053279021073e-06, + "loss": 1.7703, + "step": 48870 + }, + { + "epoch": 0.610915272881822, + "grad_norm": 5.803689956665039, + "learning_rate": 7.889679747005545e-06, + "loss": 0.5075, + "step": 48872 + }, + { + "epoch": 0.6109402735068377, + "grad_norm": 1.4134283065795898, + "learning_rate": 7.88882671987226e-06, + "loss": 0.0756, + "step": 48874 + }, + { + "epoch": 0.6109652741318533, + "grad_norm": 7.565690517425537, + "learning_rate": 7.887973708817371e-06, + "loss": 1.3193, + "step": 48876 + }, + { + "epoch": 0.610990274756869, + "grad_norm": 5.006349563598633, + "learning_rate": 7.887120713847376e-06, + "loss": 0.3063, + "step": 48878 + }, + { + "epoch": 0.6110152753818845, + "grad_norm": 1.458784580230713, + "learning_rate": 7.886267734968775e-06, + "loss": 0.8834, + "step": 48880 + }, + { + "epoch": 0.6110402760069001, + "grad_norm": 2.9510979652404785, + "learning_rate": 7.885414772188056e-06, + "loss": 0.7433, + "step": 48882 + }, + { + "epoch": 0.6110652766319158, + "grad_norm": 5.303048133850098, + "learning_rate": 7.88456182551172e-06, + "loss": 1.051, + "step": 48884 + }, + { + "epoch": 0.6110902772569314, + "grad_norm": 7.551106929779053, + "learning_rate": 7.883708894946261e-06, + "loss": 0.597, + "step": 48886 + }, + { + "epoch": 0.6111152778819471, + "grad_norm": 1.488946795463562, + "learning_rate": 7.882855980498176e-06, + "loss": 0.0722, + "step": 48888 + }, + { + "epoch": 0.6111402785069626, + "grad_norm": 2.5593762397766113, + "learning_rate": 7.882003082173961e-06, + "loss": 1.2483, + "step": 48890 + }, + { + "epoch": 0.6111652791319783, + "grad_norm": 2.7412798404693604, + "learning_rate": 7.88115019998011e-06, + "loss": 1.5457, + "step": 48892 + }, + { + "epoch": 0.6111902797569939, + "grad_norm": 0.0005291901179589331, + "learning_rate": 7.880297333923122e-06, + "loss": 1.1515, + "step": 48894 + }, + { + "epoch": 0.6112152803820096, + "grad_norm": 2.7550559043884277, + "learning_rate": 7.879444484009488e-06, + "loss": 1.1185, + "step": 48896 + }, + { + "epoch": 0.6112402810070252, + "grad_norm": 1.3195970058441162, + "learning_rate": 7.878591650245704e-06, + "loss": 0.1968, + "step": 48898 + }, + { + "epoch": 0.6112652816320407, + "grad_norm": 2.7364354133605957, + "learning_rate": 7.87773883263827e-06, + "loss": 0.8664, + "step": 48900 + }, + { + "epoch": 0.6112902822570564, + "grad_norm": 5.105672359466553, + "learning_rate": 7.876886031193671e-06, + "loss": 1.5039, + "step": 48902 + }, + { + "epoch": 0.611315282882072, + "grad_norm": 5.437198638916016, + "learning_rate": 7.87603324591841e-06, + "loss": 0.9658, + "step": 48904 + }, + { + "epoch": 0.6113402835070877, + "grad_norm": 3.3686611652374268, + "learning_rate": 7.875180476818984e-06, + "loss": 1.4369, + "step": 48906 + }, + { + "epoch": 0.6113652841321033, + "grad_norm": 3.0506083965301514, + "learning_rate": 7.874327723901878e-06, + "loss": 0.6299, + "step": 48908 + }, + { + "epoch": 0.6113902847571189, + "grad_norm": 3.128350019454956, + "learning_rate": 7.873474987173592e-06, + "loss": 0.5193, + "step": 48910 + }, + { + "epoch": 0.6114152853821345, + "grad_norm": 4.836708068847656, + "learning_rate": 7.87262226664062e-06, + "loss": 1.7674, + "step": 48912 + }, + { + "epoch": 0.6114402860071502, + "grad_norm": 0.7251683473587036, + "learning_rate": 7.871769562309458e-06, + "loss": 0.2794, + "step": 48914 + }, + { + "epoch": 0.6114652866321658, + "grad_norm": 0.4546617567539215, + "learning_rate": 7.870916874186594e-06, + "loss": 0.1843, + "step": 48916 + }, + { + "epoch": 0.6114902872571815, + "grad_norm": 1.4837754964828491, + "learning_rate": 7.870064202278526e-06, + "loss": 0.0763, + "step": 48918 + }, + { + "epoch": 0.611515287882197, + "grad_norm": 0.08316852152347565, + "learning_rate": 7.869211546591754e-06, + "loss": 0.0296, + "step": 48920 + }, + { + "epoch": 0.6115402885072126, + "grad_norm": 2.7031917572021484, + "learning_rate": 7.868358907132763e-06, + "loss": 0.9302, + "step": 48922 + }, + { + "epoch": 0.6115652891322283, + "grad_norm": 4.420734405517578, + "learning_rate": 7.867506283908048e-06, + "loss": 1.4679, + "step": 48924 + }, + { + "epoch": 0.6115902897572439, + "grad_norm": 4.011401653289795, + "learning_rate": 7.866653676924103e-06, + "loss": 1.0866, + "step": 48926 + }, + { + "epoch": 0.6116152903822596, + "grad_norm": 4.357228755950928, + "learning_rate": 7.865801086187423e-06, + "loss": 1.2902, + "step": 48928 + }, + { + "epoch": 0.6116402910072751, + "grad_norm": 0.0008568752091377974, + "learning_rate": 7.864948511704497e-06, + "loss": 0.5381, + "step": 48930 + }, + { + "epoch": 0.6116652916322908, + "grad_norm": 0.000887206697370857, + "learning_rate": 7.864095953481827e-06, + "loss": 0.0, + "step": 48932 + }, + { + "epoch": 0.6116902922573064, + "grad_norm": 0.28440043330192566, + "learning_rate": 7.863243411525898e-06, + "loss": 0.3091, + "step": 48934 + }, + { + "epoch": 0.6117152928823221, + "grad_norm": 4.032797813415527, + "learning_rate": 7.862390885843203e-06, + "loss": 1.4473, + "step": 48936 + }, + { + "epoch": 0.6117402935073377, + "grad_norm": 4.154947757720947, + "learning_rate": 7.861538376440237e-06, + "loss": 0.6076, + "step": 48938 + }, + { + "epoch": 0.6117652941323533, + "grad_norm": 0.007155482191592455, + "learning_rate": 7.860685883323493e-06, + "loss": 0.1786, + "step": 48940 + }, + { + "epoch": 0.6117902947573689, + "grad_norm": 1.6783877611160278, + "learning_rate": 7.859833406499459e-06, + "loss": 1.3355, + "step": 48942 + }, + { + "epoch": 0.6118152953823845, + "grad_norm": 2.763521194458008, + "learning_rate": 7.858980945974636e-06, + "loss": 0.5249, + "step": 48944 + }, + { + "epoch": 0.6118402960074002, + "grad_norm": 5.342438697814941, + "learning_rate": 7.858128501755515e-06, + "loss": 2.3065, + "step": 48946 + }, + { + "epoch": 0.6118652966324158, + "grad_norm": 1.5326106548309326, + "learning_rate": 7.857276073848579e-06, + "loss": 0.5333, + "step": 48948 + }, + { + "epoch": 0.6118902972574314, + "grad_norm": 2.8452701568603516, + "learning_rate": 7.856423662260326e-06, + "loss": 0.5724, + "step": 48950 + }, + { + "epoch": 0.611915297882447, + "grad_norm": 0.16265535354614258, + "learning_rate": 7.855571266997245e-06, + "loss": 0.9289, + "step": 48952 + }, + { + "epoch": 0.6119402985074627, + "grad_norm": 5.735026836395264, + "learning_rate": 7.854718888065834e-06, + "loss": 0.5548, + "step": 48954 + }, + { + "epoch": 0.6119652991324783, + "grad_norm": 0.7911316752433777, + "learning_rate": 7.853866525472577e-06, + "loss": 0.1859, + "step": 48956 + }, + { + "epoch": 0.611990299757494, + "grad_norm": 3.33158278465271, + "learning_rate": 7.85301417922397e-06, + "loss": 0.3919, + "step": 48958 + }, + { + "epoch": 0.6120153003825095, + "grad_norm": 2.7146003246307373, + "learning_rate": 7.852161849326508e-06, + "loss": 1.1281, + "step": 48960 + }, + { + "epoch": 0.6120403010075252, + "grad_norm": 0.0021549593657255173, + "learning_rate": 7.851309535786674e-06, + "loss": 0.4042, + "step": 48962 + }, + { + "epoch": 0.6120653016325408, + "grad_norm": 4.361843109130859, + "learning_rate": 7.850457238610963e-06, + "loss": 1.0742, + "step": 48964 + }, + { + "epoch": 0.6120903022575565, + "grad_norm": 4.008820056915283, + "learning_rate": 7.849604957805866e-06, + "loss": 1.6432, + "step": 48966 + }, + { + "epoch": 0.6121153028825721, + "grad_norm": 1.8875455856323242, + "learning_rate": 7.84875269337787e-06, + "loss": 0.6836, + "step": 48968 + }, + { + "epoch": 0.6121403035075876, + "grad_norm": 6.420310974121094, + "learning_rate": 7.847900445333472e-06, + "loss": 2.7549, + "step": 48970 + }, + { + "epoch": 0.6121653041326033, + "grad_norm": 4.01639986038208, + "learning_rate": 7.847048213679164e-06, + "loss": 2.6632, + "step": 48972 + }, + { + "epoch": 0.6121903047576189, + "grad_norm": 7.059863567352295, + "learning_rate": 7.846195998421429e-06, + "loss": 1.4143, + "step": 48974 + }, + { + "epoch": 0.6122153053826346, + "grad_norm": 4.890939712524414, + "learning_rate": 7.84534379956676e-06, + "loss": 1.6882, + "step": 48976 + }, + { + "epoch": 0.6122403060076502, + "grad_norm": 0.7350718975067139, + "learning_rate": 7.844491617121647e-06, + "loss": 0.0985, + "step": 48978 + }, + { + "epoch": 0.6122653066326658, + "grad_norm": 6.41060209274292, + "learning_rate": 7.843639451092579e-06, + "loss": 0.6388, + "step": 48980 + }, + { + "epoch": 0.6122903072576814, + "grad_norm": 0.1653316766023636, + "learning_rate": 7.842787301486053e-06, + "loss": 0.0167, + "step": 48982 + }, + { + "epoch": 0.6123153078826971, + "grad_norm": 4.006614685058594, + "learning_rate": 7.84193516830855e-06, + "loss": 1.6027, + "step": 48984 + }, + { + "epoch": 0.6123403085077127, + "grad_norm": 3.5097501277923584, + "learning_rate": 7.84108305156657e-06, + "loss": 1.7966, + "step": 48986 + }, + { + "epoch": 0.6123653091327284, + "grad_norm": 3.4239344596862793, + "learning_rate": 7.840230951266593e-06, + "loss": 1.0176, + "step": 48988 + }, + { + "epoch": 0.6123903097577439, + "grad_norm": 2.091355562210083, + "learning_rate": 7.839378867415108e-06, + "loss": 0.8796, + "step": 48990 + }, + { + "epoch": 0.6124153103827595, + "grad_norm": 2.923227310180664, + "learning_rate": 7.83852680001861e-06, + "loss": 0.9454, + "step": 48992 + }, + { + "epoch": 0.6124403110077752, + "grad_norm": 3.2352678775787354, + "learning_rate": 7.837674749083585e-06, + "loss": 1.173, + "step": 48994 + }, + { + "epoch": 0.6124653116327908, + "grad_norm": 2.6044816970825195, + "learning_rate": 7.836822714616525e-06, + "loss": 0.3955, + "step": 48996 + }, + { + "epoch": 0.6124903122578065, + "grad_norm": 1.845418095588684, + "learning_rate": 7.835970696623922e-06, + "loss": 0.7459, + "step": 48998 + }, + { + "epoch": 0.612515312882822, + "grad_norm": 0.11298643797636032, + "learning_rate": 7.835118695112255e-06, + "loss": 0.1383, + "step": 49000 + }, + { + "epoch": 0.6125403135078377, + "grad_norm": 4.670614719390869, + "learning_rate": 7.834266710088017e-06, + "loss": 1.2908, + "step": 49002 + }, + { + "epoch": 0.6125653141328533, + "grad_norm": 0.0013087509432807565, + "learning_rate": 7.833414741557696e-06, + "loss": 0.3337, + "step": 49004 + }, + { + "epoch": 0.612590314757869, + "grad_norm": 3.8131494522094727, + "learning_rate": 7.832562789527785e-06, + "loss": 1.2857, + "step": 49006 + }, + { + "epoch": 0.6126153153828846, + "grad_norm": 2.044266939163208, + "learning_rate": 7.83171085400477e-06, + "loss": 0.1511, + "step": 49008 + }, + { + "epoch": 0.6126403160079001, + "grad_norm": 5.992527484893799, + "learning_rate": 7.830858934995137e-06, + "loss": 0.6107, + "step": 49010 + }, + { + "epoch": 0.6126653166329158, + "grad_norm": 4.646137714385986, + "learning_rate": 7.83000703250538e-06, + "loss": 0.7575, + "step": 49012 + }, + { + "epoch": 0.6126903172579314, + "grad_norm": 0.000512592145241797, + "learning_rate": 7.829155146541978e-06, + "loss": 0.3674, + "step": 49014 + }, + { + "epoch": 0.6127153178829471, + "grad_norm": 1.7008602619171143, + "learning_rate": 7.828303277111422e-06, + "loss": 0.467, + "step": 49016 + }, + { + "epoch": 0.6127403185079627, + "grad_norm": 0.0036964467726647854, + "learning_rate": 7.827451424220202e-06, + "loss": 0.2446, + "step": 49018 + }, + { + "epoch": 0.6127653191329783, + "grad_norm": 7.7317280769348145, + "learning_rate": 7.826599587874807e-06, + "loss": 0.7362, + "step": 49020 + }, + { + "epoch": 0.6127903197579939, + "grad_norm": 9.36583423614502, + "learning_rate": 7.82574776808172e-06, + "loss": 1.983, + "step": 49022 + }, + { + "epoch": 0.6128153203830096, + "grad_norm": 0.6049275398254395, + "learning_rate": 7.824895964847436e-06, + "loss": 0.0198, + "step": 49024 + }, + { + "epoch": 0.6128403210080252, + "grad_norm": 6.33156681060791, + "learning_rate": 7.824044178178432e-06, + "loss": 1.0742, + "step": 49026 + }, + { + "epoch": 0.6128653216330409, + "grad_norm": 0.011872168630361557, + "learning_rate": 7.8231924080812e-06, + "loss": 0.0284, + "step": 49028 + }, + { + "epoch": 0.6128903222580564, + "grad_norm": 4.4377970695495605, + "learning_rate": 7.822340654562226e-06, + "loss": 0.4937, + "step": 49030 + }, + { + "epoch": 0.612915322883072, + "grad_norm": 0.020118458196520805, + "learning_rate": 7.821488917627997e-06, + "loss": 0.9879, + "step": 49032 + }, + { + "epoch": 0.6129403235080877, + "grad_norm": 2.9338855743408203, + "learning_rate": 7.820637197285002e-06, + "loss": 0.9122, + "step": 49034 + }, + { + "epoch": 0.6129653241331033, + "grad_norm": 2.8971004486083984, + "learning_rate": 7.819785493539727e-06, + "loss": 0.5384, + "step": 49036 + }, + { + "epoch": 0.612990324758119, + "grad_norm": 2.8508591651916504, + "learning_rate": 7.81893380639866e-06, + "loss": 1.8594, + "step": 49038 + }, + { + "epoch": 0.6130153253831345, + "grad_norm": 4.340666770935059, + "learning_rate": 7.81808213586828e-06, + "loss": 0.9628, + "step": 49040 + }, + { + "epoch": 0.6130403260081502, + "grad_norm": 4.170416831970215, + "learning_rate": 7.817230481955076e-06, + "loss": 1.8076, + "step": 49042 + }, + { + "epoch": 0.6130653266331658, + "grad_norm": 3.2592127323150635, + "learning_rate": 7.81637884466554e-06, + "loss": 0.7074, + "step": 49044 + }, + { + "epoch": 0.6130903272581815, + "grad_norm": 0.7464291453361511, + "learning_rate": 7.815527224006152e-06, + "loss": 0.7126, + "step": 49046 + }, + { + "epoch": 0.6131153278831971, + "grad_norm": 0.4935337007045746, + "learning_rate": 7.8146756199834e-06, + "loss": 0.636, + "step": 49048 + }, + { + "epoch": 0.6131403285082127, + "grad_norm": 2.640451192855835, + "learning_rate": 7.813824032603776e-06, + "loss": 1.0573, + "step": 49050 + }, + { + "epoch": 0.6131653291332283, + "grad_norm": 0.267442911863327, + "learning_rate": 7.81297246187375e-06, + "loss": 0.0139, + "step": 49052 + }, + { + "epoch": 0.613190329758244, + "grad_norm": 6.568918228149414, + "learning_rate": 7.812120907799818e-06, + "loss": 1.7403, + "step": 49054 + }, + { + "epoch": 0.6132153303832596, + "grad_norm": 2.1086478233337402, + "learning_rate": 7.811269370388466e-06, + "loss": 0.903, + "step": 49056 + }, + { + "epoch": 0.6132403310082752, + "grad_norm": 0.3781368136405945, + "learning_rate": 7.810417849646177e-06, + "loss": 0.0596, + "step": 49058 + }, + { + "epoch": 0.6132653316332908, + "grad_norm": 7.658726692199707, + "learning_rate": 7.809566345579433e-06, + "loss": 2.2625, + "step": 49060 + }, + { + "epoch": 0.6132903322583064, + "grad_norm": 3.4536163806915283, + "learning_rate": 7.808714858194724e-06, + "loss": 1.2794, + "step": 49062 + }, + { + "epoch": 0.6133153328833221, + "grad_norm": 2.1740589141845703, + "learning_rate": 7.807863387498538e-06, + "loss": 0.9321, + "step": 49064 + }, + { + "epoch": 0.6133403335083377, + "grad_norm": 2.5414555072784424, + "learning_rate": 7.807011933497347e-06, + "loss": 0.533, + "step": 49066 + }, + { + "epoch": 0.6133653341333534, + "grad_norm": 3.2582011222839355, + "learning_rate": 7.806160496197646e-06, + "loss": 1.4741, + "step": 49068 + }, + { + "epoch": 0.6133903347583689, + "grad_norm": 7.748077869415283, + "learning_rate": 7.805309075605914e-06, + "loss": 0.7989, + "step": 49070 + }, + { + "epoch": 0.6134153353833846, + "grad_norm": 2.8761062622070312, + "learning_rate": 7.80445767172864e-06, + "loss": 0.8215, + "step": 49072 + }, + { + "epoch": 0.6134403360084002, + "grad_norm": 4.809814929962158, + "learning_rate": 7.803606284572307e-06, + "loss": 1.1062, + "step": 49074 + }, + { + "epoch": 0.6134653366334158, + "grad_norm": 4.911978721618652, + "learning_rate": 7.802754914143398e-06, + "loss": 1.8297, + "step": 49076 + }, + { + "epoch": 0.6134903372584315, + "grad_norm": 3.8529255390167236, + "learning_rate": 7.801903560448395e-06, + "loss": 0.5951, + "step": 49078 + }, + { + "epoch": 0.613515337883447, + "grad_norm": 4.538547039031982, + "learning_rate": 7.801052223493783e-06, + "loss": 0.3635, + "step": 49080 + }, + { + "epoch": 0.6135403385084627, + "grad_norm": 0.10261639207601547, + "learning_rate": 7.800200903286045e-06, + "loss": 0.3687, + "step": 49082 + }, + { + "epoch": 0.6135653391334783, + "grad_norm": 3.898482084274292, + "learning_rate": 7.79934959983167e-06, + "loss": 0.8701, + "step": 49084 + }, + { + "epoch": 0.613590339758494, + "grad_norm": 3.3579952716827393, + "learning_rate": 7.798498313137135e-06, + "loss": 1.4621, + "step": 49086 + }, + { + "epoch": 0.6136153403835096, + "grad_norm": 7.522811412811279, + "learning_rate": 7.797647043208924e-06, + "loss": 1.3149, + "step": 49088 + }, + { + "epoch": 0.6136403410085252, + "grad_norm": 3.199674129486084, + "learning_rate": 7.796795790053526e-06, + "loss": 0.8167, + "step": 49090 + }, + { + "epoch": 0.6136653416335408, + "grad_norm": 2.272251605987549, + "learning_rate": 7.795944553677417e-06, + "loss": 1.0493, + "step": 49092 + }, + { + "epoch": 0.6136903422585565, + "grad_norm": 5.952346324920654, + "learning_rate": 7.795093334087082e-06, + "loss": 1.8157, + "step": 49094 + }, + { + "epoch": 0.6137153428835721, + "grad_norm": 2.2604660987854004, + "learning_rate": 7.794242131289006e-06, + "loss": 0.3434, + "step": 49096 + }, + { + "epoch": 0.6137403435085877, + "grad_norm": 2.9656198024749756, + "learning_rate": 7.793390945289669e-06, + "loss": 0.8061, + "step": 49098 + }, + { + "epoch": 0.6137653441336033, + "grad_norm": 2.5920398235321045, + "learning_rate": 7.792539776095554e-06, + "loss": 1.054, + "step": 49100 + }, + { + "epoch": 0.6137903447586189, + "grad_norm": 3.7251405715942383, + "learning_rate": 7.79168862371315e-06, + "loss": 0.6102, + "step": 49102 + }, + { + "epoch": 0.6138153453836346, + "grad_norm": 0.0004394249408505857, + "learning_rate": 7.790837488148924e-06, + "loss": 0.0961, + "step": 49104 + }, + { + "epoch": 0.6138403460086502, + "grad_norm": 3.675790548324585, + "learning_rate": 7.789986369409371e-06, + "loss": 1.1898, + "step": 49106 + }, + { + "epoch": 0.6138653466336659, + "grad_norm": 2.635044574737549, + "learning_rate": 7.789135267500969e-06, + "loss": 1.5188, + "step": 49108 + }, + { + "epoch": 0.6138903472586814, + "grad_norm": 5.85345983505249, + "learning_rate": 7.7882841824302e-06, + "loss": 1.2656, + "step": 49110 + }, + { + "epoch": 0.6139153478836971, + "grad_norm": 3.27705717086792, + "learning_rate": 7.787433114203546e-06, + "loss": 0.8051, + "step": 49112 + }, + { + "epoch": 0.6139403485087127, + "grad_norm": 0.7013858556747437, + "learning_rate": 7.78658206282749e-06, + "loss": 1.1975, + "step": 49114 + }, + { + "epoch": 0.6139653491337284, + "grad_norm": 2.798981189727783, + "learning_rate": 7.785731028308512e-06, + "loss": 1.5034, + "step": 49116 + }, + { + "epoch": 0.613990349758744, + "grad_norm": 2.151263475418091, + "learning_rate": 7.784880010653092e-06, + "loss": 0.699, + "step": 49118 + }, + { + "epoch": 0.6140153503837595, + "grad_norm": 10.041163444519043, + "learning_rate": 7.784029009867713e-06, + "loss": 0.761, + "step": 49120 + }, + { + "epoch": 0.6140403510087752, + "grad_norm": 3.672694444656372, + "learning_rate": 7.783178025958855e-06, + "loss": 1.2706, + "step": 49122 + }, + { + "epoch": 0.6140653516337908, + "grad_norm": 0.10891681909561157, + "learning_rate": 7.782327058933e-06, + "loss": 0.8234, + "step": 49124 + }, + { + "epoch": 0.6140903522588065, + "grad_norm": 5.209815979003906, + "learning_rate": 7.781476108796628e-06, + "loss": 0.7219, + "step": 49126 + }, + { + "epoch": 0.6141153528838221, + "grad_norm": 2.653747081756592, + "learning_rate": 7.780625175556223e-06, + "loss": 1.0641, + "step": 49128 + }, + { + "epoch": 0.6141403535088377, + "grad_norm": 2.4930601119995117, + "learning_rate": 7.77977425921826e-06, + "loss": 0.4362, + "step": 49130 + }, + { + "epoch": 0.6141653541338533, + "grad_norm": 4.062591075897217, + "learning_rate": 7.778923359789224e-06, + "loss": 0.6125, + "step": 49132 + }, + { + "epoch": 0.614190354758869, + "grad_norm": 2.355397939682007, + "learning_rate": 7.778072477275592e-06, + "loss": 0.1162, + "step": 49134 + }, + { + "epoch": 0.6142153553838846, + "grad_norm": 0.7221726775169373, + "learning_rate": 7.777221611683847e-06, + "loss": 0.0244, + "step": 49136 + }, + { + "epoch": 0.6142403560089003, + "grad_norm": 3.708279848098755, + "learning_rate": 7.776370763020466e-06, + "loss": 1.9108, + "step": 49138 + }, + { + "epoch": 0.6142653566339158, + "grad_norm": 1.7153767347335815, + "learning_rate": 7.775519931291933e-06, + "loss": 1.197, + "step": 49140 + }, + { + "epoch": 0.6142903572589314, + "grad_norm": 3.268219470977783, + "learning_rate": 7.774669116504725e-06, + "loss": 1.4478, + "step": 49142 + }, + { + "epoch": 0.6143153578839471, + "grad_norm": 4.350611209869385, + "learning_rate": 7.773818318665322e-06, + "loss": 1.4002, + "step": 49144 + }, + { + "epoch": 0.6143403585089627, + "grad_norm": 3.400221109390259, + "learning_rate": 7.772967537780205e-06, + "loss": 1.3402, + "step": 49146 + }, + { + "epoch": 0.6143653591339784, + "grad_norm": 2.895791530609131, + "learning_rate": 7.772116773855852e-06, + "loss": 2.1633, + "step": 49148 + }, + { + "epoch": 0.6143903597589939, + "grad_norm": 0.015112101100385189, + "learning_rate": 7.77126602689874e-06, + "loss": 0.7502, + "step": 49150 + }, + { + "epoch": 0.6144153603840096, + "grad_norm": 2.8202807903289795, + "learning_rate": 7.770415296915352e-06, + "loss": 1.0549, + "step": 49152 + }, + { + "epoch": 0.6144403610090252, + "grad_norm": 4.536965370178223, + "learning_rate": 7.76956458391217e-06, + "loss": 2.1215, + "step": 49154 + }, + { + "epoch": 0.6144653616340409, + "grad_norm": 7.061472415924072, + "learning_rate": 7.768713887895664e-06, + "loss": 0.75, + "step": 49156 + }, + { + "epoch": 0.6144903622590565, + "grad_norm": 3.3394346237182617, + "learning_rate": 7.767863208872319e-06, + "loss": 0.6834, + "step": 49158 + }, + { + "epoch": 0.614515362884072, + "grad_norm": 1.6171481609344482, + "learning_rate": 7.767012546848611e-06, + "loss": 0.1967, + "step": 49160 + }, + { + "epoch": 0.6145403635090877, + "grad_norm": 2.9935507774353027, + "learning_rate": 7.76616190183102e-06, + "loss": 1.0005, + "step": 49162 + }, + { + "epoch": 0.6145653641341033, + "grad_norm": 2.705152988433838, + "learning_rate": 7.765311273826025e-06, + "loss": 1.0353, + "step": 49164 + }, + { + "epoch": 0.614590364759119, + "grad_norm": 0.0007084186072461307, + "learning_rate": 7.764460662840104e-06, + "loss": 0.8841, + "step": 49166 + }, + { + "epoch": 0.6146153653841346, + "grad_norm": 3.005634307861328, + "learning_rate": 7.763610068879734e-06, + "loss": 0.7446, + "step": 49168 + }, + { + "epoch": 0.6146403660091502, + "grad_norm": 2.7564468383789062, + "learning_rate": 7.762759491951393e-06, + "loss": 1.2073, + "step": 49170 + }, + { + "epoch": 0.6146653666341658, + "grad_norm": 3.8480381965637207, + "learning_rate": 7.761908932061558e-06, + "loss": 1.9105, + "step": 49172 + }, + { + "epoch": 0.6146903672591815, + "grad_norm": 2.669567584991455, + "learning_rate": 7.761058389216709e-06, + "loss": 0.5925, + "step": 49174 + }, + { + "epoch": 0.6147153678841971, + "grad_norm": 2.820906162261963, + "learning_rate": 7.760207863423324e-06, + "loss": 1.5729, + "step": 49176 + }, + { + "epoch": 0.6147403685092128, + "grad_norm": 4.263150691986084, + "learning_rate": 7.759357354687877e-06, + "loss": 1.1083, + "step": 49178 + }, + { + "epoch": 0.6147653691342283, + "grad_norm": 5.948890209197998, + "learning_rate": 7.75850686301685e-06, + "loss": 1.4652, + "step": 49180 + }, + { + "epoch": 0.614790369759244, + "grad_norm": 2.5229785442352295, + "learning_rate": 7.757656388416717e-06, + "loss": 0.8615, + "step": 49182 + }, + { + "epoch": 0.6148153703842596, + "grad_norm": 0.0008641834720037878, + "learning_rate": 7.756805930893954e-06, + "loss": 0.2623, + "step": 49184 + }, + { + "epoch": 0.6148403710092752, + "grad_norm": 3.7976176738739014, + "learning_rate": 7.755955490455041e-06, + "loss": 1.3349, + "step": 49186 + }, + { + "epoch": 0.6148653716342909, + "grad_norm": 0.00044430160778574646, + "learning_rate": 7.755105067106455e-06, + "loss": 0.0511, + "step": 49188 + }, + { + "epoch": 0.6148903722593064, + "grad_norm": 4.819683074951172, + "learning_rate": 7.754254660854669e-06, + "loss": 0.6789, + "step": 49190 + }, + { + "epoch": 0.6149153728843221, + "grad_norm": 2.2022783756256104, + "learning_rate": 7.753404271706162e-06, + "loss": 1.6073, + "step": 49192 + }, + { + "epoch": 0.6149403735093377, + "grad_norm": 3.0835273265838623, + "learning_rate": 7.752553899667413e-06, + "loss": 0.235, + "step": 49194 + }, + { + "epoch": 0.6149653741343534, + "grad_norm": 2.5850164890289307, + "learning_rate": 7.751703544744893e-06, + "loss": 0.5842, + "step": 49196 + }, + { + "epoch": 0.614990374759369, + "grad_norm": 4.826389789581299, + "learning_rate": 7.750853206945083e-06, + "loss": 1.639, + "step": 49198 + }, + { + "epoch": 0.6150153753843846, + "grad_norm": 3.717926263809204, + "learning_rate": 7.750002886274455e-06, + "loss": 1.4336, + "step": 49200 + }, + { + "epoch": 0.6150403760094002, + "grad_norm": 3.4189200401306152, + "learning_rate": 7.749152582739488e-06, + "loss": 0.8368, + "step": 49202 + }, + { + "epoch": 0.6150653766344159, + "grad_norm": 1.9919052124023438, + "learning_rate": 7.748302296346656e-06, + "loss": 0.4823, + "step": 49204 + }, + { + "epoch": 0.6150903772594315, + "grad_norm": 6.353165149688721, + "learning_rate": 7.747452027102436e-06, + "loss": 0.8324, + "step": 49206 + }, + { + "epoch": 0.6151153778844471, + "grad_norm": 0.30935990810394287, + "learning_rate": 7.746601775013304e-06, + "loss": 0.4205, + "step": 49208 + }, + { + "epoch": 0.6151403785094627, + "grad_norm": 2.7991390228271484, + "learning_rate": 7.745751540085731e-06, + "loss": 0.7614, + "step": 49210 + }, + { + "epoch": 0.6151653791344783, + "grad_norm": 0.0024708460550755262, + "learning_rate": 7.744901322326196e-06, + "loss": 0.0245, + "step": 49212 + }, + { + "epoch": 0.615190379759494, + "grad_norm": 2.134002447128296, + "learning_rate": 7.744051121741176e-06, + "loss": 1.7573, + "step": 49214 + }, + { + "epoch": 0.6152153803845096, + "grad_norm": 1.7057558298110962, + "learning_rate": 7.743200938337142e-06, + "loss": 0.2997, + "step": 49216 + }, + { + "epoch": 0.6152403810095253, + "grad_norm": 1.700433611869812, + "learning_rate": 7.742350772120571e-06, + "loss": 0.4028, + "step": 49218 + }, + { + "epoch": 0.6152653816345408, + "grad_norm": 4.644800662994385, + "learning_rate": 7.74150062309794e-06, + "loss": 1.4454, + "step": 49220 + }, + { + "epoch": 0.6152903822595565, + "grad_norm": 2.6975715160369873, + "learning_rate": 7.740650491275716e-06, + "loss": 1.3348, + "step": 49222 + }, + { + "epoch": 0.6153153828845721, + "grad_norm": 1.575036883354187, + "learning_rate": 7.73980037666038e-06, + "loss": 0.5909, + "step": 49224 + }, + { + "epoch": 0.6153403835095878, + "grad_norm": 4.811244487762451, + "learning_rate": 7.738950279258406e-06, + "loss": 1.5397, + "step": 49226 + }, + { + "epoch": 0.6153653841346034, + "grad_norm": 5.895519256591797, + "learning_rate": 7.738100199076266e-06, + "loss": 1.0909, + "step": 49228 + }, + { + "epoch": 0.6153903847596189, + "grad_norm": 3.3649306297302246, + "learning_rate": 7.737250136120434e-06, + "loss": 0.3363, + "step": 49230 + }, + { + "epoch": 0.6154153853846346, + "grad_norm": 7.245418071746826, + "learning_rate": 7.736400090397385e-06, + "loss": 2.7207, + "step": 49232 + }, + { + "epoch": 0.6154403860096502, + "grad_norm": 5.178823471069336, + "learning_rate": 7.735550061913596e-06, + "loss": 2.5239, + "step": 49234 + }, + { + "epoch": 0.6154653866346659, + "grad_norm": 2.5313549041748047, + "learning_rate": 7.734700050675536e-06, + "loss": 2.3266, + "step": 49236 + }, + { + "epoch": 0.6154903872596815, + "grad_norm": 4.503298282623291, + "learning_rate": 7.73385005668968e-06, + "loss": 1.6122, + "step": 49238 + }, + { + "epoch": 0.6155153878846971, + "grad_norm": 2.5391623973846436, + "learning_rate": 7.733000079962499e-06, + "loss": 0.5397, + "step": 49240 + }, + { + "epoch": 0.6155403885097127, + "grad_norm": 0.001244699815288186, + "learning_rate": 7.732150120500471e-06, + "loss": 0.0, + "step": 49242 + }, + { + "epoch": 0.6155653891347284, + "grad_norm": 3.510772705078125, + "learning_rate": 7.731300178310066e-06, + "loss": 0.9062, + "step": 49244 + }, + { + "epoch": 0.615590389759744, + "grad_norm": 2.2143795490264893, + "learning_rate": 7.730450253397761e-06, + "loss": 1.0733, + "step": 49246 + }, + { + "epoch": 0.6156153903847597, + "grad_norm": 1.2877721786499023, + "learning_rate": 7.729600345770023e-06, + "loss": 0.4082, + "step": 49248 + }, + { + "epoch": 0.6156403910097752, + "grad_norm": 2.734801769256592, + "learning_rate": 7.728750455433328e-06, + "loss": 0.6417, + "step": 49250 + }, + { + "epoch": 0.6156653916347908, + "grad_norm": 1.506999135017395, + "learning_rate": 7.727900582394148e-06, + "loss": 0.1029, + "step": 49252 + }, + { + "epoch": 0.6156903922598065, + "grad_norm": 2.762622117996216, + "learning_rate": 7.727050726658958e-06, + "loss": 0.611, + "step": 49254 + }, + { + "epoch": 0.6157153928848221, + "grad_norm": 3.050438404083252, + "learning_rate": 7.726200888234225e-06, + "loss": 0.9013, + "step": 49256 + }, + { + "epoch": 0.6157403935098378, + "grad_norm": 4.262787818908691, + "learning_rate": 7.725351067126428e-06, + "loss": 0.7393, + "step": 49258 + }, + { + "epoch": 0.6157653941348533, + "grad_norm": 5.689241409301758, + "learning_rate": 7.724501263342035e-06, + "loss": 1.0716, + "step": 49260 + }, + { + "epoch": 0.615790394759869, + "grad_norm": 0.3579293489456177, + "learning_rate": 7.723651476887518e-06, + "loss": 0.7308, + "step": 49262 + }, + { + "epoch": 0.6158153953848846, + "grad_norm": 2.255152940750122, + "learning_rate": 7.722801707769348e-06, + "loss": 1.4986, + "step": 49264 + }, + { + "epoch": 0.6158403960099003, + "grad_norm": 2.8756985664367676, + "learning_rate": 7.721951955993998e-06, + "loss": 0.9339, + "step": 49266 + }, + { + "epoch": 0.6158653966349159, + "grad_norm": 2.5330734252929688, + "learning_rate": 7.72110222156794e-06, + "loss": 1.2274, + "step": 49268 + }, + { + "epoch": 0.6158903972599314, + "grad_norm": 1.0340962409973145, + "learning_rate": 7.720252504497648e-06, + "loss": 0.7217, + "step": 49270 + }, + { + "epoch": 0.6159153978849471, + "grad_norm": 2.8922195434570312, + "learning_rate": 7.71940280478959e-06, + "loss": 1.248, + "step": 49272 + }, + { + "epoch": 0.6159403985099627, + "grad_norm": 6.4214982986450195, + "learning_rate": 7.718553122450236e-06, + "loss": 2.6341, + "step": 49274 + }, + { + "epoch": 0.6159653991349784, + "grad_norm": 3.9324512481689453, + "learning_rate": 7.717703457486058e-06, + "loss": 1.7344, + "step": 49276 + }, + { + "epoch": 0.615990399759994, + "grad_norm": 3.9843552112579346, + "learning_rate": 7.71685380990353e-06, + "loss": 1.4197, + "step": 49278 + }, + { + "epoch": 0.6160154003850096, + "grad_norm": 5.624819755554199, + "learning_rate": 7.716004179709119e-06, + "loss": 1.3892, + "step": 49280 + }, + { + "epoch": 0.6160404010100252, + "grad_norm": 1.5292009115219116, + "learning_rate": 7.715154566909297e-06, + "loss": 0.7781, + "step": 49282 + }, + { + "epoch": 0.6160654016350409, + "grad_norm": 4.626553058624268, + "learning_rate": 7.714304971510535e-06, + "loss": 1.6903, + "step": 49284 + }, + { + "epoch": 0.6160904022600565, + "grad_norm": 0.002970051486045122, + "learning_rate": 7.713455393519305e-06, + "loss": 0.7611, + "step": 49286 + }, + { + "epoch": 0.6161154028850722, + "grad_norm": 5.112102031707764, + "learning_rate": 7.712605832942073e-06, + "loss": 1.3601, + "step": 49288 + }, + { + "epoch": 0.6161404035100877, + "grad_norm": 4.672636032104492, + "learning_rate": 7.711756289785312e-06, + "loss": 0.615, + "step": 49290 + }, + { + "epoch": 0.6161654041351033, + "grad_norm": 0.09992015361785889, + "learning_rate": 7.710906764055493e-06, + "loss": 0.1531, + "step": 49292 + }, + { + "epoch": 0.616190404760119, + "grad_norm": 0.0026680564042180777, + "learning_rate": 7.710057255759083e-06, + "loss": 0.5956, + "step": 49294 + }, + { + "epoch": 0.6162154053851346, + "grad_norm": 4.015212535858154, + "learning_rate": 7.709207764902552e-06, + "loss": 1.282, + "step": 49296 + }, + { + "epoch": 0.6162404060101503, + "grad_norm": 0.0011633917456492782, + "learning_rate": 7.708358291492375e-06, + "loss": 0.4878, + "step": 49298 + }, + { + "epoch": 0.6162654066351658, + "grad_norm": 0.017858393490314484, + "learning_rate": 7.707508835535014e-06, + "loss": 0.0628, + "step": 49300 + }, + { + "epoch": 0.6162904072601815, + "grad_norm": 0.3569207489490509, + "learning_rate": 7.706659397036941e-06, + "loss": 0.0079, + "step": 49302 + }, + { + "epoch": 0.6163154078851971, + "grad_norm": 3.4099068641662598, + "learning_rate": 7.705809976004626e-06, + "loss": 0.8535, + "step": 49304 + }, + { + "epoch": 0.6163404085102128, + "grad_norm": 4.662890434265137, + "learning_rate": 7.704960572444538e-06, + "loss": 0.6887, + "step": 49306 + }, + { + "epoch": 0.6163654091352284, + "grad_norm": 2.6789474487304688, + "learning_rate": 7.704111186363145e-06, + "loss": 0.5332, + "step": 49308 + }, + { + "epoch": 0.616390409760244, + "grad_norm": 5.269096374511719, + "learning_rate": 7.703261817766915e-06, + "loss": 1.144, + "step": 49310 + }, + { + "epoch": 0.6164154103852596, + "grad_norm": 6.8386945724487305, + "learning_rate": 7.702412466662325e-06, + "loss": 1.0912, + "step": 49312 + }, + { + "epoch": 0.6164404110102752, + "grad_norm": 2.806670665740967, + "learning_rate": 7.701563133055831e-06, + "loss": 0.8361, + "step": 49314 + }, + { + "epoch": 0.6164654116352909, + "grad_norm": 1.5360016822814941, + "learning_rate": 7.700713816953909e-06, + "loss": 0.0748, + "step": 49316 + }, + { + "epoch": 0.6164904122603065, + "grad_norm": 1.315207839012146, + "learning_rate": 7.699864518363026e-06, + "loss": 0.0228, + "step": 49318 + }, + { + "epoch": 0.6165154128853221, + "grad_norm": 1.4208126068115234, + "learning_rate": 7.699015237289648e-06, + "loss": 0.8047, + "step": 49320 + }, + { + "epoch": 0.6165404135103377, + "grad_norm": 3.798624038696289, + "learning_rate": 7.698165973740243e-06, + "loss": 0.4873, + "step": 49322 + }, + { + "epoch": 0.6165654141353534, + "grad_norm": 5.158833980560303, + "learning_rate": 7.697316727721286e-06, + "loss": 0.4004, + "step": 49324 + }, + { + "epoch": 0.616590414760369, + "grad_norm": 0.0035713608376681805, + "learning_rate": 7.696467499239235e-06, + "loss": 0.0356, + "step": 49326 + }, + { + "epoch": 0.6166154153853847, + "grad_norm": 3.502805233001709, + "learning_rate": 7.695618288300563e-06, + "loss": 0.7167, + "step": 49328 + }, + { + "epoch": 0.6166404160104002, + "grad_norm": 5.145775318145752, + "learning_rate": 7.694769094911736e-06, + "loss": 1.7679, + "step": 49330 + }, + { + "epoch": 0.6166654166354159, + "grad_norm": 2.1524577140808105, + "learning_rate": 7.69391991907922e-06, + "loss": 0.716, + "step": 49332 + }, + { + "epoch": 0.6166904172604315, + "grad_norm": 0.0014650525990873575, + "learning_rate": 7.693070760809483e-06, + "loss": 0.6751, + "step": 49334 + }, + { + "epoch": 0.6167154178854471, + "grad_norm": 3.407221555709839, + "learning_rate": 7.692221620108995e-06, + "loss": 0.4321, + "step": 49336 + }, + { + "epoch": 0.6167404185104628, + "grad_norm": 11.46786880493164, + "learning_rate": 7.691372496984226e-06, + "loss": 1.625, + "step": 49338 + }, + { + "epoch": 0.6167654191354783, + "grad_norm": 2.8479738235473633, + "learning_rate": 7.690523391441631e-06, + "loss": 1.9193, + "step": 49340 + }, + { + "epoch": 0.616790419760494, + "grad_norm": 1.8319166898727417, + "learning_rate": 7.689674303487684e-06, + "loss": 1.0842, + "step": 49342 + }, + { + "epoch": 0.6168154203855096, + "grad_norm": 0.0019218656234443188, + "learning_rate": 7.688825233128851e-06, + "loss": 0.6862, + "step": 49344 + }, + { + "epoch": 0.6168404210105253, + "grad_norm": 2.852417230606079, + "learning_rate": 7.6879761803716e-06, + "loss": 1.4861, + "step": 49346 + }, + { + "epoch": 0.6168654216355409, + "grad_norm": 1.0342482328414917, + "learning_rate": 7.687127145222391e-06, + "loss": 0.2155, + "step": 49348 + }, + { + "epoch": 0.6168904222605565, + "grad_norm": 1.6867427825927734, + "learning_rate": 7.6862781276877e-06, + "loss": 0.6563, + "step": 49350 + }, + { + "epoch": 0.6169154228855721, + "grad_norm": 2.3453879356384277, + "learning_rate": 7.685429127773987e-06, + "loss": 0.8842, + "step": 49352 + }, + { + "epoch": 0.6169404235105878, + "grad_norm": 2.9133524894714355, + "learning_rate": 7.684580145487715e-06, + "loss": 0.4606, + "step": 49354 + }, + { + "epoch": 0.6169654241356034, + "grad_norm": 0.0006159428739920259, + "learning_rate": 7.683731180835355e-06, + "loss": 0.0, + "step": 49356 + }, + { + "epoch": 0.616990424760619, + "grad_norm": 1.7393354177474976, + "learning_rate": 7.68288223382337e-06, + "loss": 1.5801, + "step": 49358 + }, + { + "epoch": 0.6170154253856346, + "grad_norm": 4.296435832977295, + "learning_rate": 7.682033304458224e-06, + "loss": 1.0334, + "step": 49360 + }, + { + "epoch": 0.6170404260106502, + "grad_norm": 1.1648540496826172, + "learning_rate": 7.681184392746389e-06, + "loss": 0.0792, + "step": 49362 + }, + { + "epoch": 0.6170654266356659, + "grad_norm": 2.751718282699585, + "learning_rate": 7.680335498694328e-06, + "loss": 0.1194, + "step": 49364 + }, + { + "epoch": 0.6170904272606815, + "grad_norm": 3.420064926147461, + "learning_rate": 7.6794866223085e-06, + "loss": 0.4449, + "step": 49366 + }, + { + "epoch": 0.6171154278856972, + "grad_norm": 3.749371290206909, + "learning_rate": 7.678637763595373e-06, + "loss": 1.0329, + "step": 49368 + }, + { + "epoch": 0.6171404285107127, + "grad_norm": 2.5576717853546143, + "learning_rate": 7.677788922561413e-06, + "loss": 0.7719, + "step": 49370 + }, + { + "epoch": 0.6171654291357284, + "grad_norm": 3.348418951034546, + "learning_rate": 7.676940099213083e-06, + "loss": 1.239, + "step": 49372 + }, + { + "epoch": 0.617190429760744, + "grad_norm": 0.010191281326115131, + "learning_rate": 7.676091293556849e-06, + "loss": 0.658, + "step": 49374 + }, + { + "epoch": 0.6172154303857597, + "grad_norm": 3.3321330547332764, + "learning_rate": 7.675242505599181e-06, + "loss": 0.7823, + "step": 49376 + }, + { + "epoch": 0.6172404310107753, + "grad_norm": 2.404318332672119, + "learning_rate": 7.674393735346532e-06, + "loss": 0.4482, + "step": 49378 + }, + { + "epoch": 0.6172654316357908, + "grad_norm": 2.0023317337036133, + "learning_rate": 7.673544982805372e-06, + "loss": 0.8937, + "step": 49380 + }, + { + "epoch": 0.6172904322608065, + "grad_norm": 7.945763111114502, + "learning_rate": 7.672696247982164e-06, + "loss": 1.5688, + "step": 49382 + }, + { + "epoch": 0.6173154328858221, + "grad_norm": 2.928680419921875, + "learning_rate": 7.67184753088337e-06, + "loss": 1.2962, + "step": 49384 + }, + { + "epoch": 0.6173404335108378, + "grad_norm": 2.8821041584014893, + "learning_rate": 7.670998831515459e-06, + "loss": 1.6447, + "step": 49386 + }, + { + "epoch": 0.6173654341358534, + "grad_norm": 1.760779619216919, + "learning_rate": 7.67015014988489e-06, + "loss": 0.8302, + "step": 49388 + }, + { + "epoch": 0.617390434760869, + "grad_norm": 10.017433166503906, + "learning_rate": 7.669301485998134e-06, + "loss": 1.3082, + "step": 49390 + }, + { + "epoch": 0.6174154353858846, + "grad_norm": 4.853704452514648, + "learning_rate": 7.668452839861643e-06, + "loss": 0.6746, + "step": 49392 + }, + { + "epoch": 0.6174404360109003, + "grad_norm": 2.268542528152466, + "learning_rate": 7.667604211481886e-06, + "loss": 0.6745, + "step": 49394 + }, + { + "epoch": 0.6174654366359159, + "grad_norm": 3.8420212268829346, + "learning_rate": 7.666755600865323e-06, + "loss": 1.4089, + "step": 49396 + }, + { + "epoch": 0.6174904372609316, + "grad_norm": 3.9464914798736572, + "learning_rate": 7.665907008018422e-06, + "loss": 2.3374, + "step": 49398 + }, + { + "epoch": 0.6175154378859471, + "grad_norm": 4.710386276245117, + "learning_rate": 7.665058432947642e-06, + "loss": 0.8866, + "step": 49400 + }, + { + "epoch": 0.6175404385109627, + "grad_norm": 1.9848921298980713, + "learning_rate": 7.664209875659452e-06, + "loss": 1.2353, + "step": 49402 + }, + { + "epoch": 0.6175654391359784, + "grad_norm": 3.8141815662384033, + "learning_rate": 7.663361336160306e-06, + "loss": 0.9248, + "step": 49404 + }, + { + "epoch": 0.617590439760994, + "grad_norm": 3.450592279434204, + "learning_rate": 7.662512814456668e-06, + "loss": 0.2178, + "step": 49406 + }, + { + "epoch": 0.6176154403860097, + "grad_norm": 6.07808780670166, + "learning_rate": 7.661664310554999e-06, + "loss": 1.3541, + "step": 49408 + }, + { + "epoch": 0.6176404410110252, + "grad_norm": 1.1913065910339355, + "learning_rate": 7.660815824461768e-06, + "loss": 0.038, + "step": 49410 + }, + { + "epoch": 0.6176654416360409, + "grad_norm": 1.9232313632965088, + "learning_rate": 7.659967356183432e-06, + "loss": 1.1165, + "step": 49412 + }, + { + "epoch": 0.6176904422610565, + "grad_norm": 4.183655738830566, + "learning_rate": 7.659118905726454e-06, + "loss": 1.8316, + "step": 49414 + }, + { + "epoch": 0.6177154428860722, + "grad_norm": 3.0793607234954834, + "learning_rate": 7.6582704730973e-06, + "loss": 0.7282, + "step": 49416 + }, + { + "epoch": 0.6177404435110878, + "grad_norm": 2.48634672164917, + "learning_rate": 7.657422058302422e-06, + "loss": 1.1047, + "step": 49418 + }, + { + "epoch": 0.6177654441361033, + "grad_norm": 2.8632874488830566, + "learning_rate": 7.656573661348287e-06, + "loss": 0.8121, + "step": 49420 + }, + { + "epoch": 0.617790444761119, + "grad_norm": 5.743358135223389, + "learning_rate": 7.655725282241351e-06, + "loss": 1.2254, + "step": 49422 + }, + { + "epoch": 0.6178154453861346, + "grad_norm": 2.894104242324829, + "learning_rate": 7.654876920988085e-06, + "loss": 0.9496, + "step": 49424 + }, + { + "epoch": 0.6178404460111503, + "grad_norm": 5.717175483703613, + "learning_rate": 7.654028577594943e-06, + "loss": 1.1834, + "step": 49426 + }, + { + "epoch": 0.6178654466361659, + "grad_norm": 6.893511772155762, + "learning_rate": 7.653180252068393e-06, + "loss": 1.435, + "step": 49428 + }, + { + "epoch": 0.6178904472611815, + "grad_norm": 1.9347872734069824, + "learning_rate": 7.652331944414886e-06, + "loss": 1.4373, + "step": 49430 + }, + { + "epoch": 0.6179154478861971, + "grad_norm": 4.025126934051514, + "learning_rate": 7.651483654640885e-06, + "loss": 1.4417, + "step": 49432 + }, + { + "epoch": 0.6179404485112128, + "grad_norm": 3.5204696655273438, + "learning_rate": 7.650635382752852e-06, + "loss": 1.609, + "step": 49434 + }, + { + "epoch": 0.6179654491362284, + "grad_norm": 8.896255493164062, + "learning_rate": 7.64978712875725e-06, + "loss": 1.7132, + "step": 49436 + }, + { + "epoch": 0.6179904497612441, + "grad_norm": 2.3766024112701416, + "learning_rate": 7.648938892660536e-06, + "loss": 1.1161, + "step": 49438 + }, + { + "epoch": 0.6180154503862596, + "grad_norm": 1.583614706993103, + "learning_rate": 7.648090674469171e-06, + "loss": 0.4451, + "step": 49440 + }, + { + "epoch": 0.6180404510112752, + "grad_norm": 2.2022576332092285, + "learning_rate": 7.64724247418962e-06, + "loss": 1.2006, + "step": 49442 + }, + { + "epoch": 0.6180654516362909, + "grad_norm": 2.547278642654419, + "learning_rate": 7.646394291828333e-06, + "loss": 1.2279, + "step": 49444 + }, + { + "epoch": 0.6180904522613065, + "grad_norm": 0.0005577270640060306, + "learning_rate": 7.645546127391772e-06, + "loss": 0.4359, + "step": 49446 + }, + { + "epoch": 0.6181154528863222, + "grad_norm": 0.06744068115949631, + "learning_rate": 7.644697980886401e-06, + "loss": 0.71, + "step": 49448 + }, + { + "epoch": 0.6181404535113377, + "grad_norm": 3.0780792236328125, + "learning_rate": 7.643849852318677e-06, + "loss": 0.378, + "step": 49450 + }, + { + "epoch": 0.6181654541363534, + "grad_norm": 5.112419128417969, + "learning_rate": 7.643001741695058e-06, + "loss": 0.5849, + "step": 49452 + }, + { + "epoch": 0.618190454761369, + "grad_norm": 1.0509153604507446, + "learning_rate": 7.64215364902201e-06, + "loss": 0.4976, + "step": 49454 + }, + { + "epoch": 0.6182154553863847, + "grad_norm": 2.4045801162719727, + "learning_rate": 7.641305574305983e-06, + "loss": 0.4782, + "step": 49456 + }, + { + "epoch": 0.6182404560114003, + "grad_norm": 1.6289137601852417, + "learning_rate": 7.640457517553436e-06, + "loss": 1.2078, + "step": 49458 + }, + { + "epoch": 0.6182654566364159, + "grad_norm": 2.519257068634033, + "learning_rate": 7.639609478770833e-06, + "loss": 0.5044, + "step": 49460 + }, + { + "epoch": 0.6182904572614315, + "grad_norm": 3.582190752029419, + "learning_rate": 7.63876145796463e-06, + "loss": 1.0079, + "step": 49462 + }, + { + "epoch": 0.6183154578864472, + "grad_norm": 2.720494031906128, + "learning_rate": 7.637913455141286e-06, + "loss": 0.5373, + "step": 49464 + }, + { + "epoch": 0.6183404585114628, + "grad_norm": 2.822420358657837, + "learning_rate": 7.63706547030726e-06, + "loss": 0.6174, + "step": 49466 + }, + { + "epoch": 0.6183654591364784, + "grad_norm": 1.4233366250991821, + "learning_rate": 7.636217503469013e-06, + "loss": 0.3099, + "step": 49468 + }, + { + "epoch": 0.618390459761494, + "grad_norm": 0.0006956331199035048, + "learning_rate": 7.635369554632991e-06, + "loss": 0.5433, + "step": 49470 + }, + { + "epoch": 0.6184154603865096, + "grad_norm": 2.454022169113159, + "learning_rate": 7.634521623805664e-06, + "loss": 1.1326, + "step": 49472 + }, + { + "epoch": 0.6184404610115253, + "grad_norm": 4.812450408935547, + "learning_rate": 7.633673710993488e-06, + "loss": 1.0203, + "step": 49474 + }, + { + "epoch": 0.6184654616365409, + "grad_norm": 0.9866482019424438, + "learning_rate": 7.632825816202916e-06, + "loss": 0.8718, + "step": 49476 + }, + { + "epoch": 0.6184904622615566, + "grad_norm": 0.17213653028011322, + "learning_rate": 7.631977939440408e-06, + "loss": 0.4747, + "step": 49478 + }, + { + "epoch": 0.6185154628865721, + "grad_norm": 4.605895519256592, + "learning_rate": 7.63113008071242e-06, + "loss": 1.1615, + "step": 49480 + }, + { + "epoch": 0.6185404635115878, + "grad_norm": 3.744264841079712, + "learning_rate": 7.630282240025415e-06, + "loss": 2.4072, + "step": 49482 + }, + { + "epoch": 0.6185654641366034, + "grad_norm": 0.27607986330986023, + "learning_rate": 7.62943441738584e-06, + "loss": 0.6248, + "step": 49484 + }, + { + "epoch": 0.618590464761619, + "grad_norm": 3.3303260803222656, + "learning_rate": 7.628586612800159e-06, + "loss": 0.9286, + "step": 49486 + }, + { + "epoch": 0.6186154653866347, + "grad_norm": 3.938560724258423, + "learning_rate": 7.627738826274827e-06, + "loss": 1.1344, + "step": 49488 + }, + { + "epoch": 0.6186404660116502, + "grad_norm": 4.2036333084106445, + "learning_rate": 7.626891057816302e-06, + "loss": 1.2382, + "step": 49490 + }, + { + "epoch": 0.6186654666366659, + "grad_norm": 4.151208877563477, + "learning_rate": 7.626043307431037e-06, + "loss": 1.6791, + "step": 49492 + }, + { + "epoch": 0.6186904672616815, + "grad_norm": 1.694845199584961, + "learning_rate": 7.6251955751254956e-06, + "loss": 0.2767, + "step": 49494 + }, + { + "epoch": 0.6187154678866972, + "grad_norm": 4.463265419006348, + "learning_rate": 7.624347860906122e-06, + "loss": 0.9945, + "step": 49496 + }, + { + "epoch": 0.6187404685117128, + "grad_norm": 4.367156982421875, + "learning_rate": 7.623500164779383e-06, + "loss": 1.1406, + "step": 49498 + }, + { + "epoch": 0.6187654691367284, + "grad_norm": 8.28200912475586, + "learning_rate": 7.622652486751728e-06, + "loss": 2.064, + "step": 49500 + }, + { + "epoch": 0.618790469761744, + "grad_norm": 4.085459232330322, + "learning_rate": 7.621804826829617e-06, + "loss": 0.5242, + "step": 49502 + }, + { + "epoch": 0.6188154703867597, + "grad_norm": 2.935370445251465, + "learning_rate": 7.620957185019504e-06, + "loss": 0.5069, + "step": 49504 + }, + { + "epoch": 0.6188404710117753, + "grad_norm": 5.689436912536621, + "learning_rate": 7.6201095613278445e-06, + "loss": 1.7858, + "step": 49506 + }, + { + "epoch": 0.618865471636791, + "grad_norm": 4.0915350914001465, + "learning_rate": 7.619261955761097e-06, + "loss": 1.2091, + "step": 49508 + }, + { + "epoch": 0.6188904722618065, + "grad_norm": 3.1330971717834473, + "learning_rate": 7.618414368325711e-06, + "loss": 1.0018, + "step": 49510 + }, + { + "epoch": 0.6189154728868221, + "grad_norm": 4.465839862823486, + "learning_rate": 7.6175667990281444e-06, + "loss": 0.2131, + "step": 49512 + }, + { + "epoch": 0.6189404735118378, + "grad_norm": 4.281044960021973, + "learning_rate": 7.616719247874851e-06, + "loss": 0.677, + "step": 49514 + }, + { + "epoch": 0.6189654741368534, + "grad_norm": 1.0936349630355835, + "learning_rate": 7.615871714872288e-06, + "loss": 0.4101, + "step": 49516 + }, + { + "epoch": 0.6189904747618691, + "grad_norm": 5.090686321258545, + "learning_rate": 7.615024200026909e-06, + "loss": 1.05, + "step": 49518 + }, + { + "epoch": 0.6190154753868846, + "grad_norm": 0.004205421078950167, + "learning_rate": 7.614176703345171e-06, + "loss": 0.5205, + "step": 49520 + }, + { + "epoch": 0.6190404760119003, + "grad_norm": 0.037278320640325546, + "learning_rate": 7.6133292248335225e-06, + "loss": 0.9829, + "step": 49522 + }, + { + "epoch": 0.6190654766369159, + "grad_norm": 0.00039749484858475626, + "learning_rate": 7.612481764498422e-06, + "loss": 0.1082, + "step": 49524 + }, + { + "epoch": 0.6190904772619316, + "grad_norm": 3.5542972087860107, + "learning_rate": 7.611634322346322e-06, + "loss": 0.6544, + "step": 49526 + }, + { + "epoch": 0.6191154778869472, + "grad_norm": 5.139448642730713, + "learning_rate": 7.610786898383678e-06, + "loss": 0.7817, + "step": 49528 + }, + { + "epoch": 0.6191404785119627, + "grad_norm": 5.937077045440674, + "learning_rate": 7.609939492616943e-06, + "loss": 1.2499, + "step": 49530 + }, + { + "epoch": 0.6191654791369784, + "grad_norm": 3.189751386642456, + "learning_rate": 7.609092105052569e-06, + "loss": 0.9562, + "step": 49532 + }, + { + "epoch": 0.619190479761994, + "grad_norm": 0.9654078483581543, + "learning_rate": 7.608244735697015e-06, + "loss": 0.0473, + "step": 49534 + }, + { + "epoch": 0.6192154803870097, + "grad_norm": 3.2815520763397217, + "learning_rate": 7.607397384556729e-06, + "loss": 0.3154, + "step": 49536 + }, + { + "epoch": 0.6192404810120253, + "grad_norm": 0.19103898108005524, + "learning_rate": 7.606550051638166e-06, + "loss": 0.3207, + "step": 49538 + }, + { + "epoch": 0.6192654816370409, + "grad_norm": 4.861411094665527, + "learning_rate": 7.6057027369477795e-06, + "loss": 1.1875, + "step": 49540 + }, + { + "epoch": 0.6192904822620565, + "grad_norm": 2.7820804119110107, + "learning_rate": 7.604855440492023e-06, + "loss": 0.6124, + "step": 49542 + }, + { + "epoch": 0.6193154828870722, + "grad_norm": 3.396965265274048, + "learning_rate": 7.6040081622773475e-06, + "loss": 1.1364, + "step": 49544 + }, + { + "epoch": 0.6193404835120878, + "grad_norm": 3.426347494125366, + "learning_rate": 7.603160902310209e-06, + "loss": 1.5192, + "step": 49546 + }, + { + "epoch": 0.6193654841371035, + "grad_norm": 6.670867443084717, + "learning_rate": 7.6023136605970575e-06, + "loss": 1.0764, + "step": 49548 + }, + { + "epoch": 0.619390484762119, + "grad_norm": 2.847978353500366, + "learning_rate": 7.601466437144345e-06, + "loss": 1.9348, + "step": 49550 + }, + { + "epoch": 0.6194154853871346, + "grad_norm": 11.771529197692871, + "learning_rate": 7.600619231958526e-06, + "loss": 2.0164, + "step": 49552 + }, + { + "epoch": 0.6194404860121503, + "grad_norm": 2.882063388824463, + "learning_rate": 7.5997720450460496e-06, + "loss": 0.6148, + "step": 49554 + }, + { + "epoch": 0.6194654866371659, + "grad_norm": 2.501751184463501, + "learning_rate": 7.598924876413372e-06, + "loss": 0.1318, + "step": 49556 + }, + { + "epoch": 0.6194904872621816, + "grad_norm": 5.313462257385254, + "learning_rate": 7.598077726066941e-06, + "loss": 1.0464, + "step": 49558 + }, + { + "epoch": 0.6195154878871971, + "grad_norm": 2.809337854385376, + "learning_rate": 7.597230594013214e-06, + "loss": 1.3793, + "step": 49560 + }, + { + "epoch": 0.6195404885122128, + "grad_norm": 0.9057309031486511, + "learning_rate": 7.5963834802586365e-06, + "loss": 0.0754, + "step": 49562 + }, + { + "epoch": 0.6195654891372284, + "grad_norm": 2.370830774307251, + "learning_rate": 7.595536384809663e-06, + "loss": 1.2807, + "step": 49564 + }, + { + "epoch": 0.6195904897622441, + "grad_norm": 0.0077800629660487175, + "learning_rate": 7.594689307672744e-06, + "loss": 0.0002, + "step": 49566 + }, + { + "epoch": 0.6196154903872597, + "grad_norm": 0.34542563557624817, + "learning_rate": 7.593842248854331e-06, + "loss": 0.6948, + "step": 49568 + }, + { + "epoch": 0.6196404910122753, + "grad_norm": 3.666808605194092, + "learning_rate": 7.592995208360874e-06, + "loss": 1.3327, + "step": 49570 + }, + { + "epoch": 0.6196654916372909, + "grad_norm": 2.6134071350097656, + "learning_rate": 7.592148186198829e-06, + "loss": 0.5619, + "step": 49572 + }, + { + "epoch": 0.6196904922623065, + "grad_norm": 6.433331489562988, + "learning_rate": 7.59130118237464e-06, + "loss": 2.1184, + "step": 49574 + }, + { + "epoch": 0.6197154928873222, + "grad_norm": 0.4115482568740845, + "learning_rate": 7.590454196894761e-06, + "loss": 0.7127, + "step": 49576 + }, + { + "epoch": 0.6197404935123378, + "grad_norm": 5.771332263946533, + "learning_rate": 7.589607229765641e-06, + "loss": 1.2139, + "step": 49578 + }, + { + "epoch": 0.6197654941373534, + "grad_norm": 7.249062538146973, + "learning_rate": 7.588760280993732e-06, + "loss": 0.5042, + "step": 49580 + }, + { + "epoch": 0.619790494762369, + "grad_norm": 2.389007568359375, + "learning_rate": 7.5879133505854854e-06, + "loss": 0.5321, + "step": 49582 + }, + { + "epoch": 0.6198154953873847, + "grad_norm": 3.0267553329467773, + "learning_rate": 7.587066438547348e-06, + "loss": 1.3124, + "step": 49584 + }, + { + "epoch": 0.6198404960124003, + "grad_norm": 3.5120506286621094, + "learning_rate": 7.586219544885774e-06, + "loss": 0.8812, + "step": 49586 + }, + { + "epoch": 0.619865496637416, + "grad_norm": 3.8180088996887207, + "learning_rate": 7.5853726696072075e-06, + "loss": 0.8776, + "step": 49588 + }, + { + "epoch": 0.6198904972624315, + "grad_norm": 2.332463264465332, + "learning_rate": 7.584525812718102e-06, + "loss": 1.2208, + "step": 49590 + }, + { + "epoch": 0.6199154978874472, + "grad_norm": 5.1733622550964355, + "learning_rate": 7.583678974224906e-06, + "loss": 1.7181, + "step": 49592 + }, + { + "epoch": 0.6199404985124628, + "grad_norm": 5.238697528839111, + "learning_rate": 7.58283215413407e-06, + "loss": 1.0693, + "step": 49594 + }, + { + "epoch": 0.6199654991374784, + "grad_norm": 3.349313497543335, + "learning_rate": 7.581985352452042e-06, + "loss": 0.731, + "step": 49596 + }, + { + "epoch": 0.6199904997624941, + "grad_norm": 3.7774438858032227, + "learning_rate": 7.581138569185274e-06, + "loss": 0.8367, + "step": 49598 + }, + { + "epoch": 0.6200155003875096, + "grad_norm": 4.6812744140625, + "learning_rate": 7.5802918043402095e-06, + "loss": 1.29, + "step": 49600 + }, + { + "epoch": 0.6200405010125253, + "grad_norm": 3.522372007369995, + "learning_rate": 7.5794450579233016e-06, + "loss": 0.7236, + "step": 49602 + }, + { + "epoch": 0.6200655016375409, + "grad_norm": 0.0006134907016530633, + "learning_rate": 7.578598329940997e-06, + "loss": 0.453, + "step": 49604 + }, + { + "epoch": 0.6200905022625566, + "grad_norm": 6.625421047210693, + "learning_rate": 7.577751620399746e-06, + "loss": 1.4233, + "step": 49606 + }, + { + "epoch": 0.6201155028875722, + "grad_norm": 2.11973237991333, + "learning_rate": 7.576904929305996e-06, + "loss": 0.9858, + "step": 49608 + }, + { + "epoch": 0.6201405035125878, + "grad_norm": 2.4810235500335693, + "learning_rate": 7.576058256666194e-06, + "loss": 0.0703, + "step": 49610 + }, + { + "epoch": 0.6201655041376034, + "grad_norm": 2.528017520904541, + "learning_rate": 7.5752116024867935e-06, + "loss": 0.7235, + "step": 49612 + }, + { + "epoch": 0.620190504762619, + "grad_norm": 4.693922519683838, + "learning_rate": 7.574364966774236e-06, + "loss": 1.3031, + "step": 49614 + }, + { + "epoch": 0.6202155053876347, + "grad_norm": 2.5716958045959473, + "learning_rate": 7.5735183495349716e-06, + "loss": 0.1357, + "step": 49616 + }, + { + "epoch": 0.6202405060126503, + "grad_norm": 3.2959072589874268, + "learning_rate": 7.572671750775447e-06, + "loss": 0.4906, + "step": 49618 + }, + { + "epoch": 0.6202655066376659, + "grad_norm": 4.499825954437256, + "learning_rate": 7.571825170502114e-06, + "loss": 1.0793, + "step": 49620 + }, + { + "epoch": 0.6202905072626815, + "grad_norm": 4.233190059661865, + "learning_rate": 7.570978608721416e-06, + "loss": 1.6917, + "step": 49622 + }, + { + "epoch": 0.6203155078876972, + "grad_norm": 3.4599192142486572, + "learning_rate": 7.570132065439801e-06, + "loss": 1.5981, + "step": 49624 + }, + { + "epoch": 0.6203405085127128, + "grad_norm": 2.1968443393707275, + "learning_rate": 7.569285540663717e-06, + "loss": 1.0694, + "step": 49626 + }, + { + "epoch": 0.6203655091377285, + "grad_norm": 3.8450076580047607, + "learning_rate": 7.568439034399611e-06, + "loss": 0.7617, + "step": 49628 + }, + { + "epoch": 0.620390509762744, + "grad_norm": 0.0018626217497512698, + "learning_rate": 7.567592546653928e-06, + "loss": 0.7954, + "step": 49630 + }, + { + "epoch": 0.6204155103877597, + "grad_norm": 0.34068959951400757, + "learning_rate": 7.566746077433117e-06, + "loss": 0.2619, + "step": 49632 + }, + { + "epoch": 0.6204405110127753, + "grad_norm": 4.90084981918335, + "learning_rate": 7.565899626743624e-06, + "loss": 0.9397, + "step": 49634 + }, + { + "epoch": 0.620465511637791, + "grad_norm": 0.000626644236035645, + "learning_rate": 7.565053194591894e-06, + "loss": 0.9129, + "step": 49636 + }, + { + "epoch": 0.6204905122628066, + "grad_norm": 4.078800678253174, + "learning_rate": 7.564206780984378e-06, + "loss": 0.9792, + "step": 49638 + }, + { + "epoch": 0.6205155128878221, + "grad_norm": 3.919062614440918, + "learning_rate": 7.563360385927515e-06, + "loss": 0.9856, + "step": 49640 + }, + { + "epoch": 0.6205405135128378, + "grad_norm": 6.735276222229004, + "learning_rate": 7.562514009427756e-06, + "loss": 2.0906, + "step": 49642 + }, + { + "epoch": 0.6205655141378534, + "grad_norm": 4.683187961578369, + "learning_rate": 7.5616676514915455e-06, + "loss": 1.6017, + "step": 49644 + }, + { + "epoch": 0.6205905147628691, + "grad_norm": 2.7100913524627686, + "learning_rate": 7.5608213121253294e-06, + "loss": 1.2303, + "step": 49646 + }, + { + "epoch": 0.6206155153878847, + "grad_norm": 5.000918865203857, + "learning_rate": 7.559974991335552e-06, + "loss": 1.7884, + "step": 49648 + }, + { + "epoch": 0.6206405160129003, + "grad_norm": 3.5094377994537354, + "learning_rate": 7.559128689128664e-06, + "loss": 0.8888, + "step": 49650 + }, + { + "epoch": 0.6206655166379159, + "grad_norm": 0.010007111355662346, + "learning_rate": 7.558282405511104e-06, + "loss": 0.6412, + "step": 49652 + }, + { + "epoch": 0.6206905172629316, + "grad_norm": 2.8991951942443848, + "learning_rate": 7.557436140489319e-06, + "loss": 0.9627, + "step": 49654 + }, + { + "epoch": 0.6207155178879472, + "grad_norm": 7.920064449310303, + "learning_rate": 7.5565898940697555e-06, + "loss": 0.6551, + "step": 49656 + }, + { + "epoch": 0.6207405185129629, + "grad_norm": 3.593501091003418, + "learning_rate": 7.555743666258859e-06, + "loss": 1.0537, + "step": 49658 + }, + { + "epoch": 0.6207655191379784, + "grad_norm": 2.703878164291382, + "learning_rate": 7.554897457063072e-06, + "loss": 1.1685, + "step": 49660 + }, + { + "epoch": 0.620790519762994, + "grad_norm": 1.83767831325531, + "learning_rate": 7.55405126648884e-06, + "loss": 0.8248, + "step": 49662 + }, + { + "epoch": 0.6208155203880097, + "grad_norm": 3.4873528480529785, + "learning_rate": 7.553205094542609e-06, + "loss": 1.0261, + "step": 49664 + }, + { + "epoch": 0.6208405210130253, + "grad_norm": 0.002801976166665554, + "learning_rate": 7.5523589412308215e-06, + "loss": 0.398, + "step": 49666 + }, + { + "epoch": 0.620865521638041, + "grad_norm": 0.002004226902499795, + "learning_rate": 7.5515128065599215e-06, + "loss": 0.4844, + "step": 49668 + }, + { + "epoch": 0.6208905222630565, + "grad_norm": 1.4265336990356445, + "learning_rate": 7.5506666905363544e-06, + "loss": 0.0934, + "step": 49670 + }, + { + "epoch": 0.6209155228880722, + "grad_norm": 2.831031560897827, + "learning_rate": 7.549820593166563e-06, + "loss": 0.4569, + "step": 49672 + }, + { + "epoch": 0.6209405235130878, + "grad_norm": 11.774229049682617, + "learning_rate": 7.54897451445699e-06, + "loss": 1.1523, + "step": 49674 + }, + { + "epoch": 0.6209655241381035, + "grad_norm": 1.557629108428955, + "learning_rate": 7.5481284544140855e-06, + "loss": 0.8631, + "step": 49676 + }, + { + "epoch": 0.6209905247631191, + "grad_norm": 2.0626978874206543, + "learning_rate": 7.547282413044283e-06, + "loss": 1.1597, + "step": 49678 + }, + { + "epoch": 0.6210155253881346, + "grad_norm": 5.817182540893555, + "learning_rate": 7.546436390354033e-06, + "loss": 2.0868, + "step": 49680 + }, + { + "epoch": 0.6210405260131503, + "grad_norm": 6.0887370109558105, + "learning_rate": 7.5455903863497745e-06, + "loss": 0.3694, + "step": 49682 + }, + { + "epoch": 0.6210655266381659, + "grad_norm": 2.5298614501953125, + "learning_rate": 7.544744401037954e-06, + "loss": 0.4879, + "step": 49684 + }, + { + "epoch": 0.6210905272631816, + "grad_norm": 3.106354236602783, + "learning_rate": 7.543898434425012e-06, + "loss": 0.7186, + "step": 49686 + }, + { + "epoch": 0.6211155278881972, + "grad_norm": 0.8250355124473572, + "learning_rate": 7.543052486517394e-06, + "loss": 0.0735, + "step": 49688 + }, + { + "epoch": 0.6211405285132128, + "grad_norm": 0.17164096236228943, + "learning_rate": 7.542206557321541e-06, + "loss": 0.0028, + "step": 49690 + }, + { + "epoch": 0.6211655291382284, + "grad_norm": 3.517291784286499, + "learning_rate": 7.541360646843894e-06, + "loss": 1.2628, + "step": 49692 + }, + { + "epoch": 0.6211905297632441, + "grad_norm": 1.5900514125823975, + "learning_rate": 7.540514755090896e-06, + "loss": 0.2555, + "step": 49694 + }, + { + "epoch": 0.6212155303882597, + "grad_norm": 2.937422275543213, + "learning_rate": 7.539668882068991e-06, + "loss": 0.2319, + "step": 49696 + }, + { + "epoch": 0.6212405310132754, + "grad_norm": 6.885311603546143, + "learning_rate": 7.538823027784619e-06, + "loss": 0.5909, + "step": 49698 + }, + { + "epoch": 0.6212655316382909, + "grad_norm": 2.654724359512329, + "learning_rate": 7.537977192244222e-06, + "loss": 1.4997, + "step": 49700 + }, + { + "epoch": 0.6212905322633065, + "grad_norm": 8.055004119873047, + "learning_rate": 7.537131375454246e-06, + "loss": 1.6023, + "step": 49702 + }, + { + "epoch": 0.6213155328883222, + "grad_norm": 3.6862714290618896, + "learning_rate": 7.536285577421128e-06, + "loss": 0.4993, + "step": 49704 + }, + { + "epoch": 0.6213405335133378, + "grad_norm": 1.869969129562378, + "learning_rate": 7.5354397981513085e-06, + "loss": 0.6788, + "step": 49706 + }, + { + "epoch": 0.6213655341383535, + "grad_norm": 3.232238292694092, + "learning_rate": 7.534594037651232e-06, + "loss": 0.6778, + "step": 49708 + }, + { + "epoch": 0.621390534763369, + "grad_norm": 3.620302200317383, + "learning_rate": 7.5337482959273375e-06, + "loss": 1.396, + "step": 49710 + }, + { + "epoch": 0.6214155353883847, + "grad_norm": 2.7103748321533203, + "learning_rate": 7.532902572986068e-06, + "loss": 0.242, + "step": 49712 + }, + { + "epoch": 0.6214405360134003, + "grad_norm": 6.911585807800293, + "learning_rate": 7.5320568688338625e-06, + "loss": 0.9303, + "step": 49714 + }, + { + "epoch": 0.621465536638416, + "grad_norm": 1.905761480331421, + "learning_rate": 7.531211183477169e-06, + "loss": 0.0828, + "step": 49716 + }, + { + "epoch": 0.6214905372634316, + "grad_norm": 2.3470444679260254, + "learning_rate": 7.530365516922417e-06, + "loss": 0.9292, + "step": 49718 + }, + { + "epoch": 0.6215155378884472, + "grad_norm": 0.5338162183761597, + "learning_rate": 7.529519869176052e-06, + "loss": 0.3948, + "step": 49720 + }, + { + "epoch": 0.6215405385134628, + "grad_norm": 2.900804042816162, + "learning_rate": 7.528674240244513e-06, + "loss": 1.0868, + "step": 49722 + }, + { + "epoch": 0.6215655391384785, + "grad_norm": 2.9427285194396973, + "learning_rate": 7.527828630134244e-06, + "loss": 0.559, + "step": 49724 + }, + { + "epoch": 0.6215905397634941, + "grad_norm": 2.5882601737976074, + "learning_rate": 7.526983038851679e-06, + "loss": 1.0047, + "step": 49726 + }, + { + "epoch": 0.6216155403885097, + "grad_norm": 4.142152309417725, + "learning_rate": 7.526137466403264e-06, + "loss": 0.3966, + "step": 49728 + }, + { + "epoch": 0.6216405410135253, + "grad_norm": 0.0005646315403282642, + "learning_rate": 7.52529191279544e-06, + "loss": 0.7013, + "step": 49730 + }, + { + "epoch": 0.6216655416385409, + "grad_norm": 0.30757176876068115, + "learning_rate": 7.52444637803464e-06, + "loss": 0.114, + "step": 49732 + }, + { + "epoch": 0.6216905422635566, + "grad_norm": 4.054929733276367, + "learning_rate": 7.523600862127305e-06, + "loss": 1.0208, + "step": 49734 + }, + { + "epoch": 0.6217155428885722, + "grad_norm": 0.8540615439414978, + "learning_rate": 7.522755365079876e-06, + "loss": 0.0204, + "step": 49736 + }, + { + "epoch": 0.6217405435135879, + "grad_norm": 2.880805253982544, + "learning_rate": 7.521909886898791e-06, + "loss": 1.3593, + "step": 49738 + }, + { + "epoch": 0.6217655441386034, + "grad_norm": 2.0822837352752686, + "learning_rate": 7.52106442759049e-06, + "loss": 0.1659, + "step": 49740 + }, + { + "epoch": 0.6217905447636191, + "grad_norm": 2.5145485401153564, + "learning_rate": 7.5202189871614165e-06, + "loss": 1.0075, + "step": 49742 + }, + { + "epoch": 0.6218155453886347, + "grad_norm": 4.852659225463867, + "learning_rate": 7.519373565618e-06, + "loss": 1.1481, + "step": 49744 + }, + { + "epoch": 0.6218405460136504, + "grad_norm": 3.2873270511627197, + "learning_rate": 7.518528162966684e-06, + "loss": 1.7362, + "step": 49746 + }, + { + "epoch": 0.621865546638666, + "grad_norm": 2.529947280883789, + "learning_rate": 7.5176827792139075e-06, + "loss": 0.8689, + "step": 49748 + }, + { + "epoch": 0.6218905472636815, + "grad_norm": 4.671329021453857, + "learning_rate": 7.516837414366107e-06, + "loss": 1.6177, + "step": 49750 + }, + { + "epoch": 0.6219155478886972, + "grad_norm": 3.195671558380127, + "learning_rate": 7.5159920684297194e-06, + "loss": 0.7505, + "step": 49752 + }, + { + "epoch": 0.6219405485137128, + "grad_norm": 5.745124340057373, + "learning_rate": 7.5151467414111865e-06, + "loss": 0.1451, + "step": 49754 + }, + { + "epoch": 0.6219655491387285, + "grad_norm": 4.110403060913086, + "learning_rate": 7.51430143331695e-06, + "loss": 1.1946, + "step": 49756 + }, + { + "epoch": 0.6219905497637441, + "grad_norm": 1.140174150466919, + "learning_rate": 7.513456144153438e-06, + "loss": 0.0586, + "step": 49758 + }, + { + "epoch": 0.6220155503887597, + "grad_norm": 1.2222648859024048, + "learning_rate": 7.512610873927092e-06, + "loss": 0.1419, + "step": 49760 + }, + { + "epoch": 0.6220405510137753, + "grad_norm": 3.175485372543335, + "learning_rate": 7.511765622644349e-06, + "loss": 1.5325, + "step": 49762 + }, + { + "epoch": 0.622065551638791, + "grad_norm": 5.877408981323242, + "learning_rate": 7.510920390311647e-06, + "loss": 1.2549, + "step": 49764 + }, + { + "epoch": 0.6220905522638066, + "grad_norm": 0.170615553855896, + "learning_rate": 7.510075176935424e-06, + "loss": 0.4587, + "step": 49766 + }, + { + "epoch": 0.6221155528888223, + "grad_norm": 4.64154577255249, + "learning_rate": 7.509229982522122e-06, + "loss": 0.2842, + "step": 49768 + }, + { + "epoch": 0.6221405535138378, + "grad_norm": 0.7808603644371033, + "learning_rate": 7.508384807078166e-06, + "loss": 0.5959, + "step": 49770 + }, + { + "epoch": 0.6221655541388534, + "grad_norm": 1.9952571392059326, + "learning_rate": 7.507539650609999e-06, + "loss": 0.5877, + "step": 49772 + }, + { + "epoch": 0.6221905547638691, + "grad_norm": 3.5281028747558594, + "learning_rate": 7.50669451312406e-06, + "loss": 0.8672, + "step": 49774 + }, + { + "epoch": 0.6222155553888847, + "grad_norm": 0.3216113746166229, + "learning_rate": 7.5058493946267786e-06, + "loss": 0.4705, + "step": 49776 + }, + { + "epoch": 0.6222405560139004, + "grad_norm": 5.642317295074463, + "learning_rate": 7.505004295124598e-06, + "loss": 1.6337, + "step": 49778 + }, + { + "epoch": 0.6222655566389159, + "grad_norm": 5.855889320373535, + "learning_rate": 7.504159214623953e-06, + "loss": 0.7114, + "step": 49780 + }, + { + "epoch": 0.6222905572639316, + "grad_norm": 3.997699499130249, + "learning_rate": 7.5033141531312814e-06, + "loss": 2.2667, + "step": 49782 + }, + { + "epoch": 0.6223155578889472, + "grad_norm": 2.0640103816986084, + "learning_rate": 7.502469110653012e-06, + "loss": 0.3707, + "step": 49784 + }, + { + "epoch": 0.6223405585139629, + "grad_norm": 4.96212100982666, + "learning_rate": 7.501624087195586e-06, + "loss": 1.5357, + "step": 49786 + }, + { + "epoch": 0.6223655591389785, + "grad_norm": 3.4459574222564697, + "learning_rate": 7.500779082765434e-06, + "loss": 0.8163, + "step": 49788 + }, + { + "epoch": 0.622390559763994, + "grad_norm": 3.5881259441375732, + "learning_rate": 7.499934097368998e-06, + "loss": 0.8902, + "step": 49790 + }, + { + "epoch": 0.6224155603890097, + "grad_norm": 4.754479885101318, + "learning_rate": 7.49908913101271e-06, + "loss": 0.9431, + "step": 49792 + }, + { + "epoch": 0.6224405610140253, + "grad_norm": 4.814883708953857, + "learning_rate": 7.49824418370301e-06, + "loss": 0.5674, + "step": 49794 + }, + { + "epoch": 0.622465561639041, + "grad_norm": 0.08642425388097763, + "learning_rate": 7.4973992554463245e-06, + "loss": 0.7703, + "step": 49796 + }, + { + "epoch": 0.6224905622640566, + "grad_norm": 2.1198556423187256, + "learning_rate": 7.496554346249092e-06, + "loss": 0.1426, + "step": 49798 + }, + { + "epoch": 0.6225155628890722, + "grad_norm": 2.685673713684082, + "learning_rate": 7.495709456117746e-06, + "loss": 1.0107, + "step": 49800 + }, + { + "epoch": 0.6225405635140878, + "grad_norm": 0.6172419190406799, + "learning_rate": 7.4948645850587246e-06, + "loss": 0.0072, + "step": 49802 + }, + { + "epoch": 0.6225655641391035, + "grad_norm": 4.236760139465332, + "learning_rate": 7.494019733078459e-06, + "loss": 0.8121, + "step": 49804 + }, + { + "epoch": 0.6225905647641191, + "grad_norm": 2.444044589996338, + "learning_rate": 7.4931749001833866e-06, + "loss": 0.9196, + "step": 49806 + }, + { + "epoch": 0.6226155653891348, + "grad_norm": 0.9621381163597107, + "learning_rate": 7.492330086379942e-06, + "loss": 0.654, + "step": 49808 + }, + { + "epoch": 0.6226405660141503, + "grad_norm": 3.8250370025634766, + "learning_rate": 7.491485291674554e-06, + "loss": 1.0623, + "step": 49810 + }, + { + "epoch": 0.622665566639166, + "grad_norm": 0.0007293270318768919, + "learning_rate": 7.49064051607366e-06, + "loss": 0.9853, + "step": 49812 + }, + { + "epoch": 0.6226905672641816, + "grad_norm": 2.1073825359344482, + "learning_rate": 7.489795759583689e-06, + "loss": 0.3202, + "step": 49814 + }, + { + "epoch": 0.6227155678891972, + "grad_norm": 2.0702176094055176, + "learning_rate": 7.488951022211082e-06, + "loss": 0.3492, + "step": 49816 + }, + { + "epoch": 0.6227405685142129, + "grad_norm": 2.777880907058716, + "learning_rate": 7.488106303962268e-06, + "loss": 0.5004, + "step": 49818 + }, + { + "epoch": 0.6227655691392284, + "grad_norm": 2.8511853218078613, + "learning_rate": 7.487261604843685e-06, + "loss": 1.9422, + "step": 49820 + }, + { + "epoch": 0.6227905697642441, + "grad_norm": 5.311609268188477, + "learning_rate": 7.4864169248617595e-06, + "loss": 2.2292, + "step": 49822 + }, + { + "epoch": 0.6228155703892597, + "grad_norm": 1.9774727821350098, + "learning_rate": 7.485572264022927e-06, + "loss": 0.9959, + "step": 49824 + }, + { + "epoch": 0.6228405710142754, + "grad_norm": 3.4089291095733643, + "learning_rate": 7.484727622333618e-06, + "loss": 1.0256, + "step": 49826 + }, + { + "epoch": 0.622865571639291, + "grad_norm": 2.922719955444336, + "learning_rate": 7.483882999800269e-06, + "loss": 0.4124, + "step": 49828 + }, + { + "epoch": 0.6228905722643066, + "grad_norm": 2.0486695766448975, + "learning_rate": 7.483038396429312e-06, + "loss": 1.318, + "step": 49830 + }, + { + "epoch": 0.6229155728893222, + "grad_norm": 5.090958595275879, + "learning_rate": 7.482193812227179e-06, + "loss": 0.3751, + "step": 49832 + }, + { + "epoch": 0.6229405735143378, + "grad_norm": 3.921083688735962, + "learning_rate": 7.481349247200307e-06, + "loss": 0.6187, + "step": 49834 + }, + { + "epoch": 0.6229655741393535, + "grad_norm": 3.5878164768218994, + "learning_rate": 7.480504701355117e-06, + "loss": 2.0581, + "step": 49836 + }, + { + "epoch": 0.6229905747643691, + "grad_norm": 3.8438143730163574, + "learning_rate": 7.479660174698044e-06, + "loss": 0.8253, + "step": 49838 + }, + { + "epoch": 0.6230155753893847, + "grad_norm": 4.038361549377441, + "learning_rate": 7.478815667235527e-06, + "loss": 1.1243, + "step": 49840 + }, + { + "epoch": 0.6230405760144003, + "grad_norm": 6.242705821990967, + "learning_rate": 7.477971178973993e-06, + "loss": 1.534, + "step": 49842 + }, + { + "epoch": 0.623065576639416, + "grad_norm": 0.003431618446484208, + "learning_rate": 7.477126709919873e-06, + "loss": 0.226, + "step": 49844 + }, + { + "epoch": 0.6230905772644316, + "grad_norm": 2.3643696308135986, + "learning_rate": 7.476282260079604e-06, + "loss": 0.4799, + "step": 49846 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 3.699314594268799, + "learning_rate": 7.475437829459609e-06, + "loss": 1.6685, + "step": 49848 + }, + { + "epoch": 0.6231405785144628, + "grad_norm": 3.2010140419006348, + "learning_rate": 7.4745934180663185e-06, + "loss": 0.9181, + "step": 49850 + }, + { + "epoch": 0.6231655791394785, + "grad_norm": 0.004534767009317875, + "learning_rate": 7.473749025906171e-06, + "loss": 0.5011, + "step": 49852 + }, + { + "epoch": 0.6231905797644941, + "grad_norm": 6.967874050140381, + "learning_rate": 7.472904652985593e-06, + "loss": 1.4649, + "step": 49854 + }, + { + "epoch": 0.6232155803895097, + "grad_norm": 2.284256935119629, + "learning_rate": 7.4720602993110145e-06, + "loss": 0.8089, + "step": 49856 + }, + { + "epoch": 0.6232405810145254, + "grad_norm": 1.742806077003479, + "learning_rate": 7.471215964888869e-06, + "loss": 0.4227, + "step": 49858 + }, + { + "epoch": 0.6232655816395409, + "grad_norm": 2.867133378982544, + "learning_rate": 7.470371649725589e-06, + "loss": 1.3458, + "step": 49860 + }, + { + "epoch": 0.6232905822645566, + "grad_norm": 5.815553188323975, + "learning_rate": 7.469527353827595e-06, + "loss": 2.016, + "step": 49862 + }, + { + "epoch": 0.6233155828895722, + "grad_norm": 4.896427154541016, + "learning_rate": 7.468683077201324e-06, + "loss": 1.4744, + "step": 49864 + }, + { + "epoch": 0.6233405835145879, + "grad_norm": 3.334097385406494, + "learning_rate": 7.467838819853206e-06, + "loss": 1.3654, + "step": 49866 + }, + { + "epoch": 0.6233655841396035, + "grad_norm": 1.9423680305480957, + "learning_rate": 7.466994581789668e-06, + "loss": 0.4038, + "step": 49868 + }, + { + "epoch": 0.6233905847646191, + "grad_norm": 4.46594762802124, + "learning_rate": 7.466150363017142e-06, + "loss": 1.7324, + "step": 49870 + }, + { + "epoch": 0.6234155853896347, + "grad_norm": 4.988265037536621, + "learning_rate": 7.465306163542061e-06, + "loss": 0.357, + "step": 49872 + }, + { + "epoch": 0.6234405860146504, + "grad_norm": 3.170384168624878, + "learning_rate": 7.464461983370843e-06, + "loss": 0.4227, + "step": 49874 + }, + { + "epoch": 0.623465586639666, + "grad_norm": 0.0012424758169800043, + "learning_rate": 7.463617822509925e-06, + "loss": 0.7001, + "step": 49876 + }, + { + "epoch": 0.6234905872646817, + "grad_norm": 7.0832929611206055, + "learning_rate": 7.4627736809657355e-06, + "loss": 1.9509, + "step": 49878 + }, + { + "epoch": 0.6235155878896972, + "grad_norm": 3.718315839767456, + "learning_rate": 7.461929558744705e-06, + "loss": 1.3687, + "step": 49880 + }, + { + "epoch": 0.6235405885147128, + "grad_norm": 3.724067449569702, + "learning_rate": 7.461085455853258e-06, + "loss": 1.1738, + "step": 49882 + }, + { + "epoch": 0.6235655891397285, + "grad_norm": 0.0007731092046014965, + "learning_rate": 7.460241372297825e-06, + "loss": 0.1883, + "step": 49884 + }, + { + "epoch": 0.6235905897647441, + "grad_norm": 4.0445146560668945, + "learning_rate": 7.459397308084839e-06, + "loss": 0.5293, + "step": 49886 + }, + { + "epoch": 0.6236155903897598, + "grad_norm": 6.957147121429443, + "learning_rate": 7.458553263220718e-06, + "loss": 1.4018, + "step": 49888 + }, + { + "epoch": 0.6236405910147753, + "grad_norm": 2.8052878379821777, + "learning_rate": 7.457709237711898e-06, + "loss": 1.0824, + "step": 49890 + }, + { + "epoch": 0.623665591639791, + "grad_norm": 2.467561721801758, + "learning_rate": 7.456865231564805e-06, + "loss": 0.5685, + "step": 49892 + }, + { + "epoch": 0.6236905922648066, + "grad_norm": 2.7168638706207275, + "learning_rate": 7.456021244785867e-06, + "loss": 1.3419, + "step": 49894 + }, + { + "epoch": 0.6237155928898223, + "grad_norm": 1.0924233198165894, + "learning_rate": 7.455177277381511e-06, + "loss": 0.3087, + "step": 49896 + }, + { + "epoch": 0.6237405935148379, + "grad_norm": 3.231517791748047, + "learning_rate": 7.454333329358169e-06, + "loss": 0.5561, + "step": 49898 + }, + { + "epoch": 0.6237655941398534, + "grad_norm": 1.9616822004318237, + "learning_rate": 7.453489400722258e-06, + "loss": 0.3689, + "step": 49900 + }, + { + "epoch": 0.6237905947648691, + "grad_norm": 4.96457052230835, + "learning_rate": 7.452645491480214e-06, + "loss": 2.0345, + "step": 49902 + }, + { + "epoch": 0.6238155953898847, + "grad_norm": 0.0013615230564028025, + "learning_rate": 7.451801601638462e-06, + "loss": 1.0896, + "step": 49904 + }, + { + "epoch": 0.6238405960149004, + "grad_norm": 2.891932249069214, + "learning_rate": 7.450957731203427e-06, + "loss": 0.7225, + "step": 49906 + }, + { + "epoch": 0.623865596639916, + "grad_norm": 2.757516384124756, + "learning_rate": 7.45011388018154e-06, + "loss": 0.4258, + "step": 49908 + }, + { + "epoch": 0.6238905972649316, + "grad_norm": 2.2104294300079346, + "learning_rate": 7.449270048579224e-06, + "loss": 1.8562, + "step": 49910 + }, + { + "epoch": 0.6239155978899472, + "grad_norm": 4.218964099884033, + "learning_rate": 7.448426236402908e-06, + "loss": 0.9574, + "step": 49912 + }, + { + "epoch": 0.6239405985149629, + "grad_norm": 4.221479415893555, + "learning_rate": 7.447582443659016e-06, + "loss": 0.6388, + "step": 49914 + }, + { + "epoch": 0.6239655991399785, + "grad_norm": 0.4509198069572449, + "learning_rate": 7.446738670353974e-06, + "loss": 0.7806, + "step": 49916 + }, + { + "epoch": 0.6239905997649942, + "grad_norm": 0.003535625757649541, + "learning_rate": 7.44589491649421e-06, + "loss": 0.6974, + "step": 49918 + }, + { + "epoch": 0.6240156003900097, + "grad_norm": 3.300764799118042, + "learning_rate": 7.445051182086149e-06, + "loss": 0.2347, + "step": 49920 + }, + { + "epoch": 0.6240406010150253, + "grad_norm": 2.340792655944824, + "learning_rate": 7.444207467136217e-06, + "loss": 0.7349, + "step": 49922 + }, + { + "epoch": 0.624065601640041, + "grad_norm": 3.149285078048706, + "learning_rate": 7.443363771650842e-06, + "loss": 1.4144, + "step": 49924 + }, + { + "epoch": 0.6240906022650566, + "grad_norm": 0.13903863728046417, + "learning_rate": 7.442520095636444e-06, + "loss": 0.739, + "step": 49926 + }, + { + "epoch": 0.6241156028900723, + "grad_norm": 0.0007414492429234087, + "learning_rate": 7.441676439099452e-06, + "loss": 0.2937, + "step": 49928 + }, + { + "epoch": 0.6241406035150878, + "grad_norm": 3.440324068069458, + "learning_rate": 7.44083280204629e-06, + "loss": 0.7588, + "step": 49930 + }, + { + "epoch": 0.6241656041401035, + "grad_norm": 3.9450011253356934, + "learning_rate": 7.439989184483383e-06, + "loss": 0.2839, + "step": 49932 + }, + { + "epoch": 0.6241906047651191, + "grad_norm": 0.0029913652688264847, + "learning_rate": 7.439145586417158e-06, + "loss": 0.1055, + "step": 49934 + }, + { + "epoch": 0.6242156053901348, + "grad_norm": 0.0011329872068017721, + "learning_rate": 7.438302007854037e-06, + "loss": 0.1983, + "step": 49936 + }, + { + "epoch": 0.6242406060151504, + "grad_norm": 6.447234630584717, + "learning_rate": 7.437458448800447e-06, + "loss": 1.3093, + "step": 49938 + }, + { + "epoch": 0.624265606640166, + "grad_norm": 1.052714228630066, + "learning_rate": 7.43661490926281e-06, + "loss": 0.0334, + "step": 49940 + }, + { + "epoch": 0.6242906072651816, + "grad_norm": 4.278343677520752, + "learning_rate": 7.43577138924755e-06, + "loss": 1.0231, + "step": 49942 + }, + { + "epoch": 0.6243156078901972, + "grad_norm": 2.5597782135009766, + "learning_rate": 7.434927888761094e-06, + "loss": 0.8807, + "step": 49944 + }, + { + "epoch": 0.6243406085152129, + "grad_norm": 2.527620553970337, + "learning_rate": 7.434084407809865e-06, + "loss": 0.8513, + "step": 49946 + }, + { + "epoch": 0.6243656091402285, + "grad_norm": 3.4562747478485107, + "learning_rate": 7.433240946400284e-06, + "loss": 1.8306, + "step": 49948 + }, + { + "epoch": 0.6243906097652441, + "grad_norm": 2.2662975788116455, + "learning_rate": 7.432397504538782e-06, + "loss": 0.7904, + "step": 49950 + }, + { + "epoch": 0.6244156103902597, + "grad_norm": 3.5591137409210205, + "learning_rate": 7.431554082231773e-06, + "loss": 0.9452, + "step": 49952 + }, + { + "epoch": 0.6244406110152754, + "grad_norm": 0.000977414776571095, + "learning_rate": 7.430710679485686e-06, + "loss": 0.581, + "step": 49954 + }, + { + "epoch": 0.624465611640291, + "grad_norm": 0.0008071094052866101, + "learning_rate": 7.429867296306943e-06, + "loss": 0.2136, + "step": 49956 + }, + { + "epoch": 0.6244906122653067, + "grad_norm": 0.045003652572631836, + "learning_rate": 7.429023932701966e-06, + "loss": 0.5037, + "step": 49958 + }, + { + "epoch": 0.6245156128903222, + "grad_norm": 6.5562825202941895, + "learning_rate": 7.428180588677181e-06, + "loss": 1.3792, + "step": 49960 + }, + { + "epoch": 0.6245406135153379, + "grad_norm": 3.0477864742279053, + "learning_rate": 7.427337264239009e-06, + "loss": 0.2805, + "step": 49962 + }, + { + "epoch": 0.6245656141403535, + "grad_norm": 3.732435464859009, + "learning_rate": 7.4264939593938735e-06, + "loss": 1.2452, + "step": 49964 + }, + { + "epoch": 0.6245906147653691, + "grad_norm": 2.515451431274414, + "learning_rate": 7.425650674148195e-06, + "loss": 0.8754, + "step": 49966 + }, + { + "epoch": 0.6246156153903848, + "grad_norm": 3.3114945888519287, + "learning_rate": 7.424807408508395e-06, + "loss": 1.0788, + "step": 49968 + }, + { + "epoch": 0.6246406160154003, + "grad_norm": 3.4928464889526367, + "learning_rate": 7.4239641624808985e-06, + "loss": 1.0814, + "step": 49970 + }, + { + "epoch": 0.624665616640416, + "grad_norm": 0.5359878540039062, + "learning_rate": 7.423120936072128e-06, + "loss": 0.5298, + "step": 49972 + }, + { + "epoch": 0.6246906172654316, + "grad_norm": 3.0071005821228027, + "learning_rate": 7.422277729288503e-06, + "loss": 0.5803, + "step": 49974 + }, + { + "epoch": 0.6247156178904473, + "grad_norm": 3.3156039714813232, + "learning_rate": 7.4214345421364485e-06, + "loss": 1.2577, + "step": 49976 + }, + { + "epoch": 0.6247406185154629, + "grad_norm": 0.005038293544203043, + "learning_rate": 7.420591374622381e-06, + "loss": 0.2418, + "step": 49978 + }, + { + "epoch": 0.6247656191404785, + "grad_norm": 5.051664352416992, + "learning_rate": 7.4197482267527265e-06, + "loss": 1.2551, + "step": 49980 + }, + { + "epoch": 0.6247906197654941, + "grad_norm": 0.011933285742998123, + "learning_rate": 7.418905098533904e-06, + "loss": 0.9798, + "step": 49982 + }, + { + "epoch": 0.6248156203905098, + "grad_norm": 4.621438026428223, + "learning_rate": 7.418061989972335e-06, + "loss": 0.6962, + "step": 49984 + }, + { + "epoch": 0.6248406210155254, + "grad_norm": 5.833001136779785, + "learning_rate": 7.417218901074441e-06, + "loss": 1.0031, + "step": 49986 + }, + { + "epoch": 0.624865621640541, + "grad_norm": 0.0015433711232617497, + "learning_rate": 7.4163758318466416e-06, + "loss": 0.7029, + "step": 49988 + }, + { + "epoch": 0.6248906222655566, + "grad_norm": 1.774115800857544, + "learning_rate": 7.415532782295361e-06, + "loss": 0.4223, + "step": 49990 + }, + { + "epoch": 0.6249156228905722, + "grad_norm": 3.2275915145874023, + "learning_rate": 7.414689752427016e-06, + "loss": 1.0476, + "step": 49992 + }, + { + "epoch": 0.6249406235155879, + "grad_norm": 3.4537079334259033, + "learning_rate": 7.413846742248028e-06, + "loss": 1.2679, + "step": 49994 + }, + { + "epoch": 0.6249656241406035, + "grad_norm": 1.361707329750061, + "learning_rate": 7.413003751764818e-06, + "loss": 1.0699, + "step": 49996 + }, + { + "epoch": 0.6249906247656192, + "grad_norm": 2.25468111038208, + "learning_rate": 7.412160780983804e-06, + "loss": 0.4953, + "step": 49998 + }, + { + "epoch": 0.6250156253906348, + "grad_norm": 0.24595774710178375, + "learning_rate": 7.411317829911408e-06, + "loss": 0.2822, + "step": 50000 + }, + { + "epoch": 0.6250406260156504, + "grad_norm": 5.888141632080078, + "learning_rate": 7.41047489855405e-06, + "loss": 0.2856, + "step": 50002 + }, + { + "epoch": 0.625065626640666, + "grad_norm": 1.975120186805725, + "learning_rate": 7.409631986918151e-06, + "loss": 1.4416, + "step": 50004 + }, + { + "epoch": 0.6250906272656817, + "grad_norm": 2.4115452766418457, + "learning_rate": 7.408789095010127e-06, + "loss": 0.1692, + "step": 50006 + }, + { + "epoch": 0.6251156278906973, + "grad_norm": 0.0009537639562040567, + "learning_rate": 7.407946222836398e-06, + "loss": 0.0306, + "step": 50008 + }, + { + "epoch": 0.625140628515713, + "grad_norm": 1.9149435758590698, + "learning_rate": 7.407103370403383e-06, + "loss": 1.0799, + "step": 50010 + }, + { + "epoch": 0.6251656291407285, + "grad_norm": 2.1547975540161133, + "learning_rate": 7.406260537717503e-06, + "loss": 1.6396, + "step": 50012 + }, + { + "epoch": 0.6251906297657441, + "grad_norm": 2.8140859603881836, + "learning_rate": 7.4054177247851765e-06, + "loss": 1.2227, + "step": 50014 + }, + { + "epoch": 0.6252156303907598, + "grad_norm": 3.2434961795806885, + "learning_rate": 7.404574931612823e-06, + "loss": 1.6316, + "step": 50016 + }, + { + "epoch": 0.6252406310157754, + "grad_norm": 4.819069862365723, + "learning_rate": 7.403732158206859e-06, + "loss": 1.0319, + "step": 50018 + }, + { + "epoch": 0.6252656316407911, + "grad_norm": 3.987204074859619, + "learning_rate": 7.402889404573703e-06, + "loss": 0.2722, + "step": 50020 + }, + { + "epoch": 0.6252906322658066, + "grad_norm": 5.580320835113525, + "learning_rate": 7.4020466707197736e-06, + "loss": 0.5676, + "step": 50022 + }, + { + "epoch": 0.6253156328908223, + "grad_norm": 7.320314407348633, + "learning_rate": 7.401203956651489e-06, + "loss": 1.2368, + "step": 50024 + }, + { + "epoch": 0.6253406335158379, + "grad_norm": 3.6040077209472656, + "learning_rate": 7.400361262375268e-06, + "loss": 0.4605, + "step": 50026 + }, + { + "epoch": 0.6253656341408536, + "grad_norm": 2.080916404724121, + "learning_rate": 7.399518587897529e-06, + "loss": 0.249, + "step": 50028 + }, + { + "epoch": 0.6253906347658692, + "grad_norm": 5.695312976837158, + "learning_rate": 7.3986759332246906e-06, + "loss": 1.6545, + "step": 50030 + }, + { + "epoch": 0.6254156353908847, + "grad_norm": 9.04936695098877, + "learning_rate": 7.397833298363164e-06, + "loss": 1.4752, + "step": 50032 + }, + { + "epoch": 0.6254406360159004, + "grad_norm": 4.641525745391846, + "learning_rate": 7.396990683319374e-06, + "loss": 1.3942, + "step": 50034 + }, + { + "epoch": 0.625465636640916, + "grad_norm": 0.0037785167805850506, + "learning_rate": 7.396148088099732e-06, + "loss": 0.0209, + "step": 50036 + }, + { + "epoch": 0.6254906372659317, + "grad_norm": 5.73002290725708, + "learning_rate": 7.39530551271066e-06, + "loss": 1.2248, + "step": 50038 + }, + { + "epoch": 0.6255156378909473, + "grad_norm": 2.7958948612213135, + "learning_rate": 7.3944629571585725e-06, + "loss": 0.0725, + "step": 50040 + }, + { + "epoch": 0.6255406385159629, + "grad_norm": 2.8737375736236572, + "learning_rate": 7.39362042144989e-06, + "loss": 1.1473, + "step": 50042 + }, + { + "epoch": 0.6255656391409785, + "grad_norm": 3.569835901260376, + "learning_rate": 7.392777905591022e-06, + "loss": 0.772, + "step": 50044 + }, + { + "epoch": 0.6255906397659942, + "grad_norm": 3.885575532913208, + "learning_rate": 7.391935409588389e-06, + "loss": 0.402, + "step": 50046 + }, + { + "epoch": 0.6256156403910098, + "grad_norm": 0.459515780210495, + "learning_rate": 7.391092933448407e-06, + "loss": 0.3464, + "step": 50048 + }, + { + "epoch": 0.6256406410160255, + "grad_norm": 2.973079204559326, + "learning_rate": 7.3902504771774915e-06, + "loss": 1.9092, + "step": 50050 + }, + { + "epoch": 0.625665641641041, + "grad_norm": 0.0055289012379944324, + "learning_rate": 7.389408040782062e-06, + "loss": 0.0007, + "step": 50052 + }, + { + "epoch": 0.6256906422660566, + "grad_norm": 2.9350533485412598, + "learning_rate": 7.388565624268529e-06, + "loss": 0.856, + "step": 50054 + }, + { + "epoch": 0.6257156428910723, + "grad_norm": 0.002619032980874181, + "learning_rate": 7.387723227643314e-06, + "loss": 0.0, + "step": 50056 + }, + { + "epoch": 0.6257406435160879, + "grad_norm": 2.518754482269287, + "learning_rate": 7.386880850912827e-06, + "loss": 0.5842, + "step": 50058 + }, + { + "epoch": 0.6257656441411036, + "grad_norm": 4.742024898529053, + "learning_rate": 7.386038494083487e-06, + "loss": 1.4641, + "step": 50060 + }, + { + "epoch": 0.6257906447661191, + "grad_norm": 3.9996204376220703, + "learning_rate": 7.385196157161707e-06, + "loss": 0.5904, + "step": 50062 + }, + { + "epoch": 0.6258156453911348, + "grad_norm": 9.42736530303955, + "learning_rate": 7.384353840153904e-06, + "loss": 1.336, + "step": 50064 + }, + { + "epoch": 0.6258406460161504, + "grad_norm": 0.0011479470413178205, + "learning_rate": 7.383511543066493e-06, + "loss": 0.5151, + "step": 50066 + }, + { + "epoch": 0.6258656466411661, + "grad_norm": 1.6042206287384033, + "learning_rate": 7.3826692659058885e-06, + "loss": 0.7098, + "step": 50068 + }, + { + "epoch": 0.6258906472661817, + "grad_norm": 0.2135602831840515, + "learning_rate": 7.381827008678504e-06, + "loss": 0.171, + "step": 50070 + }, + { + "epoch": 0.6259156478911972, + "grad_norm": 0.08325817435979843, + "learning_rate": 7.380984771390753e-06, + "loss": 1.1375, + "step": 50072 + }, + { + "epoch": 0.6259406485162129, + "grad_norm": 3.833946466445923, + "learning_rate": 7.380142554049053e-06, + "loss": 0.543, + "step": 50074 + }, + { + "epoch": 0.6259656491412285, + "grad_norm": 2.5408215522766113, + "learning_rate": 7.379300356659817e-06, + "loss": 1.256, + "step": 50076 + }, + { + "epoch": 0.6259906497662442, + "grad_norm": 4.517960071563721, + "learning_rate": 7.378458179229459e-06, + "loss": 1.8868, + "step": 50078 + }, + { + "epoch": 0.6260156503912598, + "grad_norm": 2.1802358627319336, + "learning_rate": 7.377616021764392e-06, + "loss": 1.4494, + "step": 50080 + }, + { + "epoch": 0.6260406510162754, + "grad_norm": 0.0009233180317096412, + "learning_rate": 7.3767738842710336e-06, + "loss": 0.6014, + "step": 50082 + }, + { + "epoch": 0.626065651641291, + "grad_norm": 2.5487756729125977, + "learning_rate": 7.375931766755791e-06, + "loss": 0.4216, + "step": 50084 + }, + { + "epoch": 0.6260906522663067, + "grad_norm": 4.463354110717773, + "learning_rate": 7.375089669225082e-06, + "loss": 1.1173, + "step": 50086 + }, + { + "epoch": 0.6261156528913223, + "grad_norm": 4.283017635345459, + "learning_rate": 7.374247591685317e-06, + "loss": 2.0019, + "step": 50088 + }, + { + "epoch": 0.626140653516338, + "grad_norm": 0.9107441306114197, + "learning_rate": 7.3734055341429135e-06, + "loss": 0.4792, + "step": 50090 + }, + { + "epoch": 0.6261656541413535, + "grad_norm": 4.232769966125488, + "learning_rate": 7.372563496604281e-06, + "loss": 0.6325, + "step": 50092 + }, + { + "epoch": 0.6261906547663691, + "grad_norm": 3.802626848220825, + "learning_rate": 7.3717214790758355e-06, + "loss": 0.9724, + "step": 50094 + }, + { + "epoch": 0.6262156553913848, + "grad_norm": 3.2754740715026855, + "learning_rate": 7.370879481563986e-06, + "loss": 1.6223, + "step": 50096 + }, + { + "epoch": 0.6262406560164004, + "grad_norm": 3.189382553100586, + "learning_rate": 7.370037504075145e-06, + "loss": 0.449, + "step": 50098 + }, + { + "epoch": 0.6262656566414161, + "grad_norm": 4.645147323608398, + "learning_rate": 7.369195546615728e-06, + "loss": 1.0198, + "step": 50100 + }, + { + "epoch": 0.6262906572664316, + "grad_norm": 3.801746368408203, + "learning_rate": 7.3683536091921446e-06, + "loss": 1.7793, + "step": 50102 + }, + { + "epoch": 0.6263156578914473, + "grad_norm": 0.002060693921521306, + "learning_rate": 7.367511691810809e-06, + "loss": 0.0001, + "step": 50104 + }, + { + "epoch": 0.6263406585164629, + "grad_norm": 0.014434066601097584, + "learning_rate": 7.366669794478129e-06, + "loss": 0.6104, + "step": 50106 + }, + { + "epoch": 0.6263656591414786, + "grad_norm": 1.3320293426513672, + "learning_rate": 7.365827917200527e-06, + "loss": 0.3535, + "step": 50108 + }, + { + "epoch": 0.6263906597664942, + "grad_norm": 0.0036819814704358578, + "learning_rate": 7.3649860599844015e-06, + "loss": 1.0446, + "step": 50110 + }, + { + "epoch": 0.6264156603915098, + "grad_norm": 0.0006191262509673834, + "learning_rate": 7.364144222836171e-06, + "loss": 0.7439, + "step": 50112 + }, + { + "epoch": 0.6264406610165254, + "grad_norm": 3.49442720413208, + "learning_rate": 7.363302405762244e-06, + "loss": 0.9207, + "step": 50114 + }, + { + "epoch": 0.626465661641541, + "grad_norm": 2.412846326828003, + "learning_rate": 7.362460608769033e-06, + "loss": 0.0962, + "step": 50116 + }, + { + "epoch": 0.6264906622665567, + "grad_norm": 2.343459367752075, + "learning_rate": 7.361618831862948e-06, + "loss": 1.0068, + "step": 50118 + }, + { + "epoch": 0.6265156628915723, + "grad_norm": 0.001388329896144569, + "learning_rate": 7.360777075050407e-06, + "loss": 1.1446, + "step": 50120 + }, + { + "epoch": 0.6265406635165879, + "grad_norm": 0.0007977125351317227, + "learning_rate": 7.359935338337809e-06, + "loss": 0.5658, + "step": 50122 + }, + { + "epoch": 0.6265656641416035, + "grad_norm": 3.0204083919525146, + "learning_rate": 7.359093621731572e-06, + "loss": 0.6685, + "step": 50124 + }, + { + "epoch": 0.6265906647666192, + "grad_norm": 6.794968605041504, + "learning_rate": 7.358251925238102e-06, + "loss": 0.4658, + "step": 50126 + }, + { + "epoch": 0.6266156653916348, + "grad_norm": 7.226635932922363, + "learning_rate": 7.357410248863814e-06, + "loss": 0.7153, + "step": 50128 + }, + { + "epoch": 0.6266406660166505, + "grad_norm": 4.059978485107422, + "learning_rate": 7.356568592615117e-06, + "loss": 0.8877, + "step": 50130 + }, + { + "epoch": 0.626665666641666, + "grad_norm": 3.2246429920196533, + "learning_rate": 7.355726956498415e-06, + "loss": 0.682, + "step": 50132 + }, + { + "epoch": 0.6266906672666817, + "grad_norm": 4.86124849319458, + "learning_rate": 7.354885340520129e-06, + "loss": 0.6263, + "step": 50134 + }, + { + "epoch": 0.6267156678916973, + "grad_norm": 3.179358720779419, + "learning_rate": 7.354043744686659e-06, + "loss": 2.2612, + "step": 50136 + }, + { + "epoch": 0.626740668516713, + "grad_norm": 4.3404459953308105, + "learning_rate": 7.353202169004417e-06, + "loss": 1.4851, + "step": 50138 + }, + { + "epoch": 0.6267656691417286, + "grad_norm": 3.2253623008728027, + "learning_rate": 7.352360613479814e-06, + "loss": 0.5518, + "step": 50140 + }, + { + "epoch": 0.6267906697667441, + "grad_norm": 4.310178756713867, + "learning_rate": 7.351519078119258e-06, + "loss": 0.5478, + "step": 50142 + }, + { + "epoch": 0.6268156703917598, + "grad_norm": 3.263822317123413, + "learning_rate": 7.350677562929156e-06, + "loss": 0.7252, + "step": 50144 + }, + { + "epoch": 0.6268406710167754, + "grad_norm": 3.1932249069213867, + "learning_rate": 7.349836067915924e-06, + "loss": 0.8691, + "step": 50146 + }, + { + "epoch": 0.6268656716417911, + "grad_norm": 4.132070541381836, + "learning_rate": 7.348994593085963e-06, + "loss": 1.3101, + "step": 50148 + }, + { + "epoch": 0.6268906722668067, + "grad_norm": 2.819857120513916, + "learning_rate": 7.348153138445684e-06, + "loss": 0.9187, + "step": 50150 + }, + { + "epoch": 0.6269156728918223, + "grad_norm": 2.9853267669677734, + "learning_rate": 7.347311704001495e-06, + "loss": 0.8747, + "step": 50152 + }, + { + "epoch": 0.6269406735168379, + "grad_norm": 2.7935965061187744, + "learning_rate": 7.346470289759806e-06, + "loss": 0.8579, + "step": 50154 + }, + { + "epoch": 0.6269656741418536, + "grad_norm": 3.6978044509887695, + "learning_rate": 7.345628895727022e-06, + "loss": 1.2702, + "step": 50156 + }, + { + "epoch": 0.6269906747668692, + "grad_norm": 0.008237859234213829, + "learning_rate": 7.344787521909554e-06, + "loss": 0.2783, + "step": 50158 + }, + { + "epoch": 0.6270156753918849, + "grad_norm": 3.3291268348693848, + "learning_rate": 7.343946168313814e-06, + "loss": 0.7454, + "step": 50160 + }, + { + "epoch": 0.6270406760169004, + "grad_norm": 0.0009411653154529631, + "learning_rate": 7.343104834946199e-06, + "loss": 0.712, + "step": 50162 + }, + { + "epoch": 0.627065676641916, + "grad_norm": 6.097038269042969, + "learning_rate": 7.342263521813122e-06, + "loss": 1.1332, + "step": 50164 + }, + { + "epoch": 0.6270906772669317, + "grad_norm": 2.6300392150878906, + "learning_rate": 7.341422228920991e-06, + "loss": 1.029, + "step": 50166 + }, + { + "epoch": 0.6271156778919473, + "grad_norm": 2.671173095703125, + "learning_rate": 7.340580956276209e-06, + "loss": 0.9479, + "step": 50168 + }, + { + "epoch": 0.627140678516963, + "grad_norm": 1.8678796291351318, + "learning_rate": 7.339739703885189e-06, + "loss": 0.469, + "step": 50170 + }, + { + "epoch": 0.6271656791419785, + "grad_norm": 2.6336469650268555, + "learning_rate": 7.338898471754341e-06, + "loss": 1.0104, + "step": 50172 + }, + { + "epoch": 0.6271906797669942, + "grad_norm": 2.644692897796631, + "learning_rate": 7.33805725989006e-06, + "loss": 0.8399, + "step": 50174 + }, + { + "epoch": 0.6272156803920098, + "grad_norm": 4.229374408721924, + "learning_rate": 7.33721606829876e-06, + "loss": 0.5025, + "step": 50176 + }, + { + "epoch": 0.6272406810170255, + "grad_norm": 3.7784016132354736, + "learning_rate": 7.336374896986844e-06, + "loss": 0.6921, + "step": 50178 + }, + { + "epoch": 0.6272656816420411, + "grad_norm": 2.798044204711914, + "learning_rate": 7.335533745960719e-06, + "loss": 0.6422, + "step": 50180 + }, + { + "epoch": 0.6272906822670566, + "grad_norm": 3.291508913040161, + "learning_rate": 7.334692615226794e-06, + "loss": 0.6007, + "step": 50182 + }, + { + "epoch": 0.6273156828920723, + "grad_norm": 3.221566915512085, + "learning_rate": 7.333851504791474e-06, + "loss": 0.1936, + "step": 50184 + }, + { + "epoch": 0.6273406835170879, + "grad_norm": 3.820084810256958, + "learning_rate": 7.333010414661167e-06, + "loss": 1.2221, + "step": 50186 + }, + { + "epoch": 0.6273656841421036, + "grad_norm": 4.2689595222473145, + "learning_rate": 7.332169344842271e-06, + "loss": 0.0743, + "step": 50188 + }, + { + "epoch": 0.6273906847671192, + "grad_norm": 3.081685781478882, + "learning_rate": 7.331328295341198e-06, + "loss": 0.6151, + "step": 50190 + }, + { + "epoch": 0.6274156853921348, + "grad_norm": 0.01621551252901554, + "learning_rate": 7.330487266164348e-06, + "loss": 0.2779, + "step": 50192 + }, + { + "epoch": 0.6274406860171504, + "grad_norm": 0.001137136248871684, + "learning_rate": 7.32964625731813e-06, + "loss": 0.3697, + "step": 50194 + }, + { + "epoch": 0.6274656866421661, + "grad_norm": 2.323716640472412, + "learning_rate": 7.3288052688089504e-06, + "loss": 1.013, + "step": 50196 + }, + { + "epoch": 0.6274906872671817, + "grad_norm": 2.088153839111328, + "learning_rate": 7.327964300643217e-06, + "loss": 0.4027, + "step": 50198 + }, + { + "epoch": 0.6275156878921974, + "grad_norm": 0.0016429839888587594, + "learning_rate": 7.327123352827323e-06, + "loss": 0.572, + "step": 50200 + }, + { + "epoch": 0.6275406885172129, + "grad_norm": 7.561900615692139, + "learning_rate": 7.3262824253676815e-06, + "loss": 1.7818, + "step": 50202 + }, + { + "epoch": 0.6275656891422285, + "grad_norm": 2.2621781826019287, + "learning_rate": 7.325441518270695e-06, + "loss": 0.3641, + "step": 50204 + }, + { + "epoch": 0.6275906897672442, + "grad_norm": 0.0010596484644338489, + "learning_rate": 7.324600631542764e-06, + "loss": 0.6221, + "step": 50206 + }, + { + "epoch": 0.6276156903922598, + "grad_norm": 6.1277265548706055, + "learning_rate": 7.323759765190299e-06, + "loss": 1.8854, + "step": 50208 + }, + { + "epoch": 0.6276406910172755, + "grad_norm": 3.7249596118927, + "learning_rate": 7.322918919219703e-06, + "loss": 1.6626, + "step": 50210 + }, + { + "epoch": 0.627665691642291, + "grad_norm": 2.577035665512085, + "learning_rate": 7.32207809363738e-06, + "loss": 0.6905, + "step": 50212 + }, + { + "epoch": 0.6276906922673067, + "grad_norm": 2.1786282062530518, + "learning_rate": 7.321237288449727e-06, + "loss": 0.1495, + "step": 50214 + }, + { + "epoch": 0.6277156928923223, + "grad_norm": 0.0012031329097226262, + "learning_rate": 7.320396503663154e-06, + "loss": 0.6898, + "step": 50216 + }, + { + "epoch": 0.627740693517338, + "grad_norm": 1.223284363746643, + "learning_rate": 7.319555739284059e-06, + "loss": 0.1054, + "step": 50218 + }, + { + "epoch": 0.6277656941423536, + "grad_norm": 2.472654104232788, + "learning_rate": 7.3187149953188505e-06, + "loss": 0.8853, + "step": 50220 + }, + { + "epoch": 0.6277906947673692, + "grad_norm": 4.571645259857178, + "learning_rate": 7.31787427177393e-06, + "loss": 1.0212, + "step": 50222 + }, + { + "epoch": 0.6278156953923848, + "grad_norm": 1.3799420595169067, + "learning_rate": 7.317033568655704e-06, + "loss": 0.1108, + "step": 50224 + }, + { + "epoch": 0.6278406960174004, + "grad_norm": 1.210472583770752, + "learning_rate": 7.316192885970566e-06, + "loss": 0.0382, + "step": 50226 + }, + { + "epoch": 0.6278656966424161, + "grad_norm": 3.453511953353882, + "learning_rate": 7.315352223724924e-06, + "loss": 1.1375, + "step": 50228 + }, + { + "epoch": 0.6278906972674317, + "grad_norm": 2.536329746246338, + "learning_rate": 7.314511581925176e-06, + "loss": 1.1414, + "step": 50230 + }, + { + "epoch": 0.6279156978924473, + "grad_norm": 0.002327150199562311, + "learning_rate": 7.313670960577732e-06, + "loss": 0.5955, + "step": 50232 + }, + { + "epoch": 0.6279406985174629, + "grad_norm": 5.384753227233887, + "learning_rate": 7.312830359688991e-06, + "loss": 1.0348, + "step": 50234 + }, + { + "epoch": 0.6279656991424786, + "grad_norm": 0.03706499934196472, + "learning_rate": 7.3119897792653525e-06, + "loss": 0.7574, + "step": 50236 + }, + { + "epoch": 0.6279906997674942, + "grad_norm": 4.3195929527282715, + "learning_rate": 7.311149219313225e-06, + "loss": 1.3036, + "step": 50238 + }, + { + "epoch": 0.6280157003925099, + "grad_norm": 0.4573155343532562, + "learning_rate": 7.310308679838999e-06, + "loss": 0.7819, + "step": 50240 + }, + { + "epoch": 0.6280407010175254, + "grad_norm": 2.8232345581054688, + "learning_rate": 7.30946816084908e-06, + "loss": 0.731, + "step": 50242 + }, + { + "epoch": 0.628065701642541, + "grad_norm": 3.0935916900634766, + "learning_rate": 7.308627662349874e-06, + "loss": 0.9263, + "step": 50244 + }, + { + "epoch": 0.6280907022675567, + "grad_norm": 3.156174421310425, + "learning_rate": 7.307787184347779e-06, + "loss": 0.6521, + "step": 50246 + }, + { + "epoch": 0.6281157028925723, + "grad_norm": 4.321309566497803, + "learning_rate": 7.306946726849195e-06, + "loss": 1.887, + "step": 50248 + }, + { + "epoch": 0.628140703517588, + "grad_norm": 3.6313774585723877, + "learning_rate": 7.306106289860524e-06, + "loss": 0.3856, + "step": 50250 + }, + { + "epoch": 0.6281657041426035, + "grad_norm": 2.4821524620056152, + "learning_rate": 7.305265873388172e-06, + "loss": 0.0412, + "step": 50252 + }, + { + "epoch": 0.6281907047676192, + "grad_norm": 2.8260464668273926, + "learning_rate": 7.304425477438527e-06, + "loss": 0.454, + "step": 50254 + }, + { + "epoch": 0.6282157053926348, + "grad_norm": 4.64766788482666, + "learning_rate": 7.303585102017998e-06, + "loss": 1.3348, + "step": 50256 + }, + { + "epoch": 0.6282407060176505, + "grad_norm": 0.0004884384106844664, + "learning_rate": 7.302744747132985e-06, + "loss": 0.014, + "step": 50258 + }, + { + "epoch": 0.6282657066426661, + "grad_norm": 4.250599384307861, + "learning_rate": 7.301904412789886e-06, + "loss": 1.2347, + "step": 50260 + }, + { + "epoch": 0.6282907072676817, + "grad_norm": 3.6730945110321045, + "learning_rate": 7.3010640989951005e-06, + "loss": 2.0409, + "step": 50262 + }, + { + "epoch": 0.6283157078926973, + "grad_norm": 0.6808512806892395, + "learning_rate": 7.300223805755035e-06, + "loss": 0.307, + "step": 50264 + }, + { + "epoch": 0.628340708517713, + "grad_norm": 3.334472417831421, + "learning_rate": 7.299383533076076e-06, + "loss": 1.2321, + "step": 50266 + }, + { + "epoch": 0.6283657091427286, + "grad_norm": 0.4894437789916992, + "learning_rate": 7.2985432809646316e-06, + "loss": 0.8234, + "step": 50268 + }, + { + "epoch": 0.6283907097677442, + "grad_norm": 4.248380661010742, + "learning_rate": 7.2977030494271e-06, + "loss": 2.0204, + "step": 50270 + }, + { + "epoch": 0.6284157103927598, + "grad_norm": 0.0009770561009645462, + "learning_rate": 7.29686283846988e-06, + "loss": 0.5179, + "step": 50272 + }, + { + "epoch": 0.6284407110177754, + "grad_norm": 6.394770622253418, + "learning_rate": 7.29602264809937e-06, + "loss": 1.5239, + "step": 50274 + }, + { + "epoch": 0.6284657116427911, + "grad_norm": 3.0365495681762695, + "learning_rate": 7.295182478321969e-06, + "loss": 1.5314, + "step": 50276 + }, + { + "epoch": 0.6284907122678067, + "grad_norm": 0.0008485926664434373, + "learning_rate": 7.294342329144081e-06, + "loss": 1.1144, + "step": 50278 + }, + { + "epoch": 0.6285157128928224, + "grad_norm": 2.446343421936035, + "learning_rate": 7.293502200572092e-06, + "loss": 0.663, + "step": 50280 + }, + { + "epoch": 0.6285407135178379, + "grad_norm": 3.9073729515075684, + "learning_rate": 7.2926620926124105e-06, + "loss": 0.632, + "step": 50282 + }, + { + "epoch": 0.6285657141428536, + "grad_norm": 9.367226600646973, + "learning_rate": 7.29182200527143e-06, + "loss": 2.0782, + "step": 50284 + }, + { + "epoch": 0.6285907147678692, + "grad_norm": 1.940356731414795, + "learning_rate": 7.2909819385555525e-06, + "loss": 1.2733, + "step": 50286 + }, + { + "epoch": 0.6286157153928849, + "grad_norm": 1.398170828819275, + "learning_rate": 7.290141892471172e-06, + "loss": 1.1224, + "step": 50288 + }, + { + "epoch": 0.6286407160179005, + "grad_norm": 3.180795669555664, + "learning_rate": 7.289301867024692e-06, + "loss": 0.6159, + "step": 50290 + }, + { + "epoch": 0.628665716642916, + "grad_norm": 1.9403979778289795, + "learning_rate": 7.2884618622224975e-06, + "loss": 1.3769, + "step": 50292 + }, + { + "epoch": 0.6286907172679317, + "grad_norm": 4.811060905456543, + "learning_rate": 7.287621878070999e-06, + "loss": 1.1826, + "step": 50294 + }, + { + "epoch": 0.6287157178929473, + "grad_norm": 0.0006797440000809729, + "learning_rate": 7.2867819145765885e-06, + "loss": 0.6748, + "step": 50296 + }, + { + "epoch": 0.628740718517963, + "grad_norm": 1.8489460945129395, + "learning_rate": 7.285941971745662e-06, + "loss": 0.0617, + "step": 50298 + }, + { + "epoch": 0.6287657191429786, + "grad_norm": 4.3422770500183105, + "learning_rate": 7.285102049584618e-06, + "loss": 2.1498, + "step": 50300 + }, + { + "epoch": 0.6287907197679942, + "grad_norm": 1.004117488861084, + "learning_rate": 7.284262148099852e-06, + "loss": 0.0571, + "step": 50302 + }, + { + "epoch": 0.6288157203930098, + "grad_norm": 5.047908306121826, + "learning_rate": 7.283422267297765e-06, + "loss": 0.538, + "step": 50304 + }, + { + "epoch": 0.6288407210180255, + "grad_norm": 0.03525864705443382, + "learning_rate": 7.282582407184747e-06, + "loss": 0.0004, + "step": 50306 + }, + { + "epoch": 0.6288657216430411, + "grad_norm": 0.17587263882160187, + "learning_rate": 7.2817425677671984e-06, + "loss": 0.6239, + "step": 50308 + }, + { + "epoch": 0.6288907222680568, + "grad_norm": 3.5549538135528564, + "learning_rate": 7.280902749051513e-06, + "loss": 0.7232, + "step": 50310 + }, + { + "epoch": 0.6289157228930723, + "grad_norm": 5.651093482971191, + "learning_rate": 7.280062951044087e-06, + "loss": 1.1479, + "step": 50312 + }, + { + "epoch": 0.6289407235180879, + "grad_norm": 3.6231675148010254, + "learning_rate": 7.279223173751317e-06, + "loss": 0.917, + "step": 50314 + }, + { + "epoch": 0.6289657241431036, + "grad_norm": 4.53679084777832, + "learning_rate": 7.278383417179602e-06, + "loss": 2.0516, + "step": 50316 + }, + { + "epoch": 0.6289907247681192, + "grad_norm": 5.267906665802002, + "learning_rate": 7.277543681335331e-06, + "loss": 1.1535, + "step": 50318 + }, + { + "epoch": 0.6290157253931349, + "grad_norm": 6.109743118286133, + "learning_rate": 7.276703966224903e-06, + "loss": 2.0463, + "step": 50320 + }, + { + "epoch": 0.6290407260181504, + "grad_norm": 4.237891674041748, + "learning_rate": 7.275864271854711e-06, + "loss": 1.0953, + "step": 50322 + }, + { + "epoch": 0.6290657266431661, + "grad_norm": 2.4076621532440186, + "learning_rate": 7.275024598231153e-06, + "loss": 0.2362, + "step": 50324 + }, + { + "epoch": 0.6290907272681817, + "grad_norm": 0.40161117911338806, + "learning_rate": 7.274184945360621e-06, + "loss": 0.1952, + "step": 50326 + }, + { + "epoch": 0.6291157278931974, + "grad_norm": 3.591367721557617, + "learning_rate": 7.273345313249512e-06, + "loss": 1.9632, + "step": 50328 + }, + { + "epoch": 0.629140728518213, + "grad_norm": 1.5988487005233765, + "learning_rate": 7.2725057019042226e-06, + "loss": 0.8097, + "step": 50330 + }, + { + "epoch": 0.6291657291432285, + "grad_norm": 4.759165287017822, + "learning_rate": 7.27166611133114e-06, + "loss": 0.7303, + "step": 50332 + }, + { + "epoch": 0.6291907297682442, + "grad_norm": 3.3806324005126953, + "learning_rate": 7.270826541536664e-06, + "loss": 0.7194, + "step": 50334 + }, + { + "epoch": 0.6292157303932598, + "grad_norm": 0.0492473728954792, + "learning_rate": 7.269986992527185e-06, + "loss": 0.0717, + "step": 50336 + }, + { + "epoch": 0.6292407310182755, + "grad_norm": 3.8096859455108643, + "learning_rate": 7.269147464309101e-06, + "loss": 0.6697, + "step": 50338 + }, + { + "epoch": 0.6292657316432911, + "grad_norm": 0.0018654477316886187, + "learning_rate": 7.268307956888803e-06, + "loss": 0.8658, + "step": 50340 + }, + { + "epoch": 0.6292907322683067, + "grad_norm": 4.995418548583984, + "learning_rate": 7.2674684702726875e-06, + "loss": 1.0982, + "step": 50342 + }, + { + "epoch": 0.6293157328933223, + "grad_norm": 3.9372968673706055, + "learning_rate": 7.2666290044671435e-06, + "loss": 0.5609, + "step": 50344 + }, + { + "epoch": 0.629340733518338, + "grad_norm": 3.605407238006592, + "learning_rate": 7.265789559478567e-06, + "loss": 0.9461, + "step": 50346 + }, + { + "epoch": 0.6293657341433536, + "grad_norm": 6.393402576446533, + "learning_rate": 7.26495013531335e-06, + "loss": 1.0791, + "step": 50348 + }, + { + "epoch": 0.6293907347683693, + "grad_norm": 2.7243924140930176, + "learning_rate": 7.264110731977887e-06, + "loss": 0.133, + "step": 50350 + }, + { + "epoch": 0.6294157353933848, + "grad_norm": 3.9004392623901367, + "learning_rate": 7.263271349478568e-06, + "loss": 0.6428, + "step": 50352 + }, + { + "epoch": 0.6294407360184004, + "grad_norm": 14.129731178283691, + "learning_rate": 7.262431987821788e-06, + "loss": 0.7005, + "step": 50354 + }, + { + "epoch": 0.6294657366434161, + "grad_norm": 3.0161170959472656, + "learning_rate": 7.261592647013942e-06, + "loss": 1.2527, + "step": 50356 + }, + { + "epoch": 0.6294907372684317, + "grad_norm": 4.138798713684082, + "learning_rate": 7.2607533270614165e-06, + "loss": 0.684, + "step": 50358 + }, + { + "epoch": 0.6295157378934474, + "grad_norm": 4.394496440887451, + "learning_rate": 7.259914027970605e-06, + "loss": 1.2866, + "step": 50360 + }, + { + "epoch": 0.6295407385184629, + "grad_norm": 2.4889299869537354, + "learning_rate": 7.259074749747903e-06, + "loss": 0.7874, + "step": 50362 + }, + { + "epoch": 0.6295657391434786, + "grad_norm": 0.0015703600365668535, + "learning_rate": 7.2582354923996986e-06, + "loss": 0.0004, + "step": 50364 + }, + { + "epoch": 0.6295907397684942, + "grad_norm": 4.138012886047363, + "learning_rate": 7.257396255932385e-06, + "loss": 0.7923, + "step": 50366 + }, + { + "epoch": 0.6296157403935099, + "grad_norm": 2.774965286254883, + "learning_rate": 7.2565570403523565e-06, + "loss": 1.1165, + "step": 50368 + }, + { + "epoch": 0.6296407410185255, + "grad_norm": 0.0005651334649883211, + "learning_rate": 7.255717845665999e-06, + "loss": 0.0456, + "step": 50370 + }, + { + "epoch": 0.629665741643541, + "grad_norm": 1.8804041147232056, + "learning_rate": 7.2548786718797056e-06, + "loss": 0.6278, + "step": 50372 + }, + { + "epoch": 0.6296907422685567, + "grad_norm": 2.948863983154297, + "learning_rate": 7.254039518999871e-06, + "loss": 0.8916, + "step": 50374 + }, + { + "epoch": 0.6297157428935723, + "grad_norm": 0.7431193590164185, + "learning_rate": 7.253200387032879e-06, + "loss": 0.0661, + "step": 50376 + }, + { + "epoch": 0.629740743518588, + "grad_norm": 9.213643074035645, + "learning_rate": 7.252361275985127e-06, + "loss": 1.1454, + "step": 50378 + }, + { + "epoch": 0.6297657441436036, + "grad_norm": 3.890937566757202, + "learning_rate": 7.251522185863002e-06, + "loss": 0.4299, + "step": 50380 + }, + { + "epoch": 0.6297907447686192, + "grad_norm": 2.1200268268585205, + "learning_rate": 7.250683116672898e-06, + "loss": 0.5322, + "step": 50382 + }, + { + "epoch": 0.6298157453936348, + "grad_norm": 0.7666321992874146, + "learning_rate": 7.249844068421201e-06, + "loss": 0.7591, + "step": 50384 + }, + { + "epoch": 0.6298407460186505, + "grad_norm": 5.256551742553711, + "learning_rate": 7.249005041114302e-06, + "loss": 0.7634, + "step": 50386 + }, + { + "epoch": 0.6298657466436661, + "grad_norm": 2.3259363174438477, + "learning_rate": 7.248166034758592e-06, + "loss": 0.1561, + "step": 50388 + }, + { + "epoch": 0.6298907472686818, + "grad_norm": 3.782216787338257, + "learning_rate": 7.247327049360459e-06, + "loss": 0.8876, + "step": 50390 + }, + { + "epoch": 0.6299157478936973, + "grad_norm": 3.7003769874572754, + "learning_rate": 7.246488084926295e-06, + "loss": 0.8614, + "step": 50392 + }, + { + "epoch": 0.629940748518713, + "grad_norm": 4.893348217010498, + "learning_rate": 7.245649141462492e-06, + "loss": 1.5241, + "step": 50394 + }, + { + "epoch": 0.6299657491437286, + "grad_norm": 0.001993623562157154, + "learning_rate": 7.244810218975431e-06, + "loss": 0.0175, + "step": 50396 + }, + { + "epoch": 0.6299907497687443, + "grad_norm": 3.175452470779419, + "learning_rate": 7.2439713174715064e-06, + "loss": 0.9307, + "step": 50398 + }, + { + "epoch": 0.6300157503937599, + "grad_norm": 3.602217435836792, + "learning_rate": 7.243132436957108e-06, + "loss": 1.994, + "step": 50400 + }, + { + "epoch": 0.6300407510187754, + "grad_norm": 0.9105775952339172, + "learning_rate": 7.242293577438622e-06, + "loss": 0.3739, + "step": 50402 + }, + { + "epoch": 0.6300657516437911, + "grad_norm": 0.0006561426562257111, + "learning_rate": 7.2414547389224375e-06, + "loss": 0.6286, + "step": 50404 + }, + { + "epoch": 0.6300907522688067, + "grad_norm": 2.9759535789489746, + "learning_rate": 7.240615921414944e-06, + "loss": 0.6704, + "step": 50406 + }, + { + "epoch": 0.6301157528938224, + "grad_norm": 3.154818296432495, + "learning_rate": 7.239777124922532e-06, + "loss": 1.1025, + "step": 50408 + }, + { + "epoch": 0.630140753518838, + "grad_norm": 3.8341894149780273, + "learning_rate": 7.238938349451584e-06, + "loss": 0.4012, + "step": 50410 + }, + { + "epoch": 0.6301657541438536, + "grad_norm": 6.251564025878906, + "learning_rate": 7.238099595008492e-06, + "loss": 1.0979, + "step": 50412 + }, + { + "epoch": 0.6301907547688692, + "grad_norm": 3.5258476734161377, + "learning_rate": 7.237260861599642e-06, + "loss": 1.0877, + "step": 50414 + }, + { + "epoch": 0.6302157553938849, + "grad_norm": 5.237618923187256, + "learning_rate": 7.2364221492314234e-06, + "loss": 0.3203, + "step": 50416 + }, + { + "epoch": 0.6302407560189005, + "grad_norm": 4.127844333648682, + "learning_rate": 7.235583457910223e-06, + "loss": 2.1427, + "step": 50418 + }, + { + "epoch": 0.6302657566439162, + "grad_norm": 2.607264280319214, + "learning_rate": 7.234744787642429e-06, + "loss": 1.2791, + "step": 50420 + }, + { + "epoch": 0.6302907572689317, + "grad_norm": 3.6109695434570312, + "learning_rate": 7.2339061384344275e-06, + "loss": 0.5576, + "step": 50422 + }, + { + "epoch": 0.6303157578939473, + "grad_norm": 3.2866222858428955, + "learning_rate": 7.233067510292604e-06, + "loss": 1.8472, + "step": 50424 + }, + { + "epoch": 0.630340758518963, + "grad_norm": 2.628575325012207, + "learning_rate": 7.2322289032233485e-06, + "loss": 0.5821, + "step": 50426 + }, + { + "epoch": 0.6303657591439786, + "grad_norm": 2.2823941707611084, + "learning_rate": 7.2313903172330444e-06, + "loss": 0.9236, + "step": 50428 + }, + { + "epoch": 0.6303907597689943, + "grad_norm": 2.937641143798828, + "learning_rate": 7.2305517523280814e-06, + "loss": 0.754, + "step": 50430 + }, + { + "epoch": 0.6304157603940098, + "grad_norm": 2.1880698204040527, + "learning_rate": 7.2297132085148445e-06, + "loss": 0.6583, + "step": 50432 + }, + { + "epoch": 0.6304407610190255, + "grad_norm": 0.915178120136261, + "learning_rate": 7.228874685799723e-06, + "loss": 0.7227, + "step": 50434 + }, + { + "epoch": 0.6304657616440411, + "grad_norm": 4.3688883781433105, + "learning_rate": 7.228036184189096e-06, + "loss": 1.1423, + "step": 50436 + }, + { + "epoch": 0.6304907622690568, + "grad_norm": 5.810098171234131, + "learning_rate": 7.227197703689355e-06, + "loss": 1.4797, + "step": 50438 + }, + { + "epoch": 0.6305157628940724, + "grad_norm": 3.869784116744995, + "learning_rate": 7.226359244306883e-06, + "loss": 1.2348, + "step": 50440 + }, + { + "epoch": 0.6305407635190879, + "grad_norm": 7.200867176055908, + "learning_rate": 7.225520806048067e-06, + "loss": 2.1057, + "step": 50442 + }, + { + "epoch": 0.6305657641441036, + "grad_norm": 2.5616509914398193, + "learning_rate": 7.224682388919293e-06, + "loss": 1.2697, + "step": 50444 + }, + { + "epoch": 0.6305907647691192, + "grad_norm": 3.4507834911346436, + "learning_rate": 7.223843992926947e-06, + "loss": 0.8149, + "step": 50446 + }, + { + "epoch": 0.6306157653941349, + "grad_norm": 4.376596450805664, + "learning_rate": 7.223005618077411e-06, + "loss": 0.8044, + "step": 50448 + }, + { + "epoch": 0.6306407660191505, + "grad_norm": 0.24838677048683167, + "learning_rate": 7.222167264377071e-06, + "loss": 0.4886, + "step": 50450 + }, + { + "epoch": 0.6306657666441661, + "grad_norm": 3.062086820602417, + "learning_rate": 7.221328931832312e-06, + "loss": 1.1601, + "step": 50452 + }, + { + "epoch": 0.6306907672691817, + "grad_norm": 7.492276668548584, + "learning_rate": 7.2204906204495195e-06, + "loss": 2.1875, + "step": 50454 + }, + { + "epoch": 0.6307157678941974, + "grad_norm": 4.292409896850586, + "learning_rate": 7.219652330235077e-06, + "loss": 0.5981, + "step": 50456 + }, + { + "epoch": 0.630740768519213, + "grad_norm": 5.4887847900390625, + "learning_rate": 7.218814061195369e-06, + "loss": 1.2164, + "step": 50458 + }, + { + "epoch": 0.6307657691442287, + "grad_norm": 2.7748587131500244, + "learning_rate": 7.217975813336784e-06, + "loss": 0.621, + "step": 50460 + }, + { + "epoch": 0.6307907697692442, + "grad_norm": 2.9535815715789795, + "learning_rate": 7.217137586665696e-06, + "loss": 0.9608, + "step": 50462 + }, + { + "epoch": 0.6308157703942598, + "grad_norm": 11.642765998840332, + "learning_rate": 7.216299381188497e-06, + "loss": 1.6777, + "step": 50464 + }, + { + "epoch": 0.6308407710192755, + "grad_norm": 3.486966371536255, + "learning_rate": 7.2154611969115684e-06, + "loss": 0.2119, + "step": 50466 + }, + { + "epoch": 0.6308657716442911, + "grad_norm": 1.141109585762024, + "learning_rate": 7.214623033841293e-06, + "loss": 0.037, + "step": 50468 + }, + { + "epoch": 0.6308907722693068, + "grad_norm": 2.800832509994507, + "learning_rate": 7.213784891984055e-06, + "loss": 1.0864, + "step": 50470 + }, + { + "epoch": 0.6309157728943223, + "grad_norm": 2.9828600883483887, + "learning_rate": 7.212946771346239e-06, + "loss": 1.5493, + "step": 50472 + }, + { + "epoch": 0.630940773519338, + "grad_norm": 0.0014813989400863647, + "learning_rate": 7.212108671934224e-06, + "loss": 0.1221, + "step": 50474 + }, + { + "epoch": 0.6309657741443536, + "grad_norm": 2.32169508934021, + "learning_rate": 7.211270593754396e-06, + "loss": 0.8343, + "step": 50476 + }, + { + "epoch": 0.6309907747693693, + "grad_norm": 1.9980289936065674, + "learning_rate": 7.2104325368131354e-06, + "loss": 1.118, + "step": 50478 + }, + { + "epoch": 0.6310157753943849, + "grad_norm": 5.179196834564209, + "learning_rate": 7.209594501116827e-06, + "loss": 0.5014, + "step": 50480 + }, + { + "epoch": 0.6310407760194005, + "grad_norm": 2.97794771194458, + "learning_rate": 7.2087564866718515e-06, + "loss": 0.715, + "step": 50482 + }, + { + "epoch": 0.6310657766444161, + "grad_norm": 2.4517838954925537, + "learning_rate": 7.207918493484594e-06, + "loss": 0.3928, + "step": 50484 + }, + { + "epoch": 0.6310907772694317, + "grad_norm": 0.000774106418248266, + "learning_rate": 7.207080521561434e-06, + "loss": 1.324, + "step": 50486 + }, + { + "epoch": 0.6311157778944474, + "grad_norm": 4.091993808746338, + "learning_rate": 7.206242570908753e-06, + "loss": 0.5283, + "step": 50488 + }, + { + "epoch": 0.631140778519463, + "grad_norm": 2.276074171066284, + "learning_rate": 7.205404641532933e-06, + "loss": 0.1367, + "step": 50490 + }, + { + "epoch": 0.6311657791444786, + "grad_norm": 4.559219837188721, + "learning_rate": 7.2045667334403554e-06, + "loss": 1.1115, + "step": 50492 + }, + { + "epoch": 0.6311907797694942, + "grad_norm": 1.132647156715393, + "learning_rate": 7.203728846637403e-06, + "loss": 0.6008, + "step": 50494 + }, + { + "epoch": 0.6312157803945099, + "grad_norm": 1.4433752298355103, + "learning_rate": 7.202890981130457e-06, + "loss": 0.0681, + "step": 50496 + }, + { + "epoch": 0.6312407810195255, + "grad_norm": 5.643654823303223, + "learning_rate": 7.202053136925895e-06, + "loss": 2.0505, + "step": 50498 + }, + { + "epoch": 0.6312657816445412, + "grad_norm": 3.5399837493896484, + "learning_rate": 7.201215314030108e-06, + "loss": 1.1714, + "step": 50500 + }, + { + "epoch": 0.6312907822695567, + "grad_norm": 1.8017640113830566, + "learning_rate": 7.200377512449463e-06, + "loss": 1.2941, + "step": 50502 + }, + { + "epoch": 0.6313157828945724, + "grad_norm": 0.32939982414245605, + "learning_rate": 7.199539732190348e-06, + "loss": 1.6656, + "step": 50504 + }, + { + "epoch": 0.631340783519588, + "grad_norm": 9.034295082092285, + "learning_rate": 7.1987019732591434e-06, + "loss": 1.924, + "step": 50506 + }, + { + "epoch": 0.6313657841446036, + "grad_norm": 4.249210357666016, + "learning_rate": 7.1978642356622266e-06, + "loss": 0.9149, + "step": 50508 + }, + { + "epoch": 0.6313907847696193, + "grad_norm": 11.908858299255371, + "learning_rate": 7.197026519405979e-06, + "loss": 2.0375, + "step": 50510 + }, + { + "epoch": 0.6314157853946348, + "grad_norm": 6.4283342361450195, + "learning_rate": 7.196188824496786e-06, + "loss": 1.296, + "step": 50512 + }, + { + "epoch": 0.6314407860196505, + "grad_norm": 7.3376145362854, + "learning_rate": 7.195351150941019e-06, + "loss": 0.5585, + "step": 50514 + }, + { + "epoch": 0.6314657866446661, + "grad_norm": 1.6619980335235596, + "learning_rate": 7.194513498745061e-06, + "loss": 0.3615, + "step": 50516 + }, + { + "epoch": 0.6314907872696818, + "grad_norm": 2.8536081314086914, + "learning_rate": 7.193675867915292e-06, + "loss": 0.6626, + "step": 50518 + }, + { + "epoch": 0.6315157878946974, + "grad_norm": 0.7105703353881836, + "learning_rate": 7.19283825845809e-06, + "loss": 0.0292, + "step": 50520 + }, + { + "epoch": 0.631540788519713, + "grad_norm": 0.6959567666053772, + "learning_rate": 7.1920006703798325e-06, + "loss": 0.1029, + "step": 50522 + }, + { + "epoch": 0.6315657891447286, + "grad_norm": 0.001006281585432589, + "learning_rate": 7.1911631036869034e-06, + "loss": 0.2418, + "step": 50524 + }, + { + "epoch": 0.6315907897697443, + "grad_norm": 4.030533790588379, + "learning_rate": 7.190325558385682e-06, + "loss": 0.792, + "step": 50526 + }, + { + "epoch": 0.6316157903947599, + "grad_norm": 0.000395496201235801, + "learning_rate": 7.189488034482541e-06, + "loss": 0.8352, + "step": 50528 + }, + { + "epoch": 0.6316407910197755, + "grad_norm": 0.001314805936999619, + "learning_rate": 7.188650531983863e-06, + "loss": 0.0989, + "step": 50530 + }, + { + "epoch": 0.6316657916447911, + "grad_norm": 3.405271291732788, + "learning_rate": 7.187813050896022e-06, + "loss": 0.6116, + "step": 50532 + }, + { + "epoch": 0.6316907922698067, + "grad_norm": 3.6976990699768066, + "learning_rate": 7.186975591225401e-06, + "loss": 0.2647, + "step": 50534 + }, + { + "epoch": 0.6317157928948224, + "grad_norm": 0.34625735878944397, + "learning_rate": 7.186138152978372e-06, + "loss": 1.0999, + "step": 50536 + }, + { + "epoch": 0.631740793519838, + "grad_norm": 8.3612699508667, + "learning_rate": 7.185300736161326e-06, + "loss": 0.5533, + "step": 50538 + }, + { + "epoch": 0.6317657941448537, + "grad_norm": 3.4286980628967285, + "learning_rate": 7.1844633407806256e-06, + "loss": 0.9217, + "step": 50540 + }, + { + "epoch": 0.6317907947698692, + "grad_norm": 9.021158218383789, + "learning_rate": 7.183625966842655e-06, + "loss": 1.8286, + "step": 50542 + }, + { + "epoch": 0.6318157953948849, + "grad_norm": 0.00041154082282446325, + "learning_rate": 7.182788614353791e-06, + "loss": 0.9777, + "step": 50544 + }, + { + "epoch": 0.6318407960199005, + "grad_norm": 2.899021625518799, + "learning_rate": 7.181951283320409e-06, + "loss": 1.253, + "step": 50546 + }, + { + "epoch": 0.6318657966449162, + "grad_norm": 3.0846400260925293, + "learning_rate": 7.181113973748887e-06, + "loss": 1.287, + "step": 50548 + }, + { + "epoch": 0.6318907972699318, + "grad_norm": 4.118391990661621, + "learning_rate": 7.180276685645604e-06, + "loss": 1.6051, + "step": 50550 + }, + { + "epoch": 0.6319157978949473, + "grad_norm": 3.9061317443847656, + "learning_rate": 7.179439419016938e-06, + "loss": 0.8868, + "step": 50552 + }, + { + "epoch": 0.631940798519963, + "grad_norm": 4.849267482757568, + "learning_rate": 7.17860217386926e-06, + "loss": 0.9955, + "step": 50554 + }, + { + "epoch": 0.6319657991449786, + "grad_norm": 3.1745738983154297, + "learning_rate": 7.177764950208947e-06, + "loss": 1.207, + "step": 50556 + }, + { + "epoch": 0.6319907997699943, + "grad_norm": 1.986120581626892, + "learning_rate": 7.17692774804238e-06, + "loss": 1.7892, + "step": 50558 + }, + { + "epoch": 0.6320158003950099, + "grad_norm": 2.7356059551239014, + "learning_rate": 7.176090567375928e-06, + "loss": 1.1955, + "step": 50560 + }, + { + "epoch": 0.6320408010200255, + "grad_norm": 5.363541126251221, + "learning_rate": 7.175253408215972e-06, + "loss": 1.7844, + "step": 50562 + }, + { + "epoch": 0.6320658016450411, + "grad_norm": 6.001918315887451, + "learning_rate": 7.174416270568892e-06, + "loss": 1.1098, + "step": 50564 + }, + { + "epoch": 0.6320908022700568, + "grad_norm": 3.253516912460327, + "learning_rate": 7.173579154441055e-06, + "loss": 0.6087, + "step": 50566 + }, + { + "epoch": 0.6321158028950724, + "grad_norm": 3.488121509552002, + "learning_rate": 7.172742059838837e-06, + "loss": 0.7866, + "step": 50568 + }, + { + "epoch": 0.6321408035200881, + "grad_norm": 4.353970527648926, + "learning_rate": 7.171904986768617e-06, + "loss": 1.0352, + "step": 50570 + }, + { + "epoch": 0.6321658041451036, + "grad_norm": 8.372625350952148, + "learning_rate": 7.171067935236767e-06, + "loss": 1.8786, + "step": 50572 + }, + { + "epoch": 0.6321908047701192, + "grad_norm": 0.0022896421141922474, + "learning_rate": 7.170230905249666e-06, + "loss": 0.0285, + "step": 50574 + }, + { + "epoch": 0.6322158053951349, + "grad_norm": 1.5864262580871582, + "learning_rate": 7.1693938968136855e-06, + "loss": 0.1948, + "step": 50576 + }, + { + "epoch": 0.6322408060201505, + "grad_norm": 0.00036931681097485125, + "learning_rate": 7.1685569099352046e-06, + "loss": 0.0435, + "step": 50578 + }, + { + "epoch": 0.6322658066451662, + "grad_norm": 4.359044551849365, + "learning_rate": 7.16771994462059e-06, + "loss": 1.0524, + "step": 50580 + }, + { + "epoch": 0.6322908072701817, + "grad_norm": 3.945751190185547, + "learning_rate": 7.16688300087622e-06, + "loss": 0.8374, + "step": 50582 + }, + { + "epoch": 0.6323158078951974, + "grad_norm": 3.752776622772217, + "learning_rate": 7.166046078708466e-06, + "loss": 0.8622, + "step": 50584 + }, + { + "epoch": 0.632340808520213, + "grad_norm": 1.615066409111023, + "learning_rate": 7.165209178123706e-06, + "loss": 0.5267, + "step": 50586 + }, + { + "epoch": 0.6323658091452287, + "grad_norm": 5.803128242492676, + "learning_rate": 7.164372299128313e-06, + "loss": 1.6982, + "step": 50588 + }, + { + "epoch": 0.6323908097702443, + "grad_norm": 3.6103553771972656, + "learning_rate": 7.163535441728664e-06, + "loss": 1.4129, + "step": 50590 + }, + { + "epoch": 0.6324158103952598, + "grad_norm": 0.000566535338293761, + "learning_rate": 7.162698605931123e-06, + "loss": 0.1995, + "step": 50592 + }, + { + "epoch": 0.6324408110202755, + "grad_norm": 2.4668145179748535, + "learning_rate": 7.161861791742069e-06, + "loss": 0.9463, + "step": 50594 + }, + { + "epoch": 0.6324658116452911, + "grad_norm": 3.888397216796875, + "learning_rate": 7.161024999167874e-06, + "loss": 0.9635, + "step": 50596 + }, + { + "epoch": 0.6324908122703068, + "grad_norm": 6.76105260848999, + "learning_rate": 7.160188228214908e-06, + "loss": 0.9648, + "step": 50598 + }, + { + "epoch": 0.6325158128953224, + "grad_norm": 1.801103949546814, + "learning_rate": 7.159351478889549e-06, + "loss": 0.7573, + "step": 50600 + }, + { + "epoch": 0.632540813520338, + "grad_norm": 4.258965492248535, + "learning_rate": 7.158514751198169e-06, + "loss": 1.5738, + "step": 50602 + }, + { + "epoch": 0.6325658141453536, + "grad_norm": 2.226741075515747, + "learning_rate": 7.157678045147141e-06, + "loss": 1.1576, + "step": 50604 + }, + { + "epoch": 0.6325908147703693, + "grad_norm": 3.2596428394317627, + "learning_rate": 7.1568413607428326e-06, + "loss": 0.5612, + "step": 50606 + }, + { + "epoch": 0.6326158153953849, + "grad_norm": 1.4100645780563354, + "learning_rate": 7.156004697991617e-06, + "loss": 0.4597, + "step": 50608 + }, + { + "epoch": 0.6326408160204006, + "grad_norm": 3.1360325813293457, + "learning_rate": 7.155168056899866e-06, + "loss": 0.4905, + "step": 50610 + }, + { + "epoch": 0.6326658166454161, + "grad_norm": 0.3577898144721985, + "learning_rate": 7.154331437473956e-06, + "loss": 0.8467, + "step": 50612 + }, + { + "epoch": 0.6326908172704317, + "grad_norm": 5.415145397186279, + "learning_rate": 7.153494839720254e-06, + "loss": 2.5788, + "step": 50614 + }, + { + "epoch": 0.6327158178954474, + "grad_norm": 4.008469104766846, + "learning_rate": 7.152658263645137e-06, + "loss": 0.7251, + "step": 50616 + }, + { + "epoch": 0.632740818520463, + "grad_norm": 0.7249493598937988, + "learning_rate": 7.151821709254968e-06, + "loss": 1.7933, + "step": 50618 + }, + { + "epoch": 0.6327658191454787, + "grad_norm": 1.0756357908248901, + "learning_rate": 7.150985176556122e-06, + "loss": 1.2159, + "step": 50620 + }, + { + "epoch": 0.6327908197704942, + "grad_norm": 3.035377264022827, + "learning_rate": 7.150148665554968e-06, + "loss": 1.234, + "step": 50622 + }, + { + "epoch": 0.6328158203955099, + "grad_norm": 2.855045795440674, + "learning_rate": 7.149312176257881e-06, + "loss": 0.6459, + "step": 50624 + }, + { + "epoch": 0.6328408210205255, + "grad_norm": 2.231776714324951, + "learning_rate": 7.148475708671229e-06, + "loss": 0.529, + "step": 50626 + }, + { + "epoch": 0.6328658216455412, + "grad_norm": 2.3180384635925293, + "learning_rate": 7.147639262801382e-06, + "loss": 0.9642, + "step": 50628 + }, + { + "epoch": 0.6328908222705568, + "grad_norm": 3.478320837020874, + "learning_rate": 7.146802838654717e-06, + "loss": 1.1115, + "step": 50630 + }, + { + "epoch": 0.6329158228955724, + "grad_norm": 4.855100154876709, + "learning_rate": 7.145966436237593e-06, + "loss": 1.4409, + "step": 50632 + }, + { + "epoch": 0.632940823520588, + "grad_norm": 3.4508960247039795, + "learning_rate": 7.1451300555563845e-06, + "loss": 0.5477, + "step": 50634 + }, + { + "epoch": 0.6329658241456037, + "grad_norm": 2.985107421875, + "learning_rate": 7.144293696617461e-06, + "loss": 0.5171, + "step": 50636 + }, + { + "epoch": 0.6329908247706193, + "grad_norm": 3.3709425926208496, + "learning_rate": 7.143457359427195e-06, + "loss": 0.9343, + "step": 50638 + }, + { + "epoch": 0.633015825395635, + "grad_norm": 3.7670962810516357, + "learning_rate": 7.142621043991953e-06, + "loss": 0.3095, + "step": 50640 + }, + { + "epoch": 0.6330408260206505, + "grad_norm": 3.26765775680542, + "learning_rate": 7.141784750318109e-06, + "loss": 0.569, + "step": 50642 + }, + { + "epoch": 0.6330658266456661, + "grad_norm": 8.333724975585938, + "learning_rate": 7.140948478412024e-06, + "loss": 1.3796, + "step": 50644 + }, + { + "epoch": 0.6330908272706818, + "grad_norm": 0.11330578476190567, + "learning_rate": 7.140112228280069e-06, + "loss": 0.5036, + "step": 50646 + }, + { + "epoch": 0.6331158278956974, + "grad_norm": 2.538393259048462, + "learning_rate": 7.139275999928619e-06, + "loss": 0.7119, + "step": 50648 + }, + { + "epoch": 0.6331408285207131, + "grad_norm": 5.47161865234375, + "learning_rate": 7.138439793364036e-06, + "loss": 1.1999, + "step": 50650 + }, + { + "epoch": 0.6331658291457286, + "grad_norm": 0.0020388199482113123, + "learning_rate": 7.1376036085926914e-06, + "loss": 0.6294, + "step": 50652 + }, + { + "epoch": 0.6331908297707443, + "grad_norm": 3.5113375186920166, + "learning_rate": 7.1367674456209535e-06, + "loss": 0.7948, + "step": 50654 + }, + { + "epoch": 0.6332158303957599, + "grad_norm": 1.67401921749115, + "learning_rate": 7.135931304455193e-06, + "loss": 0.0735, + "step": 50656 + }, + { + "epoch": 0.6332408310207756, + "grad_norm": 1.1324907541275024, + "learning_rate": 7.135095185101769e-06, + "loss": 0.8321, + "step": 50658 + }, + { + "epoch": 0.6332658316457912, + "grad_norm": 4.0401225090026855, + "learning_rate": 7.134259087567057e-06, + "loss": 1.5204, + "step": 50660 + }, + { + "epoch": 0.6332908322708067, + "grad_norm": 3.675105571746826, + "learning_rate": 7.133423011857423e-06, + "loss": 0.8601, + "step": 50662 + }, + { + "epoch": 0.6333158328958224, + "grad_norm": 3.7273645401000977, + "learning_rate": 7.132586957979233e-06, + "loss": 0.6412, + "step": 50664 + }, + { + "epoch": 0.633340833520838, + "grad_norm": 3.961435317993164, + "learning_rate": 7.131750925938856e-06, + "loss": 1.2247, + "step": 50666 + }, + { + "epoch": 0.6333658341458537, + "grad_norm": 3.9365296363830566, + "learning_rate": 7.130914915742663e-06, + "loss": 0.78, + "step": 50668 + }, + { + "epoch": 0.6333908347708693, + "grad_norm": 3.984771966934204, + "learning_rate": 7.13007892739701e-06, + "loss": 1.2125, + "step": 50670 + }, + { + "epoch": 0.6334158353958849, + "grad_norm": 3.27768874168396, + "learning_rate": 7.12924296090827e-06, + "loss": 0.9216, + "step": 50672 + }, + { + "epoch": 0.6334408360209005, + "grad_norm": 0.00043798566912300885, + "learning_rate": 7.128407016282811e-06, + "loss": 0.0448, + "step": 50674 + }, + { + "epoch": 0.6334658366459162, + "grad_norm": 3.107726812362671, + "learning_rate": 7.1275710935269996e-06, + "loss": 0.6994, + "step": 50676 + }, + { + "epoch": 0.6334908372709318, + "grad_norm": 0.1060759648680687, + "learning_rate": 7.126735192647198e-06, + "loss": 1.0215, + "step": 50678 + }, + { + "epoch": 0.6335158378959475, + "grad_norm": 0.720353901386261, + "learning_rate": 7.125899313649775e-06, + "loss": 0.9979, + "step": 50680 + }, + { + "epoch": 0.633540838520963, + "grad_norm": 0.12159816175699234, + "learning_rate": 7.125063456541103e-06, + "loss": 0.4195, + "step": 50682 + }, + { + "epoch": 0.6335658391459786, + "grad_norm": 4.866736888885498, + "learning_rate": 7.124227621327533e-06, + "loss": 0.4746, + "step": 50684 + }, + { + "epoch": 0.6335908397709943, + "grad_norm": 4.969725608825684, + "learning_rate": 7.123391808015441e-06, + "loss": 0.9136, + "step": 50686 + }, + { + "epoch": 0.6336158403960099, + "grad_norm": 2.0394387245178223, + "learning_rate": 7.122556016611191e-06, + "loss": 0.0745, + "step": 50688 + }, + { + "epoch": 0.6336408410210256, + "grad_norm": 4.007757663726807, + "learning_rate": 7.121720247121146e-06, + "loss": 0.3662, + "step": 50690 + }, + { + "epoch": 0.6336658416460411, + "grad_norm": 5.98060417175293, + "learning_rate": 7.120884499551675e-06, + "loss": 0.8226, + "step": 50692 + }, + { + "epoch": 0.6336908422710568, + "grad_norm": 3.6586568355560303, + "learning_rate": 7.120048773909143e-06, + "loss": 0.707, + "step": 50694 + }, + { + "epoch": 0.6337158428960724, + "grad_norm": 9.259367942810059, + "learning_rate": 7.119213070199906e-06, + "loss": 0.7249, + "step": 50696 + }, + { + "epoch": 0.6337408435210881, + "grad_norm": 1.2351723909378052, + "learning_rate": 7.118377388430338e-06, + "loss": 0.7082, + "step": 50698 + }, + { + "epoch": 0.6337658441461037, + "grad_norm": 5.528088092803955, + "learning_rate": 7.117541728606798e-06, + "loss": 1.7031, + "step": 50700 + }, + { + "epoch": 0.6337908447711192, + "grad_norm": 0.0035357431042939425, + "learning_rate": 7.116706090735655e-06, + "loss": 0.4423, + "step": 50702 + }, + { + "epoch": 0.6338158453961349, + "grad_norm": 0.0015671039000153542, + "learning_rate": 7.11587047482327e-06, + "loss": 1.0021, + "step": 50704 + }, + { + "epoch": 0.6338408460211505, + "grad_norm": 3.677307605743408, + "learning_rate": 7.1150348808760065e-06, + "loss": 0.9297, + "step": 50706 + }, + { + "epoch": 0.6338658466461662, + "grad_norm": 8.979536056518555, + "learning_rate": 7.114199308900233e-06, + "loss": 0.6968, + "step": 50708 + }, + { + "epoch": 0.6338908472711818, + "grad_norm": 0.06727071106433868, + "learning_rate": 7.1133637589023065e-06, + "loss": 0.0085, + "step": 50710 + }, + { + "epoch": 0.6339158478961974, + "grad_norm": 0.00041909964056685567, + "learning_rate": 7.112528230888594e-06, + "loss": 1.5128, + "step": 50712 + }, + { + "epoch": 0.633940848521213, + "grad_norm": 0.0025090863928198814, + "learning_rate": 7.111692724865458e-06, + "loss": 0.0, + "step": 50714 + }, + { + "epoch": 0.6339658491462287, + "grad_norm": 4.597709655761719, + "learning_rate": 7.110857240839262e-06, + "loss": 1.3773, + "step": 50716 + }, + { + "epoch": 0.6339908497712443, + "grad_norm": 3.2472636699676514, + "learning_rate": 7.1100217788163675e-06, + "loss": 1.3097, + "step": 50718 + }, + { + "epoch": 0.63401585039626, + "grad_norm": 0.0011273574782535434, + "learning_rate": 7.109186338803141e-06, + "loss": 0.0008, + "step": 50720 + }, + { + "epoch": 0.6340408510212755, + "grad_norm": 2.8254406452178955, + "learning_rate": 7.108350920805941e-06, + "loss": 0.9644, + "step": 50722 + }, + { + "epoch": 0.6340658516462911, + "grad_norm": 6.753553867340088, + "learning_rate": 7.107515524831131e-06, + "loss": 0.7995, + "step": 50724 + }, + { + "epoch": 0.6340908522713068, + "grad_norm": 2.8250558376312256, + "learning_rate": 7.106680150885074e-06, + "loss": 0.8349, + "step": 50726 + }, + { + "epoch": 0.6341158528963224, + "grad_norm": 3.469052791595459, + "learning_rate": 7.105844798974132e-06, + "loss": 1.1787, + "step": 50728 + }, + { + "epoch": 0.6341408535213381, + "grad_norm": 3.5398874282836914, + "learning_rate": 7.105009469104667e-06, + "loss": 0.995, + "step": 50730 + }, + { + "epoch": 0.6341658541463536, + "grad_norm": 3.7275993824005127, + "learning_rate": 7.10417416128304e-06, + "loss": 1.3981, + "step": 50732 + }, + { + "epoch": 0.6341908547713693, + "grad_norm": 5.532944202423096, + "learning_rate": 7.103338875515615e-06, + "loss": 1.1222, + "step": 50734 + }, + { + "epoch": 0.6342158553963849, + "grad_norm": 0.6296989917755127, + "learning_rate": 7.102503611808749e-06, + "loss": 1.458, + "step": 50736 + }, + { + "epoch": 0.6342408560214006, + "grad_norm": 1.1023825407028198, + "learning_rate": 7.101668370168806e-06, + "loss": 0.7379, + "step": 50738 + }, + { + "epoch": 0.6342658566464162, + "grad_norm": 0.24347880482673645, + "learning_rate": 7.100833150602148e-06, + "loss": 0.0194, + "step": 50740 + }, + { + "epoch": 0.6342908572714318, + "grad_norm": 0.0014109048061072826, + "learning_rate": 7.099997953115134e-06, + "loss": 0.8578, + "step": 50742 + }, + { + "epoch": 0.6343158578964474, + "grad_norm": 4.314843654632568, + "learning_rate": 7.0991627777141255e-06, + "loss": 1.2741, + "step": 50744 + }, + { + "epoch": 0.634340858521463, + "grad_norm": 0.056842755526304245, + "learning_rate": 7.098327624405485e-06, + "loss": 0.6802, + "step": 50746 + }, + { + "epoch": 0.6343658591464787, + "grad_norm": 0.0017735455185174942, + "learning_rate": 7.097492493195568e-06, + "loss": 0.0132, + "step": 50748 + }, + { + "epoch": 0.6343908597714943, + "grad_norm": 0.0006142017082311213, + "learning_rate": 7.096657384090739e-06, + "loss": 0.0, + "step": 50750 + }, + { + "epoch": 0.6344158603965099, + "grad_norm": 2.5441813468933105, + "learning_rate": 7.095822297097357e-06, + "loss": 0.1074, + "step": 50752 + }, + { + "epoch": 0.6344408610215255, + "grad_norm": 5.123103141784668, + "learning_rate": 7.094987232221781e-06, + "loss": 0.9033, + "step": 50754 + }, + { + "epoch": 0.6344658616465412, + "grad_norm": 3.1565747261047363, + "learning_rate": 7.094152189470372e-06, + "loss": 0.9981, + "step": 50756 + }, + { + "epoch": 0.6344908622715568, + "grad_norm": 4.360045909881592, + "learning_rate": 7.0933171688494884e-06, + "loss": 1.2884, + "step": 50758 + }, + { + "epoch": 0.6345158628965725, + "grad_norm": 1.7075310945510864, + "learning_rate": 7.092482170365492e-06, + "loss": 0.1179, + "step": 50760 + }, + { + "epoch": 0.634540863521588, + "grad_norm": 5.254124641418457, + "learning_rate": 7.091647194024739e-06, + "loss": 0.79, + "step": 50762 + }, + { + "epoch": 0.6345658641466037, + "grad_norm": 3.849857807159424, + "learning_rate": 7.090812239833589e-06, + "loss": 1.5889, + "step": 50764 + }, + { + "epoch": 0.6345908647716193, + "grad_norm": 0.8407412171363831, + "learning_rate": 7.089977307798402e-06, + "loss": 0.5684, + "step": 50766 + }, + { + "epoch": 0.634615865396635, + "grad_norm": 1.997767686843872, + "learning_rate": 7.089142397925537e-06, + "loss": 0.1586, + "step": 50768 + }, + { + "epoch": 0.6346408660216506, + "grad_norm": 3.556212902069092, + "learning_rate": 7.088307510221351e-06, + "loss": 0.5193, + "step": 50770 + }, + { + "epoch": 0.6346658666466661, + "grad_norm": 1.8574155569076538, + "learning_rate": 7.0874726446922036e-06, + "loss": 1.6534, + "step": 50772 + }, + { + "epoch": 0.6346908672716818, + "grad_norm": 3.1383934020996094, + "learning_rate": 7.086637801344454e-06, + "loss": 0.8777, + "step": 50774 + }, + { + "epoch": 0.6347158678966974, + "grad_norm": 0.0008340132189914584, + "learning_rate": 7.085802980184458e-06, + "loss": 0.3346, + "step": 50776 + }, + { + "epoch": 0.6347408685217131, + "grad_norm": 4.306807518005371, + "learning_rate": 7.084968181218572e-06, + "loss": 0.8321, + "step": 50778 + }, + { + "epoch": 0.6347658691467287, + "grad_norm": 0.012387379072606564, + "learning_rate": 7.084133404453159e-06, + "loss": 1.551, + "step": 50780 + }, + { + "epoch": 0.6347908697717443, + "grad_norm": 3.6256773471832275, + "learning_rate": 7.083298649894572e-06, + "loss": 1.5158, + "step": 50782 + }, + { + "epoch": 0.6348158703967599, + "grad_norm": 3.0970802307128906, + "learning_rate": 7.082463917549172e-06, + "loss": 0.8341, + "step": 50784 + }, + { + "epoch": 0.6348408710217756, + "grad_norm": 2.036832332611084, + "learning_rate": 7.081629207423314e-06, + "loss": 1.4434, + "step": 50786 + }, + { + "epoch": 0.6348658716467912, + "grad_norm": 3.367748498916626, + "learning_rate": 7.080794519523353e-06, + "loss": 0.7204, + "step": 50788 + }, + { + "epoch": 0.6348908722718068, + "grad_norm": 4.665306568145752, + "learning_rate": 7.079959853855649e-06, + "loss": 1.477, + "step": 50790 + }, + { + "epoch": 0.6349158728968224, + "grad_norm": 3.3524420261383057, + "learning_rate": 7.0791252104265585e-06, + "loss": 1.1784, + "step": 50792 + }, + { + "epoch": 0.634940873521838, + "grad_norm": 2.6840250492095947, + "learning_rate": 7.078290589242437e-06, + "loss": 0.8028, + "step": 50794 + }, + { + "epoch": 0.6349658741468537, + "grad_norm": 3.917288303375244, + "learning_rate": 7.077455990309641e-06, + "loss": 1.4778, + "step": 50796 + }, + { + "epoch": 0.6349908747718693, + "grad_norm": 2.226484775543213, + "learning_rate": 7.0766214136345266e-06, + "loss": 0.4209, + "step": 50798 + }, + { + "epoch": 0.635015875396885, + "grad_norm": 7.612183570861816, + "learning_rate": 7.075786859223452e-06, + "loss": 1.7259, + "step": 50800 + }, + { + "epoch": 0.6350408760219005, + "grad_norm": 1.1295456886291504, + "learning_rate": 7.07495232708277e-06, + "loss": 1.2677, + "step": 50802 + }, + { + "epoch": 0.6350658766469162, + "grad_norm": 0.001453121891245246, + "learning_rate": 7.074117817218837e-06, + "loss": 0.7771, + "step": 50804 + }, + { + "epoch": 0.6350908772719318, + "grad_norm": 2.7186033725738525, + "learning_rate": 7.07328332963801e-06, + "loss": 1.145, + "step": 50806 + }, + { + "epoch": 0.6351158778969475, + "grad_norm": 4.552781105041504, + "learning_rate": 7.072448864346642e-06, + "loss": 0.8499, + "step": 50808 + }, + { + "epoch": 0.6351408785219631, + "grad_norm": 6.242684841156006, + "learning_rate": 7.07161442135109e-06, + "loss": 0.7958, + "step": 50810 + }, + { + "epoch": 0.6351658791469786, + "grad_norm": 0.8936429023742676, + "learning_rate": 7.07078000065771e-06, + "loss": 0.7086, + "step": 50812 + }, + { + "epoch": 0.6351908797719943, + "grad_norm": 2.502434015274048, + "learning_rate": 7.069945602272853e-06, + "loss": 0.2188, + "step": 50814 + }, + { + "epoch": 0.6352158803970099, + "grad_norm": 2.5888590812683105, + "learning_rate": 7.069111226202876e-06, + "loss": 0.5001, + "step": 50816 + }, + { + "epoch": 0.6352408810220256, + "grad_norm": 3.012194871902466, + "learning_rate": 7.068276872454134e-06, + "loss": 1.2858, + "step": 50818 + }, + { + "epoch": 0.6352658816470412, + "grad_norm": 2.037911891937256, + "learning_rate": 7.0674425410329804e-06, + "loss": 0.3959, + "step": 50820 + }, + { + "epoch": 0.6352908822720568, + "grad_norm": 3.7686901092529297, + "learning_rate": 7.066608231945771e-06, + "loss": 0.7471, + "step": 50822 + }, + { + "epoch": 0.6353158828970724, + "grad_norm": 2.1816728115081787, + "learning_rate": 7.065773945198857e-06, + "loss": 0.759, + "step": 50824 + }, + { + "epoch": 0.6353408835220881, + "grad_norm": 2.314645290374756, + "learning_rate": 7.064939680798596e-06, + "loss": 1.0899, + "step": 50826 + }, + { + "epoch": 0.6353658841471037, + "grad_norm": 1.9704276323318481, + "learning_rate": 7.064105438751337e-06, + "loss": 0.2919, + "step": 50828 + }, + { + "epoch": 0.6353908847721194, + "grad_norm": 1.5950977802276611, + "learning_rate": 7.0632712190634365e-06, + "loss": 0.0188, + "step": 50830 + }, + { + "epoch": 0.6354158853971349, + "grad_norm": 5.039239406585693, + "learning_rate": 7.062437021741246e-06, + "loss": 2.1058, + "step": 50832 + }, + { + "epoch": 0.6354408860221505, + "grad_norm": 7.314911842346191, + "learning_rate": 7.061602846791121e-06, + "loss": 1.754, + "step": 50834 + }, + { + "epoch": 0.6354658866471662, + "grad_norm": 2.7392570972442627, + "learning_rate": 7.060768694219413e-06, + "loss": 0.367, + "step": 50836 + }, + { + "epoch": 0.6354908872721818, + "grad_norm": 3.8534348011016846, + "learning_rate": 7.0599345640324765e-06, + "loss": 0.7451, + "step": 50838 + }, + { + "epoch": 0.6355158878971975, + "grad_norm": 0.002035203156992793, + "learning_rate": 7.059100456236661e-06, + "loss": 0.057, + "step": 50840 + }, + { + "epoch": 0.635540888522213, + "grad_norm": 0.0016089484561234713, + "learning_rate": 7.05826637083832e-06, + "loss": 0.863, + "step": 50842 + }, + { + "epoch": 0.6355658891472287, + "grad_norm": 0.0012473184615373611, + "learning_rate": 7.057432307843808e-06, + "loss": 0.4518, + "step": 50844 + }, + { + "epoch": 0.6355908897722443, + "grad_norm": 2.773871898651123, + "learning_rate": 7.056598267259473e-06, + "loss": 0.4611, + "step": 50846 + }, + { + "epoch": 0.63561589039726, + "grad_norm": 2.338038206100464, + "learning_rate": 7.05576424909167e-06, + "loss": 0.849, + "step": 50848 + }, + { + "epoch": 0.6356408910222756, + "grad_norm": 4.405813694000244, + "learning_rate": 7.054930253346751e-06, + "loss": 1.3822, + "step": 50850 + }, + { + "epoch": 0.6356658916472911, + "grad_norm": 4.762069225311279, + "learning_rate": 7.054096280031068e-06, + "loss": 1.3862, + "step": 50852 + }, + { + "epoch": 0.6356908922723068, + "grad_norm": 3.671708106994629, + "learning_rate": 7.053262329150971e-06, + "loss": 1.327, + "step": 50854 + }, + { + "epoch": 0.6357158928973224, + "grad_norm": 3.4624686241149902, + "learning_rate": 7.05242840071281e-06, + "loss": 0.5383, + "step": 50856 + }, + { + "epoch": 0.6357408935223381, + "grad_norm": 7.6878156661987305, + "learning_rate": 7.051594494722937e-06, + "loss": 0.8877, + "step": 50858 + }, + { + "epoch": 0.6357658941473537, + "grad_norm": 0.42958080768585205, + "learning_rate": 7.050760611187703e-06, + "loss": 0.0357, + "step": 50860 + }, + { + "epoch": 0.6357908947723693, + "grad_norm": 0.7417863607406616, + "learning_rate": 7.049926750113461e-06, + "loss": 0.0308, + "step": 50862 + }, + { + "epoch": 0.6358158953973849, + "grad_norm": 4.104420185089111, + "learning_rate": 7.04909291150656e-06, + "loss": 1.8682, + "step": 50864 + }, + { + "epoch": 0.6358408960224006, + "grad_norm": 0.0009354427456855774, + "learning_rate": 7.048259095373349e-06, + "loss": 0.0972, + "step": 50866 + }, + { + "epoch": 0.6358658966474162, + "grad_norm": 6.12534236907959, + "learning_rate": 7.047425301720179e-06, + "loss": 1.3225, + "step": 50868 + }, + { + "epoch": 0.6358908972724319, + "grad_norm": 6.621542453765869, + "learning_rate": 7.0465915305534e-06, + "loss": 1.758, + "step": 50870 + }, + { + "epoch": 0.6359158978974474, + "grad_norm": 7.968045711517334, + "learning_rate": 7.045757781879362e-06, + "loss": 0.6705, + "step": 50872 + }, + { + "epoch": 0.635940898522463, + "grad_norm": 3.7434005737304688, + "learning_rate": 7.044924055704416e-06, + "loss": 0.7525, + "step": 50874 + }, + { + "epoch": 0.6359658991474787, + "grad_norm": 3.3745832443237305, + "learning_rate": 7.044090352034909e-06, + "loss": 1.0704, + "step": 50876 + }, + { + "epoch": 0.6359908997724943, + "grad_norm": 0.004154459107667208, + "learning_rate": 7.0432566708771955e-06, + "loss": 0.2353, + "step": 50878 + }, + { + "epoch": 0.63601590039751, + "grad_norm": 2.5979249477386475, + "learning_rate": 7.042423012237618e-06, + "loss": 0.9881, + "step": 50880 + }, + { + "epoch": 0.6360409010225255, + "grad_norm": 2.3346924781799316, + "learning_rate": 7.041589376122529e-06, + "loss": 0.9894, + "step": 50882 + }, + { + "epoch": 0.6360659016475412, + "grad_norm": 2.0974645614624023, + "learning_rate": 7.040755762538278e-06, + "loss": 0.8106, + "step": 50884 + }, + { + "epoch": 0.6360909022725568, + "grad_norm": 1.5296376943588257, + "learning_rate": 7.0399221714912115e-06, + "loss": 0.6347, + "step": 50886 + }, + { + "epoch": 0.6361159028975725, + "grad_norm": 2.614185094833374, + "learning_rate": 7.039088602987679e-06, + "loss": 0.1206, + "step": 50888 + }, + { + "epoch": 0.6361409035225881, + "grad_norm": 5.1273088455200195, + "learning_rate": 7.038255057034031e-06, + "loss": 2.4143, + "step": 50890 + }, + { + "epoch": 0.6361659041476037, + "grad_norm": 3.214951515197754, + "learning_rate": 7.037421533636613e-06, + "loss": 0.6855, + "step": 50892 + }, + { + "epoch": 0.6361909047726193, + "grad_norm": 3.518568754196167, + "learning_rate": 7.036588032801772e-06, + "loss": 0.8064, + "step": 50894 + }, + { + "epoch": 0.636215905397635, + "grad_norm": 2.6395692825317383, + "learning_rate": 7.035754554535858e-06, + "loss": 1.0577, + "step": 50896 + }, + { + "epoch": 0.6362409060226506, + "grad_norm": 3.7988383769989014, + "learning_rate": 7.034921098845219e-06, + "loss": 1.8822, + "step": 50898 + }, + { + "epoch": 0.6362659066476662, + "grad_norm": 3.406747579574585, + "learning_rate": 7.0340876657362e-06, + "loss": 0.7655, + "step": 50900 + }, + { + "epoch": 0.6362909072726818, + "grad_norm": 3.3186569213867188, + "learning_rate": 7.03325425521515e-06, + "loss": 0.8501, + "step": 50902 + }, + { + "epoch": 0.6363159078976974, + "grad_norm": 1.5058181285858154, + "learning_rate": 7.032420867288421e-06, + "loss": 0.8749, + "step": 50904 + }, + { + "epoch": 0.6363409085227131, + "grad_norm": 3.8636810779571533, + "learning_rate": 7.031587501962349e-06, + "loss": 0.9113, + "step": 50906 + }, + { + "epoch": 0.6363659091477287, + "grad_norm": 3.6759114265441895, + "learning_rate": 7.030754159243289e-06, + "loss": 1.1925, + "step": 50908 + }, + { + "epoch": 0.6363909097727444, + "grad_norm": 4.1051249504089355, + "learning_rate": 7.029920839137585e-06, + "loss": 1.4569, + "step": 50910 + }, + { + "epoch": 0.6364159103977599, + "grad_norm": 0.702987790107727, + "learning_rate": 7.029087541651585e-06, + "loss": 0.0263, + "step": 50912 + }, + { + "epoch": 0.6364409110227756, + "grad_norm": 5.237281322479248, + "learning_rate": 7.028254266791631e-06, + "loss": 2.0759, + "step": 50914 + }, + { + "epoch": 0.6364659116477912, + "grad_norm": 1.989828109741211, + "learning_rate": 7.027421014564077e-06, + "loss": 1.3512, + "step": 50916 + }, + { + "epoch": 0.6364909122728069, + "grad_norm": 1.7165316343307495, + "learning_rate": 7.026587784975263e-06, + "loss": 0.0866, + "step": 50918 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 7.354743957519531, + "learning_rate": 7.025754578031534e-06, + "loss": 1.431, + "step": 50920 + }, + { + "epoch": 0.636540913522838, + "grad_norm": 2.6350560188293457, + "learning_rate": 7.024921393739239e-06, + "loss": 1.4137, + "step": 50922 + }, + { + "epoch": 0.6365659141478537, + "grad_norm": 4.104562759399414, + "learning_rate": 7.02408823210472e-06, + "loss": 0.8524, + "step": 50924 + }, + { + "epoch": 0.6365909147728693, + "grad_norm": 3.4466488361358643, + "learning_rate": 7.023255093134326e-06, + "loss": 1.181, + "step": 50926 + }, + { + "epoch": 0.636615915397885, + "grad_norm": 0.0017393587622791529, + "learning_rate": 7.022421976834397e-06, + "loss": 0.5778, + "step": 50928 + }, + { + "epoch": 0.6366409160229006, + "grad_norm": 6.689764976501465, + "learning_rate": 7.0215888832112875e-06, + "loss": 0.8396, + "step": 50930 + }, + { + "epoch": 0.6366659166479162, + "grad_norm": 3.143709897994995, + "learning_rate": 7.020755812271333e-06, + "loss": 0.624, + "step": 50932 + }, + { + "epoch": 0.6366909172729318, + "grad_norm": 0.08475350588560104, + "learning_rate": 7.019922764020879e-06, + "loss": 0.4788, + "step": 50934 + }, + { + "epoch": 0.6367159178979475, + "grad_norm": 2.8682966232299805, + "learning_rate": 7.019089738466274e-06, + "loss": 0.4786, + "step": 50936 + }, + { + "epoch": 0.6367409185229631, + "grad_norm": 3.0678791999816895, + "learning_rate": 7.018256735613858e-06, + "loss": 1.2935, + "step": 50938 + }, + { + "epoch": 0.6367659191479788, + "grad_norm": 9.781718254089355, + "learning_rate": 7.017423755469975e-06, + "loss": 1.3156, + "step": 50940 + }, + { + "epoch": 0.6367909197729943, + "grad_norm": 0.19375979900360107, + "learning_rate": 7.016590798040978e-06, + "loss": 0.0047, + "step": 50942 + }, + { + "epoch": 0.6368159203980099, + "grad_norm": 4.176888942718506, + "learning_rate": 7.015757863333199e-06, + "loss": 1.0054, + "step": 50944 + }, + { + "epoch": 0.6368409210230256, + "grad_norm": 2.6173722743988037, + "learning_rate": 7.014924951352987e-06, + "loss": 1.0636, + "step": 50946 + }, + { + "epoch": 0.6368659216480412, + "grad_norm": 6.179560661315918, + "learning_rate": 7.0140920621066835e-06, + "loss": 0.4406, + "step": 50948 + }, + { + "epoch": 0.6368909222730569, + "grad_norm": 0.03842882812023163, + "learning_rate": 7.013259195600633e-06, + "loss": 0.0799, + "step": 50950 + }, + { + "epoch": 0.6369159228980724, + "grad_norm": 4.9557366371154785, + "learning_rate": 7.012426351841177e-06, + "loss": 1.4486, + "step": 50952 + }, + { + "epoch": 0.6369409235230881, + "grad_norm": 3.506654739379883, + "learning_rate": 7.0115935308346594e-06, + "loss": 0.3817, + "step": 50954 + }, + { + "epoch": 0.6369659241481037, + "grad_norm": 0.9732296466827393, + "learning_rate": 7.010760732587429e-06, + "loss": 1.4366, + "step": 50956 + }, + { + "epoch": 0.6369909247731194, + "grad_norm": 2.809821367263794, + "learning_rate": 7.009927957105817e-06, + "loss": 0.3264, + "step": 50958 + }, + { + "epoch": 0.637015925398135, + "grad_norm": 3.5497822761535645, + "learning_rate": 7.00909520439617e-06, + "loss": 1.1109, + "step": 50960 + }, + { + "epoch": 0.6370409260231505, + "grad_norm": 4.597183704376221, + "learning_rate": 7.008262474464833e-06, + "loss": 1.5639, + "step": 50962 + }, + { + "epoch": 0.6370659266481662, + "grad_norm": 3.297266960144043, + "learning_rate": 7.007429767318142e-06, + "loss": 1.5799, + "step": 50964 + }, + { + "epoch": 0.6370909272731818, + "grad_norm": 6.146159648895264, + "learning_rate": 7.006597082962446e-06, + "loss": 0.539, + "step": 50966 + }, + { + "epoch": 0.6371159278981975, + "grad_norm": 2.6813173294067383, + "learning_rate": 7.005764421404087e-06, + "loss": 0.2304, + "step": 50968 + }, + { + "epoch": 0.6371409285232131, + "grad_norm": 1.391095757484436, + "learning_rate": 7.004931782649398e-06, + "loss": 0.7279, + "step": 50970 + }, + { + "epoch": 0.6371659291482287, + "grad_norm": 3.926809549331665, + "learning_rate": 7.0040991667047265e-06, + "loss": 1.6191, + "step": 50972 + }, + { + "epoch": 0.6371909297732443, + "grad_norm": 5.019786357879639, + "learning_rate": 7.00326657357641e-06, + "loss": 1.1178, + "step": 50974 + }, + { + "epoch": 0.63721593039826, + "grad_norm": 5.404219627380371, + "learning_rate": 7.002434003270791e-06, + "loss": 2.3511, + "step": 50976 + }, + { + "epoch": 0.6372409310232756, + "grad_norm": 8.115324974060059, + "learning_rate": 7.0016014557942136e-06, + "loss": 1.9999, + "step": 50978 + }, + { + "epoch": 0.6372659316482913, + "grad_norm": 3.4391019344329834, + "learning_rate": 7.000768931153014e-06, + "loss": 1.5372, + "step": 50980 + }, + { + "epoch": 0.6372909322733068, + "grad_norm": 1.3605151176452637, + "learning_rate": 6.999936429353538e-06, + "loss": 0.4115, + "step": 50982 + }, + { + "epoch": 0.6373159328983224, + "grad_norm": 4.43638277053833, + "learning_rate": 6.9991039504021185e-06, + "loss": 1.3867, + "step": 50984 + }, + { + "epoch": 0.6373409335233381, + "grad_norm": 4.626322269439697, + "learning_rate": 6.998271494305099e-06, + "loss": 1.6892, + "step": 50986 + }, + { + "epoch": 0.6373659341483537, + "grad_norm": 0.03957902640104294, + "learning_rate": 6.997439061068818e-06, + "loss": 0.5857, + "step": 50988 + }, + { + "epoch": 0.6373909347733694, + "grad_norm": 6.980583667755127, + "learning_rate": 6.996606650699616e-06, + "loss": 0.9421, + "step": 50990 + }, + { + "epoch": 0.6374159353983849, + "grad_norm": 2.7933714389801025, + "learning_rate": 6.9957742632038354e-06, + "loss": 0.9156, + "step": 50992 + }, + { + "epoch": 0.6374409360234006, + "grad_norm": 4.748910427093506, + "learning_rate": 6.994941898587817e-06, + "loss": 2.5245, + "step": 50994 + }, + { + "epoch": 0.6374659366484162, + "grad_norm": 3.3547589778900146, + "learning_rate": 6.994109556857891e-06, + "loss": 0.7603, + "step": 50996 + }, + { + "epoch": 0.6374909372734319, + "grad_norm": 1.4289954900741577, + "learning_rate": 6.993277238020402e-06, + "loss": 0.9981, + "step": 50998 + }, + { + "epoch": 0.6375159378984475, + "grad_norm": 2.960028648376465, + "learning_rate": 6.992444942081689e-06, + "loss": 1.2236, + "step": 51000 + }, + { + "epoch": 0.637540938523463, + "grad_norm": 3.4013688564300537, + "learning_rate": 6.991612669048087e-06, + "loss": 0.5453, + "step": 51002 + }, + { + "epoch": 0.6375659391484787, + "grad_norm": 2.7238590717315674, + "learning_rate": 6.99078041892594e-06, + "loss": 1.0, + "step": 51004 + }, + { + "epoch": 0.6375909397734943, + "grad_norm": 3.184009552001953, + "learning_rate": 6.989948191721583e-06, + "loss": 1.371, + "step": 51006 + }, + { + "epoch": 0.63761594039851, + "grad_norm": 4.623490810394287, + "learning_rate": 6.9891159874413594e-06, + "loss": 0.9994, + "step": 51008 + }, + { + "epoch": 0.6376409410235256, + "grad_norm": 2.950672149658203, + "learning_rate": 6.9882838060916e-06, + "loss": 0.7386, + "step": 51010 + }, + { + "epoch": 0.6376659416485412, + "grad_norm": 0.0008326507522724569, + "learning_rate": 6.9874516476786445e-06, + "loss": 0.106, + "step": 51012 + }, + { + "epoch": 0.6376909422735568, + "grad_norm": 5.526440620422363, + "learning_rate": 6.986619512208827e-06, + "loss": 1.1064, + "step": 51014 + }, + { + "epoch": 0.6377159428985725, + "grad_norm": 0.43092310428619385, + "learning_rate": 6.985787399688495e-06, + "loss": 0.0348, + "step": 51016 + }, + { + "epoch": 0.6377409435235881, + "grad_norm": 6.174936294555664, + "learning_rate": 6.984955310123978e-06, + "loss": 1.1238, + "step": 51018 + }, + { + "epoch": 0.6377659441486038, + "grad_norm": 1.7571254968643188, + "learning_rate": 6.984123243521614e-06, + "loss": 1.5783, + "step": 51020 + }, + { + "epoch": 0.6377909447736193, + "grad_norm": 5.395869731903076, + "learning_rate": 6.983291199887746e-06, + "loss": 0.7768, + "step": 51022 + }, + { + "epoch": 0.637815945398635, + "grad_norm": 1.9353933334350586, + "learning_rate": 6.982459179228702e-06, + "loss": 0.6054, + "step": 51024 + }, + { + "epoch": 0.6378409460236506, + "grad_norm": 3.8490588665008545, + "learning_rate": 6.981627181550818e-06, + "loss": 1.505, + "step": 51026 + }, + { + "epoch": 0.6378659466486662, + "grad_norm": 0.0006083059124648571, + "learning_rate": 6.98079520686044e-06, + "loss": 0.1406, + "step": 51028 + }, + { + "epoch": 0.6378909472736819, + "grad_norm": 0.42648744583129883, + "learning_rate": 6.979963255163896e-06, + "loss": 0.056, + "step": 51030 + }, + { + "epoch": 0.6379159478986974, + "grad_norm": 3.744903802871704, + "learning_rate": 6.979131326467527e-06, + "loss": 1.2042, + "step": 51032 + }, + { + "epoch": 0.6379409485237131, + "grad_norm": 5.903531074523926, + "learning_rate": 6.978299420777669e-06, + "loss": 1.1104, + "step": 51034 + }, + { + "epoch": 0.6379659491487287, + "grad_norm": 7.3602423667907715, + "learning_rate": 6.977467538100651e-06, + "loss": 0.5031, + "step": 51036 + }, + { + "epoch": 0.6379909497737444, + "grad_norm": 2.9713664054870605, + "learning_rate": 6.976635678442811e-06, + "loss": 1.4926, + "step": 51038 + }, + { + "epoch": 0.63801595039876, + "grad_norm": 4.7511820793151855, + "learning_rate": 6.975803841810489e-06, + "loss": 0.9949, + "step": 51040 + }, + { + "epoch": 0.6380409510237756, + "grad_norm": 4.303913593292236, + "learning_rate": 6.9749720282100175e-06, + "loss": 0.9636, + "step": 51042 + }, + { + "epoch": 0.6380659516487912, + "grad_norm": 2.395010471343994, + "learning_rate": 6.97414023764773e-06, + "loss": 0.4611, + "step": 51044 + }, + { + "epoch": 0.6380909522738069, + "grad_norm": 4.288966178894043, + "learning_rate": 6.973308470129964e-06, + "loss": 0.848, + "step": 51046 + }, + { + "epoch": 0.6381159528988225, + "grad_norm": 1.0254806280136108, + "learning_rate": 6.972476725663056e-06, + "loss": 0.3735, + "step": 51048 + }, + { + "epoch": 0.6381409535238382, + "grad_norm": 7.344531059265137, + "learning_rate": 6.971645004253331e-06, + "loss": 2.1086, + "step": 51050 + }, + { + "epoch": 0.6381659541488537, + "grad_norm": 4.14522647857666, + "learning_rate": 6.970813305907131e-06, + "loss": 1.6278, + "step": 51052 + }, + { + "epoch": 0.6381909547738693, + "grad_norm": 2.569112777709961, + "learning_rate": 6.969981630630789e-06, + "loss": 0.5346, + "step": 51054 + }, + { + "epoch": 0.638215955398885, + "grad_norm": 0.0012899235589429736, + "learning_rate": 6.969149978430638e-06, + "loss": 0.9711, + "step": 51056 + }, + { + "epoch": 0.6382409560239006, + "grad_norm": 4.115455627441406, + "learning_rate": 6.968318349313013e-06, + "loss": 0.8061, + "step": 51058 + }, + { + "epoch": 0.6382659566489163, + "grad_norm": 3.965264081954956, + "learning_rate": 6.967486743284251e-06, + "loss": 1.3837, + "step": 51060 + }, + { + "epoch": 0.6382909572739318, + "grad_norm": 4.339714527130127, + "learning_rate": 6.966655160350675e-06, + "loss": 0.9022, + "step": 51062 + }, + { + "epoch": 0.6383159578989475, + "grad_norm": 0.004425887484103441, + "learning_rate": 6.965823600518624e-06, + "loss": 0.564, + "step": 51064 + }, + { + "epoch": 0.6383409585239631, + "grad_norm": 2.0320775508880615, + "learning_rate": 6.964992063794434e-06, + "loss": 0.2223, + "step": 51066 + }, + { + "epoch": 0.6383659591489788, + "grad_norm": 4.423220157623291, + "learning_rate": 6.964160550184433e-06, + "loss": 0.6894, + "step": 51068 + }, + { + "epoch": 0.6383909597739944, + "grad_norm": 3.650723695755005, + "learning_rate": 6.963329059694958e-06, + "loss": 0.6484, + "step": 51070 + }, + { + "epoch": 0.6384159603990099, + "grad_norm": 2.8548872470855713, + "learning_rate": 6.962497592332338e-06, + "loss": 0.5425, + "step": 51072 + }, + { + "epoch": 0.6384409610240256, + "grad_norm": 3.5079920291900635, + "learning_rate": 6.961666148102912e-06, + "loss": 1.25, + "step": 51074 + }, + { + "epoch": 0.6384659616490412, + "grad_norm": 4.51297664642334, + "learning_rate": 6.960834727013001e-06, + "loss": 1.1741, + "step": 51076 + }, + { + "epoch": 0.6384909622740569, + "grad_norm": 2.639159917831421, + "learning_rate": 6.960003329068944e-06, + "loss": 0.3143, + "step": 51078 + }, + { + "epoch": 0.6385159628990725, + "grad_norm": 5.905904293060303, + "learning_rate": 6.959171954277072e-06, + "loss": 0.6236, + "step": 51080 + }, + { + "epoch": 0.6385409635240881, + "grad_norm": 5.693204402923584, + "learning_rate": 6.958340602643716e-06, + "loss": 1.5154, + "step": 51082 + }, + { + "epoch": 0.6385659641491037, + "grad_norm": 0.0038024457171559334, + "learning_rate": 6.957509274175209e-06, + "loss": 0.0458, + "step": 51084 + }, + { + "epoch": 0.6385909647741194, + "grad_norm": 3.924187421798706, + "learning_rate": 6.956677968877885e-06, + "loss": 0.6932, + "step": 51086 + }, + { + "epoch": 0.638615965399135, + "grad_norm": 3.9562549591064453, + "learning_rate": 6.9558466867580645e-06, + "loss": 1.0711, + "step": 51088 + }, + { + "epoch": 0.6386409660241507, + "grad_norm": 5.958951473236084, + "learning_rate": 6.9550154278220875e-06, + "loss": 0.8801, + "step": 51090 + }, + { + "epoch": 0.6386659666491662, + "grad_norm": 2.9436967372894287, + "learning_rate": 6.954184192076282e-06, + "loss": 0.6256, + "step": 51092 + }, + { + "epoch": 0.6386909672741818, + "grad_norm": 4.675594806671143, + "learning_rate": 6.953352979526979e-06, + "loss": 1.3156, + "step": 51094 + }, + { + "epoch": 0.6387159678991975, + "grad_norm": 0.4086940586566925, + "learning_rate": 6.952521790180509e-06, + "loss": 0.3928, + "step": 51096 + }, + { + "epoch": 0.6387409685242131, + "grad_norm": 2.5564022064208984, + "learning_rate": 6.951690624043203e-06, + "loss": 1.5745, + "step": 51098 + }, + { + "epoch": 0.6387659691492288, + "grad_norm": 3.348324775695801, + "learning_rate": 6.9508594811213905e-06, + "loss": 0.4006, + "step": 51100 + }, + { + "epoch": 0.6387909697742443, + "grad_norm": 2.6327593326568604, + "learning_rate": 6.9500283614214e-06, + "loss": 0.9649, + "step": 51102 + }, + { + "epoch": 0.63881597039926, + "grad_norm": 3.311319351196289, + "learning_rate": 6.9491972649495635e-06, + "loss": 0.984, + "step": 51104 + }, + { + "epoch": 0.6388409710242756, + "grad_norm": 6.254487991333008, + "learning_rate": 6.948366191712207e-06, + "loss": 1.8508, + "step": 51106 + }, + { + "epoch": 0.6388659716492913, + "grad_norm": 2.7046420574188232, + "learning_rate": 6.947535141715662e-06, + "loss": 0.7433, + "step": 51108 + }, + { + "epoch": 0.6388909722743069, + "grad_norm": 2.79996395111084, + "learning_rate": 6.946704114966258e-06, + "loss": 0.5922, + "step": 51110 + }, + { + "epoch": 0.6389159728993224, + "grad_norm": 5.300802707672119, + "learning_rate": 6.945873111470327e-06, + "loss": 0.7863, + "step": 51112 + }, + { + "epoch": 0.6389409735243381, + "grad_norm": 3.3876566886901855, + "learning_rate": 6.945042131234192e-06, + "loss": 1.5282, + "step": 51114 + }, + { + "epoch": 0.6389659741493537, + "grad_norm": 4.294957637786865, + "learning_rate": 6.944211174264184e-06, + "loss": 1.8813, + "step": 51116 + }, + { + "epoch": 0.6389909747743694, + "grad_norm": 7.248355388641357, + "learning_rate": 6.943380240566631e-06, + "loss": 1.7254, + "step": 51118 + }, + { + "epoch": 0.639015975399385, + "grad_norm": 4.53693151473999, + "learning_rate": 6.942549330147863e-06, + "loss": 1.3524, + "step": 51120 + }, + { + "epoch": 0.6390409760244006, + "grad_norm": 0.001429858966730535, + "learning_rate": 6.941718443014206e-06, + "loss": 0.4177, + "step": 51122 + }, + { + "epoch": 0.6390659766494162, + "grad_norm": 5.163188457489014, + "learning_rate": 6.940887579171989e-06, + "loss": 1.1072, + "step": 51124 + }, + { + "epoch": 0.6390909772744319, + "grad_norm": 2.990569591522217, + "learning_rate": 6.940056738627541e-06, + "loss": 1.1542, + "step": 51126 + }, + { + "epoch": 0.6391159778994475, + "grad_norm": 3.3483147621154785, + "learning_rate": 6.939225921387187e-06, + "loss": 0.7119, + "step": 51128 + }, + { + "epoch": 0.6391409785244632, + "grad_norm": 3.014965057373047, + "learning_rate": 6.938395127457256e-06, + "loss": 0.3538, + "step": 51130 + }, + { + "epoch": 0.6391659791494787, + "grad_norm": 2.6673736572265625, + "learning_rate": 6.937564356844073e-06, + "loss": 1.5359, + "step": 51132 + }, + { + "epoch": 0.6391909797744943, + "grad_norm": 4.272195816040039, + "learning_rate": 6.936733609553968e-06, + "loss": 1.9767, + "step": 51134 + }, + { + "epoch": 0.63921598039951, + "grad_norm": 1.6832493543624878, + "learning_rate": 6.935902885593266e-06, + "loss": 0.0118, + "step": 51136 + }, + { + "epoch": 0.6392409810245256, + "grad_norm": 0.9444360733032227, + "learning_rate": 6.935072184968298e-06, + "loss": 0.038, + "step": 51138 + }, + { + "epoch": 0.6392659816495413, + "grad_norm": 6.6397318840026855, + "learning_rate": 6.934241507685384e-06, + "loss": 0.5948, + "step": 51140 + }, + { + "epoch": 0.6392909822745568, + "grad_norm": 1.7503678798675537, + "learning_rate": 6.933410853750853e-06, + "loss": 0.0502, + "step": 51142 + }, + { + "epoch": 0.6393159828995725, + "grad_norm": 5.011228561401367, + "learning_rate": 6.932580223171031e-06, + "loss": 2.7652, + "step": 51144 + }, + { + "epoch": 0.6393409835245881, + "grad_norm": 4.916162014007568, + "learning_rate": 6.931749615952244e-06, + "loss": 0.9878, + "step": 51146 + }, + { + "epoch": 0.6393659841496038, + "grad_norm": 3.830820322036743, + "learning_rate": 6.930919032100818e-06, + "loss": 0.9566, + "step": 51148 + }, + { + "epoch": 0.6393909847746194, + "grad_norm": 20.579483032226562, + "learning_rate": 6.93008847162308e-06, + "loss": 1.5047, + "step": 51150 + }, + { + "epoch": 0.639415985399635, + "grad_norm": 2.288240671157837, + "learning_rate": 6.929257934525355e-06, + "loss": 1.0098, + "step": 51152 + }, + { + "epoch": 0.6394409860246506, + "grad_norm": 2.524817943572998, + "learning_rate": 6.928427420813966e-06, + "loss": 1.0514, + "step": 51154 + }, + { + "epoch": 0.6394659866496663, + "grad_norm": 3.907644748687744, + "learning_rate": 6.927596930495237e-06, + "loss": 0.6644, + "step": 51156 + }, + { + "epoch": 0.6394909872746819, + "grad_norm": 3.249464273452759, + "learning_rate": 6.926766463575498e-06, + "loss": 1.369, + "step": 51158 + }, + { + "epoch": 0.6395159878996975, + "grad_norm": 3.6686577796936035, + "learning_rate": 6.92593602006107e-06, + "loss": 1.204, + "step": 51160 + }, + { + "epoch": 0.6395409885247131, + "grad_norm": 2.147468328475952, + "learning_rate": 6.925105599958279e-06, + "loss": 0.2277, + "step": 51162 + }, + { + "epoch": 0.6395659891497287, + "grad_norm": 3.4758083820343018, + "learning_rate": 6.924275203273451e-06, + "loss": 0.6909, + "step": 51164 + }, + { + "epoch": 0.6395909897747444, + "grad_norm": 31.69948959350586, + "learning_rate": 6.923444830012905e-06, + "loss": 1.1734, + "step": 51166 + }, + { + "epoch": 0.63961599039976, + "grad_norm": 4.473012924194336, + "learning_rate": 6.92261448018297e-06, + "loss": 1.5653, + "step": 51168 + }, + { + "epoch": 0.6396409910247757, + "grad_norm": 3.0821456909179688, + "learning_rate": 6.921784153789966e-06, + "loss": 0.4571, + "step": 51170 + }, + { + "epoch": 0.6396659916497912, + "grad_norm": 0.0027124101761728525, + "learning_rate": 6.92095385084022e-06, + "loss": 0.7018, + "step": 51172 + }, + { + "epoch": 0.6396909922748069, + "grad_norm": 3.5223188400268555, + "learning_rate": 6.9201235713400525e-06, + "loss": 1.192, + "step": 51174 + }, + { + "epoch": 0.6397159928998225, + "grad_norm": 4.039261341094971, + "learning_rate": 6.919293315295789e-06, + "loss": 1.2195, + "step": 51176 + }, + { + "epoch": 0.6397409935248382, + "grad_norm": 3.99564528465271, + "learning_rate": 6.918463082713755e-06, + "loss": 0.799, + "step": 51178 + }, + { + "epoch": 0.6397659941498538, + "grad_norm": 3.0312016010284424, + "learning_rate": 6.9176328736002675e-06, + "loss": 0.8251, + "step": 51180 + }, + { + "epoch": 0.6397909947748693, + "grad_norm": 0.48040834069252014, + "learning_rate": 6.916802687961652e-06, + "loss": 0.3398, + "step": 51182 + }, + { + "epoch": 0.639815995399885, + "grad_norm": 2.035776376724243, + "learning_rate": 6.915972525804231e-06, + "loss": 0.0379, + "step": 51184 + }, + { + "epoch": 0.6398409960249006, + "grad_norm": 1.7925587892532349, + "learning_rate": 6.915142387134328e-06, + "loss": 0.2122, + "step": 51186 + }, + { + "epoch": 0.6398659966499163, + "grad_norm": 3.2808656692504883, + "learning_rate": 6.914312271958265e-06, + "loss": 0.4133, + "step": 51188 + }, + { + "epoch": 0.6398909972749319, + "grad_norm": 0.4865111708641052, + "learning_rate": 6.913482180282364e-06, + "loss": 0.7906, + "step": 51190 + }, + { + "epoch": 0.6399159978999475, + "grad_norm": 1.6221553087234497, + "learning_rate": 6.912652112112944e-06, + "loss": 0.6531, + "step": 51192 + }, + { + "epoch": 0.6399409985249631, + "grad_norm": 4.418407440185547, + "learning_rate": 6.91182206745633e-06, + "loss": 1.1862, + "step": 51194 + }, + { + "epoch": 0.6399659991499788, + "grad_norm": 0.002851148834452033, + "learning_rate": 6.9109920463188416e-06, + "loss": 0.0001, + "step": 51196 + }, + { + "epoch": 0.6399909997749944, + "grad_norm": 4.000275135040283, + "learning_rate": 6.9101620487068015e-06, + "loss": 0.8654, + "step": 51198 + }, + { + "epoch": 0.64001600040001, + "grad_norm": 1.811432123184204, + "learning_rate": 6.90933207462653e-06, + "loss": 1.2106, + "step": 51200 + }, + { + "epoch": 0.6400410010250256, + "grad_norm": 1.1992433071136475, + "learning_rate": 6.908502124084349e-06, + "loss": 0.0184, + "step": 51202 + }, + { + "epoch": 0.6400660016500412, + "grad_norm": 3.7679624557495117, + "learning_rate": 6.907672197086579e-06, + "loss": 1.8634, + "step": 51204 + }, + { + "epoch": 0.6400910022750569, + "grad_norm": 5.618790149688721, + "learning_rate": 6.90684229363954e-06, + "loss": 0.8461, + "step": 51206 + }, + { + "epoch": 0.6401160029000725, + "grad_norm": 0.0007559390505775809, + "learning_rate": 6.906012413749552e-06, + "loss": 0.0067, + "step": 51208 + }, + { + "epoch": 0.6401410035250882, + "grad_norm": 3.4010229110717773, + "learning_rate": 6.905182557422936e-06, + "loss": 1.0767, + "step": 51210 + }, + { + "epoch": 0.6401660041501037, + "grad_norm": 4.074017524719238, + "learning_rate": 6.904352724666012e-06, + "loss": 1.0806, + "step": 51212 + }, + { + "epoch": 0.6401910047751194, + "grad_norm": 3.9125890731811523, + "learning_rate": 6.9035229154850994e-06, + "loss": 0.9406, + "step": 51214 + }, + { + "epoch": 0.640216005400135, + "grad_norm": 3.4249916076660156, + "learning_rate": 6.902693129886522e-06, + "loss": 1.4646, + "step": 51216 + }, + { + "epoch": 0.6402410060251507, + "grad_norm": 2.6821775436401367, + "learning_rate": 6.9018633678765916e-06, + "loss": 0.9587, + "step": 51218 + }, + { + "epoch": 0.6402660066501663, + "grad_norm": 2.2996599674224854, + "learning_rate": 6.901033629461633e-06, + "loss": 0.4909, + "step": 51220 + }, + { + "epoch": 0.6402910072751818, + "grad_norm": 5.252904891967773, + "learning_rate": 6.9002039146479624e-06, + "loss": 1.1106, + "step": 51222 + }, + { + "epoch": 0.6403160079001975, + "grad_norm": 3.0398998260498047, + "learning_rate": 6.899374223441903e-06, + "loss": 0.7756, + "step": 51224 + }, + { + "epoch": 0.6403410085252131, + "grad_norm": 3.1989054679870605, + "learning_rate": 6.898544555849769e-06, + "loss": 0.5001, + "step": 51226 + }, + { + "epoch": 0.6403660091502288, + "grad_norm": 3.5548830032348633, + "learning_rate": 6.8977149118778815e-06, + "loss": 1.512, + "step": 51228 + }, + { + "epoch": 0.6403910097752444, + "grad_norm": 4.115508556365967, + "learning_rate": 6.896885291532563e-06, + "loss": 0.8288, + "step": 51230 + }, + { + "epoch": 0.64041601040026, + "grad_norm": 3.0297913551330566, + "learning_rate": 6.896055694820122e-06, + "loss": 0.6697, + "step": 51232 + }, + { + "epoch": 0.6404410110252756, + "grad_norm": 3.742357015609741, + "learning_rate": 6.895226121746883e-06, + "loss": 0.6457, + "step": 51234 + }, + { + "epoch": 0.6404660116502913, + "grad_norm": 6.040389060974121, + "learning_rate": 6.894396572319162e-06, + "loss": 2.8696, + "step": 51236 + }, + { + "epoch": 0.6404910122753069, + "grad_norm": 1.0220094919204712, + "learning_rate": 6.893567046543279e-06, + "loss": 1.041, + "step": 51238 + }, + { + "epoch": 0.6405160129003226, + "grad_norm": 2.587592124938965, + "learning_rate": 6.892737544425549e-06, + "loss": 0.6304, + "step": 51240 + }, + { + "epoch": 0.6405410135253381, + "grad_norm": 1.5853599309921265, + "learning_rate": 6.891908065972294e-06, + "loss": 1.1242, + "step": 51242 + }, + { + "epoch": 0.6405660141503537, + "grad_norm": 2.34121036529541, + "learning_rate": 6.891078611189824e-06, + "loss": 0.526, + "step": 51244 + }, + { + "epoch": 0.6405910147753694, + "grad_norm": 1.9634084701538086, + "learning_rate": 6.890249180084461e-06, + "loss": 0.351, + "step": 51246 + }, + { + "epoch": 0.640616015400385, + "grad_norm": 6.55671501159668, + "learning_rate": 6.889419772662519e-06, + "loss": 0.5143, + "step": 51248 + }, + { + "epoch": 0.6406410160254007, + "grad_norm": 1.6186800003051758, + "learning_rate": 6.8885903889303165e-06, + "loss": 1.1013, + "step": 51250 + }, + { + "epoch": 0.6406660166504162, + "grad_norm": 6.354857921600342, + "learning_rate": 6.887761028894171e-06, + "loss": 0.7255, + "step": 51252 + }, + { + "epoch": 0.6406910172754319, + "grad_norm": 2.4640870094299316, + "learning_rate": 6.8869316925603956e-06, + "loss": 0.6464, + "step": 51254 + }, + { + "epoch": 0.6407160179004475, + "grad_norm": 2.982329845428467, + "learning_rate": 6.886102379935311e-06, + "loss": 0.1488, + "step": 51256 + }, + { + "epoch": 0.6407410185254632, + "grad_norm": 7.6234588623046875, + "learning_rate": 6.885273091025229e-06, + "loss": 1.7063, + "step": 51258 + }, + { + "epoch": 0.6407660191504788, + "grad_norm": 1.2654825448989868, + "learning_rate": 6.884443825836465e-06, + "loss": 0.454, + "step": 51260 + }, + { + "epoch": 0.6407910197754944, + "grad_norm": 3.7198235988616943, + "learning_rate": 6.883614584375338e-06, + "loss": 0.65, + "step": 51262 + }, + { + "epoch": 0.64081602040051, + "grad_norm": 3.4835903644561768, + "learning_rate": 6.882785366648161e-06, + "loss": 1.3635, + "step": 51264 + }, + { + "epoch": 0.6408410210255256, + "grad_norm": 8.1937894821167, + "learning_rate": 6.881956172661248e-06, + "loss": 1.3522, + "step": 51266 + }, + { + "epoch": 0.6408660216505413, + "grad_norm": 3.5613577365875244, + "learning_rate": 6.88112700242092e-06, + "loss": 0.5693, + "step": 51268 + }, + { + "epoch": 0.6408910222755569, + "grad_norm": 2.38809871673584, + "learning_rate": 6.880297855933487e-06, + "loss": 1.1236, + "step": 51270 + }, + { + "epoch": 0.6409160229005725, + "grad_norm": 4.908738136291504, + "learning_rate": 6.879468733205261e-06, + "loss": 1.651, + "step": 51272 + }, + { + "epoch": 0.6409410235255881, + "grad_norm": 3.7684028148651123, + "learning_rate": 6.878639634242561e-06, + "loss": 0.3749, + "step": 51274 + }, + { + "epoch": 0.6409660241506038, + "grad_norm": 2.3564672470092773, + "learning_rate": 6.877810559051701e-06, + "loss": 0.1636, + "step": 51276 + }, + { + "epoch": 0.6409910247756194, + "grad_norm": 5.690709114074707, + "learning_rate": 6.8769815076389935e-06, + "loss": 2.333, + "step": 51278 + }, + { + "epoch": 0.6410160254006351, + "grad_norm": 1.7340342998504639, + "learning_rate": 6.876152480010754e-06, + "loss": 0.1461, + "step": 51280 + }, + { + "epoch": 0.6410410260256506, + "grad_norm": 6.470892429351807, + "learning_rate": 6.875323476173297e-06, + "loss": 1.9198, + "step": 51282 + }, + { + "epoch": 0.6410660266506663, + "grad_norm": 3.9327046871185303, + "learning_rate": 6.874494496132933e-06, + "loss": 1.3875, + "step": 51284 + }, + { + "epoch": 0.6410910272756819, + "grad_norm": 3.607924461364746, + "learning_rate": 6.873665539895977e-06, + "loss": 2.3037, + "step": 51286 + }, + { + "epoch": 0.6411160279006975, + "grad_norm": 2.414520025253296, + "learning_rate": 6.87283660746874e-06, + "loss": 0.1966, + "step": 51288 + }, + { + "epoch": 0.6411410285257132, + "grad_norm": 3.1578166484832764, + "learning_rate": 6.872007698857539e-06, + "loss": 1.2637, + "step": 51290 + }, + { + "epoch": 0.6411660291507287, + "grad_norm": 5.109746932983398, + "learning_rate": 6.871178814068685e-06, + "loss": 1.0901, + "step": 51292 + }, + { + "epoch": 0.6411910297757444, + "grad_norm": 3.7192630767822266, + "learning_rate": 6.870349953108488e-06, + "loss": 0.8941, + "step": 51294 + }, + { + "epoch": 0.64121603040076, + "grad_norm": 2.489255666732788, + "learning_rate": 6.869521115983271e-06, + "loss": 0.7016, + "step": 51296 + }, + { + "epoch": 0.6412410310257757, + "grad_norm": 2.6173017024993896, + "learning_rate": 6.8686923026993335e-06, + "loss": 0.6751, + "step": 51298 + }, + { + "epoch": 0.6412660316507913, + "grad_norm": 2.1618072986602783, + "learning_rate": 6.867863513262991e-06, + "loss": 1.3695, + "step": 51300 + }, + { + "epoch": 0.6412910322758069, + "grad_norm": 2.6932318210601807, + "learning_rate": 6.867034747680558e-06, + "loss": 0.6572, + "step": 51302 + }, + { + "epoch": 0.6413160329008225, + "grad_norm": 5.609762668609619, + "learning_rate": 6.866206005958347e-06, + "loss": 1.2844, + "step": 51304 + }, + { + "epoch": 0.6413410335258382, + "grad_norm": 5.88283634185791, + "learning_rate": 6.865377288102664e-06, + "loss": 1.115, + "step": 51306 + }, + { + "epoch": 0.6413660341508538, + "grad_norm": 0.40416955947875977, + "learning_rate": 6.8645485941198306e-06, + "loss": 0.0926, + "step": 51308 + }, + { + "epoch": 0.6413910347758695, + "grad_norm": 6.144495010375977, + "learning_rate": 6.863719924016148e-06, + "loss": 1.1142, + "step": 51310 + }, + { + "epoch": 0.641416035400885, + "grad_norm": 0.003543997183442116, + "learning_rate": 6.862891277797932e-06, + "loss": 0.5529, + "step": 51312 + }, + { + "epoch": 0.6414410360259006, + "grad_norm": 0.0030145312193781137, + "learning_rate": 6.862062655471491e-06, + "loss": 1.8041, + "step": 51314 + }, + { + "epoch": 0.6414660366509163, + "grad_norm": 2.658236026763916, + "learning_rate": 6.8612340570431375e-06, + "loss": 1.155, + "step": 51316 + }, + { + "epoch": 0.6414910372759319, + "grad_norm": 0.1359272599220276, + "learning_rate": 6.860405482519181e-06, + "loss": 1.016, + "step": 51318 + }, + { + "epoch": 0.6415160379009476, + "grad_norm": 0.9621536135673523, + "learning_rate": 6.859576931905931e-06, + "loss": 0.6501, + "step": 51320 + }, + { + "epoch": 0.6415410385259631, + "grad_norm": 1.4710078239440918, + "learning_rate": 6.858748405209705e-06, + "loss": 0.1375, + "step": 51322 + }, + { + "epoch": 0.6415660391509788, + "grad_norm": 2.463005304336548, + "learning_rate": 6.857919902436803e-06, + "loss": 2.2271, + "step": 51324 + }, + { + "epoch": 0.6415910397759944, + "grad_norm": 3.6457436084747314, + "learning_rate": 6.857091423593538e-06, + "loss": 0.485, + "step": 51326 + }, + { + "epoch": 0.6416160404010101, + "grad_norm": 3.6866822242736816, + "learning_rate": 6.856262968686221e-06, + "loss": 1.7718, + "step": 51328 + }, + { + "epoch": 0.6416410410260257, + "grad_norm": 4.201974391937256, + "learning_rate": 6.8554345377211595e-06, + "loss": 0.3363, + "step": 51330 + }, + { + "epoch": 0.6416660416510412, + "grad_norm": 2.3882153034210205, + "learning_rate": 6.854606130704664e-06, + "loss": 2.5071, + "step": 51332 + }, + { + "epoch": 0.6416910422760569, + "grad_norm": 0.014131471514701843, + "learning_rate": 6.853777747643048e-06, + "loss": 0.0467, + "step": 51334 + }, + { + "epoch": 0.6417160429010725, + "grad_norm": 2.948420286178589, + "learning_rate": 6.852949388542611e-06, + "loss": 1.2016, + "step": 51336 + }, + { + "epoch": 0.6417410435260882, + "grad_norm": 3.254486322402954, + "learning_rate": 6.852121053409668e-06, + "loss": 0.764, + "step": 51338 + }, + { + "epoch": 0.6417660441511038, + "grad_norm": 3.337550640106201, + "learning_rate": 6.851292742250526e-06, + "loss": 1.0834, + "step": 51340 + }, + { + "epoch": 0.6417910447761194, + "grad_norm": 4.294588088989258, + "learning_rate": 6.8504644550714915e-06, + "loss": 0.6738, + "step": 51342 + }, + { + "epoch": 0.641816045401135, + "grad_norm": 2.3495261669158936, + "learning_rate": 6.849636191878873e-06, + "loss": 0.5103, + "step": 51344 + }, + { + "epoch": 0.6418410460261507, + "grad_norm": 7.195976257324219, + "learning_rate": 6.848807952678982e-06, + "loss": 2.6439, + "step": 51346 + }, + { + "epoch": 0.6418660466511663, + "grad_norm": 1.7296111583709717, + "learning_rate": 6.847979737478129e-06, + "loss": 0.5434, + "step": 51348 + }, + { + "epoch": 0.641891047276182, + "grad_norm": 2.5376033782958984, + "learning_rate": 6.847151546282611e-06, + "loss": 1.5507, + "step": 51350 + }, + { + "epoch": 0.6419160479011975, + "grad_norm": 0.33894601464271545, + "learning_rate": 6.846323379098743e-06, + "loss": 0.125, + "step": 51352 + }, + { + "epoch": 0.6419410485262131, + "grad_norm": 8.43575668334961, + "learning_rate": 6.845495235932829e-06, + "loss": 3.1767, + "step": 51354 + }, + { + "epoch": 0.6419660491512288, + "grad_norm": 4.108334541320801, + "learning_rate": 6.844667116791174e-06, + "loss": 0.9805, + "step": 51356 + }, + { + "epoch": 0.6419910497762444, + "grad_norm": 1.4148380756378174, + "learning_rate": 6.843839021680091e-06, + "loss": 0.6511, + "step": 51358 + }, + { + "epoch": 0.6420160504012601, + "grad_norm": 3.1673972606658936, + "learning_rate": 6.843010950605888e-06, + "loss": 2.0291, + "step": 51360 + }, + { + "epoch": 0.6420410510262756, + "grad_norm": 2.432037591934204, + "learning_rate": 6.842182903574863e-06, + "loss": 1.3131, + "step": 51362 + }, + { + "epoch": 0.6420660516512913, + "grad_norm": 4.379501819610596, + "learning_rate": 6.841354880593326e-06, + "loss": 0.8423, + "step": 51364 + }, + { + "epoch": 0.6420910522763069, + "grad_norm": 2.8623974323272705, + "learning_rate": 6.840526881667584e-06, + "loss": 0.7437, + "step": 51366 + }, + { + "epoch": 0.6421160529013226, + "grad_norm": 2.6367547512054443, + "learning_rate": 6.83969890680394e-06, + "loss": 1.0823, + "step": 51368 + }, + { + "epoch": 0.6421410535263382, + "grad_norm": 1.3448028564453125, + "learning_rate": 6.838870956008705e-06, + "loss": 0.5511, + "step": 51370 + }, + { + "epoch": 0.6421660541513537, + "grad_norm": 0.003207189729437232, + "learning_rate": 6.8380430292881815e-06, + "loss": 0.0752, + "step": 51372 + }, + { + "epoch": 0.6421910547763694, + "grad_norm": 2.6065990924835205, + "learning_rate": 6.8372151266486786e-06, + "loss": 1.2038, + "step": 51374 + }, + { + "epoch": 0.642216055401385, + "grad_norm": 3.714409351348877, + "learning_rate": 6.836387248096495e-06, + "loss": 1.3182, + "step": 51376 + }, + { + "epoch": 0.6422410560264007, + "grad_norm": 0.47171589732170105, + "learning_rate": 6.8355593936379384e-06, + "loss": 0.8733, + "step": 51378 + }, + { + "epoch": 0.6422660566514163, + "grad_norm": 10.141419410705566, + "learning_rate": 6.834731563279311e-06, + "loss": 1.4828, + "step": 51380 + }, + { + "epoch": 0.6422910572764319, + "grad_norm": 5.3054680824279785, + "learning_rate": 6.833903757026923e-06, + "loss": 0.4803, + "step": 51382 + }, + { + "epoch": 0.6423160579014475, + "grad_norm": 5.742537021636963, + "learning_rate": 6.833075974887077e-06, + "loss": 2.0244, + "step": 51384 + }, + { + "epoch": 0.6423410585264632, + "grad_norm": 3.316401243209839, + "learning_rate": 6.83224821686608e-06, + "loss": 0.6176, + "step": 51386 + }, + { + "epoch": 0.6423660591514788, + "grad_norm": 4.289228439331055, + "learning_rate": 6.831420482970228e-06, + "loss": 1.3208, + "step": 51388 + }, + { + "epoch": 0.6423910597764945, + "grad_norm": 1.2364857196807861, + "learning_rate": 6.830592773205831e-06, + "loss": 1.2582, + "step": 51390 + }, + { + "epoch": 0.64241606040151, + "grad_norm": 4.136531829833984, + "learning_rate": 6.82976508757919e-06, + "loss": 1.1013, + "step": 51392 + }, + { + "epoch": 0.6424410610265257, + "grad_norm": 4.093976020812988, + "learning_rate": 6.828937426096608e-06, + "loss": 1.3227, + "step": 51394 + }, + { + "epoch": 0.6424660616515413, + "grad_norm": 3.8897876739501953, + "learning_rate": 6.8281097887643924e-06, + "loss": 1.1805, + "step": 51396 + }, + { + "epoch": 0.642491062276557, + "grad_norm": 5.064013957977295, + "learning_rate": 6.827282175588843e-06, + "loss": 1.4651, + "step": 51398 + }, + { + "epoch": 0.6425160629015726, + "grad_norm": 4.885082721710205, + "learning_rate": 6.8264545865762695e-06, + "loss": 1.0807, + "step": 51400 + }, + { + "epoch": 0.6425410635265881, + "grad_norm": 3.0134549140930176, + "learning_rate": 6.825627021732965e-06, + "loss": 0.7585, + "step": 51402 + }, + { + "epoch": 0.6425660641516038, + "grad_norm": 3.531081438064575, + "learning_rate": 6.8247994810652355e-06, + "loss": 1.4977, + "step": 51404 + }, + { + "epoch": 0.6425910647766194, + "grad_norm": 2.8094847202301025, + "learning_rate": 6.823971964579381e-06, + "loss": 0.2621, + "step": 51406 + }, + { + "epoch": 0.6426160654016351, + "grad_norm": 1.2842974662780762, + "learning_rate": 6.823144472281711e-06, + "loss": 0.0363, + "step": 51408 + }, + { + "epoch": 0.6426410660266507, + "grad_norm": 0.005794829688966274, + "learning_rate": 6.8223170041785225e-06, + "loss": 0.0253, + "step": 51410 + }, + { + "epoch": 0.6426660666516663, + "grad_norm": 3.3749563694000244, + "learning_rate": 6.821489560276122e-06, + "loss": 0.8133, + "step": 51412 + }, + { + "epoch": 0.6426910672766819, + "grad_norm": 4.778616428375244, + "learning_rate": 6.820662140580805e-06, + "loss": 0.463, + "step": 51414 + }, + { + "epoch": 0.6427160679016976, + "grad_norm": 2.698654890060425, + "learning_rate": 6.819834745098875e-06, + "loss": 0.4041, + "step": 51416 + }, + { + "epoch": 0.6427410685267132, + "grad_norm": 6.0104780197143555, + "learning_rate": 6.81900737383663e-06, + "loss": 1.626, + "step": 51418 + }, + { + "epoch": 0.6427660691517288, + "grad_norm": 2.0893750190734863, + "learning_rate": 6.818180026800379e-06, + "loss": 1.6853, + "step": 51420 + }, + { + "epoch": 0.6427910697767444, + "grad_norm": 1.8203445672988892, + "learning_rate": 6.8173527039964184e-06, + "loss": 1.3438, + "step": 51422 + }, + { + "epoch": 0.64281607040176, + "grad_norm": 3.905363082885742, + "learning_rate": 6.81652540543105e-06, + "loss": 1.5129, + "step": 51424 + }, + { + "epoch": 0.6428410710267757, + "grad_norm": 2.5990986824035645, + "learning_rate": 6.8156981311105775e-06, + "loss": 1.1463, + "step": 51426 + }, + { + "epoch": 0.6428660716517913, + "grad_norm": 3.501373052597046, + "learning_rate": 6.814870881041293e-06, + "loss": 1.0028, + "step": 51428 + }, + { + "epoch": 0.642891072276807, + "grad_norm": 0.0028726651798933744, + "learning_rate": 6.8140436552295e-06, + "loss": 0.6349, + "step": 51430 + }, + { + "epoch": 0.6429160729018225, + "grad_norm": 1.3133143186569214, + "learning_rate": 6.813216453681501e-06, + "loss": 0.0799, + "step": 51432 + }, + { + "epoch": 0.6429410735268382, + "grad_norm": 4.117714881896973, + "learning_rate": 6.812389276403597e-06, + "loss": 1.9266, + "step": 51434 + }, + { + "epoch": 0.6429660741518538, + "grad_norm": 2.975118637084961, + "learning_rate": 6.811562123402085e-06, + "loss": 0.534, + "step": 51436 + }, + { + "epoch": 0.6429910747768695, + "grad_norm": 5.370077133178711, + "learning_rate": 6.810734994683268e-06, + "loss": 1.249, + "step": 51438 + }, + { + "epoch": 0.6430160754018851, + "grad_norm": 0.0039929975755512714, + "learning_rate": 6.8099078902534384e-06, + "loss": 0.0355, + "step": 51440 + }, + { + "epoch": 0.6430410760269006, + "grad_norm": 2.004958152770996, + "learning_rate": 6.809080810118897e-06, + "loss": 1.1055, + "step": 51442 + }, + { + "epoch": 0.6430660766519163, + "grad_norm": 0.001605630968697369, + "learning_rate": 6.808253754285948e-06, + "loss": 0.5872, + "step": 51444 + }, + { + "epoch": 0.6430910772769319, + "grad_norm": 0.001864421647042036, + "learning_rate": 6.807426722760886e-06, + "loss": 0.8412, + "step": 51446 + }, + { + "epoch": 0.6431160779019476, + "grad_norm": 3.084596633911133, + "learning_rate": 6.806599715550011e-06, + "loss": 0.8553, + "step": 51448 + }, + { + "epoch": 0.6431410785269632, + "grad_norm": 2.680272102355957, + "learning_rate": 6.805772732659621e-06, + "loss": 0.7871, + "step": 51450 + }, + { + "epoch": 0.6431660791519788, + "grad_norm": 0.7801234722137451, + "learning_rate": 6.804945774096019e-06, + "loss": 0.3712, + "step": 51452 + }, + { + "epoch": 0.6431910797769944, + "grad_norm": 3.244994878768921, + "learning_rate": 6.804118839865493e-06, + "loss": 0.2616, + "step": 51454 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 3.609534740447998, + "learning_rate": 6.803291929974345e-06, + "loss": 1.3537, + "step": 51456 + }, + { + "epoch": 0.6432410810270257, + "grad_norm": 1.3032089471817017, + "learning_rate": 6.802465044428874e-06, + "loss": 0.8417, + "step": 51458 + }, + { + "epoch": 0.6432660816520414, + "grad_norm": 0.0016671917401254177, + "learning_rate": 6.801638183235378e-06, + "loss": 0.6378, + "step": 51460 + }, + { + "epoch": 0.6432910822770569, + "grad_norm": 0.004145980346947908, + "learning_rate": 6.800811346400153e-06, + "loss": 0.6395, + "step": 51462 + }, + { + "epoch": 0.6433160829020725, + "grad_norm": 3.9184823036193848, + "learning_rate": 6.7999845339295024e-06, + "loss": 1.0781, + "step": 51464 + }, + { + "epoch": 0.6433410835270882, + "grad_norm": 3.1242570877075195, + "learning_rate": 6.79915774582971e-06, + "loss": 1.804, + "step": 51466 + }, + { + "epoch": 0.6433660841521038, + "grad_norm": 0.0009916683193296194, + "learning_rate": 6.79833098210708e-06, + "loss": 0.0417, + "step": 51468 + }, + { + "epoch": 0.6433910847771195, + "grad_norm": 3.9136765003204346, + "learning_rate": 6.7975042427679096e-06, + "loss": 1.3529, + "step": 51470 + }, + { + "epoch": 0.643416085402135, + "grad_norm": 2.177034854888916, + "learning_rate": 6.796677527818494e-06, + "loss": 0.5089, + "step": 51472 + }, + { + "epoch": 0.6434410860271507, + "grad_norm": 5.381553649902344, + "learning_rate": 6.795850837265128e-06, + "loss": 0.8708, + "step": 51474 + }, + { + "epoch": 0.6434660866521663, + "grad_norm": 0.0023484365083277225, + "learning_rate": 6.7950241711141104e-06, + "loss": 1.2372, + "step": 51476 + }, + { + "epoch": 0.643491087277182, + "grad_norm": 3.7261769771575928, + "learning_rate": 6.79419752937174e-06, + "loss": 0.6083, + "step": 51478 + }, + { + "epoch": 0.6435160879021976, + "grad_norm": 2.5788745880126953, + "learning_rate": 6.7933709120443e-06, + "loss": 1.5344, + "step": 51480 + }, + { + "epoch": 0.6435410885272131, + "grad_norm": 7.906331539154053, + "learning_rate": 6.792544319138097e-06, + "loss": 1.6476, + "step": 51482 + }, + { + "epoch": 0.6435660891522288, + "grad_norm": 3.217867136001587, + "learning_rate": 6.791717750659423e-06, + "loss": 0.7745, + "step": 51484 + }, + { + "epoch": 0.6435910897772444, + "grad_norm": 0.6599259972572327, + "learning_rate": 6.790891206614572e-06, + "loss": 0.0157, + "step": 51486 + }, + { + "epoch": 0.6436160904022601, + "grad_norm": 2.976945400238037, + "learning_rate": 6.790064687009841e-06, + "loss": 0.5869, + "step": 51488 + }, + { + "epoch": 0.6436410910272757, + "grad_norm": 6.828146934509277, + "learning_rate": 6.789238191851526e-06, + "loss": 1.5269, + "step": 51490 + }, + { + "epoch": 0.6436660916522913, + "grad_norm": 3.012847661972046, + "learning_rate": 6.788411721145913e-06, + "loss": 1.0135, + "step": 51492 + }, + { + "epoch": 0.6436910922773069, + "grad_norm": 0.0024256864562630653, + "learning_rate": 6.787585274899306e-06, + "loss": 0.0006, + "step": 51494 + }, + { + "epoch": 0.6437160929023226, + "grad_norm": 2.944791078567505, + "learning_rate": 6.786758853117994e-06, + "loss": 0.6625, + "step": 51496 + }, + { + "epoch": 0.6437410935273382, + "grad_norm": 2.970031499862671, + "learning_rate": 6.785932455808272e-06, + "loss": 1.8218, + "step": 51498 + }, + { + "epoch": 0.6437660941523539, + "grad_norm": 2.452031373977661, + "learning_rate": 6.785106082976434e-06, + "loss": 0.8966, + "step": 51500 + }, + { + "epoch": 0.6437910947773694, + "grad_norm": 3.5460081100463867, + "learning_rate": 6.784279734628774e-06, + "loss": 0.2433, + "step": 51502 + }, + { + "epoch": 0.643816095402385, + "grad_norm": 2.8155694007873535, + "learning_rate": 6.783453410771586e-06, + "loss": 0.9553, + "step": 51504 + }, + { + "epoch": 0.6438410960274007, + "grad_norm": 0.0008241980685852468, + "learning_rate": 6.78262711141116e-06, + "loss": 0.0827, + "step": 51506 + }, + { + "epoch": 0.6438660966524163, + "grad_norm": 1.9183069467544556, + "learning_rate": 6.781800836553791e-06, + "loss": 0.8674, + "step": 51508 + }, + { + "epoch": 0.643891097277432, + "grad_norm": 3.5696964263916016, + "learning_rate": 6.780974586205773e-06, + "loss": 1.7028, + "step": 51510 + }, + { + "epoch": 0.6439160979024475, + "grad_norm": 2.4101722240448, + "learning_rate": 6.780148360373396e-06, + "loss": 0.5344, + "step": 51512 + }, + { + "epoch": 0.6439410985274632, + "grad_norm": 4.028435230255127, + "learning_rate": 6.779322159062953e-06, + "loss": 1.199, + "step": 51514 + }, + { + "epoch": 0.6439660991524788, + "grad_norm": 2.7544336318969727, + "learning_rate": 6.778495982280742e-06, + "loss": 1.6821, + "step": 51516 + }, + { + "epoch": 0.6439910997774945, + "grad_norm": 3.0691332817077637, + "learning_rate": 6.777669830033045e-06, + "loss": 0.2337, + "step": 51518 + }, + { + "epoch": 0.6440161004025101, + "grad_norm": 2.76513934135437, + "learning_rate": 6.77684370232616e-06, + "loss": 1.1897, + "step": 51520 + }, + { + "epoch": 0.6440411010275257, + "grad_norm": 2.72330641746521, + "learning_rate": 6.776017599166378e-06, + "loss": 0.7917, + "step": 51522 + }, + { + "epoch": 0.6440661016525413, + "grad_norm": 1.039226770401001, + "learning_rate": 6.775191520559989e-06, + "loss": 0.0868, + "step": 51524 + }, + { + "epoch": 0.644091102277557, + "grad_norm": 1.4936025142669678, + "learning_rate": 6.7743654665132865e-06, + "loss": 0.9009, + "step": 51526 + }, + { + "epoch": 0.6441161029025726, + "grad_norm": 3.5050652027130127, + "learning_rate": 6.773539437032559e-06, + "loss": 2.7431, + "step": 51528 + }, + { + "epoch": 0.6441411035275882, + "grad_norm": 3.627789258956909, + "learning_rate": 6.772713432124103e-06, + "loss": 0.6684, + "step": 51530 + }, + { + "epoch": 0.6441661041526038, + "grad_norm": 3.9182040691375732, + "learning_rate": 6.771887451794202e-06, + "loss": 0.7204, + "step": 51532 + }, + { + "epoch": 0.6441911047776194, + "grad_norm": 2.0649116039276123, + "learning_rate": 6.7710614960491494e-06, + "loss": 1.1407, + "step": 51534 + }, + { + "epoch": 0.6442161054026351, + "grad_norm": 4.207291126251221, + "learning_rate": 6.770235564895236e-06, + "loss": 1.0106, + "step": 51536 + }, + { + "epoch": 0.6442411060276507, + "grad_norm": 0.00036203418858349323, + "learning_rate": 6.769409658338752e-06, + "loss": 0.7937, + "step": 51538 + }, + { + "epoch": 0.6442661066526664, + "grad_norm": 2.5336081981658936, + "learning_rate": 6.7685837763859865e-06, + "loss": 1.1709, + "step": 51540 + }, + { + "epoch": 0.6442911072776819, + "grad_norm": 1.6508127450942993, + "learning_rate": 6.767757919043232e-06, + "loss": 0.1091, + "step": 51542 + }, + { + "epoch": 0.6443161079026976, + "grad_norm": 3.8284265995025635, + "learning_rate": 6.766932086316775e-06, + "loss": 0.9087, + "step": 51544 + }, + { + "epoch": 0.6443411085277132, + "grad_norm": 3.782209634780884, + "learning_rate": 6.766106278212907e-06, + "loss": 0.8748, + "step": 51546 + }, + { + "epoch": 0.6443661091527288, + "grad_norm": 1.4684127569198608, + "learning_rate": 6.765280494737915e-06, + "loss": 1.1415, + "step": 51548 + }, + { + "epoch": 0.6443911097777445, + "grad_norm": 0.006324193440377712, + "learning_rate": 6.7644547358980896e-06, + "loss": 0.1564, + "step": 51550 + }, + { + "epoch": 0.64441611040276, + "grad_norm": 5.976436614990234, + "learning_rate": 6.763629001699718e-06, + "loss": 1.5337, + "step": 51552 + }, + { + "epoch": 0.6444411110277757, + "grad_norm": 0.6676735281944275, + "learning_rate": 6.762803292149092e-06, + "loss": 0.5431, + "step": 51554 + }, + { + "epoch": 0.6444661116527913, + "grad_norm": 5.8772873878479, + "learning_rate": 6.761977607252499e-06, + "loss": 0.6386, + "step": 51556 + }, + { + "epoch": 0.644491112277807, + "grad_norm": 2.838852643966675, + "learning_rate": 6.761151947016226e-06, + "loss": 0.3638, + "step": 51558 + }, + { + "epoch": 0.6445161129028226, + "grad_norm": 1.2340118885040283, + "learning_rate": 6.760326311446561e-06, + "loss": 0.0832, + "step": 51560 + }, + { + "epoch": 0.6445411135278382, + "grad_norm": 3.3646485805511475, + "learning_rate": 6.759500700549793e-06, + "loss": 1.1429, + "step": 51562 + }, + { + "epoch": 0.6445661141528538, + "grad_norm": 2.821284055709839, + "learning_rate": 6.758675114332209e-06, + "loss": 1.9821, + "step": 51564 + }, + { + "epoch": 0.6445911147778695, + "grad_norm": 3.7800843715667725, + "learning_rate": 6.757849552800097e-06, + "loss": 1.1064, + "step": 51566 + }, + { + "epoch": 0.6446161154028851, + "grad_norm": 3.27125883102417, + "learning_rate": 6.7570240159597445e-06, + "loss": 0.7002, + "step": 51568 + }, + { + "epoch": 0.6446411160279008, + "grad_norm": 0.0011648207437247038, + "learning_rate": 6.75619850381744e-06, + "loss": 0.0081, + "step": 51570 + }, + { + "epoch": 0.6446661166529163, + "grad_norm": 19.644346237182617, + "learning_rate": 6.7553730163794675e-06, + "loss": 1.8327, + "step": 51572 + }, + { + "epoch": 0.6446911172779319, + "grad_norm": 3.5786142349243164, + "learning_rate": 6.754547553652116e-06, + "loss": 0.7261, + "step": 51574 + }, + { + "epoch": 0.6447161179029476, + "grad_norm": 4.403493881225586, + "learning_rate": 6.75372211564167e-06, + "loss": 0.1699, + "step": 51576 + }, + { + "epoch": 0.6447411185279632, + "grad_norm": 9.9884033203125, + "learning_rate": 6.752896702354419e-06, + "loss": 1.2346, + "step": 51578 + }, + { + "epoch": 0.6447661191529789, + "grad_norm": 4.031597137451172, + "learning_rate": 6.752071313796647e-06, + "loss": 0.1626, + "step": 51580 + }, + { + "epoch": 0.6447911197779944, + "grad_norm": 5.162214756011963, + "learning_rate": 6.7512459499746405e-06, + "loss": 1.8621, + "step": 51582 + }, + { + "epoch": 0.6448161204030101, + "grad_norm": 3.8695433139801025, + "learning_rate": 6.750420610894686e-06, + "loss": 1.4837, + "step": 51584 + }, + { + "epoch": 0.6448411210280257, + "grad_norm": 0.3152795135974884, + "learning_rate": 6.749595296563067e-06, + "loss": 1.2448, + "step": 51586 + }, + { + "epoch": 0.6448661216530414, + "grad_norm": 3.4124884605407715, + "learning_rate": 6.74877000698607e-06, + "loss": 0.9753, + "step": 51588 + }, + { + "epoch": 0.644891122278057, + "grad_norm": 7.323472023010254, + "learning_rate": 6.747944742169983e-06, + "loss": 1.9404, + "step": 51590 + }, + { + "epoch": 0.6449161229030725, + "grad_norm": 3.9064626693725586, + "learning_rate": 6.7471195021210876e-06, + "loss": 1.5337, + "step": 51592 + }, + { + "epoch": 0.6449411235280882, + "grad_norm": 0.8524771332740784, + "learning_rate": 6.746294286845671e-06, + "loss": 0.8463, + "step": 51594 + }, + { + "epoch": 0.6449661241531038, + "grad_norm": 2.683131694793701, + "learning_rate": 6.745469096350018e-06, + "loss": 0.3334, + "step": 51596 + }, + { + "epoch": 0.6449911247781195, + "grad_norm": 4.072835445404053, + "learning_rate": 6.744643930640411e-06, + "loss": 1.3918, + "step": 51598 + }, + { + "epoch": 0.6450161254031351, + "grad_norm": 4.207740306854248, + "learning_rate": 6.743818789723136e-06, + "loss": 1.2038, + "step": 51600 + }, + { + "epoch": 0.6450411260281507, + "grad_norm": 1.627363920211792, + "learning_rate": 6.742993673604474e-06, + "loss": 0.489, + "step": 51602 + }, + { + "epoch": 0.6450661266531663, + "grad_norm": 0.0009937844006344676, + "learning_rate": 6.742168582290714e-06, + "loss": 0.3461, + "step": 51604 + }, + { + "epoch": 0.645091127278182, + "grad_norm": 12.878474235534668, + "learning_rate": 6.741343515788137e-06, + "loss": 1.4658, + "step": 51606 + }, + { + "epoch": 0.6451161279031976, + "grad_norm": 3.461993455886841, + "learning_rate": 6.740518474103028e-06, + "loss": 0.5917, + "step": 51608 + }, + { + "epoch": 0.6451411285282133, + "grad_norm": 5.647122383117676, + "learning_rate": 6.739693457241667e-06, + "loss": 0.9228, + "step": 51610 + }, + { + "epoch": 0.6451661291532288, + "grad_norm": 1.1141811609268188, + "learning_rate": 6.738868465210342e-06, + "loss": 1.1711, + "step": 51612 + }, + { + "epoch": 0.6451911297782444, + "grad_norm": 3.9459009170532227, + "learning_rate": 6.738043498015332e-06, + "loss": 2.0637, + "step": 51614 + }, + { + "epoch": 0.6452161304032601, + "grad_norm": 3.973418712615967, + "learning_rate": 6.737218555662921e-06, + "loss": 1.9046, + "step": 51616 + }, + { + "epoch": 0.6452411310282757, + "grad_norm": 3.823312759399414, + "learning_rate": 6.736393638159393e-06, + "loss": 0.7218, + "step": 51618 + }, + { + "epoch": 0.6452661316532914, + "grad_norm": 4.3739094734191895, + "learning_rate": 6.735568745511029e-06, + "loss": 1.7264, + "step": 51620 + }, + { + "epoch": 0.6452911322783069, + "grad_norm": 3.205704689025879, + "learning_rate": 6.734743877724115e-06, + "loss": 1.0828, + "step": 51622 + }, + { + "epoch": 0.6453161329033226, + "grad_norm": 0.001538856653496623, + "learning_rate": 6.7339190348049255e-06, + "loss": 0.0001, + "step": 51624 + }, + { + "epoch": 0.6453411335283382, + "grad_norm": 2.879821300506592, + "learning_rate": 6.733094216759748e-06, + "loss": 1.2555, + "step": 51626 + }, + { + "epoch": 0.6453661341533539, + "grad_norm": 1.95868718624115, + "learning_rate": 6.732269423594864e-06, + "loss": 0.6553, + "step": 51628 + }, + { + "epoch": 0.6453911347783695, + "grad_norm": 3.3177402019500732, + "learning_rate": 6.731444655316553e-06, + "loss": 1.1633, + "step": 51630 + }, + { + "epoch": 0.645416135403385, + "grad_norm": 0.11032602936029434, + "learning_rate": 6.730619911931098e-06, + "loss": 0.8871, + "step": 51632 + }, + { + "epoch": 0.6454411360284007, + "grad_norm": 3.0578548908233643, + "learning_rate": 6.7297951934447806e-06, + "loss": 0.6538, + "step": 51634 + }, + { + "epoch": 0.6454661366534163, + "grad_norm": 5.254130840301514, + "learning_rate": 6.728970499863878e-06, + "loss": 1.0751, + "step": 51636 + }, + { + "epoch": 0.645491137278432, + "grad_norm": 4.388941764831543, + "learning_rate": 6.728145831194674e-06, + "loss": 1.2578, + "step": 51638 + }, + { + "epoch": 0.6455161379034476, + "grad_norm": 3.8534913063049316, + "learning_rate": 6.7273211874434495e-06, + "loss": 1.4278, + "step": 51640 + }, + { + "epoch": 0.6455411385284632, + "grad_norm": 3.5176544189453125, + "learning_rate": 6.726496568616483e-06, + "loss": 1.0552, + "step": 51642 + }, + { + "epoch": 0.6455661391534788, + "grad_norm": 2.72898268699646, + "learning_rate": 6.725671974720056e-06, + "loss": 0.5197, + "step": 51644 + }, + { + "epoch": 0.6455911397784945, + "grad_norm": 1.9807966947555542, + "learning_rate": 6.724847405760449e-06, + "loss": 0.8871, + "step": 51646 + }, + { + "epoch": 0.6456161404035101, + "grad_norm": 2.0314881801605225, + "learning_rate": 6.7240228617439416e-06, + "loss": 0.3405, + "step": 51648 + }, + { + "epoch": 0.6456411410285258, + "grad_norm": 1.9583288431167603, + "learning_rate": 6.723198342676814e-06, + "loss": 1.431, + "step": 51650 + }, + { + "epoch": 0.6456661416535413, + "grad_norm": 4.710115909576416, + "learning_rate": 6.72237384856534e-06, + "loss": 1.4791, + "step": 51652 + }, + { + "epoch": 0.645691142278557, + "grad_norm": 1.258709192276001, + "learning_rate": 6.721549379415806e-06, + "loss": 1.1405, + "step": 51654 + }, + { + "epoch": 0.6457161429035726, + "grad_norm": 2.022956609725952, + "learning_rate": 6.720724935234487e-06, + "loss": 0.0808, + "step": 51656 + }, + { + "epoch": 0.6457411435285882, + "grad_norm": 3.5794272422790527, + "learning_rate": 6.7199005160276645e-06, + "loss": 0.7543, + "step": 51658 + }, + { + "epoch": 0.6457661441536039, + "grad_norm": 0.002400427358224988, + "learning_rate": 6.7190761218016174e-06, + "loss": 0.4035, + "step": 51660 + }, + { + "epoch": 0.6457911447786194, + "grad_norm": 4.810072422027588, + "learning_rate": 6.7182517525626205e-06, + "loss": 1.0026, + "step": 51662 + }, + { + "epoch": 0.6458161454036351, + "grad_norm": 1.4700825214385986, + "learning_rate": 6.7174274083169535e-06, + "loss": 0.8341, + "step": 51664 + }, + { + "epoch": 0.6458411460286507, + "grad_norm": 3.903419017791748, + "learning_rate": 6.716603089070896e-06, + "loss": 2.405, + "step": 51666 + }, + { + "epoch": 0.6458661466536664, + "grad_norm": 0.0014540872070938349, + "learning_rate": 6.715778794830724e-06, + "loss": 0.0, + "step": 51668 + }, + { + "epoch": 0.645891147278682, + "grad_norm": 5.688703536987305, + "learning_rate": 6.7149545256027175e-06, + "loss": 1.2308, + "step": 51670 + }, + { + "epoch": 0.6459161479036976, + "grad_norm": 1.209247350692749, + "learning_rate": 6.714130281393152e-06, + "loss": 0.0578, + "step": 51672 + }, + { + "epoch": 0.6459411485287132, + "grad_norm": 2.6986141204833984, + "learning_rate": 6.713306062208308e-06, + "loss": 0.6685, + "step": 51674 + }, + { + "epoch": 0.6459661491537289, + "grad_norm": 3.480128049850464, + "learning_rate": 6.712481868054458e-06, + "loss": 1.2033, + "step": 51676 + }, + { + "epoch": 0.6459911497787445, + "grad_norm": 0.5214464664459229, + "learning_rate": 6.711657698937882e-06, + "loss": 0.376, + "step": 51678 + }, + { + "epoch": 0.6460161504037601, + "grad_norm": 0.0016272346256300807, + "learning_rate": 6.710833554864855e-06, + "loss": 0.9646, + "step": 51680 + }, + { + "epoch": 0.6460411510287757, + "grad_norm": 0.5613439679145813, + "learning_rate": 6.710009435841655e-06, + "loss": 0.2403, + "step": 51682 + }, + { + "epoch": 0.6460661516537913, + "grad_norm": 3.57098388671875, + "learning_rate": 6.7091853418745574e-06, + "loss": 0.8721, + "step": 51684 + }, + { + "epoch": 0.646091152278807, + "grad_norm": 4.853269577026367, + "learning_rate": 6.708361272969841e-06, + "loss": 1.7453, + "step": 51686 + }, + { + "epoch": 0.6461161529038226, + "grad_norm": 2.279802083969116, + "learning_rate": 6.707537229133778e-06, + "loss": 1.5895, + "step": 51688 + }, + { + "epoch": 0.6461411535288383, + "grad_norm": 3.2822539806365967, + "learning_rate": 6.706713210372646e-06, + "loss": 0.6246, + "step": 51690 + }, + { + "epoch": 0.6461661541538538, + "grad_norm": 6.530099868774414, + "learning_rate": 6.70588921669272e-06, + "loss": 1.6314, + "step": 51692 + }, + { + "epoch": 0.6461911547788695, + "grad_norm": 0.13150542974472046, + "learning_rate": 6.705065248100276e-06, + "loss": 0.6779, + "step": 51694 + }, + { + "epoch": 0.6462161554038851, + "grad_norm": 0.002657263772562146, + "learning_rate": 6.704241304601588e-06, + "loss": 0.0003, + "step": 51696 + }, + { + "epoch": 0.6462411560289008, + "grad_norm": 3.3361825942993164, + "learning_rate": 6.703417386202932e-06, + "loss": 0.4273, + "step": 51698 + }, + { + "epoch": 0.6462661566539164, + "grad_norm": 3.6818811893463135, + "learning_rate": 6.702593492910588e-06, + "loss": 0.8988, + "step": 51700 + }, + { + "epoch": 0.6462911572789319, + "grad_norm": 3.7075631618499756, + "learning_rate": 6.7017696247308224e-06, + "loss": 0.7488, + "step": 51702 + }, + { + "epoch": 0.6463161579039476, + "grad_norm": 1.8676650524139404, + "learning_rate": 6.700945781669912e-06, + "loss": 0.5268, + "step": 51704 + }, + { + "epoch": 0.6463411585289632, + "grad_norm": 5.0357279777526855, + "learning_rate": 6.700121963734133e-06, + "loss": 1.8545, + "step": 51706 + }, + { + "epoch": 0.6463661591539789, + "grad_norm": 3.3066775798797607, + "learning_rate": 6.699298170929758e-06, + "loss": 1.2808, + "step": 51708 + }, + { + "epoch": 0.6463911597789945, + "grad_norm": 3.7484962940216064, + "learning_rate": 6.698474403263062e-06, + "loss": 1.0767, + "step": 51710 + }, + { + "epoch": 0.6464161604040101, + "grad_norm": 1.5551366806030273, + "learning_rate": 6.697650660740319e-06, + "loss": 1.1217, + "step": 51712 + }, + { + "epoch": 0.6464411610290257, + "grad_norm": 0.000882496649865061, + "learning_rate": 6.6968269433678e-06, + "loss": 0.9167, + "step": 51714 + }, + { + "epoch": 0.6464661616540414, + "grad_norm": 1.7547893524169922, + "learning_rate": 6.696003251151781e-06, + "loss": 0.7583, + "step": 51716 + }, + { + "epoch": 0.646491162279057, + "grad_norm": 2.3326547145843506, + "learning_rate": 6.695179584098532e-06, + "loss": 0.4501, + "step": 51718 + }, + { + "epoch": 0.6465161629040727, + "grad_norm": 2.3183608055114746, + "learning_rate": 6.694355942214329e-06, + "loss": 0.4034, + "step": 51720 + }, + { + "epoch": 0.6465411635290882, + "grad_norm": 3.833014965057373, + "learning_rate": 6.693532325505443e-06, + "loss": 0.3105, + "step": 51722 + }, + { + "epoch": 0.6465661641541038, + "grad_norm": 5.318154811859131, + "learning_rate": 6.692708733978146e-06, + "loss": 0.8456, + "step": 51724 + }, + { + "epoch": 0.6465911647791195, + "grad_norm": 0.008805293589830399, + "learning_rate": 6.691885167638717e-06, + "loss": 0.3006, + "step": 51726 + }, + { + "epoch": 0.6466161654041351, + "grad_norm": 3.033237934112549, + "learning_rate": 6.69106162649342e-06, + "loss": 1.2234, + "step": 51728 + }, + { + "epoch": 0.6466411660291508, + "grad_norm": 0.0014712755801156163, + "learning_rate": 6.690238110548528e-06, + "loss": 0.0, + "step": 51730 + }, + { + "epoch": 0.6466661666541663, + "grad_norm": 4.073507308959961, + "learning_rate": 6.689414619810316e-06, + "loss": 0.5085, + "step": 51732 + }, + { + "epoch": 0.646691167279182, + "grad_norm": 2.4235851764678955, + "learning_rate": 6.688591154285053e-06, + "loss": 1.1329, + "step": 51734 + }, + { + "epoch": 0.6467161679041976, + "grad_norm": 2.9988393783569336, + "learning_rate": 6.68776771397901e-06, + "loss": 1.2693, + "step": 51736 + }, + { + "epoch": 0.6467411685292133, + "grad_norm": 2.3467984199523926, + "learning_rate": 6.6869442988984655e-06, + "loss": 0.1217, + "step": 51738 + }, + { + "epoch": 0.6467661691542289, + "grad_norm": 1.5128865242004395, + "learning_rate": 6.686120909049682e-06, + "loss": 0.1393, + "step": 51740 + }, + { + "epoch": 0.6467911697792444, + "grad_norm": 2.939181089401245, + "learning_rate": 6.685297544438931e-06, + "loss": 1.2324, + "step": 51742 + }, + { + "epoch": 0.6468161704042601, + "grad_norm": 0.3892221450805664, + "learning_rate": 6.684474205072486e-06, + "loss": 0.5506, + "step": 51744 + }, + { + "epoch": 0.6468411710292757, + "grad_norm": 0.0012121198233217, + "learning_rate": 6.6836508909566166e-06, + "loss": 0.4082, + "step": 51746 + }, + { + "epoch": 0.6468661716542914, + "grad_norm": 0.5662848353385925, + "learning_rate": 6.6828276020975915e-06, + "loss": 0.6857, + "step": 51748 + }, + { + "epoch": 0.646891172279307, + "grad_norm": 2.745218276977539, + "learning_rate": 6.682004338501684e-06, + "loss": 1.0599, + "step": 51750 + }, + { + "epoch": 0.6469161729043226, + "grad_norm": 0.9936032295227051, + "learning_rate": 6.6811811001751665e-06, + "loss": 0.2451, + "step": 51752 + }, + { + "epoch": 0.6469411735293382, + "grad_norm": 3.850646734237671, + "learning_rate": 6.6803578871243e-06, + "loss": 1.1387, + "step": 51754 + }, + { + "epoch": 0.6469661741543539, + "grad_norm": 0.0008154172101058066, + "learning_rate": 6.679534699355359e-06, + "loss": 0.1745, + "step": 51756 + }, + { + "epoch": 0.6469911747793695, + "grad_norm": 0.007170539814978838, + "learning_rate": 6.678711536874611e-06, + "loss": 0.8621, + "step": 51758 + }, + { + "epoch": 0.6470161754043852, + "grad_norm": 7.233342170715332, + "learning_rate": 6.677888399688325e-06, + "loss": 1.135, + "step": 51760 + }, + { + "epoch": 0.6470411760294007, + "grad_norm": 4.336091995239258, + "learning_rate": 6.677065287802773e-06, + "loss": 1.1861, + "step": 51762 + }, + { + "epoch": 0.6470661766544163, + "grad_norm": 2.6100850105285645, + "learning_rate": 6.6762422012242255e-06, + "loss": 0.158, + "step": 51764 + }, + { + "epoch": 0.647091177279432, + "grad_norm": 0.00048391977907158434, + "learning_rate": 6.675419139958944e-06, + "loss": 1.0227, + "step": 51766 + }, + { + "epoch": 0.6471161779044476, + "grad_norm": 2.316272735595703, + "learning_rate": 6.6745961040132e-06, + "loss": 1.0846, + "step": 51768 + }, + { + "epoch": 0.6471411785294633, + "grad_norm": 0.0017165105091407895, + "learning_rate": 6.673773093393261e-06, + "loss": 0.8956, + "step": 51770 + }, + { + "epoch": 0.6471661791544788, + "grad_norm": 2.589637041091919, + "learning_rate": 6.672950108105395e-06, + "loss": 0.3441, + "step": 51772 + }, + { + "epoch": 0.6471911797794945, + "grad_norm": 3.097646951675415, + "learning_rate": 6.672127148155872e-06, + "loss": 0.6552, + "step": 51774 + }, + { + "epoch": 0.6472161804045101, + "grad_norm": 0.0038170020561665297, + "learning_rate": 6.6713042135509576e-06, + "loss": 0.2857, + "step": 51776 + }, + { + "epoch": 0.6472411810295258, + "grad_norm": 0.000704133533872664, + "learning_rate": 6.6704813042969256e-06, + "loss": 0.9606, + "step": 51778 + }, + { + "epoch": 0.6472661816545414, + "grad_norm": 4.948178768157959, + "learning_rate": 6.669658420400031e-06, + "loss": 1.3016, + "step": 51780 + }, + { + "epoch": 0.647291182279557, + "grad_norm": 3.9663450717926025, + "learning_rate": 6.668835561866547e-06, + "loss": 0.4779, + "step": 51782 + }, + { + "epoch": 0.6473161829045726, + "grad_norm": 3.5453007221221924, + "learning_rate": 6.668012728702742e-06, + "loss": 1.1779, + "step": 51784 + }, + { + "epoch": 0.6473411835295882, + "grad_norm": 1.3042887449264526, + "learning_rate": 6.667189920914876e-06, + "loss": 0.6207, + "step": 51786 + }, + { + "epoch": 0.6473661841546039, + "grad_norm": 2.9208896160125732, + "learning_rate": 6.666367138509225e-06, + "loss": 0.8859, + "step": 51788 + }, + { + "epoch": 0.6473911847796195, + "grad_norm": 3.2079780101776123, + "learning_rate": 6.665544381492053e-06, + "loss": 1.2684, + "step": 51790 + }, + { + "epoch": 0.6474161854046351, + "grad_norm": 4.423947334289551, + "learning_rate": 6.66472164986962e-06, + "loss": 1.3197, + "step": 51792 + }, + { + "epoch": 0.6474411860296507, + "grad_norm": 2.4882235527038574, + "learning_rate": 6.663898943648195e-06, + "loss": 0.7861, + "step": 51794 + }, + { + "epoch": 0.6474661866546664, + "grad_norm": 5.427084922790527, + "learning_rate": 6.663076262834044e-06, + "loss": 1.0995, + "step": 51796 + }, + { + "epoch": 0.647491187279682, + "grad_norm": 2.7420599460601807, + "learning_rate": 6.662253607433431e-06, + "loss": 1.0441, + "step": 51798 + }, + { + "epoch": 0.6475161879046977, + "grad_norm": 1.823817491531372, + "learning_rate": 6.661430977452623e-06, + "loss": 0.3911, + "step": 51800 + }, + { + "epoch": 0.6475411885297132, + "grad_norm": 0.401111900806427, + "learning_rate": 6.660608372897885e-06, + "loss": 0.2129, + "step": 51802 + }, + { + "epoch": 0.6475661891547289, + "grad_norm": 3.1475751399993896, + "learning_rate": 6.659785793775486e-06, + "loss": 1.3606, + "step": 51804 + }, + { + "epoch": 0.6475911897797445, + "grad_norm": 3.788851022720337, + "learning_rate": 6.658963240091683e-06, + "loss": 0.626, + "step": 51806 + }, + { + "epoch": 0.6476161904047602, + "grad_norm": 4.206897258758545, + "learning_rate": 6.658140711852741e-06, + "loss": 1.593, + "step": 51808 + }, + { + "epoch": 0.6476411910297758, + "grad_norm": 2.8428189754486084, + "learning_rate": 6.657318209064927e-06, + "loss": 0.8187, + "step": 51810 + }, + { + "epoch": 0.6476661916547913, + "grad_norm": 0.002281660446897149, + "learning_rate": 6.656495731734505e-06, + "loss": 0.7073, + "step": 51812 + }, + { + "epoch": 0.647691192279807, + "grad_norm": 6.459979057312012, + "learning_rate": 6.65567327986774e-06, + "loss": 0.7563, + "step": 51814 + }, + { + "epoch": 0.6477161929048226, + "grad_norm": 6.181162357330322, + "learning_rate": 6.6548508534708935e-06, + "loss": 2.3377, + "step": 51816 + }, + { + "epoch": 0.6477411935298383, + "grad_norm": 2.8656649589538574, + "learning_rate": 6.654028452550236e-06, + "loss": 0.4581, + "step": 51818 + }, + { + "epoch": 0.6477661941548539, + "grad_norm": 3.302476406097412, + "learning_rate": 6.653206077112018e-06, + "loss": 0.854, + "step": 51820 + }, + { + "epoch": 0.6477911947798695, + "grad_norm": 0.0018406397430226207, + "learning_rate": 6.652383727162508e-06, + "loss": 0.1137, + "step": 51822 + }, + { + "epoch": 0.6478161954048851, + "grad_norm": 4.2326836585998535, + "learning_rate": 6.6515614027079725e-06, + "loss": 0.8327, + "step": 51824 + }, + { + "epoch": 0.6478411960299008, + "grad_norm": 3.015871047973633, + "learning_rate": 6.6507391037546706e-06, + "loss": 0.8979, + "step": 51826 + }, + { + "epoch": 0.6478661966549164, + "grad_norm": 2.855131149291992, + "learning_rate": 6.649916830308868e-06, + "loss": 0.8848, + "step": 51828 + }, + { + "epoch": 0.647891197279932, + "grad_norm": 2.7928175926208496, + "learning_rate": 6.6490945823768285e-06, + "loss": 0.2423, + "step": 51830 + }, + { + "epoch": 0.6479161979049476, + "grad_norm": 1.1887576580047607, + "learning_rate": 6.648272359964807e-06, + "loss": 0.4216, + "step": 51832 + }, + { + "epoch": 0.6479411985299632, + "grad_norm": 4.585330486297607, + "learning_rate": 6.6474501630790655e-06, + "loss": 0.5035, + "step": 51834 + }, + { + "epoch": 0.6479661991549789, + "grad_norm": 4.066757678985596, + "learning_rate": 6.646627991725874e-06, + "loss": 0.8895, + "step": 51836 + }, + { + "epoch": 0.6479911997799945, + "grad_norm": 0.0012206628452986479, + "learning_rate": 6.645805845911489e-06, + "loss": 0.4304, + "step": 51838 + }, + { + "epoch": 0.6480162004050102, + "grad_norm": 1.1209975481033325, + "learning_rate": 6.6449837256421705e-06, + "loss": 0.268, + "step": 51840 + }, + { + "epoch": 0.6480412010300257, + "grad_norm": 3.0973451137542725, + "learning_rate": 6.644161630924182e-06, + "loss": 1.3214, + "step": 51842 + }, + { + "epoch": 0.6480662016550414, + "grad_norm": 0.9529418349266052, + "learning_rate": 6.6433395617637885e-06, + "loss": 0.0905, + "step": 51844 + }, + { + "epoch": 0.648091202280057, + "grad_norm": 5.091897487640381, + "learning_rate": 6.642517518167242e-06, + "loss": 1.6611, + "step": 51846 + }, + { + "epoch": 0.6481162029050727, + "grad_norm": 20.744789123535156, + "learning_rate": 6.641695500140805e-06, + "loss": 0.9405, + "step": 51848 + }, + { + "epoch": 0.6481412035300883, + "grad_norm": 8.463022232055664, + "learning_rate": 6.6408735076907435e-06, + "loss": 1.1338, + "step": 51850 + }, + { + "epoch": 0.6481662041551038, + "grad_norm": 5.492595672607422, + "learning_rate": 6.640051540823314e-06, + "loss": 1.3843, + "step": 51852 + }, + { + "epoch": 0.6481912047801195, + "grad_norm": 3.6705541610717773, + "learning_rate": 6.639229599544776e-06, + "loss": 0.8994, + "step": 51854 + }, + { + "epoch": 0.6482162054051351, + "grad_norm": 0.0011909565655514598, + "learning_rate": 6.638407683861394e-06, + "loss": 0.1086, + "step": 51856 + }, + { + "epoch": 0.6482412060301508, + "grad_norm": 4.4666056632995605, + "learning_rate": 6.6375857937794205e-06, + "loss": 0.9531, + "step": 51858 + }, + { + "epoch": 0.6482662066551664, + "grad_norm": 2.9990203380584717, + "learning_rate": 6.636763929305117e-06, + "loss": 1.1072, + "step": 51860 + }, + { + "epoch": 0.648291207280182, + "grad_norm": 6.156126499176025, + "learning_rate": 6.635942090444743e-06, + "loss": 1.9238, + "step": 51862 + }, + { + "epoch": 0.6483162079051976, + "grad_norm": 3.1632025241851807, + "learning_rate": 6.635120277204561e-06, + "loss": 0.5785, + "step": 51864 + }, + { + "epoch": 0.6483412085302133, + "grad_norm": 4.144054412841797, + "learning_rate": 6.634298489590826e-06, + "loss": 1.5181, + "step": 51866 + }, + { + "epoch": 0.6483662091552289, + "grad_norm": 5.859165668487549, + "learning_rate": 6.633476727609798e-06, + "loss": 1.5075, + "step": 51868 + }, + { + "epoch": 0.6483912097802446, + "grad_norm": 0.004960133694112301, + "learning_rate": 6.6326549912677386e-06, + "loss": 0.2795, + "step": 51870 + }, + { + "epoch": 0.6484162104052601, + "grad_norm": 4.30226469039917, + "learning_rate": 6.631833280570896e-06, + "loss": 1.0494, + "step": 51872 + }, + { + "epoch": 0.6484412110302757, + "grad_norm": 2.9712467193603516, + "learning_rate": 6.6310115955255386e-06, + "loss": 0.5984, + "step": 51874 + }, + { + "epoch": 0.6484662116552914, + "grad_norm": 2.469575881958008, + "learning_rate": 6.630189936137919e-06, + "loss": 0.6194, + "step": 51876 + }, + { + "epoch": 0.648491212280307, + "grad_norm": 3.326294183731079, + "learning_rate": 6.629368302414298e-06, + "loss": 1.1027, + "step": 51878 + }, + { + "epoch": 0.6485162129053227, + "grad_norm": 4.1119160652160645, + "learning_rate": 6.62854669436093e-06, + "loss": 0.7833, + "step": 51880 + }, + { + "epoch": 0.6485412135303382, + "grad_norm": 4.276101112365723, + "learning_rate": 6.627725111984078e-06, + "loss": 0.9381, + "step": 51882 + }, + { + "epoch": 0.6485662141553539, + "grad_norm": 5.712014675140381, + "learning_rate": 6.62690355528999e-06, + "loss": 0.2036, + "step": 51884 + }, + { + "epoch": 0.6485912147803695, + "grad_norm": 4.91707706451416, + "learning_rate": 6.6260820242849275e-06, + "loss": 1.538, + "step": 51886 + }, + { + "epoch": 0.6486162154053852, + "grad_norm": 4.19657039642334, + "learning_rate": 6.6252605189751476e-06, + "loss": 0.5834, + "step": 51888 + }, + { + "epoch": 0.6486412160304008, + "grad_norm": 1.2338985204696655, + "learning_rate": 6.624439039366905e-06, + "loss": 0.1912, + "step": 51890 + }, + { + "epoch": 0.6486662166554164, + "grad_norm": 5.852622985839844, + "learning_rate": 6.6236175854664595e-06, + "loss": 1.2619, + "step": 51892 + }, + { + "epoch": 0.648691217280432, + "grad_norm": 0.003318685805425048, + "learning_rate": 6.6227961572800645e-06, + "loss": 0.0001, + "step": 51894 + }, + { + "epoch": 0.6487162179054476, + "grad_norm": 3.807605028152466, + "learning_rate": 6.621974754813979e-06, + "loss": 0.8921, + "step": 51896 + }, + { + "epoch": 0.6487412185304633, + "grad_norm": 2.6154673099517822, + "learning_rate": 6.6211533780744545e-06, + "loss": 0.4466, + "step": 51898 + }, + { + "epoch": 0.6487662191554789, + "grad_norm": 3.8146326541900635, + "learning_rate": 6.620332027067746e-06, + "loss": 1.3053, + "step": 51900 + }, + { + "epoch": 0.6487912197804945, + "grad_norm": 1.9634559154510498, + "learning_rate": 6.619510701800111e-06, + "loss": 0.5632, + "step": 51902 + }, + { + "epoch": 0.6488162204055101, + "grad_norm": 2.0245630741119385, + "learning_rate": 6.618689402277806e-06, + "loss": 0.9334, + "step": 51904 + }, + { + "epoch": 0.6488412210305258, + "grad_norm": 2.4902334213256836, + "learning_rate": 6.617868128507083e-06, + "loss": 0.7953, + "step": 51906 + }, + { + "epoch": 0.6488662216555414, + "grad_norm": 3.6943039894104004, + "learning_rate": 6.6170468804942015e-06, + "loss": 1.0036, + "step": 51908 + }, + { + "epoch": 0.6488912222805571, + "grad_norm": 5.713532447814941, + "learning_rate": 6.61622565824541e-06, + "loss": 1.2581, + "step": 51910 + }, + { + "epoch": 0.6489162229055726, + "grad_norm": 7.2574286460876465, + "learning_rate": 6.615404461766964e-06, + "loss": 2.3074, + "step": 51912 + }, + { + "epoch": 0.6489412235305883, + "grad_norm": 2.5440845489501953, + "learning_rate": 6.6145832910651205e-06, + "loss": 1.1529, + "step": 51914 + }, + { + "epoch": 0.6489662241556039, + "grad_norm": 3.256186008453369, + "learning_rate": 6.6137621461461325e-06, + "loss": 1.0869, + "step": 51916 + }, + { + "epoch": 0.6489912247806195, + "grad_norm": 2.7124133110046387, + "learning_rate": 6.612941027016251e-06, + "loss": 1.4812, + "step": 51918 + }, + { + "epoch": 0.6490162254056352, + "grad_norm": 0.0013999644434079528, + "learning_rate": 6.612119933681734e-06, + "loss": 0.9066, + "step": 51920 + }, + { + "epoch": 0.6490412260306507, + "grad_norm": 3.400651693344116, + "learning_rate": 6.611298866148832e-06, + "loss": 0.3631, + "step": 51922 + }, + { + "epoch": 0.6490662266556664, + "grad_norm": 2.640629529953003, + "learning_rate": 6.610477824423797e-06, + "loss": 1.5679, + "step": 51924 + }, + { + "epoch": 0.649091227280682, + "grad_norm": 0.005589150357991457, + "learning_rate": 6.609656808512886e-06, + "loss": 0.469, + "step": 51926 + }, + { + "epoch": 0.6491162279056977, + "grad_norm": 1.7190214395523071, + "learning_rate": 6.608835818422348e-06, + "loss": 0.3901, + "step": 51928 + }, + { + "epoch": 0.6491412285307133, + "grad_norm": 7.297656059265137, + "learning_rate": 6.608014854158437e-06, + "loss": 0.4229, + "step": 51930 + }, + { + "epoch": 0.6491662291557289, + "grad_norm": 1.9627689123153687, + "learning_rate": 6.607193915727405e-06, + "loss": 0.4429, + "step": 51932 + }, + { + "epoch": 0.6491912297807445, + "grad_norm": 2.56101655960083, + "learning_rate": 6.606373003135505e-06, + "loss": 0.0912, + "step": 51934 + }, + { + "epoch": 0.6492162304057602, + "grad_norm": 0.0036808063741773367, + "learning_rate": 6.605552116388988e-06, + "loss": 0.4782, + "step": 51936 + }, + { + "epoch": 0.6492412310307758, + "grad_norm": 5.322113990783691, + "learning_rate": 6.604731255494106e-06, + "loss": 1.8172, + "step": 51938 + }, + { + "epoch": 0.6492662316557914, + "grad_norm": 3.2093496322631836, + "learning_rate": 6.6039104204571095e-06, + "loss": 1.6591, + "step": 51940 + }, + { + "epoch": 0.649291232280807, + "grad_norm": 3.4909093379974365, + "learning_rate": 6.6030896112842524e-06, + "loss": 1.8489, + "step": 51942 + }, + { + "epoch": 0.6493162329058226, + "grad_norm": 2.9115500450134277, + "learning_rate": 6.602268827981784e-06, + "loss": 1.3166, + "step": 51944 + }, + { + "epoch": 0.6493412335308383, + "grad_norm": 2.147885799407959, + "learning_rate": 6.6014480705559544e-06, + "loss": 1.0178, + "step": 51946 + }, + { + "epoch": 0.6493662341558539, + "grad_norm": 4.1839704513549805, + "learning_rate": 6.600627339013019e-06, + "loss": 1.3592, + "step": 51948 + }, + { + "epoch": 0.6493912347808696, + "grad_norm": 3.1416714191436768, + "learning_rate": 6.599806633359222e-06, + "loss": 1.6871, + "step": 51950 + }, + { + "epoch": 0.6494162354058851, + "grad_norm": 0.003420159686356783, + "learning_rate": 6.598985953600818e-06, + "loss": 0.6671, + "step": 51952 + }, + { + "epoch": 0.6494412360309008, + "grad_norm": 2.8391425609588623, + "learning_rate": 6.598165299744057e-06, + "loss": 1.2788, + "step": 51954 + }, + { + "epoch": 0.6494662366559164, + "grad_norm": 1.9054348468780518, + "learning_rate": 6.597344671795186e-06, + "loss": 0.0758, + "step": 51956 + }, + { + "epoch": 0.649491237280932, + "grad_norm": 2.8779006004333496, + "learning_rate": 6.596524069760457e-06, + "loss": 0.6386, + "step": 51958 + }, + { + "epoch": 0.6495162379059477, + "grad_norm": 5.099360466003418, + "learning_rate": 6.595703493646123e-06, + "loss": 1.0841, + "step": 51960 + }, + { + "epoch": 0.6495412385309632, + "grad_norm": 3.1764349937438965, + "learning_rate": 6.594882943458427e-06, + "loss": 1.6567, + "step": 51962 + }, + { + "epoch": 0.6495662391559789, + "grad_norm": 3.5135114192962646, + "learning_rate": 6.594062419203619e-06, + "loss": 1.2262, + "step": 51964 + }, + { + "epoch": 0.6495912397809945, + "grad_norm": 4.531885623931885, + "learning_rate": 6.5932419208879525e-06, + "loss": 1.2907, + "step": 51966 + }, + { + "epoch": 0.6496162404060102, + "grad_norm": 3.414475679397583, + "learning_rate": 6.592421448517674e-06, + "loss": 2.0592, + "step": 51968 + }, + { + "epoch": 0.6496412410310258, + "grad_norm": 8.450145721435547, + "learning_rate": 6.591601002099031e-06, + "loss": 0.2863, + "step": 51970 + }, + { + "epoch": 0.6496662416560414, + "grad_norm": 0.006990906782448292, + "learning_rate": 6.590780581638272e-06, + "loss": 0.9447, + "step": 51972 + }, + { + "epoch": 0.649691242281057, + "grad_norm": 2.3459784984588623, + "learning_rate": 6.589960187141649e-06, + "loss": 0.1901, + "step": 51974 + }, + { + "epoch": 0.6497162429060727, + "grad_norm": 5.965463161468506, + "learning_rate": 6.5891398186154045e-06, + "loss": 1.5176, + "step": 51976 + }, + { + "epoch": 0.6497412435310883, + "grad_norm": 2.775132656097412, + "learning_rate": 6.588319476065787e-06, + "loss": 0.4047, + "step": 51978 + }, + { + "epoch": 0.649766244156104, + "grad_norm": 0.0018275832990184426, + "learning_rate": 6.587499159499049e-06, + "loss": 0.0272, + "step": 51980 + }, + { + "epoch": 0.6497912447811195, + "grad_norm": 4.0926618576049805, + "learning_rate": 6.586678868921435e-06, + "loss": 0.6003, + "step": 51982 + }, + { + "epoch": 0.6498162454061351, + "grad_norm": 4.469593524932861, + "learning_rate": 6.585858604339189e-06, + "loss": 0.7059, + "step": 51984 + }, + { + "epoch": 0.6498412460311508, + "grad_norm": 0.01701538823544979, + "learning_rate": 6.585038365758565e-06, + "loss": 0.0796, + "step": 51986 + }, + { + "epoch": 0.6498662466561664, + "grad_norm": 0.0032264653127640486, + "learning_rate": 6.584218153185804e-06, + "loss": 0.5539, + "step": 51988 + }, + { + "epoch": 0.6498912472811821, + "grad_norm": 3.3368942737579346, + "learning_rate": 6.583397966627154e-06, + "loss": 0.7446, + "step": 51990 + }, + { + "epoch": 0.6499162479061976, + "grad_norm": 3.7892093658447266, + "learning_rate": 6.582577806088861e-06, + "loss": 1.2236, + "step": 51992 + }, + { + "epoch": 0.6499412485312133, + "grad_norm": 1.0822969675064087, + "learning_rate": 6.581757671577174e-06, + "loss": 0.6298, + "step": 51994 + }, + { + "epoch": 0.6499662491562289, + "grad_norm": 3.229858160018921, + "learning_rate": 6.580937563098336e-06, + "loss": 1.0334, + "step": 51996 + }, + { + "epoch": 0.6499912497812446, + "grad_norm": 4.99310302734375, + "learning_rate": 6.580117480658594e-06, + "loss": 0.5838, + "step": 51998 + }, + { + "epoch": 0.6500162504062602, + "grad_norm": 0.01270607765763998, + "learning_rate": 6.579297424264197e-06, + "loss": 0.7541, + "step": 52000 + }, + { + "epoch": 0.6500412510312757, + "grad_norm": 3.9304006099700928, + "learning_rate": 6.578477393921384e-06, + "loss": 1.3027, + "step": 52002 + }, + { + "epoch": 0.6500662516562914, + "grad_norm": 2.3328375816345215, + "learning_rate": 6.577657389636404e-06, + "loss": 0.7184, + "step": 52004 + }, + { + "epoch": 0.650091252281307, + "grad_norm": 5.618363380432129, + "learning_rate": 6.5768374114155e-06, + "loss": 2.0426, + "step": 52006 + }, + { + "epoch": 0.6501162529063227, + "grad_norm": 6.511850833892822, + "learning_rate": 6.57601745926492e-06, + "loss": 2.0739, + "step": 52008 + }, + { + "epoch": 0.6501412535313383, + "grad_norm": 4.017881870269775, + "learning_rate": 6.575197533190905e-06, + "loss": 1.3818, + "step": 52010 + }, + { + "epoch": 0.6501662541563539, + "grad_norm": 2.318779468536377, + "learning_rate": 6.574377633199705e-06, + "loss": 1.2339, + "step": 52012 + }, + { + "epoch": 0.6501912547813695, + "grad_norm": 2.569713592529297, + "learning_rate": 6.573557759297557e-06, + "loss": 0.8716, + "step": 52014 + }, + { + "epoch": 0.6502162554063852, + "grad_norm": 4.592899322509766, + "learning_rate": 6.57273791149071e-06, + "loss": 1.3666, + "step": 52016 + }, + { + "epoch": 0.6502412560314008, + "grad_norm": 3.735142469406128, + "learning_rate": 6.571918089785405e-06, + "loss": 0.2139, + "step": 52018 + }, + { + "epoch": 0.6502662566564165, + "grad_norm": 3.3900692462921143, + "learning_rate": 6.571098294187888e-06, + "loss": 1.2836, + "step": 52020 + }, + { + "epoch": 0.650291257281432, + "grad_norm": 2.889948844909668, + "learning_rate": 6.570278524704401e-06, + "loss": 1.6597, + "step": 52022 + }, + { + "epoch": 0.6503162579064476, + "grad_norm": 0.004842672497034073, + "learning_rate": 6.569458781341188e-06, + "loss": 0.5799, + "step": 52024 + }, + { + "epoch": 0.6503412585314633, + "grad_norm": 6.09465217590332, + "learning_rate": 6.568639064104494e-06, + "loss": 0.5485, + "step": 52026 + }, + { + "epoch": 0.6503662591564789, + "grad_norm": 6.020817756652832, + "learning_rate": 6.567819373000557e-06, + "loss": 1.2433, + "step": 52028 + }, + { + "epoch": 0.6503912597814946, + "grad_norm": 0.4969739019870758, + "learning_rate": 6.566999708035623e-06, + "loss": 0.6297, + "step": 52030 + }, + { + "epoch": 0.6504162604065101, + "grad_norm": 10.598001480102539, + "learning_rate": 6.566180069215933e-06, + "loss": 1.1021, + "step": 52032 + }, + { + "epoch": 0.6504412610315258, + "grad_norm": 3.9792251586914062, + "learning_rate": 6.565360456547732e-06, + "loss": 0.8347, + "step": 52034 + }, + { + "epoch": 0.6504662616565414, + "grad_norm": 2.558258056640625, + "learning_rate": 6.564540870037257e-06, + "loss": 0.875, + "step": 52036 + }, + { + "epoch": 0.6504912622815571, + "grad_norm": 3.0881519317626953, + "learning_rate": 6.563721309690756e-06, + "loss": 0.7654, + "step": 52038 + }, + { + "epoch": 0.6505162629065727, + "grad_norm": 0.0003727294970303774, + "learning_rate": 6.562901775514466e-06, + "loss": 0.0737, + "step": 52040 + }, + { + "epoch": 0.6505412635315883, + "grad_norm": 5.1318488121032715, + "learning_rate": 6.562082267514631e-06, + "loss": 2.1796, + "step": 52042 + }, + { + "epoch": 0.6505662641566039, + "grad_norm": 0.9537315964698792, + "learning_rate": 6.5612627856974895e-06, + "loss": 0.6008, + "step": 52044 + }, + { + "epoch": 0.6505912647816195, + "grad_norm": 1.772791862487793, + "learning_rate": 6.560443330069286e-06, + "loss": 1.1834, + "step": 52046 + }, + { + "epoch": 0.6506162654066352, + "grad_norm": 0.634503960609436, + "learning_rate": 6.559623900636259e-06, + "loss": 0.7894, + "step": 52048 + }, + { + "epoch": 0.6506412660316508, + "grad_norm": 4.368638515472412, + "learning_rate": 6.558804497404649e-06, + "loss": 1.5179, + "step": 52050 + }, + { + "epoch": 0.6506662666566664, + "grad_norm": 5.596896171569824, + "learning_rate": 6.557985120380701e-06, + "loss": 1.4695, + "step": 52052 + }, + { + "epoch": 0.650691267281682, + "grad_norm": 0.002516870852559805, + "learning_rate": 6.557165769570648e-06, + "loss": 0.4049, + "step": 52054 + }, + { + "epoch": 0.6507162679066977, + "grad_norm": 3.1822872161865234, + "learning_rate": 6.556346444980734e-06, + "loss": 0.8362, + "step": 52056 + }, + { + "epoch": 0.6507412685317133, + "grad_norm": 3.6849656105041504, + "learning_rate": 6.555527146617199e-06, + "loss": 1.3503, + "step": 52058 + }, + { + "epoch": 0.650766269156729, + "grad_norm": 1.5391446352005005, + "learning_rate": 6.554707874486281e-06, + "loss": 0.0496, + "step": 52060 + }, + { + "epoch": 0.6507912697817445, + "grad_norm": 1.0007745027542114, + "learning_rate": 6.553888628594222e-06, + "loss": 1.4793, + "step": 52062 + }, + { + "epoch": 0.6508162704067602, + "grad_norm": 0.0004920235951431096, + "learning_rate": 6.553069408947259e-06, + "loss": 0.5264, + "step": 52064 + }, + { + "epoch": 0.6508412710317758, + "grad_norm": 2.606337070465088, + "learning_rate": 6.552250215551635e-06, + "loss": 1.2372, + "step": 52066 + }, + { + "epoch": 0.6508662716567915, + "grad_norm": 4.108182430267334, + "learning_rate": 6.551431048413584e-06, + "loss": 2.5513, + "step": 52068 + }, + { + "epoch": 0.6508912722818071, + "grad_norm": 7.027408123016357, + "learning_rate": 6.5506119075393445e-06, + "loss": 0.5138, + "step": 52070 + }, + { + "epoch": 0.6509162729068226, + "grad_norm": 4.7535881996154785, + "learning_rate": 6.549792792935159e-06, + "loss": 0.9966, + "step": 52072 + }, + { + "epoch": 0.6509412735318383, + "grad_norm": 0.7938134074211121, + "learning_rate": 6.548973704607262e-06, + "loss": 1.0875, + "step": 52074 + }, + { + "epoch": 0.6509662741568539, + "grad_norm": 3.681165933609009, + "learning_rate": 6.548154642561894e-06, + "loss": 0.7716, + "step": 52076 + }, + { + "epoch": 0.6509912747818696, + "grad_norm": 5.247988700866699, + "learning_rate": 6.547335606805294e-06, + "loss": 1.3235, + "step": 52078 + }, + { + "epoch": 0.6510162754068852, + "grad_norm": 3.647115707397461, + "learning_rate": 6.546516597343696e-06, + "loss": 0.5304, + "step": 52080 + }, + { + "epoch": 0.6510412760319008, + "grad_norm": 2.4735054969787598, + "learning_rate": 6.545697614183339e-06, + "loss": 0.5293, + "step": 52082 + }, + { + "epoch": 0.6510662766569164, + "grad_norm": 3.1003243923187256, + "learning_rate": 6.54487865733046e-06, + "loss": 1.5672, + "step": 52084 + }, + { + "epoch": 0.6510912772819321, + "grad_norm": 2.2092700004577637, + "learning_rate": 6.544059726791298e-06, + "loss": 1.2362, + "step": 52086 + }, + { + "epoch": 0.6511162779069477, + "grad_norm": 0.0013540390646085143, + "learning_rate": 6.543240822572088e-06, + "loss": 0.1843, + "step": 52088 + }, + { + "epoch": 0.6511412785319634, + "grad_norm": 5.060713291168213, + "learning_rate": 6.542421944679064e-06, + "loss": 0.7235, + "step": 52090 + }, + { + "epoch": 0.6511662791569789, + "grad_norm": 3.5135691165924072, + "learning_rate": 6.541603093118471e-06, + "loss": 1.297, + "step": 52092 + }, + { + "epoch": 0.6511912797819945, + "grad_norm": 3.0853540897369385, + "learning_rate": 6.5407842678965385e-06, + "loss": 1.2296, + "step": 52094 + }, + { + "epoch": 0.6512162804070102, + "grad_norm": 5.097393989562988, + "learning_rate": 6.539965469019501e-06, + "loss": 0.6152, + "step": 52096 + }, + { + "epoch": 0.6512412810320258, + "grad_norm": 4.517378330230713, + "learning_rate": 6.539146696493599e-06, + "loss": 1.7881, + "step": 52098 + }, + { + "epoch": 0.6512662816570415, + "grad_norm": 8.326416015625, + "learning_rate": 6.5383279503250655e-06, + "loss": 1.2046, + "step": 52100 + }, + { + "epoch": 0.651291282282057, + "grad_norm": 3.815584421157837, + "learning_rate": 6.537509230520137e-06, + "loss": 1.3339, + "step": 52102 + }, + { + "epoch": 0.6513162829070727, + "grad_norm": 2.1803834438323975, + "learning_rate": 6.536690537085051e-06, + "loss": 0.3291, + "step": 52104 + }, + { + "epoch": 0.6513412835320883, + "grad_norm": 10.938986778259277, + "learning_rate": 6.535871870026038e-06, + "loss": 2.5007, + "step": 52106 + }, + { + "epoch": 0.651366284157104, + "grad_norm": 1.491459846496582, + "learning_rate": 6.535053229349335e-06, + "loss": 0.0259, + "step": 52108 + }, + { + "epoch": 0.6513912847821196, + "grad_norm": 4.155773639678955, + "learning_rate": 6.534234615061176e-06, + "loss": 0.5738, + "step": 52110 + }, + { + "epoch": 0.6514162854071351, + "grad_norm": 3.6501200199127197, + "learning_rate": 6.533416027167796e-06, + "loss": 1.0912, + "step": 52112 + }, + { + "epoch": 0.6514412860321508, + "grad_norm": 2.98408842086792, + "learning_rate": 6.5325974656754295e-06, + "loss": 0.842, + "step": 52114 + }, + { + "epoch": 0.6514662866571664, + "grad_norm": 2.1655173301696777, + "learning_rate": 6.531778930590309e-06, + "loss": 1.1675, + "step": 52116 + }, + { + "epoch": 0.6514912872821821, + "grad_norm": 2.3912458419799805, + "learning_rate": 6.530960421918675e-06, + "loss": 1.8119, + "step": 52118 + }, + { + "epoch": 0.6515162879071977, + "grad_norm": 9.02014446258545, + "learning_rate": 6.530141939666752e-06, + "loss": 1.2555, + "step": 52120 + }, + { + "epoch": 0.6515412885322133, + "grad_norm": 3.119892120361328, + "learning_rate": 6.5293234838407785e-06, + "loss": 0.6529, + "step": 52122 + }, + { + "epoch": 0.6515662891572289, + "grad_norm": 4.383110046386719, + "learning_rate": 6.528505054446985e-06, + "loss": 0.8235, + "step": 52124 + }, + { + "epoch": 0.6515912897822446, + "grad_norm": 2.0163419246673584, + "learning_rate": 6.527686651491607e-06, + "loss": 0.3053, + "step": 52126 + }, + { + "epoch": 0.6516162904072602, + "grad_norm": 4.586246490478516, + "learning_rate": 6.5268682749808735e-06, + "loss": 0.6933, + "step": 52128 + }, + { + "epoch": 0.6516412910322759, + "grad_norm": 1.3928872346878052, + "learning_rate": 6.526049924921027e-06, + "loss": 0.4271, + "step": 52130 + }, + { + "epoch": 0.6516662916572914, + "grad_norm": 0.5479072332382202, + "learning_rate": 6.525231601318288e-06, + "loss": 0.02, + "step": 52132 + }, + { + "epoch": 0.651691292282307, + "grad_norm": 5.155518531799316, + "learning_rate": 6.524413304178895e-06, + "loss": 1.2155, + "step": 52134 + }, + { + "epoch": 0.6517162929073227, + "grad_norm": 1.6546586751937866, + "learning_rate": 6.5235950335090784e-06, + "loss": 1.2022, + "step": 52136 + }, + { + "epoch": 0.6517412935323383, + "grad_norm": 0.047843512147665024, + "learning_rate": 6.522776789315069e-06, + "loss": 0.1279, + "step": 52138 + }, + { + "epoch": 0.651766294157354, + "grad_norm": 2.280433177947998, + "learning_rate": 6.5219585716030995e-06, + "loss": 0.9101, + "step": 52140 + }, + { + "epoch": 0.6517912947823695, + "grad_norm": 4.471336364746094, + "learning_rate": 6.521140380379403e-06, + "loss": 0.8193, + "step": 52142 + }, + { + "epoch": 0.6518162954073852, + "grad_norm": 2.213181257247925, + "learning_rate": 6.5203222156502145e-06, + "loss": 0.3323, + "step": 52144 + }, + { + "epoch": 0.6518412960324008, + "grad_norm": 4.111638069152832, + "learning_rate": 6.519504077421754e-06, + "loss": 1.747, + "step": 52146 + }, + { + "epoch": 0.6518662966574165, + "grad_norm": 0.0006799734546802938, + "learning_rate": 6.51868596570026e-06, + "loss": 0.4844, + "step": 52148 + }, + { + "epoch": 0.6518912972824321, + "grad_norm": 4.203853607177734, + "learning_rate": 6.51786788049196e-06, + "loss": 1.2995, + "step": 52150 + }, + { + "epoch": 0.6519162979074477, + "grad_norm": 0.0012489031068980694, + "learning_rate": 6.517049821803085e-06, + "loss": 0.0241, + "step": 52152 + }, + { + "epoch": 0.6519412985324633, + "grad_norm": 0.38662320375442505, + "learning_rate": 6.516231789639868e-06, + "loss": 0.8441, + "step": 52154 + }, + { + "epoch": 0.651966299157479, + "grad_norm": 4.529412269592285, + "learning_rate": 6.515413784008539e-06, + "loss": 0.0795, + "step": 52156 + }, + { + "epoch": 0.6519912997824946, + "grad_norm": 2.0793824195861816, + "learning_rate": 6.514595804915325e-06, + "loss": 1.8613, + "step": 52158 + }, + { + "epoch": 0.6520163004075102, + "grad_norm": 3.88515305519104, + "learning_rate": 6.513777852366454e-06, + "loss": 0.7801, + "step": 52160 + }, + { + "epoch": 0.6520413010325258, + "grad_norm": 2.139907121658325, + "learning_rate": 6.5129599263681585e-06, + "loss": 0.6634, + "step": 52162 + }, + { + "epoch": 0.6520663016575414, + "grad_norm": 3.2928154468536377, + "learning_rate": 6.5121420269266655e-06, + "loss": 1.3403, + "step": 52164 + }, + { + "epoch": 0.6520913022825571, + "grad_norm": 2.8040828704833984, + "learning_rate": 6.511324154048207e-06, + "loss": 1.4725, + "step": 52166 + }, + { + "epoch": 0.6521163029075727, + "grad_norm": 2.7805581092834473, + "learning_rate": 6.5105063077390116e-06, + "loss": 1.025, + "step": 52168 + }, + { + "epoch": 0.6521413035325884, + "grad_norm": 2.5348916053771973, + "learning_rate": 6.50968848800531e-06, + "loss": 0.2859, + "step": 52170 + }, + { + "epoch": 0.6521663041576039, + "grad_norm": 2.7725462913513184, + "learning_rate": 6.508870694853323e-06, + "loss": 0.4773, + "step": 52172 + }, + { + "epoch": 0.6521913047826196, + "grad_norm": 3.341923952102661, + "learning_rate": 6.508052928289284e-06, + "loss": 1.1379, + "step": 52174 + }, + { + "epoch": 0.6522163054076352, + "grad_norm": 3.2236883640289307, + "learning_rate": 6.507235188319418e-06, + "loss": 0.7101, + "step": 52176 + }, + { + "epoch": 0.6522413060326508, + "grad_norm": 3.565488576889038, + "learning_rate": 6.506417474949956e-06, + "loss": 0.7179, + "step": 52178 + }, + { + "epoch": 0.6522663066576665, + "grad_norm": 2.594979763031006, + "learning_rate": 6.5055997881871245e-06, + "loss": 0.181, + "step": 52180 + }, + { + "epoch": 0.652291307282682, + "grad_norm": 3.0181117057800293, + "learning_rate": 6.504782128037155e-06, + "loss": 1.0997, + "step": 52182 + }, + { + "epoch": 0.6523163079076977, + "grad_norm": 6.311415195465088, + "learning_rate": 6.5039644945062666e-06, + "loss": 1.2952, + "step": 52184 + }, + { + "epoch": 0.6523413085327133, + "grad_norm": 5.233455657958984, + "learning_rate": 6.503146887600691e-06, + "loss": 0.2459, + "step": 52186 + }, + { + "epoch": 0.652366309157729, + "grad_norm": 2.8190951347351074, + "learning_rate": 6.5023293073266524e-06, + "loss": 0.3095, + "step": 52188 + }, + { + "epoch": 0.6523913097827446, + "grad_norm": 2.729656934738159, + "learning_rate": 6.501511753690379e-06, + "loss": 0.5061, + "step": 52190 + }, + { + "epoch": 0.6524163104077602, + "grad_norm": 0.2850929796695709, + "learning_rate": 6.500694226698099e-06, + "loss": 0.546, + "step": 52192 + }, + { + "epoch": 0.6524413110327758, + "grad_norm": 3.7457849979400635, + "learning_rate": 6.499876726356036e-06, + "loss": 1.3125, + "step": 52194 + }, + { + "epoch": 0.6524663116577915, + "grad_norm": 2.612583875656128, + "learning_rate": 6.499059252670421e-06, + "loss": 0.6312, + "step": 52196 + }, + { + "epoch": 0.6524913122828071, + "grad_norm": 3.0410661697387695, + "learning_rate": 6.498241805647473e-06, + "loss": 0.5655, + "step": 52198 + }, + { + "epoch": 0.6525163129078227, + "grad_norm": 0.0008983595762401819, + "learning_rate": 6.497424385293419e-06, + "loss": 0.4979, + "step": 52200 + }, + { + "epoch": 0.6525413135328383, + "grad_norm": 3.321399211883545, + "learning_rate": 6.496606991614484e-06, + "loss": 0.6963, + "step": 52202 + }, + { + "epoch": 0.6525663141578539, + "grad_norm": 2.106973648071289, + "learning_rate": 6.495789624616896e-06, + "loss": 1.4205, + "step": 52204 + }, + { + "epoch": 0.6525913147828696, + "grad_norm": 3.929898738861084, + "learning_rate": 6.4949722843068795e-06, + "loss": 1.4127, + "step": 52206 + }, + { + "epoch": 0.6526163154078852, + "grad_norm": 3.9382407665252686, + "learning_rate": 6.4941549706906625e-06, + "loss": 1.0038, + "step": 52208 + }, + { + "epoch": 0.6526413160329009, + "grad_norm": 1.6567710638046265, + "learning_rate": 6.4933376837744614e-06, + "loss": 0.7569, + "step": 52210 + }, + { + "epoch": 0.6526663166579164, + "grad_norm": 2.2451212406158447, + "learning_rate": 6.492520423564504e-06, + "loss": 0.43, + "step": 52212 + }, + { + "epoch": 0.6526913172829321, + "grad_norm": 4.418778419494629, + "learning_rate": 6.491703190067014e-06, + "loss": 1.3747, + "step": 52214 + }, + { + "epoch": 0.6527163179079477, + "grad_norm": 4.515093803405762, + "learning_rate": 6.490885983288218e-06, + "loss": 0.9468, + "step": 52216 + }, + { + "epoch": 0.6527413185329634, + "grad_norm": 3.7592904567718506, + "learning_rate": 6.490068803234337e-06, + "loss": 1.9909, + "step": 52218 + }, + { + "epoch": 0.652766319157979, + "grad_norm": 0.9089808464050293, + "learning_rate": 6.4892516499115965e-06, + "loss": 1.0686, + "step": 52220 + }, + { + "epoch": 0.6527913197829945, + "grad_norm": 0.025424489751458168, + "learning_rate": 6.488434523326223e-06, + "loss": 0.0003, + "step": 52222 + }, + { + "epoch": 0.6528163204080102, + "grad_norm": 3.1356096267700195, + "learning_rate": 6.487617423484432e-06, + "loss": 1.2238, + "step": 52224 + }, + { + "epoch": 0.6528413210330258, + "grad_norm": 3.453770399093628, + "learning_rate": 6.486800350392448e-06, + "loss": 0.6502, + "step": 52226 + }, + { + "epoch": 0.6528663216580415, + "grad_norm": 10.721957206726074, + "learning_rate": 6.485983304056498e-06, + "loss": 1.4754, + "step": 52228 + }, + { + "epoch": 0.6528913222830571, + "grad_norm": 2.446366548538208, + "learning_rate": 6.4851662844828014e-06, + "loss": 0.4605, + "step": 52230 + }, + { + "epoch": 0.6529163229080727, + "grad_norm": 4.368247985839844, + "learning_rate": 6.484349291677581e-06, + "loss": 1.1899, + "step": 52232 + }, + { + "epoch": 0.6529413235330883, + "grad_norm": 3.9645955562591553, + "learning_rate": 6.483532325647064e-06, + "loss": 1.3341, + "step": 52234 + }, + { + "epoch": 0.652966324158104, + "grad_norm": 7.865978240966797, + "learning_rate": 6.482715386397464e-06, + "loss": 1.5415, + "step": 52236 + }, + { + "epoch": 0.6529913247831196, + "grad_norm": 5.0840277671813965, + "learning_rate": 6.481898473935004e-06, + "loss": 1.3834, + "step": 52238 + }, + { + "epoch": 0.6530163254081353, + "grad_norm": 0.125177800655365, + "learning_rate": 6.481081588265908e-06, + "loss": 0.0175, + "step": 52240 + }, + { + "epoch": 0.6530413260331508, + "grad_norm": 4.274889945983887, + "learning_rate": 6.4802647293963994e-06, + "loss": 0.9554, + "step": 52242 + }, + { + "epoch": 0.6530663266581664, + "grad_norm": 3.319234848022461, + "learning_rate": 6.479447897332696e-06, + "loss": 0.9756, + "step": 52244 + }, + { + "epoch": 0.6530913272831821, + "grad_norm": 2.91135311126709, + "learning_rate": 6.478631092081018e-06, + "loss": 1.1223, + "step": 52246 + }, + { + "epoch": 0.6531163279081977, + "grad_norm": 4.167426109313965, + "learning_rate": 6.477814313647594e-06, + "loss": 1.0776, + "step": 52248 + }, + { + "epoch": 0.6531413285332134, + "grad_norm": 2.2805471420288086, + "learning_rate": 6.476997562038633e-06, + "loss": 0.8631, + "step": 52250 + }, + { + "epoch": 0.6531663291582289, + "grad_norm": 3.8405394554138184, + "learning_rate": 6.476180837260357e-06, + "loss": 1.5329, + "step": 52252 + }, + { + "epoch": 0.6531913297832446, + "grad_norm": 1.0193432569503784, + "learning_rate": 6.475364139318994e-06, + "loss": 0.2314, + "step": 52254 + }, + { + "epoch": 0.6532163304082602, + "grad_norm": 2.7892537117004395, + "learning_rate": 6.474547468220757e-06, + "loss": 0.6089, + "step": 52256 + }, + { + "epoch": 0.6532413310332759, + "grad_norm": 3.337420701980591, + "learning_rate": 6.4737308239718696e-06, + "loss": 0.7076, + "step": 52258 + }, + { + "epoch": 0.6532663316582915, + "grad_norm": 1.0472697019577026, + "learning_rate": 6.4729142065785535e-06, + "loss": 0.2864, + "step": 52260 + }, + { + "epoch": 0.653291332283307, + "grad_norm": 1.5397812128067017, + "learning_rate": 6.47209761604702e-06, + "loss": 0.5097, + "step": 52262 + }, + { + "epoch": 0.6533163329083227, + "grad_norm": 4.489640712738037, + "learning_rate": 6.471281052383489e-06, + "loss": 1.2657, + "step": 52264 + }, + { + "epoch": 0.6533413335333383, + "grad_norm": 0.0275848601013422, + "learning_rate": 6.470464515594186e-06, + "loss": 0.1893, + "step": 52266 + }, + { + "epoch": 0.653366334158354, + "grad_norm": 1.2196475267410278, + "learning_rate": 6.469648005685325e-06, + "loss": 0.5945, + "step": 52268 + }, + { + "epoch": 0.6533913347833696, + "grad_norm": 3.5341904163360596, + "learning_rate": 6.468831522663126e-06, + "loss": 0.716, + "step": 52270 + }, + { + "epoch": 0.6534163354083852, + "grad_norm": 3.377706527709961, + "learning_rate": 6.468015066533807e-06, + "loss": 1.6482, + "step": 52272 + }, + { + "epoch": 0.6534413360334008, + "grad_norm": 4.466422080993652, + "learning_rate": 6.467198637303589e-06, + "loss": 1.1528, + "step": 52274 + }, + { + "epoch": 0.6534663366584165, + "grad_norm": 4.625186443328857, + "learning_rate": 6.466382234978681e-06, + "loss": 1.2613, + "step": 52276 + }, + { + "epoch": 0.6534913372834321, + "grad_norm": 3.6818387508392334, + "learning_rate": 6.465565859565309e-06, + "loss": 1.1999, + "step": 52278 + }, + { + "epoch": 0.6535163379084478, + "grad_norm": 3.595052480697632, + "learning_rate": 6.464749511069687e-06, + "loss": 0.6078, + "step": 52280 + }, + { + "epoch": 0.6535413385334633, + "grad_norm": 2.4577159881591797, + "learning_rate": 6.4639331894980326e-06, + "loss": 1.1146, + "step": 52282 + }, + { + "epoch": 0.653566339158479, + "grad_norm": 0.9880669116973877, + "learning_rate": 6.463116894856563e-06, + "loss": 1.1314, + "step": 52284 + }, + { + "epoch": 0.6535913397834946, + "grad_norm": 0.008263754658401012, + "learning_rate": 6.462300627151498e-06, + "loss": 0.3563, + "step": 52286 + }, + { + "epoch": 0.6536163404085102, + "grad_norm": 0.8202353119850159, + "learning_rate": 6.461484386389047e-06, + "loss": 0.8517, + "step": 52288 + }, + { + "epoch": 0.6536413410335259, + "grad_norm": 3.750088691711426, + "learning_rate": 6.460668172575433e-06, + "loss": 0.817, + "step": 52290 + }, + { + "epoch": 0.6536663416585414, + "grad_norm": 0.0017510215984657407, + "learning_rate": 6.459851985716867e-06, + "loss": 0.7351, + "step": 52292 + }, + { + "epoch": 0.6536913422835571, + "grad_norm": 4.612606525421143, + "learning_rate": 6.45903582581957e-06, + "loss": 0.1798, + "step": 52294 + }, + { + "epoch": 0.6537163429085727, + "grad_norm": 1.4887374639511108, + "learning_rate": 6.458219692889753e-06, + "loss": 0.5158, + "step": 52296 + }, + { + "epoch": 0.6537413435335884, + "grad_norm": 0.0008823073003441095, + "learning_rate": 6.457403586933637e-06, + "loss": 0.7802, + "step": 52298 + }, + { + "epoch": 0.653766344158604, + "grad_norm": 0.9044135808944702, + "learning_rate": 6.456587507957435e-06, + "loss": 0.4431, + "step": 52300 + }, + { + "epoch": 0.6537913447836196, + "grad_norm": 1.962436556816101, + "learning_rate": 6.455771455967359e-06, + "loss": 1.0363, + "step": 52302 + }, + { + "epoch": 0.6538163454086352, + "grad_norm": 5.777256011962891, + "learning_rate": 6.454955430969626e-06, + "loss": 2.2784, + "step": 52304 + }, + { + "epoch": 0.6538413460336508, + "grad_norm": 5.672646522521973, + "learning_rate": 6.454139432970452e-06, + "loss": 0.4934, + "step": 52306 + }, + { + "epoch": 0.6538663466586665, + "grad_norm": 3.838108777999878, + "learning_rate": 6.453323461976051e-06, + "loss": 0.291, + "step": 52308 + }, + { + "epoch": 0.6538913472836821, + "grad_norm": 4.205314636230469, + "learning_rate": 6.452507517992636e-06, + "loss": 1.1026, + "step": 52310 + }, + { + "epoch": 0.6539163479086977, + "grad_norm": 2.8476474285125732, + "learning_rate": 6.4516916010264265e-06, + "loss": 0.4081, + "step": 52312 + }, + { + "epoch": 0.6539413485337133, + "grad_norm": 4.068366050720215, + "learning_rate": 6.450875711083626e-06, + "loss": 0.9134, + "step": 52314 + }, + { + "epoch": 0.653966349158729, + "grad_norm": 4.986231803894043, + "learning_rate": 6.4500598481704556e-06, + "loss": 1.5503, + "step": 52316 + }, + { + "epoch": 0.6539913497837446, + "grad_norm": 0.010133028030395508, + "learning_rate": 6.449244012293128e-06, + "loss": 0.4061, + "step": 52318 + }, + { + "epoch": 0.6540163504087603, + "grad_norm": 0.0008607741328887641, + "learning_rate": 6.448428203457856e-06, + "loss": 0.0965, + "step": 52320 + }, + { + "epoch": 0.6540413510337758, + "grad_norm": 3.546941041946411, + "learning_rate": 6.447612421670853e-06, + "loss": 1.2475, + "step": 52322 + }, + { + "epoch": 0.6540663516587915, + "grad_norm": 2.731597423553467, + "learning_rate": 6.446796666938331e-06, + "loss": 1.6005, + "step": 52324 + }, + { + "epoch": 0.6540913522838071, + "grad_norm": 3.4003560543060303, + "learning_rate": 6.445980939266504e-06, + "loss": 0.5508, + "step": 52326 + }, + { + "epoch": 0.6541163529088228, + "grad_norm": 3.1575348377227783, + "learning_rate": 6.445165238661583e-06, + "loss": 0.8467, + "step": 52328 + }, + { + "epoch": 0.6541413535338384, + "grad_norm": 5.461516380310059, + "learning_rate": 6.444349565129779e-06, + "loss": 1.2157, + "step": 52330 + }, + { + "epoch": 0.6541663541588539, + "grad_norm": 4.347072601318359, + "learning_rate": 6.4435339186773085e-06, + "loss": 0.944, + "step": 52332 + }, + { + "epoch": 0.6541913547838696, + "grad_norm": 4.543852806091309, + "learning_rate": 6.44271829931038e-06, + "loss": 2.0991, + "step": 52334 + }, + { + "epoch": 0.6542163554088852, + "grad_norm": 0.0032680733129382133, + "learning_rate": 6.441902707035205e-06, + "loss": 0.0001, + "step": 52336 + }, + { + "epoch": 0.6542413560339009, + "grad_norm": 6.638027191162109, + "learning_rate": 6.441087141857996e-06, + "loss": 1.798, + "step": 52338 + }, + { + "epoch": 0.6542663566589165, + "grad_norm": 3.435964822769165, + "learning_rate": 6.440271603784967e-06, + "loss": 1.3856, + "step": 52340 + }, + { + "epoch": 0.6542913572839321, + "grad_norm": 4.1573920249938965, + "learning_rate": 6.439456092822323e-06, + "loss": 1.3098, + "step": 52342 + }, + { + "epoch": 0.6543163579089477, + "grad_norm": 3.7262914180755615, + "learning_rate": 6.438640608976278e-06, + "loss": 1.3393, + "step": 52344 + }, + { + "epoch": 0.6543413585339634, + "grad_norm": 3.7023730278015137, + "learning_rate": 6.437825152253042e-06, + "loss": 0.7781, + "step": 52346 + }, + { + "epoch": 0.654366359158979, + "grad_norm": 1.3328315019607544, + "learning_rate": 6.437009722658827e-06, + "loss": 0.1009, + "step": 52348 + }, + { + "epoch": 0.6543913597839947, + "grad_norm": 3.424483299255371, + "learning_rate": 6.436194320199842e-06, + "loss": 1.176, + "step": 52350 + }, + { + "epoch": 0.6544163604090102, + "grad_norm": 3.3165364265441895, + "learning_rate": 6.4353789448822976e-06, + "loss": 0.8201, + "step": 52352 + }, + { + "epoch": 0.6544413610340258, + "grad_norm": 0.01078178733587265, + "learning_rate": 6.434563596712403e-06, + "loss": 0.8798, + "step": 52354 + }, + { + "epoch": 0.6544663616590415, + "grad_norm": 0.0005209339433349669, + "learning_rate": 6.433748275696368e-06, + "loss": 0.3996, + "step": 52356 + }, + { + "epoch": 0.6544913622840571, + "grad_norm": 3.044234037399292, + "learning_rate": 6.432932981840401e-06, + "loss": 0.945, + "step": 52358 + }, + { + "epoch": 0.6545163629090728, + "grad_norm": 3.003131866455078, + "learning_rate": 6.432117715150711e-06, + "loss": 0.5635, + "step": 52360 + }, + { + "epoch": 0.6545413635340883, + "grad_norm": 1.2604451179504395, + "learning_rate": 6.43130247563351e-06, + "loss": 0.8555, + "step": 52362 + }, + { + "epoch": 0.654566364159104, + "grad_norm": 3.9718170166015625, + "learning_rate": 6.430487263295002e-06, + "loss": 0.3904, + "step": 52364 + }, + { + "epoch": 0.6545913647841196, + "grad_norm": 0.001784054678864777, + "learning_rate": 6.429672078141402e-06, + "loss": 0.7714, + "step": 52366 + }, + { + "epoch": 0.6546163654091353, + "grad_norm": 2.793652057647705, + "learning_rate": 6.428856920178912e-06, + "loss": 1.189, + "step": 52368 + }, + { + "epoch": 0.6546413660341509, + "grad_norm": 1.9842429161071777, + "learning_rate": 6.428041789413743e-06, + "loss": 0.4402, + "step": 52370 + }, + { + "epoch": 0.6546663666591664, + "grad_norm": 4.409769535064697, + "learning_rate": 6.4272266858521035e-06, + "loss": 1.1102, + "step": 52372 + }, + { + "epoch": 0.6546913672841821, + "grad_norm": 3.600761651992798, + "learning_rate": 6.426411609500199e-06, + "loss": 1.0621, + "step": 52374 + }, + { + "epoch": 0.6547163679091977, + "grad_norm": 2.987100124359131, + "learning_rate": 6.425596560364239e-06, + "loss": 0.3253, + "step": 52376 + }, + { + "epoch": 0.6547413685342134, + "grad_norm": 1.0422694683074951, + "learning_rate": 6.424781538450432e-06, + "loss": 0.6147, + "step": 52378 + }, + { + "epoch": 0.654766369159229, + "grad_norm": 0.24931472539901733, + "learning_rate": 6.423966543764981e-06, + "loss": 0.7213, + "step": 52380 + }, + { + "epoch": 0.6547913697842446, + "grad_norm": 2.4700865745544434, + "learning_rate": 6.423151576314095e-06, + "loss": 0.6345, + "step": 52382 + }, + { + "epoch": 0.6548163704092602, + "grad_norm": 3.6419780254364014, + "learning_rate": 6.422336636103981e-06, + "loss": 0.2083, + "step": 52384 + }, + { + "epoch": 0.6548413710342759, + "grad_norm": 6.347762584686279, + "learning_rate": 6.421521723140847e-06, + "loss": 1.2655, + "step": 52386 + }, + { + "epoch": 0.6548663716592915, + "grad_norm": 4.179400444030762, + "learning_rate": 6.4207068374308945e-06, + "loss": 0.8111, + "step": 52388 + }, + { + "epoch": 0.6548913722843072, + "grad_norm": 2.722297430038452, + "learning_rate": 6.4198919789803335e-06, + "loss": 0.7185, + "step": 52390 + }, + { + "epoch": 0.6549163729093227, + "grad_norm": 3.4960763454437256, + "learning_rate": 6.419077147795372e-06, + "loss": 1.9137, + "step": 52392 + }, + { + "epoch": 0.6549413735343383, + "grad_norm": 5.248880386352539, + "learning_rate": 6.418262343882209e-06, + "loss": 2.0358, + "step": 52394 + }, + { + "epoch": 0.654966374159354, + "grad_norm": 3.228560447692871, + "learning_rate": 6.417447567247055e-06, + "loss": 0.4612, + "step": 52396 + }, + { + "epoch": 0.6549913747843696, + "grad_norm": 0.5677122473716736, + "learning_rate": 6.416632817896114e-06, + "loss": 0.4906, + "step": 52398 + }, + { + "epoch": 0.6550163754093853, + "grad_norm": 2.825948715209961, + "learning_rate": 6.415818095835588e-06, + "loss": 0.3527, + "step": 52400 + }, + { + "epoch": 0.6550413760344008, + "grad_norm": 3.484976053237915, + "learning_rate": 6.4150034010716865e-06, + "loss": 1.1233, + "step": 52402 + }, + { + "epoch": 0.6550663766594165, + "grad_norm": 0.5411294102668762, + "learning_rate": 6.414188733610614e-06, + "loss": 0.0841, + "step": 52404 + }, + { + "epoch": 0.6550913772844321, + "grad_norm": 0.7335596084594727, + "learning_rate": 6.413374093458571e-06, + "loss": 0.0274, + "step": 52406 + }, + { + "epoch": 0.6551163779094478, + "grad_norm": 4.946324348449707, + "learning_rate": 6.412559480621765e-06, + "loss": 0.7948, + "step": 52408 + }, + { + "epoch": 0.6551413785344634, + "grad_norm": 3.785665273666382, + "learning_rate": 6.4117448951063965e-06, + "loss": 1.8574, + "step": 52410 + }, + { + "epoch": 0.655166379159479, + "grad_norm": 5.3968000411987305, + "learning_rate": 6.410930336918672e-06, + "loss": 1.1548, + "step": 52412 + }, + { + "epoch": 0.6551913797844946, + "grad_norm": 3.7250423431396484, + "learning_rate": 6.410115806064795e-06, + "loss": 1.5476, + "step": 52414 + }, + { + "epoch": 0.6552163804095102, + "grad_norm": 3.2649407386779785, + "learning_rate": 6.4093013025509675e-06, + "loss": 0.7829, + "step": 52416 + }, + { + "epoch": 0.6552413810345259, + "grad_norm": 0.0006163049838505685, + "learning_rate": 6.408486826383398e-06, + "loss": 0.2034, + "step": 52418 + }, + { + "epoch": 0.6552663816595415, + "grad_norm": 0.0009831972420215607, + "learning_rate": 6.40767237756828e-06, + "loss": 0.0096, + "step": 52420 + }, + { + "epoch": 0.6552913822845571, + "grad_norm": 3.282468795776367, + "learning_rate": 6.406857956111823e-06, + "loss": 1.6186, + "step": 52422 + }, + { + "epoch": 0.6553163829095727, + "grad_norm": 5.164287090301514, + "learning_rate": 6.406043562020227e-06, + "loss": 1.5145, + "step": 52424 + }, + { + "epoch": 0.6553413835345884, + "grad_norm": 5.129439830780029, + "learning_rate": 6.405229195299695e-06, + "loss": 1.1454, + "step": 52426 + }, + { + "epoch": 0.655366384159604, + "grad_norm": 2.9952962398529053, + "learning_rate": 6.404414855956429e-06, + "loss": 0.8783, + "step": 52428 + }, + { + "epoch": 0.6553913847846197, + "grad_norm": 3.7691473960876465, + "learning_rate": 6.403600543996633e-06, + "loss": 0.2258, + "step": 52430 + }, + { + "epoch": 0.6554163854096352, + "grad_norm": 2.9529871940612793, + "learning_rate": 6.402786259426504e-06, + "loss": 1.522, + "step": 52432 + }, + { + "epoch": 0.6554413860346509, + "grad_norm": 5.7507781982421875, + "learning_rate": 6.401972002252248e-06, + "loss": 1.9976, + "step": 52434 + }, + { + "epoch": 0.6554663866596665, + "grad_norm": 2.376732349395752, + "learning_rate": 6.401157772480062e-06, + "loss": 0.9951, + "step": 52436 + }, + { + "epoch": 0.6554913872846821, + "grad_norm": 2.2727184295654297, + "learning_rate": 6.400343570116152e-06, + "loss": 0.3671, + "step": 52438 + }, + { + "epoch": 0.6555163879096978, + "grad_norm": 3.860431671142578, + "learning_rate": 6.399529395166715e-06, + "loss": 0.8296, + "step": 52440 + }, + { + "epoch": 0.6555413885347133, + "grad_norm": 2.2011356353759766, + "learning_rate": 6.398715247637953e-06, + "loss": 0.3804, + "step": 52442 + }, + { + "epoch": 0.655566389159729, + "grad_norm": 2.9356565475463867, + "learning_rate": 6.397901127536068e-06, + "loss": 1.4595, + "step": 52444 + }, + { + "epoch": 0.6555913897847446, + "grad_norm": 3.773688793182373, + "learning_rate": 6.397087034867258e-06, + "loss": 0.4102, + "step": 52446 + }, + { + "epoch": 0.6556163904097603, + "grad_norm": 4.206977844238281, + "learning_rate": 6.396272969637722e-06, + "loss": 1.0065, + "step": 52448 + }, + { + "epoch": 0.6556413910347759, + "grad_norm": 2.0696961879730225, + "learning_rate": 6.395458931853663e-06, + "loss": 1.4435, + "step": 52450 + }, + { + "epoch": 0.6556663916597915, + "grad_norm": 4.063361167907715, + "learning_rate": 6.394644921521278e-06, + "loss": 1.3148, + "step": 52452 + }, + { + "epoch": 0.6556913922848071, + "grad_norm": 1.8213845491409302, + "learning_rate": 6.393830938646768e-06, + "loss": 0.3868, + "step": 52454 + }, + { + "epoch": 0.6557163929098228, + "grad_norm": 0.0012136594159528613, + "learning_rate": 6.393016983236333e-06, + "loss": 0.0, + "step": 52456 + }, + { + "epoch": 0.6557413935348384, + "grad_norm": 4.408487796783447, + "learning_rate": 6.392203055296169e-06, + "loss": 1.1142, + "step": 52458 + }, + { + "epoch": 0.655766394159854, + "grad_norm": 3.228825807571411, + "learning_rate": 6.391389154832477e-06, + "loss": 0.8193, + "step": 52460 + }, + { + "epoch": 0.6557913947848696, + "grad_norm": 3.207632303237915, + "learning_rate": 6.390575281851454e-06, + "loss": 0.6054, + "step": 52462 + }, + { + "epoch": 0.6558163954098852, + "grad_norm": 1.6542308330535889, + "learning_rate": 6.389761436359298e-06, + "loss": 1.2317, + "step": 52464 + }, + { + "epoch": 0.6558413960349009, + "grad_norm": 2.315751075744629, + "learning_rate": 6.388947618362211e-06, + "loss": 1.0726, + "step": 52466 + }, + { + "epoch": 0.6558663966599165, + "grad_norm": 3.840047597885132, + "learning_rate": 6.3881338278663875e-06, + "loss": 1.2243, + "step": 52468 + }, + { + "epoch": 0.6558913972849322, + "grad_norm": 5.08974552154541, + "learning_rate": 6.3873200648780265e-06, + "loss": 1.1607, + "step": 52470 + }, + { + "epoch": 0.6559163979099477, + "grad_norm": 1.8031092882156372, + "learning_rate": 6.386506329403325e-06, + "loss": 0.3862, + "step": 52472 + }, + { + "epoch": 0.6559413985349634, + "grad_norm": 2.5204710960388184, + "learning_rate": 6.385692621448479e-06, + "loss": 0.8142, + "step": 52474 + }, + { + "epoch": 0.655966399159979, + "grad_norm": 3.986133098602295, + "learning_rate": 6.384878941019688e-06, + "loss": 1.6616, + "step": 52476 + }, + { + "epoch": 0.6559913997849947, + "grad_norm": 2.263864755630493, + "learning_rate": 6.384065288123147e-06, + "loss": 0.5942, + "step": 52478 + }, + { + "epoch": 0.6560164004100103, + "grad_norm": 2.8649330139160156, + "learning_rate": 6.3832516627650534e-06, + "loss": 0.6354, + "step": 52480 + }, + { + "epoch": 0.6560414010350258, + "grad_norm": 4.225584983825684, + "learning_rate": 6.382438064951606e-06, + "loss": 1.8742, + "step": 52482 + }, + { + "epoch": 0.6560664016600415, + "grad_norm": 4.034264087677002, + "learning_rate": 6.381624494688996e-06, + "loss": 0.7859, + "step": 52484 + }, + { + "epoch": 0.6560914022850571, + "grad_norm": 2.856220006942749, + "learning_rate": 6.380810951983423e-06, + "loss": 1.6423, + "step": 52486 + }, + { + "epoch": 0.6561164029100728, + "grad_norm": 0.0034857538994401693, + "learning_rate": 6.379997436841081e-06, + "loss": 0.0606, + "step": 52488 + }, + { + "epoch": 0.6561414035350884, + "grad_norm": 0.003978375345468521, + "learning_rate": 6.379183949268167e-06, + "loss": 0.4683, + "step": 52490 + }, + { + "epoch": 0.656166404160104, + "grad_norm": 4.072356700897217, + "learning_rate": 6.378370489270877e-06, + "loss": 1.579, + "step": 52492 + }, + { + "epoch": 0.6561914047851196, + "grad_norm": 1.5083986520767212, + "learning_rate": 6.377557056855403e-06, + "loss": 0.4701, + "step": 52494 + }, + { + "epoch": 0.6562164054101353, + "grad_norm": 4.614348411560059, + "learning_rate": 6.376743652027947e-06, + "loss": 1.6085, + "step": 52496 + }, + { + "epoch": 0.6562414060351509, + "grad_norm": 0.260712593793869, + "learning_rate": 6.375930274794695e-06, + "loss": 0.9235, + "step": 52498 + }, + { + "epoch": 0.6562664066601666, + "grad_norm": 1.8212461471557617, + "learning_rate": 6.3751169251618464e-06, + "loss": 0.6561, + "step": 52500 + }, + { + "epoch": 0.6562914072851821, + "grad_norm": 3.313392400741577, + "learning_rate": 6.3743036031355945e-06, + "loss": 0.4821, + "step": 52502 + }, + { + "epoch": 0.6563164079101977, + "grad_norm": 0.7486669421195984, + "learning_rate": 6.373490308722134e-06, + "loss": 0.5958, + "step": 52504 + }, + { + "epoch": 0.6563414085352134, + "grad_norm": 2.0376064777374268, + "learning_rate": 6.372677041927657e-06, + "loss": 0.9083, + "step": 52506 + }, + { + "epoch": 0.656366409160229, + "grad_norm": 4.107186317443848, + "learning_rate": 6.371863802758363e-06, + "loss": 1.3899, + "step": 52508 + }, + { + "epoch": 0.6563914097852447, + "grad_norm": 1.1455905437469482, + "learning_rate": 6.371050591220438e-06, + "loss": 0.1571, + "step": 52510 + }, + { + "epoch": 0.6564164104102602, + "grad_norm": 3.192554235458374, + "learning_rate": 6.370237407320078e-06, + "loss": 2.0835, + "step": 52512 + }, + { + "epoch": 0.6564414110352759, + "grad_norm": 8.964424133300781, + "learning_rate": 6.369424251063476e-06, + "loss": 1.6795, + "step": 52514 + }, + { + "epoch": 0.6564664116602915, + "grad_norm": 1.1642576456069946, + "learning_rate": 6.368611122456827e-06, + "loss": 0.5557, + "step": 52516 + }, + { + "epoch": 0.6564914122853072, + "grad_norm": 3.3986434936523438, + "learning_rate": 6.367798021506322e-06, + "loss": 0.5178, + "step": 52518 + }, + { + "epoch": 0.6565164129103228, + "grad_norm": 0.0008018335211090744, + "learning_rate": 6.366984948218151e-06, + "loss": 0.5484, + "step": 52520 + }, + { + "epoch": 0.6565414135353383, + "grad_norm": 3.3834190368652344, + "learning_rate": 6.366171902598517e-06, + "loss": 0.9216, + "step": 52522 + }, + { + "epoch": 0.656566414160354, + "grad_norm": 4.217076301574707, + "learning_rate": 6.365358884653597e-06, + "loss": 0.2906, + "step": 52524 + }, + { + "epoch": 0.6565914147853696, + "grad_norm": 2.5943808555603027, + "learning_rate": 6.364545894389591e-06, + "loss": 0.9009, + "step": 52526 + }, + { + "epoch": 0.6566164154103853, + "grad_norm": 4.411833763122559, + "learning_rate": 6.36373293181269e-06, + "loss": 1.0411, + "step": 52528 + }, + { + "epoch": 0.6566414160354009, + "grad_norm": 0.9889907240867615, + "learning_rate": 6.362919996929085e-06, + "loss": 0.6797, + "step": 52530 + }, + { + "epoch": 0.6566664166604165, + "grad_norm": 9.644618034362793, + "learning_rate": 6.362107089744962e-06, + "loss": 0.9112, + "step": 52532 + }, + { + "epoch": 0.6566914172854321, + "grad_norm": 2.633297920227051, + "learning_rate": 6.361294210266527e-06, + "loss": 0.7314, + "step": 52534 + }, + { + "epoch": 0.6567164179104478, + "grad_norm": 3.908306837081909, + "learning_rate": 6.360481358499953e-06, + "loss": 0.8302, + "step": 52536 + }, + { + "epoch": 0.6567414185354634, + "grad_norm": 0.006377893965691328, + "learning_rate": 6.3596685344514405e-06, + "loss": 0.0029, + "step": 52538 + }, + { + "epoch": 0.6567664191604791, + "grad_norm": 2.51308536529541, + "learning_rate": 6.358855738127177e-06, + "loss": 0.9239, + "step": 52540 + }, + { + "epoch": 0.6567914197854946, + "grad_norm": 3.961792469024658, + "learning_rate": 6.358042969533353e-06, + "loss": 1.028, + "step": 52542 + }, + { + "epoch": 0.6568164204105102, + "grad_norm": 4.463066577911377, + "learning_rate": 6.3572302286761574e-06, + "loss": 1.5955, + "step": 52544 + }, + { + "epoch": 0.6568414210355259, + "grad_norm": 3.581282615661621, + "learning_rate": 6.3564175155617835e-06, + "loss": 0.3922, + "step": 52546 + }, + { + "epoch": 0.6568664216605415, + "grad_norm": 4.527950763702393, + "learning_rate": 6.355604830196422e-06, + "loss": 1.0173, + "step": 52548 + }, + { + "epoch": 0.6568914222855572, + "grad_norm": 12.721724510192871, + "learning_rate": 6.354792172586255e-06, + "loss": 1.0138, + "step": 52550 + }, + { + "epoch": 0.6569164229105727, + "grad_norm": 0.014554787427186966, + "learning_rate": 6.353979542737476e-06, + "loss": 0.7395, + "step": 52552 + }, + { + "epoch": 0.6569414235355884, + "grad_norm": 0.0009211825090460479, + "learning_rate": 6.353166940656272e-06, + "loss": 0.2726, + "step": 52554 + }, + { + "epoch": 0.656966424160604, + "grad_norm": 2.38303542137146, + "learning_rate": 6.352354366348832e-06, + "loss": 0.7491, + "step": 52556 + }, + { + "epoch": 0.6569914247856197, + "grad_norm": 3.6165740489959717, + "learning_rate": 6.351541819821345e-06, + "loss": 0.9004, + "step": 52558 + }, + { + "epoch": 0.6570164254106353, + "grad_norm": 8.126825332641602, + "learning_rate": 6.350729301080005e-06, + "loss": 0.5086, + "step": 52560 + }, + { + "epoch": 0.6570414260356509, + "grad_norm": 2.6761417388916016, + "learning_rate": 6.349916810130992e-06, + "loss": 0.7807, + "step": 52562 + }, + { + "epoch": 0.6570664266606665, + "grad_norm": 2.211165189743042, + "learning_rate": 6.349104346980495e-06, + "loss": 0.5176, + "step": 52564 + }, + { + "epoch": 0.6570914272856822, + "grad_norm": 0.008016769774258137, + "learning_rate": 6.348291911634703e-06, + "loss": 0.0002, + "step": 52566 + }, + { + "epoch": 0.6571164279106978, + "grad_norm": 3.185573101043701, + "learning_rate": 6.347479504099804e-06, + "loss": 0.6755, + "step": 52568 + }, + { + "epoch": 0.6571414285357134, + "grad_norm": 2.7442662715911865, + "learning_rate": 6.3466671243819825e-06, + "loss": 0.0944, + "step": 52570 + }, + { + "epoch": 0.657166429160729, + "grad_norm": 0.0016944914823397994, + "learning_rate": 6.345854772487429e-06, + "loss": 0.0, + "step": 52572 + }, + { + "epoch": 0.6571914297857446, + "grad_norm": 2.5447418689727783, + "learning_rate": 6.345042448422332e-06, + "loss": 0.032, + "step": 52574 + }, + { + "epoch": 0.6572164304107603, + "grad_norm": 0.003826880594715476, + "learning_rate": 6.344230152192871e-06, + "loss": 0.3728, + "step": 52576 + }, + { + "epoch": 0.6572414310357759, + "grad_norm": 3.009702205657959, + "learning_rate": 6.3434178838052385e-06, + "loss": 0.6694, + "step": 52578 + }, + { + "epoch": 0.6572664316607916, + "grad_norm": 0.006660505663603544, + "learning_rate": 6.342605643265617e-06, + "loss": 0.0004, + "step": 52580 + }, + { + "epoch": 0.6572914322858071, + "grad_norm": 3.0594005584716797, + "learning_rate": 6.341793430580192e-06, + "loss": 1.366, + "step": 52582 + }, + { + "epoch": 0.6573164329108228, + "grad_norm": 3.4488766193389893, + "learning_rate": 6.3409812457551515e-06, + "loss": 0.3907, + "step": 52584 + }, + { + "epoch": 0.6573414335358384, + "grad_norm": 4.8609938621521, + "learning_rate": 6.3401690887966815e-06, + "loss": 0.716, + "step": 52586 + }, + { + "epoch": 0.657366434160854, + "grad_norm": 3.716092109680176, + "learning_rate": 6.339356959710971e-06, + "loss": 0.974, + "step": 52588 + }, + { + "epoch": 0.6573914347858697, + "grad_norm": 4.587074279785156, + "learning_rate": 6.338544858504194e-06, + "loss": 1.2183, + "step": 52590 + }, + { + "epoch": 0.6574164354108852, + "grad_norm": 2.0561635494232178, + "learning_rate": 6.337732785182544e-06, + "loss": 1.0681, + "step": 52592 + }, + { + "epoch": 0.6574414360359009, + "grad_norm": 2.7998225688934326, + "learning_rate": 6.336920739752199e-06, + "loss": 0.576, + "step": 52594 + }, + { + "epoch": 0.6574664366609165, + "grad_norm": 11.270357131958008, + "learning_rate": 6.336108722219352e-06, + "loss": 1.3093, + "step": 52596 + }, + { + "epoch": 0.6574914372859322, + "grad_norm": 6.772453308105469, + "learning_rate": 6.335296732590182e-06, + "loss": 1.4314, + "step": 52598 + }, + { + "epoch": 0.6575164379109478, + "grad_norm": 4.083078384399414, + "learning_rate": 6.334484770870877e-06, + "loss": 2.1983, + "step": 52600 + }, + { + "epoch": 0.6575414385359634, + "grad_norm": 3.471750259399414, + "learning_rate": 6.333672837067614e-06, + "loss": 1.0337, + "step": 52602 + }, + { + "epoch": 0.657566439160979, + "grad_norm": 5.373144626617432, + "learning_rate": 6.33286093118658e-06, + "loss": 1.1546, + "step": 52604 + }, + { + "epoch": 0.6575914397859947, + "grad_norm": 2.8680953979492188, + "learning_rate": 6.332049053233958e-06, + "loss": 1.2602, + "step": 52606 + }, + { + "epoch": 0.6576164404110103, + "grad_norm": 4.717438697814941, + "learning_rate": 6.3312372032159316e-06, + "loss": 0.8593, + "step": 52608 + }, + { + "epoch": 0.657641441036026, + "grad_norm": 4.610990524291992, + "learning_rate": 6.330425381138685e-06, + "loss": 1.2916, + "step": 52610 + }, + { + "epoch": 0.6576664416610415, + "grad_norm": 1.0317060947418213, + "learning_rate": 6.329613587008399e-06, + "loss": 0.3684, + "step": 52612 + }, + { + "epoch": 0.6576914422860571, + "grad_norm": 5.905801296234131, + "learning_rate": 6.328801820831262e-06, + "loss": 1.9731, + "step": 52614 + }, + { + "epoch": 0.6577164429110728, + "grad_norm": 1.939900279045105, + "learning_rate": 6.327990082613446e-06, + "loss": 0.5273, + "step": 52616 + }, + { + "epoch": 0.6577414435360884, + "grad_norm": 2.689037799835205, + "learning_rate": 6.327178372361138e-06, + "loss": 1.3761, + "step": 52618 + }, + { + "epoch": 0.6577664441611041, + "grad_norm": 0.0011508796596899629, + "learning_rate": 6.326366690080521e-06, + "loss": 0.365, + "step": 52620 + }, + { + "epoch": 0.6577914447861196, + "grad_norm": 6.442606449127197, + "learning_rate": 6.325555035777775e-06, + "loss": 1.6836, + "step": 52622 + }, + { + "epoch": 0.6578164454111353, + "grad_norm": 2.834212064743042, + "learning_rate": 6.324743409459084e-06, + "loss": 1.6728, + "step": 52624 + }, + { + "epoch": 0.6578414460361509, + "grad_norm": 0.1054079681634903, + "learning_rate": 6.32393181113063e-06, + "loss": 0.0012, + "step": 52626 + }, + { + "epoch": 0.6578664466611666, + "grad_norm": 2.009925127029419, + "learning_rate": 6.323120240798588e-06, + "loss": 1.3212, + "step": 52628 + }, + { + "epoch": 0.6578914472861822, + "grad_norm": 2.7620270252227783, + "learning_rate": 6.322308698469141e-06, + "loss": 0.9413, + "step": 52630 + }, + { + "epoch": 0.6579164479111977, + "grad_norm": 6.136059284210205, + "learning_rate": 6.321497184148472e-06, + "loss": 0.9988, + "step": 52632 + }, + { + "epoch": 0.6579414485362134, + "grad_norm": 0.0006850529462099075, + "learning_rate": 6.32068569784276e-06, + "loss": 0.1611, + "step": 52634 + }, + { + "epoch": 0.657966449161229, + "grad_norm": 2.5062522888183594, + "learning_rate": 6.319874239558186e-06, + "loss": 1.0609, + "step": 52636 + }, + { + "epoch": 0.6579914497862447, + "grad_norm": 3.8172872066497803, + "learning_rate": 6.319062809300929e-06, + "loss": 0.3097, + "step": 52638 + }, + { + "epoch": 0.6580164504112603, + "grad_norm": 5.206635475158691, + "learning_rate": 6.318251407077174e-06, + "loss": 0.699, + "step": 52640 + }, + { + "epoch": 0.6580414510362759, + "grad_norm": 8.071921348571777, + "learning_rate": 6.317440032893091e-06, + "loss": 1.5681, + "step": 52642 + }, + { + "epoch": 0.6580664516612915, + "grad_norm": 0.0003381235874257982, + "learning_rate": 6.316628686754862e-06, + "loss": 0.7974, + "step": 52644 + }, + { + "epoch": 0.6580914522863072, + "grad_norm": 1.851320743560791, + "learning_rate": 6.31581736866867e-06, + "loss": 1.2927, + "step": 52646 + }, + { + "epoch": 0.6581164529113228, + "grad_norm": 2.565155506134033, + "learning_rate": 6.3150060786406906e-06, + "loss": 0.5345, + "step": 52648 + }, + { + "epoch": 0.6581414535363385, + "grad_norm": 2.7282938957214355, + "learning_rate": 6.314194816677106e-06, + "loss": 0.5547, + "step": 52650 + }, + { + "epoch": 0.658166454161354, + "grad_norm": 4.298083782196045, + "learning_rate": 6.313383582784096e-06, + "loss": 0.6283, + "step": 52652 + }, + { + "epoch": 0.6581914547863696, + "grad_norm": 5.201251983642578, + "learning_rate": 6.31257237696783e-06, + "loss": 2.0758, + "step": 52654 + }, + { + "epoch": 0.6582164554113853, + "grad_norm": 0.20649416744709015, + "learning_rate": 6.311761199234491e-06, + "loss": 0.0987, + "step": 52656 + }, + { + "epoch": 0.6582414560364009, + "grad_norm": 3.590481996536255, + "learning_rate": 6.310950049590259e-06, + "loss": 0.8383, + "step": 52658 + }, + { + "epoch": 0.6582664566614166, + "grad_norm": 2.263430118560791, + "learning_rate": 6.310138928041309e-06, + "loss": 0.7801, + "step": 52660 + }, + { + "epoch": 0.6582914572864321, + "grad_norm": 0.6065496206283569, + "learning_rate": 6.309327834593819e-06, + "loss": 0.6169, + "step": 52662 + }, + { + "epoch": 0.6583164579114478, + "grad_norm": 1.5541244745254517, + "learning_rate": 6.308516769253966e-06, + "loss": 0.5708, + "step": 52664 + }, + { + "epoch": 0.6583414585364634, + "grad_norm": 7.8342695236206055, + "learning_rate": 6.307705732027931e-06, + "loss": 0.432, + "step": 52666 + }, + { + "epoch": 0.6583664591614791, + "grad_norm": 0.7691400051116943, + "learning_rate": 6.3068947229218825e-06, + "loss": 0.072, + "step": 52668 + }, + { + "epoch": 0.6583914597864947, + "grad_norm": 1.9584952592849731, + "learning_rate": 6.306083741942003e-06, + "loss": 0.8973, + "step": 52670 + }, + { + "epoch": 0.6584164604115103, + "grad_norm": 0.029268065467476845, + "learning_rate": 6.305272789094468e-06, + "loss": 0.1511, + "step": 52672 + }, + { + "epoch": 0.6584414610365259, + "grad_norm": 2.8913075923919678, + "learning_rate": 6.304461864385452e-06, + "loss": 0.4825, + "step": 52674 + }, + { + "epoch": 0.6584664616615415, + "grad_norm": 2.552009105682373, + "learning_rate": 6.303650967821131e-06, + "loss": 1.3586, + "step": 52676 + }, + { + "epoch": 0.6584914622865572, + "grad_norm": 2.486565351486206, + "learning_rate": 6.302840099407686e-06, + "loss": 0.6875, + "step": 52678 + }, + { + "epoch": 0.6585164629115728, + "grad_norm": 2.249427318572998, + "learning_rate": 6.302029259151283e-06, + "loss": 0.3047, + "step": 52680 + }, + { + "epoch": 0.6585414635365884, + "grad_norm": 8.46056842803955, + "learning_rate": 6.3012184470581035e-06, + "loss": 1.0832, + "step": 52682 + }, + { + "epoch": 0.658566464161604, + "grad_norm": 3.910297393798828, + "learning_rate": 6.300407663134323e-06, + "loss": 1.6207, + "step": 52684 + }, + { + "epoch": 0.6585914647866197, + "grad_norm": 6.0566229820251465, + "learning_rate": 6.299596907386112e-06, + "loss": 1.6729, + "step": 52686 + }, + { + "epoch": 0.6586164654116353, + "grad_norm": 3.299781084060669, + "learning_rate": 6.298786179819649e-06, + "loss": 1.635, + "step": 52688 + }, + { + "epoch": 0.658641466036651, + "grad_norm": 3.850148916244507, + "learning_rate": 6.2979754804411065e-06, + "loss": 1.5582, + "step": 52690 + }, + { + "epoch": 0.6586664666616665, + "grad_norm": 4.190089702606201, + "learning_rate": 6.297164809256661e-06, + "loss": 1.3288, + "step": 52692 + }, + { + "epoch": 0.6586914672866822, + "grad_norm": 4.210469722747803, + "learning_rate": 6.296354166272483e-06, + "loss": 1.4085, + "step": 52694 + }, + { + "epoch": 0.6587164679116978, + "grad_norm": 2.640547513961792, + "learning_rate": 6.29554355149475e-06, + "loss": 1.1135, + "step": 52696 + }, + { + "epoch": 0.6587414685367134, + "grad_norm": 0.0032072626054286957, + "learning_rate": 6.29473296492963e-06, + "loss": 1.1109, + "step": 52698 + }, + { + "epoch": 0.6587664691617291, + "grad_norm": 0.0005414271727204323, + "learning_rate": 6.293922406583302e-06, + "loss": 0.4806, + "step": 52700 + }, + { + "epoch": 0.6587914697867446, + "grad_norm": 2.315666437149048, + "learning_rate": 6.293111876461936e-06, + "loss": 0.707, + "step": 52702 + }, + { + "epoch": 0.6588164704117603, + "grad_norm": 2.4565234184265137, + "learning_rate": 6.292301374571707e-06, + "loss": 1.5129, + "step": 52704 + }, + { + "epoch": 0.6588414710367759, + "grad_norm": 2.4150302410125732, + "learning_rate": 6.291490900918786e-06, + "loss": 0.9456, + "step": 52706 + }, + { + "epoch": 0.6588664716617916, + "grad_norm": 3.8582956790924072, + "learning_rate": 6.290680455509346e-06, + "loss": 2.0234, + "step": 52708 + }, + { + "epoch": 0.6588914722868072, + "grad_norm": 0.0008103155996650457, + "learning_rate": 6.289870038349557e-06, + "loss": 0.6209, + "step": 52710 + }, + { + "epoch": 0.6589164729118228, + "grad_norm": 2.71890926361084, + "learning_rate": 6.2890596494455945e-06, + "loss": 1.4743, + "step": 52712 + }, + { + "epoch": 0.6589414735368384, + "grad_norm": 0.001851518522016704, + "learning_rate": 6.2882492888036296e-06, + "loss": 0.4468, + "step": 52714 + }, + { + "epoch": 0.658966474161854, + "grad_norm": 1.5326772928237915, + "learning_rate": 6.287438956429832e-06, + "loss": 0.4604, + "step": 52716 + }, + { + "epoch": 0.6589914747868697, + "grad_norm": 2.002431869506836, + "learning_rate": 6.286628652330378e-06, + "loss": 0.0468, + "step": 52718 + }, + { + "epoch": 0.6590164754118853, + "grad_norm": 0.004400639794766903, + "learning_rate": 6.2858183765114315e-06, + "loss": 0.0001, + "step": 52720 + }, + { + "epoch": 0.6590414760369009, + "grad_norm": 3.3695833683013916, + "learning_rate": 6.2850081289791665e-06, + "loss": 1.1028, + "step": 52722 + }, + { + "epoch": 0.6590664766619165, + "grad_norm": 6.442490577697754, + "learning_rate": 6.284197909739755e-06, + "loss": 2.2775, + "step": 52724 + }, + { + "epoch": 0.6590914772869322, + "grad_norm": 1.010140299797058, + "learning_rate": 6.283387718799367e-06, + "loss": 0.4175, + "step": 52726 + }, + { + "epoch": 0.6591164779119478, + "grad_norm": 1.4846789836883545, + "learning_rate": 6.282577556164174e-06, + "loss": 0.0483, + "step": 52728 + }, + { + "epoch": 0.6591414785369635, + "grad_norm": 5.594419956207275, + "learning_rate": 6.281767421840343e-06, + "loss": 1.8946, + "step": 52730 + }, + { + "epoch": 0.659166479161979, + "grad_norm": 13.693339347839355, + "learning_rate": 6.280957315834046e-06, + "loss": 0.7973, + "step": 52732 + }, + { + "epoch": 0.6591914797869947, + "grad_norm": 0.0007711461512371898, + "learning_rate": 6.280147238151453e-06, + "loss": 0.3667, + "step": 52734 + }, + { + "epoch": 0.6592164804120103, + "grad_norm": 3.159313917160034, + "learning_rate": 6.279337188798731e-06, + "loss": 0.2618, + "step": 52736 + }, + { + "epoch": 0.659241481037026, + "grad_norm": 1.0169281959533691, + "learning_rate": 6.278527167782051e-06, + "loss": 0.6407, + "step": 52738 + }, + { + "epoch": 0.6592664816620416, + "grad_norm": 4.753260612487793, + "learning_rate": 6.277717175107582e-06, + "loss": 1.2371, + "step": 52740 + }, + { + "epoch": 0.6592914822870571, + "grad_norm": 3.7087972164154053, + "learning_rate": 6.2769072107814934e-06, + "loss": 1.0593, + "step": 52742 + }, + { + "epoch": 0.6593164829120728, + "grad_norm": 0.42147088050842285, + "learning_rate": 6.276097274809953e-06, + "loss": 0.5187, + "step": 52744 + }, + { + "epoch": 0.6593414835370884, + "grad_norm": 0.0025446403305977583, + "learning_rate": 6.2752873671991284e-06, + "loss": 0.1177, + "step": 52746 + }, + { + "epoch": 0.6593664841621041, + "grad_norm": 4.069004535675049, + "learning_rate": 6.274477487955188e-06, + "loss": 1.4769, + "step": 52748 + }, + { + "epoch": 0.6593914847871197, + "grad_norm": 2.6194069385528564, + "learning_rate": 6.273667637084302e-06, + "loss": 0.7888, + "step": 52750 + }, + { + "epoch": 0.6594164854121353, + "grad_norm": 2.321793556213379, + "learning_rate": 6.272857814592634e-06, + "loss": 0.3735, + "step": 52752 + }, + { + "epoch": 0.6594414860371509, + "grad_norm": 0.0003818320401478559, + "learning_rate": 6.272048020486356e-06, + "loss": 0.3544, + "step": 52754 + }, + { + "epoch": 0.6594664866621666, + "grad_norm": 4.233593463897705, + "learning_rate": 6.271238254771633e-06, + "loss": 0.985, + "step": 52756 + }, + { + "epoch": 0.6594914872871822, + "grad_norm": 4.1778035163879395, + "learning_rate": 6.270428517454632e-06, + "loss": 1.3817, + "step": 52758 + }, + { + "epoch": 0.6595164879121979, + "grad_norm": 0.0017260037129744887, + "learning_rate": 6.269618808541518e-06, + "loss": 0.6702, + "step": 52760 + }, + { + "epoch": 0.6595414885372134, + "grad_norm": 2.7909674644470215, + "learning_rate": 6.268809128038461e-06, + "loss": 0.8441, + "step": 52762 + }, + { + "epoch": 0.659566489162229, + "grad_norm": 6.60181999206543, + "learning_rate": 6.267999475951625e-06, + "loss": 1.4537, + "step": 52764 + }, + { + "epoch": 0.6595914897872447, + "grad_norm": 2.285113573074341, + "learning_rate": 6.267189852287179e-06, + "loss": 0.6516, + "step": 52766 + }, + { + "epoch": 0.6596164904122603, + "grad_norm": 0.0009723871480673552, + "learning_rate": 6.266380257051286e-06, + "loss": 0.6594, + "step": 52768 + }, + { + "epoch": 0.659641491037276, + "grad_norm": 2.7927799224853516, + "learning_rate": 6.265570690250115e-06, + "loss": 1.2396, + "step": 52770 + }, + { + "epoch": 0.6596664916622915, + "grad_norm": 7.385164737701416, + "learning_rate": 6.2647611518898275e-06, + "loss": 1.0818, + "step": 52772 + }, + { + "epoch": 0.6596914922873072, + "grad_norm": 3.1594254970550537, + "learning_rate": 6.263951641976591e-06, + "loss": 1.1934, + "step": 52774 + }, + { + "epoch": 0.6597164929123228, + "grad_norm": 3.639126777648926, + "learning_rate": 6.263142160516571e-06, + "loss": 0.5911, + "step": 52776 + }, + { + "epoch": 0.6597414935373385, + "grad_norm": 4.80533504486084, + "learning_rate": 6.2623327075159315e-06, + "loss": 2.2514, + "step": 52778 + }, + { + "epoch": 0.6597664941623541, + "grad_norm": 2.9961695671081543, + "learning_rate": 6.261523282980837e-06, + "loss": 0.5415, + "step": 52780 + }, + { + "epoch": 0.6597914947873696, + "grad_norm": 4.976912021636963, + "learning_rate": 6.260713886917456e-06, + "loss": 0.8144, + "step": 52782 + }, + { + "epoch": 0.6598164954123853, + "grad_norm": 10.904715538024902, + "learning_rate": 6.259904519331945e-06, + "loss": 0.6183, + "step": 52784 + }, + { + "epoch": 0.6598414960374009, + "grad_norm": 3.943173408508301, + "learning_rate": 6.259095180230474e-06, + "loss": 1.6046, + "step": 52786 + }, + { + "epoch": 0.6598664966624166, + "grad_norm": 2.897310733795166, + "learning_rate": 6.258285869619205e-06, + "loss": 1.2343, + "step": 52788 + }, + { + "epoch": 0.6598914972874322, + "grad_norm": 6.321012020111084, + "learning_rate": 6.257476587504301e-06, + "loss": 0.66, + "step": 52790 + }, + { + "epoch": 0.6599164979124478, + "grad_norm": 4.396327018737793, + "learning_rate": 6.256667333891926e-06, + "loss": 1.0981, + "step": 52792 + }, + { + "epoch": 0.6599414985374634, + "grad_norm": 0.008082942105829716, + "learning_rate": 6.255858108788244e-06, + "loss": 0.0817, + "step": 52794 + }, + { + "epoch": 0.6599664991624791, + "grad_norm": 0.0009657082846388221, + "learning_rate": 6.2550489121994185e-06, + "loss": 0.1456, + "step": 52796 + }, + { + "epoch": 0.6599914997874947, + "grad_norm": 2.7309422492980957, + "learning_rate": 6.25423974413161e-06, + "loss": 0.8361, + "step": 52798 + }, + { + "epoch": 0.6600165004125104, + "grad_norm": 3.906963348388672, + "learning_rate": 6.25343060459098e-06, + "loss": 1.0413, + "step": 52800 + }, + { + "epoch": 0.6600415010375259, + "grad_norm": 0.0012122952612116933, + "learning_rate": 6.252621493583694e-06, + "loss": 0.0, + "step": 52802 + }, + { + "epoch": 0.6600665016625415, + "grad_norm": 5.394120693206787, + "learning_rate": 6.251812411115912e-06, + "loss": 0.6843, + "step": 52804 + }, + { + "epoch": 0.6600915022875572, + "grad_norm": 1.7497458457946777, + "learning_rate": 6.251003357193797e-06, + "loss": 1.0989, + "step": 52806 + }, + { + "epoch": 0.6601165029125728, + "grad_norm": 2.664964199066162, + "learning_rate": 6.250194331823514e-06, + "loss": 0.966, + "step": 52808 + }, + { + "epoch": 0.6601415035375885, + "grad_norm": 7.638486862182617, + "learning_rate": 6.249385335011215e-06, + "loss": 1.7437, + "step": 52810 + }, + { + "epoch": 0.660166504162604, + "grad_norm": 3.414118528366089, + "learning_rate": 6.248576366763071e-06, + "loss": 0.4534, + "step": 52812 + }, + { + "epoch": 0.6601915047876197, + "grad_norm": 0.0026766222435981035, + "learning_rate": 6.247767427085236e-06, + "loss": 0.9686, + "step": 52814 + }, + { + "epoch": 0.6602165054126353, + "grad_norm": 0.9068687558174133, + "learning_rate": 6.2469585159838745e-06, + "loss": 0.6332, + "step": 52816 + }, + { + "epoch": 0.660241506037651, + "grad_norm": 12.16664981842041, + "learning_rate": 6.246149633465146e-06, + "loss": 0.5239, + "step": 52818 + }, + { + "epoch": 0.6602665066626666, + "grad_norm": 3.1067938804626465, + "learning_rate": 6.245340779535211e-06, + "loss": 1.2637, + "step": 52820 + }, + { + "epoch": 0.6602915072876822, + "grad_norm": 3.8525636196136475, + "learning_rate": 6.244531954200232e-06, + "loss": 1.1235, + "step": 52822 + }, + { + "epoch": 0.6603165079126978, + "grad_norm": 2.259442090988159, + "learning_rate": 6.243723157466366e-06, + "loss": 0.6278, + "step": 52824 + }, + { + "epoch": 0.6603415085377135, + "grad_norm": 1.9965165853500366, + "learning_rate": 6.2429143893397716e-06, + "loss": 0.7624, + "step": 52826 + }, + { + "epoch": 0.6603665091627291, + "grad_norm": 3.747821092605591, + "learning_rate": 6.24210564982661e-06, + "loss": 1.8775, + "step": 52828 + }, + { + "epoch": 0.6603915097877447, + "grad_norm": 3.4527385234832764, + "learning_rate": 6.24129693893304e-06, + "loss": 0.8216, + "step": 52830 + }, + { + "epoch": 0.6604165104127603, + "grad_norm": 2.287506103515625, + "learning_rate": 6.240488256665223e-06, + "loss": 0.3543, + "step": 52832 + }, + { + "epoch": 0.6604415110377759, + "grad_norm": 4.962436676025391, + "learning_rate": 6.239679603029316e-06, + "loss": 0.9216, + "step": 52834 + }, + { + "epoch": 0.6604665116627916, + "grad_norm": 5.09795618057251, + "learning_rate": 6.238870978031475e-06, + "loss": 1.5928, + "step": 52836 + }, + { + "epoch": 0.6604915122878072, + "grad_norm": 2.657170057296753, + "learning_rate": 6.23806238167786e-06, + "loss": 0.669, + "step": 52838 + }, + { + "epoch": 0.6605165129128229, + "grad_norm": 4.848464488983154, + "learning_rate": 6.237253813974633e-06, + "loss": 1.2897, + "step": 52840 + }, + { + "epoch": 0.6605415135378384, + "grad_norm": 2.1824874877929688, + "learning_rate": 6.236445274927947e-06, + "loss": 0.8019, + "step": 52842 + }, + { + "epoch": 0.6605665141628541, + "grad_norm": 4.853454113006592, + "learning_rate": 6.235636764543962e-06, + "loss": 0.7891, + "step": 52844 + }, + { + "epoch": 0.6605915147878697, + "grad_norm": 4.441528797149658, + "learning_rate": 6.234828282828835e-06, + "loss": 1.4506, + "step": 52846 + }, + { + "epoch": 0.6606165154128854, + "grad_norm": 3.233823776245117, + "learning_rate": 6.234019829788725e-06, + "loss": 1.4485, + "step": 52848 + }, + { + "epoch": 0.660641516037901, + "grad_norm": 2.8514325618743896, + "learning_rate": 6.233211405429785e-06, + "loss": 1.2072, + "step": 52850 + }, + { + "epoch": 0.6606665166629165, + "grad_norm": 3.160888671875, + "learning_rate": 6.232403009758174e-06, + "loss": 0.8557, + "step": 52852 + }, + { + "epoch": 0.6606915172879322, + "grad_norm": 3.36147141456604, + "learning_rate": 6.231594642780051e-06, + "loss": 1.2759, + "step": 52854 + }, + { + "epoch": 0.6607165179129478, + "grad_norm": 3.2369728088378906, + "learning_rate": 6.23078630450157e-06, + "loss": 0.7827, + "step": 52856 + }, + { + "epoch": 0.6607415185379635, + "grad_norm": 2.364539384841919, + "learning_rate": 6.229977994928886e-06, + "loss": 0.4008, + "step": 52858 + }, + { + "epoch": 0.6607665191629791, + "grad_norm": 2.54719877243042, + "learning_rate": 6.229169714068157e-06, + "loss": 1.3322, + "step": 52860 + }, + { + "epoch": 0.6607915197879947, + "grad_norm": 2.0931453704833984, + "learning_rate": 6.22836146192554e-06, + "loss": 0.3053, + "step": 52862 + }, + { + "epoch": 0.6608165204130103, + "grad_norm": 2.9963555335998535, + "learning_rate": 6.227553238507187e-06, + "loss": 0.287, + "step": 52864 + }, + { + "epoch": 0.660841521038026, + "grad_norm": 3.4434659481048584, + "learning_rate": 6.226745043819255e-06, + "loss": 0.677, + "step": 52866 + }, + { + "epoch": 0.6608665216630416, + "grad_norm": 6.666070938110352, + "learning_rate": 6.225936877867899e-06, + "loss": 1.2973, + "step": 52868 + }, + { + "epoch": 0.6608915222880573, + "grad_norm": 2.488572120666504, + "learning_rate": 6.225128740659274e-06, + "loss": 0.7798, + "step": 52870 + }, + { + "epoch": 0.6609165229130728, + "grad_norm": 4.759426593780518, + "learning_rate": 6.224320632199536e-06, + "loss": 0.4177, + "step": 52872 + }, + { + "epoch": 0.6609415235380884, + "grad_norm": 0.8349053263664246, + "learning_rate": 6.22351255249484e-06, + "loss": 0.0363, + "step": 52874 + }, + { + "epoch": 0.6609665241631041, + "grad_norm": 2.179741859436035, + "learning_rate": 6.222704501551334e-06, + "loss": 1.1746, + "step": 52876 + }, + { + "epoch": 0.6609915247881197, + "grad_norm": 0.45212194323539734, + "learning_rate": 6.221896479375178e-06, + "loss": 1.5538, + "step": 52878 + }, + { + "epoch": 0.6610165254131354, + "grad_norm": 1.452010154724121, + "learning_rate": 6.221088485972524e-06, + "loss": 0.3231, + "step": 52880 + }, + { + "epoch": 0.6610415260381509, + "grad_norm": 0.001050496124662459, + "learning_rate": 6.220280521349525e-06, + "loss": 0.1575, + "step": 52882 + }, + { + "epoch": 0.6610665266631666, + "grad_norm": 3.5737180709838867, + "learning_rate": 6.219472585512336e-06, + "loss": 0.1397, + "step": 52884 + }, + { + "epoch": 0.6610915272881822, + "grad_norm": 4.614834785461426, + "learning_rate": 6.218664678467109e-06, + "loss": 1.5747, + "step": 52886 + }, + { + "epoch": 0.6611165279131979, + "grad_norm": 0.0013270913623273373, + "learning_rate": 6.217856800219999e-06, + "loss": 0.5729, + "step": 52888 + }, + { + "epoch": 0.6611415285382135, + "grad_norm": 0.0011419325601309538, + "learning_rate": 6.217048950777155e-06, + "loss": 0.7306, + "step": 52890 + }, + { + "epoch": 0.661166529163229, + "grad_norm": 2.8227415084838867, + "learning_rate": 6.2162411301447315e-06, + "loss": 0.7262, + "step": 52892 + }, + { + "epoch": 0.6611915297882447, + "grad_norm": 6.078219413757324, + "learning_rate": 6.215433338328879e-06, + "loss": 0.6704, + "step": 52894 + }, + { + "epoch": 0.6612165304132603, + "grad_norm": 3.9025394916534424, + "learning_rate": 6.214625575335752e-06, + "loss": 1.0801, + "step": 52896 + }, + { + "epoch": 0.661241531038276, + "grad_norm": 6.9656805992126465, + "learning_rate": 6.213817841171501e-06, + "loss": 0.9763, + "step": 52898 + }, + { + "epoch": 0.6612665316632916, + "grad_norm": 3.715721845626831, + "learning_rate": 6.21301013584228e-06, + "loss": 1.197, + "step": 52900 + }, + { + "epoch": 0.6612915322883072, + "grad_norm": 3.5699663162231445, + "learning_rate": 6.212202459354238e-06, + "loss": 1.6921, + "step": 52902 + }, + { + "epoch": 0.6613165329133228, + "grad_norm": 3.462303876876831, + "learning_rate": 6.211394811713524e-06, + "loss": 0.9135, + "step": 52904 + }, + { + "epoch": 0.6613415335383385, + "grad_norm": 0.004337134771049023, + "learning_rate": 6.210587192926292e-06, + "loss": 0.001, + "step": 52906 + }, + { + "epoch": 0.6613665341633541, + "grad_norm": 7.895289897918701, + "learning_rate": 6.209779602998692e-06, + "loss": 0.6883, + "step": 52908 + }, + { + "epoch": 0.6613915347883698, + "grad_norm": 2.989208221435547, + "learning_rate": 6.208972041936875e-06, + "loss": 0.8922, + "step": 52910 + }, + { + "epoch": 0.6614165354133853, + "grad_norm": 1.2730640172958374, + "learning_rate": 6.208164509746988e-06, + "loss": 0.7783, + "step": 52912 + }, + { + "epoch": 0.661441536038401, + "grad_norm": 2.347116708755493, + "learning_rate": 6.2073570064351905e-06, + "loss": 0.8772, + "step": 52914 + }, + { + "epoch": 0.6614665366634166, + "grad_norm": 11.632835388183594, + "learning_rate": 6.206549532007622e-06, + "loss": 0.6658, + "step": 52916 + }, + { + "epoch": 0.6614915372884322, + "grad_norm": 3.8318140506744385, + "learning_rate": 6.205742086470436e-06, + "loss": 0.689, + "step": 52918 + }, + { + "epoch": 0.6615165379134479, + "grad_norm": 2.6376967430114746, + "learning_rate": 6.2049346698297805e-06, + "loss": 0.158, + "step": 52920 + }, + { + "epoch": 0.6615415385384634, + "grad_norm": 7.982534408569336, + "learning_rate": 6.2041272820918075e-06, + "loss": 1.9519, + "step": 52922 + }, + { + "epoch": 0.6615665391634791, + "grad_norm": 3.077540636062622, + "learning_rate": 6.203319923262661e-06, + "loss": 0.6791, + "step": 52924 + }, + { + "epoch": 0.6615915397884947, + "grad_norm": 3.968634605407715, + "learning_rate": 6.2025125933485e-06, + "loss": 0.908, + "step": 52926 + }, + { + "epoch": 0.6616165404135104, + "grad_norm": 7.1259684562683105, + "learning_rate": 6.201705292355462e-06, + "loss": 1.4938, + "step": 52928 + }, + { + "epoch": 0.661641541038526, + "grad_norm": 0.18878750503063202, + "learning_rate": 6.200898020289698e-06, + "loss": 1.0369, + "step": 52930 + }, + { + "epoch": 0.6616665416635416, + "grad_norm": 1.5628541707992554, + "learning_rate": 6.200090777157359e-06, + "loss": 0.7702, + "step": 52932 + }, + { + "epoch": 0.6616915422885572, + "grad_norm": 3.690645694732666, + "learning_rate": 6.1992835629645905e-06, + "loss": 0.3592, + "step": 52934 + }, + { + "epoch": 0.6617165429135728, + "grad_norm": 4.604020595550537, + "learning_rate": 6.19847637771754e-06, + "loss": 0.8712, + "step": 52936 + }, + { + "epoch": 0.6617415435385885, + "grad_norm": 3.851486921310425, + "learning_rate": 6.197669221422357e-06, + "loss": 0.7385, + "step": 52938 + }, + { + "epoch": 0.6617665441636041, + "grad_norm": 1.017431378364563, + "learning_rate": 6.196862094085191e-06, + "loss": 0.3712, + "step": 52940 + }, + { + "epoch": 0.6617915447886197, + "grad_norm": 5.205251216888428, + "learning_rate": 6.196054995712182e-06, + "loss": 1.5699, + "step": 52942 + }, + { + "epoch": 0.6618165454136353, + "grad_norm": 2.2544872760772705, + "learning_rate": 6.19524792630948e-06, + "loss": 0.8639, + "step": 52944 + }, + { + "epoch": 0.661841546038651, + "grad_norm": 3.9430863857269287, + "learning_rate": 6.194440885883233e-06, + "loss": 1.0435, + "step": 52946 + }, + { + "epoch": 0.6618665466636666, + "grad_norm": 3.412724733352661, + "learning_rate": 6.1936338744395814e-06, + "loss": 0.702, + "step": 52948 + }, + { + "epoch": 0.6618915472886823, + "grad_norm": 3.6554839611053467, + "learning_rate": 6.1928268919846805e-06, + "loss": 0.6657, + "step": 52950 + }, + { + "epoch": 0.6619165479136978, + "grad_norm": 2.864917755126953, + "learning_rate": 6.1920199385246745e-06, + "loss": 0.7619, + "step": 52952 + }, + { + "epoch": 0.6619415485387135, + "grad_norm": 3.660705089569092, + "learning_rate": 6.191213014065702e-06, + "loss": 0.8065, + "step": 52954 + }, + { + "epoch": 0.6619665491637291, + "grad_norm": 4.213589191436768, + "learning_rate": 6.190406118613913e-06, + "loss": 1.0988, + "step": 52956 + }, + { + "epoch": 0.6619915497887447, + "grad_norm": 2.2770960330963135, + "learning_rate": 6.189599252175452e-06, + "loss": 1.25, + "step": 52958 + }, + { + "epoch": 0.6620165504137604, + "grad_norm": 4.00986909866333, + "learning_rate": 6.1887924147564645e-06, + "loss": 0.9676, + "step": 52960 + }, + { + "epoch": 0.6620415510387759, + "grad_norm": 0.20637665688991547, + "learning_rate": 6.187985606363094e-06, + "loss": 0.0436, + "step": 52962 + }, + { + "epoch": 0.6620665516637916, + "grad_norm": 1.5018763542175293, + "learning_rate": 6.187178827001485e-06, + "loss": 0.9004, + "step": 52964 + }, + { + "epoch": 0.6620915522888072, + "grad_norm": 4.898282527923584, + "learning_rate": 6.18637207667779e-06, + "loss": 2.2187, + "step": 52966 + }, + { + "epoch": 0.6621165529138229, + "grad_norm": 0.0020004825200885534, + "learning_rate": 6.1855653553981405e-06, + "loss": 0.1805, + "step": 52968 + }, + { + "epoch": 0.6621415535388385, + "grad_norm": 4.165402412414551, + "learning_rate": 6.184758663168686e-06, + "loss": 1.825, + "step": 52970 + }, + { + "epoch": 0.6621665541638541, + "grad_norm": 3.346407890319824, + "learning_rate": 6.183951999995571e-06, + "loss": 0.9817, + "step": 52972 + }, + { + "epoch": 0.6621915547888697, + "grad_norm": 1.615642786026001, + "learning_rate": 6.1831453658849346e-06, + "loss": 0.3966, + "step": 52974 + }, + { + "epoch": 0.6622165554138854, + "grad_norm": 0.11253707855939865, + "learning_rate": 6.182338760842925e-06, + "loss": 0.0435, + "step": 52976 + }, + { + "epoch": 0.662241556038901, + "grad_norm": 3.4210946559906006, + "learning_rate": 6.18153218487569e-06, + "loss": 1.2622, + "step": 52978 + }, + { + "epoch": 0.6622665566639167, + "grad_norm": 3.1338436603546143, + "learning_rate": 6.1807256379893596e-06, + "loss": 0.9603, + "step": 52980 + }, + { + "epoch": 0.6622915572889322, + "grad_norm": 3.6069719791412354, + "learning_rate": 6.179919120190083e-06, + "loss": 2.0327, + "step": 52982 + }, + { + "epoch": 0.6623165579139478, + "grad_norm": 2.0928640365600586, + "learning_rate": 6.1791126314840035e-06, + "loss": 0.4273, + "step": 52984 + }, + { + "epoch": 0.6623415585389635, + "grad_norm": 2.025289535522461, + "learning_rate": 6.178306171877259e-06, + "loss": 0.0909, + "step": 52986 + }, + { + "epoch": 0.6623665591639791, + "grad_norm": 4.495223522186279, + "learning_rate": 6.177499741375997e-06, + "loss": 1.5407, + "step": 52988 + }, + { + "epoch": 0.6623915597889948, + "grad_norm": 5.415310382843018, + "learning_rate": 6.176693339986354e-06, + "loss": 1.8297, + "step": 52990 + }, + { + "epoch": 0.6624165604140103, + "grad_norm": 3.811042070388794, + "learning_rate": 6.175886967714479e-06, + "loss": 0.9204, + "step": 52992 + }, + { + "epoch": 0.662441561039026, + "grad_norm": 2.339874267578125, + "learning_rate": 6.175080624566505e-06, + "loss": 1.3375, + "step": 52994 + }, + { + "epoch": 0.6624665616640416, + "grad_norm": 5.591358661651611, + "learning_rate": 6.1742743105485755e-06, + "loss": 1.2658, + "step": 52996 + }, + { + "epoch": 0.6624915622890573, + "grad_norm": 3.365339517593384, + "learning_rate": 6.17346802566683e-06, + "loss": 0.7344, + "step": 52998 + }, + { + "epoch": 0.6625165629140729, + "grad_norm": 1.907726526260376, + "learning_rate": 6.172661769927413e-06, + "loss": 0.4315, + "step": 53000 + }, + { + "epoch": 0.6625415635390884, + "grad_norm": 1.0818161964416504, + "learning_rate": 6.171855543336462e-06, + "loss": 0.3036, + "step": 53002 + }, + { + "epoch": 0.6625665641641041, + "grad_norm": 0.7454864382743835, + "learning_rate": 6.171049345900122e-06, + "loss": 0.2664, + "step": 53004 + }, + { + "epoch": 0.6625915647891197, + "grad_norm": 2.2168612480163574, + "learning_rate": 6.170243177624527e-06, + "loss": 1.3509, + "step": 53006 + }, + { + "epoch": 0.6626165654141354, + "grad_norm": 2.8499603271484375, + "learning_rate": 6.169437038515816e-06, + "loss": 0.9177, + "step": 53008 + }, + { + "epoch": 0.662641566039151, + "grad_norm": 0.004533329047262669, + "learning_rate": 6.168630928580128e-06, + "loss": 0.1733, + "step": 53010 + }, + { + "epoch": 0.6626665666641666, + "grad_norm": 0.007467349991202354, + "learning_rate": 6.167824847823609e-06, + "loss": 0.9398, + "step": 53012 + }, + { + "epoch": 0.6626915672891822, + "grad_norm": 1.3581559658050537, + "learning_rate": 6.167018796252393e-06, + "loss": 0.7433, + "step": 53014 + }, + { + "epoch": 0.6627165679141979, + "grad_norm": 4.0733561515808105, + "learning_rate": 6.166212773872619e-06, + "loss": 0.9762, + "step": 53016 + }, + { + "epoch": 0.6627415685392135, + "grad_norm": 0.4014539122581482, + "learning_rate": 6.1654067806904326e-06, + "loss": 0.4623, + "step": 53018 + }, + { + "epoch": 0.6627665691642292, + "grad_norm": 3.118710517883301, + "learning_rate": 6.164600816711961e-06, + "loss": 0.5263, + "step": 53020 + }, + { + "epoch": 0.6627915697892447, + "grad_norm": 3.2143285274505615, + "learning_rate": 6.163794881943345e-06, + "loss": 1.1369, + "step": 53022 + }, + { + "epoch": 0.6628165704142603, + "grad_norm": 0.4205150306224823, + "learning_rate": 6.162988976390727e-06, + "loss": 0.1222, + "step": 53024 + }, + { + "epoch": 0.662841571039276, + "grad_norm": 2.527484178543091, + "learning_rate": 6.162183100060242e-06, + "loss": 0.7513, + "step": 53026 + }, + { + "epoch": 0.6628665716642916, + "grad_norm": 9.71003246307373, + "learning_rate": 6.161377252958027e-06, + "loss": 0.912, + "step": 53028 + }, + { + "epoch": 0.6628915722893073, + "grad_norm": 9.842073440551758, + "learning_rate": 6.160571435090226e-06, + "loss": 1.1839, + "step": 53030 + }, + { + "epoch": 0.6629165729143228, + "grad_norm": 3.7880992889404297, + "learning_rate": 6.159765646462965e-06, + "loss": 1.5979, + "step": 53032 + }, + { + "epoch": 0.6629415735393385, + "grad_norm": 2.861211061477661, + "learning_rate": 6.1589598870823855e-06, + "loss": 0.2104, + "step": 53034 + }, + { + "epoch": 0.6629665741643541, + "grad_norm": 3.9368984699249268, + "learning_rate": 6.158154156954622e-06, + "loss": 0.9507, + "step": 53036 + }, + { + "epoch": 0.6629915747893698, + "grad_norm": 2.256901502609253, + "learning_rate": 6.157348456085816e-06, + "loss": 0.7196, + "step": 53038 + }, + { + "epoch": 0.6630165754143854, + "grad_norm": 0.0006592096178792417, + "learning_rate": 6.1565427844821e-06, + "loss": 0.0922, + "step": 53040 + }, + { + "epoch": 0.663041576039401, + "grad_norm": 2.5308444499969482, + "learning_rate": 6.155737142149612e-06, + "loss": 0.3488, + "step": 53042 + }, + { + "epoch": 0.6630665766644166, + "grad_norm": 2.9044532775878906, + "learning_rate": 6.154931529094489e-06, + "loss": 1.0568, + "step": 53044 + }, + { + "epoch": 0.6630915772894322, + "grad_norm": 3.315197706222534, + "learning_rate": 6.15412594532286e-06, + "loss": 1.3264, + "step": 53046 + }, + { + "epoch": 0.6631165779144479, + "grad_norm": 1.5164763927459717, + "learning_rate": 6.153320390840862e-06, + "loss": 0.4914, + "step": 53048 + }, + { + "epoch": 0.6631415785394635, + "grad_norm": 1.6020162105560303, + "learning_rate": 6.152514865654633e-06, + "loss": 0.0348, + "step": 53050 + }, + { + "epoch": 0.6631665791644791, + "grad_norm": 3.8861935138702393, + "learning_rate": 6.151709369770309e-06, + "loss": 0.6423, + "step": 53052 + }, + { + "epoch": 0.6631915797894947, + "grad_norm": 1.356367588043213, + "learning_rate": 6.150903903194021e-06, + "loss": 0.5508, + "step": 53054 + }, + { + "epoch": 0.6632165804145104, + "grad_norm": 0.001140620093792677, + "learning_rate": 6.150098465931909e-06, + "loss": 1.1942, + "step": 53056 + }, + { + "epoch": 0.663241581039526, + "grad_norm": 0.0015139371389523149, + "learning_rate": 6.149293057990098e-06, + "loss": 0.6461, + "step": 53058 + }, + { + "epoch": 0.6632665816645417, + "grad_norm": 6.721431732177734, + "learning_rate": 6.1484876793747235e-06, + "loss": 0.3327, + "step": 53060 + }, + { + "epoch": 0.6632915822895572, + "grad_norm": 3.605896472930908, + "learning_rate": 6.147682330091926e-06, + "loss": 0.6767, + "step": 53062 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 3.4974491596221924, + "learning_rate": 6.146877010147833e-06, + "loss": 0.7928, + "step": 53064 + }, + { + "epoch": 0.6633415835395885, + "grad_norm": 0.8915759921073914, + "learning_rate": 6.146071719548582e-06, + "loss": 0.6556, + "step": 53066 + }, + { + "epoch": 0.6633665841646041, + "grad_norm": 1.2591700553894043, + "learning_rate": 6.145266458300303e-06, + "loss": 0.6988, + "step": 53068 + }, + { + "epoch": 0.6633915847896198, + "grad_norm": 5.780088424682617, + "learning_rate": 6.144461226409134e-06, + "loss": 1.7876, + "step": 53070 + }, + { + "epoch": 0.6634165854146353, + "grad_norm": 6.493328094482422, + "learning_rate": 6.143656023881197e-06, + "loss": 1.5857, + "step": 53072 + }, + { + "epoch": 0.663441586039651, + "grad_norm": 10.388960838317871, + "learning_rate": 6.142850850722631e-06, + "loss": 1.4951, + "step": 53074 + }, + { + "epoch": 0.6634665866646666, + "grad_norm": 2.337649345397949, + "learning_rate": 6.142045706939569e-06, + "loss": 0.847, + "step": 53076 + }, + { + "epoch": 0.6634915872896823, + "grad_norm": 4.323569297790527, + "learning_rate": 6.14124059253814e-06, + "loss": 1.5712, + "step": 53078 + }, + { + "epoch": 0.6635165879146979, + "grad_norm": 0.0030562817119061947, + "learning_rate": 6.140435507524478e-06, + "loss": 0.4836, + "step": 53080 + }, + { + "epoch": 0.6635415885397135, + "grad_norm": 3.1428768634796143, + "learning_rate": 6.139630451904718e-06, + "loss": 1.0895, + "step": 53082 + }, + { + "epoch": 0.6635665891647291, + "grad_norm": 2.756627321243286, + "learning_rate": 6.13882542568498e-06, + "loss": 0.9687, + "step": 53084 + }, + { + "epoch": 0.6635915897897448, + "grad_norm": 0.003185246605426073, + "learning_rate": 6.138020428871403e-06, + "loss": 0.1095, + "step": 53086 + }, + { + "epoch": 0.6636165904147604, + "grad_norm": 7.40887975692749, + "learning_rate": 6.137215461470116e-06, + "loss": 1.3753, + "step": 53088 + }, + { + "epoch": 0.663641591039776, + "grad_norm": 2.472928524017334, + "learning_rate": 6.136410523487251e-06, + "loss": 1.2379, + "step": 53090 + }, + { + "epoch": 0.6636665916647916, + "grad_norm": 0.22161930799484253, + "learning_rate": 6.135605614928936e-06, + "loss": 0.5157, + "step": 53092 + }, + { + "epoch": 0.6636915922898072, + "grad_norm": 3.2215487957000732, + "learning_rate": 6.134800735801303e-06, + "loss": 0.9091, + "step": 53094 + }, + { + "epoch": 0.6637165929148229, + "grad_norm": 1.7283188104629517, + "learning_rate": 6.133995886110483e-06, + "loss": 0.1416, + "step": 53096 + }, + { + "epoch": 0.6637415935398385, + "grad_norm": 5.447620868682861, + "learning_rate": 6.133191065862601e-06, + "loss": 0.3342, + "step": 53098 + }, + { + "epoch": 0.6637665941648542, + "grad_norm": 4.8496413230896, + "learning_rate": 6.1323862750637894e-06, + "loss": 1.3311, + "step": 53100 + }, + { + "epoch": 0.6637915947898697, + "grad_norm": 7.03510856628418, + "learning_rate": 6.131581513720177e-06, + "loss": 1.1588, + "step": 53102 + }, + { + "epoch": 0.6638165954148854, + "grad_norm": 2.155876636505127, + "learning_rate": 6.130776781837893e-06, + "loss": 0.393, + "step": 53104 + }, + { + "epoch": 0.663841596039901, + "grad_norm": 0.0011862831888720393, + "learning_rate": 6.129972079423066e-06, + "loss": 0.0931, + "step": 53106 + }, + { + "epoch": 0.6638665966649167, + "grad_norm": 3.118812322616577, + "learning_rate": 6.129167406481823e-06, + "loss": 0.6489, + "step": 53108 + }, + { + "epoch": 0.6638915972899323, + "grad_norm": 4.71648645401001, + "learning_rate": 6.128362763020297e-06, + "loss": 1.9896, + "step": 53110 + }, + { + "epoch": 0.6639165979149478, + "grad_norm": 4.418044090270996, + "learning_rate": 6.12755814904461e-06, + "loss": 0.822, + "step": 53112 + }, + { + "epoch": 0.6639415985399635, + "grad_norm": 3.634608268737793, + "learning_rate": 6.1267535645608926e-06, + "loss": 0.9033, + "step": 53114 + }, + { + "epoch": 0.6639665991649791, + "grad_norm": 9.588397026062012, + "learning_rate": 6.125949009575273e-06, + "loss": 2.8689, + "step": 53116 + }, + { + "epoch": 0.6639915997899948, + "grad_norm": 3.6999454498291016, + "learning_rate": 6.125144484093877e-06, + "loss": 0.8987, + "step": 53118 + }, + { + "epoch": 0.6640166004150104, + "grad_norm": 12.771564483642578, + "learning_rate": 6.124339988122834e-06, + "loss": 0.8142, + "step": 53120 + }, + { + "epoch": 0.664041601040026, + "grad_norm": 2.4386136531829834, + "learning_rate": 6.123535521668269e-06, + "loss": 0.3909, + "step": 53122 + }, + { + "epoch": 0.6640666016650416, + "grad_norm": 3.0465402603149414, + "learning_rate": 6.12273108473631e-06, + "loss": 0.9115, + "step": 53124 + }, + { + "epoch": 0.6640916022900573, + "grad_norm": 4.4376678466796875, + "learning_rate": 6.121926677333081e-06, + "loss": 0.8965, + "step": 53126 + }, + { + "epoch": 0.6641166029150729, + "grad_norm": 0.003430472919717431, + "learning_rate": 6.121122299464711e-06, + "loss": 0.5484, + "step": 53128 + }, + { + "epoch": 0.6641416035400886, + "grad_norm": 2.337733745574951, + "learning_rate": 6.1203179511373245e-06, + "loss": 0.9948, + "step": 53130 + }, + { + "epoch": 0.6641666041651041, + "grad_norm": 1.1039412021636963, + "learning_rate": 6.119513632357048e-06, + "loss": 0.6937, + "step": 53132 + }, + { + "epoch": 0.6641916047901197, + "grad_norm": 0.9933278560638428, + "learning_rate": 6.118709343130007e-06, + "loss": 0.2811, + "step": 53134 + }, + { + "epoch": 0.6642166054151354, + "grad_norm": 2.612590789794922, + "learning_rate": 6.11790508346233e-06, + "loss": 1.1734, + "step": 53136 + }, + { + "epoch": 0.664241606040151, + "grad_norm": 1.3839325904846191, + "learning_rate": 6.117100853360135e-06, + "loss": 0.029, + "step": 53138 + }, + { + "epoch": 0.6642666066651667, + "grad_norm": 3.749081611633301, + "learning_rate": 6.116296652829552e-06, + "loss": 0.8883, + "step": 53140 + }, + { + "epoch": 0.6642916072901822, + "grad_norm": 0.7658966779708862, + "learning_rate": 6.115492481876704e-06, + "loss": 0.1027, + "step": 53142 + }, + { + "epoch": 0.6643166079151979, + "grad_norm": 5.182880401611328, + "learning_rate": 6.1146883405077155e-06, + "loss": 1.563, + "step": 53144 + }, + { + "epoch": 0.6643416085402135, + "grad_norm": 1.8664665222167969, + "learning_rate": 6.113884228728712e-06, + "loss": 0.6282, + "step": 53146 + }, + { + "epoch": 0.6643666091652292, + "grad_norm": 5.902000427246094, + "learning_rate": 6.113080146545818e-06, + "loss": 1.7257, + "step": 53148 + }, + { + "epoch": 0.6643916097902448, + "grad_norm": 1.8530877828598022, + "learning_rate": 6.112276093965154e-06, + "loss": 0.9678, + "step": 53150 + }, + { + "epoch": 0.6644166104152603, + "grad_norm": 1.9053025245666504, + "learning_rate": 6.111472070992846e-06, + "loss": 0.9213, + "step": 53152 + }, + { + "epoch": 0.664441611040276, + "grad_norm": 0.003778040874749422, + "learning_rate": 6.1106680776350154e-06, + "loss": 0.5222, + "step": 53154 + }, + { + "epoch": 0.6644666116652916, + "grad_norm": 3.0428168773651123, + "learning_rate": 6.109864113897789e-06, + "loss": 0.6946, + "step": 53156 + }, + { + "epoch": 0.6644916122903073, + "grad_norm": 8.06164836883545, + "learning_rate": 6.109060179787286e-06, + "loss": 1.0164, + "step": 53158 + }, + { + "epoch": 0.6645166129153229, + "grad_norm": 3.602337121963501, + "learning_rate": 6.108256275309631e-06, + "loss": 1.355, + "step": 53160 + }, + { + "epoch": 0.6645416135403385, + "grad_norm": 3.6170947551727295, + "learning_rate": 6.107452400470948e-06, + "loss": 1.5794, + "step": 53162 + }, + { + "epoch": 0.6645666141653541, + "grad_norm": 7.4656758308410645, + "learning_rate": 6.1066485552773535e-06, + "loss": 1.5509, + "step": 53164 + }, + { + "epoch": 0.6645916147903698, + "grad_norm": 5.166683197021484, + "learning_rate": 6.105844739734974e-06, + "loss": 0.8027, + "step": 53166 + }, + { + "epoch": 0.6646166154153854, + "grad_norm": 2.0780017375946045, + "learning_rate": 6.105040953849931e-06, + "loss": 0.7356, + "step": 53168 + }, + { + "epoch": 0.6646416160404011, + "grad_norm": 6.16309928894043, + "learning_rate": 6.104237197628344e-06, + "loss": 0.42, + "step": 53170 + }, + { + "epoch": 0.6646666166654166, + "grad_norm": 3.3079819679260254, + "learning_rate": 6.103433471076336e-06, + "loss": 0.5249, + "step": 53172 + }, + { + "epoch": 0.6646916172904322, + "grad_norm": 6.7269487380981445, + "learning_rate": 6.102629774200031e-06, + "loss": 0.9362, + "step": 53174 + }, + { + "epoch": 0.6647166179154479, + "grad_norm": 3.0784530639648438, + "learning_rate": 6.101826107005544e-06, + "loss": 1.2395, + "step": 53176 + }, + { + "epoch": 0.6647416185404635, + "grad_norm": 2.7946255207061768, + "learning_rate": 6.101022469498997e-06, + "loss": 0.906, + "step": 53178 + }, + { + "epoch": 0.6647666191654792, + "grad_norm": 0.0013678857358172536, + "learning_rate": 6.100218861686512e-06, + "loss": 0.6887, + "step": 53180 + }, + { + "epoch": 0.6647916197904947, + "grad_norm": 5.188387393951416, + "learning_rate": 6.099415283574208e-06, + "loss": 0.9816, + "step": 53182 + }, + { + "epoch": 0.6648166204155104, + "grad_norm": 1.291719913482666, + "learning_rate": 6.0986117351682075e-06, + "loss": 0.6087, + "step": 53184 + }, + { + "epoch": 0.664841621040526, + "grad_norm": 2.898801326751709, + "learning_rate": 6.097808216474628e-06, + "loss": 0.7696, + "step": 53186 + }, + { + "epoch": 0.6648666216655417, + "grad_norm": 3.1871886253356934, + "learning_rate": 6.0970047274995895e-06, + "loss": 0.3426, + "step": 53188 + }, + { + "epoch": 0.6648916222905573, + "grad_norm": 3.3837661743164062, + "learning_rate": 6.09620126824921e-06, + "loss": 0.6573, + "step": 53190 + }, + { + "epoch": 0.6649166229155729, + "grad_norm": 0.3705313503742218, + "learning_rate": 6.09539783872961e-06, + "loss": 0.7418, + "step": 53192 + }, + { + "epoch": 0.6649416235405885, + "grad_norm": 2.9279491901397705, + "learning_rate": 6.094594438946908e-06, + "loss": 1.3125, + "step": 53194 + }, + { + "epoch": 0.6649666241656041, + "grad_norm": 5.729295253753662, + "learning_rate": 6.093791068907221e-06, + "loss": 1.4964, + "step": 53196 + }, + { + "epoch": 0.6649916247906198, + "grad_norm": 4.473818778991699, + "learning_rate": 6.092987728616671e-06, + "loss": 1.2928, + "step": 53198 + }, + { + "epoch": 0.6650166254156354, + "grad_norm": 5.726207256317139, + "learning_rate": 6.0921844180813735e-06, + "loss": 1.5974, + "step": 53200 + }, + { + "epoch": 0.665041626040651, + "grad_norm": 4.430768966674805, + "learning_rate": 6.091381137307446e-06, + "loss": 0.9827, + "step": 53202 + }, + { + "epoch": 0.6650666266656666, + "grad_norm": 4.885537624359131, + "learning_rate": 6.090577886301008e-06, + "loss": 1.8352, + "step": 53204 + }, + { + "epoch": 0.6650916272906823, + "grad_norm": 2.7088663578033447, + "learning_rate": 6.089774665068176e-06, + "loss": 1.0253, + "step": 53206 + }, + { + "epoch": 0.6651166279156979, + "grad_norm": 4.936390399932861, + "learning_rate": 6.088971473615066e-06, + "loss": 2.2472, + "step": 53208 + }, + { + "epoch": 0.6651416285407136, + "grad_norm": 0.7917482256889343, + "learning_rate": 6.0881683119477955e-06, + "loss": 0.0269, + "step": 53210 + }, + { + "epoch": 0.6651666291657291, + "grad_norm": 3.822058916091919, + "learning_rate": 6.087365180072484e-06, + "loss": 0.7066, + "step": 53212 + }, + { + "epoch": 0.6651916297907448, + "grad_norm": 1.4541397094726562, + "learning_rate": 6.086562077995246e-06, + "loss": 0.1536, + "step": 53214 + }, + { + "epoch": 0.6652166304157604, + "grad_norm": 1.525331974029541, + "learning_rate": 6.085759005722197e-06, + "loss": 0.4845, + "step": 53216 + }, + { + "epoch": 0.665241631040776, + "grad_norm": 3.994262933731079, + "learning_rate": 6.084955963259454e-06, + "loss": 1.3319, + "step": 53218 + }, + { + "epoch": 0.6652666316657917, + "grad_norm": 3.7943129539489746, + "learning_rate": 6.084152950613132e-06, + "loss": 1.5126, + "step": 53220 + }, + { + "epoch": 0.6652916322908072, + "grad_norm": 2.6245570182800293, + "learning_rate": 6.083349967789348e-06, + "loss": 1.0196, + "step": 53222 + }, + { + "epoch": 0.6653166329158229, + "grad_norm": 1.5453401803970337, + "learning_rate": 6.082547014794218e-06, + "loss": 0.8467, + "step": 53224 + }, + { + "epoch": 0.6653416335408385, + "grad_norm": 2.4777655601501465, + "learning_rate": 6.081744091633856e-06, + "loss": 0.8443, + "step": 53226 + }, + { + "epoch": 0.6653666341658542, + "grad_norm": 1.9088397026062012, + "learning_rate": 6.080941198314375e-06, + "loss": 0.9506, + "step": 53228 + }, + { + "epoch": 0.6653916347908698, + "grad_norm": 11.115095138549805, + "learning_rate": 6.080138334841892e-06, + "loss": 1.6994, + "step": 53230 + }, + { + "epoch": 0.6654166354158854, + "grad_norm": 1.657383680343628, + "learning_rate": 6.079335501222521e-06, + "loss": 0.0228, + "step": 53232 + }, + { + "epoch": 0.665441636040901, + "grad_norm": 4.180333137512207, + "learning_rate": 6.078532697462377e-06, + "loss": 0.5499, + "step": 53234 + }, + { + "epoch": 0.6654666366659167, + "grad_norm": 4.420166969299316, + "learning_rate": 6.077729923567573e-06, + "loss": 2.6778, + "step": 53236 + }, + { + "epoch": 0.6654916372909323, + "grad_norm": 2.664473056793213, + "learning_rate": 6.076927179544223e-06, + "loss": 1.0139, + "step": 53238 + }, + { + "epoch": 0.665516637915948, + "grad_norm": 5.824804306030273, + "learning_rate": 6.076124465398443e-06, + "loss": 1.3652, + "step": 53240 + }, + { + "epoch": 0.6655416385409635, + "grad_norm": 1.9575008153915405, + "learning_rate": 6.075321781136343e-06, + "loss": 0.7091, + "step": 53242 + }, + { + "epoch": 0.6655666391659791, + "grad_norm": 0.0010133334435522556, + "learning_rate": 6.074519126764036e-06, + "loss": 0.3465, + "step": 53244 + }, + { + "epoch": 0.6655916397909948, + "grad_norm": 2.5778114795684814, + "learning_rate": 6.073716502287636e-06, + "loss": 0.5645, + "step": 53246 + }, + { + "epoch": 0.6656166404160104, + "grad_norm": 5.129510879516602, + "learning_rate": 6.072913907713258e-06, + "loss": 0.9822, + "step": 53248 + }, + { + "epoch": 0.6656416410410261, + "grad_norm": 6.399082183837891, + "learning_rate": 6.072111343047011e-06, + "loss": 0.7866, + "step": 53250 + }, + { + "epoch": 0.6656666416660416, + "grad_norm": 4.693809986114502, + "learning_rate": 6.071308808295011e-06, + "loss": 0.6561, + "step": 53252 + }, + { + "epoch": 0.6656916422910573, + "grad_norm": 4.584537982940674, + "learning_rate": 6.0705063034633645e-06, + "loss": 1.2604, + "step": 53254 + }, + { + "epoch": 0.6657166429160729, + "grad_norm": 3.2891106605529785, + "learning_rate": 6.069703828558187e-06, + "loss": 0.4465, + "step": 53256 + }, + { + "epoch": 0.6657416435410886, + "grad_norm": 2.1094613075256348, + "learning_rate": 6.06890138358559e-06, + "loss": 1.2915, + "step": 53258 + }, + { + "epoch": 0.6657666441661042, + "grad_norm": 0.0005704160430468619, + "learning_rate": 6.068098968551684e-06, + "loss": 0.0661, + "step": 53260 + }, + { + "epoch": 0.6657916447911197, + "grad_norm": 2.2210376262664795, + "learning_rate": 6.0672965834625786e-06, + "loss": 0.4522, + "step": 53262 + }, + { + "epoch": 0.6658166454161354, + "grad_norm": 1.6034457683563232, + "learning_rate": 6.0664942283243875e-06, + "loss": 0.6702, + "step": 53264 + }, + { + "epoch": 0.665841646041151, + "grad_norm": 0.0008811778970994055, + "learning_rate": 6.0656919031432224e-06, + "loss": 0.6128, + "step": 53266 + }, + { + "epoch": 0.6658666466661667, + "grad_norm": 2.9920008182525635, + "learning_rate": 6.064889607925188e-06, + "loss": 1.0724, + "step": 53268 + }, + { + "epoch": 0.6658916472911823, + "grad_norm": 2.673100709915161, + "learning_rate": 6.064087342676399e-06, + "loss": 0.5355, + "step": 53270 + }, + { + "epoch": 0.6659166479161979, + "grad_norm": 0.0034851001109927893, + "learning_rate": 6.063285107402965e-06, + "loss": 0.123, + "step": 53272 + }, + { + "epoch": 0.6659416485412135, + "grad_norm": 5.312422752380371, + "learning_rate": 6.0624829021109935e-06, + "loss": 1.2791, + "step": 53274 + }, + { + "epoch": 0.6659666491662292, + "grad_norm": 3.516374349594116, + "learning_rate": 6.061680726806596e-06, + "loss": 1.0029, + "step": 53276 + }, + { + "epoch": 0.6659916497912448, + "grad_norm": 0.00274584349244833, + "learning_rate": 6.0608785814958836e-06, + "loss": 0.0001, + "step": 53278 + }, + { + "epoch": 0.6660166504162605, + "grad_norm": 4.518689155578613, + "learning_rate": 6.060076466184961e-06, + "loss": 1.4746, + "step": 53280 + }, + { + "epoch": 0.666041651041276, + "grad_norm": 1.0639617443084717, + "learning_rate": 6.059274380879938e-06, + "loss": 0.2943, + "step": 53282 + }, + { + "epoch": 0.6660666516662916, + "grad_norm": 0.003677346045151353, + "learning_rate": 6.0584723255869235e-06, + "loss": 0.0005, + "step": 53284 + }, + { + "epoch": 0.6660916522913073, + "grad_norm": 0.003190656891092658, + "learning_rate": 6.057670300312027e-06, + "loss": 0.0082, + "step": 53286 + }, + { + "epoch": 0.6661166529163229, + "grad_norm": 4.960997581481934, + "learning_rate": 6.056868305061357e-06, + "loss": 1.1831, + "step": 53288 + }, + { + "epoch": 0.6661416535413386, + "grad_norm": 3.770888090133667, + "learning_rate": 6.056066339841019e-06, + "loss": 2.0661, + "step": 53290 + }, + { + "epoch": 0.6661666541663541, + "grad_norm": 2.030116081237793, + "learning_rate": 6.055264404657124e-06, + "loss": 0.7831, + "step": 53292 + }, + { + "epoch": 0.6661916547913698, + "grad_norm": 3.4083874225616455, + "learning_rate": 6.054462499515776e-06, + "loss": 1.1078, + "step": 53294 + }, + { + "epoch": 0.6662166554163854, + "grad_norm": 2.2812860012054443, + "learning_rate": 6.053660624423084e-06, + "loss": 0.2189, + "step": 53296 + }, + { + "epoch": 0.6662416560414011, + "grad_norm": 3.1453757286071777, + "learning_rate": 6.052858779385156e-06, + "loss": 0.8991, + "step": 53298 + }, + { + "epoch": 0.6662666566664167, + "grad_norm": 1.3581109046936035, + "learning_rate": 6.052056964408095e-06, + "loss": 0.1262, + "step": 53300 + }, + { + "epoch": 0.6662916572914322, + "grad_norm": 2.624328136444092, + "learning_rate": 6.05125517949801e-06, + "loss": 1.0802, + "step": 53302 + }, + { + "epoch": 0.6663166579164479, + "grad_norm": 5.068424224853516, + "learning_rate": 6.050453424661009e-06, + "loss": 0.937, + "step": 53304 + }, + { + "epoch": 0.6663416585414635, + "grad_norm": 3.768566131591797, + "learning_rate": 6.049651699903196e-06, + "loss": 0.7989, + "step": 53306 + }, + { + "epoch": 0.6663666591664792, + "grad_norm": 5.048130035400391, + "learning_rate": 6.0488500052306755e-06, + "loss": 1.116, + "step": 53308 + }, + { + "epoch": 0.6663916597914948, + "grad_norm": 12.120898246765137, + "learning_rate": 6.048048340649555e-06, + "loss": 1.4479, + "step": 53310 + }, + { + "epoch": 0.6664166604165104, + "grad_norm": 3.5476958751678467, + "learning_rate": 6.047246706165939e-06, + "loss": 1.037, + "step": 53312 + }, + { + "epoch": 0.666441661041526, + "grad_norm": 2.0598483085632324, + "learning_rate": 6.046445101785934e-06, + "loss": 0.1993, + "step": 53314 + }, + { + "epoch": 0.6664666616665417, + "grad_norm": 1.9022866487503052, + "learning_rate": 6.0456435275156425e-06, + "loss": 0.3901, + "step": 53316 + }, + { + "epoch": 0.6664916622915573, + "grad_norm": 3.7716667652130127, + "learning_rate": 6.044841983361176e-06, + "loss": 1.3103, + "step": 53318 + }, + { + "epoch": 0.666516662916573, + "grad_norm": 3.6282498836517334, + "learning_rate": 6.0440404693286295e-06, + "loss": 0.3306, + "step": 53320 + }, + { + "epoch": 0.6665416635415885, + "grad_norm": 6.754810810089111, + "learning_rate": 6.043238985424111e-06, + "loss": 2.3653, + "step": 53322 + }, + { + "epoch": 0.6665666641666042, + "grad_norm": 12.565439224243164, + "learning_rate": 6.042437531653725e-06, + "loss": 0.3256, + "step": 53324 + }, + { + "epoch": 0.6665916647916198, + "grad_norm": 4.128292560577393, + "learning_rate": 6.0416361080235755e-06, + "loss": 1.2501, + "step": 53326 + }, + { + "epoch": 0.6666166654166354, + "grad_norm": 5.5491790771484375, + "learning_rate": 6.040834714539763e-06, + "loss": 0.5536, + "step": 53328 + }, + { + "epoch": 0.6666416660416511, + "grad_norm": 0.0033227873500436544, + "learning_rate": 6.040033351208401e-06, + "loss": 0.1452, + "step": 53330 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 5.709198951721191, + "learning_rate": 6.039232018035579e-06, + "loss": 0.8904, + "step": 53332 + }, + { + "epoch": 0.6666916672916823, + "grad_norm": 3.120722770690918, + "learning_rate": 6.038430715027407e-06, + "loss": 0.8158, + "step": 53334 + }, + { + "epoch": 0.6667166679166979, + "grad_norm": 3.3498475551605225, + "learning_rate": 6.037629442189986e-06, + "loss": 0.962, + "step": 53336 + }, + { + "epoch": 0.6667416685417136, + "grad_norm": 2.697807550430298, + "learning_rate": 6.036828199529418e-06, + "loss": 0.2493, + "step": 53338 + }, + { + "epoch": 0.6667666691667292, + "grad_norm": 4.421085834503174, + "learning_rate": 6.036026987051806e-06, + "loss": 1.0434, + "step": 53340 + }, + { + "epoch": 0.6667916697917448, + "grad_norm": 2.6948328018188477, + "learning_rate": 6.035225804763253e-06, + "loss": 0.6452, + "step": 53342 + }, + { + "epoch": 0.6668166704167604, + "grad_norm": 2.1659092903137207, + "learning_rate": 6.0344246526698636e-06, + "loss": 0.1167, + "step": 53344 + }, + { + "epoch": 0.666841671041776, + "grad_norm": 1.9444029331207275, + "learning_rate": 6.033623530777731e-06, + "loss": 0.357, + "step": 53346 + }, + { + "epoch": 0.6668666716667917, + "grad_norm": 5.540283203125, + "learning_rate": 6.032822439092961e-06, + "loss": 1.4639, + "step": 53348 + }, + { + "epoch": 0.6668916722918073, + "grad_norm": 13.920907974243164, + "learning_rate": 6.032021377621654e-06, + "loss": 2.1609, + "step": 53350 + }, + { + "epoch": 0.6669166729168229, + "grad_norm": 2.9719128608703613, + "learning_rate": 6.031220346369912e-06, + "loss": 0.6392, + "step": 53352 + }, + { + "epoch": 0.6669416735418385, + "grad_norm": 2.904470920562744, + "learning_rate": 6.030419345343832e-06, + "loss": 0.5086, + "step": 53354 + }, + { + "epoch": 0.6669666741668542, + "grad_norm": 0.9147413372993469, + "learning_rate": 6.029618374549522e-06, + "loss": 0.8066, + "step": 53356 + }, + { + "epoch": 0.6669916747918698, + "grad_norm": 7.114534854888916, + "learning_rate": 6.028817433993074e-06, + "loss": 1.6392, + "step": 53358 + }, + { + "epoch": 0.6670166754168855, + "grad_norm": 3.929307699203491, + "learning_rate": 6.028016523680591e-06, + "loss": 1.9446, + "step": 53360 + }, + { + "epoch": 0.667041676041901, + "grad_norm": 3.1815080642700195, + "learning_rate": 6.027215643618172e-06, + "loss": 0.1453, + "step": 53362 + }, + { + "epoch": 0.6670666766669167, + "grad_norm": 2.5458297729492188, + "learning_rate": 6.026414793811917e-06, + "loss": 0.3785, + "step": 53364 + }, + { + "epoch": 0.6670916772919323, + "grad_norm": 2.0074198246002197, + "learning_rate": 6.025613974267923e-06, + "loss": 0.6629, + "step": 53366 + }, + { + "epoch": 0.667116677916948, + "grad_norm": 3.9050400257110596, + "learning_rate": 6.024813184992293e-06, + "loss": 0.5254, + "step": 53368 + }, + { + "epoch": 0.6671416785419636, + "grad_norm": 3.0084726810455322, + "learning_rate": 6.024012425991127e-06, + "loss": 0.8832, + "step": 53370 + }, + { + "epoch": 0.6671666791669791, + "grad_norm": 3.149869441986084, + "learning_rate": 6.023211697270517e-06, + "loss": 1.1635, + "step": 53372 + }, + { + "epoch": 0.6671916797919948, + "grad_norm": 11.808589935302734, + "learning_rate": 6.022410998836563e-06, + "loss": 0.8236, + "step": 53374 + }, + { + "epoch": 0.6672166804170104, + "grad_norm": 4.541166305541992, + "learning_rate": 6.021610330695367e-06, + "loss": 1.3003, + "step": 53376 + }, + { + "epoch": 0.6672416810420261, + "grad_norm": 1.306852102279663, + "learning_rate": 6.02080969285302e-06, + "loss": 0.1196, + "step": 53378 + }, + { + "epoch": 0.6672666816670417, + "grad_norm": 2.623737096786499, + "learning_rate": 6.0200090853156254e-06, + "loss": 1.0684, + "step": 53380 + }, + { + "epoch": 0.6672916822920573, + "grad_norm": 1.850649356842041, + "learning_rate": 6.0192085080892805e-06, + "loss": 0.2026, + "step": 53382 + }, + { + "epoch": 0.6673166829170729, + "grad_norm": 2.8061015605926514, + "learning_rate": 6.018407961180084e-06, + "loss": 0.8938, + "step": 53384 + }, + { + "epoch": 0.6673416835420886, + "grad_norm": 2.3546500205993652, + "learning_rate": 6.017607444594124e-06, + "loss": 0.8979, + "step": 53386 + }, + { + "epoch": 0.6673666841671042, + "grad_norm": 4.154320240020752, + "learning_rate": 6.016806958337504e-06, + "loss": 0.4653, + "step": 53388 + }, + { + "epoch": 0.6673916847921199, + "grad_norm": 3.535487174987793, + "learning_rate": 6.0160065024163164e-06, + "loss": 1.5376, + "step": 53390 + }, + { + "epoch": 0.6674166854171354, + "grad_norm": 2.935591459274292, + "learning_rate": 6.015206076836662e-06, + "loss": 0.8347, + "step": 53392 + }, + { + "epoch": 0.667441686042151, + "grad_norm": 1.3217997550964355, + "learning_rate": 6.014405681604635e-06, + "loss": 0.0362, + "step": 53394 + }, + { + "epoch": 0.6674666866671667, + "grad_norm": 4.576011657714844, + "learning_rate": 6.013605316726334e-06, + "loss": 1.0245, + "step": 53396 + }, + { + "epoch": 0.6674916872921823, + "grad_norm": 2.9927783012390137, + "learning_rate": 6.012804982207848e-06, + "loss": 0.4678, + "step": 53398 + }, + { + "epoch": 0.667516687917198, + "grad_norm": 3.077338695526123, + "learning_rate": 6.0120046780552744e-06, + "loss": 0.6349, + "step": 53400 + }, + { + "epoch": 0.6675416885422135, + "grad_norm": 2.225555658340454, + "learning_rate": 6.0112044042747085e-06, + "loss": 1.2413, + "step": 53402 + }, + { + "epoch": 0.6675666891672292, + "grad_norm": 0.9749670028686523, + "learning_rate": 6.010404160872247e-06, + "loss": 0.5957, + "step": 53404 + }, + { + "epoch": 0.6675916897922448, + "grad_norm": 3.3481013774871826, + "learning_rate": 6.009603947853984e-06, + "loss": 1.3555, + "step": 53406 + }, + { + "epoch": 0.6676166904172605, + "grad_norm": 0.002172077540308237, + "learning_rate": 6.008803765226012e-06, + "loss": 0.0, + "step": 53408 + }, + { + "epoch": 0.6676416910422761, + "grad_norm": 0.3504483699798584, + "learning_rate": 6.008003612994431e-06, + "loss": 0.7425, + "step": 53410 + }, + { + "epoch": 0.6676666916672916, + "grad_norm": 5.242628574371338, + "learning_rate": 6.0072034911653255e-06, + "loss": 2.1836, + "step": 53412 + }, + { + "epoch": 0.6676916922923073, + "grad_norm": 2.929425001144409, + "learning_rate": 6.006403399744791e-06, + "loss": 0.6192, + "step": 53414 + }, + { + "epoch": 0.6677166929173229, + "grad_norm": 2.642643451690674, + "learning_rate": 6.005603338738926e-06, + "loss": 1.0686, + "step": 53416 + }, + { + "epoch": 0.6677416935423386, + "grad_norm": 0.0180340725928545, + "learning_rate": 6.004803308153821e-06, + "loss": 0.0001, + "step": 53418 + }, + { + "epoch": 0.6677666941673542, + "grad_norm": 3.7656307220458984, + "learning_rate": 6.004003307995569e-06, + "loss": 1.192, + "step": 53420 + }, + { + "epoch": 0.6677916947923698, + "grad_norm": 3.7806894779205322, + "learning_rate": 6.003203338270267e-06, + "loss": 1.3209, + "step": 53422 + }, + { + "epoch": 0.6678166954173854, + "grad_norm": 3.967289686203003, + "learning_rate": 6.002403398983999e-06, + "loss": 0.6197, + "step": 53424 + }, + { + "epoch": 0.6678416960424011, + "grad_norm": 9.846025466918945, + "learning_rate": 6.001603490142862e-06, + "loss": 1.3591, + "step": 53426 + }, + { + "epoch": 0.6678666966674167, + "grad_norm": 0.003463336732238531, + "learning_rate": 6.000803611752944e-06, + "loss": 0.0881, + "step": 53428 + }, + { + "epoch": 0.6678916972924324, + "grad_norm": 4.216668605804443, + "learning_rate": 6.000003763820343e-06, + "loss": 0.5915, + "step": 53430 + }, + { + "epoch": 0.6679166979174479, + "grad_norm": 1.518383264541626, + "learning_rate": 5.999203946351148e-06, + "loss": 0.156, + "step": 53432 + }, + { + "epoch": 0.6679416985424635, + "grad_norm": 1.7674429416656494, + "learning_rate": 5.99840415935145e-06, + "loss": 0.0924, + "step": 53434 + }, + { + "epoch": 0.6679666991674792, + "grad_norm": 2.7924880981445312, + "learning_rate": 5.997604402827344e-06, + "loss": 0.5957, + "step": 53436 + }, + { + "epoch": 0.6679916997924948, + "grad_norm": 1.4668595790863037, + "learning_rate": 5.996804676784913e-06, + "loss": 0.1725, + "step": 53438 + }, + { + "epoch": 0.6680167004175105, + "grad_norm": 2.76759672164917, + "learning_rate": 5.996004981230248e-06, + "loss": 0.3511, + "step": 53440 + }, + { + "epoch": 0.668041701042526, + "grad_norm": 2.8254997730255127, + "learning_rate": 5.9952053161694456e-06, + "loss": 1.0354, + "step": 53442 + }, + { + "epoch": 0.6680667016675417, + "grad_norm": 0.07608949393033981, + "learning_rate": 5.994405681608595e-06, + "loss": 0.0013, + "step": 53444 + }, + { + "epoch": 0.6680917022925573, + "grad_norm": 0.005123891402035952, + "learning_rate": 5.993606077553782e-06, + "loss": 0.1302, + "step": 53446 + }, + { + "epoch": 0.668116702917573, + "grad_norm": 3.760166883468628, + "learning_rate": 5.992806504011104e-06, + "loss": 0.6175, + "step": 53448 + }, + { + "epoch": 0.6681417035425886, + "grad_norm": 6.76651668548584, + "learning_rate": 5.99200696098664e-06, + "loss": 0.9325, + "step": 53450 + }, + { + "epoch": 0.6681667041676042, + "grad_norm": 3.0044302940368652, + "learning_rate": 5.991207448486483e-06, + "loss": 0.6098, + "step": 53452 + }, + { + "epoch": 0.6681917047926198, + "grad_norm": 0.0012666983529925346, + "learning_rate": 5.9904079665167246e-06, + "loss": 0.6325, + "step": 53454 + }, + { + "epoch": 0.6682167054176354, + "grad_norm": 2.825679063796997, + "learning_rate": 5.989608515083451e-06, + "loss": 1.1304, + "step": 53456 + }, + { + "epoch": 0.6682417060426511, + "grad_norm": 3.9577150344848633, + "learning_rate": 5.988809094192754e-06, + "loss": 1.4556, + "step": 53458 + }, + { + "epoch": 0.6682667066676667, + "grad_norm": 4.2089385986328125, + "learning_rate": 5.988009703850719e-06, + "loss": 1.0186, + "step": 53460 + }, + { + "epoch": 0.6682917072926823, + "grad_norm": 3.0528202056884766, + "learning_rate": 5.987210344063439e-06, + "loss": 1.1938, + "step": 53462 + }, + { + "epoch": 0.6683167079176979, + "grad_norm": 22.236881256103516, + "learning_rate": 5.98641101483699e-06, + "loss": 1.3606, + "step": 53464 + }, + { + "epoch": 0.6683417085427136, + "grad_norm": 0.0010731935035437346, + "learning_rate": 5.985611716177471e-06, + "loss": 0.0024, + "step": 53466 + }, + { + "epoch": 0.6683667091677292, + "grad_norm": 5.380650997161865, + "learning_rate": 5.984812448090964e-06, + "loss": 0.218, + "step": 53468 + }, + { + "epoch": 0.6683917097927449, + "grad_norm": 0.10731641203165054, + "learning_rate": 5.984013210583559e-06, + "loss": 0.3941, + "step": 53470 + }, + { + "epoch": 0.6684167104177604, + "grad_norm": 2.3791921138763428, + "learning_rate": 5.9832140036613394e-06, + "loss": 0.4784, + "step": 53472 + }, + { + "epoch": 0.668441711042776, + "grad_norm": 0.0011837505735456944, + "learning_rate": 5.982414827330401e-06, + "loss": 0.0001, + "step": 53474 + }, + { + "epoch": 0.6684667116677917, + "grad_norm": 4.023954391479492, + "learning_rate": 5.981615681596815e-06, + "loss": 1.3929, + "step": 53476 + }, + { + "epoch": 0.6684917122928073, + "grad_norm": 0.003035514149814844, + "learning_rate": 5.9808165664666785e-06, + "loss": 0.0825, + "step": 53478 + }, + { + "epoch": 0.668516712917823, + "grad_norm": 7.361518859863281, + "learning_rate": 5.9800174819460745e-06, + "loss": 0.7342, + "step": 53480 + }, + { + "epoch": 0.6685417135428385, + "grad_norm": 4.052616119384766, + "learning_rate": 5.979218428041087e-06, + "loss": 1.6061, + "step": 53482 + }, + { + "epoch": 0.6685667141678542, + "grad_norm": 4.448726654052734, + "learning_rate": 5.9784194047578055e-06, + "loss": 1.2411, + "step": 53484 + }, + { + "epoch": 0.6685917147928698, + "grad_norm": 3.370055913925171, + "learning_rate": 5.977620412102313e-06, + "loss": 0.3115, + "step": 53486 + }, + { + "epoch": 0.6686167154178855, + "grad_norm": 3.636453151702881, + "learning_rate": 5.976821450080697e-06, + "loss": 1.551, + "step": 53488 + }, + { + "epoch": 0.6686417160429011, + "grad_norm": 6.844027996063232, + "learning_rate": 5.976022518699036e-06, + "loss": 1.2105, + "step": 53490 + }, + { + "epoch": 0.6686667166679167, + "grad_norm": 4.470525741577148, + "learning_rate": 5.975223617963421e-06, + "loss": 0.3285, + "step": 53492 + }, + { + "epoch": 0.6686917172929323, + "grad_norm": 1.0559155941009521, + "learning_rate": 5.9744247478799326e-06, + "loss": 0.1428, + "step": 53494 + }, + { + "epoch": 0.668716717917948, + "grad_norm": 3.61639666557312, + "learning_rate": 5.9736259084546545e-06, + "loss": 1.2459, + "step": 53496 + }, + { + "epoch": 0.6687417185429636, + "grad_norm": 3.8851683139801025, + "learning_rate": 5.972827099693674e-06, + "loss": 0.1925, + "step": 53498 + }, + { + "epoch": 0.6687667191679793, + "grad_norm": 2.939345359802246, + "learning_rate": 5.972028321603076e-06, + "loss": 0.795, + "step": 53500 + }, + { + "epoch": 0.6687917197929948, + "grad_norm": 5.494799613952637, + "learning_rate": 5.9712295741889345e-06, + "loss": 1.8305, + "step": 53502 + }, + { + "epoch": 0.6688167204180104, + "grad_norm": 1.6131423711776733, + "learning_rate": 5.970430857457342e-06, + "loss": 0.1603, + "step": 53504 + }, + { + "epoch": 0.6688417210430261, + "grad_norm": 0.0012748356675729156, + "learning_rate": 5.969632171414378e-06, + "loss": 0.7454, + "step": 53506 + }, + { + "epoch": 0.6688667216680417, + "grad_norm": 0.6371281743049622, + "learning_rate": 5.968833516066125e-06, + "loss": 0.3988, + "step": 53508 + }, + { + "epoch": 0.6688917222930574, + "grad_norm": 0.21433129906654358, + "learning_rate": 5.968034891418666e-06, + "loss": 1.0062, + "step": 53510 + }, + { + "epoch": 0.6689167229180729, + "grad_norm": 4.134326934814453, + "learning_rate": 5.967236297478083e-06, + "loss": 1.6981, + "step": 53512 + }, + { + "epoch": 0.6689417235430886, + "grad_norm": 2.431647300720215, + "learning_rate": 5.96643773425046e-06, + "loss": 0.4374, + "step": 53514 + }, + { + "epoch": 0.6689667241681042, + "grad_norm": 3.1242618560791016, + "learning_rate": 5.965639201741875e-06, + "loss": 0.7266, + "step": 53516 + }, + { + "epoch": 0.6689917247931199, + "grad_norm": 4.357966899871826, + "learning_rate": 5.964840699958412e-06, + "loss": 1.7033, + "step": 53518 + }, + { + "epoch": 0.6690167254181355, + "grad_norm": 5.801967620849609, + "learning_rate": 5.96404222890615e-06, + "loss": 0.1368, + "step": 53520 + }, + { + "epoch": 0.669041726043151, + "grad_norm": 0.5588697195053101, + "learning_rate": 5.963243788591173e-06, + "loss": 1.071, + "step": 53522 + }, + { + "epoch": 0.6690667266681667, + "grad_norm": 0.0012467929627746344, + "learning_rate": 5.962445379019559e-06, + "loss": 0.0011, + "step": 53524 + }, + { + "epoch": 0.6690917272931823, + "grad_norm": 2.368256092071533, + "learning_rate": 5.9616470001973935e-06, + "loss": 0.6438, + "step": 53526 + }, + { + "epoch": 0.669116727918198, + "grad_norm": 2.7813146114349365, + "learning_rate": 5.960848652130751e-06, + "loss": 0.5242, + "step": 53528 + }, + { + "epoch": 0.6691417285432136, + "grad_norm": 3.0392463207244873, + "learning_rate": 5.9600503348257135e-06, + "loss": 1.1658, + "step": 53530 + }, + { + "epoch": 0.6691667291682292, + "grad_norm": 4.543844223022461, + "learning_rate": 5.959252048288362e-06, + "loss": 0.7936, + "step": 53532 + }, + { + "epoch": 0.6691917297932448, + "grad_norm": 3.0749781131744385, + "learning_rate": 5.958453792524774e-06, + "loss": 0.5246, + "step": 53534 + }, + { + "epoch": 0.6692167304182605, + "grad_norm": 3.76436710357666, + "learning_rate": 5.957655567541032e-06, + "loss": 1.8096, + "step": 53536 + }, + { + "epoch": 0.6692417310432761, + "grad_norm": 4.633909702301025, + "learning_rate": 5.956857373343214e-06, + "loss": 0.5686, + "step": 53538 + }, + { + "epoch": 0.6692667316682918, + "grad_norm": 2.6624467372894287, + "learning_rate": 5.9560592099374e-06, + "loss": 0.5033, + "step": 53540 + }, + { + "epoch": 0.6692917322933073, + "grad_norm": 1.2538944482803345, + "learning_rate": 5.955261077329664e-06, + "loss": 0.7008, + "step": 53542 + }, + { + "epoch": 0.6693167329183229, + "grad_norm": 3.717289924621582, + "learning_rate": 5.954462975526088e-06, + "loss": 1.8339, + "step": 53544 + }, + { + "epoch": 0.6693417335433386, + "grad_norm": 2.117990016937256, + "learning_rate": 5.953664904532751e-06, + "loss": 1.5596, + "step": 53546 + }, + { + "epoch": 0.6693667341683542, + "grad_norm": 1.268632411956787, + "learning_rate": 5.952866864355729e-06, + "loss": 0.0558, + "step": 53548 + }, + { + "epoch": 0.6693917347933699, + "grad_norm": 4.998178958892822, + "learning_rate": 5.952068855001101e-06, + "loss": 0.958, + "step": 53550 + }, + { + "epoch": 0.6694167354183854, + "grad_norm": 1.1784543991088867, + "learning_rate": 5.951270876474946e-06, + "loss": 1.2274, + "step": 53552 + }, + { + "epoch": 0.6694417360434011, + "grad_norm": 3.868335485458374, + "learning_rate": 5.950472928783337e-06, + "loss": 0.9023, + "step": 53554 + }, + { + "epoch": 0.6694667366684167, + "grad_norm": 3.5445611476898193, + "learning_rate": 5.949675011932355e-06, + "loss": 0.6705, + "step": 53556 + }, + { + "epoch": 0.6694917372934324, + "grad_norm": 3.2435474395751953, + "learning_rate": 5.948877125928074e-06, + "loss": 0.6004, + "step": 53558 + }, + { + "epoch": 0.669516737918448, + "grad_norm": 3.546302556991577, + "learning_rate": 5.948079270776572e-06, + "loss": 0.8595, + "step": 53560 + }, + { + "epoch": 0.6695417385434635, + "grad_norm": 1.0388058423995972, + "learning_rate": 5.947281446483926e-06, + "loss": 0.1339, + "step": 53562 + }, + { + "epoch": 0.6695667391684792, + "grad_norm": 3.352792263031006, + "learning_rate": 5.946483653056211e-06, + "loss": 1.1709, + "step": 53564 + }, + { + "epoch": 0.6695917397934948, + "grad_norm": 2.2156946659088135, + "learning_rate": 5.9456858904995054e-06, + "loss": 0.7895, + "step": 53566 + }, + { + "epoch": 0.6696167404185105, + "grad_norm": 2.9807229042053223, + "learning_rate": 5.9448881588198805e-06, + "loss": 0.4568, + "step": 53568 + }, + { + "epoch": 0.6696417410435261, + "grad_norm": 0.004575114697217941, + "learning_rate": 5.944090458023414e-06, + "loss": 0.0182, + "step": 53570 + }, + { + "epoch": 0.6696667416685417, + "grad_norm": 1.6333903074264526, + "learning_rate": 5.943292788116182e-06, + "loss": 1.5791, + "step": 53572 + }, + { + "epoch": 0.6696917422935573, + "grad_norm": 4.029293537139893, + "learning_rate": 5.942495149104257e-06, + "loss": 1.3532, + "step": 53574 + }, + { + "epoch": 0.669716742918573, + "grad_norm": 3.579563856124878, + "learning_rate": 5.941697540993715e-06, + "loss": 1.1952, + "step": 53576 + }, + { + "epoch": 0.6697417435435886, + "grad_norm": 0.0009781146654859185, + "learning_rate": 5.940899963790634e-06, + "loss": 0.4824, + "step": 53578 + }, + { + "epoch": 0.6697667441686043, + "grad_norm": 0.0010243335273116827, + "learning_rate": 5.940102417501082e-06, + "loss": 0.4879, + "step": 53580 + }, + { + "epoch": 0.6697917447936198, + "grad_norm": 3.818840742111206, + "learning_rate": 5.939304902131135e-06, + "loss": 1.4629, + "step": 53582 + }, + { + "epoch": 0.6698167454186355, + "grad_norm": 4.063169956207275, + "learning_rate": 5.938507417686869e-06, + "loss": 1.8213, + "step": 53584 + }, + { + "epoch": 0.6698417460436511, + "grad_norm": 14.155117988586426, + "learning_rate": 5.937709964174357e-06, + "loss": 2.4539, + "step": 53586 + }, + { + "epoch": 0.6698667466686667, + "grad_norm": 0.8780645132064819, + "learning_rate": 5.936912541599668e-06, + "loss": 0.3921, + "step": 53588 + }, + { + "epoch": 0.6698917472936824, + "grad_norm": 2.4972145557403564, + "learning_rate": 5.936115149968882e-06, + "loss": 0.956, + "step": 53590 + }, + { + "epoch": 0.6699167479186979, + "grad_norm": 3.8777568340301514, + "learning_rate": 5.935317789288069e-06, + "loss": 1.5409, + "step": 53592 + }, + { + "epoch": 0.6699417485437136, + "grad_norm": 5.935962677001953, + "learning_rate": 5.9345204595633e-06, + "loss": 0.5531, + "step": 53594 + }, + { + "epoch": 0.6699667491687292, + "grad_norm": 7.210122108459473, + "learning_rate": 5.933723160800647e-06, + "loss": 1.2018, + "step": 53596 + }, + { + "epoch": 0.6699917497937449, + "grad_norm": 0.5666861534118652, + "learning_rate": 5.932925893006183e-06, + "loss": 1.1783, + "step": 53598 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 2.8281705379486084, + "learning_rate": 5.9321286561859814e-06, + "loss": 1.1667, + "step": 53600 + }, + { + "epoch": 0.6700417510437761, + "grad_norm": 4.005496501922607, + "learning_rate": 5.931331450346112e-06, + "loss": 0.2407, + "step": 53602 + }, + { + "epoch": 0.6700667516687917, + "grad_norm": 2.867640733718872, + "learning_rate": 5.93053427549265e-06, + "loss": 0.9291, + "step": 53604 + }, + { + "epoch": 0.6700917522938074, + "grad_norm": 3.0689516067504883, + "learning_rate": 5.92973713163166e-06, + "loss": 0.5091, + "step": 53606 + }, + { + "epoch": 0.670116752918823, + "grad_norm": 3.5040175914764404, + "learning_rate": 5.928940018769217e-06, + "loss": 0.7818, + "step": 53608 + }, + { + "epoch": 0.6701417535438386, + "grad_norm": 2.440992593765259, + "learning_rate": 5.928142936911392e-06, + "loss": 1.0385, + "step": 53610 + }, + { + "epoch": 0.6701667541688542, + "grad_norm": 0.029138267040252686, + "learning_rate": 5.927345886064253e-06, + "loss": 0.26, + "step": 53612 + }, + { + "epoch": 0.6701917547938698, + "grad_norm": 3.091184377670288, + "learning_rate": 5.9265488662338725e-06, + "loss": 0.8299, + "step": 53614 + }, + { + "epoch": 0.6702167554188855, + "grad_norm": 19.72525978088379, + "learning_rate": 5.92575187742632e-06, + "loss": 1.4003, + "step": 53616 + }, + { + "epoch": 0.6702417560439011, + "grad_norm": 3.0313491821289062, + "learning_rate": 5.924954919647667e-06, + "loss": 1.0296, + "step": 53618 + }, + { + "epoch": 0.6702667566689168, + "grad_norm": 5.573887348175049, + "learning_rate": 5.924157992903978e-06, + "loss": 1.5552, + "step": 53620 + }, + { + "epoch": 0.6702917572939323, + "grad_norm": 0.6647267937660217, + "learning_rate": 5.923361097201328e-06, + "loss": 0.7772, + "step": 53622 + }, + { + "epoch": 0.670316757918948, + "grad_norm": 1.551563024520874, + "learning_rate": 5.922564232545781e-06, + "loss": 0.5323, + "step": 53624 + }, + { + "epoch": 0.6703417585439636, + "grad_norm": 8.164321899414062, + "learning_rate": 5.921767398943409e-06, + "loss": 2.9798, + "step": 53626 + }, + { + "epoch": 0.6703667591689793, + "grad_norm": 2.169248580932617, + "learning_rate": 5.920970596400279e-06, + "loss": 0.2285, + "step": 53628 + }, + { + "epoch": 0.6703917597939949, + "grad_norm": 2.756634473800659, + "learning_rate": 5.920173824922462e-06, + "loss": 2.0491, + "step": 53630 + }, + { + "epoch": 0.6704167604190104, + "grad_norm": 2.6789841651916504, + "learning_rate": 5.919377084516025e-06, + "loss": 0.6946, + "step": 53632 + }, + { + "epoch": 0.6704417610440261, + "grad_norm": 1.9532999992370605, + "learning_rate": 5.918580375187033e-06, + "loss": 1.2433, + "step": 53634 + }, + { + "epoch": 0.6704667616690417, + "grad_norm": 3.340630054473877, + "learning_rate": 5.917783696941555e-06, + "loss": 0.9611, + "step": 53636 + }, + { + "epoch": 0.6704917622940574, + "grad_norm": 2.571367025375366, + "learning_rate": 5.9169870497856605e-06, + "loss": 0.6529, + "step": 53638 + }, + { + "epoch": 0.670516762919073, + "grad_norm": 2.4632534980773926, + "learning_rate": 5.916190433725415e-06, + "loss": 1.3441, + "step": 53640 + }, + { + "epoch": 0.6705417635440886, + "grad_norm": 2.467013359069824, + "learning_rate": 5.915393848766884e-06, + "loss": 0.9139, + "step": 53642 + }, + { + "epoch": 0.6705667641691042, + "grad_norm": 5.710551738739014, + "learning_rate": 5.9145972949161405e-06, + "loss": 1.9406, + "step": 53644 + }, + { + "epoch": 0.6705917647941199, + "grad_norm": 3.5495405197143555, + "learning_rate": 5.913800772179242e-06, + "loss": 1.0467, + "step": 53646 + }, + { + "epoch": 0.6706167654191355, + "grad_norm": 3.127089500427246, + "learning_rate": 5.91300428056226e-06, + "loss": 0.7859, + "step": 53648 + }, + { + "epoch": 0.6706417660441512, + "grad_norm": 3.30145001411438, + "learning_rate": 5.91220782007126e-06, + "loss": 1.0574, + "step": 53650 + }, + { + "epoch": 0.6706667666691667, + "grad_norm": 0.014726249501109123, + "learning_rate": 5.911411390712306e-06, + "loss": 0.7231, + "step": 53652 + }, + { + "epoch": 0.6706917672941823, + "grad_norm": 2.520143747329712, + "learning_rate": 5.910614992491465e-06, + "loss": 1.079, + "step": 53654 + }, + { + "epoch": 0.670716767919198, + "grad_norm": 2.937008857727051, + "learning_rate": 5.909818625414803e-06, + "loss": 0.2319, + "step": 53656 + }, + { + "epoch": 0.6707417685442136, + "grad_norm": 3.0187437534332275, + "learning_rate": 5.909022289488384e-06, + "loss": 0.9127, + "step": 53658 + }, + { + "epoch": 0.6707667691692293, + "grad_norm": 2.741830587387085, + "learning_rate": 5.908225984718272e-06, + "loss": 0.8174, + "step": 53660 + }, + { + "epoch": 0.6707917697942448, + "grad_norm": 0.0022789083886891603, + "learning_rate": 5.907429711110531e-06, + "loss": 0.114, + "step": 53662 + }, + { + "epoch": 0.6708167704192605, + "grad_norm": 5.267176628112793, + "learning_rate": 5.906633468671228e-06, + "loss": 1.4558, + "step": 53664 + }, + { + "epoch": 0.6708417710442761, + "grad_norm": 0.0010532635496929288, + "learning_rate": 5.905837257406426e-06, + "loss": 0.1625, + "step": 53666 + }, + { + "epoch": 0.6708667716692918, + "grad_norm": 5.767613887786865, + "learning_rate": 5.905041077322187e-06, + "loss": 2.0496, + "step": 53668 + }, + { + "epoch": 0.6708917722943074, + "grad_norm": 4.44794225692749, + "learning_rate": 5.904244928424579e-06, + "loss": 1.1116, + "step": 53670 + }, + { + "epoch": 0.670916772919323, + "grad_norm": 1.9482622146606445, + "learning_rate": 5.903448810719661e-06, + "loss": 0.1403, + "step": 53672 + }, + { + "epoch": 0.6709417735443386, + "grad_norm": 4.890475749969482, + "learning_rate": 5.902652724213496e-06, + "loss": 1.9788, + "step": 53674 + }, + { + "epoch": 0.6709667741693542, + "grad_norm": 3.996009349822998, + "learning_rate": 5.90185666891215e-06, + "loss": 0.8518, + "step": 53676 + }, + { + "epoch": 0.6709917747943699, + "grad_norm": 0.0023126876913011074, + "learning_rate": 5.901060644821683e-06, + "loss": 0.2649, + "step": 53678 + }, + { + "epoch": 0.6710167754193855, + "grad_norm": 3.3061678409576416, + "learning_rate": 5.900264651948158e-06, + "loss": 1.0044, + "step": 53680 + }, + { + "epoch": 0.6710417760444011, + "grad_norm": 2.137275457382202, + "learning_rate": 5.899468690297639e-06, + "loss": 0.8033, + "step": 53682 + }, + { + "epoch": 0.6710667766694167, + "grad_norm": 2.612065315246582, + "learning_rate": 5.898672759876189e-06, + "loss": 0.6859, + "step": 53684 + }, + { + "epoch": 0.6710917772944324, + "grad_norm": 0.8259361982345581, + "learning_rate": 5.897876860689864e-06, + "loss": 0.1273, + "step": 53686 + }, + { + "epoch": 0.671116777919448, + "grad_norm": 0.0017876802012324333, + "learning_rate": 5.897080992744728e-06, + "loss": 0.3739, + "step": 53688 + }, + { + "epoch": 0.6711417785444637, + "grad_norm": 5.112282752990723, + "learning_rate": 5.896285156046844e-06, + "loss": 1.0715, + "step": 53690 + }, + { + "epoch": 0.6711667791694792, + "grad_norm": 2.919262170791626, + "learning_rate": 5.895489350602273e-06, + "loss": 1.2476, + "step": 53692 + }, + { + "epoch": 0.6711917797944948, + "grad_norm": 2.2247610092163086, + "learning_rate": 5.894693576417073e-06, + "loss": 1.2656, + "step": 53694 + }, + { + "epoch": 0.6712167804195105, + "grad_norm": 5.88203239440918, + "learning_rate": 5.893897833497309e-06, + "loss": 2.0236, + "step": 53696 + }, + { + "epoch": 0.6712417810445261, + "grad_norm": 2.712796211242676, + "learning_rate": 5.893102121849037e-06, + "loss": 0.1627, + "step": 53698 + }, + { + "epoch": 0.6712667816695418, + "grad_norm": 3.610661745071411, + "learning_rate": 5.892306441478317e-06, + "loss": 1.313, + "step": 53700 + }, + { + "epoch": 0.6712917822945573, + "grad_norm": 3.5874993801116943, + "learning_rate": 5.891510792391211e-06, + "loss": 0.9102, + "step": 53702 + }, + { + "epoch": 0.671316782919573, + "grad_norm": 2.7874650955200195, + "learning_rate": 5.890715174593777e-06, + "loss": 0.5407, + "step": 53704 + }, + { + "epoch": 0.6713417835445886, + "grad_norm": 4.431018829345703, + "learning_rate": 5.889919588092076e-06, + "loss": 0.7227, + "step": 53706 + }, + { + "epoch": 0.6713667841696043, + "grad_norm": 0.0010497755138203502, + "learning_rate": 5.889124032892164e-06, + "loss": 0.051, + "step": 53708 + }, + { + "epoch": 0.6713917847946199, + "grad_norm": 4.689193248748779, + "learning_rate": 5.888328509000108e-06, + "loss": 0.8242, + "step": 53710 + }, + { + "epoch": 0.6714167854196355, + "grad_norm": 3.9824507236480713, + "learning_rate": 5.887533016421958e-06, + "loss": 0.9752, + "step": 53712 + }, + { + "epoch": 0.6714417860446511, + "grad_norm": 0.0010000561596825719, + "learning_rate": 5.886737555163773e-06, + "loss": 0.7348, + "step": 53714 + }, + { + "epoch": 0.6714667866696667, + "grad_norm": 0.001855210866779089, + "learning_rate": 5.885942125231614e-06, + "loss": 0.0001, + "step": 53716 + }, + { + "epoch": 0.6714917872946824, + "grad_norm": 3.342444658279419, + "learning_rate": 5.8851467266315385e-06, + "loss": 1.1782, + "step": 53718 + }, + { + "epoch": 0.671516787919698, + "grad_norm": 2.916682720184326, + "learning_rate": 5.8843513593696e-06, + "loss": 0.9112, + "step": 53720 + }, + { + "epoch": 0.6715417885447136, + "grad_norm": 0.04859761893749237, + "learning_rate": 5.883556023451868e-06, + "loss": 0.5691, + "step": 53722 + }, + { + "epoch": 0.6715667891697292, + "grad_norm": 6.588162422180176, + "learning_rate": 5.882760718884385e-06, + "loss": 1.3341, + "step": 53724 + }, + { + "epoch": 0.6715917897947449, + "grad_norm": 3.814692735671997, + "learning_rate": 5.881965445673215e-06, + "loss": 0.6987, + "step": 53726 + }, + { + "epoch": 0.6716167904197605, + "grad_norm": 1.9940263032913208, + "learning_rate": 5.881170203824413e-06, + "loss": 1.6621, + "step": 53728 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.0004506021796260029, + "learning_rate": 5.8803749933440376e-06, + "loss": 1.0681, + "step": 53730 + }, + { + "epoch": 0.6716667916697917, + "grad_norm": 4.283456325531006, + "learning_rate": 5.879579814238141e-06, + "loss": 1.1598, + "step": 53732 + }, + { + "epoch": 0.6716917922948074, + "grad_norm": 0.0014348101103678346, + "learning_rate": 5.878784666512785e-06, + "loss": 0.7011, + "step": 53734 + }, + { + "epoch": 0.671716792919823, + "grad_norm": 3.176764965057373, + "learning_rate": 5.877989550174024e-06, + "loss": 0.4488, + "step": 53736 + }, + { + "epoch": 0.6717417935448387, + "grad_norm": 7.924734115600586, + "learning_rate": 5.877194465227909e-06, + "loss": 1.9942, + "step": 53738 + }, + { + "epoch": 0.6717667941698543, + "grad_norm": 0.464414119720459, + "learning_rate": 5.876399411680498e-06, + "loss": 0.5945, + "step": 53740 + }, + { + "epoch": 0.6717917947948698, + "grad_norm": 4.562501430511475, + "learning_rate": 5.8756043895378454e-06, + "loss": 2.1024, + "step": 53742 + }, + { + "epoch": 0.6718167954198855, + "grad_norm": 0.005634971894323826, + "learning_rate": 5.874809398806007e-06, + "loss": 0.0042, + "step": 53744 + }, + { + "epoch": 0.6718417960449011, + "grad_norm": 1.185228943824768, + "learning_rate": 5.874014439491034e-06, + "loss": 0.0374, + "step": 53746 + }, + { + "epoch": 0.6718667966699168, + "grad_norm": 1.3988980054855347, + "learning_rate": 5.873219511598991e-06, + "loss": 0.134, + "step": 53748 + }, + { + "epoch": 0.6718917972949324, + "grad_norm": 3.884603261947632, + "learning_rate": 5.872424615135919e-06, + "loss": 1.7082, + "step": 53750 + }, + { + "epoch": 0.671916797919948, + "grad_norm": 3.3206920623779297, + "learning_rate": 5.871629750107879e-06, + "loss": 0.9332, + "step": 53752 + }, + { + "epoch": 0.6719417985449636, + "grad_norm": 0.12160035967826843, + "learning_rate": 5.8708349165209225e-06, + "loss": 0.4159, + "step": 53754 + }, + { + "epoch": 0.6719667991699793, + "grad_norm": 3.9613194465637207, + "learning_rate": 5.870040114381102e-06, + "loss": 0.991, + "step": 53756 + }, + { + "epoch": 0.6719917997949949, + "grad_norm": 0.005639273673295975, + "learning_rate": 5.869245343694472e-06, + "loss": 0.7085, + "step": 53758 + }, + { + "epoch": 0.6720168004200106, + "grad_norm": 0.0013008562382310629, + "learning_rate": 5.868450604467087e-06, + "loss": 0.0901, + "step": 53760 + }, + { + "epoch": 0.6720418010450261, + "grad_norm": 0.06088604778051376, + "learning_rate": 5.867655896705e-06, + "loss": 1.0018, + "step": 53762 + }, + { + "epoch": 0.6720668016700417, + "grad_norm": 2.065232038497925, + "learning_rate": 5.866861220414259e-06, + "loss": 0.0666, + "step": 53764 + }, + { + "epoch": 0.6720918022950574, + "grad_norm": 3.6463284492492676, + "learning_rate": 5.866066575600919e-06, + "loss": 0.7323, + "step": 53766 + }, + { + "epoch": 0.672116802920073, + "grad_norm": 4.224194526672363, + "learning_rate": 5.865271962271031e-06, + "loss": 1.4055, + "step": 53768 + }, + { + "epoch": 0.6721418035450887, + "grad_norm": 6.8667426109313965, + "learning_rate": 5.864477380430644e-06, + "loss": 2.2633, + "step": 53770 + }, + { + "epoch": 0.6721668041701042, + "grad_norm": 1.7943792343139648, + "learning_rate": 5.863682830085815e-06, + "loss": 0.0996, + "step": 53772 + }, + { + "epoch": 0.6721918047951199, + "grad_norm": 2.2035751342773438, + "learning_rate": 5.862888311242596e-06, + "loss": 1.6151, + "step": 53774 + }, + { + "epoch": 0.6722168054201355, + "grad_norm": 3.845640182495117, + "learning_rate": 5.862093823907031e-06, + "loss": 1.4109, + "step": 53776 + }, + { + "epoch": 0.6722418060451512, + "grad_norm": 16.070446014404297, + "learning_rate": 5.861299368085176e-06, + "loss": 2.1152, + "step": 53778 + }, + { + "epoch": 0.6722668066701668, + "grad_norm": 4.187278747558594, + "learning_rate": 5.860504943783078e-06, + "loss": 1.8509, + "step": 53780 + }, + { + "epoch": 0.6722918072951823, + "grad_norm": 5.65327262878418, + "learning_rate": 5.859710551006788e-06, + "loss": 1.006, + "step": 53782 + }, + { + "epoch": 0.672316807920198, + "grad_norm": 1.160980224609375, + "learning_rate": 5.858916189762357e-06, + "loss": 0.0212, + "step": 53784 + }, + { + "epoch": 0.6723418085452136, + "grad_norm": 4.2697858810424805, + "learning_rate": 5.858121860055836e-06, + "loss": 1.3851, + "step": 53786 + }, + { + "epoch": 0.6723668091702293, + "grad_norm": 5.0377984046936035, + "learning_rate": 5.857327561893278e-06, + "loss": 1.5317, + "step": 53788 + }, + { + "epoch": 0.6723918097952449, + "grad_norm": 2.487006664276123, + "learning_rate": 5.856533295280722e-06, + "loss": 0.9916, + "step": 53790 + }, + { + "epoch": 0.6724168104202605, + "grad_norm": 3.0620534420013428, + "learning_rate": 5.8557390602242235e-06, + "loss": 1.3598, + "step": 53792 + }, + { + "epoch": 0.6724418110452761, + "grad_norm": 0.0009360249387100339, + "learning_rate": 5.854944856729827e-06, + "loss": 0.8006, + "step": 53794 + }, + { + "epoch": 0.6724668116702918, + "grad_norm": 3.437896490097046, + "learning_rate": 5.854150684803588e-06, + "loss": 1.1041, + "step": 53796 + }, + { + "epoch": 0.6724918122953074, + "grad_norm": 2.8337838649749756, + "learning_rate": 5.8533565444515495e-06, + "loss": 1.3089, + "step": 53798 + }, + { + "epoch": 0.6725168129203231, + "grad_norm": 3.2264997959136963, + "learning_rate": 5.852562435679766e-06, + "loss": 1.2582, + "step": 53800 + }, + { + "epoch": 0.6725418135453386, + "grad_norm": 0.0011618663556873798, + "learning_rate": 5.851768358494276e-06, + "loss": 0.0053, + "step": 53802 + }, + { + "epoch": 0.6725668141703542, + "grad_norm": 3.7752456665039062, + "learning_rate": 5.850974312901132e-06, + "loss": 1.1236, + "step": 53804 + }, + { + "epoch": 0.6725918147953699, + "grad_norm": 3.2582831382751465, + "learning_rate": 5.850180298906379e-06, + "loss": 0.5742, + "step": 53806 + }, + { + "epoch": 0.6726168154203855, + "grad_norm": 4.199601173400879, + "learning_rate": 5.849386316516068e-06, + "loss": 0.2863, + "step": 53808 + }, + { + "epoch": 0.6726418160454012, + "grad_norm": 3.165466547012329, + "learning_rate": 5.848592365736243e-06, + "loss": 0.3357, + "step": 53810 + }, + { + "epoch": 0.6726668166704167, + "grad_norm": 2.089657783508301, + "learning_rate": 5.847798446572953e-06, + "loss": 1.01, + "step": 53812 + }, + { + "epoch": 0.6726918172954324, + "grad_norm": 1.884691834449768, + "learning_rate": 5.847004559032246e-06, + "loss": 0.5911, + "step": 53814 + }, + { + "epoch": 0.672716817920448, + "grad_norm": 2.4913835525512695, + "learning_rate": 5.8462107031201605e-06, + "loss": 0.6151, + "step": 53816 + }, + { + "epoch": 0.6727418185454637, + "grad_norm": 2.7543177604675293, + "learning_rate": 5.845416878842749e-06, + "loss": 0.7822, + "step": 53818 + }, + { + "epoch": 0.6727668191704793, + "grad_norm": 3.673798084259033, + "learning_rate": 5.84462308620605e-06, + "loss": 0.9682, + "step": 53820 + }, + { + "epoch": 0.6727918197954948, + "grad_norm": 0.9108317494392395, + "learning_rate": 5.843829325216118e-06, + "loss": 0.7694, + "step": 53822 + }, + { + "epoch": 0.6728168204205105, + "grad_norm": 4.823187828063965, + "learning_rate": 5.843035595878994e-06, + "loss": 0.6226, + "step": 53824 + }, + { + "epoch": 0.6728418210455261, + "grad_norm": 0.677951991558075, + "learning_rate": 5.842241898200726e-06, + "loss": 0.3815, + "step": 53826 + }, + { + "epoch": 0.6728668216705418, + "grad_norm": 3.398723602294922, + "learning_rate": 5.841448232187352e-06, + "loss": 0.9128, + "step": 53828 + }, + { + "epoch": 0.6728918222955574, + "grad_norm": 3.9979400634765625, + "learning_rate": 5.8406545978449215e-06, + "loss": 1.4733, + "step": 53830 + }, + { + "epoch": 0.672916822920573, + "grad_norm": 4.4036736488342285, + "learning_rate": 5.839860995179473e-06, + "loss": 0.643, + "step": 53832 + }, + { + "epoch": 0.6729418235455886, + "grad_norm": 3.597943067550659, + "learning_rate": 5.83906742419706e-06, + "loss": 0.8471, + "step": 53834 + }, + { + "epoch": 0.6729668241706043, + "grad_norm": 8.259062767028809, + "learning_rate": 5.838273884903722e-06, + "loss": 2.4144, + "step": 53836 + }, + { + "epoch": 0.6729918247956199, + "grad_norm": 7.7808308601379395, + "learning_rate": 5.837480377305496e-06, + "loss": 1.9065, + "step": 53838 + }, + { + "epoch": 0.6730168254206356, + "grad_norm": 0.001125573879107833, + "learning_rate": 5.836686901408436e-06, + "loss": 0.6274, + "step": 53840 + }, + { + "epoch": 0.6730418260456511, + "grad_norm": 3.1453323364257812, + "learning_rate": 5.83589345721858e-06, + "loss": 1.4843, + "step": 53842 + }, + { + "epoch": 0.6730668266706668, + "grad_norm": 5.142296314239502, + "learning_rate": 5.835100044741966e-06, + "loss": 1.3705, + "step": 53844 + }, + { + "epoch": 0.6730918272956824, + "grad_norm": 4.919075012207031, + "learning_rate": 5.834306663984645e-06, + "loss": 1.6779, + "step": 53846 + }, + { + "epoch": 0.673116827920698, + "grad_norm": 0.0011359057389199734, + "learning_rate": 5.833513314952651e-06, + "loss": 0.2515, + "step": 53848 + }, + { + "epoch": 0.6731418285457137, + "grad_norm": 2.807385206222534, + "learning_rate": 5.8327199976520355e-06, + "loss": 0.9997, + "step": 53850 + }, + { + "epoch": 0.6731668291707292, + "grad_norm": 2.1855597496032715, + "learning_rate": 5.831926712088835e-06, + "loss": 0.4899, + "step": 53852 + }, + { + "epoch": 0.6731918297957449, + "grad_norm": 2.905780792236328, + "learning_rate": 5.831133458269087e-06, + "loss": 0.462, + "step": 53854 + }, + { + "epoch": 0.6732168304207605, + "grad_norm": 2.06109881401062, + "learning_rate": 5.8303402361988416e-06, + "loss": 0.5301, + "step": 53856 + }, + { + "epoch": 0.6732418310457762, + "grad_norm": 4.405740737915039, + "learning_rate": 5.82954704588413e-06, + "loss": 2.2031, + "step": 53858 + }, + { + "epoch": 0.6732668316707918, + "grad_norm": 2.5248379707336426, + "learning_rate": 5.828753887331002e-06, + "loss": 0.2465, + "step": 53860 + }, + { + "epoch": 0.6732918322958074, + "grad_norm": 3.745870590209961, + "learning_rate": 5.8279607605454925e-06, + "loss": 1.1771, + "step": 53862 + }, + { + "epoch": 0.673316832920823, + "grad_norm": 0.7977177500724792, + "learning_rate": 5.8271676655336475e-06, + "loss": 0.4942, + "step": 53864 + }, + { + "epoch": 0.6733418335458387, + "grad_norm": 3.579103469848633, + "learning_rate": 5.826374602301504e-06, + "loss": 0.7554, + "step": 53866 + }, + { + "epoch": 0.6733668341708543, + "grad_norm": 0.8910596966743469, + "learning_rate": 5.825581570855095e-06, + "loss": 0.137, + "step": 53868 + }, + { + "epoch": 0.67339183479587, + "grad_norm": 5.148011207580566, + "learning_rate": 5.824788571200471e-06, + "loss": 0.9936, + "step": 53870 + }, + { + "epoch": 0.6734168354208855, + "grad_norm": 0.7752041816711426, + "learning_rate": 5.823995603343664e-06, + "loss": 0.5774, + "step": 53872 + }, + { + "epoch": 0.6734418360459011, + "grad_norm": 1.3594003915786743, + "learning_rate": 5.8232026672907195e-06, + "loss": 0.7763, + "step": 53874 + }, + { + "epoch": 0.6734668366709168, + "grad_norm": 0.0007069379789754748, + "learning_rate": 5.82240976304767e-06, + "loss": 0.489, + "step": 53876 + }, + { + "epoch": 0.6734918372959324, + "grad_norm": 3.548352003097534, + "learning_rate": 5.821616890620559e-06, + "loss": 1.2126, + "step": 53878 + }, + { + "epoch": 0.6735168379209481, + "grad_norm": 0.003666672622784972, + "learning_rate": 5.820824050015424e-06, + "loss": 0.0362, + "step": 53880 + }, + { + "epoch": 0.6735418385459636, + "grad_norm": 8.039227485656738, + "learning_rate": 5.820031241238298e-06, + "loss": 1.6143, + "step": 53882 + }, + { + "epoch": 0.6735668391709793, + "grad_norm": 0.0006605380331166089, + "learning_rate": 5.819238464295225e-06, + "loss": 0.0, + "step": 53884 + }, + { + "epoch": 0.6735918397959949, + "grad_norm": 0.0007465052185580134, + "learning_rate": 5.818445719192238e-06, + "loss": 0.027, + "step": 53886 + }, + { + "epoch": 0.6736168404210106, + "grad_norm": 4.265413284301758, + "learning_rate": 5.817653005935382e-06, + "loss": 0.6356, + "step": 53888 + }, + { + "epoch": 0.6736418410460262, + "grad_norm": 3.3927228450775146, + "learning_rate": 5.816860324530685e-06, + "loss": 0.7787, + "step": 53890 + }, + { + "epoch": 0.6736668416710417, + "grad_norm": 1.1687922477722168, + "learning_rate": 5.816067674984192e-06, + "loss": 0.1322, + "step": 53892 + }, + { + "epoch": 0.6736918422960574, + "grad_norm": 6.127431392669678, + "learning_rate": 5.8152750573019356e-06, + "loss": 1.2101, + "step": 53894 + }, + { + "epoch": 0.673716842921073, + "grad_norm": 6.175034523010254, + "learning_rate": 5.814482471489948e-06, + "loss": 0.8808, + "step": 53896 + }, + { + "epoch": 0.6737418435460887, + "grad_norm": 4.111219882965088, + "learning_rate": 5.813689917554276e-06, + "loss": 0.992, + "step": 53898 + }, + { + "epoch": 0.6737668441711043, + "grad_norm": 3.4135937690734863, + "learning_rate": 5.812897395500945e-06, + "loss": 0.5075, + "step": 53900 + }, + { + "epoch": 0.6737918447961199, + "grad_norm": 3.444167137145996, + "learning_rate": 5.812104905335998e-06, + "loss": 0.853, + "step": 53902 + }, + { + "epoch": 0.6738168454211355, + "grad_norm": 5.4410600662231445, + "learning_rate": 5.811312447065464e-06, + "loss": 1.4399, + "step": 53904 + }, + { + "epoch": 0.6738418460461512, + "grad_norm": 0.0009394482476636767, + "learning_rate": 5.810520020695387e-06, + "loss": 0.9728, + "step": 53906 + }, + { + "epoch": 0.6738668466711668, + "grad_norm": 0.136969193816185, + "learning_rate": 5.809727626231797e-06, + "loss": 0.0216, + "step": 53908 + }, + { + "epoch": 0.6738918472961825, + "grad_norm": 4.245094299316406, + "learning_rate": 5.808935263680724e-06, + "loss": 0.4417, + "step": 53910 + }, + { + "epoch": 0.673916847921198, + "grad_norm": 3.950511932373047, + "learning_rate": 5.808142933048212e-06, + "loss": 1.8389, + "step": 53912 + }, + { + "epoch": 0.6739418485462136, + "grad_norm": 0.001077500288374722, + "learning_rate": 5.807350634340288e-06, + "loss": 0.4834, + "step": 53914 + }, + { + "epoch": 0.6739668491712293, + "grad_norm": 0.8037585020065308, + "learning_rate": 5.806558367562986e-06, + "loss": 0.6909, + "step": 53916 + }, + { + "epoch": 0.6739918497962449, + "grad_norm": 3.226148843765259, + "learning_rate": 5.805766132722352e-06, + "loss": 1.4724, + "step": 53918 + }, + { + "epoch": 0.6740168504212606, + "grad_norm": 4.566873550415039, + "learning_rate": 5.804973929824401e-06, + "loss": 1.8957, + "step": 53920 + }, + { + "epoch": 0.6740418510462761, + "grad_norm": 0.3492991626262665, + "learning_rate": 5.80418175887518e-06, + "loss": 0.0167, + "step": 53922 + }, + { + "epoch": 0.6740668516712918, + "grad_norm": 3.066512107849121, + "learning_rate": 5.8033896198807134e-06, + "loss": 1.6924, + "step": 53924 + }, + { + "epoch": 0.6740918522963074, + "grad_norm": 4.715975761413574, + "learning_rate": 5.802597512847043e-06, + "loss": 1.437, + "step": 53926 + }, + { + "epoch": 0.6741168529213231, + "grad_norm": 5.108456611633301, + "learning_rate": 5.80180543778019e-06, + "loss": 0.3837, + "step": 53928 + }, + { + "epoch": 0.6741418535463387, + "grad_norm": 4.424703121185303, + "learning_rate": 5.801013394686194e-06, + "loss": 1.0251, + "step": 53930 + }, + { + "epoch": 0.6741668541713542, + "grad_norm": 6.303045272827148, + "learning_rate": 5.800221383571096e-06, + "loss": 1.5288, + "step": 53932 + }, + { + "epoch": 0.6741918547963699, + "grad_norm": 2.9075982570648193, + "learning_rate": 5.799429404440908e-06, + "loss": 0.878, + "step": 53934 + }, + { + "epoch": 0.6742168554213855, + "grad_norm": 3.342672109603882, + "learning_rate": 5.798637457301677e-06, + "loss": 0.7716, + "step": 53936 + }, + { + "epoch": 0.6742418560464012, + "grad_norm": 3.2817063331604004, + "learning_rate": 5.7978455421594235e-06, + "loss": 0.8607, + "step": 53938 + }, + { + "epoch": 0.6742668566714168, + "grad_norm": 2.5425939559936523, + "learning_rate": 5.7970536590201895e-06, + "loss": 0.4667, + "step": 53940 + }, + { + "epoch": 0.6742918572964324, + "grad_norm": 2.8771817684173584, + "learning_rate": 5.796261807889996e-06, + "loss": 2.0756, + "step": 53942 + }, + { + "epoch": 0.674316857921448, + "grad_norm": 6.400671482086182, + "learning_rate": 5.7954699887748825e-06, + "loss": 2.0936, + "step": 53944 + }, + { + "epoch": 0.6743418585464637, + "grad_norm": 0.3954969644546509, + "learning_rate": 5.7946782016808745e-06, + "loss": 0.7661, + "step": 53946 + }, + { + "epoch": 0.6743668591714793, + "grad_norm": 2.6048519611358643, + "learning_rate": 5.793886446613999e-06, + "loss": 0.6861, + "step": 53948 + }, + { + "epoch": 0.674391859796495, + "grad_norm": 3.4692604541778564, + "learning_rate": 5.793094723580292e-06, + "loss": 0.3288, + "step": 53950 + }, + { + "epoch": 0.6744168604215105, + "grad_norm": 0.030885953456163406, + "learning_rate": 5.792303032585779e-06, + "loss": 0.0627, + "step": 53952 + }, + { + "epoch": 0.6744418610465261, + "grad_norm": 3.2372663021087646, + "learning_rate": 5.791511373636489e-06, + "loss": 0.5505, + "step": 53954 + }, + { + "epoch": 0.6744668616715418, + "grad_norm": 2.6355016231536865, + "learning_rate": 5.790719746738458e-06, + "loss": 1.0344, + "step": 53956 + }, + { + "epoch": 0.6744918622965574, + "grad_norm": 2.9833288192749023, + "learning_rate": 5.789928151897711e-06, + "loss": 0.8313, + "step": 53958 + }, + { + "epoch": 0.6745168629215731, + "grad_norm": 0.4467719793319702, + "learning_rate": 5.789136589120276e-06, + "loss": 0.0184, + "step": 53960 + }, + { + "epoch": 0.6745418635465886, + "grad_norm": 2.2499818801879883, + "learning_rate": 5.788345058412176e-06, + "loss": 0.1541, + "step": 53962 + }, + { + "epoch": 0.6745668641716043, + "grad_norm": 3.431236743927002, + "learning_rate": 5.787553559779448e-06, + "loss": 0.2387, + "step": 53964 + }, + { + "epoch": 0.6745918647966199, + "grad_norm": 15.226563453674316, + "learning_rate": 5.7867620932281134e-06, + "loss": 1.3412, + "step": 53966 + }, + { + "epoch": 0.6746168654216356, + "grad_norm": 2.210221767425537, + "learning_rate": 5.7859706587642035e-06, + "loss": 0.8005, + "step": 53968 + }, + { + "epoch": 0.6746418660466512, + "grad_norm": 3.7880373001098633, + "learning_rate": 5.785179256393753e-06, + "loss": 0.751, + "step": 53970 + }, + { + "epoch": 0.6746668666716668, + "grad_norm": 4.308682441711426, + "learning_rate": 5.784387886122772e-06, + "loss": 1.1957, + "step": 53972 + }, + { + "epoch": 0.6746918672966824, + "grad_norm": 0.9184035062789917, + "learning_rate": 5.783596547957302e-06, + "loss": 0.1735, + "step": 53974 + }, + { + "epoch": 0.674716867921698, + "grad_norm": 3.613860607147217, + "learning_rate": 5.7828052419033596e-06, + "loss": 0.6265, + "step": 53976 + }, + { + "epoch": 0.6747418685467137, + "grad_norm": 1.7297378778457642, + "learning_rate": 5.7820139679669764e-06, + "loss": 0.5204, + "step": 53978 + }, + { + "epoch": 0.6747668691717293, + "grad_norm": 0.0020002159290015697, + "learning_rate": 5.781222726154182e-06, + "loss": 1.0367, + "step": 53980 + }, + { + "epoch": 0.6747918697967449, + "grad_norm": 1.8139756917953491, + "learning_rate": 5.7804315164709945e-06, + "loss": 0.1199, + "step": 53982 + }, + { + "epoch": 0.6748168704217605, + "grad_norm": 2.9361984729766846, + "learning_rate": 5.779640338923453e-06, + "loss": 0.3247, + "step": 53984 + }, + { + "epoch": 0.6748418710467762, + "grad_norm": 1.176987886428833, + "learning_rate": 5.7788491935175664e-06, + "loss": 0.8338, + "step": 53986 + }, + { + "epoch": 0.6748668716717918, + "grad_norm": 13.919075012207031, + "learning_rate": 5.77805808025937e-06, + "loss": 1.3895, + "step": 53988 + }, + { + "epoch": 0.6748918722968075, + "grad_norm": 0.0005732705467380583, + "learning_rate": 5.7772669991548825e-06, + "loss": 1.5899, + "step": 53990 + }, + { + "epoch": 0.674916872921823, + "grad_norm": 4.7034759521484375, + "learning_rate": 5.776475950210132e-06, + "loss": 1.2434, + "step": 53992 + }, + { + "epoch": 0.6749418735468387, + "grad_norm": 3.2502710819244385, + "learning_rate": 5.775684933431147e-06, + "loss": 0.4601, + "step": 53994 + }, + { + "epoch": 0.6749668741718543, + "grad_norm": 3.7944793701171875, + "learning_rate": 5.77489394882395e-06, + "loss": 1.3253, + "step": 53996 + }, + { + "epoch": 0.67499187479687, + "grad_norm": 4.40015983581543, + "learning_rate": 5.774102996394561e-06, + "loss": 1.136, + "step": 53998 + }, + { + "epoch": 0.6750168754218856, + "grad_norm": 2.5204992294311523, + "learning_rate": 5.7733120761490046e-06, + "loss": 0.8559, + "step": 54000 + }, + { + "epoch": 0.6750418760469011, + "grad_norm": 0.0010491539724171162, + "learning_rate": 5.772521188093303e-06, + "loss": 0.0, + "step": 54002 + }, + { + "epoch": 0.6750668766719168, + "grad_norm": 3.1644787788391113, + "learning_rate": 5.7717303322334885e-06, + "loss": 1.546, + "step": 54004 + }, + { + "epoch": 0.6750918772969324, + "grad_norm": 5.208374500274658, + "learning_rate": 5.770939508575572e-06, + "loss": 0.5214, + "step": 54006 + }, + { + "epoch": 0.6751168779219481, + "grad_norm": 4.498179912567139, + "learning_rate": 5.7701487171255875e-06, + "loss": 1.8102, + "step": 54008 + }, + { + "epoch": 0.6751418785469637, + "grad_norm": 2.374509811401367, + "learning_rate": 5.769357957889552e-06, + "loss": 1.0599, + "step": 54010 + }, + { + "epoch": 0.6751668791719793, + "grad_norm": 4.0088114738464355, + "learning_rate": 5.7685672308734875e-06, + "loss": 0.2133, + "step": 54012 + }, + { + "epoch": 0.6751918797969949, + "grad_norm": 2.3930904865264893, + "learning_rate": 5.767776536083413e-06, + "loss": 1.2261, + "step": 54014 + }, + { + "epoch": 0.6752168804220106, + "grad_norm": 3.236494541168213, + "learning_rate": 5.7669858735253535e-06, + "loss": 0.5287, + "step": 54016 + }, + { + "epoch": 0.6752418810470262, + "grad_norm": 0.8553546071052551, + "learning_rate": 5.766195243205335e-06, + "loss": 0.6231, + "step": 54018 + }, + { + "epoch": 0.6752668816720419, + "grad_norm": 2.9154279232025146, + "learning_rate": 5.765404645129371e-06, + "loss": 1.0341, + "step": 54020 + }, + { + "epoch": 0.6752918822970574, + "grad_norm": 0.030671827495098114, + "learning_rate": 5.764614079303494e-06, + "loss": 0.1601, + "step": 54022 + }, + { + "epoch": 0.675316882922073, + "grad_norm": 0.7540687322616577, + "learning_rate": 5.763823545733711e-06, + "loss": 0.6429, + "step": 54024 + }, + { + "epoch": 0.6753418835470887, + "grad_norm": 5.556220054626465, + "learning_rate": 5.763033044426052e-06, + "loss": 2.7179, + "step": 54026 + }, + { + "epoch": 0.6753668841721043, + "grad_norm": 3.690534830093384, + "learning_rate": 5.7622425753865295e-06, + "loss": 1.1975, + "step": 54028 + }, + { + "epoch": 0.67539188479712, + "grad_norm": 0.005286644212901592, + "learning_rate": 5.761452138621169e-06, + "loss": 1.1719, + "step": 54030 + }, + { + "epoch": 0.6754168854221355, + "grad_norm": 1.501463770866394, + "learning_rate": 5.760661734135991e-06, + "loss": 0.3598, + "step": 54032 + }, + { + "epoch": 0.6754418860471512, + "grad_norm": 3.4242961406707764, + "learning_rate": 5.759871361937013e-06, + "loss": 1.8106, + "step": 54034 + }, + { + "epoch": 0.6754668866721668, + "grad_norm": 3.412513017654419, + "learning_rate": 5.7590810220302616e-06, + "loss": 1.1215, + "step": 54036 + }, + { + "epoch": 0.6754918872971825, + "grad_norm": 1.5750350952148438, + "learning_rate": 5.7582907144217415e-06, + "loss": 0.6628, + "step": 54038 + }, + { + "epoch": 0.6755168879221981, + "grad_norm": 0.7272787094116211, + "learning_rate": 5.757500439117481e-06, + "loss": 1.324, + "step": 54040 + }, + { + "epoch": 0.6755418885472136, + "grad_norm": 3.908911943435669, + "learning_rate": 5.756710196123499e-06, + "loss": 0.8261, + "step": 54042 + }, + { + "epoch": 0.6755668891722293, + "grad_norm": 1.3040120601654053, + "learning_rate": 5.755919985445809e-06, + "loss": 0.2034, + "step": 54044 + }, + { + "epoch": 0.6755918897972449, + "grad_norm": 4.16615104675293, + "learning_rate": 5.755129807090436e-06, + "loss": 1.2798, + "step": 54046 + }, + { + "epoch": 0.6756168904222606, + "grad_norm": 2.7396318912506104, + "learning_rate": 5.754339661063395e-06, + "loss": 0.7363, + "step": 54048 + }, + { + "epoch": 0.6756418910472762, + "grad_norm": 2.671006441116333, + "learning_rate": 5.753549547370703e-06, + "loss": 0.593, + "step": 54050 + }, + { + "epoch": 0.6756668916722918, + "grad_norm": 2.639738082885742, + "learning_rate": 5.752759466018372e-06, + "loss": 0.667, + "step": 54052 + }, + { + "epoch": 0.6756918922973074, + "grad_norm": 4.586050033569336, + "learning_rate": 5.751969417012424e-06, + "loss": 1.7584, + "step": 54054 + }, + { + "epoch": 0.6757168929223231, + "grad_norm": 2.6036484241485596, + "learning_rate": 5.75117940035888e-06, + "loss": 0.4246, + "step": 54056 + }, + { + "epoch": 0.6757418935473387, + "grad_norm": 0.0006296598585322499, + "learning_rate": 5.750389416063751e-06, + "loss": 0.8493, + "step": 54058 + }, + { + "epoch": 0.6757668941723544, + "grad_norm": 2.178332805633545, + "learning_rate": 5.749599464133057e-06, + "loss": 0.7637, + "step": 54060 + }, + { + "epoch": 0.6757918947973699, + "grad_norm": 0.0014649169752374291, + "learning_rate": 5.7488095445728135e-06, + "loss": 0.3884, + "step": 54062 + }, + { + "epoch": 0.6758168954223855, + "grad_norm": 1.4814928770065308, + "learning_rate": 5.74801965738903e-06, + "loss": 0.0779, + "step": 54064 + }, + { + "epoch": 0.6758418960474012, + "grad_norm": 2.8512871265411377, + "learning_rate": 5.7472298025877326e-06, + "loss": 0.559, + "step": 54066 + }, + { + "epoch": 0.6758668966724168, + "grad_norm": 8.568326950073242, + "learning_rate": 5.746439980174928e-06, + "loss": 2.0459, + "step": 54068 + }, + { + "epoch": 0.6758918972974325, + "grad_norm": 2.4604480266571045, + "learning_rate": 5.74565019015664e-06, + "loss": 0.4899, + "step": 54070 + }, + { + "epoch": 0.675916897922448, + "grad_norm": 4.6683030128479, + "learning_rate": 5.7448604325388725e-06, + "loss": 0.8544, + "step": 54072 + }, + { + "epoch": 0.6759418985474637, + "grad_norm": 1.5263952016830444, + "learning_rate": 5.744070707327655e-06, + "loss": 0.1221, + "step": 54074 + }, + { + "epoch": 0.6759668991724793, + "grad_norm": 3.4675533771514893, + "learning_rate": 5.743281014528985e-06, + "loss": 0.6169, + "step": 54076 + }, + { + "epoch": 0.675991899797495, + "grad_norm": 5.1933674812316895, + "learning_rate": 5.742491354148885e-06, + "loss": 0.8553, + "step": 54078 + }, + { + "epoch": 0.6760169004225106, + "grad_norm": 4.48777961730957, + "learning_rate": 5.741701726193372e-06, + "loss": 0.873, + "step": 54080 + }, + { + "epoch": 0.6760419010475262, + "grad_norm": 3.375523567199707, + "learning_rate": 5.740912130668453e-06, + "loss": 0.7797, + "step": 54082 + }, + { + "epoch": 0.6760669016725418, + "grad_norm": 4.143529891967773, + "learning_rate": 5.7401225675801495e-06, + "loss": 0.3916, + "step": 54084 + }, + { + "epoch": 0.6760919022975574, + "grad_norm": 4.048013687133789, + "learning_rate": 5.739333036934466e-06, + "loss": 1.3848, + "step": 54086 + }, + { + "epoch": 0.6761169029225731, + "grad_norm": 0.0007667281897738576, + "learning_rate": 5.738543538737426e-06, + "loss": 1.5133, + "step": 54088 + }, + { + "epoch": 0.6761419035475887, + "grad_norm": 3.475069522857666, + "learning_rate": 5.73775407299503e-06, + "loss": 0.5271, + "step": 54090 + }, + { + "epoch": 0.6761669041726043, + "grad_norm": 3.1526100635528564, + "learning_rate": 5.736964639713296e-06, + "loss": 1.3989, + "step": 54092 + }, + { + "epoch": 0.6761919047976199, + "grad_norm": 4.609859466552734, + "learning_rate": 5.736175238898238e-06, + "loss": 0.7086, + "step": 54094 + }, + { + "epoch": 0.6762169054226356, + "grad_norm": 2.66030216217041, + "learning_rate": 5.735385870555865e-06, + "loss": 0.6759, + "step": 54096 + }, + { + "epoch": 0.6762419060476512, + "grad_norm": 2.2413878440856934, + "learning_rate": 5.734596534692194e-06, + "loss": 0.7939, + "step": 54098 + }, + { + "epoch": 0.6762669066726669, + "grad_norm": 6.646224021911621, + "learning_rate": 5.733807231313231e-06, + "loss": 0.9548, + "step": 54100 + }, + { + "epoch": 0.6762919072976824, + "grad_norm": 3.637585401535034, + "learning_rate": 5.733017960424987e-06, + "loss": 0.8692, + "step": 54102 + }, + { + "epoch": 0.676316907922698, + "grad_norm": 0.4053956866264343, + "learning_rate": 5.732228722033477e-06, + "loss": 0.3786, + "step": 54104 + }, + { + "epoch": 0.6763419085477137, + "grad_norm": 3.9496726989746094, + "learning_rate": 5.731439516144706e-06, + "loss": 1.0254, + "step": 54106 + }, + { + "epoch": 0.6763669091727293, + "grad_norm": 5.68099308013916, + "learning_rate": 5.730650342764692e-06, + "loss": 1.6705, + "step": 54108 + }, + { + "epoch": 0.676391909797745, + "grad_norm": 2.0932233333587646, + "learning_rate": 5.729861201899438e-06, + "loss": 1.3047, + "step": 54110 + }, + { + "epoch": 0.6764169104227605, + "grad_norm": 3.225231885910034, + "learning_rate": 5.729072093554959e-06, + "loss": 0.8222, + "step": 54112 + }, + { + "epoch": 0.6764419110477762, + "grad_norm": 4.692440509796143, + "learning_rate": 5.728283017737266e-06, + "loss": 0.4293, + "step": 54114 + }, + { + "epoch": 0.6764669116727918, + "grad_norm": 3.069139003753662, + "learning_rate": 5.72749397445236e-06, + "loss": 0.9362, + "step": 54116 + }, + { + "epoch": 0.6764919122978075, + "grad_norm": 7.14246129989624, + "learning_rate": 5.72670496370626e-06, + "loss": 1.8011, + "step": 54118 + }, + { + "epoch": 0.6765169129228231, + "grad_norm": 2.2652833461761475, + "learning_rate": 5.725915985504966e-06, + "loss": 1.1934, + "step": 54120 + }, + { + "epoch": 0.6765419135478387, + "grad_norm": 2.6635003089904785, + "learning_rate": 5.725127039854495e-06, + "loss": 1.0613, + "step": 54122 + }, + { + "epoch": 0.6765669141728543, + "grad_norm": 5.8950018882751465, + "learning_rate": 5.724338126760849e-06, + "loss": 0.5269, + "step": 54124 + }, + { + "epoch": 0.67659191479787, + "grad_norm": 1.8445099592208862, + "learning_rate": 5.723549246230044e-06, + "loss": 0.5903, + "step": 54126 + }, + { + "epoch": 0.6766169154228856, + "grad_norm": 3.6359171867370605, + "learning_rate": 5.722760398268081e-06, + "loss": 0.4361, + "step": 54128 + }, + { + "epoch": 0.6766419160479012, + "grad_norm": 2.4734857082366943, + "learning_rate": 5.721971582880968e-06, + "loss": 0.6304, + "step": 54130 + }, + { + "epoch": 0.6766669166729168, + "grad_norm": 6.576739311218262, + "learning_rate": 5.721182800074718e-06, + "loss": 1.3496, + "step": 54132 + }, + { + "epoch": 0.6766919172979324, + "grad_norm": 1.7938131093978882, + "learning_rate": 5.720394049855329e-06, + "loss": 0.3166, + "step": 54134 + }, + { + "epoch": 0.6767169179229481, + "grad_norm": 4.213454246520996, + "learning_rate": 5.719605332228819e-06, + "loss": 1.3372, + "step": 54136 + }, + { + "epoch": 0.6767419185479637, + "grad_norm": 5.413175106048584, + "learning_rate": 5.7188166472011866e-06, + "loss": 0.7, + "step": 54138 + }, + { + "epoch": 0.6767669191729794, + "grad_norm": 2.745267868041992, + "learning_rate": 5.7180279947784434e-06, + "loss": 1.1045, + "step": 54140 + }, + { + "epoch": 0.6767919197979949, + "grad_norm": 3.2193734645843506, + "learning_rate": 5.717239374966594e-06, + "loss": 0.4331, + "step": 54142 + }, + { + "epoch": 0.6768169204230106, + "grad_norm": 3.2265625, + "learning_rate": 5.716450787771642e-06, + "loss": 0.6156, + "step": 54144 + }, + { + "epoch": 0.6768419210480262, + "grad_norm": 3.020688772201538, + "learning_rate": 5.7156622331995956e-06, + "loss": 0.8997, + "step": 54146 + }, + { + "epoch": 0.6768669216730419, + "grad_norm": 2.688262462615967, + "learning_rate": 5.714873711256458e-06, + "loss": 0.5452, + "step": 54148 + }, + { + "epoch": 0.6768919222980575, + "grad_norm": 0.8049380779266357, + "learning_rate": 5.71408522194824e-06, + "loss": 0.4939, + "step": 54150 + }, + { + "epoch": 0.676916922923073, + "grad_norm": 10.208114624023438, + "learning_rate": 5.713296765280938e-06, + "loss": 1.2768, + "step": 54152 + }, + { + "epoch": 0.6769419235480887, + "grad_norm": 3.7892191410064697, + "learning_rate": 5.7125083412605656e-06, + "loss": 0.6965, + "step": 54154 + }, + { + "epoch": 0.6769669241731043, + "grad_norm": 3.3577768802642822, + "learning_rate": 5.711719949893124e-06, + "loss": 0.858, + "step": 54156 + }, + { + "epoch": 0.67699192479812, + "grad_norm": 2.9161312580108643, + "learning_rate": 5.710931591184612e-06, + "loss": 0.4743, + "step": 54158 + }, + { + "epoch": 0.6770169254231356, + "grad_norm": 4.352248191833496, + "learning_rate": 5.710143265141044e-06, + "loss": 1.8475, + "step": 54160 + }, + { + "epoch": 0.6770419260481512, + "grad_norm": 2.0344624519348145, + "learning_rate": 5.709354971768412e-06, + "loss": 1.1522, + "step": 54162 + }, + { + "epoch": 0.6770669266731668, + "grad_norm": 3.996424436569214, + "learning_rate": 5.708566711072733e-06, + "loss": 0.6195, + "step": 54164 + }, + { + "epoch": 0.6770919272981825, + "grad_norm": 3.05238938331604, + "learning_rate": 5.70777848306e-06, + "loss": 1.1511, + "step": 54166 + }, + { + "epoch": 0.6771169279231981, + "grad_norm": 0.0015175881562754512, + "learning_rate": 5.706990287736217e-06, + "loss": 0.0001, + "step": 54168 + }, + { + "epoch": 0.6771419285482138, + "grad_norm": 0.0037016659043729305, + "learning_rate": 5.706202125107392e-06, + "loss": 0.1571, + "step": 54170 + }, + { + "epoch": 0.6771669291732293, + "grad_norm": 0.8804485201835632, + "learning_rate": 5.705413995179521e-06, + "loss": 0.0131, + "step": 54172 + }, + { + "epoch": 0.6771919297982449, + "grad_norm": 3.2050533294677734, + "learning_rate": 5.704625897958613e-06, + "loss": 0.259, + "step": 54174 + }, + { + "epoch": 0.6772169304232606, + "grad_norm": 1.4299695491790771, + "learning_rate": 5.703837833450663e-06, + "loss": 0.8618, + "step": 54176 + }, + { + "epoch": 0.6772419310482762, + "grad_norm": 0.26638922095298767, + "learning_rate": 5.703049801661681e-06, + "loss": 0.8477, + "step": 54178 + }, + { + "epoch": 0.6772669316732919, + "grad_norm": 3.253352165222168, + "learning_rate": 5.702261802597664e-06, + "loss": 0.1294, + "step": 54180 + }, + { + "epoch": 0.6772919322983074, + "grad_norm": 6.828380107879639, + "learning_rate": 5.701473836264608e-06, + "loss": 1.6839, + "step": 54182 + }, + { + "epoch": 0.6773169329233231, + "grad_norm": 2.5643224716186523, + "learning_rate": 5.700685902668526e-06, + "loss": 0.4713, + "step": 54184 + }, + { + "epoch": 0.6773419335483387, + "grad_norm": 2.217886209487915, + "learning_rate": 5.699898001815406e-06, + "loss": 0.7856, + "step": 54186 + }, + { + "epoch": 0.6773669341733544, + "grad_norm": 3.2470757961273193, + "learning_rate": 5.699110133711258e-06, + "loss": 0.7251, + "step": 54188 + }, + { + "epoch": 0.67739193479837, + "grad_norm": 2.2643494606018066, + "learning_rate": 5.698322298362078e-06, + "loss": 0.3398, + "step": 54190 + }, + { + "epoch": 0.6774169354233855, + "grad_norm": 3.0346317291259766, + "learning_rate": 5.697534495773869e-06, + "loss": 0.3893, + "step": 54192 + }, + { + "epoch": 0.6774419360484012, + "grad_norm": 3.817915201187134, + "learning_rate": 5.696746725952629e-06, + "loss": 0.8374, + "step": 54194 + }, + { + "epoch": 0.6774669366734168, + "grad_norm": 2.5514097213745117, + "learning_rate": 5.695958988904354e-06, + "loss": 0.7515, + "step": 54196 + }, + { + "epoch": 0.6774919372984325, + "grad_norm": 4.579986572265625, + "learning_rate": 5.695171284635051e-06, + "loss": 0.3769, + "step": 54198 + }, + { + "epoch": 0.6775169379234481, + "grad_norm": 2.232139825820923, + "learning_rate": 5.694383613150709e-06, + "loss": 0.2328, + "step": 54200 + }, + { + "epoch": 0.6775419385484637, + "grad_norm": 4.570107460021973, + "learning_rate": 5.693595974457338e-06, + "loss": 1.2124, + "step": 54202 + }, + { + "epoch": 0.6775669391734793, + "grad_norm": 3.697338581085205, + "learning_rate": 5.692808368560927e-06, + "loss": 1.1994, + "step": 54204 + }, + { + "epoch": 0.677591939798495, + "grad_norm": 2.5699987411499023, + "learning_rate": 5.692020795467483e-06, + "loss": 1.5169, + "step": 54206 + }, + { + "epoch": 0.6776169404235106, + "grad_norm": 2.9217159748077393, + "learning_rate": 5.691233255182997e-06, + "loss": 0.3797, + "step": 54208 + }, + { + "epoch": 0.6776419410485263, + "grad_norm": 5.7117204666137695, + "learning_rate": 5.690445747713467e-06, + "loss": 2.2499, + "step": 54210 + }, + { + "epoch": 0.6776669416735418, + "grad_norm": 3.1793861389160156, + "learning_rate": 5.689658273064898e-06, + "loss": 0.6385, + "step": 54212 + }, + { + "epoch": 0.6776919422985574, + "grad_norm": 4.495044231414795, + "learning_rate": 5.688870831243276e-06, + "loss": 0.6428, + "step": 54214 + }, + { + "epoch": 0.6777169429235731, + "grad_norm": 4.34173059463501, + "learning_rate": 5.688083422254609e-06, + "loss": 0.9742, + "step": 54216 + }, + { + "epoch": 0.6777419435485887, + "grad_norm": 5.782129287719727, + "learning_rate": 5.687296046104887e-06, + "loss": 1.3977, + "step": 54218 + }, + { + "epoch": 0.6777669441736044, + "grad_norm": 2.601911783218384, + "learning_rate": 5.686508702800107e-06, + "loss": 0.841, + "step": 54220 + }, + { + "epoch": 0.6777919447986199, + "grad_norm": 4.114945411682129, + "learning_rate": 5.68572139234627e-06, + "loss": 0.2952, + "step": 54222 + }, + { + "epoch": 0.6778169454236356, + "grad_norm": 6.783231735229492, + "learning_rate": 5.684934114749363e-06, + "loss": 0.9018, + "step": 54224 + }, + { + "epoch": 0.6778419460486512, + "grad_norm": 0.003112158505246043, + "learning_rate": 5.684146870015394e-06, + "loss": 1.3964, + "step": 54226 + }, + { + "epoch": 0.6778669466736669, + "grad_norm": 0.0009448503260500729, + "learning_rate": 5.683359658150347e-06, + "loss": 0.1924, + "step": 54228 + }, + { + "epoch": 0.6778919472986825, + "grad_norm": 5.1083083152771, + "learning_rate": 5.682572479160227e-06, + "loss": 1.2572, + "step": 54230 + }, + { + "epoch": 0.677916947923698, + "grad_norm": 0.0019160454394295812, + "learning_rate": 5.681785333051024e-06, + "loss": 1.1405, + "step": 54232 + }, + { + "epoch": 0.6779419485487137, + "grad_norm": 4.7004475593566895, + "learning_rate": 5.680998219828729e-06, + "loss": 0.4729, + "step": 54234 + }, + { + "epoch": 0.6779669491737293, + "grad_norm": 1.9270364046096802, + "learning_rate": 5.6802111394993455e-06, + "loss": 0.6567, + "step": 54236 + }, + { + "epoch": 0.677991949798745, + "grad_norm": 0.0012143815401941538, + "learning_rate": 5.6794240920688595e-06, + "loss": 0.0007, + "step": 54238 + }, + { + "epoch": 0.6780169504237606, + "grad_norm": 4.459053993225098, + "learning_rate": 5.678637077543271e-06, + "loss": 1.1907, + "step": 54240 + }, + { + "epoch": 0.6780419510487762, + "grad_norm": 3.3666539192199707, + "learning_rate": 5.677850095928568e-06, + "loss": 1.4423, + "step": 54242 + }, + { + "epoch": 0.6780669516737918, + "grad_norm": 9.399933815002441, + "learning_rate": 5.677063147230753e-06, + "loss": 1.5488, + "step": 54244 + }, + { + "epoch": 0.6780919522988075, + "grad_norm": 1.9140182733535767, + "learning_rate": 5.676276231455811e-06, + "loss": 0.3294, + "step": 54246 + }, + { + "epoch": 0.6781169529238231, + "grad_norm": 4.505308628082275, + "learning_rate": 5.675489348609735e-06, + "loss": 1.931, + "step": 54248 + }, + { + "epoch": 0.6781419535488388, + "grad_norm": 3.461021900177002, + "learning_rate": 5.674702498698526e-06, + "loss": 0.9699, + "step": 54250 + }, + { + "epoch": 0.6781669541738543, + "grad_norm": 4.582841396331787, + "learning_rate": 5.673915681728166e-06, + "loss": 0.4778, + "step": 54252 + }, + { + "epoch": 0.67819195479887, + "grad_norm": 0.0020200898870825768, + "learning_rate": 5.673128897704656e-06, + "loss": 0.0, + "step": 54254 + }, + { + "epoch": 0.6782169554238856, + "grad_norm": 0.7048358917236328, + "learning_rate": 5.67234214663398e-06, + "loss": 0.0805, + "step": 54256 + }, + { + "epoch": 0.6782419560489013, + "grad_norm": 3.931915760040283, + "learning_rate": 5.67155542852214e-06, + "loss": 1.3945, + "step": 54258 + }, + { + "epoch": 0.6782669566739169, + "grad_norm": 1.7288479804992676, + "learning_rate": 5.670768743375121e-06, + "loss": 0.9272, + "step": 54260 + }, + { + "epoch": 0.6782919572989324, + "grad_norm": 3.55012583732605, + "learning_rate": 5.6699820911989116e-06, + "loss": 0.707, + "step": 54262 + }, + { + "epoch": 0.6783169579239481, + "grad_norm": 3.163080930709839, + "learning_rate": 5.669195471999509e-06, + "loss": 0.8891, + "step": 54264 + }, + { + "epoch": 0.6783419585489637, + "grad_norm": 1.2438054084777832, + "learning_rate": 5.668408885782897e-06, + "loss": 0.7932, + "step": 54266 + }, + { + "epoch": 0.6783669591739794, + "grad_norm": 3.0684306621551514, + "learning_rate": 5.667622332555076e-06, + "loss": 1.703, + "step": 54268 + }, + { + "epoch": 0.678391959798995, + "grad_norm": 2.603915214538574, + "learning_rate": 5.666835812322029e-06, + "loss": 0.6809, + "step": 54270 + }, + { + "epoch": 0.6784169604240106, + "grad_norm": 4.036950588226318, + "learning_rate": 5.6660493250897446e-06, + "loss": 0.7732, + "step": 54272 + }, + { + "epoch": 0.6784419610490262, + "grad_norm": 2.7931346893310547, + "learning_rate": 5.66526287086422e-06, + "loss": 1.2956, + "step": 54274 + }, + { + "epoch": 0.6784669616740419, + "grad_norm": 4.1328349113464355, + "learning_rate": 5.664476449651436e-06, + "loss": 1.0276, + "step": 54276 + }, + { + "epoch": 0.6784919622990575, + "grad_norm": 4.70719575881958, + "learning_rate": 5.66369006145739e-06, + "loss": 2.12, + "step": 54278 + }, + { + "epoch": 0.6785169629240732, + "grad_norm": 2.4193360805511475, + "learning_rate": 5.662903706288063e-06, + "loss": 0.3051, + "step": 54280 + }, + { + "epoch": 0.6785419635490887, + "grad_norm": 4.505789756774902, + "learning_rate": 5.662117384149453e-06, + "loss": 0.5834, + "step": 54282 + }, + { + "epoch": 0.6785669641741043, + "grad_norm": 3.520622491836548, + "learning_rate": 5.661331095047543e-06, + "loss": 1.6912, + "step": 54284 + }, + { + "epoch": 0.67859196479912, + "grad_norm": 4.018017768859863, + "learning_rate": 5.660544838988318e-06, + "loss": 1.0482, + "step": 54286 + }, + { + "epoch": 0.6786169654241356, + "grad_norm": 2.4006447792053223, + "learning_rate": 5.659758615977774e-06, + "loss": 0.8261, + "step": 54288 + }, + { + "epoch": 0.6786419660491513, + "grad_norm": 2.612271547317505, + "learning_rate": 5.65897242602189e-06, + "loss": 0.5137, + "step": 54290 + }, + { + "epoch": 0.6786669666741668, + "grad_norm": 3.7212235927581787, + "learning_rate": 5.658186269126663e-06, + "loss": 1.2405, + "step": 54292 + }, + { + "epoch": 0.6786919672991825, + "grad_norm": 2.945742607116699, + "learning_rate": 5.657400145298071e-06, + "loss": 0.9118, + "step": 54294 + }, + { + "epoch": 0.6787169679241981, + "grad_norm": 2.0441083908081055, + "learning_rate": 5.656614054542109e-06, + "loss": 1.3591, + "step": 54296 + }, + { + "epoch": 0.6787419685492138, + "grad_norm": 13.797849655151367, + "learning_rate": 5.65582799686476e-06, + "loss": 0.2336, + "step": 54298 + }, + { + "epoch": 0.6787669691742294, + "grad_norm": 3.295640230178833, + "learning_rate": 5.655041972272009e-06, + "loss": 1.1305, + "step": 54300 + }, + { + "epoch": 0.6787919697992449, + "grad_norm": 2.804187774658203, + "learning_rate": 5.654255980769845e-06, + "loss": 1.9358, + "step": 54302 + }, + { + "epoch": 0.6788169704242606, + "grad_norm": 2.130981922149658, + "learning_rate": 5.653470022364249e-06, + "loss": 0.7241, + "step": 54304 + }, + { + "epoch": 0.6788419710492762, + "grad_norm": 5.453089714050293, + "learning_rate": 5.652684097061216e-06, + "loss": 1.0297, + "step": 54306 + }, + { + "epoch": 0.6788669716742919, + "grad_norm": 3.90934419631958, + "learning_rate": 5.651898204866721e-06, + "loss": 1.251, + "step": 54308 + }, + { + "epoch": 0.6788919722993075, + "grad_norm": 0.1857973039150238, + "learning_rate": 5.65111234578676e-06, + "loss": 0.9717, + "step": 54310 + }, + { + "epoch": 0.6789169729243231, + "grad_norm": 3.7743442058563232, + "learning_rate": 5.6503265198273116e-06, + "loss": 0.7076, + "step": 54312 + }, + { + "epoch": 0.6789419735493387, + "grad_norm": 0.5466343760490417, + "learning_rate": 5.649540726994358e-06, + "loss": 0.0212, + "step": 54314 + }, + { + "epoch": 0.6789669741743544, + "grad_norm": 4.314187526702881, + "learning_rate": 5.64875496729389e-06, + "loss": 1.7684, + "step": 54316 + }, + { + "epoch": 0.67899197479937, + "grad_norm": 0.0010871451813727617, + "learning_rate": 5.647969240731885e-06, + "loss": 0.0073, + "step": 54318 + }, + { + "epoch": 0.6790169754243857, + "grad_norm": 2.282240390777588, + "learning_rate": 5.647183547314331e-06, + "loss": 0.8809, + "step": 54320 + }, + { + "epoch": 0.6790419760494012, + "grad_norm": 1.6962839365005493, + "learning_rate": 5.646397887047219e-06, + "loss": 0.7058, + "step": 54322 + }, + { + "epoch": 0.6790669766744168, + "grad_norm": 3.4475290775299072, + "learning_rate": 5.645612259936518e-06, + "loss": 0.8665, + "step": 54324 + }, + { + "epoch": 0.6790919772994325, + "grad_norm": 3.147883176803589, + "learning_rate": 5.644826665988221e-06, + "loss": 0.6659, + "step": 54326 + }, + { + "epoch": 0.6791169779244481, + "grad_norm": 0.002600284293293953, + "learning_rate": 5.644041105208307e-06, + "loss": 0.0979, + "step": 54328 + }, + { + "epoch": 0.6791419785494638, + "grad_norm": 2.606637716293335, + "learning_rate": 5.643255577602761e-06, + "loss": 0.8066, + "step": 54330 + }, + { + "epoch": 0.6791669791744793, + "grad_norm": 0.0069081163965165615, + "learning_rate": 5.642470083177562e-06, + "loss": 0.6696, + "step": 54332 + }, + { + "epoch": 0.679191979799495, + "grad_norm": 3.3005335330963135, + "learning_rate": 5.6416846219386935e-06, + "loss": 0.6598, + "step": 54334 + }, + { + "epoch": 0.6792169804245106, + "grad_norm": 2.6559207439422607, + "learning_rate": 5.640899193892146e-06, + "loss": 0.6713, + "step": 54336 + }, + { + "epoch": 0.6792419810495263, + "grad_norm": 2.231884002685547, + "learning_rate": 5.640113799043888e-06, + "loss": 1.692, + "step": 54338 + }, + { + "epoch": 0.6792669816745419, + "grad_norm": 3.931636095046997, + "learning_rate": 5.639328437399909e-06, + "loss": 1.134, + "step": 54340 + }, + { + "epoch": 0.6792919822995575, + "grad_norm": 2.4880292415618896, + "learning_rate": 5.6385431089661855e-06, + "loss": 0.3037, + "step": 54342 + }, + { + "epoch": 0.6793169829245731, + "grad_norm": 5.410202980041504, + "learning_rate": 5.637757813748704e-06, + "loss": 0.982, + "step": 54344 + }, + { + "epoch": 0.6793419835495887, + "grad_norm": 2.742161512374878, + "learning_rate": 5.636972551753439e-06, + "loss": 0.8144, + "step": 54346 + }, + { + "epoch": 0.6793669841746044, + "grad_norm": 3.72316575050354, + "learning_rate": 5.636187322986377e-06, + "loss": 1.5398, + "step": 54348 + }, + { + "epoch": 0.67939198479962, + "grad_norm": 3.6247196197509766, + "learning_rate": 5.6354021274534956e-06, + "loss": 0.8496, + "step": 54350 + }, + { + "epoch": 0.6794169854246356, + "grad_norm": 3.2953908443450928, + "learning_rate": 5.634616965160771e-06, + "loss": 0.553, + "step": 54352 + }, + { + "epoch": 0.6794419860496512, + "grad_norm": 1.9832308292388916, + "learning_rate": 5.6338318361141885e-06, + "loss": 0.7218, + "step": 54354 + }, + { + "epoch": 0.6794669866746669, + "grad_norm": 4.589071273803711, + "learning_rate": 5.6330467403197234e-06, + "loss": 1.8478, + "step": 54356 + }, + { + "epoch": 0.6794919872996825, + "grad_norm": 2.9491848945617676, + "learning_rate": 5.632261677783355e-06, + "loss": 1.2306, + "step": 54358 + }, + { + "epoch": 0.6795169879246982, + "grad_norm": 3.5283901691436768, + "learning_rate": 5.631476648511069e-06, + "loss": 1.3787, + "step": 54360 + }, + { + "epoch": 0.6795419885497137, + "grad_norm": 0.0018484473694115877, + "learning_rate": 5.630691652508838e-06, + "loss": 1.0492, + "step": 54362 + }, + { + "epoch": 0.6795669891747294, + "grad_norm": 1.310733675956726, + "learning_rate": 5.6299066897826425e-06, + "loss": 0.2542, + "step": 54364 + }, + { + "epoch": 0.679591989799745, + "grad_norm": 5.400332450866699, + "learning_rate": 5.629121760338456e-06, + "loss": 1.6602, + "step": 54366 + }, + { + "epoch": 0.6796169904247606, + "grad_norm": 3.5456159114837646, + "learning_rate": 5.6283368641822636e-06, + "loss": 0.9685, + "step": 54368 + }, + { + "epoch": 0.6796419910497763, + "grad_norm": 2.9576635360717773, + "learning_rate": 5.627552001320036e-06, + "loss": 0.8486, + "step": 54370 + }, + { + "epoch": 0.6796669916747918, + "grad_norm": 4.290297508239746, + "learning_rate": 5.626767171757752e-06, + "loss": 1.3932, + "step": 54372 + }, + { + "epoch": 0.6796919922998075, + "grad_norm": 3.842487335205078, + "learning_rate": 5.625982375501401e-06, + "loss": 0.5451, + "step": 54374 + }, + { + "epoch": 0.6797169929248231, + "grad_norm": 5.083568572998047, + "learning_rate": 5.6251976125569404e-06, + "loss": 0.2726, + "step": 54376 + }, + { + "epoch": 0.6797419935498388, + "grad_norm": 2.269977331161499, + "learning_rate": 5.624412882930361e-06, + "loss": 0.2046, + "step": 54378 + }, + { + "epoch": 0.6797669941748544, + "grad_norm": 3.3208439350128174, + "learning_rate": 5.62362818662763e-06, + "loss": 1.8349, + "step": 54380 + }, + { + "epoch": 0.67979199479987, + "grad_norm": 2.560715675354004, + "learning_rate": 5.622843523654729e-06, + "loss": 1.648, + "step": 54382 + }, + { + "epoch": 0.6798169954248856, + "grad_norm": 8.444336891174316, + "learning_rate": 5.6220588940176345e-06, + "loss": 0.6264, + "step": 54384 + }, + { + "epoch": 0.6798419960499013, + "grad_norm": 3.2130751609802246, + "learning_rate": 5.621274297722318e-06, + "loss": 1.1904, + "step": 54386 + }, + { + "epoch": 0.6798669966749169, + "grad_norm": 3.7425196170806885, + "learning_rate": 5.620489734774764e-06, + "loss": 1.8381, + "step": 54388 + }, + { + "epoch": 0.6798919972999325, + "grad_norm": 1.5493905544281006, + "learning_rate": 5.619705205180935e-06, + "loss": 0.0568, + "step": 54390 + }, + { + "epoch": 0.6799169979249481, + "grad_norm": 4.302845478057861, + "learning_rate": 5.618920708946813e-06, + "loss": 1.6123, + "step": 54392 + }, + { + "epoch": 0.6799419985499637, + "grad_norm": 0.0008461700053885579, + "learning_rate": 5.618136246078369e-06, + "loss": 1.9569, + "step": 54394 + }, + { + "epoch": 0.6799669991749794, + "grad_norm": 0.0008934697252698243, + "learning_rate": 5.61735181658158e-06, + "loss": 0.5571, + "step": 54396 + }, + { + "epoch": 0.679991999799995, + "grad_norm": 5.747379302978516, + "learning_rate": 5.616567420462422e-06, + "loss": 1.3035, + "step": 54398 + }, + { + "epoch": 0.6800170004250107, + "grad_norm": 4.1505279541015625, + "learning_rate": 5.615783057726865e-06, + "loss": 0.9808, + "step": 54400 + }, + { + "epoch": 0.6800420010500262, + "grad_norm": 7.611835956573486, + "learning_rate": 5.6149987283808905e-06, + "loss": 0.7093, + "step": 54402 + }, + { + "epoch": 0.6800670016750419, + "grad_norm": 1.3567990064620972, + "learning_rate": 5.614214432430458e-06, + "loss": 0.0153, + "step": 54404 + }, + { + "epoch": 0.6800920023000575, + "grad_norm": 3.6861329078674316, + "learning_rate": 5.613430169881553e-06, + "loss": 1.5881, + "step": 54406 + }, + { + "epoch": 0.6801170029250732, + "grad_norm": 0.0012765744468197227, + "learning_rate": 5.612645940740138e-06, + "loss": 0.1906, + "step": 54408 + }, + { + "epoch": 0.6801420035500888, + "grad_norm": 2.3390159606933594, + "learning_rate": 5.611861745012192e-06, + "loss": 1.1614, + "step": 54410 + }, + { + "epoch": 0.6801670041751043, + "grad_norm": 6.968955993652344, + "learning_rate": 5.61107758270369e-06, + "loss": 1.4791, + "step": 54412 + }, + { + "epoch": 0.68019200480012, + "grad_norm": 2.4013330936431885, + "learning_rate": 5.6102934538206015e-06, + "loss": 0.6618, + "step": 54414 + }, + { + "epoch": 0.6802170054251356, + "grad_norm": 2.2972776889801025, + "learning_rate": 5.609509358368896e-06, + "loss": 1.1368, + "step": 54416 + }, + { + "epoch": 0.6802420060501513, + "grad_norm": 4.7892560958862305, + "learning_rate": 5.608725296354542e-06, + "loss": 1.7485, + "step": 54418 + }, + { + "epoch": 0.6802670066751669, + "grad_norm": 3.041365385055542, + "learning_rate": 5.6079412677835165e-06, + "loss": 0.7402, + "step": 54420 + }, + { + "epoch": 0.6802920073001825, + "grad_norm": 3.3725273609161377, + "learning_rate": 5.607157272661794e-06, + "loss": 0.1738, + "step": 54422 + }, + { + "epoch": 0.6803170079251981, + "grad_norm": 0.001001554075628519, + "learning_rate": 5.606373310995334e-06, + "loss": 0.4448, + "step": 54424 + }, + { + "epoch": 0.6803420085502138, + "grad_norm": 0.7350963950157166, + "learning_rate": 5.60558938279012e-06, + "loss": 0.0303, + "step": 54426 + }, + { + "epoch": 0.6803670091752294, + "grad_norm": 2.9700934886932373, + "learning_rate": 5.604805488052114e-06, + "loss": 1.008, + "step": 54428 + }, + { + "epoch": 0.6803920098002451, + "grad_norm": 2.6736690998077393, + "learning_rate": 5.604021626787288e-06, + "loss": 0.9011, + "step": 54430 + }, + { + "epoch": 0.6804170104252606, + "grad_norm": 2.762343168258667, + "learning_rate": 5.603237799001608e-06, + "loss": 0.5896, + "step": 54432 + }, + { + "epoch": 0.6804420110502762, + "grad_norm": 3.0798556804656982, + "learning_rate": 5.602454004701048e-06, + "loss": 0.1995, + "step": 54434 + }, + { + "epoch": 0.6804670116752919, + "grad_norm": 0.9092856645584106, + "learning_rate": 5.601670243891581e-06, + "loss": 0.5813, + "step": 54436 + }, + { + "epoch": 0.6804920123003075, + "grad_norm": 4.4639129638671875, + "learning_rate": 5.6008865165791645e-06, + "loss": 0.6635, + "step": 54438 + }, + { + "epoch": 0.6805170129253232, + "grad_norm": 10.727799415588379, + "learning_rate": 5.6001028227697855e-06, + "loss": 1.4837, + "step": 54440 + }, + { + "epoch": 0.6805420135503387, + "grad_norm": 3.740227460861206, + "learning_rate": 5.599319162469391e-06, + "loss": 1.0479, + "step": 54442 + }, + { + "epoch": 0.6805670141753544, + "grad_norm": 2.1608777046203613, + "learning_rate": 5.59853553568396e-06, + "loss": 0.7454, + "step": 54444 + }, + { + "epoch": 0.68059201480037, + "grad_norm": 3.362391233444214, + "learning_rate": 5.597751942419465e-06, + "loss": 0.2367, + "step": 54446 + }, + { + "epoch": 0.6806170154253857, + "grad_norm": 4.042654037475586, + "learning_rate": 5.596968382681863e-06, + "loss": 0.6867, + "step": 54448 + }, + { + "epoch": 0.6806420160504013, + "grad_norm": 1.3530519008636475, + "learning_rate": 5.596184856477131e-06, + "loss": 0.4641, + "step": 54450 + }, + { + "epoch": 0.6806670166754168, + "grad_norm": 1.6899800300598145, + "learning_rate": 5.59540136381123e-06, + "loss": 0.5261, + "step": 54452 + }, + { + "epoch": 0.6806920173004325, + "grad_norm": 2.8358049392700195, + "learning_rate": 5.5946179046901365e-06, + "loss": 0.1475, + "step": 54454 + }, + { + "epoch": 0.6807170179254481, + "grad_norm": 7.114369869232178, + "learning_rate": 5.593834479119802e-06, + "loss": 0.7561, + "step": 54456 + }, + { + "epoch": 0.6807420185504638, + "grad_norm": 2.490990400314331, + "learning_rate": 5.593051087106202e-06, + "loss": 1.2122, + "step": 54458 + }, + { + "epoch": 0.6807670191754794, + "grad_norm": 5.363205432891846, + "learning_rate": 5.592267728655305e-06, + "loss": 2.0639, + "step": 54460 + }, + { + "epoch": 0.680792019800495, + "grad_norm": 3.6357545852661133, + "learning_rate": 5.591484403773068e-06, + "loss": 0.6159, + "step": 54462 + }, + { + "epoch": 0.6808170204255106, + "grad_norm": 0.000590745301451534, + "learning_rate": 5.590701112465469e-06, + "loss": 0.7657, + "step": 54464 + }, + { + "epoch": 0.6808420210505263, + "grad_norm": 3.8580257892608643, + "learning_rate": 5.5899178547384656e-06, + "loss": 0.9479, + "step": 54466 + }, + { + "epoch": 0.6808670216755419, + "grad_norm": 2.508153200149536, + "learning_rate": 5.589134630598021e-06, + "loss": 1.8117, + "step": 54468 + }, + { + "epoch": 0.6808920223005576, + "grad_norm": 4.3030853271484375, + "learning_rate": 5.588351440050106e-06, + "loss": 0.9321, + "step": 54470 + }, + { + "epoch": 0.6809170229255731, + "grad_norm": 4.800486087799072, + "learning_rate": 5.587568283100679e-06, + "loss": 1.6822, + "step": 54472 + }, + { + "epoch": 0.6809420235505887, + "grad_norm": 3.194474697113037, + "learning_rate": 5.586785159755712e-06, + "loss": 0.7942, + "step": 54474 + }, + { + "epoch": 0.6809670241756044, + "grad_norm": 1.998000144958496, + "learning_rate": 5.586002070021161e-06, + "loss": 0.229, + "step": 54476 + }, + { + "epoch": 0.68099202480062, + "grad_norm": 2.6024105548858643, + "learning_rate": 5.585219013902998e-06, + "loss": 0.5431, + "step": 54478 + }, + { + "epoch": 0.6810170254256357, + "grad_norm": 5.235124588012695, + "learning_rate": 5.584435991407184e-06, + "loss": 1.1287, + "step": 54480 + }, + { + "epoch": 0.6810420260506512, + "grad_norm": 0.0020084951538592577, + "learning_rate": 5.583653002539675e-06, + "loss": 0.4477, + "step": 54482 + }, + { + "epoch": 0.6810670266756669, + "grad_norm": 4.072014331817627, + "learning_rate": 5.582870047306445e-06, + "loss": 0.5209, + "step": 54484 + }, + { + "epoch": 0.6810920273006825, + "grad_norm": 1.6893298625946045, + "learning_rate": 5.582087125713449e-06, + "loss": 0.5789, + "step": 54486 + }, + { + "epoch": 0.6811170279256982, + "grad_norm": 3.894779920578003, + "learning_rate": 5.581304237766656e-06, + "loss": 1.2715, + "step": 54488 + }, + { + "epoch": 0.6811420285507138, + "grad_norm": 3.083434581756592, + "learning_rate": 5.580521383472022e-06, + "loss": 1.0023, + "step": 54490 + }, + { + "epoch": 0.6811670291757294, + "grad_norm": 2.493591547012329, + "learning_rate": 5.579738562835519e-06, + "loss": 0.7412, + "step": 54492 + }, + { + "epoch": 0.681192029800745, + "grad_norm": 5.517939567565918, + "learning_rate": 5.578955775863094e-06, + "loss": 1.826, + "step": 54494 + }, + { + "epoch": 0.6812170304257607, + "grad_norm": 0.0014652549289166927, + "learning_rate": 5.578173022560718e-06, + "loss": 0.7013, + "step": 54496 + }, + { + "epoch": 0.6812420310507763, + "grad_norm": 5.344022274017334, + "learning_rate": 5.5773903029343535e-06, + "loss": 0.8877, + "step": 54498 + }, + { + "epoch": 0.681267031675792, + "grad_norm": 1.8046315908432007, + "learning_rate": 5.576607616989955e-06, + "loss": 0.4031, + "step": 54500 + }, + { + "epoch": 0.6812920323008075, + "grad_norm": 1.517276644706726, + "learning_rate": 5.575824964733492e-06, + "loss": 0.0598, + "step": 54502 + }, + { + "epoch": 0.6813170329258231, + "grad_norm": 4.097771167755127, + "learning_rate": 5.575042346170916e-06, + "loss": 0.9412, + "step": 54504 + }, + { + "epoch": 0.6813420335508388, + "grad_norm": 2.2454888820648193, + "learning_rate": 5.5742597613081965e-06, + "loss": 0.1862, + "step": 54506 + }, + { + "epoch": 0.6813670341758544, + "grad_norm": 5.334583282470703, + "learning_rate": 5.573477210151288e-06, + "loss": 1.5186, + "step": 54508 + }, + { + "epoch": 0.6813920348008701, + "grad_norm": 1.5055513381958008, + "learning_rate": 5.572694692706147e-06, + "loss": 0.419, + "step": 54510 + }, + { + "epoch": 0.6814170354258856, + "grad_norm": 2.390333414077759, + "learning_rate": 5.571912208978742e-06, + "loss": 0.943, + "step": 54512 + }, + { + "epoch": 0.6814420360509013, + "grad_norm": 3.1436550617218018, + "learning_rate": 5.571129758975023e-06, + "loss": 1.5077, + "step": 54514 + }, + { + "epoch": 0.6814670366759169, + "grad_norm": 0.0033331895247101784, + "learning_rate": 5.570347342700957e-06, + "loss": 0.0398, + "step": 54516 + }, + { + "epoch": 0.6814920373009326, + "grad_norm": 3.3733959197998047, + "learning_rate": 5.569564960162499e-06, + "loss": 0.2611, + "step": 54518 + }, + { + "epoch": 0.6815170379259482, + "grad_norm": 2.6467232704162598, + "learning_rate": 5.568782611365605e-06, + "loss": 0.8316, + "step": 54520 + }, + { + "epoch": 0.6815420385509637, + "grad_norm": 3.0266671180725098, + "learning_rate": 5.568000296316239e-06, + "loss": 0.8115, + "step": 54522 + }, + { + "epoch": 0.6815670391759794, + "grad_norm": 0.4268873631954193, + "learning_rate": 5.567218015020352e-06, + "loss": 1.0267, + "step": 54524 + }, + { + "epoch": 0.681592039800995, + "grad_norm": 3.602999687194824, + "learning_rate": 5.566435767483911e-06, + "loss": 1.4401, + "step": 54526 + }, + { + "epoch": 0.6816170404260107, + "grad_norm": 9.747818946838379, + "learning_rate": 5.565653553712863e-06, + "loss": 0.3023, + "step": 54528 + }, + { + "epoch": 0.6816420410510263, + "grad_norm": 3.519609212875366, + "learning_rate": 5.564871373713175e-06, + "loss": 1.3229, + "step": 54530 + }, + { + "epoch": 0.6816670416760419, + "grad_norm": 0.2196306735277176, + "learning_rate": 5.564089227490799e-06, + "loss": 0.5022, + "step": 54532 + }, + { + "epoch": 0.6816920423010575, + "grad_norm": 0.0006636077887378633, + "learning_rate": 5.563307115051688e-06, + "loss": 0.3645, + "step": 54534 + }, + { + "epoch": 0.6817170429260732, + "grad_norm": 0.001267691026441753, + "learning_rate": 5.562525036401808e-06, + "loss": 0.6897, + "step": 54536 + }, + { + "epoch": 0.6817420435510888, + "grad_norm": 6.333405017852783, + "learning_rate": 5.561742991547105e-06, + "loss": 0.2831, + "step": 54538 + }, + { + "epoch": 0.6817670441761045, + "grad_norm": 0.04983672499656677, + "learning_rate": 5.560960980493544e-06, + "loss": 0.8275, + "step": 54540 + }, + { + "epoch": 0.68179204480112, + "grad_norm": 2.8400096893310547, + "learning_rate": 5.56017900324707e-06, + "loss": 0.3616, + "step": 54542 + }, + { + "epoch": 0.6818170454261356, + "grad_norm": 1.8556451797485352, + "learning_rate": 5.5593970598136515e-06, + "loss": 0.8345, + "step": 54544 + }, + { + "epoch": 0.6818420460511513, + "grad_norm": 0.0008357648039236665, + "learning_rate": 5.558615150199237e-06, + "loss": 0.245, + "step": 54546 + }, + { + "epoch": 0.6818670466761669, + "grad_norm": 5.733431816101074, + "learning_rate": 5.5578332744097765e-06, + "loss": 2.337, + "step": 54548 + }, + { + "epoch": 0.6818920473011826, + "grad_norm": 5.0481486320495605, + "learning_rate": 5.5570514324512345e-06, + "loss": 0.5333, + "step": 54550 + }, + { + "epoch": 0.6819170479261981, + "grad_norm": 5.301467418670654, + "learning_rate": 5.5562696243295555e-06, + "loss": 1.2823, + "step": 54552 + }, + { + "epoch": 0.6819420485512138, + "grad_norm": 3.617572069168091, + "learning_rate": 5.555487850050703e-06, + "loss": 0.6319, + "step": 54554 + }, + { + "epoch": 0.6819670491762294, + "grad_norm": 1.9096328020095825, + "learning_rate": 5.554706109620622e-06, + "loss": 0.8568, + "step": 54556 + }, + { + "epoch": 0.6819920498012451, + "grad_norm": 0.0006551642436534166, + "learning_rate": 5.5539244030452764e-06, + "loss": 0.7386, + "step": 54558 + }, + { + "epoch": 0.6820170504262607, + "grad_norm": 5.5046281814575195, + "learning_rate": 5.553142730330611e-06, + "loss": 2.1356, + "step": 54560 + }, + { + "epoch": 0.6820420510512762, + "grad_norm": 7.542082786560059, + "learning_rate": 5.552361091482581e-06, + "loss": 1.369, + "step": 54562 + }, + { + "epoch": 0.6820670516762919, + "grad_norm": 3.1381657123565674, + "learning_rate": 5.551579486507141e-06, + "loss": 0.8292, + "step": 54564 + }, + { + "epoch": 0.6820920523013075, + "grad_norm": 5.766727447509766, + "learning_rate": 5.5507979154102405e-06, + "loss": 1.9018, + "step": 54566 + }, + { + "epoch": 0.6821170529263232, + "grad_norm": 3.8869380950927734, + "learning_rate": 5.550016378197837e-06, + "loss": 1.5007, + "step": 54568 + }, + { + "epoch": 0.6821420535513388, + "grad_norm": 1.959892749786377, + "learning_rate": 5.549234874875881e-06, + "loss": 0.67, + "step": 54570 + }, + { + "epoch": 0.6821670541763544, + "grad_norm": 3.4207639694213867, + "learning_rate": 5.548453405450318e-06, + "loss": 1.3032, + "step": 54572 + }, + { + "epoch": 0.68219205480137, + "grad_norm": 2.188520908355713, + "learning_rate": 5.547671969927108e-06, + "loss": 1.0706, + "step": 54574 + }, + { + "epoch": 0.6822170554263857, + "grad_norm": 2.8588056564331055, + "learning_rate": 5.546890568312193e-06, + "loss": 1.6302, + "step": 54576 + }, + { + "epoch": 0.6822420560514013, + "grad_norm": 1.010519027709961, + "learning_rate": 5.546109200611535e-06, + "loss": 1.35, + "step": 54578 + }, + { + "epoch": 0.682267056676417, + "grad_norm": 3.550558090209961, + "learning_rate": 5.5453278668310775e-06, + "loss": 0.5327, + "step": 54580 + }, + { + "epoch": 0.6822920573014325, + "grad_norm": 2.489105701446533, + "learning_rate": 5.544546566976775e-06, + "loss": 1.3889, + "step": 54582 + }, + { + "epoch": 0.6823170579264481, + "grad_norm": 2.4627742767333984, + "learning_rate": 5.543765301054576e-06, + "loss": 0.6911, + "step": 54584 + }, + { + "epoch": 0.6823420585514638, + "grad_norm": 0.0006793877109885216, + "learning_rate": 5.542984069070427e-06, + "loss": 0.222, + "step": 54586 + }, + { + "epoch": 0.6823670591764794, + "grad_norm": 4.254603862762451, + "learning_rate": 5.542202871030285e-06, + "loss": 0.9632, + "step": 54588 + }, + { + "epoch": 0.6823920598014951, + "grad_norm": 1.998926043510437, + "learning_rate": 5.54142170694009e-06, + "loss": 0.3406, + "step": 54590 + }, + { + "epoch": 0.6824170604265106, + "grad_norm": 0.7165873646736145, + "learning_rate": 5.5406405768058025e-06, + "loss": 0.1643, + "step": 54592 + }, + { + "epoch": 0.6824420610515263, + "grad_norm": 1.1531299352645874, + "learning_rate": 5.539859480633361e-06, + "loss": 0.5479, + "step": 54594 + }, + { + "epoch": 0.6824670616765419, + "grad_norm": 2.8643364906311035, + "learning_rate": 5.539078418428723e-06, + "loss": 0.9379, + "step": 54596 + }, + { + "epoch": 0.6824920623015576, + "grad_norm": 4.188985347747803, + "learning_rate": 5.538297390197833e-06, + "loss": 1.6149, + "step": 54598 + }, + { + "epoch": 0.6825170629265732, + "grad_norm": 7.635313034057617, + "learning_rate": 5.537516395946635e-06, + "loss": 1.1943, + "step": 54600 + }, + { + "epoch": 0.6825420635515888, + "grad_norm": 3.5555763244628906, + "learning_rate": 5.5367354356810846e-06, + "loss": 1.2465, + "step": 54602 + }, + { + "epoch": 0.6825670641766044, + "grad_norm": 2.973404884338379, + "learning_rate": 5.535954509407122e-06, + "loss": 0.593, + "step": 54604 + }, + { + "epoch": 0.68259206480162, + "grad_norm": 3.4285032749176025, + "learning_rate": 5.535173617130704e-06, + "loss": 0.6469, + "step": 54606 + }, + { + "epoch": 0.6826170654266357, + "grad_norm": 0.00090387201635167, + "learning_rate": 5.5343927588577675e-06, + "loss": 0.0423, + "step": 54608 + }, + { + "epoch": 0.6826420660516513, + "grad_norm": 0.0006212154985405505, + "learning_rate": 5.533611934594268e-06, + "loss": 0.6181, + "step": 54610 + }, + { + "epoch": 0.6826670666766669, + "grad_norm": 1.022500991821289, + "learning_rate": 5.5328311443461494e-06, + "loss": 0.2357, + "step": 54612 + }, + { + "epoch": 0.6826920673016825, + "grad_norm": 3.5321097373962402, + "learning_rate": 5.532050388119352e-06, + "loss": 0.5764, + "step": 54614 + }, + { + "epoch": 0.6827170679266982, + "grad_norm": 8.48268985748291, + "learning_rate": 5.5312696659198315e-06, + "loss": 1.2731, + "step": 54616 + }, + { + "epoch": 0.6827420685517138, + "grad_norm": 2.664027214050293, + "learning_rate": 5.5304889777535256e-06, + "loss": 0.4825, + "step": 54618 + }, + { + "epoch": 0.6827670691767295, + "grad_norm": 6.450899124145508, + "learning_rate": 5.529708323626386e-06, + "loss": 1.3346, + "step": 54620 + }, + { + "epoch": 0.682792069801745, + "grad_norm": 3.5358657836914062, + "learning_rate": 5.5289277035443575e-06, + "loss": 0.5264, + "step": 54622 + }, + { + "epoch": 0.6828170704267607, + "grad_norm": 2.836904287338257, + "learning_rate": 5.528147117513378e-06, + "loss": 0.9807, + "step": 54624 + }, + { + "epoch": 0.6828420710517763, + "grad_norm": 0.011203035712242126, + "learning_rate": 5.527366565539403e-06, + "loss": 0.0927, + "step": 54626 + }, + { + "epoch": 0.682867071676792, + "grad_norm": 3.464888095855713, + "learning_rate": 5.526586047628366e-06, + "loss": 0.6414, + "step": 54628 + }, + { + "epoch": 0.6828920723018076, + "grad_norm": 0.7585394382476807, + "learning_rate": 5.525805563786223e-06, + "loss": 0.3144, + "step": 54630 + }, + { + "epoch": 0.6829170729268231, + "grad_norm": 5.460597515106201, + "learning_rate": 5.525025114018907e-06, + "loss": 1.9521, + "step": 54632 + }, + { + "epoch": 0.6829420735518388, + "grad_norm": 5.373837947845459, + "learning_rate": 5.52424469833237e-06, + "loss": 0.9309, + "step": 54634 + }, + { + "epoch": 0.6829670741768544, + "grad_norm": 3.575270175933838, + "learning_rate": 5.523464316732554e-06, + "loss": 1.1005, + "step": 54636 + }, + { + "epoch": 0.6829920748018701, + "grad_norm": 0.004606170114129782, + "learning_rate": 5.5226839692253974e-06, + "loss": 0.0001, + "step": 54638 + }, + { + "epoch": 0.6830170754268857, + "grad_norm": 3.728362560272217, + "learning_rate": 5.52190365581685e-06, + "loss": 0.2768, + "step": 54640 + }, + { + "epoch": 0.6830420760519013, + "grad_norm": 7.180882453918457, + "learning_rate": 5.521123376512846e-06, + "loss": 1.7609, + "step": 54642 + }, + { + "epoch": 0.6830670766769169, + "grad_norm": 6.008814334869385, + "learning_rate": 5.520343131319337e-06, + "loss": 0.5117, + "step": 54644 + }, + { + "epoch": 0.6830920773019326, + "grad_norm": 4.62894344329834, + "learning_rate": 5.519562920242259e-06, + "loss": 0.9504, + "step": 54646 + }, + { + "epoch": 0.6831170779269482, + "grad_norm": 2.3513453006744385, + "learning_rate": 5.5187827432875586e-06, + "loss": 0.9111, + "step": 54648 + }, + { + "epoch": 0.6831420785519638, + "grad_norm": 5.2340874671936035, + "learning_rate": 5.518002600461174e-06, + "loss": 1.7925, + "step": 54650 + }, + { + "epoch": 0.6831670791769794, + "grad_norm": 0.001064862823113799, + "learning_rate": 5.5172224917690465e-06, + "loss": 0.2128, + "step": 54652 + }, + { + "epoch": 0.683192079801995, + "grad_norm": 5.734153747558594, + "learning_rate": 5.51644241721712e-06, + "loss": 1.4522, + "step": 54654 + }, + { + "epoch": 0.6832170804270107, + "grad_norm": 1.7754722833633423, + "learning_rate": 5.5156623768113325e-06, + "loss": 0.8211, + "step": 54656 + }, + { + "epoch": 0.6832420810520263, + "grad_norm": 4.08469295501709, + "learning_rate": 5.5148823705576294e-06, + "loss": 1.6626, + "step": 54658 + }, + { + "epoch": 0.683267081677042, + "grad_norm": 3.033698558807373, + "learning_rate": 5.514102398461944e-06, + "loss": 0.9863, + "step": 54660 + }, + { + "epoch": 0.6832920823020575, + "grad_norm": 3.8535783290863037, + "learning_rate": 5.513322460530224e-06, + "loss": 1.7072, + "step": 54662 + }, + { + "epoch": 0.6833170829270732, + "grad_norm": 2.587581157684326, + "learning_rate": 5.512542556768405e-06, + "loss": 1.4455, + "step": 54664 + }, + { + "epoch": 0.6833420835520888, + "grad_norm": 7.811402797698975, + "learning_rate": 5.511762687182424e-06, + "loss": 2.1201, + "step": 54666 + }, + { + "epoch": 0.6833670841771045, + "grad_norm": 4.498201847076416, + "learning_rate": 5.5109828517782285e-06, + "loss": 0.8313, + "step": 54668 + }, + { + "epoch": 0.6833920848021201, + "grad_norm": 2.8950042724609375, + "learning_rate": 5.510203050561746e-06, + "loss": 0.5966, + "step": 54670 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 2.524742841720581, + "learning_rate": 5.509423283538928e-06, + "loss": 1.6822, + "step": 54672 + }, + { + "epoch": 0.6834420860521513, + "grad_norm": 4.736425399780273, + "learning_rate": 5.508643550715705e-06, + "loss": 0.7461, + "step": 54674 + }, + { + "epoch": 0.6834670866771669, + "grad_norm": 6.160648345947266, + "learning_rate": 5.507863852098019e-06, + "loss": 0.2917, + "step": 54676 + }, + { + "epoch": 0.6834920873021826, + "grad_norm": 2.6389734745025635, + "learning_rate": 5.507084187691808e-06, + "loss": 0.3095, + "step": 54678 + }, + { + "epoch": 0.6835170879271982, + "grad_norm": 3.03983211517334, + "learning_rate": 5.506304557503004e-06, + "loss": 1.3946, + "step": 54680 + }, + { + "epoch": 0.6835420885522138, + "grad_norm": 8.970709800720215, + "learning_rate": 5.505524961537553e-06, + "loss": 0.4997, + "step": 54682 + }, + { + "epoch": 0.6835670891772294, + "grad_norm": 0.001365060918033123, + "learning_rate": 5.504745399801385e-06, + "loss": 0.2972, + "step": 54684 + }, + { + "epoch": 0.6835920898022451, + "grad_norm": 3.3997879028320312, + "learning_rate": 5.503965872300443e-06, + "loss": 0.9088, + "step": 54686 + }, + { + "epoch": 0.6836170904272607, + "grad_norm": 2.950249195098877, + "learning_rate": 5.503186379040662e-06, + "loss": 1.2167, + "step": 54688 + }, + { + "epoch": 0.6836420910522764, + "grad_norm": 3.614650011062622, + "learning_rate": 5.502406920027974e-06, + "loss": 0.5777, + "step": 54690 + }, + { + "epoch": 0.6836670916772919, + "grad_norm": 2.1382880210876465, + "learning_rate": 5.5016274952683225e-06, + "loss": 0.3118, + "step": 54692 + }, + { + "epoch": 0.6836920923023075, + "grad_norm": 3.431058406829834, + "learning_rate": 5.500848104767635e-06, + "loss": 0.9232, + "step": 54694 + }, + { + "epoch": 0.6837170929273232, + "grad_norm": 3.7702393531799316, + "learning_rate": 5.500068748531857e-06, + "loss": 1.1298, + "step": 54696 + }, + { + "epoch": 0.6837420935523388, + "grad_norm": 1.2415004968643188, + "learning_rate": 5.499289426566914e-06, + "loss": 0.5208, + "step": 54698 + }, + { + "epoch": 0.6837670941773545, + "grad_norm": 2.4614367485046387, + "learning_rate": 5.498510138878748e-06, + "loss": 0.3548, + "step": 54700 + }, + { + "epoch": 0.68379209480237, + "grad_norm": 2.3960154056549072, + "learning_rate": 5.497730885473299e-06, + "loss": 1.6437, + "step": 54702 + }, + { + "epoch": 0.6838170954273857, + "grad_norm": 2.9636967182159424, + "learning_rate": 5.496951666356487e-06, + "loss": 2.3846, + "step": 54704 + }, + { + "epoch": 0.6838420960524013, + "grad_norm": 5.0473551750183105, + "learning_rate": 5.496172481534259e-06, + "loss": 0.8751, + "step": 54706 + }, + { + "epoch": 0.683867096677417, + "grad_norm": 2.4645144939422607, + "learning_rate": 5.49539333101254e-06, + "loss": 0.1014, + "step": 54708 + }, + { + "epoch": 0.6838920973024326, + "grad_norm": 5.970719337463379, + "learning_rate": 5.494614214797272e-06, + "loss": 1.1816, + "step": 54710 + }, + { + "epoch": 0.6839170979274481, + "grad_norm": 0.000681419565808028, + "learning_rate": 5.493835132894383e-06, + "loss": 0.54, + "step": 54712 + }, + { + "epoch": 0.6839420985524638, + "grad_norm": 0.013491889461874962, + "learning_rate": 5.493056085309809e-06, + "loss": 0.7416, + "step": 54714 + }, + { + "epoch": 0.6839670991774794, + "grad_norm": 6.241276741027832, + "learning_rate": 5.492277072049486e-06, + "loss": 0.6313, + "step": 54716 + }, + { + "epoch": 0.6839920998024951, + "grad_norm": 4.746554374694824, + "learning_rate": 5.491498093119336e-06, + "loss": 0.4669, + "step": 54718 + }, + { + "epoch": 0.6840171004275107, + "grad_norm": 0.0025181970559060574, + "learning_rate": 5.4907191485253055e-06, + "loss": 0.2228, + "step": 54720 + }, + { + "epoch": 0.6840421010525263, + "grad_norm": 3.389467716217041, + "learning_rate": 5.489940238273316e-06, + "loss": 1.0739, + "step": 54722 + }, + { + "epoch": 0.6840671016775419, + "grad_norm": 3.8240458965301514, + "learning_rate": 5.489161362369307e-06, + "loss": 1.2725, + "step": 54724 + }, + { + "epoch": 0.6840921023025576, + "grad_norm": 2.578881025314331, + "learning_rate": 5.4883825208192045e-06, + "loss": 0.8741, + "step": 54726 + }, + { + "epoch": 0.6841171029275732, + "grad_norm": 2.5348684787750244, + "learning_rate": 5.487603713628945e-06, + "loss": 0.1541, + "step": 54728 + }, + { + "epoch": 0.6841421035525889, + "grad_norm": 10.215149879455566, + "learning_rate": 5.4868249408044585e-06, + "loss": 1.1601, + "step": 54730 + }, + { + "epoch": 0.6841671041776044, + "grad_norm": 2.047487258911133, + "learning_rate": 5.486046202351671e-06, + "loss": 0.3911, + "step": 54732 + }, + { + "epoch": 0.68419210480262, + "grad_norm": 2.474240779876709, + "learning_rate": 5.485267498276521e-06, + "loss": 1.0007, + "step": 54734 + }, + { + "epoch": 0.6842171054276357, + "grad_norm": 1.848783016204834, + "learning_rate": 5.484488828584931e-06, + "loss": 0.5282, + "step": 54736 + }, + { + "epoch": 0.6842421060526513, + "grad_norm": 4.396648406982422, + "learning_rate": 5.483710193282836e-06, + "loss": 1.0611, + "step": 54738 + }, + { + "epoch": 0.684267106677667, + "grad_norm": 0.0012399425031617284, + "learning_rate": 5.482931592376173e-06, + "loss": 0.5932, + "step": 54740 + }, + { + "epoch": 0.6842921073026825, + "grad_norm": 5.687936305999756, + "learning_rate": 5.482153025870857e-06, + "loss": 1.5175, + "step": 54742 + }, + { + "epoch": 0.6843171079276982, + "grad_norm": 3.65537428855896, + "learning_rate": 5.481374493772829e-06, + "loss": 0.6753, + "step": 54744 + }, + { + "epoch": 0.6843421085527138, + "grad_norm": 2.848417043685913, + "learning_rate": 5.480595996088008e-06, + "loss": 1.3606, + "step": 54746 + }, + { + "epoch": 0.6843671091777295, + "grad_norm": 0.06950939446687698, + "learning_rate": 5.479817532822334e-06, + "loss": 0.6016, + "step": 54748 + }, + { + "epoch": 0.6843921098027451, + "grad_norm": 7.998762607574463, + "learning_rate": 5.479039103981727e-06, + "loss": 1.0426, + "step": 54750 + }, + { + "epoch": 0.6844171104277607, + "grad_norm": 4.165536880493164, + "learning_rate": 5.4782607095721185e-06, + "loss": 1.0636, + "step": 54752 + }, + { + "epoch": 0.6844421110527763, + "grad_norm": 6.787189960479736, + "learning_rate": 5.477482349599444e-06, + "loss": 0.5341, + "step": 54754 + }, + { + "epoch": 0.684467111677792, + "grad_norm": 4.443241119384766, + "learning_rate": 5.476704024069618e-06, + "loss": 0.7946, + "step": 54756 + }, + { + "epoch": 0.6844921123028076, + "grad_norm": 0.0004898181650787592, + "learning_rate": 5.475925732988577e-06, + "loss": 0.0596, + "step": 54758 + }, + { + "epoch": 0.6845171129278232, + "grad_norm": 5.1800689697265625, + "learning_rate": 5.475147476362244e-06, + "loss": 0.4683, + "step": 54760 + }, + { + "epoch": 0.6845421135528388, + "grad_norm": 1.0890624523162842, + "learning_rate": 5.474369254196546e-06, + "loss": 0.6526, + "step": 54762 + }, + { + "epoch": 0.6845671141778544, + "grad_norm": 2.89162540435791, + "learning_rate": 5.473591066497416e-06, + "loss": 0.3272, + "step": 54764 + }, + { + "epoch": 0.6845921148028701, + "grad_norm": 3.6395037174224854, + "learning_rate": 5.472812913270776e-06, + "loss": 0.5804, + "step": 54766 + }, + { + "epoch": 0.6846171154278857, + "grad_norm": 1.0008119344711304, + "learning_rate": 5.472034794522554e-06, + "loss": 0.3103, + "step": 54768 + }, + { + "epoch": 0.6846421160529014, + "grad_norm": 6.3733649253845215, + "learning_rate": 5.47125671025867e-06, + "loss": 1.2642, + "step": 54770 + }, + { + "epoch": 0.6846671166779169, + "grad_norm": 4.870060443878174, + "learning_rate": 5.470478660485058e-06, + "loss": 0.69, + "step": 54772 + }, + { + "epoch": 0.6846921173029326, + "grad_norm": 3.2934281826019287, + "learning_rate": 5.469700645207637e-06, + "loss": 1.1682, + "step": 54774 + }, + { + "epoch": 0.6847171179279482, + "grad_norm": 3.085916757583618, + "learning_rate": 5.468922664432334e-06, + "loss": 0.9358, + "step": 54776 + }, + { + "epoch": 0.6847421185529639, + "grad_norm": 2.926927328109741, + "learning_rate": 5.468144718165081e-06, + "loss": 0.3935, + "step": 54778 + }, + { + "epoch": 0.6847671191779795, + "grad_norm": 10.231990814208984, + "learning_rate": 5.467366806411796e-06, + "loss": 1.3308, + "step": 54780 + }, + { + "epoch": 0.684792119802995, + "grad_norm": 3.697111129760742, + "learning_rate": 5.466588929178403e-06, + "loss": 0.5016, + "step": 54782 + }, + { + "epoch": 0.6848171204280107, + "grad_norm": 3.0797364711761475, + "learning_rate": 5.465811086470825e-06, + "loss": 0.575, + "step": 54784 + }, + { + "epoch": 0.6848421210530263, + "grad_norm": 0.0023875157348811626, + "learning_rate": 5.4650332782949885e-06, + "loss": 0.0284, + "step": 54786 + }, + { + "epoch": 0.684867121678042, + "grad_norm": 3.3665554523468018, + "learning_rate": 5.464255504656821e-06, + "loss": 0.9465, + "step": 54788 + }, + { + "epoch": 0.6848921223030576, + "grad_norm": 2.9230146408081055, + "learning_rate": 5.463477765562239e-06, + "loss": 0.5611, + "step": 54790 + }, + { + "epoch": 0.6849171229280732, + "grad_norm": 3.1149497032165527, + "learning_rate": 5.4627000610171765e-06, + "loss": 0.8354, + "step": 54792 + }, + { + "epoch": 0.6849421235530888, + "grad_norm": 3.3467652797698975, + "learning_rate": 5.46192239102754e-06, + "loss": 1.2698, + "step": 54794 + }, + { + "epoch": 0.6849671241781045, + "grad_norm": 3.7588307857513428, + "learning_rate": 5.461144755599266e-06, + "loss": 0.6362, + "step": 54796 + }, + { + "epoch": 0.6849921248031201, + "grad_norm": 4.50576114654541, + "learning_rate": 5.460367154738267e-06, + "loss": 1.3361, + "step": 54798 + }, + { + "epoch": 0.6850171254281358, + "grad_norm": 3.7253432273864746, + "learning_rate": 5.459589588450469e-06, + "loss": 0.6521, + "step": 54800 + }, + { + "epoch": 0.6850421260531513, + "grad_norm": 4.609163284301758, + "learning_rate": 5.4588120567418e-06, + "loss": 1.7293, + "step": 54802 + }, + { + "epoch": 0.6850671266781669, + "grad_norm": 0.0014563663862645626, + "learning_rate": 5.45803455961817e-06, + "loss": 0.3374, + "step": 54804 + }, + { + "epoch": 0.6850921273031826, + "grad_norm": 2.508244752883911, + "learning_rate": 5.457257097085517e-06, + "loss": 0.6203, + "step": 54806 + }, + { + "epoch": 0.6851171279281982, + "grad_norm": 5.289681434631348, + "learning_rate": 5.456479669149743e-06, + "loss": 1.1543, + "step": 54808 + }, + { + "epoch": 0.6851421285532139, + "grad_norm": 1.6070544719696045, + "learning_rate": 5.455702275816781e-06, + "loss": 0.0423, + "step": 54810 + }, + { + "epoch": 0.6851671291782294, + "grad_norm": 1.6334162950515747, + "learning_rate": 5.454924917092543e-06, + "loss": 0.7236, + "step": 54812 + }, + { + "epoch": 0.6851921298032451, + "grad_norm": 3.9777164459228516, + "learning_rate": 5.4541475929829545e-06, + "loss": 0.9391, + "step": 54814 + }, + { + "epoch": 0.6852171304282607, + "grad_norm": 2.6713640689849854, + "learning_rate": 5.45337030349394e-06, + "loss": 0.3869, + "step": 54816 + }, + { + "epoch": 0.6852421310532764, + "grad_norm": 2.7605371475219727, + "learning_rate": 5.452593048631413e-06, + "loss": 1.181, + "step": 54818 + }, + { + "epoch": 0.685267131678292, + "grad_norm": 2.7231931686401367, + "learning_rate": 5.4518158284012965e-06, + "loss": 0.7511, + "step": 54820 + }, + { + "epoch": 0.6852921323033075, + "grad_norm": 6.115370273590088, + "learning_rate": 5.451038642809501e-06, + "loss": 1.6798, + "step": 54822 + }, + { + "epoch": 0.6853171329283232, + "grad_norm": 4.124119758605957, + "learning_rate": 5.450261491861954e-06, + "loss": 1.1496, + "step": 54824 + }, + { + "epoch": 0.6853421335533388, + "grad_norm": 3.431138515472412, + "learning_rate": 5.449484375564575e-06, + "loss": 1.1027, + "step": 54826 + }, + { + "epoch": 0.6853671341783545, + "grad_norm": 0.001642425311729312, + "learning_rate": 5.448707293923275e-06, + "loss": 0.444, + "step": 54828 + }, + { + "epoch": 0.6853921348033701, + "grad_norm": 2.7156267166137695, + "learning_rate": 5.4479302469439824e-06, + "loss": 0.4987, + "step": 54830 + }, + { + "epoch": 0.6854171354283857, + "grad_norm": 4.21514892578125, + "learning_rate": 5.447153234632607e-06, + "loss": 1.6212, + "step": 54832 + }, + { + "epoch": 0.6854421360534013, + "grad_norm": 2.58657169342041, + "learning_rate": 5.446376256995069e-06, + "loss": 0.3822, + "step": 54834 + }, + { + "epoch": 0.685467136678417, + "grad_norm": 4.765333652496338, + "learning_rate": 5.445599314037283e-06, + "loss": 0.9767, + "step": 54836 + }, + { + "epoch": 0.6854921373034326, + "grad_norm": 0.0025882290210574865, + "learning_rate": 5.444822405765169e-06, + "loss": 0.6833, + "step": 54838 + }, + { + "epoch": 0.6855171379284483, + "grad_norm": 2.9096457958221436, + "learning_rate": 5.444045532184645e-06, + "loss": 1.1674, + "step": 54840 + }, + { + "epoch": 0.6855421385534638, + "grad_norm": 4.707595348358154, + "learning_rate": 5.443268693301624e-06, + "loss": 1.1807, + "step": 54842 + }, + { + "epoch": 0.6855671391784794, + "grad_norm": 4.067343235015869, + "learning_rate": 5.442491889122032e-06, + "loss": 0.9498, + "step": 54844 + }, + { + "epoch": 0.6855921398034951, + "grad_norm": 0.0007285888423211873, + "learning_rate": 5.441715119651768e-06, + "loss": 0.5964, + "step": 54846 + }, + { + "epoch": 0.6856171404285107, + "grad_norm": 2.8743577003479004, + "learning_rate": 5.440938384896758e-06, + "loss": 0.6053, + "step": 54848 + }, + { + "epoch": 0.6856421410535264, + "grad_norm": 1.972170114517212, + "learning_rate": 5.44016168486292e-06, + "loss": 1.3046, + "step": 54850 + }, + { + "epoch": 0.6856671416785419, + "grad_norm": 3.206460952758789, + "learning_rate": 5.439385019556163e-06, + "loss": 0.4745, + "step": 54852 + }, + { + "epoch": 0.6856921423035576, + "grad_norm": 0.0018358875531703234, + "learning_rate": 5.438608388982407e-06, + "loss": 0.7949, + "step": 54854 + }, + { + "epoch": 0.6857171429285732, + "grad_norm": 3.978154420852661, + "learning_rate": 5.437831793147561e-06, + "loss": 1.4396, + "step": 54856 + }, + { + "epoch": 0.6857421435535889, + "grad_norm": 3.661458969116211, + "learning_rate": 5.437055232057552e-06, + "loss": 1.4344, + "step": 54858 + }, + { + "epoch": 0.6857671441786045, + "grad_norm": 0.0003997894818894565, + "learning_rate": 5.436278705718276e-06, + "loss": 0.8195, + "step": 54860 + }, + { + "epoch": 0.68579214480362, + "grad_norm": 5.550076961517334, + "learning_rate": 5.435502214135657e-06, + "loss": 1.7899, + "step": 54862 + }, + { + "epoch": 0.6858171454286357, + "grad_norm": 3.302865982055664, + "learning_rate": 5.434725757315611e-06, + "loss": 1.0701, + "step": 54864 + }, + { + "epoch": 0.6858421460536513, + "grad_norm": 2.682016134262085, + "learning_rate": 5.4339493352640436e-06, + "loss": 0.4874, + "step": 54866 + }, + { + "epoch": 0.685867146678667, + "grad_norm": 5.2385149002075195, + "learning_rate": 5.433172947986877e-06, + "loss": 0.9374, + "step": 54868 + }, + { + "epoch": 0.6858921473036826, + "grad_norm": 2.120363235473633, + "learning_rate": 5.432396595490019e-06, + "loss": 0.331, + "step": 54870 + }, + { + "epoch": 0.6859171479286982, + "grad_norm": 3.3589084148406982, + "learning_rate": 5.431620277779383e-06, + "loss": 0.893, + "step": 54872 + }, + { + "epoch": 0.6859421485537138, + "grad_norm": 4.708523273468018, + "learning_rate": 5.430843994860877e-06, + "loss": 2.0485, + "step": 54874 + }, + { + "epoch": 0.6859671491787295, + "grad_norm": 3.8306567668914795, + "learning_rate": 5.430067746740417e-06, + "loss": 1.056, + "step": 54876 + }, + { + "epoch": 0.6859921498037451, + "grad_norm": 9.27209186553955, + "learning_rate": 5.429291533423919e-06, + "loss": 1.5976, + "step": 54878 + }, + { + "epoch": 0.6860171504287608, + "grad_norm": 3.2767722606658936, + "learning_rate": 5.428515354917285e-06, + "loss": 0.9885, + "step": 54880 + }, + { + "epoch": 0.6860421510537763, + "grad_norm": 5.097941875457764, + "learning_rate": 5.427739211226437e-06, + "loss": 1.468, + "step": 54882 + }, + { + "epoch": 0.686067151678792, + "grad_norm": 2.6996591091156006, + "learning_rate": 5.426963102357281e-06, + "loss": 0.6097, + "step": 54884 + }, + { + "epoch": 0.6860921523038076, + "grad_norm": 2.890523910522461, + "learning_rate": 5.426187028315721e-06, + "loss": 1.1279, + "step": 54886 + }, + { + "epoch": 0.6861171529288232, + "grad_norm": 4.670711517333984, + "learning_rate": 5.4254109891076805e-06, + "loss": 1.7733, + "step": 54888 + }, + { + "epoch": 0.6861421535538389, + "grad_norm": 3.9579520225524902, + "learning_rate": 5.424634984739058e-06, + "loss": 1.2645, + "step": 54890 + }, + { + "epoch": 0.6861671541788544, + "grad_norm": 4.764370441436768, + "learning_rate": 5.423859015215771e-06, + "loss": 0.5159, + "step": 54892 + }, + { + "epoch": 0.6861921548038701, + "grad_norm": 4.626981735229492, + "learning_rate": 5.423083080543725e-06, + "loss": 1.8998, + "step": 54894 + }, + { + "epoch": 0.6862171554288857, + "grad_norm": 4.334476470947266, + "learning_rate": 5.422307180728837e-06, + "loss": 0.0931, + "step": 54896 + }, + { + "epoch": 0.6862421560539014, + "grad_norm": 2.6834466457366943, + "learning_rate": 5.4215313157770035e-06, + "loss": 0.5854, + "step": 54898 + }, + { + "epoch": 0.686267156678917, + "grad_norm": 0.011462871916592121, + "learning_rate": 5.42075548569414e-06, + "loss": 0.7337, + "step": 54900 + }, + { + "epoch": 0.6862921573039326, + "grad_norm": 6.091381549835205, + "learning_rate": 5.419979690486159e-06, + "loss": 1.7966, + "step": 54902 + }, + { + "epoch": 0.6863171579289482, + "grad_norm": 1.964471697807312, + "learning_rate": 5.41920393015896e-06, + "loss": 1.0794, + "step": 54904 + }, + { + "epoch": 0.6863421585539639, + "grad_norm": 4.398092746734619, + "learning_rate": 5.418428204718461e-06, + "loss": 2.1518, + "step": 54906 + }, + { + "epoch": 0.6863671591789795, + "grad_norm": 2.672058582305908, + "learning_rate": 5.41765251417056e-06, + "loss": 1.3778, + "step": 54908 + }, + { + "epoch": 0.6863921598039951, + "grad_norm": 2.660921573638916, + "learning_rate": 5.416876858521174e-06, + "loss": 1.149, + "step": 54910 + }, + { + "epoch": 0.6864171604290107, + "grad_norm": 0.6995291709899902, + "learning_rate": 5.4161012377762064e-06, + "loss": 0.0125, + "step": 54912 + }, + { + "epoch": 0.6864421610540263, + "grad_norm": 4.512727737426758, + "learning_rate": 5.415325651941559e-06, + "loss": 1.6422, + "step": 54914 + }, + { + "epoch": 0.686467161679042, + "grad_norm": 1.8750590085983276, + "learning_rate": 5.4145501010231485e-06, + "loss": 1.0214, + "step": 54916 + }, + { + "epoch": 0.6864921623040576, + "grad_norm": 7.323897361755371, + "learning_rate": 5.41377458502687e-06, + "loss": 1.0277, + "step": 54918 + }, + { + "epoch": 0.6865171629290733, + "grad_norm": 7.440281391143799, + "learning_rate": 5.412999103958641e-06, + "loss": 1.4173, + "step": 54920 + }, + { + "epoch": 0.6865421635540888, + "grad_norm": 0.005564549937844276, + "learning_rate": 5.4122236578243555e-06, + "loss": 0.7882, + "step": 54922 + }, + { + "epoch": 0.6865671641791045, + "grad_norm": 4.709799289703369, + "learning_rate": 5.411448246629934e-06, + "loss": 1.5476, + "step": 54924 + }, + { + "epoch": 0.6865921648041201, + "grad_norm": 3.5663816928863525, + "learning_rate": 5.4106728703812705e-06, + "loss": 1.1776, + "step": 54926 + }, + { + "epoch": 0.6866171654291358, + "grad_norm": 3.0642735958099365, + "learning_rate": 5.40989752908427e-06, + "loss": 0.5429, + "step": 54928 + }, + { + "epoch": 0.6866421660541514, + "grad_norm": 0.0005118816625326872, + "learning_rate": 5.409122222744846e-06, + "loss": 0.4053, + "step": 54930 + }, + { + "epoch": 0.6866671666791669, + "grad_norm": 4.149877071380615, + "learning_rate": 5.408346951368893e-06, + "loss": 1.1387, + "step": 54932 + }, + { + "epoch": 0.6866921673041826, + "grad_norm": 4.171276569366455, + "learning_rate": 5.407571714962325e-06, + "loss": 1.189, + "step": 54934 + }, + { + "epoch": 0.6867171679291982, + "grad_norm": 3.659334659576416, + "learning_rate": 5.4067965135310405e-06, + "loss": 1.2127, + "step": 54936 + }, + { + "epoch": 0.6867421685542139, + "grad_norm": 2.8854362964630127, + "learning_rate": 5.406021347080941e-06, + "loss": 0.5453, + "step": 54938 + }, + { + "epoch": 0.6867671691792295, + "grad_norm": 0.00235704961232841, + "learning_rate": 5.405246215617937e-06, + "loss": 0.0595, + "step": 54940 + }, + { + "epoch": 0.6867921698042451, + "grad_norm": 0.006609465926885605, + "learning_rate": 5.404471119147925e-06, + "loss": 0.7194, + "step": 54942 + }, + { + "epoch": 0.6868171704292607, + "grad_norm": 5.042086124420166, + "learning_rate": 5.403696057676812e-06, + "loss": 1.7087, + "step": 54944 + }, + { + "epoch": 0.6868421710542764, + "grad_norm": 3.443516969680786, + "learning_rate": 5.4029210312105e-06, + "loss": 1.1166, + "step": 54946 + }, + { + "epoch": 0.686867171679292, + "grad_norm": 5.293257236480713, + "learning_rate": 5.402146039754893e-06, + "loss": 0.9002, + "step": 54948 + }, + { + "epoch": 0.6868921723043077, + "grad_norm": 3.746778964996338, + "learning_rate": 5.4013710833158915e-06, + "loss": 1.4776, + "step": 54950 + }, + { + "epoch": 0.6869171729293232, + "grad_norm": 3.2397446632385254, + "learning_rate": 5.400596161899394e-06, + "loss": 1.4757, + "step": 54952 + }, + { + "epoch": 0.6869421735543388, + "grad_norm": 7.495582103729248, + "learning_rate": 5.39982127551131e-06, + "loss": 1.3373, + "step": 54954 + }, + { + "epoch": 0.6869671741793545, + "grad_norm": 4.099538803100586, + "learning_rate": 5.399046424157532e-06, + "loss": 0.8242, + "step": 54956 + }, + { + "epoch": 0.6869921748043701, + "grad_norm": 2.449314832687378, + "learning_rate": 5.39827160784397e-06, + "loss": 1.0136, + "step": 54958 + }, + { + "epoch": 0.6870171754293858, + "grad_norm": 3.2259011268615723, + "learning_rate": 5.3974968265765174e-06, + "loss": 1.1217, + "step": 54960 + }, + { + "epoch": 0.6870421760544013, + "grad_norm": 3.312499523162842, + "learning_rate": 5.396722080361081e-06, + "loss": 1.0395, + "step": 54962 + }, + { + "epoch": 0.687067176679417, + "grad_norm": 1.5011146068572998, + "learning_rate": 5.3959473692035595e-06, + "loss": 0.2268, + "step": 54964 + }, + { + "epoch": 0.6870921773044326, + "grad_norm": 6.28914213180542, + "learning_rate": 5.395172693109847e-06, + "loss": 0.4353, + "step": 54966 + }, + { + "epoch": 0.6871171779294483, + "grad_norm": 3.8844985961914062, + "learning_rate": 5.3943980520858516e-06, + "loss": 0.6467, + "step": 54968 + }, + { + "epoch": 0.6871421785544639, + "grad_norm": 2.7811946868896484, + "learning_rate": 5.393623446137466e-06, + "loss": 0.6437, + "step": 54970 + }, + { + "epoch": 0.6871671791794794, + "grad_norm": 1.9273079633712769, + "learning_rate": 5.392848875270596e-06, + "loss": 0.1158, + "step": 54972 + }, + { + "epoch": 0.6871921798044951, + "grad_norm": 1.9944132566452026, + "learning_rate": 5.392074339491136e-06, + "loss": 1.2959, + "step": 54974 + }, + { + "epoch": 0.6872171804295107, + "grad_norm": 4.699462413787842, + "learning_rate": 5.391299838804988e-06, + "loss": 0.7643, + "step": 54976 + }, + { + "epoch": 0.6872421810545264, + "grad_norm": 3.3788716793060303, + "learning_rate": 5.390525373218048e-06, + "loss": 1.8825, + "step": 54978 + }, + { + "epoch": 0.687267181679542, + "grad_norm": 0.33896586298942566, + "learning_rate": 5.389750942736212e-06, + "loss": 0.0132, + "step": 54980 + }, + { + "epoch": 0.6872921823045576, + "grad_norm": 0.0008136416436173022, + "learning_rate": 5.388976547365386e-06, + "loss": 0.1775, + "step": 54982 + }, + { + "epoch": 0.6873171829295732, + "grad_norm": 7.493406772613525, + "learning_rate": 5.388202187111457e-06, + "loss": 2.0132, + "step": 54984 + }, + { + "epoch": 0.6873421835545889, + "grad_norm": 5.685007572174072, + "learning_rate": 5.387427861980332e-06, + "loss": 1.5414, + "step": 54986 + }, + { + "epoch": 0.6873671841796045, + "grad_norm": 4.789431095123291, + "learning_rate": 5.3866535719779045e-06, + "loss": 1.4085, + "step": 54988 + }, + { + "epoch": 0.6873921848046202, + "grad_norm": 0.9693359136581421, + "learning_rate": 5.3858793171100675e-06, + "loss": 0.6178, + "step": 54990 + }, + { + "epoch": 0.6874171854296357, + "grad_norm": 1.865880012512207, + "learning_rate": 5.385105097382724e-06, + "loss": 0.3945, + "step": 54992 + }, + { + "epoch": 0.6874421860546513, + "grad_norm": 4.376418113708496, + "learning_rate": 5.384330912801765e-06, + "loss": 0.1905, + "step": 54994 + }, + { + "epoch": 0.687467186679667, + "grad_norm": 3.0619325637817383, + "learning_rate": 5.383556763373092e-06, + "loss": 0.6237, + "step": 54996 + }, + { + "epoch": 0.6874921873046826, + "grad_norm": 2.110192060470581, + "learning_rate": 5.382782649102594e-06, + "loss": 0.1711, + "step": 54998 + }, + { + "epoch": 0.6875171879296983, + "grad_norm": 3.0596964359283447, + "learning_rate": 5.382008569996175e-06, + "loss": 1.02, + "step": 55000 + }, + { + "epoch": 0.6875421885547138, + "grad_norm": 5.536541938781738, + "learning_rate": 5.381234526059725e-06, + "loss": 0.9486, + "step": 55002 + }, + { + "epoch": 0.6875671891797295, + "grad_norm": 4.4355316162109375, + "learning_rate": 5.380460517299135e-06, + "loss": 1.4988, + "step": 55004 + }, + { + "epoch": 0.6875921898047451, + "grad_norm": 3.3684072494506836, + "learning_rate": 5.3796865437203095e-06, + "loss": 0.3285, + "step": 55006 + }, + { + "epoch": 0.6876171904297608, + "grad_norm": 4.582711219787598, + "learning_rate": 5.378912605329134e-06, + "loss": 1.4943, + "step": 55008 + }, + { + "epoch": 0.6876421910547764, + "grad_norm": 4.126511096954346, + "learning_rate": 5.37813870213151e-06, + "loss": 0.9734, + "step": 55010 + }, + { + "epoch": 0.687667191679792, + "grad_norm": 3.6910898685455322, + "learning_rate": 5.377364834133325e-06, + "loss": 1.739, + "step": 55012 + }, + { + "epoch": 0.6876921923048076, + "grad_norm": 0.4336293637752533, + "learning_rate": 5.376591001340479e-06, + "loss": 0.5136, + "step": 55014 + }, + { + "epoch": 0.6877171929298233, + "grad_norm": 4.747777938842773, + "learning_rate": 5.375817203758862e-06, + "loss": 1.8215, + "step": 55016 + }, + { + "epoch": 0.6877421935548389, + "grad_norm": 3.3741989135742188, + "learning_rate": 5.375043441394363e-06, + "loss": 0.9458, + "step": 55018 + }, + { + "epoch": 0.6877671941798545, + "grad_norm": 2.0637757778167725, + "learning_rate": 5.374269714252885e-06, + "loss": 0.2143, + "step": 55020 + }, + { + "epoch": 0.6877921948048701, + "grad_norm": 1.1433392763137817, + "learning_rate": 5.373496022340308e-06, + "loss": 0.0702, + "step": 55022 + }, + { + "epoch": 0.6878171954298857, + "grad_norm": 2.669438362121582, + "learning_rate": 5.372722365662537e-06, + "loss": 0.464, + "step": 55024 + }, + { + "epoch": 0.6878421960549014, + "grad_norm": 4.725068092346191, + "learning_rate": 5.371948744225452e-06, + "loss": 1.187, + "step": 55026 + }, + { + "epoch": 0.687867196679917, + "grad_norm": 3.3250670433044434, + "learning_rate": 5.371175158034958e-06, + "loss": 1.131, + "step": 55028 + }, + { + "epoch": 0.6878921973049327, + "grad_norm": 2.3591058254241943, + "learning_rate": 5.370401607096938e-06, + "loss": 1.4445, + "step": 55030 + }, + { + "epoch": 0.6879171979299482, + "grad_norm": 4.669010162353516, + "learning_rate": 5.36962809141728e-06, + "loss": 1.1295, + "step": 55032 + }, + { + "epoch": 0.6879421985549639, + "grad_norm": 0.0006409239722415805, + "learning_rate": 5.368854611001883e-06, + "loss": 0.0147, + "step": 55034 + }, + { + "epoch": 0.6879671991799795, + "grad_norm": 0.4553995728492737, + "learning_rate": 5.368081165856632e-06, + "loss": 0.2141, + "step": 55036 + }, + { + "epoch": 0.6879921998049952, + "grad_norm": 3.9515702724456787, + "learning_rate": 5.367307755987423e-06, + "loss": 1.1312, + "step": 55038 + }, + { + "epoch": 0.6880172004300108, + "grad_norm": 1.5809327363967896, + "learning_rate": 5.366534381400144e-06, + "loss": 0.796, + "step": 55040 + }, + { + "epoch": 0.6880422010550263, + "grad_norm": 2.6771798133850098, + "learning_rate": 5.365761042100679e-06, + "loss": 1.0824, + "step": 55042 + }, + { + "epoch": 0.688067201680042, + "grad_norm": 2.2832913398742676, + "learning_rate": 5.364987738094927e-06, + "loss": 0.6329, + "step": 55044 + }, + { + "epoch": 0.6880922023050576, + "grad_norm": 0.05141296610236168, + "learning_rate": 5.364214469388769e-06, + "loss": 0.709, + "step": 55046 + }, + { + "epoch": 0.6881172029300733, + "grad_norm": 3.3544061183929443, + "learning_rate": 5.363441235988102e-06, + "loss": 1.4532, + "step": 55048 + }, + { + "epoch": 0.6881422035550889, + "grad_norm": 0.00281059299595654, + "learning_rate": 5.3626680378988065e-06, + "loss": 0.0913, + "step": 55050 + }, + { + "epoch": 0.6881672041801045, + "grad_norm": 2.4196910858154297, + "learning_rate": 5.361894875126779e-06, + "loss": 0.9996, + "step": 55052 + }, + { + "epoch": 0.6881922048051201, + "grad_norm": 0.7865378260612488, + "learning_rate": 5.3611217476779055e-06, + "loss": 0.2948, + "step": 55054 + }, + { + "epoch": 0.6882172054301358, + "grad_norm": 3.084904909133911, + "learning_rate": 5.360348655558068e-06, + "loss": 1.0282, + "step": 55056 + }, + { + "epoch": 0.6882422060551514, + "grad_norm": 2.7730841636657715, + "learning_rate": 5.359575598773163e-06, + "loss": 0.668, + "step": 55058 + }, + { + "epoch": 0.688267206680167, + "grad_norm": 1.0815685987472534, + "learning_rate": 5.3588025773290695e-06, + "loss": 0.7383, + "step": 55060 + }, + { + "epoch": 0.6882922073051826, + "grad_norm": 4.450870990753174, + "learning_rate": 5.358029591231685e-06, + "loss": 0.7003, + "step": 55062 + }, + { + "epoch": 0.6883172079301982, + "grad_norm": 4.682647228240967, + "learning_rate": 5.357256640486885e-06, + "loss": 0.3212, + "step": 55064 + }, + { + "epoch": 0.6883422085552139, + "grad_norm": 11.620780944824219, + "learning_rate": 5.356483725100568e-06, + "loss": 1.8641, + "step": 55066 + }, + { + "epoch": 0.6883672091802295, + "grad_norm": 2.3373770713806152, + "learning_rate": 5.355710845078612e-06, + "loss": 0.652, + "step": 55068 + }, + { + "epoch": 0.6883922098052452, + "grad_norm": 0.0008800793439149857, + "learning_rate": 5.354938000426903e-06, + "loss": 0.2479, + "step": 55070 + }, + { + "epoch": 0.6884172104302607, + "grad_norm": 4.182572364807129, + "learning_rate": 5.3541651911513326e-06, + "loss": 0.8687, + "step": 55072 + }, + { + "epoch": 0.6884422110552764, + "grad_norm": 2.4754326343536377, + "learning_rate": 5.353392417257779e-06, + "loss": 0.8445, + "step": 55074 + }, + { + "epoch": 0.688467211680292, + "grad_norm": 5.013744354248047, + "learning_rate": 5.3526196787521354e-06, + "loss": 1.7587, + "step": 55076 + }, + { + "epoch": 0.6884922123053077, + "grad_norm": 2.9685864448547363, + "learning_rate": 5.351846975640279e-06, + "loss": 0.8561, + "step": 55078 + }, + { + "epoch": 0.6885172129303233, + "grad_norm": 2.3220503330230713, + "learning_rate": 5.351074307928104e-06, + "loss": 1.1149, + "step": 55080 + }, + { + "epoch": 0.6885422135553388, + "grad_norm": 18.37394142150879, + "learning_rate": 5.350301675621487e-06, + "loss": 1.1843, + "step": 55082 + }, + { + "epoch": 0.6885672141803545, + "grad_norm": 0.0020175003446638584, + "learning_rate": 5.349529078726315e-06, + "loss": 0.3899, + "step": 55084 + }, + { + "epoch": 0.6885922148053701, + "grad_norm": 0.6849501729011536, + "learning_rate": 5.348756517248472e-06, + "loss": 0.0482, + "step": 55086 + }, + { + "epoch": 0.6886172154303858, + "grad_norm": 4.170806407928467, + "learning_rate": 5.3479839911938405e-06, + "loss": 1.5857, + "step": 55088 + }, + { + "epoch": 0.6886422160554014, + "grad_norm": 1.8435479402542114, + "learning_rate": 5.3472115005683075e-06, + "loss": 1.0676, + "step": 55090 + }, + { + "epoch": 0.688667216680417, + "grad_norm": 6.725037574768066, + "learning_rate": 5.346439045377754e-06, + "loss": 0.8771, + "step": 55092 + }, + { + "epoch": 0.6886922173054326, + "grad_norm": 11.316161155700684, + "learning_rate": 5.345666625628059e-06, + "loss": 1.3485, + "step": 55094 + }, + { + "epoch": 0.6887172179304483, + "grad_norm": 2.4848904609680176, + "learning_rate": 5.3448942413251125e-06, + "loss": 0.7008, + "step": 55096 + }, + { + "epoch": 0.6887422185554639, + "grad_norm": 1.627326488494873, + "learning_rate": 5.3441218924747895e-06, + "loss": 0.5578, + "step": 55098 + }, + { + "epoch": 0.6887672191804796, + "grad_norm": 2.8466687202453613, + "learning_rate": 5.343349579082979e-06, + "loss": 0.6437, + "step": 55100 + }, + { + "epoch": 0.6887922198054951, + "grad_norm": 10.809813499450684, + "learning_rate": 5.342577301155556e-06, + "loss": 0.5851, + "step": 55102 + }, + { + "epoch": 0.6888172204305107, + "grad_norm": 0.0011802446097135544, + "learning_rate": 5.341805058698406e-06, + "loss": 0.7944, + "step": 55104 + }, + { + "epoch": 0.6888422210555264, + "grad_norm": 6.175104141235352, + "learning_rate": 5.341032851717418e-06, + "loss": 0.7605, + "step": 55106 + }, + { + "epoch": 0.688867221680542, + "grad_norm": 3.063082218170166, + "learning_rate": 5.340260680218457e-06, + "loss": 1.3008, + "step": 55108 + }, + { + "epoch": 0.6888922223055577, + "grad_norm": 2.895378351211548, + "learning_rate": 5.339488544207415e-06, + "loss": 1.429, + "step": 55110 + }, + { + "epoch": 0.6889172229305732, + "grad_norm": 3.7549080848693848, + "learning_rate": 5.3387164436901665e-06, + "loss": 1.4703, + "step": 55112 + }, + { + "epoch": 0.6889422235555889, + "grad_norm": 7.712357997894287, + "learning_rate": 5.337944378672597e-06, + "loss": 1.2496, + "step": 55114 + }, + { + "epoch": 0.6889672241806045, + "grad_norm": 3.827498197555542, + "learning_rate": 5.3371723491605795e-06, + "loss": 1.1162, + "step": 55116 + }, + { + "epoch": 0.6889922248056202, + "grad_norm": 6.893155574798584, + "learning_rate": 5.336400355160003e-06, + "loss": 0.7824, + "step": 55118 + }, + { + "epoch": 0.6890172254306358, + "grad_norm": 2.910818099975586, + "learning_rate": 5.3356283966767415e-06, + "loss": 1.2019, + "step": 55120 + }, + { + "epoch": 0.6890422260556514, + "grad_norm": 4.003190040588379, + "learning_rate": 5.3348564737166715e-06, + "loss": 0.9257, + "step": 55122 + }, + { + "epoch": 0.689067226680667, + "grad_norm": 3.5183122158050537, + "learning_rate": 5.334084586285677e-06, + "loss": 1.6397, + "step": 55124 + }, + { + "epoch": 0.6890922273056826, + "grad_norm": 4.476487636566162, + "learning_rate": 5.333312734389633e-06, + "loss": 1.9139, + "step": 55126 + }, + { + "epoch": 0.6891172279306983, + "grad_norm": 8.766759872436523, + "learning_rate": 5.3325409180344215e-06, + "loss": 0.9468, + "step": 55128 + }, + { + "epoch": 0.6891422285557139, + "grad_norm": 5.350936412811279, + "learning_rate": 5.331769137225915e-06, + "loss": 0.5626, + "step": 55130 + }, + { + "epoch": 0.6891672291807295, + "grad_norm": 0.9495681524276733, + "learning_rate": 5.330997391969998e-06, + "loss": 0.0866, + "step": 55132 + }, + { + "epoch": 0.6891922298057451, + "grad_norm": 3.6884512901306152, + "learning_rate": 5.330225682272545e-06, + "loss": 1.5116, + "step": 55134 + }, + { + "epoch": 0.6892172304307608, + "grad_norm": 3.550391912460327, + "learning_rate": 5.329454008139429e-06, + "loss": 1.3875, + "step": 55136 + }, + { + "epoch": 0.6892422310557764, + "grad_norm": 4.710766792297363, + "learning_rate": 5.328682369576534e-06, + "loss": 1.8308, + "step": 55138 + }, + { + "epoch": 0.6892672316807921, + "grad_norm": 0.6108308434486389, + "learning_rate": 5.32791076658973e-06, + "loss": 0.7834, + "step": 55140 + }, + { + "epoch": 0.6892922323058076, + "grad_norm": 2.615379571914673, + "learning_rate": 5.327139199184898e-06, + "loss": 1.0143, + "step": 55142 + }, + { + "epoch": 0.6893172329308233, + "grad_norm": 3.5805397033691406, + "learning_rate": 5.32636766736792e-06, + "loss": 1.0343, + "step": 55144 + }, + { + "epoch": 0.6893422335558389, + "grad_norm": 2.132350444793701, + "learning_rate": 5.3255961711446555e-06, + "loss": 0.3125, + "step": 55146 + }, + { + "epoch": 0.6893672341808545, + "grad_norm": 3.5335183143615723, + "learning_rate": 5.324824710520996e-06, + "loss": 1.1414, + "step": 55148 + }, + { + "epoch": 0.6893922348058702, + "grad_norm": 7.8344573974609375, + "learning_rate": 5.324053285502806e-06, + "loss": 1.869, + "step": 55150 + }, + { + "epoch": 0.6894172354308857, + "grad_norm": 3.5607621669769287, + "learning_rate": 5.323281896095968e-06, + "loss": 0.5783, + "step": 55152 + }, + { + "epoch": 0.6894422360559014, + "grad_norm": 2.990442991256714, + "learning_rate": 5.3225105423063496e-06, + "loss": 0.7079, + "step": 55154 + }, + { + "epoch": 0.689467236680917, + "grad_norm": 6.702207088470459, + "learning_rate": 5.32173922413983e-06, + "loss": 1.9532, + "step": 55156 + }, + { + "epoch": 0.6894922373059327, + "grad_norm": 2.3370087146759033, + "learning_rate": 5.3209679416022904e-06, + "loss": 0.9725, + "step": 55158 + }, + { + "epoch": 0.6895172379309483, + "grad_norm": 5.1468400955200195, + "learning_rate": 5.3201966946995895e-06, + "loss": 1.306, + "step": 55160 + }, + { + "epoch": 0.6895422385559639, + "grad_norm": 0.0023955225478857756, + "learning_rate": 5.319425483437613e-06, + "loss": 0.0001, + "step": 55162 + }, + { + "epoch": 0.6895672391809795, + "grad_norm": 9.408884048461914, + "learning_rate": 5.318654307822227e-06, + "loss": 1.3677, + "step": 55164 + }, + { + "epoch": 0.6895922398059952, + "grad_norm": 2.9719910621643066, + "learning_rate": 5.3178831678593064e-06, + "loss": 0.9338, + "step": 55166 + }, + { + "epoch": 0.6896172404310108, + "grad_norm": 0.013785496354103088, + "learning_rate": 5.31711206355473e-06, + "loss": 0.6447, + "step": 55168 + }, + { + "epoch": 0.6896422410560265, + "grad_norm": 11.016919136047363, + "learning_rate": 5.316340994914367e-06, + "loss": 0.919, + "step": 55170 + }, + { + "epoch": 0.689667241681042, + "grad_norm": 5.373638153076172, + "learning_rate": 5.315569961944087e-06, + "loss": 1.4782, + "step": 55172 + }, + { + "epoch": 0.6896922423060576, + "grad_norm": 5.343466758728027, + "learning_rate": 5.31479896464976e-06, + "loss": 1.7976, + "step": 55174 + }, + { + "epoch": 0.6897172429310733, + "grad_norm": 6.714556694030762, + "learning_rate": 5.3140280030372655e-06, + "loss": 0.3374, + "step": 55176 + }, + { + "epoch": 0.6897422435560889, + "grad_norm": 1.8444511890411377, + "learning_rate": 5.313257077112468e-06, + "loss": 0.0883, + "step": 55178 + }, + { + "epoch": 0.6897672441811046, + "grad_norm": 3.1282382011413574, + "learning_rate": 5.31248618688124e-06, + "loss": 0.4366, + "step": 55180 + }, + { + "epoch": 0.6897922448061201, + "grad_norm": 0.0011266714427620173, + "learning_rate": 5.311715332349461e-06, + "loss": 0.0711, + "step": 55182 + }, + { + "epoch": 0.6898172454311358, + "grad_norm": 0.037317223846912384, + "learning_rate": 5.310944513522993e-06, + "loss": 0.3999, + "step": 55184 + }, + { + "epoch": 0.6898422460561514, + "grad_norm": 3.0647287368774414, + "learning_rate": 5.310173730407708e-06, + "loss": 0.5844, + "step": 55186 + }, + { + "epoch": 0.6898672466811671, + "grad_norm": 3.742111921310425, + "learning_rate": 5.309402983009475e-06, + "loss": 1.1317, + "step": 55188 + }, + { + "epoch": 0.6898922473061827, + "grad_norm": 2.9220168590545654, + "learning_rate": 5.308632271334167e-06, + "loss": 0.4166, + "step": 55190 + }, + { + "epoch": 0.6899172479311982, + "grad_norm": 2.947657823562622, + "learning_rate": 5.307861595387651e-06, + "loss": 1.18, + "step": 55192 + }, + { + "epoch": 0.6899422485562139, + "grad_norm": 3.7666847705841064, + "learning_rate": 5.307090955175796e-06, + "loss": 0.9484, + "step": 55194 + }, + { + "epoch": 0.6899672491812295, + "grad_norm": 0.000643273931927979, + "learning_rate": 5.306320350704477e-06, + "loss": 0.3791, + "step": 55196 + }, + { + "epoch": 0.6899922498062452, + "grad_norm": 4.340051174163818, + "learning_rate": 5.3055497819795575e-06, + "loss": 1.3488, + "step": 55198 + }, + { + "epoch": 0.6900172504312608, + "grad_norm": 2.075754404067993, + "learning_rate": 5.304779249006908e-06, + "loss": 2.027, + "step": 55200 + }, + { + "epoch": 0.6900422510562764, + "grad_norm": 0.0009497070568613708, + "learning_rate": 5.304008751792391e-06, + "loss": 0.6336, + "step": 55202 + }, + { + "epoch": 0.690067251681292, + "grad_norm": 4.758893013000488, + "learning_rate": 5.30323829034188e-06, + "loss": 1.6184, + "step": 55204 + }, + { + "epoch": 0.6900922523063077, + "grad_norm": 4.172976493835449, + "learning_rate": 5.302467864661245e-06, + "loss": 0.5867, + "step": 55206 + }, + { + "epoch": 0.6901172529313233, + "grad_norm": 2.355771541595459, + "learning_rate": 5.301697474756347e-06, + "loss": 0.3666, + "step": 55208 + }, + { + "epoch": 0.690142253556339, + "grad_norm": 2.3592329025268555, + "learning_rate": 5.300927120633066e-06, + "loss": 0.5382, + "step": 55210 + }, + { + "epoch": 0.6901672541813545, + "grad_norm": 6.475348949432373, + "learning_rate": 5.30015680229725e-06, + "loss": 1.065, + "step": 55212 + }, + { + "epoch": 0.6901922548063701, + "grad_norm": 3.108582019805908, + "learning_rate": 5.299386519754779e-06, + "loss": 0.6408, + "step": 55214 + }, + { + "epoch": 0.6902172554313858, + "grad_norm": 2.9294064044952393, + "learning_rate": 5.298616273011512e-06, + "loss": 1.0499, + "step": 55216 + }, + { + "epoch": 0.6902422560564014, + "grad_norm": 3.0904548168182373, + "learning_rate": 5.29784606207332e-06, + "loss": 0.9472, + "step": 55218 + }, + { + "epoch": 0.6902672566814171, + "grad_norm": 11.509726524353027, + "learning_rate": 5.297075886946069e-06, + "loss": 1.1803, + "step": 55220 + }, + { + "epoch": 0.6902922573064326, + "grad_norm": 1.0173757076263428, + "learning_rate": 5.2963057476356205e-06, + "loss": 0.8098, + "step": 55222 + }, + { + "epoch": 0.6903172579314483, + "grad_norm": 3.6352429389953613, + "learning_rate": 5.29553564414785e-06, + "loss": 0.7045, + "step": 55224 + }, + { + "epoch": 0.6903422585564639, + "grad_norm": 0.0019856819417327642, + "learning_rate": 5.294765576488606e-06, + "loss": 1.047, + "step": 55226 + }, + { + "epoch": 0.6903672591814796, + "grad_norm": 9.247587203979492, + "learning_rate": 5.293995544663763e-06, + "loss": 1.1425, + "step": 55228 + }, + { + "epoch": 0.6903922598064952, + "grad_norm": 0.8780779838562012, + "learning_rate": 5.293225548679189e-06, + "loss": 1.1364, + "step": 55230 + }, + { + "epoch": 0.6904172604315107, + "grad_norm": 5.5409464836120605, + "learning_rate": 5.292455588540738e-06, + "loss": 1.6533, + "step": 55232 + }, + { + "epoch": 0.6904422610565264, + "grad_norm": 3.613612174987793, + "learning_rate": 5.2916856642542846e-06, + "loss": 0.629, + "step": 55234 + }, + { + "epoch": 0.690467261681542, + "grad_norm": 1.7676440477371216, + "learning_rate": 5.290915775825687e-06, + "loss": 0.3764, + "step": 55236 + }, + { + "epoch": 0.6904922623065577, + "grad_norm": 1.7016264200210571, + "learning_rate": 5.2901459232608084e-06, + "loss": 0.1273, + "step": 55238 + }, + { + "epoch": 0.6905172629315733, + "grad_norm": 0.0009689938160590827, + "learning_rate": 5.289376106565509e-06, + "loss": 0.692, + "step": 55240 + }, + { + "epoch": 0.6905422635565889, + "grad_norm": 2.3264358043670654, + "learning_rate": 5.288606325745655e-06, + "loss": 0.1415, + "step": 55242 + }, + { + "epoch": 0.6905672641816045, + "grad_norm": 5.810711860656738, + "learning_rate": 5.287836580807113e-06, + "loss": 1.7547, + "step": 55244 + }, + { + "epoch": 0.6905922648066202, + "grad_norm": 3.502610206604004, + "learning_rate": 5.2870668717557376e-06, + "loss": 0.6022, + "step": 55246 + }, + { + "epoch": 0.6906172654316358, + "grad_norm": 3.0925323963165283, + "learning_rate": 5.286297198597399e-06, + "loss": 0.7912, + "step": 55248 + }, + { + "epoch": 0.6906422660566515, + "grad_norm": 3.1766750812530518, + "learning_rate": 5.285527561337953e-06, + "loss": 1.5782, + "step": 55250 + }, + { + "epoch": 0.690667266681667, + "grad_norm": 1.7689411640167236, + "learning_rate": 5.284757959983257e-06, + "loss": 0.529, + "step": 55252 + }, + { + "epoch": 0.6906922673066827, + "grad_norm": 2.556462287902832, + "learning_rate": 5.2839883945391845e-06, + "loss": 1.0722, + "step": 55254 + }, + { + "epoch": 0.6907172679316983, + "grad_norm": 3.970520257949829, + "learning_rate": 5.283218865011585e-06, + "loss": 0.8338, + "step": 55256 + }, + { + "epoch": 0.690742268556714, + "grad_norm": 0.0007803558255545795, + "learning_rate": 5.282449371406326e-06, + "loss": 1.5606, + "step": 55258 + }, + { + "epoch": 0.6907672691817296, + "grad_norm": 0.9454952478408813, + "learning_rate": 5.281679913729263e-06, + "loss": 0.1561, + "step": 55260 + }, + { + "epoch": 0.6907922698067451, + "grad_norm": 5.198459148406982, + "learning_rate": 5.280910491986266e-06, + "loss": 0.7808, + "step": 55262 + }, + { + "epoch": 0.6908172704317608, + "grad_norm": 2.910081148147583, + "learning_rate": 5.280141106183179e-06, + "loss": 0.1808, + "step": 55264 + }, + { + "epoch": 0.6908422710567764, + "grad_norm": 0.0024767874274402857, + "learning_rate": 5.279371756325871e-06, + "loss": 0.2097, + "step": 55266 + }, + { + "epoch": 0.6908672716817921, + "grad_norm": 2.3320834636688232, + "learning_rate": 5.278602442420202e-06, + "loss": 0.9736, + "step": 55268 + }, + { + "epoch": 0.6908922723068077, + "grad_norm": 0.0006555502186529338, + "learning_rate": 5.277833164472027e-06, + "loss": 0.2363, + "step": 55270 + }, + { + "epoch": 0.6909172729318233, + "grad_norm": 0.6748582124710083, + "learning_rate": 5.277063922487211e-06, + "loss": 0.8593, + "step": 55272 + }, + { + "epoch": 0.6909422735568389, + "grad_norm": 0.43092411756515503, + "learning_rate": 5.276294716471604e-06, + "loss": 0.6465, + "step": 55274 + }, + { + "epoch": 0.6909672741818546, + "grad_norm": 0.7203629016876221, + "learning_rate": 5.275525546431075e-06, + "loss": 0.3042, + "step": 55276 + }, + { + "epoch": 0.6909922748068702, + "grad_norm": 4.556748390197754, + "learning_rate": 5.2747564123714686e-06, + "loss": 0.9934, + "step": 55278 + }, + { + "epoch": 0.6910172754318858, + "grad_norm": 3.0179226398468018, + "learning_rate": 5.27398731429865e-06, + "loss": 0.311, + "step": 55280 + }, + { + "epoch": 0.6910422760569014, + "grad_norm": 3.3229503631591797, + "learning_rate": 5.273218252218477e-06, + "loss": 0.6243, + "step": 55282 + }, + { + "epoch": 0.691067276681917, + "grad_norm": 3.1923739910125732, + "learning_rate": 5.2724492261368045e-06, + "loss": 0.5097, + "step": 55284 + }, + { + "epoch": 0.6910922773069327, + "grad_norm": 0.36493316292762756, + "learning_rate": 5.271680236059492e-06, + "loss": 0.6445, + "step": 55286 + }, + { + "epoch": 0.6911172779319483, + "grad_norm": 4.381524085998535, + "learning_rate": 5.270911281992394e-06, + "loss": 0.7966, + "step": 55288 + }, + { + "epoch": 0.691142278556964, + "grad_norm": 3.152608633041382, + "learning_rate": 5.270142363941364e-06, + "loss": 1.6074, + "step": 55290 + }, + { + "epoch": 0.6911672791819795, + "grad_norm": 3.109272003173828, + "learning_rate": 5.269373481912265e-06, + "loss": 0.8561, + "step": 55292 + }, + { + "epoch": 0.6911922798069952, + "grad_norm": 2.859248399734497, + "learning_rate": 5.268604635910946e-06, + "loss": 0.9386, + "step": 55294 + }, + { + "epoch": 0.6912172804320108, + "grad_norm": 3.4073357582092285, + "learning_rate": 5.267835825943266e-06, + "loss": 0.3902, + "step": 55296 + }, + { + "epoch": 0.6912422810570265, + "grad_norm": 5.659832954406738, + "learning_rate": 5.267067052015076e-06, + "loss": 1.1231, + "step": 55298 + }, + { + "epoch": 0.6912672816820421, + "grad_norm": 1.4814752340316772, + "learning_rate": 5.266298314132239e-06, + "loss": 0.7591, + "step": 55300 + }, + { + "epoch": 0.6912922823070576, + "grad_norm": 6.691812515258789, + "learning_rate": 5.265529612300604e-06, + "loss": 1.3867, + "step": 55302 + }, + { + "epoch": 0.6913172829320733, + "grad_norm": 3.0200865268707275, + "learning_rate": 5.264760946526022e-06, + "loss": 0.8091, + "step": 55304 + }, + { + "epoch": 0.6913422835570889, + "grad_norm": 4.8287353515625, + "learning_rate": 5.263992316814355e-06, + "loss": 0.745, + "step": 55306 + }, + { + "epoch": 0.6913672841821046, + "grad_norm": 0.0008133909432217479, + "learning_rate": 5.263223723171449e-06, + "loss": 0.0869, + "step": 55308 + }, + { + "epoch": 0.6913922848071202, + "grad_norm": 4.156511306762695, + "learning_rate": 5.262455165603165e-06, + "loss": 1.6843, + "step": 55310 + }, + { + "epoch": 0.6914172854321358, + "grad_norm": 0.003968777135014534, + "learning_rate": 5.261686644115349e-06, + "loss": 0.7367, + "step": 55312 + }, + { + "epoch": 0.6914422860571514, + "grad_norm": 2.4402363300323486, + "learning_rate": 5.2609181587138615e-06, + "loss": 0.3686, + "step": 55314 + }, + { + "epoch": 0.6914672866821671, + "grad_norm": 3.5384681224823, + "learning_rate": 5.260149709404549e-06, + "loss": 0.254, + "step": 55316 + }, + { + "epoch": 0.6914922873071827, + "grad_norm": 3.0699915885925293, + "learning_rate": 5.259381296193264e-06, + "loss": 0.9254, + "step": 55318 + }, + { + "epoch": 0.6915172879321984, + "grad_norm": 5.018610954284668, + "learning_rate": 5.258612919085866e-06, + "loss": 1.282, + "step": 55320 + }, + { + "epoch": 0.6915422885572139, + "grad_norm": 0.000554154918063432, + "learning_rate": 5.257844578088196e-06, + "loss": 0.0, + "step": 55322 + }, + { + "epoch": 0.6915672891822295, + "grad_norm": 2.8699026107788086, + "learning_rate": 5.257076273206114e-06, + "loss": 1.0618, + "step": 55324 + }, + { + "epoch": 0.6915922898072452, + "grad_norm": 2.5813517570495605, + "learning_rate": 5.256308004445466e-06, + "loss": 0.8739, + "step": 55326 + }, + { + "epoch": 0.6916172904322608, + "grad_norm": 4.1501641273498535, + "learning_rate": 5.255539771812108e-06, + "loss": 0.5874, + "step": 55328 + }, + { + "epoch": 0.6916422910572765, + "grad_norm": 6.706586837768555, + "learning_rate": 5.25477157531189e-06, + "loss": 0.9028, + "step": 55330 + }, + { + "epoch": 0.691667291682292, + "grad_norm": 0.9075060486793518, + "learning_rate": 5.2540034149506555e-06, + "loss": 0.1853, + "step": 55332 + }, + { + "epoch": 0.6916922923073077, + "grad_norm": 5.597611427307129, + "learning_rate": 5.253235290734264e-06, + "loss": 1.6896, + "step": 55334 + }, + { + "epoch": 0.6917172929323233, + "grad_norm": 0.23283492028713226, + "learning_rate": 5.2524672026685575e-06, + "loss": 0.6991, + "step": 55336 + }, + { + "epoch": 0.691742293557339, + "grad_norm": 4.676265239715576, + "learning_rate": 5.251699150759393e-06, + "loss": 1.2318, + "step": 55338 + }, + { + "epoch": 0.6917672941823546, + "grad_norm": 4.577881813049316, + "learning_rate": 5.2509311350126165e-06, + "loss": 0.6585, + "step": 55340 + }, + { + "epoch": 0.6917922948073701, + "grad_norm": 5.12650728225708, + "learning_rate": 5.2501631554340735e-06, + "loss": 1.722, + "step": 55342 + }, + { + "epoch": 0.6918172954323858, + "grad_norm": 2.632763147354126, + "learning_rate": 5.24939521202962e-06, + "loss": 1.2627, + "step": 55344 + }, + { + "epoch": 0.6918422960574014, + "grad_norm": 3.8128459453582764, + "learning_rate": 5.248627304805096e-06, + "loss": 0.3424, + "step": 55346 + }, + { + "epoch": 0.6918672966824171, + "grad_norm": 8.912308692932129, + "learning_rate": 5.247859433766358e-06, + "loss": 1.0895, + "step": 55348 + }, + { + "epoch": 0.6918922973074327, + "grad_norm": 5.991015911102295, + "learning_rate": 5.247091598919248e-06, + "loss": 1.1859, + "step": 55350 + }, + { + "epoch": 0.6919172979324483, + "grad_norm": 3.1873695850372314, + "learning_rate": 5.246323800269619e-06, + "loss": 1.2316, + "step": 55352 + }, + { + "epoch": 0.6919422985574639, + "grad_norm": 5.678859233856201, + "learning_rate": 5.2455560378233185e-06, + "loss": 1.8884, + "step": 55354 + }, + { + "epoch": 0.6919672991824796, + "grad_norm": 3.4159903526306152, + "learning_rate": 5.244788311586184e-06, + "loss": 1.5105, + "step": 55356 + }, + { + "epoch": 0.6919922998074952, + "grad_norm": 0.001974282553419471, + "learning_rate": 5.244020621564074e-06, + "loss": 0.7649, + "step": 55358 + }, + { + "epoch": 0.6920173004325109, + "grad_norm": 3.8373680114746094, + "learning_rate": 5.2432529677628264e-06, + "loss": 0.9051, + "step": 55360 + }, + { + "epoch": 0.6920423010575264, + "grad_norm": 4.77610969543457, + "learning_rate": 5.242485350188295e-06, + "loss": 1.0089, + "step": 55362 + }, + { + "epoch": 0.692067301682542, + "grad_norm": 2.3365349769592285, + "learning_rate": 5.241717768846319e-06, + "loss": 0.2702, + "step": 55364 + }, + { + "epoch": 0.6920923023075577, + "grad_norm": 3.3651444911956787, + "learning_rate": 5.240950223742752e-06, + "loss": 0.9272, + "step": 55366 + }, + { + "epoch": 0.6921173029325733, + "grad_norm": 6.5751237869262695, + "learning_rate": 5.240182714883435e-06, + "loss": 2.3171, + "step": 55368 + }, + { + "epoch": 0.692142303557589, + "grad_norm": 0.008489126339554787, + "learning_rate": 5.239415242274209e-06, + "loss": 0.7334, + "step": 55370 + }, + { + "epoch": 0.6921673041826045, + "grad_norm": 4.249578952789307, + "learning_rate": 5.238647805920928e-06, + "loss": 0.9312, + "step": 55372 + }, + { + "epoch": 0.6921923048076202, + "grad_norm": 2.6857309341430664, + "learning_rate": 5.237880405829428e-06, + "loss": 0.3528, + "step": 55374 + }, + { + "epoch": 0.6922173054326358, + "grad_norm": 3.8287744522094727, + "learning_rate": 5.237113042005559e-06, + "loss": 0.8161, + "step": 55376 + }, + { + "epoch": 0.6922423060576515, + "grad_norm": 2.923123598098755, + "learning_rate": 5.236345714455162e-06, + "loss": 1.3251, + "step": 55378 + }, + { + "epoch": 0.6922673066826671, + "grad_norm": 2.9768571853637695, + "learning_rate": 5.235578423184087e-06, + "loss": 1.5783, + "step": 55380 + }, + { + "epoch": 0.6922923073076827, + "grad_norm": 0.0019574628677219152, + "learning_rate": 5.23481116819817e-06, + "loss": 1.1796, + "step": 55382 + }, + { + "epoch": 0.6923173079326983, + "grad_norm": 4.598382472991943, + "learning_rate": 5.234043949503256e-06, + "loss": 1.7223, + "step": 55384 + }, + { + "epoch": 0.692342308557714, + "grad_norm": 9.164819717407227, + "learning_rate": 5.233276767105192e-06, + "loss": 2.7602, + "step": 55386 + }, + { + "epoch": 0.6923673091827296, + "grad_norm": 3.1906914710998535, + "learning_rate": 5.232509621009813e-06, + "loss": 0.9447, + "step": 55388 + }, + { + "epoch": 0.6923923098077452, + "grad_norm": 3.185211181640625, + "learning_rate": 5.231742511222972e-06, + "loss": 1.003, + "step": 55390 + }, + { + "epoch": 0.6924173104327608, + "grad_norm": 3.690178155899048, + "learning_rate": 5.230975437750507e-06, + "loss": 1.5118, + "step": 55392 + }, + { + "epoch": 0.6924423110577764, + "grad_norm": 4.097211837768555, + "learning_rate": 5.230208400598254e-06, + "loss": 1.0981, + "step": 55394 + }, + { + "epoch": 0.6924673116827921, + "grad_norm": 3.9389708042144775, + "learning_rate": 5.2294413997720614e-06, + "loss": 0.5502, + "step": 55396 + }, + { + "epoch": 0.6924923123078077, + "grad_norm": 1.817827820777893, + "learning_rate": 5.228674435277766e-06, + "loss": 0.7992, + "step": 55398 + }, + { + "epoch": 0.6925173129328234, + "grad_norm": 2.8967602252960205, + "learning_rate": 5.227907507121215e-06, + "loss": 0.5418, + "step": 55400 + }, + { + "epoch": 0.6925423135578389, + "grad_norm": 0.0019394145347177982, + "learning_rate": 5.227140615308241e-06, + "loss": 1.3779, + "step": 55402 + }, + { + "epoch": 0.6925673141828546, + "grad_norm": 3.117765426635742, + "learning_rate": 5.226373759844694e-06, + "loss": 0.7956, + "step": 55404 + }, + { + "epoch": 0.6925923148078702, + "grad_norm": 4.47227668762207, + "learning_rate": 5.225606940736409e-06, + "loss": 0.1593, + "step": 55406 + }, + { + "epoch": 0.6926173154328858, + "grad_norm": 1.736772060394287, + "learning_rate": 5.224840157989224e-06, + "loss": 0.5657, + "step": 55408 + }, + { + "epoch": 0.6926423160579015, + "grad_norm": 0.021631021052598953, + "learning_rate": 5.224073411608983e-06, + "loss": 1.0202, + "step": 55410 + }, + { + "epoch": 0.692667316682917, + "grad_norm": 2.742271661758423, + "learning_rate": 5.2233067016015205e-06, + "loss": 0.7151, + "step": 55412 + }, + { + "epoch": 0.6926923173079327, + "grad_norm": 4.135889530181885, + "learning_rate": 5.222540027972683e-06, + "loss": 1.6794, + "step": 55414 + }, + { + "epoch": 0.6927173179329483, + "grad_norm": 3.486440658569336, + "learning_rate": 5.2217733907283e-06, + "loss": 0.8939, + "step": 55416 + }, + { + "epoch": 0.692742318557964, + "grad_norm": 2.688302993774414, + "learning_rate": 5.221006789874219e-06, + "loss": 1.3905, + "step": 55418 + }, + { + "epoch": 0.6927673191829796, + "grad_norm": 2.856600761413574, + "learning_rate": 5.220240225416275e-06, + "loss": 0.8515, + "step": 55420 + }, + { + "epoch": 0.6927923198079952, + "grad_norm": 0.7439040541648865, + "learning_rate": 5.219473697360302e-06, + "loss": 0.2196, + "step": 55422 + }, + { + "epoch": 0.6928173204330108, + "grad_norm": 0.014756263233721256, + "learning_rate": 5.218707205712145e-06, + "loss": 0.5784, + "step": 55424 + }, + { + "epoch": 0.6928423210580265, + "grad_norm": 0.0008261574548669159, + "learning_rate": 5.217940750477633e-06, + "loss": 0.0, + "step": 55426 + }, + { + "epoch": 0.6928673216830421, + "grad_norm": 2.4676129817962646, + "learning_rate": 5.2171743316626135e-06, + "loss": 0.882, + "step": 55428 + }, + { + "epoch": 0.6928923223080578, + "grad_norm": 4.157198905944824, + "learning_rate": 5.216407949272913e-06, + "loss": 1.6464, + "step": 55430 + }, + { + "epoch": 0.6929173229330733, + "grad_norm": 2.8872573375701904, + "learning_rate": 5.215641603314376e-06, + "loss": 1.1182, + "step": 55432 + }, + { + "epoch": 0.6929423235580889, + "grad_norm": 3.868065357208252, + "learning_rate": 5.214875293792838e-06, + "loss": 0.2898, + "step": 55434 + }, + { + "epoch": 0.6929673241831046, + "grad_norm": 5.088186264038086, + "learning_rate": 5.214109020714129e-06, + "loss": 1.8909, + "step": 55436 + }, + { + "epoch": 0.6929923248081202, + "grad_norm": 0.0029686575289815664, + "learning_rate": 5.213342784084091e-06, + "loss": 0.0001, + "step": 55438 + }, + { + "epoch": 0.6930173254331359, + "grad_norm": 5.333212375640869, + "learning_rate": 5.212576583908555e-06, + "loss": 1.2353, + "step": 55440 + }, + { + "epoch": 0.6930423260581514, + "grad_norm": 2.1388211250305176, + "learning_rate": 5.2118104201933614e-06, + "loss": 1.4034, + "step": 55442 + }, + { + "epoch": 0.6930673266831671, + "grad_norm": 0.03474031016230583, + "learning_rate": 5.211044292944341e-06, + "loss": 0.4047, + "step": 55444 + }, + { + "epoch": 0.6930923273081827, + "grad_norm": 0.7709141373634338, + "learning_rate": 5.210278202167332e-06, + "loss": 0.1081, + "step": 55446 + }, + { + "epoch": 0.6931173279331984, + "grad_norm": 5.594468116760254, + "learning_rate": 5.209512147868167e-06, + "loss": 2.004, + "step": 55448 + }, + { + "epoch": 0.693142328558214, + "grad_norm": 1.944029688835144, + "learning_rate": 5.208746130052676e-06, + "loss": 0.0608, + "step": 55450 + }, + { + "epoch": 0.6931673291832295, + "grad_norm": 3.048137903213501, + "learning_rate": 5.207980148726702e-06, + "loss": 0.5344, + "step": 55452 + }, + { + "epoch": 0.6931923298082452, + "grad_norm": 4.162966728210449, + "learning_rate": 5.207214203896069e-06, + "loss": 1.6124, + "step": 55454 + }, + { + "epoch": 0.6932173304332608, + "grad_norm": 1.860154151916504, + "learning_rate": 5.2064482955666195e-06, + "loss": 0.44, + "step": 55456 + }, + { + "epoch": 0.6932423310582765, + "grad_norm": 4.474771976470947, + "learning_rate": 5.2056824237441806e-06, + "loss": 1.0586, + "step": 55458 + }, + { + "epoch": 0.6932673316832921, + "grad_norm": 2.1962382793426514, + "learning_rate": 5.204916588434583e-06, + "loss": 2.2235, + "step": 55460 + }, + { + "epoch": 0.6932923323083077, + "grad_norm": 4.180777549743652, + "learning_rate": 5.204150789643667e-06, + "loss": 1.5515, + "step": 55462 + }, + { + "epoch": 0.6933173329333233, + "grad_norm": 3.1548831462860107, + "learning_rate": 5.203385027377255e-06, + "loss": 1.3864, + "step": 55464 + }, + { + "epoch": 0.693342333558339, + "grad_norm": 6.319031715393066, + "learning_rate": 5.20261930164119e-06, + "loss": 0.6139, + "step": 55466 + }, + { + "epoch": 0.6933673341833546, + "grad_norm": 2.6432275772094727, + "learning_rate": 5.201853612441292e-06, + "loss": 0.5603, + "step": 55468 + }, + { + "epoch": 0.6933923348083703, + "grad_norm": 4.677997589111328, + "learning_rate": 5.201087959783404e-06, + "loss": 0.8191, + "step": 55470 + }, + { + "epoch": 0.6934173354333858, + "grad_norm": 0.7012310028076172, + "learning_rate": 5.200322343673352e-06, + "loss": 0.884, + "step": 55472 + }, + { + "epoch": 0.6934423360584014, + "grad_norm": 4.279788017272949, + "learning_rate": 5.1995567641169604e-06, + "loss": 1.6063, + "step": 55474 + }, + { + "epoch": 0.6934673366834171, + "grad_norm": 0.9355809688568115, + "learning_rate": 5.198791221120072e-06, + "loss": 0.7403, + "step": 55476 + }, + { + "epoch": 0.6934923373084327, + "grad_norm": 4.480103969573975, + "learning_rate": 5.198025714688505e-06, + "loss": 1.0954, + "step": 55478 + }, + { + "epoch": 0.6935173379334484, + "grad_norm": 4.983238697052002, + "learning_rate": 5.1972602448281e-06, + "loss": 1.4784, + "step": 55480 + }, + { + "epoch": 0.6935423385584639, + "grad_norm": 4.057696342468262, + "learning_rate": 5.196494811544677e-06, + "loss": 1.444, + "step": 55482 + }, + { + "epoch": 0.6935673391834796, + "grad_norm": 6.759614944458008, + "learning_rate": 5.195729414844074e-06, + "loss": 1.4665, + "step": 55484 + }, + { + "epoch": 0.6935923398084952, + "grad_norm": 4.227955341339111, + "learning_rate": 5.1949640547321165e-06, + "loss": 1.2451, + "step": 55486 + }, + { + "epoch": 0.6936173404335109, + "grad_norm": 2.418761730194092, + "learning_rate": 5.1941987312146304e-06, + "loss": 0.9606, + "step": 55488 + }, + { + "epoch": 0.6936423410585265, + "grad_norm": 2.169532537460327, + "learning_rate": 5.19343344429745e-06, + "loss": 0.4389, + "step": 55490 + }, + { + "epoch": 0.693667341683542, + "grad_norm": 2.672490119934082, + "learning_rate": 5.1926681939863965e-06, + "loss": 0.1641, + "step": 55492 + }, + { + "epoch": 0.6936923423085577, + "grad_norm": 0.0004940456128679216, + "learning_rate": 5.191902980287306e-06, + "loss": 0.0436, + "step": 55494 + }, + { + "epoch": 0.6937173429335733, + "grad_norm": 3.7718653678894043, + "learning_rate": 5.191137803206001e-06, + "loss": 1.2348, + "step": 55496 + }, + { + "epoch": 0.693742343558589, + "grad_norm": 3.6051218509674072, + "learning_rate": 5.190372662748313e-06, + "loss": 1.4962, + "step": 55498 + }, + { + "epoch": 0.6937673441836046, + "grad_norm": 2.06715989112854, + "learning_rate": 5.189607558920067e-06, + "loss": 0.4978, + "step": 55500 + }, + { + "epoch": 0.6937923448086202, + "grad_norm": 3.163282632827759, + "learning_rate": 5.188842491727084e-06, + "loss": 0.4261, + "step": 55502 + }, + { + "epoch": 0.6938173454336358, + "grad_norm": 12.771986961364746, + "learning_rate": 5.188077461175202e-06, + "loss": 1.928, + "step": 55504 + }, + { + "epoch": 0.6938423460586515, + "grad_norm": 1.2275891304016113, + "learning_rate": 5.187312467270236e-06, + "loss": 0.9415, + "step": 55506 + }, + { + "epoch": 0.6938673466836671, + "grad_norm": 4.310428142547607, + "learning_rate": 5.186547510018018e-06, + "loss": 0.9444, + "step": 55508 + }, + { + "epoch": 0.6938923473086828, + "grad_norm": 5.330392837524414, + "learning_rate": 5.185782589424383e-06, + "loss": 0.7587, + "step": 55510 + }, + { + "epoch": 0.6939173479336983, + "grad_norm": 5.317974090576172, + "learning_rate": 5.185017705495138e-06, + "loss": 1.0715, + "step": 55512 + }, + { + "epoch": 0.693942348558714, + "grad_norm": 2.975592613220215, + "learning_rate": 5.184252858236121e-06, + "loss": 0.6561, + "step": 55514 + }, + { + "epoch": 0.6939673491837296, + "grad_norm": 3.511106252670288, + "learning_rate": 5.183488047653149e-06, + "loss": 0.7468, + "step": 55516 + }, + { + "epoch": 0.6939923498087452, + "grad_norm": 3.6730456352233887, + "learning_rate": 5.182723273752056e-06, + "loss": 0.9491, + "step": 55518 + }, + { + "epoch": 0.6940173504337609, + "grad_norm": 0.004820779897272587, + "learning_rate": 5.181958536538657e-06, + "loss": 0.0003, + "step": 55520 + }, + { + "epoch": 0.6940423510587764, + "grad_norm": 3.4980082511901855, + "learning_rate": 5.181193836018781e-06, + "loss": 0.5087, + "step": 55522 + }, + { + "epoch": 0.6940673516837921, + "grad_norm": 5.83635950088501, + "learning_rate": 5.180429172198258e-06, + "loss": 0.8877, + "step": 55524 + }, + { + "epoch": 0.6940923523088077, + "grad_norm": 4.846188545227051, + "learning_rate": 5.179664545082897e-06, + "loss": 2.0212, + "step": 55526 + }, + { + "epoch": 0.6941173529338234, + "grad_norm": 8.635603904724121, + "learning_rate": 5.178899954678534e-06, + "loss": 0.7713, + "step": 55528 + }, + { + "epoch": 0.694142353558839, + "grad_norm": 2.3053181171417236, + "learning_rate": 5.178135400990982e-06, + "loss": 0.2854, + "step": 55530 + }, + { + "epoch": 0.6941673541838546, + "grad_norm": 0.0005429266602732241, + "learning_rate": 5.177370884026074e-06, + "loss": 0.8982, + "step": 55532 + }, + { + "epoch": 0.6941923548088702, + "grad_norm": 0.6634799242019653, + "learning_rate": 5.176606403789624e-06, + "loss": 0.3701, + "step": 55534 + }, + { + "epoch": 0.6942173554338859, + "grad_norm": 5.210912704467773, + "learning_rate": 5.1758419602874595e-06, + "loss": 1.8765, + "step": 55536 + }, + { + "epoch": 0.6942423560589015, + "grad_norm": 3.6235551834106445, + "learning_rate": 5.175077553525402e-06, + "loss": 1.4878, + "step": 55538 + }, + { + "epoch": 0.6942673566839171, + "grad_norm": 3.255890130996704, + "learning_rate": 5.1743131835092665e-06, + "loss": 0.5733, + "step": 55540 + }, + { + "epoch": 0.6942923573089327, + "grad_norm": 0.0027697905898094177, + "learning_rate": 5.173548850244884e-06, + "loss": 0.8982, + "step": 55542 + }, + { + "epoch": 0.6943173579339483, + "grad_norm": 1.3313181400299072, + "learning_rate": 5.172784553738067e-06, + "loss": 0.0588, + "step": 55544 + }, + { + "epoch": 0.694342358558964, + "grad_norm": 0.6892978549003601, + "learning_rate": 5.17202029399464e-06, + "loss": 0.4686, + "step": 55546 + }, + { + "epoch": 0.6943673591839796, + "grad_norm": 4.488131523132324, + "learning_rate": 5.171256071020427e-06, + "loss": 1.7444, + "step": 55548 + }, + { + "epoch": 0.6943923598089953, + "grad_norm": 2.4334301948547363, + "learning_rate": 5.170491884821247e-06, + "loss": 1.1084, + "step": 55550 + }, + { + "epoch": 0.6944173604340108, + "grad_norm": 4.3918046951293945, + "learning_rate": 5.169727735402916e-06, + "loss": 1.308, + "step": 55552 + }, + { + "epoch": 0.6944423610590265, + "grad_norm": 1.4679789543151855, + "learning_rate": 5.168963622771253e-06, + "loss": 0.5143, + "step": 55554 + }, + { + "epoch": 0.6944673616840421, + "grad_norm": 4.147263526916504, + "learning_rate": 5.1681995469320824e-06, + "loss": 1.0952, + "step": 55556 + }, + { + "epoch": 0.6944923623090578, + "grad_norm": 4.465320587158203, + "learning_rate": 5.167435507891217e-06, + "loss": 1.4422, + "step": 55558 + }, + { + "epoch": 0.6945173629340734, + "grad_norm": 2.907538414001465, + "learning_rate": 5.166671505654479e-06, + "loss": 1.3461, + "step": 55560 + }, + { + "epoch": 0.6945423635590889, + "grad_norm": 1.2769412994384766, + "learning_rate": 5.165907540227697e-06, + "loss": 1.0081, + "step": 55562 + }, + { + "epoch": 0.6945673641841046, + "grad_norm": 2.373424768447876, + "learning_rate": 5.1651436116166694e-06, + "loss": 0.4522, + "step": 55564 + }, + { + "epoch": 0.6945923648091202, + "grad_norm": 0.002377683762460947, + "learning_rate": 5.164379719827231e-06, + "loss": 0.0225, + "step": 55566 + }, + { + "epoch": 0.6946173654341359, + "grad_norm": 1.6985210180282593, + "learning_rate": 5.163615864865186e-06, + "loss": 0.68, + "step": 55568 + }, + { + "epoch": 0.6946423660591515, + "grad_norm": 2.5819039344787598, + "learning_rate": 5.16285204673636e-06, + "loss": 1.7657, + "step": 55570 + }, + { + "epoch": 0.6946673666841671, + "grad_norm": 8.179912567138672, + "learning_rate": 5.162088265446573e-06, + "loss": 1.4412, + "step": 55572 + }, + { + "epoch": 0.6946923673091827, + "grad_norm": 2.5174009799957275, + "learning_rate": 5.1613245210016334e-06, + "loss": 0.5421, + "step": 55574 + }, + { + "epoch": 0.6947173679341984, + "grad_norm": 3.1487843990325928, + "learning_rate": 5.160560813407369e-06, + "loss": 0.7716, + "step": 55576 + }, + { + "epoch": 0.694742368559214, + "grad_norm": 5.748461723327637, + "learning_rate": 5.159797142669582e-06, + "loss": 1.8655, + "step": 55578 + }, + { + "epoch": 0.6947673691842297, + "grad_norm": 6.445054531097412, + "learning_rate": 5.1590335087941e-06, + "loss": 1.3019, + "step": 55580 + }, + { + "epoch": 0.6947923698092452, + "grad_norm": 0.14296284317970276, + "learning_rate": 5.158269911786731e-06, + "loss": 0.0017, + "step": 55582 + }, + { + "epoch": 0.6948173704342608, + "grad_norm": 5.538046836853027, + "learning_rate": 5.157506351653292e-06, + "loss": 2.3661, + "step": 55584 + }, + { + "epoch": 0.6948423710592765, + "grad_norm": 1.0267268419265747, + "learning_rate": 5.156742828399605e-06, + "loss": 0.4076, + "step": 55586 + }, + { + "epoch": 0.6948673716842921, + "grad_norm": 0.0021654001902788877, + "learning_rate": 5.155979342031478e-06, + "loss": 0.0022, + "step": 55588 + }, + { + "epoch": 0.6948923723093078, + "grad_norm": 3.4785683155059814, + "learning_rate": 5.155215892554728e-06, + "loss": 1.0586, + "step": 55590 + }, + { + "epoch": 0.6949173729343233, + "grad_norm": 0.7512218952178955, + "learning_rate": 5.154452479975165e-06, + "loss": 0.0431, + "step": 55592 + }, + { + "epoch": 0.694942373559339, + "grad_norm": 2.890662431716919, + "learning_rate": 5.153689104298611e-06, + "loss": 1.3959, + "step": 55594 + }, + { + "epoch": 0.6949673741843546, + "grad_norm": 0.0008215022389777005, + "learning_rate": 5.1529257655308716e-06, + "loss": 0.5366, + "step": 55596 + }, + { + "epoch": 0.6949923748093703, + "grad_norm": 1.8762249946594238, + "learning_rate": 5.152162463677763e-06, + "loss": 0.6463, + "step": 55598 + }, + { + "epoch": 0.6950173754343859, + "grad_norm": 2.8591651916503906, + "learning_rate": 5.151399198745104e-06, + "loss": 0.5402, + "step": 55600 + }, + { + "epoch": 0.6950423760594014, + "grad_norm": 0.0013781001325696707, + "learning_rate": 5.150635970738704e-06, + "loss": 0.0572, + "step": 55602 + }, + { + "epoch": 0.6950673766844171, + "grad_norm": 3.314831256866455, + "learning_rate": 5.149872779664373e-06, + "loss": 0.7582, + "step": 55604 + }, + { + "epoch": 0.6950923773094327, + "grad_norm": 5.45521879196167, + "learning_rate": 5.149109625527923e-06, + "loss": 1.5279, + "step": 55606 + }, + { + "epoch": 0.6951173779344484, + "grad_norm": 1.2241482734680176, + "learning_rate": 5.148346508335167e-06, + "loss": 0.0436, + "step": 55608 + }, + { + "epoch": 0.695142378559464, + "grad_norm": 3.1010971069335938, + "learning_rate": 5.147583428091923e-06, + "loss": 0.2534, + "step": 55610 + }, + { + "epoch": 0.6951673791844796, + "grad_norm": 0.2295924872159958, + "learning_rate": 5.1468203848039925e-06, + "loss": 0.5236, + "step": 55612 + }, + { + "epoch": 0.6951923798094952, + "grad_norm": 6.441915512084961, + "learning_rate": 5.146057378477198e-06, + "loss": 1.0879, + "step": 55614 + }, + { + "epoch": 0.6952173804345109, + "grad_norm": 3.592729330062866, + "learning_rate": 5.145294409117339e-06, + "loss": 0.5446, + "step": 55616 + }, + { + "epoch": 0.6952423810595265, + "grad_norm": 0.33541345596313477, + "learning_rate": 5.144531476730232e-06, + "loss": 0.0096, + "step": 55618 + }, + { + "epoch": 0.6952673816845422, + "grad_norm": 6.414144992828369, + "learning_rate": 5.143768581321685e-06, + "loss": 1.016, + "step": 55620 + }, + { + "epoch": 0.6952923823095577, + "grad_norm": 8.081649780273438, + "learning_rate": 5.143005722897509e-06, + "loss": 1.3553, + "step": 55622 + }, + { + "epoch": 0.6953173829345733, + "grad_norm": 0.9467530846595764, + "learning_rate": 5.142242901463519e-06, + "loss": 0.4443, + "step": 55624 + }, + { + "epoch": 0.695342383559589, + "grad_norm": 0.004976713564246893, + "learning_rate": 5.1414801170255165e-06, + "loss": 0.1346, + "step": 55626 + }, + { + "epoch": 0.6953673841846046, + "grad_norm": 0.13645128905773163, + "learning_rate": 5.140717369589322e-06, + "loss": 1.2861, + "step": 55628 + }, + { + "epoch": 0.6953923848096203, + "grad_norm": 2.2014544010162354, + "learning_rate": 5.139954659160729e-06, + "loss": 0.5783, + "step": 55630 + }, + { + "epoch": 0.6954173854346358, + "grad_norm": 5.4100165367126465, + "learning_rate": 5.139191985745553e-06, + "loss": 0.4605, + "step": 55632 + }, + { + "epoch": 0.6954423860596515, + "grad_norm": 2.3322949409484863, + "learning_rate": 5.13842934934961e-06, + "loss": 1.1148, + "step": 55634 + }, + { + "epoch": 0.6954673866846671, + "grad_norm": 3.172675609588623, + "learning_rate": 5.137666749978694e-06, + "loss": 0.5158, + "step": 55636 + }, + { + "epoch": 0.6954923873096828, + "grad_norm": 8.16730785369873, + "learning_rate": 5.136904187638627e-06, + "loss": 0.7927, + "step": 55638 + }, + { + "epoch": 0.6955173879346984, + "grad_norm": 4.833345890045166, + "learning_rate": 5.136141662335209e-06, + "loss": 1.8628, + "step": 55640 + }, + { + "epoch": 0.695542388559714, + "grad_norm": 3.8143117427825928, + "learning_rate": 5.135379174074248e-06, + "loss": 1.235, + "step": 55642 + }, + { + "epoch": 0.6955673891847296, + "grad_norm": 6.064215183258057, + "learning_rate": 5.134616722861549e-06, + "loss": 1.2358, + "step": 55644 + }, + { + "epoch": 0.6955923898097452, + "grad_norm": 2.9487195014953613, + "learning_rate": 5.133854308702919e-06, + "loss": 1.3622, + "step": 55646 + }, + { + "epoch": 0.6956173904347609, + "grad_norm": 3.3476743698120117, + "learning_rate": 5.133091931604171e-06, + "loss": 2.1801, + "step": 55648 + }, + { + "epoch": 0.6956423910597765, + "grad_norm": 0.0010189410531893373, + "learning_rate": 5.132329591571104e-06, + "loss": 0.0, + "step": 55650 + }, + { + "epoch": 0.6956673916847921, + "grad_norm": 0.0007753691752441227, + "learning_rate": 5.131567288609529e-06, + "loss": 0.0018, + "step": 55652 + }, + { + "epoch": 0.6956923923098077, + "grad_norm": 5.684034824371338, + "learning_rate": 5.130805022725249e-06, + "loss": 0.9469, + "step": 55654 + }, + { + "epoch": 0.6957173929348234, + "grad_norm": 3.0590271949768066, + "learning_rate": 5.13004279392407e-06, + "loss": 0.4744, + "step": 55656 + }, + { + "epoch": 0.695742393559839, + "grad_norm": 1.723239541053772, + "learning_rate": 5.129280602211793e-06, + "loss": 0.697, + "step": 55658 + }, + { + "epoch": 0.6957673941848547, + "grad_norm": 1.2338674068450928, + "learning_rate": 5.128518447594224e-06, + "loss": 0.6128, + "step": 55660 + }, + { + "epoch": 0.6957923948098702, + "grad_norm": 0.0008277948363684118, + "learning_rate": 5.127756330077176e-06, + "loss": 0.8058, + "step": 55662 + }, + { + "epoch": 0.6958173954348859, + "grad_norm": 2.7065937519073486, + "learning_rate": 5.126994249666441e-06, + "loss": 0.8783, + "step": 55664 + }, + { + "epoch": 0.6958423960599015, + "grad_norm": 0.0005595933180302382, + "learning_rate": 5.126232206367838e-06, + "loss": 0.0632, + "step": 55666 + }, + { + "epoch": 0.6958673966849171, + "grad_norm": 1.7576026916503906, + "learning_rate": 5.1254702001871526e-06, + "loss": 0.5982, + "step": 55668 + }, + { + "epoch": 0.6958923973099328, + "grad_norm": 1.1881238222122192, + "learning_rate": 5.124708231130198e-06, + "loss": 0.5545, + "step": 55670 + }, + { + "epoch": 0.6959173979349483, + "grad_norm": 1.9958360195159912, + "learning_rate": 5.123946299202778e-06, + "loss": 0.3239, + "step": 55672 + }, + { + "epoch": 0.695942398559964, + "grad_norm": 2.3398962020874023, + "learning_rate": 5.123184404410691e-06, + "loss": 0.207, + "step": 55674 + }, + { + "epoch": 0.6959673991849796, + "grad_norm": 0.19307322800159454, + "learning_rate": 5.122422546759744e-06, + "loss": 0.3663, + "step": 55676 + }, + { + "epoch": 0.6959923998099953, + "grad_norm": 2.1992099285125732, + "learning_rate": 5.121660726255736e-06, + "loss": 1.6542, + "step": 55678 + }, + { + "epoch": 0.6960174004350109, + "grad_norm": 4.084205627441406, + "learning_rate": 5.120898942904476e-06, + "loss": 0.6457, + "step": 55680 + }, + { + "epoch": 0.6960424010600265, + "grad_norm": 2.3340675830841064, + "learning_rate": 5.120137196711753e-06, + "loss": 1.0098, + "step": 55682 + }, + { + "epoch": 0.6960674016850421, + "grad_norm": 2.8511972427368164, + "learning_rate": 5.119375487683374e-06, + "loss": 1.0984, + "step": 55684 + }, + { + "epoch": 0.6960924023100578, + "grad_norm": 10.671445846557617, + "learning_rate": 5.118613815825147e-06, + "loss": 0.7709, + "step": 55686 + }, + { + "epoch": 0.6961174029350734, + "grad_norm": 4.77312707901001, + "learning_rate": 5.1178521811428595e-06, + "loss": 1.1246, + "step": 55688 + }, + { + "epoch": 0.696142403560089, + "grad_norm": 0.25383761525154114, + "learning_rate": 5.117090583642325e-06, + "loss": 0.628, + "step": 55690 + }, + { + "epoch": 0.6961674041851046, + "grad_norm": 1.0657657384872437, + "learning_rate": 5.116329023329339e-06, + "loss": 0.5004, + "step": 55692 + }, + { + "epoch": 0.6961924048101202, + "grad_norm": 2.0402610301971436, + "learning_rate": 5.115567500209696e-06, + "loss": 1.0633, + "step": 55694 + }, + { + "epoch": 0.6962174054351359, + "grad_norm": 1.7397871017456055, + "learning_rate": 5.114806014289204e-06, + "loss": 0.8793, + "step": 55696 + }, + { + "epoch": 0.6962424060601515, + "grad_norm": 4.445596218109131, + "learning_rate": 5.114044565573655e-06, + "loss": 0.8804, + "step": 55698 + }, + { + "epoch": 0.6962674066851672, + "grad_norm": 2.3159003257751465, + "learning_rate": 5.113283154068856e-06, + "loss": 1.1253, + "step": 55700 + }, + { + "epoch": 0.6962924073101827, + "grad_norm": 2.9984865188598633, + "learning_rate": 5.112521779780596e-06, + "loss": 0.6382, + "step": 55702 + }, + { + "epoch": 0.6963174079351984, + "grad_norm": 0.04298878088593483, + "learning_rate": 5.111760442714684e-06, + "loss": 0.0186, + "step": 55704 + }, + { + "epoch": 0.696342408560214, + "grad_norm": 5.557373046875, + "learning_rate": 5.110999142876912e-06, + "loss": 0.9349, + "step": 55706 + }, + { + "epoch": 0.6963674091852297, + "grad_norm": 3.6698808670043945, + "learning_rate": 5.110237880273077e-06, + "loss": 0.669, + "step": 55708 + }, + { + "epoch": 0.6963924098102453, + "grad_norm": 0.708897590637207, + "learning_rate": 5.109476654908983e-06, + "loss": 0.7424, + "step": 55710 + }, + { + "epoch": 0.6964174104352608, + "grad_norm": 3.48769211769104, + "learning_rate": 5.108715466790417e-06, + "loss": 0.8869, + "step": 55712 + }, + { + "epoch": 0.6964424110602765, + "grad_norm": 2.777211904525757, + "learning_rate": 5.107954315923187e-06, + "loss": 0.6089, + "step": 55714 + }, + { + "epoch": 0.6964674116852921, + "grad_norm": 2.8174259662628174, + "learning_rate": 5.1071932023130825e-06, + "loss": 0.397, + "step": 55716 + }, + { + "epoch": 0.6964924123103078, + "grad_norm": 3.115734100341797, + "learning_rate": 5.106432125965906e-06, + "loss": 0.2803, + "step": 55718 + }, + { + "epoch": 0.6965174129353234, + "grad_norm": 3.8921279907226562, + "learning_rate": 5.1056710868874496e-06, + "loss": 1.3015, + "step": 55720 + }, + { + "epoch": 0.696542413560339, + "grad_norm": 2.719163417816162, + "learning_rate": 5.1049100850835065e-06, + "loss": 0.9819, + "step": 55722 + }, + { + "epoch": 0.6965674141853546, + "grad_norm": 4.6509785652160645, + "learning_rate": 5.104149120559881e-06, + "loss": 2.856, + "step": 55724 + }, + { + "epoch": 0.6965924148103703, + "grad_norm": 3.0184173583984375, + "learning_rate": 5.103388193322359e-06, + "loss": 0.6493, + "step": 55726 + }, + { + "epoch": 0.6966174154353859, + "grad_norm": 3.1047720909118652, + "learning_rate": 5.102627303376743e-06, + "loss": 1.426, + "step": 55728 + }, + { + "epoch": 0.6966424160604016, + "grad_norm": 4.4398908615112305, + "learning_rate": 5.1018664507288205e-06, + "loss": 0.8461, + "step": 55730 + }, + { + "epoch": 0.6966674166854171, + "grad_norm": 1.2470637559890747, + "learning_rate": 5.101105635384396e-06, + "loss": 1.2397, + "step": 55732 + }, + { + "epoch": 0.6966924173104327, + "grad_norm": 3.633408784866333, + "learning_rate": 5.100344857349258e-06, + "loss": 0.8893, + "step": 55734 + }, + { + "epoch": 0.6967174179354484, + "grad_norm": 4.016756057739258, + "learning_rate": 5.099584116629197e-06, + "loss": 0.7058, + "step": 55736 + }, + { + "epoch": 0.696742418560464, + "grad_norm": 2.8500382900238037, + "learning_rate": 5.098823413230013e-06, + "loss": 0.836, + "step": 55738 + }, + { + "epoch": 0.6967674191854797, + "grad_norm": 2.5893216133117676, + "learning_rate": 5.098062747157494e-06, + "loss": 1.1488, + "step": 55740 + }, + { + "epoch": 0.6967924198104952, + "grad_norm": 4.327065944671631, + "learning_rate": 5.09730211841744e-06, + "loss": 1.4111, + "step": 55742 + }, + { + "epoch": 0.6968174204355109, + "grad_norm": 5.648623466491699, + "learning_rate": 5.096541527015635e-06, + "loss": 0.7704, + "step": 55744 + }, + { + "epoch": 0.6968424210605265, + "grad_norm": 2.3158366680145264, + "learning_rate": 5.09578097295788e-06, + "loss": 0.2059, + "step": 55746 + }, + { + "epoch": 0.6968674216855422, + "grad_norm": 2.5647199153900146, + "learning_rate": 5.095020456249964e-06, + "loss": 1.6604, + "step": 55748 + }, + { + "epoch": 0.6968924223105578, + "grad_norm": 3.6831541061401367, + "learning_rate": 5.0942599768976755e-06, + "loss": 0.3168, + "step": 55750 + }, + { + "epoch": 0.6969174229355733, + "grad_norm": 3.1897523403167725, + "learning_rate": 5.093499534906812e-06, + "loss": 1.2203, + "step": 55752 + }, + { + "epoch": 0.696942423560589, + "grad_norm": 1.3629745244979858, + "learning_rate": 5.0927391302831595e-06, + "loss": 0.7807, + "step": 55754 + }, + { + "epoch": 0.6969674241856046, + "grad_norm": 2.7374303340911865, + "learning_rate": 5.091978763032515e-06, + "loss": 0.5273, + "step": 55756 + }, + { + "epoch": 0.6969924248106203, + "grad_norm": 4.325728893280029, + "learning_rate": 5.091218433160665e-06, + "loss": 0.612, + "step": 55758 + }, + { + "epoch": 0.6970174254356359, + "grad_norm": 3.4471211433410645, + "learning_rate": 5.0904581406734e-06, + "loss": 0.8985, + "step": 55760 + }, + { + "epoch": 0.6970424260606515, + "grad_norm": 10.485095024108887, + "learning_rate": 5.089697885576513e-06, + "loss": 0.7601, + "step": 55762 + }, + { + "epoch": 0.6970674266856671, + "grad_norm": 3.980051279067993, + "learning_rate": 5.088937667875791e-06, + "loss": 0.8349, + "step": 55764 + }, + { + "epoch": 0.6970924273106828, + "grad_norm": 4.051762580871582, + "learning_rate": 5.088177487577026e-06, + "loss": 1.3253, + "step": 55766 + }, + { + "epoch": 0.6971174279356984, + "grad_norm": 5.531460762023926, + "learning_rate": 5.087417344686005e-06, + "loss": 1.9897, + "step": 55768 + }, + { + "epoch": 0.6971424285607141, + "grad_norm": 6.084460258483887, + "learning_rate": 5.086657239208524e-06, + "loss": 1.0623, + "step": 55770 + }, + { + "epoch": 0.6971674291857296, + "grad_norm": 0.0017850331496447325, + "learning_rate": 5.0858971711503635e-06, + "loss": 0.2671, + "step": 55772 + }, + { + "epoch": 0.6971924298107453, + "grad_norm": 2.5676214694976807, + "learning_rate": 5.085137140517313e-06, + "loss": 0.4378, + "step": 55774 + }, + { + "epoch": 0.6972174304357609, + "grad_norm": 4.120687961578369, + "learning_rate": 5.084377147315167e-06, + "loss": 0.3531, + "step": 55776 + }, + { + "epoch": 0.6972424310607765, + "grad_norm": 3.793792486190796, + "learning_rate": 5.083617191549704e-06, + "loss": 1.5359, + "step": 55778 + }, + { + "epoch": 0.6972674316857922, + "grad_norm": 5.212194442749023, + "learning_rate": 5.082857273226723e-06, + "loss": 1.1355, + "step": 55780 + }, + { + "epoch": 0.6972924323108077, + "grad_norm": 2.706482410430908, + "learning_rate": 5.082097392352001e-06, + "loss": 0.1411, + "step": 55782 + }, + { + "epoch": 0.6973174329358234, + "grad_norm": 0.3112867474555969, + "learning_rate": 5.081337548931334e-06, + "loss": 0.4647, + "step": 55784 + }, + { + "epoch": 0.697342433560839, + "grad_norm": 7.439986705780029, + "learning_rate": 5.080577742970505e-06, + "loss": 1.6275, + "step": 55786 + }, + { + "epoch": 0.6973674341858547, + "grad_norm": 0.6240730285644531, + "learning_rate": 5.079817974475296e-06, + "loss": 0.59, + "step": 55788 + }, + { + "epoch": 0.6973924348108703, + "grad_norm": 2.3554797172546387, + "learning_rate": 5.0790582434515e-06, + "loss": 0.773, + "step": 55790 + }, + { + "epoch": 0.6974174354358859, + "grad_norm": 2.857170343399048, + "learning_rate": 5.078298549904899e-06, + "loss": 0.2237, + "step": 55792 + }, + { + "epoch": 0.6974424360609015, + "grad_norm": 4.449275970458984, + "learning_rate": 5.077538893841283e-06, + "loss": 1.7868, + "step": 55794 + }, + { + "epoch": 0.6974674366859172, + "grad_norm": 0.0018548067891970277, + "learning_rate": 5.0767792752664305e-06, + "loss": 0.0001, + "step": 55796 + }, + { + "epoch": 0.6974924373109328, + "grad_norm": 3.108489513397217, + "learning_rate": 5.076019694186136e-06, + "loss": 1.2978, + "step": 55798 + }, + { + "epoch": 0.6975174379359484, + "grad_norm": 4.138986587524414, + "learning_rate": 5.075260150606178e-06, + "loss": 1.584, + "step": 55800 + }, + { + "epoch": 0.697542438560964, + "grad_norm": 4.642518997192383, + "learning_rate": 5.074500644532339e-06, + "loss": 0.8178, + "step": 55802 + }, + { + "epoch": 0.6975674391859796, + "grad_norm": 3.0534512996673584, + "learning_rate": 5.073741175970409e-06, + "loss": 1.4083, + "step": 55804 + }, + { + "epoch": 0.6975924398109953, + "grad_norm": 4.203403472900391, + "learning_rate": 5.072981744926166e-06, + "loss": 0.4778, + "step": 55806 + }, + { + "epoch": 0.6976174404360109, + "grad_norm": 4.747321128845215, + "learning_rate": 5.0722223514054024e-06, + "loss": 1.7187, + "step": 55808 + }, + { + "epoch": 0.6976424410610266, + "grad_norm": 10.253571510314941, + "learning_rate": 5.071462995413896e-06, + "loss": 1.0237, + "step": 55810 + }, + { + "epoch": 0.6976674416860421, + "grad_norm": 3.9403395652770996, + "learning_rate": 5.0707036769574265e-06, + "loss": 0.7843, + "step": 55812 + }, + { + "epoch": 0.6976924423110578, + "grad_norm": 4.679047107696533, + "learning_rate": 5.069944396041786e-06, + "loss": 0.4747, + "step": 55814 + }, + { + "epoch": 0.6977174429360734, + "grad_norm": 4.389347553253174, + "learning_rate": 5.069185152672747e-06, + "loss": 1.438, + "step": 55816 + }, + { + "epoch": 0.697742443561089, + "grad_norm": 3.28460693359375, + "learning_rate": 5.068425946856099e-06, + "loss": 0.2825, + "step": 55818 + }, + { + "epoch": 0.6977674441861047, + "grad_norm": 15.774767875671387, + "learning_rate": 5.06766677859762e-06, + "loss": 1.67, + "step": 55820 + }, + { + "epoch": 0.6977924448111202, + "grad_norm": 4.1756391525268555, + "learning_rate": 5.066907647903096e-06, + "loss": 1.3184, + "step": 55822 + }, + { + "epoch": 0.6978174454361359, + "grad_norm": 2.13149094581604, + "learning_rate": 5.066148554778305e-06, + "loss": 0.1519, + "step": 55824 + }, + { + "epoch": 0.6978424460611515, + "grad_norm": 7.553281307220459, + "learning_rate": 5.065389499229027e-06, + "loss": 0.9008, + "step": 55826 + }, + { + "epoch": 0.6978674466861672, + "grad_norm": 0.007980876602232456, + "learning_rate": 5.064630481261047e-06, + "loss": 0.328, + "step": 55828 + }, + { + "epoch": 0.6978924473111828, + "grad_norm": 2.386552095413208, + "learning_rate": 5.0638715008801395e-06, + "loss": 0.5263, + "step": 55830 + }, + { + "epoch": 0.6979174479361984, + "grad_norm": 1.1892166137695312, + "learning_rate": 5.0631125580920935e-06, + "loss": 0.5766, + "step": 55832 + }, + { + "epoch": 0.697942448561214, + "grad_norm": 0.004928316920995712, + "learning_rate": 5.06235365290268e-06, + "loss": 0.0001, + "step": 55834 + }, + { + "epoch": 0.6979674491862297, + "grad_norm": 2.348721504211426, + "learning_rate": 5.0615947853176865e-06, + "loss": 1.2257, + "step": 55836 + }, + { + "epoch": 0.6979924498112453, + "grad_norm": 6.397147178649902, + "learning_rate": 5.060835955342889e-06, + "loss": 1.1604, + "step": 55838 + }, + { + "epoch": 0.698017450436261, + "grad_norm": 4.438618183135986, + "learning_rate": 5.060077162984062e-06, + "loss": 1.2629, + "step": 55840 + }, + { + "epoch": 0.6980424510612765, + "grad_norm": 1.6001770496368408, + "learning_rate": 5.059318408246993e-06, + "loss": 0.7223, + "step": 55842 + }, + { + "epoch": 0.6980674516862921, + "grad_norm": 2.5209949016571045, + "learning_rate": 5.058559691137452e-06, + "loss": 0.6253, + "step": 55844 + }, + { + "epoch": 0.6980924523113078, + "grad_norm": 4.5070624351501465, + "learning_rate": 5.057801011661225e-06, + "loss": 0.6036, + "step": 55846 + }, + { + "epoch": 0.6981174529363234, + "grad_norm": 3.492013931274414, + "learning_rate": 5.057042369824083e-06, + "loss": 1.0835, + "step": 55848 + }, + { + "epoch": 0.6981424535613391, + "grad_norm": 0.9614786505699158, + "learning_rate": 5.056283765631813e-06, + "loss": 0.5355, + "step": 55850 + }, + { + "epoch": 0.6981674541863546, + "grad_norm": 4.516911029815674, + "learning_rate": 5.055525199090186e-06, + "loss": 2.0806, + "step": 55852 + }, + { + "epoch": 0.6981924548113703, + "grad_norm": 2.25539493560791, + "learning_rate": 5.054766670204975e-06, + "loss": 0.7098, + "step": 55854 + }, + { + "epoch": 0.6982174554363859, + "grad_norm": 0.0013331702211871743, + "learning_rate": 5.054008178981966e-06, + "loss": 0.0, + "step": 55856 + }, + { + "epoch": 0.6982424560614016, + "grad_norm": 0.0008555043605156243, + "learning_rate": 5.0532497254269275e-06, + "loss": 0.0, + "step": 55858 + }, + { + "epoch": 0.6982674566864172, + "grad_norm": 2.298590898513794, + "learning_rate": 5.0524913095456426e-06, + "loss": 1.0813, + "step": 55860 + }, + { + "epoch": 0.6982924573114327, + "grad_norm": 2.3963985443115234, + "learning_rate": 5.051732931343885e-06, + "loss": 1.7207, + "step": 55862 + }, + { + "epoch": 0.6983174579364484, + "grad_norm": 2.8071587085723877, + "learning_rate": 5.050974590827427e-06, + "loss": 1.3607, + "step": 55864 + }, + { + "epoch": 0.698342458561464, + "grad_norm": 2.734498977661133, + "learning_rate": 5.0502162880020474e-06, + "loss": 0.5634, + "step": 55866 + }, + { + "epoch": 0.6983674591864797, + "grad_norm": 7.712624549865723, + "learning_rate": 5.049458022873519e-06, + "loss": 1.7248, + "step": 55868 + }, + { + "epoch": 0.6983924598114953, + "grad_norm": 4.112741470336914, + "learning_rate": 5.048699795447623e-06, + "loss": 1.2817, + "step": 55870 + }, + { + "epoch": 0.6984174604365109, + "grad_norm": 3.9565980434417725, + "learning_rate": 5.047941605730123e-06, + "loss": 1.275, + "step": 55872 + }, + { + "epoch": 0.6984424610615265, + "grad_norm": 0.005992561113089323, + "learning_rate": 5.047183453726805e-06, + "loss": 0.1694, + "step": 55874 + }, + { + "epoch": 0.6984674616865422, + "grad_norm": 4.311939239501953, + "learning_rate": 5.046425339443437e-06, + "loss": 1.682, + "step": 55876 + }, + { + "epoch": 0.6984924623115578, + "grad_norm": 11.262730598449707, + "learning_rate": 5.04566726288579e-06, + "loss": 0.5426, + "step": 55878 + }, + { + "epoch": 0.6985174629365735, + "grad_norm": 6.248051166534424, + "learning_rate": 5.044909224059642e-06, + "loss": 0.7574, + "step": 55880 + }, + { + "epoch": 0.698542463561589, + "grad_norm": 2.981048583984375, + "learning_rate": 5.044151222970763e-06, + "loss": 0.8689, + "step": 55882 + }, + { + "epoch": 0.6985674641866046, + "grad_norm": 0.003725230460986495, + "learning_rate": 5.043393259624929e-06, + "loss": 0.2575, + "step": 55884 + }, + { + "epoch": 0.6985924648116203, + "grad_norm": 2.3173320293426514, + "learning_rate": 5.04263533402791e-06, + "loss": 0.5763, + "step": 55886 + }, + { + "epoch": 0.6986174654366359, + "grad_norm": 1.9102177619934082, + "learning_rate": 5.041877446185481e-06, + "loss": 0.5889, + "step": 55888 + }, + { + "epoch": 0.6986424660616516, + "grad_norm": 3.545149087905884, + "learning_rate": 5.041119596103413e-06, + "loss": 0.5824, + "step": 55890 + }, + { + "epoch": 0.6986674666866671, + "grad_norm": 0.00033957784762606025, + "learning_rate": 5.040361783787473e-06, + "loss": 0.0003, + "step": 55892 + }, + { + "epoch": 0.6986924673116828, + "grad_norm": 4.673576354980469, + "learning_rate": 5.03960400924344e-06, + "loss": 0.9763, + "step": 55894 + }, + { + "epoch": 0.6987174679366984, + "grad_norm": 3.9259893894195557, + "learning_rate": 5.038846272477078e-06, + "loss": 1.7342, + "step": 55896 + }, + { + "epoch": 0.6987424685617141, + "grad_norm": 5.486513614654541, + "learning_rate": 5.038088573494165e-06, + "loss": 1.8205, + "step": 55898 + }, + { + "epoch": 0.6987674691867297, + "grad_norm": 3.36047625541687, + "learning_rate": 5.037330912300464e-06, + "loss": 0.8926, + "step": 55900 + }, + { + "epoch": 0.6987924698117453, + "grad_norm": 0.0005851444439031184, + "learning_rate": 5.036573288901753e-06, + "loss": 0.5949, + "step": 55902 + }, + { + "epoch": 0.6988174704367609, + "grad_norm": 0.01479838602244854, + "learning_rate": 5.035815703303797e-06, + "loss": 0.7596, + "step": 55904 + }, + { + "epoch": 0.6988424710617765, + "grad_norm": 6.54729700088501, + "learning_rate": 5.035058155512364e-06, + "loss": 2.0169, + "step": 55906 + }, + { + "epoch": 0.6988674716867922, + "grad_norm": 2.0059304237365723, + "learning_rate": 5.0343006455332275e-06, + "loss": 0.7014, + "step": 55908 + }, + { + "epoch": 0.6988924723118078, + "grad_norm": 3.7163965702056885, + "learning_rate": 5.033543173372153e-06, + "loss": 1.3515, + "step": 55910 + }, + { + "epoch": 0.6989174729368234, + "grad_norm": 2.3402976989746094, + "learning_rate": 5.032785739034916e-06, + "loss": 0.8553, + "step": 55912 + }, + { + "epoch": 0.698942473561839, + "grad_norm": 7.7722392082214355, + "learning_rate": 5.03202834252728e-06, + "loss": 0.4095, + "step": 55914 + }, + { + "epoch": 0.6989674741868547, + "grad_norm": 3.75789213180542, + "learning_rate": 5.031270983855008e-06, + "loss": 0.553, + "step": 55916 + }, + { + "epoch": 0.6989924748118703, + "grad_norm": 4.1046295166015625, + "learning_rate": 5.030513663023879e-06, + "loss": 0.6362, + "step": 55918 + }, + { + "epoch": 0.699017475436886, + "grad_norm": 0.036804620176553726, + "learning_rate": 5.029756380039651e-06, + "loss": 1.5109, + "step": 55920 + }, + { + "epoch": 0.6990424760619015, + "grad_norm": 0.5429956912994385, + "learning_rate": 5.028999134908099e-06, + "loss": 0.261, + "step": 55922 + }, + { + "epoch": 0.6990674766869172, + "grad_norm": 8.384459495544434, + "learning_rate": 5.028241927634982e-06, + "loss": 1.5374, + "step": 55924 + }, + { + "epoch": 0.6990924773119328, + "grad_norm": 6.495294570922852, + "learning_rate": 5.027484758226074e-06, + "loss": 2.0096, + "step": 55926 + }, + { + "epoch": 0.6991174779369485, + "grad_norm": 0.0005019315867684782, + "learning_rate": 5.026727626687144e-06, + "loss": 0.0001, + "step": 55928 + }, + { + "epoch": 0.6991424785619641, + "grad_norm": 3.060774087905884, + "learning_rate": 5.025970533023947e-06, + "loss": 0.8491, + "step": 55930 + }, + { + "epoch": 0.6991674791869796, + "grad_norm": 4.186481475830078, + "learning_rate": 5.025213477242259e-06, + "loss": 0.9038, + "step": 55932 + }, + { + "epoch": 0.6991924798119953, + "grad_norm": 6.517246246337891, + "learning_rate": 5.024456459347837e-06, + "loss": 1.2434, + "step": 55934 + }, + { + "epoch": 0.6992174804370109, + "grad_norm": 3.2922377586364746, + "learning_rate": 5.023699479346455e-06, + "loss": 0.0506, + "step": 55936 + }, + { + "epoch": 0.6992424810620266, + "grad_norm": 4.873842239379883, + "learning_rate": 5.02294253724387e-06, + "loss": 1.1265, + "step": 55938 + }, + { + "epoch": 0.6992674816870422, + "grad_norm": 3.1445796489715576, + "learning_rate": 5.022185633045855e-06, + "loss": 1.4204, + "step": 55940 + }, + { + "epoch": 0.6992924823120578, + "grad_norm": 1.943274736404419, + "learning_rate": 5.021428766758169e-06, + "loss": 1.0566, + "step": 55942 + }, + { + "epoch": 0.6993174829370734, + "grad_norm": 2.854041337966919, + "learning_rate": 5.020671938386576e-06, + "loss": 0.5981, + "step": 55944 + }, + { + "epoch": 0.6993424835620891, + "grad_norm": 0.0043467809446156025, + "learning_rate": 5.019915147936845e-06, + "loss": 0.638, + "step": 55946 + }, + { + "epoch": 0.6993674841871047, + "grad_norm": 0.8191444277763367, + "learning_rate": 5.01915839541473e-06, + "loss": 0.7983, + "step": 55948 + }, + { + "epoch": 0.6993924848121204, + "grad_norm": 4.232560634613037, + "learning_rate": 5.018401680826001e-06, + "loss": 2.4653, + "step": 55950 + }, + { + "epoch": 0.6994174854371359, + "grad_norm": 9.40993881225586, + "learning_rate": 5.017645004176425e-06, + "loss": 2.2006, + "step": 55952 + }, + { + "epoch": 0.6994424860621515, + "grad_norm": 2.4783785343170166, + "learning_rate": 5.01688836547176e-06, + "loss": 0.6614, + "step": 55954 + }, + { + "epoch": 0.6994674866871672, + "grad_norm": 4.040619373321533, + "learning_rate": 5.016131764717769e-06, + "loss": 1.4931, + "step": 55956 + }, + { + "epoch": 0.6994924873121828, + "grad_norm": 3.3799939155578613, + "learning_rate": 5.015375201920209e-06, + "loss": 1.1138, + "step": 55958 + }, + { + "epoch": 0.6995174879371985, + "grad_norm": 3.3837947845458984, + "learning_rate": 5.014618677084851e-06, + "loss": 0.9036, + "step": 55960 + }, + { + "epoch": 0.699542488562214, + "grad_norm": 2.8109779357910156, + "learning_rate": 5.013862190217448e-06, + "loss": 1.4384, + "step": 55962 + }, + { + "epoch": 0.6995674891872297, + "grad_norm": 0.5922942161560059, + "learning_rate": 5.013105741323765e-06, + "loss": 0.6721, + "step": 55964 + }, + { + "epoch": 0.6995924898122453, + "grad_norm": 4.15576696395874, + "learning_rate": 5.012349330409567e-06, + "loss": 1.0912, + "step": 55966 + }, + { + "epoch": 0.699617490437261, + "grad_norm": 4.024199962615967, + "learning_rate": 5.011592957480612e-06, + "loss": 1.6122, + "step": 55968 + }, + { + "epoch": 0.6996424910622766, + "grad_norm": 0.942593514919281, + "learning_rate": 5.01083662254266e-06, + "loss": 0.0695, + "step": 55970 + }, + { + "epoch": 0.6996674916872921, + "grad_norm": 5.115944862365723, + "learning_rate": 5.0100803256014664e-06, + "loss": 1.6613, + "step": 55972 + }, + { + "epoch": 0.6996924923123078, + "grad_norm": 6.50339412689209, + "learning_rate": 5.009324066662795e-06, + "loss": 0.9082, + "step": 55974 + }, + { + "epoch": 0.6997174929373234, + "grad_norm": 7.249623775482178, + "learning_rate": 5.008567845732411e-06, + "loss": 1.5224, + "step": 55976 + }, + { + "epoch": 0.6997424935623391, + "grad_norm": 5.310179710388184, + "learning_rate": 5.007811662816062e-06, + "loss": 1.8519, + "step": 55978 + }, + { + "epoch": 0.6997674941873547, + "grad_norm": 4.7325639724731445, + "learning_rate": 5.007055517919523e-06, + "loss": 1.7721, + "step": 55980 + }, + { + "epoch": 0.6997924948123703, + "grad_norm": 1.88788902759552, + "learning_rate": 5.006299411048535e-06, + "loss": 0.4884, + "step": 55982 + }, + { + "epoch": 0.6998174954373859, + "grad_norm": 6.350560188293457, + "learning_rate": 5.0055433422088695e-06, + "loss": 1.8599, + "step": 55984 + }, + { + "epoch": 0.6998424960624016, + "grad_norm": 3.1114931106567383, + "learning_rate": 5.004787311406274e-06, + "loss": 0.2191, + "step": 55986 + }, + { + "epoch": 0.6998674966874172, + "grad_norm": 2.0242133140563965, + "learning_rate": 5.0040313186465125e-06, + "loss": 0.5913, + "step": 55988 + }, + { + "epoch": 0.6998924973124329, + "grad_norm": 3.0730485916137695, + "learning_rate": 5.003275363935347e-06, + "loss": 1.283, + "step": 55990 + }, + { + "epoch": 0.6999174979374484, + "grad_norm": 2.1636006832122803, + "learning_rate": 5.002519447278525e-06, + "loss": 0.6808, + "step": 55992 + }, + { + "epoch": 0.699942498562464, + "grad_norm": 2.876203775405884, + "learning_rate": 5.001763568681814e-06, + "loss": 1.0323, + "step": 55994 + }, + { + "epoch": 0.6999674991874797, + "grad_norm": 4.483892440795898, + "learning_rate": 5.001007728150959e-06, + "loss": 0.8166, + "step": 55996 + }, + { + "epoch": 0.6999924998124953, + "grad_norm": 0.0006083152838982642, + "learning_rate": 5.000251925691727e-06, + "loss": 0.5947, + "step": 55998 + }, + { + "epoch": 0.700017500437511, + "grad_norm": 4.384176731109619, + "learning_rate": 4.999496161309865e-06, + "loss": 2.0524, + "step": 56000 + }, + { + "epoch": 0.7000425010625265, + "grad_norm": 2.806307315826416, + "learning_rate": 4.998740435011132e-06, + "loss": 0.5381, + "step": 56002 + }, + { + "epoch": 0.7000675016875422, + "grad_norm": 3.1556460857391357, + "learning_rate": 4.997984746801289e-06, + "loss": 1.0592, + "step": 56004 + }, + { + "epoch": 0.7000925023125578, + "grad_norm": 4.469796180725098, + "learning_rate": 4.997229096686087e-06, + "loss": 1.9398, + "step": 56006 + }, + { + "epoch": 0.7001175029375735, + "grad_norm": 5.483685493469238, + "learning_rate": 4.996473484671281e-06, + "loss": 0.8177, + "step": 56008 + }, + { + "epoch": 0.7001425035625891, + "grad_norm": 2.1377758979797363, + "learning_rate": 4.9957179107626204e-06, + "loss": 0.4749, + "step": 56010 + }, + { + "epoch": 0.7001675041876047, + "grad_norm": 3.3119118213653564, + "learning_rate": 4.994962374965865e-06, + "loss": 0.0879, + "step": 56012 + }, + { + "epoch": 0.7001925048126203, + "grad_norm": 2.3895375728607178, + "learning_rate": 4.994206877286772e-06, + "loss": 0.7146, + "step": 56014 + }, + { + "epoch": 0.700217505437636, + "grad_norm": 0.025074204429984093, + "learning_rate": 4.9934514177310885e-06, + "loss": 0.0254, + "step": 56016 + }, + { + "epoch": 0.7002425060626516, + "grad_norm": 9.883782386779785, + "learning_rate": 4.992695996304575e-06, + "loss": 0.4731, + "step": 56018 + }, + { + "epoch": 0.7002675066876672, + "grad_norm": 0.0038622755091637373, + "learning_rate": 4.99194061301298e-06, + "loss": 0.337, + "step": 56020 + }, + { + "epoch": 0.7002925073126828, + "grad_norm": 0.5300626754760742, + "learning_rate": 4.991185267862057e-06, + "loss": 1.0658, + "step": 56022 + }, + { + "epoch": 0.7003175079376984, + "grad_norm": 2.6767306327819824, + "learning_rate": 4.990429960857556e-06, + "loss": 0.4254, + "step": 56024 + }, + { + "epoch": 0.7003425085627141, + "grad_norm": 6.122647762298584, + "learning_rate": 4.989674692005231e-06, + "loss": 1.167, + "step": 56026 + }, + { + "epoch": 0.7003675091877297, + "grad_norm": 4.646449089050293, + "learning_rate": 4.988919461310838e-06, + "loss": 1.1594, + "step": 56028 + }, + { + "epoch": 0.7003925098127454, + "grad_norm": 5.004050254821777, + "learning_rate": 4.988164268780124e-06, + "loss": 1.5169, + "step": 56030 + }, + { + "epoch": 0.7004175104377609, + "grad_norm": 6.143773078918457, + "learning_rate": 4.987409114418847e-06, + "loss": 0.9482, + "step": 56032 + }, + { + "epoch": 0.7004425110627766, + "grad_norm": 9.920243263244629, + "learning_rate": 4.986653998232749e-06, + "loss": 1.9271, + "step": 56034 + }, + { + "epoch": 0.7004675116877922, + "grad_norm": 3.748800754547119, + "learning_rate": 4.985898920227583e-06, + "loss": 0.7348, + "step": 56036 + }, + { + "epoch": 0.7004925123128078, + "grad_norm": 1.8554691076278687, + "learning_rate": 4.985143880409106e-06, + "loss": 0.4224, + "step": 56038 + }, + { + "epoch": 0.7005175129378235, + "grad_norm": 6.130740642547607, + "learning_rate": 4.9843888787830605e-06, + "loss": 1.6303, + "step": 56040 + }, + { + "epoch": 0.700542513562839, + "grad_norm": 3.171708345413208, + "learning_rate": 4.983633915355204e-06, + "loss": 0.4372, + "step": 56042 + }, + { + "epoch": 0.7005675141878547, + "grad_norm": 4.101770401000977, + "learning_rate": 4.982878990131278e-06, + "loss": 1.2574, + "step": 56044 + }, + { + "epoch": 0.7005925148128703, + "grad_norm": 2.218590259552002, + "learning_rate": 4.982124103117044e-06, + "loss": 0.1471, + "step": 56046 + }, + { + "epoch": 0.700617515437886, + "grad_norm": 2.52470326423645, + "learning_rate": 4.981369254318235e-06, + "loss": 0.3252, + "step": 56048 + }, + { + "epoch": 0.7006425160629016, + "grad_norm": 1.9668216705322266, + "learning_rate": 4.980614443740609e-06, + "loss": 1.3851, + "step": 56050 + }, + { + "epoch": 0.7006675166879172, + "grad_norm": 2.56762433052063, + "learning_rate": 4.9798596713899175e-06, + "loss": 0.4137, + "step": 56052 + }, + { + "epoch": 0.7006925173129328, + "grad_norm": 6.261695384979248, + "learning_rate": 4.9791049372719e-06, + "loss": 0.8118, + "step": 56054 + }, + { + "epoch": 0.7007175179379485, + "grad_norm": 0.007065965794026852, + "learning_rate": 4.978350241392313e-06, + "loss": 0.0026, + "step": 56056 + }, + { + "epoch": 0.7007425185629641, + "grad_norm": 7.205075263977051, + "learning_rate": 4.977595583756902e-06, + "loss": 1.6479, + "step": 56058 + }, + { + "epoch": 0.7007675191879797, + "grad_norm": 3.872631549835205, + "learning_rate": 4.976840964371412e-06, + "loss": 1.1204, + "step": 56060 + }, + { + "epoch": 0.7007925198129953, + "grad_norm": 2.5208141803741455, + "learning_rate": 4.976086383241587e-06, + "loss": 1.2067, + "step": 56062 + }, + { + "epoch": 0.7008175204380109, + "grad_norm": 3.793384552001953, + "learning_rate": 4.975331840373177e-06, + "loss": 0.3267, + "step": 56064 + }, + { + "epoch": 0.7008425210630266, + "grad_norm": 2.320995569229126, + "learning_rate": 4.974577335771934e-06, + "loss": 0.9505, + "step": 56066 + }, + { + "epoch": 0.7008675216880422, + "grad_norm": 2.8279128074645996, + "learning_rate": 4.973822869443595e-06, + "loss": 0.637, + "step": 56068 + }, + { + "epoch": 0.7008925223130579, + "grad_norm": 3.908684253692627, + "learning_rate": 4.9730684413939144e-06, + "loss": 1.0078, + "step": 56070 + }, + { + "epoch": 0.7009175229380734, + "grad_norm": 2.4819159507751465, + "learning_rate": 4.972314051628633e-06, + "loss": 1.5699, + "step": 56072 + }, + { + "epoch": 0.7009425235630891, + "grad_norm": 3.1221601963043213, + "learning_rate": 4.971559700153493e-06, + "loss": 1.4638, + "step": 56074 + }, + { + "epoch": 0.7009675241881047, + "grad_norm": 3.2728986740112305, + "learning_rate": 4.970805386974248e-06, + "loss": 1.4418, + "step": 56076 + }, + { + "epoch": 0.7009925248131204, + "grad_norm": 0.6446560621261597, + "learning_rate": 4.9700511120966335e-06, + "loss": 0.3169, + "step": 56078 + }, + { + "epoch": 0.701017525438136, + "grad_norm": 3.315638303756714, + "learning_rate": 4.969296875526403e-06, + "loss": 1.5352, + "step": 56080 + }, + { + "epoch": 0.7010425260631515, + "grad_norm": 2.4752941131591797, + "learning_rate": 4.968542677269291e-06, + "loss": 0.784, + "step": 56082 + }, + { + "epoch": 0.7010675266881672, + "grad_norm": 2.755959987640381, + "learning_rate": 4.967788517331055e-06, + "loss": 1.2094, + "step": 56084 + }, + { + "epoch": 0.7010925273131828, + "grad_norm": 3.915574550628662, + "learning_rate": 4.967034395717423e-06, + "loss": 0.9771, + "step": 56086 + }, + { + "epoch": 0.7011175279381985, + "grad_norm": 4.564250946044922, + "learning_rate": 4.966280312434144e-06, + "loss": 1.525, + "step": 56088 + }, + { + "epoch": 0.7011425285632141, + "grad_norm": 5.677192687988281, + "learning_rate": 4.965526267486967e-06, + "loss": 0.9527, + "step": 56090 + }, + { + "epoch": 0.7011675291882297, + "grad_norm": 0.004857060965150595, + "learning_rate": 4.964772260881625e-06, + "loss": 0.0067, + "step": 56092 + }, + { + "epoch": 0.7011925298132453, + "grad_norm": 1.6109404563903809, + "learning_rate": 4.96401829262387e-06, + "loss": 0.807, + "step": 56094 + }, + { + "epoch": 0.701217530438261, + "grad_norm": 0.8297342658042908, + "learning_rate": 4.9632643627194356e-06, + "loss": 0.4274, + "step": 56096 + }, + { + "epoch": 0.7012425310632766, + "grad_norm": 4.574345111846924, + "learning_rate": 4.962510471174072e-06, + "loss": 1.5172, + "step": 56098 + }, + { + "epoch": 0.7012675316882923, + "grad_norm": 3.432196855545044, + "learning_rate": 4.9617566179935164e-06, + "loss": 0.6993, + "step": 56100 + }, + { + "epoch": 0.7012925323133078, + "grad_norm": 3.091994047164917, + "learning_rate": 4.961002803183504e-06, + "loss": 0.4579, + "step": 56102 + }, + { + "epoch": 0.7013175329383234, + "grad_norm": 2.477783441543579, + "learning_rate": 4.960249026749789e-06, + "loss": 0.1535, + "step": 56104 + }, + { + "epoch": 0.7013425335633391, + "grad_norm": 5.279888153076172, + "learning_rate": 4.959495288698097e-06, + "loss": 1.7298, + "step": 56106 + }, + { + "epoch": 0.7013675341883547, + "grad_norm": 2.2378180027008057, + "learning_rate": 4.9587415890341825e-06, + "loss": 1.3363, + "step": 56108 + }, + { + "epoch": 0.7013925348133704, + "grad_norm": 1.671736478805542, + "learning_rate": 4.957987927763779e-06, + "loss": 1.3056, + "step": 56110 + }, + { + "epoch": 0.7014175354383859, + "grad_norm": 0.9412493705749512, + "learning_rate": 4.957234304892622e-06, + "loss": 0.2022, + "step": 56112 + }, + { + "epoch": 0.7014425360634016, + "grad_norm": 0.0006829231278970838, + "learning_rate": 4.9564807204264585e-06, + "loss": 0.164, + "step": 56114 + }, + { + "epoch": 0.7014675366884172, + "grad_norm": 3.2535696029663086, + "learning_rate": 4.955727174371022e-06, + "loss": 0.5675, + "step": 56116 + }, + { + "epoch": 0.7014925373134329, + "grad_norm": 4.956060886383057, + "learning_rate": 4.954973666732058e-06, + "loss": 0.3071, + "step": 56118 + }, + { + "epoch": 0.7015175379384485, + "grad_norm": 2.0000526905059814, + "learning_rate": 4.954220197515296e-06, + "loss": 0.4983, + "step": 56120 + }, + { + "epoch": 0.701542538563464, + "grad_norm": 1.562709927558899, + "learning_rate": 4.953466766726485e-06, + "loss": 0.0442, + "step": 56122 + }, + { + "epoch": 0.7015675391884797, + "grad_norm": 4.137310981750488, + "learning_rate": 4.952713374371357e-06, + "loss": 1.4906, + "step": 56124 + }, + { + "epoch": 0.7015925398134953, + "grad_norm": 3.4083762168884277, + "learning_rate": 4.951960020455645e-06, + "loss": 1.115, + "step": 56126 + }, + { + "epoch": 0.701617540438511, + "grad_norm": 7.5364484786987305, + "learning_rate": 4.951206704985097e-06, + "loss": 1.0375, + "step": 56128 + }, + { + "epoch": 0.7016425410635266, + "grad_norm": 3.3548386096954346, + "learning_rate": 4.950453427965439e-06, + "loss": 1.319, + "step": 56130 + }, + { + "epoch": 0.7016675416885422, + "grad_norm": 4.0377421379089355, + "learning_rate": 4.9497001894024185e-06, + "loss": 1.4866, + "step": 56132 + }, + { + "epoch": 0.7016925423135578, + "grad_norm": 3.2680466175079346, + "learning_rate": 4.948946989301764e-06, + "loss": 0.7792, + "step": 56134 + }, + { + "epoch": 0.7017175429385735, + "grad_norm": 0.00417143851518631, + "learning_rate": 4.948193827669219e-06, + "loss": 1.0566, + "step": 56136 + }, + { + "epoch": 0.7017425435635891, + "grad_norm": 5.680421352386475, + "learning_rate": 4.947440704510514e-06, + "loss": 1.3146, + "step": 56138 + }, + { + "epoch": 0.7017675441886048, + "grad_norm": 3.2453453540802, + "learning_rate": 4.946687619831383e-06, + "loss": 2.0518, + "step": 56140 + }, + { + "epoch": 0.7017925448136203, + "grad_norm": 3.100752830505371, + "learning_rate": 4.945934573637569e-06, + "loss": 1.343, + "step": 56142 + }, + { + "epoch": 0.701817545438636, + "grad_norm": 2.981917142868042, + "learning_rate": 4.945181565934798e-06, + "loss": 0.7681, + "step": 56144 + }, + { + "epoch": 0.7018425460636516, + "grad_norm": 4.349386692047119, + "learning_rate": 4.944428596728813e-06, + "loss": 1.3507, + "step": 56146 + }, + { + "epoch": 0.7018675466886672, + "grad_norm": 2.0988547801971436, + "learning_rate": 4.943675666025342e-06, + "loss": 1.086, + "step": 56148 + }, + { + "epoch": 0.7018925473136829, + "grad_norm": 2.620352029800415, + "learning_rate": 4.9429227738301245e-06, + "loss": 0.8825, + "step": 56150 + }, + { + "epoch": 0.7019175479386984, + "grad_norm": 3.8668124675750732, + "learning_rate": 4.942169920148892e-06, + "loss": 0.9387, + "step": 56152 + }, + { + "epoch": 0.7019425485637141, + "grad_norm": 1.9751253128051758, + "learning_rate": 4.941417104987376e-06, + "loss": 0.8399, + "step": 56154 + }, + { + "epoch": 0.7019675491887297, + "grad_norm": 5.368791580200195, + "learning_rate": 4.940664328351315e-06, + "loss": 0.9085, + "step": 56156 + }, + { + "epoch": 0.7019925498137454, + "grad_norm": 4.705262184143066, + "learning_rate": 4.939911590246435e-06, + "loss": 1.3074, + "step": 56158 + }, + { + "epoch": 0.702017550438761, + "grad_norm": 4.863358974456787, + "learning_rate": 4.939158890678476e-06, + "loss": 1.572, + "step": 56160 + }, + { + "epoch": 0.7020425510637766, + "grad_norm": 4.433504581451416, + "learning_rate": 4.938406229653169e-06, + "loss": 1.2833, + "step": 56162 + }, + { + "epoch": 0.7020675516887922, + "grad_norm": 5.491816997528076, + "learning_rate": 4.93765360717624e-06, + "loss": 2.1294, + "step": 56164 + }, + { + "epoch": 0.7020925523138078, + "grad_norm": 4.147956371307373, + "learning_rate": 4.936901023253427e-06, + "loss": 1.4524, + "step": 56166 + }, + { + "epoch": 0.7021175529388235, + "grad_norm": 3.41689133644104, + "learning_rate": 4.936148477890457e-06, + "loss": 1.0272, + "step": 56168 + }, + { + "epoch": 0.7021425535638391, + "grad_norm": 4.618997573852539, + "learning_rate": 4.9353959710930695e-06, + "loss": 1.2947, + "step": 56170 + }, + { + "epoch": 0.7021675541888547, + "grad_norm": 3.706571340560913, + "learning_rate": 4.934643502866985e-06, + "loss": 0.716, + "step": 56172 + }, + { + "epoch": 0.7021925548138703, + "grad_norm": 0.7081949710845947, + "learning_rate": 4.933891073217944e-06, + "loss": 0.5727, + "step": 56174 + }, + { + "epoch": 0.702217555438886, + "grad_norm": 2.3585073947906494, + "learning_rate": 4.933138682151671e-06, + "loss": 1.6791, + "step": 56176 + }, + { + "epoch": 0.7022425560639016, + "grad_norm": 0.780845046043396, + "learning_rate": 4.932386329673894e-06, + "loss": 0.8557, + "step": 56178 + }, + { + "epoch": 0.7022675566889173, + "grad_norm": 4.70845890045166, + "learning_rate": 4.93163401579035e-06, + "loss": 0.8985, + "step": 56180 + }, + { + "epoch": 0.7022925573139328, + "grad_norm": 4.734375476837158, + "learning_rate": 4.93088174050676e-06, + "loss": 0.8423, + "step": 56182 + }, + { + "epoch": 0.7023175579389485, + "grad_norm": 4.369705677032471, + "learning_rate": 4.930129503828863e-06, + "loss": 0.4594, + "step": 56184 + }, + { + "epoch": 0.7023425585639641, + "grad_norm": 2.3633909225463867, + "learning_rate": 4.929377305762379e-06, + "loss": 1.1638, + "step": 56186 + }, + { + "epoch": 0.7023675591889798, + "grad_norm": 3.5709450244903564, + "learning_rate": 4.928625146313042e-06, + "loss": 1.1318, + "step": 56188 + }, + { + "epoch": 0.7023925598139954, + "grad_norm": 2.0983219146728516, + "learning_rate": 4.92787302548658e-06, + "loss": 0.8316, + "step": 56190 + }, + { + "epoch": 0.7024175604390109, + "grad_norm": 0.0009287820430472493, + "learning_rate": 4.927120943288717e-06, + "loss": 0.6211, + "step": 56192 + }, + { + "epoch": 0.7024425610640266, + "grad_norm": 2.9741768836975098, + "learning_rate": 4.9263688997251855e-06, + "loss": 0.9363, + "step": 56194 + }, + { + "epoch": 0.7024675616890422, + "grad_norm": 0.0007634261273778975, + "learning_rate": 4.9256168948017095e-06, + "loss": 0.662, + "step": 56196 + }, + { + "epoch": 0.7024925623140579, + "grad_norm": 3.118438720703125, + "learning_rate": 4.924864928524019e-06, + "loss": 0.33, + "step": 56198 + }, + { + "epoch": 0.7025175629390735, + "grad_norm": 2.307555675506592, + "learning_rate": 4.924113000897837e-06, + "loss": 0.4964, + "step": 56200 + }, + { + "epoch": 0.7025425635640891, + "grad_norm": 1.8779479265213013, + "learning_rate": 4.923361111928896e-06, + "loss": 0.1134, + "step": 56202 + }, + { + "epoch": 0.7025675641891047, + "grad_norm": 2.533388137817383, + "learning_rate": 4.92260926162292e-06, + "loss": 0.9575, + "step": 56204 + }, + { + "epoch": 0.7025925648141204, + "grad_norm": 4.322776794433594, + "learning_rate": 4.921857449985628e-06, + "loss": 1.5503, + "step": 56206 + }, + { + "epoch": 0.702617565439136, + "grad_norm": 2.5789387226104736, + "learning_rate": 4.921105677022758e-06, + "loss": 1.7859, + "step": 56208 + }, + { + "epoch": 0.7026425660641517, + "grad_norm": 2.3593995571136475, + "learning_rate": 4.9203539427400226e-06, + "loss": 1.0739, + "step": 56210 + }, + { + "epoch": 0.7026675666891672, + "grad_norm": 0.0028185141272842884, + "learning_rate": 4.919602247143158e-06, + "loss": 0.0001, + "step": 56212 + }, + { + "epoch": 0.7026925673141828, + "grad_norm": 0.1961909681558609, + "learning_rate": 4.918850590237884e-06, + "loss": 0.3806, + "step": 56214 + }, + { + "epoch": 0.7027175679391985, + "grad_norm": 0.0015076815616339445, + "learning_rate": 4.918098972029922e-06, + "loss": 0.0341, + "step": 56216 + }, + { + "epoch": 0.7027425685642141, + "grad_norm": 0.1271953135728836, + "learning_rate": 4.917347392525004e-06, + "loss": 0.0029, + "step": 56218 + }, + { + "epoch": 0.7027675691892298, + "grad_norm": 4.113500595092773, + "learning_rate": 4.916595851728845e-06, + "loss": 1.9246, + "step": 56220 + }, + { + "epoch": 0.7027925698142453, + "grad_norm": 0.5470415949821472, + "learning_rate": 4.915844349647177e-06, + "loss": 0.9844, + "step": 56222 + }, + { + "epoch": 0.702817570439261, + "grad_norm": 0.1467929631471634, + "learning_rate": 4.915092886285715e-06, + "loss": 0.0105, + "step": 56224 + }, + { + "epoch": 0.7028425710642766, + "grad_norm": 3.1056501865386963, + "learning_rate": 4.91434146165019e-06, + "loss": 0.4555, + "step": 56226 + }, + { + "epoch": 0.7028675716892923, + "grad_norm": 1.6819480657577515, + "learning_rate": 4.913590075746323e-06, + "loss": 0.6267, + "step": 56228 + }, + { + "epoch": 0.7028925723143079, + "grad_norm": 2.1472296714782715, + "learning_rate": 4.912838728579829e-06, + "loss": 0.7734, + "step": 56230 + }, + { + "epoch": 0.7029175729393234, + "grad_norm": 0.24288254976272583, + "learning_rate": 4.9120874201564415e-06, + "loss": 0.6749, + "step": 56232 + }, + { + "epoch": 0.7029425735643391, + "grad_norm": 4.1846489906311035, + "learning_rate": 4.911336150481873e-06, + "loss": 1.3871, + "step": 56234 + }, + { + "epoch": 0.7029675741893547, + "grad_norm": 3.942094087600708, + "learning_rate": 4.910584919561851e-06, + "loss": 1.1898, + "step": 56236 + }, + { + "epoch": 0.7029925748143704, + "grad_norm": 0.0009161317721009254, + "learning_rate": 4.909833727402092e-06, + "loss": 0.01, + "step": 56238 + }, + { + "epoch": 0.703017575439386, + "grad_norm": 2.813507080078125, + "learning_rate": 4.909082574008323e-06, + "loss": 0.6623, + "step": 56240 + }, + { + "epoch": 0.7030425760644016, + "grad_norm": 3.794511556625366, + "learning_rate": 4.90833145938626e-06, + "loss": 1.5815, + "step": 56242 + }, + { + "epoch": 0.7030675766894172, + "grad_norm": 8.527420043945312, + "learning_rate": 4.907580383541622e-06, + "loss": 1.1967, + "step": 56244 + }, + { + "epoch": 0.7030925773144329, + "grad_norm": 0.4046018719673157, + "learning_rate": 4.906829346480135e-06, + "loss": 0.8429, + "step": 56246 + }, + { + "epoch": 0.7031175779394485, + "grad_norm": 1.6411898136138916, + "learning_rate": 4.906078348207511e-06, + "loss": 1.314, + "step": 56248 + }, + { + "epoch": 0.7031425785644642, + "grad_norm": 3.365614891052246, + "learning_rate": 4.905327388729479e-06, + "loss": 1.7957, + "step": 56250 + }, + { + "epoch": 0.7031675791894797, + "grad_norm": 3.1495633125305176, + "learning_rate": 4.90457646805175e-06, + "loss": 1.3734, + "step": 56252 + }, + { + "epoch": 0.7031925798144953, + "grad_norm": 3.0955846309661865, + "learning_rate": 4.903825586180048e-06, + "loss": 1.2484, + "step": 56254 + }, + { + "epoch": 0.703217580439511, + "grad_norm": 2.3552238941192627, + "learning_rate": 4.903074743120091e-06, + "loss": 0.4458, + "step": 56256 + }, + { + "epoch": 0.7032425810645266, + "grad_norm": 0.0007110409787856042, + "learning_rate": 4.902323938877591e-06, + "loss": 1.3107, + "step": 56258 + }, + { + "epoch": 0.7032675816895423, + "grad_norm": 4.00529146194458, + "learning_rate": 4.901573173458275e-06, + "loss": 1.0887, + "step": 56260 + }, + { + "epoch": 0.7032925823145578, + "grad_norm": 4.027798652648926, + "learning_rate": 4.900822446867853e-06, + "loss": 0.7161, + "step": 56262 + }, + { + "epoch": 0.7033175829395735, + "grad_norm": 2.3993654251098633, + "learning_rate": 4.900071759112052e-06, + "loss": 0.0767, + "step": 56264 + }, + { + "epoch": 0.7033425835645891, + "grad_norm": 0.6541582345962524, + "learning_rate": 4.899321110196576e-06, + "loss": 0.9529, + "step": 56266 + }, + { + "epoch": 0.7033675841896048, + "grad_norm": 0.0005678880261257291, + "learning_rate": 4.898570500127156e-06, + "loss": 1.077, + "step": 56268 + }, + { + "epoch": 0.7033925848146204, + "grad_norm": 2.523599147796631, + "learning_rate": 4.897819928909501e-06, + "loss": 0.6659, + "step": 56270 + }, + { + "epoch": 0.703417585439636, + "grad_norm": 2.7036304473876953, + "learning_rate": 4.897069396549323e-06, + "loss": 1.6505, + "step": 56272 + }, + { + "epoch": 0.7034425860646516, + "grad_norm": 0.0007645124569535255, + "learning_rate": 4.896318903052347e-06, + "loss": 0.0242, + "step": 56274 + }, + { + "epoch": 0.7034675866896672, + "grad_norm": 1.3352272510528564, + "learning_rate": 4.895568448424281e-06, + "loss": 1.3142, + "step": 56276 + }, + { + "epoch": 0.7034925873146829, + "grad_norm": 0.002695131115615368, + "learning_rate": 4.894818032670847e-06, + "loss": 0.0238, + "step": 56278 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 3.8215885162353516, + "learning_rate": 4.894067655797757e-06, + "loss": 1.9065, + "step": 56280 + }, + { + "epoch": 0.7035425885647141, + "grad_norm": 4.568901538848877, + "learning_rate": 4.893317317810723e-06, + "loss": 1.3708, + "step": 56282 + }, + { + "epoch": 0.7035675891897297, + "grad_norm": 2.5554511547088623, + "learning_rate": 4.892567018715465e-06, + "loss": 0.5951, + "step": 56284 + }, + { + "epoch": 0.7035925898147454, + "grad_norm": 5.413978099822998, + "learning_rate": 4.891816758517692e-06, + "loss": 1.3055, + "step": 56286 + }, + { + "epoch": 0.703617590439761, + "grad_norm": 2.5878894329071045, + "learning_rate": 4.8910665372231226e-06, + "loss": 1.057, + "step": 56288 + }, + { + "epoch": 0.7036425910647767, + "grad_norm": 0.0024533329997211695, + "learning_rate": 4.890316354837464e-06, + "loss": 0.3385, + "step": 56290 + }, + { + "epoch": 0.7036675916897922, + "grad_norm": 3.8076751232147217, + "learning_rate": 4.889566211366434e-06, + "loss": 1.1586, + "step": 56292 + }, + { + "epoch": 0.7036925923148079, + "grad_norm": 3.827638864517212, + "learning_rate": 4.888816106815752e-06, + "loss": 0.9422, + "step": 56294 + }, + { + "epoch": 0.7037175929398235, + "grad_norm": 3.399787187576294, + "learning_rate": 4.8880660411911175e-06, + "loss": 1.0271, + "step": 56296 + }, + { + "epoch": 0.7037425935648391, + "grad_norm": 3.9494245052337646, + "learning_rate": 4.887316014498252e-06, + "loss": 0.4674, + "step": 56298 + }, + { + "epoch": 0.7037675941898548, + "grad_norm": 4.456002712249756, + "learning_rate": 4.886566026742861e-06, + "loss": 2.017, + "step": 56300 + }, + { + "epoch": 0.7037925948148703, + "grad_norm": 0.11152353137731552, + "learning_rate": 4.885816077930664e-06, + "loss": 0.2829, + "step": 56302 + }, + { + "epoch": 0.703817595439886, + "grad_norm": 5.6995673179626465, + "learning_rate": 4.885066168067365e-06, + "loss": 2.0397, + "step": 56304 + }, + { + "epoch": 0.7038425960649016, + "grad_norm": 0.016786469146609306, + "learning_rate": 4.884316297158682e-06, + "loss": 1.2101, + "step": 56306 + }, + { + "epoch": 0.7038675966899173, + "grad_norm": 0.41552287340164185, + "learning_rate": 4.8835664652103224e-06, + "loss": 0.6311, + "step": 56308 + }, + { + "epoch": 0.7038925973149329, + "grad_norm": 9.630607604980469, + "learning_rate": 4.882816672227994e-06, + "loss": 0.5798, + "step": 56310 + }, + { + "epoch": 0.7039175979399485, + "grad_norm": 0.0011286166263744235, + "learning_rate": 4.882066918217414e-06, + "loss": 0.0952, + "step": 56312 + }, + { + "epoch": 0.7039425985649641, + "grad_norm": 0.001197759178467095, + "learning_rate": 4.881317203184284e-06, + "loss": 0.3351, + "step": 56314 + }, + { + "epoch": 0.7039675991899798, + "grad_norm": 2.6638827323913574, + "learning_rate": 4.880567527134323e-06, + "loss": 0.4691, + "step": 56316 + }, + { + "epoch": 0.7039925998149954, + "grad_norm": 3.659414052963257, + "learning_rate": 4.879817890073232e-06, + "loss": 0.1533, + "step": 56318 + }, + { + "epoch": 0.704017600440011, + "grad_norm": 0.0011447344440966845, + "learning_rate": 4.8790682920067265e-06, + "loss": 0.9158, + "step": 56320 + }, + { + "epoch": 0.7040426010650266, + "grad_norm": 1.78746497631073, + "learning_rate": 4.878318732940513e-06, + "loss": 0.9373, + "step": 56322 + }, + { + "epoch": 0.7040676016900422, + "grad_norm": 5.2270989418029785, + "learning_rate": 4.877569212880298e-06, + "loss": 1.6708, + "step": 56324 + }, + { + "epoch": 0.7040926023150579, + "grad_norm": 2.857201337814331, + "learning_rate": 4.876819731831793e-06, + "loss": 1.6041, + "step": 56326 + }, + { + "epoch": 0.7041176029400735, + "grad_norm": 2.9270999431610107, + "learning_rate": 4.876070289800702e-06, + "loss": 0.7152, + "step": 56328 + }, + { + "epoch": 0.7041426035650892, + "grad_norm": 3.3739428520202637, + "learning_rate": 4.875320886792733e-06, + "loss": 0.1151, + "step": 56330 + }, + { + "epoch": 0.7041676041901047, + "grad_norm": 2.795865297317505, + "learning_rate": 4.874571522813606e-06, + "loss": 0.6203, + "step": 56332 + }, + { + "epoch": 0.7041926048151204, + "grad_norm": 0.001093789585866034, + "learning_rate": 4.873822197869008e-06, + "loss": 0.3866, + "step": 56334 + }, + { + "epoch": 0.704217605440136, + "grad_norm": 2.137364149093628, + "learning_rate": 4.873072911964659e-06, + "loss": 1.1081, + "step": 56336 + }, + { + "epoch": 0.7042426060651517, + "grad_norm": 0.526490330696106, + "learning_rate": 4.872323665106259e-06, + "loss": 0.1629, + "step": 56338 + }, + { + "epoch": 0.7042676066901673, + "grad_norm": 3.753462791442871, + "learning_rate": 4.871574457299519e-06, + "loss": 1.0647, + "step": 56340 + }, + { + "epoch": 0.7042926073151828, + "grad_norm": 0.048799656331539154, + "learning_rate": 4.8708252885501405e-06, + "loss": 0.5257, + "step": 56342 + }, + { + "epoch": 0.7043176079401985, + "grad_norm": 0.6420413851737976, + "learning_rate": 4.870076158863832e-06, + "loss": 0.2416, + "step": 56344 + }, + { + "epoch": 0.7043426085652141, + "grad_norm": 1.1870638132095337, + "learning_rate": 4.869327068246303e-06, + "loss": 0.0613, + "step": 56346 + }, + { + "epoch": 0.7043676091902298, + "grad_norm": 13.196566581726074, + "learning_rate": 4.868578016703247e-06, + "loss": 2.1445, + "step": 56348 + }, + { + "epoch": 0.7043926098152454, + "grad_norm": 0.9785969853401184, + "learning_rate": 4.8678290042403795e-06, + "loss": 0.0528, + "step": 56350 + }, + { + "epoch": 0.704417610440261, + "grad_norm": 4.854925632476807, + "learning_rate": 4.867080030863397e-06, + "loss": 1.2017, + "step": 56352 + }, + { + "epoch": 0.7044426110652766, + "grad_norm": 1.2284975051879883, + "learning_rate": 4.8663310965780064e-06, + "loss": 0.4734, + "step": 56354 + }, + { + "epoch": 0.7044676116902923, + "grad_norm": 1.8233760595321655, + "learning_rate": 4.865582201389915e-06, + "loss": 0.0511, + "step": 56356 + }, + { + "epoch": 0.7044926123153079, + "grad_norm": 3.52872633934021, + "learning_rate": 4.864833345304826e-06, + "loss": 0.6404, + "step": 56358 + }, + { + "epoch": 0.7045176129403236, + "grad_norm": 0.15562967956066132, + "learning_rate": 4.864084528328439e-06, + "loss": 0.0022, + "step": 56360 + }, + { + "epoch": 0.7045426135653391, + "grad_norm": 0.6130157113075256, + "learning_rate": 4.863335750466453e-06, + "loss": 0.0231, + "step": 56362 + }, + { + "epoch": 0.7045676141903547, + "grad_norm": 4.028345108032227, + "learning_rate": 4.86258701172458e-06, + "loss": 0.9923, + "step": 56364 + }, + { + "epoch": 0.7045926148153704, + "grad_norm": 5.713435173034668, + "learning_rate": 4.861838312108514e-06, + "loss": 1.3042, + "step": 56366 + }, + { + "epoch": 0.704617615440386, + "grad_norm": 3.4345788955688477, + "learning_rate": 4.8610896516239615e-06, + "loss": 0.5326, + "step": 56368 + }, + { + "epoch": 0.7046426160654017, + "grad_norm": 3.634819269180298, + "learning_rate": 4.860341030276627e-06, + "loss": 1.6165, + "step": 56370 + }, + { + "epoch": 0.7046676166904172, + "grad_norm": 5.030182838439941, + "learning_rate": 4.859592448072208e-06, + "loss": 1.4627, + "step": 56372 + }, + { + "epoch": 0.7046926173154329, + "grad_norm": 4.620659828186035, + "learning_rate": 4.858843905016406e-06, + "loss": 0.8665, + "step": 56374 + }, + { + "epoch": 0.7047176179404485, + "grad_norm": 2.354865312576294, + "learning_rate": 4.8580954011149185e-06, + "loss": 0.2199, + "step": 56376 + }, + { + "epoch": 0.7047426185654642, + "grad_norm": 1.0061185359954834, + "learning_rate": 4.857346936373453e-06, + "loss": 0.1836, + "step": 56378 + }, + { + "epoch": 0.7047676191904798, + "grad_norm": 4.370864391326904, + "learning_rate": 4.856598510797702e-06, + "loss": 1.288, + "step": 56380 + }, + { + "epoch": 0.7047926198154953, + "grad_norm": 0.0005604828475043178, + "learning_rate": 4.85585012439337e-06, + "loss": 0.556, + "step": 56382 + }, + { + "epoch": 0.704817620440511, + "grad_norm": 5.3805131912231445, + "learning_rate": 4.855101777166163e-06, + "loss": 1.5655, + "step": 56384 + }, + { + "epoch": 0.7048426210655266, + "grad_norm": 3.9634671211242676, + "learning_rate": 4.854353469121765e-06, + "loss": 1.2787, + "step": 56386 + }, + { + "epoch": 0.7048676216905423, + "grad_norm": 4.2364606857299805, + "learning_rate": 4.8536052002658885e-06, + "loss": 1.0827, + "step": 56388 + }, + { + "epoch": 0.7048926223155579, + "grad_norm": 3.5224061012268066, + "learning_rate": 4.852856970604222e-06, + "loss": 1.3172, + "step": 56390 + }, + { + "epoch": 0.7049176229405735, + "grad_norm": 5.182425498962402, + "learning_rate": 4.852108780142469e-06, + "loss": 1.6471, + "step": 56392 + }, + { + "epoch": 0.7049426235655891, + "grad_norm": 2.6238911151885986, + "learning_rate": 4.8513606288863325e-06, + "loss": 1.0708, + "step": 56394 + }, + { + "epoch": 0.7049676241906048, + "grad_norm": 6.774723052978516, + "learning_rate": 4.850612516841501e-06, + "loss": 1.6784, + "step": 56396 + }, + { + "epoch": 0.7049926248156204, + "grad_norm": 0.20049268007278442, + "learning_rate": 4.849864444013685e-06, + "loss": 0.1752, + "step": 56398 + }, + { + "epoch": 0.7050176254406361, + "grad_norm": 1.6603679656982422, + "learning_rate": 4.849116410408563e-06, + "loss": 0.8932, + "step": 56400 + }, + { + "epoch": 0.7050426260656516, + "grad_norm": 1.0065922737121582, + "learning_rate": 4.848368416031847e-06, + "loss": 0.4658, + "step": 56402 + }, + { + "epoch": 0.7050676266906672, + "grad_norm": 4.564453125, + "learning_rate": 4.8476204608892254e-06, + "loss": 1.455, + "step": 56404 + }, + { + "epoch": 0.7050926273156829, + "grad_norm": 0.0026774145662784576, + "learning_rate": 4.8468725449863975e-06, + "loss": 0.6202, + "step": 56406 + }, + { + "epoch": 0.7051176279406985, + "grad_norm": 4.491024971008301, + "learning_rate": 4.846124668329063e-06, + "loss": 0.8628, + "step": 56408 + }, + { + "epoch": 0.7051426285657142, + "grad_norm": 2.9019644260406494, + "learning_rate": 4.8453768309229155e-06, + "loss": 0.9533, + "step": 56410 + }, + { + "epoch": 0.7051676291907297, + "grad_norm": 0.007016643416136503, + "learning_rate": 4.844629032773648e-06, + "loss": 0.2182, + "step": 56412 + }, + { + "epoch": 0.7051926298157454, + "grad_norm": 0.0012233657762408257, + "learning_rate": 4.843881273886952e-06, + "loss": 0.366, + "step": 56414 + }, + { + "epoch": 0.705217630440761, + "grad_norm": 0.8194915652275085, + "learning_rate": 4.843133554268529e-06, + "loss": 0.568, + "step": 56416 + }, + { + "epoch": 0.7052426310657767, + "grad_norm": 0.0050656464882195, + "learning_rate": 4.842385873924075e-06, + "loss": 0.755, + "step": 56418 + }, + { + "epoch": 0.7052676316907923, + "grad_norm": 3.085357189178467, + "learning_rate": 4.841638232859276e-06, + "loss": 1.2075, + "step": 56420 + }, + { + "epoch": 0.7052926323158079, + "grad_norm": 7.036930561065674, + "learning_rate": 4.8408906310798355e-06, + "loss": 0.5461, + "step": 56422 + }, + { + "epoch": 0.7053176329408235, + "grad_norm": 5.437419891357422, + "learning_rate": 4.840143068591442e-06, + "loss": 1.1894, + "step": 56424 + }, + { + "epoch": 0.7053426335658391, + "grad_norm": 3.020188808441162, + "learning_rate": 4.8393955453997886e-06, + "loss": 2.5948, + "step": 56426 + }, + { + "epoch": 0.7053676341908548, + "grad_norm": 3.161496877670288, + "learning_rate": 4.838648061510567e-06, + "loss": 0.2022, + "step": 56428 + }, + { + "epoch": 0.7053926348158704, + "grad_norm": 3.5036356449127197, + "learning_rate": 4.837900616929469e-06, + "loss": 0.9505, + "step": 56430 + }, + { + "epoch": 0.705417635440886, + "grad_norm": 0.003056109184399247, + "learning_rate": 4.837153211662197e-06, + "loss": 0.8491, + "step": 56432 + }, + { + "epoch": 0.7054426360659016, + "grad_norm": 5.721590995788574, + "learning_rate": 4.836405845714431e-06, + "loss": 0.7251, + "step": 56434 + }, + { + "epoch": 0.7054676366909173, + "grad_norm": 3.511079788208008, + "learning_rate": 4.835658519091874e-06, + "loss": 1.0102, + "step": 56436 + }, + { + "epoch": 0.7054926373159329, + "grad_norm": 3.5131378173828125, + "learning_rate": 4.834911231800205e-06, + "loss": 1.1894, + "step": 56438 + }, + { + "epoch": 0.7055176379409486, + "grad_norm": 5.2479939460754395, + "learning_rate": 4.834163983845126e-06, + "loss": 1.6869, + "step": 56440 + }, + { + "epoch": 0.7055426385659641, + "grad_norm": 1.5885769128799438, + "learning_rate": 4.833416775232319e-06, + "loss": 0.4051, + "step": 56442 + }, + { + "epoch": 0.7055676391909798, + "grad_norm": 0.0012840897543355823, + "learning_rate": 4.832669605967479e-06, + "loss": 0.04, + "step": 56444 + }, + { + "epoch": 0.7055926398159954, + "grad_norm": 0.0007936353795230389, + "learning_rate": 4.831922476056301e-06, + "loss": 0.8567, + "step": 56446 + }, + { + "epoch": 0.705617640441011, + "grad_norm": 4.568698406219482, + "learning_rate": 4.831175385504466e-06, + "loss": 1.4008, + "step": 56448 + }, + { + "epoch": 0.7056426410660267, + "grad_norm": 5.50263786315918, + "learning_rate": 4.830428334317677e-06, + "loss": 1.2114, + "step": 56450 + }, + { + "epoch": 0.7056676416910422, + "grad_norm": 12.536057472229004, + "learning_rate": 4.829681322501607e-06, + "loss": 0.4084, + "step": 56452 + }, + { + "epoch": 0.7056926423160579, + "grad_norm": 7.8674635887146, + "learning_rate": 4.828934350061952e-06, + "loss": 1.1226, + "step": 56454 + }, + { + "epoch": 0.7057176429410735, + "grad_norm": 2.6003663539886475, + "learning_rate": 4.8281874170044065e-06, + "loss": 1.7284, + "step": 56456 + }, + { + "epoch": 0.7057426435660892, + "grad_norm": 7.780792236328125, + "learning_rate": 4.82744052333465e-06, + "loss": 0.582, + "step": 56458 + }, + { + "epoch": 0.7057676441911048, + "grad_norm": 2.8421216011047363, + "learning_rate": 4.826693669058379e-06, + "loss": 0.5098, + "step": 56460 + }, + { + "epoch": 0.7057926448161204, + "grad_norm": 2.304385185241699, + "learning_rate": 4.825946854181277e-06, + "loss": 0.5594, + "step": 56462 + }, + { + "epoch": 0.705817645441136, + "grad_norm": 3.3542022705078125, + "learning_rate": 4.825200078709032e-06, + "loss": 0.7489, + "step": 56464 + }, + { + "epoch": 0.7058426460661517, + "grad_norm": 4.502331733703613, + "learning_rate": 4.8244533426473285e-06, + "loss": 0.2389, + "step": 56466 + }, + { + "epoch": 0.7058676466911673, + "grad_norm": 3.9940736293792725, + "learning_rate": 4.823706646001856e-06, + "loss": 0.6576, + "step": 56468 + }, + { + "epoch": 0.705892647316183, + "grad_norm": 4.156460762023926, + "learning_rate": 4.822959988778305e-06, + "loss": 1.0317, + "step": 56470 + }, + { + "epoch": 0.7059176479411985, + "grad_norm": 2.7197206020355225, + "learning_rate": 4.822213370982356e-06, + "loss": 0.7511, + "step": 56472 + }, + { + "epoch": 0.7059426485662141, + "grad_norm": 5.30215311050415, + "learning_rate": 4.8214667926197005e-06, + "loss": 2.5305, + "step": 56474 + }, + { + "epoch": 0.7059676491912298, + "grad_norm": 4.700710773468018, + "learning_rate": 4.820720253696022e-06, + "loss": 1.4846, + "step": 56476 + }, + { + "epoch": 0.7059926498162454, + "grad_norm": 2.614065647125244, + "learning_rate": 4.819973754217002e-06, + "loss": 0.8959, + "step": 56478 + }, + { + "epoch": 0.7060176504412611, + "grad_norm": 4.61769437789917, + "learning_rate": 4.819227294188332e-06, + "loss": 1.4555, + "step": 56480 + }, + { + "epoch": 0.7060426510662766, + "grad_norm": 5.169277667999268, + "learning_rate": 4.818480873615692e-06, + "loss": 1.9324, + "step": 56482 + }, + { + "epoch": 0.7060676516912923, + "grad_norm": 7.398719310760498, + "learning_rate": 4.817734492504772e-06, + "loss": 2.0359, + "step": 56484 + }, + { + "epoch": 0.7060926523163079, + "grad_norm": 5.316622257232666, + "learning_rate": 4.816988150861251e-06, + "loss": 1.5035, + "step": 56486 + }, + { + "epoch": 0.7061176529413236, + "grad_norm": 3.7366788387298584, + "learning_rate": 4.816241848690817e-06, + "loss": 1.2898, + "step": 56488 + }, + { + "epoch": 0.7061426535663392, + "grad_norm": 3.05527663230896, + "learning_rate": 4.815495585999153e-06, + "loss": 2.3388, + "step": 56490 + }, + { + "epoch": 0.7061676541913547, + "grad_norm": 5.6501784324646, + "learning_rate": 4.814749362791938e-06, + "loss": 0.4197, + "step": 56492 + }, + { + "epoch": 0.7061926548163704, + "grad_norm": 4.460643291473389, + "learning_rate": 4.8140031790748605e-06, + "loss": 0.7537, + "step": 56494 + }, + { + "epoch": 0.706217655441386, + "grad_norm": 7.35010290145874, + "learning_rate": 4.813257034853599e-06, + "loss": 1.1285, + "step": 56496 + }, + { + "epoch": 0.7062426560664017, + "grad_norm": 13.191986083984375, + "learning_rate": 4.812510930133843e-06, + "loss": 1.6704, + "step": 56498 + }, + { + "epoch": 0.7062676566914173, + "grad_norm": 3.7822229862213135, + "learning_rate": 4.811764864921264e-06, + "loss": 1.2137, + "step": 56500 + }, + { + "epoch": 0.7062926573164329, + "grad_norm": 2.3627407550811768, + "learning_rate": 4.811018839221556e-06, + "loss": 0.8918, + "step": 56502 + }, + { + "epoch": 0.7063176579414485, + "grad_norm": 3.0825183391571045, + "learning_rate": 4.810272853040394e-06, + "loss": 0.1338, + "step": 56504 + }, + { + "epoch": 0.7063426585664642, + "grad_norm": 1.8284844160079956, + "learning_rate": 4.809526906383457e-06, + "loss": 0.1871, + "step": 56506 + }, + { + "epoch": 0.7063676591914798, + "grad_norm": 2.654933452606201, + "learning_rate": 4.808780999256433e-06, + "loss": 0.7206, + "step": 56508 + }, + { + "epoch": 0.7063926598164955, + "grad_norm": 3.4419260025024414, + "learning_rate": 4.808035131664995e-06, + "loss": 1.117, + "step": 56510 + }, + { + "epoch": 0.706417660441511, + "grad_norm": 4.1803202629089355, + "learning_rate": 4.807289303614829e-06, + "loss": 0.6652, + "step": 56512 + }, + { + "epoch": 0.7064426610665266, + "grad_norm": 1.6630131006240845, + "learning_rate": 4.806543515111611e-06, + "loss": 1.3927, + "step": 56514 + }, + { + "epoch": 0.7064676616915423, + "grad_norm": 0.8164571523666382, + "learning_rate": 4.805797766161026e-06, + "loss": 0.197, + "step": 56516 + }, + { + "epoch": 0.7064926623165579, + "grad_norm": 1.0068899393081665, + "learning_rate": 4.805052056768752e-06, + "loss": 1.3222, + "step": 56518 + }, + { + "epoch": 0.7065176629415736, + "grad_norm": 2.5515313148498535, + "learning_rate": 4.804306386940462e-06, + "loss": 0.9907, + "step": 56520 + }, + { + "epoch": 0.7065426635665891, + "grad_norm": 4.173154830932617, + "learning_rate": 4.803560756681845e-06, + "loss": 1.4117, + "step": 56522 + }, + { + "epoch": 0.7065676641916048, + "grad_norm": 0.023146767169237137, + "learning_rate": 4.80281516599857e-06, + "loss": 0.6057, + "step": 56524 + }, + { + "epoch": 0.7065926648166204, + "grad_norm": 1.584739089012146, + "learning_rate": 4.802069614896325e-06, + "loss": 0.1376, + "step": 56526 + }, + { + "epoch": 0.7066176654416361, + "grad_norm": 4.151222229003906, + "learning_rate": 4.80132410338078e-06, + "loss": 1.9195, + "step": 56528 + }, + { + "epoch": 0.7066426660666517, + "grad_norm": 5.159866809844971, + "learning_rate": 4.800578631457613e-06, + "loss": 1.8968, + "step": 56530 + }, + { + "epoch": 0.7066676666916673, + "grad_norm": 5.8568291664123535, + "learning_rate": 4.799833199132507e-06, + "loss": 1.8711, + "step": 56532 + }, + { + "epoch": 0.7066926673166829, + "grad_norm": 8.057698249816895, + "learning_rate": 4.799087806411133e-06, + "loss": 0.6135, + "step": 56534 + }, + { + "epoch": 0.7067176679416985, + "grad_norm": 0.9566470384597778, + "learning_rate": 4.7983424532991754e-06, + "loss": 0.0725, + "step": 56536 + }, + { + "epoch": 0.7067426685667142, + "grad_norm": 2.9779305458068848, + "learning_rate": 4.797597139802301e-06, + "loss": 0.2442, + "step": 56538 + }, + { + "epoch": 0.7067676691917298, + "grad_norm": 0.3017178475856781, + "learning_rate": 4.796851865926196e-06, + "loss": 0.0498, + "step": 56540 + }, + { + "epoch": 0.7067926698167454, + "grad_norm": 11.118935585021973, + "learning_rate": 4.796106631676529e-06, + "loss": 1.7054, + "step": 56542 + }, + { + "epoch": 0.706817670441761, + "grad_norm": 4.888700485229492, + "learning_rate": 4.795361437058976e-06, + "loss": 1.8611, + "step": 56544 + }, + { + "epoch": 0.7068426710667767, + "grad_norm": 0.005862680729478598, + "learning_rate": 4.794616282079217e-06, + "loss": 0.4236, + "step": 56546 + }, + { + "epoch": 0.7068676716917923, + "grad_norm": 0.0009348897146992385, + "learning_rate": 4.793871166742923e-06, + "loss": 1.0928, + "step": 56548 + }, + { + "epoch": 0.706892672316808, + "grad_norm": 6.015658378601074, + "learning_rate": 4.793126091055772e-06, + "loss": 0.9088, + "step": 56550 + }, + { + "epoch": 0.7069176729418235, + "grad_norm": 4.332516670227051, + "learning_rate": 4.792381055023432e-06, + "loss": 1.8567, + "step": 56552 + }, + { + "epoch": 0.7069426735668392, + "grad_norm": 3.5717923641204834, + "learning_rate": 4.791636058651586e-06, + "loss": 0.7034, + "step": 56554 + }, + { + "epoch": 0.7069676741918548, + "grad_norm": 2.1019840240478516, + "learning_rate": 4.790891101945904e-06, + "loss": 1.3135, + "step": 56556 + }, + { + "epoch": 0.7069926748168704, + "grad_norm": 1.16506826877594, + "learning_rate": 4.790146184912054e-06, + "loss": 0.098, + "step": 56558 + }, + { + "epoch": 0.7070176754418861, + "grad_norm": 2.631427049636841, + "learning_rate": 4.789401307555718e-06, + "loss": 0.2503, + "step": 56560 + }, + { + "epoch": 0.7070426760669016, + "grad_norm": 8.456863403320312, + "learning_rate": 4.78865646988256e-06, + "loss": 0.452, + "step": 56562 + }, + { + "epoch": 0.7070676766919173, + "grad_norm": 5.589136600494385, + "learning_rate": 4.787911671898262e-06, + "loss": 1.8074, + "step": 56564 + }, + { + "epoch": 0.7070926773169329, + "grad_norm": 5.173182964324951, + "learning_rate": 4.7871669136084875e-06, + "loss": 1.057, + "step": 56566 + }, + { + "epoch": 0.7071176779419486, + "grad_norm": 2.3213655948638916, + "learning_rate": 4.786422195018917e-06, + "loss": 0.684, + "step": 56568 + }, + { + "epoch": 0.7071426785669642, + "grad_norm": 3.3287620544433594, + "learning_rate": 4.785677516135218e-06, + "loss": 0.5635, + "step": 56570 + }, + { + "epoch": 0.7071676791919798, + "grad_norm": 1.9721179008483887, + "learning_rate": 4.784932876963058e-06, + "loss": 1.2897, + "step": 56572 + }, + { + "epoch": 0.7071926798169954, + "grad_norm": 3.4822757244110107, + "learning_rate": 4.7841882775081135e-06, + "loss": 1.7842, + "step": 56574 + }, + { + "epoch": 0.707217680442011, + "grad_norm": 4.487489700317383, + "learning_rate": 4.7834437177760515e-06, + "loss": 1.5298, + "step": 56576 + }, + { + "epoch": 0.7072426810670267, + "grad_norm": 5.301706790924072, + "learning_rate": 4.782699197772549e-06, + "loss": 0.2221, + "step": 56578 + }, + { + "epoch": 0.7072676816920423, + "grad_norm": 1.9428149461746216, + "learning_rate": 4.78195471750327e-06, + "loss": 0.2282, + "step": 56580 + }, + { + "epoch": 0.7072926823170579, + "grad_norm": 1.7809995412826538, + "learning_rate": 4.7812102769738826e-06, + "loss": 0.5747, + "step": 56582 + }, + { + "epoch": 0.7073176829420735, + "grad_norm": 3.440998077392578, + "learning_rate": 4.780465876190064e-06, + "loss": 0.8006, + "step": 56584 + }, + { + "epoch": 0.7073426835670892, + "grad_norm": 0.0004859094915445894, + "learning_rate": 4.779721515157475e-06, + "loss": 0.0, + "step": 56586 + }, + { + "epoch": 0.7073676841921048, + "grad_norm": 1.0978882312774658, + "learning_rate": 4.778977193881792e-06, + "loss": 0.4969, + "step": 56588 + }, + { + "epoch": 0.7073926848171205, + "grad_norm": 3.828385591506958, + "learning_rate": 4.778232912368678e-06, + "loss": 1.7569, + "step": 56590 + }, + { + "epoch": 0.707417685442136, + "grad_norm": 0.9509457945823669, + "learning_rate": 4.777488670623807e-06, + "loss": 0.036, + "step": 56592 + }, + { + "epoch": 0.7074426860671517, + "grad_norm": 0.8767444491386414, + "learning_rate": 4.776744468652843e-06, + "loss": 0.6086, + "step": 56594 + }, + { + "epoch": 0.7074676866921673, + "grad_norm": 0.0007467762916348875, + "learning_rate": 4.7760003064614506e-06, + "loss": 0.2642, + "step": 56596 + }, + { + "epoch": 0.707492687317183, + "grad_norm": 5.238777160644531, + "learning_rate": 4.7752561840553055e-06, + "loss": 0.3963, + "step": 56598 + }, + { + "epoch": 0.7075176879421986, + "grad_norm": 4.411881923675537, + "learning_rate": 4.774512101440066e-06, + "loss": 0.8109, + "step": 56600 + }, + { + "epoch": 0.7075426885672141, + "grad_norm": 0.000721139251254499, + "learning_rate": 4.773768058621409e-06, + "loss": 0.123, + "step": 56602 + }, + { + "epoch": 0.7075676891922298, + "grad_norm": 2.179483652114868, + "learning_rate": 4.77302405560499e-06, + "loss": 0.5601, + "step": 56604 + }, + { + "epoch": 0.7075926898172454, + "grad_norm": 4.199672222137451, + "learning_rate": 4.772280092396484e-06, + "loss": 1.0087, + "step": 56606 + }, + { + "epoch": 0.7076176904422611, + "grad_norm": 2.7734971046447754, + "learning_rate": 4.7715361690015545e-06, + "loss": 0.4862, + "step": 56608 + }, + { + "epoch": 0.7076426910672767, + "grad_norm": 2.063854932785034, + "learning_rate": 4.770792285425863e-06, + "loss": 1.3045, + "step": 56610 + }, + { + "epoch": 0.7076676916922923, + "grad_norm": 0.4497281610965729, + "learning_rate": 4.77004844167508e-06, + "loss": 1.6799, + "step": 56612 + }, + { + "epoch": 0.7076926923173079, + "grad_norm": 0.0022201775573194027, + "learning_rate": 4.769304637754866e-06, + "loss": 0.3099, + "step": 56614 + }, + { + "epoch": 0.7077176929423236, + "grad_norm": 3.334698438644409, + "learning_rate": 4.768560873670892e-06, + "loss": 0.4692, + "step": 56616 + }, + { + "epoch": 0.7077426935673392, + "grad_norm": 6.843847751617432, + "learning_rate": 4.7678171494288136e-06, + "loss": 1.1785, + "step": 56618 + }, + { + "epoch": 0.7077676941923549, + "grad_norm": 8.325864791870117, + "learning_rate": 4.767073465034305e-06, + "loss": 1.4891, + "step": 56620 + }, + { + "epoch": 0.7077926948173704, + "grad_norm": 2.7751200199127197, + "learning_rate": 4.7663298204930254e-06, + "loss": 0.8221, + "step": 56622 + }, + { + "epoch": 0.707817695442386, + "grad_norm": 12.301054000854492, + "learning_rate": 4.765586215810633e-06, + "loss": 0.5118, + "step": 56624 + }, + { + "epoch": 0.7078426960674017, + "grad_norm": 0.0022823719773441553, + "learning_rate": 4.764842650992799e-06, + "loss": 0.1156, + "step": 56626 + }, + { + "epoch": 0.7078676966924173, + "grad_norm": 2.722548007965088, + "learning_rate": 4.764099126045179e-06, + "loss": 1.0389, + "step": 56628 + }, + { + "epoch": 0.707892697317433, + "grad_norm": 5.8617048263549805, + "learning_rate": 4.763355640973445e-06, + "loss": 0.7017, + "step": 56630 + }, + { + "epoch": 0.7079176979424485, + "grad_norm": 3.022522211074829, + "learning_rate": 4.7626121957832524e-06, + "loss": 0.4969, + "step": 56632 + }, + { + "epoch": 0.7079426985674642, + "grad_norm": 4.717041492462158, + "learning_rate": 4.761868790480263e-06, + "loss": 1.1546, + "step": 56634 + }, + { + "epoch": 0.7079676991924798, + "grad_norm": 3.428898334503174, + "learning_rate": 4.761125425070142e-06, + "loss": 0.582, + "step": 56636 + }, + { + "epoch": 0.7079926998174955, + "grad_norm": 2.256608009338379, + "learning_rate": 4.760382099558546e-06, + "loss": 0.6291, + "step": 56638 + }, + { + "epoch": 0.7080177004425111, + "grad_norm": 3.202043056488037, + "learning_rate": 4.759638813951143e-06, + "loss": 0.7487, + "step": 56640 + }, + { + "epoch": 0.7080427010675266, + "grad_norm": 3.012651205062866, + "learning_rate": 4.758895568253585e-06, + "loss": 1.2169, + "step": 56642 + }, + { + "epoch": 0.7080677016925423, + "grad_norm": 0.48386651277542114, + "learning_rate": 4.758152362471541e-06, + "loss": 0.6622, + "step": 56644 + }, + { + "epoch": 0.7080927023175579, + "grad_norm": 2.8756394386291504, + "learning_rate": 4.757409196610669e-06, + "loss": 0.7351, + "step": 56646 + }, + { + "epoch": 0.7081177029425736, + "grad_norm": 5.326900482177734, + "learning_rate": 4.756666070676621e-06, + "loss": 1.8329, + "step": 56648 + }, + { + "epoch": 0.7081427035675892, + "grad_norm": 3.57035756111145, + "learning_rate": 4.7559229846750675e-06, + "loss": 0.9066, + "step": 56650 + }, + { + "epoch": 0.7081677041926048, + "grad_norm": 2.990630865097046, + "learning_rate": 4.755179938611661e-06, + "loss": 0.9518, + "step": 56652 + }, + { + "epoch": 0.7081927048176204, + "grad_norm": 2.2048535346984863, + "learning_rate": 4.754436932492064e-06, + "loss": 0.1691, + "step": 56654 + }, + { + "epoch": 0.7082177054426361, + "grad_norm": 1.5592145919799805, + "learning_rate": 4.7536939663219295e-06, + "loss": 1.2903, + "step": 56656 + }, + { + "epoch": 0.7082427060676517, + "grad_norm": 1.9966493844985962, + "learning_rate": 4.752951040106925e-06, + "loss": 0.5637, + "step": 56658 + }, + { + "epoch": 0.7082677066926674, + "grad_norm": 0.060443371534347534, + "learning_rate": 4.752208153852703e-06, + "loss": 0.1575, + "step": 56660 + }, + { + "epoch": 0.7082927073176829, + "grad_norm": 6.986965656280518, + "learning_rate": 4.751465307564917e-06, + "loss": 1.3376, + "step": 56662 + }, + { + "epoch": 0.7083177079426985, + "grad_norm": 5.641208171844482, + "learning_rate": 4.750722501249234e-06, + "loss": 1.2074, + "step": 56664 + }, + { + "epoch": 0.7083427085677142, + "grad_norm": 4.49351167678833, + "learning_rate": 4.749979734911302e-06, + "loss": 0.4494, + "step": 56666 + }, + { + "epoch": 0.7083677091927298, + "grad_norm": 2.6981663703918457, + "learning_rate": 4.749237008556785e-06, + "loss": 1.0944, + "step": 56668 + }, + { + "epoch": 0.7083927098177455, + "grad_norm": 1.2310200929641724, + "learning_rate": 4.748494322191333e-06, + "loss": 0.313, + "step": 56670 + }, + { + "epoch": 0.708417710442761, + "grad_norm": 5.171476364135742, + "learning_rate": 4.7477516758206096e-06, + "loss": 0.8641, + "step": 56672 + }, + { + "epoch": 0.7084427110677767, + "grad_norm": 2.1880154609680176, + "learning_rate": 4.747009069450267e-06, + "loss": 0.5901, + "step": 56674 + }, + { + "epoch": 0.7084677116927923, + "grad_norm": 4.078946113586426, + "learning_rate": 4.746266503085956e-06, + "loss": 1.5553, + "step": 56676 + }, + { + "epoch": 0.708492712317808, + "grad_norm": 5.2121734619140625, + "learning_rate": 4.74552397673334e-06, + "loss": 0.4394, + "step": 56678 + }, + { + "epoch": 0.7085177129428236, + "grad_norm": 3.5691585540771484, + "learning_rate": 4.744781490398067e-06, + "loss": 1.0021, + "step": 56680 + }, + { + "epoch": 0.7085427135678392, + "grad_norm": 6.609676361083984, + "learning_rate": 4.7440390440858e-06, + "loss": 2.0722, + "step": 56682 + }, + { + "epoch": 0.7085677141928548, + "grad_norm": 0.0006730161840096116, + "learning_rate": 4.743296637802186e-06, + "loss": 0.0, + "step": 56684 + }, + { + "epoch": 0.7085927148178705, + "grad_norm": 6.440779209136963, + "learning_rate": 4.7425542715528795e-06, + "loss": 2.4228, + "step": 56686 + }, + { + "epoch": 0.7086177154428861, + "grad_norm": 1.8572038412094116, + "learning_rate": 4.741811945343539e-06, + "loss": 0.4848, + "step": 56688 + }, + { + "epoch": 0.7086427160679017, + "grad_norm": 0.19550175964832306, + "learning_rate": 4.741069659179812e-06, + "loss": 0.2238, + "step": 56690 + }, + { + "epoch": 0.7086677166929173, + "grad_norm": 0.025206737220287323, + "learning_rate": 4.740327413067358e-06, + "loss": 1.8903, + "step": 56692 + }, + { + "epoch": 0.7086927173179329, + "grad_norm": 5.506618499755859, + "learning_rate": 4.7395852070118234e-06, + "loss": 0.8496, + "step": 56694 + }, + { + "epoch": 0.7087177179429486, + "grad_norm": 0.008458146825432777, + "learning_rate": 4.738843041018868e-06, + "loss": 0.1562, + "step": 56696 + }, + { + "epoch": 0.7087427185679642, + "grad_norm": 3.125880718231201, + "learning_rate": 4.738100915094139e-06, + "loss": 1.1079, + "step": 56698 + }, + { + "epoch": 0.7087677191929799, + "grad_norm": 3.1422595977783203, + "learning_rate": 4.737358829243286e-06, + "loss": 1.1486, + "step": 56700 + }, + { + "epoch": 0.7087927198179954, + "grad_norm": 5.953192234039307, + "learning_rate": 4.736616783471968e-06, + "loss": 1.2243, + "step": 56702 + }, + { + "epoch": 0.7088177204430111, + "grad_norm": 6.567090034484863, + "learning_rate": 4.735874777785828e-06, + "loss": 1.5946, + "step": 56704 + }, + { + "epoch": 0.7088427210680267, + "grad_norm": 3.1395726203918457, + "learning_rate": 4.735132812190526e-06, + "loss": 1.141, + "step": 56706 + }, + { + "epoch": 0.7088677216930424, + "grad_norm": 0.003873602719977498, + "learning_rate": 4.734390886691703e-06, + "loss": 0.3567, + "step": 56708 + }, + { + "epoch": 0.708892722318058, + "grad_norm": 0.04310221970081329, + "learning_rate": 4.733649001295019e-06, + "loss": 0.9555, + "step": 56710 + }, + { + "epoch": 0.7089177229430735, + "grad_norm": 2.2701659202575684, + "learning_rate": 4.732907156006119e-06, + "loss": 0.8078, + "step": 56712 + }, + { + "epoch": 0.7089427235680892, + "grad_norm": 6.125415325164795, + "learning_rate": 4.73216535083065e-06, + "loss": 0.9293, + "step": 56714 + }, + { + "epoch": 0.7089677241931048, + "grad_norm": 1.0748368501663208, + "learning_rate": 4.731423585774267e-06, + "loss": 0.0201, + "step": 56716 + }, + { + "epoch": 0.7089927248181205, + "grad_norm": 2.534390687942505, + "learning_rate": 4.730681860842615e-06, + "loss": 0.2425, + "step": 56718 + }, + { + "epoch": 0.7090177254431361, + "grad_norm": 2.5537075996398926, + "learning_rate": 4.729940176041347e-06, + "loss": 1.3673, + "step": 56720 + }, + { + "epoch": 0.7090427260681517, + "grad_norm": 4.0714945793151855, + "learning_rate": 4.729198531376108e-06, + "loss": 2.0652, + "step": 56722 + }, + { + "epoch": 0.7090677266931673, + "grad_norm": 0.0011229758383706212, + "learning_rate": 4.728456926852549e-06, + "loss": 1.133, + "step": 56724 + }, + { + "epoch": 0.709092727318183, + "grad_norm": 2.6074626445770264, + "learning_rate": 4.727715362476318e-06, + "loss": 0.3403, + "step": 56726 + }, + { + "epoch": 0.7091177279431986, + "grad_norm": 1.6039090156555176, + "learning_rate": 4.726973838253057e-06, + "loss": 1.18, + "step": 56728 + }, + { + "epoch": 0.7091427285682143, + "grad_norm": 1.7474197149276733, + "learning_rate": 4.726232354188422e-06, + "loss": 0.7028, + "step": 56730 + }, + { + "epoch": 0.7091677291932298, + "grad_norm": 5.301721572875977, + "learning_rate": 4.725490910288051e-06, + "loss": 1.0582, + "step": 56732 + }, + { + "epoch": 0.7091927298182454, + "grad_norm": 3.647469997406006, + "learning_rate": 4.724749506557595e-06, + "loss": 1.3525, + "step": 56734 + }, + { + "epoch": 0.7092177304432611, + "grad_norm": 1.9150151014328003, + "learning_rate": 4.724008143002705e-06, + "loss": 0.7923, + "step": 56736 + }, + { + "epoch": 0.7092427310682767, + "grad_norm": 1.0498417615890503, + "learning_rate": 4.723266819629023e-06, + "loss": 0.0863, + "step": 56738 + }, + { + "epoch": 0.7092677316932924, + "grad_norm": 2.1125073432922363, + "learning_rate": 4.722525536442194e-06, + "loss": 0.4104, + "step": 56740 + }, + { + "epoch": 0.7092927323183079, + "grad_norm": 0.8265984058380127, + "learning_rate": 4.721784293447862e-06, + "loss": 0.3827, + "step": 56742 + }, + { + "epoch": 0.7093177329433236, + "grad_norm": 7.766053676605225, + "learning_rate": 4.721043090651677e-06, + "loss": 1.1184, + "step": 56744 + }, + { + "epoch": 0.7093427335683392, + "grad_norm": 2.9617388248443604, + "learning_rate": 4.720301928059278e-06, + "loss": 1.0649, + "step": 56746 + }, + { + "epoch": 0.7093677341933549, + "grad_norm": 4.3821001052856445, + "learning_rate": 4.719560805676312e-06, + "loss": 1.4115, + "step": 56748 + }, + { + "epoch": 0.7093927348183705, + "grad_norm": 5.436820983886719, + "learning_rate": 4.7188197235084325e-06, + "loss": 1.5181, + "step": 56750 + }, + { + "epoch": 0.709417735443386, + "grad_norm": 2.0735888481140137, + "learning_rate": 4.718078681561268e-06, + "loss": 1.0529, + "step": 56752 + }, + { + "epoch": 0.7094427360684017, + "grad_norm": 0.004614069126546383, + "learning_rate": 4.717337679840472e-06, + "loss": 0.5287, + "step": 56754 + }, + { + "epoch": 0.7094677366934173, + "grad_norm": 5.7196221351623535, + "learning_rate": 4.716596718351682e-06, + "loss": 0.4583, + "step": 56756 + }, + { + "epoch": 0.709492737318433, + "grad_norm": 3.6681606769561768, + "learning_rate": 4.715855797100545e-06, + "loss": 0.7166, + "step": 56758 + }, + { + "epoch": 0.7095177379434486, + "grad_norm": 4.788825511932373, + "learning_rate": 4.7151149160927055e-06, + "loss": 0.795, + "step": 56760 + }, + { + "epoch": 0.7095427385684642, + "grad_norm": 1.2106716632843018, + "learning_rate": 4.714374075333801e-06, + "loss": 0.6714, + "step": 56762 + }, + { + "epoch": 0.7095677391934798, + "grad_norm": 3.837425947189331, + "learning_rate": 4.713633274829482e-06, + "loss": 1.5313, + "step": 56764 + }, + { + "epoch": 0.7095927398184955, + "grad_norm": 2.1369004249572754, + "learning_rate": 4.712892514585378e-06, + "loss": 0.5664, + "step": 56766 + }, + { + "epoch": 0.7096177404435111, + "grad_norm": 2.7235867977142334, + "learning_rate": 4.71215179460714e-06, + "loss": 0.7235, + "step": 56768 + }, + { + "epoch": 0.7096427410685268, + "grad_norm": 1.732540488243103, + "learning_rate": 4.711411114900403e-06, + "loss": 0.3449, + "step": 56770 + }, + { + "epoch": 0.7096677416935423, + "grad_norm": 3.1141960620880127, + "learning_rate": 4.710670475470811e-06, + "loss": 0.999, + "step": 56772 + }, + { + "epoch": 0.709692742318558, + "grad_norm": 2.9925954341888428, + "learning_rate": 4.709929876324009e-06, + "loss": 0.567, + "step": 56774 + }, + { + "epoch": 0.7097177429435736, + "grad_norm": 3.4350523948669434, + "learning_rate": 4.7091893174656325e-06, + "loss": 0.8284, + "step": 56776 + }, + { + "epoch": 0.7097427435685892, + "grad_norm": 0.0006991393165662885, + "learning_rate": 4.708448798901322e-06, + "loss": 0.0, + "step": 56778 + }, + { + "epoch": 0.7097677441936049, + "grad_norm": 2.290435314178467, + "learning_rate": 4.7077083206367125e-06, + "loss": 1.6821, + "step": 56780 + }, + { + "epoch": 0.7097927448186204, + "grad_norm": 2.6312062740325928, + "learning_rate": 4.706967882677452e-06, + "loss": 1.1516, + "step": 56782 + }, + { + "epoch": 0.7098177454436361, + "grad_norm": 3.8906633853912354, + "learning_rate": 4.706227485029173e-06, + "loss": 1.5341, + "step": 56784 + }, + { + "epoch": 0.7098427460686517, + "grad_norm": 0.7903380990028381, + "learning_rate": 4.705487127697517e-06, + "loss": 0.1992, + "step": 56786 + }, + { + "epoch": 0.7098677466936674, + "grad_norm": 5.149321556091309, + "learning_rate": 4.704746810688125e-06, + "loss": 1.1124, + "step": 56788 + }, + { + "epoch": 0.709892747318683, + "grad_norm": 1.8603777885437012, + "learning_rate": 4.704006534006632e-06, + "loss": 1.0833, + "step": 56790 + }, + { + "epoch": 0.7099177479436986, + "grad_norm": 5.733494281768799, + "learning_rate": 4.703266297658678e-06, + "loss": 1.1123, + "step": 56792 + }, + { + "epoch": 0.7099427485687142, + "grad_norm": 0.002663195366039872, + "learning_rate": 4.702526101649894e-06, + "loss": 0.3856, + "step": 56794 + }, + { + "epoch": 0.7099677491937298, + "grad_norm": 4.391610145568848, + "learning_rate": 4.7017859459859215e-06, + "loss": 1.2967, + "step": 56796 + }, + { + "epoch": 0.7099927498187455, + "grad_norm": 3.9860408306121826, + "learning_rate": 4.701045830672403e-06, + "loss": 0.9685, + "step": 56798 + }, + { + "epoch": 0.7100177504437611, + "grad_norm": 3.316265821456909, + "learning_rate": 4.700305755714966e-06, + "loss": 1.2577, + "step": 56800 + }, + { + "epoch": 0.7100427510687767, + "grad_norm": 3.708383321762085, + "learning_rate": 4.699565721119258e-06, + "loss": 0.7615, + "step": 56802 + }, + { + "epoch": 0.7100677516937923, + "grad_norm": 3.977198362350464, + "learning_rate": 4.698825726890901e-06, + "loss": 1.3426, + "step": 56804 + }, + { + "epoch": 0.710092752318808, + "grad_norm": 2.2560765743255615, + "learning_rate": 4.698085773035541e-06, + "loss": 0.8865, + "step": 56806 + }, + { + "epoch": 0.7101177529438236, + "grad_norm": 3.2163379192352295, + "learning_rate": 4.697345859558806e-06, + "loss": 2.1388, + "step": 56808 + }, + { + "epoch": 0.7101427535688393, + "grad_norm": 4.612335205078125, + "learning_rate": 4.696605986466336e-06, + "loss": 1.1895, + "step": 56810 + }, + { + "epoch": 0.7101677541938548, + "grad_norm": 5.468186855316162, + "learning_rate": 4.695866153763768e-06, + "loss": 3.4912, + "step": 56812 + }, + { + "epoch": 0.7101927548188705, + "grad_norm": 3.096165895462036, + "learning_rate": 4.695126361456729e-06, + "loss": 1.5438, + "step": 56814 + }, + { + "epoch": 0.7102177554438861, + "grad_norm": 3.2558584213256836, + "learning_rate": 4.6943866095508665e-06, + "loss": 0.6148, + "step": 56816 + }, + { + "epoch": 0.7102427560689017, + "grad_norm": 4.450795650482178, + "learning_rate": 4.693646898051796e-06, + "loss": 0.9881, + "step": 56818 + }, + { + "epoch": 0.7102677566939174, + "grad_norm": 2.0075953006744385, + "learning_rate": 4.692907226965161e-06, + "loss": 0.9602, + "step": 56820 + }, + { + "epoch": 0.7102927573189329, + "grad_norm": 3.2678961753845215, + "learning_rate": 4.692167596296598e-06, + "loss": 1.2044, + "step": 56822 + }, + { + "epoch": 0.7103177579439486, + "grad_norm": 3.216996669769287, + "learning_rate": 4.691428006051732e-06, + "loss": 0.8498, + "step": 56824 + }, + { + "epoch": 0.7103427585689642, + "grad_norm": 0.002325172768905759, + "learning_rate": 4.690688456236204e-06, + "loss": 0.5437, + "step": 56826 + }, + { + "epoch": 0.7103677591939799, + "grad_norm": 1.2935742139816284, + "learning_rate": 4.689948946855641e-06, + "loss": 0.7495, + "step": 56828 + }, + { + "epoch": 0.7103927598189955, + "grad_norm": 4.900768280029297, + "learning_rate": 4.689209477915677e-06, + "loss": 1.547, + "step": 56830 + }, + { + "epoch": 0.7104177604440111, + "grad_norm": 3.8781678676605225, + "learning_rate": 4.688470049421938e-06, + "loss": 1.3728, + "step": 56832 + }, + { + "epoch": 0.7104427610690267, + "grad_norm": 0.002499382011592388, + "learning_rate": 4.68773066138006e-06, + "loss": 0.6643, + "step": 56834 + }, + { + "epoch": 0.7104677616940424, + "grad_norm": 7.933950424194336, + "learning_rate": 4.686991313795678e-06, + "loss": 0.3837, + "step": 56836 + }, + { + "epoch": 0.710492762319058, + "grad_norm": 2.988715887069702, + "learning_rate": 4.686252006674415e-06, + "loss": 0.2229, + "step": 56838 + }, + { + "epoch": 0.7105177629440736, + "grad_norm": 6.22080135345459, + "learning_rate": 4.685512740021909e-06, + "loss": 0.7803, + "step": 56840 + }, + { + "epoch": 0.7105427635690892, + "grad_norm": 0.000740277988370508, + "learning_rate": 4.684773513843787e-06, + "loss": 0.3447, + "step": 56842 + }, + { + "epoch": 0.7105677641941048, + "grad_norm": 0.10079462081193924, + "learning_rate": 4.684034328145678e-06, + "loss": 0.0989, + "step": 56844 + }, + { + "epoch": 0.7105927648191205, + "grad_norm": 4.069825649261475, + "learning_rate": 4.683295182933208e-06, + "loss": 1.1199, + "step": 56846 + }, + { + "epoch": 0.7106177654441361, + "grad_norm": 3.9969069957733154, + "learning_rate": 4.68255607821201e-06, + "loss": 0.8542, + "step": 56848 + }, + { + "epoch": 0.7106427660691518, + "grad_norm": 4.645850658416748, + "learning_rate": 4.681817013987716e-06, + "loss": 1.0336, + "step": 56850 + }, + { + "epoch": 0.7106677666941673, + "grad_norm": 3.0964064598083496, + "learning_rate": 4.681077990265949e-06, + "loss": 1.3071, + "step": 56852 + }, + { + "epoch": 0.710692767319183, + "grad_norm": 4.1822285652160645, + "learning_rate": 4.680339007052347e-06, + "loss": 0.732, + "step": 56854 + }, + { + "epoch": 0.7107177679441986, + "grad_norm": 3.9587020874023438, + "learning_rate": 4.679600064352523e-06, + "loss": 0.9233, + "step": 56856 + }, + { + "epoch": 0.7107427685692143, + "grad_norm": 0.5195274949073792, + "learning_rate": 4.678861162172113e-06, + "loss": 0.0109, + "step": 56858 + }, + { + "epoch": 0.7107677691942299, + "grad_norm": 6.061429500579834, + "learning_rate": 4.678122300516748e-06, + "loss": 0.4731, + "step": 56860 + }, + { + "epoch": 0.7107927698192454, + "grad_norm": 4.531704425811768, + "learning_rate": 4.677383479392046e-06, + "loss": 0.8117, + "step": 56862 + }, + { + "epoch": 0.7108177704442611, + "grad_norm": 2.9164063930511475, + "learning_rate": 4.676644698803643e-06, + "loss": 0.7207, + "step": 56864 + }, + { + "epoch": 0.7108427710692767, + "grad_norm": 5.483366012573242, + "learning_rate": 4.675905958757158e-06, + "loss": 0.6474, + "step": 56866 + }, + { + "epoch": 0.7108677716942924, + "grad_norm": 2.502523422241211, + "learning_rate": 4.675167259258226e-06, + "loss": 0.5048, + "step": 56868 + }, + { + "epoch": 0.710892772319308, + "grad_norm": 2.6524300575256348, + "learning_rate": 4.674428600312462e-06, + "loss": 0.5827, + "step": 56870 + }, + { + "epoch": 0.7109177729443236, + "grad_norm": 4.614184856414795, + "learning_rate": 4.673689981925495e-06, + "loss": 1.3541, + "step": 56872 + }, + { + "epoch": 0.7109427735693392, + "grad_norm": 2.3916380405426025, + "learning_rate": 4.672951404102956e-06, + "loss": 0.7395, + "step": 56874 + }, + { + "epoch": 0.7109677741943549, + "grad_norm": 0.05415380001068115, + "learning_rate": 4.672212866850463e-06, + "loss": 0.8611, + "step": 56876 + }, + { + "epoch": 0.7109927748193705, + "grad_norm": 3.1122565269470215, + "learning_rate": 4.671474370173646e-06, + "loss": 0.8026, + "step": 56878 + }, + { + "epoch": 0.7110177754443862, + "grad_norm": 2.872129201889038, + "learning_rate": 4.670735914078127e-06, + "loss": 1.0351, + "step": 56880 + }, + { + "epoch": 0.7110427760694017, + "grad_norm": 0.0016792056849226356, + "learning_rate": 4.6699974985695265e-06, + "loss": 1.1856, + "step": 56882 + }, + { + "epoch": 0.7110677766944173, + "grad_norm": 3.037853240966797, + "learning_rate": 4.6692591236534745e-06, + "loss": 1.8514, + "step": 56884 + }, + { + "epoch": 0.711092777319433, + "grad_norm": 8.766682624816895, + "learning_rate": 4.668520789335588e-06, + "loss": 0.5591, + "step": 56886 + }, + { + "epoch": 0.7111177779444486, + "grad_norm": 4.086602210998535, + "learning_rate": 4.667782495621497e-06, + "loss": 1.6736, + "step": 56888 + }, + { + "epoch": 0.7111427785694643, + "grad_norm": 0.061481792479753494, + "learning_rate": 4.667044242516816e-06, + "loss": 0.002, + "step": 56890 + }, + { + "epoch": 0.7111677791944798, + "grad_norm": 1.988216519355774, + "learning_rate": 4.666306030027176e-06, + "loss": 0.4889, + "step": 56892 + }, + { + "epoch": 0.7111927798194955, + "grad_norm": 8.88945484161377, + "learning_rate": 4.665567858158194e-06, + "loss": 1.3246, + "step": 56894 + }, + { + "epoch": 0.7112177804445111, + "grad_norm": 2.6456639766693115, + "learning_rate": 4.66482972691549e-06, + "loss": 0.609, + "step": 56896 + }, + { + "epoch": 0.7112427810695268, + "grad_norm": 4.848476886749268, + "learning_rate": 4.664091636304693e-06, + "loss": 0.9846, + "step": 56898 + }, + { + "epoch": 0.7112677816945424, + "grad_norm": 7.948532581329346, + "learning_rate": 4.663353586331415e-06, + "loss": 2.4826, + "step": 56900 + }, + { + "epoch": 0.711292782319558, + "grad_norm": 2.2707314491271973, + "learning_rate": 4.662615577001285e-06, + "loss": 1.0815, + "step": 56902 + }, + { + "epoch": 0.7113177829445736, + "grad_norm": 2.6308412551879883, + "learning_rate": 4.661877608319916e-06, + "loss": 0.4896, + "step": 56904 + }, + { + "epoch": 0.7113427835695892, + "grad_norm": 2.254603862762451, + "learning_rate": 4.66113968029294e-06, + "loss": 1.0127, + "step": 56906 + }, + { + "epoch": 0.7113677841946049, + "grad_norm": 1.1921566724777222, + "learning_rate": 4.660401792925961e-06, + "loss": 0.8111, + "step": 56908 + }, + { + "epoch": 0.7113927848196205, + "grad_norm": 3.2168219089508057, + "learning_rate": 4.6596639462246085e-06, + "loss": 0.0965, + "step": 56910 + }, + { + "epoch": 0.7114177854446361, + "grad_norm": 4.051665306091309, + "learning_rate": 4.658926140194503e-06, + "loss": 1.0482, + "step": 56912 + }, + { + "epoch": 0.7114427860696517, + "grad_norm": 2.582740545272827, + "learning_rate": 4.658188374841257e-06, + "loss": 0.1732, + "step": 56914 + }, + { + "epoch": 0.7114677866946674, + "grad_norm": 3.422896385192871, + "learning_rate": 4.657450650170498e-06, + "loss": 1.5626, + "step": 56916 + }, + { + "epoch": 0.711492787319683, + "grad_norm": 2.6479060649871826, + "learning_rate": 4.6567129661878326e-06, + "loss": 1.3267, + "step": 56918 + }, + { + "epoch": 0.7115177879446987, + "grad_norm": 0.7800050973892212, + "learning_rate": 4.6559753228988926e-06, + "loss": 0.0428, + "step": 56920 + }, + { + "epoch": 0.7115427885697142, + "grad_norm": 3.132577419281006, + "learning_rate": 4.655237720309287e-06, + "loss": 0.9193, + "step": 56922 + }, + { + "epoch": 0.7115677891947298, + "grad_norm": 4.3766913414001465, + "learning_rate": 4.654500158424632e-06, + "loss": 0.7817, + "step": 56924 + }, + { + "epoch": 0.7115927898197455, + "grad_norm": 0.0007569340523332357, + "learning_rate": 4.653762637250551e-06, + "loss": 0.3079, + "step": 56926 + }, + { + "epoch": 0.7116177904447611, + "grad_norm": 4.234554767608643, + "learning_rate": 4.653025156792655e-06, + "loss": 1.4396, + "step": 56928 + }, + { + "epoch": 0.7116427910697768, + "grad_norm": 3.866610527038574, + "learning_rate": 4.6522877170565654e-06, + "loss": 0.254, + "step": 56930 + }, + { + "epoch": 0.7116677916947923, + "grad_norm": 3.6813013553619385, + "learning_rate": 4.651550318047897e-06, + "loss": 1.2887, + "step": 56932 + }, + { + "epoch": 0.711692792319808, + "grad_norm": 2.9848103523254395, + "learning_rate": 4.650812959772262e-06, + "loss": 1.3341, + "step": 56934 + }, + { + "epoch": 0.7117177929448236, + "grad_norm": 0.0006432258524000645, + "learning_rate": 4.650075642235282e-06, + "loss": 0.3003, + "step": 56936 + }, + { + "epoch": 0.7117427935698393, + "grad_norm": 4.390383720397949, + "learning_rate": 4.649338365442566e-06, + "loss": 0.9408, + "step": 56938 + }, + { + "epoch": 0.7117677941948549, + "grad_norm": 2.575896739959717, + "learning_rate": 4.648601129399736e-06, + "loss": 0.4812, + "step": 56940 + }, + { + "epoch": 0.7117927948198705, + "grad_norm": 0.0005635535344481468, + "learning_rate": 4.6478639341124e-06, + "loss": 0.5375, + "step": 56942 + }, + { + "epoch": 0.7118177954448861, + "grad_norm": 1.0128064155578613, + "learning_rate": 4.647126779586177e-06, + "loss": 0.0446, + "step": 56944 + }, + { + "epoch": 0.7118427960699018, + "grad_norm": 0.67889004945755, + "learning_rate": 4.6463896658266815e-06, + "loss": 0.012, + "step": 56946 + }, + { + "epoch": 0.7118677966949174, + "grad_norm": 1.97588050365448, + "learning_rate": 4.645652592839522e-06, + "loss": 1.0345, + "step": 56948 + }, + { + "epoch": 0.711892797319933, + "grad_norm": 5.362234115600586, + "learning_rate": 4.644915560630318e-06, + "loss": 2.1919, + "step": 56950 + }, + { + "epoch": 0.7119177979449486, + "grad_norm": 6.984341621398926, + "learning_rate": 4.6441785692046745e-06, + "loss": 1.7029, + "step": 56952 + }, + { + "epoch": 0.7119427985699642, + "grad_norm": 5.322007179260254, + "learning_rate": 4.643441618568215e-06, + "loss": 1.1854, + "step": 56954 + }, + { + "epoch": 0.7119677991949799, + "grad_norm": 4.371517181396484, + "learning_rate": 4.6427047087265435e-06, + "loss": 0.2911, + "step": 56956 + }, + { + "epoch": 0.7119927998199955, + "grad_norm": 5.624016284942627, + "learning_rate": 4.641967839685278e-06, + "loss": 0.7873, + "step": 56958 + }, + { + "epoch": 0.7120178004450112, + "grad_norm": 0.0015450895298272371, + "learning_rate": 4.641231011450028e-06, + "loss": 0.4532, + "step": 56960 + }, + { + "epoch": 0.7120428010700267, + "grad_norm": 8.580269813537598, + "learning_rate": 4.640494224026402e-06, + "loss": 1.1521, + "step": 56962 + }, + { + "epoch": 0.7120678016950424, + "grad_norm": 0.0005582838784903288, + "learning_rate": 4.639757477420016e-06, + "loss": 0.1592, + "step": 56964 + }, + { + "epoch": 0.712092802320058, + "grad_norm": 0.000858115148730576, + "learning_rate": 4.639020771636477e-06, + "loss": 0.5194, + "step": 56966 + }, + { + "epoch": 0.7121178029450737, + "grad_norm": 4.181730270385742, + "learning_rate": 4.6382841066814014e-06, + "loss": 0.5218, + "step": 56968 + }, + { + "epoch": 0.7121428035700893, + "grad_norm": 2.725738048553467, + "learning_rate": 4.63754748256039e-06, + "loss": 1.7647, + "step": 56970 + }, + { + "epoch": 0.7121678041951048, + "grad_norm": 8.341617584228516, + "learning_rate": 4.6368108992790635e-06, + "loss": 1.3912, + "step": 56972 + }, + { + "epoch": 0.7121928048201205, + "grad_norm": 1.3176957368850708, + "learning_rate": 4.6360743568430274e-06, + "loss": 0.7258, + "step": 56974 + }, + { + "epoch": 0.7122178054451361, + "grad_norm": 2.46415114402771, + "learning_rate": 4.635337855257886e-06, + "loss": 0.8185, + "step": 56976 + }, + { + "epoch": 0.7122428060701518, + "grad_norm": 2.1740541458129883, + "learning_rate": 4.634601394529258e-06, + "loss": 2.2129, + "step": 56978 + }, + { + "epoch": 0.7122678066951674, + "grad_norm": 1.3624136447906494, + "learning_rate": 4.633864974662743e-06, + "loss": 0.8196, + "step": 56980 + }, + { + "epoch": 0.712292807320183, + "grad_norm": 2.6251840591430664, + "learning_rate": 4.633128595663956e-06, + "loss": 1.2045, + "step": 56982 + }, + { + "epoch": 0.7123178079451986, + "grad_norm": 0.37842467427253723, + "learning_rate": 4.632392257538503e-06, + "loss": 0.3401, + "step": 56984 + }, + { + "epoch": 0.7123428085702143, + "grad_norm": 3.233342170715332, + "learning_rate": 4.631655960291989e-06, + "loss": 1.2372, + "step": 56986 + }, + { + "epoch": 0.7123678091952299, + "grad_norm": 2.1046369075775146, + "learning_rate": 4.630919703930026e-06, + "loss": 0.7907, + "step": 56988 + }, + { + "epoch": 0.7123928098202456, + "grad_norm": 1.7522107362747192, + "learning_rate": 4.630183488458216e-06, + "loss": 0.8357, + "step": 56990 + }, + { + "epoch": 0.7124178104452611, + "grad_norm": 3.4388909339904785, + "learning_rate": 4.629447313882174e-06, + "loss": 1.4422, + "step": 56992 + }, + { + "epoch": 0.7124428110702767, + "grad_norm": 3.878793239593506, + "learning_rate": 4.6287111802074965e-06, + "loss": 2.0517, + "step": 56994 + }, + { + "epoch": 0.7124678116952924, + "grad_norm": 4.887357234954834, + "learning_rate": 4.6279750874398e-06, + "loss": 1.0623, + "step": 56996 + }, + { + "epoch": 0.712492812320308, + "grad_norm": 3.9476003646850586, + "learning_rate": 4.6272390355846854e-06, + "loss": 1.6183, + "step": 56998 + }, + { + "epoch": 0.7125178129453237, + "grad_norm": 4.529545307159424, + "learning_rate": 4.6265030246477545e-06, + "loss": 0.9203, + "step": 57000 + }, + { + "epoch": 0.7125428135703392, + "grad_norm": 2.3196933269500732, + "learning_rate": 4.62576705463462e-06, + "loss": 1.2876, + "step": 57002 + }, + { + "epoch": 0.7125678141953549, + "grad_norm": 4.266037940979004, + "learning_rate": 4.6250311255508805e-06, + "loss": 1.8, + "step": 57004 + }, + { + "epoch": 0.7125928148203705, + "grad_norm": 0.5068632364273071, + "learning_rate": 4.624295237402148e-06, + "loss": 0.6122, + "step": 57006 + }, + { + "epoch": 0.7126178154453862, + "grad_norm": 7.501426696777344, + "learning_rate": 4.623559390194019e-06, + "loss": 0.9206, + "step": 57008 + }, + { + "epoch": 0.7126428160704018, + "grad_norm": 3.7112832069396973, + "learning_rate": 4.622823583932104e-06, + "loss": 1.7105, + "step": 57010 + }, + { + "epoch": 0.7126678166954173, + "grad_norm": 2.674475908279419, + "learning_rate": 4.622087818622005e-06, + "loss": 0.1766, + "step": 57012 + }, + { + "epoch": 0.712692817320433, + "grad_norm": 0.3914961516857147, + "learning_rate": 4.621352094269321e-06, + "loss": 0.011, + "step": 57014 + }, + { + "epoch": 0.7127178179454486, + "grad_norm": 14.41478157043457, + "learning_rate": 4.620616410879663e-06, + "loss": 2.3848, + "step": 57016 + }, + { + "epoch": 0.7127428185704643, + "grad_norm": 1.8297041654586792, + "learning_rate": 4.6198807684586244e-06, + "loss": 1.6957, + "step": 57018 + }, + { + "epoch": 0.7127678191954799, + "grad_norm": 5.359989643096924, + "learning_rate": 4.619145167011818e-06, + "loss": 0.9085, + "step": 57020 + }, + { + "epoch": 0.7127928198204955, + "grad_norm": 5.395732879638672, + "learning_rate": 4.618409606544837e-06, + "loss": 0.7136, + "step": 57022 + }, + { + "epoch": 0.7128178204455111, + "grad_norm": 0.003865635022521019, + "learning_rate": 4.617674087063291e-06, + "loss": 0.5339, + "step": 57024 + }, + { + "epoch": 0.7128428210705268, + "grad_norm": 3.48382306098938, + "learning_rate": 4.616938608572778e-06, + "loss": 1.3769, + "step": 57026 + }, + { + "epoch": 0.7128678216955424, + "grad_norm": 3.548647880554199, + "learning_rate": 4.616203171078897e-06, + "loss": 0.4223, + "step": 57028 + }, + { + "epoch": 0.7128928223205581, + "grad_norm": 2.5661749839782715, + "learning_rate": 4.615467774587253e-06, + "loss": 0.4922, + "step": 57030 + }, + { + "epoch": 0.7129178229455736, + "grad_norm": 3.19567608833313, + "learning_rate": 4.614732419103442e-06, + "loss": 0.6804, + "step": 57032 + }, + { + "epoch": 0.7129428235705892, + "grad_norm": 0.4023374319076538, + "learning_rate": 4.6139971046330714e-06, + "loss": 0.6193, + "step": 57034 + }, + { + "epoch": 0.7129678241956049, + "grad_norm": 2.162166118621826, + "learning_rate": 4.613261831181733e-06, + "loss": 0.4155, + "step": 57036 + }, + { + "epoch": 0.7129928248206205, + "grad_norm": 6.496057510375977, + "learning_rate": 4.612526598755035e-06, + "loss": 2.0997, + "step": 57038 + }, + { + "epoch": 0.7130178254456362, + "grad_norm": 3.5233068466186523, + "learning_rate": 4.611791407358572e-06, + "loss": 1.0977, + "step": 57040 + }, + { + "epoch": 0.7130428260706517, + "grad_norm": 1.7897145748138428, + "learning_rate": 4.6110562569979415e-06, + "loss": 0.0966, + "step": 57042 + }, + { + "epoch": 0.7130678266956674, + "grad_norm": 4.254757881164551, + "learning_rate": 4.6103211476787464e-06, + "loss": 1.3085, + "step": 57044 + }, + { + "epoch": 0.713092827320683, + "grad_norm": 2.2169461250305176, + "learning_rate": 4.609586079406581e-06, + "loss": 0.4639, + "step": 57046 + }, + { + "epoch": 0.7131178279456987, + "grad_norm": 0.0006222593365237117, + "learning_rate": 4.608851052187048e-06, + "loss": 0.6481, + "step": 57048 + }, + { + "epoch": 0.7131428285707143, + "grad_norm": 3.256122589111328, + "learning_rate": 4.608116066025743e-06, + "loss": 1.2247, + "step": 57050 + }, + { + "epoch": 0.7131678291957299, + "grad_norm": 3.5111289024353027, + "learning_rate": 4.60738112092826e-06, + "loss": 0.4767, + "step": 57052 + }, + { + "epoch": 0.7131928298207455, + "grad_norm": 9.127365112304688, + "learning_rate": 4.606646216900205e-06, + "loss": 1.3531, + "step": 57054 + }, + { + "epoch": 0.7132178304457611, + "grad_norm": 2.7358205318450928, + "learning_rate": 4.605911353947164e-06, + "loss": 0.2446, + "step": 57056 + }, + { + "epoch": 0.7132428310707768, + "grad_norm": 6.585366725921631, + "learning_rate": 4.605176532074744e-06, + "loss": 0.9144, + "step": 57058 + }, + { + "epoch": 0.7132678316957924, + "grad_norm": 0.010360220447182655, + "learning_rate": 4.604441751288533e-06, + "loss": 0.0028, + "step": 57060 + }, + { + "epoch": 0.713292832320808, + "grad_norm": 6.309264659881592, + "learning_rate": 4.603707011594135e-06, + "loss": 1.3528, + "step": 57062 + }, + { + "epoch": 0.7133178329458236, + "grad_norm": 0.09801576286554337, + "learning_rate": 4.60297231299714e-06, + "loss": 0.795, + "step": 57064 + }, + { + "epoch": 0.7133428335708393, + "grad_norm": 0.9785887002944946, + "learning_rate": 4.602237655503142e-06, + "loss": 0.5852, + "step": 57066 + }, + { + "epoch": 0.7133678341958549, + "grad_norm": 0.005792148411273956, + "learning_rate": 4.601503039117741e-06, + "loss": 0.6321, + "step": 57068 + }, + { + "epoch": 0.7133928348208706, + "grad_norm": 1.9872353076934814, + "learning_rate": 4.600768463846528e-06, + "loss": 0.5669, + "step": 57070 + }, + { + "epoch": 0.7134178354458861, + "grad_norm": 3.2141435146331787, + "learning_rate": 4.6000339296951e-06, + "loss": 1.3775, + "step": 57072 + }, + { + "epoch": 0.7134428360709018, + "grad_norm": 0.19541728496551514, + "learning_rate": 4.599299436669048e-06, + "loss": 0.5183, + "step": 57074 + }, + { + "epoch": 0.7134678366959174, + "grad_norm": 4.461981296539307, + "learning_rate": 4.598564984773971e-06, + "loss": 1.2695, + "step": 57076 + }, + { + "epoch": 0.713492837320933, + "grad_norm": 5.402286529541016, + "learning_rate": 4.59783057401546e-06, + "loss": 0.8017, + "step": 57078 + }, + { + "epoch": 0.7135178379459487, + "grad_norm": 0.0009536254219710827, + "learning_rate": 4.597096204399102e-06, + "loss": 0.7008, + "step": 57080 + }, + { + "epoch": 0.7135428385709642, + "grad_norm": 3.975322723388672, + "learning_rate": 4.596361875930501e-06, + "loss": 2.2081, + "step": 57082 + }, + { + "epoch": 0.7135678391959799, + "grad_norm": 3.9166629314422607, + "learning_rate": 4.595627588615238e-06, + "loss": 0.6937, + "step": 57084 + }, + { + "epoch": 0.7135928398209955, + "grad_norm": 2.5889627933502197, + "learning_rate": 4.594893342458916e-06, + "loss": 0.5493, + "step": 57086 + }, + { + "epoch": 0.7136178404460112, + "grad_norm": 1.9378609657287598, + "learning_rate": 4.594159137467118e-06, + "loss": 0.2456, + "step": 57088 + }, + { + "epoch": 0.7136428410710268, + "grad_norm": 0.12685029208660126, + "learning_rate": 4.593424973645443e-06, + "loss": 1.1751, + "step": 57090 + }, + { + "epoch": 0.7136678416960424, + "grad_norm": 1.8663979768753052, + "learning_rate": 4.592690850999479e-06, + "loss": 0.4617, + "step": 57092 + }, + { + "epoch": 0.713692842321058, + "grad_norm": 3.5009076595306396, + "learning_rate": 4.591956769534814e-06, + "loss": 0.8074, + "step": 57094 + }, + { + "epoch": 0.7137178429460737, + "grad_norm": 3.9918065071105957, + "learning_rate": 4.591222729257043e-06, + "loss": 0.3254, + "step": 57096 + }, + { + "epoch": 0.7137428435710893, + "grad_norm": 3.8702175617218018, + "learning_rate": 4.590488730171753e-06, + "loss": 0.5942, + "step": 57098 + }, + { + "epoch": 0.713767844196105, + "grad_norm": 0.7502560019493103, + "learning_rate": 4.589754772284539e-06, + "loss": 0.115, + "step": 57100 + }, + { + "epoch": 0.7137928448211205, + "grad_norm": 4.807784080505371, + "learning_rate": 4.589020855600987e-06, + "loss": 0.4807, + "step": 57102 + }, + { + "epoch": 0.7138178454461361, + "grad_norm": 0.0016813291003927588, + "learning_rate": 4.588286980126685e-06, + "loss": 0.0672, + "step": 57104 + }, + { + "epoch": 0.7138428460711518, + "grad_norm": 1.6867115497589111, + "learning_rate": 4.587553145867226e-06, + "loss": 0.6161, + "step": 57106 + }, + { + "epoch": 0.7138678466961674, + "grad_norm": 6.319526672363281, + "learning_rate": 4.586819352828195e-06, + "loss": 3.081, + "step": 57108 + }, + { + "epoch": 0.7138928473211831, + "grad_norm": 2.7371814250946045, + "learning_rate": 4.5860856010151845e-06, + "loss": 1.1487, + "step": 57110 + }, + { + "epoch": 0.7139178479461986, + "grad_norm": 4.866673946380615, + "learning_rate": 4.5853518904337765e-06, + "loss": 2.1064, + "step": 57112 + }, + { + "epoch": 0.7139428485712143, + "grad_norm": 5.510365009307861, + "learning_rate": 4.584618221089564e-06, + "loss": 1.4193, + "step": 57114 + }, + { + "epoch": 0.7139678491962299, + "grad_norm": 2.5805225372314453, + "learning_rate": 4.58388459298814e-06, + "loss": 1.4239, + "step": 57116 + }, + { + "epoch": 0.7139928498212456, + "grad_norm": 3.1729447841644287, + "learning_rate": 4.583151006135077e-06, + "loss": 0.462, + "step": 57118 + }, + { + "epoch": 0.7140178504462612, + "grad_norm": 4.913583755493164, + "learning_rate": 4.582417460535975e-06, + "loss": 1.5965, + "step": 57120 + }, + { + "epoch": 0.7140428510712767, + "grad_norm": 5.904716968536377, + "learning_rate": 4.581683956196411e-06, + "loss": 1.194, + "step": 57122 + }, + { + "epoch": 0.7140678516962924, + "grad_norm": 3.555328369140625, + "learning_rate": 4.580950493121979e-06, + "loss": 0.6507, + "step": 57124 + }, + { + "epoch": 0.714092852321308, + "grad_norm": 5.775871276855469, + "learning_rate": 4.5802170713182595e-06, + "loss": 1.8998, + "step": 57126 + }, + { + "epoch": 0.7141178529463237, + "grad_norm": 4.047651767730713, + "learning_rate": 4.579483690790843e-06, + "loss": 0.517, + "step": 57128 + }, + { + "epoch": 0.7141428535713393, + "grad_norm": 3.9859442710876465, + "learning_rate": 4.578750351545312e-06, + "loss": 1.5244, + "step": 57130 + }, + { + "epoch": 0.7141678541963549, + "grad_norm": 2.3226168155670166, + "learning_rate": 4.57801705358725e-06, + "loss": 1.4726, + "step": 57132 + }, + { + "epoch": 0.7141928548213705, + "grad_norm": 1.0918376445770264, + "learning_rate": 4.577283796922244e-06, + "loss": 1.0733, + "step": 57134 + }, + { + "epoch": 0.7142178554463862, + "grad_norm": 3.5355944633483887, + "learning_rate": 4.576550581555876e-06, + "loss": 1.0257, + "step": 57136 + }, + { + "epoch": 0.7142428560714018, + "grad_norm": 3.0017316341400146, + "learning_rate": 4.575817407493731e-06, + "loss": 1.354, + "step": 57138 + }, + { + "epoch": 0.7142678566964175, + "grad_norm": 3.4900667667388916, + "learning_rate": 4.575084274741399e-06, + "loss": 1.9825, + "step": 57140 + }, + { + "epoch": 0.714292857321433, + "grad_norm": 2.425126314163208, + "learning_rate": 4.574351183304455e-06, + "loss": 0.3323, + "step": 57142 + }, + { + "epoch": 0.7143178579464486, + "grad_norm": 0.3675045371055603, + "learning_rate": 4.573618133188488e-06, + "loss": 0.741, + "step": 57144 + }, + { + "epoch": 0.7143428585714643, + "grad_norm": 0.9194542765617371, + "learning_rate": 4.5728851243990725e-06, + "loss": 0.0834, + "step": 57146 + }, + { + "epoch": 0.7143678591964799, + "grad_norm": 3.5629937648773193, + "learning_rate": 4.5721521569418e-06, + "loss": 0.5598, + "step": 57148 + }, + { + "epoch": 0.7143928598214956, + "grad_norm": 1.6075270175933838, + "learning_rate": 4.5714192308222476e-06, + "loss": 0.2531, + "step": 57150 + }, + { + "epoch": 0.7144178604465111, + "grad_norm": 1.962179183959961, + "learning_rate": 4.570686346045996e-06, + "loss": 0.1748, + "step": 57152 + }, + { + "epoch": 0.7144428610715268, + "grad_norm": 5.507516384124756, + "learning_rate": 4.5699535026186385e-06, + "loss": 1.2575, + "step": 57154 + }, + { + "epoch": 0.7144678616965424, + "grad_norm": 7.033752918243408, + "learning_rate": 4.569220700545739e-06, + "loss": 0.7922, + "step": 57156 + }, + { + "epoch": 0.7144928623215581, + "grad_norm": 0.08983631432056427, + "learning_rate": 4.568487939832891e-06, + "loss": 0.7007, + "step": 57158 + }, + { + "epoch": 0.7145178629465737, + "grad_norm": 3.0431079864501953, + "learning_rate": 4.567755220485667e-06, + "loss": 0.4843, + "step": 57160 + }, + { + "epoch": 0.7145428635715892, + "grad_norm": 8.072644233703613, + "learning_rate": 4.567022542509654e-06, + "loss": 1.3193, + "step": 57162 + }, + { + "epoch": 0.7145678641966049, + "grad_norm": 3.343790054321289, + "learning_rate": 4.566289905910426e-06, + "loss": 1.1337, + "step": 57164 + }, + { + "epoch": 0.7145928648216205, + "grad_norm": 0.001893546897917986, + "learning_rate": 4.5655573106935645e-06, + "loss": 0.9732, + "step": 57166 + }, + { + "epoch": 0.7146178654466362, + "grad_norm": 4.483638286590576, + "learning_rate": 4.564824756864658e-06, + "loss": 0.8746, + "step": 57168 + }, + { + "epoch": 0.7146428660716518, + "grad_norm": 5.201230049133301, + "learning_rate": 4.5640922444292715e-06, + "loss": 0.4348, + "step": 57170 + }, + { + "epoch": 0.7146678666966674, + "grad_norm": 3.125814437866211, + "learning_rate": 4.563359773392993e-06, + "loss": 0.7755, + "step": 57172 + }, + { + "epoch": 0.714692867321683, + "grad_norm": 3.489527463912964, + "learning_rate": 4.562627343761394e-06, + "loss": 0.8836, + "step": 57174 + }, + { + "epoch": 0.7147178679466987, + "grad_norm": 0.004205481614917517, + "learning_rate": 4.5618949555400546e-06, + "loss": 0.8222, + "step": 57176 + }, + { + "epoch": 0.7147428685717143, + "grad_norm": 9.917013168334961, + "learning_rate": 4.561162608734559e-06, + "loss": 1.1496, + "step": 57178 + }, + { + "epoch": 0.71476786919673, + "grad_norm": 2.7540290355682373, + "learning_rate": 4.56043030335048e-06, + "loss": 0.5044, + "step": 57180 + }, + { + "epoch": 0.7147928698217455, + "grad_norm": 4.57583475112915, + "learning_rate": 4.559698039393394e-06, + "loss": 1.2653, + "step": 57182 + }, + { + "epoch": 0.7148178704467611, + "grad_norm": 1.0264368057250977, + "learning_rate": 4.558965816868876e-06, + "loss": 0.9031, + "step": 57184 + }, + { + "epoch": 0.7148428710717768, + "grad_norm": 4.996059417724609, + "learning_rate": 4.558233635782508e-06, + "loss": 2.4119, + "step": 57186 + }, + { + "epoch": 0.7148678716967924, + "grad_norm": 3.5974791049957275, + "learning_rate": 4.557501496139859e-06, + "loss": 0.2681, + "step": 57188 + }, + { + "epoch": 0.7148928723218081, + "grad_norm": 3.7044246196746826, + "learning_rate": 4.5567693979465085e-06, + "loss": 0.6526, + "step": 57190 + }, + { + "epoch": 0.7149178729468236, + "grad_norm": 3.4710681438446045, + "learning_rate": 4.556037341208037e-06, + "loss": 1.2537, + "step": 57192 + }, + { + "epoch": 0.7149428735718393, + "grad_norm": 2.8958568572998047, + "learning_rate": 4.555305325930015e-06, + "loss": 0.8912, + "step": 57194 + }, + { + "epoch": 0.7149678741968549, + "grad_norm": 3.7086923122406006, + "learning_rate": 4.554573352118018e-06, + "loss": 0.9263, + "step": 57196 + }, + { + "epoch": 0.7149928748218706, + "grad_norm": 3.8026490211486816, + "learning_rate": 4.553841419777616e-06, + "loss": 1.0316, + "step": 57198 + }, + { + "epoch": 0.7150178754468862, + "grad_norm": 1.9844924211502075, + "learning_rate": 4.553109528914388e-06, + "loss": 0.9368, + "step": 57200 + }, + { + "epoch": 0.7150428760719018, + "grad_norm": 2.7274982929229736, + "learning_rate": 4.552377679533912e-06, + "loss": 0.1564, + "step": 57202 + }, + { + "epoch": 0.7150678766969174, + "grad_norm": 4.028035640716553, + "learning_rate": 4.551645871641752e-06, + "loss": 1.6856, + "step": 57204 + }, + { + "epoch": 0.715092877321933, + "grad_norm": 3.103623390197754, + "learning_rate": 4.550914105243495e-06, + "loss": 1.1668, + "step": 57206 + }, + { + "epoch": 0.7151178779469487, + "grad_norm": 2.296973943710327, + "learning_rate": 4.550182380344698e-06, + "loss": 0.2023, + "step": 57208 + }, + { + "epoch": 0.7151428785719643, + "grad_norm": 0.005036758258938789, + "learning_rate": 4.549450696950945e-06, + "loss": 0.5777, + "step": 57210 + }, + { + "epoch": 0.7151678791969799, + "grad_norm": 3.383425712585449, + "learning_rate": 4.5487190550678e-06, + "loss": 0.8501, + "step": 57212 + }, + { + "epoch": 0.7151928798219955, + "grad_norm": 3.22989559173584, + "learning_rate": 4.547987454700842e-06, + "loss": 0.66, + "step": 57214 + }, + { + "epoch": 0.7152178804470112, + "grad_norm": 2.9586493968963623, + "learning_rate": 4.547255895855643e-06, + "loss": 0.2848, + "step": 57216 + }, + { + "epoch": 0.7152428810720268, + "grad_norm": 2.379228353500366, + "learning_rate": 4.546524378537767e-06, + "loss": 0.9805, + "step": 57218 + }, + { + "epoch": 0.7152678816970425, + "grad_norm": 2.802501678466797, + "learning_rate": 4.5457929027528e-06, + "loss": 0.4192, + "step": 57220 + }, + { + "epoch": 0.715292882322058, + "grad_norm": 3.9012670516967773, + "learning_rate": 4.545061468506295e-06, + "loss": 1.1718, + "step": 57222 + }, + { + "epoch": 0.7153178829470737, + "grad_norm": 5.061776161193848, + "learning_rate": 4.54433007580383e-06, + "loss": 0.8713, + "step": 57224 + }, + { + "epoch": 0.7153428835720893, + "grad_norm": 4.189209461212158, + "learning_rate": 4.543598724650978e-06, + "loss": 1.425, + "step": 57226 + }, + { + "epoch": 0.715367884197105, + "grad_norm": 11.191667556762695, + "learning_rate": 4.542867415053305e-06, + "loss": 0.9375, + "step": 57228 + }, + { + "epoch": 0.7153928848221206, + "grad_norm": 2.5324249267578125, + "learning_rate": 4.542136147016385e-06, + "loss": 1.2094, + "step": 57230 + }, + { + "epoch": 0.7154178854471361, + "grad_norm": 4.145934104919434, + "learning_rate": 4.5414049205457835e-06, + "loss": 1.313, + "step": 57232 + }, + { + "epoch": 0.7154428860721518, + "grad_norm": 3.473240375518799, + "learning_rate": 4.540673735647071e-06, + "loss": 0.6862, + "step": 57234 + }, + { + "epoch": 0.7154678866971674, + "grad_norm": 0.5275630950927734, + "learning_rate": 4.539942592325811e-06, + "loss": 0.0339, + "step": 57236 + }, + { + "epoch": 0.7154928873221831, + "grad_norm": 0.0004690907953772694, + "learning_rate": 4.539211490587577e-06, + "loss": 1.0265, + "step": 57238 + }, + { + "epoch": 0.7155178879471987, + "grad_norm": 4.6909356117248535, + "learning_rate": 4.538480430437939e-06, + "loss": 0.2991, + "step": 57240 + }, + { + "epoch": 0.7155428885722143, + "grad_norm": 3.810478448867798, + "learning_rate": 4.537749411882458e-06, + "loss": 1.1605, + "step": 57242 + }, + { + "epoch": 0.7155678891972299, + "grad_norm": 2.813251256942749, + "learning_rate": 4.53701843492671e-06, + "loss": 1.8628, + "step": 57244 + }, + { + "epoch": 0.7155928898222456, + "grad_norm": 1.9657621383666992, + "learning_rate": 4.536287499576257e-06, + "loss": 0.7871, + "step": 57246 + }, + { + "epoch": 0.7156178904472612, + "grad_norm": 3.6594369411468506, + "learning_rate": 4.535556605836664e-06, + "loss": 0.7634, + "step": 57248 + }, + { + "epoch": 0.7156428910722769, + "grad_norm": 2.917116641998291, + "learning_rate": 4.5348257537134956e-06, + "loss": 0.898, + "step": 57250 + }, + { + "epoch": 0.7156678916972924, + "grad_norm": 2.4484522342681885, + "learning_rate": 4.534094943212323e-06, + "loss": 0.6614, + "step": 57252 + }, + { + "epoch": 0.715692892322308, + "grad_norm": 1.7520371675491333, + "learning_rate": 4.533364174338713e-06, + "loss": 0.8578, + "step": 57254 + }, + { + "epoch": 0.7157178929473237, + "grad_norm": 2.221435546875, + "learning_rate": 4.532633447098225e-06, + "loss": 0.7347, + "step": 57256 + }, + { + "epoch": 0.7157428935723393, + "grad_norm": 3.029078960418701, + "learning_rate": 4.531902761496431e-06, + "loss": 1.0529, + "step": 57258 + }, + { + "epoch": 0.715767894197355, + "grad_norm": 4.720851898193359, + "learning_rate": 4.531172117538892e-06, + "loss": 0.8172, + "step": 57260 + }, + { + "epoch": 0.7157928948223705, + "grad_norm": 5.162237167358398, + "learning_rate": 4.5304415152311695e-06, + "loss": 1.0571, + "step": 57262 + }, + { + "epoch": 0.7158178954473862, + "grad_norm": 3.5143840312957764, + "learning_rate": 4.529710954578835e-06, + "loss": 0.9782, + "step": 57264 + }, + { + "epoch": 0.7158428960724018, + "grad_norm": 1.7867587804794312, + "learning_rate": 4.528980435587445e-06, + "loss": 1.2583, + "step": 57266 + }, + { + "epoch": 0.7158678966974175, + "grad_norm": 4.971754550933838, + "learning_rate": 4.528249958262569e-06, + "loss": 0.7587, + "step": 57268 + }, + { + "epoch": 0.7158928973224331, + "grad_norm": 0.0010475781746208668, + "learning_rate": 4.5275195226097635e-06, + "loss": 0.6937, + "step": 57270 + }, + { + "epoch": 0.7159178979474486, + "grad_norm": 0.8657791614532471, + "learning_rate": 4.526789128634603e-06, + "loss": 0.3759, + "step": 57272 + }, + { + "epoch": 0.7159428985724643, + "grad_norm": 5.564692974090576, + "learning_rate": 4.526058776342636e-06, + "loss": 1.2969, + "step": 57274 + }, + { + "epoch": 0.7159678991974799, + "grad_norm": 3.0418834686279297, + "learning_rate": 4.525328465739432e-06, + "loss": 1.17, + "step": 57276 + }, + { + "epoch": 0.7159928998224956, + "grad_norm": 3.997756004333496, + "learning_rate": 4.524598196830554e-06, + "loss": 1.1666, + "step": 57278 + }, + { + "epoch": 0.7160179004475112, + "grad_norm": 2.1495654582977295, + "learning_rate": 4.523867969621558e-06, + "loss": 0.3796, + "step": 57280 + }, + { + "epoch": 0.7160429010725268, + "grad_norm": 0.13791251182556152, + "learning_rate": 4.523137784118014e-06, + "loss": 0.5885, + "step": 57282 + }, + { + "epoch": 0.7160679016975424, + "grad_norm": 3.6947851181030273, + "learning_rate": 4.522407640325475e-06, + "loss": 1.3055, + "step": 57284 + }, + { + "epoch": 0.7160929023225581, + "grad_norm": 2.309515953063965, + "learning_rate": 4.521677538249507e-06, + "loss": 0.3419, + "step": 57286 + }, + { + "epoch": 0.7161179029475737, + "grad_norm": 0.07618293911218643, + "learning_rate": 4.52094747789567e-06, + "loss": 0.0015, + "step": 57288 + }, + { + "epoch": 0.7161429035725894, + "grad_norm": 3.763805866241455, + "learning_rate": 4.520217459269517e-06, + "loss": 0.7813, + "step": 57290 + }, + { + "epoch": 0.7161679041976049, + "grad_norm": 3.1978328227996826, + "learning_rate": 4.519487482376617e-06, + "loss": 1.4521, + "step": 57292 + }, + { + "epoch": 0.7161929048226205, + "grad_norm": 2.764125347137451, + "learning_rate": 4.518757547222521e-06, + "loss": 0.4659, + "step": 57294 + }, + { + "epoch": 0.7162179054476362, + "grad_norm": 3.0992069244384766, + "learning_rate": 4.518027653812797e-06, + "loss": 0.6456, + "step": 57296 + }, + { + "epoch": 0.7162429060726518, + "grad_norm": 2.0843563079833984, + "learning_rate": 4.517297802152999e-06, + "loss": 1.3463, + "step": 57298 + }, + { + "epoch": 0.7162679066976675, + "grad_norm": 4.002749919891357, + "learning_rate": 4.516567992248681e-06, + "loss": 1.5463, + "step": 57300 + }, + { + "epoch": 0.716292907322683, + "grad_norm": 1.451774001121521, + "learning_rate": 4.515838224105411e-06, + "loss": 0.2214, + "step": 57302 + }, + { + "epoch": 0.7163179079476987, + "grad_norm": 3.1436824798583984, + "learning_rate": 4.515108497728737e-06, + "loss": 1.0225, + "step": 57304 + }, + { + "epoch": 0.7163429085727143, + "grad_norm": 0.7324339747428894, + "learning_rate": 4.514378813124225e-06, + "loss": 0.0041, + "step": 57306 + }, + { + "epoch": 0.71636790919773, + "grad_norm": 0.0007144464179873466, + "learning_rate": 4.5136491702974235e-06, + "loss": 0.9281, + "step": 57308 + }, + { + "epoch": 0.7163929098227456, + "grad_norm": 5.612936496734619, + "learning_rate": 4.5129195692538996e-06, + "loss": 1.1252, + "step": 57310 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 4.906733989715576, + "learning_rate": 4.5121900099992035e-06, + "loss": 0.7228, + "step": 57312 + }, + { + "epoch": 0.7164429110727768, + "grad_norm": 3.080552816390991, + "learning_rate": 4.5114604925388885e-06, + "loss": 1.136, + "step": 57314 + }, + { + "epoch": 0.7164679116977924, + "grad_norm": 4.95328426361084, + "learning_rate": 4.510731016878518e-06, + "loss": 1.421, + "step": 57316 + }, + { + "epoch": 0.7164929123228081, + "grad_norm": 3.3704781532287598, + "learning_rate": 4.51000158302364e-06, + "loss": 0.5812, + "step": 57318 + }, + { + "epoch": 0.7165179129478237, + "grad_norm": 1.9169611930847168, + "learning_rate": 4.509272190979817e-06, + "loss": 0.1591, + "step": 57320 + }, + { + "epoch": 0.7165429135728393, + "grad_norm": 3.7124075889587402, + "learning_rate": 4.508542840752597e-06, + "loss": 0.3168, + "step": 57322 + }, + { + "epoch": 0.7165679141978549, + "grad_norm": 2.7480721473693848, + "learning_rate": 4.507813532347542e-06, + "loss": 0.3825, + "step": 57324 + }, + { + "epoch": 0.7165929148228706, + "grad_norm": 0.004152753856033087, + "learning_rate": 4.5070842657702016e-06, + "loss": 1.5091, + "step": 57326 + }, + { + "epoch": 0.7166179154478862, + "grad_norm": 0.01465463638305664, + "learning_rate": 4.506355041026128e-06, + "loss": 0.0002, + "step": 57328 + }, + { + "epoch": 0.7166429160729019, + "grad_norm": 3.4890706539154053, + "learning_rate": 4.505625858120881e-06, + "loss": 0.8417, + "step": 57330 + }, + { + "epoch": 0.7166679166979174, + "grad_norm": 3.0384418964385986, + "learning_rate": 4.504896717060007e-06, + "loss": 1.4695, + "step": 57332 + }, + { + "epoch": 0.716692917322933, + "grad_norm": 2.557363748550415, + "learning_rate": 4.5041676178490656e-06, + "loss": 0.1313, + "step": 57334 + }, + { + "epoch": 0.7167179179479487, + "grad_norm": 2.834249258041382, + "learning_rate": 4.5034385604936034e-06, + "loss": 1.5429, + "step": 57336 + }, + { + "epoch": 0.7167429185729643, + "grad_norm": 3.468170166015625, + "learning_rate": 4.5027095449991795e-06, + "loss": 1.6774, + "step": 57338 + }, + { + "epoch": 0.71676791919798, + "grad_norm": 2.6967971324920654, + "learning_rate": 4.501980571371343e-06, + "loss": 1.0171, + "step": 57340 + }, + { + "epoch": 0.7167929198229955, + "grad_norm": 2.7420737743377686, + "learning_rate": 4.501251639615639e-06, + "loss": 0.4902, + "step": 57342 + }, + { + "epoch": 0.7168179204480112, + "grad_norm": 4.085235595703125, + "learning_rate": 4.50052274973763e-06, + "loss": 1.1904, + "step": 57344 + }, + { + "epoch": 0.7168429210730268, + "grad_norm": 3.5023865699768066, + "learning_rate": 4.499793901742859e-06, + "loss": 1.6826, + "step": 57346 + }, + { + "epoch": 0.7168679216980425, + "grad_norm": 0.0006503673503175378, + "learning_rate": 4.499065095636883e-06, + "loss": 0.6641, + "step": 57348 + }, + { + "epoch": 0.7168929223230581, + "grad_norm": 6.899412155151367, + "learning_rate": 4.498336331425249e-06, + "loss": 0.6887, + "step": 57350 + }, + { + "epoch": 0.7169179229480737, + "grad_norm": 4.324584484100342, + "learning_rate": 4.497607609113505e-06, + "loss": 2.2943, + "step": 57352 + }, + { + "epoch": 0.7169429235730893, + "grad_norm": 10.090411186218262, + "learning_rate": 4.4968789287072055e-06, + "loss": 0.8907, + "step": 57354 + }, + { + "epoch": 0.716967924198105, + "grad_norm": 4.0952959060668945, + "learning_rate": 4.496150290211895e-06, + "loss": 1.5798, + "step": 57356 + }, + { + "epoch": 0.7169929248231206, + "grad_norm": 5.112757205963135, + "learning_rate": 4.495421693633128e-06, + "loss": 1.5136, + "step": 57358 + }, + { + "epoch": 0.7170179254481363, + "grad_norm": 1.7697644233703613, + "learning_rate": 4.494693138976449e-06, + "loss": 0.0863, + "step": 57360 + }, + { + "epoch": 0.7170429260731518, + "grad_norm": 0.4608623683452606, + "learning_rate": 4.4939646262474114e-06, + "loss": 0.0071, + "step": 57362 + }, + { + "epoch": 0.7170679266981674, + "grad_norm": 0.0009800862753763795, + "learning_rate": 4.493236155451561e-06, + "loss": 0.3545, + "step": 57364 + }, + { + "epoch": 0.7170929273231831, + "grad_norm": 3.806366205215454, + "learning_rate": 4.492507726594442e-06, + "loss": 1.4827, + "step": 57366 + }, + { + "epoch": 0.7171179279481987, + "grad_norm": 7.9366068840026855, + "learning_rate": 4.491779339681608e-06, + "loss": 2.0826, + "step": 57368 + }, + { + "epoch": 0.7171429285732144, + "grad_norm": 2.393341302871704, + "learning_rate": 4.4910509947186e-06, + "loss": 0.4627, + "step": 57370 + }, + { + "epoch": 0.7171679291982299, + "grad_norm": 7.742493629455566, + "learning_rate": 4.490322691710972e-06, + "loss": 1.4407, + "step": 57372 + }, + { + "epoch": 0.7171929298232456, + "grad_norm": 4.561343669891357, + "learning_rate": 4.489594430664265e-06, + "loss": 0.452, + "step": 57374 + }, + { + "epoch": 0.7172179304482612, + "grad_norm": 4.350724697113037, + "learning_rate": 4.48886621158403e-06, + "loss": 1.0475, + "step": 57376 + }, + { + "epoch": 0.7172429310732769, + "grad_norm": 3.5870652198791504, + "learning_rate": 4.4881380344758106e-06, + "loss": 1.6878, + "step": 57378 + }, + { + "epoch": 0.7172679316982925, + "grad_norm": 3.4616620540618896, + "learning_rate": 4.487409899345149e-06, + "loss": 1.5236, + "step": 57380 + }, + { + "epoch": 0.717292932323308, + "grad_norm": 3.4817018508911133, + "learning_rate": 4.486681806197598e-06, + "loss": 0.634, + "step": 57382 + }, + { + "epoch": 0.7173179329483237, + "grad_norm": 2.4931938648223877, + "learning_rate": 4.485953755038694e-06, + "loss": 0.5636, + "step": 57384 + }, + { + "epoch": 0.7173429335733393, + "grad_norm": 3.974807024002075, + "learning_rate": 4.485225745873993e-06, + "loss": 2.0048, + "step": 57386 + }, + { + "epoch": 0.717367934198355, + "grad_norm": 2.631148338317871, + "learning_rate": 4.484497778709026e-06, + "loss": 0.6263, + "step": 57388 + }, + { + "epoch": 0.7173929348233706, + "grad_norm": 3.2684409618377686, + "learning_rate": 4.483769853549349e-06, + "loss": 1.3065, + "step": 57390 + }, + { + "epoch": 0.7174179354483862, + "grad_norm": 2.118910312652588, + "learning_rate": 4.483041970400501e-06, + "loss": 0.5227, + "step": 57392 + }, + { + "epoch": 0.7174429360734018, + "grad_norm": 3.6142475605010986, + "learning_rate": 4.482314129268021e-06, + "loss": 0.7419, + "step": 57394 + }, + { + "epoch": 0.7174679366984175, + "grad_norm": 2.5483412742614746, + "learning_rate": 4.481586330157461e-06, + "loss": 0.2918, + "step": 57396 + }, + { + "epoch": 0.7174929373234331, + "grad_norm": 5.004948616027832, + "learning_rate": 4.4808585730743535e-06, + "loss": 1.3736, + "step": 57398 + }, + { + "epoch": 0.7175179379484488, + "grad_norm": 2.333861827850342, + "learning_rate": 4.480130858024251e-06, + "loss": 0.9646, + "step": 57400 + }, + { + "epoch": 0.7175429385734643, + "grad_norm": 0.0018713197205215693, + "learning_rate": 4.479403185012692e-06, + "loss": 0.3517, + "step": 57402 + }, + { + "epoch": 0.7175679391984799, + "grad_norm": 12.567392349243164, + "learning_rate": 4.478675554045213e-06, + "loss": 0.9532, + "step": 57404 + }, + { + "epoch": 0.7175929398234956, + "grad_norm": 2.8552215099334717, + "learning_rate": 4.477947965127365e-06, + "loss": 0.7167, + "step": 57406 + }, + { + "epoch": 0.7176179404485112, + "grad_norm": 4.476876258850098, + "learning_rate": 4.47722041826468e-06, + "loss": 0.1347, + "step": 57408 + }, + { + "epoch": 0.7176429410735269, + "grad_norm": 3.3617570400238037, + "learning_rate": 4.476492913462708e-06, + "loss": 0.595, + "step": 57410 + }, + { + "epoch": 0.7176679416985424, + "grad_norm": 0.015656357631087303, + "learning_rate": 4.47576545072698e-06, + "loss": 0.5833, + "step": 57412 + }, + { + "epoch": 0.7176929423235581, + "grad_norm": 3.194772481918335, + "learning_rate": 4.475038030063045e-06, + "loss": 0.7899, + "step": 57414 + }, + { + "epoch": 0.7177179429485737, + "grad_norm": 6.128640651702881, + "learning_rate": 4.4743106514764385e-06, + "loss": 1.7104, + "step": 57416 + }, + { + "epoch": 0.7177429435735894, + "grad_norm": 7.161775588989258, + "learning_rate": 4.4735833149727e-06, + "loss": 1.2179, + "step": 57418 + }, + { + "epoch": 0.717767944198605, + "grad_norm": 3.962838649749756, + "learning_rate": 4.47285602055737e-06, + "loss": 0.3776, + "step": 57420 + }, + { + "epoch": 0.7177929448236205, + "grad_norm": 4.893354415893555, + "learning_rate": 4.4721287682359835e-06, + "loss": 1.6442, + "step": 57422 + }, + { + "epoch": 0.7178179454486362, + "grad_norm": 4.345388412475586, + "learning_rate": 4.471401558014087e-06, + "loss": 0.8188, + "step": 57424 + }, + { + "epoch": 0.7178429460736518, + "grad_norm": 2.2944557666778564, + "learning_rate": 4.470674389897212e-06, + "loss": 1.1582, + "step": 57426 + }, + { + "epoch": 0.7178679466986675, + "grad_norm": 0.8552253246307373, + "learning_rate": 4.4699472638909e-06, + "loss": 0.0095, + "step": 57428 + }, + { + "epoch": 0.7178929473236831, + "grad_norm": 3.605849504470825, + "learning_rate": 4.46922018000069e-06, + "loss": 1.6653, + "step": 57430 + }, + { + "epoch": 0.7179179479486987, + "grad_norm": 3.1542203426361084, + "learning_rate": 4.4684931382321116e-06, + "loss": 0.8853, + "step": 57432 + }, + { + "epoch": 0.7179429485737143, + "grad_norm": 5.1506853103637695, + "learning_rate": 4.467766138590711e-06, + "loss": 1.1141, + "step": 57434 + }, + { + "epoch": 0.71796794919873, + "grad_norm": 2.5511538982391357, + "learning_rate": 4.467039181082018e-06, + "loss": 1.2547, + "step": 57436 + }, + { + "epoch": 0.7179929498237456, + "grad_norm": 1.1660277843475342, + "learning_rate": 4.466312265711576e-06, + "loss": 0.5691, + "step": 57438 + }, + { + "epoch": 0.7180179504487613, + "grad_norm": 6.473354339599609, + "learning_rate": 4.465585392484913e-06, + "loss": 1.6854, + "step": 57440 + }, + { + "epoch": 0.7180429510737768, + "grad_norm": 0.00041378766763955355, + "learning_rate": 4.464858561407574e-06, + "loss": 0.6733, + "step": 57442 + }, + { + "epoch": 0.7180679516987925, + "grad_norm": 0.0003542780759744346, + "learning_rate": 4.464131772485089e-06, + "loss": 0.2171, + "step": 57444 + }, + { + "epoch": 0.7180929523238081, + "grad_norm": 1.2820779085159302, + "learning_rate": 4.463405025722989e-06, + "loss": 0.358, + "step": 57446 + }, + { + "epoch": 0.7181179529488237, + "grad_norm": 5.975713729858398, + "learning_rate": 4.462678321126818e-06, + "loss": 1.6899, + "step": 57448 + }, + { + "epoch": 0.7181429535738394, + "grad_norm": 3.8145434856414795, + "learning_rate": 4.461951658702102e-06, + "loss": 0.8526, + "step": 57450 + }, + { + "epoch": 0.7181679541988549, + "grad_norm": 4.593341827392578, + "learning_rate": 4.461225038454382e-06, + "loss": 0.7275, + "step": 57452 + }, + { + "epoch": 0.7181929548238706, + "grad_norm": 7.534420967102051, + "learning_rate": 4.460498460389189e-06, + "loss": 1.2881, + "step": 57454 + }, + { + "epoch": 0.7182179554488862, + "grad_norm": 3.77839994430542, + "learning_rate": 4.459771924512053e-06, + "loss": 1.0914, + "step": 57456 + }, + { + "epoch": 0.7182429560739019, + "grad_norm": 3.9289557933807373, + "learning_rate": 4.459045430828515e-06, + "loss": 1.0989, + "step": 57458 + }, + { + "epoch": 0.7182679566989175, + "grad_norm": 4.604511737823486, + "learning_rate": 4.458318979344098e-06, + "loss": 1.4812, + "step": 57460 + }, + { + "epoch": 0.7182929573239331, + "grad_norm": 4.159510135650635, + "learning_rate": 4.457592570064345e-06, + "loss": 0.7471, + "step": 57462 + }, + { + "epoch": 0.7183179579489487, + "grad_norm": 4.049649238586426, + "learning_rate": 4.456866202994779e-06, + "loss": 0.8723, + "step": 57464 + }, + { + "epoch": 0.7183429585739644, + "grad_norm": 2.446810245513916, + "learning_rate": 4.456139878140941e-06, + "loss": 0.9184, + "step": 57466 + }, + { + "epoch": 0.71836795919898, + "grad_norm": 4.776371002197266, + "learning_rate": 4.455413595508355e-06, + "loss": 1.1951, + "step": 57468 + }, + { + "epoch": 0.7183929598239956, + "grad_norm": 2.8947131633758545, + "learning_rate": 4.4546873551025535e-06, + "loss": 0.5054, + "step": 57470 + }, + { + "epoch": 0.7184179604490112, + "grad_norm": 2.325650215148926, + "learning_rate": 4.453961156929072e-06, + "loss": 1.2012, + "step": 57472 + }, + { + "epoch": 0.7184429610740268, + "grad_norm": 6.149085998535156, + "learning_rate": 4.453235000993434e-06, + "loss": 0.953, + "step": 57474 + }, + { + "epoch": 0.7184679616990425, + "grad_norm": 0.10406994074583054, + "learning_rate": 4.452508887301179e-06, + "loss": 0.8288, + "step": 57476 + }, + { + "epoch": 0.7184929623240581, + "grad_norm": 0.7213717103004456, + "learning_rate": 4.451782815857827e-06, + "loss": 0.5798, + "step": 57478 + }, + { + "epoch": 0.7185179629490738, + "grad_norm": 3.767493724822998, + "learning_rate": 4.451056786668916e-06, + "loss": 0.7978, + "step": 57480 + }, + { + "epoch": 0.7185429635740893, + "grad_norm": 2.9083714485168457, + "learning_rate": 4.450330799739972e-06, + "loss": 0.7814, + "step": 57482 + }, + { + "epoch": 0.718567964199105, + "grad_norm": 5.249962329864502, + "learning_rate": 4.44960485507652e-06, + "loss": 1.7774, + "step": 57484 + }, + { + "epoch": 0.7185929648241206, + "grad_norm": 0.4995834231376648, + "learning_rate": 4.448878952684097e-06, + "loss": 1.0345, + "step": 57486 + }, + { + "epoch": 0.7186179654491363, + "grad_norm": 2.902745485305786, + "learning_rate": 4.4481530925682224e-06, + "loss": 0.944, + "step": 57488 + }, + { + "epoch": 0.7186429660741519, + "grad_norm": 0.007640415336936712, + "learning_rate": 4.447427274734434e-06, + "loss": 1.0306, + "step": 57490 + }, + { + "epoch": 0.7186679666991674, + "grad_norm": 0.449590802192688, + "learning_rate": 4.446701499188249e-06, + "loss": 0.1857, + "step": 57492 + }, + { + "epoch": 0.7186929673241831, + "grad_norm": 2.0918750762939453, + "learning_rate": 4.4459757659352035e-06, + "loss": 0.3493, + "step": 57494 + }, + { + "epoch": 0.7187179679491987, + "grad_norm": 0.0013707260368391871, + "learning_rate": 4.445250074980821e-06, + "loss": 0.0125, + "step": 57496 + }, + { + "epoch": 0.7187429685742144, + "grad_norm": 7.5320258140563965, + "learning_rate": 4.4445244263306265e-06, + "loss": 1.1987, + "step": 57498 + }, + { + "epoch": 0.71876796919923, + "grad_norm": 2.869102716445923, + "learning_rate": 4.443798819990151e-06, + "loss": 2.0963, + "step": 57500 + }, + { + "epoch": 0.7187929698242456, + "grad_norm": 0.09462034702301025, + "learning_rate": 4.443073255964914e-06, + "loss": 0.278, + "step": 57502 + }, + { + "epoch": 0.7188179704492612, + "grad_norm": 1.1008774042129517, + "learning_rate": 4.4423477342604495e-06, + "loss": 0.7967, + "step": 57504 + }, + { + "epoch": 0.7188429710742769, + "grad_norm": 0.002871089382097125, + "learning_rate": 4.441622254882278e-06, + "loss": 0.8631, + "step": 57506 + }, + { + "epoch": 0.7188679716992925, + "grad_norm": 2.4553112983703613, + "learning_rate": 4.440896817835922e-06, + "loss": 0.6463, + "step": 57508 + }, + { + "epoch": 0.7188929723243082, + "grad_norm": 2.007578134536743, + "learning_rate": 4.440171423126915e-06, + "loss": 1.2237, + "step": 57510 + }, + { + "epoch": 0.7189179729493237, + "grad_norm": 5.915619850158691, + "learning_rate": 4.43944607076077e-06, + "loss": 1.6822, + "step": 57512 + }, + { + "epoch": 0.7189429735743393, + "grad_norm": 5.6610846519470215, + "learning_rate": 4.438720760743023e-06, + "loss": 2.258, + "step": 57514 + }, + { + "epoch": 0.718967974199355, + "grad_norm": 3.843019723892212, + "learning_rate": 4.437995493079187e-06, + "loss": 1.1295, + "step": 57516 + }, + { + "epoch": 0.7189929748243706, + "grad_norm": 10.02797794342041, + "learning_rate": 4.437270267774792e-06, + "loss": 1.391, + "step": 57518 + }, + { + "epoch": 0.7190179754493863, + "grad_norm": 4.887080192565918, + "learning_rate": 4.436545084835366e-06, + "loss": 0.8746, + "step": 57520 + }, + { + "epoch": 0.7190429760744018, + "grad_norm": 0.0017682602629065514, + "learning_rate": 4.435819944266419e-06, + "loss": 0.2082, + "step": 57522 + }, + { + "epoch": 0.7190679766994175, + "grad_norm": 4.440225601196289, + "learning_rate": 4.4350948460734854e-06, + "loss": 2.0848, + "step": 57524 + }, + { + "epoch": 0.7190929773244331, + "grad_norm": 4.396122455596924, + "learning_rate": 4.434369790262079e-06, + "loss": 1.7292, + "step": 57526 + }, + { + "epoch": 0.7191179779494488, + "grad_norm": 3.193697452545166, + "learning_rate": 4.433644776837727e-06, + "loss": 1.252, + "step": 57528 + }, + { + "epoch": 0.7191429785744644, + "grad_norm": 0.027689827606081963, + "learning_rate": 4.432919805805946e-06, + "loss": 0.0373, + "step": 57530 + }, + { + "epoch": 0.71916797919948, + "grad_norm": 7.651549816131592, + "learning_rate": 4.4321948771722616e-06, + "loss": 0.3875, + "step": 57532 + }, + { + "epoch": 0.7191929798244956, + "grad_norm": 3.211209774017334, + "learning_rate": 4.4314699909422e-06, + "loss": 0.9878, + "step": 57534 + }, + { + "epoch": 0.7192179804495112, + "grad_norm": 0.002863652538508177, + "learning_rate": 4.430745147121268e-06, + "loss": 0.5148, + "step": 57536 + }, + { + "epoch": 0.7192429810745269, + "grad_norm": 1.8852964639663696, + "learning_rate": 4.430020345714998e-06, + "loss": 0.0676, + "step": 57538 + }, + { + "epoch": 0.7192679816995425, + "grad_norm": 0.45407968759536743, + "learning_rate": 4.4292955867289015e-06, + "loss": 0.2711, + "step": 57540 + }, + { + "epoch": 0.7192929823245581, + "grad_norm": 4.146296977996826, + "learning_rate": 4.428570870168502e-06, + "loss": 0.7019, + "step": 57542 + }, + { + "epoch": 0.7193179829495737, + "grad_norm": 4.5898847579956055, + "learning_rate": 4.427846196039322e-06, + "loss": 0.2861, + "step": 57544 + }, + { + "epoch": 0.7193429835745894, + "grad_norm": 3.427725076675415, + "learning_rate": 4.427121564346878e-06, + "loss": 1.0052, + "step": 57546 + }, + { + "epoch": 0.719367984199605, + "grad_norm": 0.4152275025844574, + "learning_rate": 4.426396975096687e-06, + "loss": 0.6136, + "step": 57548 + }, + { + "epoch": 0.7193929848246207, + "grad_norm": 3.1508257389068604, + "learning_rate": 4.425672428294266e-06, + "loss": 0.8116, + "step": 57550 + }, + { + "epoch": 0.7194179854496362, + "grad_norm": 2.320486068725586, + "learning_rate": 4.424947923945139e-06, + "loss": 1.1697, + "step": 57552 + }, + { + "epoch": 0.7194429860746518, + "grad_norm": 4.131695747375488, + "learning_rate": 4.424223462054816e-06, + "loss": 1.2592, + "step": 57554 + }, + { + "epoch": 0.7194679866996675, + "grad_norm": 2.20881724357605, + "learning_rate": 4.423499042628819e-06, + "loss": 0.8042, + "step": 57556 + }, + { + "epoch": 0.7194929873246831, + "grad_norm": 4.00732421875, + "learning_rate": 4.422774665672669e-06, + "loss": 1.2095, + "step": 57558 + }, + { + "epoch": 0.7195179879496988, + "grad_norm": 0.04731566086411476, + "learning_rate": 4.4220503311918774e-06, + "loss": 0.5359, + "step": 57560 + }, + { + "epoch": 0.7195429885747143, + "grad_norm": 3.103285551071167, + "learning_rate": 4.421326039191962e-06, + "loss": 0.7292, + "step": 57562 + }, + { + "epoch": 0.71956798919973, + "grad_norm": 7.074927806854248, + "learning_rate": 4.420601789678436e-06, + "loss": 0.2224, + "step": 57564 + }, + { + "epoch": 0.7195929898247456, + "grad_norm": 1.6378329992294312, + "learning_rate": 4.41987758265682e-06, + "loss": 0.6329, + "step": 57566 + }, + { + "epoch": 0.7196179904497613, + "grad_norm": 2.627457618713379, + "learning_rate": 4.4191534181326235e-06, + "loss": 0.7795, + "step": 57568 + }, + { + "epoch": 0.7196429910747769, + "grad_norm": 2.8574554920196533, + "learning_rate": 4.4184292961113665e-06, + "loss": 2.0988, + "step": 57570 + }, + { + "epoch": 0.7196679916997925, + "grad_norm": 5.760087490081787, + "learning_rate": 4.417705216598569e-06, + "loss": 1.8248, + "step": 57572 + }, + { + "epoch": 0.7196929923248081, + "grad_norm": 2.900881052017212, + "learning_rate": 4.416981179599732e-06, + "loss": 0.951, + "step": 57574 + }, + { + "epoch": 0.7197179929498237, + "grad_norm": 4.073978424072266, + "learning_rate": 4.416257185120381e-06, + "loss": 1.5103, + "step": 57576 + }, + { + "epoch": 0.7197429935748394, + "grad_norm": 4.730403423309326, + "learning_rate": 4.415533233166022e-06, + "loss": 1.0456, + "step": 57578 + }, + { + "epoch": 0.719767994199855, + "grad_norm": 2.5397183895111084, + "learning_rate": 4.414809323742172e-06, + "loss": 0.9536, + "step": 57580 + }, + { + "epoch": 0.7197929948248706, + "grad_norm": 3.1175339221954346, + "learning_rate": 4.4140854568543475e-06, + "loss": 0.7197, + "step": 57582 + }, + { + "epoch": 0.7198179954498862, + "grad_norm": 0.0007852762355469167, + "learning_rate": 4.413361632508055e-06, + "loss": 0.6234, + "step": 57584 + }, + { + "epoch": 0.7198429960749019, + "grad_norm": 4.39866304397583, + "learning_rate": 4.412637850708817e-06, + "loss": 1.0359, + "step": 57586 + }, + { + "epoch": 0.7198679966999175, + "grad_norm": 0.0002855588390957564, + "learning_rate": 4.411914111462134e-06, + "loss": 0.0006, + "step": 57588 + }, + { + "epoch": 0.7198929973249332, + "grad_norm": 2.1595661640167236, + "learning_rate": 4.4111904147735244e-06, + "loss": 1.6819, + "step": 57590 + }, + { + "epoch": 0.7199179979499487, + "grad_norm": 2.3192358016967773, + "learning_rate": 4.4104667606484944e-06, + "loss": 0.5025, + "step": 57592 + }, + { + "epoch": 0.7199429985749644, + "grad_norm": 3.1980526447296143, + "learning_rate": 4.40974314909256e-06, + "loss": 1.2075, + "step": 57594 + }, + { + "epoch": 0.71996799919998, + "grad_norm": 4.801733493804932, + "learning_rate": 4.409019580111235e-06, + "loss": 1.6029, + "step": 57596 + }, + { + "epoch": 0.7199929998249956, + "grad_norm": 0.00081439787754789, + "learning_rate": 4.408296053710026e-06, + "loss": 0.5362, + "step": 57598 + }, + { + "epoch": 0.7200180004500113, + "grad_norm": 6.497045993804932, + "learning_rate": 4.407572569894443e-06, + "loss": 1.7715, + "step": 57600 + }, + { + "epoch": 0.7200430010750268, + "grad_norm": 4.189678192138672, + "learning_rate": 4.406849128669993e-06, + "loss": 0.7933, + "step": 57602 + }, + { + "epoch": 0.7200680017000425, + "grad_norm": 2.8394956588745117, + "learning_rate": 4.40612573004219e-06, + "loss": 1.2113, + "step": 57604 + }, + { + "epoch": 0.7200930023250581, + "grad_norm": 0.6624743938446045, + "learning_rate": 4.405402374016546e-06, + "loss": 0.1813, + "step": 57606 + }, + { + "epoch": 0.7201180029500738, + "grad_norm": 2.2498068809509277, + "learning_rate": 4.404679060598561e-06, + "loss": 0.9506, + "step": 57608 + }, + { + "epoch": 0.7201430035750894, + "grad_norm": 2.3499844074249268, + "learning_rate": 4.403955789793755e-06, + "loss": 0.4482, + "step": 57610 + }, + { + "epoch": 0.720168004200105, + "grad_norm": 4.204911231994629, + "learning_rate": 4.403232561607629e-06, + "loss": 1.048, + "step": 57612 + }, + { + "epoch": 0.7201930048251206, + "grad_norm": 1.5504180192947388, + "learning_rate": 4.402509376045691e-06, + "loss": 1.8089, + "step": 57614 + }, + { + "epoch": 0.7202180054501363, + "grad_norm": 4.833550930023193, + "learning_rate": 4.4017862331134485e-06, + "loss": 1.4783, + "step": 57616 + }, + { + "epoch": 0.7202430060751519, + "grad_norm": 2.233384370803833, + "learning_rate": 4.40106313281641e-06, + "loss": 0.8319, + "step": 57618 + }, + { + "epoch": 0.7202680067001676, + "grad_norm": 1.962164044380188, + "learning_rate": 4.4003400751600865e-06, + "loss": 1.13, + "step": 57620 + }, + { + "epoch": 0.7202930073251831, + "grad_norm": 2.667475938796997, + "learning_rate": 4.399617060149976e-06, + "loss": 0.4677, + "step": 57622 + }, + { + "epoch": 0.7203180079501987, + "grad_norm": 7.470390796661377, + "learning_rate": 4.3988940877915995e-06, + "loss": 2.2943, + "step": 57624 + }, + { + "epoch": 0.7203430085752144, + "grad_norm": 6.618129730224609, + "learning_rate": 4.398171158090445e-06, + "loss": 2.4245, + "step": 57626 + }, + { + "epoch": 0.72036800920023, + "grad_norm": 3.3510677814483643, + "learning_rate": 4.3974482710520314e-06, + "loss": 0.2646, + "step": 57628 + }, + { + "epoch": 0.7203930098252457, + "grad_norm": 3.2460124492645264, + "learning_rate": 4.396725426681855e-06, + "loss": 0.442, + "step": 57630 + }, + { + "epoch": 0.7204180104502612, + "grad_norm": 2.626192569732666, + "learning_rate": 4.3960026249854256e-06, + "loss": 0.5845, + "step": 57632 + }, + { + "epoch": 0.7204430110752769, + "grad_norm": 5.809661388397217, + "learning_rate": 4.395279865968252e-06, + "loss": 1.7181, + "step": 57634 + }, + { + "epoch": 0.7204680117002925, + "grad_norm": 0.0009776868391782045, + "learning_rate": 4.39455714963583e-06, + "loss": 0.2838, + "step": 57636 + }, + { + "epoch": 0.7204930123253082, + "grad_norm": 5.785306930541992, + "learning_rate": 4.3938344759936755e-06, + "loss": 1.303, + "step": 57638 + }, + { + "epoch": 0.7205180129503238, + "grad_norm": 7.733002662658691, + "learning_rate": 4.393111845047278e-06, + "loss": 0.7931, + "step": 57640 + }, + { + "epoch": 0.7205430135753393, + "grad_norm": 2.4120051860809326, + "learning_rate": 4.392389256802147e-06, + "loss": 0.056, + "step": 57642 + }, + { + "epoch": 0.720568014200355, + "grad_norm": 3.594229221343994, + "learning_rate": 4.3916667112637905e-06, + "loss": 1.6684, + "step": 57644 + }, + { + "epoch": 0.7205930148253706, + "grad_norm": 4.640929698944092, + "learning_rate": 4.390944208437703e-06, + "loss": 1.0176, + "step": 57646 + }, + { + "epoch": 0.7206180154503863, + "grad_norm": 1.0735793113708496, + "learning_rate": 4.390221748329396e-06, + "loss": 0.8602, + "step": 57648 + }, + { + "epoch": 0.7206430160754019, + "grad_norm": 1.2598648071289062, + "learning_rate": 4.3894993309443655e-06, + "loss": 0.102, + "step": 57650 + }, + { + "epoch": 0.7206680167004175, + "grad_norm": 3.712261438369751, + "learning_rate": 4.388776956288117e-06, + "loss": 1.5159, + "step": 57652 + }, + { + "epoch": 0.7206930173254331, + "grad_norm": 4.383984088897705, + "learning_rate": 4.388054624366144e-06, + "loss": 1.1443, + "step": 57654 + }, + { + "epoch": 0.7207180179504488, + "grad_norm": 2.985962390899658, + "learning_rate": 4.387332335183954e-06, + "loss": 1.0916, + "step": 57656 + }, + { + "epoch": 0.7207430185754644, + "grad_norm": 4.238259792327881, + "learning_rate": 4.386610088747051e-06, + "loss": 1.5292, + "step": 57658 + }, + { + "epoch": 0.7207680192004801, + "grad_norm": 3.7092390060424805, + "learning_rate": 4.385887885060928e-06, + "loss": 1.6355, + "step": 57660 + }, + { + "epoch": 0.7207930198254956, + "grad_norm": 5.125227451324463, + "learning_rate": 4.385165724131092e-06, + "loss": 2.2271, + "step": 57662 + }, + { + "epoch": 0.7208180204505112, + "grad_norm": 0.3735237419605255, + "learning_rate": 4.3844436059630405e-06, + "loss": 0.5229, + "step": 57664 + }, + { + "epoch": 0.7208430210755269, + "grad_norm": 0.0019309030612930655, + "learning_rate": 4.383721530562268e-06, + "loss": 1.092, + "step": 57666 + }, + { + "epoch": 0.7208680217005425, + "grad_norm": 2.6333255767822266, + "learning_rate": 4.382999497934283e-06, + "loss": 1.4515, + "step": 57668 + }, + { + "epoch": 0.7208930223255582, + "grad_norm": 0.6914471387863159, + "learning_rate": 4.382277508084575e-06, + "loss": 0.8911, + "step": 57670 + }, + { + "epoch": 0.7209180229505737, + "grad_norm": 2.7610273361206055, + "learning_rate": 4.381555561018651e-06, + "loss": 0.625, + "step": 57672 + }, + { + "epoch": 0.7209430235755894, + "grad_norm": 3.2042460441589355, + "learning_rate": 4.380833656742001e-06, + "loss": 0.2841, + "step": 57674 + }, + { + "epoch": 0.720968024200605, + "grad_norm": 4.253652095794678, + "learning_rate": 4.380111795260136e-06, + "loss": 0.5116, + "step": 57676 + }, + { + "epoch": 0.7209930248256207, + "grad_norm": 3.6667447090148926, + "learning_rate": 4.379389976578536e-06, + "loss": 1.8276, + "step": 57678 + }, + { + "epoch": 0.7210180254506363, + "grad_norm": 3.074178695678711, + "learning_rate": 4.378668200702708e-06, + "loss": 0.7774, + "step": 57680 + }, + { + "epoch": 0.7210430260756518, + "grad_norm": 3.961189031600952, + "learning_rate": 4.377946467638151e-06, + "loss": 0.8647, + "step": 57682 + }, + { + "epoch": 0.7210680267006675, + "grad_norm": 2.748753547668457, + "learning_rate": 4.377224777390356e-06, + "loss": 1.8771, + "step": 57684 + }, + { + "epoch": 0.7210930273256831, + "grad_norm": 0.019997818395495415, + "learning_rate": 4.376503129964825e-06, + "loss": 0.5475, + "step": 57686 + }, + { + "epoch": 0.7211180279506988, + "grad_norm": 7.769643306732178, + "learning_rate": 4.375781525367048e-06, + "loss": 1.8878, + "step": 57688 + }, + { + "epoch": 0.7211430285757144, + "grad_norm": 10.201554298400879, + "learning_rate": 4.375059963602526e-06, + "loss": 0.7109, + "step": 57690 + }, + { + "epoch": 0.72116802920073, + "grad_norm": 3.7011148929595947, + "learning_rate": 4.374338444676752e-06, + "loss": 1.7282, + "step": 57692 + }, + { + "epoch": 0.7211930298257456, + "grad_norm": 4.104349613189697, + "learning_rate": 4.373616968595218e-06, + "loss": 0.8492, + "step": 57694 + }, + { + "epoch": 0.7212180304507613, + "grad_norm": 3.2617740631103516, + "learning_rate": 4.372895535363425e-06, + "loss": 2.3744, + "step": 57696 + }, + { + "epoch": 0.7212430310757769, + "grad_norm": 3.13559889793396, + "learning_rate": 4.372174144986861e-06, + "loss": 1.4972, + "step": 57698 + }, + { + "epoch": 0.7212680317007926, + "grad_norm": 6.875464916229248, + "learning_rate": 4.371452797471025e-06, + "loss": 0.9166, + "step": 57700 + }, + { + "epoch": 0.7212930323258081, + "grad_norm": 3.3217811584472656, + "learning_rate": 4.370731492821409e-06, + "loss": 0.234, + "step": 57702 + }, + { + "epoch": 0.7213180329508238, + "grad_norm": 3.4206714630126953, + "learning_rate": 4.370010231043502e-06, + "loss": 0.4108, + "step": 57704 + }, + { + "epoch": 0.7213430335758394, + "grad_norm": 3.8588924407958984, + "learning_rate": 4.369289012142806e-06, + "loss": 1.0268, + "step": 57706 + }, + { + "epoch": 0.721368034200855, + "grad_norm": 5.478219985961914, + "learning_rate": 4.368567836124804e-06, + "loss": 1.5882, + "step": 57708 + }, + { + "epoch": 0.7213930348258707, + "grad_norm": 1.7821224927902222, + "learning_rate": 4.367846702994997e-06, + "loss": 0.7946, + "step": 57710 + }, + { + "epoch": 0.7214180354508862, + "grad_norm": 0.04152142256498337, + "learning_rate": 4.367125612758868e-06, + "loss": 0.0004, + "step": 57712 + }, + { + "epoch": 0.7214430360759019, + "grad_norm": 0.000576506950892508, + "learning_rate": 4.366404565421919e-06, + "loss": 0.5791, + "step": 57714 + }, + { + "epoch": 0.7214680367009175, + "grad_norm": 2.045095682144165, + "learning_rate": 4.365683560989635e-06, + "loss": 1.0417, + "step": 57716 + }, + { + "epoch": 0.7214930373259332, + "grad_norm": 5.090242385864258, + "learning_rate": 4.364962599467506e-06, + "loss": 2.1432, + "step": 57718 + }, + { + "epoch": 0.7215180379509488, + "grad_norm": 3.79295015335083, + "learning_rate": 4.364241680861028e-06, + "loss": 0.65, + "step": 57720 + }, + { + "epoch": 0.7215430385759644, + "grad_norm": 3.0148468017578125, + "learning_rate": 4.363520805175684e-06, + "loss": 0.5897, + "step": 57722 + }, + { + "epoch": 0.72156803920098, + "grad_norm": 2.2924296855926514, + "learning_rate": 4.362799972416973e-06, + "loss": 0.2184, + "step": 57724 + }, + { + "epoch": 0.7215930398259957, + "grad_norm": 0.002571091754361987, + "learning_rate": 4.362079182590376e-06, + "loss": 0.6662, + "step": 57726 + }, + { + "epoch": 0.7216180404510113, + "grad_norm": 2.897794246673584, + "learning_rate": 4.36135843570139e-06, + "loss": 1.2291, + "step": 57728 + }, + { + "epoch": 0.721643041076027, + "grad_norm": 6.785633087158203, + "learning_rate": 4.360637731755502e-06, + "loss": 0.4139, + "step": 57730 + }, + { + "epoch": 0.7216680417010425, + "grad_norm": 3.2593603134155273, + "learning_rate": 4.359917070758195e-06, + "loss": 0.848, + "step": 57732 + }, + { + "epoch": 0.7216930423260581, + "grad_norm": 6.434744834899902, + "learning_rate": 4.359196452714967e-06, + "loss": 1.0327, + "step": 57734 + }, + { + "epoch": 0.7217180429510738, + "grad_norm": 0.0034187324345111847, + "learning_rate": 4.358475877631295e-06, + "loss": 0.0, + "step": 57736 + }, + { + "epoch": 0.7217430435760894, + "grad_norm": 3.7714688777923584, + "learning_rate": 4.357755345512679e-06, + "loss": 0.5805, + "step": 57738 + }, + { + "epoch": 0.7217680442011051, + "grad_norm": 1.9719581604003906, + "learning_rate": 4.357034856364596e-06, + "loss": 0.3005, + "step": 57740 + }, + { + "epoch": 0.7217930448261206, + "grad_norm": 5.339937210083008, + "learning_rate": 4.35631441019254e-06, + "loss": 2.0827, + "step": 57742 + }, + { + "epoch": 0.7218180454511363, + "grad_norm": 8.096475601196289, + "learning_rate": 4.355594007001997e-06, + "loss": 1.3165, + "step": 57744 + }, + { + "epoch": 0.7218430460761519, + "grad_norm": 3.004638671875, + "learning_rate": 4.354873646798448e-06, + "loss": 1.4584, + "step": 57746 + }, + { + "epoch": 0.7218680467011676, + "grad_norm": 2.521432638168335, + "learning_rate": 4.354153329587386e-06, + "loss": 0.9396, + "step": 57748 + }, + { + "epoch": 0.7218930473261832, + "grad_norm": 0.030164437368512154, + "learning_rate": 4.35343305537429e-06, + "loss": 0.0005, + "step": 57750 + }, + { + "epoch": 0.7219180479511987, + "grad_norm": 3.101323127746582, + "learning_rate": 4.352712824164653e-06, + "loss": 1.1792, + "step": 57752 + }, + { + "epoch": 0.7219430485762144, + "grad_norm": 0.004297820385545492, + "learning_rate": 4.351992635963958e-06, + "loss": 1.153, + "step": 57754 + }, + { + "epoch": 0.72196804920123, + "grad_norm": 2.333364486694336, + "learning_rate": 4.351272490777682e-06, + "loss": 1.603, + "step": 57756 + }, + { + "epoch": 0.7219930498262457, + "grad_norm": 4.529886245727539, + "learning_rate": 4.350552388611322e-06, + "loss": 1.2249, + "step": 57758 + }, + { + "epoch": 0.7220180504512613, + "grad_norm": 4.107892990112305, + "learning_rate": 4.349832329470352e-06, + "loss": 0.7302, + "step": 57760 + }, + { + "epoch": 0.7220430510762769, + "grad_norm": 4.3165059089660645, + "learning_rate": 4.349112313360263e-06, + "loss": 2.3601, + "step": 57762 + }, + { + "epoch": 0.7220680517012925, + "grad_norm": 2.05533766746521, + "learning_rate": 4.348392340286532e-06, + "loss": 0.1321, + "step": 57764 + }, + { + "epoch": 0.7220930523263082, + "grad_norm": 0.002328180940821767, + "learning_rate": 4.34767241025465e-06, + "loss": 0.7128, + "step": 57766 + }, + { + "epoch": 0.7221180529513238, + "grad_norm": 2.5610146522521973, + "learning_rate": 4.346952523270095e-06, + "loss": 0.702, + "step": 57768 + }, + { + "epoch": 0.7221430535763395, + "grad_norm": 4.8134846687316895, + "learning_rate": 4.346232679338347e-06, + "loss": 0.4808, + "step": 57770 + }, + { + "epoch": 0.722168054201355, + "grad_norm": 3.3389604091644287, + "learning_rate": 4.3455128784648945e-06, + "loss": 1.0141, + "step": 57772 + }, + { + "epoch": 0.7221930548263706, + "grad_norm": 5.4948906898498535, + "learning_rate": 4.344793120655213e-06, + "loss": 0.8764, + "step": 57774 + }, + { + "epoch": 0.7222180554513863, + "grad_norm": 7.4866228103637695, + "learning_rate": 4.34407340591479e-06, + "loss": 2.1389, + "step": 57776 + }, + { + "epoch": 0.7222430560764019, + "grad_norm": 2.059258460998535, + "learning_rate": 4.343353734249101e-06, + "loss": 0.4828, + "step": 57778 + }, + { + "epoch": 0.7222680567014176, + "grad_norm": 6.266711235046387, + "learning_rate": 4.342634105663636e-06, + "loss": 0.9514, + "step": 57780 + }, + { + "epoch": 0.7222930573264331, + "grad_norm": 5.071078300476074, + "learning_rate": 4.341914520163867e-06, + "loss": 1.991, + "step": 57782 + }, + { + "epoch": 0.7223180579514488, + "grad_norm": 1.0761661529541016, + "learning_rate": 4.341194977755274e-06, + "loss": 0.8149, + "step": 57784 + }, + { + "epoch": 0.7223430585764644, + "grad_norm": 3.608071804046631, + "learning_rate": 4.340475478443343e-06, + "loss": 1.6494, + "step": 57786 + }, + { + "epoch": 0.7223680592014801, + "grad_norm": 4.833410263061523, + "learning_rate": 4.339756022233548e-06, + "loss": 1.4822, + "step": 57788 + }, + { + "epoch": 0.7223930598264957, + "grad_norm": 4.961493015289307, + "learning_rate": 4.339036609131373e-06, + "loss": 1.4939, + "step": 57790 + }, + { + "epoch": 0.7224180604515112, + "grad_norm": 2.84480881690979, + "learning_rate": 4.338317239142291e-06, + "loss": 0.3246, + "step": 57792 + }, + { + "epoch": 0.7224430610765269, + "grad_norm": 3.7135510444641113, + "learning_rate": 4.337597912271789e-06, + "loss": 1.7957, + "step": 57794 + }, + { + "epoch": 0.7224680617015425, + "grad_norm": 3.803863525390625, + "learning_rate": 4.336878628525339e-06, + "loss": 0.6798, + "step": 57796 + }, + { + "epoch": 0.7224930623265582, + "grad_norm": 4.170661449432373, + "learning_rate": 4.3361593879084185e-06, + "loss": 1.1857, + "step": 57798 + }, + { + "epoch": 0.7225180629515738, + "grad_norm": 3.9885482788085938, + "learning_rate": 4.335440190426511e-06, + "loss": 1.4446, + "step": 57800 + }, + { + "epoch": 0.7225430635765894, + "grad_norm": 2.571748733520508, + "learning_rate": 4.334721036085085e-06, + "loss": 0.8391, + "step": 57802 + }, + { + "epoch": 0.722568064201605, + "grad_norm": 0.4108163118362427, + "learning_rate": 4.334001924889627e-06, + "loss": 0.5047, + "step": 57804 + }, + { + "epoch": 0.7225930648266207, + "grad_norm": 5.109845161437988, + "learning_rate": 4.333282856845604e-06, + "loss": 1.5126, + "step": 57806 + }, + { + "epoch": 0.7226180654516363, + "grad_norm": 2.792894124984741, + "learning_rate": 4.332563831958503e-06, + "loss": 0.6136, + "step": 57808 + }, + { + "epoch": 0.722643066076652, + "grad_norm": 6.848844528198242, + "learning_rate": 4.331844850233794e-06, + "loss": 2.8268, + "step": 57810 + }, + { + "epoch": 0.7226680667016675, + "grad_norm": 3.7103075981140137, + "learning_rate": 4.331125911676949e-06, + "loss": 1.1099, + "step": 57812 + }, + { + "epoch": 0.7226930673266831, + "grad_norm": 7.913585662841797, + "learning_rate": 4.3304070162934504e-06, + "loss": 2.1169, + "step": 57814 + }, + { + "epoch": 0.7227180679516988, + "grad_norm": 0.4747733771800995, + "learning_rate": 4.329688164088767e-06, + "loss": 0.8549, + "step": 57816 + }, + { + "epoch": 0.7227430685767144, + "grad_norm": 4.506714344024658, + "learning_rate": 4.328969355068381e-06, + "loss": 2.1232, + "step": 57818 + }, + { + "epoch": 0.7227680692017301, + "grad_norm": 2.995043992996216, + "learning_rate": 4.328250589237761e-06, + "loss": 1.0681, + "step": 57820 + }, + { + "epoch": 0.7227930698267456, + "grad_norm": 2.4397709369659424, + "learning_rate": 4.32753186660238e-06, + "loss": 0.5927, + "step": 57822 + }, + { + "epoch": 0.7228180704517613, + "grad_norm": 0.0008550735656172037, + "learning_rate": 4.326813187167718e-06, + "loss": 0.0, + "step": 57824 + }, + { + "epoch": 0.7228430710767769, + "grad_norm": 3.7072057723999023, + "learning_rate": 4.326094550939241e-06, + "loss": 0.9968, + "step": 57826 + }, + { + "epoch": 0.7228680717017926, + "grad_norm": 4.135404586791992, + "learning_rate": 4.325375957922427e-06, + "loss": 1.2088, + "step": 57828 + }, + { + "epoch": 0.7228930723268082, + "grad_norm": 3.5749082565307617, + "learning_rate": 4.324657408122747e-06, + "loss": 0.9753, + "step": 57830 + }, + { + "epoch": 0.7229180729518238, + "grad_norm": 2.977233409881592, + "learning_rate": 4.3239389015456745e-06, + "loss": 0.8493, + "step": 57832 + }, + { + "epoch": 0.7229430735768394, + "grad_norm": 7.758374214172363, + "learning_rate": 4.323220438196681e-06, + "loss": 1.7271, + "step": 57834 + }, + { + "epoch": 0.722968074201855, + "grad_norm": 10.169515609741211, + "learning_rate": 4.322502018081236e-06, + "loss": 0.2613, + "step": 57836 + }, + { + "epoch": 0.7229930748268707, + "grad_norm": 5.165036201477051, + "learning_rate": 4.321783641204815e-06, + "loss": 0.0563, + "step": 57838 + }, + { + "epoch": 0.7230180754518863, + "grad_norm": 4.3250226974487305, + "learning_rate": 4.321065307572884e-06, + "loss": 0.6118, + "step": 57840 + }, + { + "epoch": 0.7230430760769019, + "grad_norm": 3.0916099548339844, + "learning_rate": 4.320347017190919e-06, + "loss": 2.2838, + "step": 57842 + }, + { + "epoch": 0.7230680767019175, + "grad_norm": 3.196150779724121, + "learning_rate": 4.319628770064385e-06, + "loss": 0.9447, + "step": 57844 + }, + { + "epoch": 0.7230930773269332, + "grad_norm": 6.751786231994629, + "learning_rate": 4.318910566198759e-06, + "loss": 1.6796, + "step": 57846 + }, + { + "epoch": 0.7231180779519488, + "grad_norm": 2.5042309761047363, + "learning_rate": 4.318192405599505e-06, + "loss": 1.1183, + "step": 57848 + }, + { + "epoch": 0.7231430785769645, + "grad_norm": 4.1819939613342285, + "learning_rate": 4.317474288272092e-06, + "loss": 1.1369, + "step": 57850 + }, + { + "epoch": 0.72316807920198, + "grad_norm": 2.8875365257263184, + "learning_rate": 4.316756214221995e-06, + "loss": 1.5046, + "step": 57852 + }, + { + "epoch": 0.7231930798269957, + "grad_norm": 5.648613452911377, + "learning_rate": 4.3160381834546745e-06, + "loss": 0.8821, + "step": 57854 + }, + { + "epoch": 0.7232180804520113, + "grad_norm": 3.7026822566986084, + "learning_rate": 4.315320195975609e-06, + "loss": 0.9664, + "step": 57856 + }, + { + "epoch": 0.723243081077027, + "grad_norm": 2.1310667991638184, + "learning_rate": 4.3146022517902554e-06, + "loss": 0.9535, + "step": 57858 + }, + { + "epoch": 0.7232680817020426, + "grad_norm": 0.0007050128187984228, + "learning_rate": 4.3138843509040904e-06, + "loss": 0.7474, + "step": 57860 + }, + { + "epoch": 0.7232930823270581, + "grad_norm": 3.850987672805786, + "learning_rate": 4.313166493322579e-06, + "loss": 1.265, + "step": 57862 + }, + { + "epoch": 0.7233180829520738, + "grad_norm": 4.809805870056152, + "learning_rate": 4.312448679051184e-06, + "loss": 0.7263, + "step": 57864 + }, + { + "epoch": 0.7233430835770894, + "grad_norm": 3.371648073196411, + "learning_rate": 4.31173090809538e-06, + "loss": 3.0583, + "step": 57866 + }, + { + "epoch": 0.7233680842021051, + "grad_norm": 3.4895660877227783, + "learning_rate": 4.311013180460624e-06, + "loss": 1.2817, + "step": 57868 + }, + { + "epoch": 0.7233930848271207, + "grad_norm": 0.018816759809851646, + "learning_rate": 4.31029549615239e-06, + "loss": 0.1657, + "step": 57870 + }, + { + "epoch": 0.7234180854521363, + "grad_norm": 1.2008655071258545, + "learning_rate": 4.309577855176142e-06, + "loss": 1.3061, + "step": 57872 + }, + { + "epoch": 0.7234430860771519, + "grad_norm": 1.6959365606307983, + "learning_rate": 4.3088602575373404e-06, + "loss": 0.1606, + "step": 57874 + }, + { + "epoch": 0.7234680867021676, + "grad_norm": 5.131625652313232, + "learning_rate": 4.308142703241459e-06, + "loss": 1.8284, + "step": 57876 + }, + { + "epoch": 0.7234930873271832, + "grad_norm": 4.290126800537109, + "learning_rate": 4.307425192293954e-06, + "loss": 0.8787, + "step": 57878 + }, + { + "epoch": 0.7235180879521989, + "grad_norm": 3.8950650691986084, + "learning_rate": 4.306707724700297e-06, + "loss": 1.0995, + "step": 57880 + }, + { + "epoch": 0.7235430885772144, + "grad_norm": 4.522661209106445, + "learning_rate": 4.305990300465944e-06, + "loss": 0.7366, + "step": 57882 + }, + { + "epoch": 0.72356808920223, + "grad_norm": 4.034261226654053, + "learning_rate": 4.305272919596369e-06, + "loss": 1.1639, + "step": 57884 + }, + { + "epoch": 0.7235930898272457, + "grad_norm": 4.002914905548096, + "learning_rate": 4.304555582097029e-06, + "loss": 1.0504, + "step": 57886 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 4.054551124572754, + "learning_rate": 4.3038382879733856e-06, + "loss": 1.2052, + "step": 57888 + }, + { + "epoch": 0.723643091077277, + "grad_norm": 0.003769134171307087, + "learning_rate": 4.3031210372309074e-06, + "loss": 0.0001, + "step": 57890 + }, + { + "epoch": 0.7236680917022925, + "grad_norm": 3.18009090423584, + "learning_rate": 4.302403829875049e-06, + "loss": 0.6955, + "step": 57892 + }, + { + "epoch": 0.7236930923273082, + "grad_norm": 6.148874282836914, + "learning_rate": 4.301686665911283e-06, + "loss": 1.4913, + "step": 57894 + }, + { + "epoch": 0.7237180929523238, + "grad_norm": 4.172973155975342, + "learning_rate": 4.300969545345061e-06, + "loss": 0.7713, + "step": 57896 + }, + { + "epoch": 0.7237430935773395, + "grad_norm": 3.4461538791656494, + "learning_rate": 4.300252468181852e-06, + "loss": 0.4714, + "step": 57898 + }, + { + "epoch": 0.7237680942023551, + "grad_norm": 2.668632984161377, + "learning_rate": 4.299535434427116e-06, + "loss": 0.1061, + "step": 57900 + }, + { + "epoch": 0.7237930948273706, + "grad_norm": 4.076539039611816, + "learning_rate": 4.2988184440863065e-06, + "loss": 0.1967, + "step": 57902 + }, + { + "epoch": 0.7238180954523863, + "grad_norm": 0.0006176471943035722, + "learning_rate": 4.298101497164894e-06, + "loss": 0.0973, + "step": 57904 + }, + { + "epoch": 0.7238430960774019, + "grad_norm": 2.6244332790374756, + "learning_rate": 4.29738459366833e-06, + "loss": 1.3633, + "step": 57906 + }, + { + "epoch": 0.7238680967024176, + "grad_norm": 1.4129360914230347, + "learning_rate": 4.2966677336020825e-06, + "loss": 0.0405, + "step": 57908 + }, + { + "epoch": 0.7238930973274332, + "grad_norm": 2.160524845123291, + "learning_rate": 4.295950916971604e-06, + "loss": 0.416, + "step": 57910 + }, + { + "epoch": 0.7239180979524488, + "grad_norm": 1.7201263904571533, + "learning_rate": 4.295234143782359e-06, + "loss": 1.7273, + "step": 57912 + }, + { + "epoch": 0.7239430985774644, + "grad_norm": 4.055148601531982, + "learning_rate": 4.294517414039805e-06, + "loss": 1.1882, + "step": 57914 + }, + { + "epoch": 0.7239680992024801, + "grad_norm": 2.2162063121795654, + "learning_rate": 4.293800727749396e-06, + "loss": 1.5446, + "step": 57916 + }, + { + "epoch": 0.7239930998274957, + "grad_norm": 1.5552546977996826, + "learning_rate": 4.293084084916596e-06, + "loss": 0.0368, + "step": 57918 + }, + { + "epoch": 0.7240181004525114, + "grad_norm": 5.147934436798096, + "learning_rate": 4.292367485546859e-06, + "loss": 1.6878, + "step": 57920 + }, + { + "epoch": 0.7240431010775269, + "grad_norm": 4.407673358917236, + "learning_rate": 4.291650929645643e-06, + "loss": 0.5608, + "step": 57922 + }, + { + "epoch": 0.7240681017025425, + "grad_norm": 6.954507827758789, + "learning_rate": 4.290934417218414e-06, + "loss": 2.1873, + "step": 57924 + }, + { + "epoch": 0.7240931023275582, + "grad_norm": 0.004902060143649578, + "learning_rate": 4.290217948270614e-06, + "loss": 0.5303, + "step": 57926 + }, + { + "epoch": 0.7241181029525738, + "grad_norm": 0.0010594271589070559, + "learning_rate": 4.28950152280771e-06, + "loss": 0.9298, + "step": 57928 + }, + { + "epoch": 0.7241431035775895, + "grad_norm": 3.4745702743530273, + "learning_rate": 4.288785140835151e-06, + "loss": 1.4389, + "step": 57930 + }, + { + "epoch": 0.724168104202605, + "grad_norm": 3.009385824203491, + "learning_rate": 4.288068802358401e-06, + "loss": 0.6232, + "step": 57932 + }, + { + "epoch": 0.7241931048276207, + "grad_norm": 3.424051284790039, + "learning_rate": 4.287352507382906e-06, + "loss": 0.3153, + "step": 57934 + }, + { + "epoch": 0.7242181054526363, + "grad_norm": 3.9429008960723877, + "learning_rate": 4.286636255914127e-06, + "loss": 1.4594, + "step": 57936 + }, + { + "epoch": 0.724243106077652, + "grad_norm": 3.731274127960205, + "learning_rate": 4.285920047957527e-06, + "loss": 0.394, + "step": 57938 + }, + { + "epoch": 0.7242681067026676, + "grad_norm": 2.4547784328460693, + "learning_rate": 4.285203883518544e-06, + "loss": 0.4483, + "step": 57940 + }, + { + "epoch": 0.7242931073276831, + "grad_norm": 5.308531284332275, + "learning_rate": 4.2844877626026424e-06, + "loss": 1.0451, + "step": 57942 + }, + { + "epoch": 0.7243181079526988, + "grad_norm": 3.477008819580078, + "learning_rate": 4.283771685215272e-06, + "loss": 0.4492, + "step": 57944 + }, + { + "epoch": 0.7243431085777144, + "grad_norm": 8.144590377807617, + "learning_rate": 4.2830556513618905e-06, + "loss": 2.2743, + "step": 57946 + }, + { + "epoch": 0.7243681092027301, + "grad_norm": 4.696617126464844, + "learning_rate": 4.2823396610479466e-06, + "loss": 1.1649, + "step": 57948 + }, + { + "epoch": 0.7243931098277457, + "grad_norm": 3.431159257888794, + "learning_rate": 4.281623714278897e-06, + "loss": 1.6945, + "step": 57950 + }, + { + "epoch": 0.7244181104527613, + "grad_norm": 2.9388418197631836, + "learning_rate": 4.280907811060193e-06, + "loss": 1.2946, + "step": 57952 + }, + { + "epoch": 0.7244431110777769, + "grad_norm": 3.9559333324432373, + "learning_rate": 4.280191951397283e-06, + "loss": 1.124, + "step": 57954 + }, + { + "epoch": 0.7244681117027926, + "grad_norm": 3.4969353675842285, + "learning_rate": 4.279476135295626e-06, + "loss": 0.6694, + "step": 57956 + }, + { + "epoch": 0.7244931123278082, + "grad_norm": 2.645456075668335, + "learning_rate": 4.2787603627606665e-06, + "loss": 0.6047, + "step": 57958 + }, + { + "epoch": 0.7245181129528239, + "grad_norm": 5.35253381729126, + "learning_rate": 4.278044633797858e-06, + "loss": 0.857, + "step": 57960 + }, + { + "epoch": 0.7245431135778394, + "grad_norm": 2.36096453666687, + "learning_rate": 4.277328948412657e-06, + "loss": 0.7616, + "step": 57962 + }, + { + "epoch": 0.724568114202855, + "grad_norm": 1.5858038663864136, + "learning_rate": 4.27661330661051e-06, + "loss": 0.4884, + "step": 57964 + }, + { + "epoch": 0.7245931148278707, + "grad_norm": 0.3623104393482208, + "learning_rate": 4.275897708396865e-06, + "loss": 0.3734, + "step": 57966 + }, + { + "epoch": 0.7246181154528863, + "grad_norm": 3.4393770694732666, + "learning_rate": 4.275182153777171e-06, + "loss": 0.6233, + "step": 57968 + }, + { + "epoch": 0.724643116077902, + "grad_norm": 3.236984968185425, + "learning_rate": 4.2744666427568835e-06, + "loss": 0.7482, + "step": 57970 + }, + { + "epoch": 0.7246681167029175, + "grad_norm": 2.7145497798919678, + "learning_rate": 4.273751175341445e-06, + "loss": 0.1846, + "step": 57972 + }, + { + "epoch": 0.7246931173279332, + "grad_norm": 2.4837806224823, + "learning_rate": 4.273035751536308e-06, + "loss": 1.1201, + "step": 57974 + }, + { + "epoch": 0.7247181179529488, + "grad_norm": 2.472425699234009, + "learning_rate": 4.272320371346928e-06, + "loss": 0.758, + "step": 57976 + }, + { + "epoch": 0.7247431185779645, + "grad_norm": 1.8272510766983032, + "learning_rate": 4.271605034778738e-06, + "loss": 2.2503, + "step": 57978 + }, + { + "epoch": 0.7247681192029801, + "grad_norm": 1.6608136892318726, + "learning_rate": 4.270889741837199e-06, + "loss": 0.3482, + "step": 57980 + }, + { + "epoch": 0.7247931198279957, + "grad_norm": 1.9709932804107666, + "learning_rate": 4.270174492527748e-06, + "loss": 0.1294, + "step": 57982 + }, + { + "epoch": 0.7248181204530113, + "grad_norm": 0.0009273071773350239, + "learning_rate": 4.269459286855839e-06, + "loss": 0.603, + "step": 57984 + }, + { + "epoch": 0.724843121078027, + "grad_norm": 1.8928813934326172, + "learning_rate": 4.268744124826921e-06, + "loss": 0.938, + "step": 57986 + }, + { + "epoch": 0.7248681217030426, + "grad_norm": 2.8361310958862305, + "learning_rate": 4.268029006446435e-06, + "loss": 1.0132, + "step": 57988 + }, + { + "epoch": 0.7248931223280582, + "grad_norm": 3.92110276222229, + "learning_rate": 4.267313931719835e-06, + "loss": 0.8642, + "step": 57990 + }, + { + "epoch": 0.7249181229530738, + "grad_norm": 3.963728904724121, + "learning_rate": 4.266598900652554e-06, + "loss": 0.7442, + "step": 57992 + }, + { + "epoch": 0.7249431235780894, + "grad_norm": 2.6523330211639404, + "learning_rate": 4.2658839132500506e-06, + "loss": 0.62, + "step": 57994 + }, + { + "epoch": 0.7249681242031051, + "grad_norm": 0.3503108322620392, + "learning_rate": 4.265168969517759e-06, + "loss": 0.1945, + "step": 57996 + }, + { + "epoch": 0.7249931248281207, + "grad_norm": 2.5890862941741943, + "learning_rate": 4.26445406946113e-06, + "loss": 0.7999, + "step": 57998 + }, + { + "epoch": 0.7250181254531364, + "grad_norm": 2.5741629600524902, + "learning_rate": 4.2637392130856115e-06, + "loss": 0.893, + "step": 58000 + }, + { + "epoch": 0.7250431260781519, + "grad_norm": 6.919600009918213, + "learning_rate": 4.2630244003966444e-06, + "loss": 1.1814, + "step": 58002 + }, + { + "epoch": 0.7250681267031676, + "grad_norm": 5.972253799438477, + "learning_rate": 4.262309631399671e-06, + "loss": 0.905, + "step": 58004 + }, + { + "epoch": 0.7250931273281832, + "grad_norm": 1.1576001644134521, + "learning_rate": 4.261594906100134e-06, + "loss": 0.6665, + "step": 58006 + }, + { + "epoch": 0.7251181279531989, + "grad_norm": 3.7646422386169434, + "learning_rate": 4.2608802245034785e-06, + "loss": 0.5591, + "step": 58008 + }, + { + "epoch": 0.7251431285782145, + "grad_norm": 2.5253500938415527, + "learning_rate": 4.260165586615151e-06, + "loss": 0.5213, + "step": 58010 + }, + { + "epoch": 0.72516812920323, + "grad_norm": 2.2254762649536133, + "learning_rate": 4.259450992440586e-06, + "loss": 0.6966, + "step": 58012 + }, + { + "epoch": 0.7251931298282457, + "grad_norm": 2.7814953327178955, + "learning_rate": 4.258736441985235e-06, + "loss": 0.7249, + "step": 58014 + }, + { + "epoch": 0.7252181304532613, + "grad_norm": 0.0005175707628950477, + "learning_rate": 4.258021935254535e-06, + "loss": 1.0121, + "step": 58016 + }, + { + "epoch": 0.725243131078277, + "grad_norm": 3.747112989425659, + "learning_rate": 4.2573074722539285e-06, + "loss": 0.427, + "step": 58018 + }, + { + "epoch": 0.7252681317032926, + "grad_norm": 1.8971681594848633, + "learning_rate": 4.256593052988852e-06, + "loss": 0.6763, + "step": 58020 + }, + { + "epoch": 0.7252931323283082, + "grad_norm": 11.822345733642578, + "learning_rate": 4.255878677464751e-06, + "loss": 1.055, + "step": 58022 + }, + { + "epoch": 0.7253181329533238, + "grad_norm": 2.6830222606658936, + "learning_rate": 4.255164345687069e-06, + "loss": 1.3788, + "step": 58024 + }, + { + "epoch": 0.7253431335783395, + "grad_norm": 6.337523937225342, + "learning_rate": 4.254450057661241e-06, + "loss": 1.9931, + "step": 58026 + }, + { + "epoch": 0.7253681342033551, + "grad_norm": 5.699283123016357, + "learning_rate": 4.253735813392713e-06, + "loss": 1.5878, + "step": 58028 + }, + { + "epoch": 0.7253931348283708, + "grad_norm": 2.3486108779907227, + "learning_rate": 4.253021612886915e-06, + "loss": 0.5309, + "step": 58030 + }, + { + "epoch": 0.7254181354533863, + "grad_norm": 3.915623664855957, + "learning_rate": 4.252307456149296e-06, + "loss": 1.7723, + "step": 58032 + }, + { + "epoch": 0.7254431360784019, + "grad_norm": 3.4225306510925293, + "learning_rate": 4.251593343185285e-06, + "loss": 1.4872, + "step": 58034 + }, + { + "epoch": 0.7254681367034176, + "grad_norm": 2.4361605644226074, + "learning_rate": 4.250879274000328e-06, + "loss": 0.4466, + "step": 58036 + }, + { + "epoch": 0.7254931373284332, + "grad_norm": 0.015090033411979675, + "learning_rate": 4.250165248599864e-06, + "loss": 0.1175, + "step": 58038 + }, + { + "epoch": 0.7255181379534489, + "grad_norm": 4.5094895362854, + "learning_rate": 4.249451266989327e-06, + "loss": 1.4069, + "step": 58040 + }, + { + "epoch": 0.7255431385784644, + "grad_norm": 1.2467039823532104, + "learning_rate": 4.24873732917416e-06, + "loss": 0.7355, + "step": 58042 + }, + { + "epoch": 0.7255681392034801, + "grad_norm": 2.7913477420806885, + "learning_rate": 4.2480234351597905e-06, + "loss": 0.6101, + "step": 58044 + }, + { + "epoch": 0.7255931398284957, + "grad_norm": 3.679868459701538, + "learning_rate": 4.247309584951661e-06, + "loss": 0.7701, + "step": 58046 + }, + { + "epoch": 0.7256181404535114, + "grad_norm": 4.186407089233398, + "learning_rate": 4.246595778555211e-06, + "loss": 0.6659, + "step": 58048 + }, + { + "epoch": 0.725643141078527, + "grad_norm": 2.7344985008239746, + "learning_rate": 4.245882015975872e-06, + "loss": 0.4253, + "step": 58050 + }, + { + "epoch": 0.7256681417035425, + "grad_norm": 4.282251834869385, + "learning_rate": 4.245168297219084e-06, + "loss": 1.0041, + "step": 58052 + }, + { + "epoch": 0.7256931423285582, + "grad_norm": 2.5643885135650635, + "learning_rate": 4.244454622290278e-06, + "loss": 0.115, + "step": 58054 + }, + { + "epoch": 0.7257181429535738, + "grad_norm": 3.0506649017333984, + "learning_rate": 4.243740991194898e-06, + "loss": 1.1863, + "step": 58056 + }, + { + "epoch": 0.7257431435785895, + "grad_norm": 5.922696590423584, + "learning_rate": 4.243027403938365e-06, + "loss": 1.3691, + "step": 58058 + }, + { + "epoch": 0.7257681442036051, + "grad_norm": 5.953049182891846, + "learning_rate": 4.2423138605261226e-06, + "loss": 1.4862, + "step": 58060 + }, + { + "epoch": 0.7257931448286207, + "grad_norm": 0.0024407533928751945, + "learning_rate": 4.241600360963607e-06, + "loss": 0.2624, + "step": 58062 + }, + { + "epoch": 0.7258181454536363, + "grad_norm": 4.010163307189941, + "learning_rate": 4.240886905256244e-06, + "loss": 0.4898, + "step": 58064 + }, + { + "epoch": 0.725843146078652, + "grad_norm": 3.841688871383667, + "learning_rate": 4.240173493409476e-06, + "loss": 0.979, + "step": 58066 + }, + { + "epoch": 0.7258681467036676, + "grad_norm": 4.666578769683838, + "learning_rate": 4.239460125428732e-06, + "loss": 1.9183, + "step": 58068 + }, + { + "epoch": 0.7258931473286833, + "grad_norm": 3.920639991760254, + "learning_rate": 4.238746801319442e-06, + "loss": 1.3458, + "step": 58070 + }, + { + "epoch": 0.7259181479536988, + "grad_norm": 4.467203617095947, + "learning_rate": 4.238033521087046e-06, + "loss": 0.991, + "step": 58072 + }, + { + "epoch": 0.7259431485787144, + "grad_norm": 4.6588287353515625, + "learning_rate": 4.2373202847369665e-06, + "loss": 1.3435, + "step": 58074 + }, + { + "epoch": 0.7259681492037301, + "grad_norm": 4.39201545715332, + "learning_rate": 4.236607092274646e-06, + "loss": 1.3044, + "step": 58076 + }, + { + "epoch": 0.7259931498287457, + "grad_norm": 3.740001678466797, + "learning_rate": 4.235893943705506e-06, + "loss": 1.8834, + "step": 58078 + }, + { + "epoch": 0.7260181504537614, + "grad_norm": 3.711672782897949, + "learning_rate": 4.235180839034987e-06, + "loss": 1.1764, + "step": 58080 + }, + { + "epoch": 0.7260431510787769, + "grad_norm": 2.6530306339263916, + "learning_rate": 4.234467778268515e-06, + "loss": 0.2873, + "step": 58082 + }, + { + "epoch": 0.7260681517037926, + "grad_norm": 9.732016563415527, + "learning_rate": 4.233754761411518e-06, + "loss": 1.5599, + "step": 58084 + }, + { + "epoch": 0.7260931523288082, + "grad_norm": 4.287088871002197, + "learning_rate": 4.233041788469432e-06, + "loss": 1.5052, + "step": 58086 + }, + { + "epoch": 0.7261181529538239, + "grad_norm": 3.249420166015625, + "learning_rate": 4.232328859447682e-06, + "loss": 1.4977, + "step": 58088 + }, + { + "epoch": 0.7261431535788395, + "grad_norm": 3.5963993072509766, + "learning_rate": 4.2316159743517025e-06, + "loss": 0.9576, + "step": 58090 + }, + { + "epoch": 0.726168154203855, + "grad_norm": 5.7447991371154785, + "learning_rate": 4.230903133186916e-06, + "loss": 0.7385, + "step": 58092 + }, + { + "epoch": 0.7261931548288707, + "grad_norm": 4.219420909881592, + "learning_rate": 4.230190335958763e-06, + "loss": 2.4602, + "step": 58094 + }, + { + "epoch": 0.7262181554538863, + "grad_norm": 2.7906548976898193, + "learning_rate": 4.229477582672658e-06, + "loss": 0.8588, + "step": 58096 + }, + { + "epoch": 0.726243156078902, + "grad_norm": 5.1983819007873535, + "learning_rate": 4.228764873334035e-06, + "loss": 0.6763, + "step": 58098 + }, + { + "epoch": 0.7262681567039176, + "grad_norm": 1.1460907459259033, + "learning_rate": 4.228052207948326e-06, + "loss": 0.5601, + "step": 58100 + }, + { + "epoch": 0.7262931573289332, + "grad_norm": 0.05018458515405655, + "learning_rate": 4.227339586520952e-06, + "loss": 1.0453, + "step": 58102 + }, + { + "epoch": 0.7263181579539488, + "grad_norm": 2.7439537048339844, + "learning_rate": 4.226627009057347e-06, + "loss": 0.0884, + "step": 58104 + }, + { + "epoch": 0.7263431585789645, + "grad_norm": 0.4092054069042206, + "learning_rate": 4.22591447556293e-06, + "loss": 0.03, + "step": 58106 + }, + { + "epoch": 0.7263681592039801, + "grad_norm": 0.8520744442939758, + "learning_rate": 4.225201986043136e-06, + "loss": 0.2821, + "step": 58108 + }, + { + "epoch": 0.7263931598289958, + "grad_norm": 4.850588321685791, + "learning_rate": 4.224489540503387e-06, + "loss": 0.6914, + "step": 58110 + }, + { + "epoch": 0.7264181604540113, + "grad_norm": 3.950044631958008, + "learning_rate": 4.2237771389491055e-06, + "loss": 0.2589, + "step": 58112 + }, + { + "epoch": 0.726443161079027, + "grad_norm": 6.323301792144775, + "learning_rate": 4.223064781385724e-06, + "loss": 0.7125, + "step": 58114 + }, + { + "epoch": 0.7264681617040426, + "grad_norm": 0.0017879949882626534, + "learning_rate": 4.222352467818661e-06, + "loss": 0.461, + "step": 58116 + }, + { + "epoch": 0.7264931623290583, + "grad_norm": 2.6089041233062744, + "learning_rate": 4.221640198253347e-06, + "loss": 0.3246, + "step": 58118 + }, + { + "epoch": 0.7265181629540739, + "grad_norm": 4.124059200286865, + "learning_rate": 4.220927972695205e-06, + "loss": 0.684, + "step": 58120 + }, + { + "epoch": 0.7265431635790894, + "grad_norm": 27.27193832397461, + "learning_rate": 4.220215791149655e-06, + "loss": 1.8514, + "step": 58122 + }, + { + "epoch": 0.7265681642041051, + "grad_norm": 4.475716590881348, + "learning_rate": 4.219503653622127e-06, + "loss": 1.3133, + "step": 58124 + }, + { + "epoch": 0.7265931648291207, + "grad_norm": 4.8520989418029785, + "learning_rate": 4.218791560118038e-06, + "loss": 1.9302, + "step": 58126 + }, + { + "epoch": 0.7266181654541364, + "grad_norm": 0.0016419771127402782, + "learning_rate": 4.21807951064282e-06, + "loss": 0.6053, + "step": 58128 + }, + { + "epoch": 0.726643166079152, + "grad_norm": 4.448164939880371, + "learning_rate": 4.217367505201886e-06, + "loss": 0.1726, + "step": 58130 + }, + { + "epoch": 0.7266681667041676, + "grad_norm": 4.1735124588012695, + "learning_rate": 4.216655543800666e-06, + "loss": 1.7061, + "step": 58132 + }, + { + "epoch": 0.7266931673291832, + "grad_norm": 5.407140731811523, + "learning_rate": 4.2159436264445816e-06, + "loss": 1.2236, + "step": 58134 + }, + { + "epoch": 0.7267181679541989, + "grad_norm": 2.4810245037078857, + "learning_rate": 4.2152317531390474e-06, + "loss": 1.0748, + "step": 58136 + }, + { + "epoch": 0.7267431685792145, + "grad_norm": 0.921238362789154, + "learning_rate": 4.214519923889495e-06, + "loss": 0.4913, + "step": 58138 + }, + { + "epoch": 0.7267681692042302, + "grad_norm": 5.435069561004639, + "learning_rate": 4.213808138701336e-06, + "loss": 1.3151, + "step": 58140 + }, + { + "epoch": 0.7267931698292457, + "grad_norm": 2.738582134246826, + "learning_rate": 4.213096397580001e-06, + "loss": 0.3379, + "step": 58142 + }, + { + "epoch": 0.7268181704542613, + "grad_norm": 2.659411907196045, + "learning_rate": 4.2123847005309004e-06, + "loss": 0.7122, + "step": 58144 + }, + { + "epoch": 0.726843171079277, + "grad_norm": 0.7187864780426025, + "learning_rate": 4.2116730475594636e-06, + "loss": 0.0259, + "step": 58146 + }, + { + "epoch": 0.7268681717042926, + "grad_norm": 2.7859134674072266, + "learning_rate": 4.210961438671106e-06, + "loss": 1.1896, + "step": 58148 + }, + { + "epoch": 0.7268931723293083, + "grad_norm": 0.9541284441947937, + "learning_rate": 4.210249873871245e-06, + "loss": 0.0485, + "step": 58150 + }, + { + "epoch": 0.7269181729543238, + "grad_norm": 0.0008456585928797722, + "learning_rate": 4.209538353165304e-06, + "loss": 0.5356, + "step": 58152 + }, + { + "epoch": 0.7269431735793395, + "grad_norm": 0.4625234603881836, + "learning_rate": 4.208826876558698e-06, + "loss": 0.6534, + "step": 58154 + }, + { + "epoch": 0.7269681742043551, + "grad_norm": 3.653803825378418, + "learning_rate": 4.2081154440568504e-06, + "loss": 1.5416, + "step": 58156 + }, + { + "epoch": 0.7269931748293708, + "grad_norm": 3.196502447128296, + "learning_rate": 4.207404055665174e-06, + "loss": 0.5012, + "step": 58158 + }, + { + "epoch": 0.7270181754543864, + "grad_norm": 0.0015133069828152657, + "learning_rate": 4.2066927113890924e-06, + "loss": 0.0453, + "step": 58160 + }, + { + "epoch": 0.7270431760794019, + "grad_norm": 2.5076422691345215, + "learning_rate": 4.205981411234019e-06, + "loss": 1.0725, + "step": 58162 + }, + { + "epoch": 0.7270681767044176, + "grad_norm": 0.011747114360332489, + "learning_rate": 4.205270155205369e-06, + "loss": 0.0003, + "step": 58164 + }, + { + "epoch": 0.7270931773294332, + "grad_norm": 5.556085586547852, + "learning_rate": 4.204558943308565e-06, + "loss": 0.8691, + "step": 58166 + }, + { + "epoch": 0.7271181779544489, + "grad_norm": 3.429652214050293, + "learning_rate": 4.203847775549017e-06, + "loss": 0.2562, + "step": 58168 + }, + { + "epoch": 0.7271431785794645, + "grad_norm": 0.12025924772024155, + "learning_rate": 4.2031366519321484e-06, + "loss": 0.9635, + "step": 58170 + }, + { + "epoch": 0.7271681792044801, + "grad_norm": 5.847402095794678, + "learning_rate": 4.2024255724633726e-06, + "loss": 1.8116, + "step": 58172 + }, + { + "epoch": 0.7271931798294957, + "grad_norm": 0.9393474459648132, + "learning_rate": 4.201714537148098e-06, + "loss": 0.5977, + "step": 58174 + }, + { + "epoch": 0.7272181804545114, + "grad_norm": 5.403253078460693, + "learning_rate": 4.201003545991751e-06, + "loss": 0.6818, + "step": 58176 + }, + { + "epoch": 0.727243181079527, + "grad_norm": 3.2498953342437744, + "learning_rate": 4.200292598999736e-06, + "loss": 1.6181, + "step": 58178 + }, + { + "epoch": 0.7272681817045427, + "grad_norm": 0.0005264367791824043, + "learning_rate": 4.199581696177476e-06, + "loss": 0.8466, + "step": 58180 + }, + { + "epoch": 0.7272931823295582, + "grad_norm": 6.909716606140137, + "learning_rate": 4.198870837530379e-06, + "loss": 1.4683, + "step": 58182 + }, + { + "epoch": 0.7273181829545738, + "grad_norm": 0.2105589509010315, + "learning_rate": 4.198160023063864e-06, + "loss": 0.0073, + "step": 58184 + }, + { + "epoch": 0.7273431835795895, + "grad_norm": 2.8295886516571045, + "learning_rate": 4.197449252783342e-06, + "loss": 0.5403, + "step": 58186 + }, + { + "epoch": 0.7273681842046051, + "grad_norm": 5.040171146392822, + "learning_rate": 4.1967385266942225e-06, + "loss": 0.9587, + "step": 58188 + }, + { + "epoch": 0.7273931848296208, + "grad_norm": 3.1907739639282227, + "learning_rate": 4.196027844801925e-06, + "loss": 1.2015, + "step": 58190 + }, + { + "epoch": 0.7274181854546363, + "grad_norm": 4.112447261810303, + "learning_rate": 4.195317207111856e-06, + "loss": 0.9086, + "step": 58192 + }, + { + "epoch": 0.727443186079652, + "grad_norm": 0.0006405087769962847, + "learning_rate": 4.194606613629434e-06, + "loss": 1.5841, + "step": 58194 + }, + { + "epoch": 0.7274681867046676, + "grad_norm": 2.6663460731506348, + "learning_rate": 4.193896064360063e-06, + "loss": 0.5815, + "step": 58196 + }, + { + "epoch": 0.7274931873296833, + "grad_norm": 2.5365850925445557, + "learning_rate": 4.193185559309162e-06, + "loss": 0.7165, + "step": 58198 + }, + { + "epoch": 0.7275181879546989, + "grad_norm": 4.987830638885498, + "learning_rate": 4.192475098482138e-06, + "loss": 0.984, + "step": 58200 + }, + { + "epoch": 0.7275431885797145, + "grad_norm": 0.7363549470901489, + "learning_rate": 4.191764681884399e-06, + "loss": 1.1371, + "step": 58202 + }, + { + "epoch": 0.7275681892047301, + "grad_norm": 2.262960195541382, + "learning_rate": 4.191054309521363e-06, + "loss": 0.766, + "step": 58204 + }, + { + "epoch": 0.7275931898297457, + "grad_norm": 6.310757160186768, + "learning_rate": 4.190343981398433e-06, + "loss": 0.8287, + "step": 58206 + }, + { + "epoch": 0.7276181904547614, + "grad_norm": 4.185176849365234, + "learning_rate": 4.189633697521024e-06, + "loss": 0.922, + "step": 58208 + }, + { + "epoch": 0.727643191079777, + "grad_norm": 4.2532572746276855, + "learning_rate": 4.188923457894539e-06, + "loss": 0.9161, + "step": 58210 + }, + { + "epoch": 0.7276681917047926, + "grad_norm": 1.793131947517395, + "learning_rate": 4.1882132625243964e-06, + "loss": 0.1308, + "step": 58212 + }, + { + "epoch": 0.7276931923298082, + "grad_norm": 5.183750152587891, + "learning_rate": 4.187503111415999e-06, + "loss": 2.0705, + "step": 58214 + }, + { + "epoch": 0.7277181929548239, + "grad_norm": 0.6944376826286316, + "learning_rate": 4.1867930045747516e-06, + "loss": 0.6198, + "step": 58216 + }, + { + "epoch": 0.7277431935798395, + "grad_norm": 3.955118417739868, + "learning_rate": 4.186082942006071e-06, + "loss": 0.8461, + "step": 58218 + }, + { + "epoch": 0.7277681942048552, + "grad_norm": 0.0015469673089683056, + "learning_rate": 4.185372923715357e-06, + "loss": 1.2573, + "step": 58220 + }, + { + "epoch": 0.7277931948298707, + "grad_norm": 5.2963080406188965, + "learning_rate": 4.184662949708025e-06, + "loss": 1.4392, + "step": 58222 + }, + { + "epoch": 0.7278181954548864, + "grad_norm": 3.6782021522521973, + "learning_rate": 4.183953019989476e-06, + "loss": 1.3395, + "step": 58224 + }, + { + "epoch": 0.727843196079902, + "grad_norm": 2.66430926322937, + "learning_rate": 4.183243134565116e-06, + "loss": 2.0032, + "step": 58226 + }, + { + "epoch": 0.7278681967049176, + "grad_norm": 0.0018189487745985389, + "learning_rate": 4.182533293440356e-06, + "loss": 0.8278, + "step": 58228 + }, + { + "epoch": 0.7278931973299333, + "grad_norm": 4.5433573722839355, + "learning_rate": 4.181823496620597e-06, + "loss": 1.8215, + "step": 58230 + }, + { + "epoch": 0.7279181979549488, + "grad_norm": 2.5573360919952393, + "learning_rate": 4.181113744111251e-06, + "loss": 1.6119, + "step": 58232 + }, + { + "epoch": 0.7279431985799645, + "grad_norm": 3.1108195781707764, + "learning_rate": 4.180404035917716e-06, + "loss": 0.6026, + "step": 58234 + }, + { + "epoch": 0.7279681992049801, + "grad_norm": 3.9291412830352783, + "learning_rate": 4.1796943720454055e-06, + "loss": 1.7569, + "step": 58236 + }, + { + "epoch": 0.7279931998299958, + "grad_norm": 0.4055681526660919, + "learning_rate": 4.178984752499719e-06, + "loss": 0.008, + "step": 58238 + }, + { + "epoch": 0.7280182004550114, + "grad_norm": 5.945723533630371, + "learning_rate": 4.178275177286058e-06, + "loss": 1.6821, + "step": 58240 + }, + { + "epoch": 0.728043201080027, + "grad_norm": 2.6377787590026855, + "learning_rate": 4.177565646409832e-06, + "loss": 1.0022, + "step": 58242 + }, + { + "epoch": 0.7280682017050426, + "grad_norm": 4.44045352935791, + "learning_rate": 4.176856159876442e-06, + "loss": 0.9526, + "step": 58244 + }, + { + "epoch": 0.7280932023300583, + "grad_norm": 0.004046217538416386, + "learning_rate": 4.176146717691293e-06, + "loss": 0.0071, + "step": 58246 + }, + { + "epoch": 0.7281182029550739, + "grad_norm": 2.838479518890381, + "learning_rate": 4.1754373198597844e-06, + "loss": 1.1213, + "step": 58248 + }, + { + "epoch": 0.7281432035800895, + "grad_norm": 8.363661766052246, + "learning_rate": 4.174727966387325e-06, + "loss": 2.1006, + "step": 58250 + }, + { + "epoch": 0.7281682042051051, + "grad_norm": 0.6322648525238037, + "learning_rate": 4.174018657279314e-06, + "loss": 0.8419, + "step": 58252 + }, + { + "epoch": 0.7281932048301207, + "grad_norm": 0.0007616687216795981, + "learning_rate": 4.1733093925411496e-06, + "loss": 0.2955, + "step": 58254 + }, + { + "epoch": 0.7282182054551364, + "grad_norm": 0.0065483152866363525, + "learning_rate": 4.172600172178239e-06, + "loss": 0.0847, + "step": 58256 + }, + { + "epoch": 0.728243206080152, + "grad_norm": 2.643498420715332, + "learning_rate": 4.1718909961959795e-06, + "loss": 1.6607, + "step": 58258 + }, + { + "epoch": 0.7282682067051677, + "grad_norm": 3.3148345947265625, + "learning_rate": 4.171181864599777e-06, + "loss": 0.5866, + "step": 58260 + }, + { + "epoch": 0.7282932073301832, + "grad_norm": 1.969035029411316, + "learning_rate": 4.170472777395026e-06, + "loss": 0.4436, + "step": 58262 + }, + { + "epoch": 0.7283182079551989, + "grad_norm": 8.19098949432373, + "learning_rate": 4.169763734587132e-06, + "loss": 1.2512, + "step": 58264 + }, + { + "epoch": 0.7283432085802145, + "grad_norm": 3.498992681503296, + "learning_rate": 4.169054736181495e-06, + "loss": 0.7664, + "step": 58266 + }, + { + "epoch": 0.7283682092052302, + "grad_norm": 0.6123443245887756, + "learning_rate": 4.168345782183507e-06, + "loss": 0.1936, + "step": 58268 + }, + { + "epoch": 0.7283932098302458, + "grad_norm": 0.0013330972287803888, + "learning_rate": 4.167636872598578e-06, + "loss": 0.6084, + "step": 58270 + }, + { + "epoch": 0.7284182104552613, + "grad_norm": 1.978820562362671, + "learning_rate": 4.166928007432098e-06, + "loss": 0.8984, + "step": 58272 + }, + { + "epoch": 0.728443211080277, + "grad_norm": 2.7793939113616943, + "learning_rate": 4.166219186689473e-06, + "loss": 1.9798, + "step": 58274 + }, + { + "epoch": 0.7284682117052926, + "grad_norm": 0.0012495552655309439, + "learning_rate": 4.165510410376097e-06, + "loss": 0.0689, + "step": 58276 + }, + { + "epoch": 0.7284932123303083, + "grad_norm": 0.5333527326583862, + "learning_rate": 4.164801678497366e-06, + "loss": 0.0209, + "step": 58278 + }, + { + "epoch": 0.7285182129553239, + "grad_norm": 4.608140468597412, + "learning_rate": 4.164092991058683e-06, + "loss": 0.942, + "step": 58280 + }, + { + "epoch": 0.7285432135803395, + "grad_norm": 3.501579523086548, + "learning_rate": 4.163384348065439e-06, + "loss": 1.2986, + "step": 58282 + }, + { + "epoch": 0.7285682142053551, + "grad_norm": 3.4334027767181396, + "learning_rate": 4.162675749523039e-06, + "loss": 0.3064, + "step": 58284 + }, + { + "epoch": 0.7285932148303708, + "grad_norm": 0.0013605857966467738, + "learning_rate": 4.16196719543687e-06, + "loss": 0.9061, + "step": 58286 + }, + { + "epoch": 0.7286182154553864, + "grad_norm": 5.466736316680908, + "learning_rate": 4.161258685812337e-06, + "loss": 0.184, + "step": 58288 + }, + { + "epoch": 0.7286432160804021, + "grad_norm": 4.329421520233154, + "learning_rate": 4.160550220654833e-06, + "loss": 1.3633, + "step": 58290 + }, + { + "epoch": 0.7286682167054176, + "grad_norm": 0.09536640346050262, + "learning_rate": 4.159841799969749e-06, + "loss": 0.8932, + "step": 58292 + }, + { + "epoch": 0.7286932173304332, + "grad_norm": 3.9201395511627197, + "learning_rate": 4.159133423762487e-06, + "loss": 0.3615, + "step": 58294 + }, + { + "epoch": 0.7287182179554489, + "grad_norm": 7.233372211456299, + "learning_rate": 4.1584250920384365e-06, + "loss": 0.5252, + "step": 58296 + }, + { + "epoch": 0.7287432185804645, + "grad_norm": 0.0004399582976475358, + "learning_rate": 4.157716804802997e-06, + "loss": 0.0467, + "step": 58298 + }, + { + "epoch": 0.7287682192054802, + "grad_norm": 1.9290672540664673, + "learning_rate": 4.157008562061556e-06, + "loss": 0.599, + "step": 58300 + }, + { + "epoch": 0.7287932198304957, + "grad_norm": 0.0004929840215481818, + "learning_rate": 4.156300363819512e-06, + "loss": 0.04, + "step": 58302 + }, + { + "epoch": 0.7288182204555114, + "grad_norm": 0.6104797124862671, + "learning_rate": 4.155592210082266e-06, + "loss": 0.8737, + "step": 58304 + }, + { + "epoch": 0.728843221080527, + "grad_norm": 3.2629756927490234, + "learning_rate": 4.154884100855195e-06, + "loss": 1.021, + "step": 58306 + }, + { + "epoch": 0.7288682217055427, + "grad_norm": 2.41921329498291, + "learning_rate": 4.154176036143704e-06, + "loss": 1.2176, + "step": 58308 + }, + { + "epoch": 0.7288932223305583, + "grad_norm": 0.8457388877868652, + "learning_rate": 4.1534680159531796e-06, + "loss": 0.2127, + "step": 58310 + }, + { + "epoch": 0.7289182229555738, + "grad_norm": 3.2830018997192383, + "learning_rate": 4.152760040289018e-06, + "loss": 1.5161, + "step": 58312 + }, + { + "epoch": 0.7289432235805895, + "grad_norm": 1.8518658876419067, + "learning_rate": 4.152052109156606e-06, + "loss": 1.0731, + "step": 58314 + }, + { + "epoch": 0.7289682242056051, + "grad_norm": 2.4779751300811768, + "learning_rate": 4.151344222561341e-06, + "loss": 1.1266, + "step": 58316 + }, + { + "epoch": 0.7289932248306208, + "grad_norm": 5.426497936248779, + "learning_rate": 4.150636380508613e-06, + "loss": 1.1722, + "step": 58318 + }, + { + "epoch": 0.7290182254556364, + "grad_norm": 1.6796656847000122, + "learning_rate": 4.1499285830038065e-06, + "loss": 1.5113, + "step": 58320 + }, + { + "epoch": 0.729043226080652, + "grad_norm": 3.808748483657837, + "learning_rate": 4.14922083005232e-06, + "loss": 0.7847, + "step": 58322 + }, + { + "epoch": 0.7290682267056676, + "grad_norm": 3.753758192062378, + "learning_rate": 4.148513121659536e-06, + "loss": 1.3579, + "step": 58324 + }, + { + "epoch": 0.7290932273306833, + "grad_norm": 3.5084307193756104, + "learning_rate": 4.1478054578308505e-06, + "loss": 1.4327, + "step": 58326 + }, + { + "epoch": 0.7291182279556989, + "grad_norm": 3.629145860671997, + "learning_rate": 4.147097838571653e-06, + "loss": 1.3343, + "step": 58328 + }, + { + "epoch": 0.7291432285807146, + "grad_norm": 2.9623115062713623, + "learning_rate": 4.146390263887331e-06, + "loss": 0.5121, + "step": 58330 + }, + { + "epoch": 0.7291682292057301, + "grad_norm": 2.6668291091918945, + "learning_rate": 4.1456827337832724e-06, + "loss": 0.7589, + "step": 58332 + }, + { + "epoch": 0.7291932298307457, + "grad_norm": 3.1217284202575684, + "learning_rate": 4.144975248264864e-06, + "loss": 0.687, + "step": 58334 + }, + { + "epoch": 0.7292182304557614, + "grad_norm": 1.4547040462493896, + "learning_rate": 4.1442678073374995e-06, + "loss": 0.8996, + "step": 58336 + }, + { + "epoch": 0.729243231080777, + "grad_norm": 1.1108100414276123, + "learning_rate": 4.1435604110065595e-06, + "loss": 0.8388, + "step": 58338 + }, + { + "epoch": 0.7292682317057927, + "grad_norm": 0.9198706150054932, + "learning_rate": 4.142853059277435e-06, + "loss": 0.2206, + "step": 58340 + }, + { + "epoch": 0.7292932323308082, + "grad_norm": 3.527900218963623, + "learning_rate": 4.142145752155521e-06, + "loss": 0.6259, + "step": 58342 + }, + { + "epoch": 0.7293182329558239, + "grad_norm": 1.9459069967269897, + "learning_rate": 4.141438489646189e-06, + "loss": 0.1361, + "step": 58344 + }, + { + "epoch": 0.7293432335808395, + "grad_norm": 1.948269248008728, + "learning_rate": 4.1407312717548364e-06, + "loss": 0.2488, + "step": 58346 + }, + { + "epoch": 0.7293682342058552, + "grad_norm": 3.893270492553711, + "learning_rate": 4.1400240984868425e-06, + "loss": 1.7315, + "step": 58348 + }, + { + "epoch": 0.7293932348308708, + "grad_norm": 5.042393684387207, + "learning_rate": 4.1393169698476e-06, + "loss": 1.2103, + "step": 58350 + }, + { + "epoch": 0.7294182354558864, + "grad_norm": 2.3531484603881836, + "learning_rate": 4.138609885842488e-06, + "loss": 0.8822, + "step": 58352 + }, + { + "epoch": 0.729443236080902, + "grad_norm": 0.0006850906647741795, + "learning_rate": 4.1379028464768924e-06, + "loss": 0.0, + "step": 58354 + }, + { + "epoch": 0.7294682367059176, + "grad_norm": 1.9118149280548096, + "learning_rate": 4.137195851756208e-06, + "loss": 0.191, + "step": 58356 + }, + { + "epoch": 0.7294932373309333, + "grad_norm": 0.0005378737696446478, + "learning_rate": 4.136488901685803e-06, + "loss": 0.2504, + "step": 58358 + }, + { + "epoch": 0.729518237955949, + "grad_norm": 3.2440552711486816, + "learning_rate": 4.135781996271074e-06, + "loss": 0.7686, + "step": 58360 + }, + { + "epoch": 0.7295432385809645, + "grad_norm": 3.041729688644409, + "learning_rate": 4.135075135517395e-06, + "loss": 1.2573, + "step": 58362 + }, + { + "epoch": 0.7295682392059801, + "grad_norm": 4.011902332305908, + "learning_rate": 4.134368319430156e-06, + "loss": 0.4853, + "step": 58364 + }, + { + "epoch": 0.7295932398309958, + "grad_norm": 3.1616926193237305, + "learning_rate": 4.133661548014742e-06, + "loss": 1.3279, + "step": 58366 + }, + { + "epoch": 0.7296182404560114, + "grad_norm": 1.5889227390289307, + "learning_rate": 4.132954821276531e-06, + "loss": 0.6898, + "step": 58368 + }, + { + "epoch": 0.7296432410810271, + "grad_norm": 20.23634147644043, + "learning_rate": 4.1322481392209064e-06, + "loss": 0.4899, + "step": 58370 + }, + { + "epoch": 0.7296682417060426, + "grad_norm": 6.804671287536621, + "learning_rate": 4.1315415018532475e-06, + "loss": 1.6022, + "step": 58372 + }, + { + "epoch": 0.7296932423310583, + "grad_norm": 4.6799798011779785, + "learning_rate": 4.130834909178942e-06, + "loss": 1.0961, + "step": 58374 + }, + { + "epoch": 0.7297182429560739, + "grad_norm": 3.7850382328033447, + "learning_rate": 4.130128361203364e-06, + "loss": 1.3249, + "step": 58376 + }, + { + "epoch": 0.7297432435810896, + "grad_norm": 0.0006672142772004008, + "learning_rate": 4.129421857931899e-06, + "loss": 0.0173, + "step": 58378 + }, + { + "epoch": 0.7297682442061052, + "grad_norm": 4.661909580230713, + "learning_rate": 4.128715399369929e-06, + "loss": 2.315, + "step": 58380 + }, + { + "epoch": 0.7297932448311207, + "grad_norm": 3.538954734802246, + "learning_rate": 4.128008985522834e-06, + "loss": 1.955, + "step": 58382 + }, + { + "epoch": 0.7298182454561364, + "grad_norm": 4.012548446655273, + "learning_rate": 4.127302616395991e-06, + "loss": 0.6967, + "step": 58384 + }, + { + "epoch": 0.729843246081152, + "grad_norm": 2.7829978466033936, + "learning_rate": 4.126596291994778e-06, + "loss": 0.9392, + "step": 58386 + }, + { + "epoch": 0.7298682467061677, + "grad_norm": 1.1103084087371826, + "learning_rate": 4.125890012324576e-06, + "loss": 0.7674, + "step": 58388 + }, + { + "epoch": 0.7298932473311833, + "grad_norm": 7.676881313323975, + "learning_rate": 4.12518377739077e-06, + "loss": 0.8758, + "step": 58390 + }, + { + "epoch": 0.7299182479561989, + "grad_norm": 5.419440746307373, + "learning_rate": 4.1244775871987285e-06, + "loss": 0.4615, + "step": 58392 + }, + { + "epoch": 0.7299432485812145, + "grad_norm": 2.686166763305664, + "learning_rate": 4.1237714417538435e-06, + "loss": 0.495, + "step": 58394 + }, + { + "epoch": 0.7299682492062302, + "grad_norm": 2.6869659423828125, + "learning_rate": 4.123065341061475e-06, + "loss": 0.7432, + "step": 58396 + }, + { + "epoch": 0.7299932498312458, + "grad_norm": 5.056814670562744, + "learning_rate": 4.122359285127015e-06, + "loss": 0.2044, + "step": 58398 + }, + { + "epoch": 0.7300182504562615, + "grad_norm": 1.5636714696884155, + "learning_rate": 4.121653273955833e-06, + "loss": 0.049, + "step": 58400 + }, + { + "epoch": 0.730043251081277, + "grad_norm": 2.498762845993042, + "learning_rate": 4.120947307553306e-06, + "loss": 0.4978, + "step": 58402 + }, + { + "epoch": 0.7300682517062926, + "grad_norm": 2.1524784564971924, + "learning_rate": 4.120241385924817e-06, + "loss": 1.7774, + "step": 58404 + }, + { + "epoch": 0.7300932523313083, + "grad_norm": 0.0013275537639856339, + "learning_rate": 4.119535509075735e-06, + "loss": 0.3187, + "step": 58406 + }, + { + "epoch": 0.7301182529563239, + "grad_norm": 0.0007586553110741079, + "learning_rate": 4.118829677011447e-06, + "loss": 0.1856, + "step": 58408 + }, + { + "epoch": 0.7301432535813396, + "grad_norm": 3.7738940715789795, + "learning_rate": 4.118123889737312e-06, + "loss": 1.1375, + "step": 58410 + }, + { + "epoch": 0.7301682542063551, + "grad_norm": 8.580755233764648, + "learning_rate": 4.117418147258718e-06, + "loss": 0.0888, + "step": 58412 + }, + { + "epoch": 0.7301932548313708, + "grad_norm": 3.359093427658081, + "learning_rate": 4.116712449581032e-06, + "loss": 1.0066, + "step": 58414 + }, + { + "epoch": 0.7302182554563864, + "grad_norm": 10.133543968200684, + "learning_rate": 4.11600679670963e-06, + "loss": 1.249, + "step": 58416 + }, + { + "epoch": 0.7302432560814021, + "grad_norm": 2.0273597240448, + "learning_rate": 4.115301188649895e-06, + "loss": 0.8373, + "step": 58418 + }, + { + "epoch": 0.7302682567064177, + "grad_norm": 0.6604974269866943, + "learning_rate": 4.114595625407192e-06, + "loss": 0.4304, + "step": 58420 + }, + { + "epoch": 0.7302932573314332, + "grad_norm": 1.0579562187194824, + "learning_rate": 4.113890106986898e-06, + "loss": 0.6415, + "step": 58422 + }, + { + "epoch": 0.7303182579564489, + "grad_norm": 4.400474548339844, + "learning_rate": 4.113184633394379e-06, + "loss": 1.0394, + "step": 58424 + }, + { + "epoch": 0.7303432585814645, + "grad_norm": 2.012563467025757, + "learning_rate": 4.112479204635013e-06, + "loss": 1.332, + "step": 58426 + }, + { + "epoch": 0.7303682592064802, + "grad_norm": 0.0008802458178251982, + "learning_rate": 4.111773820714179e-06, + "loss": 1.1053, + "step": 58428 + }, + { + "epoch": 0.7303932598314958, + "grad_norm": 5.7884697914123535, + "learning_rate": 4.111068481637238e-06, + "loss": 0.9325, + "step": 58430 + }, + { + "epoch": 0.7304182604565114, + "grad_norm": 2.601073741912842, + "learning_rate": 4.110363187409569e-06, + "loss": 0.5152, + "step": 58432 + }, + { + "epoch": 0.730443261081527, + "grad_norm": 2.8962650299072266, + "learning_rate": 4.109657938036543e-06, + "loss": 1.5705, + "step": 58434 + }, + { + "epoch": 0.7304682617065427, + "grad_norm": 3.8210132122039795, + "learning_rate": 4.108952733523529e-06, + "loss": 1.3033, + "step": 58436 + }, + { + "epoch": 0.7304932623315583, + "grad_norm": 0.003643125295639038, + "learning_rate": 4.1082475738758935e-06, + "loss": 0.3501, + "step": 58438 + }, + { + "epoch": 0.730518262956574, + "grad_norm": 8.268707275390625, + "learning_rate": 4.107542459099012e-06, + "loss": 0.3979, + "step": 58440 + }, + { + "epoch": 0.7305432635815895, + "grad_norm": 2.769378423690796, + "learning_rate": 4.106837389198257e-06, + "loss": 1.6798, + "step": 58442 + }, + { + "epoch": 0.7305682642066051, + "grad_norm": 2.2930753231048584, + "learning_rate": 4.106132364178992e-06, + "loss": 0.3088, + "step": 58444 + }, + { + "epoch": 0.7305932648316208, + "grad_norm": 5.1617889404296875, + "learning_rate": 4.105427384046595e-06, + "loss": 1.9904, + "step": 58446 + }, + { + "epoch": 0.7306182654566364, + "grad_norm": 6.755107879638672, + "learning_rate": 4.1047224488064245e-06, + "loss": 1.6074, + "step": 58448 + }, + { + "epoch": 0.7306432660816521, + "grad_norm": 3.3281259536743164, + "learning_rate": 4.104017558463853e-06, + "loss": 1.4405, + "step": 58450 + }, + { + "epoch": 0.7306682667066676, + "grad_norm": 1.6598063707351685, + "learning_rate": 4.103312713024253e-06, + "loss": 0.1308, + "step": 58452 + }, + { + "epoch": 0.7306932673316833, + "grad_norm": 0.5029045939445496, + "learning_rate": 4.102607912492988e-06, + "loss": 0.4237, + "step": 58454 + }, + { + "epoch": 0.7307182679566989, + "grad_norm": 0.0005548310000449419, + "learning_rate": 4.1019031568754285e-06, + "loss": 0.0, + "step": 58456 + }, + { + "epoch": 0.7307432685817146, + "grad_norm": 5.546159267425537, + "learning_rate": 4.101198446176939e-06, + "loss": 0.6786, + "step": 58458 + }, + { + "epoch": 0.7307682692067302, + "grad_norm": 4.029373645782471, + "learning_rate": 4.100493780402894e-06, + "loss": 0.8031, + "step": 58460 + }, + { + "epoch": 0.7307932698317458, + "grad_norm": 2.601619243621826, + "learning_rate": 4.099789159558646e-06, + "loss": 1.35, + "step": 58462 + }, + { + "epoch": 0.7308182704567614, + "grad_norm": 3.8548977375030518, + "learning_rate": 4.09908458364957e-06, + "loss": 1.0181, + "step": 58464 + }, + { + "epoch": 0.730843271081777, + "grad_norm": 0.0016839896561577916, + "learning_rate": 4.0983800526810345e-06, + "loss": 0.0001, + "step": 58466 + }, + { + "epoch": 0.7308682717067927, + "grad_norm": 0.0015832611825317144, + "learning_rate": 4.0976755666584e-06, + "loss": 0.5608, + "step": 58468 + }, + { + "epoch": 0.7308932723318083, + "grad_norm": 0.04743903502821922, + "learning_rate": 4.096971125587036e-06, + "loss": 0.2418, + "step": 58470 + }, + { + "epoch": 0.7309182729568239, + "grad_norm": 1.2208970785140991, + "learning_rate": 4.0962667294723055e-06, + "loss": 0.8004, + "step": 58472 + }, + { + "epoch": 0.7309432735818395, + "grad_norm": 7.63909387588501, + "learning_rate": 4.095562378319569e-06, + "loss": 1.7277, + "step": 58474 + }, + { + "epoch": 0.7309682742068552, + "grad_norm": 3.148606538772583, + "learning_rate": 4.094858072134199e-06, + "loss": 0.4106, + "step": 58476 + }, + { + "epoch": 0.7309932748318708, + "grad_norm": 4.904124736785889, + "learning_rate": 4.09415381092155e-06, + "loss": 0.8837, + "step": 58478 + }, + { + "epoch": 0.7310182754568865, + "grad_norm": 4.630638599395752, + "learning_rate": 4.093449594686994e-06, + "loss": 0.8692, + "step": 58480 + }, + { + "epoch": 0.731043276081902, + "grad_norm": 4.327075481414795, + "learning_rate": 4.092745423435887e-06, + "loss": 0.7992, + "step": 58482 + }, + { + "epoch": 0.7310682767069177, + "grad_norm": 6.0976033210754395, + "learning_rate": 4.092041297173601e-06, + "loss": 0.5315, + "step": 58484 + }, + { + "epoch": 0.7310932773319333, + "grad_norm": 2.5426344871520996, + "learning_rate": 4.091337215905491e-06, + "loss": 1.1721, + "step": 58486 + }, + { + "epoch": 0.731118277956949, + "grad_norm": 2.6742429733276367, + "learning_rate": 4.090633179636919e-06, + "loss": 0.9669, + "step": 58488 + }, + { + "epoch": 0.7311432785819646, + "grad_norm": 4.748541355133057, + "learning_rate": 4.089929188373252e-06, + "loss": 1.1717, + "step": 58490 + }, + { + "epoch": 0.7311682792069801, + "grad_norm": 5.11470890045166, + "learning_rate": 4.089225242119844e-06, + "loss": 0.6073, + "step": 58492 + }, + { + "epoch": 0.7311932798319958, + "grad_norm": 3.065462350845337, + "learning_rate": 4.088521340882066e-06, + "loss": 0.5127, + "step": 58494 + }, + { + "epoch": 0.7312182804570114, + "grad_norm": 5.472479343414307, + "learning_rate": 4.0878174846652685e-06, + "loss": 0.6911, + "step": 58496 + }, + { + "epoch": 0.7312432810820271, + "grad_norm": 3.5628397464752197, + "learning_rate": 4.087113673474825e-06, + "loss": 0.923, + "step": 58498 + }, + { + "epoch": 0.7312682817070427, + "grad_norm": 0.000411399727454409, + "learning_rate": 4.0864099073160805e-06, + "loss": 0.0532, + "step": 58500 + }, + { + "epoch": 0.7312932823320583, + "grad_norm": 0.0015632271533831954, + "learning_rate": 4.085706186194402e-06, + "loss": 0.4747, + "step": 58502 + }, + { + "epoch": 0.7313182829570739, + "grad_norm": 1.7934178113937378, + "learning_rate": 4.0850025101151514e-06, + "loss": 0.2379, + "step": 58504 + }, + { + "epoch": 0.7313432835820896, + "grad_norm": 6.350856781005859, + "learning_rate": 4.084298879083682e-06, + "loss": 1.4314, + "step": 58506 + }, + { + "epoch": 0.7313682842071052, + "grad_norm": 0.00200800783932209, + "learning_rate": 4.083595293105359e-06, + "loss": 0.8291, + "step": 58508 + }, + { + "epoch": 0.7313932848321208, + "grad_norm": 1.448028326034546, + "learning_rate": 4.082891752185535e-06, + "loss": 0.6142, + "step": 58510 + }, + { + "epoch": 0.7314182854571364, + "grad_norm": 1.0092127323150635, + "learning_rate": 4.082188256329573e-06, + "loss": 0.0554, + "step": 58512 + }, + { + "epoch": 0.731443286082152, + "grad_norm": 3.3187713623046875, + "learning_rate": 4.08148480554283e-06, + "loss": 0.7249, + "step": 58514 + }, + { + "epoch": 0.7314682867071677, + "grad_norm": 3.5094552040100098, + "learning_rate": 4.080781399830656e-06, + "loss": 1.3132, + "step": 58516 + }, + { + "epoch": 0.7314932873321833, + "grad_norm": 2.0844359397888184, + "learning_rate": 4.08007803919842e-06, + "loss": 0.7476, + "step": 58518 + }, + { + "epoch": 0.731518287957199, + "grad_norm": 0.0014770227717235684, + "learning_rate": 4.079374723651468e-06, + "loss": 0.196, + "step": 58520 + }, + { + "epoch": 0.7315432885822145, + "grad_norm": 2.855876922607422, + "learning_rate": 4.078671453195163e-06, + "loss": 0.8505, + "step": 58522 + }, + { + "epoch": 0.7315682892072302, + "grad_norm": 3.6744322776794434, + "learning_rate": 4.077968227834861e-06, + "loss": 1.0677, + "step": 58524 + }, + { + "epoch": 0.7315932898322458, + "grad_norm": 3.439349889755249, + "learning_rate": 4.07726504757591e-06, + "loss": 0.4638, + "step": 58526 + }, + { + "epoch": 0.7316182904572615, + "grad_norm": 2.499450445175171, + "learning_rate": 4.076561912423676e-06, + "loss": 1.3774, + "step": 58528 + }, + { + "epoch": 0.7316432910822771, + "grad_norm": 3.6900599002838135, + "learning_rate": 4.075858822383505e-06, + "loss": 1.8335, + "step": 58530 + }, + { + "epoch": 0.7316682917072926, + "grad_norm": 4.720659255981445, + "learning_rate": 4.075155777460758e-06, + "loss": 0.7289, + "step": 58532 + }, + { + "epoch": 0.7316932923323083, + "grad_norm": 2.7762231826782227, + "learning_rate": 4.074452777660785e-06, + "loss": 0.6613, + "step": 58534 + }, + { + "epoch": 0.7317182929573239, + "grad_norm": 4.400595664978027, + "learning_rate": 4.0737498229889425e-06, + "loss": 1.2539, + "step": 58536 + }, + { + "epoch": 0.7317432935823396, + "grad_norm": 2.355855703353882, + "learning_rate": 4.073046913450586e-06, + "loss": 1.0682, + "step": 58538 + }, + { + "epoch": 0.7317682942073552, + "grad_norm": 2.6179163455963135, + "learning_rate": 4.072344049051062e-06, + "loss": 0.6623, + "step": 58540 + }, + { + "epoch": 0.7317932948323708, + "grad_norm": 4.556344985961914, + "learning_rate": 4.071641229795729e-06, + "loss": 0.4799, + "step": 58542 + }, + { + "epoch": 0.7318182954573864, + "grad_norm": 0.5791910290718079, + "learning_rate": 4.070938455689936e-06, + "loss": 0.1616, + "step": 58544 + }, + { + "epoch": 0.7318432960824021, + "grad_norm": 3.377264976501465, + "learning_rate": 4.070235726739041e-06, + "loss": 0.3695, + "step": 58546 + }, + { + "epoch": 0.7318682967074177, + "grad_norm": 10.415592193603516, + "learning_rate": 4.069533042948388e-06, + "loss": 1.9643, + "step": 58548 + }, + { + "epoch": 0.7318932973324334, + "grad_norm": 3.335914373397827, + "learning_rate": 4.068830404323336e-06, + "loss": 0.3574, + "step": 58550 + }, + { + "epoch": 0.7319182979574489, + "grad_norm": 3.088015079498291, + "learning_rate": 4.0681278108692345e-06, + "loss": 0.807, + "step": 58552 + }, + { + "epoch": 0.7319432985824645, + "grad_norm": 6.641053676605225, + "learning_rate": 4.067425262591428e-06, + "loss": 0.677, + "step": 58554 + }, + { + "epoch": 0.7319682992074802, + "grad_norm": 6.436838150024414, + "learning_rate": 4.066722759495275e-06, + "loss": 0.5423, + "step": 58556 + }, + { + "epoch": 0.7319932998324958, + "grad_norm": 3.934776544570923, + "learning_rate": 4.06602030158612e-06, + "loss": 1.194, + "step": 58558 + }, + { + "epoch": 0.7320183004575115, + "grad_norm": 0.0031385431066155434, + "learning_rate": 4.065317888869318e-06, + "loss": 0.639, + "step": 58560 + }, + { + "epoch": 0.732043301082527, + "grad_norm": 2.127685070037842, + "learning_rate": 4.0646155213502124e-06, + "loss": 0.4053, + "step": 58562 + }, + { + "epoch": 0.7320683017075427, + "grad_norm": 0.1891605108976364, + "learning_rate": 4.063913199034159e-06, + "loss": 0.4343, + "step": 58564 + }, + { + "epoch": 0.7320933023325583, + "grad_norm": 2.605738401412964, + "learning_rate": 4.063210921926504e-06, + "loss": 1.2344, + "step": 58566 + }, + { + "epoch": 0.732118302957574, + "grad_norm": 5.181753158569336, + "learning_rate": 4.062508690032591e-06, + "loss": 1.5276, + "step": 58568 + }, + { + "epoch": 0.7321433035825896, + "grad_norm": 8.090267181396484, + "learning_rate": 4.061806503357775e-06, + "loss": 0.3938, + "step": 58570 + }, + { + "epoch": 0.7321683042076051, + "grad_norm": 3.9814984798431396, + "learning_rate": 4.061104361907398e-06, + "loss": 1.3132, + "step": 58572 + }, + { + "epoch": 0.7321933048326208, + "grad_norm": 0.08002184331417084, + "learning_rate": 4.0604022656868135e-06, + "loss": 0.0905, + "step": 58574 + }, + { + "epoch": 0.7322183054576364, + "grad_norm": 3.339966058731079, + "learning_rate": 4.059700214701362e-06, + "loss": 1.8293, + "step": 58576 + }, + { + "epoch": 0.7322433060826521, + "grad_norm": 2.114759922027588, + "learning_rate": 4.058998208956398e-06, + "loss": 1.3106, + "step": 58578 + }, + { + "epoch": 0.7322683067076677, + "grad_norm": 9.447622299194336, + "learning_rate": 4.058296248457263e-06, + "loss": 0.2334, + "step": 58580 + }, + { + "epoch": 0.7322933073326833, + "grad_norm": 0.004592845216393471, + "learning_rate": 4.057594333209299e-06, + "loss": 0.4373, + "step": 58582 + }, + { + "epoch": 0.7323183079576989, + "grad_norm": 3.8821022510528564, + "learning_rate": 4.056892463217862e-06, + "loss": 1.5515, + "step": 58584 + }, + { + "epoch": 0.7323433085827146, + "grad_norm": 0.8301311731338501, + "learning_rate": 4.056190638488287e-06, + "loss": 1.1314, + "step": 58586 + }, + { + "epoch": 0.7323683092077302, + "grad_norm": 5.122482776641846, + "learning_rate": 4.055488859025926e-06, + "loss": 0.5386, + "step": 58588 + }, + { + "epoch": 0.7323933098327459, + "grad_norm": 0.17982237040996552, + "learning_rate": 4.054787124836122e-06, + "loss": 0.3371, + "step": 58590 + }, + { + "epoch": 0.7324183104577614, + "grad_norm": 8.039855003356934, + "learning_rate": 4.054085435924215e-06, + "loss": 1.2213, + "step": 58592 + }, + { + "epoch": 0.732443311082777, + "grad_norm": 0.7239254117012024, + "learning_rate": 4.0533837922955566e-06, + "loss": 0.3786, + "step": 58594 + }, + { + "epoch": 0.7324683117077927, + "grad_norm": 4.0367431640625, + "learning_rate": 4.052682193955481e-06, + "loss": 0.2709, + "step": 58596 + }, + { + "epoch": 0.7324933123328083, + "grad_norm": 4.038626194000244, + "learning_rate": 4.051980640909342e-06, + "loss": 0.8834, + "step": 58598 + }, + { + "epoch": 0.732518312957824, + "grad_norm": 2.0681076049804688, + "learning_rate": 4.051279133162474e-06, + "loss": 2.6198, + "step": 58600 + }, + { + "epoch": 0.7325433135828395, + "grad_norm": 3.4631197452545166, + "learning_rate": 4.050577670720226e-06, + "loss": 0.8038, + "step": 58602 + }, + { + "epoch": 0.7325683142078552, + "grad_norm": 5.762209415435791, + "learning_rate": 4.049876253587937e-06, + "loss": 1.0238, + "step": 58604 + }, + { + "epoch": 0.7325933148328708, + "grad_norm": 3.7592504024505615, + "learning_rate": 4.049174881770946e-06, + "loss": 1.8719, + "step": 58606 + }, + { + "epoch": 0.7326183154578865, + "grad_norm": 0.0007601699326187372, + "learning_rate": 4.048473555274599e-06, + "loss": 0.9477, + "step": 58608 + }, + { + "epoch": 0.7326433160829021, + "grad_norm": 2.9210867881774902, + "learning_rate": 4.047772274104235e-06, + "loss": 0.6992, + "step": 58610 + }, + { + "epoch": 0.7326683167079177, + "grad_norm": 3.1917803287506104, + "learning_rate": 4.047071038265197e-06, + "loss": 0.9366, + "step": 58612 + }, + { + "epoch": 0.7326933173329333, + "grad_norm": 3.9980056285858154, + "learning_rate": 4.046369847762822e-06, + "loss": 0.5524, + "step": 58614 + }, + { + "epoch": 0.732718317957949, + "grad_norm": 5.568158149719238, + "learning_rate": 4.045668702602456e-06, + "loss": 1.3377, + "step": 58616 + }, + { + "epoch": 0.7327433185829646, + "grad_norm": 3.929670572280884, + "learning_rate": 4.044967602789433e-06, + "loss": 1.1478, + "step": 58618 + }, + { + "epoch": 0.7327683192079802, + "grad_norm": 8.639167785644531, + "learning_rate": 4.044266548329093e-06, + "loss": 1.8875, + "step": 58620 + }, + { + "epoch": 0.7327933198329958, + "grad_norm": 1.9674824476242065, + "learning_rate": 4.0435655392267805e-06, + "loss": 0.1657, + "step": 58622 + }, + { + "epoch": 0.7328183204580114, + "grad_norm": 7.01093864440918, + "learning_rate": 4.0428645754878255e-06, + "loss": 1.2455, + "step": 58624 + }, + { + "epoch": 0.7328433210830271, + "grad_norm": 3.458425283432007, + "learning_rate": 4.042163657117577e-06, + "loss": 0.6171, + "step": 58626 + }, + { + "epoch": 0.7328683217080427, + "grad_norm": 3.154273509979248, + "learning_rate": 4.041462784121363e-06, + "loss": 1.656, + "step": 58628 + }, + { + "epoch": 0.7328933223330584, + "grad_norm": 4.04136323928833, + "learning_rate": 4.040761956504528e-06, + "loss": 1.0463, + "step": 58630 + }, + { + "epoch": 0.7329183229580739, + "grad_norm": 4.191244125366211, + "learning_rate": 4.04006117427241e-06, + "loss": 1.0607, + "step": 58632 + }, + { + "epoch": 0.7329433235830896, + "grad_norm": 2.9953536987304688, + "learning_rate": 4.039360437430338e-06, + "loss": 1.1284, + "step": 58634 + }, + { + "epoch": 0.7329683242081052, + "grad_norm": 4.242580413818359, + "learning_rate": 4.038659745983658e-06, + "loss": 1.4551, + "step": 58636 + }, + { + "epoch": 0.7329933248331209, + "grad_norm": 2.560854196548462, + "learning_rate": 4.037959099937699e-06, + "loss": 0.8265, + "step": 58638 + }, + { + "epoch": 0.7330183254581365, + "grad_norm": 3.1364870071411133, + "learning_rate": 4.037258499297804e-06, + "loss": 0.3326, + "step": 58640 + }, + { + "epoch": 0.733043326083152, + "grad_norm": 0.03640824556350708, + "learning_rate": 4.036557944069304e-06, + "loss": 1.0751, + "step": 58642 + }, + { + "epoch": 0.7330683267081677, + "grad_norm": 2.611804485321045, + "learning_rate": 4.035857434257532e-06, + "loss": 0.4894, + "step": 58644 + }, + { + "epoch": 0.7330933273331833, + "grad_norm": 5.148764610290527, + "learning_rate": 4.03515696986783e-06, + "loss": 0.4447, + "step": 58646 + }, + { + "epoch": 0.733118327958199, + "grad_norm": 4.567553520202637, + "learning_rate": 4.034456550905527e-06, + "loss": 1.0913, + "step": 58648 + }, + { + "epoch": 0.7331433285832146, + "grad_norm": 3.036313533782959, + "learning_rate": 4.0337561773759615e-06, + "loss": 1.9105, + "step": 58650 + }, + { + "epoch": 0.7331683292082302, + "grad_norm": 1.715029239654541, + "learning_rate": 4.033055849284461e-06, + "loss": 0.894, + "step": 58652 + }, + { + "epoch": 0.7331933298332458, + "grad_norm": 5.207637310028076, + "learning_rate": 4.032355566636369e-06, + "loss": 1.1883, + "step": 58654 + }, + { + "epoch": 0.7332183304582615, + "grad_norm": 12.569632530212402, + "learning_rate": 4.031655329437012e-06, + "loss": 1.0147, + "step": 58656 + }, + { + "epoch": 0.7332433310832771, + "grad_norm": 4.042059898376465, + "learning_rate": 4.03095513769172e-06, + "loss": 0.6344, + "step": 58658 + }, + { + "epoch": 0.7332683317082928, + "grad_norm": 4.325420379638672, + "learning_rate": 4.030254991405833e-06, + "loss": 1.6358, + "step": 58660 + }, + { + "epoch": 0.7332933323333083, + "grad_norm": 4.203026294708252, + "learning_rate": 4.029554890584677e-06, + "loss": 1.5021, + "step": 58662 + }, + { + "epoch": 0.7333183329583239, + "grad_norm": 0.0014087262097746134, + "learning_rate": 4.02885483523359e-06, + "loss": 0.1196, + "step": 58664 + }, + { + "epoch": 0.7333433335833396, + "grad_norm": 0.3832176625728607, + "learning_rate": 4.028154825357897e-06, + "loss": 0.228, + "step": 58666 + }, + { + "epoch": 0.7333683342083552, + "grad_norm": 6.454981327056885, + "learning_rate": 4.027454860962936e-06, + "loss": 1.2794, + "step": 58668 + }, + { + "epoch": 0.7333933348333709, + "grad_norm": 0.0005542593426071107, + "learning_rate": 4.0267549420540335e-06, + "loss": 0.3441, + "step": 58670 + }, + { + "epoch": 0.7334183354583864, + "grad_norm": 2.1634738445281982, + "learning_rate": 4.026055068636518e-06, + "loss": 0.6411, + "step": 58672 + }, + { + "epoch": 0.7334433360834021, + "grad_norm": 1.3827204704284668, + "learning_rate": 4.025355240715724e-06, + "loss": 0.8208, + "step": 58674 + }, + { + "epoch": 0.7334683367084177, + "grad_norm": 3.5714211463928223, + "learning_rate": 4.024655458296978e-06, + "loss": 1.3435, + "step": 58676 + }, + { + "epoch": 0.7334933373334334, + "grad_norm": 3.9134697914123535, + "learning_rate": 4.023955721385613e-06, + "loss": 1.1261, + "step": 58678 + }, + { + "epoch": 0.733518337958449, + "grad_norm": 0.9463223218917847, + "learning_rate": 4.023256029986954e-06, + "loss": 1.3441, + "step": 58680 + }, + { + "epoch": 0.7335433385834645, + "grad_norm": 3.106614828109741, + "learning_rate": 4.022556384106334e-06, + "loss": 1.2031, + "step": 58682 + }, + { + "epoch": 0.7335683392084802, + "grad_norm": 2.302276849746704, + "learning_rate": 4.02185678374908e-06, + "loss": 1.2871, + "step": 58684 + }, + { + "epoch": 0.7335933398334958, + "grad_norm": 0.0051529621705412865, + "learning_rate": 4.021157228920517e-06, + "loss": 0.5819, + "step": 58686 + }, + { + "epoch": 0.7336183404585115, + "grad_norm": 0.5411816835403442, + "learning_rate": 4.020457719625977e-06, + "loss": 0.0194, + "step": 58688 + }, + { + "epoch": 0.7336433410835271, + "grad_norm": 1.643330693244934, + "learning_rate": 4.019758255870783e-06, + "loss": 0.9816, + "step": 58690 + }, + { + "epoch": 0.7336683417085427, + "grad_norm": 2.853724718093872, + "learning_rate": 4.019058837660267e-06, + "loss": 0.7126, + "step": 58692 + }, + { + "epoch": 0.7336933423335583, + "grad_norm": 4.005518436431885, + "learning_rate": 4.018359464999754e-06, + "loss": 1.2592, + "step": 58694 + }, + { + "epoch": 0.733718342958574, + "grad_norm": 0.049393776804208755, + "learning_rate": 4.0176601378945644e-06, + "loss": 0.5938, + "step": 58696 + }, + { + "epoch": 0.7337433435835896, + "grad_norm": 0.002622229279950261, + "learning_rate": 4.016960856350034e-06, + "loss": 0.3602, + "step": 58698 + }, + { + "epoch": 0.7337683442086053, + "grad_norm": 5.290361404418945, + "learning_rate": 4.01626162037148e-06, + "loss": 0.6999, + "step": 58700 + }, + { + "epoch": 0.7337933448336208, + "grad_norm": 3.5939009189605713, + "learning_rate": 4.0155624299642345e-06, + "loss": 0.5503, + "step": 58702 + }, + { + "epoch": 0.7338183454586364, + "grad_norm": 4.160584449768066, + "learning_rate": 4.014863285133617e-06, + "loss": 1.919, + "step": 58704 + }, + { + "epoch": 0.7338433460836521, + "grad_norm": 4.2538628578186035, + "learning_rate": 4.0141641858849525e-06, + "loss": 1.3648, + "step": 58706 + }, + { + "epoch": 0.7338683467086677, + "grad_norm": 9.36911392211914, + "learning_rate": 4.013465132223575e-06, + "loss": 1.982, + "step": 58708 + }, + { + "epoch": 0.7338933473336834, + "grad_norm": 5.377242088317871, + "learning_rate": 4.012766124154793e-06, + "loss": 1.0004, + "step": 58710 + }, + { + "epoch": 0.7339183479586989, + "grad_norm": 3.6693601608276367, + "learning_rate": 4.012067161683941e-06, + "loss": 1.4703, + "step": 58712 + }, + { + "epoch": 0.7339433485837146, + "grad_norm": 0.001548866624943912, + "learning_rate": 4.011368244816336e-06, + "loss": 0.0, + "step": 58714 + }, + { + "epoch": 0.7339683492087302, + "grad_norm": 0.1700308620929718, + "learning_rate": 4.010669373557306e-06, + "loss": 1.1191, + "step": 58716 + }, + { + "epoch": 0.7339933498337459, + "grad_norm": 2.8721585273742676, + "learning_rate": 4.009970547912169e-06, + "loss": 0.7419, + "step": 58718 + }, + { + "epoch": 0.7340183504587615, + "grad_norm": 0.02053491398692131, + "learning_rate": 4.009271767886253e-06, + "loss": 0.059, + "step": 58720 + }, + { + "epoch": 0.734043351083777, + "grad_norm": 4.651118278503418, + "learning_rate": 4.008573033484874e-06, + "loss": 2.0656, + "step": 58722 + }, + { + "epoch": 0.7340683517087927, + "grad_norm": 2.6883177757263184, + "learning_rate": 4.007874344713354e-06, + "loss": 0.937, + "step": 58724 + }, + { + "epoch": 0.7340933523338083, + "grad_norm": 9.094969749450684, + "learning_rate": 4.007175701577018e-06, + "loss": 1.9589, + "step": 58726 + }, + { + "epoch": 0.734118352958824, + "grad_norm": 3.884129762649536, + "learning_rate": 4.006477104081181e-06, + "loss": 0.7082, + "step": 58728 + }, + { + "epoch": 0.7341433535838396, + "grad_norm": 9.01857852935791, + "learning_rate": 4.00577855223117e-06, + "loss": 0.7959, + "step": 58730 + }, + { + "epoch": 0.7341683542088552, + "grad_norm": 1.8647279739379883, + "learning_rate": 4.005080046032299e-06, + "loss": 0.0598, + "step": 58732 + }, + { + "epoch": 0.7341933548338708, + "grad_norm": 5.716782569885254, + "learning_rate": 4.004381585489894e-06, + "loss": 1.1081, + "step": 58734 + }, + { + "epoch": 0.7342183554588865, + "grad_norm": 0.0009880587458610535, + "learning_rate": 4.003683170609271e-06, + "loss": 0.172, + "step": 58736 + }, + { + "epoch": 0.7342433560839021, + "grad_norm": 3.661815881729126, + "learning_rate": 4.002984801395745e-06, + "loss": 0.2444, + "step": 58738 + }, + { + "epoch": 0.7342683567089178, + "grad_norm": 0.5897805094718933, + "learning_rate": 4.002286477854643e-06, + "loss": 0.048, + "step": 58740 + }, + { + "epoch": 0.7342933573339333, + "grad_norm": 0.10169832408428192, + "learning_rate": 4.001588199991275e-06, + "loss": 0.2431, + "step": 58742 + }, + { + "epoch": 0.734318357958949, + "grad_norm": 4.223202228546143, + "learning_rate": 4.000889967810963e-06, + "loss": 1.0011, + "step": 58744 + }, + { + "epoch": 0.7343433585839646, + "grad_norm": 2.719780683517456, + "learning_rate": 4.0001917813190316e-06, + "loss": 0.5826, + "step": 58746 + }, + { + "epoch": 0.7343683592089802, + "grad_norm": 3.1054155826568604, + "learning_rate": 3.999493640520785e-06, + "loss": 0.3821, + "step": 58748 + }, + { + "epoch": 0.7343933598339959, + "grad_norm": 0.002250785008072853, + "learning_rate": 3.998795545421549e-06, + "loss": 0.0596, + "step": 58750 + }, + { + "epoch": 0.7344183604590114, + "grad_norm": 2.4910247325897217, + "learning_rate": 3.998097496026634e-06, + "loss": 1.0803, + "step": 58752 + }, + { + "epoch": 0.7344433610840271, + "grad_norm": 3.4176025390625, + "learning_rate": 3.997399492341363e-06, + "loss": 1.0641, + "step": 58754 + }, + { + "epoch": 0.7344683617090427, + "grad_norm": 3.084237813949585, + "learning_rate": 3.996701534371045e-06, + "loss": 0.1722, + "step": 58756 + }, + { + "epoch": 0.7344933623340584, + "grad_norm": 2.6502676010131836, + "learning_rate": 3.9960036221210005e-06, + "loss": 0.2208, + "step": 58758 + }, + { + "epoch": 0.734518362959074, + "grad_norm": 5.267757892608643, + "learning_rate": 3.995305755596549e-06, + "loss": 1.4853, + "step": 58760 + }, + { + "epoch": 0.7345433635840896, + "grad_norm": 3.1974620819091797, + "learning_rate": 3.994607934802994e-06, + "loss": 0.6085, + "step": 58762 + }, + { + "epoch": 0.7345683642091052, + "grad_norm": 3.9947919845581055, + "learning_rate": 3.9939101597456585e-06, + "loss": 1.9228, + "step": 58764 + }, + { + "epoch": 0.7345933648341209, + "grad_norm": 7.398528575897217, + "learning_rate": 3.99321243042985e-06, + "loss": 1.061, + "step": 58766 + }, + { + "epoch": 0.7346183654591365, + "grad_norm": 1.8788539171218872, + "learning_rate": 3.992514746860886e-06, + "loss": 1.7327, + "step": 58768 + }, + { + "epoch": 0.7346433660841521, + "grad_norm": 3.5900790691375732, + "learning_rate": 3.991817109044085e-06, + "loss": 1.8637, + "step": 58770 + }, + { + "epoch": 0.7346683667091677, + "grad_norm": 3.9004323482513428, + "learning_rate": 3.991119516984754e-06, + "loss": 0.4132, + "step": 58772 + }, + { + "epoch": 0.7346933673341833, + "grad_norm": 4.177427768707275, + "learning_rate": 3.990421970688207e-06, + "loss": 0.9222, + "step": 58774 + }, + { + "epoch": 0.734718367959199, + "grad_norm": 2.806246519088745, + "learning_rate": 3.989724470159754e-06, + "loss": 0.6674, + "step": 58776 + }, + { + "epoch": 0.7347433685842146, + "grad_norm": 0.5140167474746704, + "learning_rate": 3.989027015404713e-06, + "loss": 0.0066, + "step": 58778 + }, + { + "epoch": 0.7347683692092303, + "grad_norm": 5.2325615882873535, + "learning_rate": 3.988329606428388e-06, + "loss": 2.6167, + "step": 58780 + }, + { + "epoch": 0.7347933698342458, + "grad_norm": 5.449748516082764, + "learning_rate": 3.987632243236095e-06, + "loss": 0.5338, + "step": 58782 + }, + { + "epoch": 0.7348183704592615, + "grad_norm": 2.9730148315429688, + "learning_rate": 3.986934925833147e-06, + "loss": 0.9328, + "step": 58784 + }, + { + "epoch": 0.7348433710842771, + "grad_norm": 6.7243428230285645, + "learning_rate": 3.986237654224854e-06, + "loss": 2.3121, + "step": 58786 + }, + { + "epoch": 0.7348683717092928, + "grad_norm": 2.2641983032226562, + "learning_rate": 3.985540428416523e-06, + "loss": 0.4447, + "step": 58788 + }, + { + "epoch": 0.7348933723343084, + "grad_norm": 1.2655174732208252, + "learning_rate": 3.984843248413464e-06, + "loss": 0.4932, + "step": 58790 + }, + { + "epoch": 0.7349183729593239, + "grad_norm": 0.001482296851463616, + "learning_rate": 3.984146114220986e-06, + "loss": 0.0, + "step": 58792 + }, + { + "epoch": 0.7349433735843396, + "grad_norm": 4.097254753112793, + "learning_rate": 3.983449025844405e-06, + "loss": 0.8114, + "step": 58794 + }, + { + "epoch": 0.7349683742093552, + "grad_norm": 3.4241116046905518, + "learning_rate": 3.982751983289021e-06, + "loss": 0.6566, + "step": 58796 + }, + { + "epoch": 0.7349933748343709, + "grad_norm": 2.487121343612671, + "learning_rate": 3.982054986560155e-06, + "loss": 1.4828, + "step": 58798 + }, + { + "epoch": 0.7350183754593865, + "grad_norm": 0.0009106436627916992, + "learning_rate": 3.981358035663101e-06, + "loss": 0.4581, + "step": 58800 + }, + { + "epoch": 0.7350433760844021, + "grad_norm": 5.037826061248779, + "learning_rate": 3.9806611306031765e-06, + "loss": 1.3607, + "step": 58802 + }, + { + "epoch": 0.7350683767094177, + "grad_norm": 2.8858768939971924, + "learning_rate": 3.97996427138568e-06, + "loss": 1.3064, + "step": 58804 + }, + { + "epoch": 0.7350933773344334, + "grad_norm": 3.742295265197754, + "learning_rate": 3.979267458015926e-06, + "loss": 1.37, + "step": 58806 + }, + { + "epoch": 0.735118377959449, + "grad_norm": 4.344287872314453, + "learning_rate": 3.978570690499223e-06, + "loss": 0.5526, + "step": 58808 + }, + { + "epoch": 0.7351433785844647, + "grad_norm": 6.440994739532471, + "learning_rate": 3.97787396884087e-06, + "loss": 1.7108, + "step": 58810 + }, + { + "epoch": 0.7351683792094802, + "grad_norm": 3.0314157009124756, + "learning_rate": 3.977177293046185e-06, + "loss": 0.987, + "step": 58812 + }, + { + "epoch": 0.7351933798344958, + "grad_norm": 3.2000529766082764, + "learning_rate": 3.976480663120459e-06, + "loss": 0.0961, + "step": 58814 + }, + { + "epoch": 0.7352183804595115, + "grad_norm": 4.465906143188477, + "learning_rate": 3.975784079069007e-06, + "loss": 0.9752, + "step": 58816 + }, + { + "epoch": 0.7352433810845271, + "grad_norm": 6.5090436935424805, + "learning_rate": 3.975087540897129e-06, + "loss": 0.8784, + "step": 58818 + }, + { + "epoch": 0.7352683817095428, + "grad_norm": 9.634933471679688, + "learning_rate": 3.9743910486101315e-06, + "loss": 1.6415, + "step": 58820 + }, + { + "epoch": 0.7352933823345583, + "grad_norm": 6.082615375518799, + "learning_rate": 3.973694602213324e-06, + "loss": 1.8354, + "step": 58822 + }, + { + "epoch": 0.735318382959574, + "grad_norm": 6.079268932342529, + "learning_rate": 3.972998201712002e-06, + "loss": 1.5581, + "step": 58824 + }, + { + "epoch": 0.7353433835845896, + "grad_norm": 2.3500611782073975, + "learning_rate": 3.97230184711148e-06, + "loss": 0.7315, + "step": 58826 + }, + { + "epoch": 0.7353683842096053, + "grad_norm": 1.241075038909912, + "learning_rate": 3.971605538417049e-06, + "loss": 0.064, + "step": 58828 + }, + { + "epoch": 0.7353933848346209, + "grad_norm": 4.08867883682251, + "learning_rate": 3.970909275634017e-06, + "loss": 0.789, + "step": 58830 + }, + { + "epoch": 0.7354183854596364, + "grad_norm": 2.394934892654419, + "learning_rate": 3.970213058767692e-06, + "loss": 0.2293, + "step": 58832 + }, + { + "epoch": 0.7354433860846521, + "grad_norm": 0.005038886796683073, + "learning_rate": 3.969516887823367e-06, + "loss": 0.1827, + "step": 58834 + }, + { + "epoch": 0.7354683867096677, + "grad_norm": 4.20461893081665, + "learning_rate": 3.968820762806351e-06, + "loss": 0.5051, + "step": 58836 + }, + { + "epoch": 0.7354933873346834, + "grad_norm": 4.205390453338623, + "learning_rate": 3.968124683721946e-06, + "loss": 0.7652, + "step": 58838 + }, + { + "epoch": 0.735518387959699, + "grad_norm": 0.0017729257233440876, + "learning_rate": 3.967428650575448e-06, + "loss": 0.0021, + "step": 58840 + }, + { + "epoch": 0.7355433885847146, + "grad_norm": 0.7260801792144775, + "learning_rate": 3.966732663372158e-06, + "loss": 0.7526, + "step": 58842 + }, + { + "epoch": 0.7355683892097302, + "grad_norm": 2.478501558303833, + "learning_rate": 3.966036722117378e-06, + "loss": 1.2208, + "step": 58844 + }, + { + "epoch": 0.7355933898347459, + "grad_norm": 3.914698362350464, + "learning_rate": 3.965340826816414e-06, + "loss": 0.707, + "step": 58846 + }, + { + "epoch": 0.7356183904597615, + "grad_norm": 1.0647764205932617, + "learning_rate": 3.964644977474556e-06, + "loss": 0.03, + "step": 58848 + }, + { + "epoch": 0.7356433910847772, + "grad_norm": 2.959623336791992, + "learning_rate": 3.9639491740971125e-06, + "loss": 1.2271, + "step": 58850 + }, + { + "epoch": 0.7356683917097927, + "grad_norm": 3.318587064743042, + "learning_rate": 3.963253416689379e-06, + "loss": 1.46, + "step": 58852 + }, + { + "epoch": 0.7356933923348083, + "grad_norm": 4.0135579109191895, + "learning_rate": 3.962557705256649e-06, + "loss": 0.8857, + "step": 58854 + }, + { + "epoch": 0.735718392959824, + "grad_norm": 0.5235176086425781, + "learning_rate": 3.961862039804229e-06, + "loss": 0.2568, + "step": 58856 + }, + { + "epoch": 0.7357433935848396, + "grad_norm": 4.544454097747803, + "learning_rate": 3.961166420337412e-06, + "loss": 0.4144, + "step": 58858 + }, + { + "epoch": 0.7357683942098553, + "grad_norm": 7.914394855499268, + "learning_rate": 3.9604708468615e-06, + "loss": 0.6153, + "step": 58860 + }, + { + "epoch": 0.7357933948348708, + "grad_norm": 0.37329134345054626, + "learning_rate": 3.959775319381785e-06, + "loss": 0.0056, + "step": 58862 + }, + { + "epoch": 0.7358183954598865, + "grad_norm": 3.3111865520477295, + "learning_rate": 3.9590798379035745e-06, + "loss": 1.1866, + "step": 58864 + }, + { + "epoch": 0.7358433960849021, + "grad_norm": 1.824778437614441, + "learning_rate": 3.9583844024321495e-06, + "loss": 0.058, + "step": 58866 + }, + { + "epoch": 0.7358683967099178, + "grad_norm": 3.5221400260925293, + "learning_rate": 3.957689012972815e-06, + "loss": 1.2596, + "step": 58868 + }, + { + "epoch": 0.7358933973349334, + "grad_norm": 9.618128776550293, + "learning_rate": 3.9569936695308705e-06, + "loss": 1.5963, + "step": 58870 + }, + { + "epoch": 0.735918397959949, + "grad_norm": 0.620597243309021, + "learning_rate": 3.9562983721116035e-06, + "loss": 0.2664, + "step": 58872 + }, + { + "epoch": 0.7359433985849646, + "grad_norm": 0.02124132215976715, + "learning_rate": 3.955603120720317e-06, + "loss": 0.0367, + "step": 58874 + }, + { + "epoch": 0.7359683992099803, + "grad_norm": 2.2313032150268555, + "learning_rate": 3.9549079153623e-06, + "loss": 0.695, + "step": 58876 + }, + { + "epoch": 0.7359933998349959, + "grad_norm": 1.9307541847229004, + "learning_rate": 3.954212756042854e-06, + "loss": 0.6432, + "step": 58878 + }, + { + "epoch": 0.7360184004600115, + "grad_norm": 5.333553314208984, + "learning_rate": 3.953517642767264e-06, + "loss": 0.8801, + "step": 58880 + }, + { + "epoch": 0.7360434010850271, + "grad_norm": 4.397122383117676, + "learning_rate": 3.952822575540828e-06, + "loss": 0.1551, + "step": 58882 + }, + { + "epoch": 0.7360684017100427, + "grad_norm": 2.9041247367858887, + "learning_rate": 3.9521275543688435e-06, + "loss": 0.417, + "step": 58884 + }, + { + "epoch": 0.7360934023350584, + "grad_norm": 2.413114309310913, + "learning_rate": 3.9514325792565965e-06, + "loss": 1.5836, + "step": 58886 + }, + { + "epoch": 0.736118402960074, + "grad_norm": 8.062721252441406, + "learning_rate": 3.9507376502093875e-06, + "loss": 0.8046, + "step": 58888 + }, + { + "epoch": 0.7361434035850897, + "grad_norm": 0.0005555233219638467, + "learning_rate": 3.950042767232505e-06, + "loss": 0.3057, + "step": 58890 + }, + { + "epoch": 0.7361684042101052, + "grad_norm": 0.004114687908440828, + "learning_rate": 3.949347930331238e-06, + "loss": 0.6227, + "step": 58892 + }, + { + "epoch": 0.7361934048351209, + "grad_norm": 5.911014080047607, + "learning_rate": 3.948653139510884e-06, + "loss": 1.7863, + "step": 58894 + }, + { + "epoch": 0.7362184054601365, + "grad_norm": 4.276829242706299, + "learning_rate": 3.947958394776729e-06, + "loss": 0.7753, + "step": 58896 + }, + { + "epoch": 0.7362434060851522, + "grad_norm": 7.675234317779541, + "learning_rate": 3.947263696134071e-06, + "loss": 0.3206, + "step": 58898 + }, + { + "epoch": 0.7362684067101678, + "grad_norm": 1.317548155784607, + "learning_rate": 3.946569043588193e-06, + "loss": 0.2578, + "step": 58900 + }, + { + "epoch": 0.7362934073351833, + "grad_norm": 2.240507125854492, + "learning_rate": 3.945874437144392e-06, + "loss": 0.3399, + "step": 58902 + }, + { + "epoch": 0.736318407960199, + "grad_norm": 1.30653977394104, + "learning_rate": 3.945179876807956e-06, + "loss": 0.1007, + "step": 58904 + }, + { + "epoch": 0.7363434085852146, + "grad_norm": 4.216272354125977, + "learning_rate": 3.944485362584171e-06, + "loss": 1.1569, + "step": 58906 + }, + { + "epoch": 0.7363684092102303, + "grad_norm": 3.04301381111145, + "learning_rate": 3.943790894478331e-06, + "loss": 0.5319, + "step": 58908 + }, + { + "epoch": 0.7363934098352459, + "grad_norm": 1.7331851720809937, + "learning_rate": 3.94309647249572e-06, + "loss": 0.0281, + "step": 58910 + }, + { + "epoch": 0.7364184104602615, + "grad_norm": 0.001007585320621729, + "learning_rate": 3.942402096641633e-06, + "loss": 1.3986, + "step": 58912 + }, + { + "epoch": 0.7364434110852771, + "grad_norm": 6.919440269470215, + "learning_rate": 3.941707766921354e-06, + "loss": 0.4809, + "step": 58914 + }, + { + "epoch": 0.7364684117102928, + "grad_norm": 4.905506134033203, + "learning_rate": 3.941013483340173e-06, + "loss": 1.4389, + "step": 58916 + }, + { + "epoch": 0.7364934123353084, + "grad_norm": 0.00031699688406661153, + "learning_rate": 3.940319245903377e-06, + "loss": 0.1301, + "step": 58918 + }, + { + "epoch": 0.736518412960324, + "grad_norm": 2.3452234268188477, + "learning_rate": 3.939625054616249e-06, + "loss": 0.4633, + "step": 58920 + }, + { + "epoch": 0.7365434135853396, + "grad_norm": 3.1683855056762695, + "learning_rate": 3.9389309094840834e-06, + "loss": 0.5573, + "step": 58922 + }, + { + "epoch": 0.7365684142103552, + "grad_norm": 1.9821834564208984, + "learning_rate": 3.938236810512159e-06, + "loss": 1.3942, + "step": 58924 + }, + { + "epoch": 0.7365934148353709, + "grad_norm": 4.019975185394287, + "learning_rate": 3.937542757705769e-06, + "loss": 1.084, + "step": 58926 + }, + { + "epoch": 0.7366184154603865, + "grad_norm": 2.972911834716797, + "learning_rate": 3.936848751070193e-06, + "loss": 0.8735, + "step": 58928 + }, + { + "epoch": 0.7366434160854022, + "grad_norm": 3.561983346939087, + "learning_rate": 3.936154790610724e-06, + "loss": 0.7955, + "step": 58930 + }, + { + "epoch": 0.7366684167104177, + "grad_norm": 3.222435474395752, + "learning_rate": 3.9354608763326415e-06, + "loss": 1.4184, + "step": 58932 + }, + { + "epoch": 0.7366934173354334, + "grad_norm": 2.498605728149414, + "learning_rate": 3.934767008241228e-06, + "loss": 0.2303, + "step": 58934 + }, + { + "epoch": 0.736718417960449, + "grad_norm": 3.7101523876190186, + "learning_rate": 3.9340731863417745e-06, + "loss": 1.3384, + "step": 58936 + }, + { + "epoch": 0.7367434185854647, + "grad_norm": 2.5114994049072266, + "learning_rate": 3.93337941063956e-06, + "loss": 0.0943, + "step": 58938 + }, + { + "epoch": 0.7367684192104803, + "grad_norm": 0.10471836477518082, + "learning_rate": 3.932685681139872e-06, + "loss": 0.5037, + "step": 58940 + }, + { + "epoch": 0.7367934198354958, + "grad_norm": 3.0873067378997803, + "learning_rate": 3.931991997847991e-06, + "loss": 0.6825, + "step": 58942 + }, + { + "epoch": 0.7368184204605115, + "grad_norm": 2.2409255504608154, + "learning_rate": 3.931298360769199e-06, + "loss": 2.056, + "step": 58944 + }, + { + "epoch": 0.7368434210855271, + "grad_norm": 0.7567727565765381, + "learning_rate": 3.930604769908783e-06, + "loss": 0.1482, + "step": 58946 + }, + { + "epoch": 0.7368684217105428, + "grad_norm": 3.431528329849243, + "learning_rate": 3.929911225272019e-06, + "loss": 1.0948, + "step": 58948 + }, + { + "epoch": 0.7368934223355584, + "grad_norm": 3.4147326946258545, + "learning_rate": 3.929217726864197e-06, + "loss": 1.2555, + "step": 58950 + }, + { + "epoch": 0.736918422960574, + "grad_norm": 0.004674571566283703, + "learning_rate": 3.928524274690591e-06, + "loss": 0.1523, + "step": 58952 + }, + { + "epoch": 0.7369434235855896, + "grad_norm": 3.244580030441284, + "learning_rate": 3.927830868756489e-06, + "loss": 1.1935, + "step": 58954 + }, + { + "epoch": 0.7369684242106053, + "grad_norm": 3.1979291439056396, + "learning_rate": 3.9271375090671685e-06, + "loss": 0.6995, + "step": 58956 + }, + { + "epoch": 0.7369934248356209, + "grad_norm": 4.186368942260742, + "learning_rate": 3.926444195627906e-06, + "loss": 1.9097, + "step": 58958 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.5513981580734253, + "learning_rate": 3.92575092844399e-06, + "loss": 0.3821, + "step": 58960 + }, + { + "epoch": 0.7370434260856521, + "grad_norm": 3.551143169403076, + "learning_rate": 3.9250577075206905e-06, + "loss": 1.2228, + "step": 58962 + }, + { + "epoch": 0.7370684267106677, + "grad_norm": 4.136531829833984, + "learning_rate": 3.924364532863297e-06, + "loss": 0.2534, + "step": 58964 + }, + { + "epoch": 0.7370934273356834, + "grad_norm": 2.827425479888916, + "learning_rate": 3.923671404477082e-06, + "loss": 0.3822, + "step": 58966 + }, + { + "epoch": 0.737118427960699, + "grad_norm": 1.0850703716278076, + "learning_rate": 3.922978322367328e-06, + "loss": 0.7643, + "step": 58968 + }, + { + "epoch": 0.7371434285857147, + "grad_norm": 4.922219753265381, + "learning_rate": 3.9222852865393126e-06, + "loss": 1.6606, + "step": 58970 + }, + { + "epoch": 0.7371684292107302, + "grad_norm": 1.5235697031021118, + "learning_rate": 3.921592296998309e-06, + "loss": 0.3139, + "step": 58972 + }, + { + "epoch": 0.7371934298357459, + "grad_norm": 1.4720224142074585, + "learning_rate": 3.920899353749602e-06, + "loss": 1.1217, + "step": 58974 + }, + { + "epoch": 0.7372184304607615, + "grad_norm": 0.019573597237467766, + "learning_rate": 3.920206456798463e-06, + "loss": 1.4993, + "step": 58976 + }, + { + "epoch": 0.7372434310857772, + "grad_norm": 2.9279160499572754, + "learning_rate": 3.919513606150175e-06, + "loss": 1.3465, + "step": 58978 + }, + { + "epoch": 0.7372684317107928, + "grad_norm": 4.961765289306641, + "learning_rate": 3.918820801810008e-06, + "loss": 0.8519, + "step": 58980 + }, + { + "epoch": 0.7372934323358084, + "grad_norm": 0.0027995379641652107, + "learning_rate": 3.9181280437832454e-06, + "loss": 0.5186, + "step": 58982 + }, + { + "epoch": 0.737318432960824, + "grad_norm": 4.481189727783203, + "learning_rate": 3.9174353320751604e-06, + "loss": 1.334, + "step": 58984 + }, + { + "epoch": 0.7373434335858396, + "grad_norm": 1.0624061822891235, + "learning_rate": 3.9167426666910236e-06, + "loss": 0.5545, + "step": 58986 + }, + { + "epoch": 0.7373684342108553, + "grad_norm": 3.4143593311309814, + "learning_rate": 3.916050047636118e-06, + "loss": 2.1846, + "step": 58988 + }, + { + "epoch": 0.7373934348358709, + "grad_norm": 4.2360358238220215, + "learning_rate": 3.915357474915711e-06, + "loss": 0.3847, + "step": 58990 + }, + { + "epoch": 0.7374184354608865, + "grad_norm": 0.0007962103118188679, + "learning_rate": 3.914664948535085e-06, + "loss": 0.6062, + "step": 58992 + }, + { + "epoch": 0.7374434360859021, + "grad_norm": 2.1172125339508057, + "learning_rate": 3.9139724684995095e-06, + "loss": 1.4221, + "step": 58994 + }, + { + "epoch": 0.7374684367109178, + "grad_norm": 0.0021883139852434397, + "learning_rate": 3.913280034814257e-06, + "loss": 1.0001, + "step": 58996 + }, + { + "epoch": 0.7374934373359334, + "grad_norm": 0.43281418085098267, + "learning_rate": 3.912587647484605e-06, + "loss": 0.5708, + "step": 58998 + }, + { + "epoch": 0.7375184379609491, + "grad_norm": 0.013359188102185726, + "learning_rate": 3.911895306515821e-06, + "loss": 0.2873, + "step": 59000 + }, + { + "epoch": 0.7375434385859646, + "grad_norm": 4.399435043334961, + "learning_rate": 3.911203011913186e-06, + "loss": 1.282, + "step": 59002 + }, + { + "epoch": 0.7375684392109803, + "grad_norm": 1.9262754917144775, + "learning_rate": 3.910510763681964e-06, + "loss": 1.8543, + "step": 59004 + }, + { + "epoch": 0.7375934398359959, + "grad_norm": 5.21234130859375, + "learning_rate": 3.9098185618274344e-06, + "loss": 1.3519, + "step": 59006 + }, + { + "epoch": 0.7376184404610115, + "grad_norm": 3.5076887607574463, + "learning_rate": 3.909126406354866e-06, + "loss": 0.6073, + "step": 59008 + }, + { + "epoch": 0.7376434410860272, + "grad_norm": 5.455447673797607, + "learning_rate": 3.908434297269526e-06, + "loss": 1.8119, + "step": 59010 + }, + { + "epoch": 0.7376684417110427, + "grad_norm": 5.063310146331787, + "learning_rate": 3.907742234576691e-06, + "loss": 1.2718, + "step": 59012 + }, + { + "epoch": 0.7376934423360584, + "grad_norm": 2.178386688232422, + "learning_rate": 3.907050218281627e-06, + "loss": 0.9429, + "step": 59014 + }, + { + "epoch": 0.737718442961074, + "grad_norm": 2.4910480976104736, + "learning_rate": 3.90635824838961e-06, + "loss": 0.2894, + "step": 59016 + }, + { + "epoch": 0.7377434435860897, + "grad_norm": 4.205465793609619, + "learning_rate": 3.905666324905904e-06, + "loss": 0.7069, + "step": 59018 + }, + { + "epoch": 0.7377684442111053, + "grad_norm": 3.5119075775146484, + "learning_rate": 3.904974447835785e-06, + "loss": 1.6902, + "step": 59020 + }, + { + "epoch": 0.7377934448361209, + "grad_norm": 4.267745494842529, + "learning_rate": 3.9042826171845185e-06, + "loss": 0.4962, + "step": 59022 + }, + { + "epoch": 0.7378184454611365, + "grad_norm": 5.814836025238037, + "learning_rate": 3.90359083295737e-06, + "loss": 1.802, + "step": 59024 + }, + { + "epoch": 0.7378434460861522, + "grad_norm": 3.0661346912384033, + "learning_rate": 3.902899095159615e-06, + "loss": 0.7312, + "step": 59026 + }, + { + "epoch": 0.7378684467111678, + "grad_norm": 1.4574482440948486, + "learning_rate": 3.9022074037965154e-06, + "loss": 0.6531, + "step": 59028 + }, + { + "epoch": 0.7378934473361835, + "grad_norm": 3.108189582824707, + "learning_rate": 3.901515758873344e-06, + "loss": 0.7809, + "step": 59030 + }, + { + "epoch": 0.737918447961199, + "grad_norm": 2.1409008502960205, + "learning_rate": 3.900824160395363e-06, + "loss": 0.9596, + "step": 59032 + }, + { + "epoch": 0.7379434485862146, + "grad_norm": 3.512617349624634, + "learning_rate": 3.900132608367847e-06, + "loss": 0.7496, + "step": 59034 + }, + { + "epoch": 0.7379684492112303, + "grad_norm": 5.203232765197754, + "learning_rate": 3.899441102796059e-06, + "loss": 1.0345, + "step": 59036 + }, + { + "epoch": 0.7379934498362459, + "grad_norm": 3.5803072452545166, + "learning_rate": 3.89874964368526e-06, + "loss": 0.5023, + "step": 59038 + }, + { + "epoch": 0.7380184504612616, + "grad_norm": 9.23348617553711, + "learning_rate": 3.898058231040726e-06, + "loss": 1.0767, + "step": 59040 + }, + { + "epoch": 0.7380434510862771, + "grad_norm": 0.001738366554491222, + "learning_rate": 3.897366864867712e-06, + "loss": 0.0, + "step": 59042 + }, + { + "epoch": 0.7380684517112928, + "grad_norm": 3.3545563220977783, + "learning_rate": 3.8966755451714945e-06, + "loss": 1.1299, + "step": 59044 + }, + { + "epoch": 0.7380934523363084, + "grad_norm": 2.9431962966918945, + "learning_rate": 3.895984271957332e-06, + "loss": 1.8132, + "step": 59046 + }, + { + "epoch": 0.7381184529613241, + "grad_norm": 3.3939263820648193, + "learning_rate": 3.895293045230486e-06, + "loss": 0.9653, + "step": 59048 + }, + { + "epoch": 0.7381434535863397, + "grad_norm": 6.4720587730407715, + "learning_rate": 3.894601864996229e-06, + "loss": 2.5633, + "step": 59050 + }, + { + "epoch": 0.7381684542113552, + "grad_norm": 1.9706331491470337, + "learning_rate": 3.893910731259818e-06, + "loss": 0.5615, + "step": 59052 + }, + { + "epoch": 0.7381934548363709, + "grad_norm": 0.003882323857396841, + "learning_rate": 3.893219644026521e-06, + "loss": 1.2056, + "step": 59054 + }, + { + "epoch": 0.7382184554613865, + "grad_norm": 1.162949800491333, + "learning_rate": 3.892528603301599e-06, + "loss": 0.2063, + "step": 59056 + }, + { + "epoch": 0.7382434560864022, + "grad_norm": 2.4026174545288086, + "learning_rate": 3.891837609090317e-06, + "loss": 0.6888, + "step": 59058 + }, + { + "epoch": 0.7382684567114178, + "grad_norm": 2.5206072330474854, + "learning_rate": 3.891146661397936e-06, + "loss": 1.2017, + "step": 59060 + }, + { + "epoch": 0.7382934573364334, + "grad_norm": 2.492042303085327, + "learning_rate": 3.890455760229714e-06, + "loss": 1.5816, + "step": 59062 + }, + { + "epoch": 0.738318457961449, + "grad_norm": 5.308276176452637, + "learning_rate": 3.889764905590923e-06, + "loss": 2.1414, + "step": 59064 + }, + { + "epoch": 0.7383434585864647, + "grad_norm": 2.3159539699554443, + "learning_rate": 3.889074097486813e-06, + "loss": 1.0505, + "step": 59066 + }, + { + "epoch": 0.7383684592114803, + "grad_norm": 5.893343448638916, + "learning_rate": 3.8883833359226545e-06, + "loss": 0.8074, + "step": 59068 + }, + { + "epoch": 0.738393459836496, + "grad_norm": 1.9821867942810059, + "learning_rate": 3.8876926209037e-06, + "loss": 1.0598, + "step": 59070 + }, + { + "epoch": 0.7384184604615115, + "grad_norm": 2.036606550216675, + "learning_rate": 3.887001952435219e-06, + "loss": 0.0948, + "step": 59072 + }, + { + "epoch": 0.7384434610865271, + "grad_norm": 2.580432176589966, + "learning_rate": 3.886311330522465e-06, + "loss": 1.2224, + "step": 59074 + }, + { + "epoch": 0.7384684617115428, + "grad_norm": 4.949625015258789, + "learning_rate": 3.885620755170697e-06, + "loss": 0.8003, + "step": 59076 + }, + { + "epoch": 0.7384934623365584, + "grad_norm": 3.953494071960449, + "learning_rate": 3.88493022638518e-06, + "loss": 1.6125, + "step": 59078 + }, + { + "epoch": 0.7385184629615741, + "grad_norm": 4.277381420135498, + "learning_rate": 3.884239744171167e-06, + "loss": 0.955, + "step": 59080 + }, + { + "epoch": 0.7385434635865896, + "grad_norm": 1.1631171703338623, + "learning_rate": 3.88354930853392e-06, + "loss": 0.6754, + "step": 59082 + }, + { + "epoch": 0.7385684642116053, + "grad_norm": 2.42063045501709, + "learning_rate": 3.882858919478696e-06, + "loss": 0.5449, + "step": 59084 + }, + { + "epoch": 0.7385934648366209, + "grad_norm": 2.2291598320007324, + "learning_rate": 3.882168577010755e-06, + "loss": 0.6986, + "step": 59086 + }, + { + "epoch": 0.7386184654616366, + "grad_norm": 0.1523483544588089, + "learning_rate": 3.8814782811353545e-06, + "loss": 0.0829, + "step": 59088 + }, + { + "epoch": 0.7386434660866522, + "grad_norm": 5.265429973602295, + "learning_rate": 3.880788031857746e-06, + "loss": 0.5576, + "step": 59090 + }, + { + "epoch": 0.7386684667116677, + "grad_norm": 1.583314299583435, + "learning_rate": 3.880097829183194e-06, + "loss": 0.3764, + "step": 59092 + }, + { + "epoch": 0.7386934673366834, + "grad_norm": 2.2759134769439697, + "learning_rate": 3.879407673116948e-06, + "loss": 0.7609, + "step": 59094 + }, + { + "epoch": 0.738718467961699, + "grad_norm": 5.572700023651123, + "learning_rate": 3.8787175636642704e-06, + "loss": 0.9829, + "step": 59096 + }, + { + "epoch": 0.7387434685867147, + "grad_norm": 1.3367332220077515, + "learning_rate": 3.878027500830413e-06, + "loss": 0.4648, + "step": 59098 + }, + { + "epoch": 0.7387684692117303, + "grad_norm": 5.9852094650268555, + "learning_rate": 3.877337484620634e-06, + "loss": 1.2856, + "step": 59100 + }, + { + "epoch": 0.7387934698367459, + "grad_norm": 3.220322847366333, + "learning_rate": 3.876647515040186e-06, + "loss": 1.7973, + "step": 59102 + }, + { + "epoch": 0.7388184704617615, + "grad_norm": 4.198592662811279, + "learning_rate": 3.875957592094322e-06, + "loss": 1.1626, + "step": 59104 + }, + { + "epoch": 0.7388434710867772, + "grad_norm": 1.324127435684204, + "learning_rate": 3.875267715788304e-06, + "loss": 0.3104, + "step": 59106 + }, + { + "epoch": 0.7388684717117928, + "grad_norm": 8.980009078979492, + "learning_rate": 3.8745778861273766e-06, + "loss": 1.14, + "step": 59108 + }, + { + "epoch": 0.7388934723368085, + "grad_norm": 7.515524387359619, + "learning_rate": 3.873888103116797e-06, + "loss": 1.2329, + "step": 59110 + }, + { + "epoch": 0.738918472961824, + "grad_norm": 3.8102316856384277, + "learning_rate": 3.873198366761826e-06, + "loss": 0.8562, + "step": 59112 + }, + { + "epoch": 0.7389434735868396, + "grad_norm": 3.1511330604553223, + "learning_rate": 3.872508677067705e-06, + "loss": 1.5799, + "step": 59114 + }, + { + "epoch": 0.7389684742118553, + "grad_norm": 4.190750598907471, + "learning_rate": 3.871819034039693e-06, + "loss": 1.0923, + "step": 59116 + }, + { + "epoch": 0.738993474836871, + "grad_norm": 7.044009685516357, + "learning_rate": 3.871129437683038e-06, + "loss": 2.4316, + "step": 59118 + }, + { + "epoch": 0.7390184754618866, + "grad_norm": 3.94008207321167, + "learning_rate": 3.870439888002997e-06, + "loss": 1.7969, + "step": 59120 + }, + { + "epoch": 0.7390434760869021, + "grad_norm": 5.44291353225708, + "learning_rate": 3.869750385004817e-06, + "loss": 1.9945, + "step": 59122 + }, + { + "epoch": 0.7390684767119178, + "grad_norm": 4.242302417755127, + "learning_rate": 3.86906092869375e-06, + "loss": 1.1494, + "step": 59124 + }, + { + "epoch": 0.7390934773369334, + "grad_norm": 5.620450496673584, + "learning_rate": 3.8683715190750545e-06, + "loss": 1.8024, + "step": 59126 + }, + { + "epoch": 0.7391184779619491, + "grad_norm": 4.378505706787109, + "learning_rate": 3.867682156153969e-06, + "loss": 0.967, + "step": 59128 + }, + { + "epoch": 0.7391434785869647, + "grad_norm": 2.819866895675659, + "learning_rate": 3.8669928399357516e-06, + "loss": 0.6648, + "step": 59130 + }, + { + "epoch": 0.7391684792119803, + "grad_norm": 4.507092475891113, + "learning_rate": 3.866303570425646e-06, + "loss": 1.6734, + "step": 59132 + }, + { + "epoch": 0.7391934798369959, + "grad_norm": 2.4053452014923096, + "learning_rate": 3.865614347628909e-06, + "loss": 0.4084, + "step": 59134 + }, + { + "epoch": 0.7392184804620116, + "grad_norm": 6.771753311157227, + "learning_rate": 3.864925171550781e-06, + "loss": 0.7795, + "step": 59136 + }, + { + "epoch": 0.7392434810870272, + "grad_norm": 3.7777864933013916, + "learning_rate": 3.8642360421965196e-06, + "loss": 0.5999, + "step": 59138 + }, + { + "epoch": 0.7392684817120428, + "grad_norm": 2.29318904876709, + "learning_rate": 3.86354695957137e-06, + "loss": 0.3929, + "step": 59140 + }, + { + "epoch": 0.7392934823370584, + "grad_norm": 2.692122220993042, + "learning_rate": 3.862857923680573e-06, + "loss": 0.3055, + "step": 59142 + }, + { + "epoch": 0.739318482962074, + "grad_norm": 8.427855491638184, + "learning_rate": 3.862168934529388e-06, + "loss": 1.2732, + "step": 59144 + }, + { + "epoch": 0.7393434835870897, + "grad_norm": 3.349241018295288, + "learning_rate": 3.8614799921230515e-06, + "loss": 0.6429, + "step": 59146 + }, + { + "epoch": 0.7393684842121053, + "grad_norm": 4.35512638092041, + "learning_rate": 3.8607910964668145e-06, + "loss": 1.9189, + "step": 59148 + }, + { + "epoch": 0.739393484837121, + "grad_norm": 2.306476354598999, + "learning_rate": 3.86010224756593e-06, + "loss": 0.277, + "step": 59150 + }, + { + "epoch": 0.7394184854621365, + "grad_norm": 4.337221145629883, + "learning_rate": 3.859413445425637e-06, + "loss": 1.6412, + "step": 59152 + }, + { + "epoch": 0.7394434860871522, + "grad_norm": 1.633075475692749, + "learning_rate": 3.858724690051184e-06, + "loss": 0.1201, + "step": 59154 + }, + { + "epoch": 0.7394684867121678, + "grad_norm": 0.0082301776856184, + "learning_rate": 3.85803598144781e-06, + "loss": 0.2599, + "step": 59156 + }, + { + "epoch": 0.7394934873371835, + "grad_norm": 0.06707842648029327, + "learning_rate": 3.8573473196207715e-06, + "loss": 0.0011, + "step": 59158 + }, + { + "epoch": 0.7395184879621991, + "grad_norm": 4.294687271118164, + "learning_rate": 3.856658704575302e-06, + "loss": 1.1925, + "step": 59160 + }, + { + "epoch": 0.7395434885872146, + "grad_norm": 0.0005175459082238376, + "learning_rate": 3.8559701363166515e-06, + "loss": 0.3554, + "step": 59162 + }, + { + "epoch": 0.7395684892122303, + "grad_norm": 3.489450454711914, + "learning_rate": 3.85528161485007e-06, + "loss": 0.5736, + "step": 59164 + }, + { + "epoch": 0.7395934898372459, + "grad_norm": 3.781947612762451, + "learning_rate": 3.854593140180789e-06, + "loss": 1.0658, + "step": 59166 + }, + { + "epoch": 0.7396184904622616, + "grad_norm": 4.5520195960998535, + "learning_rate": 3.85390471231406e-06, + "loss": 1.7449, + "step": 59168 + }, + { + "epoch": 0.7396434910872772, + "grad_norm": 0.0009034487884491682, + "learning_rate": 3.853216331255121e-06, + "loss": 0.2411, + "step": 59170 + }, + { + "epoch": 0.7396684917122928, + "grad_norm": 4.057891845703125, + "learning_rate": 3.852527997009216e-06, + "loss": 0.6615, + "step": 59172 + }, + { + "epoch": 0.7396934923373084, + "grad_norm": 2.7075600624084473, + "learning_rate": 3.851839709581593e-06, + "loss": 0.8121, + "step": 59174 + }, + { + "epoch": 0.7397184929623241, + "grad_norm": 0.5157952904701233, + "learning_rate": 3.8511514689774855e-06, + "loss": 0.3602, + "step": 59176 + }, + { + "epoch": 0.7397434935873397, + "grad_norm": 3.772613763809204, + "learning_rate": 3.850463275202145e-06, + "loss": 1.4797, + "step": 59178 + }, + { + "epoch": 0.7397684942123554, + "grad_norm": 4.871333599090576, + "learning_rate": 3.849775128260801e-06, + "loss": 1.8257, + "step": 59180 + }, + { + "epoch": 0.7397934948373709, + "grad_norm": 0.3358011245727539, + "learning_rate": 3.849087028158702e-06, + "loss": 0.5107, + "step": 59182 + }, + { + "epoch": 0.7398184954623865, + "grad_norm": 4.644045352935791, + "learning_rate": 3.848398974901084e-06, + "loss": 1.3727, + "step": 59184 + }, + { + "epoch": 0.7398434960874022, + "grad_norm": 1.796787142753601, + "learning_rate": 3.847710968493189e-06, + "loss": 0.274, + "step": 59186 + }, + { + "epoch": 0.7398684967124178, + "grad_norm": 4.026123046875, + "learning_rate": 3.847023008940262e-06, + "loss": 1.0179, + "step": 59188 + }, + { + "epoch": 0.7398934973374335, + "grad_norm": 3.2280080318450928, + "learning_rate": 3.846335096247535e-06, + "loss": 0.1598, + "step": 59190 + }, + { + "epoch": 0.739918497962449, + "grad_norm": 0.0034246928989887238, + "learning_rate": 3.845647230420252e-06, + "loss": 0.0545, + "step": 59192 + }, + { + "epoch": 0.7399434985874647, + "grad_norm": 3.1188366413116455, + "learning_rate": 3.8449594114636445e-06, + "loss": 0.6394, + "step": 59194 + }, + { + "epoch": 0.7399684992124803, + "grad_norm": 0.0009223941015079618, + "learning_rate": 3.8442716393829604e-06, + "loss": 0.0045, + "step": 59196 + }, + { + "epoch": 0.739993499837496, + "grad_norm": 1.071946382522583, + "learning_rate": 3.843583914183428e-06, + "loss": 0.0297, + "step": 59198 + }, + { + "epoch": 0.7400185004625116, + "grad_norm": 4.291068077087402, + "learning_rate": 3.842896235870292e-06, + "loss": 1.1636, + "step": 59200 + }, + { + "epoch": 0.7400435010875271, + "grad_norm": 4.166114807128906, + "learning_rate": 3.842208604448789e-06, + "loss": 0.8971, + "step": 59202 + }, + { + "epoch": 0.7400685017125428, + "grad_norm": 2.121645450592041, + "learning_rate": 3.841521019924156e-06, + "loss": 0.4192, + "step": 59204 + }, + { + "epoch": 0.7400935023375584, + "grad_norm": 2.777900218963623, + "learning_rate": 3.840833482301628e-06, + "loss": 1.5034, + "step": 59206 + }, + { + "epoch": 0.7401185029625741, + "grad_norm": 7.961658000946045, + "learning_rate": 3.840145991586437e-06, + "loss": 1.6228, + "step": 59208 + }, + { + "epoch": 0.7401435035875897, + "grad_norm": 2.613462448120117, + "learning_rate": 3.839458547783823e-06, + "loss": 0.6651, + "step": 59210 + }, + { + "epoch": 0.7401685042126053, + "grad_norm": 5.887876033782959, + "learning_rate": 3.838771150899026e-06, + "loss": 0.4074, + "step": 59212 + }, + { + "epoch": 0.7401935048376209, + "grad_norm": 0.385797917842865, + "learning_rate": 3.838083800937271e-06, + "loss": 0.0469, + "step": 59214 + }, + { + "epoch": 0.7402185054626366, + "grad_norm": 0.8710152506828308, + "learning_rate": 3.837396497903807e-06, + "loss": 0.7872, + "step": 59216 + }, + { + "epoch": 0.7402435060876522, + "grad_norm": 4.704588413238525, + "learning_rate": 3.836709241803853e-06, + "loss": 1.9281, + "step": 59218 + }, + { + "epoch": 0.7402685067126679, + "grad_norm": 2.343698740005493, + "learning_rate": 3.836022032642652e-06, + "loss": 0.7039, + "step": 59220 + }, + { + "epoch": 0.7402935073376834, + "grad_norm": 4.606010437011719, + "learning_rate": 3.835334870425433e-06, + "loss": 0.2106, + "step": 59222 + }, + { + "epoch": 0.740318507962699, + "grad_norm": 2.1793344020843506, + "learning_rate": 3.834647755157432e-06, + "loss": 0.6959, + "step": 59224 + }, + { + "epoch": 0.7403435085877147, + "grad_norm": 1.920140266418457, + "learning_rate": 3.833960686843885e-06, + "loss": 0.7357, + "step": 59226 + }, + { + "epoch": 0.7403685092127303, + "grad_norm": 7.7931413650512695, + "learning_rate": 3.833273665490019e-06, + "loss": 2.2034, + "step": 59228 + }, + { + "epoch": 0.740393509837746, + "grad_norm": 0.0004691427166108042, + "learning_rate": 3.832586691101074e-06, + "loss": 0.9082, + "step": 59230 + }, + { + "epoch": 0.7404185104627615, + "grad_norm": 0.009100687690079212, + "learning_rate": 3.83189976368227e-06, + "loss": 0.0173, + "step": 59232 + }, + { + "epoch": 0.7404435110877772, + "grad_norm": 4.696408748626709, + "learning_rate": 3.8312128832388464e-06, + "loss": 0.8429, + "step": 59234 + }, + { + "epoch": 0.7404685117127928, + "grad_norm": 0.0018061011796817183, + "learning_rate": 3.8305260497760354e-06, + "loss": 0.0346, + "step": 59236 + }, + { + "epoch": 0.7404935123378085, + "grad_norm": 1.5654160976409912, + "learning_rate": 3.829839263299062e-06, + "loss": 0.1313, + "step": 59238 + }, + { + "epoch": 0.7405185129628241, + "grad_norm": 4.3568925857543945, + "learning_rate": 3.829152523813165e-06, + "loss": 0.5453, + "step": 59240 + }, + { + "epoch": 0.7405435135878397, + "grad_norm": 4.892093658447266, + "learning_rate": 3.828465831323569e-06, + "loss": 1.1043, + "step": 59242 + }, + { + "epoch": 0.7405685142128553, + "grad_norm": 1.5595312118530273, + "learning_rate": 3.827779185835504e-06, + "loss": 0.3185, + "step": 59244 + }, + { + "epoch": 0.740593514837871, + "grad_norm": 3.3302409648895264, + "learning_rate": 3.827092587354197e-06, + "loss": 1.3626, + "step": 59246 + }, + { + "epoch": 0.7406185154628866, + "grad_norm": 2.097808361053467, + "learning_rate": 3.8264060358848795e-06, + "loss": 1.4392, + "step": 59248 + }, + { + "epoch": 0.7406435160879022, + "grad_norm": 9.573956489562988, + "learning_rate": 3.825719531432784e-06, + "loss": 0.5437, + "step": 59250 + }, + { + "epoch": 0.7406685167129178, + "grad_norm": 2.9976649284362793, + "learning_rate": 3.8250330740031325e-06, + "loss": 0.9386, + "step": 59252 + }, + { + "epoch": 0.7406935173379334, + "grad_norm": 0.0006230863509699702, + "learning_rate": 3.824346663601159e-06, + "loss": 0.6996, + "step": 59254 + }, + { + "epoch": 0.7407185179629491, + "grad_norm": 2.6955015659332275, + "learning_rate": 3.823660300232089e-06, + "loss": 0.4235, + "step": 59256 + }, + { + "epoch": 0.7407435185879647, + "grad_norm": 1.0729528665542603, + "learning_rate": 3.822973983901144e-06, + "loss": 0.1183, + "step": 59258 + }, + { + "epoch": 0.7407685192129804, + "grad_norm": 3.093534469604492, + "learning_rate": 3.822287714613559e-06, + "loss": 1.0993, + "step": 59260 + }, + { + "epoch": 0.7407935198379959, + "grad_norm": 0.001344402669928968, + "learning_rate": 3.8216014923745545e-06, + "loss": 0.0001, + "step": 59262 + }, + { + "epoch": 0.7408185204630116, + "grad_norm": 2.946577310562134, + "learning_rate": 3.820915317189363e-06, + "loss": 0.5773, + "step": 59264 + }, + { + "epoch": 0.7408435210880272, + "grad_norm": 3.312391996383667, + "learning_rate": 3.820229189063202e-06, + "loss": 1.2937, + "step": 59266 + }, + { + "epoch": 0.7408685217130428, + "grad_norm": 3.8542816638946533, + "learning_rate": 3.81954310800131e-06, + "loss": 1.1547, + "step": 59268 + }, + { + "epoch": 0.7408935223380585, + "grad_norm": 2.5707311630249023, + "learning_rate": 3.818857074008896e-06, + "loss": 0.5667, + "step": 59270 + }, + { + "epoch": 0.740918522963074, + "grad_norm": 0.36392879486083984, + "learning_rate": 3.8181710870911915e-06, + "loss": 1.2108, + "step": 59272 + }, + { + "epoch": 0.7409435235880897, + "grad_norm": 2.5620272159576416, + "learning_rate": 3.817485147253427e-06, + "loss": 1.5, + "step": 59274 + }, + { + "epoch": 0.7409685242131053, + "grad_norm": 0.4094669818878174, + "learning_rate": 3.816799254500817e-06, + "loss": 0.0519, + "step": 59276 + }, + { + "epoch": 0.740993524838121, + "grad_norm": 4.37549352645874, + "learning_rate": 3.816113408838592e-06, + "loss": 0.9704, + "step": 59278 + }, + { + "epoch": 0.7410185254631366, + "grad_norm": 5.09498405456543, + "learning_rate": 3.815427610271971e-06, + "loss": 1.4553, + "step": 59280 + }, + { + "epoch": 0.7410435260881522, + "grad_norm": 4.414772987365723, + "learning_rate": 3.814741858806185e-06, + "loss": 1.2492, + "step": 59282 + }, + { + "epoch": 0.7410685267131678, + "grad_norm": 2.141841173171997, + "learning_rate": 3.814056154446444e-06, + "loss": 0.794, + "step": 59284 + }, + { + "epoch": 0.7410935273381835, + "grad_norm": 6.757528781890869, + "learning_rate": 3.8133704971979755e-06, + "loss": 2.1719, + "step": 59286 + }, + { + "epoch": 0.7411185279631991, + "grad_norm": 3.320207118988037, + "learning_rate": 3.812684887066006e-06, + "loss": 1.399, + "step": 59288 + }, + { + "epoch": 0.7411435285882148, + "grad_norm": 4.011758327484131, + "learning_rate": 3.8119993240557506e-06, + "loss": 0.8512, + "step": 59290 + }, + { + "epoch": 0.7411685292132303, + "grad_norm": 1.497403621673584, + "learning_rate": 3.8113138081724364e-06, + "loss": 0.0661, + "step": 59292 + }, + { + "epoch": 0.7411935298382459, + "grad_norm": 4.222424030303955, + "learning_rate": 3.8106283394212807e-06, + "loss": 0.7497, + "step": 59294 + }, + { + "epoch": 0.7412185304632616, + "grad_norm": 1.8383358716964722, + "learning_rate": 3.8099429178075022e-06, + "loss": 0.8341, + "step": 59296 + }, + { + "epoch": 0.7412435310882772, + "grad_norm": 0.3064265549182892, + "learning_rate": 3.8092575433363254e-06, + "loss": 1.1, + "step": 59298 + }, + { + "epoch": 0.7412685317132929, + "grad_norm": 1.383658766746521, + "learning_rate": 3.8085722160129645e-06, + "loss": 0.2867, + "step": 59300 + }, + { + "epoch": 0.7412935323383084, + "grad_norm": 3.1731529235839844, + "learning_rate": 3.807886935842645e-06, + "loss": 0.7948, + "step": 59302 + }, + { + "epoch": 0.7413185329633241, + "grad_norm": 4.748812198638916, + "learning_rate": 3.8072017028305795e-06, + "loss": 1.9214, + "step": 59304 + }, + { + "epoch": 0.7413435335883397, + "grad_norm": 4.249701976776123, + "learning_rate": 3.806516516981995e-06, + "loss": 0.7078, + "step": 59306 + }, + { + "epoch": 0.7413685342133554, + "grad_norm": 2.500246286392212, + "learning_rate": 3.8058313783021038e-06, + "loss": 0.7616, + "step": 59308 + }, + { + "epoch": 0.741393534838371, + "grad_norm": 3.1052052974700928, + "learning_rate": 3.805146286796121e-06, + "loss": 1.0268, + "step": 59310 + }, + { + "epoch": 0.7414185354633865, + "grad_norm": 3.4886059761047363, + "learning_rate": 3.804461242469272e-06, + "loss": 1.0997, + "step": 59312 + }, + { + "epoch": 0.7414435360884022, + "grad_norm": 2.857531785964966, + "learning_rate": 3.8037762453267667e-06, + "loss": 1.1208, + "step": 59314 + }, + { + "epoch": 0.7414685367134178, + "grad_norm": 4.529269695281982, + "learning_rate": 3.8030912953738286e-06, + "loss": 1.1972, + "step": 59316 + }, + { + "epoch": 0.7414935373384335, + "grad_norm": 0.44276028871536255, + "learning_rate": 3.8024063926156673e-06, + "loss": 0.0281, + "step": 59318 + }, + { + "epoch": 0.7415185379634491, + "grad_norm": 0.00045364911784417927, + "learning_rate": 3.801721537057507e-06, + "loss": 0.5349, + "step": 59320 + }, + { + "epoch": 0.7415435385884647, + "grad_norm": 3.1164093017578125, + "learning_rate": 3.801036728704558e-06, + "loss": 0.9263, + "step": 59322 + }, + { + "epoch": 0.7415685392134803, + "grad_norm": 5.303782939910889, + "learning_rate": 3.8003519675620326e-06, + "loss": 0.645, + "step": 59324 + }, + { + "epoch": 0.741593539838496, + "grad_norm": 2.6222450733184814, + "learning_rate": 3.7996672536351543e-06, + "loss": 0.554, + "step": 59326 + }, + { + "epoch": 0.7416185404635116, + "grad_norm": 0.20196764171123505, + "learning_rate": 3.7989825869291296e-06, + "loss": 1.1031, + "step": 59328 + }, + { + "epoch": 0.7416435410885273, + "grad_norm": 5.2721967697143555, + "learning_rate": 3.7982979674491804e-06, + "loss": 1.1593, + "step": 59330 + }, + { + "epoch": 0.7416685417135428, + "grad_norm": 4.1648077964782715, + "learning_rate": 3.7976133952005133e-06, + "loss": 0.3779, + "step": 59332 + }, + { + "epoch": 0.7416935423385584, + "grad_norm": 2.444807767868042, + "learning_rate": 3.7969288701883487e-06, + "loss": 1.0357, + "step": 59334 + }, + { + "epoch": 0.7417185429635741, + "grad_norm": 4.38827657699585, + "learning_rate": 3.796244392417897e-06, + "loss": 2.2011, + "step": 59336 + }, + { + "epoch": 0.7417435435885897, + "grad_norm": 3.437558889389038, + "learning_rate": 3.7955599618943673e-06, + "loss": 1.1818, + "step": 59338 + }, + { + "epoch": 0.7417685442136054, + "grad_norm": 2.3244011402130127, + "learning_rate": 3.7948755786229783e-06, + "loss": 0.1337, + "step": 59340 + }, + { + "epoch": 0.7417935448386209, + "grad_norm": 4.277824878692627, + "learning_rate": 3.794191242608937e-06, + "loss": 1.0383, + "step": 59342 + }, + { + "epoch": 0.7418185454636366, + "grad_norm": 3.5245280265808105, + "learning_rate": 3.79350695385746e-06, + "loss": 1.2287, + "step": 59344 + }, + { + "epoch": 0.7418435460886522, + "grad_norm": 2.047942638397217, + "learning_rate": 3.7928227123737526e-06, + "loss": 1.2831, + "step": 59346 + }, + { + "epoch": 0.7418685467136679, + "grad_norm": 4.562151908874512, + "learning_rate": 3.792138518163034e-06, + "loss": 1.5318, + "step": 59348 + }, + { + "epoch": 0.7418935473386835, + "grad_norm": 3.8441684246063232, + "learning_rate": 3.791454371230511e-06, + "loss": 0.9234, + "step": 59350 + }, + { + "epoch": 0.741918547963699, + "grad_norm": 6.984798431396484, + "learning_rate": 3.79077027158139e-06, + "loss": 1.9549, + "step": 59352 + }, + { + "epoch": 0.7419435485887147, + "grad_norm": 0.5516245365142822, + "learning_rate": 3.7900862192208863e-06, + "loss": 0.0468, + "step": 59354 + }, + { + "epoch": 0.7419685492137303, + "grad_norm": 2.056434154510498, + "learning_rate": 3.7894022141542064e-06, + "loss": 1.054, + "step": 59356 + }, + { + "epoch": 0.741993549838746, + "grad_norm": 4.464687347412109, + "learning_rate": 3.7887182563865634e-06, + "loss": 0.9899, + "step": 59358 + }, + { + "epoch": 0.7420185504637616, + "grad_norm": 1.8065849542617798, + "learning_rate": 3.788034345923164e-06, + "loss": 1.2329, + "step": 59360 + }, + { + "epoch": 0.7420435510887772, + "grad_norm": 3.300919532775879, + "learning_rate": 3.787350482769213e-06, + "loss": 1.1512, + "step": 59362 + }, + { + "epoch": 0.7420685517137928, + "grad_norm": 2.4450783729553223, + "learning_rate": 3.786666666929926e-06, + "loss": 0.1693, + "step": 59364 + }, + { + "epoch": 0.7420935523388085, + "grad_norm": 8.036063194274902, + "learning_rate": 3.785982898410503e-06, + "loss": 0.6388, + "step": 59366 + }, + { + "epoch": 0.7421185529638241, + "grad_norm": 0.8310394287109375, + "learning_rate": 3.7852991772161597e-06, + "loss": 0.3341, + "step": 59368 + }, + { + "epoch": 0.7421435535888398, + "grad_norm": 5.276579856872559, + "learning_rate": 3.784615503352095e-06, + "loss": 1.6899, + "step": 59370 + }, + { + "epoch": 0.7421685542138553, + "grad_norm": 4.011631011962891, + "learning_rate": 3.7839318768235234e-06, + "loss": 1.6841, + "step": 59372 + }, + { + "epoch": 0.742193554838871, + "grad_norm": 2.339191198348999, + "learning_rate": 3.7832482976356476e-06, + "loss": 0.3985, + "step": 59374 + }, + { + "epoch": 0.7422185554638866, + "grad_norm": 5.1551337242126465, + "learning_rate": 3.7825647657936694e-06, + "loss": 0.5752, + "step": 59376 + }, + { + "epoch": 0.7422435560889022, + "grad_norm": 2.706467866897583, + "learning_rate": 3.7818812813028028e-06, + "loss": 0.6995, + "step": 59378 + }, + { + "epoch": 0.7422685567139179, + "grad_norm": 0.05125531554222107, + "learning_rate": 3.7811978441682463e-06, + "loss": 0.1103, + "step": 59380 + }, + { + "epoch": 0.7422935573389334, + "grad_norm": 2.7507259845733643, + "learning_rate": 3.78051445439521e-06, + "loss": 0.2755, + "step": 59382 + }, + { + "epoch": 0.7423185579639491, + "grad_norm": 0.012532306835055351, + "learning_rate": 3.7798311119888933e-06, + "loss": 0.4686, + "step": 59384 + }, + { + "epoch": 0.7423435585889647, + "grad_norm": 0.0005209416849538684, + "learning_rate": 3.779147816954506e-06, + "loss": 0.7128, + "step": 59386 + }, + { + "epoch": 0.7423685592139804, + "grad_norm": 5.754718780517578, + "learning_rate": 3.7784645692972497e-06, + "loss": 1.1482, + "step": 59388 + }, + { + "epoch": 0.742393559838996, + "grad_norm": 5.327642440795898, + "learning_rate": 3.777781369022324e-06, + "loss": 1.256, + "step": 59390 + }, + { + "epoch": 0.7424185604640116, + "grad_norm": 0.0006125052459537983, + "learning_rate": 3.777098216134939e-06, + "loss": 0.5227, + "step": 59392 + }, + { + "epoch": 0.7424435610890272, + "grad_norm": 3.0199546813964844, + "learning_rate": 3.7764151106402903e-06, + "loss": 1.0284, + "step": 59394 + }, + { + "epoch": 0.7424685617140429, + "grad_norm": 1.2835428714752197, + "learning_rate": 3.775732052543588e-06, + "loss": 0.1595, + "step": 59396 + }, + { + "epoch": 0.7424935623390585, + "grad_norm": 3.1836133003234863, + "learning_rate": 3.775049041850026e-06, + "loss": 0.7277, + "step": 59398 + }, + { + "epoch": 0.7425185629640741, + "grad_norm": 1.5543416738510132, + "learning_rate": 3.774366078564814e-06, + "loss": 0.2547, + "step": 59400 + }, + { + "epoch": 0.7425435635890897, + "grad_norm": 1.7176990509033203, + "learning_rate": 3.773683162693149e-06, + "loss": 1.3015, + "step": 59402 + }, + { + "epoch": 0.7425685642141053, + "grad_norm": 2.3950705528259277, + "learning_rate": 3.77300029424023e-06, + "loss": 0.576, + "step": 59404 + }, + { + "epoch": 0.742593564839121, + "grad_norm": 0.0029347033705562353, + "learning_rate": 3.772317473211263e-06, + "loss": 0.5062, + "step": 59406 + }, + { + "epoch": 0.7426185654641366, + "grad_norm": 0.0026131414342671633, + "learning_rate": 3.771634699611443e-06, + "loss": 0.9001, + "step": 59408 + }, + { + "epoch": 0.7426435660891523, + "grad_norm": 1.5663208961486816, + "learning_rate": 3.7709519734459744e-06, + "loss": 0.5461, + "step": 59410 + }, + { + "epoch": 0.7426685667141678, + "grad_norm": 5.950190544128418, + "learning_rate": 3.770269294720056e-06, + "loss": 0.7426, + "step": 59412 + }, + { + "epoch": 0.7426935673391835, + "grad_norm": 2.8956167697906494, + "learning_rate": 3.769586663438881e-06, + "loss": 0.6148, + "step": 59414 + }, + { + "epoch": 0.7427185679641991, + "grad_norm": 4.025360107421875, + "learning_rate": 3.768904079607657e-06, + "loss": 1.4501, + "step": 59416 + }, + { + "epoch": 0.7427435685892148, + "grad_norm": 0.7694838643074036, + "learning_rate": 3.768221543231575e-06, + "loss": 0.6261, + "step": 59418 + }, + { + "epoch": 0.7427685692142304, + "grad_norm": 4.010732173919678, + "learning_rate": 3.7675390543158395e-06, + "loss": 1.3481, + "step": 59420 + }, + { + "epoch": 0.7427935698392459, + "grad_norm": 4.208222389221191, + "learning_rate": 3.7668566128656427e-06, + "loss": 0.9689, + "step": 59422 + }, + { + "epoch": 0.7428185704642616, + "grad_norm": 4.041647434234619, + "learning_rate": 3.7661742188861874e-06, + "loss": 0.8962, + "step": 59424 + }, + { + "epoch": 0.7428435710892772, + "grad_norm": 2.4502429962158203, + "learning_rate": 3.7654918723826674e-06, + "loss": 0.8068, + "step": 59426 + }, + { + "epoch": 0.7428685717142929, + "grad_norm": 7.717380523681641, + "learning_rate": 3.7648095733602764e-06, + "loss": 0.6459, + "step": 59428 + }, + { + "epoch": 0.7428935723393085, + "grad_norm": 4.524613857269287, + "learning_rate": 3.764127321824218e-06, + "loss": 0.6089, + "step": 59430 + }, + { + "epoch": 0.7429185729643241, + "grad_norm": 3.1216228008270264, + "learning_rate": 3.7634451177796793e-06, + "loss": 0.686, + "step": 59432 + }, + { + "epoch": 0.7429435735893397, + "grad_norm": 7.6203789710998535, + "learning_rate": 3.762762961231865e-06, + "loss": 1.6578, + "step": 59434 + }, + { + "epoch": 0.7429685742143554, + "grad_norm": 4.582988262176514, + "learning_rate": 3.762080852185963e-06, + "loss": 0.9876, + "step": 59436 + }, + { + "epoch": 0.742993574839371, + "grad_norm": 0.0006975048454478383, + "learning_rate": 3.761398790647174e-06, + "loss": 0.0, + "step": 59438 + }, + { + "epoch": 0.7430185754643867, + "grad_norm": 6.077481746673584, + "learning_rate": 3.7607167766206898e-06, + "loss": 0.67, + "step": 59440 + }, + { + "epoch": 0.7430435760894022, + "grad_norm": 1.7763547897338867, + "learning_rate": 3.7600348101117e-06, + "loss": 0.888, + "step": 59442 + }, + { + "epoch": 0.7430685767144178, + "grad_norm": 5.228619575500488, + "learning_rate": 3.7593528911254073e-06, + "loss": 1.5357, + "step": 59444 + }, + { + "epoch": 0.7430935773394335, + "grad_norm": 0.0005137133994139731, + "learning_rate": 3.7586710196669962e-06, + "loss": 0.0, + "step": 59446 + }, + { + "epoch": 0.7431185779644491, + "grad_norm": 2.611825704574585, + "learning_rate": 3.757989195741668e-06, + "loss": 0.4396, + "step": 59448 + }, + { + "epoch": 0.7431435785894648, + "grad_norm": 0.02617204561829567, + "learning_rate": 3.7573074193546077e-06, + "loss": 0.0003, + "step": 59450 + }, + { + "epoch": 0.7431685792144803, + "grad_norm": 3.526564121246338, + "learning_rate": 3.7566256905110145e-06, + "loss": 0.423, + "step": 59452 + }, + { + "epoch": 0.743193579839496, + "grad_norm": 3.207612991333008, + "learning_rate": 3.755944009216077e-06, + "loss": 0.2399, + "step": 59454 + }, + { + "epoch": 0.7432185804645116, + "grad_norm": 3.281724214553833, + "learning_rate": 3.755262375474984e-06, + "loss": 0.9136, + "step": 59456 + }, + { + "epoch": 0.7432435810895273, + "grad_norm": 0.0007701884023845196, + "learning_rate": 3.7545807892929333e-06, + "loss": 1.1682, + "step": 59458 + }, + { + "epoch": 0.7432685817145429, + "grad_norm": 7.255830764770508, + "learning_rate": 3.7538992506751083e-06, + "loss": 0.717, + "step": 59460 + }, + { + "epoch": 0.7432935823395584, + "grad_norm": 3.798189401626587, + "learning_rate": 3.753217759626706e-06, + "loss": 1.4625, + "step": 59462 + }, + { + "epoch": 0.7433185829645741, + "grad_norm": 3.2077956199645996, + "learning_rate": 3.752536316152915e-06, + "loss": 0.7495, + "step": 59464 + }, + { + "epoch": 0.7433435835895897, + "grad_norm": 6.390495300292969, + "learning_rate": 3.7518549202589206e-06, + "loss": 0.314, + "step": 59466 + }, + { + "epoch": 0.7433685842146054, + "grad_norm": 2.44940447807312, + "learning_rate": 3.7511735719499188e-06, + "loss": 0.7384, + "step": 59468 + }, + { + "epoch": 0.743393584839621, + "grad_norm": 3.3464014530181885, + "learning_rate": 3.750492271231091e-06, + "loss": 1.4119, + "step": 59470 + }, + { + "epoch": 0.7434185854646366, + "grad_norm": 3.443540334701538, + "learning_rate": 3.7498110181076355e-06, + "loss": 1.921, + "step": 59472 + }, + { + "epoch": 0.7434435860896522, + "grad_norm": 1.4156246185302734, + "learning_rate": 3.749129812584731e-06, + "loss": 0.9276, + "step": 59474 + }, + { + "epoch": 0.7434685867146679, + "grad_norm": 8.234601020812988, + "learning_rate": 3.7484486546675726e-06, + "loss": 1.0739, + "step": 59476 + }, + { + "epoch": 0.7434935873396835, + "grad_norm": 0.7180995345115662, + "learning_rate": 3.747767544361346e-06, + "loss": 0.0548, + "step": 59478 + }, + { + "epoch": 0.7435185879646992, + "grad_norm": 2.5926456451416016, + "learning_rate": 3.747086481671234e-06, + "loss": 0.6159, + "step": 59480 + }, + { + "epoch": 0.7435435885897147, + "grad_norm": 3.9303712844848633, + "learning_rate": 3.746405466602431e-06, + "loss": 0.7615, + "step": 59482 + }, + { + "epoch": 0.7435685892147303, + "grad_norm": 0.0005071815685369074, + "learning_rate": 3.745724499160116e-06, + "loss": 0.0, + "step": 59484 + }, + { + "epoch": 0.743593589839746, + "grad_norm": 0.1380506455898285, + "learning_rate": 3.7450435793494823e-06, + "loss": 0.002, + "step": 59486 + }, + { + "epoch": 0.7436185904647616, + "grad_norm": 3.343674659729004, + "learning_rate": 3.744362707175708e-06, + "loss": 0.9799, + "step": 59488 + }, + { + "epoch": 0.7436435910897773, + "grad_norm": 3.0158417224884033, + "learning_rate": 3.743681882643987e-06, + "loss": 0.454, + "step": 59490 + }, + { + "epoch": 0.7436685917147928, + "grad_norm": 4.0970025062561035, + "learning_rate": 3.7430011057594994e-06, + "loss": 1.0242, + "step": 59492 + }, + { + "epoch": 0.7436935923398085, + "grad_norm": 4.459066390991211, + "learning_rate": 3.7423203765274275e-06, + "loss": 0.8275, + "step": 59494 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 3.132816791534424, + "learning_rate": 3.741639694952962e-06, + "loss": 0.5878, + "step": 59496 + }, + { + "epoch": 0.7437435935898398, + "grad_norm": 5.678032398223877, + "learning_rate": 3.7409590610412805e-06, + "loss": 1.2299, + "step": 59498 + }, + { + "epoch": 0.7437685942148554, + "grad_norm": 2.828860282897949, + "learning_rate": 3.740278474797573e-06, + "loss": 0.2997, + "step": 59500 + }, + { + "epoch": 0.743793594839871, + "grad_norm": 2.571150064468384, + "learning_rate": 3.7395979362270153e-06, + "loss": 1.4823, + "step": 59502 + }, + { + "epoch": 0.7438185954648866, + "grad_norm": 5.510653495788574, + "learning_rate": 3.7389174453347986e-06, + "loss": 1.4382, + "step": 59504 + }, + { + "epoch": 0.7438435960899022, + "grad_norm": 0.010292493738234043, + "learning_rate": 3.7382370021261006e-06, + "loss": 0.3853, + "step": 59506 + }, + { + "epoch": 0.7438685967149179, + "grad_norm": 4.474753379821777, + "learning_rate": 3.7375566066061007e-06, + "loss": 1.9248, + "step": 59508 + }, + { + "epoch": 0.7438935973399335, + "grad_norm": 2.2287418842315674, + "learning_rate": 3.736876258779988e-06, + "loss": 0.1984, + "step": 59510 + }, + { + "epoch": 0.7439185979649491, + "grad_norm": 2.6581666469573975, + "learning_rate": 3.7361959586529373e-06, + "loss": 0.1759, + "step": 59512 + }, + { + "epoch": 0.7439435985899647, + "grad_norm": 2.9191553592681885, + "learning_rate": 3.735515706230135e-06, + "loss": 0.4649, + "step": 59514 + }, + { + "epoch": 0.7439685992149804, + "grad_norm": 4.137172698974609, + "learning_rate": 3.7348355015167593e-06, + "loss": 0.6916, + "step": 59516 + }, + { + "epoch": 0.743993599839996, + "grad_norm": 1.4876400232315063, + "learning_rate": 3.734155344517987e-06, + "loss": 0.9371, + "step": 59518 + }, + { + "epoch": 0.7440186004650117, + "grad_norm": 5.728714942932129, + "learning_rate": 3.7334752352390047e-06, + "loss": 1.1439, + "step": 59520 + }, + { + "epoch": 0.7440436010900272, + "grad_norm": 3.7732038497924805, + "learning_rate": 3.7327951736849856e-06, + "loss": 0.6913, + "step": 59522 + }, + { + "epoch": 0.7440686017150429, + "grad_norm": 1.022112250328064, + "learning_rate": 3.732115159861115e-06, + "loss": 0.14, + "step": 59524 + }, + { + "epoch": 0.7440936023400585, + "grad_norm": 2.7060089111328125, + "learning_rate": 3.731435193772567e-06, + "loss": 0.4619, + "step": 59526 + }, + { + "epoch": 0.7441186029650741, + "grad_norm": 5.135191440582275, + "learning_rate": 3.73075527542452e-06, + "loss": 1.4963, + "step": 59528 + }, + { + "epoch": 0.7441436035900898, + "grad_norm": 0.0013267018366605043, + "learning_rate": 3.7300754048221634e-06, + "loss": 1.094, + "step": 59530 + }, + { + "epoch": 0.7441686042151053, + "grad_norm": 0.0011934509966522455, + "learning_rate": 3.729395581970657e-06, + "loss": 0.9949, + "step": 59532 + }, + { + "epoch": 0.744193604840121, + "grad_norm": 6.788021087646484, + "learning_rate": 3.728715806875193e-06, + "loss": 0.4142, + "step": 59534 + }, + { + "epoch": 0.7442186054651366, + "grad_norm": 2.5066637992858887, + "learning_rate": 3.7280360795409367e-06, + "loss": 0.3535, + "step": 59536 + }, + { + "epoch": 0.7442436060901523, + "grad_norm": 0.6365141272544861, + "learning_rate": 3.7273563999730757e-06, + "loss": 0.1172, + "step": 59538 + }, + { + "epoch": 0.7442686067151679, + "grad_norm": 6.67377233505249, + "learning_rate": 3.726676768176777e-06, + "loss": 1.0531, + "step": 59540 + }, + { + "epoch": 0.7442936073401835, + "grad_norm": 0.001126434886828065, + "learning_rate": 3.725997184157225e-06, + "loss": 0.5659, + "step": 59542 + }, + { + "epoch": 0.7443186079651991, + "grad_norm": 1.5136607885360718, + "learning_rate": 3.7253176479195905e-06, + "loss": 0.604, + "step": 59544 + }, + { + "epoch": 0.7443436085902148, + "grad_norm": 2.025891065597534, + "learning_rate": 3.724638159469046e-06, + "loss": 0.1085, + "step": 59546 + }, + { + "epoch": 0.7443686092152304, + "grad_norm": 4.116602897644043, + "learning_rate": 3.723958718810774e-06, + "loss": 1.5334, + "step": 59548 + }, + { + "epoch": 0.744393609840246, + "grad_norm": 0.0005379143403843045, + "learning_rate": 3.723279325949941e-06, + "loss": 0.2694, + "step": 59550 + }, + { + "epoch": 0.7444186104652616, + "grad_norm": 0.6835973858833313, + "learning_rate": 3.722599980891726e-06, + "loss": 0.9629, + "step": 59552 + }, + { + "epoch": 0.7444436110902772, + "grad_norm": 4.309904098510742, + "learning_rate": 3.7219206836413035e-06, + "loss": 0.485, + "step": 59554 + }, + { + "epoch": 0.7444686117152929, + "grad_norm": 1.5853102207183838, + "learning_rate": 3.721241434203847e-06, + "loss": 0.4176, + "step": 59556 + }, + { + "epoch": 0.7444936123403085, + "grad_norm": 5.896960258483887, + "learning_rate": 3.720562232584527e-06, + "loss": 0.8601, + "step": 59558 + }, + { + "epoch": 0.7445186129653242, + "grad_norm": 4.450893878936768, + "learning_rate": 3.719883078788513e-06, + "loss": 1.8456, + "step": 59560 + }, + { + "epoch": 0.7445436135903397, + "grad_norm": 1.661872148513794, + "learning_rate": 3.719203972820986e-06, + "loss": 0.7406, + "step": 59562 + }, + { + "epoch": 0.7445686142153554, + "grad_norm": 4.252975940704346, + "learning_rate": 3.7185249146871093e-06, + "loss": 0.7404, + "step": 59564 + }, + { + "epoch": 0.744593614840371, + "grad_norm": 2.5285072326660156, + "learning_rate": 3.717845904392059e-06, + "loss": 0.7116, + "step": 59566 + }, + { + "epoch": 0.7446186154653867, + "grad_norm": 1.423426628112793, + "learning_rate": 3.717166941941013e-06, + "loss": 0.7935, + "step": 59568 + }, + { + "epoch": 0.7446436160904023, + "grad_norm": 5.208241939544678, + "learning_rate": 3.7164880273391268e-06, + "loss": 1.3255, + "step": 59570 + }, + { + "epoch": 0.7446686167154178, + "grad_norm": 29.95479393005371, + "learning_rate": 3.7158091605915837e-06, + "loss": 4.2636, + "step": 59572 + }, + { + "epoch": 0.7446936173404335, + "grad_norm": 0.000739287817850709, + "learning_rate": 3.7151303417035456e-06, + "loss": 1.0497, + "step": 59574 + }, + { + "epoch": 0.7447186179654491, + "grad_norm": 2.8155250549316406, + "learning_rate": 3.7144515706801866e-06, + "loss": 0.3924, + "step": 59576 + }, + { + "epoch": 0.7447436185904648, + "grad_norm": 3.2334887981414795, + "learning_rate": 3.713772847526679e-06, + "loss": 1.2415, + "step": 59578 + }, + { + "epoch": 0.7447686192154804, + "grad_norm": 3.487323045730591, + "learning_rate": 3.7130941722481837e-06, + "loss": 0.4086, + "step": 59580 + }, + { + "epoch": 0.744793619840496, + "grad_norm": 0.9077767729759216, + "learning_rate": 3.7124155448498822e-06, + "loss": 0.0293, + "step": 59582 + }, + { + "epoch": 0.7448186204655116, + "grad_norm": 2.3750975131988525, + "learning_rate": 3.7117369653369282e-06, + "loss": 0.8852, + "step": 59584 + }, + { + "epoch": 0.7448436210905273, + "grad_norm": 1.9575448036193848, + "learning_rate": 3.7110584337144996e-06, + "loss": 0.0986, + "step": 59586 + }, + { + "epoch": 0.7448686217155429, + "grad_norm": 3.8226935863494873, + "learning_rate": 3.7103799499877557e-06, + "loss": 0.2921, + "step": 59588 + }, + { + "epoch": 0.7448936223405586, + "grad_norm": 3.150357246398926, + "learning_rate": 3.709701514161871e-06, + "loss": 0.4549, + "step": 59590 + }, + { + "epoch": 0.7449186229655741, + "grad_norm": 3.1414859294891357, + "learning_rate": 3.709023126242012e-06, + "loss": 0.4987, + "step": 59592 + }, + { + "epoch": 0.7449436235905897, + "grad_norm": 4.514470100402832, + "learning_rate": 3.7083447862333434e-06, + "loss": 0.5535, + "step": 59594 + }, + { + "epoch": 0.7449686242156054, + "grad_norm": 5.028131484985352, + "learning_rate": 3.7076664941410313e-06, + "loss": 0.9885, + "step": 59596 + }, + { + "epoch": 0.744993624840621, + "grad_norm": 2.5936906337738037, + "learning_rate": 3.706988249970237e-06, + "loss": 1.112, + "step": 59598 + }, + { + "epoch": 0.7450186254656367, + "grad_norm": 6.142353534698486, + "learning_rate": 3.7063100537261354e-06, + "loss": 1.1686, + "step": 59600 + }, + { + "epoch": 0.7450436260906522, + "grad_norm": 6.316567420959473, + "learning_rate": 3.705631905413881e-06, + "loss": 2.0301, + "step": 59602 + }, + { + "epoch": 0.7450686267156679, + "grad_norm": 0.053121570497751236, + "learning_rate": 3.7049538050386456e-06, + "loss": 1.3319, + "step": 59604 + }, + { + "epoch": 0.7450936273406835, + "grad_norm": 7.539845943450928, + "learning_rate": 3.704275752605595e-06, + "loss": 3.016, + "step": 59606 + }, + { + "epoch": 0.7451186279656992, + "grad_norm": 6.082583427429199, + "learning_rate": 3.7035977481198893e-06, + "loss": 1.2811, + "step": 59608 + }, + { + "epoch": 0.7451436285907148, + "grad_norm": 4.471166610717773, + "learning_rate": 3.702919791586693e-06, + "loss": 1.5935, + "step": 59610 + }, + { + "epoch": 0.7451686292157303, + "grad_norm": 3.9433863162994385, + "learning_rate": 3.7022418830111663e-06, + "loss": 1.9897, + "step": 59612 + }, + { + "epoch": 0.745193629840746, + "grad_norm": 0.00031011452665552497, + "learning_rate": 3.701564022398474e-06, + "loss": 0.0339, + "step": 59614 + }, + { + "epoch": 0.7452186304657616, + "grad_norm": 0.8473196029663086, + "learning_rate": 3.7008862097537834e-06, + "loss": 0.3696, + "step": 59616 + }, + { + "epoch": 0.7452436310907773, + "grad_norm": 2.977012872695923, + "learning_rate": 3.7002084450822484e-06, + "loss": 1.1769, + "step": 59618 + }, + { + "epoch": 0.7452686317157929, + "grad_norm": 0.7710309028625488, + "learning_rate": 3.6995307283890392e-06, + "loss": 0.0304, + "step": 59620 + }, + { + "epoch": 0.7452936323408085, + "grad_norm": 7.6617913246154785, + "learning_rate": 3.6988530596793136e-06, + "loss": 0.9971, + "step": 59622 + }, + { + "epoch": 0.7453186329658241, + "grad_norm": 3.9168834686279297, + "learning_rate": 3.6981754389582315e-06, + "loss": 1.1986, + "step": 59624 + }, + { + "epoch": 0.7453436335908398, + "grad_norm": 5.132206916809082, + "learning_rate": 3.69749786623095e-06, + "loss": 1.5967, + "step": 59626 + }, + { + "epoch": 0.7453686342158554, + "grad_norm": 2.5985302925109863, + "learning_rate": 3.696820341502634e-06, + "loss": 0.0722, + "step": 59628 + }, + { + "epoch": 0.7453936348408711, + "grad_norm": 2.5527710914611816, + "learning_rate": 3.696142864778446e-06, + "loss": 1.8827, + "step": 59630 + }, + { + "epoch": 0.7454186354658866, + "grad_norm": 3.218186378479004, + "learning_rate": 3.695465436063539e-06, + "loss": 1.025, + "step": 59632 + }, + { + "epoch": 0.7454436360909023, + "grad_norm": 2.969696521759033, + "learning_rate": 3.6947880553630834e-06, + "loss": 1.1861, + "step": 59634 + }, + { + "epoch": 0.7454686367159179, + "grad_norm": 2.58611798286438, + "learning_rate": 3.6941107226822224e-06, + "loss": 1.0054, + "step": 59636 + }, + { + "epoch": 0.7454936373409335, + "grad_norm": 0.7542201280593872, + "learning_rate": 3.693433438026124e-06, + "loss": 0.6521, + "step": 59638 + }, + { + "epoch": 0.7455186379659492, + "grad_norm": 5.5778913497924805, + "learning_rate": 3.6927562013999464e-06, + "loss": 0.9211, + "step": 59640 + }, + { + "epoch": 0.7455436385909647, + "grad_norm": 5.410684108734131, + "learning_rate": 3.6920790128088435e-06, + "loss": 0.8349, + "step": 59642 + }, + { + "epoch": 0.7455686392159804, + "grad_norm": 1.0994782447814941, + "learning_rate": 3.691401872257977e-06, + "loss": 0.4874, + "step": 59644 + }, + { + "epoch": 0.745593639840996, + "grad_norm": 4.251780986785889, + "learning_rate": 3.6907247797524993e-06, + "loss": 1.3875, + "step": 59646 + }, + { + "epoch": 0.7456186404660117, + "grad_norm": 3.1709280014038086, + "learning_rate": 3.690047735297576e-06, + "loss": 1.3656, + "step": 59648 + }, + { + "epoch": 0.7456436410910273, + "grad_norm": 7.4666290283203125, + "learning_rate": 3.68937073889835e-06, + "loss": 0.8013, + "step": 59650 + }, + { + "epoch": 0.7456686417160429, + "grad_norm": 2.0330233573913574, + "learning_rate": 3.688693790559984e-06, + "loss": 1.3896, + "step": 59652 + }, + { + "epoch": 0.7456936423410585, + "grad_norm": 3.6812565326690674, + "learning_rate": 3.6880168902876377e-06, + "loss": 1.208, + "step": 59654 + }, + { + "epoch": 0.7457186429660742, + "grad_norm": 2.004899024963379, + "learning_rate": 3.6873400380864575e-06, + "loss": 1.4735, + "step": 59656 + }, + { + "epoch": 0.7457436435910898, + "grad_norm": 0.9738825559616089, + "learning_rate": 3.686663233961607e-06, + "loss": 0.1342, + "step": 59658 + }, + { + "epoch": 0.7457686442161054, + "grad_norm": 4.816232204437256, + "learning_rate": 3.6859864779182366e-06, + "loss": 1.6412, + "step": 59660 + }, + { + "epoch": 0.745793644841121, + "grad_norm": 0.005170738324522972, + "learning_rate": 3.6853097699615004e-06, + "loss": 0.2467, + "step": 59662 + }, + { + "epoch": 0.7458186454661366, + "grad_norm": 3.0318453311920166, + "learning_rate": 3.684633110096548e-06, + "loss": 0.8276, + "step": 59664 + }, + { + "epoch": 0.7458436460911523, + "grad_norm": 0.6394221186637878, + "learning_rate": 3.683956498328537e-06, + "loss": 0.0226, + "step": 59666 + }, + { + "epoch": 0.7458686467161679, + "grad_norm": 3.351316213607788, + "learning_rate": 3.6832799346626235e-06, + "loss": 1.2816, + "step": 59668 + }, + { + "epoch": 0.7458936473411836, + "grad_norm": 0.000634324736893177, + "learning_rate": 3.6826034191039537e-06, + "loss": 0.0, + "step": 59670 + }, + { + "epoch": 0.7459186479661991, + "grad_norm": 2.329216957092285, + "learning_rate": 3.6819269516576862e-06, + "loss": 0.4026, + "step": 59672 + }, + { + "epoch": 0.7459436485912148, + "grad_norm": 4.865286827087402, + "learning_rate": 3.6812505323289693e-06, + "loss": 1.6764, + "step": 59674 + }, + { + "epoch": 0.7459686492162304, + "grad_norm": 2.532571792602539, + "learning_rate": 3.6805741611229516e-06, + "loss": 0.5981, + "step": 59676 + }, + { + "epoch": 0.745993649841246, + "grad_norm": 1.3591989278793335, + "learning_rate": 3.6798978380447912e-06, + "loss": 0.0468, + "step": 59678 + }, + { + "epoch": 0.7460186504662617, + "grad_norm": 2.6588850021362305, + "learning_rate": 3.679221563099631e-06, + "loss": 1.1341, + "step": 59680 + }, + { + "epoch": 0.7460436510912772, + "grad_norm": 3.7558887004852295, + "learning_rate": 3.6785453362926293e-06, + "loss": 0.3607, + "step": 59682 + }, + { + "epoch": 0.7460686517162929, + "grad_norm": 3.9288854598999023, + "learning_rate": 3.6778691576289294e-06, + "loss": 2.7884, + "step": 59684 + }, + { + "epoch": 0.7460936523413085, + "grad_norm": 4.39567756652832, + "learning_rate": 3.6771930271136904e-06, + "loss": 1.4382, + "step": 59686 + }, + { + "epoch": 0.7461186529663242, + "grad_norm": 6.041753768920898, + "learning_rate": 3.676516944752048e-06, + "loss": 1.3489, + "step": 59688 + }, + { + "epoch": 0.7461436535913398, + "grad_norm": 7.724535942077637, + "learning_rate": 3.675840910549159e-06, + "loss": 1.0799, + "step": 59690 + }, + { + "epoch": 0.7461686542163554, + "grad_norm": 4.921268463134766, + "learning_rate": 3.675164924510174e-06, + "loss": 0.8781, + "step": 59692 + }, + { + "epoch": 0.746193654841371, + "grad_norm": 6.876956462860107, + "learning_rate": 3.6744889866402356e-06, + "loss": 0.1687, + "step": 59694 + }, + { + "epoch": 0.7462186554663867, + "grad_norm": 3.322070360183716, + "learning_rate": 3.6738130969444976e-06, + "loss": 2.3403, + "step": 59696 + }, + { + "epoch": 0.7462436560914023, + "grad_norm": 2.910789728164673, + "learning_rate": 3.673137255428101e-06, + "loss": 1.1368, + "step": 59698 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.3370034694671631, + "learning_rate": 3.6724614620962007e-06, + "loss": 0.3521, + "step": 59700 + }, + { + "epoch": 0.7462936573414335, + "grad_norm": 7.58681583404541, + "learning_rate": 3.6717857169539382e-06, + "loss": 1.7869, + "step": 59702 + }, + { + "epoch": 0.7463186579664491, + "grad_norm": 3.6902947425842285, + "learning_rate": 3.6711100200064576e-06, + "loss": 1.8046, + "step": 59704 + }, + { + "epoch": 0.7463436585914648, + "grad_norm": 9.484880447387695, + "learning_rate": 3.670434371258912e-06, + "loss": 1.5374, + "step": 59706 + }, + { + "epoch": 0.7463686592164804, + "grad_norm": 4.025233268737793, + "learning_rate": 3.6697587707164384e-06, + "loss": 1.7739, + "step": 59708 + }, + { + "epoch": 0.7463936598414961, + "grad_norm": 2.587662935256958, + "learning_rate": 3.6690832183841918e-06, + "loss": 0.8207, + "step": 59710 + }, + { + "epoch": 0.7464186604665116, + "grad_norm": 5.2406134605407715, + "learning_rate": 3.6684077142673115e-06, + "loss": 0.7906, + "step": 59712 + }, + { + "epoch": 0.7464436610915273, + "grad_norm": 6.527905464172363, + "learning_rate": 3.6677322583709386e-06, + "loss": 0.9826, + "step": 59714 + }, + { + "epoch": 0.7464686617165429, + "grad_norm": 0.0015259860083460808, + "learning_rate": 3.6670568507002256e-06, + "loss": 0.164, + "step": 59716 + }, + { + "epoch": 0.7464936623415586, + "grad_norm": 0.023729929700493813, + "learning_rate": 3.666381491260309e-06, + "loss": 0.4173, + "step": 59718 + }, + { + "epoch": 0.7465186629665742, + "grad_norm": 3.314786672592163, + "learning_rate": 3.6657061800563377e-06, + "loss": 1.3148, + "step": 59720 + }, + { + "epoch": 0.7465436635915897, + "grad_norm": 3.2419328689575195, + "learning_rate": 3.6650309170934496e-06, + "loss": 0.2462, + "step": 59722 + }, + { + "epoch": 0.7465686642166054, + "grad_norm": 6.187475681304932, + "learning_rate": 3.664355702376793e-06, + "loss": 1.1095, + "step": 59724 + }, + { + "epoch": 0.746593664841621, + "grad_norm": 3.7477686405181885, + "learning_rate": 3.663680535911508e-06, + "loss": 0.9711, + "step": 59726 + }, + { + "epoch": 0.7466186654666367, + "grad_norm": 4.825451374053955, + "learning_rate": 3.663005417702733e-06, + "loss": 0.8104, + "step": 59728 + }, + { + "epoch": 0.7466436660916523, + "grad_norm": 3.9006550312042236, + "learning_rate": 3.662330347755615e-06, + "loss": 0.281, + "step": 59730 + }, + { + "epoch": 0.7466686667166679, + "grad_norm": 0.0004734551766887307, + "learning_rate": 3.6616553260752898e-06, + "loss": 0.5025, + "step": 59732 + }, + { + "epoch": 0.7466936673416835, + "grad_norm": 0.00031356606632471085, + "learning_rate": 3.660980352666905e-06, + "loss": 0.1449, + "step": 59734 + }, + { + "epoch": 0.7467186679666992, + "grad_norm": 1.397139072418213, + "learning_rate": 3.6603054275355944e-06, + "loss": 0.2682, + "step": 59736 + }, + { + "epoch": 0.7467436685917148, + "grad_norm": 0.1620813012123108, + "learning_rate": 3.659630550686504e-06, + "loss": 0.22, + "step": 59738 + }, + { + "epoch": 0.7467686692167305, + "grad_norm": 0.5338738560676575, + "learning_rate": 3.6589557221247706e-06, + "loss": 0.7225, + "step": 59740 + }, + { + "epoch": 0.746793669841746, + "grad_norm": 0.10708481073379517, + "learning_rate": 3.65828094185553e-06, + "loss": 0.3199, + "step": 59742 + }, + { + "epoch": 0.7468186704667616, + "grad_norm": 3.9373600482940674, + "learning_rate": 3.657606209883929e-06, + "loss": 0.4429, + "step": 59744 + }, + { + "epoch": 0.7468436710917773, + "grad_norm": 3.478947162628174, + "learning_rate": 3.6569315262150984e-06, + "loss": 0.6073, + "step": 59746 + }, + { + "epoch": 0.7468686717167929, + "grad_norm": 4.490262985229492, + "learning_rate": 3.656256890854184e-06, + "loss": 1.1348, + "step": 59748 + }, + { + "epoch": 0.7468936723418086, + "grad_norm": 4.152325630187988, + "learning_rate": 3.6555823038063166e-06, + "loss": 1.0748, + "step": 59750 + }, + { + "epoch": 0.7469186729668241, + "grad_norm": 3.6379997730255127, + "learning_rate": 3.654907765076641e-06, + "loss": 1.1576, + "step": 59752 + }, + { + "epoch": 0.7469436735918398, + "grad_norm": 3.6746439933776855, + "learning_rate": 3.6542332746702892e-06, + "loss": 1.1647, + "step": 59754 + }, + { + "epoch": 0.7469686742168554, + "grad_norm": 4.843943119049072, + "learning_rate": 3.6535588325923977e-06, + "loss": 0.9246, + "step": 59756 + }, + { + "epoch": 0.7469936748418711, + "grad_norm": 6.64115047454834, + "learning_rate": 3.6528844388481064e-06, + "loss": 0.3457, + "step": 59758 + }, + { + "epoch": 0.7470186754668867, + "grad_norm": 0.001572457724250853, + "learning_rate": 3.6522100934425476e-06, + "loss": 0.7974, + "step": 59760 + }, + { + "epoch": 0.7470436760919023, + "grad_norm": 4.569797515869141, + "learning_rate": 3.651535796380862e-06, + "loss": 1.0077, + "step": 59762 + }, + { + "epoch": 0.7470686767169179, + "grad_norm": 2.7192978858947754, + "learning_rate": 3.650861547668182e-06, + "loss": 0.6482, + "step": 59764 + }, + { + "epoch": 0.7470936773419335, + "grad_norm": 0.13129572570323944, + "learning_rate": 3.6501873473096394e-06, + "loss": 0.2038, + "step": 59766 + }, + { + "epoch": 0.7471186779669492, + "grad_norm": 2.740543842315674, + "learning_rate": 3.6495131953103745e-06, + "loss": 0.932, + "step": 59768 + }, + { + "epoch": 0.7471436785919648, + "grad_norm": 12.45435619354248, + "learning_rate": 3.6488390916755165e-06, + "loss": 1.4489, + "step": 59770 + }, + { + "epoch": 0.7471686792169804, + "grad_norm": 4.649338245391846, + "learning_rate": 3.6481650364102052e-06, + "loss": 0.3464, + "step": 59772 + }, + { + "epoch": 0.747193679841996, + "grad_norm": 3.6069722175598145, + "learning_rate": 3.6474910295195677e-06, + "loss": 0.5947, + "step": 59774 + }, + { + "epoch": 0.7472186804670117, + "grad_norm": 4.198913097381592, + "learning_rate": 3.6468170710087424e-06, + "loss": 1.02, + "step": 59776 + }, + { + "epoch": 0.7472436810920273, + "grad_norm": 2.7090816497802734, + "learning_rate": 3.646143160882861e-06, + "loss": 0.7338, + "step": 59778 + }, + { + "epoch": 0.747268681717043, + "grad_norm": 3.9677822589874268, + "learning_rate": 3.6454692991470506e-06, + "loss": 1.5104, + "step": 59780 + }, + { + "epoch": 0.7472936823420585, + "grad_norm": 2.146491765975952, + "learning_rate": 3.6447954858064514e-06, + "loss": 0.4086, + "step": 59782 + }, + { + "epoch": 0.7473186829670742, + "grad_norm": 0.29167336225509644, + "learning_rate": 3.644121720866188e-06, + "loss": 0.4895, + "step": 59784 + }, + { + "epoch": 0.7473436835920898, + "grad_norm": 9.541926383972168, + "learning_rate": 3.643448004331398e-06, + "loss": 1.336, + "step": 59786 + }, + { + "epoch": 0.7473686842171055, + "grad_norm": 2.85738205909729, + "learning_rate": 3.6427743362072056e-06, + "loss": 1.1195, + "step": 59788 + }, + { + "epoch": 0.7473936848421211, + "grad_norm": 5.013154983520508, + "learning_rate": 3.6421007164987477e-06, + "loss": 1.6893, + "step": 59790 + }, + { + "epoch": 0.7474186854671366, + "grad_norm": 2.631120204925537, + "learning_rate": 3.6414271452111514e-06, + "loss": 1.2068, + "step": 59792 + }, + { + "epoch": 0.7474436860921523, + "grad_norm": 3.9778892993927, + "learning_rate": 3.6407536223495443e-06, + "loss": 0.2242, + "step": 59794 + }, + { + "epoch": 0.7474686867171679, + "grad_norm": 1.9487510919570923, + "learning_rate": 3.640080147919062e-06, + "loss": 0.7428, + "step": 59796 + }, + { + "epoch": 0.7474936873421836, + "grad_norm": 4.922828197479248, + "learning_rate": 3.6394067219248253e-06, + "loss": 2.0892, + "step": 59798 + }, + { + "epoch": 0.7475186879671992, + "grad_norm": 3.851031541824341, + "learning_rate": 3.638733344371972e-06, + "loss": 1.5369, + "step": 59800 + }, + { + "epoch": 0.7475436885922148, + "grad_norm": 3.880697250366211, + "learning_rate": 3.638060015265622e-06, + "loss": 1.5494, + "step": 59802 + }, + { + "epoch": 0.7475686892172304, + "grad_norm": 3.9859447479248047, + "learning_rate": 3.6373867346109103e-06, + "loss": 1.5729, + "step": 59804 + }, + { + "epoch": 0.7475936898422461, + "grad_norm": 6.304971694946289, + "learning_rate": 3.636713502412962e-06, + "loss": 1.4223, + "step": 59806 + }, + { + "epoch": 0.7476186904672617, + "grad_norm": 2.516347646713257, + "learning_rate": 3.6360403186769012e-06, + "loss": 0.346, + "step": 59808 + }, + { + "epoch": 0.7476436910922774, + "grad_norm": 0.4992237985134125, + "learning_rate": 3.6353671834078595e-06, + "loss": 0.9118, + "step": 59810 + }, + { + "epoch": 0.7476686917172929, + "grad_norm": 6.0093302726745605, + "learning_rate": 3.634694096610958e-06, + "loss": 2.1359, + "step": 59812 + }, + { + "epoch": 0.7476936923423085, + "grad_norm": 5.640985012054443, + "learning_rate": 3.6340210582913304e-06, + "loss": 1.6096, + "step": 59814 + }, + { + "epoch": 0.7477186929673242, + "grad_norm": 1.4226429462432861, + "learning_rate": 3.6333480684540976e-06, + "loss": 0.062, + "step": 59816 + }, + { + "epoch": 0.7477436935923398, + "grad_norm": 4.612201690673828, + "learning_rate": 3.6326751271043825e-06, + "loss": 1.99, + "step": 59818 + }, + { + "epoch": 0.7477686942173555, + "grad_norm": 0.32911860942840576, + "learning_rate": 3.6320022342473162e-06, + "loss": 0.0692, + "step": 59820 + }, + { + "epoch": 0.747793694842371, + "grad_norm": 0.7341756820678711, + "learning_rate": 3.6313293898880175e-06, + "loss": 0.5088, + "step": 59822 + }, + { + "epoch": 0.7478186954673867, + "grad_norm": 0.0013760189758613706, + "learning_rate": 3.6306565940316165e-06, + "loss": 1.0172, + "step": 59824 + }, + { + "epoch": 0.7478436960924023, + "grad_norm": 5.31585693359375, + "learning_rate": 3.6299838466832304e-06, + "loss": 0.5901, + "step": 59826 + }, + { + "epoch": 0.747868696717418, + "grad_norm": 6.4748663902282715, + "learning_rate": 3.6293111478479905e-06, + "loss": 1.8615, + "step": 59828 + }, + { + "epoch": 0.7478936973424336, + "grad_norm": 6.852685928344727, + "learning_rate": 3.6286384975310165e-06, + "loss": 1.8837, + "step": 59830 + }, + { + "epoch": 0.7479186979674491, + "grad_norm": 2.8652424812316895, + "learning_rate": 3.6279658957374263e-06, + "loss": 1.3008, + "step": 59832 + }, + { + "epoch": 0.7479436985924648, + "grad_norm": 0.03077341802418232, + "learning_rate": 3.62729334247235e-06, + "loss": 0.6303, + "step": 59834 + }, + { + "epoch": 0.7479686992174804, + "grad_norm": 0.01862017624080181, + "learning_rate": 3.626620837740904e-06, + "loss": 0.5481, + "step": 59836 + }, + { + "epoch": 0.7479936998424961, + "grad_norm": 4.413544654846191, + "learning_rate": 3.6259483815482153e-06, + "loss": 2.3057, + "step": 59838 + }, + { + "epoch": 0.7480187004675117, + "grad_norm": 1.1380183696746826, + "learning_rate": 3.6252759738993993e-06, + "loss": 0.5046, + "step": 59840 + }, + { + "epoch": 0.7480437010925273, + "grad_norm": 4.209578037261963, + "learning_rate": 3.624603614799583e-06, + "loss": 0.8796, + "step": 59842 + }, + { + "epoch": 0.7480687017175429, + "grad_norm": 4.6001176834106445, + "learning_rate": 3.6239313042538848e-06, + "loss": 0.9253, + "step": 59844 + }, + { + "epoch": 0.7480937023425586, + "grad_norm": 0.0009816683596000075, + "learning_rate": 3.6232590422674197e-06, + "loss": 0.0816, + "step": 59846 + }, + { + "epoch": 0.7481187029675742, + "grad_norm": 0.00065487006213516, + "learning_rate": 3.622586828845316e-06, + "loss": 0.0, + "step": 59848 + }, + { + "epoch": 0.7481437035925899, + "grad_norm": 8.326077461242676, + "learning_rate": 3.621914663992685e-06, + "loss": 1.5727, + "step": 59850 + }, + { + "epoch": 0.7481687042176054, + "grad_norm": 4.215082168579102, + "learning_rate": 3.621242547714654e-06, + "loss": 1.2067, + "step": 59852 + }, + { + "epoch": 0.748193704842621, + "grad_norm": 4.076967716217041, + "learning_rate": 3.6205704800163344e-06, + "loss": 1.5681, + "step": 59854 + }, + { + "epoch": 0.7482187054676367, + "grad_norm": 0.10599690675735474, + "learning_rate": 3.619898460902851e-06, + "loss": 1.1334, + "step": 59856 + }, + { + "epoch": 0.7482437060926523, + "grad_norm": 3.2473347187042236, + "learning_rate": 3.6192264903793193e-06, + "loss": 1.1196, + "step": 59858 + }, + { + "epoch": 0.748268706717668, + "grad_norm": 3.5506937503814697, + "learning_rate": 3.6185545684508527e-06, + "loss": 1.3125, + "step": 59860 + }, + { + "epoch": 0.7482937073426835, + "grad_norm": 4.183155536651611, + "learning_rate": 3.6178826951225743e-06, + "loss": 0.9464, + "step": 59862 + }, + { + "epoch": 0.7483187079676992, + "grad_norm": 5.783804416656494, + "learning_rate": 3.617210870399597e-06, + "loss": 1.5406, + "step": 59864 + }, + { + "epoch": 0.7483437085927148, + "grad_norm": 0.4184447228908539, + "learning_rate": 3.6165390942870414e-06, + "loss": 0.0107, + "step": 59866 + }, + { + "epoch": 0.7483687092177305, + "grad_norm": 2.4023005962371826, + "learning_rate": 3.615867366790019e-06, + "loss": 0.0698, + "step": 59868 + }, + { + "epoch": 0.7483937098427461, + "grad_norm": 7.996368408203125, + "learning_rate": 3.6151956879136507e-06, + "loss": 0.3709, + "step": 59870 + }, + { + "epoch": 0.7484187104677616, + "grad_norm": 0.005250633228570223, + "learning_rate": 3.614524057663049e-06, + "loss": 1.2159, + "step": 59872 + }, + { + "epoch": 0.7484437110927773, + "grad_norm": 4.37506103515625, + "learning_rate": 3.613852476043326e-06, + "loss": 1.7446, + "step": 59874 + }, + { + "epoch": 0.748468711717793, + "grad_norm": 3.5160393714904785, + "learning_rate": 3.613180943059602e-06, + "loss": 1.072, + "step": 59876 + }, + { + "epoch": 0.7484937123428086, + "grad_norm": 6.8563385009765625, + "learning_rate": 3.6125094587169862e-06, + "loss": 2.1612, + "step": 59878 + }, + { + "epoch": 0.7485187129678242, + "grad_norm": 2.3679771423339844, + "learning_rate": 3.6118380230205984e-06, + "loss": 1.0538, + "step": 59880 + }, + { + "epoch": 0.7485437135928398, + "grad_norm": 4.409750461578369, + "learning_rate": 3.6111666359755495e-06, + "loss": 1.0762, + "step": 59882 + }, + { + "epoch": 0.7485687142178554, + "grad_norm": 4.437763214111328, + "learning_rate": 3.610495297586948e-06, + "loss": 0.4742, + "step": 59884 + }, + { + "epoch": 0.7485937148428711, + "grad_norm": 4.239274978637695, + "learning_rate": 3.6098240078599135e-06, + "loss": 0.7918, + "step": 59886 + }, + { + "epoch": 0.7486187154678867, + "grad_norm": 4.533013343811035, + "learning_rate": 3.609152766799553e-06, + "loss": 0.9145, + "step": 59888 + }, + { + "epoch": 0.7486437160929024, + "grad_norm": 2.3104684352874756, + "learning_rate": 3.6084815744109847e-06, + "loss": 1.2564, + "step": 59890 + }, + { + "epoch": 0.7486687167179179, + "grad_norm": 4.428776741027832, + "learning_rate": 3.6078104306993133e-06, + "loss": 1.4613, + "step": 59892 + }, + { + "epoch": 0.7486937173429336, + "grad_norm": 2.7360877990722656, + "learning_rate": 3.607139335669654e-06, + "loss": 0.6485, + "step": 59894 + }, + { + "epoch": 0.7487187179679492, + "grad_norm": 4.444235801696777, + "learning_rate": 3.6064682893271243e-06, + "loss": 0.7556, + "step": 59896 + }, + { + "epoch": 0.7487437185929648, + "grad_norm": 9.76106071472168, + "learning_rate": 3.6057972916768203e-06, + "loss": 2.1448, + "step": 59898 + }, + { + "epoch": 0.7487687192179805, + "grad_norm": 5.722609519958496, + "learning_rate": 3.6051263427238647e-06, + "loss": 2.1018, + "step": 59900 + }, + { + "epoch": 0.748793719842996, + "grad_norm": 0.00781263317912817, + "learning_rate": 3.604455442473358e-06, + "loss": 0.0438, + "step": 59902 + }, + { + "epoch": 0.7488187204680117, + "grad_norm": 3.5927953720092773, + "learning_rate": 3.603784590930418e-06, + "loss": 1.2635, + "step": 59904 + }, + { + "epoch": 0.7488437210930273, + "grad_norm": 4.821234703063965, + "learning_rate": 3.6031137881001467e-06, + "loss": 2.3476, + "step": 59906 + }, + { + "epoch": 0.748868721718043, + "grad_norm": 3.561187267303467, + "learning_rate": 3.6024430339876605e-06, + "loss": 0.8045, + "step": 59908 + }, + { + "epoch": 0.7488937223430586, + "grad_norm": 4.732290267944336, + "learning_rate": 3.601772328598062e-06, + "loss": 2.0018, + "step": 59910 + }, + { + "epoch": 0.7489187229680742, + "grad_norm": 3.1199331283569336, + "learning_rate": 3.601101671936458e-06, + "loss": 0.9453, + "step": 59912 + }, + { + "epoch": 0.7489437235930898, + "grad_norm": 11.11905574798584, + "learning_rate": 3.600431064007962e-06, + "loss": 0.8278, + "step": 59914 + }, + { + "epoch": 0.7489687242181055, + "grad_norm": 4.001079082489014, + "learning_rate": 3.599760504817674e-06, + "loss": 0.8525, + "step": 59916 + }, + { + "epoch": 0.7489937248431211, + "grad_norm": 0.07839231938123703, + "learning_rate": 3.5990899943707094e-06, + "loss": 0.0368, + "step": 59918 + }, + { + "epoch": 0.7490187254681367, + "grad_norm": 3.244093179702759, + "learning_rate": 3.5984195326721662e-06, + "loss": 0.242, + "step": 59920 + }, + { + "epoch": 0.7490437260931523, + "grad_norm": 3.781127452850342, + "learning_rate": 3.5977491197271586e-06, + "loss": 1.836, + "step": 59922 + }, + { + "epoch": 0.7490687267181679, + "grad_norm": 4.745610237121582, + "learning_rate": 3.597078755540788e-06, + "loss": 0.7427, + "step": 59924 + }, + { + "epoch": 0.7490937273431836, + "grad_norm": 4.890625953674316, + "learning_rate": 3.5964084401181566e-06, + "loss": 1.2049, + "step": 59926 + }, + { + "epoch": 0.7491187279681992, + "grad_norm": 0.01635635457932949, + "learning_rate": 3.595738173464376e-06, + "loss": 0.4102, + "step": 59928 + }, + { + "epoch": 0.7491437285932149, + "grad_norm": 2.967893362045288, + "learning_rate": 3.5950679555845446e-06, + "loss": 0.7245, + "step": 59930 + }, + { + "epoch": 0.7491687292182304, + "grad_norm": 0.00044408615212887526, + "learning_rate": 3.594397786483771e-06, + "loss": 1.5713, + "step": 59932 + }, + { + "epoch": 0.7491937298432461, + "grad_norm": 0.6251998543739319, + "learning_rate": 3.593727666167164e-06, + "loss": 0.0072, + "step": 59934 + }, + { + "epoch": 0.7492187304682617, + "grad_norm": 0.0023194882087409496, + "learning_rate": 3.593057594639814e-06, + "loss": 0.1396, + "step": 59936 + }, + { + "epoch": 0.7492437310932774, + "grad_norm": 2.145869493484497, + "learning_rate": 3.5923875719068348e-06, + "loss": 1.5397, + "step": 59938 + }, + { + "epoch": 0.749268731718293, + "grad_norm": 5.739567279815674, + "learning_rate": 3.591717597973323e-06, + "loss": 1.0672, + "step": 59940 + }, + { + "epoch": 0.7492937323433085, + "grad_norm": 4.06074857711792, + "learning_rate": 3.5910476728443865e-06, + "loss": 0.9325, + "step": 59942 + }, + { + "epoch": 0.7493187329683242, + "grad_norm": 2.8492164611816406, + "learning_rate": 3.5903777965251218e-06, + "loss": 0.9014, + "step": 59944 + }, + { + "epoch": 0.7493437335933398, + "grad_norm": 4.4735107421875, + "learning_rate": 3.5897079690206326e-06, + "loss": 1.2673, + "step": 59946 + }, + { + "epoch": 0.7493687342183555, + "grad_norm": 3.6430184841156006, + "learning_rate": 3.5890381903360282e-06, + "loss": 1.013, + "step": 59948 + }, + { + "epoch": 0.7493937348433711, + "grad_norm": 0.014002055861055851, + "learning_rate": 3.5883684604763956e-06, + "loss": 0.5949, + "step": 59950 + }, + { + "epoch": 0.7494187354683867, + "grad_norm": 3.1767401695251465, + "learning_rate": 3.5876987794468466e-06, + "loss": 1.4693, + "step": 59952 + }, + { + "epoch": 0.7494437360934023, + "grad_norm": 2.0350210666656494, + "learning_rate": 3.5870291472524722e-06, + "loss": 0.2232, + "step": 59954 + }, + { + "epoch": 0.749468736718418, + "grad_norm": 2.0944712162017822, + "learning_rate": 3.586359563898377e-06, + "loss": 0.5389, + "step": 59956 + }, + { + "epoch": 0.7494937373434336, + "grad_norm": 0.0011795952450484037, + "learning_rate": 3.5856900293896648e-06, + "loss": 0.5134, + "step": 59958 + }, + { + "epoch": 0.7495187379684493, + "grad_norm": 5.907042980194092, + "learning_rate": 3.585020543731429e-06, + "loss": 0.1854, + "step": 59960 + }, + { + "epoch": 0.7495437385934648, + "grad_norm": 2.2280781269073486, + "learning_rate": 3.5843511069287693e-06, + "loss": 0.8646, + "step": 59962 + }, + { + "epoch": 0.7495687392184804, + "grad_norm": 3.421247959136963, + "learning_rate": 3.5836817189867824e-06, + "loss": 0.6846, + "step": 59964 + }, + { + "epoch": 0.7495937398434961, + "grad_norm": 4.764869213104248, + "learning_rate": 3.583012379910571e-06, + "loss": 1.1729, + "step": 59966 + }, + { + "epoch": 0.7496187404685117, + "grad_norm": 0.0005482214619405568, + "learning_rate": 3.582343089705226e-06, + "loss": 0.7262, + "step": 59968 + }, + { + "epoch": 0.7496437410935274, + "grad_norm": 4.163729667663574, + "learning_rate": 3.5816738483758484e-06, + "loss": 0.9227, + "step": 59970 + }, + { + "epoch": 0.7496687417185429, + "grad_norm": 5.085626602172852, + "learning_rate": 3.581004655927539e-06, + "loss": 1.858, + "step": 59972 + }, + { + "epoch": 0.7496937423435586, + "grad_norm": 3.268043279647827, + "learning_rate": 3.5803355123653904e-06, + "loss": 0.636, + "step": 59974 + }, + { + "epoch": 0.7497187429685742, + "grad_norm": 3.4372782707214355, + "learning_rate": 3.5796664176944995e-06, + "loss": 0.8876, + "step": 59976 + }, + { + "epoch": 0.7497437435935899, + "grad_norm": 5.684850215911865, + "learning_rate": 3.578997371919957e-06, + "loss": 2.0099, + "step": 59978 + }, + { + "epoch": 0.7497687442186055, + "grad_norm": 0.0004909216077066958, + "learning_rate": 3.5783283750468657e-06, + "loss": 0.3096, + "step": 59980 + }, + { + "epoch": 0.749793744843621, + "grad_norm": 4.724764823913574, + "learning_rate": 3.5776594270803143e-06, + "loss": 1.6651, + "step": 59982 + }, + { + "epoch": 0.7498187454686367, + "grad_norm": 2.1794626712799072, + "learning_rate": 3.5769905280254e-06, + "loss": 0.1899, + "step": 59984 + }, + { + "epoch": 0.7498437460936523, + "grad_norm": 2.950965166091919, + "learning_rate": 3.5763216778872256e-06, + "loss": 0.9291, + "step": 59986 + }, + { + "epoch": 0.749868746718668, + "grad_norm": 0.8581411242485046, + "learning_rate": 3.575652876670869e-06, + "loss": 0.6638, + "step": 59988 + }, + { + "epoch": 0.7498937473436836, + "grad_norm": 2.2693018913269043, + "learning_rate": 3.5749841243814355e-06, + "loss": 1.3657, + "step": 59990 + }, + { + "epoch": 0.7499187479686992, + "grad_norm": 4.278670310974121, + "learning_rate": 3.5743154210240106e-06, + "loss": 1.0455, + "step": 59992 + }, + { + "epoch": 0.7499437485937148, + "grad_norm": 6.718434810638428, + "learning_rate": 3.573646766603691e-06, + "loss": 1.439, + "step": 59994 + }, + { + "epoch": 0.7499687492187305, + "grad_norm": 1.4811550378799438, + "learning_rate": 3.572978161125572e-06, + "loss": 0.8369, + "step": 59996 + }, + { + "epoch": 0.7499937498437461, + "grad_norm": 3.2876498699188232, + "learning_rate": 3.5723096045947393e-06, + "loss": 0.559, + "step": 59998 + }, + { + "epoch": 0.7500187504687618, + "grad_norm": 3.6030194759368896, + "learning_rate": 3.571641097016294e-06, + "loss": 1.0682, + "step": 60000 + }, + { + "epoch": 0.7500437510937773, + "grad_norm": 2.686718225479126, + "learning_rate": 3.5709726383953148e-06, + "loss": 2.4604, + "step": 60002 + }, + { + "epoch": 0.750068751718793, + "grad_norm": 1.1323280334472656, + "learning_rate": 3.5703042287369017e-06, + "loss": 1.0261, + "step": 60004 + }, + { + "epoch": 0.7500937523438086, + "grad_norm": 1.7711701393127441, + "learning_rate": 3.5696358680461398e-06, + "loss": 0.6751, + "step": 60006 + }, + { + "epoch": 0.7501187529688242, + "grad_norm": 0.0038409745320677757, + "learning_rate": 3.568967556328121e-06, + "loss": 0.9173, + "step": 60008 + }, + { + "epoch": 0.7501437535938399, + "grad_norm": 0.016216207295656204, + "learning_rate": 3.56829929358794e-06, + "loss": 0.6741, + "step": 60010 + }, + { + "epoch": 0.7501687542188554, + "grad_norm": 10.783707618713379, + "learning_rate": 3.567631079830681e-06, + "loss": 1.8562, + "step": 60012 + }, + { + "epoch": 0.7501937548438711, + "grad_norm": 0.9498391151428223, + "learning_rate": 3.566962915061435e-06, + "loss": 0.0332, + "step": 60014 + }, + { + "epoch": 0.7502187554688867, + "grad_norm": 4.38272762298584, + "learning_rate": 3.5662947992852857e-06, + "loss": 1.5189, + "step": 60016 + }, + { + "epoch": 0.7502437560939024, + "grad_norm": 3.7851779460906982, + "learning_rate": 3.5656267325073268e-06, + "loss": 0.9873, + "step": 60018 + }, + { + "epoch": 0.750268756718918, + "grad_norm": 3.112513780593872, + "learning_rate": 3.5649587147326468e-06, + "loss": 1.0093, + "step": 60020 + }, + { + "epoch": 0.7502937573439336, + "grad_norm": 0.0008714106515981257, + "learning_rate": 3.564290745966328e-06, + "loss": 0.1451, + "step": 60022 + }, + { + "epoch": 0.7503187579689492, + "grad_norm": 3.0570478439331055, + "learning_rate": 3.5636228262134642e-06, + "loss": 1.1582, + "step": 60024 + }, + { + "epoch": 0.7503437585939648, + "grad_norm": 3.1586673259735107, + "learning_rate": 3.5629549554791396e-06, + "loss": 2.0351, + "step": 60026 + }, + { + "epoch": 0.7503687592189805, + "grad_norm": 0.0044589899480342865, + "learning_rate": 3.5622871337684393e-06, + "loss": 0.0017, + "step": 60028 + }, + { + "epoch": 0.7503937598439961, + "grad_norm": 2.724426507949829, + "learning_rate": 3.561619361086446e-06, + "loss": 0.8229, + "step": 60030 + }, + { + "epoch": 0.7504187604690117, + "grad_norm": 2.1033236980438232, + "learning_rate": 3.560951637438249e-06, + "loss": 1.1563, + "step": 60032 + }, + { + "epoch": 0.7504437610940273, + "grad_norm": 1.3668187856674194, + "learning_rate": 3.5602839628289374e-06, + "loss": 0.4943, + "step": 60034 + }, + { + "epoch": 0.750468761719043, + "grad_norm": 6.501430511474609, + "learning_rate": 3.5596163372635895e-06, + "loss": 0.8596, + "step": 60036 + }, + { + "epoch": 0.7504937623440586, + "grad_norm": 0.21324032545089722, + "learning_rate": 3.5589487607472992e-06, + "loss": 0.0049, + "step": 60038 + }, + { + "epoch": 0.7505187629690743, + "grad_norm": 0.0012569194659590721, + "learning_rate": 3.558281233285138e-06, + "loss": 0.0421, + "step": 60040 + }, + { + "epoch": 0.7505437635940898, + "grad_norm": 0.5107778906822205, + "learning_rate": 3.5576137548821956e-06, + "loss": 0.0449, + "step": 60042 + }, + { + "epoch": 0.7505687642191055, + "grad_norm": 0.940555989742279, + "learning_rate": 3.55694632554356e-06, + "loss": 0.1254, + "step": 60044 + }, + { + "epoch": 0.7505937648441211, + "grad_norm": 1.7030763626098633, + "learning_rate": 3.556278945274305e-06, + "loss": 0.9384, + "step": 60046 + }, + { + "epoch": 0.7506187654691368, + "grad_norm": 1.0800654888153076, + "learning_rate": 3.5556116140795227e-06, + "loss": 0.3813, + "step": 60048 + }, + { + "epoch": 0.7506437660941524, + "grad_norm": 4.466142654418945, + "learning_rate": 3.5549443319642886e-06, + "loss": 2.0335, + "step": 60050 + }, + { + "epoch": 0.7506687667191679, + "grad_norm": 6.376153945922852, + "learning_rate": 3.5542770989336926e-06, + "loss": 0.7766, + "step": 60052 + }, + { + "epoch": 0.7506937673441836, + "grad_norm": 2.821502923965454, + "learning_rate": 3.553609914992804e-06, + "loss": 0.5742, + "step": 60054 + }, + { + "epoch": 0.7507187679691992, + "grad_norm": 4.809049606323242, + "learning_rate": 3.552942780146711e-06, + "loss": 1.2484, + "step": 60056 + }, + { + "epoch": 0.7507437685942149, + "grad_norm": 3.0911617279052734, + "learning_rate": 3.5522756944004966e-06, + "loss": 0.1404, + "step": 60058 + }, + { + "epoch": 0.7507687692192305, + "grad_norm": 0.4316067099571228, + "learning_rate": 3.5516086577592367e-06, + "loss": 0.213, + "step": 60060 + }, + { + "epoch": 0.7507937698442461, + "grad_norm": 0.4229832589626312, + "learning_rate": 3.5509416702280165e-06, + "loss": 1.1028, + "step": 60062 + }, + { + "epoch": 0.7508187704692617, + "grad_norm": 4.445936679840088, + "learning_rate": 3.550274731811911e-06, + "loss": 1.2999, + "step": 60064 + }, + { + "epoch": 0.7508437710942774, + "grad_norm": 8.377400398254395, + "learning_rate": 3.549607842516003e-06, + "loss": 1.0147, + "step": 60066 + }, + { + "epoch": 0.750868771719293, + "grad_norm": 3.1113545894622803, + "learning_rate": 3.5489410023453653e-06, + "loss": 1.4461, + "step": 60068 + }, + { + "epoch": 0.7508937723443087, + "grad_norm": 1.6459269523620605, + "learning_rate": 3.5482742113050806e-06, + "loss": 0.6531, + "step": 60070 + }, + { + "epoch": 0.7509187729693242, + "grad_norm": 4.4601874351501465, + "learning_rate": 3.5476074694002303e-06, + "loss": 0.8811, + "step": 60072 + }, + { + "epoch": 0.7509437735943398, + "grad_norm": 2.5868937969207764, + "learning_rate": 3.5469407766358867e-06, + "loss": 1.0745, + "step": 60074 + }, + { + "epoch": 0.7509687742193555, + "grad_norm": 5.582062721252441, + "learning_rate": 3.5462741330171324e-06, + "loss": 1.2741, + "step": 60076 + }, + { + "epoch": 0.7509937748443711, + "grad_norm": 3.044816732406616, + "learning_rate": 3.5456075385490417e-06, + "loss": 1.4286, + "step": 60078 + }, + { + "epoch": 0.7510187754693868, + "grad_norm": 2.9653689861297607, + "learning_rate": 3.5449409932366873e-06, + "loss": 0.9209, + "step": 60080 + }, + { + "epoch": 0.7510437760944023, + "grad_norm": 3.2399308681488037, + "learning_rate": 3.5442744970851537e-06, + "loss": 2.3179, + "step": 60082 + }, + { + "epoch": 0.751068776719418, + "grad_norm": 5.699101448059082, + "learning_rate": 3.5436080500995086e-06, + "loss": 1.6543, + "step": 60084 + }, + { + "epoch": 0.7510937773444336, + "grad_norm": 1.1156691312789917, + "learning_rate": 3.5429416522848346e-06, + "loss": 0.5627, + "step": 60086 + }, + { + "epoch": 0.7511187779694493, + "grad_norm": 3.220350980758667, + "learning_rate": 3.542275303646202e-06, + "loss": 0.9215, + "step": 60088 + }, + { + "epoch": 0.7511437785944649, + "grad_norm": 3.4816036224365234, + "learning_rate": 3.541609004188693e-06, + "loss": 1.0484, + "step": 60090 + }, + { + "epoch": 0.7511687792194804, + "grad_norm": 6.881178379058838, + "learning_rate": 3.5409427539173704e-06, + "loss": 1.9563, + "step": 60092 + }, + { + "epoch": 0.7511937798444961, + "grad_norm": 3.469433069229126, + "learning_rate": 3.540276552837314e-06, + "loss": 0.7035, + "step": 60094 + }, + { + "epoch": 0.7512187804695117, + "grad_norm": 2.3093326091766357, + "learning_rate": 3.5396104009536015e-06, + "loss": 1.7152, + "step": 60096 + }, + { + "epoch": 0.7512437810945274, + "grad_norm": 3.832749128341675, + "learning_rate": 3.5389442982712984e-06, + "loss": 0.6298, + "step": 60098 + }, + { + "epoch": 0.751268781719543, + "grad_norm": 3.2288410663604736, + "learning_rate": 3.5382782447954867e-06, + "loss": 0.9678, + "step": 60100 + }, + { + "epoch": 0.7512937823445586, + "grad_norm": 6.120292663574219, + "learning_rate": 3.5376122405312296e-06, + "loss": 1.0306, + "step": 60102 + }, + { + "epoch": 0.7513187829695742, + "grad_norm": 3.5524585247039795, + "learning_rate": 3.536946285483608e-06, + "loss": 1.1346, + "step": 60104 + }, + { + "epoch": 0.7513437835945899, + "grad_norm": 0.0013251769123598933, + "learning_rate": 3.5362803796576885e-06, + "loss": 0.0999, + "step": 60106 + }, + { + "epoch": 0.7513687842196055, + "grad_norm": 4.408078670501709, + "learning_rate": 3.5356145230585405e-06, + "loss": 1.3698, + "step": 60108 + }, + { + "epoch": 0.7513937848446212, + "grad_norm": 0.547452449798584, + "learning_rate": 3.534948715691242e-06, + "loss": 0.1955, + "step": 60110 + }, + { + "epoch": 0.7514187854696367, + "grad_norm": 2.436988592147827, + "learning_rate": 3.534282957560856e-06, + "loss": 0.54, + "step": 60112 + }, + { + "epoch": 0.7514437860946523, + "grad_norm": 1.023766040802002, + "learning_rate": 3.5336172486724605e-06, + "loss": 1.2877, + "step": 60114 + }, + { + "epoch": 0.751468786719668, + "grad_norm": 9.417835235595703, + "learning_rate": 3.5329515890311182e-06, + "loss": 1.484, + "step": 60116 + }, + { + "epoch": 0.7514937873446836, + "grad_norm": 4.858190059661865, + "learning_rate": 3.532285978641905e-06, + "loss": 1.5364, + "step": 60118 + }, + { + "epoch": 0.7515187879696993, + "grad_norm": 2.4701731204986572, + "learning_rate": 3.5316204175098876e-06, + "loss": 0.9957, + "step": 60120 + }, + { + "epoch": 0.7515437885947148, + "grad_norm": 5.316465854644775, + "learning_rate": 3.5309549056401303e-06, + "loss": 0.9128, + "step": 60122 + }, + { + "epoch": 0.7515687892197305, + "grad_norm": 1.7395042181015015, + "learning_rate": 3.5302894430377097e-06, + "loss": 0.8924, + "step": 60124 + }, + { + "epoch": 0.7515937898447461, + "grad_norm": 1.3863872289657593, + "learning_rate": 3.5296240297076866e-06, + "loss": 0.7314, + "step": 60126 + }, + { + "epoch": 0.7516187904697618, + "grad_norm": 0.8306437730789185, + "learning_rate": 3.5289586656551346e-06, + "loss": 0.044, + "step": 60128 + }, + { + "epoch": 0.7516437910947774, + "grad_norm": 4.052818298339844, + "learning_rate": 3.5282933508851193e-06, + "loss": 1.3918, + "step": 60130 + }, + { + "epoch": 0.751668791719793, + "grad_norm": 2.3248512744903564, + "learning_rate": 3.5276280854027024e-06, + "loss": 1.0959, + "step": 60132 + }, + { + "epoch": 0.7516937923448086, + "grad_norm": 8.2622652053833, + "learning_rate": 3.5269628692129586e-06, + "loss": 1.3266, + "step": 60134 + }, + { + "epoch": 0.7517187929698242, + "grad_norm": 3.3410091400146484, + "learning_rate": 3.5262977023209467e-06, + "loss": 0.5008, + "step": 60136 + }, + { + "epoch": 0.7517437935948399, + "grad_norm": 0.0038577422965317965, + "learning_rate": 3.5256325847317384e-06, + "loss": 0.5603, + "step": 60138 + }, + { + "epoch": 0.7517687942198555, + "grad_norm": 4.86037015914917, + "learning_rate": 3.524967516450395e-06, + "loss": 2.6209, + "step": 60140 + }, + { + "epoch": 0.7517937948448711, + "grad_norm": 4.927167892456055, + "learning_rate": 3.5243024974819862e-06, + "loss": 0.9136, + "step": 60142 + }, + { + "epoch": 0.7518187954698867, + "grad_norm": 0.18225590884685516, + "learning_rate": 3.5236375278315736e-06, + "loss": 0.5551, + "step": 60144 + }, + { + "epoch": 0.7518437960949024, + "grad_norm": 6.707351207733154, + "learning_rate": 3.522972607504218e-06, + "loss": 1.7516, + "step": 60146 + }, + { + "epoch": 0.751868796719918, + "grad_norm": 4.939416885375977, + "learning_rate": 3.5223077365049907e-06, + "loss": 0.5847, + "step": 60148 + }, + { + "epoch": 0.7518937973449337, + "grad_norm": 3.3225200176239014, + "learning_rate": 3.5216429148389488e-06, + "loss": 0.4477, + "step": 60150 + }, + { + "epoch": 0.7519187979699492, + "grad_norm": 0.0008744343649595976, + "learning_rate": 3.520978142511161e-06, + "loss": 0.0677, + "step": 60152 + }, + { + "epoch": 0.7519437985949649, + "grad_norm": 0.0014364065136760473, + "learning_rate": 3.520313419526685e-06, + "loss": 0.0352, + "step": 60154 + }, + { + "epoch": 0.7519687992199805, + "grad_norm": 3.8094518184661865, + "learning_rate": 3.519648745890588e-06, + "loss": 1.6205, + "step": 60156 + }, + { + "epoch": 0.7519937998449961, + "grad_norm": 2.0835132598876953, + "learning_rate": 3.51898412160793e-06, + "loss": 0.9248, + "step": 60158 + }, + { + "epoch": 0.7520188004700118, + "grad_norm": 2.682133674621582, + "learning_rate": 3.5183195466837693e-06, + "loss": 1.3671, + "step": 60160 + }, + { + "epoch": 0.7520438010950273, + "grad_norm": 5.273442268371582, + "learning_rate": 3.5176550211231743e-06, + "loss": 0.5642, + "step": 60162 + }, + { + "epoch": 0.752068801720043, + "grad_norm": 4.909492492675781, + "learning_rate": 3.516990544931198e-06, + "loss": 1.171, + "step": 60164 + }, + { + "epoch": 0.7520938023450586, + "grad_norm": 1.5072612762451172, + "learning_rate": 3.5163261181129084e-06, + "loss": 0.962, + "step": 60166 + }, + { + "epoch": 0.7521188029700743, + "grad_norm": 0.0003259859513491392, + "learning_rate": 3.5156617406733584e-06, + "loss": 0.8555, + "step": 60168 + }, + { + "epoch": 0.7521438035950899, + "grad_norm": 4.349963665008545, + "learning_rate": 3.5149974126176155e-06, + "loss": 1.0498, + "step": 60170 + }, + { + "epoch": 0.7521688042201055, + "grad_norm": 4.403369426727295, + "learning_rate": 3.514333133950736e-06, + "loss": 1.7913, + "step": 60172 + }, + { + "epoch": 0.7521938048451211, + "grad_norm": 0.2552557587623596, + "learning_rate": 3.5136689046777727e-06, + "loss": 0.0417, + "step": 60174 + }, + { + "epoch": 0.7522188054701368, + "grad_norm": 4.195678234100342, + "learning_rate": 3.5130047248037947e-06, + "loss": 1.0356, + "step": 60176 + }, + { + "epoch": 0.7522438060951524, + "grad_norm": 0.0004635814402718097, + "learning_rate": 3.5123405943338516e-06, + "loss": 0.5806, + "step": 60178 + }, + { + "epoch": 0.752268806720168, + "grad_norm": 4.855332851409912, + "learning_rate": 3.511676513273009e-06, + "loss": 1.5861, + "step": 60180 + }, + { + "epoch": 0.7522938073451836, + "grad_norm": 1.7312504053115845, + "learning_rate": 3.51101248162632e-06, + "loss": 0.6641, + "step": 60182 + }, + { + "epoch": 0.7523188079701992, + "grad_norm": 0.0005519871483556926, + "learning_rate": 3.510348499398839e-06, + "loss": 0.4508, + "step": 60184 + }, + { + "epoch": 0.7523438085952149, + "grad_norm": 2.983720064163208, + "learning_rate": 3.509684566595629e-06, + "loss": 0.7591, + "step": 60186 + }, + { + "epoch": 0.7523688092202305, + "grad_norm": 1.8612538576126099, + "learning_rate": 3.509020683221741e-06, + "loss": 0.0474, + "step": 60188 + }, + { + "epoch": 0.7523938098452462, + "grad_norm": 8.324556350708008, + "learning_rate": 3.5083568492822363e-06, + "loss": 0.62, + "step": 60190 + }, + { + "epoch": 0.7524188104702617, + "grad_norm": 3.9666781425476074, + "learning_rate": 3.507693064782165e-06, + "loss": 1.0275, + "step": 60192 + }, + { + "epoch": 0.7524438110952774, + "grad_norm": 4.490095615386963, + "learning_rate": 3.507029329726589e-06, + "loss": 2.2767, + "step": 60194 + }, + { + "epoch": 0.752468811720293, + "grad_norm": 2.5344762802124023, + "learning_rate": 3.5063656441205585e-06, + "loss": 1.5029, + "step": 60196 + }, + { + "epoch": 0.7524938123453087, + "grad_norm": 3.4471957683563232, + "learning_rate": 3.5057020079691263e-06, + "loss": 1.2289, + "step": 60198 + }, + { + "epoch": 0.7525188129703243, + "grad_norm": 0.001977887935936451, + "learning_rate": 3.5050384212773514e-06, + "loss": 1.1973, + "step": 60200 + }, + { + "epoch": 0.7525438135953398, + "grad_norm": 4.824570655822754, + "learning_rate": 3.5043748840502835e-06, + "loss": 0.7098, + "step": 60202 + }, + { + "epoch": 0.7525688142203555, + "grad_norm": 5.004386901855469, + "learning_rate": 3.5037113962929805e-06, + "loss": 0.7036, + "step": 60204 + }, + { + "epoch": 0.7525938148453711, + "grad_norm": 0.0036526061594486237, + "learning_rate": 3.503047958010489e-06, + "loss": 1.0427, + "step": 60206 + }, + { + "epoch": 0.7526188154703868, + "grad_norm": 10.070487022399902, + "learning_rate": 3.5023845692078695e-06, + "loss": 0.687, + "step": 60208 + }, + { + "epoch": 0.7526438160954024, + "grad_norm": 6.835954666137695, + "learning_rate": 3.5017212298901703e-06, + "loss": 1.9247, + "step": 60210 + }, + { + "epoch": 0.752668816720418, + "grad_norm": 4.406761646270752, + "learning_rate": 3.50105794006244e-06, + "loss": 0.4109, + "step": 60212 + }, + { + "epoch": 0.7526938173454336, + "grad_norm": 0.0015497832791879773, + "learning_rate": 3.5003946997297366e-06, + "loss": 0.3212, + "step": 60214 + }, + { + "epoch": 0.7527188179704493, + "grad_norm": 0.0009804365690797567, + "learning_rate": 3.499731508897105e-06, + "loss": 0.495, + "step": 60216 + }, + { + "epoch": 0.7527438185954649, + "grad_norm": 2.568284749984741, + "learning_rate": 3.4990683675696015e-06, + "loss": 0.9442, + "step": 60218 + }, + { + "epoch": 0.7527688192204806, + "grad_norm": 7.2507219314575195, + "learning_rate": 3.498405275752271e-06, + "loss": 1.019, + "step": 60220 + }, + { + "epoch": 0.7527938198454961, + "grad_norm": 1.9796475172042847, + "learning_rate": 3.497742233450171e-06, + "loss": 1.4431, + "step": 60222 + }, + { + "epoch": 0.7528188204705117, + "grad_norm": 0.07439261674880981, + "learning_rate": 3.497079240668345e-06, + "loss": 0.7751, + "step": 60224 + }, + { + "epoch": 0.7528438210955274, + "grad_norm": 5.127336025238037, + "learning_rate": 3.496416297411841e-06, + "loss": 1.8263, + "step": 60226 + }, + { + "epoch": 0.752868821720543, + "grad_norm": 0.0010114723118022084, + "learning_rate": 3.4957534036857144e-06, + "loss": 0.8639, + "step": 60228 + }, + { + "epoch": 0.7528938223455587, + "grad_norm": 4.010876178741455, + "learning_rate": 3.495090559495007e-06, + "loss": 1.5813, + "step": 60230 + }, + { + "epoch": 0.7529188229705742, + "grad_norm": 4.594336032867432, + "learning_rate": 3.4944277648447734e-06, + "loss": 1.6462, + "step": 60232 + }, + { + "epoch": 0.7529438235955899, + "grad_norm": 1.0063611268997192, + "learning_rate": 3.4937650197400575e-06, + "loss": 1.0951, + "step": 60234 + }, + { + "epoch": 0.7529688242206055, + "grad_norm": 3.447092056274414, + "learning_rate": 3.4931023241859043e-06, + "loss": 1.1857, + "step": 60236 + }, + { + "epoch": 0.7529938248456212, + "grad_norm": 5.9332380294799805, + "learning_rate": 3.492439678187367e-06, + "loss": 1.0721, + "step": 60238 + }, + { + "epoch": 0.7530188254706368, + "grad_norm": 0.002386811189353466, + "learning_rate": 3.4917770817494843e-06, + "loss": 0.0876, + "step": 60240 + }, + { + "epoch": 0.7530438260956523, + "grad_norm": 4.8751678466796875, + "learning_rate": 3.4911145348773113e-06, + "loss": 1.059, + "step": 60242 + }, + { + "epoch": 0.753068826720668, + "grad_norm": 2.5654513835906982, + "learning_rate": 3.4904520375758865e-06, + "loss": 0.4156, + "step": 60244 + }, + { + "epoch": 0.7530938273456836, + "grad_norm": 3.309666156768799, + "learning_rate": 3.489789589850261e-06, + "loss": 1.0341, + "step": 60246 + }, + { + "epoch": 0.7531188279706993, + "grad_norm": 2.996504068374634, + "learning_rate": 3.489127191705478e-06, + "loss": 1.4539, + "step": 60248 + }, + { + "epoch": 0.7531438285957149, + "grad_norm": 0.0011929719476029277, + "learning_rate": 3.4884648431465784e-06, + "loss": 1.2768, + "step": 60250 + }, + { + "epoch": 0.7531688292207305, + "grad_norm": 2.8796956539154053, + "learning_rate": 3.4878025441786124e-06, + "loss": 0.8418, + "step": 60252 + }, + { + "epoch": 0.7531938298457461, + "grad_norm": 3.5225512981414795, + "learning_rate": 3.487140294806618e-06, + "loss": 1.409, + "step": 60254 + }, + { + "epoch": 0.7532188304707618, + "grad_norm": 1.9125149250030518, + "learning_rate": 3.486478095035646e-06, + "loss": 0.3171, + "step": 60256 + }, + { + "epoch": 0.7532438310957774, + "grad_norm": 5.6690826416015625, + "learning_rate": 3.485815944870733e-06, + "loss": 1.1904, + "step": 60258 + }, + { + "epoch": 0.7532688317207931, + "grad_norm": 2.681796073913574, + "learning_rate": 3.4851538443169262e-06, + "loss": 0.2751, + "step": 60260 + }, + { + "epoch": 0.7532938323458086, + "grad_norm": 0.42874854803085327, + "learning_rate": 3.4844917933792675e-06, + "loss": 0.405, + "step": 60262 + }, + { + "epoch": 0.7533188329708242, + "grad_norm": 2.0915958881378174, + "learning_rate": 3.483829792062795e-06, + "loss": 0.1249, + "step": 60264 + }, + { + "epoch": 0.7533438335958399, + "grad_norm": 2.5369863510131836, + "learning_rate": 3.4831678403725553e-06, + "loss": 0.5148, + "step": 60266 + }, + { + "epoch": 0.7533688342208555, + "grad_norm": 4.023709297180176, + "learning_rate": 3.482505938313585e-06, + "loss": 0.7273, + "step": 60268 + }, + { + "epoch": 0.7533938348458712, + "grad_norm": 3.0946078300476074, + "learning_rate": 3.481844085890932e-06, + "loss": 0.7028, + "step": 60270 + }, + { + "epoch": 0.7534188354708867, + "grad_norm": 0.005669731181114912, + "learning_rate": 3.4811822831096285e-06, + "loss": 1.0962, + "step": 60272 + }, + { + "epoch": 0.7534438360959024, + "grad_norm": 4.078193664550781, + "learning_rate": 3.4805205299747225e-06, + "loss": 1.1555, + "step": 60274 + }, + { + "epoch": 0.753468836720918, + "grad_norm": 0.6500536203384399, + "learning_rate": 3.47985882649125e-06, + "loss": 0.202, + "step": 60276 + }, + { + "epoch": 0.7534938373459337, + "grad_norm": 0.10031449049711227, + "learning_rate": 3.479197172664247e-06, + "loss": 0.4824, + "step": 60278 + }, + { + "epoch": 0.7535188379709493, + "grad_norm": 3.8565640449523926, + "learning_rate": 3.478535568498761e-06, + "loss": 0.7804, + "step": 60280 + }, + { + "epoch": 0.7535438385959649, + "grad_norm": 1.0346418619155884, + "learning_rate": 3.4778740139998214e-06, + "loss": 0.6458, + "step": 60282 + }, + { + "epoch": 0.7535688392209805, + "grad_norm": 2.6653835773468018, + "learning_rate": 3.477212509172474e-06, + "loss": 0.1457, + "step": 60284 + }, + { + "epoch": 0.7535938398459961, + "grad_norm": 4.791109561920166, + "learning_rate": 3.476551054021754e-06, + "loss": 1.8687, + "step": 60286 + }, + { + "epoch": 0.7536188404710118, + "grad_norm": 4.007233619689941, + "learning_rate": 3.475889648552696e-06, + "loss": 1.9094, + "step": 60288 + }, + { + "epoch": 0.7536438410960274, + "grad_norm": 0.0009453289676457644, + "learning_rate": 3.475228292770343e-06, + "loss": 0.8919, + "step": 60290 + }, + { + "epoch": 0.753668841721043, + "grad_norm": 4.274954795837402, + "learning_rate": 3.474566986679725e-06, + "loss": 1.3244, + "step": 60292 + }, + { + "epoch": 0.7536938423460586, + "grad_norm": 3.4959638118743896, + "learning_rate": 3.4739057302858848e-06, + "loss": 1.1191, + "step": 60294 + }, + { + "epoch": 0.7537188429710743, + "grad_norm": 6.851508140563965, + "learning_rate": 3.473244523593853e-06, + "loss": 0.4939, + "step": 60296 + }, + { + "epoch": 0.7537438435960899, + "grad_norm": 6.182052135467529, + "learning_rate": 3.472583366608668e-06, + "loss": 1.1137, + "step": 60298 + }, + { + "epoch": 0.7537688442211056, + "grad_norm": 4.127972602844238, + "learning_rate": 3.471922259335372e-06, + "loss": 1.7689, + "step": 60300 + }, + { + "epoch": 0.7537938448461211, + "grad_norm": 0.46957167983055115, + "learning_rate": 3.4712612017789847e-06, + "loss": 1.229, + "step": 60302 + }, + { + "epoch": 0.7538188454711368, + "grad_norm": 5.018930435180664, + "learning_rate": 3.4706001939445544e-06, + "loss": 0.6215, + "step": 60304 + }, + { + "epoch": 0.7538438460961524, + "grad_norm": 4.015385627746582, + "learning_rate": 3.469939235837104e-06, + "loss": 0.9679, + "step": 60306 + }, + { + "epoch": 0.753868846721168, + "grad_norm": 0.9219885468482971, + "learning_rate": 3.469278327461678e-06, + "loss": 0.7431, + "step": 60308 + }, + { + "epoch": 0.7538938473461837, + "grad_norm": 5.040283203125, + "learning_rate": 3.468617468823301e-06, + "loss": 0.6029, + "step": 60310 + }, + { + "epoch": 0.7539188479711992, + "grad_norm": 5.235987663269043, + "learning_rate": 3.4679566599270133e-06, + "loss": 1.0268, + "step": 60312 + }, + { + "epoch": 0.7539438485962149, + "grad_norm": 3.2212841510772705, + "learning_rate": 3.467295900777844e-06, + "loss": 1.2063, + "step": 60314 + }, + { + "epoch": 0.7539688492212305, + "grad_norm": 2.68033766746521, + "learning_rate": 3.4666351913808216e-06, + "loss": 1.2533, + "step": 60316 + }, + { + "epoch": 0.7539938498462462, + "grad_norm": 4.213086128234863, + "learning_rate": 3.4659745317409844e-06, + "loss": 0.9275, + "step": 60318 + }, + { + "epoch": 0.7540188504712618, + "grad_norm": 2.9538228511810303, + "learning_rate": 3.465313921863359e-06, + "loss": 0.4961, + "step": 60320 + }, + { + "epoch": 0.7540438510962774, + "grad_norm": 2.9936606884002686, + "learning_rate": 3.464653361752981e-06, + "loss": 0.9587, + "step": 60322 + }, + { + "epoch": 0.754068851721293, + "grad_norm": 4.980240345001221, + "learning_rate": 3.4639928514148757e-06, + "loss": 2.8426, + "step": 60324 + }, + { + "epoch": 0.7540938523463087, + "grad_norm": 0.42770156264305115, + "learning_rate": 3.4633323908540806e-06, + "loss": 0.9479, + "step": 60326 + }, + { + "epoch": 0.7541188529713243, + "grad_norm": 5.2307634353637695, + "learning_rate": 3.4626719800756213e-06, + "loss": 1.0858, + "step": 60328 + }, + { + "epoch": 0.75414385359634, + "grad_norm": 2.6735446453094482, + "learning_rate": 3.4620116190845245e-06, + "loss": 1.0634, + "step": 60330 + }, + { + "epoch": 0.7541688542213555, + "grad_norm": 0.0004777464200742543, + "learning_rate": 3.4613513078858263e-06, + "loss": 0.0001, + "step": 60332 + }, + { + "epoch": 0.7541938548463711, + "grad_norm": 9.161012649536133, + "learning_rate": 3.460691046484548e-06, + "loss": 1.2311, + "step": 60334 + }, + { + "epoch": 0.7542188554713868, + "grad_norm": 2.6319849491119385, + "learning_rate": 3.4600308348857213e-06, + "loss": 1.2225, + "step": 60336 + }, + { + "epoch": 0.7542438560964024, + "grad_norm": 1.8018276691436768, + "learning_rate": 3.459370673094382e-06, + "loss": 0.4695, + "step": 60338 + }, + { + "epoch": 0.7542688567214181, + "grad_norm": 3.00132155418396, + "learning_rate": 3.4587105611155447e-06, + "loss": 0.6925, + "step": 60340 + }, + { + "epoch": 0.7542938573464336, + "grad_norm": 0.0010938214836642146, + "learning_rate": 3.4580504989542463e-06, + "loss": 0.4896, + "step": 60342 + }, + { + "epoch": 0.7543188579714493, + "grad_norm": 3.5119571685791016, + "learning_rate": 3.457390486615506e-06, + "loss": 0.8143, + "step": 60344 + }, + { + "epoch": 0.7543438585964649, + "grad_norm": 5.411811351776123, + "learning_rate": 3.456730524104358e-06, + "loss": 1.4681, + "step": 60346 + }, + { + "epoch": 0.7543688592214806, + "grad_norm": 10.386909484863281, + "learning_rate": 3.4560706114258226e-06, + "loss": 1.5277, + "step": 60348 + }, + { + "epoch": 0.7543938598464962, + "grad_norm": 4.926874160766602, + "learning_rate": 3.4554107485849262e-06, + "loss": 1.777, + "step": 60350 + }, + { + "epoch": 0.7544188604715117, + "grad_norm": 2.3960211277008057, + "learning_rate": 3.454750935586705e-06, + "loss": 1.5783, + "step": 60352 + }, + { + "epoch": 0.7544438610965274, + "grad_norm": 2.2321598529815674, + "learning_rate": 3.454091172436167e-06, + "loss": 1.5132, + "step": 60354 + }, + { + "epoch": 0.754468861721543, + "grad_norm": 0.7294546365737915, + "learning_rate": 3.4534314591383477e-06, + "loss": 0.9179, + "step": 60356 + }, + { + "epoch": 0.7544938623465587, + "grad_norm": 4.559466361999512, + "learning_rate": 3.4527717956982665e-06, + "loss": 1.3857, + "step": 60358 + }, + { + "epoch": 0.7545188629715743, + "grad_norm": 3.041247606277466, + "learning_rate": 3.452112182120949e-06, + "loss": 0.4785, + "step": 60360 + }, + { + "epoch": 0.7545438635965899, + "grad_norm": 3.6170177459716797, + "learning_rate": 3.4514526184114227e-06, + "loss": 2.0931, + "step": 60362 + }, + { + "epoch": 0.7545688642216055, + "grad_norm": 3.3115475177764893, + "learning_rate": 3.4507931045747077e-06, + "loss": 1.5062, + "step": 60364 + }, + { + "epoch": 0.7545938648466212, + "grad_norm": 0.26411429047584534, + "learning_rate": 3.450133640615825e-06, + "loss": 0.0038, + "step": 60366 + }, + { + "epoch": 0.7546188654716368, + "grad_norm": 3.067497730255127, + "learning_rate": 3.4494742265397964e-06, + "loss": 1.1675, + "step": 60368 + }, + { + "epoch": 0.7546438660966525, + "grad_norm": 0.0014384419191628695, + "learning_rate": 3.448814862351648e-06, + "loss": 0.078, + "step": 60370 + }, + { + "epoch": 0.754668866721668, + "grad_norm": 2.0549402236938477, + "learning_rate": 3.4481555480563954e-06, + "loss": 1.1389, + "step": 60372 + }, + { + "epoch": 0.7546938673466836, + "grad_norm": 0.0010516257025301456, + "learning_rate": 3.447496283659065e-06, + "loss": 0.2444, + "step": 60374 + }, + { + "epoch": 0.7547188679716993, + "grad_norm": 3.9903335571289062, + "learning_rate": 3.446837069164679e-06, + "loss": 2.0983, + "step": 60376 + }, + { + "epoch": 0.7547438685967149, + "grad_norm": 0.009825563058257103, + "learning_rate": 3.446177904578255e-06, + "loss": 0.026, + "step": 60378 + }, + { + "epoch": 0.7547688692217306, + "grad_norm": 1.9714909791946411, + "learning_rate": 3.445518789904814e-06, + "loss": 0.4574, + "step": 60380 + }, + { + "epoch": 0.7547938698467461, + "grad_norm": 0.7763288021087646, + "learning_rate": 3.444859725149371e-06, + "loss": 0.486, + "step": 60382 + }, + { + "epoch": 0.7548188704717618, + "grad_norm": 1.9189883470535278, + "learning_rate": 3.4442007103169528e-06, + "loss": 0.5804, + "step": 60384 + }, + { + "epoch": 0.7548438710967774, + "grad_norm": 0.0019347203196957707, + "learning_rate": 3.443541745412572e-06, + "loss": 0.8247, + "step": 60386 + }, + { + "epoch": 0.7548688717217931, + "grad_norm": 6.369159698486328, + "learning_rate": 3.4428828304412486e-06, + "loss": 0.6415, + "step": 60388 + }, + { + "epoch": 0.7548938723468087, + "grad_norm": 2.6296536922454834, + "learning_rate": 3.442223965408007e-06, + "loss": 0.8918, + "step": 60390 + }, + { + "epoch": 0.7549188729718243, + "grad_norm": 4.965398788452148, + "learning_rate": 3.4415651503178603e-06, + "loss": 0.228, + "step": 60392 + }, + { + "epoch": 0.7549438735968399, + "grad_norm": 6.1368560791015625, + "learning_rate": 3.4409063851758264e-06, + "loss": 1.9061, + "step": 60394 + }, + { + "epoch": 0.7549688742218555, + "grad_norm": 0.032885998487472534, + "learning_rate": 3.4402476699869168e-06, + "loss": 0.2313, + "step": 60396 + }, + { + "epoch": 0.7549938748468712, + "grad_norm": 3.2209582328796387, + "learning_rate": 3.439589004756155e-06, + "loss": 1.1383, + "step": 60398 + }, + { + "epoch": 0.7550188754718868, + "grad_norm": 3.8189172744750977, + "learning_rate": 3.4389303894885583e-06, + "loss": 1.0305, + "step": 60400 + }, + { + "epoch": 0.7550438760969024, + "grad_norm": 1.2451040744781494, + "learning_rate": 3.4382718241891366e-06, + "loss": 0.1711, + "step": 60402 + }, + { + "epoch": 0.755068876721918, + "grad_norm": 3.2970731258392334, + "learning_rate": 3.437613308862916e-06, + "loss": 1.2618, + "step": 60404 + }, + { + "epoch": 0.7550938773469337, + "grad_norm": 0.36660677194595337, + "learning_rate": 3.436954843514897e-06, + "loss": 0.7465, + "step": 60406 + }, + { + "epoch": 0.7551188779719493, + "grad_norm": 3.453814744949341, + "learning_rate": 3.4362964281501055e-06, + "loss": 0.7831, + "step": 60408 + }, + { + "epoch": 0.755143878596965, + "grad_norm": 3.1470701694488525, + "learning_rate": 3.4356380627735486e-06, + "loss": 0.5335, + "step": 60410 + }, + { + "epoch": 0.7551688792219805, + "grad_norm": 4.012382984161377, + "learning_rate": 3.4349797473902446e-06, + "loss": 1.5299, + "step": 60412 + }, + { + "epoch": 0.7551938798469962, + "grad_norm": 0.6609206795692444, + "learning_rate": 3.4343214820052093e-06, + "loss": 0.7296, + "step": 60414 + }, + { + "epoch": 0.7552188804720118, + "grad_norm": 3.702636241912842, + "learning_rate": 3.4336632666234495e-06, + "loss": 1.3529, + "step": 60416 + }, + { + "epoch": 0.7552438810970274, + "grad_norm": 2.0634374618530273, + "learning_rate": 3.433005101249989e-06, + "loss": 0.2276, + "step": 60418 + }, + { + "epoch": 0.7552688817220431, + "grad_norm": 3.5705316066741943, + "learning_rate": 3.432346985889826e-06, + "loss": 0.7998, + "step": 60420 + }, + { + "epoch": 0.7552938823470586, + "grad_norm": 1.360399603843689, + "learning_rate": 3.4316889205479796e-06, + "loss": 0.4326, + "step": 60422 + }, + { + "epoch": 0.7553188829720743, + "grad_norm": 5.231497287750244, + "learning_rate": 3.4310309052294655e-06, + "loss": 0.9701, + "step": 60424 + }, + { + "epoch": 0.7553438835970899, + "grad_norm": 4.472206115722656, + "learning_rate": 3.430372939939287e-06, + "loss": 0.9195, + "step": 60426 + }, + { + "epoch": 0.7553688842221056, + "grad_norm": 3.897935390472412, + "learning_rate": 3.4297150246824627e-06, + "loss": 1.5709, + "step": 60428 + }, + { + "epoch": 0.7553938848471212, + "grad_norm": 3.659348964691162, + "learning_rate": 3.429057159464e-06, + "loss": 0.9747, + "step": 60430 + }, + { + "epoch": 0.7554188854721368, + "grad_norm": 0.0020893560722470284, + "learning_rate": 3.4283993442889075e-06, + "loss": 0.511, + "step": 60432 + }, + { + "epoch": 0.7554438860971524, + "grad_norm": 5.367919445037842, + "learning_rate": 3.4277415791621937e-06, + "loss": 1.1424, + "step": 60434 + }, + { + "epoch": 0.755468886722168, + "grad_norm": 0.007719905115664005, + "learning_rate": 3.4270838640888716e-06, + "loss": 0.1546, + "step": 60436 + }, + { + "epoch": 0.7554938873471837, + "grad_norm": 2.6133406162261963, + "learning_rate": 3.4264261990739513e-06, + "loss": 0.2463, + "step": 60438 + }, + { + "epoch": 0.7555188879721993, + "grad_norm": 6.899759769439697, + "learning_rate": 3.4257685841224363e-06, + "loss": 2.0957, + "step": 60440 + }, + { + "epoch": 0.7555438885972149, + "grad_norm": 2.597149133682251, + "learning_rate": 3.425111019239342e-06, + "loss": 0.7236, + "step": 60442 + }, + { + "epoch": 0.7555688892222305, + "grad_norm": 3.4458820819854736, + "learning_rate": 3.424453504429672e-06, + "loss": 1.2277, + "step": 60444 + }, + { + "epoch": 0.7555938898472462, + "grad_norm": 10.719667434692383, + "learning_rate": 3.423796039698435e-06, + "loss": 0.8734, + "step": 60446 + }, + { + "epoch": 0.7556188904722618, + "grad_norm": 2.7972021102905273, + "learning_rate": 3.423138625050633e-06, + "loss": 1.5806, + "step": 60448 + }, + { + "epoch": 0.7556438910972775, + "grad_norm": 11.890796661376953, + "learning_rate": 3.4224812604912783e-06, + "loss": 0.7575, + "step": 60450 + }, + { + "epoch": 0.755668891722293, + "grad_norm": 4.44374942779541, + "learning_rate": 3.4218239460253786e-06, + "loss": 1.2785, + "step": 60452 + }, + { + "epoch": 0.7556938923473087, + "grad_norm": 3.3975207805633545, + "learning_rate": 3.421166681657934e-06, + "loss": 0.4404, + "step": 60454 + }, + { + "epoch": 0.7557188929723243, + "grad_norm": 0.7902300357818604, + "learning_rate": 3.4205094673939608e-06, + "loss": 0.0402, + "step": 60456 + }, + { + "epoch": 0.75574389359734, + "grad_norm": 3.068120241165161, + "learning_rate": 3.41985230323845e-06, + "loss": 0.6844, + "step": 60458 + }, + { + "epoch": 0.7557688942223556, + "grad_norm": 9.708442687988281, + "learning_rate": 3.4191951891964147e-06, + "loss": 1.1451, + "step": 60460 + }, + { + "epoch": 0.7557938948473711, + "grad_norm": 7.747573375701904, + "learning_rate": 3.41853812527286e-06, + "loss": 1.2095, + "step": 60462 + }, + { + "epoch": 0.7558188954723868, + "grad_norm": 2.066901206970215, + "learning_rate": 3.417881111472786e-06, + "loss": 0.9436, + "step": 60464 + }, + { + "epoch": 0.7558438960974024, + "grad_norm": 0.4631270170211792, + "learning_rate": 3.4172241478012014e-06, + "loss": 0.4181, + "step": 60466 + }, + { + "epoch": 0.7558688967224181, + "grad_norm": 0.0009559186873957515, + "learning_rate": 3.416567234263104e-06, + "loss": 0.1628, + "step": 60468 + }, + { + "epoch": 0.7558938973474337, + "grad_norm": 3.2943804264068604, + "learning_rate": 3.4159103708635055e-06, + "loss": 1.3025, + "step": 60470 + }, + { + "epoch": 0.7559188979724493, + "grad_norm": 4.239063262939453, + "learning_rate": 3.4152535576073964e-06, + "loss": 0.4434, + "step": 60472 + }, + { + "epoch": 0.7559438985974649, + "grad_norm": 3.7751669883728027, + "learning_rate": 3.414596794499785e-06, + "loss": 1.3516, + "step": 60474 + }, + { + "epoch": 0.7559688992224806, + "grad_norm": 2.3825860023498535, + "learning_rate": 3.413940081545677e-06, + "loss": 0.755, + "step": 60476 + }, + { + "epoch": 0.7559938998474962, + "grad_norm": 4.988068103790283, + "learning_rate": 3.4132834187500652e-06, + "loss": 1.3075, + "step": 60478 + }, + { + "epoch": 0.7560189004725119, + "grad_norm": 4.146622180938721, + "learning_rate": 3.41262680611796e-06, + "loss": 1.9583, + "step": 60480 + }, + { + "epoch": 0.7560439010975274, + "grad_norm": 0.0009199652704410255, + "learning_rate": 3.4119702436543565e-06, + "loss": 0.0005, + "step": 60482 + }, + { + "epoch": 0.756068901722543, + "grad_norm": 2.25106143951416, + "learning_rate": 3.411313731364254e-06, + "loss": 0.9242, + "step": 60484 + }, + { + "epoch": 0.7560939023475587, + "grad_norm": 1.3367853164672852, + "learning_rate": 3.4106572692526573e-06, + "loss": 0.3783, + "step": 60486 + }, + { + "epoch": 0.7561189029725743, + "grad_norm": 0.8775941729545593, + "learning_rate": 3.4100008573245593e-06, + "loss": 0.034, + "step": 60488 + }, + { + "epoch": 0.75614390359759, + "grad_norm": 1.5788252353668213, + "learning_rate": 3.4093444955849663e-06, + "loss": 0.8604, + "step": 60490 + }, + { + "epoch": 0.7561689042226055, + "grad_norm": 4.355382442474365, + "learning_rate": 3.40868818403887e-06, + "loss": 0.3537, + "step": 60492 + }, + { + "epoch": 0.7561939048476212, + "grad_norm": 3.5076773166656494, + "learning_rate": 3.408031922691276e-06, + "loss": 0.5219, + "step": 60494 + }, + { + "epoch": 0.7562189054726368, + "grad_norm": 2.958627700805664, + "learning_rate": 3.4073757115471785e-06, + "loss": 0.4512, + "step": 60496 + }, + { + "epoch": 0.7562439060976525, + "grad_norm": 5.38818359375, + "learning_rate": 3.4067195506115725e-06, + "loss": 1.4259, + "step": 60498 + }, + { + "epoch": 0.7562689067226681, + "grad_norm": 1.1005076169967651, + "learning_rate": 3.406063439889461e-06, + "loss": 0.9996, + "step": 60500 + }, + { + "epoch": 0.7562939073476836, + "grad_norm": 3.588912010192871, + "learning_rate": 3.4054073793858343e-06, + "loss": 1.0078, + "step": 60502 + }, + { + "epoch": 0.7563189079726993, + "grad_norm": 3.371140718460083, + "learning_rate": 3.4047513691056967e-06, + "loss": 0.78, + "step": 60504 + }, + { + "epoch": 0.7563439085977149, + "grad_norm": 1.6158493757247925, + "learning_rate": 3.4040954090540355e-06, + "loss": 0.7715, + "step": 60506 + }, + { + "epoch": 0.7563689092227306, + "grad_norm": 2.536356210708618, + "learning_rate": 3.4034394992358543e-06, + "loss": 0.4956, + "step": 60508 + }, + { + "epoch": 0.7563939098477462, + "grad_norm": 3.864219903945923, + "learning_rate": 3.4027836396561463e-06, + "loss": 0.3981, + "step": 60510 + }, + { + "epoch": 0.7564189104727618, + "grad_norm": 7.24855899810791, + "learning_rate": 3.4021278303199014e-06, + "loss": 1.7538, + "step": 60512 + }, + { + "epoch": 0.7564439110977774, + "grad_norm": 2.034266471862793, + "learning_rate": 3.4014720712321205e-06, + "loss": 1.096, + "step": 60514 + }, + { + "epoch": 0.7564689117227931, + "grad_norm": 2.820662498474121, + "learning_rate": 3.400816362397792e-06, + "loss": 0.4934, + "step": 60516 + }, + { + "epoch": 0.7564939123478087, + "grad_norm": 3.096508502960205, + "learning_rate": 3.4001607038219162e-06, + "loss": 1.0621, + "step": 60518 + }, + { + "epoch": 0.7565189129728244, + "grad_norm": 1.9351401329040527, + "learning_rate": 3.399505095509481e-06, + "loss": 0.1867, + "step": 60520 + }, + { + "epoch": 0.7565439135978399, + "grad_norm": 3.9994430541992188, + "learning_rate": 3.398849537465484e-06, + "loss": 0.6277, + "step": 60522 + }, + { + "epoch": 0.7565689142228555, + "grad_norm": 3.292285442352295, + "learning_rate": 3.3981940296949158e-06, + "loss": 0.7185, + "step": 60524 + }, + { + "epoch": 0.7565939148478712, + "grad_norm": 7.579169273376465, + "learning_rate": 3.3975385722027643e-06, + "loss": 1.2364, + "step": 60526 + }, + { + "epoch": 0.7566189154728868, + "grad_norm": 3.3162052631378174, + "learning_rate": 3.3968831649940283e-06, + "loss": 1.7784, + "step": 60528 + }, + { + "epoch": 0.7566439160979025, + "grad_norm": 3.796726942062378, + "learning_rate": 3.3962278080736943e-06, + "loss": 1.01, + "step": 60530 + }, + { + "epoch": 0.756668916722918, + "grad_norm": 2.34849214553833, + "learning_rate": 3.3955725014467576e-06, + "loss": 1.0684, + "step": 60532 + }, + { + "epoch": 0.7566939173479337, + "grad_norm": 1.761067271232605, + "learning_rate": 3.3949172451182078e-06, + "loss": 0.7627, + "step": 60534 + }, + { + "epoch": 0.7567189179729493, + "grad_norm": 3.276113510131836, + "learning_rate": 3.3942620390930304e-06, + "loss": 1.7879, + "step": 60536 + }, + { + "epoch": 0.756743918597965, + "grad_norm": 2.0961670875549316, + "learning_rate": 3.3936068833762225e-06, + "loss": 1.0135, + "step": 60538 + }, + { + "epoch": 0.7567689192229806, + "grad_norm": 2.7552759647369385, + "learning_rate": 3.392951777972767e-06, + "loss": 0.6592, + "step": 60540 + }, + { + "epoch": 0.7567939198479962, + "grad_norm": 2.234806776046753, + "learning_rate": 3.3922967228876603e-06, + "loss": 0.1197, + "step": 60542 + }, + { + "epoch": 0.7568189204730118, + "grad_norm": 1.3609334230422974, + "learning_rate": 3.391641718125884e-06, + "loss": 0.2842, + "step": 60544 + }, + { + "epoch": 0.7568439210980275, + "grad_norm": 5.529524803161621, + "learning_rate": 3.3909867636924322e-06, + "loss": 1.6133, + "step": 60546 + }, + { + "epoch": 0.7568689217230431, + "grad_norm": 2.4328465461730957, + "learning_rate": 3.3903318595922916e-06, + "loss": 0.7612, + "step": 60548 + }, + { + "epoch": 0.7568939223480587, + "grad_norm": 9.244497299194336, + "learning_rate": 3.3896770058304453e-06, + "loss": 0.8425, + "step": 60550 + }, + { + "epoch": 0.7569189229730743, + "grad_norm": 3.380023956298828, + "learning_rate": 3.3890222024118866e-06, + "loss": 1.4638, + "step": 60552 + }, + { + "epoch": 0.7569439235980899, + "grad_norm": 4.135724067687988, + "learning_rate": 3.3883674493415974e-06, + "loss": 0.6875, + "step": 60554 + }, + { + "epoch": 0.7569689242231056, + "grad_norm": 2.542933225631714, + "learning_rate": 3.387712746624571e-06, + "loss": 1.1501, + "step": 60556 + }, + { + "epoch": 0.7569939248481212, + "grad_norm": 4.569477558135986, + "learning_rate": 3.387058094265785e-06, + "loss": 1.0163, + "step": 60558 + }, + { + "epoch": 0.7570189254731369, + "grad_norm": 3.620394468307495, + "learning_rate": 3.3864034922702317e-06, + "loss": 0.4975, + "step": 60560 + }, + { + "epoch": 0.7570439260981524, + "grad_norm": 5.129706382751465, + "learning_rate": 3.3857489406428956e-06, + "loss": 0.8055, + "step": 60562 + }, + { + "epoch": 0.7570689267231681, + "grad_norm": 2.998460292816162, + "learning_rate": 3.385094439388756e-06, + "loss": 0.7918, + "step": 60564 + }, + { + "epoch": 0.7570939273481837, + "grad_norm": 2.0555148124694824, + "learning_rate": 3.3844399885128056e-06, + "loss": 0.341, + "step": 60566 + }, + { + "epoch": 0.7571189279731994, + "grad_norm": 3.3817248344421387, + "learning_rate": 3.383785588020021e-06, + "loss": 0.6401, + "step": 60568 + }, + { + "epoch": 0.757143928598215, + "grad_norm": 2.98451828956604, + "learning_rate": 3.3831312379153936e-06, + "loss": 1.0332, + "step": 60570 + }, + { + "epoch": 0.7571689292232305, + "grad_norm": 5.769014358520508, + "learning_rate": 3.3824769382038993e-06, + "loss": 1.0512, + "step": 60572 + }, + { + "epoch": 0.7571939298482462, + "grad_norm": 2.1232097148895264, + "learning_rate": 3.381822688890528e-06, + "loss": 1.2297, + "step": 60574 + }, + { + "epoch": 0.7572189304732618, + "grad_norm": 2.6861679553985596, + "learning_rate": 3.38116848998026e-06, + "loss": 0.9228, + "step": 60576 + }, + { + "epoch": 0.7572439310982775, + "grad_norm": 4.61817741394043, + "learning_rate": 3.380514341478073e-06, + "loss": 1.7077, + "step": 60578 + }, + { + "epoch": 0.7572689317232931, + "grad_norm": 4.171323299407959, + "learning_rate": 3.379860243388956e-06, + "loss": 0.8748, + "step": 60580 + }, + { + "epoch": 0.7572939323483087, + "grad_norm": 5.386810302734375, + "learning_rate": 3.3792061957178836e-06, + "loss": 1.3968, + "step": 60582 + }, + { + "epoch": 0.7573189329733243, + "grad_norm": 3.3717286586761475, + "learning_rate": 3.3785521984698443e-06, + "loss": 0.3085, + "step": 60584 + }, + { + "epoch": 0.75734393359834, + "grad_norm": 11.776561737060547, + "learning_rate": 3.377898251649815e-06, + "loss": 0.5355, + "step": 60586 + }, + { + "epoch": 0.7573689342233556, + "grad_norm": 5.254590034484863, + "learning_rate": 3.377244355262772e-06, + "loss": 1.3311, + "step": 60588 + }, + { + "epoch": 0.7573939348483713, + "grad_norm": 1.49288010597229, + "learning_rate": 3.3765905093137043e-06, + "loss": 1.0488, + "step": 60590 + }, + { + "epoch": 0.7574189354733868, + "grad_norm": 0.7660984396934509, + "learning_rate": 3.3759367138075826e-06, + "loss": 0.482, + "step": 60592 + }, + { + "epoch": 0.7574439360984024, + "grad_norm": 8.177332878112793, + "learning_rate": 3.3752829687493927e-06, + "loss": 1.8622, + "step": 60594 + }, + { + "epoch": 0.7574689367234181, + "grad_norm": 3.8911118507385254, + "learning_rate": 3.374629274144109e-06, + "loss": 1.3392, + "step": 60596 + }, + { + "epoch": 0.7574939373484337, + "grad_norm": 0.004160676151514053, + "learning_rate": 3.373975629996714e-06, + "loss": 0.6753, + "step": 60598 + }, + { + "epoch": 0.7575189379734494, + "grad_norm": 4.122830390930176, + "learning_rate": 3.3733220363121844e-06, + "loss": 1.1709, + "step": 60600 + }, + { + "epoch": 0.7575439385984649, + "grad_norm": 6.602232456207275, + "learning_rate": 3.3726684930954935e-06, + "loss": 0.7749, + "step": 60602 + }, + { + "epoch": 0.7575689392234806, + "grad_norm": 2.0938336849212646, + "learning_rate": 3.372015000351625e-06, + "loss": 1.1452, + "step": 60604 + }, + { + "epoch": 0.7575939398484962, + "grad_norm": 1.8337689638137817, + "learning_rate": 3.3713615580855496e-06, + "loss": 1.1044, + "step": 60606 + }, + { + "epoch": 0.7576189404735119, + "grad_norm": 6.774747848510742, + "learning_rate": 3.3707081663022514e-06, + "loss": 0.126, + "step": 60608 + }, + { + "epoch": 0.7576439410985275, + "grad_norm": 0.23262880742549896, + "learning_rate": 3.370054825006699e-06, + "loss": 0.3727, + "step": 60610 + }, + { + "epoch": 0.757668941723543, + "grad_norm": 0.7175073027610779, + "learning_rate": 3.369401534203874e-06, + "loss": 0.8942, + "step": 60612 + }, + { + "epoch": 0.7576939423485587, + "grad_norm": 2.372730016708374, + "learning_rate": 3.368748293898749e-06, + "loss": 0.8103, + "step": 60614 + }, + { + "epoch": 0.7577189429735743, + "grad_norm": 4.066653251647949, + "learning_rate": 3.368095104096297e-06, + "loss": 0.9928, + "step": 60616 + }, + { + "epoch": 0.75774394359859, + "grad_norm": 0.004788817837834358, + "learning_rate": 3.3674419648014976e-06, + "loss": 0.0005, + "step": 60618 + }, + { + "epoch": 0.7577689442236056, + "grad_norm": 3.5834712982177734, + "learning_rate": 3.366788876019319e-06, + "loss": 0.8544, + "step": 60620 + }, + { + "epoch": 0.7577939448486212, + "grad_norm": 3.077575206756592, + "learning_rate": 3.3661358377547425e-06, + "loss": 0.7934, + "step": 60622 + }, + { + "epoch": 0.7578189454736368, + "grad_norm": 2.5836493968963623, + "learning_rate": 3.3654828500127322e-06, + "loss": 1.0912, + "step": 60624 + }, + { + "epoch": 0.7578439460986525, + "grad_norm": 4.636635780334473, + "learning_rate": 3.364829912798271e-06, + "loss": 1.5535, + "step": 60626 + }, + { + "epoch": 0.7578689467236681, + "grad_norm": 3.998244047164917, + "learning_rate": 3.3641770261163266e-06, + "loss": 0.5806, + "step": 60628 + }, + { + "epoch": 0.7578939473486838, + "grad_norm": 1.5454471111297607, + "learning_rate": 3.3635241899718683e-06, + "loss": 0.1195, + "step": 60630 + }, + { + "epoch": 0.7579189479736993, + "grad_norm": 3.183239698410034, + "learning_rate": 3.362871404369875e-06, + "loss": 1.6946, + "step": 60632 + }, + { + "epoch": 0.757943948598715, + "grad_norm": 3.463257074356079, + "learning_rate": 3.36221866931531e-06, + "loss": 0.9645, + "step": 60634 + }, + { + "epoch": 0.7579689492237306, + "grad_norm": 3.023469924926758, + "learning_rate": 3.3615659848131533e-06, + "loss": 1.9564, + "step": 60636 + }, + { + "epoch": 0.7579939498487462, + "grad_norm": 4.189001560211182, + "learning_rate": 3.360913350868368e-06, + "loss": 1.8519, + "step": 60638 + }, + { + "epoch": 0.7580189504737619, + "grad_norm": 0.005101943388581276, + "learning_rate": 3.3602607674859313e-06, + "loss": 0.4132, + "step": 60640 + }, + { + "epoch": 0.7580439510987774, + "grad_norm": 0.398805171251297, + "learning_rate": 3.359608234670808e-06, + "loss": 0.5784, + "step": 60642 + }, + { + "epoch": 0.7580689517237931, + "grad_norm": 1.4195207357406616, + "learning_rate": 3.358955752427967e-06, + "loss": 0.6782, + "step": 60644 + }, + { + "epoch": 0.7580939523488087, + "grad_norm": 3.1559040546417236, + "learning_rate": 3.3583033207623837e-06, + "loss": 1.3772, + "step": 60646 + }, + { + "epoch": 0.7581189529738244, + "grad_norm": 1.1844550371170044, + "learning_rate": 3.357650939679019e-06, + "loss": 0.0448, + "step": 60648 + }, + { + "epoch": 0.75814395359884, + "grad_norm": 6.278709411621094, + "learning_rate": 3.35699860918285e-06, + "loss": 1.1767, + "step": 60650 + }, + { + "epoch": 0.7581689542238556, + "grad_norm": 2.2375593185424805, + "learning_rate": 3.3563463292788388e-06, + "loss": 1.1154, + "step": 60652 + }, + { + "epoch": 0.7581939548488712, + "grad_norm": 1.0984715223312378, + "learning_rate": 3.3556940999719523e-06, + "loss": 0.0599, + "step": 60654 + }, + { + "epoch": 0.7582189554738868, + "grad_norm": 0.0005982201546430588, + "learning_rate": 3.3550419212671626e-06, + "loss": 0.0538, + "step": 60656 + }, + { + "epoch": 0.7582439560989025, + "grad_norm": 3.166175365447998, + "learning_rate": 3.3543897931694305e-06, + "loss": 0.9608, + "step": 60658 + }, + { + "epoch": 0.7582689567239181, + "grad_norm": 3.0595014095306396, + "learning_rate": 3.353737715683729e-06, + "loss": 0.9609, + "step": 60660 + }, + { + "epoch": 0.7582939573489337, + "grad_norm": 2.89758563041687, + "learning_rate": 3.353085688815019e-06, + "loss": 1.7038, + "step": 60662 + }, + { + "epoch": 0.7583189579739493, + "grad_norm": 1.1893905401229858, + "learning_rate": 3.3524337125682704e-06, + "loss": 1.0282, + "step": 60664 + }, + { + "epoch": 0.758343958598965, + "grad_norm": 4.0143232345581055, + "learning_rate": 3.351781786948447e-06, + "loss": 1.1856, + "step": 60666 + }, + { + "epoch": 0.7583689592239806, + "grad_norm": 0.001639898749999702, + "learning_rate": 3.35112991196051e-06, + "loss": 0.5832, + "step": 60668 + }, + { + "epoch": 0.7583939598489963, + "grad_norm": 0.8202118873596191, + "learning_rate": 3.350478087609431e-06, + "loss": 0.0685, + "step": 60670 + }, + { + "epoch": 0.7584189604740118, + "grad_norm": 4.884792804718018, + "learning_rate": 3.3498263139001675e-06, + "loss": 1.1922, + "step": 60672 + }, + { + "epoch": 0.7584439610990275, + "grad_norm": 1.1086732149124146, + "learning_rate": 3.3491745908376893e-06, + "loss": 0.5381, + "step": 60674 + }, + { + "epoch": 0.7584689617240431, + "grad_norm": 4.7879462242126465, + "learning_rate": 3.3485229184269543e-06, + "loss": 0.9846, + "step": 60676 + }, + { + "epoch": 0.7584939623490587, + "grad_norm": 3.0854265689849854, + "learning_rate": 3.347871296672931e-06, + "loss": 0.9144, + "step": 60678 + }, + { + "epoch": 0.7585189629740744, + "grad_norm": 0.0011444552801549435, + "learning_rate": 3.347219725580578e-06, + "loss": 0.4926, + "step": 60680 + }, + { + "epoch": 0.7585439635990899, + "grad_norm": 7.2100372314453125, + "learning_rate": 3.3465682051548576e-06, + "loss": 1.8133, + "step": 60682 + }, + { + "epoch": 0.7585689642241056, + "grad_norm": 4.092147350311279, + "learning_rate": 3.345916735400735e-06, + "loss": 1.6519, + "step": 60684 + }, + { + "epoch": 0.7585939648491212, + "grad_norm": 0.43504565954208374, + "learning_rate": 3.3452653163231663e-06, + "loss": 0.6293, + "step": 60686 + }, + { + "epoch": 0.7586189654741369, + "grad_norm": 3.304715394973755, + "learning_rate": 3.344613947927119e-06, + "loss": 0.7601, + "step": 60688 + }, + { + "epoch": 0.7586439660991525, + "grad_norm": 3.405027151107788, + "learning_rate": 3.3439626302175475e-06, + "loss": 1.007, + "step": 60690 + }, + { + "epoch": 0.7586689667241681, + "grad_norm": 2.666006565093994, + "learning_rate": 3.3433113631994196e-06, + "loss": 1.5251, + "step": 60692 + }, + { + "epoch": 0.7586939673491837, + "grad_norm": 0.0007641041884198785, + "learning_rate": 3.34266014687769e-06, + "loss": 1.0278, + "step": 60694 + }, + { + "epoch": 0.7587189679741994, + "grad_norm": 2.288954257965088, + "learning_rate": 3.3420089812573165e-06, + "loss": 0.2304, + "step": 60696 + }, + { + "epoch": 0.758743968599215, + "grad_norm": 0.48086580634117126, + "learning_rate": 3.3413578663432632e-06, + "loss": 0.7219, + "step": 60698 + }, + { + "epoch": 0.7587689692242306, + "grad_norm": 0.5678996443748474, + "learning_rate": 3.3407068021404854e-06, + "loss": 0.394, + "step": 60700 + }, + { + "epoch": 0.7587939698492462, + "grad_norm": 2.6854090690612793, + "learning_rate": 3.3400557886539443e-06, + "loss": 1.1065, + "step": 60702 + }, + { + "epoch": 0.7588189704742618, + "grad_norm": 0.001365677104331553, + "learning_rate": 3.339404825888598e-06, + "loss": 0.5399, + "step": 60704 + }, + { + "epoch": 0.7588439710992775, + "grad_norm": 3.32126522064209, + "learning_rate": 3.3387539138493984e-06, + "loss": 0.6109, + "step": 60706 + }, + { + "epoch": 0.7588689717242931, + "grad_norm": 3.31380295753479, + "learning_rate": 3.3381030525413107e-06, + "loss": 0.5734, + "step": 60708 + }, + { + "epoch": 0.7588939723493088, + "grad_norm": 2.2575979232788086, + "learning_rate": 3.3374522419692845e-06, + "loss": 0.5329, + "step": 60710 + }, + { + "epoch": 0.7589189729743243, + "grad_norm": 0.7790845036506653, + "learning_rate": 3.3368014821382833e-06, + "loss": 0.1035, + "step": 60712 + }, + { + "epoch": 0.75894397359934, + "grad_norm": 3.0101988315582275, + "learning_rate": 3.336150773053257e-06, + "loss": 1.6001, + "step": 60714 + }, + { + "epoch": 0.7589689742243556, + "grad_norm": 0.0027102364692837, + "learning_rate": 3.3355001147191634e-06, + "loss": 0.8012, + "step": 60716 + }, + { + "epoch": 0.7589939748493713, + "grad_norm": 3.5990421772003174, + "learning_rate": 3.334849507140965e-06, + "loss": 1.9588, + "step": 60718 + }, + { + "epoch": 0.7590189754743869, + "grad_norm": 2.621527910232544, + "learning_rate": 3.334198950323604e-06, + "loss": 0.24, + "step": 60720 + }, + { + "epoch": 0.7590439760994024, + "grad_norm": 4.6583476066589355, + "learning_rate": 3.3335484442720444e-06, + "loss": 0.7214, + "step": 60722 + }, + { + "epoch": 0.7590689767244181, + "grad_norm": 4.311223983764648, + "learning_rate": 3.3328979889912337e-06, + "loss": 1.0529, + "step": 60724 + }, + { + "epoch": 0.7590939773494337, + "grad_norm": 5.688976764678955, + "learning_rate": 3.3322475844861324e-06, + "loss": 1.3069, + "step": 60726 + }, + { + "epoch": 0.7591189779744494, + "grad_norm": 4.710403919219971, + "learning_rate": 3.331597230761687e-06, + "loss": 1.3996, + "step": 60728 + }, + { + "epoch": 0.759143978599465, + "grad_norm": 2.174899101257324, + "learning_rate": 3.330946927822858e-06, + "loss": 1.3986, + "step": 60730 + }, + { + "epoch": 0.7591689792244806, + "grad_norm": 0.9487651586532593, + "learning_rate": 3.330296675674595e-06, + "loss": 0.9222, + "step": 60732 + }, + { + "epoch": 0.7591939798494962, + "grad_norm": 2.1729607582092285, + "learning_rate": 3.329646474321844e-06, + "loss": 1.2588, + "step": 60734 + }, + { + "epoch": 0.7592189804745119, + "grad_norm": 0.0009566702065058053, + "learning_rate": 3.3289963237695664e-06, + "loss": 0.5631, + "step": 60736 + }, + { + "epoch": 0.7592439810995275, + "grad_norm": 0.9164433479309082, + "learning_rate": 3.328346224022706e-06, + "loss": 0.0353, + "step": 60738 + }, + { + "epoch": 0.7592689817245432, + "grad_norm": 3.3483381271362305, + "learning_rate": 3.327696175086218e-06, + "loss": 0.6283, + "step": 60740 + }, + { + "epoch": 0.7592939823495587, + "grad_norm": 4.120086193084717, + "learning_rate": 3.3270461769650554e-06, + "loss": 1.5535, + "step": 60742 + }, + { + "epoch": 0.7593189829745743, + "grad_norm": 2.9224038124084473, + "learning_rate": 3.326396229664165e-06, + "loss": 0.9662, + "step": 60744 + }, + { + "epoch": 0.75934398359959, + "grad_norm": 8.044983863830566, + "learning_rate": 3.3257463331884966e-06, + "loss": 2.0952, + "step": 60746 + }, + { + "epoch": 0.7593689842246056, + "grad_norm": 0.9386069178581238, + "learning_rate": 3.325096487542998e-06, + "loss": 0.1772, + "step": 60748 + }, + { + "epoch": 0.7593939848496213, + "grad_norm": 2.7748610973358154, + "learning_rate": 3.3244466927326236e-06, + "loss": 0.9811, + "step": 60750 + }, + { + "epoch": 0.7594189854746368, + "grad_norm": 3.453012228012085, + "learning_rate": 3.3237969487623167e-06, + "loss": 1.0103, + "step": 60752 + }, + { + "epoch": 0.7594439860996525, + "grad_norm": 1.8840872049331665, + "learning_rate": 3.323147255637027e-06, + "loss": 0.559, + "step": 60754 + }, + { + "epoch": 0.7594689867246681, + "grad_norm": 1.93742835521698, + "learning_rate": 3.3224976133617103e-06, + "loss": 0.6964, + "step": 60756 + }, + { + "epoch": 0.7594939873496838, + "grad_norm": 8.29323673248291, + "learning_rate": 3.321848021941302e-06, + "loss": 2.0344, + "step": 60758 + }, + { + "epoch": 0.7595189879746994, + "grad_norm": 3.686129570007324, + "learning_rate": 3.321198481380756e-06, + "loss": 1.1207, + "step": 60760 + }, + { + "epoch": 0.759543988599715, + "grad_norm": 3.3091988563537598, + "learning_rate": 3.3205489916850165e-06, + "loss": 1.4108, + "step": 60762 + }, + { + "epoch": 0.7595689892247306, + "grad_norm": 0.9476190209388733, + "learning_rate": 3.3198995528590306e-06, + "loss": 0.654, + "step": 60764 + }, + { + "epoch": 0.7595939898497462, + "grad_norm": 2.250178575515747, + "learning_rate": 3.319250164907749e-06, + "loss": 0.7768, + "step": 60766 + }, + { + "epoch": 0.7596189904747619, + "grad_norm": 1.9175071716308594, + "learning_rate": 3.318600827836109e-06, + "loss": 0.1892, + "step": 60768 + }, + { + "epoch": 0.7596439910997775, + "grad_norm": 3.8530867099761963, + "learning_rate": 3.3179515416490683e-06, + "loss": 1.7414, + "step": 60770 + }, + { + "epoch": 0.7596689917247931, + "grad_norm": 3.8777363300323486, + "learning_rate": 3.317302306351555e-06, + "loss": 0.7801, + "step": 60772 + }, + { + "epoch": 0.7596939923498087, + "grad_norm": 0.6448186635971069, + "learning_rate": 3.3166531219485274e-06, + "loss": 0.5424, + "step": 60774 + }, + { + "epoch": 0.7597189929748244, + "grad_norm": 5.07872200012207, + "learning_rate": 3.316003988444921e-06, + "loss": 0.7507, + "step": 60776 + }, + { + "epoch": 0.75974399359984, + "grad_norm": 0.014085585251450539, + "learning_rate": 3.3153549058456826e-06, + "loss": 1.4443, + "step": 60778 + }, + { + "epoch": 0.7597689942248557, + "grad_norm": 12.648582458496094, + "learning_rate": 3.3147058741557593e-06, + "loss": 2.1286, + "step": 60780 + }, + { + "epoch": 0.7597939948498712, + "grad_norm": 1.0759302377700806, + "learning_rate": 3.3140568933800908e-06, + "loss": 0.3035, + "step": 60782 + }, + { + "epoch": 0.7598189954748868, + "grad_norm": 3.749504804611206, + "learning_rate": 3.3134079635236193e-06, + "loss": 0.5188, + "step": 60784 + }, + { + "epoch": 0.7598439960999025, + "grad_norm": 1.2076059579849243, + "learning_rate": 3.3127590845912837e-06, + "loss": 0.0349, + "step": 60786 + }, + { + "epoch": 0.7598689967249181, + "grad_norm": 2.6355273723602295, + "learning_rate": 3.3121102565880335e-06, + "loss": 1.4004, + "step": 60788 + }, + { + "epoch": 0.7598939973499338, + "grad_norm": 1.9784269332885742, + "learning_rate": 3.3114614795188014e-06, + "loss": 0.6705, + "step": 60790 + }, + { + "epoch": 0.7599189979749493, + "grad_norm": 0.0023278144653886557, + "learning_rate": 3.3108127533885337e-06, + "loss": 0.0202, + "step": 60792 + }, + { + "epoch": 0.759943998599965, + "grad_norm": 2.6606228351593018, + "learning_rate": 3.310164078202173e-06, + "loss": 0.9304, + "step": 60794 + }, + { + "epoch": 0.7599689992249806, + "grad_norm": 5.106963634490967, + "learning_rate": 3.309515453964657e-06, + "loss": 2.3172, + "step": 60796 + }, + { + "epoch": 0.7599939998499963, + "grad_norm": 4.918137073516846, + "learning_rate": 3.308866880680924e-06, + "loss": 1.0404, + "step": 60798 + }, + { + "epoch": 0.7600190004750119, + "grad_norm": 3.5309178829193115, + "learning_rate": 3.3082183583559133e-06, + "loss": 2.2363, + "step": 60800 + }, + { + "epoch": 0.7600440011000275, + "grad_norm": 2.9303011894226074, + "learning_rate": 3.3075698869945638e-06, + "loss": 0.0751, + "step": 60802 + }, + { + "epoch": 0.7600690017250431, + "grad_norm": 0.0005399655783548951, + "learning_rate": 3.306921466601819e-06, + "loss": 0.4892, + "step": 60804 + }, + { + "epoch": 0.7600940023500588, + "grad_norm": 3.8999276161193848, + "learning_rate": 3.306273097182612e-06, + "loss": 1.0307, + "step": 60806 + }, + { + "epoch": 0.7601190029750744, + "grad_norm": 1.6753005981445312, + "learning_rate": 3.305624778741887e-06, + "loss": 0.4847, + "step": 60808 + }, + { + "epoch": 0.76014400360009, + "grad_norm": 0.47942662239074707, + "learning_rate": 3.3049765112845723e-06, + "loss": 0.3071, + "step": 60810 + }, + { + "epoch": 0.7601690042251056, + "grad_norm": 2.814577579498291, + "learning_rate": 3.304328294815612e-06, + "loss": 1.4529, + "step": 60812 + }, + { + "epoch": 0.7601940048501212, + "grad_norm": 2.2387003898620605, + "learning_rate": 3.303680129339937e-06, + "loss": 0.5743, + "step": 60814 + }, + { + "epoch": 0.7602190054751369, + "grad_norm": 4.857999801635742, + "learning_rate": 3.3030320148624873e-06, + "loss": 1.1699, + "step": 60816 + }, + { + "epoch": 0.7602440061001525, + "grad_norm": 1.13772451877594, + "learning_rate": 3.3023839513882027e-06, + "loss": 0.0577, + "step": 60818 + }, + { + "epoch": 0.7602690067251682, + "grad_norm": 3.9982919692993164, + "learning_rate": 3.3017359389220117e-06, + "loss": 0.2031, + "step": 60820 + }, + { + "epoch": 0.7602940073501837, + "grad_norm": 2.9506633281707764, + "learning_rate": 3.3010879774688577e-06, + "loss": 1.1221, + "step": 60822 + }, + { + "epoch": 0.7603190079751994, + "grad_norm": 5.98234748840332, + "learning_rate": 3.3004400670336644e-06, + "loss": 0.7149, + "step": 60824 + }, + { + "epoch": 0.760344008600215, + "grad_norm": 0.561211347579956, + "learning_rate": 3.2997922076213727e-06, + "loss": 0.9368, + "step": 60826 + }, + { + "epoch": 0.7603690092252307, + "grad_norm": 1.7569506168365479, + "learning_rate": 3.2991443992369186e-06, + "loss": 0.7387, + "step": 60828 + }, + { + "epoch": 0.7603940098502463, + "grad_norm": 0.00935299601405859, + "learning_rate": 3.2984966418852314e-06, + "loss": 0.1812, + "step": 60830 + }, + { + "epoch": 0.7604190104752618, + "grad_norm": 1.5310399532318115, + "learning_rate": 3.2978489355712485e-06, + "loss": 0.8578, + "step": 60832 + }, + { + "epoch": 0.7604440111002775, + "grad_norm": 5.2125020027160645, + "learning_rate": 3.2972012802999e-06, + "loss": 2.5272, + "step": 60834 + }, + { + "epoch": 0.7604690117252931, + "grad_norm": 1.0675086975097656, + "learning_rate": 3.2965536760761186e-06, + "loss": 0.1912, + "step": 60836 + }, + { + "epoch": 0.7604940123503088, + "grad_norm": 0.4683547616004944, + "learning_rate": 3.295906122904833e-06, + "loss": 0.42, + "step": 60838 + }, + { + "epoch": 0.7605190129753244, + "grad_norm": 2.817997694015503, + "learning_rate": 3.2952586207909787e-06, + "loss": 0.2867, + "step": 60840 + }, + { + "epoch": 0.76054401360034, + "grad_norm": 5.13368034362793, + "learning_rate": 3.29461116973949e-06, + "loss": 1.4016, + "step": 60842 + }, + { + "epoch": 0.7605690142253556, + "grad_norm": 4.159456253051758, + "learning_rate": 3.2939637697552907e-06, + "loss": 0.5101, + "step": 60844 + }, + { + "epoch": 0.7605940148503713, + "grad_norm": 1.0926172733306885, + "learning_rate": 3.293316420843319e-06, + "loss": 0.024, + "step": 60846 + }, + { + "epoch": 0.7606190154753869, + "grad_norm": 2.0869626998901367, + "learning_rate": 3.2926691230085006e-06, + "loss": 0.7957, + "step": 60848 + }, + { + "epoch": 0.7606440161004026, + "grad_norm": 0.0010587220313027501, + "learning_rate": 3.2920218762557656e-06, + "loss": 0.0127, + "step": 60850 + }, + { + "epoch": 0.7606690167254181, + "grad_norm": 4.788615703582764, + "learning_rate": 3.29137468059004e-06, + "loss": 0.3617, + "step": 60852 + }, + { + "epoch": 0.7606940173504337, + "grad_norm": 3.0294883251190186, + "learning_rate": 3.290727536016257e-06, + "loss": 1.132, + "step": 60854 + }, + { + "epoch": 0.7607190179754494, + "grad_norm": 4.748729228973389, + "learning_rate": 3.290080442539346e-06, + "loss": 1.8352, + "step": 60856 + }, + { + "epoch": 0.760744018600465, + "grad_norm": 1.3422001600265503, + "learning_rate": 3.2894334001642315e-06, + "loss": 0.8978, + "step": 60858 + }, + { + "epoch": 0.7607690192254807, + "grad_norm": 3.337899684906006, + "learning_rate": 3.28878640889585e-06, + "loss": 1.0898, + "step": 60860 + }, + { + "epoch": 0.7607940198504962, + "grad_norm": 3.135878801345825, + "learning_rate": 3.2881394687391154e-06, + "loss": 0.5215, + "step": 60862 + }, + { + "epoch": 0.7608190204755119, + "grad_norm": 5.749073505401611, + "learning_rate": 3.287492579698961e-06, + "loss": 1.6768, + "step": 60864 + }, + { + "epoch": 0.7608440211005275, + "grad_norm": 4.687914848327637, + "learning_rate": 3.2868457417803166e-06, + "loss": 0.7474, + "step": 60866 + }, + { + "epoch": 0.7608690217255432, + "grad_norm": 2.7651097774505615, + "learning_rate": 3.2861989549881035e-06, + "loss": 0.5366, + "step": 60868 + }, + { + "epoch": 0.7608940223505588, + "grad_norm": 2.694324493408203, + "learning_rate": 3.285552219327253e-06, + "loss": 1.1798, + "step": 60870 + }, + { + "epoch": 0.7609190229755743, + "grad_norm": 0.155620276927948, + "learning_rate": 3.2849055348026836e-06, + "loss": 0.1265, + "step": 60872 + }, + { + "epoch": 0.76094402360059, + "grad_norm": 3.270500421524048, + "learning_rate": 3.2842589014193304e-06, + "loss": 0.7805, + "step": 60874 + }, + { + "epoch": 0.7609690242256056, + "grad_norm": 2.965292453765869, + "learning_rate": 3.283612319182106e-06, + "loss": 0.2366, + "step": 60876 + }, + { + "epoch": 0.7609940248506213, + "grad_norm": 6.853362560272217, + "learning_rate": 3.2829657880959398e-06, + "loss": 0.4506, + "step": 60878 + }, + { + "epoch": 0.7610190254756369, + "grad_norm": 2.918536901473999, + "learning_rate": 3.282319308165759e-06, + "loss": 1.8553, + "step": 60880 + }, + { + "epoch": 0.7610440261006525, + "grad_norm": 4.144326686859131, + "learning_rate": 3.2816728793964816e-06, + "loss": 0.8453, + "step": 60882 + }, + { + "epoch": 0.7610690267256681, + "grad_norm": 2.5797324180603027, + "learning_rate": 3.2810265017930366e-06, + "loss": 1.1942, + "step": 60884 + }, + { + "epoch": 0.7610940273506838, + "grad_norm": 3.0963871479034424, + "learning_rate": 3.2803801753603436e-06, + "loss": 1.195, + "step": 60886 + }, + { + "epoch": 0.7611190279756994, + "grad_norm": 3.665318489074707, + "learning_rate": 3.279733900103322e-06, + "loss": 0.6968, + "step": 60888 + }, + { + "epoch": 0.7611440286007151, + "grad_norm": 3.969874620437622, + "learning_rate": 3.2790876760269008e-06, + "loss": 0.9041, + "step": 60890 + }, + { + "epoch": 0.7611690292257306, + "grad_norm": 0.7871903777122498, + "learning_rate": 3.278441503135993e-06, + "loss": 0.0748, + "step": 60892 + }, + { + "epoch": 0.7611940298507462, + "grad_norm": 0.9951147437095642, + "learning_rate": 3.277795381435528e-06, + "loss": 0.2646, + "step": 60894 + }, + { + "epoch": 0.7612190304757619, + "grad_norm": 5.4365386962890625, + "learning_rate": 3.2771493109304185e-06, + "loss": 0.958, + "step": 60896 + }, + { + "epoch": 0.7612440311007775, + "grad_norm": 5.417089462280273, + "learning_rate": 3.2765032916255936e-06, + "loss": 1.1307, + "step": 60898 + }, + { + "epoch": 0.7612690317257932, + "grad_norm": 1.2617087364196777, + "learning_rate": 3.2758573235259682e-06, + "loss": 0.9986, + "step": 60900 + }, + { + "epoch": 0.7612940323508087, + "grad_norm": 0.015755048021674156, + "learning_rate": 3.2752114066364594e-06, + "loss": 0.6655, + "step": 60902 + }, + { + "epoch": 0.7613190329758244, + "grad_norm": 3.557065725326538, + "learning_rate": 3.2745655409619926e-06, + "loss": 1.1687, + "step": 60904 + }, + { + "epoch": 0.76134403360084, + "grad_norm": 0.38656511902809143, + "learning_rate": 3.2739197265074806e-06, + "loss": 0.007, + "step": 60906 + }, + { + "epoch": 0.7613690342258557, + "grad_norm": 2.402127981185913, + "learning_rate": 3.2732739632778486e-06, + "loss": 0.6763, + "step": 60908 + }, + { + "epoch": 0.7613940348508713, + "grad_norm": 4.556275367736816, + "learning_rate": 3.272628251278007e-06, + "loss": 0.5504, + "step": 60910 + }, + { + "epoch": 0.7614190354758869, + "grad_norm": 0.5448907017707825, + "learning_rate": 3.271982590512881e-06, + "loss": 0.0169, + "step": 60912 + }, + { + "epoch": 0.7614440361009025, + "grad_norm": 2.946852207183838, + "learning_rate": 3.271336980987383e-06, + "loss": 0.8512, + "step": 60914 + }, + { + "epoch": 0.7614690367259181, + "grad_norm": 3.2298643589019775, + "learning_rate": 3.270691422706429e-06, + "loss": 1.0052, + "step": 60916 + }, + { + "epoch": 0.7614940373509338, + "grad_norm": 5.069606781005859, + "learning_rate": 3.27004591567494e-06, + "loss": 0.6744, + "step": 60918 + }, + { + "epoch": 0.7615190379759494, + "grad_norm": 4.07796573638916, + "learning_rate": 3.269400459897827e-06, + "loss": 1.1943, + "step": 60920 + }, + { + "epoch": 0.761544038600965, + "grad_norm": 2.854308605194092, + "learning_rate": 3.268755055380011e-06, + "loss": 1.5321, + "step": 60922 + }, + { + "epoch": 0.7615690392259806, + "grad_norm": 0.9822505712509155, + "learning_rate": 3.268109702126402e-06, + "loss": 0.464, + "step": 60924 + }, + { + "epoch": 0.7615940398509963, + "grad_norm": 3.531933546066284, + "learning_rate": 3.2674644001419196e-06, + "loss": 1.7554, + "step": 60926 + }, + { + "epoch": 0.7616190404760119, + "grad_norm": 4.296471118927002, + "learning_rate": 3.266819149431477e-06, + "loss": 2.1943, + "step": 60928 + }, + { + "epoch": 0.7616440411010276, + "grad_norm": 3.4055228233337402, + "learning_rate": 3.2661739499999855e-06, + "loss": 2.1517, + "step": 60930 + }, + { + "epoch": 0.7616690417260431, + "grad_norm": 6.9124579429626465, + "learning_rate": 3.265528801852362e-06, + "loss": 1.2383, + "step": 60932 + }, + { + "epoch": 0.7616940423510588, + "grad_norm": 5.063586235046387, + "learning_rate": 3.264883704993518e-06, + "loss": 1.4304, + "step": 60934 + }, + { + "epoch": 0.7617190429760744, + "grad_norm": 3.026566982269287, + "learning_rate": 3.264238659428368e-06, + "loss": 0.0834, + "step": 60936 + }, + { + "epoch": 0.76174404360109, + "grad_norm": 3.05547833442688, + "learning_rate": 3.263593665161823e-06, + "loss": 1.0801, + "step": 60938 + }, + { + "epoch": 0.7617690442261057, + "grad_norm": 2.464155912399292, + "learning_rate": 3.262948722198798e-06, + "loss": 0.9138, + "step": 60940 + }, + { + "epoch": 0.7617940448511212, + "grad_norm": 4.259618759155273, + "learning_rate": 3.2623038305442024e-06, + "loss": 0.692, + "step": 60942 + }, + { + "epoch": 0.7618190454761369, + "grad_norm": 0.19056694209575653, + "learning_rate": 3.261658990202945e-06, + "loss": 0.3854, + "step": 60944 + }, + { + "epoch": 0.7618440461011525, + "grad_norm": 1.4341400861740112, + "learning_rate": 3.261014201179944e-06, + "loss": 1.3125, + "step": 60946 + }, + { + "epoch": 0.7618690467261682, + "grad_norm": 3.7139997482299805, + "learning_rate": 3.260369463480101e-06, + "loss": 0.5345, + "step": 60948 + }, + { + "epoch": 0.7618940473511838, + "grad_norm": 6.521985054016113, + "learning_rate": 3.2597247771083362e-06, + "loss": 2.2364, + "step": 60950 + }, + { + "epoch": 0.7619190479761994, + "grad_norm": 3.425795078277588, + "learning_rate": 3.259080142069553e-06, + "loss": 1.1321, + "step": 60952 + }, + { + "epoch": 0.761944048601215, + "grad_norm": 4.097689628601074, + "learning_rate": 3.258435558368659e-06, + "loss": 1.1115, + "step": 60954 + }, + { + "epoch": 0.7619690492262307, + "grad_norm": 3.2601585388183594, + "learning_rate": 3.2577910260105704e-06, + "loss": 1.1808, + "step": 60956 + }, + { + "epoch": 0.7619940498512463, + "grad_norm": 4.986323833465576, + "learning_rate": 3.2571465450001883e-06, + "loss": 2.2246, + "step": 60958 + }, + { + "epoch": 0.762019050476262, + "grad_norm": 0.8227254152297974, + "learning_rate": 3.256502115342428e-06, + "loss": 1.2053, + "step": 60960 + }, + { + "epoch": 0.7620440511012775, + "grad_norm": 0.0008964231237769127, + "learning_rate": 3.2558577370421906e-06, + "loss": 0.3073, + "step": 60962 + }, + { + "epoch": 0.7620690517262931, + "grad_norm": 8.340296745300293, + "learning_rate": 3.2552134101043895e-06, + "loss": 1.2582, + "step": 60964 + }, + { + "epoch": 0.7620940523513088, + "grad_norm": 3.961477756500244, + "learning_rate": 3.2545691345339304e-06, + "loss": 1.8315, + "step": 60966 + }, + { + "epoch": 0.7621190529763244, + "grad_norm": 1.199817419052124, + "learning_rate": 3.253924910335715e-06, + "loss": 0.4439, + "step": 60968 + }, + { + "epoch": 0.7621440536013401, + "grad_norm": 0.022848952561616898, + "learning_rate": 3.253280737514657e-06, + "loss": 0.0137, + "step": 60970 + }, + { + "epoch": 0.7621690542263556, + "grad_norm": 3.6680421829223633, + "learning_rate": 3.252636616075656e-06, + "loss": 1.6082, + "step": 60972 + }, + { + "epoch": 0.7621940548513713, + "grad_norm": 1.1385375261306763, + "learning_rate": 3.2519925460236236e-06, + "loss": 0.1065, + "step": 60974 + }, + { + "epoch": 0.7622190554763869, + "grad_norm": 3.884338140487671, + "learning_rate": 3.2513485273634572e-06, + "loss": 0.5604, + "step": 60976 + }, + { + "epoch": 0.7622440561014026, + "grad_norm": 2.4901044368743896, + "learning_rate": 3.2507045601000707e-06, + "loss": 0.1047, + "step": 60978 + }, + { + "epoch": 0.7622690567264182, + "grad_norm": 3.8248488903045654, + "learning_rate": 3.250060644238363e-06, + "loss": 1.1897, + "step": 60980 + }, + { + "epoch": 0.7622940573514337, + "grad_norm": 5.0009846687316895, + "learning_rate": 3.2494167797832367e-06, + "loss": 1.64, + "step": 60982 + }, + { + "epoch": 0.7623190579764494, + "grad_norm": 3.560452699661255, + "learning_rate": 3.2487729667395996e-06, + "loss": 0.1807, + "step": 60984 + }, + { + "epoch": 0.762344058601465, + "grad_norm": 4.418520927429199, + "learning_rate": 3.2481292051123503e-06, + "loss": 1.0808, + "step": 60986 + }, + { + "epoch": 0.7623690592264807, + "grad_norm": 4.836862087249756, + "learning_rate": 3.247485494906397e-06, + "loss": 1.2344, + "step": 60988 + }, + { + "epoch": 0.7623940598514963, + "grad_norm": 4.275575160980225, + "learning_rate": 3.246841836126636e-06, + "loss": 0.8148, + "step": 60990 + }, + { + "epoch": 0.7624190604765119, + "grad_norm": 0.0018421501154080033, + "learning_rate": 3.246198228777976e-06, + "loss": 0.4034, + "step": 60992 + }, + { + "epoch": 0.7624440611015275, + "grad_norm": 1.0957298278808594, + "learning_rate": 3.2455546728653155e-06, + "loss": 1.1091, + "step": 60994 + }, + { + "epoch": 0.7624690617265432, + "grad_norm": 1.5647621154785156, + "learning_rate": 3.2449111683935517e-06, + "loss": 0.5137, + "step": 60996 + }, + { + "epoch": 0.7624940623515588, + "grad_norm": 3.2977752685546875, + "learning_rate": 3.2442677153675927e-06, + "loss": 0.9602, + "step": 60998 + }, + { + "epoch": 0.7625190629765745, + "grad_norm": 5.755178451538086, + "learning_rate": 3.243624313792333e-06, + "loss": 1.177, + "step": 61000 + }, + { + "epoch": 0.76254406360159, + "grad_norm": 2.9501328468322754, + "learning_rate": 3.2429809636726773e-06, + "loss": 0.5939, + "step": 61002 + }, + { + "epoch": 0.7625690642266056, + "grad_norm": 1.9891666173934937, + "learning_rate": 3.242337665013523e-06, + "loss": 1.1007, + "step": 61004 + }, + { + "epoch": 0.7625940648516213, + "grad_norm": 6.962606430053711, + "learning_rate": 3.2416944178197663e-06, + "loss": 0.9334, + "step": 61006 + }, + { + "epoch": 0.7626190654766369, + "grad_norm": 2.6502435207366943, + "learning_rate": 3.2410512220963132e-06, + "loss": 1.0877, + "step": 61008 + }, + { + "epoch": 0.7626440661016526, + "grad_norm": 1.9068303108215332, + "learning_rate": 3.240408077848054e-06, + "loss": 0.649, + "step": 61010 + }, + { + "epoch": 0.7626690667266681, + "grad_norm": 0.009640434756875038, + "learning_rate": 3.239764985079895e-06, + "loss": 0.0002, + "step": 61012 + }, + { + "epoch": 0.7626940673516838, + "grad_norm": 3.8883471488952637, + "learning_rate": 3.2391219437967257e-06, + "loss": 1.086, + "step": 61014 + }, + { + "epoch": 0.7627190679766994, + "grad_norm": 3.0004541873931885, + "learning_rate": 3.238478954003452e-06, + "loss": 1.0031, + "step": 61016 + }, + { + "epoch": 0.7627440686017151, + "grad_norm": 4.9006266593933105, + "learning_rate": 3.237836015704966e-06, + "loss": 1.1917, + "step": 61018 + }, + { + "epoch": 0.7627690692267307, + "grad_norm": 6.315701484680176, + "learning_rate": 3.237193128906161e-06, + "loss": 0.6511, + "step": 61020 + }, + { + "epoch": 0.7627940698517462, + "grad_norm": 0.002075859112665057, + "learning_rate": 3.23655029361194e-06, + "loss": 0.1308, + "step": 61022 + }, + { + "epoch": 0.7628190704767619, + "grad_norm": 3.7943155765533447, + "learning_rate": 3.235907509827193e-06, + "loss": 1.8114, + "step": 61024 + }, + { + "epoch": 0.7628440711017775, + "grad_norm": 2.0403993129730225, + "learning_rate": 3.23526477755682e-06, + "loss": 1.5692, + "step": 61026 + }, + { + "epoch": 0.7628690717267932, + "grad_norm": 1.7392795085906982, + "learning_rate": 3.2346220968057117e-06, + "loss": 0.7746, + "step": 61028 + }, + { + "epoch": 0.7628940723518088, + "grad_norm": 1.9342392683029175, + "learning_rate": 3.233979467578767e-06, + "loss": 0.5142, + "step": 61030 + }, + { + "epoch": 0.7629190729768244, + "grad_norm": 0.2726365923881531, + "learning_rate": 3.2333368898808794e-06, + "loss": 0.3639, + "step": 61032 + }, + { + "epoch": 0.76294407360184, + "grad_norm": 2.6525936126708984, + "learning_rate": 3.2326943637169363e-06, + "loss": 0.2491, + "step": 61034 + }, + { + "epoch": 0.7629690742268557, + "grad_norm": 4.411500453948975, + "learning_rate": 3.2320518890918406e-06, + "loss": 1.7656, + "step": 61036 + }, + { + "epoch": 0.7629940748518713, + "grad_norm": 5.844754695892334, + "learning_rate": 3.231409466010477e-06, + "loss": 1.2053, + "step": 61038 + }, + { + "epoch": 0.763019075476887, + "grad_norm": 2.0743567943573, + "learning_rate": 3.2307670944777434e-06, + "loss": 1.3567, + "step": 61040 + }, + { + "epoch": 0.7630440761019025, + "grad_norm": 5.4800567626953125, + "learning_rate": 3.2301247744985285e-06, + "loss": 0.524, + "step": 61042 + }, + { + "epoch": 0.7630690767269181, + "grad_norm": 0.014265358448028564, + "learning_rate": 3.2294825060777293e-06, + "loss": 0.6712, + "step": 61044 + }, + { + "epoch": 0.7630940773519338, + "grad_norm": 5.146373748779297, + "learning_rate": 3.2288402892202343e-06, + "loss": 0.7722, + "step": 61046 + }, + { + "epoch": 0.7631190779769494, + "grad_norm": 0.0005354839377105236, + "learning_rate": 3.2281981239309302e-06, + "loss": 1.2047, + "step": 61048 + }, + { + "epoch": 0.7631440786019651, + "grad_norm": 0.5164008140563965, + "learning_rate": 3.227556010214714e-06, + "loss": 0.9313, + "step": 61050 + }, + { + "epoch": 0.7631690792269806, + "grad_norm": 3.9081404209136963, + "learning_rate": 3.226913948076471e-06, + "loss": 0.6189, + "step": 61052 + }, + { + "epoch": 0.7631940798519963, + "grad_norm": 4.004461765289307, + "learning_rate": 3.2262719375210973e-06, + "loss": 0.8397, + "step": 61054 + }, + { + "epoch": 0.7632190804770119, + "grad_norm": 6.274811267852783, + "learning_rate": 3.225629978553478e-06, + "loss": 1.0141, + "step": 61056 + }, + { + "epoch": 0.7632440811020276, + "grad_norm": 0.0015444769524037838, + "learning_rate": 3.224988071178501e-06, + "loss": 0.4738, + "step": 61058 + }, + { + "epoch": 0.7632690817270432, + "grad_norm": 1.624319314956665, + "learning_rate": 3.2243462154010585e-06, + "loss": 0.0803, + "step": 61060 + }, + { + "epoch": 0.7632940823520588, + "grad_norm": 2.935291290283203, + "learning_rate": 3.223704411226034e-06, + "loss": 0.1236, + "step": 61062 + }, + { + "epoch": 0.7633190829770744, + "grad_norm": 0.6141936182975769, + "learning_rate": 3.223062658658321e-06, + "loss": 0.0757, + "step": 61064 + }, + { + "epoch": 0.76334408360209, + "grad_norm": 3.9992194175720215, + "learning_rate": 3.222420957702803e-06, + "loss": 1.0341, + "step": 61066 + }, + { + "epoch": 0.7633690842271057, + "grad_norm": 0.0009900174336507916, + "learning_rate": 3.2217793083643713e-06, + "loss": 0.3672, + "step": 61068 + }, + { + "epoch": 0.7633940848521213, + "grad_norm": 3.011165142059326, + "learning_rate": 3.2211377106479092e-06, + "loss": 0.8076, + "step": 61070 + }, + { + "epoch": 0.7634190854771369, + "grad_norm": 3.5697245597839355, + "learning_rate": 3.2204961645583e-06, + "loss": 0.6299, + "step": 61072 + }, + { + "epoch": 0.7634440861021525, + "grad_norm": 1.9281878471374512, + "learning_rate": 3.2198546701004373e-06, + "loss": 0.3815, + "step": 61074 + }, + { + "epoch": 0.7634690867271682, + "grad_norm": 4.682060241699219, + "learning_rate": 3.2192132272791997e-06, + "loss": 0.6331, + "step": 61076 + }, + { + "epoch": 0.7634940873521838, + "grad_norm": 9.370803833007812, + "learning_rate": 3.218571836099478e-06, + "loss": 0.938, + "step": 61078 + }, + { + "epoch": 0.7635190879771995, + "grad_norm": 2.4429547786712646, + "learning_rate": 3.2179304965661507e-06, + "loss": 0.7857, + "step": 61080 + }, + { + "epoch": 0.763544088602215, + "grad_norm": 2.6249759197235107, + "learning_rate": 3.2172892086841103e-06, + "loss": 1.1571, + "step": 61082 + }, + { + "epoch": 0.7635690892272307, + "grad_norm": 3.0610098838806152, + "learning_rate": 3.2166479724582344e-06, + "loss": 0.7756, + "step": 61084 + }, + { + "epoch": 0.7635940898522463, + "grad_norm": 4.146885871887207, + "learning_rate": 3.2160067878934054e-06, + "loss": 1.5597, + "step": 61086 + }, + { + "epoch": 0.763619090477262, + "grad_norm": 1.5533052682876587, + "learning_rate": 3.2153656549945135e-06, + "loss": 0.084, + "step": 61088 + }, + { + "epoch": 0.7636440911022776, + "grad_norm": 2.9979097843170166, + "learning_rate": 3.2147245737664335e-06, + "loss": 0.5936, + "step": 61090 + }, + { + "epoch": 0.7636690917272931, + "grad_norm": 2.4017248153686523, + "learning_rate": 3.214083544214055e-06, + "loss": 1.2059, + "step": 61092 + }, + { + "epoch": 0.7636940923523088, + "grad_norm": 3.8115012645721436, + "learning_rate": 3.213442566342253e-06, + "loss": 1.8933, + "step": 61094 + }, + { + "epoch": 0.7637190929773244, + "grad_norm": 4.423490047454834, + "learning_rate": 3.212801640155916e-06, + "loss": 1.044, + "step": 61096 + }, + { + "epoch": 0.7637440936023401, + "grad_norm": 1.1049549579620361, + "learning_rate": 3.212160765659923e-06, + "loss": 0.6344, + "step": 61098 + }, + { + "epoch": 0.7637690942273557, + "grad_norm": 4.71026611328125, + "learning_rate": 3.211519942859149e-06, + "loss": 1.3033, + "step": 61100 + }, + { + "epoch": 0.7637940948523713, + "grad_norm": 0.0010655575897544622, + "learning_rate": 3.2108791717584832e-06, + "loss": 0.8647, + "step": 61102 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 2.5256495475769043, + "learning_rate": 3.210238452362798e-06, + "loss": 1.4467, + "step": 61104 + }, + { + "epoch": 0.7638440961024026, + "grad_norm": 1.3535914421081543, + "learning_rate": 3.20959778467698e-06, + "loss": 0.0977, + "step": 61106 + }, + { + "epoch": 0.7638690967274182, + "grad_norm": 5.33596134185791, + "learning_rate": 3.208957168705905e-06, + "loss": 1.1324, + "step": 61108 + }, + { + "epoch": 0.7638940973524339, + "grad_norm": 4.040293216705322, + "learning_rate": 3.2083166044544488e-06, + "loss": 0.9196, + "step": 61110 + }, + { + "epoch": 0.7639190979774494, + "grad_norm": 8.86044692993164, + "learning_rate": 3.2076760919274964e-06, + "loss": 1.2787, + "step": 61112 + }, + { + "epoch": 0.763944098602465, + "grad_norm": 9.836567878723145, + "learning_rate": 3.2070356311299176e-06, + "loss": 1.7571, + "step": 61114 + }, + { + "epoch": 0.7639690992274807, + "grad_norm": 5.280313491821289, + "learning_rate": 3.2063952220665994e-06, + "loss": 0.8762, + "step": 61116 + }, + { + "epoch": 0.7639940998524963, + "grad_norm": 5.631299018859863, + "learning_rate": 3.205754864742411e-06, + "loss": 1.5603, + "step": 61118 + }, + { + "epoch": 0.764019100477512, + "grad_norm": 3.790902853012085, + "learning_rate": 3.2051145591622335e-06, + "loss": 1.0726, + "step": 61120 + }, + { + "epoch": 0.7640441011025275, + "grad_norm": 2.3155763149261475, + "learning_rate": 3.204474305330948e-06, + "loss": 0.8567, + "step": 61122 + }, + { + "epoch": 0.7640691017275432, + "grad_norm": 1.295339584350586, + "learning_rate": 3.2038341032534194e-06, + "loss": 0.8839, + "step": 61124 + }, + { + "epoch": 0.7640941023525588, + "grad_norm": 3.0212242603302, + "learning_rate": 3.203193952934531e-06, + "loss": 1.3407, + "step": 61126 + }, + { + "epoch": 0.7641191029775745, + "grad_norm": 2.9689383506774902, + "learning_rate": 3.2025538543791546e-06, + "loss": 1.0256, + "step": 61128 + }, + { + "epoch": 0.7641441036025901, + "grad_norm": 0.7348242998123169, + "learning_rate": 3.2019138075921696e-06, + "loss": 1.8015, + "step": 61130 + }, + { + "epoch": 0.7641691042276056, + "grad_norm": 3.6257615089416504, + "learning_rate": 3.2012738125784448e-06, + "loss": 1.4118, + "step": 61132 + }, + { + "epoch": 0.7641941048526213, + "grad_norm": 9.743130683898926, + "learning_rate": 3.2006338693428606e-06, + "loss": 1.5833, + "step": 61134 + }, + { + "epoch": 0.7642191054776369, + "grad_norm": 2.0031747817993164, + "learning_rate": 3.1999939778902866e-06, + "loss": 1.1906, + "step": 61136 + }, + { + "epoch": 0.7642441061026526, + "grad_norm": 4.5264081954956055, + "learning_rate": 3.199354138225594e-06, + "loss": 0.2385, + "step": 61138 + }, + { + "epoch": 0.7642691067276682, + "grad_norm": 4.066230773925781, + "learning_rate": 3.198714350353661e-06, + "loss": 1.6696, + "step": 61140 + }, + { + "epoch": 0.7642941073526838, + "grad_norm": 3.4857516288757324, + "learning_rate": 3.198074614279356e-06, + "loss": 0.9673, + "step": 61142 + }, + { + "epoch": 0.7643191079776994, + "grad_norm": 10.580997467041016, + "learning_rate": 3.1974349300075512e-06, + "loss": 0.7751, + "step": 61144 + }, + { + "epoch": 0.7643441086027151, + "grad_norm": 2.5362043380737305, + "learning_rate": 3.196795297543125e-06, + "loss": 0.7776, + "step": 61146 + }, + { + "epoch": 0.7643691092277307, + "grad_norm": 1.012454867362976, + "learning_rate": 3.196155716890943e-06, + "loss": 1.2361, + "step": 61148 + }, + { + "epoch": 0.7643941098527464, + "grad_norm": 4.400866508483887, + "learning_rate": 3.1955161880558762e-06, + "loss": 1.1099, + "step": 61150 + }, + { + "epoch": 0.7644191104777619, + "grad_norm": 0.0005142256850376725, + "learning_rate": 3.194876711042794e-06, + "loss": 0.4801, + "step": 61152 + }, + { + "epoch": 0.7644441111027775, + "grad_norm": 4.0881147384643555, + "learning_rate": 3.194237285856571e-06, + "loss": 1.312, + "step": 61154 + }, + { + "epoch": 0.7644691117277932, + "grad_norm": 1.6863274574279785, + "learning_rate": 3.19359791250207e-06, + "loss": 1.1238, + "step": 61156 + }, + { + "epoch": 0.7644941123528088, + "grad_norm": 3.9198837280273438, + "learning_rate": 3.1929585909841664e-06, + "loss": 1.6886, + "step": 61158 + }, + { + "epoch": 0.7645191129778245, + "grad_norm": 3.0540859699249268, + "learning_rate": 3.1923193213077296e-06, + "loss": 0.6895, + "step": 61160 + }, + { + "epoch": 0.76454411360284, + "grad_norm": 0.001064568990841508, + "learning_rate": 3.191680103477627e-06, + "loss": 0.3764, + "step": 61162 + }, + { + "epoch": 0.7645691142278557, + "grad_norm": 4.74360990524292, + "learning_rate": 3.191040937498726e-06, + "loss": 2.0484, + "step": 61164 + }, + { + "epoch": 0.7645941148528713, + "grad_norm": 5.022472858428955, + "learning_rate": 3.1904018233758903e-06, + "loss": 1.1254, + "step": 61166 + }, + { + "epoch": 0.764619115477887, + "grad_norm": 2.9674642086029053, + "learning_rate": 3.1897627611139957e-06, + "loss": 0.5778, + "step": 61168 + }, + { + "epoch": 0.7646441161029026, + "grad_norm": 2.760448455810547, + "learning_rate": 3.1891237507179007e-06, + "loss": 0.7883, + "step": 61170 + }, + { + "epoch": 0.7646691167279182, + "grad_norm": 2.495922565460205, + "learning_rate": 3.1884847921924767e-06, + "loss": 2.2674, + "step": 61172 + }, + { + "epoch": 0.7646941173529338, + "grad_norm": 0.31336158514022827, + "learning_rate": 3.1878458855425954e-06, + "loss": 0.0056, + "step": 61174 + }, + { + "epoch": 0.7647191179779494, + "grad_norm": 4.948886871337891, + "learning_rate": 3.1872070307731107e-06, + "loss": 0.856, + "step": 61176 + }, + { + "epoch": 0.7647441186029651, + "grad_norm": 3.7699341773986816, + "learning_rate": 3.1865682278888964e-06, + "loss": 0.5933, + "step": 61178 + }, + { + "epoch": 0.7647691192279807, + "grad_norm": 0.0007008984684944153, + "learning_rate": 3.185929476894811e-06, + "loss": 0.1634, + "step": 61180 + }, + { + "epoch": 0.7647941198529963, + "grad_norm": 2.72318959236145, + "learning_rate": 3.185290777795724e-06, + "loss": 0.6828, + "step": 61182 + }, + { + "epoch": 0.7648191204780119, + "grad_norm": 4.438253402709961, + "learning_rate": 3.1846521305965026e-06, + "loss": 0.8492, + "step": 61184 + }, + { + "epoch": 0.7648441211030276, + "grad_norm": 0.0008285744115710258, + "learning_rate": 3.184013535302003e-06, + "loss": 0.0003, + "step": 61186 + }, + { + "epoch": 0.7648691217280432, + "grad_norm": 0.9424312114715576, + "learning_rate": 3.1833749919170986e-06, + "loss": 0.9441, + "step": 61188 + }, + { + "epoch": 0.7648941223530589, + "grad_norm": 2.201908588409424, + "learning_rate": 3.1827365004466404e-06, + "loss": 0.3692, + "step": 61190 + }, + { + "epoch": 0.7649191229780744, + "grad_norm": 2.6752281188964844, + "learning_rate": 3.1820980608955e-06, + "loss": 1.1451, + "step": 61192 + }, + { + "epoch": 0.76494412360309, + "grad_norm": 0.0003365148149896413, + "learning_rate": 3.1814596732685333e-06, + "loss": 0.5174, + "step": 61194 + }, + { + "epoch": 0.7649691242281057, + "grad_norm": 1.4006136655807495, + "learning_rate": 3.1808213375706053e-06, + "loss": 0.4659, + "step": 61196 + }, + { + "epoch": 0.7649941248531213, + "grad_norm": 2.916893720626831, + "learning_rate": 3.1801830538065802e-06, + "loss": 0.4576, + "step": 61198 + }, + { + "epoch": 0.765019125478137, + "grad_norm": 1.1224223375320435, + "learning_rate": 3.1795448219813174e-06, + "loss": 0.1377, + "step": 61200 + }, + { + "epoch": 0.7650441261031525, + "grad_norm": 4.964963912963867, + "learning_rate": 3.178906642099677e-06, + "loss": 1.1076, + "step": 61202 + }, + { + "epoch": 0.7650691267281682, + "grad_norm": 3.370652198791504, + "learning_rate": 3.178268514166516e-06, + "loss": 1.0378, + "step": 61204 + }, + { + "epoch": 0.7650941273531838, + "grad_norm": 4.725106716156006, + "learning_rate": 3.177630438186696e-06, + "loss": 0.9601, + "step": 61206 + }, + { + "epoch": 0.7651191279781995, + "grad_norm": 7.6754069328308105, + "learning_rate": 3.176992414165082e-06, + "loss": 0.4926, + "step": 61208 + }, + { + "epoch": 0.7651441286032151, + "grad_norm": 1.9799526929855347, + "learning_rate": 3.176354442106526e-06, + "loss": 0.1164, + "step": 61210 + }, + { + "epoch": 0.7651691292282307, + "grad_norm": 2.002793550491333, + "learning_rate": 3.1757165220158924e-06, + "loss": 1.4095, + "step": 61212 + }, + { + "epoch": 0.7651941298532463, + "grad_norm": 3.08341646194458, + "learning_rate": 3.1750786538980373e-06, + "loss": 0.3554, + "step": 61214 + }, + { + "epoch": 0.765219130478262, + "grad_norm": 2.8768579959869385, + "learning_rate": 3.1744408377578174e-06, + "loss": 0.3522, + "step": 61216 + }, + { + "epoch": 0.7652441311032776, + "grad_norm": 3.741156578063965, + "learning_rate": 3.173803073600087e-06, + "loss": 1.1717, + "step": 61218 + }, + { + "epoch": 0.7652691317282933, + "grad_norm": 7.208263874053955, + "learning_rate": 3.1731653614297087e-06, + "loss": 1.399, + "step": 61220 + }, + { + "epoch": 0.7652941323533088, + "grad_norm": 3.3152921199798584, + "learning_rate": 3.1725277012515397e-06, + "loss": 0.6345, + "step": 61222 + }, + { + "epoch": 0.7653191329783244, + "grad_norm": 0.00031114951707422733, + "learning_rate": 3.1718900930704312e-06, + "loss": 0.0659, + "step": 61224 + }, + { + "epoch": 0.7653441336033401, + "grad_norm": 2.193721294403076, + "learning_rate": 3.1712525368912483e-06, + "loss": 1.3625, + "step": 61226 + }, + { + "epoch": 0.7653691342283557, + "grad_norm": 2.264101982116699, + "learning_rate": 3.1706150327188346e-06, + "loss": 0.0774, + "step": 61228 + }, + { + "epoch": 0.7653941348533714, + "grad_norm": 8.880990982055664, + "learning_rate": 3.1699775805580534e-06, + "loss": 1.1408, + "step": 61230 + }, + { + "epoch": 0.7654191354783869, + "grad_norm": 3.2881557941436768, + "learning_rate": 3.1693401804137548e-06, + "loss": 0.3652, + "step": 61232 + }, + { + "epoch": 0.7654441361034026, + "grad_norm": 4.127604007720947, + "learning_rate": 3.1687028322907952e-06, + "loss": 1.5062, + "step": 61234 + }, + { + "epoch": 0.7654691367284182, + "grad_norm": 7.389291763305664, + "learning_rate": 3.1680655361940317e-06, + "loss": 1.0505, + "step": 61236 + }, + { + "epoch": 0.7654941373534339, + "grad_norm": 0.0005673312698490918, + "learning_rate": 3.1674282921283116e-06, + "loss": 0.5933, + "step": 61238 + }, + { + "epoch": 0.7655191379784495, + "grad_norm": 5.529262065887451, + "learning_rate": 3.1667911000984987e-06, + "loss": 1.1247, + "step": 61240 + }, + { + "epoch": 0.765544138603465, + "grad_norm": 2.160088062286377, + "learning_rate": 3.1661539601094317e-06, + "loss": 0.6892, + "step": 61242 + }, + { + "epoch": 0.7655691392284807, + "grad_norm": 0.0011940774274989963, + "learning_rate": 3.165516872165969e-06, + "loss": 0.17, + "step": 61244 + }, + { + "epoch": 0.7655941398534963, + "grad_norm": 2.600912094116211, + "learning_rate": 3.164879836272967e-06, + "loss": 0.5968, + "step": 61246 + }, + { + "epoch": 0.765619140478512, + "grad_norm": 2.7321043014526367, + "learning_rate": 3.164242852435271e-06, + "loss": 0.8652, + "step": 61248 + }, + { + "epoch": 0.7656441411035276, + "grad_norm": 2.8903257846832275, + "learning_rate": 3.1636059206577376e-06, + "loss": 1.3333, + "step": 61250 + }, + { + "epoch": 0.7656691417285432, + "grad_norm": 3.2818076610565186, + "learning_rate": 3.162969040945215e-06, + "loss": 0.1627, + "step": 61252 + }, + { + "epoch": 0.7656941423535588, + "grad_norm": 3.928741693496704, + "learning_rate": 3.1623322133025526e-06, + "loss": 0.7581, + "step": 61254 + }, + { + "epoch": 0.7657191429785745, + "grad_norm": 5.974032878875732, + "learning_rate": 3.1616954377345987e-06, + "loss": 0.8521, + "step": 61256 + }, + { + "epoch": 0.7657441436035901, + "grad_norm": 5.963344097137451, + "learning_rate": 3.1610587142462046e-06, + "loss": 1.1753, + "step": 61258 + }, + { + "epoch": 0.7657691442286058, + "grad_norm": 4.838668346405029, + "learning_rate": 3.160422042842224e-06, + "loss": 0.6344, + "step": 61260 + }, + { + "epoch": 0.7657941448536213, + "grad_norm": 0.6772949695587158, + "learning_rate": 3.159785423527498e-06, + "loss": 0.0079, + "step": 61262 + }, + { + "epoch": 0.7658191454786369, + "grad_norm": 5.054482460021973, + "learning_rate": 3.1591488563068827e-06, + "loss": 1.3653, + "step": 61264 + }, + { + "epoch": 0.7658441461036526, + "grad_norm": 5.875047206878662, + "learning_rate": 3.1585123411852225e-06, + "loss": 1.0765, + "step": 61266 + }, + { + "epoch": 0.7658691467286682, + "grad_norm": 1.3014370203018188, + "learning_rate": 3.1578758781673613e-06, + "loss": 0.4257, + "step": 61268 + }, + { + "epoch": 0.7658941473536839, + "grad_norm": 2.8776907920837402, + "learning_rate": 3.157239467258153e-06, + "loss": 1.2603, + "step": 61270 + }, + { + "epoch": 0.7659191479786994, + "grad_norm": 2.584059953689575, + "learning_rate": 3.1566031084624383e-06, + "loss": 1.5145, + "step": 61272 + }, + { + "epoch": 0.7659441486037151, + "grad_norm": 4.576301097869873, + "learning_rate": 3.1559668017850696e-06, + "loss": 0.2463, + "step": 61274 + }, + { + "epoch": 0.7659691492287307, + "grad_norm": 0.0016471322160214186, + "learning_rate": 3.1553305472308873e-06, + "loss": 0.5046, + "step": 61276 + }, + { + "epoch": 0.7659941498537464, + "grad_norm": 2.162775993347168, + "learning_rate": 3.154694344804745e-06, + "loss": 0.4498, + "step": 61278 + }, + { + "epoch": 0.766019150478762, + "grad_norm": 2.0139830112457275, + "learning_rate": 3.154058194511477e-06, + "loss": 0.9169, + "step": 61280 + }, + { + "epoch": 0.7660441511037775, + "grad_norm": 5.175442218780518, + "learning_rate": 3.1534220963559324e-06, + "loss": 1.0232, + "step": 61282 + }, + { + "epoch": 0.7660691517287932, + "grad_norm": 3.294447660446167, + "learning_rate": 3.152786050342962e-06, + "loss": 1.2956, + "step": 61284 + }, + { + "epoch": 0.7660941523538088, + "grad_norm": 1.506393313407898, + "learning_rate": 3.1521500564774e-06, + "loss": 0.4897, + "step": 61286 + }, + { + "epoch": 0.7661191529788245, + "grad_norm": 3.6222517490386963, + "learning_rate": 3.1515141147640995e-06, + "loss": 1.4917, + "step": 61288 + }, + { + "epoch": 0.7661441536038401, + "grad_norm": 6.957711219787598, + "learning_rate": 3.150878225207894e-06, + "loss": 0.6685, + "step": 61290 + }, + { + "epoch": 0.7661691542288557, + "grad_norm": 0.0005947590689174831, + "learning_rate": 3.1502423878136358e-06, + "loss": 1.2562, + "step": 61292 + }, + { + "epoch": 0.7661941548538713, + "grad_norm": 4.405770778656006, + "learning_rate": 3.149606602586163e-06, + "loss": 2.5887, + "step": 61294 + }, + { + "epoch": 0.766219155478887, + "grad_norm": 4.552875518798828, + "learning_rate": 3.1489708695303133e-06, + "loss": 0.8818, + "step": 61296 + }, + { + "epoch": 0.7662441561039026, + "grad_norm": 2.50763201713562, + "learning_rate": 3.1483351886509374e-06, + "loss": 1.4394, + "step": 61298 + }, + { + "epoch": 0.7662691567289183, + "grad_norm": 3.316826820373535, + "learning_rate": 3.147699559952867e-06, + "loss": 1.0098, + "step": 61300 + }, + { + "epoch": 0.7662941573539338, + "grad_norm": 0.0014700305182486773, + "learning_rate": 3.147063983440952e-06, + "loss": 0.7886, + "step": 61302 + }, + { + "epoch": 0.7663191579789495, + "grad_norm": 4.912408351898193, + "learning_rate": 3.146428459120028e-06, + "loss": 0.7839, + "step": 61304 + }, + { + "epoch": 0.7663441586039651, + "grad_norm": 2.5128958225250244, + "learning_rate": 3.145792986994933e-06, + "loss": 0.3792, + "step": 61306 + }, + { + "epoch": 0.7663691592289807, + "grad_norm": 3.9696271419525146, + "learning_rate": 3.1451575670705124e-06, + "loss": 1.8005, + "step": 61308 + }, + { + "epoch": 0.7663941598539964, + "grad_norm": 2.0607664585113525, + "learning_rate": 3.1445221993515994e-06, + "loss": 0.5268, + "step": 61310 + }, + { + "epoch": 0.7664191604790119, + "grad_norm": 2.404740810394287, + "learning_rate": 3.143886883843038e-06, + "loss": 1.003, + "step": 61312 + }, + { + "epoch": 0.7664441611040276, + "grad_norm": 4.195521831512451, + "learning_rate": 3.143251620549662e-06, + "loss": 1.5275, + "step": 61314 + }, + { + "epoch": 0.7664691617290432, + "grad_norm": 0.004127690568566322, + "learning_rate": 3.1426164094763158e-06, + "loss": 0.4907, + "step": 61316 + }, + { + "epoch": 0.7664941623540589, + "grad_norm": 2.032172203063965, + "learning_rate": 3.1419812506278336e-06, + "loss": 0.7582, + "step": 61318 + }, + { + "epoch": 0.7665191629790745, + "grad_norm": 0.0017086791340261698, + "learning_rate": 3.1413461440090476e-06, + "loss": 0.8788, + "step": 61320 + }, + { + "epoch": 0.7665441636040901, + "grad_norm": 0.0016842962941154838, + "learning_rate": 3.140711089624804e-06, + "loss": 0.078, + "step": 61322 + }, + { + "epoch": 0.7665691642291057, + "grad_norm": 5.032011032104492, + "learning_rate": 3.1400760874799305e-06, + "loss": 1.3922, + "step": 61324 + }, + { + "epoch": 0.7665941648541214, + "grad_norm": 3.204404592514038, + "learning_rate": 3.1394411375792723e-06, + "loss": 0.8556, + "step": 61326 + }, + { + "epoch": 0.766619165479137, + "grad_norm": 0.0032666262704879045, + "learning_rate": 3.1388062399276554e-06, + "loss": 0.4618, + "step": 61328 + }, + { + "epoch": 0.7666441661041526, + "grad_norm": 3.835798978805542, + "learning_rate": 3.1381713945299242e-06, + "loss": 0.1804, + "step": 61330 + }, + { + "epoch": 0.7666691667291682, + "grad_norm": 3.203390598297119, + "learning_rate": 3.137536601390908e-06, + "loss": 1.1393, + "step": 61332 + }, + { + "epoch": 0.7666941673541838, + "grad_norm": 3.7063398361206055, + "learning_rate": 3.13690186051544e-06, + "loss": 0.8085, + "step": 61334 + }, + { + "epoch": 0.7667191679791995, + "grad_norm": 3.935396671295166, + "learning_rate": 3.1362671719083602e-06, + "loss": 1.4008, + "step": 61336 + }, + { + "epoch": 0.7667441686042151, + "grad_norm": 4.410689353942871, + "learning_rate": 3.1356325355744963e-06, + "loss": 2.5671, + "step": 61338 + }, + { + "epoch": 0.7667691692292308, + "grad_norm": 4.031057357788086, + "learning_rate": 3.134997951518687e-06, + "loss": 0.6744, + "step": 61340 + }, + { + "epoch": 0.7667941698542463, + "grad_norm": 0.9605569839477539, + "learning_rate": 3.1343634197457595e-06, + "loss": 0.447, + "step": 61342 + }, + { + "epoch": 0.766819170479262, + "grad_norm": 1.6502294540405273, + "learning_rate": 3.133728940260553e-06, + "loss": 0.9836, + "step": 61344 + }, + { + "epoch": 0.7668441711042776, + "grad_norm": 3.4466092586517334, + "learning_rate": 3.133094513067895e-06, + "loss": 0.5351, + "step": 61346 + }, + { + "epoch": 0.7668691717292933, + "grad_norm": 0.0004492508596740663, + "learning_rate": 3.132460138172615e-06, + "loss": 0.6841, + "step": 61348 + }, + { + "epoch": 0.7668941723543089, + "grad_norm": 0.0004908432601951063, + "learning_rate": 3.131825815579551e-06, + "loss": 0.2634, + "step": 61350 + }, + { + "epoch": 0.7669191729793244, + "grad_norm": 4.224722862243652, + "learning_rate": 3.131191545293527e-06, + "loss": 0.5991, + "step": 61352 + }, + { + "epoch": 0.7669441736043401, + "grad_norm": 3.4364402294158936, + "learning_rate": 3.13055732731938e-06, + "loss": 0.6431, + "step": 61354 + }, + { + "epoch": 0.7669691742293557, + "grad_norm": 4.96785831451416, + "learning_rate": 3.1299231616619376e-06, + "loss": 1.4111, + "step": 61356 + }, + { + "epoch": 0.7669941748543714, + "grad_norm": 3.320936918258667, + "learning_rate": 3.129289048326024e-06, + "loss": 1.3494, + "step": 61358 + }, + { + "epoch": 0.767019175479387, + "grad_norm": 2.9442129135131836, + "learning_rate": 3.1286549873164783e-06, + "loss": 0.4004, + "step": 61360 + }, + { + "epoch": 0.7670441761044026, + "grad_norm": 5.4646830558776855, + "learning_rate": 3.128020978638121e-06, + "loss": 1.3925, + "step": 61362 + }, + { + "epoch": 0.7670691767294182, + "grad_norm": 4.376415252685547, + "learning_rate": 3.127387022295787e-06, + "loss": 0.9816, + "step": 61364 + }, + { + "epoch": 0.7670941773544339, + "grad_norm": 6.7760910987854, + "learning_rate": 3.126753118294298e-06, + "loss": 1.7302, + "step": 61366 + }, + { + "epoch": 0.7671191779794495, + "grad_norm": 3.5574676990509033, + "learning_rate": 3.1261192666384886e-06, + "loss": 0.8176, + "step": 61368 + }, + { + "epoch": 0.7671441786044652, + "grad_norm": 4.122377872467041, + "learning_rate": 3.1254854673331837e-06, + "loss": 0.551, + "step": 61370 + }, + { + "epoch": 0.7671691792294807, + "grad_norm": 3.7336721420288086, + "learning_rate": 3.124851720383205e-06, + "loss": 2.0834, + "step": 61372 + }, + { + "epoch": 0.7671941798544963, + "grad_norm": 1.9630414247512817, + "learning_rate": 3.1242180257933875e-06, + "loss": 0.9061, + "step": 61374 + }, + { + "epoch": 0.767219180479512, + "grad_norm": 3.284985303878784, + "learning_rate": 3.12358438356855e-06, + "loss": 1.0287, + "step": 61376 + }, + { + "epoch": 0.7672441811045276, + "grad_norm": 5.835268020629883, + "learning_rate": 3.1229507937135237e-06, + "loss": 2.0987, + "step": 61378 + }, + { + "epoch": 0.7672691817295433, + "grad_norm": 0.0007367771468125284, + "learning_rate": 3.1223172562331294e-06, + "loss": 0.2167, + "step": 61380 + }, + { + "epoch": 0.7672941823545588, + "grad_norm": 2.9888060092926025, + "learning_rate": 3.1216837711321966e-06, + "loss": 0.2074, + "step": 61382 + }, + { + "epoch": 0.7673191829795745, + "grad_norm": 0.02589537389576435, + "learning_rate": 3.121050338415549e-06, + "loss": 0.1899, + "step": 61384 + }, + { + "epoch": 0.7673441836045901, + "grad_norm": 1.1425694227218628, + "learning_rate": 3.120416958088004e-06, + "loss": 0.3172, + "step": 61386 + }, + { + "epoch": 0.7673691842296058, + "grad_norm": 5.992547035217285, + "learning_rate": 3.119783630154395e-06, + "loss": 0.629, + "step": 61388 + }, + { + "epoch": 0.7673941848546214, + "grad_norm": 1.9444526433944702, + "learning_rate": 3.119150354619538e-06, + "loss": 1.0584, + "step": 61390 + }, + { + "epoch": 0.767419185479637, + "grad_norm": 8.96001148223877, + "learning_rate": 3.118517131488261e-06, + "loss": 2.411, + "step": 61392 + }, + { + "epoch": 0.7674441861046526, + "grad_norm": 5.209620952606201, + "learning_rate": 3.117883960765382e-06, + "loss": 0.877, + "step": 61394 + }, + { + "epoch": 0.7674691867296682, + "grad_norm": 1.3778047561645508, + "learning_rate": 3.117250842455729e-06, + "loss": 0.1018, + "step": 61396 + }, + { + "epoch": 0.7674941873546839, + "grad_norm": 3.880328893661499, + "learning_rate": 3.1166177765641193e-06, + "loss": 0.3944, + "step": 61398 + }, + { + "epoch": 0.7675191879796995, + "grad_norm": 2.6059796810150146, + "learning_rate": 3.115984763095373e-06, + "loss": 0.9666, + "step": 61400 + }, + { + "epoch": 0.7675441886047151, + "grad_norm": 3.017392635345459, + "learning_rate": 3.115351802054316e-06, + "loss": 0.9402, + "step": 61402 + }, + { + "epoch": 0.7675691892297307, + "grad_norm": 1.5391921997070312, + "learning_rate": 3.1147188934457628e-06, + "loss": 0.7624, + "step": 61404 + }, + { + "epoch": 0.7675941898547464, + "grad_norm": 6.268191814422607, + "learning_rate": 3.1140860372745395e-06, + "loss": 1.2656, + "step": 61406 + }, + { + "epoch": 0.767619190479762, + "grad_norm": 4.333344459533691, + "learning_rate": 3.1134532335454647e-06, + "loss": 1.3682, + "step": 61408 + }, + { + "epoch": 0.7676441911047777, + "grad_norm": 6.555422782897949, + "learning_rate": 3.112820482263351e-06, + "loss": 1.9494, + "step": 61410 + }, + { + "epoch": 0.7676691917297932, + "grad_norm": 3.4164538383483887, + "learning_rate": 3.1121877834330273e-06, + "loss": 1.0584, + "step": 61412 + }, + { + "epoch": 0.7676941923548088, + "grad_norm": 1.9975197315216064, + "learning_rate": 3.1115551370593044e-06, + "loss": 0.7576, + "step": 61414 + }, + { + "epoch": 0.7677191929798245, + "grad_norm": 2.3860435485839844, + "learning_rate": 3.110922543147006e-06, + "loss": 0.145, + "step": 61416 + }, + { + "epoch": 0.7677441936048401, + "grad_norm": 0.032858461141586304, + "learning_rate": 3.110290001700944e-06, + "loss": 0.7419, + "step": 61418 + }, + { + "epoch": 0.7677691942298558, + "grad_norm": 2.8847086429595947, + "learning_rate": 3.109657512725942e-06, + "loss": 1.2309, + "step": 61420 + }, + { + "epoch": 0.7677941948548713, + "grad_norm": 3.16902232170105, + "learning_rate": 3.109025076226815e-06, + "loss": 0.5685, + "step": 61422 + }, + { + "epoch": 0.767819195479887, + "grad_norm": 7.229469299316406, + "learning_rate": 3.108392692208375e-06, + "loss": 1.5107, + "step": 61424 + }, + { + "epoch": 0.7678441961049026, + "grad_norm": 6.0152153968811035, + "learning_rate": 3.1077603606754447e-06, + "loss": 0.163, + "step": 61426 + }, + { + "epoch": 0.7678691967299183, + "grad_norm": 5.318579196929932, + "learning_rate": 3.107128081632833e-06, + "loss": 1.9735, + "step": 61428 + }, + { + "epoch": 0.7678941973549339, + "grad_norm": 2.78159761428833, + "learning_rate": 3.106495855085363e-06, + "loss": 1.0779, + "step": 61430 + }, + { + "epoch": 0.7679191979799495, + "grad_norm": 5.510058879852295, + "learning_rate": 3.105863681037843e-06, + "loss": 0.7455, + "step": 61432 + }, + { + "epoch": 0.7679441986049651, + "grad_norm": 0.7060774564743042, + "learning_rate": 3.105231559495093e-06, + "loss": 0.7523, + "step": 61434 + }, + { + "epoch": 0.7679691992299807, + "grad_norm": 2.48494029045105, + "learning_rate": 3.104599490461924e-06, + "loss": 0.5817, + "step": 61436 + }, + { + "epoch": 0.7679941998549964, + "grad_norm": 2.6592113971710205, + "learning_rate": 3.1039674739431482e-06, + "loss": 0.4712, + "step": 61438 + }, + { + "epoch": 0.768019200480012, + "grad_norm": 4.523705959320068, + "learning_rate": 3.1033355099435824e-06, + "loss": 1.2587, + "step": 61440 + }, + { + "epoch": 0.7680442011050276, + "grad_norm": 2.9352686405181885, + "learning_rate": 3.1027035984680366e-06, + "loss": 0.5945, + "step": 61442 + }, + { + "epoch": 0.7680692017300432, + "grad_norm": 2.3510420322418213, + "learning_rate": 3.1020717395213263e-06, + "loss": 0.6282, + "step": 61444 + }, + { + "epoch": 0.7680942023550589, + "grad_norm": 5.192712783813477, + "learning_rate": 3.10143993310826e-06, + "loss": 1.4081, + "step": 61446 + }, + { + "epoch": 0.7681192029800745, + "grad_norm": 2.5616366863250732, + "learning_rate": 3.1008081792336555e-06, + "loss": 0.574, + "step": 61448 + }, + { + "epoch": 0.7681442036050902, + "grad_norm": 3.2783362865448, + "learning_rate": 3.1001764779023193e-06, + "loss": 0.8411, + "step": 61450 + }, + { + "epoch": 0.7681692042301057, + "grad_norm": 5.495917320251465, + "learning_rate": 3.09954482911906e-06, + "loss": 1.2498, + "step": 61452 + }, + { + "epoch": 0.7681942048551214, + "grad_norm": 6.474449634552002, + "learning_rate": 3.098913232888695e-06, + "loss": 1.0629, + "step": 61454 + }, + { + "epoch": 0.768219205480137, + "grad_norm": 0.5608630180358887, + "learning_rate": 3.0982816892160273e-06, + "loss": 0.2541, + "step": 61456 + }, + { + "epoch": 0.7682442061051526, + "grad_norm": 5.9994025230407715, + "learning_rate": 3.0976501981058736e-06, + "loss": 1.8865, + "step": 61458 + }, + { + "epoch": 0.7682692067301683, + "grad_norm": 3.587935209274292, + "learning_rate": 3.0970187595630373e-06, + "loss": 0.8701, + "step": 61460 + }, + { + "epoch": 0.7682942073551838, + "grad_norm": 4.358165264129639, + "learning_rate": 3.0963873735923335e-06, + "loss": 1.4969, + "step": 61462 + }, + { + "epoch": 0.7683192079801995, + "grad_norm": 3.095841646194458, + "learning_rate": 3.0957560401985664e-06, + "loss": 0.7811, + "step": 61464 + }, + { + "epoch": 0.7683442086052151, + "grad_norm": 0.989588737487793, + "learning_rate": 3.0951247593865417e-06, + "loss": 0.4751, + "step": 61466 + }, + { + "epoch": 0.7683692092302308, + "grad_norm": 5.10311222076416, + "learning_rate": 3.0944935311610736e-06, + "loss": 1.211, + "step": 61468 + }, + { + "epoch": 0.7683942098552464, + "grad_norm": 3.2826266288757324, + "learning_rate": 3.093862355526963e-06, + "loss": 1.137, + "step": 61470 + }, + { + "epoch": 0.768419210480262, + "grad_norm": 2.6703975200653076, + "learning_rate": 3.0932312324890236e-06, + "loss": 0.5914, + "step": 61472 + }, + { + "epoch": 0.7684442111052776, + "grad_norm": 3.410935640335083, + "learning_rate": 3.0926001620520584e-06, + "loss": 0.7035, + "step": 61474 + }, + { + "epoch": 0.7684692117302933, + "grad_norm": 2.3659420013427734, + "learning_rate": 3.09196914422087e-06, + "loss": 1.426, + "step": 61476 + }, + { + "epoch": 0.7684942123553089, + "grad_norm": 0.7773085236549377, + "learning_rate": 3.0913381790002715e-06, + "loss": 0.0328, + "step": 61478 + }, + { + "epoch": 0.7685192129803246, + "grad_norm": 3.3460395336151123, + "learning_rate": 3.090707266395061e-06, + "loss": 1.6337, + "step": 61480 + }, + { + "epoch": 0.7685442136053401, + "grad_norm": 4.8776164054870605, + "learning_rate": 3.0900764064100495e-06, + "loss": 0.4262, + "step": 61482 + }, + { + "epoch": 0.7685692142303557, + "grad_norm": 5.540456295013428, + "learning_rate": 3.089445599050037e-06, + "loss": 0.9981, + "step": 61484 + }, + { + "epoch": 0.7685942148553714, + "grad_norm": 5.262026309967041, + "learning_rate": 3.0888148443198316e-06, + "loss": 0.8545, + "step": 61486 + }, + { + "epoch": 0.768619215480387, + "grad_norm": 1.5098727941513062, + "learning_rate": 3.0881841422242352e-06, + "loss": 1.0148, + "step": 61488 + }, + { + "epoch": 0.7686442161054027, + "grad_norm": 2.3613133430480957, + "learning_rate": 3.0875534927680486e-06, + "loss": 0.3587, + "step": 61490 + }, + { + "epoch": 0.7686692167304182, + "grad_norm": 3.0125885009765625, + "learning_rate": 3.0869228959560793e-06, + "loss": 1.6485, + "step": 61492 + }, + { + "epoch": 0.7686942173554339, + "grad_norm": 5.289372444152832, + "learning_rate": 3.0862923517931255e-06, + "loss": 2.4374, + "step": 61494 + }, + { + "epoch": 0.7687192179804495, + "grad_norm": 4.632274150848389, + "learning_rate": 3.0856618602839937e-06, + "loss": 0.3078, + "step": 61496 + }, + { + "epoch": 0.7687442186054652, + "grad_norm": 2.007556200027466, + "learning_rate": 3.085031421433481e-06, + "loss": 1.6979, + "step": 61498 + }, + { + "epoch": 0.7687692192304808, + "grad_norm": 2.069988250732422, + "learning_rate": 3.084401035246394e-06, + "loss": 0.3417, + "step": 61500 + }, + { + "epoch": 0.7687942198554963, + "grad_norm": 0.9270830154418945, + "learning_rate": 3.0837707017275297e-06, + "loss": 0.7916, + "step": 61502 + }, + { + "epoch": 0.768819220480512, + "grad_norm": 2.626898765563965, + "learning_rate": 3.0831404208816874e-06, + "loss": 1.2859, + "step": 61504 + }, + { + "epoch": 0.7688442211055276, + "grad_norm": 4.752254486083984, + "learning_rate": 3.0825101927136734e-06, + "loss": 1.4565, + "step": 61506 + }, + { + "epoch": 0.7688692217305433, + "grad_norm": 1.7676180601119995, + "learning_rate": 3.081880017228279e-06, + "loss": 0.5589, + "step": 61508 + }, + { + "epoch": 0.7688942223555589, + "grad_norm": 2.85284423828125, + "learning_rate": 3.081249894430313e-06, + "loss": 0.5155, + "step": 61510 + }, + { + "epoch": 0.7689192229805745, + "grad_norm": 0.002920745639130473, + "learning_rate": 3.0806198243245645e-06, + "loss": 0.4758, + "step": 61512 + }, + { + "epoch": 0.7689442236055901, + "grad_norm": 0.002392473863437772, + "learning_rate": 3.0799898069158406e-06, + "loss": 0.6607, + "step": 61514 + }, + { + "epoch": 0.7689692242306058, + "grad_norm": 4.007440567016602, + "learning_rate": 3.0793598422089367e-06, + "loss": 0.6591, + "step": 61516 + }, + { + "epoch": 0.7689942248556214, + "grad_norm": 0.0012469067005440593, + "learning_rate": 3.0787299302086448e-06, + "loss": 0.0228, + "step": 61518 + }, + { + "epoch": 0.7690192254806371, + "grad_norm": 5.0244364738464355, + "learning_rate": 3.0781000709197717e-06, + "loss": 1.6949, + "step": 61520 + }, + { + "epoch": 0.7690442261056526, + "grad_norm": 2.0684754848480225, + "learning_rate": 3.0774702643471055e-06, + "loss": 0.3784, + "step": 61522 + }, + { + "epoch": 0.7690692267306682, + "grad_norm": 2.2808785438537598, + "learning_rate": 3.076840510495447e-06, + "loss": 0.5476, + "step": 61524 + }, + { + "epoch": 0.7690942273556839, + "grad_norm": 3.1375832557678223, + "learning_rate": 3.0762108093695986e-06, + "loss": 1.4303, + "step": 61526 + }, + { + "epoch": 0.7691192279806995, + "grad_norm": 4.18324613571167, + "learning_rate": 3.0755811609743425e-06, + "loss": 2.0086, + "step": 61528 + }, + { + "epoch": 0.7691442286057152, + "grad_norm": 31.648805618286133, + "learning_rate": 3.0749515653144857e-06, + "loss": 1.6144, + "step": 61530 + }, + { + "epoch": 0.7691692292307307, + "grad_norm": 2.1280534267425537, + "learning_rate": 3.0743220223948143e-06, + "loss": 0.584, + "step": 61532 + }, + { + "epoch": 0.7691942298557464, + "grad_norm": 2.0702600479125977, + "learning_rate": 3.0736925322201306e-06, + "loss": 0.7819, + "step": 61534 + }, + { + "epoch": 0.769219230480762, + "grad_norm": 1.7267745733261108, + "learning_rate": 3.0730630947952212e-06, + "loss": 0.6327, + "step": 61536 + }, + { + "epoch": 0.7692442311057777, + "grad_norm": 1.3550204038619995, + "learning_rate": 3.0724337101248845e-06, + "loss": 0.2444, + "step": 61538 + }, + { + "epoch": 0.7692692317307933, + "grad_norm": 3.818938970565796, + "learning_rate": 3.0718043782139194e-06, + "loss": 0.5329, + "step": 61540 + }, + { + "epoch": 0.7692942323558088, + "grad_norm": 0.6723132133483887, + "learning_rate": 3.0711750990671052e-06, + "loss": 0.7598, + "step": 61542 + }, + { + "epoch": 0.7693192329808245, + "grad_norm": 1.5109301805496216, + "learning_rate": 3.0705458726892456e-06, + "loss": 1.0563, + "step": 61544 + }, + { + "epoch": 0.7693442336058401, + "grad_norm": 3.3005895614624023, + "learning_rate": 3.069916699085126e-06, + "loss": 0.6783, + "step": 61546 + }, + { + "epoch": 0.7693692342308558, + "grad_norm": 1.3264989852905273, + "learning_rate": 3.06928757825954e-06, + "loss": 0.5472, + "step": 61548 + }, + { + "epoch": 0.7693942348558714, + "grad_norm": 5.979709148406982, + "learning_rate": 3.0686585102172826e-06, + "loss": 1.0596, + "step": 61550 + }, + { + "epoch": 0.769419235480887, + "grad_norm": 2.4543323516845703, + "learning_rate": 3.0680294949631427e-06, + "loss": 0.4435, + "step": 61552 + }, + { + "epoch": 0.7694442361059026, + "grad_norm": 4.002350807189941, + "learning_rate": 3.0674005325019097e-06, + "loss": 1.2886, + "step": 61554 + }, + { + "epoch": 0.7694692367309183, + "grad_norm": 7.692275524139404, + "learning_rate": 3.0667716228383705e-06, + "loss": 1.7658, + "step": 61556 + }, + { + "epoch": 0.7694942373559339, + "grad_norm": 3.8300793170928955, + "learning_rate": 3.066142765977321e-06, + "loss": 0.8717, + "step": 61558 + }, + { + "epoch": 0.7695192379809496, + "grad_norm": 2.300942897796631, + "learning_rate": 3.065513961923544e-06, + "loss": 0.7956, + "step": 61560 + }, + { + "epoch": 0.7695442386059651, + "grad_norm": 3.0090713500976562, + "learning_rate": 3.064885210681834e-06, + "loss": 1.0828, + "step": 61562 + }, + { + "epoch": 0.7695692392309808, + "grad_norm": 4.554634094238281, + "learning_rate": 3.064256512256979e-06, + "loss": 1.6316, + "step": 61564 + }, + { + "epoch": 0.7695942398559964, + "grad_norm": 3.377619504928589, + "learning_rate": 3.0636278666537657e-06, + "loss": 0.6255, + "step": 61566 + }, + { + "epoch": 0.769619240481012, + "grad_norm": 3.6069633960723877, + "learning_rate": 3.062999273876982e-06, + "loss": 0.2463, + "step": 61568 + }, + { + "epoch": 0.7696442411060277, + "grad_norm": 3.51969575881958, + "learning_rate": 3.0623707339314124e-06, + "loss": 0.845, + "step": 61570 + }, + { + "epoch": 0.7696692417310432, + "grad_norm": 0.007306392770260572, + "learning_rate": 3.061742246821848e-06, + "loss": 0.239, + "step": 61572 + }, + { + "epoch": 0.7696942423560589, + "grad_norm": 0.004822278395295143, + "learning_rate": 3.0611138125530714e-06, + "loss": 0.9542, + "step": 61574 + }, + { + "epoch": 0.7697192429810745, + "grad_norm": 3.741025924682617, + "learning_rate": 3.0604854311298704e-06, + "loss": 0.6492, + "step": 61576 + }, + { + "epoch": 0.7697442436060902, + "grad_norm": 3.355144500732422, + "learning_rate": 3.059857102557038e-06, + "loss": 1.1645, + "step": 61578 + }, + { + "epoch": 0.7697692442311058, + "grad_norm": 0.42983540892601013, + "learning_rate": 3.0592288268393457e-06, + "loss": 0.2498, + "step": 61580 + }, + { + "epoch": 0.7697942448561214, + "grad_norm": 3.166172504425049, + "learning_rate": 3.0586006039815875e-06, + "loss": 1.5738, + "step": 61582 + }, + { + "epoch": 0.769819245481137, + "grad_norm": 3.5823757648468018, + "learning_rate": 3.057972433988543e-06, + "loss": 0.7099, + "step": 61584 + }, + { + "epoch": 0.7698442461061527, + "grad_norm": 0.5010144114494324, + "learning_rate": 3.0573443168649987e-06, + "loss": 0.0159, + "step": 61586 + }, + { + "epoch": 0.7698692467311683, + "grad_norm": 2.7270777225494385, + "learning_rate": 3.0567162526157425e-06, + "loss": 0.0906, + "step": 61588 + }, + { + "epoch": 0.769894247356184, + "grad_norm": 2.0640792846679688, + "learning_rate": 3.0560882412455484e-06, + "loss": 0.7542, + "step": 61590 + }, + { + "epoch": 0.7699192479811995, + "grad_norm": 3.380892276763916, + "learning_rate": 3.0554602827592117e-06, + "loss": 1.7599, + "step": 61592 + }, + { + "epoch": 0.7699442486062151, + "grad_norm": 4.1712117195129395, + "learning_rate": 3.054832377161501e-06, + "loss": 1.9827, + "step": 61594 + }, + { + "epoch": 0.7699692492312308, + "grad_norm": 3.7958266735076904, + "learning_rate": 3.0542045244572083e-06, + "loss": 0.8328, + "step": 61596 + }, + { + "epoch": 0.7699942498562464, + "grad_norm": 1.6273612976074219, + "learning_rate": 3.053576724651107e-06, + "loss": 0.6471, + "step": 61598 + }, + { + "epoch": 0.7700192504812621, + "grad_norm": 0.8286917805671692, + "learning_rate": 3.052948977747985e-06, + "loss": 0.0925, + "step": 61600 + }, + { + "epoch": 0.7700442511062776, + "grad_norm": 11.987977027893066, + "learning_rate": 3.052321283752624e-06, + "loss": 0.6645, + "step": 61602 + }, + { + "epoch": 0.7700692517312933, + "grad_norm": 4.766299247741699, + "learning_rate": 3.0516936426698006e-06, + "loss": 1.1357, + "step": 61604 + }, + { + "epoch": 0.7700942523563089, + "grad_norm": 5.090693950653076, + "learning_rate": 3.051066054504297e-06, + "loss": 0.5331, + "step": 61606 + }, + { + "epoch": 0.7701192529813246, + "grad_norm": 5.131807804107666, + "learning_rate": 3.0504385192608875e-06, + "loss": 0.6023, + "step": 61608 + }, + { + "epoch": 0.7701442536063402, + "grad_norm": 2.747476100921631, + "learning_rate": 3.0498110369443556e-06, + "loss": 0.5537, + "step": 61610 + }, + { + "epoch": 0.7701692542313557, + "grad_norm": 3.852781057357788, + "learning_rate": 3.049183607559484e-06, + "loss": 1.1034, + "step": 61612 + }, + { + "epoch": 0.7701942548563714, + "grad_norm": 3.279568672180176, + "learning_rate": 3.0485562311110427e-06, + "loss": 1.2102, + "step": 61614 + }, + { + "epoch": 0.770219255481387, + "grad_norm": 4.988009452819824, + "learning_rate": 3.0479289076038176e-06, + "loss": 1.4494, + "step": 61616 + }, + { + "epoch": 0.7702442561064027, + "grad_norm": 3.730295419692993, + "learning_rate": 3.0473016370425832e-06, + "loss": 0.2504, + "step": 61618 + }, + { + "epoch": 0.7702692567314183, + "grad_norm": 0.7367613315582275, + "learning_rate": 3.0466744194321164e-06, + "loss": 0.1736, + "step": 61620 + }, + { + "epoch": 0.7702942573564339, + "grad_norm": 0.14408545196056366, + "learning_rate": 3.0460472547771915e-06, + "loss": 0.1861, + "step": 61622 + }, + { + "epoch": 0.7703192579814495, + "grad_norm": 4.178711414337158, + "learning_rate": 3.0454201430825857e-06, + "loss": 0.6244, + "step": 61624 + }, + { + "epoch": 0.7703442586064652, + "grad_norm": 3.7472267150878906, + "learning_rate": 3.0447930843530806e-06, + "loss": 0.8534, + "step": 61626 + }, + { + "epoch": 0.7703692592314808, + "grad_norm": 0.7663730978965759, + "learning_rate": 3.0441660785934444e-06, + "loss": 0.2396, + "step": 61628 + }, + { + "epoch": 0.7703942598564965, + "grad_norm": 3.844228506088257, + "learning_rate": 3.0435391258084634e-06, + "loss": 1.2163, + "step": 61630 + }, + { + "epoch": 0.770419260481512, + "grad_norm": 2.377674102783203, + "learning_rate": 3.0429122260028966e-06, + "loss": 0.7148, + "step": 61632 + }, + { + "epoch": 0.7704442611065276, + "grad_norm": 3.4045934677124023, + "learning_rate": 3.04228537918153e-06, + "loss": 1.1642, + "step": 61634 + }, + { + "epoch": 0.7704692617315433, + "grad_norm": 0.0005982082802802324, + "learning_rate": 3.041658585349132e-06, + "loss": 0.0001, + "step": 61636 + }, + { + "epoch": 0.7704942623565589, + "grad_norm": 3.7320098876953125, + "learning_rate": 3.0410318445104768e-06, + "loss": 0.3565, + "step": 61638 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 2.760603189468384, + "learning_rate": 3.040405156670343e-06, + "loss": 1.4684, + "step": 61640 + }, + { + "epoch": 0.7705442636065901, + "grad_norm": 3.1747303009033203, + "learning_rate": 3.0397785218334953e-06, + "loss": 1.1835, + "step": 61642 + }, + { + "epoch": 0.7705692642316058, + "grad_norm": 0.4166218340396881, + "learning_rate": 3.0391519400047178e-06, + "loss": 0.7535, + "step": 61644 + }, + { + "epoch": 0.7705942648566214, + "grad_norm": 1.070573329925537, + "learning_rate": 3.0385254111887674e-06, + "loss": 0.134, + "step": 61646 + }, + { + "epoch": 0.7706192654816371, + "grad_norm": 2.870187997817993, + "learning_rate": 3.037898935390423e-06, + "loss": 0.6915, + "step": 61648 + }, + { + "epoch": 0.7706442661066527, + "grad_norm": 3.0750482082366943, + "learning_rate": 3.0372725126144597e-06, + "loss": 0.639, + "step": 61650 + }, + { + "epoch": 0.7706692667316682, + "grad_norm": 4.992583751678467, + "learning_rate": 3.0366461428656403e-06, + "loss": 2.0203, + "step": 61652 + }, + { + "epoch": 0.7706942673566839, + "grad_norm": 1.0649640560150146, + "learning_rate": 3.0360198261487427e-06, + "loss": 0.8751, + "step": 61654 + }, + { + "epoch": 0.7707192679816995, + "grad_norm": 3.552985429763794, + "learning_rate": 3.0353935624685337e-06, + "loss": 0.7378, + "step": 61656 + }, + { + "epoch": 0.7707442686067152, + "grad_norm": 0.08333871513605118, + "learning_rate": 3.0347673518297814e-06, + "loss": 0.456, + "step": 61658 + }, + { + "epoch": 0.7707692692317308, + "grad_norm": 0.0013848546659573913, + "learning_rate": 3.0341411942372544e-06, + "loss": 0.112, + "step": 61660 + }, + { + "epoch": 0.7707942698567464, + "grad_norm": 2.6073100566864014, + "learning_rate": 3.033515089695722e-06, + "loss": 1.0009, + "step": 61662 + }, + { + "epoch": 0.770819270481762, + "grad_norm": 2.018132448196411, + "learning_rate": 3.0328890382099562e-06, + "loss": 0.7456, + "step": 61664 + }, + { + "epoch": 0.7708442711067777, + "grad_norm": 5.992070198059082, + "learning_rate": 3.0322630397847197e-06, + "loss": 1.2573, + "step": 61666 + }, + { + "epoch": 0.7708692717317933, + "grad_norm": 3.873131513595581, + "learning_rate": 3.031637094424786e-06, + "loss": 0.8463, + "step": 61668 + }, + { + "epoch": 0.770894272356809, + "grad_norm": 2.495286226272583, + "learning_rate": 3.031011202134918e-06, + "loss": 0.819, + "step": 61670 + }, + { + "epoch": 0.7709192729818245, + "grad_norm": 1.6561825275421143, + "learning_rate": 3.030385362919881e-06, + "loss": 0.0617, + "step": 61672 + }, + { + "epoch": 0.7709442736068401, + "grad_norm": 2.6598827838897705, + "learning_rate": 3.0297595767844466e-06, + "loss": 1.5333, + "step": 61674 + }, + { + "epoch": 0.7709692742318558, + "grad_norm": 3.793527364730835, + "learning_rate": 3.029133843733374e-06, + "loss": 0.7573, + "step": 61676 + }, + { + "epoch": 0.7709942748568714, + "grad_norm": 3.86145281791687, + "learning_rate": 3.0285081637714352e-06, + "loss": 1.0347, + "step": 61678 + }, + { + "epoch": 0.7710192754818871, + "grad_norm": 0.0006131810368970037, + "learning_rate": 3.0278825369033895e-06, + "loss": 0.0036, + "step": 61680 + }, + { + "epoch": 0.7710442761069026, + "grad_norm": 0.04600333422422409, + "learning_rate": 3.0272569631340075e-06, + "loss": 0.3522, + "step": 61682 + }, + { + "epoch": 0.7710692767319183, + "grad_norm": 3.2863657474517822, + "learning_rate": 3.02663144246805e-06, + "loss": 0.9647, + "step": 61684 + }, + { + "epoch": 0.7710942773569339, + "grad_norm": 3.1258585453033447, + "learning_rate": 3.026005974910279e-06, + "loss": 0.7711, + "step": 61686 + }, + { + "epoch": 0.7711192779819496, + "grad_norm": 1.150323510169983, + "learning_rate": 3.025380560465463e-06, + "loss": 0.1724, + "step": 61688 + }, + { + "epoch": 0.7711442786069652, + "grad_norm": 0.002755307825282216, + "learning_rate": 3.0247551991383595e-06, + "loss": 0.6612, + "step": 61690 + }, + { + "epoch": 0.7711692792319808, + "grad_norm": 2.44450306892395, + "learning_rate": 3.0241298909337367e-06, + "loss": 0.5274, + "step": 61692 + }, + { + "epoch": 0.7711942798569964, + "grad_norm": 2.3180418014526367, + "learning_rate": 3.0235046358563515e-06, + "loss": 0.4531, + "step": 61694 + }, + { + "epoch": 0.771219280482012, + "grad_norm": 5.101783275604248, + "learning_rate": 3.0228794339109735e-06, + "loss": 0.5743, + "step": 61696 + }, + { + "epoch": 0.7712442811070277, + "grad_norm": 5.238574504852295, + "learning_rate": 3.022254285102354e-06, + "loss": 1.7783, + "step": 61698 + }, + { + "epoch": 0.7712692817320433, + "grad_norm": 3.4597575664520264, + "learning_rate": 3.021629189435259e-06, + "loss": 0.6348, + "step": 61700 + }, + { + "epoch": 0.7712942823570589, + "grad_norm": 0.0005954844527877867, + "learning_rate": 3.0210041469144525e-06, + "loss": 0.6575, + "step": 61702 + }, + { + "epoch": 0.7713192829820745, + "grad_norm": 2.5804872512817383, + "learning_rate": 3.0203791575446884e-06, + "loss": 1.2853, + "step": 61704 + }, + { + "epoch": 0.7713442836070902, + "grad_norm": 0.0013204972492530942, + "learning_rate": 3.0197542213307318e-06, + "loss": 0.7327, + "step": 61706 + }, + { + "epoch": 0.7713692842321058, + "grad_norm": 4.798312187194824, + "learning_rate": 3.019129338277338e-06, + "loss": 1.4841, + "step": 61708 + }, + { + "epoch": 0.7713942848571215, + "grad_norm": 0.0004415105504449457, + "learning_rate": 3.0185045083892708e-06, + "loss": 0.0001, + "step": 61710 + }, + { + "epoch": 0.771419285482137, + "grad_norm": 1.1602659225463867, + "learning_rate": 3.017879731671286e-06, + "loss": 0.5997, + "step": 61712 + }, + { + "epoch": 0.7714442861071527, + "grad_norm": 7.836066722869873, + "learning_rate": 3.017255008128138e-06, + "loss": 1.0177, + "step": 61714 + }, + { + "epoch": 0.7714692867321683, + "grad_norm": 3.047635078430176, + "learning_rate": 3.0166303377645924e-06, + "loss": 0.9491, + "step": 61716 + }, + { + "epoch": 0.771494287357184, + "grad_norm": 5.478667259216309, + "learning_rate": 3.016005720585399e-06, + "loss": 0.9465, + "step": 61718 + }, + { + "epoch": 0.7715192879821996, + "grad_norm": 5.50649881362915, + "learning_rate": 3.0153811565953217e-06, + "loss": 0.9339, + "step": 61720 + }, + { + "epoch": 0.7715442886072151, + "grad_norm": 5.913200378417969, + "learning_rate": 3.014756645799113e-06, + "loss": 1.3856, + "step": 61722 + }, + { + "epoch": 0.7715692892322308, + "grad_norm": 0.0007568667060695589, + "learning_rate": 3.0141321882015273e-06, + "loss": 0.4371, + "step": 61724 + }, + { + "epoch": 0.7715942898572464, + "grad_norm": 3.6126813888549805, + "learning_rate": 3.0135077838073266e-06, + "loss": 0.6105, + "step": 61726 + }, + { + "epoch": 0.7716192904822621, + "grad_norm": 3.2463858127593994, + "learning_rate": 3.0128834326212597e-06, + "loss": 0.6689, + "step": 61728 + }, + { + "epoch": 0.7716442911072777, + "grad_norm": 0.0005384223186410964, + "learning_rate": 3.0122591346480866e-06, + "loss": 1.1881, + "step": 61730 + }, + { + "epoch": 0.7716692917322933, + "grad_norm": 1.6992820501327515, + "learning_rate": 3.0116348898925575e-06, + "loss": 1.1207, + "step": 61732 + }, + { + "epoch": 0.7716942923573089, + "grad_norm": 3.5894548892974854, + "learning_rate": 3.0110106983594313e-06, + "loss": 0.6551, + "step": 61734 + }, + { + "epoch": 0.7717192929823246, + "grad_norm": 3.224883556365967, + "learning_rate": 3.01038656005346e-06, + "loss": 0.2837, + "step": 61736 + }, + { + "epoch": 0.7717442936073402, + "grad_norm": 5.532029628753662, + "learning_rate": 3.009762474979393e-06, + "loss": 1.9189, + "step": 61738 + }, + { + "epoch": 0.7717692942323559, + "grad_norm": 5.996877670288086, + "learning_rate": 3.009138443141989e-06, + "loss": 1.289, + "step": 61740 + }, + { + "epoch": 0.7717942948573714, + "grad_norm": 2.3694212436676025, + "learning_rate": 3.0085144645459954e-06, + "loss": 1.0091, + "step": 61742 + }, + { + "epoch": 0.771819295482387, + "grad_norm": 2.3756277561187744, + "learning_rate": 3.00789053919617e-06, + "loss": 0.4494, + "step": 61744 + }, + { + "epoch": 0.7718442961074027, + "grad_norm": 2.706105947494507, + "learning_rate": 3.0072666670972583e-06, + "loss": 0.7333, + "step": 61746 + }, + { + "epoch": 0.7718692967324183, + "grad_norm": 7.9528350830078125, + "learning_rate": 3.0066428482540177e-06, + "loss": 1.935, + "step": 61748 + }, + { + "epoch": 0.771894297357434, + "grad_norm": 5.404927730560303, + "learning_rate": 3.0060190826711966e-06, + "loss": 1.4025, + "step": 61750 + }, + { + "epoch": 0.7719192979824495, + "grad_norm": 2.9727768898010254, + "learning_rate": 3.0053953703535422e-06, + "loss": 2.2315, + "step": 61752 + }, + { + "epoch": 0.7719442986074652, + "grad_norm": 5.537830829620361, + "learning_rate": 3.0047717113058094e-06, + "loss": 1.3515, + "step": 61754 + }, + { + "epoch": 0.7719692992324808, + "grad_norm": 2.9936585426330566, + "learning_rate": 3.0041481055327436e-06, + "loss": 0.9628, + "step": 61756 + }, + { + "epoch": 0.7719942998574965, + "grad_norm": 4.304751873016357, + "learning_rate": 3.0035245530391e-06, + "loss": 0.2734, + "step": 61758 + }, + { + "epoch": 0.7720193004825121, + "grad_norm": 4.6571946144104, + "learning_rate": 3.002901053829621e-06, + "loss": 0.6771, + "step": 61760 + }, + { + "epoch": 0.7720443011075276, + "grad_norm": 4.992539882659912, + "learning_rate": 3.002277607909061e-06, + "loss": 1.2805, + "step": 61762 + }, + { + "epoch": 0.7720693017325433, + "grad_norm": 0.3453105390071869, + "learning_rate": 3.001654215282165e-06, + "loss": 0.3945, + "step": 61764 + }, + { + "epoch": 0.7720943023575589, + "grad_norm": 4.856956481933594, + "learning_rate": 3.001030875953678e-06, + "loss": 1.7474, + "step": 61766 + }, + { + "epoch": 0.7721193029825746, + "grad_norm": 7.3556365966796875, + "learning_rate": 3.0004075899283523e-06, + "loss": 1.6279, + "step": 61768 + }, + { + "epoch": 0.7721443036075902, + "grad_norm": 5.309106349945068, + "learning_rate": 2.999784357210931e-06, + "loss": 2.6978, + "step": 61770 + }, + { + "epoch": 0.7721693042326058, + "grad_norm": 0.000939944526180625, + "learning_rate": 2.9991611778061646e-06, + "loss": 0.3219, + "step": 61772 + }, + { + "epoch": 0.7721943048576214, + "grad_norm": 0.0006065445486456156, + "learning_rate": 2.9985380517187966e-06, + "loss": 0.0446, + "step": 61774 + }, + { + "epoch": 0.7722193054826371, + "grad_norm": 1.904796838760376, + "learning_rate": 2.997914978953569e-06, + "loss": 0.3689, + "step": 61776 + }, + { + "epoch": 0.7722443061076527, + "grad_norm": 0.9965678453445435, + "learning_rate": 2.997291959515235e-06, + "loss": 0.1054, + "step": 61778 + }, + { + "epoch": 0.7722693067326684, + "grad_norm": 5.251357555389404, + "learning_rate": 2.9966689934085313e-06, + "loss": 0.8793, + "step": 61780 + }, + { + "epoch": 0.7722943073576839, + "grad_norm": 3.574906826019287, + "learning_rate": 2.9960460806382085e-06, + "loss": 0.7813, + "step": 61782 + }, + { + "epoch": 0.7723193079826995, + "grad_norm": 5.66261100769043, + "learning_rate": 2.9954232212090062e-06, + "loss": 0.8373, + "step": 61784 + }, + { + "epoch": 0.7723443086077152, + "grad_norm": 4.544052600860596, + "learning_rate": 2.994800415125674e-06, + "loss": 1.8881, + "step": 61786 + }, + { + "epoch": 0.7723693092327308, + "grad_norm": 2.418520450592041, + "learning_rate": 2.9941776623929497e-06, + "loss": 1.4698, + "step": 61788 + }, + { + "epoch": 0.7723943098577465, + "grad_norm": 2.494657039642334, + "learning_rate": 2.9935549630155746e-06, + "loss": 0.932, + "step": 61790 + }, + { + "epoch": 0.772419310482762, + "grad_norm": 3.4218711853027344, + "learning_rate": 2.9929323169982983e-06, + "loss": 1.3912, + "step": 61792 + }, + { + "epoch": 0.7724443111077777, + "grad_norm": 6.133209228515625, + "learning_rate": 2.9923097243458544e-06, + "loss": 1.1863, + "step": 61794 + }, + { + "epoch": 0.7724693117327933, + "grad_norm": 3.0784926414489746, + "learning_rate": 2.9916871850629927e-06, + "loss": 2.3198, + "step": 61796 + }, + { + "epoch": 0.772494312357809, + "grad_norm": 1.7122355699539185, + "learning_rate": 2.9910646991544467e-06, + "loss": 0.0434, + "step": 61798 + }, + { + "epoch": 0.7725193129828246, + "grad_norm": 2.3319733142852783, + "learning_rate": 2.990442266624964e-06, + "loss": 0.1062, + "step": 61800 + }, + { + "epoch": 0.7725443136078401, + "grad_norm": 3.294342041015625, + "learning_rate": 2.9898198874792817e-06, + "loss": 0.4785, + "step": 61802 + }, + { + "epoch": 0.7725693142328558, + "grad_norm": 3.3558316230773926, + "learning_rate": 2.9891975617221367e-06, + "loss": 1.163, + "step": 61804 + }, + { + "epoch": 0.7725943148578714, + "grad_norm": 0.8991928100585938, + "learning_rate": 2.988575289358275e-06, + "loss": 0.6, + "step": 61806 + }, + { + "epoch": 0.7726193154828871, + "grad_norm": 0.003432221245020628, + "learning_rate": 2.9879530703924285e-06, + "loss": 0.1511, + "step": 61808 + }, + { + "epoch": 0.7726443161079027, + "grad_norm": 1.5519952774047852, + "learning_rate": 2.987330904829344e-06, + "loss": 0.923, + "step": 61810 + }, + { + "epoch": 0.7726693167329183, + "grad_norm": 2.047455310821533, + "learning_rate": 2.9867087926737527e-06, + "loss": 0.5451, + "step": 61812 + }, + { + "epoch": 0.7726943173579339, + "grad_norm": 0.03941889852285385, + "learning_rate": 2.9860867339303976e-06, + "loss": 0.0007, + "step": 61814 + }, + { + "epoch": 0.7727193179829496, + "grad_norm": 6.721909999847412, + "learning_rate": 2.985464728604014e-06, + "loss": 1.211, + "step": 61816 + }, + { + "epoch": 0.7727443186079652, + "grad_norm": 3.22831392288208, + "learning_rate": 2.984842776699337e-06, + "loss": 0.1668, + "step": 61818 + }, + { + "epoch": 0.7727693192329809, + "grad_norm": 0.0026115330401808023, + "learning_rate": 2.984220878221108e-06, + "loss": 0.377, + "step": 61820 + }, + { + "epoch": 0.7727943198579964, + "grad_norm": 2.244391918182373, + "learning_rate": 2.983599033174057e-06, + "loss": 0.9953, + "step": 61822 + }, + { + "epoch": 0.772819320483012, + "grad_norm": 1.962173581123352, + "learning_rate": 2.9829772415629267e-06, + "loss": 0.3503, + "step": 61824 + }, + { + "epoch": 0.7728443211080277, + "grad_norm": 0.0004082100640516728, + "learning_rate": 2.98235550339245e-06, + "loss": 0.7794, + "step": 61826 + }, + { + "epoch": 0.7728693217330433, + "grad_norm": 5.123106956481934, + "learning_rate": 2.9817338186673585e-06, + "loss": 1.5982, + "step": 61828 + }, + { + "epoch": 0.772894322358059, + "grad_norm": 4.001110553741455, + "learning_rate": 2.9811121873923922e-06, + "loss": 1.0004, + "step": 61830 + }, + { + "epoch": 0.7729193229830745, + "grad_norm": 6.075371265411377, + "learning_rate": 2.9804906095722807e-06, + "loss": 1.4976, + "step": 61832 + }, + { + "epoch": 0.7729443236080902, + "grad_norm": 0.3570877015590668, + "learning_rate": 2.9798690852117617e-06, + "loss": 1.29, + "step": 61834 + }, + { + "epoch": 0.7729693242331058, + "grad_norm": 3.2151362895965576, + "learning_rate": 2.979247614315566e-06, + "loss": 1.8764, + "step": 61836 + }, + { + "epoch": 0.7729943248581215, + "grad_norm": 4.4410271644592285, + "learning_rate": 2.978626196888429e-06, + "loss": 1.2667, + "step": 61838 + }, + { + "epoch": 0.7730193254831371, + "grad_norm": 0.0003652128216344863, + "learning_rate": 2.978004832935082e-06, + "loss": 1.0944, + "step": 61840 + }, + { + "epoch": 0.7730443261081527, + "grad_norm": 1.7290139198303223, + "learning_rate": 2.9773835224602555e-06, + "loss": 0.401, + "step": 61842 + }, + { + "epoch": 0.7730693267331683, + "grad_norm": 6.085134029388428, + "learning_rate": 2.9767622654686843e-06, + "loss": 1.7619, + "step": 61844 + }, + { + "epoch": 0.773094327358184, + "grad_norm": 1.1203457117080688, + "learning_rate": 2.9761410619650966e-06, + "loss": 0.4492, + "step": 61846 + }, + { + "epoch": 0.7731193279831996, + "grad_norm": 6.312357425689697, + "learning_rate": 2.9755199119542276e-06, + "loss": 0.3092, + "step": 61848 + }, + { + "epoch": 0.7731443286082152, + "grad_norm": 5.811513423919678, + "learning_rate": 2.9748988154408033e-06, + "loss": 1.6836, + "step": 61850 + }, + { + "epoch": 0.7731693292332308, + "grad_norm": 5.975757122039795, + "learning_rate": 2.974277772429559e-06, + "loss": 0.7197, + "step": 61852 + }, + { + "epoch": 0.7731943298582464, + "grad_norm": 2.508779287338257, + "learning_rate": 2.9736567829252214e-06, + "loss": 0.9504, + "step": 61854 + }, + { + "epoch": 0.7732193304832621, + "grad_norm": 0.0009248738642781973, + "learning_rate": 2.9730358469325184e-06, + "loss": 0.5699, + "step": 61856 + }, + { + "epoch": 0.7732443311082777, + "grad_norm": 2.487067937850952, + "learning_rate": 2.9724149644561816e-06, + "loss": 1.3442, + "step": 61858 + }, + { + "epoch": 0.7732693317332934, + "grad_norm": 4.516107559204102, + "learning_rate": 2.971794135500937e-06, + "loss": 1.1852, + "step": 61860 + }, + { + "epoch": 0.7732943323583089, + "grad_norm": 0.0007181546534411609, + "learning_rate": 2.9711733600715175e-06, + "loss": 0.6807, + "step": 61862 + }, + { + "epoch": 0.7733193329833246, + "grad_norm": 3.010660171508789, + "learning_rate": 2.9705526381726447e-06, + "loss": 1.3099, + "step": 61864 + }, + { + "epoch": 0.7733443336083402, + "grad_norm": 5.920000076293945, + "learning_rate": 2.969931969809051e-06, + "loss": 1.5416, + "step": 61866 + }, + { + "epoch": 0.7733693342333559, + "grad_norm": 7.814838886260986, + "learning_rate": 2.9693113549854623e-06, + "loss": 1.0993, + "step": 61868 + }, + { + "epoch": 0.7733943348583715, + "grad_norm": 4.71687126159668, + "learning_rate": 2.9686907937066e-06, + "loss": 1.5381, + "step": 61870 + }, + { + "epoch": 0.773419335483387, + "grad_norm": 4.786325931549072, + "learning_rate": 2.9680702859771983e-06, + "loss": 1.0518, + "step": 61872 + }, + { + "epoch": 0.7734443361084027, + "grad_norm": 0.000421212607761845, + "learning_rate": 2.9674498318019752e-06, + "loss": 0.244, + "step": 61874 + }, + { + "epoch": 0.7734693367334183, + "grad_norm": 3.6065218448638916, + "learning_rate": 2.9668294311856627e-06, + "loss": 0.556, + "step": 61876 + }, + { + "epoch": 0.773494337358434, + "grad_norm": 2.3409769535064697, + "learning_rate": 2.9662090841329837e-06, + "loss": 1.1771, + "step": 61878 + }, + { + "epoch": 0.7735193379834496, + "grad_norm": 6.53112268447876, + "learning_rate": 2.965588790648658e-06, + "loss": 1.6597, + "step": 61880 + }, + { + "epoch": 0.7735443386084652, + "grad_norm": 3.142910957336426, + "learning_rate": 2.964968550737416e-06, + "loss": 1.2539, + "step": 61882 + }, + { + "epoch": 0.7735693392334808, + "grad_norm": 4.0027570724487305, + "learning_rate": 2.964348364403976e-06, + "loss": 1.6166, + "step": 61884 + }, + { + "epoch": 0.7735943398584965, + "grad_norm": 0.0005344548262655735, + "learning_rate": 2.9637282316530668e-06, + "loss": 0.3316, + "step": 61886 + }, + { + "epoch": 0.7736193404835121, + "grad_norm": 0.0027748537249863148, + "learning_rate": 2.9631081524894055e-06, + "loss": 0.5403, + "step": 61888 + }, + { + "epoch": 0.7736443411085278, + "grad_norm": 5.800195693969727, + "learning_rate": 2.96248812691772e-06, + "loss": 2.2745, + "step": 61890 + }, + { + "epoch": 0.7736693417335433, + "grad_norm": 4.876147270202637, + "learning_rate": 2.9618681549427297e-06, + "loss": 0.7514, + "step": 61892 + }, + { + "epoch": 0.7736943423585589, + "grad_norm": 3.3541271686553955, + "learning_rate": 2.9612482365691542e-06, + "loss": 1.0683, + "step": 61894 + }, + { + "epoch": 0.7737193429835746, + "grad_norm": 0.0025648283772170544, + "learning_rate": 2.9606283718017194e-06, + "loss": 0.8145, + "step": 61896 + }, + { + "epoch": 0.7737443436085902, + "grad_norm": 4.100259780883789, + "learning_rate": 2.9600085606451403e-06, + "loss": 1.0937, + "step": 61898 + }, + { + "epoch": 0.7737693442336059, + "grad_norm": 1.554423213005066, + "learning_rate": 2.959388803104143e-06, + "loss": 0.1996, + "step": 61900 + }, + { + "epoch": 0.7737943448586214, + "grad_norm": 0.2939918041229248, + "learning_rate": 2.9587690991834417e-06, + "loss": 0.5017, + "step": 61902 + }, + { + "epoch": 0.7738193454836371, + "grad_norm": 3.612316608428955, + "learning_rate": 2.958149448887763e-06, + "loss": 0.7427, + "step": 61904 + }, + { + "epoch": 0.7738443461086527, + "grad_norm": 2.3131792545318604, + "learning_rate": 2.957529852221822e-06, + "loss": 0.3372, + "step": 61906 + }, + { + "epoch": 0.7738693467336684, + "grad_norm": 1.7187591791152954, + "learning_rate": 2.956910309190335e-06, + "loss": 0.5264, + "step": 61908 + }, + { + "epoch": 0.773894347358684, + "grad_norm": 2.0454511642456055, + "learning_rate": 2.956290819798027e-06, + "loss": 0.0739, + "step": 61910 + }, + { + "epoch": 0.7739193479836995, + "grad_norm": 3.982783555984497, + "learning_rate": 2.9556713840496076e-06, + "loss": 1.5, + "step": 61912 + }, + { + "epoch": 0.7739443486087152, + "grad_norm": 4.662473678588867, + "learning_rate": 2.955052001949802e-06, + "loss": 1.1912, + "step": 61914 + }, + { + "epoch": 0.7739693492337308, + "grad_norm": 4.907127380371094, + "learning_rate": 2.9544326735033212e-06, + "loss": 0.9887, + "step": 61916 + }, + { + "epoch": 0.7739943498587465, + "grad_norm": 2.8803744316101074, + "learning_rate": 2.9538133987148874e-06, + "loss": 0.427, + "step": 61918 + }, + { + "epoch": 0.7740193504837621, + "grad_norm": 3.098271131515503, + "learning_rate": 2.953194177589216e-06, + "loss": 0.5215, + "step": 61920 + }, + { + "epoch": 0.7740443511087777, + "grad_norm": 4.219414234161377, + "learning_rate": 2.952575010131017e-06, + "loss": 1.0291, + "step": 61922 + }, + { + "epoch": 0.7740693517337933, + "grad_norm": 1.355936050415039, + "learning_rate": 2.951955896345012e-06, + "loss": 0.0818, + "step": 61924 + }, + { + "epoch": 0.774094352358809, + "grad_norm": 1.4332776069641113, + "learning_rate": 2.951336836235913e-06, + "loss": 0.1204, + "step": 61926 + }, + { + "epoch": 0.7741193529838246, + "grad_norm": 4.896000385284424, + "learning_rate": 2.950717829808435e-06, + "loss": 1.539, + "step": 61928 + }, + { + "epoch": 0.7741443536088403, + "grad_norm": 2.5469114780426025, + "learning_rate": 2.9500988770673e-06, + "loss": 1.989, + "step": 61930 + }, + { + "epoch": 0.7741693542338558, + "grad_norm": 1.2669378519058228, + "learning_rate": 2.9494799780172078e-06, + "loss": 0.0478, + "step": 61932 + }, + { + "epoch": 0.7741943548588714, + "grad_norm": 1.474564552307129, + "learning_rate": 2.9488611326628836e-06, + "loss": 0.0538, + "step": 61934 + }, + { + "epoch": 0.7742193554838871, + "grad_norm": 0.0009794571669772267, + "learning_rate": 2.948242341009032e-06, + "loss": 0.5415, + "step": 61936 + }, + { + "epoch": 0.7742443561089027, + "grad_norm": 2.1856882572174072, + "learning_rate": 2.9476236030603724e-06, + "loss": 0.849, + "step": 61938 + }, + { + "epoch": 0.7742693567339184, + "grad_norm": 3.0658864974975586, + "learning_rate": 2.947004918821611e-06, + "loss": 0.6024, + "step": 61940 + }, + { + "epoch": 0.7742943573589339, + "grad_norm": 3.3522775173187256, + "learning_rate": 2.9463862882974637e-06, + "loss": 0.3807, + "step": 61942 + }, + { + "epoch": 0.7743193579839496, + "grad_norm": 4.100788593292236, + "learning_rate": 2.9457677114926463e-06, + "loss": 0.3914, + "step": 61944 + }, + { + "epoch": 0.7743443586089652, + "grad_norm": 3.8446991443634033, + "learning_rate": 2.9451491884118576e-06, + "loss": 1.0077, + "step": 61946 + }, + { + "epoch": 0.7743693592339809, + "grad_norm": 5.842966079711914, + "learning_rate": 2.9445307190598192e-06, + "loss": 1.2767, + "step": 61948 + }, + { + "epoch": 0.7743943598589965, + "grad_norm": 0.38296979665756226, + "learning_rate": 2.9439123034412344e-06, + "loss": 0.3466, + "step": 61950 + }, + { + "epoch": 0.774419360484012, + "grad_norm": 3.2511959075927734, + "learning_rate": 2.9432939415608173e-06, + "loss": 0.914, + "step": 61952 + }, + { + "epoch": 0.7744443611090277, + "grad_norm": 3.3661577701568604, + "learning_rate": 2.942675633423273e-06, + "loss": 1.8031, + "step": 61954 + }, + { + "epoch": 0.7744693617340433, + "grad_norm": 3.1792564392089844, + "learning_rate": 2.942057379033314e-06, + "loss": 0.5695, + "step": 61956 + }, + { + "epoch": 0.774494362359059, + "grad_norm": 4.309042930603027, + "learning_rate": 2.941439178395654e-06, + "loss": 1.6391, + "step": 61958 + }, + { + "epoch": 0.7745193629840746, + "grad_norm": 2.7069315910339355, + "learning_rate": 2.9408210315149876e-06, + "loss": 0.3467, + "step": 61960 + }, + { + "epoch": 0.7745443636090902, + "grad_norm": 7.2148613929748535, + "learning_rate": 2.940202938396034e-06, + "loss": 0.7542, + "step": 61962 + }, + { + "epoch": 0.7745693642341058, + "grad_norm": 4.315525054931641, + "learning_rate": 2.939584899043494e-06, + "loss": 1.2806, + "step": 61964 + }, + { + "epoch": 0.7745943648591215, + "grad_norm": 2.3479461669921875, + "learning_rate": 2.938966913462076e-06, + "loss": 0.1239, + "step": 61966 + }, + { + "epoch": 0.7746193654841371, + "grad_norm": 1.6287283897399902, + "learning_rate": 2.9383489816564904e-06, + "loss": 0.5889, + "step": 61968 + }, + { + "epoch": 0.7746443661091528, + "grad_norm": 3.218597173690796, + "learning_rate": 2.9377311036314403e-06, + "loss": 0.5503, + "step": 61970 + }, + { + "epoch": 0.7746693667341683, + "grad_norm": 0.00045448727905750275, + "learning_rate": 2.9371132793916323e-06, + "loss": 0.159, + "step": 61972 + }, + { + "epoch": 0.774694367359184, + "grad_norm": 2.2087767124176025, + "learning_rate": 2.9364955089417668e-06, + "loss": 1.5881, + "step": 61974 + }, + { + "epoch": 0.7747193679841996, + "grad_norm": 4.769234657287598, + "learning_rate": 2.935877792286557e-06, + "loss": 0.5682, + "step": 61976 + }, + { + "epoch": 0.7747443686092153, + "grad_norm": 2.586798906326294, + "learning_rate": 2.935260129430699e-06, + "loss": 1.4513, + "step": 61978 + }, + { + "epoch": 0.7747693692342309, + "grad_norm": 0.899158775806427, + "learning_rate": 2.934642520378901e-06, + "loss": 0.0187, + "step": 61980 + }, + { + "epoch": 0.7747943698592464, + "grad_norm": 4.082297325134277, + "learning_rate": 2.9340249651358688e-06, + "loss": 1.6108, + "step": 61982 + }, + { + "epoch": 0.7748193704842621, + "grad_norm": 3.0428686141967773, + "learning_rate": 2.9334074637063037e-06, + "loss": 1.4327, + "step": 61984 + }, + { + "epoch": 0.7748443711092777, + "grad_norm": 2.905447244644165, + "learning_rate": 2.9327900160949087e-06, + "loss": 0.7763, + "step": 61986 + }, + { + "epoch": 0.7748693717342934, + "grad_norm": 4.255249977111816, + "learning_rate": 2.9321726223063816e-06, + "loss": 0.4061, + "step": 61988 + }, + { + "epoch": 0.774894372359309, + "grad_norm": 0.3569040894508362, + "learning_rate": 2.9315552823454287e-06, + "loss": 0.689, + "step": 61990 + }, + { + "epoch": 0.7749193729843246, + "grad_norm": 3.259819507598877, + "learning_rate": 2.930937996216754e-06, + "loss": 0.8047, + "step": 61992 + }, + { + "epoch": 0.7749443736093402, + "grad_norm": 0.16243131458759308, + "learning_rate": 2.930320763925053e-06, + "loss": 0.357, + "step": 61994 + }, + { + "epoch": 0.7749693742343559, + "grad_norm": 0.06768417358398438, + "learning_rate": 2.9297035854750354e-06, + "loss": 0.0011, + "step": 61996 + }, + { + "epoch": 0.7749943748593715, + "grad_norm": 2.3960912227630615, + "learning_rate": 2.92908646087139e-06, + "loss": 1.5257, + "step": 61998 + }, + { + "epoch": 0.7750193754843872, + "grad_norm": 0.00038627925096079707, + "learning_rate": 2.9284693901188245e-06, + "loss": 0.0, + "step": 62000 + }, + { + "epoch": 0.7750443761094027, + "grad_norm": 1.4406477212905884, + "learning_rate": 2.927852373222033e-06, + "loss": 0.9797, + "step": 62002 + }, + { + "epoch": 0.7750693767344183, + "grad_norm": 4.762993335723877, + "learning_rate": 2.927235410185718e-06, + "loss": 1.6028, + "step": 62004 + }, + { + "epoch": 0.775094377359434, + "grad_norm": 4.467766284942627, + "learning_rate": 2.926618501014581e-06, + "loss": 1.0137, + "step": 62006 + }, + { + "epoch": 0.7751193779844496, + "grad_norm": 2.3341708183288574, + "learning_rate": 2.9260016457133144e-06, + "loss": 0.8565, + "step": 62008 + }, + { + "epoch": 0.7751443786094653, + "grad_norm": 4.05273962020874, + "learning_rate": 2.925384844286625e-06, + "loss": 1.5092, + "step": 62010 + }, + { + "epoch": 0.7751693792344808, + "grad_norm": 2.292024850845337, + "learning_rate": 2.924768096739198e-06, + "loss": 1.1323, + "step": 62012 + }, + { + "epoch": 0.7751943798594965, + "grad_norm": 5.031254768371582, + "learning_rate": 2.9241514030757356e-06, + "loss": 2.0127, + "step": 62014 + }, + { + "epoch": 0.7752193804845121, + "grad_norm": 3.384760856628418, + "learning_rate": 2.9235347633009403e-06, + "loss": 0.7608, + "step": 62016 + }, + { + "epoch": 0.7752443811095278, + "grad_norm": 4.369751930236816, + "learning_rate": 2.922918177419499e-06, + "loss": 0.532, + "step": 62018 + }, + { + "epoch": 0.7752693817345434, + "grad_norm": 11.314619064331055, + "learning_rate": 2.9223016454361164e-06, + "loss": 2.4891, + "step": 62020 + }, + { + "epoch": 0.7752943823595589, + "grad_norm": 6.937460899353027, + "learning_rate": 2.9216851673554824e-06, + "loss": 1.7171, + "step": 62022 + }, + { + "epoch": 0.7753193829845746, + "grad_norm": 4.671358585357666, + "learning_rate": 2.921068743182294e-06, + "loss": 1.7396, + "step": 62024 + }, + { + "epoch": 0.7753443836095902, + "grad_norm": 0.0004987911670468748, + "learning_rate": 2.9204523729212418e-06, + "loss": 0.4978, + "step": 62026 + }, + { + "epoch": 0.7753693842346059, + "grad_norm": 2.4749696254730225, + "learning_rate": 2.9198360565770225e-06, + "loss": 0.9683, + "step": 62028 + }, + { + "epoch": 0.7753943848596215, + "grad_norm": 3.503390073776245, + "learning_rate": 2.9192197941543344e-06, + "loss": 0.5686, + "step": 62030 + }, + { + "epoch": 0.7754193854846371, + "grad_norm": 4.21837854385376, + "learning_rate": 2.9186035856578643e-06, + "loss": 0.8124, + "step": 62032 + }, + { + "epoch": 0.7754443861096527, + "grad_norm": 3.362621307373047, + "learning_rate": 2.9179874310923093e-06, + "loss": 1.1255, + "step": 62034 + }, + { + "epoch": 0.7754693867346684, + "grad_norm": 3.7223496437072754, + "learning_rate": 2.917371330462362e-06, + "loss": 2.0326, + "step": 62036 + }, + { + "epoch": 0.775494387359684, + "grad_norm": 2.9498767852783203, + "learning_rate": 2.9167552837727133e-06, + "loss": 0.3515, + "step": 62038 + }, + { + "epoch": 0.7755193879846997, + "grad_norm": 1.611490249633789, + "learning_rate": 2.9161392910280506e-06, + "loss": 0.1398, + "step": 62040 + }, + { + "epoch": 0.7755443886097152, + "grad_norm": 2.9022743701934814, + "learning_rate": 2.9155233522330695e-06, + "loss": 1.3785, + "step": 62042 + }, + { + "epoch": 0.7755693892347308, + "grad_norm": 2.5557494163513184, + "learning_rate": 2.914907467392465e-06, + "loss": 0.519, + "step": 62044 + }, + { + "epoch": 0.7755943898597465, + "grad_norm": 3.842895746231079, + "learning_rate": 2.9142916365109188e-06, + "loss": 0.7586, + "step": 62046 + }, + { + "epoch": 0.7756193904847621, + "grad_norm": 3.266159772872925, + "learning_rate": 2.9136758595931326e-06, + "loss": 0.5314, + "step": 62048 + }, + { + "epoch": 0.7756443911097778, + "grad_norm": 0.0003911909880116582, + "learning_rate": 2.913060136643782e-06, + "loss": 0.007, + "step": 62050 + }, + { + "epoch": 0.7756693917347933, + "grad_norm": 3.4534924030303955, + "learning_rate": 2.9124444676675635e-06, + "loss": 0.6499, + "step": 62052 + }, + { + "epoch": 0.775694392359809, + "grad_norm": 4.990004062652588, + "learning_rate": 2.9118288526691686e-06, + "loss": 2.2556, + "step": 62054 + }, + { + "epoch": 0.7757193929848246, + "grad_norm": 3.3298864364624023, + "learning_rate": 2.91121329165328e-06, + "loss": 1.2126, + "step": 62056 + }, + { + "epoch": 0.7757443936098403, + "grad_norm": 2.1887011528015137, + "learning_rate": 2.9105977846245912e-06, + "loss": 1.2509, + "step": 62058 + }, + { + "epoch": 0.7757693942348559, + "grad_norm": 2.8777098655700684, + "learning_rate": 2.909982331587785e-06, + "loss": 1.1659, + "step": 62060 + }, + { + "epoch": 0.7757943948598715, + "grad_norm": 3.2846524715423584, + "learning_rate": 2.909366932547556e-06, + "loss": 0.7869, + "step": 62062 + }, + { + "epoch": 0.7758193954848871, + "grad_norm": 2.702471971511841, + "learning_rate": 2.9087515875085804e-06, + "loss": 0.6607, + "step": 62064 + }, + { + "epoch": 0.7758443961099027, + "grad_norm": 4.557599067687988, + "learning_rate": 2.908136296475549e-06, + "loss": 1.4064, + "step": 62066 + }, + { + "epoch": 0.7758693967349184, + "grad_norm": 5.548174858093262, + "learning_rate": 2.9075210594531524e-06, + "loss": 1.2178, + "step": 62068 + }, + { + "epoch": 0.775894397359934, + "grad_norm": 3.3123648166656494, + "learning_rate": 2.906905876446069e-06, + "loss": 0.8471, + "step": 62070 + }, + { + "epoch": 0.7759193979849496, + "grad_norm": 5.378952503204346, + "learning_rate": 2.9062907474589907e-06, + "loss": 1.0787, + "step": 62072 + }, + { + "epoch": 0.7759443986099652, + "grad_norm": 2.7329139709472656, + "learning_rate": 2.9056756724965983e-06, + "loss": 1.2807, + "step": 62074 + }, + { + "epoch": 0.7759693992349809, + "grad_norm": 3.693469524383545, + "learning_rate": 2.9050606515635747e-06, + "loss": 1.382, + "step": 62076 + }, + { + "epoch": 0.7759943998599965, + "grad_norm": 2.929349422454834, + "learning_rate": 2.9044456846646085e-06, + "loss": 0.3981, + "step": 62078 + }, + { + "epoch": 0.7760194004850122, + "grad_norm": 3.2073605060577393, + "learning_rate": 2.9038307718043768e-06, + "loss": 0.7582, + "step": 62080 + }, + { + "epoch": 0.7760444011100277, + "grad_norm": 5.3996052742004395, + "learning_rate": 2.9032159129875702e-06, + "loss": 1.8941, + "step": 62082 + }, + { + "epoch": 0.7760694017350434, + "grad_norm": 0.0003529268142301589, + "learning_rate": 2.902601108218864e-06, + "loss": 1.2002, + "step": 62084 + }, + { + "epoch": 0.776094402360059, + "grad_norm": 4.421680927276611, + "learning_rate": 2.901986357502948e-06, + "loss": 1.8817, + "step": 62086 + }, + { + "epoch": 0.7761194029850746, + "grad_norm": 2.5795130729675293, + "learning_rate": 2.9013716608445e-06, + "loss": 0.2866, + "step": 62088 + }, + { + "epoch": 0.7761444036100903, + "grad_norm": 3.6508445739746094, + "learning_rate": 2.900757018248198e-06, + "loss": 0.5479, + "step": 62090 + }, + { + "epoch": 0.7761694042351058, + "grad_norm": 3.4465062618255615, + "learning_rate": 2.900142429718731e-06, + "loss": 2.0421, + "step": 62092 + }, + { + "epoch": 0.7761944048601215, + "grad_norm": 2.2430689334869385, + "learning_rate": 2.899527895260771e-06, + "loss": 0.9556, + "step": 62094 + }, + { + "epoch": 0.7762194054851371, + "grad_norm": 1.234269380569458, + "learning_rate": 2.8989134148790055e-06, + "loss": 0.2616, + "step": 62096 + }, + { + "epoch": 0.7762444061101528, + "grad_norm": 2.367486000061035, + "learning_rate": 2.898298988578109e-06, + "loss": 1.165, + "step": 62098 + }, + { + "epoch": 0.7762694067351684, + "grad_norm": 2.959244728088379, + "learning_rate": 2.897684616362768e-06, + "loss": 1.2156, + "step": 62100 + }, + { + "epoch": 0.776294407360184, + "grad_norm": 2.445566415786743, + "learning_rate": 2.897070298237651e-06, + "loss": 0.3326, + "step": 62102 + }, + { + "epoch": 0.7763194079851996, + "grad_norm": 5.361914157867432, + "learning_rate": 2.8964560342074423e-06, + "loss": 1.0183, + "step": 62104 + }, + { + "epoch": 0.7763444086102153, + "grad_norm": 2.844869375228882, + "learning_rate": 2.8958418242768226e-06, + "loss": 1.015, + "step": 62106 + }, + { + "epoch": 0.7763694092352309, + "grad_norm": 5.127340793609619, + "learning_rate": 2.8952276684504642e-06, + "loss": 0.8046, + "step": 62108 + }, + { + "epoch": 0.7763944098602465, + "grad_norm": 1.0289779901504517, + "learning_rate": 2.894613566733051e-06, + "loss": 1.1116, + "step": 62110 + }, + { + "epoch": 0.7764194104852621, + "grad_norm": 2.675504684448242, + "learning_rate": 2.8939995191292524e-06, + "loss": 0.4695, + "step": 62112 + }, + { + "epoch": 0.7764444111102777, + "grad_norm": 0.05787481367588043, + "learning_rate": 2.8933855256437514e-06, + "loss": 0.7476, + "step": 62114 + }, + { + "epoch": 0.7764694117352934, + "grad_norm": 3.941762685775757, + "learning_rate": 2.892771586281221e-06, + "loss": 1.1408, + "step": 62116 + }, + { + "epoch": 0.776494412360309, + "grad_norm": 4.065406322479248, + "learning_rate": 2.8921577010463355e-06, + "loss": 2.4316, + "step": 62118 + }, + { + "epoch": 0.7765194129853247, + "grad_norm": 1.2470147609710693, + "learning_rate": 2.891543869943774e-06, + "loss": 0.5316, + "step": 62120 + }, + { + "epoch": 0.7765444136103402, + "grad_norm": 4.4233856201171875, + "learning_rate": 2.8909300929782045e-06, + "loss": 1.1993, + "step": 62122 + }, + { + "epoch": 0.7765694142353559, + "grad_norm": 0.0003497344732750207, + "learning_rate": 2.890316370154311e-06, + "loss": 0.9219, + "step": 62124 + }, + { + "epoch": 0.7765944148603715, + "grad_norm": 0.00027762583340518177, + "learning_rate": 2.889702701476763e-06, + "loss": 0.7423, + "step": 62126 + }, + { + "epoch": 0.7766194154853872, + "grad_norm": 0.004978731274604797, + "learning_rate": 2.8890890869502295e-06, + "loss": 0.74, + "step": 62128 + }, + { + "epoch": 0.7766444161104028, + "grad_norm": 2.3706111907958984, + "learning_rate": 2.8884755265793916e-06, + "loss": 0.3721, + "step": 62130 + }, + { + "epoch": 0.7766694167354183, + "grad_norm": 0.2383573055267334, + "learning_rate": 2.8878620203689157e-06, + "loss": 0.9833, + "step": 62132 + }, + { + "epoch": 0.776694417360434, + "grad_norm": 2.4870283603668213, + "learning_rate": 2.8872485683234786e-06, + "loss": 0.7942, + "step": 62134 + }, + { + "epoch": 0.7767194179854496, + "grad_norm": 3.8380942344665527, + "learning_rate": 2.886635170447749e-06, + "loss": 1.7604, + "step": 62136 + }, + { + "epoch": 0.7767444186104653, + "grad_norm": 3.963115930557251, + "learning_rate": 2.8860218267464022e-06, + "loss": 0.6912, + "step": 62138 + }, + { + "epoch": 0.7767694192354809, + "grad_norm": 3.2789411544799805, + "learning_rate": 2.885408537224107e-06, + "loss": 1.6152, + "step": 62140 + }, + { + "epoch": 0.7767944198604965, + "grad_norm": 2.6676077842712402, + "learning_rate": 2.8847953018855324e-06, + "loss": 1.1635, + "step": 62142 + }, + { + "epoch": 0.7768194204855121, + "grad_norm": 0.033801306039094925, + "learning_rate": 2.8841821207353527e-06, + "loss": 0.9203, + "step": 62144 + }, + { + "epoch": 0.7768444211105278, + "grad_norm": 2.261871814727783, + "learning_rate": 2.883568993778233e-06, + "loss": 0.731, + "step": 62146 + }, + { + "epoch": 0.7768694217355434, + "grad_norm": 4.300609588623047, + "learning_rate": 2.8829559210188484e-06, + "loss": 1.7086, + "step": 62148 + }, + { + "epoch": 0.7768944223605591, + "grad_norm": 2.827005386352539, + "learning_rate": 2.882342902461862e-06, + "loss": 0.5285, + "step": 62150 + }, + { + "epoch": 0.7769194229855746, + "grad_norm": 2.130542278289795, + "learning_rate": 2.881729938111948e-06, + "loss": 0.1446, + "step": 62152 + }, + { + "epoch": 0.7769444236105902, + "grad_norm": 4.742413520812988, + "learning_rate": 2.881117027973773e-06, + "loss": 1.0647, + "step": 62154 + }, + { + "epoch": 0.7769694242356059, + "grad_norm": 2.227489709854126, + "learning_rate": 2.880504172052001e-06, + "loss": 0.3471, + "step": 62156 + }, + { + "epoch": 0.7769944248606215, + "grad_norm": 1.5498418807983398, + "learning_rate": 2.879891370351305e-06, + "loss": 0.7065, + "step": 62158 + }, + { + "epoch": 0.7770194254856372, + "grad_norm": 3.2053093910217285, + "learning_rate": 2.879278622876347e-06, + "loss": 0.9759, + "step": 62160 + }, + { + "epoch": 0.7770444261106527, + "grad_norm": 2.1688878536224365, + "learning_rate": 2.8786659296317985e-06, + "loss": 0.4673, + "step": 62162 + }, + { + "epoch": 0.7770694267356684, + "grad_norm": 4.944073677062988, + "learning_rate": 2.8780532906223204e-06, + "loss": 1.6877, + "step": 62164 + }, + { + "epoch": 0.777094427360684, + "grad_norm": 3.2970900535583496, + "learning_rate": 2.8774407058525843e-06, + "loss": 0.9144, + "step": 62166 + }, + { + "epoch": 0.7771194279856997, + "grad_norm": 3.8135032653808594, + "learning_rate": 2.8768281753272533e-06, + "loss": 1.0867, + "step": 62168 + }, + { + "epoch": 0.7771444286107153, + "grad_norm": 0.00038076486089266837, + "learning_rate": 2.876215699050987e-06, + "loss": 0.6782, + "step": 62170 + }, + { + "epoch": 0.7771694292357308, + "grad_norm": 2.6932857036590576, + "learning_rate": 2.875603277028459e-06, + "loss": 0.4768, + "step": 62172 + }, + { + "epoch": 0.7771944298607465, + "grad_norm": 3.104865550994873, + "learning_rate": 2.8749909092643246e-06, + "loss": 1.2416, + "step": 62174 + }, + { + "epoch": 0.7772194304857621, + "grad_norm": 3.236264705657959, + "learning_rate": 2.8743785957632562e-06, + "loss": 0.8112, + "step": 62176 + }, + { + "epoch": 0.7772444311107778, + "grad_norm": 2.8004109859466553, + "learning_rate": 2.873766336529913e-06, + "loss": 1.0113, + "step": 62178 + }, + { + "epoch": 0.7772694317357934, + "grad_norm": 3.960698127746582, + "learning_rate": 2.8731541315689538e-06, + "loss": 1.7785, + "step": 62180 + }, + { + "epoch": 0.777294432360809, + "grad_norm": 3.6035308837890625, + "learning_rate": 2.8725419808850476e-06, + "loss": 1.3768, + "step": 62182 + }, + { + "epoch": 0.7773194329858246, + "grad_norm": 5.623703956604004, + "learning_rate": 2.8719298844828515e-06, + "loss": 1.5406, + "step": 62184 + }, + { + "epoch": 0.7773444336108403, + "grad_norm": 7.75333309173584, + "learning_rate": 2.8713178423670317e-06, + "loss": 2.3874, + "step": 62186 + }, + { + "epoch": 0.7773694342358559, + "grad_norm": 2.7795608043670654, + "learning_rate": 2.870705854542244e-06, + "loss": 0.1994, + "step": 62188 + }, + { + "epoch": 0.7773944348608716, + "grad_norm": 2.775261163711548, + "learning_rate": 2.8700939210131563e-06, + "loss": 1.6184, + "step": 62190 + }, + { + "epoch": 0.7774194354858871, + "grad_norm": 2.5471134185791016, + "learning_rate": 2.8694820417844247e-06, + "loss": 0.4142, + "step": 62192 + }, + { + "epoch": 0.7774444361109027, + "grad_norm": 2.4987902641296387, + "learning_rate": 2.8688702168607063e-06, + "loss": 0.4908, + "step": 62194 + }, + { + "epoch": 0.7774694367359184, + "grad_norm": 3.4058048725128174, + "learning_rate": 2.8682584462466665e-06, + "loss": 1.4472, + "step": 62196 + }, + { + "epoch": 0.777494437360934, + "grad_norm": 3.346782922744751, + "learning_rate": 2.867646729946959e-06, + "loss": 2.0628, + "step": 62198 + }, + { + "epoch": 0.7775194379859497, + "grad_norm": 2.583763837814331, + "learning_rate": 2.8670350679662495e-06, + "loss": 0.8665, + "step": 62200 + }, + { + "epoch": 0.7775444386109652, + "grad_norm": 3.4735207557678223, + "learning_rate": 2.866423460309188e-06, + "loss": 0.4853, + "step": 62202 + }, + { + "epoch": 0.7775694392359809, + "grad_norm": 2.5726137161254883, + "learning_rate": 2.86581190698044e-06, + "loss": 1.1678, + "step": 62204 + }, + { + "epoch": 0.7775944398609965, + "grad_norm": 4.475280284881592, + "learning_rate": 2.8652004079846597e-06, + "loss": 1.0676, + "step": 62206 + }, + { + "epoch": 0.7776194404860122, + "grad_norm": 4.009522914886475, + "learning_rate": 2.864588963326501e-06, + "loss": 1.4016, + "step": 62208 + }, + { + "epoch": 0.7776444411110278, + "grad_norm": 3.1069202423095703, + "learning_rate": 2.863977573010628e-06, + "loss": 1.3043, + "step": 62210 + }, + { + "epoch": 0.7776694417360434, + "grad_norm": 7.146910190582275, + "learning_rate": 2.8633662370416882e-06, + "loss": 1.2102, + "step": 62212 + }, + { + "epoch": 0.777694442361059, + "grad_norm": 2.901160955429077, + "learning_rate": 2.862754955424345e-06, + "loss": 1.5733, + "step": 62214 + }, + { + "epoch": 0.7777194429860746, + "grad_norm": 0.003303816309198737, + "learning_rate": 2.862143728163248e-06, + "loss": 0.3544, + "step": 62216 + }, + { + "epoch": 0.7777444436110903, + "grad_norm": 5.107573986053467, + "learning_rate": 2.861532555263058e-06, + "loss": 1.6817, + "step": 62218 + }, + { + "epoch": 0.777769444236106, + "grad_norm": 2.9945249557495117, + "learning_rate": 2.8609214367284266e-06, + "loss": 1.2363, + "step": 62220 + }, + { + "epoch": 0.7777944448611215, + "grad_norm": 3.157090425491333, + "learning_rate": 2.8603103725640057e-06, + "loss": 1.6621, + "step": 62222 + }, + { + "epoch": 0.7778194454861371, + "grad_norm": 3.0526275634765625, + "learning_rate": 2.8596993627744527e-06, + "loss": 0.2193, + "step": 62224 + }, + { + "epoch": 0.7778444461111528, + "grad_norm": 4.652505874633789, + "learning_rate": 2.859088407364419e-06, + "loss": 1.8642, + "step": 62226 + }, + { + "epoch": 0.7778694467361684, + "grad_norm": 0.6868504881858826, + "learning_rate": 2.8584775063385597e-06, + "loss": 0.0963, + "step": 62228 + }, + { + "epoch": 0.7778944473611841, + "grad_norm": 2.4164180755615234, + "learning_rate": 2.8578666597015225e-06, + "loss": 1.1864, + "step": 62230 + }, + { + "epoch": 0.7779194479861996, + "grad_norm": 3.3088340759277344, + "learning_rate": 2.8572558674579664e-06, + "loss": 0.5968, + "step": 62232 + }, + { + "epoch": 0.7779444486112153, + "grad_norm": 3.389425754547119, + "learning_rate": 2.8566451296125387e-06, + "loss": 1.7339, + "step": 62234 + }, + { + "epoch": 0.7779694492362309, + "grad_norm": 5.346900463104248, + "learning_rate": 2.8560344461698895e-06, + "loss": 0.5679, + "step": 62236 + }, + { + "epoch": 0.7779944498612466, + "grad_norm": 3.2887685298919678, + "learning_rate": 2.855423817134675e-06, + "loss": 0.4354, + "step": 62238 + }, + { + "epoch": 0.7780194504862622, + "grad_norm": 1.7848113775253296, + "learning_rate": 2.854813242511538e-06, + "loss": 0.3432, + "step": 62240 + }, + { + "epoch": 0.7780444511112777, + "grad_norm": 5.000954627990723, + "learning_rate": 2.8542027223051373e-06, + "loss": 1.9007, + "step": 62242 + }, + { + "epoch": 0.7780694517362934, + "grad_norm": 3.7352352142333984, + "learning_rate": 2.8535922565201167e-06, + "loss": 0.5683, + "step": 62244 + }, + { + "epoch": 0.778094452361309, + "grad_norm": 3.8827033042907715, + "learning_rate": 2.8529818451611245e-06, + "loss": 0.7347, + "step": 62246 + }, + { + "epoch": 0.7781194529863247, + "grad_norm": 1.0966233015060425, + "learning_rate": 2.852371488232816e-06, + "loss": 0.3739, + "step": 62248 + }, + { + "epoch": 0.7781444536113403, + "grad_norm": 5.261806964874268, + "learning_rate": 2.851761185739831e-06, + "loss": 0.8659, + "step": 62250 + }, + { + "epoch": 0.7781694542363559, + "grad_norm": 7.7463154792785645, + "learning_rate": 2.851150937686826e-06, + "loss": 0.4388, + "step": 62252 + }, + { + "epoch": 0.7781944548613715, + "grad_norm": 3.3573408126831055, + "learning_rate": 2.8505407440784404e-06, + "loss": 1.0505, + "step": 62254 + }, + { + "epoch": 0.7782194554863872, + "grad_norm": 3.1464288234710693, + "learning_rate": 2.84993060491933e-06, + "loss": 0.239, + "step": 62256 + }, + { + "epoch": 0.7782444561114028, + "grad_norm": 2.2100439071655273, + "learning_rate": 2.849320520214136e-06, + "loss": 0.4316, + "step": 62258 + }, + { + "epoch": 0.7782694567364185, + "grad_norm": 8.9866361618042, + "learning_rate": 2.8487104899675024e-06, + "loss": 1.622, + "step": 62260 + }, + { + "epoch": 0.778294457361434, + "grad_norm": 2.9172987937927246, + "learning_rate": 2.848100514184082e-06, + "loss": 0.5383, + "step": 62262 + }, + { + "epoch": 0.7783194579864496, + "grad_norm": 8.973244667053223, + "learning_rate": 2.8474905928685137e-06, + "loss": 0.4648, + "step": 62264 + }, + { + "epoch": 0.7783444586114653, + "grad_norm": 2.1200308799743652, + "learning_rate": 2.8468807260254482e-06, + "loss": 0.7953, + "step": 62266 + }, + { + "epoch": 0.7783694592364809, + "grad_norm": 3.6977663040161133, + "learning_rate": 2.8462709136595257e-06, + "loss": 1.4506, + "step": 62268 + }, + { + "epoch": 0.7783944598614966, + "grad_norm": 4.952503681182861, + "learning_rate": 2.8456611557753942e-06, + "loss": 1.5391, + "step": 62270 + }, + { + "epoch": 0.7784194604865121, + "grad_norm": 5.162966728210449, + "learning_rate": 2.8450514523776964e-06, + "loss": 1.5968, + "step": 62272 + }, + { + "epoch": 0.7784444611115278, + "grad_norm": 2.01302433013916, + "learning_rate": 2.844441803471071e-06, + "loss": 0.9707, + "step": 62274 + }, + { + "epoch": 0.7784694617365434, + "grad_norm": 2.5626425743103027, + "learning_rate": 2.8438322090601678e-06, + "loss": 0.3606, + "step": 62276 + }, + { + "epoch": 0.7784944623615591, + "grad_norm": 4.781041145324707, + "learning_rate": 2.8432226691496247e-06, + "loss": 1.5948, + "step": 62278 + }, + { + "epoch": 0.7785194629865747, + "grad_norm": 4.062313079833984, + "learning_rate": 2.8426131837440873e-06, + "loss": 0.9429, + "step": 62280 + }, + { + "epoch": 0.7785444636115902, + "grad_norm": 2.529533863067627, + "learning_rate": 2.8420037528481935e-06, + "loss": 0.5457, + "step": 62282 + }, + { + "epoch": 0.7785694642366059, + "grad_norm": 3.4619898796081543, + "learning_rate": 2.8413943764665906e-06, + "loss": 1.3388, + "step": 62284 + }, + { + "epoch": 0.7785944648616215, + "grad_norm": 0.41663068532943726, + "learning_rate": 2.840785054603915e-06, + "loss": 0.0076, + "step": 62286 + }, + { + "epoch": 0.7786194654866372, + "grad_norm": 1.880312204360962, + "learning_rate": 2.8401757872648052e-06, + "loss": 0.7589, + "step": 62288 + }, + { + "epoch": 0.7786444661116528, + "grad_norm": 1.7931870222091675, + "learning_rate": 2.839566574453907e-06, + "loss": 0.3886, + "step": 62290 + }, + { + "epoch": 0.7786694667366684, + "grad_norm": 2.737727165222168, + "learning_rate": 2.8389574161758537e-06, + "loss": 0.4544, + "step": 62292 + }, + { + "epoch": 0.778694467361684, + "grad_norm": 5.1668782234191895, + "learning_rate": 2.838348312435292e-06, + "loss": 0.7865, + "step": 62294 + }, + { + "epoch": 0.7787194679866997, + "grad_norm": 5.899395942687988, + "learning_rate": 2.837739263236856e-06, + "loss": 1.7127, + "step": 62296 + }, + { + "epoch": 0.7787444686117153, + "grad_norm": 1.829870343208313, + "learning_rate": 2.837130268585182e-06, + "loss": 0.8273, + "step": 62298 + }, + { + "epoch": 0.778769469236731, + "grad_norm": 2.1094448566436768, + "learning_rate": 2.8365213284849147e-06, + "loss": 0.287, + "step": 62300 + }, + { + "epoch": 0.7787944698617465, + "grad_norm": 2.01054048538208, + "learning_rate": 2.835912442940685e-06, + "loss": 0.1403, + "step": 62302 + }, + { + "epoch": 0.7788194704867621, + "grad_norm": 3.0007026195526123, + "learning_rate": 2.8353036119571352e-06, + "loss": 0.732, + "step": 62304 + }, + { + "epoch": 0.7788444711117778, + "grad_norm": 5.027185916900635, + "learning_rate": 2.834694835538897e-06, + "loss": 1.5367, + "step": 62306 + }, + { + "epoch": 0.7788694717367934, + "grad_norm": 3.0496318340301514, + "learning_rate": 2.8340861136906105e-06, + "loss": 0.9661, + "step": 62308 + }, + { + "epoch": 0.7788944723618091, + "grad_norm": 3.340092420578003, + "learning_rate": 2.8334774464169157e-06, + "loss": 0.7206, + "step": 62310 + }, + { + "epoch": 0.7789194729868246, + "grad_norm": 2.992042064666748, + "learning_rate": 2.8328688337224386e-06, + "loss": 1.1918, + "step": 62312 + }, + { + "epoch": 0.7789444736118403, + "grad_norm": 0.0024744111578911543, + "learning_rate": 2.832260275611821e-06, + "loss": 0.0863, + "step": 62314 + }, + { + "epoch": 0.7789694742368559, + "grad_norm": 0.7190389633178711, + "learning_rate": 2.8316517720896918e-06, + "loss": 0.0226, + "step": 62316 + }, + { + "epoch": 0.7789944748618716, + "grad_norm": 4.267020225524902, + "learning_rate": 2.831043323160694e-06, + "loss": 1.8485, + "step": 62318 + }, + { + "epoch": 0.7790194754868872, + "grad_norm": 0.0006945444038137794, + "learning_rate": 2.830434928829452e-06, + "loss": 0.6086, + "step": 62320 + }, + { + "epoch": 0.7790444761119028, + "grad_norm": 0.3244941532611847, + "learning_rate": 2.829826589100607e-06, + "loss": 0.4136, + "step": 62322 + }, + { + "epoch": 0.7790694767369184, + "grad_norm": 2.802521228790283, + "learning_rate": 2.829218303978789e-06, + "loss": 0.873, + "step": 62324 + }, + { + "epoch": 0.779094477361934, + "grad_norm": 5.393187999725342, + "learning_rate": 2.828610073468626e-06, + "loss": 1.5268, + "step": 62326 + }, + { + "epoch": 0.7791194779869497, + "grad_norm": 0.00043762591667473316, + "learning_rate": 2.828001897574759e-06, + "loss": 0.7912, + "step": 62328 + }, + { + "epoch": 0.7791444786119653, + "grad_norm": 3.3672285079956055, + "learning_rate": 2.8273937763018123e-06, + "loss": 0.335, + "step": 62330 + }, + { + "epoch": 0.7791694792369809, + "grad_norm": 10.564209938049316, + "learning_rate": 2.8267857096544205e-06, + "loss": 1.0259, + "step": 62332 + }, + { + "epoch": 0.7791944798619965, + "grad_norm": 19.42474365234375, + "learning_rate": 2.826177697637217e-06, + "loss": 0.9966, + "step": 62334 + }, + { + "epoch": 0.7792194804870122, + "grad_norm": 3.4394824504852295, + "learning_rate": 2.8255697402548286e-06, + "loss": 0.729, + "step": 62336 + }, + { + "epoch": 0.7792444811120278, + "grad_norm": 5.270298004150391, + "learning_rate": 2.8249618375118883e-06, + "loss": 1.2173, + "step": 62338 + }, + { + "epoch": 0.7792694817370435, + "grad_norm": 4.284474849700928, + "learning_rate": 2.82435398941302e-06, + "loss": 0.3573, + "step": 62340 + }, + { + "epoch": 0.779294482362059, + "grad_norm": 1.5825601816177368, + "learning_rate": 2.8237461959628598e-06, + "loss": 0.154, + "step": 62342 + }, + { + "epoch": 0.7793194829870747, + "grad_norm": 1.7897050380706787, + "learning_rate": 2.8231384571660313e-06, + "loss": 0.9711, + "step": 62344 + }, + { + "epoch": 0.7793444836120903, + "grad_norm": 5.015264511108398, + "learning_rate": 2.8225307730271654e-06, + "loss": 0.7104, + "step": 62346 + }, + { + "epoch": 0.779369484237106, + "grad_norm": 3.7129862308502197, + "learning_rate": 2.8219231435508952e-06, + "loss": 1.6511, + "step": 62348 + }, + { + "epoch": 0.7793944848621216, + "grad_norm": 3.8411872386932373, + "learning_rate": 2.8213155687418392e-06, + "loss": 0.2182, + "step": 62350 + }, + { + "epoch": 0.7794194854871371, + "grad_norm": 0.0005214913398958743, + "learning_rate": 2.8207080486046302e-06, + "loss": 0.3912, + "step": 62352 + }, + { + "epoch": 0.7794444861121528, + "grad_norm": 0.0008711077971383929, + "learning_rate": 2.820100583143891e-06, + "loss": 0.0, + "step": 62354 + }, + { + "epoch": 0.7794694867371684, + "grad_norm": 6.769376277923584, + "learning_rate": 2.8194931723642526e-06, + "loss": 0.9131, + "step": 62356 + }, + { + "epoch": 0.7794944873621841, + "grad_norm": 6.03290319442749, + "learning_rate": 2.818885816270337e-06, + "loss": 2.126, + "step": 62358 + }, + { + "epoch": 0.7795194879871997, + "grad_norm": 2.753938913345337, + "learning_rate": 2.81827851486677e-06, + "loss": 0.9021, + "step": 62360 + }, + { + "epoch": 0.7795444886122153, + "grad_norm": 3.6090381145477295, + "learning_rate": 2.817671268158185e-06, + "loss": 1.0049, + "step": 62362 + }, + { + "epoch": 0.7795694892372309, + "grad_norm": 4.199965953826904, + "learning_rate": 2.8170640761491953e-06, + "loss": 1.3171, + "step": 62364 + }, + { + "epoch": 0.7795944898622466, + "grad_norm": 2.2755074501037598, + "learning_rate": 2.8164569388444308e-06, + "loss": 1.1963, + "step": 62366 + }, + { + "epoch": 0.7796194904872622, + "grad_norm": 1.8293994665145874, + "learning_rate": 2.8158498562485125e-06, + "loss": 0.6694, + "step": 62368 + }, + { + "epoch": 0.7796444911122778, + "grad_norm": 1.524648904800415, + "learning_rate": 2.815242828366065e-06, + "loss": 0.0469, + "step": 62370 + }, + { + "epoch": 0.7796694917372934, + "grad_norm": 3.1871817111968994, + "learning_rate": 2.814635855201716e-06, + "loss": 0.672, + "step": 62372 + }, + { + "epoch": 0.779694492362309, + "grad_norm": 4.175292015075684, + "learning_rate": 2.8140289367600838e-06, + "loss": 0.9381, + "step": 62374 + }, + { + "epoch": 0.7797194929873247, + "grad_norm": 3.1815168857574463, + "learning_rate": 2.813422073045792e-06, + "loss": 1.052, + "step": 62376 + }, + { + "epoch": 0.7797444936123403, + "grad_norm": 2.0747530460357666, + "learning_rate": 2.8128152640634566e-06, + "loss": 0.6412, + "step": 62378 + }, + { + "epoch": 0.779769494237356, + "grad_norm": 2.1675503253936768, + "learning_rate": 2.8122085098177076e-06, + "loss": 1.1648, + "step": 62380 + }, + { + "epoch": 0.7797944948623715, + "grad_norm": 2.7783687114715576, + "learning_rate": 2.8116018103131594e-06, + "loss": 0.969, + "step": 62382 + }, + { + "epoch": 0.7798194954873872, + "grad_norm": 6.413761138916016, + "learning_rate": 2.8109951655544356e-06, + "loss": 1.85, + "step": 62384 + }, + { + "epoch": 0.7798444961124028, + "grad_norm": 4.038856029510498, + "learning_rate": 2.810388575546158e-06, + "loss": 2.3493, + "step": 62386 + }, + { + "epoch": 0.7798694967374185, + "grad_norm": 3.1224799156188965, + "learning_rate": 2.8097820402929445e-06, + "loss": 0.9525, + "step": 62388 + }, + { + "epoch": 0.7798944973624341, + "grad_norm": 6.017664909362793, + "learning_rate": 2.8091755597994154e-06, + "loss": 0.686, + "step": 62390 + }, + { + "epoch": 0.7799194979874496, + "grad_norm": 0.4962504506111145, + "learning_rate": 2.8085691340701836e-06, + "loss": 0.9226, + "step": 62392 + }, + { + "epoch": 0.7799444986124653, + "grad_norm": 2.6545615196228027, + "learning_rate": 2.807962763109874e-06, + "loss": 1.2355, + "step": 62394 + }, + { + "epoch": 0.7799694992374809, + "grad_norm": 5.606464862823486, + "learning_rate": 2.807356446923104e-06, + "loss": 0.6086, + "step": 62396 + }, + { + "epoch": 0.7799944998624966, + "grad_norm": 2.7400829792022705, + "learning_rate": 2.8067501855144887e-06, + "loss": 0.359, + "step": 62398 + }, + { + "epoch": 0.7800195004875122, + "grad_norm": 0.0005959481350146234, + "learning_rate": 2.8061439788886525e-06, + "loss": 0.39, + "step": 62400 + }, + { + "epoch": 0.7800445011125278, + "grad_norm": 4.350989818572998, + "learning_rate": 2.8055378270502e-06, + "loss": 1.6518, + "step": 62402 + }, + { + "epoch": 0.7800695017375434, + "grad_norm": 3.6447763442993164, + "learning_rate": 2.804931730003758e-06, + "loss": 1.0296, + "step": 62404 + }, + { + "epoch": 0.7800945023625591, + "grad_norm": 0.14962387084960938, + "learning_rate": 2.804325687753935e-06, + "loss": 0.5543, + "step": 62406 + }, + { + "epoch": 0.7801195029875747, + "grad_norm": 7.572839260101318, + "learning_rate": 2.8037197003053497e-06, + "loss": 0.6376, + "step": 62408 + }, + { + "epoch": 0.7801445036125904, + "grad_norm": 3.505079746246338, + "learning_rate": 2.80311376766262e-06, + "loss": 0.1113, + "step": 62410 + }, + { + "epoch": 0.7801695042376059, + "grad_norm": 0.00021503434982150793, + "learning_rate": 2.802507889830356e-06, + "loss": 0.4846, + "step": 62412 + }, + { + "epoch": 0.7801945048626215, + "grad_norm": 5.988457202911377, + "learning_rate": 2.8019020668131803e-06, + "loss": 0.8907, + "step": 62414 + }, + { + "epoch": 0.7802195054876372, + "grad_norm": 3.359074115753174, + "learning_rate": 2.801296298615693e-06, + "loss": 1.5058, + "step": 62416 + }, + { + "epoch": 0.7802445061126528, + "grad_norm": 5.047893047332764, + "learning_rate": 2.8006905852425205e-06, + "loss": 1.888, + "step": 62418 + }, + { + "epoch": 0.7802695067376685, + "grad_norm": 0.00034217731445096433, + "learning_rate": 2.8000849266982654e-06, + "loss": 0.9323, + "step": 62420 + }, + { + "epoch": 0.780294507362684, + "grad_norm": 2.9873671531677246, + "learning_rate": 2.799479322987546e-06, + "loss": 0.8624, + "step": 62422 + }, + { + "epoch": 0.7803195079876997, + "grad_norm": 1.1500704288482666, + "learning_rate": 2.7988737741149775e-06, + "loss": 0.2036, + "step": 62424 + }, + { + "epoch": 0.7803445086127153, + "grad_norm": 5.237533092498779, + "learning_rate": 2.7982682800851667e-06, + "loss": 0.1521, + "step": 62426 + }, + { + "epoch": 0.780369509237731, + "grad_norm": 2.4267818927764893, + "learning_rate": 2.7976628409027273e-06, + "loss": 0.4115, + "step": 62428 + }, + { + "epoch": 0.7803945098627466, + "grad_norm": 2.1194205284118652, + "learning_rate": 2.7970574565722653e-06, + "loss": 0.7465, + "step": 62430 + }, + { + "epoch": 0.7804195104877621, + "grad_norm": 1.733366847038269, + "learning_rate": 2.796452127098396e-06, + "loss": 0.4106, + "step": 62432 + }, + { + "epoch": 0.7804445111127778, + "grad_norm": 5.05369758605957, + "learning_rate": 2.795846852485732e-06, + "loss": 1.3486, + "step": 62434 + }, + { + "epoch": 0.7804695117377934, + "grad_norm": 4.666787147521973, + "learning_rate": 2.7952416327388755e-06, + "loss": 1.2515, + "step": 62436 + }, + { + "epoch": 0.7804945123628091, + "grad_norm": 5.08780574798584, + "learning_rate": 2.7946364678624427e-06, + "loss": 0.7645, + "step": 62438 + }, + { + "epoch": 0.7805195129878247, + "grad_norm": 1.8315778970718384, + "learning_rate": 2.79403135786104e-06, + "loss": 0.9891, + "step": 62440 + }, + { + "epoch": 0.7805445136128403, + "grad_norm": 2.851440668106079, + "learning_rate": 2.793426302739276e-06, + "loss": 0.5365, + "step": 62442 + }, + { + "epoch": 0.7805695142378559, + "grad_norm": 3.1146838665008545, + "learning_rate": 2.792821302501755e-06, + "loss": 1.2927, + "step": 62444 + }, + { + "epoch": 0.7805945148628716, + "grad_norm": 0.6627799868583679, + "learning_rate": 2.792216357153088e-06, + "loss": 0.9624, + "step": 62446 + }, + { + "epoch": 0.7806195154878872, + "grad_norm": 0.00040382854058407247, + "learning_rate": 2.7916114666978844e-06, + "loss": 0.4325, + "step": 62448 + }, + { + "epoch": 0.7806445161129029, + "grad_norm": 2.721735715866089, + "learning_rate": 2.7910066311407456e-06, + "loss": 0.4537, + "step": 62450 + }, + { + "epoch": 0.7806695167379184, + "grad_norm": 3.752426862716675, + "learning_rate": 2.7904018504862873e-06, + "loss": 1.0874, + "step": 62452 + }, + { + "epoch": 0.780694517362934, + "grad_norm": 9.064699172973633, + "learning_rate": 2.789797124739102e-06, + "loss": 1.4713, + "step": 62454 + }, + { + "epoch": 0.7807195179879497, + "grad_norm": 0.003385102842003107, + "learning_rate": 2.7891924539038027e-06, + "loss": 0.5619, + "step": 62456 + }, + { + "epoch": 0.7807445186129653, + "grad_norm": 3.236938714981079, + "learning_rate": 2.7885878379849974e-06, + "loss": 0.6763, + "step": 62458 + }, + { + "epoch": 0.780769519237981, + "grad_norm": 2.921964168548584, + "learning_rate": 2.787983276987284e-06, + "loss": 0.9689, + "step": 62460 + }, + { + "epoch": 0.7807945198629965, + "grad_norm": 4.370216369628906, + "learning_rate": 2.7873787709152734e-06, + "loss": 0.6967, + "step": 62462 + }, + { + "epoch": 0.7808195204880122, + "grad_norm": 3.0549824237823486, + "learning_rate": 2.786774319773563e-06, + "loss": 1.4433, + "step": 62464 + }, + { + "epoch": 0.7808445211130278, + "grad_norm": 4.8109822273254395, + "learning_rate": 2.786169923566765e-06, + "loss": 2.2802, + "step": 62466 + }, + { + "epoch": 0.7808695217380435, + "grad_norm": 3.6231441497802734, + "learning_rate": 2.785565582299471e-06, + "loss": 0.6717, + "step": 62468 + }, + { + "epoch": 0.7808945223630591, + "grad_norm": 0.007603970356285572, + "learning_rate": 2.7849612959762897e-06, + "loss": 0.0001, + "step": 62470 + }, + { + "epoch": 0.7809195229880747, + "grad_norm": 3.1336777210235596, + "learning_rate": 2.7843570646018246e-06, + "loss": 0.936, + "step": 62472 + }, + { + "epoch": 0.7809445236130903, + "grad_norm": 2.1357884407043457, + "learning_rate": 2.783752888180673e-06, + "loss": 1.1678, + "step": 62474 + }, + { + "epoch": 0.780969524238106, + "grad_norm": 0.3342214822769165, + "learning_rate": 2.783148766717443e-06, + "loss": 0.536, + "step": 62476 + }, + { + "epoch": 0.7809945248631216, + "grad_norm": 2.758737325668335, + "learning_rate": 2.782544700216728e-06, + "loss": 0.1154, + "step": 62478 + }, + { + "epoch": 0.7810195254881372, + "grad_norm": 3.419031858444214, + "learning_rate": 2.7819406886831344e-06, + "loss": 1.5853, + "step": 62480 + }, + { + "epoch": 0.7810445261131528, + "grad_norm": 3.449601411819458, + "learning_rate": 2.78133673212126e-06, + "loss": 1.4895, + "step": 62482 + }, + { + "epoch": 0.7810695267381684, + "grad_norm": 4.616324424743652, + "learning_rate": 2.780732830535702e-06, + "loss": 0.9165, + "step": 62484 + }, + { + "epoch": 0.7810945273631841, + "grad_norm": 3.62703275680542, + "learning_rate": 2.780128983931064e-06, + "loss": 2.1062, + "step": 62486 + }, + { + "epoch": 0.7811195279881997, + "grad_norm": 4.10847806930542, + "learning_rate": 2.7795251923119404e-06, + "loss": 0.5297, + "step": 62488 + }, + { + "epoch": 0.7811445286132154, + "grad_norm": 0.00023311465338338166, + "learning_rate": 2.7789214556829345e-06, + "loss": 0.0, + "step": 62490 + }, + { + "epoch": 0.7811695292382309, + "grad_norm": 2.268738031387329, + "learning_rate": 2.778317774048641e-06, + "loss": 0.0805, + "step": 62492 + }, + { + "epoch": 0.7811945298632466, + "grad_norm": 0.41598454117774963, + "learning_rate": 2.777714147413656e-06, + "loss": 0.4194, + "step": 62494 + }, + { + "epoch": 0.7812195304882622, + "grad_norm": 4.126543045043945, + "learning_rate": 2.7771105757825813e-06, + "loss": 1.4994, + "step": 62496 + }, + { + "epoch": 0.7812445311132779, + "grad_norm": 0.004905971232801676, + "learning_rate": 2.7765070591600087e-06, + "loss": 1.3178, + "step": 62498 + }, + { + "epoch": 0.7812695317382935, + "grad_norm": 0.0049554225988686085, + "learning_rate": 2.775903597550539e-06, + "loss": 0.5673, + "step": 62500 + }, + { + "epoch": 0.781294532363309, + "grad_norm": 3.8144898414611816, + "learning_rate": 2.775300190958763e-06, + "loss": 0.8812, + "step": 62502 + }, + { + "epoch": 0.7813195329883247, + "grad_norm": 0.00020048308942932636, + "learning_rate": 2.7746968393892827e-06, + "loss": 0.0974, + "step": 62504 + }, + { + "epoch": 0.7813445336133403, + "grad_norm": 0.0006015296676196158, + "learning_rate": 2.7740935428466885e-06, + "loss": 1.182, + "step": 62506 + }, + { + "epoch": 0.781369534238356, + "grad_norm": 3.9716343879699707, + "learning_rate": 2.773490301335574e-06, + "loss": 0.3248, + "step": 62508 + }, + { + "epoch": 0.7813945348633716, + "grad_norm": 3.3588647842407227, + "learning_rate": 2.7728871148605387e-06, + "loss": 0.7876, + "step": 62510 + }, + { + "epoch": 0.7814195354883872, + "grad_norm": 2.286391258239746, + "learning_rate": 2.7722839834261683e-06, + "loss": 1.5816, + "step": 62512 + }, + { + "epoch": 0.7814445361134028, + "grad_norm": 3.2181990146636963, + "learning_rate": 2.7716809070370653e-06, + "loss": 0.124, + "step": 62514 + }, + { + "epoch": 0.7814695367384185, + "grad_norm": 3.260404348373413, + "learning_rate": 2.771077885697816e-06, + "loss": 1.1278, + "step": 62516 + }, + { + "epoch": 0.7814945373634341, + "grad_norm": 0.0003569630498532206, + "learning_rate": 2.7704749194130177e-06, + "loss": 0.2075, + "step": 62518 + }, + { + "epoch": 0.7815195379884498, + "grad_norm": 3.4916346073150635, + "learning_rate": 2.76987200818726e-06, + "loss": 1.0479, + "step": 62520 + }, + { + "epoch": 0.7815445386134653, + "grad_norm": 11.226329803466797, + "learning_rate": 2.7692691520251313e-06, + "loss": 1.4436, + "step": 62522 + }, + { + "epoch": 0.7815695392384809, + "grad_norm": 3.818070411682129, + "learning_rate": 2.7686663509312297e-06, + "loss": 1.8952, + "step": 62524 + }, + { + "epoch": 0.7815945398634966, + "grad_norm": 2.631380319595337, + "learning_rate": 2.7680636049101385e-06, + "loss": 0.464, + "step": 62526 + }, + { + "epoch": 0.7816195404885122, + "grad_norm": 2.4183332920074463, + "learning_rate": 2.7674609139664567e-06, + "loss": 0.4775, + "step": 62528 + }, + { + "epoch": 0.7816445411135279, + "grad_norm": 6.328229904174805, + "learning_rate": 2.766858278104766e-06, + "loss": 0.9027, + "step": 62530 + }, + { + "epoch": 0.7816695417385434, + "grad_norm": 0.0001854242873378098, + "learning_rate": 2.766255697329663e-06, + "loss": 0.145, + "step": 62532 + }, + { + "epoch": 0.7816945423635591, + "grad_norm": 3.853320837020874, + "learning_rate": 2.7656531716457336e-06, + "loss": 0.6406, + "step": 62534 + }, + { + "epoch": 0.7817195429885747, + "grad_norm": 4.9359025955200195, + "learning_rate": 2.7650507010575633e-06, + "loss": 1.1124, + "step": 62536 + }, + { + "epoch": 0.7817445436135904, + "grad_norm": 3.557727575302124, + "learning_rate": 2.7644482855697473e-06, + "loss": 0.6716, + "step": 62538 + }, + { + "epoch": 0.781769544238606, + "grad_norm": 2.404505729675293, + "learning_rate": 2.7638459251868666e-06, + "loss": 0.6406, + "step": 62540 + }, + { + "epoch": 0.7817945448636215, + "grad_norm": 0.0004877682658843696, + "learning_rate": 2.763243619913515e-06, + "loss": 1.0148, + "step": 62542 + }, + { + "epoch": 0.7818195454886372, + "grad_norm": 1.943795919418335, + "learning_rate": 2.7626413697542765e-06, + "loss": 0.7578, + "step": 62544 + }, + { + "epoch": 0.7818445461136528, + "grad_norm": 0.000378578610252589, + "learning_rate": 2.7620391747137354e-06, + "loss": 0.6158, + "step": 62546 + }, + { + "epoch": 0.7818695467386685, + "grad_norm": 0.7416337728500366, + "learning_rate": 2.7614370347964835e-06, + "loss": 0.8317, + "step": 62548 + }, + { + "epoch": 0.7818945473636841, + "grad_norm": 3.777695655822754, + "learning_rate": 2.7608349500070997e-06, + "loss": 0.7233, + "step": 62550 + }, + { + "epoch": 0.7819195479886997, + "grad_norm": 1.704457402229309, + "learning_rate": 2.7602329203501764e-06, + "loss": 0.0194, + "step": 62552 + }, + { + "epoch": 0.7819445486137153, + "grad_norm": 6.555458068847656, + "learning_rate": 2.7596309458302928e-06, + "loss": 1.0227, + "step": 62554 + }, + { + "epoch": 0.781969549238731, + "grad_norm": 0.0024679696653038263, + "learning_rate": 2.7590290264520393e-06, + "loss": 0.5491, + "step": 62556 + }, + { + "epoch": 0.7819945498637466, + "grad_norm": 4.319546222686768, + "learning_rate": 2.7584271622199967e-06, + "loss": 0.8245, + "step": 62558 + }, + { + "epoch": 0.7820195504887623, + "grad_norm": 4.468087196350098, + "learning_rate": 2.7578253531387445e-06, + "loss": 1.5155, + "step": 62560 + }, + { + "epoch": 0.7820445511137778, + "grad_norm": 2.0550992488861084, + "learning_rate": 2.757223599212875e-06, + "loss": 1.277, + "step": 62562 + }, + { + "epoch": 0.7820695517387934, + "grad_norm": 4.909121513366699, + "learning_rate": 2.7566219004469617e-06, + "loss": 1.2567, + "step": 62564 + }, + { + "epoch": 0.7820945523638091, + "grad_norm": 3.9166533946990967, + "learning_rate": 2.756020256845596e-06, + "loss": 0.9065, + "step": 62566 + }, + { + "epoch": 0.7821195529888247, + "grad_norm": 4.891870498657227, + "learning_rate": 2.755418668413352e-06, + "loss": 0.8628, + "step": 62568 + }, + { + "epoch": 0.7821445536138404, + "grad_norm": 3.0485429763793945, + "learning_rate": 2.754817135154818e-06, + "loss": 0.6354, + "step": 62570 + }, + { + "epoch": 0.7821695542388559, + "grad_norm": 3.00058650970459, + "learning_rate": 2.7542156570745725e-06, + "loss": 1.2097, + "step": 62572 + }, + { + "epoch": 0.7821945548638716, + "grad_norm": 1.934018611907959, + "learning_rate": 2.7536142341771933e-06, + "loss": 1.567, + "step": 62574 + }, + { + "epoch": 0.7822195554888872, + "grad_norm": 6.119134902954102, + "learning_rate": 2.753012866467265e-06, + "loss": 1.0718, + "step": 62576 + }, + { + "epoch": 0.7822445561139029, + "grad_norm": 3.535271167755127, + "learning_rate": 2.7524115539493647e-06, + "loss": 1.4871, + "step": 62578 + }, + { + "epoch": 0.7822695567389185, + "grad_norm": 0.002576156985014677, + "learning_rate": 2.7518102966280746e-06, + "loss": 0.0001, + "step": 62580 + }, + { + "epoch": 0.782294557363934, + "grad_norm": 0.00045829612645320594, + "learning_rate": 2.7512090945079697e-06, + "loss": 1.5711, + "step": 62582 + }, + { + "epoch": 0.7823195579889497, + "grad_norm": 5.2571001052856445, + "learning_rate": 2.7506079475936343e-06, + "loss": 1.5342, + "step": 62584 + }, + { + "epoch": 0.7823445586139653, + "grad_norm": 15.225520133972168, + "learning_rate": 2.7500068558896433e-06, + "loss": 1.6269, + "step": 62586 + }, + { + "epoch": 0.782369559238981, + "grad_norm": 4.085319995880127, + "learning_rate": 2.7494058194005724e-06, + "loss": 1.1674, + "step": 62588 + }, + { + "epoch": 0.7823945598639966, + "grad_norm": 0.000529891811311245, + "learning_rate": 2.748804838131004e-06, + "loss": 0.6529, + "step": 62590 + }, + { + "epoch": 0.7824195604890122, + "grad_norm": 0.0013957114424556494, + "learning_rate": 2.7482039120855097e-06, + "loss": 1.0882, + "step": 62592 + }, + { + "epoch": 0.7824445611140278, + "grad_norm": 1.1498628854751587, + "learning_rate": 2.7476030412686726e-06, + "loss": 0.4453, + "step": 62594 + }, + { + "epoch": 0.7824695617390435, + "grad_norm": 8.832733154296875, + "learning_rate": 2.7470022256850638e-06, + "loss": 1.5693, + "step": 62596 + }, + { + "epoch": 0.7824945623640591, + "grad_norm": 11.438611030578613, + "learning_rate": 2.746401465339258e-06, + "loss": 1.2128, + "step": 62598 + }, + { + "epoch": 0.7825195629890748, + "grad_norm": 2.7918636798858643, + "learning_rate": 2.7458007602358362e-06, + "loss": 0.4021, + "step": 62600 + }, + { + "epoch": 0.7825445636140903, + "grad_norm": 3.3742735385894775, + "learning_rate": 2.7452001103793668e-06, + "loss": 1.1918, + "step": 62602 + }, + { + "epoch": 0.782569564239106, + "grad_norm": 3.2020161151885986, + "learning_rate": 2.7445995157744308e-06, + "loss": 1.0144, + "step": 62604 + }, + { + "epoch": 0.7825945648641216, + "grad_norm": 1.496304988861084, + "learning_rate": 2.7439989764255947e-06, + "loss": 0.1931, + "step": 62606 + }, + { + "epoch": 0.7826195654891372, + "grad_norm": 2.892524480819702, + "learning_rate": 2.74339849233744e-06, + "loss": 0.9291, + "step": 62608 + }, + { + "epoch": 0.7826445661141529, + "grad_norm": 0.7975290417671204, + "learning_rate": 2.7427980635145366e-06, + "loss": 0.5952, + "step": 62610 + }, + { + "epoch": 0.7826695667391684, + "grad_norm": 2.904104709625244, + "learning_rate": 2.7421976899614546e-06, + "loss": 0.7308, + "step": 62612 + }, + { + "epoch": 0.7826945673641841, + "grad_norm": 2.8302247524261475, + "learning_rate": 2.7415973716827704e-06, + "loss": 0.8125, + "step": 62614 + }, + { + "epoch": 0.7827195679891997, + "grad_norm": 0.5558329820632935, + "learning_rate": 2.740997108683051e-06, + "loss": 0.3707, + "step": 62616 + }, + { + "epoch": 0.7827445686142154, + "grad_norm": 4.708212375640869, + "learning_rate": 2.740396900966874e-06, + "loss": 1.8752, + "step": 62618 + }, + { + "epoch": 0.782769569239231, + "grad_norm": 0.018721435219049454, + "learning_rate": 2.7397967485388054e-06, + "loss": 1.0612, + "step": 62620 + }, + { + "epoch": 0.7827945698642466, + "grad_norm": 0.00048818415962159634, + "learning_rate": 2.739196651403421e-06, + "loss": 0.4753, + "step": 62622 + }, + { + "epoch": 0.7828195704892622, + "grad_norm": 3.7103359699249268, + "learning_rate": 2.7385966095652884e-06, + "loss": 0.3857, + "step": 62624 + }, + { + "epoch": 0.7828445711142779, + "grad_norm": 3.881525993347168, + "learning_rate": 2.737996623028973e-06, + "loss": 1.3289, + "step": 62626 + }, + { + "epoch": 0.7828695717392935, + "grad_norm": 2.9772894382476807, + "learning_rate": 2.7373966917990526e-06, + "loss": 0.5406, + "step": 62628 + }, + { + "epoch": 0.7828945723643091, + "grad_norm": 3.939594268798828, + "learning_rate": 2.736796815880088e-06, + "loss": 0.9387, + "step": 62630 + }, + { + "epoch": 0.7829195729893247, + "grad_norm": 3.167494773864746, + "learning_rate": 2.7361969952766553e-06, + "loss": 0.8691, + "step": 62632 + }, + { + "epoch": 0.7829445736143403, + "grad_norm": 2.0573296546936035, + "learning_rate": 2.7355972299933165e-06, + "loss": 0.8165, + "step": 62634 + }, + { + "epoch": 0.782969574239356, + "grad_norm": 4.690356731414795, + "learning_rate": 2.7349975200346445e-06, + "loss": 1.2972, + "step": 62636 + }, + { + "epoch": 0.7829945748643716, + "grad_norm": 4.349647521972656, + "learning_rate": 2.734397865405204e-06, + "loss": 1.8759, + "step": 62638 + }, + { + "epoch": 0.7830195754893873, + "grad_norm": 3.8685269355773926, + "learning_rate": 2.7337982661095585e-06, + "loss": 1.1348, + "step": 62640 + }, + { + "epoch": 0.7830445761144028, + "grad_norm": 3.175062656402588, + "learning_rate": 2.7331987221522814e-06, + "loss": 1.0948, + "step": 62642 + }, + { + "epoch": 0.7830695767394185, + "grad_norm": 4.540202617645264, + "learning_rate": 2.7325992335379315e-06, + "loss": 0.6763, + "step": 62644 + }, + { + "epoch": 0.7830945773644341, + "grad_norm": 0.03703425079584122, + "learning_rate": 2.7319998002710823e-06, + "loss": 0.9512, + "step": 62646 + }, + { + "epoch": 0.7831195779894498, + "grad_norm": 2.868195056915283, + "learning_rate": 2.731400422356294e-06, + "loss": 0.1889, + "step": 62648 + }, + { + "epoch": 0.7831445786144654, + "grad_norm": 1.7856258153915405, + "learning_rate": 2.7308010997981295e-06, + "loss": 0.2184, + "step": 62650 + }, + { + "epoch": 0.7831695792394809, + "grad_norm": 0.0007527267443947494, + "learning_rate": 2.730201832601158e-06, + "loss": 0.3269, + "step": 62652 + }, + { + "epoch": 0.7831945798644966, + "grad_norm": 2.0978729724884033, + "learning_rate": 2.729602620769939e-06, + "loss": 0.0518, + "step": 62654 + }, + { + "epoch": 0.7832195804895122, + "grad_norm": 2.3814587593078613, + "learning_rate": 2.729003464309041e-06, + "loss": 1.4081, + "step": 62656 + }, + { + "epoch": 0.7832445811145279, + "grad_norm": 3.4302210807800293, + "learning_rate": 2.7284043632230228e-06, + "loss": 1.0597, + "step": 62658 + }, + { + "epoch": 0.7832695817395435, + "grad_norm": 0.00043138800538145006, + "learning_rate": 2.727805317516451e-06, + "loss": 0.4571, + "step": 62660 + }, + { + "epoch": 0.7832945823645591, + "grad_norm": 0.008673124015331268, + "learning_rate": 2.7272063271938842e-06, + "loss": 0.5667, + "step": 62662 + }, + { + "epoch": 0.7833195829895747, + "grad_norm": 0.12401802092790604, + "learning_rate": 2.7266073922598833e-06, + "loss": 0.0141, + "step": 62664 + }, + { + "epoch": 0.7833445836145904, + "grad_norm": 5.23486852645874, + "learning_rate": 2.726008512719016e-06, + "loss": 1.9187, + "step": 62666 + }, + { + "epoch": 0.783369584239606, + "grad_norm": 4.416899681091309, + "learning_rate": 2.725409688575835e-06, + "loss": 0.9226, + "step": 62668 + }, + { + "epoch": 0.7833945848646217, + "grad_norm": 9.571011543273926, + "learning_rate": 2.724810919834908e-06, + "loss": 2.0743, + "step": 62670 + }, + { + "epoch": 0.7834195854896372, + "grad_norm": 2.667424440383911, + "learning_rate": 2.72421220650079e-06, + "loss": 0.4473, + "step": 62672 + }, + { + "epoch": 0.7834445861146528, + "grad_norm": 1.317612886428833, + "learning_rate": 2.723613548578046e-06, + "loss": 0.0555, + "step": 62674 + }, + { + "epoch": 0.7834695867396685, + "grad_norm": 5.129024028778076, + "learning_rate": 2.723014946071232e-06, + "loss": 1.9166, + "step": 62676 + }, + { + "epoch": 0.7834945873646841, + "grad_norm": 6.393711090087891, + "learning_rate": 2.722416398984904e-06, + "loss": 2.1521, + "step": 62678 + }, + { + "epoch": 0.7835195879896998, + "grad_norm": 7.693868637084961, + "learning_rate": 2.721817907323626e-06, + "loss": 1.2245, + "step": 62680 + }, + { + "epoch": 0.7835445886147153, + "grad_norm": 2.529033660888672, + "learning_rate": 2.7212194710919517e-06, + "loss": 0.7763, + "step": 62682 + }, + { + "epoch": 0.783569589239731, + "grad_norm": 3.489481210708618, + "learning_rate": 2.720621090294443e-06, + "loss": 1.5545, + "step": 62684 + }, + { + "epoch": 0.7835945898647466, + "grad_norm": 3.522996425628662, + "learning_rate": 2.7200227649356514e-06, + "loss": 1.3906, + "step": 62686 + }, + { + "epoch": 0.7836195904897623, + "grad_norm": 3.683335065841675, + "learning_rate": 2.719424495020141e-06, + "loss": 1.4281, + "step": 62688 + }, + { + "epoch": 0.7836445911147779, + "grad_norm": 4.981250286102295, + "learning_rate": 2.7188262805524634e-06, + "loss": 1.4563, + "step": 62690 + }, + { + "epoch": 0.7836695917397934, + "grad_norm": 1.8796204328536987, + "learning_rate": 2.718228121537172e-06, + "loss": 0.8913, + "step": 62692 + }, + { + "epoch": 0.7836945923648091, + "grad_norm": 2.926760673522949, + "learning_rate": 2.7176300179788283e-06, + "loss": 1.4808, + "step": 62694 + }, + { + "epoch": 0.7837195929898247, + "grad_norm": 0.0015548658557236195, + "learning_rate": 2.7170319698819826e-06, + "loss": 0.0005, + "step": 62696 + }, + { + "epoch": 0.7837445936148404, + "grad_norm": 3.795846462249756, + "learning_rate": 2.716433977251194e-06, + "loss": 0.9362, + "step": 62698 + }, + { + "epoch": 0.783769594239856, + "grad_norm": 4.975165843963623, + "learning_rate": 2.7158360400910135e-06, + "loss": 1.5051, + "step": 62700 + }, + { + "epoch": 0.7837945948648716, + "grad_norm": 4.985528945922852, + "learning_rate": 2.7152381584059926e-06, + "loss": 0.7716, + "step": 62702 + }, + { + "epoch": 0.7838195954898872, + "grad_norm": 0.0003361243288964033, + "learning_rate": 2.714640332200692e-06, + "loss": 0.533, + "step": 62704 + }, + { + "epoch": 0.7838445961149029, + "grad_norm": 2.3502144813537598, + "learning_rate": 2.714042561479656e-06, + "loss": 0.4251, + "step": 62706 + }, + { + "epoch": 0.7838695967399185, + "grad_norm": 5.908234596252441, + "learning_rate": 2.7134448462474463e-06, + "loss": 2.757, + "step": 62708 + }, + { + "epoch": 0.7838945973649342, + "grad_norm": 1.855973720550537, + "learning_rate": 2.712847186508606e-06, + "loss": 0.5468, + "step": 62710 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 9.458769798278809, + "learning_rate": 2.712249582267691e-06, + "loss": 1.2465, + "step": 62712 + }, + { + "epoch": 0.7839445986149653, + "grad_norm": 2.1566121578216553, + "learning_rate": 2.7116520335292586e-06, + "loss": 1.4433, + "step": 62714 + }, + { + "epoch": 0.783969599239981, + "grad_norm": 4.490414619445801, + "learning_rate": 2.711054540297848e-06, + "loss": 0.8686, + "step": 62716 + }, + { + "epoch": 0.7839945998649966, + "grad_norm": 3.46124529838562, + "learning_rate": 2.710457102578018e-06, + "loss": 1.0788, + "step": 62718 + }, + { + "epoch": 0.7840196004900123, + "grad_norm": 2.0627973079681396, + "learning_rate": 2.709859720374313e-06, + "loss": 0.2117, + "step": 62720 + }, + { + "epoch": 0.7840446011150278, + "grad_norm": 1.1305193901062012, + "learning_rate": 2.709262393691289e-06, + "loss": 0.84, + "step": 62722 + }, + { + "epoch": 0.7840696017400435, + "grad_norm": 4.519521713256836, + "learning_rate": 2.708665122533488e-06, + "loss": 1.0319, + "step": 62724 + }, + { + "epoch": 0.7840946023650591, + "grad_norm": 4.177855014801025, + "learning_rate": 2.7080679069054615e-06, + "loss": 0.8938, + "step": 62726 + }, + { + "epoch": 0.7841196029900748, + "grad_norm": 2.8790884017944336, + "learning_rate": 2.7074707468117666e-06, + "loss": 0.0717, + "step": 62728 + }, + { + "epoch": 0.7841446036150904, + "grad_norm": 4.78074312210083, + "learning_rate": 2.7068736422569353e-06, + "loss": 1.0686, + "step": 62730 + }, + { + "epoch": 0.784169604240106, + "grad_norm": 4.39107084274292, + "learning_rate": 2.7062765932455273e-06, + "loss": 1.4335, + "step": 62732 + }, + { + "epoch": 0.7841946048651216, + "grad_norm": 3.2057974338531494, + "learning_rate": 2.7056795997820816e-06, + "loss": 0.459, + "step": 62734 + }, + { + "epoch": 0.7842196054901373, + "grad_norm": 0.9915226101875305, + "learning_rate": 2.7050826618711525e-06, + "loss": 0.5982, + "step": 62736 + }, + { + "epoch": 0.7842446061151529, + "grad_norm": 8.973564147949219, + "learning_rate": 2.7044857795172787e-06, + "loss": 1.6229, + "step": 62738 + }, + { + "epoch": 0.7842696067401685, + "grad_norm": 0.5488244295120239, + "learning_rate": 2.703888952725012e-06, + "loss": 0.0217, + "step": 62740 + }, + { + "epoch": 0.7842946073651841, + "grad_norm": 6.735034942626953, + "learning_rate": 2.7032921814988956e-06, + "loss": 0.8551, + "step": 62742 + }, + { + "epoch": 0.7843196079901997, + "grad_norm": 2.51417875289917, + "learning_rate": 2.702695465843471e-06, + "loss": 1.3788, + "step": 62744 + }, + { + "epoch": 0.7843446086152154, + "grad_norm": 0.15001770853996277, + "learning_rate": 2.702098805763289e-06, + "loss": 0.1165, + "step": 62746 + }, + { + "epoch": 0.784369609240231, + "grad_norm": 1.2160006761550903, + "learning_rate": 2.701502201262888e-06, + "loss": 0.4828, + "step": 62748 + }, + { + "epoch": 0.7843946098652467, + "grad_norm": 5.522154331207275, + "learning_rate": 2.7009056523468134e-06, + "loss": 0.7166, + "step": 62750 + }, + { + "epoch": 0.7844196104902622, + "grad_norm": 7.962871551513672, + "learning_rate": 2.7003091590196127e-06, + "loss": 1.444, + "step": 62752 + }, + { + "epoch": 0.7844446111152779, + "grad_norm": 3.7472431659698486, + "learning_rate": 2.6997127212858254e-06, + "loss": 1.2738, + "step": 62754 + }, + { + "epoch": 0.7844696117402935, + "grad_norm": 0.7899855375289917, + "learning_rate": 2.6991163391499932e-06, + "loss": 0.457, + "step": 62756 + }, + { + "epoch": 0.7844946123653092, + "grad_norm": 4.067032814025879, + "learning_rate": 2.6985200126166542e-06, + "loss": 1.3879, + "step": 62758 + }, + { + "epoch": 0.7845196129903248, + "grad_norm": 3.1388120651245117, + "learning_rate": 2.697923741690359e-06, + "loss": 1.3016, + "step": 62760 + }, + { + "epoch": 0.7845446136153403, + "grad_norm": 4.489813804626465, + "learning_rate": 2.6973275263756406e-06, + "loss": 1.2129, + "step": 62762 + }, + { + "epoch": 0.784569614240356, + "grad_norm": 2.8444294929504395, + "learning_rate": 2.696731366677043e-06, + "loss": 0.8622, + "step": 62764 + }, + { + "epoch": 0.7845946148653716, + "grad_norm": 0.03722445294260979, + "learning_rate": 2.696135262599112e-06, + "loss": 1.5807, + "step": 62766 + }, + { + "epoch": 0.7846196154903873, + "grad_norm": 4.876801013946533, + "learning_rate": 2.6955392141463775e-06, + "loss": 0.8434, + "step": 62768 + }, + { + "epoch": 0.7846446161154029, + "grad_norm": 2.1079888343811035, + "learning_rate": 2.694943221323385e-06, + "loss": 0.8253, + "step": 62770 + }, + { + "epoch": 0.7846696167404185, + "grad_norm": 0.0005099075497128069, + "learning_rate": 2.6943472841346695e-06, + "loss": 0.0448, + "step": 62772 + }, + { + "epoch": 0.7846946173654341, + "grad_norm": 1.7347866296768188, + "learning_rate": 2.6937514025847713e-06, + "loss": 1.3736, + "step": 62774 + }, + { + "epoch": 0.7847196179904498, + "grad_norm": 0.37323474884033203, + "learning_rate": 2.6931555766782325e-06, + "loss": 0.0056, + "step": 62776 + }, + { + "epoch": 0.7847446186154654, + "grad_norm": 1.4549922943115234, + "learning_rate": 2.6925598064195858e-06, + "loss": 0.0932, + "step": 62778 + }, + { + "epoch": 0.784769619240481, + "grad_norm": 0.17674969136714935, + "learning_rate": 2.691964091813375e-06, + "loss": 0.5252, + "step": 62780 + }, + { + "epoch": 0.7847946198654966, + "grad_norm": 2.617027997970581, + "learning_rate": 2.6913684328641265e-06, + "loss": 0.2369, + "step": 62782 + }, + { + "epoch": 0.7848196204905122, + "grad_norm": 2.3374433517456055, + "learning_rate": 2.6907728295763845e-06, + "loss": 0.7502, + "step": 62784 + }, + { + "epoch": 0.7848446211155279, + "grad_norm": 1.3355391025543213, + "learning_rate": 2.6901772819546813e-06, + "loss": 1.0394, + "step": 62786 + }, + { + "epoch": 0.7848696217405435, + "grad_norm": 0.00028640314121730626, + "learning_rate": 2.6895817900035536e-06, + "loss": 0.2444, + "step": 62788 + }, + { + "epoch": 0.7848946223655592, + "grad_norm": 2.138291120529175, + "learning_rate": 2.6889863537275407e-06, + "loss": 0.8036, + "step": 62790 + }, + { + "epoch": 0.7849196229905747, + "grad_norm": 3.242967128753662, + "learning_rate": 2.688390973131173e-06, + "loss": 0.5219, + "step": 62792 + }, + { + "epoch": 0.7849446236155904, + "grad_norm": 3.2865078449249268, + "learning_rate": 2.687795648218987e-06, + "loss": 0.699, + "step": 62794 + }, + { + "epoch": 0.784969624240606, + "grad_norm": 2.0881659984588623, + "learning_rate": 2.6872003789955117e-06, + "loss": 0.5241, + "step": 62796 + }, + { + "epoch": 0.7849946248656217, + "grad_norm": 1.3412643671035767, + "learning_rate": 2.6866051654652835e-06, + "loss": 0.7656, + "step": 62798 + }, + { + "epoch": 0.7850196254906373, + "grad_norm": 3.0722262859344482, + "learning_rate": 2.68601000763284e-06, + "loss": 0.5707, + "step": 62800 + }, + { + "epoch": 0.7850446261156528, + "grad_norm": 2.981937885284424, + "learning_rate": 2.6854149055027068e-06, + "loss": 1.0262, + "step": 62802 + }, + { + "epoch": 0.7850696267406685, + "grad_norm": 0.9770050048828125, + "learning_rate": 2.6848198590794215e-06, + "loss": 0.5191, + "step": 62804 + }, + { + "epoch": 0.7850946273656841, + "grad_norm": 2.9101381301879883, + "learning_rate": 2.6842248683675154e-06, + "loss": 0.7132, + "step": 62806 + }, + { + "epoch": 0.7851196279906998, + "grad_norm": 5.514429569244385, + "learning_rate": 2.6836299333715164e-06, + "loss": 0.4852, + "step": 62808 + }, + { + "epoch": 0.7851446286157154, + "grad_norm": 3.669407844543457, + "learning_rate": 2.6830350540959547e-06, + "loss": 1.5344, + "step": 62810 + }, + { + "epoch": 0.785169629240731, + "grad_norm": 13.449077606201172, + "learning_rate": 2.6824402305453647e-06, + "loss": 2.0201, + "step": 62812 + }, + { + "epoch": 0.7851946298657466, + "grad_norm": 3.0762202739715576, + "learning_rate": 2.681845462724276e-06, + "loss": 0.5327, + "step": 62814 + }, + { + "epoch": 0.7852196304907623, + "grad_norm": 4.554424285888672, + "learning_rate": 2.6812507506372156e-06, + "loss": 2.138, + "step": 62816 + }, + { + "epoch": 0.7852446311157779, + "grad_norm": 3.706146001815796, + "learning_rate": 2.6806560942887207e-06, + "loss": 1.24, + "step": 62818 + }, + { + "epoch": 0.7852696317407936, + "grad_norm": 4.233435153961182, + "learning_rate": 2.680061493683307e-06, + "loss": 1.6431, + "step": 62820 + }, + { + "epoch": 0.7852946323658091, + "grad_norm": 3.1995787620544434, + "learning_rate": 2.679466948825513e-06, + "loss": 1.0115, + "step": 62822 + }, + { + "epoch": 0.7853196329908247, + "grad_norm": 2.246044397354126, + "learning_rate": 2.678872459719861e-06, + "loss": 1.2151, + "step": 62824 + }, + { + "epoch": 0.7853446336158404, + "grad_norm": 0.0014412773307412863, + "learning_rate": 2.678278026370882e-06, + "loss": 0.2549, + "step": 62826 + }, + { + "epoch": 0.785369634240856, + "grad_norm": 5.409621238708496, + "learning_rate": 2.6776836487831037e-06, + "loss": 0.7135, + "step": 62828 + }, + { + "epoch": 0.7853946348658717, + "grad_norm": 4.54732608795166, + "learning_rate": 2.677089326961049e-06, + "loss": 1.3344, + "step": 62830 + }, + { + "epoch": 0.7854196354908872, + "grad_norm": 3.1351146697998047, + "learning_rate": 2.676495060909251e-06, + "loss": 1.5901, + "step": 62832 + }, + { + "epoch": 0.7854446361159029, + "grad_norm": 7.327057361602783, + "learning_rate": 2.675900850632227e-06, + "loss": 1.352, + "step": 62834 + }, + { + "epoch": 0.7854696367409185, + "grad_norm": 5.2622199058532715, + "learning_rate": 2.675306696134505e-06, + "loss": 1.0497, + "step": 62836 + }, + { + "epoch": 0.7854946373659342, + "grad_norm": 2.1256601810455322, + "learning_rate": 2.6747125974206144e-06, + "loss": 1.3715, + "step": 62838 + }, + { + "epoch": 0.7855196379909498, + "grad_norm": 15.391606330871582, + "learning_rate": 2.6741185544950744e-06, + "loss": 1.036, + "step": 62840 + }, + { + "epoch": 0.7855446386159654, + "grad_norm": 3.041487216949463, + "learning_rate": 2.6735245673624142e-06, + "loss": 1.1813, + "step": 62842 + }, + { + "epoch": 0.785569639240981, + "grad_norm": 2.1622018814086914, + "learning_rate": 2.6729306360271555e-06, + "loss": 0.7884, + "step": 62844 + }, + { + "epoch": 0.7855946398659966, + "grad_norm": 4.890955924987793, + "learning_rate": 2.67233676049382e-06, + "loss": 0.8459, + "step": 62846 + }, + { + "epoch": 0.7856196404910123, + "grad_norm": 3.3929696083068848, + "learning_rate": 2.671742940766929e-06, + "loss": 0.7577, + "step": 62848 + }, + { + "epoch": 0.7856446411160279, + "grad_norm": 5.384769439697266, + "learning_rate": 2.671149176851008e-06, + "loss": 0.4101, + "step": 62850 + }, + { + "epoch": 0.7856696417410435, + "grad_norm": 3.384378671646118, + "learning_rate": 2.6705554687505797e-06, + "loss": 1.4584, + "step": 62852 + }, + { + "epoch": 0.7856946423660591, + "grad_norm": 3.5111639499664307, + "learning_rate": 2.6699618164701637e-06, + "loss": 0.2872, + "step": 62854 + }, + { + "epoch": 0.7857196429910748, + "grad_norm": 6.724358081817627, + "learning_rate": 2.6693682200142832e-06, + "loss": 1.6969, + "step": 62856 + }, + { + "epoch": 0.7857446436160904, + "grad_norm": 1.3526822328567505, + "learning_rate": 2.668774679387458e-06, + "loss": 1.1619, + "step": 62858 + }, + { + "epoch": 0.7857696442411061, + "grad_norm": 5.149864196777344, + "learning_rate": 2.6681811945942058e-06, + "loss": 1.2114, + "step": 62860 + }, + { + "epoch": 0.7857946448661216, + "grad_norm": 0.00042383105028420687, + "learning_rate": 2.6675877656390513e-06, + "loss": 1.339, + "step": 62862 + }, + { + "epoch": 0.7858196454911373, + "grad_norm": 5.875726699829102, + "learning_rate": 2.6669943925265085e-06, + "loss": 0.7206, + "step": 62864 + }, + { + "epoch": 0.7858446461161529, + "grad_norm": 1.5194262266159058, + "learning_rate": 2.6664010752611023e-06, + "loss": 0.7909, + "step": 62866 + }, + { + "epoch": 0.7858696467411685, + "grad_norm": 3.4217073917388916, + "learning_rate": 2.6658078138473466e-06, + "loss": 1.8099, + "step": 62868 + }, + { + "epoch": 0.7858946473661842, + "grad_norm": 5.125515460968018, + "learning_rate": 2.6652146082897665e-06, + "loss": 1.1105, + "step": 62870 + }, + { + "epoch": 0.7859196479911997, + "grad_norm": 2.701162338256836, + "learning_rate": 2.664621458592869e-06, + "loss": 1.7225, + "step": 62872 + }, + { + "epoch": 0.7859446486162154, + "grad_norm": 4.0153608322143555, + "learning_rate": 2.6640283647611766e-06, + "loss": 1.1838, + "step": 62874 + }, + { + "epoch": 0.785969649241231, + "grad_norm": 0.0003733457997441292, + "learning_rate": 2.6634353267992097e-06, + "loss": 0.3571, + "step": 62876 + }, + { + "epoch": 0.7859946498662467, + "grad_norm": 5.086755275726318, + "learning_rate": 2.6628423447114793e-06, + "loss": 1.237, + "step": 62878 + }, + { + "epoch": 0.7860196504912623, + "grad_norm": 0.7127009034156799, + "learning_rate": 2.662249418502507e-06, + "loss": 0.0825, + "step": 62880 + }, + { + "epoch": 0.7860446511162779, + "grad_norm": 5.330981731414795, + "learning_rate": 2.661656548176803e-06, + "loss": 0.7888, + "step": 62882 + }, + { + "epoch": 0.7860696517412935, + "grad_norm": 1.6080539226531982, + "learning_rate": 2.66106373373889e-06, + "loss": 1.0374, + "step": 62884 + }, + { + "epoch": 0.7860946523663092, + "grad_norm": 5.201395034790039, + "learning_rate": 2.660470975193272e-06, + "loss": 2.213, + "step": 62886 + }, + { + "epoch": 0.7861196529913248, + "grad_norm": 4.16369104385376, + "learning_rate": 2.6598782725444695e-06, + "loss": 1.1075, + "step": 62888 + }, + { + "epoch": 0.7861446536163404, + "grad_norm": 3.832077741622925, + "learning_rate": 2.659285625796999e-06, + "loss": 0.6164, + "step": 62890 + }, + { + "epoch": 0.786169654241356, + "grad_norm": 3.1258623600006104, + "learning_rate": 2.6586930349553684e-06, + "loss": 0.9542, + "step": 62892 + }, + { + "epoch": 0.7861946548663716, + "grad_norm": 3.8261919021606445, + "learning_rate": 2.6581005000240957e-06, + "loss": 1.376, + "step": 62894 + }, + { + "epoch": 0.7862196554913873, + "grad_norm": 2.625110387802124, + "learning_rate": 2.657508021007692e-06, + "loss": 1.3975, + "step": 62896 + }, + { + "epoch": 0.7862446561164029, + "grad_norm": 0.19232313334941864, + "learning_rate": 2.6569155979106664e-06, + "loss": 0.0254, + "step": 62898 + }, + { + "epoch": 0.7862696567414186, + "grad_norm": 0.7208192944526672, + "learning_rate": 2.6563232307375354e-06, + "loss": 0.9414, + "step": 62900 + }, + { + "epoch": 0.7862946573664341, + "grad_norm": 11.433815002441406, + "learning_rate": 2.655730919492805e-06, + "loss": 2.3975, + "step": 62902 + }, + { + "epoch": 0.7863196579914498, + "grad_norm": 0.00048621889436617494, + "learning_rate": 2.655138664180993e-06, + "loss": 0.0107, + "step": 62904 + }, + { + "epoch": 0.7863446586164654, + "grad_norm": 4.603884696960449, + "learning_rate": 2.654546464806602e-06, + "loss": 1.347, + "step": 62906 + }, + { + "epoch": 0.7863696592414811, + "grad_norm": 5.435504913330078, + "learning_rate": 2.6539543213741503e-06, + "loss": 1.2807, + "step": 62908 + }, + { + "epoch": 0.7863946598664967, + "grad_norm": 3.35147762298584, + "learning_rate": 2.6533622338881437e-06, + "loss": 0.7309, + "step": 62910 + }, + { + "epoch": 0.7864196604915122, + "grad_norm": 5.437448978424072, + "learning_rate": 2.6527702023530886e-06, + "loss": 1.0852, + "step": 62912 + }, + { + "epoch": 0.7864446611165279, + "grad_norm": 2.935636281967163, + "learning_rate": 2.6521782267734997e-06, + "loss": 2.0473, + "step": 62914 + }, + { + "epoch": 0.7864696617415435, + "grad_norm": 2.9102675914764404, + "learning_rate": 2.6515863071538794e-06, + "loss": 1.3689, + "step": 62916 + }, + { + "epoch": 0.7864946623665592, + "grad_norm": 1.4580307006835938, + "learning_rate": 2.650994443498741e-06, + "loss": 0.1147, + "step": 62918 + }, + { + "epoch": 0.7865196629915748, + "grad_norm": 2.609499931335449, + "learning_rate": 2.6504026358125867e-06, + "loss": 0.4559, + "step": 62920 + }, + { + "epoch": 0.7865446636165904, + "grad_norm": 6.837405204772949, + "learning_rate": 2.6498108840999305e-06, + "loss": 1.5175, + "step": 62922 + }, + { + "epoch": 0.786569664241606, + "grad_norm": 2.3081982135772705, + "learning_rate": 2.649219188365274e-06, + "loss": 0.5825, + "step": 62924 + }, + { + "epoch": 0.7865946648666217, + "grad_norm": 0.4130173325538635, + "learning_rate": 2.648627548613123e-06, + "loss": 0.359, + "step": 62926 + }, + { + "epoch": 0.7866196654916373, + "grad_norm": 0.0027236253954470158, + "learning_rate": 2.648035964847987e-06, + "loss": 0.0001, + "step": 62928 + }, + { + "epoch": 0.786644666116653, + "grad_norm": 2.8879597187042236, + "learning_rate": 2.647444437074367e-06, + "loss": 0.9631, + "step": 62930 + }, + { + "epoch": 0.7866696667416685, + "grad_norm": 3.636770486831665, + "learning_rate": 2.6468529652967724e-06, + "loss": 0.692, + "step": 62932 + }, + { + "epoch": 0.7866946673666841, + "grad_norm": 4.085689067840576, + "learning_rate": 2.646261549519704e-06, + "loss": 0.3267, + "step": 62934 + }, + { + "epoch": 0.7867196679916998, + "grad_norm": 1.4162758588790894, + "learning_rate": 2.64567018974767e-06, + "loss": 0.6541, + "step": 62936 + }, + { + "epoch": 0.7867446686167154, + "grad_norm": 0.007687460631132126, + "learning_rate": 2.6450788859851716e-06, + "loss": 0.2191, + "step": 62938 + }, + { + "epoch": 0.7867696692417311, + "grad_norm": 3.2732908725738525, + "learning_rate": 2.6444876382367102e-06, + "loss": 0.5338, + "step": 62940 + }, + { + "epoch": 0.7867946698667466, + "grad_norm": 1.9469703435897827, + "learning_rate": 2.6438964465067928e-06, + "loss": 0.2446, + "step": 62942 + }, + { + "epoch": 0.7868196704917623, + "grad_norm": 3.3997347354888916, + "learning_rate": 2.6433053107999186e-06, + "loss": 1.2783, + "step": 62944 + }, + { + "epoch": 0.7868446711167779, + "grad_norm": 2.4852514266967773, + "learning_rate": 2.6427142311205923e-06, + "loss": 0.1478, + "step": 62946 + }, + { + "epoch": 0.7868696717417936, + "grad_norm": 3.0067343711853027, + "learning_rate": 2.642123207473314e-06, + "loss": 0.9567, + "step": 62948 + }, + { + "epoch": 0.7868946723668092, + "grad_norm": 2.6571669578552246, + "learning_rate": 2.641532239862582e-06, + "loss": 0.5761, + "step": 62950 + }, + { + "epoch": 0.7869196729918247, + "grad_norm": 9.502943992614746, + "learning_rate": 2.640941328292903e-06, + "loss": 0.908, + "step": 62952 + }, + { + "epoch": 0.7869446736168404, + "grad_norm": 0.0003235414333175868, + "learning_rate": 2.6403504727687714e-06, + "loss": 0.4808, + "step": 62954 + }, + { + "epoch": 0.786969674241856, + "grad_norm": 3.3068370819091797, + "learning_rate": 2.6397596732946928e-06, + "loss": 1.1883, + "step": 62956 + }, + { + "epoch": 0.7869946748668717, + "grad_norm": 0.0002005484711844474, + "learning_rate": 2.63916892987516e-06, + "loss": 2.1721, + "step": 62958 + }, + { + "epoch": 0.7870196754918873, + "grad_norm": 0.0018395412480458617, + "learning_rate": 2.63857824251468e-06, + "loss": 0.462, + "step": 62960 + }, + { + "epoch": 0.7870446761169029, + "grad_norm": 5.79350471496582, + "learning_rate": 2.637987611217746e-06, + "loss": 0.6783, + "step": 62962 + }, + { + "epoch": 0.7870696767419185, + "grad_norm": 3.7071478366851807, + "learning_rate": 2.6373970359888545e-06, + "loss": 0.6229, + "step": 62964 + }, + { + "epoch": 0.7870946773669342, + "grad_norm": 2.9128501415252686, + "learning_rate": 2.6368065168325095e-06, + "loss": 0.6208, + "step": 62966 + }, + { + "epoch": 0.7871196779919498, + "grad_norm": 4.873527526855469, + "learning_rate": 2.636216053753201e-06, + "loss": 0.4684, + "step": 62968 + }, + { + "epoch": 0.7871446786169655, + "grad_norm": 2.807656764984131, + "learning_rate": 2.6356256467554343e-06, + "loss": 1.4671, + "step": 62970 + }, + { + "epoch": 0.787169679241981, + "grad_norm": 0.00040720831020735204, + "learning_rate": 2.6350352958436965e-06, + "loss": 0.0013, + "step": 62972 + }, + { + "epoch": 0.7871946798669966, + "grad_norm": 3.280710220336914, + "learning_rate": 2.634445001022492e-06, + "loss": 0.5473, + "step": 62974 + }, + { + "epoch": 0.7872196804920123, + "grad_norm": 0.0464584082365036, + "learning_rate": 2.6338547622963128e-06, + "loss": 0.0008, + "step": 62976 + }, + { + "epoch": 0.787244681117028, + "grad_norm": 2.215181350708008, + "learning_rate": 2.633264579669651e-06, + "loss": 0.1244, + "step": 62978 + }, + { + "epoch": 0.7872696817420436, + "grad_norm": 11.439576148986816, + "learning_rate": 2.632674453147007e-06, + "loss": 2.2371, + "step": 62980 + }, + { + "epoch": 0.7872946823670591, + "grad_norm": 4.435305595397949, + "learning_rate": 2.6320843827328703e-06, + "loss": 1.0521, + "step": 62982 + }, + { + "epoch": 0.7873196829920748, + "grad_norm": 0.006262996233999729, + "learning_rate": 2.6314943684317408e-06, + "loss": 1.5551, + "step": 62984 + }, + { + "epoch": 0.7873446836170904, + "grad_norm": 2.0661613941192627, + "learning_rate": 2.630904410248103e-06, + "loss": 0.442, + "step": 62986 + }, + { + "epoch": 0.7873696842421061, + "grad_norm": 4.514853000640869, + "learning_rate": 2.63031450818646e-06, + "loss": 1.6286, + "step": 62988 + }, + { + "epoch": 0.7873946848671217, + "grad_norm": 0.40835514664649963, + "learning_rate": 2.6297246622512985e-06, + "loss": 0.0254, + "step": 62990 + }, + { + "epoch": 0.7874196854921373, + "grad_norm": 4.549759387969971, + "learning_rate": 2.6291348724471087e-06, + "loss": 1.4074, + "step": 62992 + }, + { + "epoch": 0.7874446861171529, + "grad_norm": 1.8621442317962646, + "learning_rate": 2.628545138778389e-06, + "loss": 0.9032, + "step": 62994 + }, + { + "epoch": 0.7874696867421686, + "grad_norm": 1.7059880495071411, + "learning_rate": 2.6279554612496238e-06, + "loss": 0.9249, + "step": 62996 + }, + { + "epoch": 0.7874946873671842, + "grad_norm": 3.455594301223755, + "learning_rate": 2.62736583986531e-06, + "loss": 0.8222, + "step": 62998 + }, + { + "epoch": 0.7875196879921998, + "grad_norm": 4.137648582458496, + "learning_rate": 2.626776274629933e-06, + "loss": 1.3484, + "step": 63000 + }, + { + "epoch": 0.7875446886172154, + "grad_norm": 3.105815887451172, + "learning_rate": 2.6261867655479877e-06, + "loss": 1.1679, + "step": 63002 + }, + { + "epoch": 0.787569689242231, + "grad_norm": 6.265934467315674, + "learning_rate": 2.625597312623962e-06, + "loss": 1.701, + "step": 63004 + }, + { + "epoch": 0.7875946898672467, + "grad_norm": 0.00047456639003939927, + "learning_rate": 2.625007915862341e-06, + "loss": 1.2955, + "step": 63006 + }, + { + "epoch": 0.7876196904922623, + "grad_norm": 1.9796736240386963, + "learning_rate": 2.6244185752676187e-06, + "loss": 0.5228, + "step": 63008 + }, + { + "epoch": 0.787644691117278, + "grad_norm": 0.0005631504463963211, + "learning_rate": 2.6238292908442796e-06, + "loss": 0.5195, + "step": 63010 + }, + { + "epoch": 0.7876696917422935, + "grad_norm": 3.3000495433807373, + "learning_rate": 2.623240062596817e-06, + "loss": 1.3748, + "step": 63012 + }, + { + "epoch": 0.7876946923673092, + "grad_norm": 3.5099096298217773, + "learning_rate": 2.6226508905297143e-06, + "loss": 1.4904, + "step": 63014 + }, + { + "epoch": 0.7877196929923248, + "grad_norm": 0.6966572999954224, + "learning_rate": 2.6220617746474554e-06, + "loss": 0.9965, + "step": 63016 + }, + { + "epoch": 0.7877446936173405, + "grad_norm": 0.002833663485944271, + "learning_rate": 2.621472714954535e-06, + "loss": 0.1771, + "step": 63018 + }, + { + "epoch": 0.7877696942423561, + "grad_norm": 2.07137393951416, + "learning_rate": 2.620883711455431e-06, + "loss": 0.1462, + "step": 63020 + }, + { + "epoch": 0.7877946948673716, + "grad_norm": 5.697810173034668, + "learning_rate": 2.620294764154637e-06, + "loss": 1.2447, + "step": 63022 + }, + { + "epoch": 0.7878196954923873, + "grad_norm": 2.867568254470825, + "learning_rate": 2.6197058730566306e-06, + "loss": 0.2253, + "step": 63024 + }, + { + "epoch": 0.7878446961174029, + "grad_norm": 2.541637897491455, + "learning_rate": 2.6191170381659035e-06, + "loss": 0.4993, + "step": 63026 + }, + { + "epoch": 0.7878696967424186, + "grad_norm": 2.442068338394165, + "learning_rate": 2.6185282594869386e-06, + "loss": 0.8121, + "step": 63028 + }, + { + "epoch": 0.7878946973674342, + "grad_norm": 5.237120151519775, + "learning_rate": 2.6179395370242143e-06, + "loss": 1.4083, + "step": 63030 + }, + { + "epoch": 0.7879196979924498, + "grad_norm": 4.482678413391113, + "learning_rate": 2.6173508707822226e-06, + "loss": 1.8048, + "step": 63032 + }, + { + "epoch": 0.7879446986174654, + "grad_norm": 4.181687831878662, + "learning_rate": 2.6167622607654395e-06, + "loss": 1.1908, + "step": 63034 + }, + { + "epoch": 0.7879696992424811, + "grad_norm": 5.2223615646362305, + "learning_rate": 2.6161737069783533e-06, + "loss": 0.9758, + "step": 63036 + }, + { + "epoch": 0.7879946998674967, + "grad_norm": 0.00039960024878382683, + "learning_rate": 2.615585209425442e-06, + "loss": 0.7177, + "step": 63038 + }, + { + "epoch": 0.7880197004925124, + "grad_norm": 3.904161214828491, + "learning_rate": 2.614996768111191e-06, + "loss": 0.8595, + "step": 63040 + }, + { + "epoch": 0.7880447011175279, + "grad_norm": 2.525028944015503, + "learning_rate": 2.614408383040081e-06, + "loss": 1.8007, + "step": 63042 + }, + { + "epoch": 0.7880697017425435, + "grad_norm": 0.0003845762403216213, + "learning_rate": 2.6138200542165905e-06, + "loss": 0.3005, + "step": 63044 + }, + { + "epoch": 0.7880947023675592, + "grad_norm": 3.1657536029815674, + "learning_rate": 2.6132317816452034e-06, + "loss": 1.0457, + "step": 63046 + }, + { + "epoch": 0.7881197029925748, + "grad_norm": 2.3101119995117188, + "learning_rate": 2.612643565330396e-06, + "loss": 1.0433, + "step": 63048 + }, + { + "epoch": 0.7881447036175905, + "grad_norm": 4.356960296630859, + "learning_rate": 2.6120554052766535e-06, + "loss": 1.3329, + "step": 63050 + }, + { + "epoch": 0.788169704242606, + "grad_norm": 2.874237537384033, + "learning_rate": 2.6114673014884494e-06, + "loss": 1.0932, + "step": 63052 + }, + { + "epoch": 0.7881947048676217, + "grad_norm": 6.462766170501709, + "learning_rate": 2.6108792539702676e-06, + "loss": 1.658, + "step": 63054 + }, + { + "epoch": 0.7882197054926373, + "grad_norm": 1.9603374004364014, + "learning_rate": 2.6102912627265862e-06, + "loss": 0.4244, + "step": 63056 + }, + { + "epoch": 0.788244706117653, + "grad_norm": 7.057004928588867, + "learning_rate": 2.609703327761878e-06, + "loss": 1.1002, + "step": 63058 + }, + { + "epoch": 0.7882697067426686, + "grad_norm": 3.1060919761657715, + "learning_rate": 2.609115449080626e-06, + "loss": 1.6654, + "step": 63060 + }, + { + "epoch": 0.7882947073676841, + "grad_norm": 3.3202626705169678, + "learning_rate": 2.608527626687304e-06, + "loss": 0.7827, + "step": 63062 + }, + { + "epoch": 0.7883197079926998, + "grad_norm": 5.117029666900635, + "learning_rate": 2.6079398605863927e-06, + "loss": 0.9706, + "step": 63064 + }, + { + "epoch": 0.7883447086177154, + "grad_norm": 3.0746309757232666, + "learning_rate": 2.607352150782365e-06, + "loss": 0.8515, + "step": 63066 + }, + { + "epoch": 0.7883697092427311, + "grad_norm": 5.4792985916137695, + "learning_rate": 2.6067644972796967e-06, + "loss": 0.9841, + "step": 63068 + }, + { + "epoch": 0.7883947098677467, + "grad_norm": 0.002508212346583605, + "learning_rate": 2.6061769000828663e-06, + "loss": 0.5363, + "step": 63070 + }, + { + "epoch": 0.7884197104927623, + "grad_norm": 0.00040377621189691126, + "learning_rate": 2.6055893591963453e-06, + "loss": 0.6604, + "step": 63072 + }, + { + "epoch": 0.7884447111177779, + "grad_norm": 2.1586313247680664, + "learning_rate": 2.605001874624612e-06, + "loss": 0.9899, + "step": 63074 + }, + { + "epoch": 0.7884697117427936, + "grad_norm": 6.552255630493164, + "learning_rate": 2.6044144463721364e-06, + "loss": 3.1354, + "step": 63076 + }, + { + "epoch": 0.7884947123678092, + "grad_norm": 4.5746307373046875, + "learning_rate": 2.603827074443397e-06, + "loss": 1.0202, + "step": 63078 + }, + { + "epoch": 0.7885197129928249, + "grad_norm": 0.000409288564696908, + "learning_rate": 2.603239758842865e-06, + "loss": 0.0223, + "step": 63080 + }, + { + "epoch": 0.7885447136178404, + "grad_norm": 1.3733792304992676, + "learning_rate": 2.602652499575009e-06, + "loss": 0.0384, + "step": 63082 + }, + { + "epoch": 0.788569714242856, + "grad_norm": 2.8280179500579834, + "learning_rate": 2.6020652966443095e-06, + "loss": 1.5282, + "step": 63084 + }, + { + "epoch": 0.7885947148678717, + "grad_norm": 4.483364582061768, + "learning_rate": 2.601478150055232e-06, + "loss": 0.8835, + "step": 63086 + }, + { + "epoch": 0.7886197154928873, + "grad_norm": 2.2731075286865234, + "learning_rate": 2.6008910598122527e-06, + "loss": 0.43, + "step": 63088 + }, + { + "epoch": 0.788644716117903, + "grad_norm": 0.001450381358154118, + "learning_rate": 2.600304025919839e-06, + "loss": 0.5642, + "step": 63090 + }, + { + "epoch": 0.7886697167429185, + "grad_norm": 11.577896118164062, + "learning_rate": 2.599717048382464e-06, + "loss": 2.2189, + "step": 63092 + }, + { + "epoch": 0.7886947173679342, + "grad_norm": 2.1575021743774414, + "learning_rate": 2.5991301272046e-06, + "loss": 0.8411, + "step": 63094 + }, + { + "epoch": 0.7887197179929498, + "grad_norm": 9.39592456817627, + "learning_rate": 2.5985432623907103e-06, + "loss": 0.7262, + "step": 63096 + }, + { + "epoch": 0.7887447186179655, + "grad_norm": 2.580152988433838, + "learning_rate": 2.597956453945271e-06, + "loss": 0.4357, + "step": 63098 + }, + { + "epoch": 0.7887697192429811, + "grad_norm": 2.8250551223754883, + "learning_rate": 2.5973697018727463e-06, + "loss": 0.6203, + "step": 63100 + }, + { + "epoch": 0.7887947198679967, + "grad_norm": 10.536385536193848, + "learning_rate": 2.59678300617761e-06, + "loss": 3.1287, + "step": 63102 + }, + { + "epoch": 0.7888197204930123, + "grad_norm": 3.528259515762329, + "learning_rate": 2.5961963668643242e-06, + "loss": 1.2947, + "step": 63104 + }, + { + "epoch": 0.788844721118028, + "grad_norm": 1.8329870700836182, + "learning_rate": 2.595609783937363e-06, + "loss": 0.1083, + "step": 63106 + }, + { + "epoch": 0.7888697217430436, + "grad_norm": 8.233062744140625, + "learning_rate": 2.59502325740119e-06, + "loss": 0.5708, + "step": 63108 + }, + { + "epoch": 0.7888947223680592, + "grad_norm": 7.878648281097412, + "learning_rate": 2.5944367872602714e-06, + "loss": 1.5969, + "step": 63110 + }, + { + "epoch": 0.7889197229930748, + "grad_norm": 3.6926956176757812, + "learning_rate": 2.5938503735190768e-06, + "loss": 1.6146, + "step": 63112 + }, + { + "epoch": 0.7889447236180904, + "grad_norm": 0.00046624275273643434, + "learning_rate": 2.5932640161820678e-06, + "loss": 0.8604, + "step": 63114 + }, + { + "epoch": 0.7889697242431061, + "grad_norm": 2.4863178730010986, + "learning_rate": 2.592677715253712e-06, + "loss": 0.5816, + "step": 63116 + }, + { + "epoch": 0.7889947248681217, + "grad_norm": 0.632644534111023, + "learning_rate": 2.5920914707384813e-06, + "loss": 0.6795, + "step": 63118 + }, + { + "epoch": 0.7890197254931374, + "grad_norm": 3.1654741764068604, + "learning_rate": 2.5915052826408295e-06, + "loss": 0.8611, + "step": 63120 + }, + { + "epoch": 0.7890447261181529, + "grad_norm": 2.8051183223724365, + "learning_rate": 2.5909191509652278e-06, + "loss": 1.5579, + "step": 63122 + }, + { + "epoch": 0.7890697267431686, + "grad_norm": 3.9137704372406006, + "learning_rate": 2.590333075716135e-06, + "loss": 1.2553, + "step": 63124 + }, + { + "epoch": 0.7890947273681842, + "grad_norm": 0.0006230546277947724, + "learning_rate": 2.589747056898021e-06, + "loss": 0.2783, + "step": 63126 + }, + { + "epoch": 0.7891197279931998, + "grad_norm": 3.084277868270874, + "learning_rate": 2.5891610945153423e-06, + "loss": 0.7184, + "step": 63128 + }, + { + "epoch": 0.7891447286182155, + "grad_norm": 3.351012945175171, + "learning_rate": 2.5885751885725653e-06, + "loss": 1.2819, + "step": 63130 + }, + { + "epoch": 0.789169729243231, + "grad_norm": 5.0874128341674805, + "learning_rate": 2.5879893390741573e-06, + "loss": 0.6954, + "step": 63132 + }, + { + "epoch": 0.7891947298682467, + "grad_norm": 9.304024696350098, + "learning_rate": 2.587403546024568e-06, + "loss": 0.4879, + "step": 63134 + }, + { + "epoch": 0.7892197304932623, + "grad_norm": 0.7232513427734375, + "learning_rate": 2.5868178094282692e-06, + "loss": 0.4633, + "step": 63136 + }, + { + "epoch": 0.789244731118278, + "grad_norm": 2.6677873134613037, + "learning_rate": 2.586232129289713e-06, + "loss": 0.9566, + "step": 63138 + }, + { + "epoch": 0.7892697317432936, + "grad_norm": 7.582891464233398, + "learning_rate": 2.585646505613368e-06, + "loss": 1.6432, + "step": 63140 + }, + { + "epoch": 0.7892947323683092, + "grad_norm": 0.013294690288603306, + "learning_rate": 2.5850609384036883e-06, + "loss": 0.1881, + "step": 63142 + }, + { + "epoch": 0.7893197329933248, + "grad_norm": 4.100065231323242, + "learning_rate": 2.5844754276651384e-06, + "loss": 0.426, + "step": 63144 + }, + { + "epoch": 0.7893447336183405, + "grad_norm": 1.8413190841674805, + "learning_rate": 2.5838899734021762e-06, + "loss": 0.2167, + "step": 63146 + }, + { + "epoch": 0.7893697342433561, + "grad_norm": 2.0084714889526367, + "learning_rate": 2.5833045756192544e-06, + "loss": 0.6958, + "step": 63148 + }, + { + "epoch": 0.7893947348683718, + "grad_norm": 4.8411641120910645, + "learning_rate": 2.582719234320841e-06, + "loss": 1.7081, + "step": 63150 + }, + { + "epoch": 0.7894197354933873, + "grad_norm": 1.235764503479004, + "learning_rate": 2.582133949511384e-06, + "loss": 0.901, + "step": 63152 + }, + { + "epoch": 0.7894447361184029, + "grad_norm": 0.0004073001036886126, + "learning_rate": 2.581548721195347e-06, + "loss": 0.0, + "step": 63154 + }, + { + "epoch": 0.7894697367434186, + "grad_norm": 0.7199477553367615, + "learning_rate": 2.580963549377189e-06, + "loss": 0.0075, + "step": 63156 + }, + { + "epoch": 0.7894947373684342, + "grad_norm": 2.7675693035125732, + "learning_rate": 2.5803784340613635e-06, + "loss": 0.3856, + "step": 63158 + }, + { + "epoch": 0.7895197379934499, + "grad_norm": 3.26613712310791, + "learning_rate": 2.5797933752523265e-06, + "loss": 1.3542, + "step": 63160 + }, + { + "epoch": 0.7895447386184654, + "grad_norm": 2.2452967166900635, + "learning_rate": 2.5792083729545304e-06, + "loss": 0.3455, + "step": 63162 + }, + { + "epoch": 0.7895697392434811, + "grad_norm": 2.82802152633667, + "learning_rate": 2.5786234271724374e-06, + "loss": 1.1616, + "step": 63164 + }, + { + "epoch": 0.7895947398684967, + "grad_norm": 3.3841981887817383, + "learning_rate": 2.5780385379104957e-06, + "loss": 0.8186, + "step": 63166 + }, + { + "epoch": 0.7896197404935124, + "grad_norm": 2.6473515033721924, + "learning_rate": 2.577453705173163e-06, + "loss": 0.4997, + "step": 63168 + }, + { + "epoch": 0.789644741118528, + "grad_norm": 5.987565040588379, + "learning_rate": 2.5768689289648986e-06, + "loss": 0.4931, + "step": 63170 + }, + { + "epoch": 0.7896697417435435, + "grad_norm": 3.576779365539551, + "learning_rate": 2.5762842092901452e-06, + "loss": 1.4685, + "step": 63172 + }, + { + "epoch": 0.7896947423685592, + "grad_norm": 1.0614761114120483, + "learning_rate": 2.575699546153365e-06, + "loss": 1.1392, + "step": 63174 + }, + { + "epoch": 0.7897197429935748, + "grad_norm": 1.7824959754943848, + "learning_rate": 2.5751149395590034e-06, + "loss": 0.5552, + "step": 63176 + }, + { + "epoch": 0.7897447436185905, + "grad_norm": 5.852721214294434, + "learning_rate": 2.574530389511517e-06, + "loss": 1.0795, + "step": 63178 + }, + { + "epoch": 0.7897697442436061, + "grad_norm": 3.636509656906128, + "learning_rate": 2.5739458960153598e-06, + "loss": 1.0414, + "step": 63180 + }, + { + "epoch": 0.7897947448686217, + "grad_norm": 0.003373447572812438, + "learning_rate": 2.573361459074978e-06, + "loss": 0.5951, + "step": 63182 + }, + { + "epoch": 0.7898197454936373, + "grad_norm": 2.914881944656372, + "learning_rate": 2.5727770786948303e-06, + "loss": 0.3433, + "step": 63184 + }, + { + "epoch": 0.789844746118653, + "grad_norm": 2.5339608192443848, + "learning_rate": 2.572192754879357e-06, + "loss": 0.5826, + "step": 63186 + }, + { + "epoch": 0.7898697467436686, + "grad_norm": 5.840229034423828, + "learning_rate": 2.571608487633015e-06, + "loss": 0.3652, + "step": 63188 + }, + { + "epoch": 0.7898947473686843, + "grad_norm": 6.59467077255249, + "learning_rate": 2.57102427696025e-06, + "loss": 0.7383, + "step": 63190 + }, + { + "epoch": 0.7899197479936998, + "grad_norm": 0.7189579010009766, + "learning_rate": 2.570440122865514e-06, + "loss": 0.9087, + "step": 63192 + }, + { + "epoch": 0.7899447486187154, + "grad_norm": 2.5795130729675293, + "learning_rate": 2.569856025353258e-06, + "loss": 0.9286, + "step": 63194 + }, + { + "epoch": 0.7899697492437311, + "grad_norm": 2.458411931991577, + "learning_rate": 2.569271984427928e-06, + "loss": 0.6104, + "step": 63196 + }, + { + "epoch": 0.7899947498687467, + "grad_norm": 4.484563827514648, + "learning_rate": 2.568688000093971e-06, + "loss": 0.983, + "step": 63198 + }, + { + "epoch": 0.7900197504937624, + "grad_norm": 3.1751809120178223, + "learning_rate": 2.5681040723558326e-06, + "loss": 1.1385, + "step": 63200 + }, + { + "epoch": 0.7900447511187779, + "grad_norm": 2.510235071182251, + "learning_rate": 2.5675202012179656e-06, + "loss": 1.1748, + "step": 63202 + }, + { + "epoch": 0.7900697517437936, + "grad_norm": 6.457179069519043, + "learning_rate": 2.5669363866848117e-06, + "loss": 0.2254, + "step": 63204 + }, + { + "epoch": 0.7900947523688092, + "grad_norm": 0.0003421590372454375, + "learning_rate": 2.5663526287608188e-06, + "loss": 0.4637, + "step": 63206 + }, + { + "epoch": 0.7901197529938249, + "grad_norm": 7.42515230178833, + "learning_rate": 2.565768927450435e-06, + "loss": 2.4328, + "step": 63208 + }, + { + "epoch": 0.7901447536188405, + "grad_norm": 1.811844825744629, + "learning_rate": 2.565185282758105e-06, + "loss": 0.5367, + "step": 63210 + }, + { + "epoch": 0.790169754243856, + "grad_norm": 2.6966328620910645, + "learning_rate": 2.5646016946882714e-06, + "loss": 1.0299, + "step": 63212 + }, + { + "epoch": 0.7901947548688717, + "grad_norm": 1.3341457843780518, + "learning_rate": 2.5640181632453777e-06, + "loss": 0.6882, + "step": 63214 + }, + { + "epoch": 0.7902197554938873, + "grad_norm": 3.150658369064331, + "learning_rate": 2.563434688433869e-06, + "loss": 1.3215, + "step": 63216 + }, + { + "epoch": 0.790244756118903, + "grad_norm": 4.206563472747803, + "learning_rate": 2.5628512702581944e-06, + "loss": 0.7361, + "step": 63218 + }, + { + "epoch": 0.7902697567439186, + "grad_norm": 2.774099111557007, + "learning_rate": 2.56226790872279e-06, + "loss": 1.5326, + "step": 63220 + }, + { + "epoch": 0.7902947573689342, + "grad_norm": 3.403273344039917, + "learning_rate": 2.5616846038321062e-06, + "loss": 1.3381, + "step": 63222 + }, + { + "epoch": 0.7903197579939498, + "grad_norm": 8.164868354797363, + "learning_rate": 2.561101355590575e-06, + "loss": 1.2719, + "step": 63224 + }, + { + "epoch": 0.7903447586189655, + "grad_norm": 3.1044535636901855, + "learning_rate": 2.5605181640026467e-06, + "loss": 1.2025, + "step": 63226 + }, + { + "epoch": 0.7903697592439811, + "grad_norm": 3.7201528549194336, + "learning_rate": 2.5599350290727578e-06, + "loss": 0.6805, + "step": 63228 + }, + { + "epoch": 0.7903947598689968, + "grad_norm": 3.36533784866333, + "learning_rate": 2.559351950805351e-06, + "loss": 0.8262, + "step": 63230 + }, + { + "epoch": 0.7904197604940123, + "grad_norm": 1.9441354274749756, + "learning_rate": 2.55876892920487e-06, + "loss": 1.0468, + "step": 63232 + }, + { + "epoch": 0.790444761119028, + "grad_norm": 2.404855966567993, + "learning_rate": 2.558185964275751e-06, + "loss": 1.208, + "step": 63234 + }, + { + "epoch": 0.7904697617440436, + "grad_norm": 0.39573928713798523, + "learning_rate": 2.557603056022441e-06, + "loss": 0.4183, + "step": 63236 + }, + { + "epoch": 0.7904947623690592, + "grad_norm": 2.620441198348999, + "learning_rate": 2.5570202044493665e-06, + "loss": 0.8098, + "step": 63238 + }, + { + "epoch": 0.7905197629940749, + "grad_norm": 4.330294609069824, + "learning_rate": 2.5564374095609745e-06, + "loss": 0.6771, + "step": 63240 + }, + { + "epoch": 0.7905447636190904, + "grad_norm": 5.042585849761963, + "learning_rate": 2.555854671361705e-06, + "loss": 0.8988, + "step": 63242 + }, + { + "epoch": 0.7905697642441061, + "grad_norm": 5.187068939208984, + "learning_rate": 2.5552719898559906e-06, + "loss": 0.4167, + "step": 63244 + }, + { + "epoch": 0.7905947648691217, + "grad_norm": 0.2089235782623291, + "learning_rate": 2.5546893650482764e-06, + "loss": 2.8741, + "step": 63246 + }, + { + "epoch": 0.7906197654941374, + "grad_norm": 2.4680731296539307, + "learning_rate": 2.554106796942991e-06, + "loss": 0.6101, + "step": 63248 + }, + { + "epoch": 0.790644766119153, + "grad_norm": 0.133096382021904, + "learning_rate": 2.553524285544581e-06, + "loss": 0.633, + "step": 63250 + }, + { + "epoch": 0.7906697667441686, + "grad_norm": 3.4588968753814697, + "learning_rate": 2.5529418308574714e-06, + "loss": 0.7838, + "step": 63252 + }, + { + "epoch": 0.7906947673691842, + "grad_norm": 2.00644850730896, + "learning_rate": 2.5523594328861033e-06, + "loss": 0.4913, + "step": 63254 + }, + { + "epoch": 0.7907197679941999, + "grad_norm": 5.535581111907959, + "learning_rate": 2.551777091634917e-06, + "loss": 1.2152, + "step": 63256 + }, + { + "epoch": 0.7907447686192155, + "grad_norm": 2.9332435131073, + "learning_rate": 2.551194807108339e-06, + "loss": 0.7634, + "step": 63258 + }, + { + "epoch": 0.7907697692442311, + "grad_norm": 2.0684800148010254, + "learning_rate": 2.5506125793108105e-06, + "loss": 0.9234, + "step": 63260 + }, + { + "epoch": 0.7907947698692467, + "grad_norm": 5.72921895980835, + "learning_rate": 2.5500304082467643e-06, + "loss": 1.6078, + "step": 63262 + }, + { + "epoch": 0.7908197704942623, + "grad_norm": 10.025237083435059, + "learning_rate": 2.549448293920629e-06, + "loss": 1.1071, + "step": 63264 + }, + { + "epoch": 0.790844771119278, + "grad_norm": 7.2797393798828125, + "learning_rate": 2.5488662363368453e-06, + "loss": 1.2616, + "step": 63266 + }, + { + "epoch": 0.7908697717442936, + "grad_norm": 4.03243350982666, + "learning_rate": 2.5482842354998396e-06, + "loss": 1.8137, + "step": 63268 + }, + { + "epoch": 0.7908947723693093, + "grad_norm": 0.00020536994270514697, + "learning_rate": 2.54770229141405e-06, + "loss": 0.0696, + "step": 63270 + }, + { + "epoch": 0.7909197729943248, + "grad_norm": 3.8324193954467773, + "learning_rate": 2.5471204040839037e-06, + "loss": 0.389, + "step": 63272 + }, + { + "epoch": 0.7909447736193405, + "grad_norm": 9.878703117370605, + "learning_rate": 2.546538573513837e-06, + "loss": 1.0842, + "step": 63274 + }, + { + "epoch": 0.7909697742443561, + "grad_norm": 0.00021602670312859118, + "learning_rate": 2.545956799708279e-06, + "loss": 0.5702, + "step": 63276 + }, + { + "epoch": 0.7909947748693718, + "grad_norm": 2.4650816917419434, + "learning_rate": 2.5453750826716563e-06, + "loss": 0.6585, + "step": 63278 + }, + { + "epoch": 0.7910197754943874, + "grad_norm": 0.20188167691230774, + "learning_rate": 2.544793422408406e-06, + "loss": 0.5552, + "step": 63280 + }, + { + "epoch": 0.7910447761194029, + "grad_norm": 0.0002470080798957497, + "learning_rate": 2.5442118189229525e-06, + "loss": 0.0879, + "step": 63282 + }, + { + "epoch": 0.7910697767444186, + "grad_norm": 3.3709487915039062, + "learning_rate": 2.5436302722197295e-06, + "loss": 0.9684, + "step": 63284 + }, + { + "epoch": 0.7910947773694342, + "grad_norm": 0.00019345844339113683, + "learning_rate": 2.5430487823031613e-06, + "loss": 0.0, + "step": 63286 + }, + { + "epoch": 0.7911197779944499, + "grad_norm": 1.7222075462341309, + "learning_rate": 2.542467349177684e-06, + "loss": 0.1233, + "step": 63288 + }, + { + "epoch": 0.7911447786194655, + "grad_norm": 3.43668794631958, + "learning_rate": 2.541885972847716e-06, + "loss": 0.8104, + "step": 63290 + }, + { + "epoch": 0.7911697792444811, + "grad_norm": 1.995893120765686, + "learning_rate": 2.54130465331769e-06, + "loss": 0.3126, + "step": 63292 + }, + { + "epoch": 0.7911947798694967, + "grad_norm": 3.968372344970703, + "learning_rate": 2.540723390592035e-06, + "loss": 1.3873, + "step": 63294 + }, + { + "epoch": 0.7912197804945124, + "grad_norm": 3.264866590499878, + "learning_rate": 2.540142184675174e-06, + "loss": 0.2271, + "step": 63296 + }, + { + "epoch": 0.791244781119528, + "grad_norm": 0.04005701094865799, + "learning_rate": 2.539561035571537e-06, + "loss": 0.5076, + "step": 63298 + }, + { + "epoch": 0.7912697817445437, + "grad_norm": 3.0176889896392822, + "learning_rate": 2.5389799432855453e-06, + "loss": 1.7325, + "step": 63300 + }, + { + "epoch": 0.7912947823695592, + "grad_norm": 4.153360843658447, + "learning_rate": 2.538398907821631e-06, + "loss": 1.447, + "step": 63302 + }, + { + "epoch": 0.7913197829945748, + "grad_norm": 4.103911876678467, + "learning_rate": 2.5378179291842155e-06, + "loss": 0.7685, + "step": 63304 + }, + { + "epoch": 0.7913447836195905, + "grad_norm": 8.068717956542969, + "learning_rate": 2.5372370073777196e-06, + "loss": 1.9389, + "step": 63306 + }, + { + "epoch": 0.7913697842446061, + "grad_norm": 0.32681143283843994, + "learning_rate": 2.536656142406575e-06, + "loss": 0.5132, + "step": 63308 + }, + { + "epoch": 0.7913947848696218, + "grad_norm": 6.237192153930664, + "learning_rate": 2.5360753342751975e-06, + "loss": 1.6055, + "step": 63310 + }, + { + "epoch": 0.7914197854946373, + "grad_norm": 2.896562337875366, + "learning_rate": 2.5354945829880196e-06, + "loss": 0.7481, + "step": 63312 + }, + { + "epoch": 0.791444786119653, + "grad_norm": 2.814612865447998, + "learning_rate": 2.5349138885494585e-06, + "loss": 0.5322, + "step": 63314 + }, + { + "epoch": 0.7914697867446686, + "grad_norm": 2.935377836227417, + "learning_rate": 2.534333250963934e-06, + "loss": 0.4821, + "step": 63316 + }, + { + "epoch": 0.7914947873696843, + "grad_norm": 0.8356362581253052, + "learning_rate": 2.533752670235875e-06, + "loss": 0.3287, + "step": 63318 + }, + { + "epoch": 0.7915197879946999, + "grad_norm": 2.606569528579712, + "learning_rate": 2.533172146369698e-06, + "loss": 1.1161, + "step": 63320 + }, + { + "epoch": 0.7915447886197154, + "grad_norm": 3.609267473220825, + "learning_rate": 2.532591679369828e-06, + "loss": 0.7401, + "step": 63322 + }, + { + "epoch": 0.7915697892447311, + "grad_norm": 0.0007879878976382315, + "learning_rate": 2.5320112692406808e-06, + "loss": 0.2182, + "step": 63324 + }, + { + "epoch": 0.7915947898697467, + "grad_norm": 3.8506808280944824, + "learning_rate": 2.5314309159866833e-06, + "loss": 0.5971, + "step": 63326 + }, + { + "epoch": 0.7916197904947624, + "grad_norm": 4.143181324005127, + "learning_rate": 2.5308506196122506e-06, + "loss": 1.0099, + "step": 63328 + }, + { + "epoch": 0.791644791119778, + "grad_norm": 5.376300811767578, + "learning_rate": 2.530270380121801e-06, + "loss": 1.2352, + "step": 63330 + }, + { + "epoch": 0.7916697917447936, + "grad_norm": 2.9823741912841797, + "learning_rate": 2.5296901975197598e-06, + "loss": 0.4796, + "step": 63332 + }, + { + "epoch": 0.7916947923698092, + "grad_norm": 3.2414400577545166, + "learning_rate": 2.5291100718105375e-06, + "loss": 1.1983, + "step": 63334 + }, + { + "epoch": 0.7917197929948249, + "grad_norm": 4.009243965148926, + "learning_rate": 2.5285300029985593e-06, + "loss": 1.7442, + "step": 63336 + }, + { + "epoch": 0.7917447936198405, + "grad_norm": 2.6035878658294678, + "learning_rate": 2.5279499910882377e-06, + "loss": 0.3864, + "step": 63338 + }, + { + "epoch": 0.7917697942448562, + "grad_norm": 4.327352046966553, + "learning_rate": 2.5273700360839946e-06, + "loss": 0.6626, + "step": 63340 + }, + { + "epoch": 0.7917947948698717, + "grad_norm": 2.0314817428588867, + "learning_rate": 2.526790137990246e-06, + "loss": 0.3711, + "step": 63342 + }, + { + "epoch": 0.7918197954948873, + "grad_norm": 2.839102268218994, + "learning_rate": 2.5262102968114035e-06, + "loss": 2.0287, + "step": 63344 + }, + { + "epoch": 0.791844796119903, + "grad_norm": 0.0003503946354612708, + "learning_rate": 2.5256305125518887e-06, + "loss": 0.6912, + "step": 63346 + }, + { + "epoch": 0.7918697967449186, + "grad_norm": 0.05122537538409233, + "learning_rate": 2.5250507852161123e-06, + "loss": 0.6382, + "step": 63348 + }, + { + "epoch": 0.7918947973699343, + "grad_norm": 2.6432881355285645, + "learning_rate": 2.5244711148084944e-06, + "loss": 0.4872, + "step": 63350 + }, + { + "epoch": 0.7919197979949498, + "grad_norm": 3.1018409729003906, + "learning_rate": 2.523891501333445e-06, + "loss": 1.5659, + "step": 63352 + }, + { + "epoch": 0.7919447986199655, + "grad_norm": 4.961455345153809, + "learning_rate": 2.5233119447953836e-06, + "loss": 1.6497, + "step": 63354 + }, + { + "epoch": 0.7919697992449811, + "grad_norm": 3.5098090171813965, + "learning_rate": 2.522732445198721e-06, + "loss": 1.1707, + "step": 63356 + }, + { + "epoch": 0.7919947998699968, + "grad_norm": 2.473823308944702, + "learning_rate": 2.522153002547868e-06, + "loss": 0.11, + "step": 63358 + }, + { + "epoch": 0.7920198004950124, + "grad_norm": 2.7231924533843994, + "learning_rate": 2.5215736168472436e-06, + "loss": 0.8858, + "step": 63360 + }, + { + "epoch": 0.792044801120028, + "grad_norm": 2.7340798377990723, + "learning_rate": 2.5209942881012527e-06, + "loss": 0.4785, + "step": 63362 + }, + { + "epoch": 0.7920698017450436, + "grad_norm": 0.7378794550895691, + "learning_rate": 2.520415016314316e-06, + "loss": 0.615, + "step": 63364 + }, + { + "epoch": 0.7920948023700592, + "grad_norm": 2.3554513454437256, + "learning_rate": 2.519835801490841e-06, + "loss": 0.2301, + "step": 63366 + }, + { + "epoch": 0.7921198029950749, + "grad_norm": 2.92893648147583, + "learning_rate": 2.519256643635235e-06, + "loss": 1.0161, + "step": 63368 + }, + { + "epoch": 0.7921448036200905, + "grad_norm": 3.1768033504486084, + "learning_rate": 2.518677542751915e-06, + "loss": 0.7272, + "step": 63370 + }, + { + "epoch": 0.7921698042451061, + "grad_norm": 4.586240291595459, + "learning_rate": 2.5180984988452863e-06, + "loss": 1.9911, + "step": 63372 + }, + { + "epoch": 0.7921948048701217, + "grad_norm": 4.769920349121094, + "learning_rate": 2.517519511919765e-06, + "loss": 1.4048, + "step": 63374 + }, + { + "epoch": 0.7922198054951374, + "grad_norm": 3.1116411685943604, + "learning_rate": 2.5169405819797534e-06, + "loss": 0.899, + "step": 63376 + }, + { + "epoch": 0.792244806120153, + "grad_norm": 4.285565376281738, + "learning_rate": 2.516361709029668e-06, + "loss": 0.7232, + "step": 63378 + }, + { + "epoch": 0.7922698067451687, + "grad_norm": 0.39458581805229187, + "learning_rate": 2.5157828930739127e-06, + "loss": 0.2829, + "step": 63380 + }, + { + "epoch": 0.7922948073701842, + "grad_norm": 3.69518780708313, + "learning_rate": 2.515204134116893e-06, + "loss": 1.0294, + "step": 63382 + }, + { + "epoch": 0.7923198079951999, + "grad_norm": 2.9570491313934326, + "learning_rate": 2.5146254321630237e-06, + "loss": 1.1138, + "step": 63384 + }, + { + "epoch": 0.7923448086202155, + "grad_norm": 1.45645010471344, + "learning_rate": 2.514046787216706e-06, + "loss": 0.8299, + "step": 63386 + }, + { + "epoch": 0.7923698092452311, + "grad_norm": 4.217167377471924, + "learning_rate": 2.5134681992823518e-06, + "loss": 1.525, + "step": 63388 + }, + { + "epoch": 0.7923948098702468, + "grad_norm": 3.8171567916870117, + "learning_rate": 2.5128896683643634e-06, + "loss": 1.1958, + "step": 63390 + }, + { + "epoch": 0.7924198104952623, + "grad_norm": 2.7604706287384033, + "learning_rate": 2.5123111944671495e-06, + "loss": 0.4371, + "step": 63392 + }, + { + "epoch": 0.792444811120278, + "grad_norm": 2.674809455871582, + "learning_rate": 2.5117327775951163e-06, + "loss": 0.8564, + "step": 63394 + }, + { + "epoch": 0.7924698117452936, + "grad_norm": 2.277446985244751, + "learning_rate": 2.5111544177526636e-06, + "loss": 0.3559, + "step": 63396 + }, + { + "epoch": 0.7924948123703093, + "grad_norm": 3.231842517852783, + "learning_rate": 2.510576114944203e-06, + "loss": 1.635, + "step": 63398 + }, + { + "epoch": 0.7925198129953249, + "grad_norm": 0.0002443710982333869, + "learning_rate": 2.509997869174132e-06, + "loss": 0.5043, + "step": 63400 + }, + { + "epoch": 0.7925448136203405, + "grad_norm": 1.726828932762146, + "learning_rate": 2.509419680446862e-06, + "loss": 1.0341, + "step": 63402 + }, + { + "epoch": 0.7925698142453561, + "grad_norm": 4.0448760986328125, + "learning_rate": 2.5088415487667892e-06, + "loss": 0.863, + "step": 63404 + }, + { + "epoch": 0.7925948148703718, + "grad_norm": 3.3081374168395996, + "learning_rate": 2.5082634741383226e-06, + "loss": 0.8876, + "step": 63406 + }, + { + "epoch": 0.7926198154953874, + "grad_norm": 1.5236667394638062, + "learning_rate": 2.5076854565658615e-06, + "loss": 0.3185, + "step": 63408 + }, + { + "epoch": 0.792644816120403, + "grad_norm": 1.275486946105957, + "learning_rate": 2.5071074960538067e-06, + "loss": 0.4358, + "step": 63410 + }, + { + "epoch": 0.7926698167454186, + "grad_norm": 2.364764451980591, + "learning_rate": 2.5065295926065635e-06, + "loss": 0.484, + "step": 63412 + }, + { + "epoch": 0.7926948173704342, + "grad_norm": 0.9385095834732056, + "learning_rate": 2.505951746228529e-06, + "loss": 0.3591, + "step": 63414 + }, + { + "epoch": 0.7927198179954499, + "grad_norm": 5.775638103485107, + "learning_rate": 2.505373956924109e-06, + "loss": 1.701, + "step": 63416 + }, + { + "epoch": 0.7927448186204655, + "grad_norm": 3.8964765071868896, + "learning_rate": 2.504796224697701e-06, + "loss": 0.8769, + "step": 63418 + }, + { + "epoch": 0.7927698192454812, + "grad_norm": 3.424619436264038, + "learning_rate": 2.504218549553702e-06, + "loss": 0.5924, + "step": 63420 + }, + { + "epoch": 0.7927948198704967, + "grad_norm": 0.29947689175605774, + "learning_rate": 2.503640931496517e-06, + "loss": 0.0169, + "step": 63422 + }, + { + "epoch": 0.7928198204955124, + "grad_norm": 5.244414329528809, + "learning_rate": 2.503063370530541e-06, + "loss": 1.1559, + "step": 63424 + }, + { + "epoch": 0.792844821120528, + "grad_norm": 0.002135267946869135, + "learning_rate": 2.5024858666601757e-06, + "loss": 0.6009, + "step": 63426 + }, + { + "epoch": 0.7928698217455437, + "grad_norm": 2.906768798828125, + "learning_rate": 2.5019084198898156e-06, + "loss": 0.836, + "step": 63428 + }, + { + "epoch": 0.7928948223705593, + "grad_norm": 1.8589458465576172, + "learning_rate": 2.5013310302238634e-06, + "loss": 0.2594, + "step": 63430 + }, + { + "epoch": 0.7929198229955748, + "grad_norm": 3.6570160388946533, + "learning_rate": 2.5007536976667135e-06, + "loss": 1.1065, + "step": 63432 + }, + { + "epoch": 0.7929448236205905, + "grad_norm": 2.5887842178344727, + "learning_rate": 2.500176422222761e-06, + "loss": 0.8606, + "step": 63434 + }, + { + "epoch": 0.7929698242456061, + "grad_norm": 3.0441431999206543, + "learning_rate": 2.4995992038964056e-06, + "loss": 0.6903, + "step": 63436 + }, + { + "epoch": 0.7929948248706218, + "grad_norm": 4.277717113494873, + "learning_rate": 2.499022042692041e-06, + "loss": 1.2121, + "step": 63438 + }, + { + "epoch": 0.7930198254956374, + "grad_norm": 2.092824935913086, + "learning_rate": 2.4984449386140653e-06, + "loss": 0.1777, + "step": 63440 + }, + { + "epoch": 0.793044826120653, + "grad_norm": 3.4078328609466553, + "learning_rate": 2.49786789166687e-06, + "loss": 0.9624, + "step": 63442 + }, + { + "epoch": 0.7930698267456686, + "grad_norm": 4.934790134429932, + "learning_rate": 2.497290901854854e-06, + "loss": 0.2681, + "step": 63444 + }, + { + "epoch": 0.7930948273706843, + "grad_norm": 1.9763402938842773, + "learning_rate": 2.4967139691824106e-06, + "loss": 0.9659, + "step": 63446 + }, + { + "epoch": 0.7931198279956999, + "grad_norm": 0.09086548537015915, + "learning_rate": 2.496137093653929e-06, + "loss": 0.2662, + "step": 63448 + }, + { + "epoch": 0.7931448286207156, + "grad_norm": 1.5403848886489868, + "learning_rate": 2.4955602752738084e-06, + "loss": 0.799, + "step": 63450 + }, + { + "epoch": 0.7931698292457311, + "grad_norm": 5.444638252258301, + "learning_rate": 2.4949835140464373e-06, + "loss": 1.9098, + "step": 63452 + }, + { + "epoch": 0.7931948298707467, + "grad_norm": 0.00036387768341228366, + "learning_rate": 2.494406809976213e-06, + "loss": 0.0156, + "step": 63454 + }, + { + "epoch": 0.7932198304957624, + "grad_norm": 2.3806896209716797, + "learning_rate": 2.4938301630675233e-06, + "loss": 1.2441, + "step": 63456 + }, + { + "epoch": 0.793244831120778, + "grad_norm": 2.5032660961151123, + "learning_rate": 2.493253573324763e-06, + "loss": 0.4086, + "step": 63458 + }, + { + "epoch": 0.7932698317457937, + "grad_norm": 0.02287299744784832, + "learning_rate": 2.492677040752323e-06, + "loss": 0.4536, + "step": 63460 + }, + { + "epoch": 0.7932948323708092, + "grad_norm": 2.912365674972534, + "learning_rate": 2.4921005653545892e-06, + "loss": 0.568, + "step": 63462 + }, + { + "epoch": 0.7933198329958249, + "grad_norm": 4.297474384307861, + "learning_rate": 2.4915241471359596e-06, + "loss": 1.8444, + "step": 63464 + }, + { + "epoch": 0.7933448336208405, + "grad_norm": 0.22559164464473724, + "learning_rate": 2.490947786100817e-06, + "loss": 0.0999, + "step": 63466 + }, + { + "epoch": 0.7933698342458562, + "grad_norm": 5.083062648773193, + "learning_rate": 2.490371482253557e-06, + "loss": 0.93, + "step": 63468 + }, + { + "epoch": 0.7933948348708718, + "grad_norm": 4.102453231811523, + "learning_rate": 2.4897952355985665e-06, + "loss": 1.3202, + "step": 63470 + }, + { + "epoch": 0.7934198354958873, + "grad_norm": 5.92770528793335, + "learning_rate": 2.4892190461402298e-06, + "loss": 0.8667, + "step": 63472 + }, + { + "epoch": 0.793444836120903, + "grad_norm": 4.787564277648926, + "learning_rate": 2.4886429138829415e-06, + "loss": 2.0008, + "step": 63474 + }, + { + "epoch": 0.7934698367459186, + "grad_norm": 4.537515163421631, + "learning_rate": 2.4880668388310835e-06, + "loss": 0.7158, + "step": 63476 + }, + { + "epoch": 0.7934948373709343, + "grad_norm": 3.4776580333709717, + "learning_rate": 2.4874908209890504e-06, + "loss": 0.8463, + "step": 63478 + }, + { + "epoch": 0.7935198379959499, + "grad_norm": 4.726982593536377, + "learning_rate": 2.4869148603612214e-06, + "loss": 1.4662, + "step": 63480 + }, + { + "epoch": 0.7935448386209655, + "grad_norm": 2.3167808055877686, + "learning_rate": 2.486338956951988e-06, + "loss": 1.5376, + "step": 63482 + }, + { + "epoch": 0.7935698392459811, + "grad_norm": 1.8849976062774658, + "learning_rate": 2.4857631107657366e-06, + "loss": 0.5348, + "step": 63484 + }, + { + "epoch": 0.7935948398709968, + "grad_norm": 2.589339017868042, + "learning_rate": 2.4851873218068467e-06, + "loss": 0.9676, + "step": 63486 + }, + { + "epoch": 0.7936198404960124, + "grad_norm": 1.5509196519851685, + "learning_rate": 2.48461159007971e-06, + "loss": 0.5879, + "step": 63488 + }, + { + "epoch": 0.7936448411210281, + "grad_norm": 3.1227059364318848, + "learning_rate": 2.4840359155887063e-06, + "loss": 0.8144, + "step": 63490 + }, + { + "epoch": 0.7936698417460436, + "grad_norm": 0.0004183082783129066, + "learning_rate": 2.483460298338225e-06, + "loss": 0.0, + "step": 63492 + }, + { + "epoch": 0.7936948423710593, + "grad_norm": 0.12868927419185638, + "learning_rate": 2.4828847383326437e-06, + "loss": 1.1744, + "step": 63494 + }, + { + "epoch": 0.7937198429960749, + "grad_norm": 0.0007920754142105579, + "learning_rate": 2.48230923557635e-06, + "loss": 0.7794, + "step": 63496 + }, + { + "epoch": 0.7937448436210905, + "grad_norm": 8.030648231506348, + "learning_rate": 2.4817337900737315e-06, + "loss": 2.1347, + "step": 63498 + }, + { + "epoch": 0.7937698442461062, + "grad_norm": 4.447685241699219, + "learning_rate": 2.48115840182916e-06, + "loss": 2.1513, + "step": 63500 + }, + { + "epoch": 0.7937948448711217, + "grad_norm": 5.845694065093994, + "learning_rate": 2.480583070847026e-06, + "loss": 1.2147, + "step": 63502 + }, + { + "epoch": 0.7938198454961374, + "grad_norm": 3.6116626262664795, + "learning_rate": 2.4800077971317037e-06, + "loss": 1.1246, + "step": 63504 + }, + { + "epoch": 0.793844846121153, + "grad_norm": 3.415031671524048, + "learning_rate": 2.479432580687583e-06, + "loss": 1.3023, + "step": 63506 + }, + { + "epoch": 0.7938698467461687, + "grad_norm": 3.1831793785095215, + "learning_rate": 2.4788574215190365e-06, + "loss": 0.4891, + "step": 63508 + }, + { + "epoch": 0.7938948473711843, + "grad_norm": 3.916071653366089, + "learning_rate": 2.478282319630451e-06, + "loss": 1.002, + "step": 63510 + }, + { + "epoch": 0.7939198479961999, + "grad_norm": 4.498559474945068, + "learning_rate": 2.4777072750262045e-06, + "loss": 1.0264, + "step": 63512 + }, + { + "epoch": 0.7939448486212155, + "grad_norm": 6.1682000160217285, + "learning_rate": 2.477132287710672e-06, + "loss": 0.9132, + "step": 63514 + }, + { + "epoch": 0.7939698492462312, + "grad_norm": 2.3059933185577393, + "learning_rate": 2.4765573576882383e-06, + "loss": 0.6385, + "step": 63516 + }, + { + "epoch": 0.7939948498712468, + "grad_norm": 3.9135477542877197, + "learning_rate": 2.4759824849632786e-06, + "loss": 0.9449, + "step": 63518 + }, + { + "epoch": 0.7940198504962624, + "grad_norm": 0.018266018480062485, + "learning_rate": 2.475407669540174e-06, + "loss": 0.0072, + "step": 63520 + }, + { + "epoch": 0.794044851121278, + "grad_norm": 3.3266172409057617, + "learning_rate": 2.4748329114232984e-06, + "loss": 1.3438, + "step": 63522 + }, + { + "epoch": 0.7940698517462936, + "grad_norm": 7.31752872467041, + "learning_rate": 2.4742582106170343e-06, + "loss": 1.7428, + "step": 63524 + }, + { + "epoch": 0.7940948523713093, + "grad_norm": 0.09540510922670364, + "learning_rate": 2.473683567125755e-06, + "loss": 0.1347, + "step": 63526 + }, + { + "epoch": 0.7941198529963249, + "grad_norm": 5.2775702476501465, + "learning_rate": 2.4731089809538345e-06, + "loss": 1.884, + "step": 63528 + }, + { + "epoch": 0.7941448536213406, + "grad_norm": 2.6708884239196777, + "learning_rate": 2.4725344521056547e-06, + "loss": 0.6601, + "step": 63530 + }, + { + "epoch": 0.7941698542463561, + "grad_norm": 3.4790501594543457, + "learning_rate": 2.4719599805855843e-06, + "loss": 1.715, + "step": 63532 + }, + { + "epoch": 0.7941948548713718, + "grad_norm": 4.021480083465576, + "learning_rate": 2.4713855663980023e-06, + "loss": 0.5695, + "step": 63534 + }, + { + "epoch": 0.7942198554963874, + "grad_norm": 2.741940498352051, + "learning_rate": 2.4708112095472905e-06, + "loss": 0.4995, + "step": 63536 + }, + { + "epoch": 0.794244856121403, + "grad_norm": 3.428651809692383, + "learning_rate": 2.470236910037809e-06, + "loss": 1.755, + "step": 63538 + }, + { + "epoch": 0.7942698567464187, + "grad_norm": 4.838274955749512, + "learning_rate": 2.469662667873941e-06, + "loss": 0.6793, + "step": 63540 + }, + { + "epoch": 0.7942948573714342, + "grad_norm": 0.5653951168060303, + "learning_rate": 2.469088483060054e-06, + "loss": 0.4429, + "step": 63542 + }, + { + "epoch": 0.7943198579964499, + "grad_norm": 2.802265167236328, + "learning_rate": 2.4685143556005276e-06, + "loss": 0.9161, + "step": 63544 + }, + { + "epoch": 0.7943448586214655, + "grad_norm": 3.439753532409668, + "learning_rate": 2.467940285499728e-06, + "loss": 0.5857, + "step": 63546 + }, + { + "epoch": 0.7943698592464812, + "grad_norm": 3.0278890132904053, + "learning_rate": 2.4673662727620297e-06, + "loss": 0.8988, + "step": 63548 + }, + { + "epoch": 0.7943948598714968, + "grad_norm": 0.9105360507965088, + "learning_rate": 2.4667923173918106e-06, + "loss": 0.039, + "step": 63550 + }, + { + "epoch": 0.7944198604965124, + "grad_norm": 2.843085289001465, + "learning_rate": 2.46621841939343e-06, + "loss": 0.8918, + "step": 63552 + }, + { + "epoch": 0.794444861121528, + "grad_norm": 4.077816009521484, + "learning_rate": 2.465644578771268e-06, + "loss": 1.1271, + "step": 63554 + }, + { + "epoch": 0.7944698617465437, + "grad_norm": 2.7688193321228027, + "learning_rate": 2.465070795529688e-06, + "loss": 0.5486, + "step": 63556 + }, + { + "epoch": 0.7944948623715593, + "grad_norm": 3.3719871044158936, + "learning_rate": 2.464497069673064e-06, + "loss": 1.1837, + "step": 63558 + }, + { + "epoch": 0.794519862996575, + "grad_norm": 8.722612380981445, + "learning_rate": 2.4639234012057666e-06, + "loss": 1.607, + "step": 63560 + }, + { + "epoch": 0.7945448636215905, + "grad_norm": 0.00026452698512002826, + "learning_rate": 2.4633497901321636e-06, + "loss": 0.6921, + "step": 63562 + }, + { + "epoch": 0.7945698642466061, + "grad_norm": 4.8124613761901855, + "learning_rate": 2.4627762364566233e-06, + "loss": 0.6223, + "step": 63564 + }, + { + "epoch": 0.7945948648716218, + "grad_norm": 0.2945779263973236, + "learning_rate": 2.462202740183509e-06, + "loss": 0.6083, + "step": 63566 + }, + { + "epoch": 0.7946198654966374, + "grad_norm": 1.6395338773727417, + "learning_rate": 2.4616293013171966e-06, + "loss": 0.1082, + "step": 63568 + }, + { + "epoch": 0.7946448661216531, + "grad_norm": 6.011685848236084, + "learning_rate": 2.4610559198620465e-06, + "loss": 0.8391, + "step": 63570 + }, + { + "epoch": 0.7946698667466686, + "grad_norm": 2.37724232673645, + "learning_rate": 2.4604825958224276e-06, + "loss": 0.0861, + "step": 63572 + }, + { + "epoch": 0.7946948673716843, + "grad_norm": 0.0003915422421414405, + "learning_rate": 2.459909329202711e-06, + "loss": 0.0, + "step": 63574 + }, + { + "epoch": 0.7947198679966999, + "grad_norm": 1.5656133890151978, + "learning_rate": 2.4593361200072574e-06, + "loss": 0.6603, + "step": 63576 + }, + { + "epoch": 0.7947448686217156, + "grad_norm": 3.4351749420166016, + "learning_rate": 2.4587629682404345e-06, + "loss": 0.9578, + "step": 63578 + }, + { + "epoch": 0.7947698692467312, + "grad_norm": 2.324254035949707, + "learning_rate": 2.4581898739066035e-06, + "loss": 1.0019, + "step": 63580 + }, + { + "epoch": 0.7947948698717467, + "grad_norm": 3.7595651149749756, + "learning_rate": 2.4576168370101318e-06, + "loss": 2.0242, + "step": 63582 + }, + { + "epoch": 0.7948198704967624, + "grad_norm": 3.344027280807495, + "learning_rate": 2.457043857555387e-06, + "loss": 1.2752, + "step": 63584 + }, + { + "epoch": 0.794844871121778, + "grad_norm": 6.575798511505127, + "learning_rate": 2.4564709355467253e-06, + "loss": 1.5287, + "step": 63586 + }, + { + "epoch": 0.7948698717467937, + "grad_norm": 4.905905246734619, + "learning_rate": 2.455898070988522e-06, + "loss": 0.9044, + "step": 63588 + }, + { + "epoch": 0.7948948723718093, + "grad_norm": 6.539172649383545, + "learning_rate": 2.4553252638851243e-06, + "loss": 0.2289, + "step": 63590 + }, + { + "epoch": 0.7949198729968249, + "grad_norm": 0.06756862998008728, + "learning_rate": 2.454752514240907e-06, + "loss": 0.0935, + "step": 63592 + }, + { + "epoch": 0.7949448736218405, + "grad_norm": 3.5879290103912354, + "learning_rate": 2.454179822060223e-06, + "loss": 0.6193, + "step": 63594 + }, + { + "epoch": 0.7949698742468562, + "grad_norm": 2.787517786026001, + "learning_rate": 2.45360718734744e-06, + "loss": 1.8715, + "step": 63596 + }, + { + "epoch": 0.7949948748718718, + "grad_norm": 0.0006826039170846343, + "learning_rate": 2.4530346101069193e-06, + "loss": 0.2203, + "step": 63598 + }, + { + "epoch": 0.7950198754968875, + "grad_norm": 3.308065891265869, + "learning_rate": 2.4524620903430176e-06, + "loss": 0.9298, + "step": 63600 + }, + { + "epoch": 0.795044876121903, + "grad_norm": 2.693528652191162, + "learning_rate": 2.4518896280601023e-06, + "loss": 0.4569, + "step": 63602 + }, + { + "epoch": 0.7950698767469186, + "grad_norm": 5.663914203643799, + "learning_rate": 2.4513172232625227e-06, + "loss": 1.4608, + "step": 63604 + }, + { + "epoch": 0.7950948773719343, + "grad_norm": 3.4722790718078613, + "learning_rate": 2.450744875954647e-06, + "loss": 1.7186, + "step": 63606 + }, + { + "epoch": 0.7951198779969499, + "grad_norm": 3.818359851837158, + "learning_rate": 2.450172586140828e-06, + "loss": 0.5615, + "step": 63608 + }, + { + "epoch": 0.7951448786219656, + "grad_norm": 7.690761566162109, + "learning_rate": 2.4496003538254265e-06, + "loss": 1.2073, + "step": 63610 + }, + { + "epoch": 0.7951698792469811, + "grad_norm": 2.636457920074463, + "learning_rate": 2.4490281790128033e-06, + "loss": 0.7158, + "step": 63612 + }, + { + "epoch": 0.7951948798719968, + "grad_norm": 0.0004958666395395994, + "learning_rate": 2.448456061707315e-06, + "loss": 0.5125, + "step": 63614 + }, + { + "epoch": 0.7952198804970124, + "grad_norm": 3.1226160526275635, + "learning_rate": 2.4478840019133164e-06, + "loss": 1.0468, + "step": 63616 + }, + { + "epoch": 0.7952448811220281, + "grad_norm": 4.866318225860596, + "learning_rate": 2.4473119996351636e-06, + "loss": 1.3005, + "step": 63618 + }, + { + "epoch": 0.7952698817470437, + "grad_norm": 0.0006145549123175442, + "learning_rate": 2.4467400548772136e-06, + "loss": 0.6481, + "step": 63620 + }, + { + "epoch": 0.7952948823720593, + "grad_norm": 3.7988579273223877, + "learning_rate": 2.4461681676438253e-06, + "loss": 0.5689, + "step": 63622 + }, + { + "epoch": 0.7953198829970749, + "grad_norm": 1.6627002954483032, + "learning_rate": 2.445596337939351e-06, + "loss": 0.7103, + "step": 63624 + }, + { + "epoch": 0.7953448836220905, + "grad_norm": 3.5061843395233154, + "learning_rate": 2.4450245657681483e-06, + "loss": 0.933, + "step": 63626 + }, + { + "epoch": 0.7953698842471062, + "grad_norm": 0.0003580487973522395, + "learning_rate": 2.44445285113457e-06, + "loss": 0.4368, + "step": 63628 + }, + { + "epoch": 0.7953948848721218, + "grad_norm": 0.3316511809825897, + "learning_rate": 2.4438811940429706e-06, + "loss": 0.5097, + "step": 63630 + }, + { + "epoch": 0.7954198854971374, + "grad_norm": 0.00031633349135518074, + "learning_rate": 2.4433095944977016e-06, + "loss": 0.1522, + "step": 63632 + }, + { + "epoch": 0.795444886122153, + "grad_norm": 0.7698673605918884, + "learning_rate": 2.4427380525031176e-06, + "loss": 0.1619, + "step": 63634 + }, + { + "epoch": 0.7954698867471687, + "grad_norm": 4.314940452575684, + "learning_rate": 2.442166568063574e-06, + "loss": 1.2954, + "step": 63636 + }, + { + "epoch": 0.7954948873721843, + "grad_norm": 4.528428077697754, + "learning_rate": 2.441595141183419e-06, + "loss": 1.4676, + "step": 63638 + }, + { + "epoch": 0.7955198879972, + "grad_norm": 3.380760431289673, + "learning_rate": 2.4410237718670114e-06, + "loss": 1.591, + "step": 63640 + }, + { + "epoch": 0.7955448886222155, + "grad_norm": 2.800856113433838, + "learning_rate": 2.440452460118693e-06, + "loss": 0.3053, + "step": 63642 + }, + { + "epoch": 0.7955698892472312, + "grad_norm": 1.1921640634536743, + "learning_rate": 2.439881205942819e-06, + "loss": 0.56, + "step": 63644 + }, + { + "epoch": 0.7955948898722468, + "grad_norm": 0.0008126032189466059, + "learning_rate": 2.439310009343744e-06, + "loss": 0.2694, + "step": 63646 + }, + { + "epoch": 0.7956198904972624, + "grad_norm": 3.3147597312927246, + "learning_rate": 2.438738870325812e-06, + "loss": 0.8012, + "step": 63648 + }, + { + "epoch": 0.7956448911222781, + "grad_norm": 3.426496744155884, + "learning_rate": 2.4381677888933786e-06, + "loss": 1.4714, + "step": 63650 + }, + { + "epoch": 0.7956698917472936, + "grad_norm": 3.0219967365264893, + "learning_rate": 2.4375967650507857e-06, + "loss": 1.3202, + "step": 63652 + }, + { + "epoch": 0.7956948923723093, + "grad_norm": 6.6507110595703125, + "learning_rate": 2.437025798802394e-06, + "loss": 1.5368, + "step": 63654 + }, + { + "epoch": 0.7957198929973249, + "grad_norm": 5.59628438949585, + "learning_rate": 2.4364548901525374e-06, + "loss": 0.9445, + "step": 63656 + }, + { + "epoch": 0.7957448936223406, + "grad_norm": 3.121802806854248, + "learning_rate": 2.4358840391055704e-06, + "loss": 0.7967, + "step": 63658 + }, + { + "epoch": 0.7957698942473562, + "grad_norm": 1.7359989881515503, + "learning_rate": 2.4353132456658444e-06, + "loss": 0.7245, + "step": 63660 + }, + { + "epoch": 0.7957948948723718, + "grad_norm": 2.8964200019836426, + "learning_rate": 2.434742509837701e-06, + "loss": 0.885, + "step": 63662 + }, + { + "epoch": 0.7958198954973874, + "grad_norm": 4.149097919464111, + "learning_rate": 2.43417183162549e-06, + "loss": 1.5337, + "step": 63664 + }, + { + "epoch": 0.7958448961224031, + "grad_norm": 3.693509817123413, + "learning_rate": 2.433601211033557e-06, + "loss": 0.7274, + "step": 63666 + }, + { + "epoch": 0.7958698967474187, + "grad_norm": 2.0847432613372803, + "learning_rate": 2.433030648066248e-06, + "loss": 0.8076, + "step": 63668 + }, + { + "epoch": 0.7958948973724344, + "grad_norm": 3.4538278579711914, + "learning_rate": 2.4324601427279037e-06, + "loss": 0.7609, + "step": 63670 + }, + { + "epoch": 0.7959198979974499, + "grad_norm": 3.5337882041931152, + "learning_rate": 2.4318896950228732e-06, + "loss": 0.9184, + "step": 63672 + }, + { + "epoch": 0.7959448986224655, + "grad_norm": 2.5964515209198, + "learning_rate": 2.431319304955504e-06, + "loss": 0.7565, + "step": 63674 + }, + { + "epoch": 0.7959698992474812, + "grad_norm": 6.375503063201904, + "learning_rate": 2.4307489725301326e-06, + "loss": 1.5734, + "step": 63676 + }, + { + "epoch": 0.7959948998724968, + "grad_norm": 1.9044597148895264, + "learning_rate": 2.43017869775111e-06, + "loss": 0.2073, + "step": 63678 + }, + { + "epoch": 0.7960199004975125, + "grad_norm": 1.1187045574188232, + "learning_rate": 2.429608480622776e-06, + "loss": 0.8697, + "step": 63680 + }, + { + "epoch": 0.796044901122528, + "grad_norm": 4.9310994148254395, + "learning_rate": 2.4290383211494708e-06, + "loss": 1.0175, + "step": 63682 + }, + { + "epoch": 0.7960699017475437, + "grad_norm": 3.651242971420288, + "learning_rate": 2.4284682193355412e-06, + "loss": 0.5763, + "step": 63684 + }, + { + "epoch": 0.7960949023725593, + "grad_norm": 7.105751991271973, + "learning_rate": 2.427898175185325e-06, + "loss": 2.0034, + "step": 63686 + }, + { + "epoch": 0.796119902997575, + "grad_norm": 2.676333427429199, + "learning_rate": 2.4273281887031676e-06, + "loss": 0.4668, + "step": 63688 + }, + { + "epoch": 0.7961449036225906, + "grad_norm": 4.851684093475342, + "learning_rate": 2.426758259893406e-06, + "loss": 1.2966, + "step": 63690 + }, + { + "epoch": 0.7961699042476061, + "grad_norm": 2.1056969165802, + "learning_rate": 2.426188388760388e-06, + "loss": 0.8815, + "step": 63692 + }, + { + "epoch": 0.7961949048726218, + "grad_norm": 0.000578360806684941, + "learning_rate": 2.4256185753084414e-06, + "loss": 0.7745, + "step": 63694 + }, + { + "epoch": 0.7962199054976374, + "grad_norm": 2.736584424972534, + "learning_rate": 2.425048819541913e-06, + "loss": 0.5367, + "step": 63696 + }, + { + "epoch": 0.7962449061226531, + "grad_norm": 3.0120203495025635, + "learning_rate": 2.424479121465144e-06, + "loss": 0.2811, + "step": 63698 + }, + { + "epoch": 0.7962699067476687, + "grad_norm": 0.49799495935440063, + "learning_rate": 2.4239094810824694e-06, + "loss": 0.1979, + "step": 63700 + }, + { + "epoch": 0.7962949073726843, + "grad_norm": 1.0596997737884521, + "learning_rate": 2.4233398983982303e-06, + "loss": 0.1385, + "step": 63702 + }, + { + "epoch": 0.7963199079976999, + "grad_norm": 3.5863442420959473, + "learning_rate": 2.422770373416761e-06, + "loss": 1.1304, + "step": 63704 + }, + { + "epoch": 0.7963449086227156, + "grad_norm": 0.03902808204293251, + "learning_rate": 2.4222009061424036e-06, + "loss": 0.7946, + "step": 63706 + }, + { + "epoch": 0.7963699092477312, + "grad_norm": 0.19370262324810028, + "learning_rate": 2.421631496579493e-06, + "loss": 0.0307, + "step": 63708 + }, + { + "epoch": 0.7963949098727469, + "grad_norm": 4.707447052001953, + "learning_rate": 2.421062144732361e-06, + "loss": 0.7585, + "step": 63710 + }, + { + "epoch": 0.7964199104977624, + "grad_norm": 11.68078327178955, + "learning_rate": 2.4204928506053525e-06, + "loss": 1.7246, + "step": 63712 + }, + { + "epoch": 0.796444911122778, + "grad_norm": 3.0122382640838623, + "learning_rate": 2.4199236142027947e-06, + "loss": 0.92, + "step": 63714 + }, + { + "epoch": 0.7964699117477937, + "grad_norm": 1.9824517965316772, + "learning_rate": 2.4193544355290297e-06, + "loss": 0.3543, + "step": 63716 + }, + { + "epoch": 0.7964949123728093, + "grad_norm": 2.3243908882141113, + "learning_rate": 2.41878531458839e-06, + "loss": 0.5182, + "step": 63718 + }, + { + "epoch": 0.796519912997825, + "grad_norm": 4.1468048095703125, + "learning_rate": 2.4182162513852058e-06, + "loss": 0.8372, + "step": 63720 + }, + { + "epoch": 0.7965449136228405, + "grad_norm": 6.425539493560791, + "learning_rate": 2.4176472459238174e-06, + "loss": 0.8833, + "step": 63722 + }, + { + "epoch": 0.7965699142478562, + "grad_norm": 3.115715503692627, + "learning_rate": 2.4170782982085526e-06, + "loss": 0.782, + "step": 63724 + }, + { + "epoch": 0.7965949148728718, + "grad_norm": 4.044013977050781, + "learning_rate": 2.41650940824375e-06, + "loss": 0.8361, + "step": 63726 + }, + { + "epoch": 0.7966199154978875, + "grad_norm": 3.7908520698547363, + "learning_rate": 2.4159405760337363e-06, + "loss": 0.7036, + "step": 63728 + }, + { + "epoch": 0.7966449161229031, + "grad_norm": 3.8525702953338623, + "learning_rate": 2.4153718015828496e-06, + "loss": 1.8911, + "step": 63730 + }, + { + "epoch": 0.7966699167479186, + "grad_norm": 2.0017268657684326, + "learning_rate": 2.4148030848954183e-06, + "loss": 0.0805, + "step": 63732 + }, + { + "epoch": 0.7966949173729343, + "grad_norm": 0.00023012208112049848, + "learning_rate": 2.414234425975772e-06, + "loss": 0.0002, + "step": 63734 + }, + { + "epoch": 0.79671991799795, + "grad_norm": 4.9337615966796875, + "learning_rate": 2.413665824828246e-06, + "loss": 0.7291, + "step": 63736 + }, + { + "epoch": 0.7967449186229656, + "grad_norm": 4.154391288757324, + "learning_rate": 2.413097281457166e-06, + "loss": 0.6472, + "step": 63738 + }, + { + "epoch": 0.7967699192479812, + "grad_norm": 0.00021389358153101057, + "learning_rate": 2.4125287958668664e-06, + "loss": 0.275, + "step": 63740 + }, + { + "epoch": 0.7967949198729968, + "grad_norm": 9.27829647064209, + "learning_rate": 2.4119603680616722e-06, + "loss": 1.4424, + "step": 63742 + }, + { + "epoch": 0.7968199204980124, + "grad_norm": 5.138416290283203, + "learning_rate": 2.4113919980459167e-06, + "loss": 1.0572, + "step": 63744 + }, + { + "epoch": 0.7968449211230281, + "grad_norm": 2.5735700130462646, + "learning_rate": 2.4108236858239277e-06, + "loss": 0.7597, + "step": 63746 + }, + { + "epoch": 0.7968699217480437, + "grad_norm": 4.148698329925537, + "learning_rate": 2.4102554314000293e-06, + "loss": 0.962, + "step": 63748 + }, + { + "epoch": 0.7968949223730594, + "grad_norm": 3.4148967266082764, + "learning_rate": 2.4096872347785538e-06, + "loss": 0.4998, + "step": 63750 + }, + { + "epoch": 0.7969199229980749, + "grad_norm": 3.114100694656372, + "learning_rate": 2.4091190959638255e-06, + "loss": 1.7803, + "step": 63752 + }, + { + "epoch": 0.7969449236230906, + "grad_norm": 3.8589892387390137, + "learning_rate": 2.4085510149601755e-06, + "loss": 1.5987, + "step": 63754 + }, + { + "epoch": 0.7969699242481062, + "grad_norm": 5.694945335388184, + "learning_rate": 2.4079829917719243e-06, + "loss": 1.6453, + "step": 63756 + }, + { + "epoch": 0.7969949248731218, + "grad_norm": 4.800338268280029, + "learning_rate": 2.4074150264034034e-06, + "loss": 1.2864, + "step": 63758 + }, + { + "epoch": 0.7970199254981375, + "grad_norm": 4.323770999908447, + "learning_rate": 2.406847118858936e-06, + "loss": 1.7453, + "step": 63760 + }, + { + "epoch": 0.797044926123153, + "grad_norm": 1.731576919555664, + "learning_rate": 2.406279269142845e-06, + "loss": 0.176, + "step": 63762 + }, + { + "epoch": 0.7970699267481687, + "grad_norm": 2.0745131969451904, + "learning_rate": 2.4057114772594603e-06, + "loss": 0.7581, + "step": 63764 + }, + { + "epoch": 0.7970949273731843, + "grad_norm": 0.6303327083587646, + "learning_rate": 2.4051437432130995e-06, + "loss": 0.8292, + "step": 63766 + }, + { + "epoch": 0.7971199279982, + "grad_norm": 0.609521746635437, + "learning_rate": 2.404576067008093e-06, + "loss": 1.3792, + "step": 63768 + }, + { + "epoch": 0.7971449286232156, + "grad_norm": 2.0954365730285645, + "learning_rate": 2.404008448648758e-06, + "loss": 0.4896, + "step": 63770 + }, + { + "epoch": 0.7971699292482312, + "grad_norm": 3.042295455932617, + "learning_rate": 2.403440888139423e-06, + "loss": 0.3757, + "step": 63772 + }, + { + "epoch": 0.7971949298732468, + "grad_norm": 2.3482131958007812, + "learning_rate": 2.402873385484409e-06, + "loss": 0.506, + "step": 63774 + }, + { + "epoch": 0.7972199304982625, + "grad_norm": 1.2964403629302979, + "learning_rate": 2.4023059406880332e-06, + "loss": 0.4853, + "step": 63776 + }, + { + "epoch": 0.7972449311232781, + "grad_norm": 3.881488800048828, + "learning_rate": 2.4017385537546244e-06, + "loss": 1.8685, + "step": 63778 + }, + { + "epoch": 0.7972699317482937, + "grad_norm": 7.152557373046875, + "learning_rate": 2.401171224688498e-06, + "loss": 2.2501, + "step": 63780 + }, + { + "epoch": 0.7972949323733093, + "grad_norm": 3.796649932861328, + "learning_rate": 2.4006039534939784e-06, + "loss": 0.6403, + "step": 63782 + }, + { + "epoch": 0.7973199329983249, + "grad_norm": 0.00019952855654992163, + "learning_rate": 2.4000367401753856e-06, + "loss": 0.1035, + "step": 63784 + }, + { + "epoch": 0.7973449336233406, + "grad_norm": 0.0006373578798957169, + "learning_rate": 2.3994695847370354e-06, + "loss": 1.1855, + "step": 63786 + }, + { + "epoch": 0.7973699342483562, + "grad_norm": 0.6325730085372925, + "learning_rate": 2.3989024871832524e-06, + "loss": 0.1189, + "step": 63788 + }, + { + "epoch": 0.7973949348733719, + "grad_norm": 4.7221293449401855, + "learning_rate": 2.3983354475183498e-06, + "loss": 1.0489, + "step": 63790 + }, + { + "epoch": 0.7974199354983874, + "grad_norm": 3.4659154415130615, + "learning_rate": 2.3977684657466528e-06, + "loss": 0.7246, + "step": 63792 + }, + { + "epoch": 0.7974449361234031, + "grad_norm": 2.1535184383392334, + "learning_rate": 2.3972015418724737e-06, + "loss": 1.4564, + "step": 63794 + }, + { + "epoch": 0.7974699367484187, + "grad_norm": 3.4798834323883057, + "learning_rate": 2.3966346759001345e-06, + "loss": 1.3664, + "step": 63796 + }, + { + "epoch": 0.7974949373734344, + "grad_norm": 3.1267497539520264, + "learning_rate": 2.396067867833951e-06, + "loss": 0.6292, + "step": 63798 + }, + { + "epoch": 0.79751993799845, + "grad_norm": 5.386789798736572, + "learning_rate": 2.395501117678236e-06, + "loss": 1.2075, + "step": 63800 + }, + { + "epoch": 0.7975449386234655, + "grad_norm": 0.8489389419555664, + "learning_rate": 2.394934425437313e-06, + "loss": 0.0513, + "step": 63802 + }, + { + "epoch": 0.7975699392484812, + "grad_norm": 0.0006872350932098925, + "learning_rate": 2.3943677911154905e-06, + "loss": 0.0, + "step": 63804 + }, + { + "epoch": 0.7975949398734968, + "grad_norm": 1.3911199569702148, + "learning_rate": 2.39380121471709e-06, + "loss": 0.2609, + "step": 63806 + }, + { + "epoch": 0.7976199404985125, + "grad_norm": 0.9395549297332764, + "learning_rate": 2.3932346962464204e-06, + "loss": 0.5457, + "step": 63808 + }, + { + "epoch": 0.7976449411235281, + "grad_norm": 4.550543308258057, + "learning_rate": 2.3926682357078037e-06, + "loss": 0.4624, + "step": 63810 + }, + { + "epoch": 0.7976699417485437, + "grad_norm": 5.851678371429443, + "learning_rate": 2.392101833105549e-06, + "loss": 1.2921, + "step": 63812 + }, + { + "epoch": 0.7976949423735593, + "grad_norm": 0.00036467015161179006, + "learning_rate": 2.391535488443969e-06, + "loss": 0.4529, + "step": 63814 + }, + { + "epoch": 0.797719942998575, + "grad_norm": 6.483273983001709, + "learning_rate": 2.390969201727381e-06, + "loss": 0.5163, + "step": 63816 + }, + { + "epoch": 0.7977449436235906, + "grad_norm": 4.125933647155762, + "learning_rate": 2.390402972960093e-06, + "loss": 1.4427, + "step": 63818 + }, + { + "epoch": 0.7977699442486063, + "grad_norm": 0.0006580568733625114, + "learning_rate": 2.3898368021464226e-06, + "loss": 0.4814, + "step": 63820 + }, + { + "epoch": 0.7977949448736218, + "grad_norm": 3.326220750808716, + "learning_rate": 2.389270689290676e-06, + "loss": 1.589, + "step": 63822 + }, + { + "epoch": 0.7978199454986374, + "grad_norm": 5.685774326324463, + "learning_rate": 2.388704634397171e-06, + "loss": 0.8371, + "step": 63824 + }, + { + "epoch": 0.7978449461236531, + "grad_norm": 2.087242603302002, + "learning_rate": 2.388138637470214e-06, + "loss": 0.8657, + "step": 63826 + }, + { + "epoch": 0.7978699467486687, + "grad_norm": 3.1777520179748535, + "learning_rate": 2.3875726985141144e-06, + "loss": 1.1003, + "step": 63828 + }, + { + "epoch": 0.7978949473736844, + "grad_norm": 0.18811315298080444, + "learning_rate": 2.3870068175331884e-06, + "loss": 0.6335, + "step": 63830 + }, + { + "epoch": 0.7979199479986999, + "grad_norm": 2.4222793579101562, + "learning_rate": 2.386440994531738e-06, + "loss": 0.675, + "step": 63832 + }, + { + "epoch": 0.7979449486237156, + "grad_norm": 0.5591918230056763, + "learning_rate": 2.3858752295140797e-06, + "loss": 0.0225, + "step": 63834 + }, + { + "epoch": 0.7979699492487312, + "grad_norm": 0.00033046334283426404, + "learning_rate": 2.385309522484518e-06, + "loss": 0.0526, + "step": 63836 + }, + { + "epoch": 0.7979949498737469, + "grad_norm": 3.8595259189605713, + "learning_rate": 2.3847438734473605e-06, + "loss": 2.1317, + "step": 63838 + }, + { + "epoch": 0.7980199504987625, + "grad_norm": 6.317294597625732, + "learning_rate": 2.384178282406918e-06, + "loss": 0.2124, + "step": 63840 + }, + { + "epoch": 0.798044951123778, + "grad_norm": 3.5015647411346436, + "learning_rate": 2.3836127493674953e-06, + "loss": 0.9662, + "step": 63842 + }, + { + "epoch": 0.7980699517487937, + "grad_norm": 0.0005370358121581376, + "learning_rate": 2.3830472743334032e-06, + "loss": 0.6644, + "step": 63844 + }, + { + "epoch": 0.7980949523738093, + "grad_norm": 0.0002393345203017816, + "learning_rate": 2.3824818573089436e-06, + "loss": 0.3991, + "step": 63846 + }, + { + "epoch": 0.798119952998825, + "grad_norm": 2.0995662212371826, + "learning_rate": 2.381916498298428e-06, + "loss": 0.1747, + "step": 63848 + }, + { + "epoch": 0.7981449536238406, + "grad_norm": 2.455374002456665, + "learning_rate": 2.3813511973061576e-06, + "loss": 0.4288, + "step": 63850 + }, + { + "epoch": 0.7981699542488562, + "grad_norm": 0.0003164967929478735, + "learning_rate": 2.3807859543364375e-06, + "loss": 1.0894, + "step": 63852 + }, + { + "epoch": 0.7981949548738718, + "grad_norm": 5.646968841552734, + "learning_rate": 2.3802207693935763e-06, + "loss": 1.8022, + "step": 63854 + }, + { + "epoch": 0.7982199554988875, + "grad_norm": 7.51942777633667, + "learning_rate": 2.379655642481874e-06, + "loss": 0.7309, + "step": 63856 + }, + { + "epoch": 0.7982449561239031, + "grad_norm": 1.6442413330078125, + "learning_rate": 2.379090573605639e-06, + "loss": 1.4949, + "step": 63858 + }, + { + "epoch": 0.7982699567489188, + "grad_norm": 0.00027190905530005693, + "learning_rate": 2.3785255627691707e-06, + "loss": 0.7193, + "step": 63860 + }, + { + "epoch": 0.7982949573739343, + "grad_norm": 0.8767200708389282, + "learning_rate": 2.3779606099767748e-06, + "loss": 0.0925, + "step": 63862 + }, + { + "epoch": 0.79831995799895, + "grad_norm": 4.829193115234375, + "learning_rate": 2.377395715232754e-06, + "loss": 2.7056, + "step": 63864 + }, + { + "epoch": 0.7983449586239656, + "grad_norm": 3.303938388824463, + "learning_rate": 2.376830878541407e-06, + "loss": 0.8138, + "step": 63866 + }, + { + "epoch": 0.7983699592489812, + "grad_norm": 4.581635475158691, + "learning_rate": 2.376266099907041e-06, + "loss": 1.7492, + "step": 63868 + }, + { + "epoch": 0.7983949598739969, + "grad_norm": 1.9930046796798706, + "learning_rate": 2.375701379333951e-06, + "loss": 0.6937, + "step": 63870 + }, + { + "epoch": 0.7984199604990124, + "grad_norm": 1.9486180543899536, + "learning_rate": 2.375136716826445e-06, + "loss": 1.0321, + "step": 63872 + }, + { + "epoch": 0.7984449611240281, + "grad_norm": 4.846444129943848, + "learning_rate": 2.3745721123888154e-06, + "loss": 1.1896, + "step": 63874 + }, + { + "epoch": 0.7984699617490437, + "grad_norm": 5.583050727844238, + "learning_rate": 2.3740075660253692e-06, + "loss": 0.8557, + "step": 63876 + }, + { + "epoch": 0.7984949623740594, + "grad_norm": 0.8826406002044678, + "learning_rate": 2.3734430777404037e-06, + "loss": 0.9991, + "step": 63878 + }, + { + "epoch": 0.798519962999075, + "grad_norm": 3.8264074325561523, + "learning_rate": 2.3728786475382135e-06, + "loss": 1.347, + "step": 63880 + }, + { + "epoch": 0.7985449636240906, + "grad_norm": 5.898036479949951, + "learning_rate": 2.3723142754231043e-06, + "loss": 0.9329, + "step": 63882 + }, + { + "epoch": 0.7985699642491062, + "grad_norm": 3.9763405323028564, + "learning_rate": 2.371749961399368e-06, + "loss": 1.9675, + "step": 63884 + }, + { + "epoch": 0.7985949648741218, + "grad_norm": 2.3536288738250732, + "learning_rate": 2.371185705471308e-06, + "loss": 0.6724, + "step": 63886 + }, + { + "epoch": 0.7986199654991375, + "grad_norm": 0.0016434243880212307, + "learning_rate": 2.3706215076432195e-06, + "loss": 0.7454, + "step": 63888 + }, + { + "epoch": 0.7986449661241531, + "grad_norm": 3.458956480026245, + "learning_rate": 2.370057367919395e-06, + "loss": 0.3201, + "step": 63890 + }, + { + "epoch": 0.7986699667491687, + "grad_norm": 5.138427257537842, + "learning_rate": 2.3694932863041377e-06, + "loss": 1.9236, + "step": 63892 + }, + { + "epoch": 0.7986949673741843, + "grad_norm": 3.630352735519409, + "learning_rate": 2.368929262801738e-06, + "loss": 1.1714, + "step": 63894 + }, + { + "epoch": 0.7987199679992, + "grad_norm": 3.8365795612335205, + "learning_rate": 2.368365297416496e-06, + "loss": 1.6073, + "step": 63896 + }, + { + "epoch": 0.7987449686242156, + "grad_norm": 2.3285229206085205, + "learning_rate": 2.367801390152703e-06, + "loss": 1.1459, + "step": 63898 + }, + { + "epoch": 0.7987699692492313, + "grad_norm": 3.431300640106201, + "learning_rate": 2.367237541014654e-06, + "loss": 0.4896, + "step": 63900 + }, + { + "epoch": 0.7987949698742468, + "grad_norm": 5.370325088500977, + "learning_rate": 2.3666737500066517e-06, + "loss": 0.1882, + "step": 63902 + }, + { + "epoch": 0.7988199704992625, + "grad_norm": 1.5342111587524414, + "learning_rate": 2.366110017132975e-06, + "loss": 0.1139, + "step": 63904 + }, + { + "epoch": 0.7988449711242781, + "grad_norm": 3.486016273498535, + "learning_rate": 2.3655463423979295e-06, + "loss": 0.9893, + "step": 63906 + }, + { + "epoch": 0.7988699717492938, + "grad_norm": 3.7553937435150146, + "learning_rate": 2.3649827258058e-06, + "loss": 0.6921, + "step": 63908 + }, + { + "epoch": 0.7988949723743094, + "grad_norm": 2.034879446029663, + "learning_rate": 2.364419167360885e-06, + "loss": 0.142, + "step": 63910 + }, + { + "epoch": 0.7989199729993249, + "grad_norm": 0.000998442410491407, + "learning_rate": 2.363855667067472e-06, + "loss": 0.2718, + "step": 63912 + }, + { + "epoch": 0.7989449736243406, + "grad_norm": 3.6339592933654785, + "learning_rate": 2.3632922249298563e-06, + "loss": 0.5556, + "step": 63914 + }, + { + "epoch": 0.7989699742493562, + "grad_norm": 0.00016440608305856586, + "learning_rate": 2.362728840952326e-06, + "loss": 0.631, + "step": 63916 + }, + { + "epoch": 0.7989949748743719, + "grad_norm": 3.520256996154785, + "learning_rate": 2.362165515139172e-06, + "loss": 0.5877, + "step": 63918 + }, + { + "epoch": 0.7990199754993875, + "grad_norm": 0.00031889823731034994, + "learning_rate": 2.3616022474946878e-06, + "loss": 0.6335, + "step": 63920 + }, + { + "epoch": 0.7990449761244031, + "grad_norm": 0.00021087127970531583, + "learning_rate": 2.361039038023156e-06, + "loss": 1.4619, + "step": 63922 + }, + { + "epoch": 0.7990699767494187, + "grad_norm": 4.1051154136657715, + "learning_rate": 2.3604758867288757e-06, + "loss": 0.8109, + "step": 63924 + }, + { + "epoch": 0.7990949773744344, + "grad_norm": 1.226655125617981, + "learning_rate": 2.359912793616127e-06, + "loss": 0.133, + "step": 63926 + }, + { + "epoch": 0.79911997799945, + "grad_norm": 3.563807725906372, + "learning_rate": 2.3593497586892056e-06, + "loss": 1.6129, + "step": 63928 + }, + { + "epoch": 0.7991449786244657, + "grad_norm": 3.510249137878418, + "learning_rate": 2.3587867819523945e-06, + "loss": 1.4368, + "step": 63930 + }, + { + "epoch": 0.7991699792494812, + "grad_norm": 7.617905616760254, + "learning_rate": 2.358223863409981e-06, + "loss": 2.4567, + "step": 63932 + }, + { + "epoch": 0.7991949798744968, + "grad_norm": 2.805541515350342, + "learning_rate": 2.357661003066257e-06, + "loss": 0.4982, + "step": 63934 + }, + { + "epoch": 0.7992199804995125, + "grad_norm": 1.9845685958862305, + "learning_rate": 2.3570982009255026e-06, + "loss": 1.2053, + "step": 63936 + }, + { + "epoch": 0.7992449811245281, + "grad_norm": 3.6271262168884277, + "learning_rate": 2.3565354569920074e-06, + "loss": 1.4196, + "step": 63938 + }, + { + "epoch": 0.7992699817495438, + "grad_norm": 0.0003676433698274195, + "learning_rate": 2.355972771270063e-06, + "loss": 0.164, + "step": 63940 + }, + { + "epoch": 0.7992949823745593, + "grad_norm": 4.135836601257324, + "learning_rate": 2.3554101437639432e-06, + "loss": 1.4461, + "step": 63942 + }, + { + "epoch": 0.799319982999575, + "grad_norm": 3.096485137939453, + "learning_rate": 2.354847574477942e-06, + "loss": 0.9903, + "step": 63944 + }, + { + "epoch": 0.7993449836245906, + "grad_norm": 4.157342910766602, + "learning_rate": 2.3542850634163385e-06, + "loss": 1.3116, + "step": 63946 + }, + { + "epoch": 0.7993699842496063, + "grad_norm": 2.346505641937256, + "learning_rate": 2.35372261058342e-06, + "loss": 1.165, + "step": 63948 + }, + { + "epoch": 0.7993949848746219, + "grad_norm": 0.6799388527870178, + "learning_rate": 2.3531602159834666e-06, + "loss": 0.249, + "step": 63950 + }, + { + "epoch": 0.7994199854996374, + "grad_norm": 2.375260353088379, + "learning_rate": 2.352597879620764e-06, + "loss": 0.9371, + "step": 63952 + }, + { + "epoch": 0.7994449861246531, + "grad_norm": 4.183367729187012, + "learning_rate": 2.3520356014995995e-06, + "loss": 1.0544, + "step": 63954 + }, + { + "epoch": 0.7994699867496687, + "grad_norm": 3.089646339416504, + "learning_rate": 2.3514733816242452e-06, + "loss": 0.9619, + "step": 63956 + }, + { + "epoch": 0.7994949873746844, + "grad_norm": 1.2864819765090942, + "learning_rate": 2.35091121999899e-06, + "loss": 0.1516, + "step": 63958 + }, + { + "epoch": 0.7995199879997, + "grad_norm": 2.3742995262145996, + "learning_rate": 2.350349116628111e-06, + "loss": 0.9315, + "step": 63960 + }, + { + "epoch": 0.7995449886247156, + "grad_norm": 3.6527388095855713, + "learning_rate": 2.3497870715158923e-06, + "loss": 0.931, + "step": 63962 + }, + { + "epoch": 0.7995699892497312, + "grad_norm": 3.905322790145874, + "learning_rate": 2.349225084666615e-06, + "loss": 1.8049, + "step": 63964 + }, + { + "epoch": 0.7995949898747469, + "grad_norm": 3.4137516021728516, + "learning_rate": 2.3486631560845587e-06, + "loss": 0.8723, + "step": 63966 + }, + { + "epoch": 0.7996199904997625, + "grad_norm": 4.7743659019470215, + "learning_rate": 2.3481012857740004e-06, + "loss": 1.9847, + "step": 63968 + }, + { + "epoch": 0.7996449911247782, + "grad_norm": 0.8641712069511414, + "learning_rate": 2.3475394737392187e-06, + "loss": 1.0324, + "step": 63970 + }, + { + "epoch": 0.7996699917497937, + "grad_norm": 2.3226983547210693, + "learning_rate": 2.3469777199844965e-06, + "loss": 0.735, + "step": 63972 + }, + { + "epoch": 0.7996949923748093, + "grad_norm": 2.1858325004577637, + "learning_rate": 2.3464160245141077e-06, + "loss": 1.2314, + "step": 63974 + }, + { + "epoch": 0.799719992999825, + "grad_norm": 2.398599147796631, + "learning_rate": 2.3458543873323313e-06, + "loss": 1.1154, + "step": 63976 + }, + { + "epoch": 0.7997449936248406, + "grad_norm": 2.2713351249694824, + "learning_rate": 2.345292808443449e-06, + "loss": 1.1208, + "step": 63978 + }, + { + "epoch": 0.7997699942498563, + "grad_norm": 4.243142604827881, + "learning_rate": 2.344731287851734e-06, + "loss": 1.4731, + "step": 63980 + }, + { + "epoch": 0.7997949948748718, + "grad_norm": 3.8199820518493652, + "learning_rate": 2.344169825561463e-06, + "loss": 0.8323, + "step": 63982 + }, + { + "epoch": 0.7998199954998875, + "grad_norm": 3.8962855339050293, + "learning_rate": 2.3436084215769085e-06, + "loss": 0.2712, + "step": 63984 + }, + { + "epoch": 0.7998449961249031, + "grad_norm": 3.9447250366210938, + "learning_rate": 2.3430470759023527e-06, + "loss": 1.7362, + "step": 63986 + }, + { + "epoch": 0.7998699967499188, + "grad_norm": 0.0002722275094129145, + "learning_rate": 2.342485788542066e-06, + "loss": 0.0498, + "step": 63988 + }, + { + "epoch": 0.7998949973749344, + "grad_norm": 0.00034510312252677977, + "learning_rate": 2.3419245595003227e-06, + "loss": 0.4638, + "step": 63990 + }, + { + "epoch": 0.79991999799995, + "grad_norm": 2.1130831241607666, + "learning_rate": 2.3413633887814056e-06, + "loss": 1.1327, + "step": 63992 + }, + { + "epoch": 0.7999449986249656, + "grad_norm": 1.7149213552474976, + "learning_rate": 2.3408022763895766e-06, + "loss": 1.3163, + "step": 63994 + }, + { + "epoch": 0.7999699992499812, + "grad_norm": 1.2587169408798218, + "learning_rate": 2.340241222329117e-06, + "loss": 0.3595, + "step": 63996 + }, + { + "epoch": 0.7999949998749969, + "grad_norm": 2.4390666484832764, + "learning_rate": 2.339680226604294e-06, + "loss": 0.515, + "step": 63998 + }, + { + "epoch": 0.8000200005000125, + "grad_norm": 1.8710503578186035, + "learning_rate": 2.3391192892193826e-06, + "loss": 0.8392, + "step": 64000 + }, + { + "epoch": 0.8000450011250281, + "grad_norm": 5.1417927742004395, + "learning_rate": 2.3385584101786596e-06, + "loss": 1.7474, + "step": 64002 + }, + { + "epoch": 0.8000700017500437, + "grad_norm": 4.7936248779296875, + "learning_rate": 2.3379975894863883e-06, + "loss": 0.2777, + "step": 64004 + }, + { + "epoch": 0.8000950023750594, + "grad_norm": 0.46999749541282654, + "learning_rate": 2.3374368271468494e-06, + "loss": 0.1062, + "step": 64006 + }, + { + "epoch": 0.800120003000075, + "grad_norm": 0.0003095008432865143, + "learning_rate": 2.3368761231643036e-06, + "loss": 0.4798, + "step": 64008 + }, + { + "epoch": 0.8001450036250907, + "grad_norm": 0.12797971069812775, + "learning_rate": 2.3363154775430274e-06, + "loss": 0.0524, + "step": 64010 + }, + { + "epoch": 0.8001700042501062, + "grad_norm": 0.27704721689224243, + "learning_rate": 2.3357548902872863e-06, + "loss": 1.6417, + "step": 64012 + }, + { + "epoch": 0.8001950048751219, + "grad_norm": 0.001548113883472979, + "learning_rate": 2.3351943614013516e-06, + "loss": 0.0297, + "step": 64014 + }, + { + "epoch": 0.8002200055001375, + "grad_norm": 5.2118659019470215, + "learning_rate": 2.334633890889496e-06, + "loss": 1.4544, + "step": 64016 + }, + { + "epoch": 0.8002450061251531, + "grad_norm": 3.37530517578125, + "learning_rate": 2.3340734787559815e-06, + "loss": 1.4263, + "step": 64018 + }, + { + "epoch": 0.8002700067501688, + "grad_norm": 1.8859922885894775, + "learning_rate": 2.333513125005086e-06, + "loss": 0.5048, + "step": 64020 + }, + { + "epoch": 0.8002950073751843, + "grad_norm": 3.2761683464050293, + "learning_rate": 2.332952829641064e-06, + "loss": 0.5366, + "step": 64022 + }, + { + "epoch": 0.8003200080002, + "grad_norm": 2.304469108581543, + "learning_rate": 2.332392592668189e-06, + "loss": 0.4475, + "step": 64024 + }, + { + "epoch": 0.8003450086252156, + "grad_norm": 5.073299407958984, + "learning_rate": 2.33183241409073e-06, + "loss": 0.4213, + "step": 64026 + }, + { + "epoch": 0.8003700092502313, + "grad_norm": 0.21306917071342468, + "learning_rate": 2.3312722939129497e-06, + "loss": 0.1022, + "step": 64028 + }, + { + "epoch": 0.8003950098752469, + "grad_norm": 4.116667747497559, + "learning_rate": 2.330712232139116e-06, + "loss": 1.4173, + "step": 64030 + }, + { + "epoch": 0.8004200105002625, + "grad_norm": 3.6327710151672363, + "learning_rate": 2.3301522287734957e-06, + "loss": 1.595, + "step": 64032 + }, + { + "epoch": 0.8004450111252781, + "grad_norm": 0.6223086714744568, + "learning_rate": 2.32959228382035e-06, + "loss": 0.0085, + "step": 64034 + }, + { + "epoch": 0.8004700117502938, + "grad_norm": 2.8278403282165527, + "learning_rate": 2.3290323972839413e-06, + "loss": 0.6555, + "step": 64036 + }, + { + "epoch": 0.8004950123753094, + "grad_norm": 0.03134068101644516, + "learning_rate": 2.3284725691685385e-06, + "loss": 0.4617, + "step": 64038 + }, + { + "epoch": 0.800520013000325, + "grad_norm": 10.70305061340332, + "learning_rate": 2.3279127994784056e-06, + "loss": 1.8117, + "step": 64040 + }, + { + "epoch": 0.8005450136253406, + "grad_norm": 4.622124671936035, + "learning_rate": 2.327353088217802e-06, + "loss": 1.1721, + "step": 64042 + }, + { + "epoch": 0.8005700142503562, + "grad_norm": 2.0074119567871094, + "learning_rate": 2.326793435390995e-06, + "loss": 0.1123, + "step": 64044 + }, + { + "epoch": 0.8005950148753719, + "grad_norm": 0.00027735024923458695, + "learning_rate": 2.3262338410022435e-06, + "loss": 0.0974, + "step": 64046 + }, + { + "epoch": 0.8006200155003875, + "grad_norm": 0.24030441045761108, + "learning_rate": 2.325674305055807e-06, + "loss": 0.1216, + "step": 64048 + }, + { + "epoch": 0.8006450161254032, + "grad_norm": 3.3988993167877197, + "learning_rate": 2.325114827555953e-06, + "loss": 0.7029, + "step": 64050 + }, + { + "epoch": 0.8006700167504187, + "grad_norm": 5.92164421081543, + "learning_rate": 2.3245554085069354e-06, + "loss": 1.9369, + "step": 64052 + }, + { + "epoch": 0.8006950173754344, + "grad_norm": 3.4699923992156982, + "learning_rate": 2.3239960479130217e-06, + "loss": 0.7788, + "step": 64054 + }, + { + "epoch": 0.80072001800045, + "grad_norm": 1.764432430267334, + "learning_rate": 2.3234367457784667e-06, + "loss": 0.0398, + "step": 64056 + }, + { + "epoch": 0.8007450186254657, + "grad_norm": 3.6047496795654297, + "learning_rate": 2.322877502107537e-06, + "loss": 1.7041, + "step": 64058 + }, + { + "epoch": 0.8007700192504813, + "grad_norm": 1.3759907484054565, + "learning_rate": 2.3223183169044806e-06, + "loss": 0.2836, + "step": 64060 + }, + { + "epoch": 0.8007950198754968, + "grad_norm": 3.5595033168792725, + "learning_rate": 2.3217591901735625e-06, + "loss": 1.1837, + "step": 64062 + }, + { + "epoch": 0.8008200205005125, + "grad_norm": 2.567800760269165, + "learning_rate": 2.3212001219190437e-06, + "loss": 0.3461, + "step": 64064 + }, + { + "epoch": 0.8008450211255281, + "grad_norm": 16.0616455078125, + "learning_rate": 2.3206411121451754e-06, + "loss": 1.4118, + "step": 64066 + }, + { + "epoch": 0.8008700217505438, + "grad_norm": 2.2527029514312744, + "learning_rate": 2.3200821608562217e-06, + "loss": 0.8343, + "step": 64068 + }, + { + "epoch": 0.8008950223755594, + "grad_norm": 7.718369007110596, + "learning_rate": 2.319523268056433e-06, + "loss": 1.8678, + "step": 64070 + }, + { + "epoch": 0.800920023000575, + "grad_norm": 4.052631855010986, + "learning_rate": 2.318964433750075e-06, + "loss": 1.8195, + "step": 64072 + }, + { + "epoch": 0.8009450236255906, + "grad_norm": 2.669734239578247, + "learning_rate": 2.3184056579413918e-06, + "loss": 1.0533, + "step": 64074 + }, + { + "epoch": 0.8009700242506063, + "grad_norm": 10.485465049743652, + "learning_rate": 2.317846940634645e-06, + "loss": 1.6084, + "step": 64076 + }, + { + "epoch": 0.8009950248756219, + "grad_norm": 13.912153244018555, + "learning_rate": 2.3172882818340924e-06, + "loss": 1.5608, + "step": 64078 + }, + { + "epoch": 0.8010200255006376, + "grad_norm": 2.749191999435425, + "learning_rate": 2.3167296815439834e-06, + "loss": 0.9595, + "step": 64080 + }, + { + "epoch": 0.8010450261256531, + "grad_norm": 4.070300579071045, + "learning_rate": 2.316171139768576e-06, + "loss": 1.0881, + "step": 64082 + }, + { + "epoch": 0.8010700267506687, + "grad_norm": 0.00022623904806096107, + "learning_rate": 2.315612656512124e-06, + "loss": 0.5176, + "step": 64084 + }, + { + "epoch": 0.8010950273756844, + "grad_norm": 11.441125869750977, + "learning_rate": 2.315054231778876e-06, + "loss": 1.2234, + "step": 64086 + }, + { + "epoch": 0.8011200280007, + "grad_norm": 6.043304920196533, + "learning_rate": 2.314495865573091e-06, + "loss": 1.1232, + "step": 64088 + }, + { + "epoch": 0.8011450286257157, + "grad_norm": 4.038326740264893, + "learning_rate": 2.3139375578990164e-06, + "loss": 1.0847, + "step": 64090 + }, + { + "epoch": 0.8011700292507312, + "grad_norm": 1.6793758869171143, + "learning_rate": 2.3133793087609092e-06, + "loss": 0.38, + "step": 64092 + }, + { + "epoch": 0.8011950298757469, + "grad_norm": 5.247110843658447, + "learning_rate": 2.3128211181630143e-06, + "loss": 1.1714, + "step": 64094 + }, + { + "epoch": 0.8012200305007625, + "grad_norm": 2.251307725906372, + "learning_rate": 2.31226298610959e-06, + "loss": 1.5885, + "step": 64096 + }, + { + "epoch": 0.8012450311257782, + "grad_norm": 4.5929388999938965, + "learning_rate": 2.311704912604884e-06, + "loss": 0.9152, + "step": 64098 + }, + { + "epoch": 0.8012700317507938, + "grad_norm": 0.00023571646306663752, + "learning_rate": 2.3111468976531437e-06, + "loss": 0.7608, + "step": 64100 + }, + { + "epoch": 0.8012950323758093, + "grad_norm": 5.4569621086120605, + "learning_rate": 2.3105889412586237e-06, + "loss": 0.9568, + "step": 64102 + }, + { + "epoch": 0.801320033000825, + "grad_norm": 5.359350204467773, + "learning_rate": 2.310031043425568e-06, + "loss": 0.861, + "step": 64104 + }, + { + "epoch": 0.8013450336258406, + "grad_norm": 1.392582893371582, + "learning_rate": 2.309473204158231e-06, + "loss": 0.0788, + "step": 64106 + }, + { + "epoch": 0.8013700342508563, + "grad_norm": 1.9132329225540161, + "learning_rate": 2.308915423460857e-06, + "loss": 0.6974, + "step": 64108 + }, + { + "epoch": 0.8013950348758719, + "grad_norm": 3.048138380050659, + "learning_rate": 2.3083577013376968e-06, + "loss": 1.1426, + "step": 64110 + }, + { + "epoch": 0.8014200355008875, + "grad_norm": 2.9345171451568604, + "learning_rate": 2.3078000377929975e-06, + "loss": 0.9924, + "step": 64112 + }, + { + "epoch": 0.8014450361259031, + "grad_norm": 1.7329822778701782, + "learning_rate": 2.307242432831002e-06, + "loss": 0.9314, + "step": 64114 + }, + { + "epoch": 0.8014700367509188, + "grad_norm": 2.828490734100342, + "learning_rate": 2.3066848864559632e-06, + "loss": 0.6403, + "step": 64116 + }, + { + "epoch": 0.8014950373759344, + "grad_norm": 3.649768590927124, + "learning_rate": 2.3061273986721222e-06, + "loss": 0.6456, + "step": 64118 + }, + { + "epoch": 0.8015200380009501, + "grad_norm": 4.140209674835205, + "learning_rate": 2.3055699694837285e-06, + "loss": 1.6456, + "step": 64120 + }, + { + "epoch": 0.8015450386259656, + "grad_norm": 1.0785839557647705, + "learning_rate": 2.3050125988950247e-06, + "loss": 0.5474, + "step": 64122 + }, + { + "epoch": 0.8015700392509812, + "grad_norm": 1.2962521314620972, + "learning_rate": 2.3044552869102575e-06, + "loss": 0.8803, + "step": 64124 + }, + { + "epoch": 0.8015950398759969, + "grad_norm": 2.5622456073760986, + "learning_rate": 2.3038980335336723e-06, + "loss": 0.4555, + "step": 64126 + }, + { + "epoch": 0.8016200405010125, + "grad_norm": 4.200594902038574, + "learning_rate": 2.3033408387695076e-06, + "loss": 0.2622, + "step": 64128 + }, + { + "epoch": 0.8016450411260282, + "grad_norm": 4.480568885803223, + "learning_rate": 2.3027837026220146e-06, + "loss": 0.5513, + "step": 64130 + }, + { + "epoch": 0.8016700417510437, + "grad_norm": 0.0032851763535290956, + "learning_rate": 2.3022266250954285e-06, + "loss": 0.002, + "step": 64132 + }, + { + "epoch": 0.8016950423760594, + "grad_norm": 3.5459749698638916, + "learning_rate": 2.3016696061939993e-06, + "loss": 0.8588, + "step": 64134 + }, + { + "epoch": 0.801720043001075, + "grad_norm": 0.00020169121853541583, + "learning_rate": 2.3011126459219656e-06, + "loss": 0.1415, + "step": 64136 + }, + { + "epoch": 0.8017450436260907, + "grad_norm": 4.664730072021484, + "learning_rate": 2.300555744283566e-06, + "loss": 0.8977, + "step": 64138 + }, + { + "epoch": 0.8017700442511063, + "grad_norm": 3.210890054702759, + "learning_rate": 2.299998901283048e-06, + "loss": 0.9354, + "step": 64140 + }, + { + "epoch": 0.8017950448761219, + "grad_norm": 0.8202126622200012, + "learning_rate": 2.2994421169246473e-06, + "loss": 0.0302, + "step": 64142 + }, + { + "epoch": 0.8018200455011375, + "grad_norm": 1.9861994981765747, + "learning_rate": 2.2988853912126084e-06, + "loss": 0.1466, + "step": 64144 + }, + { + "epoch": 0.8018450461261531, + "grad_norm": 1.391295075416565, + "learning_rate": 2.2983287241511675e-06, + "loss": 0.0607, + "step": 64146 + }, + { + "epoch": 0.8018700467511688, + "grad_norm": 3.343071222305298, + "learning_rate": 2.297772115744568e-06, + "loss": 0.6837, + "step": 64148 + }, + { + "epoch": 0.8018950473761844, + "grad_norm": 9.215069770812988, + "learning_rate": 2.2972155659970474e-06, + "loss": 1.3391, + "step": 64150 + }, + { + "epoch": 0.8019200480012, + "grad_norm": 1.8176994323730469, + "learning_rate": 2.29665907491284e-06, + "loss": 0.2116, + "step": 64152 + }, + { + "epoch": 0.8019450486262156, + "grad_norm": 0.002421475248411298, + "learning_rate": 2.296102642496192e-06, + "loss": 0.0194, + "step": 64154 + }, + { + "epoch": 0.8019700492512313, + "grad_norm": 3.361859083175659, + "learning_rate": 2.295546268751333e-06, + "loss": 1.061, + "step": 64156 + }, + { + "epoch": 0.8019950498762469, + "grad_norm": 1.7763450145721436, + "learning_rate": 2.2949899536825073e-06, + "loss": 0.5559, + "step": 64158 + }, + { + "epoch": 0.8020200505012626, + "grad_norm": 4.165273189544678, + "learning_rate": 2.294433697293945e-06, + "loss": 1.44, + "step": 64160 + }, + { + "epoch": 0.8020450511262781, + "grad_norm": 3.0072031021118164, + "learning_rate": 2.2938774995898894e-06, + "loss": 0.8205, + "step": 64162 + }, + { + "epoch": 0.8020700517512938, + "grad_norm": 1.6369526386260986, + "learning_rate": 2.293321360574573e-06, + "loss": 0.2792, + "step": 64164 + }, + { + "epoch": 0.8020950523763094, + "grad_norm": 7.524991512298584, + "learning_rate": 2.2927652802522283e-06, + "loss": 2.1157, + "step": 64166 + }, + { + "epoch": 0.802120053001325, + "grad_norm": 4.67897367477417, + "learning_rate": 2.292209258627096e-06, + "loss": 1.357, + "step": 64168 + }, + { + "epoch": 0.8021450536263407, + "grad_norm": 3.1052701473236084, + "learning_rate": 2.2916532957034055e-06, + "loss": 0.7722, + "step": 64170 + }, + { + "epoch": 0.8021700542513562, + "grad_norm": 13.11517333984375, + "learning_rate": 2.2910973914853964e-06, + "loss": 1.3458, + "step": 64172 + }, + { + "epoch": 0.8021950548763719, + "grad_norm": 4.585109710693359, + "learning_rate": 2.290541545977296e-06, + "loss": 0.9544, + "step": 64174 + }, + { + "epoch": 0.8022200555013875, + "grad_norm": 0.0003209417045582086, + "learning_rate": 2.289985759183345e-06, + "loss": 0.0001, + "step": 64176 + }, + { + "epoch": 0.8022450561264032, + "grad_norm": 2.6402721405029297, + "learning_rate": 2.2894300311077712e-06, + "loss": 0.805, + "step": 64178 + }, + { + "epoch": 0.8022700567514188, + "grad_norm": 0.7622001767158508, + "learning_rate": 2.2888743617548057e-06, + "loss": 0.5949, + "step": 64180 + }, + { + "epoch": 0.8022950573764344, + "grad_norm": 4.480014801025391, + "learning_rate": 2.288318751128685e-06, + "loss": 1.2949, + "step": 64182 + }, + { + "epoch": 0.80232005800145, + "grad_norm": 2.868659496307373, + "learning_rate": 2.2877631992336345e-06, + "loss": 0.9408, + "step": 64184 + }, + { + "epoch": 0.8023450586264657, + "grad_norm": 5.5413818359375, + "learning_rate": 2.2872077060738927e-06, + "loss": 2.052, + "step": 64186 + }, + { + "epoch": 0.8023700592514813, + "grad_norm": 6.041866779327393, + "learning_rate": 2.2866522716536865e-06, + "loss": 0.8848, + "step": 64188 + }, + { + "epoch": 0.802395059876497, + "grad_norm": 0.8166294693946838, + "learning_rate": 2.2860968959772434e-06, + "loss": 0.5675, + "step": 64190 + }, + { + "epoch": 0.8024200605015125, + "grad_norm": 5.186323165893555, + "learning_rate": 2.285541579048797e-06, + "loss": 1.1391, + "step": 64192 + }, + { + "epoch": 0.8024450611265281, + "grad_norm": 1.1090079545974731, + "learning_rate": 2.284986320872573e-06, + "loss": 0.8475, + "step": 64194 + }, + { + "epoch": 0.8024700617515438, + "grad_norm": 0.07329469919204712, + "learning_rate": 2.2844311214528046e-06, + "loss": 0.003, + "step": 64196 + }, + { + "epoch": 0.8024950623765594, + "grad_norm": 2.6249282360076904, + "learning_rate": 2.283875980793715e-06, + "loss": 0.8573, + "step": 64198 + }, + { + "epoch": 0.8025200630015751, + "grad_norm": 0.10001122951507568, + "learning_rate": 2.283320898899538e-06, + "loss": 0.925, + "step": 64200 + }, + { + "epoch": 0.8025450636265906, + "grad_norm": 0.00025887644733302295, + "learning_rate": 2.2827658757744977e-06, + "loss": 0.174, + "step": 64202 + }, + { + "epoch": 0.8025700642516063, + "grad_norm": 1.9470125436782837, + "learning_rate": 2.2822109114228175e-06, + "loss": 1.0931, + "step": 64204 + }, + { + "epoch": 0.8025950648766219, + "grad_norm": 1.3084499835968018, + "learning_rate": 2.2816560058487313e-06, + "loss": 0.5482, + "step": 64206 + }, + { + "epoch": 0.8026200655016376, + "grad_norm": 0.00039319245843216777, + "learning_rate": 2.281101159056458e-06, + "loss": 0.0003, + "step": 64208 + }, + { + "epoch": 0.8026450661266532, + "grad_norm": 2.6883652210235596, + "learning_rate": 2.2805463710502305e-06, + "loss": 0.9013, + "step": 64210 + }, + { + "epoch": 0.8026700667516687, + "grad_norm": 5.287676811218262, + "learning_rate": 2.279991641834267e-06, + "loss": 1.3283, + "step": 64212 + }, + { + "epoch": 0.8026950673766844, + "grad_norm": 4.66258430480957, + "learning_rate": 2.2794369714127982e-06, + "loss": 1.4635, + "step": 64214 + }, + { + "epoch": 0.8027200680017, + "grad_norm": 0.10253861546516418, + "learning_rate": 2.2788823597900454e-06, + "loss": 0.4497, + "step": 64216 + }, + { + "epoch": 0.8027450686267157, + "grad_norm": 5.08729362487793, + "learning_rate": 2.27832780697023e-06, + "loss": 0.7231, + "step": 64218 + }, + { + "epoch": 0.8027700692517313, + "grad_norm": 5.256126880645752, + "learning_rate": 2.2777733129575807e-06, + "loss": 1.7387, + "step": 64220 + }, + { + "epoch": 0.8027950698767469, + "grad_norm": 3.8276476860046387, + "learning_rate": 2.2772188777563155e-06, + "loss": 2.1387, + "step": 64222 + }, + { + "epoch": 0.8028200705017625, + "grad_norm": 0.4109824001789093, + "learning_rate": 2.2766645013706612e-06, + "loss": 0.8814, + "step": 64224 + }, + { + "epoch": 0.8028450711267782, + "grad_norm": 4.126816272735596, + "learning_rate": 2.276110183804836e-06, + "loss": 1.4665, + "step": 64226 + }, + { + "epoch": 0.8028700717517938, + "grad_norm": 2.589592218399048, + "learning_rate": 2.2755559250630654e-06, + "loss": 0.6145, + "step": 64228 + }, + { + "epoch": 0.8028950723768095, + "grad_norm": 3.1918442249298096, + "learning_rate": 2.275001725149569e-06, + "loss": 0.5623, + "step": 64230 + }, + { + "epoch": 0.802920073001825, + "grad_norm": 3.034334897994995, + "learning_rate": 2.2744475840685644e-06, + "loss": 1.4696, + "step": 64232 + }, + { + "epoch": 0.8029450736268406, + "grad_norm": 4.742550373077393, + "learning_rate": 2.273893501824277e-06, + "loss": 0.1594, + "step": 64234 + }, + { + "epoch": 0.8029700742518563, + "grad_norm": 1.9877275228500366, + "learning_rate": 2.2733394784209218e-06, + "loss": 0.7658, + "step": 64236 + }, + { + "epoch": 0.8029950748768719, + "grad_norm": 2.7571423053741455, + "learning_rate": 2.272785513862723e-06, + "loss": 1.3845, + "step": 64238 + }, + { + "epoch": 0.8030200755018876, + "grad_norm": 3.3611395359039307, + "learning_rate": 2.2722316081538965e-06, + "loss": 0.6882, + "step": 64240 + }, + { + "epoch": 0.8030450761269031, + "grad_norm": 3.813194751739502, + "learning_rate": 2.2716777612986595e-06, + "loss": 1.4919, + "step": 64242 + }, + { + "epoch": 0.8030700767519188, + "grad_norm": 3.687875509262085, + "learning_rate": 2.2711239733012335e-06, + "loss": 1.2911, + "step": 64244 + }, + { + "epoch": 0.8030950773769344, + "grad_norm": 4.627037048339844, + "learning_rate": 2.270570244165833e-06, + "loss": 1.3021, + "step": 64246 + }, + { + "epoch": 0.8031200780019501, + "grad_norm": 1.5736626386642456, + "learning_rate": 2.2700165738966773e-06, + "loss": 0.6037, + "step": 64248 + }, + { + "epoch": 0.8031450786269657, + "grad_norm": 3.9756340980529785, + "learning_rate": 2.2694629624979814e-06, + "loss": 0.7641, + "step": 64250 + }, + { + "epoch": 0.8031700792519813, + "grad_norm": 2.5293216705322266, + "learning_rate": 2.268909409973964e-06, + "loss": 0.596, + "step": 64252 + }, + { + "epoch": 0.8031950798769969, + "grad_norm": 3.755943775177002, + "learning_rate": 2.2683559163288393e-06, + "loss": 1.002, + "step": 64254 + }, + { + "epoch": 0.8032200805020125, + "grad_norm": 3.043152332305908, + "learning_rate": 2.2678024815668197e-06, + "loss": 1.2108, + "step": 64256 + }, + { + "epoch": 0.8032450811270282, + "grad_norm": 3.1240386962890625, + "learning_rate": 2.2672491056921265e-06, + "loss": 0.4065, + "step": 64258 + }, + { + "epoch": 0.8032700817520438, + "grad_norm": 0.015221218578517437, + "learning_rate": 2.266695788708967e-06, + "loss": 0.0657, + "step": 64260 + }, + { + "epoch": 0.8032950823770594, + "grad_norm": 4.4096150398254395, + "learning_rate": 2.266142530621562e-06, + "loss": 0.8567, + "step": 64262 + }, + { + "epoch": 0.803320083002075, + "grad_norm": 6.793869495391846, + "learning_rate": 2.265589331434119e-06, + "loss": 1.1005, + "step": 64264 + }, + { + "epoch": 0.8033450836270907, + "grad_norm": 3.944988965988159, + "learning_rate": 2.2650361911508566e-06, + "loss": 1.1123, + "step": 64266 + }, + { + "epoch": 0.8033700842521063, + "grad_norm": 5.875859260559082, + "learning_rate": 2.2644831097759844e-06, + "loss": 1.8254, + "step": 64268 + }, + { + "epoch": 0.803395084877122, + "grad_norm": 0.00026228578644804657, + "learning_rate": 2.2639300873137126e-06, + "loss": 1.2699, + "step": 64270 + }, + { + "epoch": 0.8034200855021375, + "grad_norm": 1.9523332118988037, + "learning_rate": 2.263377123768259e-06, + "loss": 0.5439, + "step": 64272 + }, + { + "epoch": 0.8034450861271532, + "grad_norm": 0.00027819391107186675, + "learning_rate": 2.2628242191438267e-06, + "loss": 0.9433, + "step": 64274 + }, + { + "epoch": 0.8034700867521688, + "grad_norm": 6.89654016494751, + "learning_rate": 2.2622713734446344e-06, + "loss": 0.6462, + "step": 64276 + }, + { + "epoch": 0.8034950873771844, + "grad_norm": 4.879794597625732, + "learning_rate": 2.261718586674887e-06, + "loss": 1.1312, + "step": 64278 + }, + { + "epoch": 0.8035200880022001, + "grad_norm": 0.00023748553940095007, + "learning_rate": 2.2611658588387986e-06, + "loss": 0.6501, + "step": 64280 + }, + { + "epoch": 0.8035450886272156, + "grad_norm": 10.25814151763916, + "learning_rate": 2.260613189940577e-06, + "loss": 1.3175, + "step": 64282 + }, + { + "epoch": 0.8035700892522313, + "grad_norm": 8.339207649230957, + "learning_rate": 2.2600605799844276e-06, + "loss": 1.781, + "step": 64284 + }, + { + "epoch": 0.8035950898772469, + "grad_norm": 3.348162889480591, + "learning_rate": 2.2595080289745652e-06, + "loss": 1.4471, + "step": 64286 + }, + { + "epoch": 0.8036200905022626, + "grad_norm": 1.9581986665725708, + "learning_rate": 2.258955536915193e-06, + "loss": 1.2906, + "step": 64288 + }, + { + "epoch": 0.8036450911272782, + "grad_norm": 0.7311505675315857, + "learning_rate": 2.2584031038105226e-06, + "loss": 0.8784, + "step": 64290 + }, + { + "epoch": 0.8036700917522938, + "grad_norm": 5.150203227996826, + "learning_rate": 2.2578507296647577e-06, + "loss": 1.4864, + "step": 64292 + }, + { + "epoch": 0.8036950923773094, + "grad_norm": 3.772071361541748, + "learning_rate": 2.2572984144821086e-06, + "loss": 0.3865, + "step": 64294 + }, + { + "epoch": 0.803720093002325, + "grad_norm": 0.03814809024333954, + "learning_rate": 2.2567461582667805e-06, + "loss": 0.3702, + "step": 64296 + }, + { + "epoch": 0.8037450936273407, + "grad_norm": 2.828874349594116, + "learning_rate": 2.2561939610229755e-06, + "loss": 1.0235, + "step": 64298 + }, + { + "epoch": 0.8037700942523563, + "grad_norm": 0.0024881151039153337, + "learning_rate": 2.255641822754905e-06, + "loss": 0.5444, + "step": 64300 + }, + { + "epoch": 0.8037950948773719, + "grad_norm": 2.7525622844696045, + "learning_rate": 2.2550897434667685e-06, + "loss": 1.4733, + "step": 64302 + }, + { + "epoch": 0.8038200955023875, + "grad_norm": 1.6285080909729004, + "learning_rate": 2.254537723162773e-06, + "loss": 0.0942, + "step": 64304 + }, + { + "epoch": 0.8038450961274032, + "grad_norm": 3.5751516819000244, + "learning_rate": 2.2539857618471285e-06, + "loss": 2.0173, + "step": 64306 + }, + { + "epoch": 0.8038700967524188, + "grad_norm": 3.42199969291687, + "learning_rate": 2.253433859524028e-06, + "loss": 1.7694, + "step": 64308 + }, + { + "epoch": 0.8038950973774345, + "grad_norm": 4.657139301300049, + "learning_rate": 2.2528820161976827e-06, + "loss": 0.8626, + "step": 64310 + }, + { + "epoch": 0.80392009800245, + "grad_norm": 0.0324995219707489, + "learning_rate": 2.252330231872288e-06, + "loss": 0.0005, + "step": 64312 + }, + { + "epoch": 0.8039450986274657, + "grad_norm": 2.9741554260253906, + "learning_rate": 2.2517785065520558e-06, + "loss": 1.221, + "step": 64314 + }, + { + "epoch": 0.8039700992524813, + "grad_norm": 3.7347631454467773, + "learning_rate": 2.2512268402411788e-06, + "loss": 0.6885, + "step": 64316 + }, + { + "epoch": 0.803995099877497, + "grad_norm": 2.5584404468536377, + "learning_rate": 2.2506752329438618e-06, + "loss": 0.9117, + "step": 64318 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 2.8766398429870605, + "learning_rate": 2.2501236846643125e-06, + "loss": 0.4784, + "step": 64320 + }, + { + "epoch": 0.8040451011275281, + "grad_norm": 3.2500784397125244, + "learning_rate": 2.2495721954067207e-06, + "loss": 0.8134, + "step": 64322 + }, + { + "epoch": 0.8040701017525438, + "grad_norm": 2.4601330757141113, + "learning_rate": 2.249020765175293e-06, + "loss": 0.5558, + "step": 64324 + }, + { + "epoch": 0.8040951023775594, + "grad_norm": 3.4242660999298096, + "learning_rate": 2.2484693939742242e-06, + "loss": 1.028, + "step": 64326 + }, + { + "epoch": 0.8041201030025751, + "grad_norm": 0.47715526819229126, + "learning_rate": 2.2479180818077183e-06, + "loss": 0.0022, + "step": 64328 + }, + { + "epoch": 0.8041451036275907, + "grad_norm": 3.1482551097869873, + "learning_rate": 2.24736682867997e-06, + "loss": 0.6456, + "step": 64330 + }, + { + "epoch": 0.8041701042526063, + "grad_norm": 1.1616343259811401, + "learning_rate": 2.246815634595183e-06, + "loss": 1.2251, + "step": 64332 + }, + { + "epoch": 0.8041951048776219, + "grad_norm": 1.2962404489517212, + "learning_rate": 2.246264499557551e-06, + "loss": 1.0508, + "step": 64334 + }, + { + "epoch": 0.8042201055026376, + "grad_norm": 4.92149543762207, + "learning_rate": 2.2457134235712696e-06, + "loss": 0.2662, + "step": 64336 + }, + { + "epoch": 0.8042451061276532, + "grad_norm": 1.0736112594604492, + "learning_rate": 2.245162406640541e-06, + "loss": 0.6659, + "step": 64338 + }, + { + "epoch": 0.8042701067526689, + "grad_norm": 6.204352378845215, + "learning_rate": 2.244611448769556e-06, + "loss": 1.1719, + "step": 64340 + }, + { + "epoch": 0.8042951073776844, + "grad_norm": 3.582643747329712, + "learning_rate": 2.2440605499625134e-06, + "loss": 0.7552, + "step": 64342 + }, + { + "epoch": 0.8043201080027, + "grad_norm": 1.0641248226165771, + "learning_rate": 2.2435097102236113e-06, + "loss": 0.5623, + "step": 64344 + }, + { + "epoch": 0.8043451086277157, + "grad_norm": 4.111990928649902, + "learning_rate": 2.242958929557043e-06, + "loss": 2.0257, + "step": 64346 + }, + { + "epoch": 0.8043701092527313, + "grad_norm": 3.5548529624938965, + "learning_rate": 2.242408207967003e-06, + "loss": 1.594, + "step": 64348 + }, + { + "epoch": 0.804395109877747, + "grad_norm": 2.718574285507202, + "learning_rate": 2.2418575454576817e-06, + "loss": 0.767, + "step": 64350 + }, + { + "epoch": 0.8044201105027625, + "grad_norm": 0.0002517595421522856, + "learning_rate": 2.241306942033279e-06, + "loss": 0.2075, + "step": 64352 + }, + { + "epoch": 0.8044451111277782, + "grad_norm": 4.977739334106445, + "learning_rate": 2.240756397697983e-06, + "loss": 0.5148, + "step": 64354 + }, + { + "epoch": 0.8044701117527938, + "grad_norm": 0.0004536281048785895, + "learning_rate": 2.2402059124559883e-06, + "loss": 0.3365, + "step": 64356 + }, + { + "epoch": 0.8044951123778095, + "grad_norm": 0.005089948419481516, + "learning_rate": 2.239655486311494e-06, + "loss": 0.6334, + "step": 64358 + }, + { + "epoch": 0.8045201130028251, + "grad_norm": 3.3886404037475586, + "learning_rate": 2.239105119268681e-06, + "loss": 1.145, + "step": 64360 + }, + { + "epoch": 0.8045451136278406, + "grad_norm": 0.0021017310209572315, + "learning_rate": 2.2385548113317478e-06, + "loss": 0.69, + "step": 64362 + }, + { + "epoch": 0.8045701142528563, + "grad_norm": 3.143697500228882, + "learning_rate": 2.2380045625048806e-06, + "loss": 0.1537, + "step": 64364 + }, + { + "epoch": 0.8045951148778719, + "grad_norm": 3.071819543838501, + "learning_rate": 2.237454372792274e-06, + "loss": 1.2294, + "step": 64366 + }, + { + "epoch": 0.8046201155028876, + "grad_norm": 3.880279779434204, + "learning_rate": 2.236904242198119e-06, + "loss": 0.6623, + "step": 64368 + }, + { + "epoch": 0.8046451161279032, + "grad_norm": 6.203239917755127, + "learning_rate": 2.2363541707266025e-06, + "loss": 0.7143, + "step": 64370 + }, + { + "epoch": 0.8046701167529188, + "grad_norm": 2.1202337741851807, + "learning_rate": 2.235804158381919e-06, + "loss": 0.8756, + "step": 64372 + }, + { + "epoch": 0.8046951173779344, + "grad_norm": 2.7888875007629395, + "learning_rate": 2.235254205168247e-06, + "loss": 0.9645, + "step": 64374 + }, + { + "epoch": 0.8047201180029501, + "grad_norm": 3.658233165740967, + "learning_rate": 2.2347043110897847e-06, + "loss": 0.7135, + "step": 64376 + }, + { + "epoch": 0.8047451186279657, + "grad_norm": 15.896313667297363, + "learning_rate": 2.2341544761507127e-06, + "loss": 1.7848, + "step": 64378 + }, + { + "epoch": 0.8047701192529814, + "grad_norm": 4.378205299377441, + "learning_rate": 2.233604700355223e-06, + "loss": 0.4956, + "step": 64380 + }, + { + "epoch": 0.8047951198779969, + "grad_norm": 3.44321346282959, + "learning_rate": 2.2330549837075035e-06, + "loss": 1.2146, + "step": 64382 + }, + { + "epoch": 0.8048201205030125, + "grad_norm": 12.151470184326172, + "learning_rate": 2.232505326211739e-06, + "loss": 0.6254, + "step": 64384 + }, + { + "epoch": 0.8048451211280282, + "grad_norm": 1.7505998611450195, + "learning_rate": 2.231955727872116e-06, + "loss": 0.0911, + "step": 64386 + }, + { + "epoch": 0.8048701217530438, + "grad_norm": 2.950390338897705, + "learning_rate": 2.231406188692816e-06, + "loss": 0.5532, + "step": 64388 + }, + { + "epoch": 0.8048951223780595, + "grad_norm": 1.950844168663025, + "learning_rate": 2.230856708678032e-06, + "loss": 0.2481, + "step": 64390 + }, + { + "epoch": 0.804920123003075, + "grad_norm": 4.7614030838012695, + "learning_rate": 2.230307287831941e-06, + "loss": 1.2048, + "step": 64392 + }, + { + "epoch": 0.8049451236280907, + "grad_norm": 5.919525146484375, + "learning_rate": 2.229757926158731e-06, + "loss": 0.482, + "step": 64394 + }, + { + "epoch": 0.8049701242531063, + "grad_norm": 0.12374749034643173, + "learning_rate": 2.229208623662589e-06, + "loss": 1.877, + "step": 64396 + }, + { + "epoch": 0.804995124878122, + "grad_norm": 0.003581108059734106, + "learning_rate": 2.228659380347695e-06, + "loss": 1.8868, + "step": 64398 + }, + { + "epoch": 0.8050201255031376, + "grad_norm": 5.5144524574279785, + "learning_rate": 2.2281101962182326e-06, + "loss": 1.3389, + "step": 64400 + }, + { + "epoch": 0.8050451261281532, + "grad_norm": 0.9591916799545288, + "learning_rate": 2.2275610712783803e-06, + "loss": 0.1759, + "step": 64402 + }, + { + "epoch": 0.8050701267531688, + "grad_norm": 3.268498182296753, + "learning_rate": 2.227012005532324e-06, + "loss": 1.6595, + "step": 64404 + }, + { + "epoch": 0.8050951273781844, + "grad_norm": 0.007268151268362999, + "learning_rate": 2.226462998984248e-06, + "loss": 0.1226, + "step": 64406 + }, + { + "epoch": 0.8051201280032001, + "grad_norm": 4.136341571807861, + "learning_rate": 2.2259140516383282e-06, + "loss": 0.8404, + "step": 64408 + }, + { + "epoch": 0.8051451286282157, + "grad_norm": 6.224153518676758, + "learning_rate": 2.2253651634987528e-06, + "loss": 2.1293, + "step": 64410 + }, + { + "epoch": 0.8051701292532313, + "grad_norm": 4.586027145385742, + "learning_rate": 2.2248163345696915e-06, + "loss": 1.0627, + "step": 64412 + }, + { + "epoch": 0.8051951298782469, + "grad_norm": 0.0001767140201991424, + "learning_rate": 2.2242675648553314e-06, + "loss": 0.8712, + "step": 64414 + }, + { + "epoch": 0.8052201305032626, + "grad_norm": 2.5615415573120117, + "learning_rate": 2.2237188543598475e-06, + "loss": 1.9274, + "step": 64416 + }, + { + "epoch": 0.8052451311282782, + "grad_norm": 3.4227027893066406, + "learning_rate": 2.2231702030874224e-06, + "loss": 1.0167, + "step": 64418 + }, + { + "epoch": 0.8052701317532939, + "grad_norm": 5.469700336456299, + "learning_rate": 2.222621611042235e-06, + "loss": 0.6427, + "step": 64420 + }, + { + "epoch": 0.8052951323783094, + "grad_norm": 1.8841577768325806, + "learning_rate": 2.222073078228458e-06, + "loss": 0.3528, + "step": 64422 + }, + { + "epoch": 0.8053201330033251, + "grad_norm": 3.957524299621582, + "learning_rate": 2.221524604650279e-06, + "loss": 1.8719, + "step": 64424 + }, + { + "epoch": 0.8053451336283407, + "grad_norm": 0.44177040457725525, + "learning_rate": 2.2209761903118634e-06, + "loss": 0.013, + "step": 64426 + }, + { + "epoch": 0.8053701342533564, + "grad_norm": 2.640035629272461, + "learning_rate": 2.2204278352173938e-06, + "loss": 1.0568, + "step": 64428 + }, + { + "epoch": 0.805395134878372, + "grad_norm": 3.201962947845459, + "learning_rate": 2.219879539371047e-06, + "loss": 0.5121, + "step": 64430 + }, + { + "epoch": 0.8054201355033875, + "grad_norm": 5.022463798522949, + "learning_rate": 2.2193313027769947e-06, + "loss": 1.4852, + "step": 64432 + }, + { + "epoch": 0.8054451361284032, + "grad_norm": 0.04868312552571297, + "learning_rate": 2.2187831254394177e-06, + "loss": 0.162, + "step": 64434 + }, + { + "epoch": 0.8054701367534188, + "grad_norm": 3.2933592796325684, + "learning_rate": 2.2182350073624893e-06, + "loss": 1.8318, + "step": 64436 + }, + { + "epoch": 0.8054951373784345, + "grad_norm": 3.260366439819336, + "learning_rate": 2.2176869485503817e-06, + "loss": 1.3164, + "step": 64438 + }, + { + "epoch": 0.8055201380034501, + "grad_norm": 0.0005257731536403298, + "learning_rate": 2.217138949007267e-06, + "loss": 0.5744, + "step": 64440 + }, + { + "epoch": 0.8055451386284657, + "grad_norm": 2.640326738357544, + "learning_rate": 2.216591008737321e-06, + "loss": 0.1899, + "step": 64442 + }, + { + "epoch": 0.8055701392534813, + "grad_norm": 0.14577557146549225, + "learning_rate": 2.21604312774472e-06, + "loss": 0.4997, + "step": 64444 + }, + { + "epoch": 0.805595139878497, + "grad_norm": 1.389392614364624, + "learning_rate": 2.2154953060336326e-06, + "loss": 0.7964, + "step": 64446 + }, + { + "epoch": 0.8056201405035126, + "grad_norm": 4.447258472442627, + "learning_rate": 2.2149475436082336e-06, + "loss": 0.7591, + "step": 64448 + }, + { + "epoch": 0.8056451411285283, + "grad_norm": 3.852489709854126, + "learning_rate": 2.214399840472693e-06, + "loss": 0.8981, + "step": 64450 + }, + { + "epoch": 0.8056701417535438, + "grad_norm": 4.590394973754883, + "learning_rate": 2.213852196631183e-06, + "loss": 0.8214, + "step": 64452 + }, + { + "epoch": 0.8056951423785594, + "grad_norm": 3.209111452102661, + "learning_rate": 2.2133046120878698e-06, + "loss": 1.7298, + "step": 64454 + }, + { + "epoch": 0.8057201430035751, + "grad_norm": 3.360931634902954, + "learning_rate": 2.2127570868469285e-06, + "loss": 1.1811, + "step": 64456 + }, + { + "epoch": 0.8057451436285907, + "grad_norm": 0.2972449064254761, + "learning_rate": 2.212209620912531e-06, + "loss": 0.0062, + "step": 64458 + }, + { + "epoch": 0.8057701442536064, + "grad_norm": 5.329573154449463, + "learning_rate": 2.21166221428884e-06, + "loss": 0.9133, + "step": 64460 + }, + { + "epoch": 0.8057951448786219, + "grad_norm": 4.431012153625488, + "learning_rate": 2.2111148669800343e-06, + "loss": 1.0404, + "step": 64462 + }, + { + "epoch": 0.8058201455036376, + "grad_norm": 7.671663761138916, + "learning_rate": 2.2105675789902704e-06, + "loss": 1.2979, + "step": 64464 + }, + { + "epoch": 0.8058451461286532, + "grad_norm": 2.095106363296509, + "learning_rate": 2.210020350323723e-06, + "loss": 0.7757, + "step": 64466 + }, + { + "epoch": 0.8058701467536689, + "grad_norm": 0.002616127720102668, + "learning_rate": 2.2094731809845613e-06, + "loss": 0.0001, + "step": 64468 + }, + { + "epoch": 0.8058951473786845, + "grad_norm": 4.875922203063965, + "learning_rate": 2.208926070976948e-06, + "loss": 1.4377, + "step": 64470 + }, + { + "epoch": 0.8059201480037, + "grad_norm": 2.611078977584839, + "learning_rate": 2.208379020305054e-06, + "loss": 0.4934, + "step": 64472 + }, + { + "epoch": 0.8059451486287157, + "grad_norm": 0.00039316131733357906, + "learning_rate": 2.2078320289730405e-06, + "loss": 0.5409, + "step": 64474 + }, + { + "epoch": 0.8059701492537313, + "grad_norm": 4.290868282318115, + "learning_rate": 2.207285096985082e-06, + "loss": 1.5824, + "step": 64476 + }, + { + "epoch": 0.805995149878747, + "grad_norm": 5.010056018829346, + "learning_rate": 2.2067382243453338e-06, + "loss": 0.4308, + "step": 64478 + }, + { + "epoch": 0.8060201505037626, + "grad_norm": 1.4075376987457275, + "learning_rate": 2.206191411057964e-06, + "loss": 0.7902, + "step": 64480 + }, + { + "epoch": 0.8060451511287782, + "grad_norm": 4.537186622619629, + "learning_rate": 2.2056446571271416e-06, + "loss": 0.3421, + "step": 64482 + }, + { + "epoch": 0.8060701517537938, + "grad_norm": 4.0600433349609375, + "learning_rate": 2.2050979625570244e-06, + "loss": 0.741, + "step": 64484 + }, + { + "epoch": 0.8060951523788095, + "grad_norm": 7.516021728515625, + "learning_rate": 2.2045513273517814e-06, + "loss": 2.2916, + "step": 64486 + }, + { + "epoch": 0.8061201530038251, + "grad_norm": 0.00031648509320802987, + "learning_rate": 2.2040047515155737e-06, + "loss": 0.9783, + "step": 64488 + }, + { + "epoch": 0.8061451536288408, + "grad_norm": 3.537883996963501, + "learning_rate": 2.20345823505256e-06, + "loss": 0.9658, + "step": 64490 + }, + { + "epoch": 0.8061701542538563, + "grad_norm": 2.9294633865356445, + "learning_rate": 2.2029117779669097e-06, + "loss": 0.8339, + "step": 64492 + }, + { + "epoch": 0.806195154878872, + "grad_norm": 7.413250923156738, + "learning_rate": 2.2023653802627764e-06, + "loss": 2.0126, + "step": 64494 + }, + { + "epoch": 0.8062201555038876, + "grad_norm": 2.143989086151123, + "learning_rate": 2.2018190419443297e-06, + "loss": 0.1651, + "step": 64496 + }, + { + "epoch": 0.8062451561289032, + "grad_norm": 2.6491780281066895, + "learning_rate": 2.2012727630157226e-06, + "loss": 1.074, + "step": 64498 + }, + { + "epoch": 0.8062701567539189, + "grad_norm": 3.9373953342437744, + "learning_rate": 2.200726543481123e-06, + "loss": 1.1455, + "step": 64500 + }, + { + "epoch": 0.8062951573789344, + "grad_norm": 2.397826671600342, + "learning_rate": 2.2001803833446867e-06, + "loss": 0.3952, + "step": 64502 + }, + { + "epoch": 0.8063201580039501, + "grad_norm": 2.698390245437622, + "learning_rate": 2.19963428261057e-06, + "loss": 1.2335, + "step": 64504 + }, + { + "epoch": 0.8063451586289657, + "grad_norm": 4.524659633636475, + "learning_rate": 2.1990882412829393e-06, + "loss": 1.6348, + "step": 64506 + }, + { + "epoch": 0.8063701592539814, + "grad_norm": 2.6716949939727783, + "learning_rate": 2.1985422593659465e-06, + "loss": 1.0964, + "step": 64508 + }, + { + "epoch": 0.806395159878997, + "grad_norm": 4.515312194824219, + "learning_rate": 2.1979963368637557e-06, + "loss": 1.1676, + "step": 64510 + }, + { + "epoch": 0.8064201605040126, + "grad_norm": 8.126249313354492, + "learning_rate": 2.197450473780518e-06, + "loss": 2.0414, + "step": 64512 + }, + { + "epoch": 0.8064451611290282, + "grad_norm": 3.012216091156006, + "learning_rate": 2.1969046701203976e-06, + "loss": 0.7599, + "step": 64514 + }, + { + "epoch": 0.8064701617540438, + "grad_norm": 4.531891822814941, + "learning_rate": 2.196358925887547e-06, + "loss": 1.4139, + "step": 64516 + }, + { + "epoch": 0.8064951623790595, + "grad_norm": 2.287107467651367, + "learning_rate": 2.195813241086121e-06, + "loss": 1.2305, + "step": 64518 + }, + { + "epoch": 0.8065201630040751, + "grad_norm": 4.507152080535889, + "learning_rate": 2.1952676157202803e-06, + "loss": 1.6935, + "step": 64520 + }, + { + "epoch": 0.8065451636290907, + "grad_norm": 3.469434976577759, + "learning_rate": 2.1947220497941757e-06, + "loss": 1.0094, + "step": 64522 + }, + { + "epoch": 0.8065701642541063, + "grad_norm": 3.6511917114257812, + "learning_rate": 2.1941765433119654e-06, + "loss": 0.8196, + "step": 64524 + }, + { + "epoch": 0.806595164879122, + "grad_norm": 4.144286632537842, + "learning_rate": 2.1936310962778016e-06, + "loss": 1.0615, + "step": 64526 + }, + { + "epoch": 0.8066201655041376, + "grad_norm": 2.866922616958618, + "learning_rate": 2.1930857086958414e-06, + "loss": 0.677, + "step": 64528 + }, + { + "epoch": 0.8066451661291533, + "grad_norm": 3.1792397499084473, + "learning_rate": 2.1925403805702363e-06, + "loss": 0.9554, + "step": 64530 + }, + { + "epoch": 0.8066701667541688, + "grad_norm": 10.659075736999512, + "learning_rate": 2.1919951119051374e-06, + "loss": 0.4297, + "step": 64532 + }, + { + "epoch": 0.8066951673791845, + "grad_norm": 3.539499282836914, + "learning_rate": 2.191449902704702e-06, + "loss": 1.5064, + "step": 64534 + }, + { + "epoch": 0.8067201680042001, + "grad_norm": 4.681786060333252, + "learning_rate": 2.1909047529730777e-06, + "loss": 1.5558, + "step": 64536 + }, + { + "epoch": 0.8067451686292157, + "grad_norm": 5.506661891937256, + "learning_rate": 2.1903596627144196e-06, + "loss": 1.577, + "step": 64538 + }, + { + "epoch": 0.8067701692542314, + "grad_norm": 3.126020908355713, + "learning_rate": 2.189814631932877e-06, + "loss": 0.8105, + "step": 64540 + }, + { + "epoch": 0.8067951698792469, + "grad_norm": 3.5381104946136475, + "learning_rate": 2.1892696606326036e-06, + "loss": 1.9445, + "step": 64542 + }, + { + "epoch": 0.8068201705042626, + "grad_norm": 3.027695655822754, + "learning_rate": 2.1887247488177475e-06, + "loss": 0.4318, + "step": 64544 + }, + { + "epoch": 0.8068451711292782, + "grad_norm": 3.8903744220733643, + "learning_rate": 2.1881798964924573e-06, + "loss": 1.2873, + "step": 64546 + }, + { + "epoch": 0.8068701717542939, + "grad_norm": 2.7283012866973877, + "learning_rate": 2.187635103660887e-06, + "loss": 1.1398, + "step": 64548 + }, + { + "epoch": 0.8068951723793095, + "grad_norm": 3.056999921798706, + "learning_rate": 2.1870903703271794e-06, + "loss": 0.8927, + "step": 64550 + }, + { + "epoch": 0.8069201730043251, + "grad_norm": 6.021769046783447, + "learning_rate": 2.18654569649549e-06, + "loss": 1.5733, + "step": 64552 + }, + { + "epoch": 0.8069451736293407, + "grad_norm": 0.07099568098783493, + "learning_rate": 2.186001082169964e-06, + "loss": 0.553, + "step": 64554 + }, + { + "epoch": 0.8069701742543564, + "grad_norm": 0.014838799834251404, + "learning_rate": 2.1854565273547455e-06, + "loss": 0.0358, + "step": 64556 + }, + { + "epoch": 0.806995174879372, + "grad_norm": 2.8471131324768066, + "learning_rate": 2.1849120320539887e-06, + "loss": 2.0333, + "step": 64558 + }, + { + "epoch": 0.8070201755043876, + "grad_norm": 2.398484230041504, + "learning_rate": 2.1843675962718323e-06, + "loss": 0.173, + "step": 64560 + }, + { + "epoch": 0.8070451761294032, + "grad_norm": 4.767170429229736, + "learning_rate": 2.183823220012431e-06, + "loss": 0.3098, + "step": 64562 + }, + { + "epoch": 0.8070701767544188, + "grad_norm": 5.281458854675293, + "learning_rate": 2.1832789032799238e-06, + "loss": 0.5995, + "step": 64564 + }, + { + "epoch": 0.8070951773794345, + "grad_norm": 2.780757427215576, + "learning_rate": 2.1827346460784615e-06, + "loss": 0.2759, + "step": 64566 + }, + { + "epoch": 0.8071201780044501, + "grad_norm": 4.296359539031982, + "learning_rate": 2.182190448412187e-06, + "loss": 0.8145, + "step": 64568 + }, + { + "epoch": 0.8071451786294658, + "grad_norm": 3.55879282951355, + "learning_rate": 2.1816463102852414e-06, + "loss": 2.0377, + "step": 64570 + }, + { + "epoch": 0.8071701792544813, + "grad_norm": 2.7157793045043945, + "learning_rate": 2.1811022317017747e-06, + "loss": 0.9282, + "step": 64572 + }, + { + "epoch": 0.807195179879497, + "grad_norm": 3.139603853225708, + "learning_rate": 2.1805582126659254e-06, + "loss": 1.2022, + "step": 64574 + }, + { + "epoch": 0.8072201805045126, + "grad_norm": 4.164053440093994, + "learning_rate": 2.1800142531818413e-06, + "loss": 1.4988, + "step": 64576 + }, + { + "epoch": 0.8072451811295283, + "grad_norm": 3.4494030475616455, + "learning_rate": 2.179470353253661e-06, + "loss": 1.3151, + "step": 64578 + }, + { + "epoch": 0.8072701817545439, + "grad_norm": 3.2887771129608154, + "learning_rate": 2.1789265128855306e-06, + "loss": 1.1953, + "step": 64580 + }, + { + "epoch": 0.8072951823795594, + "grad_norm": 3.1199889183044434, + "learning_rate": 2.17838273208159e-06, + "loss": 0.1723, + "step": 64582 + }, + { + "epoch": 0.8073201830045751, + "grad_norm": 1.303328514099121, + "learning_rate": 2.1778390108459767e-06, + "loss": 0.2495, + "step": 64584 + }, + { + "epoch": 0.8073451836295907, + "grad_norm": 2.437551259994507, + "learning_rate": 2.1772953491828395e-06, + "loss": 0.4166, + "step": 64586 + }, + { + "epoch": 0.8073701842546064, + "grad_norm": 3.7619903087615967, + "learning_rate": 2.1767517470963107e-06, + "loss": 1.6052, + "step": 64588 + }, + { + "epoch": 0.807395184879622, + "grad_norm": 5.491699695587158, + "learning_rate": 2.1762082045905375e-06, + "loss": 1.563, + "step": 64590 + }, + { + "epoch": 0.8074201855046376, + "grad_norm": 3.1080570220947266, + "learning_rate": 2.1756647216696545e-06, + "loss": 1.155, + "step": 64592 + }, + { + "epoch": 0.8074451861296532, + "grad_norm": 3.408585548400879, + "learning_rate": 2.1751212983378045e-06, + "loss": 1.4079, + "step": 64594 + }, + { + "epoch": 0.8074701867546689, + "grad_norm": 5.058828830718994, + "learning_rate": 2.1745779345991235e-06, + "loss": 0.1457, + "step": 64596 + }, + { + "epoch": 0.8074951873796845, + "grad_norm": 3.4753966331481934, + "learning_rate": 2.1740346304577488e-06, + "loss": 1.8746, + "step": 64598 + }, + { + "epoch": 0.8075201880047002, + "grad_norm": 4.846635818481445, + "learning_rate": 2.1734913859178208e-06, + "loss": 1.6495, + "step": 64600 + }, + { + "epoch": 0.8075451886297157, + "grad_norm": 6.4941253662109375, + "learning_rate": 2.1729482009834746e-06, + "loss": 0.6613, + "step": 64602 + }, + { + "epoch": 0.8075701892547313, + "grad_norm": 2.3305020332336426, + "learning_rate": 2.17240507565885e-06, + "loss": 1.4631, + "step": 64604 + }, + { + "epoch": 0.807595189879747, + "grad_norm": 3.367405414581299, + "learning_rate": 2.1718620099480815e-06, + "loss": 1.3443, + "step": 64606 + }, + { + "epoch": 0.8076201905047626, + "grad_norm": 0.6095860004425049, + "learning_rate": 2.1713190038553024e-06, + "loss": 0.2919, + "step": 64608 + }, + { + "epoch": 0.8076451911297783, + "grad_norm": 2.931415557861328, + "learning_rate": 2.1707760573846524e-06, + "loss": 1.3154, + "step": 64610 + }, + { + "epoch": 0.8076701917547938, + "grad_norm": 2.6439261436462402, + "learning_rate": 2.1702331705402637e-06, + "loss": 0.9615, + "step": 64612 + }, + { + "epoch": 0.8076951923798095, + "grad_norm": 4.009838581085205, + "learning_rate": 2.169690343326274e-06, + "loss": 1.0982, + "step": 64614 + }, + { + "epoch": 0.8077201930048251, + "grad_norm": 1.3384088277816772, + "learning_rate": 2.1691475757468116e-06, + "loss": 0.0812, + "step": 64616 + }, + { + "epoch": 0.8077451936298408, + "grad_norm": 3.340122938156128, + "learning_rate": 2.1686048678060178e-06, + "loss": 1.1849, + "step": 64618 + }, + { + "epoch": 0.8077701942548564, + "grad_norm": 2.525564432144165, + "learning_rate": 2.1680622195080215e-06, + "loss": 1.4026, + "step": 64620 + }, + { + "epoch": 0.807795194879872, + "grad_norm": 0.0004957039491273463, + "learning_rate": 2.1675196308569537e-06, + "loss": 0.0552, + "step": 64622 + }, + { + "epoch": 0.8078201955048876, + "grad_norm": 2.8713126182556152, + "learning_rate": 2.166977101856951e-06, + "loss": 1.105, + "step": 64624 + }, + { + "epoch": 0.8078451961299032, + "grad_norm": 1.541463017463684, + "learning_rate": 2.16643463251214e-06, + "loss": 0.665, + "step": 64626 + }, + { + "epoch": 0.8078701967549189, + "grad_norm": 3.784497022628784, + "learning_rate": 2.165892222826658e-06, + "loss": 1.0051, + "step": 64628 + }, + { + "epoch": 0.8078951973799345, + "grad_norm": 3.7111659049987793, + "learning_rate": 2.1653498728046295e-06, + "loss": 0.8964, + "step": 64630 + }, + { + "epoch": 0.8079201980049501, + "grad_norm": 3.2231640815734863, + "learning_rate": 2.1648075824501923e-06, + "loss": 0.7185, + "step": 64632 + }, + { + "epoch": 0.8079451986299657, + "grad_norm": 4.600854396820068, + "learning_rate": 2.1642653517674715e-06, + "loss": 1.3644, + "step": 64634 + }, + { + "epoch": 0.8079701992549814, + "grad_norm": 1.6219793558120728, + "learning_rate": 2.163723180760596e-06, + "loss": 1.6138, + "step": 64636 + }, + { + "epoch": 0.807995199879997, + "grad_norm": 2.8498706817626953, + "learning_rate": 2.1631810694336996e-06, + "loss": 0.1668, + "step": 64638 + }, + { + "epoch": 0.8080202005050127, + "grad_norm": 6.392828941345215, + "learning_rate": 2.162639017790904e-06, + "loss": 2.2305, + "step": 64640 + }, + { + "epoch": 0.8080452011300282, + "grad_norm": 0.47480082511901855, + "learning_rate": 2.162097025836345e-06, + "loss": 0.1295, + "step": 64642 + }, + { + "epoch": 0.8080702017550438, + "grad_norm": 1.589971661567688, + "learning_rate": 2.1615550935741437e-06, + "loss": 1.0491, + "step": 64644 + }, + { + "epoch": 0.8080952023800595, + "grad_norm": 4.5447998046875, + "learning_rate": 2.161013221008433e-06, + "loss": 1.032, + "step": 64646 + }, + { + "epoch": 0.8081202030050751, + "grad_norm": 16.409530639648438, + "learning_rate": 2.1604714081433366e-06, + "loss": 1.2925, + "step": 64648 + }, + { + "epoch": 0.8081452036300908, + "grad_norm": 3.7270421981811523, + "learning_rate": 2.1599296549829797e-06, + "loss": 1.6572, + "step": 64650 + }, + { + "epoch": 0.8081702042551063, + "grad_norm": 1.8048957586288452, + "learning_rate": 2.159387961531492e-06, + "loss": 1.262, + "step": 64652 + }, + { + "epoch": 0.808195204880122, + "grad_norm": 2.291590452194214, + "learning_rate": 2.158846327792994e-06, + "loss": 0.3716, + "step": 64654 + }, + { + "epoch": 0.8082202055051376, + "grad_norm": 3.6697092056274414, + "learning_rate": 2.158304753771616e-06, + "loss": 0.6582, + "step": 64656 + }, + { + "epoch": 0.8082452061301533, + "grad_norm": 3.15291428565979, + "learning_rate": 2.157763239471481e-06, + "loss": 1.2368, + "step": 64658 + }, + { + "epoch": 0.8082702067551689, + "grad_norm": 2.0251688957214355, + "learning_rate": 2.157221784896708e-06, + "loss": 0.2606, + "step": 64660 + }, + { + "epoch": 0.8082952073801845, + "grad_norm": 4.742263317108154, + "learning_rate": 2.1566803900514287e-06, + "loss": 1.5797, + "step": 64662 + }, + { + "epoch": 0.8083202080052001, + "grad_norm": 3.397014856338501, + "learning_rate": 2.1561390549397598e-06, + "loss": 0.7861, + "step": 64664 + }, + { + "epoch": 0.8083452086302158, + "grad_norm": 3.3487603664398193, + "learning_rate": 2.155597779565829e-06, + "loss": 1.6773, + "step": 64666 + }, + { + "epoch": 0.8083702092552314, + "grad_norm": 0.7261534333229065, + "learning_rate": 2.155056563933753e-06, + "loss": 0.5776, + "step": 64668 + }, + { + "epoch": 0.808395209880247, + "grad_norm": 0.012238036841154099, + "learning_rate": 2.1545154080476603e-06, + "loss": 1.1294, + "step": 64670 + }, + { + "epoch": 0.8084202105052626, + "grad_norm": 5.051850318908691, + "learning_rate": 2.1539743119116685e-06, + "loss": 1.5137, + "step": 64672 + }, + { + "epoch": 0.8084452111302782, + "grad_norm": 6.74190616607666, + "learning_rate": 2.153433275529896e-06, + "loss": 1.9225, + "step": 64674 + }, + { + "epoch": 0.8084702117552939, + "grad_norm": 0.743100106716156, + "learning_rate": 2.1528922989064684e-06, + "loss": 0.5466, + "step": 64676 + }, + { + "epoch": 0.8084952123803095, + "grad_norm": 4.23688268661499, + "learning_rate": 2.1523513820455013e-06, + "loss": 1.0479, + "step": 64678 + }, + { + "epoch": 0.8085202130053252, + "grad_norm": 4.522265911102295, + "learning_rate": 2.151810524951119e-06, + "loss": 0.9997, + "step": 64680 + }, + { + "epoch": 0.8085452136303407, + "grad_norm": 2.706270933151245, + "learning_rate": 2.1512697276274355e-06, + "loss": 0.5695, + "step": 64682 + }, + { + "epoch": 0.8085702142553564, + "grad_norm": 5.070512294769287, + "learning_rate": 2.1507289900785732e-06, + "loss": 1.8105, + "step": 64684 + }, + { + "epoch": 0.808595214880372, + "grad_norm": 2.797712802886963, + "learning_rate": 2.150188312308651e-06, + "loss": 0.2758, + "step": 64686 + }, + { + "epoch": 0.8086202155053877, + "grad_norm": 0.15902671217918396, + "learning_rate": 2.1496476943217803e-06, + "loss": 0.9741, + "step": 64688 + }, + { + "epoch": 0.8086452161304033, + "grad_norm": 2.5828311443328857, + "learning_rate": 2.149107136122085e-06, + "loss": 0.4503, + "step": 64690 + }, + { + "epoch": 0.8086702167554188, + "grad_norm": 2.9155383110046387, + "learning_rate": 2.1485666377136783e-06, + "loss": 0.6266, + "step": 64692 + }, + { + "epoch": 0.8086952173804345, + "grad_norm": 7.762085437774658, + "learning_rate": 2.148026199100679e-06, + "loss": 0.6687, + "step": 64694 + }, + { + "epoch": 0.8087202180054501, + "grad_norm": 0.280394047498703, + "learning_rate": 2.1474858202872005e-06, + "loss": 0.0034, + "step": 64696 + }, + { + "epoch": 0.8087452186304658, + "grad_norm": 10.379861831665039, + "learning_rate": 2.146945501277361e-06, + "loss": 0.3362, + "step": 64698 + }, + { + "epoch": 0.8087702192554814, + "grad_norm": 0.0005359255592338741, + "learning_rate": 2.1464052420752746e-06, + "loss": 0.7092, + "step": 64700 + }, + { + "epoch": 0.808795219880497, + "grad_norm": 3.4098942279815674, + "learning_rate": 2.1458650426850524e-06, + "loss": 1.5446, + "step": 64702 + }, + { + "epoch": 0.8088202205055126, + "grad_norm": 1.6420297622680664, + "learning_rate": 2.1453249031108146e-06, + "loss": 0.9052, + "step": 64704 + }, + { + "epoch": 0.8088452211305283, + "grad_norm": 0.4175359606742859, + "learning_rate": 2.144784823356668e-06, + "loss": 0.6743, + "step": 64706 + }, + { + "epoch": 0.8088702217555439, + "grad_norm": 3.5666775703430176, + "learning_rate": 2.144244803426733e-06, + "loss": 0.7893, + "step": 64708 + }, + { + "epoch": 0.8088952223805596, + "grad_norm": 5.424480438232422, + "learning_rate": 2.1437048433251186e-06, + "loss": 1.617, + "step": 64710 + }, + { + "epoch": 0.8089202230055751, + "grad_norm": 0.7549620270729065, + "learning_rate": 2.1431649430559343e-06, + "loss": 0.0532, + "step": 64712 + }, + { + "epoch": 0.8089452236305907, + "grad_norm": 3.657747983932495, + "learning_rate": 2.142625102623297e-06, + "loss": 0.9332, + "step": 64714 + }, + { + "epoch": 0.8089702242556064, + "grad_norm": 2.6000092029571533, + "learning_rate": 2.1420853220313132e-06, + "loss": 1.3149, + "step": 64716 + }, + { + "epoch": 0.808995224880622, + "grad_norm": 5.561631202697754, + "learning_rate": 2.1415456012841e-06, + "loss": 0.2732, + "step": 64718 + }, + { + "epoch": 0.8090202255056377, + "grad_norm": 2.9481263160705566, + "learning_rate": 2.14100594038576e-06, + "loss": 0.4448, + "step": 64720 + }, + { + "epoch": 0.8090452261306532, + "grad_norm": 0.00024146346549969167, + "learning_rate": 2.1404663393404078e-06, + "loss": 0.701, + "step": 64722 + }, + { + "epoch": 0.8090702267556689, + "grad_norm": 3.56626296043396, + "learning_rate": 2.139926798152159e-06, + "loss": 0.6025, + "step": 64724 + }, + { + "epoch": 0.8090952273806845, + "grad_norm": 2.8115956783294678, + "learning_rate": 2.1393873168251087e-06, + "loss": 0.8026, + "step": 64726 + }, + { + "epoch": 0.8091202280057002, + "grad_norm": 0.8217528462409973, + "learning_rate": 2.1388478953633775e-06, + "loss": 0.7158, + "step": 64728 + }, + { + "epoch": 0.8091452286307158, + "grad_norm": 0.4614184498786926, + "learning_rate": 2.1383085337710662e-06, + "loss": 0.0252, + "step": 64730 + }, + { + "epoch": 0.8091702292557313, + "grad_norm": 0.45951783657073975, + "learning_rate": 2.137769232052288e-06, + "loss": 0.7563, + "step": 64732 + }, + { + "epoch": 0.809195229880747, + "grad_norm": 3.248896837234497, + "learning_rate": 2.137229990211144e-06, + "loss": 1.0719, + "step": 64734 + }, + { + "epoch": 0.8092202305057626, + "grad_norm": 0.0003742569242604077, + "learning_rate": 2.136690808251748e-06, + "loss": 1.7794, + "step": 64736 + }, + { + "epoch": 0.8092452311307783, + "grad_norm": 3.7484230995178223, + "learning_rate": 2.136151686178203e-06, + "loss": 0.9246, + "step": 64738 + }, + { + "epoch": 0.8092702317557939, + "grad_norm": 3.8353848457336426, + "learning_rate": 2.1356126239946107e-06, + "loss": 0.9318, + "step": 64740 + }, + { + "epoch": 0.8092952323808095, + "grad_norm": 2.760713577270508, + "learning_rate": 2.135073621705085e-06, + "loss": 1.1271, + "step": 64742 + }, + { + "epoch": 0.8093202330058251, + "grad_norm": 4.1183390617370605, + "learning_rate": 2.134534679313722e-06, + "loss": 1.0478, + "step": 64744 + }, + { + "epoch": 0.8093452336308408, + "grad_norm": 4.760234832763672, + "learning_rate": 2.1339957968246317e-06, + "loss": 0.756, + "step": 64746 + }, + { + "epoch": 0.8093702342558564, + "grad_norm": 3.349491596221924, + "learning_rate": 2.1334569742419198e-06, + "loss": 1.0461, + "step": 64748 + }, + { + "epoch": 0.8093952348808721, + "grad_norm": 0.0003062996838707477, + "learning_rate": 2.132918211569687e-06, + "loss": 0.5487, + "step": 64750 + }, + { + "epoch": 0.8094202355058876, + "grad_norm": 3.0259077548980713, + "learning_rate": 2.132379508812036e-06, + "loss": 0.7798, + "step": 64752 + }, + { + "epoch": 0.8094452361309032, + "grad_norm": 3.5655739307403564, + "learning_rate": 2.1318408659730685e-06, + "loss": 1.855, + "step": 64754 + }, + { + "epoch": 0.8094702367559189, + "grad_norm": 3.8462088108062744, + "learning_rate": 2.13130228305689e-06, + "loss": 1.4946, + "step": 64756 + }, + { + "epoch": 0.8094952373809345, + "grad_norm": 1.0761148929595947, + "learning_rate": 2.1307637600675977e-06, + "loss": 0.0486, + "step": 64758 + }, + { + "epoch": 0.8095202380059502, + "grad_norm": 3.781545639038086, + "learning_rate": 2.1302252970092963e-06, + "loss": 1.3388, + "step": 64760 + }, + { + "epoch": 0.8095452386309657, + "grad_norm": 1.571448564529419, + "learning_rate": 2.1296868938860917e-06, + "loss": 0.5245, + "step": 64762 + }, + { + "epoch": 0.8095702392559814, + "grad_norm": 0.013084516860544682, + "learning_rate": 2.1291485507020736e-06, + "loss": 0.6897, + "step": 64764 + }, + { + "epoch": 0.809595239880997, + "grad_norm": 3.3635451793670654, + "learning_rate": 2.1286102674613495e-06, + "loss": 1.2066, + "step": 64766 + }, + { + "epoch": 0.8096202405060127, + "grad_norm": 1.3444055318832397, + "learning_rate": 2.1280720441680135e-06, + "loss": 0.4456, + "step": 64768 + }, + { + "epoch": 0.8096452411310283, + "grad_norm": 4.628678321838379, + "learning_rate": 2.12753388082617e-06, + "loss": 0.8691, + "step": 64770 + }, + { + "epoch": 0.8096702417560439, + "grad_norm": 0.7491821646690369, + "learning_rate": 2.126995777439913e-06, + "loss": 0.5915, + "step": 64772 + }, + { + "epoch": 0.8096952423810595, + "grad_norm": 3.9101779460906982, + "learning_rate": 2.126457734013343e-06, + "loss": 1.5854, + "step": 64774 + }, + { + "epoch": 0.8097202430060751, + "grad_norm": 4.200852394104004, + "learning_rate": 2.1259197505505626e-06, + "loss": 1.2357, + "step": 64776 + }, + { + "epoch": 0.8097452436310908, + "grad_norm": 2.4284863471984863, + "learning_rate": 2.1253818270556592e-06, + "loss": 1.0109, + "step": 64778 + }, + { + "epoch": 0.8097702442561064, + "grad_norm": 0.0002736454480327666, + "learning_rate": 2.1248439635327358e-06, + "loss": 0.6678, + "step": 64780 + }, + { + "epoch": 0.809795244881122, + "grad_norm": 2.53973388671875, + "learning_rate": 2.124306159985885e-06, + "loss": 0.8585, + "step": 64782 + }, + { + "epoch": 0.8098202455061376, + "grad_norm": 4.494327068328857, + "learning_rate": 2.123768416419204e-06, + "loss": 1.3786, + "step": 64784 + }, + { + "epoch": 0.8098452461311533, + "grad_norm": 3.4998326301574707, + "learning_rate": 2.123230732836793e-06, + "loss": 1.5298, + "step": 64786 + }, + { + "epoch": 0.8098702467561689, + "grad_norm": 2.4371840953826904, + "learning_rate": 2.1226931092427418e-06, + "loss": 0.5382, + "step": 64788 + }, + { + "epoch": 0.8098952473811846, + "grad_norm": 3.7707765102386475, + "learning_rate": 2.122155545641147e-06, + "loss": 1.4924, + "step": 64790 + }, + { + "epoch": 0.8099202480062001, + "grad_norm": 3.121812582015991, + "learning_rate": 2.1216180420360977e-06, + "loss": 0.7067, + "step": 64792 + }, + { + "epoch": 0.8099452486312158, + "grad_norm": 4.179256916046143, + "learning_rate": 2.121080598431695e-06, + "loss": 1.382, + "step": 64794 + }, + { + "epoch": 0.8099702492562314, + "grad_norm": 0.00037143335794098675, + "learning_rate": 2.120543214832025e-06, + "loss": 0.4645, + "step": 64796 + }, + { + "epoch": 0.809995249881247, + "grad_norm": 4.700697898864746, + "learning_rate": 2.1200058912411836e-06, + "loss": 1.1183, + "step": 64798 + }, + { + "epoch": 0.8100202505062627, + "grad_norm": 2.786644697189331, + "learning_rate": 2.1194686276632658e-06, + "loss": 1.333, + "step": 64800 + }, + { + "epoch": 0.8100452511312782, + "grad_norm": 1.1502251625061035, + "learning_rate": 2.1189314241023608e-06, + "loss": 0.0764, + "step": 64802 + }, + { + "epoch": 0.8100702517562939, + "grad_norm": 5.802058696746826, + "learning_rate": 2.118394280562559e-06, + "loss": 1.5713, + "step": 64804 + }, + { + "epoch": 0.8100952523813095, + "grad_norm": 3.4974234104156494, + "learning_rate": 2.117857197047949e-06, + "loss": 1.8088, + "step": 64806 + }, + { + "epoch": 0.8101202530063252, + "grad_norm": 1.4131097793579102, + "learning_rate": 2.117320173562625e-06, + "loss": 0.2182, + "step": 64808 + }, + { + "epoch": 0.8101452536313408, + "grad_norm": 0.001107160234823823, + "learning_rate": 2.116783210110677e-06, + "loss": 0.7519, + "step": 64810 + }, + { + "epoch": 0.8101702542563564, + "grad_norm": 2.9057559967041016, + "learning_rate": 2.1162463066961914e-06, + "loss": 1.0642, + "step": 64812 + }, + { + "epoch": 0.810195254881372, + "grad_norm": 2.301187753677368, + "learning_rate": 2.115709463323262e-06, + "loss": 0.943, + "step": 64814 + }, + { + "epoch": 0.8102202555063877, + "grad_norm": 2.5917463302612305, + "learning_rate": 2.1151726799959737e-06, + "loss": 0.9767, + "step": 64816 + }, + { + "epoch": 0.8102452561314033, + "grad_norm": 3.7387192249298096, + "learning_rate": 2.114635956718416e-06, + "loss": 0.7053, + "step": 64818 + }, + { + "epoch": 0.810270256756419, + "grad_norm": 4.077371597290039, + "learning_rate": 2.1140992934946737e-06, + "loss": 1.1366, + "step": 64820 + }, + { + "epoch": 0.8102952573814345, + "grad_norm": 2.1453773975372314, + "learning_rate": 2.1135626903288342e-06, + "loss": 0.1209, + "step": 64822 + }, + { + "epoch": 0.8103202580064501, + "grad_norm": 4.949993133544922, + "learning_rate": 2.11302614722499e-06, + "loss": 0.7506, + "step": 64824 + }, + { + "epoch": 0.8103452586314658, + "grad_norm": 3.5421993732452393, + "learning_rate": 2.1124896641872204e-06, + "loss": 1.6595, + "step": 64826 + }, + { + "epoch": 0.8103702592564814, + "grad_norm": 1.9963483810424805, + "learning_rate": 2.11195324121962e-06, + "loss": 1.0608, + "step": 64828 + }, + { + "epoch": 0.8103952598814971, + "grad_norm": 4.55129337310791, + "learning_rate": 2.1114168783262613e-06, + "loss": 1.3403, + "step": 64830 + }, + { + "epoch": 0.8104202605065126, + "grad_norm": 5.101686954498291, + "learning_rate": 2.1108805755112373e-06, + "loss": 1.5667, + "step": 64832 + }, + { + "epoch": 0.8104452611315283, + "grad_norm": 1.7683185338974, + "learning_rate": 2.110344332778633e-06, + "loss": 0.808, + "step": 64834 + }, + { + "epoch": 0.8104702617565439, + "grad_norm": 1.513716697692871, + "learning_rate": 2.1098081501325296e-06, + "loss": 0.8003, + "step": 64836 + }, + { + "epoch": 0.8104952623815596, + "grad_norm": 2.213721513748169, + "learning_rate": 2.109272027577013e-06, + "loss": 0.8769, + "step": 64838 + }, + { + "epoch": 0.8105202630065752, + "grad_norm": 1.749337911605835, + "learning_rate": 2.1087359651161623e-06, + "loss": 0.6949, + "step": 64840 + }, + { + "epoch": 0.8105452636315907, + "grad_norm": 3.8602256774902344, + "learning_rate": 2.1081999627540673e-06, + "loss": 1.2321, + "step": 64842 + }, + { + "epoch": 0.8105702642566064, + "grad_norm": 2.908550977706909, + "learning_rate": 2.1076640204948007e-06, + "loss": 1.0781, + "step": 64844 + }, + { + "epoch": 0.810595264881622, + "grad_norm": 4.642405986785889, + "learning_rate": 2.107128138342449e-06, + "loss": 1.1126, + "step": 64846 + }, + { + "epoch": 0.8106202655066377, + "grad_norm": 6.291177749633789, + "learning_rate": 2.106592316301096e-06, + "loss": 0.8143, + "step": 64848 + }, + { + "epoch": 0.8106452661316533, + "grad_norm": 2.4375598430633545, + "learning_rate": 2.106056554374817e-06, + "loss": 1.1825, + "step": 64850 + }, + { + "epoch": 0.8106702667566689, + "grad_norm": 3.8743560314178467, + "learning_rate": 2.1055208525676974e-06, + "loss": 1.74, + "step": 64852 + }, + { + "epoch": 0.8106952673816845, + "grad_norm": 4.339580059051514, + "learning_rate": 2.104985210883814e-06, + "loss": 0.7185, + "step": 64854 + }, + { + "epoch": 0.8107202680067002, + "grad_norm": 5.308068752288818, + "learning_rate": 2.1044496293272474e-06, + "loss": 0.5488, + "step": 64856 + }, + { + "epoch": 0.8107452686317158, + "grad_norm": 0.00031403079628944397, + "learning_rate": 2.103914107902073e-06, + "loss": 0.0007, + "step": 64858 + }, + { + "epoch": 0.8107702692567315, + "grad_norm": 0.0002666909131221473, + "learning_rate": 2.103378646612373e-06, + "loss": 0.0975, + "step": 64860 + }, + { + "epoch": 0.810795269881747, + "grad_norm": 4.733613967895508, + "learning_rate": 2.102843245462226e-06, + "loss": 1.3396, + "step": 64862 + }, + { + "epoch": 0.8108202705067626, + "grad_norm": 4.1614789962768555, + "learning_rate": 2.102307904455706e-06, + "loss": 2.0753, + "step": 64864 + }, + { + "epoch": 0.8108452711317783, + "grad_norm": 3.5520224571228027, + "learning_rate": 2.101772623596895e-06, + "loss": 1.1432, + "step": 64866 + }, + { + "epoch": 0.8108702717567939, + "grad_norm": 2.3231687545776367, + "learning_rate": 2.1012374028898664e-06, + "loss": 0.9457, + "step": 64868 + }, + { + "epoch": 0.8108952723818096, + "grad_norm": 2.3082242012023926, + "learning_rate": 2.1007022423386937e-06, + "loss": 0.9022, + "step": 64870 + }, + { + "epoch": 0.8109202730068251, + "grad_norm": 3.409868001937866, + "learning_rate": 2.100167141947459e-06, + "loss": 0.1668, + "step": 64872 + }, + { + "epoch": 0.8109452736318408, + "grad_norm": 4.656825065612793, + "learning_rate": 2.0996321017202313e-06, + "loss": 1.949, + "step": 64874 + }, + { + "epoch": 0.8109702742568564, + "grad_norm": 2.6957333087921143, + "learning_rate": 2.099097121661091e-06, + "loss": 0.6746, + "step": 64876 + }, + { + "epoch": 0.8109952748818721, + "grad_norm": 8.449694633483887, + "learning_rate": 2.098562201774108e-06, + "loss": 2.2213, + "step": 64878 + }, + { + "epoch": 0.8110202755068877, + "grad_norm": 3.2269299030303955, + "learning_rate": 2.0980273420633622e-06, + "loss": 0.6725, + "step": 64880 + }, + { + "epoch": 0.8110452761319032, + "grad_norm": 2.9878833293914795, + "learning_rate": 2.0974925425329172e-06, + "loss": 0.9872, + "step": 64882 + }, + { + "epoch": 0.8110702767569189, + "grad_norm": 3.0287928581237793, + "learning_rate": 2.096957803186852e-06, + "loss": 0.2813, + "step": 64884 + }, + { + "epoch": 0.8110952773819345, + "grad_norm": 1.1295133829116821, + "learning_rate": 2.096423124029241e-06, + "loss": 1.1124, + "step": 64886 + }, + { + "epoch": 0.8111202780069502, + "grad_norm": 9.009883880615234, + "learning_rate": 2.095888505064152e-06, + "loss": 0.4878, + "step": 64888 + }, + { + "epoch": 0.8111452786319658, + "grad_norm": 2.9493191242218018, + "learning_rate": 2.0953539462956597e-06, + "loss": 1.0581, + "step": 64890 + }, + { + "epoch": 0.8111702792569814, + "grad_norm": 1.143957495689392, + "learning_rate": 2.0948194477278314e-06, + "loss": 0.5983, + "step": 64892 + }, + { + "epoch": 0.811195279881997, + "grad_norm": 4.035531044006348, + "learning_rate": 2.094285009364744e-06, + "loss": 1.0665, + "step": 64894 + }, + { + "epoch": 0.8112202805070127, + "grad_norm": 4.107020378112793, + "learning_rate": 2.093750631210464e-06, + "loss": 0.7025, + "step": 64896 + }, + { + "epoch": 0.8112452811320283, + "grad_norm": 3.179515838623047, + "learning_rate": 2.0932163132690574e-06, + "loss": 0.733, + "step": 64898 + }, + { + "epoch": 0.811270281757044, + "grad_norm": 4.602647304534912, + "learning_rate": 2.0926820555446004e-06, + "loss": 0.4649, + "step": 64900 + }, + { + "epoch": 0.8112952823820595, + "grad_norm": 3.207775115966797, + "learning_rate": 2.0921478580411557e-06, + "loss": 1.0379, + "step": 64902 + }, + { + "epoch": 0.8113202830070751, + "grad_norm": 2.1554925441741943, + "learning_rate": 2.0916137207627983e-06, + "loss": 1.2663, + "step": 64904 + }, + { + "epoch": 0.8113452836320908, + "grad_norm": 0.9713231921195984, + "learning_rate": 2.0910796437135915e-06, + "loss": 0.6035, + "step": 64906 + }, + { + "epoch": 0.8113702842571064, + "grad_norm": 3.422964096069336, + "learning_rate": 2.090545626897601e-06, + "loss": 1.0654, + "step": 64908 + }, + { + "epoch": 0.8113952848821221, + "grad_norm": 2.656816244125366, + "learning_rate": 2.0900116703189e-06, + "loss": 1.2165, + "step": 64910 + }, + { + "epoch": 0.8114202855071376, + "grad_norm": 3.9154675006866455, + "learning_rate": 2.0894777739815488e-06, + "loss": 1.5745, + "step": 64912 + }, + { + "epoch": 0.8114452861321533, + "grad_norm": 0.0004963506944477558, + "learning_rate": 2.088943937889618e-06, + "loss": 0.5521, + "step": 64914 + }, + { + "epoch": 0.8114702867571689, + "grad_norm": 3.825993299484253, + "learning_rate": 2.0884101620471696e-06, + "loss": 0.8786, + "step": 64916 + }, + { + "epoch": 0.8114952873821846, + "grad_norm": 4.557299613952637, + "learning_rate": 2.087876446458272e-06, + "loss": 1.7292, + "step": 64918 + }, + { + "epoch": 0.8115202880072002, + "grad_norm": 3.863065004348755, + "learning_rate": 2.0873427911269904e-06, + "loss": 1.3648, + "step": 64920 + }, + { + "epoch": 0.8115452886322158, + "grad_norm": 5.594873428344727, + "learning_rate": 2.0868091960573835e-06, + "loss": 1.1502, + "step": 64922 + }, + { + "epoch": 0.8115702892572314, + "grad_norm": 2.7565550804138184, + "learning_rate": 2.0862756612535216e-06, + "loss": 0.4709, + "step": 64924 + }, + { + "epoch": 0.811595289882247, + "grad_norm": 5.148769378662109, + "learning_rate": 2.0857421867194615e-06, + "loss": 1.7472, + "step": 64926 + }, + { + "epoch": 0.8116202905072627, + "grad_norm": 1.6611214876174927, + "learning_rate": 2.085208772459274e-06, + "loss": 1.3403, + "step": 64928 + }, + { + "epoch": 0.8116452911322783, + "grad_norm": 0.7488422393798828, + "learning_rate": 2.0846754184770135e-06, + "loss": 0.2591, + "step": 64930 + }, + { + "epoch": 0.8116702917572939, + "grad_norm": 1.8055102825164795, + "learning_rate": 2.084142124776749e-06, + "loss": 0.2278, + "step": 64932 + }, + { + "epoch": 0.8116952923823095, + "grad_norm": 3.543928623199463, + "learning_rate": 2.0836088913625386e-06, + "loss": 0.7306, + "step": 64934 + }, + { + "epoch": 0.8117202930073252, + "grad_norm": 6.70405912399292, + "learning_rate": 2.0830757182384396e-06, + "loss": 2.0732, + "step": 64936 + }, + { + "epoch": 0.8117452936323408, + "grad_norm": 3.6981985569000244, + "learning_rate": 2.082542605408521e-06, + "loss": 0.7751, + "step": 64938 + }, + { + "epoch": 0.8117702942573565, + "grad_norm": 2.149648904800415, + "learning_rate": 2.0820095528768337e-06, + "loss": 2.0132, + "step": 64940 + }, + { + "epoch": 0.811795294882372, + "grad_norm": 2.664952516555786, + "learning_rate": 2.0814765606474453e-06, + "loss": 0.2007, + "step": 64942 + }, + { + "epoch": 0.8118202955073877, + "grad_norm": 3.1190974712371826, + "learning_rate": 2.080943628724409e-06, + "loss": 0.6128, + "step": 64944 + }, + { + "epoch": 0.8118452961324033, + "grad_norm": 2.0880727767944336, + "learning_rate": 2.0804107571117893e-06, + "loss": 0.9169, + "step": 64946 + }, + { + "epoch": 0.811870296757419, + "grad_norm": 3.284470558166504, + "learning_rate": 2.079877945813641e-06, + "loss": 0.9656, + "step": 64948 + }, + { + "epoch": 0.8118952973824346, + "grad_norm": 3.777500867843628, + "learning_rate": 2.079345194834019e-06, + "loss": 0.5874, + "step": 64950 + }, + { + "epoch": 0.8119202980074501, + "grad_norm": 3.4836068153381348, + "learning_rate": 2.078812504176988e-06, + "loss": 0.9808, + "step": 64952 + }, + { + "epoch": 0.8119452986324658, + "grad_norm": 0.0034553399309515953, + "learning_rate": 2.078279873846597e-06, + "loss": 0.3733, + "step": 64954 + }, + { + "epoch": 0.8119702992574814, + "grad_norm": 2.966916084289551, + "learning_rate": 2.0777473038469086e-06, + "loss": 1.0075, + "step": 64956 + }, + { + "epoch": 0.8119952998824971, + "grad_norm": 3.247316360473633, + "learning_rate": 2.0772147941819777e-06, + "loss": 1.2648, + "step": 64958 + }, + { + "epoch": 0.8120203005075127, + "grad_norm": 7.15998649597168, + "learning_rate": 2.0766823448558547e-06, + "loss": 0.3248, + "step": 64960 + }, + { + "epoch": 0.8120453011325283, + "grad_norm": 3.8265790939331055, + "learning_rate": 2.076149955872602e-06, + "loss": 1.1716, + "step": 64962 + }, + { + "epoch": 0.8120703017575439, + "grad_norm": 3.431586503982544, + "learning_rate": 2.075617627236268e-06, + "loss": 1.0082, + "step": 64964 + }, + { + "epoch": 0.8120953023825596, + "grad_norm": 3.487589120864868, + "learning_rate": 2.0750853589509125e-06, + "loss": 1.6075, + "step": 64966 + }, + { + "epoch": 0.8121203030075752, + "grad_norm": 3.4701883792877197, + "learning_rate": 2.074553151020582e-06, + "loss": 1.4986, + "step": 64968 + }, + { + "epoch": 0.8121453036325909, + "grad_norm": 3.648547887802124, + "learning_rate": 2.074021003449338e-06, + "loss": 1.1034, + "step": 64970 + }, + { + "epoch": 0.8121703042576064, + "grad_norm": 0.0013715199893340468, + "learning_rate": 2.073488916241229e-06, + "loss": 0.281, + "step": 64972 + }, + { + "epoch": 0.812195304882622, + "grad_norm": 4.903792381286621, + "learning_rate": 2.072956889400305e-06, + "loss": 0.2826, + "step": 64974 + }, + { + "epoch": 0.8122203055076377, + "grad_norm": 0.7690901160240173, + "learning_rate": 2.0724249229306226e-06, + "loss": 0.4844, + "step": 64976 + }, + { + "epoch": 0.8122453061326533, + "grad_norm": 0.04480157047510147, + "learning_rate": 2.0718930168362274e-06, + "loss": 0.6648, + "step": 64978 + }, + { + "epoch": 0.812270306757669, + "grad_norm": 6.067092418670654, + "learning_rate": 2.071361171121178e-06, + "loss": 2.4247, + "step": 64980 + }, + { + "epoch": 0.8122953073826845, + "grad_norm": 3.561228036880493, + "learning_rate": 2.0708293857895168e-06, + "loss": 2.0633, + "step": 64982 + }, + { + "epoch": 0.8123203080077002, + "grad_norm": 1.988223671913147, + "learning_rate": 2.070297660845302e-06, + "loss": 0.9629, + "step": 64984 + }, + { + "epoch": 0.8123453086327158, + "grad_norm": 3.480933666229248, + "learning_rate": 2.0697659962925774e-06, + "loss": 1.1734, + "step": 64986 + }, + { + "epoch": 0.8123703092577315, + "grad_norm": 4.486696243286133, + "learning_rate": 2.069234392135391e-06, + "loss": 1.1565, + "step": 64988 + }, + { + "epoch": 0.8123953098827471, + "grad_norm": 0.655378520488739, + "learning_rate": 2.068702848377797e-06, + "loss": 0.1673, + "step": 64990 + }, + { + "epoch": 0.8124203105077626, + "grad_norm": 2.5535073280334473, + "learning_rate": 2.0681713650238378e-06, + "loss": 0.817, + "step": 64992 + }, + { + "epoch": 0.8124453111327783, + "grad_norm": 0.00023460100055672228, + "learning_rate": 2.067639942077567e-06, + "loss": 0.0, + "step": 64994 + }, + { + "epoch": 0.8124703117577939, + "grad_norm": 0.00035782670602202415, + "learning_rate": 2.0671085795430256e-06, + "loss": 0.0013, + "step": 64996 + }, + { + "epoch": 0.8124953123828096, + "grad_norm": 6.9333906173706055, + "learning_rate": 2.0665772774242656e-06, + "loss": 1.3457, + "step": 64998 + }, + { + "epoch": 0.8125203130078252, + "grad_norm": 0.16817088425159454, + "learning_rate": 2.066046035725332e-06, + "loss": 0.1143, + "step": 65000 + }, + { + "epoch": 0.8125453136328408, + "grad_norm": 0.001236241078004241, + "learning_rate": 2.065514854450267e-06, + "loss": 0.5487, + "step": 65002 + }, + { + "epoch": 0.8125703142578564, + "grad_norm": 3.1949775218963623, + "learning_rate": 2.064983733603121e-06, + "loss": 1.1495, + "step": 65004 + }, + { + "epoch": 0.8125953148828721, + "grad_norm": 3.454172134399414, + "learning_rate": 2.0644526731879342e-06, + "loss": 0.8657, + "step": 65006 + }, + { + "epoch": 0.8126203155078877, + "grad_norm": 3.099663734436035, + "learning_rate": 2.0639216732087564e-06, + "loss": 0.865, + "step": 65008 + }, + { + "epoch": 0.8126453161329034, + "grad_norm": 0.0004056219768244773, + "learning_rate": 2.063390733669629e-06, + "loss": 0.5277, + "step": 65010 + }, + { + "epoch": 0.8126703167579189, + "grad_norm": 3.723021984100342, + "learning_rate": 2.0628598545745925e-06, + "loss": 0.2629, + "step": 65012 + }, + { + "epoch": 0.8126953173829345, + "grad_norm": 5.327544689178467, + "learning_rate": 2.062329035927696e-06, + "loss": 1.1991, + "step": 65014 + }, + { + "epoch": 0.8127203180079502, + "grad_norm": 2.361414909362793, + "learning_rate": 2.061798277732977e-06, + "loss": 0.4657, + "step": 65016 + }, + { + "epoch": 0.8127453186329658, + "grad_norm": 6.952465534210205, + "learning_rate": 2.0612675799944814e-06, + "loss": 1.2904, + "step": 65018 + }, + { + "epoch": 0.8127703192579815, + "grad_norm": 2.7830371856689453, + "learning_rate": 2.0607369427162472e-06, + "loss": 0.1761, + "step": 65020 + }, + { + "epoch": 0.812795319882997, + "grad_norm": 2.7387008666992188, + "learning_rate": 2.0602063659023195e-06, + "loss": 1.2937, + "step": 65022 + }, + { + "epoch": 0.8128203205080127, + "grad_norm": 3.0337929725646973, + "learning_rate": 2.059675849556738e-06, + "loss": 0.9813, + "step": 65024 + }, + { + "epoch": 0.8128453211330283, + "grad_norm": 4.424925804138184, + "learning_rate": 2.059145393683539e-06, + "loss": 1.115, + "step": 65026 + }, + { + "epoch": 0.812870321758044, + "grad_norm": 0.41689038276672363, + "learning_rate": 2.05861499828677e-06, + "loss": 0.0633, + "step": 65028 + }, + { + "epoch": 0.8128953223830596, + "grad_norm": 1.2096583843231201, + "learning_rate": 2.058084663370462e-06, + "loss": 0.6455, + "step": 65030 + }, + { + "epoch": 0.8129203230080752, + "grad_norm": 3.5812408924102783, + "learning_rate": 2.0575543889386617e-06, + "loss": 1.3061, + "step": 65032 + }, + { + "epoch": 0.8129453236330908, + "grad_norm": 2.3594613075256348, + "learning_rate": 2.0570241749954003e-06, + "loss": 0.4995, + "step": 65034 + }, + { + "epoch": 0.8129703242581064, + "grad_norm": 1.749715805053711, + "learning_rate": 2.0564940215447226e-06, + "loss": 0.3075, + "step": 65036 + }, + { + "epoch": 0.8129953248831221, + "grad_norm": 2.3954501152038574, + "learning_rate": 2.055963928590664e-06, + "loss": 0.244, + "step": 65038 + }, + { + "epoch": 0.8130203255081377, + "grad_norm": 3.633943796157837, + "learning_rate": 2.055433896137258e-06, + "loss": 0.8999, + "step": 65040 + }, + { + "epoch": 0.8130453261331533, + "grad_norm": 3.011323928833008, + "learning_rate": 2.0549039241885447e-06, + "loss": 1.2987, + "step": 65042 + }, + { + "epoch": 0.8130703267581689, + "grad_norm": 1.0841937065124512, + "learning_rate": 2.054374012748559e-06, + "loss": 0.2242, + "step": 65044 + }, + { + "epoch": 0.8130953273831846, + "grad_norm": 0.0015530207892879844, + "learning_rate": 2.0538441618213388e-06, + "loss": 0.3959, + "step": 65046 + }, + { + "epoch": 0.8131203280082002, + "grad_norm": 2.7126946449279785, + "learning_rate": 2.053314371410916e-06, + "loss": 1.0241, + "step": 65048 + }, + { + "epoch": 0.8131453286332159, + "grad_norm": 0.0003000391006935388, + "learning_rate": 2.0527846415213294e-06, + "loss": 0.1163, + "step": 65050 + }, + { + "epoch": 0.8131703292582314, + "grad_norm": 5.963180065155029, + "learning_rate": 2.052254972156611e-06, + "loss": 1.2563, + "step": 65052 + }, + { + "epoch": 0.813195329883247, + "grad_norm": 2.2043988704681396, + "learning_rate": 2.051725363320792e-06, + "loss": 0.7045, + "step": 65054 + }, + { + "epoch": 0.8132203305082627, + "grad_norm": 2.6620235443115234, + "learning_rate": 2.051195815017911e-06, + "loss": 0.5642, + "step": 65056 + }, + { + "epoch": 0.8132453311332783, + "grad_norm": 4.958466053009033, + "learning_rate": 2.0506663272519966e-06, + "loss": 0.9235, + "step": 65058 + }, + { + "epoch": 0.813270331758294, + "grad_norm": 2.746786594390869, + "learning_rate": 2.0501369000270865e-06, + "loss": 0.4473, + "step": 65060 + }, + { + "epoch": 0.8132953323833095, + "grad_norm": 5.942471504211426, + "learning_rate": 2.0496075333472055e-06, + "loss": 2.2774, + "step": 65062 + }, + { + "epoch": 0.8133203330083252, + "grad_norm": 6.280802249908447, + "learning_rate": 2.0490782272163935e-06, + "loss": 2.0229, + "step": 65064 + }, + { + "epoch": 0.8133453336333408, + "grad_norm": 5.649764537811279, + "learning_rate": 2.048548981638676e-06, + "loss": 0.9279, + "step": 65066 + }, + { + "epoch": 0.8133703342583565, + "grad_norm": 4.653924942016602, + "learning_rate": 2.0480197966180824e-06, + "loss": 0.8236, + "step": 65068 + }, + { + "epoch": 0.8133953348833721, + "grad_norm": 3.7415645122528076, + "learning_rate": 2.0474906721586496e-06, + "loss": 1.9898, + "step": 65070 + }, + { + "epoch": 0.8134203355083877, + "grad_norm": 0.0004423638165462762, + "learning_rate": 2.0469616082643993e-06, + "loss": 0.1082, + "step": 65072 + }, + { + "epoch": 0.8134453361334033, + "grad_norm": 3.672919750213623, + "learning_rate": 2.0464326049393677e-06, + "loss": 1.0221, + "step": 65074 + }, + { + "epoch": 0.813470336758419, + "grad_norm": 2.289686918258667, + "learning_rate": 2.0459036621875807e-06, + "loss": 0.9167, + "step": 65076 + }, + { + "epoch": 0.8134953373834346, + "grad_norm": 4.922574043273926, + "learning_rate": 2.045374780013064e-06, + "loss": 1.2327, + "step": 65078 + }, + { + "epoch": 0.8135203380084503, + "grad_norm": 7.78794527053833, + "learning_rate": 2.04484595841985e-06, + "loss": 2.0481, + "step": 65080 + }, + { + "epoch": 0.8135453386334658, + "grad_norm": 3.151050567626953, + "learning_rate": 2.0443171974119626e-06, + "loss": 1.0474, + "step": 65082 + }, + { + "epoch": 0.8135703392584814, + "grad_norm": 4.9604692459106445, + "learning_rate": 2.043788496993433e-06, + "loss": 0.6392, + "step": 65084 + }, + { + "epoch": 0.8135953398834971, + "grad_norm": 2.1711034774780273, + "learning_rate": 2.0432598571682817e-06, + "loss": 1.3022, + "step": 65086 + }, + { + "epoch": 0.8136203405085127, + "grad_norm": 2.7460241317749023, + "learning_rate": 2.0427312779405384e-06, + "loss": 0.3561, + "step": 65088 + }, + { + "epoch": 0.8136453411335284, + "grad_norm": 1.9569387435913086, + "learning_rate": 2.0422027593142336e-06, + "loss": 0.3568, + "step": 65090 + }, + { + "epoch": 0.8136703417585439, + "grad_norm": 3.6768429279327393, + "learning_rate": 2.0416743012933827e-06, + "loss": 1.129, + "step": 65092 + }, + { + "epoch": 0.8136953423835596, + "grad_norm": 3.9250895977020264, + "learning_rate": 2.041145903882017e-06, + "loss": 0.65, + "step": 65094 + }, + { + "epoch": 0.8137203430085752, + "grad_norm": 2.797905921936035, + "learning_rate": 2.0406175670841556e-06, + "loss": 0.851, + "step": 65096 + }, + { + "epoch": 0.8137453436335909, + "grad_norm": 1.0316325426101685, + "learning_rate": 2.0400892909038283e-06, + "loss": 1.3385, + "step": 65098 + }, + { + "epoch": 0.8137703442586065, + "grad_norm": 1.9791302680969238, + "learning_rate": 2.0395610753450536e-06, + "loss": 0.3837, + "step": 65100 + }, + { + "epoch": 0.813795344883622, + "grad_norm": 5.522444725036621, + "learning_rate": 2.0390329204118573e-06, + "loss": 0.939, + "step": 65102 + }, + { + "epoch": 0.8138203455086377, + "grad_norm": 3.0484657287597656, + "learning_rate": 2.0385048261082605e-06, + "loss": 1.6348, + "step": 65104 + }, + { + "epoch": 0.8138453461336533, + "grad_norm": 2.064479351043701, + "learning_rate": 2.037976792438283e-06, + "loss": 0.0646, + "step": 65106 + }, + { + "epoch": 0.813870346758669, + "grad_norm": 1.0070501565933228, + "learning_rate": 2.0374488194059515e-06, + "loss": 0.0126, + "step": 65108 + }, + { + "epoch": 0.8138953473836846, + "grad_norm": 7.219589710235596, + "learning_rate": 2.0369209070152805e-06, + "loss": 0.9172, + "step": 65110 + }, + { + "epoch": 0.8139203480087002, + "grad_norm": 2.2756030559539795, + "learning_rate": 2.036393055270296e-06, + "loss": 0.1007, + "step": 65112 + }, + { + "epoch": 0.8139453486337158, + "grad_norm": 0.15038232505321503, + "learning_rate": 2.0358652641750144e-06, + "loss": 0.0029, + "step": 65114 + }, + { + "epoch": 0.8139703492587315, + "grad_norm": 3.016378402709961, + "learning_rate": 2.035337533733458e-06, + "loss": 0.4486, + "step": 65116 + }, + { + "epoch": 0.8139953498837471, + "grad_norm": 0.7986680865287781, + "learning_rate": 2.034809863949646e-06, + "loss": 0.6757, + "step": 65118 + }, + { + "epoch": 0.8140203505087628, + "grad_norm": 0.16626757383346558, + "learning_rate": 2.0342822548275922e-06, + "loss": 0.5101, + "step": 65120 + }, + { + "epoch": 0.8140453511337783, + "grad_norm": 2.174020290374756, + "learning_rate": 2.0337547063713203e-06, + "loss": 0.1528, + "step": 65122 + }, + { + "epoch": 0.8140703517587939, + "grad_norm": 7.38098669052124, + "learning_rate": 2.0332272185848446e-06, + "loss": 1.33, + "step": 65124 + }, + { + "epoch": 0.8140953523838096, + "grad_norm": 0.34484100341796875, + "learning_rate": 2.0326997914721834e-06, + "loss": 0.3141, + "step": 65126 + }, + { + "epoch": 0.8141203530088252, + "grad_norm": 0.0045156823471188545, + "learning_rate": 2.032172425037359e-06, + "loss": 0.4346, + "step": 65128 + }, + { + "epoch": 0.8141453536338409, + "grad_norm": 3.54459547996521, + "learning_rate": 2.031645119284378e-06, + "loss": 0.7539, + "step": 65130 + }, + { + "epoch": 0.8141703542588564, + "grad_norm": 0.00034852372482419014, + "learning_rate": 2.0311178742172633e-06, + "loss": 1.1621, + "step": 65132 + }, + { + "epoch": 0.8141953548838721, + "grad_norm": 2.6081295013427734, + "learning_rate": 2.0305906898400243e-06, + "loss": 0.5086, + "step": 65134 + }, + { + "epoch": 0.8142203555088877, + "grad_norm": 3.7066235542297363, + "learning_rate": 2.0300635661566838e-06, + "loss": 0.6824, + "step": 65136 + }, + { + "epoch": 0.8142453561339034, + "grad_norm": 0.595924973487854, + "learning_rate": 2.0295365031712486e-06, + "loss": 0.3419, + "step": 65138 + }, + { + "epoch": 0.814270356758919, + "grad_norm": 3.4581985473632812, + "learning_rate": 2.0290095008877364e-06, + "loss": 1.3998, + "step": 65140 + }, + { + "epoch": 0.8142953573839345, + "grad_norm": 3.3116631507873535, + "learning_rate": 2.0284825593101653e-06, + "loss": 0.677, + "step": 65142 + }, + { + "epoch": 0.8143203580089502, + "grad_norm": 3.152639150619507, + "learning_rate": 2.027955678442539e-06, + "loss": 1.447, + "step": 65144 + }, + { + "epoch": 0.8143453586339658, + "grad_norm": 5.121102809906006, + "learning_rate": 2.0274288582888778e-06, + "loss": 1.4452, + "step": 65146 + }, + { + "epoch": 0.8143703592589815, + "grad_norm": 2.522772789001465, + "learning_rate": 2.0269020988531886e-06, + "loss": 0.9497, + "step": 65148 + }, + { + "epoch": 0.8143953598839971, + "grad_norm": 0.5530203580856323, + "learning_rate": 2.0263754001394855e-06, + "loss": 0.6859, + "step": 65150 + }, + { + "epoch": 0.8144203605090127, + "grad_norm": 0.0027807201258838177, + "learning_rate": 2.0258487621517817e-06, + "loss": 0.7317, + "step": 65152 + }, + { + "epoch": 0.8144453611340283, + "grad_norm": 1.7281780242919922, + "learning_rate": 2.0253221848940864e-06, + "loss": 0.6804, + "step": 65154 + }, + { + "epoch": 0.814470361759044, + "grad_norm": 2.4571568965911865, + "learning_rate": 2.0247956683704097e-06, + "loss": 1.0679, + "step": 65156 + }, + { + "epoch": 0.8144953623840596, + "grad_norm": 5.520649433135986, + "learning_rate": 2.0242692125847585e-06, + "loss": 0.9045, + "step": 65158 + }, + { + "epoch": 0.8145203630090753, + "grad_norm": 6.767752647399902, + "learning_rate": 2.023742817541149e-06, + "loss": 1.4095, + "step": 65160 + }, + { + "epoch": 0.8145453636340908, + "grad_norm": 4.567732334136963, + "learning_rate": 2.023216483243582e-06, + "loss": 1.5222, + "step": 65162 + }, + { + "epoch": 0.8145703642591064, + "grad_norm": 4.936336517333984, + "learning_rate": 2.022690209696071e-06, + "loss": 1.2835, + "step": 65164 + }, + { + "epoch": 0.8145953648841221, + "grad_norm": 5.747333526611328, + "learning_rate": 2.022163996902625e-06, + "loss": 0.6238, + "step": 65166 + }, + { + "epoch": 0.8146203655091377, + "grad_norm": 2.475749969482422, + "learning_rate": 2.0216378448672515e-06, + "loss": 0.556, + "step": 65168 + }, + { + "epoch": 0.8146453661341534, + "grad_norm": 4.341180801391602, + "learning_rate": 2.021111753593955e-06, + "loss": 1.7089, + "step": 65170 + }, + { + "epoch": 0.8146703667591689, + "grad_norm": 2.471313953399658, + "learning_rate": 2.0205857230867397e-06, + "loss": 0.3868, + "step": 65172 + }, + { + "epoch": 0.8146953673841846, + "grad_norm": 0.00024096257402561605, + "learning_rate": 2.020059753349619e-06, + "loss": 0.0, + "step": 65174 + }, + { + "epoch": 0.8147203680092002, + "grad_norm": 3.428864002227783, + "learning_rate": 2.019533844386592e-06, + "loss": 0.7313, + "step": 65176 + }, + { + "epoch": 0.8147453686342159, + "grad_norm": 4.385712623596191, + "learning_rate": 2.019007996201666e-06, + "loss": 1.5345, + "step": 65178 + }, + { + "epoch": 0.8147703692592315, + "grad_norm": 5.463025093078613, + "learning_rate": 2.018482208798852e-06, + "loss": 1.5758, + "step": 65180 + }, + { + "epoch": 0.8147953698842471, + "grad_norm": 2.310756206512451, + "learning_rate": 2.017956482182144e-06, + "loss": 1.0835, + "step": 65182 + }, + { + "epoch": 0.8148203705092627, + "grad_norm": 5.184326171875, + "learning_rate": 2.017430816355552e-06, + "loss": 0.8067, + "step": 65184 + }, + { + "epoch": 0.8148453711342784, + "grad_norm": 3.0280330181121826, + "learning_rate": 2.016905211323077e-06, + "loss": 0.708, + "step": 65186 + }, + { + "epoch": 0.814870371759294, + "grad_norm": 0.000493421743158251, + "learning_rate": 2.0163796670887224e-06, + "loss": 0.0192, + "step": 65188 + }, + { + "epoch": 0.8148953723843096, + "grad_norm": 2.470219373703003, + "learning_rate": 2.0158541836564937e-06, + "loss": 0.479, + "step": 65190 + }, + { + "epoch": 0.8149203730093252, + "grad_norm": 2.459475040435791, + "learning_rate": 2.0153287610303886e-06, + "loss": 1.1667, + "step": 65192 + }, + { + "epoch": 0.8149453736343408, + "grad_norm": 3.163419008255005, + "learning_rate": 2.0148033992144144e-06, + "loss": 1.6599, + "step": 65194 + }, + { + "epoch": 0.8149703742593565, + "grad_norm": 6.6779046058654785, + "learning_rate": 2.014278098212564e-06, + "loss": 1.3288, + "step": 65196 + }, + { + "epoch": 0.8149953748843721, + "grad_norm": 2.672177314758301, + "learning_rate": 2.0137528580288457e-06, + "loss": 0.4542, + "step": 65198 + }, + { + "epoch": 0.8150203755093878, + "grad_norm": 4.475246906280518, + "learning_rate": 2.013227678667252e-06, + "loss": 0.8528, + "step": 65200 + }, + { + "epoch": 0.8150453761344033, + "grad_norm": 2.023885726928711, + "learning_rate": 2.0127025601317875e-06, + "loss": 1.3912, + "step": 65202 + }, + { + "epoch": 0.815070376759419, + "grad_norm": 2.0638961791992188, + "learning_rate": 2.0121775024264535e-06, + "loss": 1.3005, + "step": 65204 + }, + { + "epoch": 0.8150953773844346, + "grad_norm": 1.8989341259002686, + "learning_rate": 2.011652505555246e-06, + "loss": 0.3585, + "step": 65206 + }, + { + "epoch": 0.8151203780094503, + "grad_norm": 7.470399856567383, + "learning_rate": 2.0111275695221645e-06, + "loss": 2.6623, + "step": 65208 + }, + { + "epoch": 0.8151453786344659, + "grad_norm": 2.7772035598754883, + "learning_rate": 2.010602694331202e-06, + "loss": 1.1404, + "step": 65210 + }, + { + "epoch": 0.8151703792594814, + "grad_norm": 0.24974437057971954, + "learning_rate": 2.0100778799863584e-06, + "loss": 0.0969, + "step": 65212 + }, + { + "epoch": 0.8151953798844971, + "grad_norm": 2.5843052864074707, + "learning_rate": 2.0095531264916355e-06, + "loss": 0.6554, + "step": 65214 + }, + { + "epoch": 0.8152203805095127, + "grad_norm": 0.00020898046204820275, + "learning_rate": 2.0090284338510244e-06, + "loss": 0.2258, + "step": 65216 + }, + { + "epoch": 0.8152453811345284, + "grad_norm": 5.040201663970947, + "learning_rate": 2.0085038020685245e-06, + "loss": 1.5262, + "step": 65218 + }, + { + "epoch": 0.815270381759544, + "grad_norm": 3.9913859367370605, + "learning_rate": 2.0079792311481294e-06, + "loss": 0.5686, + "step": 65220 + }, + { + "epoch": 0.8152953823845596, + "grad_norm": 3.473327875137329, + "learning_rate": 2.0074547210938345e-06, + "loss": 0.7725, + "step": 65222 + }, + { + "epoch": 0.8153203830095752, + "grad_norm": 4.237627983093262, + "learning_rate": 2.0069302719096317e-06, + "loss": 0.6636, + "step": 65224 + }, + { + "epoch": 0.8153453836345909, + "grad_norm": 2.4672906398773193, + "learning_rate": 2.006405883599517e-06, + "loss": 0.5824, + "step": 65226 + }, + { + "epoch": 0.8153703842596065, + "grad_norm": 5.842265605926514, + "learning_rate": 2.005881556167488e-06, + "loss": 1.1037, + "step": 65228 + }, + { + "epoch": 0.8153953848846222, + "grad_norm": 3.9034230709075928, + "learning_rate": 2.005357289617531e-06, + "loss": 1.3429, + "step": 65230 + }, + { + "epoch": 0.8154203855096377, + "grad_norm": 2.6521265506744385, + "learning_rate": 2.004833083953649e-06, + "loss": 1.2554, + "step": 65232 + }, + { + "epoch": 0.8154453861346533, + "grad_norm": 2.2808470726013184, + "learning_rate": 2.004308939179822e-06, + "loss": 0.6109, + "step": 65234 + }, + { + "epoch": 0.815470386759669, + "grad_norm": 3.1353368759155273, + "learning_rate": 2.0037848553000496e-06, + "loss": 0.7839, + "step": 65236 + }, + { + "epoch": 0.8154953873846846, + "grad_norm": 1.5396760702133179, + "learning_rate": 2.0032608323183177e-06, + "loss": 0.5477, + "step": 65238 + }, + { + "epoch": 0.8155203880097003, + "grad_norm": 0.4519244134426117, + "learning_rate": 2.002736870238622e-06, + "loss": 0.0272, + "step": 65240 + }, + { + "epoch": 0.8155453886347158, + "grad_norm": 1.4150868654251099, + "learning_rate": 2.0022129690649526e-06, + "loss": 0.0543, + "step": 65242 + }, + { + "epoch": 0.8155703892597315, + "grad_norm": 0.8039318323135376, + "learning_rate": 2.0016891288012963e-06, + "loss": 0.7736, + "step": 65244 + }, + { + "epoch": 0.8155953898847471, + "grad_norm": 10.896690368652344, + "learning_rate": 2.0011653494516502e-06, + "loss": 1.3441, + "step": 65246 + }, + { + "epoch": 0.8156203905097628, + "grad_norm": 2.577281951904297, + "learning_rate": 2.0006416310199916e-06, + "loss": 0.9626, + "step": 65248 + }, + { + "epoch": 0.8156453911347784, + "grad_norm": 2.1545090675354004, + "learning_rate": 2.000117973510316e-06, + "loss": 0.889, + "step": 65250 + }, + { + "epoch": 0.815670391759794, + "grad_norm": 5.226194858551025, + "learning_rate": 1.9995943769266123e-06, + "loss": 1.1707, + "step": 65252 + }, + { + "epoch": 0.8156953923848096, + "grad_norm": 3.038818359375, + "learning_rate": 1.9990708412728645e-06, + "loss": 1.1432, + "step": 65254 + }, + { + "epoch": 0.8157203930098252, + "grad_norm": 3.854353427886963, + "learning_rate": 1.9985473665530643e-06, + "loss": 1.6021, + "step": 65256 + }, + { + "epoch": 0.8157453936348409, + "grad_norm": 4.033385753631592, + "learning_rate": 1.9980239527711954e-06, + "loss": 1.6226, + "step": 65258 + }, + { + "epoch": 0.8157703942598565, + "grad_norm": 2.160903215408325, + "learning_rate": 1.9975005999312446e-06, + "loss": 0.5078, + "step": 65260 + }, + { + "epoch": 0.8157953948848721, + "grad_norm": 2.9817821979522705, + "learning_rate": 1.9969773080371957e-06, + "loss": 0.5714, + "step": 65262 + }, + { + "epoch": 0.8158203955098877, + "grad_norm": 2.401440382003784, + "learning_rate": 1.996454077093035e-06, + "loss": 0.9984, + "step": 65264 + }, + { + "epoch": 0.8158453961349034, + "grad_norm": 3.5450291633605957, + "learning_rate": 1.9959309071027513e-06, + "loss": 0.7378, + "step": 65266 + }, + { + "epoch": 0.815870396759919, + "grad_norm": 2.1047959327697754, + "learning_rate": 1.9954077980703225e-06, + "loss": 0.4149, + "step": 65268 + }, + { + "epoch": 0.8158953973849347, + "grad_norm": 4.042044162750244, + "learning_rate": 1.9948847499997392e-06, + "loss": 1.6441, + "step": 65270 + }, + { + "epoch": 0.8159203980099502, + "grad_norm": 2.1817548274993896, + "learning_rate": 1.9943617628949817e-06, + "loss": 0.7989, + "step": 65272 + }, + { + "epoch": 0.8159453986349658, + "grad_norm": 2.836681604385376, + "learning_rate": 1.9938388367600305e-06, + "loss": 1.0692, + "step": 65274 + }, + { + "epoch": 0.8159703992599815, + "grad_norm": 1.946323275566101, + "learning_rate": 1.9933159715988735e-06, + "loss": 0.2355, + "step": 65276 + }, + { + "epoch": 0.8159953998849971, + "grad_norm": 3.698861837387085, + "learning_rate": 1.9927931674154867e-06, + "loss": 0.4517, + "step": 65278 + }, + { + "epoch": 0.8160204005100128, + "grad_norm": 3.9472968578338623, + "learning_rate": 1.992270424213857e-06, + "loss": 1.9768, + "step": 65280 + }, + { + "epoch": 0.8160454011350283, + "grad_norm": 2.078040599822998, + "learning_rate": 1.9917477419979613e-06, + "loss": 0.8535, + "step": 65282 + }, + { + "epoch": 0.816070401760044, + "grad_norm": 0.00032703811302781105, + "learning_rate": 1.991225120771787e-06, + "loss": 0.0, + "step": 65284 + }, + { + "epoch": 0.8160954023850596, + "grad_norm": 1.6789979934692383, + "learning_rate": 1.990702560539305e-06, + "loss": 0.2044, + "step": 65286 + }, + { + "epoch": 0.8161204030100753, + "grad_norm": 3.382293462753296, + "learning_rate": 1.9901800613045e-06, + "loss": 1.4331, + "step": 65288 + }, + { + "epoch": 0.8161454036350909, + "grad_norm": 3.546665906906128, + "learning_rate": 1.9896576230713527e-06, + "loss": 1.4374, + "step": 65290 + }, + { + "epoch": 0.8161704042601065, + "grad_norm": 2.971907138824463, + "learning_rate": 1.9891352458438375e-06, + "loss": 0.9047, + "step": 65292 + }, + { + "epoch": 0.8161954048851221, + "grad_norm": 4.2803473472595215, + "learning_rate": 1.9886129296259384e-06, + "loss": 0.7517, + "step": 65294 + }, + { + "epoch": 0.8162204055101377, + "grad_norm": 2.8011364936828613, + "learning_rate": 1.988090674421628e-06, + "loss": 1.0423, + "step": 65296 + }, + { + "epoch": 0.8162454061351534, + "grad_norm": 7.161423206329346, + "learning_rate": 1.9875684802348884e-06, + "loss": 1.1245, + "step": 65298 + }, + { + "epoch": 0.816270406760169, + "grad_norm": 1.8789016008377075, + "learning_rate": 1.987046347069694e-06, + "loss": 0.715, + "step": 65300 + }, + { + "epoch": 0.8162954073851846, + "grad_norm": 3.9128220081329346, + "learning_rate": 1.9865242749300194e-06, + "loss": 0.8438, + "step": 65302 + }, + { + "epoch": 0.8163204080102002, + "grad_norm": 3.886172294616699, + "learning_rate": 1.986002263819845e-06, + "loss": 1.6572, + "step": 65304 + }, + { + "epoch": 0.8163454086352159, + "grad_norm": 18.467084884643555, + "learning_rate": 1.985480313743141e-06, + "loss": 1.2791, + "step": 65306 + }, + { + "epoch": 0.8163704092602315, + "grad_norm": 1.4526129961013794, + "learning_rate": 1.984958424703889e-06, + "loss": 0.7255, + "step": 65308 + }, + { + "epoch": 0.8163954098852472, + "grad_norm": 0.6939133405685425, + "learning_rate": 1.9844365967060598e-06, + "loss": 0.5587, + "step": 65310 + }, + { + "epoch": 0.8164204105102627, + "grad_norm": 1.910637378692627, + "learning_rate": 1.983914829753626e-06, + "loss": 0.6407, + "step": 65312 + }, + { + "epoch": 0.8164454111352784, + "grad_norm": 4.436357021331787, + "learning_rate": 1.9833931238505643e-06, + "loss": 1.8989, + "step": 65314 + }, + { + "epoch": 0.816470411760294, + "grad_norm": 2.9008281230926514, + "learning_rate": 1.982871479000845e-06, + "loss": 2.2289, + "step": 65316 + }, + { + "epoch": 0.8164954123853096, + "grad_norm": 0.8338413238525391, + "learning_rate": 1.9823498952084462e-06, + "loss": 0.4763, + "step": 65318 + }, + { + "epoch": 0.8165204130103253, + "grad_norm": 2.2936768531799316, + "learning_rate": 1.9818283724773334e-06, + "loss": 0.8047, + "step": 65320 + }, + { + "epoch": 0.8165454136353408, + "grad_norm": 0.0002938241232186556, + "learning_rate": 1.981306910811486e-06, + "loss": 0.0, + "step": 65322 + }, + { + "epoch": 0.8165704142603565, + "grad_norm": 0.00038657820550724864, + "learning_rate": 1.980785510214869e-06, + "loss": 0.717, + "step": 65324 + }, + { + "epoch": 0.8165954148853721, + "grad_norm": 2.181331157684326, + "learning_rate": 1.980264170691454e-06, + "loss": 0.4847, + "step": 65326 + }, + { + "epoch": 0.8166204155103878, + "grad_norm": 2.038832902908325, + "learning_rate": 1.9797428922452156e-06, + "loss": 0.0379, + "step": 65328 + }, + { + "epoch": 0.8166454161354034, + "grad_norm": 0.002313686767593026, + "learning_rate": 1.9792216748801186e-06, + "loss": 0.5328, + "step": 65330 + }, + { + "epoch": 0.816670416760419, + "grad_norm": 2.55918288230896, + "learning_rate": 1.9787005186001374e-06, + "loss": 0.4773, + "step": 65332 + }, + { + "epoch": 0.8166954173854346, + "grad_norm": 0.03718779608607292, + "learning_rate": 1.9781794234092367e-06, + "loss": 1.0063, + "step": 65334 + }, + { + "epoch": 0.8167204180104503, + "grad_norm": 3.067884683609009, + "learning_rate": 1.9776583893113887e-06, + "loss": 0.3897, + "step": 65336 + }, + { + "epoch": 0.8167454186354659, + "grad_norm": 1.2846111059188843, + "learning_rate": 1.9771374163105607e-06, + "loss": 0.122, + "step": 65338 + }, + { + "epoch": 0.8167704192604816, + "grad_norm": 1.5767128467559814, + "learning_rate": 1.9766165044107165e-06, + "loss": 0.6482, + "step": 65340 + }, + { + "epoch": 0.8167954198854971, + "grad_norm": 2.892617702484131, + "learning_rate": 1.976095653615829e-06, + "loss": 1.1987, + "step": 65342 + }, + { + "epoch": 0.8168204205105127, + "grad_norm": 0.7363359928131104, + "learning_rate": 1.975574863929861e-06, + "loss": 0.1007, + "step": 65344 + }, + { + "epoch": 0.8168454211355284, + "grad_norm": 3.760528564453125, + "learning_rate": 1.975054135356782e-06, + "loss": 1.0499, + "step": 65346 + }, + { + "epoch": 0.816870421760544, + "grad_norm": 3.010089874267578, + "learning_rate": 1.974533467900552e-06, + "loss": 0.6055, + "step": 65348 + }, + { + "epoch": 0.8168954223855597, + "grad_norm": 3.1938116550445557, + "learning_rate": 1.9740128615651442e-06, + "loss": 1.6344, + "step": 65350 + }, + { + "epoch": 0.8169204230105752, + "grad_norm": 0.9487180709838867, + "learning_rate": 1.9734923163545196e-06, + "loss": 0.2152, + "step": 65352 + }, + { + "epoch": 0.8169454236355909, + "grad_norm": 0.0009161620400846004, + "learning_rate": 1.9729718322726386e-06, + "loss": 0.5545, + "step": 65354 + }, + { + "epoch": 0.8169704242606065, + "grad_norm": 3.4953255653381348, + "learning_rate": 1.9724514093234726e-06, + "loss": 0.9853, + "step": 65356 + }, + { + "epoch": 0.8169954248856222, + "grad_norm": 4.807554244995117, + "learning_rate": 1.9719310475109788e-06, + "loss": 1.2886, + "step": 65358 + }, + { + "epoch": 0.8170204255106378, + "grad_norm": 6.152560710906982, + "learning_rate": 1.971410746839125e-06, + "loss": 1.8003, + "step": 65360 + }, + { + "epoch": 0.8170454261356533, + "grad_norm": 6.899741172790527, + "learning_rate": 1.97089050731187e-06, + "loss": 0.9442, + "step": 65362 + }, + { + "epoch": 0.817070426760669, + "grad_norm": 5.228041648864746, + "learning_rate": 1.970370328933179e-06, + "loss": 1.8635, + "step": 65364 + }, + { + "epoch": 0.8170954273856846, + "grad_norm": 1.9056804180145264, + "learning_rate": 1.9698502117070128e-06, + "loss": 0.5116, + "step": 65366 + }, + { + "epoch": 0.8171204280107003, + "grad_norm": 5.703260898590088, + "learning_rate": 1.969330155637329e-06, + "loss": 0.8282, + "step": 65368 + }, + { + "epoch": 0.8171454286357159, + "grad_norm": 0.0034692855551838875, + "learning_rate": 1.968810160728093e-06, + "loss": 1.5694, + "step": 65370 + }, + { + "epoch": 0.8171704292607315, + "grad_norm": 0.0016270700143650174, + "learning_rate": 1.9682902269832614e-06, + "loss": 0.0, + "step": 65372 + }, + { + "epoch": 0.8171954298857471, + "grad_norm": 0.0014133855002000928, + "learning_rate": 1.9677703544067984e-06, + "loss": 0.0002, + "step": 65374 + }, + { + "epoch": 0.8172204305107628, + "grad_norm": 6.658536434173584, + "learning_rate": 1.96725054300266e-06, + "loss": 2.3446, + "step": 65376 + }, + { + "epoch": 0.8172454311357784, + "grad_norm": 2.7226760387420654, + "learning_rate": 1.9667307927748027e-06, + "loss": 1.2003, + "step": 65378 + }, + { + "epoch": 0.8172704317607941, + "grad_norm": 2.433748722076416, + "learning_rate": 1.96621110372719e-06, + "loss": 0.5077, + "step": 65380 + }, + { + "epoch": 0.8172954323858096, + "grad_norm": 0.0002973630034830421, + "learning_rate": 1.9656914758637756e-06, + "loss": 0.5264, + "step": 65382 + }, + { + "epoch": 0.8173204330108252, + "grad_norm": 2.934840202331543, + "learning_rate": 1.9651719091885215e-06, + "loss": 1.2674, + "step": 65384 + }, + { + "epoch": 0.8173454336358409, + "grad_norm": 1.8531494140625, + "learning_rate": 1.9646524037053792e-06, + "loss": 0.7389, + "step": 65386 + }, + { + "epoch": 0.8173704342608565, + "grad_norm": 4.759679317474365, + "learning_rate": 1.96413295941831e-06, + "loss": 0.6945, + "step": 65388 + }, + { + "epoch": 0.8173954348858722, + "grad_norm": 2.6357719898223877, + "learning_rate": 1.963613576331268e-06, + "loss": 0.5259, + "step": 65390 + }, + { + "epoch": 0.8174204355108877, + "grad_norm": 3.40901780128479, + "learning_rate": 1.963094254448206e-06, + "loss": 0.7671, + "step": 65392 + }, + { + "epoch": 0.8174454361359034, + "grad_norm": 1.3920830488204956, + "learning_rate": 1.9625749937730843e-06, + "loss": 0.4396, + "step": 65394 + }, + { + "epoch": 0.817470436760919, + "grad_norm": 5.839798450469971, + "learning_rate": 1.962055794309853e-06, + "loss": 0.5866, + "step": 65396 + }, + { + "epoch": 0.8174954373859347, + "grad_norm": 0.0009295573690906167, + "learning_rate": 1.9615366560624693e-06, + "loss": 0.1404, + "step": 65398 + }, + { + "epoch": 0.8175204380109503, + "grad_norm": 2.734485626220703, + "learning_rate": 1.961017579034884e-06, + "loss": 0.617, + "step": 65400 + }, + { + "epoch": 0.8175454386359658, + "grad_norm": 3.0220320224761963, + "learning_rate": 1.9604985632310537e-06, + "loss": 0.5639, + "step": 65402 + }, + { + "epoch": 0.8175704392609815, + "grad_norm": 0.00035800138721242547, + "learning_rate": 1.9599796086549305e-06, + "loss": 0.0361, + "step": 65404 + }, + { + "epoch": 0.8175954398859971, + "grad_norm": 6.089951515197754, + "learning_rate": 1.959460715310463e-06, + "loss": 1.3069, + "step": 65406 + }, + { + "epoch": 0.8176204405110128, + "grad_norm": 5.611191749572754, + "learning_rate": 1.958941883201607e-06, + "loss": 1.3948, + "step": 65408 + }, + { + "epoch": 0.8176454411360284, + "grad_norm": 9.83467960357666, + "learning_rate": 1.9584231123323115e-06, + "loss": 0.3461, + "step": 65410 + }, + { + "epoch": 0.817670441761044, + "grad_norm": 0.00046161876525729895, + "learning_rate": 1.9579044027065297e-06, + "loss": 1.0423, + "step": 65412 + }, + { + "epoch": 0.8176954423860596, + "grad_norm": 0.007953534834086895, + "learning_rate": 1.9573857543282095e-06, + "loss": 0.1138, + "step": 65414 + }, + { + "epoch": 0.8177204430110753, + "grad_norm": 3.9606692790985107, + "learning_rate": 1.9568671672013027e-06, + "loss": 1.0028, + "step": 65416 + }, + { + "epoch": 0.8177454436360909, + "grad_norm": 1.0156290531158447, + "learning_rate": 1.9563486413297596e-06, + "loss": 0.5014, + "step": 65418 + }, + { + "epoch": 0.8177704442611066, + "grad_norm": 2.374537944793701, + "learning_rate": 1.9558301767175247e-06, + "loss": 0.1611, + "step": 65420 + }, + { + "epoch": 0.8177954448861221, + "grad_norm": 0.15662658214569092, + "learning_rate": 1.9553117733685533e-06, + "loss": 0.2634, + "step": 65422 + }, + { + "epoch": 0.8178204455111378, + "grad_norm": 2.4873733520507812, + "learning_rate": 1.9547934312867858e-06, + "loss": 0.3308, + "step": 65424 + }, + { + "epoch": 0.8178454461361534, + "grad_norm": 4.219841480255127, + "learning_rate": 1.954275150476177e-06, + "loss": 0.7759, + "step": 65426 + }, + { + "epoch": 0.817870446761169, + "grad_norm": 2.2328543663024902, + "learning_rate": 1.9537569309406712e-06, + "loss": 0.5626, + "step": 65428 + }, + { + "epoch": 0.8178954473861847, + "grad_norm": 3.534288167953491, + "learning_rate": 1.953238772684212e-06, + "loss": 1.0647, + "step": 65430 + }, + { + "epoch": 0.8179204480112002, + "grad_norm": 0.06375820189714432, + "learning_rate": 1.952720675710751e-06, + "loss": 0.2668, + "step": 65432 + }, + { + "epoch": 0.8179454486362159, + "grad_norm": 1.923012137413025, + "learning_rate": 1.9522026400242276e-06, + "loss": 0.4914, + "step": 65434 + }, + { + "epoch": 0.8179704492612315, + "grad_norm": 11.002310752868652, + "learning_rate": 1.9516846656285946e-06, + "loss": 0.7777, + "step": 65436 + }, + { + "epoch": 0.8179954498862472, + "grad_norm": 4.0849761962890625, + "learning_rate": 1.9511667525277898e-06, + "loss": 0.9421, + "step": 65438 + }, + { + "epoch": 0.8180204505112628, + "grad_norm": 3.609593391418457, + "learning_rate": 1.9506489007257645e-06, + "loss": 0.7613, + "step": 65440 + }, + { + "epoch": 0.8180454511362784, + "grad_norm": 4.1266770362854, + "learning_rate": 1.9501311102264587e-06, + "loss": 1.4682, + "step": 65442 + }, + { + "epoch": 0.818070451761294, + "grad_norm": 2.4373950958251953, + "learning_rate": 1.9496133810338124e-06, + "loss": 1.0444, + "step": 65444 + }, + { + "epoch": 0.8180954523863097, + "grad_norm": 6.604625701904297, + "learning_rate": 1.9490957131517753e-06, + "loss": 0.5859, + "step": 65446 + }, + { + "epoch": 0.8181204530113253, + "grad_norm": 1.8736441135406494, + "learning_rate": 1.9485781065842844e-06, + "loss": 0.122, + "step": 65448 + }, + { + "epoch": 0.818145453636341, + "grad_norm": 0.3280069828033447, + "learning_rate": 1.9480605613352866e-06, + "loss": 0.1684, + "step": 65450 + }, + { + "epoch": 0.8181704542613565, + "grad_norm": 2.001370668411255, + "learning_rate": 1.947543077408718e-06, + "loss": 0.1334, + "step": 65452 + }, + { + "epoch": 0.8181954548863721, + "grad_norm": 4.902711391448975, + "learning_rate": 1.9470256548085254e-06, + "loss": 1.4719, + "step": 65454 + }, + { + "epoch": 0.8182204555113878, + "grad_norm": 0.00022486249508801848, + "learning_rate": 1.946508293538646e-06, + "loss": 0.8337, + "step": 65456 + }, + { + "epoch": 0.8182454561364034, + "grad_norm": 1.5979197025299072, + "learning_rate": 1.9459909936030188e-06, + "loss": 0.5809, + "step": 65458 + }, + { + "epoch": 0.8182704567614191, + "grad_norm": 9.25420093536377, + "learning_rate": 1.945473755005588e-06, + "loss": 2.0466, + "step": 65460 + }, + { + "epoch": 0.8182954573864346, + "grad_norm": 0.6051129102706909, + "learning_rate": 1.9449565777502865e-06, + "loss": 0.0061, + "step": 65462 + }, + { + "epoch": 0.8183204580114503, + "grad_norm": 2.3174071311950684, + "learning_rate": 1.94443946184106e-06, + "loss": 0.8359, + "step": 65464 + }, + { + "epoch": 0.8183454586364659, + "grad_norm": 3.7101964950561523, + "learning_rate": 1.9439224072818407e-06, + "loss": 0.9803, + "step": 65466 + }, + { + "epoch": 0.8183704592614816, + "grad_norm": 2.7601723670959473, + "learning_rate": 1.943405414076571e-06, + "loss": 1.2412, + "step": 65468 + }, + { + "epoch": 0.8183954598864972, + "grad_norm": 5.332818508148193, + "learning_rate": 1.942888482229187e-06, + "loss": 1.0057, + "step": 65470 + }, + { + "epoch": 0.8184204605115127, + "grad_norm": 4.416900634765625, + "learning_rate": 1.942371611743622e-06, + "loss": 2.3572, + "step": 65472 + }, + { + "epoch": 0.8184454611365284, + "grad_norm": 4.97290563583374, + "learning_rate": 1.9418548026238183e-06, + "loss": 1.5853, + "step": 65474 + }, + { + "epoch": 0.818470461761544, + "grad_norm": 2.749825954437256, + "learning_rate": 1.941338054873707e-06, + "loss": 0.9901, + "step": 65476 + }, + { + "epoch": 0.8184954623865597, + "grad_norm": 2.7764525413513184, + "learning_rate": 1.9408213684972267e-06, + "loss": 0.9023, + "step": 65478 + }, + { + "epoch": 0.8185204630115753, + "grad_norm": 2.7819483280181885, + "learning_rate": 1.9403047434983123e-06, + "loss": 0.1598, + "step": 65480 + }, + { + "epoch": 0.8185454636365909, + "grad_norm": 4.572462558746338, + "learning_rate": 1.9397881798808936e-06, + "loss": 1.6556, + "step": 65482 + }, + { + "epoch": 0.8185704642616065, + "grad_norm": 4.569291591644287, + "learning_rate": 1.939271677648912e-06, + "loss": 1.4245, + "step": 65484 + }, + { + "epoch": 0.8185954648866222, + "grad_norm": 9.176033973693848, + "learning_rate": 1.9387552368062934e-06, + "loss": 1.4262, + "step": 65486 + }, + { + "epoch": 0.8186204655116378, + "grad_norm": 4.382154941558838, + "learning_rate": 1.938238857356979e-06, + "loss": 0.3133, + "step": 65488 + }, + { + "epoch": 0.8186454661366535, + "grad_norm": 7.833604335784912, + "learning_rate": 1.9377225393048935e-06, + "loss": 1.0868, + "step": 65490 + }, + { + "epoch": 0.818670466761669, + "grad_norm": 2.4528887271881104, + "learning_rate": 1.937206282653977e-06, + "loss": 0.8288, + "step": 65492 + }, + { + "epoch": 0.8186954673866846, + "grad_norm": 0.00038507895078510046, + "learning_rate": 1.936690087408156e-06, + "loss": 0.9504, + "step": 65494 + }, + { + "epoch": 0.8187204680117003, + "grad_norm": 0.2907424569129944, + "learning_rate": 1.936173953571362e-06, + "loss": 0.7226, + "step": 65496 + }, + { + "epoch": 0.8187454686367159, + "grad_norm": 5.965343952178955, + "learning_rate": 1.935657881147528e-06, + "loss": 0.3715, + "step": 65498 + }, + { + "epoch": 0.8187704692617316, + "grad_norm": 8.942808151245117, + "learning_rate": 1.935141870140581e-06, + "loss": 1.5054, + "step": 65500 + }, + { + "epoch": 0.8187954698867471, + "grad_norm": 4.425244331359863, + "learning_rate": 1.934625920554456e-06, + "loss": 0.7681, + "step": 65502 + }, + { + "epoch": 0.8188204705117628, + "grad_norm": 0.32100915908813477, + "learning_rate": 1.934110032393076e-06, + "loss": 0.0212, + "step": 65504 + }, + { + "epoch": 0.8188454711367784, + "grad_norm": 3.129687547683716, + "learning_rate": 1.9335942056603774e-06, + "loss": 0.6402, + "step": 65506 + }, + { + "epoch": 0.8188704717617941, + "grad_norm": 2.629634141921997, + "learning_rate": 1.9330784403602832e-06, + "loss": 1.1126, + "step": 65508 + }, + { + "epoch": 0.8188954723868097, + "grad_norm": 4.196718692779541, + "learning_rate": 1.9325627364967216e-06, + "loss": 1.1034, + "step": 65510 + }, + { + "epoch": 0.8189204730118252, + "grad_norm": 2.8720781803131104, + "learning_rate": 1.9320470940736227e-06, + "loss": 0.5023, + "step": 65512 + }, + { + "epoch": 0.8189454736368409, + "grad_norm": 2.6734158992767334, + "learning_rate": 1.9315315130949095e-06, + "loss": 0.8062, + "step": 65514 + }, + { + "epoch": 0.8189704742618565, + "grad_norm": 5.45378303527832, + "learning_rate": 1.9310159935645145e-06, + "loss": 1.2642, + "step": 65516 + }, + { + "epoch": 0.8189954748868722, + "grad_norm": 6.87241792678833, + "learning_rate": 1.930500535486358e-06, + "loss": 2.1544, + "step": 65518 + }, + { + "epoch": 0.8190204755118878, + "grad_norm": 2.3438353538513184, + "learning_rate": 1.929985138864372e-06, + "loss": 0.873, + "step": 65520 + }, + { + "epoch": 0.8190454761369034, + "grad_norm": 2.3392434120178223, + "learning_rate": 1.9294698037024764e-06, + "loss": 0.6683, + "step": 65522 + }, + { + "epoch": 0.819070476761919, + "grad_norm": 4.855995178222656, + "learning_rate": 1.9289545300045953e-06, + "loss": 1.1768, + "step": 65524 + }, + { + "epoch": 0.8190954773869347, + "grad_norm": 0.0005578835844062269, + "learning_rate": 1.9284393177746576e-06, + "loss": 1.0724, + "step": 65526 + }, + { + "epoch": 0.8191204780119503, + "grad_norm": 2.99507474899292, + "learning_rate": 1.9279241670165827e-06, + "loss": 1.0178, + "step": 65528 + }, + { + "epoch": 0.819145478636966, + "grad_norm": 4.991928577423096, + "learning_rate": 1.9274090777342956e-06, + "loss": 1.498, + "step": 65530 + }, + { + "epoch": 0.8191704792619815, + "grad_norm": 3.629270076751709, + "learning_rate": 1.9268940499317244e-06, + "loss": 1.0232, + "step": 65532 + }, + { + "epoch": 0.8191954798869971, + "grad_norm": 2.1981587409973145, + "learning_rate": 1.9263790836127817e-06, + "loss": 1.286, + "step": 65534 + }, + { + "epoch": 0.8192204805120128, + "grad_norm": 4.223278999328613, + "learning_rate": 1.925864178781396e-06, + "loss": 0.4926, + "step": 65536 + }, + { + "epoch": 0.8192454811370284, + "grad_norm": 2.7031819820404053, + "learning_rate": 1.925349335441484e-06, + "loss": 0.2158, + "step": 65538 + }, + { + "epoch": 0.8192704817620441, + "grad_norm": 6.11545467376709, + "learning_rate": 1.9248345535969736e-06, + "loss": 0.3414, + "step": 65540 + }, + { + "epoch": 0.8192954823870596, + "grad_norm": 0.003172502852976322, + "learning_rate": 1.9243198332517778e-06, + "loss": 0.0198, + "step": 65542 + }, + { + "epoch": 0.8193204830120753, + "grad_norm": 5.436120986938477, + "learning_rate": 1.923805174409821e-06, + "loss": 0.2601, + "step": 65544 + }, + { + "epoch": 0.8193454836370909, + "grad_norm": 1.9870920181274414, + "learning_rate": 1.9232905770750267e-06, + "loss": 1.2583, + "step": 65546 + }, + { + "epoch": 0.8193704842621066, + "grad_norm": 0.003228376619517803, + "learning_rate": 1.922776041251303e-06, + "loss": 0.0001, + "step": 65548 + }, + { + "epoch": 0.8193954848871222, + "grad_norm": 0.04837556555867195, + "learning_rate": 1.922261566942578e-06, + "loss": 0.6955, + "step": 65550 + }, + { + "epoch": 0.8194204855121378, + "grad_norm": 1.8742733001708984, + "learning_rate": 1.9217471541527644e-06, + "loss": 0.4708, + "step": 65552 + }, + { + "epoch": 0.8194454861371534, + "grad_norm": 0.00037314844666980207, + "learning_rate": 1.921232802885782e-06, + "loss": 0.539, + "step": 65554 + }, + { + "epoch": 0.819470486762169, + "grad_norm": 0.002403656952083111, + "learning_rate": 1.920718513145551e-06, + "loss": 0.0001, + "step": 65556 + }, + { + "epoch": 0.8194954873871847, + "grad_norm": 5.985976219177246, + "learning_rate": 1.9202042849359847e-06, + "loss": 1.1345, + "step": 65558 + }, + { + "epoch": 0.8195204880122003, + "grad_norm": 3.8901309967041016, + "learning_rate": 1.9196901182609996e-06, + "loss": 0.7723, + "step": 65560 + }, + { + "epoch": 0.8195454886372159, + "grad_norm": 1.265270471572876, + "learning_rate": 1.9191760131245095e-06, + "loss": 1.2913, + "step": 65562 + }, + { + "epoch": 0.8195704892622315, + "grad_norm": 2.606484889984131, + "learning_rate": 1.9186619695304353e-06, + "loss": 0.5214, + "step": 65564 + }, + { + "epoch": 0.8195954898872472, + "grad_norm": 3.6008410453796387, + "learning_rate": 1.918147987482687e-06, + "loss": 0.5042, + "step": 65566 + }, + { + "epoch": 0.8196204905122628, + "grad_norm": 2.1295008659362793, + "learning_rate": 1.917634066985179e-06, + "loss": 1.022, + "step": 65568 + }, + { + "epoch": 0.8196454911372785, + "grad_norm": 3.943955421447754, + "learning_rate": 1.9171202080418306e-06, + "loss": 0.7803, + "step": 65570 + }, + { + "epoch": 0.819670491762294, + "grad_norm": 9.141357421875, + "learning_rate": 1.9166064106565507e-06, + "loss": 1.5495, + "step": 65572 + }, + { + "epoch": 0.8196954923873097, + "grad_norm": 0.00038464547833427787, + "learning_rate": 1.9160926748332543e-06, + "loss": 0.1819, + "step": 65574 + }, + { + "epoch": 0.8197204930123253, + "grad_norm": 3.1556203365325928, + "learning_rate": 1.9155790005758502e-06, + "loss": 0.8736, + "step": 65576 + }, + { + "epoch": 0.819745493637341, + "grad_norm": 0.979841411113739, + "learning_rate": 1.915065387888255e-06, + "loss": 0.4922, + "step": 65578 + }, + { + "epoch": 0.8197704942623566, + "grad_norm": 7.178637504577637, + "learning_rate": 1.9145518367743766e-06, + "loss": 1.2323, + "step": 65580 + }, + { + "epoch": 0.8197954948873721, + "grad_norm": 3.6405999660491943, + "learning_rate": 1.9140383472381286e-06, + "loss": 1.7881, + "step": 65582 + }, + { + "epoch": 0.8198204955123878, + "grad_norm": 1.834892749786377, + "learning_rate": 1.9135249192834226e-06, + "loss": 0.5057, + "step": 65584 + }, + { + "epoch": 0.8198454961374034, + "grad_norm": 0.0006418965640477836, + "learning_rate": 1.913011552914168e-06, + "loss": 0.6053, + "step": 65586 + }, + { + "epoch": 0.8198704967624191, + "grad_norm": 4.2383880615234375, + "learning_rate": 1.9124982481342737e-06, + "loss": 1.8169, + "step": 65588 + }, + { + "epoch": 0.8198954973874347, + "grad_norm": 1.123551607131958, + "learning_rate": 1.9119850049476464e-06, + "loss": 0.0512, + "step": 65590 + }, + { + "epoch": 0.8199204980124503, + "grad_norm": 11.689988136291504, + "learning_rate": 1.9114718233581965e-06, + "loss": 2.2559, + "step": 65592 + }, + { + "epoch": 0.8199454986374659, + "grad_norm": 0.36261186003685, + "learning_rate": 1.9109587033698373e-06, + "loss": 0.1619, + "step": 65594 + }, + { + "epoch": 0.8199704992624816, + "grad_norm": 3.4852113723754883, + "learning_rate": 1.9104456449864693e-06, + "loss": 0.8179, + "step": 65596 + }, + { + "epoch": 0.8199954998874972, + "grad_norm": 4.388449192047119, + "learning_rate": 1.9099326482120094e-06, + "loss": 0.2147, + "step": 65598 + }, + { + "epoch": 0.8200205005125129, + "grad_norm": 0.00021185248624533415, + "learning_rate": 1.9094197130503524e-06, + "loss": 0.4627, + "step": 65600 + }, + { + "epoch": 0.8200455011375284, + "grad_norm": 4.347294330596924, + "learning_rate": 1.9089068395054135e-06, + "loss": 0.7434, + "step": 65602 + }, + { + "epoch": 0.820070501762544, + "grad_norm": 2.389897108078003, + "learning_rate": 1.908394027581094e-06, + "loss": 1.1486, + "step": 65604 + }, + { + "epoch": 0.8200955023875597, + "grad_norm": 4.757639408111572, + "learning_rate": 1.9078812772813006e-06, + "loss": 1.3836, + "step": 65606 + }, + { + "epoch": 0.8201205030125753, + "grad_norm": 2.5819454193115234, + "learning_rate": 1.9073685886099414e-06, + "loss": 0.6421, + "step": 65608 + }, + { + "epoch": 0.820145503637591, + "grad_norm": 3.6678295135498047, + "learning_rate": 1.906855961570917e-06, + "loss": 1.6532, + "step": 65610 + }, + { + "epoch": 0.8201705042626065, + "grad_norm": 1.7753722667694092, + "learning_rate": 1.9063433961681377e-06, + "loss": 0.0928, + "step": 65612 + }, + { + "epoch": 0.8201955048876222, + "grad_norm": 2.431809902191162, + "learning_rate": 1.905830892405497e-06, + "loss": 0.5017, + "step": 65614 + }, + { + "epoch": 0.8202205055126378, + "grad_norm": 1.8380918502807617, + "learning_rate": 1.9053184502869049e-06, + "loss": 0.7059, + "step": 65616 + }, + { + "epoch": 0.8202455061376535, + "grad_norm": 3.6074132919311523, + "learning_rate": 1.9048060698162641e-06, + "loss": 1.0655, + "step": 65618 + }, + { + "epoch": 0.8202705067626691, + "grad_norm": 3.103123188018799, + "learning_rate": 1.9042937509974735e-06, + "loss": 1.0929, + "step": 65620 + }, + { + "epoch": 0.8202955073876846, + "grad_norm": 4.0559797286987305, + "learning_rate": 1.903781493834439e-06, + "loss": 1.3352, + "step": 65622 + }, + { + "epoch": 0.8203205080127003, + "grad_norm": 4.813910007476807, + "learning_rate": 1.9032692983310608e-06, + "loss": 0.6409, + "step": 65624 + }, + { + "epoch": 0.8203455086377159, + "grad_norm": 1.7577625513076782, + "learning_rate": 1.9027571644912379e-06, + "loss": 0.0214, + "step": 65626 + }, + { + "epoch": 0.8203705092627316, + "grad_norm": 2.3229997158050537, + "learning_rate": 1.9022450923188684e-06, + "loss": 1.2187, + "step": 65628 + }, + { + "epoch": 0.8203955098877472, + "grad_norm": 2.709526300430298, + "learning_rate": 1.9017330818178558e-06, + "loss": 0.8875, + "step": 65630 + }, + { + "epoch": 0.8204205105127628, + "grad_norm": 0.0017849646974354982, + "learning_rate": 1.9012211329921004e-06, + "loss": 1.4864, + "step": 65632 + }, + { + "epoch": 0.8204455111377784, + "grad_norm": 4.6530890464782715, + "learning_rate": 1.900709245845498e-06, + "loss": 1.3986, + "step": 65634 + }, + { + "epoch": 0.8204705117627941, + "grad_norm": 8.552578926086426, + "learning_rate": 1.9001974203819517e-06, + "loss": 0.2792, + "step": 65636 + }, + { + "epoch": 0.8204955123878097, + "grad_norm": 10.158926963806152, + "learning_rate": 1.8996856566053557e-06, + "loss": 1.1499, + "step": 65638 + }, + { + "epoch": 0.8205205130128254, + "grad_norm": 4.269698143005371, + "learning_rate": 1.8991739545196074e-06, + "loss": 0.5437, + "step": 65640 + }, + { + "epoch": 0.8205455136378409, + "grad_norm": 3.379220485687256, + "learning_rate": 1.8986623141286031e-06, + "loss": 0.651, + "step": 65642 + }, + { + "epoch": 0.8205705142628565, + "grad_norm": 4.255279541015625, + "learning_rate": 1.8981507354362417e-06, + "loss": 1.0944, + "step": 65644 + }, + { + "epoch": 0.8205955148878722, + "grad_norm": 1.452486276626587, + "learning_rate": 1.8976392184464198e-06, + "loss": 0.9309, + "step": 65646 + }, + { + "epoch": 0.8206205155128878, + "grad_norm": 3.1437721252441406, + "learning_rate": 1.8971277631630304e-06, + "loss": 0.5034, + "step": 65648 + }, + { + "epoch": 0.8206455161379035, + "grad_norm": 2.919888973236084, + "learning_rate": 1.8966163695899743e-06, + "loss": 0.7737, + "step": 65650 + }, + { + "epoch": 0.820670516762919, + "grad_norm": 2.535008668899536, + "learning_rate": 1.8961050377311373e-06, + "loss": 0.6184, + "step": 65652 + }, + { + "epoch": 0.8206955173879347, + "grad_norm": 3.1604888439178467, + "learning_rate": 1.8955937675904178e-06, + "loss": 0.4039, + "step": 65654 + }, + { + "epoch": 0.8207205180129503, + "grad_norm": 0.01381381880491972, + "learning_rate": 1.8950825591717137e-06, + "loss": 0.4958, + "step": 65656 + }, + { + "epoch": 0.820745518637966, + "grad_norm": 4.471946716308594, + "learning_rate": 1.8945714124789117e-06, + "loss": 0.8429, + "step": 65658 + }, + { + "epoch": 0.8207705192629816, + "grad_norm": 2.4633710384368896, + "learning_rate": 1.8940603275159097e-06, + "loss": 0.7329, + "step": 65660 + }, + { + "epoch": 0.8207955198879971, + "grad_norm": 0.8324260711669922, + "learning_rate": 1.893549304286596e-06, + "loss": 0.7274, + "step": 65662 + }, + { + "epoch": 0.8208205205130128, + "grad_norm": 2.16623592376709, + "learning_rate": 1.8930383427948696e-06, + "loss": 0.3652, + "step": 65664 + }, + { + "epoch": 0.8208455211380284, + "grad_norm": 3.8972785472869873, + "learning_rate": 1.8925274430446105e-06, + "loss": 0.9295, + "step": 65666 + }, + { + "epoch": 0.8208705217630441, + "grad_norm": 3.507702350616455, + "learning_rate": 1.892016605039717e-06, + "loss": 0.4973, + "step": 65668 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 4.746088981628418, + "learning_rate": 1.8915058287840804e-06, + "loss": 0.2957, + "step": 65670 + }, + { + "epoch": 0.8209205230130753, + "grad_norm": 7.358096599578857, + "learning_rate": 1.8909951142815875e-06, + "loss": 0.1099, + "step": 65672 + }, + { + "epoch": 0.8209455236380909, + "grad_norm": 3.254019260406494, + "learning_rate": 1.8904844615361306e-06, + "loss": 1.9912, + "step": 65674 + }, + { + "epoch": 0.8209705242631066, + "grad_norm": 0.0004504315438680351, + "learning_rate": 1.889973870551597e-06, + "loss": 0.0063, + "step": 65676 + }, + { + "epoch": 0.8209955248881222, + "grad_norm": 3.433636426925659, + "learning_rate": 1.889463341331873e-06, + "loss": 1.743, + "step": 65678 + }, + { + "epoch": 0.8210205255131379, + "grad_norm": 1.9151334762573242, + "learning_rate": 1.8889528738808527e-06, + "loss": 0.0798, + "step": 65680 + }, + { + "epoch": 0.8210455261381534, + "grad_norm": 5.021240234375, + "learning_rate": 1.8884424682024173e-06, + "loss": 1.0552, + "step": 65682 + }, + { + "epoch": 0.821070526763169, + "grad_norm": 2.5848615169525146, + "learning_rate": 1.887932124300459e-06, + "loss": 1.8632, + "step": 65684 + }, + { + "epoch": 0.8210955273881847, + "grad_norm": 3.098125457763672, + "learning_rate": 1.8874218421788603e-06, + "loss": 1.2119, + "step": 65686 + }, + { + "epoch": 0.8211205280132003, + "grad_norm": 2.5988924503326416, + "learning_rate": 1.8869116218415128e-06, + "loss": 0.6109, + "step": 65688 + }, + { + "epoch": 0.821145528638216, + "grad_norm": 2.581444025039673, + "learning_rate": 1.8864014632922977e-06, + "loss": 0.2861, + "step": 65690 + }, + { + "epoch": 0.8211705292632315, + "grad_norm": 1.2362520694732666, + "learning_rate": 1.8858913665351008e-06, + "loss": 0.4105, + "step": 65692 + }, + { + "epoch": 0.8211955298882472, + "grad_norm": 3.621746301651001, + "learning_rate": 1.885381331573809e-06, + "loss": 0.2156, + "step": 65694 + }, + { + "epoch": 0.8212205305132628, + "grad_norm": 2.0397205352783203, + "learning_rate": 1.8848713584123035e-06, + "loss": 1.3883, + "step": 65696 + }, + { + "epoch": 0.8212455311382785, + "grad_norm": 0.8033571243286133, + "learning_rate": 1.8843614470544735e-06, + "loss": 0.4797, + "step": 65698 + }, + { + "epoch": 0.8212705317632941, + "grad_norm": 5.145036220550537, + "learning_rate": 1.8838515975041948e-06, + "loss": 0.8557, + "step": 65700 + }, + { + "epoch": 0.8212955323883097, + "grad_norm": 1.3047667741775513, + "learning_rate": 1.8833418097653612e-06, + "loss": 0.1199, + "step": 65702 + }, + { + "epoch": 0.8213205330133253, + "grad_norm": 4.813412189483643, + "learning_rate": 1.8828320838418423e-06, + "loss": 0.2683, + "step": 65704 + }, + { + "epoch": 0.821345533638341, + "grad_norm": 2.9510886669158936, + "learning_rate": 1.8823224197375278e-06, + "loss": 1.1881, + "step": 65706 + }, + { + "epoch": 0.8213705342633566, + "grad_norm": 3.0460081100463867, + "learning_rate": 1.881812817456299e-06, + "loss": 1.2751, + "step": 65708 + }, + { + "epoch": 0.8213955348883722, + "grad_norm": 3.2147815227508545, + "learning_rate": 1.8813032770020333e-06, + "loss": 0.7748, + "step": 65710 + }, + { + "epoch": 0.8214205355133878, + "grad_norm": 3.801377534866333, + "learning_rate": 1.8807937983786174e-06, + "loss": 0.6423, + "step": 65712 + }, + { + "epoch": 0.8214455361384034, + "grad_norm": 0.7054719924926758, + "learning_rate": 1.8802843815899242e-06, + "loss": 0.8782, + "step": 65714 + }, + { + "epoch": 0.8214705367634191, + "grad_norm": 0.0004221029521431774, + "learning_rate": 1.8797750266398396e-06, + "loss": 0.0, + "step": 65716 + }, + { + "epoch": 0.8214955373884347, + "grad_norm": 0.00046223701792769134, + "learning_rate": 1.8792657335322406e-06, + "loss": 0.1641, + "step": 65718 + }, + { + "epoch": 0.8215205380134504, + "grad_norm": 9.027448654174805, + "learning_rate": 1.878756502271003e-06, + "loss": 0.6134, + "step": 65720 + }, + { + "epoch": 0.8215455386384659, + "grad_norm": 6.630893230438232, + "learning_rate": 1.8782473328600093e-06, + "loss": 0.985, + "step": 65722 + }, + { + "epoch": 0.8215705392634816, + "grad_norm": 3.240553379058838, + "learning_rate": 1.8777382253031329e-06, + "loss": 0.6247, + "step": 65724 + }, + { + "epoch": 0.8215955398884972, + "grad_norm": 3.183783769607544, + "learning_rate": 1.8772291796042562e-06, + "loss": 1.8376, + "step": 65726 + }, + { + "epoch": 0.8216205405135129, + "grad_norm": 3.873162031173706, + "learning_rate": 1.8767201957672544e-06, + "loss": 1.3582, + "step": 65728 + }, + { + "epoch": 0.8216455411385285, + "grad_norm": 3.204094171524048, + "learning_rate": 1.8762112737959993e-06, + "loss": 1.4017, + "step": 65730 + }, + { + "epoch": 0.821670541763544, + "grad_norm": 4.5781941413879395, + "learning_rate": 1.8757024136943725e-06, + "loss": 1.8161, + "step": 65732 + }, + { + "epoch": 0.8216955423885597, + "grad_norm": 2.0549333095550537, + "learning_rate": 1.8751936154662454e-06, + "loss": 0.9664, + "step": 65734 + }, + { + "epoch": 0.8217205430135753, + "grad_norm": 1.0411521196365356, + "learning_rate": 1.874684879115497e-06, + "loss": 0.2326, + "step": 65736 + }, + { + "epoch": 0.821745543638591, + "grad_norm": 0.0002324545494047925, + "learning_rate": 1.8741762046459978e-06, + "loss": 1.009, + "step": 65738 + }, + { + "epoch": 0.8217705442636066, + "grad_norm": 3.7522048950195312, + "learning_rate": 1.8736675920616254e-06, + "loss": 0.6637, + "step": 65740 + }, + { + "epoch": 0.8217955448886222, + "grad_norm": 2.260826349258423, + "learning_rate": 1.8731590413662504e-06, + "loss": 1.4705, + "step": 65742 + }, + { + "epoch": 0.8218205455136378, + "grad_norm": 2.69130277633667, + "learning_rate": 1.8726505525637462e-06, + "loss": 1.4489, + "step": 65744 + }, + { + "epoch": 0.8218455461386535, + "grad_norm": 5.9105730056762695, + "learning_rate": 1.8721421256579875e-06, + "loss": 0.8166, + "step": 65746 + }, + { + "epoch": 0.8218705467636691, + "grad_norm": 5.158490180969238, + "learning_rate": 1.8716337606528422e-06, + "loss": 1.1784, + "step": 65748 + }, + { + "epoch": 0.8218955473886848, + "grad_norm": 2.601914644241333, + "learning_rate": 1.8711254575521887e-06, + "loss": 1.37, + "step": 65750 + }, + { + "epoch": 0.8219205480137003, + "grad_norm": 2.1643426418304443, + "learning_rate": 1.8706172163598902e-06, + "loss": 0.3079, + "step": 65752 + }, + { + "epoch": 0.8219455486387159, + "grad_norm": 3.4271113872528076, + "learning_rate": 1.8701090370798248e-06, + "loss": 0.8196, + "step": 65754 + }, + { + "epoch": 0.8219705492637316, + "grad_norm": 0.0004073076415807009, + "learning_rate": 1.8696009197158594e-06, + "loss": 0.388, + "step": 65756 + }, + { + "epoch": 0.8219955498887472, + "grad_norm": 5.192305564880371, + "learning_rate": 1.869092864271861e-06, + "loss": 1.8262, + "step": 65758 + }, + { + "epoch": 0.8220205505137629, + "grad_norm": 0.3632703721523285, + "learning_rate": 1.868584870751703e-06, + "loss": 0.3956, + "step": 65760 + }, + { + "epoch": 0.8220455511387784, + "grad_norm": 2.468050956726074, + "learning_rate": 1.8680769391592513e-06, + "loss": 0.1206, + "step": 65762 + }, + { + "epoch": 0.8220705517637941, + "grad_norm": 3.863475799560547, + "learning_rate": 1.8675690694983784e-06, + "loss": 0.8416, + "step": 65764 + }, + { + "epoch": 0.8220955523888097, + "grad_norm": 0.03241368383169174, + "learning_rate": 1.8670612617729456e-06, + "loss": 0.078, + "step": 65766 + }, + { + "epoch": 0.8221205530138254, + "grad_norm": 0.0005925054429098964, + "learning_rate": 1.8665535159868276e-06, + "loss": 0.6353, + "step": 65768 + }, + { + "epoch": 0.822145553638841, + "grad_norm": 4.256045818328857, + "learning_rate": 1.8660458321438878e-06, + "loss": 2.8403, + "step": 65770 + }, + { + "epoch": 0.8221705542638565, + "grad_norm": 2.207655668258667, + "learning_rate": 1.8655382102479902e-06, + "loss": 1.0431, + "step": 65772 + }, + { + "epoch": 0.8221955548888722, + "grad_norm": 0.808752179145813, + "learning_rate": 1.8650306503030059e-06, + "loss": 0.056, + "step": 65774 + }, + { + "epoch": 0.8222205555138878, + "grad_norm": 2.4420478343963623, + "learning_rate": 1.8645231523127949e-06, + "loss": 0.2876, + "step": 65776 + }, + { + "epoch": 0.8222455561389035, + "grad_norm": 2.9589293003082275, + "learning_rate": 1.8640157162812278e-06, + "loss": 0.7661, + "step": 65778 + }, + { + "epoch": 0.8222705567639191, + "grad_norm": 3.692575216293335, + "learning_rate": 1.8635083422121659e-06, + "loss": 1.316, + "step": 65780 + }, + { + "epoch": 0.8222955573889347, + "grad_norm": 3.3694043159484863, + "learning_rate": 1.8630010301094726e-06, + "loss": 1.1198, + "step": 65782 + }, + { + "epoch": 0.8223205580139503, + "grad_norm": 2.1305222511291504, + "learning_rate": 1.8624937799770137e-06, + "loss": 0.505, + "step": 65784 + }, + { + "epoch": 0.822345558638966, + "grad_norm": 1.8210468292236328, + "learning_rate": 1.8619865918186509e-06, + "loss": 1.4761, + "step": 65786 + }, + { + "epoch": 0.8223705592639816, + "grad_norm": 4.527100563049316, + "learning_rate": 1.8614794656382485e-06, + "loss": 1.239, + "step": 65788 + }, + { + "epoch": 0.8223955598889973, + "grad_norm": 8.797255516052246, + "learning_rate": 1.8609724014396645e-06, + "loss": 1.5644, + "step": 65790 + }, + { + "epoch": 0.8224205605140128, + "grad_norm": 3.5434913635253906, + "learning_rate": 1.8604653992267674e-06, + "loss": 1.414, + "step": 65792 + }, + { + "epoch": 0.8224455611390284, + "grad_norm": 3.49979305267334, + "learning_rate": 1.8599584590034148e-06, + "loss": 0.2797, + "step": 65794 + }, + { + "epoch": 0.8224705617640441, + "grad_norm": 4.744443893432617, + "learning_rate": 1.8594515807734648e-06, + "loss": 1.4834, + "step": 65796 + }, + { + "epoch": 0.8224955623890597, + "grad_norm": 4.116426944732666, + "learning_rate": 1.8589447645407832e-06, + "loss": 1.3799, + "step": 65798 + }, + { + "epoch": 0.8225205630140754, + "grad_norm": 3.267876625061035, + "learning_rate": 1.8584380103092237e-06, + "loss": 0.5242, + "step": 65800 + }, + { + "epoch": 0.8225455636390909, + "grad_norm": 0.34285226464271545, + "learning_rate": 1.857931318082652e-06, + "loss": 0.5329, + "step": 65802 + }, + { + "epoch": 0.8225705642641066, + "grad_norm": 3.773206949234009, + "learning_rate": 1.8574246878649215e-06, + "loss": 1.6328, + "step": 65804 + }, + { + "epoch": 0.8225955648891222, + "grad_norm": 3.9753572940826416, + "learning_rate": 1.8569181196598951e-06, + "loss": 0.615, + "step": 65806 + }, + { + "epoch": 0.8226205655141379, + "grad_norm": 0.0003221076913177967, + "learning_rate": 1.8564116134714293e-06, + "loss": 0.3604, + "step": 65808 + }, + { + "epoch": 0.8226455661391535, + "grad_norm": 0.5529852509498596, + "learning_rate": 1.855905169303378e-06, + "loss": 0.3615, + "step": 65810 + }, + { + "epoch": 0.822670566764169, + "grad_norm": 1.9565593004226685, + "learning_rate": 1.8553987871596047e-06, + "loss": 1.1166, + "step": 65812 + }, + { + "epoch": 0.8226955673891847, + "grad_norm": 0.004725360311567783, + "learning_rate": 1.8548924670439594e-06, + "loss": 0.5505, + "step": 65814 + }, + { + "epoch": 0.8227205680142003, + "grad_norm": 3.269890546798706, + "learning_rate": 1.8543862089603038e-06, + "loss": 1.313, + "step": 65816 + }, + { + "epoch": 0.822745568639216, + "grad_norm": 4.793460845947266, + "learning_rate": 1.853880012912488e-06, + "loss": 0.9165, + "step": 65818 + }, + { + "epoch": 0.8227705692642316, + "grad_norm": 7.194170951843262, + "learning_rate": 1.853373878904373e-06, + "loss": 1.0629, + "step": 65820 + }, + { + "epoch": 0.8227955698892472, + "grad_norm": 6.1305742263793945, + "learning_rate": 1.852867806939811e-06, + "loss": 2.8151, + "step": 65822 + }, + { + "epoch": 0.8228205705142628, + "grad_norm": 3.9096522331237793, + "learning_rate": 1.8523617970226527e-06, + "loss": 1.4205, + "step": 65824 + }, + { + "epoch": 0.8228455711392785, + "grad_norm": 6.654819488525391, + "learning_rate": 1.8518558491567573e-06, + "loss": 1.7447, + "step": 65826 + }, + { + "epoch": 0.8228705717642941, + "grad_norm": 2.5066065788269043, + "learning_rate": 1.851349963345973e-06, + "loss": 0.9624, + "step": 65828 + }, + { + "epoch": 0.8228955723893098, + "grad_norm": 0.0017837201012298465, + "learning_rate": 1.8508441395941568e-06, + "loss": 0.0, + "step": 65830 + }, + { + "epoch": 0.8229205730143253, + "grad_norm": 0.0003151183482259512, + "learning_rate": 1.8503383779051598e-06, + "loss": 0.4441, + "step": 65832 + }, + { + "epoch": 0.822945573639341, + "grad_norm": 2.195634603500366, + "learning_rate": 1.8498326782828313e-06, + "loss": 0.1405, + "step": 65834 + }, + { + "epoch": 0.8229705742643566, + "grad_norm": 3.1859652996063232, + "learning_rate": 1.8493270407310272e-06, + "loss": 1.6165, + "step": 65836 + }, + { + "epoch": 0.8229955748893723, + "grad_norm": 4.832687854766846, + "learning_rate": 1.848821465253592e-06, + "loss": 0.9938, + "step": 65838 + }, + { + "epoch": 0.8230205755143879, + "grad_norm": 0.6715834736824036, + "learning_rate": 1.8483159518543836e-06, + "loss": 0.3509, + "step": 65840 + }, + { + "epoch": 0.8230455761394034, + "grad_norm": 3.8963074684143066, + "learning_rate": 1.847810500537245e-06, + "loss": 1.2804, + "step": 65842 + }, + { + "epoch": 0.8230705767644191, + "grad_norm": 3.382272958755493, + "learning_rate": 1.8473051113060314e-06, + "loss": 1.9116, + "step": 65844 + }, + { + "epoch": 0.8230955773894347, + "grad_norm": 2.3074941635131836, + "learning_rate": 1.846799784164589e-06, + "loss": 1.1192, + "step": 65846 + }, + { + "epoch": 0.8231205780144504, + "grad_norm": 3.1426913738250732, + "learning_rate": 1.8462945191167636e-06, + "loss": 0.7588, + "step": 65848 + }, + { + "epoch": 0.823145578639466, + "grad_norm": 0.0003470324445515871, + "learning_rate": 1.8457893161664098e-06, + "loss": 0.3142, + "step": 65850 + }, + { + "epoch": 0.8231705792644816, + "grad_norm": 3.789212703704834, + "learning_rate": 1.8452841753173678e-06, + "loss": 0.5213, + "step": 65852 + }, + { + "epoch": 0.8231955798894972, + "grad_norm": 3.584336280822754, + "learning_rate": 1.8447790965734914e-06, + "loss": 1.4497, + "step": 65854 + }, + { + "epoch": 0.8232205805145129, + "grad_norm": 2.2633328437805176, + "learning_rate": 1.8442740799386215e-06, + "loss": 0.4061, + "step": 65856 + }, + { + "epoch": 0.8232455811395285, + "grad_norm": 10.243038177490234, + "learning_rate": 1.8437691254166101e-06, + "loss": 1.6429, + "step": 65858 + }, + { + "epoch": 0.8232705817645442, + "grad_norm": 4.1624860763549805, + "learning_rate": 1.8432642330112994e-06, + "loss": 2.0492, + "step": 65860 + }, + { + "epoch": 0.8232955823895597, + "grad_norm": 2.6601338386535645, + "learning_rate": 1.8427594027265316e-06, + "loss": 1.0471, + "step": 65862 + }, + { + "epoch": 0.8233205830145753, + "grad_norm": 2.3462512493133545, + "learning_rate": 1.8422546345661573e-06, + "loss": 0.1133, + "step": 65864 + }, + { + "epoch": 0.823345583639591, + "grad_norm": 0.0010754676768556237, + "learning_rate": 1.8417499285340157e-06, + "loss": 0.0, + "step": 65866 + }, + { + "epoch": 0.8233705842646066, + "grad_norm": 2.167980194091797, + "learning_rate": 1.8412452846339557e-06, + "loss": 0.9156, + "step": 65868 + }, + { + "epoch": 0.8233955848896223, + "grad_norm": 0.00031390207004733384, + "learning_rate": 1.8407407028698154e-06, + "loss": 1.2821, + "step": 65870 + }, + { + "epoch": 0.8234205855146378, + "grad_norm": 3.58789324760437, + "learning_rate": 1.8402361832454418e-06, + "loss": 0.9432, + "step": 65872 + }, + { + "epoch": 0.8234455861396535, + "grad_norm": 3.1141951084136963, + "learning_rate": 1.8397317257646763e-06, + "loss": 1.0729, + "step": 65874 + }, + { + "epoch": 0.8234705867646691, + "grad_norm": 3.910109043121338, + "learning_rate": 1.839227330431358e-06, + "loss": 1.1923, + "step": 65876 + }, + { + "epoch": 0.8234955873896848, + "grad_norm": 6.652489185333252, + "learning_rate": 1.8387229972493315e-06, + "loss": 1.5926, + "step": 65878 + }, + { + "epoch": 0.8235205880147004, + "grad_norm": 0.01776685006916523, + "learning_rate": 1.8382187262224349e-06, + "loss": 0.6683, + "step": 65880 + }, + { + "epoch": 0.8235455886397159, + "grad_norm": 0.9104242324829102, + "learning_rate": 1.837714517354513e-06, + "loss": 0.9079, + "step": 65882 + }, + { + "epoch": 0.8235705892647316, + "grad_norm": 4.5338521003723145, + "learning_rate": 1.8372103706494004e-06, + "loss": 1.8598, + "step": 65884 + }, + { + "epoch": 0.8235955898897472, + "grad_norm": 5.719921588897705, + "learning_rate": 1.8367062861109408e-06, + "loss": 1.6806, + "step": 65886 + }, + { + "epoch": 0.8236205905147629, + "grad_norm": 3.5765933990478516, + "learning_rate": 1.8362022637429732e-06, + "loss": 0.7464, + "step": 65888 + }, + { + "epoch": 0.8236455911397785, + "grad_norm": 0.003910595551133156, + "learning_rate": 1.8356983035493315e-06, + "loss": 0.6888, + "step": 65890 + }, + { + "epoch": 0.8236705917647941, + "grad_norm": 0.30776599049568176, + "learning_rate": 1.8351944055338588e-06, + "loss": 0.0076, + "step": 65892 + }, + { + "epoch": 0.8236955923898097, + "grad_norm": 1.702918291091919, + "learning_rate": 1.834690569700389e-06, + "loss": 1.2576, + "step": 65894 + }, + { + "epoch": 0.8237205930148254, + "grad_norm": 3.388979911804199, + "learning_rate": 1.8341867960527648e-06, + "loss": 1.0119, + "step": 65896 + }, + { + "epoch": 0.823745593639841, + "grad_norm": 4.91171932220459, + "learning_rate": 1.8336830845948172e-06, + "loss": 1.5038, + "step": 65898 + }, + { + "epoch": 0.8237705942648567, + "grad_norm": 3.8117666244506836, + "learning_rate": 1.8331794353303834e-06, + "loss": 0.5112, + "step": 65900 + }, + { + "epoch": 0.8237955948898722, + "grad_norm": 3.675961971282959, + "learning_rate": 1.8326758482633022e-06, + "loss": 0.199, + "step": 65902 + }, + { + "epoch": 0.8238205955148878, + "grad_norm": 2.6079254150390625, + "learning_rate": 1.8321723233974042e-06, + "loss": 0.7494, + "step": 65904 + }, + { + "epoch": 0.8238455961399035, + "grad_norm": 1.8061636686325073, + "learning_rate": 1.8316688607365285e-06, + "loss": 0.1139, + "step": 65906 + }, + { + "epoch": 0.8238705967649191, + "grad_norm": 3.894615888595581, + "learning_rate": 1.8311654602845063e-06, + "loss": 1.5341, + "step": 65908 + }, + { + "epoch": 0.8238955973899348, + "grad_norm": 5.064352512359619, + "learning_rate": 1.8306621220451726e-06, + "loss": 1.8906, + "step": 65910 + }, + { + "epoch": 0.8239205980149503, + "grad_norm": 4.313133239746094, + "learning_rate": 1.8301588460223651e-06, + "loss": 1.0394, + "step": 65912 + }, + { + "epoch": 0.823945598639966, + "grad_norm": 0.003995007835328579, + "learning_rate": 1.8296556322199088e-06, + "loss": 0.854, + "step": 65914 + }, + { + "epoch": 0.8239705992649816, + "grad_norm": 0.00034889779635705054, + "learning_rate": 1.8291524806416416e-06, + "loss": 0.9929, + "step": 65916 + }, + { + "epoch": 0.8239955998899973, + "grad_norm": 4.172585964202881, + "learning_rate": 1.8286493912913904e-06, + "loss": 1.1897, + "step": 65918 + }, + { + "epoch": 0.8240206005150129, + "grad_norm": 3.772298812866211, + "learning_rate": 1.8281463641729935e-06, + "loss": 1.7802, + "step": 65920 + }, + { + "epoch": 0.8240456011400284, + "grad_norm": 2.144869804382324, + "learning_rate": 1.8276433992902753e-06, + "loss": 1.1778, + "step": 65922 + }, + { + "epoch": 0.8240706017650441, + "grad_norm": 4.824394702911377, + "learning_rate": 1.8271404966470717e-06, + "loss": 0.7721, + "step": 65924 + }, + { + "epoch": 0.8240956023900597, + "grad_norm": 3.198967933654785, + "learning_rate": 1.826637656247211e-06, + "loss": 1.5989, + "step": 65926 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 5.052848815917969, + "learning_rate": 1.8261348780945188e-06, + "loss": 0.8968, + "step": 65928 + }, + { + "epoch": 0.824145603640091, + "grad_norm": 4.7031779289245605, + "learning_rate": 1.8256321621928308e-06, + "loss": 0.8459, + "step": 65930 + }, + { + "epoch": 0.8241706042651066, + "grad_norm": 7.216620445251465, + "learning_rate": 1.8251295085459697e-06, + "loss": 0.3522, + "step": 65932 + }, + { + "epoch": 0.8241956048901222, + "grad_norm": 4.685080528259277, + "learning_rate": 1.8246269171577658e-06, + "loss": 0.3274, + "step": 65934 + }, + { + "epoch": 0.8242206055151379, + "grad_norm": 1.3348214626312256, + "learning_rate": 1.8241243880320492e-06, + "loss": 0.6555, + "step": 65936 + }, + { + "epoch": 0.8242456061401535, + "grad_norm": 3.8980679512023926, + "learning_rate": 1.8236219211726459e-06, + "loss": 0.6441, + "step": 65938 + }, + { + "epoch": 0.8242706067651692, + "grad_norm": 2.8613150119781494, + "learning_rate": 1.8231195165833815e-06, + "loss": 0.5314, + "step": 65940 + }, + { + "epoch": 0.8242956073901847, + "grad_norm": 3.7239668369293213, + "learning_rate": 1.8226171742680798e-06, + "loss": 1.1533, + "step": 65942 + }, + { + "epoch": 0.8243206080152004, + "grad_norm": 3.6251306533813477, + "learning_rate": 1.8221148942305722e-06, + "loss": 1.9198, + "step": 65944 + }, + { + "epoch": 0.824345608640216, + "grad_norm": 4.044515609741211, + "learning_rate": 1.8216126764746788e-06, + "loss": 1.2586, + "step": 65946 + }, + { + "epoch": 0.8243706092652316, + "grad_norm": 2.3432700634002686, + "learning_rate": 1.8211105210042269e-06, + "loss": 0.2591, + "step": 65948 + }, + { + "epoch": 0.8243956098902473, + "grad_norm": 3.440000534057617, + "learning_rate": 1.8206084278230451e-06, + "loss": 0.8391, + "step": 65950 + }, + { + "epoch": 0.8244206105152628, + "grad_norm": 4.430228233337402, + "learning_rate": 1.8201063969349474e-06, + "loss": 1.2813, + "step": 65952 + }, + { + "epoch": 0.8244456111402785, + "grad_norm": 3.2257485389709473, + "learning_rate": 1.8196044283437663e-06, + "loss": 1.1169, + "step": 65954 + }, + { + "epoch": 0.8244706117652941, + "grad_norm": 0.00023674973635934293, + "learning_rate": 1.8191025220533177e-06, + "loss": 0.2007, + "step": 65956 + }, + { + "epoch": 0.8244956123903098, + "grad_norm": 3.756819486618042, + "learning_rate": 1.8186006780674304e-06, + "loss": 0.8816, + "step": 65958 + }, + { + "epoch": 0.8245206130153254, + "grad_norm": 2.0792675018310547, + "learning_rate": 1.8180988963899204e-06, + "loss": 0.6962, + "step": 65960 + }, + { + "epoch": 0.824545613640341, + "grad_norm": 5.1826605796813965, + "learning_rate": 1.8175971770246127e-06, + "loss": 0.9983, + "step": 65962 + }, + { + "epoch": 0.8245706142653566, + "grad_norm": 4.565158843994141, + "learning_rate": 1.8170955199753314e-06, + "loss": 2.3344, + "step": 65964 + }, + { + "epoch": 0.8245956148903723, + "grad_norm": 2.782770872116089, + "learning_rate": 1.8165939252458896e-06, + "loss": 0.3103, + "step": 65966 + }, + { + "epoch": 0.8246206155153879, + "grad_norm": 0.01835663989186287, + "learning_rate": 1.8160923928401142e-06, + "loss": 0.009, + "step": 65968 + }, + { + "epoch": 0.8246456161404035, + "grad_norm": 5.043245792388916, + "learning_rate": 1.8155909227618184e-06, + "loss": 1.1819, + "step": 65970 + }, + { + "epoch": 0.8246706167654191, + "grad_norm": 2.8278555870056152, + "learning_rate": 1.8150895150148262e-06, + "loss": 0.8848, + "step": 65972 + }, + { + "epoch": 0.8246956173904347, + "grad_norm": 3.785421133041382, + "learning_rate": 1.8145881696029555e-06, + "loss": 1.255, + "step": 65974 + }, + { + "epoch": 0.8247206180154504, + "grad_norm": 3.493131637573242, + "learning_rate": 1.8140868865300254e-06, + "loss": 0.1483, + "step": 65976 + }, + { + "epoch": 0.824745618640466, + "grad_norm": 5.771923065185547, + "learning_rate": 1.8135856657998508e-06, + "loss": 1.2474, + "step": 65978 + }, + { + "epoch": 0.8247706192654817, + "grad_norm": 5.712176322937012, + "learning_rate": 1.8130845074162473e-06, + "loss": 2.0642, + "step": 65980 + }, + { + "epoch": 0.8247956198904972, + "grad_norm": 0.000261286593740806, + "learning_rate": 1.8125834113830377e-06, + "loss": 1.311, + "step": 65982 + }, + { + "epoch": 0.8248206205155129, + "grad_norm": 3.9387264251708984, + "learning_rate": 1.8120823777040331e-06, + "loss": 1.8455, + "step": 65984 + }, + { + "epoch": 0.8248456211405285, + "grad_norm": 3.5347671508789062, + "learning_rate": 1.8115814063830506e-06, + "loss": 1.8179, + "step": 65986 + }, + { + "epoch": 0.8248706217655442, + "grad_norm": 3.045271635055542, + "learning_rate": 1.8110804974239082e-06, + "loss": 1.3497, + "step": 65988 + }, + { + "epoch": 0.8248956223905598, + "grad_norm": 5.992265701293945, + "learning_rate": 1.8105796508304196e-06, + "loss": 2.5072, + "step": 65990 + }, + { + "epoch": 0.8249206230155753, + "grad_norm": 3.1965975761413574, + "learning_rate": 1.8100788666063973e-06, + "loss": 1.4357, + "step": 65992 + }, + { + "epoch": 0.824945623640591, + "grad_norm": 0.0773906260728836, + "learning_rate": 1.809578144755655e-06, + "loss": 0.8623, + "step": 65994 + }, + { + "epoch": 0.8249706242656066, + "grad_norm": 2.8055622577667236, + "learning_rate": 1.8090774852820058e-06, + "loss": 0.7433, + "step": 65996 + }, + { + "epoch": 0.8249956248906223, + "grad_norm": 1.9799143075942993, + "learning_rate": 1.8085768881892673e-06, + "loss": 0.4598, + "step": 65998 + }, + { + "epoch": 0.8250206255156379, + "grad_norm": 0.001242302474565804, + "learning_rate": 1.8080763534812473e-06, + "loss": 0.7919, + "step": 66000 + }, + { + "epoch": 0.8250456261406535, + "grad_norm": 2.5726451873779297, + "learning_rate": 1.8075758811617627e-06, + "loss": 1.403, + "step": 66002 + }, + { + "epoch": 0.8250706267656691, + "grad_norm": 3.009432077407837, + "learning_rate": 1.8070754712346183e-06, + "loss": 0.9047, + "step": 66004 + }, + { + "epoch": 0.8250956273906848, + "grad_norm": 3.305006980895996, + "learning_rate": 1.80657512370363e-06, + "loss": 0.5136, + "step": 66006 + }, + { + "epoch": 0.8251206280157004, + "grad_norm": 4.736882209777832, + "learning_rate": 1.8060748385726046e-06, + "loss": 0.6975, + "step": 66008 + }, + { + "epoch": 0.8251456286407161, + "grad_norm": 1.1874408721923828, + "learning_rate": 1.8055746158453546e-06, + "loss": 0.0315, + "step": 66010 + }, + { + "epoch": 0.8251706292657316, + "grad_norm": 2.990875720977783, + "learning_rate": 1.8050744555256916e-06, + "loss": 0.8659, + "step": 66012 + }, + { + "epoch": 0.8251956298907472, + "grad_norm": 3.8134067058563232, + "learning_rate": 1.8045743576174213e-06, + "loss": 1.3541, + "step": 66014 + }, + { + "epoch": 0.8252206305157629, + "grad_norm": 1.0490130186080933, + "learning_rate": 1.8040743221243574e-06, + "loss": 0.6259, + "step": 66016 + }, + { + "epoch": 0.8252456311407785, + "grad_norm": 3.785101890563965, + "learning_rate": 1.8035743490503e-06, + "loss": 1.2597, + "step": 66018 + }, + { + "epoch": 0.8252706317657942, + "grad_norm": 1.4611998796463013, + "learning_rate": 1.8030744383990617e-06, + "loss": 0.4847, + "step": 66020 + }, + { + "epoch": 0.8252956323908097, + "grad_norm": 1.9573508501052856, + "learning_rate": 1.8025745901744507e-06, + "loss": 1.063, + "step": 66022 + }, + { + "epoch": 0.8253206330158254, + "grad_norm": 3.6639437675476074, + "learning_rate": 1.8020748043802716e-06, + "loss": 1.241, + "step": 66024 + }, + { + "epoch": 0.825345633640841, + "grad_norm": 5.6974382400512695, + "learning_rate": 1.8015750810203326e-06, + "loss": 1.1672, + "step": 66026 + }, + { + "epoch": 0.8253706342658567, + "grad_norm": 4.0876007080078125, + "learning_rate": 1.8010754200984392e-06, + "loss": 1.2067, + "step": 66028 + }, + { + "epoch": 0.8253956348908723, + "grad_norm": 1.2807910442352295, + "learning_rate": 1.8005758216183955e-06, + "loss": 0.1879, + "step": 66030 + }, + { + "epoch": 0.8254206355158878, + "grad_norm": 1.0391544103622437, + "learning_rate": 1.8000762855840048e-06, + "loss": 0.6408, + "step": 66032 + }, + { + "epoch": 0.8254456361409035, + "grad_norm": 2.456223726272583, + "learning_rate": 1.799576811999073e-06, + "loss": 1.4077, + "step": 66034 + }, + { + "epoch": 0.8254706367659191, + "grad_norm": 1.5282635688781738, + "learning_rate": 1.7990774008674073e-06, + "loss": 0.1612, + "step": 66036 + }, + { + "epoch": 0.8254956373909348, + "grad_norm": 7.223811626434326, + "learning_rate": 1.7985780521928053e-06, + "loss": 0.5873, + "step": 66038 + }, + { + "epoch": 0.8255206380159504, + "grad_norm": 3.9219915866851807, + "learning_rate": 1.7980787659790766e-06, + "loss": 0.9542, + "step": 66040 + }, + { + "epoch": 0.825545638640966, + "grad_norm": 3.586407423019409, + "learning_rate": 1.797579542230019e-06, + "loss": 1.3845, + "step": 66042 + }, + { + "epoch": 0.8255706392659816, + "grad_norm": 2.728445053100586, + "learning_rate": 1.7970803809494374e-06, + "loss": 0.2255, + "step": 66044 + }, + { + "epoch": 0.8255956398909973, + "grad_norm": 2.477167844772339, + "learning_rate": 1.7965812821411277e-06, + "loss": 0.6242, + "step": 66046 + }, + { + "epoch": 0.8256206405160129, + "grad_norm": 3.929070234298706, + "learning_rate": 1.7960822458088956e-06, + "loss": 1.4499, + "step": 66048 + }, + { + "epoch": 0.8256456411410286, + "grad_norm": 1.0862293243408203, + "learning_rate": 1.7955832719565424e-06, + "loss": 0.37, + "step": 66050 + }, + { + "epoch": 0.8256706417660441, + "grad_norm": 2.942636251449585, + "learning_rate": 1.7950843605878654e-06, + "loss": 0.8552, + "step": 66052 + }, + { + "epoch": 0.8256956423910597, + "grad_norm": 4.591558456420898, + "learning_rate": 1.7945855117066703e-06, + "loss": 0.7191, + "step": 66054 + }, + { + "epoch": 0.8257206430160754, + "grad_norm": 2.7866921424865723, + "learning_rate": 1.794086725316747e-06, + "loss": 0.5814, + "step": 66056 + }, + { + "epoch": 0.825745643641091, + "grad_norm": 2.403564929962158, + "learning_rate": 1.7935880014218976e-06, + "loss": 0.6564, + "step": 66058 + }, + { + "epoch": 0.8257706442661067, + "grad_norm": 3.4764809608459473, + "learning_rate": 1.7930893400259253e-06, + "loss": 1.0837, + "step": 66060 + }, + { + "epoch": 0.8257956448911222, + "grad_norm": 0.16911309957504272, + "learning_rate": 1.7925907411326203e-06, + "loss": 0.5128, + "step": 66062 + }, + { + "epoch": 0.8258206455161379, + "grad_norm": 2.9043831825256348, + "learning_rate": 1.7920922047457878e-06, + "loss": 1.3687, + "step": 66064 + }, + { + "epoch": 0.8258456461411535, + "grad_norm": 3.313724994659424, + "learning_rate": 1.7915937308692166e-06, + "loss": 1.2128, + "step": 66066 + }, + { + "epoch": 0.8258706467661692, + "grad_norm": 3.5569684505462646, + "learning_rate": 1.7910953195067116e-06, + "loss": 1.021, + "step": 66068 + }, + { + "epoch": 0.8258956473911848, + "grad_norm": 2.3025972843170166, + "learning_rate": 1.7905969706620597e-06, + "loss": 0.7579, + "step": 66070 + }, + { + "epoch": 0.8259206480162004, + "grad_norm": 2.7422258853912354, + "learning_rate": 1.79009868433906e-06, + "loss": 0.9769, + "step": 66072 + }, + { + "epoch": 0.825945648641216, + "grad_norm": 3.2727787494659424, + "learning_rate": 1.7896004605415096e-06, + "loss": 1.6276, + "step": 66074 + }, + { + "epoch": 0.8259706492662316, + "grad_norm": 5.796758651733398, + "learning_rate": 1.7891022992731988e-06, + "loss": 1.2763, + "step": 66076 + }, + { + "epoch": 0.8259956498912473, + "grad_norm": 2.4514362812042236, + "learning_rate": 1.7886042005379268e-06, + "loss": 0.8203, + "step": 66078 + }, + { + "epoch": 0.8260206505162629, + "grad_norm": 2.9821701049804688, + "learning_rate": 1.7881061643394836e-06, + "loss": 0.1147, + "step": 66080 + }, + { + "epoch": 0.8260456511412785, + "grad_norm": 2.9524102210998535, + "learning_rate": 1.78760819068166e-06, + "loss": 0.9648, + "step": 66082 + }, + { + "epoch": 0.8260706517662941, + "grad_norm": 3.034797191619873, + "learning_rate": 1.7871102795682538e-06, + "loss": 0.8033, + "step": 66084 + }, + { + "epoch": 0.8260956523913098, + "grad_norm": 1.160331130027771, + "learning_rate": 1.786612431003052e-06, + "loss": 0.061, + "step": 66086 + }, + { + "epoch": 0.8261206530163254, + "grad_norm": 4.106475353240967, + "learning_rate": 1.7861146449898502e-06, + "loss": 0.9561, + "step": 66088 + }, + { + "epoch": 0.8261456536413411, + "grad_norm": 0.0010141582461073995, + "learning_rate": 1.785616921532436e-06, + "loss": 0.2944, + "step": 66090 + }, + { + "epoch": 0.8261706542663566, + "grad_norm": 4.562253952026367, + "learning_rate": 1.7851192606346046e-06, + "loss": 1.2623, + "step": 66092 + }, + { + "epoch": 0.8261956548913723, + "grad_norm": 6.93940544128418, + "learning_rate": 1.7846216623001422e-06, + "loss": 0.7487, + "step": 66094 + }, + { + "epoch": 0.8262206555163879, + "grad_norm": 3.604530096054077, + "learning_rate": 1.784124126532837e-06, + "loss": 1.264, + "step": 66096 + }, + { + "epoch": 0.8262456561414036, + "grad_norm": 2.326247453689575, + "learning_rate": 1.7836266533364832e-06, + "loss": 0.5586, + "step": 66098 + }, + { + "epoch": 0.8262706567664192, + "grad_norm": 7.041337966918945, + "learning_rate": 1.783129242714865e-06, + "loss": 1.0226, + "step": 66100 + }, + { + "epoch": 0.8262956573914347, + "grad_norm": 2.905385732650757, + "learning_rate": 1.7826318946717746e-06, + "loss": 1.3638, + "step": 66102 + }, + { + "epoch": 0.8263206580164504, + "grad_norm": 2.526149272918701, + "learning_rate": 1.7821346092109958e-06, + "loss": 1.2136, + "step": 66104 + }, + { + "epoch": 0.826345658641466, + "grad_norm": 0.0005488564493134618, + "learning_rate": 1.781637386336319e-06, + "loss": 0.5788, + "step": 66106 + }, + { + "epoch": 0.8263706592664817, + "grad_norm": 3.9365110397338867, + "learning_rate": 1.781140226051531e-06, + "loss": 1.2051, + "step": 66108 + }, + { + "epoch": 0.8263956598914973, + "grad_norm": 3.5368845462799072, + "learning_rate": 1.780643128360413e-06, + "loss": 1.6646, + "step": 66110 + }, + { + "epoch": 0.8264206605165129, + "grad_norm": 0.00041911343578249216, + "learning_rate": 1.7801460932667581e-06, + "loss": 0.1762, + "step": 66112 + }, + { + "epoch": 0.8264456611415285, + "grad_norm": 0.0004502916126511991, + "learning_rate": 1.779649120774345e-06, + "loss": 0.7284, + "step": 66114 + }, + { + "epoch": 0.8264706617665442, + "grad_norm": 21.26226043701172, + "learning_rate": 1.7791522108869641e-06, + "loss": 1.0163, + "step": 66116 + }, + { + "epoch": 0.8264956623915598, + "grad_norm": 5.466238975524902, + "learning_rate": 1.7786553636083958e-06, + "loss": 0.8007, + "step": 66118 + }, + { + "epoch": 0.8265206630165755, + "grad_norm": 5.158237457275391, + "learning_rate": 1.7781585789424271e-06, + "loss": 2.4692, + "step": 66120 + }, + { + "epoch": 0.826545663641591, + "grad_norm": 3.7709896564483643, + "learning_rate": 1.7776618568928405e-06, + "loss": 1.6786, + "step": 66122 + }, + { + "epoch": 0.8265706642666066, + "grad_norm": 2.9857707023620605, + "learning_rate": 1.7771651974634163e-06, + "loss": 0.4593, + "step": 66124 + }, + { + "epoch": 0.8265956648916223, + "grad_norm": 4.990049839019775, + "learning_rate": 1.7766686006579414e-06, + "loss": 1.1838, + "step": 66126 + }, + { + "epoch": 0.8266206655166379, + "grad_norm": 2.1667261123657227, + "learning_rate": 1.7761720664801929e-06, + "loss": 0.063, + "step": 66128 + }, + { + "epoch": 0.8266456661416536, + "grad_norm": 2.224836587905884, + "learning_rate": 1.7756755949339578e-06, + "loss": 1.9407, + "step": 66130 + }, + { + "epoch": 0.8266706667666691, + "grad_norm": 0.0003568749816622585, + "learning_rate": 1.7751791860230116e-06, + "loss": 0.08, + "step": 66132 + }, + { + "epoch": 0.8266956673916848, + "grad_norm": 7.945469856262207, + "learning_rate": 1.7746828397511406e-06, + "loss": 2.251, + "step": 66134 + }, + { + "epoch": 0.8267206680167004, + "grad_norm": 3.3672475814819336, + "learning_rate": 1.7741865561221216e-06, + "loss": 0.4491, + "step": 66136 + }, + { + "epoch": 0.8267456686417161, + "grad_norm": 0.005136319901794195, + "learning_rate": 1.7736903351397317e-06, + "loss": 0.0002, + "step": 66138 + }, + { + "epoch": 0.8267706692667317, + "grad_norm": 2.576514959335327, + "learning_rate": 1.7731941768077555e-06, + "loss": 1.3115, + "step": 66140 + }, + { + "epoch": 0.8267956698917472, + "grad_norm": 4.156780242919922, + "learning_rate": 1.7726980811299665e-06, + "loss": 1.1022, + "step": 66142 + }, + { + "epoch": 0.8268206705167629, + "grad_norm": 3.308375358581543, + "learning_rate": 1.7722020481101488e-06, + "loss": 1.3547, + "step": 66144 + }, + { + "epoch": 0.8268456711417785, + "grad_norm": 2.4877281188964844, + "learning_rate": 1.7717060777520755e-06, + "loss": 0.6102, + "step": 66146 + }, + { + "epoch": 0.8268706717667942, + "grad_norm": 3.6127753257751465, + "learning_rate": 1.771210170059524e-06, + "loss": 2.2812, + "step": 66148 + }, + { + "epoch": 0.8268956723918098, + "grad_norm": 3.106217861175537, + "learning_rate": 1.7707143250362736e-06, + "loss": 1.469, + "step": 66150 + }, + { + "epoch": 0.8269206730168254, + "grad_norm": 3.209839344024658, + "learning_rate": 1.7702185426860975e-06, + "loss": 1.3428, + "step": 66152 + }, + { + "epoch": 0.826945673641841, + "grad_norm": 2.4494576454162598, + "learning_rate": 1.769722823012775e-06, + "loss": 0.4981, + "step": 66154 + }, + { + "epoch": 0.8269706742668567, + "grad_norm": 4.261344909667969, + "learning_rate": 1.7692271660200777e-06, + "loss": 1.0736, + "step": 66156 + }, + { + "epoch": 0.8269956748918723, + "grad_norm": 2.4026482105255127, + "learning_rate": 1.7687315717117838e-06, + "loss": 1.4591, + "step": 66158 + }, + { + "epoch": 0.827020675516888, + "grad_norm": 4.254002571105957, + "learning_rate": 1.7682360400916675e-06, + "loss": 1.1188, + "step": 66160 + }, + { + "epoch": 0.8270456761419035, + "grad_norm": 2.457305431365967, + "learning_rate": 1.7677405711634987e-06, + "loss": 1.3346, + "step": 66162 + }, + { + "epoch": 0.8270706767669191, + "grad_norm": 0.8500862121582031, + "learning_rate": 1.7672451649310552e-06, + "loss": 0.025, + "step": 66164 + }, + { + "epoch": 0.8270956773919348, + "grad_norm": 1.2450928688049316, + "learning_rate": 1.7667498213981071e-06, + "loss": 0.9608, + "step": 66166 + }, + { + "epoch": 0.8271206780169504, + "grad_norm": 3.512540340423584, + "learning_rate": 1.7662545405684294e-06, + "loss": 1.3619, + "step": 66168 + }, + { + "epoch": 0.8271456786419661, + "grad_norm": 0.7929957509040833, + "learning_rate": 1.7657593224457902e-06, + "loss": 0.2479, + "step": 66170 + }, + { + "epoch": 0.8271706792669816, + "grad_norm": 6.4179606437683105, + "learning_rate": 1.7652641670339677e-06, + "loss": 2.2115, + "step": 66172 + }, + { + "epoch": 0.8271956798919973, + "grad_norm": 2.445054769515991, + "learning_rate": 1.7647690743367274e-06, + "loss": 0.5504, + "step": 66174 + }, + { + "epoch": 0.8272206805170129, + "grad_norm": 2.655454158782959, + "learning_rate": 1.7642740443578389e-06, + "loss": 0.4614, + "step": 66176 + }, + { + "epoch": 0.8272456811420286, + "grad_norm": 5.9515252113342285, + "learning_rate": 1.7637790771010777e-06, + "loss": 0.8029, + "step": 66178 + }, + { + "epoch": 0.8272706817670442, + "grad_norm": 8.287944793701172, + "learning_rate": 1.7632841725702066e-06, + "loss": 1.6105, + "step": 66180 + }, + { + "epoch": 0.8272956823920598, + "grad_norm": 5.7265849113464355, + "learning_rate": 1.7627893307690025e-06, + "loss": 1.8533, + "step": 66182 + }, + { + "epoch": 0.8273206830170754, + "grad_norm": 0.00031503415084443986, + "learning_rate": 1.762294551701227e-06, + "loss": 0.4842, + "step": 66184 + }, + { + "epoch": 0.827345683642091, + "grad_norm": 3.9889631271362305, + "learning_rate": 1.7617998353706533e-06, + "loss": 0.5458, + "step": 66186 + }, + { + "epoch": 0.8273706842671067, + "grad_norm": 2.1592650413513184, + "learning_rate": 1.7613051817810468e-06, + "loss": 0.8948, + "step": 66188 + }, + { + "epoch": 0.8273956848921223, + "grad_norm": 3.0724987983703613, + "learning_rate": 1.7608105909361739e-06, + "loss": 0.3771, + "step": 66190 + }, + { + "epoch": 0.8274206855171379, + "grad_norm": 0.0033177309669554234, + "learning_rate": 1.760316062839804e-06, + "loss": 0.0004, + "step": 66192 + }, + { + "epoch": 0.8274456861421535, + "grad_norm": 3.1663460731506348, + "learning_rate": 1.7598215974956989e-06, + "loss": 1.1345, + "step": 66194 + }, + { + "epoch": 0.8274706867671692, + "grad_norm": 4.8106608390808105, + "learning_rate": 1.7593271949076307e-06, + "loss": 1.0943, + "step": 66196 + }, + { + "epoch": 0.8274956873921848, + "grad_norm": 2.030607223510742, + "learning_rate": 1.7588328550793599e-06, + "loss": 0.1422, + "step": 66198 + }, + { + "epoch": 0.8275206880172005, + "grad_norm": 3.691429853439331, + "learning_rate": 1.758338578014651e-06, + "loss": 0.9422, + "step": 66200 + }, + { + "epoch": 0.827545688642216, + "grad_norm": 0.15952126681804657, + "learning_rate": 1.7578443637172726e-06, + "loss": 0.0961, + "step": 66202 + }, + { + "epoch": 0.8275706892672317, + "grad_norm": 4.321213245391846, + "learning_rate": 1.7573502121909836e-06, + "loss": 0.9255, + "step": 66204 + }, + { + "epoch": 0.8275956898922473, + "grad_norm": 3.768474817276001, + "learning_rate": 1.7568561234395521e-06, + "loss": 0.6756, + "step": 66206 + }, + { + "epoch": 0.827620690517263, + "grad_norm": 0.00023474845511373132, + "learning_rate": 1.7563620974667362e-06, + "loss": 0.7081, + "step": 66208 + }, + { + "epoch": 0.8276456911422786, + "grad_norm": 8.048026084899902, + "learning_rate": 1.7558681342763028e-06, + "loss": 1.3192, + "step": 66210 + }, + { + "epoch": 0.8276706917672941, + "grad_norm": 1.4570318460464478, + "learning_rate": 1.7553742338720126e-06, + "loss": 0.6692, + "step": 66212 + }, + { + "epoch": 0.8276956923923098, + "grad_norm": 1.6250756978988647, + "learning_rate": 1.7548803962576233e-06, + "loss": 0.9969, + "step": 66214 + }, + { + "epoch": 0.8277206930173254, + "grad_norm": 0.6536098122596741, + "learning_rate": 1.754386621436902e-06, + "loss": 0.875, + "step": 66216 + }, + { + "epoch": 0.8277456936423411, + "grad_norm": 3.8641374111175537, + "learning_rate": 1.7538929094136037e-06, + "loss": 0.6909, + "step": 66218 + }, + { + "epoch": 0.8277706942673567, + "grad_norm": 3.7668416500091553, + "learning_rate": 1.7533992601914928e-06, + "loss": 1.395, + "step": 66220 + }, + { + "epoch": 0.8277956948923723, + "grad_norm": 2.372798442840576, + "learning_rate": 1.7529056737743244e-06, + "loss": 0.7555, + "step": 66222 + }, + { + "epoch": 0.8278206955173879, + "grad_norm": 5.555505752563477, + "learning_rate": 1.752412150165863e-06, + "loss": 0.8653, + "step": 66224 + }, + { + "epoch": 0.8278456961424036, + "grad_norm": 4.741435527801514, + "learning_rate": 1.7519186893698648e-06, + "loss": 0.8444, + "step": 66226 + }, + { + "epoch": 0.8278706967674192, + "grad_norm": 0.0003633357409853488, + "learning_rate": 1.7514252913900843e-06, + "loss": 0.9668, + "step": 66228 + }, + { + "epoch": 0.8278956973924348, + "grad_norm": 4.197598934173584, + "learning_rate": 1.750931956230285e-06, + "loss": 1.4727, + "step": 66230 + }, + { + "epoch": 0.8279206980174504, + "grad_norm": 4.215932846069336, + "learning_rate": 1.7504386838942189e-06, + "loss": 1.374, + "step": 66232 + }, + { + "epoch": 0.827945698642466, + "grad_norm": 2.8972415924072266, + "learning_rate": 1.749945474385647e-06, + "loss": 1.2369, + "step": 66234 + }, + { + "epoch": 0.8279706992674817, + "grad_norm": 3.6496567726135254, + "learning_rate": 1.7494523277083219e-06, + "loss": 0.7878, + "step": 66236 + }, + { + "epoch": 0.8279956998924973, + "grad_norm": 2.8450028896331787, + "learning_rate": 1.7489592438660041e-06, + "loss": 0.6035, + "step": 66238 + }, + { + "epoch": 0.828020700517513, + "grad_norm": 4.6264472007751465, + "learning_rate": 1.7484662228624449e-06, + "loss": 0.9999, + "step": 66240 + }, + { + "epoch": 0.8280457011425285, + "grad_norm": 6.110164642333984, + "learning_rate": 1.7479732647013992e-06, + "loss": 0.9245, + "step": 66242 + }, + { + "epoch": 0.8280707017675442, + "grad_norm": 4.142098426818848, + "learning_rate": 1.7474803693866227e-06, + "loss": 1.8778, + "step": 66244 + }, + { + "epoch": 0.8280957023925598, + "grad_norm": 3.318638801574707, + "learning_rate": 1.7469875369218669e-06, + "loss": 0.9011, + "step": 66246 + }, + { + "epoch": 0.8281207030175755, + "grad_norm": 4.896227836608887, + "learning_rate": 1.74649476731089e-06, + "loss": 0.7868, + "step": 66248 + }, + { + "epoch": 0.8281457036425911, + "grad_norm": 3.9318385124206543, + "learning_rate": 1.7460020605574412e-06, + "loss": 0.5852, + "step": 66250 + }, + { + "epoch": 0.8281707042676066, + "grad_norm": 3.7146694660186768, + "learning_rate": 1.7455094166652719e-06, + "loss": 0.557, + "step": 66252 + }, + { + "epoch": 0.8281957048926223, + "grad_norm": 4.773582935333252, + "learning_rate": 1.745016835638137e-06, + "loss": 1.3409, + "step": 66254 + }, + { + "epoch": 0.8282207055176379, + "grad_norm": 2.841383218765259, + "learning_rate": 1.7445243174797844e-06, + "loss": 0.4709, + "step": 66256 + }, + { + "epoch": 0.8282457061426536, + "grad_norm": 2.005887269973755, + "learning_rate": 1.74403186219397e-06, + "loss": 0.8393, + "step": 66258 + }, + { + "epoch": 0.8282707067676692, + "grad_norm": 0.0003127622476313263, + "learning_rate": 1.7435394697844387e-06, + "loss": 1.0536, + "step": 66260 + }, + { + "epoch": 0.8282957073926848, + "grad_norm": 0.0004779692681040615, + "learning_rate": 1.7430471402549455e-06, + "loss": 0.7374, + "step": 66262 + }, + { + "epoch": 0.8283207080177004, + "grad_norm": 2.302769660949707, + "learning_rate": 1.742554873609238e-06, + "loss": 1.1669, + "step": 66264 + }, + { + "epoch": 0.8283457086427161, + "grad_norm": 4.362380027770996, + "learning_rate": 1.7420626698510612e-06, + "loss": 1.6588, + "step": 66266 + }, + { + "epoch": 0.8283707092677317, + "grad_norm": 2.232682704925537, + "learning_rate": 1.7415705289841712e-06, + "loss": 0.2136, + "step": 66268 + }, + { + "epoch": 0.8283957098927474, + "grad_norm": 8.56028938293457, + "learning_rate": 1.741078451012309e-06, + "loss": 1.2808, + "step": 66270 + }, + { + "epoch": 0.8284207105177629, + "grad_norm": 0.00045398916699923575, + "learning_rate": 1.7405864359392288e-06, + "loss": 0.1131, + "step": 66272 + }, + { + "epoch": 0.8284457111427785, + "grad_norm": 2.369910478591919, + "learning_rate": 1.7400944837686706e-06, + "loss": 0.3759, + "step": 66274 + }, + { + "epoch": 0.8284707117677942, + "grad_norm": 6.081723690032959, + "learning_rate": 1.7396025945043881e-06, + "loss": 1.8068, + "step": 66276 + }, + { + "epoch": 0.8284957123928098, + "grad_norm": 1.8704966306686401, + "learning_rate": 1.739110768150124e-06, + "loss": 0.4313, + "step": 66278 + }, + { + "epoch": 0.8285207130178255, + "grad_norm": 2.3956472873687744, + "learning_rate": 1.7386190047096218e-06, + "loss": 1.2042, + "step": 66280 + }, + { + "epoch": 0.828545713642841, + "grad_norm": 0.0003071495739277452, + "learning_rate": 1.7381273041866309e-06, + "loss": 0.5447, + "step": 66282 + }, + { + "epoch": 0.8285707142678567, + "grad_norm": 3.0992650985717773, + "learning_rate": 1.7376356665848914e-06, + "loss": 1.2585, + "step": 66284 + }, + { + "epoch": 0.8285957148928723, + "grad_norm": 5.424062252044678, + "learning_rate": 1.737144091908154e-06, + "loss": 0.148, + "step": 66286 + }, + { + "epoch": 0.828620715517888, + "grad_norm": 0.0006461080629378557, + "learning_rate": 1.7366525801601553e-06, + "loss": 0.5921, + "step": 66288 + }, + { + "epoch": 0.8286457161429036, + "grad_norm": 7.389684677124023, + "learning_rate": 1.7361611313446447e-06, + "loss": 0.2865, + "step": 66290 + }, + { + "epoch": 0.8286707167679191, + "grad_norm": 6.874575614929199, + "learning_rate": 1.7356697454653614e-06, + "loss": 1.2524, + "step": 66292 + }, + { + "epoch": 0.8286957173929348, + "grad_norm": 3.2411725521087646, + "learning_rate": 1.7351784225260481e-06, + "loss": 0.9164, + "step": 66294 + }, + { + "epoch": 0.8287207180179504, + "grad_norm": 0.0003004029858857393, + "learning_rate": 1.7346871625304484e-06, + "loss": 0.9618, + "step": 66296 + }, + { + "epoch": 0.8287457186429661, + "grad_norm": 0.009163287468254566, + "learning_rate": 1.7341959654823005e-06, + "loss": 0.6011, + "step": 66298 + }, + { + "epoch": 0.8287707192679817, + "grad_norm": 2.742021322250366, + "learning_rate": 1.7337048313853488e-06, + "loss": 0.9911, + "step": 66300 + }, + { + "epoch": 0.8287957198929973, + "grad_norm": 0.0003823024744633585, + "learning_rate": 1.733213760243333e-06, + "loss": 0.378, + "step": 66302 + }, + { + "epoch": 0.8288207205180129, + "grad_norm": 3.1326394081115723, + "learning_rate": 1.7327227520599887e-06, + "loss": 1.293, + "step": 66304 + }, + { + "epoch": 0.8288457211430286, + "grad_norm": 2.65275239944458, + "learning_rate": 1.732231806839062e-06, + "loss": 1.0181, + "step": 66306 + }, + { + "epoch": 0.8288707217680442, + "grad_norm": 7.991662979125977, + "learning_rate": 1.7317409245842864e-06, + "loss": 1.0879, + "step": 66308 + }, + { + "epoch": 0.8288957223930599, + "grad_norm": 0.015284408815205097, + "learning_rate": 1.7312501052994058e-06, + "loss": 0.0132, + "step": 66310 + }, + { + "epoch": 0.8289207230180754, + "grad_norm": 6.331485748291016, + "learning_rate": 1.7307593489881513e-06, + "loss": 2.0583, + "step": 66312 + }, + { + "epoch": 0.828945723643091, + "grad_norm": 5.091580390930176, + "learning_rate": 1.7302686556542646e-06, + "loss": 0.9554, + "step": 66314 + }, + { + "epoch": 0.8289707242681067, + "grad_norm": 0.0004444293736014515, + "learning_rate": 1.7297780253014874e-06, + "loss": 0.027, + "step": 66316 + }, + { + "epoch": 0.8289957248931223, + "grad_norm": 2.0470077991485596, + "learning_rate": 1.7292874579335462e-06, + "loss": 1.1273, + "step": 66318 + }, + { + "epoch": 0.829020725518138, + "grad_norm": 5.198929309844971, + "learning_rate": 1.7287969535541848e-06, + "loss": 1.0792, + "step": 66320 + }, + { + "epoch": 0.8290457261431535, + "grad_norm": 3.9207355976104736, + "learning_rate": 1.728306512167134e-06, + "loss": 0.9309, + "step": 66322 + }, + { + "epoch": 0.8290707267681692, + "grad_norm": 4.592631816864014, + "learning_rate": 1.7278161337761335e-06, + "loss": 2.009, + "step": 66324 + }, + { + "epoch": 0.8290957273931848, + "grad_norm": 2.8085243701934814, + "learning_rate": 1.7273258183849117e-06, + "loss": 0.4783, + "step": 66326 + }, + { + "epoch": 0.8291207280182005, + "grad_norm": 3.7971150875091553, + "learning_rate": 1.7268355659972103e-06, + "loss": 1.2436, + "step": 66328 + }, + { + "epoch": 0.8291457286432161, + "grad_norm": 0.00039248395478352904, + "learning_rate": 1.7263453766167594e-06, + "loss": 0.0, + "step": 66330 + }, + { + "epoch": 0.8291707292682317, + "grad_norm": 3.4957194328308105, + "learning_rate": 1.725855250247289e-06, + "loss": 0.7057, + "step": 66332 + }, + { + "epoch": 0.8291957298932473, + "grad_norm": 1.6911122798919678, + "learning_rate": 1.725365186892538e-06, + "loss": 0.5049, + "step": 66334 + }, + { + "epoch": 0.829220730518263, + "grad_norm": 1.9648711681365967, + "learning_rate": 1.724875186556233e-06, + "loss": 0.7707, + "step": 66336 + }, + { + "epoch": 0.8292457311432786, + "grad_norm": 3.4181177616119385, + "learning_rate": 1.7243852492421086e-06, + "loss": 0.9331, + "step": 66338 + }, + { + "epoch": 0.8292707317682942, + "grad_norm": 4.778432846069336, + "learning_rate": 1.7238953749538979e-06, + "loss": 1.2962, + "step": 66340 + }, + { + "epoch": 0.8292957323933098, + "grad_norm": 3.9207589626312256, + "learning_rate": 1.7234055636953296e-06, + "loss": 1.2509, + "step": 66342 + }, + { + "epoch": 0.8293207330183254, + "grad_norm": 3.5353455543518066, + "learning_rate": 1.7229158154701341e-06, + "loss": 1.2368, + "step": 66344 + }, + { + "epoch": 0.8293457336433411, + "grad_norm": 2.1495521068573, + "learning_rate": 1.7224261302820388e-06, + "loss": 1.004, + "step": 66346 + }, + { + "epoch": 0.8293707342683567, + "grad_norm": 3.2022626399993896, + "learning_rate": 1.7219365081347783e-06, + "loss": 0.9019, + "step": 66348 + }, + { + "epoch": 0.8293957348933724, + "grad_norm": 3.3026578426361084, + "learning_rate": 1.7214469490320762e-06, + "loss": 1.2434, + "step": 66350 + }, + { + "epoch": 0.8294207355183879, + "grad_norm": 4.215468406677246, + "learning_rate": 1.720957452977664e-06, + "loss": 0.3818, + "step": 66352 + }, + { + "epoch": 0.8294457361434036, + "grad_norm": 2.731447219848633, + "learning_rate": 1.720468019975271e-06, + "loss": 0.8429, + "step": 66354 + }, + { + "epoch": 0.8294707367684192, + "grad_norm": 1.1076889038085938, + "learning_rate": 1.719978650028622e-06, + "loss": 0.5579, + "step": 66356 + }, + { + "epoch": 0.8294957373934349, + "grad_norm": 2.615739583969116, + "learning_rate": 1.719489343141446e-06, + "loss": 0.6356, + "step": 66358 + }, + { + "epoch": 0.8295207380184505, + "grad_norm": 0.00045899272663518786, + "learning_rate": 1.719000099317465e-06, + "loss": 0.7774, + "step": 66360 + }, + { + "epoch": 0.829545738643466, + "grad_norm": 3.5868680477142334, + "learning_rate": 1.718510918560411e-06, + "loss": 0.2302, + "step": 66362 + }, + { + "epoch": 0.8295707392684817, + "grad_norm": 0.9750949740409851, + "learning_rate": 1.7180218008740035e-06, + "loss": 0.1191, + "step": 66364 + }, + { + "epoch": 0.8295957398934973, + "grad_norm": 4.2162957191467285, + "learning_rate": 1.7175327462619717e-06, + "loss": 0.8941, + "step": 66366 + }, + { + "epoch": 0.829620740518513, + "grad_norm": 3.0059914588928223, + "learning_rate": 1.7170437547280438e-06, + "loss": 0.8778, + "step": 66368 + }, + { + "epoch": 0.8296457411435286, + "grad_norm": 5.254086971282959, + "learning_rate": 1.7165548262759335e-06, + "loss": 1.7399, + "step": 66370 + }, + { + "epoch": 0.8296707417685442, + "grad_norm": 5.4212727546691895, + "learning_rate": 1.7160659609093743e-06, + "loss": 1.1956, + "step": 66372 + }, + { + "epoch": 0.8296957423935598, + "grad_norm": 2.4975149631500244, + "learning_rate": 1.7155771586320823e-06, + "loss": 0.4242, + "step": 66374 + }, + { + "epoch": 0.8297207430185755, + "grad_norm": 0.031797632575035095, + "learning_rate": 1.7150884194477824e-06, + "loss": 0.0002, + "step": 66376 + }, + { + "epoch": 0.8297457436435911, + "grad_norm": 3.0807583332061768, + "learning_rate": 1.7145997433602013e-06, + "loss": 0.45, + "step": 66378 + }, + { + "epoch": 0.8297707442686068, + "grad_norm": 0.9675164818763733, + "learning_rate": 1.714111130373053e-06, + "loss": 1.0841, + "step": 66380 + }, + { + "epoch": 0.8297957448936223, + "grad_norm": 2.780871868133545, + "learning_rate": 1.7136225804900686e-06, + "loss": 0.882, + "step": 66382 + }, + { + "epoch": 0.8298207455186379, + "grad_norm": 3.8343288898468018, + "learning_rate": 1.7131340937149577e-06, + "loss": 1.4736, + "step": 66384 + }, + { + "epoch": 0.8298457461436536, + "grad_norm": 3.57808780670166, + "learning_rate": 1.7126456700514481e-06, + "loss": 0.9175, + "step": 66386 + }, + { + "epoch": 0.8298707467686692, + "grad_norm": 5.115841865539551, + "learning_rate": 1.712157309503254e-06, + "loss": 0.5254, + "step": 66388 + }, + { + "epoch": 0.8298957473936849, + "grad_norm": 1.9739091396331787, + "learning_rate": 1.7116690120740985e-06, + "loss": 0.7165, + "step": 66390 + }, + { + "epoch": 0.8299207480187004, + "grad_norm": 0.012628596276044846, + "learning_rate": 1.7111807777677024e-06, + "loss": 0.8029, + "step": 66392 + }, + { + "epoch": 0.8299457486437161, + "grad_norm": 2.547518491744995, + "learning_rate": 1.7106926065877806e-06, + "loss": 0.9247, + "step": 66394 + }, + { + "epoch": 0.8299707492687317, + "grad_norm": 4.110832214355469, + "learning_rate": 1.7102044985380518e-06, + "loss": 1.1458, + "step": 66396 + }, + { + "epoch": 0.8299957498937474, + "grad_norm": 0.00029846435063518584, + "learning_rate": 1.709716453622231e-06, + "loss": 0.0249, + "step": 66398 + }, + { + "epoch": 0.830020750518763, + "grad_norm": 2.506603956222534, + "learning_rate": 1.7092284718440378e-06, + "loss": 1.4139, + "step": 66400 + }, + { + "epoch": 0.8300457511437785, + "grad_norm": 2.9510557651519775, + "learning_rate": 1.7087405532071899e-06, + "loss": 1.5225, + "step": 66402 + }, + { + "epoch": 0.8300707517687942, + "grad_norm": 4.0300397872924805, + "learning_rate": 1.7082526977154e-06, + "loss": 0.8364, + "step": 66404 + }, + { + "epoch": 0.8300957523938098, + "grad_norm": 9.66238784790039, + "learning_rate": 1.7077649053723866e-06, + "loss": 0.8679, + "step": 66406 + }, + { + "epoch": 0.8301207530188255, + "grad_norm": 3.2651729583740234, + "learning_rate": 1.707277176181863e-06, + "loss": 1.0055, + "step": 66408 + }, + { + "epoch": 0.8301457536438411, + "grad_norm": 2.4432077407836914, + "learning_rate": 1.706789510147544e-06, + "loss": 0.4415, + "step": 66410 + }, + { + "epoch": 0.8301707542688567, + "grad_norm": 4.360109329223633, + "learning_rate": 1.7063019072731413e-06, + "loss": 1.4222, + "step": 66412 + }, + { + "epoch": 0.8301957548938723, + "grad_norm": 0.0002474311622790992, + "learning_rate": 1.7058143675623695e-06, + "loss": 0.8561, + "step": 66414 + }, + { + "epoch": 0.830220755518888, + "grad_norm": 1.8046168088912964, + "learning_rate": 1.7053268910189458e-06, + "loss": 0.2589, + "step": 66416 + }, + { + "epoch": 0.8302457561439036, + "grad_norm": 0.1232917457818985, + "learning_rate": 1.704839477646577e-06, + "loss": 0.5263, + "step": 66418 + }, + { + "epoch": 0.8302707567689193, + "grad_norm": 0.0002284643123857677, + "learning_rate": 1.7043521274489817e-06, + "loss": 0.3699, + "step": 66420 + }, + { + "epoch": 0.8302957573939348, + "grad_norm": 3.9601974487304688, + "learning_rate": 1.7038648404298642e-06, + "loss": 1.6504, + "step": 66422 + }, + { + "epoch": 0.8303207580189504, + "grad_norm": 3.3443198204040527, + "learning_rate": 1.7033776165929393e-06, + "loss": 0.7521, + "step": 66424 + }, + { + "epoch": 0.8303457586439661, + "grad_norm": 2.951314687728882, + "learning_rate": 1.7028904559419168e-06, + "loss": 0.7055, + "step": 66426 + }, + { + "epoch": 0.8303707592689817, + "grad_norm": 6.172350883483887, + "learning_rate": 1.7024033584805056e-06, + "loss": 1.5841, + "step": 66428 + }, + { + "epoch": 0.8303957598939974, + "grad_norm": 3.423043966293335, + "learning_rate": 1.7019163242124203e-06, + "loss": 0.9022, + "step": 66430 + }, + { + "epoch": 0.8304207605190129, + "grad_norm": 2.528848648071289, + "learning_rate": 1.7014293531413651e-06, + "loss": 0.8501, + "step": 66432 + }, + { + "epoch": 0.8304457611440286, + "grad_norm": 5.524026393890381, + "learning_rate": 1.7009424452710544e-06, + "loss": 0.4314, + "step": 66434 + }, + { + "epoch": 0.8304707617690442, + "grad_norm": 3.5888214111328125, + "learning_rate": 1.7004556006051876e-06, + "loss": 1.4747, + "step": 66436 + }, + { + "epoch": 0.8304957623940599, + "grad_norm": 2.882563829421997, + "learning_rate": 1.6999688191474772e-06, + "loss": 0.8285, + "step": 66438 + }, + { + "epoch": 0.8305207630190755, + "grad_norm": 4.347578525543213, + "learning_rate": 1.6994821009016328e-06, + "loss": 0.6519, + "step": 66440 + }, + { + "epoch": 0.830545763644091, + "grad_norm": 1.9237463474273682, + "learning_rate": 1.6989954458713575e-06, + "loss": 1.3121, + "step": 66442 + }, + { + "epoch": 0.8305707642691067, + "grad_norm": 3.6506905555725098, + "learning_rate": 1.69850885406036e-06, + "loss": 0.3722, + "step": 66444 + }, + { + "epoch": 0.8305957648941223, + "grad_norm": 4.425179958343506, + "learning_rate": 1.698022325472346e-06, + "loss": 1.1377, + "step": 66446 + }, + { + "epoch": 0.830620765519138, + "grad_norm": 3.7813832759857178, + "learning_rate": 1.6975358601110203e-06, + "loss": 0.7141, + "step": 66448 + }, + { + "epoch": 0.8306457661441536, + "grad_norm": 3.4759151935577393, + "learning_rate": 1.6970494579800844e-06, + "loss": 1.5855, + "step": 66450 + }, + { + "epoch": 0.8306707667691692, + "grad_norm": 0.00046369724441319704, + "learning_rate": 1.6965631190832455e-06, + "loss": 0.3767, + "step": 66452 + }, + { + "epoch": 0.8306957673941848, + "grad_norm": 3.040539264678955, + "learning_rate": 1.6960768434242103e-06, + "loss": 1.408, + "step": 66454 + }, + { + "epoch": 0.8307207680192005, + "grad_norm": 0.00032395657035522163, + "learning_rate": 1.6955906310066773e-06, + "loss": 0.0002, + "step": 66456 + }, + { + "epoch": 0.8307457686442161, + "grad_norm": 4.044501304626465, + "learning_rate": 1.6951044818343543e-06, + "loss": 0.8536, + "step": 66458 + }, + { + "epoch": 0.8307707692692318, + "grad_norm": 3.614011526107788, + "learning_rate": 1.6946183959109407e-06, + "loss": 1.6046, + "step": 66460 + }, + { + "epoch": 0.8307957698942473, + "grad_norm": 3.2911128997802734, + "learning_rate": 1.694132373240137e-06, + "loss": 0.9941, + "step": 66462 + }, + { + "epoch": 0.830820770519263, + "grad_norm": 3.8970274925231934, + "learning_rate": 1.6936464138256493e-06, + "loss": 0.7577, + "step": 66464 + }, + { + "epoch": 0.8308457711442786, + "grad_norm": 3.1633145809173584, + "learning_rate": 1.693160517671173e-06, + "loss": 0.5637, + "step": 66466 + }, + { + "epoch": 0.8308707717692942, + "grad_norm": 0.00037654850166291, + "learning_rate": 1.6926746847804131e-06, + "loss": 0.874, + "step": 66468 + }, + { + "epoch": 0.8308957723943099, + "grad_norm": 0.6089878082275391, + "learning_rate": 1.692188915157067e-06, + "loss": 0.0219, + "step": 66470 + }, + { + "epoch": 0.8309207730193254, + "grad_norm": 5.460864543914795, + "learning_rate": 1.691703208804839e-06, + "loss": 0.5998, + "step": 66472 + }, + { + "epoch": 0.8309457736443411, + "grad_norm": 0.00042150882654823363, + "learning_rate": 1.691217565727421e-06, + "loss": 0.0, + "step": 66474 + }, + { + "epoch": 0.8309707742693567, + "grad_norm": 0.0003524583007674664, + "learning_rate": 1.6907319859285131e-06, + "loss": 1.3542, + "step": 66476 + }, + { + "epoch": 0.8309957748943724, + "grad_norm": 4.546672344207764, + "learning_rate": 1.6902464694118192e-06, + "loss": 0.8979, + "step": 66478 + }, + { + "epoch": 0.831020775519388, + "grad_norm": 1.8897687196731567, + "learning_rate": 1.6897610161810308e-06, + "loss": 1.2349, + "step": 66480 + }, + { + "epoch": 0.8310457761444036, + "grad_norm": 4.9263153076171875, + "learning_rate": 1.689275626239849e-06, + "loss": 0.9246, + "step": 66482 + }, + { + "epoch": 0.8310707767694192, + "grad_norm": 2.456345319747925, + "learning_rate": 1.6887902995919669e-06, + "loss": 0.6747, + "step": 66484 + }, + { + "epoch": 0.8310957773944349, + "grad_norm": 3.2852678298950195, + "learning_rate": 1.6883050362410868e-06, + "loss": 1.3256, + "step": 66486 + }, + { + "epoch": 0.8311207780194505, + "grad_norm": 0.1899346262216568, + "learning_rate": 1.687819836190896e-06, + "loss": 1.0696, + "step": 66488 + }, + { + "epoch": 0.8311457786444661, + "grad_norm": 3.4656851291656494, + "learning_rate": 1.6873346994450935e-06, + "loss": 1.0905, + "step": 66490 + }, + { + "epoch": 0.8311707792694817, + "grad_norm": 5.260101318359375, + "learning_rate": 1.6868496260073763e-06, + "loss": 1.526, + "step": 66492 + }, + { + "epoch": 0.8311957798944973, + "grad_norm": 1.0082343816757202, + "learning_rate": 1.6863646158814351e-06, + "loss": 0.6657, + "step": 66494 + }, + { + "epoch": 0.831220780519513, + "grad_norm": 2.9529662132263184, + "learning_rate": 1.6858796690709679e-06, + "loss": 1.3052, + "step": 66496 + }, + { + "epoch": 0.8312457811445286, + "grad_norm": 2.6266872882843018, + "learning_rate": 1.685394785579665e-06, + "loss": 0.5173, + "step": 66498 + }, + { + "epoch": 0.8312707817695443, + "grad_norm": 3.969957113265991, + "learning_rate": 1.6849099654112178e-06, + "loss": 0.8852, + "step": 66500 + }, + { + "epoch": 0.8312957823945598, + "grad_norm": 1.8641437292099, + "learning_rate": 1.6844252085693214e-06, + "loss": 0.4182, + "step": 66502 + }, + { + "epoch": 0.8313207830195755, + "grad_norm": 5.11187744140625, + "learning_rate": 1.683940515057666e-06, + "loss": 0.8588, + "step": 66504 + }, + { + "epoch": 0.8313457836445911, + "grad_norm": 3.1305558681488037, + "learning_rate": 1.6834558848799454e-06, + "loss": 0.6616, + "step": 66506 + }, + { + "epoch": 0.8313707842696068, + "grad_norm": 2.684718370437622, + "learning_rate": 1.6829713180398454e-06, + "loss": 0.2198, + "step": 66508 + }, + { + "epoch": 0.8313957848946224, + "grad_norm": 1.5763319730758667, + "learning_rate": 1.6824868145410633e-06, + "loss": 0.6529, + "step": 66510 + }, + { + "epoch": 0.8314207855196379, + "grad_norm": 6.140044212341309, + "learning_rate": 1.682002374387285e-06, + "loss": 0.6432, + "step": 66512 + }, + { + "epoch": 0.8314457861446536, + "grad_norm": 5.1661224365234375, + "learning_rate": 1.6815179975821982e-06, + "loss": 1.3186, + "step": 66514 + }, + { + "epoch": 0.8314707867696692, + "grad_norm": 0.001949392375536263, + "learning_rate": 1.6810336841294962e-06, + "loss": 0.1164, + "step": 66516 + }, + { + "epoch": 0.8314957873946849, + "grad_norm": 2.448187828063965, + "learning_rate": 1.6805494340328621e-06, + "loss": 0.4557, + "step": 66518 + }, + { + "epoch": 0.8315207880197005, + "grad_norm": 3.593759536743164, + "learning_rate": 1.68006524729599e-06, + "loss": 1.1522, + "step": 66520 + }, + { + "epoch": 0.8315457886447161, + "grad_norm": 4.5961384773254395, + "learning_rate": 1.6795811239225622e-06, + "loss": 0.4189, + "step": 66522 + }, + { + "epoch": 0.8315707892697317, + "grad_norm": 3.0743088722229004, + "learning_rate": 1.6790970639162707e-06, + "loss": 1.9811, + "step": 66524 + }, + { + "epoch": 0.8315957898947474, + "grad_norm": 1.121841311454773, + "learning_rate": 1.6786130672807988e-06, + "loss": 0.0194, + "step": 66526 + }, + { + "epoch": 0.831620790519763, + "grad_norm": 2.762308120727539, + "learning_rate": 1.6781291340198313e-06, + "loss": 0.4903, + "step": 66528 + }, + { + "epoch": 0.8316457911447787, + "grad_norm": 2.0901410579681396, + "learning_rate": 1.6776452641370566e-06, + "loss": 0.1179, + "step": 66530 + }, + { + "epoch": 0.8316707917697942, + "grad_norm": 4.906775951385498, + "learning_rate": 1.6771614576361572e-06, + "loss": 1.6267, + "step": 66532 + }, + { + "epoch": 0.8316957923948098, + "grad_norm": 0.13457486033439636, + "learning_rate": 1.6766777145208213e-06, + "loss": 0.6167, + "step": 66534 + }, + { + "epoch": 0.8317207930198255, + "grad_norm": 2.702103614807129, + "learning_rate": 1.6761940347947292e-06, + "loss": 0.9662, + "step": 66536 + }, + { + "epoch": 0.8317457936448411, + "grad_norm": 2.5344431400299072, + "learning_rate": 1.675710418461568e-06, + "loss": 0.2875, + "step": 66538 + }, + { + "epoch": 0.8317707942698568, + "grad_norm": 2.411036252975464, + "learning_rate": 1.6752268655250192e-06, + "loss": 0.8832, + "step": 66540 + }, + { + "epoch": 0.8317957948948723, + "grad_norm": 4.7490925788879395, + "learning_rate": 1.674743375988762e-06, + "loss": 1.0066, + "step": 66542 + }, + { + "epoch": 0.831820795519888, + "grad_norm": 0.048528581857681274, + "learning_rate": 1.674259949856486e-06, + "loss": 1.1236, + "step": 66544 + }, + { + "epoch": 0.8318457961449036, + "grad_norm": 3.3849339485168457, + "learning_rate": 1.6737765871318657e-06, + "loss": 0.9602, + "step": 66546 + }, + { + "epoch": 0.8318707967699193, + "grad_norm": 4.709310531616211, + "learning_rate": 1.673293287818587e-06, + "loss": 0.8476, + "step": 66548 + }, + { + "epoch": 0.8318957973949349, + "grad_norm": 2.3476669788360596, + "learning_rate": 1.6728100519203306e-06, + "loss": 0.7058, + "step": 66550 + }, + { + "epoch": 0.8319207980199504, + "grad_norm": 0.00032299928716383874, + "learning_rate": 1.672326879440772e-06, + "loss": 0.0436, + "step": 66552 + }, + { + "epoch": 0.8319457986449661, + "grad_norm": 3.968532085418701, + "learning_rate": 1.6718437703835966e-06, + "loss": 1.2537, + "step": 66554 + }, + { + "epoch": 0.8319707992699817, + "grad_norm": 5.247161388397217, + "learning_rate": 1.67136072475248e-06, + "loss": 1.2739, + "step": 66556 + }, + { + "epoch": 0.8319957998949974, + "grad_norm": 3.2398898601531982, + "learning_rate": 1.6708777425511037e-06, + "loss": 0.5366, + "step": 66558 + }, + { + "epoch": 0.832020800520013, + "grad_norm": 2.0730717182159424, + "learning_rate": 1.6703948237831424e-06, + "loss": 0.6861, + "step": 66560 + }, + { + "epoch": 0.8320458011450286, + "grad_norm": 0.8027196526527405, + "learning_rate": 1.669911968452279e-06, + "loss": 0.2765, + "step": 66562 + }, + { + "epoch": 0.8320708017700442, + "grad_norm": 2.620802640914917, + "learning_rate": 1.6694291765621872e-06, + "loss": 1.0289, + "step": 66564 + }, + { + "epoch": 0.8320958023950599, + "grad_norm": 0.0006784051656723022, + "learning_rate": 1.668946448116543e-06, + "loss": 0.0431, + "step": 66566 + }, + { + "epoch": 0.8321208030200755, + "grad_norm": 0.0002455572539474815, + "learning_rate": 1.6684637831190276e-06, + "loss": 0.0001, + "step": 66568 + }, + { + "epoch": 0.8321458036450912, + "grad_norm": 2.5573651790618896, + "learning_rate": 1.6679811815733104e-06, + "loss": 1.2838, + "step": 66570 + }, + { + "epoch": 0.8321708042701067, + "grad_norm": 2.4986321926116943, + "learning_rate": 1.667498643483073e-06, + "loss": 0.9298, + "step": 66572 + }, + { + "epoch": 0.8321958048951223, + "grad_norm": 6.417139053344727, + "learning_rate": 1.6670161688519848e-06, + "loss": 1.139, + "step": 66574 + }, + { + "epoch": 0.832220805520138, + "grad_norm": 3.4455814361572266, + "learning_rate": 1.6665337576837258e-06, + "loss": 0.4654, + "step": 66576 + }, + { + "epoch": 0.8322458061451536, + "grad_norm": 3.138517379760742, + "learning_rate": 1.666051409981968e-06, + "loss": 0.7855, + "step": 66578 + }, + { + "epoch": 0.8322708067701693, + "grad_norm": 5.439611911773682, + "learning_rate": 1.6655691257503815e-06, + "loss": 1.4435, + "step": 66580 + }, + { + "epoch": 0.8322958073951848, + "grad_norm": 6.009882926940918, + "learning_rate": 1.6650869049926433e-06, + "loss": 1.4478, + "step": 66582 + }, + { + "epoch": 0.8323208080202005, + "grad_norm": 5.756918430328369, + "learning_rate": 1.6646047477124228e-06, + "loss": 0.6904, + "step": 66584 + }, + { + "epoch": 0.8323458086452161, + "grad_norm": 1.3235949277877808, + "learning_rate": 1.6641226539133959e-06, + "loss": 1.5569, + "step": 66586 + }, + { + "epoch": 0.8323708092702318, + "grad_norm": 3.207864284515381, + "learning_rate": 1.6636406235992298e-06, + "loss": 1.8306, + "step": 66588 + }, + { + "epoch": 0.8323958098952474, + "grad_norm": 2.7529337406158447, + "learning_rate": 1.6631586567736013e-06, + "loss": 1.1076, + "step": 66590 + }, + { + "epoch": 0.832420810520263, + "grad_norm": 2.1814990043640137, + "learning_rate": 1.6626767534401766e-06, + "loss": 0.9618, + "step": 66592 + }, + { + "epoch": 0.8324458111452786, + "grad_norm": 2.4731545448303223, + "learning_rate": 1.662194913602624e-06, + "loss": 0.5336, + "step": 66594 + }, + { + "epoch": 0.8324708117702943, + "grad_norm": 0.04480215162038803, + "learning_rate": 1.6617131372646188e-06, + "loss": 0.1358, + "step": 66596 + }, + { + "epoch": 0.8324958123953099, + "grad_norm": 3.7671806812286377, + "learning_rate": 1.6612314244298245e-06, + "loss": 0.8788, + "step": 66598 + }, + { + "epoch": 0.8325208130203255, + "grad_norm": 3.720158338546753, + "learning_rate": 1.6607497751019153e-06, + "loss": 1.251, + "step": 66600 + }, + { + "epoch": 0.8325458136453411, + "grad_norm": 1.1441242694854736, + "learning_rate": 1.6602681892845562e-06, + "loss": 1.1099, + "step": 66602 + }, + { + "epoch": 0.8325708142703567, + "grad_norm": 2.9510481357574463, + "learning_rate": 1.6597866669814123e-06, + "loss": 0.1873, + "step": 66604 + }, + { + "epoch": 0.8325958148953724, + "grad_norm": 4.3643364906311035, + "learning_rate": 1.659305208196157e-06, + "loss": 0.8636, + "step": 66606 + }, + { + "epoch": 0.832620815520388, + "grad_norm": 0.005498261656612158, + "learning_rate": 1.6588238129324508e-06, + "loss": 0.4868, + "step": 66608 + }, + { + "epoch": 0.8326458161454037, + "grad_norm": 4.222882270812988, + "learning_rate": 1.6583424811939664e-06, + "loss": 1.0688, + "step": 66610 + }, + { + "epoch": 0.8326708167704192, + "grad_norm": 4.005533218383789, + "learning_rate": 1.657861212984362e-06, + "loss": 0.3453, + "step": 66612 + }, + { + "epoch": 0.8326958173954349, + "grad_norm": 0.0003830904606729746, + "learning_rate": 1.65738000830731e-06, + "loss": 0.0, + "step": 66614 + }, + { + "epoch": 0.8327208180204505, + "grad_norm": 3.2891881465911865, + "learning_rate": 1.656898867166472e-06, + "loss": 0.8537, + "step": 66616 + }, + { + "epoch": 0.8327458186454662, + "grad_norm": 1.9246678352355957, + "learning_rate": 1.656417789565511e-06, + "loss": 0.3604, + "step": 66618 + }, + { + "epoch": 0.8327708192704818, + "grad_norm": 3.6629226207733154, + "learning_rate": 1.6559367755080936e-06, + "loss": 0.6852, + "step": 66620 + }, + { + "epoch": 0.8327958198954973, + "grad_norm": 1.9802720546722412, + "learning_rate": 1.655455824997879e-06, + "loss": 0.2598, + "step": 66622 + }, + { + "epoch": 0.832820820520513, + "grad_norm": 3.568674087524414, + "learning_rate": 1.654974938038536e-06, + "loss": 0.5799, + "step": 66624 + }, + { + "epoch": 0.8328458211455286, + "grad_norm": 2.888326644897461, + "learning_rate": 1.6544941146337213e-06, + "loss": 1.3401, + "step": 66626 + }, + { + "epoch": 0.8328708217705443, + "grad_norm": 8.768976211547852, + "learning_rate": 1.6540133547871019e-06, + "loss": 1.2949, + "step": 66628 + }, + { + "epoch": 0.8328958223955599, + "grad_norm": 2.659632682800293, + "learning_rate": 1.6535326585023359e-06, + "loss": 1.2273, + "step": 66630 + }, + { + "epoch": 0.8329208230205755, + "grad_norm": 3.8579132556915283, + "learning_rate": 1.6530520257830828e-06, + "loss": 1.0664, + "step": 66632 + }, + { + "epoch": 0.8329458236455911, + "grad_norm": 9.500768661499023, + "learning_rate": 1.6525714566330075e-06, + "loss": 1.4813, + "step": 66634 + }, + { + "epoch": 0.8329708242706068, + "grad_norm": 2.6014928817749023, + "learning_rate": 1.6520909510557658e-06, + "loss": 0.4009, + "step": 66636 + }, + { + "epoch": 0.8329958248956224, + "grad_norm": 3.696168899536133, + "learning_rate": 1.6516105090550206e-06, + "loss": 1.7411, + "step": 66638 + }, + { + "epoch": 0.833020825520638, + "grad_norm": 2.8909542560577393, + "learning_rate": 1.6511301306344264e-06, + "loss": 0.6856, + "step": 66640 + }, + { + "epoch": 0.8330458261456536, + "grad_norm": 3.3405940532684326, + "learning_rate": 1.650649815797648e-06, + "loss": 0.3839, + "step": 66642 + }, + { + "epoch": 0.8330708267706692, + "grad_norm": 1.0710127353668213, + "learning_rate": 1.6501695645483384e-06, + "loss": 0.98, + "step": 66644 + }, + { + "epoch": 0.8330958273956849, + "grad_norm": 3.5583910942077637, + "learning_rate": 1.6496893768901557e-06, + "loss": 0.672, + "step": 66646 + }, + { + "epoch": 0.8331208280207005, + "grad_norm": 10.376797676086426, + "learning_rate": 1.649209252826759e-06, + "loss": 1.4043, + "step": 66648 + }, + { + "epoch": 0.8331458286457162, + "grad_norm": 3.1766905784606934, + "learning_rate": 1.648729192361802e-06, + "loss": 0.5947, + "step": 66650 + }, + { + "epoch": 0.8331708292707317, + "grad_norm": 0.00045314489398151636, + "learning_rate": 1.6482491954989454e-06, + "loss": 0.6508, + "step": 66652 + }, + { + "epoch": 0.8331958298957474, + "grad_norm": 3.4628396034240723, + "learning_rate": 1.6477692622418385e-06, + "loss": 1.351, + "step": 66654 + }, + { + "epoch": 0.833220830520763, + "grad_norm": 0.0031028292141854763, + "learning_rate": 1.6472893925941425e-06, + "loss": 0.4203, + "step": 66656 + }, + { + "epoch": 0.8332458311457787, + "grad_norm": 2.765608549118042, + "learning_rate": 1.6468095865595103e-06, + "loss": 0.4228, + "step": 66658 + }, + { + "epoch": 0.8332708317707943, + "grad_norm": 3.5945043563842773, + "learning_rate": 1.646329844141591e-06, + "loss": 1.7409, + "step": 66660 + }, + { + "epoch": 0.8332958323958098, + "grad_norm": 2.463015556335449, + "learning_rate": 1.645850165344046e-06, + "loss": 1.3425, + "step": 66662 + }, + { + "epoch": 0.8333208330208255, + "grad_norm": 3.888042449951172, + "learning_rate": 1.645370550170522e-06, + "loss": 1.508, + "step": 66664 + }, + { + "epoch": 0.8333458336458411, + "grad_norm": 6.939014911651611, + "learning_rate": 1.6448909986246763e-06, + "loss": 2.6632, + "step": 66666 + }, + { + "epoch": 0.8333708342708568, + "grad_norm": 3.7998313903808594, + "learning_rate": 1.64441151071016e-06, + "loss": 1.6962, + "step": 66668 + }, + { + "epoch": 0.8333958348958724, + "grad_norm": 2.8994345664978027, + "learning_rate": 1.6439320864306207e-06, + "loss": 0.754, + "step": 66670 + }, + { + "epoch": 0.833420835520888, + "grad_norm": 2.667767286300659, + "learning_rate": 1.6434527257897159e-06, + "loss": 0.7139, + "step": 66672 + }, + { + "epoch": 0.8334458361459036, + "grad_norm": 4.612042427062988, + "learning_rate": 1.6429734287910914e-06, + "loss": 0.8908, + "step": 66674 + }, + { + "epoch": 0.8334708367709193, + "grad_norm": 3.648618698120117, + "learning_rate": 1.6424941954384011e-06, + "loss": 1.3859, + "step": 66676 + }, + { + "epoch": 0.8334958373959349, + "grad_norm": 3.2283527851104736, + "learning_rate": 1.6420150257352908e-06, + "loss": 1.3563, + "step": 66678 + }, + { + "epoch": 0.8335208380209506, + "grad_norm": 15.673036575317383, + "learning_rate": 1.6415359196854153e-06, + "loss": 0.8229, + "step": 66680 + }, + { + "epoch": 0.8335458386459661, + "grad_norm": 4.3514251708984375, + "learning_rate": 1.6410568772924195e-06, + "loss": 0.4147, + "step": 66682 + }, + { + "epoch": 0.8335708392709817, + "grad_norm": 4.623247146606445, + "learning_rate": 1.6405778985599507e-06, + "loss": 1.0252, + "step": 66684 + }, + { + "epoch": 0.8335958398959974, + "grad_norm": 5.9600300788879395, + "learning_rate": 1.6400989834916603e-06, + "loss": 0.4946, + "step": 66686 + }, + { + "epoch": 0.833620840521013, + "grad_norm": 2.18300724029541, + "learning_rate": 1.639620132091192e-06, + "loss": 0.9956, + "step": 66688 + }, + { + "epoch": 0.8336458411460287, + "grad_norm": 1.2552011013031006, + "learning_rate": 1.6391413443621974e-06, + "loss": 0.6175, + "step": 66690 + }, + { + "epoch": 0.8336708417710442, + "grad_norm": 3.5593669414520264, + "learning_rate": 1.6386626203083166e-06, + "loss": 1.4061, + "step": 66692 + }, + { + "epoch": 0.8336958423960599, + "grad_norm": 3.758526086807251, + "learning_rate": 1.6381839599332005e-06, + "loss": 0.3301, + "step": 66694 + }, + { + "epoch": 0.8337208430210755, + "grad_norm": 3.7309584617614746, + "learning_rate": 1.637705363240495e-06, + "loss": 1.1335, + "step": 66696 + }, + { + "epoch": 0.8337458436460912, + "grad_norm": 2.5972700119018555, + "learning_rate": 1.637226830233839e-06, + "loss": 0.8134, + "step": 66698 + }, + { + "epoch": 0.8337708442711068, + "grad_norm": 4.354442596435547, + "learning_rate": 1.6367483609168833e-06, + "loss": 0.5396, + "step": 66700 + }, + { + "epoch": 0.8337958448961224, + "grad_norm": 0.0004387922817841172, + "learning_rate": 1.636269955293268e-06, + "loss": 0.5293, + "step": 66702 + }, + { + "epoch": 0.833820845521138, + "grad_norm": 2.4388909339904785, + "learning_rate": 1.6357916133666395e-06, + "loss": 1.3261, + "step": 66704 + }, + { + "epoch": 0.8338458461461536, + "grad_norm": 6.0786452293396, + "learning_rate": 1.635313335140637e-06, + "loss": 0.4237, + "step": 66706 + }, + { + "epoch": 0.8338708467711693, + "grad_norm": 3.758768320083618, + "learning_rate": 1.6348351206189085e-06, + "loss": 0.3774, + "step": 66708 + }, + { + "epoch": 0.8338958473961849, + "grad_norm": 1.194922924041748, + "learning_rate": 1.6343569698050922e-06, + "loss": 0.4045, + "step": 66710 + }, + { + "epoch": 0.8339208480212005, + "grad_norm": 0.00043410633224993944, + "learning_rate": 1.6338788827028273e-06, + "loss": 0.752, + "step": 66712 + }, + { + "epoch": 0.8339458486462161, + "grad_norm": 3.9522407054901123, + "learning_rate": 1.6334008593157603e-06, + "loss": 1.2776, + "step": 66714 + }, + { + "epoch": 0.8339708492712318, + "grad_norm": 2.3871591091156006, + "learning_rate": 1.6329228996475266e-06, + "loss": 0.5977, + "step": 66716 + }, + { + "epoch": 0.8339958498962474, + "grad_norm": 0.6817660331726074, + "learning_rate": 1.6324450037017703e-06, + "loss": 0.2992, + "step": 66718 + }, + { + "epoch": 0.8340208505212631, + "grad_norm": 2.8648648262023926, + "learning_rate": 1.6319671714821329e-06, + "loss": 0.1827, + "step": 66720 + }, + { + "epoch": 0.8340458511462786, + "grad_norm": 3.4323575496673584, + "learning_rate": 1.6314894029922456e-06, + "loss": 0.5075, + "step": 66722 + }, + { + "epoch": 0.8340708517712943, + "grad_norm": 9.871174812316895, + "learning_rate": 1.631011698235755e-06, + "loss": 1.7411, + "step": 66724 + }, + { + "epoch": 0.8340958523963099, + "grad_norm": 4.073610782623291, + "learning_rate": 1.630534057216292e-06, + "loss": 1.01, + "step": 66726 + }, + { + "epoch": 0.8341208530213255, + "grad_norm": 3.223764419555664, + "learning_rate": 1.6300564799375018e-06, + "loss": 0.1543, + "step": 66728 + }, + { + "epoch": 0.8341458536463412, + "grad_norm": 3.202059745788574, + "learning_rate": 1.6295789664030149e-06, + "loss": 0.7792, + "step": 66730 + }, + { + "epoch": 0.8341708542713567, + "grad_norm": 3.231379985809326, + "learning_rate": 1.6291015166164704e-06, + "loss": 0.9429, + "step": 66732 + }, + { + "epoch": 0.8341958548963724, + "grad_norm": 4.666204929351807, + "learning_rate": 1.628624130581511e-06, + "loss": 1.5995, + "step": 66734 + }, + { + "epoch": 0.834220855521388, + "grad_norm": 0.00025166364503093064, + "learning_rate": 1.6281468083017605e-06, + "loss": 0.6248, + "step": 66736 + }, + { + "epoch": 0.8342458561464037, + "grad_norm": 2.5368847846984863, + "learning_rate": 1.6276695497808626e-06, + "loss": 0.8464, + "step": 66738 + }, + { + "epoch": 0.8342708567714193, + "grad_norm": 6.820088863372803, + "learning_rate": 1.6271923550224466e-06, + "loss": 1.5225, + "step": 66740 + }, + { + "epoch": 0.8342958573964349, + "grad_norm": 2.366981267929077, + "learning_rate": 1.626715224030153e-06, + "loss": 0.9677, + "step": 66742 + }, + { + "epoch": 0.8343208580214505, + "grad_norm": 1.9169493913650513, + "learning_rate": 1.6262381568076092e-06, + "loss": 0.2075, + "step": 66744 + }, + { + "epoch": 0.8343458586464662, + "grad_norm": 2.8345513343811035, + "learning_rate": 1.625761153358454e-06, + "loss": 0.6039, + "step": 66746 + }, + { + "epoch": 0.8343708592714818, + "grad_norm": 0.00031748710898682475, + "learning_rate": 1.625284213686318e-06, + "loss": 0.4962, + "step": 66748 + }, + { + "epoch": 0.8343958598964974, + "grad_norm": 3.3477225303649902, + "learning_rate": 1.6248073377948304e-06, + "loss": 1.2726, + "step": 66750 + }, + { + "epoch": 0.834420860521513, + "grad_norm": 2.774575710296631, + "learning_rate": 1.6243305256876285e-06, + "loss": 0.9174, + "step": 66752 + }, + { + "epoch": 0.8344458611465286, + "grad_norm": 3.578700304031372, + "learning_rate": 1.6238537773683371e-06, + "loss": 0.5978, + "step": 66754 + }, + { + "epoch": 0.8344708617715443, + "grad_norm": 1.6747393608093262, + "learning_rate": 1.6233770928405922e-06, + "loss": 0.3535, + "step": 66756 + }, + { + "epoch": 0.8344958623965599, + "grad_norm": 4.602316856384277, + "learning_rate": 1.6229004721080255e-06, + "loss": 1.2094, + "step": 66758 + }, + { + "epoch": 0.8345208630215756, + "grad_norm": 3.497891902923584, + "learning_rate": 1.6224239151742626e-06, + "loss": 1.7619, + "step": 66760 + }, + { + "epoch": 0.8345458636465911, + "grad_norm": 2.7710583209991455, + "learning_rate": 1.6219474220429354e-06, + "loss": 0.6496, + "step": 66762 + }, + { + "epoch": 0.8345708642716068, + "grad_norm": 2.635671615600586, + "learning_rate": 1.6214709927176697e-06, + "loss": 1.9872, + "step": 66764 + }, + { + "epoch": 0.8345958648966224, + "grad_norm": 3.8640248775482178, + "learning_rate": 1.6209946272020982e-06, + "loss": 1.9107, + "step": 66766 + }, + { + "epoch": 0.8346208655216381, + "grad_norm": 0.0075921411626040936, + "learning_rate": 1.6205183254998446e-06, + "loss": 0.4069, + "step": 66768 + }, + { + "epoch": 0.8346458661466537, + "grad_norm": 5.37053108215332, + "learning_rate": 1.6200420876145383e-06, + "loss": 0.1014, + "step": 66770 + }, + { + "epoch": 0.8346708667716692, + "grad_norm": 3.3853673934936523, + "learning_rate": 1.619565913549811e-06, + "loss": 0.875, + "step": 66772 + }, + { + "epoch": 0.8346958673966849, + "grad_norm": 2.5821657180786133, + "learning_rate": 1.6190898033092806e-06, + "loss": 0.482, + "step": 66774 + }, + { + "epoch": 0.8347208680217005, + "grad_norm": 3.8847270011901855, + "learning_rate": 1.6186137568965786e-06, + "loss": 1.0146, + "step": 66776 + }, + { + "epoch": 0.8347458686467162, + "grad_norm": 0.8358567357063293, + "learning_rate": 1.6181377743153271e-06, + "loss": 0.0916, + "step": 66778 + }, + { + "epoch": 0.8347708692717318, + "grad_norm": 4.227666854858398, + "learning_rate": 1.6176618555691526e-06, + "loss": 0.8368, + "step": 66780 + }, + { + "epoch": 0.8347958698967474, + "grad_norm": 3.279772996902466, + "learning_rate": 1.6171860006616824e-06, + "loss": 0.5433, + "step": 66782 + }, + { + "epoch": 0.834820870521763, + "grad_norm": 8.714902877807617, + "learning_rate": 1.6167102095965359e-06, + "loss": 1.242, + "step": 66784 + }, + { + "epoch": 0.8348458711467787, + "grad_norm": 3.225151300430298, + "learning_rate": 1.6162344823773435e-06, + "loss": 1.0617, + "step": 66786 + }, + { + "epoch": 0.8348708717717943, + "grad_norm": 3.601526975631714, + "learning_rate": 1.61575881900772e-06, + "loss": 1.6808, + "step": 66788 + }, + { + "epoch": 0.83489587239681, + "grad_norm": 0.0946604460477829, + "learning_rate": 1.6152832194912927e-06, + "loss": 0.002, + "step": 66790 + }, + { + "epoch": 0.8349208730218255, + "grad_norm": 1.4958159923553467, + "learning_rate": 1.6148076838316818e-06, + "loss": 0.6125, + "step": 66792 + }, + { + "epoch": 0.8349458736468411, + "grad_norm": 2.77523136138916, + "learning_rate": 1.6143322120325088e-06, + "loss": 0.3108, + "step": 66794 + }, + { + "epoch": 0.8349708742718568, + "grad_norm": 4.1046037673950195, + "learning_rate": 1.613856804097398e-06, + "loss": 1.1307, + "step": 66796 + }, + { + "epoch": 0.8349958748968724, + "grad_norm": 2.383591890335083, + "learning_rate": 1.613381460029968e-06, + "loss": 1.2585, + "step": 66798 + }, + { + "epoch": 0.8350208755218881, + "grad_norm": 2.7740073204040527, + "learning_rate": 1.6129061798338397e-06, + "loss": 1.028, + "step": 66800 + }, + { + "epoch": 0.8350458761469036, + "grad_norm": 1.8769378662109375, + "learning_rate": 1.612430963512629e-06, + "loss": 1.1608, + "step": 66802 + }, + { + "epoch": 0.8350708767719193, + "grad_norm": 0.46083706617355347, + "learning_rate": 1.6119558110699574e-06, + "loss": 0.5142, + "step": 66804 + }, + { + "epoch": 0.8350958773969349, + "grad_norm": 1.1939952373504639, + "learning_rate": 1.6114807225094464e-06, + "loss": 0.0519, + "step": 66806 + }, + { + "epoch": 0.8351208780219506, + "grad_norm": 3.085388660430908, + "learning_rate": 1.611005697834711e-06, + "loss": 0.53, + "step": 66808 + }, + { + "epoch": 0.8351458786469662, + "grad_norm": 2.2648370265960693, + "learning_rate": 1.6105307370493707e-06, + "loss": 0.6728, + "step": 66810 + }, + { + "epoch": 0.8351708792719817, + "grad_norm": 6.884477138519287, + "learning_rate": 1.6100558401570432e-06, + "loss": 0.7959, + "step": 66812 + }, + { + "epoch": 0.8351958798969974, + "grad_norm": 2.2333922386169434, + "learning_rate": 1.6095810071613437e-06, + "loss": 0.6659, + "step": 66814 + }, + { + "epoch": 0.835220880522013, + "grad_norm": 0.5832917094230652, + "learning_rate": 1.6091062380658862e-06, + "loss": 1.0189, + "step": 66816 + }, + { + "epoch": 0.8352458811470287, + "grad_norm": 3.3614726066589355, + "learning_rate": 1.6086315328742885e-06, + "loss": 1.2844, + "step": 66818 + }, + { + "epoch": 0.8352708817720443, + "grad_norm": 0.0009713254403322935, + "learning_rate": 1.608156891590169e-06, + "loss": 0.824, + "step": 66820 + }, + { + "epoch": 0.8352958823970599, + "grad_norm": 2.705998182296753, + "learning_rate": 1.6076823142171383e-06, + "loss": 0.9017, + "step": 66822 + }, + { + "epoch": 0.8353208830220755, + "grad_norm": 2.5888946056365967, + "learning_rate": 1.6072078007588165e-06, + "loss": 1.2934, + "step": 66824 + }, + { + "epoch": 0.8353458836470912, + "grad_norm": 5.994946479797363, + "learning_rate": 1.6067333512188076e-06, + "loss": 0.9105, + "step": 66826 + }, + { + "epoch": 0.8353708842721068, + "grad_norm": 2.193716526031494, + "learning_rate": 1.6062589656007343e-06, + "loss": 0.8541, + "step": 66828 + }, + { + "epoch": 0.8353958848971225, + "grad_norm": 5.225773334503174, + "learning_rate": 1.6057846439082025e-06, + "loss": 2.2225, + "step": 66830 + }, + { + "epoch": 0.835420885522138, + "grad_norm": 5.144654273986816, + "learning_rate": 1.6053103861448283e-06, + "loss": 0.9412, + "step": 66832 + }, + { + "epoch": 0.8354458861471536, + "grad_norm": 2.572819232940674, + "learning_rate": 1.6048361923142254e-06, + "loss": 0.4519, + "step": 66834 + }, + { + "epoch": 0.8354708867721693, + "grad_norm": 2.7855117321014404, + "learning_rate": 1.6043620624200007e-06, + "loss": 1.602, + "step": 66836 + }, + { + "epoch": 0.8354958873971849, + "grad_norm": 10.267952919006348, + "learning_rate": 1.6038879964657716e-06, + "loss": 1.9772, + "step": 66838 + }, + { + "epoch": 0.8355208880222006, + "grad_norm": 0.00038719738950021565, + "learning_rate": 1.6034139944551397e-06, + "loss": 0.261, + "step": 66840 + }, + { + "epoch": 0.8355458886472161, + "grad_norm": 3.4170117378234863, + "learning_rate": 1.6029400563917197e-06, + "loss": 0.5175, + "step": 66842 + }, + { + "epoch": 0.8355708892722318, + "grad_norm": 0.003004938829690218, + "learning_rate": 1.6024661822791232e-06, + "loss": 1.5223, + "step": 66844 + }, + { + "epoch": 0.8355958898972474, + "grad_norm": 3.4897854328155518, + "learning_rate": 1.6019923721209541e-06, + "loss": 1.2996, + "step": 66846 + }, + { + "epoch": 0.8356208905222631, + "grad_norm": 3.7258858680725098, + "learning_rate": 1.601518625920825e-06, + "loss": 0.572, + "step": 66848 + }, + { + "epoch": 0.8356458911472787, + "grad_norm": 2.0200388431549072, + "learning_rate": 1.6010449436823438e-06, + "loss": 0.2316, + "step": 66850 + }, + { + "epoch": 0.8356708917722943, + "grad_norm": 3.517979621887207, + "learning_rate": 1.6005713254091159e-06, + "loss": 0.8768, + "step": 66852 + }, + { + "epoch": 0.8356958923973099, + "grad_norm": 0.0002847591240424663, + "learning_rate": 1.600097771104746e-06, + "loss": 0.0, + "step": 66854 + }, + { + "epoch": 0.8357208930223256, + "grad_norm": 4.37774658203125, + "learning_rate": 1.5996242807728446e-06, + "loss": 1.5498, + "step": 66856 + }, + { + "epoch": 0.8357458936473412, + "grad_norm": 2.593687057495117, + "learning_rate": 1.5991508544170188e-06, + "loss": 1.4086, + "step": 66858 + }, + { + "epoch": 0.8357708942723568, + "grad_norm": 0.0036501078866422176, + "learning_rate": 1.59867749204087e-06, + "loss": 0.1076, + "step": 66860 + }, + { + "epoch": 0.8357958948973724, + "grad_norm": 5.040279865264893, + "learning_rate": 1.5982041936480076e-06, + "loss": 1.2675, + "step": 66862 + }, + { + "epoch": 0.835820895522388, + "grad_norm": 2.7814559936523438, + "learning_rate": 1.5977309592420332e-06, + "loss": 0.5633, + "step": 66864 + }, + { + "epoch": 0.8358458961474037, + "grad_norm": 7.052460193634033, + "learning_rate": 1.5972577888265505e-06, + "loss": 1.6988, + "step": 66866 + }, + { + "epoch": 0.8358708967724193, + "grad_norm": 2.183911085128784, + "learning_rate": 1.5967846824051657e-06, + "loss": 0.6744, + "step": 66868 + }, + { + "epoch": 0.835895897397435, + "grad_norm": 5.121216297149658, + "learning_rate": 1.5963116399814783e-06, + "loss": 2.7169, + "step": 66870 + }, + { + "epoch": 0.8359208980224505, + "grad_norm": 1.7233161926269531, + "learning_rate": 1.595838661559096e-06, + "loss": 0.8529, + "step": 66872 + }, + { + "epoch": 0.8359458986474662, + "grad_norm": 1.7116870880126953, + "learning_rate": 1.5953657471416152e-06, + "loss": 0.3211, + "step": 66874 + }, + { + "epoch": 0.8359708992724818, + "grad_norm": 2.4883365631103516, + "learning_rate": 1.5948928967326427e-06, + "loss": 1.4468, + "step": 66876 + }, + { + "epoch": 0.8359958998974975, + "grad_norm": 2.106250762939453, + "learning_rate": 1.594420110335777e-06, + "loss": 0.5288, + "step": 66878 + }, + { + "epoch": 0.8360209005225131, + "grad_norm": 2.9889614582061768, + "learning_rate": 1.5939473879546162e-06, + "loss": 0.9622, + "step": 66880 + }, + { + "epoch": 0.8360459011475286, + "grad_norm": 6.319304943084717, + "learning_rate": 1.5934747295927666e-06, + "loss": 2.8051, + "step": 66882 + }, + { + "epoch": 0.8360709017725443, + "grad_norm": 0.7823590040206909, + "learning_rate": 1.5930021352538228e-06, + "loss": 0.04, + "step": 66884 + }, + { + "epoch": 0.8360959023975599, + "grad_norm": 2.2638933658599854, + "learning_rate": 1.5925296049413874e-06, + "loss": 0.8488, + "step": 66886 + }, + { + "epoch": 0.8361209030225756, + "grad_norm": 0.00033403810812160373, + "learning_rate": 1.5920571386590555e-06, + "loss": 0.8279, + "step": 66888 + }, + { + "epoch": 0.8361459036475912, + "grad_norm": 4.201358318328857, + "learning_rate": 1.591584736410432e-06, + "loss": 0.8587, + "step": 66890 + }, + { + "epoch": 0.8361709042726068, + "grad_norm": 1.8758313655853271, + "learning_rate": 1.591112398199106e-06, + "loss": 1.379, + "step": 66892 + }, + { + "epoch": 0.8361959048976224, + "grad_norm": 2.455937147140503, + "learning_rate": 1.5906401240286783e-06, + "loss": 1.207, + "step": 66894 + }, + { + "epoch": 0.8362209055226381, + "grad_norm": 0.6829497218132019, + "learning_rate": 1.5901679139027482e-06, + "loss": 0.9416, + "step": 66896 + }, + { + "epoch": 0.8362459061476537, + "grad_norm": 0.0003024300967808813, + "learning_rate": 1.589695767824908e-06, + "loss": 0.1699, + "step": 66898 + }, + { + "epoch": 0.8362709067726694, + "grad_norm": 3.590268850326538, + "learning_rate": 1.5892236857987586e-06, + "loss": 0.6651, + "step": 66900 + }, + { + "epoch": 0.8362959073976849, + "grad_norm": 4.633072853088379, + "learning_rate": 1.5887516678278892e-06, + "loss": 1.323, + "step": 66902 + }, + { + "epoch": 0.8363209080227005, + "grad_norm": 3.847432851791382, + "learning_rate": 1.5882797139159e-06, + "loss": 1.1522, + "step": 66904 + }, + { + "epoch": 0.8363459086477162, + "grad_norm": 1.2265803813934326, + "learning_rate": 1.5878078240663831e-06, + "loss": 0.4743, + "step": 66906 + }, + { + "epoch": 0.8363709092727318, + "grad_norm": 2.721123456954956, + "learning_rate": 1.5873359982829295e-06, + "loss": 1.298, + "step": 66908 + }, + { + "epoch": 0.8363959098977475, + "grad_norm": 2.630981922149658, + "learning_rate": 1.5868642365691378e-06, + "loss": 1.5219, + "step": 66910 + }, + { + "epoch": 0.836420910522763, + "grad_norm": 2.286111831665039, + "learning_rate": 1.586392538928596e-06, + "loss": 1.2386, + "step": 66912 + }, + { + "epoch": 0.8364459111477787, + "grad_norm": 0.6517578959465027, + "learning_rate": 1.5859209053649016e-06, + "loss": 0.6318, + "step": 66914 + }, + { + "epoch": 0.8364709117727943, + "grad_norm": 3.2357099056243896, + "learning_rate": 1.5854493358816436e-06, + "loss": 0.4225, + "step": 66916 + }, + { + "epoch": 0.83649591239781, + "grad_norm": 4.24198579788208, + "learning_rate": 1.5849778304824104e-06, + "loss": 1.0154, + "step": 66918 + }, + { + "epoch": 0.8365209130228256, + "grad_norm": 3.9358890056610107, + "learning_rate": 1.584506389170799e-06, + "loss": 1.2958, + "step": 66920 + }, + { + "epoch": 0.8365459136478411, + "grad_norm": 2.0876617431640625, + "learning_rate": 1.5840350119503945e-06, + "loss": 0.6523, + "step": 66922 + }, + { + "epoch": 0.8365709142728568, + "grad_norm": 3.8119540214538574, + "learning_rate": 1.5835636988247916e-06, + "loss": 1.0157, + "step": 66924 + }, + { + "epoch": 0.8365959148978724, + "grad_norm": 3.215101480484009, + "learning_rate": 1.5830924497975753e-06, + "loss": 0.3232, + "step": 66926 + }, + { + "epoch": 0.8366209155228881, + "grad_norm": 0.5618418455123901, + "learning_rate": 1.5826212648723393e-06, + "loss": 0.4812, + "step": 66928 + }, + { + "epoch": 0.8366459161479037, + "grad_norm": 2.7758357524871826, + "learning_rate": 1.5821501440526688e-06, + "loss": 1.3787, + "step": 66930 + }, + { + "epoch": 0.8366709167729193, + "grad_norm": 2.4928109645843506, + "learning_rate": 1.5816790873421495e-06, + "loss": 0.4686, + "step": 66932 + }, + { + "epoch": 0.8366959173979349, + "grad_norm": 3.997654438018799, + "learning_rate": 1.5812080947443752e-06, + "loss": 1.3407, + "step": 66934 + }, + { + "epoch": 0.8367209180229506, + "grad_norm": 10.979897499084473, + "learning_rate": 1.5807371662629268e-06, + "loss": 0.7746, + "step": 66936 + }, + { + "epoch": 0.8367459186479662, + "grad_norm": 0.0012671774020418525, + "learning_rate": 1.5802663019013953e-06, + "loss": 0.501, + "step": 66938 + }, + { + "epoch": 0.8367709192729819, + "grad_norm": 1.6012498140335083, + "learning_rate": 1.5797955016633638e-06, + "loss": 0.9178, + "step": 66940 + }, + { + "epoch": 0.8367959198979974, + "grad_norm": 0.0002560504653956741, + "learning_rate": 1.5793247655524203e-06, + "loss": 0.3824, + "step": 66942 + }, + { + "epoch": 0.836820920523013, + "grad_norm": 0.0255692508071661, + "learning_rate": 1.5788540935721496e-06, + "loss": 0.0004, + "step": 66944 + }, + { + "epoch": 0.8368459211480287, + "grad_norm": 0.1507224142551422, + "learning_rate": 1.5783834857261316e-06, + "loss": 0.0026, + "step": 66946 + }, + { + "epoch": 0.8368709217730443, + "grad_norm": 2.717966079711914, + "learning_rate": 1.5779129420179572e-06, + "loss": 0.7533, + "step": 66948 + }, + { + "epoch": 0.83689592239806, + "grad_norm": 2.671341896057129, + "learning_rate": 1.5774424624512054e-06, + "loss": 0.474, + "step": 66950 + }, + { + "epoch": 0.8369209230230755, + "grad_norm": 3.1782591342926025, + "learning_rate": 1.5769720470294613e-06, + "loss": 0.776, + "step": 66952 + }, + { + "epoch": 0.8369459236480912, + "grad_norm": 3.2624387741088867, + "learning_rate": 1.5765016957563062e-06, + "loss": 0.5799, + "step": 66954 + }, + { + "epoch": 0.8369709242731068, + "grad_norm": 3.084254741668701, + "learning_rate": 1.5760314086353245e-06, + "loss": 0.2149, + "step": 66956 + }, + { + "epoch": 0.8369959248981225, + "grad_norm": 1.5895286798477173, + "learning_rate": 1.575561185670096e-06, + "loss": 0.2462, + "step": 66958 + }, + { + "epoch": 0.8370209255231381, + "grad_norm": 0.0032604627776890993, + "learning_rate": 1.5750910268642006e-06, + "loss": 1.0691, + "step": 66960 + }, + { + "epoch": 0.8370459261481537, + "grad_norm": 2.46109938621521, + "learning_rate": 1.5746209322212224e-06, + "loss": 0.7233, + "step": 66962 + }, + { + "epoch": 0.8370709267731693, + "grad_norm": 2.853036403656006, + "learning_rate": 1.5741509017447376e-06, + "loss": 0.7981, + "step": 66964 + }, + { + "epoch": 0.837095927398185, + "grad_norm": 2.973500967025757, + "learning_rate": 1.5736809354383299e-06, + "loss": 1.1116, + "step": 66966 + }, + { + "epoch": 0.8371209280232006, + "grad_norm": 1.2069979906082153, + "learning_rate": 1.5732110333055772e-06, + "loss": 0.9797, + "step": 66968 + }, + { + "epoch": 0.8371459286482162, + "grad_norm": 0.002100208308547735, + "learning_rate": 1.5727411953500548e-06, + "loss": 0.7865, + "step": 66970 + }, + { + "epoch": 0.8371709292732318, + "grad_norm": 2.958569288253784, + "learning_rate": 1.5722714215753464e-06, + "loss": 1.6843, + "step": 66972 + }, + { + "epoch": 0.8371959298982474, + "grad_norm": 2.4543139934539795, + "learning_rate": 1.5718017119850249e-06, + "loss": 1.1173, + "step": 66974 + }, + { + "epoch": 0.8372209305232631, + "grad_norm": 2.259570360183716, + "learning_rate": 1.5713320665826726e-06, + "loss": 0.3986, + "step": 66976 + }, + { + "epoch": 0.8372459311482787, + "grad_norm": 4.179986476898193, + "learning_rate": 1.5708624853718602e-06, + "loss": 1.9155, + "step": 66978 + }, + { + "epoch": 0.8372709317732944, + "grad_norm": 3.174776792526245, + "learning_rate": 1.5703929683561692e-06, + "loss": 1.5747, + "step": 66980 + }, + { + "epoch": 0.8372959323983099, + "grad_norm": 5.984955787658691, + "learning_rate": 1.5699235155391734e-06, + "loss": 0.6467, + "step": 66982 + }, + { + "epoch": 0.8373209330233256, + "grad_norm": 2.6140904426574707, + "learning_rate": 1.5694541269244456e-06, + "loss": 0.4646, + "step": 66984 + }, + { + "epoch": 0.8373459336483412, + "grad_norm": 3.0701138973236084, + "learning_rate": 1.5689848025155664e-06, + "loss": 0.83, + "step": 66986 + }, + { + "epoch": 0.8373709342733568, + "grad_norm": 0.45518141984939575, + "learning_rate": 1.5685155423161035e-06, + "loss": 0.7858, + "step": 66988 + }, + { + "epoch": 0.8373959348983725, + "grad_norm": 4.411336421966553, + "learning_rate": 1.5680463463296358e-06, + "loss": 1.2368, + "step": 66990 + }, + { + "epoch": 0.837420935523388, + "grad_norm": 4.245410919189453, + "learning_rate": 1.5675772145597335e-06, + "loss": 1.5108, + "step": 66992 + }, + { + "epoch": 0.8374459361484037, + "grad_norm": 1.9493496417999268, + "learning_rate": 1.5671081470099726e-06, + "loss": 0.502, + "step": 66994 + }, + { + "epoch": 0.8374709367734193, + "grad_norm": 4.0964274406433105, + "learning_rate": 1.5666391436839234e-06, + "loss": 1.7941, + "step": 66996 + }, + { + "epoch": 0.837495937398435, + "grad_norm": 2.0237162113189697, + "learning_rate": 1.5661702045851546e-06, + "loss": 0.7758, + "step": 66998 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 3.860139846801758, + "learning_rate": 1.5657013297172442e-06, + "loss": 0.6988, + "step": 67000 + }, + { + "epoch": 0.8375459386484662, + "grad_norm": 6.474483013153076, + "learning_rate": 1.565232519083758e-06, + "loss": 0.8409, + "step": 67002 + }, + { + "epoch": 0.8375709392734818, + "grad_norm": 0.008562936447560787, + "learning_rate": 1.5647637726882692e-06, + "loss": 0.505, + "step": 67004 + }, + { + "epoch": 0.8375959398984975, + "grad_norm": 3.6452245712280273, + "learning_rate": 1.5642950905343456e-06, + "loss": 1.1325, + "step": 67006 + }, + { + "epoch": 0.8376209405235131, + "grad_norm": 7.117458343505859, + "learning_rate": 1.5638264726255593e-06, + "loss": 1.6216, + "step": 67008 + }, + { + "epoch": 0.8376459411485287, + "grad_norm": 5.602930068969727, + "learning_rate": 1.563357918965478e-06, + "loss": 1.6216, + "step": 67010 + }, + { + "epoch": 0.8376709417735443, + "grad_norm": 4.188891887664795, + "learning_rate": 1.5628894295576668e-06, + "loss": 0.2861, + "step": 67012 + }, + { + "epoch": 0.8376959423985599, + "grad_norm": 4.355608940124512, + "learning_rate": 1.5624210044056987e-06, + "loss": 0.6377, + "step": 67014 + }, + { + "epoch": 0.8377209430235756, + "grad_norm": 3.902221441268921, + "learning_rate": 1.561952643513137e-06, + "loss": 1.4042, + "step": 67016 + }, + { + "epoch": 0.8377459436485912, + "grad_norm": 4.01355504989624, + "learning_rate": 1.5614843468835538e-06, + "loss": 1.8388, + "step": 67018 + }, + { + "epoch": 0.8377709442736069, + "grad_norm": 3.9387624263763428, + "learning_rate": 1.5610161145205117e-06, + "loss": 0.7982, + "step": 67020 + }, + { + "epoch": 0.8377959448986224, + "grad_norm": 0.8214191198348999, + "learning_rate": 1.5605479464275752e-06, + "loss": 1.1571, + "step": 67022 + }, + { + "epoch": 0.8378209455236381, + "grad_norm": 4.114807605743408, + "learning_rate": 1.5600798426083152e-06, + "loss": 1.0041, + "step": 67024 + }, + { + "epoch": 0.8378459461486537, + "grad_norm": 0.3086559772491455, + "learning_rate": 1.5596118030662899e-06, + "loss": 0.0037, + "step": 67026 + }, + { + "epoch": 0.8378709467736694, + "grad_norm": 39.89582824707031, + "learning_rate": 1.5591438278050708e-06, + "loss": 0.6478, + "step": 67028 + }, + { + "epoch": 0.837895947398685, + "grad_norm": 3.018763303756714, + "learning_rate": 1.5586759168282162e-06, + "loss": 1.6058, + "step": 67030 + }, + { + "epoch": 0.8379209480237005, + "grad_norm": 0.000323141721310094, + "learning_rate": 1.5582080701392954e-06, + "loss": 0.9854, + "step": 67032 + }, + { + "epoch": 0.8379459486487162, + "grad_norm": 3.0699644088745117, + "learning_rate": 1.5577402877418669e-06, + "loss": 0.2949, + "step": 67034 + }, + { + "epoch": 0.8379709492737318, + "grad_norm": 0.0006752285407856107, + "learning_rate": 1.557272569639492e-06, + "loss": 0.4656, + "step": 67036 + }, + { + "epoch": 0.8379959498987475, + "grad_norm": 0.00030817260267212987, + "learning_rate": 1.556804915835739e-06, + "loss": 0.805, + "step": 67038 + }, + { + "epoch": 0.8380209505237631, + "grad_norm": 0.0002181392628699541, + "learning_rate": 1.5563373263341618e-06, + "loss": 0.0599, + "step": 67040 + }, + { + "epoch": 0.8380459511487787, + "grad_norm": 3.480299711227417, + "learning_rate": 1.5558698011383288e-06, + "loss": 1.034, + "step": 67042 + }, + { + "epoch": 0.8380709517737943, + "grad_norm": 0.7554264664649963, + "learning_rate": 1.5554023402517947e-06, + "loss": 0.3985, + "step": 67044 + }, + { + "epoch": 0.83809595239881, + "grad_norm": 1.6038087606430054, + "learning_rate": 1.5549349436781248e-06, + "loss": 1.5962, + "step": 67046 + }, + { + "epoch": 0.8381209530238256, + "grad_norm": 4.4800639152526855, + "learning_rate": 1.5544676114208757e-06, + "loss": 0.6956, + "step": 67048 + }, + { + "epoch": 0.8381459536488413, + "grad_norm": 0.00029482709942385554, + "learning_rate": 1.554000343483605e-06, + "loss": 0.8487, + "step": 67050 + }, + { + "epoch": 0.8381709542738568, + "grad_norm": 0.00031442538602277637, + "learning_rate": 1.553533139869875e-06, + "loss": 0.8068, + "step": 67052 + }, + { + "epoch": 0.8381959548988724, + "grad_norm": 1.0481222867965698, + "learning_rate": 1.553066000583241e-06, + "loss": 0.1846, + "step": 67054 + }, + { + "epoch": 0.8382209555238881, + "grad_norm": 3.619633913040161, + "learning_rate": 1.5525989256272632e-06, + "loss": 1.1476, + "step": 67056 + }, + { + "epoch": 0.8382459561489037, + "grad_norm": 3.106950283050537, + "learning_rate": 1.552131915005496e-06, + "loss": 1.5103, + "step": 67058 + }, + { + "epoch": 0.8382709567739194, + "grad_norm": 0.8438042998313904, + "learning_rate": 1.5516649687214992e-06, + "loss": 0.0351, + "step": 67060 + }, + { + "epoch": 0.8382959573989349, + "grad_norm": 1.966569185256958, + "learning_rate": 1.5511980867788268e-06, + "loss": 1.8135, + "step": 67062 + }, + { + "epoch": 0.8383209580239506, + "grad_norm": 2.81624436378479, + "learning_rate": 1.5507312691810328e-06, + "loss": 1.5676, + "step": 67064 + }, + { + "epoch": 0.8383459586489662, + "grad_norm": 4.244131565093994, + "learning_rate": 1.5502645159316775e-06, + "loss": 1.9347, + "step": 67066 + }, + { + "epoch": 0.8383709592739819, + "grad_norm": 7.804323673248291, + "learning_rate": 1.5497978270343106e-06, + "loss": 2.7314, + "step": 67068 + }, + { + "epoch": 0.8383959598989975, + "grad_norm": 3.241847038269043, + "learning_rate": 1.54933120249249e-06, + "loss": 1.245, + "step": 67070 + }, + { + "epoch": 0.838420960524013, + "grad_norm": 7.347081184387207, + "learning_rate": 1.5488646423097686e-06, + "loss": 1.7373, + "step": 67072 + }, + { + "epoch": 0.8384459611490287, + "grad_norm": 2.566725015640259, + "learning_rate": 1.548398146489697e-06, + "loss": 0.2034, + "step": 67074 + }, + { + "epoch": 0.8384709617740443, + "grad_norm": 5.245454788208008, + "learning_rate": 1.5479317150358308e-06, + "loss": 0.5269, + "step": 67076 + }, + { + "epoch": 0.83849596239906, + "grad_norm": 6.251821994781494, + "learning_rate": 1.5474653479517198e-06, + "loss": 1.3839, + "step": 67078 + }, + { + "epoch": 0.8385209630240756, + "grad_norm": 2.089210271835327, + "learning_rate": 1.54699904524092e-06, + "loss": 1.0488, + "step": 67080 + }, + { + "epoch": 0.8385459636490912, + "grad_norm": 2.3905205726623535, + "learning_rate": 1.5465328069069785e-06, + "loss": 0.8202, + "step": 67082 + }, + { + "epoch": 0.8385709642741068, + "grad_norm": 2.771002769470215, + "learning_rate": 1.546066632953449e-06, + "loss": 1.4489, + "step": 67084 + }, + { + "epoch": 0.8385959648991225, + "grad_norm": 1.0617595911026, + "learning_rate": 1.545600523383881e-06, + "loss": 0.7378, + "step": 67086 + }, + { + "epoch": 0.8386209655241381, + "grad_norm": 0.000535026250872761, + "learning_rate": 1.5451344782018208e-06, + "loss": 0.0001, + "step": 67088 + }, + { + "epoch": 0.8386459661491538, + "grad_norm": 0.013790917582809925, + "learning_rate": 1.5446684974108239e-06, + "loss": 0.7631, + "step": 67090 + }, + { + "epoch": 0.8386709667741693, + "grad_norm": 3.3736913204193115, + "learning_rate": 1.5442025810144334e-06, + "loss": 0.9754, + "step": 67092 + }, + { + "epoch": 0.838695967399185, + "grad_norm": 5.466731071472168, + "learning_rate": 1.5437367290162032e-06, + "loss": 0.6899, + "step": 67094 + }, + { + "epoch": 0.8387209680242006, + "grad_norm": 0.5052135586738586, + "learning_rate": 1.5432709414196757e-06, + "loss": 1.4288, + "step": 67096 + }, + { + "epoch": 0.8387459686492162, + "grad_norm": 1.4332751035690308, + "learning_rate": 1.5428052182284026e-06, + "loss": 0.4912, + "step": 67098 + }, + { + "epoch": 0.8387709692742319, + "grad_norm": 3.1250035762786865, + "learning_rate": 1.542339559445929e-06, + "loss": 0.426, + "step": 67100 + }, + { + "epoch": 0.8387959698992474, + "grad_norm": 2.591127395629883, + "learning_rate": 1.5418739650757986e-06, + "loss": 0.5682, + "step": 67102 + }, + { + "epoch": 0.8388209705242631, + "grad_norm": 4.420110702514648, + "learning_rate": 1.541408435121563e-06, + "loss": 1.3191, + "step": 67104 + }, + { + "epoch": 0.8388459711492787, + "grad_norm": 3.7885947227478027, + "learning_rate": 1.5409429695867629e-06, + "loss": 2.1427, + "step": 67106 + }, + { + "epoch": 0.8388709717742944, + "grad_norm": 4.027336120605469, + "learning_rate": 1.5404775684749463e-06, + "loss": 0.6205, + "step": 67108 + }, + { + "epoch": 0.83889597239931, + "grad_norm": 2.2013020515441895, + "learning_rate": 1.5400122317896537e-06, + "loss": 1.2805, + "step": 67110 + }, + { + "epoch": 0.8389209730243256, + "grad_norm": 0.0011291785631328821, + "learning_rate": 1.5395469595344348e-06, + "loss": 0.2988, + "step": 67112 + }, + { + "epoch": 0.8389459736493412, + "grad_norm": 2.1573848724365234, + "learning_rate": 1.5390817517128299e-06, + "loss": 0.3759, + "step": 67114 + }, + { + "epoch": 0.8389709742743569, + "grad_norm": 1.2317816019058228, + "learning_rate": 1.5386166083283782e-06, + "loss": 0.1731, + "step": 67116 + }, + { + "epoch": 0.8389959748993725, + "grad_norm": 5.592869281768799, + "learning_rate": 1.5381515293846294e-06, + "loss": 1.4446, + "step": 67118 + }, + { + "epoch": 0.8390209755243881, + "grad_norm": 4.957502365112305, + "learning_rate": 1.5376865148851195e-06, + "loss": 1.1131, + "step": 67120 + }, + { + "epoch": 0.8390459761494037, + "grad_norm": 6.530697345733643, + "learning_rate": 1.537221564833392e-06, + "loss": 2.0488, + "step": 67122 + }, + { + "epoch": 0.8390709767744193, + "grad_norm": 2.1476926803588867, + "learning_rate": 1.5367566792329935e-06, + "loss": 0.1578, + "step": 67124 + }, + { + "epoch": 0.839095977399435, + "grad_norm": 6.398064136505127, + "learning_rate": 1.5362918580874542e-06, + "loss": 1.8486, + "step": 67126 + }, + { + "epoch": 0.8391209780244506, + "grad_norm": 1.4764397144317627, + "learning_rate": 1.5358271014003223e-06, + "loss": 0.8345, + "step": 67128 + }, + { + "epoch": 0.8391459786494663, + "grad_norm": 4.9328155517578125, + "learning_rate": 1.5353624091751307e-06, + "loss": 0.2876, + "step": 67130 + }, + { + "epoch": 0.8391709792744818, + "grad_norm": 2.128051519393921, + "learning_rate": 1.5348977814154254e-06, + "loss": 1.3811, + "step": 67132 + }, + { + "epoch": 0.8391959798994975, + "grad_norm": 1.2974278926849365, + "learning_rate": 1.534433218124739e-06, + "loss": 0.1839, + "step": 67134 + }, + { + "epoch": 0.8392209805245131, + "grad_norm": 10.469305038452148, + "learning_rate": 1.533968719306611e-06, + "loss": 1.4077, + "step": 67136 + }, + { + "epoch": 0.8392459811495288, + "grad_norm": 5.319235324859619, + "learning_rate": 1.5335042849645864e-06, + "loss": 0.7163, + "step": 67138 + }, + { + "epoch": 0.8392709817745444, + "grad_norm": 2.565540313720703, + "learning_rate": 1.5330399151021901e-06, + "loss": 0.9973, + "step": 67140 + }, + { + "epoch": 0.8392959823995599, + "grad_norm": 3.9569084644317627, + "learning_rate": 1.532575609722967e-06, + "loss": 0.7924, + "step": 67142 + }, + { + "epoch": 0.8393209830245756, + "grad_norm": 1.3486312627792358, + "learning_rate": 1.532111368830448e-06, + "loss": 0.6356, + "step": 67144 + }, + { + "epoch": 0.8393459836495912, + "grad_norm": 0.6843755841255188, + "learning_rate": 1.5316471924281738e-06, + "loss": 0.4582, + "step": 67146 + }, + { + "epoch": 0.8393709842746069, + "grad_norm": 3.7218003273010254, + "learning_rate": 1.531183080519676e-06, + "loss": 1.4864, + "step": 67148 + }, + { + "epoch": 0.8393959848996225, + "grad_norm": 2.7904868125915527, + "learning_rate": 1.5307190331084887e-06, + "loss": 1.8885, + "step": 67150 + }, + { + "epoch": 0.8394209855246381, + "grad_norm": 7.090725421905518, + "learning_rate": 1.5302550501981528e-06, + "loss": 1.524, + "step": 67152 + }, + { + "epoch": 0.8394459861496537, + "grad_norm": 4.539011478424072, + "learning_rate": 1.529791131792192e-06, + "loss": 1.3389, + "step": 67154 + }, + { + "epoch": 0.8394709867746694, + "grad_norm": 0.00031989329727366567, + "learning_rate": 1.5293272778941471e-06, + "loss": 0.0011, + "step": 67156 + }, + { + "epoch": 0.839495987399685, + "grad_norm": 3.6123664379119873, + "learning_rate": 1.5288634885075449e-06, + "loss": 1.163, + "step": 67158 + }, + { + "epoch": 0.8395209880247007, + "grad_norm": 4.6987223625183105, + "learning_rate": 1.5283997636359205e-06, + "loss": 1.2345, + "step": 67160 + }, + { + "epoch": 0.8395459886497162, + "grad_norm": 0.0009032521047629416, + "learning_rate": 1.5279361032828078e-06, + "loss": 0.0401, + "step": 67162 + }, + { + "epoch": 0.8395709892747318, + "grad_norm": 5.6201252937316895, + "learning_rate": 1.5274725074517348e-06, + "loss": 1.3747, + "step": 67164 + }, + { + "epoch": 0.8395959898997475, + "grad_norm": 0.4424808919429779, + "learning_rate": 1.5270089761462336e-06, + "loss": 0.6629, + "step": 67166 + }, + { + "epoch": 0.8396209905247631, + "grad_norm": 5.361371040344238, + "learning_rate": 1.5265455093698323e-06, + "loss": 0.3218, + "step": 67168 + }, + { + "epoch": 0.8396459911497788, + "grad_norm": 0.0010011602425947785, + "learning_rate": 1.5260821071260644e-06, + "loss": 1.3237, + "step": 67170 + }, + { + "epoch": 0.8396709917747943, + "grad_norm": 3.3852031230926514, + "learning_rate": 1.5256187694184532e-06, + "loss": 0.8269, + "step": 67172 + }, + { + "epoch": 0.83969599239981, + "grad_norm": 4.107690811157227, + "learning_rate": 1.5251554962505322e-06, + "loss": 1.0571, + "step": 67174 + }, + { + "epoch": 0.8397209930248256, + "grad_norm": 3.001014471054077, + "learning_rate": 1.5246922876258308e-06, + "loss": 1.9782, + "step": 67176 + }, + { + "epoch": 0.8397459936498413, + "grad_norm": 1.8519748449325562, + "learning_rate": 1.524229143547874e-06, + "loss": 0.8447, + "step": 67178 + }, + { + "epoch": 0.8397709942748569, + "grad_norm": 3.6627440452575684, + "learning_rate": 1.5237660640201902e-06, + "loss": 1.3392, + "step": 67180 + }, + { + "epoch": 0.8397959948998724, + "grad_norm": 1.953748345375061, + "learning_rate": 1.5233030490463018e-06, + "loss": 0.1591, + "step": 67182 + }, + { + "epoch": 0.8398209955248881, + "grad_norm": 2.8667330741882324, + "learning_rate": 1.5228400986297398e-06, + "loss": 0.11, + "step": 67184 + }, + { + "epoch": 0.8398459961499037, + "grad_norm": 3.080105781555176, + "learning_rate": 1.5223772127740312e-06, + "loss": 0.8522, + "step": 67186 + }, + { + "epoch": 0.8398709967749194, + "grad_norm": 4.119684219360352, + "learning_rate": 1.5219143914826961e-06, + "loss": 1.2544, + "step": 67188 + }, + { + "epoch": 0.839895997399935, + "grad_norm": 4.732580184936523, + "learning_rate": 1.5214516347592666e-06, + "loss": 1.1206, + "step": 67190 + }, + { + "epoch": 0.8399209980249506, + "grad_norm": 5.828629970550537, + "learning_rate": 1.5209889426072577e-06, + "loss": 1.29, + "step": 67192 + }, + { + "epoch": 0.8399459986499662, + "grad_norm": 4.832705974578857, + "learning_rate": 1.5205263150302007e-06, + "loss": 2.0959, + "step": 67194 + }, + { + "epoch": 0.8399709992749819, + "grad_norm": 3.1269965171813965, + "learning_rate": 1.520063752031614e-06, + "loss": 1.1436, + "step": 67196 + }, + { + "epoch": 0.8399959998999975, + "grad_norm": 2.8895626068115234, + "learning_rate": 1.5196012536150217e-06, + "loss": 0.6605, + "step": 67198 + }, + { + "epoch": 0.8400210005250132, + "grad_norm": 2.4749062061309814, + "learning_rate": 1.5191388197839507e-06, + "loss": 0.5584, + "step": 67200 + }, + { + "epoch": 0.8400460011500287, + "grad_norm": 2.390291452407837, + "learning_rate": 1.518676450541916e-06, + "loss": 0.9364, + "step": 67202 + }, + { + "epoch": 0.8400710017750443, + "grad_norm": 3.9929709434509277, + "learning_rate": 1.5182141458924471e-06, + "loss": 1.2955, + "step": 67204 + }, + { + "epoch": 0.84009600240006, + "grad_norm": 2.167126417160034, + "learning_rate": 1.5177519058390543e-06, + "loss": 0.6822, + "step": 67206 + }, + { + "epoch": 0.8401210030250756, + "grad_norm": 8.982507705688477, + "learning_rate": 1.5172897303852662e-06, + "loss": 1.4326, + "step": 67208 + }, + { + "epoch": 0.8401460036500913, + "grad_norm": 2.566333293914795, + "learning_rate": 1.5168276195345989e-06, + "loss": 0.7154, + "step": 67210 + }, + { + "epoch": 0.8401710042751068, + "grad_norm": 2.8971099853515625, + "learning_rate": 1.5163655732905714e-06, + "loss": 1.52, + "step": 67212 + }, + { + "epoch": 0.8401960049001225, + "grad_norm": 2.1757774353027344, + "learning_rate": 1.5159035916567067e-06, + "loss": 0.1662, + "step": 67214 + }, + { + "epoch": 0.8402210055251381, + "grad_norm": 0.0002927156747318804, + "learning_rate": 1.5154416746365208e-06, + "loss": 0.4295, + "step": 67216 + }, + { + "epoch": 0.8402460061501538, + "grad_norm": 3.3666138648986816, + "learning_rate": 1.514979822233531e-06, + "loss": 0.265, + "step": 67218 + }, + { + "epoch": 0.8402710067751694, + "grad_norm": 2.335035562515259, + "learning_rate": 1.5145180344512522e-06, + "loss": 1.185, + "step": 67220 + }, + { + "epoch": 0.840296007400185, + "grad_norm": 2.477478265762329, + "learning_rate": 1.514056311293205e-06, + "loss": 0.1115, + "step": 67222 + }, + { + "epoch": 0.8403210080252006, + "grad_norm": 2.8128888607025146, + "learning_rate": 1.5135946527629064e-06, + "loss": 1.1326, + "step": 67224 + }, + { + "epoch": 0.8403460086502162, + "grad_norm": 3.074594259262085, + "learning_rate": 1.5131330588638692e-06, + "loss": 0.9974, + "step": 67226 + }, + { + "epoch": 0.8403710092752319, + "grad_norm": 7.502741813659668, + "learning_rate": 1.512671529599612e-06, + "loss": 1.6586, + "step": 67228 + }, + { + "epoch": 0.8403960099002475, + "grad_norm": 5.945122241973877, + "learning_rate": 1.5122100649736493e-06, + "loss": 1.94, + "step": 67230 + }, + { + "epoch": 0.8404210105252631, + "grad_norm": 0.643957793712616, + "learning_rate": 1.511748664989493e-06, + "loss": 0.867, + "step": 67232 + }, + { + "epoch": 0.8404460111502787, + "grad_norm": 0.00097756483592093, + "learning_rate": 1.511287329650657e-06, + "loss": 0.1833, + "step": 67234 + }, + { + "epoch": 0.8404710117752944, + "grad_norm": 3.267958641052246, + "learning_rate": 1.5108260589606572e-06, + "loss": 0.5854, + "step": 67236 + }, + { + "epoch": 0.84049601240031, + "grad_norm": 0.00038289549411274493, + "learning_rate": 1.5103648529230065e-06, + "loss": 0.1072, + "step": 67238 + }, + { + "epoch": 0.8405210130253257, + "grad_norm": 0.09563922137022018, + "learning_rate": 1.5099037115412153e-06, + "loss": 0.0398, + "step": 67240 + }, + { + "epoch": 0.8405460136503412, + "grad_norm": 2.988406181335449, + "learning_rate": 1.5094426348188007e-06, + "loss": 0.6262, + "step": 67242 + }, + { + "epoch": 0.8405710142753569, + "grad_norm": 3.4009664058685303, + "learning_rate": 1.5089816227592658e-06, + "loss": 0.6689, + "step": 67244 + }, + { + "epoch": 0.8405960149003725, + "grad_norm": 0.0003701749083120376, + "learning_rate": 1.5085206753661264e-06, + "loss": 0.0004, + "step": 67246 + }, + { + "epoch": 0.8406210155253881, + "grad_norm": 3.1667826175689697, + "learning_rate": 1.5080597926428953e-06, + "loss": 1.1915, + "step": 67248 + }, + { + "epoch": 0.8406460161504038, + "grad_norm": 4.394358158111572, + "learning_rate": 1.5075989745930775e-06, + "loss": 0.0815, + "step": 67250 + }, + { + "epoch": 0.8406710167754193, + "grad_norm": 4.330540180206299, + "learning_rate": 1.5071382212201868e-06, + "loss": 0.5997, + "step": 67252 + }, + { + "epoch": 0.840696017400435, + "grad_norm": 4.855966091156006, + "learning_rate": 1.5066775325277283e-06, + "loss": 1.7779, + "step": 67254 + }, + { + "epoch": 0.8407210180254506, + "grad_norm": 3.420762538909912, + "learning_rate": 1.5062169085192168e-06, + "loss": 1.3975, + "step": 67256 + }, + { + "epoch": 0.8407460186504663, + "grad_norm": 5.906611442565918, + "learning_rate": 1.505756349198153e-06, + "loss": 1.0501, + "step": 67258 + }, + { + "epoch": 0.8407710192754819, + "grad_norm": 1.8758455514907837, + "learning_rate": 1.5052958545680463e-06, + "loss": 0.8226, + "step": 67260 + }, + { + "epoch": 0.8407960199004975, + "grad_norm": 2.7807953357696533, + "learning_rate": 1.504835424632408e-06, + "loss": 0.5553, + "step": 67262 + }, + { + "epoch": 0.8408210205255131, + "grad_norm": 2.7081007957458496, + "learning_rate": 1.5043750593947392e-06, + "loss": 1.5586, + "step": 67264 + }, + { + "epoch": 0.8408460211505288, + "grad_norm": 1.855921745300293, + "learning_rate": 1.5039147588585512e-06, + "loss": 1.1302, + "step": 67266 + }, + { + "epoch": 0.8408710217755444, + "grad_norm": 3.286207437515259, + "learning_rate": 1.5034545230273466e-06, + "loss": 0.6537, + "step": 67268 + }, + { + "epoch": 0.84089602240056, + "grad_norm": 4.3513078689575195, + "learning_rate": 1.5029943519046286e-06, + "loss": 1.2013, + "step": 67270 + }, + { + "epoch": 0.8409210230255756, + "grad_norm": 1.3527579307556152, + "learning_rate": 1.5025342454939062e-06, + "loss": 0.057, + "step": 67272 + }, + { + "epoch": 0.8409460236505912, + "grad_norm": 3.7474730014801025, + "learning_rate": 1.5020742037986802e-06, + "loss": 1.214, + "step": 67274 + }, + { + "epoch": 0.8409710242756069, + "grad_norm": 3.044879674911499, + "learning_rate": 1.5016142268224566e-06, + "loss": 1.3935, + "step": 67276 + }, + { + "epoch": 0.8409960249006225, + "grad_norm": 2.7232978343963623, + "learning_rate": 1.5011543145687346e-06, + "loss": 0.9005, + "step": 67278 + }, + { + "epoch": 0.8410210255256382, + "grad_norm": 2.337954521179199, + "learning_rate": 1.500694467041023e-06, + "loss": 0.5958, + "step": 67280 + }, + { + "epoch": 0.8410460261506537, + "grad_norm": 4.838448524475098, + "learning_rate": 1.5002346842428194e-06, + "loss": 0.5762, + "step": 67282 + }, + { + "epoch": 0.8410710267756694, + "grad_norm": 1.7616853713989258, + "learning_rate": 1.499774966177624e-06, + "loss": 0.6786, + "step": 67284 + }, + { + "epoch": 0.841096027400685, + "grad_norm": 5.597928524017334, + "learning_rate": 1.4993153128489435e-06, + "loss": 0.9086, + "step": 67286 + }, + { + "epoch": 0.8411210280257007, + "grad_norm": 0.06709087640047073, + "learning_rate": 1.498855724260273e-06, + "loss": 0.4567, + "step": 67288 + }, + { + "epoch": 0.8411460286507163, + "grad_norm": 1.113852620124817, + "learning_rate": 1.4983962004151166e-06, + "loss": 0.5017, + "step": 67290 + }, + { + "epoch": 0.8411710292757318, + "grad_norm": 0.0002931229828391224, + "learning_rate": 1.497936741316971e-06, + "loss": 0.7903, + "step": 67292 + }, + { + "epoch": 0.8411960299007475, + "grad_norm": 2.5088703632354736, + "learning_rate": 1.4974773469693416e-06, + "loss": 1.614, + "step": 67294 + }, + { + "epoch": 0.8412210305257631, + "grad_norm": 1.596478819847107, + "learning_rate": 1.4970180173757176e-06, + "loss": 0.4809, + "step": 67296 + }, + { + "epoch": 0.8412460311507788, + "grad_norm": 6.546162128448486, + "learning_rate": 1.496558752539602e-06, + "loss": 1.2853, + "step": 67298 + }, + { + "epoch": 0.8412710317757944, + "grad_norm": 3.8946704864501953, + "learning_rate": 1.4960995524644951e-06, + "loss": 0.9435, + "step": 67300 + }, + { + "epoch": 0.84129603240081, + "grad_norm": 1.7621397972106934, + "learning_rate": 1.4956404171538886e-06, + "loss": 1.4842, + "step": 67302 + }, + { + "epoch": 0.8413210330258256, + "grad_norm": 2.9193482398986816, + "learning_rate": 1.4951813466112842e-06, + "loss": 1.7832, + "step": 67304 + }, + { + "epoch": 0.8413460336508413, + "grad_norm": 2.091360330581665, + "learning_rate": 1.4947223408401735e-06, + "loss": 0.757, + "step": 67306 + }, + { + "epoch": 0.8413710342758569, + "grad_norm": 4.038087844848633, + "learning_rate": 1.494263399844058e-06, + "loss": 0.7059, + "step": 67308 + }, + { + "epoch": 0.8413960349008726, + "grad_norm": 4.405951023101807, + "learning_rate": 1.4938045236264287e-06, + "loss": 1.4523, + "step": 67310 + }, + { + "epoch": 0.8414210355258881, + "grad_norm": 1.7053091526031494, + "learning_rate": 1.4933457121907791e-06, + "loss": 0.9681, + "step": 67312 + }, + { + "epoch": 0.8414460361509037, + "grad_norm": 0.5844992399215698, + "learning_rate": 1.4928869655406086e-06, + "loss": 0.4186, + "step": 67314 + }, + { + "epoch": 0.8414710367759194, + "grad_norm": 1.6145983934402466, + "learning_rate": 1.4924282836794046e-06, + "loss": 0.2426, + "step": 67316 + }, + { + "epoch": 0.841496037400935, + "grad_norm": 0.03638787195086479, + "learning_rate": 1.4919696666106664e-06, + "loss": 1.23, + "step": 67318 + }, + { + "epoch": 0.8415210380259507, + "grad_norm": 9.17260456085205, + "learning_rate": 1.4915111143378847e-06, + "loss": 0.6678, + "step": 67320 + }, + { + "epoch": 0.8415460386509662, + "grad_norm": 3.8846089839935303, + "learning_rate": 1.4910526268645486e-06, + "loss": 1.1995, + "step": 67322 + }, + { + "epoch": 0.8415710392759819, + "grad_norm": 2.83697772026062, + "learning_rate": 1.4905942041941545e-06, + "loss": 1.5764, + "step": 67324 + }, + { + "epoch": 0.8415960399009975, + "grad_norm": 0.00048337396583519876, + "learning_rate": 1.4901358463301884e-06, + "loss": 0.5636, + "step": 67326 + }, + { + "epoch": 0.8416210405260132, + "grad_norm": 4.575723171234131, + "learning_rate": 1.4896775532761477e-06, + "loss": 0.3474, + "step": 67328 + }, + { + "epoch": 0.8416460411510288, + "grad_norm": 1.2909785509109497, + "learning_rate": 1.489219325035517e-06, + "loss": 0.9638, + "step": 67330 + }, + { + "epoch": 0.8416710417760443, + "grad_norm": 3.7893974781036377, + "learning_rate": 1.4887611616117904e-06, + "loss": 1.6438, + "step": 67332 + }, + { + "epoch": 0.84169604240106, + "grad_norm": 0.864132285118103, + "learning_rate": 1.4883030630084561e-06, + "loss": 0.5736, + "step": 67334 + }, + { + "epoch": 0.8417210430260756, + "grad_norm": 3.1097967624664307, + "learning_rate": 1.487845029228998e-06, + "loss": 0.1873, + "step": 67336 + }, + { + "epoch": 0.8417460436510913, + "grad_norm": 7.439566612243652, + "learning_rate": 1.4873870602769114e-06, + "loss": 1.3444, + "step": 67338 + }, + { + "epoch": 0.8417710442761069, + "grad_norm": 7.26464319229126, + "learning_rate": 1.4869291561556798e-06, + "loss": 1.2708, + "step": 67340 + }, + { + "epoch": 0.8417960449011225, + "grad_norm": 1.3935266733169556, + "learning_rate": 1.4864713168687928e-06, + "loss": 0.2877, + "step": 67342 + }, + { + "epoch": 0.8418210455261381, + "grad_norm": 1.497833490371704, + "learning_rate": 1.4860135424197354e-06, + "loss": 0.0936, + "step": 67344 + }, + { + "epoch": 0.8418460461511538, + "grad_norm": 3.1891844272613525, + "learning_rate": 1.4855558328119957e-06, + "loss": 1.2295, + "step": 67346 + }, + { + "epoch": 0.8418710467761694, + "grad_norm": 2.3675596714019775, + "learning_rate": 1.4850981880490601e-06, + "loss": 0.3303, + "step": 67348 + }, + { + "epoch": 0.8418960474011851, + "grad_norm": 4.013378620147705, + "learning_rate": 1.48464060813441e-06, + "loss": 0.4348, + "step": 67350 + }, + { + "epoch": 0.8419210480262006, + "grad_norm": 3.8725736141204834, + "learning_rate": 1.484183093071534e-06, + "loss": 1.4874, + "step": 67352 + }, + { + "epoch": 0.8419460486512163, + "grad_norm": 3.0671849250793457, + "learning_rate": 1.4837256428639147e-06, + "loss": 0.6799, + "step": 67354 + }, + { + "epoch": 0.8419710492762319, + "grad_norm": 2.149874687194824, + "learning_rate": 1.483268257515037e-06, + "loss": 0.9439, + "step": 67356 + }, + { + "epoch": 0.8419960499012475, + "grad_norm": 0.1446719765663147, + "learning_rate": 1.4828109370283828e-06, + "loss": 0.0452, + "step": 67358 + }, + { + "epoch": 0.8420210505262632, + "grad_norm": 1.0507365465164185, + "learning_rate": 1.4823536814074379e-06, + "loss": 0.7745, + "step": 67360 + }, + { + "epoch": 0.8420460511512787, + "grad_norm": 8.517119407653809, + "learning_rate": 1.4818964906556833e-06, + "loss": 0.6714, + "step": 67362 + }, + { + "epoch": 0.8420710517762944, + "grad_norm": 3.4334921836853027, + "learning_rate": 1.481439364776598e-06, + "loss": 1.1097, + "step": 67364 + }, + { + "epoch": 0.84209605240131, + "grad_norm": 0.0021376877557486296, + "learning_rate": 1.4809823037736682e-06, + "loss": 0.1701, + "step": 67366 + }, + { + "epoch": 0.8421210530263257, + "grad_norm": 3.876417875289917, + "learning_rate": 1.48052530765037e-06, + "loss": 1.4743, + "step": 67368 + }, + { + "epoch": 0.8421460536513413, + "grad_norm": 4.869822025299072, + "learning_rate": 1.4800683764101898e-06, + "loss": 0.9656, + "step": 67370 + }, + { + "epoch": 0.8421710542763569, + "grad_norm": 0.00970365945249796, + "learning_rate": 1.4796115100566022e-06, + "loss": 0.0002, + "step": 67372 + }, + { + "epoch": 0.8421960549013725, + "grad_norm": 2.957737684249878, + "learning_rate": 1.479154708593087e-06, + "loss": 1.0852, + "step": 67374 + }, + { + "epoch": 0.8422210555263882, + "grad_norm": 3.22965145111084, + "learning_rate": 1.4786979720231265e-06, + "loss": 1.1783, + "step": 67376 + }, + { + "epoch": 0.8422460561514038, + "grad_norm": 5.202032089233398, + "learning_rate": 1.478241300350195e-06, + "loss": 1.1267, + "step": 67378 + }, + { + "epoch": 0.8422710567764194, + "grad_norm": 7.0454421043396, + "learning_rate": 1.4777846935777761e-06, + "loss": 1.0222, + "step": 67380 + }, + { + "epoch": 0.842296057401435, + "grad_norm": 2.875913143157959, + "learning_rate": 1.4773281517093397e-06, + "loss": 0.3626, + "step": 67382 + }, + { + "epoch": 0.8423210580264506, + "grad_norm": 3.815350294113159, + "learning_rate": 1.4768716747483703e-06, + "loss": 1.9281, + "step": 67384 + }, + { + "epoch": 0.8423460586514663, + "grad_norm": 4.748416900634766, + "learning_rate": 1.4764152626983408e-06, + "loss": 0.2485, + "step": 67386 + }, + { + "epoch": 0.8423710592764819, + "grad_norm": 4.501800060272217, + "learning_rate": 1.4759589155627251e-06, + "loss": 1.2516, + "step": 67388 + }, + { + "epoch": 0.8423960599014976, + "grad_norm": 1.968138575553894, + "learning_rate": 1.4755026333450029e-06, + "loss": 0.8778, + "step": 67390 + }, + { + "epoch": 0.8424210605265131, + "grad_norm": 1.887291431427002, + "learning_rate": 1.4750464160486456e-06, + "loss": 0.7188, + "step": 67392 + }, + { + "epoch": 0.8424460611515288, + "grad_norm": 4.936456680297852, + "learning_rate": 1.4745902636771304e-06, + "loss": 1.0807, + "step": 67394 + }, + { + "epoch": 0.8424710617765444, + "grad_norm": 4.243724822998047, + "learning_rate": 1.474134176233929e-06, + "loss": 0.9113, + "step": 67396 + }, + { + "epoch": 0.84249606240156, + "grad_norm": 2.8478972911834717, + "learning_rate": 1.4736781537225176e-06, + "loss": 0.8059, + "step": 67398 + }, + { + "epoch": 0.8425210630265757, + "grad_norm": 3.4369924068450928, + "learning_rate": 1.4732221961463677e-06, + "loss": 1.0074, + "step": 67400 + }, + { + "epoch": 0.8425460636515912, + "grad_norm": 2.9875237941741943, + "learning_rate": 1.472766303508949e-06, + "loss": 0.6244, + "step": 67402 + }, + { + "epoch": 0.8425710642766069, + "grad_norm": 6.996487617492676, + "learning_rate": 1.4723104758137397e-06, + "loss": 0.8239, + "step": 67404 + }, + { + "epoch": 0.8425960649016225, + "grad_norm": 3.1900510787963867, + "learning_rate": 1.4718547130642047e-06, + "loss": 0.9518, + "step": 67406 + }, + { + "epoch": 0.8426210655266382, + "grad_norm": 15.292494773864746, + "learning_rate": 1.4713990152638213e-06, + "loss": 0.4629, + "step": 67408 + }, + { + "epoch": 0.8426460661516538, + "grad_norm": 0.3728581964969635, + "learning_rate": 1.4709433824160535e-06, + "loss": 0.0747, + "step": 67410 + }, + { + "epoch": 0.8426710667766694, + "grad_norm": 0.7011399269104004, + "learning_rate": 1.4704878145243784e-06, + "loss": 1.0965, + "step": 67412 + }, + { + "epoch": 0.842696067401685, + "grad_norm": 7.4210076332092285, + "learning_rate": 1.470032311592261e-06, + "loss": 1.2306, + "step": 67414 + }, + { + "epoch": 0.8427210680267007, + "grad_norm": 0.003784866537898779, + "learning_rate": 1.4695768736231686e-06, + "loss": 0.0001, + "step": 67416 + }, + { + "epoch": 0.8427460686517163, + "grad_norm": 0.3569278419017792, + "learning_rate": 1.469121500620576e-06, + "loss": 0.3361, + "step": 67418 + }, + { + "epoch": 0.842771069276732, + "grad_norm": 2.947829008102417, + "learning_rate": 1.4686661925879442e-06, + "loss": 1.1778, + "step": 67420 + }, + { + "epoch": 0.8427960699017475, + "grad_norm": 3.9277288913726807, + "learning_rate": 1.4682109495287478e-06, + "loss": 1.5753, + "step": 67422 + }, + { + "epoch": 0.8428210705267631, + "grad_norm": 3.9945592880249023, + "learning_rate": 1.4677557714464475e-06, + "loss": 1.0844, + "step": 67424 + }, + { + "epoch": 0.8428460711517788, + "grad_norm": 3.8571269512176514, + "learning_rate": 1.467300658344515e-06, + "loss": 1.0062, + "step": 67426 + }, + { + "epoch": 0.8428710717767944, + "grad_norm": 5.700949668884277, + "learning_rate": 1.466845610226413e-06, + "loss": 0.7683, + "step": 67428 + }, + { + "epoch": 0.8428960724018101, + "grad_norm": 3.9311776161193848, + "learning_rate": 1.4663906270956063e-06, + "loss": 0.2018, + "step": 67430 + }, + { + "epoch": 0.8429210730268256, + "grad_norm": 0.2872699201107025, + "learning_rate": 1.4659357089555647e-06, + "loss": 0.4173, + "step": 67432 + }, + { + "epoch": 0.8429460736518413, + "grad_norm": 2.485370635986328, + "learning_rate": 1.4654808558097477e-06, + "loss": 0.7391, + "step": 67434 + }, + { + "epoch": 0.8429710742768569, + "grad_norm": 2.4655885696411133, + "learning_rate": 1.4650260676616235e-06, + "loss": 1.1923, + "step": 67436 + }, + { + "epoch": 0.8429960749018726, + "grad_norm": 8.045769691467285, + "learning_rate": 1.4645713445146536e-06, + "loss": 1.7993, + "step": 67438 + }, + { + "epoch": 0.8430210755268882, + "grad_norm": 14.367131233215332, + "learning_rate": 1.4641166863723e-06, + "loss": 1.6861, + "step": 67440 + }, + { + "epoch": 0.8430460761519037, + "grad_norm": 4.093576431274414, + "learning_rate": 1.4636620932380286e-06, + "loss": 1.5005, + "step": 67442 + }, + { + "epoch": 0.8430710767769194, + "grad_norm": 4.023831367492676, + "learning_rate": 1.4632075651152967e-06, + "loss": 0.6028, + "step": 67444 + }, + { + "epoch": 0.843096077401935, + "grad_norm": 1.697007656097412, + "learning_rate": 1.4627531020075713e-06, + "loss": 0.1183, + "step": 67446 + }, + { + "epoch": 0.8431210780269507, + "grad_norm": 5.124373912811279, + "learning_rate": 1.4622987039183078e-06, + "loss": 0.459, + "step": 67448 + }, + { + "epoch": 0.8431460786519663, + "grad_norm": 3.8304970264434814, + "learning_rate": 1.4618443708509733e-06, + "loss": 1.7196, + "step": 67450 + }, + { + "epoch": 0.8431710792769819, + "grad_norm": 4.552552223205566, + "learning_rate": 1.4613901028090249e-06, + "loss": 1.0256, + "step": 67452 + }, + { + "epoch": 0.8431960799019975, + "grad_norm": 4.121535301208496, + "learning_rate": 1.46093589979592e-06, + "loss": 0.8817, + "step": 67454 + }, + { + "epoch": 0.8432210805270132, + "grad_norm": 6.228553771972656, + "learning_rate": 1.4604817618151212e-06, + "loss": 2.2643, + "step": 67456 + }, + { + "epoch": 0.8432460811520288, + "grad_norm": 4.538053035736084, + "learning_rate": 1.4600276888700837e-06, + "loss": 2.1443, + "step": 67458 + }, + { + "epoch": 0.8432710817770445, + "grad_norm": 3.998900890350342, + "learning_rate": 1.4595736809642701e-06, + "loss": 0.6439, + "step": 67460 + }, + { + "epoch": 0.84329608240206, + "grad_norm": 2.274874210357666, + "learning_rate": 1.4591197381011323e-06, + "loss": 0.572, + "step": 67462 + }, + { + "epoch": 0.8433210830270756, + "grad_norm": 0.00017480720998719335, + "learning_rate": 1.4586658602841342e-06, + "loss": 0.0001, + "step": 67464 + }, + { + "epoch": 0.8433460836520913, + "grad_norm": 3.606823682785034, + "learning_rate": 1.4582120475167293e-06, + "loss": 1.0404, + "step": 67466 + }, + { + "epoch": 0.8433710842771069, + "grad_norm": 2.880462169647217, + "learning_rate": 1.4577582998023709e-06, + "loss": 0.8915, + "step": 67468 + }, + { + "epoch": 0.8433960849021226, + "grad_norm": 4.426868438720703, + "learning_rate": 1.4573046171445194e-06, + "loss": 1.6389, + "step": 67470 + }, + { + "epoch": 0.8434210855271381, + "grad_norm": 3.792367458343506, + "learning_rate": 1.456850999546625e-06, + "loss": 1.6879, + "step": 67472 + }, + { + "epoch": 0.8434460861521538, + "grad_norm": 5.742709159851074, + "learning_rate": 1.4563974470121489e-06, + "loss": 0.9282, + "step": 67474 + }, + { + "epoch": 0.8434710867771694, + "grad_norm": 0.13566046953201294, + "learning_rate": 1.455943959544538e-06, + "loss": 0.8615, + "step": 67476 + }, + { + "epoch": 0.8434960874021851, + "grad_norm": 3.814497232437134, + "learning_rate": 1.4554905371472528e-06, + "loss": 1.2984, + "step": 67478 + }, + { + "epoch": 0.8435210880272007, + "grad_norm": 2.3993613719940186, + "learning_rate": 1.4550371798237428e-06, + "loss": 0.5596, + "step": 67480 + }, + { + "epoch": 0.8435460886522163, + "grad_norm": 2.718306541442871, + "learning_rate": 1.4545838875774597e-06, + "loss": 1.0125, + "step": 67482 + }, + { + "epoch": 0.8435710892772319, + "grad_norm": 4.93004846572876, + "learning_rate": 1.4541306604118588e-06, + "loss": 1.5641, + "step": 67484 + }, + { + "epoch": 0.8435960899022475, + "grad_norm": 4.481496810913086, + "learning_rate": 1.4536774983303893e-06, + "loss": 0.9163, + "step": 67486 + }, + { + "epoch": 0.8436210905272632, + "grad_norm": 1.47828209400177, + "learning_rate": 1.4532244013365048e-06, + "loss": 0.3947, + "step": 67488 + }, + { + "epoch": 0.8436460911522788, + "grad_norm": 2.2487125396728516, + "learning_rate": 1.4527713694336543e-06, + "loss": 0.6151, + "step": 67490 + }, + { + "epoch": 0.8436710917772944, + "grad_norm": 4.138599872589111, + "learning_rate": 1.4523184026252868e-06, + "loss": 1.6845, + "step": 67492 + }, + { + "epoch": 0.84369609240231, + "grad_norm": 2.345864772796631, + "learning_rate": 1.4518655009148553e-06, + "loss": 0.0957, + "step": 67494 + }, + { + "epoch": 0.8437210930273257, + "grad_norm": 1.557193398475647, + "learning_rate": 1.4514126643058047e-06, + "loss": 0.9838, + "step": 67496 + }, + { + "epoch": 0.8437460936523413, + "grad_norm": 2.594038248062134, + "learning_rate": 1.45095989280159e-06, + "loss": 0.9609, + "step": 67498 + }, + { + "epoch": 0.843771094277357, + "grad_norm": 0.5880458950996399, + "learning_rate": 1.4505071864056531e-06, + "loss": 0.7276, + "step": 67500 + }, + { + "epoch": 0.8437960949023725, + "grad_norm": 6.385830879211426, + "learning_rate": 1.4500545451214432e-06, + "loss": 2.0185, + "step": 67502 + }, + { + "epoch": 0.8438210955273882, + "grad_norm": 2.420539140701294, + "learning_rate": 1.4496019689524144e-06, + "loss": 1.5428, + "step": 67504 + }, + { + "epoch": 0.8438460961524038, + "grad_norm": 2.8365161418914795, + "learning_rate": 1.4491494579020039e-06, + "loss": 0.8201, + "step": 67506 + }, + { + "epoch": 0.8438710967774194, + "grad_norm": 2.22245192527771, + "learning_rate": 1.4486970119736632e-06, + "loss": 0.5211, + "step": 67508 + }, + { + "epoch": 0.8438960974024351, + "grad_norm": 2.4840855598449707, + "learning_rate": 1.4482446311708342e-06, + "loss": 0.9032, + "step": 67510 + }, + { + "epoch": 0.8439210980274506, + "grad_norm": 4.723783493041992, + "learning_rate": 1.4477923154969687e-06, + "loss": 1.3536, + "step": 67512 + }, + { + "epoch": 0.8439460986524663, + "grad_norm": 3.2139735221862793, + "learning_rate": 1.4473400649555035e-06, + "loss": 0.8219, + "step": 67514 + }, + { + "epoch": 0.8439710992774819, + "grad_norm": 2.1626689434051514, + "learning_rate": 1.4468878795498898e-06, + "loss": 1.1161, + "step": 67516 + }, + { + "epoch": 0.8439960999024976, + "grad_norm": 4.948878288269043, + "learning_rate": 1.4464357592835687e-06, + "loss": 1.004, + "step": 67518 + }, + { + "epoch": 0.8440211005275132, + "grad_norm": 0.000687026244122535, + "learning_rate": 1.4459837041599812e-06, + "loss": 0.8662, + "step": 67520 + }, + { + "epoch": 0.8440461011525288, + "grad_norm": 3.633849620819092, + "learning_rate": 1.4455317141825742e-06, + "loss": 0.9291, + "step": 67522 + }, + { + "epoch": 0.8440711017775444, + "grad_norm": 4.57000207901001, + "learning_rate": 1.4450797893547853e-06, + "loss": 1.7001, + "step": 67524 + }, + { + "epoch": 0.8440961024025601, + "grad_norm": 9.800390243530273, + "learning_rate": 1.4446279296800615e-06, + "loss": 0.0676, + "step": 67526 + }, + { + "epoch": 0.8441211030275757, + "grad_norm": 2.6953561305999756, + "learning_rate": 1.44417613516184e-06, + "loss": 0.8015, + "step": 67528 + }, + { + "epoch": 0.8441461036525914, + "grad_norm": 5.091212272644043, + "learning_rate": 1.4437244058035638e-06, + "loss": 1.9623, + "step": 67530 + }, + { + "epoch": 0.8441711042776069, + "grad_norm": 1.8911919593811035, + "learning_rate": 1.4432727416086744e-06, + "loss": 0.7266, + "step": 67532 + }, + { + "epoch": 0.8441961049026225, + "grad_norm": 2.069885730743408, + "learning_rate": 1.442821142580606e-06, + "loss": 0.697, + "step": 67534 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 0.17512483894824982, + "learning_rate": 1.4423696087228046e-06, + "loss": 0.0036, + "step": 67536 + }, + { + "epoch": 0.8442461061526538, + "grad_norm": 2.8852131366729736, + "learning_rate": 1.441918140038704e-06, + "loss": 0.6613, + "step": 67538 + }, + { + "epoch": 0.8442711067776695, + "grad_norm": 0.4706827700138092, + "learning_rate": 1.4414667365317448e-06, + "loss": 0.0084, + "step": 67540 + }, + { + "epoch": 0.844296107402685, + "grad_norm": 2.0898635387420654, + "learning_rate": 1.4410153982053686e-06, + "loss": 1.5308, + "step": 67542 + }, + { + "epoch": 0.8443211080277007, + "grad_norm": 3.0343873500823975, + "learning_rate": 1.4405641250630042e-06, + "loss": 0.2637, + "step": 67544 + }, + { + "epoch": 0.8443461086527163, + "grad_norm": 5.567007541656494, + "learning_rate": 1.440112917108095e-06, + "loss": 1.1411, + "step": 67546 + }, + { + "epoch": 0.844371109277732, + "grad_norm": 2.6824235916137695, + "learning_rate": 1.4396617743440743e-06, + "loss": 0.5016, + "step": 67548 + }, + { + "epoch": 0.8443961099027476, + "grad_norm": 3.457460403442383, + "learning_rate": 1.4392106967743801e-06, + "loss": 0.868, + "step": 67550 + }, + { + "epoch": 0.8444211105277631, + "grad_norm": 2.700775623321533, + "learning_rate": 1.4387596844024454e-06, + "loss": 1.237, + "step": 67552 + }, + { + "epoch": 0.8444461111527788, + "grad_norm": 5.045226097106934, + "learning_rate": 1.4383087372317051e-06, + "loss": 1.8838, + "step": 67554 + }, + { + "epoch": 0.8444711117777944, + "grad_norm": 3.507978677749634, + "learning_rate": 1.4378578552655998e-06, + "loss": 1.2444, + "step": 67556 + }, + { + "epoch": 0.8444961124028101, + "grad_norm": 3.7618253231048584, + "learning_rate": 1.4374070385075533e-06, + "loss": 0.716, + "step": 67558 + }, + { + "epoch": 0.8445211130278257, + "grad_norm": 4.051703929901123, + "learning_rate": 1.4369562869610076e-06, + "loss": 0.7745, + "step": 67560 + }, + { + "epoch": 0.8445461136528413, + "grad_norm": 1.077327847480774, + "learning_rate": 1.4365056006293888e-06, + "loss": 0.7281, + "step": 67562 + }, + { + "epoch": 0.8445711142778569, + "grad_norm": 5.693680286407471, + "learning_rate": 1.4360549795161316e-06, + "loss": 1.1788, + "step": 67564 + }, + { + "epoch": 0.8445961149028726, + "grad_norm": 2.2938389778137207, + "learning_rate": 1.4356044236246713e-06, + "loss": 0.2303, + "step": 67566 + }, + { + "epoch": 0.8446211155278882, + "grad_norm": 0.8259631395339966, + "learning_rate": 1.4351539329584375e-06, + "loss": 0.1882, + "step": 67568 + }, + { + "epoch": 0.8446461161529039, + "grad_norm": 2.9670462608337402, + "learning_rate": 1.4347035075208593e-06, + "loss": 1.8155, + "step": 67570 + }, + { + "epoch": 0.8446711167779194, + "grad_norm": 3.163015127182007, + "learning_rate": 1.4342531473153664e-06, + "loss": 1.2801, + "step": 67572 + }, + { + "epoch": 0.844696117402935, + "grad_norm": 0.2609400749206543, + "learning_rate": 1.4338028523453918e-06, + "loss": 1.5716, + "step": 67574 + }, + { + "epoch": 0.8447211180279507, + "grad_norm": 3.6238245964050293, + "learning_rate": 1.4333526226143612e-06, + "loss": 0.2908, + "step": 67576 + }, + { + "epoch": 0.8447461186529663, + "grad_norm": 5.562420845031738, + "learning_rate": 1.4329024581257055e-06, + "loss": 0.7072, + "step": 67578 + }, + { + "epoch": 0.844771119277982, + "grad_norm": 5.717624664306641, + "learning_rate": 1.4324523588828555e-06, + "loss": 0.7794, + "step": 67580 + }, + { + "epoch": 0.8447961199029975, + "grad_norm": 5.458358287811279, + "learning_rate": 1.4320023248892368e-06, + "loss": 0.5046, + "step": 67582 + }, + { + "epoch": 0.8448211205280132, + "grad_norm": 2.766714572906494, + "learning_rate": 1.4315523561482769e-06, + "loss": 0.3949, + "step": 67584 + }, + { + "epoch": 0.8448461211530288, + "grad_norm": 2.2784535884857178, + "learning_rate": 1.4311024526634e-06, + "loss": 1.3453, + "step": 67586 + }, + { + "epoch": 0.8448711217780445, + "grad_norm": 0.33119428157806396, + "learning_rate": 1.430652614438035e-06, + "loss": 0.002, + "step": 67588 + }, + { + "epoch": 0.8448961224030601, + "grad_norm": 2.353461742401123, + "learning_rate": 1.4302028414756108e-06, + "loss": 0.135, + "step": 67590 + }, + { + "epoch": 0.8449211230280756, + "grad_norm": 1.354673147201538, + "learning_rate": 1.429753133779548e-06, + "loss": 0.0276, + "step": 67592 + }, + { + "epoch": 0.8449461236530913, + "grad_norm": 2.2603659629821777, + "learning_rate": 1.4293034913532766e-06, + "loss": 0.1665, + "step": 67594 + }, + { + "epoch": 0.8449711242781069, + "grad_norm": 3.9175827503204346, + "learning_rate": 1.4288539142002133e-06, + "loss": 1.6785, + "step": 67596 + }, + { + "epoch": 0.8449961249031226, + "grad_norm": 3.1547341346740723, + "learning_rate": 1.4284044023237898e-06, + "loss": 0.9691, + "step": 67598 + }, + { + "epoch": 0.8450211255281382, + "grad_norm": 2.0287466049194336, + "learning_rate": 1.4279549557274231e-06, + "loss": 1.465, + "step": 67600 + }, + { + "epoch": 0.8450461261531538, + "grad_norm": 1.8913860321044922, + "learning_rate": 1.4275055744145394e-06, + "loss": 1.0041, + "step": 67602 + }, + { + "epoch": 0.8450711267781694, + "grad_norm": 6.04406213760376, + "learning_rate": 1.4270562583885639e-06, + "loss": 0.2112, + "step": 67604 + }, + { + "epoch": 0.8450961274031851, + "grad_norm": 4.848961353302002, + "learning_rate": 1.4266070076529126e-06, + "loss": 1.2657, + "step": 67606 + }, + { + "epoch": 0.8451211280282007, + "grad_norm": 5.509280681610107, + "learning_rate": 1.426157822211015e-06, + "loss": 1.9932, + "step": 67608 + }, + { + "epoch": 0.8451461286532164, + "grad_norm": 1.2370227575302124, + "learning_rate": 1.425708702066283e-06, + "loss": 0.3611, + "step": 67610 + }, + { + "epoch": 0.8451711292782319, + "grad_norm": 1.4334371089935303, + "learning_rate": 1.4252596472221424e-06, + "loss": 0.5331, + "step": 67612 + }, + { + "epoch": 0.8451961299032476, + "grad_norm": 0.03890451043844223, + "learning_rate": 1.4248106576820099e-06, + "loss": 0.0886, + "step": 67614 + }, + { + "epoch": 0.8452211305282632, + "grad_norm": 6.258566379547119, + "learning_rate": 1.4243617334493066e-06, + "loss": 0.9704, + "step": 67616 + }, + { + "epoch": 0.8452461311532788, + "grad_norm": 4.635763645172119, + "learning_rate": 1.4239128745274533e-06, + "loss": 0.9654, + "step": 67618 + }, + { + "epoch": 0.8452711317782945, + "grad_norm": 7.009677886962891, + "learning_rate": 1.4234640809198673e-06, + "loss": 0.784, + "step": 67620 + }, + { + "epoch": 0.84529613240331, + "grad_norm": 3.0488593578338623, + "learning_rate": 1.4230153526299661e-06, + "loss": 1.2119, + "step": 67622 + }, + { + "epoch": 0.8453211330283257, + "grad_norm": 5.415680885314941, + "learning_rate": 1.4225666896611646e-06, + "loss": 1.5789, + "step": 67624 + }, + { + "epoch": 0.8453461336533413, + "grad_norm": 4.142527103424072, + "learning_rate": 1.422118092016882e-06, + "loss": 0.2795, + "step": 67626 + }, + { + "epoch": 0.845371134278357, + "grad_norm": 3.4782979488372803, + "learning_rate": 1.421669559700538e-06, + "loss": 1.2986, + "step": 67628 + }, + { + "epoch": 0.8453961349033726, + "grad_norm": 5.465322494506836, + "learning_rate": 1.4212210927155424e-06, + "loss": 2.973, + "step": 67630 + }, + { + "epoch": 0.8454211355283882, + "grad_norm": 7.384944438934326, + "learning_rate": 1.4207726910653174e-06, + "loss": 1.2438, + "step": 67632 + }, + { + "epoch": 0.8454461361534038, + "grad_norm": 2.1982386112213135, + "learning_rate": 1.420324354753273e-06, + "loss": 0.7017, + "step": 67634 + }, + { + "epoch": 0.8454711367784195, + "grad_norm": 4.676225662231445, + "learning_rate": 1.4198760837828263e-06, + "loss": 0.7135, + "step": 67636 + }, + { + "epoch": 0.8454961374034351, + "grad_norm": 1.2645986080169678, + "learning_rate": 1.4194278781573868e-06, + "loss": 0.054, + "step": 67638 + }, + { + "epoch": 0.8455211380284507, + "grad_norm": 5.6019062995910645, + "learning_rate": 1.4189797378803715e-06, + "loss": 1.0844, + "step": 67640 + }, + { + "epoch": 0.8455461386534663, + "grad_norm": 3.0393917560577393, + "learning_rate": 1.4185316629551947e-06, + "loss": 1.3058, + "step": 67642 + }, + { + "epoch": 0.8455711392784819, + "grad_norm": 4.705806732177734, + "learning_rate": 1.418083653385266e-06, + "loss": 1.2047, + "step": 67644 + }, + { + "epoch": 0.8455961399034976, + "grad_norm": 2.7018535137176514, + "learning_rate": 1.4176357091740022e-06, + "loss": 0.6819, + "step": 67646 + }, + { + "epoch": 0.8456211405285132, + "grad_norm": 3.986485481262207, + "learning_rate": 1.4171878303248065e-06, + "loss": 0.9393, + "step": 67648 + }, + { + "epoch": 0.8456461411535289, + "grad_norm": 1.9984393119812012, + "learning_rate": 1.4167400168410961e-06, + "loss": 0.6975, + "step": 67650 + }, + { + "epoch": 0.8456711417785444, + "grad_norm": 0.0030622845515608788, + "learning_rate": 1.4162922687262804e-06, + "loss": 0.4313, + "step": 67652 + }, + { + "epoch": 0.8456961424035601, + "grad_norm": 3.474205255508423, + "learning_rate": 1.4158445859837677e-06, + "loss": 1.6565, + "step": 67654 + }, + { + "epoch": 0.8457211430285757, + "grad_norm": 10.560306549072266, + "learning_rate": 1.415396968616971e-06, + "loss": 1.2579, + "step": 67656 + }, + { + "epoch": 0.8457461436535914, + "grad_norm": 2.748105525970459, + "learning_rate": 1.4149494166292943e-06, + "loss": 0.4024, + "step": 67658 + }, + { + "epoch": 0.845771144278607, + "grad_norm": 2.695244073867798, + "learning_rate": 1.4145019300241535e-06, + "loss": 0.6685, + "step": 67660 + }, + { + "epoch": 0.8457961449036225, + "grad_norm": 0.00034009534283541143, + "learning_rate": 1.4140545088049484e-06, + "loss": 0.6499, + "step": 67662 + }, + { + "epoch": 0.8458211455286382, + "grad_norm": 2.3303539752960205, + "learning_rate": 1.4136071529750882e-06, + "loss": 1.1184, + "step": 67664 + }, + { + "epoch": 0.8458461461536538, + "grad_norm": 0.29594114422798157, + "learning_rate": 1.4131598625379849e-06, + "loss": 0.6807, + "step": 67666 + }, + { + "epoch": 0.8458711467786695, + "grad_norm": 2.509437084197998, + "learning_rate": 1.41271263749704e-06, + "loss": 0.7998, + "step": 67668 + }, + { + "epoch": 0.8458961474036851, + "grad_norm": 3.197122573852539, + "learning_rate": 1.4122654778556632e-06, + "loss": 0.1866, + "step": 67670 + }, + { + "epoch": 0.8459211480287007, + "grad_norm": 2.264101505279541, + "learning_rate": 1.411818383617255e-06, + "loss": 1.0786, + "step": 67672 + }, + { + "epoch": 0.8459461486537163, + "grad_norm": 2.015584945678711, + "learning_rate": 1.4113713547852292e-06, + "loss": 0.5994, + "step": 67674 + }, + { + "epoch": 0.845971149278732, + "grad_norm": 0.0005204502958804369, + "learning_rate": 1.4109243913629788e-06, + "loss": 0.0024, + "step": 67676 + }, + { + "epoch": 0.8459961499037476, + "grad_norm": 4.0821123123168945, + "learning_rate": 1.4104774933539146e-06, + "loss": 1.1296, + "step": 67678 + }, + { + "epoch": 0.8460211505287633, + "grad_norm": 1.2168067693710327, + "learning_rate": 1.4100306607614412e-06, + "loss": 0.39, + "step": 67680 + }, + { + "epoch": 0.8460461511537788, + "grad_norm": 1.7178819179534912, + "learning_rate": 1.4095838935889572e-06, + "loss": 0.2272, + "step": 67682 + }, + { + "epoch": 0.8460711517787944, + "grad_norm": 3.6709065437316895, + "learning_rate": 1.40913719183987e-06, + "loss": 0.5951, + "step": 67684 + }, + { + "epoch": 0.8460961524038101, + "grad_norm": 4.240015983581543, + "learning_rate": 1.408690555517579e-06, + "loss": 1.2295, + "step": 67686 + }, + { + "epoch": 0.8461211530288257, + "grad_norm": 0.43035948276519775, + "learning_rate": 1.4082439846254837e-06, + "loss": 0.9592, + "step": 67688 + }, + { + "epoch": 0.8461461536538414, + "grad_norm": 0.14391621947288513, + "learning_rate": 1.4077974791669891e-06, + "loss": 0.002, + "step": 67690 + }, + { + "epoch": 0.8461711542788569, + "grad_norm": 0.0009897130075842142, + "learning_rate": 1.4073510391454925e-06, + "loss": 1.0801, + "step": 67692 + }, + { + "epoch": 0.8461961549038726, + "grad_norm": 4.024447917938232, + "learning_rate": 1.406904664564397e-06, + "loss": 0.8754, + "step": 67694 + }, + { + "epoch": 0.8462211555288882, + "grad_norm": 1.5185123682022095, + "learning_rate": 1.4064583554270971e-06, + "loss": 0.3394, + "step": 67696 + }, + { + "epoch": 0.8462461561539039, + "grad_norm": 0.4128195345401764, + "learning_rate": 1.4060121117369985e-06, + "loss": 1.0545, + "step": 67698 + }, + { + "epoch": 0.8462711567789195, + "grad_norm": 1.5353643894195557, + "learning_rate": 1.4055659334974958e-06, + "loss": 0.0698, + "step": 67700 + }, + { + "epoch": 0.846296157403935, + "grad_norm": 3.154536247253418, + "learning_rate": 1.4051198207119854e-06, + "loss": 1.4854, + "step": 67702 + }, + { + "epoch": 0.8463211580289507, + "grad_norm": 4.877204895019531, + "learning_rate": 1.40467377338387e-06, + "loss": 0.725, + "step": 67704 + }, + { + "epoch": 0.8463461586539663, + "grad_norm": 4.679262161254883, + "learning_rate": 1.4042277915165414e-06, + "loss": 1.8316, + "step": 67706 + }, + { + "epoch": 0.846371159278982, + "grad_norm": 3.3644697666168213, + "learning_rate": 1.403781875113399e-06, + "loss": 0.9254, + "step": 67708 + }, + { + "epoch": 0.8463961599039976, + "grad_norm": 2.823700189590454, + "learning_rate": 1.4033360241778382e-06, + "loss": 0.8235, + "step": 67710 + }, + { + "epoch": 0.8464211605290132, + "grad_norm": 2.716043472290039, + "learning_rate": 1.4028902387132548e-06, + "loss": 0.525, + "step": 67712 + }, + { + "epoch": 0.8464461611540288, + "grad_norm": 3.561615228652954, + "learning_rate": 1.4024445187230451e-06, + "loss": 0.6081, + "step": 67714 + }, + { + "epoch": 0.8464711617790445, + "grad_norm": 4.700857639312744, + "learning_rate": 1.4019988642105997e-06, + "loss": 1.5482, + "step": 67716 + }, + { + "epoch": 0.8464961624040601, + "grad_norm": 0.00486611807718873, + "learning_rate": 1.401553275179317e-06, + "loss": 0.5266, + "step": 67718 + }, + { + "epoch": 0.8465211630290758, + "grad_norm": 2.2086896896362305, + "learning_rate": 1.4011077516325855e-06, + "loss": 1.5207, + "step": 67720 + }, + { + "epoch": 0.8465461636540913, + "grad_norm": 3.869715690612793, + "learning_rate": 1.4006622935738046e-06, + "loss": 1.2357, + "step": 67722 + }, + { + "epoch": 0.846571164279107, + "grad_norm": 2.1887433528900146, + "learning_rate": 1.4002169010063616e-06, + "loss": 0.594, + "step": 67724 + }, + { + "epoch": 0.8465961649041226, + "grad_norm": 4.825669765472412, + "learning_rate": 1.3997715739336527e-06, + "loss": 1.7799, + "step": 67726 + }, + { + "epoch": 0.8466211655291382, + "grad_norm": 3.085339307785034, + "learning_rate": 1.3993263123590662e-06, + "loss": 0.6086, + "step": 67728 + }, + { + "epoch": 0.8466461661541539, + "grad_norm": 0.48512735962867737, + "learning_rate": 1.3988811162859928e-06, + "loss": 0.276, + "step": 67730 + }, + { + "epoch": 0.8466711667791694, + "grad_norm": 2.940789222717285, + "learning_rate": 1.3984359857178276e-06, + "loss": 0.5099, + "step": 67732 + }, + { + "epoch": 0.8466961674041851, + "grad_norm": 3.0482711791992188, + "learning_rate": 1.3979909206579545e-06, + "loss": 0.4497, + "step": 67734 + }, + { + "epoch": 0.8467211680292007, + "grad_norm": 4.0280585289001465, + "learning_rate": 1.3975459211097674e-06, + "loss": 0.9681, + "step": 67736 + }, + { + "epoch": 0.8467461686542164, + "grad_norm": 3.4643611907958984, + "learning_rate": 1.397100987076656e-06, + "loss": 0.8211, + "step": 67738 + }, + { + "epoch": 0.846771169279232, + "grad_norm": 2.398390531539917, + "learning_rate": 1.3966561185620031e-06, + "loss": 0.3003, + "step": 67740 + }, + { + "epoch": 0.8467961699042476, + "grad_norm": 3.2189366817474365, + "learning_rate": 1.3962113155692037e-06, + "loss": 0.722, + "step": 67742 + }, + { + "epoch": 0.8468211705292632, + "grad_norm": 3.142500400543213, + "learning_rate": 1.3957665781016394e-06, + "loss": 1.3664, + "step": 67744 + }, + { + "epoch": 0.8468461711542788, + "grad_norm": 0.0008537239045836031, + "learning_rate": 1.3953219061627032e-06, + "loss": 0.2631, + "step": 67746 + }, + { + "epoch": 0.8468711717792945, + "grad_norm": 3.2428853511810303, + "learning_rate": 1.3948772997557758e-06, + "loss": 0.8734, + "step": 67748 + }, + { + "epoch": 0.8468961724043101, + "grad_norm": 2.7788636684417725, + "learning_rate": 1.3944327588842477e-06, + "loss": 0.8123, + "step": 67750 + }, + { + "epoch": 0.8469211730293257, + "grad_norm": 1.8768671751022339, + "learning_rate": 1.3939882835515039e-06, + "loss": 0.607, + "step": 67752 + }, + { + "epoch": 0.8469461736543413, + "grad_norm": 3.658297061920166, + "learning_rate": 1.3935438737609252e-06, + "loss": 1.3637, + "step": 67754 + }, + { + "epoch": 0.846971174279357, + "grad_norm": 4.018137454986572, + "learning_rate": 1.3930995295159023e-06, + "loss": 1.4341, + "step": 67756 + }, + { + "epoch": 0.8469961749043726, + "grad_norm": 1.6821244955062866, + "learning_rate": 1.3926552508198132e-06, + "loss": 0.4564, + "step": 67758 + }, + { + "epoch": 0.8470211755293883, + "grad_norm": 4.283552646636963, + "learning_rate": 1.3922110376760467e-06, + "loss": 0.9104, + "step": 67760 + }, + { + "epoch": 0.8470461761544038, + "grad_norm": 3.303985357284546, + "learning_rate": 1.391766890087982e-06, + "loss": 1.2998, + "step": 67762 + }, + { + "epoch": 0.8470711767794195, + "grad_norm": 7.785148620605469, + "learning_rate": 1.3913228080590059e-06, + "loss": 1.0554, + "step": 67764 + }, + { + "epoch": 0.8470961774044351, + "grad_norm": 8.185432434082031, + "learning_rate": 1.3908787915924971e-06, + "loss": 1.0036, + "step": 67766 + }, + { + "epoch": 0.8471211780294507, + "grad_norm": 1.0252711772918701, + "learning_rate": 1.3904348406918366e-06, + "loss": 0.9434, + "step": 67768 + }, + { + "epoch": 0.8471461786544664, + "grad_norm": 3.9685497283935547, + "learning_rate": 1.3899909553604084e-06, + "loss": 0.0735, + "step": 67770 + }, + { + "epoch": 0.8471711792794819, + "grad_norm": 3.8333334922790527, + "learning_rate": 1.3895471356015888e-06, + "loss": 1.457, + "step": 67772 + }, + { + "epoch": 0.8471961799044976, + "grad_norm": 2.9020042419433594, + "learning_rate": 1.3891033814187638e-06, + "loss": 1.4231, + "step": 67774 + }, + { + "epoch": 0.8472211805295132, + "grad_norm": 3.7930073738098145, + "learning_rate": 1.3886596928153074e-06, + "loss": 1.616, + "step": 67776 + }, + { + "epoch": 0.8472461811545289, + "grad_norm": 2.905050754547119, + "learning_rate": 1.3882160697946034e-06, + "loss": 0.6761, + "step": 67778 + }, + { + "epoch": 0.8472711817795445, + "grad_norm": 0.001261957106180489, + "learning_rate": 1.3877725123600272e-06, + "loss": 0.5832, + "step": 67780 + }, + { + "epoch": 0.8472961824045601, + "grad_norm": 1.2310385704040527, + "learning_rate": 1.3873290205149559e-06, + "loss": 0.6554, + "step": 67782 + }, + { + "epoch": 0.8473211830295757, + "grad_norm": 3.931044101715088, + "learning_rate": 1.3868855942627713e-06, + "loss": 0.5275, + "step": 67784 + }, + { + "epoch": 0.8473461836545914, + "grad_norm": 0.7036658525466919, + "learning_rate": 1.3864422336068461e-06, + "loss": 0.0208, + "step": 67786 + }, + { + "epoch": 0.847371184279607, + "grad_norm": 0.0005313886795192957, + "learning_rate": 1.38599893855056e-06, + "loss": 0.4662, + "step": 67788 + }, + { + "epoch": 0.8473961849046227, + "grad_norm": 1.9840620756149292, + "learning_rate": 1.3855557090972882e-06, + "loss": 0.092, + "step": 67790 + }, + { + "epoch": 0.8474211855296382, + "grad_norm": 4.712894439697266, + "learning_rate": 1.3851125452504045e-06, + "loss": 1.2045, + "step": 67792 + }, + { + "epoch": 0.8474461861546538, + "grad_norm": 2.653687000274658, + "learning_rate": 1.384669447013286e-06, + "loss": 0.7192, + "step": 67794 + }, + { + "epoch": 0.8474711867796695, + "grad_norm": 2.306544780731201, + "learning_rate": 1.3842264143893058e-06, + "loss": 0.1138, + "step": 67796 + }, + { + "epoch": 0.8474961874046851, + "grad_norm": 3.7044479846954346, + "learning_rate": 1.3837834473818412e-06, + "loss": 1.4032, + "step": 67798 + }, + { + "epoch": 0.8475211880297008, + "grad_norm": 1.998504877090454, + "learning_rate": 1.3833405459942606e-06, + "loss": 1.0326, + "step": 67800 + }, + { + "epoch": 0.8475461886547163, + "grad_norm": 2.419738531112671, + "learning_rate": 1.3828977102299412e-06, + "loss": 0.5817, + "step": 67802 + }, + { + "epoch": 0.847571189279732, + "grad_norm": 0.00037564689409919083, + "learning_rate": 1.382454940092256e-06, + "loss": 0.915, + "step": 67804 + }, + { + "epoch": 0.8475961899047476, + "grad_norm": 0.00033678431645967066, + "learning_rate": 1.3820122355845723e-06, + "loss": 0.7333, + "step": 67806 + }, + { + "epoch": 0.8476211905297633, + "grad_norm": 5.572726726531982, + "learning_rate": 1.381569596710267e-06, + "loss": 2.4997, + "step": 67808 + }, + { + "epoch": 0.8476461911547789, + "grad_norm": 3.29144287109375, + "learning_rate": 1.381127023472707e-06, + "loss": 0.5857, + "step": 67810 + }, + { + "epoch": 0.8476711917797944, + "grad_norm": 4.075125217437744, + "learning_rate": 1.380684515875267e-06, + "loss": 0.2291, + "step": 67812 + }, + { + "epoch": 0.8476961924048101, + "grad_norm": 4.821120262145996, + "learning_rate": 1.3802420739213118e-06, + "loss": 0.8122, + "step": 67814 + }, + { + "epoch": 0.8477211930298257, + "grad_norm": 0.0787767842411995, + "learning_rate": 1.379799697614217e-06, + "loss": 0.103, + "step": 67816 + }, + { + "epoch": 0.8477461936548414, + "grad_norm": 1.9323498010635376, + "learning_rate": 1.3793573869573485e-06, + "loss": 0.9653, + "step": 67818 + }, + { + "epoch": 0.847771194279857, + "grad_norm": 5.775004863739014, + "learning_rate": 1.3789151419540725e-06, + "loss": 1.2503, + "step": 67820 + }, + { + "epoch": 0.8477961949048726, + "grad_norm": 5.234135150909424, + "learning_rate": 1.378472962607762e-06, + "loss": 1.712, + "step": 67822 + }, + { + "epoch": 0.8478211955298882, + "grad_norm": 0.13236570358276367, + "learning_rate": 1.3780308489217797e-06, + "loss": 0.5883, + "step": 67824 + }, + { + "epoch": 0.8478461961549039, + "grad_norm": 2.838449239730835, + "learning_rate": 1.3775888008994986e-06, + "loss": 1.0726, + "step": 67826 + }, + { + "epoch": 0.8478711967799195, + "grad_norm": 2.464909553527832, + "learning_rate": 1.3771468185442783e-06, + "loss": 0.4073, + "step": 67828 + }, + { + "epoch": 0.8478961974049352, + "grad_norm": 0.009685876779258251, + "learning_rate": 1.3767049018594913e-06, + "loss": 0.6966, + "step": 67830 + }, + { + "epoch": 0.8479211980299507, + "grad_norm": 0.04569113999605179, + "learning_rate": 1.376263050848501e-06, + "loss": 0.6948, + "step": 67832 + }, + { + "epoch": 0.8479461986549663, + "grad_norm": 2.360478639602661, + "learning_rate": 1.3758212655146686e-06, + "loss": 0.3815, + "step": 67834 + }, + { + "epoch": 0.847971199279982, + "grad_norm": 1.0815824270248413, + "learning_rate": 1.375379545861365e-06, + "loss": 0.9137, + "step": 67836 + }, + { + "epoch": 0.8479961999049976, + "grad_norm": 0.5896012187004089, + "learning_rate": 1.3749378918919476e-06, + "loss": 0.8061, + "step": 67838 + }, + { + "epoch": 0.8480212005300133, + "grad_norm": 0.03345578908920288, + "learning_rate": 1.3744963036097868e-06, + "loss": 0.5598, + "step": 67840 + }, + { + "epoch": 0.8480462011550288, + "grad_norm": 3.5409011840820312, + "learning_rate": 1.3740547810182424e-06, + "loss": 0.7815, + "step": 67842 + }, + { + "epoch": 0.8480712017800445, + "grad_norm": 4.155179977416992, + "learning_rate": 1.373613324120674e-06, + "loss": 0.8911, + "step": 67844 + }, + { + "epoch": 0.8480962024050601, + "grad_norm": 4.535635948181152, + "learning_rate": 1.3731719329204496e-06, + "loss": 1.1514, + "step": 67846 + }, + { + "epoch": 0.8481212030300758, + "grad_norm": 0.02793365903198719, + "learning_rate": 1.3727306074209245e-06, + "loss": 0.0003, + "step": 67848 + }, + { + "epoch": 0.8481462036550914, + "grad_norm": 2.7554733753204346, + "learning_rate": 1.3722893476254663e-06, + "loss": 0.8304, + "step": 67850 + }, + { + "epoch": 0.848171204280107, + "grad_norm": 3.2449939250946045, + "learning_rate": 1.3718481535374306e-06, + "loss": 1.073, + "step": 67852 + }, + { + "epoch": 0.8481962049051226, + "grad_norm": 1.9940674304962158, + "learning_rate": 1.3714070251601808e-06, + "loss": 0.9844, + "step": 67854 + }, + { + "epoch": 0.8482212055301382, + "grad_norm": 2.3189520835876465, + "learning_rate": 1.370965962497075e-06, + "loss": 0.5679, + "step": 67856 + }, + { + "epoch": 0.8482462061551539, + "grad_norm": 1.9635109901428223, + "learning_rate": 1.3705249655514696e-06, + "loss": 0.0661, + "step": 67858 + }, + { + "epoch": 0.8482712067801695, + "grad_norm": 3.0050666332244873, + "learning_rate": 1.3700840343267285e-06, + "loss": 0.3203, + "step": 67860 + }, + { + "epoch": 0.8482962074051851, + "grad_norm": 1.6132566928863525, + "learning_rate": 1.3696431688262034e-06, + "loss": 0.0358, + "step": 67862 + }, + { + "epoch": 0.8483212080302007, + "grad_norm": 2.1927127838134766, + "learning_rate": 1.3692023690532597e-06, + "loss": 0.846, + "step": 67864 + }, + { + "epoch": 0.8483462086552164, + "grad_norm": 3.7250802516937256, + "learning_rate": 1.3687616350112465e-06, + "loss": 0.9735, + "step": 67866 + }, + { + "epoch": 0.848371209280232, + "grad_norm": 2.3230016231536865, + "learning_rate": 1.3683209667035268e-06, + "loss": 0.7564, + "step": 67868 + }, + { + "epoch": 0.8483962099052477, + "grad_norm": 0.0034385656472295523, + "learning_rate": 1.367880364133455e-06, + "loss": 1.0368, + "step": 67870 + }, + { + "epoch": 0.8484212105302632, + "grad_norm": 2.0607917308807373, + "learning_rate": 1.3674398273043821e-06, + "loss": 0.0785, + "step": 67872 + }, + { + "epoch": 0.8484462111552789, + "grad_norm": 7.403401851654053, + "learning_rate": 1.3669993562196693e-06, + "loss": 1.4691, + "step": 67874 + }, + { + "epoch": 0.8484712117802945, + "grad_norm": 8.909649848937988, + "learning_rate": 1.3665589508826671e-06, + "loss": 0.9246, + "step": 67876 + }, + { + "epoch": 0.8484962124053101, + "grad_norm": 4.908854007720947, + "learning_rate": 1.3661186112967328e-06, + "loss": 2.7377, + "step": 67878 + }, + { + "epoch": 0.8485212130303258, + "grad_norm": 2.2240800857543945, + "learning_rate": 1.365678337465215e-06, + "loss": 1.4041, + "step": 67880 + }, + { + "epoch": 0.8485462136553413, + "grad_norm": 4.056021213531494, + "learning_rate": 1.365238129391473e-06, + "loss": 0.6968, + "step": 67882 + }, + { + "epoch": 0.848571214280357, + "grad_norm": 4.790792465209961, + "learning_rate": 1.3647979870788563e-06, + "loss": 0.8558, + "step": 67884 + }, + { + "epoch": 0.8485962149053726, + "grad_norm": 0.00030330600566230714, + "learning_rate": 1.364357910530716e-06, + "loss": 0.6096, + "step": 67886 + }, + { + "epoch": 0.8486212155303883, + "grad_norm": 4.61621618270874, + "learning_rate": 1.3639178997504055e-06, + "loss": 1.5564, + "step": 67888 + }, + { + "epoch": 0.8486462161554039, + "grad_norm": 4.280765533447266, + "learning_rate": 1.3634779547412724e-06, + "loss": 1.4994, + "step": 67890 + }, + { + "epoch": 0.8486712167804195, + "grad_norm": 0.95926433801651, + "learning_rate": 1.363038075506673e-06, + "loss": 0.7541, + "step": 67892 + }, + { + "epoch": 0.8486962174054351, + "grad_norm": 4.331091403961182, + "learning_rate": 1.3625982620499545e-06, + "loss": 0.6456, + "step": 67894 + }, + { + "epoch": 0.8487212180304508, + "grad_norm": 3.2152633666992188, + "learning_rate": 1.362158514374463e-06, + "loss": 0.827, + "step": 67896 + }, + { + "epoch": 0.8487462186554664, + "grad_norm": 2.268413782119751, + "learning_rate": 1.361718832483554e-06, + "loss": 0.6397, + "step": 67898 + }, + { + "epoch": 0.848771219280482, + "grad_norm": 2.0975725650787354, + "learning_rate": 1.36127921638057e-06, + "loss": 0.5457, + "step": 67900 + }, + { + "epoch": 0.8487962199054976, + "grad_norm": 5.396991729736328, + "learning_rate": 1.3608396660688638e-06, + "loss": 1.8833, + "step": 67902 + }, + { + "epoch": 0.8488212205305132, + "grad_norm": 4.3597941398620605, + "learning_rate": 1.3604001815517787e-06, + "loss": 0.8231, + "step": 67904 + }, + { + "epoch": 0.8488462211555289, + "grad_norm": 3.1366307735443115, + "learning_rate": 1.359960762832665e-06, + "loss": 1.5555, + "step": 67906 + }, + { + "epoch": 0.8488712217805445, + "grad_norm": 4.242733955383301, + "learning_rate": 1.3595214099148724e-06, + "loss": 0.8407, + "step": 67908 + }, + { + "epoch": 0.8488962224055602, + "grad_norm": 0.0005155952530913055, + "learning_rate": 1.359082122801738e-06, + "loss": 0.0, + "step": 67910 + }, + { + "epoch": 0.8489212230305757, + "grad_norm": 0.0003241327649448067, + "learning_rate": 1.358642901496615e-06, + "loss": 0.8373, + "step": 67912 + }, + { + "epoch": 0.8489462236555914, + "grad_norm": 4.698373794555664, + "learning_rate": 1.358203746002843e-06, + "loss": 1.8085, + "step": 67914 + }, + { + "epoch": 0.848971224280607, + "grad_norm": 3.2179601192474365, + "learning_rate": 1.3577646563237712e-06, + "loss": 0.5675, + "step": 67916 + }, + { + "epoch": 0.8489962249056227, + "grad_norm": 3.91467547416687, + "learning_rate": 1.3573256324627404e-06, + "loss": 1.5752, + "step": 67918 + }, + { + "epoch": 0.8490212255306383, + "grad_norm": 0.31124386191368103, + "learning_rate": 1.3568866744230947e-06, + "loss": 1.488, + "step": 67920 + }, + { + "epoch": 0.8490462261556538, + "grad_norm": 3.9483940601348877, + "learning_rate": 1.3564477822081822e-06, + "loss": 1.9375, + "step": 67922 + }, + { + "epoch": 0.8490712267806695, + "grad_norm": 4.95396614074707, + "learning_rate": 1.3560089558213374e-06, + "loss": 0.8965, + "step": 67924 + }, + { + "epoch": 0.8490962274056851, + "grad_norm": 0.0002662327024154365, + "learning_rate": 1.3555701952659073e-06, + "loss": 0.8109, + "step": 67926 + }, + { + "epoch": 0.8491212280307008, + "grad_norm": 1.1626509428024292, + "learning_rate": 1.3551315005452303e-06, + "loss": 1.5063, + "step": 67928 + }, + { + "epoch": 0.8491462286557164, + "grad_norm": 1.9680566787719727, + "learning_rate": 1.3546928716626517e-06, + "loss": 0.7752, + "step": 67930 + }, + { + "epoch": 0.849171229280732, + "grad_norm": 5.570167541503906, + "learning_rate": 1.3542543086215076e-06, + "loss": 1.8108, + "step": 67932 + }, + { + "epoch": 0.8491962299057476, + "grad_norm": 3.2218728065490723, + "learning_rate": 1.353815811425142e-06, + "loss": 1.1011, + "step": 67934 + }, + { + "epoch": 0.8492212305307633, + "grad_norm": 5.613654136657715, + "learning_rate": 1.3533773800768923e-06, + "loss": 1.7899, + "step": 67936 + }, + { + "epoch": 0.8492462311557789, + "grad_norm": 3.445345401763916, + "learning_rate": 1.3529390145800959e-06, + "loss": 2.3817, + "step": 67938 + }, + { + "epoch": 0.8492712317807946, + "grad_norm": 3.8955910205841064, + "learning_rate": 1.3525007149380943e-06, + "loss": 0.8991, + "step": 67940 + }, + { + "epoch": 0.8492962324058101, + "grad_norm": 2.3845911026000977, + "learning_rate": 1.3520624811542238e-06, + "loss": 1.4146, + "step": 67942 + }, + { + "epoch": 0.8493212330308257, + "grad_norm": 2.9842143058776855, + "learning_rate": 1.3516243132318207e-06, + "loss": 0.4274, + "step": 67944 + }, + { + "epoch": 0.8493462336558414, + "grad_norm": 1.4681661128997803, + "learning_rate": 1.351186211174228e-06, + "loss": 0.6032, + "step": 67946 + }, + { + "epoch": 0.849371234280857, + "grad_norm": 2.4285435676574707, + "learning_rate": 1.3507481749847773e-06, + "loss": 0.6981, + "step": 67948 + }, + { + "epoch": 0.8493962349058727, + "grad_norm": 2.8698315620422363, + "learning_rate": 1.3503102046668048e-06, + "loss": 0.3301, + "step": 67950 + }, + { + "epoch": 0.8494212355308882, + "grad_norm": 2.7640576362609863, + "learning_rate": 1.3498723002236447e-06, + "loss": 0.5464, + "step": 67952 + }, + { + "epoch": 0.8494462361559039, + "grad_norm": 3.0537922382354736, + "learning_rate": 1.3494344616586364e-06, + "loss": 1.2016, + "step": 67954 + }, + { + "epoch": 0.8494712367809195, + "grad_norm": 4.081871509552002, + "learning_rate": 1.3489966889751094e-06, + "loss": 0.7643, + "step": 67956 + }, + { + "epoch": 0.8494962374059352, + "grad_norm": 0.00026861828519031405, + "learning_rate": 1.3485589821764e-06, + "loss": 0.5481, + "step": 67958 + }, + { + "epoch": 0.8495212380309508, + "grad_norm": 3.1442244052886963, + "learning_rate": 1.3481213412658477e-06, + "loss": 0.8997, + "step": 67960 + }, + { + "epoch": 0.8495462386559663, + "grad_norm": 0.2684796452522278, + "learning_rate": 1.3476837662467746e-06, + "loss": 0.446, + "step": 67962 + }, + { + "epoch": 0.849571239280982, + "grad_norm": 2.7182061672210693, + "learning_rate": 1.3472462571225197e-06, + "loss": 0.5845, + "step": 67964 + }, + { + "epoch": 0.8495962399059976, + "grad_norm": 2.867790460586548, + "learning_rate": 1.3468088138964131e-06, + "loss": 0.2108, + "step": 67966 + }, + { + "epoch": 0.8496212405310133, + "grad_norm": 2.974825382232666, + "learning_rate": 1.346371436571785e-06, + "loss": 0.9761, + "step": 67968 + }, + { + "epoch": 0.8496462411560289, + "grad_norm": 2.8558597564697266, + "learning_rate": 1.345934125151972e-06, + "loss": 0.7597, + "step": 67970 + }, + { + "epoch": 0.8496712417810445, + "grad_norm": 0.019596654921770096, + "learning_rate": 1.345496879640298e-06, + "loss": 0.9616, + "step": 67972 + }, + { + "epoch": 0.8496962424060601, + "grad_norm": 17.47335433959961, + "learning_rate": 1.345059700040101e-06, + "loss": 2.0282, + "step": 67974 + }, + { + "epoch": 0.8497212430310758, + "grad_norm": 0.00020516022050287575, + "learning_rate": 1.3446225863547001e-06, + "loss": 0.5719, + "step": 67976 + }, + { + "epoch": 0.8497462436560914, + "grad_norm": 4.381975173950195, + "learning_rate": 1.3441855385874324e-06, + "loss": 1.4258, + "step": 67978 + }, + { + "epoch": 0.8497712442811071, + "grad_norm": 3.318190336227417, + "learning_rate": 1.343748556741622e-06, + "loss": 1.329, + "step": 67980 + }, + { + "epoch": 0.8497962449061226, + "grad_norm": 0.03670942410826683, + "learning_rate": 1.343311640820597e-06, + "loss": 0.6181, + "step": 67982 + }, + { + "epoch": 0.8498212455311382, + "grad_norm": 2.5310022830963135, + "learning_rate": 1.3428747908276896e-06, + "loss": 1.1726, + "step": 67984 + }, + { + "epoch": 0.8498462461561539, + "grad_norm": 0.037650808691978455, + "learning_rate": 1.3424380067662235e-06, + "loss": 0.2937, + "step": 67986 + }, + { + "epoch": 0.8498712467811695, + "grad_norm": 3.3407115936279297, + "learning_rate": 1.3420012886395252e-06, + "loss": 1.2102, + "step": 67988 + }, + { + "epoch": 0.8498962474061852, + "grad_norm": 4.988718509674072, + "learning_rate": 1.3415646364509182e-06, + "loss": 1.0302, + "step": 67990 + }, + { + "epoch": 0.8499212480312007, + "grad_norm": 2.755145311355591, + "learning_rate": 1.3411280502037327e-06, + "loss": 0.6054, + "step": 67992 + }, + { + "epoch": 0.8499462486562164, + "grad_norm": 3.7314341068267822, + "learning_rate": 1.34069152990129e-06, + "loss": 2.0037, + "step": 67994 + }, + { + "epoch": 0.849971249281232, + "grad_norm": 2.462881088256836, + "learning_rate": 1.3402550755469157e-06, + "loss": 0.3474, + "step": 67996 + }, + { + "epoch": 0.8499962499062477, + "grad_norm": 2.678356885910034, + "learning_rate": 1.3398186871439357e-06, + "loss": 0.524, + "step": 67998 + }, + { + "epoch": 0.8500212505312633, + "grad_norm": 2.6840832233428955, + "learning_rate": 1.3393823646956728e-06, + "loss": 0.4195, + "step": 68000 + }, + { + "epoch": 0.8500462511562789, + "grad_norm": 0.0004084914398845285, + "learning_rate": 1.3389461082054489e-06, + "loss": 0.4419, + "step": 68002 + }, + { + "epoch": 0.8500712517812945, + "grad_norm": 4.8344011306762695, + "learning_rate": 1.3385099176765837e-06, + "loss": 0.7391, + "step": 68004 + }, + { + "epoch": 0.8500962524063101, + "grad_norm": 2.7589333057403564, + "learning_rate": 1.3380737931124032e-06, + "loss": 0.5593, + "step": 68006 + }, + { + "epoch": 0.8501212530313258, + "grad_norm": 2.6602671146392822, + "learning_rate": 1.3376377345162295e-06, + "loss": 0.9849, + "step": 68008 + }, + { + "epoch": 0.8501462536563414, + "grad_norm": 3.4606406688690186, + "learning_rate": 1.3372017418913797e-06, + "loss": 0.6334, + "step": 68010 + }, + { + "epoch": 0.850171254281357, + "grad_norm": 4.025097370147705, + "learning_rate": 1.336765815241181e-06, + "loss": 0.3211, + "step": 68012 + }, + { + "epoch": 0.8501962549063726, + "grad_norm": 0.004928638692945242, + "learning_rate": 1.3363299545689434e-06, + "loss": 0.6269, + "step": 68014 + }, + { + "epoch": 0.8502212555313883, + "grad_norm": 0.7826253771781921, + "learning_rate": 1.335894159877995e-06, + "loss": 0.8106, + "step": 68016 + }, + { + "epoch": 0.8502462561564039, + "grad_norm": 0.5983144640922546, + "learning_rate": 1.335458431171649e-06, + "loss": 0.5349, + "step": 68018 + }, + { + "epoch": 0.8502712567814196, + "grad_norm": 2.6643753051757812, + "learning_rate": 1.3350227684532257e-06, + "loss": 0.4867, + "step": 68020 + }, + { + "epoch": 0.8502962574064351, + "grad_norm": 2.909597396850586, + "learning_rate": 1.3345871717260473e-06, + "loss": 0.7691, + "step": 68022 + }, + { + "epoch": 0.8503212580314508, + "grad_norm": 0.9834843277931213, + "learning_rate": 1.3341516409934242e-06, + "loss": 0.7789, + "step": 68024 + }, + { + "epoch": 0.8503462586564664, + "grad_norm": 0.45241567492485046, + "learning_rate": 1.3337161762586814e-06, + "loss": 0.9237, + "step": 68026 + }, + { + "epoch": 0.850371259281482, + "grad_norm": 1.5416483879089355, + "learning_rate": 1.3332807775251255e-06, + "loss": 0.6958, + "step": 68028 + }, + { + "epoch": 0.8503962599064977, + "grad_norm": 0.5716912746429443, + "learning_rate": 1.3328454447960793e-06, + "loss": 1.0787, + "step": 68030 + }, + { + "epoch": 0.8504212605315132, + "grad_norm": 2.6361448764801025, + "learning_rate": 1.3324101780748578e-06, + "loss": 0.6114, + "step": 68032 + }, + { + "epoch": 0.8504462611565289, + "grad_norm": 2.959609031677246, + "learning_rate": 1.3319749773647717e-06, + "loss": 0.3476, + "step": 68034 + }, + { + "epoch": 0.8504712617815445, + "grad_norm": 3.6664187908172607, + "learning_rate": 1.3315398426691416e-06, + "loss": 0.8424, + "step": 68036 + }, + { + "epoch": 0.8504962624065602, + "grad_norm": 1.3010932207107544, + "learning_rate": 1.3311047739912785e-06, + "loss": 0.5771, + "step": 68038 + }, + { + "epoch": 0.8505212630315758, + "grad_norm": 0.014631293714046478, + "learning_rate": 1.3306697713344952e-06, + "loss": 0.0002, + "step": 68040 + }, + { + "epoch": 0.8505462636565914, + "grad_norm": 2.323537826538086, + "learning_rate": 1.3302348347021021e-06, + "loss": 0.8556, + "step": 68042 + }, + { + "epoch": 0.850571264281607, + "grad_norm": 3.94921612739563, + "learning_rate": 1.3297999640974158e-06, + "loss": 0.5021, + "step": 68044 + }, + { + "epoch": 0.8505962649066227, + "grad_norm": 2.1813716888427734, + "learning_rate": 1.3293651595237479e-06, + "loss": 1.3364, + "step": 68046 + }, + { + "epoch": 0.8506212655316383, + "grad_norm": 3.852339267730713, + "learning_rate": 1.3289304209844068e-06, + "loss": 1.2106, + "step": 68048 + }, + { + "epoch": 0.850646266156654, + "grad_norm": 2.817709445953369, + "learning_rate": 1.3284957484827077e-06, + "loss": 0.9811, + "step": 68050 + }, + { + "epoch": 0.8506712667816695, + "grad_norm": 0.056869227439165115, + "learning_rate": 1.3280611420219592e-06, + "loss": 0.1342, + "step": 68052 + }, + { + "epoch": 0.8506962674066851, + "grad_norm": 6.794241905212402, + "learning_rate": 1.3276266016054685e-06, + "loss": 1.8689, + "step": 68054 + }, + { + "epoch": 0.8507212680317008, + "grad_norm": 1.855570673942566, + "learning_rate": 1.3271921272365485e-06, + "loss": 0.355, + "step": 68056 + }, + { + "epoch": 0.8507462686567164, + "grad_norm": 5.0128912925720215, + "learning_rate": 1.3267577189185055e-06, + "loss": 1.24, + "step": 68058 + }, + { + "epoch": 0.8507712692817321, + "grad_norm": 3.6425695419311523, + "learning_rate": 1.3263233766546512e-06, + "loss": 1.3706, + "step": 68060 + }, + { + "epoch": 0.8507962699067476, + "grad_norm": 4.7020745277404785, + "learning_rate": 1.3258891004482887e-06, + "loss": 0.9053, + "step": 68062 + }, + { + "epoch": 0.8508212705317633, + "grad_norm": 6.523824691772461, + "learning_rate": 1.325454890302733e-06, + "loss": 1.125, + "step": 68064 + }, + { + "epoch": 0.8508462711567789, + "grad_norm": 3.058562994003296, + "learning_rate": 1.3250207462212828e-06, + "loss": 1.0346, + "step": 68066 + }, + { + "epoch": 0.8508712717817946, + "grad_norm": 1.4667598009109497, + "learning_rate": 1.3245866682072462e-06, + "loss": 0.0851, + "step": 68068 + }, + { + "epoch": 0.8508962724068102, + "grad_norm": 4.909803867340088, + "learning_rate": 1.324152656263934e-06, + "loss": 0.3561, + "step": 68070 + }, + { + "epoch": 0.8509212730318257, + "grad_norm": 0.9333927035331726, + "learning_rate": 1.3237187103946458e-06, + "loss": 0.5336, + "step": 68072 + }, + { + "epoch": 0.8509462736568414, + "grad_norm": 4.016931056976318, + "learning_rate": 1.3232848306026914e-06, + "loss": 0.5012, + "step": 68074 + }, + { + "epoch": 0.850971274281857, + "grad_norm": 5.670274257659912, + "learning_rate": 1.3228510168913699e-06, + "loss": 1.0421, + "step": 68076 + }, + { + "epoch": 0.8509962749068727, + "grad_norm": 3.2330198287963867, + "learning_rate": 1.3224172692639925e-06, + "loss": 1.7726, + "step": 68078 + }, + { + "epoch": 0.8510212755318883, + "grad_norm": 4.162823677062988, + "learning_rate": 1.321983587723854e-06, + "loss": 2.0691, + "step": 68080 + }, + { + "epoch": 0.8510462761569039, + "grad_norm": 3.3882265090942383, + "learning_rate": 1.3215499722742608e-06, + "loss": 0.7589, + "step": 68082 + }, + { + "epoch": 0.8510712767819195, + "grad_norm": 3.1794228553771973, + "learning_rate": 1.3211164229185191e-06, + "loss": 1.3634, + "step": 68084 + }, + { + "epoch": 0.8510962774069352, + "grad_norm": 4.494282245635986, + "learning_rate": 1.3206829396599242e-06, + "loss": 1.448, + "step": 68086 + }, + { + "epoch": 0.8511212780319508, + "grad_norm": 2.8429348468780518, + "learning_rate": 1.320249522501783e-06, + "loss": 0.8236, + "step": 68088 + }, + { + "epoch": 0.8511462786569665, + "grad_norm": 3.208366632461548, + "learning_rate": 1.3198161714473945e-06, + "loss": 1.534, + "step": 68090 + }, + { + "epoch": 0.851171279281982, + "grad_norm": 2.695462942123413, + "learning_rate": 1.3193828865000558e-06, + "loss": 0.7962, + "step": 68092 + }, + { + "epoch": 0.8511962799069976, + "grad_norm": 4.477488994598389, + "learning_rate": 1.318949667663072e-06, + "loss": 1.3917, + "step": 68094 + }, + { + "epoch": 0.8512212805320133, + "grad_norm": 5.571846008300781, + "learning_rate": 1.3185165149397372e-06, + "loss": 1.1501, + "step": 68096 + }, + { + "epoch": 0.8512462811570289, + "grad_norm": 0.0004536111664492637, + "learning_rate": 1.3180834283333555e-06, + "loss": 0.086, + "step": 68098 + }, + { + "epoch": 0.8512712817820446, + "grad_norm": 1.6686675548553467, + "learning_rate": 1.3176504078472207e-06, + "loss": 0.5376, + "step": 68100 + }, + { + "epoch": 0.8512962824070601, + "grad_norm": 3.176443099975586, + "learning_rate": 1.317217453484635e-06, + "loss": 0.6807, + "step": 68102 + }, + { + "epoch": 0.8513212830320758, + "grad_norm": 4.818305969238281, + "learning_rate": 1.316784565248893e-06, + "loss": 1.0639, + "step": 68104 + }, + { + "epoch": 0.8513462836570914, + "grad_norm": 0.4294893443584442, + "learning_rate": 1.3163517431432894e-06, + "loss": 1.6387, + "step": 68106 + }, + { + "epoch": 0.8513712842821071, + "grad_norm": 0.5698035359382629, + "learning_rate": 1.3159189871711254e-06, + "loss": 0.0383, + "step": 68108 + }, + { + "epoch": 0.8513962849071227, + "grad_norm": 3.1589889526367188, + "learning_rate": 1.315486297335692e-06, + "loss": 1.0393, + "step": 68110 + }, + { + "epoch": 0.8514212855321383, + "grad_norm": 2.0851516723632812, + "learning_rate": 1.3150536736402886e-06, + "loss": 0.9701, + "step": 68112 + }, + { + "epoch": 0.8514462861571539, + "grad_norm": 2.159501552581787, + "learning_rate": 1.3146211160882073e-06, + "loss": 1.2276, + "step": 68114 + }, + { + "epoch": 0.8514712867821695, + "grad_norm": 4.218241214752197, + "learning_rate": 1.314188624682744e-06, + "loss": 0.5337, + "step": 68116 + }, + { + "epoch": 0.8514962874071852, + "grad_norm": 3.6834065914154053, + "learning_rate": 1.313756199427193e-06, + "loss": 0.777, + "step": 68118 + }, + { + "epoch": 0.8515212880322008, + "grad_norm": 0.00024358615337405354, + "learning_rate": 1.3133238403248438e-06, + "loss": 0.5457, + "step": 68120 + }, + { + "epoch": 0.8515462886572164, + "grad_norm": 0.0003319730458315462, + "learning_rate": 1.3128915473789948e-06, + "loss": 0.7393, + "step": 68122 + }, + { + "epoch": 0.851571289282232, + "grad_norm": 2.978287935256958, + "learning_rate": 1.3124593205929325e-06, + "loss": 1.2782, + "step": 68124 + }, + { + "epoch": 0.8515962899072477, + "grad_norm": 4.2237935066223145, + "learning_rate": 1.312027159969953e-06, + "loss": 2.2364, + "step": 68126 + }, + { + "epoch": 0.8516212905322633, + "grad_norm": 1.1865615844726562, + "learning_rate": 1.3115950655133448e-06, + "loss": 0.1348, + "step": 68128 + }, + { + "epoch": 0.851646291157279, + "grad_norm": 3.970391273498535, + "learning_rate": 1.3111630372264016e-06, + "loss": 0.7857, + "step": 68130 + }, + { + "epoch": 0.8516712917822945, + "grad_norm": 4.940066814422607, + "learning_rate": 1.3107310751124125e-06, + "loss": 1.5199, + "step": 68132 + }, + { + "epoch": 0.8516962924073102, + "grad_norm": 2.0413296222686768, + "learning_rate": 1.3102991791746644e-06, + "loss": 0.2745, + "step": 68134 + }, + { + "epoch": 0.8517212930323258, + "grad_norm": 4.757173538208008, + "learning_rate": 1.3098673494164505e-06, + "loss": 0.7896, + "step": 68136 + }, + { + "epoch": 0.8517462936573414, + "grad_norm": 0.49248188734054565, + "learning_rate": 1.3094355858410568e-06, + "loss": 0.0803, + "step": 68138 + }, + { + "epoch": 0.8517712942823571, + "grad_norm": 4.743361949920654, + "learning_rate": 1.3090038884517741e-06, + "loss": 0.9361, + "step": 68140 + }, + { + "epoch": 0.8517962949073726, + "grad_norm": 3.778238534927368, + "learning_rate": 1.3085722572518889e-06, + "loss": 0.8672, + "step": 68142 + }, + { + "epoch": 0.8518212955323883, + "grad_norm": 0.8848227262496948, + "learning_rate": 1.308140692244687e-06, + "loss": 0.0394, + "step": 68144 + }, + { + "epoch": 0.8518462961574039, + "grad_norm": 1.7839783430099487, + "learning_rate": 1.3077091934334574e-06, + "loss": 1.0419, + "step": 68146 + }, + { + "epoch": 0.8518712967824196, + "grad_norm": 3.0261363983154297, + "learning_rate": 1.3072777608214836e-06, + "loss": 1.3032, + "step": 68148 + }, + { + "epoch": 0.8518962974074352, + "grad_norm": 0.007273182738572359, + "learning_rate": 1.3068463944120558e-06, + "loss": 0.6127, + "step": 68150 + }, + { + "epoch": 0.8519212980324508, + "grad_norm": 4.460886001586914, + "learning_rate": 1.3064150942084542e-06, + "loss": 1.5351, + "step": 68152 + }, + { + "epoch": 0.8519462986574664, + "grad_norm": 5.129831314086914, + "learning_rate": 1.3059838602139675e-06, + "loss": 1.1732, + "step": 68154 + }, + { + "epoch": 0.851971299282482, + "grad_norm": 4.3676066398620605, + "learning_rate": 1.3055526924318783e-06, + "loss": 1.3818, + "step": 68156 + }, + { + "epoch": 0.8519962999074977, + "grad_norm": 0.00027771323220804334, + "learning_rate": 1.305121590865469e-06, + "loss": 0.8739, + "step": 68158 + }, + { + "epoch": 0.8520213005325133, + "grad_norm": 3.063400983810425, + "learning_rate": 1.3046905555180256e-06, + "loss": 0.751, + "step": 68160 + }, + { + "epoch": 0.8520463011575289, + "grad_norm": 2.4007270336151123, + "learning_rate": 1.3042595863928264e-06, + "loss": 0.496, + "step": 68162 + }, + { + "epoch": 0.8520713017825445, + "grad_norm": 2.097717046737671, + "learning_rate": 1.3038286834931602e-06, + "loss": 1.2057, + "step": 68164 + }, + { + "epoch": 0.8520963024075602, + "grad_norm": 0.5646105408668518, + "learning_rate": 1.3033978468223018e-06, + "loss": 0.6309, + "step": 68166 + }, + { + "epoch": 0.8521213030325758, + "grad_norm": 0.0004521186638157815, + "learning_rate": 1.3029670763835378e-06, + "loss": 0.3171, + "step": 68168 + }, + { + "epoch": 0.8521463036575915, + "grad_norm": 3.183528184890747, + "learning_rate": 1.3025363721801476e-06, + "loss": 1.5089, + "step": 68170 + }, + { + "epoch": 0.852171304282607, + "grad_norm": 3.5800952911376953, + "learning_rate": 1.3021057342154075e-06, + "loss": 0.7576, + "step": 68172 + }, + { + "epoch": 0.8521963049076227, + "grad_norm": 2.1771960258483887, + "learning_rate": 1.3016751624926027e-06, + "loss": 0.229, + "step": 68174 + }, + { + "epoch": 0.8522213055326383, + "grad_norm": 2.1656668186187744, + "learning_rate": 1.3012446570150073e-06, + "loss": 1.0208, + "step": 68176 + }, + { + "epoch": 0.852246306157654, + "grad_norm": 6.9527459144592285, + "learning_rate": 1.300814217785904e-06, + "loss": 1.0998, + "step": 68178 + }, + { + "epoch": 0.8522713067826696, + "grad_norm": 7.813051700592041, + "learning_rate": 1.3003838448085683e-06, + "loss": 1.3669, + "step": 68180 + }, + { + "epoch": 0.8522963074076851, + "grad_norm": 3.088519334793091, + "learning_rate": 1.2999535380862805e-06, + "loss": 0.3894, + "step": 68182 + }, + { + "epoch": 0.8523213080327008, + "grad_norm": 2.7206196784973145, + "learning_rate": 1.2995232976223171e-06, + "loss": 1.0104, + "step": 68184 + }, + { + "epoch": 0.8523463086577164, + "grad_norm": 0.0016903909854590893, + "learning_rate": 1.299093123419951e-06, + "loss": 0.0131, + "step": 68186 + }, + { + "epoch": 0.8523713092827321, + "grad_norm": 0.48190033435821533, + "learning_rate": 1.2986630154824642e-06, + "loss": 0.859, + "step": 68188 + }, + { + "epoch": 0.8523963099077477, + "grad_norm": 3.4007632732391357, + "learning_rate": 1.298232973813126e-06, + "loss": 0.8603, + "step": 68190 + }, + { + "epoch": 0.8524213105327633, + "grad_norm": 9.402093887329102, + "learning_rate": 1.2978029984152185e-06, + "loss": 1.9687, + "step": 68192 + }, + { + "epoch": 0.8524463111577789, + "grad_norm": 5.377352237701416, + "learning_rate": 1.29737308929201e-06, + "loss": 2.0463, + "step": 68194 + }, + { + "epoch": 0.8524713117827946, + "grad_norm": 0.0003401061112526804, + "learning_rate": 1.296943246446779e-06, + "loss": 0.0, + "step": 68196 + }, + { + "epoch": 0.8524963124078102, + "grad_norm": 4.301118850708008, + "learning_rate": 1.2965134698827975e-06, + "loss": 0.624, + "step": 68198 + }, + { + "epoch": 0.8525213130328259, + "grad_norm": 0.6430444121360779, + "learning_rate": 1.2960837596033371e-06, + "loss": 1.1697, + "step": 68200 + }, + { + "epoch": 0.8525463136578414, + "grad_norm": 5.084498405456543, + "learning_rate": 1.2956541156116743e-06, + "loss": 1.9102, + "step": 68202 + }, + { + "epoch": 0.852571314282857, + "grad_norm": 2.183605909347534, + "learning_rate": 1.2952245379110761e-06, + "loss": 0.4357, + "step": 68204 + }, + { + "epoch": 0.8525963149078727, + "grad_norm": 2.1876943111419678, + "learning_rate": 1.2947950265048192e-06, + "loss": 0.6008, + "step": 68206 + }, + { + "epoch": 0.8526213155328883, + "grad_norm": 3.134336233139038, + "learning_rate": 1.294365581396172e-06, + "loss": 1.2304, + "step": 68208 + }, + { + "epoch": 0.852646316157904, + "grad_norm": 4.637001991271973, + "learning_rate": 1.293936202588404e-06, + "loss": 0.6289, + "step": 68210 + }, + { + "epoch": 0.8526713167829195, + "grad_norm": 4.680248260498047, + "learning_rate": 1.293506890084789e-06, + "loss": 0.6558, + "step": 68212 + }, + { + "epoch": 0.8526963174079352, + "grad_norm": 4.930566787719727, + "learning_rate": 1.2930776438885916e-06, + "loss": 0.2698, + "step": 68214 + }, + { + "epoch": 0.8527213180329508, + "grad_norm": 5.641340732574463, + "learning_rate": 1.2926484640030857e-06, + "loss": 1.8003, + "step": 68216 + }, + { + "epoch": 0.8527463186579665, + "grad_norm": 3.571002960205078, + "learning_rate": 1.2922193504315362e-06, + "loss": 2.2256, + "step": 68218 + }, + { + "epoch": 0.8527713192829821, + "grad_norm": 3.013089179992676, + "learning_rate": 1.2917903031772138e-06, + "loss": 0.758, + "step": 68220 + }, + { + "epoch": 0.8527963199079976, + "grad_norm": 3.0583317279815674, + "learning_rate": 1.2913613222433864e-06, + "loss": 0.26, + "step": 68222 + }, + { + "epoch": 0.8528213205330133, + "grad_norm": 2.0070080757141113, + "learning_rate": 1.2909324076333163e-06, + "loss": 1.9696, + "step": 68224 + }, + { + "epoch": 0.8528463211580289, + "grad_norm": 6.441838264465332, + "learning_rate": 1.2905035593502757e-06, + "loss": 1.3749, + "step": 68226 + }, + { + "epoch": 0.8528713217830446, + "grad_norm": 2.982149839401245, + "learning_rate": 1.2900747773975264e-06, + "loss": 1.3525, + "step": 68228 + }, + { + "epoch": 0.8528963224080602, + "grad_norm": 5.147100448608398, + "learning_rate": 1.289646061778338e-06, + "loss": 1.8365, + "step": 68230 + }, + { + "epoch": 0.8529213230330758, + "grad_norm": 4.34443998336792, + "learning_rate": 1.2892174124959722e-06, + "loss": 1.0236, + "step": 68232 + }, + { + "epoch": 0.8529463236580914, + "grad_norm": 2.503725051879883, + "learning_rate": 1.2887888295536954e-06, + "loss": 0.5907, + "step": 68234 + }, + { + "epoch": 0.8529713242831071, + "grad_norm": 6.542285919189453, + "learning_rate": 1.2883603129547728e-06, + "loss": 1.1182, + "step": 68236 + }, + { + "epoch": 0.8529963249081227, + "grad_norm": 3.2056539058685303, + "learning_rate": 1.2879318627024628e-06, + "loss": 1.5287, + "step": 68238 + }, + { + "epoch": 0.8530213255331384, + "grad_norm": 5.879857540130615, + "learning_rate": 1.287503478800034e-06, + "loss": 0.3705, + "step": 68240 + }, + { + "epoch": 0.8530463261581539, + "grad_norm": 2.350999593734741, + "learning_rate": 1.2870751612507437e-06, + "loss": 0.8628, + "step": 68242 + }, + { + "epoch": 0.8530713267831695, + "grad_norm": 5.78900671005249, + "learning_rate": 1.2866469100578604e-06, + "loss": 1.8095, + "step": 68244 + }, + { + "epoch": 0.8530963274081852, + "grad_norm": 0.08717610687017441, + "learning_rate": 1.2862187252246394e-06, + "loss": 0.6569, + "step": 68246 + }, + { + "epoch": 0.8531213280332008, + "grad_norm": 2.8717923164367676, + "learning_rate": 1.2857906067543468e-06, + "loss": 1.1846, + "step": 68248 + }, + { + "epoch": 0.8531463286582165, + "grad_norm": 2.9680163860321045, + "learning_rate": 1.2853625546502402e-06, + "loss": 0.8027, + "step": 68250 + }, + { + "epoch": 0.853171329283232, + "grad_norm": 5.988571643829346, + "learning_rate": 1.2849345689155778e-06, + "loss": 1.4046, + "step": 68252 + }, + { + "epoch": 0.8531963299082477, + "grad_norm": 3.74314022064209, + "learning_rate": 1.2845066495536229e-06, + "loss": 0.7385, + "step": 68254 + }, + { + "epoch": 0.8532213305332633, + "grad_norm": 1.7943578958511353, + "learning_rate": 1.2840787965676316e-06, + "loss": 0.0609, + "step": 68256 + }, + { + "epoch": 0.853246331158279, + "grad_norm": 1.37368643283844, + "learning_rate": 1.2836510099608656e-06, + "loss": 0.1709, + "step": 68258 + }, + { + "epoch": 0.8532713317832946, + "grad_norm": 3.7613978385925293, + "learning_rate": 1.2832232897365815e-06, + "loss": 1.6676, + "step": 68260 + }, + { + "epoch": 0.8532963324083102, + "grad_norm": 4.224874973297119, + "learning_rate": 1.2827956358980332e-06, + "loss": 1.1538, + "step": 68262 + }, + { + "epoch": 0.8533213330333258, + "grad_norm": 3.081519842147827, + "learning_rate": 1.2823680484484835e-06, + "loss": 0.6307, + "step": 68264 + }, + { + "epoch": 0.8533463336583414, + "grad_norm": 2.7465901374816895, + "learning_rate": 1.2819405273911834e-06, + "loss": 0.9097, + "step": 68266 + }, + { + "epoch": 0.8533713342833571, + "grad_norm": 0.403985857963562, + "learning_rate": 1.2815130727293933e-06, + "loss": 0.567, + "step": 68268 + }, + { + "epoch": 0.8533963349083727, + "grad_norm": 4.9200663566589355, + "learning_rate": 1.2810856844663643e-06, + "loss": 1.4678, + "step": 68270 + }, + { + "epoch": 0.8534213355333883, + "grad_norm": 10.676817893981934, + "learning_rate": 1.2806583626053558e-06, + "loss": 1.2172, + "step": 68272 + }, + { + "epoch": 0.8534463361584039, + "grad_norm": 2.2639248371124268, + "learning_rate": 1.2802311071496209e-06, + "loss": 0.1765, + "step": 68274 + }, + { + "epoch": 0.8534713367834196, + "grad_norm": 0.9546429514884949, + "learning_rate": 1.2798039181024114e-06, + "loss": 0.0622, + "step": 68276 + }, + { + "epoch": 0.8534963374084352, + "grad_norm": 0.0002780260401777923, + "learning_rate": 1.2793767954669834e-06, + "loss": 0.0173, + "step": 68278 + }, + { + "epoch": 0.8535213380334509, + "grad_norm": 2.6931426525115967, + "learning_rate": 1.2789497392465855e-06, + "loss": 0.499, + "step": 68280 + }, + { + "epoch": 0.8535463386584664, + "grad_norm": 0.0002595706610009074, + "learning_rate": 1.2785227494444763e-06, + "loss": 0.2926, + "step": 68282 + }, + { + "epoch": 0.8535713392834821, + "grad_norm": 0.00030868302565068007, + "learning_rate": 1.278095826063902e-06, + "loss": 0.6325, + "step": 68284 + }, + { + "epoch": 0.8535963399084977, + "grad_norm": 1.5174938440322876, + "learning_rate": 1.277668969108119e-06, + "loss": 0.1154, + "step": 68286 + }, + { + "epoch": 0.8536213405335134, + "grad_norm": 2.8547680377960205, + "learning_rate": 1.2772421785803757e-06, + "loss": 0.588, + "step": 68288 + }, + { + "epoch": 0.853646341158529, + "grad_norm": 1.880340576171875, + "learning_rate": 1.2768154544839196e-06, + "loss": 0.348, + "step": 68290 + }, + { + "epoch": 0.8536713417835445, + "grad_norm": 0.0003616459434852004, + "learning_rate": 1.2763887968220067e-06, + "loss": 0.8712, + "step": 68292 + }, + { + "epoch": 0.8536963424085602, + "grad_norm": 0.00044457483454607427, + "learning_rate": 1.2759622055978805e-06, + "loss": 0.4838, + "step": 68294 + }, + { + "epoch": 0.8537213430335758, + "grad_norm": 1.5892354249954224, + "learning_rate": 1.2755356808147933e-06, + "loss": 0.2865, + "step": 68296 + }, + { + "epoch": 0.8537463436585915, + "grad_norm": 0.1934070885181427, + "learning_rate": 1.275109222475992e-06, + "loss": 0.8801, + "step": 68298 + }, + { + "epoch": 0.8537713442836071, + "grad_norm": 1.1143330335617065, + "learning_rate": 1.274682830584727e-06, + "loss": 0.9122, + "step": 68300 + }, + { + "epoch": 0.8537963449086227, + "grad_norm": 2.625663995742798, + "learning_rate": 1.2742565051442435e-06, + "loss": 0.7182, + "step": 68302 + }, + { + "epoch": 0.8538213455336383, + "grad_norm": 0.00020409910939633846, + "learning_rate": 1.2738302461577868e-06, + "loss": 0.0, + "step": 68304 + }, + { + "epoch": 0.853846346158654, + "grad_norm": 0.00676891952753067, + "learning_rate": 1.2734040536286064e-06, + "loss": 0.5766, + "step": 68306 + }, + { + "epoch": 0.8538713467836696, + "grad_norm": 2.3554797172546387, + "learning_rate": 1.272977927559944e-06, + "loss": 0.497, + "step": 68308 + }, + { + "epoch": 0.8538963474086853, + "grad_norm": 4.587167263031006, + "learning_rate": 1.2725518679550507e-06, + "loss": 1.4178, + "step": 68310 + }, + { + "epoch": 0.8539213480337008, + "grad_norm": 3.90134859085083, + "learning_rate": 1.2721258748171684e-06, + "loss": 1.6322, + "step": 68312 + }, + { + "epoch": 0.8539463486587164, + "grad_norm": 0.0007174030179157853, + "learning_rate": 1.2716999481495384e-06, + "loss": 0.3505, + "step": 68314 + }, + { + "epoch": 0.8539713492837321, + "grad_norm": 2.6524033546447754, + "learning_rate": 1.2712740879554097e-06, + "loss": 0.8926, + "step": 68316 + }, + { + "epoch": 0.8539963499087477, + "grad_norm": 4.096085548400879, + "learning_rate": 1.2708482942380207e-06, + "loss": 1.3819, + "step": 68318 + }, + { + "epoch": 0.8540213505337634, + "grad_norm": 2.4106593132019043, + "learning_rate": 1.2704225670006198e-06, + "loss": 0.5079, + "step": 68320 + }, + { + "epoch": 0.8540463511587789, + "grad_norm": 3.5787646770477295, + "learning_rate": 1.2699969062464435e-06, + "loss": 1.1412, + "step": 68322 + }, + { + "epoch": 0.8540713517837946, + "grad_norm": 0.06947365403175354, + "learning_rate": 1.2695713119787357e-06, + "loss": 0.5616, + "step": 68324 + }, + { + "epoch": 0.8540963524088102, + "grad_norm": 8.732595443725586, + "learning_rate": 1.2691457842007427e-06, + "loss": 0.7135, + "step": 68326 + }, + { + "epoch": 0.8541213530338259, + "grad_norm": 2.870224952697754, + "learning_rate": 1.2687203229156975e-06, + "loss": 0.7759, + "step": 68328 + }, + { + "epoch": 0.8541463536588415, + "grad_norm": 0.7677173614501953, + "learning_rate": 1.2682949281268443e-06, + "loss": 0.0812, + "step": 68330 + }, + { + "epoch": 0.854171354283857, + "grad_norm": 3.5611817836761475, + "learning_rate": 1.2678695998374203e-06, + "loss": 1.4453, + "step": 68332 + }, + { + "epoch": 0.8541963549088727, + "grad_norm": 5.6940693855285645, + "learning_rate": 1.2674443380506695e-06, + "loss": 1.0583, + "step": 68334 + }, + { + "epoch": 0.8542213555338883, + "grad_norm": 2.4081974029541016, + "learning_rate": 1.2670191427698253e-06, + "loss": 0.1177, + "step": 68336 + }, + { + "epoch": 0.854246356158904, + "grad_norm": 0.00022896332666277885, + "learning_rate": 1.2665940139981304e-06, + "loss": 1.0008, + "step": 68338 + }, + { + "epoch": 0.8542713567839196, + "grad_norm": 1.7810008525848389, + "learning_rate": 1.2661689517388198e-06, + "loss": 1.1489, + "step": 68340 + }, + { + "epoch": 0.8542963574089352, + "grad_norm": 20.743839263916016, + "learning_rate": 1.2657439559951302e-06, + "loss": 2.2845, + "step": 68342 + }, + { + "epoch": 0.8543213580339508, + "grad_norm": 0.001299329218454659, + "learning_rate": 1.2653190267703008e-06, + "loss": 0.6501, + "step": 68344 + }, + { + "epoch": 0.8543463586589665, + "grad_norm": 0.5230403542518616, + "learning_rate": 1.264894164067565e-06, + "loss": 0.0145, + "step": 68346 + }, + { + "epoch": 0.8543713592839821, + "grad_norm": 3.926459550857544, + "learning_rate": 1.26446936789016e-06, + "loss": 0.6094, + "step": 68348 + }, + { + "epoch": 0.8543963599089978, + "grad_norm": 2.427565336227417, + "learning_rate": 1.2640446382413229e-06, + "loss": 1.2903, + "step": 68350 + }, + { + "epoch": 0.8544213605340133, + "grad_norm": 2.223856210708618, + "learning_rate": 1.2636199751242873e-06, + "loss": 1.0271, + "step": 68352 + }, + { + "epoch": 0.8544463611590289, + "grad_norm": 0.47180283069610596, + "learning_rate": 1.2631953785422857e-06, + "loss": 1.1494, + "step": 68354 + }, + { + "epoch": 0.8544713617840446, + "grad_norm": 0.0003945200296584517, + "learning_rate": 1.2627708484985502e-06, + "loss": 0.4393, + "step": 68356 + }, + { + "epoch": 0.8544963624090602, + "grad_norm": 2.099020004272461, + "learning_rate": 1.2623463849963192e-06, + "loss": 0.9829, + "step": 68358 + }, + { + "epoch": 0.8545213630340759, + "grad_norm": 3.671771764755249, + "learning_rate": 1.261921988038819e-06, + "loss": 0.8618, + "step": 68360 + }, + { + "epoch": 0.8545463636590914, + "grad_norm": 3.406393051147461, + "learning_rate": 1.261497657629287e-06, + "loss": 0.9585, + "step": 68362 + }, + { + "epoch": 0.8545713642841071, + "grad_norm": 4.577750205993652, + "learning_rate": 1.2610733937709563e-06, + "loss": 0.4636, + "step": 68364 + }, + { + "epoch": 0.8545963649091227, + "grad_norm": 3.2187907695770264, + "learning_rate": 1.2606491964670508e-06, + "loss": 1.0154, + "step": 68366 + }, + { + "epoch": 0.8546213655341384, + "grad_norm": 4.494721412658691, + "learning_rate": 1.260225065720807e-06, + "loss": 0.6261, + "step": 68368 + }, + { + "epoch": 0.854646366159154, + "grad_norm": 1.89927077293396, + "learning_rate": 1.2598010015354522e-06, + "loss": 0.6739, + "step": 68370 + }, + { + "epoch": 0.8546713667841696, + "grad_norm": 0.00033287311089225113, + "learning_rate": 1.259377003914216e-06, + "loss": 0.5466, + "step": 68372 + }, + { + "epoch": 0.8546963674091852, + "grad_norm": 0.00022693078790325671, + "learning_rate": 1.2589530728603305e-06, + "loss": 0.0317, + "step": 68374 + }, + { + "epoch": 0.8547213680342008, + "grad_norm": 5.8349409103393555, + "learning_rate": 1.2585292083770207e-06, + "loss": 1.1436, + "step": 68376 + }, + { + "epoch": 0.8547463686592165, + "grad_norm": 1.8529928922653198, + "learning_rate": 1.2581054104675205e-06, + "loss": 0.6918, + "step": 68378 + }, + { + "epoch": 0.8547713692842321, + "grad_norm": 4.394157886505127, + "learning_rate": 1.2576816791350488e-06, + "loss": 1.6536, + "step": 68380 + }, + { + "epoch": 0.8547963699092477, + "grad_norm": 1.6494306325912476, + "learning_rate": 1.2572580143828394e-06, + "loss": 0.1449, + "step": 68382 + }, + { + "epoch": 0.8548213705342633, + "grad_norm": 5.145859718322754, + "learning_rate": 1.2568344162141144e-06, + "loss": 0.7687, + "step": 68384 + }, + { + "epoch": 0.854846371159279, + "grad_norm": 4.108336925506592, + "learning_rate": 1.2564108846321022e-06, + "loss": 0.9891, + "step": 68386 + }, + { + "epoch": 0.8548713717842946, + "grad_norm": 2.9595606327056885, + "learning_rate": 1.2559874196400311e-06, + "loss": 1.0815, + "step": 68388 + }, + { + "epoch": 0.8548963724093103, + "grad_norm": 2.8291220664978027, + "learning_rate": 1.2555640212411225e-06, + "loss": 0.1537, + "step": 68390 + }, + { + "epoch": 0.8549213730343258, + "grad_norm": 6.462687969207764, + "learning_rate": 1.255140689438602e-06, + "loss": 1.9442, + "step": 68392 + }, + { + "epoch": 0.8549463736593415, + "grad_norm": 3.23215913772583, + "learning_rate": 1.2547174242356909e-06, + "loss": 1.1501, + "step": 68394 + }, + { + "epoch": 0.8549713742843571, + "grad_norm": 3.544764995574951, + "learning_rate": 1.2542942256356172e-06, + "loss": 0.932, + "step": 68396 + }, + { + "epoch": 0.8549963749093727, + "grad_norm": 4.8168816566467285, + "learning_rate": 1.2538710936416e-06, + "loss": 1.1054, + "step": 68398 + }, + { + "epoch": 0.8550213755343884, + "grad_norm": 2.994013786315918, + "learning_rate": 1.2534480282568639e-06, + "loss": 1.2238, + "step": 68400 + }, + { + "epoch": 0.8550463761594039, + "grad_norm": 3.255361795425415, + "learning_rate": 1.2530250294846324e-06, + "loss": 0.4971, + "step": 68402 + }, + { + "epoch": 0.8550713767844196, + "grad_norm": 1.0765881538391113, + "learning_rate": 1.2526020973281261e-06, + "loss": 0.133, + "step": 68404 + }, + { + "epoch": 0.8550963774094352, + "grad_norm": 0.00043240308878012, + "learning_rate": 1.2521792317905646e-06, + "loss": 1.2302, + "step": 68406 + }, + { + "epoch": 0.8551213780344509, + "grad_norm": 4.476316452026367, + "learning_rate": 1.2517564328751675e-06, + "loss": 0.8935, + "step": 68408 + }, + { + "epoch": 0.8551463786594665, + "grad_norm": 11.693167686462402, + "learning_rate": 1.2513337005851556e-06, + "loss": 1.272, + "step": 68410 + }, + { + "epoch": 0.8551713792844821, + "grad_norm": 6.888691425323486, + "learning_rate": 1.2509110349237519e-06, + "loss": 1.1948, + "step": 68412 + }, + { + "epoch": 0.8551963799094977, + "grad_norm": 5.87497091293335, + "learning_rate": 1.2504884358941706e-06, + "loss": 0.7003, + "step": 68414 + }, + { + "epoch": 0.8552213805345134, + "grad_norm": 2.59144926071167, + "learning_rate": 1.2500659034996355e-06, + "loss": 0.4686, + "step": 68416 + }, + { + "epoch": 0.855246381159529, + "grad_norm": 1.500771164894104, + "learning_rate": 1.2496434377433563e-06, + "loss": 0.3754, + "step": 68418 + }, + { + "epoch": 0.8552713817845446, + "grad_norm": 5.363354682922363, + "learning_rate": 1.2492210386285585e-06, + "loss": 1.526, + "step": 68420 + }, + { + "epoch": 0.8552963824095602, + "grad_norm": 0.5969058871269226, + "learning_rate": 1.2487987061584538e-06, + "loss": 0.0266, + "step": 68422 + }, + { + "epoch": 0.8553213830345758, + "grad_norm": 1.9655611515045166, + "learning_rate": 1.2483764403362597e-06, + "loss": 0.3185, + "step": 68424 + }, + { + "epoch": 0.8553463836595915, + "grad_norm": 3.2116904258728027, + "learning_rate": 1.2479542411651957e-06, + "loss": 0.6114, + "step": 68426 + }, + { + "epoch": 0.8553713842846071, + "grad_norm": 3.135385274887085, + "learning_rate": 1.2475321086484727e-06, + "loss": 0.7031, + "step": 68428 + }, + { + "epoch": 0.8553963849096228, + "grad_norm": 1.2537065744400024, + "learning_rate": 1.2471100427893114e-06, + "loss": 1.0048, + "step": 68430 + }, + { + "epoch": 0.8554213855346383, + "grad_norm": 2.591034412384033, + "learning_rate": 1.246688043590918e-06, + "loss": 0.3134, + "step": 68432 + }, + { + "epoch": 0.855446386159654, + "grad_norm": 1.813257098197937, + "learning_rate": 1.246266111056511e-06, + "loss": 0.4999, + "step": 68434 + }, + { + "epoch": 0.8554713867846696, + "grad_norm": 0.0006327331648208201, + "learning_rate": 1.245844245189305e-06, + "loss": 0.7798, + "step": 68436 + }, + { + "epoch": 0.8554963874096853, + "grad_norm": 0.0002770045248325914, + "learning_rate": 1.2454224459925091e-06, + "loss": 0.2258, + "step": 68438 + }, + { + "epoch": 0.8555213880347009, + "grad_norm": 2.5871775150299072, + "learning_rate": 1.2450007134693398e-06, + "loss": 0.7235, + "step": 68440 + }, + { + "epoch": 0.8555463886597164, + "grad_norm": 1.07267427444458, + "learning_rate": 1.2445790476230057e-06, + "loss": 0.2634, + "step": 68442 + }, + { + "epoch": 0.8555713892847321, + "grad_norm": 2.505063056945801, + "learning_rate": 1.2441574484567232e-06, + "loss": 1.1014, + "step": 68444 + }, + { + "epoch": 0.8555963899097477, + "grad_norm": 0.34218132495880127, + "learning_rate": 1.243735915973695e-06, + "loss": 0.1482, + "step": 68446 + }, + { + "epoch": 0.8556213905347634, + "grad_norm": 0.00038907863199710846, + "learning_rate": 1.2433144501771365e-06, + "loss": 0.1553, + "step": 68448 + }, + { + "epoch": 0.855646391159779, + "grad_norm": 3.5368964672088623, + "learning_rate": 1.2428930510702585e-06, + "loss": 0.4642, + "step": 68450 + }, + { + "epoch": 0.8556713917847946, + "grad_norm": 3.8455657958984375, + "learning_rate": 1.2424717186562662e-06, + "loss": 0.7006, + "step": 68452 + }, + { + "epoch": 0.8556963924098102, + "grad_norm": 4.034083843231201, + "learning_rate": 1.2420504529383736e-06, + "loss": 0.2643, + "step": 68454 + }, + { + "epoch": 0.8557213930348259, + "grad_norm": 2.8863131999969482, + "learning_rate": 1.2416292539197861e-06, + "loss": 1.1198, + "step": 68456 + }, + { + "epoch": 0.8557463936598415, + "grad_norm": 4.308566093444824, + "learning_rate": 1.2412081216037109e-06, + "loss": 1.1586, + "step": 68458 + }, + { + "epoch": 0.8557713942848572, + "grad_norm": 5.424520015716553, + "learning_rate": 1.2407870559933543e-06, + "loss": 2.2334, + "step": 68460 + }, + { + "epoch": 0.8557963949098727, + "grad_norm": 3.350687265396118, + "learning_rate": 1.2403660570919251e-06, + "loss": 1.6795, + "step": 68462 + }, + { + "epoch": 0.8558213955348883, + "grad_norm": 3.5941638946533203, + "learning_rate": 1.2399451249026317e-06, + "loss": 1.4593, + "step": 68464 + }, + { + "epoch": 0.855846396159904, + "grad_norm": 3.550442934036255, + "learning_rate": 1.2395242594286749e-06, + "loss": 0.5348, + "step": 68466 + }, + { + "epoch": 0.8558713967849196, + "grad_norm": 0.3385328948497772, + "learning_rate": 1.2391034606732643e-06, + "loss": 0.0624, + "step": 68468 + }, + { + "epoch": 0.8558963974099353, + "grad_norm": 2.581794023513794, + "learning_rate": 1.2386827286396042e-06, + "loss": 1.2818, + "step": 68470 + }, + { + "epoch": 0.8559213980349508, + "grad_norm": 6.067652225494385, + "learning_rate": 1.2382620633308938e-06, + "loss": 1.6275, + "step": 68472 + }, + { + "epoch": 0.8559463986599665, + "grad_norm": 3.0986759662628174, + "learning_rate": 1.2378414647503434e-06, + "loss": 0.7593, + "step": 68474 + }, + { + "epoch": 0.8559713992849821, + "grad_norm": 3.533935308456421, + "learning_rate": 1.237420932901151e-06, + "loss": 1.5713, + "step": 68476 + }, + { + "epoch": 0.8559963999099978, + "grad_norm": 0.003667520359158516, + "learning_rate": 1.2370004677865233e-06, + "loss": 1.7252, + "step": 68478 + }, + { + "epoch": 0.8560214005350134, + "grad_norm": 2.8644285202026367, + "learning_rate": 1.2365800694096596e-06, + "loss": 0.2099, + "step": 68480 + }, + { + "epoch": 0.856046401160029, + "grad_norm": 1.3497403860092163, + "learning_rate": 1.2361597377737666e-06, + "loss": 0.5437, + "step": 68482 + }, + { + "epoch": 0.8560714017850446, + "grad_norm": 1.480231761932373, + "learning_rate": 1.2357394728820371e-06, + "loss": 0.7285, + "step": 68484 + }, + { + "epoch": 0.8560964024100602, + "grad_norm": 1.5856400728225708, + "learning_rate": 1.2353192747376775e-06, + "loss": 1.0931, + "step": 68486 + }, + { + "epoch": 0.8561214030350759, + "grad_norm": 2.0015625953674316, + "learning_rate": 1.2348991433438884e-06, + "loss": 1.0459, + "step": 68488 + }, + { + "epoch": 0.8561464036600915, + "grad_norm": 1.0954294204711914, + "learning_rate": 1.2344790787038652e-06, + "loss": 0.1674, + "step": 68490 + }, + { + "epoch": 0.8561714042851071, + "grad_norm": 2.736936092376709, + "learning_rate": 1.234059080820813e-06, + "loss": 2.2696, + "step": 68492 + }, + { + "epoch": 0.8561964049101227, + "grad_norm": 3.6027588844299316, + "learning_rate": 1.2336391496979238e-06, + "loss": 1.095, + "step": 68494 + }, + { + "epoch": 0.8562214055351384, + "grad_norm": 0.00030181778129190207, + "learning_rate": 1.2332192853384029e-06, + "loss": 0.7095, + "step": 68496 + }, + { + "epoch": 0.856246406160154, + "grad_norm": 1.4564288854599, + "learning_rate": 1.232799487745443e-06, + "loss": 0.0883, + "step": 68498 + }, + { + "epoch": 0.8562714067851697, + "grad_norm": 5.089742183685303, + "learning_rate": 1.2323797569222407e-06, + "loss": 1.8409, + "step": 68500 + }, + { + "epoch": 0.8562964074101852, + "grad_norm": 2.3195366859436035, + "learning_rate": 1.2319600928719966e-06, + "loss": 0.3328, + "step": 68502 + }, + { + "epoch": 0.8563214080352008, + "grad_norm": 14.223114013671875, + "learning_rate": 1.2315404955979015e-06, + "loss": 1.44, + "step": 68504 + }, + { + "epoch": 0.8563464086602165, + "grad_norm": 6.072400093078613, + "learning_rate": 1.2311209651031564e-06, + "loss": 1.3578, + "step": 68506 + }, + { + "epoch": 0.8563714092852321, + "grad_norm": 4.399918079376221, + "learning_rate": 1.2307015013909552e-06, + "loss": 1.0762, + "step": 68508 + }, + { + "epoch": 0.8563964099102478, + "grad_norm": 1.9677156209945679, + "learning_rate": 1.2302821044644874e-06, + "loss": 1.1831, + "step": 68510 + }, + { + "epoch": 0.8564214105352633, + "grad_norm": 2.9978466033935547, + "learning_rate": 1.2298627743269542e-06, + "loss": 0.1948, + "step": 68512 + }, + { + "epoch": 0.856446411160279, + "grad_norm": 3.93041729927063, + "learning_rate": 1.229443510981544e-06, + "loss": 0.9705, + "step": 68514 + }, + { + "epoch": 0.8564714117852946, + "grad_norm": 1.8469197750091553, + "learning_rate": 1.2290243144314529e-06, + "loss": 0.3558, + "step": 68516 + }, + { + "epoch": 0.8564964124103103, + "grad_norm": 2.496173143386841, + "learning_rate": 1.2286051846798707e-06, + "loss": 0.1661, + "step": 68518 + }, + { + "epoch": 0.8565214130353259, + "grad_norm": 4.548779487609863, + "learning_rate": 1.228186121729994e-06, + "loss": 1.4481, + "step": 68520 + }, + { + "epoch": 0.8565464136603415, + "grad_norm": 2.8700194358825684, + "learning_rate": 1.2277671255850099e-06, + "loss": 0.8955, + "step": 68522 + }, + { + "epoch": 0.8565714142853571, + "grad_norm": 0.00027288959245197475, + "learning_rate": 1.2273481962481093e-06, + "loss": 0.0006, + "step": 68524 + }, + { + "epoch": 0.8565964149103727, + "grad_norm": 1.3729689121246338, + "learning_rate": 1.2269293337224863e-06, + "loss": 0.5495, + "step": 68526 + }, + { + "epoch": 0.8566214155353884, + "grad_norm": 3.242971420288086, + "learning_rate": 1.2265105380113273e-06, + "loss": 1.2717, + "step": 68528 + }, + { + "epoch": 0.856646416160404, + "grad_norm": 6.872707843780518, + "learning_rate": 1.2260918091178253e-06, + "loss": 1.1645, + "step": 68530 + }, + { + "epoch": 0.8566714167854196, + "grad_norm": 14.887097358703613, + "learning_rate": 1.2256731470451643e-06, + "loss": 1.5398, + "step": 68532 + }, + { + "epoch": 0.8566964174104352, + "grad_norm": 3.495026111602783, + "learning_rate": 1.2252545517965387e-06, + "loss": 1.2779, + "step": 68534 + }, + { + "epoch": 0.8567214180354509, + "grad_norm": 5.370555877685547, + "learning_rate": 1.2248360233751333e-06, + "loss": 2.1592, + "step": 68536 + }, + { + "epoch": 0.8567464186604665, + "grad_norm": 4.213783264160156, + "learning_rate": 1.2244175617841348e-06, + "loss": 1.4974, + "step": 68538 + }, + { + "epoch": 0.8567714192854822, + "grad_norm": 5.18112325668335, + "learning_rate": 1.2239991670267314e-06, + "loss": 0.193, + "step": 68540 + }, + { + "epoch": 0.8567964199104977, + "grad_norm": 4.334871768951416, + "learning_rate": 1.2235808391061087e-06, + "loss": 1.7001, + "step": 68542 + }, + { + "epoch": 0.8568214205355134, + "grad_norm": 1.8385437726974487, + "learning_rate": 1.2231625780254552e-06, + "loss": 1.152, + "step": 68544 + }, + { + "epoch": 0.856846421160529, + "grad_norm": 3.5493006706237793, + "learning_rate": 1.2227443837879517e-06, + "loss": 2.5134, + "step": 68546 + }, + { + "epoch": 0.8568714217855447, + "grad_norm": 8.580954551696777, + "learning_rate": 1.2223262563967874e-06, + "loss": 1.5149, + "step": 68548 + }, + { + "epoch": 0.8568964224105603, + "grad_norm": 4.028344631195068, + "learning_rate": 1.221908195855146e-06, + "loss": 0.3992, + "step": 68550 + }, + { + "epoch": 0.8569214230355758, + "grad_norm": 3.9320738315582275, + "learning_rate": 1.2214902021662079e-06, + "loss": 1.0232, + "step": 68552 + }, + { + "epoch": 0.8569464236605915, + "grad_norm": 6.155043601989746, + "learning_rate": 1.2210722753331617e-06, + "loss": 1.5814, + "step": 68554 + }, + { + "epoch": 0.8569714242856071, + "grad_norm": 2.7347874641418457, + "learning_rate": 1.2206544153591849e-06, + "loss": 0.4776, + "step": 68556 + }, + { + "epoch": 0.8569964249106228, + "grad_norm": 3.5614051818847656, + "learning_rate": 1.220236622247465e-06, + "loss": 0.201, + "step": 68558 + }, + { + "epoch": 0.8570214255356384, + "grad_norm": 4.340160846710205, + "learning_rate": 1.2198188960011815e-06, + "loss": 0.9208, + "step": 68560 + }, + { + "epoch": 0.857046426160654, + "grad_norm": 4.551723480224609, + "learning_rate": 1.219401236623513e-06, + "loss": 1.144, + "step": 68562 + }, + { + "epoch": 0.8570714267856696, + "grad_norm": 0.037685807794332504, + "learning_rate": 1.218983644117645e-06, + "loss": 1.1388, + "step": 68564 + }, + { + "epoch": 0.8570964274106853, + "grad_norm": 0.6186362504959106, + "learning_rate": 1.2185661184867536e-06, + "loss": 1.1464, + "step": 68566 + }, + { + "epoch": 0.8571214280357009, + "grad_norm": 3.0150725841522217, + "learning_rate": 1.2181486597340242e-06, + "loss": 0.4938, + "step": 68568 + }, + { + "epoch": 0.8571464286607166, + "grad_norm": 1.6250144243240356, + "learning_rate": 1.2177312678626296e-06, + "loss": 1.1128, + "step": 68570 + }, + { + "epoch": 0.8571714292857321, + "grad_norm": 0.6510348916053772, + "learning_rate": 1.217313942875754e-06, + "loss": 0.4594, + "step": 68572 + }, + { + "epoch": 0.8571964299107477, + "grad_norm": 3.2094385623931885, + "learning_rate": 1.216896684776574e-06, + "loss": 1.2441, + "step": 68574 + }, + { + "epoch": 0.8572214305357634, + "grad_norm": 4.187432289123535, + "learning_rate": 1.2164794935682634e-06, + "loss": 0.8393, + "step": 68576 + }, + { + "epoch": 0.857246431160779, + "grad_norm": 0.00046485415077768266, + "learning_rate": 1.2160623692540064e-06, + "loss": 0.0, + "step": 68578 + }, + { + "epoch": 0.8572714317857947, + "grad_norm": 2.921596050262451, + "learning_rate": 1.2156453118369738e-06, + "loss": 1.0622, + "step": 68580 + }, + { + "epoch": 0.8572964324108102, + "grad_norm": 0.0002814136678352952, + "learning_rate": 1.2152283213203453e-06, + "loss": 0.4122, + "step": 68582 + }, + { + "epoch": 0.8573214330358259, + "grad_norm": 4.242845058441162, + "learning_rate": 1.2148113977072951e-06, + "loss": 1.9849, + "step": 68584 + }, + { + "epoch": 0.8573464336608415, + "grad_norm": 5.875977993011475, + "learning_rate": 1.2143945410010005e-06, + "loss": 1.2441, + "step": 68586 + }, + { + "epoch": 0.8573714342858572, + "grad_norm": 0.6318991184234619, + "learning_rate": 1.2139777512046336e-06, + "loss": 0.4943, + "step": 68588 + }, + { + "epoch": 0.8573964349108728, + "grad_norm": 3.8937649726867676, + "learning_rate": 1.2135610283213683e-06, + "loss": 1.0083, + "step": 68590 + }, + { + "epoch": 0.8574214355358883, + "grad_norm": 2.6699540615081787, + "learning_rate": 1.2131443723543824e-06, + "loss": 1.0605, + "step": 68592 + }, + { + "epoch": 0.857446436160904, + "grad_norm": 1.0925666093826294, + "learning_rate": 1.212727783306844e-06, + "loss": 0.2605, + "step": 68594 + }, + { + "epoch": 0.8574714367859196, + "grad_norm": 2.5500545501708984, + "learning_rate": 1.2123112611819299e-06, + "loss": 0.4423, + "step": 68596 + }, + { + "epoch": 0.8574964374109353, + "grad_norm": 0.14138859510421753, + "learning_rate": 1.2118948059828084e-06, + "loss": 0.6076, + "step": 68598 + }, + { + "epoch": 0.8575214380359509, + "grad_norm": 0.2828323245048523, + "learning_rate": 1.211478417712656e-06, + "loss": 0.6235, + "step": 68600 + }, + { + "epoch": 0.8575464386609665, + "grad_norm": 4.342393398284912, + "learning_rate": 1.211062096374641e-06, + "loss": 1.0182, + "step": 68602 + }, + { + "epoch": 0.8575714392859821, + "grad_norm": 2.1985998153686523, + "learning_rate": 1.2106458419719325e-06, + "loss": 0.88, + "step": 68604 + }, + { + "epoch": 0.8575964399109978, + "grad_norm": 2.8204238414764404, + "learning_rate": 1.210229654507703e-06, + "loss": 0.4681, + "step": 68606 + }, + { + "epoch": 0.8576214405360134, + "grad_norm": 2.175895929336548, + "learning_rate": 1.2098135339851202e-06, + "loss": 0.8425, + "step": 68608 + }, + { + "epoch": 0.8576464411610291, + "grad_norm": 4.735413074493408, + "learning_rate": 1.2093974804073572e-06, + "loss": 0.9101, + "step": 68610 + }, + { + "epoch": 0.8576714417860446, + "grad_norm": 2.670288562774658, + "learning_rate": 1.208981493777579e-06, + "loss": 1.1156, + "step": 68612 + }, + { + "epoch": 0.8576964424110602, + "grad_norm": 2.785644054412842, + "learning_rate": 1.2085655740989532e-06, + "loss": 0.1289, + "step": 68614 + }, + { + "epoch": 0.8577214430360759, + "grad_norm": 6.5186662673950195, + "learning_rate": 1.2081497213746507e-06, + "loss": 1.0402, + "step": 68616 + }, + { + "epoch": 0.8577464436610915, + "grad_norm": 0.0002861542743630707, + "learning_rate": 1.2077339356078343e-06, + "loss": 0.4722, + "step": 68618 + }, + { + "epoch": 0.8577714442861072, + "grad_norm": 3.476982831954956, + "learning_rate": 1.207318216801675e-06, + "loss": 0.9158, + "step": 68620 + }, + { + "epoch": 0.8577964449111227, + "grad_norm": 1.1897052526474, + "learning_rate": 1.2069025649593347e-06, + "loss": 0.2138, + "step": 68622 + }, + { + "epoch": 0.8578214455361384, + "grad_norm": 5.074094772338867, + "learning_rate": 1.206486980083983e-06, + "loss": 1.2312, + "step": 68624 + }, + { + "epoch": 0.857846446161154, + "grad_norm": 10.251388549804688, + "learning_rate": 1.2060714621787828e-06, + "loss": 1.0834, + "step": 68626 + }, + { + "epoch": 0.8578714467861697, + "grad_norm": 4.097299575805664, + "learning_rate": 1.2056560112468963e-06, + "loss": 0.8427, + "step": 68628 + }, + { + "epoch": 0.8578964474111853, + "grad_norm": 0.0028325554449111223, + "learning_rate": 1.2052406272914928e-06, + "loss": 0.3122, + "step": 68630 + }, + { + "epoch": 0.8579214480362009, + "grad_norm": 4.151708126068115, + "learning_rate": 1.2048253103157315e-06, + "loss": 1.1854, + "step": 68632 + }, + { + "epoch": 0.8579464486612165, + "grad_norm": 1.4394853115081787, + "learning_rate": 1.2044100603227781e-06, + "loss": 0.1456, + "step": 68634 + }, + { + "epoch": 0.8579714492862321, + "grad_norm": 6.979204177856445, + "learning_rate": 1.2039948773157917e-06, + "loss": 1.1201, + "step": 68636 + }, + { + "epoch": 0.8579964499112478, + "grad_norm": 2.3747475147247314, + "learning_rate": 1.2035797612979394e-06, + "loss": 0.266, + "step": 68638 + }, + { + "epoch": 0.8580214505362634, + "grad_norm": 3.1833572387695312, + "learning_rate": 1.2031647122723788e-06, + "loss": 1.7179, + "step": 68640 + }, + { + "epoch": 0.858046451161279, + "grad_norm": 4.692580699920654, + "learning_rate": 1.2027497302422698e-06, + "loss": 1.883, + "step": 68642 + }, + { + "epoch": 0.8580714517862946, + "grad_norm": 2.91218638420105, + "learning_rate": 1.2023348152107773e-06, + "loss": 0.716, + "step": 68644 + }, + { + "epoch": 0.8580964524113103, + "grad_norm": 1.9350746870040894, + "learning_rate": 1.2019199671810567e-06, + "loss": 0.9127, + "step": 68646 + }, + { + "epoch": 0.8581214530363259, + "grad_norm": 3.72953462600708, + "learning_rate": 1.201505186156271e-06, + "loss": 1.6932, + "step": 68648 + }, + { + "epoch": 0.8581464536613416, + "grad_norm": 4.528003215789795, + "learning_rate": 1.2010904721395756e-06, + "loss": 1.0044, + "step": 68650 + }, + { + "epoch": 0.8581714542863571, + "grad_norm": 0.06951814889907837, + "learning_rate": 1.2006758251341333e-06, + "loss": 1.0652, + "step": 68652 + }, + { + "epoch": 0.8581964549113728, + "grad_norm": 2.573060989379883, + "learning_rate": 1.2002612451430995e-06, + "loss": 0.8247, + "step": 68654 + }, + { + "epoch": 0.8582214555363884, + "grad_norm": 3.608376979827881, + "learning_rate": 1.1998467321696294e-06, + "loss": 1.3951, + "step": 68656 + }, + { + "epoch": 0.858246456161404, + "grad_norm": 4.777130126953125, + "learning_rate": 1.199432286216884e-06, + "loss": 1.4793, + "step": 68658 + }, + { + "epoch": 0.8582714567864197, + "grad_norm": 1.3556410074234009, + "learning_rate": 1.199017907288016e-06, + "loss": 0.3642, + "step": 68660 + }, + { + "epoch": 0.8582964574114352, + "grad_norm": 5.262945175170898, + "learning_rate": 1.1986035953861853e-06, + "loss": 0.8449, + "step": 68662 + }, + { + "epoch": 0.8583214580364509, + "grad_norm": 4.000781059265137, + "learning_rate": 1.1981893505145447e-06, + "loss": 0.9549, + "step": 68664 + }, + { + "epoch": 0.8583464586614665, + "grad_norm": 2.6939334869384766, + "learning_rate": 1.1977751726762477e-06, + "loss": 0.7711, + "step": 68666 + }, + { + "epoch": 0.8583714592864822, + "grad_norm": 2.22955322265625, + "learning_rate": 1.1973610618744513e-06, + "loss": 0.9213, + "step": 68668 + }, + { + "epoch": 0.8583964599114978, + "grad_norm": 10.932634353637695, + "learning_rate": 1.1969470181123077e-06, + "loss": 1.5029, + "step": 68670 + }, + { + "epoch": 0.8584214605365134, + "grad_norm": 4.98832368850708, + "learning_rate": 1.196533041392972e-06, + "loss": 1.71, + "step": 68672 + }, + { + "epoch": 0.858446461161529, + "grad_norm": 3.2493419647216797, + "learning_rate": 1.196119131719593e-06, + "loss": 0.9507, + "step": 68674 + }, + { + "epoch": 0.8584714617865447, + "grad_norm": 4.196104049682617, + "learning_rate": 1.1957052890953292e-06, + "loss": 1.3915, + "step": 68676 + }, + { + "epoch": 0.8584964624115603, + "grad_norm": 3.9946868419647217, + "learning_rate": 1.195291513523328e-06, + "loss": 1.8704, + "step": 68678 + }, + { + "epoch": 0.858521463036576, + "grad_norm": 4.525375843048096, + "learning_rate": 1.1948778050067388e-06, + "loss": 1.4986, + "step": 68680 + }, + { + "epoch": 0.8585464636615915, + "grad_norm": 3.339313507080078, + "learning_rate": 1.1944641635487186e-06, + "loss": 0.461, + "step": 68682 + }, + { + "epoch": 0.8585714642866071, + "grad_norm": 3.0818498134613037, + "learning_rate": 1.1940505891524112e-06, + "loss": 1.0012, + "step": 68684 + }, + { + "epoch": 0.8585964649116228, + "grad_norm": 1.878974437713623, + "learning_rate": 1.193637081820972e-06, + "loss": 1.6385, + "step": 68686 + }, + { + "epoch": 0.8586214655366384, + "grad_norm": 2.4514529705047607, + "learning_rate": 1.1932236415575448e-06, + "loss": 0.8794, + "step": 68688 + }, + { + "epoch": 0.8586464661616541, + "grad_norm": 4.984861373901367, + "learning_rate": 1.1928102683652832e-06, + "loss": 0.5185, + "step": 68690 + }, + { + "epoch": 0.8586714667866696, + "grad_norm": 0.0019992573652416468, + "learning_rate": 1.192396962247333e-06, + "loss": 0.3, + "step": 68692 + }, + { + "epoch": 0.8586964674116853, + "grad_norm": 4.567023277282715, + "learning_rate": 1.1919837232068398e-06, + "loss": 0.228, + "step": 68694 + }, + { + "epoch": 0.8587214680367009, + "grad_norm": 3.678056240081787, + "learning_rate": 1.1915705512469556e-06, + "loss": 1.2589, + "step": 68696 + }, + { + "epoch": 0.8587464686617166, + "grad_norm": 4.3427581787109375, + "learning_rate": 1.1911574463708208e-06, + "loss": 0.8975, + "step": 68698 + }, + { + "epoch": 0.8587714692867322, + "grad_norm": 5.184295654296875, + "learning_rate": 1.190744408581589e-06, + "loss": 1.8709, + "step": 68700 + }, + { + "epoch": 0.8587964699117477, + "grad_norm": 2.236539363861084, + "learning_rate": 1.1903314378823994e-06, + "loss": 0.4973, + "step": 68702 + }, + { + "epoch": 0.8588214705367634, + "grad_norm": 3.9064483642578125, + "learning_rate": 1.1899185342764018e-06, + "loss": 0.7143, + "step": 68704 + }, + { + "epoch": 0.858846471161779, + "grad_norm": 0.38244810700416565, + "learning_rate": 1.1895056977667395e-06, + "loss": 0.008, + "step": 68706 + }, + { + "epoch": 0.8588714717867947, + "grad_norm": 10.590531349182129, + "learning_rate": 1.1890929283565534e-06, + "loss": 1.2047, + "step": 68708 + }, + { + "epoch": 0.8588964724118103, + "grad_norm": 2.4877779483795166, + "learning_rate": 1.1886802260489915e-06, + "loss": 0.7461, + "step": 68710 + }, + { + "epoch": 0.8589214730368259, + "grad_norm": 2.6465156078338623, + "learning_rate": 1.188267590847193e-06, + "loss": 0.7919, + "step": 68712 + }, + { + "epoch": 0.8589464736618415, + "grad_norm": 3.822103977203369, + "learning_rate": 1.187855022754304e-06, + "loss": 0.617, + "step": 68714 + }, + { + "epoch": 0.8589714742868572, + "grad_norm": 0.00043162042857147753, + "learning_rate": 1.187442521773463e-06, + "loss": 0.7152, + "step": 68716 + }, + { + "epoch": 0.8589964749118728, + "grad_norm": 3.8687350749969482, + "learning_rate": 1.1870300879078167e-06, + "loss": 1.68, + "step": 68718 + }, + { + "epoch": 0.8590214755368885, + "grad_norm": 1.8971788883209229, + "learning_rate": 1.1866177211605024e-06, + "loss": 1.3285, + "step": 68720 + }, + { + "epoch": 0.859046476161904, + "grad_norm": 3.378164052963257, + "learning_rate": 1.1862054215346585e-06, + "loss": 0.8366, + "step": 68722 + }, + { + "epoch": 0.8590714767869196, + "grad_norm": 5.051997661590576, + "learning_rate": 1.1857931890334306e-06, + "loss": 1.327, + "step": 68724 + }, + { + "epoch": 0.8590964774119353, + "grad_norm": 4.555660247802734, + "learning_rate": 1.1853810236599528e-06, + "loss": 0.5453, + "step": 68726 + }, + { + "epoch": 0.8591214780369509, + "grad_norm": 5.682955265045166, + "learning_rate": 1.184968925417367e-06, + "loss": 1.2795, + "step": 68728 + }, + { + "epoch": 0.8591464786619666, + "grad_norm": 3.1006438732147217, + "learning_rate": 1.1845568943088148e-06, + "loss": 1.3476, + "step": 68730 + }, + { + "epoch": 0.8591714792869821, + "grad_norm": 0.0003171447024215013, + "learning_rate": 1.1841449303374263e-06, + "loss": 0.4476, + "step": 68732 + }, + { + "epoch": 0.8591964799119978, + "grad_norm": 3.8492250442504883, + "learning_rate": 1.1837330335063468e-06, + "loss": 0.336, + "step": 68734 + }, + { + "epoch": 0.8592214805370134, + "grad_norm": 3.9948480129241943, + "learning_rate": 1.183321203818707e-06, + "loss": 1.0861, + "step": 68736 + }, + { + "epoch": 0.8592464811620291, + "grad_norm": 0.7439460754394531, + "learning_rate": 1.1829094412776476e-06, + "loss": 0.7658, + "step": 68738 + }, + { + "epoch": 0.8592714817870447, + "grad_norm": 5.256791591644287, + "learning_rate": 1.182497745886302e-06, + "loss": 2.0426, + "step": 68740 + }, + { + "epoch": 0.8592964824120602, + "grad_norm": 4.473076343536377, + "learning_rate": 1.182086117647806e-06, + "loss": 0.9717, + "step": 68742 + }, + { + "epoch": 0.8593214830370759, + "grad_norm": 5.953083515167236, + "learning_rate": 1.1816745565653e-06, + "loss": 0.9551, + "step": 68744 + }, + { + "epoch": 0.8593464836620915, + "grad_norm": 0.00037995760794728994, + "learning_rate": 1.1812630626419085e-06, + "loss": 0.7831, + "step": 68746 + }, + { + "epoch": 0.8593714842871072, + "grad_norm": 0.000267780851572752, + "learning_rate": 1.180851635880773e-06, + "loss": 0.302, + "step": 68748 + }, + { + "epoch": 0.8593964849121228, + "grad_norm": 3.3895130157470703, + "learning_rate": 1.180440276285022e-06, + "loss": 1.1355, + "step": 68750 + }, + { + "epoch": 0.8594214855371384, + "grad_norm": 1.8856092691421509, + "learning_rate": 1.1800289838577906e-06, + "loss": 0.1858, + "step": 68752 + }, + { + "epoch": 0.859446486162154, + "grad_norm": 9.452892303466797, + "learning_rate": 1.1796177586022128e-06, + "loss": 0.7024, + "step": 68754 + }, + { + "epoch": 0.8594714867871697, + "grad_norm": 3.3094727993011475, + "learning_rate": 1.1792066005214198e-06, + "loss": 0.5638, + "step": 68756 + }, + { + "epoch": 0.8594964874121853, + "grad_norm": 2.2434518337249756, + "learning_rate": 1.1787955096185422e-06, + "loss": 0.9101, + "step": 68758 + }, + { + "epoch": 0.859521488037201, + "grad_norm": 8.02289867401123, + "learning_rate": 1.1783844858967076e-06, + "loss": 1.2674, + "step": 68760 + }, + { + "epoch": 0.8595464886622165, + "grad_norm": 2.7487640380859375, + "learning_rate": 1.1779735293590522e-06, + "loss": 0.8326, + "step": 68762 + }, + { + "epoch": 0.8595714892872321, + "grad_norm": 3.6925065517425537, + "learning_rate": 1.1775626400087003e-06, + "loss": 1.3219, + "step": 68764 + }, + { + "epoch": 0.8595964899122478, + "grad_norm": 0.00033914108644239604, + "learning_rate": 1.1771518178487829e-06, + "loss": 0.52, + "step": 68766 + }, + { + "epoch": 0.8596214905372634, + "grad_norm": 0.0002105167368426919, + "learning_rate": 1.1767410628824328e-06, + "loss": 0.972, + "step": 68768 + }, + { + "epoch": 0.8596464911622791, + "grad_norm": 6.475778579711914, + "learning_rate": 1.1763303751127742e-06, + "loss": 1.5422, + "step": 68770 + }, + { + "epoch": 0.8596714917872946, + "grad_norm": 3.755650758743286, + "learning_rate": 1.1759197545429368e-06, + "loss": 0.973, + "step": 68772 + }, + { + "epoch": 0.8596964924123103, + "grad_norm": 2.559096336364746, + "learning_rate": 1.1755092011760438e-06, + "loss": 1.0542, + "step": 68774 + }, + { + "epoch": 0.8597214930373259, + "grad_norm": 3.618504047393799, + "learning_rate": 1.1750987150152272e-06, + "loss": 1.759, + "step": 68776 + }, + { + "epoch": 0.8597464936623416, + "grad_norm": 1.6677355766296387, + "learning_rate": 1.1746882960636086e-06, + "loss": 0.4107, + "step": 68778 + }, + { + "epoch": 0.8597714942873572, + "grad_norm": 5.487758636474609, + "learning_rate": 1.1742779443243157e-06, + "loss": 2.2444, + "step": 68780 + }, + { + "epoch": 0.8597964949123728, + "grad_norm": 3.2076151371002197, + "learning_rate": 1.1738676598004783e-06, + "loss": 0.4131, + "step": 68782 + }, + { + "epoch": 0.8598214955373884, + "grad_norm": 0.004651067778468132, + "learning_rate": 1.1734574424952128e-06, + "loss": 0.68, + "step": 68784 + }, + { + "epoch": 0.859846496162404, + "grad_norm": 3.9407622814178467, + "learning_rate": 1.1730472924116498e-06, + "loss": 0.7313, + "step": 68786 + }, + { + "epoch": 0.8598714967874197, + "grad_norm": 2.0557973384857178, + "learning_rate": 1.172637209552907e-06, + "loss": 0.9518, + "step": 68788 + }, + { + "epoch": 0.8598964974124353, + "grad_norm": 7.581433296203613, + "learning_rate": 1.1722271939221119e-06, + "loss": 1.4145, + "step": 68790 + }, + { + "epoch": 0.8599214980374509, + "grad_norm": 0.6473644971847534, + "learning_rate": 1.1718172455223876e-06, + "loss": 0.0315, + "step": 68792 + }, + { + "epoch": 0.8599464986624665, + "grad_norm": 2.8344428539276123, + "learning_rate": 1.1714073643568524e-06, + "loss": 0.3495, + "step": 68794 + }, + { + "epoch": 0.8599714992874822, + "grad_norm": 2.589841604232788, + "learning_rate": 1.1709975504286352e-06, + "loss": 0.4482, + "step": 68796 + }, + { + "epoch": 0.8599964999124978, + "grad_norm": 2.220264196395874, + "learning_rate": 1.170587803740848e-06, + "loss": 0.2848, + "step": 68798 + }, + { + "epoch": 0.8600215005375135, + "grad_norm": 0.17758336663246155, + "learning_rate": 1.170178124296617e-06, + "loss": 0.5625, + "step": 68800 + }, + { + "epoch": 0.860046501162529, + "grad_norm": 0.5857773423194885, + "learning_rate": 1.1697685120990587e-06, + "loss": 0.5725, + "step": 68802 + }, + { + "epoch": 0.8600715017875447, + "grad_norm": 4.4866156578063965, + "learning_rate": 1.169358967151295e-06, + "loss": 1.0028, + "step": 68804 + }, + { + "epoch": 0.8600965024125603, + "grad_norm": 2.6142797470092773, + "learning_rate": 1.1689494894564468e-06, + "loss": 1.1031, + "step": 68806 + }, + { + "epoch": 0.860121503037576, + "grad_norm": 1.9287937879562378, + "learning_rate": 1.1685400790176305e-06, + "loss": 0.0157, + "step": 68808 + }, + { + "epoch": 0.8601465036625916, + "grad_norm": 2.5512261390686035, + "learning_rate": 1.1681307358379634e-06, + "loss": 1.1247, + "step": 68810 + }, + { + "epoch": 0.8601715042876071, + "grad_norm": 3.216172933578491, + "learning_rate": 1.1677214599205623e-06, + "loss": 0.7715, + "step": 68812 + }, + { + "epoch": 0.8601965049126228, + "grad_norm": 13.723774909973145, + "learning_rate": 1.1673122512685453e-06, + "loss": 1.5601, + "step": 68814 + }, + { + "epoch": 0.8602215055376384, + "grad_norm": 2.524441719055176, + "learning_rate": 1.1669031098850314e-06, + "loss": 0.7732, + "step": 68816 + }, + { + "epoch": 0.8602465061626541, + "grad_norm": 0.0003443452005740255, + "learning_rate": 1.1664940357731313e-06, + "loss": 0.0018, + "step": 68818 + }, + { + "epoch": 0.8602715067876697, + "grad_norm": 4.275108814239502, + "learning_rate": 1.166085028935967e-06, + "loss": 0.4646, + "step": 68820 + }, + { + "epoch": 0.8602965074126853, + "grad_norm": 1.9990755319595337, + "learning_rate": 1.1656760893766483e-06, + "loss": 0.9137, + "step": 68822 + }, + { + "epoch": 0.8603215080377009, + "grad_norm": 6.2738752365112305, + "learning_rate": 1.1652672170982925e-06, + "loss": 1.9877, + "step": 68824 + }, + { + "epoch": 0.8603465086627166, + "grad_norm": 2.318075180053711, + "learning_rate": 1.1648584121040097e-06, + "loss": 0.6027, + "step": 68826 + }, + { + "epoch": 0.8603715092877322, + "grad_norm": 1.7465804815292358, + "learning_rate": 1.1644496743969158e-06, + "loss": 0.2349, + "step": 68828 + }, + { + "epoch": 0.8603965099127479, + "grad_norm": 4.172154426574707, + "learning_rate": 1.1640410039801264e-06, + "loss": 0.7754, + "step": 68830 + }, + { + "epoch": 0.8604215105377634, + "grad_norm": 6.086153984069824, + "learning_rate": 1.163632400856748e-06, + "loss": 1.8436, + "step": 68832 + }, + { + "epoch": 0.860446511162779, + "grad_norm": 2.391731023788452, + "learning_rate": 1.1632238650299011e-06, + "loss": 1.4204, + "step": 68834 + }, + { + "epoch": 0.8604715117877947, + "grad_norm": 3.704740285873413, + "learning_rate": 1.1628153965026868e-06, + "loss": 0.5804, + "step": 68836 + }, + { + "epoch": 0.8604965124128103, + "grad_norm": 7.989070892333984, + "learning_rate": 1.1624069952782202e-06, + "loss": 1.7281, + "step": 68838 + }, + { + "epoch": 0.860521513037826, + "grad_norm": 0.70335453748703, + "learning_rate": 1.1619986613596146e-06, + "loss": 0.5708, + "step": 68840 + }, + { + "epoch": 0.8605465136628415, + "grad_norm": 3.0832629203796387, + "learning_rate": 1.1615903947499762e-06, + "loss": 0.3803, + "step": 68842 + }, + { + "epoch": 0.8605715142878572, + "grad_norm": 2.5684123039245605, + "learning_rate": 1.161182195452416e-06, + "loss": 0.0531, + "step": 68844 + }, + { + "epoch": 0.8605965149128728, + "grad_norm": 0.0004923278465867043, + "learning_rate": 1.1607740634700415e-06, + "loss": 0.2589, + "step": 68846 + }, + { + "epoch": 0.8606215155378885, + "grad_norm": 4.453373432159424, + "learning_rate": 1.1603659988059657e-06, + "loss": 1.9266, + "step": 68848 + }, + { + "epoch": 0.8606465161629041, + "grad_norm": 0.1514563113451004, + "learning_rate": 1.1599580014632884e-06, + "loss": 0.0646, + "step": 68850 + }, + { + "epoch": 0.8606715167879196, + "grad_norm": 2.1171891689300537, + "learning_rate": 1.1595500714451202e-06, + "loss": 1.1475, + "step": 68852 + }, + { + "epoch": 0.8606965174129353, + "grad_norm": 5.4398112297058105, + "learning_rate": 1.15914220875457e-06, + "loss": 0.5425, + "step": 68854 + }, + { + "epoch": 0.8607215180379509, + "grad_norm": 3.1256604194641113, + "learning_rate": 1.158734413394742e-06, + "loss": 0.6431, + "step": 68856 + }, + { + "epoch": 0.8607465186629666, + "grad_norm": 2.512915849685669, + "learning_rate": 1.1583266853687436e-06, + "loss": 0.4014, + "step": 68858 + }, + { + "epoch": 0.8607715192879822, + "grad_norm": 3.9683797359466553, + "learning_rate": 1.157919024679679e-06, + "loss": 0.8879, + "step": 68860 + }, + { + "epoch": 0.8607965199129978, + "grad_norm": 0.5678799748420715, + "learning_rate": 1.1575114313306535e-06, + "loss": 0.5158, + "step": 68862 + }, + { + "epoch": 0.8608215205380134, + "grad_norm": 3.1819071769714355, + "learning_rate": 1.1571039053247668e-06, + "loss": 0.6273, + "step": 68864 + }, + { + "epoch": 0.8608465211630291, + "grad_norm": 7.4234299659729, + "learning_rate": 1.1566964466651276e-06, + "loss": 1.5963, + "step": 68866 + }, + { + "epoch": 0.8608715217880447, + "grad_norm": 5.897549629211426, + "learning_rate": 1.1562890553548389e-06, + "loss": 0.8118, + "step": 68868 + }, + { + "epoch": 0.8608965224130604, + "grad_norm": 3.5154387950897217, + "learning_rate": 1.1558817313970005e-06, + "loss": 0.6368, + "step": 68870 + }, + { + "epoch": 0.8609215230380759, + "grad_norm": 0.15491856634616852, + "learning_rate": 1.1554744747947189e-06, + "loss": 0.3882, + "step": 68872 + }, + { + "epoch": 0.8609465236630915, + "grad_norm": 3.985743284225464, + "learning_rate": 1.1550672855510925e-06, + "loss": 1.3167, + "step": 68874 + }, + { + "epoch": 0.8609715242881072, + "grad_norm": 2.9126334190368652, + "learning_rate": 1.1546601636692201e-06, + "loss": 0.7522, + "step": 68876 + }, + { + "epoch": 0.8609965249131228, + "grad_norm": 1.9613094329833984, + "learning_rate": 1.154253109152208e-06, + "loss": 0.2164, + "step": 68878 + }, + { + "epoch": 0.8610215255381385, + "grad_norm": 3.440765380859375, + "learning_rate": 1.1538461220031504e-06, + "loss": 0.918, + "step": 68880 + }, + { + "epoch": 0.861046526163154, + "grad_norm": 0.45035362243652344, + "learning_rate": 1.1534392022251527e-06, + "loss": 0.6646, + "step": 68882 + }, + { + "epoch": 0.8610715267881697, + "grad_norm": 4.825018405914307, + "learning_rate": 1.153032349821308e-06, + "loss": 1.4349, + "step": 68884 + }, + { + "epoch": 0.8610965274131853, + "grad_norm": 3.123997449874878, + "learning_rate": 1.1526255647947227e-06, + "loss": 1.0709, + "step": 68886 + }, + { + "epoch": 0.861121528038201, + "grad_norm": 2.874901056289673, + "learning_rate": 1.1522188471484864e-06, + "loss": 0.9599, + "step": 68888 + }, + { + "epoch": 0.8611465286632166, + "grad_norm": 1.7028454542160034, + "learning_rate": 1.1518121968856989e-06, + "loss": 0.1133, + "step": 68890 + }, + { + "epoch": 0.8611715292882322, + "grad_norm": 5.234585762023926, + "learning_rate": 1.1514056140094609e-06, + "loss": 2.4835, + "step": 68892 + }, + { + "epoch": 0.8611965299132478, + "grad_norm": 7.92177152633667, + "learning_rate": 1.1509990985228647e-06, + "loss": 0.9261, + "step": 68894 + }, + { + "epoch": 0.8612215305382634, + "grad_norm": 0.0013239751569926739, + "learning_rate": 1.15059265042901e-06, + "loss": 0.4411, + "step": 68896 + }, + { + "epoch": 0.8612465311632791, + "grad_norm": 0.05942416191101074, + "learning_rate": 1.1501862697309885e-06, + "loss": 0.0006, + "step": 68898 + }, + { + "epoch": 0.8612715317882947, + "grad_norm": 1.3684016466140747, + "learning_rate": 1.149779956431898e-06, + "loss": 0.5263, + "step": 68900 + }, + { + "epoch": 0.8612965324133103, + "grad_norm": 3.2820353507995605, + "learning_rate": 1.1493737105348324e-06, + "loss": 1.1466, + "step": 68902 + }, + { + "epoch": 0.8613215330383259, + "grad_norm": 2.5299718379974365, + "learning_rate": 1.1489675320428839e-06, + "loss": 0.8827, + "step": 68904 + }, + { + "epoch": 0.8613465336633416, + "grad_norm": 6.2200140953063965, + "learning_rate": 1.1485614209591478e-06, + "loss": 1.039, + "step": 68906 + }, + { + "epoch": 0.8613715342883572, + "grad_norm": 3.3049018383026123, + "learning_rate": 1.1481553772867148e-06, + "loss": 1.2427, + "step": 68908 + }, + { + "epoch": 0.8613965349133729, + "grad_norm": 6.047571659088135, + "learning_rate": 1.1477494010286805e-06, + "loss": 0.3328, + "step": 68910 + }, + { + "epoch": 0.8614215355383884, + "grad_norm": 4.292600631713867, + "learning_rate": 1.1473434921881343e-06, + "loss": 1.6349, + "step": 68912 + }, + { + "epoch": 0.861446536163404, + "grad_norm": 2.0487544536590576, + "learning_rate": 1.1469376507681674e-06, + "loss": 0.9733, + "step": 68914 + }, + { + "epoch": 0.8614715367884197, + "grad_norm": 2.068366527557373, + "learning_rate": 1.1465318767718725e-06, + "loss": 0.3105, + "step": 68916 + }, + { + "epoch": 0.8614965374134353, + "grad_norm": 5.203795433044434, + "learning_rate": 1.1461261702023362e-06, + "loss": 2.5032, + "step": 68918 + }, + { + "epoch": 0.861521538038451, + "grad_norm": 6.057796955108643, + "learning_rate": 1.1457205310626541e-06, + "loss": 1.2948, + "step": 68920 + }, + { + "epoch": 0.8615465386634665, + "grad_norm": 0.1565566211938858, + "learning_rate": 1.145314959355911e-06, + "loss": 0.2469, + "step": 68922 + }, + { + "epoch": 0.8615715392884822, + "grad_norm": 2.5371181964874268, + "learning_rate": 1.144909455085198e-06, + "loss": 0.8241, + "step": 68924 + }, + { + "epoch": 0.8615965399134978, + "grad_norm": 0.00037918868474662304, + "learning_rate": 1.1445040182536016e-06, + "loss": 0.003, + "step": 68926 + }, + { + "epoch": 0.8616215405385135, + "grad_norm": 4.134670257568359, + "learning_rate": 1.1440986488642092e-06, + "loss": 0.7242, + "step": 68928 + }, + { + "epoch": 0.8616465411635291, + "grad_norm": 5.292377948760986, + "learning_rate": 1.1436933469201117e-06, + "loss": 0.9207, + "step": 68930 + }, + { + "epoch": 0.8616715417885447, + "grad_norm": 3.263545274734497, + "learning_rate": 1.14328811242439e-06, + "loss": 0.9116, + "step": 68932 + }, + { + "epoch": 0.8616965424135603, + "grad_norm": 1.6781362295150757, + "learning_rate": 1.1428829453801371e-06, + "loss": 0.7666, + "step": 68934 + }, + { + "epoch": 0.861721543038576, + "grad_norm": 4.240833282470703, + "learning_rate": 1.1424778457904317e-06, + "loss": 0.5265, + "step": 68936 + }, + { + "epoch": 0.8617465436635916, + "grad_norm": 4.954859256744385, + "learning_rate": 1.1420728136583659e-06, + "loss": 1.0258, + "step": 68938 + }, + { + "epoch": 0.8617715442886072, + "grad_norm": 4.896688938140869, + "learning_rate": 1.1416678489870192e-06, + "loss": 0.5295, + "step": 68940 + }, + { + "epoch": 0.8617965449136228, + "grad_norm": 4.311563968658447, + "learning_rate": 1.1412629517794771e-06, + "loss": 0.2064, + "step": 68942 + }, + { + "epoch": 0.8618215455386384, + "grad_norm": 3.5568974018096924, + "learning_rate": 1.140858122038825e-06, + "loss": 1.087, + "step": 68944 + }, + { + "epoch": 0.8618465461636541, + "grad_norm": 3.8666961193084717, + "learning_rate": 1.1404533597681422e-06, + "loss": 0.425, + "step": 68946 + }, + { + "epoch": 0.8618715467886697, + "grad_norm": 13.51473617553711, + "learning_rate": 1.1400486649705166e-06, + "loss": 0.8025, + "step": 68948 + }, + { + "epoch": 0.8618965474136854, + "grad_norm": 0.0004996521165594459, + "learning_rate": 1.1396440376490248e-06, + "loss": 0.6248, + "step": 68950 + }, + { + "epoch": 0.8619215480387009, + "grad_norm": 3.1752874851226807, + "learning_rate": 1.139239477806753e-06, + "loss": 1.189, + "step": 68952 + }, + { + "epoch": 0.8619465486637166, + "grad_norm": 1.8644495010375977, + "learning_rate": 1.1388349854467806e-06, + "loss": 0.8908, + "step": 68954 + }, + { + "epoch": 0.8619715492887322, + "grad_norm": 0.06059227138757706, + "learning_rate": 1.1384305605721857e-06, + "loss": 0.2828, + "step": 68956 + }, + { + "epoch": 0.8619965499137479, + "grad_norm": 4.288197994232178, + "learning_rate": 1.1380262031860523e-06, + "loss": 1.114, + "step": 68958 + }, + { + "epoch": 0.8620215505387635, + "grad_norm": 1.8206584453582764, + "learning_rate": 1.1376219132914567e-06, + "loss": 0.8554, + "step": 68960 + }, + { + "epoch": 0.862046551163779, + "grad_norm": 1.3865286111831665, + "learning_rate": 1.1372176908914811e-06, + "loss": 0.0558, + "step": 68962 + }, + { + "epoch": 0.8620715517887947, + "grad_norm": 1.0087664127349854, + "learning_rate": 1.1368135359892008e-06, + "loss": 0.2025, + "step": 68964 + }, + { + "epoch": 0.8620965524138103, + "grad_norm": 3.353761911392212, + "learning_rate": 1.1364094485876965e-06, + "loss": 1.4593, + "step": 68966 + }, + { + "epoch": 0.862121553038826, + "grad_norm": 3.0555460453033447, + "learning_rate": 1.1360054286900445e-06, + "loss": 0.1844, + "step": 68968 + }, + { + "epoch": 0.8621465536638416, + "grad_norm": 4.350818634033203, + "learning_rate": 1.1356014762993195e-06, + "loss": 0.6771, + "step": 68970 + }, + { + "epoch": 0.8621715542888572, + "grad_norm": 0.09174980968236923, + "learning_rate": 1.135197591418603e-06, + "loss": 0.0165, + "step": 68972 + }, + { + "epoch": 0.8621965549138728, + "grad_norm": 3.79172945022583, + "learning_rate": 1.1347937740509651e-06, + "loss": 1.4714, + "step": 68974 + }, + { + "epoch": 0.8622215555388885, + "grad_norm": 2.195509195327759, + "learning_rate": 1.1343900241994865e-06, + "loss": 0.2496, + "step": 68976 + }, + { + "epoch": 0.8622465561639041, + "grad_norm": 2.3441338539123535, + "learning_rate": 1.1339863418672404e-06, + "loss": 0.2934, + "step": 68978 + }, + { + "epoch": 0.8622715567889198, + "grad_norm": 3.8547394275665283, + "learning_rate": 1.1335827270572975e-06, + "loss": 1.4707, + "step": 68980 + }, + { + "epoch": 0.8622965574139353, + "grad_norm": 0.5752614736557007, + "learning_rate": 1.1331791797727375e-06, + "loss": 0.6196, + "step": 68982 + }, + { + "epoch": 0.8623215580389509, + "grad_norm": 2.407890558242798, + "learning_rate": 1.1327757000166284e-06, + "loss": 1.9356, + "step": 68984 + }, + { + "epoch": 0.8623465586639666, + "grad_norm": 4.732546806335449, + "learning_rate": 1.1323722877920473e-06, + "loss": 1.5642, + "step": 68986 + }, + { + "epoch": 0.8623715592889822, + "grad_norm": 2.515697479248047, + "learning_rate": 1.131968943102064e-06, + "loss": 0.5861, + "step": 68988 + }, + { + "epoch": 0.8623965599139979, + "grad_norm": 0.11972635984420776, + "learning_rate": 1.1315656659497532e-06, + "loss": 0.8331, + "step": 68990 + }, + { + "epoch": 0.8624215605390134, + "grad_norm": 4.360021591186523, + "learning_rate": 1.1311624563381828e-06, + "loss": 1.2021, + "step": 68992 + }, + { + "epoch": 0.8624465611640291, + "grad_norm": 0.9276880621910095, + "learning_rate": 1.1307593142704242e-06, + "loss": 0.4018, + "step": 68994 + }, + { + "epoch": 0.8624715617890447, + "grad_norm": 3.501539468765259, + "learning_rate": 1.1303562397495494e-06, + "loss": 1.4591, + "step": 68996 + }, + { + "epoch": 0.8624965624140604, + "grad_norm": 1.0687806606292725, + "learning_rate": 1.1299532327786267e-06, + "loss": 0.0665, + "step": 68998 + }, + { + "epoch": 0.862521563039076, + "grad_norm": 0.11496620625257492, + "learning_rate": 1.129550293360726e-06, + "loss": 0.1335, + "step": 69000 + }, + { + "epoch": 0.8625465636640915, + "grad_norm": 4.5865960121154785, + "learning_rate": 1.1291474214989152e-06, + "loss": 0.2504, + "step": 69002 + }, + { + "epoch": 0.8625715642891072, + "grad_norm": 0.0005079868133179843, + "learning_rate": 1.1287446171962646e-06, + "loss": 1.1975, + "step": 69004 + }, + { + "epoch": 0.8625965649141228, + "grad_norm": 4.029027462005615, + "learning_rate": 1.128341880455841e-06, + "loss": 1.186, + "step": 69006 + }, + { + "epoch": 0.8626215655391385, + "grad_norm": 4.043574333190918, + "learning_rate": 1.1279392112807087e-06, + "loss": 0.6949, + "step": 69008 + }, + { + "epoch": 0.8626465661641541, + "grad_norm": 3.2347803115844727, + "learning_rate": 1.1275366096739394e-06, + "loss": 1.2005, + "step": 69010 + }, + { + "epoch": 0.8626715667891697, + "grad_norm": 1.15037202835083, + "learning_rate": 1.127134075638594e-06, + "loss": 0.5309, + "step": 69012 + }, + { + "epoch": 0.8626965674141853, + "grad_norm": 0.0003490836243145168, + "learning_rate": 1.1267316091777424e-06, + "loss": 0.0, + "step": 69014 + }, + { + "epoch": 0.862721568039201, + "grad_norm": 4.160574436187744, + "learning_rate": 1.1263292102944466e-06, + "loss": 0.2375, + "step": 69016 + }, + { + "epoch": 0.8627465686642166, + "grad_norm": 0.0006209835992194712, + "learning_rate": 1.1259268789917743e-06, + "loss": 0.3869, + "step": 69018 + }, + { + "epoch": 0.8627715692892323, + "grad_norm": 3.560640811920166, + "learning_rate": 1.1255246152727883e-06, + "loss": 0.7461, + "step": 69020 + }, + { + "epoch": 0.8627965699142478, + "grad_norm": 3.8059487342834473, + "learning_rate": 1.1251224191405496e-06, + "loss": 0.1775, + "step": 69022 + }, + { + "epoch": 0.8628215705392634, + "grad_norm": 0.0002996433759108186, + "learning_rate": 1.1247202905981259e-06, + "loss": 0.4131, + "step": 69024 + }, + { + "epoch": 0.8628465711642791, + "grad_norm": 0.006828242912888527, + "learning_rate": 1.1243182296485756e-06, + "loss": 0.3108, + "step": 69026 + }, + { + "epoch": 0.8628715717892947, + "grad_norm": 0.0004975774209015071, + "learning_rate": 1.123916236294964e-06, + "loss": 0.0005, + "step": 69028 + }, + { + "epoch": 0.8628965724143104, + "grad_norm": 9.19838809967041, + "learning_rate": 1.1235143105403513e-06, + "loss": 1.4241, + "step": 69030 + }, + { + "epoch": 0.8629215730393259, + "grad_norm": 3.9045135974884033, + "learning_rate": 1.1231124523877956e-06, + "loss": 0.676, + "step": 69032 + }, + { + "epoch": 0.8629465736643416, + "grad_norm": 3.197521686553955, + "learning_rate": 1.1227106618403627e-06, + "loss": 1.2282, + "step": 69034 + }, + { + "epoch": 0.8629715742893572, + "grad_norm": 0.46227505803108215, + "learning_rate": 1.1223089389011089e-06, + "loss": 0.6002, + "step": 69036 + }, + { + "epoch": 0.8629965749143729, + "grad_norm": 10.833161354064941, + "learning_rate": 1.121907283573096e-06, + "loss": 0.8905, + "step": 69038 + }, + { + "epoch": 0.8630215755393885, + "grad_norm": 2.9530282020568848, + "learning_rate": 1.1215056958593795e-06, + "loss": 0.9117, + "step": 69040 + }, + { + "epoch": 0.8630465761644041, + "grad_norm": 6.065873146057129, + "learning_rate": 1.1211041757630225e-06, + "loss": 1.2051, + "step": 69042 + }, + { + "epoch": 0.8630715767894197, + "grad_norm": 3.365851402282715, + "learning_rate": 1.1207027232870804e-06, + "loss": 1.8893, + "step": 69044 + }, + { + "epoch": 0.8630965774144354, + "grad_norm": 2.813150644302368, + "learning_rate": 1.1203013384346095e-06, + "loss": 0.9975, + "step": 69046 + }, + { + "epoch": 0.863121578039451, + "grad_norm": 4.385968208312988, + "learning_rate": 1.1199000212086685e-06, + "loss": 0.9911, + "step": 69048 + }, + { + "epoch": 0.8631465786644666, + "grad_norm": 3.028801202774048, + "learning_rate": 1.1194987716123118e-06, + "loss": 0.3601, + "step": 69050 + }, + { + "epoch": 0.8631715792894822, + "grad_norm": 2.098388910293579, + "learning_rate": 1.119097589648599e-06, + "loss": 0.343, + "step": 69052 + }, + { + "epoch": 0.8631965799144978, + "grad_norm": 3.875922203063965, + "learning_rate": 1.1186964753205808e-06, + "loss": 1.2445, + "step": 69054 + }, + { + "epoch": 0.8632215805395135, + "grad_norm": 3.0336546897888184, + "learning_rate": 1.1182954286313164e-06, + "loss": 1.6332, + "step": 69056 + }, + { + "epoch": 0.8632465811645291, + "grad_norm": 11.373920440673828, + "learning_rate": 1.1178944495838573e-06, + "loss": 1.6627, + "step": 69058 + }, + { + "epoch": 0.8632715817895448, + "grad_norm": 3.18320894241333, + "learning_rate": 1.1174935381812567e-06, + "loss": 0.4598, + "step": 69060 + }, + { + "epoch": 0.8632965824145603, + "grad_norm": 7.240933895111084, + "learning_rate": 1.1170926944265713e-06, + "loss": 1.099, + "step": 69062 + }, + { + "epoch": 0.863321583039576, + "grad_norm": 2.4874391555786133, + "learning_rate": 1.1166919183228497e-06, + "loss": 0.4034, + "step": 69064 + }, + { + "epoch": 0.8633465836645916, + "grad_norm": 6.658421039581299, + "learning_rate": 1.1162912098731482e-06, + "loss": 1.5335, + "step": 69066 + }, + { + "epoch": 0.8633715842896073, + "grad_norm": 0.0005503013962879777, + "learning_rate": 1.1158905690805145e-06, + "loss": 0.0238, + "step": 69068 + }, + { + "epoch": 0.8633965849146229, + "grad_norm": 6.922439098358154, + "learning_rate": 1.1154899959480048e-06, + "loss": 0.8861, + "step": 69070 + }, + { + "epoch": 0.8634215855396384, + "grad_norm": 0.00034317790414206684, + "learning_rate": 1.1150894904786659e-06, + "loss": 0.1053, + "step": 69072 + }, + { + "epoch": 0.8634465861646541, + "grad_norm": 5.602084636688232, + "learning_rate": 1.114689052675547e-06, + "loss": 1.0208, + "step": 69074 + }, + { + "epoch": 0.8634715867896697, + "grad_norm": 0.003976218402385712, + "learning_rate": 1.114288682541702e-06, + "loss": 0.3837, + "step": 69076 + }, + { + "epoch": 0.8634965874146854, + "grad_norm": 2.107421875, + "learning_rate": 1.113888380080176e-06, + "loss": 1.1429, + "step": 69078 + }, + { + "epoch": 0.863521588039701, + "grad_norm": 3.930020332336426, + "learning_rate": 1.1134881452940205e-06, + "loss": 1.0668, + "step": 69080 + }, + { + "epoch": 0.8635465886647166, + "grad_norm": 1.2288870811462402, + "learning_rate": 1.1130879781862835e-06, + "loss": 0.1415, + "step": 69082 + }, + { + "epoch": 0.8635715892897322, + "grad_norm": 46.428260803222656, + "learning_rate": 1.1126878787600103e-06, + "loss": 2.652, + "step": 69084 + }, + { + "epoch": 0.8635965899147479, + "grad_norm": 1.344484567642212, + "learning_rate": 1.1122878470182508e-06, + "loss": 0.2464, + "step": 69086 + }, + { + "epoch": 0.8636215905397635, + "grad_norm": 3.5619866847991943, + "learning_rate": 1.1118878829640478e-06, + "loss": 1.193, + "step": 69088 + }, + { + "epoch": 0.8636465911647792, + "grad_norm": 1.8694862127304077, + "learning_rate": 1.1114879866004525e-06, + "loss": 0.872, + "step": 69090 + }, + { + "epoch": 0.8636715917897947, + "grad_norm": 2.907097578048706, + "learning_rate": 1.1110881579305055e-06, + "loss": 0.9352, + "step": 69092 + }, + { + "epoch": 0.8636965924148103, + "grad_norm": 5.656069278717041, + "learning_rate": 1.1106883969572536e-06, + "loss": 2.2882, + "step": 69094 + }, + { + "epoch": 0.863721593039826, + "grad_norm": 3.4268555641174316, + "learning_rate": 1.1102887036837461e-06, + "loss": 0.2475, + "step": 69096 + }, + { + "epoch": 0.8637465936648416, + "grad_norm": 9.169282913208008, + "learning_rate": 1.109889078113019e-06, + "loss": 2.1773, + "step": 69098 + }, + { + "epoch": 0.8637715942898573, + "grad_norm": 3.461351156234741, + "learning_rate": 1.1094895202481216e-06, + "loss": 0.7096, + "step": 69100 + }, + { + "epoch": 0.8637965949148728, + "grad_norm": 7.278377056121826, + "learning_rate": 1.1090900300920925e-06, + "loss": 0.75, + "step": 69102 + }, + { + "epoch": 0.8638215955398885, + "grad_norm": 3.59251070022583, + "learning_rate": 1.1086906076479787e-06, + "loss": 1.5722, + "step": 69104 + }, + { + "epoch": 0.8638465961649041, + "grad_norm": 2.915910243988037, + "learning_rate": 1.1082912529188172e-06, + "loss": 1.4604, + "step": 69106 + }, + { + "epoch": 0.8638715967899198, + "grad_norm": 1.6642919778823853, + "learning_rate": 1.1078919659076549e-06, + "loss": 0.2726, + "step": 69108 + }, + { + "epoch": 0.8638965974149354, + "grad_norm": 3.598898410797119, + "learning_rate": 1.1074927466175289e-06, + "loss": 1.6843, + "step": 69110 + }, + { + "epoch": 0.8639215980399509, + "grad_norm": 2.324798345565796, + "learning_rate": 1.1070935950514794e-06, + "loss": 0.9612, + "step": 69112 + }, + { + "epoch": 0.8639465986649666, + "grad_norm": 3.3598098754882812, + "learning_rate": 1.1066945112125494e-06, + "loss": 1.7364, + "step": 69114 + }, + { + "epoch": 0.8639715992899822, + "grad_norm": 3.518165349960327, + "learning_rate": 1.1062954951037753e-06, + "loss": 0.6052, + "step": 69116 + }, + { + "epoch": 0.8639965999149979, + "grad_norm": 0.0013360934099182487, + "learning_rate": 1.1058965467281979e-06, + "loss": 0.0031, + "step": 69118 + }, + { + "epoch": 0.8640216005400135, + "grad_norm": 0.6044525504112244, + "learning_rate": 1.1054976660888527e-06, + "loss": 0.4566, + "step": 69120 + }, + { + "epoch": 0.8640466011650291, + "grad_norm": 0.0004716848488897085, + "learning_rate": 1.1050988531887818e-06, + "loss": 0.4894, + "step": 69122 + }, + { + "epoch": 0.8640716017900447, + "grad_norm": 0.00046605238458141685, + "learning_rate": 1.1047001080310194e-06, + "loss": 0.5603, + "step": 69124 + }, + { + "epoch": 0.8640966024150604, + "grad_norm": 4.687991619110107, + "learning_rate": 1.1043014306186018e-06, + "loss": 0.5977, + "step": 69126 + }, + { + "epoch": 0.864121603040076, + "grad_norm": 3.5663983821868896, + "learning_rate": 1.1039028209545689e-06, + "loss": 1.1222, + "step": 69128 + }, + { + "epoch": 0.8641466036650917, + "grad_norm": 3.5992074012756348, + "learning_rate": 1.1035042790419515e-06, + "loss": 1.6411, + "step": 69130 + }, + { + "epoch": 0.8641716042901072, + "grad_norm": 2.5672221183776855, + "learning_rate": 1.1031058048837874e-06, + "loss": 0.9396, + "step": 69132 + }, + { + "epoch": 0.8641966049151228, + "grad_norm": 7.09141206741333, + "learning_rate": 1.102707398483116e-06, + "loss": 1.0436, + "step": 69134 + }, + { + "epoch": 0.8642216055401385, + "grad_norm": 2.4790632724761963, + "learning_rate": 1.102309059842962e-06, + "loss": 0.4523, + "step": 69136 + }, + { + "epoch": 0.8642466061651541, + "grad_norm": 4.567460536956787, + "learning_rate": 1.101910788966366e-06, + "loss": 0.8344, + "step": 69138 + }, + { + "epoch": 0.8642716067901698, + "grad_norm": 0.3606843650341034, + "learning_rate": 1.101512585856358e-06, + "loss": 0.0108, + "step": 69140 + }, + { + "epoch": 0.8642966074151853, + "grad_norm": 13.225763320922852, + "learning_rate": 1.1011144505159731e-06, + "loss": 0.6317, + "step": 69142 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 5.715422630310059, + "learning_rate": 1.1007163829482403e-06, + "loss": 0.4365, + "step": 69144 + }, + { + "epoch": 0.8643466086652166, + "grad_norm": 0.5969904661178589, + "learning_rate": 1.1003183831561937e-06, + "loss": 0.1155, + "step": 69146 + }, + { + "epoch": 0.8643716092902323, + "grad_norm": 2.6816160678863525, + "learning_rate": 1.0999204511428663e-06, + "loss": 0.8067, + "step": 69148 + }, + { + "epoch": 0.8643966099152479, + "grad_norm": 2.7609188556671143, + "learning_rate": 1.0995225869112836e-06, + "loss": 0.5583, + "step": 69150 + }, + { + "epoch": 0.8644216105402635, + "grad_norm": 4.030146598815918, + "learning_rate": 1.0991247904644798e-06, + "loss": 0.9901, + "step": 69152 + }, + { + "epoch": 0.8644466111652791, + "grad_norm": 2.6454174518585205, + "learning_rate": 1.0987270618054802e-06, + "loss": 0.5119, + "step": 69154 + }, + { + "epoch": 0.8644716117902947, + "grad_norm": 7.205633640289307, + "learning_rate": 1.098329400937317e-06, + "loss": 0.7832, + "step": 69156 + }, + { + "epoch": 0.8644966124153104, + "grad_norm": 4.10421895980835, + "learning_rate": 1.0979318078630209e-06, + "loss": 0.4685, + "step": 69158 + }, + { + "epoch": 0.864521613040326, + "grad_norm": 1.2592756748199463, + "learning_rate": 1.097534282585616e-06, + "loss": 0.5773, + "step": 69160 + }, + { + "epoch": 0.8645466136653416, + "grad_norm": 2.3493707180023193, + "learning_rate": 1.0971368251081315e-06, + "loss": 0.3477, + "step": 69162 + }, + { + "epoch": 0.8645716142903572, + "grad_norm": 2.5712990760803223, + "learning_rate": 1.0967394354335926e-06, + "loss": 0.5809, + "step": 69164 + }, + { + "epoch": 0.8645966149153729, + "grad_norm": 3.7673556804656982, + "learning_rate": 1.0963421135650288e-06, + "loss": 0.8852, + "step": 69166 + }, + { + "epoch": 0.8646216155403885, + "grad_norm": 3.4345059394836426, + "learning_rate": 1.0959448595054622e-06, + "loss": 0.6298, + "step": 69168 + }, + { + "epoch": 0.8646466161654042, + "grad_norm": 4.801084518432617, + "learning_rate": 1.0955476732579206e-06, + "loss": 1.72, + "step": 69170 + }, + { + "epoch": 0.8646716167904197, + "grad_norm": 2.927006721496582, + "learning_rate": 1.0951505548254303e-06, + "loss": 0.6997, + "step": 69172 + }, + { + "epoch": 0.8646966174154354, + "grad_norm": 5.787810325622559, + "learning_rate": 1.0947535042110147e-06, + "loss": 1.7249, + "step": 69174 + }, + { + "epoch": 0.864721618040451, + "grad_norm": 0.0007833588751964271, + "learning_rate": 1.0943565214176976e-06, + "loss": 0.0, + "step": 69176 + }, + { + "epoch": 0.8647466186654666, + "grad_norm": 1.2777047157287598, + "learning_rate": 1.0939596064484991e-06, + "loss": 0.3071, + "step": 69178 + }, + { + "epoch": 0.8647716192904823, + "grad_norm": 3.2744572162628174, + "learning_rate": 1.093562759306447e-06, + "loss": 0.4951, + "step": 69180 + }, + { + "epoch": 0.8647966199154978, + "grad_norm": 4.779860496520996, + "learning_rate": 1.0931659799945593e-06, + "loss": 0.7307, + "step": 69182 + }, + { + "epoch": 0.8648216205405135, + "grad_norm": 0.0005770285497419536, + "learning_rate": 1.0927692685158596e-06, + "loss": 0.3717, + "step": 69184 + }, + { + "epoch": 0.8648466211655291, + "grad_norm": 8.04089641571045, + "learning_rate": 1.0923726248733735e-06, + "loss": 2.3813, + "step": 69186 + }, + { + "epoch": 0.8648716217905448, + "grad_norm": 0.39232107996940613, + "learning_rate": 1.0919760490701148e-06, + "loss": 0.8568, + "step": 69188 + }, + { + "epoch": 0.8648966224155604, + "grad_norm": 1.2811260223388672, + "learning_rate": 1.0915795411091079e-06, + "loss": 0.0171, + "step": 69190 + }, + { + "epoch": 0.864921623040576, + "grad_norm": 2.168929100036621, + "learning_rate": 1.0911831009933705e-06, + "loss": 0.6907, + "step": 69192 + }, + { + "epoch": 0.8649466236655916, + "grad_norm": 2.3995301723480225, + "learning_rate": 1.0907867287259211e-06, + "loss": 0.6362, + "step": 69194 + }, + { + "epoch": 0.8649716242906073, + "grad_norm": 0.0003219111531507224, + "learning_rate": 1.090390424309784e-06, + "loss": 0.0, + "step": 69196 + }, + { + "epoch": 0.8649966249156229, + "grad_norm": 3.308501958847046, + "learning_rate": 1.0899941877479702e-06, + "loss": 1.0573, + "step": 69198 + }, + { + "epoch": 0.8650216255406386, + "grad_norm": 2.070500373840332, + "learning_rate": 1.0895980190435051e-06, + "loss": 0.3549, + "step": 69200 + }, + { + "epoch": 0.8650466261656541, + "grad_norm": 0.003455254714936018, + "learning_rate": 1.0892019181993973e-06, + "loss": 0.0001, + "step": 69202 + }, + { + "epoch": 0.8650716267906697, + "grad_norm": 3.2966866493225098, + "learning_rate": 1.08880588521867e-06, + "loss": 0.8969, + "step": 69204 + }, + { + "epoch": 0.8650966274156854, + "grad_norm": 4.948931694030762, + "learning_rate": 1.0884099201043341e-06, + "loss": 1.0884, + "step": 69206 + }, + { + "epoch": 0.865121628040701, + "grad_norm": 3.504329204559326, + "learning_rate": 1.0880140228594084e-06, + "loss": 1.3838, + "step": 69208 + }, + { + "epoch": 0.8651466286657167, + "grad_norm": 3.214099884033203, + "learning_rate": 1.0876181934869101e-06, + "loss": 0.5756, + "step": 69210 + }, + { + "epoch": 0.8651716292907322, + "grad_norm": 8.435783386230469, + "learning_rate": 1.0872224319898494e-06, + "loss": 1.5486, + "step": 69212 + }, + { + "epoch": 0.8651966299157479, + "grad_norm": 0.00028676819056272507, + "learning_rate": 1.086826738371245e-06, + "loss": 0.3402, + "step": 69214 + }, + { + "epoch": 0.8652216305407635, + "grad_norm": 1.4091356992721558, + "learning_rate": 1.0864311126341055e-06, + "loss": 0.1347, + "step": 69216 + }, + { + "epoch": 0.8652466311657792, + "grad_norm": 1.773464560508728, + "learning_rate": 1.086035554781445e-06, + "loss": 0.7498, + "step": 69218 + }, + { + "epoch": 0.8652716317907948, + "grad_norm": 3.1131861209869385, + "learning_rate": 1.085640064816279e-06, + "loss": 0.9887, + "step": 69220 + }, + { + "epoch": 0.8652966324158103, + "grad_norm": 3.353685140609741, + "learning_rate": 1.0852446427416163e-06, + "loss": 1.4906, + "step": 69222 + }, + { + "epoch": 0.865321633040826, + "grad_norm": 0.020476629957556725, + "learning_rate": 1.0848492885604712e-06, + "loss": 0.9752, + "step": 69224 + }, + { + "epoch": 0.8653466336658416, + "grad_norm": 4.590409278869629, + "learning_rate": 1.0844540022758542e-06, + "loss": 1.1533, + "step": 69226 + }, + { + "epoch": 0.8653716342908573, + "grad_norm": 2.5190701484680176, + "learning_rate": 1.0840587838907735e-06, + "loss": 0.6127, + "step": 69228 + }, + { + "epoch": 0.8653966349158729, + "grad_norm": 2.6473190784454346, + "learning_rate": 1.0836636334082385e-06, + "loss": 0.7949, + "step": 69230 + }, + { + "epoch": 0.8654216355408885, + "grad_norm": 3.928687810897827, + "learning_rate": 1.0832685508312613e-06, + "loss": 2.0404, + "step": 69232 + }, + { + "epoch": 0.8654466361659041, + "grad_norm": 1.7453478574752808, + "learning_rate": 1.0828735361628505e-06, + "loss": 0.1492, + "step": 69234 + }, + { + "epoch": 0.8654716367909198, + "grad_norm": 4.774459362030029, + "learning_rate": 1.0824785894060131e-06, + "loss": 0.4533, + "step": 69236 + }, + { + "epoch": 0.8654966374159354, + "grad_norm": 3.9608864784240723, + "learning_rate": 1.0820837105637595e-06, + "loss": 0.5936, + "step": 69238 + }, + { + "epoch": 0.8655216380409511, + "grad_norm": 3.1833341121673584, + "learning_rate": 1.081688899639095e-06, + "loss": 1.2282, + "step": 69240 + }, + { + "epoch": 0.8655466386659666, + "grad_norm": 4.038604259490967, + "learning_rate": 1.0812941566350266e-06, + "loss": 0.6026, + "step": 69242 + }, + { + "epoch": 0.8655716392909822, + "grad_norm": 1.9419375658035278, + "learning_rate": 1.0808994815545593e-06, + "loss": 0.7882, + "step": 69244 + }, + { + "epoch": 0.8655966399159979, + "grad_norm": 2.721705436706543, + "learning_rate": 1.0805048744006997e-06, + "loss": 1.2268, + "step": 69246 + }, + { + "epoch": 0.8656216405410135, + "grad_norm": 4.250235557556152, + "learning_rate": 1.0801103351764553e-06, + "loss": 0.6852, + "step": 69248 + }, + { + "epoch": 0.8656466411660292, + "grad_norm": 0.00025816261768341064, + "learning_rate": 1.079715863884827e-06, + "loss": 0.0873, + "step": 69250 + }, + { + "epoch": 0.8656716417910447, + "grad_norm": 3.5013351440429688, + "learning_rate": 1.079321460528826e-06, + "loss": 1.761, + "step": 69252 + }, + { + "epoch": 0.8656966424160604, + "grad_norm": 5.1584625244140625, + "learning_rate": 1.0789271251114475e-06, + "loss": 1.9055, + "step": 69254 + }, + { + "epoch": 0.865721643041076, + "grad_norm": 3.080000638961792, + "learning_rate": 1.078532857635698e-06, + "loss": 1.0592, + "step": 69256 + }, + { + "epoch": 0.8657466436660917, + "grad_norm": 5.121523857116699, + "learning_rate": 1.0781386581045827e-06, + "loss": 0.584, + "step": 69258 + }, + { + "epoch": 0.8657716442911073, + "grad_norm": 2.832984685897827, + "learning_rate": 1.0777445265210996e-06, + "loss": 0.59, + "step": 69260 + }, + { + "epoch": 0.8657966449161228, + "grad_norm": 3.679703712463379, + "learning_rate": 1.0773504628882548e-06, + "loss": 0.8578, + "step": 69262 + }, + { + "epoch": 0.8658216455411385, + "grad_norm": 3.7391679286956787, + "learning_rate": 1.076956467209045e-06, + "loss": 1.2311, + "step": 69264 + }, + { + "epoch": 0.8658466461661541, + "grad_norm": 0.0002598319260869175, + "learning_rate": 1.0765625394864766e-06, + "loss": 0.7596, + "step": 69266 + }, + { + "epoch": 0.8658716467911698, + "grad_norm": 2.899916648864746, + "learning_rate": 1.0761686797235427e-06, + "loss": 0.6052, + "step": 69268 + }, + { + "epoch": 0.8658966474161854, + "grad_norm": 3.586168050765991, + "learning_rate": 1.0757748879232454e-06, + "loss": 0.3491, + "step": 69270 + }, + { + "epoch": 0.865921648041201, + "grad_norm": 9.326552391052246, + "learning_rate": 1.075381164088587e-06, + "loss": 2.4696, + "step": 69272 + }, + { + "epoch": 0.8659466486662166, + "grad_norm": 3.930323362350464, + "learning_rate": 1.0749875082225614e-06, + "loss": 1.4351, + "step": 69274 + }, + { + "epoch": 0.8659716492912323, + "grad_norm": 3.904489278793335, + "learning_rate": 1.0745939203281709e-06, + "loss": 1.5687, + "step": 69276 + }, + { + "epoch": 0.8659966499162479, + "grad_norm": 4.235343933105469, + "learning_rate": 1.0742004004084105e-06, + "loss": 1.6875, + "step": 69278 + }, + { + "epoch": 0.8660216505412636, + "grad_norm": 3.4254748821258545, + "learning_rate": 1.0738069484662762e-06, + "loss": 0.9145, + "step": 69280 + }, + { + "epoch": 0.8660466511662791, + "grad_norm": 3.2005341053009033, + "learning_rate": 1.073413564504767e-06, + "loss": 1.6896, + "step": 69282 + }, + { + "epoch": 0.8660716517912947, + "grad_norm": 3.0185177326202393, + "learning_rate": 1.073020248526877e-06, + "loss": 1.3092, + "step": 69284 + }, + { + "epoch": 0.8660966524163104, + "grad_norm": 4.601210594177246, + "learning_rate": 1.0726270005356044e-06, + "loss": 1.4872, + "step": 69286 + }, + { + "epoch": 0.866121653041326, + "grad_norm": 2.2569777965545654, + "learning_rate": 1.0722338205339388e-06, + "loss": 1.4993, + "step": 69288 + }, + { + "epoch": 0.8661466536663417, + "grad_norm": 3.732780694961548, + "learning_rate": 1.0718407085248817e-06, + "loss": 0.8856, + "step": 69290 + }, + { + "epoch": 0.8661716542913572, + "grad_norm": 0.5723733901977539, + "learning_rate": 1.0714476645114213e-06, + "loss": 0.0954, + "step": 69292 + }, + { + "epoch": 0.8661966549163729, + "grad_norm": 5.48671817779541, + "learning_rate": 1.0710546884965523e-06, + "loss": 0.3204, + "step": 69294 + }, + { + "epoch": 0.8662216555413885, + "grad_norm": 3.6809558868408203, + "learning_rate": 1.0706617804832698e-06, + "loss": 1.2096, + "step": 69296 + }, + { + "epoch": 0.8662466561664042, + "grad_norm": 0.0423700213432312, + "learning_rate": 1.0702689404745626e-06, + "loss": 0.4273, + "step": 69298 + }, + { + "epoch": 0.8662716567914198, + "grad_norm": 4.589632034301758, + "learning_rate": 1.069876168473426e-06, + "loss": 1.7058, + "step": 69300 + }, + { + "epoch": 0.8662966574164354, + "grad_norm": 4.623205184936523, + "learning_rate": 1.0694834644828478e-06, + "loss": 1.1704, + "step": 69302 + }, + { + "epoch": 0.866321658041451, + "grad_norm": 4.810157299041748, + "learning_rate": 1.069090828505822e-06, + "loss": 1.5027, + "step": 69304 + }, + { + "epoch": 0.8663466586664667, + "grad_norm": 1.6984061002731323, + "learning_rate": 1.0686982605453377e-06, + "loss": 0.2929, + "step": 69306 + }, + { + "epoch": 0.8663716592914823, + "grad_norm": 3.5128931999206543, + "learning_rate": 1.0683057606043834e-06, + "loss": 0.5563, + "step": 69308 + }, + { + "epoch": 0.866396659916498, + "grad_norm": 4.629360198974609, + "learning_rate": 1.067913328685951e-06, + "loss": 0.973, + "step": 69310 + }, + { + "epoch": 0.8664216605415135, + "grad_norm": 2.205594778060913, + "learning_rate": 1.067520964793025e-06, + "loss": 0.6639, + "step": 69312 + }, + { + "epoch": 0.8664466611665291, + "grad_norm": 4.4497551918029785, + "learning_rate": 1.0671286689285988e-06, + "loss": 2.1487, + "step": 69314 + }, + { + "epoch": 0.8664716617915448, + "grad_norm": 2.810218572616577, + "learning_rate": 1.0667364410956548e-06, + "loss": 1.282, + "step": 69316 + }, + { + "epoch": 0.8664966624165604, + "grad_norm": 3.4766862392425537, + "learning_rate": 1.0663442812971858e-06, + "loss": 1.5898, + "step": 69318 + }, + { + "epoch": 0.8665216630415761, + "grad_norm": 4.478790760040283, + "learning_rate": 1.0659521895361745e-06, + "loss": 1.7452, + "step": 69320 + }, + { + "epoch": 0.8665466636665916, + "grad_norm": 4.74147891998291, + "learning_rate": 1.0655601658156055e-06, + "loss": 0.9179, + "step": 69322 + }, + { + "epoch": 0.8665716642916073, + "grad_norm": 3.116797924041748, + "learning_rate": 1.0651682101384696e-06, + "loss": 0.9726, + "step": 69324 + }, + { + "epoch": 0.8665966649166229, + "grad_norm": 3.29307222366333, + "learning_rate": 1.0647763225077478e-06, + "loss": 1.1435, + "step": 69326 + }, + { + "epoch": 0.8666216655416386, + "grad_norm": 0.047524306923151016, + "learning_rate": 1.0643845029264266e-06, + "loss": 0.4361, + "step": 69328 + }, + { + "epoch": 0.8666466661666542, + "grad_norm": 0.00048728022375144064, + "learning_rate": 1.06399275139749e-06, + "loss": 0.0249, + "step": 69330 + }, + { + "epoch": 0.8666716667916697, + "grad_norm": 1.893481731414795, + "learning_rate": 1.0636010679239194e-06, + "loss": 0.9151, + "step": 69332 + }, + { + "epoch": 0.8666966674166854, + "grad_norm": 6.088120937347412, + "learning_rate": 1.0632094525087011e-06, + "loss": 0.5747, + "step": 69334 + }, + { + "epoch": 0.866721668041701, + "grad_norm": 0.00036223410279490054, + "learning_rate": 1.0628179051548138e-06, + "loss": 0.3272, + "step": 69336 + }, + { + "epoch": 0.8667466686667167, + "grad_norm": 1.0442930459976196, + "learning_rate": 1.062426425865243e-06, + "loss": 0.2034, + "step": 69338 + }, + { + "epoch": 0.8667716692917323, + "grad_norm": 2.033830404281616, + "learning_rate": 1.062035014642967e-06, + "loss": 0.9605, + "step": 69340 + }, + { + "epoch": 0.8667966699167479, + "grad_norm": 2.817577600479126, + "learning_rate": 1.0616436714909706e-06, + "loss": 0.7221, + "step": 69342 + }, + { + "epoch": 0.8668216705417635, + "grad_norm": 0.00028268483583815396, + "learning_rate": 1.0612523964122323e-06, + "loss": 0.0, + "step": 69344 + }, + { + "epoch": 0.8668466711667792, + "grad_norm": 4.0772199630737305, + "learning_rate": 1.0608611894097287e-06, + "loss": 1.7137, + "step": 69346 + }, + { + "epoch": 0.8668716717917948, + "grad_norm": 3.58876633644104, + "learning_rate": 1.0604700504864451e-06, + "loss": 1.6503, + "step": 69348 + }, + { + "epoch": 0.8668966724168105, + "grad_norm": 0.0451783686876297, + "learning_rate": 1.0600789796453547e-06, + "loss": 2.1445, + "step": 69350 + }, + { + "epoch": 0.866921673041826, + "grad_norm": 2.615753173828125, + "learning_rate": 1.0596879768894407e-06, + "loss": 0.3241, + "step": 69352 + }, + { + "epoch": 0.8669466736668416, + "grad_norm": 2.485123872756958, + "learning_rate": 1.0592970422216774e-06, + "loss": 1.1362, + "step": 69354 + }, + { + "epoch": 0.8669716742918573, + "grad_norm": 7.656957149505615, + "learning_rate": 1.0589061756450448e-06, + "loss": 2.55, + "step": 69356 + }, + { + "epoch": 0.8669966749168729, + "grad_norm": 0.5930277705192566, + "learning_rate": 1.058515377162519e-06, + "loss": 0.6805, + "step": 69358 + }, + { + "epoch": 0.8670216755418886, + "grad_norm": 5.295533180236816, + "learning_rate": 1.0581246467770734e-06, + "loss": 1.4202, + "step": 69360 + }, + { + "epoch": 0.8670466761669041, + "grad_norm": 2.604871988296509, + "learning_rate": 1.0577339844916879e-06, + "loss": 0.8823, + "step": 69362 + }, + { + "epoch": 0.8670716767919198, + "grad_norm": 2.969647169113159, + "learning_rate": 1.0573433903093333e-06, + "loss": 0.9308, + "step": 69364 + }, + { + "epoch": 0.8670966774169354, + "grad_norm": 1.2811906337738037, + "learning_rate": 1.0569528642329897e-06, + "loss": 1.132, + "step": 69366 + }, + { + "epoch": 0.8671216780419511, + "grad_norm": 5.118912220001221, + "learning_rate": 1.0565624062656265e-06, + "loss": 1.4535, + "step": 69368 + }, + { + "epoch": 0.8671466786669667, + "grad_norm": 4.873229503631592, + "learning_rate": 1.0561720164102207e-06, + "loss": 1.6528, + "step": 69370 + }, + { + "epoch": 0.8671716792919822, + "grad_norm": 2.9389030933380127, + "learning_rate": 1.055781694669744e-06, + "loss": 1.8682, + "step": 69372 + }, + { + "epoch": 0.8671966799169979, + "grad_norm": 2.274430274963379, + "learning_rate": 1.0553914410471677e-06, + "loss": 0.6502, + "step": 69374 + }, + { + "epoch": 0.8672216805420135, + "grad_norm": 0.00030198259628377855, + "learning_rate": 1.055001255545467e-06, + "loss": 0.451, + "step": 69376 + }, + { + "epoch": 0.8672466811670292, + "grad_norm": 0.3937903642654419, + "learning_rate": 1.0546111381676094e-06, + "loss": 1.0911, + "step": 69378 + }, + { + "epoch": 0.8672716817920448, + "grad_norm": 6.488049507141113, + "learning_rate": 1.0542210889165704e-06, + "loss": 0.4529, + "step": 69380 + }, + { + "epoch": 0.8672966824170604, + "grad_norm": 0.7309221029281616, + "learning_rate": 1.0538311077953189e-06, + "loss": 0.7544, + "step": 69382 + }, + { + "epoch": 0.867321683042076, + "grad_norm": 2.6924304962158203, + "learning_rate": 1.0534411948068225e-06, + "loss": 0.6748, + "step": 69384 + }, + { + "epoch": 0.8673466836670917, + "grad_norm": 1.879668116569519, + "learning_rate": 1.053051349954055e-06, + "loss": 1.8727, + "step": 69386 + }, + { + "epoch": 0.8673716842921073, + "grad_norm": 5.135912895202637, + "learning_rate": 1.0526615732399813e-06, + "loss": 0.2785, + "step": 69388 + }, + { + "epoch": 0.867396684917123, + "grad_norm": 0.001447398099116981, + "learning_rate": 1.0522718646675734e-06, + "loss": 0.5823, + "step": 69390 + }, + { + "epoch": 0.8674216855421385, + "grad_norm": 1.2862801551818848, + "learning_rate": 1.0518822242397952e-06, + "loss": 0.7141, + "step": 69392 + }, + { + "epoch": 0.8674466861671541, + "grad_norm": 4.758810997009277, + "learning_rate": 1.0514926519596203e-06, + "loss": 0.9891, + "step": 69394 + }, + { + "epoch": 0.8674716867921698, + "grad_norm": 3.130080461502075, + "learning_rate": 1.0511031478300104e-06, + "loss": 1.1941, + "step": 69396 + }, + { + "epoch": 0.8674966874171854, + "grad_norm": 2.273252487182617, + "learning_rate": 1.0507137118539312e-06, + "loss": 0.2753, + "step": 69398 + }, + { + "epoch": 0.8675216880422011, + "grad_norm": 0.00020691860117949545, + "learning_rate": 1.0503243440343537e-06, + "loss": 0.6105, + "step": 69400 + }, + { + "epoch": 0.8675466886672166, + "grad_norm": 2.8330516815185547, + "learning_rate": 1.0499350443742384e-06, + "loss": 0.2739, + "step": 69402 + }, + { + "epoch": 0.8675716892922323, + "grad_norm": 1.6899065971374512, + "learning_rate": 1.0495458128765533e-06, + "loss": 0.8942, + "step": 69404 + }, + { + "epoch": 0.8675966899172479, + "grad_norm": 4.256518363952637, + "learning_rate": 1.0491566495442606e-06, + "loss": 1.8959, + "step": 69406 + }, + { + "epoch": 0.8676216905422636, + "grad_norm": 11.218511581420898, + "learning_rate": 1.0487675543803265e-06, + "loss": 1.1056, + "step": 69408 + }, + { + "epoch": 0.8676466911672792, + "grad_norm": 6.03151273727417, + "learning_rate": 1.0483785273877134e-06, + "loss": 1.2786, + "step": 69410 + }, + { + "epoch": 0.8676716917922948, + "grad_norm": 3.1595709323883057, + "learning_rate": 1.047989568569382e-06, + "loss": 0.5624, + "step": 69412 + }, + { + "epoch": 0.8676966924173104, + "grad_norm": 0.02413506805896759, + "learning_rate": 1.0476006779282966e-06, + "loss": 0.9012, + "step": 69414 + }, + { + "epoch": 0.867721693042326, + "grad_norm": 2.3170390129089355, + "learning_rate": 1.0472118554674172e-06, + "loss": 1.4146, + "step": 69416 + }, + { + "epoch": 0.8677466936673417, + "grad_norm": 4.051825523376465, + "learning_rate": 1.046823101189709e-06, + "loss": 1.3383, + "step": 69418 + }, + { + "epoch": 0.8677716942923573, + "grad_norm": 3.8482916355133057, + "learning_rate": 1.0464344150981265e-06, + "loss": 0.9321, + "step": 69420 + }, + { + "epoch": 0.8677966949173729, + "grad_norm": 4.306450366973877, + "learning_rate": 1.046045797195636e-06, + "loss": 1.1297, + "step": 69422 + }, + { + "epoch": 0.8678216955423885, + "grad_norm": 4.391994476318359, + "learning_rate": 1.0456572474851945e-06, + "loss": 0.8131, + "step": 69424 + }, + { + "epoch": 0.8678466961674042, + "grad_norm": 3.60064697265625, + "learning_rate": 1.0452687659697602e-06, + "loss": 1.5862, + "step": 69426 + }, + { + "epoch": 0.8678716967924198, + "grad_norm": 1.6576684713363647, + "learning_rate": 1.0448803526522933e-06, + "loss": 0.2325, + "step": 69428 + }, + { + "epoch": 0.8678966974174355, + "grad_norm": 0.0004743480239994824, + "learning_rate": 1.04449200753575e-06, + "loss": 0.1641, + "step": 69430 + }, + { + "epoch": 0.867921698042451, + "grad_norm": 2.1814379692077637, + "learning_rate": 1.0441037306230906e-06, + "loss": 0.2009, + "step": 69432 + }, + { + "epoch": 0.8679466986674667, + "grad_norm": 1.9377249479293823, + "learning_rate": 1.0437155219172712e-06, + "loss": 0.1057, + "step": 69434 + }, + { + "epoch": 0.8679716992924823, + "grad_norm": 0.6305138468742371, + "learning_rate": 1.0433273814212453e-06, + "loss": 0.0362, + "step": 69436 + }, + { + "epoch": 0.867996699917498, + "grad_norm": 0.008867660537362099, + "learning_rate": 1.0429393091379736e-06, + "loss": 0.7344, + "step": 69438 + }, + { + "epoch": 0.8680217005425136, + "grad_norm": 3.6473827362060547, + "learning_rate": 1.0425513050704073e-06, + "loss": 1.6342, + "step": 69440 + }, + { + "epoch": 0.8680467011675291, + "grad_norm": 5.0185980796813965, + "learning_rate": 1.042163369221506e-06, + "loss": 1.5378, + "step": 69442 + }, + { + "epoch": 0.8680717017925448, + "grad_norm": 2.6703262329101562, + "learning_rate": 1.04177550159422e-06, + "loss": 0.859, + "step": 69444 + }, + { + "epoch": 0.8680967024175604, + "grad_norm": 2.145996332168579, + "learning_rate": 1.0413877021915065e-06, + "loss": 0.3044, + "step": 69446 + }, + { + "epoch": 0.8681217030425761, + "grad_norm": 5.269992828369141, + "learning_rate": 1.0409999710163177e-06, + "loss": 0.3827, + "step": 69448 + }, + { + "epoch": 0.8681467036675917, + "grad_norm": 1.4288208484649658, + "learning_rate": 1.0406123080716035e-06, + "loss": 0.6582, + "step": 69450 + }, + { + "epoch": 0.8681717042926073, + "grad_norm": 3.237638473510742, + "learning_rate": 1.0402247133603217e-06, + "loss": 1.1356, + "step": 69452 + }, + { + "epoch": 0.8681967049176229, + "grad_norm": 3.329761028289795, + "learning_rate": 1.0398371868854196e-06, + "loss": 0.3396, + "step": 69454 + }, + { + "epoch": 0.8682217055426386, + "grad_norm": 3.424268960952759, + "learning_rate": 1.0394497286498516e-06, + "loss": 1.2467, + "step": 69456 + }, + { + "epoch": 0.8682467061676542, + "grad_norm": 3.809723138809204, + "learning_rate": 1.0390623386565656e-06, + "loss": 1.3261, + "step": 69458 + }, + { + "epoch": 0.8682717067926699, + "grad_norm": 0.0003777733654715121, + "learning_rate": 1.0386750169085158e-06, + "loss": 0.0, + "step": 69460 + }, + { + "epoch": 0.8682967074176854, + "grad_norm": 0.26492640376091003, + "learning_rate": 1.0382877634086509e-06, + "loss": 0.4469, + "step": 69462 + }, + { + "epoch": 0.868321708042701, + "grad_norm": 0.002002368913963437, + "learning_rate": 1.037900578159915e-06, + "loss": 0.5831, + "step": 69464 + }, + { + "epoch": 0.8683467086677167, + "grad_norm": 3.6362037658691406, + "learning_rate": 1.0375134611652648e-06, + "loss": 1.39, + "step": 69466 + }, + { + "epoch": 0.8683717092927323, + "grad_norm": 4.218625068664551, + "learning_rate": 1.0371264124276414e-06, + "loss": 0.6911, + "step": 69468 + }, + { + "epoch": 0.868396709917748, + "grad_norm": 3.4674184322357178, + "learning_rate": 1.036739431949998e-06, + "loss": 0.5508, + "step": 69470 + }, + { + "epoch": 0.8684217105427635, + "grad_norm": 0.3639933168888092, + "learning_rate": 1.0363525197352786e-06, + "loss": 0.4837, + "step": 69472 + }, + { + "epoch": 0.8684467111677792, + "grad_norm": 4.443980693817139, + "learning_rate": 1.0359656757864311e-06, + "loss": 2.184, + "step": 69474 + }, + { + "epoch": 0.8684717117927948, + "grad_norm": 3.760268449783325, + "learning_rate": 1.0355789001064032e-06, + "loss": 1.3487, + "step": 69476 + }, + { + "epoch": 0.8684967124178105, + "grad_norm": 5.28375244140625, + "learning_rate": 1.0351921926981357e-06, + "loss": 1.3595, + "step": 69478 + }, + { + "epoch": 0.8685217130428261, + "grad_norm": 0.011264807544648647, + "learning_rate": 1.0348055535645785e-06, + "loss": 0.3039, + "step": 69480 + }, + { + "epoch": 0.8685467136678416, + "grad_norm": 5.109421730041504, + "learning_rate": 1.0344189827086725e-06, + "loss": 0.2338, + "step": 69482 + }, + { + "epoch": 0.8685717142928573, + "grad_norm": 0.0005095445085316896, + "learning_rate": 1.0340324801333669e-06, + "loss": 0.0, + "step": 69484 + }, + { + "epoch": 0.8685967149178729, + "grad_norm": 0.0008975497330538929, + "learning_rate": 1.0336460458415986e-06, + "loss": 0.372, + "step": 69486 + }, + { + "epoch": 0.8686217155428886, + "grad_norm": 5.3819403648376465, + "learning_rate": 1.033259679836316e-06, + "loss": 1.1408, + "step": 69488 + }, + { + "epoch": 0.8686467161679042, + "grad_norm": 0.002459077863022685, + "learning_rate": 1.0328733821204607e-06, + "loss": 0.0611, + "step": 69490 + }, + { + "epoch": 0.8686717167929198, + "grad_norm": 11.466737747192383, + "learning_rate": 1.0324871526969716e-06, + "loss": 1.3569, + "step": 69492 + }, + { + "epoch": 0.8686967174179354, + "grad_norm": 3.3670814037323, + "learning_rate": 1.032100991568793e-06, + "loss": 1.8805, + "step": 69494 + }, + { + "epoch": 0.8687217180429511, + "grad_norm": 2.8073530197143555, + "learning_rate": 1.0317148987388637e-06, + "loss": 1.2165, + "step": 69496 + }, + { + "epoch": 0.8687467186679667, + "grad_norm": 5.226265907287598, + "learning_rate": 1.031328874210128e-06, + "loss": 1.4519, + "step": 69498 + }, + { + "epoch": 0.8687717192929824, + "grad_norm": 2.6480448246002197, + "learning_rate": 1.0309429179855236e-06, + "loss": 0.8065, + "step": 69500 + }, + { + "epoch": 0.8687967199179979, + "grad_norm": 2.1192586421966553, + "learning_rate": 1.0305570300679868e-06, + "loss": 0.0849, + "step": 69502 + }, + { + "epoch": 0.8688217205430135, + "grad_norm": 12.511223793029785, + "learning_rate": 1.0301712104604612e-06, + "loss": 1.3716, + "step": 69504 + }, + { + "epoch": 0.8688467211680292, + "grad_norm": 4.753992080688477, + "learning_rate": 1.0297854591658808e-06, + "loss": 2.418, + "step": 69506 + }, + { + "epoch": 0.8688717217930448, + "grad_norm": 2.423992395401001, + "learning_rate": 1.0293997761871889e-06, + "loss": 0.3092, + "step": 69508 + }, + { + "epoch": 0.8688967224180605, + "grad_norm": 4.29567289352417, + "learning_rate": 1.0290141615273174e-06, + "loss": 0.6747, + "step": 69510 + }, + { + "epoch": 0.868921723043076, + "grad_norm": 2.7485480308532715, + "learning_rate": 1.0286286151892055e-06, + "loss": 0.8203, + "step": 69512 + }, + { + "epoch": 0.8689467236680917, + "grad_norm": 5.069015979766846, + "learning_rate": 1.0282431371757928e-06, + "loss": 1.1603, + "step": 69514 + }, + { + "epoch": 0.8689717242931073, + "grad_norm": 0.00036131381057202816, + "learning_rate": 1.0278577274900081e-06, + "loss": 0.6802, + "step": 69516 + }, + { + "epoch": 0.868996724918123, + "grad_norm": 4.194921493530273, + "learning_rate": 1.0274723861347925e-06, + "loss": 1.248, + "step": 69518 + }, + { + "epoch": 0.8690217255431386, + "grad_norm": 2.1643474102020264, + "learning_rate": 1.0270871131130755e-06, + "loss": 1.7947, + "step": 69520 + }, + { + "epoch": 0.8690467261681541, + "grad_norm": 4.602427959442139, + "learning_rate": 1.0267019084277974e-06, + "loss": 1.3351, + "step": 69522 + }, + { + "epoch": 0.8690717267931698, + "grad_norm": 9.199324607849121, + "learning_rate": 1.0263167720818857e-06, + "loss": 0.9875, + "step": 69524 + }, + { + "epoch": 0.8690967274181854, + "grad_norm": 0.24936959147453308, + "learning_rate": 1.025931704078279e-06, + "loss": 0.6113, + "step": 69526 + }, + { + "epoch": 0.8691217280432011, + "grad_norm": 4.446163654327393, + "learning_rate": 1.0255467044199064e-06, + "loss": 1.2232, + "step": 69528 + }, + { + "epoch": 0.8691467286682167, + "grad_norm": 4.71360445022583, + "learning_rate": 1.0251617731096997e-06, + "loss": 1.4266, + "step": 69530 + }, + { + "epoch": 0.8691717292932323, + "grad_norm": 4.742656707763672, + "learning_rate": 1.0247769101505944e-06, + "loss": 0.2915, + "step": 69532 + }, + { + "epoch": 0.8691967299182479, + "grad_norm": 2.912691354751587, + "learning_rate": 1.024392115545516e-06, + "loss": 1.1316, + "step": 69534 + }, + { + "epoch": 0.8692217305432636, + "grad_norm": 3.0932767391204834, + "learning_rate": 1.0240073892973989e-06, + "loss": 1.3321, + "step": 69536 + }, + { + "epoch": 0.8692467311682792, + "grad_norm": 4.412789821624756, + "learning_rate": 1.0236227314091728e-06, + "loss": 0.7774, + "step": 69538 + }, + { + "epoch": 0.8692717317932949, + "grad_norm": 3.00226092338562, + "learning_rate": 1.023238141883768e-06, + "loss": 1.581, + "step": 69540 + }, + { + "epoch": 0.8692967324183104, + "grad_norm": 0.0005491269403137267, + "learning_rate": 1.0228536207241114e-06, + "loss": 0.3625, + "step": 69542 + }, + { + "epoch": 0.869321733043326, + "grad_norm": 7.623091697692871, + "learning_rate": 1.02246916793313e-06, + "loss": 1.6381, + "step": 69544 + }, + { + "epoch": 0.8693467336683417, + "grad_norm": 6.315311431884766, + "learning_rate": 1.0220847835137559e-06, + "loss": 0.803, + "step": 69546 + }, + { + "epoch": 0.8693717342933573, + "grad_norm": 0.00042933630174957216, + "learning_rate": 1.0217004674689134e-06, + "loss": 0.7946, + "step": 69548 + }, + { + "epoch": 0.869396734918373, + "grad_norm": 13.004560470581055, + "learning_rate": 1.0213162198015292e-06, + "loss": 1.5436, + "step": 69550 + }, + { + "epoch": 0.8694217355433885, + "grad_norm": 5.159585475921631, + "learning_rate": 1.020932040514536e-06, + "loss": 1.607, + "step": 69552 + }, + { + "epoch": 0.8694467361684042, + "grad_norm": 3.9860315322875977, + "learning_rate": 1.02054792961085e-06, + "loss": 1.2528, + "step": 69554 + }, + { + "epoch": 0.8694717367934198, + "grad_norm": 6.100371837615967, + "learning_rate": 1.0201638870934027e-06, + "loss": 0.5359, + "step": 69556 + }, + { + "epoch": 0.8694967374184355, + "grad_norm": 0.2645571827888489, + "learning_rate": 1.0197799129651165e-06, + "loss": 0.3529, + "step": 69558 + }, + { + "epoch": 0.8695217380434511, + "grad_norm": 3.334691047668457, + "learning_rate": 1.0193960072289178e-06, + "loss": 0.4962, + "step": 69560 + }, + { + "epoch": 0.8695467386684667, + "grad_norm": 2.6243858337402344, + "learning_rate": 1.0190121698877275e-06, + "loss": 1.1885, + "step": 69562 + }, + { + "epoch": 0.8695717392934823, + "grad_norm": 0.1436624377965927, + "learning_rate": 1.0186284009444714e-06, + "loss": 0.6042, + "step": 69564 + }, + { + "epoch": 0.869596739918498, + "grad_norm": 2.894533634185791, + "learning_rate": 1.0182447004020734e-06, + "loss": 0.3105, + "step": 69566 + }, + { + "epoch": 0.8696217405435136, + "grad_norm": 2.772040843963623, + "learning_rate": 1.0178610682634504e-06, + "loss": 1.1305, + "step": 69568 + }, + { + "epoch": 0.8696467411685292, + "grad_norm": 3.0247321128845215, + "learning_rate": 1.01747750453153e-06, + "loss": 0.6898, + "step": 69570 + }, + { + "epoch": 0.8696717417935448, + "grad_norm": 2.862166166305542, + "learning_rate": 1.0170940092092285e-06, + "loss": 0.6089, + "step": 69572 + }, + { + "epoch": 0.8696967424185604, + "grad_norm": 6.241024017333984, + "learning_rate": 1.0167105822994683e-06, + "loss": 0.623, + "step": 69574 + }, + { + "epoch": 0.8697217430435761, + "grad_norm": 1.9331825971603394, + "learning_rate": 1.0163272238051714e-06, + "loss": 0.4075, + "step": 69576 + }, + { + "epoch": 0.8697467436685917, + "grad_norm": 0.6834672689437866, + "learning_rate": 1.0159439337292565e-06, + "loss": 0.3853, + "step": 69578 + }, + { + "epoch": 0.8697717442936074, + "grad_norm": 3.4613828659057617, + "learning_rate": 1.0155607120746425e-06, + "loss": 1.1484, + "step": 69580 + }, + { + "epoch": 0.8697967449186229, + "grad_norm": 2.615572214126587, + "learning_rate": 1.015177558844246e-06, + "loss": 1.1474, + "step": 69582 + }, + { + "epoch": 0.8698217455436386, + "grad_norm": 2.316016674041748, + "learning_rate": 1.0147944740409876e-06, + "loss": 0.8525, + "step": 69584 + }, + { + "epoch": 0.8698467461686542, + "grad_norm": 2.4835546016693115, + "learning_rate": 1.014411457667782e-06, + "loss": 1.2836, + "step": 69586 + }, + { + "epoch": 0.8698717467936699, + "grad_norm": 1.0375752449035645, + "learning_rate": 1.0140285097275483e-06, + "loss": 0.3583, + "step": 69588 + }, + { + "epoch": 0.8698967474186855, + "grad_norm": 4.668759346008301, + "learning_rate": 1.0136456302232045e-06, + "loss": 1.1272, + "step": 69590 + }, + { + "epoch": 0.869921748043701, + "grad_norm": 3.1085383892059326, + "learning_rate": 1.0132628191576655e-06, + "loss": 1.006, + "step": 69592 + }, + { + "epoch": 0.8699467486687167, + "grad_norm": 5.569960594177246, + "learning_rate": 1.0128800765338453e-06, + "loss": 0.8521, + "step": 69594 + }, + { + "epoch": 0.8699717492937323, + "grad_norm": 2.368934154510498, + "learning_rate": 1.0124974023546575e-06, + "loss": 1.0113, + "step": 69596 + }, + { + "epoch": 0.869996749918748, + "grad_norm": 2.3727786540985107, + "learning_rate": 1.0121147966230193e-06, + "loss": 1.1357, + "step": 69598 + }, + { + "epoch": 0.8700217505437636, + "grad_norm": 1.9395737648010254, + "learning_rate": 1.0117322593418456e-06, + "loss": 1.0982, + "step": 69600 + }, + { + "epoch": 0.8700467511687792, + "grad_norm": 0.0053650070913136005, + "learning_rate": 1.0113497905140456e-06, + "loss": 0.756, + "step": 69602 + }, + { + "epoch": 0.8700717517937948, + "grad_norm": 2.518568992614746, + "learning_rate": 1.0109673901425387e-06, + "loss": 0.8495, + "step": 69604 + }, + { + "epoch": 0.8700967524188105, + "grad_norm": 2.4506869316101074, + "learning_rate": 1.010585058230229e-06, + "loss": 0.7214, + "step": 69606 + }, + { + "epoch": 0.8701217530438261, + "grad_norm": 5.117612838745117, + "learning_rate": 1.0102027947800342e-06, + "loss": 0.4086, + "step": 69608 + }, + { + "epoch": 0.8701467536688418, + "grad_norm": 3.3886425495147705, + "learning_rate": 1.0098205997948618e-06, + "loss": 1.1701, + "step": 69610 + }, + { + "epoch": 0.8701717542938573, + "grad_norm": 2.9481635093688965, + "learning_rate": 1.009438473277624e-06, + "loss": 0.9308, + "step": 69612 + }, + { + "epoch": 0.8701967549188729, + "grad_norm": 0.0003587203682400286, + "learning_rate": 1.0090564152312332e-06, + "loss": 0.7926, + "step": 69614 + }, + { + "epoch": 0.8702217555438886, + "grad_norm": 3.2062084674835205, + "learning_rate": 1.0086744256585957e-06, + "loss": 1.5631, + "step": 69616 + }, + { + "epoch": 0.8702467561689042, + "grad_norm": 2.817922830581665, + "learning_rate": 1.0082925045626269e-06, + "loss": 0.7629, + "step": 69618 + }, + { + "epoch": 0.8702717567939199, + "grad_norm": 4.317267417907715, + "learning_rate": 1.0079106519462257e-06, + "loss": 1.5295, + "step": 69620 + }, + { + "epoch": 0.8702967574189354, + "grad_norm": 3.4083433151245117, + "learning_rate": 1.0075288678123064e-06, + "loss": 1.4858, + "step": 69622 + }, + { + "epoch": 0.8703217580439511, + "grad_norm": 0.011352794244885445, + "learning_rate": 1.0071471521637766e-06, + "loss": 0.3263, + "step": 69624 + }, + { + "epoch": 0.8703467586689667, + "grad_norm": 1.639796495437622, + "learning_rate": 1.0067655050035408e-06, + "loss": 0.1185, + "step": 69626 + }, + { + "epoch": 0.8703717592939824, + "grad_norm": 0.4249057471752167, + "learning_rate": 1.00638392633451e-06, + "loss": 0.0717, + "step": 69628 + }, + { + "epoch": 0.870396759918998, + "grad_norm": 3.898709774017334, + "learning_rate": 1.0060024161595862e-06, + "loss": 0.9047, + "step": 69630 + }, + { + "epoch": 0.8704217605440135, + "grad_norm": 2.7621543407440186, + "learning_rate": 1.0056209744816759e-06, + "loss": 0.5517, + "step": 69632 + }, + { + "epoch": 0.8704467611690292, + "grad_norm": 1.350809931755066, + "learning_rate": 1.0052396013036837e-06, + "loss": 1.0242, + "step": 69634 + }, + { + "epoch": 0.8704717617940448, + "grad_norm": 1.9788120985031128, + "learning_rate": 1.0048582966285137e-06, + "loss": 1.4455, + "step": 69636 + }, + { + "epoch": 0.8704967624190605, + "grad_norm": 3.8650732040405273, + "learning_rate": 1.0044770604590737e-06, + "loss": 0.8117, + "step": 69638 + }, + { + "epoch": 0.8705217630440761, + "grad_norm": 2.4129669666290283, + "learning_rate": 1.0040958927982614e-06, + "loss": 0.6903, + "step": 69640 + }, + { + "epoch": 0.8705467636690917, + "grad_norm": 6.9397969245910645, + "learning_rate": 1.0037147936489854e-06, + "loss": 0.6934, + "step": 69642 + }, + { + "epoch": 0.8705717642941073, + "grad_norm": 3.660433769226074, + "learning_rate": 1.0033337630141448e-06, + "loss": 0.6811, + "step": 69644 + }, + { + "epoch": 0.870596764919123, + "grad_norm": 2.703526735305786, + "learning_rate": 1.0029528008966427e-06, + "loss": 1.2427, + "step": 69646 + }, + { + "epoch": 0.8706217655441386, + "grad_norm": 3.3762333393096924, + "learning_rate": 1.0025719072993768e-06, + "loss": 1.6061, + "step": 69648 + }, + { + "epoch": 0.8706467661691543, + "grad_norm": 3.0834896564483643, + "learning_rate": 1.0021910822252512e-06, + "loss": 0.5112, + "step": 69650 + }, + { + "epoch": 0.8706717667941698, + "grad_norm": 2.145142078399658, + "learning_rate": 1.0018103256771683e-06, + "loss": 0.4996, + "step": 69652 + }, + { + "epoch": 0.8706967674191854, + "grad_norm": 3.3029911518096924, + "learning_rate": 1.0014296376580236e-06, + "loss": 0.4975, + "step": 69654 + }, + { + "epoch": 0.8707217680442011, + "grad_norm": 0.0004562189569696784, + "learning_rate": 1.0010490181707222e-06, + "loss": 0.0, + "step": 69656 + }, + { + "epoch": 0.8707467686692167, + "grad_norm": 1.2999790906906128, + "learning_rate": 1.0006684672181543e-06, + "loss": 0.6963, + "step": 69658 + }, + { + "epoch": 0.8707717692942324, + "grad_norm": 3.78318452835083, + "learning_rate": 1.0002879848032231e-06, + "loss": 0.995, + "step": 69660 + }, + { + "epoch": 0.8707967699192479, + "grad_norm": 1.9427295923233032, + "learning_rate": 9.999075709288287e-07, + "loss": 0.0885, + "step": 69662 + }, + { + "epoch": 0.8708217705442636, + "grad_norm": 0.9458834528923035, + "learning_rate": 9.995272255978628e-07, + "loss": 0.031, + "step": 69664 + }, + { + "epoch": 0.8708467711692792, + "grad_norm": 2.68634033203125, + "learning_rate": 9.991469488132265e-07, + "loss": 1.4637, + "step": 69666 + }, + { + "epoch": 0.8708717717942949, + "grad_norm": 0.00033385667484253645, + "learning_rate": 9.987667405778123e-07, + "loss": 0.3325, + "step": 69668 + }, + { + "epoch": 0.8708967724193105, + "grad_norm": 3.289402961730957, + "learning_rate": 9.98386600894522e-07, + "loss": 1.2388, + "step": 69670 + }, + { + "epoch": 0.870921773044326, + "grad_norm": 4.484533786773682, + "learning_rate": 9.980065297662422e-07, + "loss": 1.3864, + "step": 69672 + }, + { + "epoch": 0.8709467736693417, + "grad_norm": 1.2638051509857178, + "learning_rate": 9.976265271958719e-07, + "loss": 0.1333, + "step": 69674 + }, + { + "epoch": 0.8709717742943573, + "grad_norm": 5.1690754890441895, + "learning_rate": 9.972465931863063e-07, + "loss": 0.71, + "step": 69676 + }, + { + "epoch": 0.870996774919373, + "grad_norm": 3.7604870796203613, + "learning_rate": 9.968667277404364e-07, + "loss": 0.7801, + "step": 69678 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 2.8110191822052, + "learning_rate": 9.96486930861158e-07, + "loss": 0.5946, + "step": 69680 + }, + { + "epoch": 0.8710467761694042, + "grad_norm": 1.998476266860962, + "learning_rate": 9.961072025513619e-07, + "loss": 0.8865, + "step": 69682 + }, + { + "epoch": 0.8710717767944198, + "grad_norm": 2.3645052909851074, + "learning_rate": 9.95727542813938e-07, + "loss": 0.5142, + "step": 69684 + }, + { + "epoch": 0.8710967774194355, + "grad_norm": 2.3546223640441895, + "learning_rate": 9.953479516517817e-07, + "loss": 0.2725, + "step": 69686 + }, + { + "epoch": 0.8711217780444511, + "grad_norm": 4.586484432220459, + "learning_rate": 9.949684290677808e-07, + "loss": 1.6511, + "step": 69688 + }, + { + "epoch": 0.8711467786694668, + "grad_norm": 1.8888185024261475, + "learning_rate": 9.945889750648286e-07, + "loss": 0.4688, + "step": 69690 + }, + { + "epoch": 0.8711717792944823, + "grad_norm": 0.0004472467990126461, + "learning_rate": 9.942095896458104e-07, + "loss": 1.6509, + "step": 69692 + }, + { + "epoch": 0.871196779919498, + "grad_norm": 2.302064895629883, + "learning_rate": 9.938302728136207e-07, + "loss": 0.1486, + "step": 69694 + }, + { + "epoch": 0.8712217805445136, + "grad_norm": 6.919672012329102, + "learning_rate": 9.93451024571147e-07, + "loss": 0.9105, + "step": 69696 + }, + { + "epoch": 0.8712467811695292, + "grad_norm": 4.849577903747559, + "learning_rate": 9.93071844921274e-07, + "loss": 0.9284, + "step": 69698 + }, + { + "epoch": 0.8712717817945449, + "grad_norm": 2.9392051696777344, + "learning_rate": 9.926927338668934e-07, + "loss": 1.7511, + "step": 69700 + }, + { + "epoch": 0.8712967824195604, + "grad_norm": 5.058174133300781, + "learning_rate": 9.92313691410891e-07, + "loss": 1.0833, + "step": 69702 + }, + { + "epoch": 0.8713217830445761, + "grad_norm": 4.053861618041992, + "learning_rate": 9.919347175561544e-07, + "loss": 0.8927, + "step": 69704 + }, + { + "epoch": 0.8713467836695917, + "grad_norm": 0.9318457245826721, + "learning_rate": 9.915558123055669e-07, + "loss": 0.161, + "step": 69706 + }, + { + "epoch": 0.8713717842946074, + "grad_norm": 0.9402448534965515, + "learning_rate": 9.911769756620216e-07, + "loss": 0.2507, + "step": 69708 + }, + { + "epoch": 0.871396784919623, + "grad_norm": 0.000413755071349442, + "learning_rate": 9.90798207628394e-07, + "loss": 0.0245, + "step": 69710 + }, + { + "epoch": 0.8714217855446386, + "grad_norm": 0.017364351078867912, + "learning_rate": 9.90419508207573e-07, + "loss": 0.1604, + "step": 69712 + }, + { + "epoch": 0.8714467861696542, + "grad_norm": 2.7418124675750732, + "learning_rate": 9.90040877402445e-07, + "loss": 1.3935, + "step": 69714 + }, + { + "epoch": 0.8714717867946699, + "grad_norm": 3.1796722412109375, + "learning_rate": 9.89662315215889e-07, + "loss": 0.9821, + "step": 69716 + }, + { + "epoch": 0.8714967874196855, + "grad_norm": 5.6204514503479, + "learning_rate": 9.89283821650794e-07, + "loss": 0.9934, + "step": 69718 + }, + { + "epoch": 0.8715217880447012, + "grad_norm": 2.045182704925537, + "learning_rate": 9.88905396710036e-07, + "loss": 1.1454, + "step": 69720 + }, + { + "epoch": 0.8715467886697167, + "grad_norm": 0.4886002540588379, + "learning_rate": 9.88527040396503e-07, + "loss": 0.1921, + "step": 69722 + }, + { + "epoch": 0.8715717892947323, + "grad_norm": 1.2528046369552612, + "learning_rate": 9.88148752713073e-07, + "loss": 0.0819, + "step": 69724 + }, + { + "epoch": 0.871596789919748, + "grad_norm": 0.0003759860701393336, + "learning_rate": 9.877705336626265e-07, + "loss": 0.8093, + "step": 69726 + }, + { + "epoch": 0.8716217905447636, + "grad_norm": 3.776200294494629, + "learning_rate": 9.873923832480458e-07, + "loss": 0.8267, + "step": 69728 + }, + { + "epoch": 0.8716467911697793, + "grad_norm": 13.969378471374512, + "learning_rate": 9.8701430147221e-07, + "loss": 0.8942, + "step": 69730 + }, + { + "epoch": 0.8716717917947948, + "grad_norm": 2.171647787094116, + "learning_rate": 9.866362883379997e-07, + "loss": 0.52, + "step": 69732 + }, + { + "epoch": 0.8716967924198105, + "grad_norm": 0.0008613274549134076, + "learning_rate": 9.86258343848291e-07, + "loss": 0.009, + "step": 69734 + }, + { + "epoch": 0.8717217930448261, + "grad_norm": 2.620527744293213, + "learning_rate": 9.858804680059663e-07, + "loss": 0.3968, + "step": 69736 + }, + { + "epoch": 0.8717467936698418, + "grad_norm": 5.24027156829834, + "learning_rate": 9.855026608139007e-07, + "loss": 0.3269, + "step": 69738 + }, + { + "epoch": 0.8717717942948574, + "grad_norm": 4.031972885131836, + "learning_rate": 9.851249222749703e-07, + "loss": 1.5559, + "step": 69740 + }, + { + "epoch": 0.8717967949198729, + "grad_norm": 5.202877521514893, + "learning_rate": 9.847472523920553e-07, + "loss": 1.1641, + "step": 69742 + }, + { + "epoch": 0.8718217955448886, + "grad_norm": 3.2024338245391846, + "learning_rate": 9.843696511680289e-07, + "loss": 0.3885, + "step": 69744 + }, + { + "epoch": 0.8718467961699042, + "grad_norm": 1.4421565532684326, + "learning_rate": 9.83992118605771e-07, + "loss": 0.8704, + "step": 69746 + }, + { + "epoch": 0.8718717967949199, + "grad_norm": 0.0003510575625114143, + "learning_rate": 9.836146547081526e-07, + "loss": 0.7212, + "step": 69748 + }, + { + "epoch": 0.8718967974199355, + "grad_norm": 0.012186537496745586, + "learning_rate": 9.832372594780492e-07, + "loss": 0.5135, + "step": 69750 + }, + { + "epoch": 0.8719217980449511, + "grad_norm": 6.146344184875488, + "learning_rate": 9.828599329183375e-07, + "loss": 0.8047, + "step": 69752 + }, + { + "epoch": 0.8719467986699667, + "grad_norm": 0.00039341303636319935, + "learning_rate": 9.824826750318873e-07, + "loss": 0.859, + "step": 69754 + }, + { + "epoch": 0.8719717992949824, + "grad_norm": 3.865480899810791, + "learning_rate": 9.821054858215751e-07, + "loss": 1.3011, + "step": 69756 + }, + { + "epoch": 0.871996799919998, + "grad_norm": 3.850982904434204, + "learning_rate": 9.81728365290271e-07, + "loss": 1.5083, + "step": 69758 + }, + { + "epoch": 0.8720218005450137, + "grad_norm": 10.548331260681152, + "learning_rate": 9.813513134408503e-07, + "loss": 0.6863, + "step": 69760 + }, + { + "epoch": 0.8720468011700292, + "grad_norm": 3.5091958045959473, + "learning_rate": 9.809743302761832e-07, + "loss": 1.4798, + "step": 69762 + }, + { + "epoch": 0.8720718017950448, + "grad_norm": 0.0054749236442148685, + "learning_rate": 9.80597415799137e-07, + "loss": 0.1188, + "step": 69764 + }, + { + "epoch": 0.8720968024200605, + "grad_norm": 3.4220516681671143, + "learning_rate": 9.802205700125888e-07, + "loss": 0.9375, + "step": 69766 + }, + { + "epoch": 0.8721218030450761, + "grad_norm": 3.210883140563965, + "learning_rate": 9.798437929194028e-07, + "loss": 1.1992, + "step": 69768 + }, + { + "epoch": 0.8721468036700918, + "grad_norm": 0.00038242802838794887, + "learning_rate": 9.79467084522452e-07, + "loss": 0.3583, + "step": 69770 + }, + { + "epoch": 0.8721718042951073, + "grad_norm": 1.6472954750061035, + "learning_rate": 9.79090444824603e-07, + "loss": 0.5208, + "step": 69772 + }, + { + "epoch": 0.872196804920123, + "grad_norm": 2.180874824523926, + "learning_rate": 9.787138738287272e-07, + "loss": 0.1038, + "step": 69774 + }, + { + "epoch": 0.8722218055451386, + "grad_norm": 4.671292781829834, + "learning_rate": 9.78337371537691e-07, + "loss": 1.4648, + "step": 69776 + }, + { + "epoch": 0.8722468061701543, + "grad_norm": 3.4177072048187256, + "learning_rate": 9.779609379543597e-07, + "loss": 0.5167, + "step": 69778 + }, + { + "epoch": 0.8722718067951699, + "grad_norm": 2.6417739391326904, + "learning_rate": 9.775845730816036e-07, + "loss": 1.6712, + "step": 69780 + }, + { + "epoch": 0.8722968074201854, + "grad_norm": 3.019911766052246, + "learning_rate": 9.772082769222858e-07, + "loss": 0.3296, + "step": 69782 + }, + { + "epoch": 0.8723218080452011, + "grad_norm": 1.2379971742630005, + "learning_rate": 9.76832049479276e-07, + "loss": 0.0635, + "step": 69784 + }, + { + "epoch": 0.8723468086702167, + "grad_norm": 3.3572895526885986, + "learning_rate": 9.764558907554356e-07, + "loss": 0.802, + "step": 69786 + }, + { + "epoch": 0.8723718092952324, + "grad_norm": 2.3543448448181152, + "learning_rate": 9.760798007536322e-07, + "loss": 0.1162, + "step": 69788 + }, + { + "epoch": 0.872396809920248, + "grad_norm": 6.800994873046875, + "learning_rate": 9.7570377947673e-07, + "loss": 0.5301, + "step": 69790 + }, + { + "epoch": 0.8724218105452636, + "grad_norm": 0.023280050605535507, + "learning_rate": 9.753278269275889e-07, + "loss": 1.0097, + "step": 69792 + }, + { + "epoch": 0.8724468111702792, + "grad_norm": 9.548715591430664, + "learning_rate": 9.749519431090771e-07, + "loss": 1.2556, + "step": 69794 + }, + { + "epoch": 0.8724718117952949, + "grad_norm": 0.00033511462970636785, + "learning_rate": 9.74576128024053e-07, + "loss": 0.3047, + "step": 69796 + }, + { + "epoch": 0.8724968124203105, + "grad_norm": 3.123033046722412, + "learning_rate": 9.742003816753831e-07, + "loss": 2.2419, + "step": 69798 + }, + { + "epoch": 0.8725218130453262, + "grad_norm": 2.6570873260498047, + "learning_rate": 9.738247040659254e-07, + "loss": 1.2082, + "step": 69800 + }, + { + "epoch": 0.8725468136703417, + "grad_norm": 0.0004783766926266253, + "learning_rate": 9.73449095198542e-07, + "loss": 0.1506, + "step": 69802 + }, + { + "epoch": 0.8725718142953574, + "grad_norm": 0.0002996595576405525, + "learning_rate": 9.73073555076095e-07, + "loss": 0.5819, + "step": 69804 + }, + { + "epoch": 0.872596814920373, + "grad_norm": 2.981550931930542, + "learning_rate": 9.726980837014421e-07, + "loss": 0.6962, + "step": 69806 + }, + { + "epoch": 0.8726218155453886, + "grad_norm": 8.190605163574219, + "learning_rate": 9.723226810774444e-07, + "loss": 1.8315, + "step": 69808 + }, + { + "epoch": 0.8726468161704043, + "grad_norm": 4.777068614959717, + "learning_rate": 9.719473472069597e-07, + "loss": 1.0136, + "step": 69810 + }, + { + "epoch": 0.8726718167954198, + "grad_norm": 4.535807132720947, + "learning_rate": 9.715720820928497e-07, + "loss": 2.0179, + "step": 69812 + }, + { + "epoch": 0.8726968174204355, + "grad_norm": 4.055264472961426, + "learning_rate": 9.711968857379693e-07, + "loss": 1.537, + "step": 69814 + }, + { + "epoch": 0.8727218180454511, + "grad_norm": 0.11353317648172379, + "learning_rate": 9.708217581451751e-07, + "loss": 0.006, + "step": 69816 + }, + { + "epoch": 0.8727468186704668, + "grad_norm": 3.860720634460449, + "learning_rate": 9.704466993173267e-07, + "loss": 0.5865, + "step": 69818 + }, + { + "epoch": 0.8727718192954824, + "grad_norm": 1.2050518989562988, + "learning_rate": 9.700717092572776e-07, + "loss": 0.5955, + "step": 69820 + }, + { + "epoch": 0.872796819920498, + "grad_norm": 3.2899036407470703, + "learning_rate": 9.696967879678875e-07, + "loss": 0.4044, + "step": 69822 + }, + { + "epoch": 0.8728218205455136, + "grad_norm": 2.7603366374969482, + "learning_rate": 9.693219354520077e-07, + "loss": 1.0972, + "step": 69824 + }, + { + "epoch": 0.8728468211705293, + "grad_norm": 2.9828555583953857, + "learning_rate": 9.689471517124983e-07, + "loss": 0.5233, + "step": 69826 + }, + { + "epoch": 0.8728718217955449, + "grad_norm": 2.971890449523926, + "learning_rate": 9.685724367522088e-07, + "loss": 1.8894, + "step": 69828 + }, + { + "epoch": 0.8728968224205605, + "grad_norm": 0.00020729041716549546, + "learning_rate": 9.681977905739926e-07, + "loss": 0.0093, + "step": 69830 + }, + { + "epoch": 0.8729218230455761, + "grad_norm": 2.086193561553955, + "learning_rate": 9.678232131807063e-07, + "loss": 0.8559, + "step": 69832 + }, + { + "epoch": 0.8729468236705917, + "grad_norm": 3.2520086765289307, + "learning_rate": 9.674487045752e-07, + "loss": 0.9701, + "step": 69834 + }, + { + "epoch": 0.8729718242956074, + "grad_norm": 5.425647735595703, + "learning_rate": 9.670742647603292e-07, + "loss": 1.1632, + "step": 69836 + }, + { + "epoch": 0.872996824920623, + "grad_norm": 2.586135149002075, + "learning_rate": 9.666998937389415e-07, + "loss": 0.3791, + "step": 69838 + }, + { + "epoch": 0.8730218255456387, + "grad_norm": 3.0114896297454834, + "learning_rate": 9.663255915138914e-07, + "loss": 0.7319, + "step": 69840 + }, + { + "epoch": 0.8730468261706542, + "grad_norm": 4.649776935577393, + "learning_rate": 9.659513580880276e-07, + "loss": 0.4255, + "step": 69842 + }, + { + "epoch": 0.8730718267956699, + "grad_norm": 0.00026666861958801746, + "learning_rate": 9.655771934642e-07, + "loss": 0.6316, + "step": 69844 + }, + { + "epoch": 0.8730968274206855, + "grad_norm": 4.822169303894043, + "learning_rate": 9.652030976452597e-07, + "loss": 1.4173, + "step": 69846 + }, + { + "epoch": 0.8731218280457012, + "grad_norm": 0.0016129331197589636, + "learning_rate": 9.648290706340546e-07, + "loss": 0.8761, + "step": 69848 + }, + { + "epoch": 0.8731468286707168, + "grad_norm": 2.2136504650115967, + "learning_rate": 9.644551124334344e-07, + "loss": 1.089, + "step": 69850 + }, + { + "epoch": 0.8731718292957323, + "grad_norm": 1.302269458770752, + "learning_rate": 9.640812230462459e-07, + "loss": 0.1149, + "step": 69852 + }, + { + "epoch": 0.873196829920748, + "grad_norm": 2.9907824993133545, + "learning_rate": 9.637074024753358e-07, + "loss": 0.4743, + "step": 69854 + }, + { + "epoch": 0.8732218305457636, + "grad_norm": 0.00037572611472569406, + "learning_rate": 9.633336507235546e-07, + "loss": 0.5798, + "step": 69856 + }, + { + "epoch": 0.8732468311707793, + "grad_norm": 5.460038185119629, + "learning_rate": 9.62959967793744e-07, + "loss": 0.6315, + "step": 69858 + }, + { + "epoch": 0.8732718317957949, + "grad_norm": 7.63491153717041, + "learning_rate": 9.625863536887537e-07, + "loss": 0.8131, + "step": 69860 + }, + { + "epoch": 0.8732968324208105, + "grad_norm": 3.4545135498046875, + "learning_rate": 9.62212808411427e-07, + "loss": 1.1384, + "step": 69862 + }, + { + "epoch": 0.8733218330458261, + "grad_norm": 4.155966281890869, + "learning_rate": 9.618393319646101e-07, + "loss": 0.8257, + "step": 69864 + }, + { + "epoch": 0.8733468336708418, + "grad_norm": 3.282827615737915, + "learning_rate": 9.614659243511482e-07, + "loss": 1.2159, + "step": 69866 + }, + { + "epoch": 0.8733718342958574, + "grad_norm": 2.982496976852417, + "learning_rate": 9.610925855738806e-07, + "loss": 0.7654, + "step": 69868 + }, + { + "epoch": 0.873396834920873, + "grad_norm": 5.062349796295166, + "learning_rate": 9.607193156356553e-07, + "loss": 1.1302, + "step": 69870 + }, + { + "epoch": 0.8734218355458886, + "grad_norm": 2.854281425476074, + "learning_rate": 9.603461145393122e-07, + "loss": 1.1164, + "step": 69872 + }, + { + "epoch": 0.8734468361709042, + "grad_norm": 3.088728189468384, + "learning_rate": 9.599729822876968e-07, + "loss": 0.4885, + "step": 69874 + }, + { + "epoch": 0.8734718367959199, + "grad_norm": 3.8496956825256348, + "learning_rate": 9.595999188836468e-07, + "loss": 0.2384, + "step": 69876 + }, + { + "epoch": 0.8734968374209355, + "grad_norm": 5.574532508850098, + "learning_rate": 9.592269243300067e-07, + "loss": 0.3327, + "step": 69878 + }, + { + "epoch": 0.8735218380459512, + "grad_norm": 3.883629560470581, + "learning_rate": 9.588539986296164e-07, + "loss": 1.7653, + "step": 69880 + }, + { + "epoch": 0.8735468386709667, + "grad_norm": 4.352689743041992, + "learning_rate": 9.584811417853136e-07, + "loss": 1.4179, + "step": 69882 + }, + { + "epoch": 0.8735718392959824, + "grad_norm": 3.6232388019561768, + "learning_rate": 9.581083537999425e-07, + "loss": 1.2268, + "step": 69884 + }, + { + "epoch": 0.873596839920998, + "grad_norm": 1.0681183338165283, + "learning_rate": 9.577356346763366e-07, + "loss": 1.0125, + "step": 69886 + }, + { + "epoch": 0.8736218405460137, + "grad_norm": 0.00042820186354219913, + "learning_rate": 9.573629844173394e-07, + "loss": 0.6207, + "step": 69888 + }, + { + "epoch": 0.8736468411710293, + "grad_norm": 2.859598398208618, + "learning_rate": 9.569904030257849e-07, + "loss": 0.5936, + "step": 69890 + }, + { + "epoch": 0.8736718417960448, + "grad_norm": 1.7560510635375977, + "learning_rate": 9.566178905045154e-07, + "loss": 0.8074, + "step": 69892 + }, + { + "epoch": 0.8736968424210605, + "grad_norm": 6.370253562927246, + "learning_rate": 9.562454468563642e-07, + "loss": 0.7283, + "step": 69894 + }, + { + "epoch": 0.8737218430460761, + "grad_norm": 3.8998284339904785, + "learning_rate": 9.55873072084168e-07, + "loss": 1.1193, + "step": 69896 + }, + { + "epoch": 0.8737468436710918, + "grad_norm": 1.0488015413284302, + "learning_rate": 9.555007661907646e-07, + "loss": 0.4046, + "step": 69898 + }, + { + "epoch": 0.8737718442961074, + "grad_norm": 1.8544871807098389, + "learning_rate": 9.55128529178987e-07, + "loss": 0.2059, + "step": 69900 + }, + { + "epoch": 0.873796844921123, + "grad_norm": 3.8169500827789307, + "learning_rate": 9.54756361051672e-07, + "loss": 1.0807, + "step": 69902 + }, + { + "epoch": 0.8738218455461386, + "grad_norm": 3.3382740020751953, + "learning_rate": 9.54384261811655e-07, + "loss": 0.7856, + "step": 69904 + }, + { + "epoch": 0.8738468461711543, + "grad_norm": 2.6623005867004395, + "learning_rate": 9.54012231461766e-07, + "loss": 0.7421, + "step": 69906 + }, + { + "epoch": 0.8738718467961699, + "grad_norm": 8.264396667480469, + "learning_rate": 9.536402700048419e-07, + "loss": 0.1368, + "step": 69908 + }, + { + "epoch": 0.8738968474211856, + "grad_norm": 6.221904754638672, + "learning_rate": 9.532683774437134e-07, + "loss": 1.1773, + "step": 69910 + }, + { + "epoch": 0.8739218480462011, + "grad_norm": 3.9023919105529785, + "learning_rate": 9.52896553781214e-07, + "loss": 1.6944, + "step": 69912 + }, + { + "epoch": 0.8739468486712167, + "grad_norm": 6.2385406494140625, + "learning_rate": 9.525247990201747e-07, + "loss": 1.7155, + "step": 69914 + }, + { + "epoch": 0.8739718492962324, + "grad_norm": 0.21690592169761658, + "learning_rate": 9.521531131634265e-07, + "loss": 0.0057, + "step": 69916 + }, + { + "epoch": 0.873996849921248, + "grad_norm": 1.1097928285598755, + "learning_rate": 9.517814962138039e-07, + "loss": 0.5489, + "step": 69918 + }, + { + "epoch": 0.8740218505462637, + "grad_norm": 4.322918891906738, + "learning_rate": 9.514099481741312e-07, + "loss": 0.8674, + "step": 69920 + }, + { + "epoch": 0.8740468511712792, + "grad_norm": 5.1042938232421875, + "learning_rate": 9.510384690472419e-07, + "loss": 1.3962, + "step": 69922 + }, + { + "epoch": 0.8740718517962949, + "grad_norm": 0.00039672304410487413, + "learning_rate": 9.506670588359623e-07, + "loss": 0.3214, + "step": 69924 + }, + { + "epoch": 0.8740968524213105, + "grad_norm": 3.896763324737549, + "learning_rate": 9.502957175431249e-07, + "loss": 1.4148, + "step": 69926 + }, + { + "epoch": 0.8741218530463262, + "grad_norm": 2.599395990371704, + "learning_rate": 9.499244451715528e-07, + "loss": 0.5235, + "step": 69928 + }, + { + "epoch": 0.8741468536713418, + "grad_norm": 5.662552356719971, + "learning_rate": 9.495532417240793e-07, + "loss": 2.2065, + "step": 69930 + }, + { + "epoch": 0.8741718542963574, + "grad_norm": 1.7770825624465942, + "learning_rate": 9.491821072035279e-07, + "loss": 0.2064, + "step": 69932 + }, + { + "epoch": 0.874196854921373, + "grad_norm": 2.5841543674468994, + "learning_rate": 9.488110416127239e-07, + "loss": 0.456, + "step": 69934 + }, + { + "epoch": 0.8742218555463886, + "grad_norm": 3.301711320877075, + "learning_rate": 9.484400449544973e-07, + "loss": 0.5376, + "step": 69936 + }, + { + "epoch": 0.8742468561714043, + "grad_norm": 4.421021938323975, + "learning_rate": 9.480691172316692e-07, + "loss": 2.0301, + "step": 69938 + }, + { + "epoch": 0.8742718567964199, + "grad_norm": 0.00029622740112245083, + "learning_rate": 9.476982584470662e-07, + "loss": 0.5419, + "step": 69940 + }, + { + "epoch": 0.8742968574214355, + "grad_norm": 4.409088134765625, + "learning_rate": 9.47327468603515e-07, + "loss": 1.8871, + "step": 69942 + }, + { + "epoch": 0.8743218580464511, + "grad_norm": 3.9295897483825684, + "learning_rate": 9.469567477038377e-07, + "loss": 0.252, + "step": 69944 + }, + { + "epoch": 0.8743468586714668, + "grad_norm": 2.600904941558838, + "learning_rate": 9.465860957508577e-07, + "loss": 1.2576, + "step": 69946 + }, + { + "epoch": 0.8743718592964824, + "grad_norm": 2.3560972213745117, + "learning_rate": 9.46215512747396e-07, + "loss": 1.0662, + "step": 69948 + }, + { + "epoch": 0.8743968599214981, + "grad_norm": 6.072525978088379, + "learning_rate": 9.458449986962792e-07, + "loss": 0.2521, + "step": 69950 + }, + { + "epoch": 0.8744218605465136, + "grad_norm": 3.90557861328125, + "learning_rate": 9.45474553600324e-07, + "loss": 0.7494, + "step": 69952 + }, + { + "epoch": 0.8744468611715293, + "grad_norm": 3.094810962677002, + "learning_rate": 9.451041774623538e-07, + "loss": 1.4696, + "step": 69954 + }, + { + "epoch": 0.8744718617965449, + "grad_norm": 10.63254165649414, + "learning_rate": 9.447338702851938e-07, + "loss": 1.366, + "step": 69956 + }, + { + "epoch": 0.8744968624215606, + "grad_norm": 2.6780242919921875, + "learning_rate": 9.443636320716565e-07, + "loss": 0.4788, + "step": 69958 + }, + { + "epoch": 0.8745218630465762, + "grad_norm": 3.8894338607788086, + "learning_rate": 9.439934628245661e-07, + "loss": 2.0304, + "step": 69960 + }, + { + "epoch": 0.8745468636715917, + "grad_norm": 3.616151809692383, + "learning_rate": 9.436233625467406e-07, + "loss": 0.3215, + "step": 69962 + }, + { + "epoch": 0.8745718642966074, + "grad_norm": 0.25447842478752136, + "learning_rate": 9.432533312409998e-07, + "loss": 0.8619, + "step": 69964 + }, + { + "epoch": 0.874596864921623, + "grad_norm": 2.5353243350982666, + "learning_rate": 9.428833689101591e-07, + "loss": 0.9586, + "step": 69966 + }, + { + "epoch": 0.8746218655466387, + "grad_norm": 1.6010537147521973, + "learning_rate": 9.425134755570375e-07, + "loss": 0.5338, + "step": 69968 + }, + { + "epoch": 0.8746468661716543, + "grad_norm": 2.8241686820983887, + "learning_rate": 9.421436511844573e-07, + "loss": 0.3692, + "step": 69970 + }, + { + "epoch": 0.8746718667966699, + "grad_norm": 1.9941319227218628, + "learning_rate": 9.417738957952249e-07, + "loss": 0.6843, + "step": 69972 + }, + { + "epoch": 0.8746968674216855, + "grad_norm": 2.736919641494751, + "learning_rate": 9.414042093921649e-07, + "loss": 1.0664, + "step": 69974 + }, + { + "epoch": 0.8747218680467012, + "grad_norm": 4.6119489669799805, + "learning_rate": 9.410345919780861e-07, + "loss": 2.2048, + "step": 69976 + }, + { + "epoch": 0.8747468686717168, + "grad_norm": 2.802344560623169, + "learning_rate": 9.406650435558085e-07, + "loss": 0.2502, + "step": 69978 + }, + { + "epoch": 0.8747718692967325, + "grad_norm": 4.315661907196045, + "learning_rate": 9.402955641281453e-07, + "loss": 2.006, + "step": 69980 + }, + { + "epoch": 0.874796869921748, + "grad_norm": 4.807710647583008, + "learning_rate": 9.399261536979098e-07, + "loss": 1.7488, + "step": 69982 + }, + { + "epoch": 0.8748218705467636, + "grad_norm": 3.1995792388916016, + "learning_rate": 9.395568122679166e-07, + "loss": 0.7746, + "step": 69984 + }, + { + "epoch": 0.8748468711717793, + "grad_norm": 2.6526718139648438, + "learning_rate": 9.391875398409756e-07, + "loss": 0.9116, + "step": 69986 + }, + { + "epoch": 0.8748718717967949, + "grad_norm": 4.718897342681885, + "learning_rate": 9.388183364199021e-07, + "loss": 0.6357, + "step": 69988 + }, + { + "epoch": 0.8748968724218106, + "grad_norm": 2.9974663257598877, + "learning_rate": 9.384492020075053e-07, + "loss": 0.4247, + "step": 69990 + }, + { + "epoch": 0.8749218730468261, + "grad_norm": 3.592388868331909, + "learning_rate": 9.380801366065973e-07, + "loss": 0.5739, + "step": 69992 + }, + { + "epoch": 0.8749468736718418, + "grad_norm": 4.724371910095215, + "learning_rate": 9.377111402199924e-07, + "loss": 1.1028, + "step": 69994 + }, + { + "epoch": 0.8749718742968574, + "grad_norm": 8.471739768981934, + "learning_rate": 9.373422128504972e-07, + "loss": 1.6234, + "step": 69996 + }, + { + "epoch": 0.8749968749218731, + "grad_norm": 4.515848636627197, + "learning_rate": 9.369733545009218e-07, + "loss": 1.3067, + "step": 69998 + }, + { + "epoch": 0.8750218755468887, + "grad_norm": 3.684385299682617, + "learning_rate": 9.366045651740751e-07, + "loss": 1.1377, + "step": 70000 + }, + { + "epoch": 0.8750468761719044, + "grad_norm": 10.46810531616211, + "learning_rate": 9.362358448727649e-07, + "loss": 1.2983, + "step": 70002 + }, + { + "epoch": 0.8750718767969199, + "grad_norm": 4.224719524383545, + "learning_rate": 9.358671935998043e-07, + "loss": 1.1154, + "step": 70004 + }, + { + "epoch": 0.8750968774219355, + "grad_norm": 4.012893199920654, + "learning_rate": 9.354986113579945e-07, + "loss": 0.4609, + "step": 70006 + }, + { + "epoch": 0.8751218780469512, + "grad_norm": 3.744143486022949, + "learning_rate": 9.351300981501465e-07, + "loss": 0.3855, + "step": 70008 + }, + { + "epoch": 0.8751468786719668, + "grad_norm": 4.291162967681885, + "learning_rate": 9.347616539790671e-07, + "loss": 0.6507, + "step": 70010 + }, + { + "epoch": 0.8751718792969825, + "grad_norm": 5.855499744415283, + "learning_rate": 9.343932788475607e-07, + "loss": 1.2628, + "step": 70012 + }, + { + "epoch": 0.875196879921998, + "grad_norm": 4.23696756362915, + "learning_rate": 9.340249727584316e-07, + "loss": 1.4973, + "step": 70014 + }, + { + "epoch": 0.8752218805470137, + "grad_norm": 3.8237667083740234, + "learning_rate": 9.336567357144854e-07, + "loss": 0.6789, + "step": 70016 + }, + { + "epoch": 0.8752468811720293, + "grad_norm": 0.00021719676442444324, + "learning_rate": 9.3328856771853e-07, + "loss": 0.8355, + "step": 70018 + }, + { + "epoch": 0.875271881797045, + "grad_norm": 3.051382064819336, + "learning_rate": 9.32920468773364e-07, + "loss": 0.711, + "step": 70020 + }, + { + "epoch": 0.8752968824220606, + "grad_norm": 0.0019458706956356764, + "learning_rate": 9.325524388817974e-07, + "loss": 0.314, + "step": 70022 + }, + { + "epoch": 0.8753218830470761, + "grad_norm": 4.157981872558594, + "learning_rate": 9.321844780466261e-07, + "loss": 1.1039, + "step": 70024 + }, + { + "epoch": 0.8753468836720918, + "grad_norm": 0.6500702500343323, + "learning_rate": 9.318165862706575e-07, + "loss": 1.1156, + "step": 70026 + }, + { + "epoch": 0.8753718842971074, + "grad_norm": 3.0152533054351807, + "learning_rate": 9.314487635566882e-07, + "loss": 1.0355, + "step": 70028 + }, + { + "epoch": 0.8753968849221231, + "grad_norm": 0.0035909635480493307, + "learning_rate": 9.31081009907524e-07, + "loss": 0.9116, + "step": 70030 + }, + { + "epoch": 0.8754218855471387, + "grad_norm": 3.2829840183258057, + "learning_rate": 9.307133253259659e-07, + "loss": 0.4001, + "step": 70032 + }, + { + "epoch": 0.8754468861721543, + "grad_norm": 3.2792959213256836, + "learning_rate": 9.303457098148106e-07, + "loss": 1.3567, + "step": 70034 + }, + { + "epoch": 0.8754718867971699, + "grad_norm": 0.0003918309521395713, + "learning_rate": 9.299781633768635e-07, + "loss": 1.0386, + "step": 70036 + }, + { + "epoch": 0.8754968874221856, + "grad_norm": 2.9976511001586914, + "learning_rate": 9.296106860149168e-07, + "loss": 0.6522, + "step": 70038 + }, + { + "epoch": 0.8755218880472012, + "grad_norm": 2.5232834815979004, + "learning_rate": 9.292432777317717e-07, + "loss": 0.7789, + "step": 70040 + }, + { + "epoch": 0.8755468886722169, + "grad_norm": 10.952726364135742, + "learning_rate": 9.288759385302293e-07, + "loss": 0.5244, + "step": 70042 + }, + { + "epoch": 0.8755718892972324, + "grad_norm": 2.621281147003174, + "learning_rate": 9.28508668413084e-07, + "loss": 0.6836, + "step": 70044 + }, + { + "epoch": 0.875596889922248, + "grad_norm": 2.3209798336029053, + "learning_rate": 9.281414673831346e-07, + "loss": 0.17, + "step": 70046 + }, + { + "epoch": 0.8756218905472637, + "grad_norm": 2.9535574913024902, + "learning_rate": 9.277743354431767e-07, + "loss": 1.5372, + "step": 70048 + }, + { + "epoch": 0.8756468911722793, + "grad_norm": 6.5572028160095215, + "learning_rate": 9.27407272596007e-07, + "loss": 1.1737, + "step": 70050 + }, + { + "epoch": 0.875671891797295, + "grad_norm": 3.4492251873016357, + "learning_rate": 9.270402788444189e-07, + "loss": 1.2286, + "step": 70052 + }, + { + "epoch": 0.8756968924223105, + "grad_norm": 0.4913478493690491, + "learning_rate": 9.266733541912088e-07, + "loss": 0.0149, + "step": 70054 + }, + { + "epoch": 0.8757218930473262, + "grad_norm": 0.0006607795367017388, + "learning_rate": 9.263064986391735e-07, + "loss": 0.3342, + "step": 70056 + }, + { + "epoch": 0.8757468936723418, + "grad_norm": 3.3351895809173584, + "learning_rate": 9.259397121911029e-07, + "loss": 0.8804, + "step": 70058 + }, + { + "epoch": 0.8757718942973575, + "grad_norm": 4.342390537261963, + "learning_rate": 9.255729948497938e-07, + "loss": 1.2276, + "step": 70060 + }, + { + "epoch": 0.8757968949223731, + "grad_norm": 6.436772346496582, + "learning_rate": 9.252063466180372e-07, + "loss": 0.85, + "step": 70062 + }, + { + "epoch": 0.8758218955473887, + "grad_norm": 2.764481782913208, + "learning_rate": 9.248397674986242e-07, + "loss": 0.1685, + "step": 70064 + }, + { + "epoch": 0.8758468961724043, + "grad_norm": 2.9506969451904297, + "learning_rate": 9.244732574943504e-07, + "loss": 0.0725, + "step": 70066 + }, + { + "epoch": 0.87587189679742, + "grad_norm": 0.0003906319907400757, + "learning_rate": 9.241068166080025e-07, + "loss": 0.7251, + "step": 70068 + }, + { + "epoch": 0.8758968974224356, + "grad_norm": 2.910102605819702, + "learning_rate": 9.237404448423759e-07, + "loss": 1.2003, + "step": 70070 + }, + { + "epoch": 0.8759218980474512, + "grad_norm": 2.5745534896850586, + "learning_rate": 9.233741422002574e-07, + "loss": 0.7533, + "step": 70072 + }, + { + "epoch": 0.8759468986724668, + "grad_norm": 5.3986101150512695, + "learning_rate": 9.230079086844401e-07, + "loss": 1.076, + "step": 70074 + }, + { + "epoch": 0.8759718992974824, + "grad_norm": 3.29846453666687, + "learning_rate": 9.226417442977087e-07, + "loss": 0.9093, + "step": 70076 + }, + { + "epoch": 0.8759968999224981, + "grad_norm": 3.4861655235290527, + "learning_rate": 9.22275649042853e-07, + "loss": 1.3504, + "step": 70078 + }, + { + "epoch": 0.8760219005475137, + "grad_norm": 3.3183586597442627, + "learning_rate": 9.219096229226643e-07, + "loss": 0.731, + "step": 70080 + }, + { + "epoch": 0.8760469011725294, + "grad_norm": 3.658670425415039, + "learning_rate": 9.215436659399268e-07, + "loss": 0.8358, + "step": 70082 + }, + { + "epoch": 0.8760719017975449, + "grad_norm": 0.615479052066803, + "learning_rate": 9.211777780974296e-07, + "loss": 0.0735, + "step": 70084 + }, + { + "epoch": 0.8760969024225606, + "grad_norm": 3.5890164375305176, + "learning_rate": 9.208119593979581e-07, + "loss": 0.5429, + "step": 70086 + }, + { + "epoch": 0.8761219030475762, + "grad_norm": 4.4629597663879395, + "learning_rate": 9.204462098443001e-07, + "loss": 1.3733, + "step": 70088 + }, + { + "epoch": 0.8761469036725918, + "grad_norm": 6.709796905517578, + "learning_rate": 9.200805294392401e-07, + "loss": 1.2191, + "step": 70090 + }, + { + "epoch": 0.8761719042976075, + "grad_norm": 6.440074920654297, + "learning_rate": 9.197149181855602e-07, + "loss": 0.8945, + "step": 70092 + }, + { + "epoch": 0.876196904922623, + "grad_norm": 6.027232646942139, + "learning_rate": 9.193493760860494e-07, + "loss": 0.9318, + "step": 70094 + }, + { + "epoch": 0.8762219055476387, + "grad_norm": 2.866309881210327, + "learning_rate": 9.189839031434888e-07, + "loss": 0.4504, + "step": 70096 + }, + { + "epoch": 0.8762469061726543, + "grad_norm": 2.4451982975006104, + "learning_rate": 9.186184993606639e-07, + "loss": 1.2091, + "step": 70098 + }, + { + "epoch": 0.87627190679767, + "grad_norm": 3.226323127746582, + "learning_rate": 9.182531647403558e-07, + "loss": 1.5939, + "step": 70100 + }, + { + "epoch": 0.8762969074226856, + "grad_norm": 0.008472807705402374, + "learning_rate": 9.178878992853457e-07, + "loss": 0.3902, + "step": 70102 + }, + { + "epoch": 0.8763219080477012, + "grad_norm": 2.044862985610962, + "learning_rate": 9.175227029984191e-07, + "loss": 0.4367, + "step": 70104 + }, + { + "epoch": 0.8763469086727168, + "grad_norm": 7.451045036315918, + "learning_rate": 9.171575758823536e-07, + "loss": 1.6461, + "step": 70106 + }, + { + "epoch": 0.8763719092977325, + "grad_norm": 5.696883678436279, + "learning_rate": 9.167925179399329e-07, + "loss": 1.9617, + "step": 70108 + }, + { + "epoch": 0.8763969099227481, + "grad_norm": 4.5501790046691895, + "learning_rate": 9.164275291739344e-07, + "loss": 1.3533, + "step": 70110 + }, + { + "epoch": 0.8764219105477637, + "grad_norm": 0.8581405282020569, + "learning_rate": 9.160626095871406e-07, + "loss": 0.0691, + "step": 70112 + }, + { + "epoch": 0.8764469111727793, + "grad_norm": 3.878326654434204, + "learning_rate": 9.156977591823302e-07, + "loss": 1.2372, + "step": 70114 + }, + { + "epoch": 0.8764719117977949, + "grad_norm": 0.0002671617839951068, + "learning_rate": 9.153329779622788e-07, + "loss": 0.8204, + "step": 70116 + }, + { + "epoch": 0.8764969124228106, + "grad_norm": 3.9716246128082275, + "learning_rate": 9.149682659297687e-07, + "loss": 1.6964, + "step": 70118 + }, + { + "epoch": 0.8765219130478262, + "grad_norm": 0.6572034955024719, + "learning_rate": 9.146036230875733e-07, + "loss": 0.7844, + "step": 70120 + }, + { + "epoch": 0.8765469136728419, + "grad_norm": 4.735570907592773, + "learning_rate": 9.142390494384734e-07, + "loss": 1.0904, + "step": 70122 + }, + { + "epoch": 0.8765719142978574, + "grad_norm": 2.822666645050049, + "learning_rate": 9.138745449852438e-07, + "loss": 1.4873, + "step": 70124 + }, + { + "epoch": 0.8765969149228731, + "grad_norm": 1.844651699066162, + "learning_rate": 9.13510109730661e-07, + "loss": 1.1329, + "step": 70126 + }, + { + "epoch": 0.8766219155478887, + "grad_norm": 2.5052316188812256, + "learning_rate": 9.131457436775015e-07, + "loss": 1.6759, + "step": 70128 + }, + { + "epoch": 0.8766469161729044, + "grad_norm": 4.932122707366943, + "learning_rate": 9.127814468285357e-07, + "loss": 1.7758, + "step": 70130 + }, + { + "epoch": 0.87667191679792, + "grad_norm": 3.5217950344085693, + "learning_rate": 9.124172191865443e-07, + "loss": 1.1762, + "step": 70132 + }, + { + "epoch": 0.8766969174229355, + "grad_norm": 7.184302806854248, + "learning_rate": 9.120530607542966e-07, + "loss": 1.5746, + "step": 70134 + }, + { + "epoch": 0.8767219180479512, + "grad_norm": 3.3249332904815674, + "learning_rate": 9.11688971534569e-07, + "loss": 0.277, + "step": 70136 + }, + { + "epoch": 0.8767469186729668, + "grad_norm": 5.817295551300049, + "learning_rate": 9.113249515301315e-07, + "loss": 1.3943, + "step": 70138 + }, + { + "epoch": 0.8767719192979825, + "grad_norm": 5.205615043640137, + "learning_rate": 9.109610007437586e-07, + "loss": 1.555, + "step": 70140 + }, + { + "epoch": 0.8767969199229981, + "grad_norm": 4.335294246673584, + "learning_rate": 9.105971191782226e-07, + "loss": 1.1568, + "step": 70142 + }, + { + "epoch": 0.8768219205480137, + "grad_norm": 5.203145503997803, + "learning_rate": 9.102333068362912e-07, + "loss": 1.0317, + "step": 70144 + }, + { + "epoch": 0.8768469211730293, + "grad_norm": 1.243251085281372, + "learning_rate": 9.0986956372074e-07, + "loss": 0.106, + "step": 70146 + }, + { + "epoch": 0.876871921798045, + "grad_norm": 4.330714702606201, + "learning_rate": 9.095058898343346e-07, + "loss": 1.0625, + "step": 70148 + }, + { + "epoch": 0.8768969224230606, + "grad_norm": 5.460206985473633, + "learning_rate": 9.091422851798482e-07, + "loss": 1.3265, + "step": 70150 + }, + { + "epoch": 0.8769219230480763, + "grad_norm": 2.012054920196533, + "learning_rate": 9.087787497600487e-07, + "loss": 0.4343, + "step": 70152 + }, + { + "epoch": 0.8769469236730918, + "grad_norm": 4.535928726196289, + "learning_rate": 9.084152835777038e-07, + "loss": 1.0284, + "step": 70154 + }, + { + "epoch": 0.8769719242981074, + "grad_norm": 1.739879846572876, + "learning_rate": 9.080518866355837e-07, + "loss": 0.7655, + "step": 70156 + }, + { + "epoch": 0.8769969249231231, + "grad_norm": 5.014541149139404, + "learning_rate": 9.076885589364525e-07, + "loss": 1.027, + "step": 70158 + }, + { + "epoch": 0.8770219255481387, + "grad_norm": 0.4942890405654907, + "learning_rate": 9.073253004830818e-07, + "loss": 1.1804, + "step": 70160 + }, + { + "epoch": 0.8770469261731544, + "grad_norm": 5.129858016967773, + "learning_rate": 9.069621112782334e-07, + "loss": 0.7971, + "step": 70162 + }, + { + "epoch": 0.8770719267981699, + "grad_norm": 4.249237060546875, + "learning_rate": 9.065989913246775e-07, + "loss": 1.3587, + "step": 70164 + }, + { + "epoch": 0.8770969274231856, + "grad_norm": 1.600527286529541, + "learning_rate": 9.062359406251775e-07, + "loss": 0.6077, + "step": 70166 + }, + { + "epoch": 0.8771219280482012, + "grad_norm": 4.449036598205566, + "learning_rate": 9.058729591824977e-07, + "loss": 1.7962, + "step": 70168 + }, + { + "epoch": 0.8771469286732169, + "grad_norm": 2.7532973289489746, + "learning_rate": 9.055100469994049e-07, + "loss": 0.8364, + "step": 70170 + }, + { + "epoch": 0.8771719292982325, + "grad_norm": 0.017969513311982155, + "learning_rate": 9.051472040786602e-07, + "loss": 0.3242, + "step": 70172 + }, + { + "epoch": 0.877196929923248, + "grad_norm": 2.229515790939331, + "learning_rate": 9.047844304230291e-07, + "loss": 0.7967, + "step": 70174 + }, + { + "epoch": 0.8772219305482637, + "grad_norm": 3.7570571899414062, + "learning_rate": 9.044217260352716e-07, + "loss": 0.9982, + "step": 70176 + }, + { + "epoch": 0.8772469311732793, + "grad_norm": 0.0035117408260703087, + "learning_rate": 9.040590909181546e-07, + "loss": 0.0001, + "step": 70178 + }, + { + "epoch": 0.877271931798295, + "grad_norm": 1.0808734893798828, + "learning_rate": 9.036965250744378e-07, + "loss": 0.6754, + "step": 70180 + }, + { + "epoch": 0.8772969324233106, + "grad_norm": 4.7234625816345215, + "learning_rate": 9.033340285068792e-07, + "loss": 0.5905, + "step": 70182 + }, + { + "epoch": 0.8773219330483262, + "grad_norm": 0.8140360713005066, + "learning_rate": 9.029716012182444e-07, + "loss": 0.5154, + "step": 70184 + }, + { + "epoch": 0.8773469336733418, + "grad_norm": 0.00038128026062622666, + "learning_rate": 9.026092432112888e-07, + "loss": 0.0916, + "step": 70186 + }, + { + "epoch": 0.8773719342983575, + "grad_norm": 3.3081934452056885, + "learning_rate": 9.02246954488778e-07, + "loss": 1.5053, + "step": 70188 + }, + { + "epoch": 0.8773969349233731, + "grad_norm": 3.597681999206543, + "learning_rate": 9.018847350534655e-07, + "loss": 1.0905, + "step": 70190 + }, + { + "epoch": 0.8774219355483888, + "grad_norm": 1.8944177627563477, + "learning_rate": 9.015225849081133e-07, + "loss": 0.1188, + "step": 70192 + }, + { + "epoch": 0.8774469361734043, + "grad_norm": 3.2545905113220215, + "learning_rate": 9.011605040554794e-07, + "loss": 0.4974, + "step": 70194 + }, + { + "epoch": 0.87747193679842, + "grad_norm": 3.1413581371307373, + "learning_rate": 9.007984924983182e-07, + "loss": 1.3763, + "step": 70196 + }, + { + "epoch": 0.8774969374234356, + "grad_norm": 3.4893248081207275, + "learning_rate": 9.004365502393908e-07, + "loss": 0.7478, + "step": 70198 + }, + { + "epoch": 0.8775219380484512, + "grad_norm": 0.16188651323318481, + "learning_rate": 9.000746772814495e-07, + "loss": 0.7691, + "step": 70200 + }, + { + "epoch": 0.8775469386734669, + "grad_norm": 0.015931833535432816, + "learning_rate": 8.997128736272554e-07, + "loss": 0.5809, + "step": 70202 + }, + { + "epoch": 0.8775719392984824, + "grad_norm": 2.89902400970459, + "learning_rate": 8.993511392795606e-07, + "loss": 0.8151, + "step": 70204 + }, + { + "epoch": 0.8775969399234981, + "grad_norm": 3.732205867767334, + "learning_rate": 8.989894742411188e-07, + "loss": 1.3997, + "step": 70206 + }, + { + "epoch": 0.8776219405485137, + "grad_norm": 0.019359730184078217, + "learning_rate": 8.986278785146884e-07, + "loss": 0.0003, + "step": 70208 + }, + { + "epoch": 0.8776469411735294, + "grad_norm": 4.675252914428711, + "learning_rate": 8.982663521030189e-07, + "loss": 1.221, + "step": 70210 + }, + { + "epoch": 0.877671941798545, + "grad_norm": 13.114442825317383, + "learning_rate": 8.979048950088665e-07, + "loss": 1.1203, + "step": 70212 + }, + { + "epoch": 0.8776969424235606, + "grad_norm": 2.576306104660034, + "learning_rate": 8.975435072349825e-07, + "loss": 0.6634, + "step": 70214 + }, + { + "epoch": 0.8777219430485762, + "grad_norm": 2.725947380065918, + "learning_rate": 8.971821887841226e-07, + "loss": 1.1455, + "step": 70216 + }, + { + "epoch": 0.8777469436735919, + "grad_norm": 3.7730371952056885, + "learning_rate": 8.968209396590344e-07, + "loss": 1.8208, + "step": 70218 + }, + { + "epoch": 0.8777719442986075, + "grad_norm": 2.300919532775879, + "learning_rate": 8.964597598624691e-07, + "loss": 1.4814, + "step": 70220 + }, + { + "epoch": 0.8777969449236231, + "grad_norm": 4.232408046722412, + "learning_rate": 8.960986493971813e-07, + "loss": 1.2173, + "step": 70222 + }, + { + "epoch": 0.8778219455486387, + "grad_norm": 3.5725326538085938, + "learning_rate": 8.957376082659175e-07, + "loss": 0.2758, + "step": 70224 + }, + { + "epoch": 0.8778469461736543, + "grad_norm": 0.7581161260604858, + "learning_rate": 8.953766364714289e-07, + "loss": 0.04, + "step": 70226 + }, + { + "epoch": 0.87787194679867, + "grad_norm": 7.9717230796813965, + "learning_rate": 8.950157340164634e-07, + "loss": 1.8435, + "step": 70228 + }, + { + "epoch": 0.8778969474236856, + "grad_norm": 5.697115421295166, + "learning_rate": 8.946549009037719e-07, + "loss": 0.4561, + "step": 70230 + }, + { + "epoch": 0.8779219480487013, + "grad_norm": 4.1382598876953125, + "learning_rate": 8.942941371361013e-07, + "loss": 0.5621, + "step": 70232 + }, + { + "epoch": 0.8779469486737168, + "grad_norm": 3.0971620082855225, + "learning_rate": 8.939334427161972e-07, + "loss": 0.9586, + "step": 70234 + }, + { + "epoch": 0.8779719492987325, + "grad_norm": 2.5388834476470947, + "learning_rate": 8.935728176468106e-07, + "loss": 0.3322, + "step": 70236 + }, + { + "epoch": 0.8779969499237481, + "grad_norm": 3.6106040477752686, + "learning_rate": 8.932122619306826e-07, + "loss": 0.6934, + "step": 70238 + }, + { + "epoch": 0.8780219505487638, + "grad_norm": 4.235141277313232, + "learning_rate": 8.928517755705646e-07, + "loss": 0.6014, + "step": 70240 + }, + { + "epoch": 0.8780469511737794, + "grad_norm": 4.199860095977783, + "learning_rate": 8.924913585691985e-07, + "loss": 1.1699, + "step": 70242 + }, + { + "epoch": 0.8780719517987949, + "grad_norm": 1.21963632106781, + "learning_rate": 8.921310109293313e-07, + "loss": 1.097, + "step": 70244 + }, + { + "epoch": 0.8780969524238106, + "grad_norm": 2.876838445663452, + "learning_rate": 8.917707326537062e-07, + "loss": 0.7502, + "step": 70246 + }, + { + "epoch": 0.8781219530488262, + "grad_norm": 3.7125368118286133, + "learning_rate": 8.914105237450654e-07, + "loss": 1.285, + "step": 70248 + }, + { + "epoch": 0.8781469536738419, + "grad_norm": 3.2839972972869873, + "learning_rate": 8.910503842061569e-07, + "loss": 1.2022, + "step": 70250 + }, + { + "epoch": 0.8781719542988575, + "grad_norm": 2.325507879257202, + "learning_rate": 8.906903140397172e-07, + "loss": 0.2128, + "step": 70252 + }, + { + "epoch": 0.8781969549238731, + "grad_norm": 3.801605701446533, + "learning_rate": 8.903303132484952e-07, + "loss": 1.4355, + "step": 70254 + }, + { + "epoch": 0.8782219555488887, + "grad_norm": 3.1526920795440674, + "learning_rate": 8.899703818352268e-07, + "loss": 0.6343, + "step": 70256 + }, + { + "epoch": 0.8782469561739044, + "grad_norm": 0.31704840064048767, + "learning_rate": 8.896105198026573e-07, + "loss": 0.6443, + "step": 70258 + }, + { + "epoch": 0.87827195679892, + "grad_norm": 1.8696916103363037, + "learning_rate": 8.892507271535256e-07, + "loss": 1.7936, + "step": 70260 + }, + { + "epoch": 0.8782969574239357, + "grad_norm": 2.0074992179870605, + "learning_rate": 8.888910038905707e-07, + "loss": 0.9473, + "step": 70262 + }, + { + "epoch": 0.8783219580489512, + "grad_norm": 2.4419143199920654, + "learning_rate": 8.885313500165349e-07, + "loss": 0.901, + "step": 70264 + }, + { + "epoch": 0.8783469586739668, + "grad_norm": 0.0002486797166056931, + "learning_rate": 8.881717655341538e-07, + "loss": 0.72, + "step": 70266 + }, + { + "epoch": 0.8783719592989825, + "grad_norm": 1.162025809288025, + "learning_rate": 8.878122504461705e-07, + "loss": 0.204, + "step": 70268 + }, + { + "epoch": 0.8783969599239981, + "grad_norm": 3.497360944747925, + "learning_rate": 8.874528047553199e-07, + "loss": 0.6777, + "step": 70270 + }, + { + "epoch": 0.8784219605490138, + "grad_norm": 3.7060370445251465, + "learning_rate": 8.870934284643396e-07, + "loss": 1.3914, + "step": 70272 + }, + { + "epoch": 0.8784469611740293, + "grad_norm": 3.319380521774292, + "learning_rate": 8.867341215759673e-07, + "loss": 0.6691, + "step": 70274 + }, + { + "epoch": 0.878471961799045, + "grad_norm": 4.415209770202637, + "learning_rate": 8.863748840929387e-07, + "loss": 2.1013, + "step": 70276 + }, + { + "epoch": 0.8784969624240606, + "grad_norm": 12.579564094543457, + "learning_rate": 8.860157160179906e-07, + "loss": 0.1518, + "step": 70278 + }, + { + "epoch": 0.8785219630490763, + "grad_norm": 3.219996929168701, + "learning_rate": 8.856566173538572e-07, + "loss": 0.327, + "step": 70280 + }, + { + "epoch": 0.8785469636740919, + "grad_norm": 3.9016287326812744, + "learning_rate": 8.852975881032755e-07, + "loss": 1.2999, + "step": 70282 + }, + { + "epoch": 0.8785719642991074, + "grad_norm": 6.484681606292725, + "learning_rate": 8.849386282689787e-07, + "loss": 1.8531, + "step": 70284 + }, + { + "epoch": 0.8785969649241231, + "grad_norm": 5.175666332244873, + "learning_rate": 8.84579737853698e-07, + "loss": 1.8864, + "step": 70286 + }, + { + "epoch": 0.8786219655491387, + "grad_norm": 3.2640774250030518, + "learning_rate": 8.842209168601712e-07, + "loss": 0.3971, + "step": 70288 + }, + { + "epoch": 0.8786469661741544, + "grad_norm": 5.746345520019531, + "learning_rate": 8.838621652911272e-07, + "loss": 0.9064, + "step": 70290 + }, + { + "epoch": 0.87867196679917, + "grad_norm": 1.9113833904266357, + "learning_rate": 8.835034831493017e-07, + "loss": 0.2349, + "step": 70292 + }, + { + "epoch": 0.8786969674241856, + "grad_norm": 0.000575618352741003, + "learning_rate": 8.831448704374224e-07, + "loss": 0.1388, + "step": 70294 + }, + { + "epoch": 0.8787219680492012, + "grad_norm": 3.612034797668457, + "learning_rate": 8.827863271582237e-07, + "loss": 0.8267, + "step": 70296 + }, + { + "epoch": 0.8787469686742169, + "grad_norm": 5.246352195739746, + "learning_rate": 8.824278533144359e-07, + "loss": 1.5581, + "step": 70298 + }, + { + "epoch": 0.8787719692992325, + "grad_norm": 4.167449474334717, + "learning_rate": 8.820694489087856e-07, + "loss": 0.5654, + "step": 70300 + }, + { + "epoch": 0.8787969699242482, + "grad_norm": 5.911131381988525, + "learning_rate": 8.817111139440082e-07, + "loss": 1.379, + "step": 70302 + }, + { + "epoch": 0.8788219705492637, + "grad_norm": 4.127119541168213, + "learning_rate": 8.813528484228262e-07, + "loss": 1.8142, + "step": 70304 + }, + { + "epoch": 0.8788469711742793, + "grad_norm": 0.9427717328071594, + "learning_rate": 8.809946523479739e-07, + "loss": 1.3329, + "step": 70306 + }, + { + "epoch": 0.878871971799295, + "grad_norm": 5.428214073181152, + "learning_rate": 8.806365257221749e-07, + "loss": 1.1524, + "step": 70308 + }, + { + "epoch": 0.8788969724243106, + "grad_norm": 0.0004261624999344349, + "learning_rate": 8.802784685481602e-07, + "loss": 1.0912, + "step": 70310 + }, + { + "epoch": 0.8789219730493263, + "grad_norm": 4.901113510131836, + "learning_rate": 8.799204808286554e-07, + "loss": 0.9559, + "step": 70312 + }, + { + "epoch": 0.8789469736743418, + "grad_norm": 3.045623540878296, + "learning_rate": 8.79562562566385e-07, + "loss": 1.2453, + "step": 70314 + }, + { + "epoch": 0.8789719742993575, + "grad_norm": 0.6292222142219543, + "learning_rate": 8.792047137640769e-07, + "loss": 0.134, + "step": 70316 + }, + { + "epoch": 0.8789969749243731, + "grad_norm": 1.7828378677368164, + "learning_rate": 8.788469344244555e-07, + "loss": 0.7967, + "step": 70318 + }, + { + "epoch": 0.8790219755493888, + "grad_norm": 3.054581880569458, + "learning_rate": 8.784892245502452e-07, + "loss": 0.5141, + "step": 70320 + }, + { + "epoch": 0.8790469761744044, + "grad_norm": 2.6759402751922607, + "learning_rate": 8.781315841441751e-07, + "loss": 0.9343, + "step": 70322 + }, + { + "epoch": 0.87907197679942, + "grad_norm": 6.749361991882324, + "learning_rate": 8.777740132089608e-07, + "loss": 1.3985, + "step": 70324 + }, + { + "epoch": 0.8790969774244356, + "grad_norm": 5.275951862335205, + "learning_rate": 8.774165117473321e-07, + "loss": 2.6542, + "step": 70326 + }, + { + "epoch": 0.8791219780494512, + "grad_norm": 2.7859926223754883, + "learning_rate": 8.770590797620071e-07, + "loss": 1.0939, + "step": 70328 + }, + { + "epoch": 0.8791469786744669, + "grad_norm": 3.6294350624084473, + "learning_rate": 8.767017172557113e-07, + "loss": 1.5435, + "step": 70330 + }, + { + "epoch": 0.8791719792994825, + "grad_norm": 0.00051023910054937, + "learning_rate": 8.763444242311636e-07, + "loss": 0.3626, + "step": 70332 + }, + { + "epoch": 0.8791969799244981, + "grad_norm": 1.9350694417953491, + "learning_rate": 8.759872006910864e-07, + "loss": 0.7235, + "step": 70334 + }, + { + "epoch": 0.8792219805495137, + "grad_norm": 3.4843037128448486, + "learning_rate": 8.756300466382051e-07, + "loss": 1.1696, + "step": 70336 + }, + { + "epoch": 0.8792469811745294, + "grad_norm": 0.0013220208929851651, + "learning_rate": 8.752729620752298e-07, + "loss": 1.2035, + "step": 70338 + }, + { + "epoch": 0.879271981799545, + "grad_norm": 5.977605819702148, + "learning_rate": 8.749159470048885e-07, + "loss": 0.7874, + "step": 70340 + }, + { + "epoch": 0.8792969824245607, + "grad_norm": 3.4949681758880615, + "learning_rate": 8.745590014298944e-07, + "loss": 0.873, + "step": 70342 + }, + { + "epoch": 0.8793219830495762, + "grad_norm": 2.95774245262146, + "learning_rate": 8.742021253529687e-07, + "loss": 1.12, + "step": 70344 + }, + { + "epoch": 0.8793469836745919, + "grad_norm": 2.9767513275146484, + "learning_rate": 8.738453187768315e-07, + "loss": 1.9478, + "step": 70346 + }, + { + "epoch": 0.8793719842996075, + "grad_norm": 0.00025035199359990656, + "learning_rate": 8.73488581704196e-07, + "loss": 0.6895, + "step": 70348 + }, + { + "epoch": 0.8793969849246231, + "grad_norm": 4.518837928771973, + "learning_rate": 8.731319141377825e-07, + "loss": 0.7661, + "step": 70350 + }, + { + "epoch": 0.8794219855496388, + "grad_norm": 5.27939510345459, + "learning_rate": 8.727753160803032e-07, + "loss": 0.8509, + "step": 70352 + }, + { + "epoch": 0.8794469861746543, + "grad_norm": 4.987534999847412, + "learning_rate": 8.724187875344791e-07, + "loss": 1.6072, + "step": 70354 + }, + { + "epoch": 0.87947198679967, + "grad_norm": 3.2155909538269043, + "learning_rate": 8.720623285030194e-07, + "loss": 0.6566, + "step": 70356 + }, + { + "epoch": 0.8794969874246856, + "grad_norm": 3.91645884513855, + "learning_rate": 8.717059389886439e-07, + "loss": 1.056, + "step": 70358 + }, + { + "epoch": 0.8795219880497013, + "grad_norm": 4.355213165283203, + "learning_rate": 8.713496189940662e-07, + "loss": 1.4879, + "step": 70360 + }, + { + "epoch": 0.8795469886747169, + "grad_norm": 2.912523031234741, + "learning_rate": 8.709933685219984e-07, + "loss": 0.5849, + "step": 70362 + }, + { + "epoch": 0.8795719892997325, + "grad_norm": 4.074985027313232, + "learning_rate": 8.70637187575154e-07, + "loss": 1.0762, + "step": 70364 + }, + { + "epoch": 0.8795969899247481, + "grad_norm": 5.6404948234558105, + "learning_rate": 8.702810761562452e-07, + "loss": 1.7819, + "step": 70366 + }, + { + "epoch": 0.8796219905497638, + "grad_norm": 2.2649474143981934, + "learning_rate": 8.699250342679854e-07, + "loss": 1.0991, + "step": 70368 + }, + { + "epoch": 0.8796469911747794, + "grad_norm": 4.33589506149292, + "learning_rate": 8.695690619130836e-07, + "loss": 1.3791, + "step": 70370 + }, + { + "epoch": 0.879671991799795, + "grad_norm": 6.490644454956055, + "learning_rate": 8.692131590942532e-07, + "loss": 2.1019, + "step": 70372 + }, + { + "epoch": 0.8796969924248106, + "grad_norm": 0.0009464036556892097, + "learning_rate": 8.688573258142074e-07, + "loss": 0.4321, + "step": 70374 + }, + { + "epoch": 0.8797219930498262, + "grad_norm": 6.63519811630249, + "learning_rate": 8.685015620756499e-07, + "loss": 1.8665, + "step": 70376 + }, + { + "epoch": 0.8797469936748419, + "grad_norm": 3.606773614883423, + "learning_rate": 8.681458678812948e-07, + "loss": 1.0062, + "step": 70378 + }, + { + "epoch": 0.8797719942998575, + "grad_norm": 4.602218151092529, + "learning_rate": 8.67790243233847e-07, + "loss": 0.8673, + "step": 70380 + }, + { + "epoch": 0.8797969949248732, + "grad_norm": 3.0354435443878174, + "learning_rate": 8.674346881360184e-07, + "loss": 0.9731, + "step": 70382 + }, + { + "epoch": 0.8798219955498887, + "grad_norm": 1.8788753747940063, + "learning_rate": 8.670792025905162e-07, + "loss": 1.3881, + "step": 70384 + }, + { + "epoch": 0.8798469961749044, + "grad_norm": 3.490663528442383, + "learning_rate": 8.667237866000466e-07, + "loss": 0.6011, + "step": 70386 + }, + { + "epoch": 0.87987199679992, + "grad_norm": 2.168342351913452, + "learning_rate": 8.6636844016732e-07, + "loss": 0.4668, + "step": 70388 + }, + { + "epoch": 0.8798969974249357, + "grad_norm": 2.6737914085388184, + "learning_rate": 8.660131632950363e-07, + "loss": 0.8134, + "step": 70390 + }, + { + "epoch": 0.8799219980499513, + "grad_norm": 2.292457342147827, + "learning_rate": 8.656579559859068e-07, + "loss": 1.4593, + "step": 70392 + }, + { + "epoch": 0.8799469986749668, + "grad_norm": 3.3431310653686523, + "learning_rate": 8.653028182426326e-07, + "loss": 1.6581, + "step": 70394 + }, + { + "epoch": 0.8799719992999825, + "grad_norm": 3.250307321548462, + "learning_rate": 8.649477500679205e-07, + "loss": 1.4559, + "step": 70396 + }, + { + "epoch": 0.8799969999249981, + "grad_norm": 0.2408892959356308, + "learning_rate": 8.645927514644758e-07, + "loss": 0.2482, + "step": 70398 + }, + { + "epoch": 0.8800220005500138, + "grad_norm": 2.5086867809295654, + "learning_rate": 8.642378224350012e-07, + "loss": 0.9759, + "step": 70400 + }, + { + "epoch": 0.8800470011750294, + "grad_norm": 4.600602626800537, + "learning_rate": 8.638829629821998e-07, + "loss": 1.2488, + "step": 70402 + }, + { + "epoch": 0.880072001800045, + "grad_norm": 7.683218002319336, + "learning_rate": 8.635281731087719e-07, + "loss": 0.5821, + "step": 70404 + }, + { + "epoch": 0.8800970024250606, + "grad_norm": 0.47109049558639526, + "learning_rate": 8.631734528174207e-07, + "loss": 0.4918, + "step": 70406 + }, + { + "epoch": 0.8801220030500763, + "grad_norm": 0.012314965017139912, + "learning_rate": 8.628188021108508e-07, + "loss": 1.0157, + "step": 70408 + }, + { + "epoch": 0.8801470036750919, + "grad_norm": 2.774388074874878, + "learning_rate": 8.624642209917588e-07, + "loss": 0.6173, + "step": 70410 + }, + { + "epoch": 0.8801720043001076, + "grad_norm": 4.7761759757995605, + "learning_rate": 8.621097094628494e-07, + "loss": 1.4886, + "step": 70412 + }, + { + "epoch": 0.8801970049251231, + "grad_norm": 2.452359676361084, + "learning_rate": 8.617552675268204e-07, + "loss": 1.1139, + "step": 70414 + }, + { + "epoch": 0.8802220055501387, + "grad_norm": 0.0017778719775378704, + "learning_rate": 8.614008951863695e-07, + "loss": 0.5566, + "step": 70416 + }, + { + "epoch": 0.8802470061751544, + "grad_norm": 2.1589667797088623, + "learning_rate": 8.610465924441969e-07, + "loss": 1.2904, + "step": 70418 + }, + { + "epoch": 0.88027200680017, + "grad_norm": 2.640003204345703, + "learning_rate": 8.606923593030003e-07, + "loss": 0.3708, + "step": 70420 + }, + { + "epoch": 0.8802970074251857, + "grad_norm": 0.3429153561592102, + "learning_rate": 8.6033819576548e-07, + "loss": 1.0052, + "step": 70422 + }, + { + "epoch": 0.8803220080502012, + "grad_norm": 0.002451214473694563, + "learning_rate": 8.599841018343303e-07, + "loss": 0.676, + "step": 70424 + }, + { + "epoch": 0.8803470086752169, + "grad_norm": 4.249973297119141, + "learning_rate": 8.596300775122523e-07, + "loss": 1.1966, + "step": 70426 + }, + { + "epoch": 0.8803720093002325, + "grad_norm": 5.180800437927246, + "learning_rate": 8.592761228019353e-07, + "loss": 0.4325, + "step": 70428 + }, + { + "epoch": 0.8803970099252482, + "grad_norm": 3.257089376449585, + "learning_rate": 8.589222377060813e-07, + "loss": 0.7478, + "step": 70430 + }, + { + "epoch": 0.8804220105502638, + "grad_norm": 2.3354263305664062, + "learning_rate": 8.585684222273804e-07, + "loss": 0.3945, + "step": 70432 + }, + { + "epoch": 0.8804470111752793, + "grad_norm": 0.36415717005729675, + "learning_rate": 8.582146763685295e-07, + "loss": 1.3484, + "step": 70434 + }, + { + "epoch": 0.880472011800295, + "grad_norm": 1.0334975719451904, + "learning_rate": 8.578610001322252e-07, + "loss": 0.6331, + "step": 70436 + }, + { + "epoch": 0.8804970124253106, + "grad_norm": 14.144158363342285, + "learning_rate": 8.575073935211564e-07, + "loss": 0.8275, + "step": 70438 + }, + { + "epoch": 0.8805220130503263, + "grad_norm": 4.583415508270264, + "learning_rate": 8.571538565380222e-07, + "loss": 1.0666, + "step": 70440 + }, + { + "epoch": 0.8805470136753419, + "grad_norm": 1.569581389427185, + "learning_rate": 8.568003891855081e-07, + "loss": 0.3075, + "step": 70442 + }, + { + "epoch": 0.8805720143003575, + "grad_norm": 2.9529762268066406, + "learning_rate": 8.564469914663087e-07, + "loss": 0.8918, + "step": 70444 + }, + { + "epoch": 0.8805970149253731, + "grad_norm": 1.7758934497833252, + "learning_rate": 8.560936633831185e-07, + "loss": 0.1564, + "step": 70446 + }, + { + "epoch": 0.8806220155503888, + "grad_norm": 0.5101172924041748, + "learning_rate": 8.557404049386243e-07, + "loss": 0.0354, + "step": 70448 + }, + { + "epoch": 0.8806470161754044, + "grad_norm": 3.395033597946167, + "learning_rate": 8.553872161355192e-07, + "loss": 1.29, + "step": 70450 + }, + { + "epoch": 0.8806720168004201, + "grad_norm": 2.620054244995117, + "learning_rate": 8.550340969764914e-07, + "loss": 0.6247, + "step": 70452 + }, + { + "epoch": 0.8806970174254356, + "grad_norm": 3.427764892578125, + "learning_rate": 8.546810474642308e-07, + "loss": 1.414, + "step": 70454 + }, + { + "epoch": 0.8807220180504512, + "grad_norm": 4.35736608505249, + "learning_rate": 8.543280676014253e-07, + "loss": 0.2222, + "step": 70456 + }, + { + "epoch": 0.8807470186754669, + "grad_norm": 2.9855918884277344, + "learning_rate": 8.53975157390764e-07, + "loss": 1.4914, + "step": 70458 + }, + { + "epoch": 0.8807720193004825, + "grad_norm": 12.311464309692383, + "learning_rate": 8.536223168349355e-07, + "loss": 1.686, + "step": 70460 + }, + { + "epoch": 0.8807970199254982, + "grad_norm": 3.6347312927246094, + "learning_rate": 8.532695459366236e-07, + "loss": 0.8355, + "step": 70462 + }, + { + "epoch": 0.8808220205505137, + "grad_norm": 7.7467427253723145, + "learning_rate": 8.529168446985204e-07, + "loss": 1.7355, + "step": 70464 + }, + { + "epoch": 0.8808470211755294, + "grad_norm": 0.8780310153961182, + "learning_rate": 8.525642131233091e-07, + "loss": 0.4538, + "step": 70466 + }, + { + "epoch": 0.880872021800545, + "grad_norm": 3.5196566581726074, + "learning_rate": 8.522116512136724e-07, + "loss": 1.7585, + "step": 70468 + }, + { + "epoch": 0.8808970224255607, + "grad_norm": 4.385420799255371, + "learning_rate": 8.518591589723002e-07, + "loss": 1.4257, + "step": 70470 + }, + { + "epoch": 0.8809220230505763, + "grad_norm": 3.060286521911621, + "learning_rate": 8.515067364018736e-07, + "loss": 1.4589, + "step": 70472 + }, + { + "epoch": 0.8809470236755919, + "grad_norm": 3.4189889430999756, + "learning_rate": 8.511543835050795e-07, + "loss": 1.3865, + "step": 70474 + }, + { + "epoch": 0.8809720243006075, + "grad_norm": 7.917154312133789, + "learning_rate": 8.50802100284599e-07, + "loss": 1.0453, + "step": 70476 + }, + { + "epoch": 0.8809970249256232, + "grad_norm": 2.8971800804138184, + "learning_rate": 8.504498867431177e-07, + "loss": 2.0447, + "step": 70478 + }, + { + "epoch": 0.8810220255506388, + "grad_norm": 0.9806902408599854, + "learning_rate": 8.500977428833135e-07, + "loss": 0.5591, + "step": 70480 + }, + { + "epoch": 0.8810470261756544, + "grad_norm": 0.00037786521716043353, + "learning_rate": 8.497456687078709e-07, + "loss": 0.1759, + "step": 70482 + }, + { + "epoch": 0.88107202680067, + "grad_norm": 3.0436179637908936, + "learning_rate": 8.493936642194733e-07, + "loss": 1.7591, + "step": 70484 + }, + { + "epoch": 0.8810970274256856, + "grad_norm": 4.543521881103516, + "learning_rate": 8.490417294207987e-07, + "loss": 1.3081, + "step": 70486 + }, + { + "epoch": 0.8811220280507013, + "grad_norm": 1.3095952272415161, + "learning_rate": 8.48689864314528e-07, + "loss": 0.9297, + "step": 70488 + }, + { + "epoch": 0.8811470286757169, + "grad_norm": 7.268970966339111, + "learning_rate": 8.483380689033404e-07, + "loss": 1.0046, + "step": 70490 + }, + { + "epoch": 0.8811720293007326, + "grad_norm": 2.2953927516937256, + "learning_rate": 8.479863431899204e-07, + "loss": 0.9175, + "step": 70492 + }, + { + "epoch": 0.8811970299257481, + "grad_norm": 5.150190830230713, + "learning_rate": 8.47634687176937e-07, + "loss": 0.765, + "step": 70494 + }, + { + "epoch": 0.8812220305507638, + "grad_norm": 1.0475679636001587, + "learning_rate": 8.472831008670745e-07, + "loss": 0.6038, + "step": 70496 + }, + { + "epoch": 0.8812470311757794, + "grad_norm": 2.4314730167388916, + "learning_rate": 8.46931584263011e-07, + "loss": 0.4403, + "step": 70498 + }, + { + "epoch": 0.881272031800795, + "grad_norm": 1.7810558080673218, + "learning_rate": 8.46580137367421e-07, + "loss": 0.035, + "step": 70500 + }, + { + "epoch": 0.8812970324258107, + "grad_norm": 3.4231770038604736, + "learning_rate": 8.462287601829832e-07, + "loss": 0.9259, + "step": 70502 + }, + { + "epoch": 0.8813220330508262, + "grad_norm": 4.693436145782471, + "learning_rate": 8.458774527123736e-07, + "loss": 1.1721, + "step": 70504 + }, + { + "epoch": 0.8813470336758419, + "grad_norm": 0.00034149931161664426, + "learning_rate": 8.455262149582644e-07, + "loss": 0.7684, + "step": 70506 + }, + { + "epoch": 0.8813720343008575, + "grad_norm": 0.0030376394279301167, + "learning_rate": 8.451750469233344e-07, + "loss": 0.8657, + "step": 70508 + }, + { + "epoch": 0.8813970349258732, + "grad_norm": 0.00035831163404509425, + "learning_rate": 8.448239486102561e-07, + "loss": 1.1307, + "step": 70510 + }, + { + "epoch": 0.8814220355508888, + "grad_norm": 0.18566516041755676, + "learning_rate": 8.44472920021705e-07, + "loss": 0.1075, + "step": 70512 + }, + { + "epoch": 0.8814470361759044, + "grad_norm": 3.8249051570892334, + "learning_rate": 8.441219611603513e-07, + "loss": 1.3784, + "step": 70514 + }, + { + "epoch": 0.88147203680092, + "grad_norm": 2.336275815963745, + "learning_rate": 8.437710720288716e-07, + "loss": 0.6256, + "step": 70516 + }, + { + "epoch": 0.8814970374259357, + "grad_norm": 2.0450057983398438, + "learning_rate": 8.434202526299373e-07, + "loss": 0.3358, + "step": 70518 + }, + { + "epoch": 0.8815220380509513, + "grad_norm": 3.348902463912964, + "learning_rate": 8.430695029662173e-07, + "loss": 0.7992, + "step": 70520 + }, + { + "epoch": 0.881547038675967, + "grad_norm": 2.7455801963806152, + "learning_rate": 8.427188230403871e-07, + "loss": 0.6937, + "step": 70522 + }, + { + "epoch": 0.8815720393009825, + "grad_norm": 5.167927265167236, + "learning_rate": 8.423682128551136e-07, + "loss": 1.2126, + "step": 70524 + }, + { + "epoch": 0.8815970399259981, + "grad_norm": 3.8830912113189697, + "learning_rate": 8.420176724130713e-07, + "loss": 0.8041, + "step": 70526 + }, + { + "epoch": 0.8816220405510138, + "grad_norm": 9.68147087097168, + "learning_rate": 8.416672017169247e-07, + "loss": 1.0526, + "step": 70528 + }, + { + "epoch": 0.8816470411760294, + "grad_norm": 1.0759012699127197, + "learning_rate": 8.413168007693473e-07, + "loss": 0.5604, + "step": 70530 + }, + { + "epoch": 0.8816720418010451, + "grad_norm": 1.8576146364212036, + "learning_rate": 8.409664695730057e-07, + "loss": 0.8052, + "step": 70532 + }, + { + "epoch": 0.8816970424260606, + "grad_norm": 1.9937875270843506, + "learning_rate": 8.406162081305669e-07, + "loss": 0.8133, + "step": 70534 + }, + { + "epoch": 0.8817220430510763, + "grad_norm": 0.8577234148979187, + "learning_rate": 8.402660164447018e-07, + "loss": 0.3334, + "step": 70536 + }, + { + "epoch": 0.8817470436760919, + "grad_norm": 0.031122101470828056, + "learning_rate": 8.399158945180741e-07, + "loss": 0.0944, + "step": 70538 + }, + { + "epoch": 0.8817720443011076, + "grad_norm": 0.8686288595199585, + "learning_rate": 8.395658423533526e-07, + "loss": 1.0103, + "step": 70540 + }, + { + "epoch": 0.8817970449261232, + "grad_norm": 3.246584892272949, + "learning_rate": 8.392158599532008e-07, + "loss": 1.4631, + "step": 70542 + }, + { + "epoch": 0.8818220455511387, + "grad_norm": 4.485286712646484, + "learning_rate": 8.388659473202865e-07, + "loss": 1.3578, + "step": 70544 + }, + { + "epoch": 0.8818470461761544, + "grad_norm": 2.40891170501709, + "learning_rate": 8.385161044572743e-07, + "loss": 0.3668, + "step": 70546 + }, + { + "epoch": 0.88187204680117, + "grad_norm": 3.933889627456665, + "learning_rate": 8.381663313668265e-07, + "loss": 0.692, + "step": 70548 + }, + { + "epoch": 0.8818970474261857, + "grad_norm": 3.0038230419158936, + "learning_rate": 8.378166280516098e-07, + "loss": 0.8065, + "step": 70550 + }, + { + "epoch": 0.8819220480512013, + "grad_norm": 2.7755188941955566, + "learning_rate": 8.374669945142844e-07, + "loss": 0.4232, + "step": 70552 + }, + { + "epoch": 0.8819470486762169, + "grad_norm": 0.0003168147522956133, + "learning_rate": 8.371174307575159e-07, + "loss": 0.7763, + "step": 70554 + }, + { + "epoch": 0.8819720493012325, + "grad_norm": 2.8494720458984375, + "learning_rate": 8.367679367839632e-07, + "loss": 1.7699, + "step": 70556 + }, + { + "epoch": 0.8819970499262482, + "grad_norm": 2.222592353820801, + "learning_rate": 8.364185125962932e-07, + "loss": 0.5974, + "step": 70558 + }, + { + "epoch": 0.8820220505512638, + "grad_norm": 3.6713201999664307, + "learning_rate": 8.360691581971625e-07, + "loss": 0.2166, + "step": 70560 + }, + { + "epoch": 0.8820470511762795, + "grad_norm": 3.553880214691162, + "learning_rate": 8.357198735892324e-07, + "loss": 0.9003, + "step": 70562 + }, + { + "epoch": 0.882072051801295, + "grad_norm": 3.978362560272217, + "learning_rate": 8.353706587751653e-07, + "loss": 0.8066, + "step": 70564 + }, + { + "epoch": 0.8820970524263106, + "grad_norm": 4.522070407867432, + "learning_rate": 8.350215137576179e-07, + "loss": 0.8203, + "step": 70566 + }, + { + "epoch": 0.8821220530513263, + "grad_norm": 3.891186475753784, + "learning_rate": 8.346724385392512e-07, + "loss": 0.7525, + "step": 70568 + }, + { + "epoch": 0.8821470536763419, + "grad_norm": 2.3361318111419678, + "learning_rate": 8.343234331227246e-07, + "loss": 0.7406, + "step": 70570 + }, + { + "epoch": 0.8821720543013576, + "grad_norm": 0.0003426068869885057, + "learning_rate": 8.339744975106923e-07, + "loss": 0.6158, + "step": 70572 + }, + { + "epoch": 0.8821970549263731, + "grad_norm": 2.5365982055664062, + "learning_rate": 8.336256317058156e-07, + "loss": 0.7139, + "step": 70574 + }, + { + "epoch": 0.8822220555513888, + "grad_norm": 2.099382162094116, + "learning_rate": 8.33276835710748e-07, + "loss": 0.8088, + "step": 70576 + }, + { + "epoch": 0.8822470561764044, + "grad_norm": 5.095826148986816, + "learning_rate": 8.329281095281494e-07, + "loss": 0.9952, + "step": 70578 + }, + { + "epoch": 0.8822720568014201, + "grad_norm": 0.05333424732089043, + "learning_rate": 8.325794531606724e-07, + "loss": 0.5301, + "step": 70580 + }, + { + "epoch": 0.8822970574264357, + "grad_norm": 3.6322152614593506, + "learning_rate": 8.322308666109758e-07, + "loss": 0.884, + "step": 70582 + }, + { + "epoch": 0.8823220580514513, + "grad_norm": 0.9389338493347168, + "learning_rate": 8.318823498817119e-07, + "loss": 0.0542, + "step": 70584 + }, + { + "epoch": 0.8823470586764669, + "grad_norm": 0.000236164647503756, + "learning_rate": 8.315339029755343e-07, + "loss": 0.3088, + "step": 70586 + }, + { + "epoch": 0.8823720593014825, + "grad_norm": 0.8426515460014343, + "learning_rate": 8.311855258950996e-07, + "loss": 1.0239, + "step": 70588 + }, + { + "epoch": 0.8823970599264982, + "grad_norm": 3.289041757583618, + "learning_rate": 8.30837218643058e-07, + "loss": 0.3233, + "step": 70590 + }, + { + "epoch": 0.8824220605515138, + "grad_norm": 2.3957817554473877, + "learning_rate": 8.304889812220651e-07, + "loss": 0.3325, + "step": 70592 + }, + { + "epoch": 0.8824470611765294, + "grad_norm": 1.2781727313995361, + "learning_rate": 8.301408136347689e-07, + "loss": 1.3723, + "step": 70594 + }, + { + "epoch": 0.882472061801545, + "grad_norm": 3.4638450145721436, + "learning_rate": 8.29792715883826e-07, + "loss": 0.2803, + "step": 70596 + }, + { + "epoch": 0.8824970624265607, + "grad_norm": 3.0144991874694824, + "learning_rate": 8.294446879718853e-07, + "loss": 0.9264, + "step": 70598 + }, + { + "epoch": 0.8825220630515763, + "grad_norm": 4.51116418838501, + "learning_rate": 8.29096729901595e-07, + "loss": 0.3732, + "step": 70600 + }, + { + "epoch": 0.882547063676592, + "grad_norm": 3.4652297496795654, + "learning_rate": 8.287488416756084e-07, + "loss": 2.2762, + "step": 70602 + }, + { + "epoch": 0.8825720643016075, + "grad_norm": 3.4938669204711914, + "learning_rate": 8.284010232965723e-07, + "loss": 1.6206, + "step": 70604 + }, + { + "epoch": 0.8825970649266232, + "grad_norm": 3.383409023284912, + "learning_rate": 8.280532747671389e-07, + "loss": 1.6911, + "step": 70606 + }, + { + "epoch": 0.8826220655516388, + "grad_norm": 7.063632965087891, + "learning_rate": 8.277055960899527e-07, + "loss": 1.3072, + "step": 70608 + }, + { + "epoch": 0.8826470661766544, + "grad_norm": 2.393681049346924, + "learning_rate": 8.273579872676651e-07, + "loss": 0.3927, + "step": 70610 + }, + { + "epoch": 0.8826720668016701, + "grad_norm": 0.0023008554708212614, + "learning_rate": 8.270104483029229e-07, + "loss": 0.696, + "step": 70612 + }, + { + "epoch": 0.8826970674266856, + "grad_norm": 5.757070541381836, + "learning_rate": 8.266629791983693e-07, + "loss": 2.2659, + "step": 70614 + }, + { + "epoch": 0.8827220680517013, + "grad_norm": 0.9256977438926697, + "learning_rate": 8.263155799566547e-07, + "loss": 0.1257, + "step": 70616 + }, + { + "epoch": 0.8827470686767169, + "grad_norm": 12.933756828308105, + "learning_rate": 8.259682505804223e-07, + "loss": 1.8202, + "step": 70618 + }, + { + "epoch": 0.8827720693017326, + "grad_norm": 4.0290679931640625, + "learning_rate": 8.256209910723201e-07, + "loss": 1.0497, + "step": 70620 + }, + { + "epoch": 0.8827970699267482, + "grad_norm": 2.713503122329712, + "learning_rate": 8.252738014349914e-07, + "loss": 0.5339, + "step": 70622 + }, + { + "epoch": 0.8828220705517638, + "grad_norm": 5.606415271759033, + "learning_rate": 8.249266816710777e-07, + "loss": 2.0362, + "step": 70624 + }, + { + "epoch": 0.8828470711767794, + "grad_norm": 2.3643717765808105, + "learning_rate": 8.245796317832266e-07, + "loss": 0.2095, + "step": 70626 + }, + { + "epoch": 0.882872071801795, + "grad_norm": 1.028888463973999, + "learning_rate": 8.242326517740773e-07, + "loss": 0.1521, + "step": 70628 + }, + { + "epoch": 0.8828970724268107, + "grad_norm": 3.9312355518341064, + "learning_rate": 8.238857416462776e-07, + "loss": 1.168, + "step": 70630 + }, + { + "epoch": 0.8829220730518264, + "grad_norm": 2.877589464187622, + "learning_rate": 8.235389014024642e-07, + "loss": 0.8318, + "step": 70632 + }, + { + "epoch": 0.8829470736768419, + "grad_norm": 3.542896032333374, + "learning_rate": 8.231921310452817e-07, + "loss": 1.0144, + "step": 70634 + }, + { + "epoch": 0.8829720743018575, + "grad_norm": 8.878511428833008, + "learning_rate": 8.228454305773715e-07, + "loss": 1.5199, + "step": 70636 + }, + { + "epoch": 0.8829970749268732, + "grad_norm": 3.3913192749023438, + "learning_rate": 8.2249880000137e-07, + "loss": 0.1458, + "step": 70638 + }, + { + "epoch": 0.8830220755518888, + "grad_norm": 4.042184352874756, + "learning_rate": 8.221522393199221e-07, + "loss": 0.4685, + "step": 70640 + }, + { + "epoch": 0.8830470761769045, + "grad_norm": 3.755007743835449, + "learning_rate": 8.218057485356634e-07, + "loss": 0.9201, + "step": 70642 + }, + { + "epoch": 0.88307207680192, + "grad_norm": 5.4565205574035645, + "learning_rate": 8.214593276512351e-07, + "loss": 0.9423, + "step": 70644 + }, + { + "epoch": 0.8830970774269357, + "grad_norm": 4.381154537200928, + "learning_rate": 8.211129766692738e-07, + "loss": 2.0848, + "step": 70646 + }, + { + "epoch": 0.8831220780519513, + "grad_norm": 3.9201102256774902, + "learning_rate": 8.207666955924199e-07, + "loss": 1.019, + "step": 70648 + }, + { + "epoch": 0.883147078676967, + "grad_norm": 3.7110488414764404, + "learning_rate": 8.204204844233077e-07, + "loss": 1.3974, + "step": 70650 + }, + { + "epoch": 0.8831720793019826, + "grad_norm": 0.3266282081604004, + "learning_rate": 8.200743431645741e-07, + "loss": 0.7089, + "step": 70652 + }, + { + "epoch": 0.8831970799269981, + "grad_norm": 0.8489500284194946, + "learning_rate": 8.197282718188571e-07, + "loss": 0.7195, + "step": 70654 + }, + { + "epoch": 0.8832220805520138, + "grad_norm": 2.0382649898529053, + "learning_rate": 8.193822703887899e-07, + "loss": 0.6273, + "step": 70656 + }, + { + "epoch": 0.8832470811770294, + "grad_norm": 4.83976936340332, + "learning_rate": 8.190363388770117e-07, + "loss": 1.5879, + "step": 70658 + }, + { + "epoch": 0.8832720818020451, + "grad_norm": 4.376728057861328, + "learning_rate": 8.186904772861515e-07, + "loss": 0.9313, + "step": 70660 + }, + { + "epoch": 0.8832970824270607, + "grad_norm": 4.627411365509033, + "learning_rate": 8.183446856188493e-07, + "loss": 1.6994, + "step": 70662 + }, + { + "epoch": 0.8833220830520763, + "grad_norm": 2.80975341796875, + "learning_rate": 8.179989638777342e-07, + "loss": 0.6085, + "step": 70664 + }, + { + "epoch": 0.8833470836770919, + "grad_norm": 0.0018656650790944695, + "learning_rate": 8.176533120654395e-07, + "loss": 0.0331, + "step": 70666 + }, + { + "epoch": 0.8833720843021076, + "grad_norm": 0.00041273486567661166, + "learning_rate": 8.173077301846e-07, + "loss": 0.2578, + "step": 70668 + }, + { + "epoch": 0.8833970849271232, + "grad_norm": 3.7577767372131348, + "learning_rate": 8.169622182378456e-07, + "loss": 0.8679, + "step": 70670 + }, + { + "epoch": 0.8834220855521389, + "grad_norm": 5.09326696395874, + "learning_rate": 8.166167762278088e-07, + "loss": 0.9904, + "step": 70672 + }, + { + "epoch": 0.8834470861771544, + "grad_norm": 2.8855748176574707, + "learning_rate": 8.162714041571207e-07, + "loss": 0.5245, + "step": 70674 + }, + { + "epoch": 0.88347208680217, + "grad_norm": 1.6544766426086426, + "learning_rate": 8.159261020284093e-07, + "loss": 0.1562, + "step": 70676 + }, + { + "epoch": 0.8834970874271857, + "grad_norm": 3.9970948696136475, + "learning_rate": 8.155808698443068e-07, + "loss": 1.1103, + "step": 70678 + }, + { + "epoch": 0.8835220880522013, + "grad_norm": 3.3560123443603516, + "learning_rate": 8.152357076074402e-07, + "loss": 1.1828, + "step": 70680 + }, + { + "epoch": 0.883547088677217, + "grad_norm": 1.2434269189834595, + "learning_rate": 8.148906153204416e-07, + "loss": 0.9992, + "step": 70682 + }, + { + "epoch": 0.8835720893022325, + "grad_norm": 6.430785655975342, + "learning_rate": 8.145455929859359e-07, + "loss": 1.7721, + "step": 70684 + }, + { + "epoch": 0.8835970899272482, + "grad_norm": 5.091826915740967, + "learning_rate": 8.142006406065528e-07, + "loss": 1.7755, + "step": 70686 + }, + { + "epoch": 0.8836220905522638, + "grad_norm": 2.2226312160491943, + "learning_rate": 8.138557581849193e-07, + "loss": 1.1455, + "step": 70688 + }, + { + "epoch": 0.8836470911772795, + "grad_norm": 8.639128684997559, + "learning_rate": 8.135109457236601e-07, + "loss": 1.1311, + "step": 70690 + }, + { + "epoch": 0.8836720918022951, + "grad_norm": 4.4894537925720215, + "learning_rate": 8.131662032254029e-07, + "loss": 1.4177, + "step": 70692 + }, + { + "epoch": 0.8836970924273106, + "grad_norm": 2.479297161102295, + "learning_rate": 8.128215306927711e-07, + "loss": 0.6094, + "step": 70694 + }, + { + "epoch": 0.8837220930523263, + "grad_norm": 0.0003773312782868743, + "learning_rate": 8.12476928128394e-07, + "loss": 0.0004, + "step": 70696 + }, + { + "epoch": 0.8837470936773419, + "grad_norm": 3.479659080505371, + "learning_rate": 8.121323955348915e-07, + "loss": 0.8336, + "step": 70698 + }, + { + "epoch": 0.8837720943023576, + "grad_norm": 4.055294036865234, + "learning_rate": 8.117879329148903e-07, + "loss": 1.0942, + "step": 70700 + }, + { + "epoch": 0.8837970949273732, + "grad_norm": 3.303100109100342, + "learning_rate": 8.114435402710142e-07, + "loss": 0.5943, + "step": 70702 + }, + { + "epoch": 0.8838220955523888, + "grad_norm": 1.3364226818084717, + "learning_rate": 8.110992176058818e-07, + "loss": 0.7614, + "step": 70704 + }, + { + "epoch": 0.8838470961774044, + "grad_norm": 3.81866717338562, + "learning_rate": 8.107549649221202e-07, + "loss": 1.1786, + "step": 70706 + }, + { + "epoch": 0.8838720968024201, + "grad_norm": 3.4462504386901855, + "learning_rate": 8.104107822223484e-07, + "loss": 1.156, + "step": 70708 + }, + { + "epoch": 0.8838970974274357, + "grad_norm": 3.444758653640747, + "learning_rate": 8.100666695091897e-07, + "loss": 0.5303, + "step": 70710 + }, + { + "epoch": 0.8839220980524514, + "grad_norm": 3.492230176925659, + "learning_rate": 8.09722626785262e-07, + "loss": 1.4471, + "step": 70712 + }, + { + "epoch": 0.8839470986774669, + "grad_norm": 3.664999485015869, + "learning_rate": 8.093786540531889e-07, + "loss": 1.5677, + "step": 70714 + }, + { + "epoch": 0.8839720993024826, + "grad_norm": 3.872114896774292, + "learning_rate": 8.090347513155882e-07, + "loss": 1.2021, + "step": 70716 + }, + { + "epoch": 0.8839970999274982, + "grad_norm": 2.722804069519043, + "learning_rate": 8.086909185750769e-07, + "loss": 0.1256, + "step": 70718 + }, + { + "epoch": 0.8840221005525138, + "grad_norm": 3.3003594875335693, + "learning_rate": 8.083471558342781e-07, + "loss": 0.6676, + "step": 70720 + }, + { + "epoch": 0.8840471011775295, + "grad_norm": 2.338498115539551, + "learning_rate": 8.080034630958056e-07, + "loss": 0.967, + "step": 70722 + }, + { + "epoch": 0.884072101802545, + "grad_norm": 2.097339630126953, + "learning_rate": 8.076598403622793e-07, + "loss": 1.1377, + "step": 70724 + }, + { + "epoch": 0.8840971024275607, + "grad_norm": 2.15030837059021, + "learning_rate": 8.073162876363194e-07, + "loss": 0.2647, + "step": 70726 + }, + { + "epoch": 0.8841221030525763, + "grad_norm": 2.4531307220458984, + "learning_rate": 8.06972804920535e-07, + "loss": 1.126, + "step": 70728 + }, + { + "epoch": 0.884147103677592, + "grad_norm": 4.586704254150391, + "learning_rate": 8.066293922175472e-07, + "loss": 0.6554, + "step": 70730 + }, + { + "epoch": 0.8841721043026076, + "grad_norm": 2.2365806102752686, + "learning_rate": 8.062860495299674e-07, + "loss": 1.4046, + "step": 70732 + }, + { + "epoch": 0.8841971049276232, + "grad_norm": 1.9345694780349731, + "learning_rate": 8.059427768604156e-07, + "loss": 0.811, + "step": 70734 + }, + { + "epoch": 0.8842221055526388, + "grad_norm": 2.2112884521484375, + "learning_rate": 8.055995742115019e-07, + "loss": 0.8163, + "step": 70736 + }, + { + "epoch": 0.8842471061776545, + "grad_norm": 2.9023897647857666, + "learning_rate": 8.05256441585841e-07, + "loss": 1.1526, + "step": 70738 + }, + { + "epoch": 0.8842721068026701, + "grad_norm": 3.174349308013916, + "learning_rate": 8.049133789860508e-07, + "loss": 1.577, + "step": 70740 + }, + { + "epoch": 0.8842971074276857, + "grad_norm": 0.00043985265074297786, + "learning_rate": 8.045703864147358e-07, + "loss": 0.8851, + "step": 70742 + }, + { + "epoch": 0.8843221080527013, + "grad_norm": 6.099990367889404, + "learning_rate": 8.042274638745151e-07, + "loss": 1.3966, + "step": 70744 + }, + { + "epoch": 0.8843471086777169, + "grad_norm": 2.259536027908325, + "learning_rate": 8.038846113679955e-07, + "loss": 0.3216, + "step": 70746 + }, + { + "epoch": 0.8843721093027326, + "grad_norm": 5.342036724090576, + "learning_rate": 8.035418288977914e-07, + "loss": 0.9014, + "step": 70748 + }, + { + "epoch": 0.8843971099277482, + "grad_norm": 3.130765676498413, + "learning_rate": 8.03199116466511e-07, + "loss": 0.6962, + "step": 70750 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 2.5034022331237793, + "learning_rate": 8.028564740767675e-07, + "loss": 0.6845, + "step": 70752 + }, + { + "epoch": 0.8844471111777794, + "grad_norm": 1.7930959463119507, + "learning_rate": 8.025139017311678e-07, + "loss": 0.6706, + "step": 70754 + }, + { + "epoch": 0.8844721118027951, + "grad_norm": 2.836508274078369, + "learning_rate": 8.02171399432321e-07, + "loss": 0.8843, + "step": 70756 + }, + { + "epoch": 0.8844971124278107, + "grad_norm": 3.2369422912597656, + "learning_rate": 8.018289671828372e-07, + "loss": 0.2994, + "step": 70758 + }, + { + "epoch": 0.8845221130528264, + "grad_norm": 2.6699507236480713, + "learning_rate": 8.014866049853209e-07, + "loss": 0.6186, + "step": 70760 + }, + { + "epoch": 0.884547113677842, + "grad_norm": 1.227211356163025, + "learning_rate": 8.011443128423824e-07, + "loss": 1.1759, + "step": 70762 + }, + { + "epoch": 0.8845721143028575, + "grad_norm": 0.0004387452208902687, + "learning_rate": 8.008020907566305e-07, + "loss": 0.5529, + "step": 70764 + }, + { + "epoch": 0.8845971149278732, + "grad_norm": 3.1755714416503906, + "learning_rate": 8.004599387306677e-07, + "loss": 0.6994, + "step": 70766 + }, + { + "epoch": 0.8846221155528888, + "grad_norm": 1.5836775302886963, + "learning_rate": 8.00117856767102e-07, + "loss": 0.7572, + "step": 70768 + }, + { + "epoch": 0.8846471161779045, + "grad_norm": 3.3101696968078613, + "learning_rate": 7.997758448685355e-07, + "loss": 1.1867, + "step": 70770 + }, + { + "epoch": 0.8846721168029201, + "grad_norm": 2.3162877559661865, + "learning_rate": 7.994339030375775e-07, + "loss": 1.3318, + "step": 70772 + }, + { + "epoch": 0.8846971174279357, + "grad_norm": 5.77397346496582, + "learning_rate": 7.990920312768269e-07, + "loss": 1.7528, + "step": 70774 + }, + { + "epoch": 0.8847221180529513, + "grad_norm": 2.4609861373901367, + "learning_rate": 7.987502295888916e-07, + "loss": 1.1776, + "step": 70776 + }, + { + "epoch": 0.884747118677967, + "grad_norm": 4.5125837326049805, + "learning_rate": 7.98408497976374e-07, + "loss": 0.682, + "step": 70778 + }, + { + "epoch": 0.8847721193029826, + "grad_norm": 4.262731075286865, + "learning_rate": 7.980668364418765e-07, + "loss": 0.7534, + "step": 70780 + }, + { + "epoch": 0.8847971199279983, + "grad_norm": 3.358975648880005, + "learning_rate": 7.977252449880013e-07, + "loss": 0.3136, + "step": 70782 + }, + { + "epoch": 0.8848221205530138, + "grad_norm": 5.439634799957275, + "learning_rate": 7.973837236173465e-07, + "loss": 0.9137, + "step": 70784 + }, + { + "epoch": 0.8848471211780294, + "grad_norm": 6.127176761627197, + "learning_rate": 7.970422723325166e-07, + "loss": 1.0582, + "step": 70786 + }, + { + "epoch": 0.8848721218030451, + "grad_norm": 2.06589937210083, + "learning_rate": 7.967008911361129e-07, + "loss": 0.1077, + "step": 70788 + }, + { + "epoch": 0.8848971224280607, + "grad_norm": 4.419684886932373, + "learning_rate": 7.963595800307322e-07, + "loss": 0.4964, + "step": 70790 + }, + { + "epoch": 0.8849221230530764, + "grad_norm": 5.114164352416992, + "learning_rate": 7.96018339018978e-07, + "loss": 0.1434, + "step": 70792 + }, + { + "epoch": 0.8849471236780919, + "grad_norm": 3.867670774459839, + "learning_rate": 7.956771681034436e-07, + "loss": 0.1773, + "step": 70794 + }, + { + "epoch": 0.8849721243031076, + "grad_norm": 3.8331923484802246, + "learning_rate": 7.953360672867327e-07, + "loss": 0.81, + "step": 70796 + }, + { + "epoch": 0.8849971249281232, + "grad_norm": 1.9952656030654907, + "learning_rate": 7.949950365714387e-07, + "loss": 0.579, + "step": 70798 + }, + { + "epoch": 0.8850221255531389, + "grad_norm": 3.879779815673828, + "learning_rate": 7.946540759601595e-07, + "loss": 0.9627, + "step": 70800 + }, + { + "epoch": 0.8850471261781545, + "grad_norm": 5.314337730407715, + "learning_rate": 7.943131854554965e-07, + "loss": 1.7171, + "step": 70802 + }, + { + "epoch": 0.88507212680317, + "grad_norm": 0.6166759729385376, + "learning_rate": 7.939723650600395e-07, + "loss": 0.1215, + "step": 70804 + }, + { + "epoch": 0.8850971274281857, + "grad_norm": 2.2616183757781982, + "learning_rate": 7.936316147763912e-07, + "loss": 0.8961, + "step": 70806 + }, + { + "epoch": 0.8851221280532013, + "grad_norm": 2.655998706817627, + "learning_rate": 7.932909346071394e-07, + "loss": 0.9983, + "step": 70808 + }, + { + "epoch": 0.885147128678217, + "grad_norm": 5.305008411407471, + "learning_rate": 7.929503245548819e-07, + "loss": 0.5799, + "step": 70810 + }, + { + "epoch": 0.8851721293032326, + "grad_norm": 5.594204902648926, + "learning_rate": 7.926097846222136e-07, + "loss": 0.4719, + "step": 70812 + }, + { + "epoch": 0.8851971299282482, + "grad_norm": 1.5619029998779297, + "learning_rate": 7.922693148117267e-07, + "loss": 0.8992, + "step": 70814 + }, + { + "epoch": 0.8852221305532638, + "grad_norm": 1.946219801902771, + "learning_rate": 7.919289151260157e-07, + "loss": 0.9045, + "step": 70816 + }, + { + "epoch": 0.8852471311782795, + "grad_norm": 2.053166627883911, + "learning_rate": 7.915885855676719e-07, + "loss": 0.0764, + "step": 70818 + }, + { + "epoch": 0.8852721318032951, + "grad_norm": 3.166308641433716, + "learning_rate": 7.912483261392867e-07, + "loss": 1.2705, + "step": 70820 + }, + { + "epoch": 0.8852971324283108, + "grad_norm": 6.479848384857178, + "learning_rate": 7.909081368434513e-07, + "loss": 1.4165, + "step": 70822 + }, + { + "epoch": 0.8853221330533263, + "grad_norm": 4.042604446411133, + "learning_rate": 7.905680176827569e-07, + "loss": 0.7525, + "step": 70824 + }, + { + "epoch": 0.885347133678342, + "grad_norm": 2.3412325382232666, + "learning_rate": 7.902279686597958e-07, + "loss": 1.0018, + "step": 70826 + }, + { + "epoch": 0.8853721343033576, + "grad_norm": 0.00032555891084484756, + "learning_rate": 7.898879897771539e-07, + "loss": 0.561, + "step": 70828 + }, + { + "epoch": 0.8853971349283732, + "grad_norm": 4.4906206130981445, + "learning_rate": 7.895480810374245e-07, + "loss": 1.4229, + "step": 70830 + }, + { + "epoch": 0.8854221355533889, + "grad_norm": 3.029869556427002, + "learning_rate": 7.892082424431946e-07, + "loss": 1.1207, + "step": 70832 + }, + { + "epoch": 0.8854471361784044, + "grad_norm": 2.4894421100616455, + "learning_rate": 7.888684739970531e-07, + "loss": 0.5523, + "step": 70834 + }, + { + "epoch": 0.8854721368034201, + "grad_norm": 1.065468668937683, + "learning_rate": 7.885287757015836e-07, + "loss": 0.8836, + "step": 70836 + }, + { + "epoch": 0.8854971374284357, + "grad_norm": 2.359901189804077, + "learning_rate": 7.881891475593772e-07, + "loss": 0.4741, + "step": 70838 + }, + { + "epoch": 0.8855221380534514, + "grad_norm": 5.42970085144043, + "learning_rate": 7.87849589573021e-07, + "loss": 0.5188, + "step": 70840 + }, + { + "epoch": 0.885547138678467, + "grad_norm": 2.9660747051239014, + "learning_rate": 7.875101017450981e-07, + "loss": 1.2947, + "step": 70842 + }, + { + "epoch": 0.8855721393034826, + "grad_norm": 4.957876682281494, + "learning_rate": 7.87170684078199e-07, + "loss": 1.4462, + "step": 70844 + }, + { + "epoch": 0.8855971399284982, + "grad_norm": 0.7237566709518433, + "learning_rate": 7.868313365749014e-07, + "loss": 0.7557, + "step": 70846 + }, + { + "epoch": 0.8856221405535138, + "grad_norm": 12.029375076293945, + "learning_rate": 7.864920592377934e-07, + "loss": 1.3326, + "step": 70848 + }, + { + "epoch": 0.8856471411785295, + "grad_norm": 2.699718475341797, + "learning_rate": 7.861528520694606e-07, + "loss": 1.2034, + "step": 70850 + }, + { + "epoch": 0.8856721418035451, + "grad_norm": 0.0009191320859827101, + "learning_rate": 7.858137150724832e-07, + "loss": 1.3071, + "step": 70852 + }, + { + "epoch": 0.8856971424285607, + "grad_norm": 0.00022303291189018637, + "learning_rate": 7.85474648249448e-07, + "loss": 0.1339, + "step": 70854 + }, + { + "epoch": 0.8857221430535763, + "grad_norm": 3.694336414337158, + "learning_rate": 7.851356516029318e-07, + "loss": 1.6776, + "step": 70856 + }, + { + "epoch": 0.885747143678592, + "grad_norm": 3.220951557159424, + "learning_rate": 7.847967251355238e-07, + "loss": 1.0559, + "step": 70858 + }, + { + "epoch": 0.8857721443036076, + "grad_norm": 0.9245240688323975, + "learning_rate": 7.844578688497973e-07, + "loss": 0.4572, + "step": 70860 + }, + { + "epoch": 0.8857971449286233, + "grad_norm": 3.1512274742126465, + "learning_rate": 7.841190827483358e-07, + "loss": 1.1796, + "step": 70862 + }, + { + "epoch": 0.8858221455536388, + "grad_norm": 2.475101947784424, + "learning_rate": 7.837803668337219e-07, + "loss": 0.7489, + "step": 70864 + }, + { + "epoch": 0.8858471461786545, + "grad_norm": 0.0003769448958337307, + "learning_rate": 7.834417211085322e-07, + "loss": 0.6681, + "step": 70866 + }, + { + "epoch": 0.8858721468036701, + "grad_norm": 0.00028361185104586184, + "learning_rate": 7.831031455753491e-07, + "loss": 0.3635, + "step": 70868 + }, + { + "epoch": 0.8858971474286857, + "grad_norm": 4.175540924072266, + "learning_rate": 7.827646402367484e-07, + "loss": 1.5673, + "step": 70870 + }, + { + "epoch": 0.8859221480537014, + "grad_norm": 4.659206390380859, + "learning_rate": 7.824262050953069e-07, + "loss": 0.8274, + "step": 70872 + }, + { + "epoch": 0.8859471486787169, + "grad_norm": 2.681835651397705, + "learning_rate": 7.820878401536059e-07, + "loss": 1.068, + "step": 70874 + }, + { + "epoch": 0.8859721493037326, + "grad_norm": 7.517820358276367, + "learning_rate": 7.817495454142188e-07, + "loss": 0.4394, + "step": 70876 + }, + { + "epoch": 0.8859971499287482, + "grad_norm": 0.001897703972645104, + "learning_rate": 7.814113208797247e-07, + "loss": 0.5339, + "step": 70878 + }, + { + "epoch": 0.8860221505537639, + "grad_norm": 6.179091930389404, + "learning_rate": 7.810731665526971e-07, + "loss": 1.9359, + "step": 70880 + }, + { + "epoch": 0.8860471511787795, + "grad_norm": 0.00028925028163939714, + "learning_rate": 7.807350824357151e-07, + "loss": 0.5322, + "step": 70882 + }, + { + "epoch": 0.8860721518037951, + "grad_norm": 6.568845272064209, + "learning_rate": 7.8039706853135e-07, + "loss": 1.0552, + "step": 70884 + }, + { + "epoch": 0.8860971524288107, + "grad_norm": 0.00020901740936096758, + "learning_rate": 7.800591248421751e-07, + "loss": 0.6906, + "step": 70886 + }, + { + "epoch": 0.8861221530538264, + "grad_norm": 0.9350041151046753, + "learning_rate": 7.797212513707686e-07, + "loss": 1.0422, + "step": 70888 + }, + { + "epoch": 0.886147153678842, + "grad_norm": 3.0255918502807617, + "learning_rate": 7.793834481196993e-07, + "loss": 1.7708, + "step": 70890 + }, + { + "epoch": 0.8861721543038577, + "grad_norm": 0.7535623908042908, + "learning_rate": 7.790457150915432e-07, + "loss": 0.0237, + "step": 70892 + }, + { + "epoch": 0.8861971549288732, + "grad_norm": 0.5909402370452881, + "learning_rate": 7.78708052288869e-07, + "loss": 0.0933, + "step": 70894 + }, + { + "epoch": 0.8862221555538888, + "grad_norm": 6.512519836425781, + "learning_rate": 7.783704597142549e-07, + "loss": 0.6601, + "step": 70896 + }, + { + "epoch": 0.8862471561789045, + "grad_norm": 0.00019510556012392044, + "learning_rate": 7.780329373702622e-07, + "loss": 1.5837, + "step": 70898 + }, + { + "epoch": 0.8862721568039201, + "grad_norm": 0.00037675429484806955, + "learning_rate": 7.776954852594676e-07, + "loss": 0.8617, + "step": 70900 + }, + { + "epoch": 0.8862971574289358, + "grad_norm": 6.816939353942871, + "learning_rate": 7.773581033844413e-07, + "loss": 0.8477, + "step": 70902 + }, + { + "epoch": 0.8863221580539513, + "grad_norm": 4.259542942047119, + "learning_rate": 7.770207917477502e-07, + "loss": 1.1574, + "step": 70904 + }, + { + "epoch": 0.886347158678967, + "grad_norm": 3.0415329933166504, + "learning_rate": 7.766835503519665e-07, + "loss": 1.7869, + "step": 70906 + }, + { + "epoch": 0.8863721593039826, + "grad_norm": 3.0072174072265625, + "learning_rate": 7.763463791996539e-07, + "loss": 0.3304, + "step": 70908 + }, + { + "epoch": 0.8863971599289983, + "grad_norm": 0.2722390592098236, + "learning_rate": 7.760092782933859e-07, + "loss": 0.0316, + "step": 70910 + }, + { + "epoch": 0.8864221605540139, + "grad_norm": 0.5588765144348145, + "learning_rate": 7.756722476357259e-07, + "loss": 1.4294, + "step": 70912 + }, + { + "epoch": 0.8864471611790294, + "grad_norm": 4.884261608123779, + "learning_rate": 7.753352872292396e-07, + "loss": 1.2639, + "step": 70914 + }, + { + "epoch": 0.8864721618040451, + "grad_norm": 5.608513355255127, + "learning_rate": 7.749983970764973e-07, + "loss": 0.4716, + "step": 70916 + }, + { + "epoch": 0.8864971624290607, + "grad_norm": 5.757101058959961, + "learning_rate": 7.746615771800614e-07, + "loss": 0.5166, + "step": 70918 + }, + { + "epoch": 0.8865221630540764, + "grad_norm": 5.32585334777832, + "learning_rate": 7.743248275424997e-07, + "loss": 1.5936, + "step": 70920 + }, + { + "epoch": 0.886547163679092, + "grad_norm": 2.91864013671875, + "learning_rate": 7.739881481663758e-07, + "loss": 1.5409, + "step": 70922 + }, + { + "epoch": 0.8865721643041076, + "grad_norm": 2.2582595348358154, + "learning_rate": 7.73651539054252e-07, + "loss": 0.5596, + "step": 70924 + }, + { + "epoch": 0.8865971649291232, + "grad_norm": 5.71399450302124, + "learning_rate": 7.733150002086942e-07, + "loss": 2.0042, + "step": 70926 + }, + { + "epoch": 0.8866221655541389, + "grad_norm": 2.3410470485687256, + "learning_rate": 7.729785316322635e-07, + "loss": 0.6802, + "step": 70928 + }, + { + "epoch": 0.8866471661791545, + "grad_norm": 3.878277063369751, + "learning_rate": 7.726421333275247e-07, + "loss": 1.2858, + "step": 70930 + }, + { + "epoch": 0.8866721668041702, + "grad_norm": 2.7742695808410645, + "learning_rate": 7.723058052970378e-07, + "loss": 0.529, + "step": 70932 + }, + { + "epoch": 0.8866971674291857, + "grad_norm": 4.017216682434082, + "learning_rate": 7.719695475433653e-07, + "loss": 1.092, + "step": 70934 + }, + { + "epoch": 0.8867221680542013, + "grad_norm": 3.633385181427002, + "learning_rate": 7.716333600690684e-07, + "loss": 0.4448, + "step": 70936 + }, + { + "epoch": 0.886747168679217, + "grad_norm": 0.012587357312440872, + "learning_rate": 7.712972428767051e-07, + "loss": 0.0873, + "step": 70938 + }, + { + "epoch": 0.8867721693042326, + "grad_norm": 2.8811447620391846, + "learning_rate": 7.70961195968839e-07, + "loss": 0.1492, + "step": 70940 + }, + { + "epoch": 0.8867971699292483, + "grad_norm": 0.13039392232894897, + "learning_rate": 7.706252193480257e-07, + "loss": 0.0901, + "step": 70942 + }, + { + "epoch": 0.8868221705542638, + "grad_norm": 4.608048915863037, + "learning_rate": 7.702893130168266e-07, + "loss": 0.2519, + "step": 70944 + }, + { + "epoch": 0.8868471711792795, + "grad_norm": 2.037428855895996, + "learning_rate": 7.699534769777984e-07, + "loss": 1.3724, + "step": 70946 + }, + { + "epoch": 0.8868721718042951, + "grad_norm": 3.06550931930542, + "learning_rate": 7.696177112335002e-07, + "loss": 1.7131, + "step": 70948 + }, + { + "epoch": 0.8868971724293108, + "grad_norm": 4.395516872406006, + "learning_rate": 7.692820157864877e-07, + "loss": 0.3395, + "step": 70950 + }, + { + "epoch": 0.8869221730543264, + "grad_norm": 0.00047935423208400607, + "learning_rate": 7.689463906393169e-07, + "loss": 0.3791, + "step": 70952 + }, + { + "epoch": 0.886947173679342, + "grad_norm": 0.00025959950289689004, + "learning_rate": 7.686108357945465e-07, + "loss": 0.47, + "step": 70954 + }, + { + "epoch": 0.8869721743043576, + "grad_norm": 1.6843550205230713, + "learning_rate": 7.682753512547291e-07, + "loss": 0.0934, + "step": 70956 + }, + { + "epoch": 0.8869971749293732, + "grad_norm": 3.8940398693084717, + "learning_rate": 7.679399370224228e-07, + "loss": 1.0634, + "step": 70958 + }, + { + "epoch": 0.8870221755543889, + "grad_norm": 2.2720377445220947, + "learning_rate": 7.676045931001786e-07, + "loss": 0.3893, + "step": 70960 + }, + { + "epoch": 0.8870471761794045, + "grad_norm": 2.311544418334961, + "learning_rate": 7.672693194905533e-07, + "loss": 0.3546, + "step": 70962 + }, + { + "epoch": 0.8870721768044201, + "grad_norm": 4.534090995788574, + "learning_rate": 7.669341161960997e-07, + "loss": 0.9903, + "step": 70964 + }, + { + "epoch": 0.8870971774294357, + "grad_norm": 0.0003353865467943251, + "learning_rate": 7.665989832193687e-07, + "loss": 0.8456, + "step": 70966 + }, + { + "epoch": 0.8871221780544514, + "grad_norm": 0.0002007970178965479, + "learning_rate": 7.662639205629164e-07, + "loss": 0.807, + "step": 70968 + }, + { + "epoch": 0.887147178679467, + "grad_norm": 5.459753513336182, + "learning_rate": 7.659289282292892e-07, + "loss": 1.4692, + "step": 70970 + }, + { + "epoch": 0.8871721793044827, + "grad_norm": 3.5811517238616943, + "learning_rate": 7.655940062210442e-07, + "loss": 0.5229, + "step": 70972 + }, + { + "epoch": 0.8871971799294982, + "grad_norm": 2.115126132965088, + "learning_rate": 7.652591545407295e-07, + "loss": 0.5311, + "step": 70974 + }, + { + "epoch": 0.8872221805545139, + "grad_norm": 1.9601502418518066, + "learning_rate": 7.649243731908928e-07, + "loss": 0.1935, + "step": 70976 + }, + { + "epoch": 0.8872471811795295, + "grad_norm": 2.424787759780884, + "learning_rate": 7.645896621740879e-07, + "loss": 1.2345, + "step": 70978 + }, + { + "epoch": 0.8872721818045451, + "grad_norm": 7.234478950500488, + "learning_rate": 7.642550214928601e-07, + "loss": 1.6956, + "step": 70980 + }, + { + "epoch": 0.8872971824295608, + "grad_norm": 2.976186752319336, + "learning_rate": 7.639204511497633e-07, + "loss": 0.3121, + "step": 70982 + }, + { + "epoch": 0.8873221830545763, + "grad_norm": 3.1779308319091797, + "learning_rate": 7.635859511473387e-07, + "loss": 1.3336, + "step": 70984 + }, + { + "epoch": 0.887347183679592, + "grad_norm": 2.964198350906372, + "learning_rate": 7.632515214881398e-07, + "loss": 1.4142, + "step": 70986 + }, + { + "epoch": 0.8873721843046076, + "grad_norm": 0.00034435137058608234, + "learning_rate": 7.629171621747112e-07, + "loss": 1.2817, + "step": 70988 + }, + { + "epoch": 0.8873971849296233, + "grad_norm": 2.5792016983032227, + "learning_rate": 7.625828732095975e-07, + "loss": 0.4664, + "step": 70990 + }, + { + "epoch": 0.8874221855546389, + "grad_norm": 1.320599913597107, + "learning_rate": 7.622486545953478e-07, + "loss": 0.3372, + "step": 70992 + }, + { + "epoch": 0.8874471861796545, + "grad_norm": 2.5933899879455566, + "learning_rate": 7.619145063345046e-07, + "loss": 0.8926, + "step": 70994 + }, + { + "epoch": 0.8874721868046701, + "grad_norm": 5.001857757568359, + "learning_rate": 7.615804284296157e-07, + "loss": 0.6681, + "step": 70996 + }, + { + "epoch": 0.8874971874296858, + "grad_norm": 0.007437755353748798, + "learning_rate": 7.612464208832227e-07, + "loss": 0.0002, + "step": 70998 + }, + { + "epoch": 0.8875221880547014, + "grad_norm": 3.1304092407226562, + "learning_rate": 7.609124836978721e-07, + "loss": 1.0153, + "step": 71000 + }, + { + "epoch": 0.887547188679717, + "grad_norm": 2.3073418140411377, + "learning_rate": 7.605786168761042e-07, + "loss": 0.9014, + "step": 71002 + }, + { + "epoch": 0.8875721893047326, + "grad_norm": 3.140596389770508, + "learning_rate": 7.602448204204627e-07, + "loss": 0.4715, + "step": 71004 + }, + { + "epoch": 0.8875971899297482, + "grad_norm": 4.065768241882324, + "learning_rate": 7.599110943334909e-07, + "loss": 1.2068, + "step": 71006 + }, + { + "epoch": 0.8876221905547639, + "grad_norm": 2.9815938472747803, + "learning_rate": 7.595774386177279e-07, + "loss": 0.4103, + "step": 71008 + }, + { + "epoch": 0.8876471911797795, + "grad_norm": 0.0004500711220316589, + "learning_rate": 7.592438532757185e-07, + "loss": 1.0774, + "step": 71010 + }, + { + "epoch": 0.8876721918047952, + "grad_norm": 4.040605545043945, + "learning_rate": 7.589103383099994e-07, + "loss": 1.103, + "step": 71012 + }, + { + "epoch": 0.8876971924298107, + "grad_norm": 4.123427867889404, + "learning_rate": 7.585768937231142e-07, + "loss": 0.7393, + "step": 71014 + }, + { + "epoch": 0.8877221930548264, + "grad_norm": 2.835245370864868, + "learning_rate": 7.582435195176008e-07, + "loss": 0.6623, + "step": 71016 + }, + { + "epoch": 0.887747193679842, + "grad_norm": 8.669523239135742, + "learning_rate": 7.57910215695995e-07, + "loss": 0.7486, + "step": 71018 + }, + { + "epoch": 0.8877721943048577, + "grad_norm": 3.4067704677581787, + "learning_rate": 7.575769822608403e-07, + "loss": 1.1324, + "step": 71020 + }, + { + "epoch": 0.8877971949298733, + "grad_norm": 0.0070967706851661205, + "learning_rate": 7.572438192146713e-07, + "loss": 0.6801, + "step": 71022 + }, + { + "epoch": 0.8878221955548888, + "grad_norm": 2.9804227352142334, + "learning_rate": 7.569107265600273e-07, + "loss": 0.6964, + "step": 71024 + }, + { + "epoch": 0.8878471961799045, + "grad_norm": 2.1809990406036377, + "learning_rate": 7.565777042994437e-07, + "loss": 0.0384, + "step": 71026 + }, + { + "epoch": 0.8878721968049201, + "grad_norm": 0.00997749250382185, + "learning_rate": 7.562447524354565e-07, + "loss": 0.3389, + "step": 71028 + }, + { + "epoch": 0.8878971974299358, + "grad_norm": 3.352438449859619, + "learning_rate": 7.559118709706026e-07, + "loss": 0.9282, + "step": 71030 + }, + { + "epoch": 0.8879221980549514, + "grad_norm": 3.9613969326019287, + "learning_rate": 7.555790599074164e-07, + "loss": 0.7551, + "step": 71032 + }, + { + "epoch": 0.887947198679967, + "grad_norm": 2.6211729049682617, + "learning_rate": 7.552463192484338e-07, + "loss": 1.2451, + "step": 71034 + }, + { + "epoch": 0.8879721993049826, + "grad_norm": 1.0472438335418701, + "learning_rate": 7.549136489961862e-07, + "loss": 1.1908, + "step": 71036 + }, + { + "epoch": 0.8879971999299983, + "grad_norm": 4.218872547149658, + "learning_rate": 7.545810491532102e-07, + "loss": 0.4225, + "step": 71038 + }, + { + "epoch": 0.8880222005550139, + "grad_norm": 0.00051783089293167, + "learning_rate": 7.542485197220384e-07, + "loss": 0.7886, + "step": 71040 + }, + { + "epoch": 0.8880472011800296, + "grad_norm": 2.381575584411621, + "learning_rate": 7.539160607051998e-07, + "loss": 0.5007, + "step": 71042 + }, + { + "epoch": 0.8880722018050451, + "grad_norm": 2.346914768218994, + "learning_rate": 7.535836721052303e-07, + "loss": 0.4059, + "step": 71044 + }, + { + "epoch": 0.8880972024300607, + "grad_norm": 5.777833461761475, + "learning_rate": 7.5325135392466e-07, + "loss": 1.42, + "step": 71046 + }, + { + "epoch": 0.8881222030550764, + "grad_norm": 2.4521420001983643, + "learning_rate": 7.529191061660201e-07, + "loss": 0.5271, + "step": 71048 + }, + { + "epoch": 0.888147203680092, + "grad_norm": 2.951835870742798, + "learning_rate": 7.525869288318388e-07, + "loss": 1.3971, + "step": 71050 + }, + { + "epoch": 0.8881722043051077, + "grad_norm": 3.729607343673706, + "learning_rate": 7.522548219246506e-07, + "loss": 1.2732, + "step": 71052 + }, + { + "epoch": 0.8881972049301232, + "grad_norm": 5.9094061851501465, + "learning_rate": 7.519227854469813e-07, + "loss": 1.7669, + "step": 71054 + }, + { + "epoch": 0.8882222055551389, + "grad_norm": 4.6009440422058105, + "learning_rate": 7.515908194013588e-07, + "loss": 0.847, + "step": 71056 + }, + { + "epoch": 0.8882472061801545, + "grad_norm": 5.6682963371276855, + "learning_rate": 7.512589237903134e-07, + "loss": 2.4609, + "step": 71058 + }, + { + "epoch": 0.8882722068051702, + "grad_norm": 3.8893775939941406, + "learning_rate": 7.509270986163719e-07, + "loss": 1.4501, + "step": 71060 + }, + { + "epoch": 0.8882972074301858, + "grad_norm": 2.8810434341430664, + "learning_rate": 7.505953438820623e-07, + "loss": 1.0334, + "step": 71062 + }, + { + "epoch": 0.8883222080552013, + "grad_norm": 6.612236499786377, + "learning_rate": 7.502636595899094e-07, + "loss": 2.0139, + "step": 71064 + }, + { + "epoch": 0.888347208680217, + "grad_norm": 3.65621018409729, + "learning_rate": 7.49932045742442e-07, + "loss": 1.3407, + "step": 71066 + }, + { + "epoch": 0.8883722093052326, + "grad_norm": 6.389959812164307, + "learning_rate": 7.496005023421848e-07, + "loss": 1.3516, + "step": 71068 + }, + { + "epoch": 0.8883972099302483, + "grad_norm": 3.102027654647827, + "learning_rate": 7.492690293916593e-07, + "loss": 1.6467, + "step": 71070 + }, + { + "epoch": 0.8884222105552639, + "grad_norm": 0.0013556934427469969, + "learning_rate": 7.489376268933946e-07, + "loss": 0.8275, + "step": 71072 + }, + { + "epoch": 0.8884472111802795, + "grad_norm": 1.2517775297164917, + "learning_rate": 7.486062948499118e-07, + "loss": 1.0235, + "step": 71074 + }, + { + "epoch": 0.8884722118052951, + "grad_norm": 4.3158650398254395, + "learning_rate": 7.482750332637356e-07, + "loss": 2.0669, + "step": 71076 + }, + { + "epoch": 0.8884972124303108, + "grad_norm": 3.7426459789276123, + "learning_rate": 7.479438421373874e-07, + "loss": 1.3039, + "step": 71078 + }, + { + "epoch": 0.8885222130553264, + "grad_norm": 2.853623628616333, + "learning_rate": 7.476127214733919e-07, + "loss": 0.4749, + "step": 71080 + }, + { + "epoch": 0.8885472136803421, + "grad_norm": 0.12235385179519653, + "learning_rate": 7.472816712742704e-07, + "loss": 0.0026, + "step": 71082 + }, + { + "epoch": 0.8885722143053576, + "grad_norm": 4.029816150665283, + "learning_rate": 7.469506915425407e-07, + "loss": 2.0185, + "step": 71084 + }, + { + "epoch": 0.8885972149303732, + "grad_norm": 5.464600086212158, + "learning_rate": 7.466197822807286e-07, + "loss": 1.0578, + "step": 71086 + }, + { + "epoch": 0.8886222155553889, + "grad_norm": 2.412376880645752, + "learning_rate": 7.46288943491349e-07, + "loss": 0.9132, + "step": 71088 + }, + { + "epoch": 0.8886472161804045, + "grad_norm": 4.153970241546631, + "learning_rate": 7.459581751769263e-07, + "loss": 0.9884, + "step": 71090 + }, + { + "epoch": 0.8886722168054202, + "grad_norm": 3.1238527297973633, + "learning_rate": 7.456274773399775e-07, + "loss": 0.592, + "step": 71092 + }, + { + "epoch": 0.8886972174304357, + "grad_norm": 2.2150840759277344, + "learning_rate": 7.452968499830193e-07, + "loss": 1.1015, + "step": 71094 + }, + { + "epoch": 0.8887222180554514, + "grad_norm": 3.820746898651123, + "learning_rate": 7.449662931085744e-07, + "loss": 1.0133, + "step": 71096 + }, + { + "epoch": 0.888747218680467, + "grad_norm": 0.0006765251164324582, + "learning_rate": 7.44635806719154e-07, + "loss": 0.0308, + "step": 71098 + }, + { + "epoch": 0.8887722193054827, + "grad_norm": 2.4555866718292236, + "learning_rate": 7.443053908172815e-07, + "loss": 0.6167, + "step": 71100 + }, + { + "epoch": 0.8887972199304983, + "grad_norm": 7.436548709869385, + "learning_rate": 7.439750454054684e-07, + "loss": 1.9249, + "step": 71102 + }, + { + "epoch": 0.8888222205555139, + "grad_norm": 4.114525318145752, + "learning_rate": 7.436447704862315e-07, + "loss": 1.4304, + "step": 71104 + }, + { + "epoch": 0.8888472211805295, + "grad_norm": 4.533778667449951, + "learning_rate": 7.433145660620911e-07, + "loss": 0.9871, + "step": 71106 + }, + { + "epoch": 0.8888722218055451, + "grad_norm": 3.216588020324707, + "learning_rate": 7.42984432135554e-07, + "loss": 0.3177, + "step": 71108 + }, + { + "epoch": 0.8888972224305608, + "grad_norm": 1.7906310558319092, + "learning_rate": 7.426543687091414e-07, + "loss": 1.0933, + "step": 71110 + }, + { + "epoch": 0.8889222230555764, + "grad_norm": 0.8873746991157532, + "learning_rate": 7.423243757853615e-07, + "loss": 0.2475, + "step": 71112 + }, + { + "epoch": 0.888947223680592, + "grad_norm": 3.1225435733795166, + "learning_rate": 7.419944533667311e-07, + "loss": 0.7785, + "step": 71114 + }, + { + "epoch": 0.8889722243056076, + "grad_norm": 4.426878452301025, + "learning_rate": 7.416646014557604e-07, + "loss": 0.848, + "step": 71116 + }, + { + "epoch": 0.8889972249306233, + "grad_norm": 3.9227590560913086, + "learning_rate": 7.413348200549652e-07, + "loss": 1.6564, + "step": 71118 + }, + { + "epoch": 0.8890222255556389, + "grad_norm": 1.0018196105957031, + "learning_rate": 7.410051091668547e-07, + "loss": 0.1395, + "step": 71120 + }, + { + "epoch": 0.8890472261806546, + "grad_norm": 3.786210060119629, + "learning_rate": 7.406754687939377e-07, + "loss": 0.5652, + "step": 71122 + }, + { + "epoch": 0.8890722268056701, + "grad_norm": 4.432653427124023, + "learning_rate": 7.40345898938728e-07, + "loss": 2.6003, + "step": 71124 + }, + { + "epoch": 0.8890972274306858, + "grad_norm": 6.186622619628906, + "learning_rate": 7.400163996037346e-07, + "loss": 0.9976, + "step": 71126 + }, + { + "epoch": 0.8891222280557014, + "grad_norm": 0.003971456084400415, + "learning_rate": 7.396869707914655e-07, + "loss": 0.164, + "step": 71128 + }, + { + "epoch": 0.889147228680717, + "grad_norm": 0.5230004191398621, + "learning_rate": 7.393576125044333e-07, + "loss": 0.5528, + "step": 71130 + }, + { + "epoch": 0.8891722293057327, + "grad_norm": 3.8964760303497314, + "learning_rate": 7.390283247451435e-07, + "loss": 1.1185, + "step": 71132 + }, + { + "epoch": 0.8891972299307482, + "grad_norm": 2.596202850341797, + "learning_rate": 7.386991075161043e-07, + "loss": 0.9458, + "step": 71134 + }, + { + "epoch": 0.8892222305557639, + "grad_norm": 2.867766857147217, + "learning_rate": 7.383699608198214e-07, + "loss": 0.7584, + "step": 71136 + }, + { + "epoch": 0.8892472311807795, + "grad_norm": 2.1393258571624756, + "learning_rate": 7.38040884658805e-07, + "loss": 1.6083, + "step": 71138 + }, + { + "epoch": 0.8892722318057952, + "grad_norm": 2.8352715969085693, + "learning_rate": 7.377118790355575e-07, + "loss": 1.1856, + "step": 71140 + }, + { + "epoch": 0.8892972324308108, + "grad_norm": 4.251688003540039, + "learning_rate": 7.37382943952587e-07, + "loss": 1.1941, + "step": 71142 + }, + { + "epoch": 0.8893222330558264, + "grad_norm": 0.842711329460144, + "learning_rate": 7.370540794124004e-07, + "loss": 0.6618, + "step": 71144 + }, + { + "epoch": 0.889347233680842, + "grad_norm": 4.500119686126709, + "learning_rate": 7.367252854174978e-07, + "loss": 1.2657, + "step": 71146 + }, + { + "epoch": 0.8893722343058577, + "grad_norm": 4.190402507781982, + "learning_rate": 7.363965619703873e-07, + "loss": 0.5614, + "step": 71148 + }, + { + "epoch": 0.8893972349308733, + "grad_norm": 17.928691864013672, + "learning_rate": 7.360679090735679e-07, + "loss": 3.4727, + "step": 71150 + }, + { + "epoch": 0.889422235555889, + "grad_norm": 2.8036272525787354, + "learning_rate": 7.357393267295476e-07, + "loss": 1.424, + "step": 71152 + }, + { + "epoch": 0.8894472361809045, + "grad_norm": 7.022988796234131, + "learning_rate": 7.354108149408245e-07, + "loss": 1.6261, + "step": 71154 + }, + { + "epoch": 0.8894722368059201, + "grad_norm": 3.6207361221313477, + "learning_rate": 7.350823737099034e-07, + "loss": 1.2556, + "step": 71156 + }, + { + "epoch": 0.8894972374309358, + "grad_norm": 1.0629818439483643, + "learning_rate": 7.347540030392875e-07, + "loss": 0.1004, + "step": 71158 + }, + { + "epoch": 0.8895222380559514, + "grad_norm": 0.21043291687965393, + "learning_rate": 7.344257029314716e-07, + "loss": 0.0454, + "step": 71160 + }, + { + "epoch": 0.8895472386809671, + "grad_norm": 9.871406555175781, + "learning_rate": 7.340974733889605e-07, + "loss": 2.597, + "step": 71162 + }, + { + "epoch": 0.8895722393059826, + "grad_norm": 4.694386005401611, + "learning_rate": 7.33769314414251e-07, + "loss": 1.5393, + "step": 71164 + }, + { + "epoch": 0.8895972399309983, + "grad_norm": 6.1183929443359375, + "learning_rate": 7.334412260098445e-07, + "loss": 0.7363, + "step": 71166 + }, + { + "epoch": 0.8896222405560139, + "grad_norm": 2.7248027324676514, + "learning_rate": 7.33113208178241e-07, + "loss": 0.6811, + "step": 71168 + }, + { + "epoch": 0.8896472411810296, + "grad_norm": 0.9234654307365417, + "learning_rate": 7.327852609219366e-07, + "loss": 1.567, + "step": 71170 + }, + { + "epoch": 0.8896722418060452, + "grad_norm": 2.9549150466918945, + "learning_rate": 7.324573842434291e-07, + "loss": 1.2859, + "step": 71172 + }, + { + "epoch": 0.8896972424310607, + "grad_norm": 2.566640853881836, + "learning_rate": 7.321295781452143e-07, + "loss": 1.2368, + "step": 71174 + }, + { + "epoch": 0.8897222430560764, + "grad_norm": 1.7753350734710693, + "learning_rate": 7.318018426297913e-07, + "loss": 0.1267, + "step": 71176 + }, + { + "epoch": 0.889747243681092, + "grad_norm": 4.660467147827148, + "learning_rate": 7.314741776996537e-07, + "loss": 1.4591, + "step": 71178 + }, + { + "epoch": 0.8897722443061077, + "grad_norm": 2.2233691215515137, + "learning_rate": 7.311465833572973e-07, + "loss": 2.2959, + "step": 71180 + }, + { + "epoch": 0.8897972449311233, + "grad_norm": 6.331672191619873, + "learning_rate": 7.3081905960522e-07, + "loss": 0.8853, + "step": 71182 + }, + { + "epoch": 0.8898222455561389, + "grad_norm": 3.0911881923675537, + "learning_rate": 7.304916064459144e-07, + "loss": 0.3624, + "step": 71184 + }, + { + "epoch": 0.8898472461811545, + "grad_norm": 4.288876533508301, + "learning_rate": 7.301642238818729e-07, + "loss": 1.2812, + "step": 71186 + }, + { + "epoch": 0.8898722468061702, + "grad_norm": 6.469064712524414, + "learning_rate": 7.29836911915589e-07, + "loss": 1.0222, + "step": 71188 + }, + { + "epoch": 0.8898972474311858, + "grad_norm": 2.7555124759674072, + "learning_rate": 7.295096705495553e-07, + "loss": 1.3138, + "step": 71190 + }, + { + "epoch": 0.8899222480562015, + "grad_norm": 3.3085527420043945, + "learning_rate": 7.291824997862673e-07, + "loss": 0.7791, + "step": 71192 + }, + { + "epoch": 0.889947248681217, + "grad_norm": 0.07266320288181305, + "learning_rate": 7.288553996282122e-07, + "loss": 0.0014, + "step": 71194 + }, + { + "epoch": 0.8899722493062326, + "grad_norm": 4.039715766906738, + "learning_rate": 7.285283700778877e-07, + "loss": 1.4818, + "step": 71196 + }, + { + "epoch": 0.8899972499312483, + "grad_norm": 3.8238046169281006, + "learning_rate": 7.282014111377755e-07, + "loss": 1.3629, + "step": 71198 + }, + { + "epoch": 0.8900222505562639, + "grad_norm": 1.1295841932296753, + "learning_rate": 7.278745228103723e-07, + "loss": 0.1368, + "step": 71200 + }, + { + "epoch": 0.8900472511812796, + "grad_norm": 4.391594409942627, + "learning_rate": 7.275477050981639e-07, + "loss": 0.9538, + "step": 71202 + }, + { + "epoch": 0.8900722518062951, + "grad_norm": 7.310844421386719, + "learning_rate": 7.272209580036416e-07, + "loss": 1.3886, + "step": 71204 + }, + { + "epoch": 0.8900972524313108, + "grad_norm": 2.262995958328247, + "learning_rate": 7.268942815292934e-07, + "loss": 0.5555, + "step": 71206 + }, + { + "epoch": 0.8901222530563264, + "grad_norm": 3.2928593158721924, + "learning_rate": 7.265676756776064e-07, + "loss": 1.3148, + "step": 71208 + }, + { + "epoch": 0.8901472536813421, + "grad_norm": 0.0006162587087601423, + "learning_rate": 7.262411404510716e-07, + "loss": 0.6768, + "step": 71210 + }, + { + "epoch": 0.8901722543063577, + "grad_norm": 6.5487799644470215, + "learning_rate": 7.259146758521696e-07, + "loss": 0.6039, + "step": 71212 + }, + { + "epoch": 0.8901972549313732, + "grad_norm": 2.7452621459960938, + "learning_rate": 7.255882818833915e-07, + "loss": 1.1714, + "step": 71214 + }, + { + "epoch": 0.8902222555563889, + "grad_norm": 4.122274875640869, + "learning_rate": 7.252619585472209e-07, + "loss": 2.195, + "step": 71216 + }, + { + "epoch": 0.8902472561814045, + "grad_norm": 4.0559282302856445, + "learning_rate": 7.249357058461426e-07, + "loss": 0.7277, + "step": 71218 + }, + { + "epoch": 0.8902722568064202, + "grad_norm": 0.8774189352989197, + "learning_rate": 7.246095237826445e-07, + "loss": 0.5546, + "step": 71220 + }, + { + "epoch": 0.8902972574314358, + "grad_norm": 1.3086142539978027, + "learning_rate": 7.24283412359208e-07, + "loss": 0.5586, + "step": 71222 + }, + { + "epoch": 0.8903222580564514, + "grad_norm": 3.313330888748169, + "learning_rate": 7.239573715783177e-07, + "loss": 0.7803, + "step": 71224 + }, + { + "epoch": 0.890347258681467, + "grad_norm": 0.0013226986629888415, + "learning_rate": 7.236314014424539e-07, + "loss": 0.0001, + "step": 71226 + }, + { + "epoch": 0.8903722593064827, + "grad_norm": 0.8583608269691467, + "learning_rate": 7.233055019541024e-07, + "loss": 0.0906, + "step": 71228 + }, + { + "epoch": 0.8903972599314983, + "grad_norm": 6.67000675201416, + "learning_rate": 7.229796731157456e-07, + "loss": 1.2672, + "step": 71230 + }, + { + "epoch": 0.890422260556514, + "grad_norm": 3.875993013381958, + "learning_rate": 7.226539149298628e-07, + "loss": 1.0705, + "step": 71232 + }, + { + "epoch": 0.8904472611815295, + "grad_norm": 0.0013965376419946551, + "learning_rate": 7.223282273989373e-07, + "loss": 0.6677, + "step": 71234 + }, + { + "epoch": 0.8904722618065452, + "grad_norm": 0.004097173921763897, + "learning_rate": 7.220026105254474e-07, + "loss": 0.8335, + "step": 71236 + }, + { + "epoch": 0.8904972624315608, + "grad_norm": 4.503860950469971, + "learning_rate": 7.216770643118743e-07, + "loss": 0.9136, + "step": 71238 + }, + { + "epoch": 0.8905222630565764, + "grad_norm": 4.228760719299316, + "learning_rate": 7.213515887606948e-07, + "loss": 0.52, + "step": 71240 + }, + { + "epoch": 0.8905472636815921, + "grad_norm": 1.97828209400177, + "learning_rate": 7.210261838743882e-07, + "loss": 0.8665, + "step": 71242 + }, + { + "epoch": 0.8905722643066076, + "grad_norm": 0.7800239324569702, + "learning_rate": 7.207008496554369e-07, + "loss": 0.6099, + "step": 71244 + }, + { + "epoch": 0.8905972649316233, + "grad_norm": 2.608616352081299, + "learning_rate": 7.203755861063145e-07, + "loss": 1.4187, + "step": 71246 + }, + { + "epoch": 0.8906222655566389, + "grad_norm": 2.9865448474884033, + "learning_rate": 7.200503932295022e-07, + "loss": 1.5475, + "step": 71248 + }, + { + "epoch": 0.8906472661816546, + "grad_norm": 6.083220958709717, + "learning_rate": 7.197252710274705e-07, + "loss": 0.4168, + "step": 71250 + }, + { + "epoch": 0.8906722668066702, + "grad_norm": 3.9043447971343994, + "learning_rate": 7.194002195026984e-07, + "loss": 0.8318, + "step": 71252 + }, + { + "epoch": 0.8906972674316858, + "grad_norm": 6.288630962371826, + "learning_rate": 7.190752386576638e-07, + "loss": 1.3119, + "step": 71254 + }, + { + "epoch": 0.8907222680567014, + "grad_norm": 0.0008720428450033069, + "learning_rate": 7.187503284948383e-07, + "loss": 0.4661, + "step": 71256 + }, + { + "epoch": 0.890747268681717, + "grad_norm": 3.2164809703826904, + "learning_rate": 7.184254890166986e-07, + "loss": 0.6897, + "step": 71258 + }, + { + "epoch": 0.8907722693067327, + "grad_norm": 3.032327890396118, + "learning_rate": 7.181007202257173e-07, + "loss": 0.806, + "step": 71260 + }, + { + "epoch": 0.8907972699317483, + "grad_norm": 8.459016799926758, + "learning_rate": 7.177760221243701e-07, + "loss": 2.2613, + "step": 71262 + }, + { + "epoch": 0.8908222705567639, + "grad_norm": 5.369602203369141, + "learning_rate": 7.174513947151263e-07, + "loss": 1.6521, + "step": 71264 + }, + { + "epoch": 0.8908472711817795, + "grad_norm": 9.225066184997559, + "learning_rate": 7.171268380004592e-07, + "loss": 0.8934, + "step": 71266 + }, + { + "epoch": 0.8908722718067952, + "grad_norm": 3.3741557598114014, + "learning_rate": 7.168023519828438e-07, + "loss": 0.8269, + "step": 71268 + }, + { + "epoch": 0.8908972724318108, + "grad_norm": 2.506885290145874, + "learning_rate": 7.164779366647467e-07, + "loss": 1.5577, + "step": 71270 + }, + { + "epoch": 0.8909222730568265, + "grad_norm": 3.8885486125946045, + "learning_rate": 7.161535920486429e-07, + "loss": 0.7688, + "step": 71272 + }, + { + "epoch": 0.890947273681842, + "grad_norm": 0.034861695021390915, + "learning_rate": 7.158293181370002e-07, + "loss": 0.3131, + "step": 71274 + }, + { + "epoch": 0.8909722743068577, + "grad_norm": 1.078256607055664, + "learning_rate": 7.155051149322878e-07, + "loss": 0.4894, + "step": 71276 + }, + { + "epoch": 0.8909972749318733, + "grad_norm": 3.0489556789398193, + "learning_rate": 7.151809824369749e-07, + "loss": 1.1914, + "step": 71278 + }, + { + "epoch": 0.891022275556889, + "grad_norm": 5.396022796630859, + "learning_rate": 7.148569206535305e-07, + "loss": 1.1493, + "step": 71280 + }, + { + "epoch": 0.8910472761819046, + "grad_norm": 6.009615421295166, + "learning_rate": 7.145329295844238e-07, + "loss": 1.654, + "step": 71282 + }, + { + "epoch": 0.8910722768069201, + "grad_norm": 3.3972103595733643, + "learning_rate": 7.142090092321207e-07, + "loss": 1.5687, + "step": 71284 + }, + { + "epoch": 0.8910972774319358, + "grad_norm": 5.611331462860107, + "learning_rate": 7.138851595990893e-07, + "loss": 0.7495, + "step": 71286 + }, + { + "epoch": 0.8911222780569514, + "grad_norm": 6.770740509033203, + "learning_rate": 7.135613806877962e-07, + "loss": 0.4905, + "step": 71288 + }, + { + "epoch": 0.8911472786819671, + "grad_norm": 3.1189627647399902, + "learning_rate": 7.13237672500704e-07, + "loss": 1.2279, + "step": 71290 + }, + { + "epoch": 0.8911722793069827, + "grad_norm": 6.078277587890625, + "learning_rate": 7.129140350402819e-07, + "loss": 1.8481, + "step": 71292 + }, + { + "epoch": 0.8911972799319983, + "grad_norm": 3.490460157394409, + "learning_rate": 7.125904683089923e-07, + "loss": 1.0623, + "step": 71294 + }, + { + "epoch": 0.8912222805570139, + "grad_norm": 5.086845397949219, + "learning_rate": 7.122669723093012e-07, + "loss": 0.4729, + "step": 71296 + }, + { + "epoch": 0.8912472811820296, + "grad_norm": 3.3782970905303955, + "learning_rate": 7.119435470436697e-07, + "loss": 0.3888, + "step": 71298 + }, + { + "epoch": 0.8912722818070452, + "grad_norm": 3.949127435684204, + "learning_rate": 7.11620192514565e-07, + "loss": 1.0006, + "step": 71300 + }, + { + "epoch": 0.8912972824320609, + "grad_norm": 3.3506646156311035, + "learning_rate": 7.11296908724447e-07, + "loss": 0.7493, + "step": 71302 + }, + { + "epoch": 0.8913222830570764, + "grad_norm": 0.3482532501220703, + "learning_rate": 7.109736956757773e-07, + "loss": 0.0147, + "step": 71304 + }, + { + "epoch": 0.891347283682092, + "grad_norm": 4.17694616317749, + "learning_rate": 7.106505533710195e-07, + "loss": 1.389, + "step": 71306 + }, + { + "epoch": 0.8913722843071077, + "grad_norm": 6.306843280792236, + "learning_rate": 7.103274818126316e-07, + "loss": 1.3252, + "step": 71308 + }, + { + "epoch": 0.8913972849321233, + "grad_norm": 2.766981601715088, + "learning_rate": 7.100044810030782e-07, + "loss": 0.7366, + "step": 71310 + }, + { + "epoch": 0.891422285557139, + "grad_norm": 3.078773021697998, + "learning_rate": 7.096815509448151e-07, + "loss": 0.2425, + "step": 71312 + }, + { + "epoch": 0.8914472861821545, + "grad_norm": 6.390204906463623, + "learning_rate": 7.09358691640305e-07, + "loss": 0.3428, + "step": 71314 + }, + { + "epoch": 0.8914722868071702, + "grad_norm": 4.594175338745117, + "learning_rate": 7.090359030920046e-07, + "loss": 1.1841, + "step": 71316 + }, + { + "epoch": 0.8914972874321858, + "grad_norm": 3.317911148071289, + "learning_rate": 7.087131853023721e-07, + "loss": 0.6887, + "step": 71318 + }, + { + "epoch": 0.8915222880572015, + "grad_norm": 0.2832058370113373, + "learning_rate": 7.083905382738676e-07, + "loss": 0.012, + "step": 71320 + }, + { + "epoch": 0.8915472886822171, + "grad_norm": 6.427731513977051, + "learning_rate": 7.080679620089448e-07, + "loss": 1.1527, + "step": 71322 + }, + { + "epoch": 0.8915722893072326, + "grad_norm": 2.481827974319458, + "learning_rate": 7.07745456510065e-07, + "loss": 1.0693, + "step": 71324 + }, + { + "epoch": 0.8915972899322483, + "grad_norm": 0.004890932235866785, + "learning_rate": 7.074230217796784e-07, + "loss": 1.0526, + "step": 71326 + }, + { + "epoch": 0.8916222905572639, + "grad_norm": 0.00023346695525106043, + "learning_rate": 7.071006578202466e-07, + "loss": 0.7035, + "step": 71328 + }, + { + "epoch": 0.8916472911822796, + "grad_norm": 3.3676445484161377, + "learning_rate": 7.06778364634223e-07, + "loss": 2.0325, + "step": 71330 + }, + { + "epoch": 0.8916722918072952, + "grad_norm": 0.9426450729370117, + "learning_rate": 7.064561422240579e-07, + "loss": 0.2124, + "step": 71332 + }, + { + "epoch": 0.8916972924323108, + "grad_norm": 3.865973949432373, + "learning_rate": 7.061339905922115e-07, + "loss": 0.8825, + "step": 71334 + }, + { + "epoch": 0.8917222930573264, + "grad_norm": 2.2019429206848145, + "learning_rate": 7.05811909741132e-07, + "loss": 0.399, + "step": 71336 + }, + { + "epoch": 0.8917472936823421, + "grad_norm": 1.3378100395202637, + "learning_rate": 7.054898996732762e-07, + "loss": 0.0305, + "step": 71338 + }, + { + "epoch": 0.8917722943073577, + "grad_norm": 3.995513916015625, + "learning_rate": 7.051679603910955e-07, + "loss": 0.7932, + "step": 71340 + }, + { + "epoch": 0.8917972949323734, + "grad_norm": 5.639397621154785, + "learning_rate": 7.048460918970401e-07, + "loss": 1.3322, + "step": 71342 + }, + { + "epoch": 0.8918222955573889, + "grad_norm": 3.427562713623047, + "learning_rate": 7.045242941935626e-07, + "loss": 0.2385, + "step": 71344 + }, + { + "epoch": 0.8918472961824045, + "grad_norm": 0.3521572947502136, + "learning_rate": 7.042025672831132e-07, + "loss": 0.4933, + "step": 71346 + }, + { + "epoch": 0.8918722968074202, + "grad_norm": 4.9398980140686035, + "learning_rate": 7.038809111681433e-07, + "loss": 0.9715, + "step": 71348 + }, + { + "epoch": 0.8918972974324358, + "grad_norm": 2.397125482559204, + "learning_rate": 7.035593258511009e-07, + "loss": 0.4382, + "step": 71350 + }, + { + "epoch": 0.8919222980574515, + "grad_norm": 3.9839096069335938, + "learning_rate": 7.032378113344363e-07, + "loss": 1.2475, + "step": 71352 + }, + { + "epoch": 0.891947298682467, + "grad_norm": 0.1141972616314888, + "learning_rate": 7.029163676205997e-07, + "loss": 0.1862, + "step": 71354 + }, + { + "epoch": 0.8919722993074827, + "grad_norm": 0.0035244536120444536, + "learning_rate": 7.025949947120347e-07, + "loss": 0.1554, + "step": 71356 + }, + { + "epoch": 0.8919972999324983, + "grad_norm": 3.3905560970306396, + "learning_rate": 7.022736926111928e-07, + "loss": 1.0861, + "step": 71358 + }, + { + "epoch": 0.892022300557514, + "grad_norm": 2.213117837905884, + "learning_rate": 7.019524613205187e-07, + "loss": 0.7332, + "step": 71360 + }, + { + "epoch": 0.8920473011825296, + "grad_norm": 2.8037543296813965, + "learning_rate": 7.016313008424603e-07, + "loss": 1.2949, + "step": 71362 + }, + { + "epoch": 0.8920723018075452, + "grad_norm": 3.994805097579956, + "learning_rate": 7.013102111794623e-07, + "loss": 0.6831, + "step": 71364 + }, + { + "epoch": 0.8920973024325608, + "grad_norm": 3.406360626220703, + "learning_rate": 7.009891923339729e-07, + "loss": 0.1999, + "step": 71366 + }, + { + "epoch": 0.8921223030575764, + "grad_norm": 15.020923614501953, + "learning_rate": 7.006682443084334e-07, + "loss": 1.8868, + "step": 71368 + }, + { + "epoch": 0.8921473036825921, + "grad_norm": 0.00030595072894357145, + "learning_rate": 7.003473671052885e-07, + "loss": 0.4091, + "step": 71370 + }, + { + "epoch": 0.8921723043076077, + "grad_norm": 4.381083011627197, + "learning_rate": 7.000265607269852e-07, + "loss": 0.7656, + "step": 71372 + }, + { + "epoch": 0.8921973049326233, + "grad_norm": 3.6456027030944824, + "learning_rate": 6.997058251759625e-07, + "loss": 1.2055, + "step": 71374 + }, + { + "epoch": 0.8922223055576389, + "grad_norm": 10.501198768615723, + "learning_rate": 6.993851604546664e-07, + "loss": 1.021, + "step": 71376 + }, + { + "epoch": 0.8922473061826546, + "grad_norm": 5.8390021324157715, + "learning_rate": 6.99064566565536e-07, + "loss": 1.0942, + "step": 71378 + }, + { + "epoch": 0.8922723068076702, + "grad_norm": 3.095903158187866, + "learning_rate": 6.987440435110171e-07, + "loss": 1.0829, + "step": 71380 + }, + { + "epoch": 0.8922973074326859, + "grad_norm": 3.4054782390594482, + "learning_rate": 6.984235912935467e-07, + "loss": 0.5693, + "step": 71382 + }, + { + "epoch": 0.8923223080577014, + "grad_norm": 1.0674934387207031, + "learning_rate": 6.981032099155672e-07, + "loss": 0.7354, + "step": 71384 + }, + { + "epoch": 0.892347308682717, + "grad_norm": 2.2713229656219482, + "learning_rate": 6.977828993795177e-07, + "loss": 1.7908, + "step": 71386 + }, + { + "epoch": 0.8923723093077327, + "grad_norm": 1.497519612312317, + "learning_rate": 6.974626596878387e-07, + "loss": 0.6767, + "step": 71388 + }, + { + "epoch": 0.8923973099327484, + "grad_norm": 0.0012473521055653691, + "learning_rate": 6.971424908429691e-07, + "loss": 0.5682, + "step": 71390 + }, + { + "epoch": 0.892422310557764, + "grad_norm": 3.886098861694336, + "learning_rate": 6.968223928473461e-07, + "loss": 0.732, + "step": 71392 + }, + { + "epoch": 0.8924473111827795, + "grad_norm": 1.0330443382263184, + "learning_rate": 6.965023657034075e-07, + "loss": 0.3154, + "step": 71394 + }, + { + "epoch": 0.8924723118077952, + "grad_norm": 0.00018167795496992767, + "learning_rate": 6.961824094135927e-07, + "loss": 0.0, + "step": 71396 + }, + { + "epoch": 0.8924973124328108, + "grad_norm": 3.1345150470733643, + "learning_rate": 6.958625239803352e-07, + "loss": 0.9797, + "step": 71398 + }, + { + "epoch": 0.8925223130578265, + "grad_norm": 3.607609987258911, + "learning_rate": 6.955427094060752e-07, + "loss": 1.4588, + "step": 71400 + }, + { + "epoch": 0.8925473136828421, + "grad_norm": 16.776588439941406, + "learning_rate": 6.952229656932441e-07, + "loss": 0.5431, + "step": 71402 + }, + { + "epoch": 0.8925723143078577, + "grad_norm": 1.5427472591400146, + "learning_rate": 6.949032928442801e-07, + "loss": 0.1536, + "step": 71404 + }, + { + "epoch": 0.8925973149328733, + "grad_norm": 2.5092294216156006, + "learning_rate": 6.945836908616177e-07, + "loss": 1.2851, + "step": 71406 + }, + { + "epoch": 0.892622315557889, + "grad_norm": 6.061415195465088, + "learning_rate": 6.942641597476874e-07, + "loss": 0.6991, + "step": 71408 + }, + { + "epoch": 0.8926473161829046, + "grad_norm": 3.833293914794922, + "learning_rate": 6.939446995049282e-07, + "loss": 0.5499, + "step": 71410 + }, + { + "epoch": 0.8926723168079203, + "grad_norm": 3.951551675796509, + "learning_rate": 6.93625310135767e-07, + "loss": 0.9153, + "step": 71412 + }, + { + "epoch": 0.8926973174329358, + "grad_norm": 1.1834944486618042, + "learning_rate": 6.933059916426421e-07, + "loss": 0.2147, + "step": 71414 + }, + { + "epoch": 0.8927223180579514, + "grad_norm": 3.886702299118042, + "learning_rate": 6.929867440279814e-07, + "loss": 1.2686, + "step": 71416 + }, + { + "epoch": 0.8927473186829671, + "grad_norm": 4.732375621795654, + "learning_rate": 6.926675672942185e-07, + "loss": 0.9844, + "step": 71418 + }, + { + "epoch": 0.8927723193079827, + "grad_norm": 2.3517510890960693, + "learning_rate": 6.923484614437837e-07, + "loss": 1.2117, + "step": 71420 + }, + { + "epoch": 0.8927973199329984, + "grad_norm": 3.799771547317505, + "learning_rate": 6.920294264791039e-07, + "loss": 1.3004, + "step": 71422 + }, + { + "epoch": 0.8928223205580139, + "grad_norm": 0.0004811969120055437, + "learning_rate": 6.91710462402615e-07, + "loss": 0.7389, + "step": 71424 + }, + { + "epoch": 0.8928473211830296, + "grad_norm": 4.914641857147217, + "learning_rate": 6.913915692167406e-07, + "loss": 1.1601, + "step": 71426 + }, + { + "epoch": 0.8928723218080452, + "grad_norm": 6.194427013397217, + "learning_rate": 6.910727469239131e-07, + "loss": 0.1244, + "step": 71428 + }, + { + "epoch": 0.8928973224330609, + "grad_norm": 1.8055939674377441, + "learning_rate": 6.907539955265574e-07, + "loss": 1.3848, + "step": 71430 + }, + { + "epoch": 0.8929223230580765, + "grad_norm": 5.317751884460449, + "learning_rate": 6.904353150271048e-07, + "loss": 0.6302, + "step": 71432 + }, + { + "epoch": 0.892947323683092, + "grad_norm": 3.971651077270508, + "learning_rate": 6.9011670542798e-07, + "loss": 1.7398, + "step": 71434 + }, + { + "epoch": 0.8929723243081077, + "grad_norm": 0.0002964229788631201, + "learning_rate": 6.897981667316089e-07, + "loss": 0.7648, + "step": 71436 + }, + { + "epoch": 0.8929973249331233, + "grad_norm": 8.491313934326172, + "learning_rate": 6.894796989404196e-07, + "loss": 1.5366, + "step": 71438 + }, + { + "epoch": 0.893022325558139, + "grad_norm": 0.0014024751726537943, + "learning_rate": 6.891613020568345e-07, + "loss": 0.0, + "step": 71440 + }, + { + "epoch": 0.8930473261831546, + "grad_norm": 4.098048210144043, + "learning_rate": 6.888429760832816e-07, + "loss": 1.1276, + "step": 71442 + }, + { + "epoch": 0.8930723268081702, + "grad_norm": 0.00023786256497260183, + "learning_rate": 6.885247210221846e-07, + "loss": 0.7236, + "step": 71444 + }, + { + "epoch": 0.8930973274331858, + "grad_norm": 1.6939977407455444, + "learning_rate": 6.88206536875965e-07, + "loss": 0.4295, + "step": 71446 + }, + { + "epoch": 0.8931223280582015, + "grad_norm": 0.23042792081832886, + "learning_rate": 6.878884236470484e-07, + "loss": 0.6358, + "step": 71448 + }, + { + "epoch": 0.8931473286832171, + "grad_norm": 2.9551777839660645, + "learning_rate": 6.875703813378554e-07, + "loss": 0.9773, + "step": 71450 + }, + { + "epoch": 0.8931723293082328, + "grad_norm": 5.190028190612793, + "learning_rate": 6.872524099508115e-07, + "loss": 1.3625, + "step": 71452 + }, + { + "epoch": 0.8931973299332483, + "grad_norm": 4.036556720733643, + "learning_rate": 6.869345094883339e-07, + "loss": 0.7569, + "step": 71454 + }, + { + "epoch": 0.8932223305582639, + "grad_norm": 7.696493148803711, + "learning_rate": 6.866166799528484e-07, + "loss": 2.168, + "step": 71456 + }, + { + "epoch": 0.8932473311832796, + "grad_norm": 3.592153310775757, + "learning_rate": 6.86298921346773e-07, + "loss": 1.8318, + "step": 71458 + }, + { + "epoch": 0.8932723318082952, + "grad_norm": 3.3353967666625977, + "learning_rate": 6.859812336725269e-07, + "loss": 0.6302, + "step": 71460 + }, + { + "epoch": 0.8932973324333109, + "grad_norm": 0.0005163667956367135, + "learning_rate": 6.856636169325315e-07, + "loss": 0.0001, + "step": 71462 + }, + { + "epoch": 0.8933223330583264, + "grad_norm": 1.9125083684921265, + "learning_rate": 6.853460711292037e-07, + "loss": 0.6679, + "step": 71464 + }, + { + "epoch": 0.8933473336833421, + "grad_norm": 0.00043652928434312344, + "learning_rate": 6.850285962649638e-07, + "loss": 0.1717, + "step": 71466 + }, + { + "epoch": 0.8933723343083577, + "grad_norm": 13.349382400512695, + "learning_rate": 6.847111923422278e-07, + "loss": 1.4603, + "step": 71468 + }, + { + "epoch": 0.8933973349333734, + "grad_norm": 5.906194686889648, + "learning_rate": 6.843938593634158e-07, + "loss": 1.9424, + "step": 71470 + }, + { + "epoch": 0.893422335558389, + "grad_norm": 2.1259729862213135, + "learning_rate": 6.840765973309427e-07, + "loss": 0.1383, + "step": 71472 + }, + { + "epoch": 0.8934473361834046, + "grad_norm": 0.00044295101542957127, + "learning_rate": 6.83759406247223e-07, + "loss": 0.4777, + "step": 71474 + }, + { + "epoch": 0.8934723368084202, + "grad_norm": 3.823965072631836, + "learning_rate": 6.834422861146762e-07, + "loss": 1.4317, + "step": 71476 + }, + { + "epoch": 0.8934973374334358, + "grad_norm": 0.4904816150665283, + "learning_rate": 6.831252369357145e-07, + "loss": 0.1835, + "step": 71478 + }, + { + "epoch": 0.8935223380584515, + "grad_norm": 4.680001735687256, + "learning_rate": 6.828082587127538e-07, + "loss": 1.513, + "step": 71480 + }, + { + "epoch": 0.8935473386834671, + "grad_norm": 4.19504976272583, + "learning_rate": 6.824913514482067e-07, + "loss": 1.5019, + "step": 71482 + }, + { + "epoch": 0.8935723393084827, + "grad_norm": 0.03254720941185951, + "learning_rate": 6.821745151444892e-07, + "loss": 0.8518, + "step": 71484 + }, + { + "epoch": 0.8935973399334983, + "grad_norm": 2.555418014526367, + "learning_rate": 6.818577498040136e-07, + "loss": 0.7004, + "step": 71486 + }, + { + "epoch": 0.893622340558514, + "grad_norm": 2.665306806564331, + "learning_rate": 6.815410554291901e-07, + "loss": 0.1984, + "step": 71488 + }, + { + "epoch": 0.8936473411835296, + "grad_norm": 2.540961980819702, + "learning_rate": 6.812244320224327e-07, + "loss": 1.0322, + "step": 71490 + }, + { + "epoch": 0.8936723418085453, + "grad_norm": 7.99270486831665, + "learning_rate": 6.809078795861513e-07, + "loss": 1.274, + "step": 71492 + }, + { + "epoch": 0.8936973424335608, + "grad_norm": 3.8022077083587646, + "learning_rate": 6.805913981227597e-07, + "loss": 1.6704, + "step": 71494 + }, + { + "epoch": 0.8937223430585765, + "grad_norm": 4.660725116729736, + "learning_rate": 6.802749876346659e-07, + "loss": 1.0756, + "step": 71496 + }, + { + "epoch": 0.8937473436835921, + "grad_norm": 2.6832659244537354, + "learning_rate": 6.799586481242771e-07, + "loss": 1.0161, + "step": 71498 + }, + { + "epoch": 0.8937723443086077, + "grad_norm": 3.8936750888824463, + "learning_rate": 6.796423795940088e-07, + "loss": 1.1838, + "step": 71500 + }, + { + "epoch": 0.8937973449336234, + "grad_norm": 1.6918054819107056, + "learning_rate": 6.793261820462626e-07, + "loss": 0.3121, + "step": 71502 + }, + { + "epoch": 0.8938223455586389, + "grad_norm": 2.7953476905822754, + "learning_rate": 6.790100554834533e-07, + "loss": 0.9671, + "step": 71504 + }, + { + "epoch": 0.8938473461836546, + "grad_norm": 0.5457997918128967, + "learning_rate": 6.786939999079823e-07, + "loss": 0.8182, + "step": 71506 + }, + { + "epoch": 0.8938723468086702, + "grad_norm": 2.0526959896087646, + "learning_rate": 6.783780153222608e-07, + "loss": 0.221, + "step": 71508 + }, + { + "epoch": 0.8938973474336859, + "grad_norm": 5.673611640930176, + "learning_rate": 6.78062101728697e-07, + "loss": 2.4382, + "step": 71510 + }, + { + "epoch": 0.8939223480587015, + "grad_norm": 5.31222677230835, + "learning_rate": 6.777462591296913e-07, + "loss": 2.4133, + "step": 71512 + }, + { + "epoch": 0.8939473486837171, + "grad_norm": 3.37869930267334, + "learning_rate": 6.774304875276527e-07, + "loss": 1.4064, + "step": 71514 + }, + { + "epoch": 0.8939723493087327, + "grad_norm": 8.57203197479248, + "learning_rate": 6.771147869249827e-07, + "loss": 1.2029, + "step": 71516 + }, + { + "epoch": 0.8939973499337484, + "grad_norm": 3.9588844776153564, + "learning_rate": 6.767991573240918e-07, + "loss": 0.7024, + "step": 71518 + }, + { + "epoch": 0.894022350558764, + "grad_norm": 5.45583438873291, + "learning_rate": 6.764835987273765e-07, + "loss": 1.3242, + "step": 71520 + }, + { + "epoch": 0.8940473511837796, + "grad_norm": 4.874203205108643, + "learning_rate": 6.761681111372465e-07, + "loss": 1.2401, + "step": 71522 + }, + { + "epoch": 0.8940723518087952, + "grad_norm": 4.570105075836182, + "learning_rate": 6.758526945561017e-07, + "loss": 1.273, + "step": 71524 + }, + { + "epoch": 0.8940973524338108, + "grad_norm": 2.82539439201355, + "learning_rate": 6.755373489863415e-07, + "loss": 1.108, + "step": 71526 + }, + { + "epoch": 0.8941223530588265, + "grad_norm": 4.142199516296387, + "learning_rate": 6.752220744303728e-07, + "loss": 1.8461, + "step": 71528 + }, + { + "epoch": 0.8941473536838421, + "grad_norm": 1.0536420345306396, + "learning_rate": 6.749068708905926e-07, + "loss": 0.8822, + "step": 71530 + }, + { + "epoch": 0.8941723543088578, + "grad_norm": 11.453902244567871, + "learning_rate": 6.745917383694045e-07, + "loss": 1.2221, + "step": 71532 + }, + { + "epoch": 0.8941973549338733, + "grad_norm": 5.921652317047119, + "learning_rate": 6.742766768692056e-07, + "loss": 2.0342, + "step": 71534 + }, + { + "epoch": 0.894222355558889, + "grad_norm": 2.85861873626709, + "learning_rate": 6.73961686392397e-07, + "loss": 1.6366, + "step": 71536 + }, + { + "epoch": 0.8942473561839046, + "grad_norm": 4.462713718414307, + "learning_rate": 6.736467669413793e-07, + "loss": 1.0348, + "step": 71538 + }, + { + "epoch": 0.8942723568089203, + "grad_norm": 2.5085299015045166, + "learning_rate": 6.733319185185461e-07, + "loss": 0.1447, + "step": 71540 + }, + { + "epoch": 0.8942973574339359, + "grad_norm": 8.594169616699219, + "learning_rate": 6.730171411262998e-07, + "loss": 1.4281, + "step": 71542 + }, + { + "epoch": 0.8943223580589514, + "grad_norm": 6.316748142242432, + "learning_rate": 6.727024347670341e-07, + "loss": 0.4743, + "step": 71544 + }, + { + "epoch": 0.8943473586839671, + "grad_norm": 3.618730068206787, + "learning_rate": 6.723877994431483e-07, + "loss": 1.6445, + "step": 71546 + }, + { + "epoch": 0.8943723593089827, + "grad_norm": 0.408223956823349, + "learning_rate": 6.720732351570414e-07, + "loss": 0.7171, + "step": 71548 + }, + { + "epoch": 0.8943973599339984, + "grad_norm": 1.9773927927017212, + "learning_rate": 6.717587419111016e-07, + "loss": 0.6482, + "step": 71550 + }, + { + "epoch": 0.894422360559014, + "grad_norm": 2.8430728912353516, + "learning_rate": 6.714443197077303e-07, + "loss": 1.2076, + "step": 71552 + }, + { + "epoch": 0.8944473611840296, + "grad_norm": 0.0004517212510108948, + "learning_rate": 6.711299685493189e-07, + "loss": 0.7153, + "step": 71554 + }, + { + "epoch": 0.8944723618090452, + "grad_norm": 7.015354633331299, + "learning_rate": 6.708156884382633e-07, + "loss": 0.8797, + "step": 71556 + }, + { + "epoch": 0.8944973624340609, + "grad_norm": 2.7472646236419678, + "learning_rate": 6.705014793769537e-07, + "loss": 0.9225, + "step": 71558 + }, + { + "epoch": 0.8945223630590765, + "grad_norm": 1.7751413583755493, + "learning_rate": 6.701873413677861e-07, + "loss": 0.1483, + "step": 71560 + }, + { + "epoch": 0.8945473636840922, + "grad_norm": 2.890319347381592, + "learning_rate": 6.698732744131564e-07, + "loss": 0.5406, + "step": 71562 + }, + { + "epoch": 0.8945723643091077, + "grad_norm": 3.496263027191162, + "learning_rate": 6.695592785154492e-07, + "loss": 1.2752, + "step": 71564 + }, + { + "epoch": 0.8945973649341233, + "grad_norm": 3.6323249340057373, + "learning_rate": 6.692453536770593e-07, + "loss": 1.8014, + "step": 71566 + }, + { + "epoch": 0.894622365559139, + "grad_norm": 2.11185622215271, + "learning_rate": 6.68931499900377e-07, + "loss": 0.1095, + "step": 71568 + }, + { + "epoch": 0.8946473661841546, + "grad_norm": 3.7936689853668213, + "learning_rate": 6.686177171877928e-07, + "loss": 0.6752, + "step": 71570 + }, + { + "epoch": 0.8946723668091703, + "grad_norm": 2.937854528427124, + "learning_rate": 6.683040055416967e-07, + "loss": 0.9471, + "step": 71572 + }, + { + "epoch": 0.8946973674341858, + "grad_norm": 5.202270030975342, + "learning_rate": 6.679903649644781e-07, + "loss": 1.8633, + "step": 71574 + }, + { + "epoch": 0.8947223680592015, + "grad_norm": 1.568184494972229, + "learning_rate": 6.676767954585273e-07, + "loss": 1.0484, + "step": 71576 + }, + { + "epoch": 0.8947473686842171, + "grad_norm": 2.4417216777801514, + "learning_rate": 6.673632970262267e-07, + "loss": 0.5537, + "step": 71578 + }, + { + "epoch": 0.8947723693092328, + "grad_norm": 0.3996679484844208, + "learning_rate": 6.670498696699701e-07, + "loss": 1.1172, + "step": 71580 + }, + { + "epoch": 0.8947973699342484, + "grad_norm": 2.3226451873779297, + "learning_rate": 6.667365133921389e-07, + "loss": 0.5383, + "step": 71582 + }, + { + "epoch": 0.894822370559264, + "grad_norm": 3.202134132385254, + "learning_rate": 6.664232281951244e-07, + "loss": 0.713, + "step": 71584 + }, + { + "epoch": 0.8948473711842796, + "grad_norm": 5.219593048095703, + "learning_rate": 6.661100140813104e-07, + "loss": 0.5192, + "step": 71586 + }, + { + "epoch": 0.8948723718092952, + "grad_norm": 0.05790723115205765, + "learning_rate": 6.657968710530838e-07, + "loss": 0.425, + "step": 71588 + }, + { + "epoch": 0.8948973724343109, + "grad_norm": 4.759683132171631, + "learning_rate": 6.654837991128282e-07, + "loss": 1.2144, + "step": 71590 + }, + { + "epoch": 0.8949223730593265, + "grad_norm": 2.493774890899658, + "learning_rate": 6.651707982629263e-07, + "loss": 0.9405, + "step": 71592 + }, + { + "epoch": 0.8949473736843421, + "grad_norm": 5.970463752746582, + "learning_rate": 6.648578685057638e-07, + "loss": 1.9237, + "step": 71594 + }, + { + "epoch": 0.8949723743093577, + "grad_norm": 2.1204922199249268, + "learning_rate": 6.645450098437245e-07, + "loss": 1.4194, + "step": 71596 + }, + { + "epoch": 0.8949973749343734, + "grad_norm": 3.0125482082366943, + "learning_rate": 6.642322222791887e-07, + "loss": 0.8225, + "step": 71598 + }, + { + "epoch": 0.895022375559389, + "grad_norm": 6.153406143188477, + "learning_rate": 6.63919505814542e-07, + "loss": 0.5393, + "step": 71600 + }, + { + "epoch": 0.8950473761844047, + "grad_norm": 5.987281799316406, + "learning_rate": 6.63606860452164e-07, + "loss": 0.839, + "step": 71602 + }, + { + "epoch": 0.8950723768094202, + "grad_norm": 6.6416401863098145, + "learning_rate": 6.632942861944369e-07, + "loss": 0.989, + "step": 71604 + }, + { + "epoch": 0.8950973774344358, + "grad_norm": 1.3456116914749146, + "learning_rate": 6.629817830437368e-07, + "loss": 0.1522, + "step": 71606 + }, + { + "epoch": 0.8951223780594515, + "grad_norm": 4.4945173263549805, + "learning_rate": 6.626693510024485e-07, + "loss": 1.6905, + "step": 71608 + }, + { + "epoch": 0.8951473786844671, + "grad_norm": 0.00046176830073818564, + "learning_rate": 6.62356990072951e-07, + "loss": 0.6515, + "step": 71610 + }, + { + "epoch": 0.8951723793094828, + "grad_norm": 4.144737720489502, + "learning_rate": 6.620447002576202e-07, + "loss": 1.2127, + "step": 71612 + }, + { + "epoch": 0.8951973799344983, + "grad_norm": 0.0005151600926183164, + "learning_rate": 6.6173248155884e-07, + "loss": 0.5863, + "step": 71614 + }, + { + "epoch": 0.895222380559514, + "grad_norm": 0.6022035479545593, + "learning_rate": 6.614203339789815e-07, + "loss": 0.5268, + "step": 71616 + }, + { + "epoch": 0.8952473811845296, + "grad_norm": 12.262979507446289, + "learning_rate": 6.611082575204275e-07, + "loss": 1.4353, + "step": 71618 + }, + { + "epoch": 0.8952723818095453, + "grad_norm": 5.2130889892578125, + "learning_rate": 6.607962521855505e-07, + "loss": 0.2792, + "step": 71620 + }, + { + "epoch": 0.8952973824345609, + "grad_norm": 10.318476676940918, + "learning_rate": 6.604843179767273e-07, + "loss": 1.9708, + "step": 71622 + }, + { + "epoch": 0.8953223830595765, + "grad_norm": 2.217674732208252, + "learning_rate": 6.601724548963373e-07, + "loss": 0.3136, + "step": 71624 + }, + { + "epoch": 0.8953473836845921, + "grad_norm": 1.0611120462417603, + "learning_rate": 6.598606629467519e-07, + "loss": 0.2444, + "step": 71626 + }, + { + "epoch": 0.8953723843096077, + "grad_norm": 3.98946213722229, + "learning_rate": 6.595489421303491e-07, + "loss": 0.9206, + "step": 71628 + }, + { + "epoch": 0.8953973849346234, + "grad_norm": 3.3757262229919434, + "learning_rate": 6.592372924494983e-07, + "loss": 0.8439, + "step": 71630 + }, + { + "epoch": 0.895422385559639, + "grad_norm": 2.4441707134246826, + "learning_rate": 6.589257139065752e-07, + "loss": 0.9464, + "step": 71632 + }, + { + "epoch": 0.8954473861846546, + "grad_norm": 2.4414634704589844, + "learning_rate": 6.586142065039546e-07, + "loss": 0.7276, + "step": 71634 + }, + { + "epoch": 0.8954723868096702, + "grad_norm": 5.7215800285339355, + "learning_rate": 6.58302770244006e-07, + "loss": 1.0968, + "step": 71636 + }, + { + "epoch": 0.8954973874346859, + "grad_norm": 0.012668797746300697, + "learning_rate": 6.579914051291026e-07, + "loss": 0.6543, + "step": 71638 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 4.498831272125244, + "learning_rate": 6.576801111616171e-07, + "loss": 0.8457, + "step": 71640 + }, + { + "epoch": 0.8955473886847172, + "grad_norm": 0.5577155947685242, + "learning_rate": 6.573688883439178e-07, + "loss": 0.2903, + "step": 71642 + }, + { + "epoch": 0.8955723893097327, + "grad_norm": 0.6462774276733398, + "learning_rate": 6.570577366783748e-07, + "loss": 0.1984, + "step": 71644 + }, + { + "epoch": 0.8955973899347484, + "grad_norm": 2.016265869140625, + "learning_rate": 6.567466561673585e-07, + "loss": 0.9722, + "step": 71646 + }, + { + "epoch": 0.895622390559764, + "grad_norm": 4.621136665344238, + "learning_rate": 6.564356468132405e-07, + "loss": 0.9028, + "step": 71648 + }, + { + "epoch": 0.8956473911847797, + "grad_norm": 0.02853524312376976, + "learning_rate": 6.561247086183853e-07, + "loss": 0.9774, + "step": 71650 + }, + { + "epoch": 0.8956723918097953, + "grad_norm": 0.6657100319862366, + "learning_rate": 6.558138415851645e-07, + "loss": 0.8609, + "step": 71652 + }, + { + "epoch": 0.8956973924348108, + "grad_norm": 2.823183059692383, + "learning_rate": 6.55503045715944e-07, + "loss": 1.327, + "step": 71654 + }, + { + "epoch": 0.8957223930598265, + "grad_norm": 2.546865224838257, + "learning_rate": 6.551923210130895e-07, + "loss": 1.5464, + "step": 71656 + }, + { + "epoch": 0.8957473936848421, + "grad_norm": 1.105271339416504, + "learning_rate": 6.548816674789704e-07, + "loss": 0.3651, + "step": 71658 + }, + { + "epoch": 0.8957723943098578, + "grad_norm": 0.4971705377101898, + "learning_rate": 6.545710851159503e-07, + "loss": 0.6962, + "step": 71660 + }, + { + "epoch": 0.8957973949348734, + "grad_norm": 1.6211106777191162, + "learning_rate": 6.542605739263963e-07, + "loss": 0.8906, + "step": 71662 + }, + { + "epoch": 0.895822395559889, + "grad_norm": 3.24108624458313, + "learning_rate": 6.539501339126697e-07, + "loss": 1.0539, + "step": 71664 + }, + { + "epoch": 0.8958473961849046, + "grad_norm": 0.1294233500957489, + "learning_rate": 6.536397650771409e-07, + "loss": 0.1955, + "step": 71666 + }, + { + "epoch": 0.8958723968099203, + "grad_norm": 0.0003116870648227632, + "learning_rate": 6.533294674221668e-07, + "loss": 0.4148, + "step": 71668 + }, + { + "epoch": 0.8958973974349359, + "grad_norm": 3.051715135574341, + "learning_rate": 6.530192409501146e-07, + "loss": 0.4943, + "step": 71670 + }, + { + "epoch": 0.8959223980599516, + "grad_norm": 3.0218374729156494, + "learning_rate": 6.527090856633467e-07, + "loss": 0.4839, + "step": 71672 + }, + { + "epoch": 0.8959473986849671, + "grad_norm": 0.007770870812237263, + "learning_rate": 6.523990015642223e-07, + "loss": 0.4844, + "step": 71674 + }, + { + "epoch": 0.8959723993099827, + "grad_norm": 3.22566294670105, + "learning_rate": 6.520889886551085e-07, + "loss": 1.6352, + "step": 71676 + }, + { + "epoch": 0.8959973999349984, + "grad_norm": 8.330930709838867, + "learning_rate": 6.5177904693836e-07, + "loss": 0.4879, + "step": 71678 + }, + { + "epoch": 0.896022400560014, + "grad_norm": 2.993966817855835, + "learning_rate": 6.51469176416345e-07, + "loss": 0.6666, + "step": 71680 + }, + { + "epoch": 0.8960474011850297, + "grad_norm": 0.0002988508203998208, + "learning_rate": 6.511593770914137e-07, + "loss": 0.4567, + "step": 71682 + }, + { + "epoch": 0.8960724018100452, + "grad_norm": 2.3572614192962646, + "learning_rate": 6.508496489659322e-07, + "loss": 0.9728, + "step": 71684 + }, + { + "epoch": 0.8960974024350609, + "grad_norm": 1.9545663595199585, + "learning_rate": 6.505399920422583e-07, + "loss": 0.7355, + "step": 71686 + }, + { + "epoch": 0.8961224030600765, + "grad_norm": 0.0027764507103711367, + "learning_rate": 6.502304063227483e-07, + "loss": 0.5278, + "step": 71688 + }, + { + "epoch": 0.8961474036850922, + "grad_norm": 2.1799144744873047, + "learning_rate": 6.499208918097633e-07, + "loss": 0.9003, + "step": 71690 + }, + { + "epoch": 0.8961724043101078, + "grad_norm": 8.44062614440918, + "learning_rate": 6.496114485056581e-07, + "loss": 1.2961, + "step": 71692 + }, + { + "epoch": 0.8961974049351233, + "grad_norm": 4.290688514709473, + "learning_rate": 6.493020764127889e-07, + "loss": 1.3502, + "step": 71694 + }, + { + "epoch": 0.896222405560139, + "grad_norm": 0.04535983130335808, + "learning_rate": 6.489927755335134e-07, + "loss": 0.501, + "step": 71696 + }, + { + "epoch": 0.8962474061851546, + "grad_norm": 2.7424352169036865, + "learning_rate": 6.486835458701857e-07, + "loss": 0.7426, + "step": 71698 + }, + { + "epoch": 0.8962724068101703, + "grad_norm": 4.613335609436035, + "learning_rate": 6.483743874251636e-07, + "loss": 0.9418, + "step": 71700 + }, + { + "epoch": 0.8962974074351859, + "grad_norm": 4.2923688888549805, + "learning_rate": 6.480653002007975e-07, + "loss": 0.8161, + "step": 71702 + }, + { + "epoch": 0.8963224080602015, + "grad_norm": 0.06218844652175903, + "learning_rate": 6.477562841994455e-07, + "loss": 0.5138, + "step": 71704 + }, + { + "epoch": 0.8963474086852171, + "grad_norm": 0.00032524202833883464, + "learning_rate": 6.474473394234593e-07, + "loss": 1.0639, + "step": 71706 + }, + { + "epoch": 0.8963724093102328, + "grad_norm": 4.054293155670166, + "learning_rate": 6.47138465875189e-07, + "loss": 1.2642, + "step": 71708 + }, + { + "epoch": 0.8963974099352484, + "grad_norm": 7.252566814422607, + "learning_rate": 6.468296635569926e-07, + "loss": 1.6129, + "step": 71710 + }, + { + "epoch": 0.8964224105602641, + "grad_norm": 0.7376516461372375, + "learning_rate": 6.465209324712163e-07, + "loss": 0.9713, + "step": 71712 + }, + { + "epoch": 0.8964474111852796, + "grad_norm": 4.351621150970459, + "learning_rate": 6.462122726202158e-07, + "loss": 1.7447, + "step": 71714 + }, + { + "epoch": 0.8964724118102952, + "grad_norm": 2.9610111713409424, + "learning_rate": 6.45903684006337e-07, + "loss": 0.2563, + "step": 71716 + }, + { + "epoch": 0.8964974124353109, + "grad_norm": 2.8338656425476074, + "learning_rate": 6.45595166631936e-07, + "loss": 1.1123, + "step": 71718 + }, + { + "epoch": 0.8965224130603265, + "grad_norm": 5.2219557762146, + "learning_rate": 6.452867204993596e-07, + "loss": 0.3198, + "step": 71720 + }, + { + "epoch": 0.8965474136853422, + "grad_norm": 3.0830001831054688, + "learning_rate": 6.449783456109537e-07, + "loss": 0.5751, + "step": 71722 + }, + { + "epoch": 0.8965724143103577, + "grad_norm": 2.2027194499969482, + "learning_rate": 6.446700419690721e-07, + "loss": 0.3533, + "step": 71724 + }, + { + "epoch": 0.8965974149353734, + "grad_norm": 0.6329027414321899, + "learning_rate": 6.443618095760596e-07, + "loss": 0.6874, + "step": 71726 + }, + { + "epoch": 0.896622415560389, + "grad_norm": 2.352517604827881, + "learning_rate": 6.440536484342652e-07, + "loss": 0.7035, + "step": 71728 + }, + { + "epoch": 0.8966474161854047, + "grad_norm": 6.7729268074035645, + "learning_rate": 6.437455585460339e-07, + "loss": 0.9518, + "step": 71730 + }, + { + "epoch": 0.8966724168104203, + "grad_norm": 2.276519298553467, + "learning_rate": 6.434375399137149e-07, + "loss": 0.6202, + "step": 71732 + }, + { + "epoch": 0.8966974174354359, + "grad_norm": 3.786898612976074, + "learning_rate": 6.43129592539653e-07, + "loss": 1.8132, + "step": 71734 + }, + { + "epoch": 0.8967224180604515, + "grad_norm": 0.5030069947242737, + "learning_rate": 6.428217164261908e-07, + "loss": 1.4197, + "step": 71736 + }, + { + "epoch": 0.8967474186854671, + "grad_norm": 0.0007054125308059156, + "learning_rate": 6.425139115756763e-07, + "loss": 0.0091, + "step": 71738 + }, + { + "epoch": 0.8967724193104828, + "grad_norm": 6.204906463623047, + "learning_rate": 6.422061779904521e-07, + "loss": 1.472, + "step": 71740 + }, + { + "epoch": 0.8967974199354984, + "grad_norm": 2.0373404026031494, + "learning_rate": 6.41898515672863e-07, + "loss": 0.4024, + "step": 71742 + }, + { + "epoch": 0.896822420560514, + "grad_norm": 4.474635124206543, + "learning_rate": 6.415909246252516e-07, + "loss": 0.945, + "step": 71744 + }, + { + "epoch": 0.8968474211855296, + "grad_norm": 4.430233478546143, + "learning_rate": 6.412834048499594e-07, + "loss": 1.4787, + "step": 71746 + }, + { + "epoch": 0.8968724218105453, + "grad_norm": 5.986750602722168, + "learning_rate": 6.409759563493301e-07, + "loss": 0.744, + "step": 71748 + }, + { + "epoch": 0.8968974224355609, + "grad_norm": 3.3331727981567383, + "learning_rate": 6.406685791257028e-07, + "loss": 1.1365, + "step": 71750 + }, + { + "epoch": 0.8969224230605766, + "grad_norm": 1.7373937368392944, + "learning_rate": 6.403612731814223e-07, + "loss": 0.2173, + "step": 71752 + }, + { + "epoch": 0.8969474236855921, + "grad_norm": 0.0006696981145069003, + "learning_rate": 6.400540385188248e-07, + "loss": 0.3912, + "step": 71754 + }, + { + "epoch": 0.8969724243106078, + "grad_norm": 4.298147678375244, + "learning_rate": 6.397468751402536e-07, + "loss": 1.1993, + "step": 71756 + }, + { + "epoch": 0.8969974249356234, + "grad_norm": 7.259187698364258, + "learning_rate": 6.39439783048047e-07, + "loss": 0.6886, + "step": 71758 + }, + { + "epoch": 0.897022425560639, + "grad_norm": 2.392155885696411, + "learning_rate": 6.39132762244542e-07, + "loss": 0.6017, + "step": 71760 + }, + { + "epoch": 0.8970474261856547, + "grad_norm": 8.73050594329834, + "learning_rate": 6.38825812732079e-07, + "loss": 1.9783, + "step": 71762 + }, + { + "epoch": 0.8970724268106702, + "grad_norm": 0.00020387685799505562, + "learning_rate": 6.385189345129938e-07, + "loss": 0.5373, + "step": 71764 + }, + { + "epoch": 0.8970974274356859, + "grad_norm": 5.162194728851318, + "learning_rate": 6.382121275896258e-07, + "loss": 0.8063, + "step": 71766 + }, + { + "epoch": 0.8971224280607015, + "grad_norm": 0.030083833262324333, + "learning_rate": 6.379053919643086e-07, + "loss": 0.1394, + "step": 71768 + }, + { + "epoch": 0.8971474286857172, + "grad_norm": 3.101632595062256, + "learning_rate": 6.375987276393813e-07, + "loss": 0.1866, + "step": 71770 + }, + { + "epoch": 0.8971724293107328, + "grad_norm": 3.049875020980835, + "learning_rate": 6.37292134617179e-07, + "loss": 1.1142, + "step": 71772 + }, + { + "epoch": 0.8971974299357484, + "grad_norm": 9.101057052612305, + "learning_rate": 6.36985612900034e-07, + "loss": 2.2404, + "step": 71774 + }, + { + "epoch": 0.897222430560764, + "grad_norm": 1.664167881011963, + "learning_rate": 6.366791624902835e-07, + "loss": 1.3041, + "step": 71776 + }, + { + "epoch": 0.8972474311857797, + "grad_norm": 0.0003391015634406358, + "learning_rate": 6.3637278339026e-07, + "loss": 0.518, + "step": 71778 + }, + { + "epoch": 0.8972724318107953, + "grad_norm": 3.447622060775757, + "learning_rate": 6.360664756022972e-07, + "loss": 0.4687, + "step": 71780 + }, + { + "epoch": 0.897297432435811, + "grad_norm": 2.525104522705078, + "learning_rate": 6.357602391287276e-07, + "loss": 1.1713, + "step": 71782 + }, + { + "epoch": 0.8973224330608265, + "grad_norm": 2.079413414001465, + "learning_rate": 6.354540739718851e-07, + "loss": 0.784, + "step": 71784 + }, + { + "epoch": 0.8973474336858421, + "grad_norm": 4.0645270347595215, + "learning_rate": 6.351479801340999e-07, + "loss": 1.1028, + "step": 71786 + }, + { + "epoch": 0.8973724343108578, + "grad_norm": 12.339452743530273, + "learning_rate": 6.348419576177023e-07, + "loss": 1.0544, + "step": 71788 + }, + { + "epoch": 0.8973974349358734, + "grad_norm": 4.057015895843506, + "learning_rate": 6.34536006425025e-07, + "loss": 0.7705, + "step": 71790 + }, + { + "epoch": 0.8974224355608891, + "grad_norm": 5.235095500946045, + "learning_rate": 6.34230126558395e-07, + "loss": 0.923, + "step": 71792 + }, + { + "epoch": 0.8974474361859046, + "grad_norm": 3.5011463165283203, + "learning_rate": 6.339243180201471e-07, + "loss": 1.3855, + "step": 71794 + }, + { + "epoch": 0.8974724368109203, + "grad_norm": 5.35679292678833, + "learning_rate": 6.33618580812606e-07, + "loss": 0.3751, + "step": 71796 + }, + { + "epoch": 0.8974974374359359, + "grad_norm": 4.2483625411987305, + "learning_rate": 6.333129149381001e-07, + "loss": 0.8007, + "step": 71798 + }, + { + "epoch": 0.8975224380609516, + "grad_norm": 3.547447919845581, + "learning_rate": 6.330073203989595e-07, + "loss": 0.6514, + "step": 71800 + }, + { + "epoch": 0.8975474386859672, + "grad_norm": 2.5966062545776367, + "learning_rate": 6.327017971975091e-07, + "loss": 1.7353, + "step": 71802 + }, + { + "epoch": 0.8975724393109827, + "grad_norm": 3.0703554153442383, + "learning_rate": 6.323963453360793e-07, + "loss": 2.1909, + "step": 71804 + }, + { + "epoch": 0.8975974399359984, + "grad_norm": 3.1906895637512207, + "learning_rate": 6.320909648169915e-07, + "loss": 1.1307, + "step": 71806 + }, + { + "epoch": 0.897622440561014, + "grad_norm": 5.510862350463867, + "learning_rate": 6.317856556425772e-07, + "loss": 1.5097, + "step": 71808 + }, + { + "epoch": 0.8976474411860297, + "grad_norm": 5.9057440757751465, + "learning_rate": 6.31480417815158e-07, + "loss": 1.8854, + "step": 71810 + }, + { + "epoch": 0.8976724418110453, + "grad_norm": 1.7191575765609741, + "learning_rate": 6.311752513370573e-07, + "loss": 0.4965, + "step": 71812 + }, + { + "epoch": 0.8976974424360609, + "grad_norm": 1.7536779642105103, + "learning_rate": 6.308701562106024e-07, + "loss": 0.2028, + "step": 71814 + }, + { + "epoch": 0.8977224430610765, + "grad_norm": 11.674324035644531, + "learning_rate": 6.305651324381145e-07, + "loss": 1.9109, + "step": 71816 + }, + { + "epoch": 0.8977474436860922, + "grad_norm": 0.0014511981280520558, + "learning_rate": 6.302601800219188e-07, + "loss": 0.5825, + "step": 71818 + }, + { + "epoch": 0.8977724443111078, + "grad_norm": 0.8084474802017212, + "learning_rate": 6.299552989643354e-07, + "loss": 0.3035, + "step": 71820 + }, + { + "epoch": 0.8977974449361235, + "grad_norm": 4.247025966644287, + "learning_rate": 6.29650489267688e-07, + "loss": 0.5427, + "step": 71822 + }, + { + "epoch": 0.897822445561139, + "grad_norm": 0.0002928953617811203, + "learning_rate": 6.293457509342982e-07, + "loss": 0.3996, + "step": 71824 + }, + { + "epoch": 0.8978474461861546, + "grad_norm": 4.6886396408081055, + "learning_rate": 6.29041083966484e-07, + "loss": 1.1692, + "step": 71826 + }, + { + "epoch": 0.8978724468111703, + "grad_norm": 2.622281789779663, + "learning_rate": 6.287364883665703e-07, + "loss": 0.9045, + "step": 71828 + }, + { + "epoch": 0.8978974474361859, + "grad_norm": 6.261919021606445, + "learning_rate": 6.28431964136872e-07, + "loss": 0.7706, + "step": 71830 + }, + { + "epoch": 0.8979224480612016, + "grad_norm": 0.9072429537773132, + "learning_rate": 6.281275112797114e-07, + "loss": 0.5792, + "step": 71832 + }, + { + "epoch": 0.8979474486862171, + "grad_norm": 0.0003344865108374506, + "learning_rate": 6.278231297974058e-07, + "loss": 0.0247, + "step": 71834 + }, + { + "epoch": 0.8979724493112328, + "grad_norm": 2.540104866027832, + "learning_rate": 6.275188196922755e-07, + "loss": 0.7212, + "step": 71836 + }, + { + "epoch": 0.8979974499362484, + "grad_norm": 2.0856635570526123, + "learning_rate": 6.272145809666364e-07, + "loss": 0.6563, + "step": 71838 + }, + { + "epoch": 0.8980224505612641, + "grad_norm": 5.276228904724121, + "learning_rate": 6.269104136228033e-07, + "loss": 1.8579, + "step": 71840 + }, + { + "epoch": 0.8980474511862797, + "grad_norm": 0.0018388156313449144, + "learning_rate": 6.266063176630976e-07, + "loss": 0.765, + "step": 71842 + }, + { + "epoch": 0.8980724518112952, + "grad_norm": 6.304903507232666, + "learning_rate": 6.26302293089831e-07, + "loss": 1.4985, + "step": 71844 + }, + { + "epoch": 0.8980974524363109, + "grad_norm": 0.0005765830865129828, + "learning_rate": 6.259983399053215e-07, + "loss": 0.6952, + "step": 71846 + }, + { + "epoch": 0.8981224530613265, + "grad_norm": 0.0003264820552431047, + "learning_rate": 6.256944581118817e-07, + "loss": 0.0, + "step": 71848 + }, + { + "epoch": 0.8981474536863422, + "grad_norm": 1.7720823287963867, + "learning_rate": 6.253906477118287e-07, + "loss": 0.9009, + "step": 71850 + }, + { + "epoch": 0.8981724543113578, + "grad_norm": 3.7288174629211426, + "learning_rate": 6.250869087074752e-07, + "loss": 0.6433, + "step": 71852 + }, + { + "epoch": 0.8981974549363734, + "grad_norm": 5.071084976196289, + "learning_rate": 6.247832411011323e-07, + "loss": 1.4805, + "step": 71854 + }, + { + "epoch": 0.898222455561389, + "grad_norm": 1.9459809064865112, + "learning_rate": 6.244796448951162e-07, + "loss": 0.5606, + "step": 71856 + }, + { + "epoch": 0.8982474561864047, + "grad_norm": 0.0007893344736658037, + "learning_rate": 6.241761200917351e-07, + "loss": 0.4782, + "step": 71858 + }, + { + "epoch": 0.8982724568114203, + "grad_norm": 4.716616153717041, + "learning_rate": 6.238726666933049e-07, + "loss": 1.0372, + "step": 71860 + }, + { + "epoch": 0.898297457436436, + "grad_norm": 5.413111209869385, + "learning_rate": 6.235692847021346e-07, + "loss": 0.4621, + "step": 71862 + }, + { + "epoch": 0.8983224580614515, + "grad_norm": 0.6678018569946289, + "learning_rate": 6.232659741205338e-07, + "loss": 0.0464, + "step": 71864 + }, + { + "epoch": 0.8983474586864671, + "grad_norm": 3.7157177925109863, + "learning_rate": 6.229627349508138e-07, + "loss": 1.8236, + "step": 71866 + }, + { + "epoch": 0.8983724593114828, + "grad_norm": 2.7858588695526123, + "learning_rate": 6.226595671952829e-07, + "loss": 0.541, + "step": 71868 + }, + { + "epoch": 0.8983974599364984, + "grad_norm": 2.9586799144744873, + "learning_rate": 6.223564708562512e-07, + "loss": 1.2927, + "step": 71870 + }, + { + "epoch": 0.8984224605615141, + "grad_norm": 0.005424892995506525, + "learning_rate": 6.22053445936025e-07, + "loss": 0.4158, + "step": 71872 + }, + { + "epoch": 0.8984474611865296, + "grad_norm": 3.0433859825134277, + "learning_rate": 6.217504924369166e-07, + "loss": 0.8645, + "step": 71874 + }, + { + "epoch": 0.8984724618115453, + "grad_norm": 1.7556365728378296, + "learning_rate": 6.214476103612289e-07, + "loss": 0.7434, + "step": 71876 + }, + { + "epoch": 0.8984974624365609, + "grad_norm": 3.9197001457214355, + "learning_rate": 6.211447997112686e-07, + "loss": 0.9648, + "step": 71878 + }, + { + "epoch": 0.8985224630615766, + "grad_norm": 4.341955184936523, + "learning_rate": 6.208420604893451e-07, + "loss": 1.0525, + "step": 71880 + }, + { + "epoch": 0.8985474636865922, + "grad_norm": 2.6112372875213623, + "learning_rate": 6.205393926977599e-07, + "loss": 0.6325, + "step": 71882 + }, + { + "epoch": 0.8985724643116078, + "grad_norm": 3.5785329341888428, + "learning_rate": 6.202367963388223e-07, + "loss": 0.6845, + "step": 71884 + }, + { + "epoch": 0.8985974649366234, + "grad_norm": 4.668650150299072, + "learning_rate": 6.199342714148338e-07, + "loss": 1.2478, + "step": 71886 + }, + { + "epoch": 0.898622465561639, + "grad_norm": 2.678561210632324, + "learning_rate": 6.196318179281002e-07, + "loss": 0.7721, + "step": 71888 + }, + { + "epoch": 0.8986474661866547, + "grad_norm": 2.8207828998565674, + "learning_rate": 6.193294358809243e-07, + "loss": 1.0334, + "step": 71890 + }, + { + "epoch": 0.8986724668116703, + "grad_norm": 0.0041497377678751945, + "learning_rate": 6.190271252756086e-07, + "loss": 0.0002, + "step": 71892 + }, + { + "epoch": 0.8986974674366859, + "grad_norm": 3.0630486011505127, + "learning_rate": 6.187248861144557e-07, + "loss": 0.7827, + "step": 71894 + }, + { + "epoch": 0.8987224680617015, + "grad_norm": 0.0005196637939661741, + "learning_rate": 6.18422718399766e-07, + "loss": 0.0584, + "step": 71896 + }, + { + "epoch": 0.8987474686867172, + "grad_norm": 2.7110495567321777, + "learning_rate": 6.181206221338442e-07, + "loss": 1.5262, + "step": 71898 + }, + { + "epoch": 0.8987724693117328, + "grad_norm": 2.4090802669525146, + "learning_rate": 6.178185973189877e-07, + "loss": 0.2556, + "step": 71900 + }, + { + "epoch": 0.8987974699367485, + "grad_norm": 2.8640499114990234, + "learning_rate": 6.175166439574997e-07, + "loss": 0.9655, + "step": 71902 + }, + { + "epoch": 0.898822470561764, + "grad_norm": 1.5671055316925049, + "learning_rate": 6.172147620516778e-07, + "loss": 0.6323, + "step": 71904 + }, + { + "epoch": 0.8988474711867797, + "grad_norm": 3.6315793991088867, + "learning_rate": 6.169129516038197e-07, + "loss": 1.0196, + "step": 71906 + }, + { + "epoch": 0.8988724718117953, + "grad_norm": 0.40298178791999817, + "learning_rate": 6.166112126162271e-07, + "loss": 0.1328, + "step": 71908 + }, + { + "epoch": 0.898897472436811, + "grad_norm": 4.758366584777832, + "learning_rate": 6.163095450911949e-07, + "loss": 0.5053, + "step": 71910 + }, + { + "epoch": 0.8989224730618266, + "grad_norm": 2.9271607398986816, + "learning_rate": 6.160079490310222e-07, + "loss": 0.4275, + "step": 71912 + }, + { + "epoch": 0.8989474736868421, + "grad_norm": 6.923584461212158, + "learning_rate": 6.157064244380095e-07, + "loss": 1.6538, + "step": 71914 + }, + { + "epoch": 0.8989724743118578, + "grad_norm": 4.5937042236328125, + "learning_rate": 6.154049713144472e-07, + "loss": 1.8876, + "step": 71916 + }, + { + "epoch": 0.8989974749368734, + "grad_norm": 5.465989589691162, + "learning_rate": 6.151035896626334e-07, + "loss": 0.9143, + "step": 71918 + }, + { + "epoch": 0.8990224755618891, + "grad_norm": 1.999324917793274, + "learning_rate": 6.14802279484863e-07, + "loss": 0.6439, + "step": 71920 + }, + { + "epoch": 0.8990474761869047, + "grad_norm": 0.00019181908282916993, + "learning_rate": 6.145010407834329e-07, + "loss": 0.493, + "step": 71922 + }, + { + "epoch": 0.8990724768119203, + "grad_norm": 3.6652414798736572, + "learning_rate": 6.141998735606336e-07, + "loss": 1.1441, + "step": 71924 + }, + { + "epoch": 0.8990974774369359, + "grad_norm": 1.1021625995635986, + "learning_rate": 6.138987778187611e-07, + "loss": 0.4002, + "step": 71926 + }, + { + "epoch": 0.8991224780619516, + "grad_norm": 0.0004289224452804774, + "learning_rate": 6.135977535601112e-07, + "loss": 0.9437, + "step": 71928 + }, + { + "epoch": 0.8991474786869672, + "grad_norm": 0.002651934279128909, + "learning_rate": 6.132968007869711e-07, + "loss": 0.0021, + "step": 71930 + }, + { + "epoch": 0.8991724793119829, + "grad_norm": 4.170710563659668, + "learning_rate": 6.129959195016355e-07, + "loss": 1.1442, + "step": 71932 + }, + { + "epoch": 0.8991974799369984, + "grad_norm": 3.683555841445923, + "learning_rate": 6.126951097063949e-07, + "loss": 0.9702, + "step": 71934 + }, + { + "epoch": 0.899222480562014, + "grad_norm": 0.37861862778663635, + "learning_rate": 6.123943714035418e-07, + "loss": 0.7679, + "step": 71936 + }, + { + "epoch": 0.8992474811870297, + "grad_norm": 2.706211805343628, + "learning_rate": 6.120937045953646e-07, + "loss": 0.8433, + "step": 71938 + }, + { + "epoch": 0.8992724818120453, + "grad_norm": 3.7693216800689697, + "learning_rate": 6.117931092841545e-07, + "loss": 1.591, + "step": 71940 + }, + { + "epoch": 0.899297482437061, + "grad_norm": 0.8195602893829346, + "learning_rate": 6.11492585472202e-07, + "loss": 0.5362, + "step": 71942 + }, + { + "epoch": 0.8993224830620765, + "grad_norm": 2.142483711242676, + "learning_rate": 6.11192133161792e-07, + "loss": 0.6161, + "step": 71944 + }, + { + "epoch": 0.8993474836870922, + "grad_norm": 1.936488389968872, + "learning_rate": 6.108917523552171e-07, + "loss": 1.1889, + "step": 71946 + }, + { + "epoch": 0.8993724843121078, + "grad_norm": 5.240917205810547, + "learning_rate": 6.10591443054761e-07, + "loss": 1.424, + "step": 71948 + }, + { + "epoch": 0.8993974849371235, + "grad_norm": 0.9629038572311401, + "learning_rate": 6.102912052627119e-07, + "loss": 0.1707, + "step": 71950 + }, + { + "epoch": 0.8994224855621391, + "grad_norm": 4.0432915687561035, + "learning_rate": 6.099910389813602e-07, + "loss": 1.1257, + "step": 71952 + }, + { + "epoch": 0.8994474861871546, + "grad_norm": 3.2922632694244385, + "learning_rate": 6.096909442129872e-07, + "loss": 0.8135, + "step": 71954 + }, + { + "epoch": 0.8994724868121703, + "grad_norm": 0.00026201875880360603, + "learning_rate": 6.093909209598814e-07, + "loss": 0.654, + "step": 71956 + }, + { + "epoch": 0.8994974874371859, + "grad_norm": 1.9162898063659668, + "learning_rate": 6.090909692243241e-07, + "loss": 0.5386, + "step": 71958 + }, + { + "epoch": 0.8995224880622016, + "grad_norm": 3.347161293029785, + "learning_rate": 6.087910890086047e-07, + "loss": 1.0614, + "step": 71960 + }, + { + "epoch": 0.8995474886872172, + "grad_norm": 2.206325054168701, + "learning_rate": 6.084912803150012e-07, + "loss": 0.4461, + "step": 71962 + }, + { + "epoch": 0.8995724893122328, + "grad_norm": 3.694117784500122, + "learning_rate": 6.081915431458007e-07, + "loss": 1.1651, + "step": 71964 + }, + { + "epoch": 0.8995974899372484, + "grad_norm": 2.9109983444213867, + "learning_rate": 6.078918775032882e-07, + "loss": 0.4384, + "step": 71966 + }, + { + "epoch": 0.8996224905622641, + "grad_norm": 3.101691246032715, + "learning_rate": 6.075922833897408e-07, + "loss": 0.1636, + "step": 71968 + }, + { + "epoch": 0.8996474911872797, + "grad_norm": 2.2078330516815186, + "learning_rate": 6.07292760807443e-07, + "loss": 0.1106, + "step": 71970 + }, + { + "epoch": 0.8996724918122954, + "grad_norm": 1.691951870918274, + "learning_rate": 6.069933097586744e-07, + "loss": 0.061, + "step": 71972 + }, + { + "epoch": 0.8996974924373109, + "grad_norm": 5.872833728790283, + "learning_rate": 6.066939302457153e-07, + "loss": 1.1, + "step": 71974 + }, + { + "epoch": 0.8997224930623265, + "grad_norm": 1.9938395023345947, + "learning_rate": 6.063946222708495e-07, + "loss": 0.7029, + "step": 71976 + }, + { + "epoch": 0.8997474936873422, + "grad_norm": 3.765778064727783, + "learning_rate": 6.060953858363527e-07, + "loss": 1.053, + "step": 71978 + }, + { + "epoch": 0.8997724943123578, + "grad_norm": 10.146306991577148, + "learning_rate": 6.057962209445068e-07, + "loss": 1.5027, + "step": 71980 + }, + { + "epoch": 0.8997974949373735, + "grad_norm": 2.8007595539093018, + "learning_rate": 6.054971275975863e-07, + "loss": 0.627, + "step": 71982 + }, + { + "epoch": 0.899822495562389, + "grad_norm": 0.001085942261852324, + "learning_rate": 6.051981057978717e-07, + "loss": 0.0, + "step": 71984 + }, + { + "epoch": 0.8998474961874047, + "grad_norm": 2.3957388401031494, + "learning_rate": 6.048991555476391e-07, + "loss": 0.2138, + "step": 71986 + }, + { + "epoch": 0.8998724968124203, + "grad_norm": 2.213279962539673, + "learning_rate": 6.046002768491666e-07, + "loss": 1.9751, + "step": 71988 + }, + { + "epoch": 0.899897497437436, + "grad_norm": 3.764479398727417, + "learning_rate": 6.0430146970473e-07, + "loss": 0.9346, + "step": 71990 + }, + { + "epoch": 0.8999224980624516, + "grad_norm": 8.526823043823242, + "learning_rate": 6.040027341166054e-07, + "loss": 0.4851, + "step": 71992 + }, + { + "epoch": 0.8999474986874672, + "grad_norm": 2.941897392272949, + "learning_rate": 6.037040700870667e-07, + "loss": 1.2839, + "step": 71994 + }, + { + "epoch": 0.8999724993124828, + "grad_norm": 0.0002286435483256355, + "learning_rate": 6.034054776183885e-07, + "loss": 0.9213, + "step": 71996 + }, + { + "epoch": 0.8999974999374984, + "grad_norm": 3.6185946464538574, + "learning_rate": 6.031069567128456e-07, + "loss": 1.2994, + "step": 71998 + }, + { + "epoch": 0.9000225005625141, + "grad_norm": 4.856564998626709, + "learning_rate": 6.028085073727108e-07, + "loss": 0.3381, + "step": 72000 + }, + { + "epoch": 0.9000475011875297, + "grad_norm": 4.315091609954834, + "learning_rate": 6.025101296002566e-07, + "loss": 1.4197, + "step": 72002 + }, + { + "epoch": 0.9000725018125453, + "grad_norm": 0.00032854589517228305, + "learning_rate": 6.022118233977592e-07, + "loss": 0.6008, + "step": 72004 + }, + { + "epoch": 0.9000975024375609, + "grad_norm": 4.680701732635498, + "learning_rate": 6.019135887674866e-07, + "loss": 1.0963, + "step": 72006 + }, + { + "epoch": 0.9001225030625766, + "grad_norm": 3.471207857131958, + "learning_rate": 6.016154257117113e-07, + "loss": 1.4351, + "step": 72008 + }, + { + "epoch": 0.9001475036875922, + "grad_norm": 3.399137020111084, + "learning_rate": 6.013173342327028e-07, + "loss": 1.8387, + "step": 72010 + }, + { + "epoch": 0.9001725043126079, + "grad_norm": 8.153467178344727, + "learning_rate": 6.010193143327314e-07, + "loss": 1.5111, + "step": 72012 + }, + { + "epoch": 0.9001975049376234, + "grad_norm": 0.5313218832015991, + "learning_rate": 6.007213660140699e-07, + "loss": 0.2508, + "step": 72014 + }, + { + "epoch": 0.900222505562639, + "grad_norm": 5.843019008636475, + "learning_rate": 6.004234892789839e-07, + "loss": 2.1623, + "step": 72016 + }, + { + "epoch": 0.9002475061876547, + "grad_norm": 1.548524260520935, + "learning_rate": 6.001256841297465e-07, + "loss": 0.5736, + "step": 72018 + }, + { + "epoch": 0.9002725068126703, + "grad_norm": 0.5128567218780518, + "learning_rate": 5.998279505686188e-07, + "loss": 0.0171, + "step": 72020 + }, + { + "epoch": 0.900297507437686, + "grad_norm": 0.061258383095264435, + "learning_rate": 5.995302885978738e-07, + "loss": 0.2422, + "step": 72022 + }, + { + "epoch": 0.9003225080627015, + "grad_norm": 3.6405999660491943, + "learning_rate": 5.992326982197761e-07, + "loss": 1.5265, + "step": 72024 + }, + { + "epoch": 0.9003475086877172, + "grad_norm": 5.4765400886535645, + "learning_rate": 5.989351794365928e-07, + "loss": 1.5503, + "step": 72026 + }, + { + "epoch": 0.9003725093127328, + "grad_norm": 6.700597286224365, + "learning_rate": 5.986377322505899e-07, + "loss": 1.3122, + "step": 72028 + }, + { + "epoch": 0.9003975099377485, + "grad_norm": 4.110498905181885, + "learning_rate": 5.983403566640322e-07, + "loss": 1.3367, + "step": 72030 + }, + { + "epoch": 0.9004225105627641, + "grad_norm": 4.321615219116211, + "learning_rate": 5.98043052679188e-07, + "loss": 1.253, + "step": 72032 + }, + { + "epoch": 0.9004475111877797, + "grad_norm": 5.8758039474487305, + "learning_rate": 5.977458202983144e-07, + "loss": 1.4605, + "step": 72034 + }, + { + "epoch": 0.9004725118127953, + "grad_norm": 1.851819634437561, + "learning_rate": 5.974486595236795e-07, + "loss": 1.0121, + "step": 72036 + }, + { + "epoch": 0.900497512437811, + "grad_norm": 0.0005486284499056637, + "learning_rate": 5.97151570357547e-07, + "loss": 1.3758, + "step": 72038 + }, + { + "epoch": 0.9005225130628266, + "grad_norm": 0.10402079671621323, + "learning_rate": 5.968545528021785e-07, + "loss": 0.0522, + "step": 72040 + }, + { + "epoch": 0.9005475136878422, + "grad_norm": 0.00033982592867687345, + "learning_rate": 5.965576068598355e-07, + "loss": 0.0008, + "step": 72042 + }, + { + "epoch": 0.9005725143128578, + "grad_norm": 0.00034065506770275533, + "learning_rate": 5.962607325327818e-07, + "loss": 0.6693, + "step": 72044 + }, + { + "epoch": 0.9005975149378734, + "grad_norm": 0.0002766150573734194, + "learning_rate": 5.959639298232756e-07, + "loss": 0.0, + "step": 72046 + }, + { + "epoch": 0.9006225155628891, + "grad_norm": 4.202404975891113, + "learning_rate": 5.956671987335771e-07, + "loss": 0.8867, + "step": 72048 + }, + { + "epoch": 0.9006475161879047, + "grad_norm": 7.364485740661621, + "learning_rate": 5.95370539265947e-07, + "loss": 1.8151, + "step": 72050 + }, + { + "epoch": 0.9006725168129204, + "grad_norm": 2.4304935932159424, + "learning_rate": 5.950739514226456e-07, + "loss": 0.298, + "step": 72052 + }, + { + "epoch": 0.9006975174379359, + "grad_norm": 3.5303115844726562, + "learning_rate": 5.94777435205931e-07, + "loss": 0.9485, + "step": 72054 + }, + { + "epoch": 0.9007225180629516, + "grad_norm": 2.48781156539917, + "learning_rate": 5.944809906180626e-07, + "loss": 0.5891, + "step": 72056 + }, + { + "epoch": 0.9007475186879672, + "grad_norm": 2.760970115661621, + "learning_rate": 5.941846176612965e-07, + "loss": 1.3378, + "step": 72058 + }, + { + "epoch": 0.9007725193129829, + "grad_norm": 5.464852333068848, + "learning_rate": 5.938883163378883e-07, + "loss": 0.6028, + "step": 72060 + }, + { + "epoch": 0.9007975199379985, + "grad_norm": 3.549137592315674, + "learning_rate": 5.935920866500989e-07, + "loss": 0.9421, + "step": 72062 + }, + { + "epoch": 0.900822520563014, + "grad_norm": 0.3881415128707886, + "learning_rate": 5.932959286001805e-07, + "loss": 0.6658, + "step": 72064 + }, + { + "epoch": 0.9008475211880297, + "grad_norm": 4.268006324768066, + "learning_rate": 5.929998421903916e-07, + "loss": 1.2911, + "step": 72066 + }, + { + "epoch": 0.9008725218130453, + "grad_norm": 5.709308624267578, + "learning_rate": 5.927038274229834e-07, + "loss": 1.7277, + "step": 72068 + }, + { + "epoch": 0.900897522438061, + "grad_norm": 6.0485758781433105, + "learning_rate": 5.924078843002168e-07, + "loss": 1.8909, + "step": 72070 + }, + { + "epoch": 0.9009225230630766, + "grad_norm": 1.2764625549316406, + "learning_rate": 5.921120128243374e-07, + "loss": 0.5456, + "step": 72072 + }, + { + "epoch": 0.9009475236880922, + "grad_norm": 0.009255905635654926, + "learning_rate": 5.918162129976024e-07, + "loss": 0.0291, + "step": 72074 + }, + { + "epoch": 0.9009725243131078, + "grad_norm": 4.708743572235107, + "learning_rate": 5.915204848222666e-07, + "loss": 1.1421, + "step": 72076 + }, + { + "epoch": 0.9009975249381235, + "grad_norm": 4.282244682312012, + "learning_rate": 5.912248283005783e-07, + "loss": 0.9974, + "step": 72078 + }, + { + "epoch": 0.9010225255631391, + "grad_norm": 4.2786946296691895, + "learning_rate": 5.909292434347936e-07, + "loss": 0.7697, + "step": 72080 + }, + { + "epoch": 0.9010475261881548, + "grad_norm": 0.0003771613701246679, + "learning_rate": 5.906337302271592e-07, + "loss": 0.8441, + "step": 72082 + }, + { + "epoch": 0.9010725268131703, + "grad_norm": 3.3355259895324707, + "learning_rate": 5.903382886799302e-07, + "loss": 0.5816, + "step": 72084 + }, + { + "epoch": 0.9010975274381859, + "grad_norm": 8.571127891540527, + "learning_rate": 5.900429187953516e-07, + "loss": 0.8154, + "step": 72086 + }, + { + "epoch": 0.9011225280632016, + "grad_norm": 4.514206886291504, + "learning_rate": 5.897476205756758e-07, + "loss": 0.7838, + "step": 72088 + }, + { + "epoch": 0.9011475286882172, + "grad_norm": 1.5471365451812744, + "learning_rate": 5.894523940231522e-07, + "loss": 1.1967, + "step": 72090 + }, + { + "epoch": 0.9011725293132329, + "grad_norm": 7.154496669769287, + "learning_rate": 5.89157239140028e-07, + "loss": 1.2285, + "step": 72092 + }, + { + "epoch": 0.9011975299382484, + "grad_norm": 1.6179451942443848, + "learning_rate": 5.888621559285524e-07, + "loss": 0.1248, + "step": 72094 + }, + { + "epoch": 0.9012225305632641, + "grad_norm": 2.876025676727295, + "learning_rate": 5.885671443909702e-07, + "loss": 0.7111, + "step": 72096 + }, + { + "epoch": 0.9012475311882797, + "grad_norm": 4.039312839508057, + "learning_rate": 5.882722045295308e-07, + "loss": 0.8446, + "step": 72098 + }, + { + "epoch": 0.9012725318132954, + "grad_norm": 0.0003752603370230645, + "learning_rate": 5.879773363464791e-07, + "loss": 0.2218, + "step": 72100 + }, + { + "epoch": 0.901297532438311, + "grad_norm": 0.6884267926216125, + "learning_rate": 5.876825398440611e-07, + "loss": 0.6381, + "step": 72102 + }, + { + "epoch": 0.9013225330633265, + "grad_norm": 2.184974193572998, + "learning_rate": 5.873878150245215e-07, + "loss": 0.8123, + "step": 72104 + }, + { + "epoch": 0.9013475336883422, + "grad_norm": 3.725306987762451, + "learning_rate": 5.870931618901054e-07, + "loss": 0.4676, + "step": 72106 + }, + { + "epoch": 0.9013725343133578, + "grad_norm": 0.4117679297924042, + "learning_rate": 5.867985804430564e-07, + "loss": 0.7215, + "step": 72108 + }, + { + "epoch": 0.9013975349383735, + "grad_norm": 3.6649649143218994, + "learning_rate": 5.865040706856196e-07, + "loss": 1.844, + "step": 72110 + }, + { + "epoch": 0.9014225355633891, + "grad_norm": 4.760343551635742, + "learning_rate": 5.862096326200351e-07, + "loss": 1.2412, + "step": 72112 + }, + { + "epoch": 0.9014475361884047, + "grad_norm": 1.6103452444076538, + "learning_rate": 5.859152662485479e-07, + "loss": 0.4016, + "step": 72114 + }, + { + "epoch": 0.9014725368134203, + "grad_norm": 4.669138431549072, + "learning_rate": 5.856209715733974e-07, + "loss": 1.0603, + "step": 72116 + }, + { + "epoch": 0.901497537438436, + "grad_norm": 3.624856472015381, + "learning_rate": 5.853267485968272e-07, + "loss": 0.791, + "step": 72118 + }, + { + "epoch": 0.9015225380634516, + "grad_norm": 5.170909881591797, + "learning_rate": 5.850325973210768e-07, + "loss": 1.8129, + "step": 72120 + }, + { + "epoch": 0.9015475386884673, + "grad_norm": 5.4355387687683105, + "learning_rate": 5.847385177483878e-07, + "loss": 0.8333, + "step": 72122 + }, + { + "epoch": 0.9015725393134828, + "grad_norm": 3.273129940032959, + "learning_rate": 5.844445098809981e-07, + "loss": 1.2652, + "step": 72124 + }, + { + "epoch": 0.9015975399384984, + "grad_norm": 4.349183559417725, + "learning_rate": 5.841505737211462e-07, + "loss": 1.2782, + "step": 72126 + }, + { + "epoch": 0.9016225405635141, + "grad_norm": 5.52272367477417, + "learning_rate": 5.838567092710745e-07, + "loss": 1.3757, + "step": 72128 + }, + { + "epoch": 0.9016475411885297, + "grad_norm": 3.5506861209869385, + "learning_rate": 5.835629165330159e-07, + "loss": 1.9034, + "step": 72130 + }, + { + "epoch": 0.9016725418135454, + "grad_norm": 4.050374507904053, + "learning_rate": 5.832691955092129e-07, + "loss": 1.6835, + "step": 72132 + }, + { + "epoch": 0.9016975424385609, + "grad_norm": 0.6143119931221008, + "learning_rate": 5.829755462018982e-07, + "loss": 0.0092, + "step": 72134 + }, + { + "epoch": 0.9017225430635766, + "grad_norm": 5.933391094207764, + "learning_rate": 5.826819686133112e-07, + "loss": 0.7555, + "step": 72136 + }, + { + "epoch": 0.9017475436885922, + "grad_norm": 0.0002044949505943805, + "learning_rate": 5.823884627456866e-07, + "loss": 0.0099, + "step": 72138 + }, + { + "epoch": 0.9017725443136079, + "grad_norm": 3.1221227645874023, + "learning_rate": 5.820950286012584e-07, + "loss": 0.5413, + "step": 72140 + }, + { + "epoch": 0.9017975449386235, + "grad_norm": 3.867594003677368, + "learning_rate": 5.818016661822645e-07, + "loss": 0.5098, + "step": 72142 + }, + { + "epoch": 0.901822545563639, + "grad_norm": 3.91062068939209, + "learning_rate": 5.815083754909356e-07, + "loss": 0.8016, + "step": 72144 + }, + { + "epoch": 0.9018475461886547, + "grad_norm": 3.2734107971191406, + "learning_rate": 5.812151565295087e-07, + "loss": 1.1085, + "step": 72146 + }, + { + "epoch": 0.9018725468136704, + "grad_norm": 2.774885654449463, + "learning_rate": 5.809220093002133e-07, + "loss": 0.7821, + "step": 72148 + }, + { + "epoch": 0.901897547438686, + "grad_norm": 0.27632734179496765, + "learning_rate": 5.80628933805285e-07, + "loss": 1.0643, + "step": 72150 + }, + { + "epoch": 0.9019225480637016, + "grad_norm": 5.203001976013184, + "learning_rate": 5.803359300469558e-07, + "loss": 1.3773, + "step": 72152 + }, + { + "epoch": 0.9019475486887172, + "grad_norm": 5.310014724731445, + "learning_rate": 5.800429980274536e-07, + "loss": 1.8664, + "step": 72154 + }, + { + "epoch": 0.9019725493137328, + "grad_norm": 1.900346279144287, + "learning_rate": 5.797501377490144e-07, + "loss": 1.4602, + "step": 72156 + }, + { + "epoch": 0.9019975499387485, + "grad_norm": 16.991411209106445, + "learning_rate": 5.794573492138633e-07, + "loss": 1.6146, + "step": 72158 + }, + { + "epoch": 0.9020225505637641, + "grad_norm": 5.540290355682373, + "learning_rate": 5.79164632424235e-07, + "loss": 1.5876, + "step": 72160 + }, + { + "epoch": 0.9020475511887798, + "grad_norm": 1.5649940967559814, + "learning_rate": 5.788719873823567e-07, + "loss": 0.4969, + "step": 72162 + }, + { + "epoch": 0.9020725518137953, + "grad_norm": 2.842665910720825, + "learning_rate": 5.785794140904555e-07, + "loss": 1.2047, + "step": 72164 + }, + { + "epoch": 0.902097552438811, + "grad_norm": 0.00038268149364739656, + "learning_rate": 5.782869125507628e-07, + "loss": 0.7759, + "step": 72166 + }, + { + "epoch": 0.9021225530638266, + "grad_norm": 2.201012134552002, + "learning_rate": 5.779944827655026e-07, + "loss": 0.1055, + "step": 72168 + }, + { + "epoch": 0.9021475536888423, + "grad_norm": 3.2844138145446777, + "learning_rate": 5.777021247369063e-07, + "loss": 0.8075, + "step": 72170 + }, + { + "epoch": 0.9021725543138579, + "grad_norm": 0.9012537002563477, + "learning_rate": 5.774098384671956e-07, + "loss": 0.0405, + "step": 72172 + }, + { + "epoch": 0.9021975549388734, + "grad_norm": 4.043436527252197, + "learning_rate": 5.771176239586018e-07, + "loss": 1.4114, + "step": 72174 + }, + { + "epoch": 0.9022225555638891, + "grad_norm": 2.7967021465301514, + "learning_rate": 5.768254812133467e-07, + "loss": 0.521, + "step": 72176 + }, + { + "epoch": 0.9022475561889047, + "grad_norm": 0.6587178707122803, + "learning_rate": 5.765334102336551e-07, + "loss": 1.1082, + "step": 72178 + }, + { + "epoch": 0.9022725568139204, + "grad_norm": 7.395159721374512, + "learning_rate": 5.76241411021754e-07, + "loss": 0.8128, + "step": 72180 + }, + { + "epoch": 0.902297557438936, + "grad_norm": 3.4757304191589355, + "learning_rate": 5.75949483579864e-07, + "loss": 1.0062, + "step": 72182 + }, + { + "epoch": 0.9023225580639516, + "grad_norm": 3.4065451622009277, + "learning_rate": 5.756576279102122e-07, + "loss": 0.8591, + "step": 72184 + }, + { + "epoch": 0.9023475586889672, + "grad_norm": 0.000530602817889303, + "learning_rate": 5.753658440150167e-07, + "loss": 0.2272, + "step": 72186 + }, + { + "epoch": 0.9023725593139829, + "grad_norm": 2.001577377319336, + "learning_rate": 5.750741318965047e-07, + "loss": 1.4759, + "step": 72188 + }, + { + "epoch": 0.9023975599389985, + "grad_norm": 3.696012258529663, + "learning_rate": 5.747824915568945e-07, + "loss": 0.1855, + "step": 72190 + }, + { + "epoch": 0.9024225605640142, + "grad_norm": 5.712766647338867, + "learning_rate": 5.744909229984064e-07, + "loss": 0.6136, + "step": 72192 + }, + { + "epoch": 0.9024475611890297, + "grad_norm": 11.763415336608887, + "learning_rate": 5.741994262232642e-07, + "loss": 1.2262, + "step": 72194 + }, + { + "epoch": 0.9024725618140453, + "grad_norm": 4.845830917358398, + "learning_rate": 5.739080012336851e-07, + "loss": 1.5067, + "step": 72196 + }, + { + "epoch": 0.902497562439061, + "grad_norm": 4.311565399169922, + "learning_rate": 5.736166480318905e-07, + "loss": 1.1304, + "step": 72198 + }, + { + "epoch": 0.9025225630640766, + "grad_norm": 1.1328232288360596, + "learning_rate": 5.733253666200978e-07, + "loss": 0.7648, + "step": 72200 + }, + { + "epoch": 0.9025475636890923, + "grad_norm": 3.9323408603668213, + "learning_rate": 5.730341570005272e-07, + "loss": 0.8952, + "step": 72202 + }, + { + "epoch": 0.9025725643141078, + "grad_norm": 0.8451297879219055, + "learning_rate": 5.727430191753958e-07, + "loss": 0.888, + "step": 72204 + }, + { + "epoch": 0.9025975649391235, + "grad_norm": 0.98016357421875, + "learning_rate": 5.724519531469186e-07, + "loss": 0.618, + "step": 72206 + }, + { + "epoch": 0.9026225655641391, + "grad_norm": 0.0003894023539032787, + "learning_rate": 5.721609589173149e-07, + "loss": 0.0, + "step": 72208 + }, + { + "epoch": 0.9026475661891548, + "grad_norm": 0.0003112588601652533, + "learning_rate": 5.718700364887997e-07, + "loss": 0.35, + "step": 72210 + }, + { + "epoch": 0.9026725668141704, + "grad_norm": 4.556100845336914, + "learning_rate": 5.7157918586359e-07, + "loss": 2.0424, + "step": 72212 + }, + { + "epoch": 0.9026975674391859, + "grad_norm": 3.5363550186157227, + "learning_rate": 5.712884070439007e-07, + "loss": 1.0932, + "step": 72214 + }, + { + "epoch": 0.9027225680642016, + "grad_norm": 0.8080353140830994, + "learning_rate": 5.709977000319433e-07, + "loss": 0.0277, + "step": 72216 + }, + { + "epoch": 0.9027475686892172, + "grad_norm": 0.000646403175778687, + "learning_rate": 5.707070648299351e-07, + "loss": 0.4907, + "step": 72218 + }, + { + "epoch": 0.9027725693142329, + "grad_norm": 0.0015587100060656667, + "learning_rate": 5.704165014400875e-07, + "loss": 0.1016, + "step": 72220 + }, + { + "epoch": 0.9027975699392485, + "grad_norm": 3.2567856311798096, + "learning_rate": 5.701260098646167e-07, + "loss": 0.8674, + "step": 72222 + }, + { + "epoch": 0.9028225705642641, + "grad_norm": 2.9883365631103516, + "learning_rate": 5.698355901057296e-07, + "loss": 0.528, + "step": 72224 + }, + { + "epoch": 0.9028475711892797, + "grad_norm": 4.349588871002197, + "learning_rate": 5.695452421656433e-07, + "loss": 1.1918, + "step": 72226 + }, + { + "epoch": 0.9028725718142954, + "grad_norm": 0.08617499470710754, + "learning_rate": 5.692549660465673e-07, + "loss": 0.0117, + "step": 72228 + }, + { + "epoch": 0.902897572439311, + "grad_norm": 3.7555625438690186, + "learning_rate": 5.689647617507099e-07, + "loss": 0.7389, + "step": 72230 + }, + { + "epoch": 0.9029225730643267, + "grad_norm": 6.168188095092773, + "learning_rate": 5.686746292802847e-07, + "loss": 1.5178, + "step": 72232 + }, + { + "epoch": 0.9029475736893422, + "grad_norm": 5.082901477813721, + "learning_rate": 5.683845686374989e-07, + "loss": 1.0161, + "step": 72234 + }, + { + "epoch": 0.9029725743143578, + "grad_norm": 3.698223829269409, + "learning_rate": 5.68094579824563e-07, + "loss": 1.1061, + "step": 72236 + }, + { + "epoch": 0.9029975749393735, + "grad_norm": 0.010762765072286129, + "learning_rate": 5.67804662843684e-07, + "loss": 0.3929, + "step": 72238 + }, + { + "epoch": 0.9030225755643891, + "grad_norm": 4.202150821685791, + "learning_rate": 5.675148176970724e-07, + "loss": 0.998, + "step": 72240 + }, + { + "epoch": 0.9030475761894048, + "grad_norm": 3.9235711097717285, + "learning_rate": 5.672250443869342e-07, + "loss": 0.2527, + "step": 72242 + }, + { + "epoch": 0.9030725768144203, + "grad_norm": 2.6561179161071777, + "learning_rate": 5.669353429154745e-07, + "loss": 0.4341, + "step": 72244 + }, + { + "epoch": 0.903097577439436, + "grad_norm": 5.644532203674316, + "learning_rate": 5.666457132849024e-07, + "loss": 0.3606, + "step": 72246 + }, + { + "epoch": 0.9031225780644516, + "grad_norm": 6.386544704437256, + "learning_rate": 5.663561554974218e-07, + "loss": 0.5036, + "step": 72248 + }, + { + "epoch": 0.9031475786894673, + "grad_norm": 5.9844231605529785, + "learning_rate": 5.660666695552397e-07, + "loss": 1.1076, + "step": 72250 + }, + { + "epoch": 0.9031725793144829, + "grad_norm": 3.5791542530059814, + "learning_rate": 5.657772554605589e-07, + "loss": 0.6669, + "step": 72252 + }, + { + "epoch": 0.9031975799394985, + "grad_norm": 4.680492877960205, + "learning_rate": 5.654879132155844e-07, + "loss": 0.8854, + "step": 72254 + }, + { + "epoch": 0.9032225805645141, + "grad_norm": 4.3098578453063965, + "learning_rate": 5.651986428225209e-07, + "loss": 1.2662, + "step": 72256 + }, + { + "epoch": 0.9032475811895297, + "grad_norm": 3.375857353210449, + "learning_rate": 5.64909444283569e-07, + "loss": 1.1344, + "step": 72258 + }, + { + "epoch": 0.9032725818145454, + "grad_norm": 0.0005141962319612503, + "learning_rate": 5.646203176009335e-07, + "loss": 0.7102, + "step": 72260 + }, + { + "epoch": 0.903297582439561, + "grad_norm": 5.519587516784668, + "learning_rate": 5.643312627768139e-07, + "loss": 1.0259, + "step": 72262 + }, + { + "epoch": 0.9033225830645766, + "grad_norm": 2.7938270568847656, + "learning_rate": 5.640422798134148e-07, + "loss": 0.7984, + "step": 72264 + }, + { + "epoch": 0.9033475836895922, + "grad_norm": 3.5081329345703125, + "learning_rate": 5.637533687129359e-07, + "loss": 0.6868, + "step": 72266 + }, + { + "epoch": 0.9033725843146079, + "grad_norm": 7.238347053527832, + "learning_rate": 5.634645294775743e-07, + "loss": 0.4172, + "step": 72268 + }, + { + "epoch": 0.9033975849396235, + "grad_norm": 2.791252613067627, + "learning_rate": 5.631757621095346e-07, + "loss": 0.9465, + "step": 72270 + }, + { + "epoch": 0.9034225855646392, + "grad_norm": 2.206165313720703, + "learning_rate": 5.62887066611012e-07, + "loss": 0.8766, + "step": 72272 + }, + { + "epoch": 0.9034475861896547, + "grad_norm": 3.825728178024292, + "learning_rate": 5.625984429842091e-07, + "loss": 0.902, + "step": 72274 + }, + { + "epoch": 0.9034725868146704, + "grad_norm": 0.0008427784778177738, + "learning_rate": 5.623098912313196e-07, + "loss": 0.5103, + "step": 72276 + }, + { + "epoch": 0.903497587439686, + "grad_norm": 2.800973653793335, + "learning_rate": 5.620214113545453e-07, + "loss": 0.7157, + "step": 72278 + }, + { + "epoch": 0.9035225880647016, + "grad_norm": 6.474029064178467, + "learning_rate": 5.617330033560808e-07, + "loss": 0.7101, + "step": 72280 + }, + { + "epoch": 0.9035475886897173, + "grad_norm": 1.1278630495071411, + "learning_rate": 5.614446672381224e-07, + "loss": 1.2955, + "step": 72282 + }, + { + "epoch": 0.9035725893147328, + "grad_norm": 2.122697353363037, + "learning_rate": 5.611564030028672e-07, + "loss": 0.0927, + "step": 72284 + }, + { + "epoch": 0.9035975899397485, + "grad_norm": 0.727997899055481, + "learning_rate": 5.608682106525087e-07, + "loss": 0.8806, + "step": 72286 + }, + { + "epoch": 0.9036225905647641, + "grad_norm": 1.23879075050354, + "learning_rate": 5.605800901892455e-07, + "loss": 0.0904, + "step": 72288 + }, + { + "epoch": 0.9036475911897798, + "grad_norm": 3.6898884773254395, + "learning_rate": 5.602920416152668e-07, + "loss": 0.8164, + "step": 72290 + }, + { + "epoch": 0.9036725918147954, + "grad_norm": 3.0170063972473145, + "learning_rate": 5.60004064932771e-07, + "loss": 0.9595, + "step": 72292 + }, + { + "epoch": 0.903697592439811, + "grad_norm": 8.072426795959473, + "learning_rate": 5.597161601439493e-07, + "loss": 1.0126, + "step": 72294 + }, + { + "epoch": 0.9037225930648266, + "grad_norm": 4.094122409820557, + "learning_rate": 5.594283272509937e-07, + "loss": 1.7941, + "step": 72296 + }, + { + "epoch": 0.9037475936898423, + "grad_norm": 1.1376464366912842, + "learning_rate": 5.591405662560978e-07, + "loss": 0.4108, + "step": 72298 + }, + { + "epoch": 0.9037725943148579, + "grad_norm": 5.534291744232178, + "learning_rate": 5.588528771614499e-07, + "loss": 2.1181, + "step": 72300 + }, + { + "epoch": 0.9037975949398735, + "grad_norm": 2.37958025932312, + "learning_rate": 5.585652599692459e-07, + "loss": 1.2662, + "step": 72302 + }, + { + "epoch": 0.9038225955648891, + "grad_norm": 3.291578769683838, + "learning_rate": 5.582777146816721e-07, + "loss": 0.8271, + "step": 72304 + }, + { + "epoch": 0.9038475961899047, + "grad_norm": 3.417327880859375, + "learning_rate": 5.57990241300922e-07, + "loss": 0.2065, + "step": 72306 + }, + { + "epoch": 0.9038725968149204, + "grad_norm": 2.001373052597046, + "learning_rate": 5.577028398291817e-07, + "loss": 0.876, + "step": 72308 + }, + { + "epoch": 0.903897597439936, + "grad_norm": 2.420586109161377, + "learning_rate": 5.574155102686407e-07, + "loss": 1.0444, + "step": 72310 + }, + { + "epoch": 0.9039225980649517, + "grad_norm": 2.2152981758117676, + "learning_rate": 5.571282526214894e-07, + "loss": 0.8862, + "step": 72312 + }, + { + "epoch": 0.9039475986899672, + "grad_norm": 0.00043876731069758534, + "learning_rate": 5.568410668899127e-07, + "loss": 0.3705, + "step": 72314 + }, + { + "epoch": 0.9039725993149829, + "grad_norm": 3.7089927196502686, + "learning_rate": 5.565539530760999e-07, + "loss": 1.0559, + "step": 72316 + }, + { + "epoch": 0.9039975999399985, + "grad_norm": 4.914072513580322, + "learning_rate": 5.562669111822372e-07, + "loss": 0.2764, + "step": 72318 + }, + { + "epoch": 0.9040226005650142, + "grad_norm": 2.749135732650757, + "learning_rate": 5.559799412105083e-07, + "loss": 0.8739, + "step": 72320 + }, + { + "epoch": 0.9040476011900298, + "grad_norm": 6.734626770019531, + "learning_rate": 5.556930431631024e-07, + "loss": 0.8185, + "step": 72322 + }, + { + "epoch": 0.9040726018150453, + "grad_norm": 3.7078161239624023, + "learning_rate": 5.554062170422014e-07, + "loss": 0.8748, + "step": 72324 + }, + { + "epoch": 0.904097602440061, + "grad_norm": 2.060753107070923, + "learning_rate": 5.551194628499923e-07, + "loss": 1.212, + "step": 72326 + }, + { + "epoch": 0.9041226030650766, + "grad_norm": 3.4211063385009766, + "learning_rate": 5.548327805886566e-07, + "loss": 1.3728, + "step": 72328 + }, + { + "epoch": 0.9041476036900923, + "grad_norm": 2.792855739593506, + "learning_rate": 5.545461702603783e-07, + "loss": 1.0224, + "step": 72330 + }, + { + "epoch": 0.9041726043151079, + "grad_norm": 2.6035385131835938, + "learning_rate": 5.542596318673443e-07, + "loss": 1.3561, + "step": 72332 + }, + { + "epoch": 0.9041976049401235, + "grad_norm": 5.227026462554932, + "learning_rate": 5.539731654117298e-07, + "loss": 0.7955, + "step": 72334 + }, + { + "epoch": 0.9042226055651391, + "grad_norm": 0.8870852589607239, + "learning_rate": 5.536867708957216e-07, + "loss": 0.7552, + "step": 72336 + }, + { + "epoch": 0.9042476061901548, + "grad_norm": 3.2308895587921143, + "learning_rate": 5.534004483214972e-07, + "loss": 0.7064, + "step": 72338 + }, + { + "epoch": 0.9042726068151704, + "grad_norm": 5.918200492858887, + "learning_rate": 5.531141976912413e-07, + "loss": 1.0774, + "step": 72340 + }, + { + "epoch": 0.9042976074401861, + "grad_norm": 1.7138910293579102, + "learning_rate": 5.528280190071289e-07, + "loss": 0.8292, + "step": 72342 + }, + { + "epoch": 0.9043226080652016, + "grad_norm": 2.4262664318084717, + "learning_rate": 5.525419122713438e-07, + "loss": 1.254, + "step": 72344 + }, + { + "epoch": 0.9043476086902172, + "grad_norm": 2.660557508468628, + "learning_rate": 5.522558774860665e-07, + "loss": 0.825, + "step": 72346 + }, + { + "epoch": 0.9043726093152329, + "grad_norm": 2.220766067504883, + "learning_rate": 5.519699146534685e-07, + "loss": 0.3971, + "step": 72348 + }, + { + "epoch": 0.9043976099402485, + "grad_norm": 2.596439838409424, + "learning_rate": 5.516840237757337e-07, + "loss": 0.653, + "step": 72350 + }, + { + "epoch": 0.9044226105652642, + "grad_norm": 5.526555061340332, + "learning_rate": 5.513982048550348e-07, + "loss": 1.5255, + "step": 72352 + }, + { + "epoch": 0.9044476111902797, + "grad_norm": 1.9236090183258057, + "learning_rate": 5.511124578935523e-07, + "loss": 0.2376, + "step": 72354 + }, + { + "epoch": 0.9044726118152954, + "grad_norm": 2.2168993949890137, + "learning_rate": 5.508267828934621e-07, + "loss": 0.6832, + "step": 72356 + }, + { + "epoch": 0.904497612440311, + "grad_norm": 0.6558440923690796, + "learning_rate": 5.505411798569382e-07, + "loss": 0.6328, + "step": 72358 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 5.494024276733398, + "learning_rate": 5.502556487861576e-07, + "loss": 1.2913, + "step": 72360 + }, + { + "epoch": 0.9045476136903423, + "grad_norm": 2.35311222076416, + "learning_rate": 5.49970189683291e-07, + "loss": 0.0517, + "step": 72362 + }, + { + "epoch": 0.9045726143153578, + "grad_norm": 3.1985106468200684, + "learning_rate": 5.496848025505175e-07, + "loss": 1.0843, + "step": 72364 + }, + { + "epoch": 0.9045976149403735, + "grad_norm": 3.438495635986328, + "learning_rate": 5.493994873900066e-07, + "loss": 0.9731, + "step": 72366 + }, + { + "epoch": 0.9046226155653891, + "grad_norm": 3.693570137023926, + "learning_rate": 5.491142442039321e-07, + "loss": 0.987, + "step": 72368 + }, + { + "epoch": 0.9046476161904048, + "grad_norm": 2.1576006412506104, + "learning_rate": 5.488290729944689e-07, + "loss": 0.3713, + "step": 72370 + }, + { + "epoch": 0.9046726168154204, + "grad_norm": 1.7394355535507202, + "learning_rate": 5.485439737637877e-07, + "loss": 0.4659, + "step": 72372 + }, + { + "epoch": 0.904697617440436, + "grad_norm": 3.439647674560547, + "learning_rate": 5.482589465140576e-07, + "loss": 1.3794, + "step": 72374 + }, + { + "epoch": 0.9047226180654516, + "grad_norm": 1.3904900550842285, + "learning_rate": 5.479739912474502e-07, + "loss": 0.3387, + "step": 72376 + }, + { + "epoch": 0.9047476186904673, + "grad_norm": 0.00040183772216551006, + "learning_rate": 5.476891079661362e-07, + "loss": 1.185, + "step": 72378 + }, + { + "epoch": 0.9047726193154829, + "grad_norm": 2.5381431579589844, + "learning_rate": 5.47404296672287e-07, + "loss": 1.0569, + "step": 72380 + }, + { + "epoch": 0.9047976199404986, + "grad_norm": 5.519543170928955, + "learning_rate": 5.471195573680676e-07, + "loss": 0.4998, + "step": 72382 + }, + { + "epoch": 0.9048226205655141, + "grad_norm": 3.1305859088897705, + "learning_rate": 5.468348900556519e-07, + "loss": 1.2227, + "step": 72384 + }, + { + "epoch": 0.9048476211905297, + "grad_norm": 3.2374942302703857, + "learning_rate": 5.465502947372025e-07, + "loss": 0.7145, + "step": 72386 + }, + { + "epoch": 0.9048726218155454, + "grad_norm": 0.00038061305531300604, + "learning_rate": 5.462657714148911e-07, + "loss": 0.2263, + "step": 72388 + }, + { + "epoch": 0.904897622440561, + "grad_norm": 2.6583473682403564, + "learning_rate": 5.459813200908803e-07, + "loss": 0.8111, + "step": 72390 + }, + { + "epoch": 0.9049226230655767, + "grad_norm": 3.580805778503418, + "learning_rate": 5.456969407673396e-07, + "loss": 0.9545, + "step": 72392 + }, + { + "epoch": 0.9049476236905922, + "grad_norm": 3.537747621536255, + "learning_rate": 5.454126334464349e-07, + "loss": 0.7486, + "step": 72394 + }, + { + "epoch": 0.9049726243156079, + "grad_norm": 3.1687188148498535, + "learning_rate": 5.45128398130329e-07, + "loss": 1.0713, + "step": 72396 + }, + { + "epoch": 0.9049976249406235, + "grad_norm": 0.0003215561737306416, + "learning_rate": 5.448442348211924e-07, + "loss": 0.5877, + "step": 72398 + }, + { + "epoch": 0.9050226255656392, + "grad_norm": 5.212507724761963, + "learning_rate": 5.445601435211812e-07, + "loss": 1.2804, + "step": 72400 + }, + { + "epoch": 0.9050476261906548, + "grad_norm": 2.198254108428955, + "learning_rate": 5.442761242324635e-07, + "loss": 0.3468, + "step": 72402 + }, + { + "epoch": 0.9050726268156704, + "grad_norm": 0.0004429995024111122, + "learning_rate": 5.43992176957201e-07, + "loss": 0.7503, + "step": 72404 + }, + { + "epoch": 0.905097627440686, + "grad_norm": 0.0006903452449478209, + "learning_rate": 5.437083016975575e-07, + "loss": 0.0694, + "step": 72406 + }, + { + "epoch": 0.9051226280657017, + "grad_norm": 2.1150152683258057, + "learning_rate": 5.434244984556947e-07, + "loss": 1.2941, + "step": 72408 + }, + { + "epoch": 0.9051476286907173, + "grad_norm": 2.222583293914795, + "learning_rate": 5.431407672337741e-07, + "loss": 1.2359, + "step": 72410 + }, + { + "epoch": 0.905172629315733, + "grad_norm": 1.5359059572219849, + "learning_rate": 5.428571080339561e-07, + "loss": 0.691, + "step": 72412 + }, + { + "epoch": 0.9051976299407485, + "grad_norm": 0.9721527099609375, + "learning_rate": 5.425735208583994e-07, + "loss": 0.5258, + "step": 72414 + }, + { + "epoch": 0.9052226305657641, + "grad_norm": 11.562705993652344, + "learning_rate": 5.422900057092661e-07, + "loss": 1.0547, + "step": 72416 + }, + { + "epoch": 0.9052476311907798, + "grad_norm": 1.11244797706604, + "learning_rate": 5.420065625887149e-07, + "loss": 0.8844, + "step": 72418 + }, + { + "epoch": 0.9052726318157954, + "grad_norm": 4.48927116394043, + "learning_rate": 5.417231914989041e-07, + "loss": 0.956, + "step": 72420 + }, + { + "epoch": 0.9052976324408111, + "grad_norm": 8.641955375671387, + "learning_rate": 5.414398924419928e-07, + "loss": 0.5504, + "step": 72422 + }, + { + "epoch": 0.9053226330658266, + "grad_norm": 1.2991377115249634, + "learning_rate": 5.411566654201373e-07, + "loss": 0.7393, + "step": 72424 + }, + { + "epoch": 0.9053476336908423, + "grad_norm": 3.1888976097106934, + "learning_rate": 5.408735104354957e-07, + "loss": 0.7076, + "step": 72426 + }, + { + "epoch": 0.9053726343158579, + "grad_norm": 3.464813232421875, + "learning_rate": 5.405904274902219e-07, + "loss": 0.8146, + "step": 72428 + }, + { + "epoch": 0.9053976349408736, + "grad_norm": 2.362375497817993, + "learning_rate": 5.403074165864731e-07, + "loss": 0.4978, + "step": 72430 + }, + { + "epoch": 0.9054226355658892, + "grad_norm": 3.949223518371582, + "learning_rate": 5.400244777264074e-07, + "loss": 1.5696, + "step": 72432 + }, + { + "epoch": 0.9054476361909047, + "grad_norm": 4.170993328094482, + "learning_rate": 5.397416109121767e-07, + "loss": 1.2043, + "step": 72434 + }, + { + "epoch": 0.9054726368159204, + "grad_norm": 0.36273691058158875, + "learning_rate": 5.394588161459379e-07, + "loss": 0.8195, + "step": 72436 + }, + { + "epoch": 0.905497637440936, + "grad_norm": 3.099703073501587, + "learning_rate": 5.391760934298407e-07, + "loss": 0.8908, + "step": 72438 + }, + { + "epoch": 0.9055226380659517, + "grad_norm": 4.039369106292725, + "learning_rate": 5.388934427660408e-07, + "loss": 2.02, + "step": 72440 + }, + { + "epoch": 0.9055476386909673, + "grad_norm": 0.0004899140330962837, + "learning_rate": 5.386108641566912e-07, + "loss": 0.8064, + "step": 72442 + }, + { + "epoch": 0.9055726393159829, + "grad_norm": 0.8009732961654663, + "learning_rate": 5.383283576039422e-07, + "loss": 0.4697, + "step": 72444 + }, + { + "epoch": 0.9055976399409985, + "grad_norm": 0.40214866399765015, + "learning_rate": 5.380459231099477e-07, + "loss": 0.3257, + "step": 72446 + }, + { + "epoch": 0.9056226405660142, + "grad_norm": 3.9243485927581787, + "learning_rate": 5.377635606768561e-07, + "loss": 1.2494, + "step": 72448 + }, + { + "epoch": 0.9056476411910298, + "grad_norm": 0.0006095714634284377, + "learning_rate": 5.374812703068222e-07, + "loss": 0.2908, + "step": 72450 + }, + { + "epoch": 0.9056726418160455, + "grad_norm": 3.7125244140625, + "learning_rate": 5.371990520019899e-07, + "loss": 0.932, + "step": 72452 + }, + { + "epoch": 0.905697642441061, + "grad_norm": 3.5089709758758545, + "learning_rate": 5.369169057645118e-07, + "loss": 0.8468, + "step": 72454 + }, + { + "epoch": 0.9057226430660766, + "grad_norm": 0.000469594553578645, + "learning_rate": 5.366348315965364e-07, + "loss": 0.0296, + "step": 72456 + }, + { + "epoch": 0.9057476436910923, + "grad_norm": 3.040361166000366, + "learning_rate": 5.363528295002119e-07, + "loss": 1.2994, + "step": 72458 + }, + { + "epoch": 0.9057726443161079, + "grad_norm": 3.1403651237487793, + "learning_rate": 5.360708994776864e-07, + "loss": 0.962, + "step": 72460 + }, + { + "epoch": 0.9057976449411236, + "grad_norm": 2.4475042819976807, + "learning_rate": 5.357890415311073e-07, + "loss": 0.488, + "step": 72462 + }, + { + "epoch": 0.9058226455661391, + "grad_norm": 3.582071542739868, + "learning_rate": 5.355072556626206e-07, + "loss": 0.6577, + "step": 72464 + }, + { + "epoch": 0.9058476461911548, + "grad_norm": 2.3795053958892822, + "learning_rate": 5.352255418743702e-07, + "loss": 0.6113, + "step": 72466 + }, + { + "epoch": 0.9058726468161704, + "grad_norm": 6.726069927215576, + "learning_rate": 5.349439001685041e-07, + "loss": 0.2371, + "step": 72468 + }, + { + "epoch": 0.9058976474411861, + "grad_norm": 0.05769222602248192, + "learning_rate": 5.346623305471677e-07, + "loss": 1.0124, + "step": 72470 + }, + { + "epoch": 0.9059226480662017, + "grad_norm": 5.632935523986816, + "learning_rate": 5.343808330125033e-07, + "loss": 1.8166, + "step": 72472 + }, + { + "epoch": 0.9059476486912172, + "grad_norm": 2.6364753246307373, + "learning_rate": 5.340994075666583e-07, + "loss": 0.8847, + "step": 72474 + }, + { + "epoch": 0.9059726493162329, + "grad_norm": 0.0009362848359160125, + "learning_rate": 5.338180542117721e-07, + "loss": 0.3543, + "step": 72476 + }, + { + "epoch": 0.9059976499412485, + "grad_norm": 4.197122097015381, + "learning_rate": 5.335367729499885e-07, + "loss": 1.8157, + "step": 72478 + }, + { + "epoch": 0.9060226505662642, + "grad_norm": 3.0937070846557617, + "learning_rate": 5.332555637834514e-07, + "loss": 0.6155, + "step": 72480 + }, + { + "epoch": 0.9060476511912798, + "grad_norm": 1.90403413772583, + "learning_rate": 5.329744267142989e-07, + "loss": 0.942, + "step": 72482 + }, + { + "epoch": 0.9060726518162954, + "grad_norm": 0.004577910527586937, + "learning_rate": 5.326933617446772e-07, + "loss": 0.0001, + "step": 72484 + }, + { + "epoch": 0.906097652441311, + "grad_norm": 8.513005256652832, + "learning_rate": 5.324123688767213e-07, + "loss": 1.2659, + "step": 72486 + }, + { + "epoch": 0.9061226530663267, + "grad_norm": 2.6794185638427734, + "learning_rate": 5.321314481125783e-07, + "loss": 1.1402, + "step": 72488 + }, + { + "epoch": 0.9061476536913423, + "grad_norm": 3.9488770961761475, + "learning_rate": 5.318505994543787e-07, + "loss": 0.9207, + "step": 72490 + }, + { + "epoch": 0.906172654316358, + "grad_norm": 1.968517780303955, + "learning_rate": 5.315698229042676e-07, + "loss": 1.072, + "step": 72492 + }, + { + "epoch": 0.9061976549413735, + "grad_norm": 2.2155439853668213, + "learning_rate": 5.31289118464382e-07, + "loss": 0.4607, + "step": 72494 + }, + { + "epoch": 0.9062226555663891, + "grad_norm": 2.189635753631592, + "learning_rate": 5.310084861368592e-07, + "loss": 1.0753, + "step": 72496 + }, + { + "epoch": 0.9062476561914048, + "grad_norm": 1.963252305984497, + "learning_rate": 5.307279259238374e-07, + "loss": 0.7344, + "step": 72498 + }, + { + "epoch": 0.9062726568164204, + "grad_norm": 2.5094375610351562, + "learning_rate": 5.304474378274515e-07, + "loss": 1.1561, + "step": 72500 + }, + { + "epoch": 0.9062976574414361, + "grad_norm": 2.8805081844329834, + "learning_rate": 5.3016702184984e-07, + "loss": 1.2086, + "step": 72502 + }, + { + "epoch": 0.9063226580664516, + "grad_norm": 1.1984633207321167, + "learning_rate": 5.298866779931377e-07, + "loss": 0.8587, + "step": 72504 + }, + { + "epoch": 0.9063476586914673, + "grad_norm": 0.0003831386275123805, + "learning_rate": 5.296064062594775e-07, + "loss": 0.5374, + "step": 72506 + }, + { + "epoch": 0.9063726593164829, + "grad_norm": 3.4828011989593506, + "learning_rate": 5.293262066509974e-07, + "loss": 0.9247, + "step": 72508 + }, + { + "epoch": 0.9063976599414986, + "grad_norm": 3.508316993713379, + "learning_rate": 5.290460791698282e-07, + "loss": 1.1586, + "step": 72510 + }, + { + "epoch": 0.9064226605665142, + "grad_norm": 3.690833330154419, + "learning_rate": 5.287660238181058e-07, + "loss": 0.142, + "step": 72512 + }, + { + "epoch": 0.9064476611915298, + "grad_norm": 1.4394409656524658, + "learning_rate": 5.284860405979619e-07, + "loss": 0.3438, + "step": 72514 + }, + { + "epoch": 0.9064726618165454, + "grad_norm": 0.40238770842552185, + "learning_rate": 5.282061295115281e-07, + "loss": 0.6686, + "step": 72516 + }, + { + "epoch": 0.906497662441561, + "grad_norm": 3.5746376514434814, + "learning_rate": 5.279262905609372e-07, + "loss": 1.5312, + "step": 72518 + }, + { + "epoch": 0.9065226630665767, + "grad_norm": 8.359354019165039, + "learning_rate": 5.276465237483198e-07, + "loss": 2.0097, + "step": 72520 + }, + { + "epoch": 0.9065476636915923, + "grad_norm": 5.455760955810547, + "learning_rate": 5.273668290758083e-07, + "loss": 1.5063, + "step": 72522 + }, + { + "epoch": 0.9065726643166079, + "grad_norm": 5.571521759033203, + "learning_rate": 5.270872065455302e-07, + "loss": 0.6518, + "step": 72524 + }, + { + "epoch": 0.9065976649416235, + "grad_norm": 3.5479698181152344, + "learning_rate": 5.268076561596181e-07, + "loss": 0.6982, + "step": 72526 + }, + { + "epoch": 0.9066226655666392, + "grad_norm": 1.713330864906311, + "learning_rate": 5.265281779201981e-07, + "loss": 0.9335, + "step": 72528 + }, + { + "epoch": 0.9066476661916548, + "grad_norm": 6.023283004760742, + "learning_rate": 5.262487718293996e-07, + "loss": 1.0739, + "step": 72530 + }, + { + "epoch": 0.9066726668166705, + "grad_norm": 0.00028709505568258464, + "learning_rate": 5.259694378893509e-07, + "loss": 0.361, + "step": 72532 + }, + { + "epoch": 0.906697667441686, + "grad_norm": 2.401923656463623, + "learning_rate": 5.256901761021793e-07, + "loss": 0.2224, + "step": 72534 + }, + { + "epoch": 0.9067226680667017, + "grad_norm": 2.6007556915283203, + "learning_rate": 5.254109864700118e-07, + "loss": 0.8283, + "step": 72536 + }, + { + "epoch": 0.9067476686917173, + "grad_norm": 1.18576979637146, + "learning_rate": 5.251318689949735e-07, + "loss": 0.0556, + "step": 72538 + }, + { + "epoch": 0.906772669316733, + "grad_norm": 4.3069281578063965, + "learning_rate": 5.248528236791917e-07, + "loss": 0.1891, + "step": 72540 + }, + { + "epoch": 0.9067976699417486, + "grad_norm": 3.569160223007202, + "learning_rate": 5.24573850524791e-07, + "loss": 0.5382, + "step": 72542 + }, + { + "epoch": 0.9068226705667641, + "grad_norm": 3.282064199447632, + "learning_rate": 5.242949495338956e-07, + "loss": 0.8306, + "step": 72544 + }, + { + "epoch": 0.9068476711917798, + "grad_norm": 2.3114335536956787, + "learning_rate": 5.240161207086303e-07, + "loss": 1.0292, + "step": 72546 + }, + { + "epoch": 0.9068726718167954, + "grad_norm": 4.93745231628418, + "learning_rate": 5.237373640511179e-07, + "loss": 1.521, + "step": 72548 + }, + { + "epoch": 0.9068976724418111, + "grad_norm": 2.387608528137207, + "learning_rate": 5.234586795634822e-07, + "loss": 0.9009, + "step": 72550 + }, + { + "epoch": 0.9069226730668267, + "grad_norm": 3.9342570304870605, + "learning_rate": 5.231800672478438e-07, + "loss": 0.7917, + "step": 72552 + }, + { + "epoch": 0.9069476736918423, + "grad_norm": 2.343583822250366, + "learning_rate": 5.229015271063265e-07, + "loss": 1.0658, + "step": 72554 + }, + { + "epoch": 0.9069726743168579, + "grad_norm": 4.558408260345459, + "learning_rate": 5.226230591410519e-07, + "loss": 1.2792, + "step": 72556 + }, + { + "epoch": 0.9069976749418736, + "grad_norm": 4.145689964294434, + "learning_rate": 5.223446633541384e-07, + "loss": 1.6889, + "step": 72558 + }, + { + "epoch": 0.9070226755668892, + "grad_norm": 1.5023521184921265, + "learning_rate": 5.220663397477088e-07, + "loss": 0.1308, + "step": 72560 + }, + { + "epoch": 0.9070476761919049, + "grad_norm": 3.471991777420044, + "learning_rate": 5.217880883238802e-07, + "loss": 1.4432, + "step": 72562 + }, + { + "epoch": 0.9070726768169204, + "grad_norm": 5.178956508636475, + "learning_rate": 5.215099090847752e-07, + "loss": 0.2813, + "step": 72564 + }, + { + "epoch": 0.907097677441936, + "grad_norm": 3.4224531650543213, + "learning_rate": 5.212318020325102e-07, + "loss": 1.1963, + "step": 72566 + }, + { + "epoch": 0.9071226780669517, + "grad_norm": 0.10572486370801926, + "learning_rate": 5.20953767169201e-07, + "loss": 0.9611, + "step": 72568 + }, + { + "epoch": 0.9071476786919673, + "grad_norm": 0.012373574078083038, + "learning_rate": 5.206758044969695e-07, + "loss": 0.1894, + "step": 72570 + }, + { + "epoch": 0.907172679316983, + "grad_norm": 3.4374401569366455, + "learning_rate": 5.203979140179293e-07, + "loss": 0.2412, + "step": 72572 + }, + { + "epoch": 0.9071976799419985, + "grad_norm": 0.620215892791748, + "learning_rate": 5.201200957342001e-07, + "loss": 0.822, + "step": 72574 + }, + { + "epoch": 0.9072226805670142, + "grad_norm": 5.383777618408203, + "learning_rate": 5.198423496478933e-07, + "loss": 2.0374, + "step": 72576 + }, + { + "epoch": 0.9072476811920298, + "grad_norm": 2.033201217651367, + "learning_rate": 5.195646757611284e-07, + "loss": 0.5438, + "step": 72578 + }, + { + "epoch": 0.9072726818170455, + "grad_norm": 2.2835705280303955, + "learning_rate": 5.192870740760181e-07, + "loss": 0.5218, + "step": 72580 + }, + { + "epoch": 0.9072976824420611, + "grad_norm": 7.832005500793457, + "learning_rate": 5.190095445946752e-07, + "loss": 0.4095, + "step": 72582 + }, + { + "epoch": 0.9073226830670766, + "grad_norm": 7.911864757537842, + "learning_rate": 5.18732087319217e-07, + "loss": 1.5347, + "step": 72584 + }, + { + "epoch": 0.9073476836920923, + "grad_norm": 2.084167242050171, + "learning_rate": 5.184547022517516e-07, + "loss": 0.0738, + "step": 72586 + }, + { + "epoch": 0.9073726843171079, + "grad_norm": 3.5770263671875, + "learning_rate": 5.181773893943964e-07, + "loss": 1.3233, + "step": 72588 + }, + { + "epoch": 0.9073976849421236, + "grad_norm": 4.603903293609619, + "learning_rate": 5.179001487492607e-07, + "loss": 0.4792, + "step": 72590 + }, + { + "epoch": 0.9074226855671392, + "grad_norm": 2.7063283920288086, + "learning_rate": 5.176229803184573e-07, + "loss": 0.505, + "step": 72592 + }, + { + "epoch": 0.9074476861921548, + "grad_norm": 0.0002975548559334129, + "learning_rate": 5.173458841040968e-07, + "loss": 0.0, + "step": 72594 + }, + { + "epoch": 0.9074726868171704, + "grad_norm": 0.023232534527778625, + "learning_rate": 5.170688601082873e-07, + "loss": 0.1373, + "step": 72596 + }, + { + "epoch": 0.9074976874421861, + "grad_norm": 0.3925924003124237, + "learning_rate": 5.167919083331418e-07, + "loss": 0.2064, + "step": 72598 + }, + { + "epoch": 0.9075226880672017, + "grad_norm": 3.5975160598754883, + "learning_rate": 5.165150287807674e-07, + "loss": 1.0904, + "step": 72600 + }, + { + "epoch": 0.9075476886922174, + "grad_norm": 0.4272085726261139, + "learning_rate": 5.162382214532736e-07, + "loss": 0.4719, + "step": 72602 + }, + { + "epoch": 0.9075726893172329, + "grad_norm": 1.1348813772201538, + "learning_rate": 5.159614863527674e-07, + "loss": 0.026, + "step": 72604 + }, + { + "epoch": 0.9075976899422485, + "grad_norm": 3.486290216445923, + "learning_rate": 5.156848234813594e-07, + "loss": 0.7212, + "step": 72606 + }, + { + "epoch": 0.9076226905672642, + "grad_norm": 0.0003317710943520069, + "learning_rate": 5.154082328411536e-07, + "loss": 0.6568, + "step": 72608 + }, + { + "epoch": 0.9076476911922798, + "grad_norm": 2.3918888568878174, + "learning_rate": 5.151317144342572e-07, + "loss": 0.7867, + "step": 72610 + }, + { + "epoch": 0.9076726918172955, + "grad_norm": 3.452953577041626, + "learning_rate": 5.148552682627772e-07, + "loss": 1.7408, + "step": 72612 + }, + { + "epoch": 0.907697692442311, + "grad_norm": 0.7666354775428772, + "learning_rate": 5.145788943288166e-07, + "loss": 0.4429, + "step": 72614 + }, + { + "epoch": 0.9077226930673267, + "grad_norm": 3.8627161979675293, + "learning_rate": 5.143025926344836e-07, + "loss": 1.8143, + "step": 72616 + }, + { + "epoch": 0.9077476936923423, + "grad_norm": 2.305673837661743, + "learning_rate": 5.140263631818787e-07, + "loss": 0.6099, + "step": 72618 + }, + { + "epoch": 0.907772694317358, + "grad_norm": 1.1471320390701294, + "learning_rate": 5.137502059731092e-07, + "loss": 0.1117, + "step": 72620 + }, + { + "epoch": 0.9077976949423736, + "grad_norm": 0.002827135846018791, + "learning_rate": 5.134741210102767e-07, + "loss": 0.7898, + "step": 72622 + }, + { + "epoch": 0.9078226955673891, + "grad_norm": 16.822195053100586, + "learning_rate": 5.131981082954818e-07, + "loss": 1.6872, + "step": 72624 + }, + { + "epoch": 0.9078476961924048, + "grad_norm": 4.3676676750183105, + "learning_rate": 5.129221678308305e-07, + "loss": 1.4623, + "step": 72626 + }, + { + "epoch": 0.9078726968174204, + "grad_norm": 5.048422813415527, + "learning_rate": 5.126462996184212e-07, + "loss": 1.1732, + "step": 72628 + }, + { + "epoch": 0.9078976974424361, + "grad_norm": 4.101471900939941, + "learning_rate": 5.123705036603577e-07, + "loss": 2.2541, + "step": 72630 + }, + { + "epoch": 0.9079226980674517, + "grad_norm": 5.485106468200684, + "learning_rate": 5.120947799587383e-07, + "loss": 1.5666, + "step": 72632 + }, + { + "epoch": 0.9079476986924673, + "grad_norm": 3.87495756149292, + "learning_rate": 5.118191285156615e-07, + "loss": 1.0905, + "step": 72634 + }, + { + "epoch": 0.9079726993174829, + "grad_norm": 4.302545070648193, + "learning_rate": 5.11543549333231e-07, + "loss": 0.1633, + "step": 72636 + }, + { + "epoch": 0.9079976999424986, + "grad_norm": 4.611143589019775, + "learning_rate": 5.112680424135407e-07, + "loss": 0.8005, + "step": 72638 + }, + { + "epoch": 0.9080227005675142, + "grad_norm": 3.638429880142212, + "learning_rate": 5.109926077586924e-07, + "loss": 0.7876, + "step": 72640 + }, + { + "epoch": 0.9080477011925299, + "grad_norm": 1.6404227018356323, + "learning_rate": 5.10717245370782e-07, + "loss": 0.4324, + "step": 72642 + }, + { + "epoch": 0.9080727018175454, + "grad_norm": 4.616758823394775, + "learning_rate": 5.10441955251908e-07, + "loss": 1.6884, + "step": 72644 + }, + { + "epoch": 0.908097702442561, + "grad_norm": 4.042145252227783, + "learning_rate": 5.101667374041664e-07, + "loss": 1.2572, + "step": 72646 + }, + { + "epoch": 0.9081227030675767, + "grad_norm": 0.3794533610343933, + "learning_rate": 5.09891591829651e-07, + "loss": 0.0468, + "step": 72648 + }, + { + "epoch": 0.9081477036925923, + "grad_norm": 0.00030573143158107996, + "learning_rate": 5.096165185304613e-07, + "loss": 0.9034, + "step": 72650 + }, + { + "epoch": 0.908172704317608, + "grad_norm": 0.0004057823680341244, + "learning_rate": 5.093415175086879e-07, + "loss": 0.5326, + "step": 72652 + }, + { + "epoch": 0.9081977049426235, + "grad_norm": 6.957130432128906, + "learning_rate": 5.090665887664292e-07, + "loss": 0.6403, + "step": 72654 + }, + { + "epoch": 0.9082227055676392, + "grad_norm": 2.8409039974212646, + "learning_rate": 5.087917323057768e-07, + "loss": 0.8258, + "step": 72656 + }, + { + "epoch": 0.9082477061926548, + "grad_norm": 0.000581679749302566, + "learning_rate": 5.085169481288243e-07, + "loss": 1.4346, + "step": 72658 + }, + { + "epoch": 0.9082727068176705, + "grad_norm": 2.7520549297332764, + "learning_rate": 5.082422362376649e-07, + "loss": 1.1533, + "step": 72660 + }, + { + "epoch": 0.9082977074426861, + "grad_norm": 2.9600367546081543, + "learning_rate": 5.0796759663439e-07, + "loss": 1.2337, + "step": 72662 + }, + { + "epoch": 0.9083227080677017, + "grad_norm": 3.172652244567871, + "learning_rate": 5.076930293210913e-07, + "loss": 1.9141, + "step": 72664 + }, + { + "epoch": 0.9083477086927173, + "grad_norm": 4.893239974975586, + "learning_rate": 5.074185342998606e-07, + "loss": 0.4869, + "step": 72666 + }, + { + "epoch": 0.908372709317733, + "grad_norm": 3.5017919540405273, + "learning_rate": 5.071441115727881e-07, + "loss": 0.4214, + "step": 72668 + }, + { + "epoch": 0.9083977099427486, + "grad_norm": 0.0002796451735775918, + "learning_rate": 5.068697611419626e-07, + "loss": 0.8162, + "step": 72670 + }, + { + "epoch": 0.9084227105677642, + "grad_norm": 0.00022550445282831788, + "learning_rate": 5.065954830094755e-07, + "loss": 1.3762, + "step": 72672 + }, + { + "epoch": 0.9084477111927798, + "grad_norm": 1.9953689575195312, + "learning_rate": 5.063212771774151e-07, + "loss": 0.1399, + "step": 72674 + }, + { + "epoch": 0.9084727118177954, + "grad_norm": 2.8092384338378906, + "learning_rate": 5.060471436478686e-07, + "loss": 0.9019, + "step": 72676 + }, + { + "epoch": 0.9084977124428111, + "grad_norm": 3.228940725326538, + "learning_rate": 5.057730824229246e-07, + "loss": 0.3565, + "step": 72678 + }, + { + "epoch": 0.9085227130678267, + "grad_norm": 0.015910256654024124, + "learning_rate": 5.054990935046699e-07, + "loss": 0.1191, + "step": 72680 + }, + { + "epoch": 0.9085477136928424, + "grad_norm": 0.0009650171850807965, + "learning_rate": 5.052251768951922e-07, + "loss": 0.0933, + "step": 72682 + }, + { + "epoch": 0.9085727143178579, + "grad_norm": 3.1132757663726807, + "learning_rate": 5.049513325965772e-07, + "loss": 1.0276, + "step": 72684 + }, + { + "epoch": 0.9085977149428736, + "grad_norm": 2.3085451126098633, + "learning_rate": 5.046775606109089e-07, + "loss": 0.3603, + "step": 72686 + }, + { + "epoch": 0.9086227155678892, + "grad_norm": 3.03454852104187, + "learning_rate": 5.044038609402746e-07, + "loss": 0.4901, + "step": 72688 + }, + { + "epoch": 0.9086477161929049, + "grad_norm": 3.523949146270752, + "learning_rate": 5.04130233586756e-07, + "loss": 1.3756, + "step": 72690 + }, + { + "epoch": 0.9086727168179205, + "grad_norm": 3.515028238296509, + "learning_rate": 5.038566785524401e-07, + "loss": 1.4683, + "step": 72692 + }, + { + "epoch": 0.908697717442936, + "grad_norm": 3.0866432189941406, + "learning_rate": 5.035831958394077e-07, + "loss": 0.8206, + "step": 72694 + }, + { + "epoch": 0.9087227180679517, + "grad_norm": 3.632763385772705, + "learning_rate": 5.033097854497437e-07, + "loss": 0.733, + "step": 72696 + }, + { + "epoch": 0.9087477186929673, + "grad_norm": 0.07575347274541855, + "learning_rate": 5.030364473855309e-07, + "loss": 0.8748, + "step": 72698 + }, + { + "epoch": 0.908772719317983, + "grad_norm": 5.2764153480529785, + "learning_rate": 5.027631816488476e-07, + "loss": 1.9217, + "step": 72700 + }, + { + "epoch": 0.9087977199429986, + "grad_norm": 3.6024606227874756, + "learning_rate": 5.024899882417778e-07, + "loss": 1.0341, + "step": 72702 + }, + { + "epoch": 0.9088227205680142, + "grad_norm": 1.526222586631775, + "learning_rate": 5.022168671663996e-07, + "loss": 1.0522, + "step": 72704 + }, + { + "epoch": 0.9088477211930298, + "grad_norm": 1.2005008459091187, + "learning_rate": 5.01943818424796e-07, + "loss": 0.0953, + "step": 72706 + }, + { + "epoch": 0.9088727218180455, + "grad_norm": 0.000394664442865178, + "learning_rate": 5.016708420190442e-07, + "loss": 0.329, + "step": 72708 + }, + { + "epoch": 0.9088977224430611, + "grad_norm": 2.631284475326538, + "learning_rate": 5.013979379512257e-07, + "loss": 0.8924, + "step": 72710 + }, + { + "epoch": 0.9089227230680768, + "grad_norm": 3.2342283725738525, + "learning_rate": 5.011251062234168e-07, + "loss": 1.2977, + "step": 72712 + }, + { + "epoch": 0.9089477236930923, + "grad_norm": 3.329102039337158, + "learning_rate": 5.008523468376947e-07, + "loss": 1.8799, + "step": 72714 + }, + { + "epoch": 0.9089727243181079, + "grad_norm": 2.4712724685668945, + "learning_rate": 5.005796597961388e-07, + "loss": 0.439, + "step": 72716 + }, + { + "epoch": 0.9089977249431236, + "grad_norm": 0.000380776560632512, + "learning_rate": 5.003070451008241e-07, + "loss": 0.0, + "step": 72718 + }, + { + "epoch": 0.9090227255681392, + "grad_norm": 4.048574924468994, + "learning_rate": 5.000345027538279e-07, + "loss": 1.0446, + "step": 72720 + }, + { + "epoch": 0.9090477261931549, + "grad_norm": 3.498669147491455, + "learning_rate": 4.99762032757225e-07, + "loss": 1.5505, + "step": 72722 + }, + { + "epoch": 0.9090727268181704, + "grad_norm": 2.276378870010376, + "learning_rate": 4.994896351130917e-07, + "loss": 1.2981, + "step": 72724 + }, + { + "epoch": 0.9090977274431861, + "grad_norm": 2.518911123275757, + "learning_rate": 4.992173098235031e-07, + "loss": 0.4749, + "step": 72726 + }, + { + "epoch": 0.9091227280682017, + "grad_norm": 3.5645415782928467, + "learning_rate": 4.989450568905296e-07, + "loss": 1.3971, + "step": 72728 + }, + { + "epoch": 0.9091477286932174, + "grad_norm": 3.169473648071289, + "learning_rate": 4.986728763162485e-07, + "loss": 1.1528, + "step": 72730 + }, + { + "epoch": 0.909172729318233, + "grad_norm": 2.0484683513641357, + "learning_rate": 4.984007681027303e-07, + "loss": 0.7711, + "step": 72732 + }, + { + "epoch": 0.9091977299432485, + "grad_norm": 0.846760094165802, + "learning_rate": 4.981287322520467e-07, + "loss": 0.21, + "step": 72734 + }, + { + "epoch": 0.9092227305682642, + "grad_norm": 3.902116537094116, + "learning_rate": 4.978567687662749e-07, + "loss": 0.8339, + "step": 72736 + }, + { + "epoch": 0.9092477311932798, + "grad_norm": 3.857907772064209, + "learning_rate": 4.975848776474801e-07, + "loss": 1.0195, + "step": 72738 + }, + { + "epoch": 0.9092727318182955, + "grad_norm": 5.278767108917236, + "learning_rate": 4.973130588977359e-07, + "loss": 0.5975, + "step": 72740 + }, + { + "epoch": 0.9092977324433111, + "grad_norm": 3.3666751384735107, + "learning_rate": 4.970413125191098e-07, + "loss": 0.842, + "step": 72742 + }, + { + "epoch": 0.9093227330683267, + "grad_norm": 3.665581703186035, + "learning_rate": 4.967696385136745e-07, + "loss": 0.7248, + "step": 72744 + }, + { + "epoch": 0.9093477336933423, + "grad_norm": 3.3379058837890625, + "learning_rate": 4.964980368834971e-07, + "loss": 0.268, + "step": 72746 + }, + { + "epoch": 0.909372734318358, + "grad_norm": 3.0667366981506348, + "learning_rate": 4.96226507630646e-07, + "loss": 0.7896, + "step": 72748 + }, + { + "epoch": 0.9093977349433736, + "grad_norm": 0.0003590489213820547, + "learning_rate": 4.959550507571931e-07, + "loss": 0.7644, + "step": 72750 + }, + { + "epoch": 0.9094227355683893, + "grad_norm": 3.36257266998291, + "learning_rate": 4.956836662651998e-07, + "loss": 0.9173, + "step": 72752 + }, + { + "epoch": 0.9094477361934048, + "grad_norm": 1.3576239347457886, + "learning_rate": 4.954123541567369e-07, + "loss": 0.3828, + "step": 72754 + }, + { + "epoch": 0.9094727368184204, + "grad_norm": 0.001579425879754126, + "learning_rate": 4.951411144338681e-07, + "loss": 0.0991, + "step": 72756 + }, + { + "epoch": 0.9094977374434361, + "grad_norm": 5.309293270111084, + "learning_rate": 4.948699470986607e-07, + "loss": 0.7554, + "step": 72758 + }, + { + "epoch": 0.9095227380684517, + "grad_norm": 0.0008778494084253907, + "learning_rate": 4.945988521531808e-07, + "loss": 0.0, + "step": 72760 + }, + { + "epoch": 0.9095477386934674, + "grad_norm": 0.00042023963760584593, + "learning_rate": 4.943278295994925e-07, + "loss": 0.0, + "step": 72762 + }, + { + "epoch": 0.9095727393184829, + "grad_norm": 3.0239031314849854, + "learning_rate": 4.940568794396583e-07, + "loss": 1.6224, + "step": 72764 + }, + { + "epoch": 0.9095977399434986, + "grad_norm": 3.919515609741211, + "learning_rate": 4.937860016757412e-07, + "loss": 0.9333, + "step": 72766 + }, + { + "epoch": 0.9096227405685142, + "grad_norm": 2.415231227874756, + "learning_rate": 4.935151963098073e-07, + "loss": 0.8695, + "step": 72768 + }, + { + "epoch": 0.9096477411935299, + "grad_norm": 3.429117441177368, + "learning_rate": 4.93244463343916e-07, + "loss": 0.5603, + "step": 72770 + }, + { + "epoch": 0.9096727418185455, + "grad_norm": 2.934221029281616, + "learning_rate": 4.929738027801301e-07, + "loss": 0.7834, + "step": 72772 + }, + { + "epoch": 0.909697742443561, + "grad_norm": 0.330458402633667, + "learning_rate": 4.927032146205135e-07, + "loss": 0.5056, + "step": 72774 + }, + { + "epoch": 0.9097227430685767, + "grad_norm": 2.126152515411377, + "learning_rate": 4.924326988671246e-07, + "loss": 1.2654, + "step": 72776 + }, + { + "epoch": 0.9097477436935923, + "grad_norm": 4.287145614624023, + "learning_rate": 4.921622555220229e-07, + "loss": 1.0338, + "step": 72778 + }, + { + "epoch": 0.909772744318608, + "grad_norm": 0.00030999621958471835, + "learning_rate": 4.918918845872689e-07, + "loss": 0.2638, + "step": 72780 + }, + { + "epoch": 0.9097977449436236, + "grad_norm": 1.6557276248931885, + "learning_rate": 4.91621586064922e-07, + "loss": 0.3742, + "step": 72782 + }, + { + "epoch": 0.9098227455686392, + "grad_norm": 4.015910625457764, + "learning_rate": 4.913513599570397e-07, + "loss": 0.8625, + "step": 72784 + }, + { + "epoch": 0.9098477461936548, + "grad_norm": 0.9582393765449524, + "learning_rate": 4.9108120626568e-07, + "loss": 0.7569, + "step": 72786 + }, + { + "epoch": 0.9098727468186705, + "grad_norm": 4.07214879989624, + "learning_rate": 4.908111249929049e-07, + "loss": 1.0127, + "step": 72788 + }, + { + "epoch": 0.9098977474436861, + "grad_norm": 0.0005814051837660372, + "learning_rate": 4.905411161407647e-07, + "loss": 0.6764, + "step": 72790 + }, + { + "epoch": 0.9099227480687018, + "grad_norm": 3.0458028316497803, + "learning_rate": 4.90271179711319e-07, + "loss": 0.5878, + "step": 72792 + }, + { + "epoch": 0.9099477486937173, + "grad_norm": 0.0033027390018105507, + "learning_rate": 4.90001315706623e-07, + "loss": 0.6957, + "step": 72794 + }, + { + "epoch": 0.909972749318733, + "grad_norm": 1.6934107542037964, + "learning_rate": 4.897315241287315e-07, + "loss": 0.0749, + "step": 72796 + }, + { + "epoch": 0.9099977499437486, + "grad_norm": 10.976547241210938, + "learning_rate": 4.894618049797006e-07, + "loss": 1.4044, + "step": 72798 + }, + { + "epoch": 0.9100227505687642, + "grad_norm": 1.590408205986023, + "learning_rate": 4.891921582615833e-07, + "loss": 0.8127, + "step": 72800 + }, + { + "epoch": 0.9100477511937799, + "grad_norm": 2.9147756099700928, + "learning_rate": 4.889225839764367e-07, + "loss": 1.0849, + "step": 72802 + }, + { + "epoch": 0.9100727518187954, + "grad_norm": 0.00034766027238219976, + "learning_rate": 4.886530821263069e-07, + "loss": 0.0198, + "step": 72804 + }, + { + "epoch": 0.9100977524438111, + "grad_norm": 1.6252204179763794, + "learning_rate": 4.883836527132524e-07, + "loss": 0.3508, + "step": 72806 + }, + { + "epoch": 0.9101227530688267, + "grad_norm": 1.7507078647613525, + "learning_rate": 4.881142957393203e-07, + "loss": 0.5453, + "step": 72808 + }, + { + "epoch": 0.9101477536938424, + "grad_norm": 2.3648719787597656, + "learning_rate": 4.878450112065658e-07, + "loss": 0.7931, + "step": 72810 + }, + { + "epoch": 0.910172754318858, + "grad_norm": 3.9714252948760986, + "learning_rate": 4.875757991170405e-07, + "loss": 1.0947, + "step": 72812 + }, + { + "epoch": 0.9101977549438736, + "grad_norm": 2.481264591217041, + "learning_rate": 4.873066594727915e-07, + "loss": 0.108, + "step": 72814 + }, + { + "epoch": 0.9102227555688892, + "grad_norm": 1.6198817491531372, + "learning_rate": 4.870375922758697e-07, + "loss": 0.8603, + "step": 72816 + }, + { + "epoch": 0.9102477561939049, + "grad_norm": 3.4663681983947754, + "learning_rate": 4.867685975283232e-07, + "loss": 1.1318, + "step": 72818 + }, + { + "epoch": 0.9102727568189205, + "grad_norm": 6.096198558807373, + "learning_rate": 4.864996752322027e-07, + "loss": 1.48, + "step": 72820 + }, + { + "epoch": 0.9102977574439362, + "grad_norm": 2.93617582321167, + "learning_rate": 4.862308253895553e-07, + "loss": 0.8928, + "step": 72822 + }, + { + "epoch": 0.9103227580689517, + "grad_norm": 0.2508431077003479, + "learning_rate": 4.859620480024285e-07, + "loss": 0.0094, + "step": 72824 + }, + { + "epoch": 0.9103477586939673, + "grad_norm": 0.5824588537216187, + "learning_rate": 4.856933430728695e-07, + "loss": 1.021, + "step": 72826 + }, + { + "epoch": 0.910372759318983, + "grad_norm": 7.793373107910156, + "learning_rate": 4.854247106029253e-07, + "loss": 0.1984, + "step": 72828 + }, + { + "epoch": 0.9103977599439986, + "grad_norm": 3.1151273250579834, + "learning_rate": 4.851561505946412e-07, + "loss": 0.9885, + "step": 72830 + }, + { + "epoch": 0.9104227605690143, + "grad_norm": 3.184208631515503, + "learning_rate": 4.848876630500599e-07, + "loss": 1.2236, + "step": 72832 + }, + { + "epoch": 0.9104477611940298, + "grad_norm": 5.570200443267822, + "learning_rate": 4.846192479712297e-07, + "loss": 1.1042, + "step": 72834 + }, + { + "epoch": 0.9104727618190455, + "grad_norm": 2.12656307220459, + "learning_rate": 4.843509053601947e-07, + "loss": 0.6669, + "step": 72836 + }, + { + "epoch": 0.9104977624440611, + "grad_norm": 4.275284290313721, + "learning_rate": 4.840826352189964e-07, + "loss": 2.3359, + "step": 72838 + }, + { + "epoch": 0.9105227630690768, + "grad_norm": 0.0002941332058981061, + "learning_rate": 4.838144375496812e-07, + "loss": 0.6152, + "step": 72840 + }, + { + "epoch": 0.9105477636940924, + "grad_norm": 5.091569423675537, + "learning_rate": 4.835463123542872e-07, + "loss": 1.2269, + "step": 72842 + }, + { + "epoch": 0.9105727643191079, + "grad_norm": 3.6369400024414062, + "learning_rate": 4.832782596348596e-07, + "loss": 1.1311, + "step": 72844 + }, + { + "epoch": 0.9105977649441236, + "grad_norm": 6.568601608276367, + "learning_rate": 4.830102793934389e-07, + "loss": 0.8934, + "step": 72846 + }, + { + "epoch": 0.9106227655691392, + "grad_norm": 4.29121208190918, + "learning_rate": 4.827423716320656e-07, + "loss": 1.0996, + "step": 72848 + }, + { + "epoch": 0.9106477661941549, + "grad_norm": 4.092860698699951, + "learning_rate": 4.824745363527816e-07, + "loss": 1.3582, + "step": 72850 + }, + { + "epoch": 0.9106727668191705, + "grad_norm": 3.4487709999084473, + "learning_rate": 4.822067735576242e-07, + "loss": 1.2583, + "step": 72852 + }, + { + "epoch": 0.9106977674441861, + "grad_norm": 0.017551546916365623, + "learning_rate": 4.81939083248637e-07, + "loss": 0.0372, + "step": 72854 + }, + { + "epoch": 0.9107227680692017, + "grad_norm": 1.3001842498779297, + "learning_rate": 4.816714654278532e-07, + "loss": 0.4763, + "step": 72856 + }, + { + "epoch": 0.9107477686942174, + "grad_norm": 3.433155059814453, + "learning_rate": 4.814039200973131e-07, + "loss": 1.03, + "step": 72858 + }, + { + "epoch": 0.910772769319233, + "grad_norm": 0.12196087837219238, + "learning_rate": 4.811364472590562e-07, + "loss": 0.495, + "step": 72860 + }, + { + "epoch": 0.9107977699442487, + "grad_norm": 0.0007282322039827704, + "learning_rate": 4.808690469151167e-07, + "loss": 0.4554, + "step": 72862 + }, + { + "epoch": 0.9108227705692642, + "grad_norm": 3.0645782947540283, + "learning_rate": 4.806017190675338e-07, + "loss": 1.3361, + "step": 72864 + }, + { + "epoch": 0.9108477711942798, + "grad_norm": 3.2655091285705566, + "learning_rate": 4.803344637183404e-07, + "loss": 1.1373, + "step": 72866 + }, + { + "epoch": 0.9108727718192955, + "grad_norm": 2.509110450744629, + "learning_rate": 4.800672808695761e-07, + "loss": 0.3861, + "step": 72868 + }, + { + "epoch": 0.9108977724443111, + "grad_norm": 3.5529379844665527, + "learning_rate": 4.798001705232714e-07, + "loss": 0.7134, + "step": 72870 + }, + { + "epoch": 0.9109227730693268, + "grad_norm": 3.193126678466797, + "learning_rate": 4.795331326814611e-07, + "loss": 0.5246, + "step": 72872 + }, + { + "epoch": 0.9109477736943423, + "grad_norm": 3.9822018146514893, + "learning_rate": 4.792661673461818e-07, + "loss": 0.8257, + "step": 72874 + }, + { + "epoch": 0.910972774319358, + "grad_norm": 2.012887716293335, + "learning_rate": 4.78999274519465e-07, + "loss": 0.3614, + "step": 72876 + }, + { + "epoch": 0.9109977749443736, + "grad_norm": 11.054484367370605, + "learning_rate": 4.787324542033433e-07, + "loss": 0.352, + "step": 72878 + }, + { + "epoch": 0.9110227755693893, + "grad_norm": 1.8792333602905273, + "learning_rate": 4.784657063998488e-07, + "loss": 0.2016, + "step": 72880 + }, + { + "epoch": 0.9110477761944049, + "grad_norm": 3.9721555709838867, + "learning_rate": 4.781990311110118e-07, + "loss": 1.1919, + "step": 72882 + }, + { + "epoch": 0.9110727768194204, + "grad_norm": 0.0006187137332744896, + "learning_rate": 4.779324283388664e-07, + "loss": 0.0, + "step": 72884 + }, + { + "epoch": 0.9110977774444361, + "grad_norm": 3.9448776245117188, + "learning_rate": 4.776658980854399e-07, + "loss": 0.3508, + "step": 72886 + }, + { + "epoch": 0.9111227780694517, + "grad_norm": 4.77545690536499, + "learning_rate": 4.773994403527638e-07, + "loss": 1.4481, + "step": 72888 + }, + { + "epoch": 0.9111477786944674, + "grad_norm": 2.2609503269195557, + "learning_rate": 4.771330551428654e-07, + "loss": 0.1705, + "step": 72890 + }, + { + "epoch": 0.911172779319483, + "grad_norm": 3.751556396484375, + "learning_rate": 4.768667424577778e-07, + "loss": 1.0692, + "step": 72892 + }, + { + "epoch": 0.9111977799444986, + "grad_norm": 2.8571341037750244, + "learning_rate": 4.766005022995246e-07, + "loss": 1.2138, + "step": 72894 + }, + { + "epoch": 0.9112227805695142, + "grad_norm": 3.7655422687530518, + "learning_rate": 4.7633433467013544e-07, + "loss": 1.4599, + "step": 72896 + }, + { + "epoch": 0.9112477811945299, + "grad_norm": 2.851820230484009, + "learning_rate": 4.760682395716376e-07, + "loss": 1.1748, + "step": 72898 + }, + { + "epoch": 0.9112727818195455, + "grad_norm": 0.0008460321114398539, + "learning_rate": 4.7580221700605613e-07, + "loss": 0.539, + "step": 72900 + }, + { + "epoch": 0.9112977824445612, + "grad_norm": 3.3505680561065674, + "learning_rate": 4.755362669754193e-07, + "loss": 0.832, + "step": 72902 + }, + { + "epoch": 0.9113227830695767, + "grad_norm": 3.693263530731201, + "learning_rate": 4.752703894817512e-07, + "loss": 1.23, + "step": 72904 + }, + { + "epoch": 0.9113477836945924, + "grad_norm": 3.8152272701263428, + "learning_rate": 4.7500458452707674e-07, + "loss": 0.6242, + "step": 72906 + }, + { + "epoch": 0.911372784319608, + "grad_norm": 3.267094850540161, + "learning_rate": 4.747388521134211e-07, + "loss": 0.8144, + "step": 72908 + }, + { + "epoch": 0.9113977849446236, + "grad_norm": 4.92988395690918, + "learning_rate": 4.744731922428059e-07, + "loss": 0.2138, + "step": 72910 + }, + { + "epoch": 0.9114227855696393, + "grad_norm": 2.3205888271331787, + "learning_rate": 4.7420760491725727e-07, + "loss": 0.3885, + "step": 72912 + }, + { + "epoch": 0.9114477861946548, + "grad_norm": 4.407576560974121, + "learning_rate": 4.739420901387948e-07, + "loss": 1.5088, + "step": 72914 + }, + { + "epoch": 0.9114727868196705, + "grad_norm": 1.2966846227645874, + "learning_rate": 4.7367664790944456e-07, + "loss": 0.5756, + "step": 72916 + }, + { + "epoch": 0.9114977874446861, + "grad_norm": 5.082022666931152, + "learning_rate": 4.734112782312239e-07, + "loss": 1.4461, + "step": 72918 + }, + { + "epoch": 0.9115227880697018, + "grad_norm": 3.2549586296081543, + "learning_rate": 4.7314598110615674e-07, + "loss": 0.7503, + "step": 72920 + }, + { + "epoch": 0.9115477886947174, + "grad_norm": 0.0012004905147477984, + "learning_rate": 4.728807565362625e-07, + "loss": 0.4002, + "step": 72922 + }, + { + "epoch": 0.911572789319733, + "grad_norm": 5.60755729675293, + "learning_rate": 4.726156045235608e-07, + "loss": 0.722, + "step": 72924 + }, + { + "epoch": 0.9115977899447486, + "grad_norm": 0.3710702061653137, + "learning_rate": 4.723505250700722e-07, + "loss": 0.9917, + "step": 72926 + }, + { + "epoch": 0.9116227905697643, + "grad_norm": 2.9201955795288086, + "learning_rate": 4.720855181778128e-07, + "loss": 0.7529, + "step": 72928 + }, + { + "epoch": 0.9116477911947799, + "grad_norm": 0.0015273279277607799, + "learning_rate": 4.7182058384880435e-07, + "loss": 0.0432, + "step": 72930 + }, + { + "epoch": 0.9116727918197955, + "grad_norm": 4.179591178894043, + "learning_rate": 4.715557220850631e-07, + "loss": 1.6753, + "step": 72932 + }, + { + "epoch": 0.9116977924448111, + "grad_norm": 5.232846736907959, + "learning_rate": 4.7129093288860503e-07, + "loss": 1.2224, + "step": 72934 + }, + { + "epoch": 0.9117227930698267, + "grad_norm": 2.362082004547119, + "learning_rate": 4.710262162614476e-07, + "loss": 0.5474, + "step": 72936 + }, + { + "epoch": 0.9117477936948424, + "grad_norm": 1.9851093292236328, + "learning_rate": 4.707615722056069e-07, + "loss": 0.5581, + "step": 72938 + }, + { + "epoch": 0.911772794319858, + "grad_norm": 2.8503377437591553, + "learning_rate": 4.7049700072310023e-07, + "loss": 0.6872, + "step": 72940 + }, + { + "epoch": 0.9117977949448737, + "grad_norm": 0.00030660824268125, + "learning_rate": 4.7023250181593817e-07, + "loss": 0.0, + "step": 72942 + }, + { + "epoch": 0.9118227955698892, + "grad_norm": 3.3585011959075928, + "learning_rate": 4.699680754861402e-07, + "loss": 0.4594, + "step": 72944 + }, + { + "epoch": 0.9118477961949049, + "grad_norm": 1.252841830253601, + "learning_rate": 4.6970372173571696e-07, + "loss": 0.7903, + "step": 72946 + }, + { + "epoch": 0.9118727968199205, + "grad_norm": 5.126715660095215, + "learning_rate": 4.6943944056668244e-07, + "loss": 1.1278, + "step": 72948 + }, + { + "epoch": 0.9118977974449362, + "grad_norm": 4.359589099884033, + "learning_rate": 4.691752319810494e-07, + "loss": 0.8844, + "step": 72950 + }, + { + "epoch": 0.9119227980699518, + "grad_norm": 3.2030229568481445, + "learning_rate": 4.689110959808285e-07, + "loss": 0.6923, + "step": 72952 + }, + { + "epoch": 0.9119477986949673, + "grad_norm": 0.00023006605624686927, + "learning_rate": 4.686470325680348e-07, + "loss": 0.7515, + "step": 72954 + }, + { + "epoch": 0.911972799319983, + "grad_norm": 4.616121292114258, + "learning_rate": 4.683830417446766e-07, + "loss": 1.402, + "step": 72956 + }, + { + "epoch": 0.9119977999449986, + "grad_norm": 0.0003912129031959921, + "learning_rate": 4.681191235127658e-07, + "loss": 0.0075, + "step": 72958 + }, + { + "epoch": 0.9120228005700143, + "grad_norm": 1.064903736114502, + "learning_rate": 4.678552778743117e-07, + "loss": 0.4762, + "step": 72960 + }, + { + "epoch": 0.9120478011950299, + "grad_norm": 4.825992584228516, + "learning_rate": 4.675915048313229e-07, + "loss": 0.8115, + "step": 72962 + }, + { + "epoch": 0.9120728018200455, + "grad_norm": 2.414700984954834, + "learning_rate": 4.673278043858109e-07, + "loss": 0.2721, + "step": 72964 + }, + { + "epoch": 0.9120978024450611, + "grad_norm": 0.01826595887541771, + "learning_rate": 4.6706417653978097e-07, + "loss": 0.0002, + "step": 72966 + }, + { + "epoch": 0.9121228030700768, + "grad_norm": 6.552701950073242, + "learning_rate": 4.668006212952425e-07, + "loss": 1.5915, + "step": 72968 + }, + { + "epoch": 0.9121478036950924, + "grad_norm": 11.204023361206055, + "learning_rate": 4.665371386542017e-07, + "loss": 4.0525, + "step": 72970 + }, + { + "epoch": 0.912172804320108, + "grad_norm": 1.9294161796569824, + "learning_rate": 4.66273728618668e-07, + "loss": 0.6427, + "step": 72972 + }, + { + "epoch": 0.9121978049451236, + "grad_norm": 5.91573429107666, + "learning_rate": 4.6601039119064437e-07, + "loss": 0.4852, + "step": 72974 + }, + { + "epoch": 0.9122228055701392, + "grad_norm": 1.7602553367614746, + "learning_rate": 4.6574712637213583e-07, + "loss": 0.8378, + "step": 72976 + }, + { + "epoch": 0.9122478061951549, + "grad_norm": 6.675272464752197, + "learning_rate": 4.654839341651507e-07, + "loss": 0.4758, + "step": 72978 + }, + { + "epoch": 0.9122728068201705, + "grad_norm": 4.15360689163208, + "learning_rate": 4.652208145716908e-07, + "loss": 1.0799, + "step": 72980 + }, + { + "epoch": 0.9122978074451862, + "grad_norm": 0.16598264873027802, + "learning_rate": 4.649577675937611e-07, + "loss": 0.0248, + "step": 72982 + }, + { + "epoch": 0.9123228080702017, + "grad_norm": 2.3344357013702393, + "learning_rate": 4.6469479323336566e-07, + "loss": 0.4325, + "step": 72984 + }, + { + "epoch": 0.9123478086952174, + "grad_norm": 1.8908590078353882, + "learning_rate": 4.644318914925039e-07, + "loss": 0.7164, + "step": 72986 + }, + { + "epoch": 0.912372809320233, + "grad_norm": 6.715976715087891, + "learning_rate": 4.641690623731832e-07, + "loss": 1.5616, + "step": 72988 + }, + { + "epoch": 0.9123978099452487, + "grad_norm": 6.536635875701904, + "learning_rate": 4.639063058773996e-07, + "loss": 0.85, + "step": 72990 + }, + { + "epoch": 0.9124228105702643, + "grad_norm": 3.0765185356140137, + "learning_rate": 4.636436220071594e-07, + "loss": 1.0795, + "step": 72992 + }, + { + "epoch": 0.9124478111952798, + "grad_norm": 3.1523470878601074, + "learning_rate": 4.6338101076445874e-07, + "loss": 1.1815, + "step": 72994 + }, + { + "epoch": 0.9124728118202955, + "grad_norm": 4.112295627593994, + "learning_rate": 4.631184721513016e-07, + "loss": 0.6908, + "step": 72996 + }, + { + "epoch": 0.9124978124453111, + "grad_norm": 3.2404046058654785, + "learning_rate": 4.628560061696852e-07, + "loss": 1.137, + "step": 72998 + }, + { + "epoch": 0.9125228130703268, + "grad_norm": 0.7987357378005981, + "learning_rate": 4.6259361282160797e-07, + "loss": 1.0197, + "step": 73000 + }, + { + "epoch": 0.9125478136953424, + "grad_norm": 3.141798734664917, + "learning_rate": 4.623312921090695e-07, + "loss": 0.4453, + "step": 73002 + }, + { + "epoch": 0.912572814320358, + "grad_norm": 12.805309295654297, + "learning_rate": 4.6206904403406696e-07, + "loss": 2.6525, + "step": 73004 + }, + { + "epoch": 0.9125978149453736, + "grad_norm": 0.5162785053253174, + "learning_rate": 4.618068685985977e-07, + "loss": 0.4019, + "step": 73006 + }, + { + "epoch": 0.9126228155703893, + "grad_norm": 1.6740326881408691, + "learning_rate": 4.615447658046579e-07, + "loss": 1.4935, + "step": 73008 + }, + { + "epoch": 0.9126478161954049, + "grad_norm": 2.540477991104126, + "learning_rate": 4.6128273565424595e-07, + "loss": 0.8298, + "step": 73010 + }, + { + "epoch": 0.9126728168204206, + "grad_norm": 3.099397659301758, + "learning_rate": 4.610207781493559e-07, + "loss": 0.793, + "step": 73012 + }, + { + "epoch": 0.9126978174454361, + "grad_norm": 3.119476556777954, + "learning_rate": 4.607588932919804e-07, + "loss": 1.4193, + "step": 73014 + }, + { + "epoch": 0.9127228180704517, + "grad_norm": 3.837506055831909, + "learning_rate": 4.604970810841192e-07, + "loss": 1.8733, + "step": 73016 + }, + { + "epoch": 0.9127478186954674, + "grad_norm": 2.9188003540039062, + "learning_rate": 4.602353415277605e-07, + "loss": 0.3332, + "step": 73018 + }, + { + "epoch": 0.912772819320483, + "grad_norm": 3.0328614711761475, + "learning_rate": 4.5997367462490173e-07, + "loss": 1.2104, + "step": 73020 + }, + { + "epoch": 0.9127978199454987, + "grad_norm": 3.4414520263671875, + "learning_rate": 4.5971208037753344e-07, + "loss": 1.3958, + "step": 73022 + }, + { + "epoch": 0.9128228205705142, + "grad_norm": 4.300813674926758, + "learning_rate": 4.5945055878764965e-07, + "loss": 1.6676, + "step": 73024 + }, + { + "epoch": 0.9128478211955299, + "grad_norm": 2.7909083366394043, + "learning_rate": 4.591891098572421e-07, + "loss": 0.9091, + "step": 73026 + }, + { + "epoch": 0.9128728218205455, + "grad_norm": 3.2582530975341797, + "learning_rate": 4.5892773358829914e-07, + "loss": 0.552, + "step": 73028 + }, + { + "epoch": 0.9128978224455612, + "grad_norm": 4.302642345428467, + "learning_rate": 4.5866642998281474e-07, + "loss": 1.6266, + "step": 73030 + }, + { + "epoch": 0.9129228230705768, + "grad_norm": 2.6661291122436523, + "learning_rate": 4.5840519904277626e-07, + "loss": 1.2429, + "step": 73032 + }, + { + "epoch": 0.9129478236955924, + "grad_norm": 2.195774793624878, + "learning_rate": 4.581440407701765e-07, + "loss": 0.7315, + "step": 73034 + }, + { + "epoch": 0.912972824320608, + "grad_norm": 3.480600357055664, + "learning_rate": 4.5788295516700164e-07, + "loss": 0.725, + "step": 73036 + }, + { + "epoch": 0.9129978249456236, + "grad_norm": 1.0114860534667969, + "learning_rate": 4.5762194223523904e-07, + "loss": 0.0456, + "step": 73038 + }, + { + "epoch": 0.9130228255706393, + "grad_norm": 2.9829261302948, + "learning_rate": 4.573610019768804e-07, + "loss": 0.6801, + "step": 73040 + }, + { + "epoch": 0.9130478261956549, + "grad_norm": 3.778836250305176, + "learning_rate": 4.5710013439390965e-07, + "loss": 1.0573, + "step": 73042 + }, + { + "epoch": 0.9130728268206705, + "grad_norm": 8.43163013458252, + "learning_rate": 4.568393394883164e-07, + "loss": 1.7528, + "step": 73044 + }, + { + "epoch": 0.9130978274456861, + "grad_norm": 2.638612747192383, + "learning_rate": 4.565786172620823e-07, + "loss": 1.1965, + "step": 73046 + }, + { + "epoch": 0.9131228280707018, + "grad_norm": 0.026825718581676483, + "learning_rate": 4.5631796771719914e-07, + "loss": 0.0003, + "step": 73048 + }, + { + "epoch": 0.9131478286957174, + "grad_norm": 6.0156474113464355, + "learning_rate": 4.560573908556476e-07, + "loss": 1.2162, + "step": 73050 + }, + { + "epoch": 0.9131728293207331, + "grad_norm": 3.823782444000244, + "learning_rate": 4.5579688667941267e-07, + "loss": 0.741, + "step": 73052 + }, + { + "epoch": 0.9131978299457486, + "grad_norm": 5.8662614822387695, + "learning_rate": 4.555364551904795e-07, + "loss": 0.3668, + "step": 73054 + }, + { + "epoch": 0.9132228305707643, + "grad_norm": 4.129038333892822, + "learning_rate": 4.552760963908298e-07, + "loss": 0.6056, + "step": 73056 + }, + { + "epoch": 0.9132478311957799, + "grad_norm": 3.9864561557769775, + "learning_rate": 4.550158102824498e-07, + "loss": 1.3824, + "step": 73058 + }, + { + "epoch": 0.9132728318207955, + "grad_norm": 1.9784202575683594, + "learning_rate": 4.5475559686731784e-07, + "loss": 0.3051, + "step": 73060 + }, + { + "epoch": 0.9132978324458112, + "grad_norm": 4.245480537414551, + "learning_rate": 4.5449545614741797e-07, + "loss": 1.3973, + "step": 73062 + }, + { + "epoch": 0.9133228330708267, + "grad_norm": 3.9850456714630127, + "learning_rate": 4.5423538812473187e-07, + "loss": 1.1169, + "step": 73064 + }, + { + "epoch": 0.9133478336958424, + "grad_norm": 4.118398189544678, + "learning_rate": 4.539753928012369e-07, + "loss": 1.7443, + "step": 73066 + }, + { + "epoch": 0.913372834320858, + "grad_norm": 4.742023468017578, + "learning_rate": 4.53715470178917e-07, + "loss": 1.9164, + "step": 73068 + }, + { + "epoch": 0.9133978349458737, + "grad_norm": 4.582141876220703, + "learning_rate": 4.534556202597495e-07, + "loss": 1.9499, + "step": 73070 + }, + { + "epoch": 0.9134228355708893, + "grad_norm": 6.181379795074463, + "learning_rate": 4.5319584304571507e-07, + "loss": 1.4772, + "step": 73072 + }, + { + "epoch": 0.9134478361959049, + "grad_norm": 3.23288893699646, + "learning_rate": 4.529361385387898e-07, + "loss": 1.4721, + "step": 73074 + }, + { + "epoch": 0.9134728368209205, + "grad_norm": 4.01885986328125, + "learning_rate": 4.526765067409533e-07, + "loss": 0.7345, + "step": 73076 + }, + { + "epoch": 0.9134978374459362, + "grad_norm": 0.4899844527244568, + "learning_rate": 4.524169476541829e-07, + "loss": 0.6601, + "step": 73078 + }, + { + "epoch": 0.9135228380709518, + "grad_norm": 5.605592727661133, + "learning_rate": 4.5215746128045356e-07, + "loss": 0.9879, + "step": 73080 + }, + { + "epoch": 0.9135478386959675, + "grad_norm": 3.877488136291504, + "learning_rate": 4.518980476217449e-07, + "loss": 1.0869, + "step": 73082 + }, + { + "epoch": 0.913572839320983, + "grad_norm": 2.0465028285980225, + "learning_rate": 4.5163870668002876e-07, + "loss": 1.5124, + "step": 73084 + }, + { + "epoch": 0.9135978399459986, + "grad_norm": 2.7154788970947266, + "learning_rate": 4.5137943845728226e-07, + "loss": 1.0464, + "step": 73086 + }, + { + "epoch": 0.9136228405710143, + "grad_norm": 4.822416305541992, + "learning_rate": 4.5112024295548065e-07, + "loss": 1.2183, + "step": 73088 + }, + { + "epoch": 0.9136478411960299, + "grad_norm": 2.8555877208709717, + "learning_rate": 4.5086112017659556e-07, + "loss": 0.5829, + "step": 73090 + }, + { + "epoch": 0.9136728418210456, + "grad_norm": 3.349771738052368, + "learning_rate": 4.5060207012260327e-07, + "loss": 1.3775, + "step": 73092 + }, + { + "epoch": 0.9136978424460611, + "grad_norm": 0.6440176963806152, + "learning_rate": 4.503430927954733e-07, + "loss": 0.8071, + "step": 73094 + }, + { + "epoch": 0.9137228430710768, + "grad_norm": 0.7987896800041199, + "learning_rate": 4.500841881971818e-07, + "loss": 0.0294, + "step": 73096 + }, + { + "epoch": 0.9137478436960924, + "grad_norm": 1.9999535083770752, + "learning_rate": 4.4982535632969613e-07, + "loss": 0.3833, + "step": 73098 + }, + { + "epoch": 0.9137728443211081, + "grad_norm": 3.0060677528381348, + "learning_rate": 4.495665971949925e-07, + "loss": 0.3803, + "step": 73100 + }, + { + "epoch": 0.9137978449461237, + "grad_norm": 0.00030044536106288433, + "learning_rate": 4.4930791079503935e-07, + "loss": 0.1742, + "step": 73102 + }, + { + "epoch": 0.9138228455711392, + "grad_norm": 1.4110921621322632, + "learning_rate": 4.490492971318039e-07, + "loss": 0.4737, + "step": 73104 + }, + { + "epoch": 0.9138478461961549, + "grad_norm": 2.1788532733917236, + "learning_rate": 4.487907562072613e-07, + "loss": 0.672, + "step": 73106 + }, + { + "epoch": 0.9138728468211705, + "grad_norm": 1.9353249073028564, + "learning_rate": 4.4853228802337556e-07, + "loss": 1.3717, + "step": 73108 + }, + { + "epoch": 0.9138978474461862, + "grad_norm": 2.0242574214935303, + "learning_rate": 4.482738925821184e-07, + "loss": 0.1931, + "step": 73110 + }, + { + "epoch": 0.9139228480712018, + "grad_norm": 0.5727595090866089, + "learning_rate": 4.480155698854549e-07, + "loss": 0.0177, + "step": 73112 + }, + { + "epoch": 0.9139478486962174, + "grad_norm": 1.58999764919281, + "learning_rate": 4.4775731993535466e-07, + "loss": 0.305, + "step": 73114 + }, + { + "epoch": 0.913972849321233, + "grad_norm": 2.8766801357269287, + "learning_rate": 4.4749914273378603e-07, + "loss": 0.9484, + "step": 73116 + }, + { + "epoch": 0.9139978499462487, + "grad_norm": 8.661799430847168, + "learning_rate": 4.4724103828271083e-07, + "loss": 0.4485, + "step": 73118 + }, + { + "epoch": 0.9140228505712643, + "grad_norm": 0.8104547262191772, + "learning_rate": 4.4698300658409856e-07, + "loss": 0.5332, + "step": 73120 + }, + { + "epoch": 0.91404785119628, + "grad_norm": 3.084794282913208, + "learning_rate": 4.4672504763991096e-07, + "loss": 0.6084, + "step": 73122 + }, + { + "epoch": 0.9140728518212955, + "grad_norm": 2.6623024940490723, + "learning_rate": 4.4646716145211657e-07, + "loss": 1.3799, + "step": 73124 + }, + { + "epoch": 0.9140978524463111, + "grad_norm": 3.6138827800750732, + "learning_rate": 4.462093480226748e-07, + "loss": 0.713, + "step": 73126 + }, + { + "epoch": 0.9141228530713268, + "grad_norm": 3.6542725563049316, + "learning_rate": 4.4595160735355303e-07, + "loss": 1.1738, + "step": 73128 + }, + { + "epoch": 0.9141478536963424, + "grad_norm": 5.083822250366211, + "learning_rate": 4.456939394467141e-07, + "loss": 2.337, + "step": 73130 + }, + { + "epoch": 0.9141728543213581, + "grad_norm": 2.7631707191467285, + "learning_rate": 4.454363443041165e-07, + "loss": 1.3453, + "step": 73132 + }, + { + "epoch": 0.9141978549463736, + "grad_norm": 1.816892147064209, + "learning_rate": 4.451788219277264e-07, + "loss": 1.0008, + "step": 73134 + }, + { + "epoch": 0.9142228555713893, + "grad_norm": 4.964273929595947, + "learning_rate": 4.449213723195012e-07, + "loss": 1.4622, + "step": 73136 + }, + { + "epoch": 0.9142478561964049, + "grad_norm": 12.784173011779785, + "learning_rate": 4.446639954814047e-07, + "loss": 2.2534, + "step": 73138 + }, + { + "epoch": 0.9142728568214206, + "grad_norm": 3.47855281829834, + "learning_rate": 4.444066914153966e-07, + "loss": 0.4987, + "step": 73140 + }, + { + "epoch": 0.9142978574464362, + "grad_norm": 0.00031587883131578565, + "learning_rate": 4.4414946012343527e-07, + "loss": 0.0909, + "step": 73142 + }, + { + "epoch": 0.9143228580714517, + "grad_norm": 3.6398961544036865, + "learning_rate": 4.438923016074814e-07, + "loss": 1.6122, + "step": 73144 + }, + { + "epoch": 0.9143478586964674, + "grad_norm": 2.803847551345825, + "learning_rate": 4.436352158694901e-07, + "loss": 0.8898, + "step": 73146 + }, + { + "epoch": 0.914372859321483, + "grad_norm": 3.6494386196136475, + "learning_rate": 4.433782029114242e-07, + "loss": 0.7187, + "step": 73148 + }, + { + "epoch": 0.9143978599464987, + "grad_norm": 3.5720036029815674, + "learning_rate": 4.431212627352366e-07, + "loss": 1.3954, + "step": 73150 + }, + { + "epoch": 0.9144228605715143, + "grad_norm": 5.338090896606445, + "learning_rate": 4.428643953428857e-07, + "loss": 1.8845, + "step": 73152 + }, + { + "epoch": 0.9144478611965299, + "grad_norm": 2.535158395767212, + "learning_rate": 4.4260760073633113e-07, + "loss": 0.9951, + "step": 73154 + }, + { + "epoch": 0.9144728618215455, + "grad_norm": 3.91636061668396, + "learning_rate": 4.423508789175224e-07, + "loss": 0.4819, + "step": 73156 + }, + { + "epoch": 0.9144978624465612, + "grad_norm": 3.4140050411224365, + "learning_rate": 4.42094229888419e-07, + "loss": 0.0679, + "step": 73158 + }, + { + "epoch": 0.9145228630715768, + "grad_norm": 0.0004654518561437726, + "learning_rate": 4.418376536509739e-07, + "loss": 0.533, + "step": 73160 + }, + { + "epoch": 0.9145478636965925, + "grad_norm": 3.3285553455352783, + "learning_rate": 4.41581150207141e-07, + "loss": 0.5846, + "step": 73162 + }, + { + "epoch": 0.914572864321608, + "grad_norm": 4.273853302001953, + "learning_rate": 4.4132471955887546e-07, + "loss": 1.1827, + "step": 73164 + }, + { + "epoch": 0.9145978649466237, + "grad_norm": 7.0623955726623535, + "learning_rate": 4.4106836170812796e-07, + "loss": 0.456, + "step": 73166 + }, + { + "epoch": 0.9146228655716393, + "grad_norm": 2.608182191848755, + "learning_rate": 4.408120766568547e-07, + "loss": 0.7405, + "step": 73168 + }, + { + "epoch": 0.914647866196655, + "grad_norm": 2.171731948852539, + "learning_rate": 4.4055586440700293e-07, + "loss": 0.3587, + "step": 73170 + }, + { + "epoch": 0.9146728668216706, + "grad_norm": 4.366030216217041, + "learning_rate": 4.4029972496052673e-07, + "loss": 3.056, + "step": 73172 + }, + { + "epoch": 0.9146978674466861, + "grad_norm": 3.688347816467285, + "learning_rate": 4.4004365831937456e-07, + "loss": 0.8704, + "step": 73174 + }, + { + "epoch": 0.9147228680717018, + "grad_norm": 5.942173004150391, + "learning_rate": 4.3978766448549816e-07, + "loss": 1.2212, + "step": 73176 + }, + { + "epoch": 0.9147478686967174, + "grad_norm": 2.5748329162597656, + "learning_rate": 4.3953174346084706e-07, + "loss": 1.4472, + "step": 73178 + }, + { + "epoch": 0.9147728693217331, + "grad_norm": 2.6092841625213623, + "learning_rate": 4.3927589524737193e-07, + "loss": 1.1289, + "step": 73180 + }, + { + "epoch": 0.9147978699467487, + "grad_norm": 4.350643634796143, + "learning_rate": 4.39020119847019e-07, + "loss": 0.7018, + "step": 73182 + }, + { + "epoch": 0.9148228705717643, + "grad_norm": 0.24904149770736694, + "learning_rate": 4.387644172617356e-07, + "loss": 0.6248, + "step": 73184 + }, + { + "epoch": 0.9148478711967799, + "grad_norm": 0.23890924453735352, + "learning_rate": 4.3850878749347125e-07, + "loss": 0.2397, + "step": 73186 + }, + { + "epoch": 0.9148728718217956, + "grad_norm": 0.3738502264022827, + "learning_rate": 4.3825323054417003e-07, + "loss": 0.4467, + "step": 73188 + }, + { + "epoch": 0.9148978724468112, + "grad_norm": 3.6777615547180176, + "learning_rate": 4.3799774641578143e-07, + "loss": 0.538, + "step": 73190 + }, + { + "epoch": 0.9149228730718268, + "grad_norm": 0.00039177254075184464, + "learning_rate": 4.3774233511025055e-07, + "loss": 0.4242, + "step": 73192 + }, + { + "epoch": 0.9149478736968424, + "grad_norm": 3.399547815322876, + "learning_rate": 4.3748699662952144e-07, + "loss": 0.394, + "step": 73194 + }, + { + "epoch": 0.914972874321858, + "grad_norm": 3.4794223308563232, + "learning_rate": 4.3723173097553916e-07, + "loss": 1.0412, + "step": 73196 + }, + { + "epoch": 0.9149978749468737, + "grad_norm": 2.5289485454559326, + "learning_rate": 4.369765381502467e-07, + "loss": 1.1785, + "step": 73198 + }, + { + "epoch": 0.9150228755718893, + "grad_norm": 0.8426396250724792, + "learning_rate": 4.36721418155589e-07, + "loss": 0.0794, + "step": 73200 + }, + { + "epoch": 0.915047876196905, + "grad_norm": 5.525049686431885, + "learning_rate": 4.364663709935091e-07, + "loss": 1.2238, + "step": 73202 + }, + { + "epoch": 0.9150728768219205, + "grad_norm": 0.0005324810626916587, + "learning_rate": 4.3621139666594757e-07, + "loss": 0.0056, + "step": 73204 + }, + { + "epoch": 0.9150978774469362, + "grad_norm": 2.882169008255005, + "learning_rate": 4.359564951748507e-07, + "loss": 0.7666, + "step": 73206 + }, + { + "epoch": 0.9151228780719518, + "grad_norm": 0.9109042286872864, + "learning_rate": 4.3570166652215364e-07, + "loss": 0.4965, + "step": 73208 + }, + { + "epoch": 0.9151478786969675, + "grad_norm": 2.0860249996185303, + "learning_rate": 4.354469107098025e-07, + "loss": 0.1982, + "step": 73210 + }, + { + "epoch": 0.9151728793219831, + "grad_norm": 5.786198139190674, + "learning_rate": 4.3519222773973245e-07, + "loss": 1.6525, + "step": 73212 + }, + { + "epoch": 0.9151978799469986, + "grad_norm": 0.0005136919789947569, + "learning_rate": 4.3493761761388643e-07, + "loss": 0.7239, + "step": 73214 + }, + { + "epoch": 0.9152228805720143, + "grad_norm": 3.3826208114624023, + "learning_rate": 4.3468308033420503e-07, + "loss": 0.2248, + "step": 73216 + }, + { + "epoch": 0.9152478811970299, + "grad_norm": 0.3564557731151581, + "learning_rate": 4.344286159026223e-07, + "loss": 0.767, + "step": 73218 + }, + { + "epoch": 0.9152728818220456, + "grad_norm": 1.618245005607605, + "learning_rate": 4.341742243210811e-07, + "loss": 0.082, + "step": 73220 + }, + { + "epoch": 0.9152978824470612, + "grad_norm": 0.00019202266412321478, + "learning_rate": 4.3391990559151443e-07, + "loss": 0.0, + "step": 73222 + }, + { + "epoch": 0.9153228830720768, + "grad_norm": 4.871653079986572, + "learning_rate": 4.336656597158606e-07, + "loss": 0.8964, + "step": 73224 + }, + { + "epoch": 0.9153478836970924, + "grad_norm": 2.6598966121673584, + "learning_rate": 4.3341148669605706e-07, + "loss": 0.8686, + "step": 73226 + }, + { + "epoch": 0.9153728843221081, + "grad_norm": 1.794313907623291, + "learning_rate": 4.331573865340377e-07, + "loss": 0.8804, + "step": 73228 + }, + { + "epoch": 0.9153978849471237, + "grad_norm": 0.00039768038550391793, + "learning_rate": 4.329033592317411e-07, + "loss": 0.0978, + "step": 73230 + }, + { + "epoch": 0.9154228855721394, + "grad_norm": 2.068018913269043, + "learning_rate": 4.32649404791099e-07, + "loss": 0.4022, + "step": 73232 + }, + { + "epoch": 0.9154478861971549, + "grad_norm": 3.507948875427246, + "learning_rate": 4.323955232140453e-07, + "loss": 0.616, + "step": 73234 + }, + { + "epoch": 0.9154728868221705, + "grad_norm": 5.203859806060791, + "learning_rate": 4.321417145025142e-07, + "loss": 0.9338, + "step": 73236 + }, + { + "epoch": 0.9154978874471862, + "grad_norm": 2.9163856506347656, + "learning_rate": 4.318879786584385e-07, + "loss": 1.5139, + "step": 73238 + }, + { + "epoch": 0.9155228880722018, + "grad_norm": 15.696112632751465, + "learning_rate": 4.3163431568375103e-07, + "loss": 0.6184, + "step": 73240 + }, + { + "epoch": 0.9155478886972175, + "grad_norm": 4.240445137023926, + "learning_rate": 4.3138072558038367e-07, + "loss": 1.5209, + "step": 73242 + }, + { + "epoch": 0.915572889322233, + "grad_norm": 4.462273597717285, + "learning_rate": 4.3112720835026825e-07, + "loss": 0.8893, + "step": 73244 + }, + { + "epoch": 0.9155978899472487, + "grad_norm": 4.763452529907227, + "learning_rate": 4.308737639953353e-07, + "loss": 1.287, + "step": 73246 + }, + { + "epoch": 0.9156228905722643, + "grad_norm": 3.2075138092041016, + "learning_rate": 4.3062039251751343e-07, + "loss": 2.4592, + "step": 73248 + }, + { + "epoch": 0.91564789119728, + "grad_norm": 2.8446743488311768, + "learning_rate": 4.303670939187332e-07, + "loss": 1.2725, + "step": 73250 + }, + { + "epoch": 0.9156728918222956, + "grad_norm": 3.765699625015259, + "learning_rate": 4.3011386820092424e-07, + "loss": 1.5471, + "step": 73252 + }, + { + "epoch": 0.9156978924473111, + "grad_norm": 4.5305094718933105, + "learning_rate": 4.29860715366015e-07, + "loss": 1.1677, + "step": 73254 + }, + { + "epoch": 0.9157228930723268, + "grad_norm": 2.071941614151001, + "learning_rate": 4.2960763541593287e-07, + "loss": 0.1075, + "step": 73256 + }, + { + "epoch": 0.9157478936973424, + "grad_norm": 0.0006729047745466232, + "learning_rate": 4.2935462835260734e-07, + "loss": 0.4064, + "step": 73258 + }, + { + "epoch": 0.9157728943223581, + "grad_norm": 5.920596599578857, + "learning_rate": 4.291016941779613e-07, + "loss": 1.8073, + "step": 73260 + }, + { + "epoch": 0.9157978949473737, + "grad_norm": 3.0447332859039307, + "learning_rate": 4.288488328939233e-07, + "loss": 0.5134, + "step": 73262 + }, + { + "epoch": 0.9158228955723893, + "grad_norm": 1.5542831420898438, + "learning_rate": 4.2859604450242067e-07, + "loss": 1.0608, + "step": 73264 + }, + { + "epoch": 0.9158478961974049, + "grad_norm": 1.3442559242248535, + "learning_rate": 4.283433290053751e-07, + "loss": 0.0455, + "step": 73266 + }, + { + "epoch": 0.9158728968224206, + "grad_norm": 0.0004549501754809171, + "learning_rate": 4.2809068640471517e-07, + "loss": 0.7254, + "step": 73268 + }, + { + "epoch": 0.9158978974474362, + "grad_norm": 4.265873432159424, + "learning_rate": 4.278381167023615e-07, + "loss": 1.6608, + "step": 73270 + }, + { + "epoch": 0.9159228980724519, + "grad_norm": 3.406344175338745, + "learning_rate": 4.275856199002415e-07, + "loss": 1.0664, + "step": 73272 + }, + { + "epoch": 0.9159478986974674, + "grad_norm": 1.9650278091430664, + "learning_rate": 4.2733319600027354e-07, + "loss": 1.1281, + "step": 73274 + }, + { + "epoch": 0.915972899322483, + "grad_norm": 5.826096057891846, + "learning_rate": 4.2708084500438173e-07, + "loss": 1.3955, + "step": 73276 + }, + { + "epoch": 0.9159978999474987, + "grad_norm": 7.857484340667725, + "learning_rate": 4.2682856691449e-07, + "loss": 1.2268, + "step": 73278 + }, + { + "epoch": 0.9160229005725143, + "grad_norm": 4.371170520782471, + "learning_rate": 4.2657636173251693e-07, + "loss": 1.2898, + "step": 73280 + }, + { + "epoch": 0.91604790119753, + "grad_norm": 0.15026400983333588, + "learning_rate": 4.2632422946038534e-07, + "loss": 0.3913, + "step": 73282 + }, + { + "epoch": 0.9160729018225455, + "grad_norm": 3.694025993347168, + "learning_rate": 4.260721701000148e-07, + "loss": 1.2742, + "step": 73284 + }, + { + "epoch": 0.9160979024475612, + "grad_norm": 3.788973808288574, + "learning_rate": 4.258201836533238e-07, + "loss": 2.0807, + "step": 73286 + }, + { + "epoch": 0.9161229030725768, + "grad_norm": 1.0968172550201416, + "learning_rate": 4.255682701222341e-07, + "loss": 0.7521, + "step": 73288 + }, + { + "epoch": 0.9161479036975925, + "grad_norm": 5.0309672355651855, + "learning_rate": 4.253164295086598e-07, + "loss": 1.3633, + "step": 73290 + }, + { + "epoch": 0.9161729043226081, + "grad_norm": 9.408803939819336, + "learning_rate": 4.2506466181452376e-07, + "loss": 1.0232, + "step": 73292 + }, + { + "epoch": 0.9161979049476237, + "grad_norm": 4.204697608947754, + "learning_rate": 4.2481296704174e-07, + "loss": 1.4377, + "step": 73294 + }, + { + "epoch": 0.9162229055726393, + "grad_norm": 1.2444536685943604, + "learning_rate": 4.2456134519222814e-07, + "loss": 0.6639, + "step": 73296 + }, + { + "epoch": 0.916247906197655, + "grad_norm": 4.163136959075928, + "learning_rate": 4.2430979626790214e-07, + "loss": 0.2091, + "step": 73298 + }, + { + "epoch": 0.9162729068226706, + "grad_norm": 0.30645763874053955, + "learning_rate": 4.2405832027067827e-07, + "loss": 0.6735, + "step": 73300 + }, + { + "epoch": 0.9162979074476862, + "grad_norm": 3.4210472106933594, + "learning_rate": 4.238069172024728e-07, + "loss": 1.565, + "step": 73302 + }, + { + "epoch": 0.9163229080727018, + "grad_norm": 3.8275864124298096, + "learning_rate": 4.235555870651986e-07, + "loss": 0.662, + "step": 73304 + }, + { + "epoch": 0.9163479086977174, + "grad_norm": 2.617924928665161, + "learning_rate": 4.2330432986077195e-07, + "loss": 1.1399, + "step": 73306 + }, + { + "epoch": 0.9163729093227331, + "grad_norm": 3.6706655025482178, + "learning_rate": 4.230531455911036e-07, + "loss": 0.6162, + "step": 73308 + }, + { + "epoch": 0.9163979099477487, + "grad_norm": 2.97556209564209, + "learning_rate": 4.228020342581096e-07, + "loss": 0.1441, + "step": 73310 + }, + { + "epoch": 0.9164229105727644, + "grad_norm": 3.3427603244781494, + "learning_rate": 4.2255099586370196e-07, + "loss": 1.9368, + "step": 73312 + }, + { + "epoch": 0.9164479111977799, + "grad_norm": 4.307849884033203, + "learning_rate": 4.2230003040978905e-07, + "loss": 0.1917, + "step": 73314 + }, + { + "epoch": 0.9164729118227956, + "grad_norm": 2.548607110977173, + "learning_rate": 4.2204913789828605e-07, + "loss": 0.3598, + "step": 73316 + }, + { + "epoch": 0.9164979124478112, + "grad_norm": 5.9787797927856445, + "learning_rate": 4.217983183311014e-07, + "loss": 1.1357, + "step": 73318 + }, + { + "epoch": 0.9165229130728269, + "grad_norm": 4.512179374694824, + "learning_rate": 4.21547571710148e-07, + "loss": 1.3115, + "step": 73320 + }, + { + "epoch": 0.9165479136978425, + "grad_norm": 0.6195053458213806, + "learning_rate": 4.212968980373322e-07, + "loss": 0.1526, + "step": 73322 + }, + { + "epoch": 0.916572914322858, + "grad_norm": 4.238134860992432, + "learning_rate": 4.210462973145657e-07, + "loss": 0.4475, + "step": 73324 + }, + { + "epoch": 0.9165979149478737, + "grad_norm": 2.9986281394958496, + "learning_rate": 4.20795769543757e-07, + "loss": 1.3265, + "step": 73326 + }, + { + "epoch": 0.9166229155728893, + "grad_norm": 0.5934002995491028, + "learning_rate": 4.2054531472681016e-07, + "loss": 0.0213, + "step": 73328 + }, + { + "epoch": 0.916647916197905, + "grad_norm": 3.5787627696990967, + "learning_rate": 4.2029493286563804e-07, + "loss": 1.0138, + "step": 73330 + }, + { + "epoch": 0.9166729168229206, + "grad_norm": 3.1626689434051514, + "learning_rate": 4.2004462396214365e-07, + "loss": 0.7652, + "step": 73332 + }, + { + "epoch": 0.9166979174479362, + "grad_norm": 3.5076820850372314, + "learning_rate": 4.1979438801823536e-07, + "loss": 1.1961, + "step": 73334 + }, + { + "epoch": 0.9167229180729518, + "grad_norm": 2.2748360633850098, + "learning_rate": 4.1954422503581836e-07, + "loss": 0.5762, + "step": 73336 + }, + { + "epoch": 0.9167479186979675, + "grad_norm": 2.651447296142578, + "learning_rate": 4.192941350167967e-07, + "loss": 0.2877, + "step": 73338 + }, + { + "epoch": 0.9167729193229831, + "grad_norm": 2.7359373569488525, + "learning_rate": 4.1904411796307776e-07, + "loss": 0.4741, + "step": 73340 + }, + { + "epoch": 0.9167979199479988, + "grad_norm": 4.672708034515381, + "learning_rate": 4.1879417387656216e-07, + "loss": 1.1971, + "step": 73342 + }, + { + "epoch": 0.9168229205730143, + "grad_norm": 4.568997383117676, + "learning_rate": 4.1854430275915627e-07, + "loss": 1.1345, + "step": 73344 + }, + { + "epoch": 0.9168479211980299, + "grad_norm": 4.294739246368408, + "learning_rate": 4.182945046127618e-07, + "loss": 0.9552, + "step": 73346 + }, + { + "epoch": 0.9168729218230456, + "grad_norm": 4.784492492675781, + "learning_rate": 4.180447794392817e-07, + "loss": 1.1845, + "step": 73348 + }, + { + "epoch": 0.9168979224480612, + "grad_norm": 1.8925740718841553, + "learning_rate": 4.1779512724061776e-07, + "loss": 1.1291, + "step": 73350 + }, + { + "epoch": 0.9169229230730769, + "grad_norm": 0.002319920575246215, + "learning_rate": 4.1754554801866964e-07, + "loss": 0.0396, + "step": 73352 + }, + { + "epoch": 0.9169479236980924, + "grad_norm": 5.722174644470215, + "learning_rate": 4.1729604177534135e-07, + "loss": 1.2426, + "step": 73354 + }, + { + "epoch": 0.9169729243231081, + "grad_norm": 4.938220500946045, + "learning_rate": 4.170466085125302e-07, + "loss": 2.262, + "step": 73356 + }, + { + "epoch": 0.9169979249481237, + "grad_norm": 3.0000264644622803, + "learning_rate": 4.1679724823213695e-07, + "loss": 1.416, + "step": 73358 + }, + { + "epoch": 0.9170229255731394, + "grad_norm": 5.552342414855957, + "learning_rate": 4.165479609360601e-07, + "loss": 0.6654, + "step": 73360 + }, + { + "epoch": 0.917047926198155, + "grad_norm": 2.762104034423828, + "learning_rate": 4.162987466262003e-07, + "loss": 0.9816, + "step": 73362 + }, + { + "epoch": 0.9170729268231705, + "grad_norm": 0.0004377835721243173, + "learning_rate": 4.1604960530445384e-07, + "loss": 0.8906, + "step": 73364 + }, + { + "epoch": 0.9170979274481862, + "grad_norm": 4.01175594329834, + "learning_rate": 4.1580053697271696e-07, + "loss": 0.7407, + "step": 73366 + }, + { + "epoch": 0.9171229280732018, + "grad_norm": 2.8872792720794678, + "learning_rate": 4.155515416328892e-07, + "loss": 1.6478, + "step": 73368 + }, + { + "epoch": 0.9171479286982175, + "grad_norm": 2.312507390975952, + "learning_rate": 4.153026192868648e-07, + "loss": 0.7986, + "step": 73370 + }, + { + "epoch": 0.9171729293232331, + "grad_norm": 3.3332290649414062, + "learning_rate": 4.150537699365409e-07, + "loss": 1.5114, + "step": 73372 + }, + { + "epoch": 0.9171979299482487, + "grad_norm": 2.7336344718933105, + "learning_rate": 4.1480499358381057e-07, + "loss": 0.9088, + "step": 73374 + }, + { + "epoch": 0.9172229305732643, + "grad_norm": 4.828310489654541, + "learning_rate": 4.1455629023057107e-07, + "loss": 0.3681, + "step": 73376 + }, + { + "epoch": 0.91724793119828, + "grad_norm": 0.8477926254272461, + "learning_rate": 4.1430765987871546e-07, + "loss": 0.653, + "step": 73378 + }, + { + "epoch": 0.9172729318232956, + "grad_norm": 2.2083683013916016, + "learning_rate": 4.140591025301366e-07, + "loss": 1.298, + "step": 73380 + }, + { + "epoch": 0.9172979324483113, + "grad_norm": 0.8920416831970215, + "learning_rate": 4.138106181867285e-07, + "loss": 0.0411, + "step": 73382 + }, + { + "epoch": 0.9173229330733268, + "grad_norm": 3.712393283843994, + "learning_rate": 4.135622068503831e-07, + "loss": 1.2336, + "step": 73384 + }, + { + "epoch": 0.9173479336983424, + "grad_norm": 2.748900890350342, + "learning_rate": 4.1331386852299206e-07, + "loss": 0.7192, + "step": 73386 + }, + { + "epoch": 0.9173729343233581, + "grad_norm": 0.0010809298837557435, + "learning_rate": 4.130656032064462e-07, + "loss": 0.0949, + "step": 73388 + }, + { + "epoch": 0.9173979349483737, + "grad_norm": 0.00044920292566530406, + "learning_rate": 4.1281741090263837e-07, + "loss": 0.0036, + "step": 73390 + }, + { + "epoch": 0.9174229355733894, + "grad_norm": 10.1973876953125, + "learning_rate": 4.1256929161345717e-07, + "loss": 1.7019, + "step": 73392 + }, + { + "epoch": 0.9174479361984049, + "grad_norm": 3.7262604236602783, + "learning_rate": 4.123212453407921e-07, + "loss": 2.1162, + "step": 73394 + }, + { + "epoch": 0.9174729368234206, + "grad_norm": 3.527100086212158, + "learning_rate": 4.120732720865328e-07, + "loss": 0.8598, + "step": 73396 + }, + { + "epoch": 0.9174979374484362, + "grad_norm": 4.629373073577881, + "learning_rate": 4.1182537185256666e-07, + "loss": 0.824, + "step": 73398 + }, + { + "epoch": 0.9175229380734519, + "grad_norm": 2.727572441101074, + "learning_rate": 4.1157754464078436e-07, + "loss": 0.7388, + "step": 73400 + }, + { + "epoch": 0.9175479386984675, + "grad_norm": 5.873121738433838, + "learning_rate": 4.113297904530711e-07, + "loss": 1.3302, + "step": 73402 + }, + { + "epoch": 0.917572939323483, + "grad_norm": 4.081933498382568, + "learning_rate": 4.110821092913131e-07, + "loss": 1.2715, + "step": 73404 + }, + { + "epoch": 0.9175979399484987, + "grad_norm": 4.875082492828369, + "learning_rate": 4.108345011573989e-07, + "loss": 0.6291, + "step": 73406 + }, + { + "epoch": 0.9176229405735143, + "grad_norm": 4.986215114593506, + "learning_rate": 4.1058696605321133e-07, + "loss": 0.4364, + "step": 73408 + }, + { + "epoch": 0.91764794119853, + "grad_norm": 3.672293186187744, + "learning_rate": 4.1033950398063906e-07, + "loss": 1.1203, + "step": 73410 + }, + { + "epoch": 0.9176729418235456, + "grad_norm": 3.9727213382720947, + "learning_rate": 4.1009211494156376e-07, + "loss": 1.4296, + "step": 73412 + }, + { + "epoch": 0.9176979424485612, + "grad_norm": 4.504862308502197, + "learning_rate": 4.098447989378718e-07, + "loss": 1.4172, + "step": 73414 + }, + { + "epoch": 0.9177229430735768, + "grad_norm": 3.937746524810791, + "learning_rate": 4.0959755597144603e-07, + "loss": 0.805, + "step": 73416 + }, + { + "epoch": 0.9177479436985925, + "grad_norm": 3.4365618228912354, + "learning_rate": 4.0935038604416834e-07, + "loss": 1.329, + "step": 73418 + }, + { + "epoch": 0.9177729443236081, + "grad_norm": 4.50123929977417, + "learning_rate": 4.091032891579216e-07, + "loss": 1.0407, + "step": 73420 + }, + { + "epoch": 0.9177979449486238, + "grad_norm": 0.3335460126399994, + "learning_rate": 4.088562653145878e-07, + "loss": 0.0878, + "step": 73422 + }, + { + "epoch": 0.9178229455736393, + "grad_norm": 0.0008114050142467022, + "learning_rate": 4.086093145160497e-07, + "loss": 0.0359, + "step": 73424 + }, + { + "epoch": 0.917847946198655, + "grad_norm": 2.3250181674957275, + "learning_rate": 4.0836243676418585e-07, + "loss": 0.8506, + "step": 73426 + }, + { + "epoch": 0.9178729468236706, + "grad_norm": 0.004292970057576895, + "learning_rate": 4.08115632060877e-07, + "loss": 1.0251, + "step": 73428 + }, + { + "epoch": 0.9178979474486862, + "grad_norm": 3.3021974563598633, + "learning_rate": 4.0786890040800385e-07, + "loss": 1.252, + "step": 73430 + }, + { + "epoch": 0.9179229480737019, + "grad_norm": 2.763979434967041, + "learning_rate": 4.076222418074438e-07, + "loss": 0.2614, + "step": 73432 + }, + { + "epoch": 0.9179479486987174, + "grad_norm": 3.1354176998138428, + "learning_rate": 4.073756562610776e-07, + "loss": 0.9839, + "step": 73434 + }, + { + "epoch": 0.9179729493237331, + "grad_norm": 4.2177910804748535, + "learning_rate": 4.0712914377078027e-07, + "loss": 1.326, + "step": 73436 + }, + { + "epoch": 0.9179979499487487, + "grad_norm": 4.86088228225708, + "learning_rate": 4.0688270433843267e-07, + "loss": 1.7879, + "step": 73438 + }, + { + "epoch": 0.9180229505737644, + "grad_norm": 1.3108125925064087, + "learning_rate": 4.066363379659077e-07, + "loss": 0.4738, + "step": 73440 + }, + { + "epoch": 0.91804795119878, + "grad_norm": 0.0005911667249165475, + "learning_rate": 4.0639004465508504e-07, + "loss": 0.046, + "step": 73442 + }, + { + "epoch": 0.9180729518237956, + "grad_norm": 1.567444920539856, + "learning_rate": 4.061438244078397e-07, + "loss": 1.1336, + "step": 73444 + }, + { + "epoch": 0.9180979524488112, + "grad_norm": 5.405207633972168, + "learning_rate": 4.0589767722604476e-07, + "loss": 1.7548, + "step": 73446 + }, + { + "epoch": 0.9181229530738269, + "grad_norm": 4.214452743530273, + "learning_rate": 4.0565160311157756e-07, + "loss": 1.3634, + "step": 73448 + }, + { + "epoch": 0.9181479536988425, + "grad_norm": 3.6640565395355225, + "learning_rate": 4.0540560206631e-07, + "loss": 1.8195, + "step": 73450 + }, + { + "epoch": 0.9181729543238581, + "grad_norm": 3.1803576946258545, + "learning_rate": 4.051596740921171e-07, + "loss": 1.0336, + "step": 73452 + }, + { + "epoch": 0.9181979549488737, + "grad_norm": 1.7787846326828003, + "learning_rate": 4.049138191908719e-07, + "loss": 0.077, + "step": 73454 + }, + { + "epoch": 0.9182229555738893, + "grad_norm": 1.2394949197769165, + "learning_rate": 4.0466803736444515e-07, + "loss": 0.5312, + "step": 73456 + }, + { + "epoch": 0.918247956198905, + "grad_norm": 2.5297863483428955, + "learning_rate": 4.0442232861471086e-07, + "loss": 1.3921, + "step": 73458 + }, + { + "epoch": 0.9182729568239206, + "grad_norm": 0.0016289543127641082, + "learning_rate": 4.0417669294353756e-07, + "loss": 0.6329, + "step": 73460 + }, + { + "epoch": 0.9182979574489363, + "grad_norm": 3.90149188041687, + "learning_rate": 4.039311303527993e-07, + "loss": 0.7707, + "step": 73462 + }, + { + "epoch": 0.9183229580739518, + "grad_norm": 2.2086472511291504, + "learning_rate": 4.0368564084436236e-07, + "loss": 1.2581, + "step": 73464 + }, + { + "epoch": 0.9183479586989675, + "grad_norm": 4.066539287567139, + "learning_rate": 4.034402244201008e-07, + "loss": 0.8368, + "step": 73466 + }, + { + "epoch": 0.9183729593239831, + "grad_norm": 5.647051811218262, + "learning_rate": 4.031948810818809e-07, + "loss": 1.3903, + "step": 73468 + }, + { + "epoch": 0.9183979599489988, + "grad_norm": 7.654580116271973, + "learning_rate": 4.0294961083157116e-07, + "loss": 1.5876, + "step": 73470 + }, + { + "epoch": 0.9184229605740144, + "grad_norm": 2.480360984802246, + "learning_rate": 4.027044136710401e-07, + "loss": 0.9569, + "step": 73472 + }, + { + "epoch": 0.9184479611990299, + "grad_norm": 0.00033949274802580476, + "learning_rate": 4.0245928960215507e-07, + "loss": 0.8224, + "step": 73474 + }, + { + "epoch": 0.9184729618240456, + "grad_norm": 2.718095302581787, + "learning_rate": 4.0221423862678354e-07, + "loss": 0.6696, + "step": 73476 + }, + { + "epoch": 0.9184979624490612, + "grad_norm": 2.4184165000915527, + "learning_rate": 4.0196926074678957e-07, + "loss": 1.4055, + "step": 73478 + }, + { + "epoch": 0.9185229630740769, + "grad_norm": 2.430866003036499, + "learning_rate": 4.017243559640416e-07, + "loss": 0.1361, + "step": 73480 + }, + { + "epoch": 0.9185479636990925, + "grad_norm": 2.1343250274658203, + "learning_rate": 4.0147952428040484e-07, + "loss": 1.2219, + "step": 73482 + }, + { + "epoch": 0.9185729643241081, + "grad_norm": 2.3451578617095947, + "learning_rate": 4.0123476569774003e-07, + "loss": 0.0804, + "step": 73484 + }, + { + "epoch": 0.9185979649491237, + "grad_norm": 0.00029492739122360945, + "learning_rate": 4.0099008021791566e-07, + "loss": 0.6493, + "step": 73486 + }, + { + "epoch": 0.9186229655741394, + "grad_norm": 3.456577777862549, + "learning_rate": 4.0074546784279244e-07, + "loss": 1.0307, + "step": 73488 + }, + { + "epoch": 0.918647966199155, + "grad_norm": 2.909092426300049, + "learning_rate": 4.005009285742345e-07, + "loss": 0.5603, + "step": 73490 + }, + { + "epoch": 0.9186729668241707, + "grad_norm": 5.267882823944092, + "learning_rate": 4.0025646241410365e-07, + "loss": 0.8553, + "step": 73492 + }, + { + "epoch": 0.9186979674491862, + "grad_norm": 2.1039018630981445, + "learning_rate": 4.0001206936426285e-07, + "loss": 0.9795, + "step": 73494 + }, + { + "epoch": 0.9187229680742018, + "grad_norm": 1.7723718881607056, + "learning_rate": 3.997677494265728e-07, + "loss": 0.5508, + "step": 73496 + }, + { + "epoch": 0.9187479686992175, + "grad_norm": 3.0787534713745117, + "learning_rate": 3.995235026028932e-07, + "loss": 0.6844, + "step": 73498 + }, + { + "epoch": 0.9187729693242331, + "grad_norm": 0.4529317617416382, + "learning_rate": 3.992793288950858e-07, + "loss": 0.4448, + "step": 73500 + }, + { + "epoch": 0.9187979699492488, + "grad_norm": 0.00036400373210199177, + "learning_rate": 3.9903522830500805e-07, + "loss": 0.002, + "step": 73502 + }, + { + "epoch": 0.9188229705742643, + "grad_norm": 2.608449935913086, + "learning_rate": 3.9879120083452185e-07, + "loss": 1.146, + "step": 73504 + }, + { + "epoch": 0.91884797119928, + "grad_norm": 0.002364059444516897, + "learning_rate": 3.985472464854834e-07, + "loss": 0.4256, + "step": 73506 + }, + { + "epoch": 0.9188729718242956, + "grad_norm": 2.374890089035034, + "learning_rate": 3.983033652597512e-07, + "loss": 1.2404, + "step": 73508 + }, + { + "epoch": 0.9188979724493113, + "grad_norm": 5.464552879333496, + "learning_rate": 3.980595571591828e-07, + "loss": 0.4296, + "step": 73510 + }, + { + "epoch": 0.9189229730743269, + "grad_norm": 3.2809953689575195, + "learning_rate": 3.978158221856343e-07, + "loss": 0.1455, + "step": 73512 + }, + { + "epoch": 0.9189479736993424, + "grad_norm": 4.077727317810059, + "learning_rate": 3.9757216034096436e-07, + "loss": 1.0779, + "step": 73514 + }, + { + "epoch": 0.9189729743243581, + "grad_norm": 2.8726089000701904, + "learning_rate": 3.9732857162702474e-07, + "loss": 0.0967, + "step": 73516 + }, + { + "epoch": 0.9189979749493737, + "grad_norm": 4.568107604980469, + "learning_rate": 3.9708505604567295e-07, + "loss": 0.5318, + "step": 73518 + }, + { + "epoch": 0.9190229755743894, + "grad_norm": 0.0003702164685819298, + "learning_rate": 3.968416135987663e-07, + "loss": 0.1081, + "step": 73520 + }, + { + "epoch": 0.919047976199405, + "grad_norm": 4.548489570617676, + "learning_rate": 3.965982442881533e-07, + "loss": 1.7212, + "step": 73522 + }, + { + "epoch": 0.9190729768244206, + "grad_norm": 4.161060333251953, + "learning_rate": 3.9635494811569144e-07, + "loss": 0.49, + "step": 73524 + }, + { + "epoch": 0.9190979774494362, + "grad_norm": 1.8679652214050293, + "learning_rate": 3.961117250832314e-07, + "loss": 0.6793, + "step": 73526 + }, + { + "epoch": 0.9191229780744519, + "grad_norm": 4.294548988342285, + "learning_rate": 3.958685751926261e-07, + "loss": 0.4763, + "step": 73528 + }, + { + "epoch": 0.9191479786994675, + "grad_norm": 3.9046802520751953, + "learning_rate": 3.956254984457275e-07, + "loss": 0.8473, + "step": 73530 + }, + { + "epoch": 0.9191729793244832, + "grad_norm": 5.562395095825195, + "learning_rate": 3.953824948443885e-07, + "loss": 1.1669, + "step": 73532 + }, + { + "epoch": 0.9191979799494987, + "grad_norm": 4.709702968597412, + "learning_rate": 3.9513956439045766e-07, + "loss": 1.2141, + "step": 73534 + }, + { + "epoch": 0.9192229805745143, + "grad_norm": 2.42183780670166, + "learning_rate": 3.948967070857845e-07, + "loss": 0.4272, + "step": 73536 + }, + { + "epoch": 0.91924798119953, + "grad_norm": 2.6898109912872314, + "learning_rate": 3.9465392293221994e-07, + "loss": 0.9981, + "step": 73538 + }, + { + "epoch": 0.9192729818245456, + "grad_norm": 2.9815752506256104, + "learning_rate": 3.9441121193161347e-07, + "loss": 1.6586, + "step": 73540 + }, + { + "epoch": 0.9192979824495613, + "grad_norm": 5.280250549316406, + "learning_rate": 3.9416857408581143e-07, + "loss": 1.2531, + "step": 73542 + }, + { + "epoch": 0.9193229830745768, + "grad_norm": 0.00040806023753248155, + "learning_rate": 3.939260093966646e-07, + "loss": 0.1608, + "step": 73544 + }, + { + "epoch": 0.9193479836995925, + "grad_norm": 0.0003766875306610018, + "learning_rate": 3.936835178660192e-07, + "loss": 0.4929, + "step": 73546 + }, + { + "epoch": 0.9193729843246081, + "grad_norm": 9.369837760925293, + "learning_rate": 3.934410994957216e-07, + "loss": 1.1549, + "step": 73548 + }, + { + "epoch": 0.9193979849496238, + "grad_norm": 5.189172744750977, + "learning_rate": 3.93198754287617e-07, + "loss": 1.8305, + "step": 73550 + }, + { + "epoch": 0.9194229855746394, + "grad_norm": 2.8531551361083984, + "learning_rate": 3.9295648224355277e-07, + "loss": 0.7172, + "step": 73552 + }, + { + "epoch": 0.919447986199655, + "grad_norm": 0.0003539146273396909, + "learning_rate": 3.92714283365373e-07, + "loss": 0.5074, + "step": 73554 + }, + { + "epoch": 0.9194729868246706, + "grad_norm": 0.9423628449440002, + "learning_rate": 3.9247215765492175e-07, + "loss": 0.8566, + "step": 73556 + }, + { + "epoch": 0.9194979874496862, + "grad_norm": 1.9701639413833618, + "learning_rate": 3.9223010511404644e-07, + "loss": 1.163, + "step": 73558 + }, + { + "epoch": 0.9195229880747019, + "grad_norm": 3.323740243911743, + "learning_rate": 3.919881257445868e-07, + "loss": 0.8664, + "step": 73560 + }, + { + "epoch": 0.9195479886997175, + "grad_norm": 2.0577759742736816, + "learning_rate": 3.9174621954838675e-07, + "loss": 0.7674, + "step": 73562 + }, + { + "epoch": 0.9195729893247331, + "grad_norm": 2.791053295135498, + "learning_rate": 3.915043865272883e-07, + "loss": 0.9434, + "step": 73564 + }, + { + "epoch": 0.9195979899497487, + "grad_norm": 2.277702808380127, + "learning_rate": 3.912626266831343e-07, + "loss": 1.0346, + "step": 73566 + }, + { + "epoch": 0.9196229905747644, + "grad_norm": 4.681425094604492, + "learning_rate": 3.910209400177645e-07, + "loss": 1.3756, + "step": 73568 + }, + { + "epoch": 0.91964799119978, + "grad_norm": 0.0003983468341175467, + "learning_rate": 3.9077932653302063e-07, + "loss": 0.5119, + "step": 73570 + }, + { + "epoch": 0.9196729918247957, + "grad_norm": 3.5392537117004395, + "learning_rate": 3.9053778623074466e-07, + "loss": 0.7062, + "step": 73572 + }, + { + "epoch": 0.9196979924498112, + "grad_norm": 6.482712745666504, + "learning_rate": 3.902963191127718e-07, + "loss": 0.5536, + "step": 73574 + }, + { + "epoch": 0.9197229930748269, + "grad_norm": 4.929720401763916, + "learning_rate": 3.9005492518094487e-07, + "loss": 1.1461, + "step": 73576 + }, + { + "epoch": 0.9197479936998425, + "grad_norm": 2.998976230621338, + "learning_rate": 3.8981360443709814e-07, + "loss": 1.2741, + "step": 73578 + }, + { + "epoch": 0.9197729943248582, + "grad_norm": 3.1530165672302246, + "learning_rate": 3.895723568830734e-07, + "loss": 0.8653, + "step": 73580 + }, + { + "epoch": 0.9197979949498738, + "grad_norm": 8.865569114685059, + "learning_rate": 3.893311825207069e-07, + "loss": 1.392, + "step": 73582 + }, + { + "epoch": 0.9198229955748893, + "grad_norm": 2.991304397583008, + "learning_rate": 3.89090081351835e-07, + "loss": 0.7548, + "step": 73584 + }, + { + "epoch": 0.919847996199905, + "grad_norm": 3.6706833839416504, + "learning_rate": 3.88849053378294e-07, + "loss": 0.7539, + "step": 73586 + }, + { + "epoch": 0.9198729968249206, + "grad_norm": 2.2257845401763916, + "learning_rate": 3.8860809860191915e-07, + "loss": 0.3073, + "step": 73588 + }, + { + "epoch": 0.9198979974499363, + "grad_norm": 1.7188007831573486, + "learning_rate": 3.8836721702454563e-07, + "loss": 1.1498, + "step": 73590 + }, + { + "epoch": 0.9199229980749519, + "grad_norm": 3.157820701599121, + "learning_rate": 3.8812640864800743e-07, + "loss": 0.5833, + "step": 73592 + }, + { + "epoch": 0.9199479986999675, + "grad_norm": 2.4474430084228516, + "learning_rate": 3.8788567347413986e-07, + "loss": 0.6337, + "step": 73594 + }, + { + "epoch": 0.9199729993249831, + "grad_norm": 4.059326171875, + "learning_rate": 3.8764501150477696e-07, + "loss": 0.74, + "step": 73596 + }, + { + "epoch": 0.9199979999499988, + "grad_norm": 3.498008966445923, + "learning_rate": 3.874044227417495e-07, + "loss": 1.0809, + "step": 73598 + }, + { + "epoch": 0.9200230005750144, + "grad_norm": 5.9236369132995605, + "learning_rate": 3.8716390718689154e-07, + "loss": 0.5041, + "step": 73600 + }, + { + "epoch": 0.92004800120003, + "grad_norm": 4.01795768737793, + "learning_rate": 3.869234648420317e-07, + "loss": 0.7535, + "step": 73602 + }, + { + "epoch": 0.9200730018250456, + "grad_norm": 2.7592077255249023, + "learning_rate": 3.86683095709004e-07, + "loss": 0.9617, + "step": 73604 + }, + { + "epoch": 0.9200980024500612, + "grad_norm": 0.007912106812000275, + "learning_rate": 3.8644279978963916e-07, + "loss": 0.9109, + "step": 73606 + }, + { + "epoch": 0.9201230030750769, + "grad_norm": 2.170043468475342, + "learning_rate": 3.862025770857658e-07, + "loss": 0.7302, + "step": 73608 + }, + { + "epoch": 0.9201480037000925, + "grad_norm": 1.0454566478729248, + "learning_rate": 3.8596242759921685e-07, + "loss": 1.149, + "step": 73610 + }, + { + "epoch": 0.9201730043251082, + "grad_norm": 5.816181659698486, + "learning_rate": 3.8572235133181536e-07, + "loss": 1.5001, + "step": 73612 + }, + { + "epoch": 0.9201980049501237, + "grad_norm": 3.112464189529419, + "learning_rate": 3.8548234828539423e-07, + "loss": 0.7358, + "step": 73614 + }, + { + "epoch": 0.9202230055751394, + "grad_norm": 5.995100021362305, + "learning_rate": 3.8524241846177866e-07, + "loss": 0.8602, + "step": 73616 + }, + { + "epoch": 0.920248006200155, + "grad_norm": 2.927715539932251, + "learning_rate": 3.850025618627973e-07, + "loss": 0.8848, + "step": 73618 + }, + { + "epoch": 0.9202730068251707, + "grad_norm": 3.4529478549957275, + "learning_rate": 3.847627784902774e-07, + "loss": 1.4083, + "step": 73620 + }, + { + "epoch": 0.9202980074501863, + "grad_norm": 4.652298927307129, + "learning_rate": 3.8452306834604437e-07, + "loss": 0.8379, + "step": 73622 + }, + { + "epoch": 0.9203230080752018, + "grad_norm": 2.294910192489624, + "learning_rate": 3.842834314319255e-07, + "loss": 0.4669, + "step": 73624 + }, + { + "epoch": 0.9203480087002175, + "grad_norm": 0.08761901408433914, + "learning_rate": 3.8404386774974157e-07, + "loss": 0.7598, + "step": 73626 + }, + { + "epoch": 0.9203730093252331, + "grad_norm": 1.6190412044525146, + "learning_rate": 3.8380437730132004e-07, + "loss": 0.4557, + "step": 73628 + }, + { + "epoch": 0.9203980099502488, + "grad_norm": 4.84454345703125, + "learning_rate": 3.835649600884861e-07, + "loss": 1.6269, + "step": 73630 + }, + { + "epoch": 0.9204230105752644, + "grad_norm": 3.395765542984009, + "learning_rate": 3.8332561611306054e-07, + "loss": 0.5285, + "step": 73632 + }, + { + "epoch": 0.92044801120028, + "grad_norm": 13.711349487304688, + "learning_rate": 3.830863453768674e-07, + "loss": 1.4327, + "step": 73634 + }, + { + "epoch": 0.9204730118252956, + "grad_norm": 0.0012423984007909894, + "learning_rate": 3.828471478817286e-07, + "loss": 0.3557, + "step": 73636 + }, + { + "epoch": 0.9204980124503113, + "grad_norm": 3.755754232406616, + "learning_rate": 3.826080236294671e-07, + "loss": 1.2519, + "step": 73638 + }, + { + "epoch": 0.9205230130753269, + "grad_norm": 3.770610809326172, + "learning_rate": 3.823689726219015e-07, + "loss": 1.4982, + "step": 73640 + }, + { + "epoch": 0.9205480137003426, + "grad_norm": 0.4230577051639557, + "learning_rate": 3.821299948608537e-07, + "loss": 1.5189, + "step": 73642 + }, + { + "epoch": 0.9205730143253581, + "grad_norm": 0.09909039735794067, + "learning_rate": 3.8189109034814433e-07, + "loss": 0.1383, + "step": 73644 + }, + { + "epoch": 0.9205980149503737, + "grad_norm": 4.231833457946777, + "learning_rate": 3.816522590855909e-07, + "loss": 1.4731, + "step": 73646 + }, + { + "epoch": 0.9206230155753894, + "grad_norm": 4.570984363555908, + "learning_rate": 3.8141350107501527e-07, + "loss": 1.5233, + "step": 73648 + }, + { + "epoch": 0.920648016200405, + "grad_norm": 0.00031431776005774736, + "learning_rate": 3.8117481631823495e-07, + "loss": 0.2011, + "step": 73650 + }, + { + "epoch": 0.9206730168254207, + "grad_norm": 1.061776041984558, + "learning_rate": 3.8093620481706616e-07, + "loss": 0.7813, + "step": 73652 + }, + { + "epoch": 0.9206980174504362, + "grad_norm": 0.00017739288159646094, + "learning_rate": 3.806976665733264e-07, + "loss": 0.7372, + "step": 73654 + }, + { + "epoch": 0.9207230180754519, + "grad_norm": 0.0279071144759655, + "learning_rate": 3.8045920158883197e-07, + "loss": 0.3414, + "step": 73656 + }, + { + "epoch": 0.9207480187004675, + "grad_norm": 3.9372236728668213, + "learning_rate": 3.8022080986540146e-07, + "loss": 1.026, + "step": 73658 + }, + { + "epoch": 0.9207730193254832, + "grad_norm": 4.487189769744873, + "learning_rate": 3.799824914048489e-07, + "loss": 1.9235, + "step": 73660 + }, + { + "epoch": 0.9207980199504988, + "grad_norm": 0.030317846685647964, + "learning_rate": 3.7974424620898953e-07, + "loss": 1.7221, + "step": 73662 + }, + { + "epoch": 0.9208230205755144, + "grad_norm": 1.685603141784668, + "learning_rate": 3.795060742796375e-07, + "loss": 1.5304, + "step": 73664 + }, + { + "epoch": 0.92084802120053, + "grad_norm": 3.309540033340454, + "learning_rate": 3.7926797561860573e-07, + "loss": 1.0164, + "step": 73666 + }, + { + "epoch": 0.9208730218255456, + "grad_norm": 1.979636549949646, + "learning_rate": 3.790299502277095e-07, + "loss": 0.6979, + "step": 73668 + }, + { + "epoch": 0.9208980224505613, + "grad_norm": 0.0007959076319821179, + "learning_rate": 3.787919981087596e-07, + "loss": 0.7803, + "step": 73670 + }, + { + "epoch": 0.9209230230755769, + "grad_norm": 2.9462852478027344, + "learning_rate": 3.7855411926357e-07, + "loss": 0.2895, + "step": 73672 + }, + { + "epoch": 0.9209480237005925, + "grad_norm": 0.000739885785151273, + "learning_rate": 3.783163136939505e-07, + "loss": 0.1939, + "step": 73674 + }, + { + "epoch": 0.9209730243256081, + "grad_norm": 4.096836090087891, + "learning_rate": 3.780785814017163e-07, + "loss": 1.7568, + "step": 73676 + }, + { + "epoch": 0.9209980249506238, + "grad_norm": 3.4328250885009766, + "learning_rate": 3.778409223886725e-07, + "loss": 0.6123, + "step": 73678 + }, + { + "epoch": 0.9210230255756394, + "grad_norm": 5.358226299285889, + "learning_rate": 3.776033366566312e-07, + "loss": 1.0661, + "step": 73680 + }, + { + "epoch": 0.9210480262006551, + "grad_norm": 3.8564207553863525, + "learning_rate": 3.773658242074041e-07, + "loss": 1.1843, + "step": 73682 + }, + { + "epoch": 0.9210730268256706, + "grad_norm": 3.2200729846954346, + "learning_rate": 3.7712838504279645e-07, + "loss": 1.3627, + "step": 73684 + }, + { + "epoch": 0.9210980274506863, + "grad_norm": 6.816051483154297, + "learning_rate": 3.7689101916461913e-07, + "loss": 0.7628, + "step": 73686 + }, + { + "epoch": 0.9211230280757019, + "grad_norm": 0.0007462764042429626, + "learning_rate": 3.7665372657467834e-07, + "loss": 0.0802, + "step": 73688 + }, + { + "epoch": 0.9211480287007175, + "grad_norm": 2.328367233276367, + "learning_rate": 3.7641650727478274e-07, + "loss": 1.6108, + "step": 73690 + }, + { + "epoch": 0.9211730293257332, + "grad_norm": 0.5237823128700256, + "learning_rate": 3.761793612667375e-07, + "loss": 0.4107, + "step": 73692 + }, + { + "epoch": 0.9211980299507487, + "grad_norm": 0.0005493963835760951, + "learning_rate": 3.759422885523489e-07, + "loss": 0.0865, + "step": 73694 + }, + { + "epoch": 0.9212230305757644, + "grad_norm": 3.994243621826172, + "learning_rate": 3.757052891334245e-07, + "loss": 1.251, + "step": 73696 + }, + { + "epoch": 0.92124803120078, + "grad_norm": 2.470842123031616, + "learning_rate": 3.7546836301176614e-07, + "loss": 0.3862, + "step": 73698 + }, + { + "epoch": 0.9212730318257957, + "grad_norm": 3.6874852180480957, + "learning_rate": 3.7523151018918013e-07, + "loss": 0.9829, + "step": 73700 + }, + { + "epoch": 0.9212980324508113, + "grad_norm": 4.096985816955566, + "learning_rate": 3.749947306674706e-07, + "loss": 0.8316, + "step": 73702 + }, + { + "epoch": 0.9213230330758269, + "grad_norm": 0.009308742359280586, + "learning_rate": 3.7475802444843833e-07, + "loss": 0.0002, + "step": 73704 + }, + { + "epoch": 0.9213480337008425, + "grad_norm": 3.637319564819336, + "learning_rate": 3.7452139153388964e-07, + "loss": 2.2181, + "step": 73706 + }, + { + "epoch": 0.9213730343258582, + "grad_norm": 2.405813455581665, + "learning_rate": 3.7428483192562314e-07, + "loss": 1.6705, + "step": 73708 + }, + { + "epoch": 0.9213980349508738, + "grad_norm": 3.104644775390625, + "learning_rate": 3.74048345625444e-07, + "loss": 1.2508, + "step": 73710 + }, + { + "epoch": 0.9214230355758894, + "grad_norm": 3.8376073837280273, + "learning_rate": 3.738119326351497e-07, + "loss": 2.4878, + "step": 73712 + }, + { + "epoch": 0.921448036200905, + "grad_norm": 0.0004684576124418527, + "learning_rate": 3.7357559295654323e-07, + "loss": 1.0036, + "step": 73714 + }, + { + "epoch": 0.9214730368259206, + "grad_norm": 1.8333226442337036, + "learning_rate": 3.7333932659142426e-07, + "loss": 0.7885, + "step": 73716 + }, + { + "epoch": 0.9214980374509363, + "grad_norm": 2.888847589492798, + "learning_rate": 3.7310313354159134e-07, + "loss": 0.5346, + "step": 73718 + }, + { + "epoch": 0.9215230380759519, + "grad_norm": 2.3311269283294678, + "learning_rate": 3.7286701380884303e-07, + "loss": 1.3737, + "step": 73720 + }, + { + "epoch": 0.9215480387009676, + "grad_norm": 1.8223212957382202, + "learning_rate": 3.7263096739497796e-07, + "loss": 0.2013, + "step": 73722 + }, + { + "epoch": 0.9215730393259831, + "grad_norm": 0.8612595796585083, + "learning_rate": 3.723949943017957e-07, + "loss": 0.1454, + "step": 73724 + }, + { + "epoch": 0.9215980399509988, + "grad_norm": 0.006452547386288643, + "learning_rate": 3.7215909453108935e-07, + "loss": 0.0001, + "step": 73726 + }, + { + "epoch": 0.9216230405760144, + "grad_norm": 0.010778263211250305, + "learning_rate": 3.7192326808465963e-07, + "loss": 0.2363, + "step": 73728 + }, + { + "epoch": 0.92164804120103, + "grad_norm": 4.711559772491455, + "learning_rate": 3.716875149643007e-07, + "loss": 1.2516, + "step": 73730 + }, + { + "epoch": 0.9216730418260457, + "grad_norm": 2.8555219173431396, + "learning_rate": 3.714518351718066e-07, + "loss": 1.0006, + "step": 73732 + }, + { + "epoch": 0.9216980424510612, + "grad_norm": 5.666835784912109, + "learning_rate": 3.71216228708976e-07, + "loss": 1.4618, + "step": 73734 + }, + { + "epoch": 0.9217230430760769, + "grad_norm": 2.7879104614257812, + "learning_rate": 3.7098069557759853e-07, + "loss": 0.2609, + "step": 73736 + }, + { + "epoch": 0.9217480437010925, + "grad_norm": 3.492584466934204, + "learning_rate": 3.707452357794716e-07, + "loss": 1.2256, + "step": 73738 + }, + { + "epoch": 0.9217730443261082, + "grad_norm": 3.4780755043029785, + "learning_rate": 3.705098493163872e-07, + "loss": 0.7879, + "step": 73740 + }, + { + "epoch": 0.9217980449511238, + "grad_norm": 3.5763895511627197, + "learning_rate": 3.702745361901383e-07, + "loss": 1.2462, + "step": 73742 + }, + { + "epoch": 0.9218230455761394, + "grad_norm": 3.5861644744873047, + "learning_rate": 3.7003929640251676e-07, + "loss": 0.7194, + "step": 73744 + }, + { + "epoch": 0.921848046201155, + "grad_norm": 6.181232452392578, + "learning_rate": 3.6980412995531344e-07, + "loss": 0.8304, + "step": 73746 + }, + { + "epoch": 0.9218730468261707, + "grad_norm": 3.960233211517334, + "learning_rate": 3.6956903685032133e-07, + "loss": 0.5902, + "step": 73748 + }, + { + "epoch": 0.9218980474511863, + "grad_norm": 15.655877113342285, + "learning_rate": 3.6933401708932783e-07, + "loss": 1.0855, + "step": 73750 + }, + { + "epoch": 0.921923048076202, + "grad_norm": 6.615760326385498, + "learning_rate": 3.69099070674126e-07, + "loss": 1.4371, + "step": 73752 + }, + { + "epoch": 0.9219480487012175, + "grad_norm": 2.15094256401062, + "learning_rate": 3.688641976065044e-07, + "loss": 0.5216, + "step": 73754 + }, + { + "epoch": 0.9219730493262331, + "grad_norm": 1.9723490476608276, + "learning_rate": 3.6862939788824827e-07, + "loss": 0.1229, + "step": 73756 + }, + { + "epoch": 0.9219980499512488, + "grad_norm": 6.426925182342529, + "learning_rate": 3.683946715211517e-07, + "loss": 1.4, + "step": 73758 + }, + { + "epoch": 0.9220230505762644, + "grad_norm": 2.5350100994110107, + "learning_rate": 3.681600185069967e-07, + "loss": 0.713, + "step": 73760 + }, + { + "epoch": 0.9220480512012801, + "grad_norm": 4.5632853507995605, + "learning_rate": 3.6792543884757506e-07, + "loss": 1.1568, + "step": 73762 + }, + { + "epoch": 0.9220730518262956, + "grad_norm": 2.5269782543182373, + "learning_rate": 3.676909325446687e-07, + "loss": 1.5272, + "step": 73764 + }, + { + "epoch": 0.9220980524513113, + "grad_norm": 2.605337142944336, + "learning_rate": 3.6745649960006846e-07, + "loss": 1.0633, + "step": 73766 + }, + { + "epoch": 0.9221230530763269, + "grad_norm": 3.5228705406188965, + "learning_rate": 3.6722214001555736e-07, + "loss": 0.626, + "step": 73768 + }, + { + "epoch": 0.9221480537013426, + "grad_norm": 2.9093523025512695, + "learning_rate": 3.6698785379291837e-07, + "loss": 0.8368, + "step": 73770 + }, + { + "epoch": 0.9221730543263582, + "grad_norm": 4.113607406616211, + "learning_rate": 3.6675364093393895e-07, + "loss": 1.5567, + "step": 73772 + }, + { + "epoch": 0.9221980549513737, + "grad_norm": 2.947103261947632, + "learning_rate": 3.6651950144040104e-07, + "loss": 0.159, + "step": 73774 + }, + { + "epoch": 0.9222230555763894, + "grad_norm": 8.27884292602539, + "learning_rate": 3.6628543531408875e-07, + "loss": 1.8892, + "step": 73776 + }, + { + "epoch": 0.922248056201405, + "grad_norm": 5.962690830230713, + "learning_rate": 3.6605144255678294e-07, + "loss": 1.3146, + "step": 73778 + }, + { + "epoch": 0.9222730568264207, + "grad_norm": 2.637223720550537, + "learning_rate": 3.6581752317026875e-07, + "loss": 1.9864, + "step": 73780 + }, + { + "epoch": 0.9222980574514363, + "grad_norm": 4.3587446212768555, + "learning_rate": 3.6558367715632593e-07, + "loss": 1.7568, + "step": 73782 + }, + { + "epoch": 0.9223230580764519, + "grad_norm": 2.9976563453674316, + "learning_rate": 3.6534990451673414e-07, + "loss": 1.129, + "step": 73784 + }, + { + "epoch": 0.9223480587014675, + "grad_norm": 2.594339609146118, + "learning_rate": 3.6511620525327527e-07, + "loss": 0.2214, + "step": 73786 + }, + { + "epoch": 0.9223730593264832, + "grad_norm": 2.217346668243408, + "learning_rate": 3.648825793677291e-07, + "loss": 1.1112, + "step": 73788 + }, + { + "epoch": 0.9223980599514988, + "grad_norm": 2.8831238746643066, + "learning_rate": 3.646490268618752e-07, + "loss": 0.2775, + "step": 73790 + }, + { + "epoch": 0.9224230605765145, + "grad_norm": 3.670764923095703, + "learning_rate": 3.6441554773749e-07, + "loss": 0.3081, + "step": 73792 + }, + { + "epoch": 0.92244806120153, + "grad_norm": 2.562992811203003, + "learning_rate": 3.641821419963554e-07, + "loss": 1.2032, + "step": 73794 + }, + { + "epoch": 0.9224730618265456, + "grad_norm": 4.47944974899292, + "learning_rate": 3.639488096402466e-07, + "loss": 1.9993, + "step": 73796 + }, + { + "epoch": 0.9224980624515613, + "grad_norm": 3.039259195327759, + "learning_rate": 3.6371555067094e-07, + "loss": 1.2507, + "step": 73798 + }, + { + "epoch": 0.9225230630765769, + "grad_norm": 0.00021584997011814266, + "learning_rate": 3.6348236509021417e-07, + "loss": 0.0132, + "step": 73800 + }, + { + "epoch": 0.9225480637015926, + "grad_norm": 2.840603828430176, + "learning_rate": 3.6324925289984324e-07, + "loss": 1.0083, + "step": 73802 + }, + { + "epoch": 0.9225730643266081, + "grad_norm": 3.8814609050750732, + "learning_rate": 3.6301621410160357e-07, + "loss": 1.2306, + "step": 73804 + }, + { + "epoch": 0.9225980649516238, + "grad_norm": 1.0445218086242676, + "learning_rate": 3.6278324869727043e-07, + "loss": 0.7159, + "step": 73806 + }, + { + "epoch": 0.9226230655766394, + "grad_norm": 3.851951837539673, + "learning_rate": 3.625503566886157e-07, + "loss": 0.766, + "step": 73808 + }, + { + "epoch": 0.9226480662016551, + "grad_norm": 2.0685834884643555, + "learning_rate": 3.623175380774158e-07, + "loss": 1.4582, + "step": 73810 + }, + { + "epoch": 0.9226730668266707, + "grad_norm": 3.9929707050323486, + "learning_rate": 3.620847928654414e-07, + "loss": 1.3494, + "step": 73812 + }, + { + "epoch": 0.9226980674516863, + "grad_norm": 5.233363628387451, + "learning_rate": 3.6185212105446675e-07, + "loss": 0.8041, + "step": 73814 + }, + { + "epoch": 0.9227230680767019, + "grad_norm": 3.005850076675415, + "learning_rate": 3.616195226462638e-07, + "loss": 1.0105, + "step": 73816 + }, + { + "epoch": 0.9227480687017175, + "grad_norm": 3.9969370365142822, + "learning_rate": 3.6138699764260323e-07, + "loss": 1.5565, + "step": 73818 + }, + { + "epoch": 0.9227730693267332, + "grad_norm": 4.946326732635498, + "learning_rate": 3.61154546045257e-07, + "loss": 1.3849, + "step": 73820 + }, + { + "epoch": 0.9227980699517488, + "grad_norm": 0.0006943743792362511, + "learning_rate": 3.6092216785599377e-07, + "loss": 0.5825, + "step": 73822 + }, + { + "epoch": 0.9228230705767644, + "grad_norm": 3.0275375843048096, + "learning_rate": 3.606898630765854e-07, + "loss": 0.5068, + "step": 73824 + }, + { + "epoch": 0.92284807120178, + "grad_norm": 0.17240384221076965, + "learning_rate": 3.604576317087982e-07, + "loss": 0.5249, + "step": 73826 + }, + { + "epoch": 0.9228730718267957, + "grad_norm": 3.525439739227295, + "learning_rate": 3.602254737544031e-07, + "loss": 1.3553, + "step": 73828 + }, + { + "epoch": 0.9228980724518113, + "grad_norm": 0.4449099004268646, + "learning_rate": 3.599933892151675e-07, + "loss": 0.0136, + "step": 73830 + }, + { + "epoch": 0.922923073076827, + "grad_norm": 3.286327600479126, + "learning_rate": 3.597613780928599e-07, + "loss": 1.5241, + "step": 73832 + }, + { + "epoch": 0.9229480737018425, + "grad_norm": 2.219949722290039, + "learning_rate": 3.5952944038924687e-07, + "loss": 0.5886, + "step": 73834 + }, + { + "epoch": 0.9229730743268582, + "grad_norm": 1.6622376441955566, + "learning_rate": 3.592975761060924e-07, + "loss": 0.3232, + "step": 73836 + }, + { + "epoch": 0.9229980749518738, + "grad_norm": 6.328354358673096, + "learning_rate": 3.5906578524516623e-07, + "loss": 0.9801, + "step": 73838 + }, + { + "epoch": 0.9230230755768895, + "grad_norm": 0.28583022952079773, + "learning_rate": 3.588340678082292e-07, + "loss": 1.4399, + "step": 73840 + }, + { + "epoch": 0.9230480762019051, + "grad_norm": 2.724482774734497, + "learning_rate": 3.586024237970509e-07, + "loss": 1.3321, + "step": 73842 + }, + { + "epoch": 0.9230730768269206, + "grad_norm": 5.359805107116699, + "learning_rate": 3.583708532133912e-07, + "loss": 1.1682, + "step": 73844 + }, + { + "epoch": 0.9230980774519363, + "grad_norm": 2.7667925357818604, + "learning_rate": 3.581393560590174e-07, + "loss": 0.8002, + "step": 73846 + }, + { + "epoch": 0.9231230780769519, + "grad_norm": 0.00048352565499953926, + "learning_rate": 3.5790793233569046e-07, + "loss": 0.4758, + "step": 73848 + }, + { + "epoch": 0.9231480787019676, + "grad_norm": 2.857849597930908, + "learning_rate": 3.576765820451722e-07, + "loss": 1.8906, + "step": 73850 + }, + { + "epoch": 0.9231730793269832, + "grad_norm": 0.0005508398171514273, + "learning_rate": 3.574453051892268e-07, + "loss": 0.0247, + "step": 73852 + }, + { + "epoch": 0.9231980799519988, + "grad_norm": 2.8484745025634766, + "learning_rate": 3.572141017696129e-07, + "loss": 1.787, + "step": 73854 + }, + { + "epoch": 0.9232230805770144, + "grad_norm": 2.5505011081695557, + "learning_rate": 3.569829717880946e-07, + "loss": 0.3612, + "step": 73856 + }, + { + "epoch": 0.9232480812020301, + "grad_norm": 0.0005491377087309957, + "learning_rate": 3.567519152464294e-07, + "loss": 0.017, + "step": 73858 + }, + { + "epoch": 0.9232730818270457, + "grad_norm": 2.3834409713745117, + "learning_rate": 3.5652093214637694e-07, + "loss": 0.6595, + "step": 73860 + }, + { + "epoch": 0.9232980824520614, + "grad_norm": 3.235391616821289, + "learning_rate": 3.5629002248969923e-07, + "loss": 0.5151, + "step": 73862 + }, + { + "epoch": 0.9233230830770769, + "grad_norm": 0.0003773546195589006, + "learning_rate": 3.560591862781515e-07, + "loss": 0.038, + "step": 73864 + }, + { + "epoch": 0.9233480837020925, + "grad_norm": 0.12860293686389923, + "learning_rate": 3.5582842351349453e-07, + "loss": 0.4435, + "step": 73866 + }, + { + "epoch": 0.9233730843271082, + "grad_norm": 3.3607335090637207, + "learning_rate": 3.555977341974826e-07, + "loss": 0.8269, + "step": 73868 + }, + { + "epoch": 0.9233980849521238, + "grad_norm": 0.00032886056578718126, + "learning_rate": 3.553671183318763e-07, + "loss": 0.3676, + "step": 73870 + }, + { + "epoch": 0.9234230855771395, + "grad_norm": 2.963456153869629, + "learning_rate": 3.5513657591843e-07, + "loss": 1.3527, + "step": 73872 + }, + { + "epoch": 0.923448086202155, + "grad_norm": 1.23800528049469, + "learning_rate": 3.5490610695889773e-07, + "loss": 1.2315, + "step": 73874 + }, + { + "epoch": 0.9234730868271707, + "grad_norm": 4.559752941131592, + "learning_rate": 3.5467571145503923e-07, + "loss": 0.3562, + "step": 73876 + }, + { + "epoch": 0.9234980874521863, + "grad_norm": 0.0005037762457504869, + "learning_rate": 3.544453894086042e-07, + "loss": 0.8073, + "step": 73878 + }, + { + "epoch": 0.923523088077202, + "grad_norm": 3.750048875808716, + "learning_rate": 3.5421514082135123e-07, + "loss": 0.7385, + "step": 73880 + }, + { + "epoch": 0.9235480887022176, + "grad_norm": 3.8840696811676025, + "learning_rate": 3.539849656950289e-07, + "loss": 0.5533, + "step": 73882 + }, + { + "epoch": 0.9235730893272331, + "grad_norm": 4.665553092956543, + "learning_rate": 3.5375486403139593e-07, + "loss": 1.6326, + "step": 73884 + }, + { + "epoch": 0.9235980899522488, + "grad_norm": 3.1220808029174805, + "learning_rate": 3.535248358322008e-07, + "loss": 0.847, + "step": 73886 + }, + { + "epoch": 0.9236230905772644, + "grad_norm": 2.5293972492218018, + "learning_rate": 3.532948810991954e-07, + "loss": 0.9727, + "step": 73888 + }, + { + "epoch": 0.9236480912022801, + "grad_norm": 4.256239414215088, + "learning_rate": 3.53064999834134e-07, + "loss": 0.3243, + "step": 73890 + }, + { + "epoch": 0.9236730918272957, + "grad_norm": 6.404379367828369, + "learning_rate": 3.5283519203876293e-07, + "loss": 1.2648, + "step": 73892 + }, + { + "epoch": 0.9236980924523113, + "grad_norm": 2.5378501415252686, + "learning_rate": 3.526054577148363e-07, + "loss": 0.6527, + "step": 73894 + }, + { + "epoch": 0.9237230930773269, + "grad_norm": 6.934049606323242, + "learning_rate": 3.5237579686410175e-07, + "loss": 0.759, + "step": 73896 + }, + { + "epoch": 0.9237480937023426, + "grad_norm": 0.00037521778722293675, + "learning_rate": 3.521462094883099e-07, + "loss": 0.4502, + "step": 73898 + }, + { + "epoch": 0.9237730943273582, + "grad_norm": 2.5593082904815674, + "learning_rate": 3.5191669558920725e-07, + "loss": 0.5206, + "step": 73900 + }, + { + "epoch": 0.9237980949523739, + "grad_norm": 3.0477302074432373, + "learning_rate": 3.516872551685424e-07, + "loss": 0.6207, + "step": 73902 + }, + { + "epoch": 0.9238230955773894, + "grad_norm": 3.2409818172454834, + "learning_rate": 3.5145788822806395e-07, + "loss": 0.7152, + "step": 73904 + }, + { + "epoch": 0.923848096202405, + "grad_norm": 3.651310682296753, + "learning_rate": 3.5122859476951597e-07, + "loss": 1.4225, + "step": 73906 + }, + { + "epoch": 0.9238730968274207, + "grad_norm": 2.661980390548706, + "learning_rate": 3.509993747946494e-07, + "loss": 0.9952, + "step": 73908 + }, + { + "epoch": 0.9238980974524363, + "grad_norm": 5.036074161529541, + "learning_rate": 3.507702283052039e-07, + "loss": 0.3804, + "step": 73910 + }, + { + "epoch": 0.923923098077452, + "grad_norm": 3.1002426147460938, + "learning_rate": 3.505411553029303e-07, + "loss": 1.2236, + "step": 73912 + }, + { + "epoch": 0.9239480987024675, + "grad_norm": 0.0007293170201592147, + "learning_rate": 3.5031215578957055e-07, + "loss": 0.0401, + "step": 73914 + }, + { + "epoch": 0.9239730993274832, + "grad_norm": 3.8066704273223877, + "learning_rate": 3.500832297668677e-07, + "loss": 1.7249, + "step": 73916 + }, + { + "epoch": 0.9239980999524988, + "grad_norm": 4.279597282409668, + "learning_rate": 3.49854377236567e-07, + "loss": 0.369, + "step": 73918 + }, + { + "epoch": 0.9240231005775145, + "grad_norm": 2.680988311767578, + "learning_rate": 3.496255982004093e-07, + "loss": 0.8466, + "step": 73920 + }, + { + "epoch": 0.9240481012025301, + "grad_norm": 3.05586576461792, + "learning_rate": 3.493968926601399e-07, + "loss": 0.2467, + "step": 73922 + }, + { + "epoch": 0.9240731018275457, + "grad_norm": 2.7726359367370605, + "learning_rate": 3.4916826061749955e-07, + "loss": 0.3661, + "step": 73924 + }, + { + "epoch": 0.9240981024525613, + "grad_norm": 3.0264525413513184, + "learning_rate": 3.48939702074228e-07, + "loss": 0.503, + "step": 73926 + }, + { + "epoch": 0.924123103077577, + "grad_norm": 3.3893864154815674, + "learning_rate": 3.487112170320672e-07, + "loss": 1.0901, + "step": 73928 + }, + { + "epoch": 0.9241481037025926, + "grad_norm": 0.21426638960838318, + "learning_rate": 3.484828054927558e-07, + "loss": 0.3125, + "step": 73930 + }, + { + "epoch": 0.9241731043276082, + "grad_norm": 4.5229926109313965, + "learning_rate": 3.4825446745803684e-07, + "loss": 1.2345, + "step": 73932 + }, + { + "epoch": 0.9241981049526238, + "grad_norm": 3.3154335021972656, + "learning_rate": 3.480262029296444e-07, + "loss": 0.8293, + "step": 73934 + }, + { + "epoch": 0.9242231055776394, + "grad_norm": 2.3287293910980225, + "learning_rate": 3.477980119093205e-07, + "loss": 1.3358, + "step": 73936 + }, + { + "epoch": 0.9242481062026551, + "grad_norm": 3.0456275939941406, + "learning_rate": 3.475698943988037e-07, + "loss": 1.2198, + "step": 73938 + }, + { + "epoch": 0.9242731068276707, + "grad_norm": 3.0533087253570557, + "learning_rate": 3.4734185039982716e-07, + "loss": 0.6031, + "step": 73940 + }, + { + "epoch": 0.9242981074526864, + "grad_norm": 2.988574266433716, + "learning_rate": 3.471138799141316e-07, + "loss": 1.1422, + "step": 73942 + }, + { + "epoch": 0.9243231080777019, + "grad_norm": 5.633120059967041, + "learning_rate": 3.468859829434501e-07, + "loss": 0.6764, + "step": 73944 + }, + { + "epoch": 0.9243481087027176, + "grad_norm": 3.2887911796569824, + "learning_rate": 3.466581594895202e-07, + "loss": 0.9006, + "step": 73946 + }, + { + "epoch": 0.9243731093277332, + "grad_norm": 2.6801955699920654, + "learning_rate": 3.4643040955407825e-07, + "loss": 0.621, + "step": 73948 + }, + { + "epoch": 0.9243981099527488, + "grad_norm": 5.122457027435303, + "learning_rate": 3.462027331388573e-07, + "loss": 1.282, + "step": 73950 + }, + { + "epoch": 0.9244231105777645, + "grad_norm": 4.801807403564453, + "learning_rate": 3.459751302455905e-07, + "loss": 0.8965, + "step": 73952 + }, + { + "epoch": 0.92444811120278, + "grad_norm": 8.463688850402832, + "learning_rate": 3.457476008760108e-07, + "loss": 1.2199, + "step": 73954 + }, + { + "epoch": 0.9244731118277957, + "grad_norm": 1.8371182680130005, + "learning_rate": 3.455201450318535e-07, + "loss": 1.0526, + "step": 73956 + }, + { + "epoch": 0.9244981124528113, + "grad_norm": 3.415313482284546, + "learning_rate": 3.452927627148483e-07, + "loss": 0.6453, + "step": 73958 + }, + { + "epoch": 0.924523113077827, + "grad_norm": 6.657103061676025, + "learning_rate": 3.450654539267273e-07, + "loss": 0.2047, + "step": 73960 + }, + { + "epoch": 0.9245481137028426, + "grad_norm": 6.3108062744140625, + "learning_rate": 3.4483821866922454e-07, + "loss": 2.4768, + "step": 73962 + }, + { + "epoch": 0.9245731143278582, + "grad_norm": 3.860114812850952, + "learning_rate": 3.446110569440686e-07, + "loss": 1.1604, + "step": 73964 + }, + { + "epoch": 0.9245981149528738, + "grad_norm": 3.527318000793457, + "learning_rate": 3.4438396875298927e-07, + "loss": 1.0771, + "step": 73966 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 6.519839763641357, + "learning_rate": 3.441569540977152e-07, + "loss": 0.9496, + "step": 73968 + }, + { + "epoch": 0.9246481162029051, + "grad_norm": 2.2628061771392822, + "learning_rate": 3.439300129799772e-07, + "loss": 0.7305, + "step": 73970 + }, + { + "epoch": 0.9246731168279207, + "grad_norm": 0.19012324512004852, + "learning_rate": 3.437031454015016e-07, + "loss": 0.0311, + "step": 73972 + }, + { + "epoch": 0.9246981174529363, + "grad_norm": 3.614095449447632, + "learning_rate": 3.434763513640171e-07, + "loss": 1.6019, + "step": 73974 + }, + { + "epoch": 0.9247231180779519, + "grad_norm": 0.04322730377316475, + "learning_rate": 3.4324963086925347e-07, + "loss": 0.1188, + "step": 73976 + }, + { + "epoch": 0.9247481187029676, + "grad_norm": 2.907019853591919, + "learning_rate": 3.4302298391893364e-07, + "loss": 1.6231, + "step": 73978 + }, + { + "epoch": 0.9247731193279832, + "grad_norm": 3.1463170051574707, + "learning_rate": 3.427964105147852e-07, + "loss": 0.7991, + "step": 73980 + }, + { + "epoch": 0.9247981199529989, + "grad_norm": 3.02372145652771, + "learning_rate": 3.425699106585334e-07, + "loss": 0.7764, + "step": 73982 + }, + { + "epoch": 0.9248231205780144, + "grad_norm": 3.671132802963257, + "learning_rate": 3.4234348435190245e-07, + "loss": 0.7054, + "step": 73984 + }, + { + "epoch": 0.9248481212030301, + "grad_norm": 2.281754732131958, + "learning_rate": 3.4211713159661985e-07, + "loss": 1.5719, + "step": 73986 + }, + { + "epoch": 0.9248731218280457, + "grad_norm": 3.080305337905884, + "learning_rate": 3.4189085239440646e-07, + "loss": 0.6179, + "step": 73988 + }, + { + "epoch": 0.9248981224530614, + "grad_norm": 4.615530014038086, + "learning_rate": 3.416646467469875e-07, + "loss": 1.0449, + "step": 73990 + }, + { + "epoch": 0.924923123078077, + "grad_norm": 5.745196342468262, + "learning_rate": 3.414385146560839e-07, + "loss": 1.1327, + "step": 73992 + }, + { + "epoch": 0.9249481237030925, + "grad_norm": 0.00023049379524309188, + "learning_rate": 3.4121245612341867e-07, + "loss": 0.3965, + "step": 73994 + }, + { + "epoch": 0.9249731243281082, + "grad_norm": 2.8778445720672607, + "learning_rate": 3.4098647115071384e-07, + "loss": 0.7814, + "step": 73996 + }, + { + "epoch": 0.9249981249531238, + "grad_norm": 3.2711474895477295, + "learning_rate": 3.4076055973968905e-07, + "loss": 2.5485, + "step": 73998 + }, + { + "epoch": 0.9250231255781395, + "grad_norm": 7.863706588745117, + "learning_rate": 3.405347218920674e-07, + "loss": 0.4624, + "step": 74000 + }, + { + "epoch": 0.9250481262031551, + "grad_norm": 4.2868781089782715, + "learning_rate": 3.403089576095675e-07, + "loss": 1.3554, + "step": 74002 + }, + { + "epoch": 0.9250731268281707, + "grad_norm": 2.212825298309326, + "learning_rate": 3.40083266893908e-07, + "loss": 0.9679, + "step": 74004 + }, + { + "epoch": 0.9250981274531863, + "grad_norm": 1.6062220335006714, + "learning_rate": 3.398576497468076e-07, + "loss": 0.2008, + "step": 74006 + }, + { + "epoch": 0.925123128078202, + "grad_norm": 1.8256332874298096, + "learning_rate": 3.396321061699848e-07, + "loss": 0.9795, + "step": 74008 + }, + { + "epoch": 0.9251481287032176, + "grad_norm": 2.6249685287475586, + "learning_rate": 3.394066361651593e-07, + "loss": 0.5169, + "step": 74010 + }, + { + "epoch": 0.9251731293282333, + "grad_norm": 5.109199523925781, + "learning_rate": 3.3918123973404547e-07, + "loss": 1.0391, + "step": 74012 + }, + { + "epoch": 0.9251981299532488, + "grad_norm": 5.353515148162842, + "learning_rate": 3.389559168783629e-07, + "loss": 0.8421, + "step": 74014 + }, + { + "epoch": 0.9252231305782644, + "grad_norm": 0.8158013224601746, + "learning_rate": 3.3873066759982474e-07, + "loss": 0.1989, + "step": 74016 + }, + { + "epoch": 0.9252481312032801, + "grad_norm": 4.8275604248046875, + "learning_rate": 3.3850549190014846e-07, + "loss": 0.2858, + "step": 74018 + }, + { + "epoch": 0.9252731318282957, + "grad_norm": 3.752451181411743, + "learning_rate": 3.3828038978104603e-07, + "loss": 1.3396, + "step": 74020 + }, + { + "epoch": 0.9252981324533114, + "grad_norm": 2.512702465057373, + "learning_rate": 3.3805536124423497e-07, + "loss": 0.4863, + "step": 74022 + }, + { + "epoch": 0.9253231330783269, + "grad_norm": 4.951037406921387, + "learning_rate": 3.378304062914284e-07, + "loss": 1.6985, + "step": 74024 + }, + { + "epoch": 0.9253481337033426, + "grad_norm": 3.2138214111328125, + "learning_rate": 3.376055249243382e-07, + "loss": 0.8277, + "step": 74026 + }, + { + "epoch": 0.9253731343283582, + "grad_norm": 0.00024479784769937396, + "learning_rate": 3.3738071714468077e-07, + "loss": 0.2165, + "step": 74028 + }, + { + "epoch": 0.9253981349533739, + "grad_norm": 4.505611419677734, + "learning_rate": 3.371559829541626e-07, + "loss": 0.321, + "step": 74030 + }, + { + "epoch": 0.9254231355783895, + "grad_norm": 1.6711550951004028, + "learning_rate": 3.36931322354499e-07, + "loss": 1.1492, + "step": 74032 + }, + { + "epoch": 0.925448136203405, + "grad_norm": 3.9567317962646484, + "learning_rate": 3.367067353473985e-07, + "loss": 1.877, + "step": 74034 + }, + { + "epoch": 0.9254731368284207, + "grad_norm": 5.975855350494385, + "learning_rate": 3.3648222193457424e-07, + "loss": 1.3373, + "step": 74036 + }, + { + "epoch": 0.9254981374534363, + "grad_norm": 2.394129991531372, + "learning_rate": 3.3625778211773485e-07, + "loss": 0.5059, + "step": 74038 + }, + { + "epoch": 0.925523138078452, + "grad_norm": 8.178505897521973, + "learning_rate": 3.3603341589858896e-07, + "loss": 1.1999, + "step": 74040 + }, + { + "epoch": 0.9255481387034676, + "grad_norm": 5.150544166564941, + "learning_rate": 3.358091232788474e-07, + "loss": 1.574, + "step": 74042 + }, + { + "epoch": 0.9255731393284832, + "grad_norm": 1.8053786754608154, + "learning_rate": 3.355849042602155e-07, + "loss": 0.4653, + "step": 74044 + }, + { + "epoch": 0.9255981399534988, + "grad_norm": 5.646595001220703, + "learning_rate": 3.353607588444019e-07, + "loss": 1.1715, + "step": 74046 + }, + { + "epoch": 0.9256231405785145, + "grad_norm": 2.038161277770996, + "learning_rate": 3.351366870331152e-07, + "loss": 0.8163, + "step": 74048 + }, + { + "epoch": 0.9256481412035301, + "grad_norm": 1.7780146598815918, + "learning_rate": 3.349126888280607e-07, + "loss": 0.5252, + "step": 74050 + }, + { + "epoch": 0.9256731418285458, + "grad_norm": 4.810639381408691, + "learning_rate": 3.3468876423094375e-07, + "loss": 1.1417, + "step": 74052 + }, + { + "epoch": 0.9256981424535613, + "grad_norm": 3.5063116550445557, + "learning_rate": 3.3446491324347186e-07, + "loss": 0.8651, + "step": 74054 + }, + { + "epoch": 0.925723143078577, + "grad_norm": 3.084890604019165, + "learning_rate": 3.342411358673481e-07, + "loss": 1.3441, + "step": 74056 + }, + { + "epoch": 0.9257481437035926, + "grad_norm": 3.823641300201416, + "learning_rate": 3.340174321042755e-07, + "loss": 1.3594, + "step": 74058 + }, + { + "epoch": 0.9257731443286082, + "grad_norm": 0.21578247845172882, + "learning_rate": 3.337938019559595e-07, + "loss": 0.3178, + "step": 74060 + }, + { + "epoch": 0.9257981449536239, + "grad_norm": 5.008357048034668, + "learning_rate": 3.3357024542410297e-07, + "loss": 2.0448, + "step": 74062 + }, + { + "epoch": 0.9258231455786394, + "grad_norm": 0.017402641475200653, + "learning_rate": 3.333467625104081e-07, + "loss": 0.6595, + "step": 74064 + }, + { + "epoch": 0.9258481462036551, + "grad_norm": 9.665699005126953, + "learning_rate": 3.3312335321657896e-07, + "loss": 2.597, + "step": 74066 + }, + { + "epoch": 0.9258731468286707, + "grad_norm": 3.6797468662261963, + "learning_rate": 3.329000175443153e-07, + "loss": 1.033, + "step": 74068 + }, + { + "epoch": 0.9258981474536864, + "grad_norm": 6.987462520599365, + "learning_rate": 3.32676755495317e-07, + "loss": 0.3764, + "step": 74070 + }, + { + "epoch": 0.925923148078702, + "grad_norm": 4.755854606628418, + "learning_rate": 3.3245356707128583e-07, + "loss": 1.9439, + "step": 74072 + }, + { + "epoch": 0.9259481487037176, + "grad_norm": 7.785167694091797, + "learning_rate": 3.322304522739206e-07, + "loss": 1.7077, + "step": 74074 + }, + { + "epoch": 0.9259731493287332, + "grad_norm": 6.986399173736572, + "learning_rate": 3.320074111049221e-07, + "loss": 1.4583, + "step": 74076 + }, + { + "epoch": 0.9259981499537489, + "grad_norm": 3.626105546951294, + "learning_rate": 3.317844435659867e-07, + "loss": 1.3569, + "step": 74078 + }, + { + "epoch": 0.9260231505787645, + "grad_norm": 3.7396583557128906, + "learning_rate": 3.315615496588154e-07, + "loss": 1.3815, + "step": 74080 + }, + { + "epoch": 0.9260481512037801, + "grad_norm": 4.571283340454102, + "learning_rate": 3.3133872938510337e-07, + "loss": 1.2518, + "step": 74082 + }, + { + "epoch": 0.9260731518287957, + "grad_norm": 3.3213400840759277, + "learning_rate": 3.311159827465471e-07, + "loss": 0.4028, + "step": 74084 + }, + { + "epoch": 0.9260981524538113, + "grad_norm": 2.926147222518921, + "learning_rate": 3.3089330974484526e-07, + "loss": 0.9498, + "step": 74086 + }, + { + "epoch": 0.926123153078827, + "grad_norm": 3.692307233810425, + "learning_rate": 3.3067071038169196e-07, + "loss": 1.6195, + "step": 74088 + }, + { + "epoch": 0.9261481537038426, + "grad_norm": 2.3215601444244385, + "learning_rate": 3.304481846587848e-07, + "loss": 1.2475, + "step": 74090 + }, + { + "epoch": 0.9261731543288583, + "grad_norm": 3.0169312953948975, + "learning_rate": 3.3022573257781463e-07, + "loss": 0.7027, + "step": 74092 + }, + { + "epoch": 0.9261981549538738, + "grad_norm": 3.7256875038146973, + "learning_rate": 3.300033541404801e-07, + "loss": 0.859, + "step": 74094 + }, + { + "epoch": 0.9262231555788895, + "grad_norm": 1.1767078638076782, + "learning_rate": 3.297810493484721e-07, + "loss": 0.6725, + "step": 74096 + }, + { + "epoch": 0.9262481562039051, + "grad_norm": 3.8522119522094727, + "learning_rate": 3.2955881820348256e-07, + "loss": 1.2635, + "step": 74098 + }, + { + "epoch": 0.9262731568289208, + "grad_norm": 0.0002606109483167529, + "learning_rate": 3.2933666070720794e-07, + "loss": 0.0006, + "step": 74100 + }, + { + "epoch": 0.9262981574539364, + "grad_norm": 4.723198890686035, + "learning_rate": 3.291145768613357e-07, + "loss": 1.3677, + "step": 74102 + }, + { + "epoch": 0.9263231580789519, + "grad_norm": 3.4862003326416016, + "learning_rate": 3.288925666675602e-07, + "loss": 1.3379, + "step": 74104 + }, + { + "epoch": 0.9263481587039676, + "grad_norm": 2.510798215866089, + "learning_rate": 3.2867063012757105e-07, + "loss": 1.3645, + "step": 74106 + }, + { + "epoch": 0.9263731593289832, + "grad_norm": 3.465529203414917, + "learning_rate": 3.284487672430581e-07, + "loss": 0.2416, + "step": 74108 + }, + { + "epoch": 0.9263981599539989, + "grad_norm": 2.5492687225341797, + "learning_rate": 3.282269780157132e-07, + "loss": 0.895, + "step": 74110 + }, + { + "epoch": 0.9264231605790145, + "grad_norm": 3.9557430744171143, + "learning_rate": 3.2800526244722186e-07, + "loss": 0.9239, + "step": 74112 + }, + { + "epoch": 0.9264481612040301, + "grad_norm": 6.0891571044921875, + "learning_rate": 3.27783620539277e-07, + "loss": 1.1646, + "step": 74114 + }, + { + "epoch": 0.9264731618290457, + "grad_norm": 3.022365093231201, + "learning_rate": 3.2756205229356186e-07, + "loss": 1.0624, + "step": 74116 + }, + { + "epoch": 0.9264981624540614, + "grad_norm": 0.003912654705345631, + "learning_rate": 3.273405577117683e-07, + "loss": 0.8556, + "step": 74118 + }, + { + "epoch": 0.926523163079077, + "grad_norm": 2.52934193611145, + "learning_rate": 3.271191367955806e-07, + "loss": 0.5059, + "step": 74120 + }, + { + "epoch": 0.9265481637040927, + "grad_norm": 2.9145169258117676, + "learning_rate": 3.2689778954668514e-07, + "loss": 1.4809, + "step": 74122 + }, + { + "epoch": 0.9265731643291082, + "grad_norm": 2.5644712448120117, + "learning_rate": 3.2667651596676955e-07, + "loss": 1.0942, + "step": 74124 + }, + { + "epoch": 0.9265981649541238, + "grad_norm": 3.2603538036346436, + "learning_rate": 3.264553160575157e-07, + "loss": 0.9263, + "step": 74126 + }, + { + "epoch": 0.9266231655791395, + "grad_norm": 2.1144347190856934, + "learning_rate": 3.262341898206123e-07, + "loss": 0.3095, + "step": 74128 + }, + { + "epoch": 0.9266481662041551, + "grad_norm": 2.314552068710327, + "learning_rate": 3.260131372577391e-07, + "loss": 1.3726, + "step": 74130 + }, + { + "epoch": 0.9266731668291708, + "grad_norm": 3.5494682788848877, + "learning_rate": 3.2579215837058364e-07, + "loss": 0.6106, + "step": 74132 + }, + { + "epoch": 0.9266981674541863, + "grad_norm": 2.26932430267334, + "learning_rate": 3.2557125316082685e-07, + "loss": 0.8638, + "step": 74134 + }, + { + "epoch": 0.926723168079202, + "grad_norm": 5.34711217880249, + "learning_rate": 3.253504216301506e-07, + "loss": 0.4665, + "step": 74136 + }, + { + "epoch": 0.9267481687042176, + "grad_norm": 1.2387367486953735, + "learning_rate": 3.2512966378023815e-07, + "loss": 1.2587, + "step": 74138 + }, + { + "epoch": 0.9267731693292333, + "grad_norm": 2.468816041946411, + "learning_rate": 3.249089796127691e-07, + "loss": 0.8082, + "step": 74140 + }, + { + "epoch": 0.9267981699542489, + "grad_norm": 0.008286084979772568, + "learning_rate": 3.246883691294267e-07, + "loss": 0.5851, + "step": 74142 + }, + { + "epoch": 0.9268231705792644, + "grad_norm": 0.6677560806274414, + "learning_rate": 3.2446783233188727e-07, + "loss": 0.0225, + "step": 74144 + }, + { + "epoch": 0.9268481712042801, + "grad_norm": 2.325817823410034, + "learning_rate": 3.2424736922183395e-07, + "loss": 0.6651, + "step": 74146 + }, + { + "epoch": 0.9268731718292957, + "grad_norm": 7.396994113922119, + "learning_rate": 3.240269798009443e-07, + "loss": 2.0273, + "step": 74148 + }, + { + "epoch": 0.9268981724543114, + "grad_norm": 3.8632514476776123, + "learning_rate": 3.23806664070897e-07, + "loss": 1.1472, + "step": 74150 + }, + { + "epoch": 0.926923173079327, + "grad_norm": 4.093138217926025, + "learning_rate": 3.2358642203336954e-07, + "loss": 1.06, + "step": 74152 + }, + { + "epoch": 0.9269481737043426, + "grad_norm": 3.2479476928710938, + "learning_rate": 3.233662536900384e-07, + "loss": 0.5762, + "step": 74154 + }, + { + "epoch": 0.9269731743293582, + "grad_norm": 4.514893531799316, + "learning_rate": 3.231461590425833e-07, + "loss": 1.2809, + "step": 74156 + }, + { + "epoch": 0.9269981749543739, + "grad_norm": 4.282078742980957, + "learning_rate": 3.229261380926774e-07, + "loss": 0.7828, + "step": 74158 + }, + { + "epoch": 0.9270231755793895, + "grad_norm": 2.4533095359802246, + "learning_rate": 3.227061908419982e-07, + "loss": 0.0656, + "step": 74160 + }, + { + "epoch": 0.9270481762044052, + "grad_norm": 4.825974941253662, + "learning_rate": 3.22486317292221e-07, + "loss": 1.1897, + "step": 74162 + }, + { + "epoch": 0.9270731768294207, + "grad_norm": 2.0919957160949707, + "learning_rate": 3.2226651744501793e-07, + "loss": 0.1491, + "step": 74164 + }, + { + "epoch": 0.9270981774544363, + "grad_norm": 1.8537307977676392, + "learning_rate": 3.220467913020653e-07, + "loss": 0.192, + "step": 74166 + }, + { + "epoch": 0.927123178079452, + "grad_norm": 3.0920238494873047, + "learning_rate": 3.2182713886503516e-07, + "loss": 0.8945, + "step": 74168 + }, + { + "epoch": 0.9271481787044676, + "grad_norm": 3.651093006134033, + "learning_rate": 3.216075601356017e-07, + "loss": 0.7065, + "step": 74170 + }, + { + "epoch": 0.9271731793294833, + "grad_norm": 7.411417007446289, + "learning_rate": 3.213880551154358e-07, + "loss": 1.3808, + "step": 74172 + }, + { + "epoch": 0.9271981799544988, + "grad_norm": 1.6613080501556396, + "learning_rate": 3.211686238062095e-07, + "loss": 1.2249, + "step": 74174 + }, + { + "epoch": 0.9272231805795145, + "grad_norm": 0.0009129870450124145, + "learning_rate": 3.209492662095959e-07, + "loss": 0.0637, + "step": 74176 + }, + { + "epoch": 0.9272481812045301, + "grad_norm": 4.694689750671387, + "learning_rate": 3.2072998232726137e-07, + "loss": 0.5416, + "step": 74178 + }, + { + "epoch": 0.9272731818295458, + "grad_norm": 0.0003763374115806073, + "learning_rate": 3.2051077216088025e-07, + "loss": 0.6506, + "step": 74180 + }, + { + "epoch": 0.9272981824545614, + "grad_norm": 6.609662055969238, + "learning_rate": 3.2029163571211994e-07, + "loss": 0.6618, + "step": 74182 + }, + { + "epoch": 0.927323183079577, + "grad_norm": 0.0004757392161991447, + "learning_rate": 3.200725729826504e-07, + "loss": 0.0262, + "step": 74184 + }, + { + "epoch": 0.9273481837045926, + "grad_norm": 0.00032666409970261157, + "learning_rate": 3.19853583974139e-07, + "loss": 0.5067, + "step": 74186 + }, + { + "epoch": 0.9273731843296082, + "grad_norm": 3.360860586166382, + "learning_rate": 3.1963466868825234e-07, + "loss": 1.2837, + "step": 74188 + }, + { + "epoch": 0.9273981849546239, + "grad_norm": 13.90356159210205, + "learning_rate": 3.194158271266601e-07, + "loss": 1.3466, + "step": 74190 + }, + { + "epoch": 0.9274231855796395, + "grad_norm": 2.714768886566162, + "learning_rate": 3.1919705929102763e-07, + "loss": 1.1392, + "step": 74192 + }, + { + "epoch": 0.9274481862046551, + "grad_norm": 4.243574142456055, + "learning_rate": 3.189783651830225e-07, + "loss": 0.87, + "step": 74194 + }, + { + "epoch": 0.9274731868296707, + "grad_norm": 2.3722269535064697, + "learning_rate": 3.187597448043078e-07, + "loss": 0.4763, + "step": 74196 + }, + { + "epoch": 0.9274981874546864, + "grad_norm": 2.1167259216308594, + "learning_rate": 3.1854119815655117e-07, + "loss": 0.9639, + "step": 74198 + }, + { + "epoch": 0.927523188079702, + "grad_norm": 2.23405122756958, + "learning_rate": 3.183227252414145e-07, + "loss": 1.2657, + "step": 74200 + }, + { + "epoch": 0.9275481887047177, + "grad_norm": 4.232844829559326, + "learning_rate": 3.1810432606056317e-07, + "loss": 1.3733, + "step": 74202 + }, + { + "epoch": 0.9275731893297332, + "grad_norm": 2.323255777359009, + "learning_rate": 3.1788600061566033e-07, + "loss": 0.5412, + "step": 74204 + }, + { + "epoch": 0.9275981899547489, + "grad_norm": 0.8272014260292053, + "learning_rate": 3.176677489083679e-07, + "loss": 0.851, + "step": 74206 + }, + { + "epoch": 0.9276231905797645, + "grad_norm": 3.1167171001434326, + "learning_rate": 3.1744957094035025e-07, + "loss": 1.0773, + "step": 74208 + }, + { + "epoch": 0.9276481912047801, + "grad_norm": 2.4106411933898926, + "learning_rate": 3.1723146671326586e-07, + "loss": 0.9923, + "step": 74210 + }, + { + "epoch": 0.9276731918297958, + "grad_norm": 0.30893006920814514, + "learning_rate": 3.17013436228778e-07, + "loss": 0.5872, + "step": 74212 + }, + { + "epoch": 0.9276981924548113, + "grad_norm": 0.882559597492218, + "learning_rate": 3.167954794885464e-07, + "loss": 0.0367, + "step": 74214 + }, + { + "epoch": 0.927723193079827, + "grad_norm": 3.2569785118103027, + "learning_rate": 3.165775964942308e-07, + "loss": 1.3475, + "step": 74216 + }, + { + "epoch": 0.9277481937048426, + "grad_norm": 3.229138135910034, + "learning_rate": 3.1635978724749215e-07, + "loss": 1.4528, + "step": 74218 + }, + { + "epoch": 0.9277731943298583, + "grad_norm": 4.875815391540527, + "learning_rate": 3.161420517499858e-07, + "loss": 1.2068, + "step": 74220 + }, + { + "epoch": 0.9277981949548739, + "grad_norm": 3.3579373359680176, + "learning_rate": 3.159243900033737e-07, + "loss": 1.5359, + "step": 74222 + }, + { + "epoch": 0.9278231955798895, + "grad_norm": 0.0002816494961734861, + "learning_rate": 3.1570680200931234e-07, + "loss": 0.2076, + "step": 74224 + }, + { + "epoch": 0.9278481962049051, + "grad_norm": 4.805879592895508, + "learning_rate": 3.154892877694571e-07, + "loss": 1.6728, + "step": 74226 + }, + { + "epoch": 0.9278731968299208, + "grad_norm": 2.0552637577056885, + "learning_rate": 3.1527184728546765e-07, + "loss": 0.2416, + "step": 74228 + }, + { + "epoch": 0.9278981974549364, + "grad_norm": 0.20421166718006134, + "learning_rate": 3.150544805589961e-07, + "loss": 0.5533, + "step": 74230 + }, + { + "epoch": 0.927923198079952, + "grad_norm": 0.030754465609788895, + "learning_rate": 3.148371875917022e-07, + "loss": 0.5168, + "step": 74232 + }, + { + "epoch": 0.9279481987049676, + "grad_norm": 3.780100107192993, + "learning_rate": 3.146199683852369e-07, + "loss": 0.5146, + "step": 74234 + }, + { + "epoch": 0.9279731993299832, + "grad_norm": 6.062312126159668, + "learning_rate": 3.1440282294125766e-07, + "loss": 1.5764, + "step": 74236 + }, + { + "epoch": 0.9279981999549989, + "grad_norm": 5.202154159545898, + "learning_rate": 3.141857512614166e-07, + "loss": 1.0927, + "step": 74238 + }, + { + "epoch": 0.9280232005800145, + "grad_norm": 2.3389313220977783, + "learning_rate": 3.1396875334736676e-07, + "loss": 0.5766, + "step": 74240 + }, + { + "epoch": 0.9280482012050302, + "grad_norm": 0.000724516692571342, + "learning_rate": 3.137518292007613e-07, + "loss": 0.8482, + "step": 74242 + }, + { + "epoch": 0.9280732018300457, + "grad_norm": 5.10520076751709, + "learning_rate": 3.135349788232511e-07, + "loss": 1.6388, + "step": 74244 + }, + { + "epoch": 0.9280982024550614, + "grad_norm": 4.28087043762207, + "learning_rate": 3.1331820221649044e-07, + "loss": 0.7888, + "step": 74246 + }, + { + "epoch": 0.928123203080077, + "grad_norm": 0.8206465840339661, + "learning_rate": 3.1310149938212685e-07, + "loss": 0.3289, + "step": 74248 + }, + { + "epoch": 0.9281482037050927, + "grad_norm": 11.399371147155762, + "learning_rate": 3.1288487032181457e-07, + "loss": 0.8505, + "step": 74250 + }, + { + "epoch": 0.9281732043301083, + "grad_norm": 4.431585311889648, + "learning_rate": 3.126683150372001e-07, + "loss": 1.6533, + "step": 74252 + }, + { + "epoch": 0.9281982049551238, + "grad_norm": 2.7416415214538574, + "learning_rate": 3.1245183352993314e-07, + "loss": 0.651, + "step": 74254 + }, + { + "epoch": 0.9282232055801395, + "grad_norm": 0.0017690033419057727, + "learning_rate": 3.1223542580166356e-07, + "loss": 0.2916, + "step": 74256 + }, + { + "epoch": 0.9282482062051551, + "grad_norm": 2.683065176010132, + "learning_rate": 3.120190918540389e-07, + "loss": 0.136, + "step": 74258 + }, + { + "epoch": 0.9282732068301708, + "grad_norm": 2.5674171447753906, + "learning_rate": 3.1180283168870785e-07, + "loss": 0.4291, + "step": 74260 + }, + { + "epoch": 0.9282982074551864, + "grad_norm": 4.0359673500061035, + "learning_rate": 3.115866453073146e-07, + "loss": 0.8943, + "step": 74262 + }, + { + "epoch": 0.928323208080202, + "grad_norm": 5.039588451385498, + "learning_rate": 3.113705327115091e-07, + "loss": 1.4884, + "step": 74264 + }, + { + "epoch": 0.9283482087052176, + "grad_norm": 3.8537650108337402, + "learning_rate": 3.111544939029354e-07, + "loss": 0.8835, + "step": 74266 + }, + { + "epoch": 0.9283732093302333, + "grad_norm": 2.847208023071289, + "learning_rate": 3.1093852888323785e-07, + "loss": 1.7146, + "step": 74268 + }, + { + "epoch": 0.9283982099552489, + "grad_norm": 0.0006697241333313286, + "learning_rate": 3.1072263765406286e-07, + "loss": 0.1729, + "step": 74270 + }, + { + "epoch": 0.9284232105802646, + "grad_norm": 3.8719122409820557, + "learning_rate": 3.1050682021705357e-07, + "loss": 0.6474, + "step": 74272 + }, + { + "epoch": 0.9284482112052801, + "grad_norm": 6.899633407592773, + "learning_rate": 3.102910765738543e-07, + "loss": 0.2215, + "step": 74274 + }, + { + "epoch": 0.9284732118302957, + "grad_norm": 0.6452634930610657, + "learning_rate": 3.1007540672610914e-07, + "loss": 0.0357, + "step": 74276 + }, + { + "epoch": 0.9284982124553114, + "grad_norm": 4.043148517608643, + "learning_rate": 3.09859810675458e-07, + "loss": 0.8269, + "step": 74278 + }, + { + "epoch": 0.928523213080327, + "grad_norm": 0.00026253407122567296, + "learning_rate": 3.096442884235451e-07, + "loss": 1.1179, + "step": 74280 + }, + { + "epoch": 0.9285482137053427, + "grad_norm": 4.5746917724609375, + "learning_rate": 3.0942883997201023e-07, + "loss": 1.1186, + "step": 74282 + }, + { + "epoch": 0.9285732143303582, + "grad_norm": 3.652888774871826, + "learning_rate": 3.0921346532249543e-07, + "loss": 0.3796, + "step": 74284 + }, + { + "epoch": 0.9285982149553739, + "grad_norm": 4.202857494354248, + "learning_rate": 3.0899816447663935e-07, + "loss": 1.3959, + "step": 74286 + }, + { + "epoch": 0.9286232155803895, + "grad_norm": 0.0006718240911141038, + "learning_rate": 3.087829374360851e-07, + "loss": 0.0017, + "step": 74288 + }, + { + "epoch": 0.9286482162054052, + "grad_norm": 3.1189968585968018, + "learning_rate": 3.085677842024681e-07, + "loss": 1.1691, + "step": 74290 + }, + { + "epoch": 0.9286732168304208, + "grad_norm": 8.171896934509277, + "learning_rate": 3.0835270477742817e-07, + "loss": 1.5309, + "step": 74292 + }, + { + "epoch": 0.9286982174554363, + "grad_norm": 3.1196963787078857, + "learning_rate": 3.0813769916260393e-07, + "loss": 1.3146, + "step": 74294 + }, + { + "epoch": 0.928723218080452, + "grad_norm": 3.2838194370269775, + "learning_rate": 3.0792276735963187e-07, + "loss": 1.325, + "step": 74296 + }, + { + "epoch": 0.9287482187054676, + "grad_norm": 2.120039463043213, + "learning_rate": 3.0770790937015073e-07, + "loss": 0.1295, + "step": 74298 + }, + { + "epoch": 0.9287732193304833, + "grad_norm": 3.14424729347229, + "learning_rate": 3.0749312519579353e-07, + "loss": 0.3026, + "step": 74300 + }, + { + "epoch": 0.9287982199554989, + "grad_norm": 4.437918186187744, + "learning_rate": 3.072784148382002e-07, + "loss": 1.4291, + "step": 74302 + }, + { + "epoch": 0.9288232205805145, + "grad_norm": 0.0007821692852303386, + "learning_rate": 3.070637782990027e-07, + "loss": 0.7383, + "step": 74304 + }, + { + "epoch": 0.9288482212055301, + "grad_norm": 2.046571731567383, + "learning_rate": 3.068492155798364e-07, + "loss": 0.8057, + "step": 74306 + }, + { + "epoch": 0.9288732218305458, + "grad_norm": 3.3937296867370605, + "learning_rate": 3.0663472668233663e-07, + "loss": 2.3088, + "step": 74308 + }, + { + "epoch": 0.9288982224555614, + "grad_norm": 0.9066612124443054, + "learning_rate": 3.0642031160813543e-07, + "loss": 0.044, + "step": 74310 + }, + { + "epoch": 0.9289232230805771, + "grad_norm": 0.05672204867005348, + "learning_rate": 3.0620597035886714e-07, + "loss": 0.4804, + "step": 74312 + }, + { + "epoch": 0.9289482237055926, + "grad_norm": 3.3800110816955566, + "learning_rate": 3.0599170293616144e-07, + "loss": 0.684, + "step": 74314 + }, + { + "epoch": 0.9289732243306082, + "grad_norm": 0.0007119204383343458, + "learning_rate": 3.0577750934165375e-07, + "loss": 0.5041, + "step": 74316 + }, + { + "epoch": 0.9289982249556239, + "grad_norm": 9.239564895629883, + "learning_rate": 3.0556338957697386e-07, + "loss": 1.0512, + "step": 74318 + }, + { + "epoch": 0.9290232255806395, + "grad_norm": 2.0881261825561523, + "learning_rate": 3.0534934364375157e-07, + "loss": 0.8052, + "step": 74320 + }, + { + "epoch": 0.9290482262056552, + "grad_norm": 3.979403257369995, + "learning_rate": 3.0513537154361784e-07, + "loss": 1.0727, + "step": 74322 + }, + { + "epoch": 0.9290732268306707, + "grad_norm": 0.0003722356341313571, + "learning_rate": 3.049214732782013e-07, + "loss": 0.3724, + "step": 74324 + }, + { + "epoch": 0.9290982274556864, + "grad_norm": 4.391733169555664, + "learning_rate": 3.0470764884913187e-07, + "loss": 1.1592, + "step": 74326 + }, + { + "epoch": 0.929123228080702, + "grad_norm": 0.0018901844741776586, + "learning_rate": 3.0449389825804033e-07, + "loss": 0.0391, + "step": 74328 + }, + { + "epoch": 0.9291482287057177, + "grad_norm": 4.280683994293213, + "learning_rate": 3.042802215065499e-07, + "loss": 1.3978, + "step": 74330 + }, + { + "epoch": 0.9291732293307333, + "grad_norm": 0.45942389965057373, + "learning_rate": 3.040666185962915e-07, + "loss": 0.1701, + "step": 74332 + }, + { + "epoch": 0.9291982299557489, + "grad_norm": 5.080862522125244, + "learning_rate": 3.038530895288894e-07, + "loss": 1.0413, + "step": 74334 + }, + { + "epoch": 0.9292232305807645, + "grad_norm": 13.095565795898438, + "learning_rate": 3.036396343059722e-07, + "loss": 1.1051, + "step": 74336 + }, + { + "epoch": 0.9292482312057802, + "grad_norm": 0.00026417701155878603, + "learning_rate": 3.034262529291632e-07, + "loss": 0.2241, + "step": 74338 + }, + { + "epoch": 0.9292732318307958, + "grad_norm": 2.871384859085083, + "learning_rate": 3.032129454000887e-07, + "loss": 0.9046, + "step": 74340 + }, + { + "epoch": 0.9292982324558114, + "grad_norm": 0.00044499317300505936, + "learning_rate": 3.0299971172037533e-07, + "loss": 0.1848, + "step": 74342 + }, + { + "epoch": 0.929323233080827, + "grad_norm": 3.004756450653076, + "learning_rate": 3.0278655189164285e-07, + "loss": 1.1352, + "step": 74344 + }, + { + "epoch": 0.9293482337058426, + "grad_norm": 3.321131944656372, + "learning_rate": 3.0257346591551663e-07, + "loss": 0.7488, + "step": 74346 + }, + { + "epoch": 0.9293732343308583, + "grad_norm": 3.3568637371063232, + "learning_rate": 3.0236045379361976e-07, + "loss": 0.6428, + "step": 74348 + }, + { + "epoch": 0.9293982349558739, + "grad_norm": 3.622436761856079, + "learning_rate": 3.021475155275744e-07, + "loss": 0.693, + "step": 74350 + }, + { + "epoch": 0.9294232355808896, + "grad_norm": 0.006951333023607731, + "learning_rate": 3.0193465111900245e-07, + "loss": 0.927, + "step": 74352 + }, + { + "epoch": 0.9294482362059051, + "grad_norm": 3.431279420852661, + "learning_rate": 3.017218605695249e-07, + "loss": 1.7433, + "step": 74354 + }, + { + "epoch": 0.9294732368309208, + "grad_norm": 2.281062602996826, + "learning_rate": 3.0150914388076267e-07, + "loss": 0.5992, + "step": 74356 + }, + { + "epoch": 0.9294982374559364, + "grad_norm": 0.00038094152114354074, + "learning_rate": 3.012965010543334e-07, + "loss": 0.3528, + "step": 74358 + }, + { + "epoch": 0.929523238080952, + "grad_norm": 2.6785099506378174, + "learning_rate": 3.0108393209186016e-07, + "loss": 1.3737, + "step": 74360 + }, + { + "epoch": 0.9295482387059677, + "grad_norm": 2.9841508865356445, + "learning_rate": 3.0087143699495835e-07, + "loss": 0.4192, + "step": 74362 + }, + { + "epoch": 0.9295732393309832, + "grad_norm": 7.04289436340332, + "learning_rate": 3.006590157652489e-07, + "loss": 0.4068, + "step": 74364 + }, + { + "epoch": 0.9295982399559989, + "grad_norm": 1.3847801685333252, + "learning_rate": 3.0044666840434945e-07, + "loss": 0.5677, + "step": 74366 + }, + { + "epoch": 0.9296232405810145, + "grad_norm": 4.176321983337402, + "learning_rate": 3.002343949138775e-07, + "loss": 1.2809, + "step": 74368 + }, + { + "epoch": 0.9296482412060302, + "grad_norm": 6.099161148071289, + "learning_rate": 3.000221952954474e-07, + "loss": 1.1206, + "step": 74370 + }, + { + "epoch": 0.9296732418310458, + "grad_norm": 4.062197208404541, + "learning_rate": 2.9981006955067673e-07, + "loss": 1.5667, + "step": 74372 + }, + { + "epoch": 0.9296982424560614, + "grad_norm": 2.5011990070343018, + "learning_rate": 2.995980176811808e-07, + "loss": 1.0126, + "step": 74374 + }, + { + "epoch": 0.929723243081077, + "grad_norm": 0.00044132000766694546, + "learning_rate": 2.9938603968857396e-07, + "loss": 0.0003, + "step": 74376 + }, + { + "epoch": 0.9297482437060927, + "grad_norm": 2.045289993286133, + "learning_rate": 2.991741355744715e-07, + "loss": 0.1114, + "step": 74378 + }, + { + "epoch": 0.9297732443311083, + "grad_norm": 4.0800981521606445, + "learning_rate": 2.9896230534048884e-07, + "loss": 1.2826, + "step": 74380 + }, + { + "epoch": 0.929798244956124, + "grad_norm": 0.912742018699646, + "learning_rate": 2.9875054898823474e-07, + "loss": 0.5363, + "step": 74382 + }, + { + "epoch": 0.9298232455811395, + "grad_norm": 1.313299536705017, + "learning_rate": 2.9853886651932674e-07, + "loss": 0.2898, + "step": 74384 + }, + { + "epoch": 0.9298482462061551, + "grad_norm": 2.212761163711548, + "learning_rate": 2.9832725793537356e-07, + "loss": 0.2486, + "step": 74386 + }, + { + "epoch": 0.9298732468311708, + "grad_norm": 3.843132257461548, + "learning_rate": 2.981157232379872e-07, + "loss": 1.0496, + "step": 74388 + }, + { + "epoch": 0.9298982474561864, + "grad_norm": 2.2559783458709717, + "learning_rate": 2.979042624287809e-07, + "loss": 0.6931, + "step": 74390 + }, + { + "epoch": 0.9299232480812021, + "grad_norm": 5.6508684158325195, + "learning_rate": 2.9769287550936333e-07, + "loss": 2.1247, + "step": 74392 + }, + { + "epoch": 0.9299482487062176, + "grad_norm": 1.558916687965393, + "learning_rate": 2.974815624813465e-07, + "loss": 0.4931, + "step": 74394 + }, + { + "epoch": 0.9299732493312333, + "grad_norm": 2.3477487564086914, + "learning_rate": 2.9727032334633586e-07, + "loss": 0.9464, + "step": 74396 + }, + { + "epoch": 0.9299982499562489, + "grad_norm": 1.5140800476074219, + "learning_rate": 2.9705915810594344e-07, + "loss": 2.0522, + "step": 74398 + }, + { + "epoch": 0.9300232505812646, + "grad_norm": 3.262005090713501, + "learning_rate": 2.9684806676177457e-07, + "loss": 1.1008, + "step": 74400 + }, + { + "epoch": 0.9300482512062802, + "grad_norm": 1.9415956735610962, + "learning_rate": 2.9663704931544023e-07, + "loss": 1.541, + "step": 74402 + }, + { + "epoch": 0.9300732518312957, + "grad_norm": 10.879607200622559, + "learning_rate": 2.964261057685458e-07, + "loss": 2.6517, + "step": 74404 + }, + { + "epoch": 0.9300982524563114, + "grad_norm": 6.340888023376465, + "learning_rate": 2.9621523612269777e-07, + "loss": 0.5138, + "step": 74406 + }, + { + "epoch": 0.930123253081327, + "grad_norm": 2.1184239387512207, + "learning_rate": 2.960044403795037e-07, + "loss": 0.1553, + "step": 74408 + }, + { + "epoch": 0.9301482537063427, + "grad_norm": 3.288102865219116, + "learning_rate": 2.957937185405646e-07, + "loss": 1.1041, + "step": 74410 + }, + { + "epoch": 0.9301732543313583, + "grad_norm": 3.0112264156341553, + "learning_rate": 2.9558307060749025e-07, + "loss": 0.9525, + "step": 74412 + }, + { + "epoch": 0.9301982549563739, + "grad_norm": 2.6068947315216064, + "learning_rate": 2.953724965818827e-07, + "loss": 0.8953, + "step": 74414 + }, + { + "epoch": 0.9302232555813895, + "grad_norm": 5.394816875457764, + "learning_rate": 2.951619964653452e-07, + "loss": 2.4909, + "step": 74416 + }, + { + "epoch": 0.9302482562064052, + "grad_norm": 4.150025844573975, + "learning_rate": 2.9495157025948183e-07, + "loss": 0.8822, + "step": 74418 + }, + { + "epoch": 0.9302732568314208, + "grad_norm": 5.086023330688477, + "learning_rate": 2.9474121796589594e-07, + "loss": 1.3582, + "step": 74420 + }, + { + "epoch": 0.9302982574564365, + "grad_norm": 3.2953312397003174, + "learning_rate": 2.945309395861873e-07, + "loss": 1.4576, + "step": 74422 + }, + { + "epoch": 0.930323258081452, + "grad_norm": 0.10788390040397644, + "learning_rate": 2.94320735121959e-07, + "loss": 0.868, + "step": 74424 + }, + { + "epoch": 0.9303482587064676, + "grad_norm": 0.0009770361939445138, + "learning_rate": 2.941106045748099e-07, + "loss": 0.418, + "step": 74426 + }, + { + "epoch": 0.9303732593314833, + "grad_norm": 3.824889898300171, + "learning_rate": 2.9390054794634415e-07, + "loss": 1.4747, + "step": 74428 + }, + { + "epoch": 0.9303982599564989, + "grad_norm": 4.276776313781738, + "learning_rate": 2.936905652381583e-07, + "loss": 0.8999, + "step": 74430 + }, + { + "epoch": 0.9304232605815146, + "grad_norm": 5.484948635101318, + "learning_rate": 2.934806564518522e-07, + "loss": 0.7206, + "step": 74432 + }, + { + "epoch": 0.9304482612065301, + "grad_norm": 5.625718116760254, + "learning_rate": 2.9327082158902564e-07, + "loss": 1.6862, + "step": 74434 + }, + { + "epoch": 0.9304732618315458, + "grad_norm": 2.0140135288238525, + "learning_rate": 2.9306106065127626e-07, + "loss": 0.1572, + "step": 74436 + }, + { + "epoch": 0.9304982624565614, + "grad_norm": 5.124980926513672, + "learning_rate": 2.9285137364019945e-07, + "loss": 2.5712, + "step": 74438 + }, + { + "epoch": 0.9305232630815771, + "grad_norm": 3.6774790287017822, + "learning_rate": 2.926417605573939e-07, + "loss": 0.6506, + "step": 74440 + }, + { + "epoch": 0.9305482637065927, + "grad_norm": 3.6782333850860596, + "learning_rate": 2.9243222140445727e-07, + "loss": 0.9971, + "step": 74442 + }, + { + "epoch": 0.9305732643316083, + "grad_norm": 7.175498008728027, + "learning_rate": 2.922227561829838e-07, + "loss": 1.3239, + "step": 74444 + }, + { + "epoch": 0.9305982649566239, + "grad_norm": 3.5106394290924072, + "learning_rate": 2.9201336489457e-07, + "loss": 0.3708, + "step": 74446 + }, + { + "epoch": 0.9306232655816395, + "grad_norm": 7.74019193649292, + "learning_rate": 2.9180404754080795e-07, + "loss": 1.9417, + "step": 74448 + }, + { + "epoch": 0.9306482662066552, + "grad_norm": 7.27269172668457, + "learning_rate": 2.91594804123293e-07, + "loss": 1.9925, + "step": 74450 + }, + { + "epoch": 0.9306732668316708, + "grad_norm": 4.242763042449951, + "learning_rate": 2.913856346436206e-07, + "loss": 2.7037, + "step": 74452 + }, + { + "epoch": 0.9306982674566864, + "grad_norm": 3.229156494140625, + "learning_rate": 2.911765391033805e-07, + "loss": 1.7993, + "step": 74454 + }, + { + "epoch": 0.930723268081702, + "grad_norm": 10.988532066345215, + "learning_rate": 2.909675175041693e-07, + "loss": 1.3533, + "step": 74456 + }, + { + "epoch": 0.9307482687067177, + "grad_norm": 1.8747117519378662, + "learning_rate": 2.9075856984757456e-07, + "loss": 0.5206, + "step": 74458 + }, + { + "epoch": 0.9307732693317333, + "grad_norm": 2.924314022064209, + "learning_rate": 2.9054969613519167e-07, + "loss": 1.0957, + "step": 74460 + }, + { + "epoch": 0.930798269956749, + "grad_norm": 4.853010654449463, + "learning_rate": 2.903408963686072e-07, + "loss": 1.3261, + "step": 74462 + }, + { + "epoch": 0.9308232705817645, + "grad_norm": 4.145198345184326, + "learning_rate": 2.901321705494131e-07, + "loss": 1.4781, + "step": 74464 + }, + { + "epoch": 0.9308482712067802, + "grad_norm": 1.8710495233535767, + "learning_rate": 2.8992351867920046e-07, + "loss": 1.0446, + "step": 74466 + }, + { + "epoch": 0.9308732718317958, + "grad_norm": 4.152340888977051, + "learning_rate": 2.897149407595568e-07, + "loss": 1.5964, + "step": 74468 + }, + { + "epoch": 0.9308982724568114, + "grad_norm": 0.00017848242714535445, + "learning_rate": 2.8950643679207194e-07, + "loss": 0.4623, + "step": 74470 + }, + { + "epoch": 0.9309232730818271, + "grad_norm": 1.9388192892074585, + "learning_rate": 2.8929800677833243e-07, + "loss": 0.4319, + "step": 74472 + }, + { + "epoch": 0.9309482737068426, + "grad_norm": 4.303530216217041, + "learning_rate": 2.890896507199248e-07, + "loss": 0.9984, + "step": 74474 + }, + { + "epoch": 0.9309732743318583, + "grad_norm": 1.9299545288085938, + "learning_rate": 2.8888136861843883e-07, + "loss": 0.9853, + "step": 74476 + }, + { + "epoch": 0.9309982749568739, + "grad_norm": 2.3192877769470215, + "learning_rate": 2.886731604754578e-07, + "loss": 0.3969, + "step": 74478 + }, + { + "epoch": 0.9310232755818896, + "grad_norm": 4.628261566162109, + "learning_rate": 2.8846502629256925e-07, + "loss": 0.9336, + "step": 74480 + }, + { + "epoch": 0.9310482762069052, + "grad_norm": 0.42221739888191223, + "learning_rate": 2.8825696607135635e-07, + "loss": 0.3494, + "step": 74482 + }, + { + "epoch": 0.9310732768319208, + "grad_norm": 3.5867509841918945, + "learning_rate": 2.8804897981340785e-07, + "loss": 1.7645, + "step": 74484 + }, + { + "epoch": 0.9310982774569364, + "grad_norm": 3.8977842330932617, + "learning_rate": 2.878410675203036e-07, + "loss": 2.0796, + "step": 74486 + }, + { + "epoch": 0.931123278081952, + "grad_norm": 0.023292087018489838, + "learning_rate": 2.876332291936268e-07, + "loss": 0.8939, + "step": 74488 + }, + { + "epoch": 0.9311482787069677, + "grad_norm": 0.0011068428866565228, + "learning_rate": 2.8742546483496393e-07, + "loss": 0.5299, + "step": 74490 + }, + { + "epoch": 0.9311732793319834, + "grad_norm": 2.1122100353240967, + "learning_rate": 2.8721777444589373e-07, + "loss": 1.0322, + "step": 74492 + }, + { + "epoch": 0.9311982799569989, + "grad_norm": 6.422214508056641, + "learning_rate": 2.870101580280016e-07, + "loss": 0.7353, + "step": 74494 + }, + { + "epoch": 0.9312232805820145, + "grad_norm": 3.313145875930786, + "learning_rate": 2.8680261558286403e-07, + "loss": 1.581, + "step": 74496 + }, + { + "epoch": 0.9312482812070302, + "grad_norm": 2.408487319946289, + "learning_rate": 2.865951471120676e-07, + "loss": 1.3705, + "step": 74498 + }, + { + "epoch": 0.9312732818320458, + "grad_norm": 1.916606068611145, + "learning_rate": 2.8638775261718543e-07, + "loss": 0.067, + "step": 74500 + }, + { + "epoch": 0.9312982824570615, + "grad_norm": 0.0002481757546775043, + "learning_rate": 2.8618043209980183e-07, + "loss": 0.2669, + "step": 74502 + }, + { + "epoch": 0.931323283082077, + "grad_norm": 15.356779098510742, + "learning_rate": 2.8597318556149446e-07, + "loss": 1.9798, + "step": 74504 + }, + { + "epoch": 0.9313482837070927, + "grad_norm": 1.794295310974121, + "learning_rate": 2.8576601300384086e-07, + "loss": 0.1343, + "step": 74506 + }, + { + "epoch": 0.9313732843321083, + "grad_norm": 1.4447622299194336, + "learning_rate": 2.85558914428421e-07, + "loss": 0.3006, + "step": 74508 + }, + { + "epoch": 0.931398284957124, + "grad_norm": 3.9148190021514893, + "learning_rate": 2.8535188983680907e-07, + "loss": 1.3642, + "step": 74510 + }, + { + "epoch": 0.9314232855821396, + "grad_norm": 4.7834014892578125, + "learning_rate": 2.8514493923058497e-07, + "loss": 2.065, + "step": 74512 + }, + { + "epoch": 0.9314482862071551, + "grad_norm": 3.7968266010284424, + "learning_rate": 2.8493806261132297e-07, + "loss": 0.5742, + "step": 74514 + }, + { + "epoch": 0.9314732868321708, + "grad_norm": 4.015836715698242, + "learning_rate": 2.847312599805974e-07, + "loss": 1.2963, + "step": 74516 + }, + { + "epoch": 0.9314982874571864, + "grad_norm": 3.7405433654785156, + "learning_rate": 2.845245313399858e-07, + "loss": 1.2135, + "step": 74518 + }, + { + "epoch": 0.9315232880822021, + "grad_norm": 4.119523048400879, + "learning_rate": 2.843178766910615e-07, + "loss": 0.9847, + "step": 74520 + }, + { + "epoch": 0.9315482887072177, + "grad_norm": 2.8409535884857178, + "learning_rate": 2.8411129603539757e-07, + "loss": 1.0697, + "step": 74522 + }, + { + "epoch": 0.9315732893322333, + "grad_norm": 0.09995441138744354, + "learning_rate": 2.8390478937456946e-07, + "loss": 0.2811, + "step": 74524 + }, + { + "epoch": 0.9315982899572489, + "grad_norm": 4.590169429779053, + "learning_rate": 2.83698356710147e-07, + "loss": 0.7169, + "step": 74526 + }, + { + "epoch": 0.9316232905822646, + "grad_norm": 4.09657621383667, + "learning_rate": 2.8349199804370563e-07, + "loss": 0.8052, + "step": 74528 + }, + { + "epoch": 0.9316482912072802, + "grad_norm": 3.1673662662506104, + "learning_rate": 2.83285713376813e-07, + "loss": 1.0992, + "step": 74530 + }, + { + "epoch": 0.9316732918322959, + "grad_norm": 5.055721282958984, + "learning_rate": 2.8307950271104335e-07, + "loss": 1.4803, + "step": 74532 + }, + { + "epoch": 0.9316982924573114, + "grad_norm": 5.255970001220703, + "learning_rate": 2.8287336604796655e-07, + "loss": 1.2372, + "step": 74534 + }, + { + "epoch": 0.931723293082327, + "grad_norm": 3.713366746902466, + "learning_rate": 2.826673033891514e-07, + "loss": 1.6072, + "step": 74536 + }, + { + "epoch": 0.9317482937073427, + "grad_norm": 3.797492265701294, + "learning_rate": 2.824613147361688e-07, + "loss": 1.6267, + "step": 74538 + }, + { + "epoch": 0.9317732943323583, + "grad_norm": 7.510563373565674, + "learning_rate": 2.822554000905853e-07, + "loss": 1.4548, + "step": 74540 + }, + { + "epoch": 0.931798294957374, + "grad_norm": 2.040276288986206, + "learning_rate": 2.820495594539718e-07, + "loss": 1.2362, + "step": 74542 + }, + { + "epoch": 0.9318232955823895, + "grad_norm": 3.4821274280548096, + "learning_rate": 2.8184379282789273e-07, + "loss": 1.2027, + "step": 74544 + }, + { + "epoch": 0.9318482962074052, + "grad_norm": 3.5186526775360107, + "learning_rate": 2.8163810021391903e-07, + "loss": 0.6727, + "step": 74546 + }, + { + "epoch": 0.9318732968324208, + "grad_norm": 4.170868873596191, + "learning_rate": 2.814324816136138e-07, + "loss": 0.8348, + "step": 74548 + }, + { + "epoch": 0.9318982974574365, + "grad_norm": 0.000487271579913795, + "learning_rate": 2.8122693702854585e-07, + "loss": 0.4934, + "step": 74550 + }, + { + "epoch": 0.9319232980824521, + "grad_norm": 3.214160203933716, + "learning_rate": 2.810214664602795e-07, + "loss": 1.3266, + "step": 74552 + }, + { + "epoch": 0.9319482987074676, + "grad_norm": 7.23137092590332, + "learning_rate": 2.808160699103779e-07, + "loss": 1.9163, + "step": 74554 + }, + { + "epoch": 0.9319732993324833, + "grad_norm": 3.1514952182769775, + "learning_rate": 2.806107473804076e-07, + "loss": 1.1287, + "step": 74556 + }, + { + "epoch": 0.9319982999574989, + "grad_norm": 2.0409908294677734, + "learning_rate": 2.8040549887192956e-07, + "loss": 0.1809, + "step": 74558 + }, + { + "epoch": 0.9320233005825146, + "grad_norm": 3.3547587394714355, + "learning_rate": 2.8020032438651034e-07, + "loss": 1.7511, + "step": 74560 + }, + { + "epoch": 0.9320483012075302, + "grad_norm": 3.174104928970337, + "learning_rate": 2.799952239257109e-07, + "loss": 0.7488, + "step": 74562 + }, + { + "epoch": 0.9320733018325458, + "grad_norm": 2.5264124870300293, + "learning_rate": 2.7979019749109326e-07, + "loss": 1.0855, + "step": 74564 + }, + { + "epoch": 0.9320983024575614, + "grad_norm": 0.0004381404141895473, + "learning_rate": 2.7958524508421957e-07, + "loss": 0.1705, + "step": 74566 + }, + { + "epoch": 0.9321233030825771, + "grad_norm": 6.033085346221924, + "learning_rate": 2.793803667066486e-07, + "loss": 0.5465, + "step": 74568 + }, + { + "epoch": 0.9321483037075927, + "grad_norm": 5.274843215942383, + "learning_rate": 2.7917556235994346e-07, + "loss": 2.0816, + "step": 74570 + }, + { + "epoch": 0.9321733043326084, + "grad_norm": 6.125481605529785, + "learning_rate": 2.7897083204566076e-07, + "loss": 0.8994, + "step": 74572 + }, + { + "epoch": 0.9321983049576239, + "grad_norm": 3.9559080600738525, + "learning_rate": 2.7876617576536367e-07, + "loss": 1.8562, + "step": 74574 + }, + { + "epoch": 0.9322233055826395, + "grad_norm": 4.499716281890869, + "learning_rate": 2.785615935206076e-07, + "loss": 1.0943, + "step": 74576 + }, + { + "epoch": 0.9322483062076552, + "grad_norm": 1.7503243684768677, + "learning_rate": 2.783570853129514e-07, + "loss": 0.3295, + "step": 74578 + }, + { + "epoch": 0.9322733068326708, + "grad_norm": 0.0016839217860251665, + "learning_rate": 2.7815265114395364e-07, + "loss": 0.9318, + "step": 74580 + }, + { + "epoch": 0.9322983074576865, + "grad_norm": 3.152928590774536, + "learning_rate": 2.7794829101516985e-07, + "loss": 0.5706, + "step": 74582 + }, + { + "epoch": 0.932323308082702, + "grad_norm": 2.386643648147583, + "learning_rate": 2.7774400492815765e-07, + "loss": 0.9705, + "step": 74584 + }, + { + "epoch": 0.9323483087077177, + "grad_norm": 7.5886077880859375, + "learning_rate": 2.775397928844714e-07, + "loss": 1.1958, + "step": 74586 + }, + { + "epoch": 0.9323733093327333, + "grad_norm": 4.329014778137207, + "learning_rate": 2.773356548856676e-07, + "loss": 0.3, + "step": 74588 + }, + { + "epoch": 0.932398309957749, + "grad_norm": 2.5021955966949463, + "learning_rate": 2.7713159093330166e-07, + "loss": 1.8143, + "step": 74590 + }, + { + "epoch": 0.9324233105827646, + "grad_norm": 1.7707451581954956, + "learning_rate": 2.769276010289246e-07, + "loss": 0.9707, + "step": 74592 + }, + { + "epoch": 0.9324483112077802, + "grad_norm": 6.574367523193359, + "learning_rate": 2.76723685174094e-07, + "loss": 1.5167, + "step": 74594 + }, + { + "epoch": 0.9324733118327958, + "grad_norm": 4.206098556518555, + "learning_rate": 2.7651984337035865e-07, + "loss": 0.8995, + "step": 74596 + }, + { + "epoch": 0.9324983124578115, + "grad_norm": 3.257443904876709, + "learning_rate": 2.763160756192751e-07, + "loss": 1.451, + "step": 74598 + }, + { + "epoch": 0.9325233130828271, + "grad_norm": 7.034785747528076, + "learning_rate": 2.761123819223921e-07, + "loss": 0.6054, + "step": 74600 + }, + { + "epoch": 0.9325483137078427, + "grad_norm": 1.2080167531967163, + "learning_rate": 2.7590876228126284e-07, + "loss": 0.9876, + "step": 74602 + }, + { + "epoch": 0.9325733143328583, + "grad_norm": 7.3601765632629395, + "learning_rate": 2.757052166974372e-07, + "loss": 1.017, + "step": 74604 + }, + { + "epoch": 0.9325983149578739, + "grad_norm": 3.299339532852173, + "learning_rate": 2.755017451724651e-07, + "loss": 1.141, + "step": 74606 + }, + { + "epoch": 0.9326233155828896, + "grad_norm": 1.7984545230865479, + "learning_rate": 2.752983477078963e-07, + "loss": 1.0967, + "step": 74608 + }, + { + "epoch": 0.9326483162079052, + "grad_norm": 3.1670804023742676, + "learning_rate": 2.750950243052808e-07, + "loss": 1.511, + "step": 74610 + }, + { + "epoch": 0.9326733168329209, + "grad_norm": 3.432474136352539, + "learning_rate": 2.7489177496616614e-07, + "loss": 1.3008, + "step": 74612 + }, + { + "epoch": 0.9326983174579364, + "grad_norm": 4.30885648727417, + "learning_rate": 2.7468859969210007e-07, + "loss": 1.3049, + "step": 74614 + }, + { + "epoch": 0.9327233180829521, + "grad_norm": 0.00027508012135513127, + "learning_rate": 2.744854984846312e-07, + "loss": 0.7597, + "step": 74616 + }, + { + "epoch": 0.9327483187079677, + "grad_norm": 4.464713096618652, + "learning_rate": 2.7428247134530517e-07, + "loss": 1.029, + "step": 74618 + }, + { + "epoch": 0.9327733193329834, + "grad_norm": 3.5625112056732178, + "learning_rate": 2.7407951827566727e-07, + "loss": 0.5554, + "step": 74620 + }, + { + "epoch": 0.932798319957999, + "grad_norm": 9.444068908691406, + "learning_rate": 2.738766392772663e-07, + "loss": 1.3664, + "step": 74622 + }, + { + "epoch": 0.9328233205830145, + "grad_norm": 0.02547016553580761, + "learning_rate": 2.7367383435164316e-07, + "loss": 0.71, + "step": 74624 + }, + { + "epoch": 0.9328483212080302, + "grad_norm": 3.9440178871154785, + "learning_rate": 2.7347110350034676e-07, + "loss": 0.7849, + "step": 74626 + }, + { + "epoch": 0.9328733218330458, + "grad_norm": 0.22024276852607727, + "learning_rate": 2.7326844672491916e-07, + "loss": 0.3735, + "step": 74628 + }, + { + "epoch": 0.9328983224580615, + "grad_norm": 2.7769572734832764, + "learning_rate": 2.7306586402690126e-07, + "loss": 0.5262, + "step": 74630 + }, + { + "epoch": 0.9329233230830771, + "grad_norm": 2.520090341567993, + "learning_rate": 2.728633554078408e-07, + "loss": 0.707, + "step": 74632 + }, + { + "epoch": 0.9329483237080927, + "grad_norm": 7.935665130615234, + "learning_rate": 2.726609208692754e-07, + "loss": 1.435, + "step": 74634 + }, + { + "epoch": 0.9329733243331083, + "grad_norm": 0.07881806045770645, + "learning_rate": 2.7245856041275054e-07, + "loss": 0.0017, + "step": 74636 + }, + { + "epoch": 0.932998324958124, + "grad_norm": 0.5148319005966187, + "learning_rate": 2.7225627403980384e-07, + "loss": 0.4529, + "step": 74638 + }, + { + "epoch": 0.9330233255831396, + "grad_norm": 9.800917625427246, + "learning_rate": 2.7205406175197956e-07, + "loss": 1.6576, + "step": 74640 + }, + { + "epoch": 0.9330483262081553, + "grad_norm": 4.983843803405762, + "learning_rate": 2.7185192355081544e-07, + "loss": 1.2692, + "step": 74642 + }, + { + "epoch": 0.9330733268331708, + "grad_norm": 2.6200852394104004, + "learning_rate": 2.716498594378503e-07, + "loss": 0.4287, + "step": 74644 + }, + { + "epoch": 0.9330983274581864, + "grad_norm": 2.306840658187866, + "learning_rate": 2.714478694146261e-07, + "loss": 1.4668, + "step": 74646 + }, + { + "epoch": 0.9331233280832021, + "grad_norm": 4.358966827392578, + "learning_rate": 2.7124595348267726e-07, + "loss": 0.3551, + "step": 74648 + }, + { + "epoch": 0.9331483287082177, + "grad_norm": 3.7214372158050537, + "learning_rate": 2.710441116435447e-07, + "loss": 1.0998, + "step": 74650 + }, + { + "epoch": 0.9331733293332334, + "grad_norm": 3.462430715560913, + "learning_rate": 2.708423438987629e-07, + "loss": 1.4348, + "step": 74652 + }, + { + "epoch": 0.9331983299582489, + "grad_norm": 0.0005276559968478978, + "learning_rate": 2.706406502498715e-07, + "loss": 0.0, + "step": 74654 + }, + { + "epoch": 0.9332233305832646, + "grad_norm": 11.100811958312988, + "learning_rate": 2.7043903069840504e-07, + "loss": 1.2361, + "step": 74656 + }, + { + "epoch": 0.9332483312082802, + "grad_norm": 3.9935123920440674, + "learning_rate": 2.702374852458978e-07, + "loss": 0.8621, + "step": 74658 + }, + { + "epoch": 0.9332733318332959, + "grad_norm": 0.00023629984934814274, + "learning_rate": 2.7003601389388736e-07, + "loss": 0.7293, + "step": 74660 + }, + { + "epoch": 0.9332983324583115, + "grad_norm": 2.4653308391571045, + "learning_rate": 2.6983461664390475e-07, + "loss": 0.0417, + "step": 74662 + }, + { + "epoch": 0.933323333083327, + "grad_norm": 6.001299858093262, + "learning_rate": 2.696332934974877e-07, + "loss": 1.4047, + "step": 74664 + }, + { + "epoch": 0.9333483337083427, + "grad_norm": 8.073724746704102, + "learning_rate": 2.69432044456166e-07, + "loss": 1.8613, + "step": 74666 + }, + { + "epoch": 0.9333733343333583, + "grad_norm": 5.254336833953857, + "learning_rate": 2.6923086952147516e-07, + "loss": 1.3712, + "step": 74668 + }, + { + "epoch": 0.933398334958374, + "grad_norm": 5.623380184173584, + "learning_rate": 2.6902976869494614e-07, + "loss": 0.5514, + "step": 74670 + }, + { + "epoch": 0.9334233355833896, + "grad_norm": 4.358211994171143, + "learning_rate": 2.688287419781088e-07, + "loss": 1.2851, + "step": 74672 + }, + { + "epoch": 0.9334483362084052, + "grad_norm": 0.5856791138648987, + "learning_rate": 2.686277893724976e-07, + "loss": 0.6199, + "step": 74674 + }, + { + "epoch": 0.9334733368334208, + "grad_norm": 3.7648043632507324, + "learning_rate": 2.684269108796389e-07, + "loss": 0.6292, + "step": 74676 + }, + { + "epoch": 0.9334983374584365, + "grad_norm": 2.9542059898376465, + "learning_rate": 2.6822610650106607e-07, + "loss": 0.7208, + "step": 74678 + }, + { + "epoch": 0.9335233380834521, + "grad_norm": 3.6090686321258545, + "learning_rate": 2.680253762383056e-07, + "loss": 1.2783, + "step": 74680 + }, + { + "epoch": 0.9335483387084678, + "grad_norm": 0.0005373923340812325, + "learning_rate": 2.678247200928896e-07, + "loss": 0.4545, + "step": 74682 + }, + { + "epoch": 0.9335733393334833, + "grad_norm": 0.5084547996520996, + "learning_rate": 2.676241380663436e-07, + "loss": 0.0228, + "step": 74684 + }, + { + "epoch": 0.933598339958499, + "grad_norm": 8.701971054077148, + "learning_rate": 2.6742363016019514e-07, + "loss": 0.5491, + "step": 74686 + }, + { + "epoch": 0.9336233405835146, + "grad_norm": 4.0506672859191895, + "learning_rate": 2.67223196375973e-07, + "loss": 0.6682, + "step": 74688 + }, + { + "epoch": 0.9336483412085302, + "grad_norm": 6.251906394958496, + "learning_rate": 2.6702283671520056e-07, + "loss": 0.5888, + "step": 74690 + }, + { + "epoch": 0.9336733418335459, + "grad_norm": 2.1216046810150146, + "learning_rate": 2.668225511794087e-07, + "loss": 0.8544, + "step": 74692 + }, + { + "epoch": 0.9336983424585614, + "grad_norm": 0.000479083857499063, + "learning_rate": 2.6662233977011844e-07, + "loss": 0.026, + "step": 74694 + }, + { + "epoch": 0.9337233430835771, + "grad_norm": 4.547019958496094, + "learning_rate": 2.664222024888552e-07, + "loss": 1.1625, + "step": 74696 + }, + { + "epoch": 0.9337483437085927, + "grad_norm": 1.6609907150268555, + "learning_rate": 2.6622213933714557e-07, + "loss": 0.0734, + "step": 74698 + }, + { + "epoch": 0.9337733443336084, + "grad_norm": 4.206412315368652, + "learning_rate": 2.6602215031651055e-07, + "loss": 0.8569, + "step": 74700 + }, + { + "epoch": 0.933798344958624, + "grad_norm": 5.745494365692139, + "learning_rate": 2.6582223542847563e-07, + "loss": 1.5832, + "step": 74702 + }, + { + "epoch": 0.9338233455836396, + "grad_norm": 3.923469066619873, + "learning_rate": 2.6562239467456065e-07, + "loss": 0.3199, + "step": 74704 + }, + { + "epoch": 0.9338483462086552, + "grad_norm": 5.095101833343506, + "learning_rate": 2.6542262805628884e-07, + "loss": 1.112, + "step": 74706 + }, + { + "epoch": 0.9338733468336708, + "grad_norm": 3.6516849994659424, + "learning_rate": 2.6522293557518343e-07, + "loss": 0.4924, + "step": 74708 + }, + { + "epoch": 0.9338983474586865, + "grad_norm": 0.00027932750526815653, + "learning_rate": 2.650233172327621e-07, + "loss": 0.4329, + "step": 74710 + }, + { + "epoch": 0.9339233480837021, + "grad_norm": 6.367856502532959, + "learning_rate": 2.6482377303054696e-07, + "loss": 0.3353, + "step": 74712 + }, + { + "epoch": 0.9339483487087177, + "grad_norm": 6.543343544006348, + "learning_rate": 2.646243029700568e-07, + "loss": 1.1289, + "step": 74714 + }, + { + "epoch": 0.9339733493337333, + "grad_norm": 1.596691370010376, + "learning_rate": 2.6442490705281266e-07, + "loss": 0.0844, + "step": 74716 + }, + { + "epoch": 0.933998349958749, + "grad_norm": 2.983854055404663, + "learning_rate": 2.6422558528032995e-07, + "loss": 0.7863, + "step": 74718 + }, + { + "epoch": 0.9340233505837646, + "grad_norm": 4.453397274017334, + "learning_rate": 2.640263376541297e-07, + "loss": 0.6639, + "step": 74720 + }, + { + "epoch": 0.9340483512087803, + "grad_norm": 3.2782537937164307, + "learning_rate": 2.638271641757273e-07, + "loss": 1.1101, + "step": 74722 + }, + { + "epoch": 0.9340733518337958, + "grad_norm": 1.8713749647140503, + "learning_rate": 2.636280648466405e-07, + "loss": 0.8244, + "step": 74724 + }, + { + "epoch": 0.9340983524588115, + "grad_norm": 6.498240947723389, + "learning_rate": 2.6342903966838584e-07, + "loss": 1.4626, + "step": 74726 + }, + { + "epoch": 0.9341233530838271, + "grad_norm": 0.059415291994810104, + "learning_rate": 2.632300886424777e-07, + "loss": 0.8186, + "step": 74728 + }, + { + "epoch": 0.9341483537088427, + "grad_norm": 4.838092803955078, + "learning_rate": 2.630312117704326e-07, + "loss": 1.0494, + "step": 74730 + }, + { + "epoch": 0.9341733543338584, + "grad_norm": 2.494415044784546, + "learning_rate": 2.6283240905376593e-07, + "loss": 0.0768, + "step": 74732 + }, + { + "epoch": 0.9341983549588739, + "grad_norm": 2.9485039710998535, + "learning_rate": 2.6263368049399107e-07, + "loss": 1.8533, + "step": 74734 + }, + { + "epoch": 0.9342233555838896, + "grad_norm": 6.995232582092285, + "learning_rate": 2.6243502609262006e-07, + "loss": 1.8705, + "step": 74736 + }, + { + "epoch": 0.9342483562089052, + "grad_norm": 2.158768892288208, + "learning_rate": 2.6223644585116617e-07, + "loss": 0.2959, + "step": 74738 + }, + { + "epoch": 0.9342733568339209, + "grad_norm": 3.894131898880005, + "learning_rate": 2.620379397711437e-07, + "loss": 1.1364, + "step": 74740 + }, + { + "epoch": 0.9342983574589365, + "grad_norm": 3.8434596061706543, + "learning_rate": 2.6183950785406256e-07, + "loss": 0.8641, + "step": 74742 + }, + { + "epoch": 0.9343233580839521, + "grad_norm": 3.7541205883026123, + "learning_rate": 2.616411501014349e-07, + "loss": 1.3831, + "step": 74744 + }, + { + "epoch": 0.9343483587089677, + "grad_norm": 0.0006207427941262722, + "learning_rate": 2.614428665147717e-07, + "loss": 1.192, + "step": 74746 + }, + { + "epoch": 0.9343733593339834, + "grad_norm": 3.742393732070923, + "learning_rate": 2.6124465709558177e-07, + "loss": 0.6241, + "step": 74748 + }, + { + "epoch": 0.934398359958999, + "grad_norm": 1.9513670206069946, + "learning_rate": 2.6104652184537616e-07, + "loss": 0.7139, + "step": 74750 + }, + { + "epoch": 0.9344233605840147, + "grad_norm": 0.0003885526384692639, + "learning_rate": 2.608484607656614e-07, + "loss": 0.6121, + "step": 74752 + }, + { + "epoch": 0.9344483612090302, + "grad_norm": 3.6200051307678223, + "learning_rate": 2.606504738579485e-07, + "loss": 0.4432, + "step": 74754 + }, + { + "epoch": 0.9344733618340458, + "grad_norm": 2.73507022857666, + "learning_rate": 2.6045256112374405e-07, + "loss": 1.3059, + "step": 74756 + }, + { + "epoch": 0.9344983624590615, + "grad_norm": 2.3204238414764404, + "learning_rate": 2.6025472256455465e-07, + "loss": 0.511, + "step": 74758 + }, + { + "epoch": 0.9345233630840771, + "grad_norm": 2.8745288848876953, + "learning_rate": 2.6005695818189126e-07, + "loss": 0.885, + "step": 74760 + }, + { + "epoch": 0.9345483637090928, + "grad_norm": 0.00032940396340563893, + "learning_rate": 2.598592679772538e-07, + "loss": 0.0619, + "step": 74762 + }, + { + "epoch": 0.9345733643341083, + "grad_norm": 3.397176504135132, + "learning_rate": 2.596616519521522e-07, + "loss": 1.7251, + "step": 74764 + }, + { + "epoch": 0.934598364959124, + "grad_norm": 4.522034645080566, + "learning_rate": 2.594641101080886e-07, + "loss": 1.9584, + "step": 74766 + }, + { + "epoch": 0.9346233655841396, + "grad_norm": 2.5128684043884277, + "learning_rate": 2.592666424465684e-07, + "loss": 0.44, + "step": 74768 + }, + { + "epoch": 0.9346483662091553, + "grad_norm": 0.00035347725497558713, + "learning_rate": 2.5906924896909826e-07, + "loss": 0.0406, + "step": 74770 + }, + { + "epoch": 0.9346733668341709, + "grad_norm": 4.858247756958008, + "learning_rate": 2.58871929677178e-07, + "loss": 1.1889, + "step": 74772 + }, + { + "epoch": 0.9346983674591864, + "grad_norm": 4.780028343200684, + "learning_rate": 2.5867468457231313e-07, + "loss": 1.139, + "step": 74774 + }, + { + "epoch": 0.9347233680842021, + "grad_norm": 3.8827216625213623, + "learning_rate": 2.584775136560025e-07, + "loss": 0.7289, + "step": 74776 + }, + { + "epoch": 0.9347483687092177, + "grad_norm": 8.47346019744873, + "learning_rate": 2.5828041692975035e-07, + "loss": 1.485, + "step": 74778 + }, + { + "epoch": 0.9347733693342334, + "grad_norm": 14.159733772277832, + "learning_rate": 2.580833943950556e-07, + "loss": 2.5076, + "step": 74780 + }, + { + "epoch": 0.934798369959249, + "grad_norm": 4.440831661224365, + "learning_rate": 2.5788644605342026e-07, + "loss": 1.2883, + "step": 74782 + }, + { + "epoch": 0.9348233705842646, + "grad_norm": 3.5030388832092285, + "learning_rate": 2.576895719063455e-07, + "loss": 1.2226, + "step": 74784 + }, + { + "epoch": 0.9348483712092802, + "grad_norm": 0.0005868162261322141, + "learning_rate": 2.5749277195533e-07, + "loss": 1.4178, + "step": 74786 + }, + { + "epoch": 0.9348733718342959, + "grad_norm": 3.2037198543548584, + "learning_rate": 2.572960462018703e-07, + "loss": 0.2737, + "step": 74788 + }, + { + "epoch": 0.9348983724593115, + "grad_norm": 2.5481364727020264, + "learning_rate": 2.570993946474665e-07, + "loss": 0.4947, + "step": 74790 + }, + { + "epoch": 0.9349233730843272, + "grad_norm": 3.860588788986206, + "learning_rate": 2.56902817293615e-07, + "loss": 0.7042, + "step": 74792 + }, + { + "epoch": 0.9349483737093427, + "grad_norm": 2.212181806564331, + "learning_rate": 2.567063141418147e-07, + "loss": 0.3345, + "step": 74794 + }, + { + "epoch": 0.9349733743343583, + "grad_norm": 2.8070147037506104, + "learning_rate": 2.56509885193561e-07, + "loss": 1.348, + "step": 74796 + }, + { + "epoch": 0.934998374959374, + "grad_norm": 4.181332111358643, + "learning_rate": 2.563135304503517e-07, + "loss": 0.8212, + "step": 74798 + }, + { + "epoch": 0.9350233755843896, + "grad_norm": 2.8126583099365234, + "learning_rate": 2.561172499136788e-07, + "loss": 0.8878, + "step": 74800 + }, + { + "epoch": 0.9350483762094053, + "grad_norm": 0.0003979430184699595, + "learning_rate": 2.559210435850401e-07, + "loss": 0.0234, + "step": 74802 + }, + { + "epoch": 0.9350733768344208, + "grad_norm": 3.1834776401519775, + "learning_rate": 2.5572491146592883e-07, + "loss": 1.0441, + "step": 74804 + }, + { + "epoch": 0.9350983774594365, + "grad_norm": 0.00022161044762469828, + "learning_rate": 2.555288535578371e-07, + "loss": 0.1025, + "step": 74806 + }, + { + "epoch": 0.9351233780844521, + "grad_norm": 2.969985008239746, + "learning_rate": 2.5533286986226146e-07, + "loss": 1.3642, + "step": 74808 + }, + { + "epoch": 0.9351483787094678, + "grad_norm": 2.93283748626709, + "learning_rate": 2.551369603806908e-07, + "loss": 0.6551, + "step": 74810 + }, + { + "epoch": 0.9351733793344834, + "grad_norm": 0.0003886394260916859, + "learning_rate": 2.5494112511462165e-07, + "loss": 0.6788, + "step": 74812 + }, + { + "epoch": 0.935198379959499, + "grad_norm": 2.428622245788574, + "learning_rate": 2.5474536406554065e-07, + "loss": 0.5006, + "step": 74814 + }, + { + "epoch": 0.9352233805845146, + "grad_norm": 4.963048934936523, + "learning_rate": 2.5454967723494206e-07, + "loss": 1.6695, + "step": 74816 + }, + { + "epoch": 0.9352483812095302, + "grad_norm": 2.311220645904541, + "learning_rate": 2.5435406462431365e-07, + "loss": 0.8094, + "step": 74818 + }, + { + "epoch": 0.9352733818345459, + "grad_norm": 0.00041208701441064477, + "learning_rate": 2.5415852623514756e-07, + "loss": 0.9461, + "step": 74820 + }, + { + "epoch": 0.9352983824595615, + "grad_norm": 5.5739617347717285, + "learning_rate": 2.539630620689315e-07, + "loss": 2.5821, + "step": 74822 + }, + { + "epoch": 0.9353233830845771, + "grad_norm": 0.00038136434159241617, + "learning_rate": 2.5376767212715537e-07, + "loss": 0.7882, + "step": 74824 + }, + { + "epoch": 0.9353483837095927, + "grad_norm": 3.3841726779937744, + "learning_rate": 2.5357235641130574e-07, + "loss": 1.34, + "step": 74826 + }, + { + "epoch": 0.9353733843346084, + "grad_norm": 2.9636363983154297, + "learning_rate": 2.533771149228703e-07, + "loss": 0.6378, + "step": 74828 + }, + { + "epoch": 0.935398384959624, + "grad_norm": 0.44320812821388245, + "learning_rate": 2.531819476633368e-07, + "loss": 1.6918, + "step": 74830 + }, + { + "epoch": 0.9354233855846397, + "grad_norm": 2.457773208618164, + "learning_rate": 2.529868546341918e-07, + "loss": 1.1203, + "step": 74832 + }, + { + "epoch": 0.9354483862096552, + "grad_norm": 3.238093852996826, + "learning_rate": 2.527918358369197e-07, + "loss": 0.7503, + "step": 74834 + }, + { + "epoch": 0.9354733868346709, + "grad_norm": 1.959341287612915, + "learning_rate": 2.52596891273007e-07, + "loss": 2.0256, + "step": 74836 + }, + { + "epoch": 0.9354983874596865, + "grad_norm": 3.83776593208313, + "learning_rate": 2.5240202094393927e-07, + "loss": 0.7385, + "step": 74838 + }, + { + "epoch": 0.9355233880847021, + "grad_norm": 3.5600154399871826, + "learning_rate": 2.5220722485119865e-07, + "loss": 0.9921, + "step": 74840 + }, + { + "epoch": 0.9355483887097178, + "grad_norm": 0.5958260893821716, + "learning_rate": 2.5201250299626836e-07, + "loss": 1.113, + "step": 74842 + }, + { + "epoch": 0.9355733893347333, + "grad_norm": 3.346240282058716, + "learning_rate": 2.5181785538063166e-07, + "loss": 0.8064, + "step": 74844 + }, + { + "epoch": 0.935598389959749, + "grad_norm": 4.527527332305908, + "learning_rate": 2.516232820057729e-07, + "loss": 0.8983, + "step": 74846 + }, + { + "epoch": 0.9356233905847646, + "grad_norm": 2.75557017326355, + "learning_rate": 2.5142878287317206e-07, + "loss": 1.7325, + "step": 74848 + }, + { + "epoch": 0.9356483912097803, + "grad_norm": 4.3346662521362305, + "learning_rate": 2.512343579843135e-07, + "loss": 1.9753, + "step": 74850 + }, + { + "epoch": 0.9356733918347959, + "grad_norm": 1.5599910020828247, + "learning_rate": 2.510400073406727e-07, + "loss": 0.8686, + "step": 74852 + }, + { + "epoch": 0.9356983924598115, + "grad_norm": 1.9429388046264648, + "learning_rate": 2.508457309437329e-07, + "loss": 0.4013, + "step": 74854 + }, + { + "epoch": 0.9357233930848271, + "grad_norm": 4.328695774078369, + "learning_rate": 2.50651528794974e-07, + "loss": 1.1779, + "step": 74856 + }, + { + "epoch": 0.9357483937098428, + "grad_norm": 2.1541378498077393, + "learning_rate": 2.504574008958738e-07, + "loss": 0.3977, + "step": 74858 + }, + { + "epoch": 0.9357733943348584, + "grad_norm": 3.2394087314605713, + "learning_rate": 2.5026334724791213e-07, + "loss": 1.2281, + "step": 74860 + }, + { + "epoch": 0.935798394959874, + "grad_norm": 10.007871627807617, + "learning_rate": 2.5006936785256566e-07, + "loss": 1.9045, + "step": 74862 + }, + { + "epoch": 0.9358233955848896, + "grad_norm": 3.9348268508911133, + "learning_rate": 2.4987546271131314e-07, + "loss": 1.4499, + "step": 74864 + }, + { + "epoch": 0.9358483962099052, + "grad_norm": 4.475656032562256, + "learning_rate": 2.49681631825629e-07, + "loss": 0.9703, + "step": 74866 + }, + { + "epoch": 0.9358733968349209, + "grad_norm": 0.0003631038125604391, + "learning_rate": 2.494878751969898e-07, + "loss": 0.6228, + "step": 74868 + }, + { + "epoch": 0.9358983974599365, + "grad_norm": 2.3792037963867188, + "learning_rate": 2.492941928268744e-07, + "loss": 0.103, + "step": 74870 + }, + { + "epoch": 0.9359233980849522, + "grad_norm": 1.9241664409637451, + "learning_rate": 2.491005847167538e-07, + "loss": 1.1848, + "step": 74872 + }, + { + "epoch": 0.9359483987099677, + "grad_norm": 9.93982219696045, + "learning_rate": 2.489070508681057e-07, + "loss": 0.7189, + "step": 74874 + }, + { + "epoch": 0.9359733993349834, + "grad_norm": 0.37141406536102295, + "learning_rate": 2.4871359128240233e-07, + "loss": 0.3421, + "step": 74876 + }, + { + "epoch": 0.935998399959999, + "grad_norm": 3.4395108222961426, + "learning_rate": 2.485202059611158e-07, + "loss": 0.8546, + "step": 74878 + }, + { + "epoch": 0.9360234005850147, + "grad_norm": 3.56813645362854, + "learning_rate": 2.483268949057216e-07, + "loss": 1.0859, + "step": 74880 + }, + { + "epoch": 0.9360484012100303, + "grad_norm": 0.00039199989987537265, + "learning_rate": 2.4813365811768964e-07, + "loss": 1.0136, + "step": 74882 + }, + { + "epoch": 0.9360734018350458, + "grad_norm": 0.002878605155274272, + "learning_rate": 2.479404955984943e-07, + "loss": 0.7577, + "step": 74884 + }, + { + "epoch": 0.9360984024600615, + "grad_norm": 3.1260154247283936, + "learning_rate": 2.4774740734960443e-07, + "loss": 0.5762, + "step": 74886 + }, + { + "epoch": 0.9361234030850771, + "grad_norm": 2.421891927719116, + "learning_rate": 2.475543933724911e-07, + "loss": 1.1541, + "step": 74888 + }, + { + "epoch": 0.9361484037100928, + "grad_norm": 0.00047663928125984967, + "learning_rate": 2.473614536686253e-07, + "loss": 0.5888, + "step": 74890 + }, + { + "epoch": 0.9361734043351084, + "grad_norm": 1.2619526386260986, + "learning_rate": 2.4716858823947365e-07, + "loss": 0.0518, + "step": 74892 + }, + { + "epoch": 0.936198404960124, + "grad_norm": 2.412248134613037, + "learning_rate": 2.4697579708650833e-07, + "loss": 1.0238, + "step": 74894 + }, + { + "epoch": 0.9362234055851396, + "grad_norm": 5.239733695983887, + "learning_rate": 2.467830802111948e-07, + "loss": 1.3749, + "step": 74896 + }, + { + "epoch": 0.9362484062101553, + "grad_norm": 4.115893840789795, + "learning_rate": 2.4659043761500414e-07, + "loss": 2.0223, + "step": 74898 + }, + { + "epoch": 0.9362734068351709, + "grad_norm": 0.0003966902440879494, + "learning_rate": 2.4639786929939955e-07, + "loss": 0.1161, + "step": 74900 + }, + { + "epoch": 0.9362984074601866, + "grad_norm": 2.2697699069976807, + "learning_rate": 2.4620537526585107e-07, + "loss": 0.9125, + "step": 74902 + }, + { + "epoch": 0.9363234080852021, + "grad_norm": 0.0014936440857127309, + "learning_rate": 2.4601295551582193e-07, + "loss": 0.1084, + "step": 74904 + }, + { + "epoch": 0.9363484087102177, + "grad_norm": 3.5613062381744385, + "learning_rate": 2.4582061005077873e-07, + "loss": 0.8651, + "step": 74906 + }, + { + "epoch": 0.9363734093352334, + "grad_norm": 2.892226457595825, + "learning_rate": 2.4562833887218693e-07, + "loss": 0.7783, + "step": 74908 + }, + { + "epoch": 0.936398409960249, + "grad_norm": 4.126266002655029, + "learning_rate": 2.454361419815099e-07, + "loss": 0.3776, + "step": 74910 + }, + { + "epoch": 0.9364234105852647, + "grad_norm": 3.0594260692596436, + "learning_rate": 2.4524401938021304e-07, + "loss": 1.1451, + "step": 74912 + }, + { + "epoch": 0.9364484112102802, + "grad_norm": 2.271418571472168, + "learning_rate": 2.4505197106975633e-07, + "loss": 0.2071, + "step": 74914 + }, + { + "epoch": 0.9364734118352959, + "grad_norm": 3.0623347759246826, + "learning_rate": 2.4485999705160636e-07, + "loss": 0.41, + "step": 74916 + }, + { + "epoch": 0.9364984124603115, + "grad_norm": 0.0002818934735842049, + "learning_rate": 2.4466809732722197e-07, + "loss": 0.4649, + "step": 74918 + }, + { + "epoch": 0.9365234130853272, + "grad_norm": 1.7485042810440063, + "learning_rate": 2.444762718980653e-07, + "loss": 0.6655, + "step": 74920 + }, + { + "epoch": 0.9365484137103428, + "grad_norm": 5.715327262878418, + "learning_rate": 2.442845207655986e-07, + "loss": 1.7138, + "step": 74922 + }, + { + "epoch": 0.9365734143353583, + "grad_norm": 4.2226996421813965, + "learning_rate": 2.4409284393128176e-07, + "loss": 0.9727, + "step": 74924 + }, + { + "epoch": 0.936598414960374, + "grad_norm": 1.5466878414154053, + "learning_rate": 2.439012413965736e-07, + "loss": 0.6511, + "step": 74926 + }, + { + "epoch": 0.9366234155853896, + "grad_norm": 3.1221940517425537, + "learning_rate": 2.43709713162934e-07, + "loss": 1.3992, + "step": 74928 + }, + { + "epoch": 0.9366484162104053, + "grad_norm": 2.4189910888671875, + "learning_rate": 2.4351825923182194e-07, + "loss": 1.1571, + "step": 74930 + }, + { + "epoch": 0.9366734168354209, + "grad_norm": 3.5732641220092773, + "learning_rate": 2.43326879604695e-07, + "loss": 0.7678, + "step": 74932 + }, + { + "epoch": 0.9366984174604365, + "grad_norm": 2.9393181800842285, + "learning_rate": 2.4313557428300994e-07, + "loss": 0.559, + "step": 74934 + }, + { + "epoch": 0.9367234180854521, + "grad_norm": 0.012774449773132801, + "learning_rate": 2.429443432682255e-07, + "loss": 0.6514, + "step": 74936 + }, + { + "epoch": 0.9367484187104678, + "grad_norm": 0.0005391744780354202, + "learning_rate": 2.4275318656179713e-07, + "loss": 0.646, + "step": 74938 + }, + { + "epoch": 0.9367734193354834, + "grad_norm": 1.2720913887023926, + "learning_rate": 2.4256210416518046e-07, + "loss": 0.0371, + "step": 74940 + }, + { + "epoch": 0.9367984199604991, + "grad_norm": 1.030594825744629, + "learning_rate": 2.42371096079832e-07, + "loss": 0.102, + "step": 74942 + }, + { + "epoch": 0.9368234205855146, + "grad_norm": 2.5420401096343994, + "learning_rate": 2.42180162307204e-07, + "loss": 0.361, + "step": 74944 + }, + { + "epoch": 0.9368484212105302, + "grad_norm": 3.097900629043579, + "learning_rate": 2.41989302848753e-07, + "loss": 0.5682, + "step": 74946 + }, + { + "epoch": 0.9368734218355459, + "grad_norm": 1.5526854991912842, + "learning_rate": 2.417985177059301e-07, + "loss": 0.4545, + "step": 74948 + }, + { + "epoch": 0.9368984224605615, + "grad_norm": 2.2898008823394775, + "learning_rate": 2.4160780688019083e-07, + "loss": 1.1013, + "step": 74950 + }, + { + "epoch": 0.9369234230855772, + "grad_norm": 3.2452166080474854, + "learning_rate": 2.414171703729862e-07, + "loss": 0.5977, + "step": 74952 + }, + { + "epoch": 0.9369484237105927, + "grad_norm": 5.761171340942383, + "learning_rate": 2.412266081857684e-07, + "loss": 1.2562, + "step": 74954 + }, + { + "epoch": 0.9369734243356084, + "grad_norm": 3.441051959991455, + "learning_rate": 2.4103612031998957e-07, + "loss": 1.6501, + "step": 74956 + }, + { + "epoch": 0.936998424960624, + "grad_norm": 6.951074600219727, + "learning_rate": 2.408457067770986e-07, + "loss": 1.1016, + "step": 74958 + }, + { + "epoch": 0.9370234255856397, + "grad_norm": 2.5113325119018555, + "learning_rate": 2.406553675585477e-07, + "loss": 0.8778, + "step": 74960 + }, + { + "epoch": 0.9370484262106553, + "grad_norm": 3.4853899478912354, + "learning_rate": 2.4046510266578447e-07, + "loss": 1.2283, + "step": 74962 + }, + { + "epoch": 0.9370734268356709, + "grad_norm": 3.7123963832855225, + "learning_rate": 2.40274912100259e-07, + "loss": 1.2306, + "step": 74964 + }, + { + "epoch": 0.9370984274606865, + "grad_norm": 3.2225301265716553, + "learning_rate": 2.4008479586342006e-07, + "loss": 1.5233, + "step": 74966 + }, + { + "epoch": 0.9371234280857021, + "grad_norm": 2.574002504348755, + "learning_rate": 2.3989475395671535e-07, + "loss": 0.4976, + "step": 74968 + }, + { + "epoch": 0.9371484287107178, + "grad_norm": 0.003384338691830635, + "learning_rate": 2.397047863815916e-07, + "loss": 0.3607, + "step": 74970 + }, + { + "epoch": 0.9371734293357334, + "grad_norm": 3.890587329864502, + "learning_rate": 2.395148931394953e-07, + "loss": 0.5436, + "step": 74972 + }, + { + "epoch": 0.937198429960749, + "grad_norm": 0.0593395009636879, + "learning_rate": 2.3932507423187425e-07, + "loss": 0.4687, + "step": 74974 + }, + { + "epoch": 0.9372234305857646, + "grad_norm": 2.7678885459899902, + "learning_rate": 2.391353296601728e-07, + "loss": 2.3397, + "step": 74976 + }, + { + "epoch": 0.9372484312107803, + "grad_norm": 2.7741236686706543, + "learning_rate": 2.389456594258366e-07, + "loss": 0.8579, + "step": 74978 + }, + { + "epoch": 0.9372734318357959, + "grad_norm": 8.788436889648438, + "learning_rate": 2.3875606353030876e-07, + "loss": 1.208, + "step": 74980 + }, + { + "epoch": 0.9372984324608116, + "grad_norm": 1.6871272325515747, + "learning_rate": 2.3856654197503603e-07, + "loss": 0.7129, + "step": 74982 + }, + { + "epoch": 0.9373234330858271, + "grad_norm": 2.7002921104431152, + "learning_rate": 2.3837709476146053e-07, + "loss": 0.7688, + "step": 74984 + }, + { + "epoch": 0.9373484337108428, + "grad_norm": 3.3412728309631348, + "learning_rate": 2.3818772189102224e-07, + "loss": 1.2092, + "step": 74986 + }, + { + "epoch": 0.9373734343358584, + "grad_norm": 3.553842067718506, + "learning_rate": 2.3799842336516777e-07, + "loss": 1.2036, + "step": 74988 + }, + { + "epoch": 0.937398434960874, + "grad_norm": 3.1428136825561523, + "learning_rate": 2.3780919918533596e-07, + "loss": 0.5134, + "step": 74990 + }, + { + "epoch": 0.9374234355858897, + "grad_norm": 2.127660036087036, + "learning_rate": 2.37620049352969e-07, + "loss": 0.9687, + "step": 74992 + }, + { + "epoch": 0.9374484362109052, + "grad_norm": 0.00026671867817640305, + "learning_rate": 2.3743097386950797e-07, + "loss": 0.0, + "step": 74994 + }, + { + "epoch": 0.9374734368359209, + "grad_norm": 0.0012321554822847247, + "learning_rate": 2.372419727363906e-07, + "loss": 0.7865, + "step": 74996 + }, + { + "epoch": 0.9374984374609365, + "grad_norm": 0.0004970654845237732, + "learning_rate": 2.3705304595505906e-07, + "loss": 0.8484, + "step": 74998 + }, + { + "epoch": 0.9375234380859522, + "grad_norm": 5.452506065368652, + "learning_rate": 2.3686419352695e-07, + "loss": 1.4108, + "step": 75000 + }, + { + "epoch": 0.9375484387109678, + "grad_norm": 2.256993055343628, + "learning_rate": 2.3667541545350336e-07, + "loss": 0.5446, + "step": 75002 + }, + { + "epoch": 0.9375734393359834, + "grad_norm": 3.724605083465576, + "learning_rate": 2.3648671173615578e-07, + "loss": 1.9664, + "step": 75004 + }, + { + "epoch": 0.937598439960999, + "grad_norm": 2.0880446434020996, + "learning_rate": 2.36298082376345e-07, + "loss": 1.5061, + "step": 75006 + }, + { + "epoch": 0.9376234405860147, + "grad_norm": 2.068772554397583, + "learning_rate": 2.3610952737550762e-07, + "loss": 1.1939, + "step": 75008 + }, + { + "epoch": 0.9376484412110303, + "grad_norm": 0.00036599315353669226, + "learning_rate": 2.359210467350781e-07, + "loss": 0.0, + "step": 75010 + }, + { + "epoch": 0.937673441836046, + "grad_norm": 4.138408184051514, + "learning_rate": 2.3573264045649414e-07, + "loss": 1.3049, + "step": 75012 + }, + { + "epoch": 0.9376984424610615, + "grad_norm": 3.9904448986053467, + "learning_rate": 2.3554430854118903e-07, + "loss": 1.1763, + "step": 75014 + }, + { + "epoch": 0.9377234430860771, + "grad_norm": 5.535028457641602, + "learning_rate": 2.3535605099059723e-07, + "loss": 1.7202, + "step": 75016 + }, + { + "epoch": 0.9377484437110928, + "grad_norm": 11.356961250305176, + "learning_rate": 2.3516786780615307e-07, + "loss": 1.2686, + "step": 75018 + }, + { + "epoch": 0.9377734443361084, + "grad_norm": 3.8459348678588867, + "learning_rate": 2.3497975898929104e-07, + "loss": 1.3011, + "step": 75020 + }, + { + "epoch": 0.9377984449611241, + "grad_norm": 1.8247069120407104, + "learning_rate": 2.3479172454144107e-07, + "loss": 1.1791, + "step": 75022 + }, + { + "epoch": 0.9378234455861396, + "grad_norm": 2.7312378883361816, + "learning_rate": 2.3460376446403643e-07, + "loss": 0.6343, + "step": 75024 + }, + { + "epoch": 0.9378484462111553, + "grad_norm": 2.5001182556152344, + "learning_rate": 2.3441587875850936e-07, + "loss": 0.6571, + "step": 75026 + }, + { + "epoch": 0.9378734468361709, + "grad_norm": 2.54431414604187, + "learning_rate": 2.3422806742628868e-07, + "loss": 0.9225, + "step": 75028 + }, + { + "epoch": 0.9378984474611866, + "grad_norm": 2.90493106842041, + "learning_rate": 2.3404033046880659e-07, + "loss": 1.2905, + "step": 75030 + }, + { + "epoch": 0.9379234480862022, + "grad_norm": 5.134329795837402, + "learning_rate": 2.3385266788749196e-07, + "loss": 1.6148, + "step": 75032 + }, + { + "epoch": 0.9379484487112177, + "grad_norm": 3.5398876667022705, + "learning_rate": 2.3366507968377583e-07, + "loss": 0.3613, + "step": 75034 + }, + { + "epoch": 0.9379734493362334, + "grad_norm": 1.8224215507507324, + "learning_rate": 2.3347756585908488e-07, + "loss": 0.876, + "step": 75036 + }, + { + "epoch": 0.937998449961249, + "grad_norm": 3.5358376502990723, + "learning_rate": 2.3329012641484572e-07, + "loss": 0.3764, + "step": 75038 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 3.8973889350891113, + "learning_rate": 2.3310276135249054e-07, + "loss": 1.8534, + "step": 75040 + }, + { + "epoch": 0.9380484512112803, + "grad_norm": 4.505960941314697, + "learning_rate": 2.3291547067344157e-07, + "loss": 1.6771, + "step": 75042 + }, + { + "epoch": 0.9380734518362959, + "grad_norm": 2.633021354675293, + "learning_rate": 2.327282543791276e-07, + "loss": 1.2507, + "step": 75044 + }, + { + "epoch": 0.9380984524613115, + "grad_norm": 2.0299293994903564, + "learning_rate": 2.3254111247097533e-07, + "loss": 1.1288, + "step": 75046 + }, + { + "epoch": 0.9381234530863272, + "grad_norm": 1.9816789627075195, + "learning_rate": 2.3235404495040693e-07, + "loss": 0.8535, + "step": 75048 + }, + { + "epoch": 0.9381484537113428, + "grad_norm": 4.204047679901123, + "learning_rate": 2.3216705181885014e-07, + "loss": 2.199, + "step": 75050 + }, + { + "epoch": 0.9381734543363585, + "grad_norm": 5.055904388427734, + "learning_rate": 2.3198013307772604e-07, + "loss": 0.6157, + "step": 75052 + }, + { + "epoch": 0.938198454961374, + "grad_norm": 3.6182587146759033, + "learning_rate": 2.3179328872846242e-07, + "loss": 0.8191, + "step": 75054 + }, + { + "epoch": 0.9382234555863896, + "grad_norm": 1.9191170930862427, + "learning_rate": 2.3160651877247696e-07, + "loss": 1.0473, + "step": 75056 + }, + { + "epoch": 0.9382484562114053, + "grad_norm": 4.364383697509766, + "learning_rate": 2.3141982321119748e-07, + "loss": 0.7876, + "step": 75058 + }, + { + "epoch": 0.9382734568364209, + "grad_norm": 4.219630241394043, + "learning_rate": 2.3123320204604281e-07, + "loss": 0.6958, + "step": 75060 + }, + { + "epoch": 0.9382984574614366, + "grad_norm": 3.9209437370300293, + "learning_rate": 2.3104665527843295e-07, + "loss": 2.5544, + "step": 75062 + }, + { + "epoch": 0.9383234580864521, + "grad_norm": 5.007072925567627, + "learning_rate": 2.3086018290979228e-07, + "loss": 0.9617, + "step": 75064 + }, + { + "epoch": 0.9383484587114678, + "grad_norm": 3.4693262577056885, + "learning_rate": 2.3067378494153747e-07, + "loss": 0.7394, + "step": 75066 + }, + { + "epoch": 0.9383734593364834, + "grad_norm": 4.667388916015625, + "learning_rate": 2.3048746137508958e-07, + "loss": 1.3028, + "step": 75068 + }, + { + "epoch": 0.9383984599614991, + "grad_norm": 5.05582857131958, + "learning_rate": 2.3030121221186864e-07, + "loss": 1.9248, + "step": 75070 + }, + { + "epoch": 0.9384234605865147, + "grad_norm": 2.951749086380005, + "learning_rate": 2.301150374532912e-07, + "loss": 0.3175, + "step": 75072 + }, + { + "epoch": 0.9384484612115302, + "grad_norm": 3.5167462825775146, + "learning_rate": 2.2992893710077734e-07, + "loss": 0.4743, + "step": 75074 + }, + { + "epoch": 0.9384734618365459, + "grad_norm": 1.6019072532653809, + "learning_rate": 2.2974291115574142e-07, + "loss": 1.02, + "step": 75076 + }, + { + "epoch": 0.9384984624615615, + "grad_norm": 0.0004605262365657836, + "learning_rate": 2.2955695961960344e-07, + "loss": 0.6776, + "step": 75078 + }, + { + "epoch": 0.9385234630865772, + "grad_norm": 1.057486653327942, + "learning_rate": 2.2937108249377672e-07, + "loss": 0.5078, + "step": 75080 + }, + { + "epoch": 0.9385484637115928, + "grad_norm": 4.161149978637695, + "learning_rate": 2.2918527977967898e-07, + "loss": 1.7172, + "step": 75082 + }, + { + "epoch": 0.9385734643366084, + "grad_norm": 0.001214517978951335, + "learning_rate": 2.2899955147872355e-07, + "loss": 0.0441, + "step": 75084 + }, + { + "epoch": 0.938598464961624, + "grad_norm": 2.08821177482605, + "learning_rate": 2.288138975923271e-07, + "loss": 0.5653, + "step": 75086 + }, + { + "epoch": 0.9386234655866397, + "grad_norm": 1.7039830684661865, + "learning_rate": 2.286283181219018e-07, + "loss": 0.5026, + "step": 75088 + }, + { + "epoch": 0.9386484662116553, + "grad_norm": 10.378896713256836, + "learning_rate": 2.2844281306886096e-07, + "loss": 0.56, + "step": 75090 + }, + { + "epoch": 0.938673466836671, + "grad_norm": 0.0012459703721106052, + "learning_rate": 2.282573824346179e-07, + "loss": 0.9744, + "step": 75092 + }, + { + "epoch": 0.9386984674616865, + "grad_norm": 2.501418113708496, + "learning_rate": 2.280720262205849e-07, + "loss": 1.0101, + "step": 75094 + }, + { + "epoch": 0.9387234680867022, + "grad_norm": 0.4152597486972809, + "learning_rate": 2.2788674442817405e-07, + "loss": 0.2695, + "step": 75096 + }, + { + "epoch": 0.9387484687117178, + "grad_norm": 2.131434440612793, + "learning_rate": 2.277015370587954e-07, + "loss": 1.0856, + "step": 75098 + }, + { + "epoch": 0.9387734693367334, + "grad_norm": 1.4406071901321411, + "learning_rate": 2.275164041138589e-07, + "loss": 0.5234, + "step": 75100 + }, + { + "epoch": 0.9387984699617491, + "grad_norm": 4.051215648651123, + "learning_rate": 2.2733134559477676e-07, + "loss": 1.9025, + "step": 75102 + }, + { + "epoch": 0.9388234705867646, + "grad_norm": 3.7096338272094727, + "learning_rate": 2.2714636150295676e-07, + "loss": 1.0852, + "step": 75104 + }, + { + "epoch": 0.9388484712117803, + "grad_norm": 6.015870571136475, + "learning_rate": 2.2696145183980777e-07, + "loss": 2.2162, + "step": 75106 + }, + { + "epoch": 0.9388734718367959, + "grad_norm": 2.552938222885132, + "learning_rate": 2.2677661660673868e-07, + "loss": 0.7951, + "step": 75108 + }, + { + "epoch": 0.9388984724618116, + "grad_norm": 4.187899589538574, + "learning_rate": 2.265918558051561e-07, + "loss": 2.1118, + "step": 75110 + }, + { + "epoch": 0.9389234730868272, + "grad_norm": 4.737877368927002, + "learning_rate": 2.2640716943647002e-07, + "loss": 1.8547, + "step": 75112 + }, + { + "epoch": 0.9389484737118428, + "grad_norm": 0.5302411913871765, + "learning_rate": 2.2622255750208265e-07, + "loss": 1.0878, + "step": 75114 + }, + { + "epoch": 0.9389734743368584, + "grad_norm": 5.313374042510986, + "learning_rate": 2.2603802000340292e-07, + "loss": 0.9816, + "step": 75116 + }, + { + "epoch": 0.938998474961874, + "grad_norm": 2.8297438621520996, + "learning_rate": 2.258535569418352e-07, + "loss": 0.5509, + "step": 75118 + }, + { + "epoch": 0.9390234755868897, + "grad_norm": 3.042750835418701, + "learning_rate": 2.2566916831878504e-07, + "loss": 1.0349, + "step": 75120 + }, + { + "epoch": 0.9390484762119053, + "grad_norm": 3.3042516708374023, + "learning_rate": 2.254848541356547e-07, + "loss": 1.0064, + "step": 75122 + }, + { + "epoch": 0.9390734768369209, + "grad_norm": 4.2566375732421875, + "learning_rate": 2.2530061439385186e-07, + "loss": 1.88, + "step": 75124 + }, + { + "epoch": 0.9390984774619365, + "grad_norm": 2.360151529312134, + "learning_rate": 2.2511644909477658e-07, + "loss": 0.6049, + "step": 75126 + }, + { + "epoch": 0.9391234780869522, + "grad_norm": 11.61207389831543, + "learning_rate": 2.2493235823983107e-07, + "loss": 1.5208, + "step": 75128 + }, + { + "epoch": 0.9391484787119678, + "grad_norm": 3.424403190612793, + "learning_rate": 2.247483418304197e-07, + "loss": 1.2735, + "step": 75130 + }, + { + "epoch": 0.9391734793369835, + "grad_norm": 3.2569291591644287, + "learning_rate": 2.2456439986794144e-07, + "loss": 0.4163, + "step": 75132 + }, + { + "epoch": 0.939198479961999, + "grad_norm": 0.00017645298794377595, + "learning_rate": 2.243805323537984e-07, + "loss": 0.0929, + "step": 75134 + }, + { + "epoch": 0.9392234805870147, + "grad_norm": 1.6825454235076904, + "learning_rate": 2.2419673928939178e-07, + "loss": 0.0669, + "step": 75136 + }, + { + "epoch": 0.9392484812120303, + "grad_norm": 1.8022602796554565, + "learning_rate": 2.240130206761193e-07, + "loss": 0.2859, + "step": 75138 + }, + { + "epoch": 0.939273481837046, + "grad_norm": 0.00018133445701096207, + "learning_rate": 2.2382937651538318e-07, + "loss": 0.6251, + "step": 75140 + }, + { + "epoch": 0.9392984824620616, + "grad_norm": 2.729600667953491, + "learning_rate": 2.2364580680857784e-07, + "loss": 0.8716, + "step": 75142 + }, + { + "epoch": 0.9393234830870771, + "grad_norm": 2.3510565757751465, + "learning_rate": 2.234623115571044e-07, + "loss": 0.1625, + "step": 75144 + }, + { + "epoch": 0.9393484837120928, + "grad_norm": 1.2206475734710693, + "learning_rate": 2.2327889076235953e-07, + "loss": 0.5774, + "step": 75146 + }, + { + "epoch": 0.9393734843371084, + "grad_norm": 3.0832607746124268, + "learning_rate": 2.2309554442573879e-07, + "loss": 0.999, + "step": 75148 + }, + { + "epoch": 0.9393984849621241, + "grad_norm": 3.6426548957824707, + "learning_rate": 2.2291227254864212e-07, + "loss": 0.9005, + "step": 75150 + }, + { + "epoch": 0.9394234855871397, + "grad_norm": 3.193206548690796, + "learning_rate": 2.2272907513246178e-07, + "loss": 1.0364, + "step": 75152 + }, + { + "epoch": 0.9394484862121553, + "grad_norm": 5.563206672668457, + "learning_rate": 2.2254595217859443e-07, + "loss": 0.5295, + "step": 75154 + }, + { + "epoch": 0.9394734868371709, + "grad_norm": 8.05706787109375, + "learning_rate": 2.223629036884334e-07, + "loss": 1.2436, + "step": 75156 + }, + { + "epoch": 0.9394984874621866, + "grad_norm": 2.0381596088409424, + "learning_rate": 2.2217992966337532e-07, + "loss": 0.982, + "step": 75158 + }, + { + "epoch": 0.9395234880872022, + "grad_norm": 2.5824337005615234, + "learning_rate": 2.2199703010481133e-07, + "loss": 1.005, + "step": 75160 + }, + { + "epoch": 0.9395484887122179, + "grad_norm": 1.961102843284607, + "learning_rate": 2.2181420501413476e-07, + "loss": 0.2773, + "step": 75162 + }, + { + "epoch": 0.9395734893372334, + "grad_norm": 2.6639039516448975, + "learning_rate": 2.2163145439274002e-07, + "loss": 1.1154, + "step": 75164 + }, + { + "epoch": 0.939598489962249, + "grad_norm": 4.887164115905762, + "learning_rate": 2.2144877824201606e-07, + "loss": 2.0121, + "step": 75166 + }, + { + "epoch": 0.9396234905872647, + "grad_norm": 0.0005715861334465444, + "learning_rate": 2.2126617656335613e-07, + "loss": 0.0771, + "step": 75168 + }, + { + "epoch": 0.9396484912122803, + "grad_norm": 3.2095203399658203, + "learning_rate": 2.210836493581492e-07, + "loss": 0.778, + "step": 75170 + }, + { + "epoch": 0.939673491837296, + "grad_norm": 2.785482883453369, + "learning_rate": 2.209011966277863e-07, + "loss": 0.884, + "step": 75172 + }, + { + "epoch": 0.9396984924623115, + "grad_norm": 2.4661858081817627, + "learning_rate": 2.207188183736586e-07, + "loss": 0.3064, + "step": 75174 + }, + { + "epoch": 0.9397234930873272, + "grad_norm": 2.1742467880249023, + "learning_rate": 2.2053651459715274e-07, + "loss": 3.1409, + "step": 75176 + }, + { + "epoch": 0.9397484937123428, + "grad_norm": 0.9475222826004028, + "learning_rate": 2.2035428529965764e-07, + "loss": 0.3629, + "step": 75178 + }, + { + "epoch": 0.9397734943373585, + "grad_norm": 0.02370646595954895, + "learning_rate": 2.2017213048256104e-07, + "loss": 0.0883, + "step": 75180 + }, + { + "epoch": 0.9397984949623741, + "grad_norm": 6.409485340118408, + "learning_rate": 2.1999005014725183e-07, + "loss": 0.9618, + "step": 75182 + }, + { + "epoch": 0.9398234955873896, + "grad_norm": 2.4407081604003906, + "learning_rate": 2.1980804429511338e-07, + "loss": 0.9532, + "step": 75184 + }, + { + "epoch": 0.9398484962124053, + "grad_norm": 0.9092492461204529, + "learning_rate": 2.1962611292753455e-07, + "loss": 0.4396, + "step": 75186 + }, + { + "epoch": 0.9398734968374209, + "grad_norm": 1.8407583236694336, + "learning_rate": 2.194442560458998e-07, + "loss": 0.6124, + "step": 75188 + }, + { + "epoch": 0.9398984974624366, + "grad_norm": 0.0006888685165904462, + "learning_rate": 2.192624736515958e-07, + "loss": 0.2907, + "step": 75190 + }, + { + "epoch": 0.9399234980874522, + "grad_norm": 3.1677069664001465, + "learning_rate": 2.190807657460059e-07, + "loss": 0.1669, + "step": 75192 + }, + { + "epoch": 0.9399484987124678, + "grad_norm": 10.69668960571289, + "learning_rate": 2.188991323305123e-07, + "loss": 1.2712, + "step": 75194 + }, + { + "epoch": 0.9399734993374834, + "grad_norm": 3.570373773574829, + "learning_rate": 2.1871757340649945e-07, + "loss": 0.2788, + "step": 75196 + }, + { + "epoch": 0.9399984999624991, + "grad_norm": 5.762576580047607, + "learning_rate": 2.1853608897535184e-07, + "loss": 2.2419, + "step": 75198 + }, + { + "epoch": 0.9400235005875147, + "grad_norm": 0.7830904126167297, + "learning_rate": 2.183546790384483e-07, + "loss": 0.0228, + "step": 75200 + }, + { + "epoch": 0.9400485012125304, + "grad_norm": 0.000403371115680784, + "learning_rate": 2.1817334359717446e-07, + "loss": 0.6316, + "step": 75202 + }, + { + "epoch": 0.9400735018375459, + "grad_norm": 3.8185067176818848, + "learning_rate": 2.1799208265290805e-07, + "loss": 0.5078, + "step": 75204 + }, + { + "epoch": 0.9400985024625615, + "grad_norm": 3.2383651733398438, + "learning_rate": 2.1781089620703133e-07, + "loss": 0.1721, + "step": 75206 + }, + { + "epoch": 0.9401235030875772, + "grad_norm": 4.406518459320068, + "learning_rate": 2.1762978426092208e-07, + "loss": 0.8591, + "step": 75208 + }, + { + "epoch": 0.9401485037125928, + "grad_norm": 1.957275390625, + "learning_rate": 2.174487468159614e-07, + "loss": 0.4596, + "step": 75210 + }, + { + "epoch": 0.9401735043376085, + "grad_norm": 3.5527381896972656, + "learning_rate": 2.172677838735282e-07, + "loss": 1.3624, + "step": 75212 + }, + { + "epoch": 0.940198504962624, + "grad_norm": 3.8175554275512695, + "learning_rate": 2.170868954350003e-07, + "loss": 0.3862, + "step": 75214 + }, + { + "epoch": 0.9402235055876397, + "grad_norm": 0.00018542692123446614, + "learning_rate": 2.169060815017554e-07, + "loss": 0.0693, + "step": 75216 + }, + { + "epoch": 0.9402485062126553, + "grad_norm": 4.854370594024658, + "learning_rate": 2.1672534207516914e-07, + "loss": 0.9407, + "step": 75218 + }, + { + "epoch": 0.940273506837671, + "grad_norm": 3.0540151596069336, + "learning_rate": 2.1654467715662042e-07, + "loss": 1.098, + "step": 75220 + }, + { + "epoch": 0.9402985074626866, + "grad_norm": 0.00020987491006962955, + "learning_rate": 2.1636408674748367e-07, + "loss": 0.8363, + "step": 75222 + }, + { + "epoch": 0.9403235080877022, + "grad_norm": 0.0001727446069708094, + "learning_rate": 2.1618357084913332e-07, + "loss": 0.8214, + "step": 75224 + }, + { + "epoch": 0.9403485087127178, + "grad_norm": 3.2020328044891357, + "learning_rate": 2.160031294629472e-07, + "loss": 0.8679, + "step": 75226 + }, + { + "epoch": 0.9403735093377334, + "grad_norm": 2.142092704772949, + "learning_rate": 2.1582276259029644e-07, + "loss": 0.9073, + "step": 75228 + }, + { + "epoch": 0.9403985099627491, + "grad_norm": 1.0322402715682983, + "learning_rate": 2.1564247023255658e-07, + "loss": 0.2507, + "step": 75230 + }, + { + "epoch": 0.9404235105877647, + "grad_norm": 3.974339008331299, + "learning_rate": 2.1546225239109985e-07, + "loss": 0.7463, + "step": 75232 + }, + { + "epoch": 0.9404485112127803, + "grad_norm": 0.0005135111277922988, + "learning_rate": 2.1528210906729852e-07, + "loss": 0.44, + "step": 75234 + }, + { + "epoch": 0.9404735118377959, + "grad_norm": 1.3072192668914795, + "learning_rate": 2.151020402625248e-07, + "loss": 0.3746, + "step": 75236 + }, + { + "epoch": 0.9404985124628116, + "grad_norm": 4.2340898513793945, + "learning_rate": 2.1492204597815092e-07, + "loss": 1.1354, + "step": 75238 + }, + { + "epoch": 0.9405235130878272, + "grad_norm": 6.366970539093018, + "learning_rate": 2.1474212621554692e-07, + "loss": 1.3534, + "step": 75240 + }, + { + "epoch": 0.9405485137128429, + "grad_norm": 2.493645429611206, + "learning_rate": 2.1456228097608278e-07, + "loss": 1.4386, + "step": 75242 + }, + { + "epoch": 0.9405735143378584, + "grad_norm": 1.0918643474578857, + "learning_rate": 2.1438251026112966e-07, + "loss": 0.0435, + "step": 75244 + }, + { + "epoch": 0.940598514962874, + "grad_norm": 0.8732810020446777, + "learning_rate": 2.1420281407205422e-07, + "loss": 0.0375, + "step": 75246 + }, + { + "epoch": 0.9406235155878897, + "grad_norm": 3.3354878425598145, + "learning_rate": 2.140231924102254e-07, + "loss": 1.6175, + "step": 75248 + }, + { + "epoch": 0.9406485162129054, + "grad_norm": 4.749920845031738, + "learning_rate": 2.138436452770143e-07, + "loss": 1.8258, + "step": 75250 + }, + { + "epoch": 0.940673516837921, + "grad_norm": 0.02885291911661625, + "learning_rate": 2.1366417267378426e-07, + "loss": 0.5956, + "step": 75252 + }, + { + "epoch": 0.9406985174629365, + "grad_norm": 3.2267274856567383, + "learning_rate": 2.134847746019053e-07, + "loss": 0.2182, + "step": 75254 + }, + { + "epoch": 0.9407235180879522, + "grad_norm": 2.829392910003662, + "learning_rate": 2.133054510627419e-07, + "loss": 1.1975, + "step": 75256 + }, + { + "epoch": 0.9407485187129678, + "grad_norm": 2.690270185470581, + "learning_rate": 2.1312620205765967e-07, + "loss": 0.6185, + "step": 75258 + }, + { + "epoch": 0.9407735193379835, + "grad_norm": 1.1771808862686157, + "learning_rate": 2.129470275880252e-07, + "loss": 0.7077, + "step": 75260 + }, + { + "epoch": 0.9407985199629991, + "grad_norm": 6.207597255706787, + "learning_rate": 2.1276792765520194e-07, + "loss": 1.3086, + "step": 75262 + }, + { + "epoch": 0.9408235205880147, + "grad_norm": 0.8179572820663452, + "learning_rate": 2.1258890226055428e-07, + "loss": 0.9059, + "step": 75264 + }, + { + "epoch": 0.9408485212130303, + "grad_norm": 0.4830131232738495, + "learning_rate": 2.124099514054445e-07, + "loss": 0.0118, + "step": 75266 + }, + { + "epoch": 0.940873521838046, + "grad_norm": 3.542574882507324, + "learning_rate": 2.122310750912382e-07, + "loss": 0.9774, + "step": 75268 + }, + { + "epoch": 0.9408985224630616, + "grad_norm": 1.8899251222610474, + "learning_rate": 2.1205227331929424e-07, + "loss": 0.1023, + "step": 75270 + }, + { + "epoch": 0.9409235230880773, + "grad_norm": 0.9765603542327881, + "learning_rate": 2.118735460909771e-07, + "loss": 0.1542, + "step": 75272 + }, + { + "epoch": 0.9409485237130928, + "grad_norm": 3.186734199523926, + "learning_rate": 2.116948934076457e-07, + "loss": 1.5212, + "step": 75274 + }, + { + "epoch": 0.9409735243381084, + "grad_norm": 0.2638726830482483, + "learning_rate": 2.115163152706623e-07, + "loss": 1.4592, + "step": 75276 + }, + { + "epoch": 0.9409985249631241, + "grad_norm": 3.5657472610473633, + "learning_rate": 2.113378116813869e-07, + "loss": 0.7957, + "step": 75278 + }, + { + "epoch": 0.9410235255881397, + "grad_norm": 1.5548779964447021, + "learning_rate": 2.111593826411784e-07, + "loss": 0.4488, + "step": 75280 + }, + { + "epoch": 0.9410485262131554, + "grad_norm": 0.001550266402773559, + "learning_rate": 2.1098102815139687e-07, + "loss": 0.297, + "step": 75282 + }, + { + "epoch": 0.9410735268381709, + "grad_norm": 4.127170562744141, + "learning_rate": 2.1080274821339674e-07, + "loss": 1.1256, + "step": 75284 + }, + { + "epoch": 0.9410985274631866, + "grad_norm": 6.071408271789551, + "learning_rate": 2.106245428285403e-07, + "loss": 1.7594, + "step": 75286 + }, + { + "epoch": 0.9411235280882022, + "grad_norm": 4.151013374328613, + "learning_rate": 2.1044641199818305e-07, + "loss": 1.2958, + "step": 75288 + }, + { + "epoch": 0.9411485287132179, + "grad_norm": 6.004596710205078, + "learning_rate": 2.1026835572368066e-07, + "loss": 1.4122, + "step": 75290 + }, + { + "epoch": 0.9411735293382335, + "grad_norm": 3.2472126483917236, + "learning_rate": 2.1009037400639087e-07, + "loss": 1.265, + "step": 75292 + }, + { + "epoch": 0.941198529963249, + "grad_norm": 0.0004631214833352715, + "learning_rate": 2.099124668476682e-07, + "loss": 0.1152, + "step": 75294 + }, + { + "epoch": 0.9412235305882647, + "grad_norm": 0.9254395961761475, + "learning_rate": 2.097346342488682e-07, + "loss": 0.5924, + "step": 75296 + }, + { + "epoch": 0.9412485312132803, + "grad_norm": 4.93066930770874, + "learning_rate": 2.0955687621134425e-07, + "loss": 1.7738, + "step": 75298 + }, + { + "epoch": 0.941273531838296, + "grad_norm": 4.05320405960083, + "learning_rate": 2.093791927364497e-07, + "loss": 0.8136, + "step": 75300 + }, + { + "epoch": 0.9412985324633116, + "grad_norm": 2.875225782394409, + "learning_rate": 2.0920158382554011e-07, + "loss": 0.5362, + "step": 75302 + }, + { + "epoch": 0.9413235330883272, + "grad_norm": 4.81714391708374, + "learning_rate": 2.090240494799667e-07, + "loss": 1.1526, + "step": 75304 + }, + { + "epoch": 0.9413485337133428, + "grad_norm": 4.163534164428711, + "learning_rate": 2.0884658970108052e-07, + "loss": 1.6461, + "step": 75306 + }, + { + "epoch": 0.9413735343383585, + "grad_norm": 2.0995945930480957, + "learning_rate": 2.086692044902361e-07, + "loss": 0.1588, + "step": 75308 + }, + { + "epoch": 0.9413985349633741, + "grad_norm": 0.8116206526756287, + "learning_rate": 2.084918938487801e-07, + "loss": 0.8444, + "step": 75310 + }, + { + "epoch": 0.9414235355883898, + "grad_norm": 0.00037309955223463476, + "learning_rate": 2.0831465777806593e-07, + "loss": 0.6465, + "step": 75312 + }, + { + "epoch": 0.9414485362134053, + "grad_norm": 3.6136600971221924, + "learning_rate": 2.0813749627944247e-07, + "loss": 0.6177, + "step": 75314 + }, + { + "epoch": 0.9414735368384209, + "grad_norm": 2.531341314315796, + "learning_rate": 2.0796040935425977e-07, + "loss": 0.4964, + "step": 75316 + }, + { + "epoch": 0.9414985374634366, + "grad_norm": 0.00031839037546887994, + "learning_rate": 2.0778339700386563e-07, + "loss": 0.0013, + "step": 75318 + }, + { + "epoch": 0.9415235380884522, + "grad_norm": 0.0024804675485938787, + "learning_rate": 2.07606459229609e-07, + "loss": 0.3765, + "step": 75320 + }, + { + "epoch": 0.9415485387134679, + "grad_norm": 5.671746730804443, + "learning_rate": 2.0742959603283652e-07, + "loss": 1.9231, + "step": 75322 + }, + { + "epoch": 0.9415735393384834, + "grad_norm": 3.7087223529815674, + "learning_rate": 2.0725280741489383e-07, + "loss": 0.7139, + "step": 75324 + }, + { + "epoch": 0.9415985399634991, + "grad_norm": 3.002939224243164, + "learning_rate": 2.0707609337713098e-07, + "loss": 0.5592, + "step": 75326 + }, + { + "epoch": 0.9416235405885147, + "grad_norm": 1.5380027294158936, + "learning_rate": 2.0689945392089018e-07, + "loss": 0.2136, + "step": 75328 + }, + { + "epoch": 0.9416485412135304, + "grad_norm": 3.1652586460113525, + "learning_rate": 2.0672288904752036e-07, + "loss": 0.872, + "step": 75330 + }, + { + "epoch": 0.941673541838546, + "grad_norm": 4.105292797088623, + "learning_rate": 2.0654639875836158e-07, + "loss": 1.1427, + "step": 75332 + }, + { + "epoch": 0.9416985424635615, + "grad_norm": 3.1607234477996826, + "learning_rate": 2.0636998305476275e-07, + "loss": 1.3531, + "step": 75334 + }, + { + "epoch": 0.9417235430885772, + "grad_norm": 3.374434471130371, + "learning_rate": 2.0619364193806392e-07, + "loss": 0.5164, + "step": 75336 + }, + { + "epoch": 0.9417485437135928, + "grad_norm": 4.797994136810303, + "learning_rate": 2.0601737540960954e-07, + "loss": 0.3727, + "step": 75338 + }, + { + "epoch": 0.9417735443386085, + "grad_norm": 6.028538703918457, + "learning_rate": 2.058411834707419e-07, + "loss": 1.2799, + "step": 75340 + }, + { + "epoch": 0.9417985449636241, + "grad_norm": 4.48268985748291, + "learning_rate": 2.0566506612280324e-07, + "loss": 1.3157, + "step": 75342 + }, + { + "epoch": 0.9418235455886397, + "grad_norm": 0.0021769481245428324, + "learning_rate": 2.0548902336713362e-07, + "loss": 0.8711, + "step": 75344 + }, + { + "epoch": 0.9418485462136553, + "grad_norm": 5.113984107971191, + "learning_rate": 2.0531305520507527e-07, + "loss": 0.4359, + "step": 75346 + }, + { + "epoch": 0.941873546838671, + "grad_norm": 4.509839057922363, + "learning_rate": 2.0513716163796716e-07, + "loss": 1.5727, + "step": 75348 + }, + { + "epoch": 0.9418985474636866, + "grad_norm": 0.5268940329551697, + "learning_rate": 2.0496134266714928e-07, + "loss": 0.871, + "step": 75350 + }, + { + "epoch": 0.9419235480887023, + "grad_norm": 3.1710000038146973, + "learning_rate": 2.047855982939606e-07, + "loss": 1.2194, + "step": 75352 + }, + { + "epoch": 0.9419485487137178, + "grad_norm": 3.738546848297119, + "learning_rate": 2.0460992851974003e-07, + "loss": 0.6411, + "step": 75354 + }, + { + "epoch": 0.9419735493387335, + "grad_norm": 4.0011444091796875, + "learning_rate": 2.044343333458243e-07, + "loss": 0.6042, + "step": 75356 + }, + { + "epoch": 0.9419985499637491, + "grad_norm": 3.2035646438598633, + "learning_rate": 2.0425881277355232e-07, + "loss": 1.4411, + "step": 75358 + }, + { + "epoch": 0.9420235505887647, + "grad_norm": 2.767183303833008, + "learning_rate": 2.0408336680425966e-07, + "loss": 0.4771, + "step": 75360 + }, + { + "epoch": 0.9420485512137804, + "grad_norm": 3.5172433853149414, + "learning_rate": 2.0390799543928308e-07, + "loss": 0.5809, + "step": 75362 + }, + { + "epoch": 0.9420735518387959, + "grad_norm": 7.531170845031738, + "learning_rate": 2.0373269867995817e-07, + "loss": 0.8981, + "step": 75364 + }, + { + "epoch": 0.9420985524638116, + "grad_norm": 5.80732536315918, + "learning_rate": 2.0355747652761826e-07, + "loss": 1.2664, + "step": 75366 + }, + { + "epoch": 0.9421235530888272, + "grad_norm": 3.2226192951202393, + "learning_rate": 2.033823289836001e-07, + "loss": 0.8683, + "step": 75368 + }, + { + "epoch": 0.9421485537138429, + "grad_norm": 4.021514415740967, + "learning_rate": 2.0320725604923708e-07, + "loss": 0.7184, + "step": 75370 + }, + { + "epoch": 0.9421735543388585, + "grad_norm": 2.734571933746338, + "learning_rate": 2.0303225772586144e-07, + "loss": 1.0752, + "step": 75372 + }, + { + "epoch": 0.9421985549638741, + "grad_norm": 2.6003828048706055, + "learning_rate": 2.0285733401480767e-07, + "loss": 0.3571, + "step": 75374 + }, + { + "epoch": 0.9422235555888897, + "grad_norm": 5.20252799987793, + "learning_rate": 2.0268248491740693e-07, + "loss": 0.5489, + "step": 75376 + }, + { + "epoch": 0.9422485562139054, + "grad_norm": 8.588139533996582, + "learning_rate": 2.025077104349904e-07, + "loss": 1.5082, + "step": 75378 + }, + { + "epoch": 0.942273556838921, + "grad_norm": 5.999099254608154, + "learning_rate": 2.0233301056888922e-07, + "loss": 1.7603, + "step": 75380 + }, + { + "epoch": 0.9422985574639366, + "grad_norm": 1.8109427690505981, + "learning_rate": 2.0215838532043563e-07, + "loss": 0.9443, + "step": 75382 + }, + { + "epoch": 0.9423235580889522, + "grad_norm": 1.2811863422393799, + "learning_rate": 2.019838346909564e-07, + "loss": 0.0438, + "step": 75384 + }, + { + "epoch": 0.9423485587139678, + "grad_norm": 4.571571350097656, + "learning_rate": 2.0180935868178487e-07, + "loss": 1.1573, + "step": 75386 + }, + { + "epoch": 0.9423735593389835, + "grad_norm": 2.5029008388519287, + "learning_rate": 2.0163495729424664e-07, + "loss": 1.0428, + "step": 75388 + }, + { + "epoch": 0.9423985599639991, + "grad_norm": 0.00030924356542527676, + "learning_rate": 2.0146063052967068e-07, + "loss": 0.0469, + "step": 75390 + }, + { + "epoch": 0.9424235605890148, + "grad_norm": 0.19493916630744934, + "learning_rate": 2.012863783893859e-07, + "loss": 0.0019, + "step": 75392 + }, + { + "epoch": 0.9424485612140303, + "grad_norm": 5.741832256317139, + "learning_rate": 2.0111220087471683e-07, + "loss": 1.2588, + "step": 75394 + }, + { + "epoch": 0.942473561839046, + "grad_norm": 2.3378305435180664, + "learning_rate": 2.009380979869924e-07, + "loss": 1.0401, + "step": 75396 + }, + { + "epoch": 0.9424985624640616, + "grad_norm": 4.931442737579346, + "learning_rate": 2.0076406972753814e-07, + "loss": 1.1334, + "step": 75398 + }, + { + "epoch": 0.9425235630890773, + "grad_norm": 0.0006817976827733219, + "learning_rate": 2.0059011609767864e-07, + "loss": 0.5295, + "step": 75400 + }, + { + "epoch": 0.9425485637140929, + "grad_norm": 4.7290730476379395, + "learning_rate": 2.0041623709873948e-07, + "loss": 2.0801, + "step": 75402 + }, + { + "epoch": 0.9425735643391084, + "grad_norm": 7.3297014236450195, + "learning_rate": 2.002424327320429e-07, + "loss": 1.1499, + "step": 75404 + }, + { + "epoch": 0.9425985649641241, + "grad_norm": 3.197100877761841, + "learning_rate": 2.0006870299891566e-07, + "loss": 0.6912, + "step": 75406 + }, + { + "epoch": 0.9426235655891397, + "grad_norm": 3.1203858852386475, + "learning_rate": 1.998950479006778e-07, + "loss": 0.9213, + "step": 75408 + }, + { + "epoch": 0.9426485662141554, + "grad_norm": 0.0006451135850511491, + "learning_rate": 1.9972146743865496e-07, + "loss": 0.1647, + "step": 75410 + }, + { + "epoch": 0.942673566839171, + "grad_norm": 1.0647464990615845, + "learning_rate": 1.9954796161416824e-07, + "loss": 0.4667, + "step": 75412 + }, + { + "epoch": 0.9426985674641866, + "grad_norm": 3.0297458171844482, + "learning_rate": 1.993745304285366e-07, + "loss": 1.3794, + "step": 75414 + }, + { + "epoch": 0.9427235680892022, + "grad_norm": 3.370445489883423, + "learning_rate": 1.9920117388308347e-07, + "loss": 1.4559, + "step": 75416 + }, + { + "epoch": 0.9427485687142179, + "grad_norm": 5.044151306152344, + "learning_rate": 1.9902789197912665e-07, + "loss": 0.3199, + "step": 75418 + }, + { + "epoch": 0.9427735693392335, + "grad_norm": 2.5866196155548096, + "learning_rate": 1.9885468471798952e-07, + "loss": 1.0749, + "step": 75420 + }, + { + "epoch": 0.9427985699642492, + "grad_norm": 3.0036916732788086, + "learning_rate": 1.986815521009877e-07, + "loss": 1.0971, + "step": 75422 + }, + { + "epoch": 0.9428235705892647, + "grad_norm": 2.544802188873291, + "learning_rate": 1.985084941294424e-07, + "loss": 0.5731, + "step": 75424 + }, + { + "epoch": 0.9428485712142803, + "grad_norm": 0.4858142137527466, + "learning_rate": 1.983355108046703e-07, + "loss": 0.5073, + "step": 75426 + }, + { + "epoch": 0.942873571839296, + "grad_norm": 3.3036105632781982, + "learning_rate": 1.9816260212798698e-07, + "loss": 0.653, + "step": 75428 + }, + { + "epoch": 0.9428985724643116, + "grad_norm": 3.541616201400757, + "learning_rate": 1.979897681007137e-07, + "loss": 0.7067, + "step": 75430 + }, + { + "epoch": 0.9429235730893273, + "grad_norm": 2.801896095275879, + "learning_rate": 1.9781700872416153e-07, + "loss": 0.8864, + "step": 75432 + }, + { + "epoch": 0.9429485737143428, + "grad_norm": 4.326540946960449, + "learning_rate": 1.976443239996506e-07, + "loss": 0.9324, + "step": 75434 + }, + { + "epoch": 0.9429735743393585, + "grad_norm": 4.280068397521973, + "learning_rate": 1.974717139284943e-07, + "loss": 0.2249, + "step": 75436 + }, + { + "epoch": 0.9429985749643741, + "grad_norm": 2.0608348846435547, + "learning_rate": 1.9729917851200707e-07, + "loss": 0.8034, + "step": 75438 + }, + { + "epoch": 0.9430235755893898, + "grad_norm": 9.124180793762207, + "learning_rate": 1.9712671775150348e-07, + "loss": 1.5836, + "step": 75440 + }, + { + "epoch": 0.9430485762144054, + "grad_norm": 3.44694447517395, + "learning_rate": 1.969543316482958e-07, + "loss": 0.7676, + "step": 75442 + }, + { + "epoch": 0.943073576839421, + "grad_norm": 2.7730181217193604, + "learning_rate": 1.967820202036974e-07, + "loss": 1.1317, + "step": 75444 + }, + { + "epoch": 0.9430985774644366, + "grad_norm": 1.042612075805664, + "learning_rate": 1.966097834190217e-07, + "loss": 0.5143, + "step": 75446 + }, + { + "epoch": 0.9431235780894522, + "grad_norm": 6.357030391693115, + "learning_rate": 1.9643762129557874e-07, + "loss": 1.1896, + "step": 75448 + }, + { + "epoch": 0.9431485787144679, + "grad_norm": 2.9035751819610596, + "learning_rate": 1.9626553383468084e-07, + "loss": 0.4905, + "step": 75450 + }, + { + "epoch": 0.9431735793394835, + "grad_norm": 0.004340843763202429, + "learning_rate": 1.9609352103763912e-07, + "loss": 0.0881, + "step": 75452 + }, + { + "epoch": 0.9431985799644991, + "grad_norm": 2.360069513320923, + "learning_rate": 1.9592158290576147e-07, + "loss": 0.7417, + "step": 75454 + }, + { + "epoch": 0.9432235805895147, + "grad_norm": 0.0008248414378613234, + "learning_rate": 1.9574971944035904e-07, + "loss": 0.0552, + "step": 75456 + }, + { + "epoch": 0.9432485812145304, + "grad_norm": 2.7835962772369385, + "learning_rate": 1.9557793064274077e-07, + "loss": 0.3442, + "step": 75458 + }, + { + "epoch": 0.943273581839546, + "grad_norm": 8.86698055267334, + "learning_rate": 1.9540621651421344e-07, + "loss": 0.9326, + "step": 75460 + }, + { + "epoch": 0.9432985824645617, + "grad_norm": 2.756131649017334, + "learning_rate": 1.9523457705608707e-07, + "loss": 0.5923, + "step": 75462 + }, + { + "epoch": 0.9433235830895772, + "grad_norm": 3.335923433303833, + "learning_rate": 1.9506301226966728e-07, + "loss": 1.3393, + "step": 75464 + }, + { + "epoch": 0.9433485837145928, + "grad_norm": 3.9180376529693604, + "learning_rate": 1.9489152215626083e-07, + "loss": 1.568, + "step": 75466 + }, + { + "epoch": 0.9433735843396085, + "grad_norm": 3.9473633766174316, + "learning_rate": 1.9472010671717446e-07, + "loss": 1.9544, + "step": 75468 + }, + { + "epoch": 0.9433985849646241, + "grad_norm": 1.575857162475586, + "learning_rate": 1.9454876595371264e-07, + "loss": 0.3818, + "step": 75470 + }, + { + "epoch": 0.9434235855896398, + "grad_norm": 5.19008207321167, + "learning_rate": 1.94377499867181e-07, + "loss": 1.3436, + "step": 75472 + }, + { + "epoch": 0.9434485862146553, + "grad_norm": 2.362459659576416, + "learning_rate": 1.94206308458883e-07, + "loss": 1.0777, + "step": 75474 + }, + { + "epoch": 0.943473586839671, + "grad_norm": 3.3621482849121094, + "learning_rate": 1.940351917301242e-07, + "loss": 1.0497, + "step": 75476 + }, + { + "epoch": 0.9434985874646866, + "grad_norm": 0.0031974513549357653, + "learning_rate": 1.938641496822069e-07, + "loss": 0.6268, + "step": 75478 + }, + { + "epoch": 0.9435235880897023, + "grad_norm": 2.935974359512329, + "learning_rate": 1.936931823164323e-07, + "loss": 0.635, + "step": 75480 + }, + { + "epoch": 0.9435485887147179, + "grad_norm": 4.373347759246826, + "learning_rate": 1.935222896341038e-07, + "loss": 0.2911, + "step": 75482 + }, + { + "epoch": 0.9435735893397335, + "grad_norm": 1.05618417263031, + "learning_rate": 1.9335147163652256e-07, + "loss": 0.8111, + "step": 75484 + }, + { + "epoch": 0.9435985899647491, + "grad_norm": 2.533372640609741, + "learning_rate": 1.931807283249909e-07, + "loss": 1.0798, + "step": 75486 + }, + { + "epoch": 0.9436235905897647, + "grad_norm": 3.774327516555786, + "learning_rate": 1.9301005970080667e-07, + "loss": 0.928, + "step": 75488 + }, + { + "epoch": 0.9436485912147804, + "grad_norm": 0.00036332817398943007, + "learning_rate": 1.9283946576527213e-07, + "loss": 1.6243, + "step": 75490 + }, + { + "epoch": 0.943673591839796, + "grad_norm": 6.049635410308838, + "learning_rate": 1.9266894651968627e-07, + "loss": 1.5588, + "step": 75492 + }, + { + "epoch": 0.9436985924648116, + "grad_norm": 2.698153018951416, + "learning_rate": 1.924985019653447e-07, + "loss": 0.68, + "step": 75494 + }, + { + "epoch": 0.9437235930898272, + "grad_norm": 5.24209451675415, + "learning_rate": 1.9232813210354862e-07, + "loss": 1.0549, + "step": 75496 + }, + { + "epoch": 0.9437485937148429, + "grad_norm": 3.7642343044281006, + "learning_rate": 1.9215783693559364e-07, + "loss": 0.3176, + "step": 75498 + }, + { + "epoch": 0.9437735943398585, + "grad_norm": 0.6728330850601196, + "learning_rate": 1.9198761646277875e-07, + "loss": 0.0315, + "step": 75500 + }, + { + "epoch": 0.9437985949648742, + "grad_norm": 4.225147247314453, + "learning_rate": 1.9181747068639846e-07, + "loss": 1.2483, + "step": 75502 + }, + { + "epoch": 0.9438235955898897, + "grad_norm": 0.013050908222794533, + "learning_rate": 1.9164739960775057e-07, + "loss": 0.455, + "step": 75504 + }, + { + "epoch": 0.9438485962149054, + "grad_norm": 3.555602550506592, + "learning_rate": 1.9147740322812857e-07, + "loss": 1.4956, + "step": 75506 + }, + { + "epoch": 0.943873596839921, + "grad_norm": 3.4887802600860596, + "learning_rate": 1.9130748154882696e-07, + "loss": 1.2076, + "step": 75508 + }, + { + "epoch": 0.9438985974649367, + "grad_norm": 4.192656993865967, + "learning_rate": 1.9113763457114132e-07, + "loss": 1.427, + "step": 75510 + }, + { + "epoch": 0.9439235980899523, + "grad_norm": 4.2303032875061035, + "learning_rate": 1.9096786229636287e-07, + "loss": 0.7845, + "step": 75512 + }, + { + "epoch": 0.9439485987149678, + "grad_norm": 2.0973756313323975, + "learning_rate": 1.9079816472578616e-07, + "loss": 0.6298, + "step": 75514 + }, + { + "epoch": 0.9439735993399835, + "grad_norm": 3.866886615753174, + "learning_rate": 1.9062854186070568e-07, + "loss": 0.8208, + "step": 75516 + }, + { + "epoch": 0.9439985999649991, + "grad_norm": 3.085289239883423, + "learning_rate": 1.9045899370240927e-07, + "loss": 0.5259, + "step": 75518 + }, + { + "epoch": 0.9440236005900148, + "grad_norm": 3.2578117847442627, + "learning_rate": 1.9028952025219037e-07, + "loss": 0.7196, + "step": 75520 + }, + { + "epoch": 0.9440486012150304, + "grad_norm": 2.7279393672943115, + "learning_rate": 1.9012012151133907e-07, + "loss": 0.8965, + "step": 75522 + }, + { + "epoch": 0.944073601840046, + "grad_norm": 0.005843591410666704, + "learning_rate": 1.8995079748114654e-07, + "loss": 1.3468, + "step": 75524 + }, + { + "epoch": 0.9440986024650616, + "grad_norm": 2.6246047019958496, + "learning_rate": 1.897815481629006e-07, + "loss": 0.2189, + "step": 75526 + }, + { + "epoch": 0.9441236030900773, + "grad_norm": 1.6029952764511108, + "learning_rate": 1.896123735578903e-07, + "loss": 0.0657, + "step": 75528 + }, + { + "epoch": 0.9441486037150929, + "grad_norm": 1.0316270589828491, + "learning_rate": 1.8944327366740788e-07, + "loss": 0.1886, + "step": 75530 + }, + { + "epoch": 0.9441736043401086, + "grad_norm": 3.879241466522217, + "learning_rate": 1.8927424849273567e-07, + "loss": 2.0342, + "step": 75532 + }, + { + "epoch": 0.9441986049651241, + "grad_norm": 4.383461952209473, + "learning_rate": 1.8910529803516375e-07, + "loss": 1.4598, + "step": 75534 + }, + { + "epoch": 0.9442236055901397, + "grad_norm": 1.8192750215530396, + "learning_rate": 1.8893642229597776e-07, + "loss": 0.9847, + "step": 75536 + }, + { + "epoch": 0.9442486062151554, + "grad_norm": 0.8610544204711914, + "learning_rate": 1.8876762127646665e-07, + "loss": 0.044, + "step": 75538 + }, + { + "epoch": 0.944273606840171, + "grad_norm": 0.0012769006425514817, + "learning_rate": 1.8859889497791163e-07, + "loss": 0.1186, + "step": 75540 + }, + { + "epoch": 0.9442986074651867, + "grad_norm": 5.709754943847656, + "learning_rate": 1.8843024340160165e-07, + "loss": 1.7198, + "step": 75542 + }, + { + "epoch": 0.9443236080902022, + "grad_norm": 0.561021089553833, + "learning_rate": 1.8826166654881906e-07, + "loss": 0.1676, + "step": 75544 + }, + { + "epoch": 0.9443486087152179, + "grad_norm": 9.248371124267578, + "learning_rate": 1.8809316442084724e-07, + "loss": 1.4165, + "step": 75546 + }, + { + "epoch": 0.9443736093402335, + "grad_norm": 0.8361867070198059, + "learning_rate": 1.8792473701897185e-07, + "loss": 0.5592, + "step": 75548 + }, + { + "epoch": 0.9443986099652492, + "grad_norm": 2.5518784523010254, + "learning_rate": 1.8775638434447297e-07, + "loss": 0.7571, + "step": 75550 + }, + { + "epoch": 0.9444236105902648, + "grad_norm": 2.9616212844848633, + "learning_rate": 1.875881063986329e-07, + "loss": 1.3746, + "step": 75552 + }, + { + "epoch": 0.9444486112152803, + "grad_norm": 2.9839110374450684, + "learning_rate": 1.8741990318273617e-07, + "loss": 0.5953, + "step": 75554 + }, + { + "epoch": 0.944473611840296, + "grad_norm": 6.092386722564697, + "learning_rate": 1.8725177469806178e-07, + "loss": 1.6689, + "step": 75556 + }, + { + "epoch": 0.9444986124653116, + "grad_norm": 3.8212153911590576, + "learning_rate": 1.870837209458898e-07, + "loss": 1.4002, + "step": 75558 + }, + { + "epoch": 0.9445236130903273, + "grad_norm": 3.5553040504455566, + "learning_rate": 1.8691574192750027e-07, + "loss": 0.5222, + "step": 75560 + }, + { + "epoch": 0.9445486137153429, + "grad_norm": 6.187264442443848, + "learning_rate": 1.8674783764417225e-07, + "loss": 0.3007, + "step": 75562 + }, + { + "epoch": 0.9445736143403585, + "grad_norm": 3.108199119567871, + "learning_rate": 1.865800080971858e-07, + "loss": 1.182, + "step": 75564 + }, + { + "epoch": 0.9445986149653741, + "grad_norm": 4.505890369415283, + "learning_rate": 1.8641225328781765e-07, + "loss": 0.9776, + "step": 75566 + }, + { + "epoch": 0.9446236155903898, + "grad_norm": 4.529221534729004, + "learning_rate": 1.862445732173468e-07, + "loss": 1.1345, + "step": 75568 + }, + { + "epoch": 0.9446486162154054, + "grad_norm": 3.7636163234710693, + "learning_rate": 1.8607696788704887e-07, + "loss": 1.7408, + "step": 75570 + }, + { + "epoch": 0.9446736168404211, + "grad_norm": 2.765573501586914, + "learning_rate": 1.8590943729820176e-07, + "loss": 0.7906, + "step": 75572 + }, + { + "epoch": 0.9446986174654366, + "grad_norm": 0.024207666516304016, + "learning_rate": 1.8574198145207888e-07, + "loss": 0.1935, + "step": 75574 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 2.6850697994232178, + "learning_rate": 1.8557460034995812e-07, + "loss": 0.7483, + "step": 75576 + }, + { + "epoch": 0.9447486187154679, + "grad_norm": 6.96180534362793, + "learning_rate": 1.8540729399311396e-07, + "loss": 0.6749, + "step": 75578 + }, + { + "epoch": 0.9447736193404835, + "grad_norm": 2.2725791931152344, + "learning_rate": 1.852400623828188e-07, + "loss": 0.3464, + "step": 75580 + }, + { + "epoch": 0.9447986199654992, + "grad_norm": 3.822253465652466, + "learning_rate": 1.8507290552034929e-07, + "loss": 1.2344, + "step": 75582 + }, + { + "epoch": 0.9448236205905147, + "grad_norm": 2.0199410915374756, + "learning_rate": 1.849058234069745e-07, + "loss": 0.9347, + "step": 75584 + }, + { + "epoch": 0.9448486212155304, + "grad_norm": 2.0236196517944336, + "learning_rate": 1.8473881604397003e-07, + "loss": 1.3919, + "step": 75586 + }, + { + "epoch": 0.944873621840546, + "grad_norm": 1.5240064859390259, + "learning_rate": 1.84571883432606e-07, + "loss": 1.1014, + "step": 75588 + }, + { + "epoch": 0.9448986224655617, + "grad_norm": 0.08382584899663925, + "learning_rate": 1.8440502557415475e-07, + "loss": 0.848, + "step": 75590 + }, + { + "epoch": 0.9449236230905773, + "grad_norm": 6.9820098876953125, + "learning_rate": 1.8423824246988741e-07, + "loss": 0.4616, + "step": 75592 + }, + { + "epoch": 0.9449486237155929, + "grad_norm": 0.005481067579239607, + "learning_rate": 1.8407153412107413e-07, + "loss": 0.148, + "step": 75594 + }, + { + "epoch": 0.9449736243406085, + "grad_norm": 3.4045326709747314, + "learning_rate": 1.839049005289828e-07, + "loss": 0.7341, + "step": 75596 + }, + { + "epoch": 0.9449986249656241, + "grad_norm": 4.918987274169922, + "learning_rate": 1.8373834169488348e-07, + "loss": 1.762, + "step": 75598 + }, + { + "epoch": 0.9450236255906398, + "grad_norm": 3.987114667892456, + "learning_rate": 1.8357185762004403e-07, + "loss": 0.9664, + "step": 75600 + }, + { + "epoch": 0.9450486262156554, + "grad_norm": 5.791675090789795, + "learning_rate": 1.834054483057346e-07, + "loss": 1.4306, + "step": 75602 + }, + { + "epoch": 0.945073626840671, + "grad_norm": 0.5433483719825745, + "learning_rate": 1.832391137532208e-07, + "loss": 0.3583, + "step": 75604 + }, + { + "epoch": 0.9450986274656866, + "grad_norm": 0.0003916104033123702, + "learning_rate": 1.830728539637694e-07, + "loss": 0.2228, + "step": 75606 + }, + { + "epoch": 0.9451236280907023, + "grad_norm": 4.242395877838135, + "learning_rate": 1.8290666893864827e-07, + "loss": 0.804, + "step": 75608 + }, + { + "epoch": 0.9451486287157179, + "grad_norm": 1.9644075632095337, + "learning_rate": 1.827405586791209e-07, + "loss": 0.9164, + "step": 75610 + }, + { + "epoch": 0.9451736293407336, + "grad_norm": 2.095008134841919, + "learning_rate": 1.8257452318645175e-07, + "loss": 1.0646, + "step": 75612 + }, + { + "epoch": 0.9451986299657491, + "grad_norm": 1.8025805950164795, + "learning_rate": 1.8240856246190763e-07, + "loss": 0.2545, + "step": 75614 + }, + { + "epoch": 0.9452236305907648, + "grad_norm": 4.062134265899658, + "learning_rate": 1.8224267650675198e-07, + "loss": 0.9298, + "step": 75616 + }, + { + "epoch": 0.9452486312157804, + "grad_norm": 4.759594917297363, + "learning_rate": 1.82076865322246e-07, + "loss": 1.9789, + "step": 75618 + }, + { + "epoch": 0.945273631840796, + "grad_norm": 2.6204593181610107, + "learning_rate": 1.8191112890965757e-07, + "loss": 0.7698, + "step": 75620 + }, + { + "epoch": 0.9452986324658117, + "grad_norm": 0.5201612114906311, + "learning_rate": 1.8174546727024344e-07, + "loss": 0.2975, + "step": 75622 + }, + { + "epoch": 0.9453236330908272, + "grad_norm": 5.741332054138184, + "learning_rate": 1.8157988040526707e-07, + "loss": 1.0162, + "step": 75624 + }, + { + "epoch": 0.9453486337158429, + "grad_norm": 2.685354232788086, + "learning_rate": 1.814143683159908e-07, + "loss": 1.3049, + "step": 75626 + }, + { + "epoch": 0.9453736343408585, + "grad_norm": 6.214623928070068, + "learning_rate": 1.8124893100367358e-07, + "loss": 0.9575, + "step": 75628 + }, + { + "epoch": 0.9453986349658742, + "grad_norm": 3.260946035385132, + "learning_rate": 1.8108356846957665e-07, + "loss": 1.1298, + "step": 75630 + }, + { + "epoch": 0.9454236355908898, + "grad_norm": 1.8627392053604126, + "learning_rate": 1.8091828071495787e-07, + "loss": 0.8761, + "step": 75632 + }, + { + "epoch": 0.9454486362159054, + "grad_norm": 4.453280925750732, + "learning_rate": 1.8075306774107848e-07, + "loss": 0.7108, + "step": 75634 + }, + { + "epoch": 0.945473636840921, + "grad_norm": 3.4977614879608154, + "learning_rate": 1.80587929549193e-07, + "loss": 1.1147, + "step": 75636 + }, + { + "epoch": 0.9454986374659367, + "grad_norm": 7.872206687927246, + "learning_rate": 1.804228661405627e-07, + "loss": 0.4419, + "step": 75638 + }, + { + "epoch": 0.9455236380909523, + "grad_norm": 6.821880340576172, + "learning_rate": 1.802578775164432e-07, + "loss": 0.4386, + "step": 75640 + }, + { + "epoch": 0.945548638715968, + "grad_norm": 2.8201849460601807, + "learning_rate": 1.8009296367809016e-07, + "loss": 1.2686, + "step": 75642 + }, + { + "epoch": 0.9455736393409835, + "grad_norm": 5.619753837585449, + "learning_rate": 1.7992812462676147e-07, + "loss": 1.3974, + "step": 75644 + }, + { + "epoch": 0.9455986399659991, + "grad_norm": 0.07733900845050812, + "learning_rate": 1.7976336036371166e-07, + "loss": 0.1202, + "step": 75646 + }, + { + "epoch": 0.9456236405910148, + "grad_norm": 2.9950168132781982, + "learning_rate": 1.7959867089019424e-07, + "loss": 0.8454, + "step": 75648 + }, + { + "epoch": 0.9456486412160304, + "grad_norm": 3.322242259979248, + "learning_rate": 1.794340562074648e-07, + "loss": 0.2803, + "step": 75650 + }, + { + "epoch": 0.9456736418410461, + "grad_norm": 3.317042827606201, + "learning_rate": 1.792695163167768e-07, + "loss": 1.7634, + "step": 75652 + }, + { + "epoch": 0.9456986424660616, + "grad_norm": 0.00045053043868392706, + "learning_rate": 1.7910505121938483e-07, + "loss": 0.019, + "step": 75654 + }, + { + "epoch": 0.9457236430910773, + "grad_norm": 4.760013103485107, + "learning_rate": 1.7894066091653782e-07, + "loss": 1.7782, + "step": 75656 + }, + { + "epoch": 0.9457486437160929, + "grad_norm": 2.2039108276367188, + "learning_rate": 1.787763454094915e-07, + "loss": 0.9512, + "step": 75658 + }, + { + "epoch": 0.9457736443411086, + "grad_norm": 0.00028094041044823825, + "learning_rate": 1.7861210469949597e-07, + "loss": 0.0461, + "step": 75660 + }, + { + "epoch": 0.9457986449661242, + "grad_norm": 0.00024890212807804346, + "learning_rate": 1.784479387878002e-07, + "loss": 0.0001, + "step": 75662 + }, + { + "epoch": 0.9458236455911397, + "grad_norm": 0.00037614928442053497, + "learning_rate": 1.7828384767565765e-07, + "loss": 0.2369, + "step": 75664 + }, + { + "epoch": 0.9458486462161554, + "grad_norm": 3.2567081451416016, + "learning_rate": 1.781198313643151e-07, + "loss": 0.7928, + "step": 75666 + }, + { + "epoch": 0.945873646841171, + "grad_norm": 3.5055813789367676, + "learning_rate": 1.7795588985502378e-07, + "loss": 1.1868, + "step": 75668 + }, + { + "epoch": 0.9458986474661867, + "grad_norm": 4.842618942260742, + "learning_rate": 1.7779202314903043e-07, + "loss": 0.7348, + "step": 75670 + }, + { + "epoch": 0.9459236480912023, + "grad_norm": 5.73556661605835, + "learning_rate": 1.7762823124758633e-07, + "loss": 2.7022, + "step": 75672 + }, + { + "epoch": 0.9459486487162179, + "grad_norm": 6.636937141418457, + "learning_rate": 1.7746451415193489e-07, + "loss": 1.0612, + "step": 75674 + }, + { + "epoch": 0.9459736493412335, + "grad_norm": 3.5501677989959717, + "learning_rate": 1.77300871863324e-07, + "loss": 1.4164, + "step": 75676 + }, + { + "epoch": 0.9459986499662492, + "grad_norm": 0.0003083823248744011, + "learning_rate": 1.771373043830016e-07, + "loss": 0.2206, + "step": 75678 + }, + { + "epoch": 0.9460236505912648, + "grad_norm": 1.962920069694519, + "learning_rate": 1.7697381171221218e-07, + "loss": 0.7637, + "step": 75680 + }, + { + "epoch": 0.9460486512162805, + "grad_norm": 0.0036125825718045235, + "learning_rate": 1.7681039385220144e-07, + "loss": 0.5566, + "step": 75682 + }, + { + "epoch": 0.946073651841296, + "grad_norm": 1.8174277544021606, + "learning_rate": 1.7664705080421284e-07, + "loss": 1.285, + "step": 75684 + }, + { + "epoch": 0.9460986524663116, + "grad_norm": 3.3561840057373047, + "learning_rate": 1.7648378256949316e-07, + "loss": 0.7238, + "step": 75686 + }, + { + "epoch": 0.9461236530913273, + "grad_norm": 3.3641631603240967, + "learning_rate": 1.7632058914928252e-07, + "loss": 0.8188, + "step": 75688 + }, + { + "epoch": 0.9461486537163429, + "grad_norm": 5.079200744628906, + "learning_rate": 1.7615747054482434e-07, + "loss": 1.3054, + "step": 75690 + }, + { + "epoch": 0.9461736543413586, + "grad_norm": 0.12570473551750183, + "learning_rate": 1.759944267573621e-07, + "loss": 0.0652, + "step": 75692 + }, + { + "epoch": 0.9461986549663741, + "grad_norm": 0.0002480406838003546, + "learning_rate": 1.7583145778813705e-07, + "loss": 0.6465, + "step": 75694 + }, + { + "epoch": 0.9462236555913898, + "grad_norm": 3.770034074783325, + "learning_rate": 1.7566856363839035e-07, + "loss": 1.1826, + "step": 75696 + }, + { + "epoch": 0.9462486562164054, + "grad_norm": 0.0006431216024793684, + "learning_rate": 1.7550574430936328e-07, + "loss": 0.0335, + "step": 75698 + }, + { + "epoch": 0.9462736568414211, + "grad_norm": 4.569036483764648, + "learning_rate": 1.7534299980229484e-07, + "loss": 1.5731, + "step": 75700 + }, + { + "epoch": 0.9462986574664367, + "grad_norm": 3.452805519104004, + "learning_rate": 1.7518033011842406e-07, + "loss": 1.1455, + "step": 75702 + }, + { + "epoch": 0.9463236580914522, + "grad_norm": 3.8238770961761475, + "learning_rate": 1.7501773525899102e-07, + "loss": 0.6965, + "step": 75704 + }, + { + "epoch": 0.9463486587164679, + "grad_norm": 0.0004765418416354805, + "learning_rate": 1.7485521522523363e-07, + "loss": 0.6683, + "step": 75706 + }, + { + "epoch": 0.9463736593414835, + "grad_norm": 4.408145904541016, + "learning_rate": 1.746927700183887e-07, + "loss": 0.8695, + "step": 75708 + }, + { + "epoch": 0.9463986599664992, + "grad_norm": 5.371250152587891, + "learning_rate": 1.745303996396963e-07, + "loss": 0.3319, + "step": 75710 + }, + { + "epoch": 0.9464236605915148, + "grad_norm": 3.1118297576904297, + "learning_rate": 1.7436810409038995e-07, + "loss": 0.5205, + "step": 75712 + }, + { + "epoch": 0.9464486612165304, + "grad_norm": 0.000569713010918349, + "learning_rate": 1.7420588337170529e-07, + "loss": 0.4693, + "step": 75714 + }, + { + "epoch": 0.946473661841546, + "grad_norm": 0.0004399950848892331, + "learning_rate": 1.7404373748488024e-07, + "loss": 0.0296, + "step": 75716 + }, + { + "epoch": 0.9464986624665617, + "grad_norm": 0.00022493998403660953, + "learning_rate": 1.7388166643114824e-07, + "loss": 0.238, + "step": 75718 + }, + { + "epoch": 0.9465236630915773, + "grad_norm": 1.1896945238113403, + "learning_rate": 1.7371967021174497e-07, + "loss": 0.367, + "step": 75720 + }, + { + "epoch": 0.946548663716593, + "grad_norm": 4.227677345275879, + "learning_rate": 1.7355774882790166e-07, + "loss": 0.7073, + "step": 75722 + }, + { + "epoch": 0.9465736643416085, + "grad_norm": 1.833182454109192, + "learning_rate": 1.73395902280854e-07, + "loss": 0.7015, + "step": 75724 + }, + { + "epoch": 0.9465986649666241, + "grad_norm": 2.908947229385376, + "learning_rate": 1.7323413057183324e-07, + "loss": 1.4635, + "step": 75726 + }, + { + "epoch": 0.9466236655916398, + "grad_norm": 4.653380393981934, + "learning_rate": 1.730724337020706e-07, + "loss": 0.4226, + "step": 75728 + }, + { + "epoch": 0.9466486662166554, + "grad_norm": 4.433050155639648, + "learning_rate": 1.7291081167280066e-07, + "loss": 1.6126, + "step": 75730 + }, + { + "epoch": 0.9466736668416711, + "grad_norm": 0.015829134732484818, + "learning_rate": 1.727492644852502e-07, + "loss": 0.0003, + "step": 75732 + }, + { + "epoch": 0.9466986674666866, + "grad_norm": 4.36277437210083, + "learning_rate": 1.725877921406538e-07, + "loss": 1.2729, + "step": 75734 + }, + { + "epoch": 0.9467236680917023, + "grad_norm": 4.040924072265625, + "learning_rate": 1.7242639464023714e-07, + "loss": 1.0569, + "step": 75736 + }, + { + "epoch": 0.9467486687167179, + "grad_norm": 1.9453293085098267, + "learning_rate": 1.7226507198523257e-07, + "loss": 0.1375, + "step": 75738 + }, + { + "epoch": 0.9467736693417336, + "grad_norm": 9.398141860961914, + "learning_rate": 1.7210382417686687e-07, + "loss": 1.0795, + "step": 75740 + }, + { + "epoch": 0.9467986699667492, + "grad_norm": 5.110440731048584, + "learning_rate": 1.719426512163691e-07, + "loss": 1.6219, + "step": 75742 + }, + { + "epoch": 0.9468236705917648, + "grad_norm": 2.8743700981140137, + "learning_rate": 1.7178155310496603e-07, + "loss": 0.9409, + "step": 75744 + }, + { + "epoch": 0.9468486712167804, + "grad_norm": 2.522998094558716, + "learning_rate": 1.7162052984388443e-07, + "loss": 0.8642, + "step": 75746 + }, + { + "epoch": 0.946873671841796, + "grad_norm": 0.00022468817769549787, + "learning_rate": 1.7145958143435115e-07, + "loss": 0.0035, + "step": 75748 + }, + { + "epoch": 0.9468986724668117, + "grad_norm": 3.5779547691345215, + "learning_rate": 1.7129870787759183e-07, + "loss": 0.3152, + "step": 75750 + }, + { + "epoch": 0.9469236730918273, + "grad_norm": 2.0710325241088867, + "learning_rate": 1.711379091748322e-07, + "loss": 1.2606, + "step": 75752 + }, + { + "epoch": 0.9469486737168429, + "grad_norm": 3.4086742401123047, + "learning_rate": 1.7097718532729678e-07, + "loss": 1.1286, + "step": 75754 + }, + { + "epoch": 0.9469736743418585, + "grad_norm": 4.4914164543151855, + "learning_rate": 1.70816536336208e-07, + "loss": 1.7577, + "step": 75756 + }, + { + "epoch": 0.9469986749668742, + "grad_norm": 3.712455987930298, + "learning_rate": 1.7065596220279145e-07, + "loss": 1.0571, + "step": 75758 + }, + { + "epoch": 0.9470236755918898, + "grad_norm": 0.3050572872161865, + "learning_rate": 1.7049546292826735e-07, + "loss": 0.0932, + "step": 75760 + }, + { + "epoch": 0.9470486762169055, + "grad_norm": 2.1332740783691406, + "learning_rate": 1.7033503851386135e-07, + "loss": 1.274, + "step": 75762 + }, + { + "epoch": 0.947073676841921, + "grad_norm": 3.884472370147705, + "learning_rate": 1.7017468896079358e-07, + "loss": 1.0961, + "step": 75764 + }, + { + "epoch": 0.9470986774669367, + "grad_norm": 9.477798461914062, + "learning_rate": 1.700144142702853e-07, + "loss": 2.0781, + "step": 75766 + }, + { + "epoch": 0.9471236780919523, + "grad_norm": 1.6832298040390015, + "learning_rate": 1.6985421444355665e-07, + "loss": 0.2001, + "step": 75768 + }, + { + "epoch": 0.947148678716968, + "grad_norm": 2.7713589668273926, + "learning_rate": 1.6969408948182775e-07, + "loss": 0.6157, + "step": 75770 + }, + { + "epoch": 0.9471736793419836, + "grad_norm": 1.7516943216323853, + "learning_rate": 1.6953403938631984e-07, + "loss": 0.9924, + "step": 75772 + }, + { + "epoch": 0.9471986799669991, + "grad_norm": 0.0002869941818062216, + "learning_rate": 1.6937406415824975e-07, + "loss": 1.3233, + "step": 75774 + }, + { + "epoch": 0.9472236805920148, + "grad_norm": 0.0030535347759723663, + "learning_rate": 1.6921416379883649e-07, + "loss": 0.1787, + "step": 75776 + }, + { + "epoch": 0.9472486812170304, + "grad_norm": 4.02292013168335, + "learning_rate": 1.6905433830929907e-07, + "loss": 1.0432, + "step": 75778 + }, + { + "epoch": 0.9472736818420461, + "grad_norm": 2.622182607650757, + "learning_rate": 1.688945876908521e-07, + "loss": 0.7006, + "step": 75780 + }, + { + "epoch": 0.9472986824670617, + "grad_norm": 5.882516384124756, + "learning_rate": 1.687349119447146e-07, + "loss": 1.2682, + "step": 75782 + }, + { + "epoch": 0.9473236830920773, + "grad_norm": 5.768380165100098, + "learning_rate": 1.6857531107210111e-07, + "loss": 1.0938, + "step": 75784 + }, + { + "epoch": 0.9473486837170929, + "grad_norm": 3.137599229812622, + "learning_rate": 1.6841578507422852e-07, + "loss": 0.5484, + "step": 75786 + }, + { + "epoch": 0.9473736843421086, + "grad_norm": 0.0016654179198667407, + "learning_rate": 1.6825633395231023e-07, + "loss": 1.3009, + "step": 75788 + }, + { + "epoch": 0.9473986849671242, + "grad_norm": 0.02886090986430645, + "learning_rate": 1.6809695770756196e-07, + "loss": 0.0953, + "step": 75790 + }, + { + "epoch": 0.9474236855921399, + "grad_norm": 1.9610716104507446, + "learning_rate": 1.6793765634119718e-07, + "loss": 0.2839, + "step": 75792 + }, + { + "epoch": 0.9474486862171554, + "grad_norm": 0.0004080890794284642, + "learning_rate": 1.6777842985442715e-07, + "loss": 0.5739, + "step": 75794 + }, + { + "epoch": 0.947473686842171, + "grad_norm": 1.6182966232299805, + "learning_rate": 1.676192782484687e-07, + "loss": 0.4605, + "step": 75796 + }, + { + "epoch": 0.9474986874671867, + "grad_norm": 4.174654483795166, + "learning_rate": 1.6746020152452969e-07, + "loss": 1.3154, + "step": 75798 + }, + { + "epoch": 0.9475236880922023, + "grad_norm": 0.2790625989437103, + "learning_rate": 1.6730119968382363e-07, + "loss": 0.1482, + "step": 75800 + }, + { + "epoch": 0.947548688717218, + "grad_norm": 4.36328125, + "learning_rate": 1.6714227272756068e-07, + "loss": 1.3815, + "step": 75802 + }, + { + "epoch": 0.9475736893422335, + "grad_norm": 0.3207199275493622, + "learning_rate": 1.6698342065695316e-07, + "loss": 0.035, + "step": 75804 + }, + { + "epoch": 0.9475986899672492, + "grad_norm": 3.0465657711029053, + "learning_rate": 1.6682464347320791e-07, + "loss": 0.6757, + "step": 75806 + }, + { + "epoch": 0.9476236905922648, + "grad_norm": 1.4434027671813965, + "learning_rate": 1.6666594117753622e-07, + "loss": 0.9256, + "step": 75808 + }, + { + "epoch": 0.9476486912172805, + "grad_norm": 3.105337142944336, + "learning_rate": 1.6650731377114705e-07, + "loss": 0.6394, + "step": 75810 + }, + { + "epoch": 0.9476736918422961, + "grad_norm": 2.667400360107422, + "learning_rate": 1.6634876125524614e-07, + "loss": 1.6792, + "step": 75812 + }, + { + "epoch": 0.9476986924673116, + "grad_norm": 3.3025550842285156, + "learning_rate": 1.661902836310425e-07, + "loss": 1.2668, + "step": 75814 + }, + { + "epoch": 0.9477236930923273, + "grad_norm": 8.310596466064453, + "learning_rate": 1.660318808997441e-07, + "loss": 1.4247, + "step": 75816 + }, + { + "epoch": 0.9477486937173429, + "grad_norm": 3.0333566665649414, + "learning_rate": 1.658735530625555e-07, + "loss": 0.6646, + "step": 75818 + }, + { + "epoch": 0.9477736943423586, + "grad_norm": 4.4193572998046875, + "learning_rate": 1.6571530012068348e-07, + "loss": 0.7493, + "step": 75820 + }, + { + "epoch": 0.9477986949673742, + "grad_norm": 4.022397994995117, + "learning_rate": 1.655571220753327e-07, + "loss": 1.688, + "step": 75822 + }, + { + "epoch": 0.9478236955923898, + "grad_norm": 0.0020592589862644672, + "learning_rate": 1.653990189277088e-07, + "loss": 0.0001, + "step": 75824 + }, + { + "epoch": 0.9478486962174054, + "grad_norm": 4.544352054595947, + "learning_rate": 1.6524099067901534e-07, + "loss": 2.3761, + "step": 75826 + }, + { + "epoch": 0.9478736968424211, + "grad_norm": 3.9088916778564453, + "learning_rate": 1.6508303733045572e-07, + "loss": 0.5775, + "step": 75828 + }, + { + "epoch": 0.9478986974674367, + "grad_norm": 3.1224007606506348, + "learning_rate": 1.6492515888323235e-07, + "loss": 1.1407, + "step": 75830 + }, + { + "epoch": 0.9479236980924524, + "grad_norm": 4.143520832061768, + "learning_rate": 1.647673553385476e-07, + "loss": 0.8721, + "step": 75832 + }, + { + "epoch": 0.9479486987174679, + "grad_norm": 2.4653191566467285, + "learning_rate": 1.6460962669760494e-07, + "loss": 1.565, + "step": 75834 + }, + { + "epoch": 0.9479736993424835, + "grad_norm": 3.113801956176758, + "learning_rate": 1.6445197296160453e-07, + "loss": 1.1933, + "step": 75836 + }, + { + "epoch": 0.9479986999674992, + "grad_norm": 3.3549251556396484, + "learning_rate": 1.6429439413174653e-07, + "loss": 1.4168, + "step": 75838 + }, + { + "epoch": 0.9480237005925148, + "grad_norm": 0.0003123768838122487, + "learning_rate": 1.6413689020923107e-07, + "loss": 0.6663, + "step": 75840 + }, + { + "epoch": 0.9480487012175305, + "grad_norm": 3.2799887657165527, + "learning_rate": 1.6397946119525943e-07, + "loss": 1.2088, + "step": 75842 + }, + { + "epoch": 0.948073701842546, + "grad_norm": 0.0007044282392598689, + "learning_rate": 1.6382210709102842e-07, + "loss": 0.8233, + "step": 75844 + }, + { + "epoch": 0.9480987024675617, + "grad_norm": 2.647321939468384, + "learning_rate": 1.6366482789773707e-07, + "loss": 1.054, + "step": 75846 + }, + { + "epoch": 0.9481237030925773, + "grad_norm": 1.5565423965454102, + "learning_rate": 1.6350762361658445e-07, + "loss": 0.1457, + "step": 75848 + }, + { + "epoch": 0.948148703717593, + "grad_norm": 2.2200520038604736, + "learning_rate": 1.6335049424876514e-07, + "loss": 0.7332, + "step": 75850 + }, + { + "epoch": 0.9481737043426086, + "grad_norm": 1.5799592733383179, + "learning_rate": 1.631934397954793e-07, + "loss": 0.9696, + "step": 75852 + }, + { + "epoch": 0.9481987049676242, + "grad_norm": 3.996122360229492, + "learning_rate": 1.6303646025791931e-07, + "loss": 0.7073, + "step": 75854 + }, + { + "epoch": 0.9482237055926398, + "grad_norm": 6.426408290863037, + "learning_rate": 1.628795556372842e-07, + "loss": 1.9703, + "step": 75856 + }, + { + "epoch": 0.9482487062176554, + "grad_norm": 2.185354471206665, + "learning_rate": 1.6272272593476635e-07, + "loss": 0.4239, + "step": 75858 + }, + { + "epoch": 0.9482737068426711, + "grad_norm": 0.0012277488131076097, + "learning_rate": 1.6256597115156149e-07, + "loss": 0.2477, + "step": 75860 + }, + { + "epoch": 0.9482987074676867, + "grad_norm": 2.3402204513549805, + "learning_rate": 1.6240929128886307e-07, + "loss": 0.177, + "step": 75862 + }, + { + "epoch": 0.9483237080927023, + "grad_norm": 0.39374351501464844, + "learning_rate": 1.622526863478635e-07, + "loss": 0.6954, + "step": 75864 + }, + { + "epoch": 0.9483487087177179, + "grad_norm": 2.742009162902832, + "learning_rate": 1.620961563297574e-07, + "loss": 1.4352, + "step": 75866 + }, + { + "epoch": 0.9483737093427336, + "grad_norm": 4.692372798919678, + "learning_rate": 1.61939701235736e-07, + "loss": 1.8217, + "step": 75868 + }, + { + "epoch": 0.9483987099677492, + "grad_norm": 0.14477932453155518, + "learning_rate": 1.6178332106698947e-07, + "loss": 0.0048, + "step": 75870 + }, + { + "epoch": 0.9484237105927649, + "grad_norm": 3.9640097618103027, + "learning_rate": 1.6162701582471018e-07, + "loss": 1.4841, + "step": 75872 + }, + { + "epoch": 0.9484487112177804, + "grad_norm": 4.703187465667725, + "learning_rate": 1.6147078551008831e-07, + "loss": 0.9194, + "step": 75874 + }, + { + "epoch": 0.948473711842796, + "grad_norm": 0.0003194469027221203, + "learning_rate": 1.6131463012431404e-07, + "loss": 0.6511, + "step": 75876 + }, + { + "epoch": 0.9484987124678117, + "grad_norm": 6.0660624504089355, + "learning_rate": 1.6115854966857525e-07, + "loss": 1.5362, + "step": 75878 + }, + { + "epoch": 0.9485237130928273, + "grad_norm": 1.9314384460449219, + "learning_rate": 1.6100254414406325e-07, + "loss": 1.7138, + "step": 75880 + }, + { + "epoch": 0.948548713717843, + "grad_norm": 4.132101058959961, + "learning_rate": 1.6084661355196374e-07, + "loss": 0.5697, + "step": 75882 + }, + { + "epoch": 0.9485737143428585, + "grad_norm": 2.7240233421325684, + "learning_rate": 1.6069075789346466e-07, + "loss": 1.8605, + "step": 75884 + }, + { + "epoch": 0.9485987149678742, + "grad_norm": 5.513194561004639, + "learning_rate": 1.6053497716975398e-07, + "loss": 0.8119, + "step": 75886 + }, + { + "epoch": 0.9486237155928898, + "grad_norm": 2.9433035850524902, + "learning_rate": 1.6037927138201627e-07, + "loss": 0.4155, + "step": 75888 + }, + { + "epoch": 0.9486487162179055, + "grad_norm": 2.5805752277374268, + "learning_rate": 1.6022364053143947e-07, + "loss": 0.9099, + "step": 75890 + }, + { + "epoch": 0.9486737168429211, + "grad_norm": 4.12620210647583, + "learning_rate": 1.600680846192082e-07, + "loss": 0.6935, + "step": 75892 + }, + { + "epoch": 0.9486987174679367, + "grad_norm": 4.146432399749756, + "learning_rate": 1.5991260364650708e-07, + "loss": 0.8234, + "step": 75894 + }, + { + "epoch": 0.9487237180929523, + "grad_norm": 0.5611013174057007, + "learning_rate": 1.597571976145196e-07, + "loss": 0.3968, + "step": 75896 + }, + { + "epoch": 0.948748718717968, + "grad_norm": 0.9821516275405884, + "learning_rate": 1.5960186652442922e-07, + "loss": 0.9981, + "step": 75898 + }, + { + "epoch": 0.9487737193429836, + "grad_norm": 2.7981441020965576, + "learning_rate": 1.594466103774206e-07, + "loss": 0.5692, + "step": 75900 + }, + { + "epoch": 0.9487987199679992, + "grad_norm": 2.6407980918884277, + "learning_rate": 1.5929142917467388e-07, + "loss": 1.1741, + "step": 75902 + }, + { + "epoch": 0.9488237205930148, + "grad_norm": 3.3071188926696777, + "learning_rate": 1.5913632291737258e-07, + "loss": 1.1943, + "step": 75904 + }, + { + "epoch": 0.9488487212180304, + "grad_norm": 0.0003647291741799563, + "learning_rate": 1.5898129160669683e-07, + "loss": 0.3206, + "step": 75906 + }, + { + "epoch": 0.9488737218430461, + "grad_norm": 3.339423179626465, + "learning_rate": 1.5882633524382907e-07, + "loss": 0.969, + "step": 75908 + }, + { + "epoch": 0.9488987224680617, + "grad_norm": 5.41055965423584, + "learning_rate": 1.586714538299483e-07, + "loss": 1.6591, + "step": 75910 + }, + { + "epoch": 0.9489237230930774, + "grad_norm": 3.0957257747650146, + "learning_rate": 1.5851664736623251e-07, + "loss": 1.3878, + "step": 75912 + }, + { + "epoch": 0.9489487237180929, + "grad_norm": 3.0759353637695312, + "learning_rate": 1.5836191585386406e-07, + "loss": 1.7576, + "step": 75914 + }, + { + "epoch": 0.9489737243431086, + "grad_norm": 0.003549467772245407, + "learning_rate": 1.582072592940176e-07, + "loss": 0.742, + "step": 75916 + }, + { + "epoch": 0.9489987249681242, + "grad_norm": 0.0003275586059316993, + "learning_rate": 1.5805267768787325e-07, + "loss": 1.1033, + "step": 75918 + }, + { + "epoch": 0.9490237255931399, + "grad_norm": 2.4216179847717285, + "learning_rate": 1.578981710366101e-07, + "loss": 1.1269, + "step": 75920 + }, + { + "epoch": 0.9490487262181555, + "grad_norm": 3.4944651126861572, + "learning_rate": 1.577437393414005e-07, + "loss": 1.1981, + "step": 75922 + }, + { + "epoch": 0.949073726843171, + "grad_norm": 4.52548360824585, + "learning_rate": 1.575893826034236e-07, + "loss": 0.9189, + "step": 75924 + }, + { + "epoch": 0.9490987274681867, + "grad_norm": 2.2353692054748535, + "learning_rate": 1.5743510082385394e-07, + "loss": 0.4412, + "step": 75926 + }, + { + "epoch": 0.9491237280932023, + "grad_norm": 5.927974224090576, + "learning_rate": 1.5728089400386726e-07, + "loss": 1.2842, + "step": 75928 + }, + { + "epoch": 0.949148728718218, + "grad_norm": 3.1893742084503174, + "learning_rate": 1.5712676214463597e-07, + "loss": 1.1451, + "step": 75930 + }, + { + "epoch": 0.9491737293432336, + "grad_norm": 0.004074183292686939, + "learning_rate": 1.5697270524733577e-07, + "loss": 0.0881, + "step": 75932 + }, + { + "epoch": 0.9491987299682492, + "grad_norm": 2.2512600421905518, + "learning_rate": 1.568187233131413e-07, + "loss": 0.6287, + "step": 75934 + }, + { + "epoch": 0.9492237305932648, + "grad_norm": 0.00022013419948052615, + "learning_rate": 1.5666481634322162e-07, + "loss": 0.6578, + "step": 75936 + }, + { + "epoch": 0.9492487312182805, + "grad_norm": 11.520195960998535, + "learning_rate": 1.5651098433875244e-07, + "loss": 1.196, + "step": 75938 + }, + { + "epoch": 0.9492737318432961, + "grad_norm": 0.0004277643165551126, + "learning_rate": 1.5635722730090174e-07, + "loss": 0.9066, + "step": 75940 + }, + { + "epoch": 0.9492987324683118, + "grad_norm": 3.9587104320526123, + "learning_rate": 1.5620354523084303e-07, + "loss": 0.6993, + "step": 75942 + }, + { + "epoch": 0.9493237330933273, + "grad_norm": 4.318394660949707, + "learning_rate": 1.5604993812974534e-07, + "loss": 1.4747, + "step": 75944 + }, + { + "epoch": 0.9493487337183429, + "grad_norm": 4.125748157501221, + "learning_rate": 1.558964059987811e-07, + "loss": 1.7036, + "step": 75946 + }, + { + "epoch": 0.9493737343433586, + "grad_norm": 0.3233034610748291, + "learning_rate": 1.5574294883911712e-07, + "loss": 0.2988, + "step": 75948 + }, + { + "epoch": 0.9493987349683742, + "grad_norm": 0.00015936665295157582, + "learning_rate": 1.5558956665192136e-07, + "loss": 0.8464, + "step": 75950 + }, + { + "epoch": 0.9494237355933899, + "grad_norm": 4.734501361846924, + "learning_rate": 1.5543625943836515e-07, + "loss": 0.7838, + "step": 75952 + }, + { + "epoch": 0.9494487362184054, + "grad_norm": 5.190439224243164, + "learning_rate": 1.5528302719961307e-07, + "loss": 1.3543, + "step": 75954 + }, + { + "epoch": 0.9494737368434211, + "grad_norm": 3.257434606552124, + "learning_rate": 1.551298699368331e-07, + "loss": 1.4776, + "step": 75956 + }, + { + "epoch": 0.9494987374684367, + "grad_norm": 1.1658546924591064, + "learning_rate": 1.5497678765119207e-07, + "loss": 0.0516, + "step": 75958 + }, + { + "epoch": 0.9495237380934524, + "grad_norm": 3.4518353939056396, + "learning_rate": 1.5482378034385682e-07, + "loss": 1.5829, + "step": 75960 + }, + { + "epoch": 0.949548738718468, + "grad_norm": 2.9914205074310303, + "learning_rate": 1.5467084801599087e-07, + "loss": 1.1507, + "step": 75962 + }, + { + "epoch": 0.9495737393434835, + "grad_norm": 1.1269330978393555, + "learning_rate": 1.5451799066875882e-07, + "loss": 0.729, + "step": 75964 + }, + { + "epoch": 0.9495987399684992, + "grad_norm": 2.6449267864227295, + "learning_rate": 1.5436520830332536e-07, + "loss": 0.8307, + "step": 75966 + }, + { + "epoch": 0.9496237405935148, + "grad_norm": 1.6897261142730713, + "learning_rate": 1.5421250092085393e-07, + "loss": 0.1595, + "step": 75968 + }, + { + "epoch": 0.9496487412185305, + "grad_norm": 3.0309031009674072, + "learning_rate": 1.540598685225081e-07, + "loss": 1.1415, + "step": 75970 + }, + { + "epoch": 0.9496737418435461, + "grad_norm": 0.35303083062171936, + "learning_rate": 1.5390731110945134e-07, + "loss": 0.0916, + "step": 75972 + }, + { + "epoch": 0.9496987424685617, + "grad_norm": 0.010655614547431469, + "learning_rate": 1.5375482868284276e-07, + "loss": 0.2913, + "step": 75974 + }, + { + "epoch": 0.9497237430935773, + "grad_norm": 2.818939685821533, + "learning_rate": 1.5360242124384582e-07, + "loss": 1.4963, + "step": 75976 + }, + { + "epoch": 0.949748743718593, + "grad_norm": 4.576440811157227, + "learning_rate": 1.5345008879361967e-07, + "loss": 1.8355, + "step": 75978 + }, + { + "epoch": 0.9497737443436086, + "grad_norm": 2.54561185836792, + "learning_rate": 1.5329783133332554e-07, + "loss": 0.2937, + "step": 75980 + }, + { + "epoch": 0.9497987449686243, + "grad_norm": 2.338334321975708, + "learning_rate": 1.5314564886412253e-07, + "loss": 0.321, + "step": 75982 + }, + { + "epoch": 0.9498237455936398, + "grad_norm": 0.9385776519775391, + "learning_rate": 1.529935413871697e-07, + "loss": 0.3587, + "step": 75984 + }, + { + "epoch": 0.9498487462186554, + "grad_norm": 2.887383222579956, + "learning_rate": 1.5284150890362616e-07, + "loss": 0.6881, + "step": 75986 + }, + { + "epoch": 0.9498737468436711, + "grad_norm": 4.3408613204956055, + "learning_rate": 1.5268955141464869e-07, + "loss": 0.5928, + "step": 75988 + }, + { + "epoch": 0.9498987474686867, + "grad_norm": 3.267357587814331, + "learning_rate": 1.5253766892139532e-07, + "loss": 0.7751, + "step": 75990 + }, + { + "epoch": 0.9499237480937024, + "grad_norm": 4.818069934844971, + "learning_rate": 1.5238586142502175e-07, + "loss": 1.6594, + "step": 75992 + }, + { + "epoch": 0.9499487487187179, + "grad_norm": 4.963078022003174, + "learning_rate": 1.52234128926686e-07, + "loss": 0.8162, + "step": 75994 + }, + { + "epoch": 0.9499737493437336, + "grad_norm": 2.508004665374756, + "learning_rate": 1.520824714275415e-07, + "loss": 1.0082, + "step": 75996 + }, + { + "epoch": 0.9499987499687492, + "grad_norm": 2.7194149494171143, + "learning_rate": 1.5193088892874519e-07, + "loss": 1.1309, + "step": 75998 + }, + { + "epoch": 0.9500237505937649, + "grad_norm": 2.437260150909424, + "learning_rate": 1.5177938143145056e-07, + "loss": 0.3592, + "step": 76000 + }, + { + "epoch": 0.9500487512187805, + "grad_norm": 3.5075793266296387, + "learning_rate": 1.516279489368111e-07, + "loss": 1.3382, + "step": 76002 + }, + { + "epoch": 0.950073751843796, + "grad_norm": 2.4949827194213867, + "learning_rate": 1.5147659144598037e-07, + "loss": 0.616, + "step": 76004 + }, + { + "epoch": 0.9500987524688117, + "grad_norm": 2.99652099609375, + "learning_rate": 1.5132530896011078e-07, + "loss": 1.4408, + "step": 76006 + }, + { + "epoch": 0.9501237530938274, + "grad_norm": 1.118647813796997, + "learning_rate": 1.5117410148035582e-07, + "loss": 0.4472, + "step": 76008 + }, + { + "epoch": 0.950148753718843, + "grad_norm": 1.2814183235168457, + "learning_rate": 1.5102296900786572e-07, + "loss": 0.8123, + "step": 76010 + }, + { + "epoch": 0.9501737543438586, + "grad_norm": 3.128121852874756, + "learning_rate": 1.5087191154379287e-07, + "loss": 0.7825, + "step": 76012 + }, + { + "epoch": 0.9501987549688742, + "grad_norm": 2.8471367359161377, + "learning_rate": 1.5072092908928526e-07, + "loss": 0.9658, + "step": 76014 + }, + { + "epoch": 0.9502237555938898, + "grad_norm": 1.4203139543533325, + "learning_rate": 1.5057002164549528e-07, + "loss": 0.4805, + "step": 76016 + }, + { + "epoch": 0.9502487562189055, + "grad_norm": 0.685969352722168, + "learning_rate": 1.504191892135698e-07, + "loss": 0.4558, + "step": 76018 + }, + { + "epoch": 0.9502737568439211, + "grad_norm": 1.123938798904419, + "learning_rate": 1.5026843179466011e-07, + "loss": 1.4805, + "step": 76020 + }, + { + "epoch": 0.9502987574689368, + "grad_norm": 4.225462913513184, + "learning_rate": 1.5011774938991307e-07, + "loss": 1.4202, + "step": 76022 + }, + { + "epoch": 0.9503237580939523, + "grad_norm": 6.043886661529541, + "learning_rate": 1.499671420004767e-07, + "loss": 0.879, + "step": 76024 + }, + { + "epoch": 0.950348758718968, + "grad_norm": 3.468320369720459, + "learning_rate": 1.4981660962749668e-07, + "loss": 0.6071, + "step": 76026 + }, + { + "epoch": 0.9503737593439836, + "grad_norm": 5.161895751953125, + "learning_rate": 1.496661522721221e-07, + "loss": 0.801, + "step": 76028 + }, + { + "epoch": 0.9503987599689993, + "grad_norm": 5.890124797821045, + "learning_rate": 1.4951576993549544e-07, + "loss": 1.5908, + "step": 76030 + }, + { + "epoch": 0.9504237605940149, + "grad_norm": 4.043274879455566, + "learning_rate": 1.493654626187635e-07, + "loss": 1.318, + "step": 76032 + }, + { + "epoch": 0.9504487612190304, + "grad_norm": 4.2094550132751465, + "learning_rate": 1.4921523032307207e-07, + "loss": 2.1889, + "step": 76034 + }, + { + "epoch": 0.9504737618440461, + "grad_norm": 3.209044933319092, + "learning_rate": 1.4906507304956352e-07, + "loss": 1.4024, + "step": 76036 + }, + { + "epoch": 0.9504987624690617, + "grad_norm": 4.585031032562256, + "learning_rate": 1.4891499079938477e-07, + "loss": 1.2042, + "step": 76038 + }, + { + "epoch": 0.9505237630940774, + "grad_norm": 3.4423131942749023, + "learning_rate": 1.4876498357367375e-07, + "loss": 1.2892, + "step": 76040 + }, + { + "epoch": 0.950548763719093, + "grad_norm": 3.5701210498809814, + "learning_rate": 1.4861505137357624e-07, + "loss": 1.6421, + "step": 76042 + }, + { + "epoch": 0.9505737643441086, + "grad_norm": 4.244009971618652, + "learning_rate": 1.4846519420023352e-07, + "loss": 1.5839, + "step": 76044 + }, + { + "epoch": 0.9505987649691242, + "grad_norm": 3.432485580444336, + "learning_rate": 1.4831541205478694e-07, + "loss": 1.6501, + "step": 76046 + }, + { + "epoch": 0.9506237655941399, + "grad_norm": 8.570279121398926, + "learning_rate": 1.4816570493837668e-07, + "loss": 0.3847, + "step": 76048 + }, + { + "epoch": 0.9506487662191555, + "grad_norm": 0.6709827184677124, + "learning_rate": 1.4801607285214293e-07, + "loss": 0.4851, + "step": 76050 + }, + { + "epoch": 0.9506737668441712, + "grad_norm": 6.750051021575928, + "learning_rate": 1.478665157972281e-07, + "loss": 1.0589, + "step": 76052 + }, + { + "epoch": 0.9506987674691867, + "grad_norm": 0.001373576931655407, + "learning_rate": 1.4771703377476575e-07, + "loss": 0.2128, + "step": 76054 + }, + { + "epoch": 0.9507237680942023, + "grad_norm": 7.501468658447266, + "learning_rate": 1.475676267858983e-07, + "loss": 1.4749, + "step": 76056 + }, + { + "epoch": 0.950748768719218, + "grad_norm": 0.20167383551597595, + "learning_rate": 1.4741829483176262e-07, + "loss": 0.7944, + "step": 76058 + }, + { + "epoch": 0.9507737693442336, + "grad_norm": 1.0635825395584106, + "learning_rate": 1.4726903791349446e-07, + "loss": 0.3003, + "step": 76060 + }, + { + "epoch": 0.9507987699692493, + "grad_norm": 0.0002697127347346395, + "learning_rate": 1.47119856032234e-07, + "loss": 0.2294, + "step": 76062 + }, + { + "epoch": 0.9508237705942648, + "grad_norm": 2.8755276203155518, + "learning_rate": 1.469707491891148e-07, + "loss": 0.8696, + "step": 76064 + }, + { + "epoch": 0.9508487712192805, + "grad_norm": 3.7829296588897705, + "learning_rate": 1.4682171738527373e-07, + "loss": 1.2017, + "step": 76066 + }, + { + "epoch": 0.9508737718442961, + "grad_norm": 3.7904040813446045, + "learning_rate": 1.466727606218432e-07, + "loss": 1.5847, + "step": 76068 + }, + { + "epoch": 0.9508987724693118, + "grad_norm": 2.404052734375, + "learning_rate": 1.4652387889996012e-07, + "loss": 1.324, + "step": 76070 + }, + { + "epoch": 0.9509237730943274, + "grad_norm": 3.8841145038604736, + "learning_rate": 1.4637507222075908e-07, + "loss": 1.7331, + "step": 76072 + }, + { + "epoch": 0.9509487737193429, + "grad_norm": 0.0035986576694995165, + "learning_rate": 1.4622634058537033e-07, + "loss": 0.642, + "step": 76074 + }, + { + "epoch": 0.9509737743443586, + "grad_norm": 4.448277473449707, + "learning_rate": 1.4607768399492962e-07, + "loss": 1.1203, + "step": 76076 + }, + { + "epoch": 0.9509987749693742, + "grad_norm": 3.570497512817383, + "learning_rate": 1.4592910245056823e-07, + "loss": 0.5031, + "step": 76078 + }, + { + "epoch": 0.9510237755943899, + "grad_norm": 0.4621167778968811, + "learning_rate": 1.4578059595341532e-07, + "loss": 0.8034, + "step": 76080 + }, + { + "epoch": 0.9510487762194055, + "grad_norm": 3.3220367431640625, + "learning_rate": 1.4563216450460548e-07, + "loss": 0.3838, + "step": 76082 + }, + { + "epoch": 0.9510737768444211, + "grad_norm": 2.345900297164917, + "learning_rate": 1.4548380810526674e-07, + "loss": 1.0596, + "step": 76084 + }, + { + "epoch": 0.9510987774694367, + "grad_norm": 1.7340675592422485, + "learning_rate": 1.453355267565304e-07, + "loss": 0.4071, + "step": 76086 + }, + { + "epoch": 0.9511237780944524, + "grad_norm": 2.3227264881134033, + "learning_rate": 1.4518732045952444e-07, + "loss": 1.6619, + "step": 76088 + }, + { + "epoch": 0.951148778719468, + "grad_norm": 5.478211879730225, + "learning_rate": 1.4503918921538018e-07, + "loss": 1.0885, + "step": 76090 + }, + { + "epoch": 0.9511737793444837, + "grad_norm": 4.840412616729736, + "learning_rate": 1.4489113302522118e-07, + "loss": 1.1253, + "step": 76092 + }, + { + "epoch": 0.9511987799694992, + "grad_norm": 2.383331775665283, + "learning_rate": 1.4474315189017875e-07, + "loss": 0.873, + "step": 76094 + }, + { + "epoch": 0.9512237805945148, + "grad_norm": 5.585999965667725, + "learning_rate": 1.4459524581137862e-07, + "loss": 0.4301, + "step": 76096 + }, + { + "epoch": 0.9512487812195305, + "grad_norm": 0.00028487606323324144, + "learning_rate": 1.4444741478994773e-07, + "loss": 1.151, + "step": 76098 + }, + { + "epoch": 0.9512737818445461, + "grad_norm": 1.1646056175231934, + "learning_rate": 1.442996588270118e-07, + "loss": 0.0202, + "step": 76100 + }, + { + "epoch": 0.9512987824695618, + "grad_norm": 3.535262107849121, + "learning_rate": 1.4415197792369552e-07, + "loss": 0.9462, + "step": 76102 + }, + { + "epoch": 0.9513237830945773, + "grad_norm": 3.366241455078125, + "learning_rate": 1.440043720811235e-07, + "loss": 1.3416, + "step": 76104 + }, + { + "epoch": 0.951348783719593, + "grad_norm": 0.13359154760837555, + "learning_rate": 1.4385684130042154e-07, + "loss": 0.5665, + "step": 76106 + }, + { + "epoch": 0.9513737843446086, + "grad_norm": 4.352390766143799, + "learning_rate": 1.4370938558271208e-07, + "loss": 1.0828, + "step": 76108 + }, + { + "epoch": 0.9513987849696243, + "grad_norm": 2.530893564224243, + "learning_rate": 1.4356200492911755e-07, + "loss": 0.7135, + "step": 76110 + }, + { + "epoch": 0.9514237855946399, + "grad_norm": 2.8975894451141357, + "learning_rate": 1.4341469934076036e-07, + "loss": 0.1459, + "step": 76112 + }, + { + "epoch": 0.9514487862196555, + "grad_norm": 10.569952964782715, + "learning_rate": 1.4326746881876408e-07, + "loss": 0.6361, + "step": 76114 + }, + { + "epoch": 0.9514737868446711, + "grad_norm": 3.054989814758301, + "learning_rate": 1.4312031336424891e-07, + "loss": 1.168, + "step": 76116 + }, + { + "epoch": 0.9514987874696867, + "grad_norm": 6.3489556312561035, + "learning_rate": 1.4297323297833508e-07, + "loss": 1.1989, + "step": 76118 + }, + { + "epoch": 0.9515237880947024, + "grad_norm": 3.3847007751464844, + "learning_rate": 1.4282622766214393e-07, + "loss": 0.4801, + "step": 76120 + }, + { + "epoch": 0.951548788719718, + "grad_norm": 5.308392524719238, + "learning_rate": 1.4267929741679344e-07, + "loss": 1.5606, + "step": 76122 + }, + { + "epoch": 0.9515737893447336, + "grad_norm": 2.7084803581237793, + "learning_rate": 1.425324422434038e-07, + "loss": 0.3571, + "step": 76124 + }, + { + "epoch": 0.9515987899697492, + "grad_norm": 4.032651424407959, + "learning_rate": 1.423856621430919e-07, + "loss": 0.3862, + "step": 76126 + }, + { + "epoch": 0.9516237905947649, + "grad_norm": 1.4905301332473755, + "learning_rate": 1.422389571169791e-07, + "loss": 1.0537, + "step": 76128 + }, + { + "epoch": 0.9516487912197805, + "grad_norm": 2.053173303604126, + "learning_rate": 1.4209232716617894e-07, + "loss": 1.2707, + "step": 76130 + }, + { + "epoch": 0.9516737918447962, + "grad_norm": 4.575772762298584, + "learning_rate": 1.419457722918094e-07, + "loss": 1.3681, + "step": 76132 + }, + { + "epoch": 0.9516987924698117, + "grad_norm": 0.45981839299201965, + "learning_rate": 1.4179929249498735e-07, + "loss": 0.0234, + "step": 76134 + }, + { + "epoch": 0.9517237930948274, + "grad_norm": 3.6286420822143555, + "learning_rate": 1.416528877768275e-07, + "loss": 0.8849, + "step": 76136 + }, + { + "epoch": 0.951748793719843, + "grad_norm": 1.2869396209716797, + "learning_rate": 1.415065581384456e-07, + "loss": 1.0007, + "step": 76138 + }, + { + "epoch": 0.9517737943448586, + "grad_norm": 1.7910956144332886, + "learning_rate": 1.413603035809552e-07, + "loss": 0.4544, + "step": 76140 + }, + { + "epoch": 0.9517987949698743, + "grad_norm": 2.5140180587768555, + "learning_rate": 1.4121412410547096e-07, + "loss": 1.2258, + "step": 76142 + }, + { + "epoch": 0.9518237955948898, + "grad_norm": 2.4787487983703613, + "learning_rate": 1.4106801971310647e-07, + "loss": 0.1806, + "step": 76144 + }, + { + "epoch": 0.9518487962199055, + "grad_norm": 3.207738161087036, + "learning_rate": 1.409219904049719e-07, + "loss": 0.712, + "step": 76146 + }, + { + "epoch": 0.9518737968449211, + "grad_norm": 3.7136528491973877, + "learning_rate": 1.4077603618218306e-07, + "loss": 0.6167, + "step": 76148 + }, + { + "epoch": 0.9518987974699368, + "grad_norm": 0.0003518316661939025, + "learning_rate": 1.4063015704584793e-07, + "loss": 0.0202, + "step": 76150 + }, + { + "epoch": 0.9519237980949524, + "grad_norm": 2.585191249847412, + "learning_rate": 1.404843529970812e-07, + "loss": 1.5574, + "step": 76152 + }, + { + "epoch": 0.951948798719968, + "grad_norm": 3.2939836978912354, + "learning_rate": 1.4033862403698973e-07, + "loss": 0.0723, + "step": 76154 + }, + { + "epoch": 0.9519737993449836, + "grad_norm": 3.3669865131378174, + "learning_rate": 1.4019297016668599e-07, + "loss": 0.7263, + "step": 76156 + }, + { + "epoch": 0.9519987999699993, + "grad_norm": 2.234768867492676, + "learning_rate": 1.4004739138727797e-07, + "loss": 0.7306, + "step": 76158 + }, + { + "epoch": 0.9520238005950149, + "grad_norm": 0.11105494201183319, + "learning_rate": 1.3990188769987478e-07, + "loss": 0.1734, + "step": 76160 + }, + { + "epoch": 0.9520488012200305, + "grad_norm": 0.26310279965400696, + "learning_rate": 1.3975645910558445e-07, + "loss": 0.4202, + "step": 76162 + }, + { + "epoch": 0.9520738018450461, + "grad_norm": 3.8835225105285645, + "learning_rate": 1.3961110560551495e-07, + "loss": 0.961, + "step": 76164 + }, + { + "epoch": 0.9520988024700617, + "grad_norm": 3.3256521224975586, + "learning_rate": 1.3946582720077205e-07, + "loss": 0.9232, + "step": 76166 + }, + { + "epoch": 0.9521238030950774, + "grad_norm": 2.8554351329803467, + "learning_rate": 1.3932062389246381e-07, + "loss": 0.8443, + "step": 76168 + }, + { + "epoch": 0.952148803720093, + "grad_norm": 0.0005569629138335586, + "learning_rate": 1.3917549568169486e-07, + "loss": 0.9126, + "step": 76170 + }, + { + "epoch": 0.9521738043451087, + "grad_norm": 0.29669487476348877, + "learning_rate": 1.3903044256957098e-07, + "loss": 0.3953, + "step": 76172 + }, + { + "epoch": 0.9521988049701242, + "grad_norm": 3.7718780040740967, + "learning_rate": 1.3888546455719686e-07, + "loss": 1.7306, + "step": 76174 + }, + { + "epoch": 0.9522238055951399, + "grad_norm": 0.0002928270259872079, + "learning_rate": 1.3874056164567607e-07, + "loss": 0.4943, + "step": 76176 + }, + { + "epoch": 0.9522488062201555, + "grad_norm": 0.6672832369804382, + "learning_rate": 1.3859573383611325e-07, + "loss": 0.5032, + "step": 76178 + }, + { + "epoch": 0.9522738068451712, + "grad_norm": 0.0005713916034437716, + "learning_rate": 1.3845098112961087e-07, + "loss": 0.4638, + "step": 76180 + }, + { + "epoch": 0.9522988074701868, + "grad_norm": 4.15949010848999, + "learning_rate": 1.3830630352727137e-07, + "loss": 1.6201, + "step": 76182 + }, + { + "epoch": 0.9523238080952023, + "grad_norm": 5.420559406280518, + "learning_rate": 1.38161701030195e-07, + "loss": 1.3661, + "step": 76184 + }, + { + "epoch": 0.952348808720218, + "grad_norm": 1.8337574005126953, + "learning_rate": 1.380171736394864e-07, + "loss": 1.2222, + "step": 76186 + }, + { + "epoch": 0.9523738093452336, + "grad_norm": 3.21185302734375, + "learning_rate": 1.3787272135624252e-07, + "loss": 0.8013, + "step": 76188 + }, + { + "epoch": 0.9523988099702493, + "grad_norm": 5.807855129241943, + "learning_rate": 1.377283441815669e-07, + "loss": 0.7639, + "step": 76190 + }, + { + "epoch": 0.9524238105952649, + "grad_norm": 5.306439399719238, + "learning_rate": 1.3758404211655752e-07, + "loss": 0.735, + "step": 76192 + }, + { + "epoch": 0.9524488112202805, + "grad_norm": 4.599575519561768, + "learning_rate": 1.374398151623124e-07, + "loss": 1.8978, + "step": 76194 + }, + { + "epoch": 0.9524738118452961, + "grad_norm": 2.507063627243042, + "learning_rate": 1.3729566331993182e-07, + "loss": 0.5247, + "step": 76196 + }, + { + "epoch": 0.9524988124703118, + "grad_norm": 0.0009585529915057123, + "learning_rate": 1.3715158659051264e-07, + "loss": 0.6919, + "step": 76198 + }, + { + "epoch": 0.9525238130953274, + "grad_norm": 3.514146327972412, + "learning_rate": 1.3700758497515177e-07, + "loss": 0.2103, + "step": 76200 + }, + { + "epoch": 0.9525488137203431, + "grad_norm": 0.028983458876609802, + "learning_rate": 1.368636584749472e-07, + "loss": 0.0004, + "step": 76202 + }, + { + "epoch": 0.9525738143453586, + "grad_norm": 4.7756500244140625, + "learning_rate": 1.3671980709099364e-07, + "loss": 1.776, + "step": 76204 + }, + { + "epoch": 0.9525988149703742, + "grad_norm": 0.000351116614183411, + "learning_rate": 1.3657603082438685e-07, + "loss": 0.3108, + "step": 76206 + }, + { + "epoch": 0.9526238155953899, + "grad_norm": 2.9086952209472656, + "learning_rate": 1.3643232967622265e-07, + "loss": 1.2708, + "step": 76208 + }, + { + "epoch": 0.9526488162204055, + "grad_norm": 3.8008670806884766, + "learning_rate": 1.3628870364759572e-07, + "loss": 2.0301, + "step": 76210 + }, + { + "epoch": 0.9526738168454212, + "grad_norm": 12.972917556762695, + "learning_rate": 1.361451527395985e-07, + "loss": 1.1833, + "step": 76212 + }, + { + "epoch": 0.9526988174704367, + "grad_norm": 3.1630680561065674, + "learning_rate": 1.3600167695332456e-07, + "loss": 1.2756, + "step": 76214 + }, + { + "epoch": 0.9527238180954524, + "grad_norm": 6.72694730758667, + "learning_rate": 1.3585827628986748e-07, + "loss": 1.9806, + "step": 76216 + }, + { + "epoch": 0.952748818720468, + "grad_norm": 5.664734363555908, + "learning_rate": 1.357149507503186e-07, + "loss": 1.5327, + "step": 76218 + }, + { + "epoch": 0.9527738193454837, + "grad_norm": 3.6372196674346924, + "learning_rate": 1.3557170033577039e-07, + "loss": 0.7958, + "step": 76220 + }, + { + "epoch": 0.9527988199704993, + "grad_norm": 0.0004946363624185324, + "learning_rate": 1.3542852504731196e-07, + "loss": 0.4789, + "step": 76222 + }, + { + "epoch": 0.9528238205955148, + "grad_norm": 3.5216987133026123, + "learning_rate": 1.352854248860358e-07, + "loss": 0.8274, + "step": 76224 + }, + { + "epoch": 0.9528488212205305, + "grad_norm": 4.258301258087158, + "learning_rate": 1.351423998530299e-07, + "loss": 0.8624, + "step": 76226 + }, + { + "epoch": 0.9528738218455461, + "grad_norm": 0.04485993832349777, + "learning_rate": 1.3499944994938562e-07, + "loss": 0.3066, + "step": 76228 + }, + { + "epoch": 0.9528988224705618, + "grad_norm": 4.613467693328857, + "learning_rate": 1.3485657517618878e-07, + "loss": 1.3056, + "step": 76230 + }, + { + "epoch": 0.9529238230955774, + "grad_norm": 4.11663293838501, + "learning_rate": 1.3471377553453068e-07, + "loss": 1.1855, + "step": 76232 + }, + { + "epoch": 0.952948823720593, + "grad_norm": 5.297861576080322, + "learning_rate": 1.3457105102549828e-07, + "loss": 0.1601, + "step": 76234 + }, + { + "epoch": 0.9529738243456086, + "grad_norm": 5.567453861236572, + "learning_rate": 1.3442840165017512e-07, + "loss": 1.1205, + "step": 76236 + }, + { + "epoch": 0.9529988249706243, + "grad_norm": 2.9527080059051514, + "learning_rate": 1.3428582740965258e-07, + "loss": 1.8177, + "step": 76238 + }, + { + "epoch": 0.9530238255956399, + "grad_norm": 2.690755844116211, + "learning_rate": 1.3414332830501197e-07, + "loss": 1.9714, + "step": 76240 + }, + { + "epoch": 0.9530488262206556, + "grad_norm": 4.868483066558838, + "learning_rate": 1.3400090433734247e-07, + "loss": 1.8303, + "step": 76242 + }, + { + "epoch": 0.9530738268456711, + "grad_norm": 0.9792012572288513, + "learning_rate": 1.3385855550772543e-07, + "loss": 0.3933, + "step": 76244 + }, + { + "epoch": 0.9530988274706867, + "grad_norm": 4.563051223754883, + "learning_rate": 1.3371628181724771e-07, + "loss": 1.6242, + "step": 76246 + }, + { + "epoch": 0.9531238280957024, + "grad_norm": 0.00034270581090822816, + "learning_rate": 1.3357408326699073e-07, + "loss": 0.1589, + "step": 76248 + }, + { + "epoch": 0.953148828720718, + "grad_norm": 2.4166958332061768, + "learning_rate": 1.3343195985803803e-07, + "loss": 0.4532, + "step": 76250 + }, + { + "epoch": 0.9531738293457337, + "grad_norm": 4.27855110168457, + "learning_rate": 1.332899115914732e-07, + "loss": 1.1895, + "step": 76252 + }, + { + "epoch": 0.9531988299707492, + "grad_norm": 0.7537791728973389, + "learning_rate": 1.3314793846837649e-07, + "loss": 0.0609, + "step": 76254 + }, + { + "epoch": 0.9532238305957649, + "grad_norm": 1.2403686046600342, + "learning_rate": 1.3300604048982923e-07, + "loss": 0.2368, + "step": 76256 + }, + { + "epoch": 0.9532488312207805, + "grad_norm": 1.2962263822555542, + "learning_rate": 1.328642176569128e-07, + "loss": 1.0409, + "step": 76258 + }, + { + "epoch": 0.9532738318457962, + "grad_norm": 5.670342922210693, + "learning_rate": 1.327224699707086e-07, + "loss": 1.3647, + "step": 76260 + }, + { + "epoch": 0.9532988324708118, + "grad_norm": 4.056421756744385, + "learning_rate": 1.3258079743229456e-07, + "loss": 0.3302, + "step": 76262 + }, + { + "epoch": 0.9533238330958274, + "grad_norm": 0.2981058657169342, + "learning_rate": 1.3243920004274768e-07, + "loss": 1.0406, + "step": 76264 + }, + { + "epoch": 0.953348833720843, + "grad_norm": 1.10748291015625, + "learning_rate": 1.322976778031504e-07, + "loss": 0.0463, + "step": 76266 + }, + { + "epoch": 0.9533738343458587, + "grad_norm": 2.7170794010162354, + "learning_rate": 1.321562307145774e-07, + "loss": 0.4626, + "step": 76268 + }, + { + "epoch": 0.9533988349708743, + "grad_norm": 0.8340241312980652, + "learning_rate": 1.3201485877810783e-07, + "loss": 0.8034, + "step": 76270 + }, + { + "epoch": 0.95342383559589, + "grad_norm": 2.679880142211914, + "learning_rate": 1.318735619948175e-07, + "loss": 1.4905, + "step": 76272 + }, + { + "epoch": 0.9534488362209055, + "grad_norm": 4.310401916503906, + "learning_rate": 1.317323403657833e-07, + "loss": 0.4895, + "step": 76274 + }, + { + "epoch": 0.9534738368459211, + "grad_norm": 0.38282936811447144, + "learning_rate": 1.3159119389207886e-07, + "loss": 0.7957, + "step": 76276 + }, + { + "epoch": 0.9534988374709368, + "grad_norm": 0.38277751207351685, + "learning_rate": 1.3145012257478107e-07, + "loss": 0.0153, + "step": 76278 + }, + { + "epoch": 0.9535238380959524, + "grad_norm": 3.5530576705932617, + "learning_rate": 1.3130912641496351e-07, + "loss": 0.7446, + "step": 76280 + }, + { + "epoch": 0.9535488387209681, + "grad_norm": 3.188570737838745, + "learning_rate": 1.311682054136998e-07, + "loss": 1.2906, + "step": 76282 + }, + { + "epoch": 0.9535738393459836, + "grad_norm": 1.1508523225784302, + "learning_rate": 1.3102735957206348e-07, + "loss": 0.6034, + "step": 76284 + }, + { + "epoch": 0.9535988399709993, + "grad_norm": 3.319878101348877, + "learning_rate": 1.3088658889112815e-07, + "loss": 0.9953, + "step": 76286 + }, + { + "epoch": 0.9536238405960149, + "grad_norm": 2.39909029006958, + "learning_rate": 1.3074589337196407e-07, + "loss": 0.4417, + "step": 76288 + }, + { + "epoch": 0.9536488412210306, + "grad_norm": 3.9056217670440674, + "learning_rate": 1.3060527301564375e-07, + "loss": 1.2271, + "step": 76290 + }, + { + "epoch": 0.9536738418460462, + "grad_norm": 0.0013404982164502144, + "learning_rate": 1.3046472782323737e-07, + "loss": 0.4641, + "step": 76292 + }, + { + "epoch": 0.9536988424710617, + "grad_norm": 3.6078944206237793, + "learning_rate": 1.3032425779581747e-07, + "loss": 0.7324, + "step": 76294 + }, + { + "epoch": 0.9537238430960774, + "grad_norm": 0.002926257671788335, + "learning_rate": 1.3018386293445095e-07, + "loss": 0.052, + "step": 76296 + }, + { + "epoch": 0.953748843721093, + "grad_norm": 2.865511894226074, + "learning_rate": 1.3004354324020807e-07, + "loss": 0.5433, + "step": 76298 + }, + { + "epoch": 0.9537738443461087, + "grad_norm": 1.2535074949264526, + "learning_rate": 1.2990329871416018e-07, + "loss": 1.183, + "step": 76300 + }, + { + "epoch": 0.9537988449711243, + "grad_norm": 6.372671604156494, + "learning_rate": 1.297631293573709e-07, + "loss": 0.0692, + "step": 76302 + }, + { + "epoch": 0.9538238455961399, + "grad_norm": 2.607619524002075, + "learning_rate": 1.2962303517091046e-07, + "loss": 0.5501, + "step": 76304 + }, + { + "epoch": 0.9538488462211555, + "grad_norm": 0.17569510638713837, + "learning_rate": 1.294830161558447e-07, + "loss": 0.5959, + "step": 76306 + }, + { + "epoch": 0.9538738468461712, + "grad_norm": 3.511087417602539, + "learning_rate": 1.2934307231324162e-07, + "loss": 0.9868, + "step": 76308 + }, + { + "epoch": 0.9538988474711868, + "grad_norm": 3.315413236618042, + "learning_rate": 1.2920320364416484e-07, + "loss": 0.5377, + "step": 76310 + }, + { + "epoch": 0.9539238480962025, + "grad_norm": 3.8844714164733887, + "learning_rate": 1.2906341014968126e-07, + "loss": 0.8303, + "step": 76312 + }, + { + "epoch": 0.953948848721218, + "grad_norm": 1.35026216506958, + "learning_rate": 1.2892369183085453e-07, + "loss": 0.4623, + "step": 76314 + }, + { + "epoch": 0.9539738493462336, + "grad_norm": 4.645650863647461, + "learning_rate": 1.2878404868874926e-07, + "loss": 0.3178, + "step": 76316 + }, + { + "epoch": 0.9539988499712493, + "grad_norm": 4.0650129318237305, + "learning_rate": 1.2864448072442804e-07, + "loss": 1.1919, + "step": 76318 + }, + { + "epoch": 0.9540238505962649, + "grad_norm": 4.934536457061768, + "learning_rate": 1.2850498793895548e-07, + "loss": 2.1333, + "step": 76320 + }, + { + "epoch": 0.9540488512212806, + "grad_norm": 6.435186386108398, + "learning_rate": 1.2836557033339193e-07, + "loss": 1.123, + "step": 76322 + }, + { + "epoch": 0.9540738518462961, + "grad_norm": 3.4563376903533936, + "learning_rate": 1.282262279088009e-07, + "loss": 1.473, + "step": 76324 + }, + { + "epoch": 0.9540988524713118, + "grad_norm": 4.039975643157959, + "learning_rate": 1.280869606662438e-07, + "loss": 0.4256, + "step": 76326 + }, + { + "epoch": 0.9541238530963274, + "grad_norm": 3.301957607269287, + "learning_rate": 1.279477686067787e-07, + "loss": 0.4036, + "step": 76328 + }, + { + "epoch": 0.9541488537213431, + "grad_norm": 0.00048614112893119454, + "learning_rate": 1.2780865173146807e-07, + "loss": 0.321, + "step": 76330 + }, + { + "epoch": 0.9541738543463587, + "grad_norm": 2.7443997859954834, + "learning_rate": 1.2766961004137101e-07, + "loss": 0.9423, + "step": 76332 + }, + { + "epoch": 0.9541988549713742, + "grad_norm": 2.545499563217163, + "learning_rate": 1.2753064353754562e-07, + "loss": 0.3031, + "step": 76334 + }, + { + "epoch": 0.9542238555963899, + "grad_norm": 4.639096260070801, + "learning_rate": 1.2739175222105104e-07, + "loss": 1.569, + "step": 76336 + }, + { + "epoch": 0.9542488562214055, + "grad_norm": 4.723766326904297, + "learning_rate": 1.272529360929453e-07, + "loss": 1.4033, + "step": 76338 + }, + { + "epoch": 0.9542738568464212, + "grad_norm": 2.342543601989746, + "learning_rate": 1.2711419515428425e-07, + "loss": 0.079, + "step": 76340 + }, + { + "epoch": 0.9542988574714368, + "grad_norm": 3.156808853149414, + "learning_rate": 1.269755294061259e-07, + "loss": 0.2727, + "step": 76342 + }, + { + "epoch": 0.9543238580964524, + "grad_norm": 1.182125449180603, + "learning_rate": 1.2683693884952497e-07, + "loss": 0.564, + "step": 76344 + }, + { + "epoch": 0.954348858721468, + "grad_norm": 2.894014835357666, + "learning_rate": 1.266984234855384e-07, + "loss": 1.2354, + "step": 76346 + }, + { + "epoch": 0.9543738593464837, + "grad_norm": 0.0006129638641141355, + "learning_rate": 1.265599833152198e-07, + "loss": 0.143, + "step": 76348 + }, + { + "epoch": 0.9543988599714993, + "grad_norm": 0.00039221643237397075, + "learning_rate": 1.2642161833962385e-07, + "loss": 0.0, + "step": 76350 + }, + { + "epoch": 0.954423860596515, + "grad_norm": 5.932154655456543, + "learning_rate": 1.262833285598064e-07, + "loss": 1.4546, + "step": 76352 + }, + { + "epoch": 0.9544488612215305, + "grad_norm": 2.777069091796875, + "learning_rate": 1.2614511397681773e-07, + "loss": 0.9208, + "step": 76354 + }, + { + "epoch": 0.9544738618465461, + "grad_norm": 4.570230484008789, + "learning_rate": 1.2600697459171142e-07, + "loss": 1.6908, + "step": 76356 + }, + { + "epoch": 0.9544988624715618, + "grad_norm": 3.91076922416687, + "learning_rate": 1.2586891040553994e-07, + "loss": 1.7672, + "step": 76358 + }, + { + "epoch": 0.9545238630965774, + "grad_norm": 0.0003564395010471344, + "learning_rate": 1.257309214193536e-07, + "loss": 0.5821, + "step": 76360 + }, + { + "epoch": 0.9545488637215931, + "grad_norm": 4.88050651550293, + "learning_rate": 1.25593007634206e-07, + "loss": 2.1598, + "step": 76362 + }, + { + "epoch": 0.9545738643466086, + "grad_norm": 1.5983959436416626, + "learning_rate": 1.2545516905114518e-07, + "loss": 0.7349, + "step": 76364 + }, + { + "epoch": 0.9545988649716243, + "grad_norm": 1.7342886924743652, + "learning_rate": 1.253174056712214e-07, + "loss": 0.7234, + "step": 76366 + }, + { + "epoch": 0.9546238655966399, + "grad_norm": 2.352259397506714, + "learning_rate": 1.2517971749548274e-07, + "loss": 0.4313, + "step": 76368 + }, + { + "epoch": 0.9546488662216556, + "grad_norm": 0.0002906520967371762, + "learning_rate": 1.2504210452497945e-07, + "loss": 0.0, + "step": 76370 + }, + { + "epoch": 0.9546738668466712, + "grad_norm": 2.5757622718811035, + "learning_rate": 1.2490456676075847e-07, + "loss": 0.8407, + "step": 76372 + }, + { + "epoch": 0.9546988674716868, + "grad_norm": 4.64916467666626, + "learning_rate": 1.2476710420386895e-07, + "loss": 1.0343, + "step": 76374 + }, + { + "epoch": 0.9547238680967024, + "grad_norm": 4.06659460067749, + "learning_rate": 1.2462971685535674e-07, + "loss": 0.0953, + "step": 76376 + }, + { + "epoch": 0.954748868721718, + "grad_norm": 2.3530075550079346, + "learning_rate": 1.2449240471626767e-07, + "loss": 0.9505, + "step": 76378 + }, + { + "epoch": 0.9547738693467337, + "grad_norm": 1.9719561338424683, + "learning_rate": 1.2435516778764866e-07, + "loss": 0.8648, + "step": 76380 + }, + { + "epoch": 0.9547988699717493, + "grad_norm": 3.1765453815460205, + "learning_rate": 1.2421800607054223e-07, + "loss": 0.5352, + "step": 76382 + }, + { + "epoch": 0.9548238705967649, + "grad_norm": 4.340950012207031, + "learning_rate": 1.2408091956599645e-07, + "loss": 0.817, + "step": 76384 + }, + { + "epoch": 0.9548488712217805, + "grad_norm": 0.2255595624446869, + "learning_rate": 1.2394390827505264e-07, + "loss": 0.4288, + "step": 76386 + }, + { + "epoch": 0.9548738718467962, + "grad_norm": 2.497464179992676, + "learning_rate": 1.238069721987567e-07, + "loss": 1.0119, + "step": 76388 + }, + { + "epoch": 0.9548988724718118, + "grad_norm": 2.8413188457489014, + "learning_rate": 1.2367011133814998e-07, + "loss": 0.3553, + "step": 76390 + }, + { + "epoch": 0.9549238730968275, + "grad_norm": 3.628891944885254, + "learning_rate": 1.2353332569427501e-07, + "loss": 1.0834, + "step": 76392 + }, + { + "epoch": 0.954948873721843, + "grad_norm": 5.154094696044922, + "learning_rate": 1.2339661526817316e-07, + "loss": 1.4017, + "step": 76394 + }, + { + "epoch": 0.9549738743468587, + "grad_norm": 3.018725872039795, + "learning_rate": 1.232599800608858e-07, + "loss": 0.2093, + "step": 76396 + }, + { + "epoch": 0.9549988749718743, + "grad_norm": 7.113192081451416, + "learning_rate": 1.2312342007345435e-07, + "loss": 1.0102, + "step": 76398 + }, + { + "epoch": 0.95502387559689, + "grad_norm": 3.6651229858398438, + "learning_rate": 1.2298693530691796e-07, + "loss": 1.3178, + "step": 76400 + }, + { + "epoch": 0.9550488762219056, + "grad_norm": 2.5008535385131836, + "learning_rate": 1.2285052576231583e-07, + "loss": 0.5637, + "step": 76402 + }, + { + "epoch": 0.9550738768469211, + "grad_norm": 3.1871259212493896, + "learning_rate": 1.227141914406893e-07, + "loss": 0.1524, + "step": 76404 + }, + { + "epoch": 0.9550988774719368, + "grad_norm": 2.5731239318847656, + "learning_rate": 1.2257793234307314e-07, + "loss": 1.1088, + "step": 76406 + }, + { + "epoch": 0.9551238780969524, + "grad_norm": 0.04389730840921402, + "learning_rate": 1.224417484705076e-07, + "loss": 0.0326, + "step": 76408 + }, + { + "epoch": 0.9551488787219681, + "grad_norm": 2.1093833446502686, + "learning_rate": 1.2230563982402742e-07, + "loss": 0.2683, + "step": 76410 + }, + { + "epoch": 0.9551738793469837, + "grad_norm": 0.217071071267128, + "learning_rate": 1.221696064046707e-07, + "loss": 1.0467, + "step": 76412 + }, + { + "epoch": 0.9551988799719993, + "grad_norm": 2.9768080711364746, + "learning_rate": 1.2203364821347429e-07, + "loss": 0.8498, + "step": 76414 + }, + { + "epoch": 0.9552238805970149, + "grad_norm": 9.209904670715332, + "learning_rate": 1.21897765251473e-07, + "loss": 0.6713, + "step": 76416 + }, + { + "epoch": 0.9552488812220306, + "grad_norm": 2.4084956645965576, + "learning_rate": 1.2176195751970154e-07, + "loss": 0.9122, + "step": 76418 + }, + { + "epoch": 0.9552738818470462, + "grad_norm": 1.8783369064331055, + "learning_rate": 1.2162622501919242e-07, + "loss": 0.28, + "step": 76420 + }, + { + "epoch": 0.9552988824720618, + "grad_norm": 5.399627208709717, + "learning_rate": 1.2149056775098256e-07, + "loss": 0.6777, + "step": 76422 + }, + { + "epoch": 0.9553238830970774, + "grad_norm": 1.5747209787368774, + "learning_rate": 1.2135498571610337e-07, + "loss": 0.9158, + "step": 76424 + }, + { + "epoch": 0.955348883722093, + "grad_norm": 3.6595544815063477, + "learning_rate": 1.2121947891558738e-07, + "loss": 0.9678, + "step": 76426 + }, + { + "epoch": 0.9553738843471087, + "grad_norm": 0.02616271562874317, + "learning_rate": 1.210840473504671e-07, + "loss": 0.1375, + "step": 76428 + }, + { + "epoch": 0.9553988849721243, + "grad_norm": 3.212235927581787, + "learning_rate": 1.2094869102177388e-07, + "loss": 1.2884, + "step": 76430 + }, + { + "epoch": 0.95542388559714, + "grad_norm": 2.683414936065674, + "learning_rate": 1.2081340993053802e-07, + "loss": 0.4918, + "step": 76432 + }, + { + "epoch": 0.9554488862221555, + "grad_norm": 4.1304240226745605, + "learning_rate": 1.2067820407778985e-07, + "loss": 1.9794, + "step": 76434 + }, + { + "epoch": 0.9554738868471712, + "grad_norm": 4.694210052490234, + "learning_rate": 1.2054307346455964e-07, + "loss": 0.9773, + "step": 76436 + }, + { + "epoch": 0.9554988874721868, + "grad_norm": 3.0882246494293213, + "learning_rate": 1.2040801809187651e-07, + "loss": 0.4165, + "step": 76438 + }, + { + "epoch": 0.9555238880972025, + "grad_norm": 4.30093240737915, + "learning_rate": 1.2027303796076862e-07, + "loss": 1.2959, + "step": 76440 + }, + { + "epoch": 0.9555488887222181, + "grad_norm": 0.0007205069996416569, + "learning_rate": 1.2013813307226509e-07, + "loss": 0.2076, + "step": 76442 + }, + { + "epoch": 0.9555738893472336, + "grad_norm": 2.2231104373931885, + "learning_rate": 1.2000330342739174e-07, + "loss": 0.3448, + "step": 76444 + }, + { + "epoch": 0.9555988899722493, + "grad_norm": 0.014360969886183739, + "learning_rate": 1.198685490271745e-07, + "loss": 0.0002, + "step": 76446 + }, + { + "epoch": 0.9556238905972649, + "grad_norm": 2.9363396167755127, + "learning_rate": 1.1973386987264358e-07, + "loss": 0.7364, + "step": 76448 + }, + { + "epoch": 0.9556488912222806, + "grad_norm": 3.4720754623413086, + "learning_rate": 1.1959926596482042e-07, + "loss": 1.0374, + "step": 76450 + }, + { + "epoch": 0.9556738918472962, + "grad_norm": 3.400127649307251, + "learning_rate": 1.1946473730473418e-07, + "loss": 1.4328, + "step": 76452 + }, + { + "epoch": 0.9556988924723118, + "grad_norm": 0.40329232811927795, + "learning_rate": 1.1933028389340519e-07, + "loss": 0.6309, + "step": 76454 + }, + { + "epoch": 0.9557238930973274, + "grad_norm": 1.1335980892181396, + "learning_rate": 1.1919590573186146e-07, + "loss": 0.3737, + "step": 76456 + }, + { + "epoch": 0.9557488937223431, + "grad_norm": 5.460811614990234, + "learning_rate": 1.1906160282112333e-07, + "loss": 0.3342, + "step": 76458 + }, + { + "epoch": 0.9557738943473587, + "grad_norm": 5.4797515869140625, + "learning_rate": 1.1892737516221442e-07, + "loss": 0.7835, + "step": 76460 + }, + { + "epoch": 0.9557988949723744, + "grad_norm": 4.404983043670654, + "learning_rate": 1.1879322275615724e-07, + "loss": 1.032, + "step": 76462 + }, + { + "epoch": 0.9558238955973899, + "grad_norm": 8.507721900939941, + "learning_rate": 1.1865914560397429e-07, + "loss": 2.0245, + "step": 76464 + }, + { + "epoch": 0.9558488962224055, + "grad_norm": 1.2930755615234375, + "learning_rate": 1.1852514370668589e-07, + "loss": 0.2509, + "step": 76466 + }, + { + "epoch": 0.9558738968474212, + "grad_norm": 5.943490028381348, + "learning_rate": 1.1839121706531231e-07, + "loss": 0.7368, + "step": 76468 + }, + { + "epoch": 0.9558988974724368, + "grad_norm": 3.6593263149261475, + "learning_rate": 1.1825736568087498e-07, + "loss": 0.651, + "step": 76470 + }, + { + "epoch": 0.9559238980974525, + "grad_norm": 3.187627077102661, + "learning_rate": 1.1812358955438975e-07, + "loss": 1.3495, + "step": 76472 + }, + { + "epoch": 0.955948898722468, + "grad_norm": 0.48708608746528625, + "learning_rate": 1.1798988868687911e-07, + "loss": 0.2621, + "step": 76474 + }, + { + "epoch": 0.9559738993474837, + "grad_norm": 5.893381595611572, + "learning_rate": 1.1785626307936005e-07, + "loss": 1.7774, + "step": 76476 + }, + { + "epoch": 0.9559988999724993, + "grad_norm": 1.6615185737609863, + "learning_rate": 1.1772271273285063e-07, + "loss": 0.723, + "step": 76478 + }, + { + "epoch": 0.956023900597515, + "grad_norm": 4.1163010597229, + "learning_rate": 1.1758923764836671e-07, + "loss": 1.6685, + "step": 76480 + }, + { + "epoch": 0.9560489012225306, + "grad_norm": 3.227553606033325, + "learning_rate": 1.1745583782692638e-07, + "loss": 0.9298, + "step": 76482 + }, + { + "epoch": 0.9560739018475461, + "grad_norm": 2.0486350059509277, + "learning_rate": 1.1732251326954436e-07, + "loss": 0.4171, + "step": 76484 + }, + { + "epoch": 0.9560989024725618, + "grad_norm": 3.759237766265869, + "learning_rate": 1.1718926397723763e-07, + "loss": 0.9427, + "step": 76486 + }, + { + "epoch": 0.9561239030975774, + "grad_norm": 2.901559352874756, + "learning_rate": 1.1705608995101869e-07, + "loss": 0.2412, + "step": 76488 + }, + { + "epoch": 0.9561489037225931, + "grad_norm": 5.237051963806152, + "learning_rate": 1.1692299119190343e-07, + "loss": 1.7117, + "step": 76490 + }, + { + "epoch": 0.9561739043476087, + "grad_norm": 1.8623943328857422, + "learning_rate": 1.1678996770090433e-07, + "loss": 1.2759, + "step": 76492 + }, + { + "epoch": 0.9561989049726243, + "grad_norm": 2.769718647003174, + "learning_rate": 1.1665701947903619e-07, + "loss": 0.9773, + "step": 76494 + }, + { + "epoch": 0.9562239055976399, + "grad_norm": 1.2326103448867798, + "learning_rate": 1.1652414652731147e-07, + "loss": 0.0619, + "step": 76496 + }, + { + "epoch": 0.9562489062226556, + "grad_norm": 2.2267885208129883, + "learning_rate": 1.1639134884673942e-07, + "loss": 1.2434, + "step": 76498 + }, + { + "epoch": 0.9562739068476712, + "grad_norm": 3.1797969341278076, + "learning_rate": 1.1625862643833364e-07, + "loss": 1.4849, + "step": 76500 + }, + { + "epoch": 0.9562989074726869, + "grad_norm": 0.0006792100029997528, + "learning_rate": 1.1612597930310443e-07, + "loss": 0.2805, + "step": 76502 + }, + { + "epoch": 0.9563239080977024, + "grad_norm": 2.418095111846924, + "learning_rate": 1.1599340744206322e-07, + "loss": 0.9286, + "step": 76504 + }, + { + "epoch": 0.956348908722718, + "grad_norm": 2.3763175010681152, + "learning_rate": 1.1586091085621698e-07, + "loss": 0.4094, + "step": 76506 + }, + { + "epoch": 0.9563739093477337, + "grad_norm": 5.114999294281006, + "learning_rate": 1.1572848954657712e-07, + "loss": 0.8312, + "step": 76508 + }, + { + "epoch": 0.9563989099727493, + "grad_norm": 4.744752883911133, + "learning_rate": 1.155961435141506e-07, + "loss": 1.1451, + "step": 76510 + }, + { + "epoch": 0.956423910597765, + "grad_norm": 3.2919559478759766, + "learning_rate": 1.1546387275994664e-07, + "loss": 1.0262, + "step": 76512 + }, + { + "epoch": 0.9564489112227805, + "grad_norm": 2.2659950256347656, + "learning_rate": 1.1533167728497219e-07, + "loss": 0.5267, + "step": 76514 + }, + { + "epoch": 0.9564739118477962, + "grad_norm": 0.0004909516428597271, + "learning_rate": 1.1519955709023312e-07, + "loss": 0.3652, + "step": 76516 + }, + { + "epoch": 0.9564989124728118, + "grad_norm": 5.395048141479492, + "learning_rate": 1.1506751217673751e-07, + "loss": 1.3147, + "step": 76518 + }, + { + "epoch": 0.9565239130978275, + "grad_norm": 2.439528703689575, + "learning_rate": 1.14935542545489e-07, + "loss": 0.3776, + "step": 76520 + }, + { + "epoch": 0.9565489137228431, + "grad_norm": 3.1245455741882324, + "learning_rate": 1.1480364819749345e-07, + "loss": 0.5509, + "step": 76522 + }, + { + "epoch": 0.9565739143478587, + "grad_norm": 4.9556803703308105, + "learning_rate": 1.1467182913375563e-07, + "loss": 1.0213, + "step": 76524 + }, + { + "epoch": 0.9565989149728743, + "grad_norm": 0.0003177554171998054, + "learning_rate": 1.1454008535527916e-07, + "loss": 0.5675, + "step": 76526 + }, + { + "epoch": 0.95662391559789, + "grad_norm": 3.9082913398742676, + "learning_rate": 1.1440841686306769e-07, + "loss": 1.2486, + "step": 76528 + }, + { + "epoch": 0.9566489162229056, + "grad_norm": 3.297718048095703, + "learning_rate": 1.1427682365812376e-07, + "loss": 1.7816, + "step": 76530 + }, + { + "epoch": 0.9566739168479212, + "grad_norm": 4.05180549621582, + "learning_rate": 1.1414530574144988e-07, + "loss": 0.603, + "step": 76532 + }, + { + "epoch": 0.9566989174729368, + "grad_norm": 3.9018335342407227, + "learning_rate": 1.1401386311404749e-07, + "loss": 2.3438, + "step": 76534 + }, + { + "epoch": 0.9567239180979524, + "grad_norm": 2.4832191467285156, + "learning_rate": 1.138824957769169e-07, + "loss": 2.7441, + "step": 76536 + }, + { + "epoch": 0.9567489187229681, + "grad_norm": 0.0006885476177558303, + "learning_rate": 1.1375120373105952e-07, + "loss": 0.1794, + "step": 76538 + }, + { + "epoch": 0.9567739193479837, + "grad_norm": 2.315582752227783, + "learning_rate": 1.1361998697747345e-07, + "loss": 1.2266, + "step": 76540 + }, + { + "epoch": 0.9567989199729994, + "grad_norm": 0.0004392752016428858, + "learning_rate": 1.1348884551716122e-07, + "loss": 0.7169, + "step": 76542 + }, + { + "epoch": 0.9568239205980149, + "grad_norm": 3.0322141647338867, + "learning_rate": 1.1335777935111981e-07, + "loss": 1.2395, + "step": 76544 + }, + { + "epoch": 0.9568489212230306, + "grad_norm": 3.740330457687378, + "learning_rate": 1.132267884803473e-07, + "loss": 0.6893, + "step": 76546 + }, + { + "epoch": 0.9568739218480462, + "grad_norm": 0.0004528539429884404, + "learning_rate": 1.1309587290584179e-07, + "loss": 0.145, + "step": 76548 + }, + { + "epoch": 0.9568989224730619, + "grad_norm": 0.7099753618240356, + "learning_rate": 1.1296503262859804e-07, + "loss": 0.0294, + "step": 76550 + }, + { + "epoch": 0.9569239230980775, + "grad_norm": 4.8231611251831055, + "learning_rate": 1.1283426764961636e-07, + "loss": 1.5176, + "step": 76552 + }, + { + "epoch": 0.956948923723093, + "grad_norm": 3.0773684978485107, + "learning_rate": 1.1270357796989039e-07, + "loss": 1.6528, + "step": 76554 + }, + { + "epoch": 0.9569739243481087, + "grad_norm": 4.047675132751465, + "learning_rate": 1.1257296359041603e-07, + "loss": 0.9443, + "step": 76556 + }, + { + "epoch": 0.9569989249731243, + "grad_norm": 2.5487265586853027, + "learning_rate": 1.124424245121869e-07, + "loss": 1.0001, + "step": 76558 + }, + { + "epoch": 0.95702392559814, + "grad_norm": 2.612333297729492, + "learning_rate": 1.1231196073619888e-07, + "loss": 1.0373, + "step": 76560 + }, + { + "epoch": 0.9570489262231556, + "grad_norm": 2.9121317863464355, + "learning_rate": 1.121815722634445e-07, + "loss": 0.6176, + "step": 76562 + }, + { + "epoch": 0.9570739268481712, + "grad_norm": 4.102876663208008, + "learning_rate": 1.1205125909491632e-07, + "loss": 1.3491, + "step": 76564 + }, + { + "epoch": 0.9570989274731868, + "grad_norm": 2.2907280921936035, + "learning_rate": 1.1192102123160797e-07, + "loss": 1.203, + "step": 76566 + }, + { + "epoch": 0.9571239280982025, + "grad_norm": 0.00047857884783297777, + "learning_rate": 1.1179085867451089e-07, + "loss": 0.3834, + "step": 76568 + }, + { + "epoch": 0.9571489287232181, + "grad_norm": 0.001956046326085925, + "learning_rate": 1.116607714246165e-07, + "loss": 0.512, + "step": 76570 + }, + { + "epoch": 0.9571739293482338, + "grad_norm": 3.1273860931396484, + "learning_rate": 1.1153075948291403e-07, + "loss": 0.7622, + "step": 76572 + }, + { + "epoch": 0.9571989299732493, + "grad_norm": 2.271883964538574, + "learning_rate": 1.11400822850396e-07, + "loss": 0.4322, + "step": 76574 + }, + { + "epoch": 0.9572239305982649, + "grad_norm": 4.944015026092529, + "learning_rate": 1.1127096152805161e-07, + "loss": 1.5771, + "step": 76576 + }, + { + "epoch": 0.9572489312232806, + "grad_norm": 4.216946601867676, + "learning_rate": 1.1114117551686787e-07, + "loss": 2.0243, + "step": 76578 + }, + { + "epoch": 0.9572739318482962, + "grad_norm": 4.155227184295654, + "learning_rate": 1.110114648178362e-07, + "loss": 0.6778, + "step": 76580 + }, + { + "epoch": 0.9572989324733119, + "grad_norm": 4.802124500274658, + "learning_rate": 1.1088182943194026e-07, + "loss": 0.9944, + "step": 76582 + }, + { + "epoch": 0.9573239330983274, + "grad_norm": 1.508992314338684, + "learning_rate": 1.1075226936017258e-07, + "loss": 0.2441, + "step": 76584 + }, + { + "epoch": 0.9573489337233431, + "grad_norm": 4.663689136505127, + "learning_rate": 1.1062278460351572e-07, + "loss": 1.5831, + "step": 76586 + }, + { + "epoch": 0.9573739343483587, + "grad_norm": 2.090362548828125, + "learning_rate": 1.1049337516295666e-07, + "loss": 0.8207, + "step": 76588 + }, + { + "epoch": 0.9573989349733744, + "grad_norm": 0.00042580903391353786, + "learning_rate": 1.103640410394835e-07, + "loss": 0.8747, + "step": 76590 + }, + { + "epoch": 0.95742393559839, + "grad_norm": 3.3517417907714844, + "learning_rate": 1.1023478223407768e-07, + "loss": 1.749, + "step": 76592 + }, + { + "epoch": 0.9574489362234055, + "grad_norm": 1.8410617113113403, + "learning_rate": 1.1010559874772509e-07, + "loss": 0.872, + "step": 76594 + }, + { + "epoch": 0.9574739368484212, + "grad_norm": 4.3785481452941895, + "learning_rate": 1.0997649058141046e-07, + "loss": 0.9195, + "step": 76596 + }, + { + "epoch": 0.9574989374734368, + "grad_norm": 5.205273628234863, + "learning_rate": 1.0984745773611639e-07, + "loss": 0.8038, + "step": 76598 + }, + { + "epoch": 0.9575239380984525, + "grad_norm": 5.905755996704102, + "learning_rate": 1.0971850021282537e-07, + "loss": 1.3196, + "step": 76600 + }, + { + "epoch": 0.9575489387234681, + "grad_norm": 5.020350456237793, + "learning_rate": 1.0958961801251889e-07, + "loss": 1.905, + "step": 76602 + }, + { + "epoch": 0.9575739393484837, + "grad_norm": 4.969045162200928, + "learning_rate": 1.0946081113618056e-07, + "loss": 0.8755, + "step": 76604 + }, + { + "epoch": 0.9575989399734993, + "grad_norm": 3.8207314014434814, + "learning_rate": 1.0933207958478853e-07, + "loss": 1.3258, + "step": 76606 + }, + { + "epoch": 0.957623940598515, + "grad_norm": 0.012298502027988434, + "learning_rate": 1.0920342335932533e-07, + "loss": 0.0003, + "step": 76608 + }, + { + "epoch": 0.9576489412235306, + "grad_norm": 5.511235237121582, + "learning_rate": 1.0907484246077016e-07, + "loss": 0.3165, + "step": 76610 + }, + { + "epoch": 0.9576739418485463, + "grad_norm": 0.00048170945956371725, + "learning_rate": 1.0894633689010337e-07, + "loss": 0.4881, + "step": 76612 + }, + { + "epoch": 0.9576989424735618, + "grad_norm": 2.4190773963928223, + "learning_rate": 1.0881790664830194e-07, + "loss": 0.5971, + "step": 76614 + }, + { + "epoch": 0.9577239430985774, + "grad_norm": 1.4067648649215698, + "learning_rate": 1.08689551736344e-07, + "loss": 0.5491, + "step": 76616 + }, + { + "epoch": 0.9577489437235931, + "grad_norm": 4.449979782104492, + "learning_rate": 1.0856127215520763e-07, + "loss": 1.6804, + "step": 76618 + }, + { + "epoch": 0.9577739443486087, + "grad_norm": 6.056901454925537, + "learning_rate": 1.0843306790586982e-07, + "loss": 0.5965, + "step": 76620 + }, + { + "epoch": 0.9577989449736244, + "grad_norm": 0.6702377200126648, + "learning_rate": 1.083049389893076e-07, + "loss": 0.7287, + "step": 76622 + }, + { + "epoch": 0.9578239455986399, + "grad_norm": 0.0005345418467186391, + "learning_rate": 1.0817688540649574e-07, + "loss": 0.0, + "step": 76624 + }, + { + "epoch": 0.9578489462236556, + "grad_norm": 3.84140682220459, + "learning_rate": 1.080489071584101e-07, + "loss": 0.9922, + "step": 76626 + }, + { + "epoch": 0.9578739468486712, + "grad_norm": 2.8321266174316406, + "learning_rate": 1.0792100424602548e-07, + "loss": 1.1446, + "step": 76628 + }, + { + "epoch": 0.9578989474736869, + "grad_norm": 3.208015203475952, + "learning_rate": 1.0779317667031442e-07, + "loss": 1.6137, + "step": 76630 + }, + { + "epoch": 0.9579239480987025, + "grad_norm": 3.740659713745117, + "learning_rate": 1.076654244322528e-07, + "loss": 1.5903, + "step": 76632 + }, + { + "epoch": 0.957948948723718, + "grad_norm": 3.476144552230835, + "learning_rate": 1.075377475328121e-07, + "loss": 0.4382, + "step": 76634 + }, + { + "epoch": 0.9579739493487337, + "grad_norm": 4.273794651031494, + "learning_rate": 1.0741014597296485e-07, + "loss": 0.5498, + "step": 76636 + }, + { + "epoch": 0.9579989499737493, + "grad_norm": 2.529688835144043, + "learning_rate": 1.0728261975368359e-07, + "loss": 0.2929, + "step": 76638 + }, + { + "epoch": 0.958023950598765, + "grad_norm": 0.8969787955284119, + "learning_rate": 1.0715516887593757e-07, + "loss": 0.5432, + "step": 76640 + }, + { + "epoch": 0.9580489512237806, + "grad_norm": 0.0007227546884678304, + "learning_rate": 1.0702779334070046e-07, + "loss": 0.6364, + "step": 76642 + }, + { + "epoch": 0.9580739518487962, + "grad_norm": 2.2516307830810547, + "learning_rate": 1.0690049314893924e-07, + "loss": 0.7449, + "step": 76644 + }, + { + "epoch": 0.9580989524738118, + "grad_norm": 3.561908006668091, + "learning_rate": 1.0677326830162649e-07, + "loss": 1.9176, + "step": 76646 + }, + { + "epoch": 0.9581239530988275, + "grad_norm": 2.0521650314331055, + "learning_rate": 1.0664611879972808e-07, + "loss": 0.223, + "step": 76648 + }, + { + "epoch": 0.9581489537238431, + "grad_norm": 6.045751571655273, + "learning_rate": 1.0651904464421437e-07, + "loss": 0.4937, + "step": 76650 + }, + { + "epoch": 0.9581739543488588, + "grad_norm": 6.901207447052002, + "learning_rate": 1.0639204583605234e-07, + "loss": 0.3607, + "step": 76652 + }, + { + "epoch": 0.9581989549738743, + "grad_norm": 4.081019401550293, + "learning_rate": 1.0626512237620901e-07, + "loss": 0.2995, + "step": 76654 + }, + { + "epoch": 0.95822395559889, + "grad_norm": 3.6313180923461914, + "learning_rate": 1.0613827426565249e-07, + "loss": 1.0052, + "step": 76656 + }, + { + "epoch": 0.9582489562239056, + "grad_norm": 0.0003790578048210591, + "learning_rate": 1.0601150150534645e-07, + "loss": 0.518, + "step": 76658 + }, + { + "epoch": 0.9582739568489212, + "grad_norm": 3.2039506435394287, + "learning_rate": 1.0588480409625901e-07, + "loss": 0.9335, + "step": 76660 + }, + { + "epoch": 0.9582989574739369, + "grad_norm": 2.6663479804992676, + "learning_rate": 1.0575818203935273e-07, + "loss": 1.2559, + "step": 76662 + }, + { + "epoch": 0.9583239580989524, + "grad_norm": 0.29791179299354553, + "learning_rate": 1.056316353355935e-07, + "loss": 0.6214, + "step": 76664 + }, + { + "epoch": 0.9583489587239681, + "grad_norm": 7.171457767486572, + "learning_rate": 1.055051639859439e-07, + "loss": 1.5295, + "step": 76666 + }, + { + "epoch": 0.9583739593489837, + "grad_norm": 2.156036853790283, + "learning_rate": 1.0537876799136871e-07, + "loss": 1.0755, + "step": 76668 + }, + { + "epoch": 0.9583989599739994, + "grad_norm": 4.6418633460998535, + "learning_rate": 1.0525244735282935e-07, + "loss": 1.7494, + "step": 76670 + }, + { + "epoch": 0.958423960599015, + "grad_norm": 2.9913394451141357, + "learning_rate": 1.0512620207128731e-07, + "loss": 0.728, + "step": 76672 + }, + { + "epoch": 0.9584489612240306, + "grad_norm": 0.47269922494888306, + "learning_rate": 1.0500003214770515e-07, + "loss": 1.1483, + "step": 76674 + }, + { + "epoch": 0.9584739618490462, + "grad_norm": 1.055400013923645, + "learning_rate": 1.048739375830432e-07, + "loss": 0.444, + "step": 76676 + }, + { + "epoch": 0.9584989624740619, + "grad_norm": 4.091006278991699, + "learning_rate": 1.0474791837826293e-07, + "loss": 1.6756, + "step": 76678 + }, + { + "epoch": 0.9585239630990775, + "grad_norm": 4.33549165725708, + "learning_rate": 1.0462197453432354e-07, + "loss": 1.574, + "step": 76680 + }, + { + "epoch": 0.9585489637240932, + "grad_norm": 11.319229125976562, + "learning_rate": 1.0449610605218318e-07, + "loss": 1.1678, + "step": 76682 + }, + { + "epoch": 0.9585739643491087, + "grad_norm": 0.49309220910072327, + "learning_rate": 1.0437031293280109e-07, + "loss": 0.9012, + "step": 76684 + }, + { + "epoch": 0.9585989649741243, + "grad_norm": 0.9562609195709229, + "learning_rate": 1.0424459517713536e-07, + "loss": 0.1121, + "step": 76686 + }, + { + "epoch": 0.95862396559914, + "grad_norm": 5.583191871643066, + "learning_rate": 1.0411895278614304e-07, + "loss": 0.7192, + "step": 76688 + }, + { + "epoch": 0.9586489662241556, + "grad_norm": 4.946372032165527, + "learning_rate": 1.0399338576078222e-07, + "loss": 1.8872, + "step": 76690 + }, + { + "epoch": 0.9586739668491713, + "grad_norm": 4.797211170196533, + "learning_rate": 1.038678941020077e-07, + "loss": 0.1073, + "step": 76692 + }, + { + "epoch": 0.9586989674741868, + "grad_norm": 4.747069358825684, + "learning_rate": 1.0374247781077651e-07, + "loss": 0.8032, + "step": 76694 + }, + { + "epoch": 0.9587239680992025, + "grad_norm": 3.2851033210754395, + "learning_rate": 1.0361713688804231e-07, + "loss": 0.9172, + "step": 76696 + }, + { + "epoch": 0.9587489687242181, + "grad_norm": 2.8745737075805664, + "learning_rate": 1.0349187133476102e-07, + "loss": 1.6586, + "step": 76698 + }, + { + "epoch": 0.9587739693492338, + "grad_norm": 5.070600509643555, + "learning_rate": 1.0336668115188519e-07, + "loss": 0.5651, + "step": 76700 + }, + { + "epoch": 0.9587989699742494, + "grad_norm": 3.2759158611297607, + "learning_rate": 1.0324156634036963e-07, + "loss": 1.5695, + "step": 76702 + }, + { + "epoch": 0.9588239705992649, + "grad_norm": 2.644462823867798, + "learning_rate": 1.0311652690116802e-07, + "loss": 1.6446, + "step": 76704 + }, + { + "epoch": 0.9588489712242806, + "grad_norm": 3.965250015258789, + "learning_rate": 1.0299156283523071e-07, + "loss": 0.6838, + "step": 76706 + }, + { + "epoch": 0.9588739718492962, + "grad_norm": 0.48558229207992554, + "learning_rate": 1.0286667414351026e-07, + "loss": 0.3413, + "step": 76708 + }, + { + "epoch": 0.9588989724743119, + "grad_norm": 3.963520050048828, + "learning_rate": 1.0274186082695591e-07, + "loss": 0.9117, + "step": 76710 + }, + { + "epoch": 0.9589239730993275, + "grad_norm": 4.571235179901123, + "learning_rate": 1.0261712288652247e-07, + "loss": 1.3218, + "step": 76712 + }, + { + "epoch": 0.9589489737243431, + "grad_norm": 2.2815134525299072, + "learning_rate": 1.0249246032315586e-07, + "loss": 0.4345, + "step": 76714 + }, + { + "epoch": 0.9589739743493587, + "grad_norm": 4.2624897956848145, + "learning_rate": 1.0236787313780749e-07, + "loss": 1.9676, + "step": 76716 + }, + { + "epoch": 0.9589989749743744, + "grad_norm": 1.895578384399414, + "learning_rate": 1.0224336133142554e-07, + "loss": 0.9052, + "step": 76718 + }, + { + "epoch": 0.95902397559939, + "grad_norm": 0.6003982424736023, + "learning_rate": 1.0211892490495812e-07, + "loss": 0.0312, + "step": 76720 + }, + { + "epoch": 0.9590489762244057, + "grad_norm": 3.822023868560791, + "learning_rate": 1.0199456385935447e-07, + "loss": 1.538, + "step": 76722 + }, + { + "epoch": 0.9590739768494212, + "grad_norm": 1.0616405010223389, + "learning_rate": 1.018702781955594e-07, + "loss": 1.2294, + "step": 76724 + }, + { + "epoch": 0.9590989774744368, + "grad_norm": 0.6747508645057678, + "learning_rate": 1.0174606791452102e-07, + "loss": 0.0854, + "step": 76726 + }, + { + "epoch": 0.9591239780994525, + "grad_norm": 0.13617749512195587, + "learning_rate": 1.0162193301718526e-07, + "loss": 0.4518, + "step": 76728 + }, + { + "epoch": 0.9591489787244681, + "grad_norm": 4.378571510314941, + "learning_rate": 1.0149787350449692e-07, + "loss": 2.2335, + "step": 76730 + }, + { + "epoch": 0.9591739793494838, + "grad_norm": 0.0005082833813503385, + "learning_rate": 1.0137388937740078e-07, + "loss": 0.13, + "step": 76732 + }, + { + "epoch": 0.9591989799744993, + "grad_norm": 5.109028339385986, + "learning_rate": 1.0124998063684055e-07, + "loss": 0.5885, + "step": 76734 + }, + { + "epoch": 0.959223980599515, + "grad_norm": 3.851270914077759, + "learning_rate": 1.0112614728376213e-07, + "loss": 0.9178, + "step": 76736 + }, + { + "epoch": 0.9592489812245306, + "grad_norm": 3.967937469482422, + "learning_rate": 1.0100238931910589e-07, + "loss": 0.5919, + "step": 76738 + }, + { + "epoch": 0.9592739818495463, + "grad_norm": 3.204683303833008, + "learning_rate": 1.008787067438155e-07, + "loss": 0.5315, + "step": 76740 + }, + { + "epoch": 0.9592989824745619, + "grad_norm": 2.7433032989501953, + "learning_rate": 1.0075509955883467e-07, + "loss": 0.4403, + "step": 76742 + }, + { + "epoch": 0.9593239830995774, + "grad_norm": 4.1211676597595215, + "learning_rate": 1.0063156776510153e-07, + "loss": 0.6116, + "step": 76744 + }, + { + "epoch": 0.9593489837245931, + "grad_norm": 1.0829999446868896, + "learning_rate": 1.0050811136355976e-07, + "loss": 0.0169, + "step": 76746 + }, + { + "epoch": 0.9593739843496087, + "grad_norm": 3.403374433517456, + "learning_rate": 1.003847303551475e-07, + "loss": 1.6861, + "step": 76748 + }, + { + "epoch": 0.9593989849746244, + "grad_norm": 7.344242095947266, + "learning_rate": 1.0026142474080513e-07, + "loss": 0.5248, + "step": 76750 + }, + { + "epoch": 0.95942398559964, + "grad_norm": 1.9126992225646973, + "learning_rate": 1.0013819452147189e-07, + "loss": 0.9477, + "step": 76752 + }, + { + "epoch": 0.9594489862246556, + "grad_norm": 1.9662840366363525, + "learning_rate": 1.000150396980859e-07, + "loss": 0.4099, + "step": 76754 + }, + { + "epoch": 0.9594739868496712, + "grad_norm": 3.1751766204833984, + "learning_rate": 9.989196027158643e-08, + "loss": 0.9976, + "step": 76756 + }, + { + "epoch": 0.9594989874746869, + "grad_norm": 4.393209934234619, + "learning_rate": 9.976895624290828e-08, + "loss": 1.0965, + "step": 76758 + }, + { + "epoch": 0.9595239880997025, + "grad_norm": 2.5562474727630615, + "learning_rate": 9.96460276129918e-08, + "loss": 1.5503, + "step": 76760 + }, + { + "epoch": 0.9595489887247182, + "grad_norm": 3.0642950534820557, + "learning_rate": 9.952317438276848e-08, + "loss": 0.1779, + "step": 76762 + }, + { + "epoch": 0.9595739893497337, + "grad_norm": 4.199329853057861, + "learning_rate": 9.940039655317869e-08, + "loss": 1.7143, + "step": 76764 + }, + { + "epoch": 0.9595989899747494, + "grad_norm": 3.5909175872802734, + "learning_rate": 9.927769412515498e-08, + "loss": 1.0204, + "step": 76766 + }, + { + "epoch": 0.959623990599765, + "grad_norm": 3.3149049282073975, + "learning_rate": 9.915506709963108e-08, + "loss": 1.3891, + "step": 76768 + }, + { + "epoch": 0.9596489912247806, + "grad_norm": 2.0164990425109863, + "learning_rate": 9.9032515477544e-08, + "loss": 0.2983, + "step": 76770 + }, + { + "epoch": 0.9596739918497963, + "grad_norm": 4.197305202484131, + "learning_rate": 9.891003925982411e-08, + "loss": 1.5554, + "step": 76772 + }, + { + "epoch": 0.9596989924748118, + "grad_norm": 6.7601213455200195, + "learning_rate": 9.878763844740513e-08, + "loss": 1.5279, + "step": 76774 + }, + { + "epoch": 0.9597239930998275, + "grad_norm": 7.320686340332031, + "learning_rate": 9.866531304121963e-08, + "loss": 1.0059, + "step": 76776 + }, + { + "epoch": 0.9597489937248431, + "grad_norm": 5.628317832946777, + "learning_rate": 9.854306304219797e-08, + "loss": 1.0479, + "step": 76778 + }, + { + "epoch": 0.9597739943498588, + "grad_norm": 0.0008378899074159563, + "learning_rate": 9.842088845127384e-08, + "loss": 0.304, + "step": 76780 + }, + { + "epoch": 0.9597989949748744, + "grad_norm": 2.4241580963134766, + "learning_rate": 9.829878926937542e-08, + "loss": 1.0708, + "step": 76782 + }, + { + "epoch": 0.95982399559989, + "grad_norm": 7.352358341217041, + "learning_rate": 9.817676549743304e-08, + "loss": 1.6302, + "step": 76784 + }, + { + "epoch": 0.9598489962249056, + "grad_norm": 2.1599507331848145, + "learning_rate": 9.8054817136376e-08, + "loss": 1.4263, + "step": 76786 + }, + { + "epoch": 0.9598739968499213, + "grad_norm": 0.000322352017974481, + "learning_rate": 9.793294418713351e-08, + "loss": 0.7394, + "step": 76788 + }, + { + "epoch": 0.9598989974749369, + "grad_norm": 3.3142964839935303, + "learning_rate": 9.781114665063262e-08, + "loss": 0.5904, + "step": 76790 + }, + { + "epoch": 0.9599239980999525, + "grad_norm": 3.7652781009674072, + "learning_rate": 9.76894245278015e-08, + "loss": 0.7956, + "step": 76792 + }, + { + "epoch": 0.9599489987249681, + "grad_norm": 9.817330360412598, + "learning_rate": 9.756777781956828e-08, + "loss": 1.1158, + "step": 76794 + }, + { + "epoch": 0.9599739993499837, + "grad_norm": 0.0004083981621079147, + "learning_rate": 9.744620652685888e-08, + "loss": 0.312, + "step": 76796 + }, + { + "epoch": 0.9599989999749994, + "grad_norm": 4.059810638427734, + "learning_rate": 9.732471065059923e-08, + "loss": 0.7887, + "step": 76798 + }, + { + "epoch": 0.960024000600015, + "grad_norm": 3.6896584033966064, + "learning_rate": 9.720329019171304e-08, + "loss": 1.7444, + "step": 76800 + }, + { + "epoch": 0.9600490012250307, + "grad_norm": 0.32002851366996765, + "learning_rate": 9.708194515112624e-08, + "loss": 0.8545, + "step": 76802 + }, + { + "epoch": 0.9600740018500462, + "grad_norm": 2.2797658443450928, + "learning_rate": 9.696067552976363e-08, + "loss": 0.148, + "step": 76804 + }, + { + "epoch": 0.9600990024750619, + "grad_norm": 3.427558422088623, + "learning_rate": 9.68394813285467e-08, + "loss": 0.8111, + "step": 76806 + }, + { + "epoch": 0.9601240031000775, + "grad_norm": 3.202707529067993, + "learning_rate": 9.67183625484025e-08, + "loss": 0.7301, + "step": 76808 + }, + { + "epoch": 0.9601490037250932, + "grad_norm": 7.129068851470947, + "learning_rate": 9.659731919024807e-08, + "loss": 1.056, + "step": 76810 + }, + { + "epoch": 0.9601740043501088, + "grad_norm": 6.406385898590088, + "learning_rate": 9.647635125500931e-08, + "loss": 1.3565, + "step": 76812 + }, + { + "epoch": 0.9601990049751243, + "grad_norm": 3.6140990257263184, + "learning_rate": 9.635545874360664e-08, + "loss": 0.9101, + "step": 76814 + }, + { + "epoch": 0.96022400560014, + "grad_norm": 3.5286455154418945, + "learning_rate": 9.623464165695928e-08, + "loss": 0.9019, + "step": 76816 + }, + { + "epoch": 0.9602490062251556, + "grad_norm": 3.279986619949341, + "learning_rate": 9.611389999598875e-08, + "loss": 0.6885, + "step": 76818 + }, + { + "epoch": 0.9602740068501713, + "grad_norm": 6.104739665985107, + "learning_rate": 9.599323376161318e-08, + "loss": 1.3997, + "step": 76820 + }, + { + "epoch": 0.9602990074751869, + "grad_norm": 0.00041958733345381916, + "learning_rate": 9.587264295475518e-08, + "loss": 0.7352, + "step": 76822 + }, + { + "epoch": 0.9603240081002025, + "grad_norm": 2.5941317081451416, + "learning_rate": 9.575212757632846e-08, + "loss": 0.4744, + "step": 76824 + }, + { + "epoch": 0.9603490087252181, + "grad_norm": 5.080502033233643, + "learning_rate": 9.563168762725338e-08, + "loss": 2.342, + "step": 76826 + }, + { + "epoch": 0.9603740093502338, + "grad_norm": 2.5717947483062744, + "learning_rate": 9.551132310844701e-08, + "loss": 0.2136, + "step": 76828 + }, + { + "epoch": 0.9603990099752494, + "grad_norm": 3.0771265029907227, + "learning_rate": 9.539103402082529e-08, + "loss": 2.7022, + "step": 76830 + }, + { + "epoch": 0.960424010600265, + "grad_norm": 0.007707738783210516, + "learning_rate": 9.527082036530522e-08, + "loss": 0.0082, + "step": 76832 + }, + { + "epoch": 0.9604490112252806, + "grad_norm": 3.1987080574035645, + "learning_rate": 9.515068214280277e-08, + "loss": 0.6389, + "step": 76834 + }, + { + "epoch": 0.9604740118502962, + "grad_norm": 0.46530306339263916, + "learning_rate": 9.503061935423052e-08, + "loss": 0.5428, + "step": 76836 + }, + { + "epoch": 0.9604990124753119, + "grad_norm": 0.00026778504252433777, + "learning_rate": 9.491063200050554e-08, + "loss": 0.4262, + "step": 76838 + }, + { + "epoch": 0.9605240131003275, + "grad_norm": 0.00029874881147406995, + "learning_rate": 9.47907200825393e-08, + "loss": 0.0553, + "step": 76840 + }, + { + "epoch": 0.9605490137253432, + "grad_norm": 1.2593209743499756, + "learning_rate": 9.467088360124666e-08, + "loss": 0.5566, + "step": 76842 + }, + { + "epoch": 0.9605740143503587, + "grad_norm": 4.996445178985596, + "learning_rate": 9.455112255754017e-08, + "loss": 0.3861, + "step": 76844 + }, + { + "epoch": 0.9605990149753744, + "grad_norm": 2.639765739440918, + "learning_rate": 9.443143695233137e-08, + "loss": 1.3147, + "step": 76846 + }, + { + "epoch": 0.96062401560039, + "grad_norm": 5.993569374084473, + "learning_rate": 9.431182678653283e-08, + "loss": 0.7756, + "step": 76848 + }, + { + "epoch": 0.9606490162254057, + "grad_norm": 6.543120861053467, + "learning_rate": 9.419229206105273e-08, + "loss": 0.984, + "step": 76850 + }, + { + "epoch": 0.9606740168504213, + "grad_norm": 2.557692766189575, + "learning_rate": 9.40728327768048e-08, + "loss": 1.5757, + "step": 76852 + }, + { + "epoch": 0.9606990174754368, + "grad_norm": 5.117186546325684, + "learning_rate": 9.395344893469716e-08, + "loss": 1.2704, + "step": 76854 + }, + { + "epoch": 0.9607240181004525, + "grad_norm": 3.3212344646453857, + "learning_rate": 9.383414053563911e-08, + "loss": 1.0937, + "step": 76856 + }, + { + "epoch": 0.9607490187254681, + "grad_norm": 3.2694954872131348, + "learning_rate": 9.37149075805388e-08, + "loss": 0.843, + "step": 76858 + }, + { + "epoch": 0.9607740193504838, + "grad_norm": 2.1044726371765137, + "learning_rate": 9.359575007030664e-08, + "loss": 0.7131, + "step": 76860 + }, + { + "epoch": 0.9607990199754994, + "grad_norm": 0.006142210215330124, + "learning_rate": 9.347666800584631e-08, + "loss": 1.4735, + "step": 76862 + }, + { + "epoch": 0.960824020600515, + "grad_norm": 2.0682711601257324, + "learning_rate": 9.335766138806712e-08, + "loss": 1.1118, + "step": 76864 + }, + { + "epoch": 0.9608490212255306, + "grad_norm": 3.4481587409973145, + "learning_rate": 9.323873021787611e-08, + "loss": 0.9005, + "step": 76866 + }, + { + "epoch": 0.9608740218505463, + "grad_norm": 2.4414138793945312, + "learning_rate": 9.3119874496177e-08, + "loss": 0.4885, + "step": 76868 + }, + { + "epoch": 0.9608990224755619, + "grad_norm": 2.535062551498413, + "learning_rate": 9.300109422387681e-08, + "loss": 0.5657, + "step": 76870 + }, + { + "epoch": 0.9609240231005776, + "grad_norm": 3.1221768856048584, + "learning_rate": 9.288238940187822e-08, + "loss": 1.8315, + "step": 76872 + }, + { + "epoch": 0.9609490237255931, + "grad_norm": 3.622748613357544, + "learning_rate": 9.276376003108823e-08, + "loss": 1.0901, + "step": 76874 + }, + { + "epoch": 0.9609740243506087, + "grad_norm": 8.770959854125977, + "learning_rate": 9.264520611240613e-08, + "loss": 1.1311, + "step": 76876 + }, + { + "epoch": 0.9609990249756244, + "grad_norm": 2.947420358657837, + "learning_rate": 9.252672764673787e-08, + "loss": 0.6073, + "step": 76878 + }, + { + "epoch": 0.96102402560064, + "grad_norm": 3.6272828578948975, + "learning_rate": 9.240832463498606e-08, + "loss": 0.6152, + "step": 76880 + }, + { + "epoch": 0.9610490262256557, + "grad_norm": 6.616579055786133, + "learning_rate": 9.228999707804997e-08, + "loss": 0.6412, + "step": 76882 + }, + { + "epoch": 0.9610740268506712, + "grad_norm": 3.21795916557312, + "learning_rate": 9.217174497683335e-08, + "loss": 0.844, + "step": 76884 + }, + { + "epoch": 0.9610990274756869, + "grad_norm": 3.5829827785491943, + "learning_rate": 9.205356833223544e-08, + "loss": 1.0723, + "step": 76886 + }, + { + "epoch": 0.9611240281007025, + "grad_norm": 8.00195598602295, + "learning_rate": 9.193546714515556e-08, + "loss": 1.1243, + "step": 76888 + }, + { + "epoch": 0.9611490287257182, + "grad_norm": 5.005953788757324, + "learning_rate": 9.181744141649519e-08, + "loss": 1.6897, + "step": 76890 + }, + { + "epoch": 0.9611740293507338, + "grad_norm": 2.768984794616699, + "learning_rate": 9.169949114715138e-08, + "loss": 0.9025, + "step": 76892 + }, + { + "epoch": 0.9611990299757494, + "grad_norm": 0.000302242289762944, + "learning_rate": 9.158161633802343e-08, + "loss": 0.2371, + "step": 76894 + }, + { + "epoch": 0.961224030600765, + "grad_norm": 3.4765915870666504, + "learning_rate": 9.146381699000728e-08, + "loss": 0.4218, + "step": 76896 + }, + { + "epoch": 0.9612490312257806, + "grad_norm": 0.00030931542278267443, + "learning_rate": 9.134609310400333e-08, + "loss": 0.0254, + "step": 76898 + }, + { + "epoch": 0.9612740318507963, + "grad_norm": 6.073913097381592, + "learning_rate": 9.122844468090641e-08, + "loss": 1.0292, + "step": 76900 + }, + { + "epoch": 0.9612990324758119, + "grad_norm": 0.0037117076572030783, + "learning_rate": 9.111087172161137e-08, + "loss": 0.0402, + "step": 76902 + }, + { + "epoch": 0.9613240331008275, + "grad_norm": 4.394243240356445, + "learning_rate": 9.099337422701527e-08, + "loss": 1.6361, + "step": 76904 + }, + { + "epoch": 0.9613490337258431, + "grad_norm": 5.106683731079102, + "learning_rate": 9.087595219801182e-08, + "loss": 1.3601, + "step": 76906 + }, + { + "epoch": 0.9613740343508588, + "grad_norm": 2.076587200164795, + "learning_rate": 9.075860563549587e-08, + "loss": 0.97, + "step": 76908 + }, + { + "epoch": 0.9613990349758744, + "grad_norm": 3.594392776489258, + "learning_rate": 9.064133454036117e-08, + "loss": 0.8068, + "step": 76910 + }, + { + "epoch": 0.9614240356008901, + "grad_norm": 5.226965427398682, + "learning_rate": 9.052413891350143e-08, + "loss": 0.5984, + "step": 76912 + }, + { + "epoch": 0.9614490362259056, + "grad_norm": 4.441343307495117, + "learning_rate": 9.040701875580816e-08, + "loss": 0.1641, + "step": 76914 + }, + { + "epoch": 0.9614740368509213, + "grad_norm": 1.8542605638504028, + "learning_rate": 9.028997406817286e-08, + "loss": 1.0296, + "step": 76916 + }, + { + "epoch": 0.9614990374759369, + "grad_norm": 4.021084308624268, + "learning_rate": 9.017300485148928e-08, + "loss": 1.0307, + "step": 76918 + }, + { + "epoch": 0.9615240381009525, + "grad_norm": 0.9013424515724182, + "learning_rate": 9.005611110664558e-08, + "loss": 0.0469, + "step": 76920 + }, + { + "epoch": 0.9615490387259682, + "grad_norm": 1.7775262594223022, + "learning_rate": 8.993929283453329e-08, + "loss": 0.0636, + "step": 76922 + }, + { + "epoch": 0.9615740393509837, + "grad_norm": 0.00045371026499196887, + "learning_rate": 8.982255003604168e-08, + "loss": 0.4795, + "step": 76924 + }, + { + "epoch": 0.9615990399759994, + "grad_norm": 2.269655466079712, + "learning_rate": 8.970588271206115e-08, + "loss": 0.6985, + "step": 76926 + }, + { + "epoch": 0.961624040601015, + "grad_norm": 4.366917133331299, + "learning_rate": 8.958929086347878e-08, + "loss": 1.5791, + "step": 76928 + }, + { + "epoch": 0.9616490412260307, + "grad_norm": 1.6351423263549805, + "learning_rate": 8.947277449118163e-08, + "loss": 0.0507, + "step": 76930 + }, + { + "epoch": 0.9616740418510463, + "grad_norm": 2.3396189212799072, + "learning_rate": 8.93563335960601e-08, + "loss": 1.1727, + "step": 76932 + }, + { + "epoch": 0.9616990424760619, + "grad_norm": 1.1458580493927002, + "learning_rate": 8.923996817899794e-08, + "loss": 0.599, + "step": 76934 + }, + { + "epoch": 0.9617240431010775, + "grad_norm": 2.860644578933716, + "learning_rate": 8.912367824088441e-08, + "loss": 1.476, + "step": 76936 + }, + { + "epoch": 0.9617490437260932, + "grad_norm": 4.061940670013428, + "learning_rate": 8.900746378260216e-08, + "loss": 0.966, + "step": 76938 + }, + { + "epoch": 0.9617740443511088, + "grad_norm": 4.875743865966797, + "learning_rate": 8.889132480503716e-08, + "loss": 0.9126, + "step": 76940 + }, + { + "epoch": 0.9617990449761245, + "grad_norm": 3.3654634952545166, + "learning_rate": 8.877526130907533e-08, + "loss": 0.5863, + "step": 76942 + }, + { + "epoch": 0.96182404560114, + "grad_norm": 2.7238640785217285, + "learning_rate": 8.86592732955982e-08, + "loss": 0.8229, + "step": 76944 + }, + { + "epoch": 0.9618490462261556, + "grad_norm": 1.8068965673446655, + "learning_rate": 8.854336076549174e-08, + "loss": 0.2595, + "step": 76946 + }, + { + "epoch": 0.9618740468511713, + "grad_norm": 3.332174062728882, + "learning_rate": 8.842752371963637e-08, + "loss": 1.6036, + "step": 76948 + }, + { + "epoch": 0.9618990474761869, + "grad_norm": 8.050803184509277, + "learning_rate": 8.83117621589158e-08, + "loss": 2.5291, + "step": 76950 + }, + { + "epoch": 0.9619240481012026, + "grad_norm": 9.082014083862305, + "learning_rate": 8.819607608421265e-08, + "loss": 1.2031, + "step": 76952 + }, + { + "epoch": 0.9619490487262181, + "grad_norm": 0.5893180966377258, + "learning_rate": 8.808046549640403e-08, + "loss": 1.3801, + "step": 76954 + }, + { + "epoch": 0.9619740493512338, + "grad_norm": 0.0001785138447303325, + "learning_rate": 8.796493039637478e-08, + "loss": 0.5664, + "step": 76956 + }, + { + "epoch": 0.9619990499762494, + "grad_norm": 0.009515171870589256, + "learning_rate": 8.784947078500195e-08, + "loss": 0.6723, + "step": 76958 + }, + { + "epoch": 0.9620240506012651, + "grad_norm": 2.643275022506714, + "learning_rate": 8.773408666316596e-08, + "loss": 0.9909, + "step": 76960 + }, + { + "epoch": 0.9620490512262807, + "grad_norm": 3.4817910194396973, + "learning_rate": 8.761877803174501e-08, + "loss": 1.3077, + "step": 76962 + }, + { + "epoch": 0.9620740518512962, + "grad_norm": 10.690011024475098, + "learning_rate": 8.750354489161949e-08, + "loss": 1.6338, + "step": 76964 + }, + { + "epoch": 0.9620990524763119, + "grad_norm": 1.6880152225494385, + "learning_rate": 8.738838724366428e-08, + "loss": 0.4231, + "step": 76966 + }, + { + "epoch": 0.9621240531013275, + "grad_norm": 3.354764699935913, + "learning_rate": 8.727330508875642e-08, + "loss": 0.7941, + "step": 76968 + }, + { + "epoch": 0.9621490537263432, + "grad_norm": 2.9470431804656982, + "learning_rate": 8.715829842777413e-08, + "loss": 1.0325, + "step": 76970 + }, + { + "epoch": 0.9621740543513588, + "grad_norm": 9.034883499145508, + "learning_rate": 8.704336726159224e-08, + "loss": 0.9265, + "step": 76972 + }, + { + "epoch": 0.9621990549763744, + "grad_norm": 0.0005612008390016854, + "learning_rate": 8.692851159108673e-08, + "loss": 0.4116, + "step": 76974 + }, + { + "epoch": 0.96222405560139, + "grad_norm": 0.008497381582856178, + "learning_rate": 8.681373141713023e-08, + "loss": 0.0001, + "step": 76976 + }, + { + "epoch": 0.9622490562264057, + "grad_norm": 3.6415750980377197, + "learning_rate": 8.669902674060093e-08, + "loss": 1.3797, + "step": 76978 + }, + { + "epoch": 0.9622740568514213, + "grad_norm": 3.671339988708496, + "learning_rate": 8.658439756236814e-08, + "loss": 1.0872, + "step": 76980 + }, + { + "epoch": 0.962299057476437, + "grad_norm": 2.653850555419922, + "learning_rate": 8.64698438833067e-08, + "loss": 0.4901, + "step": 76982 + }, + { + "epoch": 0.9623240581014525, + "grad_norm": 3.254808187484741, + "learning_rate": 8.635536570429037e-08, + "loss": 0.982, + "step": 76984 + }, + { + "epoch": 0.9623490587264681, + "grad_norm": 0.00027629092801362276, + "learning_rate": 8.624096302618845e-08, + "loss": 0.6827, + "step": 76986 + }, + { + "epoch": 0.9623740593514838, + "grad_norm": 1.163127064704895, + "learning_rate": 8.612663584987469e-08, + "loss": 0.0324, + "step": 76988 + }, + { + "epoch": 0.9623990599764994, + "grad_norm": 3.7480218410491943, + "learning_rate": 8.601238417621727e-08, + "loss": 0.9377, + "step": 76990 + }, + { + "epoch": 0.9624240606015151, + "grad_norm": 2.047262191772461, + "learning_rate": 8.589820800608772e-08, + "loss": 1.1875, + "step": 76992 + }, + { + "epoch": 0.9624490612265306, + "grad_norm": 2.1122047901153564, + "learning_rate": 8.578410734035647e-08, + "loss": 0.5757, + "step": 76994 + }, + { + "epoch": 0.9624740618515463, + "grad_norm": 2.774357318878174, + "learning_rate": 8.567008217988947e-08, + "loss": 1.2115, + "step": 76996 + }, + { + "epoch": 0.9624990624765619, + "grad_norm": 2.947049617767334, + "learning_rate": 8.555613252555938e-08, + "loss": 0.8801, + "step": 76998 + }, + { + "epoch": 0.9625240631015776, + "grad_norm": 3.355140209197998, + "learning_rate": 8.544225837822995e-08, + "loss": 0.7857, + "step": 77000 + }, + { + "epoch": 0.9625490637265932, + "grad_norm": 3.639664649963379, + "learning_rate": 8.532845973877158e-08, + "loss": 0.8324, + "step": 77002 + }, + { + "epoch": 0.9625740643516087, + "grad_norm": 3.3223824501037598, + "learning_rate": 8.521473660804913e-08, + "loss": 0.6285, + "step": 77004 + }, + { + "epoch": 0.9625990649766244, + "grad_norm": 0.05591164156794548, + "learning_rate": 8.51010889869286e-08, + "loss": 0.0673, + "step": 77006 + }, + { + "epoch": 0.96262406560164, + "grad_norm": 3.9490582942962646, + "learning_rate": 8.498751687627705e-08, + "loss": 0.4929, + "step": 77008 + }, + { + "epoch": 0.9626490662266557, + "grad_norm": 4.054315567016602, + "learning_rate": 8.487402027695713e-08, + "loss": 1.6344, + "step": 77010 + }, + { + "epoch": 0.9626740668516713, + "grad_norm": 0.0003043512115254998, + "learning_rate": 8.476059918983593e-08, + "loss": 0.1023, + "step": 77012 + }, + { + "epoch": 0.9626990674766869, + "grad_norm": 2.5329673290252686, + "learning_rate": 8.464725361577497e-08, + "loss": 0.678, + "step": 77014 + }, + { + "epoch": 0.9627240681017025, + "grad_norm": 7.3814005851745605, + "learning_rate": 8.453398355563913e-08, + "loss": 1.3909, + "step": 77016 + }, + { + "epoch": 0.9627490687267182, + "grad_norm": 4.565813064575195, + "learning_rate": 8.442078901028994e-08, + "loss": 0.34, + "step": 77018 + }, + { + "epoch": 0.9627740693517338, + "grad_norm": 0.7856403589248657, + "learning_rate": 8.430766998059004e-08, + "loss": 0.7442, + "step": 77020 + }, + { + "epoch": 0.9627990699767495, + "grad_norm": 6.487961769104004, + "learning_rate": 8.419462646739985e-08, + "loss": 0.5551, + "step": 77022 + }, + { + "epoch": 0.962824070601765, + "grad_norm": 4.010485649108887, + "learning_rate": 8.408165847158201e-08, + "loss": 1.8647, + "step": 77024 + }, + { + "epoch": 0.9628490712267807, + "grad_norm": 4.734721660614014, + "learning_rate": 8.396876599399583e-08, + "loss": 1.4492, + "step": 77026 + }, + { + "epoch": 0.9628740718517963, + "grad_norm": 2.6821377277374268, + "learning_rate": 8.385594903550065e-08, + "loss": 1.7286, + "step": 77028 + }, + { + "epoch": 0.962899072476812, + "grad_norm": 2.932342290878296, + "learning_rate": 8.374320759695687e-08, + "loss": 0.3244, + "step": 77030 + }, + { + "epoch": 0.9629240731018276, + "grad_norm": 8.093620300292969, + "learning_rate": 8.363054167922269e-08, + "loss": 2.1252, + "step": 77032 + }, + { + "epoch": 0.9629490737268431, + "grad_norm": 3.203930616378784, + "learning_rate": 8.351795128315631e-08, + "loss": 1.0316, + "step": 77034 + }, + { + "epoch": 0.9629740743518588, + "grad_norm": 7.194113254547119, + "learning_rate": 8.340543640961485e-08, + "loss": 1.4562, + "step": 77036 + }, + { + "epoch": 0.9629990749768744, + "grad_norm": 3.4276583194732666, + "learning_rate": 8.329299705945427e-08, + "loss": 1.5057, + "step": 77038 + }, + { + "epoch": 0.9630240756018901, + "grad_norm": 0.0004786312929354608, + "learning_rate": 8.318063323353275e-08, + "loss": 0.0038, + "step": 77040 + }, + { + "epoch": 0.9630490762269057, + "grad_norm": 4.144120693206787, + "learning_rate": 8.30683449327041e-08, + "loss": 1.4187, + "step": 77042 + }, + { + "epoch": 0.9630740768519213, + "grad_norm": 0.0007330308435484767, + "learning_rate": 8.295613215782538e-08, + "loss": 0.5279, + "step": 77044 + }, + { + "epoch": 0.9630990774769369, + "grad_norm": 2.528134822845459, + "learning_rate": 8.284399490975037e-08, + "loss": 1.1136, + "step": 77046 + }, + { + "epoch": 0.9631240781019526, + "grad_norm": 3.24572491645813, + "learning_rate": 8.273193318933281e-08, + "loss": 1.2967, + "step": 77048 + }, + { + "epoch": 0.9631490787269682, + "grad_norm": 3.1071722507476807, + "learning_rate": 8.261994699742759e-08, + "loss": 2.0462, + "step": 77050 + }, + { + "epoch": 0.9631740793519838, + "grad_norm": 2.7010035514831543, + "learning_rate": 8.250803633488403e-08, + "loss": 1.2482, + "step": 77052 + }, + { + "epoch": 0.9631990799769994, + "grad_norm": 0.00043402245501056314, + "learning_rate": 8.23962012025592e-08, + "loss": 0.5785, + "step": 77054 + }, + { + "epoch": 0.963224080602015, + "grad_norm": 7.790963649749756, + "learning_rate": 8.228444160130134e-08, + "loss": 2.553, + "step": 77056 + }, + { + "epoch": 0.9632490812270307, + "grad_norm": 4.828174591064453, + "learning_rate": 8.217275753196197e-08, + "loss": 0.6364, + "step": 77058 + }, + { + "epoch": 0.9632740818520463, + "grad_norm": 5.6448974609375, + "learning_rate": 8.206114899539375e-08, + "loss": 0.8161, + "step": 77060 + }, + { + "epoch": 0.963299082477062, + "grad_norm": 2.4740161895751953, + "learning_rate": 8.194961599244489e-08, + "loss": 1.1199, + "step": 77062 + }, + { + "epoch": 0.9633240831020775, + "grad_norm": 3.169163703918457, + "learning_rate": 8.183815852396471e-08, + "loss": 1.4257, + "step": 77064 + }, + { + "epoch": 0.9633490837270932, + "grad_norm": 0.000298298749839887, + "learning_rate": 8.172677659080253e-08, + "loss": 0.0118, + "step": 77066 + }, + { + "epoch": 0.9633740843521088, + "grad_norm": 2.958400249481201, + "learning_rate": 8.161547019380767e-08, + "loss": 0.4968, + "step": 77068 + }, + { + "epoch": 0.9633990849771245, + "grad_norm": 3.445265531539917, + "learning_rate": 8.150423933382612e-08, + "loss": 0.4144, + "step": 77070 + }, + { + "epoch": 0.9634240856021401, + "grad_norm": 3.260629177093506, + "learning_rate": 8.139308401170498e-08, + "loss": 1.1889, + "step": 77072 + }, + { + "epoch": 0.9634490862271556, + "grad_norm": 0.45138683915138245, + "learning_rate": 8.128200422829247e-08, + "loss": 0.018, + "step": 77074 + }, + { + "epoch": 0.9634740868521713, + "grad_norm": 2.3234119415283203, + "learning_rate": 8.117099998443345e-08, + "loss": 0.3193, + "step": 77076 + }, + { + "epoch": 0.9634990874771869, + "grad_norm": 3.996091604232788, + "learning_rate": 8.106007128097392e-08, + "loss": 0.6326, + "step": 77078 + }, + { + "epoch": 0.9635240881022026, + "grad_norm": 5.3878631591796875, + "learning_rate": 8.094921811875656e-08, + "loss": 0.6491, + "step": 77080 + }, + { + "epoch": 0.9635490887272182, + "grad_norm": 4.210326671600342, + "learning_rate": 8.083844049862844e-08, + "loss": 1.7741, + "step": 77082 + }, + { + "epoch": 0.9635740893522338, + "grad_norm": 4.712044715881348, + "learning_rate": 8.072773842143223e-08, + "loss": 1.3525, + "step": 77084 + }, + { + "epoch": 0.9635990899772494, + "grad_norm": 3.5110201835632324, + "learning_rate": 8.061711188801058e-08, + "loss": 1.0556, + "step": 77086 + }, + { + "epoch": 0.9636240906022651, + "grad_norm": 6.328485012054443, + "learning_rate": 8.050656089920617e-08, + "loss": 1.9164, + "step": 77088 + }, + { + "epoch": 0.9636490912272807, + "grad_norm": 2.9088633060455322, + "learning_rate": 8.039608545586164e-08, + "loss": 0.6249, + "step": 77090 + }, + { + "epoch": 0.9636740918522964, + "grad_norm": 4.157039165496826, + "learning_rate": 8.028568555881744e-08, + "loss": 0.2775, + "step": 77092 + }, + { + "epoch": 0.9636990924773119, + "grad_norm": 3.052647590637207, + "learning_rate": 8.017536120891401e-08, + "loss": 0.8496, + "step": 77094 + }, + { + "epoch": 0.9637240931023275, + "grad_norm": 3.513170003890991, + "learning_rate": 8.006511240699288e-08, + "loss": 1.4907, + "step": 77096 + }, + { + "epoch": 0.9637490937273432, + "grad_norm": 18.528942108154297, + "learning_rate": 7.995493915389341e-08, + "loss": 1.0556, + "step": 77098 + }, + { + "epoch": 0.9637740943523588, + "grad_norm": 3.974860191345215, + "learning_rate": 7.984484145045379e-08, + "loss": 1.0428, + "step": 77100 + }, + { + "epoch": 0.9637990949773745, + "grad_norm": 2.634293794631958, + "learning_rate": 7.973481929751226e-08, + "loss": 0.2984, + "step": 77102 + }, + { + "epoch": 0.96382409560239, + "grad_norm": 2.904086112976074, + "learning_rate": 7.962487269590813e-08, + "loss": 0.5985, + "step": 77104 + }, + { + "epoch": 0.9638490962274057, + "grad_norm": 3.6749916076660156, + "learning_rate": 7.951500164647852e-08, + "loss": 0.7411, + "step": 77106 + }, + { + "epoch": 0.9638740968524213, + "grad_norm": 3.259615182876587, + "learning_rate": 7.940520615006054e-08, + "loss": 1.833, + "step": 77108 + }, + { + "epoch": 0.963899097477437, + "grad_norm": 3.2872893810272217, + "learning_rate": 7.929548620748795e-08, + "loss": 0.3764, + "step": 77110 + }, + { + "epoch": 0.9639240981024526, + "grad_norm": 7.678630352020264, + "learning_rate": 7.918584181959899e-08, + "loss": 1.9269, + "step": 77112 + }, + { + "epoch": 0.9639490987274681, + "grad_norm": 3.865720510482788, + "learning_rate": 7.90762729872263e-08, + "loss": 1.0715, + "step": 77114 + }, + { + "epoch": 0.9639740993524838, + "grad_norm": 2.836139678955078, + "learning_rate": 7.896677971120703e-08, + "loss": 1.183, + "step": 77116 + }, + { + "epoch": 0.9639990999774994, + "grad_norm": 8.707124710083008, + "learning_rate": 7.885736199237381e-08, + "loss": 0.3773, + "step": 77118 + }, + { + "epoch": 0.9640241006025151, + "grad_norm": 0.0002903503191191703, + "learning_rate": 7.874801983155932e-08, + "loss": 0.8159, + "step": 77120 + }, + { + "epoch": 0.9640491012275307, + "grad_norm": 2.290870428085327, + "learning_rate": 7.863875322959847e-08, + "loss": 1.2419, + "step": 77122 + }, + { + "epoch": 0.9640741018525463, + "grad_norm": 3.5530691146850586, + "learning_rate": 7.852956218732056e-08, + "loss": 0.7316, + "step": 77124 + }, + { + "epoch": 0.9640991024775619, + "grad_norm": 3.113720655441284, + "learning_rate": 7.842044670555826e-08, + "loss": 0.6196, + "step": 77126 + }, + { + "epoch": 0.9641241031025776, + "grad_norm": 2.9975438117980957, + "learning_rate": 7.831140678514316e-08, + "loss": 0.1533, + "step": 77128 + }, + { + "epoch": 0.9641491037275932, + "grad_norm": 3.812655448913574, + "learning_rate": 7.820244242690567e-08, + "loss": 1.2168, + "step": 77130 + }, + { + "epoch": 0.9641741043526089, + "grad_norm": 4.728493690490723, + "learning_rate": 7.809355363167514e-08, + "loss": 1.473, + "step": 77132 + }, + { + "epoch": 0.9641991049776244, + "grad_norm": 1.9605374336242676, + "learning_rate": 7.798474040028092e-08, + "loss": 1.6433, + "step": 77134 + }, + { + "epoch": 0.96422410560264, + "grad_norm": 3.2500569820404053, + "learning_rate": 7.787600273355234e-08, + "loss": 1.3563, + "step": 77136 + }, + { + "epoch": 0.9642491062276557, + "grad_norm": 0.5579735040664673, + "learning_rate": 7.776734063231539e-08, + "loss": 0.0074, + "step": 77138 + }, + { + "epoch": 0.9642741068526713, + "grad_norm": 5.833561420440674, + "learning_rate": 7.765875409740053e-08, + "loss": 0.9007, + "step": 77140 + }, + { + "epoch": 0.964299107477687, + "grad_norm": 1.471470594406128, + "learning_rate": 7.755024312963377e-08, + "loss": 0.569, + "step": 77142 + }, + { + "epoch": 0.9643241081027025, + "grad_norm": 6.2693681716918945, + "learning_rate": 7.744180772984e-08, + "loss": 0.4266, + "step": 77144 + }, + { + "epoch": 0.9643491087277182, + "grad_norm": 3.6425135135650635, + "learning_rate": 7.733344789884745e-08, + "loss": 1.5095, + "step": 77146 + }, + { + "epoch": 0.9643741093527338, + "grad_norm": 0.000246138108195737, + "learning_rate": 7.72251636374799e-08, + "loss": 0.7441, + "step": 77148 + }, + { + "epoch": 0.9643991099777495, + "grad_norm": 3.6630303859710693, + "learning_rate": 7.711695494656224e-08, + "loss": 0.8372, + "step": 77150 + }, + { + "epoch": 0.9644241106027651, + "grad_norm": 3.35432505607605, + "learning_rate": 7.700882182691716e-08, + "loss": 1.2434, + "step": 77152 + }, + { + "epoch": 0.9644491112277807, + "grad_norm": 4.202193260192871, + "learning_rate": 7.690076427937177e-08, + "loss": 0.9362, + "step": 77154 + }, + { + "epoch": 0.9644741118527963, + "grad_norm": 2.6996660232543945, + "learning_rate": 7.67927823047454e-08, + "loss": 0.9105, + "step": 77156 + }, + { + "epoch": 0.964499112477812, + "grad_norm": 3.835319757461548, + "learning_rate": 7.668487590386298e-08, + "loss": 0.0543, + "step": 77158 + }, + { + "epoch": 0.9645241131028276, + "grad_norm": 3.4013261795043945, + "learning_rate": 7.657704507754493e-08, + "loss": 0.6292, + "step": 77160 + }, + { + "epoch": 0.9645491137278432, + "grad_norm": 3.160618543624878, + "learning_rate": 7.646928982661283e-08, + "loss": 1.0548, + "step": 77162 + }, + { + "epoch": 0.9645741143528588, + "grad_norm": 3.391430377960205, + "learning_rate": 7.636161015188715e-08, + "loss": 1.5409, + "step": 77164 + }, + { + "epoch": 0.9645991149778744, + "grad_norm": 3.534219980239868, + "learning_rate": 7.625400605418831e-08, + "loss": 0.9541, + "step": 77166 + }, + { + "epoch": 0.9646241156028901, + "grad_norm": 1.6478712558746338, + "learning_rate": 7.614647753433457e-08, + "loss": 0.9774, + "step": 77168 + }, + { + "epoch": 0.9646491162279057, + "grad_norm": 4.024774074554443, + "learning_rate": 7.603902459314749e-08, + "loss": 1.2535, + "step": 77170 + }, + { + "epoch": 0.9646741168529214, + "grad_norm": 8.75839900970459, + "learning_rate": 7.593164723144308e-08, + "loss": 2.0469, + "step": 77172 + }, + { + "epoch": 0.9646991174779369, + "grad_norm": 0.00021144318452570587, + "learning_rate": 7.582434545004069e-08, + "loss": 0.7444, + "step": 77174 + }, + { + "epoch": 0.9647241181029526, + "grad_norm": 3.311807155609131, + "learning_rate": 7.571711924975522e-08, + "loss": 1.1561, + "step": 77176 + }, + { + "epoch": 0.9647491187279682, + "grad_norm": 2.3834054470062256, + "learning_rate": 7.560996863140602e-08, + "loss": 0.2337, + "step": 77178 + }, + { + "epoch": 0.9647741193529838, + "grad_norm": 4.275136947631836, + "learning_rate": 7.550289359580687e-08, + "loss": 1.2339, + "step": 77180 + }, + { + "epoch": 0.9647991199779995, + "grad_norm": 3.071554183959961, + "learning_rate": 7.539589414377379e-08, + "loss": 0.748, + "step": 77182 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 3.6864757537841797, + "learning_rate": 7.528897027612392e-08, + "loss": 0.6207, + "step": 77184 + }, + { + "epoch": 0.9648491212280307, + "grad_norm": 2.585545539855957, + "learning_rate": 7.518212199366881e-08, + "loss": 1.1536, + "step": 77186 + }, + { + "epoch": 0.9648741218530463, + "grad_norm": 3.3380041122436523, + "learning_rate": 7.507534929722338e-08, + "loss": 0.603, + "step": 77188 + }, + { + "epoch": 0.964899122478062, + "grad_norm": 0.746802031993866, + "learning_rate": 7.49686521876003e-08, + "loss": 0.7705, + "step": 77190 + }, + { + "epoch": 0.9649241231030776, + "grad_norm": 4.336633205413818, + "learning_rate": 7.486203066561226e-08, + "loss": 1.6318, + "step": 77192 + }, + { + "epoch": 0.9649491237280932, + "grad_norm": 2.244168281555176, + "learning_rate": 7.475548473207195e-08, + "loss": 1.6732, + "step": 77194 + }, + { + "epoch": 0.9649741243531088, + "grad_norm": 3.643364429473877, + "learning_rate": 7.464901438778982e-08, + "loss": 0.8679, + "step": 77196 + }, + { + "epoch": 0.9649991249781245, + "grad_norm": 0.013032164424657822, + "learning_rate": 7.454261963357746e-08, + "loss": 0.3241, + "step": 77198 + }, + { + "epoch": 0.9650241256031401, + "grad_norm": 2.1833624839782715, + "learning_rate": 7.44363004702453e-08, + "loss": 0.3178, + "step": 77200 + }, + { + "epoch": 0.9650491262281558, + "grad_norm": 2.730485677719116, + "learning_rate": 7.433005689860273e-08, + "loss": 1.0052, + "step": 77202 + }, + { + "epoch": 0.9650741268531713, + "grad_norm": 0.00020735186990350485, + "learning_rate": 7.422388891945798e-08, + "loss": 0.0, + "step": 77204 + }, + { + "epoch": 0.9650991274781869, + "grad_norm": 0.0005084911826997995, + "learning_rate": 7.411779653362038e-08, + "loss": 0.0, + "step": 77206 + }, + { + "epoch": 0.9651241281032026, + "grad_norm": 2.2621877193450928, + "learning_rate": 7.401177974189822e-08, + "loss": 0.4876, + "step": 77208 + }, + { + "epoch": 0.9651491287282182, + "grad_norm": 3.2797601222991943, + "learning_rate": 7.39058385450997e-08, + "loss": 0.6693, + "step": 77210 + }, + { + "epoch": 0.9651741293532339, + "grad_norm": 3.61045241355896, + "learning_rate": 7.379997294402974e-08, + "loss": 1.3275, + "step": 77212 + }, + { + "epoch": 0.9651991299782494, + "grad_norm": 2.884831666946411, + "learning_rate": 7.369418293949548e-08, + "loss": 0.554, + "step": 77214 + }, + { + "epoch": 0.9652241306032651, + "grad_norm": 32.84169006347656, + "learning_rate": 7.358846853230295e-08, + "loss": 3.7207, + "step": 77216 + }, + { + "epoch": 0.9652491312282807, + "grad_norm": 0.6993057727813721, + "learning_rate": 7.348282972325704e-08, + "loss": 1.4151, + "step": 77218 + }, + { + "epoch": 0.9652741318532964, + "grad_norm": 0.12735190987586975, + "learning_rate": 7.337726651316156e-08, + "loss": 0.1086, + "step": 77220 + }, + { + "epoch": 0.965299132478312, + "grad_norm": 3.4074959754943848, + "learning_rate": 7.327177890282145e-08, + "loss": 1.0409, + "step": 77222 + }, + { + "epoch": 0.9653241331033275, + "grad_norm": 0.00037434365367516875, + "learning_rate": 7.316636689303936e-08, + "loss": 1.063, + "step": 77224 + }, + { + "epoch": 0.9653491337283432, + "grad_norm": 8.126232147216797, + "learning_rate": 7.30610304846191e-08, + "loss": 2.4727, + "step": 77226 + }, + { + "epoch": 0.9653741343533588, + "grad_norm": 8.358935356140137, + "learning_rate": 7.295576967836226e-08, + "loss": 1.0686, + "step": 77228 + }, + { + "epoch": 0.9653991349783745, + "grad_norm": 2.730572462081909, + "learning_rate": 7.285058447506932e-08, + "loss": 0.359, + "step": 77230 + }, + { + "epoch": 0.9654241356033901, + "grad_norm": 3.5661818981170654, + "learning_rate": 7.274547487554407e-08, + "loss": 0.9388, + "step": 77232 + }, + { + "epoch": 0.9654491362284057, + "grad_norm": 3.9915902614593506, + "learning_rate": 7.264044088058365e-08, + "loss": 0.8711, + "step": 77234 + }, + { + "epoch": 0.9654741368534213, + "grad_norm": 6.483860969543457, + "learning_rate": 7.253548249099074e-08, + "loss": 0.9265, + "step": 77236 + }, + { + "epoch": 0.965499137478437, + "grad_norm": 5.270255088806152, + "learning_rate": 7.243059970756472e-08, + "loss": 1.4513, + "step": 77238 + }, + { + "epoch": 0.9655241381034526, + "grad_norm": 6.059805870056152, + "learning_rate": 7.232579253110273e-08, + "loss": 2.1197, + "step": 77240 + }, + { + "epoch": 0.9655491387284683, + "grad_norm": 0.0008647648501209915, + "learning_rate": 7.2221060962403e-08, + "loss": 0.6209, + "step": 77242 + }, + { + "epoch": 0.9655741393534838, + "grad_norm": 0.0002810492005664855, + "learning_rate": 7.21164050022638e-08, + "loss": 0.8502, + "step": 77244 + }, + { + "epoch": 0.9655991399784994, + "grad_norm": 3.871035575866699, + "learning_rate": 7.201182465148227e-08, + "loss": 1.7004, + "step": 77246 + }, + { + "epoch": 0.9656241406035151, + "grad_norm": 1.3189358711242676, + "learning_rate": 7.190731991085442e-08, + "loss": 0.2958, + "step": 77248 + }, + { + "epoch": 0.9656491412285307, + "grad_norm": 3.315598487854004, + "learning_rate": 7.180289078117742e-08, + "loss": 0.6011, + "step": 77250 + }, + { + "epoch": 0.9656741418535464, + "grad_norm": 4.138558387756348, + "learning_rate": 7.169853726324506e-08, + "loss": 1.0092, + "step": 77252 + }, + { + "epoch": 0.9656991424785619, + "grad_norm": 4.308567523956299, + "learning_rate": 7.159425935785335e-08, + "loss": 0.1896, + "step": 77254 + }, + { + "epoch": 0.9657241431035776, + "grad_norm": 0.00021464518795255572, + "learning_rate": 7.149005706579393e-08, + "loss": 0.2517, + "step": 77256 + }, + { + "epoch": 0.9657491437285932, + "grad_norm": 5.138978958129883, + "learning_rate": 7.13859303878639e-08, + "loss": 0.4477, + "step": 77258 + }, + { + "epoch": 0.9657741443536089, + "grad_norm": 3.3957037925720215, + "learning_rate": 7.128187932485376e-08, + "loss": 2.4479, + "step": 77260 + }, + { + "epoch": 0.9657991449786245, + "grad_norm": 2.2809979915618896, + "learning_rate": 7.11779038775573e-08, + "loss": 0.3276, + "step": 77262 + }, + { + "epoch": 0.96582414560364, + "grad_norm": 3.1610212326049805, + "learning_rate": 7.107400404676612e-08, + "loss": 0.5409, + "step": 77264 + }, + { + "epoch": 0.9658491462286557, + "grad_norm": 3.3361332416534424, + "learning_rate": 7.097017983327071e-08, + "loss": 0.9885, + "step": 77266 + }, + { + "epoch": 0.9658741468536713, + "grad_norm": 0.023750439286231995, + "learning_rate": 7.086643123786152e-08, + "loss": 0.0004, + "step": 77268 + }, + { + "epoch": 0.965899147478687, + "grad_norm": 2.4120378494262695, + "learning_rate": 7.076275826133017e-08, + "loss": 1.116, + "step": 77270 + }, + { + "epoch": 0.9659241481037026, + "grad_norm": 3.0689873695373535, + "learning_rate": 7.065916090446601e-08, + "loss": 1.4356, + "step": 77272 + }, + { + "epoch": 0.9659491487287182, + "grad_norm": 0.42677128314971924, + "learning_rate": 7.055563916805729e-08, + "loss": 0.658, + "step": 77274 + }, + { + "epoch": 0.9659741493537338, + "grad_norm": 0.0003347323799971491, + "learning_rate": 7.04521930528923e-08, + "loss": 0.1276, + "step": 77276 + }, + { + "epoch": 0.9659991499787495, + "grad_norm": 0.0030649686232209206, + "learning_rate": 7.034882255976039e-08, + "loss": 0.5311, + "step": 77278 + }, + { + "epoch": 0.9660241506037651, + "grad_norm": 4.863086700439453, + "learning_rate": 7.024552768944647e-08, + "loss": 1.2488, + "step": 77280 + }, + { + "epoch": 0.9660491512287808, + "grad_norm": 0.00037518891622312367, + "learning_rate": 7.014230844273884e-08, + "loss": 0.1342, + "step": 77282 + }, + { + "epoch": 0.9660741518537963, + "grad_norm": 0.00017252999532502145, + "learning_rate": 7.003916482042238e-08, + "loss": 0.0, + "step": 77284 + }, + { + "epoch": 0.966099152478812, + "grad_norm": 0.9452822208404541, + "learning_rate": 6.993609682328428e-08, + "loss": 0.8295, + "step": 77286 + }, + { + "epoch": 0.9661241531038276, + "grad_norm": 1.818057894706726, + "learning_rate": 6.983310445210833e-08, + "loss": 0.8612, + "step": 77288 + }, + { + "epoch": 0.9661491537288432, + "grad_norm": 7.042483806610107, + "learning_rate": 6.973018770767948e-08, + "loss": 0.8702, + "step": 77290 + }, + { + "epoch": 0.9661741543538589, + "grad_norm": 2.2683913707733154, + "learning_rate": 6.962734659078152e-08, + "loss": 0.7634, + "step": 77292 + }, + { + "epoch": 0.9661991549788744, + "grad_norm": 3.578918695449829, + "learning_rate": 6.952458110219717e-08, + "loss": 1.0328, + "step": 77294 + }, + { + "epoch": 0.9662241556038901, + "grad_norm": 3.743875741958618, + "learning_rate": 6.942189124270915e-08, + "loss": 0.7189, + "step": 77296 + }, + { + "epoch": 0.9662491562289057, + "grad_norm": 9.276946067810059, + "learning_rate": 6.931927701310014e-08, + "loss": 0.8163, + "step": 77298 + }, + { + "epoch": 0.9662741568539214, + "grad_norm": 2.3992905616760254, + "learning_rate": 6.921673841415066e-08, + "loss": 0.3163, + "step": 77300 + }, + { + "epoch": 0.966299157478937, + "grad_norm": 9.717236518859863, + "learning_rate": 6.911427544664228e-08, + "loss": 0.8299, + "step": 77302 + }, + { + "epoch": 0.9663241581039526, + "grad_norm": 0.00031631693127565086, + "learning_rate": 6.90118881113555e-08, + "loss": 0.6393, + "step": 77304 + }, + { + "epoch": 0.9663491587289682, + "grad_norm": 0.364803284406662, + "learning_rate": 6.890957640906859e-08, + "loss": 0.031, + "step": 77306 + }, + { + "epoch": 0.9663741593539839, + "grad_norm": 2.3906586170196533, + "learning_rate": 6.880734034056313e-08, + "loss": 1.5925, + "step": 77308 + }, + { + "epoch": 0.9663991599789995, + "grad_norm": 1.230241298675537, + "learning_rate": 6.87051799066163e-08, + "loss": 0.8428, + "step": 77310 + }, + { + "epoch": 0.9664241606040151, + "grad_norm": 3.515866994857788, + "learning_rate": 6.860309510800633e-08, + "loss": 1.4595, + "step": 77312 + }, + { + "epoch": 0.9664491612290307, + "grad_norm": 2.8391902446746826, + "learning_rate": 6.850108594551042e-08, + "loss": 0.9528, + "step": 77314 + }, + { + "epoch": 0.9664741618540463, + "grad_norm": 1.6246238946914673, + "learning_rate": 6.83991524199068e-08, + "loss": 0.4976, + "step": 77316 + }, + { + "epoch": 0.966499162479062, + "grad_norm": 1.0636754035949707, + "learning_rate": 6.829729453197043e-08, + "loss": 0.8143, + "step": 77318 + }, + { + "epoch": 0.9665241631040776, + "grad_norm": 0.0002607898786664009, + "learning_rate": 6.819551228247734e-08, + "loss": 0.4765, + "step": 77320 + }, + { + "epoch": 0.9665491637290933, + "grad_norm": 3.30770206451416, + "learning_rate": 6.809380567220247e-08, + "loss": 1.3742, + "step": 77322 + }, + { + "epoch": 0.9665741643541088, + "grad_norm": 5.04067325592041, + "learning_rate": 6.799217470192077e-08, + "loss": 0.7941, + "step": 77324 + }, + { + "epoch": 0.9665991649791245, + "grad_norm": 1.6832550764083862, + "learning_rate": 6.789061937240605e-08, + "loss": 0.8992, + "step": 77326 + }, + { + "epoch": 0.9666241656041401, + "grad_norm": 4.204353332519531, + "learning_rate": 6.778913968443212e-08, + "loss": 1.5192, + "step": 77328 + }, + { + "epoch": 0.9666491662291558, + "grad_norm": 0.00035598137765191495, + "learning_rate": 6.768773563877173e-08, + "loss": 0.0743, + "step": 77330 + }, + { + "epoch": 0.9666741668541714, + "grad_norm": 2.87894606590271, + "learning_rate": 6.758640723619648e-08, + "loss": 1.3271, + "step": 77332 + }, + { + "epoch": 0.9666991674791869, + "grad_norm": 2.5191028118133545, + "learning_rate": 6.748515447747905e-08, + "loss": 0.9955, + "step": 77334 + }, + { + "epoch": 0.9667241681042026, + "grad_norm": 0.9092104434967041, + "learning_rate": 6.738397736338997e-08, + "loss": 0.9323, + "step": 77336 + }, + { + "epoch": 0.9667491687292182, + "grad_norm": 7.022828578948975, + "learning_rate": 6.728287589469973e-08, + "loss": 0.8731, + "step": 77338 + }, + { + "epoch": 0.9667741693542339, + "grad_norm": 4.520029067993164, + "learning_rate": 6.718185007217881e-08, + "loss": 0.9758, + "step": 77340 + }, + { + "epoch": 0.9667991699792495, + "grad_norm": 0.9148229956626892, + "learning_rate": 6.70808998965955e-08, + "loss": 0.5178, + "step": 77342 + }, + { + "epoch": 0.9668241706042651, + "grad_norm": 8.06041145324707, + "learning_rate": 6.698002536872028e-08, + "loss": 2.0906, + "step": 77344 + }, + { + "epoch": 0.9668491712292807, + "grad_norm": 1.0480146408081055, + "learning_rate": 6.687922648932033e-08, + "loss": 0.5786, + "step": 77346 + }, + { + "epoch": 0.9668741718542964, + "grad_norm": 2.8616445064544678, + "learning_rate": 6.67785032591628e-08, + "loss": 0.8277, + "step": 77348 + }, + { + "epoch": 0.966899172479312, + "grad_norm": 3.1286628246307373, + "learning_rate": 6.667785567901597e-08, + "loss": 1.4937, + "step": 77350 + }, + { + "epoch": 0.9669241731043277, + "grad_norm": 0.0003603924997150898, + "learning_rate": 6.65772837496459e-08, + "loss": 0.0238, + "step": 77352 + }, + { + "epoch": 0.9669491737293432, + "grad_norm": 3.152974843978882, + "learning_rate": 6.647678747181973e-08, + "loss": 0.8107, + "step": 77354 + }, + { + "epoch": 0.9669741743543588, + "grad_norm": 1.6477373838424683, + "learning_rate": 6.637636684630023e-08, + "loss": 0.1924, + "step": 77356 + }, + { + "epoch": 0.9669991749793745, + "grad_norm": 2.830549955368042, + "learning_rate": 6.62760218738534e-08, + "loss": 0.9699, + "step": 77358 + }, + { + "epoch": 0.9670241756043901, + "grad_norm": 3.581888198852539, + "learning_rate": 6.617575255524422e-08, + "loss": 0.6049, + "step": 77360 + }, + { + "epoch": 0.9670491762294058, + "grad_norm": 2.411525011062622, + "learning_rate": 6.60755588912354e-08, + "loss": 1.2805, + "step": 77362 + }, + { + "epoch": 0.9670741768544213, + "grad_norm": 2.5025136470794678, + "learning_rate": 6.597544088259078e-08, + "loss": 0.7857, + "step": 77364 + }, + { + "epoch": 0.967099177479437, + "grad_norm": 3.013460397720337, + "learning_rate": 6.587539853007196e-08, + "loss": 1.4424, + "step": 77366 + }, + { + "epoch": 0.9671241781044526, + "grad_norm": 2.7807106971740723, + "learning_rate": 6.577543183444168e-08, + "loss": 0.8879, + "step": 77368 + }, + { + "epoch": 0.9671491787294683, + "grad_norm": 3.5463125705718994, + "learning_rate": 6.567554079646154e-08, + "loss": 0.6594, + "step": 77370 + }, + { + "epoch": 0.9671741793544839, + "grad_norm": 2.638200521469116, + "learning_rate": 6.557572541688983e-08, + "loss": 1.2473, + "step": 77372 + }, + { + "epoch": 0.9671991799794994, + "grad_norm": 2.3171257972717285, + "learning_rate": 6.547598569649039e-08, + "loss": 0.3222, + "step": 77374 + }, + { + "epoch": 0.9672241806045151, + "grad_norm": 2.2450075149536133, + "learning_rate": 6.537632163601925e-08, + "loss": 0.6049, + "step": 77376 + }, + { + "epoch": 0.9672491812295307, + "grad_norm": 3.5642359256744385, + "learning_rate": 6.527673323623918e-08, + "loss": 0.9246, + "step": 77378 + }, + { + "epoch": 0.9672741818545464, + "grad_norm": 7.138927936553955, + "learning_rate": 6.51772204979051e-08, + "loss": 1.3219, + "step": 77380 + }, + { + "epoch": 0.967299182479562, + "grad_norm": 4.177647113800049, + "learning_rate": 6.507778342177639e-08, + "loss": 2.939, + "step": 77382 + }, + { + "epoch": 0.9673241831045776, + "grad_norm": 0.029455572366714478, + "learning_rate": 6.497842200861137e-08, + "loss": 0.0007, + "step": 77384 + }, + { + "epoch": 0.9673491837295932, + "grad_norm": 7.914447784423828, + "learning_rate": 6.487913625916498e-08, + "loss": 0.3417, + "step": 77386 + }, + { + "epoch": 0.9673741843546089, + "grad_norm": 0.31051525473594666, + "learning_rate": 6.477992617419437e-08, + "loss": 0.2491, + "step": 77388 + }, + { + "epoch": 0.9673991849796245, + "grad_norm": 2.415919303894043, + "learning_rate": 6.468079175445451e-08, + "loss": 0.8661, + "step": 77390 + }, + { + "epoch": 0.9674241856046402, + "grad_norm": 3.9243016242980957, + "learning_rate": 6.458173300070148e-08, + "loss": 0.7006, + "step": 77392 + }, + { + "epoch": 0.9674491862296557, + "grad_norm": 3.2652103900909424, + "learning_rate": 6.448274991368797e-08, + "loss": 1.6553, + "step": 77394 + }, + { + "epoch": 0.9674741868546713, + "grad_norm": 0.4244977533817291, + "learning_rate": 6.438384249417007e-08, + "loss": 0.2663, + "step": 77396 + }, + { + "epoch": 0.967499187479687, + "grad_norm": 4.928109645843506, + "learning_rate": 6.428501074289829e-08, + "loss": 1.7806, + "step": 77398 + }, + { + "epoch": 0.9675241881047026, + "grad_norm": 4.5157670974731445, + "learning_rate": 6.418625466062756e-08, + "loss": 0.8677, + "step": 77400 + }, + { + "epoch": 0.9675491887297183, + "grad_norm": 3.1560356616973877, + "learning_rate": 6.408757424810952e-08, + "loss": 1.7724, + "step": 77402 + }, + { + "epoch": 0.9675741893547338, + "grad_norm": 3.753512144088745, + "learning_rate": 6.398896950609579e-08, + "loss": 1.4876, + "step": 77404 + }, + { + "epoch": 0.9675991899797495, + "grad_norm": 0.0006806927849538624, + "learning_rate": 6.389044043533688e-08, + "loss": 0.1609, + "step": 77406 + }, + { + "epoch": 0.9676241906047651, + "grad_norm": 5.5998311042785645, + "learning_rate": 6.379198703658219e-08, + "loss": 1.6567, + "step": 77408 + }, + { + "epoch": 0.9676491912297808, + "grad_norm": 3.9611802101135254, + "learning_rate": 6.369360931058333e-08, + "loss": 1.6217, + "step": 77410 + }, + { + "epoch": 0.9676741918547964, + "grad_norm": 4.2475080490112305, + "learning_rate": 6.35953072580886e-08, + "loss": 1.4423, + "step": 77412 + }, + { + "epoch": 0.967699192479812, + "grad_norm": 2.6773016452789307, + "learning_rate": 6.34970808798474e-08, + "loss": 0.9112, + "step": 77414 + }, + { + "epoch": 0.9677241931048276, + "grad_norm": 3.628303050994873, + "learning_rate": 6.339893017660692e-08, + "loss": 0.7914, + "step": 77416 + }, + { + "epoch": 0.9677491937298432, + "grad_norm": 3.0969760417938232, + "learning_rate": 6.330085514911543e-08, + "loss": 1.1267, + "step": 77418 + }, + { + "epoch": 0.9677741943548589, + "grad_norm": 3.9180965423583984, + "learning_rate": 6.320285579811902e-08, + "loss": 0.3823, + "step": 77420 + }, + { + "epoch": 0.9677991949798745, + "grad_norm": 2.1650192737579346, + "learning_rate": 6.310493212436486e-08, + "loss": 1.1239, + "step": 77422 + }, + { + "epoch": 0.9678241956048901, + "grad_norm": 0.8945733308792114, + "learning_rate": 6.300708412859791e-08, + "loss": 0.7605, + "step": 77424 + }, + { + "epoch": 0.9678491962299057, + "grad_norm": 3.1542203426361084, + "learning_rate": 6.290931181156424e-08, + "loss": 1.0949, + "step": 77426 + }, + { + "epoch": 0.9678741968549214, + "grad_norm": 2.2660791873931885, + "learning_rate": 6.281161517400657e-08, + "loss": 0.3457, + "step": 77428 + }, + { + "epoch": 0.967899197479937, + "grad_norm": 0.0002499278343748301, + "learning_rate": 6.271399421667212e-08, + "loss": 0.0, + "step": 77430 + }, + { + "epoch": 0.9679241981049527, + "grad_norm": 4.14509916305542, + "learning_rate": 6.261644894030139e-08, + "loss": 1.4258, + "step": 77432 + }, + { + "epoch": 0.9679491987299682, + "grad_norm": 5.015936374664307, + "learning_rate": 6.251897934563933e-08, + "loss": 1.1828, + "step": 77434 + }, + { + "epoch": 0.9679741993549839, + "grad_norm": 4.521237373352051, + "learning_rate": 6.242158543342758e-08, + "loss": 1.5173, + "step": 77436 + }, + { + "epoch": 0.9679991999799995, + "grad_norm": 0.10302402824163437, + "learning_rate": 6.232426720440776e-08, + "loss": 0.0009, + "step": 77438 + }, + { + "epoch": 0.9680242006050152, + "grad_norm": 3.481038808822632, + "learning_rate": 6.22270246593204e-08, + "loss": 0.5946, + "step": 77440 + }, + { + "epoch": 0.9680492012300308, + "grad_norm": 0.48152992129325867, + "learning_rate": 6.212985779890713e-08, + "loss": 0.218, + "step": 77442 + }, + { + "epoch": 0.9680742018550463, + "grad_norm": 0.00028419712907634676, + "learning_rate": 6.203276662390734e-08, + "loss": 0.0, + "step": 77444 + }, + { + "epoch": 0.968099202480062, + "grad_norm": 3.2629311084747314, + "learning_rate": 6.193575113506045e-08, + "loss": 0.8678, + "step": 77446 + }, + { + "epoch": 0.9681242031050776, + "grad_norm": 3.6424996852874756, + "learning_rate": 6.183881133310588e-08, + "loss": 1.2256, + "step": 77448 + }, + { + "epoch": 0.9681492037300933, + "grad_norm": 3.1497087478637695, + "learning_rate": 6.174194721878079e-08, + "loss": 0.7426, + "step": 77450 + }, + { + "epoch": 0.9681742043551089, + "grad_norm": 3.1984059810638428, + "learning_rate": 6.164515879282462e-08, + "loss": 1.3336, + "step": 77452 + }, + { + "epoch": 0.9681992049801245, + "grad_norm": 2.9616940021514893, + "learning_rate": 6.154844605597343e-08, + "loss": 1.8345, + "step": 77454 + }, + { + "epoch": 0.9682242056051401, + "grad_norm": 3.7381412982940674, + "learning_rate": 6.14518090089633e-08, + "loss": 1.3703, + "step": 77456 + }, + { + "epoch": 0.9682492062301558, + "grad_norm": 0.0005542978178709745, + "learning_rate": 6.135524765253031e-08, + "loss": 0.0681, + "step": 77458 + }, + { + "epoch": 0.9682742068551714, + "grad_norm": 6.095678329467773, + "learning_rate": 6.125876198741165e-08, + "loss": 1.1683, + "step": 77460 + }, + { + "epoch": 0.968299207480187, + "grad_norm": 6.130204200744629, + "learning_rate": 6.116235201434006e-08, + "loss": 1.495, + "step": 77462 + }, + { + "epoch": 0.9683242081052026, + "grad_norm": 0.8828749060630798, + "learning_rate": 6.106601773405053e-08, + "loss": 0.5495, + "step": 77464 + }, + { + "epoch": 0.9683492087302182, + "grad_norm": 4.8955302238464355, + "learning_rate": 6.096975914727687e-08, + "loss": 0.0547, + "step": 77466 + }, + { + "epoch": 0.9683742093552339, + "grad_norm": 0.00035272521199658513, + "learning_rate": 6.087357625475187e-08, + "loss": 0.5913, + "step": 77468 + }, + { + "epoch": 0.9683992099802495, + "grad_norm": 4.008708953857422, + "learning_rate": 6.077746905720827e-08, + "loss": 0.3969, + "step": 77470 + }, + { + "epoch": 0.9684242106052652, + "grad_norm": 0.027390331029891968, + "learning_rate": 6.06814375553777e-08, + "loss": 0.0362, + "step": 77472 + }, + { + "epoch": 0.9684492112302807, + "grad_norm": 0.08001253008842468, + "learning_rate": 6.058548174999179e-08, + "loss": 1.3255, + "step": 77474 + }, + { + "epoch": 0.9684742118552964, + "grad_norm": 0.00039295933675020933, + "learning_rate": 6.048960164178108e-08, + "loss": 0.5785, + "step": 77476 + }, + { + "epoch": 0.968499212480312, + "grad_norm": 5.122011661529541, + "learning_rate": 6.03937972314772e-08, + "loss": 0.6679, + "step": 77478 + }, + { + "epoch": 0.9685242131053277, + "grad_norm": 3.9131548404693604, + "learning_rate": 6.029806851980735e-08, + "loss": 0.947, + "step": 77480 + }, + { + "epoch": 0.9685492137303433, + "grad_norm": 3.739097833633423, + "learning_rate": 6.020241550750205e-08, + "loss": 0.5324, + "step": 77482 + }, + { + "epoch": 0.9685742143553588, + "grad_norm": 3.7748324871063232, + "learning_rate": 6.010683819528962e-08, + "loss": 1.1827, + "step": 77484 + }, + { + "epoch": 0.9685992149803745, + "grad_norm": 2.9366235733032227, + "learning_rate": 6.001133658389835e-08, + "loss": 0.5351, + "step": 77486 + }, + { + "epoch": 0.9686242156053901, + "grad_norm": 7.466938018798828, + "learning_rate": 5.991591067405434e-08, + "loss": 1.4171, + "step": 77488 + }, + { + "epoch": 0.9686492162304058, + "grad_norm": 4.567121505737305, + "learning_rate": 5.982056046648588e-08, + "loss": 1.0427, + "step": 77490 + }, + { + "epoch": 0.9686742168554214, + "grad_norm": 0.0003139479085803032, + "learning_rate": 5.972528596191796e-08, + "loss": 1.5222, + "step": 77492 + }, + { + "epoch": 0.968699217480437, + "grad_norm": 3.3241195678710938, + "learning_rate": 5.963008716107554e-08, + "loss": 1.2837, + "step": 77494 + }, + { + "epoch": 0.9687242181054526, + "grad_norm": 3.4875094890594482, + "learning_rate": 5.953496406468584e-08, + "loss": 1.4693, + "step": 77496 + }, + { + "epoch": 0.9687492187304683, + "grad_norm": 0.00040470247040502727, + "learning_rate": 5.9439916673471596e-08, + "loss": 0.097, + "step": 77498 + }, + { + "epoch": 0.9687742193554839, + "grad_norm": 2.8921003341674805, + "learning_rate": 5.9344944988157797e-08, + "loss": 1.1634, + "step": 77500 + }, + { + "epoch": 0.9687992199804996, + "grad_norm": 2.534900665283203, + "learning_rate": 5.925004900946718e-08, + "loss": 1.0789, + "step": 77502 + }, + { + "epoch": 0.9688242206055151, + "grad_norm": 2.1207516193389893, + "learning_rate": 5.915522873812252e-08, + "loss": 0.171, + "step": 77504 + }, + { + "epoch": 0.9688492212305307, + "grad_norm": 3.1994762420654297, + "learning_rate": 5.9060484174845444e-08, + "loss": 0.562, + "step": 77506 + }, + { + "epoch": 0.9688742218555464, + "grad_norm": 5.135496139526367, + "learning_rate": 5.896581532035761e-08, + "loss": 1.6474, + "step": 77508 + }, + { + "epoch": 0.968899222480562, + "grad_norm": 3.9799203872680664, + "learning_rate": 5.887122217538177e-08, + "loss": 1.911, + "step": 77510 + }, + { + "epoch": 0.9689242231055777, + "grad_norm": 2.3929929733276367, + "learning_rate": 5.8776704740636235e-08, + "loss": 0.5794, + "step": 77512 + }, + { + "epoch": 0.9689492237305932, + "grad_norm": 4.223155975341797, + "learning_rate": 5.8682263016840435e-08, + "loss": 0.404, + "step": 77514 + }, + { + "epoch": 0.9689742243556089, + "grad_norm": 5.161324977874756, + "learning_rate": 5.858789700471601e-08, + "loss": 1.7935, + "step": 77516 + }, + { + "epoch": 0.9689992249806245, + "grad_norm": 3.9410243034362793, + "learning_rate": 5.849360670497906e-08, + "loss": 1.6232, + "step": 77518 + }, + { + "epoch": 0.9690242256056402, + "grad_norm": 2.971163749694824, + "learning_rate": 5.839939211835011e-08, + "loss": 1.4246, + "step": 77520 + }, + { + "epoch": 0.9690492262306558, + "grad_norm": 4.474086761474609, + "learning_rate": 5.830525324554526e-08, + "loss": 1.3837, + "step": 77522 + }, + { + "epoch": 0.9690742268556714, + "grad_norm": 1.3563029766082764, + "learning_rate": 5.82111900872806e-08, + "loss": 0.7323, + "step": 77524 + }, + { + "epoch": 0.969099227480687, + "grad_norm": 3.2957029342651367, + "learning_rate": 5.811720264427556e-08, + "loss": 0.5037, + "step": 77526 + }, + { + "epoch": 0.9691242281057026, + "grad_norm": 2.804590940475464, + "learning_rate": 5.80232909172429e-08, + "loss": 0.7106, + "step": 77528 + }, + { + "epoch": 0.9691492287307183, + "grad_norm": 0.00022584378893952817, + "learning_rate": 5.7929454906898716e-08, + "loss": 0.6148, + "step": 77530 + }, + { + "epoch": 0.9691742293557339, + "grad_norm": 2.1083102226257324, + "learning_rate": 5.783569461395799e-08, + "loss": 0.618, + "step": 77532 + }, + { + "epoch": 0.9691992299807495, + "grad_norm": 0.0020178863778710365, + "learning_rate": 5.7742010039135706e-08, + "loss": 0.054, + "step": 77534 + }, + { + "epoch": 0.9692242306057651, + "grad_norm": 1.870086669921875, + "learning_rate": 5.764840118314241e-08, + "loss": 0.4723, + "step": 77536 + }, + { + "epoch": 0.9692492312307808, + "grad_norm": 3.939443826675415, + "learning_rate": 5.755486804669419e-08, + "loss": 0.9542, + "step": 77538 + }, + { + "epoch": 0.9692742318557964, + "grad_norm": 2.738096237182617, + "learning_rate": 5.74614106305027e-08, + "loss": 0.8836, + "step": 77540 + }, + { + "epoch": 0.9692992324808121, + "grad_norm": 6.998571872711182, + "learning_rate": 5.736802893527737e-08, + "loss": 1.4533, + "step": 77542 + }, + { + "epoch": 0.9693242331058276, + "grad_norm": 0.0005597459967248142, + "learning_rate": 5.7274722961732086e-08, + "loss": 0.6419, + "step": 77544 + }, + { + "epoch": 0.9693492337308433, + "grad_norm": 0.0002780054637696594, + "learning_rate": 5.718149271057627e-08, + "loss": 0.4733, + "step": 77546 + }, + { + "epoch": 0.9693742343558589, + "grad_norm": 4.363052845001221, + "learning_rate": 5.708833818252046e-08, + "loss": 1.5493, + "step": 77548 + }, + { + "epoch": 0.9693992349808745, + "grad_norm": 4.2018537521362305, + "learning_rate": 5.69952593782741e-08, + "loss": 0.68, + "step": 77550 + }, + { + "epoch": 0.9694242356058902, + "grad_norm": 9.980175971984863, + "learning_rate": 5.690225629854551e-08, + "loss": 1.2602, + "step": 77552 + }, + { + "epoch": 0.9694492362309057, + "grad_norm": 1.5612441301345825, + "learning_rate": 5.680932894404301e-08, + "loss": 0.5195, + "step": 77554 + }, + { + "epoch": 0.9694742368559214, + "grad_norm": 0.03158070519566536, + "learning_rate": 5.671647731547491e-08, + "loss": 0.576, + "step": 77556 + }, + { + "epoch": 0.969499237480937, + "grad_norm": 0.08629193156957626, + "learning_rate": 5.662370141354734e-08, + "loss": 0.0639, + "step": 77558 + }, + { + "epoch": 0.9695242381059527, + "grad_norm": 7.117581844329834, + "learning_rate": 5.6531001238967485e-08, + "loss": 0.5547, + "step": 77560 + }, + { + "epoch": 0.9695492387309683, + "grad_norm": 2.978679656982422, + "learning_rate": 5.643837679244146e-08, + "loss": 0.304, + "step": 77562 + }, + { + "epoch": 0.9695742393559839, + "grad_norm": 0.0006575190927833319, + "learning_rate": 5.634582807467537e-08, + "loss": 0.712, + "step": 77564 + }, + { + "epoch": 0.9695992399809995, + "grad_norm": 4.704699516296387, + "learning_rate": 5.6253355086373086e-08, + "loss": 1.7357, + "step": 77566 + }, + { + "epoch": 0.9696242406060152, + "grad_norm": 1.5806090831756592, + "learning_rate": 5.61609578282396e-08, + "loss": 0.5025, + "step": 77568 + }, + { + "epoch": 0.9696492412310308, + "grad_norm": 4.774125099182129, + "learning_rate": 5.606863630097659e-08, + "loss": 0.4644, + "step": 77570 + }, + { + "epoch": 0.9696742418560464, + "grad_norm": 1.5389155149459839, + "learning_rate": 5.5976390505290136e-08, + "loss": 0.6188, + "step": 77572 + }, + { + "epoch": 0.969699242481062, + "grad_norm": 4.924237251281738, + "learning_rate": 5.58842204418808e-08, + "loss": 2.4738, + "step": 77574 + }, + { + "epoch": 0.9697242431060776, + "grad_norm": 3.9392402172088623, + "learning_rate": 5.579212611145024e-08, + "loss": 0.6991, + "step": 77576 + }, + { + "epoch": 0.9697492437310933, + "grad_norm": 3.618396282196045, + "learning_rate": 5.570010751470123e-08, + "loss": 1.9409, + "step": 77578 + }, + { + "epoch": 0.9697742443561089, + "grad_norm": 2.8398053646087646, + "learning_rate": 5.5608164652334315e-08, + "loss": 0.8389, + "step": 77580 + }, + { + "epoch": 0.9697992449811246, + "grad_norm": 0.7938181757926941, + "learning_rate": 5.551629752504895e-08, + "loss": 0.6834, + "step": 77582 + }, + { + "epoch": 0.9698242456061401, + "grad_norm": 1.924784779548645, + "learning_rate": 5.5424506133544554e-08, + "loss": 0.5767, + "step": 77584 + }, + { + "epoch": 0.9698492462311558, + "grad_norm": 6.101443767547607, + "learning_rate": 5.533279047852169e-08, + "loss": 0.6232, + "step": 77586 + }, + { + "epoch": 0.9698742468561714, + "grad_norm": 2.9785103797912598, + "learning_rate": 5.5241150560677584e-08, + "loss": 0.5564, + "step": 77588 + }, + { + "epoch": 0.969899247481187, + "grad_norm": 2.231224298477173, + "learning_rate": 5.514958638071055e-08, + "loss": 0.6182, + "step": 77590 + }, + { + "epoch": 0.9699242481062027, + "grad_norm": 5.21632194519043, + "learning_rate": 5.505809793931893e-08, + "loss": 1.9517, + "step": 77592 + }, + { + "epoch": 0.9699492487312182, + "grad_norm": 2.0852339267730713, + "learning_rate": 5.496668523719773e-08, + "loss": 0.3868, + "step": 77594 + }, + { + "epoch": 0.9699742493562339, + "grad_norm": 2.317793846130371, + "learning_rate": 5.487534827504415e-08, + "loss": 1.8315, + "step": 77596 + }, + { + "epoch": 0.9699992499812495, + "grad_norm": 3.363054037094116, + "learning_rate": 5.478408705355209e-08, + "loss": 1.4625, + "step": 77598 + }, + { + "epoch": 0.9700242506062652, + "grad_norm": 0.05185110867023468, + "learning_rate": 5.4692901573418777e-08, + "loss": 0.5219, + "step": 77600 + }, + { + "epoch": 0.9700492512312808, + "grad_norm": 3.723193883895874, + "learning_rate": 5.46017918353392e-08, + "loss": 1.0709, + "step": 77602 + }, + { + "epoch": 0.9700742518562964, + "grad_norm": 2.711179256439209, + "learning_rate": 5.451075784000504e-08, + "loss": 0.4409, + "step": 77604 + }, + { + "epoch": 0.970099252481312, + "grad_norm": 1.625293493270874, + "learning_rate": 5.441979958811128e-08, + "loss": 0.7078, + "step": 77606 + }, + { + "epoch": 0.9701242531063277, + "grad_norm": 2.3665292263031006, + "learning_rate": 5.4328917080348486e-08, + "loss": 0.4223, + "step": 77608 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 9.770167350769043, + "learning_rate": 5.423811031741055e-08, + "loss": 1.5648, + "step": 77610 + }, + { + "epoch": 0.970174254356359, + "grad_norm": 2.69142484664917, + "learning_rate": 5.4147379299989146e-08, + "loss": 1.0664, + "step": 77612 + }, + { + "epoch": 0.9701992549813745, + "grad_norm": 5.180204391479492, + "learning_rate": 5.405672402877482e-08, + "loss": 0.6309, + "step": 77614 + }, + { + "epoch": 0.9702242556063901, + "grad_norm": 1.4793527126312256, + "learning_rate": 5.396614450445814e-08, + "loss": 0.0869, + "step": 77616 + }, + { + "epoch": 0.9702492562314058, + "grad_norm": 3.0533485412597656, + "learning_rate": 5.387564072772855e-08, + "loss": 0.6878, + "step": 77618 + }, + { + "epoch": 0.9702742568564214, + "grad_norm": 8.58735466003418, + "learning_rate": 5.378521269927661e-08, + "loss": 1.321, + "step": 77620 + }, + { + "epoch": 0.9702992574814371, + "grad_norm": 0.9674652814865112, + "learning_rate": 5.3694860419788444e-08, + "loss": 0.7092, + "step": 77622 + }, + { + "epoch": 0.9703242581064526, + "grad_norm": 4.20629358291626, + "learning_rate": 5.36045838899546e-08, + "loss": 1.1818, + "step": 77624 + }, + { + "epoch": 0.9703492587314683, + "grad_norm": 2.863278865814209, + "learning_rate": 5.35143831104612e-08, + "loss": 0.7914, + "step": 77626 + }, + { + "epoch": 0.9703742593564839, + "grad_norm": 2.2838945388793945, + "learning_rate": 5.3424258081995475e-08, + "loss": 0.0929, + "step": 77628 + }, + { + "epoch": 0.9703992599814996, + "grad_norm": 4.886204242706299, + "learning_rate": 5.333420880524465e-08, + "loss": 1.8729, + "step": 77630 + }, + { + "epoch": 0.9704242606065152, + "grad_norm": 1.7246590852737427, + "learning_rate": 5.324423528089373e-08, + "loss": 0.1295, + "step": 77632 + }, + { + "epoch": 0.9704492612315307, + "grad_norm": 9.367440223693848, + "learning_rate": 5.315433750962773e-08, + "loss": 0.4261, + "step": 77634 + }, + { + "epoch": 0.9704742618565464, + "grad_norm": 4.072595596313477, + "learning_rate": 5.306451549213276e-08, + "loss": 1.1451, + "step": 77636 + }, + { + "epoch": 0.970499262481562, + "grad_norm": 0.7272661328315735, + "learning_rate": 5.297476922909051e-08, + "loss": 0.8132, + "step": 77638 + }, + { + "epoch": 0.9705242631065777, + "grad_norm": 2.779876708984375, + "learning_rate": 5.2885098721185965e-08, + "loss": 0.824, + "step": 77640 + }, + { + "epoch": 0.9705492637315933, + "grad_norm": 3.035381555557251, + "learning_rate": 5.279550396910194e-08, + "loss": 1.6927, + "step": 77642 + }, + { + "epoch": 0.9705742643566089, + "grad_norm": 4.0692219734191895, + "learning_rate": 5.270598497352231e-08, + "loss": 0.8818, + "step": 77644 + }, + { + "epoch": 0.9705992649816245, + "grad_norm": 0.9209955930709839, + "learning_rate": 5.2616541735125426e-08, + "loss": 0.0879, + "step": 77646 + }, + { + "epoch": 0.9706242656066402, + "grad_norm": 4.224839687347412, + "learning_rate": 5.252717425459408e-08, + "loss": 0.9217, + "step": 77648 + }, + { + "epoch": 0.9706492662316558, + "grad_norm": 3.407515048980713, + "learning_rate": 5.2437882532609955e-08, + "loss": 0.7745, + "step": 77650 + }, + { + "epoch": 0.9706742668566715, + "grad_norm": 2.829407215118408, + "learning_rate": 5.2348666569852493e-08, + "loss": 0.1432, + "step": 77652 + }, + { + "epoch": 0.970699267481687, + "grad_norm": 4.444319725036621, + "learning_rate": 5.225952636700005e-08, + "loss": 0.7951, + "step": 77654 + }, + { + "epoch": 0.9707242681067026, + "grad_norm": 3.431403160095215, + "learning_rate": 5.2170461924733187e-08, + "loss": 1.6406, + "step": 77656 + }, + { + "epoch": 0.9707492687317183, + "grad_norm": 3.2018003463745117, + "learning_rate": 5.208147324372914e-08, + "loss": 1.1783, + "step": 77658 + }, + { + "epoch": 0.9707742693567339, + "grad_norm": 3.815870523452759, + "learning_rate": 5.1992560324665154e-08, + "loss": 0.7769, + "step": 77660 + }, + { + "epoch": 0.9707992699817496, + "grad_norm": 3.8702080249786377, + "learning_rate": 5.1903723168218454e-08, + "loss": 0.9531, + "step": 77662 + }, + { + "epoch": 0.9708242706067651, + "grad_norm": 3.947977066040039, + "learning_rate": 5.1814961775067395e-08, + "loss": 1.0975, + "step": 77664 + }, + { + "epoch": 0.9708492712317808, + "grad_norm": 3.993504762649536, + "learning_rate": 5.172627614588699e-08, + "loss": 0.7159, + "step": 77666 + }, + { + "epoch": 0.9708742718567964, + "grad_norm": 4.532383441925049, + "learning_rate": 5.163766628135114e-08, + "loss": 0.3585, + "step": 77668 + }, + { + "epoch": 0.9708992724818121, + "grad_norm": 3.426638603210449, + "learning_rate": 5.154913218213709e-08, + "loss": 0.7736, + "step": 77670 + }, + { + "epoch": 0.9709242731068277, + "grad_norm": 0.05142558738589287, + "learning_rate": 5.146067384891762e-08, + "loss": 0.4446, + "step": 77672 + }, + { + "epoch": 0.9709492737318433, + "grad_norm": 5.439691543579102, + "learning_rate": 5.137229128236776e-08, + "loss": 0.8283, + "step": 77674 + }, + { + "epoch": 0.9709742743568589, + "grad_norm": 6.326639175415039, + "learning_rate": 5.128398448315808e-08, + "loss": 1.119, + "step": 77676 + }, + { + "epoch": 0.9709992749818745, + "grad_norm": 2.2129571437835693, + "learning_rate": 5.1195753451963593e-08, + "loss": 1.159, + "step": 77678 + }, + { + "epoch": 0.9710242756068902, + "grad_norm": 5.128485202789307, + "learning_rate": 5.1107598189454874e-08, + "loss": 1.5313, + "step": 77680 + }, + { + "epoch": 0.9710492762319058, + "grad_norm": 1.729599952697754, + "learning_rate": 5.1019518696304726e-08, + "loss": 0.6248, + "step": 77682 + }, + { + "epoch": 0.9710742768569214, + "grad_norm": 1.218967318534851, + "learning_rate": 5.09315149731826e-08, + "loss": 0.3202, + "step": 77684 + }, + { + "epoch": 0.971099277481937, + "grad_norm": 4.9469380378723145, + "learning_rate": 5.084358702075909e-08, + "loss": 1.6178, + "step": 77686 + }, + { + "epoch": 0.9711242781069527, + "grad_norm": 5.190689563751221, + "learning_rate": 5.075573483970364e-08, + "loss": 1.1444, + "step": 77688 + }, + { + "epoch": 0.9711492787319683, + "grad_norm": 0.00043003782047890127, + "learning_rate": 5.066795843068573e-08, + "loss": 0.6658, + "step": 77690 + }, + { + "epoch": 0.971174279356984, + "grad_norm": 3.8106696605682373, + "learning_rate": 5.05802577943737e-08, + "loss": 0.5076, + "step": 77692 + }, + { + "epoch": 0.9711992799819995, + "grad_norm": 0.00032706503407098353, + "learning_rate": 5.049263293143481e-08, + "loss": 0.6504, + "step": 77694 + }, + { + "epoch": 0.9712242806070152, + "grad_norm": 2.2716283798217773, + "learning_rate": 5.0405083842537396e-08, + "loss": 0.7656, + "step": 77696 + }, + { + "epoch": 0.9712492812320308, + "grad_norm": 0.0038984669372439384, + "learning_rate": 5.0317610528348715e-08, + "loss": 0.1667, + "step": 77698 + }, + { + "epoch": 0.9712742818570465, + "grad_norm": 2.288330316543579, + "learning_rate": 5.023021298953379e-08, + "loss": 0.6323, + "step": 77700 + }, + { + "epoch": 0.9712992824820621, + "grad_norm": 2.6118266582489014, + "learning_rate": 5.014289122675764e-08, + "loss": 0.534, + "step": 77702 + }, + { + "epoch": 0.9713242831070776, + "grad_norm": 6.452050685882568, + "learning_rate": 5.00556452406864e-08, + "loss": 0.2332, + "step": 77704 + }, + { + "epoch": 0.9713492837320933, + "grad_norm": 2.5544660091400146, + "learning_rate": 4.99684750319851e-08, + "loss": 0.6759, + "step": 77706 + }, + { + "epoch": 0.9713742843571089, + "grad_norm": 0.5646159052848816, + "learning_rate": 4.988138060131653e-08, + "loss": 0.7523, + "step": 77708 + }, + { + "epoch": 0.9713992849821246, + "grad_norm": 5.0916748046875, + "learning_rate": 4.9794361949344616e-08, + "loss": 1.8053, + "step": 77710 + }, + { + "epoch": 0.9714242856071402, + "grad_norm": 2.859938383102417, + "learning_rate": 4.970741907673105e-08, + "loss": 0.7509, + "step": 77712 + }, + { + "epoch": 0.9714492862321558, + "grad_norm": 1.9170563220977783, + "learning_rate": 4.962055198413862e-08, + "loss": 0.7628, + "step": 77714 + }, + { + "epoch": 0.9714742868571714, + "grad_norm": 0.0005385535187087953, + "learning_rate": 4.953376067222904e-08, + "loss": 1.2063, + "step": 77716 + }, + { + "epoch": 0.9714992874821871, + "grad_norm": 3.0403873920440674, + "learning_rate": 4.944704514166399e-08, + "loss": 1.0807, + "step": 77718 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 2.2149269580841064, + "learning_rate": 4.936040539310294e-08, + "loss": 0.1896, + "step": 77720 + }, + { + "epoch": 0.9715492887322184, + "grad_norm": 3.725121021270752, + "learning_rate": 4.927384142720537e-08, + "loss": 2.0136, + "step": 77722 + }, + { + "epoch": 0.9715742893572339, + "grad_norm": 2.126690149307251, + "learning_rate": 4.918735324463075e-08, + "loss": 2.3283, + "step": 77724 + }, + { + "epoch": 0.9715992899822495, + "grad_norm": 3.538681745529175, + "learning_rate": 4.9100940846038555e-08, + "loss": 0.8996, + "step": 77726 + }, + { + "epoch": 0.9716242906072652, + "grad_norm": 4.805455684661865, + "learning_rate": 4.901460423208604e-08, + "loss": 0.1917, + "step": 77728 + }, + { + "epoch": 0.9716492912322808, + "grad_norm": 2.0182602405548096, + "learning_rate": 4.8928343403431556e-08, + "loss": 0.71, + "step": 77730 + }, + { + "epoch": 0.9716742918572965, + "grad_norm": 3.2396743297576904, + "learning_rate": 4.8842158360731254e-08, + "loss": 1.004, + "step": 77732 + }, + { + "epoch": 0.971699292482312, + "grad_norm": 3.118471622467041, + "learning_rate": 4.875604910464238e-08, + "loss": 0.2146, + "step": 77734 + }, + { + "epoch": 0.9717242931073277, + "grad_norm": 2.5166447162628174, + "learning_rate": 4.867001563582108e-08, + "loss": 1.1054, + "step": 77736 + }, + { + "epoch": 0.9717492937323433, + "grad_norm": 1.9673004150390625, + "learning_rate": 4.858405795492016e-08, + "loss": 0.9488, + "step": 77738 + }, + { + "epoch": 0.971774294357359, + "grad_norm": 5.16480827331543, + "learning_rate": 4.849817606259577e-08, + "loss": 3.081, + "step": 77740 + }, + { + "epoch": 0.9717992949823746, + "grad_norm": 2.331357479095459, + "learning_rate": 4.8412369959502934e-08, + "loss": 0.2287, + "step": 77742 + }, + { + "epoch": 0.9718242956073901, + "grad_norm": 0.0043608397245407104, + "learning_rate": 4.832663964629336e-08, + "loss": 0.1394, + "step": 77744 + }, + { + "epoch": 0.9718492962324058, + "grad_norm": 5.965434551239014, + "learning_rate": 4.8240985123620965e-08, + "loss": 2.1279, + "step": 77746 + }, + { + "epoch": 0.9718742968574214, + "grad_norm": 3.198632001876831, + "learning_rate": 4.815540639213856e-08, + "loss": 1.6833, + "step": 77748 + }, + { + "epoch": 0.9718992974824371, + "grad_norm": 7.320096015930176, + "learning_rate": 4.806990345249674e-08, + "loss": 0.8188, + "step": 77750 + }, + { + "epoch": 0.9719242981074527, + "grad_norm": 1.271949052810669, + "learning_rate": 4.798447630534719e-08, + "loss": 1.0598, + "step": 77752 + }, + { + "epoch": 0.9719492987324683, + "grad_norm": 4.334837913513184, + "learning_rate": 4.7899124951340526e-08, + "loss": 2.274, + "step": 77754 + }, + { + "epoch": 0.9719742993574839, + "grad_norm": 2.6143293380737305, + "learning_rate": 4.781384939112621e-08, + "loss": 1.0842, + "step": 77756 + }, + { + "epoch": 0.9719992999824996, + "grad_norm": 4.354399681091309, + "learning_rate": 4.772864962535595e-08, + "loss": 1.0928, + "step": 77758 + }, + { + "epoch": 0.9720243006075152, + "grad_norm": 4.960532188415527, + "learning_rate": 4.764352565467589e-08, + "loss": 1.4661, + "step": 77760 + }, + { + "epoch": 0.9720493012325309, + "grad_norm": 2.7021682262420654, + "learning_rate": 4.755847747973441e-08, + "loss": 1.1151, + "step": 77762 + }, + { + "epoch": 0.9720743018575464, + "grad_norm": 3.8175110816955566, + "learning_rate": 4.747350510118098e-08, + "loss": 1.6143, + "step": 77764 + }, + { + "epoch": 0.972099302482562, + "grad_norm": 1.5649428367614746, + "learning_rate": 4.738860851966176e-08, + "loss": 0.693, + "step": 77766 + }, + { + "epoch": 0.9721243031075777, + "grad_norm": 3.6539080142974854, + "learning_rate": 4.7303787735824006e-08, + "loss": 1.5682, + "step": 77768 + }, + { + "epoch": 0.9721493037325933, + "grad_norm": 2.5908291339874268, + "learning_rate": 4.721904275031275e-08, + "loss": 0.9053, + "step": 77770 + }, + { + "epoch": 0.972174304357609, + "grad_norm": 2.4616146087646484, + "learning_rate": 4.713437356377415e-08, + "loss": 0.1814, + "step": 77772 + }, + { + "epoch": 0.9721993049826245, + "grad_norm": 1.7981985807418823, + "learning_rate": 4.7049780176853244e-08, + "loss": 1.4117, + "step": 77774 + }, + { + "epoch": 0.9722243056076402, + "grad_norm": 4.834909915924072, + "learning_rate": 4.696526259019285e-08, + "loss": 0.7344, + "step": 77776 + }, + { + "epoch": 0.9722493062326558, + "grad_norm": 0.0003566330415196717, + "learning_rate": 4.68808208044369e-08, + "loss": 0.6341, + "step": 77778 + }, + { + "epoch": 0.9722743068576715, + "grad_norm": 4.236727237701416, + "learning_rate": 4.679645482023043e-08, + "loss": 0.949, + "step": 77780 + }, + { + "epoch": 0.9722993074826871, + "grad_norm": 2.7528159618377686, + "learning_rate": 4.671216463821404e-08, + "loss": 0.8207, + "step": 77782 + }, + { + "epoch": 0.9723243081077027, + "grad_norm": 2.894472360610962, + "learning_rate": 4.662795025903055e-08, + "loss": 1.4874, + "step": 77784 + }, + { + "epoch": 0.9723493087327183, + "grad_norm": 10.594888687133789, + "learning_rate": 4.654381168332056e-08, + "loss": 0.4147, + "step": 77786 + }, + { + "epoch": 0.972374309357734, + "grad_norm": 1.153796911239624, + "learning_rate": 4.6459748911725776e-08, + "loss": 0.3872, + "step": 77788 + }, + { + "epoch": 0.9723993099827496, + "grad_norm": 3.7997963428497314, + "learning_rate": 4.637576194488569e-08, + "loss": 0.9924, + "step": 77790 + }, + { + "epoch": 0.9724243106077652, + "grad_norm": 3.4434728622436523, + "learning_rate": 4.62918507834409e-08, + "loss": 0.4321, + "step": 77792 + }, + { + "epoch": 0.9724493112327808, + "grad_norm": 0.027279281988739967, + "learning_rate": 4.6208015428028665e-08, + "loss": 0.7299, + "step": 77794 + }, + { + "epoch": 0.9724743118577964, + "grad_norm": 4.143573760986328, + "learning_rate": 4.612425587928959e-08, + "loss": 0.8492, + "step": 77796 + }, + { + "epoch": 0.9724993124828121, + "grad_norm": 0.002780722454190254, + "learning_rate": 4.604057213785984e-08, + "loss": 0.1009, + "step": 77798 + }, + { + "epoch": 0.9725243131078277, + "grad_norm": 3.565701723098755, + "learning_rate": 4.595696420437779e-08, + "loss": 1.3941, + "step": 77800 + }, + { + "epoch": 0.9725493137328434, + "grad_norm": 4.565027236938477, + "learning_rate": 4.5873432079480697e-08, + "loss": 0.4127, + "step": 77802 + }, + { + "epoch": 0.9725743143578589, + "grad_norm": 1.0914530754089355, + "learning_rate": 4.578997576380251e-08, + "loss": 0.7193, + "step": 77804 + }, + { + "epoch": 0.9725993149828746, + "grad_norm": 2.0242600440979004, + "learning_rate": 4.570659525798049e-08, + "loss": 0.1295, + "step": 77806 + }, + { + "epoch": 0.9726243156078902, + "grad_norm": 2.940382242202759, + "learning_rate": 4.562329056264969e-08, + "loss": 1.7892, + "step": 77808 + }, + { + "epoch": 0.9726493162329058, + "grad_norm": 1.005173683166504, + "learning_rate": 4.554006167844516e-08, + "loss": 0.0273, + "step": 77810 + }, + { + "epoch": 0.9726743168579215, + "grad_norm": 2.1327717304229736, + "learning_rate": 4.54569086059986e-08, + "loss": 0.5888, + "step": 77812 + }, + { + "epoch": 0.972699317482937, + "grad_norm": 0.005561984144151211, + "learning_rate": 4.5373831345945086e-08, + "loss": 0.0398, + "step": 77814 + }, + { + "epoch": 0.9727243181079527, + "grad_norm": 5.649768352508545, + "learning_rate": 4.529082989891631e-08, + "loss": 1.6937, + "step": 77816 + }, + { + "epoch": 0.9727493187329683, + "grad_norm": 3.5611963272094727, + "learning_rate": 4.520790426554511e-08, + "loss": 1.4128, + "step": 77818 + }, + { + "epoch": 0.972774319357984, + "grad_norm": 2.740238666534424, + "learning_rate": 4.512505444646209e-08, + "loss": 1.1592, + "step": 77820 + }, + { + "epoch": 0.9727993199829996, + "grad_norm": 2.497117757797241, + "learning_rate": 4.504228044230008e-08, + "loss": 1.2743, + "step": 77822 + }, + { + "epoch": 0.9728243206080152, + "grad_norm": 2.7912120819091797, + "learning_rate": 4.495958225368746e-08, + "loss": 0.6986, + "step": 77824 + }, + { + "epoch": 0.9728493212330308, + "grad_norm": 0.0003156060411129147, + "learning_rate": 4.487695988125595e-08, + "loss": 0.0055, + "step": 77826 + }, + { + "epoch": 0.9728743218580465, + "grad_norm": 2.9760048389434814, + "learning_rate": 4.4794413325632837e-08, + "loss": 0.2445, + "step": 77828 + }, + { + "epoch": 0.9728993224830621, + "grad_norm": 3.9546031951904297, + "learning_rate": 4.4711942587447596e-08, + "loss": 0.9027, + "step": 77830 + }, + { + "epoch": 0.9729243231080777, + "grad_norm": 5.653923988342285, + "learning_rate": 4.4629547667328635e-08, + "loss": 1.7438, + "step": 77832 + }, + { + "epoch": 0.9729493237330933, + "grad_norm": 1.0558379888534546, + "learning_rate": 4.454722856590321e-08, + "loss": 0.5907, + "step": 77834 + }, + { + "epoch": 0.9729743243581089, + "grad_norm": 5.5152788162231445, + "learning_rate": 4.44649852837975e-08, + "loss": 1.7085, + "step": 77836 + }, + { + "epoch": 0.9729993249831246, + "grad_norm": 9.009042739868164, + "learning_rate": 4.438281782163989e-08, + "loss": 2.2132, + "step": 77838 + }, + { + "epoch": 0.9730243256081402, + "grad_norm": 1.0220415592193604, + "learning_rate": 4.430072618005432e-08, + "loss": 0.4083, + "step": 77840 + }, + { + "epoch": 0.9730493262331559, + "grad_norm": 3.2546169757843018, + "learning_rate": 4.421871035966696e-08, + "loss": 1.4573, + "step": 77842 + }, + { + "epoch": 0.9730743268581714, + "grad_norm": 3.4262821674346924, + "learning_rate": 4.413677036110175e-08, + "loss": 1.4331, + "step": 77844 + }, + { + "epoch": 0.9730993274831871, + "grad_norm": 2.7440345287323, + "learning_rate": 4.405490618498265e-08, + "loss": 0.9864, + "step": 77846 + }, + { + "epoch": 0.9731243281082027, + "grad_norm": 6.005273342132568, + "learning_rate": 4.3973117831933586e-08, + "loss": 1.6083, + "step": 77848 + }, + { + "epoch": 0.9731493287332184, + "grad_norm": 7.169722557067871, + "learning_rate": 4.3891405302577406e-08, + "loss": 1.154, + "step": 77850 + }, + { + "epoch": 0.973174329358234, + "grad_norm": 0.1199931725859642, + "learning_rate": 4.3809768597536936e-08, + "loss": 0.0634, + "step": 77852 + }, + { + "epoch": 0.9731993299832495, + "grad_norm": 3.060560941696167, + "learning_rate": 4.37282077174328e-08, + "loss": 1.628, + "step": 77854 + }, + { + "epoch": 0.9732243306082652, + "grad_norm": 0.2630830407142639, + "learning_rate": 4.364672266288672e-08, + "loss": 0.866, + "step": 77856 + }, + { + "epoch": 0.9732493312332808, + "grad_norm": 4.384719371795654, + "learning_rate": 4.356531343451931e-08, + "loss": 0.6552, + "step": 77858 + }, + { + "epoch": 0.9732743318582965, + "grad_norm": 2.9193050861358643, + "learning_rate": 4.348398003295118e-08, + "loss": 1.3207, + "step": 77860 + }, + { + "epoch": 0.9732993324833121, + "grad_norm": 4.251197338104248, + "learning_rate": 4.340272245879962e-08, + "loss": 1.3128, + "step": 77862 + }, + { + "epoch": 0.9733243331083277, + "grad_norm": 0.0003890042717102915, + "learning_rate": 4.332154071268635e-08, + "loss": 0.1119, + "step": 77864 + }, + { + "epoch": 0.9733493337333433, + "grad_norm": 4.5199055671691895, + "learning_rate": 4.324043479522755e-08, + "loss": 0.5713, + "step": 77866 + }, + { + "epoch": 0.973374334358359, + "grad_norm": 2.1082773208618164, + "learning_rate": 4.315940470704161e-08, + "loss": 0.3346, + "step": 77868 + }, + { + "epoch": 0.9733993349833746, + "grad_norm": 3.080615758895874, + "learning_rate": 4.3078450448745815e-08, + "loss": 1.1577, + "step": 77870 + }, + { + "epoch": 0.9734243356083903, + "grad_norm": 2.0555918216705322, + "learning_rate": 4.299757202095634e-08, + "loss": 0.9946, + "step": 77872 + }, + { + "epoch": 0.9734493362334058, + "grad_norm": 3.5358998775482178, + "learning_rate": 4.2916769424289354e-08, + "loss": 0.9023, + "step": 77874 + }, + { + "epoch": 0.9734743368584214, + "grad_norm": 0.0010361491004005075, + "learning_rate": 4.283604265935992e-08, + "loss": 0.5586, + "step": 77876 + }, + { + "epoch": 0.9734993374834371, + "grad_norm": 3.8959481716156006, + "learning_rate": 4.275539172678311e-08, + "loss": 0.5443, + "step": 77878 + }, + { + "epoch": 0.9735243381084527, + "grad_norm": 2.0968446731567383, + "learning_rate": 4.267481662717399e-08, + "loss": 1.4738, + "step": 77880 + }, + { + "epoch": 0.9735493387334684, + "grad_norm": 3.5974700450897217, + "learning_rate": 4.259431736114428e-08, + "loss": 0.845, + "step": 77882 + }, + { + "epoch": 0.9735743393584839, + "grad_norm": 3.9386367797851562, + "learning_rate": 4.2513893929309044e-08, + "loss": 1.8249, + "step": 77884 + }, + { + "epoch": 0.9735993399834996, + "grad_norm": 4.088767051696777, + "learning_rate": 4.243354633227892e-08, + "loss": 1.8511, + "step": 77886 + }, + { + "epoch": 0.9736243406085152, + "grad_norm": 0.0008939466788433492, + "learning_rate": 4.2353274570667845e-08, + "loss": 0.5418, + "step": 77888 + }, + { + "epoch": 0.9736493412335309, + "grad_norm": 2.41180157661438, + "learning_rate": 4.227307864508534e-08, + "loss": 0.3974, + "step": 77890 + }, + { + "epoch": 0.9736743418585465, + "grad_norm": 4.042524337768555, + "learning_rate": 4.219295855614314e-08, + "loss": 1.5705, + "step": 77892 + }, + { + "epoch": 0.973699342483562, + "grad_norm": 0.9928611516952515, + "learning_rate": 4.2112914304450747e-08, + "loss": 0.8098, + "step": 77894 + }, + { + "epoch": 0.9737243431085777, + "grad_norm": 8.112704277038574, + "learning_rate": 4.20329458906199e-08, + "loss": 0.7949, + "step": 77896 + }, + { + "epoch": 0.9737493437335933, + "grad_norm": 4.36374568939209, + "learning_rate": 4.1953053315255676e-08, + "loss": 1.2666, + "step": 77898 + }, + { + "epoch": 0.973774344358609, + "grad_norm": 2.886569023132324, + "learning_rate": 4.187323657897091e-08, + "loss": 0.9366, + "step": 77900 + }, + { + "epoch": 0.9737993449836246, + "grad_norm": 0.002810067730024457, + "learning_rate": 4.179349568236957e-08, + "loss": 0.2055, + "step": 77902 + }, + { + "epoch": 0.9738243456086402, + "grad_norm": 3.4289541244506836, + "learning_rate": 4.171383062606116e-08, + "loss": 0.9077, + "step": 77904 + }, + { + "epoch": 0.9738493462336558, + "grad_norm": 1.5239980220794678, + "learning_rate": 4.1634241410652976e-08, + "loss": 0.1928, + "step": 77906 + }, + { + "epoch": 0.9738743468586715, + "grad_norm": 3.544114828109741, + "learning_rate": 4.1554728036748984e-08, + "loss": 0.7112, + "step": 77908 + }, + { + "epoch": 0.9738993474836871, + "grad_norm": 2.1762001514434814, + "learning_rate": 4.147529050495758e-08, + "loss": 0.132, + "step": 77910 + }, + { + "epoch": 0.9739243481087028, + "grad_norm": 2.4971585273742676, + "learning_rate": 4.1395928815880504e-08, + "loss": 1.2843, + "step": 77912 + }, + { + "epoch": 0.9739493487337183, + "grad_norm": 3.8791794776916504, + "learning_rate": 4.1316642970123944e-08, + "loss": 1.3473, + "step": 77914 + }, + { + "epoch": 0.973974349358734, + "grad_norm": 0.5782489776611328, + "learning_rate": 4.1237432968291854e-08, + "loss": 0.4887, + "step": 77916 + }, + { + "epoch": 0.9739993499837496, + "grad_norm": 12.732962608337402, + "learning_rate": 4.1158298810987096e-08, + "loss": 1.1344, + "step": 77918 + }, + { + "epoch": 0.9740243506087652, + "grad_norm": 0.0003359009278938174, + "learning_rate": 4.107924049881251e-08, + "loss": 0.4105, + "step": 77920 + }, + { + "epoch": 0.9740493512337809, + "grad_norm": 4.771395206451416, + "learning_rate": 4.1000258032369847e-08, + "loss": 0.8561, + "step": 77922 + }, + { + "epoch": 0.9740743518587964, + "grad_norm": 1.09977388381958, + "learning_rate": 4.092135141226194e-08, + "loss": 0.7186, + "step": 77924 + }, + { + "epoch": 0.9740993524838121, + "grad_norm": 3.109671115875244, + "learning_rate": 4.084252063908722e-08, + "loss": 0.7575, + "step": 77926 + }, + { + "epoch": 0.9741243531088277, + "grad_norm": 2.327895164489746, + "learning_rate": 4.0763765713448534e-08, + "loss": 0.6958, + "step": 77928 + }, + { + "epoch": 0.9741493537338434, + "grad_norm": 5.276062965393066, + "learning_rate": 4.068508663594428e-08, + "loss": 0.3619, + "step": 77930 + }, + { + "epoch": 0.974174354358859, + "grad_norm": 0.0005422226968221366, + "learning_rate": 4.060648340717399e-08, + "loss": 0.3055, + "step": 77932 + }, + { + "epoch": 0.9741993549838746, + "grad_norm": 3.6673223972320557, + "learning_rate": 4.052795602773718e-08, + "loss": 1.4601, + "step": 77934 + }, + { + "epoch": 0.9742243556088902, + "grad_norm": 5.882850646972656, + "learning_rate": 4.044950449823004e-08, + "loss": 1.4366, + "step": 77936 + }, + { + "epoch": 0.9742493562339058, + "grad_norm": 3.428779125213623, + "learning_rate": 4.037112881925098e-08, + "loss": 1.4786, + "step": 77938 + }, + { + "epoch": 0.9742743568589215, + "grad_norm": 0.5264500379562378, + "learning_rate": 4.0292828991397306e-08, + "loss": 0.8755, + "step": 77940 + }, + { + "epoch": 0.9742993574839371, + "grad_norm": 2.9589521884918213, + "learning_rate": 4.02146050152652e-08, + "loss": 2.3953, + "step": 77942 + }, + { + "epoch": 0.9743243581089527, + "grad_norm": 2.126347780227661, + "learning_rate": 4.0136456891449736e-08, + "loss": 0.3342, + "step": 77944 + }, + { + "epoch": 0.9743493587339683, + "grad_norm": 3.7380800247192383, + "learning_rate": 4.005838462054712e-08, + "loss": 1.4203, + "step": 77946 + }, + { + "epoch": 0.974374359358984, + "grad_norm": 3.6563870906829834, + "learning_rate": 3.998038820315131e-08, + "loss": 0.6695, + "step": 77948 + }, + { + "epoch": 0.9743993599839996, + "grad_norm": 1.7757487297058105, + "learning_rate": 3.990246763985628e-08, + "loss": 1.3395, + "step": 77950 + }, + { + "epoch": 0.9744243606090153, + "grad_norm": 3.3070287704467773, + "learning_rate": 3.982462293125489e-08, + "loss": 0.615, + "step": 77952 + }, + { + "epoch": 0.9744493612340308, + "grad_norm": 3.385622262954712, + "learning_rate": 3.97468540779411e-08, + "loss": 0.6813, + "step": 77954 + }, + { + "epoch": 0.9744743618590465, + "grad_norm": 3.8155601024627686, + "learning_rate": 3.966916108050778e-08, + "loss": 0.8171, + "step": 77956 + }, + { + "epoch": 0.9744993624840621, + "grad_norm": 2.5859298706054688, + "learning_rate": 3.9591543939544454e-08, + "loss": 1.1357, + "step": 77958 + }, + { + "epoch": 0.9745243631090778, + "grad_norm": 3.840855121612549, + "learning_rate": 3.951400265564398e-08, + "loss": 1.6946, + "step": 77960 + }, + { + "epoch": 0.9745493637340934, + "grad_norm": 3.130486249923706, + "learning_rate": 3.943653722939589e-08, + "loss": 1.341, + "step": 77962 + }, + { + "epoch": 0.9745743643591089, + "grad_norm": 2.9784817695617676, + "learning_rate": 3.935914766139082e-08, + "loss": 0.8259, + "step": 77964 + }, + { + "epoch": 0.9745993649841246, + "grad_norm": 2.9475131034851074, + "learning_rate": 3.9281833952217185e-08, + "loss": 0.6191, + "step": 77966 + }, + { + "epoch": 0.9746243656091402, + "grad_norm": 5.446700096130371, + "learning_rate": 3.920459610246563e-08, + "loss": 1.2674, + "step": 77968 + }, + { + "epoch": 0.9746493662341559, + "grad_norm": 4.0113372802734375, + "learning_rate": 3.912743411272346e-08, + "loss": 1.2356, + "step": 77970 + }, + { + "epoch": 0.9746743668591715, + "grad_norm": 0.6010177731513977, + "learning_rate": 3.905034798357687e-08, + "loss": 0.0556, + "step": 77972 + }, + { + "epoch": 0.9746993674841871, + "grad_norm": 2.7234933376312256, + "learning_rate": 3.89733377156154e-08, + "loss": 0.1394, + "step": 77974 + }, + { + "epoch": 0.9747243681092027, + "grad_norm": 3.3098220825195312, + "learning_rate": 3.8896403309424126e-08, + "loss": 0.7085, + "step": 77976 + }, + { + "epoch": 0.9747493687342184, + "grad_norm": 3.8531124591827393, + "learning_rate": 3.8819544765588135e-08, + "loss": 0.3288, + "step": 77978 + }, + { + "epoch": 0.974774369359234, + "grad_norm": 0.6927313804626465, + "learning_rate": 3.874276208469474e-08, + "loss": 0.8138, + "step": 77980 + }, + { + "epoch": 0.9747993699842497, + "grad_norm": 4.0873284339904785, + "learning_rate": 3.866605526732792e-08, + "loss": 0.4152, + "step": 77982 + }, + { + "epoch": 0.9748243706092652, + "grad_norm": 0.049379684031009674, + "learning_rate": 3.858942431407164e-08, + "loss": 0.4553, + "step": 77984 + }, + { + "epoch": 0.9748493712342808, + "grad_norm": 1.4840610027313232, + "learning_rate": 3.85128692255099e-08, + "loss": 0.0316, + "step": 77986 + }, + { + "epoch": 0.9748743718592965, + "grad_norm": 2.253502607345581, + "learning_rate": 3.8436390002225545e-08, + "loss": 1.0185, + "step": 77988 + }, + { + "epoch": 0.9748993724843121, + "grad_norm": 7.560648441314697, + "learning_rate": 3.835998664480034e-08, + "loss": 0.937, + "step": 77990 + }, + { + "epoch": 0.9749243731093278, + "grad_norm": 0.7046217322349548, + "learning_rate": 3.8283659153817154e-08, + "loss": 0.4083, + "step": 77992 + }, + { + "epoch": 0.9749493737343433, + "grad_norm": 4.916574954986572, + "learning_rate": 3.8207407529856636e-08, + "loss": 1.256, + "step": 77994 + }, + { + "epoch": 0.974974374359359, + "grad_norm": 4.756954193115234, + "learning_rate": 3.813123177350053e-08, + "loss": 1.1441, + "step": 77996 + }, + { + "epoch": 0.9749993749843746, + "grad_norm": 5.4614105224609375, + "learning_rate": 3.805513188532728e-08, + "loss": 2.2629, + "step": 77998 + }, + { + "epoch": 0.9750243756093903, + "grad_norm": 2.1804184913635254, + "learning_rate": 3.797910786591752e-08, + "loss": 1.3737, + "step": 78000 + }, + { + "epoch": 0.9750493762344059, + "grad_norm": 4.5353264808654785, + "learning_rate": 3.790315971584968e-08, + "loss": 0.6344, + "step": 78002 + }, + { + "epoch": 0.9750743768594214, + "grad_norm": 4.474067687988281, + "learning_rate": 3.78272874357033e-08, + "loss": 1.8091, + "step": 78004 + }, + { + "epoch": 0.9750993774844371, + "grad_norm": 2.37434720993042, + "learning_rate": 3.7751491026055686e-08, + "loss": 0.9285, + "step": 78006 + }, + { + "epoch": 0.9751243781094527, + "grad_norm": 0.04072798043489456, + "learning_rate": 3.767577048748416e-08, + "loss": 0.8228, + "step": 78008 + }, + { + "epoch": 0.9751493787344684, + "grad_norm": 0.0003691554593387991, + "learning_rate": 3.760012582056494e-08, + "loss": 0.0, + "step": 78010 + }, + { + "epoch": 0.975174379359484, + "grad_norm": 3.063063859939575, + "learning_rate": 3.752455702587421e-08, + "loss": 0.5353, + "step": 78012 + }, + { + "epoch": 0.9751993799844996, + "grad_norm": 3.6668426990509033, + "learning_rate": 3.744906410398708e-08, + "loss": 0.6615, + "step": 78014 + }, + { + "epoch": 0.9752243806095152, + "grad_norm": 0.0003793178766500205, + "learning_rate": 3.737364705547974e-08, + "loss": 0.9866, + "step": 78016 + }, + { + "epoch": 0.9752493812345309, + "grad_norm": 3.603363275527954, + "learning_rate": 3.729830588092509e-08, + "loss": 1.5349, + "step": 78018 + }, + { + "epoch": 0.9752743818595465, + "grad_norm": 3.9276742935180664, + "learning_rate": 3.722304058089821e-08, + "loss": 1.439, + "step": 78020 + }, + { + "epoch": 0.9752993824845622, + "grad_norm": 2.3307530879974365, + "learning_rate": 3.714785115597197e-08, + "loss": 1.0874, + "step": 78022 + }, + { + "epoch": 0.9753243831095777, + "grad_norm": 0.8129187822341919, + "learning_rate": 3.7072737606719254e-08, + "loss": 0.0314, + "step": 78024 + }, + { + "epoch": 0.9753493837345933, + "grad_norm": 3.1693294048309326, + "learning_rate": 3.699769993371072e-08, + "loss": 1.7051, + "step": 78026 + }, + { + "epoch": 0.975374384359609, + "grad_norm": 3.867866277694702, + "learning_rate": 3.692273813751923e-08, + "loss": 0.7444, + "step": 78028 + }, + { + "epoch": 0.9753993849846246, + "grad_norm": 3.065279960632324, + "learning_rate": 3.6847852218715454e-08, + "loss": 0.5082, + "step": 78030 + }, + { + "epoch": 0.9754243856096403, + "grad_norm": 0.0002915689256042242, + "learning_rate": 3.677304217786892e-08, + "loss": 0.4735, + "step": 78032 + }, + { + "epoch": 0.9754493862346558, + "grad_norm": 3.5775375366210938, + "learning_rate": 3.66983080155503e-08, + "loss": 0.4331, + "step": 78034 + }, + { + "epoch": 0.9754743868596715, + "grad_norm": 0.0029022342059761286, + "learning_rate": 3.6623649732329123e-08, + "loss": 0.0001, + "step": 78036 + }, + { + "epoch": 0.9754993874846871, + "grad_norm": 5.2629265785217285, + "learning_rate": 3.6549067328772725e-08, + "loss": 0.5287, + "step": 78038 + }, + { + "epoch": 0.9755243881097028, + "grad_norm": 0.05830179154872894, + "learning_rate": 3.647456080544842e-08, + "loss": 0.6658, + "step": 78040 + }, + { + "epoch": 0.9755493887347184, + "grad_norm": 3.423886775970459, + "learning_rate": 3.640013016292687e-08, + "loss": 1.2841, + "step": 78042 + }, + { + "epoch": 0.975574389359734, + "grad_norm": 7.497816562652588, + "learning_rate": 3.632577540177207e-08, + "loss": 2.4354, + "step": 78044 + }, + { + "epoch": 0.9755993899847496, + "grad_norm": 3.772427558898926, + "learning_rate": 3.625149652255022e-08, + "loss": 0.6658, + "step": 78046 + }, + { + "epoch": 0.9756243906097652, + "grad_norm": 0.0003273680340498686, + "learning_rate": 3.617729352582977e-08, + "loss": 0.0751, + "step": 78048 + }, + { + "epoch": 0.9756493912347809, + "grad_norm": 2.935258626937866, + "learning_rate": 3.610316641217359e-08, + "loss": 0.6453, + "step": 78050 + }, + { + "epoch": 0.9756743918597965, + "grad_norm": 2.856797456741333, + "learning_rate": 3.602911518214569e-08, + "loss": 1.3398, + "step": 78052 + }, + { + "epoch": 0.9756993924848121, + "grad_norm": 5.779574871063232, + "learning_rate": 3.595513983631227e-08, + "loss": 1.9077, + "step": 78054 + }, + { + "epoch": 0.9757243931098277, + "grad_norm": 2.5163886547088623, + "learning_rate": 3.5881240375233993e-08, + "loss": 0.943, + "step": 78056 + }, + { + "epoch": 0.9757493937348434, + "grad_norm": 3.102884531021118, + "learning_rate": 3.580741679947597e-08, + "loss": 1.543, + "step": 78058 + }, + { + "epoch": 0.975774394359859, + "grad_norm": 4.338369369506836, + "learning_rate": 3.5733669109599967e-08, + "loss": 0.8835, + "step": 78060 + }, + { + "epoch": 0.9757993949848747, + "grad_norm": 4.221302509307861, + "learning_rate": 3.565999730616776e-08, + "loss": 1.2707, + "step": 78062 + }, + { + "epoch": 0.9758243956098902, + "grad_norm": 5.242423057556152, + "learning_rate": 3.55864013897389e-08, + "loss": 1.5887, + "step": 78064 + }, + { + "epoch": 0.9758493962349059, + "grad_norm": 2.3220343589782715, + "learning_rate": 3.551288136087405e-08, + "loss": 1.0771, + "step": 78066 + }, + { + "epoch": 0.9758743968599215, + "grad_norm": 0.16932198405265808, + "learning_rate": 3.543943722013499e-08, + "loss": 0.0031, + "step": 78068 + }, + { + "epoch": 0.9758993974849371, + "grad_norm": 1.4160411357879639, + "learning_rate": 3.5366068968080145e-08, + "loss": 0.3158, + "step": 78070 + }, + { + "epoch": 0.9759243981099528, + "grad_norm": 2.5922470092773438, + "learning_rate": 3.529277660526908e-08, + "loss": 0.8168, + "step": 78072 + }, + { + "epoch": 0.9759493987349683, + "grad_norm": 3.149233102798462, + "learning_rate": 3.5219560132258026e-08, + "loss": 0.8615, + "step": 78074 + }, + { + "epoch": 0.975974399359984, + "grad_norm": 4.836028575897217, + "learning_rate": 3.51464195496054e-08, + "loss": 0.6178, + "step": 78076 + }, + { + "epoch": 0.9759993999849996, + "grad_norm": 4.176300048828125, + "learning_rate": 3.5073354857868556e-08, + "loss": 1.4154, + "step": 78078 + }, + { + "epoch": 0.9760244006100153, + "grad_norm": 3.2032737731933594, + "learning_rate": 3.500036605760482e-08, + "loss": 0.2455, + "step": 78080 + }, + { + "epoch": 0.9760494012350309, + "grad_norm": 0.011379566974937916, + "learning_rate": 3.4927453149368187e-08, + "loss": 0.5604, + "step": 78082 + }, + { + "epoch": 0.9760744018600465, + "grad_norm": 2.1976451873779297, + "learning_rate": 3.485461613371488e-08, + "loss": 1.3754, + "step": 78084 + }, + { + "epoch": 0.9760994024850621, + "grad_norm": 0.006286727264523506, + "learning_rate": 3.4781855011200016e-08, + "loss": 0.0583, + "step": 78086 + }, + { + "epoch": 0.9761244031100778, + "grad_norm": 1.9523053169250488, + "learning_rate": 3.470916978237759e-08, + "loss": 1.1892, + "step": 78088 + }, + { + "epoch": 0.9761494037350934, + "grad_norm": 3.538792848587036, + "learning_rate": 3.4636560447800504e-08, + "loss": 0.8544, + "step": 78090 + }, + { + "epoch": 0.976174404360109, + "grad_norm": 2.117598533630371, + "learning_rate": 3.4564027008021637e-08, + "loss": 0.9581, + "step": 78092 + }, + { + "epoch": 0.9761994049851246, + "grad_norm": 2.003483295440674, + "learning_rate": 3.4491569463594996e-08, + "loss": 0.1582, + "step": 78094 + }, + { + "epoch": 0.9762244056101402, + "grad_norm": 3.6070170402526855, + "learning_rate": 3.441918781507014e-08, + "loss": 1.1922, + "step": 78096 + }, + { + "epoch": 0.9762494062351559, + "grad_norm": 3.0667078495025635, + "learning_rate": 3.4346882062999967e-08, + "loss": 0.3778, + "step": 78098 + }, + { + "epoch": 0.9762744068601715, + "grad_norm": 2.9933459758758545, + "learning_rate": 3.4274652207935136e-08, + "loss": 1.2974, + "step": 78100 + }, + { + "epoch": 0.9762994074851872, + "grad_norm": 3.6740829944610596, + "learning_rate": 3.420249825042521e-08, + "loss": 1.2304, + "step": 78102 + }, + { + "epoch": 0.9763244081102027, + "grad_norm": 3.6058542728424072, + "learning_rate": 3.413042019101975e-08, + "loss": 0.8685, + "step": 78104 + }, + { + "epoch": 0.9763494087352184, + "grad_norm": 4.846529006958008, + "learning_rate": 3.4058418030267214e-08, + "loss": 0.6029, + "step": 78106 + }, + { + "epoch": 0.976374409360234, + "grad_norm": 0.0014984721783548594, + "learning_rate": 3.398649176871715e-08, + "loss": 0.6511, + "step": 78108 + }, + { + "epoch": 0.9763994099852497, + "grad_norm": 1.8022297620773315, + "learning_rate": 3.3914641406915803e-08, + "loss": 0.772, + "step": 78110 + }, + { + "epoch": 0.9764244106102653, + "grad_norm": 0.00021497426496353, + "learning_rate": 3.384286694541161e-08, + "loss": 0.0001, + "step": 78112 + }, + { + "epoch": 0.9764494112352808, + "grad_norm": 5.649435997009277, + "learning_rate": 3.3771168384750805e-08, + "loss": 1.8288, + "step": 78114 + }, + { + "epoch": 0.9764744118602965, + "grad_norm": 6.260303974151611, + "learning_rate": 3.369954572547962e-08, + "loss": 2.7467, + "step": 78116 + }, + { + "epoch": 0.9764994124853121, + "grad_norm": 1.8794808387756348, + "learning_rate": 3.3627998968143175e-08, + "loss": 0.6685, + "step": 78118 + }, + { + "epoch": 0.9765244131103278, + "grad_norm": 2.644171714782715, + "learning_rate": 3.355652811328658e-08, + "loss": 0.6881, + "step": 78120 + }, + { + "epoch": 0.9765494137353434, + "grad_norm": 2.8916749954223633, + "learning_rate": 3.3485133161453854e-08, + "loss": 0.6997, + "step": 78122 + }, + { + "epoch": 0.976574414360359, + "grad_norm": 1.7982569932937622, + "learning_rate": 3.3413814113189004e-08, + "loss": 0.7012, + "step": 78124 + }, + { + "epoch": 0.9765994149853746, + "grad_norm": 3.4912800788879395, + "learning_rate": 3.3342570969034926e-08, + "loss": 1.0175, + "step": 78126 + }, + { + "epoch": 0.9766244156103903, + "grad_norm": 7.041758060455322, + "learning_rate": 3.327140372953452e-08, + "loss": 1.3662, + "step": 78128 + }, + { + "epoch": 0.9766494162354059, + "grad_norm": 2.4847500324249268, + "learning_rate": 3.3200312395230695e-08, + "loss": 1.5958, + "step": 78130 + }, + { + "epoch": 0.9766744168604216, + "grad_norm": 0.4033398926258087, + "learning_rate": 3.3129296966663004e-08, + "loss": 0.7112, + "step": 78132 + }, + { + "epoch": 0.9766994174854371, + "grad_norm": 4.3985066413879395, + "learning_rate": 3.3058357444373246e-08, + "loss": 0.65, + "step": 78134 + }, + { + "epoch": 0.9767244181104527, + "grad_norm": 3.1911282539367676, + "learning_rate": 3.298749382890099e-08, + "loss": 1.2337, + "step": 78136 + }, + { + "epoch": 0.9767494187354684, + "grad_norm": 2.5286107063293457, + "learning_rate": 3.29167061207869e-08, + "loss": 0.4818, + "step": 78138 + }, + { + "epoch": 0.976774419360484, + "grad_norm": 2.481353998184204, + "learning_rate": 3.2845994320570564e-08, + "loss": 1.1258, + "step": 78140 + }, + { + "epoch": 0.9767994199854997, + "grad_norm": 2.3570687770843506, + "learning_rate": 3.2775358428788204e-08, + "loss": 1.2533, + "step": 78142 + }, + { + "epoch": 0.9768244206105152, + "grad_norm": 3.0208489894866943, + "learning_rate": 3.270479844598051e-08, + "loss": 0.4142, + "step": 78144 + }, + { + "epoch": 0.9768494212355309, + "grad_norm": 3.847729206085205, + "learning_rate": 3.2634314372682606e-08, + "loss": 1.1204, + "step": 78146 + }, + { + "epoch": 0.9768744218605465, + "grad_norm": 2.191052198410034, + "learning_rate": 3.256390620943295e-08, + "loss": 0.3369, + "step": 78148 + }, + { + "epoch": 0.9768994224855622, + "grad_norm": 4.42460298538208, + "learning_rate": 3.249357395676667e-08, + "loss": 0.9979, + "step": 78150 + }, + { + "epoch": 0.9769244231105778, + "grad_norm": 6.1873393058776855, + "learning_rate": 3.242331761522e-08, + "loss": 1.824, + "step": 78152 + }, + { + "epoch": 0.9769494237355933, + "grad_norm": 2.329411268234253, + "learning_rate": 3.235313718532806e-08, + "loss": 1.1187, + "step": 78154 + }, + { + "epoch": 0.976974424360609, + "grad_norm": 5.26267671585083, + "learning_rate": 3.228303266762489e-08, + "loss": 2.2411, + "step": 78156 + }, + { + "epoch": 0.9769994249856246, + "grad_norm": 4.853044033050537, + "learning_rate": 3.221300406264449e-08, + "loss": 0.6777, + "step": 78158 + }, + { + "epoch": 0.9770244256106403, + "grad_norm": 1.2395938634872437, + "learning_rate": 3.2143051370919776e-08, + "loss": 0.3234, + "step": 78160 + }, + { + "epoch": 0.9770494262356559, + "grad_norm": 1.5383596420288086, + "learning_rate": 3.207317459298476e-08, + "loss": 0.5013, + "step": 78162 + }, + { + "epoch": 0.9770744268606715, + "grad_norm": 4.373340129852295, + "learning_rate": 3.200337372937013e-08, + "loss": 0.6002, + "step": 78164 + }, + { + "epoch": 0.9770994274856871, + "grad_norm": 2.96211314201355, + "learning_rate": 3.193364878060878e-08, + "loss": 0.8669, + "step": 78166 + }, + { + "epoch": 0.9771244281107028, + "grad_norm": 0.004717252217233181, + "learning_rate": 3.186399974723142e-08, + "loss": 1.264, + "step": 78168 + }, + { + "epoch": 0.9771494287357184, + "grad_norm": 4.107375144958496, + "learning_rate": 3.179442662976762e-08, + "loss": 1.215, + "step": 78170 + }, + { + "epoch": 0.9771744293607341, + "grad_norm": 2.943643569946289, + "learning_rate": 3.1724929428748055e-08, + "loss": 0.6977, + "step": 78172 + }, + { + "epoch": 0.9771994299857496, + "grad_norm": 1.0761692523956299, + "learning_rate": 3.16555081447012e-08, + "loss": 0.7355, + "step": 78174 + }, + { + "epoch": 0.9772244306107652, + "grad_norm": 5.043749809265137, + "learning_rate": 3.158616277815662e-08, + "loss": 2.1806, + "step": 78176 + }, + { + "epoch": 0.9772494312357809, + "grad_norm": 3.508619785308838, + "learning_rate": 3.151689332964281e-08, + "loss": 0.4598, + "step": 78178 + }, + { + "epoch": 0.9772744318607965, + "grad_norm": 2.4178056716918945, + "learning_rate": 3.144769979968598e-08, + "loss": 0.3742, + "step": 78180 + }, + { + "epoch": 0.9772994324858122, + "grad_norm": 2.3196802139282227, + "learning_rate": 3.137858218881462e-08, + "loss": 0.9986, + "step": 78182 + }, + { + "epoch": 0.9773244331108277, + "grad_norm": 4.522439956665039, + "learning_rate": 3.130954049755275e-08, + "loss": 0.8403, + "step": 78184 + }, + { + "epoch": 0.9773494337358434, + "grad_norm": 2.650413990020752, + "learning_rate": 3.124057472642883e-08, + "loss": 0.4266, + "step": 78186 + }, + { + "epoch": 0.977374434360859, + "grad_norm": 2.280886173248291, + "learning_rate": 3.117168487596689e-08, + "loss": 1.1593, + "step": 78188 + }, + { + "epoch": 0.9773994349858747, + "grad_norm": 0.09963278472423553, + "learning_rate": 3.110287094669207e-08, + "loss": 0.1184, + "step": 78190 + }, + { + "epoch": 0.9774244356108903, + "grad_norm": 5.729998588562012, + "learning_rate": 3.1034132939127274e-08, + "loss": 1.7624, + "step": 78192 + }, + { + "epoch": 0.9774494362359059, + "grad_norm": 2.3663439750671387, + "learning_rate": 3.096547085379764e-08, + "loss": 1.2293, + "step": 78194 + }, + { + "epoch": 0.9774744368609215, + "grad_norm": 3.3045032024383545, + "learning_rate": 3.089688469122498e-08, + "loss": 0.6913, + "step": 78196 + }, + { + "epoch": 0.9774994374859372, + "grad_norm": 3.5326056480407715, + "learning_rate": 3.08283744519311e-08, + "loss": 0.7274, + "step": 78198 + }, + { + "epoch": 0.9775244381109528, + "grad_norm": 13.687426567077637, + "learning_rate": 3.07599401364389e-08, + "loss": 1.0125, + "step": 78200 + }, + { + "epoch": 0.9775494387359684, + "grad_norm": 3.4620227813720703, + "learning_rate": 3.0691581745269095e-08, + "loss": 1.2545, + "step": 78202 + }, + { + "epoch": 0.977574439360984, + "grad_norm": 0.002532911952584982, + "learning_rate": 3.0623299278943476e-08, + "loss": 0.1276, + "step": 78204 + }, + { + "epoch": 0.9775994399859996, + "grad_norm": 2.544588804244995, + "learning_rate": 3.055509273798052e-08, + "loss": 1.3104, + "step": 78206 + }, + { + "epoch": 0.9776244406110153, + "grad_norm": 3.325678586959839, + "learning_rate": 3.048696212289981e-08, + "loss": 0.6451, + "step": 78208 + }, + { + "epoch": 0.9776494412360309, + "grad_norm": 4.160577297210693, + "learning_rate": 3.041890743421982e-08, + "loss": 1.0019, + "step": 78210 + }, + { + "epoch": 0.9776744418610466, + "grad_norm": 3.0453686714172363, + "learning_rate": 3.035092867246125e-08, + "loss": 1.209, + "step": 78212 + }, + { + "epoch": 0.9776994424860621, + "grad_norm": 1.66775381565094, + "learning_rate": 3.0283025838139246e-08, + "loss": 1.4213, + "step": 78214 + }, + { + "epoch": 0.9777244431110778, + "grad_norm": 1.3045374155044556, + "learning_rate": 3.021519893177116e-08, + "loss": 0.0476, + "step": 78216 + }, + { + "epoch": 0.9777494437360934, + "grad_norm": 1.6622681617736816, + "learning_rate": 3.014744795387547e-08, + "loss": 0.154, + "step": 78218 + }, + { + "epoch": 0.977774444361109, + "grad_norm": 12.673011779785156, + "learning_rate": 3.007977290496733e-08, + "loss": 2.2476, + "step": 78220 + }, + { + "epoch": 0.9777994449861247, + "grad_norm": 0.00029758596792817116, + "learning_rate": 3.001217378556076e-08, + "loss": 0.6262, + "step": 78222 + }, + { + "epoch": 0.9778244456111402, + "grad_norm": 3.3618576526641846, + "learning_rate": 2.994465059617202e-08, + "loss": 1.1413, + "step": 78224 + }, + { + "epoch": 0.9778494462361559, + "grad_norm": 2.1619813442230225, + "learning_rate": 2.987720333731403e-08, + "loss": 0.578, + "step": 78226 + }, + { + "epoch": 0.9778744468611715, + "grad_norm": 6.938656330108643, + "learning_rate": 2.980983200950305e-08, + "loss": 0.8859, + "step": 78228 + }, + { + "epoch": 0.9778994474861872, + "grad_norm": 1.9115768671035767, + "learning_rate": 2.9742536613248662e-08, + "loss": 1.2481, + "step": 78230 + }, + { + "epoch": 0.9779244481112028, + "grad_norm": 0.0002890515315812081, + "learning_rate": 2.9675317149066018e-08, + "loss": 0.0, + "step": 78232 + }, + { + "epoch": 0.9779494487362184, + "grad_norm": 2.071838140487671, + "learning_rate": 2.960817361746693e-08, + "loss": 1.1625, + "step": 78234 + }, + { + "epoch": 0.977974449361234, + "grad_norm": 2.100557327270508, + "learning_rate": 2.9541106018960985e-08, + "loss": 0.7998, + "step": 78236 + }, + { + "epoch": 0.9779994499862497, + "grad_norm": 0.4421014189720154, + "learning_rate": 2.9474114354061112e-08, + "loss": 0.0184, + "step": 78238 + }, + { + "epoch": 0.9780244506112653, + "grad_norm": 2.906860113143921, + "learning_rate": 2.940719862327468e-08, + "loss": 1.5836, + "step": 78240 + }, + { + "epoch": 0.978049451236281, + "grad_norm": 3.914097785949707, + "learning_rate": 2.9340358827114612e-08, + "loss": 1.2759, + "step": 78242 + }, + { + "epoch": 0.9780744518612965, + "grad_norm": 0.0003949182864744216, + "learning_rate": 2.9273594966088283e-08, + "loss": 0.5754, + "step": 78244 + }, + { + "epoch": 0.9780994524863121, + "grad_norm": 3.6773664951324463, + "learning_rate": 2.9206907040703058e-08, + "loss": 0.7562, + "step": 78246 + }, + { + "epoch": 0.9781244531113278, + "grad_norm": 3.247779607772827, + "learning_rate": 2.9140295051469646e-08, + "loss": 0.1769, + "step": 78248 + }, + { + "epoch": 0.9781494537363434, + "grad_norm": 4.0886454582214355, + "learning_rate": 2.9073758998892086e-08, + "loss": 0.7821, + "step": 78250 + }, + { + "epoch": 0.9781744543613591, + "grad_norm": 0.3899106979370117, + "learning_rate": 2.9007298883479973e-08, + "loss": 0.0101, + "step": 78252 + }, + { + "epoch": 0.9781994549863746, + "grad_norm": 2.9140727519989014, + "learning_rate": 2.8940914705737343e-08, + "loss": 1.1916, + "step": 78254 + }, + { + "epoch": 0.9782244556113903, + "grad_norm": 3.4744226932525635, + "learning_rate": 2.8874606466171573e-08, + "loss": 1.074, + "step": 78256 + }, + { + "epoch": 0.9782494562364059, + "grad_norm": 4.156827449798584, + "learning_rate": 2.88083741652867e-08, + "loss": 0.3785, + "step": 78258 + }, + { + "epoch": 0.9782744568614216, + "grad_norm": 2.8606150150299072, + "learning_rate": 2.8742217803586768e-08, + "loss": 1.3228, + "step": 78260 + }, + { + "epoch": 0.9782994574864372, + "grad_norm": 0.17620982229709625, + "learning_rate": 2.867613738157693e-08, + "loss": 0.7988, + "step": 78262 + }, + { + "epoch": 0.9783244581114527, + "grad_norm": 2.9479100704193115, + "learning_rate": 2.861013289975789e-08, + "loss": 0.736, + "step": 78264 + }, + { + "epoch": 0.9783494587364684, + "grad_norm": 4.149778366088867, + "learning_rate": 2.8544204358634808e-08, + "loss": 0.7304, + "step": 78266 + }, + { + "epoch": 0.978374459361484, + "grad_norm": 5.074477672576904, + "learning_rate": 2.8478351758709498e-08, + "loss": 1.9586, + "step": 78268 + }, + { + "epoch": 0.9783994599864997, + "grad_norm": 3.153738260269165, + "learning_rate": 2.8412575100482676e-08, + "loss": 0.8673, + "step": 78270 + }, + { + "epoch": 0.9784244606115153, + "grad_norm": 3.8178744316101074, + "learning_rate": 2.8346874384455048e-08, + "loss": 0.1445, + "step": 78272 + }, + { + "epoch": 0.9784494612365309, + "grad_norm": 6.386853218078613, + "learning_rate": 2.8281249611128435e-08, + "loss": 1.9052, + "step": 78274 + }, + { + "epoch": 0.9784744618615465, + "grad_norm": 3.745487689971924, + "learning_rate": 2.8215700781001332e-08, + "loss": 1.0265, + "step": 78276 + }, + { + "epoch": 0.9784994624865622, + "grad_norm": 0.004103466868400574, + "learning_rate": 2.8150227894572225e-08, + "loss": 0.5027, + "step": 78278 + }, + { + "epoch": 0.9785244631115778, + "grad_norm": 2.9604454040527344, + "learning_rate": 2.8084830952341822e-08, + "loss": 1.1239, + "step": 78280 + }, + { + "epoch": 0.9785494637365935, + "grad_norm": 2.151520252227783, + "learning_rate": 2.801950995480751e-08, + "loss": 0.6803, + "step": 78282 + }, + { + "epoch": 0.978574464361609, + "grad_norm": 2.5404741764068604, + "learning_rate": 2.7954264902465554e-08, + "loss": 1.4903, + "step": 78284 + }, + { + "epoch": 0.9785994649866246, + "grad_norm": 2.22769832611084, + "learning_rate": 2.7889095795814446e-08, + "loss": 0.1041, + "step": 78286 + }, + { + "epoch": 0.9786244656116403, + "grad_norm": 3.2318367958068848, + "learning_rate": 2.7824002635349344e-08, + "loss": 1.209, + "step": 78288 + }, + { + "epoch": 0.9786494662366559, + "grad_norm": 3.337338447570801, + "learning_rate": 2.775898542156763e-08, + "loss": 0.6597, + "step": 78290 + }, + { + "epoch": 0.9786744668616716, + "grad_norm": 4.798957824707031, + "learning_rate": 2.769404415496113e-08, + "loss": 1.7324, + "step": 78292 + }, + { + "epoch": 0.9786994674866871, + "grad_norm": 8.448922157287598, + "learning_rate": 2.7629178836028337e-08, + "loss": 0.7355, + "step": 78294 + }, + { + "epoch": 0.9787244681117028, + "grad_norm": 0.0003411358338780701, + "learning_rate": 2.7564389465261076e-08, + "loss": 0.0138, + "step": 78296 + }, + { + "epoch": 0.9787494687367184, + "grad_norm": 3.266096353530884, + "learning_rate": 2.74996760431534e-08, + "loss": 2.3917, + "step": 78298 + }, + { + "epoch": 0.9787744693617341, + "grad_norm": 0.7106884121894836, + "learning_rate": 2.7435038570197135e-08, + "loss": 0.8588, + "step": 78300 + }, + { + "epoch": 0.9787994699867497, + "grad_norm": 6.529136657714844, + "learning_rate": 2.7370477046886334e-08, + "loss": 1.6492, + "step": 78302 + }, + { + "epoch": 0.9788244706117653, + "grad_norm": 2.408921957015991, + "learning_rate": 2.7305991473710604e-08, + "loss": 0.7747, + "step": 78304 + }, + { + "epoch": 0.9788494712367809, + "grad_norm": 4.028102397918701, + "learning_rate": 2.724158185116288e-08, + "loss": 1.1198, + "step": 78306 + }, + { + "epoch": 0.9788744718617965, + "grad_norm": 6.572484016418457, + "learning_rate": 2.7177248179732773e-08, + "loss": 0.9979, + "step": 78308 + }, + { + "epoch": 0.9788994724868122, + "grad_norm": 2.0723350048065186, + "learning_rate": 2.7112990459911004e-08, + "loss": 0.2383, + "step": 78310 + }, + { + "epoch": 0.9789244731118278, + "grad_norm": 0.16533736884593964, + "learning_rate": 2.7048808692186068e-08, + "loss": 0.5571, + "step": 78312 + }, + { + "epoch": 0.9789494737368434, + "grad_norm": 1.9311636686325073, + "learning_rate": 2.6984702877047574e-08, + "loss": 1.1257, + "step": 78314 + }, + { + "epoch": 0.978974474361859, + "grad_norm": 0.0005040206015110016, + "learning_rate": 2.69206730149818e-08, + "loss": 0.5427, + "step": 78316 + }, + { + "epoch": 0.9789994749868747, + "grad_norm": 2.003260374069214, + "learning_rate": 2.6856719106479466e-08, + "loss": 1.109, + "step": 78318 + }, + { + "epoch": 0.9790244756118903, + "grad_norm": 2.929262638092041, + "learning_rate": 2.679284115202463e-08, + "loss": 1.0818, + "step": 78320 + }, + { + "epoch": 0.979049476236906, + "grad_norm": 2.383413314819336, + "learning_rate": 2.6729039152105786e-08, + "loss": 0.374, + "step": 78322 + }, + { + "epoch": 0.9790744768619215, + "grad_norm": 6.297611236572266, + "learning_rate": 2.6665313107209213e-08, + "loss": 0.8802, + "step": 78324 + }, + { + "epoch": 0.9790994774869372, + "grad_norm": 1.5535192489624023, + "learning_rate": 2.6601663017817858e-08, + "loss": 0.0755, + "step": 78326 + }, + { + "epoch": 0.9791244781119528, + "grad_norm": 3.6396822929382324, + "learning_rate": 2.653808888441911e-08, + "loss": 1.048, + "step": 78328 + }, + { + "epoch": 0.9791494787369684, + "grad_norm": 2.543827772140503, + "learning_rate": 2.6474590707494806e-08, + "loss": 1.1738, + "step": 78330 + }, + { + "epoch": 0.9791744793619841, + "grad_norm": 2.4422059059143066, + "learning_rate": 2.6411168487530113e-08, + "loss": 0.1984, + "step": 78332 + }, + { + "epoch": 0.9791994799869996, + "grad_norm": 2.493278741836548, + "learning_rate": 2.634782222500687e-08, + "loss": 1.0875, + "step": 78334 + }, + { + "epoch": 0.9792244806120153, + "grad_norm": 2.2097010612487793, + "learning_rate": 2.6284551920409128e-08, + "loss": 0.9553, + "step": 78336 + }, + { + "epoch": 0.9792494812370309, + "grad_norm": 4.673165798187256, + "learning_rate": 2.622135757421762e-08, + "loss": 1.2193, + "step": 78338 + }, + { + "epoch": 0.9792744818620466, + "grad_norm": 2.914459228515625, + "learning_rate": 2.615823918691418e-08, + "loss": 1.3249, + "step": 78340 + }, + { + "epoch": 0.9792994824870622, + "grad_norm": 1.1268625259399414, + "learning_rate": 2.6095196758978426e-08, + "loss": 0.5641, + "step": 78342 + }, + { + "epoch": 0.9793244831120778, + "grad_norm": 3.5178942680358887, + "learning_rate": 2.603223029089108e-08, + "loss": 1.4769, + "step": 78344 + }, + { + "epoch": 0.9793494837370934, + "grad_norm": 3.3042454719543457, + "learning_rate": 2.5969339783132876e-08, + "loss": 1.1368, + "step": 78346 + }, + { + "epoch": 0.979374484362109, + "grad_norm": 1.076525330543518, + "learning_rate": 2.59065252361812e-08, + "loss": 0.8392, + "step": 78348 + }, + { + "epoch": 0.9793994849871247, + "grad_norm": 32.468780517578125, + "learning_rate": 2.5843786650514568e-08, + "loss": 2.5295, + "step": 78350 + }, + { + "epoch": 0.9794244856121403, + "grad_norm": 2.269669532775879, + "learning_rate": 2.5781124026611483e-08, + "loss": 0.9187, + "step": 78352 + }, + { + "epoch": 0.9794494862371559, + "grad_norm": 1.3100919723510742, + "learning_rate": 2.5718537364948226e-08, + "loss": 0.1843, + "step": 78354 + }, + { + "epoch": 0.9794744868621715, + "grad_norm": 3.454495429992676, + "learning_rate": 2.565602666600331e-08, + "loss": 0.5504, + "step": 78356 + }, + { + "epoch": 0.9794994874871872, + "grad_norm": 2.7416069507598877, + "learning_rate": 2.5593591930250794e-08, + "loss": 0.9077, + "step": 78358 + }, + { + "epoch": 0.9795244881122028, + "grad_norm": 0.00020799081539735198, + "learning_rate": 2.553123315816697e-08, + "loss": 0.3032, + "step": 78360 + }, + { + "epoch": 0.9795494887372185, + "grad_norm": 4.4652228355407715, + "learning_rate": 2.5468950350228118e-08, + "loss": 0.9271, + "step": 78362 + }, + { + "epoch": 0.979574489362234, + "grad_norm": 2.226935625076294, + "learning_rate": 2.5406743506904973e-08, + "loss": 1.0358, + "step": 78364 + }, + { + "epoch": 0.9795994899872497, + "grad_norm": 0.0013430084800347686, + "learning_rate": 2.5344612628674936e-08, + "loss": 1.491, + "step": 78366 + }, + { + "epoch": 0.9796244906122653, + "grad_norm": 3.2915332317352295, + "learning_rate": 2.5282557716009848e-08, + "loss": 1.2191, + "step": 78368 + }, + { + "epoch": 0.979649491237281, + "grad_norm": 4.429869651794434, + "learning_rate": 2.5220578769381555e-08, + "loss": 0.9945, + "step": 78370 + }, + { + "epoch": 0.9796744918622966, + "grad_norm": 3.7563347816467285, + "learning_rate": 2.515867578926301e-08, + "loss": 1.5554, + "step": 78372 + }, + { + "epoch": 0.9796994924873121, + "grad_norm": 3.3997788429260254, + "learning_rate": 2.509684877612606e-08, + "loss": 0.8142, + "step": 78374 + }, + { + "epoch": 0.9797244931123278, + "grad_norm": 4.1719889640808105, + "learning_rate": 2.5035097730440328e-08, + "loss": 1.2932, + "step": 78376 + }, + { + "epoch": 0.9797494937373434, + "grad_norm": 1.6862372159957886, + "learning_rate": 2.497342265267655e-08, + "loss": 0.5298, + "step": 78378 + }, + { + "epoch": 0.9797744943623591, + "grad_norm": 1.0070273876190186, + "learning_rate": 2.491182354330546e-08, + "loss": 0.0986, + "step": 78380 + }, + { + "epoch": 0.9797994949873747, + "grad_norm": 4.211023807525635, + "learning_rate": 2.4850300402794458e-08, + "loss": 0.6688, + "step": 78382 + }, + { + "epoch": 0.9798244956123903, + "grad_norm": 4.302189350128174, + "learning_rate": 2.4788853231613176e-08, + "loss": 0.9815, + "step": 78384 + }, + { + "epoch": 0.9798494962374059, + "grad_norm": 4.180663108825684, + "learning_rate": 2.472748203022901e-08, + "loss": 1.834, + "step": 78386 + }, + { + "epoch": 0.9798744968624216, + "grad_norm": 0.001959794433787465, + "learning_rate": 2.466618679911048e-08, + "loss": 0.0416, + "step": 78388 + }, + { + "epoch": 0.9798994974874372, + "grad_norm": 3.6649978160858154, + "learning_rate": 2.460496753872277e-08, + "loss": 0.656, + "step": 78390 + }, + { + "epoch": 0.9799244981124529, + "grad_norm": 3.2997279167175293, + "learning_rate": 2.4543824249533277e-08, + "loss": 1.6528, + "step": 78392 + }, + { + "epoch": 0.9799494987374684, + "grad_norm": 2.3391849994659424, + "learning_rate": 2.448275693200719e-08, + "loss": 0.8463, + "step": 78394 + }, + { + "epoch": 0.979974499362484, + "grad_norm": 5.006272792816162, + "learning_rate": 2.44217655866108e-08, + "loss": 1.3383, + "step": 78396 + }, + { + "epoch": 0.9799994999874997, + "grad_norm": 4.727162837982178, + "learning_rate": 2.4360850213805963e-08, + "loss": 1.0427, + "step": 78398 + }, + { + "epoch": 0.9800245006125153, + "grad_norm": 3.5581822395324707, + "learning_rate": 2.430001081406008e-08, + "loss": 0.9699, + "step": 78400 + }, + { + "epoch": 0.980049501237531, + "grad_norm": 3.88352108001709, + "learning_rate": 2.4239247387833896e-08, + "loss": 0.4887, + "step": 78402 + }, + { + "epoch": 0.9800745018625465, + "grad_norm": 0.00038731444510631263, + "learning_rate": 2.4178559935590374e-08, + "loss": 0.0812, + "step": 78404 + }, + { + "epoch": 0.9800995024875622, + "grad_norm": 0.0005490063922479749, + "learning_rate": 2.4117948457792474e-08, + "loss": 1.7496, + "step": 78406 + }, + { + "epoch": 0.9801245031125778, + "grad_norm": 2.633145332336426, + "learning_rate": 2.405741295490094e-08, + "loss": 0.6549, + "step": 78408 + }, + { + "epoch": 0.9801495037375935, + "grad_norm": 0.01598222926259041, + "learning_rate": 2.3996953427378733e-08, + "loss": 0.9667, + "step": 78410 + }, + { + "epoch": 0.9801745043626091, + "grad_norm": 3.881844997406006, + "learning_rate": 2.393656987568438e-08, + "loss": 1.5628, + "step": 78412 + }, + { + "epoch": 0.9801995049876246, + "grad_norm": 3.217895030975342, + "learning_rate": 2.3876262300278618e-08, + "loss": 1.227, + "step": 78414 + }, + { + "epoch": 0.9802245056126403, + "grad_norm": 3.2103271484375, + "learning_rate": 2.3816030701619975e-08, + "loss": 0.7483, + "step": 78416 + }, + { + "epoch": 0.9802495062376559, + "grad_norm": 0.00027908655465580523, + "learning_rate": 2.3755875080166968e-08, + "loss": 0.0245, + "step": 78418 + }, + { + "epoch": 0.9802745068626716, + "grad_norm": 1.8788362741470337, + "learning_rate": 2.3695795436380343e-08, + "loss": 0.192, + "step": 78420 + }, + { + "epoch": 0.9802995074876872, + "grad_norm": 4.6815972328186035, + "learning_rate": 2.3635791770714177e-08, + "loss": 0.3026, + "step": 78422 + }, + { + "epoch": 0.9803245081127028, + "grad_norm": 3.2216224670410156, + "learning_rate": 2.3575864083628107e-08, + "loss": 0.7199, + "step": 78424 + }, + { + "epoch": 0.9803495087377184, + "grad_norm": 3.919191598892212, + "learning_rate": 2.3516012375577325e-08, + "loss": 1.8624, + "step": 78426 + }, + { + "epoch": 0.9803745093627341, + "grad_norm": 0.0002692842681426555, + "learning_rate": 2.3456236647017017e-08, + "loss": 0.9269, + "step": 78428 + }, + { + "epoch": 0.9803995099877497, + "grad_norm": 2.151540517807007, + "learning_rate": 2.339653689840349e-08, + "loss": 0.1301, + "step": 78430 + }, + { + "epoch": 0.9804245106127654, + "grad_norm": 0.0005035031354054809, + "learning_rate": 2.3336913130190818e-08, + "loss": 0.1951, + "step": 78432 + }, + { + "epoch": 0.9804495112377809, + "grad_norm": 1.3510470390319824, + "learning_rate": 2.32773653428342e-08, + "loss": 0.2407, + "step": 78434 + }, + { + "epoch": 0.9804745118627965, + "grad_norm": 4.240108966827393, + "learning_rate": 2.3217893536785497e-08, + "loss": 1.3008, + "step": 78436 + }, + { + "epoch": 0.9804995124878122, + "grad_norm": 5.599216461181641, + "learning_rate": 2.3158497712499893e-08, + "loss": 0.4739, + "step": 78438 + }, + { + "epoch": 0.9805245131128278, + "grad_norm": 2.312277317047119, + "learning_rate": 2.3099177870427035e-08, + "loss": 1.2662, + "step": 78440 + }, + { + "epoch": 0.9805495137378435, + "grad_norm": 3.6698591709136963, + "learning_rate": 2.3039934011021002e-08, + "loss": 1.2574, + "step": 78442 + }, + { + "epoch": 0.980574514362859, + "grad_norm": 0.025641130283474922, + "learning_rate": 2.2980766134730325e-08, + "loss": 0.0007, + "step": 78444 + }, + { + "epoch": 0.9805995149878747, + "grad_norm": 3.059539556503296, + "learning_rate": 2.2921674242007974e-08, + "loss": 0.7954, + "step": 78446 + }, + { + "epoch": 0.9806245156128903, + "grad_norm": 24.08504867553711, + "learning_rate": 2.28626583333047e-08, + "loss": 1.2483, + "step": 78448 + }, + { + "epoch": 0.980649516237906, + "grad_norm": 4.109630107879639, + "learning_rate": 2.280371840906681e-08, + "loss": 1.6591, + "step": 78450 + }, + { + "epoch": 0.9806745168629216, + "grad_norm": 2.5510737895965576, + "learning_rate": 2.2744854469746168e-08, + "loss": 1.1499, + "step": 78452 + }, + { + "epoch": 0.9806995174879372, + "grad_norm": 4.412069797515869, + "learning_rate": 2.2686066515787975e-08, + "loss": 1.1875, + "step": 78454 + }, + { + "epoch": 0.9807245181129528, + "grad_norm": 0.023874938488006592, + "learning_rate": 2.2627354547642976e-08, + "loss": 0.7943, + "step": 78456 + }, + { + "epoch": 0.9807495187379685, + "grad_norm": 3.5159740447998047, + "learning_rate": 2.2568718565757487e-08, + "loss": 1.0949, + "step": 78458 + }, + { + "epoch": 0.9807745193629841, + "grad_norm": 3.1773617267608643, + "learning_rate": 2.25101585705767e-08, + "loss": 0.5596, + "step": 78460 + }, + { + "epoch": 0.9807995199879997, + "grad_norm": 0.4123838543891907, + "learning_rate": 2.245167456254804e-08, + "loss": 0.5242, + "step": 78462 + }, + { + "epoch": 0.9808245206130153, + "grad_norm": 1.593700647354126, + "learning_rate": 2.239326654211671e-08, + "loss": 0.9821, + "step": 78464 + }, + { + "epoch": 0.9808495212380309, + "grad_norm": 2.0175790786743164, + "learning_rate": 2.23349345097279e-08, + "loss": 0.9215, + "step": 78466 + }, + { + "epoch": 0.9808745218630466, + "grad_norm": 2.727654457092285, + "learning_rate": 2.2276678465823488e-08, + "loss": 0.9399, + "step": 78468 + }, + { + "epoch": 0.9808995224880622, + "grad_norm": 0.00023793843865860254, + "learning_rate": 2.2218498410849775e-08, + "loss": 0.3791, + "step": 78470 + }, + { + "epoch": 0.9809245231130779, + "grad_norm": 2.5457839965820312, + "learning_rate": 2.2160394345249747e-08, + "loss": 1.1507, + "step": 78472 + }, + { + "epoch": 0.9809495237380934, + "grad_norm": 4.559357643127441, + "learning_rate": 2.210236626946416e-08, + "loss": 1.1138, + "step": 78474 + }, + { + "epoch": 0.9809745243631091, + "grad_norm": 3.2999377250671387, + "learning_rate": 2.20444141839371e-08, + "loss": 0.6232, + "step": 78476 + }, + { + "epoch": 0.9809995249881247, + "grad_norm": 11.158760070800781, + "learning_rate": 2.1986538089108222e-08, + "loss": 1.5415, + "step": 78478 + }, + { + "epoch": 0.9810245256131404, + "grad_norm": 1.0874825716018677, + "learning_rate": 2.192873798541828e-08, + "loss": 0.137, + "step": 78480 + }, + { + "epoch": 0.981049526238156, + "grad_norm": 4.147866725921631, + "learning_rate": 2.1871013873308034e-08, + "loss": 0.9726, + "step": 78482 + }, + { + "epoch": 0.9810745268631715, + "grad_norm": 11.676239013671875, + "learning_rate": 2.1813365753217132e-08, + "loss": 1.343, + "step": 78484 + }, + { + "epoch": 0.9810995274881872, + "grad_norm": 0.0034904794301837683, + "learning_rate": 2.175579362558522e-08, + "loss": 0.1174, + "step": 78486 + }, + { + "epoch": 0.9811245281132028, + "grad_norm": 2.9644758701324463, + "learning_rate": 2.169829749084973e-08, + "loss": 1.2882, + "step": 78488 + }, + { + "epoch": 0.9811495287382185, + "grad_norm": 2.874293088912964, + "learning_rate": 2.1640877349449195e-08, + "loss": 1.0909, + "step": 78490 + }, + { + "epoch": 0.9811745293632341, + "grad_norm": 5.086489677429199, + "learning_rate": 2.158353320182105e-08, + "loss": 0.4717, + "step": 78492 + }, + { + "epoch": 0.9811995299882497, + "grad_norm": 4.502803325653076, + "learning_rate": 2.1526265048400497e-08, + "loss": 1.7012, + "step": 78494 + }, + { + "epoch": 0.9812245306132653, + "grad_norm": 1.4550879001617432, + "learning_rate": 2.1469072889626073e-08, + "loss": 0.239, + "step": 78496 + }, + { + "epoch": 0.981249531238281, + "grad_norm": 3.073307991027832, + "learning_rate": 2.1411956725931882e-08, + "loss": 0.9816, + "step": 78498 + }, + { + "epoch": 0.9812745318632966, + "grad_norm": 3.1712706089019775, + "learning_rate": 2.1354916557752013e-08, + "loss": 0.8793, + "step": 78500 + }, + { + "epoch": 0.9812995324883123, + "grad_norm": 0.0037291711196303368, + "learning_rate": 2.1297952385522792e-08, + "loss": 1.1874, + "step": 78502 + }, + { + "epoch": 0.9813245331133278, + "grad_norm": 4.003004550933838, + "learning_rate": 2.1241064209677198e-08, + "loss": 1.0012, + "step": 78504 + }, + { + "epoch": 0.9813495337383434, + "grad_norm": 4.165358066558838, + "learning_rate": 2.1184252030649332e-08, + "loss": 1.3988, + "step": 78506 + }, + { + "epoch": 0.9813745343633591, + "grad_norm": 5.24838924407959, + "learning_rate": 2.1127515848869963e-08, + "loss": 1.3443, + "step": 78508 + }, + { + "epoch": 0.9813995349883747, + "grad_norm": 2.5766408443450928, + "learning_rate": 2.107085566477207e-08, + "loss": 0.4161, + "step": 78510 + }, + { + "epoch": 0.9814245356133904, + "grad_norm": 3.12117600440979, + "learning_rate": 2.101427147878754e-08, + "loss": 1.3979, + "step": 78512 + }, + { + "epoch": 0.9814495362384059, + "grad_norm": 3.5493507385253906, + "learning_rate": 2.0957763291348244e-08, + "loss": 1.1552, + "step": 78514 + }, + { + "epoch": 0.9814745368634216, + "grad_norm": 2.0308287143707275, + "learning_rate": 2.0901331102882725e-08, + "loss": 0.3594, + "step": 78516 + }, + { + "epoch": 0.9814995374884372, + "grad_norm": 3.5003912448883057, + "learning_rate": 2.0844974913821757e-08, + "loss": 0.3293, + "step": 78518 + }, + { + "epoch": 0.9815245381134529, + "grad_norm": 11.53121566772461, + "learning_rate": 2.078869472459388e-08, + "loss": 1.101, + "step": 78520 + }, + { + "epoch": 0.9815495387384685, + "grad_norm": 2.7912583351135254, + "learning_rate": 2.0732490535627648e-08, + "loss": 0.5281, + "step": 78522 + }, + { + "epoch": 0.981574539363484, + "grad_norm": 0.0002724036166910082, + "learning_rate": 2.0676362347352706e-08, + "loss": 1.0544, + "step": 78524 + }, + { + "epoch": 0.9815995399884997, + "grad_norm": 0.0003915956476703286, + "learning_rate": 2.062031016019428e-08, + "loss": 0.669, + "step": 78526 + }, + { + "epoch": 0.9816245406135153, + "grad_norm": 0.00040065989014692605, + "learning_rate": 2.056433397458202e-08, + "loss": 0.4938, + "step": 78528 + }, + { + "epoch": 0.981649541238531, + "grad_norm": 2.4657461643218994, + "learning_rate": 2.050843379093892e-08, + "loss": 0.6265, + "step": 78530 + }, + { + "epoch": 0.9816745418635466, + "grad_norm": 1.99416184425354, + "learning_rate": 2.045260960969353e-08, + "loss": 0.5574, + "step": 78532 + }, + { + "epoch": 0.9816995424885622, + "grad_norm": 3.483978033065796, + "learning_rate": 2.0396861431268843e-08, + "loss": 1.2223, + "step": 78534 + }, + { + "epoch": 0.9817245431135778, + "grad_norm": 3.5775558948516846, + "learning_rate": 2.0341189256091188e-08, + "loss": 0.3765, + "step": 78536 + }, + { + "epoch": 0.9817495437385935, + "grad_norm": 2.9807746410369873, + "learning_rate": 2.0285593084583555e-08, + "loss": 0.4974, + "step": 78538 + }, + { + "epoch": 0.9817745443636091, + "grad_norm": 6.244427680969238, + "learning_rate": 2.0230072917168943e-08, + "loss": 1.2956, + "step": 78540 + }, + { + "epoch": 0.9817995449886248, + "grad_norm": 0.24123182892799377, + "learning_rate": 2.0174628754271452e-08, + "loss": 0.75, + "step": 78542 + }, + { + "epoch": 0.9818245456136403, + "grad_norm": 1.829243779182434, + "learning_rate": 2.011926059631297e-08, + "loss": 0.1006, + "step": 78544 + }, + { + "epoch": 0.981849546238656, + "grad_norm": 3.3146286010742188, + "learning_rate": 2.0063968443714275e-08, + "loss": 1.404, + "step": 78546 + }, + { + "epoch": 0.9818745468636716, + "grad_norm": 4.797082424163818, + "learning_rate": 2.0008752296897248e-08, + "loss": 1.8182, + "step": 78548 + }, + { + "epoch": 0.9818995474886872, + "grad_norm": 7.028620719909668, + "learning_rate": 1.9953612156282664e-08, + "loss": 1.1126, + "step": 78550 + }, + { + "epoch": 0.9819245481137029, + "grad_norm": 2.488771677017212, + "learning_rate": 1.9898548022289076e-08, + "loss": 0.8026, + "step": 78552 + }, + { + "epoch": 0.9819495487387184, + "grad_norm": 0.0005317401955835521, + "learning_rate": 1.984355989533837e-08, + "loss": 0.7945, + "step": 78554 + }, + { + "epoch": 0.9819745493637341, + "grad_norm": 0.09551596641540527, + "learning_rate": 1.978864777584688e-08, + "loss": 0.851, + "step": 78556 + }, + { + "epoch": 0.9819995499887497, + "grad_norm": 0.45401206612586975, + "learning_rate": 1.973381166423316e-08, + "loss": 0.3657, + "step": 78558 + }, + { + "epoch": 0.9820245506137654, + "grad_norm": 6.233402729034424, + "learning_rate": 1.9679051560916874e-08, + "loss": 1.0121, + "step": 78560 + }, + { + "epoch": 0.982049551238781, + "grad_norm": 4.415598392486572, + "learning_rate": 1.9624367466312134e-08, + "loss": 1.6504, + "step": 78562 + }, + { + "epoch": 0.9820745518637966, + "grad_norm": 4.154367446899414, + "learning_rate": 1.9569759380837493e-08, + "loss": 0.824, + "step": 78564 + }, + { + "epoch": 0.9820995524888122, + "grad_norm": 1.4256415367126465, + "learning_rate": 1.951522730490818e-08, + "loss": 1.0702, + "step": 78566 + }, + { + "epoch": 0.9821245531138278, + "grad_norm": 4.304340839385986, + "learning_rate": 1.9460771238939413e-08, + "loss": 0.2518, + "step": 78568 + }, + { + "epoch": 0.9821495537388435, + "grad_norm": 7.238150119781494, + "learning_rate": 1.9406391183346417e-08, + "loss": 1.528, + "step": 78570 + }, + { + "epoch": 0.9821745543638591, + "grad_norm": 1.4150711297988892, + "learning_rate": 1.9352087138543308e-08, + "loss": 1.0379, + "step": 78572 + }, + { + "epoch": 0.9821995549888747, + "grad_norm": 4.3079047203063965, + "learning_rate": 1.9297859104941972e-08, + "loss": 1.6981, + "step": 78574 + }, + { + "epoch": 0.9822245556138903, + "grad_norm": 0.0002506720193196088, + "learning_rate": 1.9243707082957642e-08, + "loss": 0.4804, + "step": 78576 + }, + { + "epoch": 0.982249556238906, + "grad_norm": 7.789626121520996, + "learning_rate": 1.918963107300109e-08, + "loss": 0.7977, + "step": 78578 + }, + { + "epoch": 0.9822745568639216, + "grad_norm": 2.422283172607422, + "learning_rate": 1.913563107548644e-08, + "loss": 0.4463, + "step": 78580 + }, + { + "epoch": 0.9822995574889373, + "grad_norm": 3.1430041790008545, + "learning_rate": 1.9081707090822242e-08, + "loss": 1.8712, + "step": 78582 + }, + { + "epoch": 0.9823245581139528, + "grad_norm": 3.7802717685699463, + "learning_rate": 1.90278591194204e-08, + "loss": 0.7784, + "step": 78584 + }, + { + "epoch": 0.9823495587389685, + "grad_norm": 1.0049391984939575, + "learning_rate": 1.897408716169058e-08, + "loss": 0.0582, + "step": 78586 + }, + { + "epoch": 0.9823745593639841, + "grad_norm": 4.409783840179443, + "learning_rate": 1.8920391218043564e-08, + "loss": 0.4756, + "step": 78588 + }, + { + "epoch": 0.9823995599889997, + "grad_norm": 3.031548500061035, + "learning_rate": 1.886677128888681e-08, + "loss": 1.7976, + "step": 78590 + }, + { + "epoch": 0.9824245606140154, + "grad_norm": 2.7774600982666016, + "learning_rate": 1.8813227374629984e-08, + "loss": 0.2806, + "step": 78592 + }, + { + "epoch": 0.9824495612390309, + "grad_norm": 4.0578389167785645, + "learning_rate": 1.875975947567943e-08, + "loss": 1.0364, + "step": 78594 + }, + { + "epoch": 0.9824745618640466, + "grad_norm": 0.0005987230688333511, + "learning_rate": 1.870636759244371e-08, + "loss": 0.6798, + "step": 78596 + }, + { + "epoch": 0.9824995624890622, + "grad_norm": 0.00033654493745416403, + "learning_rate": 1.8653051725328052e-08, + "loss": 0.3147, + "step": 78598 + }, + { + "epoch": 0.9825245631140779, + "grad_norm": 4.460149765014648, + "learning_rate": 1.8599811874739915e-08, + "loss": 2.0492, + "step": 78600 + }, + { + "epoch": 0.9825495637390935, + "grad_norm": 4.973156452178955, + "learning_rate": 1.8546648041083415e-08, + "loss": 1.0493, + "step": 78602 + }, + { + "epoch": 0.9825745643641091, + "grad_norm": 2.158381938934326, + "learning_rate": 1.8493560224763785e-08, + "loss": 0.9679, + "step": 78604 + }, + { + "epoch": 0.9825995649891247, + "grad_norm": 3.1812357902526855, + "learning_rate": 1.8440548426186256e-08, + "loss": 0.5563, + "step": 78606 + }, + { + "epoch": 0.9826245656141404, + "grad_norm": 15.518559455871582, + "learning_rate": 1.8387612645754948e-08, + "loss": 1.6948, + "step": 78608 + }, + { + "epoch": 0.982649566239156, + "grad_norm": 3.5563528537750244, + "learning_rate": 1.8334752883870654e-08, + "loss": 0.9358, + "step": 78610 + }, + { + "epoch": 0.9826745668641717, + "grad_norm": 4.570616722106934, + "learning_rate": 1.8281969140938604e-08, + "loss": 1.3281, + "step": 78612 + }, + { + "epoch": 0.9826995674891872, + "grad_norm": 3.558192253112793, + "learning_rate": 1.822926141735848e-08, + "loss": 0.6441, + "step": 78614 + }, + { + "epoch": 0.9827245681142028, + "grad_norm": 1.366843581199646, + "learning_rate": 1.8176629713533288e-08, + "loss": 0.0563, + "step": 78616 + }, + { + "epoch": 0.9827495687392185, + "grad_norm": 0.6222503185272217, + "learning_rate": 1.8124074029862713e-08, + "loss": 0.7112, + "step": 78618 + }, + { + "epoch": 0.9827745693642341, + "grad_norm": 4.465773105621338, + "learning_rate": 1.8071594366748657e-08, + "loss": 1.1928, + "step": 78620 + }, + { + "epoch": 0.9827995699892498, + "grad_norm": 4.345905780792236, + "learning_rate": 1.8019190724588576e-08, + "loss": 0.8279, + "step": 78622 + }, + { + "epoch": 0.9828245706142653, + "grad_norm": 0.442015677690506, + "learning_rate": 1.796686310378437e-08, + "loss": 0.4948, + "step": 78624 + }, + { + "epoch": 0.982849571239281, + "grad_norm": 0.0012285879347473383, + "learning_rate": 1.7914611504731283e-08, + "loss": 0.2294, + "step": 78626 + }, + { + "epoch": 0.9828745718642966, + "grad_norm": 0.6125035881996155, + "learning_rate": 1.78624359278301e-08, + "loss": 0.0091, + "step": 78628 + }, + { + "epoch": 0.9828995724893123, + "grad_norm": 2.1348016262054443, + "learning_rate": 1.7810336373476067e-08, + "loss": 0.4334, + "step": 78630 + }, + { + "epoch": 0.9829245731143279, + "grad_norm": 2.020751476287842, + "learning_rate": 1.775831284206775e-08, + "loss": 0.4475, + "step": 78632 + }, + { + "epoch": 0.9829495737393434, + "grad_norm": 0.12927520275115967, + "learning_rate": 1.7706365333999276e-08, + "loss": 0.2469, + "step": 78634 + }, + { + "epoch": 0.9829745743643591, + "grad_norm": 2.155529260635376, + "learning_rate": 1.765449384966922e-08, + "loss": 0.5495, + "step": 78636 + }, + { + "epoch": 0.9829995749893747, + "grad_norm": 5.011422634124756, + "learning_rate": 1.7602698389469487e-08, + "loss": 0.889, + "step": 78638 + }, + { + "epoch": 0.9830245756143904, + "grad_norm": 0.010018808767199516, + "learning_rate": 1.7550978953796427e-08, + "loss": 0.633, + "step": 78640 + }, + { + "epoch": 0.983049576239406, + "grad_norm": 0.00027968818903900683, + "learning_rate": 1.7499335543044172e-08, + "loss": 0.4319, + "step": 78642 + }, + { + "epoch": 0.9830745768644216, + "grad_norm": 0.0010011522099375725, + "learning_rate": 1.7447768157603517e-08, + "loss": 0.2943, + "step": 78644 + }, + { + "epoch": 0.9830995774894372, + "grad_norm": 1.608559012413025, + "learning_rate": 1.7396276797870815e-08, + "loss": 0.2267, + "step": 78646 + }, + { + "epoch": 0.9831245781144529, + "grad_norm": 3.624572515487671, + "learning_rate": 1.7344861464235752e-08, + "loss": 0.5545, + "step": 78648 + }, + { + "epoch": 0.9831495787394685, + "grad_norm": 0.0006090040551498532, + "learning_rate": 1.7293522157090236e-08, + "loss": 1.4884, + "step": 78650 + }, + { + "epoch": 0.9831745793644842, + "grad_norm": 0.00032263752655126154, + "learning_rate": 1.724225887682618e-08, + "loss": 0.0447, + "step": 78652 + }, + { + "epoch": 0.9831995799894997, + "grad_norm": 1.9170703887939453, + "learning_rate": 1.7191071623833265e-08, + "loss": 1.0854, + "step": 78654 + }, + { + "epoch": 0.9832245806145153, + "grad_norm": 3.989929676055908, + "learning_rate": 1.713996039850119e-08, + "loss": 0.6611, + "step": 78656 + }, + { + "epoch": 0.983249581239531, + "grad_norm": 0.000314219476422295, + "learning_rate": 1.708892520121963e-08, + "loss": 0.6283, + "step": 78658 + }, + { + "epoch": 0.9832745818645466, + "grad_norm": 1.945730209350586, + "learning_rate": 1.7037966032377172e-08, + "loss": 0.5915, + "step": 78660 + }, + { + "epoch": 0.9832995824895623, + "grad_norm": 3.794612407684326, + "learning_rate": 1.6987082892361285e-08, + "loss": 0.5727, + "step": 78662 + }, + { + "epoch": 0.9833245831145778, + "grad_norm": 0.0003083691408392042, + "learning_rate": 1.693627578156054e-08, + "loss": 1.3132, + "step": 78664 + }, + { + "epoch": 0.9833495837395935, + "grad_norm": 3.168376922607422, + "learning_rate": 1.68855447003613e-08, + "loss": 0.5887, + "step": 78666 + }, + { + "epoch": 0.9833745843646091, + "grad_norm": 3.882564067840576, + "learning_rate": 1.683488964914992e-08, + "loss": 0.7349, + "step": 78668 + }, + { + "epoch": 0.9833995849896248, + "grad_norm": 4.590577125549316, + "learning_rate": 1.678431062831276e-08, + "loss": 1.466, + "step": 78670 + }, + { + "epoch": 0.9834245856146404, + "grad_norm": 19.99538803100586, + "learning_rate": 1.6733807638233957e-08, + "loss": 1.2246, + "step": 78672 + }, + { + "epoch": 0.983449586239656, + "grad_norm": 4.136224746704102, + "learning_rate": 1.6683380679298755e-08, + "loss": 1.1241, + "step": 78674 + }, + { + "epoch": 0.9834745868646716, + "grad_norm": 2.925996780395508, + "learning_rate": 1.6633029751891295e-08, + "loss": 0.656, + "step": 78676 + }, + { + "epoch": 0.9834995874896872, + "grad_norm": 2.5495574474334717, + "learning_rate": 1.65827548563946e-08, + "loss": 1.189, + "step": 78678 + }, + { + "epoch": 0.9835245881147029, + "grad_norm": 3.2251522541046143, + "learning_rate": 1.653255599319281e-08, + "loss": 0.4316, + "step": 78680 + }, + { + "epoch": 0.9835495887397185, + "grad_norm": 0.0014123691944405437, + "learning_rate": 1.6482433162666732e-08, + "loss": 0.2849, + "step": 78682 + }, + { + "epoch": 0.9835745893647341, + "grad_norm": 3.6808760166168213, + "learning_rate": 1.6432386365198283e-08, + "loss": 0.748, + "step": 78684 + }, + { + "epoch": 0.9835995899897497, + "grad_norm": 4.089905738830566, + "learning_rate": 1.6382415601169376e-08, + "loss": 0.7971, + "step": 78686 + }, + { + "epoch": 0.9836245906147654, + "grad_norm": 4.535163402557373, + "learning_rate": 1.6332520870959713e-08, + "loss": 2.09, + "step": 78688 + }, + { + "epoch": 0.983649591239781, + "grad_norm": 2.2569663524627686, + "learning_rate": 1.6282702174950094e-08, + "loss": 0.562, + "step": 78690 + }, + { + "epoch": 0.9836745918647967, + "grad_norm": 4.417870998382568, + "learning_rate": 1.6232959513519108e-08, + "loss": 1.0075, + "step": 78692 + }, + { + "epoch": 0.9836995924898122, + "grad_norm": 3.111933469772339, + "learning_rate": 1.6183292887047565e-08, + "loss": 0.795, + "step": 78694 + }, + { + "epoch": 0.9837245931148278, + "grad_norm": 4.606257438659668, + "learning_rate": 1.6133702295910714e-08, + "loss": 1.5549, + "step": 78696 + }, + { + "epoch": 0.9837495937398435, + "grad_norm": 0.005550777539610863, + "learning_rate": 1.6084187740488257e-08, + "loss": 1.4495, + "step": 78698 + }, + { + "epoch": 0.9837745943648591, + "grad_norm": 4.445783615112305, + "learning_rate": 1.6034749221156554e-08, + "loss": 1.8701, + "step": 78700 + }, + { + "epoch": 0.9837995949898748, + "grad_norm": 0.11744362860918045, + "learning_rate": 1.5985386738293085e-08, + "loss": 0.7159, + "step": 78702 + }, + { + "epoch": 0.9838245956148903, + "grad_norm": 1.8861448764801025, + "learning_rate": 1.5936100292271994e-08, + "loss": 0.3703, + "step": 78704 + }, + { + "epoch": 0.983849596239906, + "grad_norm": 2.9108221530914307, + "learning_rate": 1.588688988347076e-08, + "loss": 1.0972, + "step": 78706 + }, + { + "epoch": 0.9838745968649216, + "grad_norm": 2.7170538902282715, + "learning_rate": 1.583775551226241e-08, + "loss": 1.4506, + "step": 78708 + }, + { + "epoch": 0.9838995974899373, + "grad_norm": 3.730365514755249, + "learning_rate": 1.5788697179021094e-08, + "loss": 0.7149, + "step": 78710 + }, + { + "epoch": 0.9839245981149529, + "grad_norm": 5.594034671783447, + "learning_rate": 1.573971488412207e-08, + "loss": 0.6029, + "step": 78712 + }, + { + "epoch": 0.9839495987399685, + "grad_norm": 3.6964406967163086, + "learning_rate": 1.569080862793837e-08, + "loss": 0.9734, + "step": 78714 + }, + { + "epoch": 0.9839745993649841, + "grad_norm": 11.686387062072754, + "learning_rate": 1.5641978410839697e-08, + "loss": 1.8916, + "step": 78716 + }, + { + "epoch": 0.9839995999899998, + "grad_norm": 3.6831891536712646, + "learning_rate": 1.559322423320131e-08, + "loss": 1.2358, + "step": 78718 + }, + { + "epoch": 0.9840246006150154, + "grad_norm": 3.4361109733581543, + "learning_rate": 1.55445460953918e-08, + "loss": 0.4785, + "step": 78720 + }, + { + "epoch": 0.984049601240031, + "grad_norm": 3.028456926345825, + "learning_rate": 1.5495943997784202e-08, + "loss": 1.0464, + "step": 78722 + }, + { + "epoch": 0.9840746018650466, + "grad_norm": 4.447925567626953, + "learning_rate": 1.544741794074711e-08, + "loss": 1.0684, + "step": 78724 + }, + { + "epoch": 0.9840996024900622, + "grad_norm": 0.8034788370132446, + "learning_rate": 1.5398967924650233e-08, + "loss": 0.4144, + "step": 78726 + }, + { + "epoch": 0.9841246031150779, + "grad_norm": 4.659369468688965, + "learning_rate": 1.535059394986327e-08, + "loss": 1.5945, + "step": 78728 + }, + { + "epoch": 0.9841496037400935, + "grad_norm": 1.3335038423538208, + "learning_rate": 1.5302296016753705e-08, + "loss": 0.3946, + "step": 78730 + }, + { + "epoch": 0.9841746043651092, + "grad_norm": 2.8395090103149414, + "learning_rate": 1.525407412569124e-08, + "loss": 0.1371, + "step": 78732 + }, + { + "epoch": 0.9841996049901247, + "grad_norm": 3.477611541748047, + "learning_rate": 1.520592827704004e-08, + "loss": 1.2655, + "step": 78734 + }, + { + "epoch": 0.9842246056151404, + "grad_norm": 4.260745525360107, + "learning_rate": 1.5157858471169796e-08, + "loss": 2.2942, + "step": 78736 + }, + { + "epoch": 0.984249606240156, + "grad_norm": 3.4101922512054443, + "learning_rate": 1.5109864708444666e-08, + "loss": 0.7472, + "step": 78738 + }, + { + "epoch": 0.9842746068651717, + "grad_norm": 5.805279731750488, + "learning_rate": 1.5061946989231024e-08, + "loss": 1.7893, + "step": 78740 + }, + { + "epoch": 0.9842996074901873, + "grad_norm": 1.855464220046997, + "learning_rate": 1.5014105313893025e-08, + "loss": 0.7348, + "step": 78742 + }, + { + "epoch": 0.9843246081152028, + "grad_norm": 3.8404688835144043, + "learning_rate": 1.4966339682795928e-08, + "loss": 1.9473, + "step": 78744 + }, + { + "epoch": 0.9843496087402185, + "grad_norm": 3.8495569229125977, + "learning_rate": 1.4918650096303888e-08, + "loss": 1.6205, + "step": 78746 + }, + { + "epoch": 0.9843746093652341, + "grad_norm": 0.03494355082511902, + "learning_rate": 1.487103655477884e-08, + "loss": 0.0118, + "step": 78748 + }, + { + "epoch": 0.9843996099902498, + "grad_norm": 3.271488666534424, + "learning_rate": 1.4823499058583823e-08, + "loss": 1.4087, + "step": 78750 + }, + { + "epoch": 0.9844246106152654, + "grad_norm": 2.5193898677825928, + "learning_rate": 1.4776037608081884e-08, + "loss": 0.9239, + "step": 78752 + }, + { + "epoch": 0.984449611240281, + "grad_norm": 3.770728349685669, + "learning_rate": 1.4728652203632731e-08, + "loss": 1.1237, + "step": 78754 + }, + { + "epoch": 0.9844746118652966, + "grad_norm": 3.5589234828948975, + "learning_rate": 1.46813428455983e-08, + "loss": 1.2649, + "step": 78756 + }, + { + "epoch": 0.9844996124903123, + "grad_norm": 1.7664837837219238, + "learning_rate": 1.4634109534338303e-08, + "loss": 0.1858, + "step": 78758 + }, + { + "epoch": 0.9845246131153279, + "grad_norm": 11.763641357421875, + "learning_rate": 1.4586952270213562e-08, + "loss": 1.6337, + "step": 78760 + }, + { + "epoch": 0.9845496137403436, + "grad_norm": 1.6822401285171509, + "learning_rate": 1.4539871053581567e-08, + "loss": 0.5831, + "step": 78762 + }, + { + "epoch": 0.9845746143653591, + "grad_norm": 3.346858024597168, + "learning_rate": 1.4492865884802032e-08, + "loss": 1.1735, + "step": 78764 + }, + { + "epoch": 0.9845996149903747, + "grad_norm": 5.674169063568115, + "learning_rate": 1.444593676423356e-08, + "loss": 1.364, + "step": 78766 + }, + { + "epoch": 0.9846246156153904, + "grad_norm": 6.515005111694336, + "learning_rate": 1.439908369223253e-08, + "loss": 0.7925, + "step": 78768 + }, + { + "epoch": 0.984649616240406, + "grad_norm": 3.7970592975616455, + "learning_rate": 1.4352306669155325e-08, + "loss": 0.385, + "step": 78770 + }, + { + "epoch": 0.9846746168654217, + "grad_norm": 3.03202223777771, + "learning_rate": 1.4305605695359437e-08, + "loss": 1.314, + "step": 78772 + }, + { + "epoch": 0.9846996174904372, + "grad_norm": 2.0320284366607666, + "learning_rate": 1.4258980771200136e-08, + "loss": 0.9085, + "step": 78774 + }, + { + "epoch": 0.9847246181154529, + "grad_norm": 3.488698720932007, + "learning_rate": 1.4212431897031587e-08, + "loss": 1.6532, + "step": 78776 + }, + { + "epoch": 0.9847496187404685, + "grad_norm": 6.016997814178467, + "learning_rate": 1.4165959073210167e-08, + "loss": 1.2967, + "step": 78778 + }, + { + "epoch": 0.9847746193654842, + "grad_norm": 3.6023707389831543, + "learning_rate": 1.411956230008782e-08, + "loss": 1.0943, + "step": 78780 + }, + { + "epoch": 0.9847996199904998, + "grad_norm": 2.2387380599975586, + "learning_rate": 1.4073241578018703e-08, + "loss": 0.778, + "step": 78782 + }, + { + "epoch": 0.9848246206155153, + "grad_norm": 4.56941556930542, + "learning_rate": 1.4026996907355872e-08, + "loss": 1.6742, + "step": 78784 + }, + { + "epoch": 0.984849621240531, + "grad_norm": 0.021112171933054924, + "learning_rate": 1.3980828288451264e-08, + "loss": 0.8503, + "step": 78786 + }, + { + "epoch": 0.9848746218655466, + "grad_norm": 2.8517565727233887, + "learning_rate": 1.3934735721656823e-08, + "loss": 0.8301, + "step": 78788 + }, + { + "epoch": 0.9848996224905623, + "grad_norm": 2.9509944915771484, + "learning_rate": 1.3888719207323375e-08, + "loss": 0.8529, + "step": 78790 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 3.1356430053710938, + "learning_rate": 1.3842778745800644e-08, + "loss": 1.4476, + "step": 78792 + }, + { + "epoch": 0.9849496237405935, + "grad_norm": 2.8662941455841064, + "learning_rate": 1.379691433743946e-08, + "loss": 0.382, + "step": 78794 + }, + { + "epoch": 0.9849746243656091, + "grad_norm": 0.6007251143455505, + "learning_rate": 1.3751125982588431e-08, + "loss": 0.3896, + "step": 78796 + }, + { + "epoch": 0.9849996249906248, + "grad_norm": 3.7871947288513184, + "learning_rate": 1.370541368159728e-08, + "loss": 1.3308, + "step": 78798 + }, + { + "epoch": 0.9850246256156404, + "grad_norm": 4.527100563049316, + "learning_rate": 1.3659777434813504e-08, + "loss": 0.9156, + "step": 78800 + }, + { + "epoch": 0.9850496262406561, + "grad_norm": 4.808640956878662, + "learning_rate": 1.3614217242583495e-08, + "loss": 2.055, + "step": 78802 + }, + { + "epoch": 0.9850746268656716, + "grad_norm": 0.840692400932312, + "learning_rate": 1.3568733105256971e-08, + "loss": 0.3679, + "step": 78804 + }, + { + "epoch": 0.9850996274906872, + "grad_norm": 9.837312698364258, + "learning_rate": 1.3523325023178103e-08, + "loss": 0.86, + "step": 78806 + }, + { + "epoch": 0.9851246281157029, + "grad_norm": 2.5918996334075928, + "learning_rate": 1.3477992996693279e-08, + "loss": 2.6064, + "step": 78808 + }, + { + "epoch": 0.9851496287407185, + "grad_norm": 4.409870624542236, + "learning_rate": 1.3432737026147779e-08, + "loss": 1.1592, + "step": 78810 + }, + { + "epoch": 0.9851746293657342, + "grad_norm": 0.14444813132286072, + "learning_rate": 1.3387557111886885e-08, + "loss": 0.9869, + "step": 78812 + }, + { + "epoch": 0.9851996299907497, + "grad_norm": 8.528326988220215, + "learning_rate": 1.3342453254253652e-08, + "loss": 2.4566, + "step": 78814 + }, + { + "epoch": 0.9852246306157654, + "grad_norm": 5.4541916847229, + "learning_rate": 1.3297425453591139e-08, + "loss": 2.1797, + "step": 78816 + }, + { + "epoch": 0.985249631240781, + "grad_norm": 0.8905065059661865, + "learning_rate": 1.325247371024463e-08, + "loss": 0.4757, + "step": 78818 + }, + { + "epoch": 0.9852746318657967, + "grad_norm": 2.872840404510498, + "learning_rate": 1.3207598024554957e-08, + "loss": 1.4623, + "step": 78820 + }, + { + "epoch": 0.9852996324908123, + "grad_norm": 1.1482247114181519, + "learning_rate": 1.3162798396862964e-08, + "loss": 0.2998, + "step": 78822 + }, + { + "epoch": 0.9853246331158279, + "grad_norm": 4.51438570022583, + "learning_rate": 1.3118074827511706e-08, + "loss": 0.3938, + "step": 78824 + }, + { + "epoch": 0.9853496337408435, + "grad_norm": 5.816954612731934, + "learning_rate": 1.3073427316840914e-08, + "loss": 0.7919, + "step": 78826 + }, + { + "epoch": 0.9853746343658591, + "grad_norm": 3.336616277694702, + "learning_rate": 1.3028855865190315e-08, + "loss": 1.2528, + "step": 78828 + }, + { + "epoch": 0.9853996349908748, + "grad_norm": 2.799463987350464, + "learning_rate": 1.2984360472899637e-08, + "loss": 0.1632, + "step": 78830 + }, + { + "epoch": 0.9854246356158904, + "grad_norm": 2.8580174446105957, + "learning_rate": 1.2939941140307499e-08, + "loss": 0.8816, + "step": 78832 + }, + { + "epoch": 0.985449636240906, + "grad_norm": 3.7628307342529297, + "learning_rate": 1.2895597867752519e-08, + "loss": 0.1476, + "step": 78834 + }, + { + "epoch": 0.9854746368659216, + "grad_norm": 1.7152568101882935, + "learning_rate": 1.2851330655572202e-08, + "loss": 0.81, + "step": 78836 + }, + { + "epoch": 0.9854996374909373, + "grad_norm": 5.029011249542236, + "learning_rate": 1.280713950410517e-08, + "loss": 0.6331, + "step": 78838 + }, + { + "epoch": 0.9855246381159529, + "grad_norm": 0.013312969356775284, + "learning_rate": 1.2763024413685598e-08, + "loss": 0.5012, + "step": 78840 + }, + { + "epoch": 0.9855496387409686, + "grad_norm": 2.5054619312286377, + "learning_rate": 1.2718985384650995e-08, + "loss": 0.4087, + "step": 78842 + }, + { + "epoch": 0.9855746393659841, + "grad_norm": 3.7254085540771484, + "learning_rate": 1.2675022417335537e-08, + "loss": 1.0412, + "step": 78844 + }, + { + "epoch": 0.9855996399909998, + "grad_norm": 6.21696662902832, + "learning_rate": 1.263113551207562e-08, + "loss": 0.9577, + "step": 78846 + }, + { + "epoch": 0.9856246406160154, + "grad_norm": 0.6520244479179382, + "learning_rate": 1.2587324669203205e-08, + "loss": 0.5191, + "step": 78848 + }, + { + "epoch": 0.985649641241031, + "grad_norm": 2.99833607673645, + "learning_rate": 1.2543589889054686e-08, + "loss": 2.0471, + "step": 78850 + }, + { + "epoch": 0.9856746418660467, + "grad_norm": 5.959710121154785, + "learning_rate": 1.249993117196202e-08, + "loss": 1.1543, + "step": 78852 + }, + { + "epoch": 0.9856996424910622, + "grad_norm": 0.47910305857658386, + "learning_rate": 1.2456348518256056e-08, + "loss": 0.2229, + "step": 78854 + }, + { + "epoch": 0.9857246431160779, + "grad_norm": 0.0013873698189854622, + "learning_rate": 1.241284192827097e-08, + "loss": 0.8832, + "step": 78856 + }, + { + "epoch": 0.9857496437410935, + "grad_norm": 0.0004953276948072016, + "learning_rate": 1.2369411402337606e-08, + "loss": 0.7275, + "step": 78858 + }, + { + "epoch": 0.9857746443661092, + "grad_norm": 2.5601282119750977, + "learning_rate": 1.2326056940786812e-08, + "loss": 0.7598, + "step": 78860 + }, + { + "epoch": 0.9857996449911248, + "grad_norm": 3.959341049194336, + "learning_rate": 1.2282778543947216e-08, + "loss": 0.8499, + "step": 78862 + }, + { + "epoch": 0.9858246456161404, + "grad_norm": 1.4977432489395142, + "learning_rate": 1.2239576212150772e-08, + "loss": 0.0657, + "step": 78864 + }, + { + "epoch": 0.985849646241156, + "grad_norm": 3.1626522541046143, + "learning_rate": 1.2196449945724997e-08, + "loss": 0.9894, + "step": 78866 + }, + { + "epoch": 0.9858746468661717, + "grad_norm": 3.7390992641448975, + "learning_rate": 1.2153399744997407e-08, + "loss": 1.0342, + "step": 78868 + }, + { + "epoch": 0.9858996474911873, + "grad_norm": 2.713733434677124, + "learning_rate": 1.2110425610298848e-08, + "loss": 1.2009, + "step": 78870 + }, + { + "epoch": 0.985924648116203, + "grad_norm": 11.747106552124023, + "learning_rate": 1.2067527541954616e-08, + "loss": 1.0899, + "step": 78872 + }, + { + "epoch": 0.9859496487412185, + "grad_norm": 0.34771859645843506, + "learning_rate": 1.2024705540291115e-08, + "loss": 0.4778, + "step": 78874 + }, + { + "epoch": 0.9859746493662341, + "grad_norm": 1.3422905206680298, + "learning_rate": 1.1981959605634752e-08, + "loss": 0.4956, + "step": 78876 + }, + { + "epoch": 0.9859996499912498, + "grad_norm": 3.494642496109009, + "learning_rate": 1.1939289738311932e-08, + "loss": 1.1211, + "step": 78878 + }, + { + "epoch": 0.9860246506162654, + "grad_norm": 0.0003291540779173374, + "learning_rate": 1.189669593864684e-08, + "loss": 0.1254, + "step": 78880 + }, + { + "epoch": 0.9860496512412811, + "grad_norm": 6.596864700317383, + "learning_rate": 1.1854178206963662e-08, + "loss": 0.8604, + "step": 78882 + }, + { + "epoch": 0.9860746518662966, + "grad_norm": 2.9564852714538574, + "learning_rate": 1.1811736543586582e-08, + "loss": 1.2755, + "step": 78884 + }, + { + "epoch": 0.9860996524913123, + "grad_norm": 3.4893743991851807, + "learning_rate": 1.1769370948839786e-08, + "loss": 0.7298, + "step": 78886 + }, + { + "epoch": 0.9861246531163279, + "grad_norm": 3.646752119064331, + "learning_rate": 1.172708142304413e-08, + "loss": 0.7341, + "step": 78888 + }, + { + "epoch": 0.9861496537413436, + "grad_norm": 0.0018379753455519676, + "learning_rate": 1.1684867966523795e-08, + "loss": 0.0001, + "step": 78890 + }, + { + "epoch": 0.9861746543663592, + "grad_norm": 8.678203582763672, + "learning_rate": 1.1642730579597417e-08, + "loss": 1.778, + "step": 78892 + }, + { + "epoch": 0.9861996549913747, + "grad_norm": 6.9557929039001465, + "learning_rate": 1.1600669262588071e-08, + "loss": 0.8518, + "step": 78894 + }, + { + "epoch": 0.9862246556163904, + "grad_norm": 0.45154327154159546, + "learning_rate": 1.1558684015816613e-08, + "loss": 0.5849, + "step": 78896 + }, + { + "epoch": 0.986249656241406, + "grad_norm": 3.438638210296631, + "learning_rate": 1.1516774839600565e-08, + "loss": 1.3554, + "step": 78898 + }, + { + "epoch": 0.9862746568664217, + "grad_norm": 3.9464826583862305, + "learning_rate": 1.147494173426078e-08, + "loss": 1.3624, + "step": 78900 + }, + { + "epoch": 0.9862996574914373, + "grad_norm": 2.123302936553955, + "learning_rate": 1.1433184700114786e-08, + "loss": 1.1529, + "step": 78902 + }, + { + "epoch": 0.9863246581164529, + "grad_norm": 4.58343505859375, + "learning_rate": 1.1391503737481214e-08, + "loss": 1.4827, + "step": 78904 + }, + { + "epoch": 0.9863496587414685, + "grad_norm": 0.0007233916549012065, + "learning_rate": 1.1349898846677587e-08, + "loss": 1.2816, + "step": 78906 + }, + { + "epoch": 0.9863746593664842, + "grad_norm": 4.146087646484375, + "learning_rate": 1.1308370028020321e-08, + "loss": 0.8407, + "step": 78908 + }, + { + "epoch": 0.9863996599914998, + "grad_norm": 0.00020771812705788761, + "learning_rate": 1.1266917281826939e-08, + "loss": 0.0173, + "step": 78910 + }, + { + "epoch": 0.9864246606165155, + "grad_norm": 5.622827053070068, + "learning_rate": 1.1225540608411634e-08, + "loss": 2.2631, + "step": 78912 + }, + { + "epoch": 0.986449661241531, + "grad_norm": 1.823514461517334, + "learning_rate": 1.1184240008089709e-08, + "loss": 0.7068, + "step": 78914 + }, + { + "epoch": 0.9864746618665466, + "grad_norm": 2.3879055976867676, + "learning_rate": 1.1143015481176467e-08, + "loss": 1.0194, + "step": 78916 + }, + { + "epoch": 0.9864996624915623, + "grad_norm": 3.436295986175537, + "learning_rate": 1.1101867027984992e-08, + "loss": 1.6577, + "step": 78918 + }, + { + "epoch": 0.9865246631165779, + "grad_norm": 1.0880608558654785, + "learning_rate": 1.1060794648829475e-08, + "loss": 0.1316, + "step": 78920 + }, + { + "epoch": 0.9865496637415936, + "grad_norm": 0.0003540976031217724, + "learning_rate": 1.1019798344023003e-08, + "loss": 0.0, + "step": 78922 + }, + { + "epoch": 0.9865746643666091, + "grad_norm": 3.230511426925659, + "learning_rate": 1.0978878113876434e-08, + "loss": 1.7849, + "step": 78924 + }, + { + "epoch": 0.9865996649916248, + "grad_norm": 3.9803946018218994, + "learning_rate": 1.0938033958702853e-08, + "loss": 0.8664, + "step": 78926 + }, + { + "epoch": 0.9866246656166404, + "grad_norm": 2.572373867034912, + "learning_rate": 1.0897265878812014e-08, + "loss": 0.5809, + "step": 78928 + }, + { + "epoch": 0.9866496662416561, + "grad_norm": 0.28786033391952515, + "learning_rate": 1.0856573874514774e-08, + "loss": 0.1711, + "step": 78930 + }, + { + "epoch": 0.9866746668666717, + "grad_norm": 5.323698997497559, + "learning_rate": 1.0815957946122002e-08, + "loss": 0.9254, + "step": 78932 + }, + { + "epoch": 0.9866996674916872, + "grad_norm": 2.94470477104187, + "learning_rate": 1.0775418093941226e-08, + "loss": 1.6034, + "step": 78934 + }, + { + "epoch": 0.9867246681167029, + "grad_norm": 1.6708018779754639, + "learning_rate": 1.0734954318283309e-08, + "loss": 0.8941, + "step": 78936 + }, + { + "epoch": 0.9867496687417185, + "grad_norm": 2.4543333053588867, + "learning_rate": 1.0694566619454671e-08, + "loss": 0.1705, + "step": 78938 + }, + { + "epoch": 0.9867746693667342, + "grad_norm": 1.992149829864502, + "learning_rate": 1.0654254997763958e-08, + "loss": 2.0524, + "step": 78940 + }, + { + "epoch": 0.9867996699917498, + "grad_norm": 3.4561564922332764, + "learning_rate": 1.0614019453517588e-08, + "loss": 0.535, + "step": 78942 + }, + { + "epoch": 0.9868246706167654, + "grad_norm": 3.728104591369629, + "learning_rate": 1.0573859987021984e-08, + "loss": 0.5979, + "step": 78944 + }, + { + "epoch": 0.986849671241781, + "grad_norm": 3.483280897140503, + "learning_rate": 1.0533776598584678e-08, + "loss": 0.278, + "step": 78946 + }, + { + "epoch": 0.9868746718667967, + "grad_norm": 3.576338052749634, + "learning_rate": 1.049376928850765e-08, + "loss": 0.8469, + "step": 78948 + }, + { + "epoch": 0.9868996724918123, + "grad_norm": 4.457220077514648, + "learning_rate": 1.0453838057098432e-08, + "loss": 1.7443, + "step": 78950 + }, + { + "epoch": 0.986924673116828, + "grad_norm": 0.000397468771552667, + "learning_rate": 1.0413982904660113e-08, + "loss": 1.0887, + "step": 78952 + }, + { + "epoch": 0.9869496737418435, + "grad_norm": 4.612015724182129, + "learning_rate": 1.0374203831495788e-08, + "loss": 0.8828, + "step": 78954 + }, + { + "epoch": 0.9869746743668592, + "grad_norm": 3.2808570861816406, + "learning_rate": 1.0334500837909655e-08, + "loss": 0.3616, + "step": 78956 + }, + { + "epoch": 0.9869996749918748, + "grad_norm": 4.419154167175293, + "learning_rate": 1.0294873924202587e-08, + "loss": 1.8732, + "step": 78958 + }, + { + "epoch": 0.9870246756168904, + "grad_norm": 0.0004947613342665136, + "learning_rate": 1.0255323090677671e-08, + "loss": 0.8836, + "step": 78960 + }, + { + "epoch": 0.9870496762419061, + "grad_norm": 3.8170082569122314, + "learning_rate": 1.0215848337634671e-08, + "loss": 0.3156, + "step": 78962 + }, + { + "epoch": 0.9870746768669216, + "grad_norm": 0.7179153561592102, + "learning_rate": 1.0176449665375565e-08, + "loss": 0.4889, + "step": 78964 + }, + { + "epoch": 0.9870996774919373, + "grad_norm": 4.264349460601807, + "learning_rate": 1.0137127074200115e-08, + "loss": 0.5481, + "step": 78966 + }, + { + "epoch": 0.9871246781169529, + "grad_norm": 0.7641893029212952, + "learning_rate": 1.009788056440697e-08, + "loss": 0.3667, + "step": 78968 + }, + { + "epoch": 0.9871496787419686, + "grad_norm": 3.7179300785064697, + "learning_rate": 1.0058710136297e-08, + "loss": 0.534, + "step": 78970 + }, + { + "epoch": 0.9871746793669842, + "grad_norm": 0.0003380689595360309, + "learning_rate": 1.0019615790165527e-08, + "loss": 0.0, + "step": 78972 + }, + { + "epoch": 0.9871996799919998, + "grad_norm": 3.466212511062622, + "learning_rate": 9.980597526313419e-09, + "loss": 1.1897, + "step": 78974 + }, + { + "epoch": 0.9872246806170154, + "grad_norm": 5.57037878036499, + "learning_rate": 9.941655345035994e-09, + "loss": 0.9954, + "step": 78976 + }, + { + "epoch": 0.987249681242031, + "grad_norm": 4.003226280212402, + "learning_rate": 9.902789246629685e-09, + "loss": 1.5045, + "step": 78978 + }, + { + "epoch": 0.9872746818670467, + "grad_norm": 7.006101131439209, + "learning_rate": 9.86399923139092e-09, + "loss": 2.3904, + "step": 78980 + }, + { + "epoch": 0.9872996824920623, + "grad_norm": 2.194058895111084, + "learning_rate": 9.825285299615017e-09, + "loss": 0.3817, + "step": 78982 + }, + { + "epoch": 0.9873246831170779, + "grad_norm": 0.030803866684436798, + "learning_rate": 9.786647451598409e-09, + "loss": 0.556, + "step": 78984 + }, + { + "epoch": 0.9873496837420935, + "grad_norm": 2.7462668418884277, + "learning_rate": 9.748085687631969e-09, + "loss": 0.9823, + "step": 78986 + }, + { + "epoch": 0.9873746843671092, + "grad_norm": 4.035213947296143, + "learning_rate": 9.709600008013243e-09, + "loss": 0.6959, + "step": 78988 + }, + { + "epoch": 0.9873996849921248, + "grad_norm": 0.008174384012818336, + "learning_rate": 9.671190413031994e-09, + "loss": 0.0001, + "step": 78990 + }, + { + "epoch": 0.9874246856171405, + "grad_norm": 1.2127376794815063, + "learning_rate": 9.632856902983545e-09, + "loss": 0.5966, + "step": 78992 + }, + { + "epoch": 0.987449686242156, + "grad_norm": 2.3690145015716553, + "learning_rate": 9.594599478157663e-09, + "loss": 0.5917, + "step": 78994 + }, + { + "epoch": 0.9874746868671717, + "grad_norm": 2.5997347831726074, + "learning_rate": 9.556418138846334e-09, + "loss": 0.4398, + "step": 78996 + }, + { + "epoch": 0.9874996874921873, + "grad_norm": 2.6064155101776123, + "learning_rate": 9.518312885340442e-09, + "loss": 0.8885, + "step": 78998 + }, + { + "epoch": 0.987524688117203, + "grad_norm": 2.7205824851989746, + "learning_rate": 9.480283717930861e-09, + "loss": 1.4247, + "step": 79000 + }, + { + "epoch": 0.9875496887422186, + "grad_norm": 3.170909881591797, + "learning_rate": 9.44233063690736e-09, + "loss": 0.4793, + "step": 79002 + }, + { + "epoch": 0.9875746893672341, + "grad_norm": 2.672548532485962, + "learning_rate": 9.404453642557488e-09, + "loss": 0.1188, + "step": 79004 + }, + { + "epoch": 0.9875996899922498, + "grad_norm": 2.417294979095459, + "learning_rate": 9.3666527351699e-09, + "loss": 1.4818, + "step": 79006 + }, + { + "epoch": 0.9876246906172654, + "grad_norm": 2.779205799102783, + "learning_rate": 9.328927915033259e-09, + "loss": 0.9328, + "step": 79008 + }, + { + "epoch": 0.9876496912422811, + "grad_norm": 1.8073829412460327, + "learning_rate": 9.291279182435109e-09, + "loss": 1.2566, + "step": 79010 + }, + { + "epoch": 0.9876746918672967, + "grad_norm": 7.0758137702941895, + "learning_rate": 9.253706537661888e-09, + "loss": 1.6233, + "step": 79012 + }, + { + "epoch": 0.9876996924923123, + "grad_norm": 1.88614022731781, + "learning_rate": 9.216209981000036e-09, + "loss": 1.0416, + "step": 79014 + }, + { + "epoch": 0.9877246931173279, + "grad_norm": 6.737674236297607, + "learning_rate": 9.178789512733765e-09, + "loss": 0.7343, + "step": 79016 + }, + { + "epoch": 0.9877496937423436, + "grad_norm": 6.399205684661865, + "learning_rate": 9.141445133149518e-09, + "loss": 1.0283, + "step": 79018 + }, + { + "epoch": 0.9877746943673592, + "grad_norm": 0.0003690740850288421, + "learning_rate": 9.1041768425304e-09, + "loss": 1.196, + "step": 79020 + }, + { + "epoch": 0.9877996949923749, + "grad_norm": 0.34064093232154846, + "learning_rate": 9.066984641161736e-09, + "loss": 0.0654, + "step": 79022 + }, + { + "epoch": 0.9878246956173904, + "grad_norm": 5.52321195602417, + "learning_rate": 9.029868529326635e-09, + "loss": 0.5468, + "step": 79024 + }, + { + "epoch": 0.987849696242406, + "grad_norm": 0.8983264565467834, + "learning_rate": 8.992828507305983e-09, + "loss": 0.1671, + "step": 79026 + }, + { + "epoch": 0.9878746968674217, + "grad_norm": 2.8690333366394043, + "learning_rate": 8.955864575383999e-09, + "loss": 1.2742, + "step": 79028 + }, + { + "epoch": 0.9878996974924373, + "grad_norm": 4.303831577301025, + "learning_rate": 8.918976733840456e-09, + "loss": 1.1743, + "step": 79030 + }, + { + "epoch": 0.987924698117453, + "grad_norm": 7.334763526916504, + "learning_rate": 8.882164982958464e-09, + "loss": 0.9791, + "step": 79032 + }, + { + "epoch": 0.9879496987424685, + "grad_norm": 2.346665382385254, + "learning_rate": 8.845429323015575e-09, + "loss": 0.8408, + "step": 79034 + }, + { + "epoch": 0.9879746993674842, + "grad_norm": 2.918691873550415, + "learning_rate": 8.808769754293789e-09, + "loss": 1.844, + "step": 79036 + }, + { + "epoch": 0.9879996999924998, + "grad_norm": 3.957803249359131, + "learning_rate": 8.77218627707066e-09, + "loss": 1.5138, + "step": 79038 + }, + { + "epoch": 0.9880247006175155, + "grad_norm": 7.114437580108643, + "learning_rate": 8.735678891627076e-09, + "loss": 0.5909, + "step": 79040 + }, + { + "epoch": 0.9880497012425311, + "grad_norm": 0.2620559334754944, + "learning_rate": 8.69924759823948e-09, + "loss": 1.0257, + "step": 79042 + }, + { + "epoch": 0.9880747018675466, + "grad_norm": 0.00044321510358713567, + "learning_rate": 8.662892397184319e-09, + "loss": 1.1031, + "step": 79044 + }, + { + "epoch": 0.9880997024925623, + "grad_norm": 0.00023538507230114192, + "learning_rate": 8.62661328874026e-09, + "loss": 0.3287, + "step": 79046 + }, + { + "epoch": 0.9881247031175779, + "grad_norm": 1.642342448234558, + "learning_rate": 8.590410273183747e-09, + "loss": 0.8026, + "step": 79048 + }, + { + "epoch": 0.9881497037425936, + "grad_norm": 3.2970006465911865, + "learning_rate": 8.554283350789005e-09, + "loss": 0.6784, + "step": 79050 + }, + { + "epoch": 0.9881747043676092, + "grad_norm": 2.8180291652679443, + "learning_rate": 8.518232521831371e-09, + "loss": 1.1656, + "step": 79052 + }, + { + "epoch": 0.9881997049926248, + "grad_norm": 2.769467830657959, + "learning_rate": 8.482257786587288e-09, + "loss": 0.5762, + "step": 79054 + }, + { + "epoch": 0.9882247056176404, + "grad_norm": 1.5509159564971924, + "learning_rate": 8.446359145328763e-09, + "loss": 0.0404, + "step": 79056 + }, + { + "epoch": 0.9882497062426561, + "grad_norm": 1.6725393533706665, + "learning_rate": 8.410536598328911e-09, + "loss": 0.8084, + "step": 79058 + }, + { + "epoch": 0.9882747068676717, + "grad_norm": 3.633986473083496, + "learning_rate": 8.374790145863065e-09, + "loss": 1.5945, + "step": 79060 + }, + { + "epoch": 0.9882997074926874, + "grad_norm": 3.125627040863037, + "learning_rate": 8.339119788201011e-09, + "loss": 1.1492, + "step": 79062 + }, + { + "epoch": 0.9883247081177029, + "grad_norm": 3.3314409255981445, + "learning_rate": 8.303525525615863e-09, + "loss": 0.5222, + "step": 79064 + }, + { + "epoch": 0.9883497087427185, + "grad_norm": 3.483954668045044, + "learning_rate": 8.268007358377406e-09, + "loss": 0.6805, + "step": 79066 + }, + { + "epoch": 0.9883747093677342, + "grad_norm": 0.004983924794942141, + "learning_rate": 8.232565286756533e-09, + "loss": 0.6142, + "step": 79068 + }, + { + "epoch": 0.9883997099927498, + "grad_norm": 4.443792819976807, + "learning_rate": 8.19719931102414e-09, + "loss": 1.7602, + "step": 79070 + }, + { + "epoch": 0.9884247106177655, + "grad_norm": 0.0003116494044661522, + "learning_rate": 8.1619094314489e-09, + "loss": 0.7589, + "step": 79072 + }, + { + "epoch": 0.988449711242781, + "grad_norm": 7.497478008270264, + "learning_rate": 8.126695648299487e-09, + "loss": 2.3347, + "step": 79074 + }, + { + "epoch": 0.9884747118677967, + "grad_norm": 2.768660545349121, + "learning_rate": 8.091557961844576e-09, + "loss": 0.9387, + "step": 79076 + }, + { + "epoch": 0.9884997124928123, + "grad_norm": 2.285031318664551, + "learning_rate": 8.05649637235062e-09, + "loss": 0.3348, + "step": 79078 + }, + { + "epoch": 0.988524713117828, + "grad_norm": 4.938207626342773, + "learning_rate": 8.021510880086292e-09, + "loss": 0.6015, + "step": 79080 + }, + { + "epoch": 0.9885497137428436, + "grad_norm": 1.6340734958648682, + "learning_rate": 7.986601485316936e-09, + "loss": 0.7206, + "step": 79082 + }, + { + "epoch": 0.9885747143678592, + "grad_norm": 2.91219425201416, + "learning_rate": 7.951768188307896e-09, + "loss": 0.9853, + "step": 79084 + }, + { + "epoch": 0.9885997149928748, + "grad_norm": 5.62523078918457, + "learning_rate": 7.917010989325624e-09, + "loss": 1.8336, + "step": 79086 + }, + { + "epoch": 0.9886247156178904, + "grad_norm": 5.97376823425293, + "learning_rate": 7.882329888634355e-09, + "loss": 0.7088, + "step": 79088 + }, + { + "epoch": 0.9886497162429061, + "grad_norm": 0.0002406137646175921, + "learning_rate": 7.84772488649832e-09, + "loss": 0.4224, + "step": 79090 + }, + { + "epoch": 0.9886747168679217, + "grad_norm": 0.0004054697055835277, + "learning_rate": 7.813195983180644e-09, + "loss": 0.1336, + "step": 79092 + }, + { + "epoch": 0.9886997174929373, + "grad_norm": 4.996142387390137, + "learning_rate": 7.778743178945557e-09, + "loss": 0.9637, + "step": 79094 + }, + { + "epoch": 0.9887247181179529, + "grad_norm": 0.00036019805702380836, + "learning_rate": 7.744366474053965e-09, + "loss": 0.4836, + "step": 79096 + }, + { + "epoch": 0.9887497187429686, + "grad_norm": 5.112180709838867, + "learning_rate": 7.710065868767879e-09, + "loss": 0.3597, + "step": 79098 + }, + { + "epoch": 0.9887747193679842, + "grad_norm": 2.0162065029144287, + "learning_rate": 7.675841363349312e-09, + "loss": 0.3797, + "step": 79100 + }, + { + "epoch": 0.9887997199929999, + "grad_norm": 0.0013121163938194513, + "learning_rate": 7.641692958058056e-09, + "loss": 0.0, + "step": 79102 + }, + { + "epoch": 0.9888247206180154, + "grad_norm": 0.0032034809701144695, + "learning_rate": 7.607620653155012e-09, + "loss": 0.0001, + "step": 79104 + }, + { + "epoch": 0.988849721243031, + "grad_norm": 2.2420899868011475, + "learning_rate": 7.573624448899975e-09, + "loss": 0.6228, + "step": 79106 + }, + { + "epoch": 0.9888747218680467, + "grad_norm": 1.4787962436676025, + "learning_rate": 7.539704345550514e-09, + "loss": 0.0795, + "step": 79108 + }, + { + "epoch": 0.9888997224930623, + "grad_norm": 3.1797592639923096, + "learning_rate": 7.505860343365312e-09, + "loss": 1.6905, + "step": 79110 + }, + { + "epoch": 0.988924723118078, + "grad_norm": 3.1952860355377197, + "learning_rate": 7.472092442601941e-09, + "loss": 0.2332, + "step": 79112 + }, + { + "epoch": 0.9889497237430935, + "grad_norm": 2.7654919624328613, + "learning_rate": 7.438400643519083e-09, + "loss": 1.0335, + "step": 79114 + }, + { + "epoch": 0.9889747243681092, + "grad_norm": 4.169835090637207, + "learning_rate": 7.40478494637209e-09, + "loss": 1.8035, + "step": 79116 + }, + { + "epoch": 0.9889997249931248, + "grad_norm": 2.7016422748565674, + "learning_rate": 7.371245351416312e-09, + "loss": 0.441, + "step": 79118 + }, + { + "epoch": 0.9890247256181405, + "grad_norm": 4.180826187133789, + "learning_rate": 7.337781858909321e-09, + "loss": 2.2216, + "step": 79120 + }, + { + "epoch": 0.9890497262431561, + "grad_norm": 1.521850347518921, + "learning_rate": 7.304394469104248e-09, + "loss": 0.157, + "step": 79122 + }, + { + "epoch": 0.9890747268681717, + "grad_norm": 2.7336490154266357, + "learning_rate": 7.271083182255334e-09, + "loss": 0.4295, + "step": 79124 + }, + { + "epoch": 0.9890997274931873, + "grad_norm": 0.00040442688623443246, + "learning_rate": 7.23784799861682e-09, + "loss": 0.0204, + "step": 79126 + }, + { + "epoch": 0.989124728118203, + "grad_norm": 6.366827487945557, + "learning_rate": 7.204688918441838e-09, + "loss": 1.5393, + "step": 79128 + }, + { + "epoch": 0.9891497287432186, + "grad_norm": 4.134634971618652, + "learning_rate": 7.171605941983517e-09, + "loss": 0.8548, + "step": 79130 + }, + { + "epoch": 0.9891747293682343, + "grad_norm": 3.3578298091888428, + "learning_rate": 7.138599069492769e-09, + "loss": 0.8584, + "step": 79132 + }, + { + "epoch": 0.9891997299932498, + "grad_norm": 5.094082355499268, + "learning_rate": 7.105668301220503e-09, + "loss": 1.4404, + "step": 79134 + }, + { + "epoch": 0.9892247306182654, + "grad_norm": 12.558112144470215, + "learning_rate": 7.072813637419851e-09, + "loss": 1.4323, + "step": 79136 + }, + { + "epoch": 0.9892497312432811, + "grad_norm": 4.352288722991943, + "learning_rate": 7.040035078338392e-09, + "loss": 1.7007, + "step": 79138 + }, + { + "epoch": 0.9892747318682967, + "grad_norm": 1.064680814743042, + "learning_rate": 7.007332624227037e-09, + "loss": 0.0358, + "step": 79140 + }, + { + "epoch": 0.9892997324933124, + "grad_norm": 0.000319183396641165, + "learning_rate": 6.974706275334475e-09, + "loss": 0.4288, + "step": 79142 + }, + { + "epoch": 0.9893247331183279, + "grad_norm": 0.8715663552284241, + "learning_rate": 6.9421560319105075e-09, + "loss": 0.7001, + "step": 79144 + }, + { + "epoch": 0.9893497337433436, + "grad_norm": 0.0031926098745316267, + "learning_rate": 6.909681894201603e-09, + "loss": 0.0366, + "step": 79146 + }, + { + "epoch": 0.9893747343683592, + "grad_norm": 1.836565375328064, + "learning_rate": 6.877283862454231e-09, + "loss": 0.2934, + "step": 79148 + }, + { + "epoch": 0.9893997349933749, + "grad_norm": 0.0004948152927681804, + "learning_rate": 6.844961936917083e-09, + "loss": 0.3578, + "step": 79150 + }, + { + "epoch": 0.9894247356183905, + "grad_norm": 4.121945858001709, + "learning_rate": 6.812716117835516e-09, + "loss": 1.3297, + "step": 79152 + }, + { + "epoch": 0.989449736243406, + "grad_norm": 0.023606780916452408, + "learning_rate": 6.780546405456001e-09, + "loss": 0.3779, + "step": 79154 + }, + { + "epoch": 0.9894747368684217, + "grad_norm": 4.216557502746582, + "learning_rate": 6.748452800021676e-09, + "loss": 1.194, + "step": 79156 + }, + { + "epoch": 0.9894997374934373, + "grad_norm": 3.754575252532959, + "learning_rate": 6.716435301777902e-09, + "loss": 1.1392, + "step": 79158 + }, + { + "epoch": 0.989524738118453, + "grad_norm": 3.3933088779449463, + "learning_rate": 6.684493910968925e-09, + "loss": 1.2356, + "step": 79160 + }, + { + "epoch": 0.9895497387434686, + "grad_norm": 3.378551959991455, + "learning_rate": 6.6526286278378875e-09, + "loss": 1.5547, + "step": 79162 + }, + { + "epoch": 0.9895747393684842, + "grad_norm": 0.291999489068985, + "learning_rate": 6.620839452626815e-09, + "loss": 0.0069, + "step": 79164 + }, + { + "epoch": 0.9895997399934998, + "grad_norm": 2.5869202613830566, + "learning_rate": 6.589126385577738e-09, + "loss": 0.871, + "step": 79166 + }, + { + "epoch": 0.9896247406185155, + "grad_norm": 3.5414865016937256, + "learning_rate": 6.557489426933794e-09, + "loss": 1.7574, + "step": 79168 + }, + { + "epoch": 0.9896497412435311, + "grad_norm": 8.725521087646484, + "learning_rate": 6.525928576934793e-09, + "loss": 1.2401, + "step": 79170 + }, + { + "epoch": 0.9896747418685468, + "grad_norm": 5.057125091552734, + "learning_rate": 6.494443835820541e-09, + "loss": 0.8757, + "step": 79172 + }, + { + "epoch": 0.9896997424935623, + "grad_norm": 2.6175835132598877, + "learning_rate": 6.463035203830847e-09, + "loss": 1.0686, + "step": 79174 + }, + { + "epoch": 0.9897247431185779, + "grad_norm": 6.918074131011963, + "learning_rate": 6.431702681206631e-09, + "loss": 1.9788, + "step": 79176 + }, + { + "epoch": 0.9897497437435936, + "grad_norm": 2.5974857807159424, + "learning_rate": 6.400446268184368e-09, + "loss": 0.5431, + "step": 79178 + }, + { + "epoch": 0.9897747443686092, + "grad_norm": 2.325796604156494, + "learning_rate": 6.369265965003868e-09, + "loss": 1.0661, + "step": 79180 + }, + { + "epoch": 0.9897997449936249, + "grad_norm": 0.0004047048860229552, + "learning_rate": 6.338161771900497e-09, + "loss": 0.4389, + "step": 79182 + }, + { + "epoch": 0.9898247456186404, + "grad_norm": 2.8081610202789307, + "learning_rate": 6.307133689114064e-09, + "loss": 0.7897, + "step": 79184 + }, + { + "epoch": 0.9898497462436561, + "grad_norm": 4.9655985832214355, + "learning_rate": 6.276181716878827e-09, + "loss": 1.8752, + "step": 79186 + }, + { + "epoch": 0.9898747468686717, + "grad_norm": 2.7945117950439453, + "learning_rate": 6.24530585543015e-09, + "loss": 1.113, + "step": 79188 + }, + { + "epoch": 0.9898997474936874, + "grad_norm": 4.119742393493652, + "learning_rate": 6.214506105004514e-09, + "loss": 1.4684, + "step": 79190 + }, + { + "epoch": 0.989924748118703, + "grad_norm": 0.8864877223968506, + "learning_rate": 6.183782465836175e-09, + "loss": 0.9439, + "step": 79192 + }, + { + "epoch": 0.9899497487437185, + "grad_norm": 2.7073628902435303, + "learning_rate": 6.153134938159388e-09, + "loss": 0.3032, + "step": 79194 + }, + { + "epoch": 0.9899747493687342, + "grad_norm": 2.166494369506836, + "learning_rate": 6.122563522206193e-09, + "loss": 0.6865, + "step": 79196 + }, + { + "epoch": 0.9899997499937498, + "grad_norm": 1.023006558418274, + "learning_rate": 6.092068218210845e-09, + "loss": 0.1939, + "step": 79198 + }, + { + "epoch": 0.9900247506187655, + "grad_norm": 3.0383377075195312, + "learning_rate": 6.061649026404271e-09, + "loss": 0.8112, + "step": 79200 + }, + { + "epoch": 0.9900497512437811, + "grad_norm": 2.804351806640625, + "learning_rate": 6.0313059470196164e-09, + "loss": 0.62, + "step": 79202 + }, + { + "epoch": 0.9900747518687967, + "grad_norm": 3.460620641708374, + "learning_rate": 6.0010389802867e-09, + "loss": 0.9173, + "step": 79204 + }, + { + "epoch": 0.9900997524938123, + "grad_norm": 3.830132246017456, + "learning_rate": 5.970848126436446e-09, + "loss": 0.5005, + "step": 79206 + }, + { + "epoch": 0.990124753118828, + "grad_norm": 2.1434266567230225, + "learning_rate": 5.940733385699782e-09, + "loss": 0.6962, + "step": 79208 + }, + { + "epoch": 0.9901497537438436, + "grad_norm": 1.1829895973205566, + "learning_rate": 5.910694758305413e-09, + "loss": 1.72, + "step": 79210 + }, + { + "epoch": 0.9901747543688593, + "grad_norm": 6.669473648071289, + "learning_rate": 5.880732244480936e-09, + "loss": 1.6423, + "step": 79212 + }, + { + "epoch": 0.9901997549938748, + "grad_norm": 0.0004228632024023682, + "learning_rate": 5.8508458444561655e-09, + "loss": 0.9718, + "step": 79214 + }, + { + "epoch": 0.9902247556188905, + "grad_norm": 2.579540491104126, + "learning_rate": 5.821035558457589e-09, + "loss": 1.1433, + "step": 79216 + }, + { + "epoch": 0.9902497562439061, + "grad_norm": 2.0483670234680176, + "learning_rate": 5.7913013867128e-09, + "loss": 0.5136, + "step": 79218 + }, + { + "epoch": 0.9902747568689217, + "grad_norm": 3.088156223297119, + "learning_rate": 5.761643329448285e-09, + "loss": 2.066, + "step": 79220 + }, + { + "epoch": 0.9902997574939374, + "grad_norm": 3.34442138671875, + "learning_rate": 5.732061386889421e-09, + "loss": 0.5837, + "step": 79222 + }, + { + "epoch": 0.9903247581189529, + "grad_norm": 0.00019257045642007142, + "learning_rate": 5.70255555926158e-09, + "loss": 0.0, + "step": 79224 + }, + { + "epoch": 0.9903497587439686, + "grad_norm": 0.00034459109883755445, + "learning_rate": 5.67312584679125e-09, + "loss": 0.0, + "step": 79226 + }, + { + "epoch": 0.9903747593689842, + "grad_norm": 2.8296854496002197, + "learning_rate": 5.643772249699364e-09, + "loss": 0.8606, + "step": 79228 + }, + { + "epoch": 0.9903997599939999, + "grad_norm": 2.1133711338043213, + "learning_rate": 5.614494768212408e-09, + "loss": 1.3805, + "step": 79230 + }, + { + "epoch": 0.9904247606190155, + "grad_norm": 5.005618572235107, + "learning_rate": 5.585293402551317e-09, + "loss": 1.5911, + "step": 79232 + }, + { + "epoch": 0.9904497612440311, + "grad_norm": 2.944528102874756, + "learning_rate": 5.556168152939245e-09, + "loss": 1.9392, + "step": 79234 + }, + { + "epoch": 0.9904747618690467, + "grad_norm": 2.4220633506774902, + "learning_rate": 5.527119019599348e-09, + "loss": 0.7938, + "step": 79236 + }, + { + "epoch": 0.9904997624940624, + "grad_norm": 3.3969948291778564, + "learning_rate": 5.4981460027503375e-09, + "loss": 1.3845, + "step": 79238 + }, + { + "epoch": 0.990524763119078, + "grad_norm": 0.001328842481598258, + "learning_rate": 5.4692491026142605e-09, + "loss": 0.0, + "step": 79240 + }, + { + "epoch": 0.9905497637440936, + "grad_norm": 0.0030317946802824736, + "learning_rate": 5.44042831941094e-09, + "loss": 0.4959, + "step": 79242 + }, + { + "epoch": 0.9905747643691092, + "grad_norm": 0.6782177090644836, + "learning_rate": 5.4116836533602e-09, + "loss": 0.3084, + "step": 79244 + }, + { + "epoch": 0.9905997649941248, + "grad_norm": 3.415444850921631, + "learning_rate": 5.383015104680755e-09, + "loss": 0.2155, + "step": 79246 + }, + { + "epoch": 0.9906247656191405, + "grad_norm": 4.23134708404541, + "learning_rate": 5.3544226735913195e-09, + "loss": 0.5488, + "step": 79248 + }, + { + "epoch": 0.9906497662441561, + "grad_norm": 4.117277145385742, + "learning_rate": 5.325906360308386e-09, + "loss": 0.922, + "step": 79250 + }, + { + "epoch": 0.9906747668691718, + "grad_norm": 0.011867302469909191, + "learning_rate": 5.297466165050669e-09, + "loss": 0.5166, + "step": 79252 + }, + { + "epoch": 0.9906997674941873, + "grad_norm": 1.3669612407684326, + "learning_rate": 5.2691020880335505e-09, + "loss": 0.8777, + "step": 79254 + }, + { + "epoch": 0.990724768119203, + "grad_norm": 4.3147478103637695, + "learning_rate": 5.240814129473526e-09, + "loss": 1.2847, + "step": 79256 + }, + { + "epoch": 0.9907497687442186, + "grad_norm": 2.839900255203247, + "learning_rate": 5.212602289587087e-09, + "loss": 0.582, + "step": 79258 + }, + { + "epoch": 0.9907747693692343, + "grad_norm": 2.4496304988861084, + "learning_rate": 5.184466568587399e-09, + "loss": 0.939, + "step": 79260 + }, + { + "epoch": 0.9907997699942499, + "grad_norm": 6.064820766448975, + "learning_rate": 5.156406966688732e-09, + "loss": 1.9053, + "step": 79262 + }, + { + "epoch": 0.9908247706192654, + "grad_norm": 0.0006901675369590521, + "learning_rate": 5.1284234841064705e-09, + "loss": 0.5815, + "step": 79264 + }, + { + "epoch": 0.9908497712442811, + "grad_norm": 4.681797981262207, + "learning_rate": 5.100516121052667e-09, + "loss": 1.0396, + "step": 79266 + }, + { + "epoch": 0.9908747718692967, + "grad_norm": 1.0688278675079346, + "learning_rate": 5.072684877739376e-09, + "loss": 0.6548, + "step": 79268 + }, + { + "epoch": 0.9908997724943124, + "grad_norm": 3.23160457611084, + "learning_rate": 5.0449297543786464e-09, + "loss": 0.1983, + "step": 79270 + }, + { + "epoch": 0.990924773119328, + "grad_norm": 7.105162620544434, + "learning_rate": 5.017250751183644e-09, + "loss": 0.5829, + "step": 79272 + }, + { + "epoch": 0.9909497737443436, + "grad_norm": 2.907440423965454, + "learning_rate": 4.98964786836309e-09, + "loss": 0.8936, + "step": 79274 + }, + { + "epoch": 0.9909747743693592, + "grad_norm": 2.6241636276245117, + "learning_rate": 4.962121106127926e-09, + "loss": 0.9209, + "step": 79276 + }, + { + "epoch": 0.9909997749943749, + "grad_norm": 2.7234768867492676, + "learning_rate": 4.934670464687985e-09, + "loss": 0.1904, + "step": 79278 + }, + { + "epoch": 0.9910247756193905, + "grad_norm": 3.2368886470794678, + "learning_rate": 4.907295944251989e-09, + "loss": 1.408, + "step": 79280 + }, + { + "epoch": 0.9910497762444062, + "grad_norm": 5.046431064605713, + "learning_rate": 4.879997545028658e-09, + "loss": 1.8236, + "step": 79282 + }, + { + "epoch": 0.9910747768694217, + "grad_norm": 2.5478646755218506, + "learning_rate": 4.852775267226717e-09, + "loss": 1.7467, + "step": 79284 + }, + { + "epoch": 0.9910997774944373, + "grad_norm": 4.841800689697266, + "learning_rate": 4.825629111051555e-09, + "loss": 0.5018, + "step": 79286 + }, + { + "epoch": 0.991124778119453, + "grad_norm": 1.235473394393921, + "learning_rate": 4.798559076710785e-09, + "loss": 0.6638, + "step": 79288 + }, + { + "epoch": 0.9911497787444686, + "grad_norm": 3.668419122695923, + "learning_rate": 4.7715651644120174e-09, + "loss": 1.155, + "step": 79290 + }, + { + "epoch": 0.9911747793694843, + "grad_norm": 2.4012551307678223, + "learning_rate": 4.744647374358424e-09, + "loss": 1.2657, + "step": 79292 + }, + { + "epoch": 0.9911997799944998, + "grad_norm": 3.128526210784912, + "learning_rate": 4.717805706756506e-09, + "loss": 0.9077, + "step": 79294 + }, + { + "epoch": 0.9912247806195155, + "grad_norm": 1.6880810260772705, + "learning_rate": 4.691040161809435e-09, + "loss": 0.0716, + "step": 79296 + }, + { + "epoch": 0.9912497812445311, + "grad_norm": 5.204298973083496, + "learning_rate": 4.664350739722601e-09, + "loss": 0.6293, + "step": 79298 + }, + { + "epoch": 0.9912747818695468, + "grad_norm": 3.8941473960876465, + "learning_rate": 4.637737440699175e-09, + "loss": 1.0322, + "step": 79300 + }, + { + "epoch": 0.9912997824945624, + "grad_norm": 1.1016159057617188, + "learning_rate": 4.611200264941218e-09, + "loss": 0.5355, + "step": 79302 + }, + { + "epoch": 0.991324783119578, + "grad_norm": 3.2606663703918457, + "learning_rate": 4.58473921264968e-09, + "loss": 0.4554, + "step": 79304 + }, + { + "epoch": 0.9913497837445936, + "grad_norm": 9.309544563293457, + "learning_rate": 4.558354284028843e-09, + "loss": 1.6486, + "step": 79306 + }, + { + "epoch": 0.9913747843696092, + "grad_norm": 0.00027126280474476516, + "learning_rate": 4.532045479277436e-09, + "loss": 0.1334, + "step": 79308 + }, + { + "epoch": 0.9913997849946249, + "grad_norm": 4.0586113929748535, + "learning_rate": 4.505812798596409e-09, + "loss": 1.321, + "step": 79310 + }, + { + "epoch": 0.9914247856196405, + "grad_norm": 3.4124646186828613, + "learning_rate": 4.479656242185604e-09, + "loss": 0.3928, + "step": 79312 + }, + { + "epoch": 0.9914497862446561, + "grad_norm": 3.8904736042022705, + "learning_rate": 4.453575810243749e-09, + "loss": 1.2182, + "step": 79314 + }, + { + "epoch": 0.9914747868696717, + "grad_norm": 0.14626365900039673, + "learning_rate": 4.427571502970684e-09, + "loss": 0.6155, + "step": 79316 + }, + { + "epoch": 0.9914997874946874, + "grad_norm": 3.9416489601135254, + "learning_rate": 4.401643320564031e-09, + "loss": 0.8829, + "step": 79318 + }, + { + "epoch": 0.991524788119703, + "grad_norm": 0.5543373227119446, + "learning_rate": 4.375791263220297e-09, + "loss": 0.7776, + "step": 79320 + }, + { + "epoch": 0.9915497887447187, + "grad_norm": 3.0844662189483643, + "learning_rate": 4.350015331137103e-09, + "loss": 0.7285, + "step": 79322 + }, + { + "epoch": 0.9915747893697342, + "grad_norm": 4.602908611297607, + "learning_rate": 4.324315524509848e-09, + "loss": 1.1668, + "step": 79324 + }, + { + "epoch": 0.9915997899947498, + "grad_norm": 3.597536087036133, + "learning_rate": 4.298691843536151e-09, + "loss": 1.7682, + "step": 79326 + }, + { + "epoch": 0.9916247906197655, + "grad_norm": 1.9505271911621094, + "learning_rate": 4.273144288410302e-09, + "loss": 0.7749, + "step": 79328 + }, + { + "epoch": 0.9916497912447811, + "grad_norm": 0.01796312816441059, + "learning_rate": 4.247672859326591e-09, + "loss": 0.0817, + "step": 79330 + }, + { + "epoch": 0.9916747918697968, + "grad_norm": 3.854398250579834, + "learning_rate": 4.222277556478194e-09, + "loss": 0.5923, + "step": 79332 + }, + { + "epoch": 0.9916997924948123, + "grad_norm": 5.0731024742126465, + "learning_rate": 4.196958380060512e-09, + "loss": 2.177, + "step": 79334 + }, + { + "epoch": 0.991724793119828, + "grad_norm": 0.000652286980766803, + "learning_rate": 4.171715330264503e-09, + "loss": 0.4107, + "step": 79336 + }, + { + "epoch": 0.9917497937448436, + "grad_norm": 3.9390006065368652, + "learning_rate": 4.146548407283346e-09, + "loss": 1.1319, + "step": 79338 + }, + { + "epoch": 0.9917747943698593, + "grad_norm": 1.3980602025985718, + "learning_rate": 4.1214576113091095e-09, + "loss": 0.5425, + "step": 79340 + }, + { + "epoch": 0.9917997949948749, + "grad_norm": 4.47852897644043, + "learning_rate": 4.096442942531642e-09, + "loss": 1.4414, + "step": 79342 + }, + { + "epoch": 0.9918247956198905, + "grad_norm": 1.2312216758728027, + "learning_rate": 4.071504401141901e-09, + "loss": 0.6513, + "step": 79344 + }, + { + "epoch": 0.9918497962449061, + "grad_norm": 3.2441787719726562, + "learning_rate": 4.046641987330846e-09, + "loss": 1.4925, + "step": 79346 + }, + { + "epoch": 0.9918747968699217, + "grad_norm": 3.84622859954834, + "learning_rate": 4.021855701286104e-09, + "loss": 0.6409, + "step": 79348 + }, + { + "epoch": 0.9918997974949374, + "grad_norm": 4.564626693725586, + "learning_rate": 3.9971455431975225e-09, + "loss": 0.774, + "step": 79350 + }, + { + "epoch": 0.991924798119953, + "grad_norm": 3.0900065898895264, + "learning_rate": 3.9725115132538405e-09, + "loss": 0.901, + "step": 79352 + }, + { + "epoch": 0.9919497987449686, + "grad_norm": 6.048504829406738, + "learning_rate": 3.9479536116415754e-09, + "loss": 1.5149, + "step": 79354 + }, + { + "epoch": 0.9919747993699842, + "grad_norm": 13.767480850219727, + "learning_rate": 3.923471838548354e-09, + "loss": 2.7187, + "step": 79356 + }, + { + "epoch": 0.9919997999949999, + "grad_norm": 0.682937741279602, + "learning_rate": 3.899066194159584e-09, + "loss": 0.0328, + "step": 79358 + }, + { + "epoch": 0.9920248006200155, + "grad_norm": 2.673563003540039, + "learning_rate": 3.874736678662894e-09, + "loss": 0.3443, + "step": 79360 + }, + { + "epoch": 0.9920498012450312, + "grad_norm": 0.7719922065734863, + "learning_rate": 3.850483292241469e-09, + "loss": 0.1715, + "step": 79362 + }, + { + "epoch": 0.9920748018700467, + "grad_norm": 3.5712199211120605, + "learning_rate": 3.826306035081828e-09, + "loss": 1.3007, + "step": 79364 + }, + { + "epoch": 0.9920998024950624, + "grad_norm": 2.448744058609009, + "learning_rate": 3.8022049073682675e-09, + "loss": 0.3964, + "step": 79366 + }, + { + "epoch": 0.992124803120078, + "grad_norm": 4.107679843902588, + "learning_rate": 3.778179909282864e-09, + "loss": 0.9531, + "step": 79368 + }, + { + "epoch": 0.9921498037450937, + "grad_norm": 2.4595847129821777, + "learning_rate": 3.754231041008805e-09, + "loss": 0.728, + "step": 79370 + }, + { + "epoch": 0.9921748043701093, + "grad_norm": 2.1820366382598877, + "learning_rate": 3.730358302729275e-09, + "loss": 0.3442, + "step": 79372 + }, + { + "epoch": 0.9921998049951248, + "grad_norm": 4.137874603271484, + "learning_rate": 3.7065616946263537e-09, + "loss": 0.3683, + "step": 79374 + }, + { + "epoch": 0.9922248056201405, + "grad_norm": 4.598886966705322, + "learning_rate": 3.6828412168798955e-09, + "loss": 1.0787, + "step": 79376 + }, + { + "epoch": 0.9922498062451561, + "grad_norm": 3.4347734451293945, + "learning_rate": 3.6591968696708668e-09, + "loss": 0.6525, + "step": 79378 + }, + { + "epoch": 0.9922748068701718, + "grad_norm": 6.210668563842773, + "learning_rate": 3.635628653180234e-09, + "loss": 1.2416, + "step": 79380 + }, + { + "epoch": 0.9922998074951874, + "grad_norm": 2.190375566482544, + "learning_rate": 3.6121365675878538e-09, + "loss": 0.1434, + "step": 79382 + }, + { + "epoch": 0.992324808120203, + "grad_norm": 0.00029950044699944556, + "learning_rate": 3.5887206130713613e-09, + "loss": 0.4997, + "step": 79384 + }, + { + "epoch": 0.9923498087452186, + "grad_norm": 3.431417465209961, + "learning_rate": 3.5653807898083927e-09, + "loss": 0.7195, + "step": 79386 + }, + { + "epoch": 0.9923748093702343, + "grad_norm": 3.9511680603027344, + "learning_rate": 3.5421170979788033e-09, + "loss": 1.5524, + "step": 79388 + }, + { + "epoch": 0.9923998099952499, + "grad_norm": 3.6218481063842773, + "learning_rate": 3.5189295377591193e-09, + "loss": 0.7373, + "step": 79390 + }, + { + "epoch": 0.9924248106202656, + "grad_norm": 5.501852512359619, + "learning_rate": 3.4958181093258657e-09, + "loss": 2.1338, + "step": 79392 + }, + { + "epoch": 0.9924498112452811, + "grad_norm": 0.001980218570679426, + "learning_rate": 3.4727828128544584e-09, + "loss": 0.0164, + "step": 79394 + }, + { + "epoch": 0.9924748118702967, + "grad_norm": 11.627636909484863, + "learning_rate": 3.449823648520312e-09, + "loss": 1.9272, + "step": 79396 + }, + { + "epoch": 0.9924998124953124, + "grad_norm": 3.887136459350586, + "learning_rate": 3.426940616498842e-09, + "loss": 0.8422, + "step": 79398 + }, + { + "epoch": 0.992524813120328, + "grad_norm": 2.7042346000671387, + "learning_rate": 3.404133716964353e-09, + "loss": 0.8461, + "step": 79400 + }, + { + "epoch": 0.9925498137453437, + "grad_norm": 4.346796989440918, + "learning_rate": 3.3814029500900403e-09, + "loss": 1.2226, + "step": 79402 + }, + { + "epoch": 0.9925748143703592, + "grad_norm": 1.1696748733520508, + "learning_rate": 3.3587483160490985e-09, + "loss": 0.4671, + "step": 79404 + }, + { + "epoch": 0.9925998149953749, + "grad_norm": 3.5368292331695557, + "learning_rate": 3.3361698150147228e-09, + "loss": 0.9709, + "step": 79406 + }, + { + "epoch": 0.9926248156203905, + "grad_norm": 4.419189453125, + "learning_rate": 3.3136674471589967e-09, + "loss": 0.6662, + "step": 79408 + }, + { + "epoch": 0.9926498162454062, + "grad_norm": 2.509932279586792, + "learning_rate": 3.2912412126517857e-09, + "loss": 1.4763, + "step": 79410 + }, + { + "epoch": 0.9926748168704218, + "grad_norm": 3.3086297512054443, + "learning_rate": 3.2688911116651734e-09, + "loss": 0.2351, + "step": 79412 + }, + { + "epoch": 0.9926998174954373, + "grad_norm": 3.210329532623291, + "learning_rate": 3.2466171443690243e-09, + "loss": 0.9156, + "step": 79414 + }, + { + "epoch": 0.992724818120453, + "grad_norm": 0.00045030866749584675, + "learning_rate": 3.2244193109332024e-09, + "loss": 0.7352, + "step": 79416 + }, + { + "epoch": 0.9927498187454686, + "grad_norm": 0.0008931062766350806, + "learning_rate": 3.2022976115264616e-09, + "loss": 1.0698, + "step": 79418 + }, + { + "epoch": 0.9927748193704843, + "grad_norm": 0.0007093835156410933, + "learning_rate": 3.1802520463164455e-09, + "loss": 0.4861, + "step": 79420 + }, + { + "epoch": 0.9927998199954999, + "grad_norm": 0.03460674360394478, + "learning_rate": 3.1582826154730184e-09, + "loss": 0.001, + "step": 79422 + }, + { + "epoch": 0.9928248206205155, + "grad_norm": 1.2086864709854126, + "learning_rate": 3.1363893191616037e-09, + "loss": 0.0622, + "step": 79424 + }, + { + "epoch": 0.9928498212455311, + "grad_norm": 3.55908203125, + "learning_rate": 3.1145721575498443e-09, + "loss": 1.3189, + "step": 79426 + }, + { + "epoch": 0.9928748218705468, + "grad_norm": 1.6524908542633057, + "learning_rate": 3.0928311308042746e-09, + "loss": 0.663, + "step": 79428 + }, + { + "epoch": 0.9928998224955624, + "grad_norm": 1.7115397453308105, + "learning_rate": 3.0711662390903173e-09, + "loss": 0.8095, + "step": 79430 + }, + { + "epoch": 0.9929248231205781, + "grad_norm": 3.2144742012023926, + "learning_rate": 3.0495774825722857e-09, + "loss": 0.5818, + "step": 79432 + }, + { + "epoch": 0.9929498237455936, + "grad_norm": 3.3247010707855225, + "learning_rate": 3.0280648614144926e-09, + "loss": 1.352, + "step": 79434 + }, + { + "epoch": 0.9929748243706092, + "grad_norm": 0.0002938227553386241, + "learning_rate": 3.006628375782361e-09, + "loss": 0.05, + "step": 79436 + }, + { + "epoch": 0.9929998249956249, + "grad_norm": 5.130789279937744, + "learning_rate": 2.9852680258368737e-09, + "loss": 1.7356, + "step": 79438 + }, + { + "epoch": 0.9930248256206405, + "grad_norm": 0.00021125894272699952, + "learning_rate": 2.9639838117423436e-09, + "loss": 0.0931, + "step": 79440 + }, + { + "epoch": 0.9930498262456562, + "grad_norm": 3.2305331230163574, + "learning_rate": 2.942775733660863e-09, + "loss": 0.7082, + "step": 79442 + }, + { + "epoch": 0.9930748268706717, + "grad_norm": 4.522041320800781, + "learning_rate": 2.9216437917534144e-09, + "loss": 0.6437, + "step": 79444 + }, + { + "epoch": 0.9930998274956874, + "grad_norm": 1.469078540802002, + "learning_rate": 2.9005879861809805e-09, + "loss": 0.0391, + "step": 79446 + }, + { + "epoch": 0.993124828120703, + "grad_norm": 2.2055587768554688, + "learning_rate": 2.879608317103433e-09, + "loss": 0.2626, + "step": 79448 + }, + { + "epoch": 0.9931498287457187, + "grad_norm": 0.4028800129890442, + "learning_rate": 2.8587047846806435e-09, + "loss": 0.3936, + "step": 79450 + }, + { + "epoch": 0.9931748293707343, + "grad_norm": 3.8909032344818115, + "learning_rate": 2.8378773890735956e-09, + "loss": 0.6288, + "step": 79452 + }, + { + "epoch": 0.9931998299957498, + "grad_norm": 2.098691463470459, + "learning_rate": 2.8171261304388298e-09, + "loss": 1.1237, + "step": 79454 + }, + { + "epoch": 0.9932248306207655, + "grad_norm": 4.891333103179932, + "learning_rate": 2.7964510089351083e-09, + "loss": 1.363, + "step": 79456 + }, + { + "epoch": 0.9932498312457811, + "grad_norm": 0.0006375760422088206, + "learning_rate": 2.7758520247189723e-09, + "loss": 0.4384, + "step": 79458 + }, + { + "epoch": 0.9932748318707968, + "grad_norm": 2.9036049842834473, + "learning_rate": 2.7553291779491842e-09, + "loss": 1.896, + "step": 79460 + }, + { + "epoch": 0.9932998324958124, + "grad_norm": 2.6683156490325928, + "learning_rate": 2.7348824687811746e-09, + "loss": 0.5718, + "step": 79462 + }, + { + "epoch": 0.993324833120828, + "grad_norm": 1.9839513301849365, + "learning_rate": 2.7145118973703756e-09, + "loss": 1.2748, + "step": 79464 + }, + { + "epoch": 0.9933498337458436, + "grad_norm": 0.06751073896884918, + "learning_rate": 2.6942174638722174e-09, + "loss": 0.007, + "step": 79466 + }, + { + "epoch": 0.9933748343708593, + "grad_norm": 6.05349063873291, + "learning_rate": 2.6739991684410217e-09, + "loss": 1.396, + "step": 79468 + }, + { + "epoch": 0.9933998349958749, + "grad_norm": 0.0003092929255217314, + "learning_rate": 2.653857011231109e-09, + "loss": 0.6963, + "step": 79470 + }, + { + "epoch": 0.9934248356208906, + "grad_norm": 7.600192546844482, + "learning_rate": 2.6337909923956908e-09, + "loss": 1.3928, + "step": 79472 + }, + { + "epoch": 0.9934498362459061, + "grad_norm": 0.00029219125281088054, + "learning_rate": 2.6138011120868667e-09, + "loss": 0.4956, + "step": 79474 + }, + { + "epoch": 0.9934748368709218, + "grad_norm": 0.006093061063438654, + "learning_rate": 2.5938873704589584e-09, + "loss": 0.1136, + "step": 79476 + }, + { + "epoch": 0.9934998374959374, + "grad_norm": 3.914048433303833, + "learning_rate": 2.5740497676607355e-09, + "loss": 0.8598, + "step": 79478 + }, + { + "epoch": 0.993524838120953, + "grad_norm": 3.0344786643981934, + "learning_rate": 2.554288303845409e-09, + "loss": 1.0685, + "step": 79480 + }, + { + "epoch": 0.9935498387459687, + "grad_norm": 3.558758497238159, + "learning_rate": 2.534602979162859e-09, + "loss": 1.1575, + "step": 79482 + }, + { + "epoch": 0.9935748393709842, + "grad_norm": 0.23497691750526428, + "learning_rate": 2.5149937937629653e-09, + "loss": 0.0056, + "step": 79484 + }, + { + "epoch": 0.9935998399959999, + "grad_norm": 4.050661563873291, + "learning_rate": 2.495460747795608e-09, + "loss": 0.9895, + "step": 79486 + }, + { + "epoch": 0.9936248406210155, + "grad_norm": 3.397362232208252, + "learning_rate": 2.4760038414084473e-09, + "loss": 1.2941, + "step": 79488 + }, + { + "epoch": 0.9936498412460312, + "grad_norm": 3.299558639526367, + "learning_rate": 2.456623074750253e-09, + "loss": 1.5528, + "step": 79490 + }, + { + "epoch": 0.9936748418710468, + "grad_norm": 5.68729829788208, + "learning_rate": 2.437318447968684e-09, + "loss": 0.1676, + "step": 79492 + }, + { + "epoch": 0.9936998424960624, + "grad_norm": 2.8303897380828857, + "learning_rate": 2.4180899612114007e-09, + "loss": 1.6603, + "step": 79494 + }, + { + "epoch": 0.993724843121078, + "grad_norm": 4.485755920410156, + "learning_rate": 2.398937614623842e-09, + "loss": 1.022, + "step": 79496 + }, + { + "epoch": 0.9937498437460937, + "grad_norm": 0.7916280627250671, + "learning_rate": 2.379861408351447e-09, + "loss": 0.6028, + "step": 79498 + }, + { + "epoch": 0.9937748443711093, + "grad_norm": 3.376810312271118, + "learning_rate": 2.3608613425407657e-09, + "loss": 1.1638, + "step": 79500 + }, + { + "epoch": 0.993799844996125, + "grad_norm": 0.02977890893816948, + "learning_rate": 2.3419374173361264e-09, + "loss": 0.0887, + "step": 79502 + }, + { + "epoch": 0.9938248456211405, + "grad_norm": 0.7375848889350891, + "learning_rate": 2.3230896328807486e-09, + "loss": 0.0386, + "step": 79504 + }, + { + "epoch": 0.9938498462461561, + "grad_norm": 2.776925802230835, + "learning_rate": 2.304317989320071e-09, + "loss": 0.0948, + "step": 79506 + }, + { + "epoch": 0.9938748468711718, + "grad_norm": 0.061128824949264526, + "learning_rate": 2.2856224867950915e-09, + "loss": 0.327, + "step": 79508 + }, + { + "epoch": 0.9938998474961874, + "grad_norm": 8.527914047241211, + "learning_rate": 2.26700312544903e-09, + "loss": 1.1356, + "step": 79510 + }, + { + "epoch": 0.9939248481212031, + "grad_norm": 0.8321837782859802, + "learning_rate": 2.2484599054239944e-09, + "loss": 0.0612, + "step": 79512 + }, + { + "epoch": 0.9939498487462186, + "grad_norm": 4.313229084014893, + "learning_rate": 2.229992826860983e-09, + "loss": 0.8324, + "step": 79514 + }, + { + "epoch": 0.9939748493712343, + "grad_norm": 5.663010120391846, + "learning_rate": 2.2116018898998835e-09, + "loss": 2.5113, + "step": 79516 + }, + { + "epoch": 0.9939998499962499, + "grad_norm": 2.5372564792633057, + "learning_rate": 2.1932870946816955e-09, + "loss": 1.1036, + "step": 79518 + }, + { + "epoch": 0.9940248506212656, + "grad_norm": 3.640706777572632, + "learning_rate": 2.175048441346306e-09, + "loss": 1.3579, + "step": 79520 + }, + { + "epoch": 0.9940498512462812, + "grad_norm": 1.7243261337280273, + "learning_rate": 2.1568859300313827e-09, + "loss": 0.7598, + "step": 79522 + }, + { + "epoch": 0.9940748518712967, + "grad_norm": 4.653295993804932, + "learning_rate": 2.138799560875704e-09, + "loss": 1.3166, + "step": 79524 + }, + { + "epoch": 0.9940998524963124, + "grad_norm": 3.510471820831299, + "learning_rate": 2.1207893340169373e-09, + "loss": 0.6275, + "step": 79526 + }, + { + "epoch": 0.994124853121328, + "grad_norm": 0.8042493462562561, + "learning_rate": 2.10285524959275e-09, + "loss": 0.4549, + "step": 79528 + }, + { + "epoch": 0.9941498537463437, + "grad_norm": 1.955871343612671, + "learning_rate": 2.08499730773859e-09, + "loss": 0.761, + "step": 79530 + }, + { + "epoch": 0.9941748543713593, + "grad_norm": 3.0038774013519287, + "learning_rate": 2.067215508592124e-09, + "loss": 0.6767, + "step": 79532 + }, + { + "epoch": 0.9941998549963749, + "grad_norm": 2.5960302352905273, + "learning_rate": 2.04950985228769e-09, + "loss": 0.8652, + "step": 79534 + }, + { + "epoch": 0.9942248556213905, + "grad_norm": 0.8836200833320618, + "learning_rate": 2.0318803389607345e-09, + "loss": 1.0349, + "step": 79536 + }, + { + "epoch": 0.9942498562464062, + "grad_norm": 1.8726893663406372, + "learning_rate": 2.0143269687444845e-09, + "loss": 0.575, + "step": 79538 + }, + { + "epoch": 0.9942748568714218, + "grad_norm": 3.7409610748291016, + "learning_rate": 1.996849741773277e-09, + "loss": 1.4335, + "step": 79540 + }, + { + "epoch": 0.9942998574964375, + "grad_norm": 9.178518295288086, + "learning_rate": 1.979448658181449e-09, + "loss": 2.2074, + "step": 79542 + }, + { + "epoch": 0.994324858121453, + "grad_norm": 8.408122062683105, + "learning_rate": 1.962123718098896e-09, + "loss": 1.1878, + "step": 79544 + }, + { + "epoch": 0.9943498587464686, + "grad_norm": 0.8655619025230408, + "learning_rate": 1.944874921658846e-09, + "loss": 0.2317, + "step": 79546 + }, + { + "epoch": 0.9943748593714843, + "grad_norm": 3.2138633728027344, + "learning_rate": 1.9277022689934145e-09, + "loss": 1.5057, + "step": 79548 + }, + { + "epoch": 0.9943998599964999, + "grad_norm": 8.82323169708252, + "learning_rate": 1.9106057602336082e-09, + "loss": 0.2169, + "step": 79550 + }, + { + "epoch": 0.9944248606215156, + "grad_norm": 2.4840872287750244, + "learning_rate": 1.893585395507103e-09, + "loss": 0.3095, + "step": 79552 + }, + { + "epoch": 0.9944498612465311, + "grad_norm": 0.5880182385444641, + "learning_rate": 1.8766411749460147e-09, + "loss": 0.4544, + "step": 79554 + }, + { + "epoch": 0.9944748618715468, + "grad_norm": 0.0003380657872185111, + "learning_rate": 1.8597730986791295e-09, + "loss": 0.5355, + "step": 79556 + }, + { + "epoch": 0.9944998624965624, + "grad_norm": 5.401800155639648, + "learning_rate": 1.8429811668330133e-09, + "loss": 1.5517, + "step": 79558 + }, + { + "epoch": 0.9945248631215781, + "grad_norm": 3.637343645095825, + "learning_rate": 1.8262653795375618e-09, + "loss": 1.069, + "step": 79560 + }, + { + "epoch": 0.9945498637465937, + "grad_norm": 1.1998273134231567, + "learning_rate": 1.8096257369193405e-09, + "loss": 0.0846, + "step": 79562 + }, + { + "epoch": 0.9945748643716092, + "grad_norm": 1.4652403593063354, + "learning_rate": 1.7930622391060248e-09, + "loss": 0.5185, + "step": 79564 + }, + { + "epoch": 0.9945998649966249, + "grad_norm": 4.243132591247559, + "learning_rate": 1.7765748862219602e-09, + "loss": 0.8787, + "step": 79566 + }, + { + "epoch": 0.9946248656216405, + "grad_norm": 0.00033208029344677925, + "learning_rate": 1.7601636783937115e-09, + "loss": 0.159, + "step": 79568 + }, + { + "epoch": 0.9946498662466562, + "grad_norm": 2.2949674129486084, + "learning_rate": 1.7438286157456242e-09, + "loss": 0.4835, + "step": 79570 + }, + { + "epoch": 0.9946748668716718, + "grad_norm": 3.558187246322632, + "learning_rate": 1.7275696984031532e-09, + "loss": 0.1822, + "step": 79572 + }, + { + "epoch": 0.9946998674966874, + "grad_norm": 0.5523428320884705, + "learning_rate": 1.7113869264895334e-09, + "loss": 0.5468, + "step": 79574 + }, + { + "epoch": 0.994724868121703, + "grad_norm": 3.979735851287842, + "learning_rate": 1.6952803001291095e-09, + "loss": 1.641, + "step": 79576 + }, + { + "epoch": 0.9947498687467187, + "grad_norm": 4.333659648895264, + "learning_rate": 1.6792498194428963e-09, + "loss": 1.3801, + "step": 79578 + }, + { + "epoch": 0.9947748693717343, + "grad_norm": 0.002299453830346465, + "learning_rate": 1.663295484553018e-09, + "loss": 0.2268, + "step": 79580 + }, + { + "epoch": 0.99479986999675, + "grad_norm": 2.0828380584716797, + "learning_rate": 1.6474172955815993e-09, + "loss": 0.1742, + "step": 79582 + }, + { + "epoch": 0.9948248706217655, + "grad_norm": 1.5783634185791016, + "learning_rate": 1.6316152526496542e-09, + "loss": 0.0503, + "step": 79584 + }, + { + "epoch": 0.9948498712467811, + "grad_norm": 4.080379009246826, + "learning_rate": 1.6158893558781974e-09, + "loss": 0.7682, + "step": 79586 + }, + { + "epoch": 0.9948748718717968, + "grad_norm": 3.1554081439971924, + "learning_rate": 1.6002396053860225e-09, + "loss": 0.8363, + "step": 79588 + }, + { + "epoch": 0.9948998724968124, + "grad_norm": 2.179093599319458, + "learning_rate": 1.5846660012919236e-09, + "loss": 0.5938, + "step": 79590 + }, + { + "epoch": 0.9949248731218281, + "grad_norm": 3.239154815673828, + "learning_rate": 1.5691685437158045e-09, + "loss": 2.0105, + "step": 79592 + }, + { + "epoch": 0.9949498737468436, + "grad_norm": 2.9438154697418213, + "learning_rate": 1.5537472327742387e-09, + "loss": 0.7782, + "step": 79594 + }, + { + "epoch": 0.9949748743718593, + "grad_norm": 1.55923593044281, + "learning_rate": 1.53840206858602e-09, + "loss": 0.6643, + "step": 79596 + }, + { + "epoch": 0.9949998749968749, + "grad_norm": 1.9836483001708984, + "learning_rate": 1.5231330512677224e-09, + "loss": 0.7491, + "step": 79598 + }, + { + "epoch": 0.9950248756218906, + "grad_norm": 4.4720964431762695, + "learning_rate": 1.5079401809348082e-09, + "loss": 1.2087, + "step": 79600 + }, + { + "epoch": 0.9950498762469062, + "grad_norm": 2.8041956424713135, + "learning_rate": 1.4928234577027411e-09, + "loss": 1.0107, + "step": 79602 + }, + { + "epoch": 0.9950748768719218, + "grad_norm": 0.017274625599384308, + "learning_rate": 1.4777828816880945e-09, + "loss": 0.0004, + "step": 79604 + }, + { + "epoch": 0.9950998774969374, + "grad_norm": 4.106812953948975, + "learning_rate": 1.4628184530052214e-09, + "loss": 0.9082, + "step": 79606 + }, + { + "epoch": 0.995124878121953, + "grad_norm": 2.523606538772583, + "learning_rate": 1.4479301717673643e-09, + "loss": 0.9287, + "step": 79608 + }, + { + "epoch": 0.9951498787469687, + "grad_norm": 4.941367149353027, + "learning_rate": 1.4331180380877663e-09, + "loss": 1.0606, + "step": 79610 + }, + { + "epoch": 0.9951748793719843, + "grad_norm": 3.3226945400238037, + "learning_rate": 1.4183820520796698e-09, + "loss": 1.2439, + "step": 79612 + }, + { + "epoch": 0.9951998799969999, + "grad_norm": 0.00022580462973564863, + "learning_rate": 1.4037222138563178e-09, + "loss": 0.5078, + "step": 79614 + }, + { + "epoch": 0.9952248806220155, + "grad_norm": 3.9953269958496094, + "learning_rate": 1.389138523526512e-09, + "loss": 1.7814, + "step": 79616 + }, + { + "epoch": 0.9952498812470312, + "grad_norm": 3.3800370693206787, + "learning_rate": 1.3746309812046054e-09, + "loss": 0.3907, + "step": 79618 + }, + { + "epoch": 0.9952748818720468, + "grad_norm": 2.846156597137451, + "learning_rate": 1.3601995869982897e-09, + "loss": 0.7279, + "step": 79620 + }, + { + "epoch": 0.9952998824970625, + "grad_norm": 3.3166861534118652, + "learning_rate": 1.3458443410196975e-09, + "loss": 0.4716, + "step": 79622 + }, + { + "epoch": 0.995324883122078, + "grad_norm": 2.0498623847961426, + "learning_rate": 1.3315652433765203e-09, + "loss": 1.1172, + "step": 79624 + }, + { + "epoch": 0.9953498837470937, + "grad_norm": 1.9905709028244019, + "learning_rate": 1.3173622941786701e-09, + "loss": 1.0638, + "step": 79626 + }, + { + "epoch": 0.9953748843721093, + "grad_norm": 4.843832969665527, + "learning_rate": 1.3032354935338387e-09, + "loss": 0.3651, + "step": 79628 + }, + { + "epoch": 0.995399884997125, + "grad_norm": 3.007969856262207, + "learning_rate": 1.2891848415497176e-09, + "loss": 0.5803, + "step": 79630 + }, + { + "epoch": 0.9954248856221406, + "grad_norm": 3.6012496948242188, + "learning_rate": 1.2752103383328885e-09, + "loss": 1.3054, + "step": 79632 + }, + { + "epoch": 0.9954498862471561, + "grad_norm": 2.180325746536255, + "learning_rate": 1.2613119839910426e-09, + "loss": 0.8034, + "step": 79634 + }, + { + "epoch": 0.9954748868721718, + "grad_norm": 3.0118343830108643, + "learning_rate": 1.2474897786285412e-09, + "loss": 0.4435, + "step": 79636 + }, + { + "epoch": 0.9954998874971874, + "grad_norm": 6.159955978393555, + "learning_rate": 1.2337437223508553e-09, + "loss": 1.9379, + "step": 79638 + }, + { + "epoch": 0.9955248881222031, + "grad_norm": 2.788670063018799, + "learning_rate": 1.2200738152634562e-09, + "loss": 1.0567, + "step": 79640 + }, + { + "epoch": 0.9955498887472187, + "grad_norm": 2.9255738258361816, + "learning_rate": 1.2064800574695945e-09, + "loss": 0.9099, + "step": 79642 + }, + { + "epoch": 0.9955748893722343, + "grad_norm": 1.4851559400558472, + "learning_rate": 1.1929624490736313e-09, + "loss": 0.7081, + "step": 79644 + }, + { + "epoch": 0.9955998899972499, + "grad_norm": 0.22907398641109467, + "learning_rate": 1.179520990177707e-09, + "loss": 0.0983, + "step": 79646 + }, + { + "epoch": 0.9956248906222656, + "grad_norm": 2.421781539916992, + "learning_rate": 1.1661556808839624e-09, + "loss": 0.1845, + "step": 79648 + }, + { + "epoch": 0.9956498912472812, + "grad_norm": 0.000565846567042172, + "learning_rate": 1.1528665212956481e-09, + "loss": 0.6578, + "step": 79650 + }, + { + "epoch": 0.9956748918722969, + "grad_norm": 3.1183385848999023, + "learning_rate": 1.139653511513794e-09, + "loss": 0.6154, + "step": 79652 + }, + { + "epoch": 0.9956998924973124, + "grad_norm": 3.7708442211151123, + "learning_rate": 1.1265166516372106e-09, + "loss": 0.7753, + "step": 79654 + }, + { + "epoch": 0.995724893122328, + "grad_norm": 1.1847479343414307, + "learning_rate": 1.1134559417680379e-09, + "loss": 0.1704, + "step": 79656 + }, + { + "epoch": 0.9957498937473437, + "grad_norm": 3.2832894325256348, + "learning_rate": 1.100471382003976e-09, + "loss": 0.9396, + "step": 79658 + }, + { + "epoch": 0.9957748943723593, + "grad_norm": 0.7933589816093445, + "learning_rate": 1.0875629724449444e-09, + "loss": 0.0608, + "step": 79660 + }, + { + "epoch": 0.995799894997375, + "grad_norm": 4.423354625701904, + "learning_rate": 1.0747307131897533e-09, + "loss": 0.2587, + "step": 79662 + }, + { + "epoch": 0.9958248956223905, + "grad_norm": 4.726137638092041, + "learning_rate": 1.061974604334992e-09, + "loss": 1.0465, + "step": 79664 + }, + { + "epoch": 0.9958498962474062, + "grad_norm": 15.378212928771973, + "learning_rate": 1.0492946459783604e-09, + "loss": 0.8851, + "step": 79666 + }, + { + "epoch": 0.9958748968724218, + "grad_norm": 4.076320171356201, + "learning_rate": 1.0366908382153372e-09, + "loss": 1.8465, + "step": 79668 + }, + { + "epoch": 0.9958998974974375, + "grad_norm": 3.4085936546325684, + "learning_rate": 1.0241631811436225e-09, + "loss": 0.5941, + "step": 79670 + }, + { + "epoch": 0.9959248981224531, + "grad_norm": 2.5939266681671143, + "learning_rate": 1.0117116748586954e-09, + "loss": 1.1137, + "step": 79672 + }, + { + "epoch": 0.9959498987474686, + "grad_norm": 2.1891818046569824, + "learning_rate": 9.993363194538142e-10, + "loss": 0.4552, + "step": 79674 + }, + { + "epoch": 0.9959748993724843, + "grad_norm": 1.2552765607833862, + "learning_rate": 9.870371150233482e-10, + "loss": 0.0451, + "step": 79676 + }, + { + "epoch": 0.9959998999974999, + "grad_norm": 3.464733600616455, + "learning_rate": 9.748140616627765e-10, + "loss": 0.863, + "step": 79678 + }, + { + "epoch": 0.9960249006225156, + "grad_norm": 4.645319938659668, + "learning_rate": 9.626671594631376e-10, + "loss": 1.3574, + "step": 79680 + }, + { + "epoch": 0.9960499012475312, + "grad_norm": 0.15291570127010345, + "learning_rate": 9.505964085188003e-10, + "loss": 0.7192, + "step": 79682 + }, + { + "epoch": 0.9960749018725468, + "grad_norm": 2.086918592453003, + "learning_rate": 9.386018089196924e-10, + "loss": 1.0893, + "step": 79684 + }, + { + "epoch": 0.9960999024975624, + "grad_norm": 3.5893607139587402, + "learning_rate": 9.266833607590731e-10, + "loss": 0.8205, + "step": 79686 + }, + { + "epoch": 0.9961249031225781, + "grad_norm": 0.005084471311420202, + "learning_rate": 9.148410641268701e-10, + "loss": 0.3565, + "step": 79688 + }, + { + "epoch": 0.9961499037475937, + "grad_norm": 3.689812183380127, + "learning_rate": 9.030749191119015e-10, + "loss": 2.2732, + "step": 79690 + }, + { + "epoch": 0.9961749043726094, + "grad_norm": 2.6311960220336914, + "learning_rate": 8.913849258063156e-10, + "loss": 0.5441, + "step": 79692 + }, + { + "epoch": 0.9961999049976249, + "grad_norm": 4.8500518798828125, + "learning_rate": 8.797710842978203e-10, + "loss": 1.3243, + "step": 79694 + }, + { + "epoch": 0.9962249056226405, + "grad_norm": 3.603494644165039, + "learning_rate": 8.68233394675233e-10, + "loss": 2.2479, + "step": 79696 + }, + { + "epoch": 0.9962499062476562, + "grad_norm": 6.61830997467041, + "learning_rate": 8.567718570251516e-10, + "loss": 0.7695, + "step": 79698 + }, + { + "epoch": 0.9962749068726718, + "grad_norm": 1.0897670984268188, + "learning_rate": 8.453864714363936e-10, + "loss": 0.7269, + "step": 79700 + }, + { + "epoch": 0.9962999074976875, + "grad_norm": 9.394497871398926, + "learning_rate": 8.340772379955564e-10, + "loss": 0.6954, + "step": 79702 + }, + { + "epoch": 0.996324908122703, + "grad_norm": 3.3592827320098877, + "learning_rate": 8.228441567881273e-10, + "loss": 1.1763, + "step": 79704 + }, + { + "epoch": 0.9963499087477187, + "grad_norm": 0.00822974182665348, + "learning_rate": 8.116872278995935e-10, + "loss": 0.4232, + "step": 79706 + }, + { + "epoch": 0.9963749093727343, + "grad_norm": 3.506685733795166, + "learning_rate": 8.006064514154421e-10, + "loss": 0.7951, + "step": 79708 + }, + { + "epoch": 0.99639990999775, + "grad_norm": 2.9227294921875, + "learning_rate": 7.896018274200501e-10, + "loss": 0.9393, + "step": 79710 + }, + { + "epoch": 0.9964249106227656, + "grad_norm": 7.791175842285156, + "learning_rate": 7.786733559977944e-10, + "loss": 1.8313, + "step": 79712 + }, + { + "epoch": 0.9964499112477812, + "grad_norm": 4.6176276206970215, + "learning_rate": 7.678210372297213e-10, + "loss": 2.0666, + "step": 79714 + }, + { + "epoch": 0.9964749118727968, + "grad_norm": 4.313295841217041, + "learning_rate": 7.570448712002076e-10, + "loss": 1.5932, + "step": 79716 + }, + { + "epoch": 0.9964999124978124, + "grad_norm": 3.4666903018951416, + "learning_rate": 7.463448579914101e-10, + "loss": 0.2918, + "step": 79718 + }, + { + "epoch": 0.9965249131228281, + "grad_norm": 3.572413682937622, + "learning_rate": 7.357209976843749e-10, + "loss": 1.3636, + "step": 79720 + }, + { + "epoch": 0.9965499137478437, + "grad_norm": 6.307526588439941, + "learning_rate": 7.251732903601483e-10, + "loss": 1.1742, + "step": 79722 + }, + { + "epoch": 0.9965749143728593, + "grad_norm": 3.0007548332214355, + "learning_rate": 7.147017360986664e-10, + "loss": 0.5678, + "step": 79724 + }, + { + "epoch": 0.9965999149978749, + "grad_norm": 5.545618534088135, + "learning_rate": 7.043063349798651e-10, + "loss": 1.8141, + "step": 79726 + }, + { + "epoch": 0.9966249156228906, + "grad_norm": 2.410789966583252, + "learning_rate": 6.939870870836807e-10, + "loss": 0.2987, + "step": 79728 + }, + { + "epoch": 0.9966499162479062, + "grad_norm": 2.8653271198272705, + "learning_rate": 6.837439924878287e-10, + "loss": 0.4534, + "step": 79730 + }, + { + "epoch": 0.9966749168729219, + "grad_norm": 4.762612342834473, + "learning_rate": 6.735770512700246e-10, + "loss": 0.7641, + "step": 79732 + }, + { + "epoch": 0.9966999174979374, + "grad_norm": 0.993643045425415, + "learning_rate": 6.634862635090944e-10, + "loss": 0.0442, + "step": 79734 + }, + { + "epoch": 0.996724918122953, + "grad_norm": 3.293487548828125, + "learning_rate": 6.534716292805332e-10, + "loss": 0.768, + "step": 79736 + }, + { + "epoch": 0.9967499187479687, + "grad_norm": 7.4836835861206055, + "learning_rate": 6.435331486609464e-10, + "loss": 2.1234, + "step": 79738 + }, + { + "epoch": 0.9967749193729843, + "grad_norm": 1.5351288318634033, + "learning_rate": 6.336708217258292e-10, + "loss": 0.5619, + "step": 79740 + }, + { + "epoch": 0.996799919998, + "grad_norm": 3.7976064682006836, + "learning_rate": 6.238846485517869e-10, + "loss": 0.4516, + "step": 79742 + }, + { + "epoch": 0.9968249206230155, + "grad_norm": 1.0068656206130981, + "learning_rate": 6.141746292109841e-10, + "loss": 0.0947, + "step": 79744 + }, + { + "epoch": 0.9968499212480312, + "grad_norm": 3.003676652908325, + "learning_rate": 6.04540763780026e-10, + "loss": 1.6476, + "step": 79746 + }, + { + "epoch": 0.9968749218730468, + "grad_norm": 0.00017663172911852598, + "learning_rate": 5.949830523299671e-10, + "loss": 0.0152, + "step": 79748 + }, + { + "epoch": 0.9968999224980625, + "grad_norm": 0.5904773473739624, + "learning_rate": 5.85501494934082e-10, + "loss": 0.6932, + "step": 79750 + }, + { + "epoch": 0.9969249231230781, + "grad_norm": 3.327974796295166, + "learning_rate": 5.760960916656455e-10, + "loss": 1.1635, + "step": 79752 + }, + { + "epoch": 0.9969499237480937, + "grad_norm": 3.571770668029785, + "learning_rate": 5.667668425946016e-10, + "loss": 0.8774, + "step": 79754 + }, + { + "epoch": 0.9969749243731093, + "grad_norm": 1.8380765914916992, + "learning_rate": 5.57513747794225e-10, + "loss": 0.0836, + "step": 79756 + }, + { + "epoch": 0.996999924998125, + "grad_norm": 2.755059242248535, + "learning_rate": 5.483368073333495e-10, + "loss": 0.647, + "step": 79758 + }, + { + "epoch": 0.9970249256231406, + "grad_norm": 4.458232402801514, + "learning_rate": 5.392360212819192e-10, + "loss": 1.4385, + "step": 79760 + }, + { + "epoch": 0.9970499262481562, + "grad_norm": 0.0011470619356259704, + "learning_rate": 5.302113897098781e-10, + "loss": 0.4062, + "step": 79762 + }, + { + "epoch": 0.9970749268731718, + "grad_norm": 0.7802861928939819, + "learning_rate": 5.212629126860602e-10, + "loss": 0.7338, + "step": 79764 + }, + { + "epoch": 0.9970999274981874, + "grad_norm": 4.524814128875732, + "learning_rate": 5.12390590278189e-10, + "loss": 1.1361, + "step": 79766 + }, + { + "epoch": 0.9971249281232031, + "grad_norm": 5.425355434417725, + "learning_rate": 5.035944225528777e-10, + "loss": 1.9035, + "step": 79768 + }, + { + "epoch": 0.9971499287482187, + "grad_norm": 4.143605709075928, + "learning_rate": 4.948744095789604e-10, + "loss": 0.6027, + "step": 79770 + }, + { + "epoch": 0.9971749293732344, + "grad_norm": 4.077476978302002, + "learning_rate": 4.862305514219401e-10, + "loss": 0.5645, + "step": 79772 + }, + { + "epoch": 0.9971999299982499, + "grad_norm": 0.0068418788723647594, + "learning_rate": 4.776628481484302e-10, + "loss": 0.0835, + "step": 79774 + }, + { + "epoch": 0.9972249306232656, + "grad_norm": 3.518474817276001, + "learning_rate": 4.691712998217135e-10, + "loss": 0.6736, + "step": 79776 + }, + { + "epoch": 0.9972499312482812, + "grad_norm": 3.40678334236145, + "learning_rate": 4.6075590650840327e-10, + "loss": 1.2227, + "step": 79778 + }, + { + "epoch": 0.9972749318732969, + "grad_norm": 0.0005359516362659633, + "learning_rate": 4.5241666827178233e-10, + "loss": 1.1923, + "step": 79780 + }, + { + "epoch": 0.9972999324983125, + "grad_norm": 4.805984973907471, + "learning_rate": 4.441535851762435e-10, + "loss": 2.4235, + "step": 79782 + }, + { + "epoch": 0.997324933123328, + "grad_norm": 2.2744557857513428, + "learning_rate": 4.3596665728284916e-10, + "loss": 0.2978, + "step": 79784 + }, + { + "epoch": 0.9973499337483437, + "grad_norm": 1.819068431854248, + "learning_rate": 4.2785588465488193e-10, + "loss": 0.7036, + "step": 79786 + }, + { + "epoch": 0.9973749343733593, + "grad_norm": 3.687711477279663, + "learning_rate": 4.198212673556246e-10, + "loss": 1.157, + "step": 79788 + }, + { + "epoch": 0.997399934998375, + "grad_norm": 5.877627849578857, + "learning_rate": 4.118628054439189e-10, + "loss": 1.6111, + "step": 79790 + }, + { + "epoch": 0.9974249356233906, + "grad_norm": 6.110001564025879, + "learning_rate": 4.0398049898193737e-10, + "loss": 2.0784, + "step": 79792 + }, + { + "epoch": 0.9974499362484062, + "grad_norm": 7.984957695007324, + "learning_rate": 3.9617434802852185e-10, + "loss": 0.3669, + "step": 79794 + }, + { + "epoch": 0.9974749368734218, + "grad_norm": 0.00036231157719157636, + "learning_rate": 3.884443526447346e-10, + "loss": 0.4585, + "step": 79796 + }, + { + "epoch": 0.9974999374984375, + "grad_norm": 5.491449356079102, + "learning_rate": 3.8079051288719693e-10, + "loss": 0.4005, + "step": 79798 + }, + { + "epoch": 0.9975249381234531, + "grad_norm": 3.2017662525177, + "learning_rate": 3.732128288169712e-10, + "loss": 1.4548, + "step": 79800 + }, + { + "epoch": 0.9975499387484688, + "grad_norm": 0.741891622543335, + "learning_rate": 3.6571130048956847e-10, + "loss": 0.7616, + "step": 79802 + }, + { + "epoch": 0.9975749393734843, + "grad_norm": 4.8388848304748535, + "learning_rate": 3.5828592796272046e-10, + "loss": 1.1676, + "step": 79804 + }, + { + "epoch": 0.9975999399984999, + "grad_norm": 3.4485714435577393, + "learning_rate": 3.5093671129304843e-10, + "loss": 0.9943, + "step": 79806 + }, + { + "epoch": 0.9976249406235156, + "grad_norm": 3.1281464099884033, + "learning_rate": 3.4366365053606355e-10, + "loss": 1.2219, + "step": 79808 + }, + { + "epoch": 0.9976499412485312, + "grad_norm": 2.214315891265869, + "learning_rate": 3.364667457483872e-10, + "loss": 0.5297, + "step": 79810 + }, + { + "epoch": 0.9976749418735469, + "grad_norm": 4.331299304962158, + "learning_rate": 3.293459969844204e-10, + "loss": 1.4686, + "step": 79812 + }, + { + "epoch": 0.9976999424985624, + "grad_norm": 2.490161180496216, + "learning_rate": 3.223014042974537e-10, + "loss": 1.275, + "step": 79814 + }, + { + "epoch": 0.9977249431235781, + "grad_norm": 0.7265616059303284, + "learning_rate": 3.1533296774188816e-10, + "loss": 0.126, + "step": 79816 + }, + { + "epoch": 0.9977499437485937, + "grad_norm": 3.9701015949249268, + "learning_rate": 3.084406873710144e-10, + "loss": 1.5619, + "step": 79818 + }, + { + "epoch": 0.9977749443736094, + "grad_norm": 4.800939559936523, + "learning_rate": 3.0162456323590273e-10, + "loss": 1.0064, + "step": 79820 + }, + { + "epoch": 0.997799944998625, + "grad_norm": 2.408612012863159, + "learning_rate": 2.9488459539095406e-10, + "loss": 0.7915, + "step": 79822 + }, + { + "epoch": 0.9978249456236405, + "grad_norm": 2.6793107986450195, + "learning_rate": 2.8822078388501817e-10, + "loss": 0.931, + "step": 79824 + }, + { + "epoch": 0.9978499462486562, + "grad_norm": 2.5547032356262207, + "learning_rate": 2.816331287702756e-10, + "loss": 1.0019, + "step": 79826 + }, + { + "epoch": 0.9978749468736718, + "grad_norm": 9.214025497436523, + "learning_rate": 2.7512163009668635e-10, + "loss": 1.6135, + "step": 79828 + }, + { + "epoch": 0.9978999474986875, + "grad_norm": 1.4873005151748657, + "learning_rate": 2.6868628791310024e-10, + "loss": 0.3424, + "step": 79830 + }, + { + "epoch": 0.9979249481237031, + "grad_norm": 3.510800838470459, + "learning_rate": 2.623271022694773e-10, + "loss": 1.3084, + "step": 79832 + }, + { + "epoch": 0.9979499487487187, + "grad_norm": 3.0116779804229736, + "learning_rate": 2.5604407321466737e-10, + "loss": 0.2225, + "step": 79834 + }, + { + "epoch": 0.9979749493737343, + "grad_norm": 2.64800763130188, + "learning_rate": 2.498372007941896e-10, + "loss": 0.7319, + "step": 79836 + }, + { + "epoch": 0.99799994999875, + "grad_norm": 4.608682632446289, + "learning_rate": 2.4370648505800396e-10, + "loss": 1.1527, + "step": 79838 + }, + { + "epoch": 0.9980249506237656, + "grad_norm": 4.5514702796936035, + "learning_rate": 2.3765192605162966e-10, + "loss": 0.2996, + "step": 79840 + }, + { + "epoch": 0.9980499512487813, + "grad_norm": 3.0957486629486084, + "learning_rate": 2.3167352382058582e-10, + "loss": 0.7951, + "step": 79842 + }, + { + "epoch": 0.9980749518737968, + "grad_norm": 3.1305174827575684, + "learning_rate": 2.2577127841150182e-10, + "loss": 0.5452, + "step": 79844 + }, + { + "epoch": 0.9980999524988124, + "grad_norm": 5.207897663116455, + "learning_rate": 2.1994518986878654e-10, + "loss": 1.5205, + "step": 79846 + }, + { + "epoch": 0.9981249531238281, + "grad_norm": 0.00623599998652935, + "learning_rate": 2.1419525823684894e-10, + "loss": 0.5586, + "step": 79848 + }, + { + "epoch": 0.9981499537488437, + "grad_norm": 7.927812576293945, + "learning_rate": 2.0852148355898772e-10, + "loss": 1.4326, + "step": 79850 + }, + { + "epoch": 0.9981749543738594, + "grad_norm": 3.4279627799987793, + "learning_rate": 2.0292386587961178e-10, + "loss": 1.146, + "step": 79852 + }, + { + "epoch": 0.9981999549988749, + "grad_norm": 2.9752883911132812, + "learning_rate": 1.9740240524090958e-10, + "loss": 0.8296, + "step": 79854 + }, + { + "epoch": 0.9982249556238906, + "grad_norm": 3.6817264556884766, + "learning_rate": 1.9195710168395944e-10, + "loss": 1.9475, + "step": 79856 + }, + { + "epoch": 0.9982499562489062, + "grad_norm": 4.841670989990234, + "learning_rate": 1.8658795525094976e-10, + "loss": 1.0691, + "step": 79858 + }, + { + "epoch": 0.9982749568739219, + "grad_norm": 3.105499029159546, + "learning_rate": 1.8129496598295882e-10, + "loss": 1.4311, + "step": 79860 + }, + { + "epoch": 0.9982999574989375, + "grad_norm": 3.2147111892700195, + "learning_rate": 1.7607813392106486e-10, + "loss": 1.3313, + "step": 79862 + }, + { + "epoch": 0.998324958123953, + "grad_norm": 1.8873860836029053, + "learning_rate": 1.7093745910301552e-10, + "loss": 0.4178, + "step": 79864 + }, + { + "epoch": 0.9983499587489687, + "grad_norm": 3.9889185428619385, + "learning_rate": 1.6587294156877875e-10, + "loss": 0.5772, + "step": 79866 + }, + { + "epoch": 0.9983749593739843, + "grad_norm": 5.487823486328125, + "learning_rate": 1.608845813572124e-10, + "loss": 0.4651, + "step": 79868 + }, + { + "epoch": 0.998399959999, + "grad_norm": 2.8853423595428467, + "learning_rate": 1.5597237850717428e-10, + "loss": 0.8136, + "step": 79870 + }, + { + "epoch": 0.9984249606240156, + "grad_norm": 0.8449936509132385, + "learning_rate": 1.511363330541915e-10, + "loss": 1.0771, + "step": 79872 + }, + { + "epoch": 0.9984499612490312, + "grad_norm": 1.088343620300293, + "learning_rate": 1.463764450371219e-10, + "loss": 1.0849, + "step": 79874 + }, + { + "epoch": 0.9984749618740468, + "grad_norm": 2.5101847648620605, + "learning_rate": 1.4169271449038235e-10, + "loss": 1.3564, + "step": 79876 + }, + { + "epoch": 0.9984999624990625, + "grad_norm": 2.0821917057037354, + "learning_rate": 1.3708514145061025e-10, + "loss": 1.2037, + "step": 79878 + }, + { + "epoch": 0.9985249631240781, + "grad_norm": 2.5126287937164307, + "learning_rate": 1.3255372595222248e-10, + "loss": 0.634, + "step": 79880 + }, + { + "epoch": 0.9985499637490938, + "grad_norm": 2.641937255859375, + "learning_rate": 1.2809846803074622e-10, + "loss": 0.5022, + "step": 79882 + }, + { + "epoch": 0.9985749643741093, + "grad_norm": 0.0021642192732542753, + "learning_rate": 1.237193677183779e-10, + "loss": 0.0, + "step": 79884 + }, + { + "epoch": 0.998599964999125, + "grad_norm": 5.552849769592285, + "learning_rate": 1.1941642505064466e-10, + "loss": 1.0744, + "step": 79886 + }, + { + "epoch": 0.9986249656241406, + "grad_norm": 0.0003303129633422941, + "learning_rate": 1.1518964005974298e-10, + "loss": 2.7121, + "step": 79888 + }, + { + "epoch": 0.9986499662491563, + "grad_norm": 1.685848355293274, + "learning_rate": 1.110390127767591e-10, + "loss": 0.0593, + "step": 79890 + }, + { + "epoch": 0.9986749668741719, + "grad_norm": 2.947385787963867, + "learning_rate": 1.0696454323499971e-10, + "loss": 0.7373, + "step": 79892 + }, + { + "epoch": 0.9986999674991874, + "grad_norm": 0.8522822856903076, + "learning_rate": 1.0296623146333062e-10, + "loss": 0.612, + "step": 79894 + }, + { + "epoch": 0.9987249681242031, + "grad_norm": 0.22551119327545166, + "learning_rate": 9.904407749394829e-11, + "loss": 0.0176, + "step": 79896 + }, + { + "epoch": 0.9987499687492187, + "grad_norm": 1.7162500619888306, + "learning_rate": 9.51980813557185e-11, + "loss": 0.8543, + "step": 79898 + }, + { + "epoch": 0.9987749693742344, + "grad_norm": 6.668663024902344, + "learning_rate": 9.14282430786173e-11, + "loss": 2.046, + "step": 79900 + }, + { + "epoch": 0.99879996999925, + "grad_norm": 2.4892821311950684, + "learning_rate": 8.773456269040026e-11, + "loss": 0.6333, + "step": 79902 + }, + { + "epoch": 0.9988249706242656, + "grad_norm": 0.00269004562869668, + "learning_rate": 8.411704022104339e-11, + "loss": 0.2211, + "step": 79904 + }, + { + "epoch": 0.9988499712492812, + "grad_norm": 2.7831900119781494, + "learning_rate": 8.057567569608183e-11, + "loss": 0.8672, + "step": 79906 + }, + { + "epoch": 0.9988749718742969, + "grad_norm": 1.529149055480957, + "learning_rate": 7.711046914438136e-11, + "loss": 0.6074, + "step": 79908 + }, + { + "epoch": 0.9988999724993125, + "grad_norm": 0.004581269342452288, + "learning_rate": 7.37214205903669e-11, + "loss": 0.1691, + "step": 79910 + }, + { + "epoch": 0.9989249731243282, + "grad_norm": 5.246918201446533, + "learning_rate": 7.040853006068383e-11, + "loss": 1.2553, + "step": 79912 + }, + { + "epoch": 0.9989499737493437, + "grad_norm": 2.311257839202881, + "learning_rate": 6.717179758086722e-11, + "loss": 0.288, + "step": 79914 + }, + { + "epoch": 0.9989749743743593, + "grad_norm": 6.517247676849365, + "learning_rate": 6.401122317534203e-11, + "loss": 1.1823, + "step": 79916 + }, + { + "epoch": 0.998999974999375, + "grad_norm": 8.114134788513184, + "learning_rate": 6.092680686853314e-11, + "loss": 2.0538, + "step": 79918 + }, + { + "epoch": 0.9990249756243906, + "grad_norm": 1.8285760879516602, + "learning_rate": 5.791854868264501e-11, + "loss": 0.3727, + "step": 79920 + }, + { + "epoch": 0.9990499762494063, + "grad_norm": 6.867819786071777, + "learning_rate": 5.498644864321279e-11, + "loss": 0.2432, + "step": 79922 + }, + { + "epoch": 0.9990749768744218, + "grad_norm": 4.515995979309082, + "learning_rate": 5.2130506769110246e-11, + "loss": 1.8874, + "step": 79924 + }, + { + "epoch": 0.9990999774994375, + "grad_norm": 0.034714244306087494, + "learning_rate": 4.9350723084762296e-11, + "loss": 0.684, + "step": 79926 + }, + { + "epoch": 0.9991249781244531, + "grad_norm": 3.173784017562866, + "learning_rate": 4.664709761015296e-11, + "loss": 1.4215, + "step": 79928 + }, + { + "epoch": 0.9991499787494688, + "grad_norm": 5.991211414337158, + "learning_rate": 4.4019630366376465e-11, + "loss": 1.8766, + "step": 79930 + }, + { + "epoch": 0.9991749793744844, + "grad_norm": 1.2288726568222046, + "learning_rate": 4.146832137341683e-11, + "loss": 0.3639, + "step": 79932 + }, + { + "epoch": 0.9991999799994999, + "grad_norm": 4.738759994506836, + "learning_rate": 3.899317065014785e-11, + "loss": 1.4246, + "step": 79934 + }, + { + "epoch": 0.9992249806245156, + "grad_norm": 2.398207664489746, + "learning_rate": 3.659417821544331e-11, + "loss": 0.966, + "step": 79936 + }, + { + "epoch": 0.9992499812495312, + "grad_norm": 1.4994165897369385, + "learning_rate": 3.427134408928723e-11, + "loss": 0.3355, + "step": 79938 + }, + { + "epoch": 0.9992749818745469, + "grad_norm": 4.0571088790893555, + "learning_rate": 3.20246682861125e-11, + "loss": 1.2275, + "step": 79940 + }, + { + "epoch": 0.9992999824995625, + "grad_norm": 2.8810150623321533, + "learning_rate": 2.985415082590315e-11, + "loss": 0.4124, + "step": 79942 + }, + { + "epoch": 0.9993249831245781, + "grad_norm": 0.0009105755016207695, + "learning_rate": 2.775979172420229e-11, + "loss": 0.1863, + "step": 79944 + }, + { + "epoch": 0.9993499837495937, + "grad_norm": 3.224313497543335, + "learning_rate": 2.5741590997663267e-11, + "loss": 0.5389, + "step": 79946 + }, + { + "epoch": 0.9993749843746094, + "grad_norm": 4.740446090698242, + "learning_rate": 2.3799548659608763e-11, + "loss": 0.8777, + "step": 79948 + }, + { + "epoch": 0.999399984999625, + "grad_norm": 0.7952799201011658, + "learning_rate": 2.1933664726692118e-11, + "loss": 0.3358, + "step": 79950 + }, + { + "epoch": 0.9994249856246407, + "grad_norm": 0.0006311294273473322, + "learning_rate": 2.014393921334623e-11, + "loss": 0.9375, + "step": 79952 + }, + { + "epoch": 0.9994499862496562, + "grad_norm": 3.76070237159729, + "learning_rate": 1.8430372131783558e-11, + "loss": 1.1021, + "step": 79954 + }, + { + "epoch": 0.9994749868746718, + "grad_norm": 2.8583877086639404, + "learning_rate": 1.6792963495326774e-11, + "loss": 0.4508, + "step": 79956 + }, + { + "epoch": 0.9994999874996875, + "grad_norm": 8.216654777526855, + "learning_rate": 1.523171331729856e-11, + "loss": 2.0247, + "step": 79958 + }, + { + "epoch": 0.9995249881247031, + "grad_norm": 2.609760284423828, + "learning_rate": 1.3746621608801136e-11, + "loss": 0.4845, + "step": 79960 + }, + { + "epoch": 0.9995499887497188, + "grad_norm": 4.1430864334106445, + "learning_rate": 1.2337688382046964e-11, + "loss": 0.7833, + "step": 79962 + }, + { + "epoch": 0.9995749893747343, + "grad_norm": 0.00041091081220656633, + "learning_rate": 1.1004913647028049e-11, + "loss": 0.0227, + "step": 79964 + }, + { + "epoch": 0.99959998999975, + "grad_norm": 0.002468356629833579, + "learning_rate": 9.748297413736397e-12, + "loss": 0.3773, + "step": 79966 + }, + { + "epoch": 0.9996249906247656, + "grad_norm": 8.668424606323242, + "learning_rate": 8.567839692164016e-12, + "loss": 2.1054, + "step": 79968 + }, + { + "epoch": 0.9996499912497813, + "grad_norm": 13.824529647827148, + "learning_rate": 7.463540492302913e-12, + "loss": 1.4879, + "step": 79970 + }, + { + "epoch": 0.9996749918747969, + "grad_norm": 6.409896373748779, + "learning_rate": 6.435399819704202e-12, + "loss": 1.3771, + "step": 79972 + }, + { + "epoch": 0.9996999924998125, + "grad_norm": 5.213324546813965, + "learning_rate": 5.483417685470116e-12, + "loss": 0.7286, + "step": 79974 + }, + { + "epoch": 0.9997249931248281, + "grad_norm": 3.093881368637085, + "learning_rate": 4.6075940940415455e-12, + "loss": 3.4506, + "step": 79976 + }, + { + "epoch": 0.9997499937498437, + "grad_norm": 6.0291619300842285, + "learning_rate": 3.807929054300275e-12, + "loss": 0.282, + "step": 79978 + }, + { + "epoch": 0.9997749943748594, + "grad_norm": 3.83475661277771, + "learning_rate": 3.0844225706871956e-12, + "loss": 1.7913, + "step": 79980 + }, + { + "epoch": 0.999799994999875, + "grad_norm": 3.52433443069458, + "learning_rate": 2.4370746509738696e-12, + "loss": 0.772, + "step": 79982 + }, + { + "epoch": 0.9998249956248906, + "grad_norm": 0.0016487683169543743, + "learning_rate": 1.865885297380743e-12, + "loss": 0.7308, + "step": 79984 + }, + { + "epoch": 0.9998499962499062, + "grad_norm": 6.610978126525879, + "learning_rate": 1.3708545154589304e-12, + "loss": 1.224, + "step": 79986 + }, + { + "epoch": 0.9998749968749219, + "grad_norm": 2.5647494792938232, + "learning_rate": 9.51982308539101e-13, + "loss": 1.0472, + "step": 79988 + }, + { + "epoch": 0.9998999974999375, + "grad_norm": 0.00038752262480556965, + "learning_rate": 6.092686810621473e-13, + "loss": 0.2038, + "step": 79990 + }, + { + "epoch": 0.9999249981249532, + "grad_norm": 0.5198023915290833, + "learning_rate": 3.4271363413829194e-13, + "loss": 0.0269, + "step": 79992 + }, + { + "epoch": 0.9999499987499687, + "grad_norm": 0.028450315818190575, + "learning_rate": 1.523171710982041e-13, + "loss": 0.0004, + "step": 79994 + }, + { + "epoch": 0.9999749993749844, + "grad_norm": 4.297536373138428, + "learning_rate": 3.807929305210678e-14, + "loss": 1.4678, + "step": 79996 + }, + { + "epoch": 1.0, + "grad_norm": 2.903334379196167, + "learning_rate": 0.0, + "loss": 1.2262, + "step": 79998 + }, + { + "epoch": 1.0, + "step": 79998, + "total_flos": 3.6835859969500774e+17, + "train_loss": 0.9321360383034607, + "train_runtime": 30517.6886, + "train_samples_per_second": 2.621, + "train_steps_per_second": 2.621 + } + ], + "logging_steps": 2, + "max_steps": 79998, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 3.6835859969500774e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/global_step79998/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/global_step79998/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6002f109957f153e4e4863f34e4159759fa3d18e --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/global_step79998/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8318faffae98c660d3cb177b3fb4a361e49762965155737074843985ab2c5e6b +size 3837841200 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/global_step79998/mp_rank_00_model_states.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/global_step79998/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb219c4e9b8d4b4966907c9f5c272e04bf18dbfd --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/global_step79998/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dffdb5860a94f4493249a600e2bb17ee575ea1b8c36596a7fa4eadde45b2362 +size 639989420 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/latest b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/latest new file mode 100644 index 0000000000000000000000000000000000000000..e55386afcef1186a366bb43954857a36b97e14b5 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/latest @@ -0,0 +1 @@ +global_step79998 \ No newline at end of file diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/scheduler.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3751324e3033073e6e6620114110bafec5fb6ded --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ad69aebc8e7d648cd1c08f4a6e2907cd4b725f56f2b79c36e4b124fac234f3 +size 1064 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/zero_to_fp32.py b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..e93cb1c95f15c1474642edb1978714075361bc04 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/client_0/zero_to_fp32.py @@ -0,0 +1,758 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: + shared_tensor = state_dict[converted_tensors[tensor_id]] + state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + state_dict[name] = tensor.contiguous() + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in shard_state_dict: + del state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..e4a535b33eebef7bc0fddf727a237ab036edaad2 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario16_new_10000_nosampling_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e34e04560c5e9d195d08f4a31661f0ef40d4fb8bf0033e6ef28999f66681bf +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4300c15cecdcdb32fc5208af1c8e138195076758 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/0_trainer_state.json @@ -0,0 +1,175032 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 50000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4e-05, + "grad_norm": 2.657102584838867, + "learning_rate": 8e-09, + "loss": 0.1259, + "step": 2 + }, + { + "epoch": 8e-05, + "grad_norm": 12.382366180419922, + "learning_rate": 1.6e-08, + "loss": 0.394, + "step": 4 + }, + { + "epoch": 0.00012, + "grad_norm": 3.221132278442383, + "learning_rate": 2.4e-08, + "loss": 0.1396, + "step": 6 + }, + { + "epoch": 0.00016, + "grad_norm": 8.516926765441895, + "learning_rate": 3.2e-08, + "loss": 0.5887, + "step": 8 + }, + { + "epoch": 0.0002, + "grad_norm": 3.9125170707702637, + "learning_rate": 4e-08, + "loss": 0.1525, + "step": 10 + }, + { + "epoch": 0.00024, + "grad_norm": 10.833374977111816, + "learning_rate": 4.8e-08, + "loss": 0.6442, + "step": 12 + }, + { + "epoch": 0.00028, + "grad_norm": 9.709297180175781, + "learning_rate": 5.6000000000000005e-08, + "loss": 0.3418, + "step": 14 + }, + { + "epoch": 0.00032, + "grad_norm": 2.5925488471984863, + "learning_rate": 6.4e-08, + "loss": 0.1459, + "step": 16 + }, + { + "epoch": 0.00036, + "grad_norm": 15.350760459899902, + "learning_rate": 7.200000000000001e-08, + "loss": 0.3891, + "step": 18 + }, + { + "epoch": 0.0004, + "grad_norm": 9.679722785949707, + "learning_rate": 8e-08, + "loss": 0.5095, + "step": 20 + }, + { + "epoch": 0.00044, + "grad_norm": 7.29577112197876, + "learning_rate": 8.800000000000001e-08, + "loss": 0.2858, + "step": 22 + }, + { + "epoch": 0.00048, + "grad_norm": 2.8426859378814697, + "learning_rate": 9.6e-08, + "loss": 0.3863, + "step": 24 + }, + { + "epoch": 0.00052, + "grad_norm": 11.856480598449707, + "learning_rate": 1.04e-07, + "loss": 0.388, + "step": 26 + }, + { + "epoch": 0.00056, + "grad_norm": 5.53923225402832, + "learning_rate": 1.1200000000000001e-07, + "loss": 0.1527, + "step": 28 + }, + { + "epoch": 0.0006, + "grad_norm": 3.4012014865875244, + "learning_rate": 1.2000000000000002e-07, + "loss": 0.3378, + "step": 30 + }, + { + "epoch": 0.00064, + "grad_norm": 3.5794191360473633, + "learning_rate": 1.28e-07, + "loss": 0.14, + "step": 32 + }, + { + "epoch": 0.00068, + "grad_norm": 3.265317440032959, + "learning_rate": 1.36e-07, + "loss": 0.3105, + "step": 34 + }, + { + "epoch": 0.00072, + "grad_norm": 11.900090217590332, + "learning_rate": 1.4400000000000002e-07, + "loss": 0.6427, + "step": 36 + }, + { + "epoch": 0.00076, + "grad_norm": 11.954628944396973, + "learning_rate": 1.52e-07, + "loss": 0.5437, + "step": 38 + }, + { + "epoch": 0.0008, + "grad_norm": 2.975534439086914, + "learning_rate": 1.6e-07, + "loss": 0.1306, + "step": 40 + }, + { + "epoch": 0.00084, + "grad_norm": 2.77016282081604, + "learning_rate": 1.68e-07, + "loss": 0.1433, + "step": 42 + }, + { + "epoch": 0.00088, + "grad_norm": 2.9804704189300537, + "learning_rate": 1.7600000000000001e-07, + "loss": 0.1326, + "step": 44 + }, + { + "epoch": 0.00092, + "grad_norm": 2.5759477615356445, + "learning_rate": 1.84e-07, + "loss": 0.1329, + "step": 46 + }, + { + "epoch": 0.00096, + "grad_norm": 12.359049797058105, + "learning_rate": 1.92e-07, + "loss": 0.3753, + "step": 48 + }, + { + "epoch": 0.001, + "grad_norm": 11.3090181350708, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.6206, + "step": 50 + }, + { + "epoch": 0.00104, + "grad_norm": 2.553110361099243, + "learning_rate": 2.08e-07, + "loss": 0.2994, + "step": 52 + }, + { + "epoch": 0.00108, + "grad_norm": 3.859858989715576, + "learning_rate": 2.1600000000000003e-07, + "loss": 0.1395, + "step": 54 + }, + { + "epoch": 0.00112, + "grad_norm": 11.356216430664062, + "learning_rate": 2.2400000000000002e-07, + "loss": 0.3493, + "step": 56 + }, + { + "epoch": 0.00116, + "grad_norm": 2.7827682495117188, + "learning_rate": 2.32e-07, + "loss": 0.2993, + "step": 58 + }, + { + "epoch": 0.0012, + "grad_norm": 2.7897872924804688, + "learning_rate": 2.4000000000000003e-07, + "loss": 0.1357, + "step": 60 + }, + { + "epoch": 0.00124, + "grad_norm": 9.937381744384766, + "learning_rate": 2.48e-07, + "loss": 0.5413, + "step": 62 + }, + { + "epoch": 0.00128, + "grad_norm": 2.818027973175049, + "learning_rate": 2.56e-07, + "loss": 0.3599, + "step": 64 + }, + { + "epoch": 0.00132, + "grad_norm": 9.86359977722168, + "learning_rate": 2.6400000000000003e-07, + "loss": 0.3409, + "step": 66 + }, + { + "epoch": 0.00136, + "grad_norm": 2.258953809738159, + "learning_rate": 2.72e-07, + "loss": 0.2456, + "step": 68 + }, + { + "epoch": 0.0014, + "grad_norm": 11.02219295501709, + "learning_rate": 2.8e-07, + "loss": 0.3671, + "step": 70 + }, + { + "epoch": 0.00144, + "grad_norm": 12.457615852355957, + "learning_rate": 2.8800000000000004e-07, + "loss": 0.576, + "step": 72 + }, + { + "epoch": 0.00148, + "grad_norm": 8.626323699951172, + "learning_rate": 2.9600000000000006e-07, + "loss": 0.3066, + "step": 74 + }, + { + "epoch": 0.00152, + "grad_norm": 3.0226361751556396, + "learning_rate": 3.04e-07, + "loss": 0.3388, + "step": 76 + }, + { + "epoch": 0.00156, + "grad_norm": 3.904945135116577, + "learning_rate": 3.12e-07, + "loss": 0.3815, + "step": 78 + }, + { + "epoch": 0.0016, + "grad_norm": 9.550996780395508, + "learning_rate": 3.2e-07, + "loss": 0.5311, + "step": 80 + }, + { + "epoch": 0.00164, + "grad_norm": 2.727360486984253, + "learning_rate": 3.280000000000001e-07, + "loss": 0.3285, + "step": 82 + }, + { + "epoch": 0.00168, + "grad_norm": 8.11398696899414, + "learning_rate": 3.36e-07, + "loss": 0.5203, + "step": 84 + }, + { + "epoch": 0.00172, + "grad_norm": 9.984482765197754, + "learning_rate": 3.44e-07, + "loss": 0.328, + "step": 86 + }, + { + "epoch": 0.00176, + "grad_norm": 2.274035930633545, + "learning_rate": 3.5200000000000003e-07, + "loss": 0.3214, + "step": 88 + }, + { + "epoch": 0.0018, + "grad_norm": 3.0562233924865723, + "learning_rate": 3.6e-07, + "loss": 0.1268, + "step": 90 + }, + { + "epoch": 0.00184, + "grad_norm": 7.816158771514893, + "learning_rate": 3.68e-07, + "loss": 0.3204, + "step": 92 + }, + { + "epoch": 0.00188, + "grad_norm": 7.1242899894714355, + "learning_rate": 3.7600000000000003e-07, + "loss": 0.2945, + "step": 94 + }, + { + "epoch": 0.00192, + "grad_norm": 7.211949825286865, + "learning_rate": 3.84e-07, + "loss": 0.4127, + "step": 96 + }, + { + "epoch": 0.00196, + "grad_norm": 7.66123628616333, + "learning_rate": 3.92e-07, + "loss": 0.446, + "step": 98 + }, + { + "epoch": 0.002, + "grad_norm": 3.0335700511932373, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.2867, + "step": 100 + }, + { + "epoch": 0.00204, + "grad_norm": 2.7385785579681396, + "learning_rate": 4.0800000000000005e-07, + "loss": 0.2777, + "step": 102 + }, + { + "epoch": 0.00208, + "grad_norm": 6.6123175621032715, + "learning_rate": 4.16e-07, + "loss": 0.416, + "step": 104 + }, + { + "epoch": 0.00212, + "grad_norm": 3.4748146533966064, + "learning_rate": 4.2400000000000004e-07, + "loss": 0.161, + "step": 106 + }, + { + "epoch": 0.00216, + "grad_norm": 2.977184772491455, + "learning_rate": 4.3200000000000006e-07, + "loss": 0.2579, + "step": 108 + }, + { + "epoch": 0.0022, + "grad_norm": 3.715442180633545, + "learning_rate": 4.4e-07, + "loss": 0.3008, + "step": 110 + }, + { + "epoch": 0.00224, + "grad_norm": 6.683705806732178, + "learning_rate": 4.4800000000000004e-07, + "loss": 0.3817, + "step": 112 + }, + { + "epoch": 0.00228, + "grad_norm": 6.809701442718506, + "learning_rate": 4.5600000000000006e-07, + "loss": 0.3975, + "step": 114 + }, + { + "epoch": 0.00232, + "grad_norm": 4.028317451477051, + "learning_rate": 4.64e-07, + "loss": 0.2677, + "step": 116 + }, + { + "epoch": 0.00236, + "grad_norm": 6.974883079528809, + "learning_rate": 4.7200000000000004e-07, + "loss": 0.3433, + "step": 118 + }, + { + "epoch": 0.0024, + "grad_norm": 3.932694911956787, + "learning_rate": 4.800000000000001e-07, + "loss": 0.2552, + "step": 120 + }, + { + "epoch": 0.00244, + "grad_norm": 4.6338982582092285, + "learning_rate": 4.88e-07, + "loss": 0.2474, + "step": 122 + }, + { + "epoch": 0.00248, + "grad_norm": 5.41000509262085, + "learning_rate": 4.96e-07, + "loss": 0.2456, + "step": 124 + }, + { + "epoch": 0.00252, + "grad_norm": 4.646337985992432, + "learning_rate": 5.040000000000001e-07, + "loss": 0.2914, + "step": 126 + }, + { + "epoch": 0.00256, + "grad_norm": 5.565479755401611, + "learning_rate": 5.12e-07, + "loss": 0.2635, + "step": 128 + }, + { + "epoch": 0.0026, + "grad_norm": 4.975805282592773, + "learning_rate": 5.2e-07, + "loss": 0.2552, + "step": 130 + }, + { + "epoch": 0.00264, + "grad_norm": 5.8253397941589355, + "learning_rate": 5.280000000000001e-07, + "loss": 0.2789, + "step": 132 + }, + { + "epoch": 0.00268, + "grad_norm": 4.579864501953125, + "learning_rate": 5.36e-07, + "loss": 0.2069, + "step": 134 + }, + { + "epoch": 0.00272, + "grad_norm": 5.309229850769043, + "learning_rate": 5.44e-07, + "loss": 0.2664, + "step": 136 + }, + { + "epoch": 0.00276, + "grad_norm": 5.28040075302124, + "learning_rate": 5.520000000000001e-07, + "loss": 0.2641, + "step": 138 + }, + { + "epoch": 0.0028, + "grad_norm": 5.790406703948975, + "learning_rate": 5.6e-07, + "loss": 0.2468, + "step": 140 + }, + { + "epoch": 0.00284, + "grad_norm": 5.581902503967285, + "learning_rate": 5.680000000000001e-07, + "loss": 0.2554, + "step": 142 + }, + { + "epoch": 0.00288, + "grad_norm": 4.540887832641602, + "learning_rate": 5.760000000000001e-07, + "loss": 0.233, + "step": 144 + }, + { + "epoch": 0.00292, + "grad_norm": 4.893327713012695, + "learning_rate": 5.84e-07, + "loss": 0.2208, + "step": 146 + }, + { + "epoch": 0.00296, + "grad_norm": 3.677356719970703, + "learning_rate": 5.920000000000001e-07, + "loss": 0.2102, + "step": 148 + }, + { + "epoch": 0.003, + "grad_norm": 3.976224422454834, + "learning_rate": 6.000000000000001e-07, + "loss": 0.1947, + "step": 150 + }, + { + "epoch": 0.00304, + "grad_norm": 5.748621940612793, + "learning_rate": 6.08e-07, + "loss": 0.2543, + "step": 152 + }, + { + "epoch": 0.00308, + "grad_norm": 6.026477336883545, + "learning_rate": 6.160000000000001e-07, + "loss": 0.2433, + "step": 154 + }, + { + "epoch": 0.00312, + "grad_norm": 6.885504245758057, + "learning_rate": 6.24e-07, + "loss": 0.3032, + "step": 156 + }, + { + "epoch": 0.00316, + "grad_norm": 4.825903415679932, + "learning_rate": 6.320000000000002e-07, + "loss": 0.2028, + "step": 158 + }, + { + "epoch": 0.0032, + "grad_norm": 5.096654891967773, + "learning_rate": 6.4e-07, + "loss": 0.2745, + "step": 160 + }, + { + "epoch": 0.00324, + "grad_norm": 5.326235771179199, + "learning_rate": 6.48e-07, + "loss": 0.2335, + "step": 162 + }, + { + "epoch": 0.00328, + "grad_norm": 4.9139485359191895, + "learning_rate": 6.560000000000002e-07, + "loss": 0.2416, + "step": 164 + }, + { + "epoch": 0.00332, + "grad_norm": 4.472104549407959, + "learning_rate": 6.64e-07, + "loss": 0.2425, + "step": 166 + }, + { + "epoch": 0.00336, + "grad_norm": 5.6613383293151855, + "learning_rate": 6.72e-07, + "loss": 0.2741, + "step": 168 + }, + { + "epoch": 0.0034, + "grad_norm": 5.40153169631958, + "learning_rate": 6.800000000000001e-07, + "loss": 0.2415, + "step": 170 + }, + { + "epoch": 0.00344, + "grad_norm": 5.029289722442627, + "learning_rate": 6.88e-07, + "loss": 0.2748, + "step": 172 + }, + { + "epoch": 0.00348, + "grad_norm": 3.7147560119628906, + "learning_rate": 6.96e-07, + "loss": 0.214, + "step": 174 + }, + { + "epoch": 0.00352, + "grad_norm": 4.429921627044678, + "learning_rate": 7.040000000000001e-07, + "loss": 0.2302, + "step": 176 + }, + { + "epoch": 0.00356, + "grad_norm": 6.158509254455566, + "learning_rate": 7.12e-07, + "loss": 0.2534, + "step": 178 + }, + { + "epoch": 0.0036, + "grad_norm": 3.4717259407043457, + "learning_rate": 7.2e-07, + "loss": 0.1777, + "step": 180 + }, + { + "epoch": 0.00364, + "grad_norm": 4.617865085601807, + "learning_rate": 7.280000000000001e-07, + "loss": 0.2202, + "step": 182 + }, + { + "epoch": 0.00368, + "grad_norm": 6.66176700592041, + "learning_rate": 7.36e-07, + "loss": 0.2864, + "step": 184 + }, + { + "epoch": 0.00372, + "grad_norm": 4.1872758865356445, + "learning_rate": 7.44e-07, + "loss": 0.2016, + "step": 186 + }, + { + "epoch": 0.00376, + "grad_norm": 5.526080131530762, + "learning_rate": 7.520000000000001e-07, + "loss": 0.2223, + "step": 188 + }, + { + "epoch": 0.0038, + "grad_norm": 6.010143756866455, + "learning_rate": 7.6e-07, + "loss": 0.2838, + "step": 190 + }, + { + "epoch": 0.00384, + "grad_norm": 3.9608585834503174, + "learning_rate": 7.68e-07, + "loss": 0.1811, + "step": 192 + }, + { + "epoch": 0.00388, + "grad_norm": 4.343667030334473, + "learning_rate": 7.760000000000001e-07, + "loss": 0.182, + "step": 194 + }, + { + "epoch": 0.00392, + "grad_norm": 4.132966995239258, + "learning_rate": 7.84e-07, + "loss": 0.2504, + "step": 196 + }, + { + "epoch": 0.00396, + "grad_norm": 4.711970329284668, + "learning_rate": 7.920000000000001e-07, + "loss": 0.2791, + "step": 198 + }, + { + "epoch": 0.004, + "grad_norm": 4.310032367706299, + "learning_rate": 8.000000000000001e-07, + "loss": 0.1903, + "step": 200 + }, + { + "epoch": 0.00404, + "grad_norm": 5.420604705810547, + "learning_rate": 8.08e-07, + "loss": 0.2717, + "step": 202 + }, + { + "epoch": 0.00408, + "grad_norm": 5.509664535522461, + "learning_rate": 8.160000000000001e-07, + "loss": 0.2317, + "step": 204 + }, + { + "epoch": 0.00412, + "grad_norm": 4.344141960144043, + "learning_rate": 8.240000000000001e-07, + "loss": 0.2846, + "step": 206 + }, + { + "epoch": 0.00416, + "grad_norm": 5.954940319061279, + "learning_rate": 8.32e-07, + "loss": 0.2559, + "step": 208 + }, + { + "epoch": 0.0042, + "grad_norm": 3.8806636333465576, + "learning_rate": 8.400000000000001e-07, + "loss": 0.173, + "step": 210 + }, + { + "epoch": 0.00424, + "grad_norm": 3.924731969833374, + "learning_rate": 8.480000000000001e-07, + "loss": 0.2348, + "step": 212 + }, + { + "epoch": 0.00428, + "grad_norm": 4.86004114151001, + "learning_rate": 8.56e-07, + "loss": 0.2094, + "step": 214 + }, + { + "epoch": 0.00432, + "grad_norm": 6.914025783538818, + "learning_rate": 8.640000000000001e-07, + "loss": 0.2925, + "step": 216 + }, + { + "epoch": 0.00436, + "grad_norm": 4.0072503089904785, + "learning_rate": 8.720000000000001e-07, + "loss": 0.1722, + "step": 218 + }, + { + "epoch": 0.0044, + "grad_norm": 4.657342433929443, + "learning_rate": 8.8e-07, + "loss": 0.2294, + "step": 220 + }, + { + "epoch": 0.00444, + "grad_norm": 4.953908920288086, + "learning_rate": 8.880000000000001e-07, + "loss": 0.2876, + "step": 222 + }, + { + "epoch": 0.00448, + "grad_norm": 4.2449421882629395, + "learning_rate": 8.960000000000001e-07, + "loss": 0.2436, + "step": 224 + }, + { + "epoch": 0.00452, + "grad_norm": 3.2802836894989014, + "learning_rate": 9.04e-07, + "loss": 0.2143, + "step": 226 + }, + { + "epoch": 0.00456, + "grad_norm": 4.246271133422852, + "learning_rate": 9.120000000000001e-07, + "loss": 0.2097, + "step": 228 + }, + { + "epoch": 0.0046, + "grad_norm": 6.286600589752197, + "learning_rate": 9.200000000000001e-07, + "loss": 0.2523, + "step": 230 + }, + { + "epoch": 0.00464, + "grad_norm": 3.353677988052368, + "learning_rate": 9.28e-07, + "loss": 0.1575, + "step": 232 + }, + { + "epoch": 0.00468, + "grad_norm": 4.208551406860352, + "learning_rate": 9.360000000000001e-07, + "loss": 0.1719, + "step": 234 + }, + { + "epoch": 0.00472, + "grad_norm": 4.218686103820801, + "learning_rate": 9.440000000000001e-07, + "loss": 0.2477, + "step": 236 + }, + { + "epoch": 0.00476, + "grad_norm": 2.9963016510009766, + "learning_rate": 9.520000000000002e-07, + "loss": 0.2611, + "step": 238 + }, + { + "epoch": 0.0048, + "grad_norm": 3.4952549934387207, + "learning_rate": 9.600000000000001e-07, + "loss": 0.1642, + "step": 240 + }, + { + "epoch": 0.00484, + "grad_norm": 7.297757625579834, + "learning_rate": 9.68e-07, + "loss": 0.3984, + "step": 242 + }, + { + "epoch": 0.00488, + "grad_norm": 6.764694690704346, + "learning_rate": 9.76e-07, + "loss": 0.2323, + "step": 244 + }, + { + "epoch": 0.00492, + "grad_norm": 3.3177437782287598, + "learning_rate": 9.84e-07, + "loss": 0.2147, + "step": 246 + }, + { + "epoch": 0.00496, + "grad_norm": 6.723344326019287, + "learning_rate": 9.92e-07, + "loss": 0.3189, + "step": 248 + }, + { + "epoch": 0.005, + "grad_norm": 6.310384750366211, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.3432, + "step": 250 + }, + { + "epoch": 0.00504, + "grad_norm": 4.401824951171875, + "learning_rate": 1.0080000000000001e-06, + "loss": 0.1891, + "step": 252 + }, + { + "epoch": 0.00508, + "grad_norm": 5.598635673522949, + "learning_rate": 1.016e-06, + "loss": 0.2228, + "step": 254 + }, + { + "epoch": 0.00512, + "grad_norm": 4.023591041564941, + "learning_rate": 1.024e-06, + "loss": 0.1804, + "step": 256 + }, + { + "epoch": 0.00516, + "grad_norm": 4.954934597015381, + "learning_rate": 1.032e-06, + "loss": 0.2001, + "step": 258 + }, + { + "epoch": 0.0052, + "grad_norm": 4.342620849609375, + "learning_rate": 1.04e-06, + "loss": 0.2305, + "step": 260 + }, + { + "epoch": 0.00524, + "grad_norm": 5.468482494354248, + "learning_rate": 1.0480000000000002e-06, + "loss": 0.2816, + "step": 262 + }, + { + "epoch": 0.00528, + "grad_norm": 5.404244899749756, + "learning_rate": 1.0560000000000001e-06, + "loss": 0.2926, + "step": 264 + }, + { + "epoch": 0.00532, + "grad_norm": 4.718786239624023, + "learning_rate": 1.064e-06, + "loss": 0.1821, + "step": 266 + }, + { + "epoch": 0.00536, + "grad_norm": 4.367702960968018, + "learning_rate": 1.072e-06, + "loss": 0.2278, + "step": 268 + }, + { + "epoch": 0.0054, + "grad_norm": 6.185503959655762, + "learning_rate": 1.08e-06, + "loss": 0.2937, + "step": 270 + }, + { + "epoch": 0.00544, + "grad_norm": 5.221651077270508, + "learning_rate": 1.088e-06, + "loss": 0.2582, + "step": 272 + }, + { + "epoch": 0.00548, + "grad_norm": 4.828245162963867, + "learning_rate": 1.0960000000000002e-06, + "loss": 0.237, + "step": 274 + }, + { + "epoch": 0.00552, + "grad_norm": 5.586830139160156, + "learning_rate": 1.1040000000000001e-06, + "loss": 0.2281, + "step": 276 + }, + { + "epoch": 0.00556, + "grad_norm": 5.5975260734558105, + "learning_rate": 1.1120000000000001e-06, + "loss": 0.2689, + "step": 278 + }, + { + "epoch": 0.0056, + "grad_norm": 4.798279285430908, + "learning_rate": 1.12e-06, + "loss": 0.2469, + "step": 280 + }, + { + "epoch": 0.00564, + "grad_norm": 5.010297775268555, + "learning_rate": 1.128e-06, + "loss": 0.2373, + "step": 282 + }, + { + "epoch": 0.00568, + "grad_norm": 5.076017379760742, + "learning_rate": 1.1360000000000002e-06, + "loss": 0.2474, + "step": 284 + }, + { + "epoch": 0.00572, + "grad_norm": 5.6727471351623535, + "learning_rate": 1.1440000000000002e-06, + "loss": 0.2693, + "step": 286 + }, + { + "epoch": 0.00576, + "grad_norm": 5.25231409072876, + "learning_rate": 1.1520000000000002e-06, + "loss": 0.2365, + "step": 288 + }, + { + "epoch": 0.0058, + "grad_norm": 4.724815368652344, + "learning_rate": 1.1600000000000001e-06, + "loss": 0.2466, + "step": 290 + }, + { + "epoch": 0.00584, + "grad_norm": 4.526407241821289, + "learning_rate": 1.168e-06, + "loss": 0.2366, + "step": 292 + }, + { + "epoch": 0.00588, + "grad_norm": 4.901851177215576, + "learning_rate": 1.176e-06, + "loss": 0.217, + "step": 294 + }, + { + "epoch": 0.00592, + "grad_norm": 5.1260905265808105, + "learning_rate": 1.1840000000000002e-06, + "loss": 0.2372, + "step": 296 + }, + { + "epoch": 0.00596, + "grad_norm": 4.284771919250488, + "learning_rate": 1.1920000000000002e-06, + "loss": 0.2158, + "step": 298 + }, + { + "epoch": 0.006, + "grad_norm": 4.688657760620117, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.2695, + "step": 300 + }, + { + "epoch": 0.00604, + "grad_norm": 3.8669826984405518, + "learning_rate": 1.2080000000000001e-06, + "loss": 0.2416, + "step": 302 + }, + { + "epoch": 0.00608, + "grad_norm": 4.031740665435791, + "learning_rate": 1.216e-06, + "loss": 0.198, + "step": 304 + }, + { + "epoch": 0.00612, + "grad_norm": 5.175335884094238, + "learning_rate": 1.224e-06, + "loss": 0.2926, + "step": 306 + }, + { + "epoch": 0.00616, + "grad_norm": 5.0797834396362305, + "learning_rate": 1.2320000000000002e-06, + "loss": 0.2277, + "step": 308 + }, + { + "epoch": 0.0062, + "grad_norm": 4.79205322265625, + "learning_rate": 1.2400000000000002e-06, + "loss": 0.2811, + "step": 310 + }, + { + "epoch": 0.00624, + "grad_norm": 4.813261985778809, + "learning_rate": 1.248e-06, + "loss": 0.2364, + "step": 312 + }, + { + "epoch": 0.00628, + "grad_norm": 4.314084529876709, + "learning_rate": 1.256e-06, + "loss": 0.2059, + "step": 314 + }, + { + "epoch": 0.00632, + "grad_norm": 4.567741394042969, + "learning_rate": 1.2640000000000003e-06, + "loss": 0.225, + "step": 316 + }, + { + "epoch": 0.00636, + "grad_norm": 4.355860233306885, + "learning_rate": 1.2720000000000003e-06, + "loss": 0.2151, + "step": 318 + }, + { + "epoch": 0.0064, + "grad_norm": 4.524393558502197, + "learning_rate": 1.28e-06, + "loss": 0.2699, + "step": 320 + }, + { + "epoch": 0.00644, + "grad_norm": 4.564831256866455, + "learning_rate": 1.288e-06, + "loss": 0.2156, + "step": 322 + }, + { + "epoch": 0.00648, + "grad_norm": 4.332452774047852, + "learning_rate": 1.296e-06, + "loss": 0.1963, + "step": 324 + }, + { + "epoch": 0.00652, + "grad_norm": 4.3100457191467285, + "learning_rate": 1.304e-06, + "loss": 0.259, + "step": 326 + }, + { + "epoch": 0.00656, + "grad_norm": 5.2482123374938965, + "learning_rate": 1.3120000000000003e-06, + "loss": 0.2271, + "step": 328 + }, + { + "epoch": 0.0066, + "grad_norm": 4.075623035430908, + "learning_rate": 1.32e-06, + "loss": 0.1966, + "step": 330 + }, + { + "epoch": 0.00664, + "grad_norm": 4.7580790519714355, + "learning_rate": 1.328e-06, + "loss": 0.2467, + "step": 332 + }, + { + "epoch": 0.00668, + "grad_norm": 4.085376739501953, + "learning_rate": 1.336e-06, + "loss": 0.218, + "step": 334 + }, + { + "epoch": 0.00672, + "grad_norm": 4.291323661804199, + "learning_rate": 1.344e-06, + "loss": 0.2469, + "step": 336 + }, + { + "epoch": 0.00676, + "grad_norm": 3.6673192977905273, + "learning_rate": 1.352e-06, + "loss": 0.1745, + "step": 338 + }, + { + "epoch": 0.0068, + "grad_norm": 3.5959649085998535, + "learning_rate": 1.3600000000000001e-06, + "loss": 0.225, + "step": 340 + }, + { + "epoch": 0.00684, + "grad_norm": 4.053492546081543, + "learning_rate": 1.368e-06, + "loss": 0.1789, + "step": 342 + }, + { + "epoch": 0.00688, + "grad_norm": 5.607787132263184, + "learning_rate": 1.376e-06, + "loss": 0.241, + "step": 344 + }, + { + "epoch": 0.00692, + "grad_norm": 2.9880518913269043, + "learning_rate": 1.384e-06, + "loss": 0.2445, + "step": 346 + }, + { + "epoch": 0.00696, + "grad_norm": 3.8600430488586426, + "learning_rate": 1.392e-06, + "loss": 0.1585, + "step": 348 + }, + { + "epoch": 0.007, + "grad_norm": 6.260729789733887, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.2511, + "step": 350 + }, + { + "epoch": 0.00704, + "grad_norm": 4.873640537261963, + "learning_rate": 1.4080000000000001e-06, + "loss": 0.2091, + "step": 352 + }, + { + "epoch": 0.00708, + "grad_norm": 3.6208951473236084, + "learning_rate": 1.416e-06, + "loss": 0.2585, + "step": 354 + }, + { + "epoch": 0.00712, + "grad_norm": 6.805551528930664, + "learning_rate": 1.424e-06, + "loss": 0.2893, + "step": 356 + }, + { + "epoch": 0.00716, + "grad_norm": 3.2307944297790527, + "learning_rate": 1.432e-06, + "loss": 0.1398, + "step": 358 + }, + { + "epoch": 0.0072, + "grad_norm": 3.4144973754882812, + "learning_rate": 1.44e-06, + "loss": 0.2202, + "step": 360 + }, + { + "epoch": 0.00724, + "grad_norm": 2.9077255725860596, + "learning_rate": 1.4480000000000002e-06, + "loss": 0.1205, + "step": 362 + }, + { + "epoch": 0.00728, + "grad_norm": 3.548758029937744, + "learning_rate": 1.4560000000000001e-06, + "loss": 0.2722, + "step": 364 + }, + { + "epoch": 0.00732, + "grad_norm": 6.495785236358643, + "learning_rate": 1.464e-06, + "loss": 0.2795, + "step": 366 + }, + { + "epoch": 0.00736, + "grad_norm": 5.308606147766113, + "learning_rate": 1.472e-06, + "loss": 0.2501, + "step": 368 + }, + { + "epoch": 0.0074, + "grad_norm": 5.858297824859619, + "learning_rate": 1.48e-06, + "loss": 0.3287, + "step": 370 + }, + { + "epoch": 0.00744, + "grad_norm": 3.771726369857788, + "learning_rate": 1.488e-06, + "loss": 0.1711, + "step": 372 + }, + { + "epoch": 0.00748, + "grad_norm": 3.5179944038391113, + "learning_rate": 1.4960000000000002e-06, + "loss": 0.2322, + "step": 374 + }, + { + "epoch": 0.00752, + "grad_norm": 6.182915687561035, + "learning_rate": 1.5040000000000001e-06, + "loss": 0.2645, + "step": 376 + }, + { + "epoch": 0.00756, + "grad_norm": 5.271385669708252, + "learning_rate": 1.512e-06, + "loss": 0.3291, + "step": 378 + }, + { + "epoch": 0.0076, + "grad_norm": 3.943516254425049, + "learning_rate": 1.52e-06, + "loss": 0.2747, + "step": 380 + }, + { + "epoch": 0.00764, + "grad_norm": 5.27083683013916, + "learning_rate": 1.528e-06, + "loss": 0.2909, + "step": 382 + }, + { + "epoch": 0.00768, + "grad_norm": 4.500430583953857, + "learning_rate": 1.536e-06, + "loss": 0.1962, + "step": 384 + }, + { + "epoch": 0.00772, + "grad_norm": 4.612331390380859, + "learning_rate": 1.5440000000000002e-06, + "loss": 0.2571, + "step": 386 + }, + { + "epoch": 0.00776, + "grad_norm": 4.993540287017822, + "learning_rate": 1.5520000000000001e-06, + "loss": 0.245, + "step": 388 + }, + { + "epoch": 0.0078, + "grad_norm": 4.826557636260986, + "learning_rate": 1.56e-06, + "loss": 0.2448, + "step": 390 + }, + { + "epoch": 0.00784, + "grad_norm": 4.119671821594238, + "learning_rate": 1.568e-06, + "loss": 0.1949, + "step": 392 + }, + { + "epoch": 0.00788, + "grad_norm": 3.617347002029419, + "learning_rate": 1.576e-06, + "loss": 0.1777, + "step": 394 + }, + { + "epoch": 0.00792, + "grad_norm": 3.8245086669921875, + "learning_rate": 1.5840000000000002e-06, + "loss": 0.2524, + "step": 396 + }, + { + "epoch": 0.00796, + "grad_norm": 3.7713558673858643, + "learning_rate": 1.5920000000000002e-06, + "loss": 0.1608, + "step": 398 + }, + { + "epoch": 0.008, + "grad_norm": 6.6342854499816895, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.3402, + "step": 400 + }, + { + "epoch": 0.00804, + "grad_norm": 6.32325553894043, + "learning_rate": 1.608e-06, + "loss": 0.2424, + "step": 402 + }, + { + "epoch": 0.00808, + "grad_norm": 6.334165096282959, + "learning_rate": 1.616e-06, + "loss": 0.2498, + "step": 404 + }, + { + "epoch": 0.00812, + "grad_norm": 3.9557228088378906, + "learning_rate": 1.624e-06, + "loss": 0.2574, + "step": 406 + }, + { + "epoch": 0.00816, + "grad_norm": 3.047877073287964, + "learning_rate": 1.6320000000000002e-06, + "loss": 0.125, + "step": 408 + }, + { + "epoch": 0.0082, + "grad_norm": 3.6898391246795654, + "learning_rate": 1.6400000000000002e-06, + "loss": 0.153, + "step": 410 + }, + { + "epoch": 0.00824, + "grad_norm": 6.637157440185547, + "learning_rate": 1.6480000000000001e-06, + "loss": 0.2562, + "step": 412 + }, + { + "epoch": 0.00828, + "grad_norm": 3.2268526554107666, + "learning_rate": 1.6560000000000001e-06, + "loss": 0.2362, + "step": 414 + }, + { + "epoch": 0.00832, + "grad_norm": 6.445939540863037, + "learning_rate": 1.664e-06, + "loss": 0.2562, + "step": 416 + }, + { + "epoch": 0.00836, + "grad_norm": 3.3353357315063477, + "learning_rate": 1.672e-06, + "loss": 0.1312, + "step": 418 + }, + { + "epoch": 0.0084, + "grad_norm": 7.240478992462158, + "learning_rate": 1.6800000000000002e-06, + "loss": 0.2705, + "step": 420 + }, + { + "epoch": 0.00844, + "grad_norm": 6.92221736907959, + "learning_rate": 1.6880000000000002e-06, + "loss": 0.2854, + "step": 422 + }, + { + "epoch": 0.00848, + "grad_norm": 6.538597583770752, + "learning_rate": 1.6960000000000002e-06, + "loss": 0.3817, + "step": 424 + }, + { + "epoch": 0.00852, + "grad_norm": 3.6251144409179688, + "learning_rate": 1.7040000000000001e-06, + "loss": 0.1316, + "step": 426 + }, + { + "epoch": 0.00856, + "grad_norm": 3.1985955238342285, + "learning_rate": 1.712e-06, + "loss": 0.1311, + "step": 428 + }, + { + "epoch": 0.0086, + "grad_norm": 6.0755839347839355, + "learning_rate": 1.72e-06, + "loss": 0.3401, + "step": 430 + }, + { + "epoch": 0.00864, + "grad_norm": 3.303703784942627, + "learning_rate": 1.7280000000000002e-06, + "loss": 0.2235, + "step": 432 + }, + { + "epoch": 0.00868, + "grad_norm": 4.459012031555176, + "learning_rate": 1.7360000000000002e-06, + "loss": 0.1948, + "step": 434 + }, + { + "epoch": 0.00872, + "grad_norm": 4.711503505706787, + "learning_rate": 1.7440000000000002e-06, + "loss": 0.2443, + "step": 436 + }, + { + "epoch": 0.00876, + "grad_norm": 3.690824031829834, + "learning_rate": 1.7520000000000001e-06, + "loss": 0.2159, + "step": 438 + }, + { + "epoch": 0.0088, + "grad_norm": 5.640236854553223, + "learning_rate": 1.76e-06, + "loss": 0.2663, + "step": 440 + }, + { + "epoch": 0.00884, + "grad_norm": 4.592121124267578, + "learning_rate": 1.7680000000000003e-06, + "loss": 0.2235, + "step": 442 + }, + { + "epoch": 0.00888, + "grad_norm": 5.544561386108398, + "learning_rate": 1.7760000000000002e-06, + "loss": 0.2675, + "step": 444 + }, + { + "epoch": 0.00892, + "grad_norm": 4.2949981689453125, + "learning_rate": 1.7840000000000002e-06, + "loss": 0.204, + "step": 446 + }, + { + "epoch": 0.00896, + "grad_norm": 4.239881992340088, + "learning_rate": 1.7920000000000002e-06, + "loss": 0.2234, + "step": 448 + }, + { + "epoch": 0.009, + "grad_norm": 4.900006294250488, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.2549, + "step": 450 + }, + { + "epoch": 0.00904, + "grad_norm": 4.744946479797363, + "learning_rate": 1.808e-06, + "loss": 0.2361, + "step": 452 + }, + { + "epoch": 0.00908, + "grad_norm": 4.410623550415039, + "learning_rate": 1.8160000000000003e-06, + "loss": 0.2049, + "step": 454 + }, + { + "epoch": 0.00912, + "grad_norm": 4.314598560333252, + "learning_rate": 1.8240000000000002e-06, + "loss": 0.2344, + "step": 456 + }, + { + "epoch": 0.00916, + "grad_norm": 5.072439193725586, + "learning_rate": 1.8320000000000002e-06, + "loss": 0.2075, + "step": 458 + }, + { + "epoch": 0.0092, + "grad_norm": 4.726014614105225, + "learning_rate": 1.8400000000000002e-06, + "loss": 0.2442, + "step": 460 + }, + { + "epoch": 0.00924, + "grad_norm": 4.00297737121582, + "learning_rate": 1.8480000000000001e-06, + "loss": 0.2139, + "step": 462 + }, + { + "epoch": 0.00928, + "grad_norm": 4.581110000610352, + "learning_rate": 1.856e-06, + "loss": 0.2137, + "step": 464 + }, + { + "epoch": 0.00932, + "grad_norm": 3.906006336212158, + "learning_rate": 1.8640000000000003e-06, + "loss": 0.1857, + "step": 466 + }, + { + "epoch": 0.00936, + "grad_norm": 5.37385368347168, + "learning_rate": 1.8720000000000002e-06, + "loss": 0.2662, + "step": 468 + }, + { + "epoch": 0.0094, + "grad_norm": 4.2370686531066895, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.2341, + "step": 470 + }, + { + "epoch": 0.00944, + "grad_norm": 4.088732719421387, + "learning_rate": 1.8880000000000002e-06, + "loss": 0.2039, + "step": 472 + }, + { + "epoch": 0.00948, + "grad_norm": 4.685936450958252, + "learning_rate": 1.8960000000000001e-06, + "loss": 0.2336, + "step": 474 + }, + { + "epoch": 0.00952, + "grad_norm": 3.7215142250061035, + "learning_rate": 1.9040000000000003e-06, + "loss": 0.1774, + "step": 476 + }, + { + "epoch": 0.00956, + "grad_norm": 6.04464864730835, + "learning_rate": 1.912e-06, + "loss": 0.2571, + "step": 478 + }, + { + "epoch": 0.0096, + "grad_norm": 6.5656633377075195, + "learning_rate": 1.9200000000000003e-06, + "loss": 0.3415, + "step": 480 + }, + { + "epoch": 0.00964, + "grad_norm": 3.683687448501587, + "learning_rate": 1.928e-06, + "loss": 0.1601, + "step": 482 + }, + { + "epoch": 0.00968, + "grad_norm": 2.896716594696045, + "learning_rate": 1.936e-06, + "loss": 0.1646, + "step": 484 + }, + { + "epoch": 0.00972, + "grad_norm": 5.5879998207092285, + "learning_rate": 1.944e-06, + "loss": 0.2894, + "step": 486 + }, + { + "epoch": 0.00976, + "grad_norm": 6.27906608581543, + "learning_rate": 1.952e-06, + "loss": 0.2733, + "step": 488 + }, + { + "epoch": 0.0098, + "grad_norm": 5.570860862731934, + "learning_rate": 1.9600000000000003e-06, + "loss": 0.2774, + "step": 490 + }, + { + "epoch": 0.00984, + "grad_norm": 4.2514567375183105, + "learning_rate": 1.968e-06, + "loss": 0.1949, + "step": 492 + }, + { + "epoch": 0.00988, + "grad_norm": 6.043796539306641, + "learning_rate": 1.9760000000000002e-06, + "loss": 0.2555, + "step": 494 + }, + { + "epoch": 0.00992, + "grad_norm": 4.455687046051025, + "learning_rate": 1.984e-06, + "loss": 0.2333, + "step": 496 + }, + { + "epoch": 0.00996, + "grad_norm": 3.371537208557129, + "learning_rate": 1.992e-06, + "loss": 0.2112, + "step": 498 + }, + { + "epoch": 0.01, + "grad_norm": 5.2254438400268555, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.2773, + "step": 500 + }, + { + "epoch": 0.01004, + "grad_norm": 3.8979439735412598, + "learning_rate": 2.008e-06, + "loss": 0.1852, + "step": 502 + }, + { + "epoch": 0.01008, + "grad_norm": 5.364044189453125, + "learning_rate": 2.0160000000000003e-06, + "loss": 0.2553, + "step": 504 + }, + { + "epoch": 0.01012, + "grad_norm": 4.6848368644714355, + "learning_rate": 2.024e-06, + "loss": 0.2034, + "step": 506 + }, + { + "epoch": 0.01016, + "grad_norm": 3.220750570297241, + "learning_rate": 2.032e-06, + "loss": 0.1379, + "step": 508 + }, + { + "epoch": 0.0102, + "grad_norm": 3.354771614074707, + "learning_rate": 2.04e-06, + "loss": 0.1307, + "step": 510 + }, + { + "epoch": 0.01024, + "grad_norm": 2.584714889526367, + "learning_rate": 2.048e-06, + "loss": 0.2382, + "step": 512 + }, + { + "epoch": 0.01028, + "grad_norm": 7.172134876251221, + "learning_rate": 2.0560000000000003e-06, + "loss": 0.4274, + "step": 514 + }, + { + "epoch": 0.01032, + "grad_norm": 6.903685569763184, + "learning_rate": 2.064e-06, + "loss": 0.2524, + "step": 516 + }, + { + "epoch": 0.01036, + "grad_norm": 2.7015697956085205, + "learning_rate": 2.0720000000000002e-06, + "loss": 0.288, + "step": 518 + }, + { + "epoch": 0.0104, + "grad_norm": 2.7807185649871826, + "learning_rate": 2.08e-06, + "loss": 0.2295, + "step": 520 + }, + { + "epoch": 0.01044, + "grad_norm": 3.1879405975341797, + "learning_rate": 2.088e-06, + "loss": 0.1379, + "step": 522 + }, + { + "epoch": 0.01048, + "grad_norm": 4.02957010269165, + "learning_rate": 2.0960000000000003e-06, + "loss": 0.1535, + "step": 524 + }, + { + "epoch": 0.01052, + "grad_norm": 6.380918025970459, + "learning_rate": 2.104e-06, + "loss": 0.2285, + "step": 526 + }, + { + "epoch": 0.01056, + "grad_norm": 7.242382049560547, + "learning_rate": 2.1120000000000003e-06, + "loss": 0.4096, + "step": 528 + }, + { + "epoch": 0.0106, + "grad_norm": 3.4207797050476074, + "learning_rate": 2.12e-06, + "loss": 0.1389, + "step": 530 + }, + { + "epoch": 0.01064, + "grad_norm": 5.617804050445557, + "learning_rate": 2.128e-06, + "loss": 0.3137, + "step": 532 + }, + { + "epoch": 0.01068, + "grad_norm": 4.935652732849121, + "learning_rate": 2.1360000000000004e-06, + "loss": 0.2387, + "step": 534 + }, + { + "epoch": 0.01072, + "grad_norm": 4.834444522857666, + "learning_rate": 2.144e-06, + "loss": 0.3291, + "step": 536 + }, + { + "epoch": 0.01076, + "grad_norm": 4.746677875518799, + "learning_rate": 2.1520000000000003e-06, + "loss": 0.2234, + "step": 538 + }, + { + "epoch": 0.0108, + "grad_norm": 4.172887325286865, + "learning_rate": 2.16e-06, + "loss": 0.2554, + "step": 540 + }, + { + "epoch": 0.01084, + "grad_norm": 3.9458229541778564, + "learning_rate": 2.1680000000000002e-06, + "loss": 0.2438, + "step": 542 + }, + { + "epoch": 0.01088, + "grad_norm": 4.61482048034668, + "learning_rate": 2.176e-06, + "loss": 0.2659, + "step": 544 + }, + { + "epoch": 0.01092, + "grad_norm": 4.089561939239502, + "learning_rate": 2.184e-06, + "loss": 0.2139, + "step": 546 + }, + { + "epoch": 0.01096, + "grad_norm": 4.282458782196045, + "learning_rate": 2.1920000000000004e-06, + "loss": 0.2229, + "step": 548 + }, + { + "epoch": 0.011, + "grad_norm": 4.097751140594482, + "learning_rate": 2.2e-06, + "loss": 0.2129, + "step": 550 + }, + { + "epoch": 0.01104, + "grad_norm": 4.441385269165039, + "learning_rate": 2.2080000000000003e-06, + "loss": 0.2549, + "step": 552 + }, + { + "epoch": 0.01108, + "grad_norm": 4.363365650177002, + "learning_rate": 2.216e-06, + "loss": 0.244, + "step": 554 + }, + { + "epoch": 0.01112, + "grad_norm": 4.251750946044922, + "learning_rate": 2.2240000000000002e-06, + "loss": 0.2231, + "step": 556 + }, + { + "epoch": 0.01116, + "grad_norm": 4.919429779052734, + "learning_rate": 2.2320000000000004e-06, + "loss": 0.2437, + "step": 558 + }, + { + "epoch": 0.0112, + "grad_norm": 3.406036853790283, + "learning_rate": 2.24e-06, + "loss": 0.1865, + "step": 560 + }, + { + "epoch": 0.01124, + "grad_norm": 4.949376583099365, + "learning_rate": 2.2480000000000003e-06, + "loss": 0.2336, + "step": 562 + }, + { + "epoch": 0.01128, + "grad_norm": 4.6914381980896, + "learning_rate": 2.256e-06, + "loss": 0.2659, + "step": 564 + }, + { + "epoch": 0.01132, + "grad_norm": 3.416011095046997, + "learning_rate": 2.2640000000000003e-06, + "loss": 0.1763, + "step": 566 + }, + { + "epoch": 0.01136, + "grad_norm": 3.39005446434021, + "learning_rate": 2.2720000000000004e-06, + "loss": 0.1774, + "step": 568 + }, + { + "epoch": 0.0114, + "grad_norm": 6.063793659210205, + "learning_rate": 2.28e-06, + "loss": 0.2734, + "step": 570 + }, + { + "epoch": 0.01144, + "grad_norm": 3.4234580993652344, + "learning_rate": 2.2880000000000004e-06, + "loss": 0.2769, + "step": 572 + }, + { + "epoch": 0.01148, + "grad_norm": 6.639072895050049, + "learning_rate": 2.296e-06, + "loss": 0.2642, + "step": 574 + }, + { + "epoch": 0.01152, + "grad_norm": 6.489348411560059, + "learning_rate": 2.3040000000000003e-06, + "loss": 0.2554, + "step": 576 + }, + { + "epoch": 0.01156, + "grad_norm": 6.306353569030762, + "learning_rate": 2.312e-06, + "loss": 0.4099, + "step": 578 + }, + { + "epoch": 0.0116, + "grad_norm": 2.6609838008880615, + "learning_rate": 2.3200000000000002e-06, + "loss": 0.1183, + "step": 580 + }, + { + "epoch": 0.01164, + "grad_norm": 5.7523651123046875, + "learning_rate": 2.3280000000000004e-06, + "loss": 0.3667, + "step": 582 + }, + { + "epoch": 0.01168, + "grad_norm": 5.182748317718506, + "learning_rate": 2.336e-06, + "loss": 0.2385, + "step": 584 + }, + { + "epoch": 0.01172, + "grad_norm": 4.925599098205566, + "learning_rate": 2.3440000000000003e-06, + "loss": 0.2268, + "step": 586 + }, + { + "epoch": 0.01176, + "grad_norm": 3.742825508117676, + "learning_rate": 2.352e-06, + "loss": 0.2133, + "step": 588 + }, + { + "epoch": 0.0118, + "grad_norm": 4.115776538848877, + "learning_rate": 2.3600000000000003e-06, + "loss": 0.2439, + "step": 590 + }, + { + "epoch": 0.01184, + "grad_norm": 3.776080369949341, + "learning_rate": 2.3680000000000005e-06, + "loss": 0.1939, + "step": 592 + }, + { + "epoch": 0.01188, + "grad_norm": 3.732099771499634, + "learning_rate": 2.376e-06, + "loss": 0.2337, + "step": 594 + }, + { + "epoch": 0.01192, + "grad_norm": 3.6831634044647217, + "learning_rate": 2.3840000000000004e-06, + "loss": 0.1947, + "step": 596 + }, + { + "epoch": 0.01196, + "grad_norm": 4.625443458557129, + "learning_rate": 2.392e-06, + "loss": 0.2387, + "step": 598 + }, + { + "epoch": 0.012, + "grad_norm": 3.1420481204986572, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.2388, + "step": 600 + }, + { + "epoch": 0.01204, + "grad_norm": 4.736966609954834, + "learning_rate": 2.408e-06, + "loss": 0.2113, + "step": 602 + }, + { + "epoch": 0.01208, + "grad_norm": 4.696058750152588, + "learning_rate": 2.4160000000000002e-06, + "loss": 0.3153, + "step": 604 + }, + { + "epoch": 0.01212, + "grad_norm": 4.443164825439453, + "learning_rate": 2.4240000000000004e-06, + "loss": 0.3018, + "step": 606 + }, + { + "epoch": 0.01216, + "grad_norm": 4.112351417541504, + "learning_rate": 2.432e-06, + "loss": 0.2442, + "step": 608 + }, + { + "epoch": 0.0122, + "grad_norm": 3.943788528442383, + "learning_rate": 2.4400000000000004e-06, + "loss": 0.2228, + "step": 610 + }, + { + "epoch": 0.01224, + "grad_norm": 4.24630069732666, + "learning_rate": 2.448e-06, + "loss": 0.2549, + "step": 612 + }, + { + "epoch": 0.01228, + "grad_norm": 4.488097667694092, + "learning_rate": 2.4560000000000003e-06, + "loss": 0.2356, + "step": 614 + }, + { + "epoch": 0.01232, + "grad_norm": 3.2204904556274414, + "learning_rate": 2.4640000000000005e-06, + "loss": 0.1521, + "step": 616 + }, + { + "epoch": 0.01236, + "grad_norm": 2.9464430809020996, + "learning_rate": 2.4720000000000002e-06, + "loss": 0.2183, + "step": 618 + }, + { + "epoch": 0.0124, + "grad_norm": 4.890955924987793, + "learning_rate": 2.4800000000000004e-06, + "loss": 0.3017, + "step": 620 + }, + { + "epoch": 0.01244, + "grad_norm": 5.710669994354248, + "learning_rate": 2.488e-06, + "loss": 0.2871, + "step": 622 + }, + { + "epoch": 0.01248, + "grad_norm": 2.981384754180908, + "learning_rate": 2.496e-06, + "loss": 0.1885, + "step": 624 + }, + { + "epoch": 0.01252, + "grad_norm": 4.237135887145996, + "learning_rate": 2.5040000000000005e-06, + "loss": 0.2657, + "step": 626 + }, + { + "epoch": 0.01256, + "grad_norm": 4.312715530395508, + "learning_rate": 2.512e-06, + "loss": 0.2447, + "step": 628 + }, + { + "epoch": 0.0126, + "grad_norm": 4.359628200531006, + "learning_rate": 2.52e-06, + "loss": 0.2336, + "step": 630 + }, + { + "epoch": 0.01264, + "grad_norm": 4.290392875671387, + "learning_rate": 2.5280000000000006e-06, + "loss": 0.2151, + "step": 632 + }, + { + "epoch": 0.01268, + "grad_norm": 5.171399116516113, + "learning_rate": 2.536e-06, + "loss": 0.2647, + "step": 634 + }, + { + "epoch": 0.01272, + "grad_norm": 3.4068777561187744, + "learning_rate": 2.5440000000000005e-06, + "loss": 0.1853, + "step": 636 + }, + { + "epoch": 0.01276, + "grad_norm": 4.529778003692627, + "learning_rate": 2.552e-06, + "loss": 0.2891, + "step": 638 + }, + { + "epoch": 0.0128, + "grad_norm": 2.547908067703247, + "learning_rate": 2.56e-06, + "loss": 0.139, + "step": 640 + }, + { + "epoch": 0.01284, + "grad_norm": 4.91145133972168, + "learning_rate": 2.568e-06, + "loss": 0.3399, + "step": 642 + }, + { + "epoch": 0.01288, + "grad_norm": 2.8215854167938232, + "learning_rate": 2.576e-06, + "loss": 0.2556, + "step": 644 + }, + { + "epoch": 0.01292, + "grad_norm": 3.0611534118652344, + "learning_rate": 2.5840000000000006e-06, + "loss": 0.2389, + "step": 646 + }, + { + "epoch": 0.01296, + "grad_norm": 3.2263669967651367, + "learning_rate": 2.592e-06, + "loss": 0.2355, + "step": 648 + }, + { + "epoch": 0.013, + "grad_norm": 2.8610787391662598, + "learning_rate": 2.6e-06, + "loss": 0.1863, + "step": 650 + }, + { + "epoch": 0.01304, + "grad_norm": 4.183176040649414, + "learning_rate": 2.608e-06, + "loss": 0.2551, + "step": 652 + }, + { + "epoch": 0.01308, + "grad_norm": 3.785811185836792, + "learning_rate": 2.616e-06, + "loss": 0.2228, + "step": 654 + }, + { + "epoch": 0.01312, + "grad_norm": 4.537273406982422, + "learning_rate": 2.6240000000000006e-06, + "loss": 0.2891, + "step": 656 + }, + { + "epoch": 0.01316, + "grad_norm": 4.737910747528076, + "learning_rate": 2.632e-06, + "loss": 0.3139, + "step": 658 + }, + { + "epoch": 0.0132, + "grad_norm": 3.639007329940796, + "learning_rate": 2.64e-06, + "loss": 0.1942, + "step": 660 + }, + { + "epoch": 0.01324, + "grad_norm": 3.776348829269409, + "learning_rate": 2.648e-06, + "loss": 0.2434, + "step": 662 + }, + { + "epoch": 0.01328, + "grad_norm": 3.6177823543548584, + "learning_rate": 2.656e-06, + "loss": 0.2227, + "step": 664 + }, + { + "epoch": 0.01332, + "grad_norm": 3.8628368377685547, + "learning_rate": 2.6640000000000007e-06, + "loss": 0.1861, + "step": 666 + }, + { + "epoch": 0.01336, + "grad_norm": 3.719395637512207, + "learning_rate": 2.672e-06, + "loss": 0.2226, + "step": 668 + }, + { + "epoch": 0.0134, + "grad_norm": 3.764678955078125, + "learning_rate": 2.68e-06, + "loss": 0.2226, + "step": 670 + }, + { + "epoch": 0.01344, + "grad_norm": 4.256840705871582, + "learning_rate": 2.688e-06, + "loss": 0.2654, + "step": 672 + }, + { + "epoch": 0.01348, + "grad_norm": 3.649930477142334, + "learning_rate": 2.696e-06, + "loss": 0.2668, + "step": 674 + }, + { + "epoch": 0.01352, + "grad_norm": 3.1871910095214844, + "learning_rate": 2.704e-06, + "loss": 0.1759, + "step": 676 + }, + { + "epoch": 0.01356, + "grad_norm": 3.769956350326538, + "learning_rate": 2.712e-06, + "loss": 0.203, + "step": 678 + }, + { + "epoch": 0.0136, + "grad_norm": 3.3649752140045166, + "learning_rate": 2.7200000000000002e-06, + "loss": 0.1846, + "step": 680 + }, + { + "epoch": 0.01364, + "grad_norm": 4.298521518707275, + "learning_rate": 2.728e-06, + "loss": 0.2433, + "step": 682 + }, + { + "epoch": 0.01368, + "grad_norm": 4.070352077484131, + "learning_rate": 2.736e-06, + "loss": 0.2436, + "step": 684 + }, + { + "epoch": 0.01372, + "grad_norm": 3.905560255050659, + "learning_rate": 2.744e-06, + "loss": 0.2225, + "step": 686 + }, + { + "epoch": 0.01376, + "grad_norm": 4.8821187019348145, + "learning_rate": 2.752e-06, + "loss": 0.2775, + "step": 688 + }, + { + "epoch": 0.0138, + "grad_norm": 4.284414291381836, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.2889, + "step": 690 + }, + { + "epoch": 0.01384, + "grad_norm": 3.996843099594116, + "learning_rate": 2.768e-06, + "loss": 0.2542, + "step": 692 + }, + { + "epoch": 0.01388, + "grad_norm": 3.185818672180176, + "learning_rate": 2.776e-06, + "loss": 0.2132, + "step": 694 + }, + { + "epoch": 0.01392, + "grad_norm": 3.308976650238037, + "learning_rate": 2.784e-06, + "loss": 0.203, + "step": 696 + }, + { + "epoch": 0.01396, + "grad_norm": 4.243391036987305, + "learning_rate": 2.792e-06, + "loss": 0.2656, + "step": 698 + }, + { + "epoch": 0.014, + "grad_norm": 5.526286602020264, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.2627, + "step": 700 + }, + { + "epoch": 0.01404, + "grad_norm": 4.639754295349121, + "learning_rate": 2.808e-06, + "loss": 0.2474, + "step": 702 + }, + { + "epoch": 0.01408, + "grad_norm": 2.9134769439697266, + "learning_rate": 2.8160000000000002e-06, + "loss": 0.2303, + "step": 704 + }, + { + "epoch": 0.01412, + "grad_norm": 5.054844379425049, + "learning_rate": 2.824e-06, + "loss": 0.2568, + "step": 706 + }, + { + "epoch": 0.01416, + "grad_norm": 2.7024917602539062, + "learning_rate": 2.832e-06, + "loss": 0.1766, + "step": 708 + }, + { + "epoch": 0.0142, + "grad_norm": 4.263340950012207, + "learning_rate": 2.84e-06, + "loss": 0.2655, + "step": 710 + }, + { + "epoch": 0.01424, + "grad_norm": 4.348719596862793, + "learning_rate": 2.848e-06, + "loss": 0.2445, + "step": 712 + }, + { + "epoch": 0.01428, + "grad_norm": 4.259558200836182, + "learning_rate": 2.8560000000000003e-06, + "loss": 0.2654, + "step": 714 + }, + { + "epoch": 0.01432, + "grad_norm": 4.209328651428223, + "learning_rate": 2.864e-06, + "loss": 0.2548, + "step": 716 + }, + { + "epoch": 0.01436, + "grad_norm": 3.4755661487579346, + "learning_rate": 2.872e-06, + "loss": 0.2333, + "step": 718 + }, + { + "epoch": 0.0144, + "grad_norm": 3.4132802486419678, + "learning_rate": 2.88e-06, + "loss": 0.1846, + "step": 720 + }, + { + "epoch": 0.01444, + "grad_norm": 5.0501861572265625, + "learning_rate": 2.888e-06, + "loss": 0.251, + "step": 722 + }, + { + "epoch": 0.01448, + "grad_norm": 3.3669567108154297, + "learning_rate": 2.8960000000000003e-06, + "loss": 0.278, + "step": 724 + }, + { + "epoch": 0.01452, + "grad_norm": 4.937889575958252, + "learning_rate": 2.904e-06, + "loss": 0.2509, + "step": 726 + }, + { + "epoch": 0.01456, + "grad_norm": 4.605134010314941, + "learning_rate": 2.9120000000000002e-06, + "loss": 0.3133, + "step": 728 + }, + { + "epoch": 0.0146, + "grad_norm": 4.786699295043945, + "learning_rate": 2.92e-06, + "loss": 0.2427, + "step": 730 + }, + { + "epoch": 0.01464, + "grad_norm": 4.346502780914307, + "learning_rate": 2.928e-06, + "loss": 0.2773, + "step": 732 + }, + { + "epoch": 0.01468, + "grad_norm": 3.5423214435577393, + "learning_rate": 2.9360000000000003e-06, + "loss": 0.2236, + "step": 734 + }, + { + "epoch": 0.01472, + "grad_norm": 3.8420934677124023, + "learning_rate": 2.944e-06, + "loss": 0.196, + "step": 736 + }, + { + "epoch": 0.01476, + "grad_norm": 3.8465702533721924, + "learning_rate": 2.9520000000000003e-06, + "loss": 0.2324, + "step": 738 + }, + { + "epoch": 0.0148, + "grad_norm": 3.214421272277832, + "learning_rate": 2.96e-06, + "loss": 0.203, + "step": 740 + }, + { + "epoch": 0.01484, + "grad_norm": 4.110684871673584, + "learning_rate": 2.9680000000000002e-06, + "loss": 0.2434, + "step": 742 + }, + { + "epoch": 0.01488, + "grad_norm": 3.1024391651153564, + "learning_rate": 2.976e-06, + "loss": 0.2043, + "step": 744 + }, + { + "epoch": 0.01492, + "grad_norm": 4.000793933868408, + "learning_rate": 2.984e-06, + "loss": 0.254, + "step": 746 + }, + { + "epoch": 0.01496, + "grad_norm": 3.739203929901123, + "learning_rate": 2.9920000000000003e-06, + "loss": 0.2434, + "step": 748 + }, + { + "epoch": 0.015, + "grad_norm": 3.4272687435150146, + "learning_rate": 3e-06, + "loss": 0.2472, + "step": 750 + }, + { + "epoch": 0.01504, + "grad_norm": 3.367252826690674, + "learning_rate": 3.0080000000000003e-06, + "loss": 0.2446, + "step": 752 + }, + { + "epoch": 0.01508, + "grad_norm": 3.745511531829834, + "learning_rate": 3.016e-06, + "loss": 0.2667, + "step": 754 + }, + { + "epoch": 0.01512, + "grad_norm": 2.8954169750213623, + "learning_rate": 3.024e-06, + "loss": 0.1677, + "step": 756 + }, + { + "epoch": 0.01516, + "grad_norm": 3.9555675983428955, + "learning_rate": 3.0320000000000004e-06, + "loss": 0.2776, + "step": 758 + }, + { + "epoch": 0.0152, + "grad_norm": 3.2025487422943115, + "learning_rate": 3.04e-06, + "loss": 0.1783, + "step": 760 + }, + { + "epoch": 0.01524, + "grad_norm": 2.9037082195281982, + "learning_rate": 3.0480000000000003e-06, + "loss": 0.152, + "step": 762 + }, + { + "epoch": 0.01528, + "grad_norm": 2.8488287925720215, + "learning_rate": 3.056e-06, + "loss": 0.1446, + "step": 764 + }, + { + "epoch": 0.01532, + "grad_norm": 4.0185627937316895, + "learning_rate": 3.0640000000000002e-06, + "loss": 0.289, + "step": 766 + }, + { + "epoch": 0.01536, + "grad_norm": 4.888598442077637, + "learning_rate": 3.072e-06, + "loss": 0.2562, + "step": 768 + }, + { + "epoch": 0.0154, + "grad_norm": 4.536263465881348, + "learning_rate": 3.08e-06, + "loss": 0.3264, + "step": 770 + }, + { + "epoch": 0.01544, + "grad_norm": 4.0151262283325195, + "learning_rate": 3.0880000000000003e-06, + "loss": 0.2351, + "step": 772 + }, + { + "epoch": 0.01548, + "grad_norm": 4.296571254730225, + "learning_rate": 3.096e-06, + "loss": 0.3146, + "step": 774 + }, + { + "epoch": 0.01552, + "grad_norm": 2.991231679916382, + "learning_rate": 3.1040000000000003e-06, + "loss": 0.2149, + "step": 776 + }, + { + "epoch": 0.01556, + "grad_norm": 3.4674265384674072, + "learning_rate": 3.112e-06, + "loss": 0.1941, + "step": 778 + }, + { + "epoch": 0.0156, + "grad_norm": 3.7596969604492188, + "learning_rate": 3.12e-06, + "loss": 0.2547, + "step": 780 + }, + { + "epoch": 0.01564, + "grad_norm": 3.3881540298461914, + "learning_rate": 3.1280000000000004e-06, + "loss": 0.1846, + "step": 782 + }, + { + "epoch": 0.01568, + "grad_norm": 3.945168972015381, + "learning_rate": 3.136e-06, + "loss": 0.2654, + "step": 784 + }, + { + "epoch": 0.01572, + "grad_norm": 3.908146619796753, + "learning_rate": 3.1440000000000003e-06, + "loss": 0.254, + "step": 786 + }, + { + "epoch": 0.01576, + "grad_norm": 3.850041151046753, + "learning_rate": 3.152e-06, + "loss": 0.2324, + "step": 788 + }, + { + "epoch": 0.0158, + "grad_norm": 4.383365154266357, + "learning_rate": 3.1600000000000002e-06, + "loss": 0.2547, + "step": 790 + }, + { + "epoch": 0.01584, + "grad_norm": 3.869478940963745, + "learning_rate": 3.1680000000000004e-06, + "loss": 0.2665, + "step": 792 + }, + { + "epoch": 0.01588, + "grad_norm": 2.915801763534546, + "learning_rate": 3.176e-06, + "loss": 0.1763, + "step": 794 + }, + { + "epoch": 0.01592, + "grad_norm": 2.5089712142944336, + "learning_rate": 3.1840000000000003e-06, + "loss": 0.2281, + "step": 796 + }, + { + "epoch": 0.01596, + "grad_norm": 5.793497562408447, + "learning_rate": 3.192e-06, + "loss": 0.2722, + "step": 798 + }, + { + "epoch": 0.016, + "grad_norm": 5.330954074859619, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.3801, + "step": 800 + }, + { + "epoch": 0.01604, + "grad_norm": 5.492720127105713, + "learning_rate": 3.208e-06, + "loss": 0.2695, + "step": 802 + }, + { + "epoch": 0.01608, + "grad_norm": 2.6513822078704834, + "learning_rate": 3.216e-06, + "loss": 0.144, + "step": 804 + }, + { + "epoch": 0.01612, + "grad_norm": 4.511475086212158, + "learning_rate": 3.2240000000000004e-06, + "loss": 0.2595, + "step": 806 + }, + { + "epoch": 0.01616, + "grad_norm": 4.512204647064209, + "learning_rate": 3.232e-06, + "loss": 0.3132, + "step": 808 + }, + { + "epoch": 0.0162, + "grad_norm": 2.8798182010650635, + "learning_rate": 3.2400000000000003e-06, + "loss": 0.256, + "step": 810 + }, + { + "epoch": 0.01624, + "grad_norm": 3.07746958732605, + "learning_rate": 3.248e-06, + "loss": 0.2129, + "step": 812 + }, + { + "epoch": 0.01628, + "grad_norm": 3.8055660724639893, + "learning_rate": 3.2560000000000003e-06, + "loss": 0.2653, + "step": 814 + }, + { + "epoch": 0.01632, + "grad_norm": 3.8537862300872803, + "learning_rate": 3.2640000000000004e-06, + "loss": 0.2654, + "step": 816 + }, + { + "epoch": 0.01636, + "grad_norm": 3.763394355773926, + "learning_rate": 3.272e-06, + "loss": 0.2547, + "step": 818 + }, + { + "epoch": 0.0164, + "grad_norm": 3.733487129211426, + "learning_rate": 3.2800000000000004e-06, + "loss": 0.2431, + "step": 820 + }, + { + "epoch": 0.01644, + "grad_norm": 3.5346643924713135, + "learning_rate": 3.288e-06, + "loss": 0.2225, + "step": 822 + }, + { + "epoch": 0.01648, + "grad_norm": 4.021689414978027, + "learning_rate": 3.2960000000000003e-06, + "loss": 0.2668, + "step": 824 + }, + { + "epoch": 0.01652, + "grad_norm": 3.769071578979492, + "learning_rate": 3.3040000000000005e-06, + "loss": 0.2445, + "step": 826 + }, + { + "epoch": 0.01656, + "grad_norm": 3.4447572231292725, + "learning_rate": 3.3120000000000002e-06, + "loss": 0.2431, + "step": 828 + }, + { + "epoch": 0.0166, + "grad_norm": 3.038853168487549, + "learning_rate": 3.3200000000000004e-06, + "loss": 0.2224, + "step": 830 + }, + { + "epoch": 0.01664, + "grad_norm": 4.425490379333496, + "learning_rate": 3.328e-06, + "loss": 0.3014, + "step": 832 + }, + { + "epoch": 0.01668, + "grad_norm": 2.8128044605255127, + "learning_rate": 3.3360000000000003e-06, + "loss": 0.1934, + "step": 834 + }, + { + "epoch": 0.01672, + "grad_norm": 2.6545250415802, + "learning_rate": 3.344e-06, + "loss": 0.1758, + "step": 836 + }, + { + "epoch": 0.01676, + "grad_norm": 2.4833126068115234, + "learning_rate": 3.3520000000000003e-06, + "loss": 0.144, + "step": 838 + }, + { + "epoch": 0.0168, + "grad_norm": 2.2705655097961426, + "learning_rate": 3.3600000000000004e-06, + "loss": 0.255, + "step": 840 + }, + { + "epoch": 0.01684, + "grad_norm": 4.756883144378662, + "learning_rate": 3.368e-06, + "loss": 0.38, + "step": 842 + }, + { + "epoch": 0.01688, + "grad_norm": 2.2124199867248535, + "learning_rate": 3.3760000000000004e-06, + "loss": 0.2695, + "step": 844 + }, + { + "epoch": 0.01692, + "grad_norm": 2.4659252166748047, + "learning_rate": 3.384e-06, + "loss": 0.2482, + "step": 846 + }, + { + "epoch": 0.01696, + "grad_norm": 2.3692679405212402, + "learning_rate": 3.3920000000000003e-06, + "loss": 0.1442, + "step": 848 + }, + { + "epoch": 0.017, + "grad_norm": 2.321537733078003, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.2481, + "step": 850 + }, + { + "epoch": 0.01704, + "grad_norm": 2.3623554706573486, + "learning_rate": 3.4080000000000002e-06, + "loss": 0.2482, + "step": 852 + }, + { + "epoch": 0.01708, + "grad_norm": 2.4127564430236816, + "learning_rate": 3.4160000000000004e-06, + "loss": 0.1517, + "step": 854 + }, + { + "epoch": 0.01712, + "grad_norm": 4.422900199890137, + "learning_rate": 3.424e-06, + "loss": 0.3523, + "step": 856 + }, + { + "epoch": 0.01716, + "grad_norm": 2.4912867546081543, + "learning_rate": 3.4320000000000003e-06, + "loss": 0.1591, + "step": 858 + }, + { + "epoch": 0.0172, + "grad_norm": 2.3539979457855225, + "learning_rate": 3.44e-06, + "loss": 0.1597, + "step": 860 + }, + { + "epoch": 0.01724, + "grad_norm": 4.167235374450684, + "learning_rate": 3.4480000000000003e-06, + "loss": 0.235, + "step": 862 + }, + { + "epoch": 0.01728, + "grad_norm": 2.262232542037964, + "learning_rate": 3.4560000000000005e-06, + "loss": 0.2412, + "step": 864 + }, + { + "epoch": 0.01732, + "grad_norm": 2.4839165210723877, + "learning_rate": 3.464e-06, + "loss": 0.137, + "step": 866 + }, + { + "epoch": 0.01736, + "grad_norm": 4.557440757751465, + "learning_rate": 3.4720000000000004e-06, + "loss": 0.3661, + "step": 868 + }, + { + "epoch": 0.0174, + "grad_norm": 4.384861946105957, + "learning_rate": 3.48e-06, + "loss": 0.339, + "step": 870 + }, + { + "epoch": 0.01744, + "grad_norm": 4.111546516418457, + "learning_rate": 3.4880000000000003e-06, + "loss": 0.3132, + "step": 872 + }, + { + "epoch": 0.01748, + "grad_norm": 3.407968521118164, + "learning_rate": 3.4960000000000005e-06, + "loss": 0.2148, + "step": 874 + }, + { + "epoch": 0.01752, + "grad_norm": 2.930421829223633, + "learning_rate": 3.5040000000000002e-06, + "loss": 0.2124, + "step": 876 + }, + { + "epoch": 0.01756, + "grad_norm": 2.7104508876800537, + "learning_rate": 3.5120000000000004e-06, + "loss": 0.1936, + "step": 878 + }, + { + "epoch": 0.0176, + "grad_norm": 3.875896453857422, + "learning_rate": 3.52e-06, + "loss": 0.3015, + "step": 880 + }, + { + "epoch": 0.01764, + "grad_norm": 2.262103796005249, + "learning_rate": 3.5280000000000004e-06, + "loss": 0.1595, + "step": 882 + }, + { + "epoch": 0.01768, + "grad_norm": 2.070067882537842, + "learning_rate": 3.5360000000000005e-06, + "loss": 0.1375, + "step": 884 + }, + { + "epoch": 0.01772, + "grad_norm": 4.203448295593262, + "learning_rate": 3.5440000000000003e-06, + "loss": 0.4094, + "step": 886 + }, + { + "epoch": 0.01776, + "grad_norm": 1.730810284614563, + "learning_rate": 3.5520000000000005e-06, + "loss": 0.1177, + "step": 888 + }, + { + "epoch": 0.0178, + "grad_norm": 1.7230323553085327, + "learning_rate": 3.5600000000000002e-06, + "loss": 0.2635, + "step": 890 + }, + { + "epoch": 0.01784, + "grad_norm": 4.441695213317871, + "learning_rate": 3.5680000000000004e-06, + "loss": 0.294, + "step": 892 + }, + { + "epoch": 0.01788, + "grad_norm": 1.6931177377700806, + "learning_rate": 3.576e-06, + "loss": 0.2784, + "step": 894 + }, + { + "epoch": 0.01792, + "grad_norm": 1.7744909524917603, + "learning_rate": 3.5840000000000003e-06, + "loss": 0.2787, + "step": 896 + }, + { + "epoch": 0.01796, + "grad_norm": 1.4811129570007324, + "learning_rate": 3.5920000000000005e-06, + "loss": 0.112, + "step": 898 + }, + { + "epoch": 0.018, + "grad_norm": 1.8058747053146362, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.1306, + "step": 900 + }, + { + "epoch": 0.01804, + "grad_norm": 1.5712918043136597, + "learning_rate": 3.6080000000000004e-06, + "loss": 0.112, + "step": 902 + }, + { + "epoch": 0.01808, + "grad_norm": 1.5269100666046143, + "learning_rate": 3.616e-06, + "loss": 0.2577, + "step": 904 + }, + { + "epoch": 0.01812, + "grad_norm": 1.6885077953338623, + "learning_rate": 3.6240000000000004e-06, + "loss": 0.1121, + "step": 906 + }, + { + "epoch": 0.01816, + "grad_norm": 4.367637634277344, + "learning_rate": 3.6320000000000005e-06, + "loss": 0.4255, + "step": 908 + }, + { + "epoch": 0.0182, + "grad_norm": 1.6348453760147095, + "learning_rate": 3.6400000000000003e-06, + "loss": 0.2637, + "step": 910 + }, + { + "epoch": 0.01824, + "grad_norm": 3.4965250492095947, + "learning_rate": 3.6480000000000005e-06, + "loss": 0.2294, + "step": 912 + }, + { + "epoch": 0.01828, + "grad_norm": 1.9627796411514282, + "learning_rate": 3.6560000000000002e-06, + "loss": 0.1311, + "step": 914 + }, + { + "epoch": 0.01832, + "grad_norm": 3.7897562980651855, + "learning_rate": 3.6640000000000004e-06, + "loss": 0.3806, + "step": 916 + }, + { + "epoch": 0.01836, + "grad_norm": 1.8336730003356934, + "learning_rate": 3.6720000000000006e-06, + "loss": 0.2418, + "step": 918 + }, + { + "epoch": 0.0184, + "grad_norm": 3.544163942337036, + "learning_rate": 3.6800000000000003e-06, + "loss": 0.2355, + "step": 920 + }, + { + "epoch": 0.01844, + "grad_norm": 2.118680715560913, + "learning_rate": 3.6880000000000005e-06, + "loss": 0.1679, + "step": 922 + }, + { + "epoch": 0.01848, + "grad_norm": 2.075124979019165, + "learning_rate": 3.6960000000000003e-06, + "loss": 0.243, + "step": 924 + }, + { + "epoch": 0.01852, + "grad_norm": 3.3830387592315674, + "learning_rate": 3.7040000000000005e-06, + "loss": 0.3137, + "step": 926 + }, + { + "epoch": 0.01856, + "grad_norm": 2.5838842391967773, + "learning_rate": 3.712e-06, + "loss": 0.2448, + "step": 928 + }, + { + "epoch": 0.0186, + "grad_norm": 2.516493797302246, + "learning_rate": 3.7200000000000004e-06, + "loss": 0.2029, + "step": 930 + }, + { + "epoch": 0.01864, + "grad_norm": 3.094906806945801, + "learning_rate": 3.7280000000000006e-06, + "loss": 0.2541, + "step": 932 + }, + { + "epoch": 0.01868, + "grad_norm": 2.7972841262817383, + "learning_rate": 3.7360000000000003e-06, + "loss": 0.2548, + "step": 934 + }, + { + "epoch": 0.01872, + "grad_norm": 2.7139720916748047, + "learning_rate": 3.7440000000000005e-06, + "loss": 0.2225, + "step": 936 + }, + { + "epoch": 0.01876, + "grad_norm": 2.825547933578491, + "learning_rate": 3.7520000000000002e-06, + "loss": 0.2432, + "step": 938 + }, + { + "epoch": 0.0188, + "grad_norm": 3.081465005874634, + "learning_rate": 3.7600000000000004e-06, + "loss": 0.2332, + "step": 940 + }, + { + "epoch": 0.01884, + "grad_norm": 2.708775281906128, + "learning_rate": 3.7680000000000006e-06, + "loss": 0.2226, + "step": 942 + }, + { + "epoch": 0.01888, + "grad_norm": 2.744302749633789, + "learning_rate": 3.7760000000000004e-06, + "loss": 0.2332, + "step": 944 + }, + { + "epoch": 0.01892, + "grad_norm": 2.764401912689209, + "learning_rate": 3.7840000000000005e-06, + "loss": 0.2225, + "step": 946 + }, + { + "epoch": 0.01896, + "grad_norm": 2.543426513671875, + "learning_rate": 3.7920000000000003e-06, + "loss": 0.2125, + "step": 948 + }, + { + "epoch": 0.019, + "grad_norm": 2.656022071838379, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.2125, + "step": 950 + }, + { + "epoch": 0.01904, + "grad_norm": 2.8062491416931152, + "learning_rate": 3.8080000000000006e-06, + "loss": 0.2239, + "step": 952 + }, + { + "epoch": 0.01908, + "grad_norm": 3.3187570571899414, + "learning_rate": 3.816e-06, + "loss": 0.2385, + "step": 954 + }, + { + "epoch": 0.01912, + "grad_norm": 2.2662224769592285, + "learning_rate": 3.824e-06, + "loss": 0.2263, + "step": 956 + }, + { + "epoch": 0.01916, + "grad_norm": 1.9490553140640259, + "learning_rate": 3.832e-06, + "loss": 0.152, + "step": 958 + }, + { + "epoch": 0.0192, + "grad_norm": 1.9136005640029907, + "learning_rate": 3.8400000000000005e-06, + "loss": 0.152, + "step": 960 + }, + { + "epoch": 0.01924, + "grad_norm": 2.1286721229553223, + "learning_rate": 3.848e-06, + "loss": 0.152, + "step": 962 + }, + { + "epoch": 0.01928, + "grad_norm": 3.4795660972595215, + "learning_rate": 3.856e-06, + "loss": 0.3396, + "step": 964 + }, + { + "epoch": 0.01932, + "grad_norm": 1.7733250856399536, + "learning_rate": 3.864000000000001e-06, + "loss": 0.1304, + "step": 966 + }, + { + "epoch": 0.01936, + "grad_norm": 3.968585968017578, + "learning_rate": 3.872e-06, + "loss": 0.2492, + "step": 968 + }, + { + "epoch": 0.0194, + "grad_norm": 1.6123791933059692, + "learning_rate": 3.88e-06, + "loss": 0.2354, + "step": 970 + }, + { + "epoch": 0.01944, + "grad_norm": 1.8001261949539185, + "learning_rate": 3.888e-06, + "loss": 0.2701, + "step": 972 + }, + { + "epoch": 0.01948, + "grad_norm": 3.965688467025757, + "learning_rate": 3.8960000000000005e-06, + "loss": 0.3805, + "step": 974 + }, + { + "epoch": 0.01952, + "grad_norm": 3.8302969932556152, + "learning_rate": 3.904e-06, + "loss": 0.2491, + "step": 976 + }, + { + "epoch": 0.01956, + "grad_norm": 3.4181478023529053, + "learning_rate": 3.912e-06, + "loss": 0.2036, + "step": 978 + }, + { + "epoch": 0.0196, + "grad_norm": 2.2215044498443604, + "learning_rate": 3.920000000000001e-06, + "loss": 0.1595, + "step": 980 + }, + { + "epoch": 0.01964, + "grad_norm": 2.3322677612304688, + "learning_rate": 3.928e-06, + "loss": 0.1678, + "step": 982 + }, + { + "epoch": 0.01968, + "grad_norm": 2.3459746837615967, + "learning_rate": 3.936e-06, + "loss": 0.2384, + "step": 984 + }, + { + "epoch": 0.01972, + "grad_norm": 3.4067084789276123, + "learning_rate": 3.944e-06, + "loss": 0.301, + "step": 986 + }, + { + "epoch": 0.01976, + "grad_norm": 3.4219253063201904, + "learning_rate": 3.9520000000000004e-06, + "loss": 0.3134, + "step": 988 + }, + { + "epoch": 0.0198, + "grad_norm": 3.324173927307129, + "learning_rate": 3.96e-06, + "loss": 0.2352, + "step": 990 + }, + { + "epoch": 0.01984, + "grad_norm": 2.7165307998657227, + "learning_rate": 3.968e-06, + "loss": 0.2029, + "step": 992 + }, + { + "epoch": 0.01988, + "grad_norm": 2.749866008758545, + "learning_rate": 3.9760000000000006e-06, + "loss": 0.2224, + "step": 994 + }, + { + "epoch": 0.01992, + "grad_norm": 2.6545495986938477, + "learning_rate": 3.984e-06, + "loss": 0.2224, + "step": 996 + }, + { + "epoch": 0.01996, + "grad_norm": 3.3177623748779297, + "learning_rate": 3.992e-06, + "loss": 0.2654, + "step": 998 + }, + { + "epoch": 0.02, + "grad_norm": 3.1023192405700684, + "learning_rate": 4.000000000000001e-06, + "loss": 0.2431, + "step": 1000 + }, + { + "epoch": 0.02004, + "grad_norm": 2.7860546112060547, + "learning_rate": 4.008e-06, + "loss": 0.2131, + "step": 1002 + }, + { + "epoch": 0.02008, + "grad_norm": 2.4495904445648193, + "learning_rate": 4.016e-06, + "loss": 0.2237, + "step": 1004 + }, + { + "epoch": 0.02012, + "grad_norm": 3.059225559234619, + "learning_rate": 4.024e-06, + "loss": 0.2325, + "step": 1006 + }, + { + "epoch": 0.02016, + "grad_norm": 2.7723662853240967, + "learning_rate": 4.0320000000000005e-06, + "loss": 0.2445, + "step": 1008 + }, + { + "epoch": 0.0202, + "grad_norm": 2.6414833068847656, + "learning_rate": 4.04e-06, + "loss": 0.2028, + "step": 1010 + }, + { + "epoch": 0.02024, + "grad_norm": 2.913097858428955, + "learning_rate": 4.048e-06, + "loss": 0.2324, + "step": 1012 + }, + { + "epoch": 0.02028, + "grad_norm": 2.943275213241577, + "learning_rate": 4.056000000000001e-06, + "loss": 0.2237, + "step": 1014 + }, + { + "epoch": 0.02032, + "grad_norm": 3.284144163131714, + "learning_rate": 4.064e-06, + "loss": 0.2565, + "step": 1016 + }, + { + "epoch": 0.02036, + "grad_norm": 2.550537109375, + "learning_rate": 4.072e-06, + "loss": 0.1936, + "step": 1018 + }, + { + "epoch": 0.0204, + "grad_norm": 2.4376444816589355, + "learning_rate": 4.08e-06, + "loss": 0.2509, + "step": 1020 + }, + { + "epoch": 0.02044, + "grad_norm": 3.4603254795074463, + "learning_rate": 4.0880000000000005e-06, + "loss": 0.2887, + "step": 1022 + }, + { + "epoch": 0.02048, + "grad_norm": 3.218043565750122, + "learning_rate": 4.096e-06, + "loss": 0.2655, + "step": 1024 + }, + { + "epoch": 0.02052, + "grad_norm": 2.8557794094085693, + "learning_rate": 4.104e-06, + "loss": 0.2225, + "step": 1026 + }, + { + "epoch": 0.02056, + "grad_norm": 2.812446355819702, + "learning_rate": 4.112000000000001e-06, + "loss": 0.2564, + "step": 1028 + }, + { + "epoch": 0.0206, + "grad_norm": 3.0725901126861572, + "learning_rate": 4.12e-06, + "loss": 0.2444, + "step": 1030 + }, + { + "epoch": 0.02064, + "grad_norm": 3.576932668685913, + "learning_rate": 4.128e-06, + "loss": 0.2546, + "step": 1032 + }, + { + "epoch": 0.02068, + "grad_norm": 2.7877004146575928, + "learning_rate": 4.136000000000001e-06, + "loss": 0.2029, + "step": 1034 + }, + { + "epoch": 0.02072, + "grad_norm": 2.5440382957458496, + "learning_rate": 4.1440000000000005e-06, + "loss": 0.2238, + "step": 1036 + }, + { + "epoch": 0.02076, + "grad_norm": 3.158332109451294, + "learning_rate": 4.152e-06, + "loss": 0.2431, + "step": 1038 + }, + { + "epoch": 0.0208, + "grad_norm": 3.0014407634735107, + "learning_rate": 4.16e-06, + "loss": 0.2664, + "step": 1040 + }, + { + "epoch": 0.02084, + "grad_norm": 3.259829044342041, + "learning_rate": 4.168000000000001e-06, + "loss": 0.2653, + "step": 1042 + }, + { + "epoch": 0.02088, + "grad_norm": 2.531860113143921, + "learning_rate": 4.176e-06, + "loss": 0.2331, + "step": 1044 + }, + { + "epoch": 0.02092, + "grad_norm": 3.260572671890259, + "learning_rate": 4.184e-06, + "loss": 0.2541, + "step": 1046 + }, + { + "epoch": 0.02096, + "grad_norm": 2.9908628463745117, + "learning_rate": 4.192000000000001e-06, + "loss": 0.2326, + "step": 1048 + }, + { + "epoch": 0.021, + "grad_norm": 3.373250722885132, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.2236, + "step": 1050 + }, + { + "epoch": 0.02104, + "grad_norm": 2.4832353591918945, + "learning_rate": 4.208e-06, + "loss": 0.1933, + "step": 1052 + }, + { + "epoch": 0.02108, + "grad_norm": 2.094811201095581, + "learning_rate": 4.216e-06, + "loss": 0.2301, + "step": 1054 + }, + { + "epoch": 0.02112, + "grad_norm": 3.3867886066436768, + "learning_rate": 4.2240000000000006e-06, + "loss": 0.3007, + "step": 1056 + }, + { + "epoch": 0.02116, + "grad_norm": 3.2629079818725586, + "learning_rate": 4.232e-06, + "loss": 0.326, + "step": 1058 + }, + { + "epoch": 0.0212, + "grad_norm": 2.0626964569091797, + "learning_rate": 4.24e-06, + "loss": 0.1676, + "step": 1060 + }, + { + "epoch": 0.02124, + "grad_norm": 3.1327996253967285, + "learning_rate": 4.248000000000001e-06, + "loss": 0.2262, + "step": 1062 + }, + { + "epoch": 0.02128, + "grad_norm": 3.2905337810516357, + "learning_rate": 4.256e-06, + "loss": 0.2301, + "step": 1064 + }, + { + "epoch": 0.02132, + "grad_norm": 2.7409427165985107, + "learning_rate": 4.264e-06, + "loss": 0.2565, + "step": 1066 + }, + { + "epoch": 0.02136, + "grad_norm": 2.393648147583008, + "learning_rate": 4.272000000000001e-06, + "loss": 0.1846, + "step": 1068 + }, + { + "epoch": 0.0214, + "grad_norm": 2.9877490997314453, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.2652, + "step": 1070 + }, + { + "epoch": 0.02144, + "grad_norm": 2.9239816665649414, + "learning_rate": 4.288e-06, + "loss": 0.2223, + "step": 1072 + }, + { + "epoch": 0.02148, + "grad_norm": 2.865799903869629, + "learning_rate": 4.296e-06, + "loss": 0.2431, + "step": 1074 + }, + { + "epoch": 0.02152, + "grad_norm": 3.4055466651916504, + "learning_rate": 4.304000000000001e-06, + "loss": 0.2261, + "step": 1076 + }, + { + "epoch": 0.02156, + "grad_norm": 2.608046531677246, + "learning_rate": 4.312e-06, + "loss": 0.2234, + "step": 1078 + }, + { + "epoch": 0.0216, + "grad_norm": 3.257391929626465, + "learning_rate": 4.32e-06, + "loss": 0.2544, + "step": 1080 + }, + { + "epoch": 0.02164, + "grad_norm": 2.6331868171691895, + "learning_rate": 4.328000000000001e-06, + "loss": 0.2025, + "step": 1082 + }, + { + "epoch": 0.02168, + "grad_norm": 2.6797642707824707, + "learning_rate": 4.3360000000000005e-06, + "loss": 0.2127, + "step": 1084 + }, + { + "epoch": 0.02172, + "grad_norm": 3.522657871246338, + "learning_rate": 4.344e-06, + "loss": 0.2664, + "step": 1086 + }, + { + "epoch": 0.02176, + "grad_norm": 2.922511577606201, + "learning_rate": 4.352e-06, + "loss": 0.2222, + "step": 1088 + }, + { + "epoch": 0.0218, + "grad_norm": 3.489651679992676, + "learning_rate": 4.360000000000001e-06, + "loss": 0.2885, + "step": 1090 + }, + { + "epoch": 0.02184, + "grad_norm": 2.914304256439209, + "learning_rate": 4.368e-06, + "loss": 0.2127, + "step": 1092 + }, + { + "epoch": 0.02188, + "grad_norm": 3.284898042678833, + "learning_rate": 4.376e-06, + "loss": 0.2348, + "step": 1094 + }, + { + "epoch": 0.02192, + "grad_norm": 3.0209121704101562, + "learning_rate": 4.384000000000001e-06, + "loss": 0.2537, + "step": 1096 + }, + { + "epoch": 0.02196, + "grad_norm": 2.7386600971221924, + "learning_rate": 4.3920000000000005e-06, + "loss": 0.2224, + "step": 1098 + }, + { + "epoch": 0.022, + "grad_norm": 3.0613341331481934, + "learning_rate": 4.4e-06, + "loss": 0.2887, + "step": 1100 + }, + { + "epoch": 0.02204, + "grad_norm": 3.1013271808624268, + "learning_rate": 4.408000000000001e-06, + "loss": 0.2444, + "step": 1102 + }, + { + "epoch": 0.02208, + "grad_norm": 2.113025665283203, + "learning_rate": 4.416000000000001e-06, + "loss": 0.2382, + "step": 1104 + }, + { + "epoch": 0.02212, + "grad_norm": 3.6046388149261475, + "learning_rate": 4.424e-06, + "loss": 0.2783, + "step": 1106 + }, + { + "epoch": 0.02216, + "grad_norm": 3.1204144954681396, + "learning_rate": 4.432e-06, + "loss": 0.2888, + "step": 1108 + }, + { + "epoch": 0.0222, + "grad_norm": 2.1236653327941895, + "learning_rate": 4.440000000000001e-06, + "loss": 0.1676, + "step": 1110 + }, + { + "epoch": 0.02224, + "grad_norm": 2.9885830879211426, + "learning_rate": 4.4480000000000004e-06, + "loss": 0.3013, + "step": 1112 + }, + { + "epoch": 0.02228, + "grad_norm": 2.7760486602783203, + "learning_rate": 4.456e-06, + "loss": 0.2772, + "step": 1114 + }, + { + "epoch": 0.02232, + "grad_norm": 3.1188278198242188, + "learning_rate": 4.464000000000001e-06, + "loss": 0.2443, + "step": 1116 + }, + { + "epoch": 0.02236, + "grad_norm": 2.8230817317962646, + "learning_rate": 4.4720000000000006e-06, + "loss": 0.2234, + "step": 1118 + }, + { + "epoch": 0.0224, + "grad_norm": 1.9721142053604126, + "learning_rate": 4.48e-06, + "loss": 0.1597, + "step": 1120 + }, + { + "epoch": 0.02244, + "grad_norm": 3.853890895843506, + "learning_rate": 4.488e-06, + "loss": 0.2696, + "step": 1122 + }, + { + "epoch": 0.02248, + "grad_norm": 3.9122331142425537, + "learning_rate": 4.496000000000001e-06, + "loss": 0.38, + "step": 1124 + }, + { + "epoch": 0.02252, + "grad_norm": 1.8330825567245483, + "learning_rate": 4.504e-06, + "loss": 0.241, + "step": 1126 + }, + { + "epoch": 0.02256, + "grad_norm": 2.0358517169952393, + "learning_rate": 4.512e-06, + "loss": 0.1446, + "step": 1128 + }, + { + "epoch": 0.0226, + "grad_norm": 3.6766574382781982, + "learning_rate": 4.520000000000001e-06, + "loss": 0.3661, + "step": 1130 + }, + { + "epoch": 0.02264, + "grad_norm": 1.8279637098312378, + "learning_rate": 4.5280000000000005e-06, + "loss": 0.2482, + "step": 1132 + }, + { + "epoch": 0.02268, + "grad_norm": 2.02260684967041, + "learning_rate": 4.536e-06, + "loss": 0.1591, + "step": 1134 + }, + { + "epoch": 0.02272, + "grad_norm": 3.268756151199341, + "learning_rate": 4.544000000000001e-06, + "loss": 0.235, + "step": 1136 + }, + { + "epoch": 0.02276, + "grad_norm": 2.0333690643310547, + "learning_rate": 4.552000000000001e-06, + "loss": 0.1673, + "step": 1138 + }, + { + "epoch": 0.0228, + "grad_norm": 2.271306276321411, + "learning_rate": 4.56e-06, + "loss": 0.2469, + "step": 1140 + }, + { + "epoch": 0.02284, + "grad_norm": 1.9175033569335938, + "learning_rate": 4.568e-06, + "loss": 0.2425, + "step": 1142 + }, + { + "epoch": 0.02288, + "grad_norm": 2.879429578781128, + "learning_rate": 4.576000000000001e-06, + "loss": 0.2348, + "step": 1144 + }, + { + "epoch": 0.02292, + "grad_norm": 2.0561306476593018, + "learning_rate": 4.5840000000000005e-06, + "loss": 0.2424, + "step": 1146 + }, + { + "epoch": 0.02296, + "grad_norm": 1.9995323419570923, + "learning_rate": 4.592e-06, + "loss": 0.2427, + "step": 1148 + }, + { + "epoch": 0.023, + "grad_norm": 2.166255235671997, + "learning_rate": 4.600000000000001e-06, + "loss": 0.1674, + "step": 1150 + }, + { + "epoch": 0.02304, + "grad_norm": 3.018146514892578, + "learning_rate": 4.608000000000001e-06, + "loss": 0.3005, + "step": 1152 + }, + { + "epoch": 0.02308, + "grad_norm": 2.2217023372650146, + "learning_rate": 4.616e-06, + "loss": 0.2027, + "step": 1154 + }, + { + "epoch": 0.02312, + "grad_norm": 2.2035160064697266, + "learning_rate": 4.624e-06, + "loss": 0.1762, + "step": 1156 + }, + { + "epoch": 0.02316, + "grad_norm": 3.1834182739257812, + "learning_rate": 4.632000000000001e-06, + "loss": 0.2426, + "step": 1158 + }, + { + "epoch": 0.0232, + "grad_norm": 3.072303056716919, + "learning_rate": 4.6400000000000005e-06, + "loss": 0.3263, + "step": 1160 + }, + { + "epoch": 0.02324, + "grad_norm": 2.117755889892578, + "learning_rate": 4.648e-06, + "loss": 0.2507, + "step": 1162 + }, + { + "epoch": 0.02328, + "grad_norm": 2.6530840396881104, + "learning_rate": 4.656000000000001e-06, + "loss": 0.2771, + "step": 1164 + }, + { + "epoch": 0.02332, + "grad_norm": 2.2141239643096924, + "learning_rate": 4.664000000000001e-06, + "loss": 0.2236, + "step": 1166 + }, + { + "epoch": 0.02336, + "grad_norm": 2.3333332538604736, + "learning_rate": 4.672e-06, + "loss": 0.2028, + "step": 1168 + }, + { + "epoch": 0.0234, + "grad_norm": 2.306243896484375, + "learning_rate": 4.680000000000001e-06, + "loss": 0.233, + "step": 1170 + }, + { + "epoch": 0.02344, + "grad_norm": 2.6366615295410156, + "learning_rate": 4.688000000000001e-06, + "loss": 0.2431, + "step": 1172 + }, + { + "epoch": 0.02348, + "grad_norm": 2.4522924423217773, + "learning_rate": 4.6960000000000004e-06, + "loss": 0.2129, + "step": 1174 + }, + { + "epoch": 0.02352, + "grad_norm": 2.310532569885254, + "learning_rate": 4.704e-06, + "loss": 0.2027, + "step": 1176 + }, + { + "epoch": 0.02356, + "grad_norm": 3.137556791305542, + "learning_rate": 4.712000000000001e-06, + "loss": 0.2898, + "step": 1178 + }, + { + "epoch": 0.0236, + "grad_norm": 1.9769203662872314, + "learning_rate": 4.7200000000000005e-06, + "loss": 0.1673, + "step": 1180 + }, + { + "epoch": 0.02364, + "grad_norm": 3.122097969055176, + "learning_rate": 4.728e-06, + "loss": 0.228, + "step": 1182 + }, + { + "epoch": 0.02368, + "grad_norm": 3.455761432647705, + "learning_rate": 4.736000000000001e-06, + "loss": 0.38, + "step": 1184 + }, + { + "epoch": 0.02372, + "grad_norm": 3.0133681297302246, + "learning_rate": 4.744000000000001e-06, + "loss": 0.2299, + "step": 1186 + }, + { + "epoch": 0.02376, + "grad_norm": 2.049027919769287, + "learning_rate": 4.752e-06, + "loss": 0.1755, + "step": 1188 + }, + { + "epoch": 0.0238, + "grad_norm": 2.069486618041992, + "learning_rate": 4.76e-06, + "loss": 0.238, + "step": 1190 + }, + { + "epoch": 0.02384, + "grad_norm": 2.9830267429351807, + "learning_rate": 4.768000000000001e-06, + "loss": 0.3132, + "step": 1192 + }, + { + "epoch": 0.02388, + "grad_norm": 2.986370801925659, + "learning_rate": 4.7760000000000005e-06, + "loss": 0.2468, + "step": 1194 + }, + { + "epoch": 0.02392, + "grad_norm": 2.068582534790039, + "learning_rate": 4.784e-06, + "loss": 0.1932, + "step": 1196 + }, + { + "epoch": 0.02396, + "grad_norm": 2.343977689743042, + "learning_rate": 4.792000000000001e-06, + "loss": 0.2121, + "step": 1198 + }, + { + "epoch": 0.024, + "grad_norm": 2.3326051235198975, + "learning_rate": 4.800000000000001e-06, + "loss": 0.2027, + "step": 1200 + }, + { + "epoch": 0.02404, + "grad_norm": 2.0597736835479736, + "learning_rate": 4.808e-06, + "loss": 0.1755, + "step": 1202 + }, + { + "epoch": 0.02408, + "grad_norm": 1.6392409801483154, + "learning_rate": 4.816e-06, + "loss": 0.1445, + "step": 1204 + }, + { + "epoch": 0.02412, + "grad_norm": 1.4358758926391602, + "learning_rate": 4.824000000000001e-06, + "loss": 0.1176, + "step": 1206 + }, + { + "epoch": 0.02416, + "grad_norm": 4.118227005004883, + "learning_rate": 4.8320000000000005e-06, + "loss": 0.282, + "step": 1208 + }, + { + "epoch": 0.0242, + "grad_norm": 4.289165019989014, + "learning_rate": 4.84e-06, + "loss": 0.3091, + "step": 1210 + }, + { + "epoch": 0.02424, + "grad_norm": 4.359921455383301, + "learning_rate": 4.848000000000001e-06, + "loss": 0.3094, + "step": 1212 + }, + { + "epoch": 0.02428, + "grad_norm": 4.318225383758545, + "learning_rate": 4.856e-06, + "loss": 0.2886, + "step": 1214 + }, + { + "epoch": 0.02432, + "grad_norm": 1.1836819648742676, + "learning_rate": 4.864e-06, + "loss": 0.2931, + "step": 1216 + }, + { + "epoch": 0.02436, + "grad_norm": 1.2964990139007568, + "learning_rate": 4.872000000000001e-06, + "loss": 0.0946, + "step": 1218 + }, + { + "epoch": 0.0244, + "grad_norm": 4.042072296142578, + "learning_rate": 4.880000000000001e-06, + "loss": 0.282, + "step": 1220 + }, + { + "epoch": 0.02444, + "grad_norm": 3.5214200019836426, + "learning_rate": 4.8880000000000005e-06, + "loss": 0.2571, + "step": 1222 + }, + { + "epoch": 0.02448, + "grad_norm": 3.248486280441284, + "learning_rate": 4.896e-06, + "loss": 0.4094, + "step": 1224 + }, + { + "epoch": 0.02452, + "grad_norm": 1.7447532415390015, + "learning_rate": 4.904000000000001e-06, + "loss": 0.2426, + "step": 1226 + }, + { + "epoch": 0.02456, + "grad_norm": 2.712894916534424, + "learning_rate": 4.9120000000000006e-06, + "loss": 0.3006, + "step": 1228 + }, + { + "epoch": 0.0246, + "grad_norm": 2.1017839908599854, + "learning_rate": 4.92e-06, + "loss": 0.1939, + "step": 1230 + }, + { + "epoch": 0.02464, + "grad_norm": 1.9791302680969238, + "learning_rate": 4.928000000000001e-06, + "loss": 0.2029, + "step": 1232 + }, + { + "epoch": 0.02468, + "grad_norm": 2.469069242477417, + "learning_rate": 4.936e-06, + "loss": 0.2542, + "step": 1234 + }, + { + "epoch": 0.02472, + "grad_norm": 2.388827085494995, + "learning_rate": 4.9440000000000004e-06, + "loss": 0.2332, + "step": 1236 + }, + { + "epoch": 0.02476, + "grad_norm": 2.1125805377960205, + "learning_rate": 4.952e-06, + "loss": 0.2332, + "step": 1238 + }, + { + "epoch": 0.0248, + "grad_norm": 2.5379397869110107, + "learning_rate": 4.960000000000001e-06, + "loss": 0.2446, + "step": 1240 + }, + { + "epoch": 0.02484, + "grad_norm": 2.116943359375, + "learning_rate": 4.9680000000000005e-06, + "loss": 0.2332, + "step": 1242 + }, + { + "epoch": 0.02488, + "grad_norm": 2.1233115196228027, + "learning_rate": 4.976e-06, + "loss": 0.2546, + "step": 1244 + }, + { + "epoch": 0.02492, + "grad_norm": 2.3569040298461914, + "learning_rate": 4.984000000000001e-06, + "loss": 0.2432, + "step": 1246 + }, + { + "epoch": 0.02496, + "grad_norm": 2.033684730529785, + "learning_rate": 4.992e-06, + "loss": 0.2124, + "step": 1248 + }, + { + "epoch": 0.025, + "grad_norm": 2.37221360206604, + "learning_rate": 5e-06, + "loss": 0.2432, + "step": 1250 + }, + { + "epoch": 0.02504, + "grad_norm": 2.071502685546875, + "learning_rate": 5.008000000000001e-06, + "loss": 0.2326, + "step": 1252 + }, + { + "epoch": 0.02508, + "grad_norm": 2.084819793701172, + "learning_rate": 5.016000000000001e-06, + "loss": 0.2123, + "step": 1254 + }, + { + "epoch": 0.02512, + "grad_norm": 2.290879011154175, + "learning_rate": 5.024e-06, + "loss": 0.2226, + "step": 1256 + }, + { + "epoch": 0.02516, + "grad_norm": 1.9211567640304565, + "learning_rate": 5.032e-06, + "loss": 0.1937, + "step": 1258 + }, + { + "epoch": 0.0252, + "grad_norm": 2.02632999420166, + "learning_rate": 5.04e-06, + "loss": 0.2333, + "step": 1260 + }, + { + "epoch": 0.02524, + "grad_norm": 1.7608680725097656, + "learning_rate": 5.048000000000001e-06, + "loss": 0.2263, + "step": 1262 + }, + { + "epoch": 0.02528, + "grad_norm": 2.466703414916992, + "learning_rate": 5.056000000000001e-06, + "loss": 0.277, + "step": 1264 + }, + { + "epoch": 0.02532, + "grad_norm": 2.352623224258423, + "learning_rate": 5.064e-06, + "loss": 0.2237, + "step": 1266 + }, + { + "epoch": 0.02536, + "grad_norm": 2.305940628051758, + "learning_rate": 5.072e-06, + "loss": 0.2237, + "step": 1268 + }, + { + "epoch": 0.0254, + "grad_norm": 2.4855425357818604, + "learning_rate": 5.0800000000000005e-06, + "loss": 0.2351, + "step": 1270 + }, + { + "epoch": 0.02544, + "grad_norm": 2.354382038116455, + "learning_rate": 5.088000000000001e-06, + "loss": 0.2332, + "step": 1272 + }, + { + "epoch": 0.02548, + "grad_norm": 2.286245822906494, + "learning_rate": 5.096000000000001e-06, + "loss": 0.2331, + "step": 1274 + }, + { + "epoch": 0.02552, + "grad_norm": 2.210615873336792, + "learning_rate": 5.104e-06, + "loss": 0.2332, + "step": 1276 + }, + { + "epoch": 0.02556, + "grad_norm": 1.9006116390228271, + "learning_rate": 5.112e-06, + "loss": 0.2351, + "step": 1278 + }, + { + "epoch": 0.0256, + "grad_norm": 2.173060178756714, + "learning_rate": 5.12e-06, + "loss": 0.2331, + "step": 1280 + }, + { + "epoch": 0.02564, + "grad_norm": 2.177828788757324, + "learning_rate": 5.128000000000001e-06, + "loss": 0.2432, + "step": 1282 + }, + { + "epoch": 0.02568, + "grad_norm": 2.343050003051758, + "learning_rate": 5.136e-06, + "loss": 0.2331, + "step": 1284 + }, + { + "epoch": 0.02572, + "grad_norm": 2.30255126953125, + "learning_rate": 5.144e-06, + "loss": 0.2332, + "step": 1286 + }, + { + "epoch": 0.02576, + "grad_norm": 2.0500354766845703, + "learning_rate": 5.152e-06, + "loss": 0.2324, + "step": 1288 + }, + { + "epoch": 0.0258, + "grad_norm": 2.271466016769409, + "learning_rate": 5.1600000000000006e-06, + "loss": 0.2324, + "step": 1290 + }, + { + "epoch": 0.02584, + "grad_norm": 2.1246254444122314, + "learning_rate": 5.168000000000001e-06, + "loss": 0.2122, + "step": 1292 + }, + { + "epoch": 0.02588, + "grad_norm": 2.2985775470733643, + "learning_rate": 5.176e-06, + "loss": 0.2431, + "step": 1294 + }, + { + "epoch": 0.02592, + "grad_norm": 2.5354206562042236, + "learning_rate": 5.184e-06, + "loss": 0.2768, + "step": 1296 + }, + { + "epoch": 0.02596, + "grad_norm": 2.0694994926452637, + "learning_rate": 5.1920000000000004e-06, + "loss": 0.233, + "step": 1298 + }, + { + "epoch": 0.026, + "grad_norm": 2.251537799835205, + "learning_rate": 5.2e-06, + "loss": 0.2223, + "step": 1300 + }, + { + "epoch": 0.02604, + "grad_norm": 2.3854007720947266, + "learning_rate": 5.208000000000001e-06, + "loss": 0.2433, + "step": 1302 + }, + { + "epoch": 0.02608, + "grad_norm": 1.7960898876190186, + "learning_rate": 5.216e-06, + "loss": 0.204, + "step": 1304 + }, + { + "epoch": 0.02612, + "grad_norm": 2.4092929363250732, + "learning_rate": 5.224e-06, + "loss": 0.2432, + "step": 1306 + }, + { + "epoch": 0.02616, + "grad_norm": 2.61735463142395, + "learning_rate": 5.232e-06, + "loss": 0.2887, + "step": 1308 + }, + { + "epoch": 0.0262, + "grad_norm": 1.9367785453796387, + "learning_rate": 5.240000000000001e-06, + "loss": 0.2235, + "step": 1310 + }, + { + "epoch": 0.02624, + "grad_norm": 2.233313798904419, + "learning_rate": 5.248000000000001e-06, + "loss": 0.233, + "step": 1312 + }, + { + "epoch": 0.02628, + "grad_norm": 2.5891780853271484, + "learning_rate": 5.256e-06, + "loss": 0.2544, + "step": 1314 + }, + { + "epoch": 0.02632, + "grad_norm": 2.2253856658935547, + "learning_rate": 5.264e-06, + "loss": 0.2027, + "step": 1316 + }, + { + "epoch": 0.02636, + "grad_norm": 1.803309440612793, + "learning_rate": 5.2720000000000005e-06, + "loss": 0.1674, + "step": 1318 + }, + { + "epoch": 0.0264, + "grad_norm": 2.852635383605957, + "learning_rate": 5.28e-06, + "loss": 0.3264, + "step": 1320 + }, + { + "epoch": 0.02644, + "grad_norm": 1.694731593132019, + "learning_rate": 5.288000000000001e-06, + "loss": 0.2103, + "step": 1322 + }, + { + "epoch": 0.02648, + "grad_norm": 3.439143419265747, + "learning_rate": 5.296e-06, + "loss": 0.3662, + "step": 1324 + }, + { + "epoch": 0.02652, + "grad_norm": 3.121201515197754, + "learning_rate": 5.304e-06, + "loss": 0.2483, + "step": 1326 + }, + { + "epoch": 0.02656, + "grad_norm": 2.0702555179595947, + "learning_rate": 5.312e-06, + "loss": 0.2127, + "step": 1328 + }, + { + "epoch": 0.0266, + "grad_norm": 1.7789878845214844, + "learning_rate": 5.320000000000001e-06, + "loss": 0.2299, + "step": 1330 + }, + { + "epoch": 0.02664, + "grad_norm": 2.3651604652404785, + "learning_rate": 5.328000000000001e-06, + "loss": 0.2652, + "step": 1332 + }, + { + "epoch": 0.02668, + "grad_norm": 2.5118119716644287, + "learning_rate": 5.336e-06, + "loss": 0.2226, + "step": 1334 + }, + { + "epoch": 0.02672, + "grad_norm": 2.0088393688201904, + "learning_rate": 5.344e-06, + "loss": 0.2235, + "step": 1336 + }, + { + "epoch": 0.02676, + "grad_norm": 2.5106072425842285, + "learning_rate": 5.352000000000001e-06, + "loss": 0.2431, + "step": 1338 + }, + { + "epoch": 0.0268, + "grad_norm": 2.441781997680664, + "learning_rate": 5.36e-06, + "loss": 0.2432, + "step": 1340 + }, + { + "epoch": 0.02684, + "grad_norm": 2.0769290924072266, + "learning_rate": 5.368000000000001e-06, + "loss": 0.2223, + "step": 1342 + }, + { + "epoch": 0.02688, + "grad_norm": 2.185964584350586, + "learning_rate": 5.376e-06, + "loss": 0.2431, + "step": 1344 + }, + { + "epoch": 0.02692, + "grad_norm": 1.77271568775177, + "learning_rate": 5.3840000000000005e-06, + "loss": 0.1939, + "step": 1346 + }, + { + "epoch": 0.02696, + "grad_norm": 2.6295835971832275, + "learning_rate": 5.392e-06, + "loss": 0.2509, + "step": 1348 + }, + { + "epoch": 0.027, + "grad_norm": 1.599590539932251, + "learning_rate": 5.400000000000001e-06, + "loss": 0.1519, + "step": 1350 + }, + { + "epoch": 0.02704, + "grad_norm": 1.3958011865615845, + "learning_rate": 5.408e-06, + "loss": 0.2415, + "step": 1352 + }, + { + "epoch": 0.02708, + "grad_norm": 1.259765386581421, + "learning_rate": 5.416e-06, + "loss": 0.2634, + "step": 1354 + }, + { + "epoch": 0.02712, + "grad_norm": 1.2369112968444824, + "learning_rate": 5.424e-06, + "loss": 0.1116, + "step": 1356 + }, + { + "epoch": 0.02716, + "grad_norm": 3.1089272499084473, + "learning_rate": 5.432000000000001e-06, + "loss": 0.4391, + "step": 1358 + }, + { + "epoch": 0.0272, + "grad_norm": 3.141720771789551, + "learning_rate": 5.4400000000000004e-06, + "loss": 0.4701, + "step": 1360 + }, + { + "epoch": 0.02724, + "grad_norm": 2.6346042156219482, + "learning_rate": 5.448e-06, + "loss": 0.3667, + "step": 1362 + }, + { + "epoch": 0.02728, + "grad_norm": 1.4199328422546387, + "learning_rate": 5.456e-06, + "loss": 0.1443, + "step": 1364 + }, + { + "epoch": 0.02732, + "grad_norm": 1.493067979812622, + "learning_rate": 5.4640000000000005e-06, + "loss": 0.1521, + "step": 1366 + }, + { + "epoch": 0.02736, + "grad_norm": 2.5915074348449707, + "learning_rate": 5.472e-06, + "loss": 0.2226, + "step": 1368 + }, + { + "epoch": 0.0274, + "grad_norm": 1.3802353143692017, + "learning_rate": 5.480000000000001e-06, + "loss": 0.2227, + "step": 1370 + }, + { + "epoch": 0.02744, + "grad_norm": 2.480912446975708, + "learning_rate": 5.488e-06, + "loss": 0.3262, + "step": 1372 + }, + { + "epoch": 0.02748, + "grad_norm": 2.250032901763916, + "learning_rate": 5.496e-06, + "loss": 0.2265, + "step": 1374 + }, + { + "epoch": 0.02752, + "grad_norm": 2.3414700031280518, + "learning_rate": 5.504e-06, + "loss": 0.2654, + "step": 1376 + }, + { + "epoch": 0.02756, + "grad_norm": 1.9343373775482178, + "learning_rate": 5.512000000000001e-06, + "loss": 0.2546, + "step": 1378 + }, + { + "epoch": 0.0276, + "grad_norm": 1.8474409580230713, + "learning_rate": 5.5200000000000005e-06, + "loss": 0.2029, + "step": 1380 + }, + { + "epoch": 0.02764, + "grad_norm": 2.3561863899230957, + "learning_rate": 5.528e-06, + "loss": 0.3013, + "step": 1382 + }, + { + "epoch": 0.02768, + "grad_norm": 1.7517907619476318, + "learning_rate": 5.536e-06, + "loss": 0.2508, + "step": 1384 + }, + { + "epoch": 0.02772, + "grad_norm": 1.5007891654968262, + "learning_rate": 5.544000000000001e-06, + "loss": 0.1444, + "step": 1386 + }, + { + "epoch": 0.02776, + "grad_norm": 1.4333440065383911, + "learning_rate": 5.552e-06, + "loss": 0.2413, + "step": 1388 + }, + { + "epoch": 0.0278, + "grad_norm": 1.4223644733428955, + "learning_rate": 5.560000000000001e-06, + "loss": 0.2695, + "step": 1390 + }, + { + "epoch": 0.02784, + "grad_norm": 1.5125880241394043, + "learning_rate": 5.568e-06, + "loss": 0.2767, + "step": 1392 + }, + { + "epoch": 0.02788, + "grad_norm": 2.7062652111053467, + "learning_rate": 5.5760000000000005e-06, + "loss": 0.2552, + "step": 1394 + }, + { + "epoch": 0.02792, + "grad_norm": 1.3818801641464233, + "learning_rate": 5.584e-06, + "loss": 0.1518, + "step": 1396 + }, + { + "epoch": 0.02796, + "grad_norm": 2.83514404296875, + "learning_rate": 5.592000000000001e-06, + "loss": 0.3808, + "step": 1398 + }, + { + "epoch": 0.028, + "grad_norm": 1.3730412721633911, + "learning_rate": 5.600000000000001e-06, + "loss": 0.1371, + "step": 1400 + }, + { + "epoch": 0.02804, + "grad_norm": 2.6313624382019043, + "learning_rate": 5.608e-06, + "loss": 0.2483, + "step": 1402 + }, + { + "epoch": 0.02808, + "grad_norm": 1.3478806018829346, + "learning_rate": 5.616e-06, + "loss": 0.1372, + "step": 1404 + }, + { + "epoch": 0.02812, + "grad_norm": 1.6451300382614136, + "learning_rate": 5.624000000000001e-06, + "loss": 0.256, + "step": 1406 + }, + { + "epoch": 0.02816, + "grad_norm": 1.4689295291900635, + "learning_rate": 5.6320000000000005e-06, + "loss": 0.2352, + "step": 1408 + }, + { + "epoch": 0.0282, + "grad_norm": 2.5533978939056396, + "learning_rate": 5.64e-06, + "loss": 0.326, + "step": 1410 + }, + { + "epoch": 0.02824, + "grad_norm": 2.327011823654175, + "learning_rate": 5.648e-06, + "loss": 0.3135, + "step": 1412 + }, + { + "epoch": 0.02828, + "grad_norm": 1.67295241355896, + "learning_rate": 5.6560000000000006e-06, + "loss": 0.2263, + "step": 1414 + }, + { + "epoch": 0.02832, + "grad_norm": 2.267033100128174, + "learning_rate": 5.664e-06, + "loss": 0.2447, + "step": 1416 + }, + { + "epoch": 0.02836, + "grad_norm": 1.9324179887771606, + "learning_rate": 5.672000000000001e-06, + "loss": 0.2225, + "step": 1418 + }, + { + "epoch": 0.0284, + "grad_norm": 1.907268762588501, + "learning_rate": 5.68e-06, + "loss": 0.2226, + "step": 1420 + }, + { + "epoch": 0.02844, + "grad_norm": 2.1152396202087402, + "learning_rate": 5.6880000000000004e-06, + "loss": 0.2447, + "step": 1422 + }, + { + "epoch": 0.02848, + "grad_norm": 2.0529420375823975, + "learning_rate": 5.696e-06, + "loss": 0.2543, + "step": 1424 + }, + { + "epoch": 0.02852, + "grad_norm": 1.60316002368927, + "learning_rate": 5.704000000000001e-06, + "loss": 0.2031, + "step": 1426 + }, + { + "epoch": 0.02856, + "grad_norm": 1.799319863319397, + "learning_rate": 5.7120000000000005e-06, + "loss": 0.2031, + "step": 1428 + }, + { + "epoch": 0.0286, + "grad_norm": 1.5264551639556885, + "learning_rate": 5.72e-06, + "loss": 0.185, + "step": 1430 + }, + { + "epoch": 0.02864, + "grad_norm": 1.4911797046661377, + "learning_rate": 5.728e-06, + "loss": 0.2385, + "step": 1432 + }, + { + "epoch": 0.02868, + "grad_norm": 1.542236566543579, + "learning_rate": 5.736000000000001e-06, + "loss": 0.1678, + "step": 1434 + }, + { + "epoch": 0.02872, + "grad_norm": 1.194549560546875, + "learning_rate": 5.744e-06, + "loss": 0.1533, + "step": 1436 + }, + { + "epoch": 0.02876, + "grad_norm": 1.1630818843841553, + "learning_rate": 5.752000000000001e-06, + "loss": 0.1306, + "step": 1438 + }, + { + "epoch": 0.0288, + "grad_norm": 2.669187545776367, + "learning_rate": 5.76e-06, + "loss": 0.2556, + "step": 1440 + }, + { + "epoch": 0.02884, + "grad_norm": 1.0479811429977417, + "learning_rate": 5.7680000000000005e-06, + "loss": 0.2727, + "step": 1442 + }, + { + "epoch": 0.02888, + "grad_norm": 2.858191728591919, + "learning_rate": 5.776e-06, + "loss": 0.4547, + "step": 1444 + }, + { + "epoch": 0.02892, + "grad_norm": 1.0926706790924072, + "learning_rate": 5.784000000000001e-06, + "loss": 0.2729, + "step": 1446 + }, + { + "epoch": 0.02896, + "grad_norm": 2.5453453063964844, + "learning_rate": 5.792000000000001e-06, + "loss": 0.2557, + "step": 1448 + }, + { + "epoch": 0.029, + "grad_norm": 1.177451252937317, + "learning_rate": 5.8e-06, + "loss": 0.1183, + "step": 1450 + }, + { + "epoch": 0.02904, + "grad_norm": 2.453476667404175, + "learning_rate": 5.808e-06, + "loss": 0.3668, + "step": 1452 + }, + { + "epoch": 0.02908, + "grad_norm": 2.4252848625183105, + "learning_rate": 5.816000000000001e-06, + "loss": 0.3667, + "step": 1454 + }, + { + "epoch": 0.02912, + "grad_norm": 2.2771778106689453, + "learning_rate": 5.8240000000000005e-06, + "loss": 0.3138, + "step": 1456 + }, + { + "epoch": 0.02916, + "grad_norm": 1.9034343957901, + "learning_rate": 5.832000000000001e-06, + "loss": 0.2151, + "step": 1458 + }, + { + "epoch": 0.0292, + "grad_norm": 1.6516907215118408, + "learning_rate": 5.84e-06, + "loss": 0.1936, + "step": 1460 + }, + { + "epoch": 0.02924, + "grad_norm": 1.8281440734863281, + "learning_rate": 5.848000000000001e-06, + "loss": 0.2434, + "step": 1462 + }, + { + "epoch": 0.02928, + "grad_norm": 1.915373682975769, + "learning_rate": 5.856e-06, + "loss": 0.2433, + "step": 1464 + }, + { + "epoch": 0.02932, + "grad_norm": 2.065528154373169, + "learning_rate": 5.864000000000001e-06, + "loss": 0.2434, + "step": 1466 + }, + { + "epoch": 0.02936, + "grad_norm": 1.7486506700515747, + "learning_rate": 5.872000000000001e-06, + "loss": 0.2123, + "step": 1468 + }, + { + "epoch": 0.0294, + "grad_norm": 1.9699206352233887, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.2332, + "step": 1470 + }, + { + "epoch": 0.02944, + "grad_norm": 1.71208655834198, + "learning_rate": 5.888e-06, + "loss": 0.2352, + "step": 1472 + }, + { + "epoch": 0.02948, + "grad_norm": 1.6226720809936523, + "learning_rate": 5.896000000000001e-06, + "loss": 0.2029, + "step": 1474 + }, + { + "epoch": 0.02952, + "grad_norm": 2.191283941268921, + "learning_rate": 5.9040000000000006e-06, + "loss": 0.2472, + "step": 1476 + }, + { + "epoch": 0.02956, + "grad_norm": 1.701375961303711, + "learning_rate": 5.912e-06, + "loss": 0.2351, + "step": 1478 + }, + { + "epoch": 0.0296, + "grad_norm": 1.6387187242507935, + "learning_rate": 5.92e-06, + "loss": 0.1935, + "step": 1480 + }, + { + "epoch": 0.02964, + "grad_norm": 1.3944429159164429, + "learning_rate": 5.928000000000001e-06, + "loss": 0.1687, + "step": 1482 + }, + { + "epoch": 0.02968, + "grad_norm": 2.2740044593811035, + "learning_rate": 5.9360000000000004e-06, + "loss": 0.23, + "step": 1484 + }, + { + "epoch": 0.02972, + "grad_norm": 2.7114908695220947, + "learning_rate": 5.944000000000001e-06, + "loss": 0.3269, + "step": 1486 + }, + { + "epoch": 0.02976, + "grad_norm": 1.3932867050170898, + "learning_rate": 5.952e-06, + "loss": 0.2484, + "step": 1488 + }, + { + "epoch": 0.0298, + "grad_norm": 2.5025882720947266, + "learning_rate": 5.9600000000000005e-06, + "loss": 0.2559, + "step": 1490 + }, + { + "epoch": 0.02984, + "grad_norm": 2.273608684539795, + "learning_rate": 5.968e-06, + "loss": 0.23, + "step": 1492 + }, + { + "epoch": 0.02988, + "grad_norm": 2.4500882625579834, + "learning_rate": 5.976000000000001e-06, + "loss": 0.2508, + "step": 1494 + }, + { + "epoch": 0.02992, + "grad_norm": 2.2412219047546387, + "learning_rate": 5.984000000000001e-06, + "loss": 0.2301, + "step": 1496 + }, + { + "epoch": 0.02996, + "grad_norm": 1.5477091073989868, + "learning_rate": 5.992e-06, + "loss": 0.1757, + "step": 1498 + }, + { + "epoch": 0.03, + "grad_norm": 2.189713716506958, + "learning_rate": 6e-06, + "loss": 0.3134, + "step": 1500 + }, + { + "epoch": 0.03004, + "grad_norm": 2.201585054397583, + "learning_rate": 6.008000000000001e-06, + "loss": 0.2566, + "step": 1502 + }, + { + "epoch": 0.03008, + "grad_norm": 1.651253342628479, + "learning_rate": 6.0160000000000005e-06, + "loss": 0.1936, + "step": 1504 + }, + { + "epoch": 0.03012, + "grad_norm": 2.1957879066467285, + "learning_rate": 6.024000000000001e-06, + "loss": 0.2653, + "step": 1506 + }, + { + "epoch": 0.03016, + "grad_norm": 1.7964544296264648, + "learning_rate": 6.032e-06, + "loss": 0.2224, + "step": 1508 + }, + { + "epoch": 0.0302, + "grad_norm": 1.7743284702301025, + "learning_rate": 6.040000000000001e-06, + "loss": 0.2324, + "step": 1510 + }, + { + "epoch": 0.03024, + "grad_norm": 1.6576603651046753, + "learning_rate": 6.048e-06, + "loss": 0.2124, + "step": 1512 + }, + { + "epoch": 0.03028, + "grad_norm": 1.5011354684829712, + "learning_rate": 6.056000000000001e-06, + "loss": 0.2225, + "step": 1514 + }, + { + "epoch": 0.03032, + "grad_norm": 1.5488771200180054, + "learning_rate": 6.064000000000001e-06, + "loss": 0.2237, + "step": 1516 + }, + { + "epoch": 0.03036, + "grad_norm": 1.6479636430740356, + "learning_rate": 6.0720000000000005e-06, + "loss": 0.2124, + "step": 1518 + }, + { + "epoch": 0.0304, + "grad_norm": 1.430160641670227, + "learning_rate": 6.08e-06, + "loss": 0.2264, + "step": 1520 + }, + { + "epoch": 0.03044, + "grad_norm": 1.345729947090149, + "learning_rate": 6.088000000000001e-06, + "loss": 0.1678, + "step": 1522 + }, + { + "epoch": 0.03048, + "grad_norm": 2.373702049255371, + "learning_rate": 6.096000000000001e-06, + "loss": 0.3395, + "step": 1524 + }, + { + "epoch": 0.03052, + "grad_norm": 2.375185012817383, + "learning_rate": 6.104000000000001e-06, + "loss": 0.3395, + "step": 1526 + }, + { + "epoch": 0.03056, + "grad_norm": 1.590004324913025, + "learning_rate": 6.112e-06, + "loss": 0.1766, + "step": 1528 + }, + { + "epoch": 0.0306, + "grad_norm": 1.4237390756607056, + "learning_rate": 6.120000000000001e-06, + "loss": 0.2644, + "step": 1530 + }, + { + "epoch": 0.03064, + "grad_norm": 2.083411693572998, + "learning_rate": 6.1280000000000005e-06, + "loss": 0.3009, + "step": 1532 + }, + { + "epoch": 0.03068, + "grad_norm": 2.0237820148468018, + "learning_rate": 6.136000000000001e-06, + "loss": 0.2265, + "step": 1534 + }, + { + "epoch": 0.03072, + "grad_norm": 1.416016936302185, + "learning_rate": 6.144e-06, + "loss": 0.2472, + "step": 1536 + }, + { + "epoch": 0.03076, + "grad_norm": 1.5713372230529785, + "learning_rate": 6.1520000000000006e-06, + "loss": 0.2239, + "step": 1538 + }, + { + "epoch": 0.0308, + "grad_norm": 1.835089087486267, + "learning_rate": 6.16e-06, + "loss": 0.2546, + "step": 1540 + }, + { + "epoch": 0.03084, + "grad_norm": 1.83064866065979, + "learning_rate": 6.168000000000001e-06, + "loss": 0.2433, + "step": 1542 + }, + { + "epoch": 0.03088, + "grad_norm": 1.679935336112976, + "learning_rate": 6.176000000000001e-06, + "loss": 0.2122, + "step": 1544 + }, + { + "epoch": 0.03092, + "grad_norm": 2.0935583114624023, + "learning_rate": 6.184e-06, + "loss": 0.2445, + "step": 1546 + }, + { + "epoch": 0.03096, + "grad_norm": 1.5415987968444824, + "learning_rate": 6.192e-06, + "loss": 0.2147, + "step": 1548 + }, + { + "epoch": 0.031, + "grad_norm": 1.516928791999817, + "learning_rate": 6.200000000000001e-06, + "loss": 0.1756, + "step": 1550 + }, + { + "epoch": 0.03104, + "grad_norm": 1.486098289489746, + "learning_rate": 6.2080000000000005e-06, + "loss": 0.1843, + "step": 1552 + }, + { + "epoch": 0.03108, + "grad_norm": 2.1109912395477295, + "learning_rate": 6.216000000000001e-06, + "loss": 0.3012, + "step": 1554 + }, + { + "epoch": 0.03112, + "grad_norm": 1.9995399713516235, + "learning_rate": 6.224e-06, + "loss": 0.3006, + "step": 1556 + }, + { + "epoch": 0.03116, + "grad_norm": 1.3842883110046387, + "learning_rate": 6.232000000000001e-06, + "loss": 0.1756, + "step": 1558 + }, + { + "epoch": 0.0312, + "grad_norm": 2.222899913787842, + "learning_rate": 6.24e-06, + "loss": 0.23, + "step": 1560 + }, + { + "epoch": 0.03124, + "grad_norm": 1.4094524383544922, + "learning_rate": 6.248000000000001e-06, + "loss": 0.1517, + "step": 1562 + }, + { + "epoch": 0.03128, + "grad_norm": 2.341855049133301, + "learning_rate": 6.256000000000001e-06, + "loss": 0.2299, + "step": 1564 + }, + { + "epoch": 0.03132, + "grad_norm": 2.345210313796997, + "learning_rate": 6.264e-06, + "loss": 0.3391, + "step": 1566 + }, + { + "epoch": 0.03136, + "grad_norm": 2.3657100200653076, + "learning_rate": 6.272e-06, + "loss": 0.2425, + "step": 1568 + }, + { + "epoch": 0.0314, + "grad_norm": 1.8607157468795776, + "learning_rate": 6.280000000000001e-06, + "loss": 0.2773, + "step": 1570 + }, + { + "epoch": 0.03144, + "grad_norm": 1.6812493801116943, + "learning_rate": 6.288000000000001e-06, + "loss": 0.2027, + "step": 1572 + }, + { + "epoch": 0.03148, + "grad_norm": 1.6295716762542725, + "learning_rate": 6.296000000000001e-06, + "loss": 0.2331, + "step": 1574 + }, + { + "epoch": 0.03152, + "grad_norm": 1.8722368478775024, + "learning_rate": 6.304e-06, + "loss": 0.2432, + "step": 1576 + }, + { + "epoch": 0.03156, + "grad_norm": 1.6310819387435913, + "learning_rate": 6.312000000000001e-06, + "loss": 0.2225, + "step": 1578 + }, + { + "epoch": 0.0316, + "grad_norm": 1.6207711696624756, + "learning_rate": 6.3200000000000005e-06, + "loss": 0.2223, + "step": 1580 + }, + { + "epoch": 0.03164, + "grad_norm": 1.529112458229065, + "learning_rate": 6.328000000000001e-06, + "loss": 0.2237, + "step": 1582 + }, + { + "epoch": 0.03168, + "grad_norm": 1.3831989765167236, + "learning_rate": 6.336000000000001e-06, + "loss": 0.1846, + "step": 1584 + }, + { + "epoch": 0.03172, + "grad_norm": 2.2694520950317383, + "learning_rate": 6.344e-06, + "loss": 0.3015, + "step": 1586 + }, + { + "epoch": 0.03176, + "grad_norm": 1.2747360467910767, + "learning_rate": 6.352e-06, + "loss": 0.2182, + "step": 1588 + }, + { + "epoch": 0.0318, + "grad_norm": 1.453324794769287, + "learning_rate": 6.360000000000001e-06, + "loss": 0.1757, + "step": 1590 + }, + { + "epoch": 0.03184, + "grad_norm": 1.2199538946151733, + "learning_rate": 6.368000000000001e-06, + "loss": 0.1688, + "step": 1592 + }, + { + "epoch": 0.03188, + "grad_norm": 1.090314269065857, + "learning_rate": 6.376e-06, + "loss": 0.1375, + "step": 1594 + }, + { + "epoch": 0.03192, + "grad_norm": 2.367471694946289, + "learning_rate": 6.384e-06, + "loss": 0.3394, + "step": 1596 + }, + { + "epoch": 0.03196, + "grad_norm": 2.5629160404205322, + "learning_rate": 6.392000000000001e-06, + "loss": 0.3666, + "step": 1598 + }, + { + "epoch": 0.032, + "grad_norm": 1.0980838537216187, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.2415, + "step": 1600 + }, + { + "epoch": 0.03204, + "grad_norm": 2.097179889678955, + "learning_rate": 6.408000000000001e-06, + "loss": 0.2226, + "step": 1602 + }, + { + "epoch": 0.03208, + "grad_norm": 2.1254780292510986, + "learning_rate": 6.416e-06, + "loss": 0.2226, + "step": 1604 + }, + { + "epoch": 0.03212, + "grad_norm": 1.1858876943588257, + "learning_rate": 6.424e-06, + "loss": 0.1519, + "step": 1606 + }, + { + "epoch": 0.03216, + "grad_norm": 2.1451706886291504, + "learning_rate": 6.432e-06, + "loss": 0.2301, + "step": 1608 + }, + { + "epoch": 0.0322, + "grad_norm": 1.2590230703353882, + "learning_rate": 6.440000000000001e-06, + "loss": 0.2428, + "step": 1610 + }, + { + "epoch": 0.03224, + "grad_norm": 1.345707654953003, + "learning_rate": 6.448000000000001e-06, + "loss": 0.23, + "step": 1612 + }, + { + "epoch": 0.03228, + "grad_norm": 1.6696155071258545, + "learning_rate": 6.456e-06, + "loss": 0.2029, + "step": 1614 + }, + { + "epoch": 0.03232, + "grad_norm": 1.8876585960388184, + "learning_rate": 6.464e-06, + "loss": 0.2546, + "step": 1616 + }, + { + "epoch": 0.03236, + "grad_norm": 1.5279914140701294, + "learning_rate": 6.472000000000001e-06, + "loss": 0.1759, + "step": 1618 + }, + { + "epoch": 0.0324, + "grad_norm": 2.051577091217041, + "learning_rate": 6.480000000000001e-06, + "loss": 0.243, + "step": 1620 + }, + { + "epoch": 0.03244, + "grad_norm": 2.1705398559570312, + "learning_rate": 6.488000000000001e-06, + "loss": 0.2262, + "step": 1622 + }, + { + "epoch": 0.03248, + "grad_norm": 2.1061043739318848, + "learning_rate": 6.496e-06, + "loss": 0.218, + "step": 1624 + }, + { + "epoch": 0.03252, + "grad_norm": 2.164263963699341, + "learning_rate": 6.504e-06, + "loss": 0.2263, + "step": 1626 + }, + { + "epoch": 0.03256, + "grad_norm": 1.6603810787200928, + "learning_rate": 6.5120000000000005e-06, + "loss": 0.2565, + "step": 1628 + }, + { + "epoch": 0.0326, + "grad_norm": 1.6696163415908813, + "learning_rate": 6.520000000000001e-06, + "loss": 0.2027, + "step": 1630 + }, + { + "epoch": 0.03264, + "grad_norm": 1.6065855026245117, + "learning_rate": 6.528000000000001e-06, + "loss": 0.2237, + "step": 1632 + }, + { + "epoch": 0.03268, + "grad_norm": 1.796004056930542, + "learning_rate": 6.536e-06, + "loss": 0.1932, + "step": 1634 + }, + { + "epoch": 0.03272, + "grad_norm": 1.8346418142318726, + "learning_rate": 6.544e-06, + "loss": 0.2565, + "step": 1636 + }, + { + "epoch": 0.03276, + "grad_norm": 1.7441225051879883, + "learning_rate": 6.552000000000001e-06, + "loss": 0.2468, + "step": 1638 + }, + { + "epoch": 0.0328, + "grad_norm": 2.137575626373291, + "learning_rate": 6.560000000000001e-06, + "loss": 0.2106, + "step": 1640 + }, + { + "epoch": 0.03284, + "grad_norm": 2.1841201782226562, + "learning_rate": 6.568000000000001e-06, + "loss": 0.2886, + "step": 1642 + }, + { + "epoch": 0.03288, + "grad_norm": 2.284668445587158, + "learning_rate": 6.576e-06, + "loss": 0.2382, + "step": 1644 + }, + { + "epoch": 0.03292, + "grad_norm": 2.099884510040283, + "learning_rate": 6.584e-06, + "loss": 0.2235, + "step": 1646 + }, + { + "epoch": 0.03296, + "grad_norm": 1.6142663955688477, + "learning_rate": 6.592000000000001e-06, + "loss": 0.2348, + "step": 1648 + }, + { + "epoch": 0.033, + "grad_norm": 2.109743356704712, + "learning_rate": 6.600000000000001e-06, + "loss": 0.2537, + "step": 1650 + }, + { + "epoch": 0.03304, + "grad_norm": 1.7951887845993042, + "learning_rate": 6.608000000000001e-06, + "loss": 0.2234, + "step": 1652 + }, + { + "epoch": 0.03308, + "grad_norm": 2.363117218017578, + "learning_rate": 6.616e-06, + "loss": 0.2259, + "step": 1654 + }, + { + "epoch": 0.03312, + "grad_norm": 1.7191818952560425, + "learning_rate": 6.6240000000000004e-06, + "loss": 0.1591, + "step": 1656 + }, + { + "epoch": 0.03316, + "grad_norm": 1.5282825231552124, + "learning_rate": 6.632000000000001e-06, + "loss": 0.2347, + "step": 1658 + }, + { + "epoch": 0.0332, + "grad_norm": 3.433577060699463, + "learning_rate": 6.640000000000001e-06, + "loss": 0.2629, + "step": 1660 + }, + { + "epoch": 0.03324, + "grad_norm": 3.4274215698242188, + "learning_rate": 6.648e-06, + "loss": 0.4236, + "step": 1662 + }, + { + "epoch": 0.03328, + "grad_norm": 2.776399850845337, + "learning_rate": 6.656e-06, + "loss": 0.3528, + "step": 1664 + }, + { + "epoch": 0.03332, + "grad_norm": 2.4016106128692627, + "learning_rate": 6.664e-06, + "loss": 0.2379, + "step": 1666 + }, + { + "epoch": 0.03336, + "grad_norm": 1.812479853630066, + "learning_rate": 6.672000000000001e-06, + "loss": 0.2664, + "step": 1668 + }, + { + "epoch": 0.0334, + "grad_norm": 1.7465859651565552, + "learning_rate": 6.680000000000001e-06, + "loss": 0.2122, + "step": 1670 + }, + { + "epoch": 0.03344, + "grad_norm": 1.613160490989685, + "learning_rate": 6.688e-06, + "loss": 0.2027, + "step": 1672 + }, + { + "epoch": 0.03348, + "grad_norm": 1.8835512399673462, + "learning_rate": 6.696e-06, + "loss": 0.233, + "step": 1674 + }, + { + "epoch": 0.03352, + "grad_norm": 1.6505866050720215, + "learning_rate": 6.7040000000000005e-06, + "loss": 0.233, + "step": 1676 + }, + { + "epoch": 0.03356, + "grad_norm": 1.62929105758667, + "learning_rate": 6.712000000000001e-06, + "loss": 0.2122, + "step": 1678 + }, + { + "epoch": 0.0336, + "grad_norm": 1.9475445747375488, + "learning_rate": 6.720000000000001e-06, + "loss": 0.2263, + "step": 1680 + }, + { + "epoch": 0.03364, + "grad_norm": 1.9997152090072632, + "learning_rate": 6.728e-06, + "loss": 0.3015, + "step": 1682 + }, + { + "epoch": 0.03368, + "grad_norm": 1.8201638460159302, + "learning_rate": 6.736e-06, + "loss": 0.2382, + "step": 1684 + }, + { + "epoch": 0.03372, + "grad_norm": 2.0834591388702393, + "learning_rate": 6.744e-06, + "loss": 0.2384, + "step": 1686 + }, + { + "epoch": 0.03376, + "grad_norm": 1.3555307388305664, + "learning_rate": 6.752000000000001e-06, + "loss": 0.1758, + "step": 1688 + }, + { + "epoch": 0.0338, + "grad_norm": 1.28238046169281, + "learning_rate": 6.760000000000001e-06, + "loss": 0.1763, + "step": 1690 + }, + { + "epoch": 0.03384, + "grad_norm": 2.0541677474975586, + "learning_rate": 6.768e-06, + "loss": 0.2471, + "step": 1692 + }, + { + "epoch": 0.03388, + "grad_norm": 1.174996018409729, + "learning_rate": 6.776e-06, + "loss": 0.1601, + "step": 1694 + }, + { + "epoch": 0.03392, + "grad_norm": 1.1631274223327637, + "learning_rate": 6.784000000000001e-06, + "loss": 0.2225, + "step": 1696 + }, + { + "epoch": 0.03396, + "grad_norm": 1.2030311822891235, + "learning_rate": 6.792000000000001e-06, + "loss": 0.1441, + "step": 1698 + }, + { + "epoch": 0.034, + "grad_norm": 2.424229145050049, + "learning_rate": 6.800000000000001e-06, + "loss": 0.2916, + "step": 1700 + }, + { + "epoch": 0.03404, + "grad_norm": 2.4080426692962646, + "learning_rate": 6.808e-06, + "loss": 0.2633, + "step": 1702 + }, + { + "epoch": 0.03408, + "grad_norm": 1.0328086614608765, + "learning_rate": 6.8160000000000005e-06, + "loss": 0.2632, + "step": 1704 + }, + { + "epoch": 0.03412, + "grad_norm": 1.104709506034851, + "learning_rate": 6.824e-06, + "loss": 0.2697, + "step": 1706 + }, + { + "epoch": 0.03416, + "grad_norm": 2.407306671142578, + "learning_rate": 6.832000000000001e-06, + "loss": 0.3664, + "step": 1708 + }, + { + "epoch": 0.0342, + "grad_norm": 2.0041964054107666, + "learning_rate": 6.8400000000000014e-06, + "loss": 0.3394, + "step": 1710 + }, + { + "epoch": 0.03424, + "grad_norm": 1.227369785308838, + "learning_rate": 6.848e-06, + "loss": 0.2425, + "step": 1712 + }, + { + "epoch": 0.03428, + "grad_norm": 1.8062214851379395, + "learning_rate": 6.856e-06, + "loss": 0.2766, + "step": 1714 + }, + { + "epoch": 0.03432, + "grad_norm": 1.6282885074615479, + "learning_rate": 6.864000000000001e-06, + "loss": 0.2123, + "step": 1716 + }, + { + "epoch": 0.03436, + "grad_norm": 1.5932142734527588, + "learning_rate": 6.872000000000001e-06, + "loss": 0.2223, + "step": 1718 + }, + { + "epoch": 0.0344, + "grad_norm": 1.7690414190292358, + "learning_rate": 6.88e-06, + "loss": 0.2537, + "step": 1720 + }, + { + "epoch": 0.03444, + "grad_norm": 1.510385513305664, + "learning_rate": 6.888e-06, + "loss": 0.2234, + "step": 1722 + }, + { + "epoch": 0.03448, + "grad_norm": 1.9395493268966675, + "learning_rate": 6.8960000000000006e-06, + "loss": 0.2348, + "step": 1724 + }, + { + "epoch": 0.03452, + "grad_norm": 1.499801516532898, + "learning_rate": 6.904e-06, + "loss": 0.2469, + "step": 1726 + }, + { + "epoch": 0.03456, + "grad_norm": 1.4700795412063599, + "learning_rate": 6.912000000000001e-06, + "loss": 0.2026, + "step": 1728 + }, + { + "epoch": 0.0346, + "grad_norm": 2.338766574859619, + "learning_rate": 6.92e-06, + "loss": 0.2899, + "step": 1730 + }, + { + "epoch": 0.03464, + "grad_norm": 1.4056166410446167, + "learning_rate": 6.928e-06, + "loss": 0.2299, + "step": 1732 + }, + { + "epoch": 0.03468, + "grad_norm": 1.4140843152999878, + "learning_rate": 6.936e-06, + "loss": 0.1939, + "step": 1734 + }, + { + "epoch": 0.03472, + "grad_norm": 1.3561739921569824, + "learning_rate": 6.944000000000001e-06, + "loss": 0.1843, + "step": 1736 + }, + { + "epoch": 0.03476, + "grad_norm": 1.4675413370132446, + "learning_rate": 6.952000000000001e-06, + "loss": 0.2026, + "step": 1738 + }, + { + "epoch": 0.0348, + "grad_norm": 2.18129825592041, + "learning_rate": 6.96e-06, + "loss": 0.2424, + "step": 1740 + }, + { + "epoch": 0.03484, + "grad_norm": 2.5285205841064453, + "learning_rate": 6.968e-06, + "loss": 0.2485, + "step": 1742 + }, + { + "epoch": 0.03488, + "grad_norm": 2.129061222076416, + "learning_rate": 6.976000000000001e-06, + "loss": 0.2101, + "step": 1744 + }, + { + "epoch": 0.03492, + "grad_norm": 1.0793652534484863, + "learning_rate": 6.984e-06, + "loss": 0.1369, + "step": 1746 + }, + { + "epoch": 0.03496, + "grad_norm": 2.9984960556030273, + "learning_rate": 6.992000000000001e-06, + "loss": 0.4247, + "step": 1748 + }, + { + "epoch": 0.035, + "grad_norm": 2.4617252349853516, + "learning_rate": 7e-06, + "loss": 0.2618, + "step": 1750 + }, + { + "epoch": 0.03504, + "grad_norm": 1.2149856090545654, + "learning_rate": 7.0080000000000005e-06, + "loss": 0.2481, + "step": 1752 + }, + { + "epoch": 0.03508, + "grad_norm": 1.931709885597229, + "learning_rate": 7.016e-06, + "loss": 0.3011, + "step": 1754 + }, + { + "epoch": 0.03512, + "grad_norm": 1.908962607383728, + "learning_rate": 7.024000000000001e-06, + "loss": 0.2178, + "step": 1756 + }, + { + "epoch": 0.03516, + "grad_norm": 1.5845496654510498, + "learning_rate": 7.0320000000000015e-06, + "loss": 0.2444, + "step": 1758 + }, + { + "epoch": 0.0352, + "grad_norm": 1.5920134782791138, + "learning_rate": 7.04e-06, + "loss": 0.2431, + "step": 1760 + }, + { + "epoch": 0.03524, + "grad_norm": 1.7661044597625732, + "learning_rate": 7.048e-06, + "loss": 0.2539, + "step": 1762 + }, + { + "epoch": 0.03528, + "grad_norm": 1.6936644315719604, + "learning_rate": 7.056000000000001e-06, + "loss": 0.254, + "step": 1764 + }, + { + "epoch": 0.03532, + "grad_norm": 1.3176276683807373, + "learning_rate": 7.0640000000000005e-06, + "loss": 0.2236, + "step": 1766 + }, + { + "epoch": 0.03536, + "grad_norm": 1.381730318069458, + "learning_rate": 7.072000000000001e-06, + "loss": 0.1933, + "step": 1768 + }, + { + "epoch": 0.0354, + "grad_norm": 1.657257080078125, + "learning_rate": 7.08e-06, + "loss": 0.243, + "step": 1770 + }, + { + "epoch": 0.03544, + "grad_norm": 1.4552005529403687, + "learning_rate": 7.088000000000001e-06, + "loss": 0.233, + "step": 1772 + }, + { + "epoch": 0.03548, + "grad_norm": 1.7786808013916016, + "learning_rate": 7.096e-06, + "loss": 0.2652, + "step": 1774 + }, + { + "epoch": 0.03552, + "grad_norm": 1.724249243736267, + "learning_rate": 7.104000000000001e-06, + "loss": 0.2329, + "step": 1776 + }, + { + "epoch": 0.03556, + "grad_norm": 1.5870014429092407, + "learning_rate": 7.1120000000000015e-06, + "loss": 0.2431, + "step": 1778 + }, + { + "epoch": 0.0356, + "grad_norm": 1.564550757408142, + "learning_rate": 7.1200000000000004e-06, + "loss": 0.2223, + "step": 1780 + }, + { + "epoch": 0.03564, + "grad_norm": 1.6385741233825684, + "learning_rate": 7.128e-06, + "loss": 0.2432, + "step": 1782 + }, + { + "epoch": 0.03568, + "grad_norm": 1.6072046756744385, + "learning_rate": 7.136000000000001e-06, + "loss": 0.2431, + "step": 1784 + }, + { + "epoch": 0.03572, + "grad_norm": 1.6043187379837036, + "learning_rate": 7.1440000000000005e-06, + "loss": 0.2431, + "step": 1786 + }, + { + "epoch": 0.03576, + "grad_norm": 1.6328811645507812, + "learning_rate": 7.152e-06, + "loss": 0.2223, + "step": 1788 + }, + { + "epoch": 0.0358, + "grad_norm": 1.4815673828125, + "learning_rate": 7.16e-06, + "loss": 0.2223, + "step": 1790 + }, + { + "epoch": 0.03584, + "grad_norm": 1.542502522468567, + "learning_rate": 7.168000000000001e-06, + "loss": 0.2546, + "step": 1792 + }, + { + "epoch": 0.03588, + "grad_norm": 1.7909824848175049, + "learning_rate": 7.176e-06, + "loss": 0.2146, + "step": 1794 + }, + { + "epoch": 0.03592, + "grad_norm": 1.6004282236099243, + "learning_rate": 7.184000000000001e-06, + "loss": 0.2127, + "step": 1796 + }, + { + "epoch": 0.03596, + "grad_norm": 1.6095179319381714, + "learning_rate": 7.192e-06, + "loss": 0.243, + "step": 1798 + }, + { + "epoch": 0.036, + "grad_norm": 1.82240891456604, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.2329, + "step": 1800 + }, + { + "epoch": 0.03604, + "grad_norm": 1.7587615251541138, + "learning_rate": 7.208e-06, + "loss": 0.2431, + "step": 1802 + }, + { + "epoch": 0.03608, + "grad_norm": 1.7168346643447876, + "learning_rate": 7.216000000000001e-06, + "loss": 0.2429, + "step": 1804 + }, + { + "epoch": 0.03612, + "grad_norm": 1.4251519441604614, + "learning_rate": 7.224000000000001e-06, + "loss": 0.2236, + "step": 1806 + }, + { + "epoch": 0.03616, + "grad_norm": 1.562415599822998, + "learning_rate": 7.232e-06, + "loss": 0.2029, + "step": 1808 + }, + { + "epoch": 0.0362, + "grad_norm": 1.4579025506973267, + "learning_rate": 7.24e-06, + "loss": 0.2444, + "step": 1810 + }, + { + "epoch": 0.03624, + "grad_norm": 1.673192024230957, + "learning_rate": 7.248000000000001e-06, + "loss": 0.2431, + "step": 1812 + }, + { + "epoch": 0.03628, + "grad_norm": 1.2900861501693726, + "learning_rate": 7.2560000000000005e-06, + "loss": 0.1939, + "step": 1814 + }, + { + "epoch": 0.03632, + "grad_norm": 1.1258680820465088, + "learning_rate": 7.264000000000001e-06, + "loss": 0.16, + "step": 1816 + }, + { + "epoch": 0.03636, + "grad_norm": 1.2355237007141113, + "learning_rate": 7.272e-06, + "loss": 0.2426, + "step": 1818 + }, + { + "epoch": 0.0364, + "grad_norm": 2.1641931533813477, + "learning_rate": 7.280000000000001e-06, + "loss": 0.3528, + "step": 1820 + }, + { + "epoch": 0.03644, + "grad_norm": 1.231122374534607, + "learning_rate": 7.288e-06, + "loss": 0.2426, + "step": 1822 + }, + { + "epoch": 0.03648, + "grad_norm": 1.078922152519226, + "learning_rate": 7.296000000000001e-06, + "loss": 0.1374, + "step": 1824 + }, + { + "epoch": 0.03652, + "grad_norm": 1.1285371780395508, + "learning_rate": 7.304000000000001e-06, + "loss": 0.2622, + "step": 1826 + }, + { + "epoch": 0.03656, + "grad_norm": 1.0387810468673706, + "learning_rate": 7.3120000000000005e-06, + "loss": 0.1449, + "step": 1828 + }, + { + "epoch": 0.0366, + "grad_norm": 1.0615284442901611, + "learning_rate": 7.32e-06, + "loss": 0.2414, + "step": 1830 + }, + { + "epoch": 0.03664, + "grad_norm": 2.428542137145996, + "learning_rate": 7.328000000000001e-06, + "loss": 0.2916, + "step": 1832 + }, + { + "epoch": 0.03668, + "grad_norm": 2.2104294300079346, + "learning_rate": 7.3360000000000006e-06, + "loss": 0.3526, + "step": 1834 + }, + { + "epoch": 0.03672, + "grad_norm": 1.2935429811477661, + "learning_rate": 7.344000000000001e-06, + "loss": 0.2298, + "step": 1836 + }, + { + "epoch": 0.03676, + "grad_norm": 1.42750883102417, + "learning_rate": 7.352e-06, + "loss": 0.2596, + "step": 1838 + }, + { + "epoch": 0.0368, + "grad_norm": 1.8537211418151855, + "learning_rate": 7.360000000000001e-06, + "loss": 0.2765, + "step": 1840 + }, + { + "epoch": 0.03684, + "grad_norm": 1.7400612831115723, + "learning_rate": 7.3680000000000004e-06, + "loss": 0.2772, + "step": 1842 + }, + { + "epoch": 0.03688, + "grad_norm": 1.617256760597229, + "learning_rate": 7.376000000000001e-06, + "loss": 0.2222, + "step": 1844 + }, + { + "epoch": 0.03692, + "grad_norm": 1.8406928777694702, + "learning_rate": 7.384e-06, + "loss": 0.2259, + "step": 1846 + }, + { + "epoch": 0.03696, + "grad_norm": 1.345454216003418, + "learning_rate": 7.3920000000000005e-06, + "loss": 0.2379, + "step": 1848 + }, + { + "epoch": 0.037, + "grad_norm": 1.2105920314788818, + "learning_rate": 7.4e-06, + "loss": 0.2348, + "step": 1850 + }, + { + "epoch": 0.03704, + "grad_norm": 1.9769231081008911, + "learning_rate": 7.408000000000001e-06, + "loss": 0.2222, + "step": 1852 + }, + { + "epoch": 0.03708, + "grad_norm": 2.2392334938049316, + "learning_rate": 7.416000000000001e-06, + "loss": 0.2639, + "step": 1854 + }, + { + "epoch": 0.03712, + "grad_norm": 1.4254237413406372, + "learning_rate": 7.424e-06, + "loss": 0.2468, + "step": 1856 + }, + { + "epoch": 0.03716, + "grad_norm": 1.440991997718811, + "learning_rate": 7.432e-06, + "loss": 0.1761, + "step": 1858 + }, + { + "epoch": 0.0372, + "grad_norm": 2.389122486114502, + "learning_rate": 7.440000000000001e-06, + "loss": 0.3529, + "step": 1860 + }, + { + "epoch": 0.03724, + "grad_norm": 1.9722508192062378, + "learning_rate": 7.4480000000000005e-06, + "loss": 0.2222, + "step": 1862 + }, + { + "epoch": 0.03728, + "grad_norm": 1.272542953491211, + "learning_rate": 7.456000000000001e-06, + "loss": 0.2298, + "step": 1864 + }, + { + "epoch": 0.03732, + "grad_norm": 1.910951018333435, + "learning_rate": 7.464e-06, + "loss": 0.238, + "step": 1866 + }, + { + "epoch": 0.03736, + "grad_norm": 1.3213090896606445, + "learning_rate": 7.472000000000001e-06, + "loss": 0.1843, + "step": 1868 + }, + { + "epoch": 0.0374, + "grad_norm": 1.3564770221710205, + "learning_rate": 7.48e-06, + "loss": 0.1755, + "step": 1870 + }, + { + "epoch": 0.03744, + "grad_norm": 1.4258544445037842, + "learning_rate": 7.488000000000001e-06, + "loss": 0.193, + "step": 1872 + }, + { + "epoch": 0.03748, + "grad_norm": 1.3432955741882324, + "learning_rate": 7.496000000000001e-06, + "loss": 0.1672, + "step": 1874 + }, + { + "epoch": 0.03752, + "grad_norm": 1.200715184211731, + "learning_rate": 7.5040000000000005e-06, + "loss": 0.1439, + "step": 1876 + }, + { + "epoch": 0.03756, + "grad_norm": 1.0741522312164307, + "learning_rate": 7.512e-06, + "loss": 0.2778, + "step": 1878 + }, + { + "epoch": 0.0376, + "grad_norm": 2.2628509998321533, + "learning_rate": 7.520000000000001e-06, + "loss": 0.2372, + "step": 1880 + }, + { + "epoch": 0.03764, + "grad_norm": 1.0833215713500977, + "learning_rate": 7.528000000000001e-06, + "loss": 0.1111, + "step": 1882 + }, + { + "epoch": 0.03768, + "grad_norm": 0.9178128242492676, + "learning_rate": 7.536000000000001e-06, + "loss": 0.0998, + "step": 1884 + }, + { + "epoch": 0.03772, + "grad_norm": 2.9215502738952637, + "learning_rate": 7.544e-06, + "loss": 0.4539, + "step": 1886 + }, + { + "epoch": 0.03776, + "grad_norm": 0.9439677000045776, + "learning_rate": 7.552000000000001e-06, + "loss": 0.0945, + "step": 1888 + }, + { + "epoch": 0.0378, + "grad_norm": 2.725642204284668, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.4699, + "step": 1890 + }, + { + "epoch": 0.03784, + "grad_norm": 2.529439687728882, + "learning_rate": 7.568000000000001e-06, + "loss": 0.4247, + "step": 1892 + }, + { + "epoch": 0.03788, + "grad_norm": 1.1550068855285645, + "learning_rate": 7.576000000000001e-06, + "loss": 0.3068, + "step": 1894 + }, + { + "epoch": 0.03792, + "grad_norm": 1.1604268550872803, + "learning_rate": 7.5840000000000006e-06, + "loss": 0.2348, + "step": 1896 + }, + { + "epoch": 0.03796, + "grad_norm": 1.7495981454849243, + "learning_rate": 7.592e-06, + "loss": 0.2178, + "step": 1898 + }, + { + "epoch": 0.038, + "grad_norm": 1.575295090675354, + "learning_rate": 7.600000000000001e-06, + "loss": 0.2147, + "step": 1900 + }, + { + "epoch": 0.03804, + "grad_norm": 1.5853041410446167, + "learning_rate": 7.608000000000001e-06, + "loss": 0.2431, + "step": 1902 + }, + { + "epoch": 0.03808, + "grad_norm": 1.4853233098983765, + "learning_rate": 7.616000000000001e-06, + "loss": 0.2129, + "step": 1904 + }, + { + "epoch": 0.03812, + "grad_norm": 1.3025236129760742, + "learning_rate": 7.624e-06, + "loss": 0.2236, + "step": 1906 + }, + { + "epoch": 0.03816, + "grad_norm": 1.161303162574768, + "learning_rate": 7.632e-06, + "loss": 0.1677, + "step": 1908 + }, + { + "epoch": 0.0382, + "grad_norm": 2.299328327178955, + "learning_rate": 7.640000000000001e-06, + "loss": 0.3535, + "step": 1910 + }, + { + "epoch": 0.03824, + "grad_norm": 2.2270188331604004, + "learning_rate": 7.648e-06, + "loss": 0.2785, + "step": 1912 + }, + { + "epoch": 0.03828, + "grad_norm": 1.1761696338653564, + "learning_rate": 7.656000000000001e-06, + "loss": 0.2701, + "step": 1914 + }, + { + "epoch": 0.03832, + "grad_norm": 1.0971667766571045, + "learning_rate": 7.664e-06, + "loss": 0.2356, + "step": 1916 + }, + { + "epoch": 0.03836, + "grad_norm": 1.059712290763855, + "learning_rate": 7.672e-06, + "loss": 0.152, + "step": 1918 + }, + { + "epoch": 0.0384, + "grad_norm": 1.0715806484222412, + "learning_rate": 7.680000000000001e-06, + "loss": 0.2487, + "step": 1920 + }, + { + "epoch": 0.03844, + "grad_norm": 1.1390923261642456, + "learning_rate": 7.688000000000002e-06, + "loss": 0.2355, + "step": 1922 + }, + { + "epoch": 0.03848, + "grad_norm": 1.9644125699996948, + "learning_rate": 7.696e-06, + "loss": 0.3528, + "step": 1924 + }, + { + "epoch": 0.03852, + "grad_norm": 1.1373341083526611, + "learning_rate": 7.704000000000001e-06, + "loss": 0.2562, + "step": 1926 + }, + { + "epoch": 0.03856, + "grad_norm": 1.142844557762146, + "learning_rate": 7.712e-06, + "loss": 0.2429, + "step": 1928 + }, + { + "epoch": 0.0386, + "grad_norm": 1.0871421098709106, + "learning_rate": 7.72e-06, + "loss": 0.176, + "step": 1930 + }, + { + "epoch": 0.03864, + "grad_norm": 1.871744990348816, + "learning_rate": 7.728000000000001e-06, + "loss": 0.3264, + "step": 1932 + }, + { + "epoch": 0.03868, + "grad_norm": 1.5047680139541626, + "learning_rate": 7.736e-06, + "loss": 0.2353, + "step": 1934 + }, + { + "epoch": 0.03872, + "grad_norm": 1.562229871749878, + "learning_rate": 7.744e-06, + "loss": 0.2433, + "step": 1936 + }, + { + "epoch": 0.03876, + "grad_norm": 1.8186389207839966, + "learning_rate": 7.752000000000001e-06, + "loss": 0.2352, + "step": 1938 + }, + { + "epoch": 0.0388, + "grad_norm": 1.4520933628082275, + "learning_rate": 7.76e-06, + "loss": 0.2122, + "step": 1940 + }, + { + "epoch": 0.03884, + "grad_norm": 1.7360458374023438, + "learning_rate": 7.768e-06, + "loss": 0.2236, + "step": 1942 + }, + { + "epoch": 0.03888, + "grad_norm": 1.235434889793396, + "learning_rate": 7.776e-06, + "loss": 0.2181, + "step": 1944 + }, + { + "epoch": 0.03892, + "grad_norm": 1.3611688613891602, + "learning_rate": 7.784e-06, + "loss": 0.1599, + "step": 1946 + }, + { + "epoch": 0.03896, + "grad_norm": 1.0094107389450073, + "learning_rate": 7.792000000000001e-06, + "loss": 0.1249, + "step": 1948 + }, + { + "epoch": 0.039, + "grad_norm": 0.838989794254303, + "learning_rate": 7.800000000000002e-06, + "loss": 0.0953, + "step": 1950 + }, + { + "epoch": 0.03904, + "grad_norm": 0.7819032073020935, + "learning_rate": 7.808e-06, + "loss": 0.2574, + "step": 1952 + }, + { + "epoch": 0.03908, + "grad_norm": 3.0490407943725586, + "learning_rate": 7.816000000000001e-06, + "loss": 0.4856, + "step": 1954 + }, + { + "epoch": 0.03912, + "grad_norm": 2.859421968460083, + "learning_rate": 7.824e-06, + "loss": 0.4697, + "step": 1956 + }, + { + "epoch": 0.03916, + "grad_norm": 2.4658448696136475, + "learning_rate": 7.832e-06, + "loss": 0.2571, + "step": 1958 + }, + { + "epoch": 0.0392, + "grad_norm": 2.034477949142456, + "learning_rate": 7.840000000000001e-06, + "loss": 0.3531, + "step": 1960 + }, + { + "epoch": 0.03924, + "grad_norm": 1.3415395021438599, + "learning_rate": 7.848000000000002e-06, + "loss": 0.2353, + "step": 1962 + }, + { + "epoch": 0.03928, + "grad_norm": 1.6951673030853271, + "learning_rate": 7.856e-06, + "loss": 0.2446, + "step": 1964 + }, + { + "epoch": 0.03932, + "grad_norm": 1.2509803771972656, + "learning_rate": 7.864000000000001e-06, + "loss": 0.2237, + "step": 1966 + }, + { + "epoch": 0.03936, + "grad_norm": 1.3041082620620728, + "learning_rate": 7.872e-06, + "loss": 0.2125, + "step": 1968 + }, + { + "epoch": 0.0394, + "grad_norm": 1.5549094676971436, + "learning_rate": 7.88e-06, + "loss": 0.3017, + "step": 1970 + }, + { + "epoch": 0.03944, + "grad_norm": 1.063351035118103, + "learning_rate": 7.888e-06, + "loss": 0.2564, + "step": 1972 + }, + { + "epoch": 0.03948, + "grad_norm": 1.7414863109588623, + "learning_rate": 7.896e-06, + "loss": 0.2433, + "step": 1974 + }, + { + "epoch": 0.03952, + "grad_norm": 1.71708345413208, + "learning_rate": 7.904000000000001e-06, + "loss": 0.2356, + "step": 1976 + }, + { + "epoch": 0.03956, + "grad_norm": 1.749862551689148, + "learning_rate": 7.912000000000001e-06, + "loss": 0.2431, + "step": 1978 + }, + { + "epoch": 0.0396, + "grad_norm": 1.2429121732711792, + "learning_rate": 7.92e-06, + "loss": 0.1692, + "step": 1980 + }, + { + "epoch": 0.03964, + "grad_norm": 1.0173081159591675, + "learning_rate": 7.928e-06, + "loss": 0.2355, + "step": 1982 + }, + { + "epoch": 0.03968, + "grad_norm": 1.794469952583313, + "learning_rate": 7.936e-06, + "loss": 0.2512, + "step": 1984 + }, + { + "epoch": 0.03972, + "grad_norm": 1.6026344299316406, + "learning_rate": 7.944e-06, + "loss": 0.3139, + "step": 1986 + }, + { + "epoch": 0.03976, + "grad_norm": 1.1759471893310547, + "learning_rate": 7.952000000000001e-06, + "loss": 0.2512, + "step": 1988 + }, + { + "epoch": 0.0398, + "grad_norm": 1.169339656829834, + "learning_rate": 7.960000000000002e-06, + "loss": 0.176, + "step": 1990 + }, + { + "epoch": 0.03984, + "grad_norm": 1.3394396305084229, + "learning_rate": 7.968e-06, + "loss": 0.203, + "step": 1992 + }, + { + "epoch": 0.03988, + "grad_norm": 1.6189308166503906, + "learning_rate": 7.976000000000001e-06, + "loss": 0.2768, + "step": 1994 + }, + { + "epoch": 0.03992, + "grad_norm": 1.5596294403076172, + "learning_rate": 7.984e-06, + "loss": 0.2901, + "step": 1996 + }, + { + "epoch": 0.03996, + "grad_norm": 1.447299838066101, + "learning_rate": 7.992e-06, + "loss": 0.2326, + "step": 1998 + }, + { + "epoch": 0.04, + "grad_norm": 1.4742189645767212, + "learning_rate": 8.000000000000001e-06, + "loss": 0.2433, + "step": 2000 + }, + { + "epoch": 0.04004, + "grad_norm": 1.5100868940353394, + "learning_rate": 8.008e-06, + "loss": 0.2225, + "step": 2002 + }, + { + "epoch": 0.04008, + "grad_norm": 1.4285002946853638, + "learning_rate": 8.016e-06, + "loss": 0.2331, + "step": 2004 + }, + { + "epoch": 0.04012, + "grad_norm": 1.413381814956665, + "learning_rate": 8.024000000000001e-06, + "loss": 0.2123, + "step": 2006 + }, + { + "epoch": 0.04016, + "grad_norm": 1.48664128780365, + "learning_rate": 8.032e-06, + "loss": 0.2432, + "step": 2008 + }, + { + "epoch": 0.0402, + "grad_norm": 1.406788945198059, + "learning_rate": 8.040000000000001e-06, + "loss": 0.2122, + "step": 2010 + }, + { + "epoch": 0.04024, + "grad_norm": 1.4426158666610718, + "learning_rate": 8.048e-06, + "loss": 0.2332, + "step": 2012 + }, + { + "epoch": 0.04028, + "grad_norm": 1.4659472703933716, + "learning_rate": 8.056e-06, + "loss": 0.2123, + "step": 2014 + }, + { + "epoch": 0.04032, + "grad_norm": 1.5399192571640015, + "learning_rate": 8.064000000000001e-06, + "loss": 0.2445, + "step": 2016 + }, + { + "epoch": 0.04036, + "grad_norm": 1.5014554262161255, + "learning_rate": 8.072000000000002e-06, + "loss": 0.2028, + "step": 2018 + }, + { + "epoch": 0.0404, + "grad_norm": 1.611117959022522, + "learning_rate": 8.08e-06, + "loss": 0.2772, + "step": 2020 + }, + { + "epoch": 0.04044, + "grad_norm": 1.6811795234680176, + "learning_rate": 8.088e-06, + "loss": 0.2771, + "step": 2022 + }, + { + "epoch": 0.04048, + "grad_norm": 1.619080901145935, + "learning_rate": 8.096e-06, + "loss": 0.2539, + "step": 2024 + }, + { + "epoch": 0.04052, + "grad_norm": 1.4134800434112549, + "learning_rate": 8.104e-06, + "loss": 0.2223, + "step": 2026 + }, + { + "epoch": 0.04056, + "grad_norm": 1.4536045789718628, + "learning_rate": 8.112000000000001e-06, + "loss": 0.2326, + "step": 2028 + }, + { + "epoch": 0.0406, + "grad_norm": 1.4395623207092285, + "learning_rate": 8.120000000000002e-06, + "loss": 0.2546, + "step": 2030 + }, + { + "epoch": 0.04064, + "grad_norm": 1.1375670433044434, + "learning_rate": 8.128e-06, + "loss": 0.1847, + "step": 2032 + }, + { + "epoch": 0.04068, + "grad_norm": 1.1386274099349976, + "learning_rate": 8.136000000000001e-06, + "loss": 0.2264, + "step": 2034 + }, + { + "epoch": 0.04072, + "grad_norm": 1.6526950597763062, + "learning_rate": 8.144e-06, + "loss": 0.2471, + "step": 2036 + }, + { + "epoch": 0.04076, + "grad_norm": 1.7519298791885376, + "learning_rate": 8.152000000000001e-06, + "loss": 0.251, + "step": 2038 + }, + { + "epoch": 0.0408, + "grad_norm": 1.7031952142715454, + "learning_rate": 8.16e-06, + "loss": 0.2512, + "step": 2040 + }, + { + "epoch": 0.04084, + "grad_norm": 1.1413664817810059, + "learning_rate": 8.168e-06, + "loss": 0.2383, + "step": 2042 + }, + { + "epoch": 0.04088, + "grad_norm": 1.649368405342102, + "learning_rate": 8.176000000000001e-06, + "loss": 0.2302, + "step": 2044 + }, + { + "epoch": 0.04092, + "grad_norm": 1.0451163053512573, + "learning_rate": 8.184000000000002e-06, + "loss": 0.2301, + "step": 2046 + }, + { + "epoch": 0.04096, + "grad_norm": 1.4552851915359497, + "learning_rate": 8.192e-06, + "loss": 0.2149, + "step": 2048 + }, + { + "epoch": 0.041, + "grad_norm": 1.2528570890426636, + "learning_rate": 8.2e-06, + "loss": 0.2351, + "step": 2050 + }, + { + "epoch": 0.04104, + "grad_norm": 1.2174805402755737, + "learning_rate": 8.208e-06, + "loss": 0.235, + "step": 2052 + }, + { + "epoch": 0.04108, + "grad_norm": 1.355133056640625, + "learning_rate": 8.216e-06, + "loss": 0.2443, + "step": 2054 + }, + { + "epoch": 0.04112, + "grad_norm": 1.4227584600448608, + "learning_rate": 8.224000000000001e-06, + "loss": 0.2431, + "step": 2056 + }, + { + "epoch": 0.04116, + "grad_norm": 1.4033678770065308, + "learning_rate": 8.232000000000002e-06, + "loss": 0.2545, + "step": 2058 + }, + { + "epoch": 0.0412, + "grad_norm": 1.530375361442566, + "learning_rate": 8.24e-06, + "loss": 0.2329, + "step": 2060 + }, + { + "epoch": 0.04124, + "grad_norm": 1.2569605112075806, + "learning_rate": 8.248e-06, + "loss": 0.2222, + "step": 2062 + }, + { + "epoch": 0.04128, + "grad_norm": 1.2985423803329468, + "learning_rate": 8.256e-06, + "loss": 0.2126, + "step": 2064 + }, + { + "epoch": 0.04132, + "grad_norm": 1.1707721948623657, + "learning_rate": 8.264e-06, + "loss": 0.176, + "step": 2066 + }, + { + "epoch": 0.04136, + "grad_norm": 1.1869916915893555, + "learning_rate": 8.272000000000001e-06, + "loss": 0.2424, + "step": 2068 + }, + { + "epoch": 0.0414, + "grad_norm": 1.229625940322876, + "learning_rate": 8.28e-06, + "loss": 0.2424, + "step": 2070 + }, + { + "epoch": 0.04144, + "grad_norm": 1.0606056451797485, + "learning_rate": 8.288000000000001e-06, + "loss": 0.2152, + "step": 2072 + }, + { + "epoch": 0.04148, + "grad_norm": 1.1862646341323853, + "learning_rate": 8.296000000000002e-06, + "loss": 0.1305, + "step": 2074 + }, + { + "epoch": 0.04152, + "grad_norm": 1.089384913444519, + "learning_rate": 8.304e-06, + "loss": 0.1235, + "step": 2076 + }, + { + "epoch": 0.04156, + "grad_norm": 0.929021418094635, + "learning_rate": 8.312000000000001e-06, + "loss": 0.272, + "step": 2078 + }, + { + "epoch": 0.0416, + "grad_norm": 2.539999485015869, + "learning_rate": 8.32e-06, + "loss": 0.4541, + "step": 2080 + }, + { + "epoch": 0.04164, + "grad_norm": 1.1373976469039917, + "learning_rate": 8.328e-06, + "loss": 0.2843, + "step": 2082 + }, + { + "epoch": 0.04168, + "grad_norm": 1.1727124452590942, + "learning_rate": 8.336000000000001e-06, + "loss": 0.1304, + "step": 2084 + }, + { + "epoch": 0.04172, + "grad_norm": 2.008715867996216, + "learning_rate": 8.344000000000002e-06, + "loss": 0.3131, + "step": 2086 + }, + { + "epoch": 0.04176, + "grad_norm": 1.239867925643921, + "learning_rate": 8.352e-06, + "loss": 0.2638, + "step": 2088 + }, + { + "epoch": 0.0418, + "grad_norm": 1.6664377450942993, + "learning_rate": 8.36e-06, + "loss": 0.2177, + "step": 2090 + }, + { + "epoch": 0.04184, + "grad_norm": 1.6648024320602417, + "learning_rate": 8.368e-06, + "loss": 0.265, + "step": 2092 + }, + { + "epoch": 0.04188, + "grad_norm": 1.4141604900360107, + "learning_rate": 8.376e-06, + "loss": 0.2323, + "step": 2094 + }, + { + "epoch": 0.04192, + "grad_norm": 1.456096887588501, + "learning_rate": 8.384000000000001e-06, + "loss": 0.233, + "step": 2096 + }, + { + "epoch": 0.04196, + "grad_norm": 1.3184406757354736, + "learning_rate": 8.392e-06, + "loss": 0.2121, + "step": 2098 + }, + { + "epoch": 0.042, + "grad_norm": 1.6598531007766724, + "learning_rate": 8.400000000000001e-06, + "loss": 0.2651, + "step": 2100 + }, + { + "epoch": 0.04204, + "grad_norm": 1.2085477113723755, + "learning_rate": 8.408e-06, + "loss": 0.1938, + "step": 2102 + }, + { + "epoch": 0.04208, + "grad_norm": 1.3093540668487549, + "learning_rate": 8.416e-06, + "loss": 0.2222, + "step": 2104 + }, + { + "epoch": 0.04212, + "grad_norm": 1.6092407703399658, + "learning_rate": 8.424000000000001e-06, + "loss": 0.2261, + "step": 2106 + }, + { + "epoch": 0.04216, + "grad_norm": 1.5499446392059326, + "learning_rate": 8.432e-06, + "loss": 0.2431, + "step": 2108 + }, + { + "epoch": 0.0422, + "grad_norm": 1.0792912244796753, + "learning_rate": 8.44e-06, + "loss": 0.1516, + "step": 2110 + }, + { + "epoch": 0.04224, + "grad_norm": 1.8624887466430664, + "learning_rate": 8.448000000000001e-06, + "loss": 0.3258, + "step": 2112 + }, + { + "epoch": 0.04228, + "grad_norm": 1.300214171409607, + "learning_rate": 8.456000000000002e-06, + "loss": 0.1843, + "step": 2114 + }, + { + "epoch": 0.04232, + "grad_norm": 1.8368983268737793, + "learning_rate": 8.464e-06, + "loss": 0.2425, + "step": 2116 + }, + { + "epoch": 0.04236, + "grad_norm": 1.620708703994751, + "learning_rate": 8.472e-06, + "loss": 0.3132, + "step": 2118 + }, + { + "epoch": 0.0424, + "grad_norm": 1.2556018829345703, + "learning_rate": 8.48e-06, + "loss": 0.2026, + "step": 2120 + }, + { + "epoch": 0.04244, + "grad_norm": 1.1136234998703003, + "learning_rate": 8.488e-06, + "loss": 0.1761, + "step": 2122 + }, + { + "epoch": 0.04248, + "grad_norm": 1.8815817832946777, + "learning_rate": 8.496000000000001e-06, + "loss": 0.2899, + "step": 2124 + }, + { + "epoch": 0.04252, + "grad_norm": 1.3917076587677002, + "learning_rate": 8.504000000000002e-06, + "loss": 0.2443, + "step": 2126 + }, + { + "epoch": 0.04256, + "grad_norm": 1.2602580785751343, + "learning_rate": 8.512e-06, + "loss": 0.2348, + "step": 2128 + }, + { + "epoch": 0.0426, + "grad_norm": 1.7359368801116943, + "learning_rate": 8.52e-06, + "loss": 0.277, + "step": 2130 + }, + { + "epoch": 0.04264, + "grad_norm": 1.5221161842346191, + "learning_rate": 8.528e-06, + "loss": 0.2328, + "step": 2132 + }, + { + "epoch": 0.04268, + "grad_norm": 1.4124306440353394, + "learning_rate": 8.536000000000001e-06, + "loss": 0.2322, + "step": 2134 + }, + { + "epoch": 0.04272, + "grad_norm": 1.565605640411377, + "learning_rate": 8.544000000000002e-06, + "loss": 0.2543, + "step": 2136 + }, + { + "epoch": 0.04276, + "grad_norm": 1.4984469413757324, + "learning_rate": 8.552e-06, + "loss": 0.2235, + "step": 2138 + }, + { + "epoch": 0.0428, + "grad_norm": 1.1144226789474487, + "learning_rate": 8.560000000000001e-06, + "loss": 0.2028, + "step": 2140 + }, + { + "epoch": 0.04284, + "grad_norm": 1.6454660892486572, + "learning_rate": 8.568e-06, + "loss": 0.2564, + "step": 2142 + }, + { + "epoch": 0.04288, + "grad_norm": 1.705457329750061, + "learning_rate": 8.576e-06, + "loss": 0.3006, + "step": 2144 + }, + { + "epoch": 0.04292, + "grad_norm": 1.5005574226379395, + "learning_rate": 8.584000000000001e-06, + "loss": 0.2146, + "step": 2146 + }, + { + "epoch": 0.04296, + "grad_norm": 1.621537685394287, + "learning_rate": 8.592e-06, + "loss": 0.2443, + "step": 2148 + }, + { + "epoch": 0.043, + "grad_norm": 1.4583916664123535, + "learning_rate": 8.6e-06, + "loss": 0.2772, + "step": 2150 + }, + { + "epoch": 0.04304, + "grad_norm": 1.462640643119812, + "learning_rate": 8.608000000000001e-06, + "loss": 0.2538, + "step": 2152 + }, + { + "epoch": 0.04308, + "grad_norm": 1.347447156906128, + "learning_rate": 8.616000000000002e-06, + "loss": 0.2323, + "step": 2154 + }, + { + "epoch": 0.04312, + "grad_norm": 1.3093863725662231, + "learning_rate": 8.624e-06, + "loss": 0.2026, + "step": 2156 + }, + { + "epoch": 0.04316, + "grad_norm": 1.4721786975860596, + "learning_rate": 8.632e-06, + "loss": 0.2431, + "step": 2158 + }, + { + "epoch": 0.0432, + "grad_norm": 1.6014177799224854, + "learning_rate": 8.64e-06, + "loss": 0.2651, + "step": 2160 + }, + { + "epoch": 0.04324, + "grad_norm": 1.3692525625228882, + "learning_rate": 8.648000000000001e-06, + "loss": 0.2321, + "step": 2162 + }, + { + "epoch": 0.04328, + "grad_norm": 1.4468510150909424, + "learning_rate": 8.656000000000001e-06, + "loss": 0.233, + "step": 2164 + }, + { + "epoch": 0.04332, + "grad_norm": 1.3473191261291504, + "learning_rate": 8.664e-06, + "loss": 0.2127, + "step": 2166 + }, + { + "epoch": 0.04336, + "grad_norm": 1.4133415222167969, + "learning_rate": 8.672000000000001e-06, + "loss": 0.243, + "step": 2168 + }, + { + "epoch": 0.0434, + "grad_norm": 1.2962533235549927, + "learning_rate": 8.68e-06, + "loss": 0.2328, + "step": 2170 + }, + { + "epoch": 0.04344, + "grad_norm": 1.3713055849075317, + "learning_rate": 8.688e-06, + "loss": 0.2322, + "step": 2172 + }, + { + "epoch": 0.04348, + "grad_norm": 1.3755871057510376, + "learning_rate": 8.696000000000001e-06, + "loss": 0.2322, + "step": 2174 + }, + { + "epoch": 0.04352, + "grad_norm": 1.2353265285491943, + "learning_rate": 8.704e-06, + "loss": 0.2329, + "step": 2176 + }, + { + "epoch": 0.04356, + "grad_norm": 1.240393042564392, + "learning_rate": 8.712e-06, + "loss": 0.2221, + "step": 2178 + }, + { + "epoch": 0.0436, + "grad_norm": 1.3227720260620117, + "learning_rate": 8.720000000000001e-06, + "loss": 0.2223, + "step": 2180 + }, + { + "epoch": 0.04364, + "grad_norm": 1.229243278503418, + "learning_rate": 8.728e-06, + "loss": 0.2027, + "step": 2182 + }, + { + "epoch": 0.04368, + "grad_norm": 1.21268630027771, + "learning_rate": 8.736e-06, + "loss": 0.2128, + "step": 2184 + }, + { + "epoch": 0.04372, + "grad_norm": 1.4802743196487427, + "learning_rate": 8.744e-06, + "loss": 0.2539, + "step": 2186 + }, + { + "epoch": 0.04376, + "grad_norm": 1.5441895723342896, + "learning_rate": 8.752e-06, + "loss": 0.2349, + "step": 2188 + }, + { + "epoch": 0.0438, + "grad_norm": 1.352423071861267, + "learning_rate": 8.76e-06, + "loss": 0.2236, + "step": 2190 + }, + { + "epoch": 0.04384, + "grad_norm": 1.4486315250396729, + "learning_rate": 8.768000000000001e-06, + "loss": 0.243, + "step": 2192 + }, + { + "epoch": 0.04388, + "grad_norm": 1.4583035707473755, + "learning_rate": 8.776e-06, + "loss": 0.2322, + "step": 2194 + }, + { + "epoch": 0.04392, + "grad_norm": 1.4082002639770508, + "learning_rate": 8.784000000000001e-06, + "loss": 0.2221, + "step": 2196 + }, + { + "epoch": 0.04396, + "grad_norm": 1.4123090505599976, + "learning_rate": 8.792e-06, + "loss": 0.2543, + "step": 2198 + }, + { + "epoch": 0.044, + "grad_norm": 1.6789454221725464, + "learning_rate": 8.8e-06, + "loss": 0.2347, + "step": 2200 + }, + { + "epoch": 0.04404, + "grad_norm": 1.62650728225708, + "learning_rate": 8.808000000000001e-06, + "loss": 0.2329, + "step": 2202 + }, + { + "epoch": 0.04408, + "grad_norm": 2.073572874069214, + "learning_rate": 8.816000000000002e-06, + "loss": 0.2821, + "step": 2204 + }, + { + "epoch": 0.04412, + "grad_norm": 1.1948316097259521, + "learning_rate": 8.824e-06, + "loss": 0.159, + "step": 2206 + }, + { + "epoch": 0.04416, + "grad_norm": 1.4152562618255615, + "learning_rate": 8.832000000000001e-06, + "loss": 0.1761, + "step": 2208 + }, + { + "epoch": 0.0442, + "grad_norm": 2.0231568813323975, + "learning_rate": 8.84e-06, + "loss": 0.339, + "step": 2210 + }, + { + "epoch": 0.04424, + "grad_norm": 2.0994532108306885, + "learning_rate": 8.848e-06, + "loss": 0.3389, + "step": 2212 + }, + { + "epoch": 0.04428, + "grad_norm": 1.2613872289657593, + "learning_rate": 8.856000000000001e-06, + "loss": 0.2505, + "step": 2214 + }, + { + "epoch": 0.04432, + "grad_norm": 1.7761863470077515, + "learning_rate": 8.864e-06, + "loss": 0.2379, + "step": 2216 + }, + { + "epoch": 0.04436, + "grad_norm": 1.2827118635177612, + "learning_rate": 8.872e-06, + "loss": 0.1931, + "step": 2218 + }, + { + "epoch": 0.0444, + "grad_norm": 1.294471025466919, + "learning_rate": 8.880000000000001e-06, + "loss": 0.1931, + "step": 2220 + }, + { + "epoch": 0.04444, + "grad_norm": 1.2740516662597656, + "learning_rate": 8.888e-06, + "loss": 0.2348, + "step": 2222 + }, + { + "epoch": 0.04448, + "grad_norm": 1.8369877338409424, + "learning_rate": 8.896000000000001e-06, + "loss": 0.3257, + "step": 2224 + }, + { + "epoch": 0.04452, + "grad_norm": 1.2315171957015991, + "learning_rate": 8.904e-06, + "loss": 0.2125, + "step": 2226 + }, + { + "epoch": 0.04456, + "grad_norm": 1.2958788871765137, + "learning_rate": 8.912e-06, + "loss": 0.2328, + "step": 2228 + }, + { + "epoch": 0.0446, + "grad_norm": 1.3517061471939087, + "learning_rate": 8.920000000000001e-06, + "loss": 0.2221, + "step": 2230 + }, + { + "epoch": 0.04464, + "grad_norm": 1.614579677581787, + "learning_rate": 8.928000000000002e-06, + "loss": 0.2563, + "step": 2232 + }, + { + "epoch": 0.04468, + "grad_norm": 1.28358793258667, + "learning_rate": 8.936e-06, + "loss": 0.2026, + "step": 2234 + }, + { + "epoch": 0.04472, + "grad_norm": 1.4942857027053833, + "learning_rate": 8.944000000000001e-06, + "loss": 0.2429, + "step": 2236 + }, + { + "epoch": 0.04476, + "grad_norm": 1.6068791151046753, + "learning_rate": 8.952e-06, + "loss": 0.2442, + "step": 2238 + }, + { + "epoch": 0.0448, + "grad_norm": 1.3397330045700073, + "learning_rate": 8.96e-06, + "loss": 0.2536, + "step": 2240 + }, + { + "epoch": 0.04484, + "grad_norm": 1.3482013940811157, + "learning_rate": 8.968000000000001e-06, + "loss": 0.2321, + "step": 2242 + }, + { + "epoch": 0.04488, + "grad_norm": 1.1926815509796143, + "learning_rate": 8.976e-06, + "loss": 0.2119, + "step": 2244 + }, + { + "epoch": 0.04492, + "grad_norm": 1.2257006168365479, + "learning_rate": 8.984e-06, + "loss": 0.212, + "step": 2246 + }, + { + "epoch": 0.04496, + "grad_norm": 1.3300756216049194, + "learning_rate": 8.992000000000001e-06, + "loss": 0.212, + "step": 2248 + }, + { + "epoch": 0.045, + "grad_norm": 1.2121262550354004, + "learning_rate": 9e-06, + "loss": 0.2233, + "step": 2250 + }, + { + "epoch": 0.04504, + "grad_norm": 1.63218092918396, + "learning_rate": 9.008e-06, + "loss": 0.2764, + "step": 2252 + }, + { + "epoch": 0.04508, + "grad_norm": 1.2625293731689453, + "learning_rate": 9.016e-06, + "loss": 0.193, + "step": 2254 + }, + { + "epoch": 0.04512, + "grad_norm": 1.4238498210906982, + "learning_rate": 9.024e-06, + "loss": 0.2233, + "step": 2256 + }, + { + "epoch": 0.04516, + "grad_norm": 1.5470541715621948, + "learning_rate": 9.032000000000001e-06, + "loss": 0.2145, + "step": 2258 + }, + { + "epoch": 0.0452, + "grad_norm": 1.2758153676986694, + "learning_rate": 9.040000000000002e-06, + "loss": 0.193, + "step": 2260 + }, + { + "epoch": 0.04524, + "grad_norm": 1.3257676362991333, + "learning_rate": 9.048e-06, + "loss": 0.2025, + "step": 2262 + }, + { + "epoch": 0.04528, + "grad_norm": 1.587445855140686, + "learning_rate": 9.056000000000001e-06, + "loss": 0.3167, + "step": 2264 + }, + { + "epoch": 0.04532, + "grad_norm": 1.1935415267944336, + "learning_rate": 9.064e-06, + "loss": 0.2379, + "step": 2266 + }, + { + "epoch": 0.04536, + "grad_norm": 1.5440425872802734, + "learning_rate": 9.072e-06, + "loss": 0.2064, + "step": 2268 + }, + { + "epoch": 0.0454, + "grad_norm": 1.674289584159851, + "learning_rate": 9.080000000000001e-06, + "loss": 0.2763, + "step": 2270 + }, + { + "epoch": 0.04544, + "grad_norm": 1.6817419528961182, + "learning_rate": 9.088000000000002e-06, + "loss": 0.277, + "step": 2272 + }, + { + "epoch": 0.04548, + "grad_norm": 1.6302658319473267, + "learning_rate": 9.096e-06, + "loss": 0.2542, + "step": 2274 + }, + { + "epoch": 0.04552, + "grad_norm": 1.3676789999008179, + "learning_rate": 9.104000000000001e-06, + "loss": 0.222, + "step": 2276 + }, + { + "epoch": 0.04556, + "grad_norm": 1.3951865434646606, + "learning_rate": 9.112e-06, + "loss": 0.2023, + "step": 2278 + }, + { + "epoch": 0.0456, + "grad_norm": 1.8051596879959106, + "learning_rate": 9.12e-06, + "loss": 0.313, + "step": 2280 + }, + { + "epoch": 0.04564, + "grad_norm": 1.0556260347366333, + "learning_rate": 9.128e-06, + "loss": 0.1298, + "step": 2282 + }, + { + "epoch": 0.04568, + "grad_norm": 1.7956022024154663, + "learning_rate": 9.136e-06, + "loss": 0.3389, + "step": 2284 + }, + { + "epoch": 0.04572, + "grad_norm": 1.1183308362960815, + "learning_rate": 9.144000000000001e-06, + "loss": 0.1368, + "step": 2286 + }, + { + "epoch": 0.04576, + "grad_norm": 1.0486716032028198, + "learning_rate": 9.152000000000001e-06, + "loss": 0.2409, + "step": 2288 + }, + { + "epoch": 0.0458, + "grad_norm": 2.0015978813171387, + "learning_rate": 9.16e-06, + "loss": 0.2213, + "step": 2290 + }, + { + "epoch": 0.04584, + "grad_norm": 2.221010446548462, + "learning_rate": 9.168000000000001e-06, + "loss": 0.2548, + "step": 2292 + }, + { + "epoch": 0.04588, + "grad_norm": 1.462904930114746, + "learning_rate": 9.176e-06, + "loss": 0.2726, + "step": 2294 + }, + { + "epoch": 0.04592, + "grad_norm": 1.20868718624115, + "learning_rate": 9.184e-06, + "loss": 0.1444, + "step": 2296 + }, + { + "epoch": 0.04596, + "grad_norm": 2.1120851039886475, + "learning_rate": 9.192000000000001e-06, + "loss": 0.2277, + "step": 2298 + }, + { + "epoch": 0.046, + "grad_norm": 1.0792618989944458, + "learning_rate": 9.200000000000002e-06, + "loss": 0.2694, + "step": 2300 + }, + { + "epoch": 0.04604, + "grad_norm": 1.8868327140808105, + "learning_rate": 9.208e-06, + "loss": 0.2505, + "step": 2302 + }, + { + "epoch": 0.04608, + "grad_norm": 2.093108654022217, + "learning_rate": 9.216000000000001e-06, + "loss": 0.3528, + "step": 2304 + }, + { + "epoch": 0.04612, + "grad_norm": 1.2204257249832153, + "learning_rate": 9.224e-06, + "loss": 0.2506, + "step": 2306 + }, + { + "epoch": 0.04616, + "grad_norm": 1.3074686527252197, + "learning_rate": 9.232e-06, + "loss": 0.2542, + "step": 2308 + }, + { + "epoch": 0.0462, + "grad_norm": 1.3753069639205933, + "learning_rate": 9.240000000000001e-06, + "loss": 0.2327, + "step": 2310 + }, + { + "epoch": 0.04624, + "grad_norm": 1.2607896327972412, + "learning_rate": 9.248e-06, + "loss": 0.2026, + "step": 2312 + }, + { + "epoch": 0.04628, + "grad_norm": 1.2030270099639893, + "learning_rate": 9.256e-06, + "loss": 0.1932, + "step": 2314 + }, + { + "epoch": 0.04632, + "grad_norm": 1.300345540046692, + "learning_rate": 9.264000000000001e-06, + "loss": 0.1844, + "step": 2316 + }, + { + "epoch": 0.04636, + "grad_norm": 1.0507686138153076, + "learning_rate": 9.272e-06, + "loss": 0.1517, + "step": 2318 + }, + { + "epoch": 0.0464, + "grad_norm": 0.7453618049621582, + "learning_rate": 9.280000000000001e-06, + "loss": 0.2936, + "step": 2320 + }, + { + "epoch": 0.04644, + "grad_norm": 0.6527708768844604, + "learning_rate": 9.288e-06, + "loss": 0.3057, + "step": 2322 + }, + { + "epoch": 0.04648, + "grad_norm": 2.399836301803589, + "learning_rate": 9.296e-06, + "loss": 0.5024, + "step": 2324 + }, + { + "epoch": 0.04652, + "grad_norm": 2.0234410762786865, + "learning_rate": 9.304000000000001e-06, + "loss": 0.4098, + "step": 2326 + }, + { + "epoch": 0.04656, + "grad_norm": 1.7317290306091309, + "learning_rate": 9.312000000000002e-06, + "loss": 0.2281, + "step": 2328 + }, + { + "epoch": 0.0466, + "grad_norm": 1.7110791206359863, + "learning_rate": 9.32e-06, + "loss": 0.2352, + "step": 2330 + }, + { + "epoch": 0.04664, + "grad_norm": 0.9843701720237732, + "learning_rate": 9.328000000000001e-06, + "loss": 0.251, + "step": 2332 + }, + { + "epoch": 0.04668, + "grad_norm": 1.4435436725616455, + "learning_rate": 9.336e-06, + "loss": 0.2182, + "step": 2334 + }, + { + "epoch": 0.04672, + "grad_norm": 1.3640998601913452, + "learning_rate": 9.344e-06, + "loss": 0.2773, + "step": 2336 + }, + { + "epoch": 0.04676, + "grad_norm": 1.146399974822998, + "learning_rate": 9.352000000000001e-06, + "loss": 0.2027, + "step": 2338 + }, + { + "epoch": 0.0468, + "grad_norm": 1.03097403049469, + "learning_rate": 9.360000000000002e-06, + "loss": 0.1762, + "step": 2340 + }, + { + "epoch": 0.04684, + "grad_norm": 1.0897189378738403, + "learning_rate": 9.368e-06, + "loss": 0.1761, + "step": 2342 + }, + { + "epoch": 0.04688, + "grad_norm": 1.0022835731506348, + "learning_rate": 9.376000000000001e-06, + "loss": 0.2412, + "step": 2344 + }, + { + "epoch": 0.04692, + "grad_norm": 0.7966433167457581, + "learning_rate": 9.384e-06, + "loss": 0.2668, + "step": 2346 + }, + { + "epoch": 0.04696, + "grad_norm": 2.623792886734009, + "learning_rate": 9.392000000000001e-06, + "loss": 0.2876, + "step": 2348 + }, + { + "epoch": 0.047, + "grad_norm": 2.6036291122436523, + "learning_rate": 9.4e-06, + "loss": 0.298, + "step": 2350 + }, + { + "epoch": 0.04704, + "grad_norm": 2.4814701080322266, + "learning_rate": 9.408e-06, + "loss": 0.2722, + "step": 2352 + }, + { + "epoch": 0.04708, + "grad_norm": 0.9838671684265137, + "learning_rate": 9.416000000000001e-06, + "loss": 0.2843, + "step": 2354 + }, + { + "epoch": 0.04712, + "grad_norm": 0.9895330667495728, + "learning_rate": 9.424000000000002e-06, + "loss": 0.2553, + "step": 2356 + }, + { + "epoch": 0.04716, + "grad_norm": 1.818176507949829, + "learning_rate": 9.432e-06, + "loss": 0.3392, + "step": 2358 + }, + { + "epoch": 0.0472, + "grad_norm": 1.1727168560028076, + "learning_rate": 9.440000000000001e-06, + "loss": 0.2382, + "step": 2360 + }, + { + "epoch": 0.04724, + "grad_norm": 1.1874768733978271, + "learning_rate": 9.448e-06, + "loss": 0.1758, + "step": 2362 + }, + { + "epoch": 0.04728, + "grad_norm": 1.5837658643722534, + "learning_rate": 9.456e-06, + "loss": 0.2261, + "step": 2364 + }, + { + "epoch": 0.04732, + "grad_norm": 1.3779855966567993, + "learning_rate": 9.464000000000001e-06, + "loss": 0.2041, + "step": 2366 + }, + { + "epoch": 0.04736, + "grad_norm": 1.4119915962219238, + "learning_rate": 9.472000000000002e-06, + "loss": 0.2128, + "step": 2368 + }, + { + "epoch": 0.0474, + "grad_norm": 1.2224630117416382, + "learning_rate": 9.48e-06, + "loss": 0.2129, + "step": 2370 + }, + { + "epoch": 0.04744, + "grad_norm": 1.512647032737732, + "learning_rate": 9.488000000000001e-06, + "loss": 0.2438, + "step": 2372 + }, + { + "epoch": 0.04748, + "grad_norm": 1.1856619119644165, + "learning_rate": 9.496e-06, + "loss": 0.235, + "step": 2374 + }, + { + "epoch": 0.04752, + "grad_norm": 1.1422007083892822, + "learning_rate": 9.504e-06, + "loss": 0.2147, + "step": 2376 + }, + { + "epoch": 0.04756, + "grad_norm": 1.4668843746185303, + "learning_rate": 9.512000000000001e-06, + "loss": 0.2772, + "step": 2378 + }, + { + "epoch": 0.0476, + "grad_norm": 1.2234495878219604, + "learning_rate": 9.52e-06, + "loss": 0.2027, + "step": 2380 + }, + { + "epoch": 0.04764, + "grad_norm": 1.3459317684173584, + "learning_rate": 9.528000000000001e-06, + "loss": 0.233, + "step": 2382 + }, + { + "epoch": 0.04768, + "grad_norm": 1.5616450309753418, + "learning_rate": 9.536000000000002e-06, + "loss": 0.2539, + "step": 2384 + }, + { + "epoch": 0.04772, + "grad_norm": 1.2577755451202393, + "learning_rate": 9.544e-06, + "loss": 0.2148, + "step": 2386 + }, + { + "epoch": 0.04776, + "grad_norm": 1.8216487169265747, + "learning_rate": 9.552000000000001e-06, + "loss": 0.2385, + "step": 2388 + }, + { + "epoch": 0.0478, + "grad_norm": 2.3107144832611084, + "learning_rate": 9.56e-06, + "loss": 0.2598, + "step": 2390 + }, + { + "epoch": 0.04784, + "grad_norm": 1.319757103919983, + "learning_rate": 9.568e-06, + "loss": 0.1531, + "step": 2392 + }, + { + "epoch": 0.04788, + "grad_norm": 2.271888017654419, + "learning_rate": 9.576000000000001e-06, + "loss": 0.2486, + "step": 2394 + }, + { + "epoch": 0.04792, + "grad_norm": 2.684882879257202, + "learning_rate": 9.584000000000002e-06, + "loss": 0.4394, + "step": 2396 + }, + { + "epoch": 0.04796, + "grad_norm": 1.220268964767456, + "learning_rate": 9.592e-06, + "loss": 0.1327, + "step": 2398 + }, + { + "epoch": 0.048, + "grad_norm": 2.0344033241271973, + "learning_rate": 9.600000000000001e-06, + "loss": 0.326, + "step": 2400 + }, + { + "epoch": 0.04804, + "grad_norm": 1.891920566558838, + "learning_rate": 9.608e-06, + "loss": 0.3139, + "step": 2402 + }, + { + "epoch": 0.04808, + "grad_norm": 1.4556076526641846, + "learning_rate": 9.616e-06, + "loss": 0.2652, + "step": 2404 + }, + { + "epoch": 0.04812, + "grad_norm": 1.335777759552002, + "learning_rate": 9.624000000000001e-06, + "loss": 0.2221, + "step": 2406 + }, + { + "epoch": 0.04816, + "grad_norm": 1.409804105758667, + "learning_rate": 9.632e-06, + "loss": 0.2544, + "step": 2408 + }, + { + "epoch": 0.0482, + "grad_norm": 1.0016233921051025, + "learning_rate": 9.640000000000001e-06, + "loss": 0.2469, + "step": 2410 + }, + { + "epoch": 0.04824, + "grad_norm": 1.0031496286392212, + "learning_rate": 9.648000000000001e-06, + "loss": 0.1688, + "step": 2412 + }, + { + "epoch": 0.04828, + "grad_norm": 0.864292323589325, + "learning_rate": 9.656e-06, + "loss": 0.2222, + "step": 2414 + }, + { + "epoch": 0.04832, + "grad_norm": 0.9232346415519714, + "learning_rate": 9.664000000000001e-06, + "loss": 0.1674, + "step": 2416 + }, + { + "epoch": 0.04836, + "grad_norm": 1.6327571868896484, + "learning_rate": 9.672e-06, + "loss": 0.3264, + "step": 2418 + }, + { + "epoch": 0.0484, + "grad_norm": 0.8270096778869629, + "learning_rate": 9.68e-06, + "loss": 0.2349, + "step": 2420 + }, + { + "epoch": 0.04844, + "grad_norm": 0.958274245262146, + "learning_rate": 9.688000000000001e-06, + "loss": 0.2425, + "step": 2422 + }, + { + "epoch": 0.04848, + "grad_norm": 1.5311020612716675, + "learning_rate": 9.696000000000002e-06, + "loss": 0.3132, + "step": 2424 + }, + { + "epoch": 0.04852, + "grad_norm": 1.0970197916030884, + "learning_rate": 9.704e-06, + "loss": 0.1763, + "step": 2426 + }, + { + "epoch": 0.04856, + "grad_norm": 1.3016937971115112, + "learning_rate": 9.712e-06, + "loss": 0.2065, + "step": 2428 + }, + { + "epoch": 0.0486, + "grad_norm": 1.453688621520996, + "learning_rate": 9.72e-06, + "loss": 0.2564, + "step": 2430 + }, + { + "epoch": 0.04864, + "grad_norm": 1.0107396841049194, + "learning_rate": 9.728e-06, + "loss": 0.1938, + "step": 2432 + }, + { + "epoch": 0.04868, + "grad_norm": 1.2885407209396362, + "learning_rate": 9.736000000000001e-06, + "loss": 0.2653, + "step": 2434 + }, + { + "epoch": 0.04872, + "grad_norm": 1.084210753440857, + "learning_rate": 9.744000000000002e-06, + "loss": 0.2127, + "step": 2436 + }, + { + "epoch": 0.04876, + "grad_norm": 1.1274440288543701, + "learning_rate": 9.752e-06, + "loss": 0.2128, + "step": 2438 + }, + { + "epoch": 0.0488, + "grad_norm": 1.1949464082717896, + "learning_rate": 9.760000000000001e-06, + "loss": 0.233, + "step": 2440 + }, + { + "epoch": 0.04884, + "grad_norm": 1.3378957509994507, + "learning_rate": 9.768e-06, + "loss": 0.233, + "step": 2442 + }, + { + "epoch": 0.04888, + "grad_norm": 1.3556468486785889, + "learning_rate": 9.776000000000001e-06, + "loss": 0.2653, + "step": 2444 + }, + { + "epoch": 0.04892, + "grad_norm": 1.2243198156356812, + "learning_rate": 9.784000000000002e-06, + "loss": 0.2123, + "step": 2446 + }, + { + "epoch": 0.04896, + "grad_norm": 1.4487311840057373, + "learning_rate": 9.792e-06, + "loss": 0.2539, + "step": 2448 + }, + { + "epoch": 0.049, + "grad_norm": 1.5623563528060913, + "learning_rate": 9.800000000000001e-06, + "loss": 0.2351, + "step": 2450 + }, + { + "epoch": 0.04904, + "grad_norm": 1.5044504404067993, + "learning_rate": 9.808000000000002e-06, + "loss": 0.2767, + "step": 2452 + }, + { + "epoch": 0.04908, + "grad_norm": 1.5055433511734009, + "learning_rate": 9.816e-06, + "loss": 0.235, + "step": 2454 + }, + { + "epoch": 0.04912, + "grad_norm": 1.0858556032180786, + "learning_rate": 9.824000000000001e-06, + "loss": 0.2041, + "step": 2456 + }, + { + "epoch": 0.04916, + "grad_norm": 1.46283757686615, + "learning_rate": 9.832e-06, + "loss": 0.233, + "step": 2458 + }, + { + "epoch": 0.0492, + "grad_norm": 1.4156924486160278, + "learning_rate": 9.84e-06, + "loss": 0.2431, + "step": 2460 + }, + { + "epoch": 0.04924, + "grad_norm": 1.2947237491607666, + "learning_rate": 9.848000000000001e-06, + "loss": 0.2323, + "step": 2462 + }, + { + "epoch": 0.04928, + "grad_norm": 1.328177809715271, + "learning_rate": 9.856000000000002e-06, + "loss": 0.2324, + "step": 2464 + }, + { + "epoch": 0.04932, + "grad_norm": 1.2818340063095093, + "learning_rate": 9.864e-06, + "loss": 0.2432, + "step": 2466 + }, + { + "epoch": 0.04936, + "grad_norm": 1.3000508546829224, + "learning_rate": 9.872e-06, + "loss": 0.2129, + "step": 2468 + }, + { + "epoch": 0.0494, + "grad_norm": 1.2526321411132812, + "learning_rate": 9.88e-06, + "loss": 0.2222, + "step": 2470 + }, + { + "epoch": 0.04944, + "grad_norm": 1.6050446033477783, + "learning_rate": 9.888000000000001e-06, + "loss": 0.2772, + "step": 2472 + }, + { + "epoch": 0.04948, + "grad_norm": 1.114122986793518, + "learning_rate": 9.896000000000001e-06, + "loss": 0.2039, + "step": 2474 + }, + { + "epoch": 0.04952, + "grad_norm": 1.1602824926376343, + "learning_rate": 9.904e-06, + "loss": 0.2349, + "step": 2476 + }, + { + "epoch": 0.04956, + "grad_norm": 1.7067323923110962, + "learning_rate": 9.912000000000001e-06, + "loss": 0.2383, + "step": 2478 + }, + { + "epoch": 0.0496, + "grad_norm": 1.495561122894287, + "learning_rate": 9.920000000000002e-06, + "loss": 0.2537, + "step": 2480 + }, + { + "epoch": 0.04964, + "grad_norm": 1.2854578495025635, + "learning_rate": 9.928e-06, + "loss": 0.2324, + "step": 2482 + }, + { + "epoch": 0.04968, + "grad_norm": 1.3153655529022217, + "learning_rate": 9.936000000000001e-06, + "loss": 0.243, + "step": 2484 + }, + { + "epoch": 0.04972, + "grad_norm": 1.2449151277542114, + "learning_rate": 9.944e-06, + "loss": 0.2221, + "step": 2486 + }, + { + "epoch": 0.04976, + "grad_norm": 1.3584405183792114, + "learning_rate": 9.952e-06, + "loss": 0.233, + "step": 2488 + }, + { + "epoch": 0.0498, + "grad_norm": 1.5856982469558716, + "learning_rate": 9.960000000000001e-06, + "loss": 0.2378, + "step": 2490 + }, + { + "epoch": 0.04984, + "grad_norm": 1.0117071866989136, + "learning_rate": 9.968000000000002e-06, + "loss": 0.1672, + "step": 2492 + }, + { + "epoch": 0.04988, + "grad_norm": 1.8090955018997192, + "learning_rate": 9.976e-06, + "loss": 0.3388, + "step": 2494 + }, + { + "epoch": 0.04992, + "grad_norm": 0.9474315643310547, + "learning_rate": 9.984e-06, + "loss": 0.2222, + "step": 2496 + }, + { + "epoch": 0.04996, + "grad_norm": 1.6639795303344727, + "learning_rate": 9.992e-06, + "loss": 0.2348, + "step": 2498 + }, + { + "epoch": 0.05, + "grad_norm": 1.0115216970443726, + "learning_rate": 1e-05, + "loss": 0.1672, + "step": 2500 + }, + { + "epoch": 0.05004, + "grad_norm": 1.52597177028656, + "learning_rate": 1.0008e-05, + "loss": 0.3011, + "step": 2502 + }, + { + "epoch": 0.05008, + "grad_norm": 1.008470058441162, + "learning_rate": 1.0016000000000002e-05, + "loss": 0.1672, + "step": 2504 + }, + { + "epoch": 0.05012, + "grad_norm": 0.9257493019104004, + "learning_rate": 1.0024000000000001e-05, + "loss": 0.1515, + "step": 2506 + }, + { + "epoch": 0.05016, + "grad_norm": 1.4720537662506104, + "learning_rate": 1.0032000000000002e-05, + "loss": 0.3281, + "step": 2508 + }, + { + "epoch": 0.0502, + "grad_norm": 0.9952502250671387, + "learning_rate": 1.004e-05, + "loss": 0.1597, + "step": 2510 + }, + { + "epoch": 0.05024, + "grad_norm": 0.9728571176528931, + "learning_rate": 1.0048e-05, + "loss": 0.2221, + "step": 2512 + }, + { + "epoch": 0.05028, + "grad_norm": 1.6420345306396484, + "learning_rate": 1.0056000000000002e-05, + "loss": 0.2348, + "step": 2514 + }, + { + "epoch": 0.05032, + "grad_norm": 0.9451438188552856, + "learning_rate": 1.0064e-05, + "loss": 0.2062, + "step": 2516 + }, + { + "epoch": 0.05036, + "grad_norm": 1.5427919626235962, + "learning_rate": 1.0072000000000001e-05, + "loss": 0.226, + "step": 2518 + }, + { + "epoch": 0.0504, + "grad_norm": 1.0015432834625244, + "learning_rate": 1.008e-05, + "loss": 0.1514, + "step": 2520 + }, + { + "epoch": 0.05044, + "grad_norm": 0.965495228767395, + "learning_rate": 1.0088e-05, + "loss": 0.1596, + "step": 2522 + }, + { + "epoch": 0.05048, + "grad_norm": 1.9190587997436523, + "learning_rate": 1.0096000000000001e-05, + "loss": 0.2775, + "step": 2524 + }, + { + "epoch": 0.05052, + "grad_norm": 1.6993926763534546, + "learning_rate": 1.0104e-05, + "loss": 0.2883, + "step": 2526 + }, + { + "epoch": 0.05056, + "grad_norm": 0.9749553799629211, + "learning_rate": 1.0112000000000002e-05, + "loss": 0.1367, + "step": 2528 + }, + { + "epoch": 0.0506, + "grad_norm": 1.8548623323440552, + "learning_rate": 1.0120000000000001e-05, + "loss": 0.248, + "step": 2530 + }, + { + "epoch": 0.05064, + "grad_norm": 1.6875228881835938, + "learning_rate": 1.0128e-05, + "loss": 0.313, + "step": 2532 + }, + { + "epoch": 0.05068, + "grad_norm": 1.2072083950042725, + "learning_rate": 1.0136000000000001e-05, + "loss": 0.2506, + "step": 2534 + }, + { + "epoch": 0.05072, + "grad_norm": 1.3677778244018555, + "learning_rate": 1.0144e-05, + "loss": 0.1956, + "step": 2536 + }, + { + "epoch": 0.05076, + "grad_norm": 1.4899791479110718, + "learning_rate": 1.0152000000000002e-05, + "loss": 0.265, + "step": 2538 + }, + { + "epoch": 0.0508, + "grad_norm": 1.563876748085022, + "learning_rate": 1.0160000000000001e-05, + "loss": 0.2537, + "step": 2540 + }, + { + "epoch": 0.05084, + "grad_norm": 1.232150912284851, + "learning_rate": 1.0168e-05, + "loss": 0.2026, + "step": 2542 + }, + { + "epoch": 0.05088, + "grad_norm": 1.398077130317688, + "learning_rate": 1.0176000000000002e-05, + "loss": 0.2221, + "step": 2544 + }, + { + "epoch": 0.05092, + "grad_norm": 1.0996835231781006, + "learning_rate": 1.0184000000000001e-05, + "loss": 0.1674, + "step": 2546 + }, + { + "epoch": 0.05096, + "grad_norm": 1.0280756950378418, + "learning_rate": 1.0192000000000002e-05, + "loss": 0.2414, + "step": 2548 + }, + { + "epoch": 0.051, + "grad_norm": 2.5367836952209473, + "learning_rate": 1.02e-05, + "loss": 0.2782, + "step": 2550 + }, + { + "epoch": 0.05104, + "grad_norm": 0.8659659028053284, + "learning_rate": 1.0208e-05, + "loss": 0.1062, + "step": 2552 + }, + { + "epoch": 0.05108, + "grad_norm": 0.800336480140686, + "learning_rate": 1.0216000000000002e-05, + "loss": 0.0902, + "step": 2554 + }, + { + "epoch": 0.05112, + "grad_norm": 2.940565347671509, + "learning_rate": 1.0224e-05, + "loss": 0.5346, + "step": 2556 + }, + { + "epoch": 0.05116, + "grad_norm": 0.7334181666374207, + "learning_rate": 1.0232000000000001e-05, + "loss": 0.0726, + "step": 2558 + }, + { + "epoch": 0.0512, + "grad_norm": 0.8884013891220093, + "learning_rate": 1.024e-05, + "loss": 0.3371, + "step": 2560 + }, + { + "epoch": 0.05124, + "grad_norm": 2.426426410675049, + "learning_rate": 1.0248e-05, + "loss": 0.2672, + "step": 2562 + }, + { + "epoch": 0.05128, + "grad_norm": 0.9060508608818054, + "learning_rate": 1.0256000000000001e-05, + "loss": 0.2878, + "step": 2564 + }, + { + "epoch": 0.05132, + "grad_norm": 0.8333362340927124, + "learning_rate": 1.0264e-05, + "loss": 0.2575, + "step": 2566 + }, + { + "epoch": 0.05136, + "grad_norm": 1.1868318319320679, + "learning_rate": 1.0272e-05, + "loss": 0.2508, + "step": 2568 + }, + { + "epoch": 0.0514, + "grad_norm": 1.5913376808166504, + "learning_rate": 1.0280000000000002e-05, + "loss": 0.3405, + "step": 2570 + }, + { + "epoch": 0.05144, + "grad_norm": 1.283983588218689, + "learning_rate": 1.0288e-05, + "loss": 0.2793, + "step": 2572 + }, + { + "epoch": 0.05148, + "grad_norm": 1.1438534259796143, + "learning_rate": 1.0296000000000001e-05, + "loss": 0.2122, + "step": 2574 + }, + { + "epoch": 0.05152, + "grad_norm": 1.1841778755187988, + "learning_rate": 1.0304e-05, + "loss": 0.2122, + "step": 2576 + }, + { + "epoch": 0.05156, + "grad_norm": 1.1852178573608398, + "learning_rate": 1.0312e-05, + "loss": 0.2324, + "step": 2578 + }, + { + "epoch": 0.0516, + "grad_norm": 1.3252484798431396, + "learning_rate": 1.0320000000000001e-05, + "loss": 0.2445, + "step": 2580 + }, + { + "epoch": 0.05164, + "grad_norm": 0.9338566064834595, + "learning_rate": 1.0328e-05, + "loss": 0.1759, + "step": 2582 + }, + { + "epoch": 0.05168, + "grad_norm": 1.2070322036743164, + "learning_rate": 1.0336000000000002e-05, + "loss": 0.2148, + "step": 2584 + }, + { + "epoch": 0.05172, + "grad_norm": 1.015078067779541, + "learning_rate": 1.0344000000000001e-05, + "loss": 0.247, + "step": 2586 + }, + { + "epoch": 0.05176, + "grad_norm": 0.8453197479248047, + "learning_rate": 1.0352e-05, + "loss": 0.1676, + "step": 2588 + }, + { + "epoch": 0.0518, + "grad_norm": 1.4400628805160522, + "learning_rate": 1.036e-05, + "loss": 0.3132, + "step": 2590 + }, + { + "epoch": 0.05184, + "grad_norm": 0.8220688104629517, + "learning_rate": 1.0368e-05, + "loss": 0.2225, + "step": 2592 + }, + { + "epoch": 0.05188, + "grad_norm": 1.0115838050842285, + "learning_rate": 1.0376000000000002e-05, + "loss": 0.1848, + "step": 2594 + }, + { + "epoch": 0.05192, + "grad_norm": 1.3758116960525513, + "learning_rate": 1.0384000000000001e-05, + "loss": 0.2886, + "step": 2596 + }, + { + "epoch": 0.05196, + "grad_norm": 1.3006134033203125, + "learning_rate": 1.0392e-05, + "loss": 0.2262, + "step": 2598 + }, + { + "epoch": 0.052, + "grad_norm": 1.4322359561920166, + "learning_rate": 1.04e-05, + "loss": 0.2426, + "step": 2600 + }, + { + "epoch": 0.05204, + "grad_norm": 0.8334761261940002, + "learning_rate": 1.0408000000000001e-05, + "loss": 0.2224, + "step": 2602 + }, + { + "epoch": 0.05208, + "grad_norm": 1.3403642177581787, + "learning_rate": 1.0416000000000002e-05, + "loss": 0.2544, + "step": 2604 + }, + { + "epoch": 0.05212, + "grad_norm": 1.033403754234314, + "learning_rate": 1.0424e-05, + "loss": 0.273, + "step": 2606 + }, + { + "epoch": 0.05216, + "grad_norm": 1.2585885524749756, + "learning_rate": 1.0432e-05, + "loss": 0.2147, + "step": 2608 + }, + { + "epoch": 0.0522, + "grad_norm": 1.1550790071487427, + "learning_rate": 1.0440000000000002e-05, + "loss": 0.222, + "step": 2610 + }, + { + "epoch": 0.05224, + "grad_norm": 1.0420974493026733, + "learning_rate": 1.0448e-05, + "loss": 0.2442, + "step": 2612 + }, + { + "epoch": 0.05228, + "grad_norm": 1.193349003791809, + "learning_rate": 1.0456000000000001e-05, + "loss": 0.2222, + "step": 2614 + }, + { + "epoch": 0.05232, + "grad_norm": 1.3331019878387451, + "learning_rate": 1.0464e-05, + "loss": 0.265, + "step": 2616 + }, + { + "epoch": 0.05236, + "grad_norm": 1.112722396850586, + "learning_rate": 1.0472e-05, + "loss": 0.1856, + "step": 2618 + }, + { + "epoch": 0.0524, + "grad_norm": 1.1322976350784302, + "learning_rate": 1.0480000000000001e-05, + "loss": 0.2025, + "step": 2620 + }, + { + "epoch": 0.05244, + "grad_norm": 0.9957426190376282, + "learning_rate": 1.0488e-05, + "loss": 0.2507, + "step": 2622 + }, + { + "epoch": 0.05248, + "grad_norm": 1.167366623878479, + "learning_rate": 1.0496000000000003e-05, + "loss": 0.2432, + "step": 2624 + }, + { + "epoch": 0.05252, + "grad_norm": 1.3111414909362793, + "learning_rate": 1.0504000000000001e-05, + "loss": 0.2064, + "step": 2626 + }, + { + "epoch": 0.05256, + "grad_norm": 0.9701148867607117, + "learning_rate": 1.0512e-05, + "loss": 0.2296, + "step": 2628 + }, + { + "epoch": 0.0526, + "grad_norm": 1.5286060571670532, + "learning_rate": 1.0520000000000001e-05, + "loss": 0.2885, + "step": 2630 + }, + { + "epoch": 0.05264, + "grad_norm": 1.1454706192016602, + "learning_rate": 1.0528e-05, + "loss": 0.2026, + "step": 2632 + }, + { + "epoch": 0.05268, + "grad_norm": 1.4266670942306519, + "learning_rate": 1.0536000000000002e-05, + "loss": 0.2347, + "step": 2634 + }, + { + "epoch": 0.05272, + "grad_norm": 1.1909645795822144, + "learning_rate": 1.0544000000000001e-05, + "loss": 0.2442, + "step": 2636 + }, + { + "epoch": 0.05276, + "grad_norm": 1.4902628660202026, + "learning_rate": 1.0552e-05, + "loss": 0.2379, + "step": 2638 + }, + { + "epoch": 0.0528, + "grad_norm": 1.0686455965042114, + "learning_rate": 1.056e-05, + "loss": 0.2467, + "step": 2640 + }, + { + "epoch": 0.05284, + "grad_norm": 1.4113065004348755, + "learning_rate": 1.0568000000000001e-05, + "loss": 0.265, + "step": 2642 + }, + { + "epoch": 0.05288, + "grad_norm": 1.073358416557312, + "learning_rate": 1.0576000000000002e-05, + "loss": 0.193, + "step": 2644 + }, + { + "epoch": 0.05292, + "grad_norm": 1.3206804990768433, + "learning_rate": 1.0584e-05, + "loss": 0.2329, + "step": 2646 + }, + { + "epoch": 0.05296, + "grad_norm": 1.4123510122299194, + "learning_rate": 1.0592e-05, + "loss": 0.265, + "step": 2648 + }, + { + "epoch": 0.053, + "grad_norm": 1.155684471130371, + "learning_rate": 1.0600000000000002e-05, + "loss": 0.2221, + "step": 2650 + }, + { + "epoch": 0.05304, + "grad_norm": 1.2454018592834473, + "learning_rate": 1.0608e-05, + "loss": 0.232, + "step": 2652 + }, + { + "epoch": 0.05308, + "grad_norm": 1.1291812658309937, + "learning_rate": 1.0616000000000001e-05, + "loss": 0.2221, + "step": 2654 + }, + { + "epoch": 0.05312, + "grad_norm": 1.2452499866485596, + "learning_rate": 1.0624e-05, + "loss": 0.1956, + "step": 2656 + }, + { + "epoch": 0.05316, + "grad_norm": 1.1616454124450684, + "learning_rate": 1.0632000000000001e-05, + "loss": 0.2442, + "step": 2658 + }, + { + "epoch": 0.0532, + "grad_norm": 1.2809799909591675, + "learning_rate": 1.0640000000000001e-05, + "loss": 0.2542, + "step": 2660 + }, + { + "epoch": 0.05324, + "grad_norm": 1.5176533460617065, + "learning_rate": 1.0648e-05, + "loss": 0.2771, + "step": 2662 + }, + { + "epoch": 0.05328, + "grad_norm": 1.237057089805603, + "learning_rate": 1.0656000000000003e-05, + "loss": 0.232, + "step": 2664 + }, + { + "epoch": 0.05332, + "grad_norm": 1.2704704999923706, + "learning_rate": 1.0664000000000002e-05, + "loss": 0.2038, + "step": 2666 + }, + { + "epoch": 0.05336, + "grad_norm": 1.5312739610671997, + "learning_rate": 1.0672e-05, + "loss": 0.2769, + "step": 2668 + }, + { + "epoch": 0.0534, + "grad_norm": 1.5924254655838013, + "learning_rate": 1.0680000000000001e-05, + "loss": 0.3009, + "step": 2670 + }, + { + "epoch": 0.05344, + "grad_norm": 1.0451985597610474, + "learning_rate": 1.0688e-05, + "loss": 0.2127, + "step": 2672 + }, + { + "epoch": 0.05348, + "grad_norm": 1.0582048892974854, + "learning_rate": 1.0696000000000002e-05, + "loss": 0.2233, + "step": 2674 + }, + { + "epoch": 0.05352, + "grad_norm": 1.2866793870925903, + "learning_rate": 1.0704000000000001e-05, + "loss": 0.2428, + "step": 2676 + }, + { + "epoch": 0.05356, + "grad_norm": 1.0446733236312866, + "learning_rate": 1.0712e-05, + "loss": 0.1932, + "step": 2678 + }, + { + "epoch": 0.0536, + "grad_norm": 1.7246489524841309, + "learning_rate": 1.072e-05, + "loss": 0.3403, + "step": 2680 + }, + { + "epoch": 0.05364, + "grad_norm": 0.9219846725463867, + "learning_rate": 1.0728000000000001e-05, + "loss": 0.2427, + "step": 2682 + }, + { + "epoch": 0.05368, + "grad_norm": 1.3393436670303345, + "learning_rate": 1.0736000000000002e-05, + "loss": 0.2103, + "step": 2684 + }, + { + "epoch": 0.05372, + "grad_norm": 1.3890851736068726, + "learning_rate": 1.0744e-05, + "loss": 0.226, + "step": 2686 + }, + { + "epoch": 0.05376, + "grad_norm": 0.8482733964920044, + "learning_rate": 1.0752e-05, + "loss": 0.1674, + "step": 2688 + }, + { + "epoch": 0.0538, + "grad_norm": 1.2552354335784912, + "learning_rate": 1.0760000000000002e-05, + "loss": 0.2663, + "step": 2690 + }, + { + "epoch": 0.05384, + "grad_norm": 0.9857730269432068, + "learning_rate": 1.0768000000000001e-05, + "loss": 0.2382, + "step": 2692 + }, + { + "epoch": 0.05388, + "grad_norm": 0.9323517084121704, + "learning_rate": 1.0776e-05, + "loss": 0.1592, + "step": 2694 + }, + { + "epoch": 0.05392, + "grad_norm": 0.9456663727760315, + "learning_rate": 1.0784e-05, + "loss": 0.1754, + "step": 2696 + }, + { + "epoch": 0.05396, + "grad_norm": 0.9055673480033875, + "learning_rate": 1.0792000000000001e-05, + "loss": 0.235, + "step": 2698 + }, + { + "epoch": 0.054, + "grad_norm": 0.8254029154777527, + "learning_rate": 1.0800000000000002e-05, + "loss": 0.2279, + "step": 2700 + }, + { + "epoch": 0.05404, + "grad_norm": 1.597238302230835, + "learning_rate": 1.0808e-05, + "loss": 0.339, + "step": 2702 + }, + { + "epoch": 0.05408, + "grad_norm": 1.5940641164779663, + "learning_rate": 1.0816e-05, + "loss": 0.228, + "step": 2704 + }, + { + "epoch": 0.05412, + "grad_norm": 1.6176505088806152, + "learning_rate": 1.0824000000000002e-05, + "loss": 0.2482, + "step": 2706 + }, + { + "epoch": 0.05416, + "grad_norm": 1.0482736825942993, + "learning_rate": 1.0832e-05, + "loss": 0.1598, + "step": 2708 + }, + { + "epoch": 0.0542, + "grad_norm": 1.561832070350647, + "learning_rate": 1.0840000000000001e-05, + "loss": 0.3134, + "step": 2710 + }, + { + "epoch": 0.05424, + "grad_norm": 1.1613950729370117, + "learning_rate": 1.0848e-05, + "loss": 0.269, + "step": 2712 + }, + { + "epoch": 0.05428, + "grad_norm": 1.071877121925354, + "learning_rate": 1.0855999999999999e-05, + "loss": 0.2469, + "step": 2714 + }, + { + "epoch": 0.05432, + "grad_norm": 1.350127935409546, + "learning_rate": 1.0864000000000001e-05, + "loss": 0.2328, + "step": 2716 + }, + { + "epoch": 0.05436, + "grad_norm": 1.2008384466171265, + "learning_rate": 1.0872e-05, + "loss": 0.1764, + "step": 2718 + }, + { + "epoch": 0.0544, + "grad_norm": 1.3087255954742432, + "learning_rate": 1.0880000000000001e-05, + "loss": 0.2328, + "step": 2720 + }, + { + "epoch": 0.05444, + "grad_norm": 1.0834097862243652, + "learning_rate": 1.0888000000000001e-05, + "loss": 0.2126, + "step": 2722 + }, + { + "epoch": 0.05448, + "grad_norm": 1.2298542261123657, + "learning_rate": 1.0896e-05, + "loss": 0.2544, + "step": 2724 + }, + { + "epoch": 0.05452, + "grad_norm": 1.0812493562698364, + "learning_rate": 1.0904000000000001e-05, + "loss": 0.1931, + "step": 2726 + }, + { + "epoch": 0.05456, + "grad_norm": 1.0127568244934082, + "learning_rate": 1.0912e-05, + "loss": 0.1843, + "step": 2728 + }, + { + "epoch": 0.0546, + "grad_norm": 1.0161168575286865, + "learning_rate": 1.0920000000000002e-05, + "loss": 0.2507, + "step": 2730 + }, + { + "epoch": 0.05464, + "grad_norm": 1.4679392576217651, + "learning_rate": 1.0928000000000001e-05, + "loss": 0.2297, + "step": 2732 + }, + { + "epoch": 0.05468, + "grad_norm": 1.552835464477539, + "learning_rate": 1.0936e-05, + "loss": 0.2349, + "step": 2734 + }, + { + "epoch": 0.05472, + "grad_norm": 0.8774700164794922, + "learning_rate": 1.0944e-05, + "loss": 0.1307, + "step": 2736 + }, + { + "epoch": 0.05476, + "grad_norm": 1.6400331258773804, + "learning_rate": 1.0952000000000001e-05, + "loss": 0.3529, + "step": 2738 + }, + { + "epoch": 0.0548, + "grad_norm": 0.8240622878074646, + "learning_rate": 1.0960000000000002e-05, + "loss": 0.1372, + "step": 2740 + }, + { + "epoch": 0.05484, + "grad_norm": 1.5893993377685547, + "learning_rate": 1.0968e-05, + "loss": 0.2349, + "step": 2742 + }, + { + "epoch": 0.05488, + "grad_norm": 0.7998148798942566, + "learning_rate": 1.0976e-05, + "loss": 0.255, + "step": 2744 + }, + { + "epoch": 0.05492, + "grad_norm": 1.646301031112671, + "learning_rate": 1.0984000000000002e-05, + "loss": 0.339, + "step": 2746 + }, + { + "epoch": 0.05496, + "grad_norm": 1.0241515636444092, + "learning_rate": 1.0992e-05, + "loss": 0.3071, + "step": 2748 + }, + { + "epoch": 0.055, + "grad_norm": 0.8725964426994324, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.2222, + "step": 2750 + }, + { + "epoch": 0.05504, + "grad_norm": 0.9860166907310486, + "learning_rate": 1.1008e-05, + "loss": 0.2259, + "step": 2752 + }, + { + "epoch": 0.05508, + "grad_norm": 1.134415864944458, + "learning_rate": 1.1016e-05, + "loss": 0.2428, + "step": 2754 + }, + { + "epoch": 0.05512, + "grad_norm": 1.272434115409851, + "learning_rate": 1.1024000000000002e-05, + "loss": 0.2321, + "step": 2756 + }, + { + "epoch": 0.05516, + "grad_norm": 1.0774152278900146, + "learning_rate": 1.1032e-05, + "loss": 0.1753, + "step": 2758 + }, + { + "epoch": 0.0552, + "grad_norm": 1.6540567874908447, + "learning_rate": 1.1040000000000001e-05, + "loss": 0.2377, + "step": 2760 + }, + { + "epoch": 0.05524, + "grad_norm": 0.9895384311676025, + "learning_rate": 1.1048000000000002e-05, + "loss": 0.1513, + "step": 2762 + }, + { + "epoch": 0.05528, + "grad_norm": 0.8523926138877869, + "learning_rate": 1.1056e-05, + "loss": 0.1174, + "step": 2764 + }, + { + "epoch": 0.05532, + "grad_norm": 2.4195737838745117, + "learning_rate": 1.1064000000000001e-05, + "loss": 0.2762, + "step": 2766 + }, + { + "epoch": 0.05536, + "grad_norm": 0.9464126229286194, + "learning_rate": 1.1072e-05, + "loss": 0.3092, + "step": 2768 + }, + { + "epoch": 0.0554, + "grad_norm": 2.2703640460968018, + "learning_rate": 1.1080000000000002e-05, + "loss": 0.237, + "step": 2770 + }, + { + "epoch": 0.05544, + "grad_norm": 0.8126976490020752, + "learning_rate": 1.1088000000000001e-05, + "loss": 0.2819, + "step": 2772 + }, + { + "epoch": 0.05548, + "grad_norm": 2.1020612716674805, + "learning_rate": 1.1096e-05, + "loss": 0.3797, + "step": 2774 + }, + { + "epoch": 0.05552, + "grad_norm": 1.7465397119522095, + "learning_rate": 1.1104e-05, + "loss": 0.2424, + "step": 2776 + }, + { + "epoch": 0.05556, + "grad_norm": 1.5562028884887695, + "learning_rate": 1.1112000000000001e-05, + "loss": 0.2102, + "step": 2778 + }, + { + "epoch": 0.0556, + "grad_norm": 1.4340214729309082, + "learning_rate": 1.1120000000000002e-05, + "loss": 0.2537, + "step": 2780 + }, + { + "epoch": 0.05564, + "grad_norm": 1.3949699401855469, + "learning_rate": 1.1128000000000001e-05, + "loss": 0.1881, + "step": 2782 + }, + { + "epoch": 0.05568, + "grad_norm": 1.1194568872451782, + "learning_rate": 1.1136e-05, + "loss": 0.2029, + "step": 2784 + }, + { + "epoch": 0.05572, + "grad_norm": 1.152320384979248, + "learning_rate": 1.1144000000000002e-05, + "loss": 0.1762, + "step": 2786 + }, + { + "epoch": 0.05576, + "grad_norm": 1.0355608463287354, + "learning_rate": 1.1152000000000001e-05, + "loss": 0.153, + "step": 2788 + }, + { + "epoch": 0.0558, + "grad_norm": 2.6983203887939453, + "learning_rate": 1.1160000000000002e-05, + "loss": 0.2943, + "step": 2790 + }, + { + "epoch": 0.05584, + "grad_norm": 2.909799337387085, + "learning_rate": 1.1168e-05, + "loss": 0.3376, + "step": 2792 + }, + { + "epoch": 0.05588, + "grad_norm": 0.7589657306671143, + "learning_rate": 1.1176e-05, + "loss": 0.294, + "step": 2794 + }, + { + "epoch": 0.05592, + "grad_norm": 0.6559714674949646, + "learning_rate": 1.1184000000000002e-05, + "loss": 0.2736, + "step": 2796 + }, + { + "epoch": 0.05596, + "grad_norm": 0.8734223246574402, + "learning_rate": 1.1192e-05, + "loss": 0.3048, + "step": 2798 + }, + { + "epoch": 0.056, + "grad_norm": 0.9247915744781494, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.1131, + "step": 2800 + }, + { + "epoch": 0.05604, + "grad_norm": 1.9555412530899048, + "learning_rate": 1.1208000000000002e-05, + "loss": 0.2626, + "step": 2802 + }, + { + "epoch": 0.05608, + "grad_norm": 1.9156025648117065, + "learning_rate": 1.1216e-05, + "loss": 0.2299, + "step": 2804 + }, + { + "epoch": 0.05612, + "grad_norm": 1.0201491117477417, + "learning_rate": 1.1224000000000001e-05, + "loss": 0.1444, + "step": 2806 + }, + { + "epoch": 0.05616, + "grad_norm": 0.9953094124794006, + "learning_rate": 1.1232e-05, + "loss": 0.2353, + "step": 2808 + }, + { + "epoch": 0.0562, + "grad_norm": 1.8483721017837524, + "learning_rate": 1.1240000000000002e-05, + "loss": 0.3667, + "step": 2810 + }, + { + "epoch": 0.05624, + "grad_norm": 1.226496934890747, + "learning_rate": 1.1248000000000001e-05, + "loss": 0.2352, + "step": 2812 + }, + { + "epoch": 0.05628, + "grad_norm": 1.6629552841186523, + "learning_rate": 1.1256e-05, + "loss": 0.3012, + "step": 2814 + }, + { + "epoch": 0.05632, + "grad_norm": 1.4289979934692383, + "learning_rate": 1.1264000000000001e-05, + "loss": 0.1962, + "step": 2816 + }, + { + "epoch": 0.05636, + "grad_norm": 1.7952378988265991, + "learning_rate": 1.1272000000000002e-05, + "loss": 0.2665, + "step": 2818 + }, + { + "epoch": 0.0564, + "grad_norm": 1.447770357131958, + "learning_rate": 1.128e-05, + "loss": 0.2224, + "step": 2820 + }, + { + "epoch": 0.05644, + "grad_norm": 1.3415924310684204, + "learning_rate": 1.1288000000000001e-05, + "loss": 0.2029, + "step": 2822 + }, + { + "epoch": 0.05648, + "grad_norm": 1.4214926958084106, + "learning_rate": 1.1296e-05, + "loss": 0.2325, + "step": 2824 + }, + { + "epoch": 0.05652, + "grad_norm": 1.4183701276779175, + "learning_rate": 1.1304000000000002e-05, + "loss": 0.2445, + "step": 2826 + }, + { + "epoch": 0.05656, + "grad_norm": 1.5809978246688843, + "learning_rate": 1.1312000000000001e-05, + "loss": 0.2643, + "step": 2828 + }, + { + "epoch": 0.0566, + "grad_norm": 1.5344550609588623, + "learning_rate": 1.132e-05, + "loss": 0.2431, + "step": 2830 + }, + { + "epoch": 0.05664, + "grad_norm": 0.9955279231071472, + "learning_rate": 1.1328e-05, + "loss": 0.2147, + "step": 2832 + }, + { + "epoch": 0.05668, + "grad_norm": 1.381709337234497, + "learning_rate": 1.1336e-05, + "loss": 0.2182, + "step": 2834 + }, + { + "epoch": 0.05672, + "grad_norm": 0.9479675889015198, + "learning_rate": 1.1344000000000002e-05, + "loss": 0.23, + "step": 2836 + }, + { + "epoch": 0.05676, + "grad_norm": 0.8969796299934387, + "learning_rate": 1.1352e-05, + "loss": 0.1251, + "step": 2838 + }, + { + "epoch": 0.0568, + "grad_norm": 1.7808446884155273, + "learning_rate": 1.136e-05, + "loss": 0.3146, + "step": 2840 + }, + { + "epoch": 0.05684, + "grad_norm": 1.3168128728866577, + "learning_rate": 1.1368000000000002e-05, + "loss": 0.2128, + "step": 2842 + }, + { + "epoch": 0.05688, + "grad_norm": 1.6436067819595337, + "learning_rate": 1.1376000000000001e-05, + "loss": 0.2899, + "step": 2844 + }, + { + "epoch": 0.05692, + "grad_norm": 1.4038478136062622, + "learning_rate": 1.1384000000000001e-05, + "loss": 0.2652, + "step": 2846 + }, + { + "epoch": 0.05696, + "grad_norm": 1.1959114074707031, + "learning_rate": 1.1392e-05, + "loss": 0.2025, + "step": 2848 + }, + { + "epoch": 0.057, + "grad_norm": 1.2000315189361572, + "learning_rate": 1.14e-05, + "loss": 0.1931, + "step": 2850 + }, + { + "epoch": 0.05704, + "grad_norm": 1.151143193244934, + "learning_rate": 1.1408000000000002e-05, + "loss": 0.2424, + "step": 2852 + }, + { + "epoch": 0.05708, + "grad_norm": 1.8177236318588257, + "learning_rate": 1.1416e-05, + "loss": 0.3523, + "step": 2854 + }, + { + "epoch": 0.05712, + "grad_norm": 1.8279262781143188, + "learning_rate": 1.1424000000000001e-05, + "loss": 0.2221, + "step": 2856 + }, + { + "epoch": 0.05716, + "grad_norm": 1.1353782415390015, + "learning_rate": 1.1432000000000002e-05, + "loss": 0.1515, + "step": 2858 + }, + { + "epoch": 0.0572, + "grad_norm": 1.0570863485336304, + "learning_rate": 1.144e-05, + "loss": 0.1596, + "step": 2860 + }, + { + "epoch": 0.05724, + "grad_norm": 2.1071555614471436, + "learning_rate": 1.1448000000000001e-05, + "loss": 0.3943, + "step": 2862 + }, + { + "epoch": 0.05728, + "grad_norm": 1.9031317234039307, + "learning_rate": 1.1456e-05, + "loss": 0.2214, + "step": 2864 + }, + { + "epoch": 0.05732, + "grad_norm": 1.064240574836731, + "learning_rate": 1.1464000000000002e-05, + "loss": 0.1514, + "step": 2866 + }, + { + "epoch": 0.05736, + "grad_norm": 1.8069698810577393, + "learning_rate": 1.1472000000000001e-05, + "loss": 0.2506, + "step": 2868 + }, + { + "epoch": 0.0574, + "grad_norm": 1.0418792963027954, + "learning_rate": 1.148e-05, + "loss": 0.1596, + "step": 2870 + }, + { + "epoch": 0.05744, + "grad_norm": 1.7286062240600586, + "learning_rate": 1.1488e-05, + "loss": 0.2468, + "step": 2872 + }, + { + "epoch": 0.05748, + "grad_norm": 1.0203661918640137, + "learning_rate": 1.1496e-05, + "loss": 0.1685, + "step": 2874 + }, + { + "epoch": 0.05752, + "grad_norm": 1.847878336906433, + "learning_rate": 1.1504000000000002e-05, + "loss": 0.2347, + "step": 2876 + }, + { + "epoch": 0.05756, + "grad_norm": 1.8078960180282593, + "learning_rate": 1.1512000000000001e-05, + "loss": 0.313, + "step": 2878 + }, + { + "epoch": 0.0576, + "grad_norm": 1.680380940437317, + "learning_rate": 1.152e-05, + "loss": 0.2297, + "step": 2880 + }, + { + "epoch": 0.05764, + "grad_norm": 1.8407866954803467, + "learning_rate": 1.1528000000000002e-05, + "loss": 0.3389, + "step": 2882 + }, + { + "epoch": 0.05768, + "grad_norm": 1.33634614944458, + "learning_rate": 1.1536000000000001e-05, + "loss": 0.2542, + "step": 2884 + }, + { + "epoch": 0.05772, + "grad_norm": 1.3988550901412964, + "learning_rate": 1.1544000000000002e-05, + "loss": 0.2428, + "step": 2886 + }, + { + "epoch": 0.05776, + "grad_norm": 1.0043295621871948, + "learning_rate": 1.1552e-05, + "loss": 0.2179, + "step": 2888 + }, + { + "epoch": 0.0578, + "grad_norm": 1.5845978260040283, + "learning_rate": 1.156e-05, + "loss": 0.2034, + "step": 2890 + }, + { + "epoch": 0.05784, + "grad_norm": 0.9281550645828247, + "learning_rate": 1.1568000000000002e-05, + "loss": 0.323, + "step": 2892 + }, + { + "epoch": 0.05788, + "grad_norm": 0.8612517714500427, + "learning_rate": 1.1576e-05, + "loss": 0.1617, + "step": 2894 + }, + { + "epoch": 0.05792, + "grad_norm": 1.5353282690048218, + "learning_rate": 1.1584000000000001e-05, + "loss": 0.3006, + "step": 2896 + }, + { + "epoch": 0.05796, + "grad_norm": 1.6778841018676758, + "learning_rate": 1.1592000000000002e-05, + "loss": 0.2594, + "step": 2898 + }, + { + "epoch": 0.058, + "grad_norm": 1.6000770330429077, + "learning_rate": 1.16e-05, + "loss": 0.2426, + "step": 2900 + }, + { + "epoch": 0.05804, + "grad_norm": 1.0727242231369019, + "learning_rate": 1.1608000000000001e-05, + "loss": 0.2443, + "step": 2902 + }, + { + "epoch": 0.05808, + "grad_norm": 0.9162804484367371, + "learning_rate": 1.1616e-05, + "loss": 0.2179, + "step": 2904 + }, + { + "epoch": 0.05812, + "grad_norm": 1.4212756156921387, + "learning_rate": 1.1624000000000003e-05, + "loss": 0.2766, + "step": 2906 + }, + { + "epoch": 0.05816, + "grad_norm": 1.2613272666931152, + "learning_rate": 1.1632000000000001e-05, + "loss": 0.2323, + "step": 2908 + }, + { + "epoch": 0.0582, + "grad_norm": 1.1039401292800903, + "learning_rate": 1.164e-05, + "loss": 0.2347, + "step": 2910 + }, + { + "epoch": 0.05824, + "grad_norm": 1.1534267663955688, + "learning_rate": 1.1648000000000001e-05, + "loss": 0.2025, + "step": 2912 + }, + { + "epoch": 0.05828, + "grad_norm": 0.9805291891098022, + "learning_rate": 1.1656e-05, + "loss": 0.2259, + "step": 2914 + }, + { + "epoch": 0.05832, + "grad_norm": 1.0569555759429932, + "learning_rate": 1.1664000000000002e-05, + "loss": 0.2638, + "step": 2916 + }, + { + "epoch": 0.05836, + "grad_norm": 1.594292163848877, + "learning_rate": 1.1672000000000001e-05, + "loss": 0.2561, + "step": 2918 + }, + { + "epoch": 0.0584, + "grad_norm": 1.8600322008132935, + "learning_rate": 1.168e-05, + "loss": 0.2548, + "step": 2920 + }, + { + "epoch": 0.05844, + "grad_norm": 1.000591516494751, + "learning_rate": 1.1688000000000002e-05, + "loss": 0.2176, + "step": 2922 + }, + { + "epoch": 0.05848, + "grad_norm": 1.517042636871338, + "learning_rate": 1.1696000000000001e-05, + "loss": 0.3004, + "step": 2924 + }, + { + "epoch": 0.05852, + "grad_norm": 1.3731657266616821, + "learning_rate": 1.1704000000000002e-05, + "loss": 0.3262, + "step": 2926 + }, + { + "epoch": 0.05856, + "grad_norm": 0.9565469026565552, + "learning_rate": 1.1712e-05, + "loss": 0.2065, + "step": 2928 + }, + { + "epoch": 0.0586, + "grad_norm": 1.1370505094528198, + "learning_rate": 1.172e-05, + "loss": 0.2322, + "step": 2930 + }, + { + "epoch": 0.05864, + "grad_norm": 1.158804178237915, + "learning_rate": 1.1728000000000002e-05, + "loss": 0.1812, + "step": 2932 + }, + { + "epoch": 0.05868, + "grad_norm": 0.9179508686065674, + "learning_rate": 1.1736e-05, + "loss": 0.1763, + "step": 2934 + }, + { + "epoch": 0.05872, + "grad_norm": 1.002487063407898, + "learning_rate": 1.1744000000000001e-05, + "loss": 0.2349, + "step": 2936 + }, + { + "epoch": 0.05876, + "grad_norm": 0.744093656539917, + "learning_rate": 1.1752000000000002e-05, + "loss": 0.2414, + "step": 2938 + }, + { + "epoch": 0.0588, + "grad_norm": 1.257668375968933, + "learning_rate": 1.1760000000000001e-05, + "loss": 0.2772, + "step": 2940 + }, + { + "epoch": 0.05884, + "grad_norm": 1.2836661338806152, + "learning_rate": 1.1768000000000002e-05, + "loss": 0.2766, + "step": 2942 + }, + { + "epoch": 0.05888, + "grad_norm": 0.9539806842803955, + "learning_rate": 1.1776e-05, + "loss": 0.2778, + "step": 2944 + }, + { + "epoch": 0.05892, + "grad_norm": 1.1407887935638428, + "learning_rate": 1.1784e-05, + "loss": 0.2322, + "step": 2946 + }, + { + "epoch": 0.05896, + "grad_norm": 1.2680952548980713, + "learning_rate": 1.1792000000000002e-05, + "loss": 0.243, + "step": 2948 + }, + { + "epoch": 0.059, + "grad_norm": 1.0041779279708862, + "learning_rate": 1.18e-05, + "loss": 0.2259, + "step": 2950 + }, + { + "epoch": 0.05904, + "grad_norm": 1.2779979705810547, + "learning_rate": 1.1808000000000001e-05, + "loss": 0.2223, + "step": 2952 + }, + { + "epoch": 0.05908, + "grad_norm": 1.4815291166305542, + "learning_rate": 1.1816e-05, + "loss": 0.2563, + "step": 2954 + }, + { + "epoch": 0.05912, + "grad_norm": 1.531692624092102, + "learning_rate": 1.1824e-05, + "loss": 0.301, + "step": 2956 + }, + { + "epoch": 0.05916, + "grad_norm": 1.5136957168579102, + "learning_rate": 1.1832000000000001e-05, + "loss": 0.2689, + "step": 2958 + }, + { + "epoch": 0.0592, + "grad_norm": 1.1534347534179688, + "learning_rate": 1.184e-05, + "loss": 0.233, + "step": 2960 + }, + { + "epoch": 0.05924, + "grad_norm": 1.0204548835754395, + "learning_rate": 1.1848000000000002e-05, + "loss": 0.2028, + "step": 2962 + }, + { + "epoch": 0.05928, + "grad_norm": 1.144406795501709, + "learning_rate": 1.1856000000000001e-05, + "loss": 0.2544, + "step": 2964 + }, + { + "epoch": 0.05932, + "grad_norm": 1.2935450077056885, + "learning_rate": 1.1864e-05, + "loss": 0.2444, + "step": 2966 + }, + { + "epoch": 0.05936, + "grad_norm": 1.1550405025482178, + "learning_rate": 1.1872000000000001e-05, + "loss": 0.2222, + "step": 2968 + }, + { + "epoch": 0.0594, + "grad_norm": 0.8224363327026367, + "learning_rate": 1.188e-05, + "loss": 0.1711, + "step": 2970 + }, + { + "epoch": 0.05944, + "grad_norm": 1.3482335805892944, + "learning_rate": 1.1888000000000002e-05, + "loss": 0.2348, + "step": 2972 + }, + { + "epoch": 0.05948, + "grad_norm": 1.3933411836624146, + "learning_rate": 1.1896000000000001e-05, + "loss": 0.2885, + "step": 2974 + }, + { + "epoch": 0.05952, + "grad_norm": 0.651768684387207, + "learning_rate": 1.1904e-05, + "loss": 0.2033, + "step": 2976 + }, + { + "epoch": 0.05956, + "grad_norm": 0.9694809317588806, + "learning_rate": 1.1912000000000002e-05, + "loss": 0.153, + "step": 2978 + }, + { + "epoch": 0.0596, + "grad_norm": 0.9634465575218201, + "learning_rate": 1.1920000000000001e-05, + "loss": 0.2147, + "step": 2980 + }, + { + "epoch": 0.05964, + "grad_norm": 1.431222677230835, + "learning_rate": 1.1928000000000002e-05, + "loss": 0.2886, + "step": 2982 + }, + { + "epoch": 0.05968, + "grad_norm": 1.0524088144302368, + "learning_rate": 1.1936e-05, + "loss": 0.2234, + "step": 2984 + }, + { + "epoch": 0.05972, + "grad_norm": 1.6286617517471313, + "learning_rate": 1.1944e-05, + "loss": 0.2484, + "step": 2986 + }, + { + "epoch": 0.05976, + "grad_norm": 1.2751195430755615, + "learning_rate": 1.1952000000000002e-05, + "loss": 0.243, + "step": 2988 + }, + { + "epoch": 0.0598, + "grad_norm": 1.1683964729309082, + "learning_rate": 1.196e-05, + "loss": 0.2429, + "step": 2990 + }, + { + "epoch": 0.05984, + "grad_norm": 1.1040698289871216, + "learning_rate": 1.1968000000000001e-05, + "loss": 0.212, + "step": 2992 + }, + { + "epoch": 0.05988, + "grad_norm": 1.194715976715088, + "learning_rate": 1.1976e-05, + "loss": 0.2026, + "step": 2994 + }, + { + "epoch": 0.05992, + "grad_norm": 0.9158677458763123, + "learning_rate": 1.1984e-05, + "loss": 0.1597, + "step": 2996 + }, + { + "epoch": 0.05996, + "grad_norm": 1.2738527059555054, + "learning_rate": 1.1992000000000001e-05, + "loss": 0.2468, + "step": 2998 + }, + { + "epoch": 0.06, + "grad_norm": 1.7749686241149902, + "learning_rate": 1.2e-05, + "loss": 0.2922, + "step": 3000 + }, + { + "epoch": 0.06004, + "grad_norm": 0.9928878545761108, + "learning_rate": 1.2008000000000003e-05, + "loss": 0.2424, + "step": 3002 + }, + { + "epoch": 0.06008, + "grad_norm": 1.0583511590957642, + "learning_rate": 1.2016000000000002e-05, + "loss": 0.1844, + "step": 3004 + }, + { + "epoch": 0.06012, + "grad_norm": 1.2116683721542358, + "learning_rate": 1.2024e-05, + "loss": 0.2429, + "step": 3006 + }, + { + "epoch": 0.06016, + "grad_norm": 1.4087711572647095, + "learning_rate": 1.2032000000000001e-05, + "loss": 0.2349, + "step": 3008 + }, + { + "epoch": 0.0602, + "grad_norm": 1.0649664402008057, + "learning_rate": 1.204e-05, + "loss": 0.1843, + "step": 3010 + }, + { + "epoch": 0.06024, + "grad_norm": 1.499222755432129, + "learning_rate": 1.2048000000000002e-05, + "loss": 0.2884, + "step": 3012 + }, + { + "epoch": 0.06028, + "grad_norm": 1.540120244026184, + "learning_rate": 1.2056000000000001e-05, + "loss": 0.2665, + "step": 3014 + }, + { + "epoch": 0.06032, + "grad_norm": 1.522642731666565, + "learning_rate": 1.2064e-05, + "loss": 0.3006, + "step": 3016 + }, + { + "epoch": 0.06036, + "grad_norm": 1.1150327920913696, + "learning_rate": 1.2072000000000002e-05, + "loss": 0.2025, + "step": 3018 + }, + { + "epoch": 0.0604, + "grad_norm": 1.6844295263290405, + "learning_rate": 1.2080000000000001e-05, + "loss": 0.2789, + "step": 3020 + }, + { + "epoch": 0.06044, + "grad_norm": 1.2081881761550903, + "learning_rate": 1.2088000000000002e-05, + "loss": 0.2025, + "step": 3022 + }, + { + "epoch": 0.06048, + "grad_norm": 1.5829849243164062, + "learning_rate": 1.2096e-05, + "loss": 0.2327, + "step": 3024 + }, + { + "epoch": 0.06052, + "grad_norm": 1.5761040449142456, + "learning_rate": 1.2104e-05, + "loss": 0.3004, + "step": 3026 + }, + { + "epoch": 0.06056, + "grad_norm": 1.0152722597122192, + "learning_rate": 1.2112000000000002e-05, + "loss": 0.2557, + "step": 3028 + }, + { + "epoch": 0.0606, + "grad_norm": 1.6286249160766602, + "learning_rate": 1.2120000000000001e-05, + "loss": 0.2278, + "step": 3030 + }, + { + "epoch": 0.06064, + "grad_norm": 0.8787254095077515, + "learning_rate": 1.2128000000000001e-05, + "loss": 0.2297, + "step": 3032 + }, + { + "epoch": 0.06068, + "grad_norm": 1.2817665338516235, + "learning_rate": 1.2136e-05, + "loss": 0.2328, + "step": 3034 + }, + { + "epoch": 0.06072, + "grad_norm": 1.2527066469192505, + "learning_rate": 1.2144000000000001e-05, + "loss": 0.2651, + "step": 3036 + }, + { + "epoch": 0.06076, + "grad_norm": 1.212935447692871, + "learning_rate": 1.2152000000000002e-05, + "loss": 0.2321, + "step": 3038 + }, + { + "epoch": 0.0608, + "grad_norm": 1.2330940961837769, + "learning_rate": 1.216e-05, + "loss": 0.2322, + "step": 3040 + }, + { + "epoch": 0.06084, + "grad_norm": 1.1016523838043213, + "learning_rate": 1.2168000000000003e-05, + "loss": 0.2121, + "step": 3042 + }, + { + "epoch": 0.06088, + "grad_norm": 1.1880099773406982, + "learning_rate": 1.2176000000000002e-05, + "loss": 0.222, + "step": 3044 + }, + { + "epoch": 0.06092, + "grad_norm": 1.209657907485962, + "learning_rate": 1.2184e-05, + "loss": 0.2126, + "step": 3046 + }, + { + "epoch": 0.06096, + "grad_norm": 1.0731627941131592, + "learning_rate": 1.2192000000000001e-05, + "loss": 0.2027, + "step": 3048 + }, + { + "epoch": 0.061, + "grad_norm": 1.3498812913894653, + "learning_rate": 1.22e-05, + "loss": 0.2543, + "step": 3050 + }, + { + "epoch": 0.06104, + "grad_norm": 1.1150494813919067, + "learning_rate": 1.2208000000000002e-05, + "loss": 0.2221, + "step": 3052 + }, + { + "epoch": 0.06108, + "grad_norm": 1.230181097984314, + "learning_rate": 1.2216000000000001e-05, + "loss": 0.2542, + "step": 3054 + }, + { + "epoch": 0.06112, + "grad_norm": 1.1644943952560425, + "learning_rate": 1.2224e-05, + "loss": 0.2219, + "step": 3056 + }, + { + "epoch": 0.06116, + "grad_norm": 1.1338425874710083, + "learning_rate": 1.2232000000000002e-05, + "loss": 0.2233, + "step": 3058 + }, + { + "epoch": 0.0612, + "grad_norm": 1.1742615699768066, + "learning_rate": 1.2240000000000001e-05, + "loss": 0.2024, + "step": 3060 + }, + { + "epoch": 0.06124, + "grad_norm": 1.0552934408187866, + "learning_rate": 1.2248000000000002e-05, + "loss": 0.1753, + "step": 3062 + }, + { + "epoch": 0.06128, + "grad_norm": 1.1712173223495483, + "learning_rate": 1.2256000000000001e-05, + "loss": 0.2467, + "step": 3064 + }, + { + "epoch": 0.06132, + "grad_norm": 1.9489846229553223, + "learning_rate": 1.2264e-05, + "loss": 0.3528, + "step": 3066 + }, + { + "epoch": 0.06136, + "grad_norm": 1.6683263778686523, + "learning_rate": 1.2272000000000002e-05, + "loss": 0.2424, + "step": 3068 + }, + { + "epoch": 0.0614, + "grad_norm": 1.3680466413497925, + "learning_rate": 1.2280000000000001e-05, + "loss": 0.277, + "step": 3070 + }, + { + "epoch": 0.06144, + "grad_norm": 1.0213823318481445, + "learning_rate": 1.2288e-05, + "loss": 0.2037, + "step": 3072 + }, + { + "epoch": 0.06148, + "grad_norm": 1.2116212844848633, + "learning_rate": 1.2296e-05, + "loss": 0.2662, + "step": 3074 + }, + { + "epoch": 0.06152, + "grad_norm": 1.0287936925888062, + "learning_rate": 1.2304000000000001e-05, + "loss": 0.1753, + "step": 3076 + }, + { + "epoch": 0.06156, + "grad_norm": 1.0463429689407349, + "learning_rate": 1.2312000000000002e-05, + "loss": 0.2024, + "step": 3078 + }, + { + "epoch": 0.0616, + "grad_norm": 1.3153589963912964, + "learning_rate": 1.232e-05, + "loss": 0.277, + "step": 3080 + }, + { + "epoch": 0.06164, + "grad_norm": 1.2861546277999878, + "learning_rate": 1.2328e-05, + "loss": 0.2541, + "step": 3082 + }, + { + "epoch": 0.06168, + "grad_norm": 1.4148166179656982, + "learning_rate": 1.2336000000000002e-05, + "loss": 0.2763, + "step": 3084 + }, + { + "epoch": 0.06172, + "grad_norm": 1.3629258871078491, + "learning_rate": 1.2344e-05, + "loss": 0.2327, + "step": 3086 + }, + { + "epoch": 0.06176, + "grad_norm": 0.9940133094787598, + "learning_rate": 1.2352000000000001e-05, + "loss": 0.2346, + "step": 3088 + }, + { + "epoch": 0.0618, + "grad_norm": 1.1752028465270996, + "learning_rate": 1.236e-05, + "loss": 0.2118, + "step": 3090 + }, + { + "epoch": 0.06184, + "grad_norm": 1.2657030820846558, + "learning_rate": 1.2368e-05, + "loss": 0.2328, + "step": 3092 + }, + { + "epoch": 0.06188, + "grad_norm": 1.3795201778411865, + "learning_rate": 1.2376000000000001e-05, + "loss": 0.2442, + "step": 3094 + }, + { + "epoch": 0.06192, + "grad_norm": 1.408085823059082, + "learning_rate": 1.2384e-05, + "loss": 0.3011, + "step": 3096 + }, + { + "epoch": 0.06196, + "grad_norm": 1.3802084922790527, + "learning_rate": 1.2392000000000003e-05, + "loss": 0.2298, + "step": 3098 + }, + { + "epoch": 0.062, + "grad_norm": 1.0647861957550049, + "learning_rate": 1.2400000000000002e-05, + "loss": 0.2345, + "step": 3100 + }, + { + "epoch": 0.06204, + "grad_norm": 1.3111398220062256, + "learning_rate": 1.2408e-05, + "loss": 0.3131, + "step": 3102 + }, + { + "epoch": 0.06208, + "grad_norm": 1.1647707223892212, + "learning_rate": 1.2416000000000001e-05, + "loss": 0.2428, + "step": 3104 + }, + { + "epoch": 0.06212, + "grad_norm": 0.972692608833313, + "learning_rate": 1.2424e-05, + "loss": 0.193, + "step": 3106 + }, + { + "epoch": 0.06216, + "grad_norm": 1.2088934183120728, + "learning_rate": 1.2432000000000002e-05, + "loss": 0.2327, + "step": 3108 + }, + { + "epoch": 0.0622, + "grad_norm": 1.21885347366333, + "learning_rate": 1.2440000000000001e-05, + "loss": 0.2328, + "step": 3110 + }, + { + "epoch": 0.06224, + "grad_norm": 1.1251527070999146, + "learning_rate": 1.2448e-05, + "loss": 0.2125, + "step": 3112 + }, + { + "epoch": 0.06228, + "grad_norm": 1.188180923461914, + "learning_rate": 1.2456e-05, + "loss": 0.2326, + "step": 3114 + }, + { + "epoch": 0.06232, + "grad_norm": 1.0790754556655884, + "learning_rate": 1.2464000000000001e-05, + "loss": 0.244, + "step": 3116 + }, + { + "epoch": 0.06236, + "grad_norm": 1.0313209295272827, + "learning_rate": 1.2472000000000002e-05, + "loss": 0.2233, + "step": 3118 + }, + { + "epoch": 0.0624, + "grad_norm": 1.0959135293960571, + "learning_rate": 1.248e-05, + "loss": 0.2118, + "step": 3120 + }, + { + "epoch": 0.06244, + "grad_norm": 1.1858084201812744, + "learning_rate": 1.2488e-05, + "loss": 0.2327, + "step": 3122 + }, + { + "epoch": 0.06248, + "grad_norm": 1.5466153621673584, + "learning_rate": 1.2496000000000002e-05, + "loss": 0.2821, + "step": 3124 + }, + { + "epoch": 0.06252, + "grad_norm": 1.1907485723495483, + "learning_rate": 1.2504000000000001e-05, + "loss": 0.2327, + "step": 3126 + }, + { + "epoch": 0.06256, + "grad_norm": 1.2560482025146484, + "learning_rate": 1.2512000000000002e-05, + "loss": 0.2648, + "step": 3128 + }, + { + "epoch": 0.0626, + "grad_norm": 0.909483015537262, + "learning_rate": 1.252e-05, + "loss": 0.2037, + "step": 3130 + }, + { + "epoch": 0.06264, + "grad_norm": 1.1608651876449585, + "learning_rate": 1.2528e-05, + "loss": 0.232, + "step": 3132 + }, + { + "epoch": 0.06268, + "grad_norm": 0.9295639395713806, + "learning_rate": 1.2536000000000002e-05, + "loss": 0.2036, + "step": 3134 + }, + { + "epoch": 0.06272, + "grad_norm": 0.8757659196853638, + "learning_rate": 1.2544e-05, + "loss": 0.1589, + "step": 3136 + }, + { + "epoch": 0.06276, + "grad_norm": 0.9842341542243958, + "learning_rate": 1.2552000000000001e-05, + "loss": 0.1841, + "step": 3138 + }, + { + "epoch": 0.0628, + "grad_norm": 1.3947588205337524, + "learning_rate": 1.2560000000000002e-05, + "loss": 0.2345, + "step": 3140 + }, + { + "epoch": 0.06284, + "grad_norm": 0.7513135671615601, + "learning_rate": 1.2568e-05, + "loss": 0.1302, + "step": 3142 + }, + { + "epoch": 0.06288, + "grad_norm": 1.6750596761703491, + "learning_rate": 1.2576000000000001e-05, + "loss": 0.3526, + "step": 3144 + }, + { + "epoch": 0.06292, + "grad_norm": 1.7884982824325562, + "learning_rate": 1.2584e-05, + "loss": 0.2345, + "step": 3146 + }, + { + "epoch": 0.06296, + "grad_norm": 2.1088316440582275, + "learning_rate": 1.2592000000000002e-05, + "loss": 0.4108, + "step": 3148 + }, + { + "epoch": 0.063, + "grad_norm": 1.723079800605774, + "learning_rate": 1.2600000000000001e-05, + "loss": 0.2287, + "step": 3150 + }, + { + "epoch": 0.06304, + "grad_norm": 0.8969802856445312, + "learning_rate": 1.2608e-05, + "loss": 0.1596, + "step": 3152 + }, + { + "epoch": 0.06308, + "grad_norm": 1.1544240713119507, + "learning_rate": 1.2616e-05, + "loss": 0.2037, + "step": 3154 + }, + { + "epoch": 0.06312, + "grad_norm": 1.1623027324676514, + "learning_rate": 1.2624000000000001e-05, + "loss": 0.2119, + "step": 3156 + }, + { + "epoch": 0.06316, + "grad_norm": 1.0547664165496826, + "learning_rate": 1.2632000000000002e-05, + "loss": 0.1931, + "step": 3158 + }, + { + "epoch": 0.0632, + "grad_norm": 1.3584059476852417, + "learning_rate": 1.2640000000000001e-05, + "loss": 0.21, + "step": 3160 + }, + { + "epoch": 0.06324, + "grad_norm": 1.1745777130126953, + "learning_rate": 1.2648e-05, + "loss": 0.194, + "step": 3162 + }, + { + "epoch": 0.06328, + "grad_norm": 1.0662522315979004, + "learning_rate": 1.2656000000000002e-05, + "loss": 0.2466, + "step": 3164 + }, + { + "epoch": 0.06332, + "grad_norm": 0.956403911113739, + "learning_rate": 1.2664000000000001e-05, + "loss": 0.178, + "step": 3166 + }, + { + "epoch": 0.06336, + "grad_norm": 1.294694185256958, + "learning_rate": 1.2672000000000002e-05, + "loss": 0.2429, + "step": 3168 + }, + { + "epoch": 0.0634, + "grad_norm": 1.4958416223526, + "learning_rate": 1.268e-05, + "loss": 0.2347, + "step": 3170 + }, + { + "epoch": 0.06344, + "grad_norm": 1.2008030414581299, + "learning_rate": 1.2688e-05, + "loss": 0.2037, + "step": 3172 + }, + { + "epoch": 0.06348, + "grad_norm": 1.9332551956176758, + "learning_rate": 1.2696000000000002e-05, + "loss": 0.2924, + "step": 3174 + }, + { + "epoch": 0.06352, + "grad_norm": 0.9314296245574951, + "learning_rate": 1.2704e-05, + "loss": 0.2552, + "step": 3176 + }, + { + "epoch": 0.06356, + "grad_norm": 1.858629822731018, + "learning_rate": 1.2712000000000001e-05, + "loss": 0.3145, + "step": 3178 + }, + { + "epoch": 0.0636, + "grad_norm": 0.7983955144882202, + "learning_rate": 1.2720000000000002e-05, + "loss": 0.1743, + "step": 3180 + }, + { + "epoch": 0.06364, + "grad_norm": 1.3046388626098633, + "learning_rate": 1.2728e-05, + "loss": 0.2662, + "step": 3182 + }, + { + "epoch": 0.06368, + "grad_norm": 1.2124332189559937, + "learning_rate": 1.2736000000000001e-05, + "loss": 0.1936, + "step": 3184 + }, + { + "epoch": 0.06372, + "grad_norm": 1.0175408124923706, + "learning_rate": 1.2744e-05, + "loss": 0.2145, + "step": 3186 + }, + { + "epoch": 0.06376, + "grad_norm": 1.616261601448059, + "learning_rate": 1.2752e-05, + "loss": 0.2466, + "step": 3188 + }, + { + "epoch": 0.0638, + "grad_norm": 1.2119932174682617, + "learning_rate": 1.2760000000000001e-05, + "loss": 0.2319, + "step": 3190 + }, + { + "epoch": 0.06384, + "grad_norm": 1.029881477355957, + "learning_rate": 1.2768e-05, + "loss": 0.1842, + "step": 3192 + }, + { + "epoch": 0.06388, + "grad_norm": 2.0432379245758057, + "learning_rate": 1.2776000000000001e-05, + "loss": 0.3311, + "step": 3194 + }, + { + "epoch": 0.06392, + "grad_norm": 1.5249927043914795, + "learning_rate": 1.2784000000000002e-05, + "loss": 0.2174, + "step": 3196 + }, + { + "epoch": 0.06396, + "grad_norm": 1.3213125467300415, + "learning_rate": 1.2792e-05, + "loss": 0.3352, + "step": 3198 + }, + { + "epoch": 0.064, + "grad_norm": 0.9089120030403137, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.1437, + "step": 3200 + }, + { + "epoch": 0.06404, + "grad_norm": 1.1152640581130981, + "learning_rate": 1.2808e-05, + "loss": 0.2025, + "step": 3202 + }, + { + "epoch": 0.06408, + "grad_norm": 1.070761799812317, + "learning_rate": 1.2816000000000002e-05, + "loss": 0.2233, + "step": 3204 + }, + { + "epoch": 0.06412, + "grad_norm": 1.4863883256912231, + "learning_rate": 1.2824000000000001e-05, + "loss": 0.2466, + "step": 3206 + }, + { + "epoch": 0.06416, + "grad_norm": 1.185940146446228, + "learning_rate": 1.2832e-05, + "loss": 0.1842, + "step": 3208 + }, + { + "epoch": 0.0642, + "grad_norm": 1.4295017719268799, + "learning_rate": 1.284e-05, + "loss": 0.2442, + "step": 3210 + }, + { + "epoch": 0.06424, + "grad_norm": 1.2346702814102173, + "learning_rate": 1.2848e-05, + "loss": 0.1937, + "step": 3212 + }, + { + "epoch": 0.06428, + "grad_norm": 1.071131706237793, + "learning_rate": 1.2856000000000002e-05, + "loss": 0.2061, + "step": 3214 + }, + { + "epoch": 0.06432, + "grad_norm": 1.1424013376235962, + "learning_rate": 1.2864e-05, + "loss": 0.1842, + "step": 3216 + }, + { + "epoch": 0.06436, + "grad_norm": 1.7423549890518188, + "learning_rate": 1.2872e-05, + "loss": 0.2883, + "step": 3218 + }, + { + "epoch": 0.0644, + "grad_norm": 1.0390853881835938, + "learning_rate": 1.2880000000000002e-05, + "loss": 0.2039, + "step": 3220 + }, + { + "epoch": 0.06444, + "grad_norm": 1.4960460662841797, + "learning_rate": 1.2888000000000001e-05, + "loss": 0.2427, + "step": 3222 + }, + { + "epoch": 0.06448, + "grad_norm": 0.9724529981613159, + "learning_rate": 1.2896000000000002e-05, + "loss": 0.1512, + "step": 3224 + }, + { + "epoch": 0.06452, + "grad_norm": 1.6166033744812012, + "learning_rate": 1.2904e-05, + "loss": 0.2561, + "step": 3226 + }, + { + "epoch": 0.06456, + "grad_norm": 1.5945390462875366, + "learning_rate": 1.2912e-05, + "loss": 0.2174, + "step": 3228 + }, + { + "epoch": 0.0646, + "grad_norm": 1.011223554611206, + "learning_rate": 1.2920000000000002e-05, + "loss": 0.1853, + "step": 3230 + }, + { + "epoch": 0.06464, + "grad_norm": 0.8731397390365601, + "learning_rate": 1.2928e-05, + "loss": 0.1009, + "step": 3232 + }, + { + "epoch": 0.06468, + "grad_norm": 1.2331798076629639, + "learning_rate": 1.2936000000000001e-05, + "loss": 0.2151, + "step": 3234 + }, + { + "epoch": 0.06472, + "grad_norm": 2.211392879486084, + "learning_rate": 1.2944000000000002e-05, + "loss": 0.2663, + "step": 3236 + }, + { + "epoch": 0.06476, + "grad_norm": 3.9631268978118896, + "learning_rate": 1.2952e-05, + "loss": 0.2998, + "step": 3238 + }, + { + "epoch": 0.0648, + "grad_norm": 0.3324545919895172, + "learning_rate": 1.2960000000000001e-05, + "loss": 0.0581, + "step": 3240 + }, + { + "epoch": 0.06484, + "grad_norm": 1.6132309436798096, + "learning_rate": 1.2968e-05, + "loss": 0.1636, + "step": 3242 + }, + { + "epoch": 0.06488, + "grad_norm": 1.6938199996948242, + "learning_rate": 1.2976000000000002e-05, + "loss": 0.3313, + "step": 3244 + }, + { + "epoch": 0.06492, + "grad_norm": 1.9975175857543945, + "learning_rate": 1.2984000000000001e-05, + "loss": 0.1686, + "step": 3246 + }, + { + "epoch": 0.06496, + "grad_norm": 1.0478461980819702, + "learning_rate": 1.2992e-05, + "loss": 0.241, + "step": 3248 + }, + { + "epoch": 0.065, + "grad_norm": 1.3828840255737305, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.2124, + "step": 3250 + }, + { + "epoch": 0.06504, + "grad_norm": 1.3917092084884644, + "learning_rate": 1.3008e-05, + "loss": 0.373, + "step": 3252 + }, + { + "epoch": 0.06508, + "grad_norm": 1.9844681024551392, + "learning_rate": 1.3016000000000002e-05, + "loss": 0.1759, + "step": 3254 + }, + { + "epoch": 0.06512, + "grad_norm": 1.1961127519607544, + "learning_rate": 1.3024000000000001e-05, + "loss": 0.176, + "step": 3256 + }, + { + "epoch": 0.06516, + "grad_norm": 1.7670007944107056, + "learning_rate": 1.3032e-05, + "loss": 0.542, + "step": 3258 + }, + { + "epoch": 0.0652, + "grad_norm": 1.5128358602523804, + "learning_rate": 1.3040000000000002e-05, + "loss": 0.417, + "step": 3260 + }, + { + "epoch": 0.06524, + "grad_norm": 1.1415237188339233, + "learning_rate": 1.3048000000000001e-05, + "loss": 0.2776, + "step": 3262 + }, + { + "epoch": 0.06528, + "grad_norm": 1.1379510164260864, + "learning_rate": 1.3056000000000002e-05, + "loss": 0.2325, + "step": 3264 + }, + { + "epoch": 0.06532, + "grad_norm": 1.0110760927200317, + "learning_rate": 1.3064e-05, + "loss": 0.2144, + "step": 3266 + }, + { + "epoch": 0.06536, + "grad_norm": 1.1089855432510376, + "learning_rate": 1.3072e-05, + "loss": 0.1746, + "step": 3268 + }, + { + "epoch": 0.0654, + "grad_norm": 0.9610081315040588, + "learning_rate": 1.3080000000000002e-05, + "loss": 0.2776, + "step": 3270 + }, + { + "epoch": 0.06544, + "grad_norm": 1.1269798278808594, + "learning_rate": 1.3088e-05, + "loss": 0.2024, + "step": 3272 + }, + { + "epoch": 0.06548, + "grad_norm": 1.0223917961120605, + "learning_rate": 1.3096000000000001e-05, + "loss": 0.2346, + "step": 3274 + }, + { + "epoch": 0.06552, + "grad_norm": 0.7907661199569702, + "learning_rate": 1.3104000000000002e-05, + "loss": 0.2412, + "step": 3276 + }, + { + "epoch": 0.06556, + "grad_norm": 0.9744505286216736, + "learning_rate": 1.3112e-05, + "loss": 0.2505, + "step": 3278 + }, + { + "epoch": 0.0656, + "grad_norm": 1.4428356885910034, + "learning_rate": 1.3120000000000001e-05, + "loss": 0.2379, + "step": 3280 + }, + { + "epoch": 0.06564, + "grad_norm": 1.548170566558838, + "learning_rate": 1.3128e-05, + "loss": 0.2423, + "step": 3282 + }, + { + "epoch": 0.06568, + "grad_norm": 1.3648525476455688, + "learning_rate": 1.3136000000000003e-05, + "loss": 0.2258, + "step": 3284 + }, + { + "epoch": 0.06572, + "grad_norm": 0.8409509062767029, + "learning_rate": 1.3144000000000002e-05, + "loss": 0.1305, + "step": 3286 + }, + { + "epoch": 0.06576, + "grad_norm": 1.4474538564682007, + "learning_rate": 1.3152e-05, + "loss": 0.2152, + "step": 3288 + }, + { + "epoch": 0.0658, + "grad_norm": 1.520246982574463, + "learning_rate": 1.3160000000000001e-05, + "loss": 0.2297, + "step": 3290 + }, + { + "epoch": 0.06584, + "grad_norm": 1.6840074062347412, + "learning_rate": 1.3168e-05, + "loss": 0.2102, + "step": 3292 + }, + { + "epoch": 0.06588, + "grad_norm": 1.098219633102417, + "learning_rate": 1.3176000000000002e-05, + "loss": 0.2466, + "step": 3294 + }, + { + "epoch": 0.06592, + "grad_norm": 0.7972451448440552, + "learning_rate": 1.3184000000000001e-05, + "loss": 0.1967, + "step": 3296 + }, + { + "epoch": 0.06596, + "grad_norm": 1.1412525177001953, + "learning_rate": 1.3192e-05, + "loss": 0.2865, + "step": 3298 + }, + { + "epoch": 0.066, + "grad_norm": 1.4238272905349731, + "learning_rate": 1.3200000000000002e-05, + "loss": 0.244, + "step": 3300 + }, + { + "epoch": 0.06604, + "grad_norm": 1.4456565380096436, + "learning_rate": 1.3208000000000001e-05, + "loss": 0.2647, + "step": 3302 + }, + { + "epoch": 0.06608, + "grad_norm": 1.2581336498260498, + "learning_rate": 1.3216000000000002e-05, + "loss": 0.2426, + "step": 3304 + }, + { + "epoch": 0.06612, + "grad_norm": 0.839411199092865, + "learning_rate": 1.3224e-05, + "loss": 0.1462, + "step": 3306 + }, + { + "epoch": 0.06616, + "grad_norm": 1.0768179893493652, + "learning_rate": 1.3232e-05, + "loss": 0.256, + "step": 3308 + }, + { + "epoch": 0.0662, + "grad_norm": 1.557529330253601, + "learning_rate": 1.3240000000000002e-05, + "loss": 0.2467, + "step": 3310 + }, + { + "epoch": 0.06624, + "grad_norm": 0.8095309138298035, + "learning_rate": 1.3248000000000001e-05, + "loss": 0.2278, + "step": 3312 + }, + { + "epoch": 0.06628, + "grad_norm": 0.7296978831291199, + "learning_rate": 1.3256e-05, + "loss": 0.1321, + "step": 3314 + }, + { + "epoch": 0.06632, + "grad_norm": 1.4783475399017334, + "learning_rate": 1.3264000000000002e-05, + "loss": 0.2297, + "step": 3316 + }, + { + "epoch": 0.06636, + "grad_norm": 0.9389597773551941, + "learning_rate": 1.3272000000000001e-05, + "loss": 0.2176, + "step": 3318 + }, + { + "epoch": 0.0664, + "grad_norm": 0.9565516114234924, + "learning_rate": 1.3280000000000002e-05, + "loss": 0.1589, + "step": 3320 + }, + { + "epoch": 0.06644, + "grad_norm": 1.7567343711853027, + "learning_rate": 1.3288e-05, + "loss": 0.255, + "step": 3322 + }, + { + "epoch": 0.06648, + "grad_norm": 1.700865626335144, + "learning_rate": 1.3296e-05, + "loss": 0.3804, + "step": 3324 + }, + { + "epoch": 0.06652, + "grad_norm": 2.200929880142212, + "learning_rate": 1.3304000000000002e-05, + "loss": 0.3227, + "step": 3326 + }, + { + "epoch": 0.06656, + "grad_norm": 1.5674622058868408, + "learning_rate": 1.3312e-05, + "loss": 0.2287, + "step": 3328 + }, + { + "epoch": 0.0666, + "grad_norm": 0.9159162640571594, + "learning_rate": 1.3320000000000001e-05, + "loss": 0.176, + "step": 3330 + }, + { + "epoch": 0.06664, + "grad_norm": 0.953730046749115, + "learning_rate": 1.3328e-05, + "loss": 0.1596, + "step": 3332 + }, + { + "epoch": 0.06668, + "grad_norm": 0.7101532816886902, + "learning_rate": 1.3336e-05, + "loss": 0.1854, + "step": 3334 + }, + { + "epoch": 0.06672, + "grad_norm": 1.4776792526245117, + "learning_rate": 1.3344000000000001e-05, + "loss": 0.2895, + "step": 3336 + }, + { + "epoch": 0.06676, + "grad_norm": 1.2305346727371216, + "learning_rate": 1.3352e-05, + "loss": 0.2427, + "step": 3338 + }, + { + "epoch": 0.0668, + "grad_norm": 1.1237915754318237, + "learning_rate": 1.3360000000000003e-05, + "loss": 0.2326, + "step": 3340 + }, + { + "epoch": 0.06684, + "grad_norm": 1.395210862159729, + "learning_rate": 1.3368000000000001e-05, + "loss": 0.254, + "step": 3342 + }, + { + "epoch": 0.06688, + "grad_norm": 1.5446478128433228, + "learning_rate": 1.3376e-05, + "loss": 0.2592, + "step": 3344 + }, + { + "epoch": 0.06692, + "grad_norm": 0.8570460081100464, + "learning_rate": 1.3384000000000001e-05, + "loss": 0.1708, + "step": 3346 + }, + { + "epoch": 0.06696, + "grad_norm": 0.9128386974334717, + "learning_rate": 1.3392e-05, + "loss": 0.2478, + "step": 3348 + }, + { + "epoch": 0.067, + "grad_norm": 1.940360188484192, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.269, + "step": 3350 + }, + { + "epoch": 0.06704, + "grad_norm": 1.6070318222045898, + "learning_rate": 1.3408000000000001e-05, + "loss": 0.2767, + "step": 3352 + }, + { + "epoch": 0.06708, + "grad_norm": 1.0191643238067627, + "learning_rate": 1.3416e-05, + "loss": 0.1594, + "step": 3354 + }, + { + "epoch": 0.06712, + "grad_norm": 0.9647213220596313, + "learning_rate": 1.3424000000000002e-05, + "loss": 0.2692, + "step": 3356 + }, + { + "epoch": 0.06716, + "grad_norm": 1.4229615926742554, + "learning_rate": 1.3432000000000001e-05, + "loss": 0.2175, + "step": 3358 + }, + { + "epoch": 0.0672, + "grad_norm": 1.6388298273086548, + "learning_rate": 1.3440000000000002e-05, + "loss": 0.2724, + "step": 3360 + }, + { + "epoch": 0.06724, + "grad_norm": 1.3343929052352905, + "learning_rate": 1.3448e-05, + "loss": 0.2142, + "step": 3362 + }, + { + "epoch": 0.06728, + "grad_norm": 1.0065455436706543, + "learning_rate": 1.3456e-05, + "loss": 0.1752, + "step": 3364 + }, + { + "epoch": 0.06732, + "grad_norm": 0.9495735168457031, + "learning_rate": 1.3464000000000002e-05, + "loss": 0.167, + "step": 3366 + }, + { + "epoch": 0.06736, + "grad_norm": 1.5724493265151978, + "learning_rate": 1.3472e-05, + "loss": 0.2591, + "step": 3368 + }, + { + "epoch": 0.0674, + "grad_norm": 0.8817539215087891, + "learning_rate": 1.3480000000000001e-05, + "loss": 0.1436, + "step": 3370 + }, + { + "epoch": 0.06744, + "grad_norm": 1.5693854093551636, + "learning_rate": 1.3488e-05, + "loss": 0.222, + "step": 3372 + }, + { + "epoch": 0.06748, + "grad_norm": 1.7233401536941528, + "learning_rate": 1.3496000000000001e-05, + "loss": 0.2723, + "step": 3374 + }, + { + "epoch": 0.06752, + "grad_norm": 1.0469412803649902, + "learning_rate": 1.3504000000000001e-05, + "loss": 0.2836, + "step": 3376 + }, + { + "epoch": 0.06756, + "grad_norm": 0.8190297484397888, + "learning_rate": 1.3512e-05, + "loss": 0.1443, + "step": 3378 + }, + { + "epoch": 0.0676, + "grad_norm": 0.869164228439331, + "learning_rate": 1.3520000000000003e-05, + "loss": 0.1611, + "step": 3380 + }, + { + "epoch": 0.06764, + "grad_norm": 0.8652413487434387, + "learning_rate": 1.3528000000000002e-05, + "loss": 0.1808, + "step": 3382 + }, + { + "epoch": 0.06768, + "grad_norm": 1.716015100479126, + "learning_rate": 1.3536e-05, + "loss": 0.3255, + "step": 3384 + }, + { + "epoch": 0.06772, + "grad_norm": 0.8889305591583252, + "learning_rate": 1.3544000000000001e-05, + "loss": 0.2219, + "step": 3386 + }, + { + "epoch": 0.06776, + "grad_norm": 0.7906585931777954, + "learning_rate": 1.3552e-05, + "loss": 0.146, + "step": 3388 + }, + { + "epoch": 0.0678, + "grad_norm": 0.935484766960144, + "learning_rate": 1.3560000000000002e-05, + "loss": 0.1232, + "step": 3390 + }, + { + "epoch": 0.06784, + "grad_norm": 1.7712029218673706, + "learning_rate": 1.3568000000000001e-05, + "loss": 0.2881, + "step": 3392 + }, + { + "epoch": 0.06788, + "grad_norm": 1.2021193504333496, + "learning_rate": 1.3576e-05, + "loss": 0.1347, + "step": 3394 + }, + { + "epoch": 0.06792, + "grad_norm": 1.6960314512252808, + "learning_rate": 1.3584000000000002e-05, + "loss": 0.1851, + "step": 3396 + }, + { + "epoch": 0.06796, + "grad_norm": 0.6486106514930725, + "learning_rate": 1.3592000000000001e-05, + "loss": 0.1537, + "step": 3398 + }, + { + "epoch": 0.068, + "grad_norm": 0.608600378036499, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.0922, + "step": 3400 + }, + { + "epoch": 0.06804, + "grad_norm": 3.0063188076019287, + "learning_rate": 1.3608e-05, + "loss": 0.3488, + "step": 3402 + }, + { + "epoch": 0.06808, + "grad_norm": 2.8348076343536377, + "learning_rate": 1.3616e-05, + "loss": 0.5052, + "step": 3404 + }, + { + "epoch": 0.06812, + "grad_norm": 1.1153559684753418, + "learning_rate": 1.3624000000000002e-05, + "loss": 0.1189, + "step": 3406 + }, + { + "epoch": 0.06816, + "grad_norm": 1.4219166040420532, + "learning_rate": 1.3632000000000001e-05, + "loss": 0.3101, + "step": 3408 + }, + { + "epoch": 0.0682, + "grad_norm": 1.7589360475540161, + "learning_rate": 1.3640000000000002e-05, + "loss": 0.2919, + "step": 3410 + }, + { + "epoch": 0.06824, + "grad_norm": 1.3405685424804688, + "learning_rate": 1.3648e-05, + "loss": 0.2318, + "step": 3412 + }, + { + "epoch": 0.06828, + "grad_norm": 1.0189563035964966, + "learning_rate": 1.3656000000000001e-05, + "loss": 0.1916, + "step": 3414 + }, + { + "epoch": 0.06832, + "grad_norm": 1.3495848178863525, + "learning_rate": 1.3664000000000002e-05, + "loss": 0.1745, + "step": 3416 + }, + { + "epoch": 0.06836, + "grad_norm": 0.5969683527946472, + "learning_rate": 1.3672e-05, + "loss": 0.0731, + "step": 3418 + }, + { + "epoch": 0.0684, + "grad_norm": 0.890078067779541, + "learning_rate": 1.3680000000000003e-05, + "loss": 0.1305, + "step": 3420 + }, + { + "epoch": 0.06844, + "grad_norm": 2.580659866333008, + "learning_rate": 1.3688000000000002e-05, + "loss": 0.4857, + "step": 3422 + }, + { + "epoch": 0.06848, + "grad_norm": 1.0270321369171143, + "learning_rate": 1.3696e-05, + "loss": 0.4088, + "step": 3424 + }, + { + "epoch": 0.06852, + "grad_norm": 2.9821548461914062, + "learning_rate": 1.3704000000000001e-05, + "loss": 0.3541, + "step": 3426 + }, + { + "epoch": 0.06856, + "grad_norm": 1.4904887676239014, + "learning_rate": 1.3712e-05, + "loss": 0.1909, + "step": 3428 + }, + { + "epoch": 0.0686, + "grad_norm": 2.0169646739959717, + "learning_rate": 1.3720000000000002e-05, + "loss": 0.3427, + "step": 3430 + }, + { + "epoch": 0.06864, + "grad_norm": 1.4175673723220825, + "learning_rate": 1.3728000000000001e-05, + "loss": 0.2561, + "step": 3432 + }, + { + "epoch": 0.06868, + "grad_norm": 1.2154293060302734, + "learning_rate": 1.3736e-05, + "loss": 0.1811, + "step": 3434 + }, + { + "epoch": 0.06872, + "grad_norm": 1.230375051498413, + "learning_rate": 1.3744000000000003e-05, + "loss": 0.2428, + "step": 3436 + }, + { + "epoch": 0.06876, + "grad_norm": 0.9398014545440674, + "learning_rate": 1.3752000000000001e-05, + "loss": 0.1937, + "step": 3438 + }, + { + "epoch": 0.0688, + "grad_norm": 1.1455621719360352, + "learning_rate": 1.376e-05, + "loss": 0.2024, + "step": 3440 + }, + { + "epoch": 0.06884, + "grad_norm": 1.1039083003997803, + "learning_rate": 1.3768000000000001e-05, + "loss": 0.1841, + "step": 3442 + }, + { + "epoch": 0.06888, + "grad_norm": 1.3663400411605835, + "learning_rate": 1.3776e-05, + "loss": 0.2176, + "step": 3444 + }, + { + "epoch": 0.06892, + "grad_norm": 1.0299406051635742, + "learning_rate": 1.3784000000000002e-05, + "loss": 0.2258, + "step": 3446 + }, + { + "epoch": 0.06896, + "grad_norm": 1.1880073547363281, + "learning_rate": 1.3792000000000001e-05, + "loss": 0.2562, + "step": 3448 + }, + { + "epoch": 0.069, + "grad_norm": 1.1506370306015015, + "learning_rate": 1.38e-05, + "loss": 0.2593, + "step": 3450 + }, + { + "epoch": 0.06904, + "grad_norm": 1.0875287055969238, + "learning_rate": 1.3808e-05, + "loss": 0.2466, + "step": 3452 + }, + { + "epoch": 0.06908, + "grad_norm": 1.1214144229888916, + "learning_rate": 1.3816000000000001e-05, + "loss": 0.2442, + "step": 3454 + }, + { + "epoch": 0.06912, + "grad_norm": 1.2070279121398926, + "learning_rate": 1.3824000000000002e-05, + "loss": 0.222, + "step": 3456 + }, + { + "epoch": 0.06916, + "grad_norm": 1.0537116527557373, + "learning_rate": 1.3832e-05, + "loss": 0.1937, + "step": 3458 + }, + { + "epoch": 0.0692, + "grad_norm": 1.5934107303619385, + "learning_rate": 1.384e-05, + "loss": 0.353, + "step": 3460 + }, + { + "epoch": 0.06924, + "grad_norm": 0.9291589856147766, + "learning_rate": 1.3848000000000002e-05, + "loss": 0.1446, + "step": 3462 + }, + { + "epoch": 0.06928, + "grad_norm": 0.6940327286720276, + "learning_rate": 1.3856e-05, + "loss": 0.1179, + "step": 3464 + }, + { + "epoch": 0.06932, + "grad_norm": 1.9046045541763306, + "learning_rate": 1.3864000000000001e-05, + "loss": 0.3805, + "step": 3466 + }, + { + "epoch": 0.06936, + "grad_norm": 1.7215635776519775, + "learning_rate": 1.3872e-05, + "loss": 0.3523, + "step": 3468 + }, + { + "epoch": 0.0694, + "grad_norm": 0.8688755631446838, + "learning_rate": 1.3880000000000001e-05, + "loss": 0.1516, + "step": 3470 + }, + { + "epoch": 0.06944, + "grad_norm": 0.9466032981872559, + "learning_rate": 1.3888000000000002e-05, + "loss": 0.1597, + "step": 3472 + }, + { + "epoch": 0.06948, + "grad_norm": 0.8650171160697937, + "learning_rate": 1.3896e-05, + "loss": 0.1248, + "step": 3474 + }, + { + "epoch": 0.06952, + "grad_norm": 1.5906691551208496, + "learning_rate": 1.3904000000000003e-05, + "loss": 0.3029, + "step": 3476 + }, + { + "epoch": 0.06956, + "grad_norm": 1.5652605295181274, + "learning_rate": 1.3912000000000002e-05, + "loss": 0.2506, + "step": 3478 + }, + { + "epoch": 0.0696, + "grad_norm": 1.2757047414779663, + "learning_rate": 1.392e-05, + "loss": 0.2535, + "step": 3480 + }, + { + "epoch": 0.06964, + "grad_norm": 1.0378509759902954, + "learning_rate": 1.3928000000000001e-05, + "loss": 0.2126, + "step": 3482 + }, + { + "epoch": 0.06968, + "grad_norm": 0.7659558653831482, + "learning_rate": 1.3936e-05, + "loss": 0.1176, + "step": 3484 + }, + { + "epoch": 0.06972, + "grad_norm": 1.241816759109497, + "learning_rate": 1.3944000000000002e-05, + "loss": 0.2535, + "step": 3486 + }, + { + "epoch": 0.06976, + "grad_norm": 0.9848979115486145, + "learning_rate": 1.3952000000000001e-05, + "loss": 0.3011, + "step": 3488 + }, + { + "epoch": 0.0698, + "grad_norm": 1.4359294176101685, + "learning_rate": 1.396e-05, + "loss": 0.2379, + "step": 3490 + }, + { + "epoch": 0.06984, + "grad_norm": 1.6194267272949219, + "learning_rate": 1.3968e-05, + "loss": 0.2424, + "step": 3492 + }, + { + "epoch": 0.06988, + "grad_norm": 1.0628589391708374, + "learning_rate": 1.3976000000000001e-05, + "loss": 0.2346, + "step": 3494 + }, + { + "epoch": 0.06992, + "grad_norm": 1.284287452697754, + "learning_rate": 1.3984000000000002e-05, + "loss": 0.2326, + "step": 3496 + }, + { + "epoch": 0.06996, + "grad_norm": 1.131786823272705, + "learning_rate": 1.3992000000000001e-05, + "loss": 0.2024, + "step": 3498 + }, + { + "epoch": 0.07, + "grad_norm": 1.413069248199463, + "learning_rate": 1.4e-05, + "loss": 0.2175, + "step": 3500 + }, + { + "epoch": 0.07004, + "grad_norm": 1.390675663948059, + "learning_rate": 1.4008000000000002e-05, + "loss": 0.2258, + "step": 3502 + }, + { + "epoch": 0.07008, + "grad_norm": 1.4801403284072876, + "learning_rate": 1.4016000000000001e-05, + "loss": 0.3002, + "step": 3504 + }, + { + "epoch": 0.07012, + "grad_norm": 1.1887266635894775, + "learning_rate": 1.4024000000000002e-05, + "loss": 0.2219, + "step": 3506 + }, + { + "epoch": 0.07016, + "grad_norm": 1.0246108770370483, + "learning_rate": 1.4032e-05, + "loss": 0.2233, + "step": 3508 + }, + { + "epoch": 0.0702, + "grad_norm": 0.8625991940498352, + "learning_rate": 1.4040000000000001e-05, + "loss": 0.1919, + "step": 3510 + }, + { + "epoch": 0.07024, + "grad_norm": 1.1141256093978882, + "learning_rate": 1.4048000000000002e-05, + "loss": 0.1709, + "step": 3512 + }, + { + "epoch": 0.07028, + "grad_norm": 1.1223608255386353, + "learning_rate": 1.4056e-05, + "loss": 0.2219, + "step": 3514 + }, + { + "epoch": 0.07032, + "grad_norm": 0.8260738253593445, + "learning_rate": 1.4064000000000003e-05, + "loss": 0.1967, + "step": 3516 + }, + { + "epoch": 0.07036, + "grad_norm": 1.1901447772979736, + "learning_rate": 1.4072000000000002e-05, + "loss": 0.2125, + "step": 3518 + }, + { + "epoch": 0.0704, + "grad_norm": 1.2341339588165283, + "learning_rate": 1.408e-05, + "loss": 0.2218, + "step": 3520 + }, + { + "epoch": 0.07044, + "grad_norm": 1.1961063146591187, + "learning_rate": 1.4088000000000001e-05, + "loss": 0.1778, + "step": 3522 + }, + { + "epoch": 0.07048, + "grad_norm": 1.2075810432434082, + "learning_rate": 1.4096e-05, + "loss": 0.2319, + "step": 3524 + }, + { + "epoch": 0.07052, + "grad_norm": 0.6394560933113098, + "learning_rate": 1.4104000000000003e-05, + "loss": 0.2053, + "step": 3526 + }, + { + "epoch": 0.07056, + "grad_norm": 1.057494878768921, + "learning_rate": 1.4112000000000001e-05, + "loss": 0.1596, + "step": 3528 + }, + { + "epoch": 0.0706, + "grad_norm": 0.7919533252716064, + "learning_rate": 1.412e-05, + "loss": 0.3094, + "step": 3530 + }, + { + "epoch": 0.07064, + "grad_norm": 1.1525452136993408, + "learning_rate": 1.4128000000000001e-05, + "loss": 0.2231, + "step": 3532 + }, + { + "epoch": 0.07068, + "grad_norm": 0.45033857226371765, + "learning_rate": 1.4136000000000002e-05, + "loss": 0.1329, + "step": 3534 + }, + { + "epoch": 0.07072, + "grad_norm": 0.4025343060493469, + "learning_rate": 1.4144000000000002e-05, + "loss": 0.1743, + "step": 3536 + }, + { + "epoch": 0.07076, + "grad_norm": 1.8443666696548462, + "learning_rate": 1.4152000000000001e-05, + "loss": 0.34, + "step": 3538 + }, + { + "epoch": 0.0708, + "grad_norm": 1.6619844436645508, + "learning_rate": 1.416e-05, + "loss": 0.2154, + "step": 3540 + }, + { + "epoch": 0.07084, + "grad_norm": 0.7092574834823608, + "learning_rate": 1.4168000000000002e-05, + "loss": 0.4207, + "step": 3542 + }, + { + "epoch": 0.07088, + "grad_norm": 1.0335770845413208, + "learning_rate": 1.4176000000000001e-05, + "loss": 0.167, + "step": 3544 + }, + { + "epoch": 0.07092, + "grad_norm": 1.0696958303451538, + "learning_rate": 1.4184000000000002e-05, + "loss": 0.2637, + "step": 3546 + }, + { + "epoch": 0.07096, + "grad_norm": 1.5977121591567993, + "learning_rate": 1.4192e-05, + "loss": 0.3059, + "step": 3548 + }, + { + "epoch": 0.071, + "grad_norm": 1.0432473421096802, + "learning_rate": 1.4200000000000001e-05, + "loss": 0.2023, + "step": 3550 + }, + { + "epoch": 0.07104, + "grad_norm": 0.9710912704467773, + "learning_rate": 1.4208000000000002e-05, + "loss": 0.1841, + "step": 3552 + }, + { + "epoch": 0.07108, + "grad_norm": 1.6840708255767822, + "learning_rate": 1.4216e-05, + "loss": 0.2281, + "step": 3554 + }, + { + "epoch": 0.07112, + "grad_norm": 0.9869792461395264, + "learning_rate": 1.4224000000000003e-05, + "loss": 0.2777, + "step": 3556 + }, + { + "epoch": 0.07116, + "grad_norm": 0.5709349513053894, + "learning_rate": 1.4232000000000002e-05, + "loss": 0.1075, + "step": 3558 + }, + { + "epoch": 0.0712, + "grad_norm": 0.8481937646865845, + "learning_rate": 1.4240000000000001e-05, + "loss": 0.1247, + "step": 3560 + }, + { + "epoch": 0.07124, + "grad_norm": 0.8284156322479248, + "learning_rate": 1.4248000000000001e-05, + "loss": 0.2548, + "step": 3562 + }, + { + "epoch": 0.07128, + "grad_norm": 0.7816351056098938, + "learning_rate": 1.4256e-05, + "loss": 0.3323, + "step": 3564 + }, + { + "epoch": 0.07132, + "grad_norm": 1.904789686203003, + "learning_rate": 1.4264e-05, + "loss": 0.2281, + "step": 3566 + }, + { + "epoch": 0.07136, + "grad_norm": 0.6201867461204529, + "learning_rate": 1.4272000000000002e-05, + "loss": 0.1073, + "step": 3568 + }, + { + "epoch": 0.0714, + "grad_norm": 0.758331835269928, + "learning_rate": 1.428e-05, + "loss": 0.2346, + "step": 3570 + }, + { + "epoch": 0.07144, + "grad_norm": 1.967246413230896, + "learning_rate": 1.4288000000000001e-05, + "loss": 0.2721, + "step": 3572 + }, + { + "epoch": 0.07148, + "grad_norm": 1.4796680212020874, + "learning_rate": 1.4296000000000002e-05, + "loss": 0.3528, + "step": 3574 + }, + { + "epoch": 0.07152, + "grad_norm": 0.7902001142501831, + "learning_rate": 1.4304e-05, + "loss": 0.2278, + "step": 3576 + }, + { + "epoch": 0.07156, + "grad_norm": 1.4275317192077637, + "learning_rate": 1.4312000000000001e-05, + "loss": 0.2771, + "step": 3578 + }, + { + "epoch": 0.0716, + "grad_norm": 1.3200522661209106, + "learning_rate": 1.432e-05, + "loss": 0.2327, + "step": 3580 + }, + { + "epoch": 0.07164, + "grad_norm": 1.1139793395996094, + "learning_rate": 1.4328000000000002e-05, + "loss": 0.2348, + "step": 3582 + }, + { + "epoch": 0.07168, + "grad_norm": 0.9984740614891052, + "learning_rate": 1.4336000000000001e-05, + "loss": 0.1759, + "step": 3584 + }, + { + "epoch": 0.07172, + "grad_norm": 1.909267783164978, + "learning_rate": 1.4344e-05, + "loss": 0.2354, + "step": 3586 + }, + { + "epoch": 0.07176, + "grad_norm": 2.4918625354766846, + "learning_rate": 1.4352e-05, + "loss": 0.2569, + "step": 3588 + }, + { + "epoch": 0.0718, + "grad_norm": 1.0520251989364624, + "learning_rate": 1.4360000000000001e-05, + "loss": 0.1072, + "step": 3590 + }, + { + "epoch": 0.07184, + "grad_norm": 0.6004278659820557, + "learning_rate": 1.4368000000000002e-05, + "loss": 0.2845, + "step": 3592 + }, + { + "epoch": 0.07188, + "grad_norm": 0.6710703372955322, + "learning_rate": 1.4376000000000001e-05, + "loss": 0.0802, + "step": 3594 + }, + { + "epoch": 0.07192, + "grad_norm": 0.5675949454307556, + "learning_rate": 1.4384e-05, + "loss": 0.2809, + "step": 3596 + }, + { + "epoch": 0.07196, + "grad_norm": 0.8083308339118958, + "learning_rate": 1.4392000000000002e-05, + "loss": 0.3144, + "step": 3598 + }, + { + "epoch": 0.072, + "grad_norm": 1.9125927686691284, + "learning_rate": 1.4400000000000001e-05, + "loss": 0.2031, + "step": 3600 + }, + { + "epoch": 0.07204, + "grad_norm": 2.091763496398926, + "learning_rate": 1.4408000000000002e-05, + "loss": 0.4238, + "step": 3602 + }, + { + "epoch": 0.07208, + "grad_norm": 1.0892051458358765, + "learning_rate": 1.4416e-05, + "loss": 0.1589, + "step": 3604 + }, + { + "epoch": 0.07212, + "grad_norm": 1.1458592414855957, + "learning_rate": 1.4424e-05, + "loss": 0.204, + "step": 3606 + }, + { + "epoch": 0.07216, + "grad_norm": 1.452718734741211, + "learning_rate": 1.4432000000000002e-05, + "loss": 0.2064, + "step": 3608 + }, + { + "epoch": 0.0722, + "grad_norm": 1.0692187547683716, + "learning_rate": 1.444e-05, + "loss": 0.1671, + "step": 3610 + }, + { + "epoch": 0.07224, + "grad_norm": 1.537795066833496, + "learning_rate": 1.4448000000000001e-05, + "loss": 0.2544, + "step": 3612 + }, + { + "epoch": 0.07228, + "grad_norm": 1.1776238679885864, + "learning_rate": 1.4456000000000002e-05, + "loss": 0.1685, + "step": 3614 + }, + { + "epoch": 0.07232, + "grad_norm": 0.9686689972877502, + "learning_rate": 1.4464e-05, + "loss": 0.1443, + "step": 3616 + }, + { + "epoch": 0.07236, + "grad_norm": 1.8658581972122192, + "learning_rate": 1.4472000000000001e-05, + "loss": 0.2922, + "step": 3618 + }, + { + "epoch": 0.0724, + "grad_norm": 1.658939003944397, + "learning_rate": 1.448e-05, + "loss": 0.4107, + "step": 3620 + }, + { + "epoch": 0.07244, + "grad_norm": 1.4172701835632324, + "learning_rate": 1.4488000000000003e-05, + "loss": 0.3003, + "step": 3622 + }, + { + "epoch": 0.07248, + "grad_norm": 1.258033275604248, + "learning_rate": 1.4496000000000001e-05, + "loss": 0.2061, + "step": 3624 + }, + { + "epoch": 0.07252, + "grad_norm": 1.3725205659866333, + "learning_rate": 1.4504e-05, + "loss": 0.301, + "step": 3626 + }, + { + "epoch": 0.07256, + "grad_norm": 1.371358036994934, + "learning_rate": 1.4512000000000001e-05, + "loss": 0.2765, + "step": 3628 + }, + { + "epoch": 0.0726, + "grad_norm": 0.9109688997268677, + "learning_rate": 1.4520000000000002e-05, + "loss": 0.1447, + "step": 3630 + }, + { + "epoch": 0.07264, + "grad_norm": 0.9287992119789124, + "learning_rate": 1.4528000000000002e-05, + "loss": 0.1763, + "step": 3632 + }, + { + "epoch": 0.07268, + "grad_norm": 1.039117455482483, + "learning_rate": 1.4536000000000001e-05, + "loss": 0.2643, + "step": 3634 + }, + { + "epoch": 0.07272, + "grad_norm": 1.5568585395812988, + "learning_rate": 1.4544e-05, + "loss": 0.2885, + "step": 3636 + }, + { + "epoch": 0.07276, + "grad_norm": 1.4213144779205322, + "learning_rate": 1.4552000000000002e-05, + "loss": 0.2103, + "step": 3638 + }, + { + "epoch": 0.0728, + "grad_norm": 1.0280758142471313, + "learning_rate": 1.4560000000000001e-05, + "loss": 0.2781, + "step": 3640 + }, + { + "epoch": 0.07284, + "grad_norm": 1.3051283359527588, + "learning_rate": 1.4568000000000002e-05, + "loss": 0.2651, + "step": 3642 + }, + { + "epoch": 0.07288, + "grad_norm": 1.4106764793395996, + "learning_rate": 1.4576e-05, + "loss": 0.2383, + "step": 3644 + }, + { + "epoch": 0.07292, + "grad_norm": 1.0063711404800415, + "learning_rate": 1.4584e-05, + "loss": 0.2037, + "step": 3646 + }, + { + "epoch": 0.07296, + "grad_norm": 0.9925281405448914, + "learning_rate": 1.4592000000000002e-05, + "loss": 0.1936, + "step": 3648 + }, + { + "epoch": 0.073, + "grad_norm": 0.8032163381576538, + "learning_rate": 1.46e-05, + "loss": 0.2031, + "step": 3650 + }, + { + "epoch": 0.07304, + "grad_norm": 1.7162457704544067, + "learning_rate": 1.4608000000000001e-05, + "loss": 0.34, + "step": 3652 + }, + { + "epoch": 0.07308, + "grad_norm": 0.9404834508895874, + "learning_rate": 1.4616000000000002e-05, + "loss": 0.1245, + "step": 3654 + }, + { + "epoch": 0.07312, + "grad_norm": 1.7860606908798218, + "learning_rate": 1.4624000000000001e-05, + "loss": 0.4091, + "step": 3656 + }, + { + "epoch": 0.07316, + "grad_norm": 1.4453808069229126, + "learning_rate": 1.4632000000000002e-05, + "loss": 0.2543, + "step": 3658 + }, + { + "epoch": 0.0732, + "grad_norm": 0.9069640636444092, + "learning_rate": 1.464e-05, + "loss": 0.1368, + "step": 3660 + }, + { + "epoch": 0.07324, + "grad_norm": 1.0580672025680542, + "learning_rate": 1.4648000000000003e-05, + "loss": 0.2468, + "step": 3662 + }, + { + "epoch": 0.07328, + "grad_norm": 1.4719791412353516, + "learning_rate": 1.4656000000000002e-05, + "loss": 0.2102, + "step": 3664 + }, + { + "epoch": 0.07332, + "grad_norm": 0.8857517838478088, + "learning_rate": 1.4664e-05, + "loss": 0.178, + "step": 3666 + }, + { + "epoch": 0.07336, + "grad_norm": 1.0032685995101929, + "learning_rate": 1.4672000000000001e-05, + "loss": 0.2423, + "step": 3668 + }, + { + "epoch": 0.0734, + "grad_norm": 1.4047688245773315, + "learning_rate": 1.4680000000000002e-05, + "loss": 0.188, + "step": 3670 + }, + { + "epoch": 0.07344, + "grad_norm": 1.2754416465759277, + "learning_rate": 1.4688000000000002e-05, + "loss": 0.2594, + "step": 3672 + }, + { + "epoch": 0.07348, + "grad_norm": 0.9624300003051758, + "learning_rate": 1.4696000000000001e-05, + "loss": 0.1438, + "step": 3674 + }, + { + "epoch": 0.07352, + "grad_norm": 1.6493474245071411, + "learning_rate": 1.4704e-05, + "loss": 0.2279, + "step": 3676 + }, + { + "epoch": 0.07356, + "grad_norm": 1.5016306638717651, + "learning_rate": 1.4712000000000002e-05, + "loss": 0.1879, + "step": 3678 + }, + { + "epoch": 0.0736, + "grad_norm": 1.1527354717254639, + "learning_rate": 1.4720000000000001e-05, + "loss": 0.2867, + "step": 3680 + }, + { + "epoch": 0.07364, + "grad_norm": 1.5850659608840942, + "learning_rate": 1.4728000000000002e-05, + "loss": 0.2765, + "step": 3682 + }, + { + "epoch": 0.07368, + "grad_norm": 1.4731638431549072, + "learning_rate": 1.4736000000000001e-05, + "loss": 0.277, + "step": 3684 + }, + { + "epoch": 0.07372, + "grad_norm": 1.2773358821868896, + "learning_rate": 1.4744e-05, + "loss": 0.1957, + "step": 3686 + }, + { + "epoch": 0.07376, + "grad_norm": 1.6683952808380127, + "learning_rate": 1.4752000000000002e-05, + "loss": 0.2118, + "step": 3688 + }, + { + "epoch": 0.0738, + "grad_norm": 1.256117582321167, + "learning_rate": 1.4760000000000001e-05, + "loss": 0.2025, + "step": 3690 + }, + { + "epoch": 0.07384, + "grad_norm": 1.523796796798706, + "learning_rate": 1.4768e-05, + "loss": 0.2221, + "step": 3692 + }, + { + "epoch": 0.07388, + "grad_norm": 1.520205020904541, + "learning_rate": 1.4776000000000002e-05, + "loss": 0.313, + "step": 3694 + }, + { + "epoch": 0.07392, + "grad_norm": 1.1466065645217896, + "learning_rate": 1.4784000000000001e-05, + "loss": 0.1936, + "step": 3696 + }, + { + "epoch": 0.07396, + "grad_norm": 1.1140366792678833, + "learning_rate": 1.4792000000000002e-05, + "loss": 0.2466, + "step": 3698 + }, + { + "epoch": 0.074, + "grad_norm": 1.2415860891342163, + "learning_rate": 1.48e-05, + "loss": 0.1854, + "step": 3700 + }, + { + "epoch": 0.07404, + "grad_norm": 1.626991868019104, + "learning_rate": 1.4808e-05, + "loss": 0.277, + "step": 3702 + }, + { + "epoch": 0.07408, + "grad_norm": 1.2466593980789185, + "learning_rate": 1.4816000000000002e-05, + "loss": 0.2232, + "step": 3704 + }, + { + "epoch": 0.07412, + "grad_norm": 1.1414291858673096, + "learning_rate": 1.4824e-05, + "loss": 0.1708, + "step": 3706 + }, + { + "epoch": 0.07416, + "grad_norm": 1.4647233486175537, + "learning_rate": 1.4832000000000001e-05, + "loss": 0.1686, + "step": 3708 + }, + { + "epoch": 0.0742, + "grad_norm": 1.5783674716949463, + "learning_rate": 1.4840000000000002e-05, + "loss": 0.2542, + "step": 3710 + }, + { + "epoch": 0.07424, + "grad_norm": 1.9699448347091675, + "learning_rate": 1.4848e-05, + "loss": 0.3674, + "step": 3712 + }, + { + "epoch": 0.07428, + "grad_norm": 1.5950137376785278, + "learning_rate": 1.4856000000000001e-05, + "loss": 0.303, + "step": 3714 + }, + { + "epoch": 0.07432, + "grad_norm": 1.533947467803955, + "learning_rate": 1.4864e-05, + "loss": 0.2541, + "step": 3716 + }, + { + "epoch": 0.07436, + "grad_norm": 1.138789415359497, + "learning_rate": 1.4872000000000003e-05, + "loss": 0.2593, + "step": 3718 + }, + { + "epoch": 0.0744, + "grad_norm": 1.5689197778701782, + "learning_rate": 1.4880000000000002e-05, + "loss": 0.2125, + "step": 3720 + }, + { + "epoch": 0.07444, + "grad_norm": 1.7709120512008667, + "learning_rate": 1.4888e-05, + "loss": 0.3009, + "step": 3722 + }, + { + "epoch": 0.07448, + "grad_norm": 1.4901987314224243, + "learning_rate": 1.4896000000000001e-05, + "loss": 0.2594, + "step": 3724 + }, + { + "epoch": 0.07452, + "grad_norm": 1.2525999546051025, + "learning_rate": 1.4904e-05, + "loss": 0.2686, + "step": 3726 + }, + { + "epoch": 0.07456, + "grad_norm": 1.1192362308502197, + "learning_rate": 1.4912000000000002e-05, + "loss": 0.2038, + "step": 3728 + }, + { + "epoch": 0.0746, + "grad_norm": 1.3774000406265259, + "learning_rate": 1.4920000000000001e-05, + "loss": 0.2024, + "step": 3730 + }, + { + "epoch": 0.07464, + "grad_norm": 1.2776662111282349, + "learning_rate": 1.4928e-05, + "loss": 0.2542, + "step": 3732 + }, + { + "epoch": 0.07468, + "grad_norm": 1.4504011869430542, + "learning_rate": 1.4936000000000002e-05, + "loss": 0.2221, + "step": 3734 + }, + { + "epoch": 0.07472, + "grad_norm": 1.450835943222046, + "learning_rate": 1.4944000000000001e-05, + "loss": 0.1917, + "step": 3736 + }, + { + "epoch": 0.07476, + "grad_norm": 1.882724642753601, + "learning_rate": 1.4952000000000002e-05, + "loss": 0.2778, + "step": 3738 + }, + { + "epoch": 0.0748, + "grad_norm": 0.6929258108139038, + "learning_rate": 1.496e-05, + "loss": 0.2569, + "step": 3740 + }, + { + "epoch": 0.07484, + "grad_norm": 1.593355417251587, + "learning_rate": 1.4968e-05, + "loss": 0.2151, + "step": 3742 + }, + { + "epoch": 0.07488, + "grad_norm": 1.4211063385009766, + "learning_rate": 1.4976000000000002e-05, + "loss": 0.2346, + "step": 3744 + }, + { + "epoch": 0.07492, + "grad_norm": 0.84869784116745, + "learning_rate": 1.4984000000000001e-05, + "loss": 0.1122, + "step": 3746 + }, + { + "epoch": 0.07496, + "grad_norm": 1.1938934326171875, + "learning_rate": 1.4992000000000001e-05, + "loss": 0.2958, + "step": 3748 + }, + { + "epoch": 0.075, + "grad_norm": 0.9422699809074402, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.1379, + "step": 3750 + }, + { + "epoch": 0.07504, + "grad_norm": 1.1981996297836304, + "learning_rate": 1.5008000000000001e-05, + "loss": 0.2688, + "step": 3752 + }, + { + "epoch": 0.07508, + "grad_norm": 1.246044397354126, + "learning_rate": 1.5016000000000002e-05, + "loss": 0.2443, + "step": 3754 + }, + { + "epoch": 0.07512, + "grad_norm": 1.1230454444885254, + "learning_rate": 1.5024e-05, + "loss": 0.2024, + "step": 3756 + }, + { + "epoch": 0.07516, + "grad_norm": 1.2598665952682495, + "learning_rate": 1.5032000000000003e-05, + "loss": 0.2024, + "step": 3758 + }, + { + "epoch": 0.0752, + "grad_norm": 1.1607091426849365, + "learning_rate": 1.5040000000000002e-05, + "loss": 0.2125, + "step": 3760 + }, + { + "epoch": 0.07524, + "grad_norm": 1.5169909000396729, + "learning_rate": 1.5048e-05, + "loss": 0.3009, + "step": 3762 + }, + { + "epoch": 0.07528, + "grad_norm": 1.5544832944869995, + "learning_rate": 1.5056000000000001e-05, + "loss": 0.2648, + "step": 3764 + }, + { + "epoch": 0.07532, + "grad_norm": 1.8809185028076172, + "learning_rate": 1.5064e-05, + "loss": 0.2507, + "step": 3766 + }, + { + "epoch": 0.07536, + "grad_norm": 0.8778438568115234, + "learning_rate": 1.5072000000000002e-05, + "loss": 0.1062, + "step": 3768 + }, + { + "epoch": 0.0754, + "grad_norm": 2.227802276611328, + "learning_rate": 1.5080000000000001e-05, + "loss": 0.4239, + "step": 3770 + }, + { + "epoch": 0.07544, + "grad_norm": 0.7673084139823914, + "learning_rate": 1.5088e-05, + "loss": 0.0899, + "step": 3772 + }, + { + "epoch": 0.07548, + "grad_norm": 1.146222472190857, + "learning_rate": 1.5096000000000003e-05, + "loss": 0.1446, + "step": 3774 + }, + { + "epoch": 0.07552, + "grad_norm": 0.9319490194320679, + "learning_rate": 1.5104000000000001e-05, + "loss": 0.1115, + "step": 3776 + }, + { + "epoch": 0.07556, + "grad_norm": 2.5422964096069336, + "learning_rate": 1.5112000000000002e-05, + "loss": 0.2936, + "step": 3778 + }, + { + "epoch": 0.0756, + "grad_norm": 0.8567708730697632, + "learning_rate": 1.5120000000000001e-05, + "loss": 0.09, + "step": 3780 + }, + { + "epoch": 0.07564, + "grad_norm": 2.587541341781616, + "learning_rate": 1.5128e-05, + "loss": 0.4703, + "step": 3782 + }, + { + "epoch": 0.07568, + "grad_norm": 2.295764923095703, + "learning_rate": 1.5136000000000002e-05, + "loss": 0.4716, + "step": 3784 + }, + { + "epoch": 0.07572, + "grad_norm": 1.1814360618591309, + "learning_rate": 1.5144000000000001e-05, + "loss": 0.1672, + "step": 3786 + }, + { + "epoch": 0.07576, + "grad_norm": 1.464924931526184, + "learning_rate": 1.5152000000000002e-05, + "loss": 0.1918, + "step": 3788 + }, + { + "epoch": 0.0758, + "grad_norm": 1.483378291130066, + "learning_rate": 1.516e-05, + "loss": 0.2544, + "step": 3790 + }, + { + "epoch": 0.07584, + "grad_norm": 1.0949760675430298, + "learning_rate": 1.5168000000000001e-05, + "loss": 0.2466, + "step": 3792 + }, + { + "epoch": 0.07588, + "grad_norm": 1.5987359285354614, + "learning_rate": 1.5176000000000002e-05, + "loss": 0.2649, + "step": 3794 + }, + { + "epoch": 0.07592, + "grad_norm": 1.2861148118972778, + "learning_rate": 1.5184e-05, + "loss": 0.277, + "step": 3796 + }, + { + "epoch": 0.07596, + "grad_norm": 1.267193078994751, + "learning_rate": 1.5192000000000003e-05, + "loss": 0.232, + "step": 3798 + }, + { + "epoch": 0.076, + "grad_norm": 1.2787014245986938, + "learning_rate": 1.5200000000000002e-05, + "loss": 0.1989, + "step": 3800 + }, + { + "epoch": 0.07604, + "grad_norm": 1.171817421913147, + "learning_rate": 1.5208e-05, + "loss": 0.1957, + "step": 3802 + }, + { + "epoch": 0.07608, + "grad_norm": 1.3087754249572754, + "learning_rate": 1.5216000000000001e-05, + "loss": 0.2562, + "step": 3804 + }, + { + "epoch": 0.07612, + "grad_norm": 1.4428938627243042, + "learning_rate": 1.5224e-05, + "loss": 0.3005, + "step": 3806 + }, + { + "epoch": 0.07616, + "grad_norm": 1.1188089847564697, + "learning_rate": 1.5232000000000003e-05, + "loss": 0.222, + "step": 3808 + }, + { + "epoch": 0.0762, + "grad_norm": 1.2211463451385498, + "learning_rate": 1.5240000000000001e-05, + "loss": 0.2649, + "step": 3810 + }, + { + "epoch": 0.07624, + "grad_norm": 0.9540471434593201, + "learning_rate": 1.5248e-05, + "loss": 0.1931, + "step": 3812 + }, + { + "epoch": 0.07628, + "grad_norm": 1.0028493404388428, + "learning_rate": 1.5256000000000003e-05, + "loss": 0.1712, + "step": 3814 + }, + { + "epoch": 0.07632, + "grad_norm": 0.9392350912094116, + "learning_rate": 1.5264e-05, + "loss": 0.2146, + "step": 3816 + }, + { + "epoch": 0.07636, + "grad_norm": 1.1602119207382202, + "learning_rate": 1.5272e-05, + "loss": 0.2038, + "step": 3818 + }, + { + "epoch": 0.0764, + "grad_norm": 0.9701266884803772, + "learning_rate": 1.5280000000000003e-05, + "loss": 0.1843, + "step": 3820 + }, + { + "epoch": 0.07644, + "grad_norm": 1.1227836608886719, + "learning_rate": 1.5288e-05, + "loss": 0.2543, + "step": 3822 + }, + { + "epoch": 0.07648, + "grad_norm": 1.1142396926879883, + "learning_rate": 1.5296e-05, + "loss": 0.2429, + "step": 3824 + }, + { + "epoch": 0.07652, + "grad_norm": 1.3046536445617676, + "learning_rate": 1.5304e-05, + "loss": 0.2441, + "step": 3826 + }, + { + "epoch": 0.07656, + "grad_norm": 1.0184521675109863, + "learning_rate": 1.5312000000000002e-05, + "loss": 0.1931, + "step": 3828 + }, + { + "epoch": 0.0766, + "grad_norm": 1.1380181312561035, + "learning_rate": 1.5320000000000002e-05, + "loss": 0.2896, + "step": 3830 + }, + { + "epoch": 0.07664, + "grad_norm": 0.9025845527648926, + "learning_rate": 1.5328e-05, + "loss": 0.1445, + "step": 3832 + }, + { + "epoch": 0.07668, + "grad_norm": 1.0801448822021484, + "learning_rate": 1.5336000000000004e-05, + "loss": 0.1935, + "step": 3834 + }, + { + "epoch": 0.07672, + "grad_norm": 1.0308893918991089, + "learning_rate": 1.5344e-05, + "loss": 0.1464, + "step": 3836 + }, + { + "epoch": 0.07676, + "grad_norm": 1.1166125535964966, + "learning_rate": 1.5352e-05, + "loss": 0.1587, + "step": 3838 + }, + { + "epoch": 0.0768, + "grad_norm": 2.010183095932007, + "learning_rate": 1.5360000000000002e-05, + "loss": 0.3315, + "step": 3840 + }, + { + "epoch": 0.07684, + "grad_norm": 1.3305646181106567, + "learning_rate": 1.5368e-05, + "loss": 0.2768, + "step": 3842 + }, + { + "epoch": 0.07688, + "grad_norm": 0.9270874261856079, + "learning_rate": 1.5376000000000003e-05, + "loss": 0.3145, + "step": 3844 + }, + { + "epoch": 0.07692, + "grad_norm": 1.092678427696228, + "learning_rate": 1.5384e-05, + "loss": 0.1928, + "step": 3846 + }, + { + "epoch": 0.07696, + "grad_norm": 0.8483322262763977, + "learning_rate": 1.5392e-05, + "loss": 0.1523, + "step": 3848 + }, + { + "epoch": 0.077, + "grad_norm": 0.7298541069030762, + "learning_rate": 1.54e-05, + "loss": 0.1266, + "step": 3850 + }, + { + "epoch": 0.07704, + "grad_norm": 2.4266042709350586, + "learning_rate": 1.5408000000000002e-05, + "loss": 0.4298, + "step": 3852 + }, + { + "epoch": 0.07708, + "grad_norm": 2.4624226093292236, + "learning_rate": 1.5416000000000003e-05, + "loss": 0.3217, + "step": 3854 + }, + { + "epoch": 0.07712, + "grad_norm": 1.8468472957611084, + "learning_rate": 1.5424e-05, + "loss": 0.3166, + "step": 3856 + }, + { + "epoch": 0.07716, + "grad_norm": 1.0525224208831787, + "learning_rate": 1.5432e-05, + "loss": 0.1215, + "step": 3858 + }, + { + "epoch": 0.0772, + "grad_norm": 1.3043005466461182, + "learning_rate": 1.544e-05, + "loss": 0.2788, + "step": 3860 + }, + { + "epoch": 0.07724, + "grad_norm": 1.2234420776367188, + "learning_rate": 1.5448000000000002e-05, + "loss": 0.2119, + "step": 3862 + }, + { + "epoch": 0.07728, + "grad_norm": 1.265946626663208, + "learning_rate": 1.5456000000000002e-05, + "loss": 0.2771, + "step": 3864 + }, + { + "epoch": 0.07732, + "grad_norm": 1.8008368015289307, + "learning_rate": 1.5464e-05, + "loss": 0.3803, + "step": 3866 + }, + { + "epoch": 0.07736, + "grad_norm": 1.1337114572525024, + "learning_rate": 1.5472e-05, + "loss": 0.2788, + "step": 3868 + }, + { + "epoch": 0.0774, + "grad_norm": 0.9888458847999573, + "learning_rate": 1.548e-05, + "loss": 0.2024, + "step": 3870 + }, + { + "epoch": 0.07744, + "grad_norm": 0.7548109292984009, + "learning_rate": 1.5488e-05, + "loss": 0.1448, + "step": 3872 + }, + { + "epoch": 0.07748, + "grad_norm": 1.0323429107666016, + "learning_rate": 1.5496000000000002e-05, + "loss": 0.3106, + "step": 3874 + }, + { + "epoch": 0.07752, + "grad_norm": 0.8684133291244507, + "learning_rate": 1.5504000000000003e-05, + "loss": 0.2427, + "step": 3876 + }, + { + "epoch": 0.07756, + "grad_norm": 1.0497536659240723, + "learning_rate": 1.5512e-05, + "loss": 0.244, + "step": 3878 + }, + { + "epoch": 0.0776, + "grad_norm": 1.0301234722137451, + "learning_rate": 1.552e-05, + "loss": 0.2127, + "step": 3880 + }, + { + "epoch": 0.07764, + "grad_norm": 1.2946466207504272, + "learning_rate": 1.5528e-05, + "loss": 0.2146, + "step": 3882 + }, + { + "epoch": 0.07768, + "grad_norm": 1.2091281414031982, + "learning_rate": 1.5536e-05, + "loss": 0.2535, + "step": 3884 + }, + { + "epoch": 0.07772, + "grad_norm": 0.9580293893814087, + "learning_rate": 1.5544000000000002e-05, + "loss": 0.1935, + "step": 3886 + }, + { + "epoch": 0.07776, + "grad_norm": 1.5131900310516357, + "learning_rate": 1.5552e-05, + "loss": 0.2899, + "step": 3888 + }, + { + "epoch": 0.0778, + "grad_norm": 1.6066267490386963, + "learning_rate": 1.556e-05, + "loss": 0.2412, + "step": 3890 + }, + { + "epoch": 0.07784, + "grad_norm": 1.3287827968597412, + "learning_rate": 1.5568e-05, + "loss": 0.2541, + "step": 3892 + }, + { + "epoch": 0.07788, + "grad_norm": 0.8129069805145264, + "learning_rate": 1.5576e-05, + "loss": 0.192, + "step": 3894 + }, + { + "epoch": 0.07792, + "grad_norm": 0.9389771819114685, + "learning_rate": 1.5584000000000002e-05, + "loss": 0.1763, + "step": 3896 + }, + { + "epoch": 0.07796, + "grad_norm": 1.5012221336364746, + "learning_rate": 1.5592e-05, + "loss": 0.2895, + "step": 3898 + }, + { + "epoch": 0.078, + "grad_norm": 1.9337244033813477, + "learning_rate": 1.5600000000000003e-05, + "loss": 0.3696, + "step": 3900 + }, + { + "epoch": 0.07804, + "grad_norm": 0.8082063794136047, + "learning_rate": 1.5608e-05, + "loss": 0.1988, + "step": 3902 + }, + { + "epoch": 0.07808, + "grad_norm": 0.982409656047821, + "learning_rate": 1.5616e-05, + "loss": 0.193, + "step": 3904 + }, + { + "epoch": 0.07812, + "grad_norm": 1.1451822519302368, + "learning_rate": 1.5624e-05, + "loss": 0.1881, + "step": 3906 + }, + { + "epoch": 0.07816, + "grad_norm": 1.129663348197937, + "learning_rate": 1.5632000000000002e-05, + "loss": 0.2535, + "step": 3908 + }, + { + "epoch": 0.0782, + "grad_norm": 1.1826157569885254, + "learning_rate": 1.5640000000000003e-05, + "loss": 0.2898, + "step": 3910 + }, + { + "epoch": 0.07824, + "grad_norm": 0.8734445571899414, + "learning_rate": 1.5648e-05, + "loss": 0.1516, + "step": 3912 + }, + { + "epoch": 0.07828, + "grad_norm": 0.8909580707550049, + "learning_rate": 1.5656000000000004e-05, + "loss": 0.1936, + "step": 3914 + }, + { + "epoch": 0.07832, + "grad_norm": 1.0161789655685425, + "learning_rate": 1.5664e-05, + "loss": 0.2024, + "step": 3916 + }, + { + "epoch": 0.07836, + "grad_norm": 1.020602822303772, + "learning_rate": 1.5672000000000002e-05, + "loss": 0.2467, + "step": 3918 + }, + { + "epoch": 0.0784, + "grad_norm": 1.1887476444244385, + "learning_rate": 1.5680000000000002e-05, + "loss": 0.2428, + "step": 3920 + }, + { + "epoch": 0.07844, + "grad_norm": 0.8532394766807556, + "learning_rate": 1.5688e-05, + "loss": 0.1368, + "step": 3922 + }, + { + "epoch": 0.07848, + "grad_norm": 1.5416443347930908, + "learning_rate": 1.5696000000000004e-05, + "loss": 0.2466, + "step": 3924 + }, + { + "epoch": 0.07852, + "grad_norm": 1.713223934173584, + "learning_rate": 1.5704e-05, + "loss": 0.3543, + "step": 3926 + }, + { + "epoch": 0.07856, + "grad_norm": 1.3488306999206543, + "learning_rate": 1.5712e-05, + "loss": 0.2897, + "step": 3928 + }, + { + "epoch": 0.0786, + "grad_norm": 0.5228361487388611, + "learning_rate": 1.5720000000000002e-05, + "loss": 0.1441, + "step": 3930 + }, + { + "epoch": 0.07864, + "grad_norm": 1.1224550008773804, + "learning_rate": 1.5728000000000003e-05, + "loss": 0.3062, + "step": 3932 + }, + { + "epoch": 0.07868, + "grad_norm": 1.1405080556869507, + "learning_rate": 1.5736000000000003e-05, + "loss": 0.1881, + "step": 3934 + }, + { + "epoch": 0.07872, + "grad_norm": 0.7095906138420105, + "learning_rate": 1.5744e-05, + "loss": 0.1463, + "step": 3936 + }, + { + "epoch": 0.07876, + "grad_norm": 0.8990368843078613, + "learning_rate": 1.5752e-05, + "loss": 0.1597, + "step": 3938 + }, + { + "epoch": 0.0788, + "grad_norm": 0.6129059791564941, + "learning_rate": 1.576e-05, + "loss": 0.2188, + "step": 3940 + }, + { + "epoch": 0.07884, + "grad_norm": 0.8410907983779907, + "learning_rate": 1.5768000000000002e-05, + "loss": 0.3158, + "step": 3942 + }, + { + "epoch": 0.07888, + "grad_norm": 1.7135522365570068, + "learning_rate": 1.5776e-05, + "loss": 0.2188, + "step": 3944 + }, + { + "epoch": 0.07892, + "grad_norm": 0.8836010694503784, + "learning_rate": 1.5784e-05, + "loss": 0.2032, + "step": 3946 + }, + { + "epoch": 0.07896, + "grad_norm": 1.497629165649414, + "learning_rate": 1.5792e-05, + "loss": 0.4177, + "step": 3948 + }, + { + "epoch": 0.079, + "grad_norm": 0.8612191677093506, + "learning_rate": 1.58e-05, + "loss": 0.1371, + "step": 3950 + }, + { + "epoch": 0.07904, + "grad_norm": 1.3593758344650269, + "learning_rate": 1.5808000000000002e-05, + "loss": 0.2347, + "step": 3952 + }, + { + "epoch": 0.07908, + "grad_norm": 1.1692798137664795, + "learning_rate": 1.5816e-05, + "loss": 0.2662, + "step": 3954 + }, + { + "epoch": 0.07912, + "grad_norm": 1.2354578971862793, + "learning_rate": 1.5824000000000003e-05, + "loss": 0.2542, + "step": 3956 + }, + { + "epoch": 0.07916, + "grad_norm": 1.1401523351669312, + "learning_rate": 1.5832e-05, + "loss": 0.2128, + "step": 3958 + }, + { + "epoch": 0.0792, + "grad_norm": 1.7208454608917236, + "learning_rate": 1.584e-05, + "loss": 0.3258, + "step": 3960 + }, + { + "epoch": 0.07924, + "grad_norm": 2.155067205429077, + "learning_rate": 1.5848e-05, + "loss": 0.2914, + "step": 3962 + }, + { + "epoch": 0.07928, + "grad_norm": 2.0777456760406494, + "learning_rate": 1.5856e-05, + "loss": 0.2694, + "step": 3964 + }, + { + "epoch": 0.07932, + "grad_norm": 1.918150544166565, + "learning_rate": 1.5864000000000003e-05, + "loss": 0.2692, + "step": 3966 + }, + { + "epoch": 0.07936, + "grad_norm": 1.9096335172653198, + "learning_rate": 1.5872e-05, + "loss": 0.2629, + "step": 3968 + }, + { + "epoch": 0.0794, + "grad_norm": 1.1397141218185425, + "learning_rate": 1.588e-05, + "loss": 0.1598, + "step": 3970 + }, + { + "epoch": 0.07944, + "grad_norm": 1.4240903854370117, + "learning_rate": 1.5888e-05, + "loss": 0.2235, + "step": 3972 + }, + { + "epoch": 0.07948, + "grad_norm": 1.016892433166504, + "learning_rate": 1.5896e-05, + "loss": 0.1445, + "step": 3974 + }, + { + "epoch": 0.07952, + "grad_norm": 1.5460447072982788, + "learning_rate": 1.5904000000000002e-05, + "loss": 0.226, + "step": 3976 + }, + { + "epoch": 0.07956, + "grad_norm": 1.5494537353515625, + "learning_rate": 1.5912e-05, + "loss": 0.2178, + "step": 3978 + }, + { + "epoch": 0.0796, + "grad_norm": 1.303169846534729, + "learning_rate": 1.5920000000000003e-05, + "loss": 0.1937, + "step": 3980 + }, + { + "epoch": 0.07964, + "grad_norm": 1.2136911153793335, + "learning_rate": 1.5928e-05, + "loss": 0.1842, + "step": 3982 + }, + { + "epoch": 0.07968, + "grad_norm": 0.8528866171836853, + "learning_rate": 1.5936e-05, + "loss": 0.1649, + "step": 3984 + }, + { + "epoch": 0.07972, + "grad_norm": 1.7117078304290771, + "learning_rate": 1.5944000000000002e-05, + "loss": 0.228, + "step": 3986 + }, + { + "epoch": 0.07976, + "grad_norm": 1.6860315799713135, + "learning_rate": 1.5952000000000002e-05, + "loss": 0.2506, + "step": 3988 + }, + { + "epoch": 0.0798, + "grad_norm": 1.0975041389465332, + "learning_rate": 1.5960000000000003e-05, + "loss": 0.1939, + "step": 3990 + }, + { + "epoch": 0.07984, + "grad_norm": 1.4516632556915283, + "learning_rate": 1.5968e-05, + "loss": 0.2428, + "step": 3992 + }, + { + "epoch": 0.07988, + "grad_norm": 0.721763551235199, + "learning_rate": 1.5976e-05, + "loss": 0.1405, + "step": 3994 + }, + { + "epoch": 0.07992, + "grad_norm": 0.9839435815811157, + "learning_rate": 1.5984e-05, + "loss": 0.2221, + "step": 3996 + }, + { + "epoch": 0.07996, + "grad_norm": 1.3853429555892944, + "learning_rate": 1.5992000000000002e-05, + "loss": 0.341, + "step": 3998 + }, + { + "epoch": 0.08, + "grad_norm": 1.297876000404358, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.2346, + "step": 4000 + }, + { + "epoch": 0.08004, + "grad_norm": 1.6260334253311157, + "learning_rate": 1.6008e-05, + "loss": 0.3732, + "step": 4002 + }, + { + "epoch": 0.08008, + "grad_norm": 0.8990416526794434, + "learning_rate": 1.6016e-05, + "loss": 0.1463, + "step": 4004 + }, + { + "epoch": 0.08012, + "grad_norm": 0.9454728364944458, + "learning_rate": 1.6024e-05, + "loss": 0.1367, + "step": 4006 + }, + { + "epoch": 0.08016, + "grad_norm": 1.0498206615447998, + "learning_rate": 1.6032e-05, + "loss": 0.1367, + "step": 4008 + }, + { + "epoch": 0.0802, + "grad_norm": 0.5386999249458313, + "learning_rate": 1.6040000000000002e-05, + "loss": 0.0762, + "step": 4010 + }, + { + "epoch": 0.08024, + "grad_norm": 0.5715454816818237, + "learning_rate": 1.6048000000000003e-05, + "loss": 0.2494, + "step": 4012 + }, + { + "epoch": 0.08028, + "grad_norm": 0.5154113173484802, + "learning_rate": 1.6056e-05, + "loss": 0.1897, + "step": 4014 + }, + { + "epoch": 0.08032, + "grad_norm": 0.6804853081703186, + "learning_rate": 1.6064e-05, + "loss": 0.1968, + "step": 4016 + }, + { + "epoch": 0.08036, + "grad_norm": 0.6206926107406616, + "learning_rate": 1.6072e-05, + "loss": 0.0811, + "step": 4018 + }, + { + "epoch": 0.0804, + "grad_norm": 1.6889179944992065, + "learning_rate": 1.6080000000000002e-05, + "loss": 0.1375, + "step": 4020 + }, + { + "epoch": 0.08044, + "grad_norm": 1.6105074882507324, + "learning_rate": 1.6088000000000002e-05, + "loss": 0.2234, + "step": 4022 + }, + { + "epoch": 0.08048, + "grad_norm": 3.281008243560791, + "learning_rate": 1.6096e-05, + "loss": 0.3787, + "step": 4024 + }, + { + "epoch": 0.08052, + "grad_norm": 3.059906244277954, + "learning_rate": 1.6104e-05, + "loss": 0.7164, + "step": 4026 + }, + { + "epoch": 0.08056, + "grad_norm": 2.301793098449707, + "learning_rate": 1.6112e-05, + "loss": 0.5345, + "step": 4028 + }, + { + "epoch": 0.0806, + "grad_norm": 0.9499800205230713, + "learning_rate": 1.612e-05, + "loss": 0.1988, + "step": 4030 + }, + { + "epoch": 0.08064, + "grad_norm": 1.354335069656372, + "learning_rate": 1.6128000000000002e-05, + "loss": 0.1956, + "step": 4032 + }, + { + "epoch": 0.08068, + "grad_norm": 1.2647520303726196, + "learning_rate": 1.6136e-05, + "loss": 0.2898, + "step": 4034 + }, + { + "epoch": 0.08072, + "grad_norm": 1.6400550603866577, + "learning_rate": 1.6144000000000003e-05, + "loss": 0.3134, + "step": 4036 + }, + { + "epoch": 0.08076, + "grad_norm": 1.3837157487869263, + "learning_rate": 1.6152e-05, + "loss": 0.2899, + "step": 4038 + }, + { + "epoch": 0.0808, + "grad_norm": 1.3280792236328125, + "learning_rate": 1.616e-05, + "loss": 0.2224, + "step": 4040 + }, + { + "epoch": 0.08084, + "grad_norm": 1.6728554964065552, + "learning_rate": 1.6168000000000002e-05, + "loss": 0.3133, + "step": 4042 + }, + { + "epoch": 0.08088, + "grad_norm": 1.3624482154846191, + "learning_rate": 1.6176e-05, + "loss": 0.1846, + "step": 4044 + }, + { + "epoch": 0.08092, + "grad_norm": 1.615198016166687, + "learning_rate": 1.6184000000000003e-05, + "loss": 0.277, + "step": 4046 + }, + { + "epoch": 0.08096, + "grad_norm": 1.3523716926574707, + "learning_rate": 1.6192e-05, + "loss": 0.243, + "step": 4048 + }, + { + "epoch": 0.081, + "grad_norm": 1.072264313697815, + "learning_rate": 1.62e-05, + "loss": 0.1942, + "step": 4050 + }, + { + "epoch": 0.08104, + "grad_norm": 0.9979829788208008, + "learning_rate": 1.6208e-05, + "loss": 0.2509, + "step": 4052 + }, + { + "epoch": 0.08108, + "grad_norm": 1.4977816343307495, + "learning_rate": 1.6216000000000002e-05, + "loss": 0.2353, + "step": 4054 + }, + { + "epoch": 0.08112, + "grad_norm": 0.566759467124939, + "learning_rate": 1.6224000000000003e-05, + "loss": 0.1811, + "step": 4056 + }, + { + "epoch": 0.08116, + "grad_norm": 0.8321506381034851, + "learning_rate": 1.6232e-05, + "loss": 0.152, + "step": 4058 + }, + { + "epoch": 0.0812, + "grad_norm": 1.5001295804977417, + "learning_rate": 1.6240000000000004e-05, + "loss": 0.2353, + "step": 4060 + }, + { + "epoch": 0.08124, + "grad_norm": 2.189617395401001, + "learning_rate": 1.6248e-05, + "loss": 0.4607, + "step": 4062 + }, + { + "epoch": 0.08128, + "grad_norm": 0.8352468013763428, + "learning_rate": 1.6256e-05, + "loss": 0.2351, + "step": 4064 + }, + { + "epoch": 0.08132, + "grad_norm": 1.4220492839813232, + "learning_rate": 1.6264000000000002e-05, + "loss": 0.2429, + "step": 4066 + }, + { + "epoch": 0.08136, + "grad_norm": 1.4101732969284058, + "learning_rate": 1.6272000000000003e-05, + "loss": 0.3536, + "step": 4068 + }, + { + "epoch": 0.0814, + "grad_norm": 1.1986353397369385, + "learning_rate": 1.628e-05, + "loss": 0.213, + "step": 4070 + }, + { + "epoch": 0.08144, + "grad_norm": 1.0389541387557983, + "learning_rate": 1.6288e-05, + "loss": 0.1943, + "step": 4072 + }, + { + "epoch": 0.08148, + "grad_norm": 1.7580273151397705, + "learning_rate": 1.6296e-05, + "loss": 0.273, + "step": 4074 + }, + { + "epoch": 0.08152, + "grad_norm": 1.930837631225586, + "learning_rate": 1.6304000000000002e-05, + "loss": 0.2552, + "step": 4076 + }, + { + "epoch": 0.08156, + "grad_norm": 1.9581447839736938, + "learning_rate": 1.6312000000000002e-05, + "loss": 0.2733, + "step": 4078 + }, + { + "epoch": 0.0816, + "grad_norm": 2.109630584716797, + "learning_rate": 1.632e-05, + "loss": 0.4243, + "step": 4080 + }, + { + "epoch": 0.08164, + "grad_norm": 0.8786592483520508, + "learning_rate": 1.6328e-05, + "loss": 0.1715, + "step": 4082 + }, + { + "epoch": 0.08168, + "grad_norm": 1.195846438407898, + "learning_rate": 1.6336e-05, + "loss": 0.1767, + "step": 4084 + }, + { + "epoch": 0.08172, + "grad_norm": 1.004248023033142, + "learning_rate": 1.6344e-05, + "loss": 0.218, + "step": 4086 + }, + { + "epoch": 0.08176, + "grad_norm": 1.6316512823104858, + "learning_rate": 1.6352000000000002e-05, + "loss": 0.2597, + "step": 4088 + }, + { + "epoch": 0.0818, + "grad_norm": 1.1349928379058838, + "learning_rate": 1.636e-05, + "loss": 0.1766, + "step": 4090 + }, + { + "epoch": 0.08184, + "grad_norm": 1.1557841300964355, + "learning_rate": 1.6368000000000003e-05, + "loss": 0.2133, + "step": 4092 + }, + { + "epoch": 0.08188, + "grad_norm": 1.3642332553863525, + "learning_rate": 1.6376e-05, + "loss": 0.2066, + "step": 4094 + }, + { + "epoch": 0.08192, + "grad_norm": 1.165927529335022, + "learning_rate": 1.6384e-05, + "loss": 0.235, + "step": 4096 + }, + { + "epoch": 0.08196, + "grad_norm": 1.4262726306915283, + "learning_rate": 1.6392e-05, + "loss": 0.2336, + "step": 4098 + }, + { + "epoch": 0.082, + "grad_norm": 0.8945055603981018, + "learning_rate": 1.64e-05, + "loss": 0.1621, + "step": 4100 + }, + { + "epoch": 0.08204, + "grad_norm": 1.528887152671814, + "learning_rate": 1.6408000000000003e-05, + "loss": 0.2237, + "step": 4102 + }, + { + "epoch": 0.08208, + "grad_norm": 1.0698590278625488, + "learning_rate": 1.6416e-05, + "loss": 0.1961, + "step": 4104 + }, + { + "epoch": 0.08212, + "grad_norm": 1.3983112573623657, + "learning_rate": 1.6424e-05, + "loss": 0.2041, + "step": 4106 + }, + { + "epoch": 0.08216, + "grad_norm": 1.3141310214996338, + "learning_rate": 1.6432e-05, + "loss": 0.2224, + "step": 4108 + }, + { + "epoch": 0.0822, + "grad_norm": 1.3590672016143799, + "learning_rate": 1.6440000000000002e-05, + "loss": 0.2122, + "step": 4110 + }, + { + "epoch": 0.08224, + "grad_norm": 1.2422988414764404, + "learning_rate": 1.6448000000000002e-05, + "loss": 0.269, + "step": 4112 + }, + { + "epoch": 0.08228, + "grad_norm": 1.2930524349212646, + "learning_rate": 1.6456e-05, + "loss": 0.212, + "step": 4114 + }, + { + "epoch": 0.08232, + "grad_norm": 1.57683527469635, + "learning_rate": 1.6464000000000004e-05, + "loss": 0.2263, + "step": 4116 + }, + { + "epoch": 0.08236, + "grad_norm": 1.5550800561904907, + "learning_rate": 1.6472e-05, + "loss": 0.3012, + "step": 4118 + }, + { + "epoch": 0.0824, + "grad_norm": 0.8720508813858032, + "learning_rate": 1.648e-05, + "loss": 0.2219, + "step": 4120 + }, + { + "epoch": 0.08244, + "grad_norm": 0.7520616054534912, + "learning_rate": 1.6488000000000002e-05, + "loss": 0.186, + "step": 4122 + }, + { + "epoch": 0.08248, + "grad_norm": 1.247675895690918, + "learning_rate": 1.6496e-05, + "loss": 0.2129, + "step": 4124 + }, + { + "epoch": 0.08252, + "grad_norm": 1.8016812801361084, + "learning_rate": 1.6504000000000003e-05, + "loss": 0.3403, + "step": 4126 + }, + { + "epoch": 0.08256, + "grad_norm": 0.9465324282646179, + "learning_rate": 1.6512e-05, + "loss": 0.1466, + "step": 4128 + }, + { + "epoch": 0.0826, + "grad_norm": 2.6802468299865723, + "learning_rate": 1.652e-05, + "loss": 0.3323, + "step": 4130 + }, + { + "epoch": 0.08264, + "grad_norm": 2.48830509185791, + "learning_rate": 1.6528e-05, + "loss": 0.3069, + "step": 4132 + }, + { + "epoch": 0.08268, + "grad_norm": 1.9035784006118774, + "learning_rate": 1.6536000000000002e-05, + "loss": 0.3392, + "step": 4134 + }, + { + "epoch": 0.08272, + "grad_norm": 0.8343334794044495, + "learning_rate": 1.6544000000000003e-05, + "loss": 0.1306, + "step": 4136 + }, + { + "epoch": 0.08276, + "grad_norm": 0.5575568675994873, + "learning_rate": 1.6552e-05, + "loss": 0.1104, + "step": 4138 + }, + { + "epoch": 0.0828, + "grad_norm": 0.5305629968643188, + "learning_rate": 1.656e-05, + "loss": 0.2694, + "step": 4140 + }, + { + "epoch": 0.08284, + "grad_norm": 1.1198642253875732, + "learning_rate": 1.6568e-05, + "loss": 0.1763, + "step": 4142 + }, + { + "epoch": 0.08288, + "grad_norm": 1.4667333364486694, + "learning_rate": 1.6576000000000002e-05, + "loss": 0.4057, + "step": 4144 + }, + { + "epoch": 0.08292, + "grad_norm": 1.5977859497070312, + "learning_rate": 1.6584000000000002e-05, + "loss": 0.29, + "step": 4146 + }, + { + "epoch": 0.08296, + "grad_norm": 0.8098706007003784, + "learning_rate": 1.6592000000000003e-05, + "loss": 0.1971, + "step": 4148 + }, + { + "epoch": 0.083, + "grad_norm": 1.8828513622283936, + "learning_rate": 1.66e-05, + "loss": 0.3428, + "step": 4150 + }, + { + "epoch": 0.08304, + "grad_norm": 1.1666909456253052, + "learning_rate": 1.6608e-05, + "loss": 0.2222, + "step": 4152 + }, + { + "epoch": 0.08308, + "grad_norm": 1.0503040552139282, + "learning_rate": 1.6616e-05, + "loss": 0.1843, + "step": 4154 + }, + { + "epoch": 0.08312, + "grad_norm": 1.702600121498108, + "learning_rate": 1.6624000000000002e-05, + "loss": 0.2559, + "step": 4156 + }, + { + "epoch": 0.08316, + "grad_norm": 1.4998480081558228, + "learning_rate": 1.6632000000000003e-05, + "loss": 0.277, + "step": 4158 + }, + { + "epoch": 0.0832, + "grad_norm": 1.0639543533325195, + "learning_rate": 1.664e-05, + "loss": 0.2027, + "step": 4160 + }, + { + "epoch": 0.08324, + "grad_norm": 1.064984679222107, + "learning_rate": 1.6648e-05, + "loss": 0.2043, + "step": 4162 + }, + { + "epoch": 0.08328, + "grad_norm": 1.420782446861267, + "learning_rate": 1.6656e-05, + "loss": 0.2348, + "step": 4164 + }, + { + "epoch": 0.08332, + "grad_norm": 1.0377767086029053, + "learning_rate": 1.6664000000000002e-05, + "loss": 0.1674, + "step": 4166 + }, + { + "epoch": 0.08336, + "grad_norm": 1.0835598707199097, + "learning_rate": 1.6672000000000002e-05, + "loss": 0.1755, + "step": 4168 + }, + { + "epoch": 0.0834, + "grad_norm": 0.7649460434913635, + "learning_rate": 1.668e-05, + "loss": 0.1463, + "step": 4170 + }, + { + "epoch": 0.08344, + "grad_norm": 2.295994281768799, + "learning_rate": 1.6688000000000004e-05, + "loss": 0.2998, + "step": 4172 + }, + { + "epoch": 0.08348, + "grad_norm": 1.965327501296997, + "learning_rate": 1.6696e-05, + "loss": 0.1926, + "step": 4174 + }, + { + "epoch": 0.08352, + "grad_norm": 0.846682071685791, + "learning_rate": 1.6704e-05, + "loss": 0.2287, + "step": 4176 + }, + { + "epoch": 0.08356, + "grad_norm": 1.5854357481002808, + "learning_rate": 1.6712000000000002e-05, + "loss": 0.2923, + "step": 4178 + }, + { + "epoch": 0.0836, + "grad_norm": 1.063429355621338, + "learning_rate": 1.672e-05, + "loss": 0.2839, + "step": 4180 + }, + { + "epoch": 0.08364, + "grad_norm": 1.2118322849273682, + "learning_rate": 1.6728000000000003e-05, + "loss": 0.171, + "step": 4182 + }, + { + "epoch": 0.08368, + "grad_norm": 1.0248758792877197, + "learning_rate": 1.6736e-05, + "loss": 0.1598, + "step": 4184 + }, + { + "epoch": 0.08372, + "grad_norm": 1.367485761642456, + "learning_rate": 1.6744e-05, + "loss": 0.1881, + "step": 4186 + }, + { + "epoch": 0.08376, + "grad_norm": 0.8414406776428223, + "learning_rate": 1.6752e-05, + "loss": 0.199, + "step": 4188 + }, + { + "epoch": 0.0838, + "grad_norm": 0.9399147629737854, + "learning_rate": 1.6760000000000002e-05, + "loss": 0.1598, + "step": 4190 + }, + { + "epoch": 0.08384, + "grad_norm": 1.1407897472381592, + "learning_rate": 1.6768000000000003e-05, + "loss": 0.2594, + "step": 4192 + }, + { + "epoch": 0.08388, + "grad_norm": 0.9355684518814087, + "learning_rate": 1.6776e-05, + "loss": 0.1938, + "step": 4194 + }, + { + "epoch": 0.08392, + "grad_norm": 0.675446629524231, + "learning_rate": 1.6784e-05, + "loss": 0.1057, + "step": 4196 + }, + { + "epoch": 0.08396, + "grad_norm": 1.189970850944519, + "learning_rate": 1.6792e-05, + "loss": 0.1936, + "step": 4198 + }, + { + "epoch": 0.084, + "grad_norm": 0.7415083646774292, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.1645, + "step": 4200 + }, + { + "epoch": 0.08404, + "grad_norm": 1.9519388675689697, + "learning_rate": 1.6808000000000002e-05, + "loss": 0.1616, + "step": 4202 + }, + { + "epoch": 0.08408, + "grad_norm": 1.3553575277328491, + "learning_rate": 1.6816e-05, + "loss": 0.1854, + "step": 4204 + }, + { + "epoch": 0.08412, + "grad_norm": 2.213923692703247, + "learning_rate": 1.6824e-05, + "loss": 0.4392, + "step": 4206 + }, + { + "epoch": 0.08416, + "grad_norm": 2.169499635696411, + "learning_rate": 1.6832e-05, + "loss": 0.4911, + "step": 4208 + }, + { + "epoch": 0.0842, + "grad_norm": 0.686635434627533, + "learning_rate": 1.684e-05, + "loss": 0.0803, + "step": 4210 + }, + { + "epoch": 0.08424, + "grad_norm": 0.44565555453300476, + "learning_rate": 1.6848000000000002e-05, + "loss": 0.2031, + "step": 4212 + }, + { + "epoch": 0.08428, + "grad_norm": 2.66469407081604, + "learning_rate": 1.6856000000000003e-05, + "loss": 0.3637, + "step": 4214 + }, + { + "epoch": 0.08432, + "grad_norm": 1.8395146131515503, + "learning_rate": 1.6864e-05, + "loss": 0.4515, + "step": 4216 + }, + { + "epoch": 0.08436, + "grad_norm": 0.7499691247940063, + "learning_rate": 1.6872e-05, + "loss": 0.1001, + "step": 4218 + }, + { + "epoch": 0.0844, + "grad_norm": 1.773316740989685, + "learning_rate": 1.688e-05, + "loss": 0.4239, + "step": 4220 + }, + { + "epoch": 0.08444, + "grad_norm": 0.8352281451225281, + "learning_rate": 1.6888e-05, + "loss": 0.1302, + "step": 4222 + }, + { + "epoch": 0.08448, + "grad_norm": 1.4688684940338135, + "learning_rate": 1.6896000000000002e-05, + "loss": 0.2425, + "step": 4224 + }, + { + "epoch": 0.08452, + "grad_norm": 0.95561683177948, + "learning_rate": 1.6904e-05, + "loss": 0.2381, + "step": 4226 + }, + { + "epoch": 0.08456, + "grad_norm": 1.1763356924057007, + "learning_rate": 1.6912000000000003e-05, + "loss": 0.2333, + "step": 4228 + }, + { + "epoch": 0.0846, + "grad_norm": 1.5498428344726562, + "learning_rate": 1.692e-05, + "loss": 0.2384, + "step": 4230 + }, + { + "epoch": 0.08464, + "grad_norm": 0.9053815603256226, + "learning_rate": 1.6928e-05, + "loss": 0.1531, + "step": 4232 + }, + { + "epoch": 0.08468, + "grad_norm": 0.8952521085739136, + "learning_rate": 1.6936000000000002e-05, + "loss": 0.1179, + "step": 4234 + }, + { + "epoch": 0.08472, + "grad_norm": 2.682142734527588, + "learning_rate": 1.6944e-05, + "loss": 0.299, + "step": 4236 + }, + { + "epoch": 0.08476, + "grad_norm": 0.44622063636779785, + "learning_rate": 1.6952000000000003e-05, + "loss": 0.3089, + "step": 4238 + }, + { + "epoch": 0.0848, + "grad_norm": 0.6047537326812744, + "learning_rate": 1.696e-05, + "loss": 0.077, + "step": 4240 + }, + { + "epoch": 0.08484, + "grad_norm": 2.5738253593444824, + "learning_rate": 1.6968e-05, + "loss": 0.5686, + "step": 4242 + }, + { + "epoch": 0.08488, + "grad_norm": 0.7998592853546143, + "learning_rate": 1.6976e-05, + "loss": 0.0702, + "step": 4244 + }, + { + "epoch": 0.08492, + "grad_norm": 0.5186586380004883, + "learning_rate": 1.6984000000000002e-05, + "loss": 0.2177, + "step": 4246 + }, + { + "epoch": 0.08496, + "grad_norm": 2.3388783931732178, + "learning_rate": 1.6992000000000003e-05, + "loss": 0.3812, + "step": 4248 + }, + { + "epoch": 0.085, + "grad_norm": 0.9229862093925476, + "learning_rate": 1.7e-05, + "loss": 0.2067, + "step": 4250 + }, + { + "epoch": 0.08504, + "grad_norm": 1.2101985216140747, + "learning_rate": 1.7008000000000004e-05, + "loss": 0.2028, + "step": 4252 + }, + { + "epoch": 0.08508, + "grad_norm": 0.584037184715271, + "learning_rate": 1.7016e-05, + "loss": 0.1394, + "step": 4254 + }, + { + "epoch": 0.08512, + "grad_norm": 1.3547852039337158, + "learning_rate": 1.7024e-05, + "loss": 0.2545, + "step": 4256 + }, + { + "epoch": 0.08516, + "grad_norm": 1.1576451063156128, + "learning_rate": 1.7032000000000002e-05, + "loss": 0.2333, + "step": 4258 + }, + { + "epoch": 0.0852, + "grad_norm": 1.2164900302886963, + "learning_rate": 1.704e-05, + "loss": 0.2667, + "step": 4260 + }, + { + "epoch": 0.08524, + "grad_norm": 1.0354036092758179, + "learning_rate": 1.7048000000000003e-05, + "loss": 0.1937, + "step": 4262 + }, + { + "epoch": 0.08528, + "grad_norm": 1.4561786651611328, + "learning_rate": 1.7056e-05, + "loss": 0.29, + "step": 4264 + }, + { + "epoch": 0.08532, + "grad_norm": 1.030469536781311, + "learning_rate": 1.7064e-05, + "loss": 0.1496, + "step": 4266 + }, + { + "epoch": 0.08536, + "grad_norm": 0.831220805644989, + "learning_rate": 1.7072000000000002e-05, + "loss": 0.2107, + "step": 4268 + }, + { + "epoch": 0.0854, + "grad_norm": 0.9362083673477173, + "learning_rate": 1.7080000000000002e-05, + "loss": 0.2149, + "step": 4270 + }, + { + "epoch": 0.08544, + "grad_norm": 0.7893744707107544, + "learning_rate": 1.7088000000000003e-05, + "loss": 0.1715, + "step": 4272 + }, + { + "epoch": 0.08548, + "grad_norm": 1.480775237083435, + "learning_rate": 1.7096e-05, + "loss": 0.2162, + "step": 4274 + }, + { + "epoch": 0.08552, + "grad_norm": 0.8567900061607361, + "learning_rate": 1.7104e-05, + "loss": 0.1601, + "step": 4276 + }, + { + "epoch": 0.08556, + "grad_norm": 1.7418828010559082, + "learning_rate": 1.7112e-05, + "loss": 0.3428, + "step": 4278 + }, + { + "epoch": 0.0856, + "grad_norm": 0.642085075378418, + "learning_rate": 1.7120000000000002e-05, + "loss": 0.1197, + "step": 4280 + }, + { + "epoch": 0.08564, + "grad_norm": 1.0054939985275269, + "learning_rate": 1.7128000000000003e-05, + "loss": 0.251, + "step": 4282 + }, + { + "epoch": 0.08568, + "grad_norm": 1.627582311630249, + "learning_rate": 1.7136e-05, + "loss": 0.2105, + "step": 4284 + }, + { + "epoch": 0.08572, + "grad_norm": 0.48165157437324524, + "learning_rate": 1.7144e-05, + "loss": 0.0648, + "step": 4286 + }, + { + "epoch": 0.08576, + "grad_norm": 0.5498226881027222, + "learning_rate": 1.7152e-05, + "loss": 0.1103, + "step": 4288 + }, + { + "epoch": 0.0858, + "grad_norm": 1.7135847806930542, + "learning_rate": 1.7160000000000002e-05, + "loss": 0.3266, + "step": 4290 + }, + { + "epoch": 0.08584, + "grad_norm": 1.0081664323806763, + "learning_rate": 1.7168000000000002e-05, + "loss": 0.1516, + "step": 4292 + }, + { + "epoch": 0.08588, + "grad_norm": 2.339938163757324, + "learning_rate": 1.7176000000000003e-05, + "loss": 0.3891, + "step": 4294 + }, + { + "epoch": 0.08592, + "grad_norm": 1.0773534774780273, + "learning_rate": 1.7184e-05, + "loss": 0.2507, + "step": 4296 + }, + { + "epoch": 0.08596, + "grad_norm": 1.5735265016555786, + "learning_rate": 1.7192e-05, + "loss": 0.3265, + "step": 4298 + }, + { + "epoch": 0.086, + "grad_norm": 1.3066484928131104, + "learning_rate": 1.72e-05, + "loss": 0.1958, + "step": 4300 + }, + { + "epoch": 0.08604, + "grad_norm": 1.475940227508545, + "learning_rate": 1.7208000000000002e-05, + "loss": 0.3464, + "step": 4302 + }, + { + "epoch": 0.08608, + "grad_norm": 1.461625099182129, + "learning_rate": 1.7216000000000003e-05, + "loss": 0.2536, + "step": 4304 + }, + { + "epoch": 0.08612, + "grad_norm": 1.298525094985962, + "learning_rate": 1.7224e-05, + "loss": 0.2442, + "step": 4306 + }, + { + "epoch": 0.08616, + "grad_norm": 0.8367576003074646, + "learning_rate": 1.7232000000000004e-05, + "loss": 0.1855, + "step": 4308 + }, + { + "epoch": 0.0862, + "grad_norm": 1.4160149097442627, + "learning_rate": 1.724e-05, + "loss": 0.2146, + "step": 4310 + }, + { + "epoch": 0.08624, + "grad_norm": 0.8427443504333496, + "learning_rate": 1.7248e-05, + "loss": 0.1552, + "step": 4312 + }, + { + "epoch": 0.08628, + "grad_norm": 1.1970607042312622, + "learning_rate": 1.7256000000000002e-05, + "loss": 0.1812, + "step": 4314 + }, + { + "epoch": 0.08632, + "grad_norm": 1.0911818742752075, + "learning_rate": 1.7264e-05, + "loss": 0.1675, + "step": 4316 + }, + { + "epoch": 0.08636, + "grad_norm": 0.8779245018959045, + "learning_rate": 1.7272000000000003e-05, + "loss": 0.1528, + "step": 4318 + }, + { + "epoch": 0.0864, + "grad_norm": 1.6643534898757935, + "learning_rate": 1.728e-05, + "loss": 0.3011, + "step": 4320 + }, + { + "epoch": 0.08644, + "grad_norm": 1.0129690170288086, + "learning_rate": 1.7288e-05, + "loss": 0.1673, + "step": 4322 + }, + { + "epoch": 0.08648, + "grad_norm": 1.1605901718139648, + "learning_rate": 1.7296000000000002e-05, + "loss": 0.1616, + "step": 4324 + }, + { + "epoch": 0.08652, + "grad_norm": 1.5743595361709595, + "learning_rate": 1.7304000000000002e-05, + "loss": 0.3265, + "step": 4326 + }, + { + "epoch": 0.08656, + "grad_norm": 1.628898024559021, + "learning_rate": 1.7312000000000003e-05, + "loss": 0.3823, + "step": 4328 + }, + { + "epoch": 0.0866, + "grad_norm": 1.5695178508758545, + "learning_rate": 1.732e-05, + "loss": 0.441, + "step": 4330 + }, + { + "epoch": 0.08664, + "grad_norm": 1.0594208240509033, + "learning_rate": 1.7328e-05, + "loss": 0.2728, + "step": 4332 + }, + { + "epoch": 0.08668, + "grad_norm": 0.9805479049682617, + "learning_rate": 1.7336e-05, + "loss": 0.1957, + "step": 4334 + }, + { + "epoch": 0.08672, + "grad_norm": 1.6817855834960938, + "learning_rate": 1.7344000000000002e-05, + "loss": 0.2544, + "step": 4336 + }, + { + "epoch": 0.08676, + "grad_norm": 0.7758905291557312, + "learning_rate": 1.7352000000000003e-05, + "loss": 0.2288, + "step": 4338 + }, + { + "epoch": 0.0868, + "grad_norm": 2.0628836154937744, + "learning_rate": 1.736e-05, + "loss": 0.4388, + "step": 4340 + }, + { + "epoch": 0.08684, + "grad_norm": 1.9717823266983032, + "learning_rate": 1.7368e-05, + "loss": 0.284, + "step": 4342 + }, + { + "epoch": 0.08688, + "grad_norm": 0.9215530157089233, + "learning_rate": 1.7376e-05, + "loss": 0.1447, + "step": 4344 + }, + { + "epoch": 0.08692, + "grad_norm": 1.435223937034607, + "learning_rate": 1.7384e-05, + "loss": 0.1883, + "step": 4346 + }, + { + "epoch": 0.08696, + "grad_norm": 2.0097944736480713, + "learning_rate": 1.7392000000000002e-05, + "loss": 0.3285, + "step": 4348 + }, + { + "epoch": 0.087, + "grad_norm": 1.0588337182998657, + "learning_rate": 1.7400000000000003e-05, + "loss": 0.1764, + "step": 4350 + }, + { + "epoch": 0.08704, + "grad_norm": 1.370244026184082, + "learning_rate": 1.7408e-05, + "loss": 0.2795, + "step": 4352 + }, + { + "epoch": 0.08708, + "grad_norm": 0.9665915966033936, + "learning_rate": 1.7416e-05, + "loss": 0.1677, + "step": 4354 + }, + { + "epoch": 0.08712, + "grad_norm": 0.7844148874282837, + "learning_rate": 1.7424e-05, + "loss": 0.1969, + "step": 4356 + }, + { + "epoch": 0.08716, + "grad_norm": 0.936467170715332, + "learning_rate": 1.7432000000000002e-05, + "loss": 0.2483, + "step": 4358 + }, + { + "epoch": 0.0872, + "grad_norm": 1.7584166526794434, + "learning_rate": 1.7440000000000002e-05, + "loss": 0.2823, + "step": 4360 + }, + { + "epoch": 0.08724, + "grad_norm": 1.2767930030822754, + "learning_rate": 1.7448e-05, + "loss": 0.1782, + "step": 4362 + }, + { + "epoch": 0.08728, + "grad_norm": 1.1067506074905396, + "learning_rate": 1.7456e-05, + "loss": 0.1592, + "step": 4364 + }, + { + "epoch": 0.08732, + "grad_norm": 0.9258239269256592, + "learning_rate": 1.7464e-05, + "loss": 0.2622, + "step": 4366 + }, + { + "epoch": 0.08736, + "grad_norm": 1.6404485702514648, + "learning_rate": 1.7472e-05, + "loss": 0.2425, + "step": 4368 + }, + { + "epoch": 0.0874, + "grad_norm": 1.5852795839309692, + "learning_rate": 1.7480000000000002e-05, + "loss": 0.1989, + "step": 4370 + }, + { + "epoch": 0.08744, + "grad_norm": 1.0172072649002075, + "learning_rate": 1.7488e-05, + "loss": 0.2558, + "step": 4372 + }, + { + "epoch": 0.08748, + "grad_norm": 0.9906463027000427, + "learning_rate": 1.7496000000000003e-05, + "loss": 0.2224, + "step": 4374 + }, + { + "epoch": 0.08752, + "grad_norm": 1.3114514350891113, + "learning_rate": 1.7504e-05, + "loss": 0.1958, + "step": 4376 + }, + { + "epoch": 0.08756, + "grad_norm": 1.6084988117218018, + "learning_rate": 1.7512e-05, + "loss": 0.2381, + "step": 4378 + }, + { + "epoch": 0.0876, + "grad_norm": 1.6375129222869873, + "learning_rate": 1.752e-05, + "loss": 0.2381, + "step": 4380 + }, + { + "epoch": 0.08764, + "grad_norm": 1.418808102607727, + "learning_rate": 1.7528e-05, + "loss": 0.2562, + "step": 4382 + }, + { + "epoch": 0.08768, + "grad_norm": 1.6183946132659912, + "learning_rate": 1.7536000000000003e-05, + "loss": 0.1695, + "step": 4384 + }, + { + "epoch": 0.08772, + "grad_norm": 0.5639644861221313, + "learning_rate": 1.7544e-05, + "loss": 0.1689, + "step": 4386 + }, + { + "epoch": 0.08776, + "grad_norm": 0.46710291504859924, + "learning_rate": 1.7552e-05, + "loss": 0.0964, + "step": 4388 + }, + { + "epoch": 0.0878, + "grad_norm": 2.123600959777832, + "learning_rate": 1.756e-05, + "loss": 0.1846, + "step": 4390 + }, + { + "epoch": 0.08784, + "grad_norm": 0.9710472822189331, + "learning_rate": 1.7568000000000002e-05, + "loss": 0.1179, + "step": 4392 + }, + { + "epoch": 0.08788, + "grad_norm": 3.039689779281616, + "learning_rate": 1.7576000000000002e-05, + "loss": 0.5269, + "step": 4394 + }, + { + "epoch": 0.08792, + "grad_norm": 2.950587511062622, + "learning_rate": 1.7584e-05, + "loss": 0.4872, + "step": 4396 + }, + { + "epoch": 0.08796, + "grad_norm": 1.0085035562515259, + "learning_rate": 1.7592000000000004e-05, + "loss": 0.1881, + "step": 4398 + }, + { + "epoch": 0.088, + "grad_norm": 1.5431442260742188, + "learning_rate": 1.76e-05, + "loss": 0.2563, + "step": 4400 + }, + { + "epoch": 0.08804, + "grad_norm": 0.808261513710022, + "learning_rate": 1.7608e-05, + "loss": 0.1617, + "step": 4402 + }, + { + "epoch": 0.08808, + "grad_norm": 0.9054556488990784, + "learning_rate": 1.7616000000000002e-05, + "loss": 0.137, + "step": 4404 + }, + { + "epoch": 0.08812, + "grad_norm": 2.1089634895324707, + "learning_rate": 1.7624000000000003e-05, + "loss": 0.316, + "step": 4406 + }, + { + "epoch": 0.08816, + "grad_norm": 0.6413194537162781, + "learning_rate": 1.7632000000000003e-05, + "loss": 0.2373, + "step": 4408 + }, + { + "epoch": 0.0882, + "grad_norm": 0.5892408490180969, + "learning_rate": 1.764e-05, + "loss": 0.1694, + "step": 4410 + }, + { + "epoch": 0.08824, + "grad_norm": 1.4100743532180786, + "learning_rate": 1.7648e-05, + "loss": 0.1647, + "step": 4412 + }, + { + "epoch": 0.08828, + "grad_norm": 1.1095575094223022, + "learning_rate": 1.7656000000000002e-05, + "loss": 0.3305, + "step": 4414 + }, + { + "epoch": 0.08832, + "grad_norm": 1.8050743341445923, + "learning_rate": 1.7664000000000002e-05, + "loss": 0.4094, + "step": 4416 + }, + { + "epoch": 0.08836, + "grad_norm": 1.2207326889038086, + "learning_rate": 1.7672000000000003e-05, + "loss": 0.16, + "step": 4418 + }, + { + "epoch": 0.0884, + "grad_norm": 1.3596608638763428, + "learning_rate": 1.768e-05, + "loss": 0.2028, + "step": 4420 + }, + { + "epoch": 0.08844, + "grad_norm": 1.2375513315200806, + "learning_rate": 1.7688e-05, + "loss": 0.1846, + "step": 4422 + }, + { + "epoch": 0.08848, + "grad_norm": 1.7320598363876343, + "learning_rate": 1.7696e-05, + "loss": 0.2067, + "step": 4424 + }, + { + "epoch": 0.08852, + "grad_norm": 1.6680903434753418, + "learning_rate": 1.7704000000000002e-05, + "loss": 0.2653, + "step": 4426 + }, + { + "epoch": 0.08856, + "grad_norm": 1.9018175601959229, + "learning_rate": 1.7712000000000003e-05, + "loss": 0.2545, + "step": 4428 + }, + { + "epoch": 0.0886, + "grad_norm": 2.2456703186035156, + "learning_rate": 1.7720000000000003e-05, + "loss": 0.2551, + "step": 4430 + }, + { + "epoch": 0.08864, + "grad_norm": 1.2704460620880127, + "learning_rate": 1.7728e-05, + "loss": 0.1447, + "step": 4432 + }, + { + "epoch": 0.08868, + "grad_norm": 1.2033799886703491, + "learning_rate": 1.7736e-05, + "loss": 0.2103, + "step": 4434 + }, + { + "epoch": 0.08872, + "grad_norm": 1.379406452178955, + "learning_rate": 1.7744e-05, + "loss": 0.4428, + "step": 4436 + }, + { + "epoch": 0.08876, + "grad_norm": 1.6205936670303345, + "learning_rate": 1.7752e-05, + "loss": 0.2665, + "step": 4438 + }, + { + "epoch": 0.0888, + "grad_norm": 2.2322463989257812, + "learning_rate": 1.7760000000000003e-05, + "loss": 0.343, + "step": 4440 + }, + { + "epoch": 0.08884, + "grad_norm": 1.5721111297607422, + "learning_rate": 1.7768e-05, + "loss": 0.2148, + "step": 4442 + }, + { + "epoch": 0.08888, + "grad_norm": 1.2986080646514893, + "learning_rate": 1.7776e-05, + "loss": 0.2562, + "step": 4444 + }, + { + "epoch": 0.08892, + "grad_norm": 1.5863697528839111, + "learning_rate": 1.7784e-05, + "loss": 0.2104, + "step": 4446 + }, + { + "epoch": 0.08896, + "grad_norm": 1.5830750465393066, + "learning_rate": 1.7792000000000002e-05, + "loss": 0.2349, + "step": 4448 + }, + { + "epoch": 0.089, + "grad_norm": 1.2278224229812622, + "learning_rate": 1.7800000000000002e-05, + "loss": 0.2348, + "step": 4450 + }, + { + "epoch": 0.08904, + "grad_norm": 1.2850112915039062, + "learning_rate": 1.7808e-05, + "loss": 0.2597, + "step": 4452 + }, + { + "epoch": 0.08908, + "grad_norm": 1.2445077896118164, + "learning_rate": 1.7816000000000004e-05, + "loss": 0.1931, + "step": 4454 + }, + { + "epoch": 0.08912, + "grad_norm": 1.3382349014282227, + "learning_rate": 1.7824e-05, + "loss": 0.2223, + "step": 4456 + }, + { + "epoch": 0.08916, + "grad_norm": 1.4373257160186768, + "learning_rate": 1.7832e-05, + "loss": 0.2546, + "step": 4458 + }, + { + "epoch": 0.0892, + "grad_norm": 1.421557903289795, + "learning_rate": 1.7840000000000002e-05, + "loss": 0.3063, + "step": 4460 + }, + { + "epoch": 0.08924, + "grad_norm": 1.450481653213501, + "learning_rate": 1.7848e-05, + "loss": 0.2224, + "step": 4462 + }, + { + "epoch": 0.08928, + "grad_norm": 1.3517749309539795, + "learning_rate": 1.7856000000000003e-05, + "loss": 0.1781, + "step": 4464 + }, + { + "epoch": 0.08932, + "grad_norm": 1.3496521711349487, + "learning_rate": 1.7864e-05, + "loss": 0.2543, + "step": 4466 + }, + { + "epoch": 0.08936, + "grad_norm": 1.1773627996444702, + "learning_rate": 1.7872e-05, + "loss": 0.2237, + "step": 4468 + }, + { + "epoch": 0.0894, + "grad_norm": 1.4544155597686768, + "learning_rate": 1.788e-05, + "loss": 0.2887, + "step": 4470 + }, + { + "epoch": 0.08944, + "grad_norm": 1.0748131275177002, + "learning_rate": 1.7888000000000002e-05, + "loss": 0.2068, + "step": 4472 + }, + { + "epoch": 0.08948, + "grad_norm": 0.8365676999092102, + "learning_rate": 1.7896000000000003e-05, + "loss": 0.1797, + "step": 4474 + }, + { + "epoch": 0.08952, + "grad_norm": 2.4135332107543945, + "learning_rate": 1.7904e-05, + "loss": 0.4305, + "step": 4476 + }, + { + "epoch": 0.08956, + "grad_norm": 1.6178016662597656, + "learning_rate": 1.7912e-05, + "loss": 0.3015, + "step": 4478 + }, + { + "epoch": 0.0896, + "grad_norm": 1.5927610397338867, + "learning_rate": 1.792e-05, + "loss": 0.2237, + "step": 4480 + }, + { + "epoch": 0.08964, + "grad_norm": 1.3702679872512817, + "learning_rate": 1.7928000000000002e-05, + "loss": 0.2129, + "step": 4482 + }, + { + "epoch": 0.08968, + "grad_norm": 1.1979732513427734, + "learning_rate": 1.7936000000000002e-05, + "loss": 0.2025, + "step": 4484 + }, + { + "epoch": 0.08972, + "grad_norm": 1.1087596416473389, + "learning_rate": 1.7944000000000003e-05, + "loss": 0.2352, + "step": 4486 + }, + { + "epoch": 0.08976, + "grad_norm": 0.9477411508560181, + "learning_rate": 1.7952e-05, + "loss": 0.2381, + "step": 4488 + }, + { + "epoch": 0.0898, + "grad_norm": 1.1389176845550537, + "learning_rate": 1.796e-05, + "loss": 0.2778, + "step": 4490 + }, + { + "epoch": 0.08984, + "grad_norm": 0.6004263758659363, + "learning_rate": 1.7968e-05, + "loss": 0.1076, + "step": 4492 + }, + { + "epoch": 0.08988, + "grad_norm": 0.7110564708709717, + "learning_rate": 1.7976000000000002e-05, + "loss": 0.2982, + "step": 4494 + }, + { + "epoch": 0.08992, + "grad_norm": 0.7500723004341125, + "learning_rate": 1.7984000000000003e-05, + "loss": 0.2722, + "step": 4496 + }, + { + "epoch": 0.08996, + "grad_norm": 0.8860373497009277, + "learning_rate": 1.7992e-05, + "loss": 0.2695, + "step": 4498 + }, + { + "epoch": 0.09, + "grad_norm": 0.7189562320709229, + "learning_rate": 1.8e-05, + "loss": 0.229, + "step": 4500 + }, + { + "epoch": 0.09004, + "grad_norm": 1.499327540397644, + "learning_rate": 1.8008e-05, + "loss": 0.1969, + "step": 4502 + }, + { + "epoch": 0.09008, + "grad_norm": 0.8557275533676147, + "learning_rate": 1.8016e-05, + "loss": 0.2765, + "step": 4504 + }, + { + "epoch": 0.09012, + "grad_norm": 1.419849157333374, + "learning_rate": 1.8024000000000002e-05, + "loss": 0.3316, + "step": 4506 + }, + { + "epoch": 0.09016, + "grad_norm": 1.149658203125, + "learning_rate": 1.8032e-05, + "loss": 0.213, + "step": 4508 + }, + { + "epoch": 0.0902, + "grad_norm": 1.3715085983276367, + "learning_rate": 1.8040000000000003e-05, + "loss": 0.1813, + "step": 4510 + }, + { + "epoch": 0.09024, + "grad_norm": 1.889169454574585, + "learning_rate": 1.8048e-05, + "loss": 0.3148, + "step": 4512 + }, + { + "epoch": 0.09028, + "grad_norm": 1.3164408206939697, + "learning_rate": 1.8056e-05, + "loss": 0.1959, + "step": 4514 + }, + { + "epoch": 0.09032, + "grad_norm": 1.2127888202667236, + "learning_rate": 1.8064000000000002e-05, + "loss": 0.2446, + "step": 4516 + }, + { + "epoch": 0.09036, + "grad_norm": 1.2533046007156372, + "learning_rate": 1.8072e-05, + "loss": 0.2546, + "step": 4518 + }, + { + "epoch": 0.0904, + "grad_norm": 1.2617857456207275, + "learning_rate": 1.8080000000000003e-05, + "loss": 0.2565, + "step": 4520 + }, + { + "epoch": 0.09044, + "grad_norm": 0.9131098985671997, + "learning_rate": 1.8088e-05, + "loss": 0.1451, + "step": 4522 + }, + { + "epoch": 0.09048, + "grad_norm": 1.0563411712646484, + "learning_rate": 1.8096e-05, + "loss": 0.1199, + "step": 4524 + }, + { + "epoch": 0.09052, + "grad_norm": 0.9294727444648743, + "learning_rate": 1.8104e-05, + "loss": 0.2352, + "step": 4526 + }, + { + "epoch": 0.09056, + "grad_norm": 2.3622560501098633, + "learning_rate": 1.8112000000000002e-05, + "loss": 0.4721, + "step": 4528 + }, + { + "epoch": 0.0906, + "grad_norm": 0.9316186308860779, + "learning_rate": 1.8120000000000003e-05, + "loss": 0.1015, + "step": 4530 + }, + { + "epoch": 0.09064, + "grad_norm": 0.8099982738494873, + "learning_rate": 1.8128e-05, + "loss": 0.2431, + "step": 4532 + }, + { + "epoch": 0.09068, + "grad_norm": 0.8638507723808289, + "learning_rate": 1.8136000000000004e-05, + "loss": 0.2784, + "step": 4534 + }, + { + "epoch": 0.09072, + "grad_norm": 1.2691715955734253, + "learning_rate": 1.8144e-05, + "loss": 0.2925, + "step": 4536 + }, + { + "epoch": 0.09076, + "grad_norm": 1.27615487575531, + "learning_rate": 1.8152000000000002e-05, + "loss": 0.1887, + "step": 4538 + }, + { + "epoch": 0.0908, + "grad_norm": 1.080847144126892, + "learning_rate": 1.8160000000000002e-05, + "loss": 0.1688, + "step": 4540 + }, + { + "epoch": 0.09084, + "grad_norm": 1.0306276082992554, + "learning_rate": 1.8168e-05, + "loss": 0.2028, + "step": 4542 + }, + { + "epoch": 0.09088, + "grad_norm": 0.9203752279281616, + "learning_rate": 1.8176000000000004e-05, + "loss": 0.2384, + "step": 4544 + }, + { + "epoch": 0.09092, + "grad_norm": 0.7134057283401489, + "learning_rate": 1.8184e-05, + "loss": 0.2093, + "step": 4546 + }, + { + "epoch": 0.09096, + "grad_norm": 0.8442509174346924, + "learning_rate": 1.8192e-05, + "loss": 0.2104, + "step": 4548 + }, + { + "epoch": 0.091, + "grad_norm": 1.171246886253357, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.1961, + "step": 4550 + }, + { + "epoch": 0.09104, + "grad_norm": 1.0822831392288208, + "learning_rate": 1.8208000000000003e-05, + "loss": 0.1846, + "step": 4552 + }, + { + "epoch": 0.09108, + "grad_norm": 1.1512311697006226, + "learning_rate": 1.8216000000000003e-05, + "loss": 0.2222, + "step": 4554 + }, + { + "epoch": 0.09112, + "grad_norm": 1.5204969644546509, + "learning_rate": 1.8224e-05, + "loss": 0.2427, + "step": 4556 + }, + { + "epoch": 0.09116, + "grad_norm": 1.2680379152297974, + "learning_rate": 1.8232e-05, + "loss": 0.1882, + "step": 4558 + }, + { + "epoch": 0.0912, + "grad_norm": 0.8925144672393799, + "learning_rate": 1.824e-05, + "loss": 0.1711, + "step": 4560 + }, + { + "epoch": 0.09124, + "grad_norm": 0.725329577922821, + "learning_rate": 1.8248000000000002e-05, + "loss": 0.1325, + "step": 4562 + }, + { + "epoch": 0.09128, + "grad_norm": 1.0219533443450928, + "learning_rate": 1.8256e-05, + "loss": 0.2039, + "step": 4564 + }, + { + "epoch": 0.09132, + "grad_norm": 1.2272913455963135, + "learning_rate": 1.8264000000000003e-05, + "loss": 0.1686, + "step": 4566 + }, + { + "epoch": 0.09136, + "grad_norm": 2.052518606185913, + "learning_rate": 1.8272e-05, + "loss": 0.4434, + "step": 4568 + }, + { + "epoch": 0.0914, + "grad_norm": 1.269580364227295, + "learning_rate": 1.828e-05, + "loss": 0.2121, + "step": 4570 + }, + { + "epoch": 0.09144, + "grad_norm": 0.7107111215591431, + "learning_rate": 1.8288000000000002e-05, + "loss": 0.1325, + "step": 4572 + }, + { + "epoch": 0.09148, + "grad_norm": 1.5031596422195435, + "learning_rate": 1.8296e-05, + "loss": 0.1858, + "step": 4574 + }, + { + "epoch": 0.09152, + "grad_norm": 0.6560961604118347, + "learning_rate": 1.8304000000000003e-05, + "loss": 0.1439, + "step": 4576 + }, + { + "epoch": 0.09156, + "grad_norm": 1.6751948595046997, + "learning_rate": 1.8312e-05, + "loss": 0.2348, + "step": 4578 + }, + { + "epoch": 0.0916, + "grad_norm": 0.6721665859222412, + "learning_rate": 1.832e-05, + "loss": 0.1268, + "step": 4580 + }, + { + "epoch": 0.09164, + "grad_norm": 0.8237615823745728, + "learning_rate": 1.8328e-05, + "loss": 0.1444, + "step": 4582 + }, + { + "epoch": 0.09168, + "grad_norm": 0.6166964769363403, + "learning_rate": 1.8336000000000002e-05, + "loss": 0.122, + "step": 4584 + }, + { + "epoch": 0.09172, + "grad_norm": 2.4206552505493164, + "learning_rate": 1.8344000000000003e-05, + "loss": 0.331, + "step": 4586 + }, + { + "epoch": 0.09176, + "grad_norm": 1.760188102722168, + "learning_rate": 1.8352e-05, + "loss": 0.1571, + "step": 4588 + }, + { + "epoch": 0.0918, + "grad_norm": 0.5067967772483826, + "learning_rate": 1.8360000000000004e-05, + "loss": 0.0606, + "step": 4590 + }, + { + "epoch": 0.09184, + "grad_norm": 1.2897242307662964, + "learning_rate": 1.8368e-05, + "loss": 0.1444, + "step": 4592 + }, + { + "epoch": 0.09188, + "grad_norm": 2.2535150051116943, + "learning_rate": 1.8376e-05, + "loss": 0.3388, + "step": 4594 + }, + { + "epoch": 0.09192, + "grad_norm": 1.7057703733444214, + "learning_rate": 1.8384000000000002e-05, + "loss": 0.1916, + "step": 4596 + }, + { + "epoch": 0.09196, + "grad_norm": 1.8800257444381714, + "learning_rate": 1.8392e-05, + "loss": 0.1852, + "step": 4598 + }, + { + "epoch": 0.092, + "grad_norm": 2.1078600883483887, + "learning_rate": 1.8400000000000003e-05, + "loss": 0.2295, + "step": 4600 + }, + { + "epoch": 0.09204, + "grad_norm": 1.470872163772583, + "learning_rate": 1.8408e-05, + "loss": 0.2023, + "step": 4602 + }, + { + "epoch": 0.09208, + "grad_norm": 0.9355194568634033, + "learning_rate": 1.8416e-05, + "loss": 0.3424, + "step": 4604 + }, + { + "epoch": 0.09212, + "grad_norm": 0.8141385912895203, + "learning_rate": 1.8424000000000002e-05, + "loss": 0.2027, + "step": 4606 + }, + { + "epoch": 0.09216, + "grad_norm": 3.094829559326172, + "learning_rate": 1.8432000000000002e-05, + "loss": 0.6975, + "step": 4608 + }, + { + "epoch": 0.0922, + "grad_norm": 1.0747601985931396, + "learning_rate": 1.8440000000000003e-05, + "loss": 0.4066, + "step": 4610 + }, + { + "epoch": 0.09224, + "grad_norm": 1.0440925359725952, + "learning_rate": 1.8448e-05, + "loss": 0.192, + "step": 4612 + }, + { + "epoch": 0.09228, + "grad_norm": 1.1946139335632324, + "learning_rate": 1.8456e-05, + "loss": 0.296, + "step": 4614 + }, + { + "epoch": 0.09232, + "grad_norm": 1.1192232370376587, + "learning_rate": 1.8464e-05, + "loss": 0.2026, + "step": 4616 + }, + { + "epoch": 0.09236, + "grad_norm": 0.9420962333679199, + "learning_rate": 1.8472000000000002e-05, + "loss": 0.1673, + "step": 4618 + }, + { + "epoch": 0.0924, + "grad_norm": 0.47673070430755615, + "learning_rate": 1.8480000000000003e-05, + "loss": 0.0919, + "step": 4620 + }, + { + "epoch": 0.09244, + "grad_norm": 2.2933661937713623, + "learning_rate": 1.8488e-05, + "loss": 0.4695, + "step": 4622 + }, + { + "epoch": 0.09248, + "grad_norm": 2.3020005226135254, + "learning_rate": 1.8496e-05, + "loss": 0.4551, + "step": 4624 + }, + { + "epoch": 0.09252, + "grad_norm": 1.5819566249847412, + "learning_rate": 1.8504e-05, + "loss": 0.3391, + "step": 4626 + }, + { + "epoch": 0.09256, + "grad_norm": 1.403106451034546, + "learning_rate": 1.8512e-05, + "loss": 0.3734, + "step": 4628 + }, + { + "epoch": 0.0926, + "grad_norm": 1.3564863204956055, + "learning_rate": 1.8520000000000002e-05, + "loss": 0.2893, + "step": 4630 + }, + { + "epoch": 0.09264, + "grad_norm": 1.3579754829406738, + "learning_rate": 1.8528000000000003e-05, + "loss": 0.2894, + "step": 4632 + }, + { + "epoch": 0.09268, + "grad_norm": 1.0233476161956787, + "learning_rate": 1.8536e-05, + "loss": 0.2042, + "step": 4634 + }, + { + "epoch": 0.09272, + "grad_norm": 1.1818794012069702, + "learning_rate": 1.8544e-05, + "loss": 0.2443, + "step": 4636 + }, + { + "epoch": 0.09276, + "grad_norm": 0.7307437658309937, + "learning_rate": 1.8552e-05, + "loss": 0.2157, + "step": 4638 + }, + { + "epoch": 0.0928, + "grad_norm": 1.6520957946777344, + "learning_rate": 1.8560000000000002e-05, + "loss": 0.2568, + "step": 4640 + }, + { + "epoch": 0.09284, + "grad_norm": 1.6611372232437134, + "learning_rate": 1.8568000000000002e-05, + "loss": 0.3666, + "step": 4642 + }, + { + "epoch": 0.09288, + "grad_norm": 1.4077792167663574, + "learning_rate": 1.8576e-05, + "loss": 0.2898, + "step": 4644 + }, + { + "epoch": 0.09292, + "grad_norm": 0.8882556557655334, + "learning_rate": 1.8584000000000004e-05, + "loss": 0.2428, + "step": 4646 + }, + { + "epoch": 0.09296, + "grad_norm": 0.9847394227981567, + "learning_rate": 1.8592e-05, + "loss": 0.2242, + "step": 4648 + }, + { + "epoch": 0.093, + "grad_norm": 1.4420287609100342, + "learning_rate": 1.86e-05, + "loss": 0.252, + "step": 4650 + }, + { + "epoch": 0.09304, + "grad_norm": 0.7690425515174866, + "learning_rate": 1.8608000000000002e-05, + "loss": 0.1692, + "step": 4652 + }, + { + "epoch": 0.09308, + "grad_norm": 0.7280910611152649, + "learning_rate": 1.8616e-05, + "loss": 0.2283, + "step": 4654 + }, + { + "epoch": 0.09312, + "grad_norm": 0.7115260362625122, + "learning_rate": 1.8624000000000003e-05, + "loss": 0.1119, + "step": 4656 + }, + { + "epoch": 0.09316, + "grad_norm": 1.7190607786178589, + "learning_rate": 1.8632e-05, + "loss": 0.3665, + "step": 4658 + }, + { + "epoch": 0.0932, + "grad_norm": 1.3870068788528442, + "learning_rate": 1.864e-05, + "loss": 0.3172, + "step": 4660 + }, + { + "epoch": 0.09324, + "grad_norm": 1.3115952014923096, + "learning_rate": 1.8648000000000002e-05, + "loss": 0.1803, + "step": 4662 + }, + { + "epoch": 0.09328, + "grad_norm": 1.0175225734710693, + "learning_rate": 1.8656000000000002e-05, + "loss": 0.1939, + "step": 4664 + }, + { + "epoch": 0.09332, + "grad_norm": 1.206196904182434, + "learning_rate": 1.8664000000000003e-05, + "loss": 0.2237, + "step": 4666 + }, + { + "epoch": 0.09336, + "grad_norm": 1.2687270641326904, + "learning_rate": 1.8672e-05, + "loss": 0.2541, + "step": 4668 + }, + { + "epoch": 0.0934, + "grad_norm": 1.1587944030761719, + "learning_rate": 1.8680000000000004e-05, + "loss": 0.2824, + "step": 4670 + }, + { + "epoch": 0.09344, + "grad_norm": 1.381433129310608, + "learning_rate": 1.8688e-05, + "loss": 0.2068, + "step": 4672 + }, + { + "epoch": 0.09348, + "grad_norm": 1.1397956609725952, + "learning_rate": 1.8696000000000002e-05, + "loss": 0.1845, + "step": 4674 + }, + { + "epoch": 0.09352, + "grad_norm": 1.052229881286621, + "learning_rate": 1.8704000000000003e-05, + "loss": 0.2332, + "step": 4676 + }, + { + "epoch": 0.09356, + "grad_norm": 0.9748356938362122, + "learning_rate": 1.8712e-05, + "loss": 0.2596, + "step": 4678 + }, + { + "epoch": 0.0936, + "grad_norm": 1.0928822755813599, + "learning_rate": 1.8720000000000004e-05, + "loss": 0.2039, + "step": 4680 + }, + { + "epoch": 0.09364, + "grad_norm": 0.6322823166847229, + "learning_rate": 1.8728e-05, + "loss": 0.1858, + "step": 4682 + }, + { + "epoch": 0.09368, + "grad_norm": 1.3382562398910522, + "learning_rate": 1.8736e-05, + "loss": 0.2775, + "step": 4684 + }, + { + "epoch": 0.09372, + "grad_norm": 1.1884262561798096, + "learning_rate": 1.8744000000000002e-05, + "loss": 0.2124, + "step": 4686 + }, + { + "epoch": 0.09376, + "grad_norm": 0.9147723317146301, + "learning_rate": 1.8752000000000003e-05, + "loss": 0.2301, + "step": 4688 + }, + { + "epoch": 0.0938, + "grad_norm": 1.1134952306747437, + "learning_rate": 1.876e-05, + "loss": 0.1859, + "step": 4690 + }, + { + "epoch": 0.09384, + "grad_norm": 0.8111677765846252, + "learning_rate": 1.8768e-05, + "loss": 0.3094, + "step": 4692 + }, + { + "epoch": 0.09388, + "grad_norm": 2.486574411392212, + "learning_rate": 1.8776e-05, + "loss": 0.3436, + "step": 4694 + }, + { + "epoch": 0.09392, + "grad_norm": 0.8302222490310669, + "learning_rate": 1.8784000000000002e-05, + "loss": 0.2632, + "step": 4696 + }, + { + "epoch": 0.09396, + "grad_norm": 1.3245517015457153, + "learning_rate": 1.8792000000000002e-05, + "loss": 0.3463, + "step": 4698 + }, + { + "epoch": 0.094, + "grad_norm": 1.415390968322754, + "learning_rate": 1.88e-05, + "loss": 0.1912, + "step": 4700 + }, + { + "epoch": 0.09404, + "grad_norm": 1.9247554540634155, + "learning_rate": 1.8808e-05, + "loss": 0.3315, + "step": 4702 + }, + { + "epoch": 0.09408, + "grad_norm": 1.1804698705673218, + "learning_rate": 1.8816e-05, + "loss": 0.2964, + "step": 4704 + }, + { + "epoch": 0.09412, + "grad_norm": 1.198084831237793, + "learning_rate": 1.8824e-05, + "loss": 0.2327, + "step": 4706 + }, + { + "epoch": 0.09416, + "grad_norm": 0.8789706826210022, + "learning_rate": 1.8832000000000002e-05, + "loss": 0.176, + "step": 4708 + }, + { + "epoch": 0.0942, + "grad_norm": 0.7020732760429382, + "learning_rate": 1.884e-05, + "loss": 0.1179, + "step": 4710 + }, + { + "epoch": 0.09424, + "grad_norm": 1.0982226133346558, + "learning_rate": 1.8848000000000003e-05, + "loss": 0.2353, + "step": 4712 + }, + { + "epoch": 0.09428, + "grad_norm": 0.7245738506317139, + "learning_rate": 1.8856e-05, + "loss": 0.1557, + "step": 4714 + }, + { + "epoch": 0.09432, + "grad_norm": 2.257189989089966, + "learning_rate": 1.8864e-05, + "loss": 0.2941, + "step": 4716 + }, + { + "epoch": 0.09436, + "grad_norm": 0.6777299642562866, + "learning_rate": 1.8872e-05, + "loss": 0.2674, + "step": 4718 + }, + { + "epoch": 0.0944, + "grad_norm": 0.5905842185020447, + "learning_rate": 1.8880000000000002e-05, + "loss": 0.2476, + "step": 4720 + }, + { + "epoch": 0.09444, + "grad_norm": 1.9563500881195068, + "learning_rate": 1.8888000000000003e-05, + "loss": 0.4116, + "step": 4722 + }, + { + "epoch": 0.09448, + "grad_norm": 0.8461650609970093, + "learning_rate": 1.8896e-05, + "loss": 0.1253, + "step": 4724 + }, + { + "epoch": 0.09452, + "grad_norm": 1.6999644041061401, + "learning_rate": 1.8904000000000004e-05, + "loss": 0.2847, + "step": 4726 + }, + { + "epoch": 0.09456, + "grad_norm": 1.2935497760772705, + "learning_rate": 1.8912e-05, + "loss": 0.3174, + "step": 4728 + }, + { + "epoch": 0.0946, + "grad_norm": 1.1501516103744507, + "learning_rate": 1.8920000000000002e-05, + "loss": 0.2124, + "step": 4730 + }, + { + "epoch": 0.09464, + "grad_norm": 0.8757458329200745, + "learning_rate": 1.8928000000000002e-05, + "loss": 0.2111, + "step": 4732 + }, + { + "epoch": 0.09468, + "grad_norm": 1.4061754941940308, + "learning_rate": 1.8936e-05, + "loss": 0.2333, + "step": 4734 + }, + { + "epoch": 0.09472, + "grad_norm": 1.4631247520446777, + "learning_rate": 1.8944000000000004e-05, + "loss": 0.2655, + "step": 4736 + }, + { + "epoch": 0.09476, + "grad_norm": 1.2779186964035034, + "learning_rate": 1.8952e-05, + "loss": 0.2337, + "step": 4738 + }, + { + "epoch": 0.0948, + "grad_norm": 1.0339691638946533, + "learning_rate": 1.896e-05, + "loss": 0.1767, + "step": 4740 + }, + { + "epoch": 0.09484, + "grad_norm": 1.3446279764175415, + "learning_rate": 1.8968000000000002e-05, + "loss": 0.2332, + "step": 4742 + }, + { + "epoch": 0.09488, + "grad_norm": 1.365195393562317, + "learning_rate": 1.8976000000000003e-05, + "loss": 0.2669, + "step": 4744 + }, + { + "epoch": 0.09492, + "grad_norm": 1.0673421621322632, + "learning_rate": 1.8984000000000003e-05, + "loss": 0.2448, + "step": 4746 + }, + { + "epoch": 0.09496, + "grad_norm": 1.133434772491455, + "learning_rate": 1.8992e-05, + "loss": 0.3069, + "step": 4748 + }, + { + "epoch": 0.095, + "grad_norm": 1.2341831922531128, + "learning_rate": 1.9e-05, + "loss": 0.277, + "step": 4750 + }, + { + "epoch": 0.09504, + "grad_norm": 0.5892310738563538, + "learning_rate": 1.9008e-05, + "loss": 0.1151, + "step": 4752 + }, + { + "epoch": 0.09508, + "grad_norm": 1.3495213985443115, + "learning_rate": 1.9016000000000002e-05, + "loss": 0.2512, + "step": 4754 + }, + { + "epoch": 0.09512, + "grad_norm": 0.7921088933944702, + "learning_rate": 1.9024000000000003e-05, + "loss": 0.1454, + "step": 4756 + }, + { + "epoch": 0.09516, + "grad_norm": 1.0225422382354736, + "learning_rate": 1.9032e-05, + "loss": 0.1962, + "step": 4758 + }, + { + "epoch": 0.0952, + "grad_norm": 0.7996509075164795, + "learning_rate": 1.904e-05, + "loss": 0.2354, + "step": 4760 + }, + { + "epoch": 0.09524, + "grad_norm": 1.1948829889297485, + "learning_rate": 1.9048e-05, + "loss": 0.2546, + "step": 4762 + }, + { + "epoch": 0.09528, + "grad_norm": 1.045170545578003, + "learning_rate": 1.9056000000000002e-05, + "loss": 0.2131, + "step": 4764 + }, + { + "epoch": 0.09532, + "grad_norm": 1.3495311737060547, + "learning_rate": 1.9064000000000002e-05, + "loss": 0.2486, + "step": 4766 + }, + { + "epoch": 0.09536, + "grad_norm": 1.3425676822662354, + "learning_rate": 1.9072000000000003e-05, + "loss": 0.3009, + "step": 4768 + }, + { + "epoch": 0.0954, + "grad_norm": 1.433079719543457, + "learning_rate": 1.908e-05, + "loss": 0.3582, + "step": 4770 + }, + { + "epoch": 0.09544, + "grad_norm": 0.7620032429695129, + "learning_rate": 1.9088e-05, + "loss": 0.2352, + "step": 4772 + }, + { + "epoch": 0.09548, + "grad_norm": 0.8726173043251038, + "learning_rate": 1.9096e-05, + "loss": 0.169, + "step": 4774 + }, + { + "epoch": 0.09552, + "grad_norm": 0.8038036227226257, + "learning_rate": 1.9104000000000002e-05, + "loss": 0.1962, + "step": 4776 + }, + { + "epoch": 0.09556, + "grad_norm": 1.075671911239624, + "learning_rate": 1.9112000000000003e-05, + "loss": 0.2775, + "step": 4778 + }, + { + "epoch": 0.0956, + "grad_norm": 0.9000018239021301, + "learning_rate": 1.912e-05, + "loss": 0.169, + "step": 4780 + }, + { + "epoch": 0.09564, + "grad_norm": 1.1320769786834717, + "learning_rate": 1.9128e-05, + "loss": 0.2237, + "step": 4782 + }, + { + "epoch": 0.09568, + "grad_norm": 1.0694514513015747, + "learning_rate": 1.9136e-05, + "loss": 0.2325, + "step": 4784 + }, + { + "epoch": 0.09572, + "grad_norm": 1.2853153944015503, + "learning_rate": 1.9144000000000002e-05, + "loss": 0.2564, + "step": 4786 + }, + { + "epoch": 0.09576, + "grad_norm": 1.1604011058807373, + "learning_rate": 1.9152000000000002e-05, + "loss": 0.2566, + "step": 4788 + }, + { + "epoch": 0.0958, + "grad_norm": 1.0822042226791382, + "learning_rate": 1.916e-05, + "loss": 0.1859, + "step": 4790 + }, + { + "epoch": 0.09584, + "grad_norm": 1.4966673851013184, + "learning_rate": 1.9168000000000004e-05, + "loss": 0.3033, + "step": 4792 + }, + { + "epoch": 0.09588, + "grad_norm": 1.2534326314926147, + "learning_rate": 1.9176e-05, + "loss": 0.2431, + "step": 4794 + }, + { + "epoch": 0.09592, + "grad_norm": 1.1542603969573975, + "learning_rate": 1.9184e-05, + "loss": 0.1782, + "step": 4796 + }, + { + "epoch": 0.09596, + "grad_norm": 1.5744166374206543, + "learning_rate": 1.9192000000000002e-05, + "loss": 0.339, + "step": 4798 + }, + { + "epoch": 0.096, + "grad_norm": 0.9936608076095581, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.1759, + "step": 4800 + }, + { + "epoch": 0.09604, + "grad_norm": 1.010016679763794, + "learning_rate": 1.9208000000000003e-05, + "loss": 0.1495, + "step": 4802 + }, + { + "epoch": 0.09608, + "grad_norm": 1.1769235134124756, + "learning_rate": 1.9216e-05, + "loss": 0.2041, + "step": 4804 + }, + { + "epoch": 0.09612, + "grad_norm": 0.8490267395973206, + "learning_rate": 1.9224000000000004e-05, + "loss": 0.1589, + "step": 4806 + }, + { + "epoch": 0.09616, + "grad_norm": 0.7242828011512756, + "learning_rate": 1.9232e-05, + "loss": 0.0952, + "step": 4808 + }, + { + "epoch": 0.0962, + "grad_norm": 1.030695915222168, + "learning_rate": 1.9240000000000002e-05, + "loss": 0.1261, + "step": 4810 + }, + { + "epoch": 0.09624, + "grad_norm": 1.8070573806762695, + "learning_rate": 1.9248000000000003e-05, + "loss": 0.1902, + "step": 4812 + }, + { + "epoch": 0.09628, + "grad_norm": 2.0528781414031982, + "learning_rate": 1.9256e-05, + "loss": 0.2119, + "step": 4814 + }, + { + "epoch": 0.09632, + "grad_norm": 2.372448682785034, + "learning_rate": 1.9264e-05, + "loss": 0.5183, + "step": 4816 + }, + { + "epoch": 0.09636, + "grad_norm": 0.9260823130607605, + "learning_rate": 1.9272e-05, + "loss": 0.218, + "step": 4818 + }, + { + "epoch": 0.0964, + "grad_norm": 1.1417063474655151, + "learning_rate": 1.9280000000000002e-05, + "loss": 0.2544, + "step": 4820 + }, + { + "epoch": 0.09644, + "grad_norm": 1.0941393375396729, + "learning_rate": 1.9288000000000002e-05, + "loss": 0.2223, + "step": 4822 + }, + { + "epoch": 0.09648, + "grad_norm": 0.6038942337036133, + "learning_rate": 1.9296000000000003e-05, + "loss": 0.159, + "step": 4824 + }, + { + "epoch": 0.09652, + "grad_norm": 0.758965790271759, + "learning_rate": 1.9304e-05, + "loss": 0.1448, + "step": 4826 + }, + { + "epoch": 0.09656, + "grad_norm": 1.4806100130081177, + "learning_rate": 1.9312e-05, + "loss": 0.2665, + "step": 4828 + }, + { + "epoch": 0.0966, + "grad_norm": 1.9575191736221313, + "learning_rate": 1.932e-05, + "loss": 0.2913, + "step": 4830 + }, + { + "epoch": 0.09664, + "grad_norm": 0.8333443403244019, + "learning_rate": 1.9328000000000002e-05, + "loss": 0.1374, + "step": 4832 + }, + { + "epoch": 0.09668, + "grad_norm": 0.8177634477615356, + "learning_rate": 1.9336000000000003e-05, + "loss": 0.1382, + "step": 4834 + }, + { + "epoch": 0.09672, + "grad_norm": 1.8091710805892944, + "learning_rate": 1.9344e-05, + "loss": 0.2653, + "step": 4836 + }, + { + "epoch": 0.09676, + "grad_norm": 0.9792643785476685, + "learning_rate": 1.9352e-05, + "loss": 0.1373, + "step": 4838 + }, + { + "epoch": 0.0968, + "grad_norm": 1.368931770324707, + "learning_rate": 1.936e-05, + "loss": 0.2564, + "step": 4840 + }, + { + "epoch": 0.09684, + "grad_norm": 0.5022580027580261, + "learning_rate": 1.9368e-05, + "loss": 0.1186, + "step": 4842 + }, + { + "epoch": 0.09688, + "grad_norm": 1.575372576713562, + "learning_rate": 1.9376000000000002e-05, + "loss": 0.1938, + "step": 4844 + }, + { + "epoch": 0.09692, + "grad_norm": 6.794852256774902, + "learning_rate": 1.9384e-05, + "loss": 0.1392, + "step": 4846 + }, + { + "epoch": 0.09696, + "grad_norm": 0.5950358510017395, + "learning_rate": 1.9392000000000003e-05, + "loss": 0.1934, + "step": 4848 + }, + { + "epoch": 0.097, + "grad_norm": 2.6948859691619873, + "learning_rate": 1.94e-05, + "loss": 0.212, + "step": 4850 + }, + { + "epoch": 0.09704, + "grad_norm": 2.335195779800415, + "learning_rate": 1.9408e-05, + "loss": 0.3663, + "step": 4852 + }, + { + "epoch": 0.09708, + "grad_norm": 1.7152292728424072, + "learning_rate": 1.9416000000000002e-05, + "loss": 0.277, + "step": 4854 + }, + { + "epoch": 0.09712, + "grad_norm": 0.966866135597229, + "learning_rate": 1.9424e-05, + "loss": 0.2693, + "step": 4856 + }, + { + "epoch": 0.09716, + "grad_norm": 1.3743140697479248, + "learning_rate": 1.9432000000000003e-05, + "loss": 0.2428, + "step": 4858 + }, + { + "epoch": 0.0972, + "grad_norm": 0.9330706000328064, + "learning_rate": 1.944e-05, + "loss": 0.0966, + "step": 4860 + }, + { + "epoch": 0.09724, + "grad_norm": 1.5096924304962158, + "learning_rate": 1.9448e-05, + "loss": 0.1037, + "step": 4862 + }, + { + "epoch": 0.09728, + "grad_norm": 2.9531471729278564, + "learning_rate": 1.9456e-05, + "loss": 0.8221, + "step": 4864 + }, + { + "epoch": 0.09732, + "grad_norm": 0.3469306230545044, + "learning_rate": 1.9464000000000002e-05, + "loss": 0.0462, + "step": 4866 + }, + { + "epoch": 0.09736, + "grad_norm": 0.4426053464412689, + "learning_rate": 1.9472000000000003e-05, + "loss": 0.1871, + "step": 4868 + }, + { + "epoch": 0.0974, + "grad_norm": 2.5987884998321533, + "learning_rate": 1.948e-05, + "loss": 0.5022, + "step": 4870 + }, + { + "epoch": 0.09744, + "grad_norm": 1.3448069095611572, + "learning_rate": 1.9488000000000004e-05, + "loss": 0.1387, + "step": 4872 + }, + { + "epoch": 0.09748, + "grad_norm": 1.7249398231506348, + "learning_rate": 1.9496e-05, + "loss": 0.2126, + "step": 4874 + }, + { + "epoch": 0.09752, + "grad_norm": 1.4719213247299194, + "learning_rate": 1.9504e-05, + "loss": 0.1762, + "step": 4876 + }, + { + "epoch": 0.09756, + "grad_norm": 2.4896347522735596, + "learning_rate": 1.9512000000000002e-05, + "loss": 0.3528, + "step": 4878 + }, + { + "epoch": 0.0976, + "grad_norm": 1.8837535381317139, + "learning_rate": 1.9520000000000003e-05, + "loss": 0.293, + "step": 4880 + }, + { + "epoch": 0.09764, + "grad_norm": 1.600178599357605, + "learning_rate": 1.9528000000000003e-05, + "loss": 0.2033, + "step": 4882 + }, + { + "epoch": 0.09768, + "grad_norm": 1.2287583351135254, + "learning_rate": 1.9536e-05, + "loss": 0.1037, + "step": 4884 + }, + { + "epoch": 0.09772, + "grad_norm": 2.162766933441162, + "learning_rate": 1.9544e-05, + "loss": 0.2658, + "step": 4886 + }, + { + "epoch": 0.09776, + "grad_norm": 1.4961657524108887, + "learning_rate": 1.9552000000000002e-05, + "loss": 0.1768, + "step": 4888 + }, + { + "epoch": 0.0978, + "grad_norm": 1.5972774028778076, + "learning_rate": 1.9560000000000002e-05, + "loss": 0.2513, + "step": 4890 + }, + { + "epoch": 0.09784, + "grad_norm": 2.0178208351135254, + "learning_rate": 1.9568000000000003e-05, + "loss": 0.1693, + "step": 4892 + }, + { + "epoch": 0.09788, + "grad_norm": 1.993406891822815, + "learning_rate": 1.9576e-05, + "loss": 0.2434, + "step": 4894 + }, + { + "epoch": 0.09792, + "grad_norm": 0.8384615182876587, + "learning_rate": 1.9584e-05, + "loss": 0.1697, + "step": 4896 + }, + { + "epoch": 0.09796, + "grad_norm": 1.5520025491714478, + "learning_rate": 1.9592e-05, + "loss": 0.1757, + "step": 4898 + }, + { + "epoch": 0.098, + "grad_norm": 2.563807725906372, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.3393, + "step": 4900 + }, + { + "epoch": 0.09804, + "grad_norm": 0.47509002685546875, + "learning_rate": 1.9608000000000003e-05, + "loss": 0.0466, + "step": 4902 + }, + { + "epoch": 0.09808, + "grad_norm": 2.4682908058166504, + "learning_rate": 1.9616000000000003e-05, + "loss": 0.2793, + "step": 4904 + }, + { + "epoch": 0.09812, + "grad_norm": 1.7600586414337158, + "learning_rate": 1.9624e-05, + "loss": 0.3316, + "step": 4906 + }, + { + "epoch": 0.09816, + "grad_norm": 3.4813685417175293, + "learning_rate": 1.9632e-05, + "loss": 0.341, + "step": 4908 + }, + { + "epoch": 0.0982, + "grad_norm": 1.1703941822052002, + "learning_rate": 1.9640000000000002e-05, + "loss": 0.2414, + "step": 4910 + }, + { + "epoch": 0.09824, + "grad_norm": 0.9169619083404541, + "learning_rate": 1.9648000000000002e-05, + "loss": 0.1114, + "step": 4912 + }, + { + "epoch": 0.09828, + "grad_norm": 0.5550183653831482, + "learning_rate": 1.9656000000000003e-05, + "loss": 0.0608, + "step": 4914 + }, + { + "epoch": 0.09832, + "grad_norm": 2.4669971466064453, + "learning_rate": 1.9664e-05, + "loss": 0.5188, + "step": 4916 + }, + { + "epoch": 0.09836, + "grad_norm": 0.5255228877067566, + "learning_rate": 1.9672e-05, + "loss": 0.0519, + "step": 4918 + }, + { + "epoch": 0.0984, + "grad_norm": 0.9903678297996521, + "learning_rate": 1.968e-05, + "loss": 0.0891, + "step": 4920 + }, + { + "epoch": 0.09844, + "grad_norm": 2.6133570671081543, + "learning_rate": 1.9688000000000002e-05, + "loss": 0.3313, + "step": 4922 + }, + { + "epoch": 0.09848, + "grad_norm": 2.1596028804779053, + "learning_rate": 1.9696000000000003e-05, + "loss": 0.4698, + "step": 4924 + }, + { + "epoch": 0.09852, + "grad_norm": 0.9335158467292786, + "learning_rate": 1.9704e-05, + "loss": 0.1033, + "step": 4926 + }, + { + "epoch": 0.09856, + "grad_norm": 1.5088623762130737, + "learning_rate": 1.9712000000000004e-05, + "loss": 0.2349, + "step": 4928 + }, + { + "epoch": 0.0986, + "grad_norm": 1.2532533407211304, + "learning_rate": 1.972e-05, + "loss": 0.2792, + "step": 4930 + }, + { + "epoch": 0.09864, + "grad_norm": 1.212510585784912, + "learning_rate": 1.9728e-05, + "loss": 0.1714, + "step": 4932 + }, + { + "epoch": 0.09868, + "grad_norm": 1.1077470779418945, + "learning_rate": 1.9736000000000002e-05, + "loss": 0.1941, + "step": 4934 + }, + { + "epoch": 0.09872, + "grad_norm": 1.9594411849975586, + "learning_rate": 1.9744e-05, + "loss": 0.219, + "step": 4936 + }, + { + "epoch": 0.09876, + "grad_norm": 0.8232313394546509, + "learning_rate": 1.9752000000000003e-05, + "loss": 0.0872, + "step": 4938 + }, + { + "epoch": 0.0988, + "grad_norm": 2.7868332862854004, + "learning_rate": 1.976e-05, + "loss": 0.5689, + "step": 4940 + }, + { + "epoch": 0.09884, + "grad_norm": 2.1416478157043457, + "learning_rate": 1.9768e-05, + "loss": 0.2429, + "step": 4942 + }, + { + "epoch": 0.09888, + "grad_norm": 1.6793744564056396, + "learning_rate": 1.9776000000000002e-05, + "loss": 0.3828, + "step": 4944 + }, + { + "epoch": 0.09892, + "grad_norm": 2.8773117065429688, + "learning_rate": 1.9784000000000002e-05, + "loss": 0.5121, + "step": 4946 + }, + { + "epoch": 0.09896, + "grad_norm": 1.3204480409622192, + "learning_rate": 1.9792000000000003e-05, + "loss": 0.2333, + "step": 4948 + }, + { + "epoch": 0.099, + "grad_norm": 1.718261957168579, + "learning_rate": 1.98e-05, + "loss": 0.2238, + "step": 4950 + }, + { + "epoch": 0.09904, + "grad_norm": 0.624080240726471, + "learning_rate": 1.9808e-05, + "loss": 0.2033, + "step": 4952 + }, + { + "epoch": 0.09908, + "grad_norm": 1.443496584892273, + "learning_rate": 1.9816e-05, + "loss": 0.3404, + "step": 4954 + }, + { + "epoch": 0.09912, + "grad_norm": 0.8905125856399536, + "learning_rate": 1.9824000000000002e-05, + "loss": 0.1444, + "step": 4956 + }, + { + "epoch": 0.09916, + "grad_norm": 1.6824352741241455, + "learning_rate": 1.9832000000000003e-05, + "loss": 0.2486, + "step": 4958 + }, + { + "epoch": 0.0992, + "grad_norm": 0.9209867715835571, + "learning_rate": 1.9840000000000003e-05, + "loss": 0.1445, + "step": 4960 + }, + { + "epoch": 0.09924, + "grad_norm": 1.0943241119384766, + "learning_rate": 1.9848e-05, + "loss": 0.1943, + "step": 4962 + }, + { + "epoch": 0.09928, + "grad_norm": 1.000475525856018, + "learning_rate": 1.9856e-05, + "loss": 0.1993, + "step": 4964 + }, + { + "epoch": 0.09932, + "grad_norm": 1.38869309425354, + "learning_rate": 1.9864e-05, + "loss": 0.1696, + "step": 4966 + }, + { + "epoch": 0.09936, + "grad_norm": 1.792549729347229, + "learning_rate": 1.9872000000000002e-05, + "loss": 0.3678, + "step": 4968 + }, + { + "epoch": 0.0994, + "grad_norm": 1.4431064128875732, + "learning_rate": 1.9880000000000003e-05, + "loss": 0.2225, + "step": 4970 + }, + { + "epoch": 0.09944, + "grad_norm": 1.4066742658615112, + "learning_rate": 1.9888e-05, + "loss": 0.2031, + "step": 4972 + }, + { + "epoch": 0.09948, + "grad_norm": 0.4198433756828308, + "learning_rate": 1.9896e-05, + "loss": 0.0872, + "step": 4974 + }, + { + "epoch": 0.09952, + "grad_norm": 1.629225492477417, + "learning_rate": 1.9904e-05, + "loss": 0.2569, + "step": 4976 + }, + { + "epoch": 0.09956, + "grad_norm": 0.5045538544654846, + "learning_rate": 1.9912000000000002e-05, + "loss": 0.0806, + "step": 4978 + }, + { + "epoch": 0.0996, + "grad_norm": 2.886420249938965, + "learning_rate": 1.9920000000000002e-05, + "loss": 0.3901, + "step": 4980 + }, + { + "epoch": 0.09964, + "grad_norm": 1.339496374130249, + "learning_rate": 1.9928e-05, + "loss": 0.251, + "step": 4982 + }, + { + "epoch": 0.09968, + "grad_norm": 2.370908737182617, + "learning_rate": 1.9936000000000004e-05, + "loss": 0.3783, + "step": 4984 + }, + { + "epoch": 0.09972, + "grad_norm": 0.7723445296287537, + "learning_rate": 1.9944e-05, + "loss": 0.2034, + "step": 4986 + }, + { + "epoch": 0.09976, + "grad_norm": 1.3929554224014282, + "learning_rate": 1.9952e-05, + "loss": 0.2539, + "step": 4988 + }, + { + "epoch": 0.0998, + "grad_norm": 1.547407865524292, + "learning_rate": 1.9960000000000002e-05, + "loss": 0.1934, + "step": 4990 + }, + { + "epoch": 0.09984, + "grad_norm": 1.7166483402252197, + "learning_rate": 1.9968e-05, + "loss": 0.235, + "step": 4992 + }, + { + "epoch": 0.09988, + "grad_norm": 1.391156554222107, + "learning_rate": 1.9976000000000003e-05, + "loss": 0.3146, + "step": 4994 + }, + { + "epoch": 0.09992, + "grad_norm": 0.8087192177772522, + "learning_rate": 1.9984e-05, + "loss": 0.1326, + "step": 4996 + }, + { + "epoch": 0.09996, + "grad_norm": 1.1232142448425293, + "learning_rate": 1.9992e-05, + "loss": 0.159, + "step": 4998 + }, + { + "epoch": 0.1, + "grad_norm": 1.2268017530441284, + "learning_rate": 2e-05, + "loss": 0.1223, + "step": 5000 + }, + { + "epoch": 0.10004, + "grad_norm": 0.6501366496086121, + "learning_rate": 1.999999990252243e-05, + "loss": 0.2621, + "step": 5002 + }, + { + "epoch": 0.10008, + "grad_norm": 1.018579125404358, + "learning_rate": 1.9999999610089706e-05, + "loss": 0.1329, + "step": 5004 + }, + { + "epoch": 0.10012, + "grad_norm": 2.2190592288970947, + "learning_rate": 1.9999999122701846e-05, + "loss": 0.2536, + "step": 5006 + }, + { + "epoch": 0.10016, + "grad_norm": 1.9152991771697998, + "learning_rate": 1.9999998440358852e-05, + "loss": 0.2539, + "step": 5008 + }, + { + "epoch": 0.1002, + "grad_norm": 2.0179224014282227, + "learning_rate": 1.9999997563060744e-05, + "loss": 0.2767, + "step": 5010 + }, + { + "epoch": 0.10024, + "grad_norm": 0.5864546298980713, + "learning_rate": 1.9999996490807533e-05, + "loss": 0.1349, + "step": 5012 + }, + { + "epoch": 0.10028, + "grad_norm": 2.811547040939331, + "learning_rate": 1.999999522359924e-05, + "loss": 0.3904, + "step": 5014 + }, + { + "epoch": 0.10032, + "grad_norm": 2.4015820026397705, + "learning_rate": 1.9999993761435893e-05, + "loss": 0.5543, + "step": 5016 + }, + { + "epoch": 0.10036, + "grad_norm": 0.7431479096412659, + "learning_rate": 1.999999210431752e-05, + "loss": 0.1251, + "step": 5018 + }, + { + "epoch": 0.1004, + "grad_norm": 1.6561721563339233, + "learning_rate": 1.9999990252244153e-05, + "loss": 0.2656, + "step": 5020 + }, + { + "epoch": 0.10044, + "grad_norm": 0.7089477181434631, + "learning_rate": 1.9999988205215824e-05, + "loss": 0.2295, + "step": 5022 + }, + { + "epoch": 0.10048, + "grad_norm": 0.99146568775177, + "learning_rate": 1.9999985963232583e-05, + "loss": 0.2228, + "step": 5024 + }, + { + "epoch": 0.10052, + "grad_norm": 0.9542055726051331, + "learning_rate": 1.9999983526294464e-05, + "loss": 0.1467, + "step": 5026 + }, + { + "epoch": 0.10056, + "grad_norm": 1.2097877264022827, + "learning_rate": 1.9999980894401517e-05, + "loss": 0.2731, + "step": 5028 + }, + { + "epoch": 0.1006, + "grad_norm": 1.3150912523269653, + "learning_rate": 1.9999978067553796e-05, + "loss": 0.1695, + "step": 5030 + }, + { + "epoch": 0.10064, + "grad_norm": 1.7241408824920654, + "learning_rate": 1.9999975045751353e-05, + "loss": 0.2473, + "step": 5032 + }, + { + "epoch": 0.10068, + "grad_norm": 1.528927206993103, + "learning_rate": 1.9999971828994248e-05, + "loss": 0.2775, + "step": 5034 + }, + { + "epoch": 0.10072, + "grad_norm": 1.154151439666748, + "learning_rate": 1.9999968417282542e-05, + "loss": 0.3014, + "step": 5036 + }, + { + "epoch": 0.10076, + "grad_norm": 1.1072521209716797, + "learning_rate": 1.9999964810616307e-05, + "loss": 0.1847, + "step": 5038 + }, + { + "epoch": 0.1008, + "grad_norm": 1.0826150178909302, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.1941, + "step": 5040 + }, + { + "epoch": 0.10084, + "grad_norm": 1.2944306135177612, + "learning_rate": 1.999995701242052e-05, + "loss": 0.159, + "step": 5042 + }, + { + "epoch": 0.10088, + "grad_norm": 1.7133711576461792, + "learning_rate": 1.9999952820891126e-05, + "loss": 0.235, + "step": 5044 + }, + { + "epoch": 0.10092, + "grad_norm": 1.0500092506408691, + "learning_rate": 1.9999948434407497e-05, + "loss": 0.1248, + "step": 5046 + }, + { + "epoch": 0.10096, + "grad_norm": 2.4780099391937256, + "learning_rate": 1.999994385296973e-05, + "loss": 0.3551, + "step": 5048 + }, + { + "epoch": 0.101, + "grad_norm": 1.4110714197158813, + "learning_rate": 1.9999939076577906e-05, + "loss": 0.1856, + "step": 5050 + }, + { + "epoch": 0.10104, + "grad_norm": 2.144401788711548, + "learning_rate": 1.9999934105232122e-05, + "loss": 0.3158, + "step": 5052 + }, + { + "epoch": 0.10108, + "grad_norm": 1.2333587408065796, + "learning_rate": 1.9999928938932473e-05, + "loss": 0.2262, + "step": 5054 + }, + { + "epoch": 0.10112, + "grad_norm": 1.3703961372375488, + "learning_rate": 1.9999923577679066e-05, + "loss": 0.3475, + "step": 5056 + }, + { + "epoch": 0.10116, + "grad_norm": 2.0621376037597656, + "learning_rate": 1.9999918021471994e-05, + "loss": 0.2916, + "step": 5058 + }, + { + "epoch": 0.1012, + "grad_norm": 1.2553815841674805, + "learning_rate": 1.9999912270311376e-05, + "loss": 0.3206, + "step": 5060 + }, + { + "epoch": 0.10124, + "grad_norm": 0.8551321029663086, + "learning_rate": 1.999990632419732e-05, + "loss": 0.162, + "step": 5062 + }, + { + "epoch": 0.10128, + "grad_norm": 1.4037811756134033, + "learning_rate": 1.9999900183129944e-05, + "loss": 0.2331, + "step": 5064 + }, + { + "epoch": 0.10132, + "grad_norm": 1.4045473337173462, + "learning_rate": 1.9999893847109362e-05, + "loss": 0.2382, + "step": 5066 + }, + { + "epoch": 0.10136, + "grad_norm": 1.0417609214782715, + "learning_rate": 1.99998873161357e-05, + "loss": 0.2302, + "step": 5068 + }, + { + "epoch": 0.1014, + "grad_norm": 1.489794373512268, + "learning_rate": 1.999988059020909e-05, + "loss": 0.2351, + "step": 5070 + }, + { + "epoch": 0.10144, + "grad_norm": 1.3613009452819824, + "learning_rate": 1.999987366932966e-05, + "loss": 0.2546, + "step": 5072 + }, + { + "epoch": 0.10148, + "grad_norm": 1.3051034212112427, + "learning_rate": 1.9999866553497544e-05, + "loss": 0.1713, + "step": 5074 + }, + { + "epoch": 0.10152, + "grad_norm": 1.40433669090271, + "learning_rate": 1.999985924271288e-05, + "loss": 0.2471, + "step": 5076 + }, + { + "epoch": 0.10156, + "grad_norm": 1.1552796363830566, + "learning_rate": 1.999985173697581e-05, + "loss": 0.1676, + "step": 5078 + }, + { + "epoch": 0.1016, + "grad_norm": 1.5835893154144287, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.2181, + "step": 5080 + }, + { + "epoch": 0.10164, + "grad_norm": 1.6947658061981201, + "learning_rate": 1.999983614064505e-05, + "loss": 0.2774, + "step": 5082 + }, + { + "epoch": 0.10168, + "grad_norm": 1.3657456636428833, + "learning_rate": 1.9999828050051662e-05, + "loss": 0.2123, + "step": 5084 + }, + { + "epoch": 0.10172, + "grad_norm": 1.6144520044326782, + "learning_rate": 1.999981976450648e-05, + "loss": 0.2236, + "step": 5086 + }, + { + "epoch": 0.10176, + "grad_norm": 1.5174298286437988, + "learning_rate": 1.999981128400966e-05, + "loss": 0.2324, + "step": 5088 + }, + { + "epoch": 0.1018, + "grad_norm": 1.6227704286575317, + "learning_rate": 1.999980260856137e-05, + "loss": 0.175, + "step": 5090 + }, + { + "epoch": 0.10184, + "grad_norm": 1.9216738939285278, + "learning_rate": 1.9999793738161785e-05, + "loss": 0.23, + "step": 5092 + }, + { + "epoch": 0.10188, + "grad_norm": 1.3615074157714844, + "learning_rate": 1.9999784672811068e-05, + "loss": 0.1938, + "step": 5094 + }, + { + "epoch": 0.10192, + "grad_norm": 1.7533231973648071, + "learning_rate": 1.9999775412509405e-05, + "loss": 0.199, + "step": 5096 + }, + { + "epoch": 0.10196, + "grad_norm": 0.8518145680427551, + "learning_rate": 1.999976595725697e-05, + "loss": 0.2876, + "step": 5098 + }, + { + "epoch": 0.102, + "grad_norm": 1.5609817504882812, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.178, + "step": 5100 + }, + { + "epoch": 0.10204, + "grad_norm": 1.6829190254211426, + "learning_rate": 1.999974646190053e-05, + "loss": 0.2899, + "step": 5102 + }, + { + "epoch": 0.10208, + "grad_norm": 1.4796191453933716, + "learning_rate": 1.9999736421796908e-05, + "loss": 0.1712, + "step": 5104 + }, + { + "epoch": 0.10212, + "grad_norm": 1.7163571119308472, + "learning_rate": 1.9999726186743276e-05, + "loss": 0.2063, + "step": 5106 + }, + { + "epoch": 0.10216, + "grad_norm": 1.0173156261444092, + "learning_rate": 1.9999715756739833e-05, + "loss": 0.1322, + "step": 5108 + }, + { + "epoch": 0.1022, + "grad_norm": 3.0033745765686035, + "learning_rate": 1.999970513178678e-05, + "loss": 0.3938, + "step": 5110 + }, + { + "epoch": 0.10224, + "grad_norm": 2.8910107612609863, + "learning_rate": 1.999969431188433e-05, + "loss": 0.5695, + "step": 5112 + }, + { + "epoch": 0.10228, + "grad_norm": 2.101285696029663, + "learning_rate": 1.9999683297032694e-05, + "loss": 0.2185, + "step": 5114 + }, + { + "epoch": 0.10232, + "grad_norm": 0.7513188123703003, + "learning_rate": 1.999967208723208e-05, + "loss": 0.1249, + "step": 5116 + }, + { + "epoch": 0.10236, + "grad_norm": 1.2201420068740845, + "learning_rate": 1.9999660682482708e-05, + "loss": 0.1441, + "step": 5118 + }, + { + "epoch": 0.1024, + "grad_norm": 1.1473205089569092, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.1845, + "step": 5120 + }, + { + "epoch": 0.10244, + "grad_norm": 1.2223681211471558, + "learning_rate": 1.9999637288138598e-05, + "loss": 0.2349, + "step": 5122 + }, + { + "epoch": 0.10248, + "grad_norm": 1.0536630153656006, + "learning_rate": 1.9999625298544314e-05, + "loss": 0.1956, + "step": 5124 + }, + { + "epoch": 0.10252, + "grad_norm": 1.4180099964141846, + "learning_rate": 1.9999613114002184e-05, + "loss": 0.1853, + "step": 5126 + }, + { + "epoch": 0.10256, + "grad_norm": 0.8296124935150146, + "learning_rate": 1.999960073451245e-05, + "loss": 0.2631, + "step": 5128 + }, + { + "epoch": 0.1026, + "grad_norm": 0.7315515279769897, + "learning_rate": 1.999958816007535e-05, + "loss": 0.229, + "step": 5130 + }, + { + "epoch": 0.10264, + "grad_norm": 2.156069040298462, + "learning_rate": 1.999957539069113e-05, + "loss": 0.5681, + "step": 5132 + }, + { + "epoch": 0.10268, + "grad_norm": 0.603334367275238, + "learning_rate": 1.999956242636004e-05, + "loss": 0.2103, + "step": 5134 + }, + { + "epoch": 0.10272, + "grad_norm": 0.6777198910713196, + "learning_rate": 1.9999549267082337e-05, + "loss": 0.197, + "step": 5136 + }, + { + "epoch": 0.10276, + "grad_norm": 1.2616537809371948, + "learning_rate": 1.999953591285827e-05, + "loss": 0.3286, + "step": 5138 + }, + { + "epoch": 0.1028, + "grad_norm": 1.2446823120117188, + "learning_rate": 1.99995223636881e-05, + "loss": 0.2264, + "step": 5140 + }, + { + "epoch": 0.10284, + "grad_norm": 1.126900553703308, + "learning_rate": 1.9999508619572094e-05, + "loss": 0.2351, + "step": 5142 + }, + { + "epoch": 0.10288, + "grad_norm": 1.1955260038375854, + "learning_rate": 1.999949468051052e-05, + "loss": 0.2239, + "step": 5144 + }, + { + "epoch": 0.10292, + "grad_norm": 1.1818650960922241, + "learning_rate": 1.9999480546503648e-05, + "loss": 0.2548, + "step": 5146 + }, + { + "epoch": 0.10296, + "grad_norm": 1.685107946395874, + "learning_rate": 1.9999466217551757e-05, + "loss": 0.2846, + "step": 5148 + }, + { + "epoch": 0.103, + "grad_norm": 1.4596582651138306, + "learning_rate": 1.9999451693655125e-05, + "loss": 0.2301, + "step": 5150 + }, + { + "epoch": 0.10304, + "grad_norm": 0.9869801998138428, + "learning_rate": 1.999943697481403e-05, + "loss": 0.2926, + "step": 5152 + }, + { + "epoch": 0.10308, + "grad_norm": 0.9841259121894836, + "learning_rate": 1.9999422061028765e-05, + "loss": 0.2132, + "step": 5154 + }, + { + "epoch": 0.10312, + "grad_norm": 0.723128080368042, + "learning_rate": 1.9999406952299617e-05, + "loss": 0.1653, + "step": 5156 + }, + { + "epoch": 0.10316, + "grad_norm": 0.6768753528594971, + "learning_rate": 1.9999391648626886e-05, + "loss": 0.1328, + "step": 5158 + }, + { + "epoch": 0.1032, + "grad_norm": 1.2364259958267212, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.2227, + "step": 5160 + }, + { + "epoch": 0.10324, + "grad_norm": 1.5613412857055664, + "learning_rate": 1.999936045645186e-05, + "loss": 0.2668, + "step": 5162 + }, + { + "epoch": 0.10328, + "grad_norm": 1.4377983808517456, + "learning_rate": 1.999934456795017e-05, + "loss": 0.2767, + "step": 5164 + }, + { + "epoch": 0.10332, + "grad_norm": 0.9703245162963867, + "learning_rate": 1.9999328484506115e-05, + "loss": 0.1519, + "step": 5166 + }, + { + "epoch": 0.10336, + "grad_norm": 0.6410667300224304, + "learning_rate": 1.9999312206120002e-05, + "loss": 0.1638, + "step": 5168 + }, + { + "epoch": 0.1034, + "grad_norm": 1.122828483581543, + "learning_rate": 1.9999295732792146e-05, + "loss": 0.3015, + "step": 5170 + }, + { + "epoch": 0.10344, + "grad_norm": 1.4236557483673096, + "learning_rate": 1.9999279064522876e-05, + "loss": 0.2263, + "step": 5172 + }, + { + "epoch": 0.10348, + "grad_norm": 1.5914745330810547, + "learning_rate": 1.999926220131251e-05, + "loss": 0.2899, + "step": 5174 + }, + { + "epoch": 0.10352, + "grad_norm": 0.9156357645988464, + "learning_rate": 1.999924514316138e-05, + "loss": 0.1676, + "step": 5176 + }, + { + "epoch": 0.10356, + "grad_norm": 0.890964686870575, + "learning_rate": 1.9999227890069818e-05, + "loss": 0.2035, + "step": 5178 + }, + { + "epoch": 0.1036, + "grad_norm": 1.0773248672485352, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.1764, + "step": 5180 + }, + { + "epoch": 0.10364, + "grad_norm": 1.7059940099716187, + "learning_rate": 1.999919279906675e-05, + "loss": 0.2566, + "step": 5182 + }, + { + "epoch": 0.10368, + "grad_norm": 2.384615182876587, + "learning_rate": 1.9999174961155928e-05, + "loss": 0.3225, + "step": 5184 + }, + { + "epoch": 0.10372, + "grad_norm": 0.6116907000541687, + "learning_rate": 1.999915692830604e-05, + "loss": 0.1259, + "step": 5186 + }, + { + "epoch": 0.10376, + "grad_norm": 2.0108351707458496, + "learning_rate": 1.9999138700517434e-05, + "loss": 0.3131, + "step": 5188 + }, + { + "epoch": 0.1038, + "grad_norm": 0.8958712816238403, + "learning_rate": 1.9999120277790477e-05, + "loss": 0.2281, + "step": 5190 + }, + { + "epoch": 0.10384, + "grad_norm": 1.3847079277038574, + "learning_rate": 1.9999101660125525e-05, + "loss": 0.1881, + "step": 5192 + }, + { + "epoch": 0.10388, + "grad_norm": 1.029082179069519, + "learning_rate": 1.9999082847522936e-05, + "loss": 0.2349, + "step": 5194 + }, + { + "epoch": 0.10392, + "grad_norm": 1.4850445985794067, + "learning_rate": 1.9999063839983077e-05, + "loss": 0.1651, + "step": 5196 + }, + { + "epoch": 0.10396, + "grad_norm": 0.9487821459770203, + "learning_rate": 1.999904463750632e-05, + "loss": 0.1856, + "step": 5198 + }, + { + "epoch": 0.104, + "grad_norm": 0.7346213459968567, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.2032, + "step": 5200 + }, + { + "epoch": 0.10404, + "grad_norm": 1.4876352548599243, + "learning_rate": 1.9999005647743618e-05, + "loss": 0.3284, + "step": 5202 + }, + { + "epoch": 0.10408, + "grad_norm": 1.0586791038513184, + "learning_rate": 1.9998985860458435e-05, + "loss": 0.1844, + "step": 5204 + }, + { + "epoch": 0.10412, + "grad_norm": 2.468672752380371, + "learning_rate": 1.9998965878237872e-05, + "loss": 0.4067, + "step": 5206 + }, + { + "epoch": 0.10416, + "grad_norm": 0.9708799123764038, + "learning_rate": 1.9998945701082322e-05, + "loss": 0.2913, + "step": 5208 + }, + { + "epoch": 0.1042, + "grad_norm": 1.8149293661117554, + "learning_rate": 1.9998925328992175e-05, + "loss": 0.3074, + "step": 5210 + }, + { + "epoch": 0.10424, + "grad_norm": 1.2354202270507812, + "learning_rate": 1.999890476196783e-05, + "loss": 0.2565, + "step": 5212 + }, + { + "epoch": 0.10428, + "grad_norm": 1.787850022315979, + "learning_rate": 1.9998884000009696e-05, + "loss": 0.2925, + "step": 5214 + }, + { + "epoch": 0.10432, + "grad_norm": 0.5843737125396729, + "learning_rate": 1.9998863043118163e-05, + "loss": 0.0898, + "step": 5216 + }, + { + "epoch": 0.10436, + "grad_norm": 1.3289365768432617, + "learning_rate": 1.9998841891293656e-05, + "loss": 0.2792, + "step": 5218 + }, + { + "epoch": 0.1044, + "grad_norm": 0.6851567029953003, + "learning_rate": 1.999882054453657e-05, + "loss": 0.2289, + "step": 5220 + }, + { + "epoch": 0.10444, + "grad_norm": 1.3068211078643799, + "learning_rate": 1.9998799002847335e-05, + "loss": 0.1857, + "step": 5222 + }, + { + "epoch": 0.10448, + "grad_norm": 0.6383882164955139, + "learning_rate": 1.999877726622636e-05, + "loss": 0.1979, + "step": 5224 + }, + { + "epoch": 0.10452, + "grad_norm": 1.0502411127090454, + "learning_rate": 1.999875533467408e-05, + "loss": 0.16, + "step": 5226 + }, + { + "epoch": 0.10456, + "grad_norm": 1.0924549102783203, + "learning_rate": 1.9998733208190916e-05, + "loss": 0.2064, + "step": 5228 + }, + { + "epoch": 0.1046, + "grad_norm": 0.805095374584198, + "learning_rate": 1.9998710886777298e-05, + "loss": 0.1856, + "step": 5230 + }, + { + "epoch": 0.10464, + "grad_norm": 0.8646188974380493, + "learning_rate": 1.9998688370433667e-05, + "loss": 0.1552, + "step": 5232 + }, + { + "epoch": 0.10468, + "grad_norm": 2.3616256713867188, + "learning_rate": 1.9998665659160453e-05, + "loss": 0.3227, + "step": 5234 + }, + { + "epoch": 0.10472, + "grad_norm": 2.0255320072174072, + "learning_rate": 1.9998642752958107e-05, + "loss": 0.3943, + "step": 5236 + }, + { + "epoch": 0.10476, + "grad_norm": 1.4377954006195068, + "learning_rate": 1.9998619651827077e-05, + "loss": 0.2328, + "step": 5238 + }, + { + "epoch": 0.1048, + "grad_norm": 1.966110110282898, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.2924, + "step": 5240 + }, + { + "epoch": 0.10484, + "grad_norm": 1.209402322769165, + "learning_rate": 1.9998572864780753e-05, + "loss": 0.2026, + "step": 5242 + }, + { + "epoch": 0.10488, + "grad_norm": 0.7478739619255066, + "learning_rate": 1.999854917886637e-05, + "loss": 0.1405, + "step": 5244 + }, + { + "epoch": 0.10492, + "grad_norm": 1.2143460512161255, + "learning_rate": 1.9998525298025127e-05, + "loss": 0.2028, + "step": 5246 + }, + { + "epoch": 0.10496, + "grad_norm": 1.7544403076171875, + "learning_rate": 1.9998501222257486e-05, + "loss": 0.3063, + "step": 5248 + }, + { + "epoch": 0.105, + "grad_norm": 1.1417198181152344, + "learning_rate": 1.9998476951563914e-05, + "loss": 0.235, + "step": 5250 + }, + { + "epoch": 0.10504, + "grad_norm": 1.6920967102050781, + "learning_rate": 1.999845248594489e-05, + "loss": 0.243, + "step": 5252 + }, + { + "epoch": 0.10508, + "grad_norm": 1.5047401189804077, + "learning_rate": 1.999842782540088e-05, + "loss": 0.2065, + "step": 5254 + }, + { + "epoch": 0.10512, + "grad_norm": 0.9440564513206482, + "learning_rate": 1.9998402969932376e-05, + "loss": 0.1859, + "step": 5256 + }, + { + "epoch": 0.10516, + "grad_norm": 1.2985392808914185, + "learning_rate": 1.999837791953986e-05, + "loss": 0.2347, + "step": 5258 + }, + { + "epoch": 0.1052, + "grad_norm": 1.1013519763946533, + "learning_rate": 1.9998352674223816e-05, + "loss": 0.1918, + "step": 5260 + }, + { + "epoch": 0.10524, + "grad_norm": 1.4347789287567139, + "learning_rate": 1.9998327233984742e-05, + "loss": 0.1857, + "step": 5262 + }, + { + "epoch": 0.10528, + "grad_norm": 1.0781751871109009, + "learning_rate": 1.999830159882313e-05, + "loss": 0.2102, + "step": 5264 + }, + { + "epoch": 0.10532, + "grad_norm": 0.6817001104354858, + "learning_rate": 1.9998275768739482e-05, + "loss": 0.0849, + "step": 5266 + }, + { + "epoch": 0.10536, + "grad_norm": 1.0326424837112427, + "learning_rate": 1.99982497437343e-05, + "loss": 0.1528, + "step": 5268 + }, + { + "epoch": 0.1054, + "grad_norm": 0.632640540599823, + "learning_rate": 1.9998223523808092e-05, + "loss": 0.2009, + "step": 5270 + }, + { + "epoch": 0.10544, + "grad_norm": 1.1073741912841797, + "learning_rate": 1.999819710896137e-05, + "loss": 0.1598, + "step": 5272 + }, + { + "epoch": 0.10548, + "grad_norm": 2.238525390625, + "learning_rate": 1.9998170499194645e-05, + "loss": 0.3205, + "step": 5274 + }, + { + "epoch": 0.10552, + "grad_norm": 1.4508211612701416, + "learning_rate": 1.9998143694508438e-05, + "loss": 0.2961, + "step": 5276 + }, + { + "epoch": 0.10556, + "grad_norm": 0.9644622802734375, + "learning_rate": 1.9998116694903277e-05, + "loss": 0.137, + "step": 5278 + }, + { + "epoch": 0.1056, + "grad_norm": 0.8969828486442566, + "learning_rate": 1.999808950037968e-05, + "loss": 0.3908, + "step": 5280 + }, + { + "epoch": 0.10564, + "grad_norm": 2.1252481937408447, + "learning_rate": 1.9998062110938182e-05, + "loss": 0.2984, + "step": 5282 + }, + { + "epoch": 0.10568, + "grad_norm": 1.632121205329895, + "learning_rate": 1.9998034526579313e-05, + "loss": 0.2233, + "step": 5284 + }, + { + "epoch": 0.10572, + "grad_norm": 1.9454429149627686, + "learning_rate": 1.9998006747303617e-05, + "loss": 0.2695, + "step": 5286 + }, + { + "epoch": 0.10576, + "grad_norm": 1.0886790752410889, + "learning_rate": 1.999797877311163e-05, + "loss": 0.1381, + "step": 5288 + }, + { + "epoch": 0.1058, + "grad_norm": 1.542137861251831, + "learning_rate": 1.99979506040039e-05, + "loss": 0.2663, + "step": 5290 + }, + { + "epoch": 0.10584, + "grad_norm": 1.7316040992736816, + "learning_rate": 1.999792223998098e-05, + "loss": 0.3402, + "step": 5292 + }, + { + "epoch": 0.10588, + "grad_norm": 1.713857650756836, + "learning_rate": 1.9997893681043412e-05, + "loss": 0.1853, + "step": 5294 + }, + { + "epoch": 0.10592, + "grad_norm": 1.2048274278640747, + "learning_rate": 1.9997864927191763e-05, + "loss": 0.1937, + "step": 5296 + }, + { + "epoch": 0.10596, + "grad_norm": 1.7658684253692627, + "learning_rate": 1.999783597842659e-05, + "loss": 0.1647, + "step": 5298 + }, + { + "epoch": 0.106, + "grad_norm": 1.8691672086715698, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.2152, + "step": 5300 + }, + { + "epoch": 0.10604, + "grad_norm": 2.302741765975952, + "learning_rate": 1.9997777496157932e-05, + "loss": 0.3729, + "step": 5302 + }, + { + "epoch": 0.10608, + "grad_norm": 1.0218695402145386, + "learning_rate": 1.999774796265559e-05, + "loss": 0.1782, + "step": 5304 + }, + { + "epoch": 0.10612, + "grad_norm": 0.5944719910621643, + "learning_rate": 1.9997718234242e-05, + "loss": 0.1693, + "step": 5306 + }, + { + "epoch": 0.10616, + "grad_norm": 1.3406578302383423, + "learning_rate": 1.9997688310917745e-05, + "loss": 0.1463, + "step": 5308 + }, + { + "epoch": 0.1062, + "grad_norm": 1.830428957939148, + "learning_rate": 1.9997658192683412e-05, + "loss": 0.2348, + "step": 5310 + }, + { + "epoch": 0.10624, + "grad_norm": 1.7226272821426392, + "learning_rate": 1.999762787953959e-05, + "loss": 0.1551, + "step": 5312 + }, + { + "epoch": 0.10628, + "grad_norm": 2.514902353286743, + "learning_rate": 1.999759737148686e-05, + "loss": 0.313, + "step": 5314 + }, + { + "epoch": 0.10632, + "grad_norm": 0.864117443561554, + "learning_rate": 1.9997566668525822e-05, + "loss": 0.263, + "step": 5316 + }, + { + "epoch": 0.10636, + "grad_norm": 0.8862869143486023, + "learning_rate": 1.9997535770657076e-05, + "loss": 0.099, + "step": 5318 + }, + { + "epoch": 0.1064, + "grad_norm": 1.5088917016983032, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.2024, + "step": 5320 + }, + { + "epoch": 0.10644, + "grad_norm": 0.9885336756706238, + "learning_rate": 1.999747339019887e-05, + "loss": 0.2426, + "step": 5322 + }, + { + "epoch": 0.10648, + "grad_norm": 2.237072467803955, + "learning_rate": 1.9997441907610624e-05, + "loss": 0.4697, + "step": 5324 + }, + { + "epoch": 0.10652, + "grad_norm": 1.3794991970062256, + "learning_rate": 1.9997410230117103e-05, + "loss": 0.2146, + "step": 5326 + }, + { + "epoch": 0.10656, + "grad_norm": 0.5655547976493835, + "learning_rate": 1.9997378357718923e-05, + "loss": 0.126, + "step": 5328 + }, + { + "epoch": 0.1066, + "grad_norm": 0.49699458479881287, + "learning_rate": 1.9997346290416703e-05, + "loss": 0.0679, + "step": 5330 + }, + { + "epoch": 0.10664, + "grad_norm": 2.224473714828491, + "learning_rate": 1.999731402821107e-05, + "loss": 0.3109, + "step": 5332 + }, + { + "epoch": 0.10668, + "grad_norm": 1.9969422817230225, + "learning_rate": 1.9997281571102655e-05, + "loss": 0.346, + "step": 5334 + }, + { + "epoch": 0.10672, + "grad_norm": 2.031707763671875, + "learning_rate": 1.9997248919092087e-05, + "loss": 0.2652, + "step": 5336 + }, + { + "epoch": 0.10676, + "grad_norm": 1.1752773523330688, + "learning_rate": 1.9997216072180007e-05, + "loss": 0.2029, + "step": 5338 + }, + { + "epoch": 0.1068, + "grad_norm": 0.8253549933433533, + "learning_rate": 1.999718303036705e-05, + "loss": 0.1448, + "step": 5340 + }, + { + "epoch": 0.10684, + "grad_norm": 0.7032029628753662, + "learning_rate": 1.9997149793653862e-05, + "loss": 0.1493, + "step": 5342 + }, + { + "epoch": 0.10688, + "grad_norm": 0.8256634473800659, + "learning_rate": 1.9997116362041095e-05, + "loss": 0.1174, + "step": 5344 + }, + { + "epoch": 0.10692, + "grad_norm": 0.9089924693107605, + "learning_rate": 1.9997082735529397e-05, + "loss": 0.1991, + "step": 5346 + }, + { + "epoch": 0.10696, + "grad_norm": 2.3540048599243164, + "learning_rate": 1.999704891411942e-05, + "loss": 0.4113, + "step": 5348 + }, + { + "epoch": 0.107, + "grad_norm": 2.2118937969207764, + "learning_rate": 1.9997014897811834e-05, + "loss": 0.3895, + "step": 5350 + }, + { + "epoch": 0.10704, + "grad_norm": 1.2331522703170776, + "learning_rate": 1.9996980686607293e-05, + "loss": 0.1591, + "step": 5352 + }, + { + "epoch": 0.10708, + "grad_norm": 1.5208487510681152, + "learning_rate": 1.9996946280506467e-05, + "loss": 0.247, + "step": 5354 + }, + { + "epoch": 0.10712, + "grad_norm": 1.0734020471572876, + "learning_rate": 1.9996911679510025e-05, + "loss": 0.1516, + "step": 5356 + }, + { + "epoch": 0.10716, + "grad_norm": 0.668266236782074, + "learning_rate": 1.9996876883618644e-05, + "loss": 0.1271, + "step": 5358 + }, + { + "epoch": 0.1072, + "grad_norm": 1.894994854927063, + "learning_rate": 1.9996841892833e-05, + "loss": 0.3003, + "step": 5360 + }, + { + "epoch": 0.10724, + "grad_norm": 1.0732451677322388, + "learning_rate": 1.999680670715378e-05, + "loss": 0.2778, + "step": 5362 + }, + { + "epoch": 0.10728, + "grad_norm": 0.4671362340450287, + "learning_rate": 1.9996771326581666e-05, + "loss": 0.0834, + "step": 5364 + }, + { + "epoch": 0.10732, + "grad_norm": 1.17079758644104, + "learning_rate": 1.9996735751117346e-05, + "loss": 0.1447, + "step": 5366 + }, + { + "epoch": 0.10736, + "grad_norm": 1.85612952709198, + "learning_rate": 1.9996699980761515e-05, + "loss": 0.1928, + "step": 5368 + }, + { + "epoch": 0.1074, + "grad_norm": 0.5252889394760132, + "learning_rate": 1.999666401551487e-05, + "loss": 0.1062, + "step": 5370 + }, + { + "epoch": 0.10744, + "grad_norm": 2.3137006759643555, + "learning_rate": 1.999662785537812e-05, + "loss": 0.3067, + "step": 5372 + }, + { + "epoch": 0.10748, + "grad_norm": 0.6695988774299622, + "learning_rate": 1.999659150035196e-05, + "loss": 0.1534, + "step": 5374 + }, + { + "epoch": 0.10752, + "grad_norm": 0.9047555327415466, + "learning_rate": 1.9996554950437105e-05, + "loss": 0.0923, + "step": 5376 + }, + { + "epoch": 0.10756, + "grad_norm": 0.8810882568359375, + "learning_rate": 1.999651820563426e-05, + "loss": 0.0796, + "step": 5378 + }, + { + "epoch": 0.1076, + "grad_norm": 1.2786803245544434, + "learning_rate": 1.9996481265944146e-05, + "loss": 0.1235, + "step": 5380 + }, + { + "epoch": 0.10764, + "grad_norm": 0.7430020570755005, + "learning_rate": 1.9996444131367486e-05, + "loss": 0.0474, + "step": 5382 + }, + { + "epoch": 0.10768, + "grad_norm": 1.0930297374725342, + "learning_rate": 1.9996406801905e-05, + "loss": 0.1538, + "step": 5384 + }, + { + "epoch": 0.10772, + "grad_norm": 4.251584053039551, + "learning_rate": 1.9996369277557415e-05, + "loss": 0.3365, + "step": 5386 + }, + { + "epoch": 0.10776, + "grad_norm": 5.222840309143066, + "learning_rate": 1.999633155832547e-05, + "loss": 0.7524, + "step": 5388 + }, + { + "epoch": 0.1078, + "grad_norm": 1.1496973037719727, + "learning_rate": 1.9996293644209886e-05, + "loss": 0.1444, + "step": 5390 + }, + { + "epoch": 0.10784, + "grad_norm": 0.595468282699585, + "learning_rate": 1.999625553521142e-05, + "loss": 0.2973, + "step": 5392 + }, + { + "epoch": 0.10788, + "grad_norm": 1.3953256607055664, + "learning_rate": 1.99962172313308e-05, + "loss": 0.1712, + "step": 5394 + }, + { + "epoch": 0.10792, + "grad_norm": 1.6137701272964478, + "learning_rate": 1.9996178732568784e-05, + "loss": 0.279, + "step": 5396 + }, + { + "epoch": 0.10796, + "grad_norm": 1.3155841827392578, + "learning_rate": 1.999614003892611e-05, + "loss": 0.1674, + "step": 5398 + }, + { + "epoch": 0.108, + "grad_norm": 2.47629976272583, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.3282, + "step": 5400 + }, + { + "epoch": 0.10804, + "grad_norm": 0.8903825283050537, + "learning_rate": 1.999606206700184e-05, + "loss": 0.1112, + "step": 5402 + }, + { + "epoch": 0.10808, + "grad_norm": 1.2823234796524048, + "learning_rate": 1.999602278872176e-05, + "loss": 0.3144, + "step": 5404 + }, + { + "epoch": 0.10812, + "grad_norm": 0.4313903748989105, + "learning_rate": 1.9995983315564066e-05, + "loss": 0.0387, + "step": 5406 + }, + { + "epoch": 0.10816, + "grad_norm": 1.1061269044876099, + "learning_rate": 1.9995943647529533e-05, + "loss": 0.1598, + "step": 5408 + }, + { + "epoch": 0.1082, + "grad_norm": 0.9810930490493774, + "learning_rate": 1.9995903784618936e-05, + "loss": 0.427, + "step": 5410 + }, + { + "epoch": 0.10824, + "grad_norm": 1.3819725513458252, + "learning_rate": 1.9995863726833044e-05, + "loss": 0.2545, + "step": 5412 + }, + { + "epoch": 0.10828, + "grad_norm": 0.8561986088752747, + "learning_rate": 1.9995823474172644e-05, + "loss": 0.1062, + "step": 5414 + }, + { + "epoch": 0.10832, + "grad_norm": 1.9266183376312256, + "learning_rate": 1.999578302663852e-05, + "loss": 0.3531, + "step": 5416 + }, + { + "epoch": 0.10836, + "grad_norm": 0.521274983882904, + "learning_rate": 1.999574238423146e-05, + "loss": 0.1063, + "step": 5418 + }, + { + "epoch": 0.1084, + "grad_norm": 0.5640667080879211, + "learning_rate": 1.9995701546952252e-05, + "loss": 0.1309, + "step": 5420 + }, + { + "epoch": 0.10844, + "grad_norm": 0.5944443345069885, + "learning_rate": 1.99956605148017e-05, + "loss": 0.1651, + "step": 5422 + }, + { + "epoch": 0.10848, + "grad_norm": 2.6566379070281982, + "learning_rate": 1.99956192877806e-05, + "loss": 0.371, + "step": 5424 + }, + { + "epoch": 0.10852, + "grad_norm": 1.866576075553894, + "learning_rate": 1.9995577865889753e-05, + "loss": 0.3015, + "step": 5426 + }, + { + "epoch": 0.10856, + "grad_norm": 1.3656235933303833, + "learning_rate": 1.999553624912997e-05, + "loss": 0.2823, + "step": 5428 + }, + { + "epoch": 0.1086, + "grad_norm": 1.3736237287521362, + "learning_rate": 1.9995494437502064e-05, + "loss": 0.2146, + "step": 5430 + }, + { + "epoch": 0.10864, + "grad_norm": 1.634560465812683, + "learning_rate": 1.9995452431006844e-05, + "loss": 0.2279, + "step": 5432 + }, + { + "epoch": 0.10868, + "grad_norm": 1.8393651247024536, + "learning_rate": 1.999541022964514e-05, + "loss": 0.3664, + "step": 5434 + }, + { + "epoch": 0.10872, + "grad_norm": 0.7332890629768372, + "learning_rate": 1.999536783341776e-05, + "loss": 0.1349, + "step": 5436 + }, + { + "epoch": 0.10876, + "grad_norm": 0.7602499127388, + "learning_rate": 1.999532524232554e-05, + "loss": 0.1053, + "step": 5438 + }, + { + "epoch": 0.1088, + "grad_norm": 2.7583463191986084, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.3054, + "step": 5440 + }, + { + "epoch": 0.10884, + "grad_norm": 2.7605183124542236, + "learning_rate": 1.9995239475549905e-05, + "loss": 0.5695, + "step": 5442 + }, + { + "epoch": 0.10888, + "grad_norm": 1.1840016841888428, + "learning_rate": 1.999519629986816e-05, + "loss": 0.2922, + "step": 5444 + }, + { + "epoch": 0.10892, + "grad_norm": 2.2910335063934326, + "learning_rate": 1.9995152929324915e-05, + "loss": 0.3159, + "step": 5446 + }, + { + "epoch": 0.10896, + "grad_norm": 1.3957444429397583, + "learning_rate": 1.9995109363921017e-05, + "loss": 0.1798, + "step": 5448 + }, + { + "epoch": 0.109, + "grad_norm": 1.7275999784469604, + "learning_rate": 1.9995065603657317e-05, + "loss": 0.3669, + "step": 5450 + }, + { + "epoch": 0.10904, + "grad_norm": 1.090792179107666, + "learning_rate": 1.9995021648534667e-05, + "loss": 0.1534, + "step": 5452 + }, + { + "epoch": 0.10908, + "grad_norm": 1.5701937675476074, + "learning_rate": 1.9994977498553924e-05, + "loss": 0.3263, + "step": 5454 + }, + { + "epoch": 0.10912, + "grad_norm": 1.4332448244094849, + "learning_rate": 1.999493315371595e-05, + "loss": 0.3017, + "step": 5456 + }, + { + "epoch": 0.10916, + "grad_norm": 1.0444250106811523, + "learning_rate": 1.9994888614021608e-05, + "loss": 0.1677, + "step": 5458 + }, + { + "epoch": 0.1092, + "grad_norm": 1.2528040409088135, + "learning_rate": 1.999484387947177e-05, + "loss": 0.2131, + "step": 5460 + }, + { + "epoch": 0.10924, + "grad_norm": 2.006331443786621, + "learning_rate": 1.9994798950067297e-05, + "loss": 0.4438, + "step": 5462 + }, + { + "epoch": 0.10928, + "grad_norm": 1.6738368272781372, + "learning_rate": 1.999475382580908e-05, + "loss": 0.4243, + "step": 5464 + }, + { + "epoch": 0.10932, + "grad_norm": 1.257351040840149, + "learning_rate": 1.9994708506697988e-05, + "loss": 0.1924, + "step": 5466 + }, + { + "epoch": 0.10936, + "grad_norm": 1.1239997148513794, + "learning_rate": 1.999466299273491e-05, + "loss": 0.255, + "step": 5468 + }, + { + "epoch": 0.1094, + "grad_norm": 1.2758309841156006, + "learning_rate": 1.999461728392073e-05, + "loss": 0.2439, + "step": 5470 + }, + { + "epoch": 0.10944, + "grad_norm": 1.302872896194458, + "learning_rate": 1.9994571380256343e-05, + "loss": 0.267, + "step": 5472 + }, + { + "epoch": 0.10948, + "grad_norm": 1.170225739479065, + "learning_rate": 1.9994525281742638e-05, + "loss": 0.2437, + "step": 5474 + }, + { + "epoch": 0.10952, + "grad_norm": 0.9885281324386597, + "learning_rate": 1.9994478988380516e-05, + "loss": 0.1942, + "step": 5476 + }, + { + "epoch": 0.10956, + "grad_norm": 1.2825466394424438, + "learning_rate": 1.9994432500170882e-05, + "loss": 0.2186, + "step": 5478 + }, + { + "epoch": 0.1096, + "grad_norm": 1.1757358312606812, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.2135, + "step": 5480 + }, + { + "epoch": 0.10964, + "grad_norm": 0.891042172908783, + "learning_rate": 1.9994338939212707e-05, + "loss": 0.1524, + "step": 5482 + }, + { + "epoch": 0.10968, + "grad_norm": 0.9060896635055542, + "learning_rate": 1.999429186646599e-05, + "loss": 0.2434, + "step": 5484 + }, + { + "epoch": 0.10972, + "grad_norm": 0.7285366058349609, + "learning_rate": 1.99942445988754e-05, + "loss": 0.2358, + "step": 5486 + }, + { + "epoch": 0.10976, + "grad_norm": 0.9437418580055237, + "learning_rate": 1.9994197136441874e-05, + "loss": 0.1454, + "step": 5488 + }, + { + "epoch": 0.1098, + "grad_norm": 0.831741988658905, + "learning_rate": 1.9994149479166324e-05, + "loss": 0.2286, + "step": 5490 + }, + { + "epoch": 0.10984, + "grad_norm": 1.7897330522537231, + "learning_rate": 1.9994101627049687e-05, + "loss": 0.2493, + "step": 5492 + }, + { + "epoch": 0.10988, + "grad_norm": 1.034803867340088, + "learning_rate": 1.9994053580092893e-05, + "loss": 0.2932, + "step": 5494 + }, + { + "epoch": 0.10992, + "grad_norm": 1.6637086868286133, + "learning_rate": 1.999400533829688e-05, + "loss": 0.3399, + "step": 5496 + }, + { + "epoch": 0.10996, + "grad_norm": 1.2173235416412354, + "learning_rate": 1.9993956901662586e-05, + "loss": 0.2329, + "step": 5498 + }, + { + "epoch": 0.11, + "grad_norm": 1.0949251651763916, + "learning_rate": 1.999390827019096e-05, + "loss": 0.1848, + "step": 5500 + }, + { + "epoch": 0.11004, + "grad_norm": 1.5266029834747314, + "learning_rate": 1.9993859443882943e-05, + "loss": 0.2796, + "step": 5502 + }, + { + "epoch": 0.11008, + "grad_norm": 1.080815076828003, + "learning_rate": 1.9993810422739496e-05, + "loss": 0.2239, + "step": 5504 + }, + { + "epoch": 0.11012, + "grad_norm": 1.052612066268921, + "learning_rate": 1.9993761206761567e-05, + "loss": 0.2509, + "step": 5506 + }, + { + "epoch": 0.11016, + "grad_norm": 0.745017409324646, + "learning_rate": 1.9993711795950116e-05, + "loss": 0.1176, + "step": 5508 + }, + { + "epoch": 0.1102, + "grad_norm": 2.1945137977600098, + "learning_rate": 1.999366219030611e-05, + "loss": 0.2846, + "step": 5510 + }, + { + "epoch": 0.11024, + "grad_norm": 0.7774375677108765, + "learning_rate": 1.9993612389830516e-05, + "loss": 0.2572, + "step": 5512 + }, + { + "epoch": 0.11028, + "grad_norm": 1.7441879510879517, + "learning_rate": 1.99935623945243e-05, + "loss": 0.3822, + "step": 5514 + }, + { + "epoch": 0.11032, + "grad_norm": 0.7028264999389648, + "learning_rate": 1.9993512204388444e-05, + "loss": 0.1407, + "step": 5516 + }, + { + "epoch": 0.11036, + "grad_norm": 1.816733717918396, + "learning_rate": 1.999346181942392e-05, + "loss": 0.2633, + "step": 5518 + }, + { + "epoch": 0.1104, + "grad_norm": 1.122710108757019, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.1765, + "step": 5520 + }, + { + "epoch": 0.11044, + "grad_norm": 1.447576880455017, + "learning_rate": 1.999336046501281e-05, + "loss": 0.3008, + "step": 5522 + }, + { + "epoch": 0.11048, + "grad_norm": 1.4500590562820435, + "learning_rate": 1.99933094955682e-05, + "loss": 0.2105, + "step": 5524 + }, + { + "epoch": 0.11052, + "grad_norm": 1.456699252128601, + "learning_rate": 1.9993258331298874e-05, + "loss": 0.2771, + "step": 5526 + }, + { + "epoch": 0.11056, + "grad_norm": 1.054067611694336, + "learning_rate": 1.9993206972205836e-05, + "loss": 0.1765, + "step": 5528 + }, + { + "epoch": 0.1106, + "grad_norm": 1.010582447052002, + "learning_rate": 1.999315541829008e-05, + "loss": 0.1886, + "step": 5530 + }, + { + "epoch": 0.11064, + "grad_norm": 2.060164213180542, + "learning_rate": 1.9993103669552618e-05, + "loss": 0.2354, + "step": 5532 + }, + { + "epoch": 0.11068, + "grad_norm": 0.9159719347953796, + "learning_rate": 1.999305172599445e-05, + "loss": 0.106, + "step": 5534 + }, + { + "epoch": 0.11072, + "grad_norm": 0.675203800201416, + "learning_rate": 1.99929995876166e-05, + "loss": 0.0687, + "step": 5536 + }, + { + "epoch": 0.11076, + "grad_norm": 0.4154834747314453, + "learning_rate": 1.999294725442007e-05, + "loss": 0.3388, + "step": 5538 + }, + { + "epoch": 0.1108, + "grad_norm": 0.6304424405097961, + "learning_rate": 1.9992894726405894e-05, + "loss": 0.282, + "step": 5540 + }, + { + "epoch": 0.11084, + "grad_norm": 0.7167519330978394, + "learning_rate": 1.999284200357509e-05, + "loss": 0.2736, + "step": 5542 + }, + { + "epoch": 0.11088, + "grad_norm": 0.7339133620262146, + "learning_rate": 1.9992789085928686e-05, + "loss": 0.3271, + "step": 5544 + }, + { + "epoch": 0.11092, + "grad_norm": 1.6901332139968872, + "learning_rate": 1.9992735973467714e-05, + "loss": 0.1996, + "step": 5546 + }, + { + "epoch": 0.11096, + "grad_norm": 0.9515992999076843, + "learning_rate": 1.9992682666193212e-05, + "loss": 0.1817, + "step": 5548 + }, + { + "epoch": 0.111, + "grad_norm": 1.6903843879699707, + "learning_rate": 1.999262916410621e-05, + "loss": 0.2891, + "step": 5550 + }, + { + "epoch": 0.11104, + "grad_norm": 1.4408758878707886, + "learning_rate": 1.9992575467207765e-05, + "loss": 0.186, + "step": 5552 + }, + { + "epoch": 0.11108, + "grad_norm": 0.9207583069801331, + "learning_rate": 1.9992521575498914e-05, + "loss": 0.2036, + "step": 5554 + }, + { + "epoch": 0.11112, + "grad_norm": 1.122739315032959, + "learning_rate": 1.999246748898071e-05, + "loss": 0.147, + "step": 5556 + }, + { + "epoch": 0.11116, + "grad_norm": 1.7631171941757202, + "learning_rate": 1.999241320765421e-05, + "loss": 0.2093, + "step": 5558 + }, + { + "epoch": 0.1112, + "grad_norm": 1.5926352739334106, + "learning_rate": 1.999235873152047e-05, + "loss": 0.2434, + "step": 5560 + }, + { + "epoch": 0.11124, + "grad_norm": 1.777729868888855, + "learning_rate": 1.999230406058055e-05, + "loss": 0.2902, + "step": 5562 + }, + { + "epoch": 0.11128, + "grad_norm": 0.9679836630821228, + "learning_rate": 1.999224919483552e-05, + "loss": 0.1374, + "step": 5564 + }, + { + "epoch": 0.11132, + "grad_norm": 1.4060176610946655, + "learning_rate": 1.9992194134286447e-05, + "loss": 0.2826, + "step": 5566 + }, + { + "epoch": 0.11136, + "grad_norm": 1.022185206413269, + "learning_rate": 1.9992138878934405e-05, + "loss": 0.1601, + "step": 5568 + }, + { + "epoch": 0.1114, + "grad_norm": 0.6369178295135498, + "learning_rate": 1.999208342878047e-05, + "loss": 0.1929, + "step": 5570 + }, + { + "epoch": 0.11144, + "grad_norm": 0.9765871167182922, + "learning_rate": 1.9992027783825724e-05, + "loss": 0.2226, + "step": 5572 + }, + { + "epoch": 0.11148, + "grad_norm": 0.8386043310165405, + "learning_rate": 1.9991971944071252e-05, + "loss": 0.1013, + "step": 5574 + }, + { + "epoch": 0.11152, + "grad_norm": 1.9029532670974731, + "learning_rate": 1.9991915909518146e-05, + "loss": 0.3808, + "step": 5576 + }, + { + "epoch": 0.11156, + "grad_norm": 1.4980477094650269, + "learning_rate": 1.9991859680167493e-05, + "loss": 0.3032, + "step": 5578 + }, + { + "epoch": 0.1116, + "grad_norm": 1.4021382331848145, + "learning_rate": 1.9991803256020393e-05, + "loss": 0.2324, + "step": 5580 + }, + { + "epoch": 0.11164, + "grad_norm": 0.7909827828407288, + "learning_rate": 1.999174663707794e-05, + "loss": 0.2574, + "step": 5582 + }, + { + "epoch": 0.11168, + "grad_norm": 1.5847548246383667, + "learning_rate": 1.9991689823341246e-05, + "loss": 0.2765, + "step": 5584 + }, + { + "epoch": 0.11172, + "grad_norm": 1.9060102701187134, + "learning_rate": 1.9991632814811415e-05, + "loss": 0.2104, + "step": 5586 + }, + { + "epoch": 0.11176, + "grad_norm": 2.201387405395508, + "learning_rate": 1.9991575611489556e-05, + "loss": 0.3207, + "step": 5588 + }, + { + "epoch": 0.1118, + "grad_norm": 0.8145676255226135, + "learning_rate": 1.9991518213376787e-05, + "loss": 0.1911, + "step": 5590 + }, + { + "epoch": 0.11184, + "grad_norm": 0.8369247317314148, + "learning_rate": 1.9991460620474226e-05, + "loss": 0.2155, + "step": 5592 + }, + { + "epoch": 0.11188, + "grad_norm": 1.8230911493301392, + "learning_rate": 1.9991402832783e-05, + "loss": 0.2824, + "step": 5594 + }, + { + "epoch": 0.11192, + "grad_norm": 1.1573693752288818, + "learning_rate": 1.999134485030423e-05, + "loss": 0.1553, + "step": 5596 + }, + { + "epoch": 0.11196, + "grad_norm": 1.5275654792785645, + "learning_rate": 1.9991286673039045e-05, + "loss": 0.1992, + "step": 5598 + }, + { + "epoch": 0.112, + "grad_norm": 1.1640148162841797, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.2471, + "step": 5600 + }, + { + "epoch": 0.11204, + "grad_norm": 1.035406231880188, + "learning_rate": 1.9991169734153987e-05, + "loss": 0.1518, + "step": 5602 + }, + { + "epoch": 0.11208, + "grad_norm": 1.2051182985305786, + "learning_rate": 1.9991110972536386e-05, + "loss": 0.2353, + "step": 5604 + }, + { + "epoch": 0.11212, + "grad_norm": 1.0182743072509766, + "learning_rate": 1.9991052016136938e-05, + "loss": 0.1677, + "step": 5606 + }, + { + "epoch": 0.11216, + "grad_norm": 1.6405260562896729, + "learning_rate": 1.9990992864956785e-05, + "loss": 0.1921, + "step": 5608 + }, + { + "epoch": 0.1122, + "grad_norm": 2.0281994342803955, + "learning_rate": 1.9990933518997086e-05, + "loss": 0.2901, + "step": 5610 + }, + { + "epoch": 0.11224, + "grad_norm": 2.125577688217163, + "learning_rate": 1.999087397825899e-05, + "loss": 0.3012, + "step": 5612 + }, + { + "epoch": 0.11228, + "grad_norm": 1.5213086605072021, + "learning_rate": 1.9990814242743664e-05, + "loss": 0.2964, + "step": 5614 + }, + { + "epoch": 0.11232, + "grad_norm": 2.0472991466522217, + "learning_rate": 1.9990754312452266e-05, + "loss": 0.2263, + "step": 5616 + }, + { + "epoch": 0.11236, + "grad_norm": 0.5601586103439331, + "learning_rate": 1.9990694187385974e-05, + "loss": 0.1764, + "step": 5618 + }, + { + "epoch": 0.1124, + "grad_norm": 1.3310520648956299, + "learning_rate": 1.9990633867545956e-05, + "loss": 0.2028, + "step": 5620 + }, + { + "epoch": 0.11244, + "grad_norm": 1.5845937728881836, + "learning_rate": 1.999057335293339e-05, + "loss": 0.192, + "step": 5622 + }, + { + "epoch": 0.11248, + "grad_norm": 0.9283666610717773, + "learning_rate": 1.9990512643549444e-05, + "loss": 0.1126, + "step": 5624 + }, + { + "epoch": 0.11252, + "grad_norm": 1.359320878982544, + "learning_rate": 1.9990451739395314e-05, + "loss": 0.2029, + "step": 5626 + }, + { + "epoch": 0.11256, + "grad_norm": 2.221345901489258, + "learning_rate": 1.9990390640472186e-05, + "loss": 0.2426, + "step": 5628 + }, + { + "epoch": 0.1126, + "grad_norm": 1.3591972589492798, + "learning_rate": 1.999032934678125e-05, + "loss": 0.2026, + "step": 5630 + }, + { + "epoch": 0.11264, + "grad_norm": 0.6917791366577148, + "learning_rate": 1.9990267858323697e-05, + "loss": 0.1797, + "step": 5632 + }, + { + "epoch": 0.11268, + "grad_norm": 2.241755962371826, + "learning_rate": 1.999020617510073e-05, + "loss": 0.2922, + "step": 5634 + }, + { + "epoch": 0.11272, + "grad_norm": 2.3231401443481445, + "learning_rate": 1.999014429711355e-05, + "loss": 0.2571, + "step": 5636 + }, + { + "epoch": 0.11276, + "grad_norm": 0.6954199075698853, + "learning_rate": 1.9990082224363365e-05, + "loss": 0.2519, + "step": 5638 + }, + { + "epoch": 0.1128, + "grad_norm": 1.430155873298645, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.1689, + "step": 5640 + }, + { + "epoch": 0.11284, + "grad_norm": 1.2678769826889038, + "learning_rate": 1.9989957494578817e-05, + "loss": 0.2468, + "step": 5642 + }, + { + "epoch": 0.11288, + "grad_norm": 1.141498327255249, + "learning_rate": 1.998989483754689e-05, + "loss": 0.2639, + "step": 5644 + }, + { + "epoch": 0.11292, + "grad_norm": 1.8679320812225342, + "learning_rate": 1.998983198575682e-05, + "loss": 0.2428, + "step": 5646 + }, + { + "epoch": 0.11296, + "grad_norm": 1.6926385164260864, + "learning_rate": 1.9989768939209826e-05, + "loss": 0.2221, + "step": 5648 + }, + { + "epoch": 0.113, + "grad_norm": 1.4439910650253296, + "learning_rate": 1.998970569790715e-05, + "loss": 0.3891, + "step": 5650 + }, + { + "epoch": 0.11304, + "grad_norm": 1.5907070636749268, + "learning_rate": 1.9989642261850014e-05, + "loss": 0.2663, + "step": 5652 + }, + { + "epoch": 0.11308, + "grad_norm": 1.285902500152588, + "learning_rate": 1.998957863103966e-05, + "loss": 0.2039, + "step": 5654 + }, + { + "epoch": 0.11312, + "grad_norm": 1.2097816467285156, + "learning_rate": 1.9989514805477332e-05, + "loss": 0.171, + "step": 5656 + }, + { + "epoch": 0.11316, + "grad_norm": 0.9843453168869019, + "learning_rate": 1.9989450785164264e-05, + "loss": 0.2843, + "step": 5658 + }, + { + "epoch": 0.1132, + "grad_norm": 1.3617000579833984, + "learning_rate": 1.9989386570101716e-05, + "loss": 0.2772, + "step": 5660 + }, + { + "epoch": 0.11324, + "grad_norm": 1.9762223958969116, + "learning_rate": 1.9989322160290928e-05, + "loss": 0.3732, + "step": 5662 + }, + { + "epoch": 0.11328, + "grad_norm": 0.75747150182724, + "learning_rate": 1.9989257555733164e-05, + "loss": 0.1115, + "step": 5664 + }, + { + "epoch": 0.11332, + "grad_norm": 1.0243782997131348, + "learning_rate": 1.998919275642968e-05, + "loss": 0.2696, + "step": 5666 + }, + { + "epoch": 0.11336, + "grad_norm": 1.1652289628982544, + "learning_rate": 1.9989127762381747e-05, + "loss": 0.1618, + "step": 5668 + }, + { + "epoch": 0.1134, + "grad_norm": 1.5984535217285156, + "learning_rate": 1.9989062573590618e-05, + "loss": 0.2545, + "step": 5670 + }, + { + "epoch": 0.11344, + "grad_norm": 1.3138025999069214, + "learning_rate": 1.9988997190057574e-05, + "loss": 0.2823, + "step": 5672 + }, + { + "epoch": 0.11348, + "grad_norm": 1.1239969730377197, + "learning_rate": 1.9988931611783887e-05, + "loss": 0.1529, + "step": 5674 + }, + { + "epoch": 0.11352, + "grad_norm": 1.248136043548584, + "learning_rate": 1.9988865838770834e-05, + "loss": 0.2331, + "step": 5676 + }, + { + "epoch": 0.11356, + "grad_norm": 1.2515136003494263, + "learning_rate": 1.9988799871019702e-05, + "loss": 0.1588, + "step": 5678 + }, + { + "epoch": 0.1136, + "grad_norm": 1.5982450246810913, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.2146, + "step": 5680 + }, + { + "epoch": 0.11364, + "grad_norm": 0.9889798760414124, + "learning_rate": 1.9988667351308338e-05, + "loss": 0.1686, + "step": 5682 + }, + { + "epoch": 0.11368, + "grad_norm": 0.7875699996948242, + "learning_rate": 1.9988600799350685e-05, + "loss": 0.1587, + "step": 5684 + }, + { + "epoch": 0.11372, + "grad_norm": 1.357286810874939, + "learning_rate": 1.9988534052660126e-05, + "loss": 0.2562, + "step": 5686 + }, + { + "epoch": 0.11376, + "grad_norm": 1.504377841949463, + "learning_rate": 1.9988467111237947e-05, + "loss": 0.306, + "step": 5688 + }, + { + "epoch": 0.1138, + "grad_norm": 1.2039682865142822, + "learning_rate": 1.998839997508546e-05, + "loss": 0.2921, + "step": 5690 + }, + { + "epoch": 0.11384, + "grad_norm": 1.9762581586837769, + "learning_rate": 1.9988332644203976e-05, + "loss": 0.2558, + "step": 5692 + }, + { + "epoch": 0.11388, + "grad_norm": 1.4918670654296875, + "learning_rate": 1.99882651185948e-05, + "loss": 0.2321, + "step": 5694 + }, + { + "epoch": 0.11392, + "grad_norm": 0.5943285226821899, + "learning_rate": 1.9988197398259257e-05, + "loss": 0.172, + "step": 5696 + }, + { + "epoch": 0.11396, + "grad_norm": 0.5943877696990967, + "learning_rate": 1.9988129483198664e-05, + "loss": 0.21, + "step": 5698 + }, + { + "epoch": 0.114, + "grad_norm": 0.30414578318595886, + "learning_rate": 1.9988061373414342e-05, + "loss": 0.0676, + "step": 5700 + }, + { + "epoch": 0.11404, + "grad_norm": 0.49175867438316345, + "learning_rate": 1.9987993068907624e-05, + "loss": 0.0431, + "step": 5702 + }, + { + "epoch": 0.11408, + "grad_norm": 0.63605135679245, + "learning_rate": 1.9987924569679836e-05, + "loss": 0.2972, + "step": 5704 + }, + { + "epoch": 0.11412, + "grad_norm": 2.9133543968200684, + "learning_rate": 1.9987855875732317e-05, + "loss": 0.7332, + "step": 5706 + }, + { + "epoch": 0.11416, + "grad_norm": 2.1948893070220947, + "learning_rate": 1.9987786987066406e-05, + "loss": 0.2286, + "step": 5708 + }, + { + "epoch": 0.1142, + "grad_norm": 1.154448390007019, + "learning_rate": 1.9987717903683447e-05, + "loss": 0.1444, + "step": 5710 + }, + { + "epoch": 0.11424, + "grad_norm": 1.4117342233657837, + "learning_rate": 1.9987648625584785e-05, + "loss": 0.2663, + "step": 5712 + }, + { + "epoch": 0.11428, + "grad_norm": 1.0091814994812012, + "learning_rate": 1.9987579152771768e-05, + "loss": 0.137, + "step": 5714 + }, + { + "epoch": 0.11432, + "grad_norm": 0.7312894463539124, + "learning_rate": 1.9987509485245757e-05, + "loss": 0.2429, + "step": 5716 + }, + { + "epoch": 0.11436, + "grad_norm": 1.1690868139266968, + "learning_rate": 1.9987439623008106e-05, + "loss": 0.1598, + "step": 5718 + }, + { + "epoch": 0.1144, + "grad_norm": 1.2153769731521606, + "learning_rate": 1.998736956606018e-05, + "loss": 0.2026, + "step": 5720 + }, + { + "epoch": 0.11444, + "grad_norm": 1.2188223600387573, + "learning_rate": 1.9987299314403337e-05, + "loss": 0.2727, + "step": 5722 + }, + { + "epoch": 0.11448, + "grad_norm": 1.6630314588546753, + "learning_rate": 1.998722886803895e-05, + "loss": 0.2773, + "step": 5724 + }, + { + "epoch": 0.11452, + "grad_norm": 1.4490400552749634, + "learning_rate": 1.9987158226968403e-05, + "loss": 0.2323, + "step": 5726 + }, + { + "epoch": 0.11456, + "grad_norm": 1.314151644706726, + "learning_rate": 1.998708739119306e-05, + "loss": 0.2822, + "step": 5728 + }, + { + "epoch": 0.1146, + "grad_norm": 1.114105463027954, + "learning_rate": 1.9987016360714307e-05, + "loss": 0.1938, + "step": 5730 + }, + { + "epoch": 0.11464, + "grad_norm": 1.9959282875061035, + "learning_rate": 1.9986945135533528e-05, + "loss": 0.296, + "step": 5732 + }, + { + "epoch": 0.11468, + "grad_norm": 2.0949902534484863, + "learning_rate": 1.9986873715652116e-05, + "loss": 0.2571, + "step": 5734 + }, + { + "epoch": 0.11472, + "grad_norm": 0.9894675016403198, + "learning_rate": 1.9986802101071453e-05, + "loss": 0.248, + "step": 5736 + }, + { + "epoch": 0.11476, + "grad_norm": 1.9684370756149292, + "learning_rate": 1.9986730291792945e-05, + "loss": 0.3672, + "step": 5738 + }, + { + "epoch": 0.1148, + "grad_norm": 1.6250523328781128, + "learning_rate": 1.998665828781799e-05, + "loss": 0.3529, + "step": 5740 + }, + { + "epoch": 0.11484, + "grad_norm": 1.1133543252944946, + "learning_rate": 1.998658608914799e-05, + "loss": 0.2727, + "step": 5742 + }, + { + "epoch": 0.11488, + "grad_norm": 1.485260009765625, + "learning_rate": 1.998651369578435e-05, + "loss": 0.326, + "step": 5744 + }, + { + "epoch": 0.11492, + "grad_norm": 1.16234290599823, + "learning_rate": 1.9986441107728484e-05, + "loss": 0.1857, + "step": 5746 + }, + { + "epoch": 0.11496, + "grad_norm": 1.2439414262771606, + "learning_rate": 1.9986368324981807e-05, + "loss": 0.2774, + "step": 5748 + }, + { + "epoch": 0.115, + "grad_norm": 0.8268135190010071, + "learning_rate": 1.9986295347545738e-05, + "loss": 0.1782, + "step": 5750 + }, + { + "epoch": 0.11504, + "grad_norm": 0.9938164353370667, + "learning_rate": 1.9986222175421704e-05, + "loss": 0.2432, + "step": 5752 + }, + { + "epoch": 0.11508, + "grad_norm": 0.8475884199142456, + "learning_rate": 1.9986148808611126e-05, + "loss": 0.204, + "step": 5754 + }, + { + "epoch": 0.11512, + "grad_norm": 0.6872199177742004, + "learning_rate": 1.998607524711543e-05, + "loss": 0.1371, + "step": 5756 + }, + { + "epoch": 0.11516, + "grad_norm": 1.0851927995681763, + "learning_rate": 1.9986001490936062e-05, + "loss": 0.1857, + "step": 5758 + }, + { + "epoch": 0.1152, + "grad_norm": 1.5751864910125732, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.228, + "step": 5760 + }, + { + "epoch": 0.11524, + "grad_norm": 0.5846288204193115, + "learning_rate": 1.9985853394532046e-05, + "loss": 0.0853, + "step": 5762 + }, + { + "epoch": 0.11528, + "grad_norm": 0.4004614055156708, + "learning_rate": 1.9985779054310286e-05, + "loss": 0.0412, + "step": 5764 + }, + { + "epoch": 0.11532, + "grad_norm": 0.42196381092071533, + "learning_rate": 1.998570451941062e-05, + "loss": 0.062, + "step": 5766 + }, + { + "epoch": 0.11536, + "grad_norm": 0.26401612162590027, + "learning_rate": 1.9985629789834503e-05, + "loss": 0.0252, + "step": 5768 + }, + { + "epoch": 0.1154, + "grad_norm": 0.25024884939193726, + "learning_rate": 1.9985554865583394e-05, + "loss": 0.4037, + "step": 5770 + }, + { + "epoch": 0.11544, + "grad_norm": 3.1667284965515137, + "learning_rate": 1.998547974665875e-05, + "loss": 0.3838, + "step": 5772 + }, + { + "epoch": 0.11548, + "grad_norm": 3.1364285945892334, + "learning_rate": 1.998540443306204e-05, + "loss": 0.4278, + "step": 5774 + }, + { + "epoch": 0.11552, + "grad_norm": 0.24979354441165924, + "learning_rate": 1.9985328924794732e-05, + "loss": 0.3683, + "step": 5776 + }, + { + "epoch": 0.11556, + "grad_norm": 1.843001365661621, + "learning_rate": 1.9985253221858293e-05, + "loss": 0.2374, + "step": 5778 + }, + { + "epoch": 0.1156, + "grad_norm": 2.1746089458465576, + "learning_rate": 1.99851773242542e-05, + "loss": 0.3072, + "step": 5780 + }, + { + "epoch": 0.11564, + "grad_norm": 1.8985518217086792, + "learning_rate": 1.9985101231983936e-05, + "loss": 0.2105, + "step": 5782 + }, + { + "epoch": 0.11568, + "grad_norm": 1.3947581052780151, + "learning_rate": 1.9985024945048982e-05, + "loss": 0.2131, + "step": 5784 + }, + { + "epoch": 0.11572, + "grad_norm": 1.7028573751449585, + "learning_rate": 1.9984948463450825e-05, + "loss": 0.2658, + "step": 5786 + }, + { + "epoch": 0.11576, + "grad_norm": 2.034242630004883, + "learning_rate": 1.998487178719096e-05, + "loss": 0.2431, + "step": 5788 + }, + { + "epoch": 0.1158, + "grad_norm": 1.3663759231567383, + "learning_rate": 1.9984794916270876e-05, + "loss": 0.2475, + "step": 5790 + }, + { + "epoch": 0.11584, + "grad_norm": 1.5198813676834106, + "learning_rate": 1.998471785069208e-05, + "loss": 0.1964, + "step": 5792 + }, + { + "epoch": 0.11588, + "grad_norm": 1.4365965127944946, + "learning_rate": 1.9984640590456064e-05, + "loss": 0.2126, + "step": 5794 + }, + { + "epoch": 0.11592, + "grad_norm": 0.834814727306366, + "learning_rate": 1.998456313556434e-05, + "loss": 0.2574, + "step": 5796 + }, + { + "epoch": 0.11596, + "grad_norm": 1.7437093257904053, + "learning_rate": 1.998448548601842e-05, + "loss": 0.2767, + "step": 5798 + }, + { + "epoch": 0.116, + "grad_norm": 2.5127363204956055, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.2792, + "step": 5800 + }, + { + "epoch": 0.11604, + "grad_norm": 1.5762639045715332, + "learning_rate": 1.9984329602970036e-05, + "loss": 0.1993, + "step": 5802 + }, + { + "epoch": 0.11608, + "grad_norm": 1.2495076656341553, + "learning_rate": 1.998425136947062e-05, + "loss": 0.2925, + "step": 5804 + }, + { + "epoch": 0.11612, + "grad_norm": 1.2552995681762695, + "learning_rate": 1.9984172941323074e-05, + "loss": 0.176, + "step": 5806 + }, + { + "epoch": 0.11616, + "grad_norm": 1.2848283052444458, + "learning_rate": 1.9984094318528943e-05, + "loss": 0.2266, + "step": 5808 + }, + { + "epoch": 0.1162, + "grad_norm": 0.9582882523536682, + "learning_rate": 1.998401550108975e-05, + "loss": 0.1557, + "step": 5810 + }, + { + "epoch": 0.11624, + "grad_norm": 2.1987245082855225, + "learning_rate": 1.9983936489007037e-05, + "loss": 0.2222, + "step": 5812 + }, + { + "epoch": 0.11628, + "grad_norm": 1.4118621349334717, + "learning_rate": 1.998385728228234e-05, + "loss": 0.0775, + "step": 5814 + }, + { + "epoch": 0.11632, + "grad_norm": 0.47860100865364075, + "learning_rate": 1.998377788091721e-05, + "loss": 0.0807, + "step": 5816 + }, + { + "epoch": 0.11636, + "grad_norm": 0.5410587787628174, + "learning_rate": 1.9983698284913187e-05, + "loss": 0.414, + "step": 5818 + }, + { + "epoch": 0.1164, + "grad_norm": 2.7827532291412354, + "learning_rate": 1.9983618494271825e-05, + "loss": 0.4119, + "step": 5820 + }, + { + "epoch": 0.11644, + "grad_norm": 1.6810001134872437, + "learning_rate": 1.998353850899468e-05, + "loss": 0.3431, + "step": 5822 + }, + { + "epoch": 0.11648, + "grad_norm": 1.2347880601882935, + "learning_rate": 1.9983458329083313e-05, + "loss": 0.1716, + "step": 5824 + }, + { + "epoch": 0.11652, + "grad_norm": 0.598851203918457, + "learning_rate": 1.9983377954539287e-05, + "loss": 0.1148, + "step": 5826 + }, + { + "epoch": 0.11656, + "grad_norm": 0.7469609975814819, + "learning_rate": 1.9983297385364166e-05, + "loss": 0.3203, + "step": 5828 + }, + { + "epoch": 0.1166, + "grad_norm": 1.7693792581558228, + "learning_rate": 1.9983216621559525e-05, + "loss": 0.321, + "step": 5830 + }, + { + "epoch": 0.11664, + "grad_norm": 1.0858418941497803, + "learning_rate": 1.9983135663126937e-05, + "loss": 0.1306, + "step": 5832 + }, + { + "epoch": 0.11668, + "grad_norm": 0.6963057518005371, + "learning_rate": 1.9983054510067977e-05, + "loss": 0.1593, + "step": 5834 + }, + { + "epoch": 0.11672, + "grad_norm": 0.5538439154624939, + "learning_rate": 1.998297316238423e-05, + "loss": 0.2894, + "step": 5836 + }, + { + "epoch": 0.11676, + "grad_norm": 0.957819402217865, + "learning_rate": 1.998289162007728e-05, + "loss": 0.3308, + "step": 5838 + }, + { + "epoch": 0.1168, + "grad_norm": 1.7553479671478271, + "learning_rate": 1.998280988314872e-05, + "loss": 0.5315, + "step": 5840 + }, + { + "epoch": 0.11684, + "grad_norm": 1.2655348777770996, + "learning_rate": 1.9982727951600145e-05, + "loss": 0.3412, + "step": 5842 + }, + { + "epoch": 0.11688, + "grad_norm": 1.7266182899475098, + "learning_rate": 1.9982645825433143e-05, + "loss": 0.2831, + "step": 5844 + }, + { + "epoch": 0.11692, + "grad_norm": 0.867712676525116, + "learning_rate": 1.9982563504649327e-05, + "loss": 0.1757, + "step": 5846 + }, + { + "epoch": 0.11696, + "grad_norm": 0.4293637275695801, + "learning_rate": 1.9982480989250293e-05, + "loss": 0.1024, + "step": 5848 + }, + { + "epoch": 0.117, + "grad_norm": 3.4214251041412354, + "learning_rate": 1.9982398279237657e-05, + "loss": 0.3414, + "step": 5850 + }, + { + "epoch": 0.11704, + "grad_norm": 3.625028610229492, + "learning_rate": 1.998231537461302e-05, + "loss": 0.3621, + "step": 5852 + }, + { + "epoch": 0.11708, + "grad_norm": 3.3385987281799316, + "learning_rate": 1.998223227537801e-05, + "loss": 0.6393, + "step": 5854 + }, + { + "epoch": 0.11712, + "grad_norm": 2.163069486618042, + "learning_rate": 1.998214898153424e-05, + "loss": 0.2118, + "step": 5856 + }, + { + "epoch": 0.11716, + "grad_norm": 1.671185851097107, + "learning_rate": 1.998206549308334e-05, + "loss": 0.2194, + "step": 5858 + }, + { + "epoch": 0.1172, + "grad_norm": 1.4390283823013306, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.2443, + "step": 5860 + }, + { + "epoch": 0.11724, + "grad_norm": 1.9565194845199585, + "learning_rate": 1.998189793236665e-05, + "loss": 0.2705, + "step": 5862 + }, + { + "epoch": 0.11728, + "grad_norm": 2.29457426071167, + "learning_rate": 1.998181386010413e-05, + "loss": 0.4102, + "step": 5864 + }, + { + "epoch": 0.11732, + "grad_norm": 2.0296237468719482, + "learning_rate": 1.9981729593241007e-05, + "loss": 0.2559, + "step": 5866 + }, + { + "epoch": 0.11736, + "grad_norm": 1.7978601455688477, + "learning_rate": 1.9981645131778928e-05, + "loss": 0.3541, + "step": 5868 + }, + { + "epoch": 0.1174, + "grad_norm": 1.6268324851989746, + "learning_rate": 1.998156047571954e-05, + "loss": 0.2188, + "step": 5870 + }, + { + "epoch": 0.11744, + "grad_norm": 1.3292474746704102, + "learning_rate": 1.998147562506449e-05, + "loss": 0.2442, + "step": 5872 + }, + { + "epoch": 0.11748, + "grad_norm": 1.0327584743499756, + "learning_rate": 1.9981390579815432e-05, + "loss": 0.1873, + "step": 5874 + }, + { + "epoch": 0.11752, + "grad_norm": 1.872147798538208, + "learning_rate": 1.9981305339974032e-05, + "loss": 0.25, + "step": 5876 + }, + { + "epoch": 0.11756, + "grad_norm": 2.01596999168396, + "learning_rate": 1.9981219905541938e-05, + "loss": 0.2855, + "step": 5878 + }, + { + "epoch": 0.1176, + "grad_norm": 1.0370625257492065, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.2709, + "step": 5880 + }, + { + "epoch": 0.11764, + "grad_norm": 1.0228914022445679, + "learning_rate": 1.9981048452912364e-05, + "loss": 0.1461, + "step": 5882 + }, + { + "epoch": 0.11768, + "grad_norm": 1.6497337818145752, + "learning_rate": 1.9980962434718223e-05, + "loss": 0.3544, + "step": 5884 + }, + { + "epoch": 0.11772, + "grad_norm": 1.0120238065719604, + "learning_rate": 1.9980876221940086e-05, + "loss": 0.1776, + "step": 5886 + }, + { + "epoch": 0.11776, + "grad_norm": 1.6536093950271606, + "learning_rate": 1.9980789814579622e-05, + "loss": 0.3405, + "step": 5888 + }, + { + "epoch": 0.1178, + "grad_norm": 1.124070644378662, + "learning_rate": 1.9980703212638522e-05, + "loss": 0.2359, + "step": 5890 + }, + { + "epoch": 0.11784, + "grad_norm": 1.0774929523468018, + "learning_rate": 1.9980616416118478e-05, + "loss": 0.1856, + "step": 5892 + }, + { + "epoch": 0.11788, + "grad_norm": 1.377693772315979, + "learning_rate": 1.9980529425021172e-05, + "loss": 0.2663, + "step": 5894 + }, + { + "epoch": 0.11792, + "grad_norm": 1.3645896911621094, + "learning_rate": 1.9980442239348313e-05, + "loss": 0.2776, + "step": 5896 + }, + { + "epoch": 0.11796, + "grad_norm": 1.2624059915542603, + "learning_rate": 1.9980354859101595e-05, + "loss": 0.291, + "step": 5898 + }, + { + "epoch": 0.118, + "grad_norm": 1.0358372926712036, + "learning_rate": 1.9980267284282718e-05, + "loss": 0.1951, + "step": 5900 + }, + { + "epoch": 0.11804, + "grad_norm": 1.106287956237793, + "learning_rate": 1.9980179514893393e-05, + "loss": 0.1948, + "step": 5902 + }, + { + "epoch": 0.11808, + "grad_norm": 1.3015120029449463, + "learning_rate": 1.998009155093533e-05, + "loss": 0.3022, + "step": 5904 + }, + { + "epoch": 0.11812, + "grad_norm": 0.9192637205123901, + "learning_rate": 1.9980003392410242e-05, + "loss": 0.1601, + "step": 5906 + }, + { + "epoch": 0.11816, + "grad_norm": 0.9858416318893433, + "learning_rate": 1.997991503931985e-05, + "loss": 0.1764, + "step": 5908 + }, + { + "epoch": 0.1182, + "grad_norm": 1.446370005607605, + "learning_rate": 1.997982649166588e-05, + "loss": 0.2188, + "step": 5910 + }, + { + "epoch": 0.11824, + "grad_norm": 0.9036080241203308, + "learning_rate": 1.997973774945005e-05, + "loss": 0.2233, + "step": 5912 + }, + { + "epoch": 0.11828, + "grad_norm": 0.8716390132904053, + "learning_rate": 1.9979648812674098e-05, + "loss": 0.1133, + "step": 5914 + }, + { + "epoch": 0.11832, + "grad_norm": 0.9804707765579224, + "learning_rate": 1.9979559681339756e-05, + "loss": 0.1524, + "step": 5916 + }, + { + "epoch": 0.11836, + "grad_norm": 2.4496099948883057, + "learning_rate": 1.9979470355448756e-05, + "loss": 0.3475, + "step": 5918 + }, + { + "epoch": 0.1184, + "grad_norm": 0.6692758798599243, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.2937, + "step": 5920 + }, + { + "epoch": 0.11844, + "grad_norm": 2.011312246322632, + "learning_rate": 1.9979291120003768e-05, + "loss": 0.4095, + "step": 5922 + }, + { + "epoch": 0.11848, + "grad_norm": 1.8013747930526733, + "learning_rate": 1.9979201210453274e-05, + "loss": 0.3953, + "step": 5924 + }, + { + "epoch": 0.11852, + "grad_norm": 0.953499436378479, + "learning_rate": 1.997911110635311e-05, + "loss": 0.239, + "step": 5926 + }, + { + "epoch": 0.11856, + "grad_norm": 1.0725963115692139, + "learning_rate": 1.9979020807705043e-05, + "loss": 0.2437, + "step": 5928 + }, + { + "epoch": 0.1186, + "grad_norm": 1.2479883432388306, + "learning_rate": 1.9978930314510826e-05, + "loss": 0.2672, + "step": 5930 + }, + { + "epoch": 0.11864, + "grad_norm": 1.0821863412857056, + "learning_rate": 1.9978839626772223e-05, + "loss": 0.2337, + "step": 5932 + }, + { + "epoch": 0.11868, + "grad_norm": 1.133877158164978, + "learning_rate": 1.9978748744491007e-05, + "loss": 0.2072, + "step": 5934 + }, + { + "epoch": 0.11872, + "grad_norm": 0.9006406664848328, + "learning_rate": 1.9978657667668945e-05, + "loss": 0.1681, + "step": 5936 + }, + { + "epoch": 0.11876, + "grad_norm": 0.8267991542816162, + "learning_rate": 1.9978566396307816e-05, + "loss": 0.1866, + "step": 5938 + }, + { + "epoch": 0.1188, + "grad_norm": 1.1188167333602905, + "learning_rate": 1.9978474930409396e-05, + "loss": 0.2337, + "step": 5940 + }, + { + "epoch": 0.11884, + "grad_norm": 1.522995948791504, + "learning_rate": 1.9978383269975467e-05, + "loss": 0.2513, + "step": 5942 + }, + { + "epoch": 0.11888, + "grad_norm": 1.329390287399292, + "learning_rate": 1.997829141500782e-05, + "loss": 0.2543, + "step": 5944 + }, + { + "epoch": 0.11892, + "grad_norm": 1.2531793117523193, + "learning_rate": 1.997819936550825e-05, + "loss": 0.233, + "step": 5946 + }, + { + "epoch": 0.11896, + "grad_norm": 1.347019910812378, + "learning_rate": 1.9978107121478545e-05, + "loss": 0.2893, + "step": 5948 + }, + { + "epoch": 0.119, + "grad_norm": 1.1377276182174683, + "learning_rate": 1.9978014682920503e-05, + "loss": 0.2574, + "step": 5950 + }, + { + "epoch": 0.11904, + "grad_norm": 0.8679915070533752, + "learning_rate": 1.9977922049835926e-05, + "loss": 0.2042, + "step": 5952 + }, + { + "epoch": 0.11908, + "grad_norm": 1.3653929233551025, + "learning_rate": 1.9977829222226622e-05, + "loss": 0.1968, + "step": 5954 + }, + { + "epoch": 0.11912, + "grad_norm": 1.9372777938842773, + "learning_rate": 1.9977736200094405e-05, + "loss": 0.2561, + "step": 5956 + }, + { + "epoch": 0.11916, + "grad_norm": 1.0496841669082642, + "learning_rate": 1.997764298344108e-05, + "loss": 0.2189, + "step": 5958 + }, + { + "epoch": 0.1192, + "grad_norm": 1.6413164138793945, + "learning_rate": 1.997754957226847e-05, + "loss": 0.2673, + "step": 5960 + }, + { + "epoch": 0.11924, + "grad_norm": 0.9902744889259338, + "learning_rate": 1.997745596657839e-05, + "loss": 0.1864, + "step": 5962 + }, + { + "epoch": 0.11928, + "grad_norm": 1.299416422843933, + "learning_rate": 1.9977362166372672e-05, + "loss": 0.2232, + "step": 5964 + }, + { + "epoch": 0.11932, + "grad_norm": 0.8203756213188171, + "learning_rate": 1.9977268171653143e-05, + "loss": 0.1977, + "step": 5966 + }, + { + "epoch": 0.11936, + "grad_norm": 0.8287013173103333, + "learning_rate": 1.997717398242163e-05, + "loss": 0.1245, + "step": 5968 + }, + { + "epoch": 0.1194, + "grad_norm": 1.9365332126617432, + "learning_rate": 1.9977079598679978e-05, + "loss": 0.3174, + "step": 5970 + }, + { + "epoch": 0.11944, + "grad_norm": 0.8306810259819031, + "learning_rate": 1.9976985020430022e-05, + "loss": 0.2494, + "step": 5972 + }, + { + "epoch": 0.11948, + "grad_norm": 1.117555022239685, + "learning_rate": 1.9976890247673607e-05, + "loss": 0.147, + "step": 5974 + }, + { + "epoch": 0.11952, + "grad_norm": 1.0202610492706299, + "learning_rate": 1.997679528041258e-05, + "loss": 0.1599, + "step": 5976 + }, + { + "epoch": 0.11956, + "grad_norm": 0.6930267810821533, + "learning_rate": 1.9976700118648792e-05, + "loss": 0.1356, + "step": 5978 + }, + { + "epoch": 0.1196, + "grad_norm": 0.7906919717788696, + "learning_rate": 1.99766047623841e-05, + "loss": 0.2295, + "step": 5980 + }, + { + "epoch": 0.11964, + "grad_norm": 0.8682307004928589, + "learning_rate": 1.997650921162036e-05, + "loss": 0.2352, + "step": 5982 + }, + { + "epoch": 0.11968, + "grad_norm": 0.4918263852596283, + "learning_rate": 1.9976413466359437e-05, + "loss": 0.2351, + "step": 5984 + }, + { + "epoch": 0.11972, + "grad_norm": 0.7072861194610596, + "learning_rate": 1.9976317526603196e-05, + "loss": 0.1639, + "step": 5986 + }, + { + "epoch": 0.11976, + "grad_norm": 0.770034909248352, + "learning_rate": 1.9976221392353513e-05, + "loss": 0.125, + "step": 5988 + }, + { + "epoch": 0.1198, + "grad_norm": 1.0607690811157227, + "learning_rate": 1.9976125063612254e-05, + "loss": 0.1138, + "step": 5990 + }, + { + "epoch": 0.11984, + "grad_norm": 1.8408883810043335, + "learning_rate": 1.9976028540381304e-05, + "loss": 0.4776, + "step": 5992 + }, + { + "epoch": 0.11988, + "grad_norm": 0.7501707077026367, + "learning_rate": 1.9975931822662537e-05, + "loss": 0.0721, + "step": 5994 + }, + { + "epoch": 0.11992, + "grad_norm": 1.1256426572799683, + "learning_rate": 1.997583491045785e-05, + "loss": 0.1758, + "step": 5996 + }, + { + "epoch": 0.11996, + "grad_norm": 1.6824567317962646, + "learning_rate": 1.9975737803769117e-05, + "loss": 0.1978, + "step": 5998 + }, + { + "epoch": 0.12, + "grad_norm": 0.8718175888061523, + "learning_rate": 1.9975640502598243e-05, + "loss": 0.1033, + "step": 6000 + }, + { + "epoch": 0.12004, + "grad_norm": 0.7391915321350098, + "learning_rate": 1.9975543006947123e-05, + "loss": 0.1003, + "step": 6002 + }, + { + "epoch": 0.12008, + "grad_norm": 2.1118409633636475, + "learning_rate": 1.9975445316817656e-05, + "loss": 0.2521, + "step": 6004 + }, + { + "epoch": 0.12012, + "grad_norm": 1.9272701740264893, + "learning_rate": 1.9975347432211748e-05, + "loss": 0.201, + "step": 6006 + }, + { + "epoch": 0.12016, + "grad_norm": 1.8054025173187256, + "learning_rate": 1.9975249353131304e-05, + "loss": 0.3403, + "step": 6008 + }, + { + "epoch": 0.1202, + "grad_norm": 2.1935548782348633, + "learning_rate": 1.9975151079578238e-05, + "loss": 0.3978, + "step": 6010 + }, + { + "epoch": 0.12024, + "grad_norm": 1.5487279891967773, + "learning_rate": 1.9975052611554467e-05, + "loss": 0.2772, + "step": 6012 + }, + { + "epoch": 0.12028, + "grad_norm": 0.8144479393959045, + "learning_rate": 1.9974953949061914e-05, + "loss": 0.1712, + "step": 6014 + }, + { + "epoch": 0.12032, + "grad_norm": 1.8183053731918335, + "learning_rate": 1.997485509210249e-05, + "loss": 0.353, + "step": 6016 + }, + { + "epoch": 0.12036, + "grad_norm": 0.5736914873123169, + "learning_rate": 1.9974756040678137e-05, + "loss": 0.09, + "step": 6018 + }, + { + "epoch": 0.1204, + "grad_norm": 0.8531671166419983, + "learning_rate": 1.9974656794790777e-05, + "loss": 0.192, + "step": 6020 + }, + { + "epoch": 0.12044, + "grad_norm": 2.0559725761413574, + "learning_rate": 1.997455735444235e-05, + "loss": 0.267, + "step": 6022 + }, + { + "epoch": 0.12048, + "grad_norm": 0.5607673525810242, + "learning_rate": 1.997445771963479e-05, + "loss": 0.1261, + "step": 6024 + }, + { + "epoch": 0.12052, + "grad_norm": 1.819886565208435, + "learning_rate": 1.9974357890370038e-05, + "loss": 0.3133, + "step": 6026 + }, + { + "epoch": 0.12056, + "grad_norm": 1.1088595390319824, + "learning_rate": 1.997425786665005e-05, + "loss": 0.2425, + "step": 6028 + }, + { + "epoch": 0.1206, + "grad_norm": 2.000393867492676, + "learning_rate": 1.9974157648476768e-05, + "loss": 0.3812, + "step": 6030 + }, + { + "epoch": 0.12064, + "grad_norm": 0.6980207562446594, + "learning_rate": 1.9974057235852148e-05, + "loss": 0.2191, + "step": 6032 + }, + { + "epoch": 0.12068, + "grad_norm": 2.1120736598968506, + "learning_rate": 1.9973956628778142e-05, + "loss": 0.3537, + "step": 6034 + }, + { + "epoch": 0.12072, + "grad_norm": 2.3661906719207764, + "learning_rate": 1.9973855827256722e-05, + "loss": 0.3519, + "step": 6036 + }, + { + "epoch": 0.12076, + "grad_norm": 0.8467989563941956, + "learning_rate": 1.997375483128985e-05, + "loss": 0.1755, + "step": 6038 + }, + { + "epoch": 0.1208, + "grad_norm": 0.9540804624557495, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.1622, + "step": 6040 + }, + { + "epoch": 0.12084, + "grad_norm": 2.2021865844726562, + "learning_rate": 1.9973552256027614e-05, + "loss": 0.2932, + "step": 6042 + }, + { + "epoch": 0.12088, + "grad_norm": 1.4385979175567627, + "learning_rate": 1.9973450676736205e-05, + "loss": 0.205, + "step": 6044 + }, + { + "epoch": 0.12092, + "grad_norm": 1.7944647073745728, + "learning_rate": 1.9973348903007238e-05, + "loss": 0.2451, + "step": 6046 + }, + { + "epoch": 0.12096, + "grad_norm": 1.1936798095703125, + "learning_rate": 1.99732469348427e-05, + "loss": 0.239, + "step": 6048 + }, + { + "epoch": 0.121, + "grad_norm": 1.3681683540344238, + "learning_rate": 1.997314477224458e-05, + "loss": 0.1693, + "step": 6050 + }, + { + "epoch": 0.12104, + "grad_norm": 0.7332810163497925, + "learning_rate": 1.997304241521487e-05, + "loss": 0.0958, + "step": 6052 + }, + { + "epoch": 0.12108, + "grad_norm": 0.8056548237800598, + "learning_rate": 1.9972939863755563e-05, + "loss": 0.2526, + "step": 6054 + }, + { + "epoch": 0.12112, + "grad_norm": 1.184451699256897, + "learning_rate": 1.9972837117868657e-05, + "loss": 0.161, + "step": 6056 + }, + { + "epoch": 0.12116, + "grad_norm": 0.9460592269897461, + "learning_rate": 1.997273417755616e-05, + "loss": 0.3045, + "step": 6058 + }, + { + "epoch": 0.1212, + "grad_norm": 0.9591721296310425, + "learning_rate": 1.997263104282007e-05, + "loss": 0.0877, + "step": 6060 + }, + { + "epoch": 0.12124, + "grad_norm": 2.9109654426574707, + "learning_rate": 1.997252771366241e-05, + "loss": 0.3083, + "step": 6062 + }, + { + "epoch": 0.12128, + "grad_norm": 4.272768974304199, + "learning_rate": 1.9972424190085186e-05, + "loss": 0.5045, + "step": 6064 + }, + { + "epoch": 0.12132, + "grad_norm": 1.9983400106430054, + "learning_rate": 1.9972320472090415e-05, + "loss": 0.2102, + "step": 6066 + }, + { + "epoch": 0.12136, + "grad_norm": 1.4683374166488647, + "learning_rate": 1.9972216559680126e-05, + "loss": 0.2338, + "step": 6068 + }, + { + "epoch": 0.1214, + "grad_norm": 1.6233586072921753, + "learning_rate": 1.997211245285634e-05, + "loss": 0.2659, + "step": 6070 + }, + { + "epoch": 0.12144, + "grad_norm": 1.2752711772918701, + "learning_rate": 1.9972008151621086e-05, + "loss": 0.1691, + "step": 6072 + }, + { + "epoch": 0.12148, + "grad_norm": 1.168635368347168, + "learning_rate": 1.99719036559764e-05, + "loss": 0.251, + "step": 6074 + }, + { + "epoch": 0.12152, + "grad_norm": 0.9740179777145386, + "learning_rate": 1.997179896592432e-05, + "loss": 0.2484, + "step": 6076 + }, + { + "epoch": 0.12156, + "grad_norm": 0.989250898361206, + "learning_rate": 1.9971694081466884e-05, + "loss": 0.2489, + "step": 6078 + }, + { + "epoch": 0.1216, + "grad_norm": 1.2569531202316284, + "learning_rate": 1.997158900260614e-05, + "loss": 0.1846, + "step": 6080 + }, + { + "epoch": 0.12164, + "grad_norm": 0.9920095801353455, + "learning_rate": 1.9971483729344133e-05, + "loss": 0.1783, + "step": 6082 + }, + { + "epoch": 0.12168, + "grad_norm": 1.799651861190796, + "learning_rate": 1.9971378261682917e-05, + "loss": 0.2146, + "step": 6084 + }, + { + "epoch": 0.12172, + "grad_norm": 2.149503469467163, + "learning_rate": 1.997127259962455e-05, + "loss": 0.2416, + "step": 6086 + }, + { + "epoch": 0.12176, + "grad_norm": 1.055240511894226, + "learning_rate": 1.997116674317109e-05, + "loss": 0.1883, + "step": 6088 + }, + { + "epoch": 0.1218, + "grad_norm": 1.5465174913406372, + "learning_rate": 1.99710606923246e-05, + "loss": 0.2431, + "step": 6090 + }, + { + "epoch": 0.12184, + "grad_norm": 1.5622564554214478, + "learning_rate": 1.9970954447087147e-05, + "loss": 0.2029, + "step": 6092 + }, + { + "epoch": 0.12188, + "grad_norm": 1.8337022066116333, + "learning_rate": 1.9970848007460805e-05, + "loss": 0.2432, + "step": 6094 + }, + { + "epoch": 0.12192, + "grad_norm": 2.0915732383728027, + "learning_rate": 1.9970741373447645e-05, + "loss": 0.2887, + "step": 6096 + }, + { + "epoch": 0.12196, + "grad_norm": 1.936095952987671, + "learning_rate": 1.997063454504975e-05, + "loss": 0.2181, + "step": 6098 + }, + { + "epoch": 0.122, + "grad_norm": 2.0137152671813965, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.2774, + "step": 6100 + }, + { + "epoch": 0.12204, + "grad_norm": 1.0180284976959229, + "learning_rate": 1.997042030510809e-05, + "loss": 0.2552, + "step": 6102 + }, + { + "epoch": 0.12208, + "grad_norm": 2.680649757385254, + "learning_rate": 1.9970312893568497e-05, + "loss": 0.2935, + "step": 6104 + }, + { + "epoch": 0.12212, + "grad_norm": 1.9420199394226074, + "learning_rate": 1.9970205287652522e-05, + "loss": 0.2653, + "step": 6106 + }, + { + "epoch": 0.12216, + "grad_norm": 0.9802426695823669, + "learning_rate": 1.9970097487362262e-05, + "loss": 0.1858, + "step": 6108 + }, + { + "epoch": 0.1222, + "grad_norm": 1.7140204906463623, + "learning_rate": 1.996998949269982e-05, + "loss": 0.235, + "step": 6110 + }, + { + "epoch": 0.12224, + "grad_norm": 0.9655866622924805, + "learning_rate": 1.9969881303667296e-05, + "loss": 0.1516, + "step": 6112 + }, + { + "epoch": 0.12228, + "grad_norm": 1.5268949270248413, + "learning_rate": 1.9969772920266806e-05, + "loss": 0.2104, + "step": 6114 + }, + { + "epoch": 0.12232, + "grad_norm": 1.193429946899414, + "learning_rate": 1.996966434250046e-05, + "loss": 0.1393, + "step": 6116 + }, + { + "epoch": 0.12236, + "grad_norm": 0.7073979377746582, + "learning_rate": 1.9969555570370377e-05, + "loss": 0.1911, + "step": 6118 + }, + { + "epoch": 0.1224, + "grad_norm": 1.1971638202667236, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.1405, + "step": 6120 + }, + { + "epoch": 0.12244, + "grad_norm": 1.413882851600647, + "learning_rate": 1.9969337443027474e-05, + "loss": 0.1403, + "step": 6122 + }, + { + "epoch": 0.12248, + "grad_norm": 0.42216405272483826, + "learning_rate": 1.996922808781891e-05, + "loss": 0.08, + "step": 6124 + }, + { + "epoch": 0.12252, + "grad_norm": 2.6773900985717773, + "learning_rate": 1.9969118538255114e-05, + "loss": 0.2468, + "step": 6126 + }, + { + "epoch": 0.12256, + "grad_norm": 1.330291509628296, + "learning_rate": 1.9969008794338214e-05, + "loss": 0.1406, + "step": 6128 + }, + { + "epoch": 0.1226, + "grad_norm": 0.6769915223121643, + "learning_rate": 1.996889885607036e-05, + "loss": 0.3264, + "step": 6130 + }, + { + "epoch": 0.12264, + "grad_norm": 2.0759456157684326, + "learning_rate": 1.9968788723453688e-05, + "loss": 0.3661, + "step": 6132 + }, + { + "epoch": 0.12268, + "grad_norm": 1.3276007175445557, + "learning_rate": 1.996867839649035e-05, + "loss": 0.2259, + "step": 6134 + }, + { + "epoch": 0.12272, + "grad_norm": 1.8065996170043945, + "learning_rate": 1.9968567875182492e-05, + "loss": 0.2348, + "step": 6136 + }, + { + "epoch": 0.12276, + "grad_norm": 2.2081642150878906, + "learning_rate": 1.9968457159532272e-05, + "loss": 0.2664, + "step": 6138 + }, + { + "epoch": 0.1228, + "grad_norm": 0.8382675647735596, + "learning_rate": 1.9968346249541848e-05, + "loss": 0.2428, + "step": 6140 + }, + { + "epoch": 0.12284, + "grad_norm": 1.2665939331054688, + "learning_rate": 1.9968235145213382e-05, + "loss": 0.2127, + "step": 6142 + }, + { + "epoch": 0.12288, + "grad_norm": 0.9591962695121765, + "learning_rate": 1.996812384654904e-05, + "loss": 0.2032, + "step": 6144 + }, + { + "epoch": 0.12292, + "grad_norm": 2.2541000843048096, + "learning_rate": 1.9968012353550992e-05, + "loss": 0.3673, + "step": 6146 + }, + { + "epoch": 0.12296, + "grad_norm": 2.0662693977355957, + "learning_rate": 1.996790066622141e-05, + "loss": 0.2241, + "step": 6148 + }, + { + "epoch": 0.123, + "grad_norm": 1.8314193487167358, + "learning_rate": 1.9967788784562474e-05, + "loss": 0.2639, + "step": 6150 + }, + { + "epoch": 0.12304, + "grad_norm": 1.9956103563308716, + "learning_rate": 1.9967676708576362e-05, + "loss": 0.1687, + "step": 6152 + }, + { + "epoch": 0.12308, + "grad_norm": 1.1339805126190186, + "learning_rate": 1.9967564438265262e-05, + "loss": 0.1592, + "step": 6154 + }, + { + "epoch": 0.12312, + "grad_norm": 1.7619653940200806, + "learning_rate": 1.9967451973631363e-05, + "loss": 0.2561, + "step": 6156 + }, + { + "epoch": 0.12316, + "grad_norm": 1.837080717086792, + "learning_rate": 1.996733931467685e-05, + "loss": 0.301, + "step": 6158 + }, + { + "epoch": 0.1232, + "grad_norm": 0.861922025680542, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.2088, + "step": 6160 + }, + { + "epoch": 0.12324, + "grad_norm": 1.8305500745773315, + "learning_rate": 1.9967113413814802e-05, + "loss": 0.1933, + "step": 6162 + }, + { + "epoch": 0.12328, + "grad_norm": 0.8591381907463074, + "learning_rate": 1.9967000171911666e-05, + "loss": 0.1075, + "step": 6164 + }, + { + "epoch": 0.12332, + "grad_norm": 0.8675278425216675, + "learning_rate": 1.9966886735696726e-05, + "loss": 0.144, + "step": 6166 + }, + { + "epoch": 0.12336, + "grad_norm": 0.962139368057251, + "learning_rate": 1.99667731051722e-05, + "loss": 0.2153, + "step": 6168 + }, + { + "epoch": 0.1234, + "grad_norm": 1.2347259521484375, + "learning_rate": 1.99666592803403e-05, + "loss": 0.1932, + "step": 6170 + }, + { + "epoch": 0.12344, + "grad_norm": 1.8141661882400513, + "learning_rate": 1.996654526120325e-05, + "loss": 0.1969, + "step": 6172 + }, + { + "epoch": 0.12348, + "grad_norm": 1.8921048641204834, + "learning_rate": 1.996643104776326e-05, + "loss": 0.1656, + "step": 6174 + }, + { + "epoch": 0.12352, + "grad_norm": 0.5889310240745544, + "learning_rate": 1.9966316640022577e-05, + "loss": 0.126, + "step": 6176 + }, + { + "epoch": 0.12356, + "grad_norm": 0.4803444445133209, + "learning_rate": 1.996620203798341e-05, + "loss": 0.1025, + "step": 6178 + }, + { + "epoch": 0.1236, + "grad_norm": 1.1572178602218628, + "learning_rate": 1.996608724164801e-05, + "loss": 0.0922, + "step": 6180 + }, + { + "epoch": 0.12364, + "grad_norm": 2.09309458732605, + "learning_rate": 1.9965972251018605e-05, + "loss": 0.2351, + "step": 6182 + }, + { + "epoch": 0.12368, + "grad_norm": 1.5566413402557373, + "learning_rate": 1.9965857066097444e-05, + "loss": 0.1616, + "step": 6184 + }, + { + "epoch": 0.12372, + "grad_norm": 2.7873289585113525, + "learning_rate": 1.9965741686886762e-05, + "loss": 0.4301, + "step": 6186 + }, + { + "epoch": 0.12376, + "grad_norm": 1.0395574569702148, + "learning_rate": 1.9965626113388823e-05, + "loss": 0.3428, + "step": 6188 + }, + { + "epoch": 0.1238, + "grad_norm": 1.9942158460617065, + "learning_rate": 1.9965510345605866e-05, + "loss": 0.1719, + "step": 6190 + }, + { + "epoch": 0.12384, + "grad_norm": 0.7222716808319092, + "learning_rate": 1.9965394383540158e-05, + "loss": 0.0682, + "step": 6192 + }, + { + "epoch": 0.12388, + "grad_norm": 0.363535076379776, + "learning_rate": 1.9965278227193955e-05, + "loss": 0.1598, + "step": 6194 + }, + { + "epoch": 0.12392, + "grad_norm": 0.9552385210990906, + "learning_rate": 1.996516187656952e-05, + "loss": 0.1113, + "step": 6196 + }, + { + "epoch": 0.12396, + "grad_norm": 0.5425085425376892, + "learning_rate": 1.996504533166913e-05, + "loss": 0.0714, + "step": 6198 + }, + { + "epoch": 0.124, + "grad_norm": 3.234384536743164, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.3867, + "step": 6200 + }, + { + "epoch": 0.12404, + "grad_norm": 1.594022512435913, + "learning_rate": 1.996481165904955e-05, + "loss": 0.2921, + "step": 6202 + }, + { + "epoch": 0.12408, + "grad_norm": 0.29209715127944946, + "learning_rate": 1.9964694531334917e-05, + "loss": 0.0947, + "step": 6204 + }, + { + "epoch": 0.12412, + "grad_norm": 0.9244029521942139, + "learning_rate": 1.9964577209353438e-05, + "loss": 0.5537, + "step": 6206 + }, + { + "epoch": 0.12416, + "grad_norm": 0.6790652275085449, + "learning_rate": 1.9964459693107396e-05, + "loss": 0.3097, + "step": 6208 + }, + { + "epoch": 0.1242, + "grad_norm": 0.7510793209075928, + "learning_rate": 1.996434198259908e-05, + "loss": 0.3602, + "step": 6210 + }, + { + "epoch": 0.12424, + "grad_norm": 1.561854600906372, + "learning_rate": 1.9964224077830788e-05, + "loss": 0.2334, + "step": 6212 + }, + { + "epoch": 0.12428, + "grad_norm": 2.2055861949920654, + "learning_rate": 1.996410597880482e-05, + "loss": 0.3639, + "step": 6214 + }, + { + "epoch": 0.12432, + "grad_norm": 1.5536295175552368, + "learning_rate": 1.9963987685523475e-05, + "loss": 0.2474, + "step": 6216 + }, + { + "epoch": 0.12436, + "grad_norm": 0.49098649621009827, + "learning_rate": 1.9963869197989058e-05, + "loss": 0.1222, + "step": 6218 + }, + { + "epoch": 0.1244, + "grad_norm": 0.4214138090610504, + "learning_rate": 1.9963750516203887e-05, + "loss": 0.1242, + "step": 6220 + }, + { + "epoch": 0.12444, + "grad_norm": 2.4640002250671387, + "learning_rate": 1.9963631640170264e-05, + "loss": 0.3737, + "step": 6222 + }, + { + "epoch": 0.12448, + "grad_norm": 1.3278768062591553, + "learning_rate": 1.9963512569890512e-05, + "loss": 0.213, + "step": 6224 + }, + { + "epoch": 0.12452, + "grad_norm": 1.4375861883163452, + "learning_rate": 1.9963393305366958e-05, + "loss": 0.2239, + "step": 6226 + }, + { + "epoch": 0.12456, + "grad_norm": 1.2990803718566895, + "learning_rate": 1.9963273846601918e-05, + "loss": 0.176, + "step": 6228 + }, + { + "epoch": 0.1246, + "grad_norm": 1.0817265510559082, + "learning_rate": 1.9963154193597728e-05, + "loss": 0.141, + "step": 6230 + }, + { + "epoch": 0.12464, + "grad_norm": 0.6701366901397705, + "learning_rate": 1.9963034346356714e-05, + "loss": 0.1395, + "step": 6232 + }, + { + "epoch": 0.12468, + "grad_norm": 1.3257755041122437, + "learning_rate": 1.996291430488122e-05, + "loss": 0.2241, + "step": 6234 + }, + { + "epoch": 0.12472, + "grad_norm": 0.9537546634674072, + "learning_rate": 1.9962794069173577e-05, + "loss": 0.2036, + "step": 6236 + }, + { + "epoch": 0.12476, + "grad_norm": 2.1137211322784424, + "learning_rate": 1.996267363923614e-05, + "loss": 0.3112, + "step": 6238 + }, + { + "epoch": 0.1248, + "grad_norm": 1.1190966367721558, + "learning_rate": 1.996255301507125e-05, + "loss": 0.2304, + "step": 6240 + }, + { + "epoch": 0.12484, + "grad_norm": 2.5403761863708496, + "learning_rate": 1.996243219668126e-05, + "loss": 0.57, + "step": 6242 + }, + { + "epoch": 0.12488, + "grad_norm": 2.4083092212677, + "learning_rate": 1.996231118406852e-05, + "loss": 0.44, + "step": 6244 + }, + { + "epoch": 0.12492, + "grad_norm": 1.2362141609191895, + "learning_rate": 1.99621899772354e-05, + "loss": 0.1547, + "step": 6246 + }, + { + "epoch": 0.12496, + "grad_norm": 0.9945597052574158, + "learning_rate": 1.9962068576184258e-05, + "loss": 0.2069, + "step": 6248 + }, + { + "epoch": 0.125, + "grad_norm": 1.3431296348571777, + "learning_rate": 1.9961946980917457e-05, + "loss": 0.1647, + "step": 6250 + }, + { + "epoch": 0.12504, + "grad_norm": 1.7158747911453247, + "learning_rate": 1.9961825191437372e-05, + "loss": 0.2218, + "step": 6252 + }, + { + "epoch": 0.12508, + "grad_norm": 1.5698102712631226, + "learning_rate": 1.9961703207746378e-05, + "loss": 0.2766, + "step": 6254 + }, + { + "epoch": 0.12512, + "grad_norm": 1.1872504949569702, + "learning_rate": 1.996158102984685e-05, + "loss": 0.1782, + "step": 6256 + }, + { + "epoch": 0.12516, + "grad_norm": 1.1075173616409302, + "learning_rate": 1.9961458657741172e-05, + "loss": 0.1519, + "step": 6258 + }, + { + "epoch": 0.1252, + "grad_norm": 1.5772653818130493, + "learning_rate": 1.9961336091431728e-05, + "loss": 0.2302, + "step": 6260 + }, + { + "epoch": 0.12524, + "grad_norm": 1.457672119140625, + "learning_rate": 1.9961213330920906e-05, + "loss": 0.1993, + "step": 6262 + }, + { + "epoch": 0.12528, + "grad_norm": 0.4268215000629425, + "learning_rate": 1.9961090376211107e-05, + "loss": 0.1242, + "step": 6264 + }, + { + "epoch": 0.12532, + "grad_norm": 1.208983063697815, + "learning_rate": 1.9960967227304717e-05, + "loss": 0.1218, + "step": 6266 + }, + { + "epoch": 0.12536, + "grad_norm": 0.7269864082336426, + "learning_rate": 1.996084388420415e-05, + "loss": 0.1251, + "step": 6268 + }, + { + "epoch": 0.1254, + "grad_norm": 0.6184012293815613, + "learning_rate": 1.9960720346911798e-05, + "loss": 0.0855, + "step": 6270 + }, + { + "epoch": 0.12544, + "grad_norm": 0.49752184748649597, + "learning_rate": 1.9960596615430076e-05, + "loss": 0.2145, + "step": 6272 + }, + { + "epoch": 0.12548, + "grad_norm": 0.8001307845115662, + "learning_rate": 1.9960472689761392e-05, + "loss": 0.1496, + "step": 6274 + }, + { + "epoch": 0.12552, + "grad_norm": 1.6824015378952026, + "learning_rate": 1.996034856990817e-05, + "loss": 0.1691, + "step": 6276 + }, + { + "epoch": 0.12556, + "grad_norm": 1.5774184465408325, + "learning_rate": 1.996022425587282e-05, + "loss": 0.1556, + "step": 6278 + }, + { + "epoch": 0.1256, + "grad_norm": 1.3654793500900269, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.5949, + "step": 6280 + }, + { + "epoch": 0.12564, + "grad_norm": 1.3466670513153076, + "learning_rate": 1.9959975045265455e-05, + "loss": 0.1445, + "step": 6282 + }, + { + "epoch": 0.12568, + "grad_norm": 2.1597819328308105, + "learning_rate": 1.9959850148698292e-05, + "loss": 0.2765, + "step": 6284 + }, + { + "epoch": 0.12572, + "grad_norm": 1.865206003189087, + "learning_rate": 1.9959725057958724e-05, + "loss": 0.1695, + "step": 6286 + }, + { + "epoch": 0.12576, + "grad_norm": 2.26096510887146, + "learning_rate": 1.9959599773049188e-05, + "loss": 0.2878, + "step": 6288 + }, + { + "epoch": 0.1258, + "grad_norm": 1.9725311994552612, + "learning_rate": 1.995947429397213e-05, + "loss": 0.2664, + "step": 6290 + }, + { + "epoch": 0.12584, + "grad_norm": 1.2985403537750244, + "learning_rate": 1.995934862072999e-05, + "loss": 0.1754, + "step": 6292 + }, + { + "epoch": 0.12588, + "grad_norm": 0.33858928084373474, + "learning_rate": 1.9959222753325225e-05, + "loss": 0.1012, + "step": 6294 + }, + { + "epoch": 0.12592, + "grad_norm": 1.1536387205123901, + "learning_rate": 1.9959096691760284e-05, + "loss": 0.1194, + "step": 6296 + }, + { + "epoch": 0.12596, + "grad_norm": 2.0511655807495117, + "learning_rate": 1.995897043603762e-05, + "loss": 0.2033, + "step": 6298 + }, + { + "epoch": 0.126, + "grad_norm": 1.4900038242340088, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.3833, + "step": 6300 + }, + { + "epoch": 0.12604, + "grad_norm": 1.06289541721344, + "learning_rate": 1.9958717342129e-05, + "loss": 0.1597, + "step": 6302 + }, + { + "epoch": 0.12608, + "grad_norm": 2.590325355529785, + "learning_rate": 1.9958590503947973e-05, + "loss": 0.2779, + "step": 6304 + }, + { + "epoch": 0.12612, + "grad_norm": 1.4732824563980103, + "learning_rate": 1.9958463471619093e-05, + "loss": 0.2121, + "step": 6306 + }, + { + "epoch": 0.12616, + "grad_norm": 0.9253606796264648, + "learning_rate": 1.9958336245144844e-05, + "loss": 0.1307, + "step": 6308 + }, + { + "epoch": 0.1262, + "grad_norm": 1.0325798988342285, + "learning_rate": 1.9958208824527702e-05, + "loss": 0.1883, + "step": 6310 + }, + { + "epoch": 0.12624, + "grad_norm": 0.590731680393219, + "learning_rate": 1.9958081209770155e-05, + "loss": 0.2825, + "step": 6312 + }, + { + "epoch": 0.12628, + "grad_norm": 1.1681065559387207, + "learning_rate": 1.9957953400874682e-05, + "loss": 0.1146, + "step": 6314 + }, + { + "epoch": 0.12632, + "grad_norm": 0.6564030647277832, + "learning_rate": 1.9957825397843785e-05, + "loss": 0.262, + "step": 6316 + }, + { + "epoch": 0.12636, + "grad_norm": 1.8112825155258179, + "learning_rate": 1.9957697200679956e-05, + "loss": 0.2348, + "step": 6318 + }, + { + "epoch": 0.1264, + "grad_norm": 2.477325439453125, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.5208, + "step": 6320 + }, + { + "epoch": 0.12644, + "grad_norm": 1.4574240446090698, + "learning_rate": 1.99574402239635e-05, + "loss": 0.1815, + "step": 6322 + }, + { + "epoch": 0.12648, + "grad_norm": 1.2765640020370483, + "learning_rate": 1.995731144441588e-05, + "loss": 0.1784, + "step": 6324 + }, + { + "epoch": 0.12652, + "grad_norm": 2.0451760292053223, + "learning_rate": 1.995718247074535e-05, + "loss": 0.4095, + "step": 6326 + }, + { + "epoch": 0.12656, + "grad_norm": 0.8889314532279968, + "learning_rate": 1.9957053302954422e-05, + "loss": 0.1384, + "step": 6328 + }, + { + "epoch": 0.1266, + "grad_norm": 1.7793474197387695, + "learning_rate": 1.9956923941045613e-05, + "loss": 0.2006, + "step": 6330 + }, + { + "epoch": 0.12664, + "grad_norm": 1.199815034866333, + "learning_rate": 1.9956794385021444e-05, + "loss": 0.1933, + "step": 6332 + }, + { + "epoch": 0.12668, + "grad_norm": 1.4188920259475708, + "learning_rate": 1.995666463488444e-05, + "loss": 0.2233, + "step": 6334 + }, + { + "epoch": 0.12672, + "grad_norm": 0.838406503200531, + "learning_rate": 1.9956534690637137e-05, + "loss": 0.2412, + "step": 6336 + }, + { + "epoch": 0.12676, + "grad_norm": 1.505957007408142, + "learning_rate": 1.9956404552282065e-05, + "loss": 0.196, + "step": 6338 + }, + { + "epoch": 0.1268, + "grad_norm": 1.1506818532943726, + "learning_rate": 1.995627421982176e-05, + "loss": 0.3013, + "step": 6340 + }, + { + "epoch": 0.12684, + "grad_norm": 2.1794493198394775, + "learning_rate": 1.9956143693258762e-05, + "loss": 0.2157, + "step": 6342 + }, + { + "epoch": 0.12688, + "grad_norm": 1.1817760467529297, + "learning_rate": 1.9956012972595617e-05, + "loss": 0.2866, + "step": 6344 + }, + { + "epoch": 0.12692, + "grad_norm": 1.0492563247680664, + "learning_rate": 1.995588205783487e-05, + "loss": 0.1222, + "step": 6346 + }, + { + "epoch": 0.12696, + "grad_norm": 0.8269948959350586, + "learning_rate": 1.995575094897908e-05, + "loss": 0.1269, + "step": 6348 + }, + { + "epoch": 0.127, + "grad_norm": 1.2283811569213867, + "learning_rate": 1.99556196460308e-05, + "loss": 0.2348, + "step": 6350 + }, + { + "epoch": 0.12704, + "grad_norm": 1.6369658708572388, + "learning_rate": 1.9955488148992593e-05, + "loss": 0.2666, + "step": 6352 + }, + { + "epoch": 0.12708, + "grad_norm": 1.6826545000076294, + "learning_rate": 1.9955356457867016e-05, + "loss": 0.3267, + "step": 6354 + }, + { + "epoch": 0.12712, + "grad_norm": 1.7519025802612305, + "learning_rate": 1.9955224572656635e-05, + "loss": 0.1939, + "step": 6356 + }, + { + "epoch": 0.12716, + "grad_norm": 0.7529247999191284, + "learning_rate": 1.995509249336403e-05, + "loss": 0.1191, + "step": 6358 + }, + { + "epoch": 0.1272, + "grad_norm": 1.4054607152938843, + "learning_rate": 1.995496021999177e-05, + "loss": 0.1613, + "step": 6360 + }, + { + "epoch": 0.12724, + "grad_norm": 0.6603760123252869, + "learning_rate": 1.995482775254244e-05, + "loss": 0.1722, + "step": 6362 + }, + { + "epoch": 0.12728, + "grad_norm": 0.3526292145252228, + "learning_rate": 1.9954695091018613e-05, + "loss": 0.1249, + "step": 6364 + }, + { + "epoch": 0.12732, + "grad_norm": 6.504535675048828, + "learning_rate": 1.995456223542288e-05, + "loss": 0.4244, + "step": 6366 + }, + { + "epoch": 0.12736, + "grad_norm": 1.6882274150848389, + "learning_rate": 1.9954429185757835e-05, + "loss": 0.154, + "step": 6368 + }, + { + "epoch": 0.1274, + "grad_norm": 0.21043699979782104, + "learning_rate": 1.9954295942026065e-05, + "loss": 0.0348, + "step": 6370 + }, + { + "epoch": 0.12744, + "grad_norm": 1.2963343858718872, + "learning_rate": 1.995416250423017e-05, + "loss": 0.1381, + "step": 6372 + }, + { + "epoch": 0.12748, + "grad_norm": 2.1025443077087402, + "learning_rate": 1.9954028872372753e-05, + "loss": 0.3206, + "step": 6374 + }, + { + "epoch": 0.12752, + "grad_norm": 0.6908308267593384, + "learning_rate": 1.995389504645642e-05, + "loss": 0.0622, + "step": 6376 + }, + { + "epoch": 0.12756, + "grad_norm": 3.352273941040039, + "learning_rate": 1.9953761026483778e-05, + "loss": 0.3263, + "step": 6378 + }, + { + "epoch": 0.1276, + "grad_norm": 0.7383601069450378, + "learning_rate": 1.995362681245744e-05, + "loss": 0.1493, + "step": 6380 + }, + { + "epoch": 0.12764, + "grad_norm": 1.3001073598861694, + "learning_rate": 1.9953492404380023e-05, + "loss": 0.2027, + "step": 6382 + }, + { + "epoch": 0.12768, + "grad_norm": 1.8960384130477905, + "learning_rate": 1.9953357802254147e-05, + "loss": 0.2594, + "step": 6384 + }, + { + "epoch": 0.12772, + "grad_norm": 0.8936285972595215, + "learning_rate": 1.9953223006082435e-05, + "loss": 0.1033, + "step": 6386 + }, + { + "epoch": 0.12776, + "grad_norm": 0.8521447777748108, + "learning_rate": 1.995308801586752e-05, + "loss": 0.1649, + "step": 6388 + }, + { + "epoch": 0.1278, + "grad_norm": 1.203966498374939, + "learning_rate": 1.9952952831612027e-05, + "loss": 0.4528, + "step": 6390 + }, + { + "epoch": 0.12784, + "grad_norm": 1.5363205671310425, + "learning_rate": 1.9952817453318592e-05, + "loss": 0.2331, + "step": 6392 + }, + { + "epoch": 0.12788, + "grad_norm": 1.7146949768066406, + "learning_rate": 1.9952681880989862e-05, + "loss": 0.1885, + "step": 6394 + }, + { + "epoch": 0.12792, + "grad_norm": 2.3859074115753174, + "learning_rate": 1.9952546114628472e-05, + "loss": 0.2699, + "step": 6396 + }, + { + "epoch": 0.12796, + "grad_norm": 3.2517476081848145, + "learning_rate": 1.9952410154237073e-05, + "loss": 0.3673, + "step": 6398 + }, + { + "epoch": 0.128, + "grad_norm": 3.302433490753174, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.7534, + "step": 6400 + }, + { + "epoch": 0.12804, + "grad_norm": 1.3940322399139404, + "learning_rate": 1.9952137651374848e-05, + "loss": 0.2825, + "step": 6402 + }, + { + "epoch": 0.12808, + "grad_norm": 1.6034691333770752, + "learning_rate": 1.9952001108909336e-05, + "loss": 0.2653, + "step": 6404 + }, + { + "epoch": 0.12812, + "grad_norm": 0.7305997610092163, + "learning_rate": 1.9951864372424435e-05, + "loss": 0.1328, + "step": 6406 + }, + { + "epoch": 0.12816, + "grad_norm": 1.1026581525802612, + "learning_rate": 1.9951727441922823e-05, + "loss": 0.1679, + "step": 6408 + }, + { + "epoch": 0.1282, + "grad_norm": 1.5603748559951782, + "learning_rate": 1.9951590317407152e-05, + "loss": 0.243, + "step": 6410 + }, + { + "epoch": 0.12824, + "grad_norm": 0.5972849130630493, + "learning_rate": 1.995145299888011e-05, + "loss": 0.078, + "step": 6412 + }, + { + "epoch": 0.12828, + "grad_norm": 1.2215160131454468, + "learning_rate": 1.9951315486344364e-05, + "loss": 0.169, + "step": 6414 + }, + { + "epoch": 0.12832, + "grad_norm": 0.7905202507972717, + "learning_rate": 1.9951177779802604e-05, + "loss": 0.141, + "step": 6416 + }, + { + "epoch": 0.12836, + "grad_norm": 1.6291162967681885, + "learning_rate": 1.995103987925751e-05, + "loss": 0.3431, + "step": 6418 + }, + { + "epoch": 0.1284, + "grad_norm": 0.5609999299049377, + "learning_rate": 1.9950901784711765e-05, + "loss": 0.1353, + "step": 6420 + }, + { + "epoch": 0.12844, + "grad_norm": 1.2601518630981445, + "learning_rate": 1.995076349616807e-05, + "loss": 0.1156, + "step": 6422 + }, + { + "epoch": 0.12848, + "grad_norm": 1.7366732358932495, + "learning_rate": 1.995062501362912e-05, + "loss": 0.1411, + "step": 6424 + }, + { + "epoch": 0.12852, + "grad_norm": 1.9648025035858154, + "learning_rate": 1.995048633709761e-05, + "loss": 0.2658, + "step": 6426 + }, + { + "epoch": 0.12856, + "grad_norm": 1.9373588562011719, + "learning_rate": 1.9950347466576244e-05, + "loss": 0.2111, + "step": 6428 + }, + { + "epoch": 0.1286, + "grad_norm": 1.2525634765625, + "learning_rate": 1.9950208402067735e-05, + "loss": 0.3495, + "step": 6430 + }, + { + "epoch": 0.12864, + "grad_norm": 0.7732521891593933, + "learning_rate": 1.9950069143574787e-05, + "loss": 0.243, + "step": 6432 + }, + { + "epoch": 0.12868, + "grad_norm": 1.630889892578125, + "learning_rate": 1.9949929691100124e-05, + "loss": 0.1857, + "step": 6434 + }, + { + "epoch": 0.12872, + "grad_norm": 1.9016661643981934, + "learning_rate": 1.9949790044646452e-05, + "loss": 0.3318, + "step": 6436 + }, + { + "epoch": 0.12876, + "grad_norm": 2.390766143798828, + "learning_rate": 1.9949650204216502e-05, + "loss": 0.5193, + "step": 6438 + }, + { + "epoch": 0.1288, + "grad_norm": 1.2057193517684937, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.1117, + "step": 6440 + }, + { + "epoch": 0.12884, + "grad_norm": 1.3280627727508545, + "learning_rate": 1.994936994143868e-05, + "loss": 0.125, + "step": 6442 + }, + { + "epoch": 0.12888, + "grad_norm": 0.6482793092727661, + "learning_rate": 1.9949229519096266e-05, + "loss": 0.0818, + "step": 6444 + }, + { + "epoch": 0.12892, + "grad_norm": 1.2875325679779053, + "learning_rate": 1.99490889027885e-05, + "loss": 0.162, + "step": 6446 + }, + { + "epoch": 0.12896, + "grad_norm": 0.7256414294242859, + "learning_rate": 1.994894809251812e-05, + "loss": 0.1219, + "step": 6448 + }, + { + "epoch": 0.129, + "grad_norm": 1.4405663013458252, + "learning_rate": 1.9948807088287884e-05, + "loss": 0.1178, + "step": 6450 + }, + { + "epoch": 0.12904, + "grad_norm": 2.1902530193328857, + "learning_rate": 1.9948665890100526e-05, + "loss": 0.1439, + "step": 6452 + }, + { + "epoch": 0.12908, + "grad_norm": 1.0998188257217407, + "learning_rate": 1.9948524497958804e-05, + "loss": 0.0914, + "step": 6454 + }, + { + "epoch": 0.12912, + "grad_norm": 2.5000267028808594, + "learning_rate": 1.994838291186548e-05, + "loss": 0.3854, + "step": 6456 + }, + { + "epoch": 0.12916, + "grad_norm": 0.536543607711792, + "learning_rate": 1.9948241131823306e-05, + "loss": 0.1402, + "step": 6458 + }, + { + "epoch": 0.1292, + "grad_norm": 1.15764582157135, + "learning_rate": 1.994809915783505e-05, + "loss": 0.127, + "step": 6460 + }, + { + "epoch": 0.12924, + "grad_norm": 2.6097042560577393, + "learning_rate": 1.9947956989903478e-05, + "loss": 0.3531, + "step": 6462 + }, + { + "epoch": 0.12928, + "grad_norm": 0.9904692769050598, + "learning_rate": 1.9947814628031363e-05, + "loss": 0.1486, + "step": 6464 + }, + { + "epoch": 0.12932, + "grad_norm": 1.7070417404174805, + "learning_rate": 1.994767207222148e-05, + "loss": 0.381, + "step": 6466 + }, + { + "epoch": 0.12936, + "grad_norm": 0.728349506855011, + "learning_rate": 1.994752932247661e-05, + "loss": 0.1147, + "step": 6468 + }, + { + "epoch": 0.1294, + "grad_norm": 2.5025711059570312, + "learning_rate": 1.9947386378799534e-05, + "loss": 0.1614, + "step": 6470 + }, + { + "epoch": 0.12944, + "grad_norm": 0.2767185866832733, + "learning_rate": 1.994724324119304e-05, + "loss": 0.0134, + "step": 6472 + }, + { + "epoch": 0.12948, + "grad_norm": 0.05552440136671066, + "learning_rate": 1.994709990965992e-05, + "loss": 0.2059, + "step": 6474 + }, + { + "epoch": 0.12952, + "grad_norm": 3.2693355083465576, + "learning_rate": 1.994695638420296e-05, + "loss": 0.4994, + "step": 6476 + }, + { + "epoch": 0.12956, + "grad_norm": 0.20832853019237518, + "learning_rate": 1.994681266482497e-05, + "loss": 0.0475, + "step": 6478 + }, + { + "epoch": 0.1296, + "grad_norm": 3.688049554824829, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.2855, + "step": 6480 + }, + { + "epoch": 0.12964, + "grad_norm": 4.951560974121094, + "learning_rate": 1.9946524644317087e-05, + "loss": 0.4021, + "step": 6482 + }, + { + "epoch": 0.12968, + "grad_norm": 0.25753432512283325, + "learning_rate": 1.9946380343192815e-05, + "loss": 0.015, + "step": 6484 + }, + { + "epoch": 0.12972, + "grad_norm": 0.4457055330276489, + "learning_rate": 1.9946235848158733e-05, + "loss": 0.5504, + "step": 6486 + }, + { + "epoch": 0.12976, + "grad_norm": 1.2072477340698242, + "learning_rate": 1.9946091159217668e-05, + "loss": 0.0995, + "step": 6488 + }, + { + "epoch": 0.1298, + "grad_norm": 2.480577230453491, + "learning_rate": 1.9945946276372435e-05, + "loss": 0.2389, + "step": 6490 + }, + { + "epoch": 0.12984, + "grad_norm": 2.269440174102783, + "learning_rate": 1.9945801199625856e-05, + "loss": 0.3019, + "step": 6492 + }, + { + "epoch": 0.12988, + "grad_norm": 1.1731932163238525, + "learning_rate": 1.9945655928980764e-05, + "loss": 0.2998, + "step": 6494 + }, + { + "epoch": 0.12992, + "grad_norm": 1.6135860681533813, + "learning_rate": 1.9945510464439984e-05, + "loss": 0.2575, + "step": 6496 + }, + { + "epoch": 0.12996, + "grad_norm": 1.303644061088562, + "learning_rate": 1.9945364806006363e-05, + "loss": 0.2359, + "step": 6498 + }, + { + "epoch": 0.13, + "grad_norm": 1.0742143392562866, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.3629, + "step": 6500 + }, + { + "epoch": 0.13004, + "grad_norm": 1.6803796291351318, + "learning_rate": 1.994507290747194e-05, + "loss": 0.2393, + "step": 6502 + }, + { + "epoch": 0.13008, + "grad_norm": 1.5641217231750488, + "learning_rate": 1.9944926667376833e-05, + "loss": 0.2444, + "step": 6504 + }, + { + "epoch": 0.13012, + "grad_norm": 1.2647387981414795, + "learning_rate": 1.9944780233400255e-05, + "loss": 0.2344, + "step": 6506 + }, + { + "epoch": 0.13016, + "grad_norm": 1.6023184061050415, + "learning_rate": 1.9944633605545073e-05, + "loss": 0.2168, + "step": 6508 + }, + { + "epoch": 0.1302, + "grad_norm": 1.0519369840621948, + "learning_rate": 1.9944486783814135e-05, + "loss": 0.1871, + "step": 6510 + }, + { + "epoch": 0.13024, + "grad_norm": 1.4350641965866089, + "learning_rate": 1.994433976821031e-05, + "loss": 0.2554, + "step": 6512 + }, + { + "epoch": 0.13028, + "grad_norm": 1.1496543884277344, + "learning_rate": 1.9944192558736457e-05, + "loss": 0.1947, + "step": 6514 + }, + { + "epoch": 0.13032, + "grad_norm": 1.0142371654510498, + "learning_rate": 1.9944045155395452e-05, + "loss": 0.1895, + "step": 6516 + }, + { + "epoch": 0.13036, + "grad_norm": 2.256129503250122, + "learning_rate": 1.9943897558190168e-05, + "loss": 0.4261, + "step": 6518 + }, + { + "epoch": 0.1304, + "grad_norm": 1.9046844244003296, + "learning_rate": 1.994374976712348e-05, + "loss": 0.2248, + "step": 6520 + }, + { + "epoch": 0.13044, + "grad_norm": 0.7975121736526489, + "learning_rate": 1.9943601782198273e-05, + "loss": 0.2302, + "step": 6522 + }, + { + "epoch": 0.13048, + "grad_norm": 1.6771585941314697, + "learning_rate": 1.994345360341743e-05, + "loss": 0.3145, + "step": 6524 + }, + { + "epoch": 0.13052, + "grad_norm": 1.2882330417633057, + "learning_rate": 1.994330523078384e-05, + "loss": 0.2136, + "step": 6526 + }, + { + "epoch": 0.13056, + "grad_norm": 0.8668597936630249, + "learning_rate": 1.9943156664300394e-05, + "loss": 0.1694, + "step": 6528 + }, + { + "epoch": 0.1306, + "grad_norm": 1.2839992046356201, + "learning_rate": 1.994300790396999e-05, + "loss": 0.2659, + "step": 6530 + }, + { + "epoch": 0.13064, + "grad_norm": 1.0158642530441284, + "learning_rate": 1.994285894979553e-05, + "loss": 0.2153, + "step": 6532 + }, + { + "epoch": 0.13068, + "grad_norm": 0.7869083881378174, + "learning_rate": 1.994270980177991e-05, + "loss": 0.1823, + "step": 6534 + }, + { + "epoch": 0.13072, + "grad_norm": 0.9191288352012634, + "learning_rate": 1.994256045992605e-05, + "loss": 0.2353, + "step": 6536 + }, + { + "epoch": 0.13076, + "grad_norm": 1.1332579851150513, + "learning_rate": 1.9942410924236854e-05, + "loss": 0.1849, + "step": 6538 + }, + { + "epoch": 0.1308, + "grad_norm": 0.5905988812446594, + "learning_rate": 1.9942261194715236e-05, + "loss": 0.2327, + "step": 6540 + }, + { + "epoch": 0.13084, + "grad_norm": 1.825495719909668, + "learning_rate": 1.9942111271364118e-05, + "loss": 0.4717, + "step": 6542 + }, + { + "epoch": 0.13088, + "grad_norm": 1.9619017839431763, + "learning_rate": 1.9941961154186424e-05, + "loss": 0.4544, + "step": 6544 + }, + { + "epoch": 0.13092, + "grad_norm": 0.8700879216194153, + "learning_rate": 1.9941810843185077e-05, + "loss": 0.1815, + "step": 6546 + }, + { + "epoch": 0.13096, + "grad_norm": 1.0728343725204468, + "learning_rate": 1.9941660338363008e-05, + "loss": 0.2354, + "step": 6548 + }, + { + "epoch": 0.131, + "grad_norm": 0.9355764985084534, + "learning_rate": 1.9941509639723155e-05, + "loss": 0.185, + "step": 6550 + }, + { + "epoch": 0.13104, + "grad_norm": 1.061684489250183, + "learning_rate": 1.9941358747268455e-05, + "loss": 0.2337, + "step": 6552 + }, + { + "epoch": 0.13108, + "grad_norm": 1.4397170543670654, + "learning_rate": 1.9941207661001846e-05, + "loss": 0.2931, + "step": 6554 + }, + { + "epoch": 0.13112, + "grad_norm": 1.224045991897583, + "learning_rate": 1.9941056380926272e-05, + "loss": 0.2136, + "step": 6556 + }, + { + "epoch": 0.13116, + "grad_norm": 1.0178502798080444, + "learning_rate": 1.9940904907044688e-05, + "loss": 0.2136, + "step": 6558 + }, + { + "epoch": 0.1312, + "grad_norm": 0.7568352818489075, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.1996, + "step": 6560 + }, + { + "epoch": 0.13124, + "grad_norm": 1.3570351600646973, + "learning_rate": 1.9940601377875302e-05, + "loss": 0.223, + "step": 6562 + }, + { + "epoch": 0.13128, + "grad_norm": 0.9138402342796326, + "learning_rate": 1.9940449322593417e-05, + "loss": 0.2046, + "step": 6564 + }, + { + "epoch": 0.13132, + "grad_norm": 1.2462719678878784, + "learning_rate": 1.9940297073517355e-05, + "loss": 0.2043, + "step": 6566 + }, + { + "epoch": 0.13136, + "grad_norm": 1.477364182472229, + "learning_rate": 1.9940144630650083e-05, + "loss": 0.1992, + "step": 6568 + }, + { + "epoch": 0.1314, + "grad_norm": 1.3539685010910034, + "learning_rate": 1.993999199399457e-05, + "loss": 0.2448, + "step": 6570 + }, + { + "epoch": 0.13144, + "grad_norm": 1.266276478767395, + "learning_rate": 1.9939839163553804e-05, + "loss": 0.2108, + "step": 6572 + }, + { + "epoch": 0.13148, + "grad_norm": 1.103616714477539, + "learning_rate": 1.993968613933075e-05, + "loss": 0.2473, + "step": 6574 + }, + { + "epoch": 0.13152, + "grad_norm": 0.9726136922836304, + "learning_rate": 1.9939532921328398e-05, + "loss": 0.1764, + "step": 6576 + }, + { + "epoch": 0.13156, + "grad_norm": 1.0918487310409546, + "learning_rate": 1.9939379509549736e-05, + "loss": 0.1941, + "step": 6578 + }, + { + "epoch": 0.1316, + "grad_norm": 1.297291874885559, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.2668, + "step": 6580 + }, + { + "epoch": 0.13164, + "grad_norm": 0.887815535068512, + "learning_rate": 1.993907210467544e-05, + "loss": 0.2415, + "step": 6582 + }, + { + "epoch": 0.13168, + "grad_norm": 1.4127691984176636, + "learning_rate": 1.9938918111585805e-05, + "loss": 0.2149, + "step": 6584 + }, + { + "epoch": 0.13172, + "grad_norm": 1.6486412286758423, + "learning_rate": 1.993876392473184e-05, + "loss": 0.2625, + "step": 6586 + }, + { + "epoch": 0.13176, + "grad_norm": 0.8904417157173157, + "learning_rate": 1.9938609544116558e-05, + "loss": 0.145, + "step": 6588 + }, + { + "epoch": 0.1318, + "grad_norm": 0.6941771507263184, + "learning_rate": 1.993845496974297e-05, + "loss": 0.1693, + "step": 6590 + }, + { + "epoch": 0.13184, + "grad_norm": 2.003296375274658, + "learning_rate": 1.9938300201614077e-05, + "loss": 0.2486, + "step": 6592 + }, + { + "epoch": 0.13188, + "grad_norm": 2.406595468521118, + "learning_rate": 1.993814523973291e-05, + "loss": 0.502, + "step": 6594 + }, + { + "epoch": 0.13192, + "grad_norm": 1.905439019203186, + "learning_rate": 1.9937990084102488e-05, + "loss": 0.2243, + "step": 6596 + }, + { + "epoch": 0.13196, + "grad_norm": 0.6795022487640381, + "learning_rate": 1.993783473472583e-05, + "loss": 0.2293, + "step": 6598 + }, + { + "epoch": 0.132, + "grad_norm": 0.7151705026626587, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.0868, + "step": 6600 + }, + { + "epoch": 0.13204, + "grad_norm": 1.3485772609710693, + "learning_rate": 1.993752345474593e-05, + "loss": 0.2149, + "step": 6602 + }, + { + "epoch": 0.13208, + "grad_norm": 0.8632852435112, + "learning_rate": 1.993736752414876e-05, + "loss": 0.1862, + "step": 6604 + }, + { + "epoch": 0.13212, + "grad_norm": 1.1732356548309326, + "learning_rate": 1.9937211399817494e-05, + "loss": 0.2226, + "step": 6606 + }, + { + "epoch": 0.13216, + "grad_norm": 1.5613224506378174, + "learning_rate": 1.9937055081755172e-05, + "loss": 0.3015, + "step": 6608 + }, + { + "epoch": 0.1322, + "grad_norm": 1.5666437149047852, + "learning_rate": 1.993689856996485e-05, + "loss": 0.2793, + "step": 6610 + }, + { + "epoch": 0.13224, + "grad_norm": 1.4742523431777954, + "learning_rate": 1.9936741864449575e-05, + "loss": 0.2432, + "step": 6612 + }, + { + "epoch": 0.13228, + "grad_norm": 1.782194972038269, + "learning_rate": 1.9936584965212398e-05, + "loss": 0.2105, + "step": 6614 + }, + { + "epoch": 0.13232, + "grad_norm": 0.7983273863792419, + "learning_rate": 1.993642787225638e-05, + "loss": 0.1714, + "step": 6616 + }, + { + "epoch": 0.13236, + "grad_norm": 1.1219559907913208, + "learning_rate": 1.993627058558459e-05, + "loss": 0.2331, + "step": 6618 + }, + { + "epoch": 0.1324, + "grad_norm": 1.6098263263702393, + "learning_rate": 1.9936113105200085e-05, + "loss": 0.29, + "step": 6620 + }, + { + "epoch": 0.13244, + "grad_norm": 2.170121908187866, + "learning_rate": 1.9935955431105944e-05, + "loss": 0.355, + "step": 6622 + }, + { + "epoch": 0.13248, + "grad_norm": 1.3515195846557617, + "learning_rate": 1.9935797563305233e-05, + "loss": 0.1646, + "step": 6624 + }, + { + "epoch": 0.13252, + "grad_norm": 1.0048167705535889, + "learning_rate": 1.9935639501801033e-05, + "loss": 0.1372, + "step": 6626 + }, + { + "epoch": 0.13256, + "grad_norm": 1.4450379610061646, + "learning_rate": 1.9935481246596428e-05, + "loss": 0.178, + "step": 6628 + }, + { + "epoch": 0.1326, + "grad_norm": 2.5063552856445312, + "learning_rate": 1.99353227976945e-05, + "loss": 0.5035, + "step": 6630 + }, + { + "epoch": 0.13264, + "grad_norm": 0.5860224366188049, + "learning_rate": 1.993516415509834e-05, + "loss": 0.0814, + "step": 6632 + }, + { + "epoch": 0.13268, + "grad_norm": 2.5229315757751465, + "learning_rate": 1.993500531881104e-05, + "loss": 0.2517, + "step": 6634 + }, + { + "epoch": 0.13272, + "grad_norm": 2.4832308292388916, + "learning_rate": 1.9934846288835694e-05, + "loss": 0.4701, + "step": 6636 + }, + { + "epoch": 0.13276, + "grad_norm": 2.1078951358795166, + "learning_rate": 1.9934687065175403e-05, + "loss": 0.4096, + "step": 6638 + }, + { + "epoch": 0.1328, + "grad_norm": 1.447784423828125, + "learning_rate": 1.9934527647833276e-05, + "loss": 0.4015, + "step": 6640 + }, + { + "epoch": 0.13284, + "grad_norm": 1.1140269041061401, + "learning_rate": 1.993436803681242e-05, + "loss": 0.1789, + "step": 6642 + }, + { + "epoch": 0.13288, + "grad_norm": 0.9724023342132568, + "learning_rate": 1.993420823211594e-05, + "loss": 0.1316, + "step": 6644 + }, + { + "epoch": 0.13292, + "grad_norm": 1.012364149093628, + "learning_rate": 1.9934048233746962e-05, + "loss": 0.2235, + "step": 6646 + }, + { + "epoch": 0.13296, + "grad_norm": 0.8031742572784424, + "learning_rate": 1.9933888041708593e-05, + "loss": 0.1192, + "step": 6648 + }, + { + "epoch": 0.133, + "grad_norm": 2.4531712532043457, + "learning_rate": 1.9933727656003964e-05, + "loss": 0.2441, + "step": 6650 + }, + { + "epoch": 0.13304, + "grad_norm": 2.77994704246521, + "learning_rate": 1.9933567076636202e-05, + "loss": 0.3013, + "step": 6652 + }, + { + "epoch": 0.13308, + "grad_norm": 0.8332578539848328, + "learning_rate": 1.9933406303608437e-05, + "loss": 0.3944, + "step": 6654 + }, + { + "epoch": 0.13312, + "grad_norm": 0.7026417255401611, + "learning_rate": 1.9933245336923798e-05, + "loss": 0.0941, + "step": 6656 + }, + { + "epoch": 0.13316, + "grad_norm": 1.063018560409546, + "learning_rate": 1.9933084176585428e-05, + "loss": 0.1127, + "step": 6658 + }, + { + "epoch": 0.1332, + "grad_norm": 2.0251405239105225, + "learning_rate": 1.993292282259647e-05, + "loss": 0.3192, + "step": 6660 + }, + { + "epoch": 0.13324, + "grad_norm": 1.9089140892028809, + "learning_rate": 1.9932761274960068e-05, + "loss": 0.1457, + "step": 6662 + }, + { + "epoch": 0.13328, + "grad_norm": 1.7555911540985107, + "learning_rate": 1.993259953367937e-05, + "loss": 0.2149, + "step": 6664 + }, + { + "epoch": 0.13332, + "grad_norm": 1.9639476537704468, + "learning_rate": 1.993243759875753e-05, + "loss": 0.2049, + "step": 6666 + }, + { + "epoch": 0.13336, + "grad_norm": 2.9477429389953613, + "learning_rate": 1.9932275470197707e-05, + "loss": 0.2515, + "step": 6668 + }, + { + "epoch": 0.1334, + "grad_norm": 1.6945116519927979, + "learning_rate": 1.9932113148003057e-05, + "loss": 0.1311, + "step": 6670 + }, + { + "epoch": 0.13344, + "grad_norm": 4.1171064376831055, + "learning_rate": 1.9931950632176753e-05, + "loss": 0.3058, + "step": 6672 + }, + { + "epoch": 0.13348, + "grad_norm": 0.40714722871780396, + "learning_rate": 1.9931787922721954e-05, + "loss": 0.0272, + "step": 6674 + }, + { + "epoch": 0.13352, + "grad_norm": 1.8845539093017578, + "learning_rate": 1.9931625019641836e-05, + "loss": 0.4533, + "step": 6676 + }, + { + "epoch": 0.13356, + "grad_norm": 2.9759762287139893, + "learning_rate": 1.993146192293958e-05, + "loss": 0.3961, + "step": 6678 + }, + { + "epoch": 0.1336, + "grad_norm": 1.1433714628219604, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.2163, + "step": 6680 + }, + { + "epoch": 0.13364, + "grad_norm": 2.368934154510498, + "learning_rate": 1.9931135148681352e-05, + "loss": 0.2625, + "step": 6682 + }, + { + "epoch": 0.13368, + "grad_norm": 2.1865005493164062, + "learning_rate": 1.9930971471131757e-05, + "loss": 0.2733, + "step": 6684 + }, + { + "epoch": 0.13372, + "grad_norm": 0.8401621580123901, + "learning_rate": 1.993080759997276e-05, + "loss": 0.2287, + "step": 6686 + }, + { + "epoch": 0.13376, + "grad_norm": 1.8749791383743286, + "learning_rate": 1.9930643535207556e-05, + "loss": 0.3516, + "step": 6688 + }, + { + "epoch": 0.1338, + "grad_norm": 1.5503265857696533, + "learning_rate": 1.9930479276839347e-05, + "loss": 0.1392, + "step": 6690 + }, + { + "epoch": 0.13384, + "grad_norm": 1.824994444847107, + "learning_rate": 1.9930314824871326e-05, + "loss": 0.2111, + "step": 6692 + }, + { + "epoch": 0.13388, + "grad_norm": 1.264244556427002, + "learning_rate": 1.9930150179306708e-05, + "loss": 0.1471, + "step": 6694 + }, + { + "epoch": 0.13392, + "grad_norm": 0.9368013143539429, + "learning_rate": 1.99299853401487e-05, + "loss": 0.1088, + "step": 6696 + }, + { + "epoch": 0.13396, + "grad_norm": 1.3068796396255493, + "learning_rate": 1.9929820307400516e-05, + "loss": 0.1848, + "step": 6698 + }, + { + "epoch": 0.134, + "grad_norm": 2.2639212608337402, + "learning_rate": 1.992965508106537e-05, + "loss": 0.2694, + "step": 6700 + }, + { + "epoch": 0.13404, + "grad_norm": 1.1066980361938477, + "learning_rate": 1.992948966114649e-05, + "loss": 0.2353, + "step": 6702 + }, + { + "epoch": 0.13408, + "grad_norm": 1.9940156936645508, + "learning_rate": 1.9929324047647095e-05, + "loss": 0.193, + "step": 6704 + }, + { + "epoch": 0.13412, + "grad_norm": 1.0163557529449463, + "learning_rate": 1.9929158240570416e-05, + "loss": 0.2453, + "step": 6706 + }, + { + "epoch": 0.13416, + "grad_norm": 1.7999204397201538, + "learning_rate": 1.9928992239919683e-05, + "loss": 0.3315, + "step": 6708 + }, + { + "epoch": 0.1342, + "grad_norm": 2.824714183807373, + "learning_rate": 1.9928826045698138e-05, + "loss": 0.3169, + "step": 6710 + }, + { + "epoch": 0.13424, + "grad_norm": 1.71149742603302, + "learning_rate": 1.992865965790902e-05, + "loss": 0.2126, + "step": 6712 + }, + { + "epoch": 0.13428, + "grad_norm": 1.7190256118774414, + "learning_rate": 1.9928493076555564e-05, + "loss": 0.2921, + "step": 6714 + }, + { + "epoch": 0.13432, + "grad_norm": 1.4585659503936768, + "learning_rate": 1.9928326301641024e-05, + "loss": 0.1672, + "step": 6716 + }, + { + "epoch": 0.13436, + "grad_norm": 0.8641340732574463, + "learning_rate": 1.9928159333168654e-05, + "loss": 0.0951, + "step": 6718 + }, + { + "epoch": 0.1344, + "grad_norm": 0.7940100431442261, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.0641, + "step": 6720 + }, + { + "epoch": 0.13444, + "grad_norm": 4.410154342651367, + "learning_rate": 1.9927824815563442e-05, + "loss": 0.4643, + "step": 6722 + }, + { + "epoch": 0.13448, + "grad_norm": 0.36705702543258667, + "learning_rate": 1.992765726643712e-05, + "loss": 0.0302, + "step": 6724 + }, + { + "epoch": 0.13452, + "grad_norm": 4.178699970245361, + "learning_rate": 1.9927489523766006e-05, + "loss": 0.8031, + "step": 6726 + }, + { + "epoch": 0.13456, + "grad_norm": 0.5863642692565918, + "learning_rate": 1.9927321587553378e-05, + "loss": 0.3637, + "step": 6728 + }, + { + "epoch": 0.1346, + "grad_norm": 1.718750238418579, + "learning_rate": 1.99271534578025e-05, + "loss": 0.1749, + "step": 6730 + }, + { + "epoch": 0.13464, + "grad_norm": 1.5089194774627686, + "learning_rate": 1.9926985134516655e-05, + "loss": 0.1693, + "step": 6732 + }, + { + "epoch": 0.13468, + "grad_norm": 1.8897712230682373, + "learning_rate": 1.9926816617699127e-05, + "loss": 0.3041, + "step": 6734 + }, + { + "epoch": 0.13472, + "grad_norm": 0.43873172998428345, + "learning_rate": 1.9926647907353198e-05, + "loss": 0.09, + "step": 6736 + }, + { + "epoch": 0.13476, + "grad_norm": 0.6026725172996521, + "learning_rate": 1.992647900348215e-05, + "loss": 0.1026, + "step": 6738 + }, + { + "epoch": 0.1348, + "grad_norm": 0.5041399598121643, + "learning_rate": 1.992630990608929e-05, + "loss": 0.0779, + "step": 6740 + }, + { + "epoch": 0.13484, + "grad_norm": 2.218998908996582, + "learning_rate": 1.992614061517791e-05, + "loss": 0.3048, + "step": 6742 + }, + { + "epoch": 0.13488, + "grad_norm": 0.2730352580547333, + "learning_rate": 1.99259711307513e-05, + "loss": 0.3734, + "step": 6744 + }, + { + "epoch": 0.13492, + "grad_norm": 2.1611328125, + "learning_rate": 1.992580145281278e-05, + "loss": 0.5707, + "step": 6746 + }, + { + "epoch": 0.13496, + "grad_norm": 2.0350501537323, + "learning_rate": 1.992563158136565e-05, + "loss": 0.4879, + "step": 6748 + }, + { + "epoch": 0.135, + "grad_norm": 0.9848194122314453, + "learning_rate": 1.9925461516413224e-05, + "loss": 0.0983, + "step": 6750 + }, + { + "epoch": 0.13504, + "grad_norm": 1.1452796459197998, + "learning_rate": 1.992529125795881e-05, + "loss": 0.2289, + "step": 6752 + }, + { + "epoch": 0.13508, + "grad_norm": 1.552201271057129, + "learning_rate": 1.9925120806005736e-05, + "loss": 0.1744, + "step": 6754 + }, + { + "epoch": 0.13512, + "grad_norm": 2.721952199935913, + "learning_rate": 1.9924950160557324e-05, + "loss": 0.3419, + "step": 6756 + }, + { + "epoch": 0.13516, + "grad_norm": 1.8422788381576538, + "learning_rate": 1.9924779321616894e-05, + "loss": 0.1726, + "step": 6758 + }, + { + "epoch": 0.1352, + "grad_norm": 0.7072584629058838, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.2807, + "step": 6760 + }, + { + "epoch": 0.13524, + "grad_norm": 1.0692065954208374, + "learning_rate": 1.9924437063273327e-05, + "loss": 0.1396, + "step": 6762 + }, + { + "epoch": 0.13528, + "grad_norm": 1.5777360200881958, + "learning_rate": 1.992426564387686e-05, + "loss": 0.1772, + "step": 6764 + }, + { + "epoch": 0.13532, + "grad_norm": 0.9814470410346985, + "learning_rate": 1.9924094031001722e-05, + "loss": 0.2368, + "step": 6766 + }, + { + "epoch": 0.13536, + "grad_norm": 0.8048774600028992, + "learning_rate": 1.9923922224651265e-05, + "loss": 0.1647, + "step": 6768 + }, + { + "epoch": 0.1354, + "grad_norm": 1.3597118854522705, + "learning_rate": 1.9923750224828833e-05, + "loss": 0.1319, + "step": 6770 + }, + { + "epoch": 0.13544, + "grad_norm": 1.6690444946289062, + "learning_rate": 1.992357803153778e-05, + "loss": 0.258, + "step": 6772 + }, + { + "epoch": 0.13548, + "grad_norm": 0.9544098377227783, + "learning_rate": 1.992340564478147e-05, + "loss": 0.1399, + "step": 6774 + }, + { + "epoch": 0.13552, + "grad_norm": 1.6810613870620728, + "learning_rate": 1.9923233064563253e-05, + "loss": 0.3223, + "step": 6776 + }, + { + "epoch": 0.13556, + "grad_norm": 2.1832189559936523, + "learning_rate": 1.99230602908865e-05, + "loss": 0.2977, + "step": 6778 + }, + { + "epoch": 0.1356, + "grad_norm": 0.8239164352416992, + "learning_rate": 1.992288732375458e-05, + "loss": 0.1205, + "step": 6780 + }, + { + "epoch": 0.13564, + "grad_norm": 0.8209680914878845, + "learning_rate": 1.992271416317086e-05, + "loss": 0.1084, + "step": 6782 + }, + { + "epoch": 0.13568, + "grad_norm": 0.8988705277442932, + "learning_rate": 1.9922540809138716e-05, + "loss": 0.2831, + "step": 6784 + }, + { + "epoch": 0.13572, + "grad_norm": 3.6754679679870605, + "learning_rate": 1.9922367261661536e-05, + "loss": 0.4101, + "step": 6786 + }, + { + "epoch": 0.13576, + "grad_norm": 2.3007149696350098, + "learning_rate": 1.9922193520742698e-05, + "loss": 0.1863, + "step": 6788 + }, + { + "epoch": 0.1358, + "grad_norm": 1.3143144845962524, + "learning_rate": 1.9922019586385587e-05, + "loss": 0.1224, + "step": 6790 + }, + { + "epoch": 0.13584, + "grad_norm": 1.7641476392745972, + "learning_rate": 1.9921845458593595e-05, + "loss": 0.2032, + "step": 6792 + }, + { + "epoch": 0.13588, + "grad_norm": 1.4746187925338745, + "learning_rate": 1.9921671137370116e-05, + "loss": 0.1598, + "step": 6794 + }, + { + "epoch": 0.13592, + "grad_norm": 1.0541317462921143, + "learning_rate": 1.9921496622718556e-05, + "loss": 0.294, + "step": 6796 + }, + { + "epoch": 0.13596, + "grad_norm": 1.5656485557556152, + "learning_rate": 1.9921321914642307e-05, + "loss": 0.2352, + "step": 6798 + }, + { + "epoch": 0.136, + "grad_norm": 2.10439395904541, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.3674, + "step": 6800 + }, + { + "epoch": 0.13604, + "grad_norm": 2.4659817218780518, + "learning_rate": 1.9920971918229385e-05, + "loss": 0.2625, + "step": 6802 + }, + { + "epoch": 0.13608, + "grad_norm": 2.050820827484131, + "learning_rate": 1.9920796629899534e-05, + "loss": 0.3409, + "step": 6804 + }, + { + "epoch": 0.13612, + "grad_norm": 1.0744448900222778, + "learning_rate": 1.9920621148158647e-05, + "loss": 0.1308, + "step": 6806 + }, + { + "epoch": 0.13616, + "grad_norm": 1.4173473119735718, + "learning_rate": 1.992044547301014e-05, + "loss": 0.2695, + "step": 6808 + }, + { + "epoch": 0.1362, + "grad_norm": 1.3819352388381958, + "learning_rate": 1.9920269604457444e-05, + "loss": 0.2569, + "step": 6810 + }, + { + "epoch": 0.13624, + "grad_norm": 2.118929386138916, + "learning_rate": 1.9920093542503983e-05, + "loss": 0.3146, + "step": 6812 + }, + { + "epoch": 0.13628, + "grad_norm": 1.198529601097107, + "learning_rate": 1.9919917287153194e-05, + "loss": 0.1595, + "step": 6814 + }, + { + "epoch": 0.13632, + "grad_norm": 1.2622236013412476, + "learning_rate": 1.9919740838408506e-05, + "loss": 0.1849, + "step": 6816 + }, + { + "epoch": 0.13636, + "grad_norm": 1.749328374862671, + "learning_rate": 1.9919564196273366e-05, + "loss": 0.2429, + "step": 6818 + }, + { + "epoch": 0.1364, + "grad_norm": 0.6512763500213623, + "learning_rate": 1.9919387360751216e-05, + "loss": 0.1146, + "step": 6820 + }, + { + "epoch": 0.13644, + "grad_norm": 1.574957251548767, + "learning_rate": 1.99192103318455e-05, + "loss": 0.2222, + "step": 6822 + }, + { + "epoch": 0.13648, + "grad_norm": 1.5049487352371216, + "learning_rate": 1.9919033109559677e-05, + "loss": 0.2433, + "step": 6824 + }, + { + "epoch": 0.13652, + "grad_norm": 1.5265933275222778, + "learning_rate": 1.991885569389719e-05, + "loss": 0.1882, + "step": 6826 + }, + { + "epoch": 0.13656, + "grad_norm": 1.6151199340820312, + "learning_rate": 1.9918678084861506e-05, + "loss": 0.2102, + "step": 6828 + }, + { + "epoch": 0.1366, + "grad_norm": 0.7941362857818604, + "learning_rate": 1.991850028245609e-05, + "loss": 0.1493, + "step": 6830 + }, + { + "epoch": 0.13664, + "grad_norm": 1.1703451871871948, + "learning_rate": 1.9918322286684402e-05, + "loss": 0.1304, + "step": 6832 + }, + { + "epoch": 0.13668, + "grad_norm": 1.2031986713409424, + "learning_rate": 1.9918144097549917e-05, + "loss": 0.2179, + "step": 6834 + }, + { + "epoch": 0.13672, + "grad_norm": 0.9719987511634827, + "learning_rate": 1.9917965715056106e-05, + "loss": 0.1247, + "step": 6836 + }, + { + "epoch": 0.13676, + "grad_norm": 1.7932404279708862, + "learning_rate": 1.9917787139206445e-05, + "loss": 0.178, + "step": 6838 + }, + { + "epoch": 0.1368, + "grad_norm": 1.0850495100021362, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.1712, + "step": 6840 + }, + { + "epoch": 0.13684, + "grad_norm": 4.244604587554932, + "learning_rate": 1.991742940745351e-05, + "loss": 0.3851, + "step": 6842 + }, + { + "epoch": 0.13688, + "grad_norm": 1.813978910446167, + "learning_rate": 1.991725025155721e-05, + "loss": 0.2594, + "step": 6844 + }, + { + "epoch": 0.13692, + "grad_norm": 3.524035692214966, + "learning_rate": 1.9917070902319012e-05, + "loss": 0.3806, + "step": 6846 + }, + { + "epoch": 0.13696, + "grad_norm": 1.3102911710739136, + "learning_rate": 1.991689135974241e-05, + "loss": 0.3322, + "step": 6848 + }, + { + "epoch": 0.137, + "grad_norm": 1.0932916402816772, + "learning_rate": 1.9916711623830904e-05, + "loss": 0.1112, + "step": 6850 + }, + { + "epoch": 0.13704, + "grad_norm": 2.890202760696411, + "learning_rate": 1.9916531694588002e-05, + "loss": 0.5519, + "step": 6852 + }, + { + "epoch": 0.13708, + "grad_norm": 2.14618182182312, + "learning_rate": 1.991635157201721e-05, + "loss": 0.4247, + "step": 6854 + }, + { + "epoch": 0.13712, + "grad_norm": 1.0605123043060303, + "learning_rate": 1.9916171256122036e-05, + "loss": 0.2155, + "step": 6856 + }, + { + "epoch": 0.13716, + "grad_norm": 1.134007215499878, + "learning_rate": 1.9915990746906e-05, + "loss": 0.1186, + "step": 6858 + }, + { + "epoch": 0.1372, + "grad_norm": 0.9600256085395813, + "learning_rate": 1.9915810044372618e-05, + "loss": 0.1532, + "step": 6860 + }, + { + "epoch": 0.13724, + "grad_norm": 1.7783749103546143, + "learning_rate": 1.9915629148525413e-05, + "loss": 0.3148, + "step": 6862 + }, + { + "epoch": 0.13728, + "grad_norm": 1.6374971866607666, + "learning_rate": 1.9915448059367916e-05, + "loss": 0.2447, + "step": 6864 + }, + { + "epoch": 0.13732, + "grad_norm": 1.1923800706863403, + "learning_rate": 1.9915266776903653e-05, + "loss": 0.1692, + "step": 6866 + }, + { + "epoch": 0.13736, + "grad_norm": 0.7323181629180908, + "learning_rate": 1.9915085301136158e-05, + "loss": 0.1696, + "step": 6868 + }, + { + "epoch": 0.1374, + "grad_norm": 0.9131972193717957, + "learning_rate": 1.9914903632068975e-05, + "loss": 0.1816, + "step": 6870 + }, + { + "epoch": 0.13744, + "grad_norm": 1.1867367029190063, + "learning_rate": 1.9914721769705637e-05, + "loss": 0.1678, + "step": 6872 + }, + { + "epoch": 0.13748, + "grad_norm": 1.5564897060394287, + "learning_rate": 1.9914539714049693e-05, + "loss": 0.3361, + "step": 6874 + }, + { + "epoch": 0.13752, + "grad_norm": 1.0021207332611084, + "learning_rate": 1.9914357465104696e-05, + "loss": 0.1521, + "step": 6876 + }, + { + "epoch": 0.13756, + "grad_norm": 0.713296115398407, + "learning_rate": 1.9914175022874197e-05, + "loss": 0.1412, + "step": 6878 + }, + { + "epoch": 0.1376, + "grad_norm": 1.5466607809066772, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.1937, + "step": 6880 + }, + { + "epoch": 0.13764, + "grad_norm": 1.342697262763977, + "learning_rate": 1.9913809558570914e-05, + "loss": 0.1846, + "step": 6882 + }, + { + "epoch": 0.13768, + "grad_norm": 1.1760914325714111, + "learning_rate": 1.991362653650526e-05, + "loss": 0.2512, + "step": 6884 + }, + { + "epoch": 0.13772, + "grad_norm": 1.941088080406189, + "learning_rate": 1.991344332116835e-05, + "loss": 0.2225, + "step": 6886 + }, + { + "epoch": 0.13776, + "grad_norm": 2.2317111492156982, + "learning_rate": 1.9913259912563762e-05, + "loss": 0.1886, + "step": 6888 + }, + { + "epoch": 0.1378, + "grad_norm": 1.0578317642211914, + "learning_rate": 1.9913076310695068e-05, + "loss": 0.2769, + "step": 6890 + }, + { + "epoch": 0.13784, + "grad_norm": 2.391174554824829, + "learning_rate": 1.9912892515565846e-05, + "loss": 0.3466, + "step": 6892 + }, + { + "epoch": 0.13788, + "grad_norm": 1.1865803003311157, + "learning_rate": 1.991270852717968e-05, + "loss": 0.1882, + "step": 6894 + }, + { + "epoch": 0.13792, + "grad_norm": 0.8276405930519104, + "learning_rate": 1.9912524345540164e-05, + "loss": 0.1649, + "step": 6896 + }, + { + "epoch": 0.13796, + "grad_norm": 0.629561185836792, + "learning_rate": 1.9912339970650876e-05, + "loss": 0.0909, + "step": 6898 + }, + { + "epoch": 0.138, + "grad_norm": 1.5040128231048584, + "learning_rate": 1.991215540251542e-05, + "loss": 0.1617, + "step": 6900 + }, + { + "epoch": 0.13804, + "grad_norm": 0.7158666253089905, + "learning_rate": 1.991197064113739e-05, + "loss": 0.1196, + "step": 6902 + }, + { + "epoch": 0.13808, + "grad_norm": 1.751388430595398, + "learning_rate": 1.991178568652039e-05, + "loss": 0.2433, + "step": 6904 + }, + { + "epoch": 0.13812, + "grad_norm": 0.6892380714416504, + "learning_rate": 1.9911600538668025e-05, + "loss": 0.1006, + "step": 6906 + }, + { + "epoch": 0.13816, + "grad_norm": 2.0771663188934326, + "learning_rate": 1.9911415197583904e-05, + "loss": 0.3734, + "step": 6908 + }, + { + "epoch": 0.1382, + "grad_norm": 0.9298422932624817, + "learning_rate": 1.991122966327164e-05, + "loss": 0.1001, + "step": 6910 + }, + { + "epoch": 0.13824, + "grad_norm": 1.5095725059509277, + "learning_rate": 1.9911043935734855e-05, + "loss": 0.2959, + "step": 6912 + }, + { + "epoch": 0.13828, + "grad_norm": 0.9526621103286743, + "learning_rate": 1.9910858014977163e-05, + "loss": 0.1124, + "step": 6914 + }, + { + "epoch": 0.13832, + "grad_norm": 1.5418477058410645, + "learning_rate": 1.9910671901002193e-05, + "loss": 0.1301, + "step": 6916 + }, + { + "epoch": 0.13836, + "grad_norm": 1.6323999166488647, + "learning_rate": 1.991048559381357e-05, + "loss": 0.1648, + "step": 6918 + }, + { + "epoch": 0.1384, + "grad_norm": 4.251992225646973, + "learning_rate": 1.991029909341493e-05, + "loss": 0.3435, + "step": 6920 + }, + { + "epoch": 0.13844, + "grad_norm": 0.8094559907913208, + "learning_rate": 1.9910112399809906e-05, + "loss": 0.0603, + "step": 6922 + }, + { + "epoch": 0.13848, + "grad_norm": 1.1655449867248535, + "learning_rate": 1.990992551300214e-05, + "loss": 0.16, + "step": 6924 + }, + { + "epoch": 0.13852, + "grad_norm": 0.43538451194763184, + "learning_rate": 1.990973843299527e-05, + "loss": 0.0686, + "step": 6926 + }, + { + "epoch": 0.13856, + "grad_norm": 2.072685480117798, + "learning_rate": 1.990955115979295e-05, + "loss": 0.1403, + "step": 6928 + }, + { + "epoch": 0.1386, + "grad_norm": 2.933349132537842, + "learning_rate": 1.9909363693398828e-05, + "loss": 0.5181, + "step": 6930 + }, + { + "epoch": 0.13864, + "grad_norm": 4.227395534515381, + "learning_rate": 1.990917603381656e-05, + "loss": 0.2764, + "step": 6932 + }, + { + "epoch": 0.13868, + "grad_norm": 0.6417645812034607, + "learning_rate": 1.9908988181049805e-05, + "loss": 0.1883, + "step": 6934 + }, + { + "epoch": 0.13872, + "grad_norm": 1.0671427249908447, + "learning_rate": 1.990880013510222e-05, + "loss": 0.0772, + "step": 6936 + }, + { + "epoch": 0.13876, + "grad_norm": 0.45621365308761597, + "learning_rate": 1.990861189597748e-05, + "loss": 0.2782, + "step": 6938 + }, + { + "epoch": 0.1388, + "grad_norm": 2.587555408477783, + "learning_rate": 1.9908423463679246e-05, + "loss": 0.5185, + "step": 6940 + }, + { + "epoch": 0.13884, + "grad_norm": 0.37318602204322815, + "learning_rate": 1.9908234838211197e-05, + "loss": 0.2593, + "step": 6942 + }, + { + "epoch": 0.13888, + "grad_norm": 0.5014750361442566, + "learning_rate": 1.990804601957701e-05, + "loss": 0.1138, + "step": 6944 + }, + { + "epoch": 0.13892, + "grad_norm": 0.7352229356765747, + "learning_rate": 1.9907857007780362e-05, + "loss": 0.1195, + "step": 6946 + }, + { + "epoch": 0.13896, + "grad_norm": 0.5980547070503235, + "learning_rate": 1.990766780282494e-05, + "loss": 0.0656, + "step": 6948 + }, + { + "epoch": 0.139, + "grad_norm": 2.0031843185424805, + "learning_rate": 1.9907478404714438e-05, + "loss": 0.4097, + "step": 6950 + }, + { + "epoch": 0.13904, + "grad_norm": 0.9060186743736267, + "learning_rate": 1.990728881345254e-05, + "loss": 0.1078, + "step": 6952 + }, + { + "epoch": 0.13908, + "grad_norm": 1.1324093341827393, + "learning_rate": 1.990709902904295e-05, + "loss": 0.1071, + "step": 6954 + }, + { + "epoch": 0.13912, + "grad_norm": 0.9198927879333496, + "learning_rate": 1.990690905148936e-05, + "loss": 0.2554, + "step": 6956 + }, + { + "epoch": 0.13916, + "grad_norm": 2.025151252746582, + "learning_rate": 1.9906718880795477e-05, + "loss": 0.3958, + "step": 6958 + }, + { + "epoch": 0.1392, + "grad_norm": 1.2877806425094604, + "learning_rate": 1.990652851696501e-05, + "loss": 0.1677, + "step": 6960 + }, + { + "epoch": 0.13924, + "grad_norm": 1.9269400835037231, + "learning_rate": 1.990633796000167e-05, + "loss": 0.3948, + "step": 6962 + }, + { + "epoch": 0.13928, + "grad_norm": 1.5509743690490723, + "learning_rate": 1.990614720990917e-05, + "loss": 0.29, + "step": 6964 + }, + { + "epoch": 0.13932, + "grad_norm": 1.226305365562439, + "learning_rate": 1.9905956266691232e-05, + "loss": 0.144, + "step": 6966 + }, + { + "epoch": 0.13936, + "grad_norm": 1.7284523248672485, + "learning_rate": 1.9905765130351577e-05, + "loss": 0.2563, + "step": 6968 + }, + { + "epoch": 0.1394, + "grad_norm": 1.2197706699371338, + "learning_rate": 1.990557380089393e-05, + "loss": 0.1757, + "step": 6970 + }, + { + "epoch": 0.13944, + "grad_norm": 0.757165789604187, + "learning_rate": 1.9905382278322024e-05, + "loss": 0.0854, + "step": 6972 + }, + { + "epoch": 0.13948, + "grad_norm": 0.8675475120544434, + "learning_rate": 1.990519056263959e-05, + "loss": 0.3368, + "step": 6974 + }, + { + "epoch": 0.13952, + "grad_norm": 2.7452638149261475, + "learning_rate": 1.990499865385036e-05, + "loss": 0.3264, + "step": 6976 + }, + { + "epoch": 0.13956, + "grad_norm": 2.828329563140869, + "learning_rate": 1.9904806551958093e-05, + "loss": 0.3351, + "step": 6978 + }, + { + "epoch": 0.1396, + "grad_norm": 0.5107519030570984, + "learning_rate": 1.9904614256966514e-05, + "loss": 0.3316, + "step": 6980 + }, + { + "epoch": 0.13964, + "grad_norm": 1.8627370595932007, + "learning_rate": 1.9904421768879385e-05, + "loss": 0.3663, + "step": 6982 + }, + { + "epoch": 0.13968, + "grad_norm": 1.515112280845642, + "learning_rate": 1.9904229087700454e-05, + "loss": 0.2449, + "step": 6984 + }, + { + "epoch": 0.13972, + "grad_norm": 0.9685657620429993, + "learning_rate": 1.9904036213433475e-05, + "loss": 0.1862, + "step": 6986 + }, + { + "epoch": 0.13976, + "grad_norm": 1.5764325857162476, + "learning_rate": 1.9903843146082214e-05, + "loss": 0.2095, + "step": 6988 + }, + { + "epoch": 0.1398, + "grad_norm": 0.922076940536499, + "learning_rate": 1.990364988565043e-05, + "loss": 0.1254, + "step": 6990 + }, + { + "epoch": 0.13984, + "grad_norm": 1.8516559600830078, + "learning_rate": 1.9903456432141898e-05, + "loss": 0.2194, + "step": 6992 + }, + { + "epoch": 0.13988, + "grad_norm": 2.1178102493286133, + "learning_rate": 1.9903262785560377e-05, + "loss": 0.2697, + "step": 6994 + }, + { + "epoch": 0.13992, + "grad_norm": 0.42493322491645813, + "learning_rate": 1.9903068945909653e-05, + "loss": 0.144, + "step": 6996 + }, + { + "epoch": 0.13996, + "grad_norm": 1.0228397846221924, + "learning_rate": 1.9902874913193503e-05, + "loss": 0.3847, + "step": 6998 + }, + { + "epoch": 0.14, + "grad_norm": 0.4945867359638214, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.0652, + "step": 7000 + }, + { + "epoch": 0.14004, + "grad_norm": 0.7158036231994629, + "learning_rate": 1.990248626858005e-05, + "loss": 0.0815, + "step": 7002 + }, + { + "epoch": 0.14008, + "grad_norm": 2.4744040966033936, + "learning_rate": 1.9902291656690325e-05, + "loss": 0.552, + "step": 7004 + }, + { + "epoch": 0.14012, + "grad_norm": 0.7460100054740906, + "learning_rate": 1.990209685175033e-05, + "loss": 0.0822, + "step": 7006 + }, + { + "epoch": 0.14016, + "grad_norm": 2.009404420852661, + "learning_rate": 1.9901901853763857e-05, + "loss": 0.4921, + "step": 7008 + }, + { + "epoch": 0.1402, + "grad_norm": 1.3411728143692017, + "learning_rate": 1.990170666273471e-05, + "loss": 0.1789, + "step": 7010 + }, + { + "epoch": 0.14024, + "grad_norm": 0.802631676197052, + "learning_rate": 1.9901511278666693e-05, + "loss": 0.1181, + "step": 7012 + }, + { + "epoch": 0.14028, + "grad_norm": 0.8815609812736511, + "learning_rate": 1.9901315701563617e-05, + "loss": 0.104, + "step": 7014 + }, + { + "epoch": 0.14032, + "grad_norm": 1.9065686464309692, + "learning_rate": 1.9901119931429294e-05, + "loss": 0.256, + "step": 7016 + }, + { + "epoch": 0.14036, + "grad_norm": 1.683367133140564, + "learning_rate": 1.990092396826754e-05, + "loss": 0.1918, + "step": 7018 + }, + { + "epoch": 0.1404, + "grad_norm": 1.8165520429611206, + "learning_rate": 1.9900727812082177e-05, + "loss": 0.2496, + "step": 7020 + }, + { + "epoch": 0.14044, + "grad_norm": 0.6840444803237915, + "learning_rate": 1.9900531462877026e-05, + "loss": 0.1601, + "step": 7022 + }, + { + "epoch": 0.14048, + "grad_norm": 1.7153079509735107, + "learning_rate": 1.990033492065592e-05, + "loss": 0.2164, + "step": 7024 + }, + { + "epoch": 0.14052, + "grad_norm": 1.1363391876220703, + "learning_rate": 1.9900138185422685e-05, + "loss": 0.1604, + "step": 7026 + }, + { + "epoch": 0.14056, + "grad_norm": 1.6034231185913086, + "learning_rate": 1.989994125718116e-05, + "loss": 0.2659, + "step": 7028 + }, + { + "epoch": 0.1406, + "grad_norm": 1.175681710243225, + "learning_rate": 1.989974413593518e-05, + "loss": 0.2043, + "step": 7030 + }, + { + "epoch": 0.14064, + "grad_norm": 1.2691913843154907, + "learning_rate": 1.9899546821688597e-05, + "loss": 0.2475, + "step": 7032 + }, + { + "epoch": 0.14068, + "grad_norm": 1.6063746213912964, + "learning_rate": 1.989934931444525e-05, + "loss": 0.2106, + "step": 7034 + }, + { + "epoch": 0.14072, + "grad_norm": 0.9494057893753052, + "learning_rate": 1.9899151614208988e-05, + "loss": 0.2038, + "step": 7036 + }, + { + "epoch": 0.14076, + "grad_norm": 2.0747504234313965, + "learning_rate": 1.9898953720983672e-05, + "loss": 0.2352, + "step": 7038 + }, + { + "epoch": 0.1408, + "grad_norm": 1.8475919961929321, + "learning_rate": 1.989875563477316e-05, + "loss": 0.2263, + "step": 7040 + }, + { + "epoch": 0.14084, + "grad_norm": 1.6607407331466675, + "learning_rate": 1.9898557355581306e-05, + "loss": 0.3034, + "step": 7042 + }, + { + "epoch": 0.14088, + "grad_norm": 1.4096739292144775, + "learning_rate": 1.9898358883411983e-05, + "loss": 0.2225, + "step": 7044 + }, + { + "epoch": 0.14092, + "grad_norm": 1.4769234657287598, + "learning_rate": 1.9898160218269056e-05, + "loss": 0.2225, + "step": 7046 + }, + { + "epoch": 0.14096, + "grad_norm": 1.2396906614303589, + "learning_rate": 1.9897961360156398e-05, + "loss": 0.2128, + "step": 7048 + }, + { + "epoch": 0.141, + "grad_norm": 1.4089510440826416, + "learning_rate": 1.989776230907789e-05, + "loss": 0.2235, + "step": 7050 + }, + { + "epoch": 0.14104, + "grad_norm": 1.398560881614685, + "learning_rate": 1.9897563065037412e-05, + "loss": 0.2235, + "step": 7052 + }, + { + "epoch": 0.14108, + "grad_norm": 1.6806708574295044, + "learning_rate": 1.9897363628038842e-05, + "loss": 0.2431, + "step": 7054 + }, + { + "epoch": 0.14112, + "grad_norm": 1.9003552198410034, + "learning_rate": 1.9897163998086076e-05, + "loss": 0.1855, + "step": 7056 + }, + { + "epoch": 0.14116, + "grad_norm": 0.694733202457428, + "learning_rate": 1.9896964175183e-05, + "loss": 0.3527, + "step": 7058 + }, + { + "epoch": 0.1412, + "grad_norm": 0.8484461307525635, + "learning_rate": 1.989676415933351e-05, + "loss": 0.2617, + "step": 7060 + }, + { + "epoch": 0.14124, + "grad_norm": 0.610732913017273, + "learning_rate": 1.9896563950541516e-05, + "loss": 0.0684, + "step": 7062 + }, + { + "epoch": 0.14128, + "grad_norm": 3.027137517929077, + "learning_rate": 1.9896363548810905e-05, + "loss": 0.518, + "step": 7064 + }, + { + "epoch": 0.14132, + "grad_norm": 2.2186667919158936, + "learning_rate": 1.9896162954145593e-05, + "loss": 0.3807, + "step": 7066 + }, + { + "epoch": 0.14136, + "grad_norm": 1.4677462577819824, + "learning_rate": 1.9895962166549493e-05, + "loss": 0.2324, + "step": 7068 + }, + { + "epoch": 0.1414, + "grad_norm": 1.8324024677276611, + "learning_rate": 1.989576118602651e-05, + "loss": 0.3173, + "step": 7070 + }, + { + "epoch": 0.14144, + "grad_norm": 0.5959767699241638, + "learning_rate": 1.9895560012580574e-05, + "loss": 0.1226, + "step": 7072 + }, + { + "epoch": 0.14148, + "grad_norm": 0.9090994596481323, + "learning_rate": 1.9895358646215596e-05, + "loss": 0.2488, + "step": 7074 + }, + { + "epoch": 0.14152, + "grad_norm": 1.115415334701538, + "learning_rate": 1.989515708693551e-05, + "loss": 0.1094, + "step": 7076 + }, + { + "epoch": 0.14156, + "grad_norm": 0.7965419888496399, + "learning_rate": 1.989495533474424e-05, + "loss": 0.2222, + "step": 7078 + }, + { + "epoch": 0.1416, + "grad_norm": 0.6392717361450195, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.2192, + "step": 7080 + }, + { + "epoch": 0.14164, + "grad_norm": 0.9550837874412537, + "learning_rate": 1.9894551251643894e-05, + "loss": 0.2416, + "step": 7082 + }, + { + "epoch": 0.14168, + "grad_norm": 1.8938186168670654, + "learning_rate": 1.9894348920742694e-05, + "loss": 0.2783, + "step": 7084 + }, + { + "epoch": 0.14172, + "grad_norm": 1.575872778892517, + "learning_rate": 1.989414639694607e-05, + "loss": 0.3149, + "step": 7086 + }, + { + "epoch": 0.14176, + "grad_norm": 1.5360054969787598, + "learning_rate": 1.9893943680257964e-05, + "loss": 0.1862, + "step": 7088 + }, + { + "epoch": 0.1418, + "grad_norm": 1.5936683416366577, + "learning_rate": 1.9893740770682334e-05, + "loss": 0.2227, + "step": 7090 + }, + { + "epoch": 0.14184, + "grad_norm": 0.9045202732086182, + "learning_rate": 1.9893537668223136e-05, + "loss": 0.1751, + "step": 7092 + }, + { + "epoch": 0.14188, + "grad_norm": 1.3264273405075073, + "learning_rate": 1.9893334372884325e-05, + "loss": 0.1305, + "step": 7094 + }, + { + "epoch": 0.14192, + "grad_norm": 2.6348557472229004, + "learning_rate": 1.989313088466987e-05, + "loss": 0.3836, + "step": 7096 + }, + { + "epoch": 0.14196, + "grad_norm": 1.0747231245040894, + "learning_rate": 1.989292720358373e-05, + "loss": 0.1531, + "step": 7098 + }, + { + "epoch": 0.142, + "grad_norm": 2.131669282913208, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.1977, + "step": 7100 + }, + { + "epoch": 0.14204, + "grad_norm": 1.1489566564559937, + "learning_rate": 1.9892519262812306e-05, + "loss": 0.2282, + "step": 7102 + }, + { + "epoch": 0.14208, + "grad_norm": 0.8811477422714233, + "learning_rate": 1.9892315003134968e-05, + "loss": 0.0949, + "step": 7104 + }, + { + "epoch": 0.14212, + "grad_norm": 2.6156458854675293, + "learning_rate": 1.9892110550601858e-05, + "loss": 0.4252, + "step": 7106 + }, + { + "epoch": 0.14216, + "grad_norm": 2.0817933082580566, + "learning_rate": 1.989190590521696e-05, + "loss": 0.2598, + "step": 7108 + }, + { + "epoch": 0.1422, + "grad_norm": 1.8702760934829712, + "learning_rate": 1.9891701066984264e-05, + "loss": 0.218, + "step": 7110 + }, + { + "epoch": 0.14224, + "grad_norm": 1.2772728204727173, + "learning_rate": 1.989149603590776e-05, + "loss": 0.16, + "step": 7112 + }, + { + "epoch": 0.14228, + "grad_norm": 1.6019240617752075, + "learning_rate": 1.989129081199145e-05, + "loss": 0.2772, + "step": 7114 + }, + { + "epoch": 0.14232, + "grad_norm": 0.6891078352928162, + "learning_rate": 1.9891085395239335e-05, + "loss": 0.1126, + "step": 7116 + }, + { + "epoch": 0.14236, + "grad_norm": 1.4504890441894531, + "learning_rate": 1.9890879785655417e-05, + "loss": 0.2666, + "step": 7118 + }, + { + "epoch": 0.1424, + "grad_norm": 1.2629121541976929, + "learning_rate": 1.9890673983243708e-05, + "loss": 0.1845, + "step": 7120 + }, + { + "epoch": 0.14244, + "grad_norm": 2.2475764751434326, + "learning_rate": 1.9890467988008214e-05, + "loss": 0.2235, + "step": 7122 + }, + { + "epoch": 0.14248, + "grad_norm": 0.9451645016670227, + "learning_rate": 1.9890261799952954e-05, + "loss": 0.1372, + "step": 7124 + }, + { + "epoch": 0.14252, + "grad_norm": 1.4528452157974243, + "learning_rate": 1.989005541908195e-05, + "loss": 0.1713, + "step": 7126 + }, + { + "epoch": 0.14256, + "grad_norm": 1.2332518100738525, + "learning_rate": 1.9889848845399226e-05, + "loss": 0.194, + "step": 7128 + }, + { + "epoch": 0.1426, + "grad_norm": 1.4136275053024292, + "learning_rate": 1.9889642078908805e-05, + "loss": 0.1464, + "step": 7130 + }, + { + "epoch": 0.14264, + "grad_norm": 1.4748181104660034, + "learning_rate": 1.988943511961472e-05, + "loss": 0.3474, + "step": 7132 + }, + { + "epoch": 0.14268, + "grad_norm": 0.60792076587677, + "learning_rate": 1.9889227967521005e-05, + "loss": 0.1501, + "step": 7134 + }, + { + "epoch": 0.14272, + "grad_norm": 0.6666139364242554, + "learning_rate": 1.98890206226317e-05, + "loss": 0.1543, + "step": 7136 + }, + { + "epoch": 0.14276, + "grad_norm": 1.4386260509490967, + "learning_rate": 1.988881308495085e-05, + "loss": 0.1121, + "step": 7138 + }, + { + "epoch": 0.1428, + "grad_norm": 1.0121461153030396, + "learning_rate": 1.9888605354482494e-05, + "loss": 0.1382, + "step": 7140 + }, + { + "epoch": 0.14284, + "grad_norm": 2.6398661136627197, + "learning_rate": 1.9888397431230687e-05, + "loss": 0.5345, + "step": 7142 + }, + { + "epoch": 0.14288, + "grad_norm": 0.8884559273719788, + "learning_rate": 1.988818931519948e-05, + "loss": 0.1055, + "step": 7144 + }, + { + "epoch": 0.14292, + "grad_norm": 1.0857306718826294, + "learning_rate": 1.9887981006392932e-05, + "loss": 0.2279, + "step": 7146 + }, + { + "epoch": 0.14296, + "grad_norm": 0.7868009805679321, + "learning_rate": 1.9887772504815102e-05, + "loss": 0.1061, + "step": 7148 + }, + { + "epoch": 0.143, + "grad_norm": 0.9048712253570557, + "learning_rate": 1.988756381047006e-05, + "loss": 0.1405, + "step": 7150 + }, + { + "epoch": 0.14304, + "grad_norm": 2.254826307296753, + "learning_rate": 1.988735492336187e-05, + "loss": 0.2426, + "step": 7152 + }, + { + "epoch": 0.14308, + "grad_norm": 0.38993167877197266, + "learning_rate": 1.9887145843494604e-05, + "loss": 0.0935, + "step": 7154 + }, + { + "epoch": 0.14312, + "grad_norm": 0.3567139506340027, + "learning_rate": 1.9886936570872342e-05, + "loss": 0.0344, + "step": 7156 + }, + { + "epoch": 0.14316, + "grad_norm": 0.2452547252178192, + "learning_rate": 1.9886727105499157e-05, + "loss": 0.3124, + "step": 7158 + }, + { + "epoch": 0.1432, + "grad_norm": 0.28439682722091675, + "learning_rate": 1.988651744737914e-05, + "loss": 0.4829, + "step": 7160 + }, + { + "epoch": 0.14324, + "grad_norm": 0.8082146644592285, + "learning_rate": 1.9886307596516376e-05, + "loss": 0.0746, + "step": 7162 + }, + { + "epoch": 0.14328, + "grad_norm": 2.8341269493103027, + "learning_rate": 1.9886097552914957e-05, + "loss": 0.7675, + "step": 7164 + }, + { + "epoch": 0.14332, + "grad_norm": 1.534420132637024, + "learning_rate": 1.9885887316578974e-05, + "loss": 0.1394, + "step": 7166 + }, + { + "epoch": 0.14336, + "grad_norm": 1.8793796300888062, + "learning_rate": 1.988567688751253e-05, + "loss": 0.202, + "step": 7168 + }, + { + "epoch": 0.1434, + "grad_norm": 1.4553149938583374, + "learning_rate": 1.9885466265719723e-05, + "loss": 0.29, + "step": 7170 + }, + { + "epoch": 0.14344, + "grad_norm": 1.057927131652832, + "learning_rate": 1.9885255451204664e-05, + "loss": 0.1276, + "step": 7172 + }, + { + "epoch": 0.14348, + "grad_norm": 1.0419131517410278, + "learning_rate": 1.9885044443971456e-05, + "loss": 0.1811, + "step": 7174 + }, + { + "epoch": 0.14352, + "grad_norm": 0.7312352657318115, + "learning_rate": 1.9884833244024226e-05, + "loss": 0.2351, + "step": 7176 + }, + { + "epoch": 0.14356, + "grad_norm": 1.5083743333816528, + "learning_rate": 1.9884621851367076e-05, + "loss": 0.29, + "step": 7178 + }, + { + "epoch": 0.1436, + "grad_norm": 1.1510227918624878, + "learning_rate": 1.9884410266004134e-05, + "loss": 0.1618, + "step": 7180 + }, + { + "epoch": 0.14364, + "grad_norm": 0.9517605304718018, + "learning_rate": 1.988419848793953e-05, + "loss": 0.19, + "step": 7182 + }, + { + "epoch": 0.14368, + "grad_norm": 1.0672073364257812, + "learning_rate": 1.9883986517177382e-05, + "loss": 0.1675, + "step": 7184 + }, + { + "epoch": 0.14372, + "grad_norm": 1.1760475635528564, + "learning_rate": 1.9883774353721833e-05, + "loss": 0.2128, + "step": 7186 + }, + { + "epoch": 0.14376, + "grad_norm": 0.9910807609558105, + "learning_rate": 1.988356199757701e-05, + "loss": 0.152, + "step": 7188 + }, + { + "epoch": 0.1438, + "grad_norm": 1.2567788362503052, + "learning_rate": 1.988334944874706e-05, + "loss": 0.1933, + "step": 7190 + }, + { + "epoch": 0.14384, + "grad_norm": 1.3722747564315796, + "learning_rate": 1.9883136707236127e-05, + "loss": 0.1885, + "step": 7192 + }, + { + "epoch": 0.14388, + "grad_norm": 1.3267027139663696, + "learning_rate": 1.988292377304835e-05, + "loss": 0.1781, + "step": 7194 + }, + { + "epoch": 0.14392, + "grad_norm": 0.8960849046707153, + "learning_rate": 1.988271064618789e-05, + "loss": 0.1446, + "step": 7196 + }, + { + "epoch": 0.14396, + "grad_norm": 1.139778971672058, + "learning_rate": 1.9882497326658896e-05, + "loss": 0.1538, + "step": 7198 + }, + { + "epoch": 0.144, + "grad_norm": 1.4000202417373657, + "learning_rate": 1.988228381446553e-05, + "loss": 0.1686, + "step": 7200 + }, + { + "epoch": 0.14404, + "grad_norm": 1.7004975080490112, + "learning_rate": 1.9882070109611954e-05, + "loss": 0.1719, + "step": 7202 + }, + { + "epoch": 0.14408, + "grad_norm": 15.360363960266113, + "learning_rate": 1.9881856212102332e-05, + "loss": 0.5957, + "step": 7204 + }, + { + "epoch": 0.14412, + "grad_norm": 0.5530562400817871, + "learning_rate": 1.9881642121940836e-05, + "loss": 0.1258, + "step": 7206 + }, + { + "epoch": 0.14416, + "grad_norm": 3.7289624214172363, + "learning_rate": 1.9881427839131637e-05, + "loss": 0.4618, + "step": 7208 + }, + { + "epoch": 0.1442, + "grad_norm": 2.1833536624908447, + "learning_rate": 1.988121336367892e-05, + "loss": 0.2881, + "step": 7210 + }, + { + "epoch": 0.14424, + "grad_norm": 0.7127093076705933, + "learning_rate": 1.9880998695586857e-05, + "loss": 0.1351, + "step": 7212 + }, + { + "epoch": 0.14428, + "grad_norm": 1.3099956512451172, + "learning_rate": 1.988078383485964e-05, + "loss": 0.2471, + "step": 7214 + }, + { + "epoch": 0.14432, + "grad_norm": 0.8684079051017761, + "learning_rate": 1.9880568781501454e-05, + "loss": 0.1918, + "step": 7216 + }, + { + "epoch": 0.14436, + "grad_norm": 1.206087350845337, + "learning_rate": 1.9880353535516494e-05, + "loss": 0.1258, + "step": 7218 + }, + { + "epoch": 0.1444, + "grad_norm": 1.1917811632156372, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.1272, + "step": 7220 + }, + { + "epoch": 0.14444, + "grad_norm": 2.0944066047668457, + "learning_rate": 1.9879922465683033e-05, + "loss": 0.2553, + "step": 7222 + }, + { + "epoch": 0.14448, + "grad_norm": 1.544853925704956, + "learning_rate": 1.987970664184294e-05, + "loss": 0.222, + "step": 7224 + }, + { + "epoch": 0.14452, + "grad_norm": 1.1884396076202393, + "learning_rate": 1.987949062539288e-05, + "loss": 0.176, + "step": 7226 + }, + { + "epoch": 0.14456, + "grad_norm": 1.5777769088745117, + "learning_rate": 1.9879274416337063e-05, + "loss": 0.2125, + "step": 7228 + }, + { + "epoch": 0.1446, + "grad_norm": 1.346128225326538, + "learning_rate": 1.9879058014679704e-05, + "loss": 0.1329, + "step": 7230 + }, + { + "epoch": 0.14464, + "grad_norm": 1.3025131225585938, + "learning_rate": 1.9878841420425023e-05, + "loss": 0.2775, + "step": 7232 + }, + { + "epoch": 0.14468, + "grad_norm": 1.3436505794525146, + "learning_rate": 1.9878624633577245e-05, + "loss": 0.1269, + "step": 7234 + }, + { + "epoch": 0.14472, + "grad_norm": 2.4166736602783203, + "learning_rate": 1.9878407654140592e-05, + "loss": 0.355, + "step": 7236 + }, + { + "epoch": 0.14476, + "grad_norm": 0.981257438659668, + "learning_rate": 1.9878190482119297e-05, + "loss": 0.1033, + "step": 7238 + }, + { + "epoch": 0.1448, + "grad_norm": 0.41270971298217773, + "learning_rate": 1.987797311751759e-05, + "loss": 0.058, + "step": 7240 + }, + { + "epoch": 0.14484, + "grad_norm": 1.6706019639968872, + "learning_rate": 1.9877755560339716e-05, + "loss": 0.1633, + "step": 7242 + }, + { + "epoch": 0.14488, + "grad_norm": 0.6208446621894836, + "learning_rate": 1.987753781058991e-05, + "loss": 0.1437, + "step": 7244 + }, + { + "epoch": 0.14492, + "grad_norm": 2.3433594703674316, + "learning_rate": 1.9877319868272416e-05, + "loss": 0.2874, + "step": 7246 + }, + { + "epoch": 0.14496, + "grad_norm": 2.262866258621216, + "learning_rate": 1.987710173339149e-05, + "loss": 0.2091, + "step": 7248 + }, + { + "epoch": 0.145, + "grad_norm": 0.2558308243751526, + "learning_rate": 1.9876883405951378e-05, + "loss": 0.0781, + "step": 7250 + }, + { + "epoch": 0.14504, + "grad_norm": 1.660506010055542, + "learning_rate": 1.987666488595634e-05, + "loss": 0.1164, + "step": 7252 + }, + { + "epoch": 0.14508, + "grad_norm": 3.5959184169769287, + "learning_rate": 1.9876446173410637e-05, + "loss": 0.5307, + "step": 7254 + }, + { + "epoch": 0.14512, + "grad_norm": 0.12045604735612869, + "learning_rate": 1.987622726831853e-05, + "loss": 0.3068, + "step": 7256 + }, + { + "epoch": 0.14516, + "grad_norm": 3.590435028076172, + "learning_rate": 1.9876008170684287e-05, + "loss": 0.3921, + "step": 7258 + }, + { + "epoch": 0.1452, + "grad_norm": 1.7562494277954102, + "learning_rate": 1.9875788880512183e-05, + "loss": 0.3461, + "step": 7260 + }, + { + "epoch": 0.14524, + "grad_norm": 1.044297456741333, + "learning_rate": 1.9875569397806488e-05, + "loss": 0.3837, + "step": 7262 + }, + { + "epoch": 0.14528, + "grad_norm": 0.5974923968315125, + "learning_rate": 1.987534972257148e-05, + "loss": 0.2535, + "step": 7264 + }, + { + "epoch": 0.14532, + "grad_norm": 1.2392401695251465, + "learning_rate": 1.987512985481145e-05, + "loss": 0.1485, + "step": 7266 + }, + { + "epoch": 0.14536, + "grad_norm": 0.7384064793586731, + "learning_rate": 1.9874909794530677e-05, + "loss": 0.2104, + "step": 7268 + }, + { + "epoch": 0.1454, + "grad_norm": 0.9260364174842834, + "learning_rate": 1.9874689541733455e-05, + "loss": 0.1239, + "step": 7270 + }, + { + "epoch": 0.14544, + "grad_norm": 1.5844248533248901, + "learning_rate": 1.9874469096424075e-05, + "loss": 0.1749, + "step": 7272 + }, + { + "epoch": 0.14548, + "grad_norm": 0.5808306932449341, + "learning_rate": 1.9874248458606838e-05, + "loss": 0.0821, + "step": 7274 + }, + { + "epoch": 0.14552, + "grad_norm": 0.5566797256469727, + "learning_rate": 1.9874027628286042e-05, + "loss": 0.0729, + "step": 7276 + }, + { + "epoch": 0.14556, + "grad_norm": 2.64080810546875, + "learning_rate": 1.9873806605465994e-05, + "loss": 0.4605, + "step": 7278 + }, + { + "epoch": 0.1456, + "grad_norm": 1.2277653217315674, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.3667, + "step": 7280 + }, + { + "epoch": 0.14564, + "grad_norm": 0.571753978729248, + "learning_rate": 1.9873363982345384e-05, + "loss": 0.0653, + "step": 7282 + }, + { + "epoch": 0.14568, + "grad_norm": 0.5516448020935059, + "learning_rate": 1.987314238205345e-05, + "loss": 0.2696, + "step": 7284 + }, + { + "epoch": 0.14572, + "grad_norm": 1.348189353942871, + "learning_rate": 1.987292058927952e-05, + "loss": 0.1105, + "step": 7286 + }, + { + "epoch": 0.14576, + "grad_norm": 0.666982114315033, + "learning_rate": 1.9872698604027924e-05, + "loss": 0.0809, + "step": 7288 + }, + { + "epoch": 0.1458, + "grad_norm": 1.5055630207061768, + "learning_rate": 1.9872476426302983e-05, + "loss": 0.1601, + "step": 7290 + }, + { + "epoch": 0.14584, + "grad_norm": 2.397028684616089, + "learning_rate": 1.987225405610903e-05, + "loss": 0.2389, + "step": 7292 + }, + { + "epoch": 0.14588, + "grad_norm": 0.5460720658302307, + "learning_rate": 1.9872031493450404e-05, + "loss": 0.221, + "step": 7294 + }, + { + "epoch": 0.14592, + "grad_norm": 2.213996410369873, + "learning_rate": 1.987180873833144e-05, + "loss": 0.2781, + "step": 7296 + }, + { + "epoch": 0.14596, + "grad_norm": 2.2489614486694336, + "learning_rate": 1.9871585790756485e-05, + "loss": 0.2377, + "step": 7298 + }, + { + "epoch": 0.146, + "grad_norm": 1.0861976146697998, + "learning_rate": 1.987136265072988e-05, + "loss": 0.1238, + "step": 7300 + }, + { + "epoch": 0.14604, + "grad_norm": 1.3130056858062744, + "learning_rate": 1.987113931825598e-05, + "loss": 0.2179, + "step": 7302 + }, + { + "epoch": 0.14608, + "grad_norm": 1.9851852655410767, + "learning_rate": 1.9870915793339137e-05, + "loss": 0.233, + "step": 7304 + }, + { + "epoch": 0.14612, + "grad_norm": 1.8068102598190308, + "learning_rate": 1.9870692075983708e-05, + "loss": 0.2223, + "step": 7306 + }, + { + "epoch": 0.14616, + "grad_norm": 1.4866310358047485, + "learning_rate": 1.9870468166194056e-05, + "loss": 0.1465, + "step": 7308 + }, + { + "epoch": 0.1462, + "grad_norm": 2.2860567569732666, + "learning_rate": 1.987024406397454e-05, + "loss": 0.2664, + "step": 7310 + }, + { + "epoch": 0.14624, + "grad_norm": 2.17803692817688, + "learning_rate": 1.987001976932954e-05, + "loss": 0.2033, + "step": 7312 + }, + { + "epoch": 0.14628, + "grad_norm": 0.8793487548828125, + "learning_rate": 1.9869795282263423e-05, + "loss": 0.2932, + "step": 7314 + }, + { + "epoch": 0.14632, + "grad_norm": 2.452651262283325, + "learning_rate": 1.9869570602780563e-05, + "loss": 0.2562, + "step": 7316 + }, + { + "epoch": 0.14636, + "grad_norm": 3.5173466205596924, + "learning_rate": 1.9869345730885346e-05, + "loss": 0.5762, + "step": 7318 + }, + { + "epoch": 0.1464, + "grad_norm": 0.780754566192627, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.1856, + "step": 7320 + }, + { + "epoch": 0.14644, + "grad_norm": 2.0783402919769287, + "learning_rate": 1.986889540987537e-05, + "loss": 0.2236, + "step": 7322 + }, + { + "epoch": 0.14648, + "grad_norm": 0.8565492630004883, + "learning_rate": 1.9868669960769387e-05, + "loss": 0.175, + "step": 7324 + }, + { + "epoch": 0.14652, + "grad_norm": 2.4858486652374268, + "learning_rate": 1.9868444319268607e-05, + "loss": 0.4022, + "step": 7326 + }, + { + "epoch": 0.14656, + "grad_norm": 0.3716197907924652, + "learning_rate": 1.9868218485377424e-05, + "loss": 0.2919, + "step": 7328 + }, + { + "epoch": 0.1466, + "grad_norm": 2.158836603164673, + "learning_rate": 1.986799245910024e-05, + "loss": 0.3399, + "step": 7330 + }, + { + "epoch": 0.14664, + "grad_norm": 0.9811989068984985, + "learning_rate": 1.9867766240441463e-05, + "loss": 0.152, + "step": 7332 + }, + { + "epoch": 0.14668, + "grad_norm": 1.5945523977279663, + "learning_rate": 1.9867539829405503e-05, + "loss": 0.2927, + "step": 7334 + }, + { + "epoch": 0.14672, + "grad_norm": 1.0902562141418457, + "learning_rate": 1.9867313225996773e-05, + "loss": 0.176, + "step": 7336 + }, + { + "epoch": 0.14676, + "grad_norm": 0.9960134029388428, + "learning_rate": 1.9867086430219692e-05, + "loss": 0.1275, + "step": 7338 + }, + { + "epoch": 0.1468, + "grad_norm": 1.9633417129516602, + "learning_rate": 1.986685944207868e-05, + "loss": 0.3396, + "step": 7340 + }, + { + "epoch": 0.14684, + "grad_norm": 1.7703826427459717, + "learning_rate": 1.9866632261578168e-05, + "loss": 0.2183, + "step": 7342 + }, + { + "epoch": 0.14688, + "grad_norm": 0.9566783308982849, + "learning_rate": 1.9866404888722575e-05, + "loss": 0.131, + "step": 7344 + }, + { + "epoch": 0.14692, + "grad_norm": 1.9172292947769165, + "learning_rate": 1.9866177323516343e-05, + "loss": 0.3532, + "step": 7346 + }, + { + "epoch": 0.14696, + "grad_norm": 1.2523494958877563, + "learning_rate": 1.98659495659639e-05, + "loss": 0.1387, + "step": 7348 + }, + { + "epoch": 0.147, + "grad_norm": 2.165980577468872, + "learning_rate": 1.9865721616069695e-05, + "loss": 0.2538, + "step": 7350 + }, + { + "epoch": 0.14704, + "grad_norm": 2.1397244930267334, + "learning_rate": 1.9865493473838168e-05, + "loss": 0.2781, + "step": 7352 + }, + { + "epoch": 0.14708, + "grad_norm": 1.2057145833969116, + "learning_rate": 1.9865265139273764e-05, + "loss": 0.1847, + "step": 7354 + }, + { + "epoch": 0.14712, + "grad_norm": 0.42186206579208374, + "learning_rate": 1.986503661238094e-05, + "loss": 0.1656, + "step": 7356 + }, + { + "epoch": 0.14716, + "grad_norm": 1.2241038084030151, + "learning_rate": 1.9864807893164143e-05, + "loss": 0.2644, + "step": 7358 + }, + { + "epoch": 0.1472, + "grad_norm": 0.8888980150222778, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.2091, + "step": 7360 + }, + { + "epoch": 0.14724, + "grad_norm": 1.0495073795318604, + "learning_rate": 1.9864349877776496e-05, + "loss": 0.1521, + "step": 7362 + }, + { + "epoch": 0.14728, + "grad_norm": 1.6949281692504883, + "learning_rate": 1.9864120581614567e-05, + "loss": 0.2654, + "step": 7364 + }, + { + "epoch": 0.14732, + "grad_norm": 1.2844007015228271, + "learning_rate": 1.9863891093146533e-05, + "loss": 0.1944, + "step": 7366 + }, + { + "epoch": 0.14736, + "grad_norm": 1.0088422298431396, + "learning_rate": 1.986366141237686e-05, + "loss": 0.1146, + "step": 7368 + }, + { + "epoch": 0.1474, + "grad_norm": 2.3109898567199707, + "learning_rate": 1.9863431539310033e-05, + "loss": 0.3673, + "step": 7370 + }, + { + "epoch": 0.14744, + "grad_norm": 1.016516089439392, + "learning_rate": 1.986320147395053e-05, + "loss": 0.1306, + "step": 7372 + }, + { + "epoch": 0.14748, + "grad_norm": 1.6308385133743286, + "learning_rate": 1.9862971216302833e-05, + "loss": 0.1696, + "step": 7374 + }, + { + "epoch": 0.14752, + "grad_norm": 0.9929598569869995, + "learning_rate": 1.9862740766371434e-05, + "loss": 0.2349, + "step": 7376 + }, + { + "epoch": 0.14756, + "grad_norm": 1.619582176208496, + "learning_rate": 1.986251012416083e-05, + "loss": 0.1484, + "step": 7378 + }, + { + "epoch": 0.1476, + "grad_norm": 0.7694016098976135, + "learning_rate": 1.986227928967551e-05, + "loss": 0.0848, + "step": 7380 + }, + { + "epoch": 0.14764, + "grad_norm": 0.9460245370864868, + "learning_rate": 1.986204826291998e-05, + "loss": 0.1975, + "step": 7382 + }, + { + "epoch": 0.14768, + "grad_norm": 1.0917977094650269, + "learning_rate": 1.9861817043898743e-05, + "loss": 0.4133, + "step": 7384 + }, + { + "epoch": 0.14772, + "grad_norm": 0.2172105312347412, + "learning_rate": 1.9861585632616303e-05, + "loss": 0.0579, + "step": 7386 + }, + { + "epoch": 0.14776, + "grad_norm": 0.30192479491233826, + "learning_rate": 1.9861354029077174e-05, + "loss": 0.2658, + "step": 7388 + }, + { + "epoch": 0.1478, + "grad_norm": 2.5690112113952637, + "learning_rate": 1.9861122233285873e-05, + "loss": 0.2692, + "step": 7390 + }, + { + "epoch": 0.14784, + "grad_norm": 2.2096121311187744, + "learning_rate": 1.9860890245246915e-05, + "loss": 0.3953, + "step": 7392 + }, + { + "epoch": 0.14788, + "grad_norm": 0.7256247997283936, + "learning_rate": 1.9860658064964825e-05, + "loss": 0.0764, + "step": 7394 + }, + { + "epoch": 0.14792, + "grad_norm": 0.4456249177455902, + "learning_rate": 1.986042569244413e-05, + "loss": 0.1331, + "step": 7396 + }, + { + "epoch": 0.14796, + "grad_norm": 1.1972558498382568, + "learning_rate": 1.9860193127689364e-05, + "loss": 0.2102, + "step": 7398 + }, + { + "epoch": 0.148, + "grad_norm": 1.717457890510559, + "learning_rate": 1.985996037070505e-05, + "loss": 0.3259, + "step": 7400 + }, + { + "epoch": 0.14804, + "grad_norm": 0.8011610507965088, + "learning_rate": 1.9859727421495737e-05, + "loss": 0.1305, + "step": 7402 + }, + { + "epoch": 0.14808, + "grad_norm": 2.0014493465423584, + "learning_rate": 1.9859494280065957e-05, + "loss": 0.2697, + "step": 7404 + }, + { + "epoch": 0.14812, + "grad_norm": 1.661855936050415, + "learning_rate": 1.9859260946420265e-05, + "loss": 0.3144, + "step": 7406 + }, + { + "epoch": 0.14816, + "grad_norm": 1.4267281293869019, + "learning_rate": 1.9859027420563203e-05, + "loss": 0.212, + "step": 7408 + }, + { + "epoch": 0.1482, + "grad_norm": 0.9771876335144043, + "learning_rate": 1.9858793702499322e-05, + "loss": 0.1528, + "step": 7410 + }, + { + "epoch": 0.14824, + "grad_norm": 1.0496944189071655, + "learning_rate": 1.985855979223319e-05, + "loss": 0.137, + "step": 7412 + }, + { + "epoch": 0.14828, + "grad_norm": 1.2401149272918701, + "learning_rate": 1.9858325689769352e-05, + "loss": 0.1845, + "step": 7414 + }, + { + "epoch": 0.14832, + "grad_norm": 1.2853671312332153, + "learning_rate": 1.9858091395112383e-05, + "loss": 0.1855, + "step": 7416 + }, + { + "epoch": 0.14836, + "grad_norm": 1.5045764446258545, + "learning_rate": 1.9857856908266848e-05, + "loss": 0.2128, + "step": 7418 + }, + { + "epoch": 0.1484, + "grad_norm": 1.8231972455978394, + "learning_rate": 1.9857622229237315e-05, + "loss": 0.2177, + "step": 7420 + }, + { + "epoch": 0.14844, + "grad_norm": 2.645081043243408, + "learning_rate": 1.985738735802836e-05, + "loss": 0.3158, + "step": 7422 + }, + { + "epoch": 0.14848, + "grad_norm": 0.6652347445487976, + "learning_rate": 1.9857152294644567e-05, + "loss": 0.1302, + "step": 7424 + }, + { + "epoch": 0.14852, + "grad_norm": 1.1831133365631104, + "learning_rate": 1.9856917039090516e-05, + "loss": 0.1306, + "step": 7426 + }, + { + "epoch": 0.14856, + "grad_norm": 0.8758583664894104, + "learning_rate": 1.985668159137079e-05, + "loss": 0.2215, + "step": 7428 + }, + { + "epoch": 0.1486, + "grad_norm": 2.4444680213928223, + "learning_rate": 1.9856445951489984e-05, + "loss": 0.5695, + "step": 7430 + }, + { + "epoch": 0.14864, + "grad_norm": 0.7663723826408386, + "learning_rate": 1.9856210119452685e-05, + "loss": 0.1178, + "step": 7432 + }, + { + "epoch": 0.14868, + "grad_norm": 0.9126761555671692, + "learning_rate": 1.9855974095263498e-05, + "loss": 0.1968, + "step": 7434 + }, + { + "epoch": 0.14872, + "grad_norm": 1.1279969215393066, + "learning_rate": 1.9855737878927022e-05, + "loss": 0.1382, + "step": 7436 + }, + { + "epoch": 0.14876, + "grad_norm": 0.8934893608093262, + "learning_rate": 1.9855501470447865e-05, + "loss": 0.1645, + "step": 7438 + }, + { + "epoch": 0.1488, + "grad_norm": 1.3107315301895142, + "learning_rate": 1.985526486983063e-05, + "loss": 0.1517, + "step": 7440 + }, + { + "epoch": 0.14884, + "grad_norm": 0.737126350402832, + "learning_rate": 1.9855028077079933e-05, + "loss": 0.1103, + "step": 7442 + }, + { + "epoch": 0.14888, + "grad_norm": 0.5177561044692993, + "learning_rate": 1.985479109220039e-05, + "loss": 0.2467, + "step": 7444 + }, + { + "epoch": 0.14892, + "grad_norm": 1.0609978437423706, + "learning_rate": 1.985455391519662e-05, + "loss": 0.1062, + "step": 7446 + }, + { + "epoch": 0.14896, + "grad_norm": 0.477682501077652, + "learning_rate": 1.985431654607325e-05, + "loss": 0.2916, + "step": 7448 + }, + { + "epoch": 0.149, + "grad_norm": 2.958822011947632, + "learning_rate": 1.9854078984834904e-05, + "loss": 0.3098, + "step": 7450 + }, + { + "epoch": 0.14904, + "grad_norm": 0.7234340310096741, + "learning_rate": 1.9853841231486215e-05, + "loss": 0.091, + "step": 7452 + }, + { + "epoch": 0.14908, + "grad_norm": 0.5285912752151489, + "learning_rate": 1.9853603286031816e-05, + "loss": 0.0515, + "step": 7454 + }, + { + "epoch": 0.14912, + "grad_norm": 1.8129055500030518, + "learning_rate": 1.985336514847635e-05, + "loss": 0.1485, + "step": 7456 + }, + { + "epoch": 0.14916, + "grad_norm": 0.33580470085144043, + "learning_rate": 1.985312681882446e-05, + "loss": 0.0512, + "step": 7458 + }, + { + "epoch": 0.1492, + "grad_norm": 0.3508853018283844, + "learning_rate": 1.985288829708079e-05, + "loss": 0.1676, + "step": 7460 + }, + { + "epoch": 0.14924, + "grad_norm": 1.72153639793396, + "learning_rate": 1.9852649583249985e-05, + "loss": 0.4282, + "step": 7462 + }, + { + "epoch": 0.14928, + "grad_norm": 0.941072404384613, + "learning_rate": 1.9852410677336707e-05, + "loss": 0.0868, + "step": 7464 + }, + { + "epoch": 0.14932, + "grad_norm": 2.3540217876434326, + "learning_rate": 1.9852171579345613e-05, + "loss": 0.4093, + "step": 7466 + }, + { + "epoch": 0.14936, + "grad_norm": 1.517782211303711, + "learning_rate": 1.9851932289281355e-05, + "loss": 0.1596, + "step": 7468 + }, + { + "epoch": 0.1494, + "grad_norm": 1.5923675298690796, + "learning_rate": 1.9851692807148612e-05, + "loss": 0.4333, + "step": 7470 + }, + { + "epoch": 0.14944, + "grad_norm": 0.7151102423667908, + "learning_rate": 1.9851453132952042e-05, + "loss": 0.1125, + "step": 7472 + }, + { + "epoch": 0.14948, + "grad_norm": 1.9521896839141846, + "learning_rate": 1.9851213266696322e-05, + "loss": 0.1627, + "step": 7474 + }, + { + "epoch": 0.14952, + "grad_norm": 1.686438798904419, + "learning_rate": 1.9850973208386127e-05, + "loss": 0.1857, + "step": 7476 + }, + { + "epoch": 0.14956, + "grad_norm": 1.9694567918777466, + "learning_rate": 1.985073295802614e-05, + "loss": 0.2153, + "step": 7478 + }, + { + "epoch": 0.1496, + "grad_norm": 1.0513309240341187, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.1515, + "step": 7480 + }, + { + "epoch": 0.14964, + "grad_norm": 0.5488817095756531, + "learning_rate": 1.985025188117552e-05, + "loss": 0.4302, + "step": 7482 + }, + { + "epoch": 0.14968, + "grad_norm": 1.6854190826416016, + "learning_rate": 1.9850011054694264e-05, + "loss": 0.2429, + "step": 7484 + }, + { + "epoch": 0.14972, + "grad_norm": 0.7652542591094971, + "learning_rate": 1.9849770036181973e-05, + "loss": 0.1976, + "step": 7486 + }, + { + "epoch": 0.14976, + "grad_norm": 1.9585247039794922, + "learning_rate": 1.9849528825643346e-05, + "loss": 0.2059, + "step": 7488 + }, + { + "epoch": 0.1498, + "grad_norm": 0.4594642221927643, + "learning_rate": 1.984928742308308e-05, + "loss": 0.3461, + "step": 7490 + }, + { + "epoch": 0.14984, + "grad_norm": 0.31810930371284485, + "learning_rate": 1.9849045828505886e-05, + "loss": 0.1379, + "step": 7492 + }, + { + "epoch": 0.14988, + "grad_norm": 0.9534528255462646, + "learning_rate": 1.9848804041916475e-05, + "loss": 0.1882, + "step": 7494 + }, + { + "epoch": 0.14992, + "grad_norm": 2.122267723083496, + "learning_rate": 1.984856206331956e-05, + "loss": 0.4225, + "step": 7496 + }, + { + "epoch": 0.14996, + "grad_norm": 2.1232571601867676, + "learning_rate": 1.9848319892719854e-05, + "loss": 0.3387, + "step": 7498 + }, + { + "epoch": 0.15, + "grad_norm": 0.9459700584411621, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.2766, + "step": 7500 + }, + { + "epoch": 0.15004, + "grad_norm": 2.1613705158233643, + "learning_rate": 1.984783497553097e-05, + "loss": 0.4859, + "step": 7502 + }, + { + "epoch": 0.15008, + "grad_norm": 1.4831067323684692, + "learning_rate": 1.984759222895124e-05, + "loss": 0.2223, + "step": 7504 + }, + { + "epoch": 0.15012, + "grad_norm": 0.5531730651855469, + "learning_rate": 1.9847349290387637e-05, + "loss": 0.1544, + "step": 7506 + }, + { + "epoch": 0.15016, + "grad_norm": 1.6065007448196411, + "learning_rate": 1.9847106159844887e-05, + "loss": 0.2764, + "step": 7508 + }, + { + "epoch": 0.1502, + "grad_norm": 0.5746104717254639, + "learning_rate": 1.9846862837327733e-05, + "loss": 0.1223, + "step": 7510 + }, + { + "epoch": 0.15024, + "grad_norm": 1.1087607145309448, + "learning_rate": 1.9846619322840915e-05, + "loss": 0.1351, + "step": 7512 + }, + { + "epoch": 0.15028, + "grad_norm": 1.3201954364776611, + "learning_rate": 1.9846375616389188e-05, + "loss": 0.2223, + "step": 7514 + }, + { + "epoch": 0.15032, + "grad_norm": 0.7152513861656189, + "learning_rate": 1.98461317179773e-05, + "loss": 0.1744, + "step": 7516 + }, + { + "epoch": 0.15036, + "grad_norm": 1.1656863689422607, + "learning_rate": 1.984588762761e-05, + "loss": 0.2924, + "step": 7518 + }, + { + "epoch": 0.1504, + "grad_norm": 1.2769273519515991, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.1536, + "step": 7520 + }, + { + "epoch": 0.15044, + "grad_norm": 1.5030382871627808, + "learning_rate": 1.9845398871028224e-05, + "loss": 0.2772, + "step": 7522 + }, + { + "epoch": 0.15048, + "grad_norm": 1.6086914539337158, + "learning_rate": 1.9845154204823275e-05, + "loss": 0.2792, + "step": 7524 + }, + { + "epoch": 0.15052, + "grad_norm": 1.315360426902771, + "learning_rate": 1.9844909346681974e-05, + "loss": 0.1962, + "step": 7526 + }, + { + "epoch": 0.15056, + "grad_norm": 1.4102592468261719, + "learning_rate": 1.9844664296609096e-05, + "loss": 0.2029, + "step": 7528 + }, + { + "epoch": 0.1506, + "grad_norm": 1.2468504905700684, + "learning_rate": 1.9844419054609418e-05, + "loss": 0.2382, + "step": 7530 + }, + { + "epoch": 0.15064, + "grad_norm": 1.2488285303115845, + "learning_rate": 1.9844173620687724e-05, + "loss": 0.1859, + "step": 7532 + }, + { + "epoch": 0.15068, + "grad_norm": 0.832545816898346, + "learning_rate": 1.98439279948488e-05, + "loss": 0.1374, + "step": 7534 + }, + { + "epoch": 0.15072, + "grad_norm": 2.000730514526367, + "learning_rate": 1.9843682177097427e-05, + "loss": 0.3462, + "step": 7536 + }, + { + "epoch": 0.15076, + "grad_norm": 1.4946625232696533, + "learning_rate": 1.9843436167438406e-05, + "loss": 0.1991, + "step": 7538 + }, + { + "epoch": 0.1508, + "grad_norm": 1.850344181060791, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.2505, + "step": 7540 + }, + { + "epoch": 0.15084, + "grad_norm": 1.1567714214324951, + "learning_rate": 1.9842943572416592e-05, + "loss": 0.1689, + "step": 7542 + }, + { + "epoch": 0.15088, + "grad_norm": 1.6872179508209229, + "learning_rate": 1.9842696987063402e-05, + "loss": 0.2348, + "step": 7544 + }, + { + "epoch": 0.15092, + "grad_norm": 1.0112792253494263, + "learning_rate": 1.9842450209821772e-05, + "loss": 0.1763, + "step": 7546 + }, + { + "epoch": 0.15096, + "grad_norm": 1.2192702293395996, + "learning_rate": 1.9842203240696504e-05, + "loss": 0.2349, + "step": 7548 + }, + { + "epoch": 0.151, + "grad_norm": 0.6649866104125977, + "learning_rate": 1.984195607969242e-05, + "loss": 0.1638, + "step": 7550 + }, + { + "epoch": 0.15104, + "grad_norm": 0.5580105781555176, + "learning_rate": 1.9841708726814336e-05, + "loss": 0.2011, + "step": 7552 + }, + { + "epoch": 0.15108, + "grad_norm": 1.4010146856307983, + "learning_rate": 1.984146118206707e-05, + "loss": 0.1486, + "step": 7554 + }, + { + "epoch": 0.15112, + "grad_norm": 0.7979793548583984, + "learning_rate": 1.984121344545545e-05, + "loss": 0.1353, + "step": 7556 + }, + { + "epoch": 0.15116, + "grad_norm": 0.8251157999038696, + "learning_rate": 1.9840965516984313e-05, + "loss": 0.1002, + "step": 7558 + }, + { + "epoch": 0.1512, + "grad_norm": 0.8142253160476685, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.2156, + "step": 7560 + }, + { + "epoch": 0.15124, + "grad_norm": 2.0721590518951416, + "learning_rate": 1.98404690844828e-05, + "loss": 0.2886, + "step": 7562 + }, + { + "epoch": 0.15128, + "grad_norm": 1.0478311777114868, + "learning_rate": 1.9840220580462112e-05, + "loss": 0.3838, + "step": 7564 + }, + { + "epoch": 0.15132, + "grad_norm": 1.9729478359222412, + "learning_rate": 1.983997188460125e-05, + "loss": 0.1873, + "step": 7566 + }, + { + "epoch": 0.15136, + "grad_norm": 2.3935465812683105, + "learning_rate": 1.9839722996905076e-05, + "loss": 0.3732, + "step": 7568 + }, + { + "epoch": 0.1514, + "grad_norm": 1.3615692853927612, + "learning_rate": 1.9839473917378432e-05, + "loss": 0.1931, + "step": 7570 + }, + { + "epoch": 0.15144, + "grad_norm": 1.7329498529434204, + "learning_rate": 1.983922464602618e-05, + "loss": 0.2381, + "step": 7572 + }, + { + "epoch": 0.15148, + "grad_norm": 0.7291938662528992, + "learning_rate": 1.9838975182853183e-05, + "loss": 0.1057, + "step": 7574 + }, + { + "epoch": 0.15152, + "grad_norm": 0.5892574191093445, + "learning_rate": 1.983872552786429e-05, + "loss": 0.0808, + "step": 7576 + }, + { + "epoch": 0.15156, + "grad_norm": 1.3816059827804565, + "learning_rate": 1.9838475681064385e-05, + "loss": 0.1856, + "step": 7578 + }, + { + "epoch": 0.1516, + "grad_norm": 0.5281857252120972, + "learning_rate": 1.983822564245833e-05, + "loss": 0.1609, + "step": 7580 + }, + { + "epoch": 0.15164, + "grad_norm": 2.373903274536133, + "learning_rate": 1.9837975412050995e-05, + "loss": 0.3672, + "step": 7582 + }, + { + "epoch": 0.15168, + "grad_norm": 1.9440251588821411, + "learning_rate": 1.983772498984727e-05, + "loss": 0.264, + "step": 7584 + }, + { + "epoch": 0.15172, + "grad_norm": 1.4366716146469116, + "learning_rate": 1.983747437585203e-05, + "loss": 0.1346, + "step": 7586 + }, + { + "epoch": 0.15176, + "grad_norm": 0.9306557178497314, + "learning_rate": 1.983722357007016e-05, + "loss": 0.2485, + "step": 7588 + }, + { + "epoch": 0.1518, + "grad_norm": 2.6082189083099365, + "learning_rate": 1.9836972572506557e-05, + "loss": 0.5348, + "step": 7590 + }, + { + "epoch": 0.15184, + "grad_norm": 2.062330961227417, + "learning_rate": 1.983672138316611e-05, + "loss": 0.2241, + "step": 7592 + }, + { + "epoch": 0.15188, + "grad_norm": 2.432265520095825, + "learning_rate": 1.9836470002053713e-05, + "loss": 0.3539, + "step": 7594 + }, + { + "epoch": 0.15192, + "grad_norm": 2.2417047023773193, + "learning_rate": 1.983621842917427e-05, + "loss": 0.3324, + "step": 7596 + }, + { + "epoch": 0.15196, + "grad_norm": 0.7633309960365295, + "learning_rate": 1.9835966664532683e-05, + "loss": 0.1381, + "step": 7598 + }, + { + "epoch": 0.152, + "grad_norm": 2.150635004043579, + "learning_rate": 1.983571470813386e-05, + "loss": 0.4883, + "step": 7600 + }, + { + "epoch": 0.15204, + "grad_norm": 1.193067193031311, + "learning_rate": 1.9835462559982717e-05, + "loss": 0.2128, + "step": 7602 + }, + { + "epoch": 0.15208, + "grad_norm": 1.238180160522461, + "learning_rate": 1.9835210220084168e-05, + "loss": 0.2564, + "step": 7604 + }, + { + "epoch": 0.15212, + "grad_norm": 1.1985585689544678, + "learning_rate": 1.9834957688443133e-05, + "loss": 0.2025, + "step": 7606 + }, + { + "epoch": 0.15216, + "grad_norm": 1.315383791923523, + "learning_rate": 1.9834704965064535e-05, + "loss": 0.243, + "step": 7608 + }, + { + "epoch": 0.1522, + "grad_norm": 1.37759530544281, + "learning_rate": 1.98344520499533e-05, + "loss": 0.3032, + "step": 7610 + }, + { + "epoch": 0.15224, + "grad_norm": 0.8665762543678284, + "learning_rate": 1.983419894311436e-05, + "loss": 0.1529, + "step": 7612 + }, + { + "epoch": 0.15228, + "grad_norm": 1.726604700088501, + "learning_rate": 1.9833945644552645e-05, + "loss": 0.3395, + "step": 7614 + }, + { + "epoch": 0.15232, + "grad_norm": 1.2019128799438477, + "learning_rate": 1.9833692154273097e-05, + "loss": 0.1539, + "step": 7616 + }, + { + "epoch": 0.15236, + "grad_norm": 1.0372806787490845, + "learning_rate": 1.983343847228066e-05, + "loss": 0.2383, + "step": 7618 + }, + { + "epoch": 0.1524, + "grad_norm": 0.9255988001823425, + "learning_rate": 1.983318459858028e-05, + "loss": 0.1383, + "step": 7620 + }, + { + "epoch": 0.15244, + "grad_norm": 1.164952039718628, + "learning_rate": 1.9832930533176896e-05, + "loss": 0.2237, + "step": 7622 + }, + { + "epoch": 0.15248, + "grad_norm": 0.7696358561515808, + "learning_rate": 1.9832676276075476e-05, + "loss": 0.175, + "step": 7624 + }, + { + "epoch": 0.15252, + "grad_norm": 0.8340161442756653, + "learning_rate": 1.9832421827280966e-05, + "loss": 0.1533, + "step": 7626 + }, + { + "epoch": 0.15256, + "grad_norm": 0.9257018566131592, + "learning_rate": 1.9832167186798333e-05, + "loss": 0.1251, + "step": 7628 + }, + { + "epoch": 0.1526, + "grad_norm": 1.890903353691101, + "learning_rate": 1.9831912354632537e-05, + "loss": 0.2428, + "step": 7630 + }, + { + "epoch": 0.15264, + "grad_norm": 1.8613100051879883, + "learning_rate": 1.983165733078855e-05, + "loss": 0.2484, + "step": 7632 + }, + { + "epoch": 0.15268, + "grad_norm": 1.1398245096206665, + "learning_rate": 1.9831402115271338e-05, + "loss": 0.323, + "step": 7634 + }, + { + "epoch": 0.15272, + "grad_norm": 1.7089873552322388, + "learning_rate": 1.9831146708085886e-05, + "loss": 0.4438, + "step": 7636 + }, + { + "epoch": 0.15276, + "grad_norm": 1.5602768659591675, + "learning_rate": 1.9830891109237162e-05, + "loss": 0.2773, + "step": 7638 + }, + { + "epoch": 0.1528, + "grad_norm": 1.6027737855911255, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.2597, + "step": 7640 + }, + { + "epoch": 0.15284, + "grad_norm": 1.072919487953186, + "learning_rate": 1.9830379336569854e-05, + "loss": 0.1596, + "step": 7642 + }, + { + "epoch": 0.15288, + "grad_norm": 1.465996265411377, + "learning_rate": 1.9830123162761245e-05, + "loss": 0.218, + "step": 7644 + }, + { + "epoch": 0.15292, + "grad_norm": 0.7600589990615845, + "learning_rate": 1.9829866797309327e-05, + "loss": 0.1464, + "step": 7646 + }, + { + "epoch": 0.15296, + "grad_norm": 1.884635090827942, + "learning_rate": 1.982961024021909e-05, + "loss": 0.3283, + "step": 7648 + }, + { + "epoch": 0.153, + "grad_norm": 2.022665023803711, + "learning_rate": 1.9829353491495545e-05, + "loss": 0.3304, + "step": 7650 + }, + { + "epoch": 0.15304, + "grad_norm": 1.8202564716339111, + "learning_rate": 1.982909655114369e-05, + "loss": 0.3207, + "step": 7652 + }, + { + "epoch": 0.15308, + "grad_norm": 0.9020352363586426, + "learning_rate": 1.982883941916854e-05, + "loss": 0.199, + "step": 7654 + }, + { + "epoch": 0.15312, + "grad_norm": 1.469857096672058, + "learning_rate": 1.9828582095575104e-05, + "loss": 0.2565, + "step": 7656 + }, + { + "epoch": 0.15316, + "grad_norm": 0.9343539476394653, + "learning_rate": 1.98283245803684e-05, + "loss": 0.1677, + "step": 7658 + }, + { + "epoch": 0.1532, + "grad_norm": 1.3217320442199707, + "learning_rate": 1.982806687355345e-05, + "loss": 0.1653, + "step": 7660 + }, + { + "epoch": 0.15324, + "grad_norm": 1.602519154548645, + "learning_rate": 1.982780897513527e-05, + "loss": 0.256, + "step": 7662 + }, + { + "epoch": 0.15328, + "grad_norm": 1.3626492023468018, + "learning_rate": 1.9827550885118902e-05, + "loss": 0.233, + "step": 7664 + }, + { + "epoch": 0.15332, + "grad_norm": 1.3723050355911255, + "learning_rate": 1.982729260350937e-05, + "loss": 0.2866, + "step": 7666 + }, + { + "epoch": 0.15336, + "grad_norm": 1.241184115409851, + "learning_rate": 1.9827034130311704e-05, + "loss": 0.1814, + "step": 7668 + }, + { + "epoch": 0.1534, + "grad_norm": 1.5887295007705688, + "learning_rate": 1.982677546553095e-05, + "loss": 0.3394, + "step": 7670 + }, + { + "epoch": 0.15344, + "grad_norm": 2.0560507774353027, + "learning_rate": 1.982651660917215e-05, + "loss": 0.3785, + "step": 7672 + }, + { + "epoch": 0.15348, + "grad_norm": 1.2621681690216064, + "learning_rate": 1.982625756124035e-05, + "loss": 0.1541, + "step": 7674 + }, + { + "epoch": 0.15352, + "grad_norm": 1.0825417041778564, + "learning_rate": 1.98259983217406e-05, + "loss": 0.168, + "step": 7676 + }, + { + "epoch": 0.15356, + "grad_norm": 1.5802497863769531, + "learning_rate": 1.9825738890677956e-05, + "loss": 0.3736, + "step": 7678 + }, + { + "epoch": 0.1536, + "grad_norm": 2.009864091873169, + "learning_rate": 1.982547926805747e-05, + "loss": 0.2524, + "step": 7680 + }, + { + "epoch": 0.15364, + "grad_norm": 1.4554221630096436, + "learning_rate": 1.9825219453884207e-05, + "loss": 0.2543, + "step": 7682 + }, + { + "epoch": 0.15368, + "grad_norm": 1.8766400814056396, + "learning_rate": 1.9824959448163234e-05, + "loss": 0.2966, + "step": 7684 + }, + { + "epoch": 0.15372, + "grad_norm": 1.2437807321548462, + "learning_rate": 1.9824699250899616e-05, + "loss": 0.255, + "step": 7686 + }, + { + "epoch": 0.15376, + "grad_norm": 1.0751830339431763, + "learning_rate": 1.9824438862098435e-05, + "loss": 0.203, + "step": 7688 + }, + { + "epoch": 0.1538, + "grad_norm": 1.692095398902893, + "learning_rate": 1.9824178281764753e-05, + "loss": 0.2962, + "step": 7690 + }, + { + "epoch": 0.15384, + "grad_norm": 1.5225614309310913, + "learning_rate": 1.982391750990366e-05, + "loss": 0.2263, + "step": 7692 + }, + { + "epoch": 0.15388, + "grad_norm": 1.4231122732162476, + "learning_rate": 1.9823656546520238e-05, + "loss": 0.2774, + "step": 7694 + }, + { + "epoch": 0.15392, + "grad_norm": 1.094040870666504, + "learning_rate": 1.9823395391619575e-05, + "loss": 0.2472, + "step": 7696 + }, + { + "epoch": 0.15396, + "grad_norm": 1.6202055215835571, + "learning_rate": 1.9823134045206758e-05, + "loss": 0.3407, + "step": 7698 + }, + { + "epoch": 0.154, + "grad_norm": 1.2303259372711182, + "learning_rate": 1.982287250728689e-05, + "loss": 0.2432, + "step": 7700 + }, + { + "epoch": 0.15404, + "grad_norm": 0.8033788204193115, + "learning_rate": 1.982261077786506e-05, + "loss": 0.2156, + "step": 7702 + }, + { + "epoch": 0.15408, + "grad_norm": 1.2583082914352417, + "learning_rate": 1.982234885694638e-05, + "loss": 0.233, + "step": 7704 + }, + { + "epoch": 0.15412, + "grad_norm": 1.221848964691162, + "learning_rate": 1.982208674453595e-05, + "loss": 0.2126, + "step": 7706 + }, + { + "epoch": 0.15416, + "grad_norm": 1.3908582925796509, + "learning_rate": 1.9821824440638885e-05, + "loss": 0.2352, + "step": 7708 + }, + { + "epoch": 0.1542, + "grad_norm": 1.2469074726104736, + "learning_rate": 1.9821561945260292e-05, + "loss": 0.2151, + "step": 7710 + }, + { + "epoch": 0.15424, + "grad_norm": 0.8834021687507629, + "learning_rate": 1.9821299258405297e-05, + "loss": 0.2352, + "step": 7712 + }, + { + "epoch": 0.15428, + "grad_norm": 1.375472068786621, + "learning_rate": 1.9821036380079012e-05, + "loss": 0.1924, + "step": 7714 + }, + { + "epoch": 0.15432, + "grad_norm": 0.8992645144462585, + "learning_rate": 1.982077331028657e-05, + "loss": 0.2103, + "step": 7716 + }, + { + "epoch": 0.15436, + "grad_norm": 1.362004280090332, + "learning_rate": 1.9820510049033092e-05, + "loss": 0.2873, + "step": 7718 + }, + { + "epoch": 0.1544, + "grad_norm": 1.3448619842529297, + "learning_rate": 1.982024659632372e-05, + "loss": 0.2332, + "step": 7720 + }, + { + "epoch": 0.15444, + "grad_norm": 1.0288405418395996, + "learning_rate": 1.981998295216358e-05, + "loss": 0.1848, + "step": 7722 + }, + { + "epoch": 0.15448, + "grad_norm": 1.6688612699508667, + "learning_rate": 1.981971911655782e-05, + "loss": 0.2899, + "step": 7724 + }, + { + "epoch": 0.15452, + "grad_norm": 0.7649387717247009, + "learning_rate": 1.9819455089511578e-05, + "loss": 0.1178, + "step": 7726 + }, + { + "epoch": 0.15456, + "grad_norm": 0.9979507327079773, + "learning_rate": 1.9819190871030005e-05, + "loss": 0.1517, + "step": 7728 + }, + { + "epoch": 0.1546, + "grad_norm": 1.6177263259887695, + "learning_rate": 1.9818926461118254e-05, + "loss": 0.3007, + "step": 7730 + }, + { + "epoch": 0.15464, + "grad_norm": 1.4435412883758545, + "learning_rate": 1.9818661859781468e-05, + "loss": 0.2665, + "step": 7732 + }, + { + "epoch": 0.15468, + "grad_norm": 1.4548002481460571, + "learning_rate": 1.981839706702482e-05, + "loss": 0.2545, + "step": 7734 + }, + { + "epoch": 0.15472, + "grad_norm": 0.9010388255119324, + "learning_rate": 1.9818132082853466e-05, + "loss": 0.1442, + "step": 7736 + }, + { + "epoch": 0.15476, + "grad_norm": 1.327501654624939, + "learning_rate": 1.9817866907272574e-05, + "loss": 0.1782, + "step": 7738 + }, + { + "epoch": 0.1548, + "grad_norm": 2.2003066539764404, + "learning_rate": 1.981760154028731e-05, + "loss": 0.2922, + "step": 7740 + }, + { + "epoch": 0.15484, + "grad_norm": 1.3647412061691284, + "learning_rate": 1.981733598190285e-05, + "loss": 0.2431, + "step": 7742 + }, + { + "epoch": 0.15488, + "grad_norm": 1.005139708518982, + "learning_rate": 1.981707023212437e-05, + "loss": 0.2042, + "step": 7744 + }, + { + "epoch": 0.15492, + "grad_norm": 1.5023328065872192, + "learning_rate": 1.981680429095705e-05, + "loss": 0.243, + "step": 7746 + }, + { + "epoch": 0.15496, + "grad_norm": 1.4742494821548462, + "learning_rate": 1.9816538158406077e-05, + "loss": 0.1917, + "step": 7748 + }, + { + "epoch": 0.155, + "grad_norm": 1.828818917274475, + "learning_rate": 1.9816271834476642e-05, + "loss": 0.2727, + "step": 7750 + }, + { + "epoch": 0.15504, + "grad_norm": 1.2590709924697876, + "learning_rate": 1.9816005319173932e-05, + "loss": 0.2035, + "step": 7752 + }, + { + "epoch": 0.15508, + "grad_norm": 1.6004986763000488, + "learning_rate": 1.9815738612503142e-05, + "loss": 0.2144, + "step": 7754 + }, + { + "epoch": 0.15512, + "grad_norm": 0.9474157094955444, + "learning_rate": 1.981547171446948e-05, + "loss": 0.1074, + "step": 7756 + }, + { + "epoch": 0.15516, + "grad_norm": 1.8286941051483154, + "learning_rate": 1.981520462507814e-05, + "loss": 0.3403, + "step": 7758 + }, + { + "epoch": 0.1552, + "grad_norm": 1.6900080442428589, + "learning_rate": 1.981493734433433e-05, + "loss": 0.2028, + "step": 7760 + }, + { + "epoch": 0.15524, + "grad_norm": 1.8842462301254272, + "learning_rate": 1.9814669872243267e-05, + "loss": 0.2156, + "step": 7762 + }, + { + "epoch": 0.15528, + "grad_norm": 1.261971116065979, + "learning_rate": 1.981440220881016e-05, + "loss": 0.1301, + "step": 7764 + }, + { + "epoch": 0.15532, + "grad_norm": 1.4116379022598267, + "learning_rate": 1.981413435404023e-05, + "loss": 0.2347, + "step": 7766 + }, + { + "epoch": 0.15536, + "grad_norm": 1.4589776992797852, + "learning_rate": 1.98138663079387e-05, + "loss": 0.3159, + "step": 7768 + }, + { + "epoch": 0.1554, + "grad_norm": 1.869280457496643, + "learning_rate": 1.981359807051079e-05, + "loss": 0.2087, + "step": 7770 + }, + { + "epoch": 0.15544, + "grad_norm": 2.7272562980651855, + "learning_rate": 1.9813329641761738e-05, + "loss": 0.4238, + "step": 7772 + }, + { + "epoch": 0.15548, + "grad_norm": 0.6458905935287476, + "learning_rate": 1.9813061021696768e-05, + "loss": 0.2731, + "step": 7774 + }, + { + "epoch": 0.15552, + "grad_norm": 1.3797262907028198, + "learning_rate": 1.9812792210321123e-05, + "loss": 0.3009, + "step": 7776 + }, + { + "epoch": 0.15556, + "grad_norm": 2.1008527278900146, + "learning_rate": 1.9812523207640044e-05, + "loss": 0.11, + "step": 7778 + }, + { + "epoch": 0.1556, + "grad_norm": 1.2424241304397583, + "learning_rate": 1.981225401365877e-05, + "loss": 0.1599, + "step": 7780 + }, + { + "epoch": 0.15564, + "grad_norm": 0.7584575414657593, + "learning_rate": 1.9811984628382555e-05, + "loss": 0.1353, + "step": 7782 + }, + { + "epoch": 0.15568, + "grad_norm": 1.262882947921753, + "learning_rate": 1.9811715051816644e-05, + "loss": 0.2509, + "step": 7784 + }, + { + "epoch": 0.15572, + "grad_norm": 1.472593069076538, + "learning_rate": 1.9811445283966303e-05, + "loss": 0.2537, + "step": 7786 + }, + { + "epoch": 0.15576, + "grad_norm": 1.1555908918380737, + "learning_rate": 1.981117532483678e-05, + "loss": 0.2065, + "step": 7788 + }, + { + "epoch": 0.1558, + "grad_norm": 1.0211800336837769, + "learning_rate": 1.981090517443334e-05, + "loss": 0.1785, + "step": 7790 + }, + { + "epoch": 0.15584, + "grad_norm": 0.7283069491386414, + "learning_rate": 1.9810634832761258e-05, + "loss": 0.1196, + "step": 7792 + }, + { + "epoch": 0.15588, + "grad_norm": 0.9671344757080078, + "learning_rate": 1.9810364299825798e-05, + "loss": 0.1673, + "step": 7794 + }, + { + "epoch": 0.15592, + "grad_norm": 0.8068615794181824, + "learning_rate": 1.9810093575632237e-05, + "loss": 0.1688, + "step": 7796 + }, + { + "epoch": 0.15596, + "grad_norm": 1.0808343887329102, + "learning_rate": 1.9809822660185846e-05, + "loss": 0.1178, + "step": 7798 + }, + { + "epoch": 0.156, + "grad_norm": 0.9244740605354309, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.2845, + "step": 7800 + }, + { + "epoch": 0.15604, + "grad_norm": 1.320740818977356, + "learning_rate": 1.980928025555573e-05, + "loss": 0.1619, + "step": 7802 + }, + { + "epoch": 0.15608, + "grad_norm": 1.1923197507858276, + "learning_rate": 1.980900876638257e-05, + "loss": 0.1989, + "step": 7804 + }, + { + "epoch": 0.15612, + "grad_norm": 0.8915482759475708, + "learning_rate": 1.9808737085977737e-05, + "loss": 0.1404, + "step": 7806 + }, + { + "epoch": 0.15616, + "grad_norm": 1.295105218887329, + "learning_rate": 1.9808465214346525e-05, + "loss": 0.226, + "step": 7808 + }, + { + "epoch": 0.1562, + "grad_norm": 2.115748405456543, + "learning_rate": 1.9808193151494233e-05, + "loss": 0.2277, + "step": 7810 + }, + { + "epoch": 0.15624, + "grad_norm": 1.3675626516342163, + "learning_rate": 1.9807920897426168e-05, + "loss": 0.1844, + "step": 7812 + }, + { + "epoch": 0.15628, + "grad_norm": 1.725345492362976, + "learning_rate": 1.9807648452147632e-05, + "loss": 0.2428, + "step": 7814 + }, + { + "epoch": 0.15632, + "grad_norm": 0.5177317261695862, + "learning_rate": 1.9807375815663944e-05, + "loss": 0.1775, + "step": 7816 + }, + { + "epoch": 0.15636, + "grad_norm": 1.249965786933899, + "learning_rate": 1.980710298798041e-05, + "loss": 0.1305, + "step": 7818 + }, + { + "epoch": 0.1564, + "grad_norm": 1.4295953512191772, + "learning_rate": 1.9806829969102356e-05, + "loss": 0.1235, + "step": 7820 + }, + { + "epoch": 0.15644, + "grad_norm": 1.7945455312728882, + "learning_rate": 1.980655675903511e-05, + "loss": 0.2036, + "step": 7822 + }, + { + "epoch": 0.15648, + "grad_norm": 1.9105643033981323, + "learning_rate": 1.980628335778398e-05, + "loss": 0.1744, + "step": 7824 + }, + { + "epoch": 0.15652, + "grad_norm": 1.4922873973846436, + "learning_rate": 1.9806009765354313e-05, + "loss": 0.1438, + "step": 7826 + }, + { + "epoch": 0.15656, + "grad_norm": 1.8857132196426392, + "learning_rate": 1.980573598175144e-05, + "loss": 0.2036, + "step": 7828 + }, + { + "epoch": 0.1566, + "grad_norm": 1.7197229862213135, + "learning_rate": 1.9805462006980688e-05, + "loss": 0.2442, + "step": 7830 + }, + { + "epoch": 0.15664, + "grad_norm": 1.837611198425293, + "learning_rate": 1.9805187841047412e-05, + "loss": 0.1304, + "step": 7832 + }, + { + "epoch": 0.15668, + "grad_norm": 1.5980074405670166, + "learning_rate": 1.980491348395695e-05, + "loss": 0.149, + "step": 7834 + }, + { + "epoch": 0.15672, + "grad_norm": 1.1150977611541748, + "learning_rate": 1.980463893571465e-05, + "loss": 0.1175, + "step": 7836 + }, + { + "epoch": 0.15676, + "grad_norm": 0.5311217308044434, + "learning_rate": 1.9804364196325867e-05, + "loss": 0.311, + "step": 7838 + }, + { + "epoch": 0.1568, + "grad_norm": 2.4319536685943604, + "learning_rate": 1.980408926579596e-05, + "loss": 0.4106, + "step": 7840 + }, + { + "epoch": 0.15684, + "grad_norm": 1.6523737907409668, + "learning_rate": 1.980381414413028e-05, + "loss": 0.1436, + "step": 7842 + }, + { + "epoch": 0.15688, + "grad_norm": 0.9188510179519653, + "learning_rate": 1.98035388313342e-05, + "loss": 0.0578, + "step": 7844 + }, + { + "epoch": 0.15692, + "grad_norm": 1.0665723085403442, + "learning_rate": 1.9803263327413077e-05, + "loss": 0.139, + "step": 7846 + }, + { + "epoch": 0.15696, + "grad_norm": 0.7280219197273254, + "learning_rate": 1.9802987632372296e-05, + "loss": 0.1605, + "step": 7848 + }, + { + "epoch": 0.157, + "grad_norm": 1.1536760330200195, + "learning_rate": 1.9802711746217222e-05, + "loss": 0.1265, + "step": 7850 + }, + { + "epoch": 0.15704, + "grad_norm": 1.5144187211990356, + "learning_rate": 1.9802435668953233e-05, + "loss": 0.0888, + "step": 7852 + }, + { + "epoch": 0.15708, + "grad_norm": 2.3853790760040283, + "learning_rate": 1.9802159400585716e-05, + "loss": 0.3381, + "step": 7854 + }, + { + "epoch": 0.15712, + "grad_norm": 1.7646344900131226, + "learning_rate": 1.980188294112005e-05, + "loss": 0.0986, + "step": 7856 + }, + { + "epoch": 0.15716, + "grad_norm": 3.4248101711273193, + "learning_rate": 1.980160629056164e-05, + "loss": 0.551, + "step": 7858 + }, + { + "epoch": 0.1572, + "grad_norm": 0.5433374047279358, + "learning_rate": 1.9801329448915863e-05, + "loss": 0.1098, + "step": 7860 + }, + { + "epoch": 0.15724, + "grad_norm": 2.2983736991882324, + "learning_rate": 1.9801052416188122e-05, + "loss": 0.2661, + "step": 7862 + }, + { + "epoch": 0.15728, + "grad_norm": 1.1316577196121216, + "learning_rate": 1.980077519238382e-05, + "loss": 0.1122, + "step": 7864 + }, + { + "epoch": 0.15732, + "grad_norm": 5.135542392730713, + "learning_rate": 1.9800497777508357e-05, + "loss": 0.3068, + "step": 7866 + }, + { + "epoch": 0.15736, + "grad_norm": 0.8710520267486572, + "learning_rate": 1.9800220171567145e-05, + "loss": 0.1268, + "step": 7868 + }, + { + "epoch": 0.1574, + "grad_norm": 4.733838081359863, + "learning_rate": 1.9799942374565597e-05, + "loss": 0.4964, + "step": 7870 + }, + { + "epoch": 0.15744, + "grad_norm": 2.4045987129211426, + "learning_rate": 1.9799664386509127e-05, + "loss": 0.1926, + "step": 7872 + }, + { + "epoch": 0.15748, + "grad_norm": 2.206483840942383, + "learning_rate": 1.9799386207403153e-05, + "loss": 0.1687, + "step": 7874 + }, + { + "epoch": 0.15752, + "grad_norm": 2.4962193965911865, + "learning_rate": 1.97991078372531e-05, + "loss": 0.34, + "step": 7876 + }, + { + "epoch": 0.15756, + "grad_norm": 0.972162127494812, + "learning_rate": 1.97988292760644e-05, + "loss": 0.5218, + "step": 7878 + }, + { + "epoch": 0.1576, + "grad_norm": 1.1167972087860107, + "learning_rate": 1.979855052384247e-05, + "loss": 0.1636, + "step": 7880 + }, + { + "epoch": 0.15764, + "grad_norm": 2.0149123668670654, + "learning_rate": 1.9798271580592758e-05, + "loss": 0.2351, + "step": 7882 + }, + { + "epoch": 0.15768, + "grad_norm": 2.050534725189209, + "learning_rate": 1.9797992446320695e-05, + "loss": 0.1482, + "step": 7884 + }, + { + "epoch": 0.15772, + "grad_norm": 1.5749183893203735, + "learning_rate": 1.979771312103173e-05, + "loss": 0.2774, + "step": 7886 + }, + { + "epoch": 0.15776, + "grad_norm": 1.2640669345855713, + "learning_rate": 1.9797433604731297e-05, + "loss": 0.1514, + "step": 7888 + }, + { + "epoch": 0.1578, + "grad_norm": 2.2213544845581055, + "learning_rate": 1.9797153897424854e-05, + "loss": 0.3282, + "step": 7890 + }, + { + "epoch": 0.15784, + "grad_norm": 1.0202912092208862, + "learning_rate": 1.9796873999117848e-05, + "loss": 0.1172, + "step": 7892 + }, + { + "epoch": 0.15788, + "grad_norm": 1.13556706905365, + "learning_rate": 1.9796593909815748e-05, + "loss": 0.1381, + "step": 7894 + }, + { + "epoch": 0.15792, + "grad_norm": 0.967321515083313, + "learning_rate": 1.9796313629524e-05, + "loss": 0.1304, + "step": 7896 + }, + { + "epoch": 0.15796, + "grad_norm": 1.8784310817718506, + "learning_rate": 1.9796033158248077e-05, + "loss": 0.1399, + "step": 7898 + }, + { + "epoch": 0.158, + "grad_norm": 1.6344741582870483, + "learning_rate": 1.979575249599344e-05, + "loss": 0.1671, + "step": 7900 + }, + { + "epoch": 0.15804, + "grad_norm": 1.0357130765914917, + "learning_rate": 1.9795471642765573e-05, + "loss": 0.0854, + "step": 7902 + }, + { + "epoch": 0.15808, + "grad_norm": 1.234656810760498, + "learning_rate": 1.9795190598569936e-05, + "loss": 0.3036, + "step": 7904 + }, + { + "epoch": 0.15812, + "grad_norm": 2.404770612716675, + "learning_rate": 1.979490936341202e-05, + "loss": 0.2657, + "step": 7906 + }, + { + "epoch": 0.15816, + "grad_norm": 0.8080562949180603, + "learning_rate": 1.97946279372973e-05, + "loss": 0.1689, + "step": 7908 + }, + { + "epoch": 0.1582, + "grad_norm": 2.5708272457122803, + "learning_rate": 1.9794346320231265e-05, + "loss": 0.2424, + "step": 7910 + }, + { + "epoch": 0.15824, + "grad_norm": 1.3143223524093628, + "learning_rate": 1.979406451221941e-05, + "loss": 0.138, + "step": 7912 + }, + { + "epoch": 0.15828, + "grad_norm": 2.8661274909973145, + "learning_rate": 1.979378251326722e-05, + "loss": 0.3847, + "step": 7914 + }, + { + "epoch": 0.15832, + "grad_norm": 1.6295055150985718, + "learning_rate": 1.97935003233802e-05, + "loss": 0.1321, + "step": 7916 + }, + { + "epoch": 0.15836, + "grad_norm": 3.8195509910583496, + "learning_rate": 1.9793217942563853e-05, + "loss": 0.4343, + "step": 7918 + }, + { + "epoch": 0.1584, + "grad_norm": 3.0890591144561768, + "learning_rate": 1.9792935370823676e-05, + "loss": 0.2652, + "step": 7920 + }, + { + "epoch": 0.15844, + "grad_norm": 2.791710376739502, + "learning_rate": 1.979265260816518e-05, + "loss": 0.264, + "step": 7922 + }, + { + "epoch": 0.15848, + "grad_norm": 1.121742844581604, + "learning_rate": 1.9792369654593884e-05, + "loss": 0.1113, + "step": 7924 + }, + { + "epoch": 0.15852, + "grad_norm": 0.9049354195594788, + "learning_rate": 1.97920865101153e-05, + "loss": 0.1143, + "step": 7926 + }, + { + "epoch": 0.15856, + "grad_norm": 0.8561918139457703, + "learning_rate": 1.9791803174734946e-05, + "loss": 0.1303, + "step": 7928 + }, + { + "epoch": 0.1586, + "grad_norm": 0.8139204382896423, + "learning_rate": 1.9791519648458352e-05, + "loss": 0.0925, + "step": 7930 + }, + { + "epoch": 0.15864, + "grad_norm": 2.0779788494110107, + "learning_rate": 1.979123593129104e-05, + "loss": 0.1731, + "step": 7932 + }, + { + "epoch": 0.15868, + "grad_norm": 3.045546293258667, + "learning_rate": 1.9790952023238538e-05, + "loss": 0.6575, + "step": 7934 + }, + { + "epoch": 0.15872, + "grad_norm": 2.748873233795166, + "learning_rate": 1.979066792430639e-05, + "loss": 0.4173, + "step": 7936 + }, + { + "epoch": 0.15876, + "grad_norm": 1.8792654275894165, + "learning_rate": 1.979038363450013e-05, + "loss": 0.3666, + "step": 7938 + }, + { + "epoch": 0.1588, + "grad_norm": 2.412562847137451, + "learning_rate": 1.97900991538253e-05, + "loss": 0.3546, + "step": 7940 + }, + { + "epoch": 0.15884, + "grad_norm": 0.6117770075798035, + "learning_rate": 1.9789814482287444e-05, + "loss": 0.1182, + "step": 7942 + }, + { + "epoch": 0.15888, + "grad_norm": 2.466597557067871, + "learning_rate": 1.978952961989212e-05, + "loss": 0.301, + "step": 7944 + }, + { + "epoch": 0.15892, + "grad_norm": 1.2001243829727173, + "learning_rate": 1.9789244566644873e-05, + "loss": 0.277, + "step": 7946 + }, + { + "epoch": 0.15896, + "grad_norm": 1.2108514308929443, + "learning_rate": 1.9788959322551262e-05, + "loss": 0.2423, + "step": 7948 + }, + { + "epoch": 0.159, + "grad_norm": 1.2298455238342285, + "learning_rate": 1.9788673887616852e-05, + "loss": 0.2428, + "step": 7950 + }, + { + "epoch": 0.15904, + "grad_norm": 0.9725298285484314, + "learning_rate": 1.9788388261847204e-05, + "loss": 0.1372, + "step": 7952 + }, + { + "epoch": 0.15908, + "grad_norm": 1.904570460319519, + "learning_rate": 1.9788102445247887e-05, + "loss": 0.2922, + "step": 7954 + }, + { + "epoch": 0.15912, + "grad_norm": 1.1969773769378662, + "learning_rate": 1.9787816437824474e-05, + "loss": 0.188, + "step": 7956 + }, + { + "epoch": 0.15916, + "grad_norm": 1.4230921268463135, + "learning_rate": 1.978753023958254e-05, + "loss": 0.1445, + "step": 7958 + }, + { + "epoch": 0.1592, + "grad_norm": 0.907671332359314, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.1811, + "step": 7960 + }, + { + "epoch": 0.15924, + "grad_norm": 0.7286914587020874, + "learning_rate": 1.9786957270665432e-05, + "loss": 0.1925, + "step": 7962 + }, + { + "epoch": 0.15928, + "grad_norm": 1.5392472743988037, + "learning_rate": 1.978667050000143e-05, + "loss": 0.1671, + "step": 7964 + }, + { + "epoch": 0.15932, + "grad_norm": 1.597069501876831, + "learning_rate": 1.978638353854125e-05, + "loss": 0.223, + "step": 7966 + }, + { + "epoch": 0.15936, + "grad_norm": 1.479217290878296, + "learning_rate": 1.978609638629048e-05, + "loss": 0.3103, + "step": 7968 + }, + { + "epoch": 0.1594, + "grad_norm": 2.19864821434021, + "learning_rate": 1.978580904325472e-05, + "loss": 0.3203, + "step": 7970 + }, + { + "epoch": 0.15944, + "grad_norm": 1.0315380096435547, + "learning_rate": 1.978552150943958e-05, + "loss": 0.1234, + "step": 7972 + }, + { + "epoch": 0.15948, + "grad_norm": 1.7222741842269897, + "learning_rate": 1.978523378485066e-05, + "loss": 0.2327, + "step": 7974 + }, + { + "epoch": 0.15952, + "grad_norm": 1.009045958518982, + "learning_rate": 1.9784945869493568e-05, + "loss": 0.1235, + "step": 7976 + }, + { + "epoch": 0.15956, + "grad_norm": 2.024458408355713, + "learning_rate": 1.9784657763373922e-05, + "loss": 0.1684, + "step": 7978 + }, + { + "epoch": 0.1596, + "grad_norm": 0.9941166043281555, + "learning_rate": 1.9784369466497333e-05, + "loss": 0.257, + "step": 7980 + }, + { + "epoch": 0.15964, + "grad_norm": 1.9200680255889893, + "learning_rate": 1.9784080978869422e-05, + "loss": 0.1746, + "step": 7982 + }, + { + "epoch": 0.15968, + "grad_norm": 1.0204923152923584, + "learning_rate": 1.9783792300495817e-05, + "loss": 0.1172, + "step": 7984 + }, + { + "epoch": 0.15972, + "grad_norm": 1.5491584539413452, + "learning_rate": 1.9783503431382143e-05, + "loss": 0.3313, + "step": 7986 + }, + { + "epoch": 0.15976, + "grad_norm": 0.5088809132575989, + "learning_rate": 1.9783214371534037e-05, + "loss": 0.3065, + "step": 7988 + }, + { + "epoch": 0.1598, + "grad_norm": 0.7038092017173767, + "learning_rate": 1.9782925120957123e-05, + "loss": 0.0926, + "step": 7990 + }, + { + "epoch": 0.15984, + "grad_norm": 2.8039393424987793, + "learning_rate": 1.9782635679657053e-05, + "loss": 0.2575, + "step": 7992 + }, + { + "epoch": 0.15988, + "grad_norm": 0.9033272862434387, + "learning_rate": 1.9782346047639462e-05, + "loss": 0.1192, + "step": 7994 + }, + { + "epoch": 0.15992, + "grad_norm": 0.6262643337249756, + "learning_rate": 1.9782056224909997e-05, + "loss": 0.0767, + "step": 7996 + }, + { + "epoch": 0.15996, + "grad_norm": 3.338061571121216, + "learning_rate": 1.9781766211474313e-05, + "loss": 0.2654, + "step": 7998 + }, + { + "epoch": 0.16, + "grad_norm": 3.7527153491973877, + "learning_rate": 1.9781476007338058e-05, + "loss": 0.4708, + "step": 8000 + }, + { + "epoch": 0.16004, + "grad_norm": 0.7727829217910767, + "learning_rate": 1.9781185612506894e-05, + "loss": 0.0685, + "step": 8002 + }, + { + "epoch": 0.16008, + "grad_norm": 0.42226293683052063, + "learning_rate": 1.978089502698648e-05, + "loss": 0.0478, + "step": 8004 + }, + { + "epoch": 0.16012, + "grad_norm": 1.068312406539917, + "learning_rate": 1.978060425078248e-05, + "loss": 0.2427, + "step": 8006 + }, + { + "epoch": 0.16016, + "grad_norm": 1.8335020542144775, + "learning_rate": 1.9780313283900568e-05, + "loss": 0.2329, + "step": 8008 + }, + { + "epoch": 0.1602, + "grad_norm": 0.6957056522369385, + "learning_rate": 1.9780022126346413e-05, + "loss": 0.1969, + "step": 8010 + }, + { + "epoch": 0.16024, + "grad_norm": 0.9854130148887634, + "learning_rate": 1.977973077812569e-05, + "loss": 0.1302, + "step": 8012 + }, + { + "epoch": 0.16028, + "grad_norm": 1.8940179347991943, + "learning_rate": 1.977943923924408e-05, + "loss": 0.1585, + "step": 8014 + }, + { + "epoch": 0.16032, + "grad_norm": 3.3470258712768555, + "learning_rate": 1.9779147509707267e-05, + "loss": 0.2777, + "step": 8016 + }, + { + "epoch": 0.16036, + "grad_norm": 2.93740177154541, + "learning_rate": 1.9778855589520943e-05, + "loss": 0.356, + "step": 8018 + }, + { + "epoch": 0.1604, + "grad_norm": 2.05754017829895, + "learning_rate": 1.977856347869079e-05, + "loss": 0.178, + "step": 8020 + }, + { + "epoch": 0.16044, + "grad_norm": 0.7458534240722656, + "learning_rate": 1.977827117722251e-05, + "loss": 0.1607, + "step": 8022 + }, + { + "epoch": 0.16048, + "grad_norm": 1.4974874258041382, + "learning_rate": 1.97779786851218e-05, + "loss": 0.2635, + "step": 8024 + }, + { + "epoch": 0.16052, + "grad_norm": 2.1404976844787598, + "learning_rate": 1.977768600239436e-05, + "loss": 0.1626, + "step": 8026 + }, + { + "epoch": 0.16056, + "grad_norm": 1.7384084463119507, + "learning_rate": 1.9777393129045895e-05, + "loss": 0.1051, + "step": 8028 + }, + { + "epoch": 0.1606, + "grad_norm": 2.439089775085449, + "learning_rate": 1.977710006508212e-05, + "loss": 0.2535, + "step": 8030 + }, + { + "epoch": 0.16064, + "grad_norm": 0.7764129042625427, + "learning_rate": 1.9776806810508748e-05, + "loss": 0.0867, + "step": 8032 + }, + { + "epoch": 0.16068, + "grad_norm": 1.500686526298523, + "learning_rate": 1.977651336533149e-05, + "loss": 0.1302, + "step": 8034 + }, + { + "epoch": 0.16072, + "grad_norm": 1.9829046726226807, + "learning_rate": 1.977621972955607e-05, + "loss": 0.1619, + "step": 8036 + }, + { + "epoch": 0.16076, + "grad_norm": 0.3426654636859894, + "learning_rate": 1.9775925903188216e-05, + "loss": 0.0716, + "step": 8038 + }, + { + "epoch": 0.1608, + "grad_norm": 2.413304567337036, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.1345, + "step": 8040 + }, + { + "epoch": 0.16084, + "grad_norm": 1.003629207611084, + "learning_rate": 1.9775337678698113e-05, + "loss": 0.0631, + "step": 8042 + }, + { + "epoch": 0.16088, + "grad_norm": 5.033443450927734, + "learning_rate": 1.9775043280587335e-05, + "loss": 0.6987, + "step": 8044 + }, + { + "epoch": 0.16092, + "grad_norm": 1.0900111198425293, + "learning_rate": 1.9774748691907052e-05, + "loss": 0.0749, + "step": 8046 + }, + { + "epoch": 0.16096, + "grad_norm": 4.094568252563477, + "learning_rate": 1.977445391266301e-05, + "loss": 0.4142, + "step": 8048 + }, + { + "epoch": 0.161, + "grad_norm": 1.7577019929885864, + "learning_rate": 1.9774158942860962e-05, + "loss": 0.212, + "step": 8050 + }, + { + "epoch": 0.16104, + "grad_norm": 1.7704410552978516, + "learning_rate": 1.9773863782506647e-05, + "loss": 0.1369, + "step": 8052 + }, + { + "epoch": 0.16108, + "grad_norm": 2.9185233116149902, + "learning_rate": 1.977356843160583e-05, + "loss": 0.2922, + "step": 8054 + }, + { + "epoch": 0.16112, + "grad_norm": 3.012589931488037, + "learning_rate": 1.9773272890164264e-05, + "loss": 0.4341, + "step": 8056 + }, + { + "epoch": 0.16116, + "grad_norm": 2.137446165084839, + "learning_rate": 1.977297715818771e-05, + "loss": 0.1351, + "step": 8058 + }, + { + "epoch": 0.1612, + "grad_norm": 2.1837666034698486, + "learning_rate": 1.9772681235681936e-05, + "loss": 0.2558, + "step": 8060 + }, + { + "epoch": 0.16124, + "grad_norm": 1.9255954027175903, + "learning_rate": 1.977238512265271e-05, + "loss": 0.2886, + "step": 8062 + }, + { + "epoch": 0.16128, + "grad_norm": 1.0691214799880981, + "learning_rate": 1.9772088819105804e-05, + "loss": 0.2216, + "step": 8064 + }, + { + "epoch": 0.16132, + "grad_norm": 1.7505842447280884, + "learning_rate": 1.9771792325046997e-05, + "loss": 0.2148, + "step": 8066 + }, + { + "epoch": 0.16136, + "grad_norm": 2.1301605701446533, + "learning_rate": 1.9771495640482064e-05, + "loss": 0.2962, + "step": 8068 + }, + { + "epoch": 0.1614, + "grad_norm": 2.467815637588501, + "learning_rate": 1.97711987654168e-05, + "loss": 0.2631, + "step": 8070 + }, + { + "epoch": 0.16144, + "grad_norm": 1.0406200885772705, + "learning_rate": 1.9770901699856978e-05, + "loss": 0.3001, + "step": 8072 + }, + { + "epoch": 0.16148, + "grad_norm": 1.6223191022872925, + "learning_rate": 1.9770604443808403e-05, + "loss": 0.3735, + "step": 8074 + }, + { + "epoch": 0.16152, + "grad_norm": 1.4340885877609253, + "learning_rate": 1.977030699727686e-05, + "loss": 0.2442, + "step": 8076 + }, + { + "epoch": 0.16156, + "grad_norm": 1.8097821474075317, + "learning_rate": 1.977000936026815e-05, + "loss": 0.3135, + "step": 8078 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4973846673965454, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.2204, + "step": 8080 + }, + { + "epoch": 0.16164, + "grad_norm": 0.8633642792701721, + "learning_rate": 1.9769413514842458e-05, + "loss": 0.2373, + "step": 8082 + }, + { + "epoch": 0.16168, + "grad_norm": 1.824936866760254, + "learning_rate": 1.9769115306437087e-05, + "loss": 0.2596, + "step": 8084 + }, + { + "epoch": 0.16172, + "grad_norm": 1.9683568477630615, + "learning_rate": 1.976881690757778e-05, + "loss": 0.2508, + "step": 8086 + }, + { + "epoch": 0.16176, + "grad_norm": 0.3019770085811615, + "learning_rate": 1.976851831827036e-05, + "loss": 0.0643, + "step": 8088 + }, + { + "epoch": 0.1618, + "grad_norm": 1.7292356491088867, + "learning_rate": 1.976821953852065e-05, + "loss": 0.154, + "step": 8090 + }, + { + "epoch": 0.16184, + "grad_norm": 1.3898652791976929, + "learning_rate": 1.9767920568334468e-05, + "loss": 0.3551, + "step": 8092 + }, + { + "epoch": 0.16188, + "grad_norm": 1.5000704526901245, + "learning_rate": 1.9767621407717647e-05, + "loss": 0.2329, + "step": 8094 + }, + { + "epoch": 0.16192, + "grad_norm": 1.7535135746002197, + "learning_rate": 1.9767322056676018e-05, + "loss": 0.2664, + "step": 8096 + }, + { + "epoch": 0.16196, + "grad_norm": 1.504202961921692, + "learning_rate": 1.9767022515215416e-05, + "loss": 0.1844, + "step": 8098 + }, + { + "epoch": 0.162, + "grad_norm": 1.2930299043655396, + "learning_rate": 1.9766722783341682e-05, + "loss": 0.1711, + "step": 8100 + }, + { + "epoch": 0.16204, + "grad_norm": 2.1406443119049072, + "learning_rate": 1.9766422861060658e-05, + "loss": 0.2597, + "step": 8102 + }, + { + "epoch": 0.16208, + "grad_norm": 1.6144795417785645, + "learning_rate": 1.976612274837819e-05, + "loss": 0.2126, + "step": 8104 + }, + { + "epoch": 0.16212, + "grad_norm": 1.357349157333374, + "learning_rate": 1.9765822445300138e-05, + "loss": 0.2147, + "step": 8106 + }, + { + "epoch": 0.16216, + "grad_norm": 0.6329825520515442, + "learning_rate": 1.9765521951832346e-05, + "loss": 0.1588, + "step": 8108 + }, + { + "epoch": 0.1622, + "grad_norm": 0.852053701877594, + "learning_rate": 1.9765221267980675e-05, + "loss": 0.1554, + "step": 8110 + }, + { + "epoch": 0.16224, + "grad_norm": 1.133777141571045, + "learning_rate": 1.976492039375099e-05, + "loss": 0.1236, + "step": 8112 + }, + { + "epoch": 0.16228, + "grad_norm": 1.8188071250915527, + "learning_rate": 1.9764619329149152e-05, + "loss": 0.2961, + "step": 8114 + }, + { + "epoch": 0.16232, + "grad_norm": 0.518783450126648, + "learning_rate": 1.9764318074181034e-05, + "loss": 0.0889, + "step": 8116 + }, + { + "epoch": 0.16236, + "grad_norm": 1.83390474319458, + "learning_rate": 1.9764016628852506e-05, + "loss": 0.222, + "step": 8118 + }, + { + "epoch": 0.1624, + "grad_norm": 2.2848639488220215, + "learning_rate": 1.976371499316945e-05, + "loss": 0.2726, + "step": 8120 + }, + { + "epoch": 0.16244, + "grad_norm": 1.3829045295715332, + "learning_rate": 1.976341316713774e-05, + "loss": 0.139, + "step": 8122 + }, + { + "epoch": 0.16248, + "grad_norm": 0.3764301836490631, + "learning_rate": 1.976311115076327e-05, + "loss": 0.0553, + "step": 8124 + }, + { + "epoch": 0.16252, + "grad_norm": 1.1030856370925903, + "learning_rate": 1.976280894405192e-05, + "loss": 0.4016, + "step": 8126 + }, + { + "epoch": 0.16256, + "grad_norm": 1.0532886981964111, + "learning_rate": 1.976250654700958e-05, + "loss": 0.1966, + "step": 8128 + }, + { + "epoch": 0.1626, + "grad_norm": 0.7048934698104858, + "learning_rate": 1.976220395964215e-05, + "loss": 0.2203, + "step": 8130 + }, + { + "epoch": 0.16264, + "grad_norm": 1.8956159353256226, + "learning_rate": 1.976190118195553e-05, + "loss": 0.4792, + "step": 8132 + }, + { + "epoch": 0.16268, + "grad_norm": 1.876881718635559, + "learning_rate": 1.976159821395562e-05, + "loss": 0.3508, + "step": 8134 + }, + { + "epoch": 0.16272, + "grad_norm": 1.0276154279708862, + "learning_rate": 1.9761295055648323e-05, + "loss": 0.1516, + "step": 8136 + }, + { + "epoch": 0.16276, + "grad_norm": 0.9506011605262756, + "learning_rate": 1.9760991707039555e-05, + "loss": 0.2485, + "step": 8138 + }, + { + "epoch": 0.1628, + "grad_norm": 1.366457462310791, + "learning_rate": 1.9760688168135233e-05, + "loss": 0.2232, + "step": 8140 + }, + { + "epoch": 0.16284, + "grad_norm": 0.6617511510848999, + "learning_rate": 1.9760384438941266e-05, + "loss": 0.1441, + "step": 8142 + }, + { + "epoch": 0.16288, + "grad_norm": 0.8019676208496094, + "learning_rate": 1.976008051946358e-05, + "loss": 0.106, + "step": 8144 + }, + { + "epoch": 0.16292, + "grad_norm": 1.4459539651870728, + "learning_rate": 1.97597764097081e-05, + "loss": 0.2147, + "step": 8146 + }, + { + "epoch": 0.16296, + "grad_norm": 0.9426968693733215, + "learning_rate": 1.9759472109680754e-05, + "loss": 0.1175, + "step": 8148 + }, + { + "epoch": 0.163, + "grad_norm": 2.2441742420196533, + "learning_rate": 1.9759167619387474e-05, + "loss": 0.296, + "step": 8150 + }, + { + "epoch": 0.16304, + "grad_norm": 2.490241289138794, + "learning_rate": 1.97588629388342e-05, + "loss": 0.2592, + "step": 8152 + }, + { + "epoch": 0.16308, + "grad_norm": 2.1092092990875244, + "learning_rate": 1.975855806802687e-05, + "loss": 0.211, + "step": 8154 + }, + { + "epoch": 0.16312, + "grad_norm": 0.7358099222183228, + "learning_rate": 1.9758253006971418e-05, + "loss": 0.1011, + "step": 8156 + }, + { + "epoch": 0.16316, + "grad_norm": 2.1907596588134766, + "learning_rate": 1.9757947755673804e-05, + "loss": 0.2289, + "step": 8158 + }, + { + "epoch": 0.1632, + "grad_norm": 0.9307940602302551, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.0948, + "step": 8160 + }, + { + "epoch": 0.16324, + "grad_norm": 1.2612080574035645, + "learning_rate": 1.9757336682375888e-05, + "loss": 0.1986, + "step": 8162 + }, + { + "epoch": 0.16328, + "grad_norm": 2.200049877166748, + "learning_rate": 1.97570308603875e-05, + "loss": 0.2556, + "step": 8164 + }, + { + "epoch": 0.16332, + "grad_norm": 2.0361533164978027, + "learning_rate": 1.9756724848180767e-05, + "loss": 0.2277, + "step": 8166 + }, + { + "epoch": 0.16336, + "grad_norm": 2.06634259223938, + "learning_rate": 1.975641864576166e-05, + "loss": 0.3831, + "step": 8168 + }, + { + "epoch": 0.1634, + "grad_norm": 1.9153774976730347, + "learning_rate": 1.9756112253136154e-05, + "loss": 0.2534, + "step": 8170 + }, + { + "epoch": 0.16344, + "grad_norm": 2.2348759174346924, + "learning_rate": 1.975580567031021e-05, + "loss": 0.2375, + "step": 8172 + }, + { + "epoch": 0.16348, + "grad_norm": 1.2152385711669922, + "learning_rate": 1.9755498897289816e-05, + "loss": 0.1369, + "step": 8174 + }, + { + "epoch": 0.16352, + "grad_norm": 1.2241684198379517, + "learning_rate": 1.975519193408095e-05, + "loss": 0.1986, + "step": 8176 + }, + { + "epoch": 0.16356, + "grad_norm": 0.934434175491333, + "learning_rate": 1.9754884780689592e-05, + "loss": 0.1175, + "step": 8178 + }, + { + "epoch": 0.1636, + "grad_norm": 2.8012282848358154, + "learning_rate": 1.9754577437121733e-05, + "loss": 0.3635, + "step": 8180 + }, + { + "epoch": 0.16364, + "grad_norm": 1.7608389854431152, + "learning_rate": 1.9754269903383366e-05, + "loss": 0.277, + "step": 8182 + }, + { + "epoch": 0.16368, + "grad_norm": 1.4734357595443726, + "learning_rate": 1.9753962179480485e-05, + "loss": 0.2919, + "step": 8184 + }, + { + "epoch": 0.16372, + "grad_norm": 1.311202883720398, + "learning_rate": 1.975365426541909e-05, + "loss": 0.176, + "step": 8186 + }, + { + "epoch": 0.16376, + "grad_norm": 2.584611654281616, + "learning_rate": 1.975334616120518e-05, + "loss": 0.2999, + "step": 8188 + }, + { + "epoch": 0.1638, + "grad_norm": 1.7681434154510498, + "learning_rate": 1.975303786684477e-05, + "loss": 0.1487, + "step": 8190 + }, + { + "epoch": 0.16384, + "grad_norm": 2.492894411087036, + "learning_rate": 1.9752729382343866e-05, + "loss": 0.3354, + "step": 8192 + }, + { + "epoch": 0.16388, + "grad_norm": 1.8861433267593384, + "learning_rate": 1.9752420707708478e-05, + "loss": 0.1812, + "step": 8194 + }, + { + "epoch": 0.16392, + "grad_norm": 1.4414589405059814, + "learning_rate": 1.975211184294463e-05, + "loss": 0.47, + "step": 8196 + }, + { + "epoch": 0.16396, + "grad_norm": 1.7751636505126953, + "learning_rate": 1.975180278805834e-05, + "loss": 0.3429, + "step": 8198 + }, + { + "epoch": 0.164, + "grad_norm": 1.0772689580917358, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.1956, + "step": 8200 + }, + { + "epoch": 0.16404, + "grad_norm": 1.585402488708496, + "learning_rate": 1.975118410794254e-05, + "loss": 0.2592, + "step": 8202 + }, + { + "epoch": 0.16408, + "grad_norm": 1.4508670568466187, + "learning_rate": 1.9750874482725093e-05, + "loss": 0.2663, + "step": 8204 + }, + { + "epoch": 0.16412, + "grad_norm": 1.504185676574707, + "learning_rate": 1.975056466740933e-05, + "loss": 0.1687, + "step": 8206 + }, + { + "epoch": 0.16416, + "grad_norm": 1.5024521350860596, + "learning_rate": 1.9750254662001284e-05, + "loss": 0.2025, + "step": 8208 + }, + { + "epoch": 0.1642, + "grad_norm": 0.49100741744041443, + "learning_rate": 1.9749944466507007e-05, + "loss": 0.0519, + "step": 8210 + }, + { + "epoch": 0.16424, + "grad_norm": 1.2239534854888916, + "learning_rate": 1.9749634080932542e-05, + "loss": 0.139, + "step": 8212 + }, + { + "epoch": 0.16428, + "grad_norm": 1.865170955657959, + "learning_rate": 1.974932350528394e-05, + "loss": 0.2544, + "step": 8214 + }, + { + "epoch": 0.16432, + "grad_norm": 1.0865505933761597, + "learning_rate": 1.9749012739567258e-05, + "loss": 0.1856, + "step": 8216 + }, + { + "epoch": 0.16436, + "grad_norm": 1.3902794122695923, + "learning_rate": 1.9748701783788557e-05, + "loss": 0.2176, + "step": 8218 + }, + { + "epoch": 0.1644, + "grad_norm": 2.183964252471924, + "learning_rate": 1.974839063795389e-05, + "loss": 0.3405, + "step": 8220 + }, + { + "epoch": 0.16444, + "grad_norm": 0.8577554225921631, + "learning_rate": 1.974807930206933e-05, + "loss": 0.1057, + "step": 8222 + }, + { + "epoch": 0.16448, + "grad_norm": 1.4907234907150269, + "learning_rate": 1.974776777614095e-05, + "loss": 0.2145, + "step": 8224 + }, + { + "epoch": 0.16452, + "grad_norm": 0.5178972482681274, + "learning_rate": 1.9747456060174813e-05, + "loss": 0.0454, + "step": 8226 + }, + { + "epoch": 0.16456, + "grad_norm": 0.8638200163841248, + "learning_rate": 1.9747144154177005e-05, + "loss": 0.0799, + "step": 8228 + }, + { + "epoch": 0.1646, + "grad_norm": 1.1951395273208618, + "learning_rate": 1.9746832058153602e-05, + "loss": 0.1685, + "step": 8230 + }, + { + "epoch": 0.16464, + "grad_norm": 1.294569969177246, + "learning_rate": 1.9746519772110688e-05, + "loss": 0.0989, + "step": 8232 + }, + { + "epoch": 0.16468, + "grad_norm": 0.29972395300865173, + "learning_rate": 1.9746207296054356e-05, + "loss": 0.3697, + "step": 8234 + }, + { + "epoch": 0.16472, + "grad_norm": 2.4575560092926025, + "learning_rate": 1.9745894629990696e-05, + "loss": 0.3401, + "step": 8236 + }, + { + "epoch": 0.16476, + "grad_norm": 2.4175679683685303, + "learning_rate": 1.9745581773925802e-05, + "loss": 0.4087, + "step": 8238 + }, + { + "epoch": 0.1648, + "grad_norm": 1.0271929502487183, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.1516, + "step": 8240 + }, + { + "epoch": 0.16484, + "grad_norm": 0.69126957654953, + "learning_rate": 1.9744955491816713e-05, + "loss": 0.122, + "step": 8242 + }, + { + "epoch": 0.16488, + "grad_norm": 0.5979064106941223, + "learning_rate": 1.9744642065784728e-05, + "loss": 0.1502, + "step": 8244 + }, + { + "epoch": 0.16492, + "grad_norm": 0.791756808757782, + "learning_rate": 1.974432844977593e-05, + "loss": 0.1636, + "step": 8246 + }, + { + "epoch": 0.16496, + "grad_norm": 0.45649150013923645, + "learning_rate": 1.9744014643796435e-05, + "loss": 0.2916, + "step": 8248 + }, + { + "epoch": 0.165, + "grad_norm": 0.5630388259887695, + "learning_rate": 1.9743700647852356e-05, + "loss": 0.0499, + "step": 8250 + }, + { + "epoch": 0.16504, + "grad_norm": 3.353104591369629, + "learning_rate": 1.9743386461949814e-05, + "loss": 0.4869, + "step": 8252 + }, + { + "epoch": 0.16508, + "grad_norm": 0.42473313212394714, + "learning_rate": 1.974307208609494e-05, + "loss": 0.0353, + "step": 8254 + }, + { + "epoch": 0.16512, + "grad_norm": 2.8483269214630127, + "learning_rate": 1.9742757520293856e-05, + "loss": 0.3395, + "step": 8256 + }, + { + "epoch": 0.16516, + "grad_norm": 2.6911370754241943, + "learning_rate": 1.97424427645527e-05, + "loss": 0.2431, + "step": 8258 + }, + { + "epoch": 0.1652, + "grad_norm": 1.367743730545044, + "learning_rate": 1.9742127818877605e-05, + "loss": 0.1176, + "step": 8260 + }, + { + "epoch": 0.16524, + "grad_norm": 0.4545295536518097, + "learning_rate": 1.9741812683274716e-05, + "loss": 0.1512, + "step": 8262 + }, + { + "epoch": 0.16528, + "grad_norm": 1.8575947284698486, + "learning_rate": 1.974149735775017e-05, + "loss": 0.1636, + "step": 8264 + }, + { + "epoch": 0.16532, + "grad_norm": 1.1169945001602173, + "learning_rate": 1.9741181842310123e-05, + "loss": 0.1173, + "step": 8266 + }, + { + "epoch": 0.16536, + "grad_norm": 1.8746329545974731, + "learning_rate": 1.9740866136960718e-05, + "loss": 0.2121, + "step": 8268 + }, + { + "epoch": 0.1654, + "grad_norm": 1.2426767349243164, + "learning_rate": 1.974055024170811e-05, + "loss": 0.1439, + "step": 8270 + }, + { + "epoch": 0.16544, + "grad_norm": 0.8974487781524658, + "learning_rate": 1.9740234156558463e-05, + "loss": 0.0804, + "step": 8272 + }, + { + "epoch": 0.16548, + "grad_norm": 2.0527565479278564, + "learning_rate": 1.9739917881517936e-05, + "loss": 0.2898, + "step": 8274 + }, + { + "epoch": 0.16552, + "grad_norm": 1.0517388582229614, + "learning_rate": 1.9739601416592693e-05, + "loss": 0.169, + "step": 8276 + }, + { + "epoch": 0.16556, + "grad_norm": 1.5912946462631226, + "learning_rate": 1.973928476178891e-05, + "loss": 0.1248, + "step": 8278 + }, + { + "epoch": 0.1656, + "grad_norm": 1.3403985500335693, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.1238, + "step": 8280 + }, + { + "epoch": 0.16564, + "grad_norm": 3.4792838096618652, + "learning_rate": 1.9738650882570405e-05, + "loss": 0.3472, + "step": 8282 + }, + { + "epoch": 0.16568, + "grad_norm": 0.4769012928009033, + "learning_rate": 1.9738333658168047e-05, + "loss": 0.16, + "step": 8284 + }, + { + "epoch": 0.16572, + "grad_norm": 2.91790509223938, + "learning_rate": 1.9738016243911855e-05, + "loss": 0.2789, + "step": 8286 + }, + { + "epoch": 0.16576, + "grad_norm": 2.521251916885376, + "learning_rate": 1.9737698639808024e-05, + "loss": 0.1754, + "step": 8288 + }, + { + "epoch": 0.1658, + "grad_norm": 0.6483784914016724, + "learning_rate": 1.9737380845862745e-05, + "loss": 0.0574, + "step": 8290 + }, + { + "epoch": 0.16584, + "grad_norm": 1.1296314001083374, + "learning_rate": 1.9737062862082216e-05, + "loss": 0.1854, + "step": 8292 + }, + { + "epoch": 0.16588, + "grad_norm": 0.6619336009025574, + "learning_rate": 1.973674468847263e-05, + "loss": 0.1269, + "step": 8294 + }, + { + "epoch": 0.16592, + "grad_norm": 3.116197109222412, + "learning_rate": 1.9736426325040194e-05, + "loss": 0.3028, + "step": 8296 + }, + { + "epoch": 0.16596, + "grad_norm": 2.2906885147094727, + "learning_rate": 1.9736107771791117e-05, + "loss": 0.1853, + "step": 8298 + }, + { + "epoch": 0.166, + "grad_norm": 3.5389902591705322, + "learning_rate": 1.9735789028731603e-05, + "loss": 0.3927, + "step": 8300 + }, + { + "epoch": 0.16604, + "grad_norm": 0.8691229820251465, + "learning_rate": 1.9735470095867872e-05, + "loss": 0.13, + "step": 8302 + }, + { + "epoch": 0.16608, + "grad_norm": 1.782021164894104, + "learning_rate": 1.973515097320614e-05, + "loss": 0.1192, + "step": 8304 + }, + { + "epoch": 0.16612, + "grad_norm": 2.8282437324523926, + "learning_rate": 1.9734831660752626e-05, + "loss": 0.2789, + "step": 8306 + }, + { + "epoch": 0.16616, + "grad_norm": 1.3609867095947266, + "learning_rate": 1.9734512158513558e-05, + "loss": 0.1008, + "step": 8308 + }, + { + "epoch": 0.1662, + "grad_norm": 1.121126413345337, + "learning_rate": 1.9734192466495162e-05, + "loss": 0.2719, + "step": 8310 + }, + { + "epoch": 0.16624, + "grad_norm": 3.8234035968780518, + "learning_rate": 1.9733872584703673e-05, + "loss": 0.3256, + "step": 8312 + }, + { + "epoch": 0.16628, + "grad_norm": 0.5418248772621155, + "learning_rate": 1.9733552513145325e-05, + "loss": 0.2286, + "step": 8314 + }, + { + "epoch": 0.16632, + "grad_norm": 1.2296606302261353, + "learning_rate": 1.9733232251826365e-05, + "loss": 0.154, + "step": 8316 + }, + { + "epoch": 0.16636, + "grad_norm": 3.5025715827941895, + "learning_rate": 1.9732911800753028e-05, + "loss": 0.278, + "step": 8318 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4195598363876343, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.0514, + "step": 8320 + }, + { + "epoch": 0.16644, + "grad_norm": 4.073043346405029, + "learning_rate": 1.9732270329368225e-05, + "loss": 0.3426, + "step": 8322 + }, + { + "epoch": 0.16648, + "grad_norm": 5.459471225738525, + "learning_rate": 1.973194930906927e-05, + "loss": 0.493, + "step": 8324 + }, + { + "epoch": 0.16652, + "grad_norm": 0.8880894780158997, + "learning_rate": 1.9731628099040946e-05, + "loss": 0.3435, + "step": 8326 + }, + { + "epoch": 0.16656, + "grad_norm": 1.9826653003692627, + "learning_rate": 1.9731306699289522e-05, + "loss": 0.1441, + "step": 8328 + }, + { + "epoch": 0.1666, + "grad_norm": 1.628638505935669, + "learning_rate": 1.9730985109821268e-05, + "loss": 0.1533, + "step": 8330 + }, + { + "epoch": 0.16664, + "grad_norm": 2.58833646774292, + "learning_rate": 1.9730663330642444e-05, + "loss": 0.2798, + "step": 8332 + }, + { + "epoch": 0.16668, + "grad_norm": 2.3416202068328857, + "learning_rate": 1.9730341361759334e-05, + "loss": 0.1236, + "step": 8334 + }, + { + "epoch": 0.16672, + "grad_norm": 1.692560076713562, + "learning_rate": 1.973001920317821e-05, + "loss": 0.3225, + "step": 8336 + }, + { + "epoch": 0.16676, + "grad_norm": 3.633424758911133, + "learning_rate": 1.972969685490535e-05, + "loss": 0.2591, + "step": 8338 + }, + { + "epoch": 0.1668, + "grad_norm": 29.219867706298828, + "learning_rate": 1.972937431694704e-05, + "loss": 0.4392, + "step": 8340 + }, + { + "epoch": 0.16684, + "grad_norm": 1.119402527809143, + "learning_rate": 1.9729051589309573e-05, + "loss": 0.1463, + "step": 8342 + }, + { + "epoch": 0.16688, + "grad_norm": 1.3838011026382446, + "learning_rate": 1.972872867199923e-05, + "loss": 0.0988, + "step": 8344 + }, + { + "epoch": 0.16692, + "grad_norm": 2.657405376434326, + "learning_rate": 1.9728405565022316e-05, + "loss": 0.2006, + "step": 8346 + }, + { + "epoch": 0.16696, + "grad_norm": 0.3336063623428345, + "learning_rate": 1.9728082268385126e-05, + "loss": 0.0471, + "step": 8348 + }, + { + "epoch": 0.167, + "grad_norm": 1.242652416229248, + "learning_rate": 1.972775878209397e-05, + "loss": 0.2481, + "step": 8350 + }, + { + "epoch": 0.16704, + "grad_norm": 1.5510144233703613, + "learning_rate": 1.972743510615514e-05, + "loss": 0.314, + "step": 8352 + }, + { + "epoch": 0.16708, + "grad_norm": 2.9873647689819336, + "learning_rate": 1.9727111240574958e-05, + "loss": 0.2839, + "step": 8354 + }, + { + "epoch": 0.16712, + "grad_norm": 0.8954054117202759, + "learning_rate": 1.9726787185359733e-05, + "loss": 0.232, + "step": 8356 + }, + { + "epoch": 0.16716, + "grad_norm": 2.307460069656372, + "learning_rate": 1.9726462940515787e-05, + "loss": 0.2882, + "step": 8358 + }, + { + "epoch": 0.1672, + "grad_norm": 0.7393632531166077, + "learning_rate": 1.9726138506049438e-05, + "loss": 0.0963, + "step": 8360 + }, + { + "epoch": 0.16724, + "grad_norm": 1.8359404802322388, + "learning_rate": 1.972581388196701e-05, + "loss": 0.1642, + "step": 8362 + }, + { + "epoch": 0.16728, + "grad_norm": 1.9769220352172852, + "learning_rate": 1.9725489068274833e-05, + "loss": 0.1877, + "step": 8364 + }, + { + "epoch": 0.16732, + "grad_norm": 3.384402275085449, + "learning_rate": 1.9725164064979242e-05, + "loss": 0.3051, + "step": 8366 + }, + { + "epoch": 0.16736, + "grad_norm": 2.0043246746063232, + "learning_rate": 1.9724838872086568e-05, + "loss": 0.1744, + "step": 8368 + }, + { + "epoch": 0.1674, + "grad_norm": 1.1485899686813354, + "learning_rate": 1.9724513489603153e-05, + "loss": 0.2718, + "step": 8370 + }, + { + "epoch": 0.16744, + "grad_norm": 2.6848323345184326, + "learning_rate": 1.9724187917535343e-05, + "loss": 0.4086, + "step": 8372 + }, + { + "epoch": 0.16748, + "grad_norm": 2.606077194213867, + "learning_rate": 1.972386215588948e-05, + "loss": 0.2041, + "step": 8374 + }, + { + "epoch": 0.16752, + "grad_norm": 1.0046160221099854, + "learning_rate": 1.9723536204671922e-05, + "loss": 0.1461, + "step": 8376 + }, + { + "epoch": 0.16756, + "grad_norm": 2.467046022415161, + "learning_rate": 1.972321006388902e-05, + "loss": 0.2054, + "step": 8378 + }, + { + "epoch": 0.1676, + "grad_norm": 1.7576063871383667, + "learning_rate": 1.9722883733547128e-05, + "loss": 0.3101, + "step": 8380 + }, + { + "epoch": 0.16764, + "grad_norm": 2.6923680305480957, + "learning_rate": 1.9722557213652615e-05, + "loss": 0.3952, + "step": 8382 + }, + { + "epoch": 0.16768, + "grad_norm": 0.8899527788162231, + "learning_rate": 1.9722230504211843e-05, + "loss": 0.1973, + "step": 8384 + }, + { + "epoch": 0.16772, + "grad_norm": 1.104920744895935, + "learning_rate": 1.972190360523118e-05, + "loss": 0.0991, + "step": 8386 + }, + { + "epoch": 0.16776, + "grad_norm": 1.3746849298477173, + "learning_rate": 1.9721576516717005e-05, + "loss": 0.1369, + "step": 8388 + }, + { + "epoch": 0.1678, + "grad_norm": 0.841555655002594, + "learning_rate": 1.9721249238675688e-05, + "loss": 0.1483, + "step": 8390 + }, + { + "epoch": 0.16784, + "grad_norm": 2.22682523727417, + "learning_rate": 1.972092177111361e-05, + "loss": 0.1968, + "step": 8392 + }, + { + "epoch": 0.16788, + "grad_norm": 0.5303301811218262, + "learning_rate": 1.9720594114037163e-05, + "loss": 0.0892, + "step": 8394 + }, + { + "epoch": 0.16792, + "grad_norm": 1.5012644529342651, + "learning_rate": 1.972026626745273e-05, + "loss": 0.2637, + "step": 8396 + }, + { + "epoch": 0.16796, + "grad_norm": 1.9160635471343994, + "learning_rate": 1.9719938231366695e-05, + "loss": 0.254, + "step": 8398 + }, + { + "epoch": 0.168, + "grad_norm": 2.5612833499908447, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.2486, + "step": 8400 + }, + { + "epoch": 0.16804, + "grad_norm": 1.4133002758026123, + "learning_rate": 1.9719281590715432e-05, + "loss": 0.1267, + "step": 8402 + }, + { + "epoch": 0.16808, + "grad_norm": 3.549025535583496, + "learning_rate": 1.9718952986163e-05, + "loss": 0.4159, + "step": 8404 + }, + { + "epoch": 0.16812, + "grad_norm": 1.4673798084259033, + "learning_rate": 1.9718624192134578e-05, + "loss": 0.1443, + "step": 8406 + }, + { + "epoch": 0.16816, + "grad_norm": 1.947542428970337, + "learning_rate": 1.971829520863657e-05, + "loss": 0.1498, + "step": 8408 + }, + { + "epoch": 0.1682, + "grad_norm": 0.9962881207466125, + "learning_rate": 1.97179660356754e-05, + "loss": 0.0863, + "step": 8410 + }, + { + "epoch": 0.16824, + "grad_norm": 2.676340103149414, + "learning_rate": 1.9717636673257475e-05, + "loss": 0.1929, + "step": 8412 + }, + { + "epoch": 0.16828, + "grad_norm": 0.5845295190811157, + "learning_rate": 1.9717307121389218e-05, + "loss": 0.0439, + "step": 8414 + }, + { + "epoch": 0.16832, + "grad_norm": 1.9048699140548706, + "learning_rate": 1.9716977380077058e-05, + "loss": 0.3635, + "step": 8416 + }, + { + "epoch": 0.16836, + "grad_norm": 1.1131837368011475, + "learning_rate": 1.9716647449327423e-05, + "loss": 0.3712, + "step": 8418 + }, + { + "epoch": 0.1684, + "grad_norm": 0.3347177505493164, + "learning_rate": 1.971631732914674e-05, + "loss": 0.2049, + "step": 8420 + }, + { + "epoch": 0.16844, + "grad_norm": 2.5722405910491943, + "learning_rate": 1.971598701954145e-05, + "loss": 0.2666, + "step": 8422 + }, + { + "epoch": 0.16848, + "grad_norm": 0.34871533513069153, + "learning_rate": 1.9715656520517993e-05, + "loss": 0.0717, + "step": 8424 + }, + { + "epoch": 0.16852, + "grad_norm": 1.4629141092300415, + "learning_rate": 1.9715325832082808e-05, + "loss": 0.122, + "step": 8426 + }, + { + "epoch": 0.16856, + "grad_norm": 1.5437208414077759, + "learning_rate": 1.9714994954242345e-05, + "loss": 0.2026, + "step": 8428 + }, + { + "epoch": 0.1686, + "grad_norm": 2.1952977180480957, + "learning_rate": 1.9714663887003055e-05, + "loss": 0.2088, + "step": 8430 + }, + { + "epoch": 0.16864, + "grad_norm": 0.6715924739837646, + "learning_rate": 1.9714332630371385e-05, + "loss": 0.1841, + "step": 8432 + }, + { + "epoch": 0.16868, + "grad_norm": 1.2594362497329712, + "learning_rate": 1.9714001184353805e-05, + "loss": 0.1192, + "step": 8434 + }, + { + "epoch": 0.16872, + "grad_norm": 2.192284107208252, + "learning_rate": 1.971366954895677e-05, + "loss": 0.2769, + "step": 8436 + }, + { + "epoch": 0.16876, + "grad_norm": 1.940945029258728, + "learning_rate": 1.9713337724186743e-05, + "loss": 0.1796, + "step": 8438 + }, + { + "epoch": 0.1688, + "grad_norm": 2.237771987915039, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.2067, + "step": 8440 + }, + { + "epoch": 0.16884, + "grad_norm": 3.681600332260132, + "learning_rate": 1.971267350655361e-05, + "loss": 0.5851, + "step": 8442 + }, + { + "epoch": 0.16888, + "grad_norm": 2.0498359203338623, + "learning_rate": 1.971234111370345e-05, + "loss": 0.3403, + "step": 8444 + }, + { + "epoch": 0.16892, + "grad_norm": 2.011526584625244, + "learning_rate": 1.9712008531506197e-05, + "loss": 0.2179, + "step": 8446 + }, + { + "epoch": 0.16896, + "grad_norm": 1.5331997871398926, + "learning_rate": 1.971167575996834e-05, + "loss": 0.2037, + "step": 8448 + }, + { + "epoch": 0.169, + "grad_norm": 0.8605691194534302, + "learning_rate": 1.971134279909636e-05, + "loss": 0.144, + "step": 8450 + }, + { + "epoch": 0.16904, + "grad_norm": 1.3581657409667969, + "learning_rate": 1.9711009648896758e-05, + "loss": 0.1929, + "step": 8452 + }, + { + "epoch": 0.16908, + "grad_norm": 2.213467836380005, + "learning_rate": 1.971067630937602e-05, + "loss": 0.2921, + "step": 8454 + }, + { + "epoch": 0.16912, + "grad_norm": 2.656189203262329, + "learning_rate": 1.971034278054065e-05, + "loss": 0.339, + "step": 8456 + }, + { + "epoch": 0.16916, + "grad_norm": 0.6040170192718506, + "learning_rate": 1.9710009062397147e-05, + "loss": 0.0888, + "step": 8458 + }, + { + "epoch": 0.1692, + "grad_norm": 1.123580813407898, + "learning_rate": 1.9709675154952017e-05, + "loss": 0.0991, + "step": 8460 + }, + { + "epoch": 0.16924, + "grad_norm": 0.7978579998016357, + "learning_rate": 1.9709341058211773e-05, + "loss": 0.1975, + "step": 8462 + }, + { + "epoch": 0.16928, + "grad_norm": 0.3744829595088959, + "learning_rate": 1.9709006772182926e-05, + "loss": 0.1482, + "step": 8464 + }, + { + "epoch": 0.16932, + "grad_norm": 0.5124669671058655, + "learning_rate": 1.9708672296871993e-05, + "loss": 0.0989, + "step": 8466 + }, + { + "epoch": 0.16936, + "grad_norm": 0.9868833422660828, + "learning_rate": 1.9708337632285494e-05, + "loss": 0.1053, + "step": 8468 + }, + { + "epoch": 0.1694, + "grad_norm": 2.299025058746338, + "learning_rate": 1.9708002778429957e-05, + "loss": 0.3803, + "step": 8470 + }, + { + "epoch": 0.16944, + "grad_norm": 1.2852928638458252, + "learning_rate": 1.9707667735311908e-05, + "loss": 0.3716, + "step": 8472 + }, + { + "epoch": 0.16948, + "grad_norm": 0.9805229306221008, + "learning_rate": 1.9707332502937875e-05, + "loss": 0.0829, + "step": 8474 + }, + { + "epoch": 0.16952, + "grad_norm": 2.334162712097168, + "learning_rate": 1.9706997081314402e-05, + "loss": 0.247, + "step": 8476 + }, + { + "epoch": 0.16956, + "grad_norm": 0.45164215564727783, + "learning_rate": 1.970666147044802e-05, + "loss": 0.1036, + "step": 8478 + }, + { + "epoch": 0.1696, + "grad_norm": 0.828177273273468, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.2233, + "step": 8480 + }, + { + "epoch": 0.16964, + "grad_norm": 1.1640197038650513, + "learning_rate": 1.9705989681012715e-05, + "loss": 0.2176, + "step": 8482 + }, + { + "epoch": 0.16968, + "grad_norm": 2.0466690063476562, + "learning_rate": 1.970565350245689e-05, + "loss": 0.2687, + "step": 8484 + }, + { + "epoch": 0.16972, + "grad_norm": 0.6164987683296204, + "learning_rate": 1.9705317134684353e-05, + "loss": 0.0891, + "step": 8486 + }, + { + "epoch": 0.16976, + "grad_norm": 1.6209001541137695, + "learning_rate": 1.970498057770166e-05, + "loss": 0.2232, + "step": 8488 + }, + { + "epoch": 0.1698, + "grad_norm": 0.3647451400756836, + "learning_rate": 1.9704643831515377e-05, + "loss": 0.0324, + "step": 8490 + }, + { + "epoch": 0.16984, + "grad_norm": 0.3428860604763031, + "learning_rate": 1.9704306896132063e-05, + "loss": 0.1915, + "step": 8492 + }, + { + "epoch": 0.16988, + "grad_norm": 3.6789450645446777, + "learning_rate": 1.9703969771558295e-05, + "loss": 0.4981, + "step": 8494 + }, + { + "epoch": 0.16992, + "grad_norm": 0.9967862367630005, + "learning_rate": 1.9703632457800633e-05, + "loss": 0.0953, + "step": 8496 + }, + { + "epoch": 0.16996, + "grad_norm": 0.6218488216400146, + "learning_rate": 1.9703294954865668e-05, + "loss": 0.14, + "step": 8498 + }, + { + "epoch": 0.17, + "grad_norm": 0.4217708706855774, + "learning_rate": 1.9702957262759964e-05, + "loss": 0.2563, + "step": 8500 + }, + { + "epoch": 0.17004, + "grad_norm": 0.6579639911651611, + "learning_rate": 1.970261938149012e-05, + "loss": 0.285, + "step": 8502 + }, + { + "epoch": 0.17008, + "grad_norm": 2.3286211490631104, + "learning_rate": 1.9702281311062713e-05, + "loss": 0.2428, + "step": 8504 + }, + { + "epoch": 0.17012, + "grad_norm": 2.0184123516082764, + "learning_rate": 1.970194305148434e-05, + "loss": 0.2379, + "step": 8506 + }, + { + "epoch": 0.17016, + "grad_norm": 1.9636340141296387, + "learning_rate": 1.9701604602761587e-05, + "loss": 0.1805, + "step": 8508 + }, + { + "epoch": 0.1702, + "grad_norm": 0.9109874963760376, + "learning_rate": 1.970126596490106e-05, + "loss": 0.1177, + "step": 8510 + }, + { + "epoch": 0.17024, + "grad_norm": 2.3757901191711426, + "learning_rate": 1.9700927137909358e-05, + "loss": 0.3059, + "step": 8512 + }, + { + "epoch": 0.17028, + "grad_norm": 0.9571641087532043, + "learning_rate": 1.9700588121793088e-05, + "loss": 0.2031, + "step": 8514 + }, + { + "epoch": 0.17032, + "grad_norm": 2.252885103225708, + "learning_rate": 1.9700248916558858e-05, + "loss": 0.2467, + "step": 8516 + }, + { + "epoch": 0.17036, + "grad_norm": 1.8445241451263428, + "learning_rate": 1.9699909522213283e-05, + "loss": 0.3508, + "step": 8518 + }, + { + "epoch": 0.1704, + "grad_norm": 1.40941321849823, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.1842, + "step": 8520 + }, + { + "epoch": 0.17044, + "grad_norm": 1.703148603439331, + "learning_rate": 1.9699230166214557e-05, + "loss": 0.3461, + "step": 8522 + }, + { + "epoch": 0.17048, + "grad_norm": 1.8534059524536133, + "learning_rate": 1.9698890204574657e-05, + "loss": 0.2557, + "step": 8524 + }, + { + "epoch": 0.17052, + "grad_norm": 1.4212347269058228, + "learning_rate": 1.9698550053849897e-05, + "loss": 0.2126, + "step": 8526 + }, + { + "epoch": 0.17056, + "grad_norm": 1.7469663619995117, + "learning_rate": 1.969820971404691e-05, + "loss": 0.3427, + "step": 8528 + }, + { + "epoch": 0.1706, + "grad_norm": 1.3737785816192627, + "learning_rate": 1.969786918517233e-05, + "loss": 0.1635, + "step": 8530 + }, + { + "epoch": 0.17064, + "grad_norm": 2.5126636028289795, + "learning_rate": 1.9697528467232802e-05, + "loss": 0.3147, + "step": 8532 + }, + { + "epoch": 0.17068, + "grad_norm": 1.086611270904541, + "learning_rate": 1.9697187560234963e-05, + "loss": 0.228, + "step": 8534 + }, + { + "epoch": 0.17072, + "grad_norm": 0.3428117036819458, + "learning_rate": 1.969684646418546e-05, + "loss": 0.2387, + "step": 8536 + }, + { + "epoch": 0.17076, + "grad_norm": 1.6821885108947754, + "learning_rate": 1.969650517909094e-05, + "loss": 0.2035, + "step": 8538 + }, + { + "epoch": 0.1708, + "grad_norm": 0.7379288673400879, + "learning_rate": 1.969616370495806e-05, + "loss": 0.2237, + "step": 8540 + }, + { + "epoch": 0.17084, + "grad_norm": 2.318110466003418, + "learning_rate": 1.969582204179348e-05, + "loss": 0.357, + "step": 8542 + }, + { + "epoch": 0.17088, + "grad_norm": 0.732948899269104, + "learning_rate": 1.9695480189603856e-05, + "loss": 0.2237, + "step": 8544 + }, + { + "epoch": 0.17092, + "grad_norm": 1.2306780815124512, + "learning_rate": 1.9695138148395854e-05, + "loss": 0.2261, + "step": 8546 + }, + { + "epoch": 0.17096, + "grad_norm": 1.5882400274276733, + "learning_rate": 1.969479591817614e-05, + "loss": 0.1959, + "step": 8548 + }, + { + "epoch": 0.171, + "grad_norm": 2.240323305130005, + "learning_rate": 1.9694453498951392e-05, + "loss": 0.4249, + "step": 8550 + }, + { + "epoch": 0.17104, + "grad_norm": 2.0534796714782715, + "learning_rate": 1.969411089072828e-05, + "loss": 0.3131, + "step": 8552 + }, + { + "epoch": 0.17108, + "grad_norm": 2.0534486770629883, + "learning_rate": 1.9693768093513485e-05, + "loss": 0.3313, + "step": 8554 + }, + { + "epoch": 0.17112, + "grad_norm": 1.8589192628860474, + "learning_rate": 1.969342510731369e-05, + "loss": 0.2773, + "step": 8556 + }, + { + "epoch": 0.17116, + "grad_norm": 0.922418475151062, + "learning_rate": 1.9693081932135585e-05, + "loss": 0.2225, + "step": 8558 + }, + { + "epoch": 0.1712, + "grad_norm": 0.7279538512229919, + "learning_rate": 1.9692738567985853e-05, + "loss": 0.1125, + "step": 8560 + }, + { + "epoch": 0.17124, + "grad_norm": 1.2306997776031494, + "learning_rate": 1.9692395014871196e-05, + "loss": 0.1309, + "step": 8562 + }, + { + "epoch": 0.17128, + "grad_norm": 1.2971431016921997, + "learning_rate": 1.9692051272798304e-05, + "loss": 0.1451, + "step": 8564 + }, + { + "epoch": 0.17132, + "grad_norm": 1.7850440740585327, + "learning_rate": 1.969170734177389e-05, + "loss": 0.1934, + "step": 8566 + }, + { + "epoch": 0.17136, + "grad_norm": 0.6270906329154968, + "learning_rate": 1.9691363221804645e-05, + "loss": 0.1035, + "step": 8568 + }, + { + "epoch": 0.1714, + "grad_norm": 2.8709957599639893, + "learning_rate": 1.9691018912897285e-05, + "loss": 0.6045, + "step": 8570 + }, + { + "epoch": 0.17144, + "grad_norm": 2.2615253925323486, + "learning_rate": 1.9690674415058528e-05, + "loss": 0.4891, + "step": 8572 + }, + { + "epoch": 0.17148, + "grad_norm": 1.4903382062911987, + "learning_rate": 1.969032972829508e-05, + "loss": 0.2239, + "step": 8574 + }, + { + "epoch": 0.17152, + "grad_norm": 1.832077145576477, + "learning_rate": 1.9689984852613664e-05, + "loss": 0.2108, + "step": 8576 + }, + { + "epoch": 0.17156, + "grad_norm": 1.244098424911499, + "learning_rate": 1.9689639788021004e-05, + "loss": 0.1263, + "step": 8578 + }, + { + "epoch": 0.1716, + "grad_norm": 1.510195016860962, + "learning_rate": 1.968929453452383e-05, + "loss": 0.2031, + "step": 8580 + }, + { + "epoch": 0.17164, + "grad_norm": 1.7955676317214966, + "learning_rate": 1.968894909212887e-05, + "loss": 0.2228, + "step": 8582 + }, + { + "epoch": 0.17168, + "grad_norm": 0.6494365930557251, + "learning_rate": 1.9688603460842862e-05, + "loss": 0.1397, + "step": 8584 + }, + { + "epoch": 0.17172, + "grad_norm": 0.4553622603416443, + "learning_rate": 1.9688257640672538e-05, + "loss": 0.1516, + "step": 8586 + }, + { + "epoch": 0.17176, + "grad_norm": 0.8940996527671814, + "learning_rate": 1.9687911631624644e-05, + "loss": 0.1496, + "step": 8588 + }, + { + "epoch": 0.1718, + "grad_norm": 1.655217170715332, + "learning_rate": 1.9687565433705926e-05, + "loss": 0.2182, + "step": 8590 + }, + { + "epoch": 0.17184, + "grad_norm": 0.7951929569244385, + "learning_rate": 1.968721904692313e-05, + "loss": 0.2327, + "step": 8592 + }, + { + "epoch": 0.17188, + "grad_norm": 1.7512520551681519, + "learning_rate": 1.9686872471283012e-05, + "loss": 0.1591, + "step": 8594 + }, + { + "epoch": 0.17192, + "grad_norm": 0.9874667525291443, + "learning_rate": 1.968652570679233e-05, + "loss": 0.2291, + "step": 8596 + }, + { + "epoch": 0.17196, + "grad_norm": 0.4888167679309845, + "learning_rate": 1.9686178753457844e-05, + "loss": 0.1904, + "step": 8598 + }, + { + "epoch": 0.172, + "grad_norm": 0.8956801295280457, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.354, + "step": 8600 + }, + { + "epoch": 0.17204, + "grad_norm": 1.1917521953582764, + "learning_rate": 1.968548428028451e-05, + "loss": 0.1373, + "step": 8602 + }, + { + "epoch": 0.17208, + "grad_norm": 1.3329405784606934, + "learning_rate": 1.96851367604592e-05, + "loss": 0.196, + "step": 8604 + }, + { + "epoch": 0.17212, + "grad_norm": 0.47947609424591064, + "learning_rate": 1.968478905181717e-05, + "loss": 0.0865, + "step": 8606 + }, + { + "epoch": 0.17216, + "grad_norm": 1.386217713356018, + "learning_rate": 1.9684441154365185e-05, + "loss": 0.1674, + "step": 8608 + }, + { + "epoch": 0.1722, + "grad_norm": 2.4257326126098633, + "learning_rate": 1.968409306811004e-05, + "loss": 0.2619, + "step": 8610 + }, + { + "epoch": 0.17224, + "grad_norm": 2.494288206100464, + "learning_rate": 1.968374479305851e-05, + "loss": 0.2374, + "step": 8612 + }, + { + "epoch": 0.17228, + "grad_norm": 0.6135528683662415, + "learning_rate": 1.968339632921739e-05, + "loss": 0.083, + "step": 8614 + }, + { + "epoch": 0.17232, + "grad_norm": 2.4709644317626953, + "learning_rate": 1.9683047676593475e-05, + "loss": 0.2065, + "step": 8616 + }, + { + "epoch": 0.17236, + "grad_norm": 2.9441444873809814, + "learning_rate": 1.968269883519356e-05, + "loss": 0.2201, + "step": 8618 + }, + { + "epoch": 0.1724, + "grad_norm": 2.209080934524536, + "learning_rate": 1.9682349805024447e-05, + "loss": 0.2221, + "step": 8620 + }, + { + "epoch": 0.17244, + "grad_norm": 1.1089035272598267, + "learning_rate": 1.9682000586092937e-05, + "loss": 0.2629, + "step": 8622 + }, + { + "epoch": 0.17248, + "grad_norm": 1.6689319610595703, + "learning_rate": 1.9681651178405844e-05, + "loss": 0.2064, + "step": 8624 + }, + { + "epoch": 0.17252, + "grad_norm": 0.8410994410514832, + "learning_rate": 1.9681301581969975e-05, + "loss": 0.1271, + "step": 8626 + }, + { + "epoch": 0.17256, + "grad_norm": 0.9935634732246399, + "learning_rate": 1.9680951796792146e-05, + "loss": 0.2153, + "step": 8628 + }, + { + "epoch": 0.1726, + "grad_norm": 2.301988124847412, + "learning_rate": 1.968060182287918e-05, + "loss": 0.2347, + "step": 8630 + }, + { + "epoch": 0.17264, + "grad_norm": 1.5056508779525757, + "learning_rate": 1.9680251660237898e-05, + "loss": 0.1843, + "step": 8632 + }, + { + "epoch": 0.17268, + "grad_norm": 1.1811585426330566, + "learning_rate": 1.9679901308875125e-05, + "loss": 0.1406, + "step": 8634 + }, + { + "epoch": 0.17272, + "grad_norm": 3.6611485481262207, + "learning_rate": 1.967955076879769e-05, + "loss": 0.4396, + "step": 8636 + }, + { + "epoch": 0.17276, + "grad_norm": 0.693716287612915, + "learning_rate": 1.9679200040012433e-05, + "loss": 0.0683, + "step": 8638 + }, + { + "epoch": 0.1728, + "grad_norm": 1.542188286781311, + "learning_rate": 1.967884912252619e-05, + "loss": 0.1099, + "step": 8640 + }, + { + "epoch": 0.17284, + "grad_norm": 1.0175666809082031, + "learning_rate": 1.9678498016345794e-05, + "loss": 0.0901, + "step": 8642 + }, + { + "epoch": 0.17288, + "grad_norm": 0.2201938033103943, + "learning_rate": 1.9678146721478104e-05, + "loss": 0.2432, + "step": 8644 + }, + { + "epoch": 0.17292, + "grad_norm": 2.9646718502044678, + "learning_rate": 1.9677795237929955e-05, + "loss": 0.4699, + "step": 8646 + }, + { + "epoch": 0.17296, + "grad_norm": 0.9467123746871948, + "learning_rate": 1.9677443565708206e-05, + "loss": 0.3488, + "step": 8648 + }, + { + "epoch": 0.173, + "grad_norm": 1.0129896402359009, + "learning_rate": 1.9677091704819714e-05, + "loss": 0.0648, + "step": 8650 + }, + { + "epoch": 0.17304, + "grad_norm": 2.1551315784454346, + "learning_rate": 1.9676739655271336e-05, + "loss": 0.2187, + "step": 8652 + }, + { + "epoch": 0.17308, + "grad_norm": 0.24095018208026886, + "learning_rate": 1.9676387417069938e-05, + "loss": 0.161, + "step": 8654 + }, + { + "epoch": 0.17312, + "grad_norm": 1.5884106159210205, + "learning_rate": 1.9676034990222382e-05, + "loss": 0.2259, + "step": 8656 + }, + { + "epoch": 0.17316, + "grad_norm": 2.213834762573242, + "learning_rate": 1.9675682374735547e-05, + "loss": 0.3696, + "step": 8658 + }, + { + "epoch": 0.1732, + "grad_norm": 1.8113776445388794, + "learning_rate": 1.96753295706163e-05, + "loss": 0.1585, + "step": 8660 + }, + { + "epoch": 0.17324, + "grad_norm": 1.7573916912078857, + "learning_rate": 1.9674976577871523e-05, + "loss": 0.2331, + "step": 8662 + }, + { + "epoch": 0.17328, + "grad_norm": 1.3794773817062378, + "learning_rate": 1.9674623396508095e-05, + "loss": 0.1883, + "step": 8664 + }, + { + "epoch": 0.17332, + "grad_norm": 1.8393738269805908, + "learning_rate": 1.9674270026532904e-05, + "loss": 0.1742, + "step": 8666 + }, + { + "epoch": 0.17336, + "grad_norm": 0.3081510066986084, + "learning_rate": 1.967391646795284e-05, + "loss": 0.1013, + "step": 8668 + }, + { + "epoch": 0.1734, + "grad_norm": 1.015824556350708, + "learning_rate": 1.9673562720774792e-05, + "loss": 0.0679, + "step": 8670 + }, + { + "epoch": 0.17344, + "grad_norm": 1.930190086364746, + "learning_rate": 1.9673208785005658e-05, + "loss": 0.4248, + "step": 8672 + }, + { + "epoch": 0.17348, + "grad_norm": 2.528825521469116, + "learning_rate": 1.967285466065234e-05, + "loss": 0.4241, + "step": 8674 + }, + { + "epoch": 0.17352, + "grad_norm": 1.987520456314087, + "learning_rate": 1.967250034772174e-05, + "loss": 0.1782, + "step": 8676 + }, + { + "epoch": 0.17356, + "grad_norm": 1.6077603101730347, + "learning_rate": 1.967214584622077e-05, + "loss": 0.194, + "step": 8678 + }, + { + "epoch": 0.1736, + "grad_norm": 1.7338989973068237, + "learning_rate": 1.967179115615633e-05, + "loss": 0.2322, + "step": 8680 + }, + { + "epoch": 0.17364, + "grad_norm": 2.1747944355010986, + "learning_rate": 1.9671436277535344e-05, + "loss": 0.3702, + "step": 8682 + }, + { + "epoch": 0.17368, + "grad_norm": 0.6645921468734741, + "learning_rate": 1.9671081210364732e-05, + "loss": 0.1724, + "step": 8684 + }, + { + "epoch": 0.17372, + "grad_norm": 1.4469144344329834, + "learning_rate": 1.9670725954651407e-05, + "loss": 0.1598, + "step": 8686 + }, + { + "epoch": 0.17376, + "grad_norm": 0.7222878932952881, + "learning_rate": 1.9670370510402306e-05, + "loss": 0.1467, + "step": 8688 + }, + { + "epoch": 0.1738, + "grad_norm": 2.76771879196167, + "learning_rate": 1.9670014877624353e-05, + "loss": 0.2866, + "step": 8690 + }, + { + "epoch": 0.17384, + "grad_norm": 0.4114691913127899, + "learning_rate": 1.966965905632448e-05, + "loss": 0.107, + "step": 8692 + }, + { + "epoch": 0.17388, + "grad_norm": 1.9957255125045776, + "learning_rate": 1.9669303046509623e-05, + "loss": 0.159, + "step": 8694 + }, + { + "epoch": 0.17392, + "grad_norm": 3.8648953437805176, + "learning_rate": 1.9668946848186728e-05, + "loss": 0.594, + "step": 8696 + }, + { + "epoch": 0.17396, + "grad_norm": 0.8125227093696594, + "learning_rate": 1.9668590461362735e-05, + "loss": 0.1587, + "step": 8698 + }, + { + "epoch": 0.174, + "grad_norm": 2.6033177375793457, + "learning_rate": 1.9668233886044597e-05, + "loss": 0.2815, + "step": 8700 + }, + { + "epoch": 0.17404, + "grad_norm": 1.5621024370193481, + "learning_rate": 1.9667877122239257e-05, + "loss": 0.2668, + "step": 8702 + }, + { + "epoch": 0.17408, + "grad_norm": 1.7875522375106812, + "learning_rate": 1.9667520169953677e-05, + "loss": 0.2884, + "step": 8704 + }, + { + "epoch": 0.17412, + "grad_norm": 2.229759454727173, + "learning_rate": 1.9667163029194816e-05, + "loss": 0.2824, + "step": 8706 + }, + { + "epoch": 0.17416, + "grad_norm": 0.94219571352005, + "learning_rate": 1.966680569996963e-05, + "loss": 0.1309, + "step": 8708 + }, + { + "epoch": 0.1742, + "grad_norm": 1.590328574180603, + "learning_rate": 1.9666448182285095e-05, + "loss": 0.3035, + "step": 8710 + }, + { + "epoch": 0.17424, + "grad_norm": 1.5800178050994873, + "learning_rate": 1.966609047614817e-05, + "loss": 0.2446, + "step": 8712 + }, + { + "epoch": 0.17428, + "grad_norm": 1.0563009977340698, + "learning_rate": 1.9665732581565843e-05, + "loss": 0.3069, + "step": 8714 + }, + { + "epoch": 0.17432, + "grad_norm": 1.4448928833007812, + "learning_rate": 1.966537449854508e-05, + "loss": 0.3104, + "step": 8716 + }, + { + "epoch": 0.17436, + "grad_norm": 1.333547830581665, + "learning_rate": 1.966501622709286e-05, + "loss": 0.2473, + "step": 8718 + }, + { + "epoch": 0.1744, + "grad_norm": 0.9435408711433411, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.1449, + "step": 8720 + }, + { + "epoch": 0.17444, + "grad_norm": 1.1018329858779907, + "learning_rate": 1.9664299118922018e-05, + "loss": 0.1516, + "step": 8722 + }, + { + "epoch": 0.17448, + "grad_norm": 1.7102227210998535, + "learning_rate": 1.9663940282217367e-05, + "loss": 0.1991, + "step": 8724 + }, + { + "epoch": 0.17452, + "grad_norm": 1.5132559537887573, + "learning_rate": 1.9663581257109225e-05, + "loss": 0.3407, + "step": 8726 + }, + { + "epoch": 0.17456, + "grad_norm": 1.9421603679656982, + "learning_rate": 1.9663222043604594e-05, + "loss": 0.2432, + "step": 8728 + }, + { + "epoch": 0.1746, + "grad_norm": 1.6384587287902832, + "learning_rate": 1.966286264171047e-05, + "loss": 0.2925, + "step": 8730 + }, + { + "epoch": 0.17464, + "grad_norm": 2.022123336791992, + "learning_rate": 1.966250305143387e-05, + "loss": 0.3283, + "step": 8732 + }, + { + "epoch": 0.17468, + "grad_norm": 1.0188496112823486, + "learning_rate": 1.9662143272781797e-05, + "loss": 0.1554, + "step": 8734 + }, + { + "epoch": 0.17472, + "grad_norm": 1.5770045518875122, + "learning_rate": 1.9661783305761264e-05, + "loss": 0.1847, + "step": 8736 + }, + { + "epoch": 0.17476, + "grad_norm": 2.237529754638672, + "learning_rate": 1.9661423150379293e-05, + "loss": 0.343, + "step": 8738 + }, + { + "epoch": 0.1748, + "grad_norm": 0.9856255650520325, + "learning_rate": 1.9661062806642903e-05, + "loss": 0.1856, + "step": 8740 + }, + { + "epoch": 0.17484, + "grad_norm": 1.5020397901535034, + "learning_rate": 1.9660702274559118e-05, + "loss": 0.2027, + "step": 8742 + }, + { + "epoch": 0.17488, + "grad_norm": 1.9481611251831055, + "learning_rate": 1.9660341554134972e-05, + "loss": 0.29, + "step": 8744 + }, + { + "epoch": 0.17492, + "grad_norm": 2.2416043281555176, + "learning_rate": 1.9659980645377493e-05, + "loss": 0.2597, + "step": 8746 + }, + { + "epoch": 0.17496, + "grad_norm": 1.1062939167022705, + "learning_rate": 1.965961954829372e-05, + "loss": 0.1128, + "step": 8748 + }, + { + "epoch": 0.175, + "grad_norm": 1.2339228391647339, + "learning_rate": 1.9659258262890683e-05, + "loss": 0.2428, + "step": 8750 + }, + { + "epoch": 0.17504, + "grad_norm": 1.3451272249221802, + "learning_rate": 1.965889678917544e-05, + "loss": 0.2263, + "step": 8752 + }, + { + "epoch": 0.17508, + "grad_norm": 1.1581673622131348, + "learning_rate": 1.9658535127155028e-05, + "loss": 0.1921, + "step": 8754 + }, + { + "epoch": 0.17512, + "grad_norm": 1.7011741399765015, + "learning_rate": 1.9658173276836504e-05, + "loss": 0.1844, + "step": 8756 + }, + { + "epoch": 0.17516, + "grad_norm": 1.6055344343185425, + "learning_rate": 1.965781123822692e-05, + "loss": 0.1709, + "step": 8758 + }, + { + "epoch": 0.1752, + "grad_norm": 1.3709849119186401, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.1517, + "step": 8760 + }, + { + "epoch": 0.17524, + "grad_norm": 1.90458083152771, + "learning_rate": 1.9657086596162802e-05, + "loss": 0.1855, + "step": 8762 + }, + { + "epoch": 0.17528, + "grad_norm": 1.067246437072754, + "learning_rate": 1.9656723992722398e-05, + "loss": 0.1917, + "step": 8764 + }, + { + "epoch": 0.17532, + "grad_norm": 2.0642666816711426, + "learning_rate": 1.965636120101919e-05, + "loss": 0.2884, + "step": 8766 + }, + { + "epoch": 0.17536, + "grad_norm": 1.8564703464508057, + "learning_rate": 1.965599822106025e-05, + "loss": 0.2221, + "step": 8768 + }, + { + "epoch": 0.1754, + "grad_norm": 1.2655736207962036, + "learning_rate": 1.9655635052852648e-05, + "loss": 0.1597, + "step": 8770 + }, + { + "epoch": 0.17544, + "grad_norm": 0.8299749493598938, + "learning_rate": 1.9655271696403474e-05, + "loss": 0.0945, + "step": 8772 + }, + { + "epoch": 0.17548, + "grad_norm": 0.5440967082977295, + "learning_rate": 1.965490815171981e-05, + "loss": 0.0795, + "step": 8774 + }, + { + "epoch": 0.17552, + "grad_norm": 0.5011920928955078, + "learning_rate": 1.9654544418808732e-05, + "loss": 0.3605, + "step": 8776 + }, + { + "epoch": 0.17556, + "grad_norm": 3.243201732635498, + "learning_rate": 1.9654180497677347e-05, + "loss": 0.3489, + "step": 8778 + }, + { + "epoch": 0.1756, + "grad_norm": 0.8073199987411499, + "learning_rate": 1.965381638833274e-05, + "loss": 0.085, + "step": 8780 + }, + { + "epoch": 0.17564, + "grad_norm": 0.3601382076740265, + "learning_rate": 1.9653452090782013e-05, + "loss": 0.4278, + "step": 8782 + }, + { + "epoch": 0.17568, + "grad_norm": 1.7133442163467407, + "learning_rate": 1.965308760503227e-05, + "loss": 0.1462, + "step": 8784 + }, + { + "epoch": 0.17572, + "grad_norm": 1.6264848709106445, + "learning_rate": 1.9652722931090612e-05, + "loss": 0.2347, + "step": 8786 + }, + { + "epoch": 0.17576, + "grad_norm": 1.6932138204574585, + "learning_rate": 1.965235806896415e-05, + "loss": 0.2346, + "step": 8788 + }, + { + "epoch": 0.1758, + "grad_norm": 1.640656590461731, + "learning_rate": 1.9651993018660002e-05, + "loss": 0.2039, + "step": 8790 + }, + { + "epoch": 0.17584, + "grad_norm": 1.673527717590332, + "learning_rate": 1.9651627780185277e-05, + "loss": 0.2563, + "step": 8792 + }, + { + "epoch": 0.17588, + "grad_norm": 0.4780973792076111, + "learning_rate": 1.96512623535471e-05, + "loss": 0.1516, + "step": 8794 + }, + { + "epoch": 0.17592, + "grad_norm": 1.6287037134170532, + "learning_rate": 1.9650896738752596e-05, + "loss": 0.1155, + "step": 8796 + }, + { + "epoch": 0.17596, + "grad_norm": 1.020290732383728, + "learning_rate": 1.965053093580889e-05, + "loss": 0.2699, + "step": 8798 + }, + { + "epoch": 0.176, + "grad_norm": 1.3936837911605835, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.1516, + "step": 8800 + }, + { + "epoch": 0.17604, + "grad_norm": 2.405975580215454, + "learning_rate": 1.9649798765502408e-05, + "loss": 0.2767, + "step": 8802 + }, + { + "epoch": 0.17608, + "grad_norm": 1.2277474403381348, + "learning_rate": 1.9649432398153904e-05, + "loss": 0.1102, + "step": 8804 + }, + { + "epoch": 0.17612, + "grad_norm": 2.1564226150512695, + "learning_rate": 1.964906584268475e-05, + "loss": 0.2426, + "step": 8806 + }, + { + "epoch": 0.17616, + "grad_norm": 2.0243465900421143, + "learning_rate": 1.964869909910209e-05, + "loss": 0.2566, + "step": 8808 + }, + { + "epoch": 0.1762, + "grad_norm": 0.7544088959693909, + "learning_rate": 1.9648332167413067e-05, + "loss": 0.198, + "step": 8810 + }, + { + "epoch": 0.17624, + "grad_norm": 0.7562856078147888, + "learning_rate": 1.9647965047624847e-05, + "loss": 0.0514, + "step": 8812 + }, + { + "epoch": 0.17628, + "grad_norm": 0.5163347721099854, + "learning_rate": 1.9647597739744583e-05, + "loss": 0.0487, + "step": 8814 + }, + { + "epoch": 0.17632, + "grad_norm": 1.4958558082580566, + "learning_rate": 1.9647230243779432e-05, + "loss": 0.3001, + "step": 8816 + }, + { + "epoch": 0.17636, + "grad_norm": 0.613174319267273, + "learning_rate": 1.9646862559736555e-05, + "loss": 0.03, + "step": 8818 + }, + { + "epoch": 0.1764, + "grad_norm": 0.6638712882995605, + "learning_rate": 1.9646494687623135e-05, + "loss": 0.2814, + "step": 8820 + }, + { + "epoch": 0.17644, + "grad_norm": 3.2346270084381104, + "learning_rate": 1.964612662744633e-05, + "loss": 0.4721, + "step": 8822 + }, + { + "epoch": 0.17648, + "grad_norm": 1.9410663843154907, + "learning_rate": 1.964575837921332e-05, + "loss": 0.1748, + "step": 8824 + }, + { + "epoch": 0.17652, + "grad_norm": 1.5438395738601685, + "learning_rate": 1.9645389942931287e-05, + "loss": 0.159, + "step": 8826 + }, + { + "epoch": 0.17656, + "grad_norm": 1.6414538621902466, + "learning_rate": 1.9645021318607408e-05, + "loss": 0.099, + "step": 8828 + }, + { + "epoch": 0.1766, + "grad_norm": 0.6264420747756958, + "learning_rate": 1.9644652506248872e-05, + "loss": 0.2316, + "step": 8830 + }, + { + "epoch": 0.17664, + "grad_norm": 2.6801578998565674, + "learning_rate": 1.9644283505862877e-05, + "loss": 0.1354, + "step": 8832 + }, + { + "epoch": 0.17668, + "grad_norm": 0.8390849828720093, + "learning_rate": 1.9643914317456604e-05, + "loss": 0.076, + "step": 8834 + }, + { + "epoch": 0.17672, + "grad_norm": 0.8637872934341431, + "learning_rate": 1.964354494103726e-05, + "loss": 0.0596, + "step": 8836 + }, + { + "epoch": 0.17676, + "grad_norm": 0.3186597228050232, + "learning_rate": 1.9643175376612035e-05, + "loss": 0.0207, + "step": 8838 + }, + { + "epoch": 0.1768, + "grad_norm": 3.6277999877929688, + "learning_rate": 1.964280562418815e-05, + "loss": 0.7002, + "step": 8840 + }, + { + "epoch": 0.17684, + "grad_norm": 3.5375447273254395, + "learning_rate": 1.9642435683772797e-05, + "loss": 0.5235, + "step": 8842 + }, + { + "epoch": 0.17688, + "grad_norm": 2.680086612701416, + "learning_rate": 1.9642065555373202e-05, + "loss": 0.4544, + "step": 8844 + }, + { + "epoch": 0.17692, + "grad_norm": 1.8499177694320679, + "learning_rate": 1.9641695238996575e-05, + "loss": 0.233, + "step": 8846 + }, + { + "epoch": 0.17696, + "grad_norm": 1.875337839126587, + "learning_rate": 1.9641324734650134e-05, + "loss": 0.2792, + "step": 8848 + }, + { + "epoch": 0.177, + "grad_norm": 1.5987508296966553, + "learning_rate": 1.96409540423411e-05, + "loss": 0.3515, + "step": 8850 + }, + { + "epoch": 0.17704, + "grad_norm": 1.444116473197937, + "learning_rate": 1.964058316207671e-05, + "loss": 0.1446, + "step": 8852 + }, + { + "epoch": 0.17708, + "grad_norm": 3.8926045894622803, + "learning_rate": 1.9640212093864185e-05, + "loss": 0.453, + "step": 8854 + }, + { + "epoch": 0.17712, + "grad_norm": 2.917280912399292, + "learning_rate": 1.963984083771076e-05, + "loss": 0.2939, + "step": 8856 + }, + { + "epoch": 0.17716, + "grad_norm": 1.2947561740875244, + "learning_rate": 1.9639469393623677e-05, + "loss": 0.1149, + "step": 8858 + }, + { + "epoch": 0.1772, + "grad_norm": 0.6562832593917847, + "learning_rate": 1.9639097761610174e-05, + "loss": 0.0527, + "step": 8860 + }, + { + "epoch": 0.17724, + "grad_norm": 1.5973105430603027, + "learning_rate": 1.9638725941677502e-05, + "loss": 0.2238, + "step": 8862 + }, + { + "epoch": 0.17728, + "grad_norm": 1.8794505596160889, + "learning_rate": 1.9638353933832902e-05, + "loss": 0.2042, + "step": 8864 + }, + { + "epoch": 0.17732, + "grad_norm": 2.8195316791534424, + "learning_rate": 1.963798173808363e-05, + "loss": 0.2994, + "step": 8866 + }, + { + "epoch": 0.17736, + "grad_norm": 2.2235639095306396, + "learning_rate": 1.9637609354436943e-05, + "loss": 0.3855, + "step": 8868 + }, + { + "epoch": 0.1774, + "grad_norm": 1.1062287092208862, + "learning_rate": 1.96372367829001e-05, + "loss": 0.1175, + "step": 8870 + }, + { + "epoch": 0.17744, + "grad_norm": 1.659016489982605, + "learning_rate": 1.9636864023480363e-05, + "loss": 0.264, + "step": 8872 + }, + { + "epoch": 0.17748, + "grad_norm": 2.684100389480591, + "learning_rate": 1.9636491076185e-05, + "loss": 0.3258, + "step": 8874 + }, + { + "epoch": 0.17752, + "grad_norm": 1.3112763166427612, + "learning_rate": 1.9636117941021285e-05, + "loss": 0.1239, + "step": 8876 + }, + { + "epoch": 0.17756, + "grad_norm": 0.6926677823066711, + "learning_rate": 1.9635744617996485e-05, + "loss": 0.0931, + "step": 8878 + }, + { + "epoch": 0.1776, + "grad_norm": 0.8799062967300415, + "learning_rate": 1.963537110711789e-05, + "loss": 0.1062, + "step": 8880 + }, + { + "epoch": 0.17764, + "grad_norm": 2.7979342937469482, + "learning_rate": 1.9634997408392767e-05, + "loss": 0.2525, + "step": 8882 + }, + { + "epoch": 0.17768, + "grad_norm": 0.7965164184570312, + "learning_rate": 1.9634623521828413e-05, + "loss": 0.0854, + "step": 8884 + }, + { + "epoch": 0.17772, + "grad_norm": 0.8540827631950378, + "learning_rate": 1.9634249447432113e-05, + "loss": 0.07, + "step": 8886 + }, + { + "epoch": 0.17776, + "grad_norm": 2.6009562015533447, + "learning_rate": 1.963387518521116e-05, + "loss": 0.217, + "step": 8888 + }, + { + "epoch": 0.1778, + "grad_norm": 2.8344314098358154, + "learning_rate": 1.963350073517285e-05, + "loss": 0.2474, + "step": 8890 + }, + { + "epoch": 0.17784, + "grad_norm": 1.8479732275009155, + "learning_rate": 1.9633126097324483e-05, + "loss": 0.378, + "step": 8892 + }, + { + "epoch": 0.17788, + "grad_norm": 2.6484904289245605, + "learning_rate": 1.9632751271673365e-05, + "loss": 0.6249, + "step": 8894 + }, + { + "epoch": 0.17792, + "grad_norm": 1.2880252599716187, + "learning_rate": 1.96323762582268e-05, + "loss": 0.1603, + "step": 8896 + }, + { + "epoch": 0.17796, + "grad_norm": 1.6063363552093506, + "learning_rate": 1.96320010569921e-05, + "loss": 0.2543, + "step": 8898 + }, + { + "epoch": 0.178, + "grad_norm": 2.9524765014648438, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.4252, + "step": 8900 + }, + { + "epoch": 0.17804, + "grad_norm": 0.699276864528656, + "learning_rate": 1.9631250091187565e-05, + "loss": 0.1066, + "step": 8902 + }, + { + "epoch": 0.17808, + "grad_norm": 0.76955646276474, + "learning_rate": 1.9630874326632365e-05, + "loss": 0.1065, + "step": 8904 + }, + { + "epoch": 0.17812, + "grad_norm": 1.1183042526245117, + "learning_rate": 1.9630498374318316e-05, + "loss": 0.1787, + "step": 8906 + }, + { + "epoch": 0.17816, + "grad_norm": 1.3780865669250488, + "learning_rate": 1.963012223425274e-05, + "loss": 0.2782, + "step": 8908 + }, + { + "epoch": 0.1782, + "grad_norm": 0.5536690354347229, + "learning_rate": 1.9629745906442973e-05, + "loss": 0.1437, + "step": 8910 + }, + { + "epoch": 0.17824, + "grad_norm": 0.38480624556541443, + "learning_rate": 1.962936939089635e-05, + "loss": 0.0596, + "step": 8912 + }, + { + "epoch": 0.17828, + "grad_norm": 3.8603932857513428, + "learning_rate": 1.9628992687620218e-05, + "loss": 0.3614, + "step": 8914 + }, + { + "epoch": 0.17832, + "grad_norm": 3.3747715950012207, + "learning_rate": 1.9628615796621915e-05, + "loss": 0.4859, + "step": 8916 + }, + { + "epoch": 0.17836, + "grad_norm": 3.9111361503601074, + "learning_rate": 1.962823871790879e-05, + "loss": 0.4114, + "step": 8918 + }, + { + "epoch": 0.1784, + "grad_norm": 1.4190951585769653, + "learning_rate": 1.962786145148819e-05, + "loss": 0.2231, + "step": 8920 + }, + { + "epoch": 0.17844, + "grad_norm": 1.9796652793884277, + "learning_rate": 1.9627483997367477e-05, + "loss": 0.2896, + "step": 8922 + }, + { + "epoch": 0.17848, + "grad_norm": 1.695219874382019, + "learning_rate": 1.962710635555401e-05, + "loss": 0.1717, + "step": 8924 + }, + { + "epoch": 0.17852, + "grad_norm": 1.2089800834655762, + "learning_rate": 1.9626728526055144e-05, + "loss": 0.1523, + "step": 8926 + }, + { + "epoch": 0.17856, + "grad_norm": 1.2214856147766113, + "learning_rate": 1.962635050887825e-05, + "loss": 0.1762, + "step": 8928 + }, + { + "epoch": 0.1786, + "grad_norm": 1.1999576091766357, + "learning_rate": 1.9625972304030697e-05, + "loss": 0.2432, + "step": 8930 + }, + { + "epoch": 0.17864, + "grad_norm": 2.1384780406951904, + "learning_rate": 1.9625593911519857e-05, + "loss": 0.3398, + "step": 8932 + }, + { + "epoch": 0.17868, + "grad_norm": 1.6086945533752441, + "learning_rate": 1.962521533135311e-05, + "loss": 0.2975, + "step": 8934 + }, + { + "epoch": 0.17872, + "grad_norm": 1.9378855228424072, + "learning_rate": 1.9624836563537837e-05, + "loss": 0.2043, + "step": 8936 + }, + { + "epoch": 0.17876, + "grad_norm": 1.2931777238845825, + "learning_rate": 1.9624457608081416e-05, + "loss": 0.1525, + "step": 8938 + }, + { + "epoch": 0.1788, + "grad_norm": 0.9185282588005066, + "learning_rate": 1.962407846499124e-05, + "loss": 0.2035, + "step": 8940 + }, + { + "epoch": 0.17884, + "grad_norm": 1.8409512042999268, + "learning_rate": 1.9623699134274702e-05, + "loss": 0.203, + "step": 8942 + }, + { + "epoch": 0.17888, + "grad_norm": 1.1888513565063477, + "learning_rate": 1.9623319615939192e-05, + "loss": 0.1885, + "step": 8944 + }, + { + "epoch": 0.17892, + "grad_norm": 1.661687970161438, + "learning_rate": 1.962293990999211e-05, + "loss": 0.2266, + "step": 8946 + }, + { + "epoch": 0.17896, + "grad_norm": 1.6771143674850464, + "learning_rate": 1.9622560016440863e-05, + "loss": 0.1783, + "step": 8948 + }, + { + "epoch": 0.179, + "grad_norm": 1.6031243801116943, + "learning_rate": 1.9622179935292855e-05, + "loss": 0.194, + "step": 8950 + }, + { + "epoch": 0.17904, + "grad_norm": 1.1193962097167969, + "learning_rate": 1.9621799666555495e-05, + "loss": 0.2224, + "step": 8952 + }, + { + "epoch": 0.17908, + "grad_norm": 0.5206547975540161, + "learning_rate": 1.9621419210236197e-05, + "loss": 0.2349, + "step": 8954 + }, + { + "epoch": 0.17912, + "grad_norm": 1.07195246219635, + "learning_rate": 1.9621038566342378e-05, + "loss": 0.2697, + "step": 8956 + }, + { + "epoch": 0.17916, + "grad_norm": 0.27492058277130127, + "learning_rate": 1.9620657734881457e-05, + "loss": 0.0393, + "step": 8958 + }, + { + "epoch": 0.1792, + "grad_norm": 1.7126258611679077, + "learning_rate": 1.962027671586086e-05, + "loss": 0.1794, + "step": 8960 + }, + { + "epoch": 0.17924, + "grad_norm": 2.9416568279266357, + "learning_rate": 1.9619895509288017e-05, + "loss": 0.2765, + "step": 8962 + }, + { + "epoch": 0.17928, + "grad_norm": 1.284379243850708, + "learning_rate": 1.961951411517036e-05, + "loss": 0.1957, + "step": 8964 + }, + { + "epoch": 0.17932, + "grad_norm": 3.0224509239196777, + "learning_rate": 1.961913253351532e-05, + "loss": 0.2422, + "step": 8966 + }, + { + "epoch": 0.17936, + "grad_norm": 1.1642204523086548, + "learning_rate": 1.961875076433034e-05, + "loss": 0.1147, + "step": 8968 + }, + { + "epoch": 0.1794, + "grad_norm": 2.3015997409820557, + "learning_rate": 1.9618368807622863e-05, + "loss": 0.3012, + "step": 8970 + }, + { + "epoch": 0.17944, + "grad_norm": 2.6812939643859863, + "learning_rate": 1.9617986663400334e-05, + "loss": 0.2877, + "step": 8972 + }, + { + "epoch": 0.17948, + "grad_norm": 0.840154767036438, + "learning_rate": 1.9617604331670202e-05, + "loss": 0.2774, + "step": 8974 + }, + { + "epoch": 0.17952, + "grad_norm": 1.4110496044158936, + "learning_rate": 1.9617221812439925e-05, + "loss": 0.2131, + "step": 8976 + }, + { + "epoch": 0.17956, + "grad_norm": 1.9701719284057617, + "learning_rate": 1.9616839105716954e-05, + "loss": 0.2447, + "step": 8978 + }, + { + "epoch": 0.1796, + "grad_norm": 1.9659110307693481, + "learning_rate": 1.9616456211508756e-05, + "loss": 0.2034, + "step": 8980 + }, + { + "epoch": 0.17964, + "grad_norm": 2.4763288497924805, + "learning_rate": 1.961607312982279e-05, + "loss": 0.341, + "step": 8982 + }, + { + "epoch": 0.17968, + "grad_norm": 0.8349334001541138, + "learning_rate": 1.961568986066653e-05, + "loss": 0.1061, + "step": 8984 + }, + { + "epoch": 0.17972, + "grad_norm": 0.8664035797119141, + "learning_rate": 1.9615306404047447e-05, + "loss": 0.1593, + "step": 8986 + }, + { + "epoch": 0.17976, + "grad_norm": 1.3947919607162476, + "learning_rate": 1.961492275997301e-05, + "loss": 0.2227, + "step": 8988 + }, + { + "epoch": 0.1798, + "grad_norm": 1.1912981271743774, + "learning_rate": 1.961453892845071e-05, + "loss": 0.1682, + "step": 8990 + }, + { + "epoch": 0.17984, + "grad_norm": 1.1079837083816528, + "learning_rate": 1.961415490948802e-05, + "loss": 0.1061, + "step": 8992 + }, + { + "epoch": 0.17988, + "grad_norm": 0.7612549662590027, + "learning_rate": 1.961377070309243e-05, + "loss": 0.1271, + "step": 8994 + }, + { + "epoch": 0.17992, + "grad_norm": 1.4810270071029663, + "learning_rate": 1.9613386309271437e-05, + "loss": 0.1622, + "step": 8996 + }, + { + "epoch": 0.17996, + "grad_norm": 0.6007975935935974, + "learning_rate": 1.9613001728032522e-05, + "loss": 0.0855, + "step": 8998 + }, + { + "epoch": 0.18, + "grad_norm": 1.0282642841339111, + "learning_rate": 1.961261695938319e-05, + "loss": 0.1602, + "step": 9000 + }, + { + "epoch": 0.18004, + "grad_norm": 1.5766403675079346, + "learning_rate": 1.9612232003330943e-05, + "loss": 0.1766, + "step": 9002 + }, + { + "epoch": 0.18008, + "grad_norm": 2.4125969409942627, + "learning_rate": 1.9611846859883284e-05, + "loss": 0.2429, + "step": 9004 + }, + { + "epoch": 0.18012, + "grad_norm": 1.7260750532150269, + "learning_rate": 1.9611461529047723e-05, + "loss": 0.2666, + "step": 9006 + }, + { + "epoch": 0.18016, + "grad_norm": 1.4805257320404053, + "learning_rate": 1.961107601083177e-05, + "loss": 0.1755, + "step": 9008 + }, + { + "epoch": 0.1802, + "grad_norm": 0.8025038838386536, + "learning_rate": 1.961069030524294e-05, + "loss": 0.1639, + "step": 9010 + }, + { + "epoch": 0.18024, + "grad_norm": 1.4590915441513062, + "learning_rate": 1.9610304412288756e-05, + "loss": 0.1779, + "step": 9012 + }, + { + "epoch": 0.18028, + "grad_norm": 0.6634891033172607, + "learning_rate": 1.960991833197674e-05, + "loss": 0.0644, + "step": 9014 + }, + { + "epoch": 0.18032, + "grad_norm": 2.9191157817840576, + "learning_rate": 1.960953206431442e-05, + "loss": 0.1784, + "step": 9016 + }, + { + "epoch": 0.18036, + "grad_norm": 3.4826467037200928, + "learning_rate": 1.9609145609309323e-05, + "loss": 0.3777, + "step": 9018 + }, + { + "epoch": 0.1804, + "grad_norm": 2.4772536754608154, + "learning_rate": 1.9608758966968987e-05, + "loss": 0.2022, + "step": 9020 + }, + { + "epoch": 0.18044, + "grad_norm": 0.6052477359771729, + "learning_rate": 1.9608372137300948e-05, + "loss": 0.0562, + "step": 9022 + }, + { + "epoch": 0.18048, + "grad_norm": 0.7833767533302307, + "learning_rate": 1.9607985120312744e-05, + "loss": 0.1976, + "step": 9024 + }, + { + "epoch": 0.18052, + "grad_norm": 1.4281420707702637, + "learning_rate": 1.960759791601193e-05, + "loss": 0.1216, + "step": 9026 + }, + { + "epoch": 0.18056, + "grad_norm": 1.9566974639892578, + "learning_rate": 1.9607210524406044e-05, + "loss": 0.1571, + "step": 9028 + }, + { + "epoch": 0.1806, + "grad_norm": 1.0190540552139282, + "learning_rate": 1.9606822945502642e-05, + "loss": 0.0865, + "step": 9030 + }, + { + "epoch": 0.18064, + "grad_norm": 1.9333144426345825, + "learning_rate": 1.9606435179309284e-05, + "loss": 0.2765, + "step": 9032 + }, + { + "epoch": 0.18068, + "grad_norm": 1.806312084197998, + "learning_rate": 1.9606047225833526e-05, + "loss": 0.1304, + "step": 9034 + }, + { + "epoch": 0.18072, + "grad_norm": 1.7887287139892578, + "learning_rate": 1.9605659085082927e-05, + "loss": 0.2329, + "step": 9036 + }, + { + "epoch": 0.18076, + "grad_norm": 0.8463576436042786, + "learning_rate": 1.9605270757065063e-05, + "loss": 0.0906, + "step": 9038 + }, + { + "epoch": 0.1808, + "grad_norm": 0.4555432200431824, + "learning_rate": 1.96048822417875e-05, + "loss": 0.055, + "step": 9040 + }, + { + "epoch": 0.18084, + "grad_norm": 0.18251435458660126, + "learning_rate": 1.9604493539257813e-05, + "loss": 0.0941, + "step": 9042 + }, + { + "epoch": 0.18088, + "grad_norm": 4.54519510269165, + "learning_rate": 1.9604104649483578e-05, + "loss": 0.3789, + "step": 9044 + }, + { + "epoch": 0.18092, + "grad_norm": 0.38616782426834106, + "learning_rate": 1.960371557247238e-05, + "loss": 0.2051, + "step": 9046 + }, + { + "epoch": 0.18096, + "grad_norm": 1.6684461832046509, + "learning_rate": 1.96033263082318e-05, + "loss": 0.0781, + "step": 9048 + }, + { + "epoch": 0.181, + "grad_norm": 0.317305326461792, + "learning_rate": 1.9602936856769432e-05, + "loss": 0.2479, + "step": 9050 + }, + { + "epoch": 0.18104, + "grad_norm": 2.3094637393951416, + "learning_rate": 1.9602547218092867e-05, + "loss": 0.1766, + "step": 9052 + }, + { + "epoch": 0.18108, + "grad_norm": 1.933209776878357, + "learning_rate": 1.9602157392209698e-05, + "loss": 0.1383, + "step": 9054 + }, + { + "epoch": 0.18112, + "grad_norm": 0.8823319673538208, + "learning_rate": 1.9601767379127528e-05, + "loss": 0.043, + "step": 9056 + }, + { + "epoch": 0.18116, + "grad_norm": 1.7323801517486572, + "learning_rate": 1.9601377178853957e-05, + "loss": 0.1268, + "step": 9058 + }, + { + "epoch": 0.1812, + "grad_norm": 3.001049757003784, + "learning_rate": 1.96009867913966e-05, + "loss": 0.1551, + "step": 9060 + }, + { + "epoch": 0.18124, + "grad_norm": 4.342739105224609, + "learning_rate": 1.960059621676306e-05, + "loss": 0.4175, + "step": 9062 + }, + { + "epoch": 0.18128, + "grad_norm": 4.093118190765381, + "learning_rate": 1.9600205454960952e-05, + "loss": 0.2316, + "step": 9064 + }, + { + "epoch": 0.18132, + "grad_norm": 0.7713987827301025, + "learning_rate": 1.95998145059979e-05, + "loss": 0.127, + "step": 9066 + }, + { + "epoch": 0.18136, + "grad_norm": 0.7545239925384521, + "learning_rate": 1.959942336988152e-05, + "loss": 0.1808, + "step": 9068 + }, + { + "epoch": 0.1814, + "grad_norm": 0.26918432116508484, + "learning_rate": 1.9599032046619437e-05, + "loss": 0.0263, + "step": 9070 + }, + { + "epoch": 0.18144, + "grad_norm": 2.299694061279297, + "learning_rate": 1.9598640536219288e-05, + "loss": 0.1017, + "step": 9072 + }, + { + "epoch": 0.18148, + "grad_norm": 0.9802165627479553, + "learning_rate": 1.9598248838688696e-05, + "loss": 0.1222, + "step": 9074 + }, + { + "epoch": 0.18152, + "grad_norm": 0.14226575195789337, + "learning_rate": 1.9597856954035303e-05, + "loss": 0.1957, + "step": 9076 + }, + { + "epoch": 0.18156, + "grad_norm": 1.5071886777877808, + "learning_rate": 1.9597464882266745e-05, + "loss": 0.0691, + "step": 9078 + }, + { + "epoch": 0.1816, + "grad_norm": 3.028224229812622, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.1231, + "step": 9080 + }, + { + "epoch": 0.18164, + "grad_norm": 4.267984390258789, + "learning_rate": 1.959668017741472e-05, + "loss": 0.5565, + "step": 9082 + }, + { + "epoch": 0.18168, + "grad_norm": 2.8233914375305176, + "learning_rate": 1.9596287544346552e-05, + "loss": 0.2566, + "step": 9084 + }, + { + "epoch": 0.18172, + "grad_norm": 0.3133488595485687, + "learning_rate": 1.9595894724193817e-05, + "loss": 0.2023, + "step": 9086 + }, + { + "epoch": 0.18176, + "grad_norm": 1.5493234395980835, + "learning_rate": 1.9595501716964176e-05, + "loss": 0.2623, + "step": 9088 + }, + { + "epoch": 0.1818, + "grad_norm": 0.5098121762275696, + "learning_rate": 1.959510852266529e-05, + "loss": 0.104, + "step": 9090 + }, + { + "epoch": 0.18184, + "grad_norm": 1.5729719400405884, + "learning_rate": 1.959471514130482e-05, + "loss": 0.1598, + "step": 9092 + }, + { + "epoch": 0.18188, + "grad_norm": 0.48261207342147827, + "learning_rate": 1.9594321572890436e-05, + "loss": 0.3432, + "step": 9094 + }, + { + "epoch": 0.18192, + "grad_norm": 2.817244529724121, + "learning_rate": 1.959392781742982e-05, + "loss": 0.2688, + "step": 9096 + }, + { + "epoch": 0.18196, + "grad_norm": 1.3811746835708618, + "learning_rate": 1.959353387493064e-05, + "loss": 0.1516, + "step": 9098 + }, + { + "epoch": 0.182, + "grad_norm": 0.7233216166496277, + "learning_rate": 1.9593139745400575e-05, + "loss": 0.0586, + "step": 9100 + }, + { + "epoch": 0.18204, + "grad_norm": 2.0683133602142334, + "learning_rate": 1.959274542884732e-05, + "loss": 0.2233, + "step": 9102 + }, + { + "epoch": 0.18208, + "grad_norm": 2.209627866744995, + "learning_rate": 1.9592350925278546e-05, + "loss": 0.2426, + "step": 9104 + }, + { + "epoch": 0.18212, + "grad_norm": 3.551368474960327, + "learning_rate": 1.959195623470195e-05, + "loss": 0.3484, + "step": 9106 + }, + { + "epoch": 0.18216, + "grad_norm": 2.4566214084625244, + "learning_rate": 1.9591561357125236e-05, + "loss": 0.347, + "step": 9108 + }, + { + "epoch": 0.1822, + "grad_norm": 2.51399564743042, + "learning_rate": 1.9591166292556093e-05, + "loss": 0.3389, + "step": 9110 + }, + { + "epoch": 0.18224, + "grad_norm": 0.9997839331626892, + "learning_rate": 1.9590771041002225e-05, + "loss": 0.124, + "step": 9112 + }, + { + "epoch": 0.18228, + "grad_norm": 1.497406244277954, + "learning_rate": 1.9590375602471336e-05, + "loss": 0.1372, + "step": 9114 + }, + { + "epoch": 0.18232, + "grad_norm": 1.9773472547531128, + "learning_rate": 1.9589979976971142e-05, + "loss": 0.1651, + "step": 9116 + }, + { + "epoch": 0.18236, + "grad_norm": 0.5816981792449951, + "learning_rate": 1.9589584164509346e-05, + "loss": 0.0651, + "step": 9118 + }, + { + "epoch": 0.1824, + "grad_norm": 1.6259645223617554, + "learning_rate": 1.958918816509367e-05, + "loss": 0.1541, + "step": 9120 + }, + { + "epoch": 0.18244, + "grad_norm": 0.8958774209022522, + "learning_rate": 1.958879197873184e-05, + "loss": 0.1, + "step": 9122 + }, + { + "epoch": 0.18248, + "grad_norm": 5.055643081665039, + "learning_rate": 1.9588395605431568e-05, + "loss": 0.1883, + "step": 9124 + }, + { + "epoch": 0.18252, + "grad_norm": 1.4786052703857422, + "learning_rate": 1.9587999045200585e-05, + "loss": 0.2789, + "step": 9126 + }, + { + "epoch": 0.18256, + "grad_norm": 0.38675108551979065, + "learning_rate": 1.958760229804663e-05, + "loss": 0.3033, + "step": 9128 + }, + { + "epoch": 0.1826, + "grad_norm": 0.7983148097991943, + "learning_rate": 1.9587205363977428e-05, + "loss": 0.072, + "step": 9130 + }, + { + "epoch": 0.18264, + "grad_norm": 2.657257556915283, + "learning_rate": 1.9586808243000724e-05, + "loss": 0.2515, + "step": 9132 + }, + { + "epoch": 0.18268, + "grad_norm": 2.283792734146118, + "learning_rate": 1.9586410935124262e-05, + "loss": 0.2689, + "step": 9134 + }, + { + "epoch": 0.18272, + "grad_norm": 2.257211446762085, + "learning_rate": 1.9586013440355774e-05, + "loss": 0.2102, + "step": 9136 + }, + { + "epoch": 0.18276, + "grad_norm": 2.2985823154449463, + "learning_rate": 1.9585615758703025e-05, + "loss": 0.1843, + "step": 9138 + }, + { + "epoch": 0.1828, + "grad_norm": 0.4742756485939026, + "learning_rate": 1.958521789017376e-05, + "loss": 0.3518, + "step": 9140 + }, + { + "epoch": 0.18284, + "grad_norm": 1.0485692024230957, + "learning_rate": 1.9584819834775738e-05, + "loss": 0.1515, + "step": 9142 + }, + { + "epoch": 0.18288, + "grad_norm": 1.8995232582092285, + "learning_rate": 1.9584421592516717e-05, + "loss": 0.2412, + "step": 9144 + }, + { + "epoch": 0.18292, + "grad_norm": 0.8542512655258179, + "learning_rate": 1.9584023163404467e-05, + "loss": 0.1001, + "step": 9146 + }, + { + "epoch": 0.18296, + "grad_norm": 2.495702028274536, + "learning_rate": 1.9583624547446747e-05, + "loss": 0.3131, + "step": 9148 + }, + { + "epoch": 0.183, + "grad_norm": 0.5331359505653381, + "learning_rate": 1.9583225744651334e-05, + "loss": 0.169, + "step": 9150 + }, + { + "epoch": 0.18304, + "grad_norm": 1.6807941198349, + "learning_rate": 1.9582826755026002e-05, + "loss": 0.1971, + "step": 9152 + }, + { + "epoch": 0.18308, + "grad_norm": 3.049025535583496, + "learning_rate": 1.9582427578578528e-05, + "loss": 0.5388, + "step": 9154 + }, + { + "epoch": 0.18312, + "grad_norm": 1.6766411066055298, + "learning_rate": 1.9582028215316697e-05, + "loss": 0.3163, + "step": 9156 + }, + { + "epoch": 0.18316, + "grad_norm": 1.0178755521774292, + "learning_rate": 1.9581628665248288e-05, + "loss": 0.1888, + "step": 9158 + }, + { + "epoch": 0.1832, + "grad_norm": 3.1298179626464844, + "learning_rate": 1.95812289283811e-05, + "loss": 0.4233, + "step": 9160 + }, + { + "epoch": 0.18324, + "grad_norm": 0.8366566896438599, + "learning_rate": 1.9580829004722917e-05, + "loss": 0.2103, + "step": 9162 + }, + { + "epoch": 0.18328, + "grad_norm": 1.2374846935272217, + "learning_rate": 1.9580428894281542e-05, + "loss": 0.1748, + "step": 9164 + }, + { + "epoch": 0.18332, + "grad_norm": 1.0608203411102295, + "learning_rate": 1.9580028597064772e-05, + "loss": 0.1689, + "step": 9166 + }, + { + "epoch": 0.18336, + "grad_norm": 0.8350564241409302, + "learning_rate": 1.9579628113080414e-05, + "loss": 0.0953, + "step": 9168 + }, + { + "epoch": 0.1834, + "grad_norm": 1.1910744905471802, + "learning_rate": 1.9579227442336276e-05, + "loss": 0.1306, + "step": 9170 + }, + { + "epoch": 0.18344, + "grad_norm": 1.6323221921920776, + "learning_rate": 1.9578826584840164e-05, + "loss": 0.2282, + "step": 9172 + }, + { + "epoch": 0.18348, + "grad_norm": 0.7469399571418762, + "learning_rate": 1.95784255405999e-05, + "loss": 0.2159, + "step": 9174 + }, + { + "epoch": 0.18352, + "grad_norm": 1.3920437097549438, + "learning_rate": 1.9578024309623296e-05, + "loss": 0.122, + "step": 9176 + }, + { + "epoch": 0.18356, + "grad_norm": 2.832291603088379, + "learning_rate": 1.9577622891918176e-05, + "loss": 0.3818, + "step": 9178 + }, + { + "epoch": 0.1836, + "grad_norm": 0.8743041157722473, + "learning_rate": 1.9577221287492368e-05, + "loss": 0.1651, + "step": 9180 + }, + { + "epoch": 0.18364, + "grad_norm": 0.9291332364082336, + "learning_rate": 1.95768194963537e-05, + "loss": 0.3034, + "step": 9182 + }, + { + "epoch": 0.18368, + "grad_norm": 1.2648330926895142, + "learning_rate": 1.9576417518510007e-05, + "loss": 0.1408, + "step": 9184 + }, + { + "epoch": 0.18372, + "grad_norm": 2.0076534748077393, + "learning_rate": 1.9576015353969124e-05, + "loss": 0.2185, + "step": 9186 + }, + { + "epoch": 0.18376, + "grad_norm": 1.3129172325134277, + "learning_rate": 1.9575613002738893e-05, + "loss": 0.1257, + "step": 9188 + }, + { + "epoch": 0.1838, + "grad_norm": 1.6621108055114746, + "learning_rate": 1.957521046482715e-05, + "loss": 0.199, + "step": 9190 + }, + { + "epoch": 0.18384, + "grad_norm": 1.3970240354537964, + "learning_rate": 1.9574807740241754e-05, + "loss": 0.2036, + "step": 9192 + }, + { + "epoch": 0.18388, + "grad_norm": 0.7323499917984009, + "learning_rate": 1.957440482899055e-05, + "loss": 0.144, + "step": 9194 + }, + { + "epoch": 0.18392, + "grad_norm": 2.632068634033203, + "learning_rate": 1.9574001731081396e-05, + "loss": 0.4344, + "step": 9196 + }, + { + "epoch": 0.18396, + "grad_norm": 2.2098045349121094, + "learning_rate": 1.9573598446522152e-05, + "loss": 0.3146, + "step": 9198 + }, + { + "epoch": 0.184, + "grad_norm": 1.269586205482483, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.2223, + "step": 9200 + }, + { + "epoch": 0.18404, + "grad_norm": 0.47317981719970703, + "learning_rate": 1.957279131748483e-05, + "loss": 0.1365, + "step": 9202 + }, + { + "epoch": 0.18408, + "grad_norm": 2.2860662937164307, + "learning_rate": 1.9572387473022494e-05, + "loss": 0.3164, + "step": 9204 + }, + { + "epoch": 0.18412, + "grad_norm": 0.8927571773529053, + "learning_rate": 1.957198344194154e-05, + "loss": 0.1115, + "step": 9206 + }, + { + "epoch": 0.18416, + "grad_norm": 1.6033800840377808, + "learning_rate": 1.957157922424983e-05, + "loss": 0.1882, + "step": 9208 + }, + { + "epoch": 0.1842, + "grad_norm": 1.9726983308792114, + "learning_rate": 1.9571174819955264e-05, + "loss": 0.2067, + "step": 9210 + }, + { + "epoch": 0.18424, + "grad_norm": 2.634805202484131, + "learning_rate": 1.9570770229065716e-05, + "loss": 0.3389, + "step": 9212 + }, + { + "epoch": 0.18428, + "grad_norm": 2.072596788406372, + "learning_rate": 1.957036545158907e-05, + "loss": 0.3548, + "step": 9214 + }, + { + "epoch": 0.18432, + "grad_norm": 1.8319905996322632, + "learning_rate": 1.9569960487533226e-05, + "loss": 0.243, + "step": 9216 + }, + { + "epoch": 0.18436, + "grad_norm": 1.4810067415237427, + "learning_rate": 1.9569555336906077e-05, + "loss": 0.1357, + "step": 9218 + }, + { + "epoch": 0.1844, + "grad_norm": 1.0253530740737915, + "learning_rate": 1.9569149999715514e-05, + "loss": 0.1073, + "step": 9220 + }, + { + "epoch": 0.18444, + "grad_norm": 2.1034841537475586, + "learning_rate": 1.956874447596945e-05, + "loss": 0.1124, + "step": 9222 + }, + { + "epoch": 0.18448, + "grad_norm": 1.0078718662261963, + "learning_rate": 1.9568338765675786e-05, + "loss": 0.153, + "step": 9224 + }, + { + "epoch": 0.18452, + "grad_norm": 1.1429495811462402, + "learning_rate": 1.956793286884243e-05, + "loss": 0.1514, + "step": 9226 + }, + { + "epoch": 0.18456, + "grad_norm": 1.3154383897781372, + "learning_rate": 1.95675267854773e-05, + "loss": 0.144, + "step": 9228 + }, + { + "epoch": 0.1846, + "grad_norm": 2.9754602909088135, + "learning_rate": 1.9567120515588307e-05, + "loss": 0.4011, + "step": 9230 + }, + { + "epoch": 0.18464, + "grad_norm": 1.9536561965942383, + "learning_rate": 1.9566714059183373e-05, + "loss": 0.1793, + "step": 9232 + }, + { + "epoch": 0.18468, + "grad_norm": 2.2620151042938232, + "learning_rate": 1.9566307416270427e-05, + "loss": 0.2349, + "step": 9234 + }, + { + "epoch": 0.18472, + "grad_norm": 1.0828887224197388, + "learning_rate": 1.956590058685739e-05, + "loss": 0.3199, + "step": 9236 + }, + { + "epoch": 0.18476, + "grad_norm": 0.337185263633728, + "learning_rate": 1.9565493570952196e-05, + "loss": 0.144, + "step": 9238 + }, + { + "epoch": 0.1848, + "grad_norm": 1.3862285614013672, + "learning_rate": 1.956508636856278e-05, + "loss": 0.1755, + "step": 9240 + }, + { + "epoch": 0.18484, + "grad_norm": 0.39899638295173645, + "learning_rate": 1.9564678979697083e-05, + "loss": 0.1355, + "step": 9242 + }, + { + "epoch": 0.18488, + "grad_norm": 1.7034941911697388, + "learning_rate": 1.9564271404363042e-05, + "loss": 0.2178, + "step": 9244 + }, + { + "epoch": 0.18492, + "grad_norm": 2.7205326557159424, + "learning_rate": 1.956386364256861e-05, + "loss": 0.2521, + "step": 9246 + }, + { + "epoch": 0.18496, + "grad_norm": 1.178221344947815, + "learning_rate": 1.956345569432173e-05, + "loss": 0.238, + "step": 9248 + }, + { + "epoch": 0.185, + "grad_norm": 1.331501841545105, + "learning_rate": 1.9563047559630356e-05, + "loss": 0.1181, + "step": 9250 + }, + { + "epoch": 0.18504, + "grad_norm": 1.762058973312378, + "learning_rate": 1.9562639238502446e-05, + "loss": 0.1258, + "step": 9252 + }, + { + "epoch": 0.18508, + "grad_norm": 0.4319881796836853, + "learning_rate": 1.9562230730945966e-05, + "loss": 0.1073, + "step": 9254 + }, + { + "epoch": 0.18512, + "grad_norm": 0.7827363610267639, + "learning_rate": 1.956182203696887e-05, + "loss": 0.1062, + "step": 9256 + }, + { + "epoch": 0.18516, + "grad_norm": 0.31020593643188477, + "learning_rate": 1.9561413156579134e-05, + "loss": 0.1086, + "step": 9258 + }, + { + "epoch": 0.1852, + "grad_norm": 0.38485828042030334, + "learning_rate": 1.9561004089784726e-05, + "loss": 0.0344, + "step": 9260 + }, + { + "epoch": 0.18524, + "grad_norm": 0.37134209275245667, + "learning_rate": 1.956059483659362e-05, + "loss": 0.3013, + "step": 9262 + }, + { + "epoch": 0.18528, + "grad_norm": 3.5883562564849854, + "learning_rate": 1.9560185397013794e-05, + "loss": 0.8074, + "step": 9264 + }, + { + "epoch": 0.18532, + "grad_norm": 0.9051640629768372, + "learning_rate": 1.955977577105323e-05, + "loss": 0.0651, + "step": 9266 + }, + { + "epoch": 0.18536, + "grad_norm": 1.419571042060852, + "learning_rate": 1.9559365958719922e-05, + "loss": 0.1444, + "step": 9268 + }, + { + "epoch": 0.1854, + "grad_norm": 1.5189765691757202, + "learning_rate": 1.9558955960021847e-05, + "loss": 0.0995, + "step": 9270 + }, + { + "epoch": 0.18544, + "grad_norm": 0.7034042477607727, + "learning_rate": 1.9558545774967007e-05, + "loss": 0.1105, + "step": 9272 + }, + { + "epoch": 0.18548, + "grad_norm": 1.175128698348999, + "learning_rate": 1.9558135403563397e-05, + "loss": 0.1011, + "step": 9274 + }, + { + "epoch": 0.18552, + "grad_norm": 0.3182179033756256, + "learning_rate": 1.9557724845819013e-05, + "loss": 0.1673, + "step": 9276 + }, + { + "epoch": 0.18556, + "grad_norm": 2.541691303253174, + "learning_rate": 1.955731410174187e-05, + "loss": 0.2223, + "step": 9278 + }, + { + "epoch": 0.1856, + "grad_norm": 1.6336935758590698, + "learning_rate": 1.9556903171339963e-05, + "loss": 0.3004, + "step": 9280 + }, + { + "epoch": 0.18564, + "grad_norm": 0.4086708128452301, + "learning_rate": 1.955649205462131e-05, + "loss": 0.2698, + "step": 9282 + }, + { + "epoch": 0.18568, + "grad_norm": 1.9377667903900146, + "learning_rate": 1.955608075159392e-05, + "loss": 0.1514, + "step": 9284 + }, + { + "epoch": 0.18572, + "grad_norm": 2.022303342819214, + "learning_rate": 1.955566926226582e-05, + "loss": 0.1302, + "step": 9286 + }, + { + "epoch": 0.18576, + "grad_norm": 2.616868019104004, + "learning_rate": 1.9555257586645026e-05, + "loss": 0.3891, + "step": 9288 + }, + { + "epoch": 0.1858, + "grad_norm": 2.469876289367676, + "learning_rate": 1.9554845724739565e-05, + "loss": 0.233, + "step": 9290 + }, + { + "epoch": 0.18584, + "grad_norm": 2.7148711681365967, + "learning_rate": 1.955443367655747e-05, + "loss": 0.2205, + "step": 9292 + }, + { + "epoch": 0.18588, + "grad_norm": 4.003706455230713, + "learning_rate": 1.9554021442106774e-05, + "loss": 0.5118, + "step": 9294 + }, + { + "epoch": 0.18592, + "grad_norm": 0.7453495860099792, + "learning_rate": 1.9553609021395507e-05, + "loss": 0.1764, + "step": 9296 + }, + { + "epoch": 0.18596, + "grad_norm": 1.9994455575942993, + "learning_rate": 1.9553196414431713e-05, + "loss": 0.1992, + "step": 9298 + }, + { + "epoch": 0.186, + "grad_norm": 0.5579773187637329, + "learning_rate": 1.9552783621223437e-05, + "loss": 0.1774, + "step": 9300 + }, + { + "epoch": 0.18604, + "grad_norm": 1.4467592239379883, + "learning_rate": 1.9552370641778728e-05, + "loss": 0.1184, + "step": 9302 + }, + { + "epoch": 0.18608, + "grad_norm": 1.9522837400436401, + "learning_rate": 1.9551957476105637e-05, + "loss": 0.1489, + "step": 9304 + }, + { + "epoch": 0.18612, + "grad_norm": 3.3384392261505127, + "learning_rate": 1.9551544124212213e-05, + "loss": 0.2574, + "step": 9306 + }, + { + "epoch": 0.18616, + "grad_norm": 1.1845057010650635, + "learning_rate": 1.9551130586106523e-05, + "loss": 0.1919, + "step": 9308 + }, + { + "epoch": 0.1862, + "grad_norm": 1.19535231590271, + "learning_rate": 1.9550716861796623e-05, + "loss": 0.0953, + "step": 9310 + }, + { + "epoch": 0.18624, + "grad_norm": 1.9294077157974243, + "learning_rate": 1.955030295129058e-05, + "loss": 0.3255, + "step": 9312 + }, + { + "epoch": 0.18628, + "grad_norm": 0.9149627089500427, + "learning_rate": 1.9549888854596465e-05, + "loss": 0.0756, + "step": 9314 + }, + { + "epoch": 0.18632, + "grad_norm": 1.17626953125, + "learning_rate": 1.9549474571722352e-05, + "loss": 0.1464, + "step": 9316 + }, + { + "epoch": 0.18636, + "grad_norm": 0.7668462991714478, + "learning_rate": 1.954906010267631e-05, + "loss": 0.0575, + "step": 9318 + }, + { + "epoch": 0.1864, + "grad_norm": 0.9789817333221436, + "learning_rate": 1.9548645447466433e-05, + "loss": 0.1747, + "step": 9320 + }, + { + "epoch": 0.18644, + "grad_norm": 0.5351831912994385, + "learning_rate": 1.954823060610079e-05, + "loss": 0.0798, + "step": 9322 + }, + { + "epoch": 0.18648, + "grad_norm": 1.4308574199676514, + "learning_rate": 1.9547815578587478e-05, + "loss": 0.278, + "step": 9324 + }, + { + "epoch": 0.18652, + "grad_norm": 0.5537680387496948, + "learning_rate": 1.954740036493459e-05, + "loss": 0.1238, + "step": 9326 + }, + { + "epoch": 0.18656, + "grad_norm": 2.665686845779419, + "learning_rate": 1.9546984965150212e-05, + "loss": 0.2028, + "step": 9328 + }, + { + "epoch": 0.1866, + "grad_norm": 4.23110294342041, + "learning_rate": 1.9546569379242446e-05, + "loss": 0.4499, + "step": 9330 + }, + { + "epoch": 0.18664, + "grad_norm": 2.2324321269989014, + "learning_rate": 1.9546153607219398e-05, + "loss": 0.3075, + "step": 9332 + }, + { + "epoch": 0.18668, + "grad_norm": 1.0523728132247925, + "learning_rate": 1.954573764908917e-05, + "loss": 0.3367, + "step": 9334 + }, + { + "epoch": 0.18672, + "grad_norm": 3.510509490966797, + "learning_rate": 1.9545321504859875e-05, + "loss": 0.3145, + "step": 9336 + }, + { + "epoch": 0.18676, + "grad_norm": 1.570249080657959, + "learning_rate": 1.9544905174539616e-05, + "loss": 0.1754, + "step": 9338 + }, + { + "epoch": 0.1868, + "grad_norm": 0.3083939850330353, + "learning_rate": 1.9544488658136522e-05, + "loss": 0.133, + "step": 9340 + }, + { + "epoch": 0.18684, + "grad_norm": 1.5167268514633179, + "learning_rate": 1.9544071955658704e-05, + "loss": 0.4513, + "step": 9342 + }, + { + "epoch": 0.18688, + "grad_norm": 1.027532696723938, + "learning_rate": 1.9543655067114294e-05, + "loss": 0.1589, + "step": 9344 + }, + { + "epoch": 0.18692, + "grad_norm": 1.3162343502044678, + "learning_rate": 1.954323799251141e-05, + "loss": 0.1635, + "step": 9346 + }, + { + "epoch": 0.18696, + "grad_norm": 1.3201563358306885, + "learning_rate": 1.9542820731858195e-05, + "loss": 0.1835, + "step": 9348 + }, + { + "epoch": 0.187, + "grad_norm": 2.1801631450653076, + "learning_rate": 1.954240328516277e-05, + "loss": 0.1268, + "step": 9350 + }, + { + "epoch": 0.18704, + "grad_norm": 19.543203353881836, + "learning_rate": 1.9541985652433283e-05, + "loss": 0.5805, + "step": 9352 + }, + { + "epoch": 0.18708, + "grad_norm": 1.7206535339355469, + "learning_rate": 1.9541567833677876e-05, + "loss": 0.3314, + "step": 9354 + }, + { + "epoch": 0.18712, + "grad_norm": 1.192323923110962, + "learning_rate": 1.9541149828904686e-05, + "loss": 0.0854, + "step": 9356 + }, + { + "epoch": 0.18716, + "grad_norm": 2.406768560409546, + "learning_rate": 1.954073163812187e-05, + "loss": 0.2778, + "step": 9358 + }, + { + "epoch": 0.1872, + "grad_norm": 4.5520219802856445, + "learning_rate": 1.954031326133758e-05, + "loss": 0.2141, + "step": 9360 + }, + { + "epoch": 0.18724, + "grad_norm": 1.4339920282363892, + "learning_rate": 1.9539894698559973e-05, + "loss": 0.1783, + "step": 9362 + }, + { + "epoch": 0.18728, + "grad_norm": 0.7612699866294861, + "learning_rate": 1.9539475949797203e-05, + "loss": 0.0971, + "step": 9364 + }, + { + "epoch": 0.18732, + "grad_norm": 3.3820652961730957, + "learning_rate": 1.953905701505744e-05, + "loss": 0.308, + "step": 9366 + }, + { + "epoch": 0.18736, + "grad_norm": 2.672051429748535, + "learning_rate": 1.9538637894348852e-05, + "loss": 0.3805, + "step": 9368 + }, + { + "epoch": 0.1874, + "grad_norm": 1.5654019117355347, + "learning_rate": 1.9538218587679605e-05, + "loss": 0.0992, + "step": 9370 + }, + { + "epoch": 0.18744, + "grad_norm": 3.8099896907806396, + "learning_rate": 1.9537799095057878e-05, + "loss": 0.5778, + "step": 9372 + }, + { + "epoch": 0.18748, + "grad_norm": 1.606709361076355, + "learning_rate": 1.9537379416491843e-05, + "loss": 0.1671, + "step": 9374 + }, + { + "epoch": 0.18752, + "grad_norm": 2.4158120155334473, + "learning_rate": 1.9536959551989692e-05, + "loss": 0.2555, + "step": 9376 + }, + { + "epoch": 0.18756, + "grad_norm": 2.3093230724334717, + "learning_rate": 1.95365395015596e-05, + "loss": 0.2894, + "step": 9378 + }, + { + "epoch": 0.1876, + "grad_norm": 1.4772365093231201, + "learning_rate": 1.9536119265209763e-05, + "loss": 0.2481, + "step": 9380 + }, + { + "epoch": 0.18764, + "grad_norm": 2.831845760345459, + "learning_rate": 1.953569884294837e-05, + "loss": 0.301, + "step": 9382 + }, + { + "epoch": 0.18768, + "grad_norm": 0.42079147696495056, + "learning_rate": 1.953527823478362e-05, + "loss": 0.0698, + "step": 9384 + }, + { + "epoch": 0.18772, + "grad_norm": 1.2649489641189575, + "learning_rate": 1.9534857440723708e-05, + "loss": 0.1967, + "step": 9386 + }, + { + "epoch": 0.18776, + "grad_norm": 3.168179988861084, + "learning_rate": 1.9534436460776845e-05, + "loss": 0.2521, + "step": 9388 + }, + { + "epoch": 0.1878, + "grad_norm": 1.9515725374221802, + "learning_rate": 1.9534015294951235e-05, + "loss": 0.1345, + "step": 9390 + }, + { + "epoch": 0.18784, + "grad_norm": 1.0323916673660278, + "learning_rate": 1.9533593943255087e-05, + "loss": 0.1174, + "step": 9392 + }, + { + "epoch": 0.18788, + "grad_norm": 1.8460896015167236, + "learning_rate": 1.9533172405696613e-05, + "loss": 0.3315, + "step": 9394 + }, + { + "epoch": 0.18792, + "grad_norm": 3.6755073070526123, + "learning_rate": 1.953275068228404e-05, + "loss": 0.3543, + "step": 9396 + }, + { + "epoch": 0.18796, + "grad_norm": 1.8449662923812866, + "learning_rate": 1.9532328773025587e-05, + "loss": 0.1098, + "step": 9398 + }, + { + "epoch": 0.188, + "grad_norm": 2.3808956146240234, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.2232, + "step": 9400 + }, + { + "epoch": 0.18804, + "grad_norm": 0.9629417061805725, + "learning_rate": 1.953148439700393e-05, + "loss": 0.0948, + "step": 9402 + }, + { + "epoch": 0.18808, + "grad_norm": 2.415940046310425, + "learning_rate": 1.9531061930257194e-05, + "loss": 0.2258, + "step": 9404 + }, + { + "epoch": 0.18812, + "grad_norm": 3.268970251083374, + "learning_rate": 1.9530639277697498e-05, + "loss": 0.3597, + "step": 9406 + }, + { + "epoch": 0.18816, + "grad_norm": 0.9257825613021851, + "learning_rate": 1.9530216439333085e-05, + "loss": 0.1586, + "step": 9408 + }, + { + "epoch": 0.1882, + "grad_norm": 1.3804137706756592, + "learning_rate": 1.952979341517219e-05, + "loss": 0.2412, + "step": 9410 + }, + { + "epoch": 0.18824, + "grad_norm": 1.8498977422714233, + "learning_rate": 1.952937020522307e-05, + "loss": 0.2865, + "step": 9412 + }, + { + "epoch": 0.18828, + "grad_norm": 0.9115754961967468, + "learning_rate": 1.9528946809493973e-05, + "loss": 0.0486, + "step": 9414 + }, + { + "epoch": 0.18832, + "grad_norm": 1.0714563131332397, + "learning_rate": 1.952852322799315e-05, + "loss": 0.1191, + "step": 9416 + }, + { + "epoch": 0.18836, + "grad_norm": 4.037947654724121, + "learning_rate": 1.952809946072886e-05, + "loss": 0.312, + "step": 9418 + }, + { + "epoch": 0.1884, + "grad_norm": 0.5623495578765869, + "learning_rate": 1.9527675507709368e-05, + "loss": 0.4498, + "step": 9420 + }, + { + "epoch": 0.18844, + "grad_norm": 3.7230727672576904, + "learning_rate": 1.9527251368942935e-05, + "loss": 0.2767, + "step": 9422 + }, + { + "epoch": 0.18848, + "grad_norm": 3.989034652709961, + "learning_rate": 1.9526827044437833e-05, + "loss": 0.3928, + "step": 9424 + }, + { + "epoch": 0.18852, + "grad_norm": 1.4996123313903809, + "learning_rate": 1.952640253420233e-05, + "loss": 0.1104, + "step": 9426 + }, + { + "epoch": 0.18856, + "grad_norm": 1.0327763557434082, + "learning_rate": 1.952597783824471e-05, + "loss": 0.3204, + "step": 9428 + }, + { + "epoch": 0.1886, + "grad_norm": 2.3546266555786133, + "learning_rate": 1.9525552956573244e-05, + "loss": 0.2867, + "step": 9430 + }, + { + "epoch": 0.18864, + "grad_norm": 1.2215509414672852, + "learning_rate": 1.952512788919622e-05, + "loss": 0.153, + "step": 9432 + }, + { + "epoch": 0.18868, + "grad_norm": 1.8348017930984497, + "learning_rate": 1.9524702636121925e-05, + "loss": 0.2106, + "step": 9434 + }, + { + "epoch": 0.18872, + "grad_norm": 1.5128921270370483, + "learning_rate": 1.952427719735865e-05, + "loss": 0.2236, + "step": 9436 + }, + { + "epoch": 0.18876, + "grad_norm": 0.9722334146499634, + "learning_rate": 1.9523851572914685e-05, + "loss": 0.131, + "step": 9438 + }, + { + "epoch": 0.1888, + "grad_norm": 0.8775439858436584, + "learning_rate": 1.9523425762798328e-05, + "loss": 0.3149, + "step": 9440 + }, + { + "epoch": 0.18884, + "grad_norm": 2.6044528484344482, + "learning_rate": 1.9522999767017885e-05, + "loss": 0.2044, + "step": 9442 + }, + { + "epoch": 0.18888, + "grad_norm": 1.0190510749816895, + "learning_rate": 1.952257358558166e-05, + "loss": 0.0824, + "step": 9444 + }, + { + "epoch": 0.18892, + "grad_norm": 1.3524110317230225, + "learning_rate": 1.9522147218497962e-05, + "loss": 0.1132, + "step": 9446 + }, + { + "epoch": 0.18896, + "grad_norm": 2.7819302082061768, + "learning_rate": 1.9521720665775098e-05, + "loss": 0.3314, + "step": 9448 + }, + { + "epoch": 0.189, + "grad_norm": 2.209686517715454, + "learning_rate": 1.9521293927421388e-05, + "loss": 0.4057, + "step": 9450 + }, + { + "epoch": 0.18904, + "grad_norm": 1.667487621307373, + "learning_rate": 1.9520867003445152e-05, + "loss": 0.2965, + "step": 9452 + }, + { + "epoch": 0.18908, + "grad_norm": 1.844473958015442, + "learning_rate": 1.9520439893854713e-05, + "loss": 0.2891, + "step": 9454 + }, + { + "epoch": 0.18912, + "grad_norm": 0.5818921327590942, + "learning_rate": 1.9520012598658397e-05, + "loss": 0.1822, + "step": 9456 + }, + { + "epoch": 0.18916, + "grad_norm": 0.9006745219230652, + "learning_rate": 1.9519585117864534e-05, + "loss": 0.133, + "step": 9458 + }, + { + "epoch": 0.1892, + "grad_norm": 0.9942321181297302, + "learning_rate": 1.9519157451481453e-05, + "loss": 0.0938, + "step": 9460 + }, + { + "epoch": 0.18924, + "grad_norm": 2.130894184112549, + "learning_rate": 1.9518729599517502e-05, + "loss": 0.5006, + "step": 9462 + }, + { + "epoch": 0.18928, + "grad_norm": 0.7670815587043762, + "learning_rate": 1.9518301561981016e-05, + "loss": 0.3583, + "step": 9464 + }, + { + "epoch": 0.18932, + "grad_norm": 3.354480028152466, + "learning_rate": 1.951787333888034e-05, + "loss": 0.477, + "step": 9466 + }, + { + "epoch": 0.18936, + "grad_norm": 1.9112355709075928, + "learning_rate": 1.9517444930223825e-05, + "loss": 0.1511, + "step": 9468 + }, + { + "epoch": 0.1894, + "grad_norm": 2.978264570236206, + "learning_rate": 1.9517016336019817e-05, + "loss": 0.5246, + "step": 9470 + }, + { + "epoch": 0.18944, + "grad_norm": 1.6822426319122314, + "learning_rate": 1.9516587556276677e-05, + "loss": 0.2773, + "step": 9472 + }, + { + "epoch": 0.18948, + "grad_norm": 1.0342905521392822, + "learning_rate": 1.9516158591002768e-05, + "loss": 0.1657, + "step": 9474 + }, + { + "epoch": 0.18952, + "grad_norm": 1.038438081741333, + "learning_rate": 1.9515729440206444e-05, + "loss": 0.1543, + "step": 9476 + }, + { + "epoch": 0.18956, + "grad_norm": 2.8317883014678955, + "learning_rate": 1.9515300103896075e-05, + "loss": 0.2292, + "step": 9478 + }, + { + "epoch": 0.1896, + "grad_norm": 4.080028057098389, + "learning_rate": 1.951487058208003e-05, + "loss": 0.564, + "step": 9480 + }, + { + "epoch": 0.18964, + "grad_norm": 0.6378528475761414, + "learning_rate": 1.951444087476669e-05, + "loss": 0.329, + "step": 9482 + }, + { + "epoch": 0.18968, + "grad_norm": 0.9534415006637573, + "learning_rate": 1.9514010981964426e-05, + "loss": 0.0812, + "step": 9484 + }, + { + "epoch": 0.18972, + "grad_norm": 1.9416404962539673, + "learning_rate": 1.9513580903681614e-05, + "loss": 0.1316, + "step": 9486 + }, + { + "epoch": 0.18976, + "grad_norm": 1.8266639709472656, + "learning_rate": 1.951315063992665e-05, + "loss": 0.34, + "step": 9488 + }, + { + "epoch": 0.1898, + "grad_norm": 3.0679771900177, + "learning_rate": 1.9512720190707915e-05, + "loss": 0.4548, + "step": 9490 + }, + { + "epoch": 0.18984, + "grad_norm": 1.0334289073944092, + "learning_rate": 1.9512289556033802e-05, + "loss": 0.15, + "step": 9492 + }, + { + "epoch": 0.18988, + "grad_norm": 2.359510898590088, + "learning_rate": 1.9511858735912708e-05, + "loss": 0.2559, + "step": 9494 + }, + { + "epoch": 0.18992, + "grad_norm": 1.6480681896209717, + "learning_rate": 1.951142773035303e-05, + "loss": 0.205, + "step": 9496 + }, + { + "epoch": 0.18996, + "grad_norm": 1.7805333137512207, + "learning_rate": 1.951099653936317e-05, + "loss": 0.1681, + "step": 9498 + }, + { + "epoch": 0.19, + "grad_norm": 1.354048490524292, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.1074, + "step": 9500 + }, + { + "epoch": 0.19004, + "grad_norm": 0.8875983357429504, + "learning_rate": 1.951013360112654e-05, + "loss": 0.2698, + "step": 9502 + }, + { + "epoch": 0.19008, + "grad_norm": 3.1397862434387207, + "learning_rate": 1.950970185389659e-05, + "loss": 0.4118, + "step": 9504 + }, + { + "epoch": 0.19012, + "grad_norm": 1.713330626487732, + "learning_rate": 1.9509269921270115e-05, + "loss": 0.1154, + "step": 9506 + }, + { + "epoch": 0.19016, + "grad_norm": 0.40184077620506287, + "learning_rate": 1.950883780325552e-05, + "loss": 0.1544, + "step": 9508 + }, + { + "epoch": 0.1902, + "grad_norm": 0.57676762342453, + "learning_rate": 1.9508405499861235e-05, + "loss": 0.2298, + "step": 9510 + }, + { + "epoch": 0.19024, + "grad_norm": 2.770632743835449, + "learning_rate": 1.950797301109569e-05, + "loss": 0.2398, + "step": 9512 + }, + { + "epoch": 0.19028, + "grad_norm": 1.218796730041504, + "learning_rate": 1.950754033696732e-05, + "loss": 0.2037, + "step": 9514 + }, + { + "epoch": 0.19032, + "grad_norm": 2.883981466293335, + "learning_rate": 1.9507107477484555e-05, + "loss": 0.2784, + "step": 9516 + }, + { + "epoch": 0.19036, + "grad_norm": 2.134991407394409, + "learning_rate": 1.9506674432655833e-05, + "loss": 0.4412, + "step": 9518 + }, + { + "epoch": 0.1904, + "grad_norm": 1.9234366416931152, + "learning_rate": 1.95062412024896e-05, + "loss": 0.1744, + "step": 9520 + }, + { + "epoch": 0.19044, + "grad_norm": 0.20433446764945984, + "learning_rate": 1.9505807786994305e-05, + "loss": 0.1156, + "step": 9522 + }, + { + "epoch": 0.19048, + "grad_norm": 0.6686376333236694, + "learning_rate": 1.9505374186178384e-05, + "loss": 0.2385, + "step": 9524 + }, + { + "epoch": 0.19052, + "grad_norm": 0.4205217957496643, + "learning_rate": 1.9504940400050306e-05, + "loss": 0.1846, + "step": 9526 + }, + { + "epoch": 0.19056, + "grad_norm": 0.6399107575416565, + "learning_rate": 1.950450642861852e-05, + "loss": 0.161, + "step": 9528 + }, + { + "epoch": 0.1906, + "grad_norm": 2.7534143924713135, + "learning_rate": 1.9504072271891486e-05, + "loss": 0.2326, + "step": 9530 + }, + { + "epoch": 0.19064, + "grad_norm": 1.5532582998275757, + "learning_rate": 1.9503637929877674e-05, + "loss": 0.1553, + "step": 9532 + }, + { + "epoch": 0.19068, + "grad_norm": 1.5936890840530396, + "learning_rate": 1.9503203402585546e-05, + "loss": 0.4513, + "step": 9534 + }, + { + "epoch": 0.19072, + "grad_norm": 0.602949857711792, + "learning_rate": 1.9502768690023574e-05, + "loss": 0.1064, + "step": 9536 + }, + { + "epoch": 0.19076, + "grad_norm": 0.9524481296539307, + "learning_rate": 1.9502333792200237e-05, + "loss": 0.2877, + "step": 9538 + }, + { + "epoch": 0.1908, + "grad_norm": 2.285128355026245, + "learning_rate": 1.950189870912401e-05, + "loss": 0.4181, + "step": 9540 + }, + { + "epoch": 0.19084, + "grad_norm": 2.395177125930786, + "learning_rate": 1.950146344080337e-05, + "loss": 0.2878, + "step": 9542 + }, + { + "epoch": 0.19088, + "grad_norm": 1.5207443237304688, + "learning_rate": 1.9501027987246813e-05, + "loss": 0.1308, + "step": 9544 + }, + { + "epoch": 0.19092, + "grad_norm": 2.109400510787964, + "learning_rate": 1.9500592348462825e-05, + "loss": 0.1758, + "step": 9546 + }, + { + "epoch": 0.19096, + "grad_norm": 2.495326042175293, + "learning_rate": 1.95001565244599e-05, + "loss": 0.312, + "step": 9548 + }, + { + "epoch": 0.191, + "grad_norm": 1.8253116607666016, + "learning_rate": 1.9499720515246524e-05, + "loss": 0.1802, + "step": 9550 + }, + { + "epoch": 0.19104, + "grad_norm": 0.9338781237602234, + "learning_rate": 1.9499284320831213e-05, + "loss": 0.1312, + "step": 9552 + }, + { + "epoch": 0.19108, + "grad_norm": 1.8437707424163818, + "learning_rate": 1.9498847941222464e-05, + "loss": 0.186, + "step": 9554 + }, + { + "epoch": 0.19112, + "grad_norm": 1.1450963020324707, + "learning_rate": 1.949841137642878e-05, + "loss": 0.3238, + "step": 9556 + }, + { + "epoch": 0.19116, + "grad_norm": 1.2262176275253296, + "learning_rate": 1.9497974626458677e-05, + "loss": 0.1942, + "step": 9558 + }, + { + "epoch": 0.1912, + "grad_norm": 1.9120895862579346, + "learning_rate": 1.949753769132067e-05, + "loss": 0.2695, + "step": 9560 + }, + { + "epoch": 0.19124, + "grad_norm": 1.5375070571899414, + "learning_rate": 1.9497100571023274e-05, + "loss": 0.1784, + "step": 9562 + }, + { + "epoch": 0.19128, + "grad_norm": 1.8709008693695068, + "learning_rate": 1.949666326557502e-05, + "loss": 0.3172, + "step": 9564 + }, + { + "epoch": 0.19132, + "grad_norm": 1.1070479154586792, + "learning_rate": 1.949622577498442e-05, + "loss": 0.2354, + "step": 9566 + }, + { + "epoch": 0.19136, + "grad_norm": 0.6754641532897949, + "learning_rate": 1.9495788099260008e-05, + "loss": 0.17, + "step": 9568 + }, + { + "epoch": 0.1914, + "grad_norm": 1.2009860277175903, + "learning_rate": 1.949535023841032e-05, + "loss": 0.3151, + "step": 9570 + }, + { + "epoch": 0.19144, + "grad_norm": 0.8629559874534607, + "learning_rate": 1.949491219244389e-05, + "loss": 0.131, + "step": 9572 + }, + { + "epoch": 0.19148, + "grad_norm": 1.5030291080474854, + "learning_rate": 1.9494473961369263e-05, + "loss": 0.2353, + "step": 9574 + }, + { + "epoch": 0.19152, + "grad_norm": 0.8149023652076721, + "learning_rate": 1.9494035545194975e-05, + "loss": 0.0875, + "step": 9576 + }, + { + "epoch": 0.19156, + "grad_norm": 1.835091471672058, + "learning_rate": 1.9493596943929574e-05, + "loss": 0.2925, + "step": 9578 + }, + { + "epoch": 0.1916, + "grad_norm": 1.4542709589004517, + "learning_rate": 1.9493158157581617e-05, + "loss": 0.1185, + "step": 9580 + }, + { + "epoch": 0.19164, + "grad_norm": 1.9604910612106323, + "learning_rate": 1.949271918615965e-05, + "loss": 0.23, + "step": 9582 + }, + { + "epoch": 0.19168, + "grad_norm": 1.763771891593933, + "learning_rate": 1.9492280029672237e-05, + "loss": 0.1856, + "step": 9584 + }, + { + "epoch": 0.19172, + "grad_norm": 1.689609169960022, + "learning_rate": 1.9491840688127938e-05, + "loss": 0.1766, + "step": 9586 + }, + { + "epoch": 0.19176, + "grad_norm": 2.1000747680664062, + "learning_rate": 1.949140116153532e-05, + "loss": 0.2352, + "step": 9588 + }, + { + "epoch": 0.1918, + "grad_norm": 0.8840706944465637, + "learning_rate": 1.9490961449902946e-05, + "loss": 0.0738, + "step": 9590 + }, + { + "epoch": 0.19184, + "grad_norm": 2.879063844680786, + "learning_rate": 1.9490521553239397e-05, + "loss": 0.2475, + "step": 9592 + }, + { + "epoch": 0.19188, + "grad_norm": 1.6478220224380493, + "learning_rate": 1.9490081471553243e-05, + "loss": 0.1075, + "step": 9594 + }, + { + "epoch": 0.19192, + "grad_norm": 0.9287247657775879, + "learning_rate": 1.948964120485306e-05, + "loss": 0.0673, + "step": 9596 + }, + { + "epoch": 0.19196, + "grad_norm": 2.0111191272735596, + "learning_rate": 1.9489200753147442e-05, + "loss": 0.1232, + "step": 9598 + }, + { + "epoch": 0.192, + "grad_norm": 0.4796012043952942, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.0525, + "step": 9600 + }, + { + "epoch": 0.19204, + "grad_norm": 0.8121281862258911, + "learning_rate": 1.9488319294754228e-05, + "loss": 0.1349, + "step": 9602 + }, + { + "epoch": 0.19208, + "grad_norm": 0.534895658493042, + "learning_rate": 1.948787828808382e-05, + "loss": 0.326, + "step": 9604 + }, + { + "epoch": 0.19212, + "grad_norm": 2.8003642559051514, + "learning_rate": 1.948743709644234e-05, + "loss": 0.3145, + "step": 9606 + }, + { + "epoch": 0.19216, + "grad_norm": 2.2718112468719482, + "learning_rate": 1.9486995719838392e-05, + "loss": 0.178, + "step": 9608 + }, + { + "epoch": 0.1922, + "grad_norm": 0.5057771801948547, + "learning_rate": 1.9486554158280576e-05, + "loss": 0.0842, + "step": 9610 + }, + { + "epoch": 0.19224, + "grad_norm": 0.4808379113674164, + "learning_rate": 1.9486112411777504e-05, + "loss": 0.0668, + "step": 9612 + }, + { + "epoch": 0.19228, + "grad_norm": 2.305736541748047, + "learning_rate": 1.9485670480337785e-05, + "loss": 0.2508, + "step": 9614 + }, + { + "epoch": 0.19232, + "grad_norm": 1.6206388473510742, + "learning_rate": 1.9485228363970038e-05, + "loss": 0.3637, + "step": 9616 + }, + { + "epoch": 0.19236, + "grad_norm": 0.6086488366127014, + "learning_rate": 1.948478606268288e-05, + "loss": 0.0598, + "step": 9618 + }, + { + "epoch": 0.1924, + "grad_norm": 1.270068883895874, + "learning_rate": 1.9484343576484935e-05, + "loss": 0.1325, + "step": 9620 + }, + { + "epoch": 0.19244, + "grad_norm": 0.4462535083293915, + "learning_rate": 1.948390090538483e-05, + "loss": 0.2519, + "step": 9622 + }, + { + "epoch": 0.19248, + "grad_norm": 1.5438551902770996, + "learning_rate": 1.9483458049391188e-05, + "loss": 0.1462, + "step": 9624 + }, + { + "epoch": 0.19252, + "grad_norm": 1.2739529609680176, + "learning_rate": 1.9483015008512655e-05, + "loss": 0.5178, + "step": 9626 + }, + { + "epoch": 0.19256, + "grad_norm": 0.5197372436523438, + "learning_rate": 1.948257178275786e-05, + "loss": 0.101, + "step": 9628 + }, + { + "epoch": 0.1926, + "grad_norm": 1.1742628812789917, + "learning_rate": 1.9482128372135446e-05, + "loss": 0.2101, + "step": 9630 + }, + { + "epoch": 0.19264, + "grad_norm": 2.572570562362671, + "learning_rate": 1.948168477665406e-05, + "loss": 0.2088, + "step": 9632 + }, + { + "epoch": 0.19268, + "grad_norm": 0.49560806155204773, + "learning_rate": 1.9481240996322347e-05, + "loss": 0.1402, + "step": 9634 + }, + { + "epoch": 0.19272, + "grad_norm": 3.948420286178589, + "learning_rate": 1.948079703114896e-05, + "loss": 0.3407, + "step": 9636 + }, + { + "epoch": 0.19276, + "grad_norm": 2.789116382598877, + "learning_rate": 1.9480352881142553e-05, + "loss": 0.4175, + "step": 9638 + }, + { + "epoch": 0.1928, + "grad_norm": 0.5549703240394592, + "learning_rate": 1.9479908546311783e-05, + "loss": 0.1611, + "step": 9640 + }, + { + "epoch": 0.19284, + "grad_norm": 2.3969850540161133, + "learning_rate": 1.947946402666532e-05, + "loss": 0.5519, + "step": 9642 + }, + { + "epoch": 0.19288, + "grad_norm": 1.630778431892395, + "learning_rate": 1.9479019322211824e-05, + "loss": 0.1857, + "step": 9644 + }, + { + "epoch": 0.19292, + "grad_norm": 1.750318169593811, + "learning_rate": 1.9478574432959965e-05, + "loss": 0.2101, + "step": 9646 + }, + { + "epoch": 0.19296, + "grad_norm": 1.172640085220337, + "learning_rate": 1.947812935891842e-05, + "loss": 0.1748, + "step": 9648 + }, + { + "epoch": 0.193, + "grad_norm": 1.082259178161621, + "learning_rate": 1.947768410009586e-05, + "loss": 0.1618, + "step": 9650 + }, + { + "epoch": 0.19304, + "grad_norm": 1.4227384328842163, + "learning_rate": 1.947723865650097e-05, + "loss": 0.2596, + "step": 9652 + }, + { + "epoch": 0.19308, + "grad_norm": 0.8866002559661865, + "learning_rate": 1.9476793028142433e-05, + "loss": 0.1636, + "step": 9654 + }, + { + "epoch": 0.19312, + "grad_norm": 1.409732699394226, + "learning_rate": 1.947634721502894e-05, + "loss": 0.2352, + "step": 9656 + }, + { + "epoch": 0.19316, + "grad_norm": 1.4026485681533813, + "learning_rate": 1.9475901217169175e-05, + "loss": 0.1592, + "step": 9658 + }, + { + "epoch": 0.1932, + "grad_norm": 1.918944001197815, + "learning_rate": 1.947545503457184e-05, + "loss": 0.1747, + "step": 9660 + }, + { + "epoch": 0.19324, + "grad_norm": 2.8993828296661377, + "learning_rate": 1.9475008667245628e-05, + "loss": 0.241, + "step": 9662 + }, + { + "epoch": 0.19328, + "grad_norm": 2.557218551635742, + "learning_rate": 1.9474562115199246e-05, + "loss": 0.2558, + "step": 9664 + }, + { + "epoch": 0.19332, + "grad_norm": 1.5285977125167847, + "learning_rate": 1.94741153784414e-05, + "loss": 0.1534, + "step": 9666 + }, + { + "epoch": 0.19336, + "grad_norm": 0.5569203495979309, + "learning_rate": 1.9473668456980796e-05, + "loss": 0.0653, + "step": 9668 + }, + { + "epoch": 0.1934, + "grad_norm": 2.118198871612549, + "learning_rate": 1.9473221350826145e-05, + "loss": 0.2665, + "step": 9670 + }, + { + "epoch": 0.19344, + "grad_norm": 1.371516227722168, + "learning_rate": 1.947277405998617e-05, + "loss": 0.1937, + "step": 9672 + }, + { + "epoch": 0.19348, + "grad_norm": 0.7787559628486633, + "learning_rate": 1.9472326584469584e-05, + "loss": 0.1808, + "step": 9674 + }, + { + "epoch": 0.19352, + "grad_norm": 1.033230185508728, + "learning_rate": 1.9471878924285116e-05, + "loss": 0.1514, + "step": 9676 + }, + { + "epoch": 0.19356, + "grad_norm": 0.9491438865661621, + "learning_rate": 1.9471431079441495e-05, + "loss": 0.1111, + "step": 9678 + }, + { + "epoch": 0.1936, + "grad_norm": 1.2080974578857422, + "learning_rate": 1.9470983049947446e-05, + "loss": 0.1305, + "step": 9680 + }, + { + "epoch": 0.19364, + "grad_norm": 0.9699440598487854, + "learning_rate": 1.9470534835811704e-05, + "loss": 0.1403, + "step": 9682 + }, + { + "epoch": 0.19368, + "grad_norm": 1.834931492805481, + "learning_rate": 1.9470086437043014e-05, + "loss": 0.1683, + "step": 9684 + }, + { + "epoch": 0.19372, + "grad_norm": 3.204319477081299, + "learning_rate": 1.9469637853650113e-05, + "loss": 0.6007, + "step": 9686 + }, + { + "epoch": 0.19376, + "grad_norm": 0.3500208556652069, + "learning_rate": 1.9469189085641743e-05, + "loss": 0.3031, + "step": 9688 + }, + { + "epoch": 0.1938, + "grad_norm": 0.4596458673477173, + "learning_rate": 1.946874013302666e-05, + "loss": 0.096, + "step": 9690 + }, + { + "epoch": 0.19384, + "grad_norm": 2.3113086223602295, + "learning_rate": 1.9468290995813614e-05, + "loss": 0.4221, + "step": 9692 + }, + { + "epoch": 0.19388, + "grad_norm": 2.474252939224243, + "learning_rate": 1.946784167401136e-05, + "loss": 0.2874, + "step": 9694 + }, + { + "epoch": 0.19392, + "grad_norm": 0.9908041954040527, + "learning_rate": 1.9467392167628655e-05, + "loss": 0.3094, + "step": 9696 + }, + { + "epoch": 0.19396, + "grad_norm": 1.4397754669189453, + "learning_rate": 1.9466942476674272e-05, + "loss": 0.0989, + "step": 9698 + }, + { + "epoch": 0.194, + "grad_norm": 2.064897060394287, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.1687, + "step": 9700 + }, + { + "epoch": 0.19404, + "grad_norm": 1.0289360284805298, + "learning_rate": 1.9466042541085515e-05, + "loss": 0.1372, + "step": 9702 + }, + { + "epoch": 0.19408, + "grad_norm": 1.2830471992492676, + "learning_rate": 1.946559229646869e-05, + "loss": 0.2149, + "step": 9704 + }, + { + "epoch": 0.19412, + "grad_norm": 1.0622986555099487, + "learning_rate": 1.946514186731527e-05, + "loss": 0.1352, + "step": 9706 + }, + { + "epoch": 0.19416, + "grad_norm": 2.521000862121582, + "learning_rate": 1.9464691253634037e-05, + "loss": 0.2102, + "step": 9708 + }, + { + "epoch": 0.1942, + "grad_norm": 1.3868004083633423, + "learning_rate": 1.9464240455433775e-05, + "loss": 0.1652, + "step": 9710 + }, + { + "epoch": 0.19424, + "grad_norm": 1.7332419157028198, + "learning_rate": 1.9463789472723272e-05, + "loss": 0.1813, + "step": 9712 + }, + { + "epoch": 0.19428, + "grad_norm": 0.474130779504776, + "learning_rate": 1.9463338305511324e-05, + "loss": 0.0348, + "step": 9714 + }, + { + "epoch": 0.19432, + "grad_norm": 1.3292795419692993, + "learning_rate": 1.946288695380672e-05, + "loss": 0.2179, + "step": 9716 + }, + { + "epoch": 0.19436, + "grad_norm": 1.501808762550354, + "learning_rate": 1.9462435417618266e-05, + "loss": 0.3475, + "step": 9718 + }, + { + "epoch": 0.1944, + "grad_norm": 1.2587331533432007, + "learning_rate": 1.946198369695476e-05, + "loss": 0.1055, + "step": 9720 + }, + { + "epoch": 0.19444, + "grad_norm": 2.6129097938537598, + "learning_rate": 1.946153179182501e-05, + "loss": 0.2485, + "step": 9722 + }, + { + "epoch": 0.19448, + "grad_norm": 0.5519055724143982, + "learning_rate": 1.9461079702237827e-05, + "loss": 0.0586, + "step": 9724 + }, + { + "epoch": 0.19452, + "grad_norm": 0.3967776596546173, + "learning_rate": 1.9460627428202028e-05, + "loss": 0.0723, + "step": 9726 + }, + { + "epoch": 0.19456, + "grad_norm": 0.5725026726722717, + "learning_rate": 1.946017496972642e-05, + "loss": 0.0548, + "step": 9728 + }, + { + "epoch": 0.1946, + "grad_norm": 1.2414525747299194, + "learning_rate": 1.945972232681984e-05, + "loss": 0.112, + "step": 9730 + }, + { + "epoch": 0.19464, + "grad_norm": 2.622511863708496, + "learning_rate": 1.9459269499491094e-05, + "loss": 0.1969, + "step": 9732 + }, + { + "epoch": 0.19468, + "grad_norm": 0.9136091470718384, + "learning_rate": 1.9458816487749023e-05, + "loss": 0.1649, + "step": 9734 + }, + { + "epoch": 0.19472, + "grad_norm": 1.593148112297058, + "learning_rate": 1.9458363291602455e-05, + "loss": 0.1968, + "step": 9736 + }, + { + "epoch": 0.19476, + "grad_norm": 1.7927851676940918, + "learning_rate": 1.9457909911060225e-05, + "loss": 0.1461, + "step": 9738 + }, + { + "epoch": 0.1948, + "grad_norm": 1.985498070716858, + "learning_rate": 1.945745634613117e-05, + "loss": 0.3671, + "step": 9740 + }, + { + "epoch": 0.19484, + "grad_norm": 2.9198970794677734, + "learning_rate": 1.9457002596824133e-05, + "loss": 0.3471, + "step": 9742 + }, + { + "epoch": 0.19488, + "grad_norm": 0.970778226852417, + "learning_rate": 1.9456548663147966e-05, + "loss": 0.1549, + "step": 9744 + }, + { + "epoch": 0.19492, + "grad_norm": 1.3575905561447144, + "learning_rate": 1.945609454511151e-05, + "loss": 0.2217, + "step": 9746 + }, + { + "epoch": 0.19496, + "grad_norm": 2.2632248401641846, + "learning_rate": 1.945564024272363e-05, + "loss": 0.1967, + "step": 9748 + }, + { + "epoch": 0.195, + "grad_norm": 0.5009101033210754, + "learning_rate": 1.945518575599317e-05, + "loss": 0.0484, + "step": 9750 + }, + { + "epoch": 0.19504, + "grad_norm": 1.2185016870498657, + "learning_rate": 1.9454731084928995e-05, + "loss": 0.188, + "step": 9752 + }, + { + "epoch": 0.19508, + "grad_norm": 1.658449649810791, + "learning_rate": 1.945427622953997e-05, + "loss": 0.143, + "step": 9754 + }, + { + "epoch": 0.19512, + "grad_norm": 1.798744559288025, + "learning_rate": 1.9453821189834965e-05, + "loss": 0.1439, + "step": 9756 + }, + { + "epoch": 0.19516, + "grad_norm": 1.525763750076294, + "learning_rate": 1.9453365965822847e-05, + "loss": 0.1057, + "step": 9758 + }, + { + "epoch": 0.1952, + "grad_norm": 1.4084064960479736, + "learning_rate": 1.9452910557512497e-05, + "loss": 0.1306, + "step": 9760 + }, + { + "epoch": 0.19524, + "grad_norm": 0.8844407796859741, + "learning_rate": 1.9452454964912782e-05, + "loss": 0.3773, + "step": 9762 + }, + { + "epoch": 0.19528, + "grad_norm": 2.154787302017212, + "learning_rate": 1.9451999188032597e-05, + "loss": 0.3549, + "step": 9764 + }, + { + "epoch": 0.19532, + "grad_norm": 1.8716325759887695, + "learning_rate": 1.9451543226880817e-05, + "loss": 0.3462, + "step": 9766 + }, + { + "epoch": 0.19536, + "grad_norm": 3.3684356212615967, + "learning_rate": 1.9451087081466337e-05, + "loss": 0.4107, + "step": 9768 + }, + { + "epoch": 0.1954, + "grad_norm": 0.439131498336792, + "learning_rate": 1.945063075179805e-05, + "loss": 0.1301, + "step": 9770 + }, + { + "epoch": 0.19544, + "grad_norm": 1.0350403785705566, + "learning_rate": 1.945017423788485e-05, + "loss": 0.1882, + "step": 9772 + }, + { + "epoch": 0.19548, + "grad_norm": 1.5783406496047974, + "learning_rate": 1.944971753973564e-05, + "loss": 0.1584, + "step": 9774 + }, + { + "epoch": 0.19552, + "grad_norm": 0.9058586955070496, + "learning_rate": 1.9449260657359317e-05, + "loss": 0.0721, + "step": 9776 + }, + { + "epoch": 0.19556, + "grad_norm": 2.0271880626678467, + "learning_rate": 1.9448803590764797e-05, + "loss": 0.3892, + "step": 9778 + }, + { + "epoch": 0.1956, + "grad_norm": 1.2001144886016846, + "learning_rate": 1.9448346339960984e-05, + "loss": 0.1878, + "step": 9780 + }, + { + "epoch": 0.19564, + "grad_norm": 2.5591604709625244, + "learning_rate": 1.9447888904956795e-05, + "loss": 0.2085, + "step": 9782 + }, + { + "epoch": 0.19568, + "grad_norm": 3.1863532066345215, + "learning_rate": 1.9447431285761148e-05, + "loss": 0.3071, + "step": 9784 + }, + { + "epoch": 0.19572, + "grad_norm": 2.925565242767334, + "learning_rate": 1.9446973482382964e-05, + "loss": 0.5017, + "step": 9786 + }, + { + "epoch": 0.19576, + "grad_norm": 0.9327439069747925, + "learning_rate": 1.9446515494831168e-05, + "loss": 0.0584, + "step": 9788 + }, + { + "epoch": 0.1958, + "grad_norm": 0.8709267973899841, + "learning_rate": 1.944605732311469e-05, + "loss": 0.1256, + "step": 9790 + }, + { + "epoch": 0.19584, + "grad_norm": 0.7516695857048035, + "learning_rate": 1.9445598967242464e-05, + "loss": 0.0851, + "step": 9792 + }, + { + "epoch": 0.19588, + "grad_norm": 0.7536870837211609, + "learning_rate": 1.944514042722342e-05, + "loss": 0.1607, + "step": 9794 + }, + { + "epoch": 0.19592, + "grad_norm": 0.40438780188560486, + "learning_rate": 1.94446817030665e-05, + "loss": 0.1211, + "step": 9796 + }, + { + "epoch": 0.19596, + "grad_norm": 1.4704614877700806, + "learning_rate": 1.944422279478065e-05, + "loss": 0.3102, + "step": 9798 + }, + { + "epoch": 0.196, + "grad_norm": 1.5322855710983276, + "learning_rate": 1.944376370237481e-05, + "loss": 0.1615, + "step": 9800 + }, + { + "epoch": 0.19604, + "grad_norm": 2.1817309856414795, + "learning_rate": 1.944330442585794e-05, + "loss": 0.4136, + "step": 9802 + }, + { + "epoch": 0.19608, + "grad_norm": 1.7177702188491821, + "learning_rate": 1.9442844965238987e-05, + "loss": 0.3167, + "step": 9804 + }, + { + "epoch": 0.19612, + "grad_norm": 3.5267138481140137, + "learning_rate": 1.9442385320526908e-05, + "loss": 0.4739, + "step": 9806 + }, + { + "epoch": 0.19616, + "grad_norm": 0.6462252736091614, + "learning_rate": 1.944192549173067e-05, + "loss": 0.1598, + "step": 9808 + }, + { + "epoch": 0.1962, + "grad_norm": 0.6216464638710022, + "learning_rate": 1.944146547885923e-05, + "loss": 0.0887, + "step": 9810 + }, + { + "epoch": 0.19624, + "grad_norm": 2.5461483001708984, + "learning_rate": 1.944100528192156e-05, + "loss": 0.3635, + "step": 9812 + }, + { + "epoch": 0.19628, + "grad_norm": 0.7214646935462952, + "learning_rate": 1.944054490092663e-05, + "loss": 0.1632, + "step": 9814 + }, + { + "epoch": 0.19632, + "grad_norm": 1.3354928493499756, + "learning_rate": 1.944008433588342e-05, + "loss": 0.1852, + "step": 9816 + }, + { + "epoch": 0.19636, + "grad_norm": 0.9797312617301941, + "learning_rate": 1.94396235868009e-05, + "loss": 0.0897, + "step": 9818 + }, + { + "epoch": 0.1964, + "grad_norm": 2.057828903198242, + "learning_rate": 1.9439162653688066e-05, + "loss": 0.3166, + "step": 9820 + }, + { + "epoch": 0.19644, + "grad_norm": 0.7125145792961121, + "learning_rate": 1.9438701536553895e-05, + "loss": 0.2286, + "step": 9822 + }, + { + "epoch": 0.19648, + "grad_norm": 2.1528069972991943, + "learning_rate": 1.9438240235407375e-05, + "loss": 0.2422, + "step": 9824 + }, + { + "epoch": 0.19652, + "grad_norm": 1.498796820640564, + "learning_rate": 1.9437778750257504e-05, + "loss": 0.244, + "step": 9826 + }, + { + "epoch": 0.19656, + "grad_norm": 2.358058452606201, + "learning_rate": 1.9437317081113275e-05, + "loss": 0.2549, + "step": 9828 + }, + { + "epoch": 0.1966, + "grad_norm": 1.4192932844161987, + "learning_rate": 1.9436855227983695e-05, + "loss": 0.2118, + "step": 9830 + }, + { + "epoch": 0.19664, + "grad_norm": 2.227900505065918, + "learning_rate": 1.943639319087776e-05, + "loss": 0.3407, + "step": 9832 + }, + { + "epoch": 0.19668, + "grad_norm": 0.9490455389022827, + "learning_rate": 1.9435930969804488e-05, + "loss": 0.1879, + "step": 9834 + }, + { + "epoch": 0.19672, + "grad_norm": 0.9481604099273682, + "learning_rate": 1.9435468564772878e-05, + "loss": 0.1777, + "step": 9836 + }, + { + "epoch": 0.19676, + "grad_norm": 1.6269168853759766, + "learning_rate": 1.943500597579195e-05, + "loss": 0.2101, + "step": 9838 + }, + { + "epoch": 0.1968, + "grad_norm": 1.7867416143417358, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.1984, + "step": 9840 + }, + { + "epoch": 0.19684, + "grad_norm": 1.7178404331207275, + "learning_rate": 1.9434080246018222e-05, + "loss": 0.2153, + "step": 9842 + }, + { + "epoch": 0.19688, + "grad_norm": 1.6523960828781128, + "learning_rate": 1.943361710524347e-05, + "loss": 0.1535, + "step": 9844 + }, + { + "epoch": 0.19692, + "grad_norm": 1.2053940296173096, + "learning_rate": 1.9433153780555495e-05, + "loss": 0.1881, + "step": 9846 + }, + { + "epoch": 0.19696, + "grad_norm": 1.9353913068771362, + "learning_rate": 1.9432690271963327e-05, + "loss": 0.4391, + "step": 9848 + }, + { + "epoch": 0.197, + "grad_norm": 0.7175002098083496, + "learning_rate": 1.943222657947601e-05, + "loss": 0.2469, + "step": 9850 + }, + { + "epoch": 0.19704, + "grad_norm": 1.4169361591339111, + "learning_rate": 1.943176270310258e-05, + "loss": 0.1685, + "step": 9852 + }, + { + "epoch": 0.19708, + "grad_norm": 1.593366265296936, + "learning_rate": 1.943129864285208e-05, + "loss": 0.2023, + "step": 9854 + }, + { + "epoch": 0.19712, + "grad_norm": 0.8151346445083618, + "learning_rate": 1.9430834398733557e-05, + "loss": 0.1533, + "step": 9856 + }, + { + "epoch": 0.19716, + "grad_norm": 0.24851517379283905, + "learning_rate": 1.943036997075606e-05, + "loss": 0.2163, + "step": 9858 + }, + { + "epoch": 0.1972, + "grad_norm": 2.4482760429382324, + "learning_rate": 1.9429905358928648e-05, + "loss": 0.3699, + "step": 9860 + }, + { + "epoch": 0.19724, + "grad_norm": 1.085711121559143, + "learning_rate": 1.9429440563260373e-05, + "loss": 0.1354, + "step": 9862 + }, + { + "epoch": 0.19728, + "grad_norm": 3.1871395111083984, + "learning_rate": 1.9428975583760303e-05, + "loss": 0.3977, + "step": 9864 + }, + { + "epoch": 0.19732, + "grad_norm": 2.4833593368530273, + "learning_rate": 1.9428510420437495e-05, + "loss": 0.3474, + "step": 9866 + }, + { + "epoch": 0.19736, + "grad_norm": 1.2429865598678589, + "learning_rate": 1.9428045073301026e-05, + "loss": 0.2235, + "step": 9868 + }, + { + "epoch": 0.1974, + "grad_norm": 0.8492429852485657, + "learning_rate": 1.9427579542359966e-05, + "loss": 0.1308, + "step": 9870 + }, + { + "epoch": 0.19744, + "grad_norm": 1.4777581691741943, + "learning_rate": 1.9427113827623385e-05, + "loss": 0.1856, + "step": 9872 + }, + { + "epoch": 0.19748, + "grad_norm": 1.010495662689209, + "learning_rate": 1.942664792910037e-05, + "loss": 0.1384, + "step": 9874 + }, + { + "epoch": 0.19752, + "grad_norm": 1.5360684394836426, + "learning_rate": 1.9426181846799998e-05, + "loss": 0.2898, + "step": 9876 + }, + { + "epoch": 0.19756, + "grad_norm": 0.7503257393836975, + "learning_rate": 1.942571558073136e-05, + "loss": 0.1646, + "step": 9878 + }, + { + "epoch": 0.1976, + "grad_norm": 1.0700163841247559, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.1845, + "step": 9880 + }, + { + "epoch": 0.19764, + "grad_norm": 1.1449741125106812, + "learning_rate": 1.9424782497325642e-05, + "loss": 0.2235, + "step": 9882 + }, + { + "epoch": 0.19768, + "grad_norm": 1.7732844352722168, + "learning_rate": 1.9424315680006755e-05, + "loss": 0.3663, + "step": 9884 + }, + { + "epoch": 0.19772, + "grad_norm": 1.2468345165252686, + "learning_rate": 1.942384867895598e-05, + "loss": 0.2824, + "step": 9886 + }, + { + "epoch": 0.19776, + "grad_norm": 0.5036619305610657, + "learning_rate": 1.9423381494182422e-05, + "loss": 0.0967, + "step": 9888 + }, + { + "epoch": 0.1978, + "grad_norm": 1.7644586563110352, + "learning_rate": 1.942291412569519e-05, + "loss": 0.3312, + "step": 9890 + }, + { + "epoch": 0.19784, + "grad_norm": 0.7183815836906433, + "learning_rate": 1.94224465735034e-05, + "loss": 0.1193, + "step": 9892 + }, + { + "epoch": 0.19788, + "grad_norm": 0.62570720911026, + "learning_rate": 1.942197883761616e-05, + "loss": 0.0764, + "step": 9894 + }, + { + "epoch": 0.19792, + "grad_norm": 0.9076298475265503, + "learning_rate": 1.9421510918042593e-05, + "loss": 0.1991, + "step": 9896 + }, + { + "epoch": 0.19796, + "grad_norm": 0.9366265535354614, + "learning_rate": 1.942104281479182e-05, + "loss": 0.1062, + "step": 9898 + }, + { + "epoch": 0.198, + "grad_norm": 2.091053009033203, + "learning_rate": 1.942057452787297e-05, + "loss": 0.2632, + "step": 9900 + }, + { + "epoch": 0.19804, + "grad_norm": 0.544151246547699, + "learning_rate": 1.9420106057295166e-05, + "loss": 0.1647, + "step": 9902 + }, + { + "epoch": 0.19808, + "grad_norm": 2.3449912071228027, + "learning_rate": 1.9419637403067547e-05, + "loss": 0.502, + "step": 9904 + }, + { + "epoch": 0.19812, + "grad_norm": 1.2419512271881104, + "learning_rate": 1.9419168565199246e-05, + "loss": 0.1674, + "step": 9906 + }, + { + "epoch": 0.19816, + "grad_norm": 1.0377113819122314, + "learning_rate": 1.9418699543699402e-05, + "loss": 0.0951, + "step": 9908 + }, + { + "epoch": 0.1982, + "grad_norm": 0.4865454137325287, + "learning_rate": 1.9418230338577164e-05, + "loss": 0.0813, + "step": 9910 + }, + { + "epoch": 0.19824, + "grad_norm": 1.5188478231430054, + "learning_rate": 1.9417760949841672e-05, + "loss": 0.1387, + "step": 9912 + }, + { + "epoch": 0.19828, + "grad_norm": 0.7614434361457825, + "learning_rate": 1.9417291377502086e-05, + "loss": 0.0866, + "step": 9914 + }, + { + "epoch": 0.19832, + "grad_norm": 0.618732750415802, + "learning_rate": 1.941682162156756e-05, + "loss": 0.2423, + "step": 9916 + }, + { + "epoch": 0.19836, + "grad_norm": 1.3193557262420654, + "learning_rate": 1.941635168204724e-05, + "loss": 0.1855, + "step": 9918 + }, + { + "epoch": 0.1984, + "grad_norm": 1.780545711517334, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.3011, + "step": 9920 + }, + { + "epoch": 0.19844, + "grad_norm": 1.6834936141967773, + "learning_rate": 1.9415411252285904e-05, + "loss": 0.2325, + "step": 9922 + }, + { + "epoch": 0.19848, + "grad_norm": 1.409214735031128, + "learning_rate": 1.9414940762063215e-05, + "loss": 0.0691, + "step": 9924 + }, + { + "epoch": 0.19852, + "grad_norm": 0.14483192563056946, + "learning_rate": 1.9414470088291414e-05, + "loss": 0.0484, + "step": 9926 + }, + { + "epoch": 0.19856, + "grad_norm": 1.4105803966522217, + "learning_rate": 1.9413999230979666e-05, + "loss": 0.1346, + "step": 9928 + }, + { + "epoch": 0.1986, + "grad_norm": 1.6850916147232056, + "learning_rate": 1.9413528190137158e-05, + "loss": 0.4713, + "step": 9930 + }, + { + "epoch": 0.19864, + "grad_norm": 0.1176038384437561, + "learning_rate": 1.941305696577307e-05, + "loss": 0.2554, + "step": 9932 + }, + { + "epoch": 0.19868, + "grad_norm": 3.3738763332366943, + "learning_rate": 1.9412585557896594e-05, + "loss": 0.2181, + "step": 9934 + }, + { + "epoch": 0.19872, + "grad_norm": 0.23103876411914825, + "learning_rate": 1.941211396651691e-05, + "loss": 0.2935, + "step": 9936 + }, + { + "epoch": 0.19876, + "grad_norm": 2.4048211574554443, + "learning_rate": 1.9411642191643224e-05, + "loss": 0.3148, + "step": 9938 + }, + { + "epoch": 0.1988, + "grad_norm": 0.320747971534729, + "learning_rate": 1.9411170233284728e-05, + "loss": 0.023, + "step": 9940 + }, + { + "epoch": 0.19884, + "grad_norm": 1.1678743362426758, + "learning_rate": 1.9410698091450625e-05, + "loss": 0.223, + "step": 9942 + }, + { + "epoch": 0.19888, + "grad_norm": 1.0577775239944458, + "learning_rate": 1.941022576615011e-05, + "loss": 0.1748, + "step": 9944 + }, + { + "epoch": 0.19892, + "grad_norm": 1.2733094692230225, + "learning_rate": 1.9409753257392404e-05, + "loss": 0.1531, + "step": 9946 + }, + { + "epoch": 0.19896, + "grad_norm": 1.5215041637420654, + "learning_rate": 1.9409280565186718e-05, + "loss": 0.2383, + "step": 9948 + }, + { + "epoch": 0.199, + "grad_norm": 1.4073636531829834, + "learning_rate": 1.9408807689542257e-05, + "loss": 0.124, + "step": 9950 + }, + { + "epoch": 0.19904, + "grad_norm": 2.33223295211792, + "learning_rate": 1.9408334630468247e-05, + "loss": 0.2728, + "step": 9952 + }, + { + "epoch": 0.19908, + "grad_norm": 1.1935415267944336, + "learning_rate": 1.940786138797391e-05, + "loss": 0.2038, + "step": 9954 + }, + { + "epoch": 0.19912, + "grad_norm": 1.2123422622680664, + "learning_rate": 1.9407387962068473e-05, + "loss": 0.1964, + "step": 9956 + }, + { + "epoch": 0.19916, + "grad_norm": 0.7510764002799988, + "learning_rate": 1.9406914352761162e-05, + "loss": 0.126, + "step": 9958 + }, + { + "epoch": 0.1992, + "grad_norm": 0.9220711588859558, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.1859, + "step": 9960 + }, + { + "epoch": 0.19924, + "grad_norm": 1.8612496852874756, + "learning_rate": 1.9405966583977867e-05, + "loss": 0.164, + "step": 9962 + }, + { + "epoch": 0.19928, + "grad_norm": 0.6552383303642273, + "learning_rate": 1.9405492424520357e-05, + "loss": 0.0929, + "step": 9964 + }, + { + "epoch": 0.19932, + "grad_norm": 1.9411587715148926, + "learning_rate": 1.9405018081697928e-05, + "loss": 0.1814, + "step": 9966 + }, + { + "epoch": 0.19936, + "grad_norm": 2.7789502143859863, + "learning_rate": 1.940454355551983e-05, + "loss": 0.4415, + "step": 9968 + }, + { + "epoch": 0.1994, + "grad_norm": 1.1207019090652466, + "learning_rate": 1.9404068845995317e-05, + "loss": 0.137, + "step": 9970 + }, + { + "epoch": 0.19944, + "grad_norm": 2.1773934364318848, + "learning_rate": 1.940359395313364e-05, + "loss": 0.316, + "step": 9972 + }, + { + "epoch": 0.19948, + "grad_norm": 2.6107490062713623, + "learning_rate": 1.9403118876944052e-05, + "loss": 0.2825, + "step": 9974 + }, + { + "epoch": 0.19952, + "grad_norm": 1.5010229349136353, + "learning_rate": 1.9402643617435824e-05, + "loss": 0.2762, + "step": 9976 + }, + { + "epoch": 0.19956, + "grad_norm": 1.5068572759628296, + "learning_rate": 1.940216817461822e-05, + "loss": 0.2564, + "step": 9978 + }, + { + "epoch": 0.1996, + "grad_norm": 1.3122180700302124, + "learning_rate": 1.9401692548500504e-05, + "loss": 0.1496, + "step": 9980 + }, + { + "epoch": 0.19964, + "grad_norm": 1.0359747409820557, + "learning_rate": 1.940121673909195e-05, + "loss": 0.138, + "step": 9982 + }, + { + "epoch": 0.19968, + "grad_norm": 1.7188358306884766, + "learning_rate": 1.940074074640184e-05, + "loss": 0.1855, + "step": 9984 + }, + { + "epoch": 0.19972, + "grad_norm": 2.1805789470672607, + "learning_rate": 1.9400264570439447e-05, + "loss": 0.2156, + "step": 9986 + }, + { + "epoch": 0.19976, + "grad_norm": 0.2735448181629181, + "learning_rate": 1.9399788211214056e-05, + "loss": 0.0785, + "step": 9988 + }, + { + "epoch": 0.1998, + "grad_norm": 1.8478752374649048, + "learning_rate": 1.9399311668734957e-05, + "loss": 0.1541, + "step": 9990 + }, + { + "epoch": 0.19984, + "grad_norm": 2.8510892391204834, + "learning_rate": 1.9398834943011434e-05, + "loss": 0.2178, + "step": 9992 + }, + { + "epoch": 0.19988, + "grad_norm": 2.2183287143707275, + "learning_rate": 1.939835803405279e-05, + "loss": 0.3559, + "step": 9994 + }, + { + "epoch": 0.19992, + "grad_norm": 0.4033871591091156, + "learning_rate": 1.9397880941868316e-05, + "loss": 0.1402, + "step": 9996 + }, + { + "epoch": 0.19996, + "grad_norm": 2.409390926361084, + "learning_rate": 1.939740366646731e-05, + "loss": 0.2594, + "step": 9998 + }, + { + "epoch": 0.2, + "grad_norm": 0.4244735836982727, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.2143, + "step": 10000 + }, + { + "epoch": 0.20004, + "grad_norm": 0.7509792447090149, + "learning_rate": 1.9396448566052946e-05, + "loss": 0.1074, + "step": 10002 + }, + { + "epoch": 0.20008, + "grad_norm": 1.5755903720855713, + "learning_rate": 1.9395970741058202e-05, + "loss": 0.357, + "step": 10004 + }, + { + "epoch": 0.20012, + "grad_norm": 2.054311513900757, + "learning_rate": 1.9395492732884174e-05, + "loss": 0.2533, + "step": 10006 + }, + { + "epoch": 0.20016, + "grad_norm": 1.9599323272705078, + "learning_rate": 1.939501454154018e-05, + "loss": 0.2694, + "step": 10008 + }, + { + "epoch": 0.2002, + "grad_norm": 0.7059348821640015, + "learning_rate": 1.9394536167035535e-05, + "loss": 0.2234, + "step": 10010 + }, + { + "epoch": 0.20024, + "grad_norm": 2.074901580810547, + "learning_rate": 1.9394057609379575e-05, + "loss": 0.3132, + "step": 10012 + }, + { + "epoch": 0.20028, + "grad_norm": 1.947082757949829, + "learning_rate": 1.9393578868581625e-05, + "loss": 0.1854, + "step": 10014 + }, + { + "epoch": 0.20032, + "grad_norm": 1.436665654182434, + "learning_rate": 1.9393099944651017e-05, + "loss": 0.1648, + "step": 10016 + }, + { + "epoch": 0.20036, + "grad_norm": 1.7094378471374512, + "learning_rate": 1.9392620837597088e-05, + "loss": 0.1778, + "step": 10018 + }, + { + "epoch": 0.2004, + "grad_norm": 0.6356484889984131, + "learning_rate": 1.9392141547429183e-05, + "loss": 0.0719, + "step": 10020 + }, + { + "epoch": 0.20044, + "grad_norm": 0.6875884532928467, + "learning_rate": 1.9391662074156646e-05, + "loss": 0.3146, + "step": 10022 + }, + { + "epoch": 0.20048, + "grad_norm": 1.3729839324951172, + "learning_rate": 1.9391182417788816e-05, + "loss": 0.1099, + "step": 10024 + }, + { + "epoch": 0.20052, + "grad_norm": 0.36588025093078613, + "learning_rate": 1.9390702578335054e-05, + "loss": 0.2409, + "step": 10026 + }, + { + "epoch": 0.20056, + "grad_norm": 0.5418481230735779, + "learning_rate": 1.9390222555804707e-05, + "loss": 0.2466, + "step": 10028 + }, + { + "epoch": 0.2006, + "grad_norm": 2.113774061203003, + "learning_rate": 1.938974235020714e-05, + "loss": 0.2236, + "step": 10030 + }, + { + "epoch": 0.20064, + "grad_norm": 1.5331465005874634, + "learning_rate": 1.938926196155171e-05, + "loss": 0.1644, + "step": 10032 + }, + { + "epoch": 0.20068, + "grad_norm": 2.5724167823791504, + "learning_rate": 1.9388781389847786e-05, + "loss": 0.3399, + "step": 10034 + }, + { + "epoch": 0.20072, + "grad_norm": 1.2149338722229004, + "learning_rate": 1.9388300635104733e-05, + "loss": 0.1176, + "step": 10036 + }, + { + "epoch": 0.20076, + "grad_norm": 1.5507280826568604, + "learning_rate": 1.9387819697331925e-05, + "loss": 0.2328, + "step": 10038 + }, + { + "epoch": 0.2008, + "grad_norm": 1.7160942554473877, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.1574, + "step": 10040 + }, + { + "epoch": 0.20084, + "grad_norm": 4.614802360534668, + "learning_rate": 1.9386857272734558e-05, + "loss": 0.4776, + "step": 10042 + }, + { + "epoch": 0.20088, + "grad_norm": 3.680532217025757, + "learning_rate": 1.9386375785928762e-05, + "loss": 0.2434, + "step": 10044 + }, + { + "epoch": 0.20092, + "grad_norm": 1.9505901336669922, + "learning_rate": 1.9385894116130735e-05, + "loss": 0.197, + "step": 10046 + }, + { + "epoch": 0.20096, + "grad_norm": 1.0197224617004395, + "learning_rate": 1.938541226334987e-05, + "loss": 0.0901, + "step": 10048 + }, + { + "epoch": 0.201, + "grad_norm": 0.4660566449165344, + "learning_rate": 1.938493022759556e-05, + "loss": 0.1256, + "step": 10050 + }, + { + "epoch": 0.20104, + "grad_norm": 0.4345652461051941, + "learning_rate": 1.9384448008877205e-05, + "loss": 0.0965, + "step": 10052 + }, + { + "epoch": 0.20108, + "grad_norm": 1.088256597518921, + "learning_rate": 1.9383965607204205e-05, + "loss": 0.1443, + "step": 10054 + }, + { + "epoch": 0.20112, + "grad_norm": 1.3080309629440308, + "learning_rate": 1.9383483022585966e-05, + "loss": 0.213, + "step": 10056 + }, + { + "epoch": 0.20116, + "grad_norm": 1.321033239364624, + "learning_rate": 1.938300025503189e-05, + "loss": 0.1843, + "step": 10058 + }, + { + "epoch": 0.2012, + "grad_norm": 1.9563403129577637, + "learning_rate": 1.9382517304551397e-05, + "loss": 0.3864, + "step": 10060 + }, + { + "epoch": 0.20124, + "grad_norm": 1.350441575050354, + "learning_rate": 1.9382034171153895e-05, + "loss": 0.3898, + "step": 10062 + }, + { + "epoch": 0.20128, + "grad_norm": 2.9917795658111572, + "learning_rate": 1.9381550854848812e-05, + "loss": 0.4461, + "step": 10064 + }, + { + "epoch": 0.20132, + "grad_norm": 0.8500548601150513, + "learning_rate": 1.938106735564556e-05, + "loss": 0.1146, + "step": 10066 + }, + { + "epoch": 0.20136, + "grad_norm": 1.3503117561340332, + "learning_rate": 1.938058367355357e-05, + "loss": 0.1646, + "step": 10068 + }, + { + "epoch": 0.2014, + "grad_norm": 1.1387546062469482, + "learning_rate": 1.9380099808582278e-05, + "loss": 0.1857, + "step": 10070 + }, + { + "epoch": 0.20144, + "grad_norm": 3.090423107147217, + "learning_rate": 1.9379615760741108e-05, + "loss": 0.4325, + "step": 10072 + }, + { + "epoch": 0.20148, + "grad_norm": 1.3677157163619995, + "learning_rate": 1.9379131530039497e-05, + "loss": 0.1601, + "step": 10074 + }, + { + "epoch": 0.20152, + "grad_norm": 2.709596633911133, + "learning_rate": 1.937864711648689e-05, + "loss": 0.2887, + "step": 10076 + }, + { + "epoch": 0.20156, + "grad_norm": 1.4633097648620605, + "learning_rate": 1.937816252009273e-05, + "loss": 0.2029, + "step": 10078 + }, + { + "epoch": 0.2016, + "grad_norm": 0.6846967935562134, + "learning_rate": 1.937767774086646e-05, + "loss": 0.1932, + "step": 10080 + }, + { + "epoch": 0.20164, + "grad_norm": 1.0249208211898804, + "learning_rate": 1.9377192778817538e-05, + "loss": 0.1041, + "step": 10082 + }, + { + "epoch": 0.20168, + "grad_norm": 2.6483757495880127, + "learning_rate": 1.9376707633955415e-05, + "loss": 0.5213, + "step": 10084 + }, + { + "epoch": 0.20172, + "grad_norm": 0.9131486415863037, + "learning_rate": 1.937622230628955e-05, + "loss": 0.1389, + "step": 10086 + }, + { + "epoch": 0.20176, + "grad_norm": 0.6559426188468933, + "learning_rate": 1.9375736795829402e-05, + "loss": 0.2897, + "step": 10088 + }, + { + "epoch": 0.2018, + "grad_norm": 0.20273077487945557, + "learning_rate": 1.9375251102584438e-05, + "loss": 0.1572, + "step": 10090 + }, + { + "epoch": 0.20184, + "grad_norm": 1.6395883560180664, + "learning_rate": 1.9374765226564126e-05, + "loss": 0.181, + "step": 10092 + }, + { + "epoch": 0.20188, + "grad_norm": 1.8609838485717773, + "learning_rate": 1.9374279167777943e-05, + "loss": 0.1861, + "step": 10094 + }, + { + "epoch": 0.20192, + "grad_norm": 2.304347276687622, + "learning_rate": 1.9373792926235358e-05, + "loss": 0.2431, + "step": 10096 + }, + { + "epoch": 0.20196, + "grad_norm": 1.349528431892395, + "learning_rate": 1.9373306501945856e-05, + "loss": 0.1602, + "step": 10098 + }, + { + "epoch": 0.202, + "grad_norm": 1.788852572441101, + "learning_rate": 1.937281989491892e-05, + "loss": 0.2902, + "step": 10100 + }, + { + "epoch": 0.20204, + "grad_norm": 1.38975191116333, + "learning_rate": 1.937233310516403e-05, + "loss": 0.1447, + "step": 10102 + }, + { + "epoch": 0.20208, + "grad_norm": 0.7627458572387695, + "learning_rate": 1.937184613269068e-05, + "loss": 0.107, + "step": 10104 + }, + { + "epoch": 0.20212, + "grad_norm": 1.0019850730895996, + "learning_rate": 1.937135897750837e-05, + "loss": 0.3104, + "step": 10106 + }, + { + "epoch": 0.20216, + "grad_norm": 2.1571309566497803, + "learning_rate": 1.9370871639626588e-05, + "loss": 0.1676, + "step": 10108 + }, + { + "epoch": 0.2022, + "grad_norm": 0.775906503200531, + "learning_rate": 1.937038411905484e-05, + "loss": 0.095, + "step": 10110 + }, + { + "epoch": 0.20224, + "grad_norm": 1.4374595880508423, + "learning_rate": 1.936989641580263e-05, + "loss": 0.1518, + "step": 10112 + }, + { + "epoch": 0.20228, + "grad_norm": 1.313847541809082, + "learning_rate": 1.9369408529879468e-05, + "loss": 0.1308, + "step": 10114 + }, + { + "epoch": 0.20232, + "grad_norm": 2.5802228450775146, + "learning_rate": 1.936892046129486e-05, + "loss": 0.3581, + "step": 10116 + }, + { + "epoch": 0.20236, + "grad_norm": 0.8241437673568726, + "learning_rate": 1.9368432210058326e-05, + "loss": 0.3487, + "step": 10118 + }, + { + "epoch": 0.2024, + "grad_norm": 2.466284990310669, + "learning_rate": 1.936794377617938e-05, + "loss": 0.2551, + "step": 10120 + }, + { + "epoch": 0.20244, + "grad_norm": 0.8223090767860413, + "learning_rate": 1.936745515966755e-05, + "loss": 0.1064, + "step": 10122 + }, + { + "epoch": 0.20248, + "grad_norm": 1.9316086769104004, + "learning_rate": 1.9366966360532357e-05, + "loss": 0.1599, + "step": 10124 + }, + { + "epoch": 0.20252, + "grad_norm": 0.969348132610321, + "learning_rate": 1.9366477378783334e-05, + "loss": 0.1687, + "step": 10126 + }, + { + "epoch": 0.20256, + "grad_norm": 0.8571294546127319, + "learning_rate": 1.936598821443001e-05, + "loss": 0.0946, + "step": 10128 + }, + { + "epoch": 0.2026, + "grad_norm": 1.5762333869934082, + "learning_rate": 1.9365498867481926e-05, + "loss": 0.1097, + "step": 10130 + }, + { + "epoch": 0.20264, + "grad_norm": 2.729341506958008, + "learning_rate": 1.9365009337948615e-05, + "loss": 0.2192, + "step": 10132 + }, + { + "epoch": 0.20268, + "grad_norm": 1.074156403541565, + "learning_rate": 1.9364519625839633e-05, + "loss": 0.2426, + "step": 10134 + }, + { + "epoch": 0.20272, + "grad_norm": 0.9860680103302002, + "learning_rate": 1.9364029731164517e-05, + "loss": 0.0662, + "step": 10136 + }, + { + "epoch": 0.20276, + "grad_norm": 3.2080459594726562, + "learning_rate": 1.936353965393282e-05, + "loss": 0.3381, + "step": 10138 + }, + { + "epoch": 0.2028, + "grad_norm": 2.068566083908081, + "learning_rate": 1.9363049394154095e-05, + "loss": 0.3405, + "step": 10140 + }, + { + "epoch": 0.20284, + "grad_norm": 1.1918785572052002, + "learning_rate": 1.9362558951837904e-05, + "loss": 0.0895, + "step": 10142 + }, + { + "epoch": 0.20288, + "grad_norm": 0.2849743962287903, + "learning_rate": 1.9362068326993804e-05, + "loss": 0.2957, + "step": 10144 + }, + { + "epoch": 0.20292, + "grad_norm": 0.5715750455856323, + "learning_rate": 1.9361577519631364e-05, + "loss": 0.3077, + "step": 10146 + }, + { + "epoch": 0.20296, + "grad_norm": 0.706322431564331, + "learning_rate": 1.936108652976015e-05, + "loss": 0.335, + "step": 10148 + }, + { + "epoch": 0.203, + "grad_norm": 0.4562705159187317, + "learning_rate": 1.9360595357389735e-05, + "loss": 0.2, + "step": 10150 + }, + { + "epoch": 0.20304, + "grad_norm": 2.1435329914093018, + "learning_rate": 1.9360104002529693e-05, + "loss": 0.382, + "step": 10152 + }, + { + "epoch": 0.20308, + "grad_norm": 1.4087064266204834, + "learning_rate": 1.9359612465189607e-05, + "loss": 0.2594, + "step": 10154 + }, + { + "epoch": 0.20312, + "grad_norm": 0.8113880157470703, + "learning_rate": 1.9359120745379053e-05, + "loss": 0.1058, + "step": 10156 + }, + { + "epoch": 0.20316, + "grad_norm": 0.9185202121734619, + "learning_rate": 1.935862884310763e-05, + "loss": 0.1645, + "step": 10158 + }, + { + "epoch": 0.2032, + "grad_norm": 0.8609874248504639, + "learning_rate": 1.935813675838491e-05, + "loss": 0.263, + "step": 10160 + }, + { + "epoch": 0.20324, + "grad_norm": 0.7567583322525024, + "learning_rate": 1.93576444912205e-05, + "loss": 0.0852, + "step": 10162 + }, + { + "epoch": 0.20328, + "grad_norm": 1.9764846563339233, + "learning_rate": 1.9357152041623992e-05, + "loss": 0.2507, + "step": 10164 + }, + { + "epoch": 0.20332, + "grad_norm": 1.0988175868988037, + "learning_rate": 1.935665940960499e-05, + "loss": 0.137, + "step": 10166 + }, + { + "epoch": 0.20336, + "grad_norm": 1.8168143033981323, + "learning_rate": 1.9356166595173094e-05, + "loss": 0.2103, + "step": 10168 + }, + { + "epoch": 0.2034, + "grad_norm": 1.3708512783050537, + "learning_rate": 1.9355673598337916e-05, + "loss": 0.1464, + "step": 10170 + }, + { + "epoch": 0.20344, + "grad_norm": 1.1646409034729004, + "learning_rate": 1.9355180419109062e-05, + "loss": 0.2302, + "step": 10172 + }, + { + "epoch": 0.20348, + "grad_norm": 1.4396545886993408, + "learning_rate": 1.9354687057496146e-05, + "loss": 0.1711, + "step": 10174 + }, + { + "epoch": 0.20352, + "grad_norm": 1.7097622156143188, + "learning_rate": 1.9354193513508794e-05, + "loss": 0.2233, + "step": 10176 + }, + { + "epoch": 0.20356, + "grad_norm": 0.9789137840270996, + "learning_rate": 1.9353699787156623e-05, + "loss": 0.1061, + "step": 10178 + }, + { + "epoch": 0.2036, + "grad_norm": 1.281209945678711, + "learning_rate": 1.935320587844926e-05, + "loss": 0.2184, + "step": 10180 + }, + { + "epoch": 0.20364, + "grad_norm": 0.768380343914032, + "learning_rate": 1.935271178739633e-05, + "loss": 0.0804, + "step": 10182 + }, + { + "epoch": 0.20368, + "grad_norm": 1.0059280395507812, + "learning_rate": 1.935221751400747e-05, + "loss": 0.1746, + "step": 10184 + }, + { + "epoch": 0.20372, + "grad_norm": 0.7728565335273743, + "learning_rate": 1.935172305829232e-05, + "loss": 0.0899, + "step": 10186 + }, + { + "epoch": 0.20376, + "grad_norm": 0.6407533288002014, + "learning_rate": 1.9351228420260505e-05, + "loss": 0.1134, + "step": 10188 + }, + { + "epoch": 0.2038, + "grad_norm": 2.9580061435699463, + "learning_rate": 1.9350733599921684e-05, + "loss": 0.2467, + "step": 10190 + }, + { + "epoch": 0.20384, + "grad_norm": 0.9806170463562012, + "learning_rate": 1.9350238597285497e-05, + "loss": 0.0805, + "step": 10192 + }, + { + "epoch": 0.20388, + "grad_norm": 0.5197471976280212, + "learning_rate": 1.934974341236159e-05, + "loss": 0.092, + "step": 10194 + }, + { + "epoch": 0.20392, + "grad_norm": 1.1385143995285034, + "learning_rate": 1.9349248045159628e-05, + "loss": 0.1255, + "step": 10196 + }, + { + "epoch": 0.20396, + "grad_norm": 1.7511203289031982, + "learning_rate": 1.9348752495689256e-05, + "loss": 0.2347, + "step": 10198 + }, + { + "epoch": 0.204, + "grad_norm": 4.119521617889404, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.4782, + "step": 10200 + }, + { + "epoch": 0.20404, + "grad_norm": 2.220996856689453, + "learning_rate": 1.9347760849981955e-05, + "loss": 0.6463, + "step": 10202 + }, + { + "epoch": 0.20408, + "grad_norm": 0.3251250386238098, + "learning_rate": 1.9347264753764356e-05, + "loss": 0.0887, + "step": 10204 + }, + { + "epoch": 0.20412, + "grad_norm": 0.3617643117904663, + "learning_rate": 1.9346768475317016e-05, + "loss": 0.1951, + "step": 10206 + }, + { + "epoch": 0.20416, + "grad_norm": 0.8791534900665283, + "learning_rate": 1.9346272014649613e-05, + "loss": 0.1852, + "step": 10208 + }, + { + "epoch": 0.2042, + "grad_norm": 1.3814777135849, + "learning_rate": 1.9345775371771826e-05, + "loss": 0.1931, + "step": 10210 + }, + { + "epoch": 0.20424, + "grad_norm": 1.267171025276184, + "learning_rate": 1.9345278546693335e-05, + "loss": 0.1753, + "step": 10212 + }, + { + "epoch": 0.20428, + "grad_norm": 2.067091464996338, + "learning_rate": 1.9344781539423828e-05, + "loss": 0.2896, + "step": 10214 + }, + { + "epoch": 0.20432, + "grad_norm": 0.858596920967102, + "learning_rate": 1.9344284349972994e-05, + "loss": 0.2628, + "step": 10216 + }, + { + "epoch": 0.20436, + "grad_norm": 0.6638253927230835, + "learning_rate": 1.9343786978350526e-05, + "loss": 0.0925, + "step": 10218 + }, + { + "epoch": 0.2044, + "grad_norm": 1.4779995679855347, + "learning_rate": 1.9343289424566122e-05, + "loss": 0.306, + "step": 10220 + }, + { + "epoch": 0.20444, + "grad_norm": 1.3870775699615479, + "learning_rate": 1.9342791688629478e-05, + "loss": 0.1778, + "step": 10222 + }, + { + "epoch": 0.20448, + "grad_norm": 1.3965305089950562, + "learning_rate": 1.93422937705503e-05, + "loss": 0.1955, + "step": 10224 + }, + { + "epoch": 0.20452, + "grad_norm": 1.0809314250946045, + "learning_rate": 1.9341795670338298e-05, + "loss": 0.1151, + "step": 10226 + }, + { + "epoch": 0.20456, + "grad_norm": 0.5155912637710571, + "learning_rate": 1.934129738800318e-05, + "loss": 0.08, + "step": 10228 + }, + { + "epoch": 0.2046, + "grad_norm": 2.5376956462860107, + "learning_rate": 1.9340798923554657e-05, + "loss": 0.4696, + "step": 10230 + }, + { + "epoch": 0.20464, + "grad_norm": 0.2896396815776825, + "learning_rate": 1.9340300277002452e-05, + "loss": 0.0645, + "step": 10232 + }, + { + "epoch": 0.20468, + "grad_norm": 0.8178725242614746, + "learning_rate": 1.9339801448356284e-05, + "loss": 0.1632, + "step": 10234 + }, + { + "epoch": 0.20472, + "grad_norm": 0.9383310079574585, + "learning_rate": 1.9339302437625876e-05, + "loss": 0.1917, + "step": 10236 + }, + { + "epoch": 0.20476, + "grad_norm": 0.39445987343788147, + "learning_rate": 1.9338803244820962e-05, + "loss": 0.182, + "step": 10238 + }, + { + "epoch": 0.2048, + "grad_norm": 1.7369571924209595, + "learning_rate": 1.933830386995127e-05, + "loss": 0.2541, + "step": 10240 + }, + { + "epoch": 0.20484, + "grad_norm": 0.39552417397499084, + "learning_rate": 1.9337804313026535e-05, + "loss": 0.1156, + "step": 10242 + }, + { + "epoch": 0.20488, + "grad_norm": 0.6553232073783875, + "learning_rate": 1.9337304574056498e-05, + "loss": 0.0954, + "step": 10244 + }, + { + "epoch": 0.20492, + "grad_norm": 1.9704777002334595, + "learning_rate": 1.93368046530509e-05, + "loss": 0.3129, + "step": 10246 + }, + { + "epoch": 0.20496, + "grad_norm": 0.6030738949775696, + "learning_rate": 1.9336304550019493e-05, + "loss": 0.262, + "step": 10248 + }, + { + "epoch": 0.205, + "grad_norm": 0.8764387369155884, + "learning_rate": 1.9335804264972018e-05, + "loss": 0.1797, + "step": 10250 + }, + { + "epoch": 0.20504, + "grad_norm": 1.407556176185608, + "learning_rate": 1.9335303797918236e-05, + "loss": 0.1303, + "step": 10252 + }, + { + "epoch": 0.20508, + "grad_norm": 1.4497122764587402, + "learning_rate": 1.9334803148867895e-05, + "loss": 0.2144, + "step": 10254 + }, + { + "epoch": 0.20512, + "grad_norm": 2.3378167152404785, + "learning_rate": 1.9334302317830764e-05, + "loss": 0.2554, + "step": 10256 + }, + { + "epoch": 0.20516, + "grad_norm": 0.6275981068611145, + "learning_rate": 1.9333801304816604e-05, + "loss": 0.0459, + "step": 10258 + }, + { + "epoch": 0.2052, + "grad_norm": 2.4564173221588135, + "learning_rate": 1.9333300109835182e-05, + "loss": 0.2101, + "step": 10260 + }, + { + "epoch": 0.20524, + "grad_norm": 3.3874480724334717, + "learning_rate": 1.933279873289627e-05, + "loss": 0.4112, + "step": 10262 + }, + { + "epoch": 0.20528, + "grad_norm": 0.9086415767669678, + "learning_rate": 1.933229717400964e-05, + "loss": 0.1114, + "step": 10264 + }, + { + "epoch": 0.20532, + "grad_norm": 1.2836244106292725, + "learning_rate": 1.9331795433185073e-05, + "loss": 0.1841, + "step": 10266 + }, + { + "epoch": 0.20536, + "grad_norm": 1.005722999572754, + "learning_rate": 1.933129351043235e-05, + "loss": 0.149, + "step": 10268 + }, + { + "epoch": 0.2054, + "grad_norm": 0.5522638559341431, + "learning_rate": 1.9330791405761254e-05, + "loss": 0.0513, + "step": 10270 + }, + { + "epoch": 0.20544, + "grad_norm": 1.7291734218597412, + "learning_rate": 1.9330289119181578e-05, + "loss": 0.1668, + "step": 10272 + }, + { + "epoch": 0.20548, + "grad_norm": 1.029163122177124, + "learning_rate": 1.932978665070311e-05, + "loss": 0.0608, + "step": 10274 + }, + { + "epoch": 0.20552, + "grad_norm": 2.7749674320220947, + "learning_rate": 1.932928400033565e-05, + "loss": 0.3713, + "step": 10276 + }, + { + "epoch": 0.20556, + "grad_norm": 3.158276081085205, + "learning_rate": 1.9328781168088994e-05, + "loss": 0.439, + "step": 10278 + }, + { + "epoch": 0.2056, + "grad_norm": 2.3496053218841553, + "learning_rate": 1.9328278153972947e-05, + "loss": 0.2639, + "step": 10280 + }, + { + "epoch": 0.20564, + "grad_norm": 2.5807418823242188, + "learning_rate": 1.9327774957997313e-05, + "loss": 0.3392, + "step": 10282 + }, + { + "epoch": 0.20568, + "grad_norm": 0.7413254380226135, + "learning_rate": 1.9327271580171908e-05, + "loss": 0.205, + "step": 10284 + }, + { + "epoch": 0.20572, + "grad_norm": 2.877868175506592, + "learning_rate": 1.9326768020506537e-05, + "loss": 0.4608, + "step": 10286 + }, + { + "epoch": 0.20576, + "grad_norm": 4.074870586395264, + "learning_rate": 1.9326264279011026e-05, + "loss": 0.269, + "step": 10288 + }, + { + "epoch": 0.2058, + "grad_norm": 2.1678338050842285, + "learning_rate": 1.932576035569519e-05, + "loss": 0.248, + "step": 10290 + }, + { + "epoch": 0.20584, + "grad_norm": 2.1349003314971924, + "learning_rate": 1.9325256250568852e-05, + "loss": 0.2374, + "step": 10292 + }, + { + "epoch": 0.20588, + "grad_norm": 1.1112250089645386, + "learning_rate": 1.9324751963641843e-05, + "loss": 0.1672, + "step": 10294 + }, + { + "epoch": 0.20592, + "grad_norm": 1.0069750547409058, + "learning_rate": 1.9324247494923996e-05, + "loss": 0.1194, + "step": 10296 + }, + { + "epoch": 0.20596, + "grad_norm": 1.4473931789398193, + "learning_rate": 1.9323742844425144e-05, + "loss": 0.184, + "step": 10298 + }, + { + "epoch": 0.206, + "grad_norm": 1.2297286987304688, + "learning_rate": 1.9323238012155125e-05, + "loss": 0.1983, + "step": 10300 + }, + { + "epoch": 0.20604, + "grad_norm": 1.4215984344482422, + "learning_rate": 1.932273299812378e-05, + "loss": 0.2344, + "step": 10302 + }, + { + "epoch": 0.20608, + "grad_norm": 1.559707522392273, + "learning_rate": 1.9322227802340955e-05, + "loss": 0.1759, + "step": 10304 + }, + { + "epoch": 0.20612, + "grad_norm": 0.7919709086418152, + "learning_rate": 1.93217224248165e-05, + "loss": 0.1741, + "step": 10306 + }, + { + "epoch": 0.20616, + "grad_norm": 1.318913459777832, + "learning_rate": 1.9321216865560266e-05, + "loss": 0.1445, + "step": 10308 + }, + { + "epoch": 0.2062, + "grad_norm": 1.8741525411605835, + "learning_rate": 1.932071112458211e-05, + "loss": 0.2646, + "step": 10310 + }, + { + "epoch": 0.20624, + "grad_norm": 1.055475115776062, + "learning_rate": 1.9320205201891894e-05, + "loss": 0.1963, + "step": 10312 + }, + { + "epoch": 0.20628, + "grad_norm": 3.435612678527832, + "learning_rate": 1.931969909749948e-05, + "loss": 0.3658, + "step": 10314 + }, + { + "epoch": 0.20632, + "grad_norm": 0.5344172120094299, + "learning_rate": 1.9319192811414732e-05, + "loss": 0.0621, + "step": 10316 + }, + { + "epoch": 0.20636, + "grad_norm": 0.989334225654602, + "learning_rate": 1.931868634364752e-05, + "loss": 0.2025, + "step": 10318 + }, + { + "epoch": 0.2064, + "grad_norm": 1.3826124668121338, + "learning_rate": 1.9318179694207726e-05, + "loss": 0.1434, + "step": 10320 + }, + { + "epoch": 0.20644, + "grad_norm": 1.3621537685394287, + "learning_rate": 1.9317672863105218e-05, + "loss": 0.1547, + "step": 10322 + }, + { + "epoch": 0.20648, + "grad_norm": 0.9407194256782532, + "learning_rate": 1.931716585034988e-05, + "loss": 0.1791, + "step": 10324 + }, + { + "epoch": 0.20652, + "grad_norm": 1.4598668813705444, + "learning_rate": 1.93166586559516e-05, + "loss": 0.1118, + "step": 10326 + }, + { + "epoch": 0.20656, + "grad_norm": 0.432781845331192, + "learning_rate": 1.931615127992026e-05, + "loss": 0.0737, + "step": 10328 + }, + { + "epoch": 0.2066, + "grad_norm": 1.0300897359848022, + "learning_rate": 1.931564372226576e-05, + "loss": 0.2423, + "step": 10330 + }, + { + "epoch": 0.20664, + "grad_norm": 0.4265727400779724, + "learning_rate": 1.931513598299798e-05, + "loss": 0.0307, + "step": 10332 + }, + { + "epoch": 0.20668, + "grad_norm": 0.4253847301006317, + "learning_rate": 1.9314628062126837e-05, + "loss": 0.3716, + "step": 10334 + }, + { + "epoch": 0.20672, + "grad_norm": 1.2877386808395386, + "learning_rate": 1.9314119959662218e-05, + "loss": 0.2928, + "step": 10336 + }, + { + "epoch": 0.20676, + "grad_norm": 1.3075587749481201, + "learning_rate": 1.931361167561404e-05, + "loss": 0.0853, + "step": 10338 + }, + { + "epoch": 0.2068, + "grad_norm": 0.25825726985931396, + "learning_rate": 1.9313103209992205e-05, + "loss": 0.0358, + "step": 10340 + }, + { + "epoch": 0.20684, + "grad_norm": 2.4918928146362305, + "learning_rate": 1.931259456280663e-05, + "loss": 0.2294, + "step": 10342 + }, + { + "epoch": 0.20688, + "grad_norm": 0.1792098581790924, + "learning_rate": 1.9312085734067226e-05, + "loss": 0.0432, + "step": 10344 + }, + { + "epoch": 0.20692, + "grad_norm": 2.5724995136260986, + "learning_rate": 1.931157672378392e-05, + "loss": 0.1582, + "step": 10346 + }, + { + "epoch": 0.20696, + "grad_norm": 0.22603408992290497, + "learning_rate": 1.931106753196663e-05, + "loss": 0.26, + "step": 10348 + }, + { + "epoch": 0.207, + "grad_norm": 2.7331535816192627, + "learning_rate": 1.9310558158625286e-05, + "loss": 0.14, + "step": 10350 + }, + { + "epoch": 0.20704, + "grad_norm": 0.6921839714050293, + "learning_rate": 1.9310048603769816e-05, + "loss": 0.1173, + "step": 10352 + }, + { + "epoch": 0.20708, + "grad_norm": 1.1185609102249146, + "learning_rate": 1.9309538867410157e-05, + "loss": 0.2568, + "step": 10354 + }, + { + "epoch": 0.20712, + "grad_norm": 0.8974424600601196, + "learning_rate": 1.930902894955624e-05, + "loss": 0.1879, + "step": 10356 + }, + { + "epoch": 0.20716, + "grad_norm": 3.155445098876953, + "learning_rate": 1.9308518850218017e-05, + "loss": 0.326, + "step": 10358 + }, + { + "epoch": 0.2072, + "grad_norm": 3.0944418907165527, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.4018, + "step": 10360 + }, + { + "epoch": 0.20724, + "grad_norm": 3.5381581783294678, + "learning_rate": 1.9307498107128413e-05, + "loss": 0.2234, + "step": 10362 + }, + { + "epoch": 0.20728, + "grad_norm": 0.9841073751449585, + "learning_rate": 1.9306987463396934e-05, + "loss": 0.2028, + "step": 10364 + }, + { + "epoch": 0.20732, + "grad_norm": 0.7909170389175415, + "learning_rate": 1.9306476638220945e-05, + "loss": 0.0614, + "step": 10366 + }, + { + "epoch": 0.20736, + "grad_norm": 1.3577914237976074, + "learning_rate": 1.9305965631610397e-05, + "loss": 0.1366, + "step": 10368 + }, + { + "epoch": 0.2074, + "grad_norm": 1.6498678922653198, + "learning_rate": 1.930545444357526e-05, + "loss": 0.2483, + "step": 10370 + }, + { + "epoch": 0.20744, + "grad_norm": 2.3928799629211426, + "learning_rate": 1.9304943074125503e-05, + "loss": 0.2346, + "step": 10372 + }, + { + "epoch": 0.20748, + "grad_norm": 1.690988302230835, + "learning_rate": 1.9304431523271082e-05, + "loss": 0.2469, + "step": 10374 + }, + { + "epoch": 0.20752, + "grad_norm": 0.40763935446739197, + "learning_rate": 1.9303919791021984e-05, + "loss": 0.074, + "step": 10376 + }, + { + "epoch": 0.20756, + "grad_norm": 0.26057732105255127, + "learning_rate": 1.930340787738818e-05, + "loss": 0.3283, + "step": 10378 + }, + { + "epoch": 0.2076, + "grad_norm": 2.941128969192505, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.3982, + "step": 10380 + }, + { + "epoch": 0.20764, + "grad_norm": 4.452024936676025, + "learning_rate": 1.9302383506006373e-05, + "loss": 0.4242, + "step": 10382 + }, + { + "epoch": 0.20768, + "grad_norm": 1.7629976272583008, + "learning_rate": 1.9301871048278343e-05, + "loss": 0.2117, + "step": 10384 + }, + { + "epoch": 0.20772, + "grad_norm": 0.9258195161819458, + "learning_rate": 1.9301358409205547e-05, + "loss": 0.0827, + "step": 10386 + }, + { + "epoch": 0.20776, + "grad_norm": 0.9625275135040283, + "learning_rate": 1.9300845588797982e-05, + "loss": 0.1119, + "step": 10388 + }, + { + "epoch": 0.2078, + "grad_norm": 2.6939120292663574, + "learning_rate": 1.9300332587065644e-05, + "loss": 0.231, + "step": 10390 + }, + { + "epoch": 0.20784, + "grad_norm": 3.4903485774993896, + "learning_rate": 1.929981940401853e-05, + "loss": 0.6777, + "step": 10392 + }, + { + "epoch": 0.20788, + "grad_norm": 0.21371302008628845, + "learning_rate": 1.9299306039666654e-05, + "loss": 0.1058, + "step": 10394 + }, + { + "epoch": 0.20792, + "grad_norm": 1.4160187244415283, + "learning_rate": 1.9298792494020017e-05, + "loss": 0.146, + "step": 10396 + }, + { + "epoch": 0.20796, + "grad_norm": 1.5434706211090088, + "learning_rate": 1.929827876708863e-05, + "loss": 0.1613, + "step": 10398 + }, + { + "epoch": 0.208, + "grad_norm": 2.0878474712371826, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.2649, + "step": 10400 + }, + { + "epoch": 0.20804, + "grad_norm": 0.9227767586708069, + "learning_rate": 1.9297250769411687e-05, + "loss": 0.1234, + "step": 10402 + }, + { + "epoch": 0.20808, + "grad_norm": 0.6190481185913086, + "learning_rate": 1.9296736498686168e-05, + "loss": 0.1761, + "step": 10404 + }, + { + "epoch": 0.20812, + "grad_norm": 1.1265596151351929, + "learning_rate": 1.9296222046715988e-05, + "loss": 0.0887, + "step": 10406 + }, + { + "epoch": 0.20816, + "grad_norm": 1.5183377265930176, + "learning_rate": 1.9295707413511166e-05, + "loss": 0.1589, + "step": 10408 + }, + { + "epoch": 0.2082, + "grad_norm": 1.1050010919570923, + "learning_rate": 1.9295192599081747e-05, + "loss": 0.1707, + "step": 10410 + }, + { + "epoch": 0.20824, + "grad_norm": 2.180241107940674, + "learning_rate": 1.9294677603437765e-05, + "loss": 0.269, + "step": 10412 + }, + { + "epoch": 0.20828, + "grad_norm": 1.9047024250030518, + "learning_rate": 1.9294162426589253e-05, + "loss": 0.2033, + "step": 10414 + }, + { + "epoch": 0.20832, + "grad_norm": 0.5688599348068237, + "learning_rate": 1.9293647068546263e-05, + "loss": 0.1608, + "step": 10416 + }, + { + "epoch": 0.20836, + "grad_norm": 2.258572578430176, + "learning_rate": 1.9293131529318835e-05, + "loss": 0.2637, + "step": 10418 + }, + { + "epoch": 0.2084, + "grad_norm": 1.2414811849594116, + "learning_rate": 1.9292615808917027e-05, + "loss": 0.1217, + "step": 10420 + }, + { + "epoch": 0.20844, + "grad_norm": 1.2163736820220947, + "learning_rate": 1.9292099907350887e-05, + "loss": 0.1594, + "step": 10422 + }, + { + "epoch": 0.20848, + "grad_norm": 1.0890034437179565, + "learning_rate": 1.9291583824630478e-05, + "loss": 0.1177, + "step": 10424 + }, + { + "epoch": 0.20852, + "grad_norm": 0.18647260963916779, + "learning_rate": 1.9291067560765856e-05, + "loss": 0.0731, + "step": 10426 + }, + { + "epoch": 0.20856, + "grad_norm": 2.225029945373535, + "learning_rate": 1.929055111576709e-05, + "loss": 0.2061, + "step": 10428 + }, + { + "epoch": 0.2086, + "grad_norm": 1.8749316930770874, + "learning_rate": 1.9290034489644247e-05, + "loss": 0.5489, + "step": 10430 + }, + { + "epoch": 0.20864, + "grad_norm": 0.3968360722064972, + "learning_rate": 1.9289517682407397e-05, + "loss": 0.0635, + "step": 10432 + }, + { + "epoch": 0.20868, + "grad_norm": 1.0581965446472168, + "learning_rate": 1.928900069406662e-05, + "loss": 0.1368, + "step": 10434 + }, + { + "epoch": 0.20872, + "grad_norm": 2.360521078109741, + "learning_rate": 1.928848352463199e-05, + "loss": 0.2026, + "step": 10436 + }, + { + "epoch": 0.20876, + "grad_norm": 1.7642000913619995, + "learning_rate": 1.928796617411359e-05, + "loss": 0.1523, + "step": 10438 + }, + { + "epoch": 0.2088, + "grad_norm": 1.0278385877609253, + "learning_rate": 1.9287448642521513e-05, + "loss": 0.1071, + "step": 10440 + }, + { + "epoch": 0.20884, + "grad_norm": 2.921374797821045, + "learning_rate": 1.928693092986584e-05, + "loss": 0.3141, + "step": 10442 + }, + { + "epoch": 0.20888, + "grad_norm": 0.5568587183952332, + "learning_rate": 1.928641303615667e-05, + "loss": 0.0431, + "step": 10444 + }, + { + "epoch": 0.20892, + "grad_norm": 0.7253045439720154, + "learning_rate": 1.9285894961404094e-05, + "loss": 0.1459, + "step": 10446 + }, + { + "epoch": 0.20896, + "grad_norm": 2.989478588104248, + "learning_rate": 1.9285376705618216e-05, + "loss": 0.242, + "step": 10448 + }, + { + "epoch": 0.209, + "grad_norm": 0.37036439776420593, + "learning_rate": 1.9284858268809135e-05, + "loss": 0.0324, + "step": 10450 + }, + { + "epoch": 0.20904, + "grad_norm": 1.2193658351898193, + "learning_rate": 1.9284339650986966e-05, + "loss": 0.0942, + "step": 10452 + }, + { + "epoch": 0.20908, + "grad_norm": 2.870316505432129, + "learning_rate": 1.9283820852161818e-05, + "loss": 0.241, + "step": 10454 + }, + { + "epoch": 0.20912, + "grad_norm": 2.167454957962036, + "learning_rate": 1.9283301872343798e-05, + "loss": 0.1434, + "step": 10456 + }, + { + "epoch": 0.20916, + "grad_norm": 2.9772651195526123, + "learning_rate": 1.928278271154303e-05, + "loss": 0.1846, + "step": 10458 + }, + { + "epoch": 0.2092, + "grad_norm": 2.3194355964660645, + "learning_rate": 1.9282263369769633e-05, + "loss": 0.4391, + "step": 10460 + }, + { + "epoch": 0.20924, + "grad_norm": 2.363873243331909, + "learning_rate": 1.9281743847033736e-05, + "loss": 0.1929, + "step": 10462 + }, + { + "epoch": 0.20928, + "grad_norm": 2.642082929611206, + "learning_rate": 1.928122414334546e-05, + "loss": 0.424, + "step": 10464 + }, + { + "epoch": 0.20932, + "grad_norm": 2.5367813110351562, + "learning_rate": 1.9280704258714946e-05, + "loss": 0.2539, + "step": 10466 + }, + { + "epoch": 0.20936, + "grad_norm": 1.23013436794281, + "learning_rate": 1.928018419315232e-05, + "loss": 0.2278, + "step": 10468 + }, + { + "epoch": 0.2094, + "grad_norm": 2.26816987991333, + "learning_rate": 1.927966394666773e-05, + "loss": 0.1911, + "step": 10470 + }, + { + "epoch": 0.20944, + "grad_norm": 1.7476046085357666, + "learning_rate": 1.927914351927131e-05, + "loss": 0.471, + "step": 10472 + }, + { + "epoch": 0.20948, + "grad_norm": 2.1570608615875244, + "learning_rate": 1.9278622910973207e-05, + "loss": 0.4, + "step": 10474 + }, + { + "epoch": 0.20952, + "grad_norm": 0.7746837735176086, + "learning_rate": 1.9278102121783578e-05, + "loss": 0.128, + "step": 10476 + }, + { + "epoch": 0.20956, + "grad_norm": 0.9495717883110046, + "learning_rate": 1.927758115171257e-05, + "loss": 0.0951, + "step": 10478 + }, + { + "epoch": 0.2096, + "grad_norm": 2.0366899967193604, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.1525, + "step": 10480 + }, + { + "epoch": 0.20964, + "grad_norm": 1.3854572772979736, + "learning_rate": 1.9276538668967057e-05, + "loss": 0.215, + "step": 10482 + }, + { + "epoch": 0.20968, + "grad_norm": 2.1243538856506348, + "learning_rate": 1.927601715631287e-05, + "loss": 0.1969, + "step": 10484 + }, + { + "epoch": 0.20972, + "grad_norm": 1.379431962966919, + "learning_rate": 1.927549546281795e-05, + "loss": 0.2592, + "step": 10486 + }, + { + "epoch": 0.20976, + "grad_norm": 0.6966372728347778, + "learning_rate": 1.9274973588492475e-05, + "loss": 0.1723, + "step": 10488 + }, + { + "epoch": 0.2098, + "grad_norm": 1.8415268659591675, + "learning_rate": 1.9274451533346617e-05, + "loss": 0.328, + "step": 10490 + }, + { + "epoch": 0.20984, + "grad_norm": 0.8656148314476013, + "learning_rate": 1.9273929297390545e-05, + "loss": 0.058, + "step": 10492 + }, + { + "epoch": 0.20988, + "grad_norm": 0.8220925331115723, + "learning_rate": 1.927340688063445e-05, + "loss": 0.0768, + "step": 10494 + }, + { + "epoch": 0.20992, + "grad_norm": 3.270416736602783, + "learning_rate": 1.9272884283088517e-05, + "loss": 0.4342, + "step": 10496 + }, + { + "epoch": 0.20996, + "grad_norm": 2.232227325439453, + "learning_rate": 1.9272361504762927e-05, + "loss": 0.4773, + "step": 10498 + }, + { + "epoch": 0.21, + "grad_norm": 1.5407708883285522, + "learning_rate": 1.9271838545667876e-05, + "loss": 0.1381, + "step": 10500 + }, + { + "epoch": 0.21004, + "grad_norm": 0.8652671575546265, + "learning_rate": 1.927131540581356e-05, + "loss": 0.1075, + "step": 10502 + }, + { + "epoch": 0.21008, + "grad_norm": 1.5349922180175781, + "learning_rate": 1.9270792085210176e-05, + "loss": 0.1355, + "step": 10504 + }, + { + "epoch": 0.21012, + "grad_norm": 1.4216731786727905, + "learning_rate": 1.927026858386793e-05, + "loss": 0.2036, + "step": 10506 + }, + { + "epoch": 0.21016, + "grad_norm": 1.1208696365356445, + "learning_rate": 1.9269744901797022e-05, + "loss": 0.0891, + "step": 10508 + }, + { + "epoch": 0.2102, + "grad_norm": 0.6296223998069763, + "learning_rate": 1.9269221039007666e-05, + "loss": 0.0746, + "step": 10510 + }, + { + "epoch": 0.21024, + "grad_norm": 0.23637166619300842, + "learning_rate": 1.926869699551007e-05, + "loss": 0.1467, + "step": 10512 + }, + { + "epoch": 0.21028, + "grad_norm": 0.3614215850830078, + "learning_rate": 1.9268172771314463e-05, + "loss": 0.0306, + "step": 10514 + }, + { + "epoch": 0.21032, + "grad_norm": 0.7313590049743652, + "learning_rate": 1.926764836643105e-05, + "loss": 0.103, + "step": 10516 + }, + { + "epoch": 0.21036, + "grad_norm": 2.2974534034729004, + "learning_rate": 1.9267123780870062e-05, + "loss": 0.1574, + "step": 10518 + }, + { + "epoch": 0.2104, + "grad_norm": 2.5854525566101074, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.2409, + "step": 10520 + }, + { + "epoch": 0.21044, + "grad_norm": 3.136023998260498, + "learning_rate": 1.926607406775627e-05, + "loss": 0.2618, + "step": 10522 + }, + { + "epoch": 0.21048, + "grad_norm": 3.7132437229156494, + "learning_rate": 1.9265548940223927e-05, + "loss": 0.4868, + "step": 10524 + }, + { + "epoch": 0.21052, + "grad_norm": 1.7840527296066284, + "learning_rate": 1.926502363205494e-05, + "loss": 0.1461, + "step": 10526 + }, + { + "epoch": 0.21056, + "grad_norm": 1.2917466163635254, + "learning_rate": 1.9264498143259546e-05, + "loss": 0.0794, + "step": 10528 + }, + { + "epoch": 0.2106, + "grad_norm": 0.5549808144569397, + "learning_rate": 1.9263972473847995e-05, + "loss": 0.1506, + "step": 10530 + }, + { + "epoch": 0.21064, + "grad_norm": 1.3065763711929321, + "learning_rate": 1.9263446623830528e-05, + "loss": 0.1512, + "step": 10532 + }, + { + "epoch": 0.21068, + "grad_norm": 0.5769954323768616, + "learning_rate": 1.92629205932174e-05, + "loss": 0.0519, + "step": 10534 + }, + { + "epoch": 0.21072, + "grad_norm": 0.9007019400596619, + "learning_rate": 1.9262394382018867e-05, + "loss": 0.2568, + "step": 10536 + }, + { + "epoch": 0.21076, + "grad_norm": 1.2031068801879883, + "learning_rate": 1.9261867990245188e-05, + "loss": 0.0643, + "step": 10538 + }, + { + "epoch": 0.2108, + "grad_norm": 0.4326165020465851, + "learning_rate": 1.9261341417906622e-05, + "loss": 0.1035, + "step": 10540 + }, + { + "epoch": 0.21084, + "grad_norm": 0.428074449300766, + "learning_rate": 1.9260814665013436e-05, + "loss": 0.0332, + "step": 10542 + }, + { + "epoch": 0.21088, + "grad_norm": 3.6364059448242188, + "learning_rate": 1.9260287731575902e-05, + "loss": 0.2933, + "step": 10544 + }, + { + "epoch": 0.21092, + "grad_norm": 0.5232213735580444, + "learning_rate": 1.9259760617604294e-05, + "loss": 0.0551, + "step": 10546 + }, + { + "epoch": 0.21096, + "grad_norm": 0.25724250078201294, + "learning_rate": 1.925923332310888e-05, + "loss": 0.0604, + "step": 10548 + }, + { + "epoch": 0.211, + "grad_norm": 1.5369004011154175, + "learning_rate": 1.925870584809995e-05, + "loss": 0.2153, + "step": 10550 + }, + { + "epoch": 0.21104, + "grad_norm": 0.7176931500434875, + "learning_rate": 1.925817819258778e-05, + "loss": 0.0453, + "step": 10552 + }, + { + "epoch": 0.21108, + "grad_norm": 0.3055902421474457, + "learning_rate": 1.925765035658266e-05, + "loss": 0.0876, + "step": 10554 + }, + { + "epoch": 0.21112, + "grad_norm": 0.151121124625206, + "learning_rate": 1.925712234009488e-05, + "loss": 0.0858, + "step": 10556 + }, + { + "epoch": 0.21116, + "grad_norm": 0.17731066048145294, + "learning_rate": 1.9256594143134736e-05, + "loss": 0.1106, + "step": 10558 + }, + { + "epoch": 0.2112, + "grad_norm": 1.532719373703003, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.0658, + "step": 10560 + }, + { + "epoch": 0.21124, + "grad_norm": 0.3410336673259735, + "learning_rate": 1.925553720783854e-05, + "loss": 0.0322, + "step": 10562 + }, + { + "epoch": 0.21128, + "grad_norm": 0.13409867882728577, + "learning_rate": 1.92550084695231e-05, + "loss": 0.1182, + "step": 10564 + }, + { + "epoch": 0.21132, + "grad_norm": 3.866100788116455, + "learning_rate": 1.92544795507765e-05, + "loss": 0.1296, + "step": 10566 + }, + { + "epoch": 0.21136, + "grad_norm": 12.639220237731934, + "learning_rate": 1.925395045160906e-05, + "loss": 0.9184, + "step": 10568 + }, + { + "epoch": 0.2114, + "grad_norm": 3.5030009746551514, + "learning_rate": 1.9253421172031086e-05, + "loss": 0.1808, + "step": 10570 + }, + { + "epoch": 0.21144, + "grad_norm": 3.658724784851074, + "learning_rate": 1.9252891712052906e-05, + "loss": 0.2191, + "step": 10572 + }, + { + "epoch": 0.21148, + "grad_norm": 2.6956074237823486, + "learning_rate": 1.9252362071684837e-05, + "loss": 0.6266, + "step": 10574 + }, + { + "epoch": 0.21152, + "grad_norm": 4.569906234741211, + "learning_rate": 1.925183225093721e-05, + "loss": 0.3484, + "step": 10576 + }, + { + "epoch": 0.21156, + "grad_norm": 2.615494728088379, + "learning_rate": 1.9251302249820345e-05, + "loss": 0.1207, + "step": 10578 + }, + { + "epoch": 0.2116, + "grad_norm": 0.23022429645061493, + "learning_rate": 1.925077206834458e-05, + "loss": 0.0668, + "step": 10580 + }, + { + "epoch": 0.21164, + "grad_norm": 0.810022234916687, + "learning_rate": 1.9250241706520257e-05, + "loss": 0.3839, + "step": 10582 + }, + { + "epoch": 0.21168, + "grad_norm": 0.7114995718002319, + "learning_rate": 1.9249711164357704e-05, + "loss": 0.253, + "step": 10584 + }, + { + "epoch": 0.21172, + "grad_norm": 1.0393822193145752, + "learning_rate": 1.924918044186727e-05, + "loss": 0.1853, + "step": 10586 + }, + { + "epoch": 0.21176, + "grad_norm": 4.8873114585876465, + "learning_rate": 1.9248649539059304e-05, + "loss": 0.401, + "step": 10588 + }, + { + "epoch": 0.2118, + "grad_norm": 0.944446861743927, + "learning_rate": 1.9248118455944153e-05, + "loss": 0.2062, + "step": 10590 + }, + { + "epoch": 0.21184, + "grad_norm": 1.6980266571044922, + "learning_rate": 1.9247587192532167e-05, + "loss": 0.1524, + "step": 10592 + }, + { + "epoch": 0.21188, + "grad_norm": 0.8342300057411194, + "learning_rate": 1.9247055748833715e-05, + "loss": 0.0765, + "step": 10594 + }, + { + "epoch": 0.21192, + "grad_norm": 1.8837534189224243, + "learning_rate": 1.9246524124859147e-05, + "loss": 0.1984, + "step": 10596 + }, + { + "epoch": 0.21196, + "grad_norm": 0.3903694748878479, + "learning_rate": 1.924599232061883e-05, + "loss": 0.0548, + "step": 10598 + }, + { + "epoch": 0.212, + "grad_norm": 1.3861278295516968, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.0826, + "step": 10600 + }, + { + "epoch": 0.21204, + "grad_norm": 4.3758864402771, + "learning_rate": 1.924492817138243e-05, + "loss": 0.2627, + "step": 10602 + }, + { + "epoch": 0.21208, + "grad_norm": 3.571593761444092, + "learning_rate": 1.9244395826407086e-05, + "loss": 0.484, + "step": 10604 + }, + { + "epoch": 0.21212, + "grad_norm": 0.512595534324646, + "learning_rate": 1.9243863301207494e-05, + "loss": 0.0543, + "step": 10606 + }, + { + "epoch": 0.21216, + "grad_norm": 3.615351915359497, + "learning_rate": 1.924333059579402e-05, + "loss": 0.6686, + "step": 10608 + }, + { + "epoch": 0.2122, + "grad_norm": 2.840885877609253, + "learning_rate": 1.924279771017706e-05, + "loss": 0.2842, + "step": 10610 + }, + { + "epoch": 0.21224, + "grad_norm": 1.8657896518707275, + "learning_rate": 1.9242264644367005e-05, + "loss": 0.1744, + "step": 10612 + }, + { + "epoch": 0.21228, + "grad_norm": 0.9768684506416321, + "learning_rate": 1.9241731398374236e-05, + "loss": 0.203, + "step": 10614 + }, + { + "epoch": 0.21232, + "grad_norm": 1.571258306503296, + "learning_rate": 1.9241197972209157e-05, + "loss": 0.1325, + "step": 10616 + }, + { + "epoch": 0.21236, + "grad_norm": 1.106998324394226, + "learning_rate": 1.924066436588217e-05, + "loss": 0.1003, + "step": 10618 + }, + { + "epoch": 0.2124, + "grad_norm": 1.925621747970581, + "learning_rate": 1.924013057940367e-05, + "loss": 0.4005, + "step": 10620 + }, + { + "epoch": 0.21244, + "grad_norm": 0.6672461032867432, + "learning_rate": 1.923959661278407e-05, + "loss": 0.2538, + "step": 10622 + }, + { + "epoch": 0.21248, + "grad_norm": 1.5882905721664429, + "learning_rate": 1.923906246603377e-05, + "loss": 0.1959, + "step": 10624 + }, + { + "epoch": 0.21252, + "grad_norm": 2.451695442199707, + "learning_rate": 1.92385281391632e-05, + "loss": 0.2595, + "step": 10626 + }, + { + "epoch": 0.21256, + "grad_norm": 2.3029632568359375, + "learning_rate": 1.9237993632182764e-05, + "loss": 0.2868, + "step": 10628 + }, + { + "epoch": 0.2126, + "grad_norm": 1.2459906339645386, + "learning_rate": 1.923745894510288e-05, + "loss": 0.1517, + "step": 10630 + }, + { + "epoch": 0.21264, + "grad_norm": 1.2329891920089722, + "learning_rate": 1.9236924077933988e-05, + "loss": 0.1372, + "step": 10632 + }, + { + "epoch": 0.21268, + "grad_norm": 2.193042516708374, + "learning_rate": 1.92363890306865e-05, + "loss": 0.1759, + "step": 10634 + }, + { + "epoch": 0.21272, + "grad_norm": 0.9170559048652649, + "learning_rate": 1.9235853803370853e-05, + "loss": 0.1438, + "step": 10636 + }, + { + "epoch": 0.21276, + "grad_norm": 1.0667352676391602, + "learning_rate": 1.9235318395997482e-05, + "loss": 0.1646, + "step": 10638 + }, + { + "epoch": 0.2128, + "grad_norm": 0.24040678143501282, + "learning_rate": 1.9234782808576823e-05, + "loss": 0.1279, + "step": 10640 + }, + { + "epoch": 0.21284, + "grad_norm": 0.4912932515144348, + "learning_rate": 1.9234247041119324e-05, + "loss": 0.3201, + "step": 10642 + }, + { + "epoch": 0.21288, + "grad_norm": 2.9990742206573486, + "learning_rate": 1.923371109363542e-05, + "loss": 0.4051, + "step": 10644 + }, + { + "epoch": 0.21292, + "grad_norm": 1.0792747735977173, + "learning_rate": 1.9233174966135564e-05, + "loss": 0.1404, + "step": 10646 + }, + { + "epoch": 0.21296, + "grad_norm": 1.3657644987106323, + "learning_rate": 1.923263865863021e-05, + "loss": 0.1132, + "step": 10648 + }, + { + "epoch": 0.213, + "grad_norm": 2.2139289379119873, + "learning_rate": 1.923210217112981e-05, + "loss": 0.181, + "step": 10650 + }, + { + "epoch": 0.21304, + "grad_norm": 1.5417554378509521, + "learning_rate": 1.9231565503644826e-05, + "loss": 0.2465, + "step": 10652 + }, + { + "epoch": 0.21308, + "grad_norm": 2.753817558288574, + "learning_rate": 1.923102865618572e-05, + "loss": 0.2153, + "step": 10654 + }, + { + "epoch": 0.21312, + "grad_norm": 1.278849482536316, + "learning_rate": 1.923049162876296e-05, + "loss": 0.1302, + "step": 10656 + }, + { + "epoch": 0.21316, + "grad_norm": 2.591496467590332, + "learning_rate": 1.922995442138701e-05, + "loss": 0.262, + "step": 10658 + }, + { + "epoch": 0.2132, + "grad_norm": 0.8900525569915771, + "learning_rate": 1.9229417034068352e-05, + "loss": 0.1236, + "step": 10660 + }, + { + "epoch": 0.21324, + "grad_norm": 1.3069192171096802, + "learning_rate": 1.9228879466817453e-05, + "loss": 0.1269, + "step": 10662 + }, + { + "epoch": 0.21328, + "grad_norm": 1.0081210136413574, + "learning_rate": 1.92283417196448e-05, + "loss": 0.188, + "step": 10664 + }, + { + "epoch": 0.21332, + "grad_norm": 1.320521593093872, + "learning_rate": 1.9227803792560872e-05, + "loss": 0.1058, + "step": 10666 + }, + { + "epoch": 0.21336, + "grad_norm": 1.191002368927002, + "learning_rate": 1.9227265685576157e-05, + "loss": 0.1512, + "step": 10668 + }, + { + "epoch": 0.2134, + "grad_norm": 0.5844106078147888, + "learning_rate": 1.922672739870115e-05, + "loss": 0.1215, + "step": 10670 + }, + { + "epoch": 0.21344, + "grad_norm": 1.3029416799545288, + "learning_rate": 1.9226188931946343e-05, + "loss": 0.1174, + "step": 10672 + }, + { + "epoch": 0.21348, + "grad_norm": 0.5066297054290771, + "learning_rate": 1.922565028532223e-05, + "loss": 0.0579, + "step": 10674 + }, + { + "epoch": 0.21352, + "grad_norm": 1.0497268438339233, + "learning_rate": 1.9225111458839313e-05, + "loss": 0.1584, + "step": 10676 + }, + { + "epoch": 0.21356, + "grad_norm": 2.2253100872039795, + "learning_rate": 1.92245724525081e-05, + "loss": 0.1751, + "step": 10678 + }, + { + "epoch": 0.2136, + "grad_norm": 1.476815938949585, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.0735, + "step": 10680 + }, + { + "epoch": 0.21364, + "grad_norm": 0.6758678555488586, + "learning_rate": 1.9223493900342823e-05, + "loss": 0.3704, + "step": 10682 + }, + { + "epoch": 0.21368, + "grad_norm": 0.5585721135139465, + "learning_rate": 1.9222954354529783e-05, + "loss": 0.051, + "step": 10684 + }, + { + "epoch": 0.21372, + "grad_norm": 0.7454286813735962, + "learning_rate": 1.92224146289105e-05, + "loss": 0.0456, + "step": 10686 + }, + { + "epoch": 0.21376, + "grad_norm": 0.34728899598121643, + "learning_rate": 1.9221874723495494e-05, + "loss": 0.2516, + "step": 10688 + }, + { + "epoch": 0.2138, + "grad_norm": 3.009431838989258, + "learning_rate": 1.9221334638295296e-05, + "loss": 0.2495, + "step": 10690 + }, + { + "epoch": 0.21384, + "grad_norm": 0.1250404566526413, + "learning_rate": 1.9220794373320428e-05, + "loss": 0.1115, + "step": 10692 + }, + { + "epoch": 0.21388, + "grad_norm": 3.383537530899048, + "learning_rate": 1.9220253928581428e-05, + "loss": 0.4903, + "step": 10694 + }, + { + "epoch": 0.21392, + "grad_norm": 1.647645354270935, + "learning_rate": 1.9219713304088833e-05, + "loss": 0.1707, + "step": 10696 + }, + { + "epoch": 0.21396, + "grad_norm": 1.4429653882980347, + "learning_rate": 1.9219172499853177e-05, + "loss": 0.272, + "step": 10698 + }, + { + "epoch": 0.214, + "grad_norm": 0.1829228550195694, + "learning_rate": 1.9218631515885007e-05, + "loss": 0.3449, + "step": 10700 + }, + { + "epoch": 0.21404, + "grad_norm": 0.31773659586906433, + "learning_rate": 1.921809035219487e-05, + "loss": 0.0734, + "step": 10702 + }, + { + "epoch": 0.21408, + "grad_norm": 0.7952463030815125, + "learning_rate": 1.9217549008793318e-05, + "loss": 0.2424, + "step": 10704 + }, + { + "epoch": 0.21412, + "grad_norm": 2.772484302520752, + "learning_rate": 1.92170074856909e-05, + "loss": 0.495, + "step": 10706 + }, + { + "epoch": 0.21416, + "grad_norm": 2.559830665588379, + "learning_rate": 1.9216465782898176e-05, + "loss": 0.3569, + "step": 10708 + }, + { + "epoch": 0.2142, + "grad_norm": 0.7517545223236084, + "learning_rate": 1.921592390042571e-05, + "loss": 0.0951, + "step": 10710 + }, + { + "epoch": 0.21424, + "grad_norm": 2.9494566917419434, + "learning_rate": 1.9215381838284056e-05, + "loss": 0.4711, + "step": 10712 + }, + { + "epoch": 0.21428, + "grad_norm": 1.6041901111602783, + "learning_rate": 1.9214839596483794e-05, + "loss": 0.1587, + "step": 10714 + }, + { + "epoch": 0.21432, + "grad_norm": 0.6357068419456482, + "learning_rate": 1.921429717503549e-05, + "loss": 0.1137, + "step": 10716 + }, + { + "epoch": 0.21436, + "grad_norm": 1.111652135848999, + "learning_rate": 1.9213754573949718e-05, + "loss": 0.1709, + "step": 10718 + }, + { + "epoch": 0.2144, + "grad_norm": 1.3851964473724365, + "learning_rate": 1.9213211793237056e-05, + "loss": 0.1099, + "step": 10720 + }, + { + "epoch": 0.21444, + "grad_norm": 1.4988367557525635, + "learning_rate": 1.921266883290809e-05, + "loss": 0.1939, + "step": 10722 + }, + { + "epoch": 0.21448, + "grad_norm": 2.323118209838867, + "learning_rate": 1.9212125692973396e-05, + "loss": 0.2205, + "step": 10724 + }, + { + "epoch": 0.21452, + "grad_norm": 3.3250234127044678, + "learning_rate": 1.9211582373443574e-05, + "loss": 0.461, + "step": 10726 + }, + { + "epoch": 0.21456, + "grad_norm": 0.6756795644760132, + "learning_rate": 1.9211038874329208e-05, + "loss": 0.0555, + "step": 10728 + }, + { + "epoch": 0.2146, + "grad_norm": 1.2096335887908936, + "learning_rate": 1.9210495195640895e-05, + "loss": 0.1369, + "step": 10730 + }, + { + "epoch": 0.21464, + "grad_norm": 0.9497336745262146, + "learning_rate": 1.9209951337389242e-05, + "loss": 0.1352, + "step": 10732 + }, + { + "epoch": 0.21468, + "grad_norm": 0.4585855007171631, + "learning_rate": 1.9209407299584842e-05, + "loss": 0.1842, + "step": 10734 + }, + { + "epoch": 0.21472, + "grad_norm": 1.4882261753082275, + "learning_rate": 1.920886308223831e-05, + "loss": 0.2778, + "step": 10736 + }, + { + "epoch": 0.21476, + "grad_norm": 2.612277030944824, + "learning_rate": 1.920831868536025e-05, + "loss": 0.3732, + "step": 10738 + }, + { + "epoch": 0.2148, + "grad_norm": 2.203594446182251, + "learning_rate": 1.9207774108961273e-05, + "loss": 0.2175, + "step": 10740 + }, + { + "epoch": 0.21484, + "grad_norm": 2.062299966812134, + "learning_rate": 1.9207229353052e-05, + "loss": 0.2691, + "step": 10742 + }, + { + "epoch": 0.21488, + "grad_norm": 1.3597310781478882, + "learning_rate": 1.9206684417643052e-05, + "loss": 0.1074, + "step": 10744 + }, + { + "epoch": 0.21492, + "grad_norm": 0.21492165327072144, + "learning_rate": 1.920613930274505e-05, + "loss": 0.0826, + "step": 10746 + }, + { + "epoch": 0.21496, + "grad_norm": 2.5736324787139893, + "learning_rate": 1.9205594008368623e-05, + "loss": 0.2232, + "step": 10748 + }, + { + "epoch": 0.215, + "grad_norm": 0.6737280488014221, + "learning_rate": 1.9205048534524405e-05, + "loss": 0.1175, + "step": 10750 + }, + { + "epoch": 0.21504, + "grad_norm": 2.634230613708496, + "learning_rate": 1.9204502881223023e-05, + "loss": 0.2935, + "step": 10752 + }, + { + "epoch": 0.21508, + "grad_norm": 2.621495485305786, + "learning_rate": 1.920395704847512e-05, + "loss": 0.4138, + "step": 10754 + }, + { + "epoch": 0.21512, + "grad_norm": 0.9669197797775269, + "learning_rate": 1.9203411036291337e-05, + "loss": 0.1176, + "step": 10756 + }, + { + "epoch": 0.21516, + "grad_norm": 1.2000409364700317, + "learning_rate": 1.9202864844682313e-05, + "loss": 0.2219, + "step": 10758 + }, + { + "epoch": 0.2152, + "grad_norm": 2.0503311157226562, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.2536, + "step": 10760 + }, + { + "epoch": 0.21524, + "grad_norm": 0.31050050258636475, + "learning_rate": 1.920177192323116e-05, + "loss": 0.0676, + "step": 10762 + }, + { + "epoch": 0.21528, + "grad_norm": 4.178878307342529, + "learning_rate": 1.9201225193410334e-05, + "loss": 0.358, + "step": 10764 + }, + { + "epoch": 0.21532, + "grad_norm": 3.5339512825012207, + "learning_rate": 1.9200678284206883e-05, + "loss": 0.4266, + "step": 10766 + }, + { + "epoch": 0.21536, + "grad_norm": 1.387162446975708, + "learning_rate": 1.9200131195631476e-05, + "loss": 0.3227, + "step": 10768 + }, + { + "epoch": 0.2154, + "grad_norm": 3.243199110031128, + "learning_rate": 1.9199583927694775e-05, + "loss": 0.2778, + "step": 10770 + }, + { + "epoch": 0.21544, + "grad_norm": 1.3813401460647583, + "learning_rate": 1.9199036480407444e-05, + "loss": 0.1301, + "step": 10772 + }, + { + "epoch": 0.21548, + "grad_norm": 2.965819835662842, + "learning_rate": 1.9198488853780164e-05, + "loss": 0.3272, + "step": 10774 + }, + { + "epoch": 0.21552, + "grad_norm": 4.264117240905762, + "learning_rate": 1.9197941047823606e-05, + "loss": 0.1886, + "step": 10776 + }, + { + "epoch": 0.21556, + "grad_norm": 87.98302459716797, + "learning_rate": 1.9197393062548454e-05, + "loss": 0.3664, + "step": 10778 + }, + { + "epoch": 0.2156, + "grad_norm": 2.669110059738159, + "learning_rate": 1.9196844897965393e-05, + "loss": 0.3148, + "step": 10780 + }, + { + "epoch": 0.21564, + "grad_norm": 1.71773362159729, + "learning_rate": 1.91962965540851e-05, + "loss": 0.2061, + "step": 10782 + }, + { + "epoch": 0.21568, + "grad_norm": 1.0884389877319336, + "learning_rate": 1.9195748030918272e-05, + "loss": 0.1121, + "step": 10784 + }, + { + "epoch": 0.21572, + "grad_norm": 0.9536759853363037, + "learning_rate": 1.9195199328475602e-05, + "loss": 0.2344, + "step": 10786 + }, + { + "epoch": 0.21576, + "grad_norm": 1.606191635131836, + "learning_rate": 1.919465044676779e-05, + "loss": 0.1194, + "step": 10788 + }, + { + "epoch": 0.2158, + "grad_norm": 1.2501193284988403, + "learning_rate": 1.919410138580553e-05, + "loss": 0.1307, + "step": 10790 + }, + { + "epoch": 0.21584, + "grad_norm": 0.7346489429473877, + "learning_rate": 1.9193552145599533e-05, + "loss": 0.1804, + "step": 10792 + }, + { + "epoch": 0.21588, + "grad_norm": 1.4155604839324951, + "learning_rate": 1.91930027261605e-05, + "loss": 0.1097, + "step": 10794 + }, + { + "epoch": 0.21592, + "grad_norm": 1.7293940782546997, + "learning_rate": 1.919245312749915e-05, + "loss": 0.2321, + "step": 10796 + }, + { + "epoch": 0.21596, + "grad_norm": 1.5903061628341675, + "learning_rate": 1.919190334962619e-05, + "loss": 0.2035, + "step": 10798 + }, + { + "epoch": 0.216, + "grad_norm": 1.17398202419281, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.0987, + "step": 10800 + }, + { + "epoch": 0.21604, + "grad_norm": 1.7565972805023193, + "learning_rate": 1.9190803256288332e-05, + "loss": 0.1633, + "step": 10802 + }, + { + "epoch": 0.21608, + "grad_norm": 0.3766821026802063, + "learning_rate": 1.919025294084488e-05, + "loss": 0.0339, + "step": 10804 + }, + { + "epoch": 0.21612, + "grad_norm": 0.478158175945282, + "learning_rate": 1.9189702446232714e-05, + "loss": 0.2111, + "step": 10806 + }, + { + "epoch": 0.21616, + "grad_norm": 2.758009195327759, + "learning_rate": 1.9189151772462567e-05, + "loss": 0.3571, + "step": 10808 + }, + { + "epoch": 0.2162, + "grad_norm": 1.3018529415130615, + "learning_rate": 1.9188600919545176e-05, + "loss": 0.2097, + "step": 10810 + }, + { + "epoch": 0.21624, + "grad_norm": 2.559582233428955, + "learning_rate": 1.9188049887491277e-05, + "loss": 0.2691, + "step": 10812 + }, + { + "epoch": 0.21628, + "grad_norm": 1.278620958328247, + "learning_rate": 1.9187498676311617e-05, + "loss": 0.1302, + "step": 10814 + }, + { + "epoch": 0.21632, + "grad_norm": 0.16412639617919922, + "learning_rate": 1.918694728601694e-05, + "loss": 0.0608, + "step": 10816 + }, + { + "epoch": 0.21636, + "grad_norm": 2.5121724605560303, + "learning_rate": 1.9186395716618e-05, + "loss": 0.3526, + "step": 10818 + }, + { + "epoch": 0.2164, + "grad_norm": 0.39133283495903015, + "learning_rate": 1.9185843968125543e-05, + "loss": 0.2409, + "step": 10820 + }, + { + "epoch": 0.21644, + "grad_norm": 1.1628919839859009, + "learning_rate": 1.918529204055033e-05, + "loss": 0.1442, + "step": 10822 + }, + { + "epoch": 0.21648, + "grad_norm": 2.5945777893066406, + "learning_rate": 1.9184739933903114e-05, + "loss": 0.3224, + "step": 10824 + }, + { + "epoch": 0.21652, + "grad_norm": 2.4780240058898926, + "learning_rate": 1.918418764819467e-05, + "loss": 0.2035, + "step": 10826 + }, + { + "epoch": 0.21656, + "grad_norm": 0.919010579586029, + "learning_rate": 1.9183635183435755e-05, + "loss": 0.1585, + "step": 10828 + }, + { + "epoch": 0.2166, + "grad_norm": 1.856467366218567, + "learning_rate": 1.918308253963715e-05, + "loss": 0.1762, + "step": 10830 + }, + { + "epoch": 0.21664, + "grad_norm": 1.377909779548645, + "learning_rate": 1.9182529716809618e-05, + "loss": 0.2426, + "step": 10832 + }, + { + "epoch": 0.21668, + "grad_norm": 0.8495416641235352, + "learning_rate": 1.9181976714963944e-05, + "loss": 0.1299, + "step": 10834 + }, + { + "epoch": 0.21672, + "grad_norm": 2.8319313526153564, + "learning_rate": 1.9181423534110908e-05, + "loss": 0.2838, + "step": 10836 + }, + { + "epoch": 0.21676, + "grad_norm": 1.424343466758728, + "learning_rate": 1.918087017426129e-05, + "loss": 0.1758, + "step": 10838 + }, + { + "epoch": 0.2168, + "grad_norm": 2.2628931999206543, + "learning_rate": 1.9180316635425883e-05, + "loss": 0.2559, + "step": 10840 + }, + { + "epoch": 0.21684, + "grad_norm": 1.0524660348892212, + "learning_rate": 1.9179762917615476e-05, + "loss": 0.0776, + "step": 10842 + }, + { + "epoch": 0.21688, + "grad_norm": 0.9089229702949524, + "learning_rate": 1.917920902084087e-05, + "loss": 0.1974, + "step": 10844 + }, + { + "epoch": 0.21692, + "grad_norm": 1.6403539180755615, + "learning_rate": 1.9178654945112848e-05, + "loss": 0.1348, + "step": 10846 + }, + { + "epoch": 0.21696, + "grad_norm": 0.890496551990509, + "learning_rate": 1.9178100690442233e-05, + "loss": 0.1583, + "step": 10848 + }, + { + "epoch": 0.217, + "grad_norm": 1.2286324501037598, + "learning_rate": 1.9177546256839814e-05, + "loss": 0.146, + "step": 10850 + }, + { + "epoch": 0.21704, + "grad_norm": 1.1812533140182495, + "learning_rate": 1.9176991644316406e-05, + "loss": 0.0797, + "step": 10852 + }, + { + "epoch": 0.21708, + "grad_norm": 2.804891586303711, + "learning_rate": 1.917643685288282e-05, + "loss": 0.3307, + "step": 10854 + }, + { + "epoch": 0.21712, + "grad_norm": 2.604396343231201, + "learning_rate": 1.9175881882549877e-05, + "loss": 0.2139, + "step": 10856 + }, + { + "epoch": 0.21716, + "grad_norm": 1.068499207496643, + "learning_rate": 1.9175326733328393e-05, + "loss": 0.1583, + "step": 10858 + }, + { + "epoch": 0.2172, + "grad_norm": 1.1817519664764404, + "learning_rate": 1.9174771405229187e-05, + "loss": 0.0798, + "step": 10860 + }, + { + "epoch": 0.21724, + "grad_norm": 1.3219587802886963, + "learning_rate": 1.917421589826309e-05, + "loss": 0.1914, + "step": 10862 + }, + { + "epoch": 0.21728, + "grad_norm": 0.48411592841148376, + "learning_rate": 1.917366021244093e-05, + "loss": 0.0956, + "step": 10864 + }, + { + "epoch": 0.21732, + "grad_norm": 1.631609559059143, + "learning_rate": 1.9173104347773546e-05, + "loss": 0.2916, + "step": 10866 + }, + { + "epoch": 0.21736, + "grad_norm": 1.9186553955078125, + "learning_rate": 1.9172548304271766e-05, + "loss": 0.149, + "step": 10868 + }, + { + "epoch": 0.2174, + "grad_norm": 1.1385438442230225, + "learning_rate": 1.9171992081946436e-05, + "loss": 0.0663, + "step": 10870 + }, + { + "epoch": 0.21744, + "grad_norm": 1.2562675476074219, + "learning_rate": 1.9171435680808396e-05, + "loss": 0.1441, + "step": 10872 + }, + { + "epoch": 0.21748, + "grad_norm": 1.8966115713119507, + "learning_rate": 1.9170879100868497e-05, + "loss": 0.3057, + "step": 10874 + }, + { + "epoch": 0.21752, + "grad_norm": 2.3165178298950195, + "learning_rate": 1.9170322342137592e-05, + "loss": 0.1361, + "step": 10876 + }, + { + "epoch": 0.21756, + "grad_norm": 4.1246442794799805, + "learning_rate": 1.9169765404626526e-05, + "loss": 0.3261, + "step": 10878 + }, + { + "epoch": 0.2176, + "grad_norm": 0.42897775769233704, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.0906, + "step": 10880 + }, + { + "epoch": 0.21764, + "grad_norm": 0.6541240215301514, + "learning_rate": 1.9168650993307367e-05, + "loss": 0.22, + "step": 10882 + }, + { + "epoch": 0.21768, + "grad_norm": 1.5149059295654297, + "learning_rate": 1.9168093519521e-05, + "loss": 0.1233, + "step": 10884 + }, + { + "epoch": 0.21772, + "grad_norm": 1.1188009977340698, + "learning_rate": 1.916753586699793e-05, + "loss": 0.1054, + "step": 10886 + }, + { + "epoch": 0.21776, + "grad_norm": 4.150110244750977, + "learning_rate": 1.9166978035749023e-05, + "loss": 0.569, + "step": 10888 + }, + { + "epoch": 0.2178, + "grad_norm": 3.5288994312286377, + "learning_rate": 1.9166420025785165e-05, + "loss": 0.4219, + "step": 10890 + }, + { + "epoch": 0.21784, + "grad_norm": 3.3616011142730713, + "learning_rate": 1.9165861837117226e-05, + "loss": 0.6151, + "step": 10892 + }, + { + "epoch": 0.21788, + "grad_norm": 0.8441213369369507, + "learning_rate": 1.9165303469756096e-05, + "loss": 0.0594, + "step": 10894 + }, + { + "epoch": 0.21792, + "grad_norm": 2.3285176753997803, + "learning_rate": 1.916474492371265e-05, + "loss": 0.3129, + "step": 10896 + }, + { + "epoch": 0.21796, + "grad_norm": 1.4499602317810059, + "learning_rate": 1.916418619899779e-05, + "loss": 0.2037, + "step": 10898 + }, + { + "epoch": 0.218, + "grad_norm": 2.1067352294921875, + "learning_rate": 1.9163627295622397e-05, + "loss": 0.3004, + "step": 10900 + }, + { + "epoch": 0.21804, + "grad_norm": 1.8946260213851929, + "learning_rate": 1.9163068213597374e-05, + "loss": 0.2664, + "step": 10902 + }, + { + "epoch": 0.21808, + "grad_norm": 2.376776695251465, + "learning_rate": 1.916250895293362e-05, + "loss": 0.1504, + "step": 10904 + }, + { + "epoch": 0.21812, + "grad_norm": 1.7272354364395142, + "learning_rate": 1.9161949513642034e-05, + "loss": 0.1686, + "step": 10906 + }, + { + "epoch": 0.21816, + "grad_norm": 0.3088003695011139, + "learning_rate": 1.916138989573353e-05, + "loss": 0.11, + "step": 10908 + }, + { + "epoch": 0.2182, + "grad_norm": 1.8129092454910278, + "learning_rate": 1.9160830099219007e-05, + "loss": 0.1718, + "step": 10910 + }, + { + "epoch": 0.21824, + "grad_norm": 2.1578755378723145, + "learning_rate": 1.9160270124109386e-05, + "loss": 0.1543, + "step": 10912 + }, + { + "epoch": 0.21828, + "grad_norm": 0.7035079598426819, + "learning_rate": 1.915970997041559e-05, + "loss": 0.1883, + "step": 10914 + }, + { + "epoch": 0.21832, + "grad_norm": 0.5801697373390198, + "learning_rate": 1.9159149638148525e-05, + "loss": 0.0891, + "step": 10916 + }, + { + "epoch": 0.21836, + "grad_norm": 1.0825395584106445, + "learning_rate": 1.9158589127319125e-05, + "loss": 0.0718, + "step": 10918 + }, + { + "epoch": 0.2184, + "grad_norm": 2.0946028232574463, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.1499, + "step": 10920 + }, + { + "epoch": 0.21844, + "grad_norm": 0.7358038425445557, + "learning_rate": 1.9157467570017026e-05, + "loss": 0.1541, + "step": 10922 + }, + { + "epoch": 0.21848, + "grad_norm": 1.2147932052612305, + "learning_rate": 1.9156906523566192e-05, + "loss": 0.2784, + "step": 10924 + }, + { + "epoch": 0.21852, + "grad_norm": 3.173459053039551, + "learning_rate": 1.9156345298596747e-05, + "loss": 0.2408, + "step": 10926 + }, + { + "epoch": 0.21856, + "grad_norm": 1.960069179534912, + "learning_rate": 1.9155783895119642e-05, + "loss": 0.1392, + "step": 10928 + }, + { + "epoch": 0.2186, + "grad_norm": 3.005227565765381, + "learning_rate": 1.9155222313145817e-05, + "loss": 0.3528, + "step": 10930 + }, + { + "epoch": 0.21864, + "grad_norm": 2.2563233375549316, + "learning_rate": 1.915466055268621e-05, + "loss": 0.2895, + "step": 10932 + }, + { + "epoch": 0.21868, + "grad_norm": 1.7095271348953247, + "learning_rate": 1.915409861375179e-05, + "loss": 0.1515, + "step": 10934 + }, + { + "epoch": 0.21872, + "grad_norm": 2.744685649871826, + "learning_rate": 1.9153536496353505e-05, + "loss": 0.382, + "step": 10936 + }, + { + "epoch": 0.21876, + "grad_norm": 1.1757211685180664, + "learning_rate": 1.9152974200502314e-05, + "loss": 0.3366, + "step": 10938 + }, + { + "epoch": 0.2188, + "grad_norm": 2.891740083694458, + "learning_rate": 1.9152411726209176e-05, + "loss": 0.228, + "step": 10940 + }, + { + "epoch": 0.21884, + "grad_norm": 2.0771169662475586, + "learning_rate": 1.9151849073485064e-05, + "loss": 0.2424, + "step": 10942 + }, + { + "epoch": 0.21888, + "grad_norm": 2.245248556137085, + "learning_rate": 1.9151286242340936e-05, + "loss": 0.2346, + "step": 10944 + }, + { + "epoch": 0.21892, + "grad_norm": 0.6526790261268616, + "learning_rate": 1.9150723232787777e-05, + "loss": 0.1772, + "step": 10946 + }, + { + "epoch": 0.21896, + "grad_norm": 0.8508340120315552, + "learning_rate": 1.9150160044836553e-05, + "loss": 0.0851, + "step": 10948 + }, + { + "epoch": 0.219, + "grad_norm": 0.6307037472724915, + "learning_rate": 1.914959667849825e-05, + "loss": 0.1934, + "step": 10950 + }, + { + "epoch": 0.21904, + "grad_norm": 1.3351311683654785, + "learning_rate": 1.914903313378385e-05, + "loss": 0.087, + "step": 10952 + }, + { + "epoch": 0.21908, + "grad_norm": 0.5763827562332153, + "learning_rate": 1.9148469410704334e-05, + "loss": 0.092, + "step": 10954 + }, + { + "epoch": 0.21912, + "grad_norm": 1.2783857583999634, + "learning_rate": 1.9147905509270703e-05, + "loss": 0.1267, + "step": 10956 + }, + { + "epoch": 0.21916, + "grad_norm": 3.998666763305664, + "learning_rate": 1.9147341429493945e-05, + "loss": 0.491, + "step": 10958 + }, + { + "epoch": 0.2192, + "grad_norm": 0.6262364983558655, + "learning_rate": 1.914677717138505e-05, + "loss": 0.2463, + "step": 10960 + }, + { + "epoch": 0.21924, + "grad_norm": 1.096086859703064, + "learning_rate": 1.914621273495503e-05, + "loss": 0.1549, + "step": 10962 + }, + { + "epoch": 0.21928, + "grad_norm": 2.225156545639038, + "learning_rate": 1.9145648120214883e-05, + "loss": 0.1773, + "step": 10964 + }, + { + "epoch": 0.21932, + "grad_norm": 1.8418306112289429, + "learning_rate": 1.9145083327175618e-05, + "loss": 0.2318, + "step": 10966 + }, + { + "epoch": 0.21936, + "grad_norm": 0.5383266806602478, + "learning_rate": 1.9144518355848243e-05, + "loss": 0.0831, + "step": 10968 + }, + { + "epoch": 0.2194, + "grad_norm": 0.7793349027633667, + "learning_rate": 1.9143953206243778e-05, + "loss": 0.2516, + "step": 10970 + }, + { + "epoch": 0.21944, + "grad_norm": 1.289505958557129, + "learning_rate": 1.9143387878373236e-05, + "loss": 0.1587, + "step": 10972 + }, + { + "epoch": 0.21948, + "grad_norm": 2.2750675678253174, + "learning_rate": 1.9142822372247635e-05, + "loss": 0.2125, + "step": 10974 + }, + { + "epoch": 0.21952, + "grad_norm": 2.861435651779175, + "learning_rate": 1.9142256687878012e-05, + "loss": 0.325, + "step": 10976 + }, + { + "epoch": 0.21956, + "grad_norm": 1.0795236825942993, + "learning_rate": 1.9141690825275384e-05, + "loss": 0.3144, + "step": 10978 + }, + { + "epoch": 0.2196, + "grad_norm": 1.278694987297058, + "learning_rate": 1.914112478445079e-05, + "loss": 0.2568, + "step": 10980 + }, + { + "epoch": 0.21964, + "grad_norm": 2.7355778217315674, + "learning_rate": 1.914055856541526e-05, + "loss": 0.2187, + "step": 10982 + }, + { + "epoch": 0.21968, + "grad_norm": 1.6814907789230347, + "learning_rate": 1.9139992168179836e-05, + "loss": 0.1204, + "step": 10984 + }, + { + "epoch": 0.21972, + "grad_norm": 0.40914401412010193, + "learning_rate": 1.9139425592755557e-05, + "loss": 0.0946, + "step": 10986 + }, + { + "epoch": 0.21976, + "grad_norm": 0.9100897312164307, + "learning_rate": 1.9138858839153473e-05, + "loss": 0.0964, + "step": 10988 + }, + { + "epoch": 0.2198, + "grad_norm": 1.4393465518951416, + "learning_rate": 1.9138291907384632e-05, + "loss": 0.1364, + "step": 10990 + }, + { + "epoch": 0.21984, + "grad_norm": 1.9336744546890259, + "learning_rate": 1.9137724797460084e-05, + "loss": 0.3204, + "step": 10992 + }, + { + "epoch": 0.21988, + "grad_norm": 0.4437934458255768, + "learning_rate": 1.9137157509390886e-05, + "loss": 0.1048, + "step": 10994 + }, + { + "epoch": 0.21992, + "grad_norm": 0.7904240489006042, + "learning_rate": 1.9136590043188098e-05, + "loss": 0.1439, + "step": 10996 + }, + { + "epoch": 0.21996, + "grad_norm": 2.904506206512451, + "learning_rate": 1.9136022398862785e-05, + "loss": 0.279, + "step": 10998 + }, + { + "epoch": 0.22, + "grad_norm": 2.857003927230835, + "learning_rate": 1.913545457642601e-05, + "loss": 0.2619, + "step": 11000 + }, + { + "epoch": 0.22004, + "grad_norm": 3.8185107707977295, + "learning_rate": 1.9134886575888845e-05, + "loss": 0.4729, + "step": 11002 + }, + { + "epoch": 0.22008, + "grad_norm": 1.9772758483886719, + "learning_rate": 1.9134318397262362e-05, + "loss": 0.2126, + "step": 11004 + }, + { + "epoch": 0.22012, + "grad_norm": 2.149933338165283, + "learning_rate": 1.9133750040557645e-05, + "loss": 0.2347, + "step": 11006 + }, + { + "epoch": 0.22016, + "grad_norm": 0.7035356760025024, + "learning_rate": 1.913318150578576e-05, + "loss": 0.0907, + "step": 11008 + }, + { + "epoch": 0.2202, + "grad_norm": 2.043182611465454, + "learning_rate": 1.9132612792957808e-05, + "loss": 0.5095, + "step": 11010 + }, + { + "epoch": 0.22024, + "grad_norm": 2.5696218013763428, + "learning_rate": 1.9132043902084864e-05, + "loss": 0.4885, + "step": 11012 + }, + { + "epoch": 0.22028, + "grad_norm": 0.5645694136619568, + "learning_rate": 1.9131474833178023e-05, + "loss": 0.0388, + "step": 11014 + }, + { + "epoch": 0.22032, + "grad_norm": 2.420008897781372, + "learning_rate": 1.913090558624838e-05, + "loss": 0.1362, + "step": 11016 + }, + { + "epoch": 0.22036, + "grad_norm": 0.47753390669822693, + "learning_rate": 1.9130336161307027e-05, + "loss": 0.0393, + "step": 11018 + }, + { + "epoch": 0.2204, + "grad_norm": 0.4881875216960907, + "learning_rate": 1.9129766558365076e-05, + "loss": 0.0858, + "step": 11020 + }, + { + "epoch": 0.22044, + "grad_norm": 1.037642002105713, + "learning_rate": 1.912919677743362e-05, + "loss": 0.091, + "step": 11022 + }, + { + "epoch": 0.22048, + "grad_norm": 0.4444602131843567, + "learning_rate": 1.9128626818523776e-05, + "loss": 0.1306, + "step": 11024 + }, + { + "epoch": 0.22052, + "grad_norm": 0.8363330960273743, + "learning_rate": 1.912805668164665e-05, + "loss": 0.1268, + "step": 11026 + }, + { + "epoch": 0.22056, + "grad_norm": 2.5717854499816895, + "learning_rate": 1.9127486366813365e-05, + "loss": 0.2693, + "step": 11028 + }, + { + "epoch": 0.2206, + "grad_norm": 3.6339221000671387, + "learning_rate": 1.912691587403503e-05, + "loss": 0.3732, + "step": 11030 + }, + { + "epoch": 0.22064, + "grad_norm": 0.6724214553833008, + "learning_rate": 1.912634520332277e-05, + "loss": 0.0971, + "step": 11032 + }, + { + "epoch": 0.22068, + "grad_norm": 1.352824330329895, + "learning_rate": 1.9125774354687716e-05, + "loss": 0.1033, + "step": 11034 + }, + { + "epoch": 0.22072, + "grad_norm": 0.5462868213653564, + "learning_rate": 1.912520332814099e-05, + "loss": 0.0516, + "step": 11036 + }, + { + "epoch": 0.22076, + "grad_norm": 1.0552064180374146, + "learning_rate": 1.9124632123693726e-05, + "loss": 0.1552, + "step": 11038 + }, + { + "epoch": 0.2208, + "grad_norm": 1.887067437171936, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.1149, + "step": 11040 + }, + { + "epoch": 0.22084, + "grad_norm": 0.7708703279495239, + "learning_rate": 1.912348918114214e-05, + "loss": 0.078, + "step": 11042 + }, + { + "epoch": 0.22088, + "grad_norm": 0.5405762195587158, + "learning_rate": 1.9122917443060095e-05, + "loss": 0.0532, + "step": 11044 + }, + { + "epoch": 0.22092, + "grad_norm": 1.707175612449646, + "learning_rate": 1.912234552712208e-05, + "loss": 0.1386, + "step": 11046 + }, + { + "epoch": 0.22096, + "grad_norm": 4.38950777053833, + "learning_rate": 1.912177343333924e-05, + "loss": 0.3816, + "step": 11048 + }, + { + "epoch": 0.221, + "grad_norm": 0.9009478688240051, + "learning_rate": 1.9121201161722732e-05, + "loss": 0.1611, + "step": 11050 + }, + { + "epoch": 0.22104, + "grad_norm": 1.6182667016983032, + "learning_rate": 1.912062871228371e-05, + "loss": 0.2349, + "step": 11052 + }, + { + "epoch": 0.22108, + "grad_norm": 2.2071025371551514, + "learning_rate": 1.912005608503334e-05, + "loss": 0.4073, + "step": 11054 + }, + { + "epoch": 0.22112, + "grad_norm": 2.1578221321105957, + "learning_rate": 1.911948327998278e-05, + "loss": 0.3161, + "step": 11056 + }, + { + "epoch": 0.22116, + "grad_norm": 0.9627126455307007, + "learning_rate": 1.9118910297143195e-05, + "loss": 0.2104, + "step": 11058 + }, + { + "epoch": 0.2212, + "grad_norm": 3.128944158554077, + "learning_rate": 1.911833713652576e-05, + "loss": 0.1649, + "step": 11060 + }, + { + "epoch": 0.22124, + "grad_norm": 3.466752767562866, + "learning_rate": 1.9117763798141647e-05, + "loss": 0.5231, + "step": 11062 + }, + { + "epoch": 0.22128, + "grad_norm": 4.077211856842041, + "learning_rate": 1.911719028200204e-05, + "loss": 0.3635, + "step": 11064 + }, + { + "epoch": 0.22132, + "grad_norm": 3.1800339221954346, + "learning_rate": 1.911661658811811e-05, + "loss": 0.5819, + "step": 11066 + }, + { + "epoch": 0.22136, + "grad_norm": 1.218677282333374, + "learning_rate": 1.9116042716501046e-05, + "loss": 0.0954, + "step": 11068 + }, + { + "epoch": 0.2214, + "grad_norm": 1.693720817565918, + "learning_rate": 1.9115468667162038e-05, + "loss": 0.1254, + "step": 11070 + }, + { + "epoch": 0.22144, + "grad_norm": 0.6910457015037537, + "learning_rate": 1.9114894440112274e-05, + "loss": 0.1648, + "step": 11072 + }, + { + "epoch": 0.22148, + "grad_norm": 2.2469730377197266, + "learning_rate": 1.911432003536295e-05, + "loss": 0.2393, + "step": 11074 + }, + { + "epoch": 0.22152, + "grad_norm": 2.9094316959381104, + "learning_rate": 1.9113745452925267e-05, + "loss": 0.2416, + "step": 11076 + }, + { + "epoch": 0.22156, + "grad_norm": 2.9385969638824463, + "learning_rate": 1.9113170692810423e-05, + "loss": 0.2998, + "step": 11078 + }, + { + "epoch": 0.2216, + "grad_norm": 0.6871967315673828, + "learning_rate": 1.9112595755029625e-05, + "loss": 0.0853, + "step": 11080 + }, + { + "epoch": 0.22164, + "grad_norm": 0.9364672303199768, + "learning_rate": 1.911202063959408e-05, + "loss": 0.0694, + "step": 11082 + }, + { + "epoch": 0.22168, + "grad_norm": 1.0615296363830566, + "learning_rate": 1.9111445346515003e-05, + "loss": 0.0803, + "step": 11084 + }, + { + "epoch": 0.22172, + "grad_norm": 0.6584060788154602, + "learning_rate": 1.9110869875803603e-05, + "loss": 0.1545, + "step": 11086 + }, + { + "epoch": 0.22176, + "grad_norm": 1.1414886713027954, + "learning_rate": 1.911029422747111e-05, + "loss": 0.1235, + "step": 11088 + }, + { + "epoch": 0.2218, + "grad_norm": 1.871590495109558, + "learning_rate": 1.9109718401528742e-05, + "loss": 0.1182, + "step": 11090 + }, + { + "epoch": 0.22184, + "grad_norm": 1.271424651145935, + "learning_rate": 1.9109142397987715e-05, + "loss": 0.1782, + "step": 11092 + }, + { + "epoch": 0.22188, + "grad_norm": 2.296877145767212, + "learning_rate": 1.9108566216859278e-05, + "loss": 0.214, + "step": 11094 + }, + { + "epoch": 0.22192, + "grad_norm": 0.9515848159790039, + "learning_rate": 1.9107989858154646e-05, + "loss": 0.09, + "step": 11096 + }, + { + "epoch": 0.22196, + "grad_norm": 1.8566383123397827, + "learning_rate": 1.9107413321885064e-05, + "loss": 0.3251, + "step": 11098 + }, + { + "epoch": 0.222, + "grad_norm": 4.2026801109313965, + "learning_rate": 1.910683660806177e-05, + "loss": 0.4248, + "step": 11100 + }, + { + "epoch": 0.22204, + "grad_norm": 3.5284852981567383, + "learning_rate": 1.910625971669601e-05, + "loss": 0.2915, + "step": 11102 + }, + { + "epoch": 0.22208, + "grad_norm": 1.263511061668396, + "learning_rate": 1.910568264779903e-05, + "loss": 0.534, + "step": 11104 + }, + { + "epoch": 0.22212, + "grad_norm": 0.7472192049026489, + "learning_rate": 1.9105105401382074e-05, + "loss": 0.1607, + "step": 11106 + }, + { + "epoch": 0.22216, + "grad_norm": 0.89488285779953, + "learning_rate": 1.9104527977456408e-05, + "loss": 0.0535, + "step": 11108 + }, + { + "epoch": 0.2222, + "grad_norm": 2.1363790035247803, + "learning_rate": 1.9103950376033276e-05, + "loss": 0.1976, + "step": 11110 + }, + { + "epoch": 0.22224, + "grad_norm": 2.578714609146118, + "learning_rate": 1.9103372597123944e-05, + "loss": 0.3999, + "step": 11112 + }, + { + "epoch": 0.22228, + "grad_norm": 2.4478745460510254, + "learning_rate": 1.9102794640739675e-05, + "loss": 0.2667, + "step": 11114 + }, + { + "epoch": 0.22232, + "grad_norm": 2.1946935653686523, + "learning_rate": 1.910221650689174e-05, + "loss": 0.296, + "step": 11116 + }, + { + "epoch": 0.22236, + "grad_norm": 2.3780856132507324, + "learning_rate": 1.910163819559141e-05, + "loss": 0.2885, + "step": 11118 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4259777069091797, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.1216, + "step": 11120 + }, + { + "epoch": 0.22244, + "grad_norm": 1.4655251502990723, + "learning_rate": 1.910048104067866e-05, + "loss": 0.1498, + "step": 11122 + }, + { + "epoch": 0.22248, + "grad_norm": 1.537168025970459, + "learning_rate": 1.9099902197088797e-05, + "loss": 0.1405, + "step": 11124 + }, + { + "epoch": 0.22252, + "grad_norm": 2.004718542098999, + "learning_rate": 1.909932317609166e-05, + "loss": 0.2295, + "step": 11126 + }, + { + "epoch": 0.22256, + "grad_norm": 1.639797568321228, + "learning_rate": 1.909874397769853e-05, + "loss": 0.1535, + "step": 11128 + }, + { + "epoch": 0.2226, + "grad_norm": 2.642382860183716, + "learning_rate": 1.9098164601920702e-05, + "loss": 0.5195, + "step": 11130 + }, + { + "epoch": 0.22264, + "grad_norm": 0.9825976490974426, + "learning_rate": 1.909758504876947e-05, + "loss": 0.1127, + "step": 11132 + }, + { + "epoch": 0.22268, + "grad_norm": 2.227933168411255, + "learning_rate": 1.9097005318256137e-05, + "loss": 0.2847, + "step": 11134 + }, + { + "epoch": 0.22272, + "grad_norm": 2.1075353622436523, + "learning_rate": 1.9096425410392e-05, + "loss": 0.2922, + "step": 11136 + }, + { + "epoch": 0.22276, + "grad_norm": 1.9755196571350098, + "learning_rate": 1.9095845325188367e-05, + "loss": 0.2038, + "step": 11138 + }, + { + "epoch": 0.2228, + "grad_norm": 2.313100576400757, + "learning_rate": 1.9095265062656546e-05, + "loss": 0.3945, + "step": 11140 + }, + { + "epoch": 0.22284, + "grad_norm": 0.23155614733695984, + "learning_rate": 1.9094684622807847e-05, + "loss": 0.0707, + "step": 11142 + }, + { + "epoch": 0.22288, + "grad_norm": 2.3681726455688477, + "learning_rate": 1.9094104005653594e-05, + "loss": 0.2961, + "step": 11144 + }, + { + "epoch": 0.22292, + "grad_norm": 1.800716519355774, + "learning_rate": 1.90935232112051e-05, + "loss": 0.2549, + "step": 11146 + }, + { + "epoch": 0.22296, + "grad_norm": 1.195867896080017, + "learning_rate": 1.909294223947369e-05, + "loss": 0.2278, + "step": 11148 + }, + { + "epoch": 0.223, + "grad_norm": 1.2561428546905518, + "learning_rate": 1.9092361090470688e-05, + "loss": 0.1516, + "step": 11150 + }, + { + "epoch": 0.22304, + "grad_norm": 1.4424066543579102, + "learning_rate": 1.9091779764207425e-05, + "loss": 0.1224, + "step": 11152 + }, + { + "epoch": 0.22308, + "grad_norm": 1.0464327335357666, + "learning_rate": 1.9091198260695236e-05, + "loss": 0.1368, + "step": 11154 + }, + { + "epoch": 0.22312, + "grad_norm": 2.5518810749053955, + "learning_rate": 1.9090616579945455e-05, + "loss": 0.2766, + "step": 11156 + }, + { + "epoch": 0.22316, + "grad_norm": 0.4748414158821106, + "learning_rate": 1.9090034721969425e-05, + "loss": 0.1333, + "step": 11158 + }, + { + "epoch": 0.2232, + "grad_norm": 1.2529526948928833, + "learning_rate": 1.908945268677849e-05, + "loss": 0.1594, + "step": 11160 + }, + { + "epoch": 0.22324, + "grad_norm": 1.063579797744751, + "learning_rate": 1.908887047438399e-05, + "loss": 0.1383, + "step": 11162 + }, + { + "epoch": 0.22328, + "grad_norm": 1.551820993423462, + "learning_rate": 1.9088288084797287e-05, + "loss": 0.1526, + "step": 11164 + }, + { + "epoch": 0.22332, + "grad_norm": 0.7614520788192749, + "learning_rate": 1.9087705518029725e-05, + "loss": 0.0543, + "step": 11166 + }, + { + "epoch": 0.22336, + "grad_norm": 1.0697134733200073, + "learning_rate": 1.9087122774092666e-05, + "loss": 0.106, + "step": 11168 + }, + { + "epoch": 0.2234, + "grad_norm": 2.4159302711486816, + "learning_rate": 1.908653985299747e-05, + "loss": 0.2788, + "step": 11170 + }, + { + "epoch": 0.22344, + "grad_norm": 0.861056923866272, + "learning_rate": 1.9085956754755504e-05, + "loss": 0.3262, + "step": 11172 + }, + { + "epoch": 0.22348, + "grad_norm": 1.5610781908035278, + "learning_rate": 1.908537347937813e-05, + "loss": 0.1445, + "step": 11174 + }, + { + "epoch": 0.22352, + "grad_norm": 2.139432668685913, + "learning_rate": 1.908479002687672e-05, + "loss": 0.2052, + "step": 11176 + }, + { + "epoch": 0.22356, + "grad_norm": 0.9336367249488831, + "learning_rate": 1.908420639726266e-05, + "loss": 0.2029, + "step": 11178 + }, + { + "epoch": 0.2236, + "grad_norm": 1.1417839527130127, + "learning_rate": 1.9083622590547313e-05, + "loss": 0.1234, + "step": 11180 + }, + { + "epoch": 0.22364, + "grad_norm": 1.1195454597473145, + "learning_rate": 1.9083038606742068e-05, + "loss": 0.1446, + "step": 11182 + }, + { + "epoch": 0.22368, + "grad_norm": 0.37862569093704224, + "learning_rate": 1.908245444585831e-05, + "loss": 0.2515, + "step": 11184 + }, + { + "epoch": 0.22372, + "grad_norm": 2.0611119270324707, + "learning_rate": 1.9081870107907426e-05, + "loss": 0.1536, + "step": 11186 + }, + { + "epoch": 0.22376, + "grad_norm": 0.3056206703186035, + "learning_rate": 1.9081285592900805e-05, + "loss": 0.0853, + "step": 11188 + }, + { + "epoch": 0.2238, + "grad_norm": 0.7830289006233215, + "learning_rate": 1.9080700900849855e-05, + "loss": 0.0774, + "step": 11190 + }, + { + "epoch": 0.22384, + "grad_norm": 0.8327502012252808, + "learning_rate": 1.908011603176596e-05, + "loss": 0.0693, + "step": 11192 + }, + { + "epoch": 0.22388, + "grad_norm": 1.6718318462371826, + "learning_rate": 1.907953098566053e-05, + "loss": 0.2023, + "step": 11194 + }, + { + "epoch": 0.22392, + "grad_norm": 2.270840883255005, + "learning_rate": 1.9078945762544966e-05, + "loss": 0.171, + "step": 11196 + }, + { + "epoch": 0.22396, + "grad_norm": 1.7882457971572876, + "learning_rate": 1.907836036243068e-05, + "loss": 0.1401, + "step": 11198 + }, + { + "epoch": 0.224, + "grad_norm": 0.8566670417785645, + "learning_rate": 1.907777478532909e-05, + "loss": 0.2569, + "step": 11200 + }, + { + "epoch": 0.22404, + "grad_norm": 2.0073559284210205, + "learning_rate": 1.9077189031251603e-05, + "loss": 0.1506, + "step": 11202 + }, + { + "epoch": 0.22408, + "grad_norm": 2.943272829055786, + "learning_rate": 1.9076603100209643e-05, + "loss": 0.2425, + "step": 11204 + }, + { + "epoch": 0.22412, + "grad_norm": 0.6006516218185425, + "learning_rate": 1.9076016992214634e-05, + "loss": 0.0608, + "step": 11206 + }, + { + "epoch": 0.22416, + "grad_norm": 2.232163190841675, + "learning_rate": 1.9075430707278e-05, + "loss": 0.215, + "step": 11208 + }, + { + "epoch": 0.2242, + "grad_norm": 1.709991216659546, + "learning_rate": 1.907484424541117e-05, + "loss": 0.1669, + "step": 11210 + }, + { + "epoch": 0.22424, + "grad_norm": 1.664310097694397, + "learning_rate": 1.9074257606625583e-05, + "loss": 0.1345, + "step": 11212 + }, + { + "epoch": 0.22428, + "grad_norm": 1.142903447151184, + "learning_rate": 1.907367079093267e-05, + "loss": 0.1109, + "step": 11214 + }, + { + "epoch": 0.22432, + "grad_norm": 0.4324658513069153, + "learning_rate": 1.9073083798343873e-05, + "loss": 0.0338, + "step": 11216 + }, + { + "epoch": 0.22436, + "grad_norm": 2.121657371520996, + "learning_rate": 1.907249662887064e-05, + "loss": 0.1645, + "step": 11218 + }, + { + "epoch": 0.2244, + "grad_norm": 3.6038923263549805, + "learning_rate": 1.907190928252441e-05, + "loss": 0.563, + "step": 11220 + }, + { + "epoch": 0.22444, + "grad_norm": 0.1251765787601471, + "learning_rate": 1.907132175931664e-05, + "loss": 0.1555, + "step": 11222 + }, + { + "epoch": 0.22448, + "grad_norm": 0.6491023898124695, + "learning_rate": 1.9070734059258784e-05, + "loss": 0.1719, + "step": 11224 + }, + { + "epoch": 0.22452, + "grad_norm": 1.8930093050003052, + "learning_rate": 1.9070146182362295e-05, + "loss": 0.181, + "step": 11226 + }, + { + "epoch": 0.22456, + "grad_norm": 0.9966699481010437, + "learning_rate": 1.9069558128638636e-05, + "loss": 0.0797, + "step": 11228 + }, + { + "epoch": 0.2246, + "grad_norm": 0.2627573609352112, + "learning_rate": 1.906896989809927e-05, + "loss": 0.1618, + "step": 11230 + }, + { + "epoch": 0.22464, + "grad_norm": 0.8556308746337891, + "learning_rate": 1.9068381490755673e-05, + "loss": 0.1807, + "step": 11232 + }, + { + "epoch": 0.22468, + "grad_norm": 2.2155630588531494, + "learning_rate": 1.9067792906619308e-05, + "loss": 0.1916, + "step": 11234 + }, + { + "epoch": 0.22472, + "grad_norm": 3.5720860958099365, + "learning_rate": 1.906720414570165e-05, + "loss": 0.4738, + "step": 11236 + }, + { + "epoch": 0.22476, + "grad_norm": 1.0944561958312988, + "learning_rate": 1.9066615208014177e-05, + "loss": 0.2291, + "step": 11238 + }, + { + "epoch": 0.2248, + "grad_norm": 1.7593412399291992, + "learning_rate": 1.906602609356838e-05, + "loss": 0.137, + "step": 11240 + }, + { + "epoch": 0.22484, + "grad_norm": 2.6387410163879395, + "learning_rate": 1.906543680237573e-05, + "loss": 0.3012, + "step": 11242 + }, + { + "epoch": 0.22488, + "grad_norm": 0.48431411385536194, + "learning_rate": 1.9064847334447726e-05, + "loss": 0.3252, + "step": 11244 + }, + { + "epoch": 0.22492, + "grad_norm": 1.592052698135376, + "learning_rate": 1.9064257689795855e-05, + "loss": 0.2694, + "step": 11246 + }, + { + "epoch": 0.22496, + "grad_norm": 1.44191575050354, + "learning_rate": 1.9063667868431613e-05, + "loss": 0.1096, + "step": 11248 + }, + { + "epoch": 0.225, + "grad_norm": 0.9284886717796326, + "learning_rate": 1.9063077870366504e-05, + "loss": 0.0998, + "step": 11250 + }, + { + "epoch": 0.22504, + "grad_norm": 0.7813709378242493, + "learning_rate": 1.906248769561202e-05, + "loss": 0.1481, + "step": 11252 + }, + { + "epoch": 0.22508, + "grad_norm": 1.5958811044692993, + "learning_rate": 1.9061897344179674e-05, + "loss": 0.1777, + "step": 11254 + }, + { + "epoch": 0.22512, + "grad_norm": 2.124485969543457, + "learning_rate": 1.9061306816080974e-05, + "loss": 0.3667, + "step": 11256 + }, + { + "epoch": 0.22516, + "grad_norm": 1.3673553466796875, + "learning_rate": 1.9060716111327437e-05, + "loss": 0.2221, + "step": 11258 + }, + { + "epoch": 0.2252, + "grad_norm": 0.7944375276565552, + "learning_rate": 1.9060125229930572e-05, + "loss": 0.1399, + "step": 11260 + }, + { + "epoch": 0.22524, + "grad_norm": 3.057335376739502, + "learning_rate": 1.90595341719019e-05, + "loss": 0.3889, + "step": 11262 + }, + { + "epoch": 0.22528, + "grad_norm": 2.56778883934021, + "learning_rate": 1.9058942937252943e-05, + "loss": 0.2315, + "step": 11264 + }, + { + "epoch": 0.22532, + "grad_norm": 1.6881868839263916, + "learning_rate": 1.9058351525995232e-05, + "loss": 0.149, + "step": 11266 + }, + { + "epoch": 0.22536, + "grad_norm": 1.1504881381988525, + "learning_rate": 1.9057759938140296e-05, + "loss": 0.3773, + "step": 11268 + }, + { + "epoch": 0.2254, + "grad_norm": 0.6116735339164734, + "learning_rate": 1.9057168173699664e-05, + "loss": 0.1101, + "step": 11270 + }, + { + "epoch": 0.22544, + "grad_norm": 1.7557384967803955, + "learning_rate": 1.9056576232684878e-05, + "loss": 0.1209, + "step": 11272 + }, + { + "epoch": 0.22548, + "grad_norm": 0.5950196385383606, + "learning_rate": 1.9055984115107473e-05, + "loss": 0.0496, + "step": 11274 + }, + { + "epoch": 0.22552, + "grad_norm": 0.4528736174106598, + "learning_rate": 1.9055391820978996e-05, + "loss": 0.2235, + "step": 11276 + }, + { + "epoch": 0.22556, + "grad_norm": 3.01037335395813, + "learning_rate": 1.9054799350310994e-05, + "loss": 0.3266, + "step": 11278 + }, + { + "epoch": 0.2256, + "grad_norm": 0.5057455897331238, + "learning_rate": 1.905420670311502e-05, + "loss": 0.3976, + "step": 11280 + }, + { + "epoch": 0.22564, + "grad_norm": 0.9853131175041199, + "learning_rate": 1.905361387940262e-05, + "loss": 0.0999, + "step": 11282 + }, + { + "epoch": 0.22568, + "grad_norm": 0.8346681594848633, + "learning_rate": 1.905302087918536e-05, + "loss": 0.0702, + "step": 11284 + }, + { + "epoch": 0.22572, + "grad_norm": 2.4186506271362305, + "learning_rate": 1.9052427702474792e-05, + "loss": 0.3427, + "step": 11286 + }, + { + "epoch": 0.22576, + "grad_norm": 0.33759620785713196, + "learning_rate": 1.905183434928249e-05, + "loss": 0.0803, + "step": 11288 + }, + { + "epoch": 0.2258, + "grad_norm": 1.4015083312988281, + "learning_rate": 1.9051240819620018e-05, + "loss": 0.2263, + "step": 11290 + }, + { + "epoch": 0.22584, + "grad_norm": 2.3467891216278076, + "learning_rate": 1.9050647113498944e-05, + "loss": 0.3734, + "step": 11292 + }, + { + "epoch": 0.22588, + "grad_norm": 1.3021444082260132, + "learning_rate": 1.9050053230930845e-05, + "loss": 0.2183, + "step": 11294 + }, + { + "epoch": 0.22592, + "grad_norm": 2.7429561614990234, + "learning_rate": 1.90494591719273e-05, + "loss": 0.1978, + "step": 11296 + }, + { + "epoch": 0.22596, + "grad_norm": 1.794724464416504, + "learning_rate": 1.904886493649989e-05, + "loss": 0.1612, + "step": 11298 + }, + { + "epoch": 0.226, + "grad_norm": 2.2571423053741455, + "learning_rate": 1.9048270524660197e-05, + "loss": 0.3279, + "step": 11300 + }, + { + "epoch": 0.22604, + "grad_norm": 1.4884849786758423, + "learning_rate": 1.904767593641981e-05, + "loss": 0.306, + "step": 11302 + }, + { + "epoch": 0.22608, + "grad_norm": 1.9851083755493164, + "learning_rate": 1.9047081171790327e-05, + "loss": 0.2036, + "step": 11304 + }, + { + "epoch": 0.22612, + "grad_norm": 0.26026952266693115, + "learning_rate": 1.904648623078334e-05, + "loss": 0.1736, + "step": 11306 + }, + { + "epoch": 0.22616, + "grad_norm": 1.4504605531692505, + "learning_rate": 1.9045891113410443e-05, + "loss": 0.1405, + "step": 11308 + }, + { + "epoch": 0.2262, + "grad_norm": 0.6081011295318604, + "learning_rate": 1.904529581968324e-05, + "loss": 0.2202, + "step": 11310 + }, + { + "epoch": 0.22624, + "grad_norm": 2.6654200553894043, + "learning_rate": 1.9044700349613344e-05, + "loss": 0.3313, + "step": 11312 + }, + { + "epoch": 0.22628, + "grad_norm": 2.060365915298462, + "learning_rate": 1.9044104703212354e-05, + "loss": 0.1572, + "step": 11314 + }, + { + "epoch": 0.22632, + "grad_norm": 2.5590755939483643, + "learning_rate": 1.9043508880491887e-05, + "loss": 0.396, + "step": 11316 + }, + { + "epoch": 0.22636, + "grad_norm": 0.944518506526947, + "learning_rate": 1.904291288146356e-05, + "loss": 0.1857, + "step": 11318 + }, + { + "epoch": 0.2264, + "grad_norm": 2.006495952606201, + "learning_rate": 1.9042316706138987e-05, + "loss": 0.2443, + "step": 11320 + }, + { + "epoch": 0.22644, + "grad_norm": 1.049169659614563, + "learning_rate": 1.9041720354529796e-05, + "loss": 0.1313, + "step": 11322 + }, + { + "epoch": 0.22648, + "grad_norm": 0.8064692616462708, + "learning_rate": 1.9041123826647615e-05, + "loss": 0.1501, + "step": 11324 + }, + { + "epoch": 0.22652, + "grad_norm": 1.4455229043960571, + "learning_rate": 1.904052712250407e-05, + "loss": 0.1719, + "step": 11326 + }, + { + "epoch": 0.22656, + "grad_norm": 1.0058687925338745, + "learning_rate": 1.903993024211079e-05, + "loss": 0.2482, + "step": 11328 + }, + { + "epoch": 0.2266, + "grad_norm": 0.7775872945785522, + "learning_rate": 1.903933318547942e-05, + "loss": 0.0735, + "step": 11330 + }, + { + "epoch": 0.22664, + "grad_norm": 0.2640073895454407, + "learning_rate": 1.903873595262159e-05, + "loss": 0.1078, + "step": 11332 + }, + { + "epoch": 0.22668, + "grad_norm": 0.8950728178024292, + "learning_rate": 1.9038138543548957e-05, + "loss": 0.4363, + "step": 11334 + }, + { + "epoch": 0.22672, + "grad_norm": 3.502622127532959, + "learning_rate": 1.903754095827316e-05, + "loss": 0.3152, + "step": 11336 + }, + { + "epoch": 0.22676, + "grad_norm": 0.3251408338546753, + "learning_rate": 1.903694319680584e-05, + "loss": 0.0793, + "step": 11338 + }, + { + "epoch": 0.2268, + "grad_norm": 0.668286919593811, + "learning_rate": 1.9036345259158667e-05, + "loss": 0.0741, + "step": 11340 + }, + { + "epoch": 0.22684, + "grad_norm": 0.9403761029243469, + "learning_rate": 1.903574714534329e-05, + "loss": 0.0916, + "step": 11342 + }, + { + "epoch": 0.22688, + "grad_norm": 1.3511587381362915, + "learning_rate": 1.903514885537137e-05, + "loss": 0.1532, + "step": 11344 + }, + { + "epoch": 0.22692, + "grad_norm": 0.5090273022651672, + "learning_rate": 1.903455038925457e-05, + "loss": 0.0579, + "step": 11346 + }, + { + "epoch": 0.22696, + "grad_norm": 0.9889496564865112, + "learning_rate": 1.903395174700456e-05, + "loss": 0.1137, + "step": 11348 + }, + { + "epoch": 0.227, + "grad_norm": 2.35111403465271, + "learning_rate": 1.903335292863301e-05, + "loss": 0.4703, + "step": 11350 + }, + { + "epoch": 0.22704, + "grad_norm": 2.509702205657959, + "learning_rate": 1.9032753934151594e-05, + "loss": 0.2036, + "step": 11352 + }, + { + "epoch": 0.22708, + "grad_norm": 0.4411010146141052, + "learning_rate": 1.903215476357199e-05, + "loss": 0.2373, + "step": 11354 + }, + { + "epoch": 0.22712, + "grad_norm": 2.342237710952759, + "learning_rate": 1.9031555416905876e-05, + "loss": 0.2304, + "step": 11356 + }, + { + "epoch": 0.22716, + "grad_norm": 2.2283999919891357, + "learning_rate": 1.9030955894164945e-05, + "loss": 0.1439, + "step": 11358 + }, + { + "epoch": 0.2272, + "grad_norm": 2.7581300735473633, + "learning_rate": 1.9030356195360875e-05, + "loss": 0.3406, + "step": 11360 + }, + { + "epoch": 0.22724, + "grad_norm": 1.686549425125122, + "learning_rate": 1.9029756320505363e-05, + "loss": 0.1011, + "step": 11362 + }, + { + "epoch": 0.22728, + "grad_norm": 0.6688858866691589, + "learning_rate": 1.90291562696101e-05, + "loss": 0.2911, + "step": 11364 + }, + { + "epoch": 0.22732, + "grad_norm": 1.3737123012542725, + "learning_rate": 1.902855604268679e-05, + "loss": 0.1815, + "step": 11366 + }, + { + "epoch": 0.22736, + "grad_norm": 3.4487481117248535, + "learning_rate": 1.902795563974713e-05, + "loss": 0.6845, + "step": 11368 + }, + { + "epoch": 0.2274, + "grad_norm": 1.185194969177246, + "learning_rate": 1.902735506080283e-05, + "loss": 0.197, + "step": 11370 + }, + { + "epoch": 0.22744, + "grad_norm": 0.8552653193473816, + "learning_rate": 1.9026754305865593e-05, + "loss": 0.2235, + "step": 11372 + }, + { + "epoch": 0.22748, + "grad_norm": 0.4479105472564697, + "learning_rate": 1.9026153374947136e-05, + "loss": 0.1071, + "step": 11374 + }, + { + "epoch": 0.22752, + "grad_norm": 0.6149218082427979, + "learning_rate": 1.902555226805917e-05, + "loss": 0.0616, + "step": 11376 + }, + { + "epoch": 0.22756, + "grad_norm": 1.962731122970581, + "learning_rate": 1.9024950985213415e-05, + "loss": 0.4053, + "step": 11378 + }, + { + "epoch": 0.2276, + "grad_norm": 0.4632706344127655, + "learning_rate": 1.9024349526421596e-05, + "loss": 0.0692, + "step": 11380 + }, + { + "epoch": 0.22764, + "grad_norm": 2.1791796684265137, + "learning_rate": 1.9023747891695437e-05, + "loss": 0.1627, + "step": 11382 + }, + { + "epoch": 0.22768, + "grad_norm": 0.18579712510108948, + "learning_rate": 1.9023146081046664e-05, + "loss": 0.0222, + "step": 11384 + }, + { + "epoch": 0.22772, + "grad_norm": 0.6713818311691284, + "learning_rate": 1.9022544094487013e-05, + "loss": 0.1939, + "step": 11386 + }, + { + "epoch": 0.22776, + "grad_norm": 0.16980423033237457, + "learning_rate": 1.9021941932028225e-05, + "loss": 0.043, + "step": 11388 + }, + { + "epoch": 0.2278, + "grad_norm": 1.185660719871521, + "learning_rate": 1.902133959368203e-05, + "loss": 0.2569, + "step": 11390 + }, + { + "epoch": 0.22784, + "grad_norm": 0.300886869430542, + "learning_rate": 1.9020737079460178e-05, + "loss": 0.2964, + "step": 11392 + }, + { + "epoch": 0.22788, + "grad_norm": 1.363708257675171, + "learning_rate": 1.902013438937441e-05, + "loss": 0.0809, + "step": 11394 + }, + { + "epoch": 0.22792, + "grad_norm": 0.8322010040283203, + "learning_rate": 1.901953152343648e-05, + "loss": 0.0651, + "step": 11396 + }, + { + "epoch": 0.22796, + "grad_norm": 2.924480676651001, + "learning_rate": 1.9018928481658135e-05, + "loss": 0.3008, + "step": 11398 + }, + { + "epoch": 0.228, + "grad_norm": 2.662076234817505, + "learning_rate": 1.901832526405114e-05, + "loss": 0.1971, + "step": 11400 + }, + { + "epoch": 0.22804, + "grad_norm": 2.180522918701172, + "learning_rate": 1.9017721870627247e-05, + "loss": 0.2931, + "step": 11402 + }, + { + "epoch": 0.22808, + "grad_norm": 1.6152726411819458, + "learning_rate": 1.901711830139823e-05, + "loss": 0.0856, + "step": 11404 + }, + { + "epoch": 0.22812, + "grad_norm": 0.7907804846763611, + "learning_rate": 1.9016514556375846e-05, + "loss": 0.1329, + "step": 11406 + }, + { + "epoch": 0.22816, + "grad_norm": 0.7165326476097107, + "learning_rate": 1.901591063557187e-05, + "loss": 0.1508, + "step": 11408 + }, + { + "epoch": 0.2282, + "grad_norm": 1.263695478439331, + "learning_rate": 1.901530653899807e-05, + "loss": 0.0964, + "step": 11410 + }, + { + "epoch": 0.22824, + "grad_norm": 0.9706540703773499, + "learning_rate": 1.901470226666623e-05, + "loss": 0.1529, + "step": 11412 + }, + { + "epoch": 0.22828, + "grad_norm": 2.3902835845947266, + "learning_rate": 1.901409781858813e-05, + "loss": 0.2144, + "step": 11414 + }, + { + "epoch": 0.22832, + "grad_norm": 3.3923773765563965, + "learning_rate": 1.9013493194775553e-05, + "loss": 0.3809, + "step": 11416 + }, + { + "epoch": 0.22836, + "grad_norm": 1.0327268838882446, + "learning_rate": 1.9012888395240284e-05, + "loss": 0.0638, + "step": 11418 + }, + { + "epoch": 0.2284, + "grad_norm": 1.0249285697937012, + "learning_rate": 1.9012283419994115e-05, + "loss": 0.2106, + "step": 11420 + }, + { + "epoch": 0.22844, + "grad_norm": 3.2629783153533936, + "learning_rate": 1.901167826904884e-05, + "loss": 0.2651, + "step": 11422 + }, + { + "epoch": 0.22848, + "grad_norm": 0.5692722797393799, + "learning_rate": 1.901107294241626e-05, + "loss": 0.1012, + "step": 11424 + }, + { + "epoch": 0.22852, + "grad_norm": 0.923888087272644, + "learning_rate": 1.9010467440108172e-05, + "loss": 0.0391, + "step": 11426 + }, + { + "epoch": 0.22856, + "grad_norm": 3.056412696838379, + "learning_rate": 1.9009861762136386e-05, + "loss": 0.269, + "step": 11428 + }, + { + "epoch": 0.2286, + "grad_norm": 0.8357692956924438, + "learning_rate": 1.9009255908512704e-05, + "loss": 0.1073, + "step": 11430 + }, + { + "epoch": 0.22864, + "grad_norm": 1.7542202472686768, + "learning_rate": 1.9008649879248938e-05, + "loss": 0.0561, + "step": 11432 + }, + { + "epoch": 0.22868, + "grad_norm": 1.5024478435516357, + "learning_rate": 1.9008043674356905e-05, + "loss": 0.2981, + "step": 11434 + }, + { + "epoch": 0.22872, + "grad_norm": 1.1708229780197144, + "learning_rate": 1.9007437293848423e-05, + "loss": 0.1321, + "step": 11436 + }, + { + "epoch": 0.22876, + "grad_norm": 2.0661749839782715, + "learning_rate": 1.9006830737735317e-05, + "loss": 0.212, + "step": 11438 + }, + { + "epoch": 0.2288, + "grad_norm": 0.6110995411872864, + "learning_rate": 1.9006224006029404e-05, + "loss": 0.1739, + "step": 11440 + }, + { + "epoch": 0.22884, + "grad_norm": 2.6450438499450684, + "learning_rate": 1.900561709874252e-05, + "loss": 0.1302, + "step": 11442 + }, + { + "epoch": 0.22888, + "grad_norm": 0.4827612042427063, + "learning_rate": 1.9005010015886495e-05, + "loss": 0.1698, + "step": 11444 + }, + { + "epoch": 0.22892, + "grad_norm": 0.2660389542579651, + "learning_rate": 1.900440275747316e-05, + "loss": 0.0421, + "step": 11446 + }, + { + "epoch": 0.22896, + "grad_norm": 0.921834409236908, + "learning_rate": 1.9003795323514363e-05, + "loss": 0.0529, + "step": 11448 + }, + { + "epoch": 0.229, + "grad_norm": 0.15619871020317078, + "learning_rate": 1.9003187714021936e-05, + "loss": 0.1103, + "step": 11450 + }, + { + "epoch": 0.22904, + "grad_norm": 1.7538018226623535, + "learning_rate": 1.900257992900773e-05, + "loss": 0.1259, + "step": 11452 + }, + { + "epoch": 0.22908, + "grad_norm": 0.638708233833313, + "learning_rate": 1.9001971968483593e-05, + "loss": 0.084, + "step": 11454 + }, + { + "epoch": 0.22912, + "grad_norm": 0.2749435603618622, + "learning_rate": 1.9001363832461385e-05, + "loss": 0.0336, + "step": 11456 + }, + { + "epoch": 0.22916, + "grad_norm": 4.035305976867676, + "learning_rate": 1.9000755520952947e-05, + "loss": 0.2983, + "step": 11458 + }, + { + "epoch": 0.2292, + "grad_norm": 3.702998399734497, + "learning_rate": 1.9000147033970148e-05, + "loss": 0.4738, + "step": 11460 + }, + { + "epoch": 0.22924, + "grad_norm": 0.40122535824775696, + "learning_rate": 1.899953837152485e-05, + "loss": 0.078, + "step": 11462 + }, + { + "epoch": 0.22928, + "grad_norm": 5.087066650390625, + "learning_rate": 1.899892953362892e-05, + "loss": 0.5956, + "step": 11464 + }, + { + "epoch": 0.22932, + "grad_norm": 1.6228909492492676, + "learning_rate": 1.8998320520294222e-05, + "loss": 0.2083, + "step": 11466 + }, + { + "epoch": 0.22936, + "grad_norm": 1.4600801467895508, + "learning_rate": 1.8997711331532632e-05, + "loss": 0.2546, + "step": 11468 + }, + { + "epoch": 0.2294, + "grad_norm": 1.2490167617797852, + "learning_rate": 1.899710196735603e-05, + "loss": 0.0857, + "step": 11470 + }, + { + "epoch": 0.22944, + "grad_norm": 1.2111518383026123, + "learning_rate": 1.8996492427776295e-05, + "loss": 0.059, + "step": 11472 + }, + { + "epoch": 0.22948, + "grad_norm": 2.3969860076904297, + "learning_rate": 1.8995882712805306e-05, + "loss": 0.1533, + "step": 11474 + }, + { + "epoch": 0.22952, + "grad_norm": 1.9080106019973755, + "learning_rate": 1.8995272822454952e-05, + "loss": 0.159, + "step": 11476 + }, + { + "epoch": 0.22956, + "grad_norm": 3.7844252586364746, + "learning_rate": 1.899466275673712e-05, + "loss": 0.2885, + "step": 11478 + }, + { + "epoch": 0.2296, + "grad_norm": 0.2674176096916199, + "learning_rate": 1.899405251566371e-05, + "loss": 0.2599, + "step": 11480 + }, + { + "epoch": 0.22964, + "grad_norm": 4.453584671020508, + "learning_rate": 1.899344209924662e-05, + "loss": 0.5956, + "step": 11482 + }, + { + "epoch": 0.22968, + "grad_norm": 4.1089911460876465, + "learning_rate": 1.899283150749774e-05, + "loss": 0.3113, + "step": 11484 + }, + { + "epoch": 0.22972, + "grad_norm": 0.37550073862075806, + "learning_rate": 1.899222074042898e-05, + "loss": 0.0231, + "step": 11486 + }, + { + "epoch": 0.22976, + "grad_norm": 3.0197410583496094, + "learning_rate": 1.899160979805225e-05, + "loss": 0.2136, + "step": 11488 + }, + { + "epoch": 0.2298, + "grad_norm": 0.17912407219409943, + "learning_rate": 1.8990998680379458e-05, + "loss": 0.0495, + "step": 11490 + }, + { + "epoch": 0.22984, + "grad_norm": 0.38942742347717285, + "learning_rate": 1.8990387387422517e-05, + "loss": 0.0529, + "step": 11492 + }, + { + "epoch": 0.22988, + "grad_norm": 1.5803340673446655, + "learning_rate": 1.8989775919193345e-05, + "loss": 0.1442, + "step": 11494 + }, + { + "epoch": 0.22992, + "grad_norm": 2.3223512172698975, + "learning_rate": 1.8989164275703864e-05, + "loss": 0.1328, + "step": 11496 + }, + { + "epoch": 0.22996, + "grad_norm": 0.6338863372802734, + "learning_rate": 1.8988552456965996e-05, + "loss": 0.1373, + "step": 11498 + }, + { + "epoch": 0.23, + "grad_norm": 2.298713207244873, + "learning_rate": 1.8987940462991673e-05, + "loss": 0.2322, + "step": 11500 + }, + { + "epoch": 0.23004, + "grad_norm": 0.28502610325813293, + "learning_rate": 1.8987328293792822e-05, + "loss": 0.4217, + "step": 11502 + }, + { + "epoch": 0.23008, + "grad_norm": 1.230501651763916, + "learning_rate": 1.8986715949381378e-05, + "loss": 0.105, + "step": 11504 + }, + { + "epoch": 0.23012, + "grad_norm": 3.897822618484497, + "learning_rate": 1.898610342976928e-05, + "loss": 0.7194, + "step": 11506 + }, + { + "epoch": 0.23016, + "grad_norm": 2.4554309844970703, + "learning_rate": 1.8985490734968468e-05, + "loss": 0.2342, + "step": 11508 + }, + { + "epoch": 0.2302, + "grad_norm": 0.4419853091239929, + "learning_rate": 1.8984877864990888e-05, + "loss": 0.1351, + "step": 11510 + }, + { + "epoch": 0.23024, + "grad_norm": 0.9143764972686768, + "learning_rate": 1.8984264819848494e-05, + "loss": 0.0719, + "step": 11512 + }, + { + "epoch": 0.23028, + "grad_norm": 0.756771981716156, + "learning_rate": 1.8983651599553225e-05, + "loss": 0.1497, + "step": 11514 + }, + { + "epoch": 0.23032, + "grad_norm": 1.6774299144744873, + "learning_rate": 1.8983038204117046e-05, + "loss": 0.1212, + "step": 11516 + }, + { + "epoch": 0.23036, + "grad_norm": 1.690508246421814, + "learning_rate": 1.8982424633551912e-05, + "loss": 0.331, + "step": 11518 + }, + { + "epoch": 0.2304, + "grad_norm": 1.1979713439941406, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.1966, + "step": 11520 + }, + { + "epoch": 0.23044, + "grad_norm": 1.3826994895935059, + "learning_rate": 1.898119696708263e-05, + "loss": 0.1304, + "step": 11522 + }, + { + "epoch": 0.23048, + "grad_norm": 0.5034054517745972, + "learning_rate": 1.898058287120242e-05, + "loss": 0.0639, + "step": 11524 + }, + { + "epoch": 0.23052, + "grad_norm": 0.5329388976097107, + "learning_rate": 1.897996860024112e-05, + "loss": 0.2142, + "step": 11526 + }, + { + "epoch": 0.23056, + "grad_norm": 2.203409194946289, + "learning_rate": 1.897935415421071e-05, + "loss": 0.1693, + "step": 11528 + }, + { + "epoch": 0.2306, + "grad_norm": 1.5521546602249146, + "learning_rate": 1.897873953312317e-05, + "loss": 0.1511, + "step": 11530 + }, + { + "epoch": 0.23064, + "grad_norm": 1.6767528057098389, + "learning_rate": 1.897812473699048e-05, + "loss": 0.1319, + "step": 11532 + }, + { + "epoch": 0.23068, + "grad_norm": 0.546278178691864, + "learning_rate": 1.897750976582462e-05, + "loss": 0.1178, + "step": 11534 + }, + { + "epoch": 0.23072, + "grad_norm": 0.3987419903278351, + "learning_rate": 1.897689461963759e-05, + "loss": 0.1671, + "step": 11536 + }, + { + "epoch": 0.23076, + "grad_norm": 0.8949085474014282, + "learning_rate": 1.897627929844138e-05, + "loss": 0.0776, + "step": 11538 + }, + { + "epoch": 0.2308, + "grad_norm": 0.7177824378013611, + "learning_rate": 1.8975663802247978e-05, + "loss": 0.0294, + "step": 11540 + }, + { + "epoch": 0.23084, + "grad_norm": 2.7162728309631348, + "learning_rate": 1.8975048131069392e-05, + "loss": 0.1868, + "step": 11542 + }, + { + "epoch": 0.23088, + "grad_norm": 0.8270533084869385, + "learning_rate": 1.897443228491762e-05, + "loss": 0.0986, + "step": 11544 + }, + { + "epoch": 0.23092, + "grad_norm": 0.7043898105621338, + "learning_rate": 1.8973816263804673e-05, + "loss": 0.0918, + "step": 11546 + }, + { + "epoch": 0.23096, + "grad_norm": 0.3892808258533478, + "learning_rate": 1.8973200067742558e-05, + "loss": 0.0696, + "step": 11548 + }, + { + "epoch": 0.231, + "grad_norm": 2.3132519721984863, + "learning_rate": 1.8972583696743284e-05, + "loss": 0.1792, + "step": 11550 + }, + { + "epoch": 0.23104, + "grad_norm": 2.1713171005249023, + "learning_rate": 1.897196715081888e-05, + "loss": 0.0911, + "step": 11552 + }, + { + "epoch": 0.23108, + "grad_norm": 3.6307599544525146, + "learning_rate": 1.8971350429981347e-05, + "loss": 0.3551, + "step": 11554 + }, + { + "epoch": 0.23112, + "grad_norm": 3.719191312789917, + "learning_rate": 1.8970733534242726e-05, + "loss": 0.7654, + "step": 11556 + }, + { + "epoch": 0.23116, + "grad_norm": 3.734713554382324, + "learning_rate": 1.897011646361503e-05, + "loss": 0.2568, + "step": 11558 + }, + { + "epoch": 0.2312, + "grad_norm": 1.9725539684295654, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.1006, + "step": 11560 + }, + { + "epoch": 0.23124, + "grad_norm": 0.7574959993362427, + "learning_rate": 1.8968881797740565e-05, + "loss": 0.1621, + "step": 11562 + }, + { + "epoch": 0.23128, + "grad_norm": 1.290955662727356, + "learning_rate": 1.896826420251786e-05, + "loss": 0.0666, + "step": 11564 + }, + { + "epoch": 0.23132, + "grad_norm": 0.2665361762046814, + "learning_rate": 1.896764643245423e-05, + "loss": 0.0238, + "step": 11566 + }, + { + "epoch": 0.23136, + "grad_norm": 0.8640188574790955, + "learning_rate": 1.8967028487561712e-05, + "loss": 0.1537, + "step": 11568 + }, + { + "epoch": 0.2314, + "grad_norm": 2.4618077278137207, + "learning_rate": 1.896641036785236e-05, + "loss": 0.3726, + "step": 11570 + }, + { + "epoch": 0.23144, + "grad_norm": 0.5958468317985535, + "learning_rate": 1.8965792073338222e-05, + "loss": 0.0303, + "step": 11572 + }, + { + "epoch": 0.23148, + "grad_norm": 2.5247321128845215, + "learning_rate": 1.8965173604031347e-05, + "loss": 0.1688, + "step": 11574 + }, + { + "epoch": 0.23152, + "grad_norm": 0.23278816044330597, + "learning_rate": 1.8964554959943803e-05, + "loss": 0.0419, + "step": 11576 + }, + { + "epoch": 0.23156, + "grad_norm": 4.501829624176025, + "learning_rate": 1.8963936141087644e-05, + "loss": 0.5373, + "step": 11578 + }, + { + "epoch": 0.2316, + "grad_norm": 1.7148834466934204, + "learning_rate": 1.896331714747493e-05, + "loss": 0.3463, + "step": 11580 + }, + { + "epoch": 0.23164, + "grad_norm": 1.1532108783721924, + "learning_rate": 1.896269797911774e-05, + "loss": 0.0893, + "step": 11582 + }, + { + "epoch": 0.23168, + "grad_norm": 1.20183527469635, + "learning_rate": 1.8962078636028135e-05, + "loss": 0.0853, + "step": 11584 + }, + { + "epoch": 0.23172, + "grad_norm": 2.7002289295196533, + "learning_rate": 1.8961459118218192e-05, + "loss": 0.1743, + "step": 11586 + }, + { + "epoch": 0.23176, + "grad_norm": 2.379805564880371, + "learning_rate": 1.8960839425699992e-05, + "loss": 0.1202, + "step": 11588 + }, + { + "epoch": 0.2318, + "grad_norm": 0.5738748908042908, + "learning_rate": 1.896021955848561e-05, + "loss": 0.1326, + "step": 11590 + }, + { + "epoch": 0.23184, + "grad_norm": 3.8192758560180664, + "learning_rate": 1.895959951658714e-05, + "loss": 0.309, + "step": 11592 + }, + { + "epoch": 0.23188, + "grad_norm": 2.0163633823394775, + "learning_rate": 1.8958979300016663e-05, + "loss": 0.1585, + "step": 11594 + }, + { + "epoch": 0.23192, + "grad_norm": 2.1653387546539307, + "learning_rate": 1.895835890878627e-05, + "loss": 0.0978, + "step": 11596 + }, + { + "epoch": 0.23196, + "grad_norm": 0.2323293685913086, + "learning_rate": 1.8957738342908062e-05, + "loss": 0.0154, + "step": 11598 + }, + { + "epoch": 0.232, + "grad_norm": 0.07613763958215714, + "learning_rate": 1.895711760239413e-05, + "loss": 0.3573, + "step": 11600 + }, + { + "epoch": 0.23204, + "grad_norm": 4.91471529006958, + "learning_rate": 1.895649668725658e-05, + "loss": 0.8229, + "step": 11602 + }, + { + "epoch": 0.23208, + "grad_norm": 0.40124985575675964, + "learning_rate": 1.8955875597507515e-05, + "loss": 0.0273, + "step": 11604 + }, + { + "epoch": 0.23212, + "grad_norm": 2.4732465744018555, + "learning_rate": 1.8955254333159046e-05, + "loss": 0.1397, + "step": 11606 + }, + { + "epoch": 0.23216, + "grad_norm": 2.7871246337890625, + "learning_rate": 1.8954632894223285e-05, + "loss": 0.3817, + "step": 11608 + }, + { + "epoch": 0.2322, + "grad_norm": 0.3384227156639099, + "learning_rate": 1.895401128071234e-05, + "loss": 0.2341, + "step": 11610 + }, + { + "epoch": 0.23224, + "grad_norm": 0.2241961508989334, + "learning_rate": 1.895338949263834e-05, + "loss": 0.1153, + "step": 11612 + }, + { + "epoch": 0.23228, + "grad_norm": 1.741316318511963, + "learning_rate": 1.89527675300134e-05, + "loss": 0.2818, + "step": 11614 + }, + { + "epoch": 0.23232, + "grad_norm": 1.698008418083191, + "learning_rate": 1.895214539284965e-05, + "loss": 0.1739, + "step": 11616 + }, + { + "epoch": 0.23236, + "grad_norm": 3.1888890266418457, + "learning_rate": 1.8951523081159213e-05, + "loss": 0.2776, + "step": 11618 + }, + { + "epoch": 0.2324, + "grad_norm": 0.7386499643325806, + "learning_rate": 1.8950900594954226e-05, + "loss": 0.0904, + "step": 11620 + }, + { + "epoch": 0.23244, + "grad_norm": 1.8908982276916504, + "learning_rate": 1.8950277934246828e-05, + "loss": 0.1299, + "step": 11622 + }, + { + "epoch": 0.23248, + "grad_norm": 2.651435375213623, + "learning_rate": 1.894965509904915e-05, + "loss": 0.2558, + "step": 11624 + }, + { + "epoch": 0.23252, + "grad_norm": 2.3313465118408203, + "learning_rate": 1.894903208937334e-05, + "loss": 0.2217, + "step": 11626 + }, + { + "epoch": 0.23256, + "grad_norm": 1.952318787574768, + "learning_rate": 1.894840890523154e-05, + "loss": 0.3257, + "step": 11628 + }, + { + "epoch": 0.2326, + "grad_norm": 0.6608715057373047, + "learning_rate": 1.8947785546635905e-05, + "loss": 0.3352, + "step": 11630 + }, + { + "epoch": 0.23264, + "grad_norm": 0.6955599188804626, + "learning_rate": 1.894716201359858e-05, + "loss": 0.0475, + "step": 11632 + }, + { + "epoch": 0.23268, + "grad_norm": 1.6759915351867676, + "learning_rate": 1.8946538306131727e-05, + "loss": 0.2899, + "step": 11634 + }, + { + "epoch": 0.23272, + "grad_norm": 2.6271135807037354, + "learning_rate": 1.8945914424247503e-05, + "loss": 0.2215, + "step": 11636 + }, + { + "epoch": 0.23276, + "grad_norm": 2.812781572341919, + "learning_rate": 1.8945290367958074e-05, + "loss": 0.3219, + "step": 11638 + }, + { + "epoch": 0.2328, + "grad_norm": 2.0311622619628906, + "learning_rate": 1.89446661372756e-05, + "loss": 0.2769, + "step": 11640 + }, + { + "epoch": 0.23284, + "grad_norm": 1.5521637201309204, + "learning_rate": 1.8944041732212257e-05, + "loss": 0.1351, + "step": 11642 + }, + { + "epoch": 0.23288, + "grad_norm": 1.5151618719100952, + "learning_rate": 1.8943417152780216e-05, + "loss": 0.1988, + "step": 11644 + }, + { + "epoch": 0.23292, + "grad_norm": 0.5608150959014893, + "learning_rate": 1.894279239899165e-05, + "loss": 0.3285, + "step": 11646 + }, + { + "epoch": 0.23296, + "grad_norm": 3.2169206142425537, + "learning_rate": 1.8942167470858747e-05, + "loss": 0.2681, + "step": 11648 + }, + { + "epoch": 0.233, + "grad_norm": 0.5539841055870056, + "learning_rate": 1.8941542368393683e-05, + "loss": 0.1432, + "step": 11650 + }, + { + "epoch": 0.23304, + "grad_norm": 0.6651888489723206, + "learning_rate": 1.894091709160865e-05, + "loss": 0.14, + "step": 11652 + }, + { + "epoch": 0.23308, + "grad_norm": 0.1168389692902565, + "learning_rate": 1.894029164051583e-05, + "loss": 0.0695, + "step": 11654 + }, + { + "epoch": 0.23312, + "grad_norm": 2.3845059871673584, + "learning_rate": 1.8939666015127424e-05, + "loss": 0.4222, + "step": 11656 + }, + { + "epoch": 0.23316, + "grad_norm": 0.5572409629821777, + "learning_rate": 1.893904021545563e-05, + "loss": 0.3113, + "step": 11658 + }, + { + "epoch": 0.2332, + "grad_norm": 2.8725368976593018, + "learning_rate": 1.893841424151264e-05, + "loss": 0.2562, + "step": 11660 + }, + { + "epoch": 0.23324, + "grad_norm": 1.3066320419311523, + "learning_rate": 1.8937788093310665e-05, + "loss": 0.1597, + "step": 11662 + }, + { + "epoch": 0.23328, + "grad_norm": 1.5969045162200928, + "learning_rate": 1.8937161770861906e-05, + "loss": 0.1534, + "step": 11664 + }, + { + "epoch": 0.23332, + "grad_norm": 1.062953233718872, + "learning_rate": 1.893653527417858e-05, + "loss": 0.1645, + "step": 11666 + }, + { + "epoch": 0.23336, + "grad_norm": 1.3439823389053345, + "learning_rate": 1.8935908603272902e-05, + "loss": 0.0864, + "step": 11668 + }, + { + "epoch": 0.2334, + "grad_norm": 1.8335679769515991, + "learning_rate": 1.893528175815708e-05, + "loss": 0.1804, + "step": 11670 + }, + { + "epoch": 0.23344, + "grad_norm": 1.0416370630264282, + "learning_rate": 1.893465473884334e-05, + "loss": 0.2632, + "step": 11672 + }, + { + "epoch": 0.23348, + "grad_norm": 1.5643972158432007, + "learning_rate": 1.8934027545343907e-05, + "loss": 0.1462, + "step": 11674 + }, + { + "epoch": 0.23352, + "grad_norm": 1.7513236999511719, + "learning_rate": 1.893340017767101e-05, + "loss": 0.0899, + "step": 11676 + }, + { + "epoch": 0.23356, + "grad_norm": 3.0992441177368164, + "learning_rate": 1.8932772635836872e-05, + "loss": 0.3033, + "step": 11678 + }, + { + "epoch": 0.2336, + "grad_norm": 0.45053988695144653, + "learning_rate": 1.893214491985374e-05, + "loss": 0.1008, + "step": 11680 + }, + { + "epoch": 0.23364, + "grad_norm": 1.7398924827575684, + "learning_rate": 1.8931517029733838e-05, + "loss": 0.0962, + "step": 11682 + }, + { + "epoch": 0.23368, + "grad_norm": 0.4764072895050049, + "learning_rate": 1.8930888965489416e-05, + "loss": 0.1713, + "step": 11684 + }, + { + "epoch": 0.23372, + "grad_norm": 2.091444253921509, + "learning_rate": 1.8930260727132714e-05, + "loss": 0.2329, + "step": 11686 + }, + { + "epoch": 0.23376, + "grad_norm": 0.4819694459438324, + "learning_rate": 1.8929632314675982e-05, + "loss": 0.0483, + "step": 11688 + }, + { + "epoch": 0.2338, + "grad_norm": 0.6249200105667114, + "learning_rate": 1.892900372813147e-05, + "loss": 0.0828, + "step": 11690 + }, + { + "epoch": 0.23384, + "grad_norm": 2.774423599243164, + "learning_rate": 1.8928374967511436e-05, + "loss": 0.2539, + "step": 11692 + }, + { + "epoch": 0.23388, + "grad_norm": 0.6438697576522827, + "learning_rate": 1.892774603282813e-05, + "loss": 0.0522, + "step": 11694 + }, + { + "epoch": 0.23392, + "grad_norm": 2.4624264240264893, + "learning_rate": 1.8927116924093824e-05, + "loss": 0.1917, + "step": 11696 + }, + { + "epoch": 0.23396, + "grad_norm": 0.22167065739631653, + "learning_rate": 1.8926487641320778e-05, + "loss": 0.186, + "step": 11698 + }, + { + "epoch": 0.234, + "grad_norm": 1.102869987487793, + "learning_rate": 1.892585818452126e-05, + "loss": 0.0954, + "step": 11700 + }, + { + "epoch": 0.23404, + "grad_norm": 0.48890170454978943, + "learning_rate": 1.8925228553707534e-05, + "loss": 0.125, + "step": 11702 + }, + { + "epoch": 0.23408, + "grad_norm": 4.21516227722168, + "learning_rate": 1.8924598748891888e-05, + "loss": 0.3976, + "step": 11704 + }, + { + "epoch": 0.23412, + "grad_norm": 2.6281309127807617, + "learning_rate": 1.8923968770086593e-05, + "loss": 0.1556, + "step": 11706 + }, + { + "epoch": 0.23416, + "grad_norm": 2.745230197906494, + "learning_rate": 1.8923338617303937e-05, + "loss": 0.3943, + "step": 11708 + }, + { + "epoch": 0.2342, + "grad_norm": 1.0482754707336426, + "learning_rate": 1.8922708290556197e-05, + "loss": 0.0679, + "step": 11710 + }, + { + "epoch": 0.23424, + "grad_norm": 1.765434980392456, + "learning_rate": 1.8922077789855665e-05, + "loss": 0.1533, + "step": 11712 + }, + { + "epoch": 0.23428, + "grad_norm": 2.2271358966827393, + "learning_rate": 1.8921447115214634e-05, + "loss": 0.1921, + "step": 11714 + }, + { + "epoch": 0.23432, + "grad_norm": 0.6302723288536072, + "learning_rate": 1.8920816266645396e-05, + "loss": 0.3459, + "step": 11716 + }, + { + "epoch": 0.23436, + "grad_norm": 2.0706052780151367, + "learning_rate": 1.8920185244160257e-05, + "loss": 0.1098, + "step": 11718 + }, + { + "epoch": 0.2344, + "grad_norm": 2.548400640487671, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.2291, + "step": 11720 + }, + { + "epoch": 0.23444, + "grad_norm": 0.34494301676750183, + "learning_rate": 1.8918922677491466e-05, + "loss": 0.0612, + "step": 11722 + }, + { + "epoch": 0.23448, + "grad_norm": 2.018221139907837, + "learning_rate": 1.8918291133332432e-05, + "loss": 0.1182, + "step": 11724 + }, + { + "epoch": 0.23452, + "grad_norm": 3.343844175338745, + "learning_rate": 1.8917659415306723e-05, + "loss": 0.4055, + "step": 11726 + }, + { + "epoch": 0.23456, + "grad_norm": 0.5177163481712341, + "learning_rate": 1.8917027523426648e-05, + "loss": 0.0513, + "step": 11728 + }, + { + "epoch": 0.2346, + "grad_norm": 2.6123530864715576, + "learning_rate": 1.8916395457704536e-05, + "loss": 0.1695, + "step": 11730 + }, + { + "epoch": 0.23464, + "grad_norm": 0.8691853284835815, + "learning_rate": 1.8915763218152704e-05, + "loss": 0.395, + "step": 11732 + }, + { + "epoch": 0.23468, + "grad_norm": 0.3831193149089813, + "learning_rate": 1.8915130804783476e-05, + "loss": 0.1916, + "step": 11734 + }, + { + "epoch": 0.23472, + "grad_norm": 1.9561011791229248, + "learning_rate": 1.891449821760918e-05, + "loss": 0.1265, + "step": 11736 + }, + { + "epoch": 0.23476, + "grad_norm": 1.8861726522445679, + "learning_rate": 1.8913865456642156e-05, + "loss": 0.1334, + "step": 11738 + }, + { + "epoch": 0.2348, + "grad_norm": 0.9033575057983398, + "learning_rate": 1.8913232521894734e-05, + "loss": 0.1402, + "step": 11740 + }, + { + "epoch": 0.23484, + "grad_norm": 1.6448909044265747, + "learning_rate": 1.8912599413379254e-05, + "loss": 0.1672, + "step": 11742 + }, + { + "epoch": 0.23488, + "grad_norm": 0.7766230702400208, + "learning_rate": 1.8911966131108064e-05, + "loss": 0.2384, + "step": 11744 + }, + { + "epoch": 0.23492, + "grad_norm": 0.5930387377738953, + "learning_rate": 1.8911332675093503e-05, + "loss": 0.1011, + "step": 11746 + }, + { + "epoch": 0.23496, + "grad_norm": 3.1305904388427734, + "learning_rate": 1.8910699045347924e-05, + "loss": 0.2651, + "step": 11748 + }, + { + "epoch": 0.235, + "grad_norm": 1.8192473649978638, + "learning_rate": 1.891006524188368e-05, + "loss": 0.1587, + "step": 11750 + }, + { + "epoch": 0.23504, + "grad_norm": 2.344914674758911, + "learning_rate": 1.8909431264713128e-05, + "loss": 0.2764, + "step": 11752 + }, + { + "epoch": 0.23508, + "grad_norm": 1.4157822132110596, + "learning_rate": 1.8908797113848624e-05, + "loss": 0.1268, + "step": 11754 + }, + { + "epoch": 0.23512, + "grad_norm": 0.12594586610794067, + "learning_rate": 1.8908162789302535e-05, + "loss": 0.0428, + "step": 11756 + }, + { + "epoch": 0.23516, + "grad_norm": 0.18962782621383667, + "learning_rate": 1.8907528291087228e-05, + "loss": 0.0224, + "step": 11758 + }, + { + "epoch": 0.2352, + "grad_norm": 2.6061489582061768, + "learning_rate": 1.890689361921507e-05, + "loss": 0.5874, + "step": 11760 + }, + { + "epoch": 0.23524, + "grad_norm": 2.487685203552246, + "learning_rate": 1.890625877369843e-05, + "loss": 0.2821, + "step": 11762 + }, + { + "epoch": 0.23528, + "grad_norm": 2.938666343688965, + "learning_rate": 1.8905623754549696e-05, + "loss": 0.2764, + "step": 11764 + }, + { + "epoch": 0.23532, + "grad_norm": 2.1512186527252197, + "learning_rate": 1.8904988561781243e-05, + "loss": 0.136, + "step": 11766 + }, + { + "epoch": 0.23536, + "grad_norm": 3.8247568607330322, + "learning_rate": 1.890435319540545e-05, + "loss": 0.3714, + "step": 11768 + }, + { + "epoch": 0.2354, + "grad_norm": 0.7972317934036255, + "learning_rate": 1.8903717655434708e-05, + "loss": 0.2202, + "step": 11770 + }, + { + "epoch": 0.23544, + "grad_norm": 3.1187400817871094, + "learning_rate": 1.8903081941881406e-05, + "loss": 0.3837, + "step": 11772 + }, + { + "epoch": 0.23548, + "grad_norm": 2.939324140548706, + "learning_rate": 1.890244605475794e-05, + "loss": 0.2933, + "step": 11774 + }, + { + "epoch": 0.23552, + "grad_norm": 2.257091999053955, + "learning_rate": 1.8901809994076702e-05, + "loss": 0.178, + "step": 11776 + }, + { + "epoch": 0.23556, + "grad_norm": 2.315429210662842, + "learning_rate": 1.8901173759850096e-05, + "loss": 0.2185, + "step": 11778 + }, + { + "epoch": 0.2356, + "grad_norm": 2.1110424995422363, + "learning_rate": 1.8900537352090523e-05, + "loss": 0.2152, + "step": 11780 + }, + { + "epoch": 0.23564, + "grad_norm": 0.8713079690933228, + "learning_rate": 1.88999007708104e-05, + "loss": 0.0958, + "step": 11782 + }, + { + "epoch": 0.23568, + "grad_norm": 1.3948969841003418, + "learning_rate": 1.889926401602212e-05, + "loss": 0.356, + "step": 11784 + }, + { + "epoch": 0.23572, + "grad_norm": 1.0065114498138428, + "learning_rate": 1.889862708773811e-05, + "loss": 0.3259, + "step": 11786 + }, + { + "epoch": 0.23576, + "grad_norm": 1.3509011268615723, + "learning_rate": 1.8897989985970787e-05, + "loss": 0.1394, + "step": 11788 + }, + { + "epoch": 0.2358, + "grad_norm": 2.79736328125, + "learning_rate": 1.8897352710732564e-05, + "loss": 0.4575, + "step": 11790 + }, + { + "epoch": 0.23584, + "grad_norm": 0.9658661484718323, + "learning_rate": 1.889671526203587e-05, + "loss": 0.0898, + "step": 11792 + }, + { + "epoch": 0.23588, + "grad_norm": 1.9837185144424438, + "learning_rate": 1.889607763989313e-05, + "loss": 0.2223, + "step": 11794 + }, + { + "epoch": 0.23592, + "grad_norm": 1.4389899969100952, + "learning_rate": 1.8895439844316782e-05, + "loss": 0.3307, + "step": 11796 + }, + { + "epoch": 0.23596, + "grad_norm": 1.571544885635376, + "learning_rate": 1.889480187531925e-05, + "loss": 0.2563, + "step": 11798 + }, + { + "epoch": 0.236, + "grad_norm": 1.0553046464920044, + "learning_rate": 1.889416373291298e-05, + "loss": 0.203, + "step": 11800 + }, + { + "epoch": 0.23604, + "grad_norm": 1.2572360038757324, + "learning_rate": 1.8893525417110404e-05, + "loss": 0.1748, + "step": 11802 + }, + { + "epoch": 0.23608, + "grad_norm": 2.4639835357666016, + "learning_rate": 1.8892886927923972e-05, + "loss": 0.2101, + "step": 11804 + }, + { + "epoch": 0.23612, + "grad_norm": 1.7414278984069824, + "learning_rate": 1.8892248265366132e-05, + "loss": 0.1843, + "step": 11806 + }, + { + "epoch": 0.23616, + "grad_norm": 0.6302216649055481, + "learning_rate": 1.8891609429449335e-05, + "loss": 0.0771, + "step": 11808 + }, + { + "epoch": 0.2362, + "grad_norm": 0.7479552030563354, + "learning_rate": 1.8890970420186035e-05, + "loss": 0.2322, + "step": 11810 + }, + { + "epoch": 0.23624, + "grad_norm": 1.7754671573638916, + "learning_rate": 1.8890331237588685e-05, + "loss": 0.1267, + "step": 11812 + }, + { + "epoch": 0.23628, + "grad_norm": 0.6762409806251526, + "learning_rate": 1.8889691881669753e-05, + "loss": 0.0508, + "step": 11814 + }, + { + "epoch": 0.23632, + "grad_norm": 2.1469180583953857, + "learning_rate": 1.8889052352441702e-05, + "loss": 0.1502, + "step": 11816 + }, + { + "epoch": 0.23636, + "grad_norm": 0.7065137028694153, + "learning_rate": 1.8888412649917e-05, + "loss": 0.1431, + "step": 11818 + }, + { + "epoch": 0.2364, + "grad_norm": 0.8220123648643494, + "learning_rate": 1.8887772774108116e-05, + "loss": 0.282, + "step": 11820 + }, + { + "epoch": 0.23644, + "grad_norm": 2.5626378059387207, + "learning_rate": 1.8887132725027526e-05, + "loss": 0.2495, + "step": 11822 + }, + { + "epoch": 0.23648, + "grad_norm": 0.5260202884674072, + "learning_rate": 1.888649250268771e-05, + "loss": 0.0954, + "step": 11824 + }, + { + "epoch": 0.23652, + "grad_norm": 1.4833420515060425, + "learning_rate": 1.8885852107101143e-05, + "loss": 0.2103, + "step": 11826 + }, + { + "epoch": 0.23656, + "grad_norm": 0.8196833729743958, + "learning_rate": 1.8885211538280317e-05, + "loss": 0.1803, + "step": 11828 + }, + { + "epoch": 0.2366, + "grad_norm": 2.393211603164673, + "learning_rate": 1.888457079623772e-05, + "loss": 0.1646, + "step": 11830 + }, + { + "epoch": 0.23664, + "grad_norm": 1.449699878692627, + "learning_rate": 1.888392988098584e-05, + "loss": 0.1122, + "step": 11832 + }, + { + "epoch": 0.23668, + "grad_norm": 1.0129352807998657, + "learning_rate": 1.888328879253717e-05, + "loss": 0.0963, + "step": 11834 + }, + { + "epoch": 0.23672, + "grad_norm": 1.3207933902740479, + "learning_rate": 1.8882647530904215e-05, + "loss": 0.126, + "step": 11836 + }, + { + "epoch": 0.23676, + "grad_norm": 1.6519392728805542, + "learning_rate": 1.888200609609947e-05, + "loss": 0.1256, + "step": 11838 + }, + { + "epoch": 0.2368, + "grad_norm": 0.8032995462417603, + "learning_rate": 1.8881364488135448e-05, + "loss": 0.1134, + "step": 11840 + }, + { + "epoch": 0.23684, + "grad_norm": 0.7004927396774292, + "learning_rate": 1.888072270702465e-05, + "loss": 0.1683, + "step": 11842 + }, + { + "epoch": 0.23688, + "grad_norm": 1.2795138359069824, + "learning_rate": 1.888008075277959e-05, + "loss": 0.095, + "step": 11844 + }, + { + "epoch": 0.23692, + "grad_norm": 0.18568472564220428, + "learning_rate": 1.8879438625412785e-05, + "loss": 0.6977, + "step": 11846 + }, + { + "epoch": 0.23696, + "grad_norm": 0.8328781723976135, + "learning_rate": 1.8878796324936752e-05, + "loss": 0.1139, + "step": 11848 + }, + { + "epoch": 0.237, + "grad_norm": 2.910402536392212, + "learning_rate": 1.8878153851364013e-05, + "loss": 0.2811, + "step": 11850 + }, + { + "epoch": 0.23704, + "grad_norm": 0.4376257359981537, + "learning_rate": 1.8877511204707098e-05, + "loss": 0.0663, + "step": 11852 + }, + { + "epoch": 0.23708, + "grad_norm": 1.729724407196045, + "learning_rate": 1.8876868384978526e-05, + "loss": 0.3067, + "step": 11854 + }, + { + "epoch": 0.23712, + "grad_norm": 0.908735454082489, + "learning_rate": 1.887622539219084e-05, + "loss": 0.3205, + "step": 11856 + }, + { + "epoch": 0.23716, + "grad_norm": 3.0754384994506836, + "learning_rate": 1.8875582226356565e-05, + "loss": 0.3032, + "step": 11858 + }, + { + "epoch": 0.2372, + "grad_norm": 1.455873727798462, + "learning_rate": 1.887493888748825e-05, + "loss": 0.1267, + "step": 11860 + }, + { + "epoch": 0.23724, + "grad_norm": 2.1937451362609863, + "learning_rate": 1.8874295375598427e-05, + "loss": 0.1084, + "step": 11862 + }, + { + "epoch": 0.23728, + "grad_norm": 2.324406147003174, + "learning_rate": 1.8873651690699652e-05, + "loss": 0.1594, + "step": 11864 + }, + { + "epoch": 0.23732, + "grad_norm": 0.8838011622428894, + "learning_rate": 1.8873007832804467e-05, + "loss": 0.1319, + "step": 11866 + }, + { + "epoch": 0.23736, + "grad_norm": 0.5748299956321716, + "learning_rate": 1.8872363801925425e-05, + "loss": 0.0316, + "step": 11868 + }, + { + "epoch": 0.2374, + "grad_norm": 1.3283421993255615, + "learning_rate": 1.8871719598075083e-05, + "loss": 0.2919, + "step": 11870 + }, + { + "epoch": 0.23744, + "grad_norm": 0.30415892601013184, + "learning_rate": 1.8871075221266003e-05, + "loss": 0.1416, + "step": 11872 + }, + { + "epoch": 0.23748, + "grad_norm": 0.21463800966739655, + "learning_rate": 1.887043067151074e-05, + "loss": 0.0732, + "step": 11874 + }, + { + "epoch": 0.23752, + "grad_norm": 0.9068067073822021, + "learning_rate": 1.8869785948821865e-05, + "loss": 0.0592, + "step": 11876 + }, + { + "epoch": 0.23756, + "grad_norm": 0.8894368410110474, + "learning_rate": 1.886914105321195e-05, + "loss": 0.0996, + "step": 11878 + }, + { + "epoch": 0.2376, + "grad_norm": 0.7509071826934814, + "learning_rate": 1.886849598469356e-05, + "loss": 0.2724, + "step": 11880 + }, + { + "epoch": 0.23764, + "grad_norm": 3.187197685241699, + "learning_rate": 1.8867850743279278e-05, + "loss": 0.3255, + "step": 11882 + }, + { + "epoch": 0.23768, + "grad_norm": 3.8357512950897217, + "learning_rate": 1.8867205328981676e-05, + "loss": 0.3939, + "step": 11884 + }, + { + "epoch": 0.23772, + "grad_norm": 1.030015230178833, + "learning_rate": 1.8866559741813347e-05, + "loss": 0.1245, + "step": 11886 + }, + { + "epoch": 0.23776, + "grad_norm": 0.9739739298820496, + "learning_rate": 1.8865913981786867e-05, + "loss": 0.3033, + "step": 11888 + }, + { + "epoch": 0.2378, + "grad_norm": 0.7096403241157532, + "learning_rate": 1.8865268048914828e-05, + "loss": 0.0514, + "step": 11890 + }, + { + "epoch": 0.23784, + "grad_norm": 2.203615665435791, + "learning_rate": 1.8864621943209828e-05, + "loss": 0.203, + "step": 11892 + }, + { + "epoch": 0.23788, + "grad_norm": 0.45405516028404236, + "learning_rate": 1.8863975664684456e-05, + "loss": 0.0334, + "step": 11894 + }, + { + "epoch": 0.23792, + "grad_norm": 1.470302939414978, + "learning_rate": 1.8863329213351318e-05, + "loss": 0.1513, + "step": 11896 + }, + { + "epoch": 0.23796, + "grad_norm": 0.31484782695770264, + "learning_rate": 1.8862682589223012e-05, + "loss": 0.0419, + "step": 11898 + }, + { + "epoch": 0.238, + "grad_norm": 1.0176615715026855, + "learning_rate": 1.8862035792312148e-05, + "loss": 0.0908, + "step": 11900 + }, + { + "epoch": 0.23804, + "grad_norm": 0.9435116648674011, + "learning_rate": 1.8861388822631333e-05, + "loss": 0.3787, + "step": 11902 + }, + { + "epoch": 0.23808, + "grad_norm": 0.4528171420097351, + "learning_rate": 1.886074168019318e-05, + "loss": 0.0983, + "step": 11904 + }, + { + "epoch": 0.23812, + "grad_norm": 1.9456427097320557, + "learning_rate": 1.8860094365010305e-05, + "loss": 0.1645, + "step": 11906 + }, + { + "epoch": 0.23816, + "grad_norm": 2.6327192783355713, + "learning_rate": 1.8859446877095333e-05, + "loss": 0.276, + "step": 11908 + }, + { + "epoch": 0.2382, + "grad_norm": 3.4247488975524902, + "learning_rate": 1.8858799216460883e-05, + "loss": 0.2775, + "step": 11910 + }, + { + "epoch": 0.23824, + "grad_norm": 1.3169310092926025, + "learning_rate": 1.8858151383119576e-05, + "loss": 0.0951, + "step": 11912 + }, + { + "epoch": 0.23828, + "grad_norm": 0.8696900606155396, + "learning_rate": 1.885750337708405e-05, + "loss": 0.1266, + "step": 11914 + }, + { + "epoch": 0.23832, + "grad_norm": 3.200303554534912, + "learning_rate": 1.8856855198366933e-05, + "loss": 0.4051, + "step": 11916 + }, + { + "epoch": 0.23836, + "grad_norm": 1.4144835472106934, + "learning_rate": 1.885620684698087e-05, + "loss": 0.086, + "step": 11918 + }, + { + "epoch": 0.2384, + "grad_norm": 2.875854730606079, + "learning_rate": 1.8855558322938492e-05, + "loss": 0.2545, + "step": 11920 + }, + { + "epoch": 0.23844, + "grad_norm": 0.6084136366844177, + "learning_rate": 1.8854909626252443e-05, + "loss": 0.074, + "step": 11922 + }, + { + "epoch": 0.23848, + "grad_norm": 0.7811341285705566, + "learning_rate": 1.8854260756935378e-05, + "loss": 0.1496, + "step": 11924 + }, + { + "epoch": 0.23852, + "grad_norm": 3.512477397918701, + "learning_rate": 1.8853611714999937e-05, + "loss": 0.5177, + "step": 11926 + }, + { + "epoch": 0.23856, + "grad_norm": 0.1555698961019516, + "learning_rate": 1.8852962500458775e-05, + "loss": 0.1559, + "step": 11928 + }, + { + "epoch": 0.2386, + "grad_norm": 1.16937255859375, + "learning_rate": 1.8852313113324553e-05, + "loss": 0.0774, + "step": 11930 + }, + { + "epoch": 0.23864, + "grad_norm": 1.7311084270477295, + "learning_rate": 1.8851663553609933e-05, + "loss": 0.1594, + "step": 11932 + }, + { + "epoch": 0.23868, + "grad_norm": 2.8562707901000977, + "learning_rate": 1.885101382132757e-05, + "loss": 0.4551, + "step": 11934 + }, + { + "epoch": 0.23872, + "grad_norm": 1.4045485258102417, + "learning_rate": 1.8850363916490137e-05, + "loss": 0.1984, + "step": 11936 + }, + { + "epoch": 0.23876, + "grad_norm": 1.090308427810669, + "learning_rate": 1.8849713839110305e-05, + "loss": 0.0854, + "step": 11938 + }, + { + "epoch": 0.2388, + "grad_norm": 2.860560178756714, + "learning_rate": 1.8849063589200744e-05, + "loss": 0.1911, + "step": 11940 + }, + { + "epoch": 0.23884, + "grad_norm": 2.258159875869751, + "learning_rate": 1.8848413166774133e-05, + "loss": 0.2344, + "step": 11942 + }, + { + "epoch": 0.23888, + "grad_norm": 2.4401731491088867, + "learning_rate": 1.8847762571843153e-05, + "loss": 0.2891, + "step": 11944 + }, + { + "epoch": 0.23892, + "grad_norm": 3.4530749320983887, + "learning_rate": 1.8847111804420483e-05, + "loss": 0.2166, + "step": 11946 + }, + { + "epoch": 0.23896, + "grad_norm": 0.5670152902603149, + "learning_rate": 1.8846460864518818e-05, + "loss": 0.0715, + "step": 11948 + }, + { + "epoch": 0.239, + "grad_norm": 1.886427879333496, + "learning_rate": 1.884580975215084e-05, + "loss": 0.2229, + "step": 11950 + }, + { + "epoch": 0.23904, + "grad_norm": 1.468625783920288, + "learning_rate": 1.8845158467329248e-05, + "loss": 0.168, + "step": 11952 + }, + { + "epoch": 0.23908, + "grad_norm": 2.594583749771118, + "learning_rate": 1.884450701006674e-05, + "loss": 0.3006, + "step": 11954 + }, + { + "epoch": 0.23912, + "grad_norm": 1.6673939228057861, + "learning_rate": 1.8843855380376013e-05, + "loss": 0.1115, + "step": 11956 + }, + { + "epoch": 0.23916, + "grad_norm": 0.3729318380355835, + "learning_rate": 1.8843203578269774e-05, + "loss": 0.1208, + "step": 11958 + }, + { + "epoch": 0.2392, + "grad_norm": 1.928885817527771, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.1361, + "step": 11960 + }, + { + "epoch": 0.23924, + "grad_norm": 0.8988696932792664, + "learning_rate": 1.884189945686158e-05, + "loss": 0.1025, + "step": 11962 + }, + { + "epoch": 0.23928, + "grad_norm": 2.316962957382202, + "learning_rate": 1.8841247137585056e-05, + "loss": 0.2228, + "step": 11964 + }, + { + "epoch": 0.23932, + "grad_norm": 1.2622394561767578, + "learning_rate": 1.8840594645943867e-05, + "loss": 0.1683, + "step": 11966 + }, + { + "epoch": 0.23936, + "grad_norm": 0.7887532114982605, + "learning_rate": 1.883994198195073e-05, + "loss": 0.0863, + "step": 11968 + }, + { + "epoch": 0.2394, + "grad_norm": 1.2006763219833374, + "learning_rate": 1.8839289145618378e-05, + "loss": 0.0895, + "step": 11970 + }, + { + "epoch": 0.23944, + "grad_norm": 2.014073610305786, + "learning_rate": 1.883863613695953e-05, + "loss": 0.5652, + "step": 11972 + }, + { + "epoch": 0.23948, + "grad_norm": 1.2073639631271362, + "learning_rate": 1.8837982955986923e-05, + "loss": 0.2545, + "step": 11974 + }, + { + "epoch": 0.23952, + "grad_norm": 1.4801537990570068, + "learning_rate": 1.8837329602713286e-05, + "loss": 0.1522, + "step": 11976 + }, + { + "epoch": 0.23956, + "grad_norm": 0.9023943543434143, + "learning_rate": 1.8836676077151355e-05, + "loss": 0.107, + "step": 11978 + }, + { + "epoch": 0.2396, + "grad_norm": 0.5916066765785217, + "learning_rate": 1.8836022379313884e-05, + "loss": 0.0617, + "step": 11980 + }, + { + "epoch": 0.23964, + "grad_norm": 2.68129563331604, + "learning_rate": 1.88353685092136e-05, + "loss": 0.1684, + "step": 11982 + }, + { + "epoch": 0.23968, + "grad_norm": 3.2439167499542236, + "learning_rate": 1.883471446686326e-05, + "loss": 0.1947, + "step": 11984 + }, + { + "epoch": 0.23972, + "grad_norm": 1.7549575567245483, + "learning_rate": 1.8834060252275614e-05, + "loss": 0.3065, + "step": 11986 + }, + { + "epoch": 0.23976, + "grad_norm": 1.5792620182037354, + "learning_rate": 1.8833405865463414e-05, + "loss": 0.0853, + "step": 11988 + }, + { + "epoch": 0.2398, + "grad_norm": 0.9423738718032837, + "learning_rate": 1.883275130643942e-05, + "loss": 0.1297, + "step": 11990 + }, + { + "epoch": 0.23984, + "grad_norm": 0.3948967158794403, + "learning_rate": 1.883209657521639e-05, + "loss": 0.1044, + "step": 11992 + }, + { + "epoch": 0.23988, + "grad_norm": 3.890575647354126, + "learning_rate": 1.8831441671807092e-05, + "loss": 0.3517, + "step": 11994 + }, + { + "epoch": 0.23992, + "grad_norm": 2.5416946411132812, + "learning_rate": 1.883078659622429e-05, + "loss": 0.2684, + "step": 11996 + }, + { + "epoch": 0.23996, + "grad_norm": 2.9334464073181152, + "learning_rate": 1.883013134848076e-05, + "loss": 0.1713, + "step": 11998 + }, + { + "epoch": 0.24, + "grad_norm": 2.6777782440185547, + "learning_rate": 1.8829475928589272e-05, + "loss": 0.2368, + "step": 12000 + }, + { + "epoch": 0.24004, + "grad_norm": 0.869845986366272, + "learning_rate": 1.8828820336562605e-05, + "loss": 0.1054, + "step": 12002 + }, + { + "epoch": 0.24008, + "grad_norm": 2.028465747833252, + "learning_rate": 1.8828164572413538e-05, + "loss": 0.1386, + "step": 12004 + }, + { + "epoch": 0.24012, + "grad_norm": 0.8320929408073425, + "learning_rate": 1.882750863615486e-05, + "loss": 0.1718, + "step": 12006 + }, + { + "epoch": 0.24016, + "grad_norm": 4.751787185668945, + "learning_rate": 1.8826852527799355e-05, + "loss": 0.3457, + "step": 12008 + }, + { + "epoch": 0.2402, + "grad_norm": 1.4248484373092651, + "learning_rate": 1.882619624735982e-05, + "loss": 0.067, + "step": 12010 + }, + { + "epoch": 0.24024, + "grad_norm": 2.8923938274383545, + "learning_rate": 1.8825539794849038e-05, + "loss": 0.5736, + "step": 12012 + }, + { + "epoch": 0.24028, + "grad_norm": 1.2633355855941772, + "learning_rate": 1.8824883170279816e-05, + "loss": 0.3771, + "step": 12014 + }, + { + "epoch": 0.24032, + "grad_norm": 1.0386368036270142, + "learning_rate": 1.882422637366496e-05, + "loss": 0.2152, + "step": 12016 + }, + { + "epoch": 0.24036, + "grad_norm": 1.5807311534881592, + "learning_rate": 1.8823569405017263e-05, + "loss": 0.1441, + "step": 12018 + }, + { + "epoch": 0.2404, + "grad_norm": 1.1382708549499512, + "learning_rate": 1.8822912264349535e-05, + "loss": 0.1684, + "step": 12020 + }, + { + "epoch": 0.24044, + "grad_norm": 3.0554981231689453, + "learning_rate": 1.8822254951674593e-05, + "loss": 0.2137, + "step": 12022 + }, + { + "epoch": 0.24048, + "grad_norm": 3.001762866973877, + "learning_rate": 1.8821597467005247e-05, + "loss": 0.2256, + "step": 12024 + }, + { + "epoch": 0.24052, + "grad_norm": 1.076149821281433, + "learning_rate": 1.8820939810354318e-05, + "loss": 0.1433, + "step": 12026 + }, + { + "epoch": 0.24056, + "grad_norm": 1.2379554510116577, + "learning_rate": 1.8820281981734626e-05, + "loss": 0.1907, + "step": 12028 + }, + { + "epoch": 0.2406, + "grad_norm": 1.046095371246338, + "learning_rate": 1.8819623981158996e-05, + "loss": 0.163, + "step": 12030 + }, + { + "epoch": 0.24064, + "grad_norm": 1.1384937763214111, + "learning_rate": 1.8818965808640255e-05, + "loss": 0.1738, + "step": 12032 + }, + { + "epoch": 0.24068, + "grad_norm": 3.3864448070526123, + "learning_rate": 1.8818307464191234e-05, + "loss": 0.2928, + "step": 12034 + }, + { + "epoch": 0.24072, + "grad_norm": 0.8240743279457092, + "learning_rate": 1.8817648947824774e-05, + "loss": 0.1966, + "step": 12036 + }, + { + "epoch": 0.24076, + "grad_norm": 0.6630221009254456, + "learning_rate": 1.8816990259553703e-05, + "loss": 0.0853, + "step": 12038 + }, + { + "epoch": 0.2408, + "grad_norm": 2.7191972732543945, + "learning_rate": 1.881633139939087e-05, + "loss": 0.2004, + "step": 12040 + }, + { + "epoch": 0.24084, + "grad_norm": 1.161939024925232, + "learning_rate": 1.881567236734912e-05, + "loss": 0.1905, + "step": 12042 + }, + { + "epoch": 0.24088, + "grad_norm": 0.8331981897354126, + "learning_rate": 1.8815013163441293e-05, + "loss": 0.0641, + "step": 12044 + }, + { + "epoch": 0.24092, + "grad_norm": 2.2914631366729736, + "learning_rate": 1.881435378768025e-05, + "loss": 0.1791, + "step": 12046 + }, + { + "epoch": 0.24096, + "grad_norm": 3.973543643951416, + "learning_rate": 1.881369424007884e-05, + "loss": 0.4072, + "step": 12048 + }, + { + "epoch": 0.241, + "grad_norm": 0.9676678776741028, + "learning_rate": 1.8813034520649923e-05, + "loss": 0.0554, + "step": 12050 + }, + { + "epoch": 0.24104, + "grad_norm": 1.643855094909668, + "learning_rate": 1.8812374629406363e-05, + "loss": 0.2345, + "step": 12052 + }, + { + "epoch": 0.24108, + "grad_norm": 3.2222654819488525, + "learning_rate": 1.881171456636102e-05, + "loss": 0.2776, + "step": 12054 + }, + { + "epoch": 0.24112, + "grad_norm": 2.8636012077331543, + "learning_rate": 1.881105433152677e-05, + "loss": 0.224, + "step": 12056 + }, + { + "epoch": 0.24116, + "grad_norm": 0.5758609771728516, + "learning_rate": 1.881039392491647e-05, + "loss": 0.1623, + "step": 12058 + }, + { + "epoch": 0.2412, + "grad_norm": 2.1436879634857178, + "learning_rate": 1.8809733346543013e-05, + "loss": 0.3564, + "step": 12060 + }, + { + "epoch": 0.24124, + "grad_norm": 3.528177261352539, + "learning_rate": 1.8809072596419265e-05, + "loss": 0.2913, + "step": 12062 + }, + { + "epoch": 0.24128, + "grad_norm": 1.798124074935913, + "learning_rate": 1.8808411674558114e-05, + "loss": 0.2465, + "step": 12064 + }, + { + "epoch": 0.24132, + "grad_norm": 3.2912352085113525, + "learning_rate": 1.880775058097244e-05, + "loss": 0.2566, + "step": 12066 + }, + { + "epoch": 0.24136, + "grad_norm": 0.48505643010139465, + "learning_rate": 1.8807089315675137e-05, + "loss": 0.0664, + "step": 12068 + }, + { + "epoch": 0.2414, + "grad_norm": 0.9948359727859497, + "learning_rate": 1.880642787867909e-05, + "loss": 0.083, + "step": 12070 + }, + { + "epoch": 0.24144, + "grad_norm": 1.1214629411697388, + "learning_rate": 1.8805766269997203e-05, + "loss": 0.0737, + "step": 12072 + }, + { + "epoch": 0.24148, + "grad_norm": 1.2434484958648682, + "learning_rate": 1.8805104489642362e-05, + "loss": 0.0737, + "step": 12074 + }, + { + "epoch": 0.24152, + "grad_norm": 2.3804521560668945, + "learning_rate": 1.8804442537627482e-05, + "loss": 0.2882, + "step": 12076 + }, + { + "epoch": 0.24156, + "grad_norm": 0.5713703632354736, + "learning_rate": 1.880378041396546e-05, + "loss": 0.217, + "step": 12078 + }, + { + "epoch": 0.2416, + "grad_norm": 2.137014389038086, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.1582, + "step": 12080 + }, + { + "epoch": 0.24164, + "grad_norm": 0.8293760418891907, + "learning_rate": 1.8802455651751628e-05, + "loss": 0.21, + "step": 12082 + }, + { + "epoch": 0.24168, + "grad_norm": 2.2690017223358154, + "learning_rate": 1.8801793013225648e-05, + "loss": 0.1773, + "step": 12084 + }, + { + "epoch": 0.24172, + "grad_norm": 0.5067511200904846, + "learning_rate": 1.8801130203104187e-05, + "loss": 0.0406, + "step": 12086 + }, + { + "epoch": 0.24176, + "grad_norm": 4.726089954376221, + "learning_rate": 1.8800467221400156e-05, + "loss": 0.3983, + "step": 12088 + }, + { + "epoch": 0.2418, + "grad_norm": 0.8935344219207764, + "learning_rate": 1.8799804068126487e-05, + "loss": 0.1146, + "step": 12090 + }, + { + "epoch": 0.24184, + "grad_norm": 4.957012176513672, + "learning_rate": 1.8799140743296104e-05, + "loss": 0.5603, + "step": 12092 + }, + { + "epoch": 0.24188, + "grad_norm": 1.9769582748413086, + "learning_rate": 1.8798477246921946e-05, + "loss": 0.121, + "step": 12094 + }, + { + "epoch": 0.24192, + "grad_norm": 0.4794018268585205, + "learning_rate": 1.8797813579016948e-05, + "loss": 0.0719, + "step": 12096 + }, + { + "epoch": 0.24196, + "grad_norm": 1.1550709009170532, + "learning_rate": 1.879714973959404e-05, + "loss": 0.2718, + "step": 12098 + }, + { + "epoch": 0.242, + "grad_norm": 0.8400511145591736, + "learning_rate": 1.879648572866617e-05, + "loss": 0.2572, + "step": 12100 + }, + { + "epoch": 0.24204, + "grad_norm": 0.2867695391178131, + "learning_rate": 1.8795821546246277e-05, + "loss": 0.0262, + "step": 12102 + }, + { + "epoch": 0.24208, + "grad_norm": 1.6230467557907104, + "learning_rate": 1.879515719234732e-05, + "loss": 0.1202, + "step": 12104 + }, + { + "epoch": 0.24212, + "grad_norm": 0.3581594228744507, + "learning_rate": 1.8794492666982243e-05, + "loss": 0.0946, + "step": 12106 + }, + { + "epoch": 0.24216, + "grad_norm": 3.754352331161499, + "learning_rate": 1.8793827970164006e-05, + "loss": 0.2424, + "step": 12108 + }, + { + "epoch": 0.2422, + "grad_norm": 2.7920424938201904, + "learning_rate": 1.8793163101905562e-05, + "loss": 0.2233, + "step": 12110 + }, + { + "epoch": 0.24224, + "grad_norm": 1.6925606727600098, + "learning_rate": 1.879249806221988e-05, + "loss": 0.1158, + "step": 12112 + }, + { + "epoch": 0.24228, + "grad_norm": 3.6804521083831787, + "learning_rate": 1.879183285111992e-05, + "loss": 0.395, + "step": 12114 + }, + { + "epoch": 0.24232, + "grad_norm": 2.270108222961426, + "learning_rate": 1.8791167468618654e-05, + "loss": 0.2175, + "step": 12116 + }, + { + "epoch": 0.24236, + "grad_norm": 2.286525011062622, + "learning_rate": 1.8790501914729052e-05, + "loss": 0.326, + "step": 12118 + }, + { + "epoch": 0.2424, + "grad_norm": 0.9847841262817383, + "learning_rate": 1.878983618946409e-05, + "loss": 0.1301, + "step": 12120 + }, + { + "epoch": 0.24244, + "grad_norm": 3.6634607315063477, + "learning_rate": 1.878917029283674e-05, + "loss": 0.3575, + "step": 12122 + }, + { + "epoch": 0.24248, + "grad_norm": 1.7157166004180908, + "learning_rate": 1.8788504224859996e-05, + "loss": 0.1669, + "step": 12124 + }, + { + "epoch": 0.24252, + "grad_norm": 0.9209049940109253, + "learning_rate": 1.8787837985546837e-05, + "loss": 0.2049, + "step": 12126 + }, + { + "epoch": 0.24256, + "grad_norm": 2.1520164012908936, + "learning_rate": 1.878717157491025e-05, + "loss": 0.1739, + "step": 12128 + }, + { + "epoch": 0.2426, + "grad_norm": 1.346404790878296, + "learning_rate": 1.878650499296323e-05, + "loss": 0.1807, + "step": 12130 + }, + { + "epoch": 0.24264, + "grad_norm": 0.9109814763069153, + "learning_rate": 1.8785838239718774e-05, + "loss": 0.0963, + "step": 12132 + }, + { + "epoch": 0.24268, + "grad_norm": 0.33898770809173584, + "learning_rate": 1.8785171315189876e-05, + "loss": 0.0323, + "step": 12134 + }, + { + "epoch": 0.24272, + "grad_norm": 1.3718527555465698, + "learning_rate": 1.878450421938954e-05, + "loss": 0.1514, + "step": 12136 + }, + { + "epoch": 0.24276, + "grad_norm": 3.9688446521759033, + "learning_rate": 1.878383695233077e-05, + "loss": 0.368, + "step": 12138 + }, + { + "epoch": 0.2428, + "grad_norm": 0.9747174978256226, + "learning_rate": 1.878316951402658e-05, + "loss": 0.0443, + "step": 12140 + }, + { + "epoch": 0.24284, + "grad_norm": 2.5203919410705566, + "learning_rate": 1.8782501904489975e-05, + "loss": 0.2293, + "step": 12142 + }, + { + "epoch": 0.24288, + "grad_norm": 1.5546232461929321, + "learning_rate": 1.8781834123733977e-05, + "loss": 0.13, + "step": 12144 + }, + { + "epoch": 0.24292, + "grad_norm": 0.5528755784034729, + "learning_rate": 1.87811661717716e-05, + "loss": 0.0663, + "step": 12146 + }, + { + "epoch": 0.24296, + "grad_norm": 2.337682008743286, + "learning_rate": 1.8780498048615868e-05, + "loss": 0.3611, + "step": 12148 + }, + { + "epoch": 0.243, + "grad_norm": 1.2261799573898315, + "learning_rate": 1.8779829754279806e-05, + "loss": 0.0737, + "step": 12150 + }, + { + "epoch": 0.24304, + "grad_norm": 3.0257651805877686, + "learning_rate": 1.8779161288776444e-05, + "loss": 0.2917, + "step": 12152 + }, + { + "epoch": 0.24308, + "grad_norm": 2.3271307945251465, + "learning_rate": 1.8778492652118812e-05, + "loss": 0.2816, + "step": 12154 + }, + { + "epoch": 0.24312, + "grad_norm": 1.425848126411438, + "learning_rate": 1.8777823844319946e-05, + "loss": 0.2406, + "step": 12156 + }, + { + "epoch": 0.24316, + "grad_norm": 1.1808348894119263, + "learning_rate": 1.8777154865392882e-05, + "loss": 0.0766, + "step": 12158 + }, + { + "epoch": 0.2432, + "grad_norm": 0.9504640102386475, + "learning_rate": 1.8776485715350672e-05, + "loss": 0.0554, + "step": 12160 + }, + { + "epoch": 0.24324, + "grad_norm": 1.7414124011993408, + "learning_rate": 1.877581639420635e-05, + "loss": 0.259, + "step": 12162 + }, + { + "epoch": 0.24328, + "grad_norm": 0.464599072933197, + "learning_rate": 1.877514690197297e-05, + "loss": 0.0716, + "step": 12164 + }, + { + "epoch": 0.24332, + "grad_norm": 0.7877985239028931, + "learning_rate": 1.8774477238663583e-05, + "loss": 0.1877, + "step": 12166 + }, + { + "epoch": 0.24336, + "grad_norm": 1.6016826629638672, + "learning_rate": 1.8773807404291244e-05, + "loss": 0.3979, + "step": 12168 + }, + { + "epoch": 0.2434, + "grad_norm": 1.7876696586608887, + "learning_rate": 1.8773137398869017e-05, + "loss": 0.3633, + "step": 12170 + }, + { + "epoch": 0.24344, + "grad_norm": 3.572167158126831, + "learning_rate": 1.8772467222409957e-05, + "loss": 0.4457, + "step": 12172 + }, + { + "epoch": 0.24348, + "grad_norm": 1.2825852632522583, + "learning_rate": 1.877179687492713e-05, + "loss": 0.0862, + "step": 12174 + }, + { + "epoch": 0.24352, + "grad_norm": 1.6383113861083984, + "learning_rate": 1.8771126356433607e-05, + "loss": 0.1233, + "step": 12176 + }, + { + "epoch": 0.24356, + "grad_norm": 2.836423873901367, + "learning_rate": 1.8770455666942463e-05, + "loss": 0.1669, + "step": 12178 + }, + { + "epoch": 0.2436, + "grad_norm": 2.0447771549224854, + "learning_rate": 1.8769784806466768e-05, + "loss": 0.1248, + "step": 12180 + }, + { + "epoch": 0.24364, + "grad_norm": 2.8368334770202637, + "learning_rate": 1.8769113775019604e-05, + "loss": 0.2483, + "step": 12182 + }, + { + "epoch": 0.24368, + "grad_norm": 1.5651382207870483, + "learning_rate": 1.8768442572614055e-05, + "loss": 0.2097, + "step": 12184 + }, + { + "epoch": 0.24372, + "grad_norm": 1.8670704364776611, + "learning_rate": 1.87677711992632e-05, + "loss": 0.2918, + "step": 12186 + }, + { + "epoch": 0.24376, + "grad_norm": 1.284305214881897, + "learning_rate": 1.876709965498013e-05, + "loss": 0.148, + "step": 12188 + }, + { + "epoch": 0.2438, + "grad_norm": 1.0576930046081543, + "learning_rate": 1.8766427939777943e-05, + "loss": 0.1458, + "step": 12190 + }, + { + "epoch": 0.24384, + "grad_norm": 1.2303011417388916, + "learning_rate": 1.876575605366973e-05, + "loss": 0.0837, + "step": 12192 + }, + { + "epoch": 0.24388, + "grad_norm": 1.2957791090011597, + "learning_rate": 1.8765083996668586e-05, + "loss": 0.2545, + "step": 12194 + }, + { + "epoch": 0.24392, + "grad_norm": 2.4648187160491943, + "learning_rate": 1.876441176878762e-05, + "loss": 0.3387, + "step": 12196 + }, + { + "epoch": 0.24396, + "grad_norm": 1.883474349975586, + "learning_rate": 1.8763739370039934e-05, + "loss": 0.2537, + "step": 12198 + }, + { + "epoch": 0.244, + "grad_norm": 0.640238344669342, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.0389, + "step": 12200 + }, + { + "epoch": 0.24404, + "grad_norm": 2.49704647064209, + "learning_rate": 1.8762394059996843e-05, + "loss": 0.2405, + "step": 12202 + }, + { + "epoch": 0.24408, + "grad_norm": 3.6614315509796143, + "learning_rate": 1.876172114872766e-05, + "loss": 0.439, + "step": 12204 + }, + { + "epoch": 0.24412, + "grad_norm": 0.6934165954589844, + "learning_rate": 1.8761048066644217e-05, + "loss": 0.0866, + "step": 12206 + }, + { + "epoch": 0.24416, + "grad_norm": 0.7411126494407654, + "learning_rate": 1.876037481375963e-05, + "loss": 0.0885, + "step": 12208 + }, + { + "epoch": 0.2442, + "grad_norm": 0.8737144470214844, + "learning_rate": 1.8759701390087026e-05, + "loss": 0.1131, + "step": 12210 + }, + { + "epoch": 0.24424, + "grad_norm": 1.031477928161621, + "learning_rate": 1.8759027795639537e-05, + "loss": 0.119, + "step": 12212 + }, + { + "epoch": 0.24428, + "grad_norm": 1.6724709272384644, + "learning_rate": 1.875835403043029e-05, + "loss": 0.148, + "step": 12214 + }, + { + "epoch": 0.24432, + "grad_norm": 3.214942455291748, + "learning_rate": 1.8757680094472417e-05, + "loss": 0.1064, + "step": 12216 + }, + { + "epoch": 0.24436, + "grad_norm": 3.305668592453003, + "learning_rate": 1.8757005987779066e-05, + "loss": 0.3299, + "step": 12218 + }, + { + "epoch": 0.2444, + "grad_norm": 3.0554027557373047, + "learning_rate": 1.8756331710363375e-05, + "loss": 0.3062, + "step": 12220 + }, + { + "epoch": 0.24444, + "grad_norm": 1.7173433303833008, + "learning_rate": 1.8755657262238486e-05, + "loss": 0.1177, + "step": 12222 + }, + { + "epoch": 0.24448, + "grad_norm": 2.247589588165283, + "learning_rate": 1.8754982643417553e-05, + "loss": 0.1802, + "step": 12224 + }, + { + "epoch": 0.24452, + "grad_norm": 2.4869871139526367, + "learning_rate": 1.8754307853913728e-05, + "loss": 0.1963, + "step": 12226 + }, + { + "epoch": 0.24456, + "grad_norm": 0.17409728467464447, + "learning_rate": 1.875363289374016e-05, + "loss": 0.0487, + "step": 12228 + }, + { + "epoch": 0.2446, + "grad_norm": 2.4579591751098633, + "learning_rate": 1.8752957762910016e-05, + "loss": 0.1514, + "step": 12230 + }, + { + "epoch": 0.24464, + "grad_norm": 3.2685840129852295, + "learning_rate": 1.8752282461436456e-05, + "loss": 0.4693, + "step": 12232 + }, + { + "epoch": 0.24468, + "grad_norm": 1.830977439880371, + "learning_rate": 1.8751606989332637e-05, + "loss": 0.0663, + "step": 12234 + }, + { + "epoch": 0.24472, + "grad_norm": 2.1155407428741455, + "learning_rate": 1.875093134661174e-05, + "loss": 0.2766, + "step": 12236 + }, + { + "epoch": 0.24476, + "grad_norm": 3.8474771976470947, + "learning_rate": 1.875025553328693e-05, + "loss": 0.2861, + "step": 12238 + }, + { + "epoch": 0.2448, + "grad_norm": 1.3512849807739258, + "learning_rate": 1.874957954937138e-05, + "loss": 0.276, + "step": 12240 + }, + { + "epoch": 0.24484, + "grad_norm": 3.12868332862854, + "learning_rate": 1.8748903394878274e-05, + "loss": 0.3154, + "step": 12242 + }, + { + "epoch": 0.24488, + "grad_norm": 2.0334255695343018, + "learning_rate": 1.8748227069820792e-05, + "loss": 0.1436, + "step": 12244 + }, + { + "epoch": 0.24492, + "grad_norm": 1.4081681966781616, + "learning_rate": 1.874755057421212e-05, + "loss": 0.2098, + "step": 12246 + }, + { + "epoch": 0.24496, + "grad_norm": 1.1653770208358765, + "learning_rate": 1.8746873908065443e-05, + "loss": 0.1006, + "step": 12248 + }, + { + "epoch": 0.245, + "grad_norm": 2.7005844116210938, + "learning_rate": 1.874619707139396e-05, + "loss": 0.1816, + "step": 12250 + }, + { + "epoch": 0.24504, + "grad_norm": 0.17038966715335846, + "learning_rate": 1.874552006421086e-05, + "loss": 0.1129, + "step": 12252 + }, + { + "epoch": 0.24508, + "grad_norm": 3.810516119003296, + "learning_rate": 1.8744842886529344e-05, + "loss": 0.3562, + "step": 12254 + }, + { + "epoch": 0.24512, + "grad_norm": 0.40543824434280396, + "learning_rate": 1.8744165538362615e-05, + "loss": 0.297, + "step": 12256 + }, + { + "epoch": 0.24516, + "grad_norm": 1.0327624082565308, + "learning_rate": 1.8743488019723875e-05, + "loss": 0.0904, + "step": 12258 + }, + { + "epoch": 0.2452, + "grad_norm": 1.38789701461792, + "learning_rate": 1.8742810330626338e-05, + "loss": 0.1488, + "step": 12260 + }, + { + "epoch": 0.24524, + "grad_norm": 3.8168246746063232, + "learning_rate": 1.8742132471083214e-05, + "loss": 0.6076, + "step": 12262 + }, + { + "epoch": 0.24528, + "grad_norm": 0.3473513126373291, + "learning_rate": 1.874145444110771e-05, + "loss": 0.2673, + "step": 12264 + }, + { + "epoch": 0.24532, + "grad_norm": 2.3520724773406982, + "learning_rate": 1.8740776240713055e-05, + "loss": 0.1713, + "step": 12266 + }, + { + "epoch": 0.24536, + "grad_norm": 3.1194746494293213, + "learning_rate": 1.8740097869912465e-05, + "loss": 0.3904, + "step": 12268 + }, + { + "epoch": 0.2454, + "grad_norm": 2.038867235183716, + "learning_rate": 1.873941932871917e-05, + "loss": 0.1091, + "step": 12270 + }, + { + "epoch": 0.24544, + "grad_norm": 0.537987232208252, + "learning_rate": 1.8738740617146396e-05, + "loss": 0.1403, + "step": 12272 + }, + { + "epoch": 0.24548, + "grad_norm": 1.7000428438186646, + "learning_rate": 1.8738061735207372e-05, + "loss": 0.1686, + "step": 12274 + }, + { + "epoch": 0.24552, + "grad_norm": 0.8420761823654175, + "learning_rate": 1.873738268291534e-05, + "loss": 0.1925, + "step": 12276 + }, + { + "epoch": 0.24556, + "grad_norm": 0.861084520816803, + "learning_rate": 1.873670346028353e-05, + "loss": 0.0966, + "step": 12278 + }, + { + "epoch": 0.2456, + "grad_norm": 1.042839527130127, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.1492, + "step": 12280 + }, + { + "epoch": 0.24564, + "grad_norm": 0.8733348846435547, + "learning_rate": 1.873534450405356e-05, + "loss": 0.1072, + "step": 12282 + }, + { + "epoch": 0.24568, + "grad_norm": 0.8794846534729004, + "learning_rate": 1.8734664770481898e-05, + "loss": 0.1059, + "step": 12284 + }, + { + "epoch": 0.24572, + "grad_norm": 0.721697211265564, + "learning_rate": 1.8733984866623444e-05, + "loss": 0.0765, + "step": 12286 + }, + { + "epoch": 0.24576, + "grad_norm": 3.1170239448547363, + "learning_rate": 1.873330479249146e-05, + "loss": 0.3732, + "step": 12288 + }, + { + "epoch": 0.2458, + "grad_norm": 0.9966888427734375, + "learning_rate": 1.8732624548099204e-05, + "loss": 0.0848, + "step": 12290 + }, + { + "epoch": 0.24584, + "grad_norm": 1.0249536037445068, + "learning_rate": 1.8731944133459934e-05, + "loss": 0.1246, + "step": 12292 + }, + { + "epoch": 0.24588, + "grad_norm": 0.4197043180465698, + "learning_rate": 1.873126354858692e-05, + "loss": 0.2562, + "step": 12294 + }, + { + "epoch": 0.24592, + "grad_norm": 1.4023358821868896, + "learning_rate": 1.873058279349343e-05, + "loss": 0.1779, + "step": 12296 + }, + { + "epoch": 0.24596, + "grad_norm": 0.9743784666061401, + "learning_rate": 1.872990186819273e-05, + "loss": 0.0849, + "step": 12298 + }, + { + "epoch": 0.246, + "grad_norm": 2.3030507564544678, + "learning_rate": 1.8729220772698096e-05, + "loss": 0.1853, + "step": 12300 + }, + { + "epoch": 0.24604, + "grad_norm": 0.7316948771476746, + "learning_rate": 1.8728539507022815e-05, + "loss": 0.1098, + "step": 12302 + }, + { + "epoch": 0.24608, + "grad_norm": 0.7312622666358948, + "learning_rate": 1.8727858071180162e-05, + "loss": 0.1684, + "step": 12304 + }, + { + "epoch": 0.24612, + "grad_norm": 0.837384819984436, + "learning_rate": 1.8727176465183417e-05, + "loss": 0.0925, + "step": 12306 + }, + { + "epoch": 0.24616, + "grad_norm": 2.6423680782318115, + "learning_rate": 1.8726494689045878e-05, + "loss": 0.457, + "step": 12308 + }, + { + "epoch": 0.2462, + "grad_norm": 1.6752443313598633, + "learning_rate": 1.8725812742780832e-05, + "loss": 0.1841, + "step": 12310 + }, + { + "epoch": 0.24624, + "grad_norm": 4.520659923553467, + "learning_rate": 1.8725130626401573e-05, + "loss": 0.4324, + "step": 12312 + }, + { + "epoch": 0.24628, + "grad_norm": 1.012895941734314, + "learning_rate": 1.87244483399214e-05, + "loss": 0.0813, + "step": 12314 + }, + { + "epoch": 0.24632, + "grad_norm": 1.0163288116455078, + "learning_rate": 1.872376588335362e-05, + "loss": 0.21, + "step": 12316 + }, + { + "epoch": 0.24636, + "grad_norm": 3.37384033203125, + "learning_rate": 1.8723083256711526e-05, + "loss": 0.4391, + "step": 12318 + }, + { + "epoch": 0.2464, + "grad_norm": 1.021409511566162, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.0947, + "step": 12320 + }, + { + "epoch": 0.24644, + "grad_norm": 1.7369961738586426, + "learning_rate": 1.872171749325766e-05, + "loss": 0.2723, + "step": 12322 + }, + { + "epoch": 0.24648, + "grad_norm": 0.37870708107948303, + "learning_rate": 1.872103435647251e-05, + "loss": 0.032, + "step": 12324 + }, + { + "epoch": 0.24652, + "grad_norm": 0.33482757210731506, + "learning_rate": 1.8720351049666306e-05, + "loss": 0.0514, + "step": 12326 + }, + { + "epoch": 0.24656, + "grad_norm": 1.8426742553710938, + "learning_rate": 1.8719667572852366e-05, + "loss": 0.2504, + "step": 12328 + }, + { + "epoch": 0.2466, + "grad_norm": 2.6695401668548584, + "learning_rate": 1.871898392604402e-05, + "loss": 0.3128, + "step": 12330 + }, + { + "epoch": 0.24664, + "grad_norm": 3.3270092010498047, + "learning_rate": 1.8718300109254596e-05, + "loss": 0.325, + "step": 12332 + }, + { + "epoch": 0.24668, + "grad_norm": 0.11906854063272476, + "learning_rate": 1.8717616122497416e-05, + "loss": 0.0566, + "step": 12334 + }, + { + "epoch": 0.24672, + "grad_norm": 1.2811963558197021, + "learning_rate": 1.8716931965785826e-05, + "loss": 0.0752, + "step": 12336 + }, + { + "epoch": 0.24676, + "grad_norm": 0.34306544065475464, + "learning_rate": 1.871624763913316e-05, + "loss": 0.2176, + "step": 12338 + }, + { + "epoch": 0.2468, + "grad_norm": 2.0894646644592285, + "learning_rate": 1.8715563142552758e-05, + "loss": 0.1461, + "step": 12340 + }, + { + "epoch": 0.24684, + "grad_norm": 1.612741470336914, + "learning_rate": 1.8714878476057964e-05, + "loss": 0.1369, + "step": 12342 + }, + { + "epoch": 0.24688, + "grad_norm": 2.481621742248535, + "learning_rate": 1.871419363966213e-05, + "loss": 0.2767, + "step": 12344 + }, + { + "epoch": 0.24692, + "grad_norm": 2.7552428245544434, + "learning_rate": 1.87135086333786e-05, + "loss": 0.2052, + "step": 12346 + }, + { + "epoch": 0.24696, + "grad_norm": 4.016815662384033, + "learning_rate": 1.8712823457220736e-05, + "loss": 0.3831, + "step": 12348 + }, + { + "epoch": 0.247, + "grad_norm": 2.225023031234741, + "learning_rate": 1.8712138111201898e-05, + "loss": 0.2787, + "step": 12350 + }, + { + "epoch": 0.24704, + "grad_norm": 1.7184704542160034, + "learning_rate": 1.8711452595335434e-05, + "loss": 0.1672, + "step": 12352 + }, + { + "epoch": 0.24708, + "grad_norm": 0.7974080443382263, + "learning_rate": 1.8710766909634724e-05, + "loss": 0.305, + "step": 12354 + }, + { + "epoch": 0.24712, + "grad_norm": 2.8571219444274902, + "learning_rate": 1.871008105411312e-05, + "loss": 0.2278, + "step": 12356 + }, + { + "epoch": 0.24716, + "grad_norm": 1.6583173274993896, + "learning_rate": 1.870939502878401e-05, + "loss": 0.0913, + "step": 12358 + }, + { + "epoch": 0.2472, + "grad_norm": 0.5509397983551025, + "learning_rate": 1.8708708833660755e-05, + "loss": 0.0855, + "step": 12360 + }, + { + "epoch": 0.24724, + "grad_norm": 2.636752128601074, + "learning_rate": 1.870802246875674e-05, + "loss": 0.2007, + "step": 12362 + }, + { + "epoch": 0.24728, + "grad_norm": 1.315805435180664, + "learning_rate": 1.870733593408534e-05, + "loss": 0.1028, + "step": 12364 + }, + { + "epoch": 0.24732, + "grad_norm": 0.6554777026176453, + "learning_rate": 1.8706649229659948e-05, + "loss": 0.0681, + "step": 12366 + }, + { + "epoch": 0.24736, + "grad_norm": 1.8567655086517334, + "learning_rate": 1.8705962355493944e-05, + "loss": 0.1443, + "step": 12368 + }, + { + "epoch": 0.2474, + "grad_norm": 3.964294672012329, + "learning_rate": 1.8705275311600724e-05, + "loss": 0.6152, + "step": 12370 + }, + { + "epoch": 0.24744, + "grad_norm": 4.128782272338867, + "learning_rate": 1.870458809799368e-05, + "loss": 0.3224, + "step": 12372 + }, + { + "epoch": 0.24748, + "grad_norm": 0.5191868543624878, + "learning_rate": 1.8703900714686206e-05, + "loss": 0.1178, + "step": 12374 + }, + { + "epoch": 0.24752, + "grad_norm": 1.0056105852127075, + "learning_rate": 1.8703213161691708e-05, + "loss": 0.21, + "step": 12376 + }, + { + "epoch": 0.24756, + "grad_norm": 0.8372199535369873, + "learning_rate": 1.870252543902359e-05, + "loss": 0.0964, + "step": 12378 + }, + { + "epoch": 0.2476, + "grad_norm": 2.470940113067627, + "learning_rate": 1.870183754669526e-05, + "loss": 0.2617, + "step": 12380 + }, + { + "epoch": 0.24764, + "grad_norm": 1.6275163888931274, + "learning_rate": 1.8701149484720124e-05, + "loss": 0.1367, + "step": 12382 + }, + { + "epoch": 0.24768, + "grad_norm": 0.8431761264801025, + "learning_rate": 1.87004612531116e-05, + "loss": 0.2282, + "step": 12384 + }, + { + "epoch": 0.24772, + "grad_norm": 0.9730808734893799, + "learning_rate": 1.8699772851883106e-05, + "loss": 0.0777, + "step": 12386 + }, + { + "epoch": 0.24776, + "grad_norm": 1.3774842023849487, + "learning_rate": 1.8699084281048058e-05, + "loss": 0.1299, + "step": 12388 + }, + { + "epoch": 0.2478, + "grad_norm": 1.3900245428085327, + "learning_rate": 1.8698395540619883e-05, + "loss": 0.1008, + "step": 12390 + }, + { + "epoch": 0.24784, + "grad_norm": 0.7433670163154602, + "learning_rate": 1.8697706630612013e-05, + "loss": 0.1397, + "step": 12392 + }, + { + "epoch": 0.24788, + "grad_norm": 2.522435426712036, + "learning_rate": 1.869701755103787e-05, + "loss": 0.1819, + "step": 12394 + }, + { + "epoch": 0.24792, + "grad_norm": 1.4115827083587646, + "learning_rate": 1.8696328301910897e-05, + "loss": 0.1613, + "step": 12396 + }, + { + "epoch": 0.24796, + "grad_norm": 1.1926286220550537, + "learning_rate": 1.8695638883244522e-05, + "loss": 0.1378, + "step": 12398 + }, + { + "epoch": 0.248, + "grad_norm": 0.1912417709827423, + "learning_rate": 1.869494929505219e-05, + "loss": 0.0366, + "step": 12400 + }, + { + "epoch": 0.24804, + "grad_norm": 3.421330213546753, + "learning_rate": 1.869425953734735e-05, + "loss": 0.3546, + "step": 12402 + }, + { + "epoch": 0.24808, + "grad_norm": 0.3856886923313141, + "learning_rate": 1.869356961014344e-05, + "loss": 0.1605, + "step": 12404 + }, + { + "epoch": 0.24812, + "grad_norm": 1.1876366138458252, + "learning_rate": 1.8692879513453918e-05, + "loss": 0.1141, + "step": 12406 + }, + { + "epoch": 0.24816, + "grad_norm": 1.0100114345550537, + "learning_rate": 1.869218924729223e-05, + "loss": 0.1498, + "step": 12408 + }, + { + "epoch": 0.2482, + "grad_norm": 1.9031987190246582, + "learning_rate": 1.869149881167184e-05, + "loss": 0.1706, + "step": 12410 + }, + { + "epoch": 0.24824, + "grad_norm": 3.521596670150757, + "learning_rate": 1.869080820660621e-05, + "loss": 0.2138, + "step": 12412 + }, + { + "epoch": 0.24828, + "grad_norm": 0.2713322043418884, + "learning_rate": 1.8690117432108792e-05, + "loss": 0.2428, + "step": 12414 + }, + { + "epoch": 0.24832, + "grad_norm": 1.9255722761154175, + "learning_rate": 1.8689426488193066e-05, + "loss": 0.2021, + "step": 12416 + }, + { + "epoch": 0.24836, + "grad_norm": 0.291421502828598, + "learning_rate": 1.868873537487249e-05, + "loss": 0.0201, + "step": 12418 + }, + { + "epoch": 0.2484, + "grad_norm": 2.3766114711761475, + "learning_rate": 1.8688044092160554e-05, + "loss": 0.136, + "step": 12420 + }, + { + "epoch": 0.24844, + "grad_norm": 4.224188804626465, + "learning_rate": 1.8687352640070725e-05, + "loss": 0.394, + "step": 12422 + }, + { + "epoch": 0.24848, + "grad_norm": 0.21725626289844513, + "learning_rate": 1.8686661018616478e-05, + "loss": 0.1443, + "step": 12424 + }, + { + "epoch": 0.24852, + "grad_norm": 0.9810729026794434, + "learning_rate": 1.8685969227811306e-05, + "loss": 0.0896, + "step": 12426 + }, + { + "epoch": 0.24856, + "grad_norm": 1.8925985097885132, + "learning_rate": 1.868527726766869e-05, + "loss": 0.2836, + "step": 12428 + }, + { + "epoch": 0.2486, + "grad_norm": 0.8685587048530579, + "learning_rate": 1.8684585138202122e-05, + "loss": 0.1213, + "step": 12430 + }, + { + "epoch": 0.24864, + "grad_norm": 0.464939147233963, + "learning_rate": 1.8683892839425098e-05, + "loss": 0.0819, + "step": 12432 + }, + { + "epoch": 0.24868, + "grad_norm": 0.25700509548187256, + "learning_rate": 1.868320037135111e-05, + "loss": 0.0275, + "step": 12434 + }, + { + "epoch": 0.24872, + "grad_norm": 4.130624771118164, + "learning_rate": 1.868250773399366e-05, + "loss": 0.576, + "step": 12436 + }, + { + "epoch": 0.24876, + "grad_norm": 2.524538516998291, + "learning_rate": 1.8681814927366253e-05, + "loss": 0.2646, + "step": 12438 + }, + { + "epoch": 0.2488, + "grad_norm": 2.6931962966918945, + "learning_rate": 1.8681121951482397e-05, + "loss": 0.1746, + "step": 12440 + }, + { + "epoch": 0.24884, + "grad_norm": 3.0698630809783936, + "learning_rate": 1.8680428806355594e-05, + "loss": 0.2422, + "step": 12442 + }, + { + "epoch": 0.24888, + "grad_norm": 0.4895588755607605, + "learning_rate": 1.8679735491999364e-05, + "loss": 0.057, + "step": 12444 + }, + { + "epoch": 0.24892, + "grad_norm": 3.665213108062744, + "learning_rate": 1.8679042008427222e-05, + "loss": 0.3253, + "step": 12446 + }, + { + "epoch": 0.24896, + "grad_norm": 1.4629697799682617, + "learning_rate": 1.867834835565269e-05, + "loss": 0.1808, + "step": 12448 + }, + { + "epoch": 0.249, + "grad_norm": 0.6049095988273621, + "learning_rate": 1.8677654533689287e-05, + "loss": 0.0542, + "step": 12450 + }, + { + "epoch": 0.24904, + "grad_norm": 0.9972557425498962, + "learning_rate": 1.867696054255054e-05, + "loss": 0.0762, + "step": 12452 + }, + { + "epoch": 0.24908, + "grad_norm": 6.174975872039795, + "learning_rate": 1.8676266382249984e-05, + "loss": 0.2847, + "step": 12454 + }, + { + "epoch": 0.24912, + "grad_norm": 1.7027477025985718, + "learning_rate": 1.8675572052801145e-05, + "loss": 0.4063, + "step": 12456 + }, + { + "epoch": 0.24916, + "grad_norm": 0.38738441467285156, + "learning_rate": 1.867487755421756e-05, + "loss": 0.0852, + "step": 12458 + }, + { + "epoch": 0.2492, + "grad_norm": 0.6003584265708923, + "learning_rate": 1.8674182886512776e-05, + "loss": 0.0514, + "step": 12460 + }, + { + "epoch": 0.24924, + "grad_norm": 1.312921404838562, + "learning_rate": 1.867348804970033e-05, + "loss": 0.0738, + "step": 12462 + }, + { + "epoch": 0.24928, + "grad_norm": 1.8923938274383545, + "learning_rate": 1.8672793043793768e-05, + "loss": 0.1352, + "step": 12464 + }, + { + "epoch": 0.24932, + "grad_norm": 2.6723897457122803, + "learning_rate": 1.867209786880664e-05, + "loss": 0.3573, + "step": 12466 + }, + { + "epoch": 0.24936, + "grad_norm": 3.9355714321136475, + "learning_rate": 1.8671402524752497e-05, + "loss": 0.2175, + "step": 12468 + }, + { + "epoch": 0.2494, + "grad_norm": 1.4338043928146362, + "learning_rate": 1.86707070116449e-05, + "loss": 0.0896, + "step": 12470 + }, + { + "epoch": 0.24944, + "grad_norm": 0.5468841195106506, + "learning_rate": 1.8670011329497408e-05, + "loss": 0.0716, + "step": 12472 + }, + { + "epoch": 0.24948, + "grad_norm": 3.135535717010498, + "learning_rate": 1.866931547832358e-05, + "loss": 0.3066, + "step": 12474 + }, + { + "epoch": 0.24952, + "grad_norm": 2.2741775512695312, + "learning_rate": 1.866861945813698e-05, + "loss": 0.4697, + "step": 12476 + }, + { + "epoch": 0.24956, + "grad_norm": 1.1171355247497559, + "learning_rate": 1.8667923268951186e-05, + "loss": 0.0906, + "step": 12478 + }, + { + "epoch": 0.2496, + "grad_norm": 2.4823381900787354, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.1374, + "step": 12480 + }, + { + "epoch": 0.24964, + "grad_norm": 1.7432312965393066, + "learning_rate": 1.866653038363629e-05, + "loss": 0.1265, + "step": 12482 + }, + { + "epoch": 0.24968, + "grad_norm": 1.9764775037765503, + "learning_rate": 1.8665833687534346e-05, + "loss": 0.1955, + "step": 12484 + }, + { + "epoch": 0.24972, + "grad_norm": 1.1646102666854858, + "learning_rate": 1.866513682248751e-05, + "loss": 0.0765, + "step": 12486 + }, + { + "epoch": 0.24976, + "grad_norm": 2.417910099029541, + "learning_rate": 1.866443978850937e-05, + "loss": 0.1927, + "step": 12488 + }, + { + "epoch": 0.2498, + "grad_norm": 1.3487051725387573, + "learning_rate": 1.866374258561352e-05, + "loss": 0.1526, + "step": 12490 + }, + { + "epoch": 0.24984, + "grad_norm": 1.8302266597747803, + "learning_rate": 1.8663045213813546e-05, + "loss": 0.2233, + "step": 12492 + }, + { + "epoch": 0.24988, + "grad_norm": 3.6419765949249268, + "learning_rate": 1.8662347673123044e-05, + "loss": 0.2153, + "step": 12494 + }, + { + "epoch": 0.24992, + "grad_norm": 1.2181354761123657, + "learning_rate": 1.8661649963555616e-05, + "loss": 0.0644, + "step": 12496 + }, + { + "epoch": 0.24996, + "grad_norm": 2.2489755153656006, + "learning_rate": 1.866095208512486e-05, + "loss": 0.1168, + "step": 12498 + }, + { + "epoch": 0.25, + "grad_norm": 0.7243970632553101, + "learning_rate": 1.866025403784439e-05, + "loss": 0.0662, + "step": 12500 + }, + { + "epoch": 0.25004, + "grad_norm": 0.33968719840049744, + "learning_rate": 1.8659555821727804e-05, + "loss": 0.0605, + "step": 12502 + }, + { + "epoch": 0.25008, + "grad_norm": 3.686464309692383, + "learning_rate": 1.865885743678872e-05, + "loss": 0.2728, + "step": 12504 + }, + { + "epoch": 0.25012, + "grad_norm": 0.41246938705444336, + "learning_rate": 1.8658158883040754e-05, + "loss": 0.0285, + "step": 12506 + }, + { + "epoch": 0.25016, + "grad_norm": 2.630913257598877, + "learning_rate": 1.865746016049752e-05, + "loss": 0.3407, + "step": 12508 + }, + { + "epoch": 0.2502, + "grad_norm": 0.22590942680835724, + "learning_rate": 1.8656761269172645e-05, + "loss": 0.0235, + "step": 12510 + }, + { + "epoch": 0.25024, + "grad_norm": 1.2284157276153564, + "learning_rate": 1.865606220907975e-05, + "loss": 0.223, + "step": 12512 + }, + { + "epoch": 0.25028, + "grad_norm": 1.6385239362716675, + "learning_rate": 1.8655362980232467e-05, + "loss": 0.241, + "step": 12514 + }, + { + "epoch": 0.25032, + "grad_norm": 0.2934477925300598, + "learning_rate": 1.8654663582644428e-05, + "loss": 0.1373, + "step": 12516 + }, + { + "epoch": 0.25036, + "grad_norm": 4.928040504455566, + "learning_rate": 1.8653964016329267e-05, + "loss": 0.4883, + "step": 12518 + }, + { + "epoch": 0.2504, + "grad_norm": 1.1686207056045532, + "learning_rate": 1.8653264281300622e-05, + "loss": 0.2512, + "step": 12520 + }, + { + "epoch": 0.25044, + "grad_norm": 1.9157741069793701, + "learning_rate": 1.8652564377572132e-05, + "loss": 0.1303, + "step": 12522 + }, + { + "epoch": 0.25048, + "grad_norm": 1.077045202255249, + "learning_rate": 1.865186430515745e-05, + "loss": 0.1119, + "step": 12524 + }, + { + "epoch": 0.25052, + "grad_norm": 2.6888742446899414, + "learning_rate": 1.8651164064070216e-05, + "loss": 0.1569, + "step": 12526 + }, + { + "epoch": 0.25056, + "grad_norm": 0.3451790511608124, + "learning_rate": 1.865046365432408e-05, + "loss": 0.0211, + "step": 12528 + }, + { + "epoch": 0.2506, + "grad_norm": 0.1652577519416809, + "learning_rate": 1.864976307593271e-05, + "loss": 0.0578, + "step": 12530 + }, + { + "epoch": 0.25064, + "grad_norm": 4.060386657714844, + "learning_rate": 1.864906232890975e-05, + "loss": 0.4404, + "step": 12532 + }, + { + "epoch": 0.25068, + "grad_norm": 2.4934608936309814, + "learning_rate": 1.864836141326887e-05, + "loss": 0.314, + "step": 12534 + }, + { + "epoch": 0.25072, + "grad_norm": 4.217505931854248, + "learning_rate": 1.864766032902373e-05, + "loss": 0.5175, + "step": 12536 + }, + { + "epoch": 0.25076, + "grad_norm": 1.5829455852508545, + "learning_rate": 1.8646959076188002e-05, + "loss": 0.1173, + "step": 12538 + }, + { + "epoch": 0.2508, + "grad_norm": 0.8299952149391174, + "learning_rate": 1.864625765477535e-05, + "loss": 0.5189, + "step": 12540 + }, + { + "epoch": 0.25084, + "grad_norm": 3.443507671356201, + "learning_rate": 1.8645556064799456e-05, + "loss": 0.1686, + "step": 12542 + }, + { + "epoch": 0.25088, + "grad_norm": 1.0983837842941284, + "learning_rate": 1.8644854306273997e-05, + "loss": 0.2284, + "step": 12544 + }, + { + "epoch": 0.25092, + "grad_norm": 1.118964433670044, + "learning_rate": 1.864415237921265e-05, + "loss": 0.1686, + "step": 12546 + }, + { + "epoch": 0.25096, + "grad_norm": 1.3500192165374756, + "learning_rate": 1.8643450283629106e-05, + "loss": 0.1027, + "step": 12548 + }, + { + "epoch": 0.251, + "grad_norm": 0.7096267342567444, + "learning_rate": 1.864274801953705e-05, + "loss": 0.0332, + "step": 12550 + }, + { + "epoch": 0.25104, + "grad_norm": 0.8004171848297119, + "learning_rate": 1.8642045586950165e-05, + "loss": 0.0512, + "step": 12552 + }, + { + "epoch": 0.25108, + "grad_norm": 0.5742665529251099, + "learning_rate": 1.8641342985882158e-05, + "loss": 0.1148, + "step": 12554 + }, + { + "epoch": 0.25112, + "grad_norm": 0.31756648421287537, + "learning_rate": 1.8640640216346718e-05, + "loss": 0.0392, + "step": 12556 + }, + { + "epoch": 0.25116, + "grad_norm": 3.663180351257324, + "learning_rate": 1.863993727835755e-05, + "loss": 0.3888, + "step": 12558 + }, + { + "epoch": 0.2512, + "grad_norm": 2.750037431716919, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.3302, + "step": 12560 + }, + { + "epoch": 0.25124, + "grad_norm": 0.8340952396392822, + "learning_rate": 1.863853089707284e-05, + "loss": 0.0849, + "step": 12562 + }, + { + "epoch": 0.25128, + "grad_norm": 0.4791209399700165, + "learning_rate": 1.8637827453804722e-05, + "loss": 0.0861, + "step": 12564 + }, + { + "epoch": 0.25132, + "grad_norm": 2.7396864891052246, + "learning_rate": 1.863712384213771e-05, + "loss": 0.2065, + "step": 12566 + }, + { + "epoch": 0.25136, + "grad_norm": 3.21907377243042, + "learning_rate": 1.863642006208552e-05, + "loss": 0.2931, + "step": 12568 + }, + { + "epoch": 0.2514, + "grad_norm": 3.44264817237854, + "learning_rate": 1.8635716113661876e-05, + "loss": 0.3, + "step": 12570 + }, + { + "epoch": 0.25144, + "grad_norm": 1.1680327653884888, + "learning_rate": 1.86350119968805e-05, + "loss": 0.0685, + "step": 12572 + }, + { + "epoch": 0.25148, + "grad_norm": 1.0623648166656494, + "learning_rate": 1.863430771175512e-05, + "loss": 0.2181, + "step": 12574 + }, + { + "epoch": 0.25152, + "grad_norm": 0.3330172300338745, + "learning_rate": 1.8633603258299464e-05, + "loss": 0.078, + "step": 12576 + }, + { + "epoch": 0.25156, + "grad_norm": 1.4345232248306274, + "learning_rate": 1.863289863652727e-05, + "loss": 0.0957, + "step": 12578 + }, + { + "epoch": 0.2516, + "grad_norm": 1.0423222780227661, + "learning_rate": 1.863219384645227e-05, + "loss": 0.0996, + "step": 12580 + }, + { + "epoch": 0.25164, + "grad_norm": 0.3868069648742676, + "learning_rate": 1.863148888808821e-05, + "loss": 0.0886, + "step": 12582 + }, + { + "epoch": 0.25168, + "grad_norm": 0.8158010244369507, + "learning_rate": 1.8630783761448828e-05, + "loss": 0.0571, + "step": 12584 + }, + { + "epoch": 0.25172, + "grad_norm": 2.368307590484619, + "learning_rate": 1.8630078466547875e-05, + "loss": 0.1437, + "step": 12586 + }, + { + "epoch": 0.25176, + "grad_norm": 0.2726421654224396, + "learning_rate": 1.8629373003399097e-05, + "loss": 0.0504, + "step": 12588 + }, + { + "epoch": 0.2518, + "grad_norm": 1.6834535598754883, + "learning_rate": 1.862866737201625e-05, + "loss": 0.079, + "step": 12590 + }, + { + "epoch": 0.25184, + "grad_norm": 3.8192718029022217, + "learning_rate": 1.8627961572413094e-05, + "loss": 0.4231, + "step": 12592 + }, + { + "epoch": 0.25188, + "grad_norm": 2.461361885070801, + "learning_rate": 1.862725560460338e-05, + "loss": 0.3996, + "step": 12594 + }, + { + "epoch": 0.25192, + "grad_norm": 4.078171253204346, + "learning_rate": 1.862654946860088e-05, + "loss": 0.4852, + "step": 12596 + }, + { + "epoch": 0.25196, + "grad_norm": 0.7447314262390137, + "learning_rate": 1.8625843164419356e-05, + "loss": 0.0762, + "step": 12598 + }, + { + "epoch": 0.252, + "grad_norm": 0.7499154210090637, + "learning_rate": 1.8625136692072577e-05, + "loss": 0.0512, + "step": 12600 + }, + { + "epoch": 0.25204, + "grad_norm": 1.733542561531067, + "learning_rate": 1.8624430051574318e-05, + "loss": 0.1776, + "step": 12602 + }, + { + "epoch": 0.25208, + "grad_norm": 0.326977014541626, + "learning_rate": 1.8623723242938355e-05, + "loss": 0.0331, + "step": 12604 + }, + { + "epoch": 0.25212, + "grad_norm": 0.7932029366493225, + "learning_rate": 1.862301626617847e-05, + "loss": 0.0866, + "step": 12606 + }, + { + "epoch": 0.25216, + "grad_norm": 0.22424434125423431, + "learning_rate": 1.862230912130844e-05, + "loss": 0.0567, + "step": 12608 + }, + { + "epoch": 0.2522, + "grad_norm": 1.2784181833267212, + "learning_rate": 1.862160180834206e-05, + "loss": 0.0718, + "step": 12610 + }, + { + "epoch": 0.25224, + "grad_norm": 0.37674275040626526, + "learning_rate": 1.8620894327293107e-05, + "loss": 0.087, + "step": 12612 + }, + { + "epoch": 0.25228, + "grad_norm": 4.161036014556885, + "learning_rate": 1.862018667817538e-05, + "loss": 0.2411, + "step": 12614 + }, + { + "epoch": 0.25232, + "grad_norm": 3.482722520828247, + "learning_rate": 1.8619478861002682e-05, + "loss": 0.1734, + "step": 12616 + }, + { + "epoch": 0.25236, + "grad_norm": 1.2111560106277466, + "learning_rate": 1.8618770875788804e-05, + "loss": 0.2932, + "step": 12618 + }, + { + "epoch": 0.2524, + "grad_norm": 0.5999941229820251, + "learning_rate": 1.861806272254755e-05, + "loss": 0.08, + "step": 12620 + }, + { + "epoch": 0.25244, + "grad_norm": 1.7010524272918701, + "learning_rate": 1.8617354401292728e-05, + "loss": 0.0701, + "step": 12622 + }, + { + "epoch": 0.25248, + "grad_norm": 0.4510985016822815, + "learning_rate": 1.8616645912038146e-05, + "loss": 0.0256, + "step": 12624 + }, + { + "epoch": 0.25252, + "grad_norm": 0.4930625557899475, + "learning_rate": 1.8615937254797614e-05, + "loss": 0.0531, + "step": 12626 + }, + { + "epoch": 0.25256, + "grad_norm": 0.9836893677711487, + "learning_rate": 1.8615228429584947e-05, + "loss": 0.0616, + "step": 12628 + }, + { + "epoch": 0.2526, + "grad_norm": 0.3473933935165405, + "learning_rate": 1.8614519436413968e-05, + "loss": 0.3684, + "step": 12630 + }, + { + "epoch": 0.25264, + "grad_norm": 1.1358698606491089, + "learning_rate": 1.86138102752985e-05, + "loss": 0.0554, + "step": 12632 + }, + { + "epoch": 0.25268, + "grad_norm": 0.16954216361045837, + "learning_rate": 1.8613100946252364e-05, + "loss": 0.0251, + "step": 12634 + }, + { + "epoch": 0.25272, + "grad_norm": 0.506280243396759, + "learning_rate": 1.861239144928939e-05, + "loss": 0.1047, + "step": 12636 + }, + { + "epoch": 0.25276, + "grad_norm": 0.7072067856788635, + "learning_rate": 1.861168178442341e-05, + "loss": 0.0488, + "step": 12638 + }, + { + "epoch": 0.2528, + "grad_norm": 0.17361095547676086, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.0225, + "step": 12640 + }, + { + "epoch": 0.25284, + "grad_norm": 0.8051401972770691, + "learning_rate": 1.8610261951037785e-05, + "loss": 0.438, + "step": 12642 + }, + { + "epoch": 0.25288, + "grad_norm": 0.9955002069473267, + "learning_rate": 1.8609551782545816e-05, + "loss": 0.2425, + "step": 12644 + }, + { + "epoch": 0.25292, + "grad_norm": 1.147768259048462, + "learning_rate": 1.8608841446206202e-05, + "loss": 0.0949, + "step": 12646 + }, + { + "epoch": 0.25296, + "grad_norm": 0.716677188873291, + "learning_rate": 1.8608130942032792e-05, + "loss": 0.0694, + "step": 12648 + }, + { + "epoch": 0.253, + "grad_norm": 0.59943687915802, + "learning_rate": 1.860742027003944e-05, + "loss": 0.0512, + "step": 12650 + }, + { + "epoch": 0.25304, + "grad_norm": 0.33036020398139954, + "learning_rate": 1.8606709430239995e-05, + "loss": 0.0305, + "step": 12652 + }, + { + "epoch": 0.25308, + "grad_norm": 2.571763753890991, + "learning_rate": 1.8605998422648318e-05, + "loss": 0.1653, + "step": 12654 + }, + { + "epoch": 0.25312, + "grad_norm": 1.4806718826293945, + "learning_rate": 1.8605287247278273e-05, + "loss": 0.2933, + "step": 12656 + }, + { + "epoch": 0.25316, + "grad_norm": 2.3175432682037354, + "learning_rate": 1.8604575904143723e-05, + "loss": 0.1346, + "step": 12658 + }, + { + "epoch": 0.2532, + "grad_norm": 1.4867122173309326, + "learning_rate": 1.8603864393258534e-05, + "loss": 0.3951, + "step": 12660 + }, + { + "epoch": 0.25324, + "grad_norm": 0.9004172086715698, + "learning_rate": 1.8603152714636582e-05, + "loss": 0.1651, + "step": 12662 + }, + { + "epoch": 0.25328, + "grad_norm": 0.7023975253105164, + "learning_rate": 1.8602440868291736e-05, + "loss": 0.4961, + "step": 12664 + }, + { + "epoch": 0.25332, + "grad_norm": 0.6705012321472168, + "learning_rate": 1.8601728854237876e-05, + "loss": 0.0598, + "step": 12666 + }, + { + "epoch": 0.25336, + "grad_norm": 0.3566977083683014, + "learning_rate": 1.8601016672488887e-05, + "loss": 0.0391, + "step": 12668 + }, + { + "epoch": 0.2534, + "grad_norm": 1.5638930797576904, + "learning_rate": 1.860030432305865e-05, + "loss": 0.0841, + "step": 12670 + }, + { + "epoch": 0.25344, + "grad_norm": 1.989863395690918, + "learning_rate": 1.8599591805961047e-05, + "loss": 0.3316, + "step": 12672 + }, + { + "epoch": 0.25348, + "grad_norm": 1.189658522605896, + "learning_rate": 1.859887912120998e-05, + "loss": 0.1793, + "step": 12674 + }, + { + "epoch": 0.25352, + "grad_norm": 1.8592965602874756, + "learning_rate": 1.8598166268819334e-05, + "loss": 0.1013, + "step": 12676 + }, + { + "epoch": 0.25356, + "grad_norm": 0.3375697135925293, + "learning_rate": 1.8597453248803014e-05, + "loss": 0.0993, + "step": 12678 + }, + { + "epoch": 0.2536, + "grad_norm": 0.33944469690322876, + "learning_rate": 1.8596740061174912e-05, + "loss": 0.0718, + "step": 12680 + }, + { + "epoch": 0.25364, + "grad_norm": 0.8336483240127563, + "learning_rate": 1.8596026705948938e-05, + "loss": 0.1967, + "step": 12682 + }, + { + "epoch": 0.25368, + "grad_norm": 0.12364495545625687, + "learning_rate": 1.8595313183139e-05, + "loss": 0.0682, + "step": 12684 + }, + { + "epoch": 0.25372, + "grad_norm": 1.7464251518249512, + "learning_rate": 1.8594599492759005e-05, + "loss": 0.0868, + "step": 12686 + }, + { + "epoch": 0.25376, + "grad_norm": 2.57926344871521, + "learning_rate": 1.8593885634822866e-05, + "loss": 0.1332, + "step": 12688 + }, + { + "epoch": 0.2538, + "grad_norm": 3.7450473308563232, + "learning_rate": 1.8593171609344505e-05, + "loss": 0.2696, + "step": 12690 + }, + { + "epoch": 0.25384, + "grad_norm": 2.8582863807678223, + "learning_rate": 1.859245741633784e-05, + "loss": 0.1645, + "step": 12692 + }, + { + "epoch": 0.25388, + "grad_norm": 0.5171008706092834, + "learning_rate": 1.8591743055816792e-05, + "loss": 0.0296, + "step": 12694 + }, + { + "epoch": 0.25392, + "grad_norm": 1.0411442518234253, + "learning_rate": 1.8591028527795288e-05, + "loss": 0.0604, + "step": 12696 + }, + { + "epoch": 0.25396, + "grad_norm": 1.7405651807785034, + "learning_rate": 1.8590313832287262e-05, + "loss": 0.0943, + "step": 12698 + }, + { + "epoch": 0.254, + "grad_norm": 0.27991607785224915, + "learning_rate": 1.8589598969306646e-05, + "loss": 0.0267, + "step": 12700 + }, + { + "epoch": 0.25404, + "grad_norm": 0.1959490031003952, + "learning_rate": 1.8588883938867376e-05, + "loss": 0.0095, + "step": 12702 + }, + { + "epoch": 0.25408, + "grad_norm": 3.220431089401245, + "learning_rate": 1.858816874098339e-05, + "loss": 0.1657, + "step": 12704 + }, + { + "epoch": 0.25412, + "grad_norm": 1.2522345781326294, + "learning_rate": 1.8587453375668635e-05, + "loss": 0.1488, + "step": 12706 + }, + { + "epoch": 0.25416, + "grad_norm": 0.09901125729084015, + "learning_rate": 1.8586737842937052e-05, + "loss": 0.0036, + "step": 12708 + }, + { + "epoch": 0.2542, + "grad_norm": 1.8055063486099243, + "learning_rate": 1.8586022142802597e-05, + "loss": 0.6557, + "step": 12710 + }, + { + "epoch": 0.25424, + "grad_norm": 2.610541343688965, + "learning_rate": 1.858530627527922e-05, + "loss": 0.0947, + "step": 12712 + }, + { + "epoch": 0.25428, + "grad_norm": 6.8286333084106445, + "learning_rate": 1.8584590240380877e-05, + "loss": 0.6098, + "step": 12714 + }, + { + "epoch": 0.25432, + "grad_norm": 0.1525983363389969, + "learning_rate": 1.8583874038121523e-05, + "loss": 0.032, + "step": 12716 + }, + { + "epoch": 0.25436, + "grad_norm": 0.7769955992698669, + "learning_rate": 1.858315766851513e-05, + "loss": 0.0693, + "step": 12718 + }, + { + "epoch": 0.2544, + "grad_norm": 4.331348896026611, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.371, + "step": 12720 + }, + { + "epoch": 0.25444, + "grad_norm": 0.11427579075098038, + "learning_rate": 1.858172442731708e-05, + "loss": 0.0119, + "step": 12722 + }, + { + "epoch": 0.25448, + "grad_norm": 1.011147141456604, + "learning_rate": 1.8581007555753366e-05, + "loss": 0.2371, + "step": 12724 + }, + { + "epoch": 0.25452, + "grad_norm": 3.0604329109191895, + "learning_rate": 1.8580290516898487e-05, + "loss": 0.2004, + "step": 12726 + }, + { + "epoch": 0.25456, + "grad_norm": 0.8677058815956116, + "learning_rate": 1.857957331076643e-05, + "loss": 0.0454, + "step": 12728 + }, + { + "epoch": 0.2546, + "grad_norm": 2.068397045135498, + "learning_rate": 1.8578855937371176e-05, + "loss": 0.144, + "step": 12730 + }, + { + "epoch": 0.25464, + "grad_norm": 0.9188877940177917, + "learning_rate": 1.8578138396726707e-05, + "loss": 0.1585, + "step": 12732 + }, + { + "epoch": 0.25468, + "grad_norm": 1.4235403537750244, + "learning_rate": 1.857742068884701e-05, + "loss": 0.1302, + "step": 12734 + }, + { + "epoch": 0.25472, + "grad_norm": 2.414069175720215, + "learning_rate": 1.8576702813746087e-05, + "loss": 0.2766, + "step": 12736 + }, + { + "epoch": 0.25476, + "grad_norm": 1.3467870950698853, + "learning_rate": 1.8575984771437926e-05, + "loss": 0.1547, + "step": 12738 + }, + { + "epoch": 0.2548, + "grad_norm": 0.4867854118347168, + "learning_rate": 1.8575266561936526e-05, + "loss": 0.0593, + "step": 12740 + }, + { + "epoch": 0.25484, + "grad_norm": 3.3221027851104736, + "learning_rate": 1.8574548185255888e-05, + "loss": 0.2234, + "step": 12742 + }, + { + "epoch": 0.25488, + "grad_norm": 1.1255418062210083, + "learning_rate": 1.857382964141002e-05, + "loss": 0.2098, + "step": 12744 + }, + { + "epoch": 0.25492, + "grad_norm": 2.200483798980713, + "learning_rate": 1.8573110930412927e-05, + "loss": 0.1069, + "step": 12746 + }, + { + "epoch": 0.25496, + "grad_norm": 0.8556817770004272, + "learning_rate": 1.8572392052278623e-05, + "loss": 0.0608, + "step": 12748 + }, + { + "epoch": 0.255, + "grad_norm": 0.720320999622345, + "learning_rate": 1.8571673007021124e-05, + "loss": 0.0681, + "step": 12750 + }, + { + "epoch": 0.25504, + "grad_norm": 2.5409791469573975, + "learning_rate": 1.8570953794654446e-05, + "loss": 0.1592, + "step": 12752 + }, + { + "epoch": 0.25508, + "grad_norm": 2.0878281593322754, + "learning_rate": 1.8570234415192613e-05, + "loss": 0.1837, + "step": 12754 + }, + { + "epoch": 0.25512, + "grad_norm": 0.30694296956062317, + "learning_rate": 1.8569514868649645e-05, + "loss": 0.0362, + "step": 12756 + }, + { + "epoch": 0.25516, + "grad_norm": 1.1690855026245117, + "learning_rate": 1.8568795155039576e-05, + "loss": 0.3141, + "step": 12758 + }, + { + "epoch": 0.2552, + "grad_norm": 2.7478511333465576, + "learning_rate": 1.856807527437643e-05, + "loss": 0.1643, + "step": 12760 + }, + { + "epoch": 0.25524, + "grad_norm": 1.5158230066299438, + "learning_rate": 1.8567355226674248e-05, + "loss": 0.1071, + "step": 12762 + }, + { + "epoch": 0.25528, + "grad_norm": 0.7538143992424011, + "learning_rate": 1.8566635011947066e-05, + "loss": 0.0543, + "step": 12764 + }, + { + "epoch": 0.25532, + "grad_norm": 0.37436622381210327, + "learning_rate": 1.8565914630208924e-05, + "loss": 0.2193, + "step": 12766 + }, + { + "epoch": 0.25536, + "grad_norm": 3.2731194496154785, + "learning_rate": 1.8565194081473863e-05, + "loss": 0.1935, + "step": 12768 + }, + { + "epoch": 0.2554, + "grad_norm": 3.479818344116211, + "learning_rate": 1.8564473365755936e-05, + "loss": 0.331, + "step": 12770 + }, + { + "epoch": 0.25544, + "grad_norm": 3.23909330368042, + "learning_rate": 1.856375248306919e-05, + "loss": 0.1538, + "step": 12772 + }, + { + "epoch": 0.25548, + "grad_norm": 2.545485258102417, + "learning_rate": 1.856303143342768e-05, + "loss": 0.0823, + "step": 12774 + }, + { + "epoch": 0.25552, + "grad_norm": 3.6934258937835693, + "learning_rate": 1.8562310216845463e-05, + "loss": 0.1541, + "step": 12776 + }, + { + "epoch": 0.25556, + "grad_norm": 0.34110862016677856, + "learning_rate": 1.8561588833336607e-05, + "loss": 0.0376, + "step": 12778 + }, + { + "epoch": 0.2556, + "grad_norm": 5.949286937713623, + "learning_rate": 1.8560867282915164e-05, + "loss": 0.519, + "step": 12780 + }, + { + "epoch": 0.25564, + "grad_norm": 0.6632586717605591, + "learning_rate": 1.8560145565595204e-05, + "loss": 0.2138, + "step": 12782 + }, + { + "epoch": 0.25568, + "grad_norm": 5.159006118774414, + "learning_rate": 1.8559423681390803e-05, + "loss": 0.5078, + "step": 12784 + }, + { + "epoch": 0.25572, + "grad_norm": 1.813872218132019, + "learning_rate": 1.855870163031603e-05, + "loss": 0.0988, + "step": 12786 + }, + { + "epoch": 0.25576, + "grad_norm": 2.8547816276550293, + "learning_rate": 1.855797941238496e-05, + "loss": 0.1971, + "step": 12788 + }, + { + "epoch": 0.2558, + "grad_norm": 0.21688030660152435, + "learning_rate": 1.8557257027611677e-05, + "loss": 0.0659, + "step": 12790 + }, + { + "epoch": 0.25584, + "grad_norm": 0.7677943706512451, + "learning_rate": 1.855653447601026e-05, + "loss": 0.0352, + "step": 12792 + }, + { + "epoch": 0.25588, + "grad_norm": 1.2246853113174438, + "learning_rate": 1.8555811757594802e-05, + "loss": 0.1881, + "step": 12794 + }, + { + "epoch": 0.25592, + "grad_norm": 0.8859206438064575, + "learning_rate": 1.8555088872379387e-05, + "loss": 0.1266, + "step": 12796 + }, + { + "epoch": 0.25596, + "grad_norm": 1.2184722423553467, + "learning_rate": 1.8554365820378112e-05, + "loss": 0.1175, + "step": 12798 + }, + { + "epoch": 0.256, + "grad_norm": 1.4257628917694092, + "learning_rate": 1.855364260160507e-05, + "loss": 0.1441, + "step": 12800 + }, + { + "epoch": 0.25604, + "grad_norm": 3.112333059310913, + "learning_rate": 1.855291921607436e-05, + "loss": 0.405, + "step": 12802 + }, + { + "epoch": 0.25608, + "grad_norm": 1.447257161140442, + "learning_rate": 1.8552195663800088e-05, + "loss": 0.0546, + "step": 12804 + }, + { + "epoch": 0.25612, + "grad_norm": 1.336756944656372, + "learning_rate": 1.855147194479636e-05, + "loss": 0.1791, + "step": 12806 + }, + { + "epoch": 0.25616, + "grad_norm": 2.098148822784424, + "learning_rate": 1.8550748059077278e-05, + "loss": 0.2819, + "step": 12808 + }, + { + "epoch": 0.2562, + "grad_norm": 1.1489707231521606, + "learning_rate": 1.8550024006656967e-05, + "loss": 0.0727, + "step": 12810 + }, + { + "epoch": 0.25624, + "grad_norm": 0.3069790005683899, + "learning_rate": 1.8549299787549536e-05, + "loss": 0.0158, + "step": 12812 + }, + { + "epoch": 0.25628, + "grad_norm": 2.215352773666382, + "learning_rate": 1.85485754017691e-05, + "loss": 0.1205, + "step": 12814 + }, + { + "epoch": 0.25632, + "grad_norm": 0.20257307589054108, + "learning_rate": 1.8547850849329788e-05, + "loss": 0.0783, + "step": 12816 + }, + { + "epoch": 0.25636, + "grad_norm": 0.35657134652137756, + "learning_rate": 1.8547126130245724e-05, + "loss": 0.0347, + "step": 12818 + }, + { + "epoch": 0.2564, + "grad_norm": 0.9424009323120117, + "learning_rate": 1.854640124453103e-05, + "loss": 0.1061, + "step": 12820 + }, + { + "epoch": 0.25644, + "grad_norm": 0.4314141273498535, + "learning_rate": 1.854567619219985e-05, + "loss": 0.0363, + "step": 12822 + }, + { + "epoch": 0.25648, + "grad_norm": 3.8133480548858643, + "learning_rate": 1.854495097326631e-05, + "loss": 0.3801, + "step": 12824 + }, + { + "epoch": 0.25652, + "grad_norm": 0.172689288854599, + "learning_rate": 1.8544225587744554e-05, + "loss": 0.2397, + "step": 12826 + }, + { + "epoch": 0.25656, + "grad_norm": 1.1556005477905273, + "learning_rate": 1.854350003564872e-05, + "loss": 0.0463, + "step": 12828 + }, + { + "epoch": 0.2566, + "grad_norm": 0.18050609529018402, + "learning_rate": 1.8542774316992953e-05, + "loss": 0.1432, + "step": 12830 + }, + { + "epoch": 0.25664, + "grad_norm": 4.971765041351318, + "learning_rate": 1.8542048431791404e-05, + "loss": 0.5019, + "step": 12832 + }, + { + "epoch": 0.25668, + "grad_norm": 0.42597025632858276, + "learning_rate": 1.8541322380058223e-05, + "loss": 0.3355, + "step": 12834 + }, + { + "epoch": 0.25672, + "grad_norm": 2.301483631134033, + "learning_rate": 1.8540596161807565e-05, + "loss": 0.1296, + "step": 12836 + }, + { + "epoch": 0.25676, + "grad_norm": 0.6957600116729736, + "learning_rate": 1.8539869777053586e-05, + "loss": 0.0714, + "step": 12838 + }, + { + "epoch": 0.2568, + "grad_norm": 0.19620491564273834, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.1329, + "step": 12840 + }, + { + "epoch": 0.25684, + "grad_norm": 0.7964149117469788, + "learning_rate": 1.8538416508092323e-05, + "loss": 0.058, + "step": 12842 + }, + { + "epoch": 0.25688, + "grad_norm": 0.2697044909000397, + "learning_rate": 1.8537689623913366e-05, + "loss": 0.043, + "step": 12844 + }, + { + "epoch": 0.25692, + "grad_norm": 0.46370837092399597, + "learning_rate": 1.8536962573287757e-05, + "loss": 0.2675, + "step": 12846 + }, + { + "epoch": 0.25696, + "grad_norm": 4.686145305633545, + "learning_rate": 1.8536235356229667e-05, + "loss": 0.4928, + "step": 12848 + }, + { + "epoch": 0.257, + "grad_norm": 2.4907877445220947, + "learning_rate": 1.8535507972753275e-05, + "loss": 0.1113, + "step": 12850 + }, + { + "epoch": 0.25704, + "grad_norm": 6.4145588874816895, + "learning_rate": 1.853478042287276e-05, + "loss": 0.6756, + "step": 12852 + }, + { + "epoch": 0.25708, + "grad_norm": 0.19578035175800323, + "learning_rate": 1.853405270660231e-05, + "loss": 0.102, + "step": 12854 + }, + { + "epoch": 0.25712, + "grad_norm": 1.7220886945724487, + "learning_rate": 1.8533324823956104e-05, + "loss": 0.1595, + "step": 12856 + }, + { + "epoch": 0.25716, + "grad_norm": 2.1427438259124756, + "learning_rate": 1.8532596774948338e-05, + "loss": 0.2117, + "step": 12858 + }, + { + "epoch": 0.2572, + "grad_norm": 1.3394545316696167, + "learning_rate": 1.8531868559593205e-05, + "loss": 0.0987, + "step": 12860 + }, + { + "epoch": 0.25724, + "grad_norm": 3.5407803058624268, + "learning_rate": 1.8531140177904904e-05, + "loss": 0.2807, + "step": 12862 + }, + { + "epoch": 0.25728, + "grad_norm": 1.6242300271987915, + "learning_rate": 1.8530411629897628e-05, + "loss": 0.0843, + "step": 12864 + }, + { + "epoch": 0.25732, + "grad_norm": 0.3868776857852936, + "learning_rate": 1.852968291558559e-05, + "loss": 0.3522, + "step": 12866 + }, + { + "epoch": 0.25736, + "grad_norm": 1.1682415008544922, + "learning_rate": 1.8528954034982992e-05, + "loss": 0.1304, + "step": 12868 + }, + { + "epoch": 0.2574, + "grad_norm": 1.6794663667678833, + "learning_rate": 1.8528224988104044e-05, + "loss": 0.2986, + "step": 12870 + }, + { + "epoch": 0.25744, + "grad_norm": 0.5317840576171875, + "learning_rate": 1.852749577496296e-05, + "loss": 0.0275, + "step": 12872 + }, + { + "epoch": 0.25748, + "grad_norm": 0.2720170319080353, + "learning_rate": 1.8526766395573955e-05, + "loss": 0.2013, + "step": 12874 + }, + { + "epoch": 0.25752, + "grad_norm": 2.0679872035980225, + "learning_rate": 1.8526036849951247e-05, + "loss": 0.1551, + "step": 12876 + }, + { + "epoch": 0.25756, + "grad_norm": 2.064929962158203, + "learning_rate": 1.8525307138109064e-05, + "loss": 0.1092, + "step": 12878 + }, + { + "epoch": 0.2576, + "grad_norm": 0.6450939178466797, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.1431, + "step": 12880 + }, + { + "epoch": 0.25764, + "grad_norm": 1.9537156820297241, + "learning_rate": 1.8523847215823167e-05, + "loss": 0.1599, + "step": 12882 + }, + { + "epoch": 0.25768, + "grad_norm": 2.271742820739746, + "learning_rate": 1.852311700540792e-05, + "loss": 0.1647, + "step": 12884 + }, + { + "epoch": 0.25772, + "grad_norm": 2.0635905265808105, + "learning_rate": 1.8522386628830117e-05, + "loss": 0.2176, + "step": 12886 + }, + { + "epoch": 0.25776, + "grad_norm": 3.146209955215454, + "learning_rate": 1.8521656086103998e-05, + "loss": 0.1843, + "step": 12888 + }, + { + "epoch": 0.2578, + "grad_norm": 0.6826728582382202, + "learning_rate": 1.8520925377243812e-05, + "loss": 0.072, + "step": 12890 + }, + { + "epoch": 0.25784, + "grad_norm": 1.2926603555679321, + "learning_rate": 1.852019450226379e-05, + "loss": 0.1388, + "step": 12892 + }, + { + "epoch": 0.25788, + "grad_norm": 0.1460658311843872, + "learning_rate": 1.8519463461178198e-05, + "loss": 0.043, + "step": 12894 + }, + { + "epoch": 0.25792, + "grad_norm": 2.065156936645508, + "learning_rate": 1.8518732254001275e-05, + "loss": 0.292, + "step": 12896 + }, + { + "epoch": 0.25796, + "grad_norm": 2.3534648418426514, + "learning_rate": 1.8518000880747286e-05, + "loss": 0.3168, + "step": 12898 + }, + { + "epoch": 0.258, + "grad_norm": 0.4880741238594055, + "learning_rate": 1.851726934143048e-05, + "loss": 0.0365, + "step": 12900 + }, + { + "epoch": 0.25804, + "grad_norm": 3.0832114219665527, + "learning_rate": 1.851653763606512e-05, + "loss": 0.3012, + "step": 12902 + }, + { + "epoch": 0.25808, + "grad_norm": 0.8673762679100037, + "learning_rate": 1.8515805764665478e-05, + "loss": 0.1268, + "step": 12904 + }, + { + "epoch": 0.25812, + "grad_norm": 2.2275872230529785, + "learning_rate": 1.851507372724582e-05, + "loss": 0.1707, + "step": 12906 + }, + { + "epoch": 0.25816, + "grad_norm": 1.1149704456329346, + "learning_rate": 1.8514341523820417e-05, + "loss": 0.0951, + "step": 12908 + }, + { + "epoch": 0.2582, + "grad_norm": 2.6488773822784424, + "learning_rate": 1.8513609154403535e-05, + "loss": 0.0547, + "step": 12910 + }, + { + "epoch": 0.25824, + "grad_norm": 1.0364302396774292, + "learning_rate": 1.8512876619009467e-05, + "loss": 0.1257, + "step": 12912 + }, + { + "epoch": 0.25828, + "grad_norm": 0.5439472794532776, + "learning_rate": 1.8512143917652486e-05, + "loss": 0.0634, + "step": 12914 + }, + { + "epoch": 0.25832, + "grad_norm": 0.9415821433067322, + "learning_rate": 1.8511411050346873e-05, + "loss": 0.047, + "step": 12916 + }, + { + "epoch": 0.25836, + "grad_norm": 4.734103202819824, + "learning_rate": 1.8510678017106923e-05, + "loss": 0.5052, + "step": 12918 + }, + { + "epoch": 0.2584, + "grad_norm": 3.3583688735961914, + "learning_rate": 1.850994481794692e-05, + "loss": 0.1684, + "step": 12920 + }, + { + "epoch": 0.25844, + "grad_norm": 1.9954699277877808, + "learning_rate": 1.8509211452881164e-05, + "loss": 0.3256, + "step": 12922 + }, + { + "epoch": 0.25848, + "grad_norm": 3.8018224239349365, + "learning_rate": 1.850847792192395e-05, + "loss": 0.2837, + "step": 12924 + }, + { + "epoch": 0.25852, + "grad_norm": 0.13819772005081177, + "learning_rate": 1.8507744225089576e-05, + "loss": 0.0342, + "step": 12926 + }, + { + "epoch": 0.25856, + "grad_norm": 4.417327404022217, + "learning_rate": 1.8507010362392352e-05, + "loss": 0.4574, + "step": 12928 + }, + { + "epoch": 0.2586, + "grad_norm": 1.3854364156723022, + "learning_rate": 1.850627633384658e-05, + "loss": 0.0926, + "step": 12930 + }, + { + "epoch": 0.25864, + "grad_norm": 2.0202832221984863, + "learning_rate": 1.850554213946657e-05, + "loss": 0.2693, + "step": 12932 + }, + { + "epoch": 0.25868, + "grad_norm": 1.2339322566986084, + "learning_rate": 1.8504807779266637e-05, + "loss": 0.3147, + "step": 12934 + }, + { + "epoch": 0.25872, + "grad_norm": 2.416682720184326, + "learning_rate": 1.85040732532611e-05, + "loss": 0.2379, + "step": 12936 + }, + { + "epoch": 0.25876, + "grad_norm": 0.17299287021160126, + "learning_rate": 1.8503338561464273e-05, + "loss": 0.0227, + "step": 12938 + }, + { + "epoch": 0.2588, + "grad_norm": 2.8627631664276123, + "learning_rate": 1.8502603703890488e-05, + "loss": 0.3885, + "step": 12940 + }, + { + "epoch": 0.25884, + "grad_norm": 2.1979868412017822, + "learning_rate": 1.8501868680554062e-05, + "loss": 0.3634, + "step": 12942 + }, + { + "epoch": 0.25888, + "grad_norm": 0.5339406728744507, + "learning_rate": 1.850113349146933e-05, + "loss": 0.0295, + "step": 12944 + }, + { + "epoch": 0.25892, + "grad_norm": 0.37028124928474426, + "learning_rate": 1.8500398136650627e-05, + "loss": 0.14, + "step": 12946 + }, + { + "epoch": 0.25896, + "grad_norm": 3.8550727367401123, + "learning_rate": 1.8499662616112282e-05, + "loss": 0.197, + "step": 12948 + }, + { + "epoch": 0.259, + "grad_norm": 3.994664430618286, + "learning_rate": 1.849892692986864e-05, + "loss": 0.2888, + "step": 12950 + }, + { + "epoch": 0.25904, + "grad_norm": 3.637413501739502, + "learning_rate": 1.8498191077934042e-05, + "loss": 0.3424, + "step": 12952 + }, + { + "epoch": 0.25908, + "grad_norm": 0.9729142785072327, + "learning_rate": 1.8497455060322834e-05, + "loss": 0.1646, + "step": 12954 + }, + { + "epoch": 0.25912, + "grad_norm": 1.2947378158569336, + "learning_rate": 1.8496718877049367e-05, + "loss": 0.0669, + "step": 12956 + }, + { + "epoch": 0.25916, + "grad_norm": 1.2030107975006104, + "learning_rate": 1.8495982528127988e-05, + "loss": 0.0829, + "step": 12958 + }, + { + "epoch": 0.2592, + "grad_norm": 0.3153713345527649, + "learning_rate": 1.8495246013573057e-05, + "loss": 0.0423, + "step": 12960 + }, + { + "epoch": 0.25924, + "grad_norm": 3.287943124771118, + "learning_rate": 1.849450933339893e-05, + "loss": 0.3734, + "step": 12962 + }, + { + "epoch": 0.25928, + "grad_norm": 2.0768120288848877, + "learning_rate": 1.849377248761997e-05, + "loss": 0.1269, + "step": 12964 + }, + { + "epoch": 0.25932, + "grad_norm": 1.6580488681793213, + "learning_rate": 1.8493035476250542e-05, + "loss": 0.1148, + "step": 12966 + }, + { + "epoch": 0.25936, + "grad_norm": 0.8957072496414185, + "learning_rate": 1.8492298299305015e-05, + "loss": 0.092, + "step": 12968 + }, + { + "epoch": 0.2594, + "grad_norm": 0.5129411220550537, + "learning_rate": 1.8491560956797766e-05, + "loss": 0.1135, + "step": 12970 + }, + { + "epoch": 0.25944, + "grad_norm": 0.7412582039833069, + "learning_rate": 1.8490823448743157e-05, + "loss": 0.0854, + "step": 12972 + }, + { + "epoch": 0.25948, + "grad_norm": 0.6200820803642273, + "learning_rate": 1.8490085775155578e-05, + "loss": 0.1999, + "step": 12974 + }, + { + "epoch": 0.25952, + "grad_norm": 0.6613550186157227, + "learning_rate": 1.8489347936049403e-05, + "loss": 0.0513, + "step": 12976 + }, + { + "epoch": 0.25956, + "grad_norm": 2.620978593826294, + "learning_rate": 1.848860993143902e-05, + "loss": 0.5611, + "step": 12978 + }, + { + "epoch": 0.2596, + "grad_norm": 0.996439516544342, + "learning_rate": 1.848787176133882e-05, + "loss": 0.0652, + "step": 12980 + }, + { + "epoch": 0.25964, + "grad_norm": 0.924527108669281, + "learning_rate": 1.8487133425763186e-05, + "loss": 0.1387, + "step": 12982 + }, + { + "epoch": 0.25968, + "grad_norm": 0.8868405222892761, + "learning_rate": 1.848639492472652e-05, + "loss": 0.2533, + "step": 12984 + }, + { + "epoch": 0.25972, + "grad_norm": 1.9999319314956665, + "learning_rate": 1.8485656258243213e-05, + "loss": 0.2142, + "step": 12986 + }, + { + "epoch": 0.25976, + "grad_norm": 0.08810557425022125, + "learning_rate": 1.8484917426327667e-05, + "loss": 0.0182, + "step": 12988 + }, + { + "epoch": 0.2598, + "grad_norm": 1.0882701873779297, + "learning_rate": 1.848417842899429e-05, + "loss": 0.074, + "step": 12990 + }, + { + "epoch": 0.25984, + "grad_norm": 1.659543752670288, + "learning_rate": 1.8483439266257485e-05, + "loss": 0.3303, + "step": 12992 + }, + { + "epoch": 0.25988, + "grad_norm": 0.4674219489097595, + "learning_rate": 1.848269993813167e-05, + "loss": 0.0472, + "step": 12994 + }, + { + "epoch": 0.25992, + "grad_norm": 0.23368990421295166, + "learning_rate": 1.848196044463124e-05, + "loss": 0.2149, + "step": 12996 + }, + { + "epoch": 0.25996, + "grad_norm": 0.25018420815467834, + "learning_rate": 1.848122078577064e-05, + "loss": 0.326, + "step": 12998 + }, + { + "epoch": 0.26, + "grad_norm": 2.303804397583008, + "learning_rate": 1.848048096156426e-05, + "loss": 0.1684, + "step": 13000 + }, + { + "epoch": 0.26004, + "grad_norm": 0.3830341100692749, + "learning_rate": 1.8479740972026545e-05, + "loss": 0.0511, + "step": 13002 + }, + { + "epoch": 0.26008, + "grad_norm": 2.6052615642547607, + "learning_rate": 1.8479000817171912e-05, + "loss": 0.092, + "step": 13004 + }, + { + "epoch": 0.26012, + "grad_norm": 1.785607099533081, + "learning_rate": 1.8478260497014796e-05, + "loss": 0.2865, + "step": 13006 + }, + { + "epoch": 0.26016, + "grad_norm": 1.8647180795669556, + "learning_rate": 1.847752001156962e-05, + "loss": 0.1853, + "step": 13008 + }, + { + "epoch": 0.2602, + "grad_norm": 1.32681143283844, + "learning_rate": 1.8476779360850833e-05, + "loss": 0.0964, + "step": 13010 + }, + { + "epoch": 0.26024, + "grad_norm": 5.51923942565918, + "learning_rate": 1.8476038544872867e-05, + "loss": 0.7523, + "step": 13012 + }, + { + "epoch": 0.26028, + "grad_norm": 0.4814985394477844, + "learning_rate": 1.8475297563650164e-05, + "loss": 0.1328, + "step": 13014 + }, + { + "epoch": 0.26032, + "grad_norm": 0.7449128031730652, + "learning_rate": 1.8474556417197175e-05, + "loss": 0.294, + "step": 13016 + }, + { + "epoch": 0.26036, + "grad_norm": 1.477545976638794, + "learning_rate": 1.8473815105528342e-05, + "loss": 0.0755, + "step": 13018 + }, + { + "epoch": 0.2604, + "grad_norm": 1.2498903274536133, + "learning_rate": 1.8473073628658123e-05, + "loss": 0.241, + "step": 13020 + }, + { + "epoch": 0.26044, + "grad_norm": 1.4469594955444336, + "learning_rate": 1.8472331986600972e-05, + "loss": 0.0755, + "step": 13022 + }, + { + "epoch": 0.26048, + "grad_norm": 0.6290865540504456, + "learning_rate": 1.8471590179371346e-05, + "loss": 0.1022, + "step": 13024 + }, + { + "epoch": 0.26052, + "grad_norm": 1.6324102878570557, + "learning_rate": 1.8470848206983706e-05, + "loss": 0.2911, + "step": 13026 + }, + { + "epoch": 0.26056, + "grad_norm": 0.6799131035804749, + "learning_rate": 1.8470106069452522e-05, + "loss": 0.2204, + "step": 13028 + }, + { + "epoch": 0.2606, + "grad_norm": 0.29284337162971497, + "learning_rate": 1.8469363766792258e-05, + "loss": 0.1486, + "step": 13030 + }, + { + "epoch": 0.26064, + "grad_norm": 1.0575001239776611, + "learning_rate": 1.8468621299017388e-05, + "loss": 0.0593, + "step": 13032 + }, + { + "epoch": 0.26068, + "grad_norm": 3.0211665630340576, + "learning_rate": 1.8467878666142386e-05, + "loss": 0.235, + "step": 13034 + }, + { + "epoch": 0.26072, + "grad_norm": 1.4452314376831055, + "learning_rate": 1.8467135868181727e-05, + "loss": 0.3979, + "step": 13036 + }, + { + "epoch": 0.26076, + "grad_norm": 0.3922651410102844, + "learning_rate": 1.84663929051499e-05, + "loss": 0.0404, + "step": 13038 + }, + { + "epoch": 0.2608, + "grad_norm": 0.10685823112726212, + "learning_rate": 1.8465649777061377e-05, + "loss": 0.066, + "step": 13040 + }, + { + "epoch": 0.26084, + "grad_norm": 2.4469590187072754, + "learning_rate": 1.846490648393066e-05, + "loss": 0.1343, + "step": 13042 + }, + { + "epoch": 0.26088, + "grad_norm": 0.4468083083629608, + "learning_rate": 1.846416302577223e-05, + "loss": 0.2993, + "step": 13044 + }, + { + "epoch": 0.26092, + "grad_norm": 1.7830570936203003, + "learning_rate": 1.8463419402600586e-05, + "loss": 0.1386, + "step": 13046 + }, + { + "epoch": 0.26096, + "grad_norm": 1.813840389251709, + "learning_rate": 1.8462675614430223e-05, + "loss": 0.1193, + "step": 13048 + }, + { + "epoch": 0.261, + "grad_norm": 2.0503532886505127, + "learning_rate": 1.8461931661275642e-05, + "loss": 0.1935, + "step": 13050 + }, + { + "epoch": 0.26104, + "grad_norm": 0.9407958388328552, + "learning_rate": 1.8461187543151346e-05, + "loss": 0.1028, + "step": 13052 + }, + { + "epoch": 0.26108, + "grad_norm": 2.499138593673706, + "learning_rate": 1.8460443260071842e-05, + "loss": 0.1568, + "step": 13054 + }, + { + "epoch": 0.26112, + "grad_norm": 1.4799453020095825, + "learning_rate": 1.8459698812051642e-05, + "loss": 0.1034, + "step": 13056 + }, + { + "epoch": 0.26116, + "grad_norm": 0.5199181437492371, + "learning_rate": 1.845895419910526e-05, + "loss": 0.0282, + "step": 13058 + }, + { + "epoch": 0.2612, + "grad_norm": 2.183885097503662, + "learning_rate": 1.8458209421247208e-05, + "loss": 0.0713, + "step": 13060 + }, + { + "epoch": 0.26124, + "grad_norm": 1.8252264261245728, + "learning_rate": 1.845746447849201e-05, + "loss": 0.114, + "step": 13062 + }, + { + "epoch": 0.26128, + "grad_norm": 4.14583158493042, + "learning_rate": 1.845671937085419e-05, + "loss": 0.3077, + "step": 13064 + }, + { + "epoch": 0.26132, + "grad_norm": 1.097846269607544, + "learning_rate": 1.8455974098348267e-05, + "loss": 0.3434, + "step": 13066 + }, + { + "epoch": 0.26136, + "grad_norm": 1.3191756010055542, + "learning_rate": 1.845522866098878e-05, + "loss": 0.0588, + "step": 13068 + }, + { + "epoch": 0.2614, + "grad_norm": 4.279885768890381, + "learning_rate": 1.8454483058790254e-05, + "loss": 0.3788, + "step": 13070 + }, + { + "epoch": 0.26144, + "grad_norm": 3.5715651512145996, + "learning_rate": 1.845373729176723e-05, + "loss": 0.291, + "step": 13072 + }, + { + "epoch": 0.26148, + "grad_norm": 0.9713408350944519, + "learning_rate": 1.8452991359934247e-05, + "loss": 0.1841, + "step": 13074 + }, + { + "epoch": 0.26152, + "grad_norm": 1.1529017686843872, + "learning_rate": 1.8452245263305842e-05, + "loss": 0.1804, + "step": 13076 + }, + { + "epoch": 0.26156, + "grad_norm": 3.1645030975341797, + "learning_rate": 1.8451499001896566e-05, + "loss": 0.174, + "step": 13078 + }, + { + "epoch": 0.2616, + "grad_norm": 1.9316821098327637, + "learning_rate": 1.8450752575720967e-05, + "loss": 0.1926, + "step": 13080 + }, + { + "epoch": 0.26164, + "grad_norm": 2.345200777053833, + "learning_rate": 1.8450005984793595e-05, + "loss": 0.1643, + "step": 13082 + }, + { + "epoch": 0.26168, + "grad_norm": 0.4262254536151886, + "learning_rate": 1.8449259229129003e-05, + "loss": 0.1312, + "step": 13084 + }, + { + "epoch": 0.26172, + "grad_norm": 4.243289470672607, + "learning_rate": 1.8448512308741755e-05, + "loss": 0.4321, + "step": 13086 + }, + { + "epoch": 0.26176, + "grad_norm": 1.5151766538619995, + "learning_rate": 1.844776522364641e-05, + "loss": 0.0756, + "step": 13088 + }, + { + "epoch": 0.2618, + "grad_norm": 1.416543960571289, + "learning_rate": 1.844701797385753e-05, + "loss": 0.0957, + "step": 13090 + }, + { + "epoch": 0.26184, + "grad_norm": 5.472243309020996, + "learning_rate": 1.844627055938969e-05, + "loss": 0.559, + "step": 13092 + }, + { + "epoch": 0.26188, + "grad_norm": 1.421844244003296, + "learning_rate": 1.8445522980257452e-05, + "loss": 0.2155, + "step": 13094 + }, + { + "epoch": 0.26192, + "grad_norm": 3.8200430870056152, + "learning_rate": 1.8444775236475402e-05, + "loss": 0.2214, + "step": 13096 + }, + { + "epoch": 0.26196, + "grad_norm": 0.5611984133720398, + "learning_rate": 1.8444027328058106e-05, + "loss": 0.0412, + "step": 13098 + }, + { + "epoch": 0.262, + "grad_norm": 1.1077096462249756, + "learning_rate": 1.8443279255020153e-05, + "loss": 0.4348, + "step": 13100 + }, + { + "epoch": 0.26204, + "grad_norm": 0.1607050746679306, + "learning_rate": 1.8442531017376122e-05, + "loss": 0.3073, + "step": 13102 + }, + { + "epoch": 0.26208, + "grad_norm": 0.19312061369419098, + "learning_rate": 1.8441782615140603e-05, + "loss": 0.2415, + "step": 13104 + }, + { + "epoch": 0.26212, + "grad_norm": 2.8654589653015137, + "learning_rate": 1.8441034048328185e-05, + "loss": 0.2131, + "step": 13106 + }, + { + "epoch": 0.26216, + "grad_norm": 1.8379548788070679, + "learning_rate": 1.8440285316953463e-05, + "loss": 0.1177, + "step": 13108 + }, + { + "epoch": 0.2622, + "grad_norm": 2.5005552768707275, + "learning_rate": 1.8439536421031035e-05, + "loss": 0.1274, + "step": 13110 + }, + { + "epoch": 0.26224, + "grad_norm": 0.8177522420883179, + "learning_rate": 1.84387873605755e-05, + "loss": 0.0726, + "step": 13112 + }, + { + "epoch": 0.26228, + "grad_norm": 4.018287181854248, + "learning_rate": 1.8438038135601456e-05, + "loss": 0.4139, + "step": 13114 + }, + { + "epoch": 0.26232, + "grad_norm": 1.9863694906234741, + "learning_rate": 1.8437288746123518e-05, + "loss": 0.1841, + "step": 13116 + }, + { + "epoch": 0.26236, + "grad_norm": 1.142555594444275, + "learning_rate": 1.843653919215629e-05, + "loss": 0.1221, + "step": 13118 + }, + { + "epoch": 0.2624, + "grad_norm": 3.874295473098755, + "learning_rate": 1.843578947371439e-05, + "loss": 0.2467, + "step": 13120 + }, + { + "epoch": 0.26244, + "grad_norm": 0.5656265616416931, + "learning_rate": 1.843503959081243e-05, + "loss": 0.077, + "step": 13122 + }, + { + "epoch": 0.26248, + "grad_norm": 0.459708571434021, + "learning_rate": 1.843428954346503e-05, + "loss": 0.048, + "step": 13124 + }, + { + "epoch": 0.26252, + "grad_norm": 2.1648237705230713, + "learning_rate": 1.8433539331686812e-05, + "loss": 0.3209, + "step": 13126 + }, + { + "epoch": 0.26256, + "grad_norm": 1.2376688718795776, + "learning_rate": 1.8432788955492404e-05, + "loss": 0.1269, + "step": 13128 + }, + { + "epoch": 0.2626, + "grad_norm": 2.6757755279541016, + "learning_rate": 1.8432038414896432e-05, + "loss": 0.2729, + "step": 13130 + }, + { + "epoch": 0.26264, + "grad_norm": 0.22687818109989166, + "learning_rate": 1.843128770991353e-05, + "loss": 0.0556, + "step": 13132 + }, + { + "epoch": 0.26268, + "grad_norm": 0.1206810250878334, + "learning_rate": 1.8430536840558337e-05, + "loss": 0.1403, + "step": 13134 + }, + { + "epoch": 0.26272, + "grad_norm": 1.9791617393493652, + "learning_rate": 1.842978580684549e-05, + "loss": 0.1546, + "step": 13136 + }, + { + "epoch": 0.26276, + "grad_norm": 0.1869966685771942, + "learning_rate": 1.8429034608789626e-05, + "loss": 0.0236, + "step": 13138 + }, + { + "epoch": 0.2628, + "grad_norm": 1.006481647491455, + "learning_rate": 1.842828324640539e-05, + "loss": 0.1694, + "step": 13140 + }, + { + "epoch": 0.26284, + "grad_norm": 1.5045253038406372, + "learning_rate": 1.8427531719707433e-05, + "loss": 0.2912, + "step": 13142 + }, + { + "epoch": 0.26288, + "grad_norm": 3.50871205329895, + "learning_rate": 1.842678002871041e-05, + "loss": 0.2204, + "step": 13144 + }, + { + "epoch": 0.26292, + "grad_norm": 0.7588958740234375, + "learning_rate": 1.842602817342897e-05, + "loss": 0.1684, + "step": 13146 + }, + { + "epoch": 0.26296, + "grad_norm": 1.8760360479354858, + "learning_rate": 1.842527615387777e-05, + "loss": 0.2376, + "step": 13148 + }, + { + "epoch": 0.263, + "grad_norm": 0.5472724437713623, + "learning_rate": 1.842452397007148e-05, + "loss": 0.0332, + "step": 13150 + }, + { + "epoch": 0.26304, + "grad_norm": 0.8816221356391907, + "learning_rate": 1.8423771622024752e-05, + "loss": 0.1685, + "step": 13152 + }, + { + "epoch": 0.26308, + "grad_norm": 2.1365199089050293, + "learning_rate": 1.842301910975226e-05, + "loss": 0.4065, + "step": 13154 + }, + { + "epoch": 0.26312, + "grad_norm": 1.1899921894073486, + "learning_rate": 1.8422266433268677e-05, + "loss": 0.0997, + "step": 13156 + }, + { + "epoch": 0.26316, + "grad_norm": 2.8447394371032715, + "learning_rate": 1.842151359258867e-05, + "loss": 0.2635, + "step": 13158 + }, + { + "epoch": 0.2632, + "grad_norm": 0.882147490978241, + "learning_rate": 1.8420760587726925e-05, + "loss": 0.0554, + "step": 13160 + }, + { + "epoch": 0.26324, + "grad_norm": 0.4778634011745453, + "learning_rate": 1.8420007418698115e-05, + "loss": 0.064, + "step": 13162 + }, + { + "epoch": 0.26328, + "grad_norm": 0.6153609156608582, + "learning_rate": 1.8419254085516923e-05, + "loss": 0.1272, + "step": 13164 + }, + { + "epoch": 0.26332, + "grad_norm": 3.5406980514526367, + "learning_rate": 1.8418500588198042e-05, + "loss": 0.3049, + "step": 13166 + }, + { + "epoch": 0.26336, + "grad_norm": 1.2041414976119995, + "learning_rate": 1.8417746926756157e-05, + "loss": 0.0695, + "step": 13168 + }, + { + "epoch": 0.2634, + "grad_norm": 1.9515597820281982, + "learning_rate": 1.8416993101205957e-05, + "loss": 0.1214, + "step": 13170 + }, + { + "epoch": 0.26344, + "grad_norm": 1.1206248998641968, + "learning_rate": 1.841623911156215e-05, + "loss": 0.1194, + "step": 13172 + }, + { + "epoch": 0.26348, + "grad_norm": 1.2936654090881348, + "learning_rate": 1.8415484957839425e-05, + "loss": 0.074, + "step": 13174 + }, + { + "epoch": 0.26352, + "grad_norm": 4.6961894035339355, + "learning_rate": 1.841473064005249e-05, + "loss": 0.3279, + "step": 13176 + }, + { + "epoch": 0.26356, + "grad_norm": 1.5797566175460815, + "learning_rate": 1.8413976158216045e-05, + "loss": 0.1904, + "step": 13178 + }, + { + "epoch": 0.2636, + "grad_norm": 3.512632131576538, + "learning_rate": 1.8413221512344805e-05, + "loss": 0.206, + "step": 13180 + }, + { + "epoch": 0.26364, + "grad_norm": 1.7019661664962769, + "learning_rate": 1.8412466702453484e-05, + "loss": 0.103, + "step": 13182 + }, + { + "epoch": 0.26368, + "grad_norm": 0.6436877846717834, + "learning_rate": 1.8411711728556787e-05, + "loss": 0.1006, + "step": 13184 + }, + { + "epoch": 0.26372, + "grad_norm": 4.494359493255615, + "learning_rate": 1.8410956590669444e-05, + "loss": 0.3045, + "step": 13186 + }, + { + "epoch": 0.26376, + "grad_norm": 0.8744049668312073, + "learning_rate": 1.8410201288806168e-05, + "loss": 0.0309, + "step": 13188 + }, + { + "epoch": 0.2638, + "grad_norm": 1.927114725112915, + "learning_rate": 1.8409445822981694e-05, + "loss": 0.0897, + "step": 13190 + }, + { + "epoch": 0.26384, + "grad_norm": 0.64263916015625, + "learning_rate": 1.8408690193210737e-05, + "loss": 0.1203, + "step": 13192 + }, + { + "epoch": 0.26388, + "grad_norm": 0.5110843181610107, + "learning_rate": 1.840793439950804e-05, + "loss": 0.1652, + "step": 13194 + }, + { + "epoch": 0.26392, + "grad_norm": 0.4440609812736511, + "learning_rate": 1.8407178441888332e-05, + "loss": 0.0284, + "step": 13196 + }, + { + "epoch": 0.26396, + "grad_norm": 2.901899576187134, + "learning_rate": 1.8406422320366354e-05, + "loss": 0.179, + "step": 13198 + }, + { + "epoch": 0.264, + "grad_norm": 0.4352920651435852, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.1133, + "step": 13200 + }, + { + "epoch": 0.26404, + "grad_norm": 0.6128321886062622, + "learning_rate": 1.8404909585674547e-05, + "loss": 0.0934, + "step": 13202 + }, + { + "epoch": 0.26408, + "grad_norm": 0.7929186224937439, + "learning_rate": 1.8404152972534212e-05, + "loss": 0.1928, + "step": 13204 + }, + { + "epoch": 0.26412, + "grad_norm": 0.8801857829093933, + "learning_rate": 1.8403396195550583e-05, + "loss": 0.0769, + "step": 13206 + }, + { + "epoch": 0.26416, + "grad_norm": 4.376862525939941, + "learning_rate": 1.8402639254738422e-05, + "loss": 0.3155, + "step": 13208 + }, + { + "epoch": 0.2642, + "grad_norm": 1.745628833770752, + "learning_rate": 1.8401882150112485e-05, + "loss": 0.1744, + "step": 13210 + }, + { + "epoch": 0.26424, + "grad_norm": 4.522614002227783, + "learning_rate": 1.8401124881687526e-05, + "loss": 0.2369, + "step": 13212 + }, + { + "epoch": 0.26428, + "grad_norm": 1.0061336755752563, + "learning_rate": 1.8400367449478315e-05, + "loss": 0.0636, + "step": 13214 + }, + { + "epoch": 0.26432, + "grad_norm": 4.486847400665283, + "learning_rate": 1.8399609853499614e-05, + "loss": 0.568, + "step": 13216 + }, + { + "epoch": 0.26436, + "grad_norm": 4.648692607879639, + "learning_rate": 1.8398852093766195e-05, + "loss": 0.5125, + "step": 13218 + }, + { + "epoch": 0.2644, + "grad_norm": 3.577181577682495, + "learning_rate": 1.839809417029283e-05, + "loss": 0.4266, + "step": 13220 + }, + { + "epoch": 0.26444, + "grad_norm": 2.374598503112793, + "learning_rate": 1.8397336083094297e-05, + "loss": 0.1278, + "step": 13222 + }, + { + "epoch": 0.26448, + "grad_norm": 0.42310023307800293, + "learning_rate": 1.839657783218537e-05, + "loss": 0.299, + "step": 13224 + }, + { + "epoch": 0.26452, + "grad_norm": 1.2712061405181885, + "learning_rate": 1.839581941758084e-05, + "loss": 0.0557, + "step": 13226 + }, + { + "epoch": 0.26456, + "grad_norm": 1.611876130104065, + "learning_rate": 1.8395060839295488e-05, + "loss": 0.146, + "step": 13228 + }, + { + "epoch": 0.2646, + "grad_norm": 0.26951366662979126, + "learning_rate": 1.8394302097344103e-05, + "loss": 0.029, + "step": 13230 + }, + { + "epoch": 0.26464, + "grad_norm": 1.817070722579956, + "learning_rate": 1.8393543191741473e-05, + "loss": 0.1347, + "step": 13232 + }, + { + "epoch": 0.26468, + "grad_norm": 1.585113286972046, + "learning_rate": 1.8392784122502398e-05, + "loss": 0.1246, + "step": 13234 + }, + { + "epoch": 0.26472, + "grad_norm": 3.463995933532715, + "learning_rate": 1.8392024889641678e-05, + "loss": 0.3887, + "step": 13236 + }, + { + "epoch": 0.26476, + "grad_norm": 0.4837510585784912, + "learning_rate": 1.839126549317411e-05, + "loss": 0.0439, + "step": 13238 + }, + { + "epoch": 0.2648, + "grad_norm": 0.7327644228935242, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.0677, + "step": 13240 + }, + { + "epoch": 0.26484, + "grad_norm": 0.9807936549186707, + "learning_rate": 1.8389746209477662e-05, + "loss": 0.1071, + "step": 13242 + }, + { + "epoch": 0.26488, + "grad_norm": 0.5484764575958252, + "learning_rate": 1.83889863222784e-05, + "loss": 0.0404, + "step": 13244 + }, + { + "epoch": 0.26492, + "grad_norm": 1.2883450984954834, + "learning_rate": 1.838822627153153e-05, + "loss": 0.0701, + "step": 13246 + }, + { + "epoch": 0.26496, + "grad_norm": 0.3366704285144806, + "learning_rate": 1.8387466057251872e-05, + "loss": 0.0992, + "step": 13248 + }, + { + "epoch": 0.265, + "grad_norm": 3.1932449340820312, + "learning_rate": 1.8386705679454243e-05, + "loss": 0.1963, + "step": 13250 + }, + { + "epoch": 0.26504, + "grad_norm": 1.6078327894210815, + "learning_rate": 1.838594513815347e-05, + "loss": 0.2908, + "step": 13252 + }, + { + "epoch": 0.26508, + "grad_norm": 1.331251621246338, + "learning_rate": 1.8385184433364378e-05, + "loss": 0.0943, + "step": 13254 + }, + { + "epoch": 0.26512, + "grad_norm": 2.7168331146240234, + "learning_rate": 1.8384423565101798e-05, + "loss": 0.1966, + "step": 13256 + }, + { + "epoch": 0.26516, + "grad_norm": 0.56437748670578, + "learning_rate": 1.8383662533380568e-05, + "loss": 0.0711, + "step": 13258 + }, + { + "epoch": 0.2652, + "grad_norm": 0.8733907341957092, + "learning_rate": 1.8382901338215515e-05, + "loss": 0.0464, + "step": 13260 + }, + { + "epoch": 0.26524, + "grad_norm": 1.2734105587005615, + "learning_rate": 1.8382139979621486e-05, + "loss": 0.0951, + "step": 13262 + }, + { + "epoch": 0.26528, + "grad_norm": 3.778334379196167, + "learning_rate": 1.8381378457613327e-05, + "loss": 0.3866, + "step": 13264 + }, + { + "epoch": 0.26532, + "grad_norm": 3.3344883918762207, + "learning_rate": 1.8380616772205876e-05, + "loss": 0.2777, + "step": 13266 + }, + { + "epoch": 0.26536, + "grad_norm": 1.4218171834945679, + "learning_rate": 1.8379854923413988e-05, + "loss": 0.6334, + "step": 13268 + }, + { + "epoch": 0.2654, + "grad_norm": 0.7205459475517273, + "learning_rate": 1.8379092911252515e-05, + "loss": 0.042, + "step": 13270 + }, + { + "epoch": 0.26544, + "grad_norm": 0.6167098879814148, + "learning_rate": 1.8378330735736313e-05, + "loss": 0.0931, + "step": 13272 + }, + { + "epoch": 0.26548, + "grad_norm": 0.42512258887290955, + "learning_rate": 1.8377568396880236e-05, + "loss": 0.0474, + "step": 13274 + }, + { + "epoch": 0.26552, + "grad_norm": 2.9507153034210205, + "learning_rate": 1.837680589469915e-05, + "loss": 0.205, + "step": 13276 + }, + { + "epoch": 0.26556, + "grad_norm": 1.049381136894226, + "learning_rate": 1.837604322920792e-05, + "loss": 0.112, + "step": 13278 + }, + { + "epoch": 0.2656, + "grad_norm": 1.2420353889465332, + "learning_rate": 1.837528040042142e-05, + "loss": 0.0964, + "step": 13280 + }, + { + "epoch": 0.26564, + "grad_norm": 0.6771272420883179, + "learning_rate": 1.837451740835451e-05, + "loss": 0.1464, + "step": 13282 + }, + { + "epoch": 0.26568, + "grad_norm": 2.897874116897583, + "learning_rate": 1.8373754253022078e-05, + "loss": 0.2768, + "step": 13284 + }, + { + "epoch": 0.26572, + "grad_norm": 2.2719337940216064, + "learning_rate": 1.8372990934438993e-05, + "loss": 0.0702, + "step": 13286 + }, + { + "epoch": 0.26576, + "grad_norm": 0.7799497246742249, + "learning_rate": 1.8372227452620137e-05, + "loss": 0.0292, + "step": 13288 + }, + { + "epoch": 0.2658, + "grad_norm": 0.38331475853919983, + "learning_rate": 1.83714638075804e-05, + "loss": 0.0734, + "step": 13290 + }, + { + "epoch": 0.26584, + "grad_norm": 1.5784164667129517, + "learning_rate": 1.8370699999334666e-05, + "loss": 0.1526, + "step": 13292 + }, + { + "epoch": 0.26588, + "grad_norm": 1.7907809019088745, + "learning_rate": 1.8369936027897823e-05, + "loss": 0.1097, + "step": 13294 + }, + { + "epoch": 0.26592, + "grad_norm": 0.7917414307594299, + "learning_rate": 1.836917189328477e-05, + "loss": 0.1965, + "step": 13296 + }, + { + "epoch": 0.26596, + "grad_norm": 4.616872310638428, + "learning_rate": 1.83684075955104e-05, + "loss": 0.4318, + "step": 13298 + }, + { + "epoch": 0.266, + "grad_norm": 2.181004524230957, + "learning_rate": 1.836764313458962e-05, + "loss": 0.176, + "step": 13300 + }, + { + "epoch": 0.26604, + "grad_norm": 1.3001112937927246, + "learning_rate": 1.8366878510537323e-05, + "loss": 0.1441, + "step": 13302 + }, + { + "epoch": 0.26608, + "grad_norm": 0.6612851023674011, + "learning_rate": 1.836611372336843e-05, + "loss": 0.034, + "step": 13304 + }, + { + "epoch": 0.26612, + "grad_norm": 1.2978699207305908, + "learning_rate": 1.8365348773097836e-05, + "loss": 0.0648, + "step": 13306 + }, + { + "epoch": 0.26616, + "grad_norm": 4.881285667419434, + "learning_rate": 1.836458365974046e-05, + "loss": 0.4016, + "step": 13308 + }, + { + "epoch": 0.2662, + "grad_norm": 1.8232345581054688, + "learning_rate": 1.8363818383311226e-05, + "loss": 0.1644, + "step": 13310 + }, + { + "epoch": 0.26624, + "grad_norm": 2.8476107120513916, + "learning_rate": 1.836305294382504e-05, + "loss": 0.2287, + "step": 13312 + }, + { + "epoch": 0.26628, + "grad_norm": 4.043719291687012, + "learning_rate": 1.8362287341296835e-05, + "loss": 0.2888, + "step": 13314 + }, + { + "epoch": 0.26632, + "grad_norm": 1.6907627582550049, + "learning_rate": 1.8361521575741533e-05, + "loss": 0.2027, + "step": 13316 + }, + { + "epoch": 0.26636, + "grad_norm": 4.112112998962402, + "learning_rate": 1.8360755647174062e-05, + "loss": 0.47, + "step": 13318 + }, + { + "epoch": 0.2664, + "grad_norm": 2.740569829940796, + "learning_rate": 1.8359989555609355e-05, + "loss": 0.2465, + "step": 13320 + }, + { + "epoch": 0.26644, + "grad_norm": 1.6211426258087158, + "learning_rate": 1.835922330106235e-05, + "loss": 0.0825, + "step": 13322 + }, + { + "epoch": 0.26648, + "grad_norm": 0.38151174783706665, + "learning_rate": 1.8358456883547982e-05, + "loss": 0.0182, + "step": 13324 + }, + { + "epoch": 0.26652, + "grad_norm": 0.6401554346084595, + "learning_rate": 1.8357690303081194e-05, + "loss": 0.1204, + "step": 13326 + }, + { + "epoch": 0.26656, + "grad_norm": 0.19731436669826508, + "learning_rate": 1.835692355967693e-05, + "loss": 0.1971, + "step": 13328 + }, + { + "epoch": 0.2666, + "grad_norm": 3.383336305618286, + "learning_rate": 1.8356156653350138e-05, + "loss": 0.2919, + "step": 13330 + }, + { + "epoch": 0.26664, + "grad_norm": 2.828131914138794, + "learning_rate": 1.8355389584115767e-05, + "loss": 0.2893, + "step": 13332 + }, + { + "epoch": 0.26668, + "grad_norm": 0.6338841915130615, + "learning_rate": 1.835462235198878e-05, + "loss": 0.0932, + "step": 13334 + }, + { + "epoch": 0.26672, + "grad_norm": 0.25264880061149597, + "learning_rate": 1.8353854956984127e-05, + "loss": 0.0171, + "step": 13336 + }, + { + "epoch": 0.26676, + "grad_norm": 1.7723348140716553, + "learning_rate": 1.8353087399116768e-05, + "loss": 0.0929, + "step": 13338 + }, + { + "epoch": 0.2668, + "grad_norm": 0.607445478439331, + "learning_rate": 1.8352319678401677e-05, + "loss": 0.2562, + "step": 13340 + }, + { + "epoch": 0.26684, + "grad_norm": 0.40969118475914, + "learning_rate": 1.835155179485381e-05, + "loss": 0.0473, + "step": 13342 + }, + { + "epoch": 0.26688, + "grad_norm": 0.11803799867630005, + "learning_rate": 1.8350783748488135e-05, + "loss": 0.1195, + "step": 13344 + }, + { + "epoch": 0.26692, + "grad_norm": 3.061289072036743, + "learning_rate": 1.835001553931964e-05, + "loss": 0.3401, + "step": 13346 + }, + { + "epoch": 0.26696, + "grad_norm": 3.6428990364074707, + "learning_rate": 1.8349247167363287e-05, + "loss": 0.2531, + "step": 13348 + }, + { + "epoch": 0.267, + "grad_norm": 1.4959973096847534, + "learning_rate": 1.8348478632634067e-05, + "loss": 0.0951, + "step": 13350 + }, + { + "epoch": 0.26704, + "grad_norm": 3.378732919692993, + "learning_rate": 1.8347709935146958e-05, + "loss": 0.2845, + "step": 13352 + }, + { + "epoch": 0.26708, + "grad_norm": 1.1527683734893799, + "learning_rate": 1.834694107491694e-05, + "loss": 0.0885, + "step": 13354 + }, + { + "epoch": 0.26712, + "grad_norm": 0.4322059750556946, + "learning_rate": 1.8346172051959014e-05, + "loss": 0.2175, + "step": 13356 + }, + { + "epoch": 0.26716, + "grad_norm": 3.212174415588379, + "learning_rate": 1.8345402866288165e-05, + "loss": 0.3007, + "step": 13358 + }, + { + "epoch": 0.2672, + "grad_norm": 2.4002606868743896, + "learning_rate": 1.834463351791939e-05, + "loss": 0.1386, + "step": 13360 + }, + { + "epoch": 0.26724, + "grad_norm": 0.07350198924541473, + "learning_rate": 1.834386400686769e-05, + "loss": 0.067, + "step": 13362 + }, + { + "epoch": 0.26728, + "grad_norm": 0.23093752562999725, + "learning_rate": 1.8343094333148068e-05, + "loss": 0.0324, + "step": 13364 + }, + { + "epoch": 0.26732, + "grad_norm": 5.141496658325195, + "learning_rate": 1.8342324496775524e-05, + "loss": 0.4298, + "step": 13366 + }, + { + "epoch": 0.26736, + "grad_norm": 4.638444900512695, + "learning_rate": 1.834155449776507e-05, + "loss": 0.6266, + "step": 13368 + }, + { + "epoch": 0.2674, + "grad_norm": 3.1133623123168945, + "learning_rate": 1.8340784336131715e-05, + "loss": 0.3427, + "step": 13370 + }, + { + "epoch": 0.26744, + "grad_norm": 0.9743351936340332, + "learning_rate": 1.834001401189047e-05, + "loss": 0.0551, + "step": 13372 + }, + { + "epoch": 0.26748, + "grad_norm": 0.1266448050737381, + "learning_rate": 1.8339243525056367e-05, + "loss": 0.057, + "step": 13374 + }, + { + "epoch": 0.26752, + "grad_norm": 0.7559276819229126, + "learning_rate": 1.8338472875644417e-05, + "loss": 0.2313, + "step": 13376 + }, + { + "epoch": 0.26756, + "grad_norm": 0.3137763440608978, + "learning_rate": 1.833770206366964e-05, + "loss": 0.162, + "step": 13378 + }, + { + "epoch": 0.2676, + "grad_norm": 1.2111955881118774, + "learning_rate": 1.8336931089147076e-05, + "loss": 0.154, + "step": 13380 + }, + { + "epoch": 0.26764, + "grad_norm": 0.12986694276332855, + "learning_rate": 1.833615995209174e-05, + "loss": 0.0197, + "step": 13382 + }, + { + "epoch": 0.26768, + "grad_norm": 2.233372449874878, + "learning_rate": 1.833538865251868e-05, + "loss": 0.1647, + "step": 13384 + }, + { + "epoch": 0.26772, + "grad_norm": 2.471004009246826, + "learning_rate": 1.8334617190442926e-05, + "loss": 0.3616, + "step": 13386 + }, + { + "epoch": 0.26776, + "grad_norm": 0.7323565483093262, + "learning_rate": 1.8333845565879517e-05, + "loss": 0.0695, + "step": 13388 + }, + { + "epoch": 0.2678, + "grad_norm": 3.354944944381714, + "learning_rate": 1.83330737788435e-05, + "loss": 0.1533, + "step": 13390 + }, + { + "epoch": 0.26784, + "grad_norm": 1.001588225364685, + "learning_rate": 1.8332301829349918e-05, + "loss": 0.0666, + "step": 13392 + }, + { + "epoch": 0.26788, + "grad_norm": 1.3605215549468994, + "learning_rate": 1.8331529717413825e-05, + "loss": 0.2551, + "step": 13394 + }, + { + "epoch": 0.26792, + "grad_norm": 2.8472328186035156, + "learning_rate": 1.8330757443050266e-05, + "loss": 0.4995, + "step": 13396 + }, + { + "epoch": 0.26796, + "grad_norm": 0.5409247875213623, + "learning_rate": 1.8329985006274303e-05, + "loss": 0.0698, + "step": 13398 + }, + { + "epoch": 0.268, + "grad_norm": 0.2284543514251709, + "learning_rate": 1.8329212407100996e-05, + "loss": 0.1456, + "step": 13400 + }, + { + "epoch": 0.26804, + "grad_norm": 0.4225962162017822, + "learning_rate": 1.8328439645545404e-05, + "loss": 0.3165, + "step": 13402 + }, + { + "epoch": 0.26808, + "grad_norm": 1.5341953039169312, + "learning_rate": 1.8327666721622592e-05, + "loss": 0.2155, + "step": 13404 + }, + { + "epoch": 0.26812, + "grad_norm": 1.328489065170288, + "learning_rate": 1.8326893635347633e-05, + "loss": 0.08, + "step": 13406 + }, + { + "epoch": 0.26816, + "grad_norm": 2.201509475708008, + "learning_rate": 1.8326120386735595e-05, + "loss": 0.2026, + "step": 13408 + }, + { + "epoch": 0.2682, + "grad_norm": 1.8402854204177856, + "learning_rate": 1.832534697580155e-05, + "loss": 0.0764, + "step": 13410 + }, + { + "epoch": 0.26824, + "grad_norm": 1.3680170774459839, + "learning_rate": 1.8324573402560584e-05, + "loss": 0.2433, + "step": 13412 + }, + { + "epoch": 0.26828, + "grad_norm": 0.4880695641040802, + "learning_rate": 1.832379966702777e-05, + "loss": 0.1046, + "step": 13414 + }, + { + "epoch": 0.26832, + "grad_norm": 2.9456229209899902, + "learning_rate": 1.8323025769218198e-05, + "loss": 0.1745, + "step": 13416 + }, + { + "epoch": 0.26836, + "grad_norm": 0.6032606959342957, + "learning_rate": 1.8322251709146953e-05, + "loss": 0.0415, + "step": 13418 + }, + { + "epoch": 0.2684, + "grad_norm": 1.0807853937149048, + "learning_rate": 1.8321477486829128e-05, + "loss": 0.1711, + "step": 13420 + }, + { + "epoch": 0.26844, + "grad_norm": 1.2612396478652954, + "learning_rate": 1.832070310227981e-05, + "loss": 0.1589, + "step": 13422 + }, + { + "epoch": 0.26848, + "grad_norm": 1.9814019203186035, + "learning_rate": 1.8319928555514108e-05, + "loss": 0.31, + "step": 13424 + }, + { + "epoch": 0.26852, + "grad_norm": 0.2763594090938568, + "learning_rate": 1.8319153846547113e-05, + "loss": 0.328, + "step": 13426 + }, + { + "epoch": 0.26856, + "grad_norm": 4.0555620193481445, + "learning_rate": 1.831837897539393e-05, + "loss": 0.3423, + "step": 13428 + }, + { + "epoch": 0.2686, + "grad_norm": 4.195181846618652, + "learning_rate": 1.8317603942069665e-05, + "loss": 0.3261, + "step": 13430 + }, + { + "epoch": 0.26864, + "grad_norm": 4.071376323699951, + "learning_rate": 1.831682874658943e-05, + "loss": 0.3429, + "step": 13432 + }, + { + "epoch": 0.26868, + "grad_norm": 1.3577892780303955, + "learning_rate": 1.831605338896834e-05, + "loss": 0.1237, + "step": 13434 + }, + { + "epoch": 0.26872, + "grad_norm": 2.079383134841919, + "learning_rate": 1.83152778692215e-05, + "loss": 0.193, + "step": 13436 + }, + { + "epoch": 0.26876, + "grad_norm": 0.21151946485042572, + "learning_rate": 1.8314502187364045e-05, + "loss": 0.145, + "step": 13438 + }, + { + "epoch": 0.2688, + "grad_norm": 1.7085661888122559, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.1379, + "step": 13440 + }, + { + "epoch": 0.26884, + "grad_norm": 0.8711621165275574, + "learning_rate": 1.8312950337377752e-05, + "loss": 0.1439, + "step": 13442 + }, + { + "epoch": 0.26888, + "grad_norm": 2.324939012527466, + "learning_rate": 1.8312174169279175e-05, + "loss": 0.2324, + "step": 13444 + }, + { + "epoch": 0.26892, + "grad_norm": 1.9038413763046265, + "learning_rate": 1.831139783913048e-05, + "loss": 0.1046, + "step": 13446 + }, + { + "epoch": 0.26896, + "grad_norm": 1.2236344814300537, + "learning_rate": 1.83106213469468e-05, + "loss": 0.0671, + "step": 13448 + }, + { + "epoch": 0.269, + "grad_norm": 0.2837848663330078, + "learning_rate": 1.8309844692743283e-05, + "loss": 0.063, + "step": 13450 + }, + { + "epoch": 0.26904, + "grad_norm": 2.376680374145508, + "learning_rate": 1.8309067876535068e-05, + "loss": 0.1686, + "step": 13452 + }, + { + "epoch": 0.26908, + "grad_norm": 2.881920099258423, + "learning_rate": 1.8308290898337294e-05, + "loss": 0.2176, + "step": 13454 + }, + { + "epoch": 0.26912, + "grad_norm": 1.338384747505188, + "learning_rate": 1.8307513758165113e-05, + "loss": 0.0548, + "step": 13456 + }, + { + "epoch": 0.26916, + "grad_norm": 2.323641777038574, + "learning_rate": 1.8306736456033673e-05, + "loss": 0.1515, + "step": 13458 + }, + { + "epoch": 0.2692, + "grad_norm": 1.1423605680465698, + "learning_rate": 1.830595899195813e-05, + "loss": 0.1582, + "step": 13460 + }, + { + "epoch": 0.26924, + "grad_norm": 1.9832956790924072, + "learning_rate": 1.8305181365953636e-05, + "loss": 0.2256, + "step": 13462 + }, + { + "epoch": 0.26928, + "grad_norm": 2.639439344406128, + "learning_rate": 1.830440357803536e-05, + "loss": 0.1466, + "step": 13464 + }, + { + "epoch": 0.26932, + "grad_norm": 0.27571168541908264, + "learning_rate": 1.8303625628218462e-05, + "loss": 0.1059, + "step": 13466 + }, + { + "epoch": 0.26936, + "grad_norm": 0.3007839620113373, + "learning_rate": 1.8302847516518105e-05, + "loss": 0.0633, + "step": 13468 + }, + { + "epoch": 0.2694, + "grad_norm": 0.22574257850646973, + "learning_rate": 1.830206924294946e-05, + "loss": 0.3621, + "step": 13470 + }, + { + "epoch": 0.26944, + "grad_norm": 3.3344063758850098, + "learning_rate": 1.8301290807527703e-05, + "loss": 0.5537, + "step": 13472 + }, + { + "epoch": 0.26948, + "grad_norm": 0.7418382167816162, + "learning_rate": 1.8300512210268006e-05, + "loss": 0.0609, + "step": 13474 + }, + { + "epoch": 0.26952, + "grad_norm": 0.4085451364517212, + "learning_rate": 1.8299733451185554e-05, + "loss": 0.0303, + "step": 13476 + }, + { + "epoch": 0.26956, + "grad_norm": 0.9392039179801941, + "learning_rate": 1.8298954530295524e-05, + "loss": 0.426, + "step": 13478 + }, + { + "epoch": 0.2696, + "grad_norm": 2.713181972503662, + "learning_rate": 1.82981754476131e-05, + "loss": 0.1562, + "step": 13480 + }, + { + "epoch": 0.26964, + "grad_norm": 1.8924349546432495, + "learning_rate": 1.8297396203153474e-05, + "loss": 0.199, + "step": 13482 + }, + { + "epoch": 0.26968, + "grad_norm": 2.914430618286133, + "learning_rate": 1.829661679693184e-05, + "loss": 0.1777, + "step": 13484 + }, + { + "epoch": 0.26972, + "grad_norm": 2.228182554244995, + "learning_rate": 1.8295837228963387e-05, + "loss": 0.1839, + "step": 13486 + }, + { + "epoch": 0.26976, + "grad_norm": 0.7170392274856567, + "learning_rate": 1.829505749926332e-05, + "loss": 0.2066, + "step": 13488 + }, + { + "epoch": 0.2698, + "grad_norm": 1.1007248163223267, + "learning_rate": 1.8294277607846834e-05, + "loss": 0.0776, + "step": 13490 + }, + { + "epoch": 0.26984, + "grad_norm": 1.6608154773712158, + "learning_rate": 1.8293497554729133e-05, + "loss": 0.3169, + "step": 13492 + }, + { + "epoch": 0.26988, + "grad_norm": 0.45031100511550903, + "learning_rate": 1.8292717339925433e-05, + "loss": 0.0989, + "step": 13494 + }, + { + "epoch": 0.26992, + "grad_norm": 0.8984354734420776, + "learning_rate": 1.8291936963450933e-05, + "loss": 0.1888, + "step": 13496 + }, + { + "epoch": 0.26996, + "grad_norm": 2.3774373531341553, + "learning_rate": 1.829115642532086e-05, + "loss": 0.2959, + "step": 13498 + }, + { + "epoch": 0.27, + "grad_norm": 2.2063093185424805, + "learning_rate": 1.8290375725550417e-05, + "loss": 0.1589, + "step": 13500 + }, + { + "epoch": 0.27004, + "grad_norm": 4.124106407165527, + "learning_rate": 1.8289594864154836e-05, + "loss": 0.3729, + "step": 13502 + }, + { + "epoch": 0.27008, + "grad_norm": 0.3193870186805725, + "learning_rate": 1.8288813841149333e-05, + "loss": 0.0557, + "step": 13504 + }, + { + "epoch": 0.27012, + "grad_norm": 1.3125792741775513, + "learning_rate": 1.8288032656549138e-05, + "loss": 0.2627, + "step": 13506 + }, + { + "epoch": 0.27016, + "grad_norm": 2.745957612991333, + "learning_rate": 1.828725131036948e-05, + "loss": 0.2761, + "step": 13508 + }, + { + "epoch": 0.2702, + "grad_norm": 0.5667065978050232, + "learning_rate": 1.828646980262559e-05, + "loss": 0.0641, + "step": 13510 + }, + { + "epoch": 0.27024, + "grad_norm": 0.921308696269989, + "learning_rate": 1.8285688133332704e-05, + "loss": 0.0594, + "step": 13512 + }, + { + "epoch": 0.27028, + "grad_norm": 0.8067626357078552, + "learning_rate": 1.8284906302506065e-05, + "loss": 0.0493, + "step": 13514 + }, + { + "epoch": 0.27032, + "grad_norm": 1.5873661041259766, + "learning_rate": 1.828412431016091e-05, + "loss": 0.0717, + "step": 13516 + }, + { + "epoch": 0.27036, + "grad_norm": 2.157350540161133, + "learning_rate": 1.8283342156312483e-05, + "loss": 0.2121, + "step": 13518 + }, + { + "epoch": 0.2704, + "grad_norm": 2.483289957046509, + "learning_rate": 1.8282559840976043e-05, + "loss": 0.2654, + "step": 13520 + }, + { + "epoch": 0.27044, + "grad_norm": 1.6786280870437622, + "learning_rate": 1.8281777364166832e-05, + "loss": 0.2299, + "step": 13522 + }, + { + "epoch": 0.27048, + "grad_norm": 2.7738821506500244, + "learning_rate": 1.8280994725900107e-05, + "loss": 0.1812, + "step": 13524 + }, + { + "epoch": 0.27052, + "grad_norm": 2.43896484375, + "learning_rate": 1.8280211926191126e-05, + "loss": 0.2884, + "step": 13526 + }, + { + "epoch": 0.27056, + "grad_norm": 3.4284443855285645, + "learning_rate": 1.8279428965055153e-05, + "loss": 0.3667, + "step": 13528 + }, + { + "epoch": 0.2706, + "grad_norm": 1.6317651271820068, + "learning_rate": 1.8278645842507448e-05, + "loss": 0.0951, + "step": 13530 + }, + { + "epoch": 0.27064, + "grad_norm": 3.276733636856079, + "learning_rate": 1.827786255856328e-05, + "loss": 0.4277, + "step": 13532 + }, + { + "epoch": 0.27068, + "grad_norm": 0.8985788822174072, + "learning_rate": 1.827707911323792e-05, + "loss": 0.0803, + "step": 13534 + }, + { + "epoch": 0.27072, + "grad_norm": 2.0466020107269287, + "learning_rate": 1.827629550654664e-05, + "loss": 0.222, + "step": 13536 + }, + { + "epoch": 0.27076, + "grad_norm": 0.3658325672149658, + "learning_rate": 1.827551173850472e-05, + "loss": 0.1025, + "step": 13538 + }, + { + "epoch": 0.2708, + "grad_norm": 2.502396821975708, + "learning_rate": 1.827472780912744e-05, + "loss": 0.1605, + "step": 13540 + }, + { + "epoch": 0.27084, + "grad_norm": 0.9946320056915283, + "learning_rate": 1.8273943718430082e-05, + "loss": 0.2719, + "step": 13542 + }, + { + "epoch": 0.27088, + "grad_norm": 2.1386420726776123, + "learning_rate": 1.8273159466427926e-05, + "loss": 0.2769, + "step": 13544 + }, + { + "epoch": 0.27092, + "grad_norm": 3.443119764328003, + "learning_rate": 1.8272375053136267e-05, + "loss": 0.3713, + "step": 13546 + }, + { + "epoch": 0.27096, + "grad_norm": 4.069324970245361, + "learning_rate": 1.8271590478570402e-05, + "loss": 0.4323, + "step": 13548 + }, + { + "epoch": 0.271, + "grad_norm": 1.6679490804672241, + "learning_rate": 1.827080574274562e-05, + "loss": 0.2023, + "step": 13550 + }, + { + "epoch": 0.27104, + "grad_norm": 2.3559956550598145, + "learning_rate": 1.827002084567722e-05, + "loss": 0.3279, + "step": 13552 + }, + { + "epoch": 0.27108, + "grad_norm": 1.1848734617233276, + "learning_rate": 1.8269235787380507e-05, + "loss": 0.0964, + "step": 13554 + }, + { + "epoch": 0.27112, + "grad_norm": 3.7747421264648438, + "learning_rate": 1.826845056787079e-05, + "loss": 0.4278, + "step": 13556 + }, + { + "epoch": 0.27116, + "grad_norm": 0.7345425486564636, + "learning_rate": 1.8267665187163367e-05, + "loss": 0.0648, + "step": 13558 + }, + { + "epoch": 0.2712, + "grad_norm": 0.1628524214029312, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.0383, + "step": 13560 + }, + { + "epoch": 0.27124, + "grad_norm": 2.132204532623291, + "learning_rate": 1.826609394221667e-05, + "loss": 0.0988, + "step": 13562 + }, + { + "epoch": 0.27128, + "grad_norm": 0.4693473279476166, + "learning_rate": 1.826530807800803e-05, + "loss": 0.0528, + "step": 13564 + }, + { + "epoch": 0.27132, + "grad_norm": 1.046143889427185, + "learning_rate": 1.8264522052662947e-05, + "loss": 0.1491, + "step": 13566 + }, + { + "epoch": 0.27136, + "grad_norm": 0.7594102621078491, + "learning_rate": 1.8263735866196758e-05, + "loss": 0.1179, + "step": 13568 + }, + { + "epoch": 0.2714, + "grad_norm": 2.044571876525879, + "learning_rate": 1.826294951862478e-05, + "loss": 0.1114, + "step": 13570 + }, + { + "epoch": 0.27144, + "grad_norm": 1.0054494142532349, + "learning_rate": 1.826216300996235e-05, + "loss": 0.1435, + "step": 13572 + }, + { + "epoch": 0.27148, + "grad_norm": 0.29653117060661316, + "learning_rate": 1.8261376340224795e-05, + "loss": 0.0182, + "step": 13574 + }, + { + "epoch": 0.27152, + "grad_norm": 1.1230769157409668, + "learning_rate": 1.8260589509427457e-05, + "loss": 0.3482, + "step": 13576 + }, + { + "epoch": 0.27156, + "grad_norm": 1.353697419166565, + "learning_rate": 1.8259802517585676e-05, + "loss": 0.1876, + "step": 13578 + }, + { + "epoch": 0.2716, + "grad_norm": 1.2037018537521362, + "learning_rate": 1.8259015364714786e-05, + "loss": 0.3363, + "step": 13580 + }, + { + "epoch": 0.27164, + "grad_norm": 2.9207098484039307, + "learning_rate": 1.8258228050830143e-05, + "loss": 0.2083, + "step": 13582 + }, + { + "epoch": 0.27168, + "grad_norm": 1.473772644996643, + "learning_rate": 1.8257440575947095e-05, + "loss": 0.0885, + "step": 13584 + }, + { + "epoch": 0.27172, + "grad_norm": 0.6050575971603394, + "learning_rate": 1.825665294008099e-05, + "loss": 0.0886, + "step": 13586 + }, + { + "epoch": 0.27176, + "grad_norm": 1.8267652988433838, + "learning_rate": 1.8255865143247183e-05, + "loss": 0.1592, + "step": 13588 + }, + { + "epoch": 0.2718, + "grad_norm": 1.3714460134506226, + "learning_rate": 1.825507718546104e-05, + "loss": 0.0957, + "step": 13590 + }, + { + "epoch": 0.27184, + "grad_norm": 0.16262219846248627, + "learning_rate": 1.825428906673791e-05, + "loss": 0.0934, + "step": 13592 + }, + { + "epoch": 0.27188, + "grad_norm": 1.9971120357513428, + "learning_rate": 1.8253500787093173e-05, + "loss": 0.1266, + "step": 13594 + }, + { + "epoch": 0.27192, + "grad_norm": 2.4502108097076416, + "learning_rate": 1.8252712346542184e-05, + "loss": 0.3138, + "step": 13596 + }, + { + "epoch": 0.27196, + "grad_norm": 2.7824370861053467, + "learning_rate": 1.8251923745100318e-05, + "loss": 0.1649, + "step": 13598 + }, + { + "epoch": 0.272, + "grad_norm": 2.131316900253296, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.185, + "step": 13600 + }, + { + "epoch": 0.27204, + "grad_norm": 0.826485812664032, + "learning_rate": 1.8250346059605464e-05, + "loss": 0.0844, + "step": 13602 + }, + { + "epoch": 0.27208, + "grad_norm": 0.8746130466461182, + "learning_rate": 1.824955697558323e-05, + "loss": 0.0763, + "step": 13604 + }, + { + "epoch": 0.27212, + "grad_norm": 0.933100163936615, + "learning_rate": 1.8248767730731634e-05, + "loss": 0.13, + "step": 13606 + }, + { + "epoch": 0.27216, + "grad_norm": 2.27563214302063, + "learning_rate": 1.8247978325066063e-05, + "loss": 0.0992, + "step": 13608 + }, + { + "epoch": 0.2722, + "grad_norm": 2.1366522312164307, + "learning_rate": 1.8247188758601912e-05, + "loss": 0.1488, + "step": 13610 + }, + { + "epoch": 0.27224, + "grad_norm": 0.6231719255447388, + "learning_rate": 1.824639903135457e-05, + "loss": 0.154, + "step": 13612 + }, + { + "epoch": 0.27228, + "grad_norm": 0.1307094693183899, + "learning_rate": 1.8245609143339433e-05, + "loss": 0.056, + "step": 13614 + }, + { + "epoch": 0.27232, + "grad_norm": 2.5207455158233643, + "learning_rate": 1.8244819094571897e-05, + "loss": 0.2478, + "step": 13616 + }, + { + "epoch": 0.27236, + "grad_norm": 4.080536365509033, + "learning_rate": 1.8244028885067373e-05, + "loss": 0.2574, + "step": 13618 + }, + { + "epoch": 0.2724, + "grad_norm": 6.026556968688965, + "learning_rate": 1.824323851484126e-05, + "loss": 0.2173, + "step": 13620 + }, + { + "epoch": 0.27244, + "grad_norm": 0.6603242754936218, + "learning_rate": 1.8242447983908967e-05, + "loss": 0.0286, + "step": 13622 + }, + { + "epoch": 0.27248, + "grad_norm": 1.669029951095581, + "learning_rate": 1.8241657292285907e-05, + "loss": 0.1841, + "step": 13624 + }, + { + "epoch": 0.27252, + "grad_norm": 0.4801255762577057, + "learning_rate": 1.8240866439987497e-05, + "loss": 0.0408, + "step": 13626 + }, + { + "epoch": 0.27256, + "grad_norm": 1.1989116668701172, + "learning_rate": 1.824007542702915e-05, + "loss": 0.1688, + "step": 13628 + }, + { + "epoch": 0.2726, + "grad_norm": 2.1467158794403076, + "learning_rate": 1.8239284253426294e-05, + "loss": 0.1801, + "step": 13630 + }, + { + "epoch": 0.27264, + "grad_norm": 0.2854277491569519, + "learning_rate": 1.8238492919194347e-05, + "loss": 0.0308, + "step": 13632 + }, + { + "epoch": 0.27268, + "grad_norm": 1.029968500137329, + "learning_rate": 1.823770142434874e-05, + "loss": 0.3376, + "step": 13634 + }, + { + "epoch": 0.27272, + "grad_norm": 2.35151743888855, + "learning_rate": 1.8236909768904904e-05, + "loss": 0.0882, + "step": 13636 + }, + { + "epoch": 0.27276, + "grad_norm": 0.2596605718135834, + "learning_rate": 1.823611795287827e-05, + "loss": 0.062, + "step": 13638 + }, + { + "epoch": 0.2728, + "grad_norm": 0.13437815010547638, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.1307, + "step": 13640 + }, + { + "epoch": 0.27284, + "grad_norm": 3.8916070461273193, + "learning_rate": 1.8234533839138363e-05, + "loss": 0.5555, + "step": 13642 + }, + { + "epoch": 0.27288, + "grad_norm": 0.5199761986732483, + "learning_rate": 1.8233741541455972e-05, + "loss": 0.1713, + "step": 13644 + }, + { + "epoch": 0.27292, + "grad_norm": 2.7422800064086914, + "learning_rate": 1.8232949083252555e-05, + "loss": 0.1208, + "step": 13646 + }, + { + "epoch": 0.27296, + "grad_norm": 0.2232038378715515, + "learning_rate": 1.823215646454355e-05, + "loss": 0.1968, + "step": 13648 + }, + { + "epoch": 0.273, + "grad_norm": 2.147113561630249, + "learning_rate": 1.8231363685344422e-05, + "loss": 0.1132, + "step": 13650 + }, + { + "epoch": 0.27304, + "grad_norm": 2.9898743629455566, + "learning_rate": 1.8230570745670618e-05, + "loss": 0.1616, + "step": 13652 + }, + { + "epoch": 0.27308, + "grad_norm": 1.0604362487792969, + "learning_rate": 1.8229777645537602e-05, + "loss": 0.0623, + "step": 13654 + }, + { + "epoch": 0.27312, + "grad_norm": 0.7570363283157349, + "learning_rate": 1.8228984384960832e-05, + "loss": 0.0529, + "step": 13656 + }, + { + "epoch": 0.27316, + "grad_norm": 1.0687867403030396, + "learning_rate": 1.8228190963955775e-05, + "loss": 0.0636, + "step": 13658 + }, + { + "epoch": 0.2732, + "grad_norm": 0.26108518242836, + "learning_rate": 1.82273973825379e-05, + "loss": 0.2274, + "step": 13660 + }, + { + "epoch": 0.27324, + "grad_norm": 3.38396954536438, + "learning_rate": 1.8226603640722678e-05, + "loss": 0.3615, + "step": 13662 + }, + { + "epoch": 0.27328, + "grad_norm": 3.8815064430236816, + "learning_rate": 1.8225809738525583e-05, + "loss": 0.3224, + "step": 13664 + }, + { + "epoch": 0.27332, + "grad_norm": 2.2346725463867188, + "learning_rate": 1.822501567596209e-05, + "loss": 0.1264, + "step": 13666 + }, + { + "epoch": 0.27336, + "grad_norm": 3.16046142578125, + "learning_rate": 1.8224221453047683e-05, + "loss": 0.2119, + "step": 13668 + }, + { + "epoch": 0.2734, + "grad_norm": 1.8244448900222778, + "learning_rate": 1.8223427069797845e-05, + "loss": 0.074, + "step": 13670 + }, + { + "epoch": 0.27344, + "grad_norm": 4.922366619110107, + "learning_rate": 1.822263252622806e-05, + "loss": 0.548, + "step": 13672 + }, + { + "epoch": 0.27348, + "grad_norm": 1.8069086074829102, + "learning_rate": 1.8221837822353823e-05, + "loss": 0.1177, + "step": 13674 + }, + { + "epoch": 0.27352, + "grad_norm": 0.4497756361961365, + "learning_rate": 1.8221042958190628e-05, + "loss": 0.1025, + "step": 13676 + }, + { + "epoch": 0.27356, + "grad_norm": 3.3996806144714355, + "learning_rate": 1.822024793375396e-05, + "loss": 0.2809, + "step": 13678 + }, + { + "epoch": 0.2736, + "grad_norm": 1.255844235420227, + "learning_rate": 1.8219452749059332e-05, + "loss": 0.1108, + "step": 13680 + }, + { + "epoch": 0.27364, + "grad_norm": 0.8495016694068909, + "learning_rate": 1.821865740412224e-05, + "loss": 0.1837, + "step": 13682 + }, + { + "epoch": 0.27368, + "grad_norm": 0.2009900063276291, + "learning_rate": 1.8217861898958192e-05, + "loss": 0.3066, + "step": 13684 + }, + { + "epoch": 0.27372, + "grad_norm": 2.941349983215332, + "learning_rate": 1.8217066233582694e-05, + "loss": 0.2147, + "step": 13686 + }, + { + "epoch": 0.27376, + "grad_norm": 1.7844221591949463, + "learning_rate": 1.821627040801126e-05, + "loss": 0.0867, + "step": 13688 + }, + { + "epoch": 0.2738, + "grad_norm": 0.33297255635261536, + "learning_rate": 1.8215474422259403e-05, + "loss": 0.0742, + "step": 13690 + }, + { + "epoch": 0.27384, + "grad_norm": 1.7532206773757935, + "learning_rate": 1.8214678276342642e-05, + "loss": 0.0886, + "step": 13692 + }, + { + "epoch": 0.27388, + "grad_norm": 1.3043293952941895, + "learning_rate": 1.8213881970276502e-05, + "loss": 0.3194, + "step": 13694 + }, + { + "epoch": 0.27392, + "grad_norm": 0.8275747895240784, + "learning_rate": 1.82130855040765e-05, + "loss": 0.076, + "step": 13696 + }, + { + "epoch": 0.27396, + "grad_norm": 1.106379508972168, + "learning_rate": 1.8212288877758172e-05, + "loss": 0.0463, + "step": 13698 + }, + { + "epoch": 0.274, + "grad_norm": 3.7838215827941895, + "learning_rate": 1.821149209133704e-05, + "loss": 0.4712, + "step": 13700 + }, + { + "epoch": 0.27404, + "grad_norm": 3.97619366645813, + "learning_rate": 1.8210695144828646e-05, + "loss": 0.3661, + "step": 13702 + }, + { + "epoch": 0.27408, + "grad_norm": 2.619741916656494, + "learning_rate": 1.8209898038248523e-05, + "loss": 0.2033, + "step": 13704 + }, + { + "epoch": 0.27412, + "grad_norm": 5.324502944946289, + "learning_rate": 1.8209100771612207e-05, + "loss": 0.4242, + "step": 13706 + }, + { + "epoch": 0.27416, + "grad_norm": 3.82039737701416, + "learning_rate": 1.820830334493525e-05, + "loss": 0.2644, + "step": 13708 + }, + { + "epoch": 0.2742, + "grad_norm": 3.5419020652770996, + "learning_rate": 1.820750575823319e-05, + "loss": 0.2551, + "step": 13710 + }, + { + "epoch": 0.27424, + "grad_norm": 1.9079474210739136, + "learning_rate": 1.820670801152158e-05, + "loss": 0.1934, + "step": 13712 + }, + { + "epoch": 0.27428, + "grad_norm": 0.5470077395439148, + "learning_rate": 1.8205910104815972e-05, + "loss": 0.2537, + "step": 13714 + }, + { + "epoch": 0.27432, + "grad_norm": 0.8731921315193176, + "learning_rate": 1.8205112038131925e-05, + "loss": 0.1436, + "step": 13716 + }, + { + "epoch": 0.27436, + "grad_norm": 0.8071569204330444, + "learning_rate": 1.8204313811484994e-05, + "loss": 0.0552, + "step": 13718 + }, + { + "epoch": 0.2744, + "grad_norm": 2.816098213195801, + "learning_rate": 1.8203515424890738e-05, + "loss": 0.1963, + "step": 13720 + }, + { + "epoch": 0.27444, + "grad_norm": 1.6980316638946533, + "learning_rate": 1.8202716878364728e-05, + "loss": 0.124, + "step": 13722 + }, + { + "epoch": 0.27448, + "grad_norm": 1.742850422859192, + "learning_rate": 1.8201918171922527e-05, + "loss": 0.1216, + "step": 13724 + }, + { + "epoch": 0.27452, + "grad_norm": 1.4226740598678589, + "learning_rate": 1.8201119305579712e-05, + "loss": 0.0773, + "step": 13726 + }, + { + "epoch": 0.27456, + "grad_norm": 0.4306575655937195, + "learning_rate": 1.820032027935185e-05, + "loss": 0.0815, + "step": 13728 + }, + { + "epoch": 0.2746, + "grad_norm": 1.3877571821212769, + "learning_rate": 1.8199521093254524e-05, + "loss": 0.2322, + "step": 13730 + }, + { + "epoch": 0.27464, + "grad_norm": 1.5886887311935425, + "learning_rate": 1.8198721747303315e-05, + "loss": 0.1982, + "step": 13732 + }, + { + "epoch": 0.27468, + "grad_norm": 1.9041664600372314, + "learning_rate": 1.8197922241513804e-05, + "loss": 0.1509, + "step": 13734 + }, + { + "epoch": 0.27472, + "grad_norm": 1.766433835029602, + "learning_rate": 1.8197122575901576e-05, + "loss": 0.141, + "step": 13736 + }, + { + "epoch": 0.27476, + "grad_norm": 1.068678855895996, + "learning_rate": 1.8196322750482224e-05, + "loss": 0.0808, + "step": 13738 + }, + { + "epoch": 0.2748, + "grad_norm": 1.2163448333740234, + "learning_rate": 1.819552276527134e-05, + "loss": 0.1264, + "step": 13740 + }, + { + "epoch": 0.27484, + "grad_norm": 1.0135730504989624, + "learning_rate": 1.8194722620284522e-05, + "loss": 0.0374, + "step": 13742 + }, + { + "epoch": 0.27488, + "grad_norm": 1.869798183441162, + "learning_rate": 1.8193922315537363e-05, + "loss": 0.0906, + "step": 13744 + }, + { + "epoch": 0.27492, + "grad_norm": 0.7461000680923462, + "learning_rate": 1.8193121851045476e-05, + "loss": 0.0297, + "step": 13746 + }, + { + "epoch": 0.27496, + "grad_norm": 1.049478530883789, + "learning_rate": 1.8192321226824455e-05, + "loss": 0.1385, + "step": 13748 + }, + { + "epoch": 0.275, + "grad_norm": 0.33200380206108093, + "learning_rate": 1.819152044288992e-05, + "loss": 0.0331, + "step": 13750 + }, + { + "epoch": 0.27504, + "grad_norm": 0.44478949904441833, + "learning_rate": 1.8190719499257472e-05, + "loss": 0.165, + "step": 13752 + }, + { + "epoch": 0.27508, + "grad_norm": 0.1103639006614685, + "learning_rate": 1.8189918395942735e-05, + "loss": 0.2874, + "step": 13754 + }, + { + "epoch": 0.27512, + "grad_norm": 0.2934657633304596, + "learning_rate": 1.8189117132961318e-05, + "loss": 0.0236, + "step": 13756 + }, + { + "epoch": 0.27516, + "grad_norm": 1.927647590637207, + "learning_rate": 1.8188315710328847e-05, + "loss": 0.3555, + "step": 13758 + }, + { + "epoch": 0.2752, + "grad_norm": 1.3666789531707764, + "learning_rate": 1.8187514128060946e-05, + "loss": 0.0835, + "step": 13760 + }, + { + "epoch": 0.27524, + "grad_norm": 0.5176371335983276, + "learning_rate": 1.8186712386173243e-05, + "loss": 0.0418, + "step": 13762 + }, + { + "epoch": 0.27528, + "grad_norm": 2.326599359512329, + "learning_rate": 1.818591048468137e-05, + "loss": 0.124, + "step": 13764 + }, + { + "epoch": 0.27532, + "grad_norm": 0.05787103995680809, + "learning_rate": 1.818510842360095e-05, + "loss": 0.2524, + "step": 13766 + }, + { + "epoch": 0.27536, + "grad_norm": 2.552379846572876, + "learning_rate": 1.8184306202947635e-05, + "loss": 0.1243, + "step": 13768 + }, + { + "epoch": 0.2754, + "grad_norm": 1.0616904497146606, + "learning_rate": 1.818350382273705e-05, + "loss": 0.1024, + "step": 13770 + }, + { + "epoch": 0.27544, + "grad_norm": 1.2916063070297241, + "learning_rate": 1.818270128298485e-05, + "loss": 0.0457, + "step": 13772 + }, + { + "epoch": 0.27548, + "grad_norm": 0.730215847492218, + "learning_rate": 1.8181898583706674e-05, + "loss": 0.2911, + "step": 13774 + }, + { + "epoch": 0.27552, + "grad_norm": 0.14121130108833313, + "learning_rate": 1.8181095724918173e-05, + "loss": 0.5529, + "step": 13776 + }, + { + "epoch": 0.27556, + "grad_norm": 4.01580810546875, + "learning_rate": 1.8180292706634998e-05, + "loss": 0.3606, + "step": 13778 + }, + { + "epoch": 0.2756, + "grad_norm": 3.3181886672973633, + "learning_rate": 1.8179489528872808e-05, + "loss": 0.1231, + "step": 13780 + }, + { + "epoch": 0.27564, + "grad_norm": 1.6067922115325928, + "learning_rate": 1.8178686191647257e-05, + "loss": 0.1378, + "step": 13782 + }, + { + "epoch": 0.27568, + "grad_norm": 0.9750070571899414, + "learning_rate": 1.8177882694974008e-05, + "loss": 0.0471, + "step": 13784 + }, + { + "epoch": 0.27572, + "grad_norm": 1.598386526107788, + "learning_rate": 1.8177079038868724e-05, + "loss": 0.0891, + "step": 13786 + }, + { + "epoch": 0.27576, + "grad_norm": 2.700871229171753, + "learning_rate": 1.8176275223347076e-05, + "loss": 0.1458, + "step": 13788 + }, + { + "epoch": 0.2758, + "grad_norm": 4.477280139923096, + "learning_rate": 1.817547124842473e-05, + "loss": 0.4317, + "step": 13790 + }, + { + "epoch": 0.27584, + "grad_norm": 0.6130542159080505, + "learning_rate": 1.8174667114117368e-05, + "loss": 0.0412, + "step": 13792 + }, + { + "epoch": 0.27588, + "grad_norm": 2.059784412384033, + "learning_rate": 1.817386282044066e-05, + "loss": 0.1216, + "step": 13794 + }, + { + "epoch": 0.27592, + "grad_norm": 2.464590549468994, + "learning_rate": 1.8173058367410287e-05, + "loss": 0.2695, + "step": 13796 + }, + { + "epoch": 0.27596, + "grad_norm": 1.1500215530395508, + "learning_rate": 1.8172253755041934e-05, + "loss": 0.1072, + "step": 13798 + }, + { + "epoch": 0.276, + "grad_norm": 2.1653268337249756, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.1297, + "step": 13800 + }, + { + "epoch": 0.27604, + "grad_norm": 0.16081923246383667, + "learning_rate": 1.8170644052354035e-05, + "loss": 0.0841, + "step": 13802 + }, + { + "epoch": 0.27608, + "grad_norm": 2.116102457046509, + "learning_rate": 1.816983896206587e-05, + "loss": 0.1756, + "step": 13804 + }, + { + "epoch": 0.27612, + "grad_norm": 1.9042611122131348, + "learning_rate": 1.8169033712502487e-05, + "loss": 0.0888, + "step": 13806 + }, + { + "epoch": 0.27616, + "grad_norm": 1.2963188886642456, + "learning_rate": 1.816822830367959e-05, + "loss": 0.1794, + "step": 13808 + }, + { + "epoch": 0.2762, + "grad_norm": 2.1148459911346436, + "learning_rate": 1.8167422735612877e-05, + "loss": 0.1178, + "step": 13810 + }, + { + "epoch": 0.27624, + "grad_norm": 0.9541906118392944, + "learning_rate": 1.816661700831805e-05, + "loss": 0.0619, + "step": 13812 + }, + { + "epoch": 0.27628, + "grad_norm": 0.1620175987482071, + "learning_rate": 1.816581112181082e-05, + "loss": 0.0691, + "step": 13814 + }, + { + "epoch": 0.27632, + "grad_norm": 1.6059064865112305, + "learning_rate": 1.81650050761069e-05, + "loss": 0.0716, + "step": 13816 + }, + { + "epoch": 0.27636, + "grad_norm": 1.8435617685317993, + "learning_rate": 1.8164198871222002e-05, + "loss": 0.0551, + "step": 13818 + }, + { + "epoch": 0.2764, + "grad_norm": 1.3263272047042847, + "learning_rate": 1.816339250717184e-05, + "loss": 0.2231, + "step": 13820 + }, + { + "epoch": 0.27644, + "grad_norm": 0.7177687883377075, + "learning_rate": 1.8162585983972144e-05, + "loss": 0.0713, + "step": 13822 + }, + { + "epoch": 0.27648, + "grad_norm": 0.9945403933525085, + "learning_rate": 1.8161779301638626e-05, + "loss": 0.0281, + "step": 13824 + }, + { + "epoch": 0.27652, + "grad_norm": 0.4848639667034149, + "learning_rate": 1.816097246018702e-05, + "loss": 0.0174, + "step": 13826 + }, + { + "epoch": 0.27656, + "grad_norm": 3.2370107173919678, + "learning_rate": 1.816016545963306e-05, + "loss": 0.1204, + "step": 13828 + }, + { + "epoch": 0.2766, + "grad_norm": 0.12480328977108002, + "learning_rate": 1.815935829999247e-05, + "loss": 0.0377, + "step": 13830 + }, + { + "epoch": 0.27664, + "grad_norm": 2.6683926582336426, + "learning_rate": 1.8158550981280987e-05, + "loss": 0.5052, + "step": 13832 + }, + { + "epoch": 0.27668, + "grad_norm": 0.18744216859340668, + "learning_rate": 1.8157743503514353e-05, + "loss": 0.0435, + "step": 13834 + }, + { + "epoch": 0.27672, + "grad_norm": 0.17445942759513855, + "learning_rate": 1.815693586670831e-05, + "loss": 0.0422, + "step": 13836 + }, + { + "epoch": 0.27676, + "grad_norm": 4.989577770233154, + "learning_rate": 1.8156128070878606e-05, + "loss": 0.3535, + "step": 13838 + }, + { + "epoch": 0.2768, + "grad_norm": 1.0004160404205322, + "learning_rate": 1.8155320116040983e-05, + "loss": 0.2061, + "step": 13840 + }, + { + "epoch": 0.27684, + "grad_norm": 0.6795430183410645, + "learning_rate": 1.8154512002211196e-05, + "loss": 0.1714, + "step": 13842 + }, + { + "epoch": 0.27688, + "grad_norm": 0.11768478155136108, + "learning_rate": 1.8153703729405e-05, + "loss": 0.0218, + "step": 13844 + }, + { + "epoch": 0.27692, + "grad_norm": 4.848945617675781, + "learning_rate": 1.8152895297638152e-05, + "loss": 0.2111, + "step": 13846 + }, + { + "epoch": 0.27696, + "grad_norm": 2.748131275177002, + "learning_rate": 1.815208670692641e-05, + "loss": 0.2344, + "step": 13848 + }, + { + "epoch": 0.277, + "grad_norm": 0.2002178281545639, + "learning_rate": 1.815127795728554e-05, + "loss": 0.1942, + "step": 13850 + }, + { + "epoch": 0.27704, + "grad_norm": 0.13880066573619843, + "learning_rate": 1.8150469048731317e-05, + "loss": 0.0835, + "step": 13852 + }, + { + "epoch": 0.27708, + "grad_norm": 3.5351803302764893, + "learning_rate": 1.81496599812795e-05, + "loss": 0.1671, + "step": 13854 + }, + { + "epoch": 0.27712, + "grad_norm": 1.1395386457443237, + "learning_rate": 1.8148850754945865e-05, + "loss": 0.1581, + "step": 13856 + }, + { + "epoch": 0.27716, + "grad_norm": 1.0206689834594727, + "learning_rate": 1.814804136974619e-05, + "loss": 0.297, + "step": 13858 + }, + { + "epoch": 0.2772, + "grad_norm": 6.499826431274414, + "learning_rate": 1.814723182569625e-05, + "loss": 0.3792, + "step": 13860 + }, + { + "epoch": 0.27724, + "grad_norm": 0.223692387342453, + "learning_rate": 1.8146422122811835e-05, + "loss": 0.2411, + "step": 13862 + }, + { + "epoch": 0.27728, + "grad_norm": 0.09994492679834366, + "learning_rate": 1.814561226110873e-05, + "loss": 0.1188, + "step": 13864 + }, + { + "epoch": 0.27732, + "grad_norm": 4.409681797027588, + "learning_rate": 1.8144802240602712e-05, + "loss": 0.4241, + "step": 13866 + }, + { + "epoch": 0.27736, + "grad_norm": 0.3152048587799072, + "learning_rate": 1.8143992061309586e-05, + "loss": 0.1579, + "step": 13868 + }, + { + "epoch": 0.2774, + "grad_norm": 0.06324777752161026, + "learning_rate": 1.814318172324514e-05, + "loss": 0.0444, + "step": 13870 + }, + { + "epoch": 0.27744, + "grad_norm": 1.6149296760559082, + "learning_rate": 1.8142371226425176e-05, + "loss": 0.0736, + "step": 13872 + }, + { + "epoch": 0.27748, + "grad_norm": 0.11296772956848145, + "learning_rate": 1.814156057086549e-05, + "loss": 0.027, + "step": 13874 + }, + { + "epoch": 0.27752, + "grad_norm": 0.9176329374313354, + "learning_rate": 1.814074975658189e-05, + "loss": 0.0645, + "step": 13876 + }, + { + "epoch": 0.27756, + "grad_norm": 1.9767318964004517, + "learning_rate": 1.8139938783590186e-05, + "loss": 0.206, + "step": 13878 + }, + { + "epoch": 0.2776, + "grad_norm": 3.002462387084961, + "learning_rate": 1.8139127651906183e-05, + "loss": 0.2149, + "step": 13880 + }, + { + "epoch": 0.27764, + "grad_norm": 3.9112138748168945, + "learning_rate": 1.8138316361545696e-05, + "loss": 0.5385, + "step": 13882 + }, + { + "epoch": 0.27768, + "grad_norm": 1.6586554050445557, + "learning_rate": 1.813750491252454e-05, + "loss": 0.3949, + "step": 13884 + }, + { + "epoch": 0.27772, + "grad_norm": 0.9065015912055969, + "learning_rate": 1.8136693304858538e-05, + "loss": 0.0379, + "step": 13886 + }, + { + "epoch": 0.27776, + "grad_norm": 1.366718053817749, + "learning_rate": 1.813588153856351e-05, + "loss": 0.0717, + "step": 13888 + }, + { + "epoch": 0.2778, + "grad_norm": 0.6794711351394653, + "learning_rate": 1.813506961365528e-05, + "loss": 0.1296, + "step": 13890 + }, + { + "epoch": 0.27784, + "grad_norm": 2.269726276397705, + "learning_rate": 1.8134257530149684e-05, + "loss": 0.3154, + "step": 13892 + }, + { + "epoch": 0.27788, + "grad_norm": 0.3622196912765503, + "learning_rate": 1.8133445288062547e-05, + "loss": 0.0717, + "step": 13894 + }, + { + "epoch": 0.27792, + "grad_norm": 0.5304604172706604, + "learning_rate": 1.8132632887409708e-05, + "loss": 0.0289, + "step": 13896 + }, + { + "epoch": 0.27796, + "grad_norm": 1.767288327217102, + "learning_rate": 1.8131820328207005e-05, + "loss": 0.0985, + "step": 13898 + }, + { + "epoch": 0.278, + "grad_norm": 1.936938762664795, + "learning_rate": 1.8131007610470278e-05, + "loss": 0.063, + "step": 13900 + }, + { + "epoch": 0.27804, + "grad_norm": 1.7485769987106323, + "learning_rate": 1.813019473421537e-05, + "loss": 0.0674, + "step": 13902 + }, + { + "epoch": 0.27808, + "grad_norm": 1.4094303846359253, + "learning_rate": 1.812938169945813e-05, + "loss": 0.0825, + "step": 13904 + }, + { + "epoch": 0.27812, + "grad_norm": 2.3387482166290283, + "learning_rate": 1.812856850621441e-05, + "loss": 0.2482, + "step": 13906 + }, + { + "epoch": 0.27816, + "grad_norm": 0.5380547642707825, + "learning_rate": 1.8127755154500063e-05, + "loss": 0.0469, + "step": 13908 + }, + { + "epoch": 0.2782, + "grad_norm": 1.012824296951294, + "learning_rate": 1.812694164433094e-05, + "loss": 0.0717, + "step": 13910 + }, + { + "epoch": 0.27824, + "grad_norm": 0.291908860206604, + "learning_rate": 1.812612797572291e-05, + "loss": 0.312, + "step": 13912 + }, + { + "epoch": 0.27828, + "grad_norm": 2.3648946285247803, + "learning_rate": 1.8125314148691832e-05, + "loss": 0.1037, + "step": 13914 + }, + { + "epoch": 0.27832, + "grad_norm": 0.9929531812667847, + "learning_rate": 1.812450016325357e-05, + "loss": 0.1143, + "step": 13916 + }, + { + "epoch": 0.27836, + "grad_norm": 0.805779218673706, + "learning_rate": 1.812368601942399e-05, + "loss": 0.1689, + "step": 13918 + }, + { + "epoch": 0.2784, + "grad_norm": 1.949899435043335, + "learning_rate": 1.812287171721897e-05, + "loss": 0.1368, + "step": 13920 + }, + { + "epoch": 0.27844, + "grad_norm": 2.544736623764038, + "learning_rate": 1.812205725665439e-05, + "loss": 0.1097, + "step": 13922 + }, + { + "epoch": 0.27848, + "grad_norm": 0.48412445187568665, + "learning_rate": 1.812124263774612e-05, + "loss": 0.1114, + "step": 13924 + }, + { + "epoch": 0.27852, + "grad_norm": 0.7187126278877258, + "learning_rate": 1.8120427860510045e-05, + "loss": 0.0322, + "step": 13926 + }, + { + "epoch": 0.27856, + "grad_norm": 0.4275336265563965, + "learning_rate": 1.8119612924962043e-05, + "loss": 0.0838, + "step": 13928 + }, + { + "epoch": 0.2786, + "grad_norm": 0.07418370246887207, + "learning_rate": 1.811879783111801e-05, + "loss": 0.0736, + "step": 13930 + }, + { + "epoch": 0.27864, + "grad_norm": 0.17737308144569397, + "learning_rate": 1.8117982578993832e-05, + "loss": 0.1107, + "step": 13932 + }, + { + "epoch": 0.27868, + "grad_norm": 1.5988705158233643, + "learning_rate": 1.8117167168605404e-05, + "loss": 0.0395, + "step": 13934 + }, + { + "epoch": 0.27872, + "grad_norm": 0.1750524789094925, + "learning_rate": 1.8116351599968623e-05, + "loss": 0.077, + "step": 13936 + }, + { + "epoch": 0.27876, + "grad_norm": 0.1851358264684677, + "learning_rate": 1.8115535873099392e-05, + "loss": 0.0136, + "step": 13938 + }, + { + "epoch": 0.2788, + "grad_norm": 4.859838962554932, + "learning_rate": 1.8114719988013612e-05, + "loss": 0.4327, + "step": 13940 + }, + { + "epoch": 0.27884, + "grad_norm": 2.011188268661499, + "learning_rate": 1.8113903944727184e-05, + "loss": 0.0956, + "step": 13942 + }, + { + "epoch": 0.27888, + "grad_norm": 0.12611640989780426, + "learning_rate": 1.8113087743256025e-05, + "loss": 0.6322, + "step": 13944 + }, + { + "epoch": 0.27892, + "grad_norm": 1.394384741783142, + "learning_rate": 1.8112271383616038e-05, + "loss": 0.2372, + "step": 13946 + }, + { + "epoch": 0.27896, + "grad_norm": 5.542813777923584, + "learning_rate": 1.811145486582315e-05, + "loss": 0.8797, + "step": 13948 + }, + { + "epoch": 0.279, + "grad_norm": 4.387783050537109, + "learning_rate": 1.8110638189893267e-05, + "loss": 0.4033, + "step": 13950 + }, + { + "epoch": 0.27904, + "grad_norm": 0.7585332989692688, + "learning_rate": 1.810982135584232e-05, + "loss": 0.0518, + "step": 13952 + }, + { + "epoch": 0.27908, + "grad_norm": 3.015827178955078, + "learning_rate": 1.8109004363686233e-05, + "loss": 0.2998, + "step": 13954 + }, + { + "epoch": 0.27912, + "grad_norm": 1.4483543634414673, + "learning_rate": 1.8108187213440927e-05, + "loss": 0.1056, + "step": 13956 + }, + { + "epoch": 0.27916, + "grad_norm": 1.3036689758300781, + "learning_rate": 1.8107369905122342e-05, + "loss": 0.0861, + "step": 13958 + }, + { + "epoch": 0.2792, + "grad_norm": 1.088880181312561, + "learning_rate": 1.81065524387464e-05, + "loss": 0.1072, + "step": 13960 + }, + { + "epoch": 0.27924, + "grad_norm": 0.1905621588230133, + "learning_rate": 1.810573481432905e-05, + "loss": 0.2888, + "step": 13962 + }, + { + "epoch": 0.27928, + "grad_norm": 0.5962364673614502, + "learning_rate": 1.8104917031886223e-05, + "loss": 0.0716, + "step": 13964 + }, + { + "epoch": 0.27932, + "grad_norm": 3.0053627490997314, + "learning_rate": 1.8104099091433868e-05, + "loss": 0.1739, + "step": 13966 + }, + { + "epoch": 0.27936, + "grad_norm": 0.3377809524536133, + "learning_rate": 1.810328099298793e-05, + "loss": 0.0766, + "step": 13968 + }, + { + "epoch": 0.2794, + "grad_norm": 0.7270305752754211, + "learning_rate": 1.8102462736564355e-05, + "loss": 0.0613, + "step": 13970 + }, + { + "epoch": 0.27944, + "grad_norm": 3.6558709144592285, + "learning_rate": 1.81016443221791e-05, + "loss": 0.3885, + "step": 13972 + }, + { + "epoch": 0.27948, + "grad_norm": 0.6742943525314331, + "learning_rate": 1.8100825749848113e-05, + "loss": 0.3815, + "step": 13974 + }, + { + "epoch": 0.27952, + "grad_norm": 0.41863077878952026, + "learning_rate": 1.810000701958736e-05, + "loss": 0.2931, + "step": 13976 + }, + { + "epoch": 0.27956, + "grad_norm": 0.10907094180583954, + "learning_rate": 1.8099188131412803e-05, + "loss": 0.0084, + "step": 13978 + }, + { + "epoch": 0.2796, + "grad_norm": 2.000558376312256, + "learning_rate": 1.80983690853404e-05, + "loss": 0.1111, + "step": 13980 + }, + { + "epoch": 0.27964, + "grad_norm": 3.327892780303955, + "learning_rate": 1.8097549881386125e-05, + "loss": 0.2516, + "step": 13982 + }, + { + "epoch": 0.27968, + "grad_norm": 0.3733806908130646, + "learning_rate": 1.8096730519565945e-05, + "loss": 0.203, + "step": 13984 + }, + { + "epoch": 0.27972, + "grad_norm": 0.9676666259765625, + "learning_rate": 1.8095910999895833e-05, + "loss": 0.1802, + "step": 13986 + }, + { + "epoch": 0.27976, + "grad_norm": 0.4695341885089874, + "learning_rate": 1.809509132239177e-05, + "loss": 0.1048, + "step": 13988 + }, + { + "epoch": 0.2798, + "grad_norm": 1.2391659021377563, + "learning_rate": 1.8094271487069733e-05, + "loss": 0.0853, + "step": 13990 + }, + { + "epoch": 0.27984, + "grad_norm": 0.13892512023448944, + "learning_rate": 1.809345149394571e-05, + "loss": 0.0575, + "step": 13992 + }, + { + "epoch": 0.27988, + "grad_norm": 1.5416851043701172, + "learning_rate": 1.809263134303568e-05, + "loss": 0.1238, + "step": 13994 + }, + { + "epoch": 0.27992, + "grad_norm": 0.1952722668647766, + "learning_rate": 1.8091811034355635e-05, + "loss": 0.0335, + "step": 13996 + }, + { + "epoch": 0.27996, + "grad_norm": 5.858908653259277, + "learning_rate": 1.8090990567921567e-05, + "loss": 0.6302, + "step": 13998 + }, + { + "epoch": 0.28, + "grad_norm": 4.656360626220703, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.4578, + "step": 14000 + }, + { + "epoch": 0.28004, + "grad_norm": 0.6951506733894348, + "learning_rate": 1.8089349161855356e-05, + "loss": 0.0964, + "step": 14002 + }, + { + "epoch": 0.28008, + "grad_norm": 2.0384020805358887, + "learning_rate": 1.8088528222255206e-05, + "loss": 0.3409, + "step": 14004 + }, + { + "epoch": 0.28012, + "grad_norm": 2.3477139472961426, + "learning_rate": 1.8087707124965036e-05, + "loss": 0.4717, + "step": 14006 + }, + { + "epoch": 0.28016, + "grad_norm": 1.077904224395752, + "learning_rate": 1.8086885870000856e-05, + "loss": 0.1501, + "step": 14008 + }, + { + "epoch": 0.2802, + "grad_norm": 2.2895548343658447, + "learning_rate": 1.8086064457378667e-05, + "loss": 0.1765, + "step": 14010 + }, + { + "epoch": 0.28024, + "grad_norm": 2.029711961746216, + "learning_rate": 1.8085242887114488e-05, + "loss": 0.1855, + "step": 14012 + }, + { + "epoch": 0.28028, + "grad_norm": 2.099667549133301, + "learning_rate": 1.8084421159224342e-05, + "loss": 0.3669, + "step": 14014 + }, + { + "epoch": 0.28032, + "grad_norm": 2.0971622467041016, + "learning_rate": 1.8083599273724238e-05, + "loss": 0.176, + "step": 14016 + }, + { + "epoch": 0.28036, + "grad_norm": 1.21254301071167, + "learning_rate": 1.8082777230630205e-05, + "loss": 0.1382, + "step": 14018 + }, + { + "epoch": 0.2804, + "grad_norm": 1.7391518354415894, + "learning_rate": 1.8081955029958272e-05, + "loss": 0.1674, + "step": 14020 + }, + { + "epoch": 0.28044, + "grad_norm": 1.1301816701889038, + "learning_rate": 1.8081132671724462e-05, + "loss": 0.1056, + "step": 14022 + }, + { + "epoch": 0.28048, + "grad_norm": 2.9834516048431396, + "learning_rate": 1.808031015594481e-05, + "loss": 0.2426, + "step": 14024 + }, + { + "epoch": 0.28052, + "grad_norm": 1.466332197189331, + "learning_rate": 1.8079487482635355e-05, + "loss": 0.1645, + "step": 14026 + }, + { + "epoch": 0.28056, + "grad_norm": 2.182770252227783, + "learning_rate": 1.807866465181213e-05, + "loss": 0.2031, + "step": 14028 + }, + { + "epoch": 0.2806, + "grad_norm": 0.903663694858551, + "learning_rate": 1.8077841663491174e-05, + "loss": 0.0529, + "step": 14030 + }, + { + "epoch": 0.28064, + "grad_norm": 0.5229770541191101, + "learning_rate": 1.8077018517688542e-05, + "loss": 0.2697, + "step": 14032 + }, + { + "epoch": 0.28068, + "grad_norm": 3.6851367950439453, + "learning_rate": 1.8076195214420273e-05, + "loss": 0.265, + "step": 14034 + }, + { + "epoch": 0.28072, + "grad_norm": 1.2360787391662598, + "learning_rate": 1.8075371753702423e-05, + "loss": 0.1688, + "step": 14036 + }, + { + "epoch": 0.28076, + "grad_norm": 1.8765102624893188, + "learning_rate": 1.807454813555104e-05, + "loss": 0.2636, + "step": 14038 + }, + { + "epoch": 0.2808, + "grad_norm": 1.2580994367599487, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.087, + "step": 14040 + }, + { + "epoch": 0.28084, + "grad_norm": 2.371668577194214, + "learning_rate": 1.8072900427011915e-05, + "loss": 0.1529, + "step": 14042 + }, + { + "epoch": 0.28088, + "grad_norm": 1.1488683223724365, + "learning_rate": 1.8072076336656298e-05, + "loss": 0.0944, + "step": 14044 + }, + { + "epoch": 0.28092, + "grad_norm": 1.7019360065460205, + "learning_rate": 1.8071252088931398e-05, + "loss": 0.1245, + "step": 14046 + }, + { + "epoch": 0.28096, + "grad_norm": 2.567016363143921, + "learning_rate": 1.807042768385328e-05, + "loss": 0.2427, + "step": 14048 + }, + { + "epoch": 0.281, + "grad_norm": 1.8502188920974731, + "learning_rate": 1.806960312143802e-05, + "loss": 0.2105, + "step": 14050 + }, + { + "epoch": 0.28104, + "grad_norm": 0.21321547031402588, + "learning_rate": 1.8068778401701694e-05, + "loss": 0.0726, + "step": 14052 + }, + { + "epoch": 0.28108, + "grad_norm": 2.6879405975341797, + "learning_rate": 1.8067953524660378e-05, + "loss": 0.14, + "step": 14054 + }, + { + "epoch": 0.28112, + "grad_norm": 3.959941864013672, + "learning_rate": 1.8067128490330153e-05, + "loss": 0.2541, + "step": 14056 + }, + { + "epoch": 0.28116, + "grad_norm": 2.5907182693481445, + "learning_rate": 1.806630329872711e-05, + "loss": 0.2175, + "step": 14058 + }, + { + "epoch": 0.2812, + "grad_norm": 0.7969430685043335, + "learning_rate": 1.8065477949867327e-05, + "loss": 0.203, + "step": 14060 + }, + { + "epoch": 0.28124, + "grad_norm": 2.48140025138855, + "learning_rate": 1.80646524437669e-05, + "loss": 0.585, + "step": 14062 + }, + { + "epoch": 0.28128, + "grad_norm": 3.809731960296631, + "learning_rate": 1.806382678044192e-05, + "loss": 0.2194, + "step": 14064 + }, + { + "epoch": 0.28132, + "grad_norm": 3.492370367050171, + "learning_rate": 1.8063000959908486e-05, + "loss": 0.1907, + "step": 14066 + }, + { + "epoch": 0.28136, + "grad_norm": 1.0616767406463623, + "learning_rate": 1.80621749821827e-05, + "loss": 0.1053, + "step": 14068 + }, + { + "epoch": 0.2814, + "grad_norm": 2.886739492416382, + "learning_rate": 1.806134884728066e-05, + "loss": 0.3636, + "step": 14070 + }, + { + "epoch": 0.28144, + "grad_norm": 2.1516499519348145, + "learning_rate": 1.806052255521847e-05, + "loss": 0.0911, + "step": 14072 + }, + { + "epoch": 0.28148, + "grad_norm": 1.188248634338379, + "learning_rate": 1.805969610601225e-05, + "loss": 0.1396, + "step": 14074 + }, + { + "epoch": 0.28152, + "grad_norm": 3.7331302165985107, + "learning_rate": 1.80588694996781e-05, + "loss": 0.277, + "step": 14076 + }, + { + "epoch": 0.28156, + "grad_norm": 2.4707837104797363, + "learning_rate": 1.8058042736232144e-05, + "loss": 0.2541, + "step": 14078 + }, + { + "epoch": 0.2816, + "grad_norm": 4.365174293518066, + "learning_rate": 1.8057215815690494e-05, + "loss": 0.3163, + "step": 14080 + }, + { + "epoch": 0.28164, + "grad_norm": 0.48728737235069275, + "learning_rate": 1.8056388738069276e-05, + "loss": 0.1926, + "step": 14082 + }, + { + "epoch": 0.28168, + "grad_norm": 1.1473493576049805, + "learning_rate": 1.8055561503384607e-05, + "loss": 0.2423, + "step": 14084 + }, + { + "epoch": 0.28172, + "grad_norm": 0.5348671078681946, + "learning_rate": 1.8054734111652622e-05, + "loss": 0.0678, + "step": 14086 + }, + { + "epoch": 0.28176, + "grad_norm": 3.437502384185791, + "learning_rate": 1.805390656288945e-05, + "loss": 0.1648, + "step": 14088 + }, + { + "epoch": 0.2818, + "grad_norm": 0.6887494921684265, + "learning_rate": 1.8053078857111218e-05, + "loss": 0.0887, + "step": 14090 + }, + { + "epoch": 0.28184, + "grad_norm": 2.706835985183716, + "learning_rate": 1.8052250994334074e-05, + "loss": 0.1271, + "step": 14092 + }, + { + "epoch": 0.28188, + "grad_norm": 0.6169825792312622, + "learning_rate": 1.805142297457415e-05, + "loss": 0.0249, + "step": 14094 + }, + { + "epoch": 0.28192, + "grad_norm": 0.5258232951164246, + "learning_rate": 1.8050594797847587e-05, + "loss": 0.0526, + "step": 14096 + }, + { + "epoch": 0.28196, + "grad_norm": 0.47198447585105896, + "learning_rate": 1.8049766464170536e-05, + "loss": 0.4066, + "step": 14098 + }, + { + "epoch": 0.282, + "grad_norm": 5.4205732345581055, + "learning_rate": 1.804893797355914e-05, + "loss": 0.5629, + "step": 14100 + }, + { + "epoch": 0.28204, + "grad_norm": 0.5080637335777283, + "learning_rate": 1.804810932602956e-05, + "loss": 0.0942, + "step": 14102 + }, + { + "epoch": 0.28208, + "grad_norm": 3.6119742393493652, + "learning_rate": 1.8047280521597943e-05, + "loss": 0.3816, + "step": 14104 + }, + { + "epoch": 0.28212, + "grad_norm": 2.069274425506592, + "learning_rate": 1.8046451560280446e-05, + "loss": 0.115, + "step": 14106 + }, + { + "epoch": 0.28216, + "grad_norm": 1.2143782377243042, + "learning_rate": 1.8045622442093237e-05, + "loss": 0.2005, + "step": 14108 + }, + { + "epoch": 0.2822, + "grad_norm": 1.3858507871627808, + "learning_rate": 1.8044793167052476e-05, + "loss": 0.1924, + "step": 14110 + }, + { + "epoch": 0.28224, + "grad_norm": 2.797293186187744, + "learning_rate": 1.804396373517433e-05, + "loss": 0.1876, + "step": 14112 + }, + { + "epoch": 0.28228, + "grad_norm": 1.6924933195114136, + "learning_rate": 1.804313414647497e-05, + "loss": 0.2478, + "step": 14114 + }, + { + "epoch": 0.28232, + "grad_norm": 0.5913247466087341, + "learning_rate": 1.8042304400970567e-05, + "loss": 0.4957, + "step": 14116 + }, + { + "epoch": 0.28236, + "grad_norm": 0.920061469078064, + "learning_rate": 1.80414744986773e-05, + "loss": 0.11, + "step": 14118 + }, + { + "epoch": 0.2824, + "grad_norm": 2.0980846881866455, + "learning_rate": 1.8040644439611348e-05, + "loss": 0.3889, + "step": 14120 + }, + { + "epoch": 0.28244, + "grad_norm": 1.9051934480667114, + "learning_rate": 1.8039814223788894e-05, + "loss": 0.1402, + "step": 14122 + }, + { + "epoch": 0.28248, + "grad_norm": 1.3169316053390503, + "learning_rate": 1.803898385122612e-05, + "loss": 0.1114, + "step": 14124 + }, + { + "epoch": 0.28252, + "grad_norm": 2.0331740379333496, + "learning_rate": 1.803815332193922e-05, + "loss": 0.2261, + "step": 14126 + }, + { + "epoch": 0.28256, + "grad_norm": 2.97623872756958, + "learning_rate": 1.8037322635944383e-05, + "loss": 0.1742, + "step": 14128 + }, + { + "epoch": 0.2826, + "grad_norm": 1.3332233428955078, + "learning_rate": 1.80364917932578e-05, + "loss": 0.095, + "step": 14130 + }, + { + "epoch": 0.28264, + "grad_norm": 0.2189227044582367, + "learning_rate": 1.8035660793895675e-05, + "loss": 0.1234, + "step": 14132 + }, + { + "epoch": 0.28268, + "grad_norm": 2.3974666595458984, + "learning_rate": 1.8034829637874202e-05, + "loss": 0.105, + "step": 14134 + }, + { + "epoch": 0.28272, + "grad_norm": 0.09074573963880539, + "learning_rate": 1.803399832520959e-05, + "loss": 0.1533, + "step": 14136 + }, + { + "epoch": 0.28276, + "grad_norm": 1.4797594547271729, + "learning_rate": 1.803316685591805e-05, + "loss": 0.2778, + "step": 14138 + }, + { + "epoch": 0.2828, + "grad_norm": 0.8309820294380188, + "learning_rate": 1.803233523001578e-05, + "loss": 0.0763, + "step": 14140 + }, + { + "epoch": 0.28284, + "grad_norm": 3.9328055381774902, + "learning_rate": 1.8031503447519002e-05, + "loss": 0.3315, + "step": 14142 + }, + { + "epoch": 0.28288, + "grad_norm": 0.8902566432952881, + "learning_rate": 1.8030671508443928e-05, + "loss": 0.2534, + "step": 14144 + }, + { + "epoch": 0.28292, + "grad_norm": 3.3640589714050293, + "learning_rate": 1.802983941280678e-05, + "loss": 0.2171, + "step": 14146 + }, + { + "epoch": 0.28296, + "grad_norm": 3.719315528869629, + "learning_rate": 1.8029007160623778e-05, + "loss": 0.3832, + "step": 14148 + }, + { + "epoch": 0.283, + "grad_norm": 0.3859250247478485, + "learning_rate": 1.8028174751911147e-05, + "loss": 0.0165, + "step": 14150 + }, + { + "epoch": 0.28304, + "grad_norm": 0.7829034924507141, + "learning_rate": 1.8027342186685114e-05, + "loss": 0.0494, + "step": 14152 + }, + { + "epoch": 0.28308, + "grad_norm": 3.0728352069854736, + "learning_rate": 1.8026509464961917e-05, + "loss": 0.256, + "step": 14154 + }, + { + "epoch": 0.28312, + "grad_norm": 4.369729042053223, + "learning_rate": 1.802567658675778e-05, + "loss": 0.3707, + "step": 14156 + }, + { + "epoch": 0.28316, + "grad_norm": 4.116450309753418, + "learning_rate": 1.8024843552088955e-05, + "loss": 0.4814, + "step": 14158 + }, + { + "epoch": 0.2832, + "grad_norm": 0.5963610410690308, + "learning_rate": 1.802401036097167e-05, + "loss": 0.0454, + "step": 14160 + }, + { + "epoch": 0.28324, + "grad_norm": 0.6535839438438416, + "learning_rate": 1.802317701342217e-05, + "loss": 0.3254, + "step": 14162 + }, + { + "epoch": 0.28328, + "grad_norm": 0.4497660994529724, + "learning_rate": 1.80223435094567e-05, + "loss": 0.1351, + "step": 14164 + }, + { + "epoch": 0.28332, + "grad_norm": 1.5467596054077148, + "learning_rate": 1.802150984909152e-05, + "loss": 0.2028, + "step": 14166 + }, + { + "epoch": 0.28336, + "grad_norm": 0.3059878647327423, + "learning_rate": 1.802067603234287e-05, + "loss": 0.2613, + "step": 14168 + }, + { + "epoch": 0.2834, + "grad_norm": 0.0939486175775528, + "learning_rate": 1.801984205922701e-05, + "loss": 0.1412, + "step": 14170 + }, + { + "epoch": 0.28344, + "grad_norm": 1.9400557279586792, + "learning_rate": 1.801900792976021e-05, + "loss": 0.1182, + "step": 14172 + }, + { + "epoch": 0.28348, + "grad_norm": 3.02059268951416, + "learning_rate": 1.8018173643958715e-05, + "loss": 0.2179, + "step": 14174 + }, + { + "epoch": 0.28352, + "grad_norm": 1.8330661058425903, + "learning_rate": 1.8017339201838798e-05, + "loss": 0.2408, + "step": 14176 + }, + { + "epoch": 0.28356, + "grad_norm": 2.40677547454834, + "learning_rate": 1.8016504603416724e-05, + "loss": 0.1586, + "step": 14178 + }, + { + "epoch": 0.2836, + "grad_norm": 2.268019914627075, + "learning_rate": 1.8015669848708768e-05, + "loss": 0.2021, + "step": 14180 + }, + { + "epoch": 0.28364, + "grad_norm": 2.0810976028442383, + "learning_rate": 1.80148349377312e-05, + "loss": 0.1266, + "step": 14182 + }, + { + "epoch": 0.28368, + "grad_norm": 2.671053886413574, + "learning_rate": 1.8013999870500298e-05, + "loss": 0.3802, + "step": 14184 + }, + { + "epoch": 0.28372, + "grad_norm": 0.6344791650772095, + "learning_rate": 1.8013164647032345e-05, + "loss": 0.0592, + "step": 14186 + }, + { + "epoch": 0.28376, + "grad_norm": 1.2029799222946167, + "learning_rate": 1.801232926734362e-05, + "loss": 0.0504, + "step": 14188 + }, + { + "epoch": 0.2838, + "grad_norm": 1.489011526107788, + "learning_rate": 1.8011493731450412e-05, + "loss": 0.0707, + "step": 14190 + }, + { + "epoch": 0.28384, + "grad_norm": 0.13862940669059753, + "learning_rate": 1.8010658039369006e-05, + "loss": 0.0117, + "step": 14192 + }, + { + "epoch": 0.28388, + "grad_norm": 0.5708065032958984, + "learning_rate": 1.8009822191115703e-05, + "loss": 0.0493, + "step": 14194 + }, + { + "epoch": 0.28392, + "grad_norm": 2.6081161499023438, + "learning_rate": 1.800898618670679e-05, + "loss": 0.243, + "step": 14196 + }, + { + "epoch": 0.28396, + "grad_norm": 2.6660726070404053, + "learning_rate": 1.8008150026158568e-05, + "loss": 0.1195, + "step": 14198 + }, + { + "epoch": 0.284, + "grad_norm": 2.568321704864502, + "learning_rate": 1.8007313709487334e-05, + "loss": 0.1479, + "step": 14200 + }, + { + "epoch": 0.28404, + "grad_norm": 0.7989767789840698, + "learning_rate": 1.8006477236709403e-05, + "loss": 0.1231, + "step": 14202 + }, + { + "epoch": 0.28408, + "grad_norm": 3.9916694164276123, + "learning_rate": 1.8005640607841074e-05, + "loss": 0.3215, + "step": 14204 + }, + { + "epoch": 0.28412, + "grad_norm": 0.37471282482147217, + "learning_rate": 1.800480382289866e-05, + "loss": 0.1399, + "step": 14206 + }, + { + "epoch": 0.28416, + "grad_norm": 1.3793988227844238, + "learning_rate": 1.8003966881898473e-05, + "loss": 0.1052, + "step": 14208 + }, + { + "epoch": 0.2842, + "grad_norm": 1.9147634506225586, + "learning_rate": 1.8003129784856832e-05, + "loss": 0.2984, + "step": 14210 + }, + { + "epoch": 0.28424, + "grad_norm": 1.2842190265655518, + "learning_rate": 1.8002292531790056e-05, + "loss": 0.0828, + "step": 14212 + }, + { + "epoch": 0.28428, + "grad_norm": 1.5214848518371582, + "learning_rate": 1.8001455122714468e-05, + "loss": 0.2216, + "step": 14214 + }, + { + "epoch": 0.28432, + "grad_norm": 1.0022906064987183, + "learning_rate": 1.8000617557646392e-05, + "loss": 0.2726, + "step": 14216 + }, + { + "epoch": 0.28436, + "grad_norm": 3.7954139709472656, + "learning_rate": 1.799977983660216e-05, + "loss": 0.2032, + "step": 14218 + }, + { + "epoch": 0.2844, + "grad_norm": 1.3821995258331299, + "learning_rate": 1.7998941959598097e-05, + "loss": 0.1642, + "step": 14220 + }, + { + "epoch": 0.28444, + "grad_norm": 1.8991258144378662, + "learning_rate": 1.7998103926650542e-05, + "loss": 0.2141, + "step": 14222 + }, + { + "epoch": 0.28448, + "grad_norm": 0.40920719504356384, + "learning_rate": 1.7997265737775837e-05, + "loss": 0.0557, + "step": 14224 + }, + { + "epoch": 0.28452, + "grad_norm": 0.37918931245803833, + "learning_rate": 1.7996427392990317e-05, + "loss": 0.0868, + "step": 14226 + }, + { + "epoch": 0.28456, + "grad_norm": 0.5814907550811768, + "learning_rate": 1.799558889231033e-05, + "loss": 0.0661, + "step": 14228 + }, + { + "epoch": 0.2846, + "grad_norm": 0.5906938314437866, + "learning_rate": 1.799475023575222e-05, + "loss": 0.0284, + "step": 14230 + }, + { + "epoch": 0.28464, + "grad_norm": 0.6251517534255981, + "learning_rate": 1.7993911423332336e-05, + "loss": 0.0909, + "step": 14232 + }, + { + "epoch": 0.28468, + "grad_norm": 1.2009992599487305, + "learning_rate": 1.7993072455067037e-05, + "loss": 0.1923, + "step": 14234 + }, + { + "epoch": 0.28472, + "grad_norm": 1.1094995737075806, + "learning_rate": 1.7992233330972673e-05, + "loss": 0.1141, + "step": 14236 + }, + { + "epoch": 0.28476, + "grad_norm": 3.204378843307495, + "learning_rate": 1.7991394051065606e-05, + "loss": 0.5263, + "step": 14238 + }, + { + "epoch": 0.2848, + "grad_norm": 0.6204032897949219, + "learning_rate": 1.79905546153622e-05, + "loss": 0.048, + "step": 14240 + }, + { + "epoch": 0.28484, + "grad_norm": 1.407907485961914, + "learning_rate": 1.7989715023878817e-05, + "loss": 0.222, + "step": 14242 + }, + { + "epoch": 0.28488, + "grad_norm": 0.9180957674980164, + "learning_rate": 1.7988875276631824e-05, + "loss": 0.441, + "step": 14244 + }, + { + "epoch": 0.28492, + "grad_norm": 0.086957648396492, + "learning_rate": 1.7988035373637597e-05, + "loss": 0.2534, + "step": 14246 + }, + { + "epoch": 0.28496, + "grad_norm": 2.076343297958374, + "learning_rate": 1.7987195314912504e-05, + "loss": 0.1932, + "step": 14248 + }, + { + "epoch": 0.285, + "grad_norm": 0.37668871879577637, + "learning_rate": 1.798635510047293e-05, + "loss": 0.0659, + "step": 14250 + }, + { + "epoch": 0.28504, + "grad_norm": 1.701723337173462, + "learning_rate": 1.798551473033525e-05, + "loss": 0.0756, + "step": 14252 + }, + { + "epoch": 0.28508, + "grad_norm": 0.6286095976829529, + "learning_rate": 1.798467420451585e-05, + "loss": 0.1062, + "step": 14254 + }, + { + "epoch": 0.28512, + "grad_norm": 1.153843879699707, + "learning_rate": 1.7983833523031114e-05, + "loss": 0.0591, + "step": 14256 + }, + { + "epoch": 0.28516, + "grad_norm": 2.3864028453826904, + "learning_rate": 1.7982992685897437e-05, + "loss": 0.1683, + "step": 14258 + }, + { + "epoch": 0.2852, + "grad_norm": 1.1616573333740234, + "learning_rate": 1.7982151693131206e-05, + "loss": 0.1173, + "step": 14260 + }, + { + "epoch": 0.28524, + "grad_norm": 3.471229314804077, + "learning_rate": 1.7981310544748817e-05, + "loss": 0.2885, + "step": 14262 + }, + { + "epoch": 0.28528, + "grad_norm": 4.01158332824707, + "learning_rate": 1.798046924076667e-05, + "loss": 0.1866, + "step": 14264 + }, + { + "epoch": 0.28532, + "grad_norm": 1.267479658126831, + "learning_rate": 1.7979627781201164e-05, + "loss": 0.053, + "step": 14266 + }, + { + "epoch": 0.28536, + "grad_norm": 1.7384591102600098, + "learning_rate": 1.797878616606871e-05, + "loss": 0.3194, + "step": 14268 + }, + { + "epoch": 0.2854, + "grad_norm": 2.775806188583374, + "learning_rate": 1.7977944395385713e-05, + "loss": 0.3259, + "step": 14270 + }, + { + "epoch": 0.28544, + "grad_norm": 1.3707369565963745, + "learning_rate": 1.7977102469168578e-05, + "loss": 0.0851, + "step": 14272 + }, + { + "epoch": 0.28548, + "grad_norm": 1.659745454788208, + "learning_rate": 1.7976260387433727e-05, + "loss": 0.1026, + "step": 14274 + }, + { + "epoch": 0.28552, + "grad_norm": 0.06987076997756958, + "learning_rate": 1.7975418150197573e-05, + "loss": 0.0816, + "step": 14276 + }, + { + "epoch": 0.28556, + "grad_norm": 1.7395708560943604, + "learning_rate": 1.7974575757476537e-05, + "loss": 0.1682, + "step": 14278 + }, + { + "epoch": 0.2856, + "grad_norm": 0.7999492883682251, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.0253, + "step": 14280 + }, + { + "epoch": 0.28564, + "grad_norm": 0.43148738145828247, + "learning_rate": 1.7972890505645504e-05, + "loss": 0.0507, + "step": 14282 + }, + { + "epoch": 0.28568, + "grad_norm": 0.1533154845237732, + "learning_rate": 1.797204764656837e-05, + "loss": 0.0096, + "step": 14284 + }, + { + "epoch": 0.28572, + "grad_norm": 1.5341662168502808, + "learning_rate": 1.797120463207206e-05, + "loss": 0.0768, + "step": 14286 + }, + { + "epoch": 0.28576, + "grad_norm": 0.5576345324516296, + "learning_rate": 1.797036146217301e-05, + "loss": 0.0512, + "step": 14288 + }, + { + "epoch": 0.2858, + "grad_norm": 1.6656696796417236, + "learning_rate": 1.7969518136887664e-05, + "loss": 0.1139, + "step": 14290 + }, + { + "epoch": 0.28584, + "grad_norm": 0.6043295860290527, + "learning_rate": 1.7968674656232455e-05, + "loss": 0.0201, + "step": 14292 + }, + { + "epoch": 0.28588, + "grad_norm": 1.3496124744415283, + "learning_rate": 1.7967831020223836e-05, + "loss": 0.1399, + "step": 14294 + }, + { + "epoch": 0.28592, + "grad_norm": 5.86049747467041, + "learning_rate": 1.7966987228878247e-05, + "loss": 0.602, + "step": 14296 + }, + { + "epoch": 0.28596, + "grad_norm": 0.886258602142334, + "learning_rate": 1.7966143282212135e-05, + "loss": 0.0675, + "step": 14298 + }, + { + "epoch": 0.286, + "grad_norm": 1.4567359685897827, + "learning_rate": 1.7965299180241963e-05, + "loss": 0.0688, + "step": 14300 + }, + { + "epoch": 0.28604, + "grad_norm": 3.49088978767395, + "learning_rate": 1.7964454922984184e-05, + "loss": 0.2911, + "step": 14302 + }, + { + "epoch": 0.28608, + "grad_norm": 0.42732319235801697, + "learning_rate": 1.7963610510455254e-05, + "loss": 0.2192, + "step": 14304 + }, + { + "epoch": 0.28612, + "grad_norm": 0.8709750771522522, + "learning_rate": 1.796276594267164e-05, + "loss": 0.1605, + "step": 14306 + }, + { + "epoch": 0.28616, + "grad_norm": 0.6724504828453064, + "learning_rate": 1.79619212196498e-05, + "loss": 0.0867, + "step": 14308 + }, + { + "epoch": 0.2862, + "grad_norm": 0.4268300235271454, + "learning_rate": 1.796107634140621e-05, + "loss": 0.042, + "step": 14310 + }, + { + "epoch": 0.28624, + "grad_norm": 4.099006175994873, + "learning_rate": 1.7960231307957333e-05, + "loss": 0.1645, + "step": 14312 + }, + { + "epoch": 0.28628, + "grad_norm": 1.2440413236618042, + "learning_rate": 1.7959386119319652e-05, + "loss": 0.0641, + "step": 14314 + }, + { + "epoch": 0.28632, + "grad_norm": 0.8956195116043091, + "learning_rate": 1.795854077550964e-05, + "loss": 0.3177, + "step": 14316 + }, + { + "epoch": 0.28636, + "grad_norm": 2.8048689365386963, + "learning_rate": 1.795769527654378e-05, + "loss": 0.188, + "step": 14318 + }, + { + "epoch": 0.2864, + "grad_norm": 0.022640246897935867, + "learning_rate": 1.7956849622438554e-05, + "loss": 0.0991, + "step": 14320 + }, + { + "epoch": 0.28644, + "grad_norm": 0.9887398481369019, + "learning_rate": 1.795600381321045e-05, + "loss": 0.3217, + "step": 14322 + }, + { + "epoch": 0.28648, + "grad_norm": 4.146340847015381, + "learning_rate": 1.795515784887595e-05, + "loss": 0.2752, + "step": 14324 + }, + { + "epoch": 0.28652, + "grad_norm": 4.304254531860352, + "learning_rate": 1.7954311729451556e-05, + "loss": 0.2371, + "step": 14326 + }, + { + "epoch": 0.28656, + "grad_norm": 0.8742691874504089, + "learning_rate": 1.795346545495376e-05, + "loss": 0.0281, + "step": 14328 + }, + { + "epoch": 0.2866, + "grad_norm": 2.116488456726074, + "learning_rate": 1.795261902539906e-05, + "loss": 0.139, + "step": 14330 + }, + { + "epoch": 0.28664, + "grad_norm": 1.4740376472473145, + "learning_rate": 1.7951772440803955e-05, + "loss": 0.1265, + "step": 14332 + }, + { + "epoch": 0.28668, + "grad_norm": 2.6915533542633057, + "learning_rate": 1.7950925701184953e-05, + "loss": 0.1878, + "step": 14334 + }, + { + "epoch": 0.28672, + "grad_norm": 1.0866576433181763, + "learning_rate": 1.7950078806558565e-05, + "loss": 0.0607, + "step": 14336 + }, + { + "epoch": 0.28676, + "grad_norm": 1.3714499473571777, + "learning_rate": 1.7949231756941292e-05, + "loss": 0.1234, + "step": 14338 + }, + { + "epoch": 0.2868, + "grad_norm": 3.929786205291748, + "learning_rate": 1.794838455234966e-05, + "loss": 0.2719, + "step": 14340 + }, + { + "epoch": 0.28684, + "grad_norm": 0.5676825642585754, + "learning_rate": 1.794753719280017e-05, + "loss": 0.0338, + "step": 14342 + }, + { + "epoch": 0.28688, + "grad_norm": 1.711248517036438, + "learning_rate": 1.7946689678309356e-05, + "loss": 0.0716, + "step": 14344 + }, + { + "epoch": 0.28692, + "grad_norm": 1.0800158977508545, + "learning_rate": 1.7945842008893736e-05, + "loss": 0.0846, + "step": 14346 + }, + { + "epoch": 0.28696, + "grad_norm": 1.0415916442871094, + "learning_rate": 1.7944994184569837e-05, + "loss": 0.0361, + "step": 14348 + }, + { + "epoch": 0.287, + "grad_norm": 2.6491684913635254, + "learning_rate": 1.7944146205354182e-05, + "loss": 0.2143, + "step": 14350 + }, + { + "epoch": 0.28704, + "grad_norm": 3.126689910888672, + "learning_rate": 1.794329807126331e-05, + "loss": 0.1431, + "step": 14352 + }, + { + "epoch": 0.28708, + "grad_norm": 2.3759868144989014, + "learning_rate": 1.7942449782313752e-05, + "loss": 0.1172, + "step": 14354 + }, + { + "epoch": 0.28712, + "grad_norm": 3.4473466873168945, + "learning_rate": 1.7941601338522044e-05, + "loss": 0.2232, + "step": 14356 + }, + { + "epoch": 0.28716, + "grad_norm": 2.9990646839141846, + "learning_rate": 1.794075273990473e-05, + "loss": 0.169, + "step": 14358 + }, + { + "epoch": 0.2872, + "grad_norm": 1.2136856317520142, + "learning_rate": 1.7939903986478354e-05, + "loss": 0.1131, + "step": 14360 + }, + { + "epoch": 0.28724, + "grad_norm": 0.3164205849170685, + "learning_rate": 1.7939055078259464e-05, + "loss": 0.1251, + "step": 14362 + }, + { + "epoch": 0.28728, + "grad_norm": 5.535141468048096, + "learning_rate": 1.7938206015264605e-05, + "loss": 0.3517, + "step": 14364 + }, + { + "epoch": 0.28732, + "grad_norm": 0.8993200063705444, + "learning_rate": 1.793735679751033e-05, + "loss": 0.0354, + "step": 14366 + }, + { + "epoch": 0.28736, + "grad_norm": 1.9991686344146729, + "learning_rate": 1.7936507425013204e-05, + "loss": 0.6803, + "step": 14368 + }, + { + "epoch": 0.2874, + "grad_norm": 0.40226486325263977, + "learning_rate": 1.793565789778978e-05, + "loss": 0.0187, + "step": 14370 + }, + { + "epoch": 0.28744, + "grad_norm": 0.05665360763669014, + "learning_rate": 1.7934808215856618e-05, + "loss": 0.1075, + "step": 14372 + }, + { + "epoch": 0.28748, + "grad_norm": 5.181577682495117, + "learning_rate": 1.7933958379230284e-05, + "loss": 0.1097, + "step": 14374 + }, + { + "epoch": 0.28752, + "grad_norm": 0.49877026677131653, + "learning_rate": 1.7933108387927346e-05, + "loss": 0.1532, + "step": 14376 + }, + { + "epoch": 0.28756, + "grad_norm": 0.6242012977600098, + "learning_rate": 1.7932258241964377e-05, + "loss": 0.0688, + "step": 14378 + }, + { + "epoch": 0.2876, + "grad_norm": 6.321228981018066, + "learning_rate": 1.793140794135795e-05, + "loss": 0.375, + "step": 14380 + }, + { + "epoch": 0.28764, + "grad_norm": 0.5172264575958252, + "learning_rate": 1.793055748612464e-05, + "loss": 0.0128, + "step": 14382 + }, + { + "epoch": 0.28768, + "grad_norm": 8.574150085449219, + "learning_rate": 1.792970687628103e-05, + "loss": 0.6387, + "step": 14384 + }, + { + "epoch": 0.28772, + "grad_norm": 2.414968252182007, + "learning_rate": 1.79288561118437e-05, + "loss": 0.11, + "step": 14386 + }, + { + "epoch": 0.28776, + "grad_norm": 0.2230048030614853, + "learning_rate": 1.792800519282924e-05, + "loss": 0.0401, + "step": 14388 + }, + { + "epoch": 0.2878, + "grad_norm": 4.9623799324035645, + "learning_rate": 1.7927154119254234e-05, + "loss": 0.4873, + "step": 14390 + }, + { + "epoch": 0.28784, + "grad_norm": 3.0144944190979004, + "learning_rate": 1.7926302891135283e-05, + "loss": 0.2162, + "step": 14392 + }, + { + "epoch": 0.28788, + "grad_norm": 1.9309844970703125, + "learning_rate": 1.7925451508488975e-05, + "loss": 0.1334, + "step": 14394 + }, + { + "epoch": 0.28792, + "grad_norm": 3.0066561698913574, + "learning_rate": 1.792459997133191e-05, + "loss": 0.1307, + "step": 14396 + }, + { + "epoch": 0.28796, + "grad_norm": 1.3914867639541626, + "learning_rate": 1.7923748279680684e-05, + "loss": 0.1007, + "step": 14398 + }, + { + "epoch": 0.288, + "grad_norm": 1.7349884510040283, + "learning_rate": 1.792289643355191e-05, + "loss": 0.0891, + "step": 14400 + }, + { + "epoch": 0.28804, + "grad_norm": 0.6202976107597351, + "learning_rate": 1.7922044432962188e-05, + "loss": 0.1575, + "step": 14402 + }, + { + "epoch": 0.28808, + "grad_norm": 0.8850497007369995, + "learning_rate": 1.792119227792813e-05, + "loss": 0.1952, + "step": 14404 + }, + { + "epoch": 0.28812, + "grad_norm": 0.6241782903671265, + "learning_rate": 1.7920339968466357e-05, + "loss": 0.0495, + "step": 14406 + }, + { + "epoch": 0.28816, + "grad_norm": 3.9046826362609863, + "learning_rate": 1.7919487504593472e-05, + "loss": 0.1721, + "step": 14408 + }, + { + "epoch": 0.2882, + "grad_norm": 7.140202045440674, + "learning_rate": 1.791863488632611e-05, + "loss": 0.2794, + "step": 14410 + }, + { + "epoch": 0.28824, + "grad_norm": 0.28007379174232483, + "learning_rate": 1.7917782113680875e-05, + "loss": 0.1324, + "step": 14412 + }, + { + "epoch": 0.28828, + "grad_norm": 0.12959645688533783, + "learning_rate": 1.7916929186674404e-05, + "loss": 0.0304, + "step": 14414 + }, + { + "epoch": 0.28832, + "grad_norm": 0.3574233651161194, + "learning_rate": 1.7916076105323323e-05, + "loss": 0.212, + "step": 14416 + }, + { + "epoch": 0.28836, + "grad_norm": 0.08482571691274643, + "learning_rate": 1.7915222869644266e-05, + "loss": 0.033, + "step": 14418 + }, + { + "epoch": 0.2884, + "grad_norm": 1.1078437566757202, + "learning_rate": 1.7914369479653858e-05, + "loss": 0.188, + "step": 14420 + }, + { + "epoch": 0.28844, + "grad_norm": 0.4725673794746399, + "learning_rate": 1.7913515935368744e-05, + "loss": 0.0229, + "step": 14422 + }, + { + "epoch": 0.28848, + "grad_norm": 1.022612452507019, + "learning_rate": 1.791266223680557e-05, + "loss": 0.0641, + "step": 14424 + }, + { + "epoch": 0.28852, + "grad_norm": 2.1373977661132812, + "learning_rate": 1.7911808383980962e-05, + "loss": 0.0571, + "step": 14426 + }, + { + "epoch": 0.28856, + "grad_norm": 0.3240865170955658, + "learning_rate": 1.7910954376911582e-05, + "loss": 0.0483, + "step": 14428 + }, + { + "epoch": 0.2886, + "grad_norm": 1.8635598421096802, + "learning_rate": 1.791010021561407e-05, + "loss": 0.0398, + "step": 14430 + }, + { + "epoch": 0.28864, + "grad_norm": 0.16711384057998657, + "learning_rate": 1.7909245900105085e-05, + "loss": 0.0214, + "step": 14432 + }, + { + "epoch": 0.28868, + "grad_norm": 3.750593662261963, + "learning_rate": 1.790839143040128e-05, + "loss": 0.1847, + "step": 14434 + }, + { + "epoch": 0.28872, + "grad_norm": 0.22735664248466492, + "learning_rate": 1.790753680651931e-05, + "loss": 0.0104, + "step": 14436 + }, + { + "epoch": 0.28876, + "grad_norm": 1.8939744234085083, + "learning_rate": 1.7906682028475838e-05, + "loss": 0.2147, + "step": 14438 + }, + { + "epoch": 0.2888, + "grad_norm": 2.4399983882904053, + "learning_rate": 1.7905827096287532e-05, + "loss": 0.1068, + "step": 14440 + }, + { + "epoch": 0.28884, + "grad_norm": 6.801800727844238, + "learning_rate": 1.7904972009971054e-05, + "loss": 0.8555, + "step": 14442 + }, + { + "epoch": 0.28888, + "grad_norm": 2.0905890464782715, + "learning_rate": 1.7904116769543077e-05, + "loss": 0.269, + "step": 14444 + }, + { + "epoch": 0.28892, + "grad_norm": 2.1598782539367676, + "learning_rate": 1.7903261375020275e-05, + "loss": 0.1679, + "step": 14446 + }, + { + "epoch": 0.28896, + "grad_norm": 0.8717235922813416, + "learning_rate": 1.7902405826419323e-05, + "loss": 0.0391, + "step": 14448 + }, + { + "epoch": 0.289, + "grad_norm": 7.404802322387695, + "learning_rate": 1.7901550123756906e-05, + "loss": 0.5953, + "step": 14450 + }, + { + "epoch": 0.28904, + "grad_norm": 1.289519190788269, + "learning_rate": 1.7900694267049698e-05, + "loss": 0.0567, + "step": 14452 + }, + { + "epoch": 0.28908, + "grad_norm": 0.6174150109291077, + "learning_rate": 1.7899838256314387e-05, + "loss": 0.0408, + "step": 14454 + }, + { + "epoch": 0.28912, + "grad_norm": 1.0847967863082886, + "learning_rate": 1.789898209156766e-05, + "loss": 0.2024, + "step": 14456 + }, + { + "epoch": 0.28916, + "grad_norm": 6.026782512664795, + "learning_rate": 1.7898125772826213e-05, + "loss": 0.612, + "step": 14458 + }, + { + "epoch": 0.2892, + "grad_norm": 0.28514227271080017, + "learning_rate": 1.789726930010674e-05, + "loss": 0.0175, + "step": 14460 + }, + { + "epoch": 0.28924, + "grad_norm": 6.444458961486816, + "learning_rate": 1.789641267342593e-05, + "loss": 0.4853, + "step": 14462 + }, + { + "epoch": 0.28928, + "grad_norm": 1.395638108253479, + "learning_rate": 1.7895555892800494e-05, + "loss": 0.3537, + "step": 14464 + }, + { + "epoch": 0.28932, + "grad_norm": 3.0816619396209717, + "learning_rate": 1.789469895824713e-05, + "loss": 0.2767, + "step": 14466 + }, + { + "epoch": 0.28936, + "grad_norm": 1.0042427778244019, + "learning_rate": 1.7893841869782548e-05, + "loss": 0.1643, + "step": 14468 + }, + { + "epoch": 0.2894, + "grad_norm": 1.0983458757400513, + "learning_rate": 1.789298462742345e-05, + "loss": 0.0691, + "step": 14470 + }, + { + "epoch": 0.28944, + "grad_norm": 1.6913858652114868, + "learning_rate": 1.789212723118656e-05, + "loss": 0.1524, + "step": 14472 + }, + { + "epoch": 0.28948, + "grad_norm": 0.29854318499565125, + "learning_rate": 1.789126968108858e-05, + "loss": 0.0347, + "step": 14474 + }, + { + "epoch": 0.28952, + "grad_norm": 0.8820049166679382, + "learning_rate": 1.7890411977146235e-05, + "loss": 0.0454, + "step": 14476 + }, + { + "epoch": 0.28956, + "grad_norm": 3.944244146347046, + "learning_rate": 1.788955411937625e-05, + "loss": 0.4171, + "step": 14478 + }, + { + "epoch": 0.2896, + "grad_norm": 1.9328136444091797, + "learning_rate": 1.7888696107795343e-05, + "loss": 0.3545, + "step": 14480 + }, + { + "epoch": 0.28964, + "grad_norm": 4.51222562789917, + "learning_rate": 1.7887837942420244e-05, + "loss": 0.586, + "step": 14482 + }, + { + "epoch": 0.28968, + "grad_norm": 1.2977607250213623, + "learning_rate": 1.7886979623267686e-05, + "loss": 0.1008, + "step": 14484 + }, + { + "epoch": 0.28972, + "grad_norm": 1.6659489870071411, + "learning_rate": 1.7886121150354395e-05, + "loss": 0.1439, + "step": 14486 + }, + { + "epoch": 0.28976, + "grad_norm": 1.0284509658813477, + "learning_rate": 1.7885262523697116e-05, + "loss": 0.1881, + "step": 14488 + }, + { + "epoch": 0.2898, + "grad_norm": 1.3336604833602905, + "learning_rate": 1.7884403743312583e-05, + "loss": 0.066, + "step": 14490 + }, + { + "epoch": 0.28984, + "grad_norm": 0.6083081364631653, + "learning_rate": 1.788354480921754e-05, + "loss": 0.2463, + "step": 14492 + }, + { + "epoch": 0.28988, + "grad_norm": 2.9080193042755127, + "learning_rate": 1.7882685721428735e-05, + "loss": 0.1851, + "step": 14494 + }, + { + "epoch": 0.28992, + "grad_norm": 1.1830471754074097, + "learning_rate": 1.788182647996291e-05, + "loss": 0.1483, + "step": 14496 + }, + { + "epoch": 0.28996, + "grad_norm": 1.2729605436325073, + "learning_rate": 1.788096708483682e-05, + "loss": 0.0477, + "step": 14498 + }, + { + "epoch": 0.29, + "grad_norm": 2.366981029510498, + "learning_rate": 1.788010753606722e-05, + "loss": 0.1276, + "step": 14500 + }, + { + "epoch": 0.29004, + "grad_norm": 1.2227957248687744, + "learning_rate": 1.787924783367087e-05, + "loss": 0.158, + "step": 14502 + }, + { + "epoch": 0.29008, + "grad_norm": 1.1128462553024292, + "learning_rate": 1.7878387977664522e-05, + "loss": 0.0894, + "step": 14504 + }, + { + "epoch": 0.29012, + "grad_norm": 1.6515731811523438, + "learning_rate": 1.7877527968064946e-05, + "loss": 0.1793, + "step": 14506 + }, + { + "epoch": 0.29016, + "grad_norm": 0.6994551420211792, + "learning_rate": 1.787666780488891e-05, + "loss": 0.0297, + "step": 14508 + }, + { + "epoch": 0.2902, + "grad_norm": 3.0963656902313232, + "learning_rate": 1.7875807488153173e-05, + "loss": 0.1426, + "step": 14510 + }, + { + "epoch": 0.29024, + "grad_norm": 4.494760513305664, + "learning_rate": 1.787494701787452e-05, + "loss": 0.5262, + "step": 14512 + }, + { + "epoch": 0.29028, + "grad_norm": 0.3602535128593445, + "learning_rate": 1.7874086394069716e-05, + "loss": 0.0419, + "step": 14514 + }, + { + "epoch": 0.29032, + "grad_norm": 1.78075110912323, + "learning_rate": 1.7873225616755547e-05, + "loss": 0.1587, + "step": 14516 + }, + { + "epoch": 0.29036, + "grad_norm": 4.1772589683532715, + "learning_rate": 1.7872364685948792e-05, + "loss": 0.2888, + "step": 14518 + }, + { + "epoch": 0.2904, + "grad_norm": 1.262121319770813, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.114, + "step": 14520 + }, + { + "epoch": 0.29044, + "grad_norm": 5.405762672424316, + "learning_rate": 1.787064236392466e-05, + "loss": 0.3311, + "step": 14522 + }, + { + "epoch": 0.29048, + "grad_norm": 2.362020254135132, + "learning_rate": 1.786978097274086e-05, + "loss": 0.179, + "step": 14524 + }, + { + "epoch": 0.29052, + "grad_norm": 1.856524109840393, + "learning_rate": 1.786891942813163e-05, + "loss": 0.1232, + "step": 14526 + }, + { + "epoch": 0.29056, + "grad_norm": 5.139665126800537, + "learning_rate": 1.7868057730113766e-05, + "loss": 0.3942, + "step": 14528 + }, + { + "epoch": 0.2906, + "grad_norm": 1.4436147212982178, + "learning_rate": 1.7867195878704062e-05, + "loss": 0.0573, + "step": 14530 + }, + { + "epoch": 0.29064, + "grad_norm": 1.7152405977249146, + "learning_rate": 1.7866333873919327e-05, + "loss": 0.0696, + "step": 14532 + }, + { + "epoch": 0.29068, + "grad_norm": 3.6955525875091553, + "learning_rate": 1.786547171577636e-05, + "loss": 0.2567, + "step": 14534 + }, + { + "epoch": 0.29072, + "grad_norm": 0.25889676809310913, + "learning_rate": 1.7864609404291976e-05, + "loss": 0.0744, + "step": 14536 + }, + { + "epoch": 0.29076, + "grad_norm": 1.488526701927185, + "learning_rate": 1.786374693948298e-05, + "loss": 0.1234, + "step": 14538 + }, + { + "epoch": 0.2908, + "grad_norm": 1.128847360610962, + "learning_rate": 1.786288432136619e-05, + "loss": 0.1391, + "step": 14540 + }, + { + "epoch": 0.29084, + "grad_norm": 1.5025537014007568, + "learning_rate": 1.7862021549958425e-05, + "loss": 0.0764, + "step": 14542 + }, + { + "epoch": 0.29088, + "grad_norm": 4.137233257293701, + "learning_rate": 1.78611586252765e-05, + "loss": 0.3423, + "step": 14544 + }, + { + "epoch": 0.29092, + "grad_norm": 2.5407471656799316, + "learning_rate": 1.7860295547337236e-05, + "loss": 0.2882, + "step": 14546 + }, + { + "epoch": 0.29096, + "grad_norm": 0.45097362995147705, + "learning_rate": 1.7859432316157468e-05, + "loss": 0.25, + "step": 14548 + }, + { + "epoch": 0.291, + "grad_norm": 0.39788898825645447, + "learning_rate": 1.785856893175402e-05, + "loss": 0.1178, + "step": 14550 + }, + { + "epoch": 0.29104, + "grad_norm": 1.5347849130630493, + "learning_rate": 1.7857705394143726e-05, + "loss": 0.1368, + "step": 14552 + }, + { + "epoch": 0.29108, + "grad_norm": 1.2319400310516357, + "learning_rate": 1.7856841703343417e-05, + "loss": 0.058, + "step": 14554 + }, + { + "epoch": 0.29112, + "grad_norm": 1.1436735391616821, + "learning_rate": 1.785597785936994e-05, + "loss": 0.0716, + "step": 14556 + }, + { + "epoch": 0.29116, + "grad_norm": 1.3908770084381104, + "learning_rate": 1.785511386224012e-05, + "loss": 0.1233, + "step": 14558 + }, + { + "epoch": 0.2912, + "grad_norm": 1.1358555555343628, + "learning_rate": 1.785424971197082e-05, + "loss": 0.0569, + "step": 14560 + }, + { + "epoch": 0.29124, + "grad_norm": 0.5747413635253906, + "learning_rate": 1.7853385408578875e-05, + "loss": 0.0908, + "step": 14562 + }, + { + "epoch": 0.29128, + "grad_norm": 1.5065499544143677, + "learning_rate": 1.7852520952081137e-05, + "loss": 0.0988, + "step": 14564 + }, + { + "epoch": 0.29132, + "grad_norm": 2.007763385772705, + "learning_rate": 1.7851656342494462e-05, + "loss": 0.1914, + "step": 14566 + }, + { + "epoch": 0.29136, + "grad_norm": 3.619459867477417, + "learning_rate": 1.7850791579835703e-05, + "loss": 0.4511, + "step": 14568 + }, + { + "epoch": 0.2914, + "grad_norm": 5.679092884063721, + "learning_rate": 1.7849926664121726e-05, + "loss": 0.3226, + "step": 14570 + }, + { + "epoch": 0.29144, + "grad_norm": 2.248204231262207, + "learning_rate": 1.784906159536938e-05, + "loss": 0.1386, + "step": 14572 + }, + { + "epoch": 0.29148, + "grad_norm": 1.2115639448165894, + "learning_rate": 1.7848196373595542e-05, + "loss": 0.0447, + "step": 14574 + }, + { + "epoch": 0.29152, + "grad_norm": 0.6227659583091736, + "learning_rate": 1.784733099881707e-05, + "loss": 0.0738, + "step": 14576 + }, + { + "epoch": 0.29156, + "grad_norm": 2.955458879470825, + "learning_rate": 1.7846465471050845e-05, + "loss": 0.2096, + "step": 14578 + }, + { + "epoch": 0.2916, + "grad_norm": 1.667859673500061, + "learning_rate": 1.7845599790313735e-05, + "loss": 0.164, + "step": 14580 + }, + { + "epoch": 0.29164, + "grad_norm": 5.751975059509277, + "learning_rate": 1.7844733956622614e-05, + "loss": 0.325, + "step": 14582 + }, + { + "epoch": 0.29168, + "grad_norm": 2.930295705795288, + "learning_rate": 1.7843867969994372e-05, + "loss": 0.1731, + "step": 14584 + }, + { + "epoch": 0.29172, + "grad_norm": 1.283210039138794, + "learning_rate": 1.7843001830445882e-05, + "loss": 0.0901, + "step": 14586 + }, + { + "epoch": 0.29176, + "grad_norm": 3.610074520111084, + "learning_rate": 1.7842135537994036e-05, + "loss": 0.1908, + "step": 14588 + }, + { + "epoch": 0.2918, + "grad_norm": 1.2931278944015503, + "learning_rate": 1.7841269092655714e-05, + "loss": 0.3948, + "step": 14590 + }, + { + "epoch": 0.29184, + "grad_norm": 0.10443456470966339, + "learning_rate": 1.784040249444782e-05, + "loss": 0.0823, + "step": 14592 + }, + { + "epoch": 0.29188, + "grad_norm": 1.3924881219863892, + "learning_rate": 1.783953574338724e-05, + "loss": 0.1604, + "step": 14594 + }, + { + "epoch": 0.29192, + "grad_norm": 1.2565973997116089, + "learning_rate": 1.783866883949088e-05, + "loss": 0.13, + "step": 14596 + }, + { + "epoch": 0.29196, + "grad_norm": 2.077430486679077, + "learning_rate": 1.783780178277563e-05, + "loss": 0.1145, + "step": 14598 + }, + { + "epoch": 0.292, + "grad_norm": 0.12291911989450455, + "learning_rate": 1.78369345732584e-05, + "loss": 0.0076, + "step": 14600 + }, + { + "epoch": 0.29204, + "grad_norm": 0.8358623385429382, + "learning_rate": 1.7836067210956093e-05, + "loss": 0.0374, + "step": 14602 + }, + { + "epoch": 0.29208, + "grad_norm": 0.08000515401363373, + "learning_rate": 1.7835199695885626e-05, + "loss": 0.0088, + "step": 14604 + }, + { + "epoch": 0.29212, + "grad_norm": 3.9628055095672607, + "learning_rate": 1.7834332028063906e-05, + "loss": 0.1454, + "step": 14606 + }, + { + "epoch": 0.29216, + "grad_norm": 2.394742012023926, + "learning_rate": 1.783346420750785e-05, + "loss": 0.1005, + "step": 14608 + }, + { + "epoch": 0.2922, + "grad_norm": 3.4412615299224854, + "learning_rate": 1.7832596234234376e-05, + "loss": 0.2019, + "step": 14610 + }, + { + "epoch": 0.29224, + "grad_norm": 3.837790012359619, + "learning_rate": 1.7831728108260407e-05, + "loss": 0.1592, + "step": 14612 + }, + { + "epoch": 0.29228, + "grad_norm": 1.275189757347107, + "learning_rate": 1.7830859829602866e-05, + "loss": 0.1629, + "step": 14614 + }, + { + "epoch": 0.29232, + "grad_norm": 0.3675195872783661, + "learning_rate": 1.782999139827868e-05, + "loss": 0.0182, + "step": 14616 + }, + { + "epoch": 0.29236, + "grad_norm": 0.7561726570129395, + "learning_rate": 1.782912281430478e-05, + "loss": 0.0381, + "step": 14618 + }, + { + "epoch": 0.2924, + "grad_norm": 2.293750286102295, + "learning_rate": 1.78282540776981e-05, + "loss": 0.113, + "step": 14620 + }, + { + "epoch": 0.29244, + "grad_norm": 3.6936380863189697, + "learning_rate": 1.782738518847558e-05, + "loss": 0.2065, + "step": 14622 + }, + { + "epoch": 0.29248, + "grad_norm": 4.903948783874512, + "learning_rate": 1.782651614665415e-05, + "loss": 0.5052, + "step": 14624 + }, + { + "epoch": 0.29252, + "grad_norm": 2.0957083702087402, + "learning_rate": 1.782564695225076e-05, + "loss": 0.1064, + "step": 14626 + }, + { + "epoch": 0.29256, + "grad_norm": 1.50724458694458, + "learning_rate": 1.7824777605282354e-05, + "loss": 0.0604, + "step": 14628 + }, + { + "epoch": 0.2926, + "grad_norm": 0.2959724962711334, + "learning_rate": 1.7823908105765883e-05, + "loss": 0.0516, + "step": 14630 + }, + { + "epoch": 0.29264, + "grad_norm": 0.5449682474136353, + "learning_rate": 1.7823038453718293e-05, + "loss": 0.125, + "step": 14632 + }, + { + "epoch": 0.29268, + "grad_norm": 0.6722941994667053, + "learning_rate": 1.7822168649156543e-05, + "loss": 0.2823, + "step": 14634 + }, + { + "epoch": 0.29272, + "grad_norm": 0.6741670966148376, + "learning_rate": 1.7821298692097583e-05, + "loss": 0.2461, + "step": 14636 + }, + { + "epoch": 0.29276, + "grad_norm": 0.2107659876346588, + "learning_rate": 1.7820428582558385e-05, + "loss": 0.0097, + "step": 14638 + }, + { + "epoch": 0.2928, + "grad_norm": 5.275033950805664, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.5358, + "step": 14640 + }, + { + "epoch": 0.29284, + "grad_norm": 0.17462381720542908, + "learning_rate": 1.7818687906107106e-05, + "loss": 0.0397, + "step": 14642 + }, + { + "epoch": 0.29288, + "grad_norm": 1.9801253080368042, + "learning_rate": 1.781781733922896e-05, + "loss": 0.0608, + "step": 14644 + }, + { + "epoch": 0.29292, + "grad_norm": 0.12424734234809875, + "learning_rate": 1.781694661993844e-05, + "loss": 0.1097, + "step": 14646 + }, + { + "epoch": 0.29296, + "grad_norm": 0.9819557666778564, + "learning_rate": 1.7816075748252526e-05, + "loss": 0.0431, + "step": 14648 + }, + { + "epoch": 0.293, + "grad_norm": 4.677022457122803, + "learning_rate": 1.781520472418819e-05, + "loss": 0.2285, + "step": 14650 + }, + { + "epoch": 0.29304, + "grad_norm": 0.4649621248245239, + "learning_rate": 1.7814333547762414e-05, + "loss": 0.0304, + "step": 14652 + }, + { + "epoch": 0.29308, + "grad_norm": 0.05486014112830162, + "learning_rate": 1.781346221899218e-05, + "loss": 0.0316, + "step": 14654 + }, + { + "epoch": 0.29312, + "grad_norm": 0.22905196249485016, + "learning_rate": 1.781259073789448e-05, + "loss": 0.3791, + "step": 14656 + }, + { + "epoch": 0.29316, + "grad_norm": 0.3994489312171936, + "learning_rate": 1.78117191044863e-05, + "loss": 0.1635, + "step": 14658 + }, + { + "epoch": 0.2932, + "grad_norm": 0.14601075649261475, + "learning_rate": 1.7810847318784632e-05, + "loss": 0.0236, + "step": 14660 + }, + { + "epoch": 0.29324, + "grad_norm": 1.4189969301223755, + "learning_rate": 1.780997538080648e-05, + "loss": 0.0671, + "step": 14662 + }, + { + "epoch": 0.29328, + "grad_norm": 0.7094199657440186, + "learning_rate": 1.7809103290568832e-05, + "loss": 0.1597, + "step": 14664 + }, + { + "epoch": 0.29332, + "grad_norm": 2.302602767944336, + "learning_rate": 1.7808231048088698e-05, + "loss": 0.1761, + "step": 14666 + }, + { + "epoch": 0.29336, + "grad_norm": 0.6834855079650879, + "learning_rate": 1.780735865338308e-05, + "loss": 0.1697, + "step": 14668 + }, + { + "epoch": 0.2934, + "grad_norm": 0.14875075221061707, + "learning_rate": 1.7806486106468983e-05, + "loss": 0.0105, + "step": 14670 + }, + { + "epoch": 0.29344, + "grad_norm": 3.223283290863037, + "learning_rate": 1.780561340736342e-05, + "loss": 0.1972, + "step": 14672 + }, + { + "epoch": 0.29348, + "grad_norm": 3.1943533420562744, + "learning_rate": 1.780474055608341e-05, + "loss": 0.0947, + "step": 14674 + }, + { + "epoch": 0.29352, + "grad_norm": 1.676885962486267, + "learning_rate": 1.780386755264596e-05, + "loss": 0.0689, + "step": 14676 + }, + { + "epoch": 0.29356, + "grad_norm": 2.435214042663574, + "learning_rate": 1.7802994397068096e-05, + "loss": 0.0695, + "step": 14678 + }, + { + "epoch": 0.2936, + "grad_norm": 2.1134955883026123, + "learning_rate": 1.780212108936684e-05, + "loss": 0.1119, + "step": 14680 + }, + { + "epoch": 0.29364, + "grad_norm": 3.4708147048950195, + "learning_rate": 1.780124762955921e-05, + "loss": 0.5311, + "step": 14682 + }, + { + "epoch": 0.29368, + "grad_norm": 0.09367550164461136, + "learning_rate": 1.780037401766225e-05, + "loss": 0.0261, + "step": 14684 + }, + { + "epoch": 0.29372, + "grad_norm": 3.1627862453460693, + "learning_rate": 1.779950025369298e-05, + "loss": 0.0852, + "step": 14686 + }, + { + "epoch": 0.29376, + "grad_norm": 0.14846143126487732, + "learning_rate": 1.779862633766843e-05, + "loss": 0.0421, + "step": 14688 + }, + { + "epoch": 0.2938, + "grad_norm": 1.887032389640808, + "learning_rate": 1.7797752269605654e-05, + "loss": 0.1765, + "step": 14690 + }, + { + "epoch": 0.29384, + "grad_norm": 2.7295522689819336, + "learning_rate": 1.779687804952168e-05, + "loss": 0.2511, + "step": 14692 + }, + { + "epoch": 0.29388, + "grad_norm": 1.1252068281173706, + "learning_rate": 1.7796003677433555e-05, + "loss": 0.2286, + "step": 14694 + }, + { + "epoch": 0.29392, + "grad_norm": 0.4142070710659027, + "learning_rate": 1.779512915335832e-05, + "loss": 0.1623, + "step": 14696 + }, + { + "epoch": 0.29396, + "grad_norm": 5.643939018249512, + "learning_rate": 1.7794254477313035e-05, + "loss": 0.2896, + "step": 14698 + }, + { + "epoch": 0.294, + "grad_norm": 0.7820349335670471, + "learning_rate": 1.7793379649314743e-05, + "loss": 0.0636, + "step": 14700 + }, + { + "epoch": 0.29404, + "grad_norm": 1.7795664072036743, + "learning_rate": 1.7792504669380503e-05, + "loss": 0.1488, + "step": 14702 + }, + { + "epoch": 0.29408, + "grad_norm": 0.9914916157722473, + "learning_rate": 1.7791629537527373e-05, + "loss": 0.2775, + "step": 14704 + }, + { + "epoch": 0.29412, + "grad_norm": 3.014572858810425, + "learning_rate": 1.7790754253772413e-05, + "loss": 0.5229, + "step": 14706 + }, + { + "epoch": 0.29416, + "grad_norm": 0.13502268493175507, + "learning_rate": 1.7789878818132688e-05, + "loss": 0.0761, + "step": 14708 + }, + { + "epoch": 0.2942, + "grad_norm": 3.3152122497558594, + "learning_rate": 1.7789003230625266e-05, + "loss": 0.177, + "step": 14710 + }, + { + "epoch": 0.29424, + "grad_norm": 5.369414806365967, + "learning_rate": 1.7788127491267214e-05, + "loss": 0.5927, + "step": 14712 + }, + { + "epoch": 0.29428, + "grad_norm": 3.487069845199585, + "learning_rate": 1.7787251600075607e-05, + "loss": 0.2429, + "step": 14714 + }, + { + "epoch": 0.29432, + "grad_norm": 3.8428220748901367, + "learning_rate": 1.7786375557067526e-05, + "loss": 0.2786, + "step": 14716 + }, + { + "epoch": 0.29436, + "grad_norm": 3.51001238822937, + "learning_rate": 1.778549936226004e-05, + "loss": 0.2772, + "step": 14718 + }, + { + "epoch": 0.2944, + "grad_norm": 4.255728721618652, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.3483, + "step": 14720 + }, + { + "epoch": 0.29444, + "grad_norm": 2.047574043273926, + "learning_rate": 1.7783746517315198e-05, + "loss": 0.1214, + "step": 14722 + }, + { + "epoch": 0.29448, + "grad_norm": 1.5482364892959595, + "learning_rate": 1.7782869867212017e-05, + "loss": 0.1513, + "step": 14724 + }, + { + "epoch": 0.29452, + "grad_norm": 2.1496729850769043, + "learning_rate": 1.7781993065377778e-05, + "loss": 0.1804, + "step": 14726 + }, + { + "epoch": 0.29456, + "grad_norm": 0.5156332850456238, + "learning_rate": 1.778111611182958e-05, + "loss": 0.1669, + "step": 14728 + }, + { + "epoch": 0.2946, + "grad_norm": 0.315357506275177, + "learning_rate": 1.7780239006584515e-05, + "loss": 0.1877, + "step": 14730 + }, + { + "epoch": 0.29464, + "grad_norm": 2.6156222820281982, + "learning_rate": 1.777936174965969e-05, + "loss": 0.1688, + "step": 14732 + }, + { + "epoch": 0.29468, + "grad_norm": 2.8259360790252686, + "learning_rate": 1.7778484341072198e-05, + "loss": 0.0909, + "step": 14734 + }, + { + "epoch": 0.29472, + "grad_norm": 1.3566676378250122, + "learning_rate": 1.7777606780839152e-05, + "loss": 0.1297, + "step": 14736 + }, + { + "epoch": 0.29476, + "grad_norm": 0.28343456983566284, + "learning_rate": 1.7776729068977655e-05, + "loss": 0.0406, + "step": 14738 + }, + { + "epoch": 0.2948, + "grad_norm": 3.2818140983581543, + "learning_rate": 1.7775851205504823e-05, + "loss": 0.2061, + "step": 14740 + }, + { + "epoch": 0.29484, + "grad_norm": 0.44789421558380127, + "learning_rate": 1.7774973190437768e-05, + "loss": 0.0346, + "step": 14742 + }, + { + "epoch": 0.29488, + "grad_norm": 4.796633243560791, + "learning_rate": 1.7774095023793606e-05, + "loss": 0.3252, + "step": 14744 + }, + { + "epoch": 0.29492, + "grad_norm": 0.5193742513656616, + "learning_rate": 1.777321670558946e-05, + "loss": 0.0751, + "step": 14746 + }, + { + "epoch": 0.29496, + "grad_norm": 0.7356694936752319, + "learning_rate": 1.7772338235842453e-05, + "loss": 0.0795, + "step": 14748 + }, + { + "epoch": 0.295, + "grad_norm": 5.52099084854126, + "learning_rate": 1.777145961456971e-05, + "loss": 0.4212, + "step": 14750 + }, + { + "epoch": 0.29504, + "grad_norm": 1.3164730072021484, + "learning_rate": 1.7770580841788364e-05, + "loss": 0.0471, + "step": 14752 + }, + { + "epoch": 0.29508, + "grad_norm": 2.00929594039917, + "learning_rate": 1.7769701917515538e-05, + "loss": 0.1876, + "step": 14754 + }, + { + "epoch": 0.29512, + "grad_norm": 3.1139559745788574, + "learning_rate": 1.776882284176838e-05, + "loss": 0.1963, + "step": 14756 + }, + { + "epoch": 0.29516, + "grad_norm": 0.8052374124526978, + "learning_rate": 1.7767943614564014e-05, + "loss": 0.0384, + "step": 14758 + }, + { + "epoch": 0.2952, + "grad_norm": 2.4413719177246094, + "learning_rate": 1.7767064235919594e-05, + "loss": 0.3219, + "step": 14760 + }, + { + "epoch": 0.29524, + "grad_norm": 1.687934160232544, + "learning_rate": 1.7766184705852253e-05, + "loss": 0.0628, + "step": 14762 + }, + { + "epoch": 0.29528, + "grad_norm": 0.08128426969051361, + "learning_rate": 1.7765305024379148e-05, + "loss": 0.0745, + "step": 14764 + }, + { + "epoch": 0.29532, + "grad_norm": 0.6243935227394104, + "learning_rate": 1.776442519151742e-05, + "loss": 0.1249, + "step": 14766 + }, + { + "epoch": 0.29536, + "grad_norm": 1.289546251296997, + "learning_rate": 1.7763545207284226e-05, + "loss": 0.1006, + "step": 14768 + }, + { + "epoch": 0.2954, + "grad_norm": 0.476906955242157, + "learning_rate": 1.776266507169672e-05, + "loss": 0.0318, + "step": 14770 + }, + { + "epoch": 0.29544, + "grad_norm": 2.7477152347564697, + "learning_rate": 1.7761784784772064e-05, + "loss": 0.161, + "step": 14772 + }, + { + "epoch": 0.29548, + "grad_norm": 0.8336180448532104, + "learning_rate": 1.7760904346527415e-05, + "loss": 0.1301, + "step": 14774 + }, + { + "epoch": 0.29552, + "grad_norm": 0.3724995255470276, + "learning_rate": 1.7760023756979944e-05, + "loss": 0.0753, + "step": 14776 + }, + { + "epoch": 0.29556, + "grad_norm": 0.4858705997467041, + "learning_rate": 1.7759143016146815e-05, + "loss": 0.0285, + "step": 14778 + }, + { + "epoch": 0.2956, + "grad_norm": 2.093740701675415, + "learning_rate": 1.7758262124045195e-05, + "loss": 0.2551, + "step": 14780 + }, + { + "epoch": 0.29564, + "grad_norm": 2.2212371826171875, + "learning_rate": 1.7757381080692264e-05, + "loss": 0.1033, + "step": 14782 + }, + { + "epoch": 0.29568, + "grad_norm": 3.897878885269165, + "learning_rate": 1.775649988610519e-05, + "loss": 0.4243, + "step": 14784 + }, + { + "epoch": 0.29572, + "grad_norm": 0.548601508140564, + "learning_rate": 1.7755618540301164e-05, + "loss": 0.0208, + "step": 14786 + }, + { + "epoch": 0.29576, + "grad_norm": 3.176617383956909, + "learning_rate": 1.7754737043297355e-05, + "loss": 0.1707, + "step": 14788 + }, + { + "epoch": 0.2958, + "grad_norm": 4.81865119934082, + "learning_rate": 1.775385539511096e-05, + "loss": 0.2172, + "step": 14790 + }, + { + "epoch": 0.29584, + "grad_norm": 1.7068867683410645, + "learning_rate": 1.775297359575916e-05, + "loss": 0.1705, + "step": 14792 + }, + { + "epoch": 0.29588, + "grad_norm": 0.33588749170303345, + "learning_rate": 1.7752091645259146e-05, + "loss": 0.0606, + "step": 14794 + }, + { + "epoch": 0.29592, + "grad_norm": 0.5670729279518127, + "learning_rate": 1.775120954362812e-05, + "loss": 0.1006, + "step": 14796 + }, + { + "epoch": 0.29596, + "grad_norm": 0.15792778134346008, + "learning_rate": 1.7750327290883266e-05, + "loss": 0.0149, + "step": 14798 + }, + { + "epoch": 0.296, + "grad_norm": 1.571219563484192, + "learning_rate": 1.7749444887041797e-05, + "loss": 0.1261, + "step": 14800 + }, + { + "epoch": 0.29604, + "grad_norm": 3.0025057792663574, + "learning_rate": 1.7748562332120908e-05, + "loss": 0.2324, + "step": 14802 + }, + { + "epoch": 0.29608, + "grad_norm": 0.6767044067382812, + "learning_rate": 1.774767962613781e-05, + "loss": 0.0692, + "step": 14804 + }, + { + "epoch": 0.29612, + "grad_norm": 0.10636643320322037, + "learning_rate": 1.7746796769109704e-05, + "loss": 0.0164, + "step": 14806 + }, + { + "epoch": 0.29616, + "grad_norm": 1.508853554725647, + "learning_rate": 1.7745913761053812e-05, + "loss": 0.0729, + "step": 14808 + }, + { + "epoch": 0.2962, + "grad_norm": 0.1719498187303543, + "learning_rate": 1.7745030601987338e-05, + "loss": 0.02, + "step": 14810 + }, + { + "epoch": 0.29624, + "grad_norm": 0.568817138671875, + "learning_rate": 1.774414729192751e-05, + "loss": 0.0867, + "step": 14812 + }, + { + "epoch": 0.29628, + "grad_norm": 1.9188456535339355, + "learning_rate": 1.774326383089154e-05, + "loss": 0.1231, + "step": 14814 + }, + { + "epoch": 0.29632, + "grad_norm": 2.1048197746276855, + "learning_rate": 1.7742380218896658e-05, + "loss": 0.1132, + "step": 14816 + }, + { + "epoch": 0.29636, + "grad_norm": 1.5585366487503052, + "learning_rate": 1.7741496455960087e-05, + "loss": 0.0715, + "step": 14818 + }, + { + "epoch": 0.2964, + "grad_norm": 0.3387696444988251, + "learning_rate": 1.7740612542099054e-05, + "loss": 0.0135, + "step": 14820 + }, + { + "epoch": 0.29644, + "grad_norm": 0.2875639498233795, + "learning_rate": 1.7739728477330796e-05, + "loss": 0.0304, + "step": 14822 + }, + { + "epoch": 0.29648, + "grad_norm": 2.609077215194702, + "learning_rate": 1.7738844261672547e-05, + "loss": 0.2343, + "step": 14824 + }, + { + "epoch": 0.29652, + "grad_norm": 4.727116107940674, + "learning_rate": 1.7737959895141545e-05, + "loss": 0.3353, + "step": 14826 + }, + { + "epoch": 0.29656, + "grad_norm": 1.1076093912124634, + "learning_rate": 1.7737075377755032e-05, + "loss": 0.1504, + "step": 14828 + }, + { + "epoch": 0.2966, + "grad_norm": 0.4751300811767578, + "learning_rate": 1.773619070953025e-05, + "loss": 0.1655, + "step": 14830 + }, + { + "epoch": 0.29664, + "grad_norm": 5.897354602813721, + "learning_rate": 1.773530589048445e-05, + "loss": 0.1719, + "step": 14832 + }, + { + "epoch": 0.29668, + "grad_norm": 1.1190518140792847, + "learning_rate": 1.7734420920634875e-05, + "loss": 0.1595, + "step": 14834 + }, + { + "epoch": 0.29672, + "grad_norm": 3.3378970623016357, + "learning_rate": 1.773353579999878e-05, + "loss": 0.1926, + "step": 14836 + }, + { + "epoch": 0.29676, + "grad_norm": 0.8075652122497559, + "learning_rate": 1.7732650528593432e-05, + "loss": 0.1006, + "step": 14838 + }, + { + "epoch": 0.2968, + "grad_norm": 5.903055191040039, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.287, + "step": 14840 + }, + { + "epoch": 0.29684, + "grad_norm": 0.024300891906023026, + "learning_rate": 1.7730879533543976e-05, + "loss": 0.0098, + "step": 14842 + }, + { + "epoch": 0.29688, + "grad_norm": 0.033405888825654984, + "learning_rate": 1.7729993809934404e-05, + "loss": 0.0112, + "step": 14844 + }, + { + "epoch": 0.29692, + "grad_norm": 4.867193698883057, + "learning_rate": 1.772910793562462e-05, + "loss": 0.1211, + "step": 14846 + }, + { + "epoch": 0.29696, + "grad_norm": 1.5241156816482544, + "learning_rate": 1.77282219106319e-05, + "loss": 0.0444, + "step": 14848 + }, + { + "epoch": 0.297, + "grad_norm": 0.006660448852926493, + "learning_rate": 1.7727335734973512e-05, + "loss": 0.0032, + "step": 14850 + }, + { + "epoch": 0.29704, + "grad_norm": 0.1913507878780365, + "learning_rate": 1.7726449408666737e-05, + "loss": 0.0622, + "step": 14852 + }, + { + "epoch": 0.29708, + "grad_norm": 0.5121814012527466, + "learning_rate": 1.7725562931728854e-05, + "loss": 0.0189, + "step": 14854 + }, + { + "epoch": 0.29712, + "grad_norm": 1.3913726806640625, + "learning_rate": 1.7724676304177145e-05, + "loss": 0.1117, + "step": 14856 + }, + { + "epoch": 0.29716, + "grad_norm": 4.943055629730225, + "learning_rate": 1.7723789526028888e-05, + "loss": 0.2514, + "step": 14858 + }, + { + "epoch": 0.2972, + "grad_norm": 0.40373730659484863, + "learning_rate": 1.7722902597301385e-05, + "loss": 0.124, + "step": 14860 + }, + { + "epoch": 0.29724, + "grad_norm": 4.4397196769714355, + "learning_rate": 1.7722015518011918e-05, + "loss": 0.2684, + "step": 14862 + }, + { + "epoch": 0.29728, + "grad_norm": 2.7939820289611816, + "learning_rate": 1.7721128288177782e-05, + "loss": 0.1252, + "step": 14864 + }, + { + "epoch": 0.29732, + "grad_norm": 4.37222146987915, + "learning_rate": 1.7720240907816275e-05, + "loss": 0.2182, + "step": 14866 + }, + { + "epoch": 0.29736, + "grad_norm": 0.769174337387085, + "learning_rate": 1.7719353376944698e-05, + "loss": 0.1631, + "step": 14868 + }, + { + "epoch": 0.2974, + "grad_norm": 2.1914570331573486, + "learning_rate": 1.771846569558035e-05, + "loss": 0.1143, + "step": 14870 + }, + { + "epoch": 0.29744, + "grad_norm": 2.973811388015747, + "learning_rate": 1.771757786374054e-05, + "loss": 0.0686, + "step": 14872 + }, + { + "epoch": 0.29748, + "grad_norm": 1.8037440776824951, + "learning_rate": 1.7716689881442583e-05, + "loss": 0.0588, + "step": 14874 + }, + { + "epoch": 0.29752, + "grad_norm": 0.86244797706604, + "learning_rate": 1.7715801748703776e-05, + "loss": 0.1895, + "step": 14876 + }, + { + "epoch": 0.29756, + "grad_norm": 0.3415195047855377, + "learning_rate": 1.7714913465541445e-05, + "loss": 0.0113, + "step": 14878 + }, + { + "epoch": 0.2976, + "grad_norm": 0.25101372599601746, + "learning_rate": 1.7714025031972904e-05, + "loss": 0.0071, + "step": 14880 + }, + { + "epoch": 0.29764, + "grad_norm": 1.745338797569275, + "learning_rate": 1.7713136448015472e-05, + "loss": 0.0434, + "step": 14882 + }, + { + "epoch": 0.29768, + "grad_norm": 0.8529555797576904, + "learning_rate": 1.7712247713686473e-05, + "loss": 0.098, + "step": 14884 + }, + { + "epoch": 0.29772, + "grad_norm": 6.252440452575684, + "learning_rate": 1.7711358829003237e-05, + "loss": 0.1998, + "step": 14886 + }, + { + "epoch": 0.29776, + "grad_norm": 0.027358369901776314, + "learning_rate": 1.771046979398309e-05, + "loss": 0.0798, + "step": 14888 + }, + { + "epoch": 0.2978, + "grad_norm": 12.595635414123535, + "learning_rate": 1.7709580608643364e-05, + "loss": 0.6731, + "step": 14890 + }, + { + "epoch": 0.29784, + "grad_norm": 9.765357971191406, + "learning_rate": 1.7708691273001396e-05, + "loss": 0.2673, + "step": 14892 + }, + { + "epoch": 0.29788, + "grad_norm": 8.98580551147461, + "learning_rate": 1.770780178707452e-05, + "loss": 0.3803, + "step": 14894 + }, + { + "epoch": 0.29792, + "grad_norm": 6.602435111999512, + "learning_rate": 1.7706912150880083e-05, + "loss": 0.3072, + "step": 14896 + }, + { + "epoch": 0.29796, + "grad_norm": 7.224418640136719, + "learning_rate": 1.7706022364435424e-05, + "loss": 0.4398, + "step": 14898 + }, + { + "epoch": 0.298, + "grad_norm": 0.4055699408054352, + "learning_rate": 1.7705132427757895e-05, + "loss": 0.0111, + "step": 14900 + }, + { + "epoch": 0.29804, + "grad_norm": 0.6209208369255066, + "learning_rate": 1.770424234086484e-05, + "loss": 0.0252, + "step": 14902 + }, + { + "epoch": 0.29808, + "grad_norm": 0.24258652329444885, + "learning_rate": 1.770335210377361e-05, + "loss": 0.0142, + "step": 14904 + }, + { + "epoch": 0.29812, + "grad_norm": 0.49224144220352173, + "learning_rate": 1.770246171650157e-05, + "loss": 0.0169, + "step": 14906 + }, + { + "epoch": 0.29816, + "grad_norm": 3.492278814315796, + "learning_rate": 1.7701571179066076e-05, + "loss": 0.1418, + "step": 14908 + }, + { + "epoch": 0.2982, + "grad_norm": 3.4173991680145264, + "learning_rate": 1.770068049148448e-05, + "loss": 0.1208, + "step": 14910 + }, + { + "epoch": 0.29824, + "grad_norm": 2.1459619998931885, + "learning_rate": 1.7699789653774157e-05, + "loss": 0.051, + "step": 14912 + }, + { + "epoch": 0.29828, + "grad_norm": 0.8315721154212952, + "learning_rate": 1.769889866595247e-05, + "loss": 0.2911, + "step": 14914 + }, + { + "epoch": 0.29832, + "grad_norm": 1.8178069591522217, + "learning_rate": 1.769800752803679e-05, + "loss": 0.0714, + "step": 14916 + }, + { + "epoch": 0.29836, + "grad_norm": 2.061394453048706, + "learning_rate": 1.7697116240044493e-05, + "loss": 0.1534, + "step": 14918 + }, + { + "epoch": 0.2984, + "grad_norm": 4.088727951049805, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.2816, + "step": 14920 + }, + { + "epoch": 0.29844, + "grad_norm": 2.5863962173461914, + "learning_rate": 1.7695333213899536e-05, + "loss": 0.1214, + "step": 14922 + }, + { + "epoch": 0.29848, + "grad_norm": 9.621322631835938, + "learning_rate": 1.7694441475781646e-05, + "loss": 0.6508, + "step": 14924 + }, + { + "epoch": 0.29852, + "grad_norm": 4.002285957336426, + "learning_rate": 1.7693549587656656e-05, + "loss": 0.2222, + "step": 14926 + }, + { + "epoch": 0.29856, + "grad_norm": 0.3769944906234741, + "learning_rate": 1.7692657549541955e-05, + "loss": 0.1359, + "step": 14928 + }, + { + "epoch": 0.2986, + "grad_norm": 2.1883819103240967, + "learning_rate": 1.769176536145494e-05, + "loss": 0.0885, + "step": 14930 + }, + { + "epoch": 0.29864, + "grad_norm": 0.40118536353111267, + "learning_rate": 1.7690873023412993e-05, + "loss": 0.021, + "step": 14932 + }, + { + "epoch": 0.29868, + "grad_norm": 3.4902429580688477, + "learning_rate": 1.768998053543352e-05, + "loss": 0.1817, + "step": 14934 + }, + { + "epoch": 0.29872, + "grad_norm": 0.04276531934738159, + "learning_rate": 1.7689087897533916e-05, + "loss": 0.0144, + "step": 14936 + }, + { + "epoch": 0.29876, + "grad_norm": 1.5018919706344604, + "learning_rate": 1.7688195109731584e-05, + "loss": 0.0535, + "step": 14938 + }, + { + "epoch": 0.2988, + "grad_norm": 1.8501821756362915, + "learning_rate": 1.7687302172043933e-05, + "loss": 0.1851, + "step": 14940 + }, + { + "epoch": 0.29884, + "grad_norm": 0.9290989637374878, + "learning_rate": 1.7686409084488367e-05, + "loss": 0.0347, + "step": 14942 + }, + { + "epoch": 0.29888, + "grad_norm": 5.364753723144531, + "learning_rate": 1.76855158470823e-05, + "loss": 0.3896, + "step": 14944 + }, + { + "epoch": 0.29892, + "grad_norm": 1.5774483680725098, + "learning_rate": 1.7684622459843142e-05, + "loss": 0.1007, + "step": 14946 + }, + { + "epoch": 0.29896, + "grad_norm": 0.4301636517047882, + "learning_rate": 1.7683728922788316e-05, + "loss": 0.0554, + "step": 14948 + }, + { + "epoch": 0.299, + "grad_norm": 3.112740993499756, + "learning_rate": 1.7682835235935236e-05, + "loss": 0.3802, + "step": 14950 + }, + { + "epoch": 0.29904, + "grad_norm": 1.4373043775558472, + "learning_rate": 1.768194139930133e-05, + "loss": 0.3908, + "step": 14952 + }, + { + "epoch": 0.29908, + "grad_norm": 1.7735404968261719, + "learning_rate": 1.768104741290402e-05, + "loss": 0.1317, + "step": 14954 + }, + { + "epoch": 0.29912, + "grad_norm": 0.19376151263713837, + "learning_rate": 1.7680153276760736e-05, + "loss": 0.0301, + "step": 14956 + }, + { + "epoch": 0.29916, + "grad_norm": 4.314919471740723, + "learning_rate": 1.767925899088891e-05, + "loss": 0.1565, + "step": 14958 + }, + { + "epoch": 0.2992, + "grad_norm": 0.5964663028717041, + "learning_rate": 1.767836455530598e-05, + "loss": 0.0215, + "step": 14960 + }, + { + "epoch": 0.29924, + "grad_norm": 0.22416628897190094, + "learning_rate": 1.767746997002937e-05, + "loss": 0.0505, + "step": 14962 + }, + { + "epoch": 0.29928, + "grad_norm": 0.6064656376838684, + "learning_rate": 1.767657523507654e-05, + "loss": 0.031, + "step": 14964 + }, + { + "epoch": 0.29932, + "grad_norm": 0.7713704705238342, + "learning_rate": 1.767568035046492e-05, + "loss": 0.0487, + "step": 14966 + }, + { + "epoch": 0.29936, + "grad_norm": 1.9709938764572144, + "learning_rate": 1.7674785316211963e-05, + "loss": 0.0904, + "step": 14968 + }, + { + "epoch": 0.2994, + "grad_norm": 4.28175687789917, + "learning_rate": 1.767389013233511e-05, + "loss": 0.3668, + "step": 14970 + }, + { + "epoch": 0.29944, + "grad_norm": 1.7420181035995483, + "learning_rate": 1.767299479885182e-05, + "loss": 0.08, + "step": 14972 + }, + { + "epoch": 0.29948, + "grad_norm": 0.7319266200065613, + "learning_rate": 1.7672099315779548e-05, + "loss": 0.0716, + "step": 14974 + }, + { + "epoch": 0.29952, + "grad_norm": 0.14818066358566284, + "learning_rate": 1.7671203683135747e-05, + "loss": 0.0189, + "step": 14976 + }, + { + "epoch": 0.29956, + "grad_norm": 1.0115325450897217, + "learning_rate": 1.7670307900937885e-05, + "loss": 0.0283, + "step": 14978 + }, + { + "epoch": 0.2996, + "grad_norm": 0.9340389966964722, + "learning_rate": 1.7669411969203417e-05, + "loss": 0.0725, + "step": 14980 + }, + { + "epoch": 0.29964, + "grad_norm": 1.8434480428695679, + "learning_rate": 1.766851588794982e-05, + "loss": 0.3777, + "step": 14982 + }, + { + "epoch": 0.29968, + "grad_norm": 1.5564606189727783, + "learning_rate": 1.766761965719455e-05, + "loss": 0.0555, + "step": 14984 + }, + { + "epoch": 0.29972, + "grad_norm": 4.539933204650879, + "learning_rate": 1.7666723276955092e-05, + "loss": 0.1176, + "step": 14986 + }, + { + "epoch": 0.29976, + "grad_norm": 5.102761268615723, + "learning_rate": 1.7665826747248916e-05, + "loss": 0.3028, + "step": 14988 + }, + { + "epoch": 0.2998, + "grad_norm": 1.3899976015090942, + "learning_rate": 1.76649300680935e-05, + "loss": 0.2613, + "step": 14990 + }, + { + "epoch": 0.29984, + "grad_norm": 2.124330520629883, + "learning_rate": 1.7664033239506327e-05, + "loss": 0.5406, + "step": 14992 + }, + { + "epoch": 0.29988, + "grad_norm": 4.812784671783447, + "learning_rate": 1.7663136261504877e-05, + "loss": 0.193, + "step": 14994 + }, + { + "epoch": 0.29992, + "grad_norm": 5.116690635681152, + "learning_rate": 1.7662239134106646e-05, + "loss": 0.2468, + "step": 14996 + }, + { + "epoch": 0.29996, + "grad_norm": 3.600754976272583, + "learning_rate": 1.7661341857329116e-05, + "loss": 0.1382, + "step": 14998 + }, + { + "epoch": 0.3, + "grad_norm": 2.8524415493011475, + "learning_rate": 1.766044443118978e-05, + "loss": 0.1753, + "step": 15000 + }, + { + "epoch": 0.30004, + "grad_norm": 1.0970051288604736, + "learning_rate": 1.765954685570614e-05, + "loss": 0.1738, + "step": 15002 + }, + { + "epoch": 0.30008, + "grad_norm": 9.012948989868164, + "learning_rate": 1.765864913089569e-05, + "loss": 1.0376, + "step": 15004 + }, + { + "epoch": 0.30012, + "grad_norm": 0.1060853973031044, + "learning_rate": 1.765775125677593e-05, + "loss": 0.3037, + "step": 15006 + }, + { + "epoch": 0.30016, + "grad_norm": 1.0257296562194824, + "learning_rate": 1.765685323336437e-05, + "loss": 0.0809, + "step": 15008 + }, + { + "epoch": 0.3002, + "grad_norm": 3.039320230484009, + "learning_rate": 1.7655955060678508e-05, + "loss": 0.2036, + "step": 15010 + }, + { + "epoch": 0.30024, + "grad_norm": 2.50639271736145, + "learning_rate": 1.7655056738735864e-05, + "loss": 0.0686, + "step": 15012 + }, + { + "epoch": 0.30028, + "grad_norm": 0.3814341723918915, + "learning_rate": 1.765415826755395e-05, + "loss": 0.0813, + "step": 15014 + }, + { + "epoch": 0.30032, + "grad_norm": 6.828376293182373, + "learning_rate": 1.7653259647150275e-05, + "loss": 0.7395, + "step": 15016 + }, + { + "epoch": 0.30036, + "grad_norm": 5.075229167938232, + "learning_rate": 1.7652360877542367e-05, + "loss": 0.2912, + "step": 15018 + }, + { + "epoch": 0.3004, + "grad_norm": 0.2933032214641571, + "learning_rate": 1.7651461958747745e-05, + "loss": 0.1057, + "step": 15020 + }, + { + "epoch": 0.30044, + "grad_norm": 1.0155243873596191, + "learning_rate": 1.765056289078393e-05, + "loss": 0.0676, + "step": 15022 + }, + { + "epoch": 0.30048, + "grad_norm": 2.2933127880096436, + "learning_rate": 1.7649663673668454e-05, + "loss": 0.1132, + "step": 15024 + }, + { + "epoch": 0.30052, + "grad_norm": 1.0529201030731201, + "learning_rate": 1.7648764307418846e-05, + "loss": 0.0563, + "step": 15026 + }, + { + "epoch": 0.30056, + "grad_norm": 0.6429734826087952, + "learning_rate": 1.764786479205264e-05, + "loss": 0.0278, + "step": 15028 + }, + { + "epoch": 0.3006, + "grad_norm": 2.1369378566741943, + "learning_rate": 1.7646965127587373e-05, + "loss": 0.212, + "step": 15030 + }, + { + "epoch": 0.30064, + "grad_norm": 0.7891219258308411, + "learning_rate": 1.764606531404058e-05, + "loss": 0.0384, + "step": 15032 + }, + { + "epoch": 0.30068, + "grad_norm": 0.4015871584415436, + "learning_rate": 1.7645165351429812e-05, + "loss": 0.0319, + "step": 15034 + }, + { + "epoch": 0.30072, + "grad_norm": 3.223700523376465, + "learning_rate": 1.7644265239772605e-05, + "loss": 0.1416, + "step": 15036 + }, + { + "epoch": 0.30076, + "grad_norm": 1.242086410522461, + "learning_rate": 1.7643364979086518e-05, + "loss": 0.3711, + "step": 15038 + }, + { + "epoch": 0.3008, + "grad_norm": 0.9759232997894287, + "learning_rate": 1.764246456938909e-05, + "loss": 0.0985, + "step": 15040 + }, + { + "epoch": 0.30084, + "grad_norm": 0.893886148929596, + "learning_rate": 1.7641564010697886e-05, + "loss": 0.2431, + "step": 15042 + }, + { + "epoch": 0.30088, + "grad_norm": 2.8658993244171143, + "learning_rate": 1.7640663303030452e-05, + "loss": 0.1301, + "step": 15044 + }, + { + "epoch": 0.30092, + "grad_norm": 3.5477747917175293, + "learning_rate": 1.7639762446404357e-05, + "loss": 0.2436, + "step": 15046 + }, + { + "epoch": 0.30096, + "grad_norm": 3.76532244682312, + "learning_rate": 1.7638861440837157e-05, + "loss": 0.264, + "step": 15048 + }, + { + "epoch": 0.301, + "grad_norm": 2.5068161487579346, + "learning_rate": 1.7637960286346423e-05, + "loss": 0.1496, + "step": 15050 + }, + { + "epoch": 0.30104, + "grad_norm": 1.5976431369781494, + "learning_rate": 1.763705898294972e-05, + "loss": 0.1608, + "step": 15052 + }, + { + "epoch": 0.30108, + "grad_norm": 0.6078852415084839, + "learning_rate": 1.7636157530664618e-05, + "loss": 0.0241, + "step": 15054 + }, + { + "epoch": 0.30112, + "grad_norm": 0.17108844220638275, + "learning_rate": 1.76352559295087e-05, + "loss": 0.0326, + "step": 15056 + }, + { + "epoch": 0.30116, + "grad_norm": 0.6601122617721558, + "learning_rate": 1.7634354179499532e-05, + "loss": 0.0738, + "step": 15058 + }, + { + "epoch": 0.3012, + "grad_norm": 3.3584773540496826, + "learning_rate": 1.76334522806547e-05, + "loss": 0.185, + "step": 15060 + }, + { + "epoch": 0.30124, + "grad_norm": 0.23604339361190796, + "learning_rate": 1.7632550232991782e-05, + "loss": 0.0223, + "step": 15062 + }, + { + "epoch": 0.30128, + "grad_norm": 0.9141828417778015, + "learning_rate": 1.7631648036528374e-05, + "loss": 0.0459, + "step": 15064 + }, + { + "epoch": 0.30132, + "grad_norm": 2.798262357711792, + "learning_rate": 1.7630745691282054e-05, + "loss": 0.0859, + "step": 15066 + }, + { + "epoch": 0.30136, + "grad_norm": 3.7508139610290527, + "learning_rate": 1.762984319727042e-05, + "loss": 0.1667, + "step": 15068 + }, + { + "epoch": 0.3014, + "grad_norm": 5.661866664886475, + "learning_rate": 1.7628940554511064e-05, + "loss": 0.257, + "step": 15070 + }, + { + "epoch": 0.30144, + "grad_norm": 1.7489049434661865, + "learning_rate": 1.7628037763021586e-05, + "loss": 0.0824, + "step": 15072 + }, + { + "epoch": 0.30148, + "grad_norm": 1.1636838912963867, + "learning_rate": 1.762713482281958e-05, + "loss": 0.0901, + "step": 15074 + }, + { + "epoch": 0.30152, + "grad_norm": 0.16807523369789124, + "learning_rate": 1.762623173392266e-05, + "loss": 0.0144, + "step": 15076 + }, + { + "epoch": 0.30156, + "grad_norm": 3.6525561809539795, + "learning_rate": 1.762532849634842e-05, + "loss": 0.1704, + "step": 15078 + }, + { + "epoch": 0.3016, + "grad_norm": 0.2456052005290985, + "learning_rate": 1.762442511011448e-05, + "loss": 0.0718, + "step": 15080 + }, + { + "epoch": 0.30164, + "grad_norm": 2.3706419467926025, + "learning_rate": 1.7623521575238447e-05, + "loss": 0.1847, + "step": 15082 + }, + { + "epoch": 0.30168, + "grad_norm": 3.1749892234802246, + "learning_rate": 1.7622617891737933e-05, + "loss": 0.2216, + "step": 15084 + }, + { + "epoch": 0.30172, + "grad_norm": 3.3219761848449707, + "learning_rate": 1.762171405963056e-05, + "loss": 0.2282, + "step": 15086 + }, + { + "epoch": 0.30176, + "grad_norm": 4.4903459548950195, + "learning_rate": 1.7620810078933946e-05, + "loss": 0.2427, + "step": 15088 + }, + { + "epoch": 0.3018, + "grad_norm": 0.09083159267902374, + "learning_rate": 1.761990594966572e-05, + "loss": 0.108, + "step": 15090 + }, + { + "epoch": 0.30184, + "grad_norm": 2.66601300239563, + "learning_rate": 1.7619001671843503e-05, + "loss": 0.1175, + "step": 15092 + }, + { + "epoch": 0.30188, + "grad_norm": 5.032575607299805, + "learning_rate": 1.7618097245484926e-05, + "loss": 0.3062, + "step": 15094 + }, + { + "epoch": 0.30192, + "grad_norm": 3.7968380451202393, + "learning_rate": 1.761719267060762e-05, + "loss": 0.2252, + "step": 15096 + }, + { + "epoch": 0.30196, + "grad_norm": 4.951503753662109, + "learning_rate": 1.7616287947229224e-05, + "loss": 0.3946, + "step": 15098 + }, + { + "epoch": 0.302, + "grad_norm": 0.3226601779460907, + "learning_rate": 1.761538307536737e-05, + "loss": 0.0351, + "step": 15100 + }, + { + "epoch": 0.30204, + "grad_norm": 3.207599401473999, + "learning_rate": 1.7614478055039705e-05, + "loss": 0.1756, + "step": 15102 + }, + { + "epoch": 0.30208, + "grad_norm": 1.489466905593872, + "learning_rate": 1.7613572886263865e-05, + "loss": 0.0488, + "step": 15104 + }, + { + "epoch": 0.30212, + "grad_norm": 0.7482890486717224, + "learning_rate": 1.7612667569057508e-05, + "loss": 0.0272, + "step": 15106 + }, + { + "epoch": 0.30216, + "grad_norm": 0.5470091700553894, + "learning_rate": 1.7611762103438273e-05, + "loss": 0.0158, + "step": 15108 + }, + { + "epoch": 0.3022, + "grad_norm": 7.026060581207275, + "learning_rate": 1.761085648942382e-05, + "loss": 0.3179, + "step": 15110 + }, + { + "epoch": 0.30224, + "grad_norm": 2.572309970855713, + "learning_rate": 1.76099507270318e-05, + "loss": 0.1527, + "step": 15112 + }, + { + "epoch": 0.30228, + "grad_norm": 0.36539149284362793, + "learning_rate": 1.7609044816279872e-05, + "loss": 0.0252, + "step": 15114 + }, + { + "epoch": 0.30232, + "grad_norm": 5.921687126159668, + "learning_rate": 1.76081387571857e-05, + "loss": 0.294, + "step": 15116 + }, + { + "epoch": 0.30236, + "grad_norm": 1.841825008392334, + "learning_rate": 1.7607232549766945e-05, + "loss": 0.0677, + "step": 15118 + }, + { + "epoch": 0.3024, + "grad_norm": 5.552126884460449, + "learning_rate": 1.7606326194041274e-05, + "loss": 0.363, + "step": 15120 + }, + { + "epoch": 0.30244, + "grad_norm": 5.985955238342285, + "learning_rate": 1.760541969002636e-05, + "loss": 0.3682, + "step": 15122 + }, + { + "epoch": 0.30248, + "grad_norm": 0.3599695861339569, + "learning_rate": 1.760451303773987e-05, + "loss": 0.0231, + "step": 15124 + }, + { + "epoch": 0.30252, + "grad_norm": 4.065272331237793, + "learning_rate": 1.7603606237199486e-05, + "loss": 0.3422, + "step": 15126 + }, + { + "epoch": 0.30256, + "grad_norm": 2.627293825149536, + "learning_rate": 1.7602699288422885e-05, + "loss": 0.1594, + "step": 15128 + }, + { + "epoch": 0.3026, + "grad_norm": 1.437003493309021, + "learning_rate": 1.760179219142774e-05, + "loss": 0.2774, + "step": 15130 + }, + { + "epoch": 0.30264, + "grad_norm": 0.25753462314605713, + "learning_rate": 1.7600884946231753e-05, + "loss": 0.0329, + "step": 15132 + }, + { + "epoch": 0.30268, + "grad_norm": 4.532603740692139, + "learning_rate": 1.7599977552852595e-05, + "loss": 0.1731, + "step": 15134 + }, + { + "epoch": 0.30272, + "grad_norm": 1.0158910751342773, + "learning_rate": 1.7599070011307965e-05, + "loss": 0.0471, + "step": 15136 + }, + { + "epoch": 0.30276, + "grad_norm": 3.089996576309204, + "learning_rate": 1.7598162321615547e-05, + "loss": 0.1435, + "step": 15138 + }, + { + "epoch": 0.3028, + "grad_norm": 0.9207138419151306, + "learning_rate": 1.759725448379305e-05, + "loss": 0.2537, + "step": 15140 + }, + { + "epoch": 0.30284, + "grad_norm": 7.451425552368164, + "learning_rate": 1.7596346497858162e-05, + "loss": 0.4015, + "step": 15142 + }, + { + "epoch": 0.30288, + "grad_norm": 0.37954428791999817, + "learning_rate": 1.7595438363828584e-05, + "loss": 0.1346, + "step": 15144 + }, + { + "epoch": 0.30292, + "grad_norm": 1.1332663297653198, + "learning_rate": 1.759453008172203e-05, + "loss": 0.1397, + "step": 15146 + }, + { + "epoch": 0.30296, + "grad_norm": 2.0181655883789062, + "learning_rate": 1.7593621651556203e-05, + "loss": 0.1109, + "step": 15148 + }, + { + "epoch": 0.303, + "grad_norm": 1.5753419399261475, + "learning_rate": 1.759271307334881e-05, + "loss": 0.1265, + "step": 15150 + }, + { + "epoch": 0.30304, + "grad_norm": 3.5345983505249023, + "learning_rate": 1.759180434711757e-05, + "loss": 0.1809, + "step": 15152 + }, + { + "epoch": 0.30308, + "grad_norm": 8.87281322479248, + "learning_rate": 1.759089547288019e-05, + "loss": 0.6254, + "step": 15154 + }, + { + "epoch": 0.30312, + "grad_norm": 1.1397298574447632, + "learning_rate": 1.7589986450654397e-05, + "loss": 0.1306, + "step": 15156 + }, + { + "epoch": 0.30316, + "grad_norm": 1.4813637733459473, + "learning_rate": 1.7589077280457912e-05, + "loss": 0.0765, + "step": 15158 + }, + { + "epoch": 0.3032, + "grad_norm": 0.824224591255188, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.0482, + "step": 15160 + }, + { + "epoch": 0.30324, + "grad_norm": 0.7987216711044312, + "learning_rate": 1.758725849622376e-05, + "loss": 0.0306, + "step": 15162 + }, + { + "epoch": 0.30328, + "grad_norm": 0.8684126138687134, + "learning_rate": 1.7586348882221555e-05, + "loss": 0.0297, + "step": 15164 + }, + { + "epoch": 0.30332, + "grad_norm": 0.5283447504043579, + "learning_rate": 1.7585439120319573e-05, + "loss": 0.1134, + "step": 15166 + }, + { + "epoch": 0.30336, + "grad_norm": 1.4128984212875366, + "learning_rate": 1.7584529210535545e-05, + "loss": 0.0648, + "step": 15168 + }, + { + "epoch": 0.3034, + "grad_norm": 2.346113681793213, + "learning_rate": 1.7583619152887222e-05, + "loss": 0.0835, + "step": 15170 + }, + { + "epoch": 0.30344, + "grad_norm": 0.38201430439949036, + "learning_rate": 1.7582708947392334e-05, + "loss": 0.0226, + "step": 15172 + }, + { + "epoch": 0.30348, + "grad_norm": 1.162042260169983, + "learning_rate": 1.7581798594068634e-05, + "loss": 0.0432, + "step": 15174 + }, + { + "epoch": 0.30352, + "grad_norm": 5.861164569854736, + "learning_rate": 1.7580888092933867e-05, + "loss": 0.4636, + "step": 15176 + }, + { + "epoch": 0.30356, + "grad_norm": 1.911660075187683, + "learning_rate": 1.7579977444005782e-05, + "loss": 0.1513, + "step": 15178 + }, + { + "epoch": 0.3036, + "grad_norm": 2.444329023361206, + "learning_rate": 1.7579066647302134e-05, + "loss": 0.2775, + "step": 15180 + }, + { + "epoch": 0.30364, + "grad_norm": 0.31808799505233765, + "learning_rate": 1.7578155702840683e-05, + "loss": 0.0731, + "step": 15182 + }, + { + "epoch": 0.30368, + "grad_norm": 0.166672483086586, + "learning_rate": 1.7577244610639183e-05, + "loss": 0.0704, + "step": 15184 + }, + { + "epoch": 0.30372, + "grad_norm": 0.37505054473876953, + "learning_rate": 1.75763333707154e-05, + "loss": 0.0361, + "step": 15186 + }, + { + "epoch": 0.30376, + "grad_norm": 0.4303554892539978, + "learning_rate": 1.7575421983087095e-05, + "loss": 0.0406, + "step": 15188 + }, + { + "epoch": 0.3038, + "grad_norm": 4.615163803100586, + "learning_rate": 1.757451044777204e-05, + "loss": 0.3077, + "step": 15190 + }, + { + "epoch": 0.30384, + "grad_norm": 0.4433780312538147, + "learning_rate": 1.7573598764788e-05, + "loss": 0.1475, + "step": 15192 + }, + { + "epoch": 0.30388, + "grad_norm": 1.86637544631958, + "learning_rate": 1.757268693415276e-05, + "loss": 0.111, + "step": 15194 + }, + { + "epoch": 0.30392, + "grad_norm": 0.00508333882316947, + "learning_rate": 1.7571774955884083e-05, + "loss": 0.0192, + "step": 15196 + }, + { + "epoch": 0.30396, + "grad_norm": 1.7693041563034058, + "learning_rate": 1.757086282999976e-05, + "loss": 0.2285, + "step": 15198 + }, + { + "epoch": 0.304, + "grad_norm": 0.4645844101905823, + "learning_rate": 1.7569950556517566e-05, + "loss": 0.0829, + "step": 15200 + }, + { + "epoch": 0.30404, + "grad_norm": 3.9631707668304443, + "learning_rate": 1.7569038135455288e-05, + "loss": 0.0967, + "step": 15202 + }, + { + "epoch": 0.30408, + "grad_norm": 1.8301076889038086, + "learning_rate": 1.756812556683072e-05, + "loss": 0.0622, + "step": 15204 + }, + { + "epoch": 0.30412, + "grad_norm": 0.3427114188671112, + "learning_rate": 1.756721285066164e-05, + "loss": 0.0137, + "step": 15206 + }, + { + "epoch": 0.30416, + "grad_norm": 0.023101026192307472, + "learning_rate": 1.7566299986965855e-05, + "loss": 0.0197, + "step": 15208 + }, + { + "epoch": 0.3042, + "grad_norm": 0.8487818241119385, + "learning_rate": 1.756538697576115e-05, + "loss": 0.0372, + "step": 15210 + }, + { + "epoch": 0.30424, + "grad_norm": 0.5828219056129456, + "learning_rate": 1.756447381706534e-05, + "loss": 0.0257, + "step": 15212 + }, + { + "epoch": 0.30428, + "grad_norm": 3.665497303009033, + "learning_rate": 1.7563560510896212e-05, + "loss": 0.266, + "step": 15214 + }, + { + "epoch": 0.30432, + "grad_norm": 0.5894708633422852, + "learning_rate": 1.756264705727158e-05, + "loss": 0.0207, + "step": 15216 + }, + { + "epoch": 0.30436, + "grad_norm": 0.16233226656913757, + "learning_rate": 1.756173345620925e-05, + "loss": 0.0623, + "step": 15218 + }, + { + "epoch": 0.3044, + "grad_norm": 1.480055332183838, + "learning_rate": 1.7560819707727034e-05, + "loss": 0.0588, + "step": 15220 + }, + { + "epoch": 0.30444, + "grad_norm": 0.4860997498035431, + "learning_rate": 1.7559905811842745e-05, + "loss": 0.2786, + "step": 15222 + }, + { + "epoch": 0.30448, + "grad_norm": 0.2356310784816742, + "learning_rate": 1.7558991768574197e-05, + "loss": 0.0168, + "step": 15224 + }, + { + "epoch": 0.30452, + "grad_norm": 0.4412866532802582, + "learning_rate": 1.7558077577939214e-05, + "loss": 0.0565, + "step": 15226 + }, + { + "epoch": 0.30456, + "grad_norm": 0.224368616938591, + "learning_rate": 1.7557163239955622e-05, + "loss": 0.0092, + "step": 15228 + }, + { + "epoch": 0.3046, + "grad_norm": 3.9912781715393066, + "learning_rate": 1.7556248754641237e-05, + "loss": 0.1548, + "step": 15230 + }, + { + "epoch": 0.30464, + "grad_norm": 1.7151639461517334, + "learning_rate": 1.7555334122013894e-05, + "loss": 0.0509, + "step": 15232 + }, + { + "epoch": 0.30468, + "grad_norm": 7.992063999176025, + "learning_rate": 1.755441934209142e-05, + "loss": 0.4565, + "step": 15234 + }, + { + "epoch": 0.30472, + "grad_norm": 0.2239702045917511, + "learning_rate": 1.7553504414891657e-05, + "loss": 0.0107, + "step": 15236 + }, + { + "epoch": 0.30476, + "grad_norm": 0.8992124795913696, + "learning_rate": 1.7552589340432433e-05, + "loss": 0.086, + "step": 15238 + }, + { + "epoch": 0.3048, + "grad_norm": 0.4417951703071594, + "learning_rate": 1.7551674118731592e-05, + "loss": 0.0641, + "step": 15240 + }, + { + "epoch": 0.30484, + "grad_norm": 0.5500771403312683, + "learning_rate": 1.7550758749806975e-05, + "loss": 0.0144, + "step": 15242 + }, + { + "epoch": 0.30488, + "grad_norm": 0.39483776688575745, + "learning_rate": 1.7549843233676434e-05, + "loss": 0.2578, + "step": 15244 + }, + { + "epoch": 0.30492, + "grad_norm": 7.623286247253418, + "learning_rate": 1.754892757035781e-05, + "loss": 0.2252, + "step": 15246 + }, + { + "epoch": 0.30496, + "grad_norm": 0.12410958856344223, + "learning_rate": 1.754801175986895e-05, + "loss": 0.0099, + "step": 15248 + }, + { + "epoch": 0.305, + "grad_norm": 0.1940814107656479, + "learning_rate": 1.7547095802227723e-05, + "loss": 0.0471, + "step": 15250 + }, + { + "epoch": 0.30504, + "grad_norm": 7.669508934020996, + "learning_rate": 1.754617969745197e-05, + "loss": 0.4509, + "step": 15252 + }, + { + "epoch": 0.30508, + "grad_norm": 0.3214776813983917, + "learning_rate": 1.7545263445559566e-05, + "loss": 0.5373, + "step": 15254 + }, + { + "epoch": 0.30512, + "grad_norm": 0.6871696710586548, + "learning_rate": 1.7544347046568363e-05, + "loss": 0.0155, + "step": 15256 + }, + { + "epoch": 0.30516, + "grad_norm": 0.29214778542518616, + "learning_rate": 1.754343050049623e-05, + "loss": 0.0874, + "step": 15258 + }, + { + "epoch": 0.3052, + "grad_norm": 0.4848942756652832, + "learning_rate": 1.754251380736104e-05, + "loss": 0.0185, + "step": 15260 + }, + { + "epoch": 0.30524, + "grad_norm": 4.589218616485596, + "learning_rate": 1.7541596967180655e-05, + "loss": 0.1603, + "step": 15262 + }, + { + "epoch": 0.30528, + "grad_norm": 1.3725391626358032, + "learning_rate": 1.7540679979972958e-05, + "loss": 0.0635, + "step": 15264 + }, + { + "epoch": 0.30532, + "grad_norm": 6.900383949279785, + "learning_rate": 1.7539762845755822e-05, + "loss": 0.32, + "step": 15266 + }, + { + "epoch": 0.30536, + "grad_norm": 0.44843146204948425, + "learning_rate": 1.7538845564547126e-05, + "loss": 0.5215, + "step": 15268 + }, + { + "epoch": 0.3054, + "grad_norm": 2.1940062046051025, + "learning_rate": 1.7537928136364756e-05, + "loss": 0.0865, + "step": 15270 + }, + { + "epoch": 0.30544, + "grad_norm": 0.398472398519516, + "learning_rate": 1.7537010561226595e-05, + "loss": 0.0182, + "step": 15272 + }, + { + "epoch": 0.30548, + "grad_norm": 0.376362681388855, + "learning_rate": 1.7536092839150534e-05, + "loss": 0.1498, + "step": 15274 + }, + { + "epoch": 0.30552, + "grad_norm": 3.3850951194763184, + "learning_rate": 1.753517497015446e-05, + "loss": 0.1504, + "step": 15276 + }, + { + "epoch": 0.30556, + "grad_norm": 0.7483506798744202, + "learning_rate": 1.7534256954256275e-05, + "loss": 0.3222, + "step": 15278 + }, + { + "epoch": 0.3056, + "grad_norm": 3.838348388671875, + "learning_rate": 1.7533338791473872e-05, + "loss": 0.1148, + "step": 15280 + }, + { + "epoch": 0.30564, + "grad_norm": 0.8372960090637207, + "learning_rate": 1.7532420481825147e-05, + "loss": 0.0612, + "step": 15282 + }, + { + "epoch": 0.30568, + "grad_norm": 0.5834248065948486, + "learning_rate": 1.7531502025328008e-05, + "loss": 0.0962, + "step": 15284 + }, + { + "epoch": 0.30572, + "grad_norm": 0.3754962086677551, + "learning_rate": 1.753058342200036e-05, + "loss": 0.108, + "step": 15286 + }, + { + "epoch": 0.30576, + "grad_norm": 4.245324611663818, + "learning_rate": 1.752966467186011e-05, + "loss": 0.3307, + "step": 15288 + }, + { + "epoch": 0.3058, + "grad_norm": 0.3741220235824585, + "learning_rate": 1.7528745774925175e-05, + "loss": 0.0687, + "step": 15290 + }, + { + "epoch": 0.30584, + "grad_norm": 0.22087222337722778, + "learning_rate": 1.752782673121346e-05, + "loss": 0.2556, + "step": 15292 + }, + { + "epoch": 0.30588, + "grad_norm": 2.743889808654785, + "learning_rate": 1.7526907540742888e-05, + "loss": 0.1055, + "step": 15294 + }, + { + "epoch": 0.30592, + "grad_norm": 1.6168007850646973, + "learning_rate": 1.752598820353138e-05, + "loss": 0.059, + "step": 15296 + }, + { + "epoch": 0.30596, + "grad_norm": 4.491397380828857, + "learning_rate": 1.752506871959686e-05, + "loss": 0.2536, + "step": 15298 + }, + { + "epoch": 0.306, + "grad_norm": 0.4475858807563782, + "learning_rate": 1.7524149088957244e-05, + "loss": 0.137, + "step": 15300 + }, + { + "epoch": 0.30604, + "grad_norm": 1.805694818496704, + "learning_rate": 1.7523229311630473e-05, + "loss": 0.1365, + "step": 15302 + }, + { + "epoch": 0.30608, + "grad_norm": 0.8247256278991699, + "learning_rate": 1.7522309387634472e-05, + "loss": 0.0773, + "step": 15304 + }, + { + "epoch": 0.30612, + "grad_norm": 0.5330941677093506, + "learning_rate": 1.7521389316987177e-05, + "loss": 0.0555, + "step": 15306 + }, + { + "epoch": 0.30616, + "grad_norm": 0.7326613068580627, + "learning_rate": 1.7520469099706526e-05, + "loss": 0.2695, + "step": 15308 + }, + { + "epoch": 0.3062, + "grad_norm": 0.6063528060913086, + "learning_rate": 1.7519548735810456e-05, + "loss": 0.0215, + "step": 15310 + }, + { + "epoch": 0.30624, + "grad_norm": 5.872349739074707, + "learning_rate": 1.7518628225316915e-05, + "loss": 0.2733, + "step": 15312 + }, + { + "epoch": 0.30628, + "grad_norm": 0.3473183214664459, + "learning_rate": 1.7517707568243843e-05, + "loss": 0.0197, + "step": 15314 + }, + { + "epoch": 0.30632, + "grad_norm": 3.294935941696167, + "learning_rate": 1.7516786764609194e-05, + "loss": 0.4421, + "step": 15316 + }, + { + "epoch": 0.30636, + "grad_norm": 1.9609888792037964, + "learning_rate": 1.7515865814430914e-05, + "loss": 0.079, + "step": 15318 + }, + { + "epoch": 0.3064, + "grad_norm": 0.6036831736564636, + "learning_rate": 1.7514944717726962e-05, + "loss": 0.0363, + "step": 15320 + }, + { + "epoch": 0.30644, + "grad_norm": 0.4730902910232544, + "learning_rate": 1.7514023474515292e-05, + "loss": 0.0144, + "step": 15322 + }, + { + "epoch": 0.30648, + "grad_norm": 0.6989727020263672, + "learning_rate": 1.7513102084813872e-05, + "loss": 0.0982, + "step": 15324 + }, + { + "epoch": 0.30652, + "grad_norm": 10.438264846801758, + "learning_rate": 1.751218054864065e-05, + "loss": 0.5911, + "step": 15326 + }, + { + "epoch": 0.30656, + "grad_norm": 2.2200767993927, + "learning_rate": 1.751125886601361e-05, + "loss": 0.2146, + "step": 15328 + }, + { + "epoch": 0.3066, + "grad_norm": 0.9700307846069336, + "learning_rate": 1.7510337036950703e-05, + "loss": 0.1506, + "step": 15330 + }, + { + "epoch": 0.30664, + "grad_norm": 0.12311700731515884, + "learning_rate": 1.7509415061469916e-05, + "loss": 0.0741, + "step": 15332 + }, + { + "epoch": 0.30668, + "grad_norm": 0.7921209335327148, + "learning_rate": 1.750849293958921e-05, + "loss": 0.0428, + "step": 15334 + }, + { + "epoch": 0.30672, + "grad_norm": 2.219238042831421, + "learning_rate": 1.7507570671326573e-05, + "loss": 0.093, + "step": 15336 + }, + { + "epoch": 0.30676, + "grad_norm": 0.1655678004026413, + "learning_rate": 1.750664825669998e-05, + "loss": 0.0264, + "step": 15338 + }, + { + "epoch": 0.3068, + "grad_norm": 0.4137740731239319, + "learning_rate": 1.7505725695727414e-05, + "loss": 0.0272, + "step": 15340 + }, + { + "epoch": 0.30684, + "grad_norm": 2.799170732498169, + "learning_rate": 1.750480298842686e-05, + "loss": 0.2557, + "step": 15342 + }, + { + "epoch": 0.30688, + "grad_norm": 0.4473487138748169, + "learning_rate": 1.750388013481631e-05, + "loss": 0.0455, + "step": 15344 + }, + { + "epoch": 0.30692, + "grad_norm": 3.255624532699585, + "learning_rate": 1.750295713491375e-05, + "loss": 0.3066, + "step": 15346 + }, + { + "epoch": 0.30696, + "grad_norm": 0.42788711190223694, + "learning_rate": 1.750203398873718e-05, + "loss": 0.2316, + "step": 15348 + }, + { + "epoch": 0.307, + "grad_norm": 2.884857416152954, + "learning_rate": 1.7501110696304598e-05, + "loss": 0.0993, + "step": 15350 + }, + { + "epoch": 0.30704, + "grad_norm": 0.5520044565200806, + "learning_rate": 1.7500187257634e-05, + "loss": 0.0321, + "step": 15352 + }, + { + "epoch": 0.30708, + "grad_norm": 0.7678419947624207, + "learning_rate": 1.7499263672743385e-05, + "loss": 0.0307, + "step": 15354 + }, + { + "epoch": 0.30712, + "grad_norm": 3.2743375301361084, + "learning_rate": 1.7498339941650768e-05, + "loss": 0.1017, + "step": 15356 + }, + { + "epoch": 0.30716, + "grad_norm": 0.04811051860451698, + "learning_rate": 1.749741606437415e-05, + "loss": 0.285, + "step": 15358 + }, + { + "epoch": 0.3072, + "grad_norm": 5.359227657318115, + "learning_rate": 1.749649204093155e-05, + "loss": 0.4146, + "step": 15360 + }, + { + "epoch": 0.30724, + "grad_norm": 6.767193794250488, + "learning_rate": 1.749556787134098e-05, + "loss": 0.5871, + "step": 15362 + }, + { + "epoch": 0.30728, + "grad_norm": 6.574188232421875, + "learning_rate": 1.749464355562045e-05, + "loss": 0.3267, + "step": 15364 + }, + { + "epoch": 0.30732, + "grad_norm": 3.828575611114502, + "learning_rate": 1.749371909378799e-05, + "loss": 0.4995, + "step": 15366 + }, + { + "epoch": 0.30736, + "grad_norm": 1.5161720514297485, + "learning_rate": 1.749279448586162e-05, + "loss": 0.0986, + "step": 15368 + }, + { + "epoch": 0.3074, + "grad_norm": 0.6164879202842712, + "learning_rate": 1.7491869731859353e-05, + "loss": 0.0713, + "step": 15370 + }, + { + "epoch": 0.30744, + "grad_norm": 0.3796132504940033, + "learning_rate": 1.749094483179924e-05, + "loss": 0.0222, + "step": 15372 + }, + { + "epoch": 0.30748, + "grad_norm": 0.9299970269203186, + "learning_rate": 1.7490019785699294e-05, + "loss": 0.1233, + "step": 15374 + }, + { + "epoch": 0.30752, + "grad_norm": 0.5708969235420227, + "learning_rate": 1.7489094593577557e-05, + "loss": 0.1818, + "step": 15376 + }, + { + "epoch": 0.30756, + "grad_norm": 1.2074493169784546, + "learning_rate": 1.7488169255452067e-05, + "loss": 0.2138, + "step": 15378 + }, + { + "epoch": 0.3076, + "grad_norm": 0.6409634351730347, + "learning_rate": 1.7487243771340862e-05, + "loss": 0.0362, + "step": 15380 + }, + { + "epoch": 0.30764, + "grad_norm": 2.3372786045074463, + "learning_rate": 1.7486318141261987e-05, + "loss": 0.1305, + "step": 15382 + }, + { + "epoch": 0.30768, + "grad_norm": 1.653885841369629, + "learning_rate": 1.7485392365233483e-05, + "loss": 0.1174, + "step": 15384 + }, + { + "epoch": 0.30772, + "grad_norm": 4.1761345863342285, + "learning_rate": 1.74844664432734e-05, + "loss": 0.1691, + "step": 15386 + }, + { + "epoch": 0.30776, + "grad_norm": 3.263448715209961, + "learning_rate": 1.7483540375399794e-05, + "loss": 0.4049, + "step": 15388 + }, + { + "epoch": 0.3078, + "grad_norm": 1.8893465995788574, + "learning_rate": 1.7482614161630714e-05, + "loss": 0.0573, + "step": 15390 + }, + { + "epoch": 0.30784, + "grad_norm": 3.1003127098083496, + "learning_rate": 1.748168780198422e-05, + "loss": 0.1349, + "step": 15392 + }, + { + "epoch": 0.30788, + "grad_norm": 4.354234218597412, + "learning_rate": 1.7480761296478364e-05, + "loss": 0.3141, + "step": 15394 + }, + { + "epoch": 0.30792, + "grad_norm": 3.076211452484131, + "learning_rate": 1.747983464513122e-05, + "loss": 0.3467, + "step": 15396 + }, + { + "epoch": 0.30796, + "grad_norm": 2.3402509689331055, + "learning_rate": 1.747890784796085e-05, + "loss": 0.1097, + "step": 15398 + }, + { + "epoch": 0.308, + "grad_norm": 3.353760242462158, + "learning_rate": 1.747798090498532e-05, + "loss": 0.1832, + "step": 15400 + }, + { + "epoch": 0.30804, + "grad_norm": 2.3331267833709717, + "learning_rate": 1.7477053816222704e-05, + "loss": 0.1096, + "step": 15402 + }, + { + "epoch": 0.30808, + "grad_norm": 3.1192269325256348, + "learning_rate": 1.7476126581691072e-05, + "loss": 0.1532, + "step": 15404 + }, + { + "epoch": 0.30812, + "grad_norm": 1.4034531116485596, + "learning_rate": 1.7475199201408503e-05, + "loss": 0.1583, + "step": 15406 + }, + { + "epoch": 0.30816, + "grad_norm": 5.487583637237549, + "learning_rate": 1.7474271675393078e-05, + "loss": 0.3245, + "step": 15408 + }, + { + "epoch": 0.3082, + "grad_norm": 1.3282278776168823, + "learning_rate": 1.7473344003662877e-05, + "loss": 0.2097, + "step": 15410 + }, + { + "epoch": 0.30824, + "grad_norm": 0.8985127806663513, + "learning_rate": 1.7472416186235988e-05, + "loss": 0.0542, + "step": 15412 + }, + { + "epoch": 0.30828, + "grad_norm": 1.0544627904891968, + "learning_rate": 1.74714882231305e-05, + "loss": 0.0541, + "step": 15414 + }, + { + "epoch": 0.30832, + "grad_norm": 1.7180418968200684, + "learning_rate": 1.7470560114364504e-05, + "loss": 0.1026, + "step": 15416 + }, + { + "epoch": 0.30836, + "grad_norm": 0.6630425453186035, + "learning_rate": 1.746963185995609e-05, + "loss": 0.0332, + "step": 15418 + }, + { + "epoch": 0.3084, + "grad_norm": 0.37573322653770447, + "learning_rate": 1.746870345992336e-05, + "loss": 0.2599, + "step": 15420 + }, + { + "epoch": 0.30844, + "grad_norm": 2.2252209186553955, + "learning_rate": 1.7467774914284403e-05, + "loss": 0.0962, + "step": 15422 + }, + { + "epoch": 0.30848, + "grad_norm": 3.2719783782958984, + "learning_rate": 1.7466846223057334e-05, + "loss": 0.196, + "step": 15424 + }, + { + "epoch": 0.30852, + "grad_norm": 2.817922592163086, + "learning_rate": 1.7465917386260256e-05, + "loss": 0.1581, + "step": 15426 + }, + { + "epoch": 0.30856, + "grad_norm": 0.921356737613678, + "learning_rate": 1.7464988403911273e-05, + "loss": 0.0739, + "step": 15428 + }, + { + "epoch": 0.3086, + "grad_norm": 1.5599448680877686, + "learning_rate": 1.7464059276028497e-05, + "loss": 0.1172, + "step": 15430 + }, + { + "epoch": 0.30864, + "grad_norm": 0.8024337291717529, + "learning_rate": 1.746313000263004e-05, + "loss": 0.0773, + "step": 15432 + }, + { + "epoch": 0.30868, + "grad_norm": 2.383070707321167, + "learning_rate": 1.7462200583734026e-05, + "loss": 0.3219, + "step": 15434 + }, + { + "epoch": 0.30872, + "grad_norm": 0.326882541179657, + "learning_rate": 1.746127101935857e-05, + "loss": 0.0183, + "step": 15436 + }, + { + "epoch": 0.30876, + "grad_norm": 4.504505157470703, + "learning_rate": 1.7460341309521792e-05, + "loss": 0.2066, + "step": 15438 + }, + { + "epoch": 0.3088, + "grad_norm": 0.9099228978157043, + "learning_rate": 1.7459411454241822e-05, + "loss": 0.0351, + "step": 15440 + }, + { + "epoch": 0.30884, + "grad_norm": 4.0239176750183105, + "learning_rate": 1.7458481453536785e-05, + "loss": 0.1504, + "step": 15442 + }, + { + "epoch": 0.30888, + "grad_norm": 2.63104248046875, + "learning_rate": 1.745755130742481e-05, + "loss": 0.1546, + "step": 15444 + }, + { + "epoch": 0.30892, + "grad_norm": 1.6247618198394775, + "learning_rate": 1.7456621015924032e-05, + "loss": 0.0893, + "step": 15446 + }, + { + "epoch": 0.30896, + "grad_norm": 1.2438127994537354, + "learning_rate": 1.7455690579052593e-05, + "loss": 0.0795, + "step": 15448 + }, + { + "epoch": 0.309, + "grad_norm": 2.5976104736328125, + "learning_rate": 1.7454759996828622e-05, + "loss": 0.0811, + "step": 15450 + }, + { + "epoch": 0.30904, + "grad_norm": 0.39666232466697693, + "learning_rate": 1.745382926927027e-05, + "loss": 0.0374, + "step": 15452 + }, + { + "epoch": 0.30908, + "grad_norm": 0.43394169211387634, + "learning_rate": 1.745289839639568e-05, + "loss": 0.0702, + "step": 15454 + }, + { + "epoch": 0.30912, + "grad_norm": 3.14559006690979, + "learning_rate": 1.7451967378222997e-05, + "loss": 0.1673, + "step": 15456 + }, + { + "epoch": 0.30916, + "grad_norm": 3.419085741043091, + "learning_rate": 1.7451036214770375e-05, + "loss": 0.1395, + "step": 15458 + }, + { + "epoch": 0.3092, + "grad_norm": 0.35848352313041687, + "learning_rate": 1.7450104906055963e-05, + "loss": 0.0383, + "step": 15460 + }, + { + "epoch": 0.30924, + "grad_norm": 1.749132752418518, + "learning_rate": 1.744917345209792e-05, + "loss": 0.0758, + "step": 15462 + }, + { + "epoch": 0.30928, + "grad_norm": 0.29592186212539673, + "learning_rate": 1.744824185291441e-05, + "loss": 0.0496, + "step": 15464 + }, + { + "epoch": 0.30932, + "grad_norm": 0.6094013452529907, + "learning_rate": 1.7447310108523585e-05, + "loss": 0.018, + "step": 15466 + }, + { + "epoch": 0.30936, + "grad_norm": 2.3420255184173584, + "learning_rate": 1.744637821894362e-05, + "loss": 0.4948, + "step": 15468 + }, + { + "epoch": 0.3094, + "grad_norm": 1.0034443140029907, + "learning_rate": 1.7445446184192674e-05, + "loss": 0.0482, + "step": 15470 + }, + { + "epoch": 0.30944, + "grad_norm": 5.8568525314331055, + "learning_rate": 1.7444514004288925e-05, + "loss": 0.4233, + "step": 15472 + }, + { + "epoch": 0.30948, + "grad_norm": 0.4641003906726837, + "learning_rate": 1.744358167925054e-05, + "loss": 0.1603, + "step": 15474 + }, + { + "epoch": 0.30952, + "grad_norm": 0.11647447198629379, + "learning_rate": 1.7442649209095703e-05, + "loss": 0.253, + "step": 15476 + }, + { + "epoch": 0.30956, + "grad_norm": 2.314030170440674, + "learning_rate": 1.744171659384258e-05, + "loss": 0.0886, + "step": 15478 + }, + { + "epoch": 0.3096, + "grad_norm": 1.419448733329773, + "learning_rate": 1.7440783833509366e-05, + "loss": 0.0893, + "step": 15480 + }, + { + "epoch": 0.30964, + "grad_norm": 0.08089319616556168, + "learning_rate": 1.743985092811424e-05, + "loss": 0.0054, + "step": 15482 + }, + { + "epoch": 0.30968, + "grad_norm": 0.14030279219150543, + "learning_rate": 1.743891787767539e-05, + "loss": 0.1931, + "step": 15484 + }, + { + "epoch": 0.30972, + "grad_norm": 3.0038864612579346, + "learning_rate": 1.7437984682211006e-05, + "loss": 0.2771, + "step": 15486 + }, + { + "epoch": 0.30976, + "grad_norm": 2.475938081741333, + "learning_rate": 1.743705134173928e-05, + "loss": 0.0801, + "step": 15488 + }, + { + "epoch": 0.3098, + "grad_norm": 0.8803727626800537, + "learning_rate": 1.743611785627841e-05, + "loss": 0.0458, + "step": 15490 + }, + { + "epoch": 0.30984, + "grad_norm": 1.0721440315246582, + "learning_rate": 1.7435184225846592e-05, + "loss": 0.0983, + "step": 15492 + }, + { + "epoch": 0.30988, + "grad_norm": 0.22869879007339478, + "learning_rate": 1.7434250450462035e-05, + "loss": 0.0488, + "step": 15494 + }, + { + "epoch": 0.30992, + "grad_norm": 0.3522304892539978, + "learning_rate": 1.7433316530142934e-05, + "loss": 0.0188, + "step": 15496 + }, + { + "epoch": 0.30996, + "grad_norm": 1.367971658706665, + "learning_rate": 1.74323824649075e-05, + "loss": 0.0386, + "step": 15498 + }, + { + "epoch": 0.31, + "grad_norm": 2.5587661266326904, + "learning_rate": 1.7431448254773943e-05, + "loss": 0.0981, + "step": 15500 + }, + { + "epoch": 0.31004, + "grad_norm": 0.1018109917640686, + "learning_rate": 1.7430513899760478e-05, + "loss": 0.0053, + "step": 15502 + }, + { + "epoch": 0.31008, + "grad_norm": 0.4704916179180145, + "learning_rate": 1.7429579399885317e-05, + "loss": 0.0231, + "step": 15504 + }, + { + "epoch": 0.31012, + "grad_norm": 0.08257484436035156, + "learning_rate": 1.7428644755166683e-05, + "loss": 0.0035, + "step": 15506 + }, + { + "epoch": 0.31016, + "grad_norm": 0.38170483708381653, + "learning_rate": 1.7427709965622795e-05, + "loss": 0.0305, + "step": 15508 + }, + { + "epoch": 0.3102, + "grad_norm": 0.9878209233283997, + "learning_rate": 1.7426775031271876e-05, + "loss": 0.0333, + "step": 15510 + }, + { + "epoch": 0.31024, + "grad_norm": 2.022639036178589, + "learning_rate": 1.7425839952132157e-05, + "loss": 0.0542, + "step": 15512 + }, + { + "epoch": 0.31028, + "grad_norm": 1.4240269660949707, + "learning_rate": 1.7424904728221863e-05, + "loss": 0.0591, + "step": 15514 + }, + { + "epoch": 0.31032, + "grad_norm": 5.519767761230469, + "learning_rate": 1.7423969359559225e-05, + "loss": 0.1504, + "step": 15516 + }, + { + "epoch": 0.31036, + "grad_norm": 2.1025502681732178, + "learning_rate": 1.7423033846162488e-05, + "loss": 0.1068, + "step": 15518 + }, + { + "epoch": 0.3104, + "grad_norm": 1.9554163217544556, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.0648, + "step": 15520 + }, + { + "epoch": 0.31044, + "grad_norm": 6.841938018798828, + "learning_rate": 1.742116238523965e-05, + "loss": 0.2092, + "step": 15522 + }, + { + "epoch": 0.31048, + "grad_norm": 0.09093368053436279, + "learning_rate": 1.7420226437750036e-05, + "loss": 0.007, + "step": 15524 + }, + { + "epoch": 0.31052, + "grad_norm": 0.08849980682134628, + "learning_rate": 1.7419290345599292e-05, + "loss": 0.0665, + "step": 15526 + }, + { + "epoch": 0.31056, + "grad_norm": 0.7301101088523865, + "learning_rate": 1.7418354108805658e-05, + "loss": 0.0438, + "step": 15528 + }, + { + "epoch": 0.3106, + "grad_norm": 3.2531497478485107, + "learning_rate": 1.7417417727387392e-05, + "loss": 0.2556, + "step": 15530 + }, + { + "epoch": 0.31064, + "grad_norm": 3.3276941776275635, + "learning_rate": 1.741648120136275e-05, + "loss": 0.0757, + "step": 15532 + }, + { + "epoch": 0.31068, + "grad_norm": 4.829263687133789, + "learning_rate": 1.7415544530749987e-05, + "loss": 0.2374, + "step": 15534 + }, + { + "epoch": 0.31072, + "grad_norm": 0.6184201836585999, + "learning_rate": 1.741460771556737e-05, + "loss": 0.2087, + "step": 15536 + }, + { + "epoch": 0.31076, + "grad_norm": 5.1503682136535645, + "learning_rate": 1.7413670755833157e-05, + "loss": 0.6439, + "step": 15538 + }, + { + "epoch": 0.3108, + "grad_norm": 0.977074921131134, + "learning_rate": 1.741273365156561e-05, + "loss": 0.0344, + "step": 15540 + }, + { + "epoch": 0.31084, + "grad_norm": 5.906308650970459, + "learning_rate": 1.741179640278301e-05, + "loss": 0.3137, + "step": 15542 + }, + { + "epoch": 0.31088, + "grad_norm": 2.6914358139038086, + "learning_rate": 1.7410859009503626e-05, + "loss": 0.1062, + "step": 15544 + }, + { + "epoch": 0.31092, + "grad_norm": 0.6594189405441284, + "learning_rate": 1.7409921471745725e-05, + "loss": 0.0194, + "step": 15546 + }, + { + "epoch": 0.31096, + "grad_norm": 0.611106812953949, + "learning_rate": 1.7408983789527588e-05, + "loss": 0.0527, + "step": 15548 + }, + { + "epoch": 0.311, + "grad_norm": 4.055391311645508, + "learning_rate": 1.74080459628675e-05, + "loss": 0.2405, + "step": 15550 + }, + { + "epoch": 0.31104, + "grad_norm": 2.2600810527801514, + "learning_rate": 1.7407107991783746e-05, + "loss": 0.0822, + "step": 15552 + }, + { + "epoch": 0.31108, + "grad_norm": 0.18978053331375122, + "learning_rate": 1.7406169876294603e-05, + "loss": 0.0207, + "step": 15554 + }, + { + "epoch": 0.31112, + "grad_norm": 0.26935142278671265, + "learning_rate": 1.740523161641837e-05, + "loss": 0.1018, + "step": 15556 + }, + { + "epoch": 0.31116, + "grad_norm": 0.27566221356391907, + "learning_rate": 1.740429321217333e-05, + "loss": 0.0926, + "step": 15558 + }, + { + "epoch": 0.3112, + "grad_norm": 7.824626922607422, + "learning_rate": 1.7403354663577782e-05, + "loss": 0.704, + "step": 15560 + }, + { + "epoch": 0.31124, + "grad_norm": 0.9544142484664917, + "learning_rate": 1.7402415970650026e-05, + "loss": 0.1623, + "step": 15562 + }, + { + "epoch": 0.31128, + "grad_norm": 0.22792300581932068, + "learning_rate": 1.7401477133408356e-05, + "loss": 0.0925, + "step": 15564 + }, + { + "epoch": 0.31132, + "grad_norm": 0.0500677116215229, + "learning_rate": 1.7400538151871084e-05, + "loss": 0.0432, + "step": 15566 + }, + { + "epoch": 0.31136, + "grad_norm": 1.3511085510253906, + "learning_rate": 1.7399599026056505e-05, + "loss": 0.1527, + "step": 15568 + }, + { + "epoch": 0.3114, + "grad_norm": 1.1326266527175903, + "learning_rate": 1.7398659755982937e-05, + "loss": 0.1211, + "step": 15570 + }, + { + "epoch": 0.31144, + "grad_norm": 1.051580548286438, + "learning_rate": 1.7397720341668685e-05, + "loss": 0.0386, + "step": 15572 + }, + { + "epoch": 0.31148, + "grad_norm": 3.257323980331421, + "learning_rate": 1.739678078313207e-05, + "loss": 0.195, + "step": 15574 + }, + { + "epoch": 0.31152, + "grad_norm": 0.6531505584716797, + "learning_rate": 1.7395841080391405e-05, + "loss": 0.0184, + "step": 15576 + }, + { + "epoch": 0.31156, + "grad_norm": 4.286770820617676, + "learning_rate": 1.7394901233465006e-05, + "loss": 0.2369, + "step": 15578 + }, + { + "epoch": 0.3116, + "grad_norm": 7.321203708648682, + "learning_rate": 1.7393961242371203e-05, + "loss": 0.385, + "step": 15580 + }, + { + "epoch": 0.31164, + "grad_norm": 1.5289379358291626, + "learning_rate": 1.7393021107128324e-05, + "loss": 0.0495, + "step": 15582 + }, + { + "epoch": 0.31168, + "grad_norm": 3.509819746017456, + "learning_rate": 1.739208082775469e-05, + "loss": 0.0978, + "step": 15584 + }, + { + "epoch": 0.31172, + "grad_norm": 0.7437022924423218, + "learning_rate": 1.739114040426863e-05, + "loss": 0.0258, + "step": 15586 + }, + { + "epoch": 0.31176, + "grad_norm": 1.1156177520751953, + "learning_rate": 1.7390199836688485e-05, + "loss": 0.0199, + "step": 15588 + }, + { + "epoch": 0.3118, + "grad_norm": 6.02294397354126, + "learning_rate": 1.738925912503259e-05, + "loss": 0.2773, + "step": 15590 + }, + { + "epoch": 0.31184, + "grad_norm": 1.7652300596237183, + "learning_rate": 1.7388318269319282e-05, + "loss": 0.242, + "step": 15592 + }, + { + "epoch": 0.31188, + "grad_norm": 1.7993072271347046, + "learning_rate": 1.7387377269566907e-05, + "loss": 0.0723, + "step": 15594 + }, + { + "epoch": 0.31192, + "grad_norm": 5.598077774047852, + "learning_rate": 1.738643612579381e-05, + "loss": 0.4831, + "step": 15596 + }, + { + "epoch": 0.31196, + "grad_norm": 2.42624568939209, + "learning_rate": 1.7385494838018337e-05, + "loss": 0.2417, + "step": 15598 + }, + { + "epoch": 0.312, + "grad_norm": 2.538588523864746, + "learning_rate": 1.7384553406258842e-05, + "loss": 0.1142, + "step": 15600 + }, + { + "epoch": 0.31204, + "grad_norm": 1.6089099645614624, + "learning_rate": 1.738361183053367e-05, + "loss": 0.6512, + "step": 15602 + }, + { + "epoch": 0.31208, + "grad_norm": 4.256545066833496, + "learning_rate": 1.738267011086119e-05, + "loss": 0.25, + "step": 15604 + }, + { + "epoch": 0.31212, + "grad_norm": 6.371361255645752, + "learning_rate": 1.7381728247259752e-05, + "loss": 0.4242, + "step": 15606 + }, + { + "epoch": 0.31216, + "grad_norm": 2.8448286056518555, + "learning_rate": 1.7380786239747725e-05, + "loss": 0.2096, + "step": 15608 + }, + { + "epoch": 0.3122, + "grad_norm": 1.718850016593933, + "learning_rate": 1.737984408834347e-05, + "loss": 0.1487, + "step": 15610 + }, + { + "epoch": 0.31224, + "grad_norm": 6.227499961853027, + "learning_rate": 1.737890179306535e-05, + "loss": 0.1369, + "step": 15612 + }, + { + "epoch": 0.31228, + "grad_norm": 0.43306562304496765, + "learning_rate": 1.7377959353931744e-05, + "loss": 0.0225, + "step": 15614 + }, + { + "epoch": 0.31232, + "grad_norm": 4.502784729003906, + "learning_rate": 1.7377016770961027e-05, + "loss": 0.2165, + "step": 15616 + }, + { + "epoch": 0.31236, + "grad_norm": 0.5673362612724304, + "learning_rate": 1.7376074044171565e-05, + "loss": 0.0358, + "step": 15618 + }, + { + "epoch": 0.3124, + "grad_norm": 1.5204440355300903, + "learning_rate": 1.737513117358174e-05, + "loss": 0.0602, + "step": 15620 + }, + { + "epoch": 0.31244, + "grad_norm": 0.4032520055770874, + "learning_rate": 1.737418815920994e-05, + "loss": 0.2159, + "step": 15622 + }, + { + "epoch": 0.31248, + "grad_norm": 0.3075963854789734, + "learning_rate": 1.7373245001074544e-05, + "loss": 0.0215, + "step": 15624 + }, + { + "epoch": 0.31252, + "grad_norm": 5.586942672729492, + "learning_rate": 1.737230169919394e-05, + "loss": 0.605, + "step": 15626 + }, + { + "epoch": 0.31256, + "grad_norm": 1.0614689588546753, + "learning_rate": 1.7371358253586516e-05, + "loss": 0.2569, + "step": 15628 + }, + { + "epoch": 0.3126, + "grad_norm": 0.8392022252082825, + "learning_rate": 1.7370414664270675e-05, + "loss": 0.0985, + "step": 15630 + }, + { + "epoch": 0.31264, + "grad_norm": 4.228948593139648, + "learning_rate": 1.73694709312648e-05, + "loss": 0.1774, + "step": 15632 + }, + { + "epoch": 0.31268, + "grad_norm": 2.5476112365722656, + "learning_rate": 1.7368527054587298e-05, + "loss": 0.1774, + "step": 15634 + }, + { + "epoch": 0.31272, + "grad_norm": 0.9139809608459473, + "learning_rate": 1.7367583034256563e-05, + "loss": 0.0607, + "step": 15636 + }, + { + "epoch": 0.31276, + "grad_norm": 3.150912046432495, + "learning_rate": 1.7366638870291006e-05, + "loss": 0.1511, + "step": 15638 + }, + { + "epoch": 0.3128, + "grad_norm": 0.6910889148712158, + "learning_rate": 1.7365694562709034e-05, + "loss": 0.0634, + "step": 15640 + }, + { + "epoch": 0.31284, + "grad_norm": 1.1133276224136353, + "learning_rate": 1.7364750111529055e-05, + "loss": 0.0539, + "step": 15642 + }, + { + "epoch": 0.31288, + "grad_norm": 0.34480899572372437, + "learning_rate": 1.7363805516769477e-05, + "loss": 0.0344, + "step": 15644 + }, + { + "epoch": 0.31292, + "grad_norm": 3.99821138381958, + "learning_rate": 1.736286077844872e-05, + "loss": 0.2816, + "step": 15646 + }, + { + "epoch": 0.31296, + "grad_norm": 0.4932464361190796, + "learning_rate": 1.7361915896585203e-05, + "loss": 0.0674, + "step": 15648 + }, + { + "epoch": 0.313, + "grad_norm": 0.5906198024749756, + "learning_rate": 1.7360970871197347e-05, + "loss": 0.0514, + "step": 15650 + }, + { + "epoch": 0.31304, + "grad_norm": 5.820806503295898, + "learning_rate": 1.736002570230357e-05, + "loss": 0.4947, + "step": 15652 + }, + { + "epoch": 0.31308, + "grad_norm": 0.9395971298217773, + "learning_rate": 1.7359080389922307e-05, + "loss": 0.0293, + "step": 15654 + }, + { + "epoch": 0.31312, + "grad_norm": 0.5219261646270752, + "learning_rate": 1.7358134934071978e-05, + "loss": 0.1064, + "step": 15656 + }, + { + "epoch": 0.31316, + "grad_norm": 1.3408665657043457, + "learning_rate": 1.7357189334771023e-05, + "loss": 0.0941, + "step": 15658 + }, + { + "epoch": 0.3132, + "grad_norm": 3.039652109146118, + "learning_rate": 1.7356243592037876e-05, + "loss": 0.1024, + "step": 15660 + }, + { + "epoch": 0.31324, + "grad_norm": 0.9673461318016052, + "learning_rate": 1.735529770589097e-05, + "loss": 0.0305, + "step": 15662 + }, + { + "epoch": 0.31328, + "grad_norm": 0.2907091975212097, + "learning_rate": 1.7354351676348748e-05, + "loss": 0.0252, + "step": 15664 + }, + { + "epoch": 0.31332, + "grad_norm": 0.382183700799942, + "learning_rate": 1.7353405503429657e-05, + "loss": 0.1067, + "step": 15666 + }, + { + "epoch": 0.31336, + "grad_norm": 0.3072642683982849, + "learning_rate": 1.735245918715214e-05, + "loss": 0.0236, + "step": 15668 + }, + { + "epoch": 0.3134, + "grad_norm": 0.15921689569950104, + "learning_rate": 1.7351512727534645e-05, + "loss": 0.0422, + "step": 15670 + }, + { + "epoch": 0.31344, + "grad_norm": 3.5605309009552, + "learning_rate": 1.7350566124595622e-05, + "loss": 0.1444, + "step": 15672 + }, + { + "epoch": 0.31348, + "grad_norm": 2.967390775680542, + "learning_rate": 1.734961937835353e-05, + "loss": 0.137, + "step": 15674 + }, + { + "epoch": 0.31352, + "grad_norm": 0.4266352355480194, + "learning_rate": 1.7348672488826826e-05, + "loss": 0.0875, + "step": 15676 + }, + { + "epoch": 0.31356, + "grad_norm": 6.022814750671387, + "learning_rate": 1.7347725456033965e-05, + "loss": 0.7331, + "step": 15678 + }, + { + "epoch": 0.3136, + "grad_norm": 1.5895347595214844, + "learning_rate": 1.7346778279993417e-05, + "loss": 0.0566, + "step": 15680 + }, + { + "epoch": 0.31364, + "grad_norm": 4.650524616241455, + "learning_rate": 1.7345830960723642e-05, + "loss": 0.2309, + "step": 15682 + }, + { + "epoch": 0.31368, + "grad_norm": 0.09798453748226166, + "learning_rate": 1.734488349824311e-05, + "loss": 0.0553, + "step": 15684 + }, + { + "epoch": 0.31372, + "grad_norm": 0.17478026449680328, + "learning_rate": 1.7343935892570293e-05, + "loss": 0.1107, + "step": 15686 + }, + { + "epoch": 0.31376, + "grad_norm": 5.427863597869873, + "learning_rate": 1.7342988143723663e-05, + "loss": 0.505, + "step": 15688 + }, + { + "epoch": 0.3138, + "grad_norm": 0.26553061604499817, + "learning_rate": 1.7342040251721702e-05, + "loss": 0.0874, + "step": 15690 + }, + { + "epoch": 0.31384, + "grad_norm": 5.090146541595459, + "learning_rate": 1.7341092216582886e-05, + "loss": 0.6179, + "step": 15692 + }, + { + "epoch": 0.31388, + "grad_norm": 3.615873336791992, + "learning_rate": 1.73401440383257e-05, + "loss": 0.1592, + "step": 15694 + }, + { + "epoch": 0.31392, + "grad_norm": 2.632354974746704, + "learning_rate": 1.733919571696862e-05, + "loss": 0.1071, + "step": 15696 + }, + { + "epoch": 0.31396, + "grad_norm": 1.7409979104995728, + "learning_rate": 1.733824725253015e-05, + "loss": 0.2928, + "step": 15698 + }, + { + "epoch": 0.314, + "grad_norm": 1.9537363052368164, + "learning_rate": 1.7337298645028764e-05, + "loss": 0.8579, + "step": 15700 + }, + { + "epoch": 0.31404, + "grad_norm": 0.6973008513450623, + "learning_rate": 1.733634989448297e-05, + "loss": 0.1005, + "step": 15702 + }, + { + "epoch": 0.31408, + "grad_norm": 0.07180860638618469, + "learning_rate": 1.7335401000911254e-05, + "loss": 0.0044, + "step": 15704 + }, + { + "epoch": 0.31412, + "grad_norm": 0.32020699977874756, + "learning_rate": 1.7334451964332117e-05, + "loss": 0.0081, + "step": 15706 + }, + { + "epoch": 0.31416, + "grad_norm": 6.933139801025391, + "learning_rate": 1.7333502784764067e-05, + "loss": 0.88, + "step": 15708 + }, + { + "epoch": 0.3142, + "grad_norm": 28.50889778137207, + "learning_rate": 1.7332553462225604e-05, + "loss": 0.4303, + "step": 15710 + }, + { + "epoch": 0.31424, + "grad_norm": 2.8416171073913574, + "learning_rate": 1.7331603996735233e-05, + "loss": 0.7696, + "step": 15712 + }, + { + "epoch": 0.31428, + "grad_norm": 2.9188692569732666, + "learning_rate": 1.733065438831147e-05, + "loss": 0.1187, + "step": 15714 + }, + { + "epoch": 0.31432, + "grad_norm": 2.4281020164489746, + "learning_rate": 1.7329704636972824e-05, + "loss": 0.1703, + "step": 15716 + }, + { + "epoch": 0.31436, + "grad_norm": 0.2293003797531128, + "learning_rate": 1.7328754742737814e-05, + "loss": 0.123, + "step": 15718 + }, + { + "epoch": 0.3144, + "grad_norm": 0.2430272251367569, + "learning_rate": 1.732780470562496e-05, + "loss": 0.0286, + "step": 15720 + }, + { + "epoch": 0.31444, + "grad_norm": 3.7189559936523438, + "learning_rate": 1.7326854525652773e-05, + "loss": 0.237, + "step": 15722 + }, + { + "epoch": 0.31448, + "grad_norm": 2.7050728797912598, + "learning_rate": 1.732590420283979e-05, + "loss": 0.1251, + "step": 15724 + }, + { + "epoch": 0.31452, + "grad_norm": 3.175504207611084, + "learning_rate": 1.7324953737204537e-05, + "loss": 0.217, + "step": 15726 + }, + { + "epoch": 0.31456, + "grad_norm": 0.2358209490776062, + "learning_rate": 1.7324003128765536e-05, + "loss": 0.1339, + "step": 15728 + }, + { + "epoch": 0.3146, + "grad_norm": 0.5016998648643494, + "learning_rate": 1.732305237754132e-05, + "loss": 0.0733, + "step": 15730 + }, + { + "epoch": 0.31464, + "grad_norm": 2.3808019161224365, + "learning_rate": 1.732210148355043e-05, + "loss": 0.1204, + "step": 15732 + }, + { + "epoch": 0.31468, + "grad_norm": 1.5789074897766113, + "learning_rate": 1.73211504468114e-05, + "loss": 0.2774, + "step": 15734 + }, + { + "epoch": 0.31472, + "grad_norm": 1.5150010585784912, + "learning_rate": 1.7320199267342776e-05, + "loss": 0.0612, + "step": 15736 + }, + { + "epoch": 0.31476, + "grad_norm": 3.079272747039795, + "learning_rate": 1.7319247945163097e-05, + "loss": 0.2917, + "step": 15738 + }, + { + "epoch": 0.3148, + "grad_norm": 2.8698856830596924, + "learning_rate": 1.7318296480290912e-05, + "loss": 0.1189, + "step": 15740 + }, + { + "epoch": 0.31484, + "grad_norm": 1.7778351306915283, + "learning_rate": 1.731734487274477e-05, + "loss": 0.2146, + "step": 15742 + }, + { + "epoch": 0.31488, + "grad_norm": 1.1785888671875, + "learning_rate": 1.731639312254322e-05, + "loss": 0.18, + "step": 15744 + }, + { + "epoch": 0.31492, + "grad_norm": 5.316389083862305, + "learning_rate": 1.7315441229704824e-05, + "loss": 0.4344, + "step": 15746 + }, + { + "epoch": 0.31496, + "grad_norm": 0.9005515575408936, + "learning_rate": 1.7314489194248133e-05, + "loss": 0.0571, + "step": 15748 + }, + { + "epoch": 0.315, + "grad_norm": 3.0006966590881348, + "learning_rate": 1.7313537016191706e-05, + "loss": 0.463, + "step": 15750 + }, + { + "epoch": 0.31504, + "grad_norm": 1.0781195163726807, + "learning_rate": 1.7312584695554112e-05, + "loss": 0.1024, + "step": 15752 + }, + { + "epoch": 0.31508, + "grad_norm": 0.5759251713752747, + "learning_rate": 1.7311632232353917e-05, + "loss": 0.0851, + "step": 15754 + }, + { + "epoch": 0.31512, + "grad_norm": 2.4761481285095215, + "learning_rate": 1.7310679626609685e-05, + "loss": 0.1247, + "step": 15756 + }, + { + "epoch": 0.31516, + "grad_norm": 1.7713353633880615, + "learning_rate": 1.730972687833999e-05, + "loss": 0.161, + "step": 15758 + }, + { + "epoch": 0.3152, + "grad_norm": 1.2119979858398438, + "learning_rate": 1.7308773987563406e-05, + "loss": 0.0947, + "step": 15760 + }, + { + "epoch": 0.31524, + "grad_norm": 1.1372756958007812, + "learning_rate": 1.7307820954298508e-05, + "loss": 0.223, + "step": 15762 + }, + { + "epoch": 0.31528, + "grad_norm": 0.6768745183944702, + "learning_rate": 1.730686777856388e-05, + "loss": 0.3603, + "step": 15764 + }, + { + "epoch": 0.31532, + "grad_norm": 2.145143985748291, + "learning_rate": 1.7305914460378102e-05, + "loss": 0.1837, + "step": 15766 + }, + { + "epoch": 0.31536, + "grad_norm": 2.0293455123901367, + "learning_rate": 1.7304960999759762e-05, + "loss": 0.099, + "step": 15768 + }, + { + "epoch": 0.3154, + "grad_norm": 1.8327125310897827, + "learning_rate": 1.730400739672745e-05, + "loss": 0.1115, + "step": 15770 + }, + { + "epoch": 0.31544, + "grad_norm": 0.34703806042671204, + "learning_rate": 1.730305365129975e-05, + "loss": 0.042, + "step": 15772 + }, + { + "epoch": 0.31548, + "grad_norm": 5.953537464141846, + "learning_rate": 1.7302099763495257e-05, + "loss": 0.2211, + "step": 15774 + }, + { + "epoch": 0.31552, + "grad_norm": 0.9132360816001892, + "learning_rate": 1.730114573333257e-05, + "loss": 0.1567, + "step": 15776 + }, + { + "epoch": 0.31556, + "grad_norm": 1.9349182844161987, + "learning_rate": 1.730019156083029e-05, + "loss": 0.1006, + "step": 15778 + }, + { + "epoch": 0.3156, + "grad_norm": 0.6921196579933167, + "learning_rate": 1.7299237246007018e-05, + "loss": 0.1115, + "step": 15780 + }, + { + "epoch": 0.31564, + "grad_norm": 3.265803098678589, + "learning_rate": 1.7298282788881355e-05, + "loss": 0.2028, + "step": 15782 + }, + { + "epoch": 0.31568, + "grad_norm": 2.3165946006774902, + "learning_rate": 1.7297328189471913e-05, + "loss": 0.398, + "step": 15784 + }, + { + "epoch": 0.31572, + "grad_norm": 1.11973237991333, + "learning_rate": 1.7296373447797302e-05, + "loss": 0.2135, + "step": 15786 + }, + { + "epoch": 0.31576, + "grad_norm": 0.8430264592170715, + "learning_rate": 1.729541856387613e-05, + "loss": 0.0985, + "step": 15788 + }, + { + "epoch": 0.3158, + "grad_norm": 3.523293972015381, + "learning_rate": 1.7294463537727026e-05, + "loss": 0.2083, + "step": 15790 + }, + { + "epoch": 0.31584, + "grad_norm": 1.0722057819366455, + "learning_rate": 1.7293508369368593e-05, + "loss": 0.0457, + "step": 15792 + }, + { + "epoch": 0.31588, + "grad_norm": 0.47214141488075256, + "learning_rate": 1.7292553058819458e-05, + "loss": 0.0964, + "step": 15794 + }, + { + "epoch": 0.31592, + "grad_norm": 0.7144514918327332, + "learning_rate": 1.729159760609825e-05, + "loss": 0.2136, + "step": 15796 + }, + { + "epoch": 0.31596, + "grad_norm": 1.7572450637817383, + "learning_rate": 1.7290642011223595e-05, + "loss": 0.0836, + "step": 15798 + }, + { + "epoch": 0.316, + "grad_norm": 0.7178676724433899, + "learning_rate": 1.7289686274214116e-05, + "loss": 0.2111, + "step": 15800 + }, + { + "epoch": 0.31604, + "grad_norm": 3.9717540740966797, + "learning_rate": 1.7288730395088454e-05, + "loss": 0.221, + "step": 15802 + }, + { + "epoch": 0.31608, + "grad_norm": 0.15830573439598083, + "learning_rate": 1.7287774373865235e-05, + "loss": 0.1096, + "step": 15804 + }, + { + "epoch": 0.31612, + "grad_norm": 0.3933067321777344, + "learning_rate": 1.728681821056311e-05, + "loss": 0.0896, + "step": 15806 + }, + { + "epoch": 0.31616, + "grad_norm": 6.021996021270752, + "learning_rate": 1.7285861905200705e-05, + "loss": 0.4563, + "step": 15808 + }, + { + "epoch": 0.3162, + "grad_norm": 2.104672431945801, + "learning_rate": 1.7284905457796678e-05, + "loss": 0.0982, + "step": 15810 + }, + { + "epoch": 0.31624, + "grad_norm": 0.5108229517936707, + "learning_rate": 1.7283948868369663e-05, + "loss": 0.0383, + "step": 15812 + }, + { + "epoch": 0.31628, + "grad_norm": 2.0941174030303955, + "learning_rate": 1.7282992136938317e-05, + "loss": 0.1586, + "step": 15814 + }, + { + "epoch": 0.31632, + "grad_norm": 3.4301645755767822, + "learning_rate": 1.728203526352129e-05, + "loss": 0.5122, + "step": 15816 + }, + { + "epoch": 0.31636, + "grad_norm": 1.9067096710205078, + "learning_rate": 1.7281078248137234e-05, + "loss": 0.0641, + "step": 15818 + }, + { + "epoch": 0.3164, + "grad_norm": 0.6576545238494873, + "learning_rate": 1.7280121090804813e-05, + "loss": 0.0741, + "step": 15820 + }, + { + "epoch": 0.31644, + "grad_norm": 3.2201058864593506, + "learning_rate": 1.727916379154268e-05, + "loss": 0.2086, + "step": 15822 + }, + { + "epoch": 0.31648, + "grad_norm": 0.29130667448043823, + "learning_rate": 1.7278206350369507e-05, + "loss": 0.0334, + "step": 15824 + }, + { + "epoch": 0.31652, + "grad_norm": 0.5967603921890259, + "learning_rate": 1.7277248767303948e-05, + "loss": 0.1434, + "step": 15826 + }, + { + "epoch": 0.31656, + "grad_norm": 0.8665748834609985, + "learning_rate": 1.727629104236468e-05, + "loss": 0.0336, + "step": 15828 + }, + { + "epoch": 0.3166, + "grad_norm": 0.4633719027042389, + "learning_rate": 1.727533317557037e-05, + "loss": 0.1277, + "step": 15830 + }, + { + "epoch": 0.31664, + "grad_norm": 0.7972171902656555, + "learning_rate": 1.7274375166939698e-05, + "loss": 0.0949, + "step": 15832 + }, + { + "epoch": 0.31668, + "grad_norm": 2.011974334716797, + "learning_rate": 1.727341701649133e-05, + "loss": 0.1096, + "step": 15834 + }, + { + "epoch": 0.31672, + "grad_norm": 1.0478410720825195, + "learning_rate": 1.7272458724243957e-05, + "loss": 0.0953, + "step": 15836 + }, + { + "epoch": 0.31676, + "grad_norm": 5.703129768371582, + "learning_rate": 1.727150029021626e-05, + "loss": 0.4763, + "step": 15838 + }, + { + "epoch": 0.3168, + "grad_norm": 1.7226327657699585, + "learning_rate": 1.727054171442692e-05, + "loss": 0.1319, + "step": 15840 + }, + { + "epoch": 0.31684, + "grad_norm": 3.7932989597320557, + "learning_rate": 1.7269582996894626e-05, + "loss": 0.2125, + "step": 15842 + }, + { + "epoch": 0.31688, + "grad_norm": 2.3970890045166016, + "learning_rate": 1.7268624137638065e-05, + "loss": 0.0962, + "step": 15844 + }, + { + "epoch": 0.31692, + "grad_norm": 4.59257173538208, + "learning_rate": 1.726766513667594e-05, + "loss": 0.2492, + "step": 15846 + }, + { + "epoch": 0.31696, + "grad_norm": 5.922719955444336, + "learning_rate": 1.726670599402694e-05, + "loss": 0.1036, + "step": 15848 + }, + { + "epoch": 0.317, + "grad_norm": 1.8857485055923462, + "learning_rate": 1.7265746709709762e-05, + "loss": 0.0899, + "step": 15850 + }, + { + "epoch": 0.31704, + "grad_norm": 2.499972343444824, + "learning_rate": 1.726478728374311e-05, + "loss": 0.1854, + "step": 15852 + }, + { + "epoch": 0.31708, + "grad_norm": 0.7511909604072571, + "learning_rate": 1.7263827716145692e-05, + "loss": 0.2433, + "step": 15854 + }, + { + "epoch": 0.31712, + "grad_norm": 2.4529483318328857, + "learning_rate": 1.7262868006936217e-05, + "loss": 0.0995, + "step": 15856 + }, + { + "epoch": 0.31716, + "grad_norm": 1.235609531402588, + "learning_rate": 1.7261908156133387e-05, + "loss": 0.1734, + "step": 15858 + }, + { + "epoch": 0.3172, + "grad_norm": 0.6051344871520996, + "learning_rate": 1.7260948163755918e-05, + "loss": 0.0243, + "step": 15860 + }, + { + "epoch": 0.31724, + "grad_norm": 0.23391370475292206, + "learning_rate": 1.725998802982253e-05, + "loss": 0.0304, + "step": 15862 + }, + { + "epoch": 0.31728, + "grad_norm": 2.150848388671875, + "learning_rate": 1.7259027754351936e-05, + "loss": 0.293, + "step": 15864 + }, + { + "epoch": 0.31732, + "grad_norm": 4.772307395935059, + "learning_rate": 1.7258067337362862e-05, + "loss": 0.2369, + "step": 15866 + }, + { + "epoch": 0.31736, + "grad_norm": 2.0497992038726807, + "learning_rate": 1.7257106778874023e-05, + "loss": 0.0648, + "step": 15868 + }, + { + "epoch": 0.3174, + "grad_norm": 1.3256276845932007, + "learning_rate": 1.7256146078904153e-05, + "loss": 0.0922, + "step": 15870 + }, + { + "epoch": 0.31744, + "grad_norm": 0.63593989610672, + "learning_rate": 1.7255185237471978e-05, + "loss": 0.047, + "step": 15872 + }, + { + "epoch": 0.31748, + "grad_norm": 3.509033203125, + "learning_rate": 1.7254224254596234e-05, + "loss": 0.1174, + "step": 15874 + }, + { + "epoch": 0.31752, + "grad_norm": 0.3835446834564209, + "learning_rate": 1.7253263130295655e-05, + "loss": 0.0387, + "step": 15876 + }, + { + "epoch": 0.31756, + "grad_norm": 0.391276478767395, + "learning_rate": 1.7252301864588972e-05, + "loss": 0.0189, + "step": 15878 + }, + { + "epoch": 0.3176, + "grad_norm": 0.3877471089363098, + "learning_rate": 1.7251340457494934e-05, + "loss": 0.0213, + "step": 15880 + }, + { + "epoch": 0.31764, + "grad_norm": 2.826010227203369, + "learning_rate": 1.7250378909032283e-05, + "loss": 0.079, + "step": 15882 + }, + { + "epoch": 0.31768, + "grad_norm": 0.4491501748561859, + "learning_rate": 1.7249417219219758e-05, + "loss": 0.0211, + "step": 15884 + }, + { + "epoch": 0.31772, + "grad_norm": 2.2370333671569824, + "learning_rate": 1.7248455388076113e-05, + "loss": 0.3493, + "step": 15886 + }, + { + "epoch": 0.31776, + "grad_norm": 6.1003828048706055, + "learning_rate": 1.72474934156201e-05, + "loss": 0.5519, + "step": 15888 + }, + { + "epoch": 0.3178, + "grad_norm": 0.38942137360572815, + "learning_rate": 1.7246531301870467e-05, + "loss": 0.2135, + "step": 15890 + }, + { + "epoch": 0.31784, + "grad_norm": 0.8859061598777771, + "learning_rate": 1.724556904684598e-05, + "loss": 0.0319, + "step": 15892 + }, + { + "epoch": 0.31788, + "grad_norm": 2.96437931060791, + "learning_rate": 1.7244606650565394e-05, + "loss": 0.0877, + "step": 15894 + }, + { + "epoch": 0.31792, + "grad_norm": 0.29209795594215393, + "learning_rate": 1.724364411304747e-05, + "loss": 0.0481, + "step": 15896 + }, + { + "epoch": 0.31796, + "grad_norm": 5.178647518157959, + "learning_rate": 1.7242681434310975e-05, + "loss": 0.3848, + "step": 15898 + }, + { + "epoch": 0.318, + "grad_norm": 3.560938835144043, + "learning_rate": 1.7241718614374678e-05, + "loss": 0.1739, + "step": 15900 + }, + { + "epoch": 0.31804, + "grad_norm": 0.581648051738739, + "learning_rate": 1.7240755653257345e-05, + "loss": 0.0311, + "step": 15902 + }, + { + "epoch": 0.31808, + "grad_norm": 1.7434611320495605, + "learning_rate": 1.7239792550977753e-05, + "loss": 0.0578, + "step": 15904 + }, + { + "epoch": 0.31812, + "grad_norm": 4.392858982086182, + "learning_rate": 1.7238829307554683e-05, + "loss": 0.125, + "step": 15906 + }, + { + "epoch": 0.31816, + "grad_norm": 0.47869184613227844, + "learning_rate": 1.7237865923006904e-05, + "loss": 0.0198, + "step": 15908 + }, + { + "epoch": 0.3182, + "grad_norm": 2.1246492862701416, + "learning_rate": 1.7236902397353204e-05, + "loss": 0.0925, + "step": 15910 + }, + { + "epoch": 0.31824, + "grad_norm": 4.943930625915527, + "learning_rate": 1.7235938730612368e-05, + "loss": 0.3595, + "step": 15912 + }, + { + "epoch": 0.31828, + "grad_norm": 2.9077911376953125, + "learning_rate": 1.723497492280318e-05, + "loss": 0.1174, + "step": 15914 + }, + { + "epoch": 0.31832, + "grad_norm": 0.29923972487449646, + "learning_rate": 1.7234010973944428e-05, + "loss": 0.0297, + "step": 15916 + }, + { + "epoch": 0.31836, + "grad_norm": 1.5591965913772583, + "learning_rate": 1.7233046884054913e-05, + "loss": 0.0579, + "step": 15918 + }, + { + "epoch": 0.3184, + "grad_norm": 0.5916215181350708, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.0828, + "step": 15920 + }, + { + "epoch": 0.31844, + "grad_norm": 5.438044548034668, + "learning_rate": 1.7231118281258755e-05, + "loss": 0.2446, + "step": 15922 + }, + { + "epoch": 0.31848, + "grad_norm": 2.2646780014038086, + "learning_rate": 1.7230153768389717e-05, + "loss": 0.0702, + "step": 15924 + }, + { + "epoch": 0.31852, + "grad_norm": 0.4259355366230011, + "learning_rate": 1.7229189114565107e-05, + "loss": 0.0127, + "step": 15926 + }, + { + "epoch": 0.31856, + "grad_norm": 0.1939091980457306, + "learning_rate": 1.7228224319803738e-05, + "loss": 0.1206, + "step": 15928 + }, + { + "epoch": 0.3186, + "grad_norm": 1.6402721405029297, + "learning_rate": 1.7227259384124408e-05, + "loss": 0.1254, + "step": 15930 + }, + { + "epoch": 0.31864, + "grad_norm": 7.244926929473877, + "learning_rate": 1.722629430754594e-05, + "loss": 0.3078, + "step": 15932 + }, + { + "epoch": 0.31868, + "grad_norm": 2.3702917098999023, + "learning_rate": 1.7225329090087143e-05, + "loss": 0.0812, + "step": 15934 + }, + { + "epoch": 0.31872, + "grad_norm": 0.3348061442375183, + "learning_rate": 1.7224363731766833e-05, + "loss": 0.1975, + "step": 15936 + }, + { + "epoch": 0.31876, + "grad_norm": 0.5108859539031982, + "learning_rate": 1.7223398232603832e-05, + "loss": 0.044, + "step": 15938 + }, + { + "epoch": 0.3188, + "grad_norm": 4.644346714019775, + "learning_rate": 1.722243259261697e-05, + "loss": 0.3015, + "step": 15940 + }, + { + "epoch": 0.31884, + "grad_norm": 0.16174529492855072, + "learning_rate": 1.7221466811825062e-05, + "loss": 0.0304, + "step": 15942 + }, + { + "epoch": 0.31888, + "grad_norm": 8.733914375305176, + "learning_rate": 1.7220500890246944e-05, + "loss": 0.3059, + "step": 15944 + }, + { + "epoch": 0.31892, + "grad_norm": 0.45194968581199646, + "learning_rate": 1.7219534827901437e-05, + "loss": 0.0368, + "step": 15946 + }, + { + "epoch": 0.31896, + "grad_norm": 2.091768980026245, + "learning_rate": 1.721856862480739e-05, + "loss": 0.2484, + "step": 15948 + }, + { + "epoch": 0.319, + "grad_norm": 0.29290610551834106, + "learning_rate": 1.7217602280983622e-05, + "loss": 0.0404, + "step": 15950 + }, + { + "epoch": 0.31904, + "grad_norm": 1.0254249572753906, + "learning_rate": 1.721663579644899e-05, + "loss": 0.0226, + "step": 15952 + }, + { + "epoch": 0.31908, + "grad_norm": 5.568663597106934, + "learning_rate": 1.7215669171222324e-05, + "loss": 0.2534, + "step": 15954 + }, + { + "epoch": 0.31912, + "grad_norm": 3.351776123046875, + "learning_rate": 1.7214702405322472e-05, + "loss": 0.0745, + "step": 15956 + }, + { + "epoch": 0.31916, + "grad_norm": 0.7938966155052185, + "learning_rate": 1.7213735498768283e-05, + "loss": 0.5097, + "step": 15958 + }, + { + "epoch": 0.3192, + "grad_norm": 3.4773406982421875, + "learning_rate": 1.721276845157861e-05, + "loss": 0.157, + "step": 15960 + }, + { + "epoch": 0.31924, + "grad_norm": 6.607729911804199, + "learning_rate": 1.7211801263772297e-05, + "loss": 0.367, + "step": 15962 + }, + { + "epoch": 0.31928, + "grad_norm": 6.144454479217529, + "learning_rate": 1.721083393536821e-05, + "loss": 0.215, + "step": 15964 + }, + { + "epoch": 0.31932, + "grad_norm": 0.2307746410369873, + "learning_rate": 1.7209866466385197e-05, + "loss": 0.0108, + "step": 15966 + }, + { + "epoch": 0.31936, + "grad_norm": 4.266392230987549, + "learning_rate": 1.720889885684213e-05, + "loss": 0.1472, + "step": 15968 + }, + { + "epoch": 0.3194, + "grad_norm": 0.6444510221481323, + "learning_rate": 1.7207931106757867e-05, + "loss": 0.0695, + "step": 15970 + }, + { + "epoch": 0.31944, + "grad_norm": 4.570140838623047, + "learning_rate": 1.7206963216151277e-05, + "loss": 0.181, + "step": 15972 + }, + { + "epoch": 0.31948, + "grad_norm": 5.4952874183654785, + "learning_rate": 1.720599518504123e-05, + "loss": 0.2024, + "step": 15974 + }, + { + "epoch": 0.31952, + "grad_norm": 1.5192103385925293, + "learning_rate": 1.7205027013446596e-05, + "loss": 0.0527, + "step": 15976 + }, + { + "epoch": 0.31956, + "grad_norm": 0.10201127827167511, + "learning_rate": 1.7204058701386247e-05, + "loss": 0.0399, + "step": 15978 + }, + { + "epoch": 0.3196, + "grad_norm": 0.10512110590934753, + "learning_rate": 1.720309024887907e-05, + "loss": 0.0045, + "step": 15980 + }, + { + "epoch": 0.31964, + "grad_norm": 0.5055105686187744, + "learning_rate": 1.720212165594394e-05, + "loss": 0.0387, + "step": 15982 + }, + { + "epoch": 0.31968, + "grad_norm": 0.3290804624557495, + "learning_rate": 1.7201152922599737e-05, + "loss": 0.0395, + "step": 15984 + }, + { + "epoch": 0.31972, + "grad_norm": 4.986927032470703, + "learning_rate": 1.7200184048865352e-05, + "loss": 0.4567, + "step": 15986 + }, + { + "epoch": 0.31976, + "grad_norm": 4.870688438415527, + "learning_rate": 1.7199215034759675e-05, + "loss": 0.2534, + "step": 15988 + }, + { + "epoch": 0.3198, + "grad_norm": 0.5102529525756836, + "learning_rate": 1.719824588030159e-05, + "loss": 0.189, + "step": 15990 + }, + { + "epoch": 0.31984, + "grad_norm": 1.2885544300079346, + "learning_rate": 1.719727658551e-05, + "loss": 0.045, + "step": 15992 + }, + { + "epoch": 0.31988, + "grad_norm": 0.9797398447990417, + "learning_rate": 1.7196307150403794e-05, + "loss": 0.2718, + "step": 15994 + }, + { + "epoch": 0.31992, + "grad_norm": 1.4463056325912476, + "learning_rate": 1.7195337575001874e-05, + "loss": 0.0808, + "step": 15996 + }, + { + "epoch": 0.31996, + "grad_norm": 0.20445750653743744, + "learning_rate": 1.7194367859323147e-05, + "loss": 0.3968, + "step": 15998 + }, + { + "epoch": 0.32, + "grad_norm": 4.527005672454834, + "learning_rate": 1.7193398003386514e-05, + "loss": 0.2683, + "step": 16000 + }, + { + "epoch": 0.32004, + "grad_norm": 1.3663297891616821, + "learning_rate": 1.719242800721088e-05, + "loss": 0.0449, + "step": 16002 + }, + { + "epoch": 0.32008, + "grad_norm": 2.901444911956787, + "learning_rate": 1.719145787081516e-05, + "loss": 0.2342, + "step": 16004 + }, + { + "epoch": 0.32012, + "grad_norm": 0.39152657985687256, + "learning_rate": 1.7190487594218273e-05, + "loss": 0.0405, + "step": 16006 + }, + { + "epoch": 0.32016, + "grad_norm": 4.795445919036865, + "learning_rate": 1.7189517177439123e-05, + "loss": 0.2086, + "step": 16008 + }, + { + "epoch": 0.3202, + "grad_norm": 0.15157388150691986, + "learning_rate": 1.7188546620496634e-05, + "loss": 0.0208, + "step": 16010 + }, + { + "epoch": 0.32024, + "grad_norm": 0.8949214816093445, + "learning_rate": 1.7187575923409728e-05, + "loss": 0.1865, + "step": 16012 + }, + { + "epoch": 0.32028, + "grad_norm": 3.6949150562286377, + "learning_rate": 1.718660508619733e-05, + "loss": 0.207, + "step": 16014 + }, + { + "epoch": 0.32032, + "grad_norm": 3.285771369934082, + "learning_rate": 1.7185634108878367e-05, + "loss": 0.3974, + "step": 16016 + }, + { + "epoch": 0.32036, + "grad_norm": 1.0087043046951294, + "learning_rate": 1.7184662991471768e-05, + "loss": 0.1121, + "step": 16018 + }, + { + "epoch": 0.3204, + "grad_norm": 2.6116867065429688, + "learning_rate": 1.7183691733996463e-05, + "loss": 0.1182, + "step": 16020 + }, + { + "epoch": 0.32044, + "grad_norm": 3.8841333389282227, + "learning_rate": 1.718272033647139e-05, + "loss": 0.1972, + "step": 16022 + }, + { + "epoch": 0.32048, + "grad_norm": 0.8342236280441284, + "learning_rate": 1.7181748798915485e-05, + "loss": 0.0582, + "step": 16024 + }, + { + "epoch": 0.32052, + "grad_norm": 1.8132292032241821, + "learning_rate": 1.7180777121347692e-05, + "loss": 0.1548, + "step": 16026 + }, + { + "epoch": 0.32056, + "grad_norm": 3.2074408531188965, + "learning_rate": 1.717980530378695e-05, + "loss": 0.806, + "step": 16028 + }, + { + "epoch": 0.3206, + "grad_norm": 2.401840925216675, + "learning_rate": 1.7178833346252208e-05, + "loss": 0.1038, + "step": 16030 + }, + { + "epoch": 0.32064, + "grad_norm": 2.751347064971924, + "learning_rate": 1.7177861248762413e-05, + "loss": 0.1002, + "step": 16032 + }, + { + "epoch": 0.32068, + "grad_norm": 1.5733017921447754, + "learning_rate": 1.7176889011336518e-05, + "loss": 0.0611, + "step": 16034 + }, + { + "epoch": 0.32072, + "grad_norm": 2.7847797870635986, + "learning_rate": 1.7175916633993478e-05, + "loss": 0.5357, + "step": 16036 + }, + { + "epoch": 0.32076, + "grad_norm": 3.9468088150024414, + "learning_rate": 1.7174944116752244e-05, + "loss": 0.2235, + "step": 16038 + }, + { + "epoch": 0.3208, + "grad_norm": 0.548914909362793, + "learning_rate": 1.717397145963179e-05, + "loss": 0.0567, + "step": 16040 + }, + { + "epoch": 0.32084, + "grad_norm": 2.4420058727264404, + "learning_rate": 1.717299866265106e-05, + "loss": 0.091, + "step": 16042 + }, + { + "epoch": 0.32088, + "grad_norm": 1.098307728767395, + "learning_rate": 1.7172025725829034e-05, + "loss": 0.0383, + "step": 16044 + }, + { + "epoch": 0.32092, + "grad_norm": 0.03777585178613663, + "learning_rate": 1.717105264918467e-05, + "loss": 0.0201, + "step": 16046 + }, + { + "epoch": 0.32096, + "grad_norm": 1.014460802078247, + "learning_rate": 1.7170079432736945e-05, + "loss": 0.1503, + "step": 16048 + }, + { + "epoch": 0.321, + "grad_norm": 0.3275274932384491, + "learning_rate": 1.716910607650483e-05, + "loss": 0.0628, + "step": 16050 + }, + { + "epoch": 0.32104, + "grad_norm": 1.2497047185897827, + "learning_rate": 1.7168132580507298e-05, + "loss": 0.08, + "step": 16052 + }, + { + "epoch": 0.32108, + "grad_norm": 1.3920304775238037, + "learning_rate": 1.7167158944763337e-05, + "loss": 0.035, + "step": 16054 + }, + { + "epoch": 0.32112, + "grad_norm": 4.419384479522705, + "learning_rate": 1.716618516929192e-05, + "loss": 0.3385, + "step": 16056 + }, + { + "epoch": 0.32116, + "grad_norm": 0.4665645658969879, + "learning_rate": 1.7165211254112032e-05, + "loss": 0.0732, + "step": 16058 + }, + { + "epoch": 0.3212, + "grad_norm": 1.265932321548462, + "learning_rate": 1.716423719924266e-05, + "loss": 0.1801, + "step": 16060 + }, + { + "epoch": 0.32124, + "grad_norm": 3.544856071472168, + "learning_rate": 1.71632630047028e-05, + "loss": 0.4704, + "step": 16062 + }, + { + "epoch": 0.32128, + "grad_norm": 1.6911847591400146, + "learning_rate": 1.7162288670511434e-05, + "loss": 0.3888, + "step": 16064 + }, + { + "epoch": 0.32132, + "grad_norm": 3.146268367767334, + "learning_rate": 1.7161314196687572e-05, + "loss": 0.212, + "step": 16066 + }, + { + "epoch": 0.32136, + "grad_norm": 0.5026207566261292, + "learning_rate": 1.7160339583250193e-05, + "loss": 0.1114, + "step": 16068 + }, + { + "epoch": 0.3214, + "grad_norm": 2.266026020050049, + "learning_rate": 1.7159364830218312e-05, + "loss": 0.1135, + "step": 16070 + }, + { + "epoch": 0.32144, + "grad_norm": 1.5932385921478271, + "learning_rate": 1.715838993761093e-05, + "loss": 0.3253, + "step": 16072 + }, + { + "epoch": 0.32148, + "grad_norm": 1.2698506116867065, + "learning_rate": 1.7157414905447047e-05, + "loss": 0.068, + "step": 16074 + }, + { + "epoch": 0.32152, + "grad_norm": 2.6855216026306152, + "learning_rate": 1.715643973374568e-05, + "loss": 0.1459, + "step": 16076 + }, + { + "epoch": 0.32156, + "grad_norm": 0.7530030012130737, + "learning_rate": 1.7155464422525828e-05, + "loss": 0.0389, + "step": 16078 + }, + { + "epoch": 0.3216, + "grad_norm": 2.1283938884735107, + "learning_rate": 1.715448897180652e-05, + "loss": 0.1441, + "step": 16080 + }, + { + "epoch": 0.32164, + "grad_norm": 0.5409755110740662, + "learning_rate": 1.7153513381606764e-05, + "loss": 0.131, + "step": 16082 + }, + { + "epoch": 0.32168, + "grad_norm": 0.8860979080200195, + "learning_rate": 1.7152537651945583e-05, + "loss": 0.0641, + "step": 16084 + }, + { + "epoch": 0.32172, + "grad_norm": 0.31773701310157776, + "learning_rate": 1.7151561782841996e-05, + "loss": 0.0608, + "step": 16086 + }, + { + "epoch": 0.32176, + "grad_norm": 1.6014044284820557, + "learning_rate": 1.715058577431503e-05, + "loss": 0.0657, + "step": 16088 + }, + { + "epoch": 0.3218, + "grad_norm": 1.1043497323989868, + "learning_rate": 1.7149609626383718e-05, + "loss": 0.0794, + "step": 16090 + }, + { + "epoch": 0.32184, + "grad_norm": 0.22107000648975372, + "learning_rate": 1.714863333906708e-05, + "loss": 0.0173, + "step": 16092 + }, + { + "epoch": 0.32188, + "grad_norm": 0.4537990093231201, + "learning_rate": 1.714765691238416e-05, + "loss": 0.0308, + "step": 16094 + }, + { + "epoch": 0.32192, + "grad_norm": 0.4509681761264801, + "learning_rate": 1.714668034635398e-05, + "loss": 0.0272, + "step": 16096 + }, + { + "epoch": 0.32196, + "grad_norm": 0.28565874695777893, + "learning_rate": 1.7145703640995594e-05, + "loss": 0.1819, + "step": 16098 + }, + { + "epoch": 0.322, + "grad_norm": 3.1052944660186768, + "learning_rate": 1.7144726796328034e-05, + "loss": 0.13, + "step": 16100 + }, + { + "epoch": 0.32204, + "grad_norm": 3.82970929145813, + "learning_rate": 1.714374981237035e-05, + "loss": 0.2557, + "step": 16102 + }, + { + "epoch": 0.32208, + "grad_norm": 0.2585867941379547, + "learning_rate": 1.7142772689141582e-05, + "loss": 0.0165, + "step": 16104 + }, + { + "epoch": 0.32212, + "grad_norm": 2.202338695526123, + "learning_rate": 1.7141795426660784e-05, + "loss": 0.078, + "step": 16106 + }, + { + "epoch": 0.32216, + "grad_norm": 3.34578800201416, + "learning_rate": 1.7140818024947007e-05, + "loss": 0.1087, + "step": 16108 + }, + { + "epoch": 0.3222, + "grad_norm": 0.08114609122276306, + "learning_rate": 1.713984048401931e-05, + "loss": 0.0135, + "step": 16110 + }, + { + "epoch": 0.32224, + "grad_norm": 4.76944637298584, + "learning_rate": 1.7138862803896743e-05, + "loss": 0.1757, + "step": 16112 + }, + { + "epoch": 0.32228, + "grad_norm": 0.8807359933853149, + "learning_rate": 1.7137884984598372e-05, + "loss": 0.0271, + "step": 16114 + }, + { + "epoch": 0.32232, + "grad_norm": 0.023580588400363922, + "learning_rate": 1.7136907026143256e-05, + "loss": 0.0087, + "step": 16116 + }, + { + "epoch": 0.32236, + "grad_norm": 5.35784912109375, + "learning_rate": 1.7135928928550466e-05, + "loss": 0.5953, + "step": 16118 + }, + { + "epoch": 0.3224, + "grad_norm": 8.564976692199707, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.1943, + "step": 16120 + }, + { + "epoch": 0.32244, + "grad_norm": 3.8104562759399414, + "learning_rate": 1.713397231602813e-05, + "loss": 0.1297, + "step": 16122 + }, + { + "epoch": 0.32248, + "grad_norm": 5.1338300704956055, + "learning_rate": 1.713299380113673e-05, + "loss": 0.3138, + "step": 16124 + }, + { + "epoch": 0.32252, + "grad_norm": 3.0016963481903076, + "learning_rate": 1.7132015147183944e-05, + "loss": 0.0835, + "step": 16126 + }, + { + "epoch": 0.32256, + "grad_norm": 0.5856846570968628, + "learning_rate": 1.713103635418885e-05, + "loss": 0.0869, + "step": 16128 + }, + { + "epoch": 0.3226, + "grad_norm": 0.11773691326379776, + "learning_rate": 1.713005742217053e-05, + "loss": 0.0397, + "step": 16130 + }, + { + "epoch": 0.32264, + "grad_norm": 0.619381308555603, + "learning_rate": 1.712907835114807e-05, + "loss": 0.0286, + "step": 16132 + }, + { + "epoch": 0.32268, + "grad_norm": 4.597063064575195, + "learning_rate": 1.712809914114056e-05, + "loss": 0.2514, + "step": 16134 + }, + { + "epoch": 0.32272, + "grad_norm": 0.09251407533884048, + "learning_rate": 1.712711979216709e-05, + "loss": 0.0495, + "step": 16136 + }, + { + "epoch": 0.32276, + "grad_norm": 0.6138032078742981, + "learning_rate": 1.7126140304246745e-05, + "loss": 0.0897, + "step": 16138 + }, + { + "epoch": 0.3228, + "grad_norm": 0.1791575402021408, + "learning_rate": 1.7125160677398625e-05, + "loss": 0.2701, + "step": 16140 + }, + { + "epoch": 0.32284, + "grad_norm": 0.287889301776886, + "learning_rate": 1.7124180911641835e-05, + "loss": 0.1962, + "step": 16142 + }, + { + "epoch": 0.32288, + "grad_norm": 0.26359036564826965, + "learning_rate": 1.7123201006995467e-05, + "loss": 0.017, + "step": 16144 + }, + { + "epoch": 0.32292, + "grad_norm": 0.3813267648220062, + "learning_rate": 1.7122220963478627e-05, + "loss": 0.0149, + "step": 16146 + }, + { + "epoch": 0.32296, + "grad_norm": 3.405272960662842, + "learning_rate": 1.7121240781110424e-05, + "loss": 0.0925, + "step": 16148 + }, + { + "epoch": 0.323, + "grad_norm": 4.147353172302246, + "learning_rate": 1.712026045990997e-05, + "loss": 0.1437, + "step": 16150 + }, + { + "epoch": 0.32304, + "grad_norm": 1.2576863765716553, + "learning_rate": 1.7119279999896363e-05, + "loss": 0.0568, + "step": 16152 + }, + { + "epoch": 0.32308, + "grad_norm": 0.8773243427276611, + "learning_rate": 1.7118299401088734e-05, + "loss": 0.0727, + "step": 16154 + }, + { + "epoch": 0.32312, + "grad_norm": 3.0079121589660645, + "learning_rate": 1.7117318663506192e-05, + "loss": 0.4422, + "step": 16156 + }, + { + "epoch": 0.32316, + "grad_norm": 0.318135142326355, + "learning_rate": 1.711633778716786e-05, + "loss": 0.0137, + "step": 16158 + }, + { + "epoch": 0.3232, + "grad_norm": 5.994028568267822, + "learning_rate": 1.7115356772092858e-05, + "loss": 0.4337, + "step": 16160 + }, + { + "epoch": 0.32324, + "grad_norm": 0.9951370358467102, + "learning_rate": 1.7114375618300307e-05, + "loss": 0.0609, + "step": 16162 + }, + { + "epoch": 0.32328, + "grad_norm": 0.3285679221153259, + "learning_rate": 1.7113394325809348e-05, + "loss": 0.3258, + "step": 16164 + }, + { + "epoch": 0.32332, + "grad_norm": 1.0817328691482544, + "learning_rate": 1.71124128946391e-05, + "loss": 0.0693, + "step": 16166 + }, + { + "epoch": 0.32336, + "grad_norm": 1.9155687093734741, + "learning_rate": 1.7111431324808704e-05, + "loss": 0.095, + "step": 16168 + }, + { + "epoch": 0.3234, + "grad_norm": 1.6288319826126099, + "learning_rate": 1.711044961633729e-05, + "loss": 0.1061, + "step": 16170 + }, + { + "epoch": 0.32344, + "grad_norm": 0.5561500191688538, + "learning_rate": 1.7109467769244005e-05, + "loss": 0.0471, + "step": 16172 + }, + { + "epoch": 0.32348, + "grad_norm": 1.658542513847351, + "learning_rate": 1.710848578354798e-05, + "loss": 0.0485, + "step": 16174 + }, + { + "epoch": 0.32352, + "grad_norm": 2.153235912322998, + "learning_rate": 1.710750365926837e-05, + "loss": 0.0666, + "step": 16176 + }, + { + "epoch": 0.32356, + "grad_norm": 0.858582615852356, + "learning_rate": 1.710652139642431e-05, + "loss": 0.0512, + "step": 16178 + }, + { + "epoch": 0.3236, + "grad_norm": 3.277390241622925, + "learning_rate": 1.710553899503496e-05, + "loss": 0.2685, + "step": 16180 + }, + { + "epoch": 0.32364, + "grad_norm": 0.010445878840982914, + "learning_rate": 1.710455645511947e-05, + "loss": 0.0094, + "step": 16182 + }, + { + "epoch": 0.32368, + "grad_norm": 1.600502610206604, + "learning_rate": 1.7103573776696998e-05, + "loss": 0.0416, + "step": 16184 + }, + { + "epoch": 0.32372, + "grad_norm": 0.49528589844703674, + "learning_rate": 1.7102590959786694e-05, + "loss": 0.0323, + "step": 16186 + }, + { + "epoch": 0.32376, + "grad_norm": 3.096388101577759, + "learning_rate": 1.7101608004407723e-05, + "loss": 0.1641, + "step": 16188 + }, + { + "epoch": 0.3238, + "grad_norm": 0.19158527255058289, + "learning_rate": 1.710062491057925e-05, + "loss": 0.0103, + "step": 16190 + }, + { + "epoch": 0.32384, + "grad_norm": 0.49462875723838806, + "learning_rate": 1.7099641678320434e-05, + "loss": 0.1196, + "step": 16192 + }, + { + "epoch": 0.32388, + "grad_norm": 1.8107792139053345, + "learning_rate": 1.7098658307650452e-05, + "loss": 0.0827, + "step": 16194 + }, + { + "epoch": 0.32392, + "grad_norm": 3.4889211654663086, + "learning_rate": 1.709767479858847e-05, + "loss": 0.1056, + "step": 16196 + }, + { + "epoch": 0.32396, + "grad_norm": 0.1274157017469406, + "learning_rate": 1.7096691151153664e-05, + "loss": 0.0499, + "step": 16198 + }, + { + "epoch": 0.324, + "grad_norm": 0.6775373816490173, + "learning_rate": 1.709570736536521e-05, + "loss": 0.0507, + "step": 16200 + }, + { + "epoch": 0.32404, + "grad_norm": 0.5393198132514954, + "learning_rate": 1.709472344124229e-05, + "loss": 0.2068, + "step": 16202 + }, + { + "epoch": 0.32408, + "grad_norm": 0.012266800738871098, + "learning_rate": 1.7093739378804083e-05, + "loss": 0.2511, + "step": 16204 + }, + { + "epoch": 0.32412, + "grad_norm": 3.115396738052368, + "learning_rate": 1.709275517806977e-05, + "loss": 0.1055, + "step": 16206 + }, + { + "epoch": 0.32416, + "grad_norm": 0.07142937928438187, + "learning_rate": 1.709177083905855e-05, + "loss": 0.0314, + "step": 16208 + }, + { + "epoch": 0.3242, + "grad_norm": 2.726477861404419, + "learning_rate": 1.7090786361789602e-05, + "loss": 0.0659, + "step": 16210 + }, + { + "epoch": 0.32424, + "grad_norm": 6.84682035446167, + "learning_rate": 1.7089801746282125e-05, + "loss": 0.4705, + "step": 16212 + }, + { + "epoch": 0.32428, + "grad_norm": 5.434685707092285, + "learning_rate": 1.7088816992555314e-05, + "loss": 0.4068, + "step": 16214 + }, + { + "epoch": 0.32432, + "grad_norm": 1.6554923057556152, + "learning_rate": 1.7087832100628365e-05, + "loss": 0.0643, + "step": 16216 + }, + { + "epoch": 0.32436, + "grad_norm": 6.641782283782959, + "learning_rate": 1.708684707052048e-05, + "loss": 0.2228, + "step": 16218 + }, + { + "epoch": 0.3244, + "grad_norm": 1.645523190498352, + "learning_rate": 1.7085861902250864e-05, + "loss": 0.1295, + "step": 16220 + }, + { + "epoch": 0.32444, + "grad_norm": 1.7489758729934692, + "learning_rate": 1.708487659583872e-05, + "loss": 0.0725, + "step": 16222 + }, + { + "epoch": 0.32448, + "grad_norm": 0.3988535702228546, + "learning_rate": 1.708389115130326e-05, + "loss": 0.0395, + "step": 16224 + }, + { + "epoch": 0.32452, + "grad_norm": 4.040034770965576, + "learning_rate": 1.7082905568663696e-05, + "loss": 0.1568, + "step": 16226 + }, + { + "epoch": 0.32456, + "grad_norm": 5.795557975769043, + "learning_rate": 1.708191984793924e-05, + "loss": 0.136, + "step": 16228 + }, + { + "epoch": 0.3246, + "grad_norm": 0.4960711896419525, + "learning_rate": 1.7080933989149112e-05, + "loss": 0.2147, + "step": 16230 + }, + { + "epoch": 0.32464, + "grad_norm": 3.600579261779785, + "learning_rate": 1.707994799231253e-05, + "loss": 0.2138, + "step": 16232 + }, + { + "epoch": 0.32468, + "grad_norm": 0.18769556283950806, + "learning_rate": 1.7078961857448715e-05, + "loss": 0.0522, + "step": 16234 + }, + { + "epoch": 0.32472, + "grad_norm": 0.5941236615180969, + "learning_rate": 1.7077975584576895e-05, + "loss": 0.0235, + "step": 16236 + }, + { + "epoch": 0.32476, + "grad_norm": 3.4103808403015137, + "learning_rate": 1.70769891737163e-05, + "loss": 0.1433, + "step": 16238 + }, + { + "epoch": 0.3248, + "grad_norm": 1.7883304357528687, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.1377, + "step": 16240 + }, + { + "epoch": 0.32484, + "grad_norm": 0.7579551935195923, + "learning_rate": 1.7075015938105695e-05, + "loss": 0.019, + "step": 16242 + }, + { + "epoch": 0.32488, + "grad_norm": 1.8712331056594849, + "learning_rate": 1.7074029113394156e-05, + "loss": 0.0602, + "step": 16244 + }, + { + "epoch": 0.32492, + "grad_norm": 6.2287068367004395, + "learning_rate": 1.7073042150770783e-05, + "loss": 0.4734, + "step": 16246 + }, + { + "epoch": 0.32496, + "grad_norm": 2.315816879272461, + "learning_rate": 1.707205505025481e-05, + "loss": 0.0769, + "step": 16248 + }, + { + "epoch": 0.325, + "grad_norm": 0.03574265539646149, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.004, + "step": 16250 + }, + { + "epoch": 0.32504, + "grad_norm": 4.804280757904053, + "learning_rate": 1.7070080435622038e-05, + "loss": 0.1167, + "step": 16252 + }, + { + "epoch": 0.32508, + "grad_norm": 0.1387215554714203, + "learning_rate": 1.7069092921543746e-05, + "loss": 0.0449, + "step": 16254 + }, + { + "epoch": 0.32512, + "grad_norm": 4.903221607208252, + "learning_rate": 1.7068105269649846e-05, + "loss": 0.4341, + "step": 16256 + }, + { + "epoch": 0.32516, + "grad_norm": 0.8874146938323975, + "learning_rate": 1.7067117479959593e-05, + "loss": 0.1506, + "step": 16258 + }, + { + "epoch": 0.3252, + "grad_norm": 0.9925326108932495, + "learning_rate": 1.706612955249225e-05, + "loss": 0.0552, + "step": 16260 + }, + { + "epoch": 0.32524, + "grad_norm": 4.535938262939453, + "learning_rate": 1.706514148726707e-05, + "loss": 0.1919, + "step": 16262 + }, + { + "epoch": 0.32528, + "grad_norm": 0.19841808080673218, + "learning_rate": 1.7064153284303322e-05, + "loss": 0.0124, + "step": 16264 + }, + { + "epoch": 0.32532, + "grad_norm": 1.582801342010498, + "learning_rate": 1.7063164943620268e-05, + "loss": 0.197, + "step": 16266 + }, + { + "epoch": 0.32536, + "grad_norm": 1.0979560613632202, + "learning_rate": 1.7062176465237175e-05, + "loss": 0.081, + "step": 16268 + }, + { + "epoch": 0.3254, + "grad_norm": 1.5198105573654175, + "learning_rate": 1.7061187849173318e-05, + "loss": 0.0443, + "step": 16270 + }, + { + "epoch": 0.32544, + "grad_norm": 6.339849472045898, + "learning_rate": 1.7060199095447966e-05, + "loss": 0.3578, + "step": 16272 + }, + { + "epoch": 0.32548, + "grad_norm": 0.44778671860694885, + "learning_rate": 1.70592102040804e-05, + "loss": 0.0242, + "step": 16274 + }, + { + "epoch": 0.32552, + "grad_norm": 3.003749370574951, + "learning_rate": 1.70582211750899e-05, + "loss": 0.0961, + "step": 16276 + }, + { + "epoch": 0.32556, + "grad_norm": 0.18879765272140503, + "learning_rate": 1.7057232008495738e-05, + "loss": 0.0228, + "step": 16278 + }, + { + "epoch": 0.3256, + "grad_norm": 6.073958873748779, + "learning_rate": 1.705624270431721e-05, + "loss": 0.0799, + "step": 16280 + }, + { + "epoch": 0.32564, + "grad_norm": 0.5624605417251587, + "learning_rate": 1.7055253262573594e-05, + "loss": 0.0152, + "step": 16282 + }, + { + "epoch": 0.32568, + "grad_norm": 2.186891555786133, + "learning_rate": 1.7054263683284185e-05, + "loss": 0.0887, + "step": 16284 + }, + { + "epoch": 0.32572, + "grad_norm": 1.7874258756637573, + "learning_rate": 1.705327396646827e-05, + "loss": 0.042, + "step": 16286 + }, + { + "epoch": 0.32576, + "grad_norm": 0.18481609225273132, + "learning_rate": 1.7052284112145155e-05, + "loss": 0.0412, + "step": 16288 + }, + { + "epoch": 0.3258, + "grad_norm": 7.120002269744873, + "learning_rate": 1.7051294120334126e-05, + "loss": 0.7473, + "step": 16290 + }, + { + "epoch": 0.32584, + "grad_norm": 1.86830472946167, + "learning_rate": 1.7050303991054488e-05, + "loss": 0.5659, + "step": 16292 + }, + { + "epoch": 0.32588, + "grad_norm": 5.762758255004883, + "learning_rate": 1.7049313724325546e-05, + "loss": 0.3309, + "step": 16294 + }, + { + "epoch": 0.32592, + "grad_norm": 4.470278263092041, + "learning_rate": 1.7048323320166603e-05, + "loss": 0.2689, + "step": 16296 + }, + { + "epoch": 0.32596, + "grad_norm": 1.51018488407135, + "learning_rate": 1.7047332778596967e-05, + "loss": 0.0893, + "step": 16298 + }, + { + "epoch": 0.326, + "grad_norm": 0.7399940490722656, + "learning_rate": 1.7046342099635948e-05, + "loss": 0.0213, + "step": 16300 + }, + { + "epoch": 0.32604, + "grad_norm": 8.485127449035645, + "learning_rate": 1.7045351283302865e-05, + "loss": 0.3796, + "step": 16302 + }, + { + "epoch": 0.32608, + "grad_norm": 3.692599058151245, + "learning_rate": 1.704436032961703e-05, + "loss": 0.1389, + "step": 16304 + }, + { + "epoch": 0.32612, + "grad_norm": 0.3924711048603058, + "learning_rate": 1.7043369238597765e-05, + "loss": 0.0123, + "step": 16306 + }, + { + "epoch": 0.32616, + "grad_norm": 0.11972848325967789, + "learning_rate": 1.704237801026439e-05, + "loss": 0.0453, + "step": 16308 + }, + { + "epoch": 0.3262, + "grad_norm": 0.48979291319847107, + "learning_rate": 1.704138664463623e-05, + "loss": 0.0495, + "step": 16310 + }, + { + "epoch": 0.32624, + "grad_norm": 0.6737412810325623, + "learning_rate": 1.7040395141732614e-05, + "loss": 0.1153, + "step": 16312 + }, + { + "epoch": 0.32628, + "grad_norm": 3.7763445377349854, + "learning_rate": 1.703940350157287e-05, + "loss": 0.1628, + "step": 16314 + }, + { + "epoch": 0.32632, + "grad_norm": 3.179725408554077, + "learning_rate": 1.703841172417633e-05, + "loss": 0.0896, + "step": 16316 + }, + { + "epoch": 0.32636, + "grad_norm": 0.1661628931760788, + "learning_rate": 1.7037419809562326e-05, + "loss": 0.0319, + "step": 16318 + }, + { + "epoch": 0.3264, + "grad_norm": 5.179080963134766, + "learning_rate": 1.7036427757750205e-05, + "loss": 0.3539, + "step": 16320 + }, + { + "epoch": 0.32644, + "grad_norm": 3.6181561946868896, + "learning_rate": 1.70354355687593e-05, + "loss": 0.1424, + "step": 16322 + }, + { + "epoch": 0.32648, + "grad_norm": 1.8368514776229858, + "learning_rate": 1.7034443242608957e-05, + "loss": 0.0823, + "step": 16324 + }, + { + "epoch": 0.32652, + "grad_norm": 2.846884250640869, + "learning_rate": 1.703345077931852e-05, + "loss": 0.1603, + "step": 16326 + }, + { + "epoch": 0.32656, + "grad_norm": 0.7028946280479431, + "learning_rate": 1.703245817890734e-05, + "loss": 0.0235, + "step": 16328 + }, + { + "epoch": 0.3266, + "grad_norm": 0.6736174821853638, + "learning_rate": 1.7031465441394766e-05, + "loss": 0.0207, + "step": 16330 + }, + { + "epoch": 0.32664, + "grad_norm": 2.4986026287078857, + "learning_rate": 1.7030472566800156e-05, + "loss": 0.2293, + "step": 16332 + }, + { + "epoch": 0.32668, + "grad_norm": 0.3261864483356476, + "learning_rate": 1.7029479555142862e-05, + "loss": 0.0272, + "step": 16334 + }, + { + "epoch": 0.32672, + "grad_norm": 0.5758706331253052, + "learning_rate": 1.7028486406442247e-05, + "loss": 0.039, + "step": 16336 + }, + { + "epoch": 0.32676, + "grad_norm": 2.8246476650238037, + "learning_rate": 1.7027493120717665e-05, + "loss": 0.3803, + "step": 16338 + }, + { + "epoch": 0.3268, + "grad_norm": 1.1458014249801636, + "learning_rate": 1.7026499697988496e-05, + "loss": 0.0985, + "step": 16340 + }, + { + "epoch": 0.32684, + "grad_norm": 0.03861589729785919, + "learning_rate": 1.702550613827409e-05, + "loss": 0.0657, + "step": 16342 + }, + { + "epoch": 0.32688, + "grad_norm": 0.8070500493049622, + "learning_rate": 1.702451244159383e-05, + "loss": 0.0325, + "step": 16344 + }, + { + "epoch": 0.32692, + "grad_norm": 1.3693742752075195, + "learning_rate": 1.702351860796708e-05, + "loss": 0.2098, + "step": 16346 + }, + { + "epoch": 0.32696, + "grad_norm": 4.401216506958008, + "learning_rate": 1.702252463741322e-05, + "loss": 0.2296, + "step": 16348 + }, + { + "epoch": 0.327, + "grad_norm": 0.31040382385253906, + "learning_rate": 1.7021530529951627e-05, + "loss": 0.0286, + "step": 16350 + }, + { + "epoch": 0.32704, + "grad_norm": 0.4792312681674957, + "learning_rate": 1.702053628560168e-05, + "loss": 0.0163, + "step": 16352 + }, + { + "epoch": 0.32708, + "grad_norm": 0.09704890847206116, + "learning_rate": 1.7019541904382767e-05, + "loss": 0.0123, + "step": 16354 + }, + { + "epoch": 0.32712, + "grad_norm": 5.094161510467529, + "learning_rate": 1.7018547386314267e-05, + "loss": 0.198, + "step": 16356 + }, + { + "epoch": 0.32716, + "grad_norm": 0.23315563797950745, + "learning_rate": 1.7017552731415576e-05, + "loss": 0.0428, + "step": 16358 + }, + { + "epoch": 0.3272, + "grad_norm": 4.993490695953369, + "learning_rate": 1.7016557939706075e-05, + "loss": 0.3997, + "step": 16360 + }, + { + "epoch": 0.32724, + "grad_norm": 4.963287353515625, + "learning_rate": 1.701556301120517e-05, + "loss": 0.1989, + "step": 16362 + }, + { + "epoch": 0.32728, + "grad_norm": 0.923456072807312, + "learning_rate": 1.701456794593225e-05, + "loss": 0.4409, + "step": 16364 + }, + { + "epoch": 0.32732, + "grad_norm": 4.970625400543213, + "learning_rate": 1.701357274390672e-05, + "loss": 0.3921, + "step": 16366 + }, + { + "epoch": 0.32736, + "grad_norm": 0.9597758650779724, + "learning_rate": 1.7012577405147975e-05, + "loss": 0.1151, + "step": 16368 + }, + { + "epoch": 0.3274, + "grad_norm": 3.3839354515075684, + "learning_rate": 1.7011581929675424e-05, + "loss": 0.1512, + "step": 16370 + }, + { + "epoch": 0.32744, + "grad_norm": 1.2971267700195312, + "learning_rate": 1.7010586317508475e-05, + "loss": 0.2277, + "step": 16372 + }, + { + "epoch": 0.32748, + "grad_norm": 1.1341314315795898, + "learning_rate": 1.7009590568666533e-05, + "loss": 0.0884, + "step": 16374 + }, + { + "epoch": 0.32752, + "grad_norm": 2.0889220237731934, + "learning_rate": 1.7008594683169018e-05, + "loss": 0.1806, + "step": 16376 + }, + { + "epoch": 0.32756, + "grad_norm": 2.1786954402923584, + "learning_rate": 1.700759866103534e-05, + "loss": 0.0589, + "step": 16378 + }, + { + "epoch": 0.3276, + "grad_norm": 0.7665202021598816, + "learning_rate": 1.700660250228492e-05, + "loss": 0.0246, + "step": 16380 + }, + { + "epoch": 0.32764, + "grad_norm": 1.3824220895767212, + "learning_rate": 1.7005606206937174e-05, + "loss": 0.2181, + "step": 16382 + }, + { + "epoch": 0.32768, + "grad_norm": 0.20474602282047272, + "learning_rate": 1.700460977501153e-05, + "loss": 0.0635, + "step": 16384 + }, + { + "epoch": 0.32772, + "grad_norm": 3.744617462158203, + "learning_rate": 1.7003613206527412e-05, + "loss": 0.1528, + "step": 16386 + }, + { + "epoch": 0.32776, + "grad_norm": 0.08727017790079117, + "learning_rate": 1.7002616501504246e-05, + "loss": 0.0743, + "step": 16388 + }, + { + "epoch": 0.3278, + "grad_norm": 0.9983001947402954, + "learning_rate": 1.7001619659961467e-05, + "loss": 0.041, + "step": 16390 + }, + { + "epoch": 0.32784, + "grad_norm": 1.7175920009613037, + "learning_rate": 1.7000622681918512e-05, + "loss": 0.1053, + "step": 16392 + }, + { + "epoch": 0.32788, + "grad_norm": 0.32887861132621765, + "learning_rate": 1.6999625567394812e-05, + "loss": 0.0182, + "step": 16394 + }, + { + "epoch": 0.32792, + "grad_norm": 0.30104249715805054, + "learning_rate": 1.6998628316409808e-05, + "loss": 0.2427, + "step": 16396 + }, + { + "epoch": 0.32796, + "grad_norm": 0.6140630841255188, + "learning_rate": 1.699763092898294e-05, + "loss": 0.1417, + "step": 16398 + }, + { + "epoch": 0.328, + "grad_norm": 6.742161750793457, + "learning_rate": 1.6996633405133656e-05, + "loss": 0.4537, + "step": 16400 + }, + { + "epoch": 0.32804, + "grad_norm": 0.2739720642566681, + "learning_rate": 1.6995635744881403e-05, + "loss": 0.2752, + "step": 16402 + }, + { + "epoch": 0.32808, + "grad_norm": 1.5056416988372803, + "learning_rate": 1.6994637948245625e-05, + "loss": 0.1853, + "step": 16404 + }, + { + "epoch": 0.32812, + "grad_norm": 4.012441158294678, + "learning_rate": 1.6993640015245782e-05, + "loss": 0.2863, + "step": 16406 + }, + { + "epoch": 0.32816, + "grad_norm": 0.28074806928634644, + "learning_rate": 1.6992641945901326e-05, + "loss": 0.013, + "step": 16408 + }, + { + "epoch": 0.3282, + "grad_norm": 2.7840282917022705, + "learning_rate": 1.6991643740231714e-05, + "loss": 0.1037, + "step": 16410 + }, + { + "epoch": 0.32824, + "grad_norm": 0.34590092301368713, + "learning_rate": 1.6990645398256412e-05, + "loss": 0.0273, + "step": 16412 + }, + { + "epoch": 0.32828, + "grad_norm": 2.7401342391967773, + "learning_rate": 1.6989646919994874e-05, + "loss": 0.1026, + "step": 16414 + }, + { + "epoch": 0.32832, + "grad_norm": 0.12371435761451721, + "learning_rate": 1.6988648305466572e-05, + "loss": 0.0997, + "step": 16416 + }, + { + "epoch": 0.32836, + "grad_norm": 0.4464220106601715, + "learning_rate": 1.6987649554690975e-05, + "loss": 0.0321, + "step": 16418 + }, + { + "epoch": 0.3284, + "grad_norm": 3.8840529918670654, + "learning_rate": 1.6986650667687552e-05, + "loss": 0.2344, + "step": 16420 + }, + { + "epoch": 0.32844, + "grad_norm": 2.0998456478118896, + "learning_rate": 1.6985651644475777e-05, + "loss": 0.0835, + "step": 16422 + }, + { + "epoch": 0.32848, + "grad_norm": 0.5181192755699158, + "learning_rate": 1.6984652485075127e-05, + "loss": 0.036, + "step": 16424 + }, + { + "epoch": 0.32852, + "grad_norm": 1.020400047302246, + "learning_rate": 1.698365318950508e-05, + "loss": 0.062, + "step": 16426 + }, + { + "epoch": 0.32856, + "grad_norm": 0.8587630987167358, + "learning_rate": 1.698265375778512e-05, + "loss": 0.0617, + "step": 16428 + }, + { + "epoch": 0.3286, + "grad_norm": 0.772904634475708, + "learning_rate": 1.698165418993473e-05, + "loss": 0.0837, + "step": 16430 + }, + { + "epoch": 0.32864, + "grad_norm": 0.7367000579833984, + "learning_rate": 1.6980654485973396e-05, + "loss": 0.0195, + "step": 16432 + }, + { + "epoch": 0.32868, + "grad_norm": 2.197821617126465, + "learning_rate": 1.697965464592061e-05, + "loss": 0.0661, + "step": 16434 + }, + { + "epoch": 0.32872, + "grad_norm": 4.12138557434082, + "learning_rate": 1.697865466979586e-05, + "loss": 0.4464, + "step": 16436 + }, + { + "epoch": 0.32876, + "grad_norm": 0.19090653955936432, + "learning_rate": 1.6977654557618648e-05, + "loss": 0.3969, + "step": 16438 + }, + { + "epoch": 0.3288, + "grad_norm": 9.243576049804688, + "learning_rate": 1.6976654309408464e-05, + "loss": 0.3772, + "step": 16440 + }, + { + "epoch": 0.32884, + "grad_norm": 0.3260861933231354, + "learning_rate": 1.6975653925184817e-05, + "loss": 0.2889, + "step": 16442 + }, + { + "epoch": 0.32888, + "grad_norm": 0.7095568180084229, + "learning_rate": 1.6974653404967204e-05, + "loss": 0.057, + "step": 16444 + }, + { + "epoch": 0.32892, + "grad_norm": 0.46357280015945435, + "learning_rate": 1.6973652748775135e-05, + "loss": 0.0268, + "step": 16446 + }, + { + "epoch": 0.32896, + "grad_norm": 2.449434518814087, + "learning_rate": 1.6972651956628108e-05, + "loss": 0.161, + "step": 16448 + }, + { + "epoch": 0.329, + "grad_norm": 0.09879753738641739, + "learning_rate": 1.697165102854565e-05, + "loss": 0.0293, + "step": 16450 + }, + { + "epoch": 0.32904, + "grad_norm": 0.2873331606388092, + "learning_rate": 1.697064996454726e-05, + "loss": 0.0543, + "step": 16452 + }, + { + "epoch": 0.32908, + "grad_norm": 0.22737549245357513, + "learning_rate": 1.6969648764652463e-05, + "loss": 0.1436, + "step": 16454 + }, + { + "epoch": 0.32912, + "grad_norm": 5.459818363189697, + "learning_rate": 1.696864742888077e-05, + "loss": 0.3515, + "step": 16456 + }, + { + "epoch": 0.32916, + "grad_norm": 2.469439744949341, + "learning_rate": 1.696764595725171e-05, + "loss": 0.0674, + "step": 16458 + }, + { + "epoch": 0.3292, + "grad_norm": 0.31971773505210876, + "learning_rate": 1.696664434978481e-05, + "loss": 0.7201, + "step": 16460 + }, + { + "epoch": 0.32924, + "grad_norm": 4.484799861907959, + "learning_rate": 1.6965642606499586e-05, + "loss": 0.2424, + "step": 16462 + }, + { + "epoch": 0.32928, + "grad_norm": 0.8409584760665894, + "learning_rate": 1.6964640727415574e-05, + "loss": 0.2255, + "step": 16464 + }, + { + "epoch": 0.32932, + "grad_norm": 0.8655370473861694, + "learning_rate": 1.696363871255231e-05, + "loss": 0.1568, + "step": 16466 + }, + { + "epoch": 0.32936, + "grad_norm": 0.73469078540802, + "learning_rate": 1.6962636561929322e-05, + "loss": 0.0481, + "step": 16468 + }, + { + "epoch": 0.3294, + "grad_norm": 1.0996536016464233, + "learning_rate": 1.6961634275566147e-05, + "loss": 0.0849, + "step": 16470 + }, + { + "epoch": 0.32944, + "grad_norm": 4.250053405761719, + "learning_rate": 1.696063185348233e-05, + "loss": 0.2463, + "step": 16472 + }, + { + "epoch": 0.32948, + "grad_norm": 0.35456550121307373, + "learning_rate": 1.695962929569741e-05, + "loss": 0.0309, + "step": 16474 + }, + { + "epoch": 0.32952, + "grad_norm": 1.2363214492797852, + "learning_rate": 1.6958626602230934e-05, + "loss": 0.0484, + "step": 16476 + }, + { + "epoch": 0.32956, + "grad_norm": 4.794445037841797, + "learning_rate": 1.6957623773102453e-05, + "loss": 0.3768, + "step": 16478 + }, + { + "epoch": 0.3296, + "grad_norm": 2.6009891033172607, + "learning_rate": 1.695662080833151e-05, + "loss": 0.1263, + "step": 16480 + }, + { + "epoch": 0.32964, + "grad_norm": 0.700455904006958, + "learning_rate": 1.6955617707937663e-05, + "loss": 0.1272, + "step": 16482 + }, + { + "epoch": 0.32968, + "grad_norm": 2.2975358963012695, + "learning_rate": 1.695461447194047e-05, + "loss": 0.2834, + "step": 16484 + }, + { + "epoch": 0.32972, + "grad_norm": 2.975135564804077, + "learning_rate": 1.6953611100359486e-05, + "loss": 0.1399, + "step": 16486 + }, + { + "epoch": 0.32976, + "grad_norm": 1.0361666679382324, + "learning_rate": 1.6952607593214276e-05, + "loss": 0.0609, + "step": 16488 + }, + { + "epoch": 0.3298, + "grad_norm": 0.08327195048332214, + "learning_rate": 1.69516039505244e-05, + "loss": 0.1403, + "step": 16490 + }, + { + "epoch": 0.32984, + "grad_norm": 3.5848634243011475, + "learning_rate": 1.695060017230943e-05, + "loss": 0.3524, + "step": 16492 + }, + { + "epoch": 0.32988, + "grad_norm": 2.7130608558654785, + "learning_rate": 1.6949596258588923e-05, + "loss": 0.1509, + "step": 16494 + }, + { + "epoch": 0.32992, + "grad_norm": 1.2165616750717163, + "learning_rate": 1.6948592209382462e-05, + "loss": 0.0546, + "step": 16496 + }, + { + "epoch": 0.32996, + "grad_norm": 1.4334158897399902, + "learning_rate": 1.694758802470962e-05, + "loss": 0.0628, + "step": 16498 + }, + { + "epoch": 0.33, + "grad_norm": 0.11533254384994507, + "learning_rate": 1.6946583704589973e-05, + "loss": 0.1299, + "step": 16500 + }, + { + "epoch": 0.33004, + "grad_norm": 0.5349085330963135, + "learning_rate": 1.6945579249043104e-05, + "loss": 0.0282, + "step": 16502 + }, + { + "epoch": 0.33008, + "grad_norm": 5.3281660079956055, + "learning_rate": 1.6944574658088583e-05, + "loss": 0.2584, + "step": 16504 + }, + { + "epoch": 0.33012, + "grad_norm": 4.835336685180664, + "learning_rate": 1.694356993174601e-05, + "loss": 0.377, + "step": 16506 + }, + { + "epoch": 0.33016, + "grad_norm": 4.229154109954834, + "learning_rate": 1.6942565070034965e-05, + "loss": 0.3485, + "step": 16508 + }, + { + "epoch": 0.3302, + "grad_norm": 0.6270577907562256, + "learning_rate": 1.694156007297504e-05, + "loss": 0.2381, + "step": 16510 + }, + { + "epoch": 0.33024, + "grad_norm": 2.1517724990844727, + "learning_rate": 1.6940554940585828e-05, + "loss": 0.1321, + "step": 16512 + }, + { + "epoch": 0.33028, + "grad_norm": 1.258538842201233, + "learning_rate": 1.693954967288692e-05, + "loss": 0.0528, + "step": 16514 + }, + { + "epoch": 0.33032, + "grad_norm": 0.8241715431213379, + "learning_rate": 1.6938544269897925e-05, + "loss": 0.1095, + "step": 16516 + }, + { + "epoch": 0.33036, + "grad_norm": 1.751956582069397, + "learning_rate": 1.6937538731638432e-05, + "loss": 0.1117, + "step": 16518 + }, + { + "epoch": 0.3304, + "grad_norm": 1.2181658744812012, + "learning_rate": 1.693653305812805e-05, + "loss": 0.2421, + "step": 16520 + }, + { + "epoch": 0.33044, + "grad_norm": 1.1999409198760986, + "learning_rate": 1.693552724938639e-05, + "loss": 0.0665, + "step": 16522 + }, + { + "epoch": 0.33048, + "grad_norm": 1.2208713293075562, + "learning_rate": 1.693452130543305e-05, + "loss": 0.1212, + "step": 16524 + }, + { + "epoch": 0.33052, + "grad_norm": 3.304497241973877, + "learning_rate": 1.6933515226287648e-05, + "loss": 0.1852, + "step": 16526 + }, + { + "epoch": 0.33056, + "grad_norm": 2.8437745571136475, + "learning_rate": 1.6932509011969798e-05, + "loss": 0.0843, + "step": 16528 + }, + { + "epoch": 0.3306, + "grad_norm": 0.06503769755363464, + "learning_rate": 1.6931502662499116e-05, + "loss": 0.008, + "step": 16530 + }, + { + "epoch": 0.33064, + "grad_norm": 3.3610620498657227, + "learning_rate": 1.693049617789522e-05, + "loss": 0.1506, + "step": 16532 + }, + { + "epoch": 0.33068, + "grad_norm": 3.74802827835083, + "learning_rate": 1.6929489558177735e-05, + "loss": 0.1191, + "step": 16534 + }, + { + "epoch": 0.33072, + "grad_norm": 5.344501495361328, + "learning_rate": 1.6928482803366284e-05, + "loss": 0.3105, + "step": 16536 + }, + { + "epoch": 0.33076, + "grad_norm": 1.2355037927627563, + "learning_rate": 1.692747591348049e-05, + "loss": 0.0759, + "step": 16538 + }, + { + "epoch": 0.3308, + "grad_norm": 2.3904693126678467, + "learning_rate": 1.6926468888539988e-05, + "loss": 0.0711, + "step": 16540 + }, + { + "epoch": 0.33084, + "grad_norm": 0.8571620583534241, + "learning_rate": 1.692546172856441e-05, + "loss": 0.0271, + "step": 16542 + }, + { + "epoch": 0.33088, + "grad_norm": 0.49500951170921326, + "learning_rate": 1.6924454433573392e-05, + "loss": 0.0236, + "step": 16544 + }, + { + "epoch": 0.33092, + "grad_norm": 0.9211265444755554, + "learning_rate": 1.6923447003586568e-05, + "loss": 0.2313, + "step": 16546 + }, + { + "epoch": 0.33096, + "grad_norm": 3.2544314861297607, + "learning_rate": 1.692243943862358e-05, + "loss": 0.259, + "step": 16548 + }, + { + "epoch": 0.331, + "grad_norm": 3.3456060886383057, + "learning_rate": 1.692143173870407e-05, + "loss": 0.1057, + "step": 16550 + }, + { + "epoch": 0.33104, + "grad_norm": 0.6489210724830627, + "learning_rate": 1.6920423903847683e-05, + "loss": 0.0341, + "step": 16552 + }, + { + "epoch": 0.33108, + "grad_norm": 1.1154831647872925, + "learning_rate": 1.6919415934074074e-05, + "loss": 0.2496, + "step": 16554 + }, + { + "epoch": 0.33112, + "grad_norm": 0.2887541353702545, + "learning_rate": 1.6918407829402888e-05, + "loss": 0.1246, + "step": 16556 + }, + { + "epoch": 0.33116, + "grad_norm": 2.6522436141967773, + "learning_rate": 1.691739958985378e-05, + "loss": 0.1433, + "step": 16558 + }, + { + "epoch": 0.3312, + "grad_norm": 0.19953754544258118, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.0076, + "step": 16560 + }, + { + "epoch": 0.33124, + "grad_norm": 0.7643298506736755, + "learning_rate": 1.691538270620042e-05, + "loss": 0.0959, + "step": 16562 + }, + { + "epoch": 0.33128, + "grad_norm": 1.9934186935424805, + "learning_rate": 1.6914374062135492e-05, + "loss": 0.054, + "step": 16564 + }, + { + "epoch": 0.33132, + "grad_norm": 0.6715986728668213, + "learning_rate": 1.6913365283271278e-05, + "loss": 0.358, + "step": 16566 + }, + { + "epoch": 0.33136, + "grad_norm": 3.12096905708313, + "learning_rate": 1.6912356369627452e-05, + "loss": 0.1435, + "step": 16568 + }, + { + "epoch": 0.3314, + "grad_norm": 2.1117031574249268, + "learning_rate": 1.691134732122368e-05, + "loss": 0.0648, + "step": 16570 + }, + { + "epoch": 0.33144, + "grad_norm": 2.5786502361297607, + "learning_rate": 1.691033813807963e-05, + "loss": 0.2377, + "step": 16572 + }, + { + "epoch": 0.33148, + "grad_norm": 2.658134937286377, + "learning_rate": 1.6909328820214984e-05, + "loss": 0.0849, + "step": 16574 + }, + { + "epoch": 0.33152, + "grad_norm": 3.769286632537842, + "learning_rate": 1.6908319367649413e-05, + "loss": 0.1688, + "step": 16576 + }, + { + "epoch": 0.33156, + "grad_norm": 0.26206791400909424, + "learning_rate": 1.6907309780402603e-05, + "loss": 0.0107, + "step": 16578 + }, + { + "epoch": 0.3316, + "grad_norm": 0.2837698459625244, + "learning_rate": 1.690630005849423e-05, + "loss": 0.0655, + "step": 16580 + }, + { + "epoch": 0.33164, + "grad_norm": 2.075249195098877, + "learning_rate": 1.6905290201943983e-05, + "loss": 0.0546, + "step": 16582 + }, + { + "epoch": 0.33168, + "grad_norm": 1.152931809425354, + "learning_rate": 1.6904280210771547e-05, + "loss": 0.0591, + "step": 16584 + }, + { + "epoch": 0.33172, + "grad_norm": 1.7530357837677002, + "learning_rate": 1.6903270084996615e-05, + "loss": 0.0529, + "step": 16586 + }, + { + "epoch": 0.33176, + "grad_norm": 1.8961780071258545, + "learning_rate": 1.6902259824638877e-05, + "loss": 0.074, + "step": 16588 + }, + { + "epoch": 0.3318, + "grad_norm": 0.04489324614405632, + "learning_rate": 1.6901249429718033e-05, + "loss": 0.08, + "step": 16590 + }, + { + "epoch": 0.33184, + "grad_norm": 0.4599303603172302, + "learning_rate": 1.6900238900253777e-05, + "loss": 0.293, + "step": 16592 + }, + { + "epoch": 0.33188, + "grad_norm": 2.714822292327881, + "learning_rate": 1.689922823626581e-05, + "loss": 0.0823, + "step": 16594 + }, + { + "epoch": 0.33192, + "grad_norm": 0.25278186798095703, + "learning_rate": 1.6898217437773837e-05, + "loss": 0.2403, + "step": 16596 + }, + { + "epoch": 0.33196, + "grad_norm": 0.18026241660118103, + "learning_rate": 1.6897206504797563e-05, + "loss": 0.0076, + "step": 16598 + }, + { + "epoch": 0.332, + "grad_norm": 0.25344207882881165, + "learning_rate": 1.68961954373567e-05, + "loss": 0.2402, + "step": 16600 + }, + { + "epoch": 0.33204, + "grad_norm": 1.1532522439956665, + "learning_rate": 1.6895184235470955e-05, + "loss": 0.0801, + "step": 16602 + }, + { + "epoch": 0.33208, + "grad_norm": 1.658304214477539, + "learning_rate": 1.689417289916004e-05, + "loss": 0.6154, + "step": 16604 + }, + { + "epoch": 0.33212, + "grad_norm": 1.6712119579315186, + "learning_rate": 1.689316142844368e-05, + "loss": 0.0404, + "step": 16606 + }, + { + "epoch": 0.33216, + "grad_norm": 0.6218709349632263, + "learning_rate": 1.689214982334159e-05, + "loss": 0.051, + "step": 16608 + }, + { + "epoch": 0.3322, + "grad_norm": 0.7998192310333252, + "learning_rate": 1.6891138083873486e-05, + "loss": 0.0319, + "step": 16610 + }, + { + "epoch": 0.33224, + "grad_norm": 1.6321742534637451, + "learning_rate": 1.68901262100591e-05, + "loss": 0.0636, + "step": 16612 + }, + { + "epoch": 0.33228, + "grad_norm": 0.7016741633415222, + "learning_rate": 1.688911420191816e-05, + "loss": 0.0409, + "step": 16614 + }, + { + "epoch": 0.33232, + "grad_norm": 0.47498518228530884, + "learning_rate": 1.6888102059470384e-05, + "loss": 0.0201, + "step": 16616 + }, + { + "epoch": 0.33236, + "grad_norm": 0.7560154795646667, + "learning_rate": 1.6887089782735516e-05, + "loss": 0.1325, + "step": 16618 + }, + { + "epoch": 0.3324, + "grad_norm": 0.9177704453468323, + "learning_rate": 1.6886077371733285e-05, + "loss": 0.0394, + "step": 16620 + }, + { + "epoch": 0.33244, + "grad_norm": 0.6514781713485718, + "learning_rate": 1.6885064826483433e-05, + "loss": 0.2014, + "step": 16622 + }, + { + "epoch": 0.33248, + "grad_norm": 5.056098461151123, + "learning_rate": 1.6884052147005697e-05, + "loss": 0.2883, + "step": 16624 + }, + { + "epoch": 0.33252, + "grad_norm": 0.9682506918907166, + "learning_rate": 1.688303933331982e-05, + "loss": 0.1534, + "step": 16626 + }, + { + "epoch": 0.33256, + "grad_norm": 5.815745830535889, + "learning_rate": 1.6882026385445548e-05, + "loss": 0.2578, + "step": 16628 + }, + { + "epoch": 0.3326, + "grad_norm": 0.14805668592453003, + "learning_rate": 1.688101330340263e-05, + "loss": 0.0114, + "step": 16630 + }, + { + "epoch": 0.33264, + "grad_norm": 0.8298325538635254, + "learning_rate": 1.688000008721081e-05, + "loss": 0.0428, + "step": 16632 + }, + { + "epoch": 0.33268, + "grad_norm": 1.77851140499115, + "learning_rate": 1.687898673688985e-05, + "loss": 0.1129, + "step": 16634 + }, + { + "epoch": 0.33272, + "grad_norm": 0.10805915296077728, + "learning_rate": 1.68779732524595e-05, + "loss": 0.0085, + "step": 16636 + }, + { + "epoch": 0.33276, + "grad_norm": 0.09099410474300385, + "learning_rate": 1.6876959633939525e-05, + "loss": 0.0171, + "step": 16638 + }, + { + "epoch": 0.3328, + "grad_norm": 1.3410054445266724, + "learning_rate": 1.6875945881349676e-05, + "loss": 0.039, + "step": 16640 + }, + { + "epoch": 0.33284, + "grad_norm": 1.1494176387786865, + "learning_rate": 1.687493199470972e-05, + "loss": 0.0421, + "step": 16642 + }, + { + "epoch": 0.33288, + "grad_norm": 4.4510087966918945, + "learning_rate": 1.6873917974039433e-05, + "loss": 0.3941, + "step": 16644 + }, + { + "epoch": 0.33292, + "grad_norm": 2.3761754035949707, + "learning_rate": 1.6872903819358572e-05, + "loss": 0.2146, + "step": 16646 + }, + { + "epoch": 0.33296, + "grad_norm": 7.450374126434326, + "learning_rate": 1.6871889530686914e-05, + "loss": 0.3586, + "step": 16648 + }, + { + "epoch": 0.333, + "grad_norm": 1.0324788093566895, + "learning_rate": 1.6870875108044233e-05, + "loss": 0.0358, + "step": 16650 + }, + { + "epoch": 0.33304, + "grad_norm": 0.771527111530304, + "learning_rate": 1.68698605514503e-05, + "loss": 0.1618, + "step": 16652 + }, + { + "epoch": 0.33308, + "grad_norm": 0.6965760588645935, + "learning_rate": 1.6868845860924904e-05, + "loss": 0.0928, + "step": 16654 + }, + { + "epoch": 0.33312, + "grad_norm": 0.04678639769554138, + "learning_rate": 1.6867831036487817e-05, + "loss": 0.0278, + "step": 16656 + }, + { + "epoch": 0.33316, + "grad_norm": 0.3338869512081146, + "learning_rate": 1.686681607815883e-05, + "loss": 0.1471, + "step": 16658 + }, + { + "epoch": 0.3332, + "grad_norm": 0.15934427082538605, + "learning_rate": 1.686580098595773e-05, + "loss": 0.0074, + "step": 16660 + }, + { + "epoch": 0.33324, + "grad_norm": 1.1188606023788452, + "learning_rate": 1.6864785759904303e-05, + "loss": 0.0484, + "step": 16662 + }, + { + "epoch": 0.33328, + "grad_norm": 5.047342300415039, + "learning_rate": 1.6863770400018344e-05, + "loss": 0.2087, + "step": 16664 + }, + { + "epoch": 0.33332, + "grad_norm": 0.6817123889923096, + "learning_rate": 1.6862754906319644e-05, + "loss": 0.1817, + "step": 16666 + }, + { + "epoch": 0.33336, + "grad_norm": 0.4822758734226227, + "learning_rate": 1.6861739278828008e-05, + "loss": 0.0358, + "step": 16668 + }, + { + "epoch": 0.3334, + "grad_norm": 0.3511021137237549, + "learning_rate": 1.6860723517563232e-05, + "loss": 0.01, + "step": 16670 + }, + { + "epoch": 0.33344, + "grad_norm": 0.4015445113182068, + "learning_rate": 1.685970762254512e-05, + "loss": 0.0149, + "step": 16672 + }, + { + "epoch": 0.33348, + "grad_norm": 0.6052727699279785, + "learning_rate": 1.685869159379347e-05, + "loss": 0.0212, + "step": 16674 + }, + { + "epoch": 0.33352, + "grad_norm": 0.2376863658428192, + "learning_rate": 1.68576754313281e-05, + "loss": 0.0056, + "step": 16676 + }, + { + "epoch": 0.33356, + "grad_norm": 0.19871146976947784, + "learning_rate": 1.6856659135168822e-05, + "loss": 0.0108, + "step": 16678 + }, + { + "epoch": 0.3336, + "grad_norm": 0.9796876311302185, + "learning_rate": 1.6855642705335438e-05, + "loss": 0.0323, + "step": 16680 + }, + { + "epoch": 0.33364, + "grad_norm": 1.59602689743042, + "learning_rate": 1.685462614184777e-05, + "loss": 0.0528, + "step": 16682 + }, + { + "epoch": 0.33368, + "grad_norm": 2.018644332885742, + "learning_rate": 1.6853609444725635e-05, + "loss": 0.0622, + "step": 16684 + }, + { + "epoch": 0.33372, + "grad_norm": 0.44561755657196045, + "learning_rate": 1.685259261398886e-05, + "loss": 0.021, + "step": 16686 + }, + { + "epoch": 0.33376, + "grad_norm": 0.2881699502468109, + "learning_rate": 1.6851575649657263e-05, + "loss": 0.0435, + "step": 16688 + }, + { + "epoch": 0.3338, + "grad_norm": 0.08678021281957626, + "learning_rate": 1.685055855175067e-05, + "loss": 0.1644, + "step": 16690 + }, + { + "epoch": 0.33384, + "grad_norm": 0.2870337963104248, + "learning_rate": 1.6849541320288915e-05, + "loss": 0.0589, + "step": 16692 + }, + { + "epoch": 0.33388, + "grad_norm": 0.27139919996261597, + "learning_rate": 1.6848523955291818e-05, + "loss": 0.0054, + "step": 16694 + }, + { + "epoch": 0.33392, + "grad_norm": 5.357472896575928, + "learning_rate": 1.6847506456779224e-05, + "loss": 0.1616, + "step": 16696 + }, + { + "epoch": 0.33396, + "grad_norm": 4.6909260749816895, + "learning_rate": 1.684648882477097e-05, + "loss": 0.1395, + "step": 16698 + }, + { + "epoch": 0.334, + "grad_norm": 6.508863925933838, + "learning_rate": 1.684547105928689e-05, + "loss": 0.241, + "step": 16700 + }, + { + "epoch": 0.33404, + "grad_norm": 8.53169059753418, + "learning_rate": 1.6844453160346822e-05, + "loss": 0.4143, + "step": 16702 + }, + { + "epoch": 0.33408, + "grad_norm": 0.7827663421630859, + "learning_rate": 1.6843435127970623e-05, + "loss": 0.0339, + "step": 16704 + }, + { + "epoch": 0.33412, + "grad_norm": 0.07489456236362457, + "learning_rate": 1.684241696217813e-05, + "loss": 0.0052, + "step": 16706 + }, + { + "epoch": 0.33416, + "grad_norm": 0.8925665020942688, + "learning_rate": 1.6841398662989194e-05, + "loss": 0.1271, + "step": 16708 + }, + { + "epoch": 0.3342, + "grad_norm": 3.7668182849884033, + "learning_rate": 1.684038023042367e-05, + "loss": 0.0996, + "step": 16710 + }, + { + "epoch": 0.33424, + "grad_norm": 0.5317335724830627, + "learning_rate": 1.6839361664501413e-05, + "loss": 0.0267, + "step": 16712 + }, + { + "epoch": 0.33428, + "grad_norm": 4.7232770919799805, + "learning_rate": 1.683834296524228e-05, + "loss": 0.1631, + "step": 16714 + }, + { + "epoch": 0.33432, + "grad_norm": 4.073986053466797, + "learning_rate": 1.6837324132666127e-05, + "loss": 0.3029, + "step": 16716 + }, + { + "epoch": 0.33436, + "grad_norm": 1.6283317804336548, + "learning_rate": 1.6836305166792822e-05, + "loss": 0.0463, + "step": 16718 + }, + { + "epoch": 0.3344, + "grad_norm": 3.5407769680023193, + "learning_rate": 1.6835286067642228e-05, + "loss": 0.1803, + "step": 16720 + }, + { + "epoch": 0.33444, + "grad_norm": 9.276052474975586, + "learning_rate": 1.6834266835234215e-05, + "loss": 0.4417, + "step": 16722 + }, + { + "epoch": 0.33448, + "grad_norm": 8.764215469360352, + "learning_rate": 1.6833247469588647e-05, + "loss": 0.4902, + "step": 16724 + }, + { + "epoch": 0.33452, + "grad_norm": 9.807323455810547, + "learning_rate": 1.6832227970725404e-05, + "loss": 0.2554, + "step": 16726 + }, + { + "epoch": 0.33456, + "grad_norm": 0.3811289370059967, + "learning_rate": 1.683120833866436e-05, + "loss": 0.1483, + "step": 16728 + }, + { + "epoch": 0.3346, + "grad_norm": 3.541203022003174, + "learning_rate": 1.683018857342539e-05, + "loss": 0.1665, + "step": 16730 + }, + { + "epoch": 0.33464, + "grad_norm": 1.3100032806396484, + "learning_rate": 1.682916867502838e-05, + "loss": 0.0284, + "step": 16732 + }, + { + "epoch": 0.33468, + "grad_norm": 4.247417449951172, + "learning_rate": 1.682814864349321e-05, + "loss": 0.1718, + "step": 16734 + }, + { + "epoch": 0.33472, + "grad_norm": 2.3857433795928955, + "learning_rate": 1.6827128478839767e-05, + "loss": 0.1584, + "step": 16736 + }, + { + "epoch": 0.33476, + "grad_norm": 1.5587561130523682, + "learning_rate": 1.682610818108794e-05, + "loss": 0.0535, + "step": 16738 + }, + { + "epoch": 0.3348, + "grad_norm": 0.7828773260116577, + "learning_rate": 1.6825087750257617e-05, + "loss": 0.027, + "step": 16740 + }, + { + "epoch": 0.33484, + "grad_norm": 0.9500515460968018, + "learning_rate": 1.6824067186368695e-05, + "loss": 0.634, + "step": 16742 + }, + { + "epoch": 0.33488, + "grad_norm": 3.6887686252593994, + "learning_rate": 1.6823046489441072e-05, + "loss": 0.1683, + "step": 16744 + }, + { + "epoch": 0.33492, + "grad_norm": 1.0140001773834229, + "learning_rate": 1.6822025659494646e-05, + "loss": 0.0432, + "step": 16746 + }, + { + "epoch": 0.33496, + "grad_norm": 5.372739315032959, + "learning_rate": 1.6821004696549316e-05, + "loss": 0.2061, + "step": 16748 + }, + { + "epoch": 0.335, + "grad_norm": 0.22667771577835083, + "learning_rate": 1.6819983600624986e-05, + "loss": 0.0178, + "step": 16750 + }, + { + "epoch": 0.33504, + "grad_norm": 0.2845028340816498, + "learning_rate": 1.6818962371741567e-05, + "loss": 0.3258, + "step": 16752 + }, + { + "epoch": 0.33508, + "grad_norm": 9.220826148986816, + "learning_rate": 1.681794100991896e-05, + "loss": 0.3628, + "step": 16754 + }, + { + "epoch": 0.33512, + "grad_norm": 0.2743541896343231, + "learning_rate": 1.6816919515177094e-05, + "loss": 0.0659, + "step": 16756 + }, + { + "epoch": 0.33516, + "grad_norm": 0.782159149646759, + "learning_rate": 1.6815897887535865e-05, + "loss": 0.3223, + "step": 16758 + }, + { + "epoch": 0.3352, + "grad_norm": 0.7060818672180176, + "learning_rate": 1.68148761270152e-05, + "loss": 0.0569, + "step": 16760 + }, + { + "epoch": 0.33524, + "grad_norm": 0.994706928730011, + "learning_rate": 1.6813854233635013e-05, + "loss": 0.0434, + "step": 16762 + }, + { + "epoch": 0.33528, + "grad_norm": 1.3881088495254517, + "learning_rate": 1.681283220741523e-05, + "loss": 0.068, + "step": 16764 + }, + { + "epoch": 0.33532, + "grad_norm": 0.3119105398654938, + "learning_rate": 1.681181004837578e-05, + "loss": 0.0109, + "step": 16766 + }, + { + "epoch": 0.33536, + "grad_norm": 0.28930678963661194, + "learning_rate": 1.6810787756536584e-05, + "loss": 0.0148, + "step": 16768 + }, + { + "epoch": 0.3354, + "grad_norm": 3.992739200592041, + "learning_rate": 1.6809765331917576e-05, + "loss": 0.193, + "step": 16770 + }, + { + "epoch": 0.33544, + "grad_norm": 1.0316425561904907, + "learning_rate": 1.6808742774538683e-05, + "loss": 0.0291, + "step": 16772 + }, + { + "epoch": 0.33548, + "grad_norm": 0.8042166829109192, + "learning_rate": 1.6807720084419847e-05, + "loss": 0.1246, + "step": 16774 + }, + { + "epoch": 0.33552, + "grad_norm": 3.2400081157684326, + "learning_rate": 1.6806697261581e-05, + "loss": 0.1371, + "step": 16776 + }, + { + "epoch": 0.33556, + "grad_norm": 1.138961672782898, + "learning_rate": 1.6805674306042094e-05, + "loss": 0.2528, + "step": 16778 + }, + { + "epoch": 0.3356, + "grad_norm": 0.10956587642431259, + "learning_rate": 1.6804651217823055e-05, + "loss": 0.2222, + "step": 16780 + }, + { + "epoch": 0.33564, + "grad_norm": 1.736253261566162, + "learning_rate": 1.680362799694384e-05, + "loss": 0.1345, + "step": 16782 + }, + { + "epoch": 0.33568, + "grad_norm": 0.5647356510162354, + "learning_rate": 1.6802604643424396e-05, + "loss": 0.3009, + "step": 16784 + }, + { + "epoch": 0.33572, + "grad_norm": 5.443858623504639, + "learning_rate": 1.6801581157284667e-05, + "loss": 0.1796, + "step": 16786 + }, + { + "epoch": 0.33576, + "grad_norm": 1.4146254062652588, + "learning_rate": 1.6800557538544614e-05, + "loss": 0.0471, + "step": 16788 + }, + { + "epoch": 0.3358, + "grad_norm": 1.3830536603927612, + "learning_rate": 1.6799533787224192e-05, + "loss": 0.0762, + "step": 16790 + }, + { + "epoch": 0.33584, + "grad_norm": 1.227768898010254, + "learning_rate": 1.6798509903343362e-05, + "loss": 0.057, + "step": 16792 + }, + { + "epoch": 0.33588, + "grad_norm": 2.3848235607147217, + "learning_rate": 1.6797485886922075e-05, + "loss": 0.1131, + "step": 16794 + }, + { + "epoch": 0.33592, + "grad_norm": 0.9173333644866943, + "learning_rate": 1.6796461737980302e-05, + "loss": 0.0261, + "step": 16796 + }, + { + "epoch": 0.33596, + "grad_norm": 1.473246693611145, + "learning_rate": 1.6795437456538012e-05, + "loss": 0.0725, + "step": 16798 + }, + { + "epoch": 0.336, + "grad_norm": 4.268857955932617, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.193, + "step": 16800 + }, + { + "epoch": 0.33604, + "grad_norm": 0.2798662781715393, + "learning_rate": 1.6793388496231743e-05, + "loss": 0.1963, + "step": 16802 + }, + { + "epoch": 0.33608, + "grad_norm": 0.6838331818580627, + "learning_rate": 1.679236381740771e-05, + "loss": 0.085, + "step": 16804 + }, + { + "epoch": 0.33612, + "grad_norm": 1.6307849884033203, + "learning_rate": 1.679133900616305e-05, + "loss": 0.2319, + "step": 16806 + }, + { + "epoch": 0.33616, + "grad_norm": 0.5652933120727539, + "learning_rate": 1.679031406251774e-05, + "loss": 0.3498, + "step": 16808 + }, + { + "epoch": 0.3362, + "grad_norm": 1.1211968660354614, + "learning_rate": 1.6789288986491764e-05, + "loss": 0.0247, + "step": 16810 + }, + { + "epoch": 0.33624, + "grad_norm": 3.0112743377685547, + "learning_rate": 1.67882637781051e-05, + "loss": 0.0982, + "step": 16812 + }, + { + "epoch": 0.33628, + "grad_norm": 0.5299977660179138, + "learning_rate": 1.678723843737774e-05, + "loss": 0.2803, + "step": 16814 + }, + { + "epoch": 0.33632, + "grad_norm": 6.5041399002075195, + "learning_rate": 1.678621296432967e-05, + "loss": 0.3424, + "step": 16816 + }, + { + "epoch": 0.33636, + "grad_norm": 0.7807433009147644, + "learning_rate": 1.678518735898089e-05, + "loss": 0.0617, + "step": 16818 + }, + { + "epoch": 0.3364, + "grad_norm": 5.904835224151611, + "learning_rate": 1.6784161621351384e-05, + "loss": 0.2752, + "step": 16820 + }, + { + "epoch": 0.33644, + "grad_norm": 4.768101692199707, + "learning_rate": 1.6783135751461154e-05, + "loss": 0.245, + "step": 16822 + }, + { + "epoch": 0.33648, + "grad_norm": 1.400777816772461, + "learning_rate": 1.6782109749330205e-05, + "loss": 0.1296, + "step": 16824 + }, + { + "epoch": 0.33652, + "grad_norm": 3.734025239944458, + "learning_rate": 1.678108361497853e-05, + "loss": 0.1114, + "step": 16826 + }, + { + "epoch": 0.33656, + "grad_norm": 3.2174153327941895, + "learning_rate": 1.6780057348426143e-05, + "loss": 0.1141, + "step": 16828 + }, + { + "epoch": 0.3366, + "grad_norm": 6.0337605476379395, + "learning_rate": 1.6779030949693044e-05, + "loss": 0.3606, + "step": 16830 + }, + { + "epoch": 0.33664, + "grad_norm": 1.0544716119766235, + "learning_rate": 1.677800441879925e-05, + "loss": 0.0346, + "step": 16832 + }, + { + "epoch": 0.33668, + "grad_norm": 4.320994853973389, + "learning_rate": 1.6776977755764767e-05, + "loss": 0.1927, + "step": 16834 + }, + { + "epoch": 0.33672, + "grad_norm": 0.33737438917160034, + "learning_rate": 1.6775950960609616e-05, + "loss": 0.0169, + "step": 16836 + }, + { + "epoch": 0.33676, + "grad_norm": 0.4967826008796692, + "learning_rate": 1.6774924033353813e-05, + "loss": 0.0529, + "step": 16838 + }, + { + "epoch": 0.3368, + "grad_norm": 0.7977933883666992, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.062, + "step": 16840 + }, + { + "epoch": 0.33684, + "grad_norm": 0.4937629699707031, + "learning_rate": 1.677286978262033e-05, + "loss": 0.0197, + "step": 16842 + }, + { + "epoch": 0.33688, + "grad_norm": 7.2934980392456055, + "learning_rate": 1.6771842459182703e-05, + "loss": 0.376, + "step": 16844 + }, + { + "epoch": 0.33692, + "grad_norm": 1.8130958080291748, + "learning_rate": 1.6770815003724515e-05, + "loss": 0.0739, + "step": 16846 + }, + { + "epoch": 0.33696, + "grad_norm": 1.4755409955978394, + "learning_rate": 1.6769787416265808e-05, + "loss": 0.1971, + "step": 16848 + }, + { + "epoch": 0.337, + "grad_norm": 0.10397140681743622, + "learning_rate": 1.6768759696826608e-05, + "loss": 0.0118, + "step": 16850 + }, + { + "epoch": 0.33704, + "grad_norm": 1.1834120750427246, + "learning_rate": 1.6767731845426956e-05, + "loss": 0.1065, + "step": 16852 + }, + { + "epoch": 0.33708, + "grad_norm": 0.31142157316207886, + "learning_rate": 1.6766703862086882e-05, + "loss": 0.0191, + "step": 16854 + }, + { + "epoch": 0.33712, + "grad_norm": 0.28139904141426086, + "learning_rate": 1.676567574682644e-05, + "loss": 0.1026, + "step": 16856 + }, + { + "epoch": 0.33716, + "grad_norm": 1.9728635549545288, + "learning_rate": 1.676464749966566e-05, + "loss": 0.2208, + "step": 16858 + }, + { + "epoch": 0.3372, + "grad_norm": 0.9441595673561096, + "learning_rate": 1.6763619120624595e-05, + "loss": 0.0222, + "step": 16860 + }, + { + "epoch": 0.33724, + "grad_norm": 3.2067246437072754, + "learning_rate": 1.6762590609723295e-05, + "loss": 0.1109, + "step": 16862 + }, + { + "epoch": 0.33728, + "grad_norm": 0.026357220485806465, + "learning_rate": 1.6761561966981807e-05, + "loss": 0.0079, + "step": 16864 + }, + { + "epoch": 0.33732, + "grad_norm": 0.31788161396980286, + "learning_rate": 1.676053319242019e-05, + "loss": 0.0255, + "step": 16866 + }, + { + "epoch": 0.33736, + "grad_norm": 0.6147656440734863, + "learning_rate": 1.6759504286058495e-05, + "loss": 0.7262, + "step": 16868 + }, + { + "epoch": 0.3374, + "grad_norm": 0.5182554125785828, + "learning_rate": 1.6758475247916786e-05, + "loss": 0.1193, + "step": 16870 + }, + { + "epoch": 0.33744, + "grad_norm": 0.1513175070285797, + "learning_rate": 1.675744607801512e-05, + "loss": 0.1663, + "step": 16872 + }, + { + "epoch": 0.33748, + "grad_norm": 3.3411529064178467, + "learning_rate": 1.675641677637357e-05, + "loss": 0.1176, + "step": 16874 + }, + { + "epoch": 0.33752, + "grad_norm": 0.25415274500846863, + "learning_rate": 1.675538734301219e-05, + "loss": 0.019, + "step": 16876 + }, + { + "epoch": 0.33756, + "grad_norm": 2.037665843963623, + "learning_rate": 1.675435777795106e-05, + "loss": 0.0796, + "step": 16878 + }, + { + "epoch": 0.3376, + "grad_norm": 0.9893176555633545, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.0825, + "step": 16880 + }, + { + "epoch": 0.33764, + "grad_norm": 5.486210823059082, + "learning_rate": 1.675229825280982e-05, + "loss": 0.1935, + "step": 16882 + }, + { + "epoch": 0.33768, + "grad_norm": 3.051051378250122, + "learning_rate": 1.6751268292769873e-05, + "loss": 0.13, + "step": 16884 + }, + { + "epoch": 0.33772, + "grad_norm": 3.044945001602173, + "learning_rate": 1.675023820111047e-05, + "loss": 0.0703, + "step": 16886 + }, + { + "epoch": 0.33776, + "grad_norm": 0.7334156036376953, + "learning_rate": 1.6749207977851695e-05, + "loss": 0.0379, + "step": 16888 + }, + { + "epoch": 0.3378, + "grad_norm": 0.3982863128185272, + "learning_rate": 1.6748177623013638e-05, + "loss": 0.0554, + "step": 16890 + }, + { + "epoch": 0.33784, + "grad_norm": 1.3133035898208618, + "learning_rate": 1.6747147136616385e-05, + "loss": 0.0612, + "step": 16892 + }, + { + "epoch": 0.33788, + "grad_norm": 5.725838661193848, + "learning_rate": 1.6746116518680025e-05, + "loss": 0.2661, + "step": 16894 + }, + { + "epoch": 0.33792, + "grad_norm": 1.1251200437545776, + "learning_rate": 1.674508576922465e-05, + "loss": 0.1359, + "step": 16896 + }, + { + "epoch": 0.33796, + "grad_norm": 4.70427131652832, + "learning_rate": 1.6744054888270352e-05, + "loss": 0.1763, + "step": 16898 + }, + { + "epoch": 0.338, + "grad_norm": 3.237642526626587, + "learning_rate": 1.6743023875837233e-05, + "loss": 0.1251, + "step": 16900 + }, + { + "epoch": 0.33804, + "grad_norm": 0.8153768181800842, + "learning_rate": 1.6741992731945396e-05, + "loss": 0.0255, + "step": 16902 + }, + { + "epoch": 0.33808, + "grad_norm": 0.23805347084999084, + "learning_rate": 1.6740961456614938e-05, + "loss": 0.0514, + "step": 16904 + }, + { + "epoch": 0.33812, + "grad_norm": 3.1128036975860596, + "learning_rate": 1.6739930049865965e-05, + "loss": 0.092, + "step": 16906 + }, + { + "epoch": 0.33816, + "grad_norm": 0.3832116425037384, + "learning_rate": 1.6738898511718588e-05, + "loss": 0.2113, + "step": 16908 + }, + { + "epoch": 0.3382, + "grad_norm": 0.41894611716270447, + "learning_rate": 1.6737866842192908e-05, + "loss": 0.0419, + "step": 16910 + }, + { + "epoch": 0.33824, + "grad_norm": 1.7902566194534302, + "learning_rate": 1.6736835041309053e-05, + "loss": 0.1903, + "step": 16912 + }, + { + "epoch": 0.33828, + "grad_norm": 4.188342571258545, + "learning_rate": 1.6735803109087125e-05, + "loss": 0.1473, + "step": 16914 + }, + { + "epoch": 0.33832, + "grad_norm": 3.507917881011963, + "learning_rate": 1.673477104554725e-05, + "loss": 0.1536, + "step": 16916 + }, + { + "epoch": 0.33836, + "grad_norm": 0.7962108254432678, + "learning_rate": 1.6733738850709547e-05, + "loss": 0.0203, + "step": 16918 + }, + { + "epoch": 0.3384, + "grad_norm": 0.09323643893003464, + "learning_rate": 1.6732706524594138e-05, + "loss": 0.0438, + "step": 16920 + }, + { + "epoch": 0.33844, + "grad_norm": 0.2687893211841583, + "learning_rate": 1.673167406722115e-05, + "loss": 0.1006, + "step": 16922 + }, + { + "epoch": 0.33848, + "grad_norm": 2.5212855339050293, + "learning_rate": 1.673064147861071e-05, + "loss": 0.0576, + "step": 16924 + }, + { + "epoch": 0.33852, + "grad_norm": 6.957879543304443, + "learning_rate": 1.6729608758782948e-05, + "loss": 0.4596, + "step": 16926 + }, + { + "epoch": 0.33856, + "grad_norm": 0.9253079891204834, + "learning_rate": 1.6728575907758e-05, + "loss": 0.0131, + "step": 16928 + }, + { + "epoch": 0.3386, + "grad_norm": 0.6495663523674011, + "learning_rate": 1.6727542925556e-05, + "loss": 0.0162, + "step": 16930 + }, + { + "epoch": 0.33864, + "grad_norm": 0.10559587925672531, + "learning_rate": 1.6726509812197085e-05, + "loss": 0.3204, + "step": 16932 + }, + { + "epoch": 0.33868, + "grad_norm": 0.7624945640563965, + "learning_rate": 1.67254765677014e-05, + "loss": 0.0226, + "step": 16934 + }, + { + "epoch": 0.33872, + "grad_norm": 0.15978480875492096, + "learning_rate": 1.6724443192089084e-05, + "loss": 0.1193, + "step": 16936 + }, + { + "epoch": 0.33876, + "grad_norm": 0.05267977714538574, + "learning_rate": 1.6723409685380288e-05, + "loss": 0.0029, + "step": 16938 + }, + { + "epoch": 0.3388, + "grad_norm": 0.05778368189930916, + "learning_rate": 1.6722376047595163e-05, + "loss": 0.0035, + "step": 16940 + }, + { + "epoch": 0.33884, + "grad_norm": 6.1596879959106445, + "learning_rate": 1.672134227875385e-05, + "loss": 0.8277, + "step": 16942 + }, + { + "epoch": 0.33888, + "grad_norm": 2.7986862659454346, + "learning_rate": 1.6720308378876514e-05, + "loss": 0.1521, + "step": 16944 + }, + { + "epoch": 0.33892, + "grad_norm": 0.07473911345005035, + "learning_rate": 1.67192743479833e-05, + "loss": 0.1639, + "step": 16946 + }, + { + "epoch": 0.33896, + "grad_norm": 7.511332988739014, + "learning_rate": 1.671824018609438e-05, + "loss": 0.2744, + "step": 16948 + }, + { + "epoch": 0.339, + "grad_norm": 0.8236697316169739, + "learning_rate": 1.6717205893229904e-05, + "loss": 0.3077, + "step": 16950 + }, + { + "epoch": 0.33904, + "grad_norm": 0.6431706547737122, + "learning_rate": 1.6716171469410042e-05, + "loss": 0.0201, + "step": 16952 + }, + { + "epoch": 0.33908, + "grad_norm": 1.4383758306503296, + "learning_rate": 1.6715136914654962e-05, + "loss": 0.034, + "step": 16954 + }, + { + "epoch": 0.33912, + "grad_norm": 2.709090232849121, + "learning_rate": 1.671410222898483e-05, + "loss": 0.1095, + "step": 16956 + }, + { + "epoch": 0.33916, + "grad_norm": 4.128770351409912, + "learning_rate": 1.6713067412419814e-05, + "loss": 0.1381, + "step": 16958 + }, + { + "epoch": 0.3392, + "grad_norm": 0.5411794185638428, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.0139, + "step": 16960 + }, + { + "epoch": 0.33924, + "grad_norm": 1.9212247133255005, + "learning_rate": 1.671099738668585e-05, + "loss": 0.0736, + "step": 16962 + }, + { + "epoch": 0.33928, + "grad_norm": 0.4338116943836212, + "learning_rate": 1.6709962177557252e-05, + "loss": 0.2006, + "step": 16964 + }, + { + "epoch": 0.33932, + "grad_norm": 2.834181785583496, + "learning_rate": 1.670892683761449e-05, + "loss": 0.1585, + "step": 16966 + }, + { + "epoch": 0.33936, + "grad_norm": 5.776358604431152, + "learning_rate": 1.670789136687774e-05, + "loss": 0.5449, + "step": 16968 + }, + { + "epoch": 0.3394, + "grad_norm": 1.0032732486724854, + "learning_rate": 1.6706855765367202e-05, + "loss": 0.0294, + "step": 16970 + }, + { + "epoch": 0.33944, + "grad_norm": 2.9750423431396484, + "learning_rate": 1.6705820033103054e-05, + "loss": 0.0918, + "step": 16972 + }, + { + "epoch": 0.33948, + "grad_norm": 2.9288411140441895, + "learning_rate": 1.6704784170105496e-05, + "loss": 0.0799, + "step": 16974 + }, + { + "epoch": 0.33952, + "grad_norm": 0.37014245986938477, + "learning_rate": 1.670374817639471e-05, + "loss": 0.0243, + "step": 16976 + }, + { + "epoch": 0.33956, + "grad_norm": 0.34377771615982056, + "learning_rate": 1.6702712051990907e-05, + "loss": 0.2753, + "step": 16978 + }, + { + "epoch": 0.3396, + "grad_norm": 1.3321951627731323, + "learning_rate": 1.6701675796914284e-05, + "loss": 0.0489, + "step": 16980 + }, + { + "epoch": 0.33964, + "grad_norm": 1.543725848197937, + "learning_rate": 1.670063941118504e-05, + "loss": 0.0405, + "step": 16982 + }, + { + "epoch": 0.33968, + "grad_norm": 6.701570510864258, + "learning_rate": 1.669960289482338e-05, + "loss": 0.2571, + "step": 16984 + }, + { + "epoch": 0.33972, + "grad_norm": 0.4716836214065552, + "learning_rate": 1.669856624784951e-05, + "loss": 0.0146, + "step": 16986 + }, + { + "epoch": 0.33976, + "grad_norm": 5.421060562133789, + "learning_rate": 1.6697529470283646e-05, + "loss": 0.3308, + "step": 16988 + }, + { + "epoch": 0.3398, + "grad_norm": 4.46936559677124, + "learning_rate": 1.6696492562145996e-05, + "loss": 0.2542, + "step": 16990 + }, + { + "epoch": 0.33984, + "grad_norm": 2.488905429840088, + "learning_rate": 1.6695455523456776e-05, + "loss": 0.0581, + "step": 16992 + }, + { + "epoch": 0.33988, + "grad_norm": 2.2775096893310547, + "learning_rate": 1.6694418354236202e-05, + "loss": 0.0773, + "step": 16994 + }, + { + "epoch": 0.33992, + "grad_norm": 1.6818841695785522, + "learning_rate": 1.66933810545045e-05, + "loss": 0.0662, + "step": 16996 + }, + { + "epoch": 0.33996, + "grad_norm": 2.9971346855163574, + "learning_rate": 1.6692343624281883e-05, + "loss": 0.1232, + "step": 16998 + }, + { + "epoch": 0.34, + "grad_norm": 3.951794385910034, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.1324, + "step": 17000 + }, + { + "epoch": 0.34004, + "grad_norm": 0.13534203171730042, + "learning_rate": 1.669026837244483e-05, + "loss": 0.0081, + "step": 17002 + }, + { + "epoch": 0.34008, + "grad_norm": 3.301889181137085, + "learning_rate": 1.6689230550870847e-05, + "loss": 0.1541, + "step": 17004 + }, + { + "epoch": 0.34012, + "grad_norm": 6.4559712409973145, + "learning_rate": 1.6688192598886868e-05, + "loss": 0.4247, + "step": 17006 + }, + { + "epoch": 0.34016, + "grad_norm": 0.33837175369262695, + "learning_rate": 1.6687154516513135e-05, + "loss": 0.0233, + "step": 17008 + }, + { + "epoch": 0.3402, + "grad_norm": 0.4409739077091217, + "learning_rate": 1.6686116303769884e-05, + "loss": 0.0227, + "step": 17010 + }, + { + "epoch": 0.34024, + "grad_norm": 0.6742091178894043, + "learning_rate": 1.668507796067735e-05, + "loss": 0.0508, + "step": 17012 + }, + { + "epoch": 0.34028, + "grad_norm": 2.648855447769165, + "learning_rate": 1.668403948725578e-05, + "loss": 0.1585, + "step": 17014 + }, + { + "epoch": 0.34032, + "grad_norm": 3.12753963470459, + "learning_rate": 1.6683000883525416e-05, + "loss": 0.1536, + "step": 17016 + }, + { + "epoch": 0.34036, + "grad_norm": 0.17951515316963196, + "learning_rate": 1.6681962149506516e-05, + "loss": 0.0337, + "step": 17018 + }, + { + "epoch": 0.3404, + "grad_norm": 1.9742778539657593, + "learning_rate": 1.668092328521932e-05, + "loss": 0.0676, + "step": 17020 + }, + { + "epoch": 0.34044, + "grad_norm": 2.1393556594848633, + "learning_rate": 1.667988429068408e-05, + "loss": 0.0748, + "step": 17022 + }, + { + "epoch": 0.34048, + "grad_norm": 0.33013829588890076, + "learning_rate": 1.6678845165921066e-05, + "loss": 0.1982, + "step": 17024 + }, + { + "epoch": 0.34052, + "grad_norm": 2.434903144836426, + "learning_rate": 1.6677805910950523e-05, + "loss": 0.1132, + "step": 17026 + }, + { + "epoch": 0.34056, + "grad_norm": 0.8751149773597717, + "learning_rate": 1.6676766525792713e-05, + "loss": 0.093, + "step": 17028 + }, + { + "epoch": 0.3406, + "grad_norm": 0.8466118574142456, + "learning_rate": 1.667572701046791e-05, + "loss": 0.0612, + "step": 17030 + }, + { + "epoch": 0.34064, + "grad_norm": 3.868595838546753, + "learning_rate": 1.6674687364996362e-05, + "loss": 0.1632, + "step": 17032 + }, + { + "epoch": 0.34068, + "grad_norm": 0.4567359685897827, + "learning_rate": 1.6673647589398353e-05, + "loss": 0.027, + "step": 17034 + }, + { + "epoch": 0.34072, + "grad_norm": 1.2878336906433105, + "learning_rate": 1.6672607683694145e-05, + "loss": 0.0604, + "step": 17036 + }, + { + "epoch": 0.34076, + "grad_norm": 0.14642199873924255, + "learning_rate": 1.667156764790402e-05, + "loss": 0.05, + "step": 17038 + }, + { + "epoch": 0.3408, + "grad_norm": 2.2066848278045654, + "learning_rate": 1.6670527482048246e-05, + "loss": 0.1912, + "step": 17040 + }, + { + "epoch": 0.34084, + "grad_norm": 4.734772205352783, + "learning_rate": 1.6669487186147106e-05, + "loss": 0.2025, + "step": 17042 + }, + { + "epoch": 0.34088, + "grad_norm": 0.8878720998764038, + "learning_rate": 1.6668446760220876e-05, + "loss": 0.0322, + "step": 17044 + }, + { + "epoch": 0.34092, + "grad_norm": 2.3040664196014404, + "learning_rate": 1.6667406204289848e-05, + "loss": 0.1296, + "step": 17046 + }, + { + "epoch": 0.34096, + "grad_norm": 0.48293954133987427, + "learning_rate": 1.6666365518374302e-05, + "loss": 0.0099, + "step": 17048 + }, + { + "epoch": 0.341, + "grad_norm": 5.386542797088623, + "learning_rate": 1.6665324702494524e-05, + "loss": 0.3006, + "step": 17050 + }, + { + "epoch": 0.34104, + "grad_norm": 8.69636058807373, + "learning_rate": 1.6664283756670814e-05, + "loss": 0.4744, + "step": 17052 + }, + { + "epoch": 0.34108, + "grad_norm": 0.7325419187545776, + "learning_rate": 1.6663242680923462e-05, + "loss": 0.3358, + "step": 17054 + }, + { + "epoch": 0.34112, + "grad_norm": 4.287683010101318, + "learning_rate": 1.6662201475272762e-05, + "loss": 0.2462, + "step": 17056 + }, + { + "epoch": 0.34116, + "grad_norm": 0.3002616763114929, + "learning_rate": 1.6661160139739013e-05, + "loss": 0.0064, + "step": 17058 + }, + { + "epoch": 0.3412, + "grad_norm": 0.7853125929832458, + "learning_rate": 1.666011867434252e-05, + "loss": 0.0337, + "step": 17060 + }, + { + "epoch": 0.34124, + "grad_norm": 3.694016456604004, + "learning_rate": 1.6659077079103582e-05, + "loss": 0.1301, + "step": 17062 + }, + { + "epoch": 0.34128, + "grad_norm": 1.9649008512496948, + "learning_rate": 1.6658035354042507e-05, + "loss": 0.0683, + "step": 17064 + }, + { + "epoch": 0.34132, + "grad_norm": 3.371514081954956, + "learning_rate": 1.6656993499179607e-05, + "loss": 0.1875, + "step": 17066 + }, + { + "epoch": 0.34136, + "grad_norm": 0.12148881703615189, + "learning_rate": 1.6655951514535192e-05, + "loss": 0.0132, + "step": 17068 + }, + { + "epoch": 0.3414, + "grad_norm": 0.06351299583911896, + "learning_rate": 1.6654909400129575e-05, + "loss": 0.0484, + "step": 17070 + }, + { + "epoch": 0.34144, + "grad_norm": 1.083614468574524, + "learning_rate": 1.6653867155983072e-05, + "loss": 0.0314, + "step": 17072 + }, + { + "epoch": 0.34148, + "grad_norm": 0.028404476121068, + "learning_rate": 1.6652824782116007e-05, + "loss": 0.0018, + "step": 17074 + }, + { + "epoch": 0.34152, + "grad_norm": 0.3154192864894867, + "learning_rate": 1.6651782278548695e-05, + "loss": 0.241, + "step": 17076 + }, + { + "epoch": 0.34156, + "grad_norm": 0.16646599769592285, + "learning_rate": 1.6650739645301463e-05, + "loss": 0.0832, + "step": 17078 + }, + { + "epoch": 0.3416, + "grad_norm": 0.07799062132835388, + "learning_rate": 1.6649696882394635e-05, + "loss": 0.0051, + "step": 17080 + }, + { + "epoch": 0.34164, + "grad_norm": 0.7744149565696716, + "learning_rate": 1.6648653989848543e-05, + "loss": 0.0179, + "step": 17082 + }, + { + "epoch": 0.34168, + "grad_norm": 9.080946922302246, + "learning_rate": 1.6647610967683524e-05, + "loss": 0.3268, + "step": 17084 + }, + { + "epoch": 0.34172, + "grad_norm": 2.9299609661102295, + "learning_rate": 1.6646567815919903e-05, + "loss": 0.1054, + "step": 17086 + }, + { + "epoch": 0.34176, + "grad_norm": 2.1126773357391357, + "learning_rate": 1.664552453457802e-05, + "loss": 0.3536, + "step": 17088 + }, + { + "epoch": 0.3418, + "grad_norm": 0.4800480902194977, + "learning_rate": 1.664448112367822e-05, + "loss": 0.0161, + "step": 17090 + }, + { + "epoch": 0.34184, + "grad_norm": 0.4726940095424652, + "learning_rate": 1.6643437583240835e-05, + "loss": 0.0177, + "step": 17092 + }, + { + "epoch": 0.34188, + "grad_norm": 0.0450495183467865, + "learning_rate": 1.6642393913286214e-05, + "loss": 0.0179, + "step": 17094 + }, + { + "epoch": 0.34192, + "grad_norm": 7.1661248207092285, + "learning_rate": 1.6641350113834705e-05, + "loss": 0.4343, + "step": 17096 + }, + { + "epoch": 0.34196, + "grad_norm": 0.15764491260051727, + "learning_rate": 1.6640306184906652e-05, + "loss": 0.1941, + "step": 17098 + }, + { + "epoch": 0.342, + "grad_norm": 2.6838629245758057, + "learning_rate": 1.6639262126522417e-05, + "loss": 0.1065, + "step": 17100 + }, + { + "epoch": 0.34204, + "grad_norm": 4.174835205078125, + "learning_rate": 1.6638217938702345e-05, + "loss": 0.0902, + "step": 17102 + }, + { + "epoch": 0.34208, + "grad_norm": 0.26295796036720276, + "learning_rate": 1.6637173621466803e-05, + "loss": 0.0474, + "step": 17104 + }, + { + "epoch": 0.34212, + "grad_norm": 0.7039120197296143, + "learning_rate": 1.6636129174836136e-05, + "loss": 0.0227, + "step": 17106 + }, + { + "epoch": 0.34216, + "grad_norm": 4.9258012771606445, + "learning_rate": 1.663508459883072e-05, + "loss": 0.1398, + "step": 17108 + }, + { + "epoch": 0.3422, + "grad_norm": 7.427706718444824, + "learning_rate": 1.6634039893470912e-05, + "loss": 0.3577, + "step": 17110 + }, + { + "epoch": 0.34224, + "grad_norm": 6.5830302238464355, + "learning_rate": 1.663299505877708e-05, + "loss": 0.3577, + "step": 17112 + }, + { + "epoch": 0.34228, + "grad_norm": 1.1468533277511597, + "learning_rate": 1.6631950094769596e-05, + "loss": 0.1022, + "step": 17114 + }, + { + "epoch": 0.34232, + "grad_norm": 0.00931995827704668, + "learning_rate": 1.663090500146883e-05, + "loss": 0.0037, + "step": 17116 + }, + { + "epoch": 0.34236, + "grad_norm": 0.9852369427680969, + "learning_rate": 1.6629859778895156e-05, + "loss": 0.0664, + "step": 17118 + }, + { + "epoch": 0.3424, + "grad_norm": 0.6357668042182922, + "learning_rate": 1.6628814427068954e-05, + "loss": 0.0256, + "step": 17120 + }, + { + "epoch": 0.34244, + "grad_norm": 3.135948896408081, + "learning_rate": 1.66277689460106e-05, + "loss": 0.098, + "step": 17122 + }, + { + "epoch": 0.34248, + "grad_norm": 1.6187409162521362, + "learning_rate": 1.6626723335740476e-05, + "loss": 0.3282, + "step": 17124 + }, + { + "epoch": 0.34252, + "grad_norm": 2.582258701324463, + "learning_rate": 1.6625677596278977e-05, + "loss": 0.0711, + "step": 17126 + }, + { + "epoch": 0.34256, + "grad_norm": 0.05208037793636322, + "learning_rate": 1.6624631727646477e-05, + "loss": 0.0181, + "step": 17128 + }, + { + "epoch": 0.3426, + "grad_norm": 0.4288436472415924, + "learning_rate": 1.662358572986337e-05, + "loss": 0.1877, + "step": 17130 + }, + { + "epoch": 0.34264, + "grad_norm": 0.1297820508480072, + "learning_rate": 1.662253960295005e-05, + "loss": 0.0091, + "step": 17132 + }, + { + "epoch": 0.34268, + "grad_norm": 0.35183846950531006, + "learning_rate": 1.6621493346926912e-05, + "loss": 0.017, + "step": 17134 + }, + { + "epoch": 0.34272, + "grad_norm": 2.7513723373413086, + "learning_rate": 1.662044696181435e-05, + "loss": 0.0915, + "step": 17136 + }, + { + "epoch": 0.34276, + "grad_norm": 4.485042572021484, + "learning_rate": 1.661940044763277e-05, + "loss": 0.1823, + "step": 17138 + }, + { + "epoch": 0.3428, + "grad_norm": 0.4558943510055542, + "learning_rate": 1.6618353804402567e-05, + "loss": 0.0296, + "step": 17140 + }, + { + "epoch": 0.34284, + "grad_norm": 4.3394083976745605, + "learning_rate": 1.6617307032144148e-05, + "loss": 0.3512, + "step": 17142 + }, + { + "epoch": 0.34288, + "grad_norm": 6.512083053588867, + "learning_rate": 1.6616260130877926e-05, + "loss": 0.1971, + "step": 17144 + }, + { + "epoch": 0.34292, + "grad_norm": 1.6714696884155273, + "learning_rate": 1.6615213100624304e-05, + "loss": 0.1541, + "step": 17146 + }, + { + "epoch": 0.34296, + "grad_norm": 4.41439151763916, + "learning_rate": 1.66141659414037e-05, + "loss": 0.1582, + "step": 17148 + }, + { + "epoch": 0.343, + "grad_norm": 2.1438491344451904, + "learning_rate": 1.661311865323652e-05, + "loss": 0.2343, + "step": 17150 + }, + { + "epoch": 0.34304, + "grad_norm": 0.8060789704322815, + "learning_rate": 1.661207123614319e-05, + "loss": 0.0333, + "step": 17152 + }, + { + "epoch": 0.34308, + "grad_norm": 5.20093297958374, + "learning_rate": 1.661102369014413e-05, + "loss": 0.1732, + "step": 17154 + }, + { + "epoch": 0.34312, + "grad_norm": 0.6835343837738037, + "learning_rate": 1.6609976015259755e-05, + "loss": 0.1417, + "step": 17156 + }, + { + "epoch": 0.34316, + "grad_norm": 1.2793054580688477, + "learning_rate": 1.6608928211510496e-05, + "loss": 0.0723, + "step": 17158 + }, + { + "epoch": 0.3432, + "grad_norm": 0.24360480904579163, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.0251, + "step": 17160 + }, + { + "epoch": 0.34324, + "grad_norm": 0.09846855700016022, + "learning_rate": 1.6606832217499032e-05, + "loss": 0.0365, + "step": 17162 + }, + { + "epoch": 0.34328, + "grad_norm": 1.7155580520629883, + "learning_rate": 1.6605784027277692e-05, + "loss": 0.0619, + "step": 17164 + }, + { + "epoch": 0.34332, + "grad_norm": 0.038793276995420456, + "learning_rate": 1.6604735708273195e-05, + "loss": 0.1389, + "step": 17166 + }, + { + "epoch": 0.34336, + "grad_norm": 0.044860709458589554, + "learning_rate": 1.660368726050597e-05, + "loss": 0.0027, + "step": 17168 + }, + { + "epoch": 0.3434, + "grad_norm": 2.586829662322998, + "learning_rate": 1.6602638683996462e-05, + "loss": 0.3633, + "step": 17170 + }, + { + "epoch": 0.34344, + "grad_norm": 1.0693333148956299, + "learning_rate": 1.6601589978765118e-05, + "loss": 0.0231, + "step": 17172 + }, + { + "epoch": 0.34348, + "grad_norm": 1.2994874715805054, + "learning_rate": 1.6600541144832375e-05, + "loss": 0.3176, + "step": 17174 + }, + { + "epoch": 0.34352, + "grad_norm": 0.058630164712667465, + "learning_rate": 1.6599492182218685e-05, + "loss": 0.005, + "step": 17176 + }, + { + "epoch": 0.34356, + "grad_norm": 1.726430058479309, + "learning_rate": 1.6598443090944496e-05, + "loss": 0.062, + "step": 17178 + }, + { + "epoch": 0.3436, + "grad_norm": 0.13881641626358032, + "learning_rate": 1.6597393871030264e-05, + "loss": 0.0416, + "step": 17180 + }, + { + "epoch": 0.34364, + "grad_norm": 0.45755618810653687, + "learning_rate": 1.659634452249644e-05, + "loss": 0.0227, + "step": 17182 + }, + { + "epoch": 0.34368, + "grad_norm": 0.7707438468933105, + "learning_rate": 1.6595295045363483e-05, + "loss": 0.1348, + "step": 17184 + }, + { + "epoch": 0.34372, + "grad_norm": 3.9122769832611084, + "learning_rate": 1.6594245439651856e-05, + "loss": 0.1489, + "step": 17186 + }, + { + "epoch": 0.34376, + "grad_norm": 1.0204169750213623, + "learning_rate": 1.6593195705382017e-05, + "loss": 0.0335, + "step": 17188 + }, + { + "epoch": 0.3438, + "grad_norm": 1.123063087463379, + "learning_rate": 1.6592145842574433e-05, + "loss": 0.1115, + "step": 17190 + }, + { + "epoch": 0.34384, + "grad_norm": 0.6343837976455688, + "learning_rate": 1.6591095851249573e-05, + "loss": 0.0575, + "step": 17192 + }, + { + "epoch": 0.34388, + "grad_norm": 0.4030526876449585, + "learning_rate": 1.659004573142791e-05, + "loss": 0.0209, + "step": 17194 + }, + { + "epoch": 0.34392, + "grad_norm": 7.010035991668701, + "learning_rate": 1.6588995483129907e-05, + "loss": 0.5357, + "step": 17196 + }, + { + "epoch": 0.34396, + "grad_norm": 5.529228687286377, + "learning_rate": 1.6587945106376046e-05, + "loss": 0.3826, + "step": 17198 + }, + { + "epoch": 0.344, + "grad_norm": 2.2141528129577637, + "learning_rate": 1.6586894601186804e-05, + "loss": 0.0485, + "step": 17200 + }, + { + "epoch": 0.34404, + "grad_norm": 0.4636524021625519, + "learning_rate": 1.6585843967582663e-05, + "loss": 0.2463, + "step": 17202 + }, + { + "epoch": 0.34408, + "grad_norm": 1.307140588760376, + "learning_rate": 1.65847932055841e-05, + "loss": 0.0394, + "step": 17204 + }, + { + "epoch": 0.34412, + "grad_norm": 0.4742557406425476, + "learning_rate": 1.6583742315211605e-05, + "loss": 0.0494, + "step": 17206 + }, + { + "epoch": 0.34416, + "grad_norm": 3.162999391555786, + "learning_rate": 1.6582691296485664e-05, + "loss": 0.1851, + "step": 17208 + }, + { + "epoch": 0.3442, + "grad_norm": 1.441324234008789, + "learning_rate": 1.6581640149426766e-05, + "loss": 0.0472, + "step": 17210 + }, + { + "epoch": 0.34424, + "grad_norm": 4.7971978187561035, + "learning_rate": 1.6580588874055408e-05, + "loss": 0.2553, + "step": 17212 + }, + { + "epoch": 0.34428, + "grad_norm": 0.18466828763484955, + "learning_rate": 1.657953747039208e-05, + "loss": 0.0072, + "step": 17214 + }, + { + "epoch": 0.34432, + "grad_norm": 4.285243511199951, + "learning_rate": 1.657848593845728e-05, + "loss": 0.206, + "step": 17216 + }, + { + "epoch": 0.34436, + "grad_norm": 0.5333167910575867, + "learning_rate": 1.657743427827151e-05, + "loss": 0.0238, + "step": 17218 + }, + { + "epoch": 0.3444, + "grad_norm": 0.4298962652683258, + "learning_rate": 1.6576382489855274e-05, + "loss": 0.0407, + "step": 17220 + }, + { + "epoch": 0.34444, + "grad_norm": 0.166345477104187, + "learning_rate": 1.6575330573229075e-05, + "loss": 0.034, + "step": 17222 + }, + { + "epoch": 0.34448, + "grad_norm": 0.22661006450653076, + "learning_rate": 1.6574278528413425e-05, + "loss": 0.0247, + "step": 17224 + }, + { + "epoch": 0.34452, + "grad_norm": 5.5315470695495605, + "learning_rate": 1.6573226355428825e-05, + "loss": 0.3597, + "step": 17226 + }, + { + "epoch": 0.34456, + "grad_norm": 0.05208860710263252, + "learning_rate": 1.6572174054295797e-05, + "loss": 0.0041, + "step": 17228 + }, + { + "epoch": 0.3446, + "grad_norm": 0.5197557806968689, + "learning_rate": 1.6571121625034847e-05, + "loss": 0.2159, + "step": 17230 + }, + { + "epoch": 0.34464, + "grad_norm": 0.931566596031189, + "learning_rate": 1.6570069067666502e-05, + "loss": 0.0661, + "step": 17232 + }, + { + "epoch": 0.34468, + "grad_norm": 3.0832653045654297, + "learning_rate": 1.6569016382211275e-05, + "loss": 0.2425, + "step": 17234 + }, + { + "epoch": 0.34472, + "grad_norm": 0.3877396881580353, + "learning_rate": 1.6567963568689694e-05, + "loss": 0.0295, + "step": 17236 + }, + { + "epoch": 0.34476, + "grad_norm": 0.3189389407634735, + "learning_rate": 1.6566910627122282e-05, + "loss": 0.0792, + "step": 17238 + }, + { + "epoch": 0.3448, + "grad_norm": 0.3197782039642334, + "learning_rate": 1.6565857557529567e-05, + "loss": 0.0104, + "step": 17240 + }, + { + "epoch": 0.34484, + "grad_norm": 1.5107206106185913, + "learning_rate": 1.6564804359932077e-05, + "loss": 0.0433, + "step": 17242 + }, + { + "epoch": 0.34488, + "grad_norm": 5.585995197296143, + "learning_rate": 1.6563751034350345e-05, + "loss": 0.1203, + "step": 17244 + }, + { + "epoch": 0.34492, + "grad_norm": 1.9328787326812744, + "learning_rate": 1.656269758080491e-05, + "loss": 0.085, + "step": 17246 + }, + { + "epoch": 0.34496, + "grad_norm": 5.3560686111450195, + "learning_rate": 1.6561643999316307e-05, + "loss": 0.2722, + "step": 17248 + }, + { + "epoch": 0.345, + "grad_norm": 0.2819126546382904, + "learning_rate": 1.6560590289905074e-05, + "loss": 0.0791, + "step": 17250 + }, + { + "epoch": 0.34504, + "grad_norm": 1.5351324081420898, + "learning_rate": 1.655953645259176e-05, + "loss": 0.1173, + "step": 17252 + }, + { + "epoch": 0.34508, + "grad_norm": 0.3410387635231018, + "learning_rate": 1.65584824873969e-05, + "loss": 0.1139, + "step": 17254 + }, + { + "epoch": 0.34512, + "grad_norm": 0.7360186576843262, + "learning_rate": 1.6557428394341052e-05, + "loss": 0.0437, + "step": 17256 + }, + { + "epoch": 0.34516, + "grad_norm": 0.46183618903160095, + "learning_rate": 1.6556374173444756e-05, + "loss": 0.0208, + "step": 17258 + }, + { + "epoch": 0.3452, + "grad_norm": 0.6117851138114929, + "learning_rate": 1.6555319824728577e-05, + "loss": 0.025, + "step": 17260 + }, + { + "epoch": 0.34524, + "grad_norm": 30.72237777709961, + "learning_rate": 1.6554265348213055e-05, + "loss": 0.5698, + "step": 17262 + }, + { + "epoch": 0.34528, + "grad_norm": 6.0001420974731445, + "learning_rate": 1.655321074391876e-05, + "loss": 0.3629, + "step": 17264 + }, + { + "epoch": 0.34532, + "grad_norm": 0.5154655575752258, + "learning_rate": 1.655215601186625e-05, + "loss": 0.0495, + "step": 17266 + }, + { + "epoch": 0.34536, + "grad_norm": 0.7453352212905884, + "learning_rate": 1.6551101152076086e-05, + "loss": 0.0339, + "step": 17268 + }, + { + "epoch": 0.3454, + "grad_norm": 0.8038049340248108, + "learning_rate": 1.6550046164568827e-05, + "loss": 0.0317, + "step": 17270 + }, + { + "epoch": 0.34544, + "grad_norm": 0.8219498991966248, + "learning_rate": 1.654899104936505e-05, + "loss": 0.0285, + "step": 17272 + }, + { + "epoch": 0.34548, + "grad_norm": 1.4489871263504028, + "learning_rate": 1.6547935806485318e-05, + "loss": 0.0755, + "step": 17274 + }, + { + "epoch": 0.34552, + "grad_norm": 4.273381233215332, + "learning_rate": 1.6546880435950207e-05, + "loss": 0.2534, + "step": 17276 + }, + { + "epoch": 0.34556, + "grad_norm": 0.4738249182701111, + "learning_rate": 1.6545824937780292e-05, + "loss": 0.0374, + "step": 17278 + }, + { + "epoch": 0.3456, + "grad_norm": 0.07099390029907227, + "learning_rate": 1.654476931199615e-05, + "loss": 0.0392, + "step": 17280 + }, + { + "epoch": 0.34564, + "grad_norm": 1.1427011489868164, + "learning_rate": 1.654371355861836e-05, + "loss": 0.2308, + "step": 17282 + }, + { + "epoch": 0.34568, + "grad_norm": 0.05547505244612694, + "learning_rate": 1.6542657677667506e-05, + "loss": 0.1278, + "step": 17284 + }, + { + "epoch": 0.34572, + "grad_norm": 0.3823341429233551, + "learning_rate": 1.6541601669164174e-05, + "loss": 0.157, + "step": 17286 + }, + { + "epoch": 0.34576, + "grad_norm": 2.6805572509765625, + "learning_rate": 1.6540545533128946e-05, + "loss": 0.0961, + "step": 17288 + }, + { + "epoch": 0.3458, + "grad_norm": 4.7388763427734375, + "learning_rate": 1.6539489269582414e-05, + "loss": 0.192, + "step": 17290 + }, + { + "epoch": 0.34584, + "grad_norm": 0.06485304236412048, + "learning_rate": 1.653843287854518e-05, + "loss": 0.0071, + "step": 17292 + }, + { + "epoch": 0.34588, + "grad_norm": 0.07013511657714844, + "learning_rate": 1.6537376360037822e-05, + "loss": 0.0537, + "step": 17294 + }, + { + "epoch": 0.34592, + "grad_norm": 0.0410093329846859, + "learning_rate": 1.653631971408095e-05, + "loss": 0.0038, + "step": 17296 + }, + { + "epoch": 0.34596, + "grad_norm": 1.8741061687469482, + "learning_rate": 1.653526294069516e-05, + "loss": 0.0526, + "step": 17298 + }, + { + "epoch": 0.346, + "grad_norm": 0.9385887384414673, + "learning_rate": 1.6534206039901057e-05, + "loss": 0.1837, + "step": 17300 + }, + { + "epoch": 0.34604, + "grad_norm": 0.08681720495223999, + "learning_rate": 1.653314901171924e-05, + "loss": 0.7744, + "step": 17302 + }, + { + "epoch": 0.34608, + "grad_norm": 0.5048580169677734, + "learning_rate": 1.653209185617032e-05, + "loss": 0.0361, + "step": 17304 + }, + { + "epoch": 0.34612, + "grad_norm": 6.730835437774658, + "learning_rate": 1.6531034573274905e-05, + "loss": 0.2164, + "step": 17306 + }, + { + "epoch": 0.34616, + "grad_norm": 0.05883805453777313, + "learning_rate": 1.652997716305361e-05, + "loss": 0.6916, + "step": 17308 + }, + { + "epoch": 0.3462, + "grad_norm": 0.24006076157093048, + "learning_rate": 1.652891962552705e-05, + "loss": 0.0068, + "step": 17310 + }, + { + "epoch": 0.34624, + "grad_norm": 2.873769998550415, + "learning_rate": 1.652786196071584e-05, + "loss": 0.0889, + "step": 17312 + }, + { + "epoch": 0.34628, + "grad_norm": 4.8787641525268555, + "learning_rate": 1.6526804168640597e-05, + "loss": 0.3666, + "step": 17314 + }, + { + "epoch": 0.34632, + "grad_norm": 0.7553620338439941, + "learning_rate": 1.652574624932195e-05, + "loss": 0.0205, + "step": 17316 + }, + { + "epoch": 0.34636, + "grad_norm": 0.3950783610343933, + "learning_rate": 1.6524688202780523e-05, + "loss": 0.0131, + "step": 17318 + }, + { + "epoch": 0.3464, + "grad_norm": 5.9806389808654785, + "learning_rate": 1.652363002903693e-05, + "loss": 0.4634, + "step": 17320 + }, + { + "epoch": 0.34644, + "grad_norm": 0.4887770414352417, + "learning_rate": 1.652257172811182e-05, + "loss": 0.0429, + "step": 17322 + }, + { + "epoch": 0.34648, + "grad_norm": 0.26252204179763794, + "learning_rate": 1.6521513300025812e-05, + "loss": 0.0213, + "step": 17324 + }, + { + "epoch": 0.34652, + "grad_norm": 5.519219875335693, + "learning_rate": 1.6520454744799546e-05, + "loss": 0.228, + "step": 17326 + }, + { + "epoch": 0.34656, + "grad_norm": 1.1049526929855347, + "learning_rate": 1.6519396062453662e-05, + "loss": 0.0825, + "step": 17328 + }, + { + "epoch": 0.3466, + "grad_norm": 0.027014534920454025, + "learning_rate": 1.651833725300879e-05, + "loss": 0.0428, + "step": 17330 + }, + { + "epoch": 0.34664, + "grad_norm": 0.09421694278717041, + "learning_rate": 1.651727831648558e-05, + "loss": 0.14, + "step": 17332 + }, + { + "epoch": 0.34668, + "grad_norm": 5.43719482421875, + "learning_rate": 1.651621925290467e-05, + "loss": 0.3395, + "step": 17334 + }, + { + "epoch": 0.34672, + "grad_norm": 1.6897752285003662, + "learning_rate": 1.6515160062286715e-05, + "loss": 0.125, + "step": 17336 + }, + { + "epoch": 0.34676, + "grad_norm": 1.757899522781372, + "learning_rate": 1.6514100744652358e-05, + "loss": 0.0497, + "step": 17338 + }, + { + "epoch": 0.3468, + "grad_norm": 0.5661625266075134, + "learning_rate": 1.6513041300022253e-05, + "loss": 0.0167, + "step": 17340 + }, + { + "epoch": 0.34684, + "grad_norm": 0.8637098670005798, + "learning_rate": 1.6511981728417057e-05, + "loss": 0.0515, + "step": 17342 + }, + { + "epoch": 0.34688, + "grad_norm": 0.1678367406129837, + "learning_rate": 1.6510922029857425e-05, + "loss": 0.0613, + "step": 17344 + }, + { + "epoch": 0.34692, + "grad_norm": 0.1302204728126526, + "learning_rate": 1.6509862204364014e-05, + "loss": 0.4915, + "step": 17346 + }, + { + "epoch": 0.34696, + "grad_norm": 0.6225660443305969, + "learning_rate": 1.6508802251957488e-05, + "loss": 0.1022, + "step": 17348 + }, + { + "epoch": 0.347, + "grad_norm": 5.386775016784668, + "learning_rate": 1.650774217265851e-05, + "loss": 0.696, + "step": 17350 + }, + { + "epoch": 0.34704, + "grad_norm": 3.5533292293548584, + "learning_rate": 1.650668196648775e-05, + "loss": 0.1031, + "step": 17352 + }, + { + "epoch": 0.34708, + "grad_norm": 1.8449721336364746, + "learning_rate": 1.6505621633465872e-05, + "loss": 0.0391, + "step": 17354 + }, + { + "epoch": 0.34712, + "grad_norm": 1.1189188957214355, + "learning_rate": 1.6504561173613556e-05, + "loss": 0.0478, + "step": 17356 + }, + { + "epoch": 0.34716, + "grad_norm": 2.0099940299987793, + "learning_rate": 1.650350058695147e-05, + "loss": 0.0633, + "step": 17358 + }, + { + "epoch": 0.3472, + "grad_norm": 0.7614078521728516, + "learning_rate": 1.650243987350029e-05, + "loss": 0.0246, + "step": 17360 + }, + { + "epoch": 0.34724, + "grad_norm": 2.284593343734741, + "learning_rate": 1.6501379033280697e-05, + "loss": 0.0583, + "step": 17362 + }, + { + "epoch": 0.34728, + "grad_norm": 0.28244680166244507, + "learning_rate": 1.6500318066313374e-05, + "loss": 0.0254, + "step": 17364 + }, + { + "epoch": 0.34732, + "grad_norm": 4.578318119049072, + "learning_rate": 1.6499256972619e-05, + "loss": 0.2048, + "step": 17366 + }, + { + "epoch": 0.34736, + "grad_norm": 0.3967999517917633, + "learning_rate": 1.649819575221827e-05, + "loss": 0.1991, + "step": 17368 + }, + { + "epoch": 0.3474, + "grad_norm": 2.5564284324645996, + "learning_rate": 1.649713440513187e-05, + "loss": 0.0956, + "step": 17370 + }, + { + "epoch": 0.34744, + "grad_norm": 1.5583560466766357, + "learning_rate": 1.649607293138048e-05, + "loss": 0.0739, + "step": 17372 + }, + { + "epoch": 0.34748, + "grad_norm": 0.9033929705619812, + "learning_rate": 1.6495011330984813e-05, + "loss": 0.0262, + "step": 17374 + }, + { + "epoch": 0.34752, + "grad_norm": 0.7439427375793457, + "learning_rate": 1.6493949603965553e-05, + "loss": 0.2256, + "step": 17376 + }, + { + "epoch": 0.34756, + "grad_norm": 3.400862216949463, + "learning_rate": 1.64928877503434e-05, + "loss": 0.2551, + "step": 17378 + }, + { + "epoch": 0.3476, + "grad_norm": 6.292299270629883, + "learning_rate": 1.649182577013906e-05, + "loss": 0.1315, + "step": 17380 + }, + { + "epoch": 0.34764, + "grad_norm": 3.6527719497680664, + "learning_rate": 1.6490763663373236e-05, + "loss": 0.158, + "step": 17382 + }, + { + "epoch": 0.34768, + "grad_norm": 1.5507051944732666, + "learning_rate": 1.6489701430066632e-05, + "loss": 0.0683, + "step": 17384 + }, + { + "epoch": 0.34772, + "grad_norm": 0.7033218741416931, + "learning_rate": 1.6488639070239956e-05, + "loss": 0.0183, + "step": 17386 + }, + { + "epoch": 0.34776, + "grad_norm": 2.3528406620025635, + "learning_rate": 1.648757658391392e-05, + "loss": 0.1365, + "step": 17388 + }, + { + "epoch": 0.3478, + "grad_norm": 0.2747696340084076, + "learning_rate": 1.6486513971109245e-05, + "loss": 0.1207, + "step": 17390 + }, + { + "epoch": 0.34784, + "grad_norm": 4.318127155303955, + "learning_rate": 1.6485451231846632e-05, + "loss": 0.1369, + "step": 17392 + }, + { + "epoch": 0.34788, + "grad_norm": 0.19665905833244324, + "learning_rate": 1.648438836614681e-05, + "loss": 0.1418, + "step": 17394 + }, + { + "epoch": 0.34792, + "grad_norm": 1.7573418617248535, + "learning_rate": 1.6483325374030502e-05, + "loss": 0.2723, + "step": 17396 + }, + { + "epoch": 0.34796, + "grad_norm": 0.6320146918296814, + "learning_rate": 1.6482262255518428e-05, + "loss": 0.0321, + "step": 17398 + }, + { + "epoch": 0.348, + "grad_norm": 1.4004027843475342, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.1262, + "step": 17400 + }, + { + "epoch": 0.34804, + "grad_norm": 8.702557563781738, + "learning_rate": 1.6480135639389882e-05, + "loss": 0.357, + "step": 17402 + }, + { + "epoch": 0.34808, + "grad_norm": 1.2897530794143677, + "learning_rate": 1.6479072141814874e-05, + "loss": 0.0278, + "step": 17404 + }, + { + "epoch": 0.34812, + "grad_norm": 0.5760429501533508, + "learning_rate": 1.647800851792702e-05, + "loss": 0.0538, + "step": 17406 + }, + { + "epoch": 0.34816, + "grad_norm": 3.233463764190674, + "learning_rate": 1.6476944767747057e-05, + "loss": 0.2537, + "step": 17408 + }, + { + "epoch": 0.3482, + "grad_norm": 5.9904046058654785, + "learning_rate": 1.6475880891295716e-05, + "loss": 0.2585, + "step": 17410 + }, + { + "epoch": 0.34824, + "grad_norm": 0.5717849135398865, + "learning_rate": 1.6474816888593744e-05, + "loss": 0.0455, + "step": 17412 + }, + { + "epoch": 0.34828, + "grad_norm": 1.503483533859253, + "learning_rate": 1.6473752759661885e-05, + "loss": 0.1344, + "step": 17414 + }, + { + "epoch": 0.34832, + "grad_norm": 1.5067806243896484, + "learning_rate": 1.6472688504520883e-05, + "loss": 0.0331, + "step": 17416 + }, + { + "epoch": 0.34836, + "grad_norm": 0.6267659664154053, + "learning_rate": 1.6471624123191485e-05, + "loss": 0.0433, + "step": 17418 + }, + { + "epoch": 0.3484, + "grad_norm": 1.8319287300109863, + "learning_rate": 1.6470559615694445e-05, + "loss": 0.1024, + "step": 17420 + }, + { + "epoch": 0.34844, + "grad_norm": 0.05861746892333031, + "learning_rate": 1.6469494982050513e-05, + "loss": 0.0224, + "step": 17422 + }, + { + "epoch": 0.34848, + "grad_norm": 1.9893465042114258, + "learning_rate": 1.6468430222280444e-05, + "loss": 0.0709, + "step": 17424 + }, + { + "epoch": 0.34852, + "grad_norm": 0.1076655164361, + "learning_rate": 1.6467365336405e-05, + "loss": 0.0317, + "step": 17426 + }, + { + "epoch": 0.34856, + "grad_norm": 1.4101063013076782, + "learning_rate": 1.646630032444494e-05, + "loss": 0.113, + "step": 17428 + }, + { + "epoch": 0.3486, + "grad_norm": 0.43820109963417053, + "learning_rate": 1.6465235186421024e-05, + "loss": 0.07, + "step": 17430 + }, + { + "epoch": 0.34864, + "grad_norm": 7.2431488037109375, + "learning_rate": 1.646416992235402e-05, + "loss": 0.5485, + "step": 17432 + }, + { + "epoch": 0.34868, + "grad_norm": 0.19328710436820984, + "learning_rate": 1.6463104532264692e-05, + "loss": 0.0067, + "step": 17434 + }, + { + "epoch": 0.34872, + "grad_norm": 1.3525421619415283, + "learning_rate": 1.6462039016173816e-05, + "loss": 0.0309, + "step": 17436 + }, + { + "epoch": 0.34876, + "grad_norm": 1.2906386852264404, + "learning_rate": 1.6460973374102164e-05, + "loss": 0.1112, + "step": 17438 + }, + { + "epoch": 0.3488, + "grad_norm": 0.24970024824142456, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.012, + "step": 17440 + }, + { + "epoch": 0.34884, + "grad_norm": 1.3090877532958984, + "learning_rate": 1.6458841712099628e-05, + "loss": 0.041, + "step": 17442 + }, + { + "epoch": 0.34888, + "grad_norm": 0.18988510966300964, + "learning_rate": 1.645777569221031e-05, + "loss": 0.0313, + "step": 17444 + }, + { + "epoch": 0.34892, + "grad_norm": 10.658203125, + "learning_rate": 1.645670954642332e-05, + "loss": 0.4152, + "step": 17446 + }, + { + "epoch": 0.34896, + "grad_norm": 0.2995603084564209, + "learning_rate": 1.645564327475946e-05, + "loss": 0.0121, + "step": 17448 + }, + { + "epoch": 0.349, + "grad_norm": 2.022160291671753, + "learning_rate": 1.645457687723951e-05, + "loss": 0.0628, + "step": 17450 + }, + { + "epoch": 0.34904, + "grad_norm": 0.8069949150085449, + "learning_rate": 1.6453510353884255e-05, + "loss": 0.2674, + "step": 17452 + }, + { + "epoch": 0.34908, + "grad_norm": 1.8396785259246826, + "learning_rate": 1.64524437047145e-05, + "loss": 0.1094, + "step": 17454 + }, + { + "epoch": 0.34912, + "grad_norm": 0.6209604144096375, + "learning_rate": 1.6451376929751028e-05, + "loss": 0.0673, + "step": 17456 + }, + { + "epoch": 0.34916, + "grad_norm": 0.03823308274149895, + "learning_rate": 1.6450310029014644e-05, + "loss": 0.0088, + "step": 17458 + }, + { + "epoch": 0.3492, + "grad_norm": 2.1965949535369873, + "learning_rate": 1.6449243002526146e-05, + "loss": 0.0473, + "step": 17460 + }, + { + "epoch": 0.34924, + "grad_norm": 0.1265733242034912, + "learning_rate": 1.6448175850306334e-05, + "loss": 0.2218, + "step": 17462 + }, + { + "epoch": 0.34928, + "grad_norm": 3.8314011096954346, + "learning_rate": 1.6447108572376015e-05, + "loss": 0.0985, + "step": 17464 + }, + { + "epoch": 0.34932, + "grad_norm": 5.064357757568359, + "learning_rate": 1.6446041168755994e-05, + "loss": 0.1073, + "step": 17466 + }, + { + "epoch": 0.34936, + "grad_norm": 0.11410260945558548, + "learning_rate": 1.644497363946708e-05, + "loss": 0.0548, + "step": 17468 + }, + { + "epoch": 0.3494, + "grad_norm": 0.5510149002075195, + "learning_rate": 1.6443905984530092e-05, + "loss": 0.0429, + "step": 17470 + }, + { + "epoch": 0.34944, + "grad_norm": 0.5685005187988281, + "learning_rate": 1.6442838203965834e-05, + "loss": 0.1632, + "step": 17472 + }, + { + "epoch": 0.34948, + "grad_norm": 3.018385887145996, + "learning_rate": 1.644177029779513e-05, + "loss": 0.0541, + "step": 17474 + }, + { + "epoch": 0.34952, + "grad_norm": 11.695815086364746, + "learning_rate": 1.64407022660388e-05, + "loss": 0.7071, + "step": 17476 + }, + { + "epoch": 0.34956, + "grad_norm": 0.47378674149513245, + "learning_rate": 1.643963410871766e-05, + "loss": 0.3428, + "step": 17478 + }, + { + "epoch": 0.3496, + "grad_norm": 4.640476226806641, + "learning_rate": 1.643856582585254e-05, + "loss": 0.217, + "step": 17480 + }, + { + "epoch": 0.34964, + "grad_norm": 0.3240664303302765, + "learning_rate": 1.6437497417464262e-05, + "loss": 0.0083, + "step": 17482 + }, + { + "epoch": 0.34968, + "grad_norm": 0.20197688043117523, + "learning_rate": 1.6436428883573658e-05, + "loss": 0.0563, + "step": 17484 + }, + { + "epoch": 0.34972, + "grad_norm": 4.597728252410889, + "learning_rate": 1.6435360224201563e-05, + "loss": 0.1274, + "step": 17486 + }, + { + "epoch": 0.34976, + "grad_norm": 1.4183919429779053, + "learning_rate": 1.6434291439368805e-05, + "loss": 0.0405, + "step": 17488 + }, + { + "epoch": 0.3498, + "grad_norm": 1.33935546875, + "learning_rate": 1.643322252909622e-05, + "loss": 0.0691, + "step": 17490 + }, + { + "epoch": 0.34984, + "grad_norm": 2.9574713706970215, + "learning_rate": 1.6432153493404654e-05, + "loss": 0.0628, + "step": 17492 + }, + { + "epoch": 0.34988, + "grad_norm": 4.022613048553467, + "learning_rate": 1.643108433231494e-05, + "loss": 0.0861, + "step": 17494 + }, + { + "epoch": 0.34992, + "grad_norm": 2.252579689025879, + "learning_rate": 1.643001504584793e-05, + "loss": 0.0435, + "step": 17496 + }, + { + "epoch": 0.34996, + "grad_norm": 3.457031011581421, + "learning_rate": 1.6428945634024464e-05, + "loss": 0.1074, + "step": 17498 + }, + { + "epoch": 0.35, + "grad_norm": 1.6419892311096191, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.0645, + "step": 17500 + }, + { + "epoch": 0.35004, + "grad_norm": 0.03874907270073891, + "learning_rate": 1.642680643439157e-05, + "loss": 0.0797, + "step": 17502 + }, + { + "epoch": 0.35008, + "grad_norm": 0.7197763919830322, + "learning_rate": 1.6425736646623847e-05, + "loss": 0.2172, + "step": 17504 + }, + { + "epoch": 0.35012, + "grad_norm": 1.3315032720565796, + "learning_rate": 1.6424666733583078e-05, + "loss": 0.0771, + "step": 17506 + }, + { + "epoch": 0.35016, + "grad_norm": 1.5590792894363403, + "learning_rate": 1.6423596695290124e-05, + "loss": 0.0723, + "step": 17508 + }, + { + "epoch": 0.3502, + "grad_norm": 1.6399348974227905, + "learning_rate": 1.6422526531765846e-05, + "loss": 0.1755, + "step": 17510 + }, + { + "epoch": 0.35024, + "grad_norm": 0.587476909160614, + "learning_rate": 1.6421456243031105e-05, + "loss": 0.1276, + "step": 17512 + }, + { + "epoch": 0.35028, + "grad_norm": 0.4333774149417877, + "learning_rate": 1.642038582910677e-05, + "loss": 0.0095, + "step": 17514 + }, + { + "epoch": 0.35032, + "grad_norm": 0.17268478870391846, + "learning_rate": 1.6419315290013708e-05, + "loss": 0.007, + "step": 17516 + }, + { + "epoch": 0.35036, + "grad_norm": 5.419097423553467, + "learning_rate": 1.6418244625772788e-05, + "loss": 0.2064, + "step": 17518 + }, + { + "epoch": 0.3504, + "grad_norm": 1.7944759130477905, + "learning_rate": 1.6417173836404888e-05, + "loss": 0.0605, + "step": 17520 + }, + { + "epoch": 0.35044, + "grad_norm": 2.0803637504577637, + "learning_rate": 1.6416102921930876e-05, + "loss": 0.0893, + "step": 17522 + }, + { + "epoch": 0.35048, + "grad_norm": 7.612325668334961, + "learning_rate": 1.6415031882371635e-05, + "loss": 0.4547, + "step": 17524 + }, + { + "epoch": 0.35052, + "grad_norm": 5.425506114959717, + "learning_rate": 1.6413960717748048e-05, + "loss": 0.1579, + "step": 17526 + }, + { + "epoch": 0.35056, + "grad_norm": 1.4144898653030396, + "learning_rate": 1.6412889428080992e-05, + "loss": 0.1093, + "step": 17528 + }, + { + "epoch": 0.3506, + "grad_norm": 9.354620933532715, + "learning_rate": 1.6411818013391357e-05, + "loss": 0.1917, + "step": 17530 + }, + { + "epoch": 0.35064, + "grad_norm": 0.4380197525024414, + "learning_rate": 1.6410746473700026e-05, + "loss": 0.0223, + "step": 17532 + }, + { + "epoch": 0.35068, + "grad_norm": 0.16187433898448944, + "learning_rate": 1.6409674809027892e-05, + "loss": 0.0244, + "step": 17534 + }, + { + "epoch": 0.35072, + "grad_norm": 8.707640647888184, + "learning_rate": 1.640860301939585e-05, + "loss": 0.3983, + "step": 17536 + }, + { + "epoch": 0.35076, + "grad_norm": 2.409919023513794, + "learning_rate": 1.6407531104824793e-05, + "loss": 0.0568, + "step": 17538 + }, + { + "epoch": 0.3508, + "grad_norm": 0.004726950079202652, + "learning_rate": 1.6406459065335616e-05, + "loss": 0.3919, + "step": 17540 + }, + { + "epoch": 0.35084, + "grad_norm": 0.8846420049667358, + "learning_rate": 1.6405386900949222e-05, + "loss": 0.4276, + "step": 17542 + }, + { + "epoch": 0.35088, + "grad_norm": 0.007295698393136263, + "learning_rate": 1.6404314611686513e-05, + "loss": 0.0717, + "step": 17544 + }, + { + "epoch": 0.35092, + "grad_norm": 5.3613457679748535, + "learning_rate": 1.6403242197568396e-05, + "loss": 0.1164, + "step": 17546 + }, + { + "epoch": 0.35096, + "grad_norm": 1.5016878843307495, + "learning_rate": 1.6402169658615773e-05, + "loss": 0.1263, + "step": 17548 + }, + { + "epoch": 0.351, + "grad_norm": 0.32711920142173767, + "learning_rate": 1.6401096994849558e-05, + "loss": 0.0227, + "step": 17550 + }, + { + "epoch": 0.35104, + "grad_norm": 3.5603675842285156, + "learning_rate": 1.6400024206290657e-05, + "loss": 0.0817, + "step": 17552 + }, + { + "epoch": 0.35108, + "grad_norm": 0.8301962614059448, + "learning_rate": 1.6398951292959995e-05, + "loss": 0.0195, + "step": 17554 + }, + { + "epoch": 0.35112, + "grad_norm": 0.0553896427154541, + "learning_rate": 1.639787825487848e-05, + "loss": 0.0279, + "step": 17556 + }, + { + "epoch": 0.35116, + "grad_norm": 0.11809836328029633, + "learning_rate": 1.6396805092067036e-05, + "loss": 0.0107, + "step": 17558 + }, + { + "epoch": 0.3512, + "grad_norm": 0.2369593232870102, + "learning_rate": 1.6395731804546582e-05, + "loss": 0.0627, + "step": 17560 + }, + { + "epoch": 0.35124, + "grad_norm": 1.4466736316680908, + "learning_rate": 1.6394658392338044e-05, + "loss": 0.0333, + "step": 17562 + }, + { + "epoch": 0.35128, + "grad_norm": 2.6136088371276855, + "learning_rate": 1.639358485546235e-05, + "loss": 0.079, + "step": 17564 + }, + { + "epoch": 0.35132, + "grad_norm": 4.118597507476807, + "learning_rate": 1.6392511193940427e-05, + "loss": 0.0902, + "step": 17566 + }, + { + "epoch": 0.35136, + "grad_norm": 5.183708190917969, + "learning_rate": 1.6391437407793206e-05, + "loss": 0.1395, + "step": 17568 + }, + { + "epoch": 0.3514, + "grad_norm": 7.481127738952637, + "learning_rate": 1.639036349704162e-05, + "loss": 0.2544, + "step": 17570 + }, + { + "epoch": 0.35144, + "grad_norm": 1.5896517038345337, + "learning_rate": 1.6389289461706613e-05, + "loss": 0.0486, + "step": 17572 + }, + { + "epoch": 0.35148, + "grad_norm": 10.001874923706055, + "learning_rate": 1.6388215301809113e-05, + "loss": 0.3812, + "step": 17574 + }, + { + "epoch": 0.35152, + "grad_norm": 5.111035346984863, + "learning_rate": 1.638714101737007e-05, + "loss": 0.1979, + "step": 17576 + }, + { + "epoch": 0.35156, + "grad_norm": 13.514886856079102, + "learning_rate": 1.6386066608410426e-05, + "loss": 0.5133, + "step": 17578 + }, + { + "epoch": 0.3516, + "grad_norm": 0.16910775005817413, + "learning_rate": 1.6384992074951124e-05, + "loss": 0.0552, + "step": 17580 + }, + { + "epoch": 0.35164, + "grad_norm": 0.14981743693351746, + "learning_rate": 1.6383917417013115e-05, + "loss": 0.0267, + "step": 17582 + }, + { + "epoch": 0.35168, + "grad_norm": 0.1830967515707016, + "learning_rate": 1.638284263461735e-05, + "loss": 0.1928, + "step": 17584 + }, + { + "epoch": 0.35172, + "grad_norm": 5.3207478523254395, + "learning_rate": 1.638176772778478e-05, + "loss": 0.3424, + "step": 17586 + }, + { + "epoch": 0.35176, + "grad_norm": 0.5542528033256531, + "learning_rate": 1.6380692696536364e-05, + "loss": 0.0362, + "step": 17588 + }, + { + "epoch": 0.3518, + "grad_norm": 2.7574920654296875, + "learning_rate": 1.6379617540893056e-05, + "loss": 0.0704, + "step": 17590 + }, + { + "epoch": 0.35184, + "grad_norm": 5.725733280181885, + "learning_rate": 1.637854226087582e-05, + "loss": 0.2463, + "step": 17592 + }, + { + "epoch": 0.35188, + "grad_norm": 0.9772158861160278, + "learning_rate": 1.6377466856505622e-05, + "loss": 0.1948, + "step": 17594 + }, + { + "epoch": 0.35192, + "grad_norm": 4.7497239112854, + "learning_rate": 1.637639132780342e-05, + "loss": 0.1269, + "step": 17596 + }, + { + "epoch": 0.35196, + "grad_norm": 4.0849833488464355, + "learning_rate": 1.637531567479019e-05, + "loss": 0.2342, + "step": 17598 + }, + { + "epoch": 0.352, + "grad_norm": 0.19445233047008514, + "learning_rate": 1.63742398974869e-05, + "loss": 0.3045, + "step": 17600 + }, + { + "epoch": 0.35204, + "grad_norm": 1.1301147937774658, + "learning_rate": 1.637316399591452e-05, + "loss": 0.1299, + "step": 17602 + }, + { + "epoch": 0.35208, + "grad_norm": 2.993999719619751, + "learning_rate": 1.6372087970094024e-05, + "loss": 0.2722, + "step": 17604 + }, + { + "epoch": 0.35212, + "grad_norm": 0.27229446172714233, + "learning_rate": 1.6371011820046393e-05, + "loss": 0.0189, + "step": 17606 + }, + { + "epoch": 0.35216, + "grad_norm": 0.8071450591087341, + "learning_rate": 1.6369935545792608e-05, + "loss": 0.0418, + "step": 17608 + }, + { + "epoch": 0.3522, + "grad_norm": 0.2753044068813324, + "learning_rate": 1.636885914735365e-05, + "loss": 0.2713, + "step": 17610 + }, + { + "epoch": 0.35224, + "grad_norm": 2.1007473468780518, + "learning_rate": 1.6367782624750502e-05, + "loss": 0.1434, + "step": 17612 + }, + { + "epoch": 0.35228, + "grad_norm": 5.932200908660889, + "learning_rate": 1.6366705978004157e-05, + "loss": 0.1563, + "step": 17614 + }, + { + "epoch": 0.35232, + "grad_norm": 2.718043088912964, + "learning_rate": 1.63656292071356e-05, + "loss": 0.1439, + "step": 17616 + }, + { + "epoch": 0.35236, + "grad_norm": 1.370788812637329, + "learning_rate": 1.6364552312165822e-05, + "loss": 0.0639, + "step": 17618 + }, + { + "epoch": 0.3524, + "grad_norm": 0.24267466366291046, + "learning_rate": 1.6363475293115824e-05, + "loss": 0.0234, + "step": 17620 + }, + { + "epoch": 0.35244, + "grad_norm": 0.532560408115387, + "learning_rate": 1.6362398150006596e-05, + "loss": 0.1495, + "step": 17622 + }, + { + "epoch": 0.35248, + "grad_norm": 0.23399797081947327, + "learning_rate": 1.636132088285914e-05, + "loss": 0.0515, + "step": 17624 + }, + { + "epoch": 0.35252, + "grad_norm": 2.9876439571380615, + "learning_rate": 1.636024349169446e-05, + "loss": 0.0815, + "step": 17626 + }, + { + "epoch": 0.35256, + "grad_norm": 0.6955809593200684, + "learning_rate": 1.6359165976533556e-05, + "loss": 0.0182, + "step": 17628 + }, + { + "epoch": 0.3526, + "grad_norm": 6.298573017120361, + "learning_rate": 1.6358088337397444e-05, + "loss": 0.2159, + "step": 17630 + }, + { + "epoch": 0.35264, + "grad_norm": 0.9497578144073486, + "learning_rate": 1.635701057430712e-05, + "loss": 0.0195, + "step": 17632 + }, + { + "epoch": 0.35268, + "grad_norm": 1.2758201360702515, + "learning_rate": 1.6355932687283605e-05, + "loss": 0.0319, + "step": 17634 + }, + { + "epoch": 0.35272, + "grad_norm": 6.975235939025879, + "learning_rate": 1.635485467634791e-05, + "loss": 0.548, + "step": 17636 + }, + { + "epoch": 0.35276, + "grad_norm": 0.33492517471313477, + "learning_rate": 1.635377654152105e-05, + "loss": 0.0104, + "step": 17638 + }, + { + "epoch": 0.3528, + "grad_norm": 12.547435760498047, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.6802, + "step": 17640 + }, + { + "epoch": 0.35284, + "grad_norm": 1.5387061834335327, + "learning_rate": 1.6351619900277916e-05, + "loss": 0.0664, + "step": 17642 + }, + { + "epoch": 0.35288, + "grad_norm": 0.1413160115480423, + "learning_rate": 1.635054139390369e-05, + "loss": 0.1411, + "step": 17644 + }, + { + "epoch": 0.35292, + "grad_norm": 1.8437918424606323, + "learning_rate": 1.6349462763722387e-05, + "loss": 0.0647, + "step": 17646 + }, + { + "epoch": 0.35296, + "grad_norm": 1.8176966905593872, + "learning_rate": 1.634838400975504e-05, + "loss": 0.2614, + "step": 17648 + }, + { + "epoch": 0.353, + "grad_norm": 0.053951043635606766, + "learning_rate": 1.6347305132022677e-05, + "loss": 0.028, + "step": 17650 + }, + { + "epoch": 0.35304, + "grad_norm": 0.04534218832850456, + "learning_rate": 1.6346226130546335e-05, + "loss": 0.0973, + "step": 17652 + }, + { + "epoch": 0.35308, + "grad_norm": 1.0749170780181885, + "learning_rate": 1.6345147005347044e-05, + "loss": 0.4159, + "step": 17654 + }, + { + "epoch": 0.35312, + "grad_norm": 2.939535140991211, + "learning_rate": 1.6344067756445848e-05, + "loss": 0.222, + "step": 17656 + }, + { + "epoch": 0.35316, + "grad_norm": 0.008629345335066319, + "learning_rate": 1.6342988383863785e-05, + "loss": 0.0651, + "step": 17658 + }, + { + "epoch": 0.3532, + "grad_norm": 0.33465370535850525, + "learning_rate": 1.6341908887621894e-05, + "loss": 0.0279, + "step": 17660 + }, + { + "epoch": 0.35324, + "grad_norm": 1.5664665699005127, + "learning_rate": 1.6340829267741233e-05, + "loss": 0.0498, + "step": 17662 + }, + { + "epoch": 0.35328, + "grad_norm": 3.273545980453491, + "learning_rate": 1.6339749524242833e-05, + "loss": 0.1025, + "step": 17664 + }, + { + "epoch": 0.35332, + "grad_norm": 0.23870788514614105, + "learning_rate": 1.6338669657147756e-05, + "loss": 0.3235, + "step": 17666 + }, + { + "epoch": 0.35336, + "grad_norm": 2.2871053218841553, + "learning_rate": 1.6337589666477047e-05, + "loss": 0.0826, + "step": 17668 + }, + { + "epoch": 0.3534, + "grad_norm": 7.519972324371338, + "learning_rate": 1.6336509552251766e-05, + "loss": 0.2923, + "step": 17670 + }, + { + "epoch": 0.35344, + "grad_norm": 0.9769750237464905, + "learning_rate": 1.633542931449297e-05, + "loss": 0.0265, + "step": 17672 + }, + { + "epoch": 0.35348, + "grad_norm": 4.102519989013672, + "learning_rate": 1.633434895322172e-05, + "loss": 0.1292, + "step": 17674 + }, + { + "epoch": 0.35352, + "grad_norm": 2.954634666442871, + "learning_rate": 1.633326846845907e-05, + "loss": 0.1906, + "step": 17676 + }, + { + "epoch": 0.35356, + "grad_norm": 2.017524003982544, + "learning_rate": 1.6332187860226096e-05, + "loss": 0.1056, + "step": 17678 + }, + { + "epoch": 0.3536, + "grad_norm": 2.270111083984375, + "learning_rate": 1.6331107128543856e-05, + "loss": 0.1459, + "step": 17680 + }, + { + "epoch": 0.35364, + "grad_norm": 3.0380306243896484, + "learning_rate": 1.633002627343343e-05, + "loss": 0.1045, + "step": 17682 + }, + { + "epoch": 0.35368, + "grad_norm": 0.41562411189079285, + "learning_rate": 1.6328945294915875e-05, + "loss": 0.0103, + "step": 17684 + }, + { + "epoch": 0.35372, + "grad_norm": 1.2088533639907837, + "learning_rate": 1.6327864193012274e-05, + "loss": 0.037, + "step": 17686 + }, + { + "epoch": 0.35376, + "grad_norm": 0.4525662064552307, + "learning_rate": 1.6326782967743706e-05, + "loss": 0.0487, + "step": 17688 + }, + { + "epoch": 0.3538, + "grad_norm": 0.10311990976333618, + "learning_rate": 1.6325701619131246e-05, + "loss": 0.0261, + "step": 17690 + }, + { + "epoch": 0.35384, + "grad_norm": 0.2833750247955322, + "learning_rate": 1.6324620147195974e-05, + "loss": 0.0223, + "step": 17692 + }, + { + "epoch": 0.35388, + "grad_norm": 0.08948279172182083, + "learning_rate": 1.632353855195898e-05, + "loss": 0.0603, + "step": 17694 + }, + { + "epoch": 0.35392, + "grad_norm": 0.20968782901763916, + "learning_rate": 1.6322456833441346e-05, + "loss": 0.0511, + "step": 17696 + }, + { + "epoch": 0.35396, + "grad_norm": 2.681591749191284, + "learning_rate": 1.6321374991664157e-05, + "loss": 0.0608, + "step": 17698 + }, + { + "epoch": 0.354, + "grad_norm": 2.6466805934906006, + "learning_rate": 1.632029302664851e-05, + "loss": 0.1051, + "step": 17700 + }, + { + "epoch": 0.35404, + "grad_norm": 1.6002295017242432, + "learning_rate": 1.63192109384155e-05, + "loss": 0.0923, + "step": 17702 + }, + { + "epoch": 0.35408, + "grad_norm": 0.2336399108171463, + "learning_rate": 1.6318128726986212e-05, + "loss": 0.02, + "step": 17704 + }, + { + "epoch": 0.35412, + "grad_norm": 0.06375624239444733, + "learning_rate": 1.6317046392381757e-05, + "loss": 0.0202, + "step": 17706 + }, + { + "epoch": 0.35416, + "grad_norm": 10.194836616516113, + "learning_rate": 1.6315963934623228e-05, + "loss": 0.6525, + "step": 17708 + }, + { + "epoch": 0.3542, + "grad_norm": 5.477636337280273, + "learning_rate": 1.6314881353731733e-05, + "loss": 0.1906, + "step": 17710 + }, + { + "epoch": 0.35424, + "grad_norm": 0.39633122086524963, + "learning_rate": 1.6313798649728373e-05, + "loss": 0.0377, + "step": 17712 + }, + { + "epoch": 0.35428, + "grad_norm": 2.5069358348846436, + "learning_rate": 1.631271582263426e-05, + "loss": 0.1364, + "step": 17714 + }, + { + "epoch": 0.35432, + "grad_norm": 4.559687614440918, + "learning_rate": 1.63116328724705e-05, + "loss": 0.1189, + "step": 17716 + }, + { + "epoch": 0.35436, + "grad_norm": 0.5751075148582458, + "learning_rate": 1.631054979925821e-05, + "loss": 0.0132, + "step": 17718 + }, + { + "epoch": 0.3544, + "grad_norm": 8.445002555847168, + "learning_rate": 1.6309466603018497e-05, + "loss": 0.8408, + "step": 17720 + }, + { + "epoch": 0.35444, + "grad_norm": 5.044352054595947, + "learning_rate": 1.6308383283772488e-05, + "loss": 0.1404, + "step": 17722 + }, + { + "epoch": 0.35448, + "grad_norm": 0.08314798772335052, + "learning_rate": 1.63072998415413e-05, + "loss": 0.0112, + "step": 17724 + }, + { + "epoch": 0.35452, + "grad_norm": 1.1585147380828857, + "learning_rate": 1.6306216276346054e-05, + "loss": 0.0328, + "step": 17726 + }, + { + "epoch": 0.35456, + "grad_norm": 0.6570413112640381, + "learning_rate": 1.6305132588207873e-05, + "loss": 0.0657, + "step": 17728 + }, + { + "epoch": 0.3546, + "grad_norm": 3.4231345653533936, + "learning_rate": 1.630404877714789e-05, + "loss": 0.1454, + "step": 17730 + }, + { + "epoch": 0.35464, + "grad_norm": 0.9478602409362793, + "learning_rate": 1.6302964843187226e-05, + "loss": 0.1895, + "step": 17732 + }, + { + "epoch": 0.35468, + "grad_norm": 0.710604190826416, + "learning_rate": 1.630188078634702e-05, + "loss": 0.3006, + "step": 17734 + }, + { + "epoch": 0.35472, + "grad_norm": 0.17982326447963715, + "learning_rate": 1.6300796606648402e-05, + "loss": 0.0203, + "step": 17736 + }, + { + "epoch": 0.35476, + "grad_norm": 0.7902155518531799, + "learning_rate": 1.6299712304112514e-05, + "loss": 0.0244, + "step": 17738 + }, + { + "epoch": 0.3548, + "grad_norm": 3.0208725929260254, + "learning_rate": 1.6298627878760488e-05, + "loss": 0.338, + "step": 17740 + }, + { + "epoch": 0.35484, + "grad_norm": 0.6516825556755066, + "learning_rate": 1.6297543330613472e-05, + "loss": 0.0222, + "step": 17742 + }, + { + "epoch": 0.35488, + "grad_norm": 7.32478141784668, + "learning_rate": 1.62964586596926e-05, + "loss": 0.3725, + "step": 17744 + }, + { + "epoch": 0.35492, + "grad_norm": 0.15216147899627686, + "learning_rate": 1.629537386601903e-05, + "loss": 0.0372, + "step": 17746 + }, + { + "epoch": 0.35496, + "grad_norm": 2.3382368087768555, + "learning_rate": 1.6294288949613907e-05, + "loss": 0.1062, + "step": 17748 + }, + { + "epoch": 0.355, + "grad_norm": 0.26763275265693665, + "learning_rate": 1.6293203910498375e-05, + "loss": 0.009, + "step": 17750 + }, + { + "epoch": 0.35504, + "grad_norm": 0.2650817930698395, + "learning_rate": 1.6292118748693596e-05, + "loss": 0.1212, + "step": 17752 + }, + { + "epoch": 0.35508, + "grad_norm": 2.4351015090942383, + "learning_rate": 1.6291033464220724e-05, + "loss": 0.08, + "step": 17754 + }, + { + "epoch": 0.35512, + "grad_norm": 1.5336289405822754, + "learning_rate": 1.6289948057100913e-05, + "loss": 0.0515, + "step": 17756 + }, + { + "epoch": 0.35516, + "grad_norm": 1.5522749423980713, + "learning_rate": 1.6288862527355325e-05, + "loss": 0.0755, + "step": 17758 + }, + { + "epoch": 0.3552, + "grad_norm": 0.5478784441947937, + "learning_rate": 1.628777687500513e-05, + "loss": 0.1514, + "step": 17760 + }, + { + "epoch": 0.35524, + "grad_norm": 0.33437252044677734, + "learning_rate": 1.6286691100071484e-05, + "loss": 0.072, + "step": 17762 + }, + { + "epoch": 0.35528, + "grad_norm": 0.4933164119720459, + "learning_rate": 1.628560520257556e-05, + "loss": 0.0835, + "step": 17764 + }, + { + "epoch": 0.35532, + "grad_norm": 2.4641294479370117, + "learning_rate": 1.6284519182538522e-05, + "loss": 0.0964, + "step": 17766 + }, + { + "epoch": 0.35536, + "grad_norm": 1.0652782917022705, + "learning_rate": 1.6283433039981552e-05, + "loss": 0.1264, + "step": 17768 + }, + { + "epoch": 0.3554, + "grad_norm": 0.9257857203483582, + "learning_rate": 1.6282346774925816e-05, + "loss": 0.0296, + "step": 17770 + }, + { + "epoch": 0.35544, + "grad_norm": 0.5775297284126282, + "learning_rate": 1.62812603873925e-05, + "loss": 0.0438, + "step": 17772 + }, + { + "epoch": 0.35548, + "grad_norm": 0.5298619270324707, + "learning_rate": 1.6280173877402775e-05, + "loss": 0.0605, + "step": 17774 + }, + { + "epoch": 0.35552, + "grad_norm": 1.4354290962219238, + "learning_rate": 1.6279087244977828e-05, + "loss": 0.1714, + "step": 17776 + }, + { + "epoch": 0.35556, + "grad_norm": 0.8040006756782532, + "learning_rate": 1.6278000490138843e-05, + "loss": 0.0426, + "step": 17778 + }, + { + "epoch": 0.3556, + "grad_norm": 4.438948154449463, + "learning_rate": 1.6276913612907005e-05, + "loss": 0.3668, + "step": 17780 + }, + { + "epoch": 0.35564, + "grad_norm": 0.7024829387664795, + "learning_rate": 1.6275826613303508e-05, + "loss": 0.0218, + "step": 17782 + }, + { + "epoch": 0.35568, + "grad_norm": 0.8861123323440552, + "learning_rate": 1.6274739491349537e-05, + "loss": 0.062, + "step": 17784 + }, + { + "epoch": 0.35572, + "grad_norm": 0.9333793520927429, + "learning_rate": 1.627365224706629e-05, + "loss": 0.0453, + "step": 17786 + }, + { + "epoch": 0.35576, + "grad_norm": 9.540788650512695, + "learning_rate": 1.6272564880474962e-05, + "loss": 0.4422, + "step": 17788 + }, + { + "epoch": 0.3558, + "grad_norm": 1.451034426689148, + "learning_rate": 1.6271477391596754e-05, + "loss": 0.3564, + "step": 17790 + }, + { + "epoch": 0.35584, + "grad_norm": 1.2275645732879639, + "learning_rate": 1.6270389780452864e-05, + "loss": 0.0526, + "step": 17792 + }, + { + "epoch": 0.35588, + "grad_norm": 0.2352413833141327, + "learning_rate": 1.6269302047064497e-05, + "loss": 0.0059, + "step": 17794 + }, + { + "epoch": 0.35592, + "grad_norm": 0.041367024183273315, + "learning_rate": 1.626821419145286e-05, + "loss": 0.0082, + "step": 17796 + }, + { + "epoch": 0.35596, + "grad_norm": 1.6044394969940186, + "learning_rate": 1.6267126213639158e-05, + "loss": 0.0469, + "step": 17798 + }, + { + "epoch": 0.356, + "grad_norm": 0.7549530267715454, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.0612, + "step": 17800 + }, + { + "epoch": 0.35604, + "grad_norm": 0.3308250308036804, + "learning_rate": 1.6264949891490415e-05, + "loss": 0.0316, + "step": 17802 + }, + { + "epoch": 0.35608, + "grad_norm": 1.9390405416488647, + "learning_rate": 1.62638615471978e-05, + "loss": 0.0505, + "step": 17804 + }, + { + "epoch": 0.35612, + "grad_norm": 0.03847414627671242, + "learning_rate": 1.6262773080787982e-05, + "loss": 0.0275, + "step": 17806 + }, + { + "epoch": 0.35616, + "grad_norm": 3.0420050621032715, + "learning_rate": 1.6261684492282173e-05, + "loss": 0.1912, + "step": 17808 + }, + { + "epoch": 0.3562, + "grad_norm": 0.2638985812664032, + "learning_rate": 1.6260595781701605e-05, + "loss": 0.0194, + "step": 17810 + }, + { + "epoch": 0.35624, + "grad_norm": 5.974759578704834, + "learning_rate": 1.6259506949067497e-05, + "loss": 0.5449, + "step": 17812 + }, + { + "epoch": 0.35628, + "grad_norm": 0.14410942792892456, + "learning_rate": 1.6258417994401078e-05, + "loss": 0.0913, + "step": 17814 + }, + { + "epoch": 0.35632, + "grad_norm": 1.4921376705169678, + "learning_rate": 1.625732891772358e-05, + "loss": 0.0551, + "step": 17816 + }, + { + "epoch": 0.35636, + "grad_norm": 0.5748323798179626, + "learning_rate": 1.6256239719056232e-05, + "loss": 0.329, + "step": 17818 + }, + { + "epoch": 0.3564, + "grad_norm": 0.11737240105867386, + "learning_rate": 1.6255150398420273e-05, + "loss": 0.1176, + "step": 17820 + }, + { + "epoch": 0.35644, + "grad_norm": 3.1447360515594482, + "learning_rate": 1.6254060955836933e-05, + "loss": 0.0989, + "step": 17822 + }, + { + "epoch": 0.35648, + "grad_norm": 1.6444171667099, + "learning_rate": 1.6252971391327455e-05, + "loss": 0.0364, + "step": 17824 + }, + { + "epoch": 0.35652, + "grad_norm": 1.8541412353515625, + "learning_rate": 1.625188170491308e-05, + "loss": 0.0451, + "step": 17826 + }, + { + "epoch": 0.35656, + "grad_norm": 1.5776764154434204, + "learning_rate": 1.6250791896615057e-05, + "loss": 0.2343, + "step": 17828 + }, + { + "epoch": 0.3566, + "grad_norm": 7.487852573394775, + "learning_rate": 1.6249701966454626e-05, + "loss": 0.3052, + "step": 17830 + }, + { + "epoch": 0.35664, + "grad_norm": 0.8934144973754883, + "learning_rate": 1.6248611914453035e-05, + "loss": 0.0389, + "step": 17832 + }, + { + "epoch": 0.35668, + "grad_norm": 0.3555889427661896, + "learning_rate": 1.624752174063154e-05, + "loss": 0.0263, + "step": 17834 + }, + { + "epoch": 0.35672, + "grad_norm": 6.163067817687988, + "learning_rate": 1.624643144501139e-05, + "loss": 0.2211, + "step": 17836 + }, + { + "epoch": 0.35676, + "grad_norm": 5.122868537902832, + "learning_rate": 1.6245341027613847e-05, + "loss": 0.1251, + "step": 17838 + }, + { + "epoch": 0.3568, + "grad_norm": 1.2979416847229004, + "learning_rate": 1.624425048846016e-05, + "loss": 0.0798, + "step": 17840 + }, + { + "epoch": 0.35684, + "grad_norm": 4.061666011810303, + "learning_rate": 1.6243159827571598e-05, + "loss": 0.118, + "step": 17842 + }, + { + "epoch": 0.35688, + "grad_norm": 0.12677527964115143, + "learning_rate": 1.624206904496942e-05, + "loss": 0.005, + "step": 17844 + }, + { + "epoch": 0.35692, + "grad_norm": 0.07220792025327682, + "learning_rate": 1.6240978140674895e-05, + "loss": 0.1911, + "step": 17846 + }, + { + "epoch": 0.35696, + "grad_norm": 1.184763789176941, + "learning_rate": 1.6239887114709282e-05, + "loss": 0.1213, + "step": 17848 + }, + { + "epoch": 0.357, + "grad_norm": 0.3839258551597595, + "learning_rate": 1.6238795967093865e-05, + "loss": 0.0461, + "step": 17850 + }, + { + "epoch": 0.35704, + "grad_norm": 2.804851770401001, + "learning_rate": 1.6237704697849903e-05, + "loss": 0.0863, + "step": 17852 + }, + { + "epoch": 0.35708, + "grad_norm": 0.11908572912216187, + "learning_rate": 1.6236613306998678e-05, + "loss": 0.0236, + "step": 17854 + }, + { + "epoch": 0.35712, + "grad_norm": 0.2414737492799759, + "learning_rate": 1.6235521794561467e-05, + "loss": 0.0258, + "step": 17856 + }, + { + "epoch": 0.35716, + "grad_norm": 5.759023666381836, + "learning_rate": 1.6234430160559548e-05, + "loss": 0.3027, + "step": 17858 + }, + { + "epoch": 0.3572, + "grad_norm": 6.173045635223389, + "learning_rate": 1.6233338405014204e-05, + "loss": 0.2052, + "step": 17860 + }, + { + "epoch": 0.35724, + "grad_norm": 0.4764063060283661, + "learning_rate": 1.6232246527946718e-05, + "loss": 0.1699, + "step": 17862 + }, + { + "epoch": 0.35728, + "grad_norm": 3.238454818725586, + "learning_rate": 1.6231154529378376e-05, + "loss": 0.121, + "step": 17864 + }, + { + "epoch": 0.35732, + "grad_norm": 7.711775302886963, + "learning_rate": 1.6230062409330466e-05, + "loss": 0.2066, + "step": 17866 + }, + { + "epoch": 0.35736, + "grad_norm": 0.13444127142429352, + "learning_rate": 1.6228970167824286e-05, + "loss": 0.1791, + "step": 17868 + }, + { + "epoch": 0.3574, + "grad_norm": 0.0949302688241005, + "learning_rate": 1.6227877804881126e-05, + "loss": 0.0112, + "step": 17870 + }, + { + "epoch": 0.35744, + "grad_norm": 0.04980006441473961, + "learning_rate": 1.6226785320522283e-05, + "loss": 0.0312, + "step": 17872 + }, + { + "epoch": 0.35748, + "grad_norm": 1.036314606666565, + "learning_rate": 1.622569271476905e-05, + "loss": 0.0231, + "step": 17874 + }, + { + "epoch": 0.35752, + "grad_norm": 0.16494622826576233, + "learning_rate": 1.622459998764273e-05, + "loss": 0.0304, + "step": 17876 + }, + { + "epoch": 0.35756, + "grad_norm": 0.1960364729166031, + "learning_rate": 1.6223507139164637e-05, + "loss": 0.1016, + "step": 17878 + }, + { + "epoch": 0.3576, + "grad_norm": 1.4493738412857056, + "learning_rate": 1.6222414169356066e-05, + "loss": 0.0674, + "step": 17880 + }, + { + "epoch": 0.35764, + "grad_norm": 6.817080974578857, + "learning_rate": 1.6221321078238327e-05, + "loss": 0.38, + "step": 17882 + }, + { + "epoch": 0.35768, + "grad_norm": 0.0738503485918045, + "learning_rate": 1.6220227865832726e-05, + "loss": 0.3934, + "step": 17884 + }, + { + "epoch": 0.35772, + "grad_norm": 0.5917087197303772, + "learning_rate": 1.6219134532160584e-05, + "loss": 0.0454, + "step": 17886 + }, + { + "epoch": 0.35776, + "grad_norm": 1.5912224054336548, + "learning_rate": 1.6218041077243213e-05, + "loss": 0.0941, + "step": 17888 + }, + { + "epoch": 0.3578, + "grad_norm": 0.5052757859230042, + "learning_rate": 1.621694750110193e-05, + "loss": 0.0241, + "step": 17890 + }, + { + "epoch": 0.35784, + "grad_norm": 0.2181224524974823, + "learning_rate": 1.6215853803758054e-05, + "loss": 0.0143, + "step": 17892 + }, + { + "epoch": 0.35788, + "grad_norm": 5.16738224029541, + "learning_rate": 1.6214759985232905e-05, + "loss": 0.2105, + "step": 17894 + }, + { + "epoch": 0.35792, + "grad_norm": 0.13268977403640747, + "learning_rate": 1.6213666045547815e-05, + "loss": 0.0173, + "step": 17896 + }, + { + "epoch": 0.35796, + "grad_norm": 0.2823771834373474, + "learning_rate": 1.6212571984724104e-05, + "loss": 0.0154, + "step": 17898 + }, + { + "epoch": 0.358, + "grad_norm": 5.8488030433654785, + "learning_rate": 1.6211477802783105e-05, + "loss": 0.6708, + "step": 17900 + }, + { + "epoch": 0.35804, + "grad_norm": 0.2501814663410187, + "learning_rate": 1.621038349974615e-05, + "loss": 0.1019, + "step": 17902 + }, + { + "epoch": 0.35808, + "grad_norm": 1.4544960260391235, + "learning_rate": 1.6209289075634568e-05, + "loss": 0.218, + "step": 17904 + }, + { + "epoch": 0.35812, + "grad_norm": 0.8482900857925415, + "learning_rate": 1.62081945304697e-05, + "loss": 0.0322, + "step": 17906 + }, + { + "epoch": 0.35816, + "grad_norm": 2.0221335887908936, + "learning_rate": 1.6207099864272883e-05, + "loss": 0.2544, + "step": 17908 + }, + { + "epoch": 0.3582, + "grad_norm": 0.06926898658275604, + "learning_rate": 1.6206005077065457e-05, + "loss": 0.3202, + "step": 17910 + }, + { + "epoch": 0.35824, + "grad_norm": 0.64817875623703, + "learning_rate": 1.620491016886877e-05, + "loss": 0.0302, + "step": 17912 + }, + { + "epoch": 0.35828, + "grad_norm": 5.373390197753906, + "learning_rate": 1.6203815139704163e-05, + "loss": 0.5174, + "step": 17914 + }, + { + "epoch": 0.35832, + "grad_norm": 0.37022149562835693, + "learning_rate": 1.620271998959299e-05, + "loss": 0.024, + "step": 17916 + }, + { + "epoch": 0.35836, + "grad_norm": 0.16131360828876495, + "learning_rate": 1.620162471855659e-05, + "loss": 0.3041, + "step": 17918 + }, + { + "epoch": 0.3584, + "grad_norm": 10.196391105651855, + "learning_rate": 1.620052932661633e-05, + "loss": 0.2493, + "step": 17920 + }, + { + "epoch": 0.35844, + "grad_norm": 0.10272915661334991, + "learning_rate": 1.619943381379355e-05, + "loss": 0.2532, + "step": 17922 + }, + { + "epoch": 0.35848, + "grad_norm": 0.42668458819389343, + "learning_rate": 1.6198338180109624e-05, + "loss": 0.0251, + "step": 17924 + }, + { + "epoch": 0.35852, + "grad_norm": 1.3810794353485107, + "learning_rate": 1.6197242425585903e-05, + "loss": 0.0737, + "step": 17926 + }, + { + "epoch": 0.35856, + "grad_norm": 0.9960891008377075, + "learning_rate": 1.619614655024375e-05, + "loss": 0.0254, + "step": 17928 + }, + { + "epoch": 0.3586, + "grad_norm": 0.7960811853408813, + "learning_rate": 1.619505055410453e-05, + "loss": 0.1477, + "step": 17930 + }, + { + "epoch": 0.35864, + "grad_norm": 0.3769576847553253, + "learning_rate": 1.6193954437189607e-05, + "loss": 0.0119, + "step": 17932 + }, + { + "epoch": 0.35868, + "grad_norm": 1.4029814004898071, + "learning_rate": 1.6192858199520353e-05, + "loss": 0.162, + "step": 17934 + }, + { + "epoch": 0.35872, + "grad_norm": 0.20009195804595947, + "learning_rate": 1.6191761841118146e-05, + "loss": 0.0047, + "step": 17936 + }, + { + "epoch": 0.35876, + "grad_norm": 0.33888381719589233, + "learning_rate": 1.619066536200435e-05, + "loss": 0.041, + "step": 17938 + }, + { + "epoch": 0.3588, + "grad_norm": 7.3624067306518555, + "learning_rate": 1.618956876220035e-05, + "loss": 0.3696, + "step": 17940 + }, + { + "epoch": 0.35884, + "grad_norm": 6.1871113777160645, + "learning_rate": 1.6188472041727516e-05, + "loss": 0.3631, + "step": 17942 + }, + { + "epoch": 0.35888, + "grad_norm": 0.4631151556968689, + "learning_rate": 1.6187375200607233e-05, + "loss": 0.0375, + "step": 17944 + }, + { + "epoch": 0.35892, + "grad_norm": 0.9272108674049377, + "learning_rate": 1.6186278238860883e-05, + "loss": 0.0431, + "step": 17946 + }, + { + "epoch": 0.35896, + "grad_norm": 0.4997870922088623, + "learning_rate": 1.6185181156509862e-05, + "loss": 0.0295, + "step": 17948 + }, + { + "epoch": 0.359, + "grad_norm": 1.3169363737106323, + "learning_rate": 1.6184083953575543e-05, + "loss": 0.0552, + "step": 17950 + }, + { + "epoch": 0.35904, + "grad_norm": 5.043934345245361, + "learning_rate": 1.6182986630079325e-05, + "loss": 0.4463, + "step": 17952 + }, + { + "epoch": 0.35908, + "grad_norm": 0.5077948570251465, + "learning_rate": 1.61818891860426e-05, + "loss": 0.0405, + "step": 17954 + }, + { + "epoch": 0.35912, + "grad_norm": 1.3388806581497192, + "learning_rate": 1.6180791621486764e-05, + "loss": 0.1459, + "step": 17956 + }, + { + "epoch": 0.35916, + "grad_norm": 0.2706378400325775, + "learning_rate": 1.617969393643321e-05, + "loss": 0.1956, + "step": 17958 + }, + { + "epoch": 0.3592, + "grad_norm": 0.16951288282871246, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.1097, + "step": 17960 + }, + { + "epoch": 0.35924, + "grad_norm": 2.473334550857544, + "learning_rate": 1.6177498204918568e-05, + "loss": 0.1509, + "step": 17962 + }, + { + "epoch": 0.35928, + "grad_norm": 0.41813209652900696, + "learning_rate": 1.6176400158500278e-05, + "loss": 0.0333, + "step": 17964 + }, + { + "epoch": 0.35932, + "grad_norm": 2.64650821685791, + "learning_rate": 1.617530199166989e-05, + "loss": 0.0826, + "step": 17966 + }, + { + "epoch": 0.35936, + "grad_norm": 0.9686365127563477, + "learning_rate": 1.617420370444881e-05, + "loss": 0.033, + "step": 17968 + }, + { + "epoch": 0.3594, + "grad_norm": 5.254931926727295, + "learning_rate": 1.617310529685845e-05, + "loss": 0.238, + "step": 17970 + }, + { + "epoch": 0.35944, + "grad_norm": 0.8361281156539917, + "learning_rate": 1.6172006768920226e-05, + "loss": 0.023, + "step": 17972 + }, + { + "epoch": 0.35948, + "grad_norm": 0.47896072268486023, + "learning_rate": 1.617090812065555e-05, + "loss": 0.0309, + "step": 17974 + }, + { + "epoch": 0.35952, + "grad_norm": 0.4425899088382721, + "learning_rate": 1.6169809352085847e-05, + "loss": 0.0373, + "step": 17976 + }, + { + "epoch": 0.35956, + "grad_norm": 1.0268394947052002, + "learning_rate": 1.616871046323253e-05, + "loss": 0.0987, + "step": 17978 + }, + { + "epoch": 0.3596, + "grad_norm": 5.766290664672852, + "learning_rate": 1.6167611454117027e-05, + "loss": 0.2015, + "step": 17980 + }, + { + "epoch": 0.35964, + "grad_norm": 0.9430360794067383, + "learning_rate": 1.6166512324760765e-05, + "loss": 0.0985, + "step": 17982 + }, + { + "epoch": 0.35968, + "grad_norm": 9.854353904724121, + "learning_rate": 1.616541307518517e-05, + "loss": 0.4794, + "step": 17984 + }, + { + "epoch": 0.35972, + "grad_norm": 3.3668692111968994, + "learning_rate": 1.6164313705411674e-05, + "loss": 0.0615, + "step": 17986 + }, + { + "epoch": 0.35976, + "grad_norm": 0.8055031895637512, + "learning_rate": 1.6163214215461705e-05, + "loss": 0.0271, + "step": 17988 + }, + { + "epoch": 0.3598, + "grad_norm": 1.3131518363952637, + "learning_rate": 1.6162114605356704e-05, + "loss": 0.0352, + "step": 17990 + }, + { + "epoch": 0.35984, + "grad_norm": 2.6118412017822266, + "learning_rate": 1.6161014875118112e-05, + "loss": 0.1073, + "step": 17992 + }, + { + "epoch": 0.35988, + "grad_norm": 0.10305214673280716, + "learning_rate": 1.6159915024767357e-05, + "loss": 0.2526, + "step": 17994 + }, + { + "epoch": 0.35992, + "grad_norm": 1.273938775062561, + "learning_rate": 1.6158815054325887e-05, + "loss": 0.0314, + "step": 17996 + }, + { + "epoch": 0.35996, + "grad_norm": 0.2889821231365204, + "learning_rate": 1.6157714963815146e-05, + "loss": 0.0359, + "step": 17998 + }, + { + "epoch": 0.36, + "grad_norm": 0.7568132877349854, + "learning_rate": 1.6156614753256583e-05, + "loss": 0.1224, + "step": 18000 + }, + { + "epoch": 0.36004, + "grad_norm": 0.23565931618213654, + "learning_rate": 1.6155514422671646e-05, + "loss": 0.3213, + "step": 18002 + }, + { + "epoch": 0.36008, + "grad_norm": 0.6689991354942322, + "learning_rate": 1.6154413972081788e-05, + "loss": 0.5232, + "step": 18004 + }, + { + "epoch": 0.36012, + "grad_norm": 0.9707899689674377, + "learning_rate": 1.615331340150846e-05, + "loss": 0.0407, + "step": 18006 + }, + { + "epoch": 0.36016, + "grad_norm": 3.4436590671539307, + "learning_rate": 1.6152212710973117e-05, + "loss": 0.18, + "step": 18008 + }, + { + "epoch": 0.3602, + "grad_norm": 1.270196557044983, + "learning_rate": 1.6151111900497225e-05, + "loss": 0.0646, + "step": 18010 + }, + { + "epoch": 0.36024, + "grad_norm": 0.4278820753097534, + "learning_rate": 1.6150010970102234e-05, + "loss": 0.1482, + "step": 18012 + }, + { + "epoch": 0.36028, + "grad_norm": 0.7218039631843567, + "learning_rate": 1.6148909919809613e-05, + "loss": 0.0323, + "step": 18014 + }, + { + "epoch": 0.36032, + "grad_norm": 1.3532779216766357, + "learning_rate": 1.614780874964083e-05, + "loss": 0.158, + "step": 18016 + }, + { + "epoch": 0.36036, + "grad_norm": 0.7151562571525574, + "learning_rate": 1.614670745961735e-05, + "loss": 0.0567, + "step": 18018 + }, + { + "epoch": 0.3604, + "grad_norm": 2.749716281890869, + "learning_rate": 1.6145606049760644e-05, + "loss": 0.0985, + "step": 18020 + }, + { + "epoch": 0.36044, + "grad_norm": 0.88433438539505, + "learning_rate": 1.6144504520092187e-05, + "loss": 0.0437, + "step": 18022 + }, + { + "epoch": 0.36048, + "grad_norm": 0.19187894463539124, + "learning_rate": 1.6143402870633448e-05, + "loss": 0.2231, + "step": 18024 + }, + { + "epoch": 0.36052, + "grad_norm": 0.5258651971817017, + "learning_rate": 1.6142301101405906e-05, + "loss": 0.0472, + "step": 18026 + }, + { + "epoch": 0.36056, + "grad_norm": 1.8251184225082397, + "learning_rate": 1.6141199212431043e-05, + "loss": 0.071, + "step": 18028 + }, + { + "epoch": 0.3606, + "grad_norm": 1.2933608293533325, + "learning_rate": 1.614009720373034e-05, + "loss": 0.0541, + "step": 18030 + }, + { + "epoch": 0.36064, + "grad_norm": 5.054795742034912, + "learning_rate": 1.6138995075325277e-05, + "loss": 0.2616, + "step": 18032 + }, + { + "epoch": 0.36068, + "grad_norm": 0.8554680943489075, + "learning_rate": 1.613789282723735e-05, + "loss": 0.3945, + "step": 18034 + }, + { + "epoch": 0.36072, + "grad_norm": 0.9882926940917969, + "learning_rate": 1.613679045948804e-05, + "loss": 0.0453, + "step": 18036 + }, + { + "epoch": 0.36076, + "grad_norm": 0.7566093802452087, + "learning_rate": 1.613568797209884e-05, + "loss": 0.1573, + "step": 18038 + }, + { + "epoch": 0.3608, + "grad_norm": 0.6628336906433105, + "learning_rate": 1.6134585365091243e-05, + "loss": 0.0943, + "step": 18040 + }, + { + "epoch": 0.36084, + "grad_norm": 1.968747615814209, + "learning_rate": 1.613348263848675e-05, + "loss": 0.1019, + "step": 18042 + }, + { + "epoch": 0.36088, + "grad_norm": 0.3357315957546234, + "learning_rate": 1.613237979230685e-05, + "loss": 0.0334, + "step": 18044 + }, + { + "epoch": 0.36092, + "grad_norm": 4.591020584106445, + "learning_rate": 1.6131276826573053e-05, + "loss": 0.1667, + "step": 18046 + }, + { + "epoch": 0.36096, + "grad_norm": 0.07952108234167099, + "learning_rate": 1.6130173741306857e-05, + "loss": 0.4901, + "step": 18048 + }, + { + "epoch": 0.361, + "grad_norm": 0.0457305870950222, + "learning_rate": 1.6129070536529767e-05, + "loss": 0.0279, + "step": 18050 + }, + { + "epoch": 0.36104, + "grad_norm": 2.6198318004608154, + "learning_rate": 1.612796721226329e-05, + "loss": 0.098, + "step": 18052 + }, + { + "epoch": 0.36108, + "grad_norm": 1.8289825916290283, + "learning_rate": 1.612686376852894e-05, + "loss": 0.1062, + "step": 18054 + }, + { + "epoch": 0.36112, + "grad_norm": 2.5488851070404053, + "learning_rate": 1.6125760205348225e-05, + "loss": 0.0969, + "step": 18056 + }, + { + "epoch": 0.36116, + "grad_norm": 0.3523503839969635, + "learning_rate": 1.612465652274266e-05, + "loss": 0.0175, + "step": 18058 + }, + { + "epoch": 0.3612, + "grad_norm": 1.312870979309082, + "learning_rate": 1.6123552720733767e-05, + "loss": 0.0689, + "step": 18060 + }, + { + "epoch": 0.36124, + "grad_norm": 2.535517930984497, + "learning_rate": 1.612244879934306e-05, + "loss": 0.092, + "step": 18062 + }, + { + "epoch": 0.36128, + "grad_norm": 2.308239459991455, + "learning_rate": 1.6121344758592057e-05, + "loss": 0.1025, + "step": 18064 + }, + { + "epoch": 0.36132, + "grad_norm": 0.760168194770813, + "learning_rate": 1.612024059850229e-05, + "loss": 0.0929, + "step": 18066 + }, + { + "epoch": 0.36136, + "grad_norm": 0.5463431477546692, + "learning_rate": 1.611913631909528e-05, + "loss": 0.0473, + "step": 18068 + }, + { + "epoch": 0.3614, + "grad_norm": 0.06888382881879807, + "learning_rate": 1.611803192039256e-05, + "loss": 0.0666, + "step": 18070 + }, + { + "epoch": 0.36144, + "grad_norm": 0.9196195602416992, + "learning_rate": 1.6116927402415655e-05, + "loss": 0.0253, + "step": 18072 + }, + { + "epoch": 0.36148, + "grad_norm": 4.500860214233398, + "learning_rate": 1.6115822765186104e-05, + "loss": 0.2764, + "step": 18074 + }, + { + "epoch": 0.36152, + "grad_norm": 1.0791810750961304, + "learning_rate": 1.6114718008725438e-05, + "loss": 0.137, + "step": 18076 + }, + { + "epoch": 0.36156, + "grad_norm": 3.212216377258301, + "learning_rate": 1.6113613133055195e-05, + "loss": 0.1513, + "step": 18078 + }, + { + "epoch": 0.3616, + "grad_norm": 0.36872079968452454, + "learning_rate": 1.611250813819692e-05, + "loss": 0.0261, + "step": 18080 + }, + { + "epoch": 0.36164, + "grad_norm": 0.1772865653038025, + "learning_rate": 1.6111403024172146e-05, + "loss": 0.0162, + "step": 18082 + }, + { + "epoch": 0.36168, + "grad_norm": 0.07402986288070679, + "learning_rate": 1.611029779100243e-05, + "loss": 0.012, + "step": 18084 + }, + { + "epoch": 0.36172, + "grad_norm": 0.4063480794429779, + "learning_rate": 1.610919243870931e-05, + "loss": 0.0352, + "step": 18086 + }, + { + "epoch": 0.36176, + "grad_norm": 0.7997627854347229, + "learning_rate": 1.6108086967314337e-05, + "loss": 0.0262, + "step": 18088 + }, + { + "epoch": 0.3618, + "grad_norm": 1.0033260583877563, + "learning_rate": 1.6106981376839064e-05, + "loss": 0.0371, + "step": 18090 + }, + { + "epoch": 0.36184, + "grad_norm": 1.5580030679702759, + "learning_rate": 1.6105875667305047e-05, + "loss": 0.0368, + "step": 18092 + }, + { + "epoch": 0.36188, + "grad_norm": 0.5047820806503296, + "learning_rate": 1.610476983873384e-05, + "loss": 0.0092, + "step": 18094 + }, + { + "epoch": 0.36192, + "grad_norm": 7.43864631652832, + "learning_rate": 1.6103663891147e-05, + "loss": 0.3533, + "step": 18096 + }, + { + "epoch": 0.36196, + "grad_norm": 2.2422401905059814, + "learning_rate": 1.6102557824566093e-05, + "loss": 0.1877, + "step": 18098 + }, + { + "epoch": 0.362, + "grad_norm": 0.28550297021865845, + "learning_rate": 1.610145163901268e-05, + "loss": 0.0925, + "step": 18100 + }, + { + "epoch": 0.36204, + "grad_norm": 0.6148421168327332, + "learning_rate": 1.6100345334508322e-05, + "loss": 0.0214, + "step": 18102 + }, + { + "epoch": 0.36208, + "grad_norm": 0.9186118841171265, + "learning_rate": 1.609923891107459e-05, + "loss": 0.0942, + "step": 18104 + }, + { + "epoch": 0.36212, + "grad_norm": 0.23994334042072296, + "learning_rate": 1.6098132368733064e-05, + "loss": 0.0161, + "step": 18106 + }, + { + "epoch": 0.36216, + "grad_norm": 0.1106773242354393, + "learning_rate": 1.60970257075053e-05, + "loss": 0.1779, + "step": 18108 + }, + { + "epoch": 0.3622, + "grad_norm": 5.128823280334473, + "learning_rate": 1.6095918927412883e-05, + "loss": 0.0817, + "step": 18110 + }, + { + "epoch": 0.36224, + "grad_norm": 8.7070951461792, + "learning_rate": 1.609481202847739e-05, + "loss": 0.3568, + "step": 18112 + }, + { + "epoch": 0.36228, + "grad_norm": 8.58885383605957, + "learning_rate": 1.6093705010720393e-05, + "loss": 0.4495, + "step": 18114 + }, + { + "epoch": 0.36232, + "grad_norm": 8.947004318237305, + "learning_rate": 1.6092597874163485e-05, + "loss": 0.6121, + "step": 18116 + }, + { + "epoch": 0.36236, + "grad_norm": 3.720614194869995, + "learning_rate": 1.6091490618828244e-05, + "loss": 0.1495, + "step": 18118 + }, + { + "epoch": 0.3624, + "grad_norm": 0.4863084554672241, + "learning_rate": 1.6090383244736256e-05, + "loss": 0.0132, + "step": 18120 + }, + { + "epoch": 0.36244, + "grad_norm": 1.4336379766464233, + "learning_rate": 1.608927575190911e-05, + "loss": 0.0808, + "step": 18122 + }, + { + "epoch": 0.36248, + "grad_norm": 0.09281836450099945, + "learning_rate": 1.6088168140368405e-05, + "loss": 0.0075, + "step": 18124 + }, + { + "epoch": 0.36252, + "grad_norm": 0.01995585486292839, + "learning_rate": 1.608706041013572e-05, + "loss": 0.0069, + "step": 18126 + }, + { + "epoch": 0.36256, + "grad_norm": 0.32430997490882874, + "learning_rate": 1.608595256123266e-05, + "loss": 0.3082, + "step": 18128 + }, + { + "epoch": 0.3626, + "grad_norm": 0.9762694239616394, + "learning_rate": 1.608484459368082e-05, + "loss": 0.0237, + "step": 18130 + }, + { + "epoch": 0.36264, + "grad_norm": 1.0514203310012817, + "learning_rate": 1.6083736507501808e-05, + "loss": 0.0332, + "step": 18132 + }, + { + "epoch": 0.36268, + "grad_norm": 0.09751838445663452, + "learning_rate": 1.608262830271722e-05, + "loss": 0.0095, + "step": 18134 + }, + { + "epoch": 0.36272, + "grad_norm": 0.4347691237926483, + "learning_rate": 1.6081519979348656e-05, + "loss": 0.0361, + "step": 18136 + }, + { + "epoch": 0.36276, + "grad_norm": 1.9937986135482788, + "learning_rate": 1.6080411537417732e-05, + "loss": 0.0413, + "step": 18138 + }, + { + "epoch": 0.3628, + "grad_norm": 1.4787825345993042, + "learning_rate": 1.6079302976946055e-05, + "loss": 0.1478, + "step": 18140 + }, + { + "epoch": 0.36284, + "grad_norm": 3.5549509525299072, + "learning_rate": 1.607819429795524e-05, + "loss": 0.1226, + "step": 18142 + }, + { + "epoch": 0.36288, + "grad_norm": 1.3395289182662964, + "learning_rate": 1.607708550046689e-05, + "loss": 0.0571, + "step": 18144 + }, + { + "epoch": 0.36292, + "grad_norm": 1.3771246671676636, + "learning_rate": 1.6075976584502635e-05, + "loss": 0.059, + "step": 18146 + }, + { + "epoch": 0.36296, + "grad_norm": 1.549639344215393, + "learning_rate": 1.6074867550084088e-05, + "loss": 0.0373, + "step": 18148 + }, + { + "epoch": 0.363, + "grad_norm": 1.823847770690918, + "learning_rate": 1.607375839723287e-05, + "loss": 0.0719, + "step": 18150 + }, + { + "epoch": 0.36304, + "grad_norm": 0.04834393411874771, + "learning_rate": 1.6072649125970604e-05, + "loss": 0.0312, + "step": 18152 + }, + { + "epoch": 0.36308, + "grad_norm": 0.1711275279521942, + "learning_rate": 1.6071539736318914e-05, + "loss": 0.0155, + "step": 18154 + }, + { + "epoch": 0.36312, + "grad_norm": 0.20131616294384003, + "learning_rate": 1.6070430228299438e-05, + "loss": 0.0149, + "step": 18156 + }, + { + "epoch": 0.36316, + "grad_norm": 7.745622634887695, + "learning_rate": 1.6069320601933796e-05, + "loss": 0.6114, + "step": 18158 + }, + { + "epoch": 0.3632, + "grad_norm": 1.0452277660369873, + "learning_rate": 1.6068210857243625e-05, + "loss": 0.1398, + "step": 18160 + }, + { + "epoch": 0.36324, + "grad_norm": 0.48404204845428467, + "learning_rate": 1.6067100994250554e-05, + "loss": 0.0135, + "step": 18162 + }, + { + "epoch": 0.36328, + "grad_norm": 1.8988122940063477, + "learning_rate": 1.6065991012976233e-05, + "loss": 0.0664, + "step": 18164 + }, + { + "epoch": 0.36332, + "grad_norm": 1.033026933670044, + "learning_rate": 1.606488091344229e-05, + "loss": 0.0456, + "step": 18166 + }, + { + "epoch": 0.36336, + "grad_norm": 2.4575700759887695, + "learning_rate": 1.6063770695670374e-05, + "loss": 0.1458, + "step": 18168 + }, + { + "epoch": 0.3634, + "grad_norm": 2.604053258895874, + "learning_rate": 1.6062660359682124e-05, + "loss": 0.1364, + "step": 18170 + }, + { + "epoch": 0.36344, + "grad_norm": 3.2752552032470703, + "learning_rate": 1.6061549905499192e-05, + "loss": 0.0795, + "step": 18172 + }, + { + "epoch": 0.36348, + "grad_norm": 0.824990451335907, + "learning_rate": 1.606043933314322e-05, + "loss": 0.1326, + "step": 18174 + }, + { + "epoch": 0.36352, + "grad_norm": 1.8464702367782593, + "learning_rate": 1.6059328642635864e-05, + "loss": 0.1055, + "step": 18176 + }, + { + "epoch": 0.36356, + "grad_norm": 3.7726690769195557, + "learning_rate": 1.605821783399878e-05, + "loss": 0.3309, + "step": 18178 + }, + { + "epoch": 0.3636, + "grad_norm": 5.0610857009887695, + "learning_rate": 1.6057106907253617e-05, + "loss": 0.2378, + "step": 18180 + }, + { + "epoch": 0.36364, + "grad_norm": 0.5467048287391663, + "learning_rate": 1.6055995862422035e-05, + "loss": 0.0982, + "step": 18182 + }, + { + "epoch": 0.36368, + "grad_norm": 0.12442391365766525, + "learning_rate": 1.60548846995257e-05, + "loss": 0.5511, + "step": 18184 + }, + { + "epoch": 0.36372, + "grad_norm": 2.8212268352508545, + "learning_rate": 1.6053773418586266e-05, + "loss": 0.0798, + "step": 18186 + }, + { + "epoch": 0.36376, + "grad_norm": 4.725414276123047, + "learning_rate": 1.6052662019625407e-05, + "loss": 0.1572, + "step": 18188 + }, + { + "epoch": 0.3638, + "grad_norm": 1.0813826322555542, + "learning_rate": 1.605155050266478e-05, + "loss": 0.0333, + "step": 18190 + }, + { + "epoch": 0.36384, + "grad_norm": 0.15839067101478577, + "learning_rate": 1.605043886772607e-05, + "loss": 0.0054, + "step": 18192 + }, + { + "epoch": 0.36388, + "grad_norm": 0.7414913177490234, + "learning_rate": 1.604932711483093e-05, + "loss": 0.0514, + "step": 18194 + }, + { + "epoch": 0.36392, + "grad_norm": 0.9567102789878845, + "learning_rate": 1.604821524400105e-05, + "loss": 0.0862, + "step": 18196 + }, + { + "epoch": 0.36396, + "grad_norm": 2.4906105995178223, + "learning_rate": 1.6047103255258094e-05, + "loss": 0.0536, + "step": 18198 + }, + { + "epoch": 0.364, + "grad_norm": 1.9523097276687622, + "learning_rate": 1.6045991148623752e-05, + "loss": 0.0983, + "step": 18200 + }, + { + "epoch": 0.36404, + "grad_norm": 0.0781865119934082, + "learning_rate": 1.6044878924119697e-05, + "loss": 0.1521, + "step": 18202 + }, + { + "epoch": 0.36408, + "grad_norm": 0.3728654384613037, + "learning_rate": 1.6043766581767616e-05, + "loss": 0.1115, + "step": 18204 + }, + { + "epoch": 0.36412, + "grad_norm": 3.6443369388580322, + "learning_rate": 1.6042654121589193e-05, + "loss": 0.1298, + "step": 18206 + }, + { + "epoch": 0.36416, + "grad_norm": 2.4219446182250977, + "learning_rate": 1.604154154360612e-05, + "loss": 0.0968, + "step": 18208 + }, + { + "epoch": 0.3642, + "grad_norm": 5.615652561187744, + "learning_rate": 1.6040428847840078e-05, + "loss": 0.1922, + "step": 18210 + }, + { + "epoch": 0.36424, + "grad_norm": 2.232715129852295, + "learning_rate": 1.6039316034312767e-05, + "loss": 0.0602, + "step": 18212 + }, + { + "epoch": 0.36428, + "grad_norm": 0.13588374853134155, + "learning_rate": 1.6038203103045885e-05, + "loss": 0.0555, + "step": 18214 + }, + { + "epoch": 0.36432, + "grad_norm": 0.1763405054807663, + "learning_rate": 1.6037090054061128e-05, + "loss": 0.0109, + "step": 18216 + }, + { + "epoch": 0.36436, + "grad_norm": 1.805359125137329, + "learning_rate": 1.6035976887380184e-05, + "loss": 0.0605, + "step": 18218 + }, + { + "epoch": 0.3644, + "grad_norm": 0.7587236166000366, + "learning_rate": 1.6034863603024768e-05, + "loss": 0.0869, + "step": 18220 + }, + { + "epoch": 0.36444, + "grad_norm": 6.0844550132751465, + "learning_rate": 1.6033750201016577e-05, + "loss": 0.5125, + "step": 18222 + }, + { + "epoch": 0.36448, + "grad_norm": 4.543872356414795, + "learning_rate": 1.603263668137732e-05, + "loss": 0.1089, + "step": 18224 + }, + { + "epoch": 0.36452, + "grad_norm": 0.3927098512649536, + "learning_rate": 1.6031523044128706e-05, + "loss": 0.1121, + "step": 18226 + }, + { + "epoch": 0.36456, + "grad_norm": 2.1104695796966553, + "learning_rate": 1.6030409289292444e-05, + "loss": 0.055, + "step": 18228 + }, + { + "epoch": 0.3646, + "grad_norm": 0.20250003039836884, + "learning_rate": 1.602929541689025e-05, + "loss": 0.0208, + "step": 18230 + }, + { + "epoch": 0.36464, + "grad_norm": 2.175424098968506, + "learning_rate": 1.6028181426943835e-05, + "loss": 0.2097, + "step": 18232 + }, + { + "epoch": 0.36468, + "grad_norm": 0.0941927433013916, + "learning_rate": 1.602706731947492e-05, + "loss": 0.0492, + "step": 18234 + }, + { + "epoch": 0.36472, + "grad_norm": 2.8148930072784424, + "learning_rate": 1.6025953094505228e-05, + "loss": 0.696, + "step": 18236 + }, + { + "epoch": 0.36476, + "grad_norm": 0.1445407271385193, + "learning_rate": 1.6024838752056474e-05, + "loss": 0.0995, + "step": 18238 + }, + { + "epoch": 0.3648, + "grad_norm": 0.6218573451042175, + "learning_rate": 1.6023724292150387e-05, + "loss": 0.1511, + "step": 18240 + }, + { + "epoch": 0.36484, + "grad_norm": 0.19464747607707977, + "learning_rate": 1.6022609714808695e-05, + "loss": 0.0463, + "step": 18242 + }, + { + "epoch": 0.36488, + "grad_norm": 2.461178779602051, + "learning_rate": 1.6021495020053128e-05, + "loss": 0.2342, + "step": 18244 + }, + { + "epoch": 0.36492, + "grad_norm": 0.032831545919179916, + "learning_rate": 1.602038020790541e-05, + "loss": 0.0046, + "step": 18246 + }, + { + "epoch": 0.36496, + "grad_norm": 2.23691725730896, + "learning_rate": 1.6019265278387287e-05, + "loss": 0.2097, + "step": 18248 + }, + { + "epoch": 0.365, + "grad_norm": 0.8390579223632812, + "learning_rate": 1.6018150231520486e-05, + "loss": 0.0256, + "step": 18250 + }, + { + "epoch": 0.36504, + "grad_norm": 4.170796871185303, + "learning_rate": 1.6017035067326744e-05, + "loss": 0.094, + "step": 18252 + }, + { + "epoch": 0.36508, + "grad_norm": 1.4493157863616943, + "learning_rate": 1.6015919785827812e-05, + "loss": 0.0719, + "step": 18254 + }, + { + "epoch": 0.36512, + "grad_norm": 0.6054086685180664, + "learning_rate": 1.6014804387045422e-05, + "loss": 0.0268, + "step": 18256 + }, + { + "epoch": 0.36516, + "grad_norm": 0.989529013633728, + "learning_rate": 1.6013688871001326e-05, + "loss": 0.0251, + "step": 18258 + }, + { + "epoch": 0.3652, + "grad_norm": 0.5186317563056946, + "learning_rate": 1.601257323771727e-05, + "loss": 0.3307, + "step": 18260 + }, + { + "epoch": 0.36524, + "grad_norm": 4.523470878601074, + "learning_rate": 1.6011457487214998e-05, + "loss": 0.2424, + "step": 18262 + }, + { + "epoch": 0.36528, + "grad_norm": 0.9658664464950562, + "learning_rate": 1.6010341619516273e-05, + "loss": 0.0458, + "step": 18264 + }, + { + "epoch": 0.36532, + "grad_norm": 2.59512996673584, + "learning_rate": 1.6009225634642838e-05, + "loss": 0.7558, + "step": 18266 + }, + { + "epoch": 0.36536, + "grad_norm": 0.40939632058143616, + "learning_rate": 1.6008109532616463e-05, + "loss": 0.102, + "step": 18268 + }, + { + "epoch": 0.3654, + "grad_norm": 7.188291549682617, + "learning_rate": 1.6006993313458896e-05, + "loss": 0.4052, + "step": 18270 + }, + { + "epoch": 0.36544, + "grad_norm": 0.8016760349273682, + "learning_rate": 1.6005876977191902e-05, + "loss": 0.191, + "step": 18272 + }, + { + "epoch": 0.36548, + "grad_norm": 0.17911462485790253, + "learning_rate": 1.600476052383724e-05, + "loss": 0.1205, + "step": 18274 + }, + { + "epoch": 0.36552, + "grad_norm": 3.1310579776763916, + "learning_rate": 1.600364395341669e-05, + "loss": 0.1391, + "step": 18276 + }, + { + "epoch": 0.36556, + "grad_norm": 2.3891685009002686, + "learning_rate": 1.6002527265952003e-05, + "loss": 0.0934, + "step": 18278 + }, + { + "epoch": 0.3656, + "grad_norm": 1.3662078380584717, + "learning_rate": 1.6001410461464955e-05, + "loss": 0.1008, + "step": 18280 + }, + { + "epoch": 0.36564, + "grad_norm": 0.30074459314346313, + "learning_rate": 1.6000293539977325e-05, + "loss": 0.2581, + "step": 18282 + }, + { + "epoch": 0.36568, + "grad_norm": 5.669365882873535, + "learning_rate": 1.5999176501510883e-05, + "loss": 0.2096, + "step": 18284 + }, + { + "epoch": 0.36572, + "grad_norm": 0.11168132722377777, + "learning_rate": 1.5998059346087406e-05, + "loss": 0.0197, + "step": 18286 + }, + { + "epoch": 0.36576, + "grad_norm": 1.3366787433624268, + "learning_rate": 1.5996942073728673e-05, + "loss": 0.0326, + "step": 18288 + }, + { + "epoch": 0.3658, + "grad_norm": 0.08350656181573868, + "learning_rate": 1.5995824684456465e-05, + "loss": 0.0085, + "step": 18290 + }, + { + "epoch": 0.36584, + "grad_norm": 4.081442356109619, + "learning_rate": 1.599470717829257e-05, + "loss": 0.1481, + "step": 18292 + }, + { + "epoch": 0.36588, + "grad_norm": 0.22599883377552032, + "learning_rate": 1.5993589555258773e-05, + "loss": 0.0092, + "step": 18294 + }, + { + "epoch": 0.36592, + "grad_norm": 0.8422930836677551, + "learning_rate": 1.599247181537686e-05, + "loss": 0.03, + "step": 18296 + }, + { + "epoch": 0.36596, + "grad_norm": 2.305933713912964, + "learning_rate": 1.5991353958668626e-05, + "loss": 0.0675, + "step": 18298 + }, + { + "epoch": 0.366, + "grad_norm": 0.25635069608688354, + "learning_rate": 1.599023598515586e-05, + "loss": 0.015, + "step": 18300 + }, + { + "epoch": 0.36604, + "grad_norm": 0.8682334423065186, + "learning_rate": 1.598911789486036e-05, + "loss": 0.1231, + "step": 18302 + }, + { + "epoch": 0.36608, + "grad_norm": 0.8702625036239624, + "learning_rate": 1.5987999687803927e-05, + "loss": 0.043, + "step": 18304 + }, + { + "epoch": 0.36612, + "grad_norm": 0.7851545214653015, + "learning_rate": 1.5986881364008353e-05, + "loss": 0.0359, + "step": 18306 + }, + { + "epoch": 0.36616, + "grad_norm": 0.42714235186576843, + "learning_rate": 1.5985762923495443e-05, + "loss": 0.5787, + "step": 18308 + }, + { + "epoch": 0.3662, + "grad_norm": 5.672743797302246, + "learning_rate": 1.5984644366287007e-05, + "loss": 0.3089, + "step": 18310 + }, + { + "epoch": 0.36624, + "grad_norm": 6.188354015350342, + "learning_rate": 1.5983525692404845e-05, + "loss": 0.2523, + "step": 18312 + }, + { + "epoch": 0.36628, + "grad_norm": 6.640321731567383, + "learning_rate": 1.598240690187077e-05, + "loss": 0.3591, + "step": 18314 + }, + { + "epoch": 0.36632, + "grad_norm": 0.47889289259910583, + "learning_rate": 1.5981287994706592e-05, + "loss": 0.0978, + "step": 18316 + }, + { + "epoch": 0.36636, + "grad_norm": 3.370731830596924, + "learning_rate": 1.598016897093413e-05, + "loss": 0.0949, + "step": 18318 + }, + { + "epoch": 0.3664, + "grad_norm": 0.28622961044311523, + "learning_rate": 1.597904983057519e-05, + "loss": 0.0655, + "step": 18320 + }, + { + "epoch": 0.36644, + "grad_norm": 1.6147313117980957, + "learning_rate": 1.5977930573651597e-05, + "loss": 0.0738, + "step": 18322 + }, + { + "epoch": 0.36648, + "grad_norm": 0.9693979620933533, + "learning_rate": 1.597681120018517e-05, + "loss": 0.0276, + "step": 18324 + }, + { + "epoch": 0.36652, + "grad_norm": 1.882070541381836, + "learning_rate": 1.5975691710197728e-05, + "loss": 0.1317, + "step": 18326 + }, + { + "epoch": 0.36656, + "grad_norm": 0.9368869662284851, + "learning_rate": 1.5974572103711105e-05, + "loss": 0.0238, + "step": 18328 + }, + { + "epoch": 0.3666, + "grad_norm": 0.10920092463493347, + "learning_rate": 1.5973452380747125e-05, + "loss": 0.0099, + "step": 18330 + }, + { + "epoch": 0.36664, + "grad_norm": 1.2385905981063843, + "learning_rate": 1.597233254132761e-05, + "loss": 0.0434, + "step": 18332 + }, + { + "epoch": 0.36668, + "grad_norm": 1.799249529838562, + "learning_rate": 1.5971212585474398e-05, + "loss": 0.2024, + "step": 18334 + }, + { + "epoch": 0.36672, + "grad_norm": 6.160976886749268, + "learning_rate": 1.597009251320932e-05, + "loss": 0.4296, + "step": 18336 + }, + { + "epoch": 0.36676, + "grad_norm": 0.3359818756580353, + "learning_rate": 1.596897232455422e-05, + "loss": 0.0097, + "step": 18338 + }, + { + "epoch": 0.3668, + "grad_norm": 0.35995912551879883, + "learning_rate": 1.596785201953093e-05, + "loss": 0.0093, + "step": 18340 + }, + { + "epoch": 0.36684, + "grad_norm": 0.08901505917310715, + "learning_rate": 1.596673159816129e-05, + "loss": 0.1403, + "step": 18342 + }, + { + "epoch": 0.36688, + "grad_norm": 2.122617721557617, + "learning_rate": 1.5965611060467146e-05, + "loss": 0.0947, + "step": 18344 + }, + { + "epoch": 0.36692, + "grad_norm": 0.20618616044521332, + "learning_rate": 1.5964490406470344e-05, + "loss": 0.0109, + "step": 18346 + }, + { + "epoch": 0.36696, + "grad_norm": 0.16300992667675018, + "learning_rate": 1.5963369636192734e-05, + "loss": 0.0309, + "step": 18348 + }, + { + "epoch": 0.367, + "grad_norm": 0.6649755835533142, + "learning_rate": 1.5962248749656158e-05, + "loss": 0.0179, + "step": 18350 + }, + { + "epoch": 0.36704, + "grad_norm": 0.21324098110198975, + "learning_rate": 1.5961127746882478e-05, + "loss": 0.0918, + "step": 18352 + }, + { + "epoch": 0.36708, + "grad_norm": 1.1480660438537598, + "learning_rate": 1.596000662789354e-05, + "loss": 0.2254, + "step": 18354 + }, + { + "epoch": 0.36712, + "grad_norm": 0.025066586211323738, + "learning_rate": 1.5958885392711203e-05, + "loss": 0.1383, + "step": 18356 + }, + { + "epoch": 0.36716, + "grad_norm": 2.753736972808838, + "learning_rate": 1.5957764041357332e-05, + "loss": 0.13, + "step": 18358 + }, + { + "epoch": 0.3672, + "grad_norm": 0.08352970331907272, + "learning_rate": 1.5956642573853784e-05, + "loss": 0.0354, + "step": 18360 + }, + { + "epoch": 0.36724, + "grad_norm": 0.6293634176254272, + "learning_rate": 1.5955520990222418e-05, + "loss": 0.0557, + "step": 18362 + }, + { + "epoch": 0.36728, + "grad_norm": 1.4472367763519287, + "learning_rate": 1.5954399290485106e-05, + "loss": 0.0406, + "step": 18364 + }, + { + "epoch": 0.36732, + "grad_norm": 8.468729972839355, + "learning_rate": 1.595327747466371e-05, + "loss": 0.5607, + "step": 18366 + }, + { + "epoch": 0.36736, + "grad_norm": 0.05073682963848114, + "learning_rate": 1.595215554278011e-05, + "loss": 0.0067, + "step": 18368 + }, + { + "epoch": 0.3674, + "grad_norm": 6.543701171875, + "learning_rate": 1.5951033494856174e-05, + "loss": 0.4709, + "step": 18370 + }, + { + "epoch": 0.36744, + "grad_norm": 0.159030944108963, + "learning_rate": 1.5949911330913775e-05, + "loss": 0.0555, + "step": 18372 + }, + { + "epoch": 0.36748, + "grad_norm": 1.6952077150344849, + "learning_rate": 1.594878905097479e-05, + "loss": 0.059, + "step": 18374 + }, + { + "epoch": 0.36752, + "grad_norm": 2.6202821731567383, + "learning_rate": 1.59476666550611e-05, + "loss": 0.0821, + "step": 18376 + }, + { + "epoch": 0.36756, + "grad_norm": 0.09784882515668869, + "learning_rate": 1.5946544143194587e-05, + "loss": 0.14, + "step": 18378 + }, + { + "epoch": 0.3676, + "grad_norm": 2.05426287651062, + "learning_rate": 1.5945421515397135e-05, + "loss": 0.076, + "step": 18380 + }, + { + "epoch": 0.36764, + "grad_norm": 1.2007070779800415, + "learning_rate": 1.5944298771690627e-05, + "loss": 0.0452, + "step": 18382 + }, + { + "epoch": 0.36768, + "grad_norm": 1.351505994796753, + "learning_rate": 1.5943175912096958e-05, + "loss": 0.0676, + "step": 18384 + }, + { + "epoch": 0.36772, + "grad_norm": 1.5011881589889526, + "learning_rate": 1.594205293663801e-05, + "loss": 0.047, + "step": 18386 + }, + { + "epoch": 0.36776, + "grad_norm": 2.617875576019287, + "learning_rate": 1.5940929845335684e-05, + "loss": 0.1383, + "step": 18388 + }, + { + "epoch": 0.3678, + "grad_norm": 4.131845951080322, + "learning_rate": 1.593980663821187e-05, + "loss": 0.0924, + "step": 18390 + }, + { + "epoch": 0.36784, + "grad_norm": 0.7329148054122925, + "learning_rate": 1.5938683315288472e-05, + "loss": 0.0262, + "step": 18392 + }, + { + "epoch": 0.36788, + "grad_norm": 0.2315065860748291, + "learning_rate": 1.5937559876587383e-05, + "loss": 0.0142, + "step": 18394 + }, + { + "epoch": 0.36792, + "grad_norm": 0.7093839049339294, + "learning_rate": 1.5936436322130506e-05, + "loss": 0.1478, + "step": 18396 + }, + { + "epoch": 0.36796, + "grad_norm": 4.021420001983643, + "learning_rate": 1.593531265193975e-05, + "loss": 0.4338, + "step": 18398 + }, + { + "epoch": 0.368, + "grad_norm": 0.020315198227763176, + "learning_rate": 1.5934188866037017e-05, + "loss": 0.0343, + "step": 18400 + }, + { + "epoch": 0.36804, + "grad_norm": 15.591917991638184, + "learning_rate": 1.5933064964444212e-05, + "loss": 0.575, + "step": 18402 + }, + { + "epoch": 0.36808, + "grad_norm": 0.21007587015628815, + "learning_rate": 1.593194094718326e-05, + "loss": 0.1008, + "step": 18404 + }, + { + "epoch": 0.36812, + "grad_norm": 1.5185688734054565, + "learning_rate": 1.593081681427606e-05, + "loss": 0.0488, + "step": 18406 + }, + { + "epoch": 0.36816, + "grad_norm": 0.009912027046084404, + "learning_rate": 1.592969256574453e-05, + "loss": 0.0719, + "step": 18408 + }, + { + "epoch": 0.3682, + "grad_norm": 2.037956714630127, + "learning_rate": 1.5928568201610593e-05, + "loss": 0.1435, + "step": 18410 + }, + { + "epoch": 0.36824, + "grad_norm": 0.43225085735321045, + "learning_rate": 1.592744372189617e-05, + "loss": 0.0588, + "step": 18412 + }, + { + "epoch": 0.36828, + "grad_norm": 1.7257016897201538, + "learning_rate": 1.5926319126623174e-05, + "loss": 0.1969, + "step": 18414 + }, + { + "epoch": 0.36832, + "grad_norm": 1.3954687118530273, + "learning_rate": 1.592519441581354e-05, + "loss": 0.0577, + "step": 18416 + }, + { + "epoch": 0.36836, + "grad_norm": 2.238105058670044, + "learning_rate": 1.592406958948919e-05, + "loss": 0.1088, + "step": 18418 + }, + { + "epoch": 0.3684, + "grad_norm": 0.2787289321422577, + "learning_rate": 1.592294464767205e-05, + "loss": 0.0081, + "step": 18420 + }, + { + "epoch": 0.36844, + "grad_norm": 1.2176587581634521, + "learning_rate": 1.5921819590384057e-05, + "loss": 0.0863, + "step": 18422 + }, + { + "epoch": 0.36848, + "grad_norm": 3.221731185913086, + "learning_rate": 1.5920694417647144e-05, + "loss": 0.4793, + "step": 18424 + }, + { + "epoch": 0.36852, + "grad_norm": 0.0953148603439331, + "learning_rate": 1.5919569129483244e-05, + "loss": 0.1919, + "step": 18426 + }, + { + "epoch": 0.36856, + "grad_norm": 2.680712938308716, + "learning_rate": 1.5918443725914298e-05, + "loss": 0.079, + "step": 18428 + }, + { + "epoch": 0.3686, + "grad_norm": 1.1151096820831299, + "learning_rate": 1.591731820696224e-05, + "loss": 0.1927, + "step": 18430 + }, + { + "epoch": 0.36864, + "grad_norm": 0.5400416851043701, + "learning_rate": 1.591619257264902e-05, + "loss": 0.0308, + "step": 18432 + }, + { + "epoch": 0.36868, + "grad_norm": 6.309864521026611, + "learning_rate": 1.591506682299658e-05, + "loss": 0.501, + "step": 18434 + }, + { + "epoch": 0.36872, + "grad_norm": 0.3881809115409851, + "learning_rate": 1.5913940958026867e-05, + "loss": 0.0992, + "step": 18436 + }, + { + "epoch": 0.36876, + "grad_norm": 0.5603182315826416, + "learning_rate": 1.591281497776183e-05, + "loss": 0.1484, + "step": 18438 + }, + { + "epoch": 0.3688, + "grad_norm": 0.44966796040534973, + "learning_rate": 1.591168888222342e-05, + "loss": 0.0432, + "step": 18440 + }, + { + "epoch": 0.36884, + "grad_norm": 5.229931354522705, + "learning_rate": 1.5910562671433594e-05, + "loss": 0.3196, + "step": 18442 + }, + { + "epoch": 0.36888, + "grad_norm": 6.491963863372803, + "learning_rate": 1.5909436345414306e-05, + "loss": 0.3902, + "step": 18444 + }, + { + "epoch": 0.36892, + "grad_norm": 0.297059565782547, + "learning_rate": 1.5908309904187508e-05, + "loss": 0.0137, + "step": 18446 + }, + { + "epoch": 0.36896, + "grad_norm": 2.1344401836395264, + "learning_rate": 1.5907183347775173e-05, + "loss": 0.0581, + "step": 18448 + }, + { + "epoch": 0.369, + "grad_norm": 0.33805540204048157, + "learning_rate": 1.5906056676199256e-05, + "loss": 0.0323, + "step": 18450 + }, + { + "epoch": 0.36904, + "grad_norm": 0.41675829887390137, + "learning_rate": 1.5904929889481722e-05, + "loss": 0.2921, + "step": 18452 + }, + { + "epoch": 0.36908, + "grad_norm": 5.941579818725586, + "learning_rate": 1.590380298764454e-05, + "loss": 0.3318, + "step": 18454 + }, + { + "epoch": 0.36912, + "grad_norm": 0.5312708616256714, + "learning_rate": 1.590267597070968e-05, + "loss": 0.1112, + "step": 18456 + }, + { + "epoch": 0.36916, + "grad_norm": 0.36362579464912415, + "learning_rate": 1.5901548838699114e-05, + "loss": 0.0227, + "step": 18458 + }, + { + "epoch": 0.3692, + "grad_norm": 1.0887210369110107, + "learning_rate": 1.5900421591634813e-05, + "loss": 0.0411, + "step": 18460 + }, + { + "epoch": 0.36924, + "grad_norm": 2.421297788619995, + "learning_rate": 1.589929422953876e-05, + "loss": 0.0941, + "step": 18462 + }, + { + "epoch": 0.36928, + "grad_norm": 0.956271767616272, + "learning_rate": 1.589816675243292e-05, + "loss": 0.043, + "step": 18464 + }, + { + "epoch": 0.36932, + "grad_norm": 3.4353439807891846, + "learning_rate": 1.589703916033929e-05, + "loss": 0.1583, + "step": 18466 + }, + { + "epoch": 0.36936, + "grad_norm": 0.2543022930622101, + "learning_rate": 1.5895911453279844e-05, + "loss": 0.0933, + "step": 18468 + }, + { + "epoch": 0.3694, + "grad_norm": 0.9411227107048035, + "learning_rate": 1.589478363127657e-05, + "loss": 0.0377, + "step": 18470 + }, + { + "epoch": 0.36944, + "grad_norm": 8.089996337890625, + "learning_rate": 1.5893655694351447e-05, + "loss": 0.4063, + "step": 18472 + }, + { + "epoch": 0.36948, + "grad_norm": 0.5063104033470154, + "learning_rate": 1.589252764252648e-05, + "loss": 0.0238, + "step": 18474 + }, + { + "epoch": 0.36952, + "grad_norm": 2.274550676345825, + "learning_rate": 1.589139947582365e-05, + "loss": 0.2366, + "step": 18476 + }, + { + "epoch": 0.36956, + "grad_norm": 2.4536540508270264, + "learning_rate": 1.5890271194264953e-05, + "loss": 0.1295, + "step": 18478 + }, + { + "epoch": 0.3696, + "grad_norm": 0.8217976093292236, + "learning_rate": 1.5889142797872387e-05, + "loss": 0.2066, + "step": 18480 + }, + { + "epoch": 0.36964, + "grad_norm": 0.07187756896018982, + "learning_rate": 1.5888014286667953e-05, + "loss": 0.1283, + "step": 18482 + }, + { + "epoch": 0.36968, + "grad_norm": 5.9583635330200195, + "learning_rate": 1.5886885660673645e-05, + "loss": 0.2677, + "step": 18484 + }, + { + "epoch": 0.36972, + "grad_norm": 6.774234771728516, + "learning_rate": 1.5885756919911475e-05, + "loss": 0.3633, + "step": 18486 + }, + { + "epoch": 0.36976, + "grad_norm": 0.21496474742889404, + "learning_rate": 1.588462806440344e-05, + "loss": 0.0353, + "step": 18488 + }, + { + "epoch": 0.3698, + "grad_norm": 1.7285622358322144, + "learning_rate": 1.5883499094171556e-05, + "loss": 0.1604, + "step": 18490 + }, + { + "epoch": 0.36984, + "grad_norm": 3.1683316230773926, + "learning_rate": 1.5882370009237823e-05, + "loss": 0.099, + "step": 18492 + }, + { + "epoch": 0.36988, + "grad_norm": 0.5872227549552917, + "learning_rate": 1.5881240809624258e-05, + "loss": 0.0223, + "step": 18494 + }, + { + "epoch": 0.36992, + "grad_norm": 0.30561351776123047, + "learning_rate": 1.5880111495352878e-05, + "loss": 0.495, + "step": 18496 + }, + { + "epoch": 0.36996, + "grad_norm": 3.5608952045440674, + "learning_rate": 1.5878982066445695e-05, + "loss": 0.1877, + "step": 18498 + }, + { + "epoch": 0.37, + "grad_norm": 1.785874605178833, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.0752, + "step": 18500 + }, + { + "epoch": 0.37004, + "grad_norm": 3.7873222827911377, + "learning_rate": 1.587672286481201e-05, + "loss": 0.424, + "step": 18502 + }, + { + "epoch": 0.37008, + "grad_norm": 0.4142662584781647, + "learning_rate": 1.5875593092129546e-05, + "loss": 0.4213, + "step": 18504 + }, + { + "epoch": 0.37012, + "grad_norm": 1.343969464302063, + "learning_rate": 1.5874463204899375e-05, + "loss": 0.6966, + "step": 18506 + }, + { + "epoch": 0.37016, + "grad_norm": 2.0821526050567627, + "learning_rate": 1.587333320314351e-05, + "loss": 0.0703, + "step": 18508 + }, + { + "epoch": 0.3702, + "grad_norm": 1.949179768562317, + "learning_rate": 1.5872203086883996e-05, + "loss": 0.0836, + "step": 18510 + }, + { + "epoch": 0.37024, + "grad_norm": 1.7877384424209595, + "learning_rate": 1.5871072856142862e-05, + "loss": 0.0834, + "step": 18512 + }, + { + "epoch": 0.37028, + "grad_norm": 2.3005967140197754, + "learning_rate": 1.586994251094214e-05, + "loss": 0.2274, + "step": 18514 + }, + { + "epoch": 0.37032, + "grad_norm": 6.586012363433838, + "learning_rate": 1.586881205130386e-05, + "loss": 0.4069, + "step": 18516 + }, + { + "epoch": 0.37036, + "grad_norm": 1.9597316980361938, + "learning_rate": 1.586768147725007e-05, + "loss": 0.1025, + "step": 18518 + }, + { + "epoch": 0.3704, + "grad_norm": 0.7521215081214905, + "learning_rate": 1.5866550788802815e-05, + "loss": 0.0318, + "step": 18520 + }, + { + "epoch": 0.37044, + "grad_norm": 1.5997519493103027, + "learning_rate": 1.5865419985984126e-05, + "loss": 0.071, + "step": 18522 + }, + { + "epoch": 0.37048, + "grad_norm": 1.1867830753326416, + "learning_rate": 1.5864289068816057e-05, + "loss": 0.2683, + "step": 18524 + }, + { + "epoch": 0.37052, + "grad_norm": 1.6466953754425049, + "learning_rate": 1.5863158037320654e-05, + "loss": 0.342, + "step": 18526 + }, + { + "epoch": 0.37056, + "grad_norm": 1.304824948310852, + "learning_rate": 1.5862026891519967e-05, + "loss": 0.0901, + "step": 18528 + }, + { + "epoch": 0.3706, + "grad_norm": 2.736896514892578, + "learning_rate": 1.5860895631436044e-05, + "loss": 0.1214, + "step": 18530 + }, + { + "epoch": 0.37064, + "grad_norm": 0.4320034384727478, + "learning_rate": 1.5859764257090947e-05, + "loss": 0.0701, + "step": 18532 + }, + { + "epoch": 0.37068, + "grad_norm": 2.24123215675354, + "learning_rate": 1.5858632768506727e-05, + "loss": 0.0955, + "step": 18534 + }, + { + "epoch": 0.37072, + "grad_norm": 3.3872480392456055, + "learning_rate": 1.5857501165705443e-05, + "loss": 0.1295, + "step": 18536 + }, + { + "epoch": 0.37076, + "grad_norm": 2.7988216876983643, + "learning_rate": 1.5856369448709163e-05, + "loss": 0.0918, + "step": 18538 + }, + { + "epoch": 0.3708, + "grad_norm": 1.2397958040237427, + "learning_rate": 1.5855237617539943e-05, + "loss": 0.1533, + "step": 18540 + }, + { + "epoch": 0.37084, + "grad_norm": 0.9438172578811646, + "learning_rate": 1.585410567221985e-05, + "loss": 0.0638, + "step": 18542 + }, + { + "epoch": 0.37088, + "grad_norm": 0.34480735659599304, + "learning_rate": 1.5852973612770958e-05, + "loss": 0.1347, + "step": 18544 + }, + { + "epoch": 0.37092, + "grad_norm": 0.08892511576414108, + "learning_rate": 1.5851841439215326e-05, + "loss": 0.0096, + "step": 18546 + }, + { + "epoch": 0.37096, + "grad_norm": 0.1502445787191391, + "learning_rate": 1.5850709151575033e-05, + "loss": 0.1305, + "step": 18548 + }, + { + "epoch": 0.371, + "grad_norm": 3.1955223083496094, + "learning_rate": 1.584957674987216e-05, + "loss": 0.234, + "step": 18550 + }, + { + "epoch": 0.37104, + "grad_norm": 2.723264217376709, + "learning_rate": 1.584844423412877e-05, + "loss": 0.2877, + "step": 18552 + }, + { + "epoch": 0.37108, + "grad_norm": 0.15198270976543427, + "learning_rate": 1.584731160436695e-05, + "loss": 0.0761, + "step": 18554 + }, + { + "epoch": 0.37112, + "grad_norm": 1.1011923551559448, + "learning_rate": 1.584617886060878e-05, + "loss": 0.0507, + "step": 18556 + }, + { + "epoch": 0.37116, + "grad_norm": 0.7618362903594971, + "learning_rate": 1.584504600287634e-05, + "loss": 0.1504, + "step": 18558 + }, + { + "epoch": 0.3712, + "grad_norm": 2.553861379623413, + "learning_rate": 1.5843913031191722e-05, + "loss": 0.1344, + "step": 18560 + }, + { + "epoch": 0.37124, + "grad_norm": 0.11956460028886795, + "learning_rate": 1.584277994557701e-05, + "loss": 0.5113, + "step": 18562 + }, + { + "epoch": 0.37128, + "grad_norm": 3.8547658920288086, + "learning_rate": 1.5841646746054297e-05, + "loss": 0.1096, + "step": 18564 + }, + { + "epoch": 0.37132, + "grad_norm": 4.100532531738281, + "learning_rate": 1.5840513432645674e-05, + "loss": 0.1414, + "step": 18566 + }, + { + "epoch": 0.37136, + "grad_norm": 2.327911376953125, + "learning_rate": 1.583938000537323e-05, + "loss": 0.0729, + "step": 18568 + }, + { + "epoch": 0.3714, + "grad_norm": 0.427388459444046, + "learning_rate": 1.583824646425907e-05, + "loss": 0.0215, + "step": 18570 + }, + { + "epoch": 0.37144, + "grad_norm": 0.05809508636593819, + "learning_rate": 1.583711280932529e-05, + "loss": 0.0182, + "step": 18572 + }, + { + "epoch": 0.37148, + "grad_norm": 0.4485122561454773, + "learning_rate": 1.583597904059399e-05, + "loss": 0.0992, + "step": 18574 + }, + { + "epoch": 0.37152, + "grad_norm": 5.398560523986816, + "learning_rate": 1.5834845158087274e-05, + "loss": 0.3608, + "step": 18576 + }, + { + "epoch": 0.37156, + "grad_norm": 1.0758551359176636, + "learning_rate": 1.5833711161827244e-05, + "loss": 0.0773, + "step": 18578 + }, + { + "epoch": 0.3716, + "grad_norm": 1.3610153198242188, + "learning_rate": 1.5832577051836016e-05, + "loss": 0.1019, + "step": 18580 + }, + { + "epoch": 0.37164, + "grad_norm": 0.5909686088562012, + "learning_rate": 1.5831442828135693e-05, + "loss": 0.4469, + "step": 18582 + }, + { + "epoch": 0.37168, + "grad_norm": 0.09332767128944397, + "learning_rate": 1.5830308490748394e-05, + "loss": 0.0238, + "step": 18584 + }, + { + "epoch": 0.37172, + "grad_norm": 0.9352405071258545, + "learning_rate": 1.5829174039696226e-05, + "loss": 0.0285, + "step": 18586 + }, + { + "epoch": 0.37176, + "grad_norm": 2.243422508239746, + "learning_rate": 1.582803947500131e-05, + "loss": 0.1211, + "step": 18588 + }, + { + "epoch": 0.3718, + "grad_norm": 0.7703618407249451, + "learning_rate": 1.5826904796685763e-05, + "loss": 0.0684, + "step": 18590 + }, + { + "epoch": 0.37184, + "grad_norm": 2.9413881301879883, + "learning_rate": 1.5825770004771704e-05, + "loss": 0.1838, + "step": 18592 + }, + { + "epoch": 0.37188, + "grad_norm": 1.7377849817276, + "learning_rate": 1.5824635099281264e-05, + "loss": 0.2082, + "step": 18594 + }, + { + "epoch": 0.37192, + "grad_norm": 0.06803534179925919, + "learning_rate": 1.5823500080236563e-05, + "loss": 0.0488, + "step": 18596 + }, + { + "epoch": 0.37196, + "grad_norm": 0.7298295497894287, + "learning_rate": 1.582236494765973e-05, + "loss": 0.0174, + "step": 18598 + }, + { + "epoch": 0.372, + "grad_norm": 0.5400382876396179, + "learning_rate": 1.5821229701572897e-05, + "loss": 0.165, + "step": 18600 + }, + { + "epoch": 0.37204, + "grad_norm": 0.7345936894416809, + "learning_rate": 1.582009434199819e-05, + "loss": 0.0458, + "step": 18602 + }, + { + "epoch": 0.37208, + "grad_norm": 0.3909364640712738, + "learning_rate": 1.581895886895775e-05, + "loss": 0.0966, + "step": 18604 + }, + { + "epoch": 0.37212, + "grad_norm": 5.576320171356201, + "learning_rate": 1.581782328247371e-05, + "loss": 0.4018, + "step": 18606 + }, + { + "epoch": 0.37216, + "grad_norm": 1.0786551237106323, + "learning_rate": 1.581668758256821e-05, + "loss": 0.0394, + "step": 18608 + }, + { + "epoch": 0.3722, + "grad_norm": 6.3442230224609375, + "learning_rate": 1.5815551769263387e-05, + "loss": 0.2501, + "step": 18610 + }, + { + "epoch": 0.37224, + "grad_norm": 0.20456373691558838, + "learning_rate": 1.5814415842581396e-05, + "loss": 0.0102, + "step": 18612 + }, + { + "epoch": 0.37228, + "grad_norm": 3.124368667602539, + "learning_rate": 1.5813279802544367e-05, + "loss": 0.1035, + "step": 18614 + }, + { + "epoch": 0.37232, + "grad_norm": 3.6895718574523926, + "learning_rate": 1.5812143649174462e-05, + "loss": 0.1644, + "step": 18616 + }, + { + "epoch": 0.37236, + "grad_norm": 0.3282232880592346, + "learning_rate": 1.5811007382493818e-05, + "loss": 0.0109, + "step": 18618 + }, + { + "epoch": 0.3724, + "grad_norm": 2.917626142501831, + "learning_rate": 1.5809871002524602e-05, + "loss": 0.0925, + "step": 18620 + }, + { + "epoch": 0.37244, + "grad_norm": 0.8484794497489929, + "learning_rate": 1.580873450928895e-05, + "loss": 0.0264, + "step": 18622 + }, + { + "epoch": 0.37248, + "grad_norm": 4.426766872406006, + "learning_rate": 1.5807597902809036e-05, + "loss": 0.4006, + "step": 18624 + }, + { + "epoch": 0.37252, + "grad_norm": 2.128451108932495, + "learning_rate": 1.5806461183107007e-05, + "loss": 0.0515, + "step": 18626 + }, + { + "epoch": 0.37256, + "grad_norm": 1.2686827182769775, + "learning_rate": 1.5805324350205028e-05, + "loss": 0.2726, + "step": 18628 + }, + { + "epoch": 0.3726, + "grad_norm": 1.3040310144424438, + "learning_rate": 1.580418740412526e-05, + "loss": 0.0431, + "step": 18630 + }, + { + "epoch": 0.37264, + "grad_norm": 3.969554901123047, + "learning_rate": 1.5803050344889876e-05, + "loss": 0.177, + "step": 18632 + }, + { + "epoch": 0.37268, + "grad_norm": 1.6701476573944092, + "learning_rate": 1.5801913172521032e-05, + "loss": 0.0583, + "step": 18634 + }, + { + "epoch": 0.37272, + "grad_norm": 2.3122432231903076, + "learning_rate": 1.5800775887040906e-05, + "loss": 0.079, + "step": 18636 + }, + { + "epoch": 0.37276, + "grad_norm": 2.0830166339874268, + "learning_rate": 1.579963848847167e-05, + "loss": 0.0591, + "step": 18638 + }, + { + "epoch": 0.3728, + "grad_norm": 0.36257487535476685, + "learning_rate": 1.5798500976835493e-05, + "loss": 0.0352, + "step": 18640 + }, + { + "epoch": 0.37284, + "grad_norm": 0.19620782136917114, + "learning_rate": 1.5797363352154555e-05, + "loss": 0.0633, + "step": 18642 + }, + { + "epoch": 0.37288, + "grad_norm": 1.2955976724624634, + "learning_rate": 1.5796225614451034e-05, + "loss": 0.0332, + "step": 18644 + }, + { + "epoch": 0.37292, + "grad_norm": 6.3344807624816895, + "learning_rate": 1.579508776374711e-05, + "loss": 1.1165, + "step": 18646 + }, + { + "epoch": 0.37296, + "grad_norm": 4.017623424530029, + "learning_rate": 1.5793949800064966e-05, + "loss": 0.1866, + "step": 18648 + }, + { + "epoch": 0.373, + "grad_norm": 1.4126598834991455, + "learning_rate": 1.5792811723426787e-05, + "loss": 0.045, + "step": 18650 + }, + { + "epoch": 0.37304, + "grad_norm": 3.800299644470215, + "learning_rate": 1.5791673533854766e-05, + "loss": 0.1414, + "step": 18652 + }, + { + "epoch": 0.37308, + "grad_norm": 0.933398962020874, + "learning_rate": 1.5790535231371085e-05, + "loss": 0.0244, + "step": 18654 + }, + { + "epoch": 0.37312, + "grad_norm": 4.449921131134033, + "learning_rate": 1.578939681599794e-05, + "loss": 0.3138, + "step": 18656 + }, + { + "epoch": 0.37316, + "grad_norm": 0.8803878426551819, + "learning_rate": 1.5788258287757517e-05, + "loss": 0.1116, + "step": 18658 + }, + { + "epoch": 0.3732, + "grad_norm": 0.18315483629703522, + "learning_rate": 1.5787119646672025e-05, + "loss": 0.0169, + "step": 18660 + }, + { + "epoch": 0.37324, + "grad_norm": 0.1304912567138672, + "learning_rate": 1.5785980892763656e-05, + "loss": 0.0092, + "step": 18662 + }, + { + "epoch": 0.37328, + "grad_norm": 0.1878766417503357, + "learning_rate": 1.5784842026054605e-05, + "loss": 0.0233, + "step": 18664 + }, + { + "epoch": 0.37332, + "grad_norm": 0.6090713143348694, + "learning_rate": 1.5783703046567084e-05, + "loss": 0.0419, + "step": 18666 + }, + { + "epoch": 0.37336, + "grad_norm": 0.2690102159976959, + "learning_rate": 1.5782563954323296e-05, + "loss": 0.0938, + "step": 18668 + }, + { + "epoch": 0.3734, + "grad_norm": 1.4628255367279053, + "learning_rate": 1.5781424749345447e-05, + "loss": 0.0507, + "step": 18670 + }, + { + "epoch": 0.37344, + "grad_norm": 1.1694756746292114, + "learning_rate": 1.5780285431655743e-05, + "loss": 0.114, + "step": 18672 + }, + { + "epoch": 0.37348, + "grad_norm": 0.44556039571762085, + "learning_rate": 1.57791460012764e-05, + "loss": 0.0121, + "step": 18674 + }, + { + "epoch": 0.37352, + "grad_norm": 1.9168649911880493, + "learning_rate": 1.5778006458229632e-05, + "loss": 0.1168, + "step": 18676 + }, + { + "epoch": 0.37356, + "grad_norm": 0.21417728066444397, + "learning_rate": 1.577686680253765e-05, + "loss": 0.0429, + "step": 18678 + }, + { + "epoch": 0.3736, + "grad_norm": 1.582472801208496, + "learning_rate": 1.5775727034222675e-05, + "loss": 0.3616, + "step": 18680 + }, + { + "epoch": 0.37364, + "grad_norm": 0.9811189770698547, + "learning_rate": 1.5774587153306933e-05, + "loss": 0.3972, + "step": 18682 + }, + { + "epoch": 0.37368, + "grad_norm": 1.1116573810577393, + "learning_rate": 1.577344715981264e-05, + "loss": 0.0647, + "step": 18684 + }, + { + "epoch": 0.37372, + "grad_norm": 0.4201771914958954, + "learning_rate": 1.5772307053762022e-05, + "loss": 0.0243, + "step": 18686 + }, + { + "epoch": 0.37376, + "grad_norm": 1.0702892541885376, + "learning_rate": 1.5771166835177304e-05, + "loss": 0.0375, + "step": 18688 + }, + { + "epoch": 0.3738, + "grad_norm": 1.42148756980896, + "learning_rate": 1.577002650408072e-05, + "loss": 0.0452, + "step": 18690 + }, + { + "epoch": 0.37384, + "grad_norm": 7.385735511779785, + "learning_rate": 1.5768886060494496e-05, + "loss": 0.3051, + "step": 18692 + }, + { + "epoch": 0.37388, + "grad_norm": 0.1864384561777115, + "learning_rate": 1.5767745504440868e-05, + "loss": 0.0308, + "step": 18694 + }, + { + "epoch": 0.37392, + "grad_norm": 1.727019190788269, + "learning_rate": 1.5766604835942074e-05, + "loss": 0.1877, + "step": 18696 + }, + { + "epoch": 0.37396, + "grad_norm": 0.10412977635860443, + "learning_rate": 1.5765464055020348e-05, + "loss": 0.067, + "step": 18698 + }, + { + "epoch": 0.374, + "grad_norm": 0.5202248096466064, + "learning_rate": 1.5764323161697933e-05, + "loss": 0.0752, + "step": 18700 + }, + { + "epoch": 0.37404, + "grad_norm": 0.2997385859489441, + "learning_rate": 1.576318215599707e-05, + "loss": 0.3258, + "step": 18702 + }, + { + "epoch": 0.37408, + "grad_norm": 0.06612567603588104, + "learning_rate": 1.5762041037940006e-05, + "loss": 0.3198, + "step": 18704 + }, + { + "epoch": 0.37412, + "grad_norm": 0.178510844707489, + "learning_rate": 1.5760899807548985e-05, + "loss": 0.0706, + "step": 18706 + }, + { + "epoch": 0.37416, + "grad_norm": 2.5286028385162354, + "learning_rate": 1.5759758464846253e-05, + "loss": 0.1007, + "step": 18708 + }, + { + "epoch": 0.3742, + "grad_norm": 1.3461700677871704, + "learning_rate": 1.5758617009854068e-05, + "loss": 0.1025, + "step": 18710 + }, + { + "epoch": 0.37424, + "grad_norm": 0.520829975605011, + "learning_rate": 1.575747544259468e-05, + "loss": 0.2329, + "step": 18712 + }, + { + "epoch": 0.37428, + "grad_norm": 0.1499163955450058, + "learning_rate": 1.5756333763090343e-05, + "loss": 0.0505, + "step": 18714 + }, + { + "epoch": 0.37432, + "grad_norm": 6.091121196746826, + "learning_rate": 1.5755191971363313e-05, + "loss": 0.3002, + "step": 18716 + }, + { + "epoch": 0.37436, + "grad_norm": 0.43238335847854614, + "learning_rate": 1.5754050067435853e-05, + "loss": 0.245, + "step": 18718 + }, + { + "epoch": 0.3744, + "grad_norm": 0.8499479293823242, + "learning_rate": 1.575290805133023e-05, + "loss": 0.0264, + "step": 18720 + }, + { + "epoch": 0.37444, + "grad_norm": 0.5448671579360962, + "learning_rate": 1.5751765923068694e-05, + "loss": 0.0283, + "step": 18722 + }, + { + "epoch": 0.37448, + "grad_norm": 5.969097137451172, + "learning_rate": 1.5750623682673526e-05, + "loss": 0.3038, + "step": 18724 + }, + { + "epoch": 0.37452, + "grad_norm": 1.7946648597717285, + "learning_rate": 1.574948133016699e-05, + "loss": 0.3711, + "step": 18726 + }, + { + "epoch": 0.37456, + "grad_norm": 0.4349295198917389, + "learning_rate": 1.5748338865571355e-05, + "loss": 0.0168, + "step": 18728 + }, + { + "epoch": 0.3746, + "grad_norm": 1.4279255867004395, + "learning_rate": 1.5747196288908887e-05, + "loss": 0.0509, + "step": 18730 + }, + { + "epoch": 0.37464, + "grad_norm": 0.09744566679000854, + "learning_rate": 1.5746053600201874e-05, + "loss": 0.0217, + "step": 18732 + }, + { + "epoch": 0.37468, + "grad_norm": 1.4484037160873413, + "learning_rate": 1.574491079947259e-05, + "loss": 0.1687, + "step": 18734 + }, + { + "epoch": 0.37472, + "grad_norm": 2.6288540363311768, + "learning_rate": 1.5743767886743312e-05, + "loss": 0.1265, + "step": 18736 + }, + { + "epoch": 0.37476, + "grad_norm": 0.16769097745418549, + "learning_rate": 1.5742624862036316e-05, + "loss": 0.1798, + "step": 18738 + }, + { + "epoch": 0.3748, + "grad_norm": 1.6165450811386108, + "learning_rate": 1.57414817253739e-05, + "loss": 0.0922, + "step": 18740 + }, + { + "epoch": 0.37484, + "grad_norm": 1.0364428758621216, + "learning_rate": 1.5740338476778335e-05, + "loss": 0.1928, + "step": 18742 + }, + { + "epoch": 0.37488, + "grad_norm": 0.8943467736244202, + "learning_rate": 1.5739195116271918e-05, + "loss": 0.022, + "step": 18744 + }, + { + "epoch": 0.37492, + "grad_norm": 1.4349443912506104, + "learning_rate": 1.573805164387694e-05, + "loss": 0.0444, + "step": 18746 + }, + { + "epoch": 0.37496, + "grad_norm": 0.2693409323692322, + "learning_rate": 1.5736908059615687e-05, + "loss": 0.1027, + "step": 18748 + }, + { + "epoch": 0.375, + "grad_norm": 3.945185422897339, + "learning_rate": 1.573576436351046e-05, + "loss": 0.2423, + "step": 18750 + }, + { + "epoch": 0.37504, + "grad_norm": 0.444303035736084, + "learning_rate": 1.5734620555583555e-05, + "loss": 0.0199, + "step": 18752 + }, + { + "epoch": 0.37508, + "grad_norm": 0.27372655272483826, + "learning_rate": 1.573347663585727e-05, + "loss": 0.0597, + "step": 18754 + }, + { + "epoch": 0.37512, + "grad_norm": 3.657891273498535, + "learning_rate": 1.5732332604353904e-05, + "loss": 0.2141, + "step": 18756 + }, + { + "epoch": 0.37516, + "grad_norm": 0.5718064308166504, + "learning_rate": 1.5731188461095764e-05, + "loss": 0.0178, + "step": 18758 + }, + { + "epoch": 0.3752, + "grad_norm": 0.1478005051612854, + "learning_rate": 1.5730044206105156e-05, + "loss": 0.0686, + "step": 18760 + }, + { + "epoch": 0.37524, + "grad_norm": 0.5320046544075012, + "learning_rate": 1.5728899839404384e-05, + "loss": 0.0243, + "step": 18762 + }, + { + "epoch": 0.37528, + "grad_norm": 7.929348945617676, + "learning_rate": 1.5727755361015758e-05, + "loss": 0.3257, + "step": 18764 + }, + { + "epoch": 0.37532, + "grad_norm": 1.1491146087646484, + "learning_rate": 1.5726610770961596e-05, + "loss": 0.0332, + "step": 18766 + }, + { + "epoch": 0.37536, + "grad_norm": 0.442731648683548, + "learning_rate": 1.572546606926421e-05, + "loss": 0.0199, + "step": 18768 + }, + { + "epoch": 0.3754, + "grad_norm": 0.7782795429229736, + "learning_rate": 1.572432125594591e-05, + "loss": 0.1043, + "step": 18770 + }, + { + "epoch": 0.37544, + "grad_norm": 0.19453571736812592, + "learning_rate": 1.572317633102902e-05, + "loss": 0.0326, + "step": 18772 + }, + { + "epoch": 0.37548, + "grad_norm": 1.7861778736114502, + "learning_rate": 1.5722031294535868e-05, + "loss": 0.0441, + "step": 18774 + }, + { + "epoch": 0.37552, + "grad_norm": 2.358484983444214, + "learning_rate": 1.5720886146488767e-05, + "loss": 0.086, + "step": 18776 + }, + { + "epoch": 0.37556, + "grad_norm": 3.7254066467285156, + "learning_rate": 1.5719740886910043e-05, + "loss": 0.2214, + "step": 18778 + }, + { + "epoch": 0.3756, + "grad_norm": 0.12799592316150665, + "learning_rate": 1.5718595515822027e-05, + "loss": 0.0103, + "step": 18780 + }, + { + "epoch": 0.37564, + "grad_norm": 1.6138910055160522, + "learning_rate": 1.5717450033247046e-05, + "loss": 0.0528, + "step": 18782 + }, + { + "epoch": 0.37568, + "grad_norm": 0.7521283030509949, + "learning_rate": 1.5716304439207435e-05, + "loss": 0.0526, + "step": 18784 + }, + { + "epoch": 0.37572, + "grad_norm": 0.13133372366428375, + "learning_rate": 1.5715158733725523e-05, + "loss": 0.016, + "step": 18786 + }, + { + "epoch": 0.37576, + "grad_norm": 0.208866149187088, + "learning_rate": 1.5714012916823653e-05, + "loss": 0.0387, + "step": 18788 + }, + { + "epoch": 0.3758, + "grad_norm": 0.4437614977359772, + "learning_rate": 1.5712866988524157e-05, + "loss": 0.0225, + "step": 18790 + }, + { + "epoch": 0.37584, + "grad_norm": 0.02481985278427601, + "learning_rate": 1.571172094884938e-05, + "loss": 0.1764, + "step": 18792 + }, + { + "epoch": 0.37588, + "grad_norm": 0.14045919477939606, + "learning_rate": 1.571057479782166e-05, + "loss": 0.0108, + "step": 18794 + }, + { + "epoch": 0.37592, + "grad_norm": 3.7837460041046143, + "learning_rate": 1.5709428535463345e-05, + "loss": 0.1643, + "step": 18796 + }, + { + "epoch": 0.37596, + "grad_norm": 2.2750840187072754, + "learning_rate": 1.570828216179678e-05, + "loss": 0.1165, + "step": 18798 + }, + { + "epoch": 0.376, + "grad_norm": 0.04881409555673599, + "learning_rate": 1.570713567684432e-05, + "loss": 0.0026, + "step": 18800 + }, + { + "epoch": 0.37604, + "grad_norm": 0.22542938590049744, + "learning_rate": 1.570598908062831e-05, + "loss": 0.006, + "step": 18802 + }, + { + "epoch": 0.37608, + "grad_norm": 0.15002131462097168, + "learning_rate": 1.57048423731711e-05, + "loss": 0.179, + "step": 18804 + }, + { + "epoch": 0.37612, + "grad_norm": 6.637826442718506, + "learning_rate": 1.5703695554495055e-05, + "loss": 0.3223, + "step": 18806 + }, + { + "epoch": 0.37616, + "grad_norm": 1.3303861618041992, + "learning_rate": 1.570254862462253e-05, + "loss": 0.1414, + "step": 18808 + }, + { + "epoch": 0.3762, + "grad_norm": 0.09150140732526779, + "learning_rate": 1.5701401583575883e-05, + "loss": 0.0256, + "step": 18810 + }, + { + "epoch": 0.37624, + "grad_norm": 4.582808494567871, + "learning_rate": 1.570025443137748e-05, + "loss": 0.1685, + "step": 18812 + }, + { + "epoch": 0.37628, + "grad_norm": 5.373281002044678, + "learning_rate": 1.5699107168049672e-05, + "loss": 0.1861, + "step": 18814 + }, + { + "epoch": 0.37632, + "grad_norm": 2.1712331771850586, + "learning_rate": 1.5697959793614844e-05, + "loss": 0.0638, + "step": 18816 + }, + { + "epoch": 0.37636, + "grad_norm": 1.1708221435546875, + "learning_rate": 1.5696812308095354e-05, + "loss": 0.0428, + "step": 18818 + }, + { + "epoch": 0.3764, + "grad_norm": 0.9238252639770508, + "learning_rate": 1.5695664711513575e-05, + "loss": 0.0246, + "step": 18820 + }, + { + "epoch": 0.37644, + "grad_norm": 0.3966056704521179, + "learning_rate": 1.569451700389188e-05, + "loss": 0.0239, + "step": 18822 + }, + { + "epoch": 0.37648, + "grad_norm": 0.3123132586479187, + "learning_rate": 1.5693369185252648e-05, + "loss": 0.0767, + "step": 18824 + }, + { + "epoch": 0.37652, + "grad_norm": 0.8621528744697571, + "learning_rate": 1.5692221255618246e-05, + "loss": 0.1211, + "step": 18826 + }, + { + "epoch": 0.37656, + "grad_norm": 0.35545283555984497, + "learning_rate": 1.5691073215011066e-05, + "loss": 0.078, + "step": 18828 + }, + { + "epoch": 0.3766, + "grad_norm": 12.334202766418457, + "learning_rate": 1.5689925063453483e-05, + "loss": 0.4921, + "step": 18830 + }, + { + "epoch": 0.37664, + "grad_norm": 7.689411163330078, + "learning_rate": 1.5688776800967876e-05, + "loss": 0.1914, + "step": 18832 + }, + { + "epoch": 0.37668, + "grad_norm": 1.522964596748352, + "learning_rate": 1.5687628427576643e-05, + "loss": 0.054, + "step": 18834 + }, + { + "epoch": 0.37672, + "grad_norm": 2.9637506008148193, + "learning_rate": 1.568647994330216e-05, + "loss": 0.074, + "step": 18836 + }, + { + "epoch": 0.37676, + "grad_norm": 0.6118870973587036, + "learning_rate": 1.5685331348166827e-05, + "loss": 0.0657, + "step": 18838 + }, + { + "epoch": 0.3768, + "grad_norm": 2.0934112071990967, + "learning_rate": 1.568418264219303e-05, + "loss": 0.4825, + "step": 18840 + }, + { + "epoch": 0.37684, + "grad_norm": 0.5413713455200195, + "learning_rate": 1.568303382540317e-05, + "loss": 0.0278, + "step": 18842 + }, + { + "epoch": 0.37688, + "grad_norm": 0.5372637510299683, + "learning_rate": 1.5681884897819636e-05, + "loss": 0.0588, + "step": 18844 + }, + { + "epoch": 0.37692, + "grad_norm": 0.15016813576221466, + "learning_rate": 1.568073585946483e-05, + "loss": 0.0245, + "step": 18846 + }, + { + "epoch": 0.37696, + "grad_norm": 0.2765910029411316, + "learning_rate": 1.5679586710361155e-05, + "loss": 0.1954, + "step": 18848 + }, + { + "epoch": 0.377, + "grad_norm": 4.371386528015137, + "learning_rate": 1.5678437450531014e-05, + "loss": 0.2215, + "step": 18850 + }, + { + "epoch": 0.37704, + "grad_norm": 0.22658534348011017, + "learning_rate": 1.567728807999681e-05, + "loss": 0.0201, + "step": 18852 + }, + { + "epoch": 0.37708, + "grad_norm": 2.2802934646606445, + "learning_rate": 1.567613859878095e-05, + "loss": 0.1487, + "step": 18854 + }, + { + "epoch": 0.37712, + "grad_norm": 6.491363525390625, + "learning_rate": 1.567498900690585e-05, + "loss": 0.326, + "step": 18856 + }, + { + "epoch": 0.37716, + "grad_norm": 0.10868099331855774, + "learning_rate": 1.567383930439392e-05, + "loss": 0.0074, + "step": 18858 + }, + { + "epoch": 0.3772, + "grad_norm": 0.667709469795227, + "learning_rate": 1.567268949126757e-05, + "loss": 0.3006, + "step": 18860 + }, + { + "epoch": 0.37724, + "grad_norm": 2.8344643115997314, + "learning_rate": 1.5671539567549213e-05, + "loss": 0.0972, + "step": 18862 + }, + { + "epoch": 0.37728, + "grad_norm": 5.504004955291748, + "learning_rate": 1.5670389533261276e-05, + "loss": 0.6313, + "step": 18864 + }, + { + "epoch": 0.37732, + "grad_norm": 8.801671981811523, + "learning_rate": 1.5669239388426174e-05, + "loss": 0.3055, + "step": 18866 + }, + { + "epoch": 0.37736, + "grad_norm": 0.35987749695777893, + "learning_rate": 1.566808913306633e-05, + "loss": 0.0097, + "step": 18868 + }, + { + "epoch": 0.3774, + "grad_norm": 0.9072269201278687, + "learning_rate": 1.5666938767204173e-05, + "loss": 0.0983, + "step": 18870 + }, + { + "epoch": 0.37744, + "grad_norm": 1.6846226453781128, + "learning_rate": 1.5665788290862124e-05, + "loss": 0.0351, + "step": 18872 + }, + { + "epoch": 0.37748, + "grad_norm": 0.07229340821504593, + "learning_rate": 1.5664637704062622e-05, + "loss": 0.0257, + "step": 18874 + }, + { + "epoch": 0.37752, + "grad_norm": 2.4431443214416504, + "learning_rate": 1.5663487006828086e-05, + "loss": 0.0862, + "step": 18876 + }, + { + "epoch": 0.37756, + "grad_norm": 0.20951583981513977, + "learning_rate": 1.5662336199180956e-05, + "loss": 0.0081, + "step": 18878 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5247324705123901, + "learning_rate": 1.5661185281143666e-05, + "loss": 0.0128, + "step": 18880 + }, + { + "epoch": 0.37764, + "grad_norm": 2.414011001586914, + "learning_rate": 1.566003425273866e-05, + "loss": 0.0536, + "step": 18882 + }, + { + "epoch": 0.37768, + "grad_norm": 0.18436414003372192, + "learning_rate": 1.5658883113988365e-05, + "loss": 0.5523, + "step": 18884 + }, + { + "epoch": 0.37772, + "grad_norm": 0.17465011775493622, + "learning_rate": 1.5657731864915235e-05, + "loss": 0.0567, + "step": 18886 + }, + { + "epoch": 0.37776, + "grad_norm": 0.175582617521286, + "learning_rate": 1.5656580505541705e-05, + "loss": 0.0078, + "step": 18888 + }, + { + "epoch": 0.3778, + "grad_norm": 2.2384207248687744, + "learning_rate": 1.565542903589023e-05, + "loss": 0.0484, + "step": 18890 + }, + { + "epoch": 0.37784, + "grad_norm": 0.791627824306488, + "learning_rate": 1.565427745598325e-05, + "loss": 0.4825, + "step": 18892 + }, + { + "epoch": 0.37788, + "grad_norm": 8.096990585327148, + "learning_rate": 1.5653125765843226e-05, + "loss": 0.611, + "step": 18894 + }, + { + "epoch": 0.37792, + "grad_norm": 0.4588298499584198, + "learning_rate": 1.56519739654926e-05, + "loss": 0.0336, + "step": 18896 + }, + { + "epoch": 0.37796, + "grad_norm": 1.6228711605072021, + "learning_rate": 1.5650822054953834e-05, + "loss": 0.0518, + "step": 18898 + }, + { + "epoch": 0.378, + "grad_norm": 5.945956230163574, + "learning_rate": 1.564967003424938e-05, + "loss": 0.2775, + "step": 18900 + }, + { + "epoch": 0.37804, + "grad_norm": 1.281278133392334, + "learning_rate": 1.5648517903401703e-05, + "loss": 0.0458, + "step": 18902 + }, + { + "epoch": 0.37808, + "grad_norm": 0.1547202616930008, + "learning_rate": 1.564736566243326e-05, + "loss": 0.0164, + "step": 18904 + }, + { + "epoch": 0.37812, + "grad_norm": 5.016740798950195, + "learning_rate": 1.5646213311366518e-05, + "loss": 0.1687, + "step": 18906 + }, + { + "epoch": 0.37816, + "grad_norm": 1.7733781337738037, + "learning_rate": 1.5645060850223938e-05, + "loss": 0.0574, + "step": 18908 + }, + { + "epoch": 0.3782, + "grad_norm": 0.15986771881580353, + "learning_rate": 1.5643908279027994e-05, + "loss": 0.0037, + "step": 18910 + }, + { + "epoch": 0.37824, + "grad_norm": 2.711219549179077, + "learning_rate": 1.564275559780115e-05, + "loss": 0.1517, + "step": 18912 + }, + { + "epoch": 0.37828, + "grad_norm": 0.20591028034687042, + "learning_rate": 1.564160280656588e-05, + "loss": 0.0118, + "step": 18914 + }, + { + "epoch": 0.37832, + "grad_norm": 1.4295903444290161, + "learning_rate": 1.5640449905344658e-05, + "loss": 0.0829, + "step": 18916 + }, + { + "epoch": 0.37836, + "grad_norm": 0.11275259405374527, + "learning_rate": 1.563929689415996e-05, + "loss": 0.1082, + "step": 18918 + }, + { + "epoch": 0.3784, + "grad_norm": 2.1287765502929688, + "learning_rate": 1.5638143773034268e-05, + "loss": 0.4019, + "step": 18920 + }, + { + "epoch": 0.37844, + "grad_norm": 0.10162672400474548, + "learning_rate": 1.5636990541990063e-05, + "loss": 0.0262, + "step": 18922 + }, + { + "epoch": 0.37848, + "grad_norm": 4.930300712585449, + "learning_rate": 1.563583720104982e-05, + "loss": 0.1519, + "step": 18924 + }, + { + "epoch": 0.37852, + "grad_norm": 6.0335917472839355, + "learning_rate": 1.5634683750236033e-05, + "loss": 0.1854, + "step": 18926 + }, + { + "epoch": 0.37856, + "grad_norm": 7.105855941772461, + "learning_rate": 1.5633530189571185e-05, + "loss": 0.2883, + "step": 18928 + }, + { + "epoch": 0.3786, + "grad_norm": 0.1143481656908989, + "learning_rate": 1.563237651907777e-05, + "loss": 0.0106, + "step": 18930 + }, + { + "epoch": 0.37864, + "grad_norm": 2.160773515701294, + "learning_rate": 1.5631222738778268e-05, + "loss": 0.085, + "step": 18932 + }, + { + "epoch": 0.37868, + "grad_norm": 0.5582566261291504, + "learning_rate": 1.563006884869518e-05, + "loss": 0.0483, + "step": 18934 + }, + { + "epoch": 0.37872, + "grad_norm": 0.7694061994552612, + "learning_rate": 1.5628914848851002e-05, + "loss": 0.0197, + "step": 18936 + }, + { + "epoch": 0.37876, + "grad_norm": 0.781874418258667, + "learning_rate": 1.562776073926823e-05, + "loss": 0.0735, + "step": 18938 + }, + { + "epoch": 0.3788, + "grad_norm": 2.6733174324035645, + "learning_rate": 1.562660651996937e-05, + "loss": 0.2775, + "step": 18940 + }, + { + "epoch": 0.37884, + "grad_norm": 0.9193813800811768, + "learning_rate": 1.5625452190976914e-05, + "loss": 0.0222, + "step": 18942 + }, + { + "epoch": 0.37888, + "grad_norm": 1.4329476356506348, + "learning_rate": 1.5624297752313373e-05, + "loss": 0.217, + "step": 18944 + }, + { + "epoch": 0.37892, + "grad_norm": 0.16291557252407074, + "learning_rate": 1.5623143204001255e-05, + "loss": 0.0038, + "step": 18946 + }, + { + "epoch": 0.37896, + "grad_norm": 1.3170140981674194, + "learning_rate": 1.5621988546063064e-05, + "loss": 0.2287, + "step": 18948 + }, + { + "epoch": 0.379, + "grad_norm": 7.423402786254883, + "learning_rate": 1.5620833778521306e-05, + "loss": 0.3978, + "step": 18950 + }, + { + "epoch": 0.37904, + "grad_norm": 0.17446936666965485, + "learning_rate": 1.5619678901398505e-05, + "loss": 0.0618, + "step": 18952 + }, + { + "epoch": 0.37908, + "grad_norm": 0.02372094802558422, + "learning_rate": 1.561852391471717e-05, + "loss": 0.007, + "step": 18954 + }, + { + "epoch": 0.37912, + "grad_norm": 0.037790697067976, + "learning_rate": 1.561736881849982e-05, + "loss": 0.5491, + "step": 18956 + }, + { + "epoch": 0.37916, + "grad_norm": 3.2314064502716064, + "learning_rate": 1.5616213612768967e-05, + "loss": 0.1097, + "step": 18958 + }, + { + "epoch": 0.3792, + "grad_norm": 0.08754423260688782, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.0209, + "step": 18960 + }, + { + "epoch": 0.37924, + "grad_norm": 5.0933685302734375, + "learning_rate": 1.5613902872856866e-05, + "loss": 0.1399, + "step": 18962 + }, + { + "epoch": 0.37928, + "grad_norm": 5.967873573303223, + "learning_rate": 1.5612747338720665e-05, + "loss": 0.3567, + "step": 18964 + }, + { + "epoch": 0.37932, + "grad_norm": 0.3203069865703583, + "learning_rate": 1.561159169516106e-05, + "loss": 0.0323, + "step": 18966 + }, + { + "epoch": 0.37936, + "grad_norm": 0.2847297787666321, + "learning_rate": 1.561043594220059e-05, + "loss": 0.0305, + "step": 18968 + }, + { + "epoch": 0.3794, + "grad_norm": 0.12487699836492538, + "learning_rate": 1.560928007986178e-05, + "loss": 0.1412, + "step": 18970 + }, + { + "epoch": 0.37944, + "grad_norm": 1.4635485410690308, + "learning_rate": 1.560812410816717e-05, + "loss": 0.0408, + "step": 18972 + }, + { + "epoch": 0.37948, + "grad_norm": 7.807023525238037, + "learning_rate": 1.560696802713929e-05, + "loss": 0.4532, + "step": 18974 + }, + { + "epoch": 0.37952, + "grad_norm": 2.634989023208618, + "learning_rate": 1.5605811836800683e-05, + "loss": 0.1346, + "step": 18976 + }, + { + "epoch": 0.37956, + "grad_norm": 0.5680826902389526, + "learning_rate": 1.560465553717389e-05, + "loss": 0.0174, + "step": 18978 + }, + { + "epoch": 0.3796, + "grad_norm": 2.5152342319488525, + "learning_rate": 1.5603499128281447e-05, + "loss": 0.3801, + "step": 18980 + }, + { + "epoch": 0.37964, + "grad_norm": 1.0542048215866089, + "learning_rate": 1.560234261014591e-05, + "loss": 0.025, + "step": 18982 + }, + { + "epoch": 0.37968, + "grad_norm": 0.11543188989162445, + "learning_rate": 1.5601185982789812e-05, + "loss": 0.02, + "step": 18984 + }, + { + "epoch": 0.37972, + "grad_norm": 0.2462630271911621, + "learning_rate": 1.5600029246235716e-05, + "loss": 0.0827, + "step": 18986 + }, + { + "epoch": 0.37976, + "grad_norm": 1.9688029289245605, + "learning_rate": 1.5598872400506164e-05, + "loss": 0.0374, + "step": 18988 + }, + { + "epoch": 0.3798, + "grad_norm": 1.5059888362884521, + "learning_rate": 1.5597715445623714e-05, + "loss": 0.0351, + "step": 18990 + }, + { + "epoch": 0.37984, + "grad_norm": 0.6204372048377991, + "learning_rate": 1.5596558381610915e-05, + "loss": 0.0143, + "step": 18992 + }, + { + "epoch": 0.37988, + "grad_norm": 0.5461714267730713, + "learning_rate": 1.5595401208490332e-05, + "loss": 0.0174, + "step": 18994 + }, + { + "epoch": 0.37992, + "grad_norm": 0.6669613122940063, + "learning_rate": 1.5594243926284526e-05, + "loss": 0.025, + "step": 18996 + }, + { + "epoch": 0.37996, + "grad_norm": 2.4335577487945557, + "learning_rate": 1.5593086535016045e-05, + "loss": 0.1611, + "step": 18998 + }, + { + "epoch": 0.38, + "grad_norm": 0.18546657264232635, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.0113, + "step": 19000 + }, + { + "epoch": 0.38004, + "grad_norm": 9.742053985595703, + "learning_rate": 1.5590771425381356e-05, + "loss": 0.2234, + "step": 19002 + }, + { + "epoch": 0.38008, + "grad_norm": 1.2214194536209106, + "learning_rate": 1.5589613707060278e-05, + "loss": 0.2774, + "step": 19004 + }, + { + "epoch": 0.38012, + "grad_norm": 0.4254070520401001, + "learning_rate": 1.55884558797668e-05, + "loss": 0.0766, + "step": 19006 + }, + { + "epoch": 0.38016, + "grad_norm": 1.6732795238494873, + "learning_rate": 1.55872979435235e-05, + "loss": 0.0942, + "step": 19008 + }, + { + "epoch": 0.3802, + "grad_norm": 0.19849324226379395, + "learning_rate": 1.558613989835295e-05, + "loss": 0.2242, + "step": 19010 + }, + { + "epoch": 0.38024, + "grad_norm": 0.8850323557853699, + "learning_rate": 1.5584981744277722e-05, + "loss": 0.0208, + "step": 19012 + }, + { + "epoch": 0.38028, + "grad_norm": 0.5364490151405334, + "learning_rate": 1.5583823481320405e-05, + "loss": 0.0358, + "step": 19014 + }, + { + "epoch": 0.38032, + "grad_norm": 1.3712902069091797, + "learning_rate": 1.558266510950357e-05, + "loss": 0.0455, + "step": 19016 + }, + { + "epoch": 0.38036, + "grad_norm": 0.6356553435325623, + "learning_rate": 1.5581506628849806e-05, + "loss": 0.0391, + "step": 19018 + }, + { + "epoch": 0.3804, + "grad_norm": 3.850977897644043, + "learning_rate": 1.55803480393817e-05, + "loss": 0.1479, + "step": 19020 + }, + { + "epoch": 0.38044, + "grad_norm": 0.8296269774436951, + "learning_rate": 1.5579189341121833e-05, + "loss": 0.0244, + "step": 19022 + }, + { + "epoch": 0.38048, + "grad_norm": 10.267026901245117, + "learning_rate": 1.5578030534092797e-05, + "loss": 0.181, + "step": 19024 + }, + { + "epoch": 0.38052, + "grad_norm": 0.3978153467178345, + "learning_rate": 1.5576871618317186e-05, + "loss": 0.0202, + "step": 19026 + }, + { + "epoch": 0.38056, + "grad_norm": 1.3787176609039307, + "learning_rate": 1.557571259381759e-05, + "loss": 0.0397, + "step": 19028 + }, + { + "epoch": 0.3806, + "grad_norm": 0.6282860636711121, + "learning_rate": 1.5574553460616608e-05, + "loss": 0.0136, + "step": 19030 + }, + { + "epoch": 0.38064, + "grad_norm": 0.17398717999458313, + "learning_rate": 1.5573394218736835e-05, + "loss": 0.023, + "step": 19032 + }, + { + "epoch": 0.38068, + "grad_norm": 0.44865724444389343, + "learning_rate": 1.5572234868200876e-05, + "loss": 0.017, + "step": 19034 + }, + { + "epoch": 0.38072, + "grad_norm": 0.3588070571422577, + "learning_rate": 1.5571075409031325e-05, + "loss": 0.6558, + "step": 19036 + }, + { + "epoch": 0.38076, + "grad_norm": 3.5430033206939697, + "learning_rate": 1.556991584125079e-05, + "loss": 0.1114, + "step": 19038 + }, + { + "epoch": 0.3808, + "grad_norm": 0.10465722531080246, + "learning_rate": 1.556875616488188e-05, + "loss": 0.0067, + "step": 19040 + }, + { + "epoch": 0.38084, + "grad_norm": 5.419610500335693, + "learning_rate": 1.5567596379947202e-05, + "loss": 0.2145, + "step": 19042 + }, + { + "epoch": 0.38088, + "grad_norm": 1.2705800533294678, + "learning_rate": 1.5566436486469367e-05, + "loss": 0.1534, + "step": 19044 + }, + { + "epoch": 0.38092, + "grad_norm": 7.914336204528809, + "learning_rate": 1.5565276484470986e-05, + "loss": 0.192, + "step": 19046 + }, + { + "epoch": 0.38096, + "grad_norm": 0.8881228566169739, + "learning_rate": 1.5564116373974675e-05, + "loss": 0.1114, + "step": 19048 + }, + { + "epoch": 0.381, + "grad_norm": 2.877713203430176, + "learning_rate": 1.556295615500305e-05, + "loss": 0.1398, + "step": 19050 + }, + { + "epoch": 0.38104, + "grad_norm": 1.8552221059799194, + "learning_rate": 1.556179582757873e-05, + "loss": 0.1401, + "step": 19052 + }, + { + "epoch": 0.38108, + "grad_norm": 5.537761211395264, + "learning_rate": 1.556063539172434e-05, + "loss": 0.1368, + "step": 19054 + }, + { + "epoch": 0.38112, + "grad_norm": 1.312057614326477, + "learning_rate": 1.5559474847462496e-05, + "loss": 0.0289, + "step": 19056 + }, + { + "epoch": 0.38116, + "grad_norm": 1.9931873083114624, + "learning_rate": 1.5558314194815828e-05, + "loss": 0.0388, + "step": 19058 + }, + { + "epoch": 0.3812, + "grad_norm": 2.3651089668273926, + "learning_rate": 1.5557153433806967e-05, + "loss": 0.0683, + "step": 19060 + }, + { + "epoch": 0.38124, + "grad_norm": 0.34294024109840393, + "learning_rate": 1.5555992564458534e-05, + "loss": 0.0094, + "step": 19062 + }, + { + "epoch": 0.38128, + "grad_norm": 0.2671163082122803, + "learning_rate": 1.555483158679317e-05, + "loss": 0.0162, + "step": 19064 + }, + { + "epoch": 0.38132, + "grad_norm": 0.015320426784455776, + "learning_rate": 1.5553670500833503e-05, + "loss": 0.0066, + "step": 19066 + }, + { + "epoch": 0.38136, + "grad_norm": 0.09224827587604523, + "learning_rate": 1.555250930660217e-05, + "loss": 0.3744, + "step": 19068 + }, + { + "epoch": 0.3814, + "grad_norm": 0.0196982529014349, + "learning_rate": 1.555134800412181e-05, + "loss": 0.012, + "step": 19070 + }, + { + "epoch": 0.38144, + "grad_norm": 0.7148879766464233, + "learning_rate": 1.555018659341506e-05, + "loss": 0.0451, + "step": 19072 + }, + { + "epoch": 0.38148, + "grad_norm": 1.6056584119796753, + "learning_rate": 1.554902507450457e-05, + "loss": 0.0526, + "step": 19074 + }, + { + "epoch": 0.38152, + "grad_norm": 0.8913539052009583, + "learning_rate": 1.5547863447412973e-05, + "loss": 0.0452, + "step": 19076 + }, + { + "epoch": 0.38156, + "grad_norm": 1.0282562971115112, + "learning_rate": 1.5546701712162928e-05, + "loss": 0.0163, + "step": 19078 + }, + { + "epoch": 0.3816, + "grad_norm": 0.21017049252986908, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.5325, + "step": 19080 + }, + { + "epoch": 0.38164, + "grad_norm": 1.4121205806732178, + "learning_rate": 1.5544377917278067e-05, + "loss": 0.0198, + "step": 19082 + }, + { + "epoch": 0.38168, + "grad_norm": 0.09110379219055176, + "learning_rate": 1.5543215857688556e-05, + "loss": 0.1172, + "step": 19084 + }, + { + "epoch": 0.38172, + "grad_norm": 8.45071792602539, + "learning_rate": 1.55420536900312e-05, + "loss": 0.199, + "step": 19086 + }, + { + "epoch": 0.38176, + "grad_norm": 0.36103278398513794, + "learning_rate": 1.554089141432865e-05, + "loss": 0.3241, + "step": 19088 + }, + { + "epoch": 0.3818, + "grad_norm": 5.0715484619140625, + "learning_rate": 1.5539729030603574e-05, + "loss": 0.0874, + "step": 19090 + }, + { + "epoch": 0.38184, + "grad_norm": 0.50795978307724, + "learning_rate": 1.5538566538878628e-05, + "loss": 0.0956, + "step": 19092 + }, + { + "epoch": 0.38188, + "grad_norm": 0.6973530650138855, + "learning_rate": 1.5537403939176474e-05, + "loss": 0.0302, + "step": 19094 + }, + { + "epoch": 0.38192, + "grad_norm": 0.009519080631434917, + "learning_rate": 1.553624123151978e-05, + "loss": 0.0029, + "step": 19096 + }, + { + "epoch": 0.38196, + "grad_norm": 2.0825541019439697, + "learning_rate": 1.5535078415931212e-05, + "loss": 0.3564, + "step": 19098 + }, + { + "epoch": 0.382, + "grad_norm": 2.2211170196533203, + "learning_rate": 1.553391549243344e-05, + "loss": 0.0809, + "step": 19100 + }, + { + "epoch": 0.38204, + "grad_norm": 1.1762011051177979, + "learning_rate": 1.553275246104914e-05, + "loss": 0.0252, + "step": 19102 + }, + { + "epoch": 0.38208, + "grad_norm": 1.1103547811508179, + "learning_rate": 1.5531589321800978e-05, + "loss": 0.1175, + "step": 19104 + }, + { + "epoch": 0.38212, + "grad_norm": 0.8343392014503479, + "learning_rate": 1.553042607471164e-05, + "loss": 0.0272, + "step": 19106 + }, + { + "epoch": 0.38216, + "grad_norm": 0.3366476893424988, + "learning_rate": 1.5529262719803794e-05, + "loss": 0.0191, + "step": 19108 + }, + { + "epoch": 0.3822, + "grad_norm": 5.011086463928223, + "learning_rate": 1.5528099257100126e-05, + "loss": 0.1219, + "step": 19110 + }, + { + "epoch": 0.38224, + "grad_norm": 3.3225321769714355, + "learning_rate": 1.5526935686623316e-05, + "loss": 0.1433, + "step": 19112 + }, + { + "epoch": 0.38228, + "grad_norm": 5.7032904624938965, + "learning_rate": 1.5525772008396048e-05, + "loss": 0.1547, + "step": 19114 + }, + { + "epoch": 0.38232, + "grad_norm": 4.346321105957031, + "learning_rate": 1.5524608222441012e-05, + "loss": 0.3065, + "step": 19116 + }, + { + "epoch": 0.38236, + "grad_norm": 1.0693632364273071, + "learning_rate": 1.5523444328780895e-05, + "loss": 0.0301, + "step": 19118 + }, + { + "epoch": 0.3824, + "grad_norm": 0.14873360097408295, + "learning_rate": 1.5522280327438388e-05, + "loss": 0.0033, + "step": 19120 + }, + { + "epoch": 0.38244, + "grad_norm": 0.48521098494529724, + "learning_rate": 1.552111621843618e-05, + "loss": 0.0781, + "step": 19122 + }, + { + "epoch": 0.38248, + "grad_norm": 8.84417724609375, + "learning_rate": 1.5519952001796973e-05, + "loss": 0.2678, + "step": 19124 + }, + { + "epoch": 0.38252, + "grad_norm": 2.1305291652679443, + "learning_rate": 1.5518787677543456e-05, + "loss": 0.0507, + "step": 19126 + }, + { + "epoch": 0.38256, + "grad_norm": 0.5216226577758789, + "learning_rate": 1.5517623245698334e-05, + "loss": 0.0199, + "step": 19128 + }, + { + "epoch": 0.3826, + "grad_norm": 0.4077513813972473, + "learning_rate": 1.5516458706284306e-05, + "loss": 0.0649, + "step": 19130 + }, + { + "epoch": 0.38264, + "grad_norm": 8.08298110961914, + "learning_rate": 1.5515294059324074e-05, + "loss": 0.203, + "step": 19132 + }, + { + "epoch": 0.38268, + "grad_norm": 2.597734212875366, + "learning_rate": 1.5514129304840345e-05, + "loss": 0.1108, + "step": 19134 + }, + { + "epoch": 0.38272, + "grad_norm": 0.7449654936790466, + "learning_rate": 1.551296444285583e-05, + "loss": 0.0101, + "step": 19136 + }, + { + "epoch": 0.38276, + "grad_norm": 1.1880123615264893, + "learning_rate": 1.5511799473393226e-05, + "loss": 0.019, + "step": 19138 + }, + { + "epoch": 0.3828, + "grad_norm": 0.20228922367095947, + "learning_rate": 1.5510634396475262e-05, + "loss": 0.1534, + "step": 19140 + }, + { + "epoch": 0.38284, + "grad_norm": 0.5488869547843933, + "learning_rate": 1.550946921212464e-05, + "loss": 0.0117, + "step": 19142 + }, + { + "epoch": 0.38288, + "grad_norm": 7.284850120544434, + "learning_rate": 1.550830392036408e-05, + "loss": 0.4942, + "step": 19144 + }, + { + "epoch": 0.38292, + "grad_norm": 8.671734809875488, + "learning_rate": 1.55071385212163e-05, + "loss": 0.6516, + "step": 19146 + }, + { + "epoch": 0.38296, + "grad_norm": 0.12049512565135956, + "learning_rate": 1.5505973014704017e-05, + "loss": 0.0236, + "step": 19148 + }, + { + "epoch": 0.383, + "grad_norm": 0.054370757192373276, + "learning_rate": 1.5504807400849957e-05, + "loss": 0.22, + "step": 19150 + }, + { + "epoch": 0.38304, + "grad_norm": 0.3836689591407776, + "learning_rate": 1.5503641679676846e-05, + "loss": 0.0398, + "step": 19152 + }, + { + "epoch": 0.38308, + "grad_norm": 0.22243894636631012, + "learning_rate": 1.55024758512074e-05, + "loss": 0.3604, + "step": 19154 + }, + { + "epoch": 0.38312, + "grad_norm": 2.507899761199951, + "learning_rate": 1.5501309915464358e-05, + "loss": 0.0647, + "step": 19156 + }, + { + "epoch": 0.38316, + "grad_norm": 6.393192768096924, + "learning_rate": 1.5500143872470446e-05, + "loss": 0.2067, + "step": 19158 + }, + { + "epoch": 0.3832, + "grad_norm": 0.8043296933174133, + "learning_rate": 1.54989777222484e-05, + "loss": 0.0317, + "step": 19160 + }, + { + "epoch": 0.38324, + "grad_norm": 3.1292924880981445, + "learning_rate": 1.549781146482095e-05, + "loss": 0.6198, + "step": 19162 + }, + { + "epoch": 0.38328, + "grad_norm": 4.12066650390625, + "learning_rate": 1.5496645100210838e-05, + "loss": 0.3348, + "step": 19164 + }, + { + "epoch": 0.38332, + "grad_norm": 1.8940438032150269, + "learning_rate": 1.5495478628440795e-05, + "loss": 0.0482, + "step": 19166 + }, + { + "epoch": 0.38336, + "grad_norm": 0.7527879476547241, + "learning_rate": 1.549431204953357e-05, + "loss": 0.0249, + "step": 19168 + }, + { + "epoch": 0.3834, + "grad_norm": 0.7048530578613281, + "learning_rate": 1.54931453635119e-05, + "loss": 0.0249, + "step": 19170 + }, + { + "epoch": 0.38344, + "grad_norm": 0.011066598817706108, + "learning_rate": 1.549197857039854e-05, + "loss": 0.0004, + "step": 19172 + }, + { + "epoch": 0.38348, + "grad_norm": 3.318969964981079, + "learning_rate": 1.5490811670216223e-05, + "loss": 0.3305, + "step": 19174 + }, + { + "epoch": 0.38352, + "grad_norm": 0.30136558413505554, + "learning_rate": 1.548964466298771e-05, + "loss": 0.0096, + "step": 19176 + }, + { + "epoch": 0.38356, + "grad_norm": 0.9279295206069946, + "learning_rate": 1.5488477548735746e-05, + "loss": 0.1175, + "step": 19178 + }, + { + "epoch": 0.3836, + "grad_norm": 0.33209189772605896, + "learning_rate": 1.5487310327483087e-05, + "loss": 0.0788, + "step": 19180 + }, + { + "epoch": 0.38364, + "grad_norm": 5.8631591796875, + "learning_rate": 1.548614299925249e-05, + "loss": 0.2513, + "step": 19182 + }, + { + "epoch": 0.38368, + "grad_norm": 1.6051543951034546, + "learning_rate": 1.5484975564066704e-05, + "loss": 0.1098, + "step": 19184 + }, + { + "epoch": 0.38372, + "grad_norm": 0.8995859622955322, + "learning_rate": 1.54838080219485e-05, + "loss": 0.0231, + "step": 19186 + }, + { + "epoch": 0.38376, + "grad_norm": 0.2964262068271637, + "learning_rate": 1.548264037292064e-05, + "loss": 0.0063, + "step": 19188 + }, + { + "epoch": 0.3838, + "grad_norm": 4.208271503448486, + "learning_rate": 1.5481472617005878e-05, + "loss": 0.2878, + "step": 19190 + }, + { + "epoch": 0.38384, + "grad_norm": 2.0189309120178223, + "learning_rate": 1.5480304754226985e-05, + "loss": 0.058, + "step": 19192 + }, + { + "epoch": 0.38388, + "grad_norm": 1.2775721549987793, + "learning_rate": 1.5479136784606728e-05, + "loss": 0.031, + "step": 19194 + }, + { + "epoch": 0.38392, + "grad_norm": 0.07477006316184998, + "learning_rate": 1.5477968708167885e-05, + "loss": 0.0131, + "step": 19196 + }, + { + "epoch": 0.38396, + "grad_norm": 0.056892428547143936, + "learning_rate": 1.5476800524933216e-05, + "loss": 0.0143, + "step": 19198 + }, + { + "epoch": 0.384, + "grad_norm": 0.3363075256347656, + "learning_rate": 1.5475632234925505e-05, + "loss": 0.0147, + "step": 19200 + }, + { + "epoch": 0.38404, + "grad_norm": 1.5552570819854736, + "learning_rate": 1.5474463838167522e-05, + "loss": 0.0528, + "step": 19202 + }, + { + "epoch": 0.38408, + "grad_norm": 0.5733017325401306, + "learning_rate": 1.547329533468205e-05, + "loss": 0.0187, + "step": 19204 + }, + { + "epoch": 0.38412, + "grad_norm": 0.24811288714408875, + "learning_rate": 1.5472126724491862e-05, + "loss": 0.0347, + "step": 19206 + }, + { + "epoch": 0.38416, + "grad_norm": 2.8013923168182373, + "learning_rate": 1.5470958007619752e-05, + "loss": 0.0588, + "step": 19208 + }, + { + "epoch": 0.3842, + "grad_norm": 0.8643259406089783, + "learning_rate": 1.5469789184088498e-05, + "loss": 0.033, + "step": 19210 + }, + { + "epoch": 0.38424, + "grad_norm": 7.68132209777832, + "learning_rate": 1.546862025392089e-05, + "loss": 0.3653, + "step": 19212 + }, + { + "epoch": 0.38428, + "grad_norm": 6.019965648651123, + "learning_rate": 1.546745121713971e-05, + "loss": 0.1402, + "step": 19214 + }, + { + "epoch": 0.38432, + "grad_norm": 6.427103519439697, + "learning_rate": 1.5466282073767756e-05, + "loss": 0.3691, + "step": 19216 + }, + { + "epoch": 0.38436, + "grad_norm": 0.18923549354076385, + "learning_rate": 1.546511282382782e-05, + "loss": 0.0192, + "step": 19218 + }, + { + "epoch": 0.3844, + "grad_norm": 2.8891360759735107, + "learning_rate": 1.5463943467342694e-05, + "loss": 0.0731, + "step": 19220 + }, + { + "epoch": 0.38444, + "grad_norm": 3.555166721343994, + "learning_rate": 1.5462774004335175e-05, + "loss": 0.0862, + "step": 19222 + }, + { + "epoch": 0.38448, + "grad_norm": 0.2626972198486328, + "learning_rate": 1.546160443482807e-05, + "loss": 0.0328, + "step": 19224 + }, + { + "epoch": 0.38452, + "grad_norm": 1.0554734468460083, + "learning_rate": 1.5460434758844168e-05, + "loss": 0.0184, + "step": 19226 + }, + { + "epoch": 0.38456, + "grad_norm": 2.228058099746704, + "learning_rate": 1.5459264976406284e-05, + "loss": 0.0613, + "step": 19228 + }, + { + "epoch": 0.3846, + "grad_norm": 1.0715707540512085, + "learning_rate": 1.5458095087537216e-05, + "loss": 0.0283, + "step": 19230 + }, + { + "epoch": 0.38464, + "grad_norm": 0.11379556357860565, + "learning_rate": 1.5456925092259777e-05, + "loss": 0.0137, + "step": 19232 + }, + { + "epoch": 0.38468, + "grad_norm": 5.085546493530273, + "learning_rate": 1.545575499059677e-05, + "loss": 0.3308, + "step": 19234 + }, + { + "epoch": 0.38472, + "grad_norm": 0.18543753027915955, + "learning_rate": 1.5454584782571015e-05, + "loss": 0.0044, + "step": 19236 + }, + { + "epoch": 0.38476, + "grad_norm": 1.9832814931869507, + "learning_rate": 1.545341446820532e-05, + "loss": 0.0536, + "step": 19238 + }, + { + "epoch": 0.3848, + "grad_norm": 4.131438732147217, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.1326, + "step": 19240 + }, + { + "epoch": 0.38484, + "grad_norm": 4.34050178527832, + "learning_rate": 1.545107352054538e-05, + "loss": 0.1925, + "step": 19242 + }, + { + "epoch": 0.38488, + "grad_norm": 0.13877464830875397, + "learning_rate": 1.544990288729677e-05, + "loss": 0.1291, + "step": 19244 + }, + { + "epoch": 0.38492, + "grad_norm": 6.0008673667907715, + "learning_rate": 1.54487321477995e-05, + "loss": 0.2586, + "step": 19246 + }, + { + "epoch": 0.38496, + "grad_norm": 0.5656890273094177, + "learning_rate": 1.5447561302076392e-05, + "loss": 0.011, + "step": 19248 + }, + { + "epoch": 0.385, + "grad_norm": 0.09421506524085999, + "learning_rate": 1.5446390350150272e-05, + "loss": 0.0212, + "step": 19250 + }, + { + "epoch": 0.38504, + "grad_norm": 3.7564492225646973, + "learning_rate": 1.5445219292043968e-05, + "loss": 0.1175, + "step": 19252 + }, + { + "epoch": 0.38508, + "grad_norm": 2.9009757041931152, + "learning_rate": 1.544404812778031e-05, + "loss": 0.0475, + "step": 19254 + }, + { + "epoch": 0.38512, + "grad_norm": 3.4521870613098145, + "learning_rate": 1.544287685738213e-05, + "loss": 0.0907, + "step": 19256 + }, + { + "epoch": 0.38516, + "grad_norm": 2.199578285217285, + "learning_rate": 1.5441705480872268e-05, + "loss": 0.0553, + "step": 19258 + }, + { + "epoch": 0.3852, + "grad_norm": 4.817182540893555, + "learning_rate": 1.544053399827355e-05, + "loss": 0.1479, + "step": 19260 + }, + { + "epoch": 0.38524, + "grad_norm": 7.781014442443848, + "learning_rate": 1.5439362409608826e-05, + "loss": 0.2106, + "step": 19262 + }, + { + "epoch": 0.38528, + "grad_norm": 0.36279457807540894, + "learning_rate": 1.5438190714900924e-05, + "loss": 0.0931, + "step": 19264 + }, + { + "epoch": 0.38532, + "grad_norm": 0.2927575707435608, + "learning_rate": 1.5437018914172703e-05, + "loss": 0.0074, + "step": 19266 + }, + { + "epoch": 0.38536, + "grad_norm": 0.04100119322538376, + "learning_rate": 1.5435847007446992e-05, + "loss": 0.0061, + "step": 19268 + }, + { + "epoch": 0.3854, + "grad_norm": 1.6818991899490356, + "learning_rate": 1.543467499474665e-05, + "loss": 0.0368, + "step": 19270 + }, + { + "epoch": 0.38544, + "grad_norm": 0.043223872780799866, + "learning_rate": 1.5433502876094518e-05, + "loss": 0.0012, + "step": 19272 + }, + { + "epoch": 0.38548, + "grad_norm": 0.22957579791545868, + "learning_rate": 1.5432330651513454e-05, + "loss": 0.2093, + "step": 19274 + }, + { + "epoch": 0.38552, + "grad_norm": 1.4579267501831055, + "learning_rate": 1.5431158321026304e-05, + "loss": 0.0308, + "step": 19276 + }, + { + "epoch": 0.38556, + "grad_norm": 0.5055438876152039, + "learning_rate": 1.5429985884655923e-05, + "loss": 0.6373, + "step": 19278 + }, + { + "epoch": 0.3856, + "grad_norm": 1.9614235162734985, + "learning_rate": 1.5428813342425177e-05, + "loss": 0.0694, + "step": 19280 + }, + { + "epoch": 0.38564, + "grad_norm": 0.3361104428768158, + "learning_rate": 1.542764069435692e-05, + "loss": 0.0095, + "step": 19282 + }, + { + "epoch": 0.38568, + "grad_norm": 0.7929049730300903, + "learning_rate": 1.542646794047401e-05, + "loss": 0.0773, + "step": 19284 + }, + { + "epoch": 0.38572, + "grad_norm": 0.33513185381889343, + "learning_rate": 1.5425295080799316e-05, + "loss": 0.3985, + "step": 19286 + }, + { + "epoch": 0.38576, + "grad_norm": 0.1729194074869156, + "learning_rate": 1.54241221153557e-05, + "loss": 0.0063, + "step": 19288 + }, + { + "epoch": 0.3858, + "grad_norm": 2.3911733627319336, + "learning_rate": 1.542294904416603e-05, + "loss": 0.0664, + "step": 19290 + }, + { + "epoch": 0.38584, + "grad_norm": 0.10376584529876709, + "learning_rate": 1.542177586725318e-05, + "loss": 0.0441, + "step": 19292 + }, + { + "epoch": 0.38588, + "grad_norm": 0.8807884454727173, + "learning_rate": 1.5420602584640014e-05, + "loss": 0.0659, + "step": 19294 + }, + { + "epoch": 0.38592, + "grad_norm": 1.2600030899047852, + "learning_rate": 1.5419429196349413e-05, + "loss": 0.0219, + "step": 19296 + }, + { + "epoch": 0.38596, + "grad_norm": 0.7662216424942017, + "learning_rate": 1.541825570240425e-05, + "loss": 0.0198, + "step": 19298 + }, + { + "epoch": 0.386, + "grad_norm": 0.367400586605072, + "learning_rate": 1.54170821028274e-05, + "loss": 0.3269, + "step": 19300 + }, + { + "epoch": 0.38604, + "grad_norm": 7.456146717071533, + "learning_rate": 1.5415908397641746e-05, + "loss": 0.3079, + "step": 19302 + }, + { + "epoch": 0.38608, + "grad_norm": 2.481837749481201, + "learning_rate": 1.5414734586870172e-05, + "loss": 0.0863, + "step": 19304 + }, + { + "epoch": 0.38612, + "grad_norm": 3.3614907264709473, + "learning_rate": 1.5413560670535555e-05, + "loss": 0.0572, + "step": 19306 + }, + { + "epoch": 0.38616, + "grad_norm": 4.597964286804199, + "learning_rate": 1.5412386648660792e-05, + "loss": 0.0837, + "step": 19308 + }, + { + "epoch": 0.3862, + "grad_norm": 0.19274936616420746, + "learning_rate": 1.541121252126876e-05, + "loss": 0.0129, + "step": 19310 + }, + { + "epoch": 0.38624, + "grad_norm": 0.22246278822422028, + "learning_rate": 1.5410038288382356e-05, + "loss": 0.0347, + "step": 19312 + }, + { + "epoch": 0.38628, + "grad_norm": 0.4613822102546692, + "learning_rate": 1.540886395002447e-05, + "loss": 0.0285, + "step": 19314 + }, + { + "epoch": 0.38632, + "grad_norm": 1.988722801208496, + "learning_rate": 1.5407689506217994e-05, + "loss": 0.0894, + "step": 19316 + }, + { + "epoch": 0.38636, + "grad_norm": 0.2756900489330292, + "learning_rate": 1.540651495698583e-05, + "loss": 0.2092, + "step": 19318 + }, + { + "epoch": 0.3864, + "grad_norm": 0.3078145980834961, + "learning_rate": 1.540534030235087e-05, + "loss": 0.0092, + "step": 19320 + }, + { + "epoch": 0.38644, + "grad_norm": 0.7314436435699463, + "learning_rate": 1.540416554233602e-05, + "loss": 0.171, + "step": 19322 + }, + { + "epoch": 0.38648, + "grad_norm": 9.536004066467285, + "learning_rate": 1.5402990676964185e-05, + "loss": 0.3577, + "step": 19324 + }, + { + "epoch": 0.38652, + "grad_norm": 0.5454075932502747, + "learning_rate": 1.540181570625826e-05, + "loss": 0.1384, + "step": 19326 + }, + { + "epoch": 0.38656, + "grad_norm": 1.6442465782165527, + "learning_rate": 1.540064063024116e-05, + "loss": 0.0361, + "step": 19328 + }, + { + "epoch": 0.3866, + "grad_norm": 10.982992172241211, + "learning_rate": 1.5399465448935788e-05, + "loss": 0.4607, + "step": 19330 + }, + { + "epoch": 0.38664, + "grad_norm": 0.10945425927639008, + "learning_rate": 1.5398290162365056e-05, + "loss": 0.0046, + "step": 19332 + }, + { + "epoch": 0.38668, + "grad_norm": 3.875614643096924, + "learning_rate": 1.539711477055188e-05, + "loss": 0.0705, + "step": 19334 + }, + { + "epoch": 0.38672, + "grad_norm": 5.701943874359131, + "learning_rate": 1.5395939273519173e-05, + "loss": 0.2115, + "step": 19336 + }, + { + "epoch": 0.38676, + "grad_norm": 2.983280897140503, + "learning_rate": 1.5394763671289853e-05, + "loss": 0.0802, + "step": 19338 + }, + { + "epoch": 0.3868, + "grad_norm": 1.4641560316085815, + "learning_rate": 1.5393587963886837e-05, + "loss": 0.0346, + "step": 19340 + }, + { + "epoch": 0.38684, + "grad_norm": 0.4708462357521057, + "learning_rate": 1.5392412151333046e-05, + "loss": 0.02, + "step": 19342 + }, + { + "epoch": 0.38688, + "grad_norm": 1.395068883895874, + "learning_rate": 1.53912362336514e-05, + "loss": 0.0388, + "step": 19344 + }, + { + "epoch": 0.38692, + "grad_norm": 0.40103980898857117, + "learning_rate": 1.5390060210864835e-05, + "loss": 0.0106, + "step": 19346 + }, + { + "epoch": 0.38696, + "grad_norm": 6.371538162231445, + "learning_rate": 1.5388884082996266e-05, + "loss": 0.2005, + "step": 19348 + }, + { + "epoch": 0.387, + "grad_norm": 0.5278392434120178, + "learning_rate": 1.5387707850068633e-05, + "loss": 0.0442, + "step": 19350 + }, + { + "epoch": 0.38704, + "grad_norm": 0.7788107991218567, + "learning_rate": 1.5386531512104855e-05, + "loss": 0.064, + "step": 19352 + }, + { + "epoch": 0.38708, + "grad_norm": 0.1285223811864853, + "learning_rate": 1.5385355069127877e-05, + "loss": 0.3394, + "step": 19354 + }, + { + "epoch": 0.38712, + "grad_norm": 1.10030996799469, + "learning_rate": 1.538417852116063e-05, + "loss": 0.0396, + "step": 19356 + }, + { + "epoch": 0.38716, + "grad_norm": 0.01617274060845375, + "learning_rate": 1.5383001868226045e-05, + "loss": 0.1763, + "step": 19358 + }, + { + "epoch": 0.3872, + "grad_norm": 0.6168213486671448, + "learning_rate": 1.5381825110347072e-05, + "loss": 0.0109, + "step": 19360 + }, + { + "epoch": 0.38724, + "grad_norm": 0.07582133263349533, + "learning_rate": 1.5380648247546647e-05, + "loss": 0.1396, + "step": 19362 + }, + { + "epoch": 0.38728, + "grad_norm": 0.14204809069633484, + "learning_rate": 1.5379471279847714e-05, + "loss": 0.3953, + "step": 19364 + }, + { + "epoch": 0.38732, + "grad_norm": 0.3910934031009674, + "learning_rate": 1.537829420727322e-05, + "loss": 0.0081, + "step": 19366 + }, + { + "epoch": 0.38736, + "grad_norm": 0.5632573366165161, + "learning_rate": 1.537711702984611e-05, + "loss": 0.1233, + "step": 19368 + }, + { + "epoch": 0.3874, + "grad_norm": 0.24504095315933228, + "learning_rate": 1.5375939747589334e-05, + "loss": 0.0091, + "step": 19370 + }, + { + "epoch": 0.38744, + "grad_norm": 6.2935404777526855, + "learning_rate": 1.537476236052585e-05, + "loss": 0.3004, + "step": 19372 + }, + { + "epoch": 0.38748, + "grad_norm": 10.660759925842285, + "learning_rate": 1.53735848686786e-05, + "loss": 0.2419, + "step": 19374 + }, + { + "epoch": 0.38752, + "grad_norm": 1.9202033281326294, + "learning_rate": 1.5372407272070548e-05, + "loss": 0.3609, + "step": 19376 + }, + { + "epoch": 0.38756, + "grad_norm": 1.370199203491211, + "learning_rate": 1.537122957072465e-05, + "loss": 0.0225, + "step": 19378 + }, + { + "epoch": 0.3876, + "grad_norm": 1.1423128843307495, + "learning_rate": 1.5370051764663872e-05, + "loss": 0.1152, + "step": 19380 + }, + { + "epoch": 0.38764, + "grad_norm": 0.6269557476043701, + "learning_rate": 1.5368873853911167e-05, + "loss": 0.108, + "step": 19382 + }, + { + "epoch": 0.38768, + "grad_norm": 0.5982319712638855, + "learning_rate": 1.5367695838489497e-05, + "loss": 0.0199, + "step": 19384 + }, + { + "epoch": 0.38772, + "grad_norm": 0.043565403670072556, + "learning_rate": 1.5366517718421842e-05, + "loss": 0.0314, + "step": 19386 + }, + { + "epoch": 0.38776, + "grad_norm": 0.15856757760047913, + "learning_rate": 1.5365339493731155e-05, + "loss": 0.0114, + "step": 19388 + }, + { + "epoch": 0.3878, + "grad_norm": 1.1394495964050293, + "learning_rate": 1.5364161164440413e-05, + "loss": 0.0315, + "step": 19390 + }, + { + "epoch": 0.38784, + "grad_norm": 1.0029600858688354, + "learning_rate": 1.5362982730572587e-05, + "loss": 0.34, + "step": 19392 + }, + { + "epoch": 0.38788, + "grad_norm": 0.1470315158367157, + "learning_rate": 1.5361804192150654e-05, + "loss": 0.0036, + "step": 19394 + }, + { + "epoch": 0.38792, + "grad_norm": 0.6464909911155701, + "learning_rate": 1.5360625549197588e-05, + "loss": 0.062, + "step": 19396 + }, + { + "epoch": 0.38796, + "grad_norm": 0.3678975999355316, + "learning_rate": 1.5359446801736363e-05, + "loss": 0.0661, + "step": 19398 + }, + { + "epoch": 0.388, + "grad_norm": 3.1656606197357178, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.0798, + "step": 19400 + }, + { + "epoch": 0.38804, + "grad_norm": 7.855528354644775, + "learning_rate": 1.535708899338138e-05, + "loss": 0.5084, + "step": 19402 + }, + { + "epoch": 0.38808, + "grad_norm": 1.1043705940246582, + "learning_rate": 1.5355909932533582e-05, + "loss": 0.0265, + "step": 19404 + }, + { + "epoch": 0.38812, + "grad_norm": 0.6556646227836609, + "learning_rate": 1.5354730767269565e-05, + "loss": 0.1278, + "step": 19406 + }, + { + "epoch": 0.38816, + "grad_norm": 1.0427733659744263, + "learning_rate": 1.535355149761231e-05, + "loss": 0.0738, + "step": 19408 + }, + { + "epoch": 0.3882, + "grad_norm": 0.126190647482872, + "learning_rate": 1.5352372123584816e-05, + "loss": 0.0123, + "step": 19410 + }, + { + "epoch": 0.38824, + "grad_norm": 0.08853612840175629, + "learning_rate": 1.5351192645210073e-05, + "loss": 0.0029, + "step": 19412 + }, + { + "epoch": 0.38828, + "grad_norm": 0.352936714887619, + "learning_rate": 1.5350013062511075e-05, + "loss": 0.0402, + "step": 19414 + }, + { + "epoch": 0.38832, + "grad_norm": 4.958409786224365, + "learning_rate": 1.534883337551081e-05, + "loss": 0.121, + "step": 19416 + }, + { + "epoch": 0.38836, + "grad_norm": 0.4258521795272827, + "learning_rate": 1.5347653584232292e-05, + "loss": 0.0285, + "step": 19418 + }, + { + "epoch": 0.3884, + "grad_norm": 0.18186913430690765, + "learning_rate": 1.5346473688698514e-05, + "loss": 0.3588, + "step": 19420 + }, + { + "epoch": 0.38844, + "grad_norm": 0.24956873059272766, + "learning_rate": 1.534529368893248e-05, + "loss": 0.0201, + "step": 19422 + }, + { + "epoch": 0.38848, + "grad_norm": 0.022052429616451263, + "learning_rate": 1.534411358495719e-05, + "loss": 0.5488, + "step": 19424 + }, + { + "epoch": 0.38852, + "grad_norm": 3.235151767730713, + "learning_rate": 1.5342933376795656e-05, + "loss": 0.0544, + "step": 19426 + }, + { + "epoch": 0.38856, + "grad_norm": 2.534489870071411, + "learning_rate": 1.5341753064470887e-05, + "loss": 0.1487, + "step": 19428 + }, + { + "epoch": 0.3886, + "grad_norm": 2.117810010910034, + "learning_rate": 1.5340572648005887e-05, + "loss": 0.0423, + "step": 19430 + }, + { + "epoch": 0.38864, + "grad_norm": 3.8031914234161377, + "learning_rate": 1.5339392127423675e-05, + "loss": 0.115, + "step": 19432 + }, + { + "epoch": 0.38868, + "grad_norm": 0.5878464579582214, + "learning_rate": 1.5338211502747266e-05, + "loss": 0.0753, + "step": 19434 + }, + { + "epoch": 0.38872, + "grad_norm": 1.198586344718933, + "learning_rate": 1.5337030773999674e-05, + "loss": 0.0277, + "step": 19436 + }, + { + "epoch": 0.38876, + "grad_norm": 6.502416133880615, + "learning_rate": 1.5335849941203923e-05, + "loss": 0.3847, + "step": 19438 + }, + { + "epoch": 0.3888, + "grad_norm": 0.47675493359565735, + "learning_rate": 1.533466900438303e-05, + "loss": 0.0293, + "step": 19440 + }, + { + "epoch": 0.38884, + "grad_norm": 0.26889967918395996, + "learning_rate": 1.5333487963560014e-05, + "loss": 0.0114, + "step": 19442 + }, + { + "epoch": 0.38888, + "grad_norm": 0.9501915574073792, + "learning_rate": 1.533230681875791e-05, + "loss": 0.024, + "step": 19444 + }, + { + "epoch": 0.38892, + "grad_norm": 1.745932936668396, + "learning_rate": 1.533112556999973e-05, + "loss": 0.0902, + "step": 19446 + }, + { + "epoch": 0.38896, + "grad_norm": 7.875311374664307, + "learning_rate": 1.5329944217308523e-05, + "loss": 0.4694, + "step": 19448 + }, + { + "epoch": 0.389, + "grad_norm": 0.29301080107688904, + "learning_rate": 1.53287627607073e-05, + "loss": 0.0487, + "step": 19450 + }, + { + "epoch": 0.38904, + "grad_norm": 3.3194377422332764, + "learning_rate": 1.5327581200219107e-05, + "loss": 0.0897, + "step": 19452 + }, + { + "epoch": 0.38908, + "grad_norm": 0.6653426289558411, + "learning_rate": 1.5326399535866978e-05, + "loss": 0.0947, + "step": 19454 + }, + { + "epoch": 0.38912, + "grad_norm": 0.6029551029205322, + "learning_rate": 1.532521776767395e-05, + "loss": 0.0372, + "step": 19456 + }, + { + "epoch": 0.38916, + "grad_norm": 2.166616916656494, + "learning_rate": 1.5324035895663057e-05, + "loss": 0.0616, + "step": 19458 + }, + { + "epoch": 0.3892, + "grad_norm": 0.47849664092063904, + "learning_rate": 1.532285391985734e-05, + "loss": 0.0253, + "step": 19460 + }, + { + "epoch": 0.38924, + "grad_norm": 0.9488723278045654, + "learning_rate": 1.5321671840279852e-05, + "loss": 0.2516, + "step": 19462 + }, + { + "epoch": 0.38928, + "grad_norm": 0.0721396803855896, + "learning_rate": 1.5320489656953624e-05, + "loss": 0.0536, + "step": 19464 + }, + { + "epoch": 0.38932, + "grad_norm": 2.099135160446167, + "learning_rate": 1.5319307369901717e-05, + "loss": 0.0714, + "step": 19466 + }, + { + "epoch": 0.38936, + "grad_norm": 0.18388952314853668, + "learning_rate": 1.531812497914717e-05, + "loss": 0.2063, + "step": 19468 + }, + { + "epoch": 0.3894, + "grad_norm": 0.04238602891564369, + "learning_rate": 1.5316942484713043e-05, + "loss": 0.2676, + "step": 19470 + }, + { + "epoch": 0.38944, + "grad_norm": 9.638751029968262, + "learning_rate": 1.531575988662238e-05, + "loss": 0.4878, + "step": 19472 + }, + { + "epoch": 0.38948, + "grad_norm": 0.940143883228302, + "learning_rate": 1.5314577184898247e-05, + "loss": 0.0237, + "step": 19474 + }, + { + "epoch": 0.38952, + "grad_norm": 0.0772828459739685, + "learning_rate": 1.5313394379563692e-05, + "loss": 0.0977, + "step": 19476 + }, + { + "epoch": 0.38956, + "grad_norm": 0.04602000117301941, + "learning_rate": 1.531221147064178e-05, + "loss": 0.0032, + "step": 19478 + }, + { + "epoch": 0.3896, + "grad_norm": 0.5368438959121704, + "learning_rate": 1.5311028458155567e-05, + "loss": 0.3241, + "step": 19480 + }, + { + "epoch": 0.38964, + "grad_norm": 0.024189511314034462, + "learning_rate": 1.5309845342128124e-05, + "loss": 0.0014, + "step": 19482 + }, + { + "epoch": 0.38968, + "grad_norm": 1.626494288444519, + "learning_rate": 1.5308662122582507e-05, + "loss": 0.7279, + "step": 19484 + }, + { + "epoch": 0.38972, + "grad_norm": 0.006364248227328062, + "learning_rate": 1.5307478799541794e-05, + "loss": 0.0093, + "step": 19486 + }, + { + "epoch": 0.38976, + "grad_norm": 3.0056211948394775, + "learning_rate": 1.5306295373029047e-05, + "loss": 0.2138, + "step": 19488 + }, + { + "epoch": 0.3898, + "grad_norm": 0.663325846195221, + "learning_rate": 1.5305111843067343e-05, + "loss": 0.0196, + "step": 19490 + }, + { + "epoch": 0.38984, + "grad_norm": 4.8226637840271, + "learning_rate": 1.530392820967975e-05, + "loss": 0.1325, + "step": 19492 + }, + { + "epoch": 0.38988, + "grad_norm": 0.8271994590759277, + "learning_rate": 1.5302744472889345e-05, + "loss": 0.0796, + "step": 19494 + }, + { + "epoch": 0.38992, + "grad_norm": 1.6560271978378296, + "learning_rate": 1.530156063271921e-05, + "loss": 0.0758, + "step": 19496 + }, + { + "epoch": 0.38996, + "grad_norm": 0.5540909171104431, + "learning_rate": 1.5300376689192414e-05, + "loss": 0.0389, + "step": 19498 + }, + { + "epoch": 0.39, + "grad_norm": 1.931809425354004, + "learning_rate": 1.529919264233205e-05, + "loss": 0.2151, + "step": 19500 + }, + { + "epoch": 0.39004, + "grad_norm": 7.088435173034668, + "learning_rate": 1.52980084921612e-05, + "loss": 0.3714, + "step": 19502 + }, + { + "epoch": 0.39008, + "grad_norm": 6.39118766784668, + "learning_rate": 1.5296824238702942e-05, + "loss": 0.4256, + "step": 19504 + }, + { + "epoch": 0.39012, + "grad_norm": 2.1358020305633545, + "learning_rate": 1.5295639881980368e-05, + "loss": 0.164, + "step": 19506 + }, + { + "epoch": 0.39016, + "grad_norm": 0.07578388601541519, + "learning_rate": 1.5294455422016576e-05, + "loss": 0.3024, + "step": 19508 + }, + { + "epoch": 0.3902, + "grad_norm": 8.37445068359375, + "learning_rate": 1.5293270858834643e-05, + "loss": 0.595, + "step": 19510 + }, + { + "epoch": 0.39024, + "grad_norm": 0.28720515966415405, + "learning_rate": 1.529208619245767e-05, + "loss": 0.0052, + "step": 19512 + }, + { + "epoch": 0.39028, + "grad_norm": 0.27244412899017334, + "learning_rate": 1.5290901422908754e-05, + "loss": 0.3242, + "step": 19514 + }, + { + "epoch": 0.39032, + "grad_norm": 0.04065551981329918, + "learning_rate": 1.5289716550210987e-05, + "loss": 0.2678, + "step": 19516 + }, + { + "epoch": 0.39036, + "grad_norm": 2.285207748413086, + "learning_rate": 1.5288531574387478e-05, + "loss": 0.0909, + "step": 19518 + }, + { + "epoch": 0.3904, + "grad_norm": 3.3406150341033936, + "learning_rate": 1.528734649546132e-05, + "loss": 0.2019, + "step": 19520 + }, + { + "epoch": 0.39044, + "grad_norm": 1.598830223083496, + "learning_rate": 1.5286161313455617e-05, + "loss": 0.0456, + "step": 19522 + }, + { + "epoch": 0.39048, + "grad_norm": 2.9741296768188477, + "learning_rate": 1.5284976028393484e-05, + "loss": 0.1682, + "step": 19524 + }, + { + "epoch": 0.39052, + "grad_norm": 3.6466917991638184, + "learning_rate": 1.5283790640298016e-05, + "loss": 0.163, + "step": 19526 + }, + { + "epoch": 0.39056, + "grad_norm": 1.0965001583099365, + "learning_rate": 1.5282605149192337e-05, + "loss": 0.074, + "step": 19528 + }, + { + "epoch": 0.3906, + "grad_norm": 7.840695858001709, + "learning_rate": 1.5281419555099547e-05, + "loss": 0.4343, + "step": 19530 + }, + { + "epoch": 0.39064, + "grad_norm": 0.660235583782196, + "learning_rate": 1.528023385804276e-05, + "loss": 0.0685, + "step": 19532 + }, + { + "epoch": 0.39068, + "grad_norm": 0.8720241189002991, + "learning_rate": 1.5279048058045103e-05, + "loss": 0.0279, + "step": 19534 + }, + { + "epoch": 0.39072, + "grad_norm": 1.6403353214263916, + "learning_rate": 1.527786215512968e-05, + "loss": 0.0722, + "step": 19536 + }, + { + "epoch": 0.39076, + "grad_norm": 0.8626824021339417, + "learning_rate": 1.527667614931962e-05, + "loss": 0.0258, + "step": 19538 + }, + { + "epoch": 0.3908, + "grad_norm": 1.4341994524002075, + "learning_rate": 1.5275490040638038e-05, + "loss": 0.0679, + "step": 19540 + }, + { + "epoch": 0.39084, + "grad_norm": 0.33020052313804626, + "learning_rate": 1.5274303829108065e-05, + "loss": 0.115, + "step": 19542 + }, + { + "epoch": 0.39088, + "grad_norm": 3.054619073867798, + "learning_rate": 1.5273117514752826e-05, + "loss": 0.1206, + "step": 19544 + }, + { + "epoch": 0.39092, + "grad_norm": 1.2047922611236572, + "learning_rate": 1.527193109759544e-05, + "loss": 0.2236, + "step": 19546 + }, + { + "epoch": 0.39096, + "grad_norm": 0.29318442940711975, + "learning_rate": 1.5270744577659048e-05, + "loss": 0.1035, + "step": 19548 + }, + { + "epoch": 0.391, + "grad_norm": 0.08104128390550613, + "learning_rate": 1.5269557954966777e-05, + "loss": 0.1645, + "step": 19550 + }, + { + "epoch": 0.39104, + "grad_norm": 0.7046159505844116, + "learning_rate": 1.526837122954176e-05, + "loss": 0.0571, + "step": 19552 + }, + { + "epoch": 0.39108, + "grad_norm": 1.7223374843597412, + "learning_rate": 1.5267184401407133e-05, + "loss": 0.062, + "step": 19554 + }, + { + "epoch": 0.39112, + "grad_norm": 3.0632011890411377, + "learning_rate": 1.526599747058603e-05, + "loss": 0.2771, + "step": 19556 + }, + { + "epoch": 0.39116, + "grad_norm": 0.617131769657135, + "learning_rate": 1.5264810437101602e-05, + "loss": 0.1131, + "step": 19558 + }, + { + "epoch": 0.3912, + "grad_norm": 0.7139395475387573, + "learning_rate": 1.526362330097698e-05, + "loss": 0.0598, + "step": 19560 + }, + { + "epoch": 0.39124, + "grad_norm": 3.6671009063720703, + "learning_rate": 1.5262436062235315e-05, + "loss": 0.1045, + "step": 19562 + }, + { + "epoch": 0.39128, + "grad_norm": 2.374755620956421, + "learning_rate": 1.526124872089975e-05, + "loss": 0.0798, + "step": 19564 + }, + { + "epoch": 0.39132, + "grad_norm": 0.386971652507782, + "learning_rate": 1.5260061276993428e-05, + "loss": 0.042, + "step": 19566 + }, + { + "epoch": 0.39136, + "grad_norm": 0.6177732348442078, + "learning_rate": 1.5258873730539507e-05, + "loss": 0.0336, + "step": 19568 + }, + { + "epoch": 0.3914, + "grad_norm": 1.6000009775161743, + "learning_rate": 1.5257686081561134e-05, + "loss": 0.0771, + "step": 19570 + }, + { + "epoch": 0.39144, + "grad_norm": 0.24884824454784393, + "learning_rate": 1.5256498330081462e-05, + "loss": 0.0313, + "step": 19572 + }, + { + "epoch": 0.39148, + "grad_norm": 1.2360836267471313, + "learning_rate": 1.525531047612365e-05, + "loss": 0.0801, + "step": 19574 + }, + { + "epoch": 0.39152, + "grad_norm": 0.21043050289154053, + "learning_rate": 1.5254122519710857e-05, + "loss": 0.0127, + "step": 19576 + }, + { + "epoch": 0.39156, + "grad_norm": 0.22340217232704163, + "learning_rate": 1.525293446086624e-05, + "loss": 0.0913, + "step": 19578 + }, + { + "epoch": 0.3916, + "grad_norm": 0.26221904158592224, + "learning_rate": 1.5251746299612959e-05, + "loss": 0.1321, + "step": 19580 + }, + { + "epoch": 0.39164, + "grad_norm": 0.9043211340904236, + "learning_rate": 1.5250558035974182e-05, + "loss": 0.1175, + "step": 19582 + }, + { + "epoch": 0.39168, + "grad_norm": 0.16331470012664795, + "learning_rate": 1.5249369669973071e-05, + "loss": 0.011, + "step": 19584 + }, + { + "epoch": 0.39172, + "grad_norm": 4.488603591918945, + "learning_rate": 1.5248181201632797e-05, + "loss": 0.2146, + "step": 19586 + }, + { + "epoch": 0.39176, + "grad_norm": 0.3642001748085022, + "learning_rate": 1.5246992630976528e-05, + "loss": 0.0099, + "step": 19588 + }, + { + "epoch": 0.3918, + "grad_norm": 0.22812925279140472, + "learning_rate": 1.5245803958027434e-05, + "loss": 0.0576, + "step": 19590 + }, + { + "epoch": 0.39184, + "grad_norm": 0.7360527515411377, + "learning_rate": 1.5244615182808694e-05, + "loss": 0.0427, + "step": 19592 + }, + { + "epoch": 0.39188, + "grad_norm": 6.033315181732178, + "learning_rate": 1.5243426305343483e-05, + "loss": 0.3348, + "step": 19594 + }, + { + "epoch": 0.39192, + "grad_norm": 1.3601754903793335, + "learning_rate": 1.5242237325654973e-05, + "loss": 0.2724, + "step": 19596 + }, + { + "epoch": 0.39196, + "grad_norm": 0.49739545583724976, + "learning_rate": 1.524104824376635e-05, + "loss": 0.021, + "step": 19598 + }, + { + "epoch": 0.392, + "grad_norm": 0.4938672184944153, + "learning_rate": 1.5239859059700794e-05, + "loss": 0.0235, + "step": 19600 + }, + { + "epoch": 0.39204, + "grad_norm": 0.7530527710914612, + "learning_rate": 1.5238669773481487e-05, + "loss": 0.045, + "step": 19602 + }, + { + "epoch": 0.39208, + "grad_norm": 2.893275022506714, + "learning_rate": 1.5237480385131612e-05, + "loss": 0.1064, + "step": 19604 + }, + { + "epoch": 0.39212, + "grad_norm": 1.2975547313690186, + "learning_rate": 1.5236290894674364e-05, + "loss": 0.2344, + "step": 19606 + }, + { + "epoch": 0.39216, + "grad_norm": 0.7754942178726196, + "learning_rate": 1.5235101302132929e-05, + "loss": 0.0301, + "step": 19608 + }, + { + "epoch": 0.3922, + "grad_norm": 0.043842196464538574, + "learning_rate": 1.5233911607530499e-05, + "loss": 0.0385, + "step": 19610 + }, + { + "epoch": 0.39224, + "grad_norm": 1.057559609413147, + "learning_rate": 1.5232721810890265e-05, + "loss": 0.0313, + "step": 19612 + }, + { + "epoch": 0.39228, + "grad_norm": 0.29439160227775574, + "learning_rate": 1.5231531912235427e-05, + "loss": 0.0434, + "step": 19614 + }, + { + "epoch": 0.39232, + "grad_norm": 2.425570011138916, + "learning_rate": 1.5230341911589183e-05, + "loss": 0.0555, + "step": 19616 + }, + { + "epoch": 0.39236, + "grad_norm": 8.475826263427734, + "learning_rate": 1.522915180897473e-05, + "loss": 0.625, + "step": 19618 + }, + { + "epoch": 0.3924, + "grad_norm": 0.4184390604496002, + "learning_rate": 1.5227961604415266e-05, + "loss": 0.0751, + "step": 19620 + }, + { + "epoch": 0.39244, + "grad_norm": 0.7108362317085266, + "learning_rate": 1.5226771297934004e-05, + "loss": 0.0317, + "step": 19622 + }, + { + "epoch": 0.39248, + "grad_norm": 2.250364065170288, + "learning_rate": 1.5225580889554144e-05, + "loss": 0.0588, + "step": 19624 + }, + { + "epoch": 0.39252, + "grad_norm": 0.6643792986869812, + "learning_rate": 1.522439037929889e-05, + "loss": 0.0151, + "step": 19626 + }, + { + "epoch": 0.39256, + "grad_norm": 2.330036163330078, + "learning_rate": 1.5223199767191462e-05, + "loss": 0.096, + "step": 19628 + }, + { + "epoch": 0.3926, + "grad_norm": 1.015620231628418, + "learning_rate": 1.5222009053255061e-05, + "loss": 0.0648, + "step": 19630 + }, + { + "epoch": 0.39264, + "grad_norm": 0.8719807863235474, + "learning_rate": 1.5220818237512908e-05, + "loss": 0.0301, + "step": 19632 + }, + { + "epoch": 0.39268, + "grad_norm": 2.194244146347046, + "learning_rate": 1.5219627319988213e-05, + "loss": 0.0442, + "step": 19634 + }, + { + "epoch": 0.39272, + "grad_norm": 1.4806863069534302, + "learning_rate": 1.5218436300704196e-05, + "loss": 0.1213, + "step": 19636 + }, + { + "epoch": 0.39276, + "grad_norm": 0.4618266522884369, + "learning_rate": 1.5217245179684078e-05, + "loss": 0.0729, + "step": 19638 + }, + { + "epoch": 0.3928, + "grad_norm": 0.3123670518398285, + "learning_rate": 1.5216053956951081e-05, + "loss": 0.0196, + "step": 19640 + }, + { + "epoch": 0.39284, + "grad_norm": 2.029175043106079, + "learning_rate": 1.5214862632528421e-05, + "loss": 0.0422, + "step": 19642 + }, + { + "epoch": 0.39288, + "grad_norm": 0.2774386703968048, + "learning_rate": 1.5213671206439333e-05, + "loss": 0.1816, + "step": 19644 + }, + { + "epoch": 0.39292, + "grad_norm": 1.7430106401443481, + "learning_rate": 1.5212479678707044e-05, + "loss": 0.0539, + "step": 19646 + }, + { + "epoch": 0.39296, + "grad_norm": 0.10814661532640457, + "learning_rate": 1.5211288049354777e-05, + "loss": 0.0072, + "step": 19648 + }, + { + "epoch": 0.393, + "grad_norm": 0.04979890584945679, + "learning_rate": 1.5210096318405768e-05, + "loss": 0.0533, + "step": 19650 + }, + { + "epoch": 0.39304, + "grad_norm": 3.091217041015625, + "learning_rate": 1.5208904485883244e-05, + "loss": 0.0896, + "step": 19652 + }, + { + "epoch": 0.39308, + "grad_norm": 1.450109601020813, + "learning_rate": 1.5207712551810447e-05, + "loss": 0.0773, + "step": 19654 + }, + { + "epoch": 0.39312, + "grad_norm": 0.8801401257514954, + "learning_rate": 1.5206520516210615e-05, + "loss": 0.0482, + "step": 19656 + }, + { + "epoch": 0.39316, + "grad_norm": 4.44351053237915, + "learning_rate": 1.5205328379106985e-05, + "loss": 0.1618, + "step": 19658 + }, + { + "epoch": 0.3932, + "grad_norm": 1.9077383279800415, + "learning_rate": 1.5204136140522799e-05, + "loss": 0.1003, + "step": 19660 + }, + { + "epoch": 0.39324, + "grad_norm": 6.085291385650635, + "learning_rate": 1.5202943800481296e-05, + "loss": 0.2209, + "step": 19662 + }, + { + "epoch": 0.39328, + "grad_norm": 1.8675026893615723, + "learning_rate": 1.5201751359005728e-05, + "loss": 0.0366, + "step": 19664 + }, + { + "epoch": 0.39332, + "grad_norm": 0.2257256954908371, + "learning_rate": 1.520055881611934e-05, + "loss": 0.0085, + "step": 19666 + }, + { + "epoch": 0.39336, + "grad_norm": 1.571885585784912, + "learning_rate": 1.5199366171845378e-05, + "loss": 0.0273, + "step": 19668 + }, + { + "epoch": 0.3934, + "grad_norm": 0.32897496223449707, + "learning_rate": 1.5198173426207095e-05, + "loss": 0.0599, + "step": 19670 + }, + { + "epoch": 0.39344, + "grad_norm": 0.740436851978302, + "learning_rate": 1.5196980579227747e-05, + "loss": 0.2838, + "step": 19672 + }, + { + "epoch": 0.39348, + "grad_norm": 2.5615806579589844, + "learning_rate": 1.5195787630930587e-05, + "loss": 0.0588, + "step": 19674 + }, + { + "epoch": 0.39352, + "grad_norm": 2.8306782245635986, + "learning_rate": 1.5194594581338873e-05, + "loss": 0.1316, + "step": 19676 + }, + { + "epoch": 0.39356, + "grad_norm": 0.10669126361608505, + "learning_rate": 1.5193401430475861e-05, + "loss": 0.0129, + "step": 19678 + }, + { + "epoch": 0.3936, + "grad_norm": 0.1696690022945404, + "learning_rate": 1.5192208178364815e-05, + "loss": 0.0304, + "step": 19680 + }, + { + "epoch": 0.39364, + "grad_norm": 0.12965071201324463, + "learning_rate": 1.5191014825029e-05, + "loss": 0.0051, + "step": 19682 + }, + { + "epoch": 0.39368, + "grad_norm": 0.16873379051685333, + "learning_rate": 1.5189821370491675e-05, + "loss": 0.1193, + "step": 19684 + }, + { + "epoch": 0.39372, + "grad_norm": 5.4751691818237305, + "learning_rate": 1.5188627814776111e-05, + "loss": 0.5442, + "step": 19686 + }, + { + "epoch": 0.39376, + "grad_norm": 1.2470128536224365, + "learning_rate": 1.5187434157905575e-05, + "loss": 0.017, + "step": 19688 + }, + { + "epoch": 0.3938, + "grad_norm": 0.5839638113975525, + "learning_rate": 1.5186240399903343e-05, + "loss": 0.0281, + "step": 19690 + }, + { + "epoch": 0.39384, + "grad_norm": 3.5224976539611816, + "learning_rate": 1.5185046540792683e-05, + "loss": 0.0569, + "step": 19692 + }, + { + "epoch": 0.39388, + "grad_norm": 8.88117504119873, + "learning_rate": 1.518385258059687e-05, + "loss": 0.3925, + "step": 19694 + }, + { + "epoch": 0.39392, + "grad_norm": 0.37988215684890747, + "learning_rate": 1.5182658519339181e-05, + "loss": 0.0092, + "step": 19696 + }, + { + "epoch": 0.39396, + "grad_norm": 9.575187683105469, + "learning_rate": 1.5181464357042902e-05, + "loss": 0.1825, + "step": 19698 + }, + { + "epoch": 0.394, + "grad_norm": 3.78875994682312, + "learning_rate": 1.5180270093731305e-05, + "loss": 0.1214, + "step": 19700 + }, + { + "epoch": 0.39404, + "grad_norm": 2.220719575881958, + "learning_rate": 1.5179075729427672e-05, + "loss": 0.0367, + "step": 19702 + }, + { + "epoch": 0.39408, + "grad_norm": 0.6154415607452393, + "learning_rate": 1.5177881264155294e-05, + "loss": 0.0155, + "step": 19704 + }, + { + "epoch": 0.39412, + "grad_norm": 3.6212708950042725, + "learning_rate": 1.5176686697937456e-05, + "loss": 0.1188, + "step": 19706 + }, + { + "epoch": 0.39416, + "grad_norm": 0.2541663646697998, + "learning_rate": 1.5175492030797446e-05, + "loss": 0.0773, + "step": 19708 + }, + { + "epoch": 0.3942, + "grad_norm": 0.05384648218750954, + "learning_rate": 1.5174297262758551e-05, + "loss": 0.1511, + "step": 19710 + }, + { + "epoch": 0.39424, + "grad_norm": 4.720484733581543, + "learning_rate": 1.5173102393844068e-05, + "loss": 0.1203, + "step": 19712 + }, + { + "epoch": 0.39428, + "grad_norm": 0.030074112117290497, + "learning_rate": 1.5171907424077293e-05, + "loss": 0.1162, + "step": 19714 + }, + { + "epoch": 0.39432, + "grad_norm": 0.5805762410163879, + "learning_rate": 1.5170712353481518e-05, + "loss": 0.034, + "step": 19716 + }, + { + "epoch": 0.39436, + "grad_norm": 0.253887802362442, + "learning_rate": 1.5169517182080045e-05, + "loss": 0.0228, + "step": 19718 + }, + { + "epoch": 0.3944, + "grad_norm": 0.020084718242287636, + "learning_rate": 1.5168321909896171e-05, + "loss": 0.0013, + "step": 19720 + }, + { + "epoch": 0.39444, + "grad_norm": 2.7582571506500244, + "learning_rate": 1.5167126536953203e-05, + "loss": 0.0473, + "step": 19722 + }, + { + "epoch": 0.39448, + "grad_norm": 0.014872301369905472, + "learning_rate": 1.5165931063274442e-05, + "loss": 0.2193, + "step": 19724 + }, + { + "epoch": 0.39452, + "grad_norm": 0.17521876096725464, + "learning_rate": 1.5164735488883193e-05, + "loss": 0.2222, + "step": 19726 + }, + { + "epoch": 0.39456, + "grad_norm": 0.11509138345718384, + "learning_rate": 1.5163539813802767e-05, + "loss": 0.004, + "step": 19728 + }, + { + "epoch": 0.3946, + "grad_norm": 0.10480709373950958, + "learning_rate": 1.5162344038056476e-05, + "loss": 0.0049, + "step": 19730 + }, + { + "epoch": 0.39464, + "grad_norm": 0.08702775835990906, + "learning_rate": 1.5161148161667631e-05, + "loss": 0.0107, + "step": 19732 + }, + { + "epoch": 0.39468, + "grad_norm": 3.6191649436950684, + "learning_rate": 1.5159952184659545e-05, + "loss": 0.3711, + "step": 19734 + }, + { + "epoch": 0.39472, + "grad_norm": 0.07192423194646835, + "learning_rate": 1.5158756107055533e-05, + "loss": 0.0024, + "step": 19736 + }, + { + "epoch": 0.39476, + "grad_norm": 8.462958335876465, + "learning_rate": 1.5157559928878915e-05, + "loss": 0.2115, + "step": 19738 + }, + { + "epoch": 0.3948, + "grad_norm": 0.02040664292871952, + "learning_rate": 1.5156363650153012e-05, + "loss": 0.0096, + "step": 19740 + }, + { + "epoch": 0.39484, + "grad_norm": 0.05997433885931969, + "learning_rate": 1.5155167270901143e-05, + "loss": 0.0049, + "step": 19742 + }, + { + "epoch": 0.39488, + "grad_norm": 0.8016365170478821, + "learning_rate": 1.5153970791146636e-05, + "loss": 0.0249, + "step": 19744 + }, + { + "epoch": 0.39492, + "grad_norm": 2.144590139389038, + "learning_rate": 1.5152774210912815e-05, + "loss": 0.2275, + "step": 19746 + }, + { + "epoch": 0.39496, + "grad_norm": 0.28179964423179626, + "learning_rate": 1.5151577530223007e-05, + "loss": 0.007, + "step": 19748 + }, + { + "epoch": 0.395, + "grad_norm": 0.45844894647598267, + "learning_rate": 1.5150380749100545e-05, + "loss": 0.0074, + "step": 19750 + }, + { + "epoch": 0.39504, + "grad_norm": 5.856701850891113, + "learning_rate": 1.5149183867568755e-05, + "loss": 0.1894, + "step": 19752 + }, + { + "epoch": 0.39508, + "grad_norm": 4.613709449768066, + "learning_rate": 1.5147986885650978e-05, + "loss": 0.0615, + "step": 19754 + }, + { + "epoch": 0.39512, + "grad_norm": 0.17264103889465332, + "learning_rate": 1.5146789803370542e-05, + "loss": 0.1529, + "step": 19756 + }, + { + "epoch": 0.39516, + "grad_norm": 0.30700328946113586, + "learning_rate": 1.5145592620750793e-05, + "loss": 0.0623, + "step": 19758 + }, + { + "epoch": 0.3952, + "grad_norm": 0.12594039738178253, + "learning_rate": 1.5144395337815066e-05, + "loss": 0.1405, + "step": 19760 + }, + { + "epoch": 0.39524, + "grad_norm": 0.9696130752563477, + "learning_rate": 1.5143197954586702e-05, + "loss": 0.0403, + "step": 19762 + }, + { + "epoch": 0.39528, + "grad_norm": 3.288656234741211, + "learning_rate": 1.5142000471089047e-05, + "loss": 0.0592, + "step": 19764 + }, + { + "epoch": 0.39532, + "grad_norm": 7.1322245597839355, + "learning_rate": 1.5140802887345445e-05, + "loss": 0.1565, + "step": 19766 + }, + { + "epoch": 0.39536, + "grad_norm": 0.02947377972304821, + "learning_rate": 1.5139605203379246e-05, + "loss": 0.2053, + "step": 19768 + }, + { + "epoch": 0.3954, + "grad_norm": 0.05019596964120865, + "learning_rate": 1.5138407419213797e-05, + "loss": 0.0006, + "step": 19770 + }, + { + "epoch": 0.39544, + "grad_norm": 0.017792154103517532, + "learning_rate": 1.513720953487245e-05, + "loss": 0.3366, + "step": 19772 + }, + { + "epoch": 0.39548, + "grad_norm": 0.29074686765670776, + "learning_rate": 1.5136011550378555e-05, + "loss": 0.0699, + "step": 19774 + }, + { + "epoch": 0.39552, + "grad_norm": 0.06957539170980453, + "learning_rate": 1.513481346575547e-05, + "loss": 0.0012, + "step": 19776 + }, + { + "epoch": 0.39556, + "grad_norm": 12.04201889038086, + "learning_rate": 1.5133615281026556e-05, + "loss": 0.5294, + "step": 19778 + }, + { + "epoch": 0.3956, + "grad_norm": 0.021088039502501488, + "learning_rate": 1.5132416996215171e-05, + "loss": 0.1629, + "step": 19780 + }, + { + "epoch": 0.39564, + "grad_norm": 2.614157199859619, + "learning_rate": 1.5131218611344672e-05, + "loss": 0.0432, + "step": 19782 + }, + { + "epoch": 0.39568, + "grad_norm": 10.052862167358398, + "learning_rate": 1.5130020126438428e-05, + "loss": 0.7225, + "step": 19784 + }, + { + "epoch": 0.39572, + "grad_norm": 9.899544715881348, + "learning_rate": 1.5128821541519794e-05, + "loss": 0.4036, + "step": 19786 + }, + { + "epoch": 0.39576, + "grad_norm": 0.27117618918418884, + "learning_rate": 1.5127622856612147e-05, + "loss": 0.307, + "step": 19788 + }, + { + "epoch": 0.3958, + "grad_norm": 8.230034828186035, + "learning_rate": 1.5126424071738853e-05, + "loss": 0.7215, + "step": 19790 + }, + { + "epoch": 0.39584, + "grad_norm": 0.8487887978553772, + "learning_rate": 1.512522518692328e-05, + "loss": 0.2258, + "step": 19792 + }, + { + "epoch": 0.39588, + "grad_norm": 2.610872507095337, + "learning_rate": 1.5124026202188807e-05, + "loss": 0.1065, + "step": 19794 + }, + { + "epoch": 0.39592, + "grad_norm": 4.05129337310791, + "learning_rate": 1.5122827117558802e-05, + "loss": 0.1963, + "step": 19796 + }, + { + "epoch": 0.39596, + "grad_norm": 1.0809433460235596, + "learning_rate": 1.5121627933056646e-05, + "loss": 0.0641, + "step": 19798 + }, + { + "epoch": 0.396, + "grad_norm": 1.597779631614685, + "learning_rate": 1.5120428648705716e-05, + "loss": 0.0513, + "step": 19800 + }, + { + "epoch": 0.39604, + "grad_norm": 6.1514363288879395, + "learning_rate": 1.5119229264529396e-05, + "loss": 0.3061, + "step": 19802 + }, + { + "epoch": 0.39608, + "grad_norm": 6.985719203948975, + "learning_rate": 1.5118029780551065e-05, + "loss": 0.324, + "step": 19804 + }, + { + "epoch": 0.39612, + "grad_norm": 0.4878060519695282, + "learning_rate": 1.5116830196794108e-05, + "loss": 0.0109, + "step": 19806 + }, + { + "epoch": 0.39616, + "grad_norm": 2.249528169631958, + "learning_rate": 1.5115630513281908e-05, + "loss": 0.179, + "step": 19808 + }, + { + "epoch": 0.3962, + "grad_norm": 0.20959948003292084, + "learning_rate": 1.511443073003786e-05, + "loss": 0.0509, + "step": 19810 + }, + { + "epoch": 0.39624, + "grad_norm": 1.1356053352355957, + "learning_rate": 1.5113230847085353e-05, + "loss": 0.0471, + "step": 19812 + }, + { + "epoch": 0.39628, + "grad_norm": 6.955126762390137, + "learning_rate": 1.511203086444778e-05, + "loss": 0.6878, + "step": 19814 + }, + { + "epoch": 0.39632, + "grad_norm": 1.356454610824585, + "learning_rate": 1.5110830782148531e-05, + "loss": 0.0607, + "step": 19816 + }, + { + "epoch": 0.39636, + "grad_norm": 1.3716199398040771, + "learning_rate": 1.5109630600211005e-05, + "loss": 0.035, + "step": 19818 + }, + { + "epoch": 0.3964, + "grad_norm": 2.803385019302368, + "learning_rate": 1.51084303186586e-05, + "loss": 0.0947, + "step": 19820 + }, + { + "epoch": 0.39644, + "grad_norm": 1.3845487833023071, + "learning_rate": 1.5107229937514719e-05, + "loss": 0.0843, + "step": 19822 + }, + { + "epoch": 0.39648, + "grad_norm": 3.6763696670532227, + "learning_rate": 1.5106029456802756e-05, + "loss": 0.1205, + "step": 19824 + }, + { + "epoch": 0.39652, + "grad_norm": 1.5039225816726685, + "learning_rate": 1.5104828876546121e-05, + "loss": 0.0496, + "step": 19826 + }, + { + "epoch": 0.39656, + "grad_norm": 1.4553216695785522, + "learning_rate": 1.5103628196768219e-05, + "loss": 0.13, + "step": 19828 + }, + { + "epoch": 0.3966, + "grad_norm": 2.718512535095215, + "learning_rate": 1.510242741749246e-05, + "loss": 0.2096, + "step": 19830 + }, + { + "epoch": 0.39664, + "grad_norm": 0.9082993268966675, + "learning_rate": 1.5101226538742248e-05, + "loss": 0.0889, + "step": 19832 + }, + { + "epoch": 0.39668, + "grad_norm": 2.0460758209228516, + "learning_rate": 1.5100025560540998e-05, + "loss": 0.0802, + "step": 19834 + }, + { + "epoch": 0.39672, + "grad_norm": 3.2407941818237305, + "learning_rate": 1.5098824482912129e-05, + "loss": 0.1511, + "step": 19836 + }, + { + "epoch": 0.39676, + "grad_norm": 2.5816285610198975, + "learning_rate": 1.509762330587905e-05, + "loss": 0.161, + "step": 19838 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3901021480560303, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.0193, + "step": 19840 + }, + { + "epoch": 0.39684, + "grad_norm": 1.6786216497421265, + "learning_rate": 1.5095220653693935e-05, + "loss": 0.0591, + "step": 19842 + }, + { + "epoch": 0.39688, + "grad_norm": 0.10453870892524719, + "learning_rate": 1.5094019178588749e-05, + "loss": 0.0056, + "step": 19844 + }, + { + "epoch": 0.39692, + "grad_norm": 2.6114501953125, + "learning_rate": 1.5092817604173026e-05, + "loss": 0.1458, + "step": 19846 + }, + { + "epoch": 0.39696, + "grad_norm": 7.185065746307373, + "learning_rate": 1.5091615930470206e-05, + "loss": 0.2649, + "step": 19848 + }, + { + "epoch": 0.397, + "grad_norm": 0.06603219360113144, + "learning_rate": 1.5090414157503715e-05, + "loss": 0.0051, + "step": 19850 + }, + { + "epoch": 0.39704, + "grad_norm": 0.3882182240486145, + "learning_rate": 1.5089212285296977e-05, + "loss": 0.0693, + "step": 19852 + }, + { + "epoch": 0.39708, + "grad_norm": 0.12600266933441162, + "learning_rate": 1.5088010313873426e-05, + "loss": 0.0084, + "step": 19854 + }, + { + "epoch": 0.39712, + "grad_norm": 2.209001302719116, + "learning_rate": 1.5086808243256491e-05, + "loss": 0.0629, + "step": 19856 + }, + { + "epoch": 0.39716, + "grad_norm": 0.7280517816543579, + "learning_rate": 1.5085606073469615e-05, + "loss": 0.0527, + "step": 19858 + }, + { + "epoch": 0.3972, + "grad_norm": 3.4950504302978516, + "learning_rate": 1.508440380453623e-05, + "loss": 0.2033, + "step": 19860 + }, + { + "epoch": 0.39724, + "grad_norm": 2.7980122566223145, + "learning_rate": 1.5083201436479773e-05, + "loss": 0.0916, + "step": 19862 + }, + { + "epoch": 0.39728, + "grad_norm": 3.007899522781372, + "learning_rate": 1.5081998969323688e-05, + "loss": 0.0604, + "step": 19864 + }, + { + "epoch": 0.39732, + "grad_norm": 0.47212377190589905, + "learning_rate": 1.5080796403091413e-05, + "loss": 0.0251, + "step": 19866 + }, + { + "epoch": 0.39736, + "grad_norm": 0.7570868730545044, + "learning_rate": 1.50795937378064e-05, + "loss": 0.0302, + "step": 19868 + }, + { + "epoch": 0.3974, + "grad_norm": 0.22123460471630096, + "learning_rate": 1.5078390973492094e-05, + "loss": 0.1101, + "step": 19870 + }, + { + "epoch": 0.39744, + "grad_norm": 0.9688688516616821, + "learning_rate": 1.5077188110171941e-05, + "loss": 0.0549, + "step": 19872 + }, + { + "epoch": 0.39748, + "grad_norm": 4.428416728973389, + "learning_rate": 1.5075985147869388e-05, + "loss": 0.1267, + "step": 19874 + }, + { + "epoch": 0.39752, + "grad_norm": 0.9268019199371338, + "learning_rate": 1.5074782086607893e-05, + "loss": 0.0458, + "step": 19876 + }, + { + "epoch": 0.39756, + "grad_norm": 0.3611871302127838, + "learning_rate": 1.507357892641091e-05, + "loss": 0.0496, + "step": 19878 + }, + { + "epoch": 0.3976, + "grad_norm": 0.06669802218675613, + "learning_rate": 1.5072375667301893e-05, + "loss": 0.0062, + "step": 19880 + }, + { + "epoch": 0.39764, + "grad_norm": 0.03991738334298134, + "learning_rate": 1.50711723093043e-05, + "loss": 0.0247, + "step": 19882 + }, + { + "epoch": 0.39768, + "grad_norm": 5.137723445892334, + "learning_rate": 1.506996885244159e-05, + "loss": 0.2137, + "step": 19884 + }, + { + "epoch": 0.39772, + "grad_norm": 4.372279167175293, + "learning_rate": 1.5068765296737234e-05, + "loss": 0.1139, + "step": 19886 + }, + { + "epoch": 0.39776, + "grad_norm": 0.0890234038233757, + "learning_rate": 1.5067561642214683e-05, + "loss": 0.0734, + "step": 19888 + }, + { + "epoch": 0.3978, + "grad_norm": 0.10426435619592667, + "learning_rate": 1.506635788889741e-05, + "loss": 0.0123, + "step": 19890 + }, + { + "epoch": 0.39784, + "grad_norm": 0.9748809337615967, + "learning_rate": 1.5065154036808882e-05, + "loss": 0.0341, + "step": 19892 + }, + { + "epoch": 0.39788, + "grad_norm": 1.7268158197402954, + "learning_rate": 1.506395008597257e-05, + "loss": 0.0528, + "step": 19894 + }, + { + "epoch": 0.39792, + "grad_norm": 0.24092644453048706, + "learning_rate": 1.5062746036411942e-05, + "loss": 0.0147, + "step": 19896 + }, + { + "epoch": 0.39796, + "grad_norm": 0.2995627522468567, + "learning_rate": 1.5061541888150473e-05, + "loss": 0.1221, + "step": 19898 + }, + { + "epoch": 0.398, + "grad_norm": 1.384395956993103, + "learning_rate": 1.5060337641211637e-05, + "loss": 0.2235, + "step": 19900 + }, + { + "epoch": 0.39804, + "grad_norm": 0.19901083409786224, + "learning_rate": 1.505913329561892e-05, + "loss": 0.0304, + "step": 19902 + }, + { + "epoch": 0.39808, + "grad_norm": 0.28782984614372253, + "learning_rate": 1.505792885139579e-05, + "loss": 0.0094, + "step": 19904 + }, + { + "epoch": 0.39812, + "grad_norm": 0.13597029447555542, + "learning_rate": 1.5056724308565736e-05, + "loss": 0.0751, + "step": 19906 + }, + { + "epoch": 0.39816, + "grad_norm": 5.2422075271606445, + "learning_rate": 1.5055519667152237e-05, + "loss": 0.0482, + "step": 19908 + }, + { + "epoch": 0.3982, + "grad_norm": 0.13544303178787231, + "learning_rate": 1.5054314927178779e-05, + "loss": 0.054, + "step": 19910 + }, + { + "epoch": 0.39824, + "grad_norm": 0.4457043707370758, + "learning_rate": 1.5053110088668848e-05, + "loss": 0.123, + "step": 19912 + }, + { + "epoch": 0.39828, + "grad_norm": 0.08583429455757141, + "learning_rate": 1.5051905151645934e-05, + "loss": 0.2057, + "step": 19914 + }, + { + "epoch": 0.39832, + "grad_norm": 1.4703518152236938, + "learning_rate": 1.505070011613353e-05, + "loss": 0.0481, + "step": 19916 + }, + { + "epoch": 0.39836, + "grad_norm": 11.654886245727539, + "learning_rate": 1.5049494982155127e-05, + "loss": 0.2071, + "step": 19918 + }, + { + "epoch": 0.3984, + "grad_norm": 0.06370462477207184, + "learning_rate": 1.504828974973422e-05, + "loss": 0.0349, + "step": 19920 + }, + { + "epoch": 0.39844, + "grad_norm": 9.709142684936523, + "learning_rate": 1.5047084418894304e-05, + "loss": 0.6871, + "step": 19922 + }, + { + "epoch": 0.39848, + "grad_norm": 2.5477254390716553, + "learning_rate": 1.504587898965888e-05, + "loss": 0.085, + "step": 19924 + }, + { + "epoch": 0.39852, + "grad_norm": 9.488274574279785, + "learning_rate": 1.5044673462051446e-05, + "loss": 0.5326, + "step": 19926 + }, + { + "epoch": 0.39856, + "grad_norm": 1.169447660446167, + "learning_rate": 1.5043467836095506e-05, + "loss": 0.0907, + "step": 19928 + }, + { + "epoch": 0.3986, + "grad_norm": 0.18542388081550598, + "learning_rate": 1.5042262111814566e-05, + "loss": 0.0243, + "step": 19930 + }, + { + "epoch": 0.39864, + "grad_norm": 4.172033786773682, + "learning_rate": 1.5041056289232126e-05, + "loss": 0.1331, + "step": 19932 + }, + { + "epoch": 0.39868, + "grad_norm": 11.365524291992188, + "learning_rate": 1.5039850368371703e-05, + "loss": 0.4127, + "step": 19934 + }, + { + "epoch": 0.39872, + "grad_norm": 0.8641773462295532, + "learning_rate": 1.5038644349256798e-05, + "loss": 0.0226, + "step": 19936 + }, + { + "epoch": 0.39876, + "grad_norm": 2.284414768218994, + "learning_rate": 1.5037438231910928e-05, + "loss": 0.0997, + "step": 19938 + }, + { + "epoch": 0.3988, + "grad_norm": 5.9554524421691895, + "learning_rate": 1.503623201635761e-05, + "loss": 0.3178, + "step": 19940 + }, + { + "epoch": 0.39884, + "grad_norm": 0.18197591602802277, + "learning_rate": 1.5035025702620356e-05, + "loss": 0.0079, + "step": 19942 + }, + { + "epoch": 0.39888, + "grad_norm": 0.27162110805511475, + "learning_rate": 1.503381929072268e-05, + "loss": 0.0213, + "step": 19944 + }, + { + "epoch": 0.39892, + "grad_norm": 0.25670409202575684, + "learning_rate": 1.5032612780688104e-05, + "loss": 0.032, + "step": 19946 + }, + { + "epoch": 0.39896, + "grad_norm": 0.959989070892334, + "learning_rate": 1.5031406172540155e-05, + "loss": 0.0267, + "step": 19948 + }, + { + "epoch": 0.399, + "grad_norm": 0.843010663986206, + "learning_rate": 1.5030199466302354e-05, + "loss": 0.0213, + "step": 19950 + }, + { + "epoch": 0.39904, + "grad_norm": 5.959972858428955, + "learning_rate": 1.5028992661998222e-05, + "loss": 0.3712, + "step": 19952 + }, + { + "epoch": 0.39908, + "grad_norm": 1.1728074550628662, + "learning_rate": 1.5027785759651286e-05, + "loss": 0.0611, + "step": 19954 + }, + { + "epoch": 0.39912, + "grad_norm": 4.988274574279785, + "learning_rate": 1.5026578759285082e-05, + "loss": 0.7874, + "step": 19956 + }, + { + "epoch": 0.39916, + "grad_norm": 10.035672187805176, + "learning_rate": 1.5025371660923137e-05, + "loss": 0.3991, + "step": 19958 + }, + { + "epoch": 0.3992, + "grad_norm": 0.5634121894836426, + "learning_rate": 1.5024164464588982e-05, + "loss": 0.0345, + "step": 19960 + }, + { + "epoch": 0.39924, + "grad_norm": 0.6701781153678894, + "learning_rate": 1.5022957170306156e-05, + "loss": 0.0294, + "step": 19962 + }, + { + "epoch": 0.39928, + "grad_norm": 1.5048573017120361, + "learning_rate": 1.5021749778098192e-05, + "loss": 0.1253, + "step": 19964 + }, + { + "epoch": 0.39932, + "grad_norm": 0.8697019219398499, + "learning_rate": 1.5020542287988633e-05, + "loss": 0.0638, + "step": 19966 + }, + { + "epoch": 0.39936, + "grad_norm": 1.533044457435608, + "learning_rate": 1.5019334700001018e-05, + "loss": 0.0485, + "step": 19968 + }, + { + "epoch": 0.3994, + "grad_norm": 1.4492056369781494, + "learning_rate": 1.5018127014158886e-05, + "loss": 0.1644, + "step": 19970 + }, + { + "epoch": 0.39944, + "grad_norm": 0.7432280778884888, + "learning_rate": 1.5016919230485785e-05, + "loss": 0.0282, + "step": 19972 + }, + { + "epoch": 0.39948, + "grad_norm": 0.22569289803504944, + "learning_rate": 1.5015711349005261e-05, + "loss": 0.0387, + "step": 19974 + }, + { + "epoch": 0.39952, + "grad_norm": 0.5578616261482239, + "learning_rate": 1.5014503369740866e-05, + "loss": 0.051, + "step": 19976 + }, + { + "epoch": 0.39956, + "grad_norm": 0.18390381336212158, + "learning_rate": 1.5013295292716139e-05, + "loss": 0.0419, + "step": 19978 + }, + { + "epoch": 0.3996, + "grad_norm": 0.7070146799087524, + "learning_rate": 1.5012087117954643e-05, + "loss": 0.0389, + "step": 19980 + }, + { + "epoch": 0.39964, + "grad_norm": 0.1409284621477127, + "learning_rate": 1.5010878845479928e-05, + "loss": 0.0061, + "step": 19982 + }, + { + "epoch": 0.39968, + "grad_norm": 1.9816834926605225, + "learning_rate": 1.5009670475315551e-05, + "loss": 0.0551, + "step": 19984 + }, + { + "epoch": 0.39972, + "grad_norm": 0.08553530275821686, + "learning_rate": 1.5008462007485069e-05, + "loss": 0.0028, + "step": 19986 + }, + { + "epoch": 0.39976, + "grad_norm": 0.4606189727783203, + "learning_rate": 1.500725344201204e-05, + "loss": 0.0209, + "step": 19988 + }, + { + "epoch": 0.3998, + "grad_norm": 2.113089084625244, + "learning_rate": 1.5006044778920028e-05, + "loss": 0.0946, + "step": 19990 + }, + { + "epoch": 0.39984, + "grad_norm": 9.334609985351562, + "learning_rate": 1.5004836018232595e-05, + "loss": 0.2863, + "step": 19992 + }, + { + "epoch": 0.39988, + "grad_norm": 0.2154085487127304, + "learning_rate": 1.500362715997331e-05, + "loss": 0.0137, + "step": 19994 + }, + { + "epoch": 0.39992, + "grad_norm": 4.660719871520996, + "learning_rate": 1.5002418204165735e-05, + "loss": 0.1316, + "step": 19996 + }, + { + "epoch": 0.39996, + "grad_norm": 0.984494686126709, + "learning_rate": 1.5001209150833443e-05, + "loss": 0.0514, + "step": 19998 + }, + { + "epoch": 0.4, + "grad_norm": 6.184274196624756, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.2544, + "step": 20000 + }, + { + "epoch": 0.40004, + "grad_norm": 0.2785149812698364, + "learning_rate": 1.4998790751688988e-05, + "loss": 0.0115, + "step": 20002 + }, + { + "epoch": 0.40008, + "grad_norm": 1.915695309638977, + "learning_rate": 1.4997581405923974e-05, + "loss": 0.0563, + "step": 20004 + }, + { + "epoch": 0.40012, + "grad_norm": 6.879190921783447, + "learning_rate": 1.4996371962728535e-05, + "loss": 0.4018, + "step": 20006 + }, + { + "epoch": 0.40016, + "grad_norm": 8.193893432617188, + "learning_rate": 1.499516242212626e-05, + "loss": 0.3634, + "step": 20008 + }, + { + "epoch": 0.4002, + "grad_norm": 0.5774162411689758, + "learning_rate": 1.4993952784140716e-05, + "loss": 0.0555, + "step": 20010 + }, + { + "epoch": 0.40024, + "grad_norm": 0.7495988607406616, + "learning_rate": 1.4992743048795493e-05, + "loss": 0.0172, + "step": 20012 + }, + { + "epoch": 0.40028, + "grad_norm": 3.202899932861328, + "learning_rate": 1.4991533216114174e-05, + "loss": 0.1061, + "step": 20014 + }, + { + "epoch": 0.40032, + "grad_norm": 5.050683975219727, + "learning_rate": 1.4990323286120342e-05, + "loss": 0.1581, + "step": 20016 + }, + { + "epoch": 0.40036, + "grad_norm": 0.6914377212524414, + "learning_rate": 1.4989113258837593e-05, + "loss": 0.0262, + "step": 20018 + }, + { + "epoch": 0.4004, + "grad_norm": 2.6238961219787598, + "learning_rate": 1.498790313428951e-05, + "loss": 0.0693, + "step": 20020 + }, + { + "epoch": 0.40044, + "grad_norm": 6.105449199676514, + "learning_rate": 1.4986692912499688e-05, + "loss": 0.3369, + "step": 20022 + }, + { + "epoch": 0.40048, + "grad_norm": 0.5408428907394409, + "learning_rate": 1.498548259349172e-05, + "loss": 0.0374, + "step": 20024 + }, + { + "epoch": 0.40052, + "grad_norm": 0.45190420746803284, + "learning_rate": 1.4984272177289202e-05, + "loss": 0.1149, + "step": 20026 + }, + { + "epoch": 0.40056, + "grad_norm": 6.039384841918945, + "learning_rate": 1.4983061663915733e-05, + "loss": 0.3243, + "step": 20028 + }, + { + "epoch": 0.4006, + "grad_norm": 4.46484375, + "learning_rate": 1.498185105339491e-05, + "loss": 0.2067, + "step": 20030 + }, + { + "epoch": 0.40064, + "grad_norm": 0.08078326284885406, + "learning_rate": 1.4980640345750338e-05, + "loss": 0.06, + "step": 20032 + }, + { + "epoch": 0.40068, + "grad_norm": 0.18498027324676514, + "learning_rate": 1.4979429541005614e-05, + "loss": 0.0163, + "step": 20034 + }, + { + "epoch": 0.40072, + "grad_norm": 0.31834810972213745, + "learning_rate": 1.4978218639184353e-05, + "loss": 0.0137, + "step": 20036 + }, + { + "epoch": 0.40076, + "grad_norm": 0.15392674505710602, + "learning_rate": 1.497700764031015e-05, + "loss": 0.0761, + "step": 20038 + }, + { + "epoch": 0.4008, + "grad_norm": 2.660405397415161, + "learning_rate": 1.4975796544406627e-05, + "loss": 0.1399, + "step": 20040 + }, + { + "epoch": 0.40084, + "grad_norm": 0.37361881136894226, + "learning_rate": 1.4974585351497387e-05, + "loss": 0.0212, + "step": 20042 + }, + { + "epoch": 0.40088, + "grad_norm": 7.372602462768555, + "learning_rate": 1.4973374061606043e-05, + "loss": 0.3826, + "step": 20044 + }, + { + "epoch": 0.40092, + "grad_norm": 0.3730666935443878, + "learning_rate": 1.4972162674756212e-05, + "loss": 0.3459, + "step": 20046 + }, + { + "epoch": 0.40096, + "grad_norm": 1.4099470376968384, + "learning_rate": 1.4970951190971512e-05, + "loss": 0.0486, + "step": 20048 + }, + { + "epoch": 0.401, + "grad_norm": 0.028211232274770737, + "learning_rate": 1.4969739610275556e-05, + "loss": 0.2675, + "step": 20050 + }, + { + "epoch": 0.40104, + "grad_norm": 1.3148154020309448, + "learning_rate": 1.4968527932691967e-05, + "loss": 0.0325, + "step": 20052 + }, + { + "epoch": 0.40108, + "grad_norm": 0.10133989155292511, + "learning_rate": 1.496731615824437e-05, + "loss": 0.0161, + "step": 20054 + }, + { + "epoch": 0.40112, + "grad_norm": 0.8631731271743774, + "learning_rate": 1.4966104286956387e-05, + "loss": 0.0319, + "step": 20056 + }, + { + "epoch": 0.40116, + "grad_norm": 2.9706811904907227, + "learning_rate": 1.4964892318851644e-05, + "loss": 0.0926, + "step": 20058 + }, + { + "epoch": 0.4012, + "grad_norm": 0.20938849449157715, + "learning_rate": 1.496368025395377e-05, + "loss": 0.0422, + "step": 20060 + }, + { + "epoch": 0.40124, + "grad_norm": 0.1295449286699295, + "learning_rate": 1.4962468092286393e-05, + "loss": 0.0176, + "step": 20062 + }, + { + "epoch": 0.40128, + "grad_norm": 6.565609455108643, + "learning_rate": 1.4961255833873147e-05, + "loss": 0.2562, + "step": 20064 + }, + { + "epoch": 0.40132, + "grad_norm": 0.7443307042121887, + "learning_rate": 1.4960043478737662e-05, + "loss": 0.0962, + "step": 20066 + }, + { + "epoch": 0.40136, + "grad_norm": 0.6779064536094666, + "learning_rate": 1.4958831026903577e-05, + "loss": 0.0151, + "step": 20068 + }, + { + "epoch": 0.4014, + "grad_norm": 0.32921674847602844, + "learning_rate": 1.4957618478394529e-05, + "loss": 0.0385, + "step": 20070 + }, + { + "epoch": 0.40144, + "grad_norm": 0.3552952706813812, + "learning_rate": 1.4956405833234154e-05, + "loss": 0.0274, + "step": 20072 + }, + { + "epoch": 0.40148, + "grad_norm": 5.186117649078369, + "learning_rate": 1.4955193091446098e-05, + "loss": 0.1552, + "step": 20074 + }, + { + "epoch": 0.40152, + "grad_norm": 0.8130146861076355, + "learning_rate": 1.4953980253054e-05, + "loss": 0.0374, + "step": 20076 + }, + { + "epoch": 0.40156, + "grad_norm": 0.8155460357666016, + "learning_rate": 1.4952767318081509e-05, + "loss": 0.0314, + "step": 20078 + }, + { + "epoch": 0.4016, + "grad_norm": 1.174075961112976, + "learning_rate": 1.4951554286552266e-05, + "loss": 0.047, + "step": 20080 + }, + { + "epoch": 0.40164, + "grad_norm": 1.3999419212341309, + "learning_rate": 1.4950341158489928e-05, + "loss": 0.0451, + "step": 20082 + }, + { + "epoch": 0.40168, + "grad_norm": 0.033052969723939896, + "learning_rate": 1.4949127933918136e-05, + "loss": 0.0196, + "step": 20084 + }, + { + "epoch": 0.40172, + "grad_norm": 10.127443313598633, + "learning_rate": 1.4947914612860546e-05, + "loss": 0.4487, + "step": 20086 + }, + { + "epoch": 0.40176, + "grad_norm": 1.6588187217712402, + "learning_rate": 1.4946701195340814e-05, + "loss": 0.0318, + "step": 20088 + }, + { + "epoch": 0.4018, + "grad_norm": 0.0771472230553627, + "learning_rate": 1.4945487681382597e-05, + "loss": 0.2689, + "step": 20090 + }, + { + "epoch": 0.40184, + "grad_norm": 0.4089638888835907, + "learning_rate": 1.4944274071009553e-05, + "loss": 0.0127, + "step": 20092 + }, + { + "epoch": 0.40188, + "grad_norm": 1.4325809478759766, + "learning_rate": 1.4943060364245336e-05, + "loss": 0.1395, + "step": 20094 + }, + { + "epoch": 0.40192, + "grad_norm": 0.3430492877960205, + "learning_rate": 1.4941846561113616e-05, + "loss": 0.08, + "step": 20096 + }, + { + "epoch": 0.40196, + "grad_norm": 0.1378263533115387, + "learning_rate": 1.4940632661638055e-05, + "loss": 0.0234, + "step": 20098 + }, + { + "epoch": 0.402, + "grad_norm": 5.355794429779053, + "learning_rate": 1.493941866584231e-05, + "loss": 0.2048, + "step": 20100 + }, + { + "epoch": 0.40204, + "grad_norm": 1.7231786251068115, + "learning_rate": 1.4938204573750059e-05, + "loss": 0.0726, + "step": 20102 + }, + { + "epoch": 0.40208, + "grad_norm": 0.7346600890159607, + "learning_rate": 1.4936990385384968e-05, + "loss": 0.0283, + "step": 20104 + }, + { + "epoch": 0.40212, + "grad_norm": 0.20003089308738708, + "learning_rate": 1.493577610077071e-05, + "loss": 0.0678, + "step": 20106 + }, + { + "epoch": 0.40216, + "grad_norm": 0.07327680289745331, + "learning_rate": 1.4934561719930952e-05, + "loss": 0.044, + "step": 20108 + }, + { + "epoch": 0.4022, + "grad_norm": 1.1300890445709229, + "learning_rate": 1.4933347242889371e-05, + "loss": 0.0555, + "step": 20110 + }, + { + "epoch": 0.40224, + "grad_norm": 0.5577422380447388, + "learning_rate": 1.493213266966965e-05, + "loss": 0.0146, + "step": 20112 + }, + { + "epoch": 0.40228, + "grad_norm": 1.1436004638671875, + "learning_rate": 1.4930918000295463e-05, + "loss": 0.0764, + "step": 20114 + }, + { + "epoch": 0.40232, + "grad_norm": 6.692148685455322, + "learning_rate": 1.4929703234790488e-05, + "loss": 0.39, + "step": 20116 + }, + { + "epoch": 0.40236, + "grad_norm": 0.2738065719604492, + "learning_rate": 1.4928488373178412e-05, + "loss": 0.01, + "step": 20118 + }, + { + "epoch": 0.4024, + "grad_norm": 2.763288974761963, + "learning_rate": 1.4927273415482916e-05, + "loss": 0.0477, + "step": 20120 + }, + { + "epoch": 0.40244, + "grad_norm": 6.396679401397705, + "learning_rate": 1.4926058361727691e-05, + "loss": 0.1656, + "step": 20122 + }, + { + "epoch": 0.40248, + "grad_norm": 1.537180781364441, + "learning_rate": 1.4924843211936421e-05, + "loss": 0.0594, + "step": 20124 + }, + { + "epoch": 0.40252, + "grad_norm": 0.9947190880775452, + "learning_rate": 1.4923627966132796e-05, + "loss": 0.0337, + "step": 20126 + }, + { + "epoch": 0.40256, + "grad_norm": 1.4067109823226929, + "learning_rate": 1.492241262434051e-05, + "loss": 0.2004, + "step": 20128 + }, + { + "epoch": 0.4026, + "grad_norm": 1.76613187789917, + "learning_rate": 1.4921197186583256e-05, + "loss": 0.0606, + "step": 20130 + }, + { + "epoch": 0.40264, + "grad_norm": 2.789982557296753, + "learning_rate": 1.4919981652884731e-05, + "loss": 0.0722, + "step": 20132 + }, + { + "epoch": 0.40268, + "grad_norm": 0.32369503378868103, + "learning_rate": 1.4918766023268627e-05, + "loss": 0.0263, + "step": 20134 + }, + { + "epoch": 0.40272, + "grad_norm": 1.283302664756775, + "learning_rate": 1.491755029775865e-05, + "loss": 0.1802, + "step": 20136 + }, + { + "epoch": 0.40276, + "grad_norm": 1.1572728157043457, + "learning_rate": 1.4916334476378498e-05, + "loss": 0.0164, + "step": 20138 + }, + { + "epoch": 0.4028, + "grad_norm": 4.864006519317627, + "learning_rate": 1.4915118559151871e-05, + "loss": 0.2215, + "step": 20140 + }, + { + "epoch": 0.40284, + "grad_norm": 0.47154510021209717, + "learning_rate": 1.4913902546102478e-05, + "loss": 0.0155, + "step": 20142 + }, + { + "epoch": 0.40288, + "grad_norm": 0.0795871689915657, + "learning_rate": 1.4912686437254027e-05, + "loss": 0.0034, + "step": 20144 + }, + { + "epoch": 0.40292, + "grad_norm": 0.05546657741069794, + "learning_rate": 1.4911470232630225e-05, + "loss": 0.1907, + "step": 20146 + }, + { + "epoch": 0.40296, + "grad_norm": 0.2196025401353836, + "learning_rate": 1.4910253932254784e-05, + "loss": 0.0752, + "step": 20148 + }, + { + "epoch": 0.403, + "grad_norm": 0.03930635750293732, + "learning_rate": 1.490903753615141e-05, + "loss": 0.0385, + "step": 20150 + }, + { + "epoch": 0.40304, + "grad_norm": 0.3226500451564789, + "learning_rate": 1.4907821044343824e-05, + "loss": 0.3788, + "step": 20152 + }, + { + "epoch": 0.40308, + "grad_norm": 1.8747247457504272, + "learning_rate": 1.490660445685574e-05, + "loss": 0.0693, + "step": 20154 + }, + { + "epoch": 0.40312, + "grad_norm": 0.1639351099729538, + "learning_rate": 1.4905387773710876e-05, + "loss": 0.0138, + "step": 20156 + }, + { + "epoch": 0.40316, + "grad_norm": 0.3619752526283264, + "learning_rate": 1.4904170994932952e-05, + "loss": 0.0487, + "step": 20158 + }, + { + "epoch": 0.4032, + "grad_norm": 0.7436124086380005, + "learning_rate": 1.4902954120545687e-05, + "loss": 0.0925, + "step": 20160 + }, + { + "epoch": 0.40324, + "grad_norm": 4.672463893890381, + "learning_rate": 1.490173715057281e-05, + "loss": 0.3, + "step": 20162 + }, + { + "epoch": 0.40328, + "grad_norm": 0.2583831548690796, + "learning_rate": 1.4900520085038044e-05, + "loss": 0.1437, + "step": 20164 + }, + { + "epoch": 0.40332, + "grad_norm": 0.5635135173797607, + "learning_rate": 1.4899302923965118e-05, + "loss": 0.1046, + "step": 20166 + }, + { + "epoch": 0.40336, + "grad_norm": 0.2632385790348053, + "learning_rate": 1.4898085667377755e-05, + "loss": 0.0139, + "step": 20168 + }, + { + "epoch": 0.4034, + "grad_norm": 2.7811899185180664, + "learning_rate": 1.4896868315299692e-05, + "loss": 0.1232, + "step": 20170 + }, + { + "epoch": 0.40344, + "grad_norm": 0.1274184286594391, + "learning_rate": 1.4895650867754658e-05, + "loss": 0.178, + "step": 20172 + }, + { + "epoch": 0.40348, + "grad_norm": 0.20239317417144775, + "learning_rate": 1.4894433324766392e-05, + "loss": 0.015, + "step": 20174 + }, + { + "epoch": 0.40352, + "grad_norm": 6.181962966918945, + "learning_rate": 1.4893215686358628e-05, + "loss": 0.116, + "step": 20176 + }, + { + "epoch": 0.40356, + "grad_norm": 1.3061614036560059, + "learning_rate": 1.4891997952555103e-05, + "loss": 0.0364, + "step": 20178 + }, + { + "epoch": 0.4036, + "grad_norm": 4.137821197509766, + "learning_rate": 1.4890780123379565e-05, + "loss": 0.081, + "step": 20180 + }, + { + "epoch": 0.40364, + "grad_norm": 2.9976415634155273, + "learning_rate": 1.4889562198855746e-05, + "loss": 0.0903, + "step": 20182 + }, + { + "epoch": 0.40368, + "grad_norm": 9.879684448242188, + "learning_rate": 1.4888344179007396e-05, + "loss": 0.481, + "step": 20184 + }, + { + "epoch": 0.40372, + "grad_norm": 1.1070258617401123, + "learning_rate": 1.4887126063858259e-05, + "loss": 0.0431, + "step": 20186 + }, + { + "epoch": 0.40376, + "grad_norm": 0.6092350482940674, + "learning_rate": 1.4885907853432085e-05, + "loss": 0.0186, + "step": 20188 + }, + { + "epoch": 0.4038, + "grad_norm": 0.21740569174289703, + "learning_rate": 1.488468954775262e-05, + "loss": 0.0102, + "step": 20190 + }, + { + "epoch": 0.40384, + "grad_norm": 0.517441987991333, + "learning_rate": 1.4883471146843617e-05, + "loss": 0.0155, + "step": 20192 + }, + { + "epoch": 0.40388, + "grad_norm": 3.92497181892395, + "learning_rate": 1.488225265072883e-05, + "loss": 0.1063, + "step": 20194 + }, + { + "epoch": 0.40392, + "grad_norm": 0.025281911715865135, + "learning_rate": 1.4881034059432016e-05, + "loss": 0.0045, + "step": 20196 + }, + { + "epoch": 0.40396, + "grad_norm": 0.15466342866420746, + "learning_rate": 1.4879815372976927e-05, + "loss": 0.012, + "step": 20198 + }, + { + "epoch": 0.404, + "grad_norm": 5.437465190887451, + "learning_rate": 1.4878596591387329e-05, + "loss": 0.2535, + "step": 20200 + }, + { + "epoch": 0.40404, + "grad_norm": 1.4904897212982178, + "learning_rate": 1.4877377714686976e-05, + "loss": 0.0363, + "step": 20202 + }, + { + "epoch": 0.40408, + "grad_norm": 7.52074670791626, + "learning_rate": 1.4876158742899635e-05, + "loss": 0.4514, + "step": 20204 + }, + { + "epoch": 0.40412, + "grad_norm": 1.4729865789413452, + "learning_rate": 1.4874939676049068e-05, + "loss": 0.1028, + "step": 20206 + }, + { + "epoch": 0.40416, + "grad_norm": 0.04764460772275925, + "learning_rate": 1.487372051415904e-05, + "loss": 0.0009, + "step": 20208 + }, + { + "epoch": 0.4042, + "grad_norm": 0.28796735405921936, + "learning_rate": 1.4872501257253325e-05, + "loss": 0.0382, + "step": 20210 + }, + { + "epoch": 0.40424, + "grad_norm": 0.7198758721351624, + "learning_rate": 1.4871281905355688e-05, + "loss": 0.0121, + "step": 20212 + }, + { + "epoch": 0.40428, + "grad_norm": 0.12826015055179596, + "learning_rate": 1.4870062458489903e-05, + "loss": 0.0083, + "step": 20214 + }, + { + "epoch": 0.40432, + "grad_norm": 0.2049020528793335, + "learning_rate": 1.4868842916679742e-05, + "loss": 0.0331, + "step": 20216 + }, + { + "epoch": 0.40436, + "grad_norm": 3.9259138107299805, + "learning_rate": 1.4867623279948981e-05, + "loss": 0.0817, + "step": 20218 + }, + { + "epoch": 0.4044, + "grad_norm": 0.049849364906549454, + "learning_rate": 1.4866403548321402e-05, + "loss": 0.0199, + "step": 20220 + }, + { + "epoch": 0.40444, + "grad_norm": 0.5141642093658447, + "learning_rate": 1.4865183721820778e-05, + "loss": 0.1026, + "step": 20222 + }, + { + "epoch": 0.40448, + "grad_norm": 0.031742893159389496, + "learning_rate": 1.4863963800470891e-05, + "loss": 0.1277, + "step": 20224 + }, + { + "epoch": 0.40452, + "grad_norm": 0.8927772045135498, + "learning_rate": 1.4862743784295526e-05, + "loss": 0.0251, + "step": 20226 + }, + { + "epoch": 0.40456, + "grad_norm": 0.3545210063457489, + "learning_rate": 1.486152367331847e-05, + "loss": 0.0075, + "step": 20228 + }, + { + "epoch": 0.4046, + "grad_norm": 0.3852754235267639, + "learning_rate": 1.4860303467563504e-05, + "loss": 0.009, + "step": 20230 + }, + { + "epoch": 0.40464, + "grad_norm": 2.310112714767456, + "learning_rate": 1.485908316705442e-05, + "loss": 0.0471, + "step": 20232 + }, + { + "epoch": 0.40468, + "grad_norm": 3.8410255908966064, + "learning_rate": 1.485786277181501e-05, + "loss": 0.107, + "step": 20234 + }, + { + "epoch": 0.40472, + "grad_norm": 5.093357086181641, + "learning_rate": 1.4856642281869064e-05, + "loss": 0.1008, + "step": 20236 + }, + { + "epoch": 0.40476, + "grad_norm": 1.1310490369796753, + "learning_rate": 1.4855421697240376e-05, + "loss": 0.0257, + "step": 20238 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4265461564064026, + "learning_rate": 1.485420101795274e-05, + "loss": 0.0353, + "step": 20240 + }, + { + "epoch": 0.40484, + "grad_norm": 0.11894828826189041, + "learning_rate": 1.485298024402996e-05, + "loss": 0.4906, + "step": 20242 + }, + { + "epoch": 0.40488, + "grad_norm": 6.934410572052002, + "learning_rate": 1.485175937549583e-05, + "loss": 0.1579, + "step": 20244 + }, + { + "epoch": 0.40492, + "grad_norm": 0.4777839481830597, + "learning_rate": 1.4850538412374152e-05, + "loss": 0.0898, + "step": 20246 + }, + { + "epoch": 0.40496, + "grad_norm": 8.345479011535645, + "learning_rate": 1.484931735468873e-05, + "loss": 0.3138, + "step": 20248 + }, + { + "epoch": 0.405, + "grad_norm": 5.681833267211914, + "learning_rate": 1.4848096202463373e-05, + "loss": 0.078, + "step": 20250 + }, + { + "epoch": 0.40504, + "grad_norm": 0.46865060925483704, + "learning_rate": 1.4846874955721881e-05, + "loss": 0.0294, + "step": 20252 + }, + { + "epoch": 0.40508, + "grad_norm": 5.292983531951904, + "learning_rate": 1.4845653614488069e-05, + "loss": 0.1325, + "step": 20254 + }, + { + "epoch": 0.40512, + "grad_norm": 13.398819923400879, + "learning_rate": 1.4844432178785744e-05, + "loss": 1.1364, + "step": 20256 + }, + { + "epoch": 0.40516, + "grad_norm": 2.4517807960510254, + "learning_rate": 1.4843210648638718e-05, + "loss": 0.3571, + "step": 20258 + }, + { + "epoch": 0.4052, + "grad_norm": 0.09474340081214905, + "learning_rate": 1.4841989024070809e-05, + "loss": 0.0062, + "step": 20260 + }, + { + "epoch": 0.40524, + "grad_norm": 1.1247187852859497, + "learning_rate": 1.484076730510583e-05, + "loss": 0.0305, + "step": 20262 + }, + { + "epoch": 0.40528, + "grad_norm": 2.2551491260528564, + "learning_rate": 1.4839545491767599e-05, + "loss": 0.0724, + "step": 20264 + }, + { + "epoch": 0.40532, + "grad_norm": 5.167893409729004, + "learning_rate": 1.4838323584079939e-05, + "loss": 0.1503, + "step": 20266 + }, + { + "epoch": 0.40536, + "grad_norm": 2.6267082691192627, + "learning_rate": 1.483710158206667e-05, + "loss": 0.0691, + "step": 20268 + }, + { + "epoch": 0.4054, + "grad_norm": 0.04130876809358597, + "learning_rate": 1.4835879485751617e-05, + "loss": 0.3925, + "step": 20270 + }, + { + "epoch": 0.40544, + "grad_norm": 0.33735957741737366, + "learning_rate": 1.4834657295158601e-05, + "loss": 0.0394, + "step": 20272 + }, + { + "epoch": 0.40548, + "grad_norm": 0.41554898023605347, + "learning_rate": 1.4833435010311452e-05, + "loss": 0.0664, + "step": 20274 + }, + { + "epoch": 0.40552, + "grad_norm": 0.20095115900039673, + "learning_rate": 1.4832212631234e-05, + "loss": 0.1102, + "step": 20276 + }, + { + "epoch": 0.40556, + "grad_norm": 0.07275223731994629, + "learning_rate": 1.4830990157950076e-05, + "loss": 0.0129, + "step": 20278 + }, + { + "epoch": 0.4056, + "grad_norm": 0.9453328847885132, + "learning_rate": 1.4829767590483508e-05, + "loss": 0.3249, + "step": 20280 + }, + { + "epoch": 0.40564, + "grad_norm": 0.14392267167568207, + "learning_rate": 1.4828544928858137e-05, + "loss": 0.0052, + "step": 20282 + }, + { + "epoch": 0.40568, + "grad_norm": 0.4511679410934448, + "learning_rate": 1.4827322173097798e-05, + "loss": 0.0562, + "step": 20284 + }, + { + "epoch": 0.40572, + "grad_norm": 0.34868115186691284, + "learning_rate": 1.4826099323226327e-05, + "loss": 0.007, + "step": 20286 + }, + { + "epoch": 0.40576, + "grad_norm": 0.2363252341747284, + "learning_rate": 1.4824876379267563e-05, + "loss": 0.036, + "step": 20288 + }, + { + "epoch": 0.4058, + "grad_norm": 0.0720195323228836, + "learning_rate": 1.4823653341245353e-05, + "loss": 0.0055, + "step": 20290 + }, + { + "epoch": 0.40584, + "grad_norm": 1.6187591552734375, + "learning_rate": 1.4822430209183534e-05, + "loss": 0.047, + "step": 20292 + }, + { + "epoch": 0.40588, + "grad_norm": 1.751233458518982, + "learning_rate": 1.4821206983105957e-05, + "loss": 0.0551, + "step": 20294 + }, + { + "epoch": 0.40592, + "grad_norm": 5.745449542999268, + "learning_rate": 1.481998366303647e-05, + "loss": 0.259, + "step": 20296 + }, + { + "epoch": 0.40596, + "grad_norm": 0.04721454158425331, + "learning_rate": 1.4818760248998918e-05, + "loss": 0.031, + "step": 20298 + }, + { + "epoch": 0.406, + "grad_norm": 5.372164726257324, + "learning_rate": 1.4817536741017153e-05, + "loss": 0.21, + "step": 20300 + }, + { + "epoch": 0.40604, + "grad_norm": 4.1009345054626465, + "learning_rate": 1.481631313911503e-05, + "loss": 0.4997, + "step": 20302 + }, + { + "epoch": 0.40608, + "grad_norm": 0.20901474356651306, + "learning_rate": 1.4815089443316406e-05, + "loss": 0.0062, + "step": 20304 + }, + { + "epoch": 0.40612, + "grad_norm": 8.46589469909668, + "learning_rate": 1.481386565364513e-05, + "loss": 0.2277, + "step": 20306 + }, + { + "epoch": 0.40616, + "grad_norm": 0.6542305946350098, + "learning_rate": 1.4812641770125064e-05, + "loss": 0.0179, + "step": 20308 + }, + { + "epoch": 0.4062, + "grad_norm": 5.498322486877441, + "learning_rate": 1.4811417792780074e-05, + "loss": 0.3517, + "step": 20310 + }, + { + "epoch": 0.40624, + "grad_norm": 0.1510726362466812, + "learning_rate": 1.4810193721634014e-05, + "loss": 0.0202, + "step": 20312 + }, + { + "epoch": 0.40628, + "grad_norm": 1.0624662637710571, + "learning_rate": 1.4808969556710748e-05, + "loss": 0.0332, + "step": 20314 + }, + { + "epoch": 0.40632, + "grad_norm": 0.6121907830238342, + "learning_rate": 1.4807745298034148e-05, + "loss": 0.085, + "step": 20316 + }, + { + "epoch": 0.40636, + "grad_norm": 0.12922534346580505, + "learning_rate": 1.480652094562808e-05, + "loss": 0.0902, + "step": 20318 + }, + { + "epoch": 0.4064, + "grad_norm": 0.2879844605922699, + "learning_rate": 1.4805296499516408e-05, + "loss": 0.0114, + "step": 20320 + }, + { + "epoch": 0.40644, + "grad_norm": 1.3093199729919434, + "learning_rate": 1.480407195972301e-05, + "loss": 0.1053, + "step": 20322 + }, + { + "epoch": 0.40648, + "grad_norm": 1.0948566198349, + "learning_rate": 1.4802847326271753e-05, + "loss": 0.0482, + "step": 20324 + }, + { + "epoch": 0.40652, + "grad_norm": 0.14965872466564178, + "learning_rate": 1.4801622599186519e-05, + "loss": 0.0107, + "step": 20326 + }, + { + "epoch": 0.40656, + "grad_norm": 0.3963265120983124, + "learning_rate": 1.4800397778491175e-05, + "loss": 0.012, + "step": 20328 + }, + { + "epoch": 0.4066, + "grad_norm": 1.8601523637771606, + "learning_rate": 1.4799172864209607e-05, + "loss": 0.0385, + "step": 20330 + }, + { + "epoch": 0.40664, + "grad_norm": 0.801947832107544, + "learning_rate": 1.4797947856365693e-05, + "loss": 0.0173, + "step": 20332 + }, + { + "epoch": 0.40668, + "grad_norm": 3.6670141220092773, + "learning_rate": 1.4796722754983317e-05, + "loss": 0.0559, + "step": 20334 + }, + { + "epoch": 0.40672, + "grad_norm": 0.21114467084407806, + "learning_rate": 1.4795497560086358e-05, + "loss": 0.0145, + "step": 20336 + }, + { + "epoch": 0.40676, + "grad_norm": 0.21455343067646027, + "learning_rate": 1.4794272271698708e-05, + "loss": 0.0048, + "step": 20338 + }, + { + "epoch": 0.4068, + "grad_norm": 2.8762314319610596, + "learning_rate": 1.4793046889844252e-05, + "loss": 0.0948, + "step": 20340 + }, + { + "epoch": 0.40684, + "grad_norm": 0.3187183737754822, + "learning_rate": 1.4791821414546878e-05, + "loss": 0.0076, + "step": 20342 + }, + { + "epoch": 0.40688, + "grad_norm": 1.6012365818023682, + "learning_rate": 1.479059584583048e-05, + "loss": 0.0803, + "step": 20344 + }, + { + "epoch": 0.40692, + "grad_norm": 0.4567665755748749, + "learning_rate": 1.4789370183718948e-05, + "loss": 0.0137, + "step": 20346 + }, + { + "epoch": 0.40696, + "grad_norm": 0.1339041143655777, + "learning_rate": 1.4788144428236177e-05, + "loss": 0.0055, + "step": 20348 + }, + { + "epoch": 0.407, + "grad_norm": 0.5825588703155518, + "learning_rate": 1.478691857940607e-05, + "loss": 0.115, + "step": 20350 + }, + { + "epoch": 0.40704, + "grad_norm": 4.143157482147217, + "learning_rate": 1.4785692637252518e-05, + "loss": 0.3468, + "step": 20352 + }, + { + "epoch": 0.40708, + "grad_norm": 0.06647776812314987, + "learning_rate": 1.4784466601799424e-05, + "loss": 0.0064, + "step": 20354 + }, + { + "epoch": 0.40712, + "grad_norm": 6.701377868652344, + "learning_rate": 1.478324047307069e-05, + "loss": 0.2758, + "step": 20356 + }, + { + "epoch": 0.40716, + "grad_norm": 0.2635682225227356, + "learning_rate": 1.4782014251090222e-05, + "loss": 0.0258, + "step": 20358 + }, + { + "epoch": 0.4072, + "grad_norm": 4.3543829917907715, + "learning_rate": 1.4780787935881925e-05, + "loss": 0.0898, + "step": 20360 + }, + { + "epoch": 0.40724, + "grad_norm": 0.17944128811359406, + "learning_rate": 1.4779561527469702e-05, + "loss": 0.0064, + "step": 20362 + }, + { + "epoch": 0.40728, + "grad_norm": 0.39561015367507935, + "learning_rate": 1.4778335025877468e-05, + "loss": 0.1963, + "step": 20364 + }, + { + "epoch": 0.40732, + "grad_norm": 0.8964623212814331, + "learning_rate": 1.4777108431129135e-05, + "loss": 0.0403, + "step": 20366 + }, + { + "epoch": 0.40736, + "grad_norm": 7.42054557800293, + "learning_rate": 1.4775881743248612e-05, + "loss": 0.4548, + "step": 20368 + }, + { + "epoch": 0.4074, + "grad_norm": 0.2570275366306305, + "learning_rate": 1.4774654962259813e-05, + "loss": 0.142, + "step": 20370 + }, + { + "epoch": 0.40744, + "grad_norm": 0.1519845575094223, + "learning_rate": 1.4773428088186662e-05, + "loss": 0.0678, + "step": 20372 + }, + { + "epoch": 0.40748, + "grad_norm": 0.23522382974624634, + "learning_rate": 1.4772201121053073e-05, + "loss": 0.0089, + "step": 20374 + }, + { + "epoch": 0.40752, + "grad_norm": 0.32477423548698425, + "learning_rate": 1.4770974060882963e-05, + "loss": 0.0076, + "step": 20376 + }, + { + "epoch": 0.40756, + "grad_norm": 6.320554733276367, + "learning_rate": 1.4769746907700257e-05, + "loss": 0.3979, + "step": 20378 + }, + { + "epoch": 0.4076, + "grad_norm": 0.16348500549793243, + "learning_rate": 1.4768519661528879e-05, + "loss": 0.0078, + "step": 20380 + }, + { + "epoch": 0.40764, + "grad_norm": 6.341750621795654, + "learning_rate": 1.4767292322392757e-05, + "loss": 0.2282, + "step": 20382 + }, + { + "epoch": 0.40768, + "grad_norm": 0.5297778844833374, + "learning_rate": 1.4766064890315818e-05, + "loss": 0.1498, + "step": 20384 + }, + { + "epoch": 0.40772, + "grad_norm": 0.12797915935516357, + "learning_rate": 1.4764837365321989e-05, + "loss": 0.0062, + "step": 20386 + }, + { + "epoch": 0.40776, + "grad_norm": 0.1331162005662918, + "learning_rate": 1.4763609747435203e-05, + "loss": 0.0056, + "step": 20388 + }, + { + "epoch": 0.4078, + "grad_norm": 4.961131572723389, + "learning_rate": 1.4762382036679393e-05, + "loss": 0.2644, + "step": 20390 + }, + { + "epoch": 0.40784, + "grad_norm": 0.7726793885231018, + "learning_rate": 1.4761154233078494e-05, + "loss": 0.0181, + "step": 20392 + }, + { + "epoch": 0.40788, + "grad_norm": 0.8242180347442627, + "learning_rate": 1.4759926336656438e-05, + "loss": 0.0282, + "step": 20394 + }, + { + "epoch": 0.40792, + "grad_norm": 0.8926911354064941, + "learning_rate": 1.475869834743717e-05, + "loss": 0.0224, + "step": 20396 + }, + { + "epoch": 0.40796, + "grad_norm": 0.7602707743644714, + "learning_rate": 1.4757470265444627e-05, + "loss": 0.1371, + "step": 20398 + }, + { + "epoch": 0.408, + "grad_norm": 1.287292242050171, + "learning_rate": 1.4756242090702756e-05, + "loss": 0.0235, + "step": 20400 + }, + { + "epoch": 0.40804, + "grad_norm": 1.1576226949691772, + "learning_rate": 1.4755013823235491e-05, + "loss": 0.0279, + "step": 20402 + }, + { + "epoch": 0.40808, + "grad_norm": 0.3035500943660736, + "learning_rate": 1.4753785463066785e-05, + "loss": 0.3422, + "step": 20404 + }, + { + "epoch": 0.40812, + "grad_norm": 0.213074192404747, + "learning_rate": 1.4752557010220585e-05, + "loss": 0.0157, + "step": 20406 + }, + { + "epoch": 0.40816, + "grad_norm": 5.150664806365967, + "learning_rate": 1.4751328464720842e-05, + "loss": 0.149, + "step": 20408 + }, + { + "epoch": 0.4082, + "grad_norm": 1.014341115951538, + "learning_rate": 1.47500998265915e-05, + "loss": 0.0515, + "step": 20410 + }, + { + "epoch": 0.40824, + "grad_norm": 0.07243596017360687, + "learning_rate": 1.4748871095856516e-05, + "loss": 0.0601, + "step": 20412 + }, + { + "epoch": 0.40828, + "grad_norm": 2.5268023014068604, + "learning_rate": 1.4747642272539848e-05, + "loss": 0.1071, + "step": 20414 + }, + { + "epoch": 0.40832, + "grad_norm": 0.07113788276910782, + "learning_rate": 1.4746413356665449e-05, + "loss": 0.06, + "step": 20416 + }, + { + "epoch": 0.40836, + "grad_norm": 3.764847755432129, + "learning_rate": 1.4745184348257278e-05, + "loss": 0.1875, + "step": 20418 + }, + { + "epoch": 0.4084, + "grad_norm": 0.8110201358795166, + "learning_rate": 1.4743955247339292e-05, + "loss": 0.021, + "step": 20420 + }, + { + "epoch": 0.40844, + "grad_norm": 8.872319221496582, + "learning_rate": 1.4742726053935461e-05, + "loss": 0.4706, + "step": 20422 + }, + { + "epoch": 0.40848, + "grad_norm": 0.10915534943342209, + "learning_rate": 1.4741496768069743e-05, + "loss": 0.0358, + "step": 20424 + }, + { + "epoch": 0.40852, + "grad_norm": 0.4860292673110962, + "learning_rate": 1.4740267389766104e-05, + "loss": 0.0156, + "step": 20426 + }, + { + "epoch": 0.40856, + "grad_norm": 0.28773432970046997, + "learning_rate": 1.4739037919048509e-05, + "loss": 0.0169, + "step": 20428 + }, + { + "epoch": 0.4086, + "grad_norm": 1.6885144710540771, + "learning_rate": 1.4737808355940932e-05, + "loss": 0.0272, + "step": 20430 + }, + { + "epoch": 0.40864, + "grad_norm": 0.06944509595632553, + "learning_rate": 1.4736578700467344e-05, + "loss": 0.0051, + "step": 20432 + }, + { + "epoch": 0.40868, + "grad_norm": 1.8452457189559937, + "learning_rate": 1.4735348952651712e-05, + "loss": 0.0471, + "step": 20434 + }, + { + "epoch": 0.40872, + "grad_norm": 0.8156838417053223, + "learning_rate": 1.4734119112518016e-05, + "loss": 0.0322, + "step": 20436 + }, + { + "epoch": 0.40876, + "grad_norm": 2.125274419784546, + "learning_rate": 1.4732889180090229e-05, + "loss": 0.0443, + "step": 20438 + }, + { + "epoch": 0.4088, + "grad_norm": 10.364171981811523, + "learning_rate": 1.4731659155392332e-05, + "loss": 0.4123, + "step": 20440 + }, + { + "epoch": 0.40884, + "grad_norm": 0.4966798424720764, + "learning_rate": 1.4730429038448306e-05, + "loss": 0.0103, + "step": 20442 + }, + { + "epoch": 0.40888, + "grad_norm": 5.314332485198975, + "learning_rate": 1.4729198829282127e-05, + "loss": 0.1795, + "step": 20444 + }, + { + "epoch": 0.40892, + "grad_norm": 0.01911291666328907, + "learning_rate": 1.4727968527917785e-05, + "loss": 0.0134, + "step": 20446 + }, + { + "epoch": 0.40896, + "grad_norm": 8.677035331726074, + "learning_rate": 1.472673813437926e-05, + "loss": 0.3022, + "step": 20448 + }, + { + "epoch": 0.409, + "grad_norm": 6.927994728088379, + "learning_rate": 1.4725507648690542e-05, + "loss": 0.164, + "step": 20450 + }, + { + "epoch": 0.40904, + "grad_norm": 0.35114815831184387, + "learning_rate": 1.4724277070875618e-05, + "loss": 0.0451, + "step": 20452 + }, + { + "epoch": 0.40908, + "grad_norm": 8.213900566101074, + "learning_rate": 1.4723046400958482e-05, + "loss": 0.181, + "step": 20454 + }, + { + "epoch": 0.40912, + "grad_norm": 0.11984271556138992, + "learning_rate": 1.4721815638963129e-05, + "loss": 0.0292, + "step": 20456 + }, + { + "epoch": 0.40916, + "grad_norm": 1.2116822004318237, + "learning_rate": 1.4720584784913544e-05, + "loss": 0.0492, + "step": 20458 + }, + { + "epoch": 0.4092, + "grad_norm": 0.01715029962360859, + "learning_rate": 1.4719353838833729e-05, + "loss": 0.0019, + "step": 20460 + }, + { + "epoch": 0.40924, + "grad_norm": 1.4131801128387451, + "learning_rate": 1.4718122800747681e-05, + "loss": 0.1712, + "step": 20462 + }, + { + "epoch": 0.40928, + "grad_norm": 0.42641380429267883, + "learning_rate": 1.4716891670679404e-05, + "loss": 0.0189, + "step": 20464 + }, + { + "epoch": 0.40932, + "grad_norm": 0.35201260447502136, + "learning_rate": 1.471566044865289e-05, + "loss": 0.0104, + "step": 20466 + }, + { + "epoch": 0.40936, + "grad_norm": 0.16613604128360748, + "learning_rate": 1.471442913469215e-05, + "loss": 0.0075, + "step": 20468 + }, + { + "epoch": 0.4094, + "grad_norm": 1.36626398563385, + "learning_rate": 1.4713197728821185e-05, + "loss": 0.0364, + "step": 20470 + }, + { + "epoch": 0.40944, + "grad_norm": 0.039711881428956985, + "learning_rate": 1.4711966231064008e-05, + "loss": 0.0157, + "step": 20472 + }, + { + "epoch": 0.40948, + "grad_norm": 0.31563079357147217, + "learning_rate": 1.471073464144462e-05, + "loss": 0.0113, + "step": 20474 + }, + { + "epoch": 0.40952, + "grad_norm": 4.449748992919922, + "learning_rate": 1.4709502959987036e-05, + "loss": 0.0842, + "step": 20476 + }, + { + "epoch": 0.40956, + "grad_norm": 0.05114997923374176, + "learning_rate": 1.4708271186715269e-05, + "loss": 0.0727, + "step": 20478 + }, + { + "epoch": 0.4096, + "grad_norm": 3.6119303703308105, + "learning_rate": 1.470703932165333e-05, + "loss": 0.0569, + "step": 20480 + }, + { + "epoch": 0.40964, + "grad_norm": 0.09045970439910889, + "learning_rate": 1.4705807364825234e-05, + "loss": 0.0808, + "step": 20482 + }, + { + "epoch": 0.40968, + "grad_norm": 0.6918383240699768, + "learning_rate": 1.4704575316255e-05, + "loss": 0.4417, + "step": 20484 + }, + { + "epoch": 0.40972, + "grad_norm": 0.028703223913908005, + "learning_rate": 1.470334317596665e-05, + "loss": 0.0477, + "step": 20486 + }, + { + "epoch": 0.40976, + "grad_norm": 11.025091171264648, + "learning_rate": 1.4702110943984203e-05, + "loss": 0.2535, + "step": 20488 + }, + { + "epoch": 0.4098, + "grad_norm": 0.2000122368335724, + "learning_rate": 1.4700878620331684e-05, + "loss": 0.0054, + "step": 20490 + }, + { + "epoch": 0.40984, + "grad_norm": 0.36667779088020325, + "learning_rate": 1.4699646205033115e-05, + "loss": 0.2726, + "step": 20492 + }, + { + "epoch": 0.40988, + "grad_norm": 0.8181225657463074, + "learning_rate": 1.4698413698112526e-05, + "loss": 0.0301, + "step": 20494 + }, + { + "epoch": 0.40992, + "grad_norm": 0.3953721523284912, + "learning_rate": 1.469718109959394e-05, + "loss": 0.1008, + "step": 20496 + }, + { + "epoch": 0.40996, + "grad_norm": 7.128571510314941, + "learning_rate": 1.4695948409501391e-05, + "loss": 0.2351, + "step": 20498 + }, + { + "epoch": 0.41, + "grad_norm": 3.4213149547576904, + "learning_rate": 1.469471562785891e-05, + "loss": 0.0962, + "step": 20500 + }, + { + "epoch": 0.41004, + "grad_norm": 0.3411567807197571, + "learning_rate": 1.4693482754690528e-05, + "loss": 0.0087, + "step": 20502 + }, + { + "epoch": 0.41008, + "grad_norm": 0.4995606243610382, + "learning_rate": 1.4692249790020288e-05, + "loss": 0.1989, + "step": 20504 + }, + { + "epoch": 0.41012, + "grad_norm": 10.740034103393555, + "learning_rate": 1.4691016733872221e-05, + "loss": 0.8832, + "step": 20506 + }, + { + "epoch": 0.41016, + "grad_norm": 0.9761050343513489, + "learning_rate": 1.4689783586270368e-05, + "loss": 0.0303, + "step": 20508 + }, + { + "epoch": 0.4102, + "grad_norm": 0.5653215646743774, + "learning_rate": 1.468855034723877e-05, + "loss": 0.0494, + "step": 20510 + }, + { + "epoch": 0.41024, + "grad_norm": 2.5624241828918457, + "learning_rate": 1.4687317016801468e-05, + "loss": 0.0641, + "step": 20512 + }, + { + "epoch": 0.41028, + "grad_norm": 12.909000396728516, + "learning_rate": 1.4686083594982507e-05, + "loss": 0.3726, + "step": 20514 + }, + { + "epoch": 0.41032, + "grad_norm": 1.4449940919876099, + "learning_rate": 1.4684850081805934e-05, + "loss": 0.0318, + "step": 20516 + }, + { + "epoch": 0.41036, + "grad_norm": 5.941878795623779, + "learning_rate": 1.4683616477295796e-05, + "loss": 0.147, + "step": 20518 + }, + { + "epoch": 0.4104, + "grad_norm": 0.3476545214653015, + "learning_rate": 1.4682382781476146e-05, + "loss": 0.0149, + "step": 20520 + }, + { + "epoch": 0.41044, + "grad_norm": 7.274043560028076, + "learning_rate": 1.4681148994371032e-05, + "loss": 0.329, + "step": 20522 + }, + { + "epoch": 0.41048, + "grad_norm": 0.13163933157920837, + "learning_rate": 1.4679915116004509e-05, + "loss": 0.0026, + "step": 20524 + }, + { + "epoch": 0.41052, + "grad_norm": 0.11267507076263428, + "learning_rate": 1.4678681146400628e-05, + "loss": 0.0125, + "step": 20526 + }, + { + "epoch": 0.41056, + "grad_norm": 0.22365887463092804, + "learning_rate": 1.4677447085583453e-05, + "loss": 0.0372, + "step": 20528 + }, + { + "epoch": 0.4106, + "grad_norm": 3.1908931732177734, + "learning_rate": 1.467621293357704e-05, + "loss": 0.0547, + "step": 20530 + }, + { + "epoch": 0.41064, + "grad_norm": 7.841202259063721, + "learning_rate": 1.4674978690405446e-05, + "loss": 0.6847, + "step": 20532 + }, + { + "epoch": 0.41068, + "grad_norm": 0.22998499870300293, + "learning_rate": 1.4673744356092736e-05, + "loss": 0.0234, + "step": 20534 + }, + { + "epoch": 0.41072, + "grad_norm": 1.4811245203018188, + "learning_rate": 1.4672509930662972e-05, + "loss": 0.0312, + "step": 20536 + }, + { + "epoch": 0.41076, + "grad_norm": 4.2034687995910645, + "learning_rate": 1.4671275414140222e-05, + "loss": 0.1097, + "step": 20538 + }, + { + "epoch": 0.4108, + "grad_norm": 0.05002928152680397, + "learning_rate": 1.4670040806548555e-05, + "loss": 0.0022, + "step": 20540 + }, + { + "epoch": 0.41084, + "grad_norm": 0.10615232586860657, + "learning_rate": 1.4668806107912035e-05, + "loss": 0.0027, + "step": 20542 + }, + { + "epoch": 0.41088, + "grad_norm": 0.0975232943892479, + "learning_rate": 1.4667571318254741e-05, + "loss": 0.0603, + "step": 20544 + }, + { + "epoch": 0.41092, + "grad_norm": 0.8081616163253784, + "learning_rate": 1.4666336437600738e-05, + "loss": 0.0254, + "step": 20546 + }, + { + "epoch": 0.41096, + "grad_norm": 1.1739654541015625, + "learning_rate": 1.4665101465974102e-05, + "loss": 0.0245, + "step": 20548 + }, + { + "epoch": 0.411, + "grad_norm": 0.5519691109657288, + "learning_rate": 1.4663866403398915e-05, + "loss": 0.0105, + "step": 20550 + }, + { + "epoch": 0.41104, + "grad_norm": 1.315009593963623, + "learning_rate": 1.4662631249899248e-05, + "loss": 0.0737, + "step": 20552 + }, + { + "epoch": 0.41108, + "grad_norm": 4.20556640625, + "learning_rate": 1.4661396005499186e-05, + "loss": 0.2436, + "step": 20554 + }, + { + "epoch": 0.41112, + "grad_norm": 0.16260233521461487, + "learning_rate": 1.466016067022281e-05, + "loss": 0.0099, + "step": 20556 + }, + { + "epoch": 0.41116, + "grad_norm": 0.7955337762832642, + "learning_rate": 1.4658925244094197e-05, + "loss": 0.0183, + "step": 20558 + }, + { + "epoch": 0.4112, + "grad_norm": 10.266300201416016, + "learning_rate": 1.4657689727137443e-05, + "loss": 0.409, + "step": 20560 + }, + { + "epoch": 0.41124, + "grad_norm": 1.8679064512252808, + "learning_rate": 1.4656454119376631e-05, + "loss": 0.7614, + "step": 20562 + }, + { + "epoch": 0.41128, + "grad_norm": 0.1181689202785492, + "learning_rate": 1.4655218420835846e-05, + "loss": 0.0067, + "step": 20564 + }, + { + "epoch": 0.41132, + "grad_norm": 1.9668501615524292, + "learning_rate": 1.4653982631539183e-05, + "loss": 0.0436, + "step": 20566 + }, + { + "epoch": 0.41136, + "grad_norm": 1.5348625183105469, + "learning_rate": 1.465274675151073e-05, + "loss": 0.0309, + "step": 20568 + }, + { + "epoch": 0.4114, + "grad_norm": 0.7575429081916809, + "learning_rate": 1.4651510780774585e-05, + "loss": 0.4432, + "step": 20570 + }, + { + "epoch": 0.41144, + "grad_norm": 0.03141963481903076, + "learning_rate": 1.4650274719354843e-05, + "loss": 0.0037, + "step": 20572 + }, + { + "epoch": 0.41148, + "grad_norm": 0.19160567224025726, + "learning_rate": 1.46490385672756e-05, + "loss": 0.1665, + "step": 20574 + }, + { + "epoch": 0.41152, + "grad_norm": 4.934004306793213, + "learning_rate": 1.4647802324560956e-05, + "loss": 0.1197, + "step": 20576 + }, + { + "epoch": 0.41156, + "grad_norm": 0.15764977037906647, + "learning_rate": 1.4646565991235015e-05, + "loss": 0.02, + "step": 20578 + }, + { + "epoch": 0.4116, + "grad_norm": 0.0348866768181324, + "learning_rate": 1.464532956732188e-05, + "loss": 0.1765, + "step": 20580 + }, + { + "epoch": 0.41164, + "grad_norm": 2.5932223796844482, + "learning_rate": 1.4644093052845649e-05, + "loss": 0.0654, + "step": 20582 + }, + { + "epoch": 0.41168, + "grad_norm": 1.2088121175765991, + "learning_rate": 1.4642856447830434e-05, + "loss": 0.0345, + "step": 20584 + }, + { + "epoch": 0.41172, + "grad_norm": 0.31541329622268677, + "learning_rate": 1.4641619752300343e-05, + "loss": 0.0124, + "step": 20586 + }, + { + "epoch": 0.41176, + "grad_norm": 1.80655038356781, + "learning_rate": 1.4640382966279484e-05, + "loss": 0.2369, + "step": 20588 + }, + { + "epoch": 0.4118, + "grad_norm": 4.958369731903076, + "learning_rate": 1.4639146089791972e-05, + "loss": 0.1496, + "step": 20590 + }, + { + "epoch": 0.41184, + "grad_norm": 0.49858126044273376, + "learning_rate": 1.4637909122861918e-05, + "loss": 0.0268, + "step": 20592 + }, + { + "epoch": 0.41188, + "grad_norm": 3.5693461894989014, + "learning_rate": 1.4636672065513435e-05, + "loss": 0.0912, + "step": 20594 + }, + { + "epoch": 0.41192, + "grad_norm": 3.309270143508911, + "learning_rate": 1.4635434917770648e-05, + "loss": 0.0727, + "step": 20596 + }, + { + "epoch": 0.41196, + "grad_norm": 0.12697052955627441, + "learning_rate": 1.463419767965767e-05, + "loss": 0.3945, + "step": 20598 + }, + { + "epoch": 0.412, + "grad_norm": 2.302178144454956, + "learning_rate": 1.463296035119862e-05, + "loss": 0.5502, + "step": 20600 + }, + { + "epoch": 0.41204, + "grad_norm": 0.679383397102356, + "learning_rate": 1.4631722932417622e-05, + "loss": 0.1175, + "step": 20602 + }, + { + "epoch": 0.41208, + "grad_norm": 11.099163055419922, + "learning_rate": 1.4630485423338802e-05, + "loss": 0.3761, + "step": 20604 + }, + { + "epoch": 0.41212, + "grad_norm": 1.5973944664001465, + "learning_rate": 1.4629247823986286e-05, + "loss": 0.2529, + "step": 20606 + }, + { + "epoch": 0.41216, + "grad_norm": 3.0365705490112305, + "learning_rate": 1.4628010134384198e-05, + "loss": 0.0765, + "step": 20608 + }, + { + "epoch": 0.4122, + "grad_norm": 2.085017204284668, + "learning_rate": 1.462677235455667e-05, + "loss": 0.0636, + "step": 20610 + }, + { + "epoch": 0.41224, + "grad_norm": 1.1248310804367065, + "learning_rate": 1.4625534484527837e-05, + "loss": 0.0482, + "step": 20612 + }, + { + "epoch": 0.41228, + "grad_norm": 0.12561309337615967, + "learning_rate": 1.4624296524321826e-05, + "loss": 0.0322, + "step": 20614 + }, + { + "epoch": 0.41232, + "grad_norm": 2.9413506984710693, + "learning_rate": 1.462305847396277e-05, + "loss": 0.0692, + "step": 20616 + }, + { + "epoch": 0.41236, + "grad_norm": 0.4263097047805786, + "learning_rate": 1.4621820333474812e-05, + "loss": 0.011, + "step": 20618 + }, + { + "epoch": 0.4124, + "grad_norm": 6.220742702484131, + "learning_rate": 1.4620582102882088e-05, + "loss": 0.2197, + "step": 20620 + }, + { + "epoch": 0.41244, + "grad_norm": 0.16307295858860016, + "learning_rate": 1.4619343782208735e-05, + "loss": 0.0131, + "step": 20622 + }, + { + "epoch": 0.41248, + "grad_norm": 3.0907399654388428, + "learning_rate": 1.4618105371478896e-05, + "loss": 0.0983, + "step": 20624 + }, + { + "epoch": 0.41252, + "grad_norm": 7.371023654937744, + "learning_rate": 1.4616866870716714e-05, + "loss": 0.4535, + "step": 20626 + }, + { + "epoch": 0.41256, + "grad_norm": 4.923431873321533, + "learning_rate": 1.4615628279946342e-05, + "loss": 0.1405, + "step": 20628 + }, + { + "epoch": 0.4126, + "grad_norm": 0.41451284289360046, + "learning_rate": 1.4614389599191917e-05, + "loss": 0.0958, + "step": 20630 + }, + { + "epoch": 0.41264, + "grad_norm": 0.9513950943946838, + "learning_rate": 1.4613150828477589e-05, + "loss": 0.0379, + "step": 20632 + }, + { + "epoch": 0.41268, + "grad_norm": 0.1319998800754547, + "learning_rate": 1.4611911967827513e-05, + "loss": 0.0365, + "step": 20634 + }, + { + "epoch": 0.41272, + "grad_norm": 1.9522920846939087, + "learning_rate": 1.461067301726584e-05, + "loss": 0.0509, + "step": 20636 + }, + { + "epoch": 0.41276, + "grad_norm": 1.6309096813201904, + "learning_rate": 1.4609433976816717e-05, + "loss": 0.0332, + "step": 20638 + }, + { + "epoch": 0.4128, + "grad_norm": 0.8555678129196167, + "learning_rate": 1.4608194846504311e-05, + "loss": 0.0981, + "step": 20640 + }, + { + "epoch": 0.41284, + "grad_norm": 0.06223054975271225, + "learning_rate": 1.460695562635277e-05, + "loss": 0.007, + "step": 20642 + }, + { + "epoch": 0.41288, + "grad_norm": 0.7032274007797241, + "learning_rate": 1.4605716316386262e-05, + "loss": 0.1311, + "step": 20644 + }, + { + "epoch": 0.41292, + "grad_norm": 6.4907636642456055, + "learning_rate": 1.460447691662894e-05, + "loss": 0.4018, + "step": 20646 + }, + { + "epoch": 0.41296, + "grad_norm": 1.412866473197937, + "learning_rate": 1.460323742710497e-05, + "loss": 0.168, + "step": 20648 + }, + { + "epoch": 0.413, + "grad_norm": 0.4197157919406891, + "learning_rate": 1.4601997847838518e-05, + "loss": 0.0159, + "step": 20650 + }, + { + "epoch": 0.41304, + "grad_norm": 1.0664288997650146, + "learning_rate": 1.4600758178853749e-05, + "loss": 0.1621, + "step": 20652 + }, + { + "epoch": 0.41308, + "grad_norm": 3.154805898666382, + "learning_rate": 1.459951842017483e-05, + "loss": 0.0919, + "step": 20654 + }, + { + "epoch": 0.41312, + "grad_norm": 0.6699068546295166, + "learning_rate": 1.459827857182593e-05, + "loss": 0.1634, + "step": 20656 + }, + { + "epoch": 0.41316, + "grad_norm": 0.45389923453330994, + "learning_rate": 1.4597038633831222e-05, + "loss": 0.0224, + "step": 20658 + }, + { + "epoch": 0.4132, + "grad_norm": 6.807598114013672, + "learning_rate": 1.4595798606214882e-05, + "loss": 0.3739, + "step": 20660 + }, + { + "epoch": 0.41324, + "grad_norm": 1.2302683591842651, + "learning_rate": 1.4594558489001078e-05, + "loss": 0.0216, + "step": 20662 + }, + { + "epoch": 0.41328, + "grad_norm": 0.24366706609725952, + "learning_rate": 1.4593318282213992e-05, + "loss": 0.0511, + "step": 20664 + }, + { + "epoch": 0.41332, + "grad_norm": 6.0141496658325195, + "learning_rate": 1.4592077985877804e-05, + "loss": 0.3424, + "step": 20666 + }, + { + "epoch": 0.41336, + "grad_norm": 0.4361632168292999, + "learning_rate": 1.459083760001669e-05, + "loss": 0.008, + "step": 20668 + }, + { + "epoch": 0.4134, + "grad_norm": 0.042820610105991364, + "learning_rate": 1.4589597124654834e-05, + "loss": 0.0159, + "step": 20670 + }, + { + "epoch": 0.41344, + "grad_norm": 0.11829710751771927, + "learning_rate": 1.4588356559816417e-05, + "loss": 0.0055, + "step": 20672 + }, + { + "epoch": 0.41348, + "grad_norm": 5.2649993896484375, + "learning_rate": 1.4587115905525626e-05, + "loss": 0.1962, + "step": 20674 + }, + { + "epoch": 0.41352, + "grad_norm": 0.34772661328315735, + "learning_rate": 1.4585875161806653e-05, + "loss": 0.0092, + "step": 20676 + }, + { + "epoch": 0.41356, + "grad_norm": 1.8344801664352417, + "learning_rate": 1.458463432868368e-05, + "loss": 0.0418, + "step": 20678 + }, + { + "epoch": 0.4136, + "grad_norm": 3.409769296646118, + "learning_rate": 1.4583393406180898e-05, + "loss": 0.1523, + "step": 20680 + }, + { + "epoch": 0.41364, + "grad_norm": 0.11776147037744522, + "learning_rate": 1.4582152394322507e-05, + "loss": 0.0238, + "step": 20682 + }, + { + "epoch": 0.41368, + "grad_norm": 2.268002510070801, + "learning_rate": 1.4580911293132693e-05, + "loss": 0.0665, + "step": 20684 + }, + { + "epoch": 0.41372, + "grad_norm": 1.119387149810791, + "learning_rate": 1.4579670102635656e-05, + "loss": 0.0204, + "step": 20686 + }, + { + "epoch": 0.41376, + "grad_norm": 0.7859762907028198, + "learning_rate": 1.4578428822855592e-05, + "loss": 0.0138, + "step": 20688 + }, + { + "epoch": 0.4138, + "grad_norm": 0.40945637226104736, + "learning_rate": 1.4577187453816702e-05, + "loss": 0.0203, + "step": 20690 + }, + { + "epoch": 0.41384, + "grad_norm": 0.13437698781490326, + "learning_rate": 1.4575945995543183e-05, + "loss": 0.0066, + "step": 20692 + }, + { + "epoch": 0.41388, + "grad_norm": 1.5293508768081665, + "learning_rate": 1.4574704448059245e-05, + "loss": 0.0316, + "step": 20694 + }, + { + "epoch": 0.41392, + "grad_norm": 3.829540967941284, + "learning_rate": 1.4573462811389087e-05, + "loss": 0.1488, + "step": 20696 + }, + { + "epoch": 0.41396, + "grad_norm": 0.5369614958763123, + "learning_rate": 1.4572221085556918e-05, + "loss": 0.0121, + "step": 20698 + }, + { + "epoch": 0.414, + "grad_norm": 0.5963718891143799, + "learning_rate": 1.4570979270586944e-05, + "loss": 0.0556, + "step": 20700 + }, + { + "epoch": 0.41404, + "grad_norm": 0.653486430644989, + "learning_rate": 1.4569737366503375e-05, + "loss": 0.0443, + "step": 20702 + }, + { + "epoch": 0.41408, + "grad_norm": 7.316303730010986, + "learning_rate": 1.4568495373330425e-05, + "loss": 0.2656, + "step": 20704 + }, + { + "epoch": 0.41412, + "grad_norm": 0.19683335721492767, + "learning_rate": 1.4567253291092303e-05, + "loss": 0.0514, + "step": 20706 + }, + { + "epoch": 0.41416, + "grad_norm": 3.945401906967163, + "learning_rate": 1.4566011119813228e-05, + "loss": 0.0988, + "step": 20708 + }, + { + "epoch": 0.4142, + "grad_norm": 6.361738681793213, + "learning_rate": 1.4564768859517417e-05, + "loss": 0.1496, + "step": 20710 + }, + { + "epoch": 0.41424, + "grad_norm": 0.2617712616920471, + "learning_rate": 1.4563526510229085e-05, + "loss": 0.0132, + "step": 20712 + }, + { + "epoch": 0.41428, + "grad_norm": 1.9353983402252197, + "learning_rate": 1.4562284071972455e-05, + "loss": 0.0557, + "step": 20714 + }, + { + "epoch": 0.41432, + "grad_norm": 0.17189601063728333, + "learning_rate": 1.456104154477175e-05, + "loss": 0.1529, + "step": 20716 + }, + { + "epoch": 0.41436, + "grad_norm": 1.6509426832199097, + "learning_rate": 1.4559798928651189e-05, + "loss": 0.2614, + "step": 20718 + }, + { + "epoch": 0.4144, + "grad_norm": 9.893117904663086, + "learning_rate": 1.4558556223635004e-05, + "loss": 0.4708, + "step": 20720 + }, + { + "epoch": 0.41444, + "grad_norm": 8.503486633300781, + "learning_rate": 1.4557313429747413e-05, + "loss": 0.4522, + "step": 20722 + }, + { + "epoch": 0.41448, + "grad_norm": 0.7999789118766785, + "learning_rate": 1.4556070547012653e-05, + "loss": 0.0166, + "step": 20724 + }, + { + "epoch": 0.41452, + "grad_norm": 0.6891396045684814, + "learning_rate": 1.4554827575454956e-05, + "loss": 0.2843, + "step": 20726 + }, + { + "epoch": 0.41456, + "grad_norm": 8.433758735656738, + "learning_rate": 1.4553584515098545e-05, + "loss": 0.6092, + "step": 20728 + }, + { + "epoch": 0.4146, + "grad_norm": 1.3510788679122925, + "learning_rate": 1.455234136596766e-05, + "loss": 0.0483, + "step": 20730 + }, + { + "epoch": 0.41464, + "grad_norm": 0.19801117479801178, + "learning_rate": 1.4551098128086538e-05, + "loss": 0.0175, + "step": 20732 + }, + { + "epoch": 0.41468, + "grad_norm": 0.17651773989200592, + "learning_rate": 1.4549854801479416e-05, + "loss": 0.0082, + "step": 20734 + }, + { + "epoch": 0.41472, + "grad_norm": 0.5609853863716125, + "learning_rate": 1.4548611386170532e-05, + "loss": 0.008, + "step": 20736 + }, + { + "epoch": 0.41476, + "grad_norm": 0.6358975768089294, + "learning_rate": 1.4547367882184125e-05, + "loss": 0.0285, + "step": 20738 + }, + { + "epoch": 0.4148, + "grad_norm": 4.902917861938477, + "learning_rate": 1.454612428954444e-05, + "loss": 0.2784, + "step": 20740 + }, + { + "epoch": 0.41484, + "grad_norm": 0.21305061876773834, + "learning_rate": 1.4544880608275724e-05, + "loss": 0.0327, + "step": 20742 + }, + { + "epoch": 0.41488, + "grad_norm": 0.2961067259311676, + "learning_rate": 1.454363683840222e-05, + "loss": 0.0075, + "step": 20744 + }, + { + "epoch": 0.41492, + "grad_norm": 0.5730262994766235, + "learning_rate": 1.4542392979948175e-05, + "loss": 0.1032, + "step": 20746 + }, + { + "epoch": 0.41496, + "grad_norm": 1.4526289701461792, + "learning_rate": 1.454114903293784e-05, + "loss": 0.0499, + "step": 20748 + }, + { + "epoch": 0.415, + "grad_norm": 0.4300333559513092, + "learning_rate": 1.4539904997395468e-05, + "loss": 0.0322, + "step": 20750 + }, + { + "epoch": 0.41504, + "grad_norm": 0.023481404408812523, + "learning_rate": 1.4538660873345312e-05, + "loss": 0.0272, + "step": 20752 + }, + { + "epoch": 0.41508, + "grad_norm": 1.96396005153656, + "learning_rate": 1.4537416660811625e-05, + "loss": 0.0922, + "step": 20754 + }, + { + "epoch": 0.41512, + "grad_norm": 4.4794769287109375, + "learning_rate": 1.4536172359818662e-05, + "loss": 0.1456, + "step": 20756 + }, + { + "epoch": 0.41516, + "grad_norm": 6.661200523376465, + "learning_rate": 1.4534927970390686e-05, + "loss": 0.3668, + "step": 20758 + }, + { + "epoch": 0.4152, + "grad_norm": 0.6190367341041565, + "learning_rate": 1.4533683492551954e-05, + "loss": 0.047, + "step": 20760 + }, + { + "epoch": 0.41524, + "grad_norm": 7.291167259216309, + "learning_rate": 1.4532438926326726e-05, + "loss": 0.2711, + "step": 20762 + }, + { + "epoch": 0.41528, + "grad_norm": 3.3469350337982178, + "learning_rate": 1.453119427173927e-05, + "loss": 0.0684, + "step": 20764 + }, + { + "epoch": 0.41532, + "grad_norm": 0.7275287508964539, + "learning_rate": 1.4529949528813848e-05, + "loss": 0.7435, + "step": 20766 + }, + { + "epoch": 0.41536, + "grad_norm": 0.419234961271286, + "learning_rate": 1.4528704697574729e-05, + "loss": 0.0189, + "step": 20768 + }, + { + "epoch": 0.4154, + "grad_norm": 0.18925714492797852, + "learning_rate": 1.452745977804618e-05, + "loss": 0.0677, + "step": 20770 + }, + { + "epoch": 0.41544, + "grad_norm": 0.5495632886886597, + "learning_rate": 1.4526214770252472e-05, + "loss": 0.0237, + "step": 20772 + }, + { + "epoch": 0.41548, + "grad_norm": 2.8311960697174072, + "learning_rate": 1.4524969674217874e-05, + "loss": 0.123, + "step": 20774 + }, + { + "epoch": 0.41552, + "grad_norm": 0.38749101758003235, + "learning_rate": 1.4523724489966666e-05, + "loss": 0.0886, + "step": 20776 + }, + { + "epoch": 0.41556, + "grad_norm": 1.6220570802688599, + "learning_rate": 1.4522479217523117e-05, + "loss": 0.0488, + "step": 20778 + }, + { + "epoch": 0.4156, + "grad_norm": 5.627616882324219, + "learning_rate": 1.4521233856911507e-05, + "loss": 0.2285, + "step": 20780 + }, + { + "epoch": 0.41564, + "grad_norm": 0.15273825824260712, + "learning_rate": 1.4519988408156121e-05, + "loss": 0.057, + "step": 20782 + }, + { + "epoch": 0.41568, + "grad_norm": 2.001547336578369, + "learning_rate": 1.4518742871281229e-05, + "loss": 0.0487, + "step": 20784 + }, + { + "epoch": 0.41572, + "grad_norm": 5.037659168243408, + "learning_rate": 1.4517497246311116e-05, + "loss": 0.2573, + "step": 20786 + }, + { + "epoch": 0.41576, + "grad_norm": 6.140831470489502, + "learning_rate": 1.4516251533270076e-05, + "loss": 0.1185, + "step": 20788 + }, + { + "epoch": 0.4158, + "grad_norm": 1.983172059059143, + "learning_rate": 1.4515005732182384e-05, + "loss": 0.0719, + "step": 20790 + }, + { + "epoch": 0.41584, + "grad_norm": 0.5419233441352844, + "learning_rate": 1.4513759843072327e-05, + "loss": 0.0128, + "step": 20792 + }, + { + "epoch": 0.41588, + "grad_norm": 0.5059753060340881, + "learning_rate": 1.4512513865964202e-05, + "loss": 0.0277, + "step": 20794 + }, + { + "epoch": 0.41592, + "grad_norm": 3.0805811882019043, + "learning_rate": 1.4511267800882294e-05, + "loss": 0.1402, + "step": 20796 + }, + { + "epoch": 0.41596, + "grad_norm": 2.8150951862335205, + "learning_rate": 1.4510021647850896e-05, + "loss": 0.1173, + "step": 20798 + }, + { + "epoch": 0.416, + "grad_norm": 3.4432897567749023, + "learning_rate": 1.4508775406894308e-05, + "loss": 0.1019, + "step": 20800 + }, + { + "epoch": 0.41604, + "grad_norm": 0.2465415596961975, + "learning_rate": 1.4507529078036822e-05, + "loss": 0.0091, + "step": 20802 + }, + { + "epoch": 0.41608, + "grad_norm": 4.344147682189941, + "learning_rate": 1.4506282661302735e-05, + "loss": 0.3774, + "step": 20804 + }, + { + "epoch": 0.41612, + "grad_norm": 1.3174333572387695, + "learning_rate": 1.4505036156716346e-05, + "loss": 0.041, + "step": 20806 + }, + { + "epoch": 0.41616, + "grad_norm": 8.791354179382324, + "learning_rate": 1.4503789564301962e-05, + "loss": 0.4957, + "step": 20808 + }, + { + "epoch": 0.4162, + "grad_norm": 0.1901395618915558, + "learning_rate": 1.4502542884083876e-05, + "loss": 0.008, + "step": 20810 + }, + { + "epoch": 0.41624, + "grad_norm": 0.7898779511451721, + "learning_rate": 1.4501296116086401e-05, + "loss": 0.0435, + "step": 20812 + }, + { + "epoch": 0.41628, + "grad_norm": 0.48358359932899475, + "learning_rate": 1.4500049260333842e-05, + "loss": 0.0222, + "step": 20814 + }, + { + "epoch": 0.41632, + "grad_norm": 0.43052855134010315, + "learning_rate": 1.4498802316850504e-05, + "loss": 0.0251, + "step": 20816 + }, + { + "epoch": 0.41636, + "grad_norm": 1.5615307092666626, + "learning_rate": 1.4497555285660701e-05, + "loss": 0.4485, + "step": 20818 + }, + { + "epoch": 0.4164, + "grad_norm": 0.3761162757873535, + "learning_rate": 1.449630816678874e-05, + "loss": 0.014, + "step": 20820 + }, + { + "epoch": 0.41644, + "grad_norm": 0.3952595293521881, + "learning_rate": 1.4495060960258938e-05, + "loss": 0.0164, + "step": 20822 + }, + { + "epoch": 0.41648, + "grad_norm": 10.045720100402832, + "learning_rate": 1.4493813666095609e-05, + "loss": 0.3375, + "step": 20824 + }, + { + "epoch": 0.41652, + "grad_norm": 1.7041940689086914, + "learning_rate": 1.4492566284323066e-05, + "loss": 0.0591, + "step": 20826 + }, + { + "epoch": 0.41656, + "grad_norm": 2.105421304702759, + "learning_rate": 1.449131881496563e-05, + "loss": 0.0597, + "step": 20828 + }, + { + "epoch": 0.4166, + "grad_norm": 0.16086561977863312, + "learning_rate": 1.4490071258047625e-05, + "loss": 0.0513, + "step": 20830 + }, + { + "epoch": 0.41664, + "grad_norm": 0.8576699495315552, + "learning_rate": 1.4488823613593368e-05, + "loss": 0.0596, + "step": 20832 + }, + { + "epoch": 0.41668, + "grad_norm": 5.444559097290039, + "learning_rate": 1.4487575881627182e-05, + "loss": 0.1867, + "step": 20834 + }, + { + "epoch": 0.41672, + "grad_norm": 8.540923118591309, + "learning_rate": 1.4486328062173394e-05, + "loss": 0.1407, + "step": 20836 + }, + { + "epoch": 0.41676, + "grad_norm": 1.5162941217422485, + "learning_rate": 1.4485080155256333e-05, + "loss": 0.1056, + "step": 20838 + }, + { + "epoch": 0.4168, + "grad_norm": 0.39432692527770996, + "learning_rate": 1.4483832160900326e-05, + "loss": 0.048, + "step": 20840 + }, + { + "epoch": 0.41684, + "grad_norm": 6.689739227294922, + "learning_rate": 1.44825840791297e-05, + "loss": 0.2763, + "step": 20842 + }, + { + "epoch": 0.41688, + "grad_norm": 0.11600014567375183, + "learning_rate": 1.448133590996879e-05, + "loss": 0.3756, + "step": 20844 + }, + { + "epoch": 0.41692, + "grad_norm": 1.0759398937225342, + "learning_rate": 1.4480087653441927e-05, + "loss": 0.0221, + "step": 20846 + }, + { + "epoch": 0.41696, + "grad_norm": 2.2419607639312744, + "learning_rate": 1.4478839309573454e-05, + "loss": 0.0647, + "step": 20848 + }, + { + "epoch": 0.417, + "grad_norm": 13.015746116638184, + "learning_rate": 1.4477590878387697e-05, + "loss": 0.4917, + "step": 20850 + }, + { + "epoch": 0.41704, + "grad_norm": 0.4847770631313324, + "learning_rate": 1.4476342359909004e-05, + "loss": 0.0237, + "step": 20852 + }, + { + "epoch": 0.41708, + "grad_norm": 0.5505168437957764, + "learning_rate": 1.4475093754161711e-05, + "loss": 0.1067, + "step": 20854 + }, + { + "epoch": 0.41712, + "grad_norm": 0.184028759598732, + "learning_rate": 1.4473845061170165e-05, + "loss": 0.009, + "step": 20856 + }, + { + "epoch": 0.41716, + "grad_norm": 0.35153618454933167, + "learning_rate": 1.4472596280958704e-05, + "loss": 0.0081, + "step": 20858 + }, + { + "epoch": 0.4172, + "grad_norm": 0.14959849417209625, + "learning_rate": 1.4471347413551673e-05, + "loss": 0.0186, + "step": 20860 + }, + { + "epoch": 0.41724, + "grad_norm": 0.8195196986198425, + "learning_rate": 1.4470098458973425e-05, + "loss": 0.062, + "step": 20862 + }, + { + "epoch": 0.41728, + "grad_norm": 1.0487048625946045, + "learning_rate": 1.4468849417248305e-05, + "loss": 0.0362, + "step": 20864 + }, + { + "epoch": 0.41732, + "grad_norm": 0.0677654966711998, + "learning_rate": 1.4467600288400665e-05, + "loss": 0.0078, + "step": 20866 + }, + { + "epoch": 0.41736, + "grad_norm": 0.4628746509552002, + "learning_rate": 1.4466351072454856e-05, + "loss": 0.0496, + "step": 20868 + }, + { + "epoch": 0.4174, + "grad_norm": 4.08014440536499, + "learning_rate": 1.4465101769435235e-05, + "loss": 0.424, + "step": 20870 + }, + { + "epoch": 0.41744, + "grad_norm": 4.379924774169922, + "learning_rate": 1.446385237936616e-05, + "loss": 0.1767, + "step": 20872 + }, + { + "epoch": 0.41748, + "grad_norm": 6.632184982299805, + "learning_rate": 1.446260290227198e-05, + "loss": 0.3181, + "step": 20874 + }, + { + "epoch": 0.41752, + "grad_norm": 0.36029019951820374, + "learning_rate": 1.446135333817706e-05, + "loss": 0.0655, + "step": 20876 + }, + { + "epoch": 0.41756, + "grad_norm": 0.3356572389602661, + "learning_rate": 1.4460103687105758e-05, + "loss": 0.0177, + "step": 20878 + }, + { + "epoch": 0.4176, + "grad_norm": 0.8914333581924438, + "learning_rate": 1.4458853949082443e-05, + "loss": 0.0482, + "step": 20880 + }, + { + "epoch": 0.41764, + "grad_norm": 0.06988471001386642, + "learning_rate": 1.445760412413147e-05, + "loss": 0.0204, + "step": 20882 + }, + { + "epoch": 0.41768, + "grad_norm": 0.7361657023429871, + "learning_rate": 1.445635421227721e-05, + "loss": 0.0309, + "step": 20884 + }, + { + "epoch": 0.41772, + "grad_norm": 0.4174771308898926, + "learning_rate": 1.445510421354403e-05, + "loss": 0.0162, + "step": 20886 + }, + { + "epoch": 0.41776, + "grad_norm": 3.9504387378692627, + "learning_rate": 1.4453854127956305e-05, + "loss": 0.1966, + "step": 20888 + }, + { + "epoch": 0.4178, + "grad_norm": 0.24165746569633484, + "learning_rate": 1.4452603955538397e-05, + "loss": 0.02, + "step": 20890 + }, + { + "epoch": 0.41784, + "grad_norm": 0.5650448203086853, + "learning_rate": 1.4451353696314683e-05, + "loss": 0.0374, + "step": 20892 + }, + { + "epoch": 0.41788, + "grad_norm": 2.704866409301758, + "learning_rate": 1.4450103350309536e-05, + "loss": 0.0956, + "step": 20894 + }, + { + "epoch": 0.41792, + "grad_norm": 1.718785285949707, + "learning_rate": 1.4448852917547336e-05, + "loss": 0.0563, + "step": 20896 + }, + { + "epoch": 0.41796, + "grad_norm": 0.47051653265953064, + "learning_rate": 1.4447602398052457e-05, + "loss": 0.0106, + "step": 20898 + }, + { + "epoch": 0.418, + "grad_norm": 0.19166041910648346, + "learning_rate": 1.4446351791849276e-05, + "loss": 0.007, + "step": 20900 + }, + { + "epoch": 0.41804, + "grad_norm": 0.20351430773735046, + "learning_rate": 1.444510109896218e-05, + "loss": 0.008, + "step": 20902 + }, + { + "epoch": 0.41808, + "grad_norm": 10.319317817687988, + "learning_rate": 1.4443850319415549e-05, + "loss": 0.5619, + "step": 20904 + }, + { + "epoch": 0.41812, + "grad_norm": 5.405810356140137, + "learning_rate": 1.444259945323377e-05, + "loss": 0.5083, + "step": 20906 + }, + { + "epoch": 0.41816, + "grad_norm": 0.18200142681598663, + "learning_rate": 1.4441348500441227e-05, + "loss": 0.0505, + "step": 20908 + }, + { + "epoch": 0.4182, + "grad_norm": 0.46438154578208923, + "learning_rate": 1.4440097461062308e-05, + "loss": 0.0116, + "step": 20910 + }, + { + "epoch": 0.41824, + "grad_norm": 0.6188514232635498, + "learning_rate": 1.4438846335121402e-05, + "loss": 0.0387, + "step": 20912 + }, + { + "epoch": 0.41828, + "grad_norm": 2.815110921859741, + "learning_rate": 1.4437595122642902e-05, + "loss": 0.106, + "step": 20914 + }, + { + "epoch": 0.41832, + "grad_norm": 0.3524959683418274, + "learning_rate": 1.44363438236512e-05, + "loss": 0.1252, + "step": 20916 + }, + { + "epoch": 0.41836, + "grad_norm": 1.3061641454696655, + "learning_rate": 1.4435092438170692e-05, + "loss": 0.1018, + "step": 20918 + }, + { + "epoch": 0.4184, + "grad_norm": 0.34870269894599915, + "learning_rate": 1.4433840966225772e-05, + "loss": 0.0421, + "step": 20920 + }, + { + "epoch": 0.41844, + "grad_norm": 1.4492168426513672, + "learning_rate": 1.4432589407840843e-05, + "loss": 0.0647, + "step": 20922 + }, + { + "epoch": 0.41848, + "grad_norm": 0.1799827367067337, + "learning_rate": 1.4431337763040298e-05, + "loss": 0.0109, + "step": 20924 + }, + { + "epoch": 0.41852, + "grad_norm": 4.2852606773376465, + "learning_rate": 1.4430086031848545e-05, + "loss": 0.2837, + "step": 20926 + }, + { + "epoch": 0.41856, + "grad_norm": 1.4303083419799805, + "learning_rate": 1.4428834214289982e-05, + "loss": 0.0508, + "step": 20928 + }, + { + "epoch": 0.4186, + "grad_norm": 0.3398796617984772, + "learning_rate": 1.442758231038902e-05, + "loss": 0.1035, + "step": 20930 + }, + { + "epoch": 0.41864, + "grad_norm": 0.3706623911857605, + "learning_rate": 1.4426330320170055e-05, + "loss": 0.1207, + "step": 20932 + }, + { + "epoch": 0.41868, + "grad_norm": 4.781558036804199, + "learning_rate": 1.4425078243657504e-05, + "loss": 0.1903, + "step": 20934 + }, + { + "epoch": 0.41872, + "grad_norm": 0.9082777500152588, + "learning_rate": 1.4423826080875773e-05, + "loss": 0.0288, + "step": 20936 + }, + { + "epoch": 0.41876, + "grad_norm": 6.200571060180664, + "learning_rate": 1.4422573831849278e-05, + "loss": 0.7846, + "step": 20938 + }, + { + "epoch": 0.4188, + "grad_norm": 1.3461445569992065, + "learning_rate": 1.4421321496602428e-05, + "loss": 0.0278, + "step": 20940 + }, + { + "epoch": 0.41884, + "grad_norm": 1.037987232208252, + "learning_rate": 1.442006907515964e-05, + "loss": 0.1974, + "step": 20942 + }, + { + "epoch": 0.41888, + "grad_norm": 2.0008904933929443, + "learning_rate": 1.4418816567545328e-05, + "loss": 0.0583, + "step": 20944 + }, + { + "epoch": 0.41892, + "grad_norm": 6.681247234344482, + "learning_rate": 1.4417563973783914e-05, + "loss": 0.2536, + "step": 20946 + }, + { + "epoch": 0.41896, + "grad_norm": 0.5391479730606079, + "learning_rate": 1.4416311293899816e-05, + "loss": 0.0157, + "step": 20948 + }, + { + "epoch": 0.419, + "grad_norm": 10.445176124572754, + "learning_rate": 1.4415058527917454e-05, + "loss": 0.3884, + "step": 20950 + }, + { + "epoch": 0.41904, + "grad_norm": 0.17170917987823486, + "learning_rate": 1.4413805675861252e-05, + "loss": 0.0309, + "step": 20952 + }, + { + "epoch": 0.41908, + "grad_norm": 4.253650665283203, + "learning_rate": 1.4412552737755641e-05, + "loss": 0.1984, + "step": 20954 + }, + { + "epoch": 0.41912, + "grad_norm": 5.2246294021606445, + "learning_rate": 1.4411299713625037e-05, + "loss": 0.2055, + "step": 20956 + }, + { + "epoch": 0.41916, + "grad_norm": 2.177748441696167, + "learning_rate": 1.4410046603493877e-05, + "loss": 0.0711, + "step": 20958 + }, + { + "epoch": 0.4192, + "grad_norm": 0.2557746171951294, + "learning_rate": 1.4408793407386587e-05, + "loss": 0.0516, + "step": 20960 + }, + { + "epoch": 0.41924, + "grad_norm": 0.31493502855300903, + "learning_rate": 1.4407540125327601e-05, + "loss": 0.2894, + "step": 20962 + }, + { + "epoch": 0.41928, + "grad_norm": 0.1223224475979805, + "learning_rate": 1.440628675734135e-05, + "loss": 0.0072, + "step": 20964 + }, + { + "epoch": 0.41932, + "grad_norm": 7.953594207763672, + "learning_rate": 1.440503330345227e-05, + "loss": 0.2939, + "step": 20966 + }, + { + "epoch": 0.41936, + "grad_norm": 1.9923067092895508, + "learning_rate": 1.4403779763684797e-05, + "loss": 0.0621, + "step": 20968 + }, + { + "epoch": 0.4194, + "grad_norm": 2.1618173122406006, + "learning_rate": 1.4402526138063373e-05, + "loss": 0.0602, + "step": 20970 + }, + { + "epoch": 0.41944, + "grad_norm": 3.1657817363739014, + "learning_rate": 1.4401272426612432e-05, + "loss": 0.346, + "step": 20972 + }, + { + "epoch": 0.41948, + "grad_norm": 0.16808032989501953, + "learning_rate": 1.440001862935642e-05, + "loss": 0.0379, + "step": 20974 + }, + { + "epoch": 0.41952, + "grad_norm": 2.6337382793426514, + "learning_rate": 1.4398764746319783e-05, + "loss": 0.1256, + "step": 20976 + }, + { + "epoch": 0.41956, + "grad_norm": 0.22046241164207458, + "learning_rate": 1.439751077752696e-05, + "loss": 0.0699, + "step": 20978 + }, + { + "epoch": 0.4196, + "grad_norm": 3.6300032138824463, + "learning_rate": 1.43962567230024e-05, + "loss": 0.0744, + "step": 20980 + }, + { + "epoch": 0.41964, + "grad_norm": 0.3867838680744171, + "learning_rate": 1.4395002582770552e-05, + "loss": 0.3814, + "step": 20982 + }, + { + "epoch": 0.41968, + "grad_norm": 1.5958555936813354, + "learning_rate": 1.4393748356855865e-05, + "loss": 0.1383, + "step": 20984 + }, + { + "epoch": 0.41972, + "grad_norm": 0.6414316892623901, + "learning_rate": 1.4392494045282797e-05, + "loss": 0.0164, + "step": 20986 + }, + { + "epoch": 0.41976, + "grad_norm": 3.756026268005371, + "learning_rate": 1.439123964807579e-05, + "loss": 0.1982, + "step": 20988 + }, + { + "epoch": 0.4198, + "grad_norm": 2.255112648010254, + "learning_rate": 1.4389985165259308e-05, + "loss": 0.1024, + "step": 20990 + }, + { + "epoch": 0.41984, + "grad_norm": 0.42729151248931885, + "learning_rate": 1.4388730596857803e-05, + "loss": 0.0137, + "step": 20992 + }, + { + "epoch": 0.41988, + "grad_norm": 0.05234821140766144, + "learning_rate": 1.4387475942895739e-05, + "loss": 0.0535, + "step": 20994 + }, + { + "epoch": 0.41992, + "grad_norm": 4.0476555824279785, + "learning_rate": 1.438622120339757e-05, + "loss": 0.1961, + "step": 20996 + }, + { + "epoch": 0.41996, + "grad_norm": 0.8334672451019287, + "learning_rate": 1.4384966378387762e-05, + "loss": 0.03, + "step": 20998 + }, + { + "epoch": 0.42, + "grad_norm": 0.7605409622192383, + "learning_rate": 1.4383711467890776e-05, + "loss": 0.0982, + "step": 21000 + }, + { + "epoch": 0.42004, + "grad_norm": 0.9089729189872742, + "learning_rate": 1.4382456471931077e-05, + "loss": 0.1456, + "step": 21002 + }, + { + "epoch": 0.42008, + "grad_norm": 3.714945077896118, + "learning_rate": 1.4381201390533134e-05, + "loss": 0.1754, + "step": 21004 + }, + { + "epoch": 0.42012, + "grad_norm": 0.2614938020706177, + "learning_rate": 1.4379946223721415e-05, + "loss": 0.0947, + "step": 21006 + }, + { + "epoch": 0.42016, + "grad_norm": 0.2557194232940674, + "learning_rate": 1.4378690971520388e-05, + "loss": 0.0402, + "step": 21008 + }, + { + "epoch": 0.4202, + "grad_norm": 0.08807345479726791, + "learning_rate": 1.4377435633954528e-05, + "loss": 0.014, + "step": 21010 + }, + { + "epoch": 0.42024, + "grad_norm": 0.044121649116277695, + "learning_rate": 1.4376180211048305e-05, + "loss": 0.0592, + "step": 21012 + }, + { + "epoch": 0.42028, + "grad_norm": 0.965806245803833, + "learning_rate": 1.4374924702826199e-05, + "loss": 0.0286, + "step": 21014 + }, + { + "epoch": 0.42032, + "grad_norm": 0.28609687089920044, + "learning_rate": 1.437366910931268e-05, + "loss": 0.0528, + "step": 21016 + }, + { + "epoch": 0.42036, + "grad_norm": 0.1342650055885315, + "learning_rate": 1.4372413430532231e-05, + "loss": 0.0745, + "step": 21018 + }, + { + "epoch": 0.4204, + "grad_norm": 0.22506046295166016, + "learning_rate": 1.437115766650933e-05, + "loss": 0.0237, + "step": 21020 + }, + { + "epoch": 0.42044, + "grad_norm": 0.055640771985054016, + "learning_rate": 1.436990181726846e-05, + "loss": 0.0974, + "step": 21022 + }, + { + "epoch": 0.42048, + "grad_norm": 0.8441785573959351, + "learning_rate": 1.4368645882834104e-05, + "loss": 0.0506, + "step": 21024 + }, + { + "epoch": 0.42052, + "grad_norm": 0.12662677466869354, + "learning_rate": 1.4367389863230749e-05, + "loss": 0.0607, + "step": 21026 + }, + { + "epoch": 0.42056, + "grad_norm": 0.35263559222221375, + "learning_rate": 1.4366133758482881e-05, + "loss": 0.019, + "step": 21028 + }, + { + "epoch": 0.4206, + "grad_norm": 0.5650438666343689, + "learning_rate": 1.436487756861499e-05, + "loss": 0.0144, + "step": 21030 + }, + { + "epoch": 0.42064, + "grad_norm": 0.09568748623132706, + "learning_rate": 1.4363621293651557e-05, + "loss": 0.0541, + "step": 21032 + }, + { + "epoch": 0.42068, + "grad_norm": 0.21715250611305237, + "learning_rate": 1.4362364933617083e-05, + "loss": 0.0682, + "step": 21034 + }, + { + "epoch": 0.42072, + "grad_norm": 1.096156120300293, + "learning_rate": 1.436110848853606e-05, + "loss": 0.0307, + "step": 21036 + }, + { + "epoch": 0.42076, + "grad_norm": 0.050642818212509155, + "learning_rate": 1.4359851958432983e-05, + "loss": 0.002, + "step": 21038 + }, + { + "epoch": 0.4208, + "grad_norm": 4.765760898590088, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.1239, + "step": 21040 + }, + { + "epoch": 0.42084, + "grad_norm": 0.19872090220451355, + "learning_rate": 1.4357338643258643e-05, + "loss": 0.0049, + "step": 21042 + }, + { + "epoch": 0.42088, + "grad_norm": 1.9533100128173828, + "learning_rate": 1.4356081858236385e-05, + "loss": 0.0486, + "step": 21044 + }, + { + "epoch": 0.42092, + "grad_norm": 0.009902294725179672, + "learning_rate": 1.435482498829007e-05, + "loss": 0.0422, + "step": 21046 + }, + { + "epoch": 0.42096, + "grad_norm": 8.695076942443848, + "learning_rate": 1.4353568033444198e-05, + "loss": 0.151, + "step": 21048 + }, + { + "epoch": 0.421, + "grad_norm": 1.1114933490753174, + "learning_rate": 1.4352310993723277e-05, + "loss": 0.0429, + "step": 21050 + }, + { + "epoch": 0.42104, + "grad_norm": 0.6717561483383179, + "learning_rate": 1.4351053869151812e-05, + "loss": 0.0152, + "step": 21052 + }, + { + "epoch": 0.42108, + "grad_norm": 6.936482906341553, + "learning_rate": 1.4349796659754312e-05, + "loss": 0.3204, + "step": 21054 + }, + { + "epoch": 0.42112, + "grad_norm": 0.000862398708704859, + "learning_rate": 1.4348539365555283e-05, + "loss": 0.017, + "step": 21056 + }, + { + "epoch": 0.42116, + "grad_norm": 0.19534771144390106, + "learning_rate": 1.4347281986579242e-05, + "loss": 0.1102, + "step": 21058 + }, + { + "epoch": 0.4212, + "grad_norm": 2.3930091857910156, + "learning_rate": 1.4346024522850704e-05, + "loss": 0.2992, + "step": 21060 + }, + { + "epoch": 0.42124, + "grad_norm": 5.040376663208008, + "learning_rate": 1.4344766974394177e-05, + "loss": 0.1325, + "step": 21062 + }, + { + "epoch": 0.42128, + "grad_norm": 0.19376565515995026, + "learning_rate": 1.4343509341234186e-05, + "loss": 0.0039, + "step": 21064 + }, + { + "epoch": 0.42132, + "grad_norm": 0.7717118859291077, + "learning_rate": 1.4342251623395239e-05, + "loss": 0.0144, + "step": 21066 + }, + { + "epoch": 0.42136, + "grad_norm": 1.16474449634552, + "learning_rate": 1.4340993820901864e-05, + "loss": 0.0289, + "step": 21068 + }, + { + "epoch": 0.4214, + "grad_norm": 1.4993611574172974, + "learning_rate": 1.4339735933778576e-05, + "loss": 0.024, + "step": 21070 + }, + { + "epoch": 0.42144, + "grad_norm": 0.5253115892410278, + "learning_rate": 1.4338477962049903e-05, + "loss": 0.0189, + "step": 21072 + }, + { + "epoch": 0.42148, + "grad_norm": 0.2586051821708679, + "learning_rate": 1.433721990574037e-05, + "loss": 0.361, + "step": 21074 + }, + { + "epoch": 0.42152, + "grad_norm": 0.6900949478149414, + "learning_rate": 1.4335961764874502e-05, + "loss": 0.0176, + "step": 21076 + }, + { + "epoch": 0.42156, + "grad_norm": 0.10626737028360367, + "learning_rate": 1.4334703539476826e-05, + "loss": 0.1773, + "step": 21078 + }, + { + "epoch": 0.4216, + "grad_norm": 6.707786560058594, + "learning_rate": 1.4333445229571874e-05, + "loss": 0.4264, + "step": 21080 + }, + { + "epoch": 0.42164, + "grad_norm": 0.26821717619895935, + "learning_rate": 1.4332186835184174e-05, + "loss": 0.0837, + "step": 21082 + }, + { + "epoch": 0.42168, + "grad_norm": 0.7980112433433533, + "learning_rate": 1.4330928356338265e-05, + "loss": 0.0197, + "step": 21084 + }, + { + "epoch": 0.42172, + "grad_norm": 2.988243341445923, + "learning_rate": 1.4329669793058676e-05, + "loss": 0.06, + "step": 21086 + }, + { + "epoch": 0.42176, + "grad_norm": 2.4509634971618652, + "learning_rate": 1.4328411145369942e-05, + "loss": 0.0558, + "step": 21088 + }, + { + "epoch": 0.4218, + "grad_norm": 3.670501232147217, + "learning_rate": 1.4327152413296607e-05, + "loss": 0.3728, + "step": 21090 + }, + { + "epoch": 0.42184, + "grad_norm": 3.044402837753296, + "learning_rate": 1.432589359686321e-05, + "loss": 0.1305, + "step": 21092 + }, + { + "epoch": 0.42188, + "grad_norm": 0.3555263578891754, + "learning_rate": 1.4324634696094288e-05, + "loss": 0.0242, + "step": 21094 + }, + { + "epoch": 0.42192, + "grad_norm": 0.4786948263645172, + "learning_rate": 1.4323375711014386e-05, + "loss": 0.1251, + "step": 21096 + }, + { + "epoch": 0.42196, + "grad_norm": 0.09610778838396072, + "learning_rate": 1.4322116641648052e-05, + "loss": 0.0438, + "step": 21098 + }, + { + "epoch": 0.422, + "grad_norm": 0.04625485837459564, + "learning_rate": 1.4320857488019826e-05, + "loss": 0.0277, + "step": 21100 + }, + { + "epoch": 0.42204, + "grad_norm": 0.37294793128967285, + "learning_rate": 1.4319598250154258e-05, + "loss": 0.0598, + "step": 21102 + }, + { + "epoch": 0.42208, + "grad_norm": 4.1445159912109375, + "learning_rate": 1.43183389280759e-05, + "loss": 0.3995, + "step": 21104 + }, + { + "epoch": 0.42212, + "grad_norm": 0.04052822291851044, + "learning_rate": 1.4317079521809299e-05, + "loss": 0.0384, + "step": 21106 + }, + { + "epoch": 0.42216, + "grad_norm": 4.223455429077148, + "learning_rate": 1.431582003137901e-05, + "loss": 0.0637, + "step": 21108 + }, + { + "epoch": 0.4222, + "grad_norm": 0.08229215443134308, + "learning_rate": 1.4314560456809592e-05, + "loss": 0.0063, + "step": 21110 + }, + { + "epoch": 0.42224, + "grad_norm": 0.20039337873458862, + "learning_rate": 1.4313300798125594e-05, + "loss": 0.0117, + "step": 21112 + }, + { + "epoch": 0.42228, + "grad_norm": 0.090336374938488, + "learning_rate": 1.4312041055351575e-05, + "loss": 0.0042, + "step": 21114 + }, + { + "epoch": 0.42232, + "grad_norm": 1.221958041191101, + "learning_rate": 1.4310781228512096e-05, + "loss": 0.029, + "step": 21116 + }, + { + "epoch": 0.42236, + "grad_norm": 0.22948575019836426, + "learning_rate": 1.430952131763172e-05, + "loss": 0.0336, + "step": 21118 + }, + { + "epoch": 0.4224, + "grad_norm": 2.9062581062316895, + "learning_rate": 1.4308261322735006e-05, + "loss": 0.0898, + "step": 21120 + }, + { + "epoch": 0.42244, + "grad_norm": 3.351998805999756, + "learning_rate": 1.430700124384652e-05, + "loss": 0.0898, + "step": 21122 + }, + { + "epoch": 0.42248, + "grad_norm": 0.34382784366607666, + "learning_rate": 1.4305741080990825e-05, + "loss": 0.0182, + "step": 21124 + }, + { + "epoch": 0.42252, + "grad_norm": 0.3683737516403198, + "learning_rate": 1.4304480834192494e-05, + "loss": 0.0196, + "step": 21126 + }, + { + "epoch": 0.42256, + "grad_norm": 0.0542580708861351, + "learning_rate": 1.430322050347609e-05, + "loss": 0.0053, + "step": 21128 + }, + { + "epoch": 0.4226, + "grad_norm": 5.8726277351379395, + "learning_rate": 1.4301960088866187e-05, + "loss": 0.1397, + "step": 21130 + }, + { + "epoch": 0.42264, + "grad_norm": 0.09681979566812515, + "learning_rate": 1.4300699590387359e-05, + "loss": 0.0079, + "step": 21132 + }, + { + "epoch": 0.42268, + "grad_norm": 0.2566986680030823, + "learning_rate": 1.429943900806418e-05, + "loss": 0.0132, + "step": 21134 + }, + { + "epoch": 0.42272, + "grad_norm": 0.07702215760946274, + "learning_rate": 1.429817834192122e-05, + "loss": 0.005, + "step": 21136 + }, + { + "epoch": 0.42276, + "grad_norm": 0.7401084899902344, + "learning_rate": 1.4296917591983062e-05, + "loss": 0.1744, + "step": 21138 + }, + { + "epoch": 0.4228, + "grad_norm": 3.312042713165283, + "learning_rate": 1.4295656758274283e-05, + "loss": 0.0655, + "step": 21140 + }, + { + "epoch": 0.42284, + "grad_norm": 4.704451560974121, + "learning_rate": 1.4294395840819465e-05, + "loss": 0.1646, + "step": 21142 + }, + { + "epoch": 0.42288, + "grad_norm": 0.1461869329214096, + "learning_rate": 1.429313483964319e-05, + "loss": 0.0031, + "step": 21144 + }, + { + "epoch": 0.42292, + "grad_norm": 0.16156551241874695, + "learning_rate": 1.4291873754770038e-05, + "loss": 0.0036, + "step": 21146 + }, + { + "epoch": 0.42296, + "grad_norm": 0.22668993473052979, + "learning_rate": 1.42906125862246e-05, + "loss": 0.1196, + "step": 21148 + }, + { + "epoch": 0.423, + "grad_norm": 4.676059246063232, + "learning_rate": 1.4289351334031461e-05, + "loss": 0.1079, + "step": 21150 + }, + { + "epoch": 0.42304, + "grad_norm": 0.09266293048858643, + "learning_rate": 1.4288089998215209e-05, + "loss": 0.0353, + "step": 21152 + }, + { + "epoch": 0.42308, + "grad_norm": 0.060397326946258545, + "learning_rate": 1.4286828578800434e-05, + "loss": 0.3553, + "step": 21154 + }, + { + "epoch": 0.42312, + "grad_norm": 5.916524887084961, + "learning_rate": 1.4285567075811728e-05, + "loss": 0.1282, + "step": 21156 + }, + { + "epoch": 0.42316, + "grad_norm": 0.142455592751503, + "learning_rate": 1.4284305489273686e-05, + "loss": 0.0667, + "step": 21158 + }, + { + "epoch": 0.4232, + "grad_norm": 1.8176215887069702, + "learning_rate": 1.4283043819210905e-05, + "loss": 0.1462, + "step": 21160 + }, + { + "epoch": 0.42324, + "grad_norm": 6.45490026473999, + "learning_rate": 1.4281782065647978e-05, + "loss": 0.5357, + "step": 21162 + }, + { + "epoch": 0.42328, + "grad_norm": 0.12805739045143127, + "learning_rate": 1.4280520228609503e-05, + "loss": 0.044, + "step": 21164 + }, + { + "epoch": 0.42332, + "grad_norm": 0.24344398081302643, + "learning_rate": 1.4279258308120087e-05, + "loss": 0.0061, + "step": 21166 + }, + { + "epoch": 0.42336, + "grad_norm": 11.86435604095459, + "learning_rate": 1.4277996304204324e-05, + "loss": 0.8605, + "step": 21168 + }, + { + "epoch": 0.4234, + "grad_norm": 0.18161284923553467, + "learning_rate": 1.4276734216886823e-05, + "loss": 0.018, + "step": 21170 + }, + { + "epoch": 0.42344, + "grad_norm": 0.2130209356546402, + "learning_rate": 1.4275472046192182e-05, + "loss": 0.4339, + "step": 21172 + }, + { + "epoch": 0.42348, + "grad_norm": 0.6022624373435974, + "learning_rate": 1.4274209792145017e-05, + "loss": 0.0909, + "step": 21174 + }, + { + "epoch": 0.42352, + "grad_norm": 0.17922160029411316, + "learning_rate": 1.427294745476993e-05, + "loss": 0.0101, + "step": 21176 + }, + { + "epoch": 0.42356, + "grad_norm": 0.12300986051559448, + "learning_rate": 1.4271685034091531e-05, + "loss": 0.0157, + "step": 21178 + }, + { + "epoch": 0.4236, + "grad_norm": 0.8590319156646729, + "learning_rate": 1.4270422530134433e-05, + "loss": 0.2513, + "step": 21180 + }, + { + "epoch": 0.42364, + "grad_norm": 4.018752098083496, + "learning_rate": 1.4269159942923253e-05, + "loss": 0.1295, + "step": 21182 + }, + { + "epoch": 0.42368, + "grad_norm": 0.749228835105896, + "learning_rate": 1.4267897272482601e-05, + "loss": 0.0772, + "step": 21184 + }, + { + "epoch": 0.42372, + "grad_norm": 1.3328666687011719, + "learning_rate": 1.4266634518837092e-05, + "loss": 0.2004, + "step": 21186 + }, + { + "epoch": 0.42376, + "grad_norm": 0.3991926312446594, + "learning_rate": 1.4265371682011349e-05, + "loss": 0.0223, + "step": 21188 + }, + { + "epoch": 0.4238, + "grad_norm": 0.734405517578125, + "learning_rate": 1.4264108762029989e-05, + "loss": 0.0566, + "step": 21190 + }, + { + "epoch": 0.42384, + "grad_norm": 0.17800576984882355, + "learning_rate": 1.426284575891763e-05, + "loss": 0.0096, + "step": 21192 + }, + { + "epoch": 0.42388, + "grad_norm": 1.877294898033142, + "learning_rate": 1.42615826726989e-05, + "loss": 0.0605, + "step": 21194 + }, + { + "epoch": 0.42392, + "grad_norm": 0.7458105087280273, + "learning_rate": 1.4260319503398421e-05, + "loss": 0.0218, + "step": 21196 + }, + { + "epoch": 0.42396, + "grad_norm": 7.127344608306885, + "learning_rate": 1.4259056251040821e-05, + "loss": 0.1998, + "step": 21198 + }, + { + "epoch": 0.424, + "grad_norm": 0.5516077280044556, + "learning_rate": 1.4257792915650728e-05, + "loss": 0.0656, + "step": 21200 + }, + { + "epoch": 0.42404, + "grad_norm": 0.6389816403388977, + "learning_rate": 1.425652949725277e-05, + "loss": 0.0174, + "step": 21202 + }, + { + "epoch": 0.42408, + "grad_norm": 0.9680565595626831, + "learning_rate": 1.4255265995871574e-05, + "loss": 0.0408, + "step": 21204 + }, + { + "epoch": 0.42412, + "grad_norm": 3.2662742137908936, + "learning_rate": 1.425400241153178e-05, + "loss": 0.0687, + "step": 21206 + }, + { + "epoch": 0.42416, + "grad_norm": 0.02578105218708515, + "learning_rate": 1.4252738744258019e-05, + "loss": 0.0109, + "step": 21208 + }, + { + "epoch": 0.4242, + "grad_norm": 0.07658355683088303, + "learning_rate": 1.4251474994074927e-05, + "loss": 0.0358, + "step": 21210 + }, + { + "epoch": 0.42424, + "grad_norm": 8.752213478088379, + "learning_rate": 1.425021116100714e-05, + "loss": 0.5788, + "step": 21212 + }, + { + "epoch": 0.42428, + "grad_norm": 1.738503336906433, + "learning_rate": 1.4248947245079297e-05, + "loss": 0.1252, + "step": 21214 + }, + { + "epoch": 0.42432, + "grad_norm": 0.07151328027248383, + "learning_rate": 1.4247683246316042e-05, + "loss": 0.1912, + "step": 21216 + }, + { + "epoch": 0.42436, + "grad_norm": 0.38795381784439087, + "learning_rate": 1.4246419164742013e-05, + "loss": 0.0116, + "step": 21218 + }, + { + "epoch": 0.4244, + "grad_norm": 0.4472762942314148, + "learning_rate": 1.424515500038186e-05, + "loss": 0.0166, + "step": 21220 + }, + { + "epoch": 0.42444, + "grad_norm": 5.968524932861328, + "learning_rate": 1.424389075326022e-05, + "loss": 0.3696, + "step": 21222 + }, + { + "epoch": 0.42448, + "grad_norm": 0.2665546238422394, + "learning_rate": 1.424262642340175e-05, + "loss": 0.0112, + "step": 21224 + }, + { + "epoch": 0.42452, + "grad_norm": 0.1335950344800949, + "learning_rate": 1.4241362010831092e-05, + "loss": 0.0991, + "step": 21226 + }, + { + "epoch": 0.42456, + "grad_norm": 0.5232160687446594, + "learning_rate": 1.4240097515572896e-05, + "loss": 0.0224, + "step": 21228 + }, + { + "epoch": 0.4246, + "grad_norm": 4.208860874176025, + "learning_rate": 1.4238832937651816e-05, + "loss": 0.1498, + "step": 21230 + }, + { + "epoch": 0.42464, + "grad_norm": 3.4781711101531982, + "learning_rate": 1.4237568277092509e-05, + "loss": 0.1326, + "step": 21232 + }, + { + "epoch": 0.42468, + "grad_norm": 0.7050960659980774, + "learning_rate": 1.4236303533919622e-05, + "loss": 0.0251, + "step": 21234 + }, + { + "epoch": 0.42472, + "grad_norm": 0.3269438147544861, + "learning_rate": 1.423503870815782e-05, + "loss": 0.0479, + "step": 21236 + }, + { + "epoch": 0.42476, + "grad_norm": 0.019716067239642143, + "learning_rate": 1.4233773799831759e-05, + "loss": 0.003, + "step": 21238 + }, + { + "epoch": 0.4248, + "grad_norm": 2.944213390350342, + "learning_rate": 1.4232508808966097e-05, + "loss": 0.1363, + "step": 21240 + }, + { + "epoch": 0.42484, + "grad_norm": 3.106740713119507, + "learning_rate": 1.4231243735585493e-05, + "loss": 0.1394, + "step": 21242 + }, + { + "epoch": 0.42488, + "grad_norm": 1.0450763702392578, + "learning_rate": 1.4229978579714617e-05, + "loss": 0.0374, + "step": 21244 + }, + { + "epoch": 0.42492, + "grad_norm": 6.098747730255127, + "learning_rate": 1.422871334137813e-05, + "loss": 0.2096, + "step": 21246 + }, + { + "epoch": 0.42496, + "grad_norm": 0.2866940498352051, + "learning_rate": 1.4227448020600702e-05, + "loss": 0.051, + "step": 21248 + }, + { + "epoch": 0.425, + "grad_norm": 7.562528610229492, + "learning_rate": 1.4226182617406996e-05, + "loss": 0.3638, + "step": 21250 + }, + { + "epoch": 0.42504, + "grad_norm": 4.74235200881958, + "learning_rate": 1.4224917131821682e-05, + "loss": 0.1963, + "step": 21252 + }, + { + "epoch": 0.42508, + "grad_norm": 1.573341727256775, + "learning_rate": 1.4223651563869438e-05, + "loss": 0.0763, + "step": 21254 + }, + { + "epoch": 0.42512, + "grad_norm": 0.20338726043701172, + "learning_rate": 1.4222385913574933e-05, + "loss": 0.0079, + "step": 21256 + }, + { + "epoch": 0.42516, + "grad_norm": 0.9583024978637695, + "learning_rate": 1.4221120180962836e-05, + "loss": 0.1113, + "step": 21258 + }, + { + "epoch": 0.4252, + "grad_norm": 0.6294077634811401, + "learning_rate": 1.4219854366057831e-05, + "loss": 0.0164, + "step": 21260 + }, + { + "epoch": 0.42524, + "grad_norm": 0.4787385165691376, + "learning_rate": 1.4218588468884588e-05, + "loss": 0.1584, + "step": 21262 + }, + { + "epoch": 0.42528, + "grad_norm": 0.046267878264188766, + "learning_rate": 1.4217322489467798e-05, + "loss": 0.0014, + "step": 21264 + }, + { + "epoch": 0.42532, + "grad_norm": 0.19003942608833313, + "learning_rate": 1.421605642783213e-05, + "loss": 0.0196, + "step": 21266 + }, + { + "epoch": 0.42536, + "grad_norm": 0.6500610709190369, + "learning_rate": 1.421479028400227e-05, + "loss": 0.0265, + "step": 21268 + }, + { + "epoch": 0.4254, + "grad_norm": 0.24465972185134888, + "learning_rate": 1.421352405800291e-05, + "loss": 0.0326, + "step": 21270 + }, + { + "epoch": 0.42544, + "grad_norm": 0.16909773647785187, + "learning_rate": 1.4212257749858727e-05, + "loss": 0.0763, + "step": 21272 + }, + { + "epoch": 0.42548, + "grad_norm": 0.828912079334259, + "learning_rate": 1.4210991359594406e-05, + "loss": 0.0539, + "step": 21274 + }, + { + "epoch": 0.42552, + "grad_norm": 0.0022263394203037024, + "learning_rate": 1.4209724887234643e-05, + "loss": 0.2346, + "step": 21276 + }, + { + "epoch": 0.42556, + "grad_norm": 0.6881316304206848, + "learning_rate": 1.4208458332804127e-05, + "loss": 0.0176, + "step": 21278 + }, + { + "epoch": 0.4256, + "grad_norm": 0.6067637801170349, + "learning_rate": 1.420719169632755e-05, + "loss": 0.0825, + "step": 21280 + }, + { + "epoch": 0.42564, + "grad_norm": 0.9284895062446594, + "learning_rate": 1.4205924977829601e-05, + "loss": 0.1742, + "step": 21282 + }, + { + "epoch": 0.42568, + "grad_norm": 1.0684144496917725, + "learning_rate": 1.420465817733498e-05, + "loss": 0.0457, + "step": 21284 + }, + { + "epoch": 0.42572, + "grad_norm": 0.6049753427505493, + "learning_rate": 1.4203391294868385e-05, + "loss": 0.0236, + "step": 21286 + }, + { + "epoch": 0.42576, + "grad_norm": 0.5724116563796997, + "learning_rate": 1.4202124330454514e-05, + "loss": 0.0209, + "step": 21288 + }, + { + "epoch": 0.4258, + "grad_norm": 1.3873716592788696, + "learning_rate": 1.4200857284118067e-05, + "loss": 0.0322, + "step": 21290 + }, + { + "epoch": 0.42584, + "grad_norm": 1.9368702173233032, + "learning_rate": 1.4199590155883739e-05, + "loss": 0.0455, + "step": 21292 + }, + { + "epoch": 0.42588, + "grad_norm": 4.107070446014404, + "learning_rate": 1.4198322945776241e-05, + "loss": 0.1393, + "step": 21294 + }, + { + "epoch": 0.42592, + "grad_norm": 0.7535285949707031, + "learning_rate": 1.4197055653820277e-05, + "loss": 0.2969, + "step": 21296 + }, + { + "epoch": 0.42596, + "grad_norm": 3.6992459297180176, + "learning_rate": 1.4195788280040552e-05, + "loss": 0.2213, + "step": 21298 + }, + { + "epoch": 0.426, + "grad_norm": 0.29953432083129883, + "learning_rate": 1.4194520824461773e-05, + "loss": 0.0229, + "step": 21300 + }, + { + "epoch": 0.42604, + "grad_norm": 0.4078396260738373, + "learning_rate": 1.419325328710865e-05, + "loss": 0.0122, + "step": 21302 + }, + { + "epoch": 0.42608, + "grad_norm": 8.568266868591309, + "learning_rate": 1.41919856680059e-05, + "loss": 0.671, + "step": 21304 + }, + { + "epoch": 0.42612, + "grad_norm": 0.4735901355743408, + "learning_rate": 1.419071796717823e-05, + "loss": 0.0126, + "step": 21306 + }, + { + "epoch": 0.42616, + "grad_norm": 1.2341177463531494, + "learning_rate": 1.4189450184650354e-05, + "loss": 0.0369, + "step": 21308 + }, + { + "epoch": 0.4262, + "grad_norm": 0.42275404930114746, + "learning_rate": 1.4188182320446985e-05, + "loss": 0.0287, + "step": 21310 + }, + { + "epoch": 0.42624, + "grad_norm": 1.7456307411193848, + "learning_rate": 1.4186914374592853e-05, + "loss": 0.1611, + "step": 21312 + }, + { + "epoch": 0.42628, + "grad_norm": 0.07599857449531555, + "learning_rate": 1.4185646347112666e-05, + "loss": 0.0065, + "step": 21314 + }, + { + "epoch": 0.42632, + "grad_norm": 1.344786524772644, + "learning_rate": 1.418437823803115e-05, + "loss": 0.0221, + "step": 21316 + }, + { + "epoch": 0.42636, + "grad_norm": 0.4662238359451294, + "learning_rate": 1.4183110047373023e-05, + "loss": 0.1239, + "step": 21318 + }, + { + "epoch": 0.4264, + "grad_norm": 8.570446014404297, + "learning_rate": 1.4181841775163014e-05, + "loss": 0.2859, + "step": 21320 + }, + { + "epoch": 0.42644, + "grad_norm": 2.2464423179626465, + "learning_rate": 1.418057342142585e-05, + "loss": 0.0418, + "step": 21322 + }, + { + "epoch": 0.42648, + "grad_norm": 0.8576517105102539, + "learning_rate": 1.4179304986186253e-05, + "loss": 0.0627, + "step": 21324 + }, + { + "epoch": 0.42652, + "grad_norm": 0.5067144632339478, + "learning_rate": 1.4178036469468952e-05, + "loss": 0.0555, + "step": 21326 + }, + { + "epoch": 0.42656, + "grad_norm": 0.9081014394760132, + "learning_rate": 1.4176767871298679e-05, + "loss": 0.0378, + "step": 21328 + }, + { + "epoch": 0.4266, + "grad_norm": 0.12821421027183533, + "learning_rate": 1.4175499191700169e-05, + "loss": 0.0128, + "step": 21330 + }, + { + "epoch": 0.42664, + "grad_norm": 1.020243763923645, + "learning_rate": 1.4174230430698149e-05, + "loss": 0.0229, + "step": 21332 + }, + { + "epoch": 0.42668, + "grad_norm": 5.186869144439697, + "learning_rate": 1.4172961588317358e-05, + "loss": 0.0851, + "step": 21334 + }, + { + "epoch": 0.42672, + "grad_norm": 0.25428372621536255, + "learning_rate": 1.4171692664582533e-05, + "loss": 0.0066, + "step": 21336 + }, + { + "epoch": 0.42676, + "grad_norm": 0.01752658747136593, + "learning_rate": 1.4170423659518413e-05, + "loss": 0.3734, + "step": 21338 + }, + { + "epoch": 0.4268, + "grad_norm": 4.052397727966309, + "learning_rate": 1.4169154573149737e-05, + "loss": 0.101, + "step": 21340 + }, + { + "epoch": 0.42684, + "grad_norm": 0.09746850281953812, + "learning_rate": 1.4167885405501244e-05, + "loss": 0.0399, + "step": 21342 + }, + { + "epoch": 0.42688, + "grad_norm": 0.13635191321372986, + "learning_rate": 1.416661615659768e-05, + "loss": 0.0364, + "step": 21344 + }, + { + "epoch": 0.42692, + "grad_norm": 0.510616660118103, + "learning_rate": 1.4165346826463791e-05, + "loss": 0.011, + "step": 21346 + }, + { + "epoch": 0.42696, + "grad_norm": 0.1883372813463211, + "learning_rate": 1.4164077415124318e-05, + "loss": 0.0047, + "step": 21348 + }, + { + "epoch": 0.427, + "grad_norm": 1.4473716020584106, + "learning_rate": 1.4162807922604014e-05, + "loss": 0.0737, + "step": 21350 + }, + { + "epoch": 0.42704, + "grad_norm": 0.7261978983879089, + "learning_rate": 1.4161538348927624e-05, + "loss": 0.1306, + "step": 21352 + }, + { + "epoch": 0.42708, + "grad_norm": 10.835768699645996, + "learning_rate": 1.4160268694119905e-05, + "loss": 0.476, + "step": 21354 + }, + { + "epoch": 0.42712, + "grad_norm": 1.8505717515945435, + "learning_rate": 1.4158998958205604e-05, + "loss": 0.0319, + "step": 21356 + }, + { + "epoch": 0.42716, + "grad_norm": 0.6595346927642822, + "learning_rate": 1.4157729141209477e-05, + "loss": 0.0673, + "step": 21358 + }, + { + "epoch": 0.4272, + "grad_norm": 0.08689193427562714, + "learning_rate": 1.415645924315628e-05, + "loss": 0.0077, + "step": 21360 + }, + { + "epoch": 0.42724, + "grad_norm": 0.09413640201091766, + "learning_rate": 1.415518926407077e-05, + "loss": 0.0154, + "step": 21362 + }, + { + "epoch": 0.42728, + "grad_norm": 0.5917205810546875, + "learning_rate": 1.4153919203977706e-05, + "loss": 0.1603, + "step": 21364 + }, + { + "epoch": 0.42732, + "grad_norm": 6.390798568725586, + "learning_rate": 1.4152649062901849e-05, + "loss": 0.1273, + "step": 21366 + }, + { + "epoch": 0.42736, + "grad_norm": 4.777008533477783, + "learning_rate": 1.415137884086796e-05, + "loss": 0.2149, + "step": 21368 + }, + { + "epoch": 0.4274, + "grad_norm": 1.7265806198120117, + "learning_rate": 1.4150108537900805e-05, + "loss": 0.0513, + "step": 21370 + }, + { + "epoch": 0.42744, + "grad_norm": 1.1510144472122192, + "learning_rate": 1.4148838154025143e-05, + "loss": 0.0249, + "step": 21372 + }, + { + "epoch": 0.42748, + "grad_norm": 1.5556848049163818, + "learning_rate": 1.414756768926575e-05, + "loss": 0.1093, + "step": 21374 + }, + { + "epoch": 0.42752, + "grad_norm": 1.6870874166488647, + "learning_rate": 1.4146297143647388e-05, + "loss": 0.0365, + "step": 21376 + }, + { + "epoch": 0.42756, + "grad_norm": 1.3021432161331177, + "learning_rate": 1.4145026517194832e-05, + "loss": 0.032, + "step": 21378 + }, + { + "epoch": 0.4276, + "grad_norm": 7.848767280578613, + "learning_rate": 1.4143755809932843e-05, + "loss": 0.9638, + "step": 21380 + }, + { + "epoch": 0.42764, + "grad_norm": 0.039167605340480804, + "learning_rate": 1.4142485021886203e-05, + "loss": 0.0013, + "step": 21382 + }, + { + "epoch": 0.42768, + "grad_norm": 6.3971967697143555, + "learning_rate": 1.4141214153079688e-05, + "loss": 0.2544, + "step": 21384 + }, + { + "epoch": 0.42772, + "grad_norm": 0.7953847646713257, + "learning_rate": 1.413994320353807e-05, + "loss": 0.1925, + "step": 21386 + }, + { + "epoch": 0.42776, + "grad_norm": 4.703469753265381, + "learning_rate": 1.4138672173286128e-05, + "loss": 0.2026, + "step": 21388 + }, + { + "epoch": 0.4278, + "grad_norm": 1.4613499641418457, + "learning_rate": 1.4137401062348639e-05, + "loss": 0.0317, + "step": 21390 + }, + { + "epoch": 0.42784, + "grad_norm": 3.316371440887451, + "learning_rate": 1.4136129870750388e-05, + "loss": 0.0796, + "step": 21392 + }, + { + "epoch": 0.42788, + "grad_norm": 6.900356292724609, + "learning_rate": 1.4134858598516156e-05, + "loss": 0.3303, + "step": 21394 + }, + { + "epoch": 0.42792, + "grad_norm": 2.4867327213287354, + "learning_rate": 1.4133587245670728e-05, + "loss": 0.0673, + "step": 21396 + }, + { + "epoch": 0.42796, + "grad_norm": 0.08055495470762253, + "learning_rate": 1.4132315812238883e-05, + "loss": 0.1777, + "step": 21398 + }, + { + "epoch": 0.428, + "grad_norm": 0.18988686800003052, + "learning_rate": 1.413104429824542e-05, + "loss": 0.0208, + "step": 21400 + }, + { + "epoch": 0.42804, + "grad_norm": 4.561859607696533, + "learning_rate": 1.4129772703715118e-05, + "loss": 0.0945, + "step": 21402 + }, + { + "epoch": 0.42808, + "grad_norm": 2.1712727546691895, + "learning_rate": 1.4128501028672771e-05, + "loss": 0.0862, + "step": 21404 + }, + { + "epoch": 0.42812, + "grad_norm": 6.171627044677734, + "learning_rate": 1.4127229273143172e-05, + "loss": 0.1961, + "step": 21406 + }, + { + "epoch": 0.42816, + "grad_norm": 1.0873897075653076, + "learning_rate": 1.4125957437151116e-05, + "loss": 0.0262, + "step": 21408 + }, + { + "epoch": 0.4282, + "grad_norm": 0.8044955134391785, + "learning_rate": 1.4124685520721393e-05, + "loss": 0.0305, + "step": 21410 + }, + { + "epoch": 0.42824, + "grad_norm": 2.0014185905456543, + "learning_rate": 1.4123413523878804e-05, + "loss": 0.0443, + "step": 21412 + }, + { + "epoch": 0.42828, + "grad_norm": 0.37785279750823975, + "learning_rate": 1.4122141446648142e-05, + "loss": 0.0587, + "step": 21414 + }, + { + "epoch": 0.42832, + "grad_norm": 0.370246022939682, + "learning_rate": 1.412086928905421e-05, + "loss": 0.0253, + "step": 21416 + }, + { + "epoch": 0.42836, + "grad_norm": 1.7286045551300049, + "learning_rate": 1.4119597051121814e-05, + "loss": 0.0384, + "step": 21418 + }, + { + "epoch": 0.4284, + "grad_norm": 0.03757275640964508, + "learning_rate": 1.411832473287575e-05, + "loss": 0.0723, + "step": 21420 + }, + { + "epoch": 0.42844, + "grad_norm": 0.07154977321624756, + "learning_rate": 1.4117052334340826e-05, + "loss": 0.0103, + "step": 21422 + }, + { + "epoch": 0.42848, + "grad_norm": 0.003092928556725383, + "learning_rate": 1.4115779855541844e-05, + "loss": 0.0065, + "step": 21424 + }, + { + "epoch": 0.42852, + "grad_norm": 0.0451117642223835, + "learning_rate": 1.4114507296503622e-05, + "loss": 0.0221, + "step": 21426 + }, + { + "epoch": 0.42856, + "grad_norm": 0.8236647844314575, + "learning_rate": 1.4113234657250959e-05, + "loss": 0.0172, + "step": 21428 + }, + { + "epoch": 0.4286, + "grad_norm": 1.7824243307113647, + "learning_rate": 1.4111961937808665e-05, + "loss": 0.0378, + "step": 21430 + }, + { + "epoch": 0.42864, + "grad_norm": 0.09279848635196686, + "learning_rate": 1.4110689138201557e-05, + "loss": 0.0049, + "step": 21432 + }, + { + "epoch": 0.42868, + "grad_norm": 6.551976680755615, + "learning_rate": 1.410941625845445e-05, + "loss": 0.3847, + "step": 21434 + }, + { + "epoch": 0.42872, + "grad_norm": 1.0289956331253052, + "learning_rate": 1.4108143298592155e-05, + "loss": 0.0174, + "step": 21436 + }, + { + "epoch": 0.42876, + "grad_norm": 0.4657614827156067, + "learning_rate": 1.4106870258639493e-05, + "loss": 0.4228, + "step": 21438 + }, + { + "epoch": 0.4288, + "grad_norm": 0.665585458278656, + "learning_rate": 1.4105597138621281e-05, + "loss": 0.0151, + "step": 21440 + }, + { + "epoch": 0.42884, + "grad_norm": 6.7070512771606445, + "learning_rate": 1.4104323938562341e-05, + "loss": 0.3199, + "step": 21442 + }, + { + "epoch": 0.42888, + "grad_norm": 0.10702440142631531, + "learning_rate": 1.410305065848749e-05, + "loss": 0.0355, + "step": 21444 + }, + { + "epoch": 0.42892, + "grad_norm": 6.896639823913574, + "learning_rate": 1.4101777298421555e-05, + "loss": 0.3066, + "step": 21446 + }, + { + "epoch": 0.42896, + "grad_norm": 0.21957914531230927, + "learning_rate": 1.4100503858389358e-05, + "loss": 0.0067, + "step": 21448 + }, + { + "epoch": 0.429, + "grad_norm": 6.749436378479004, + "learning_rate": 1.4099230338415728e-05, + "loss": 0.226, + "step": 21450 + }, + { + "epoch": 0.42904, + "grad_norm": 0.4856334924697876, + "learning_rate": 1.4097956738525493e-05, + "loss": 0.0064, + "step": 21452 + }, + { + "epoch": 0.42908, + "grad_norm": 3.350235939025879, + "learning_rate": 1.4096683058743482e-05, + "loss": 0.1509, + "step": 21454 + }, + { + "epoch": 0.42912, + "grad_norm": 0.733495831489563, + "learning_rate": 1.4095409299094525e-05, + "loss": 0.0337, + "step": 21456 + }, + { + "epoch": 0.42916, + "grad_norm": 0.4160688817501068, + "learning_rate": 1.4094135459603454e-05, + "loss": 0.0132, + "step": 21458 + }, + { + "epoch": 0.4292, + "grad_norm": 1.3181277513504028, + "learning_rate": 1.4092861540295109e-05, + "loss": 0.0383, + "step": 21460 + }, + { + "epoch": 0.42924, + "grad_norm": 0.22002357244491577, + "learning_rate": 1.4091587541194317e-05, + "loss": 0.1557, + "step": 21462 + }, + { + "epoch": 0.42928, + "grad_norm": 0.4087846875190735, + "learning_rate": 1.4090313462325919e-05, + "loss": 0.0234, + "step": 21464 + }, + { + "epoch": 0.42932, + "grad_norm": 0.18846437335014343, + "learning_rate": 1.4089039303714756e-05, + "loss": 0.0065, + "step": 21466 + }, + { + "epoch": 0.42936, + "grad_norm": 0.41208550333976746, + "learning_rate": 1.4087765065385668e-05, + "loss": 0.0117, + "step": 21468 + }, + { + "epoch": 0.4294, + "grad_norm": 0.22913210093975067, + "learning_rate": 1.4086490747363492e-05, + "loss": 0.1317, + "step": 21470 + }, + { + "epoch": 0.42944, + "grad_norm": 0.40518251061439514, + "learning_rate": 1.4085216349673077e-05, + "loss": 0.2137, + "step": 21472 + }, + { + "epoch": 0.42948, + "grad_norm": 0.40821900963783264, + "learning_rate": 1.4083941872339264e-05, + "loss": 0.0081, + "step": 21474 + }, + { + "epoch": 0.42952, + "grad_norm": 9.173455238342285, + "learning_rate": 1.4082667315386903e-05, + "loss": 0.406, + "step": 21476 + }, + { + "epoch": 0.42956, + "grad_norm": 0.18689796328544617, + "learning_rate": 1.4081392678840842e-05, + "loss": 0.0047, + "step": 21478 + }, + { + "epoch": 0.4296, + "grad_norm": 0.18701058626174927, + "learning_rate": 1.4080117962725929e-05, + "loss": 0.0502, + "step": 21480 + }, + { + "epoch": 0.42964, + "grad_norm": 4.620704174041748, + "learning_rate": 1.4078843167067016e-05, + "loss": 0.1388, + "step": 21482 + }, + { + "epoch": 0.42968, + "grad_norm": 0.23867541551589966, + "learning_rate": 1.4077568291888954e-05, + "loss": 0.0063, + "step": 21484 + }, + { + "epoch": 0.42972, + "grad_norm": 0.3484518527984619, + "learning_rate": 1.40762933372166e-05, + "loss": 0.0342, + "step": 21486 + }, + { + "epoch": 0.42976, + "grad_norm": 0.8492715358734131, + "learning_rate": 1.4075018303074808e-05, + "loss": 0.0179, + "step": 21488 + }, + { + "epoch": 0.4298, + "grad_norm": 0.11041174083948135, + "learning_rate": 1.4073743189488436e-05, + "loss": 0.0362, + "step": 21490 + }, + { + "epoch": 0.42984, + "grad_norm": 0.130832239985466, + "learning_rate": 1.4072467996482347e-05, + "loss": 0.0036, + "step": 21492 + }, + { + "epoch": 0.42988, + "grad_norm": 0.017637543380260468, + "learning_rate": 1.4071192724081393e-05, + "loss": 0.0075, + "step": 21494 + }, + { + "epoch": 0.42992, + "grad_norm": 1.2414381504058838, + "learning_rate": 1.4069917372310444e-05, + "loss": 0.0297, + "step": 21496 + }, + { + "epoch": 0.42996, + "grad_norm": 0.589642345905304, + "learning_rate": 1.406864194119436e-05, + "loss": 0.0216, + "step": 21498 + }, + { + "epoch": 0.43, + "grad_norm": 0.47805866599082947, + "learning_rate": 1.4067366430758004e-05, + "loss": 0.0119, + "step": 21500 + }, + { + "epoch": 0.43004, + "grad_norm": 8.64869499206543, + "learning_rate": 1.4066090841026247e-05, + "loss": 0.3702, + "step": 21502 + }, + { + "epoch": 0.43008, + "grad_norm": 0.14048393070697784, + "learning_rate": 1.4064815172023956e-05, + "loss": 0.0108, + "step": 21504 + }, + { + "epoch": 0.43012, + "grad_norm": 0.028690924867987633, + "learning_rate": 1.4063539423775998e-05, + "loss": 0.0109, + "step": 21506 + }, + { + "epoch": 0.43016, + "grad_norm": 0.04716285690665245, + "learning_rate": 1.406226359630725e-05, + "loss": 0.0067, + "step": 21508 + }, + { + "epoch": 0.4302, + "grad_norm": 0.08424215018749237, + "learning_rate": 1.4060987689642581e-05, + "loss": 0.0109, + "step": 21510 + }, + { + "epoch": 0.43024, + "grad_norm": 0.29496464133262634, + "learning_rate": 1.4059711703806866e-05, + "loss": 0.0263, + "step": 21512 + }, + { + "epoch": 0.43028, + "grad_norm": 0.6859709024429321, + "learning_rate": 1.4058435638824983e-05, + "loss": 0.0183, + "step": 21514 + }, + { + "epoch": 0.43032, + "grad_norm": 0.14502336084842682, + "learning_rate": 1.4057159494721806e-05, + "loss": 0.0404, + "step": 21516 + }, + { + "epoch": 0.43036, + "grad_norm": 3.7537171840667725, + "learning_rate": 1.4055883271522217e-05, + "loss": 0.3552, + "step": 21518 + }, + { + "epoch": 0.4304, + "grad_norm": 11.298666000366211, + "learning_rate": 1.4054606969251095e-05, + "loss": 0.2418, + "step": 21520 + }, + { + "epoch": 0.43044, + "grad_norm": 1.4379717111587524, + "learning_rate": 1.4053330587933321e-05, + "loss": 0.0296, + "step": 21522 + }, + { + "epoch": 0.43048, + "grad_norm": 1.5062381029129028, + "learning_rate": 1.4052054127593782e-05, + "loss": 0.0425, + "step": 21524 + }, + { + "epoch": 0.43052, + "grad_norm": 4.1195268630981445, + "learning_rate": 1.4050777588257363e-05, + "loss": 0.093, + "step": 21526 + }, + { + "epoch": 0.43056, + "grad_norm": 0.24518629908561707, + "learning_rate": 1.4049500969948946e-05, + "loss": 0.378, + "step": 21528 + }, + { + "epoch": 0.4306, + "grad_norm": 0.6957114934921265, + "learning_rate": 1.4048224272693426e-05, + "loss": 0.0162, + "step": 21530 + }, + { + "epoch": 0.43064, + "grad_norm": 1.3657232522964478, + "learning_rate": 1.4046947496515689e-05, + "loss": 0.0304, + "step": 21532 + }, + { + "epoch": 0.43068, + "grad_norm": 0.5532651543617249, + "learning_rate": 1.4045670641440627e-05, + "loss": 0.0122, + "step": 21534 + }, + { + "epoch": 0.43072, + "grad_norm": 0.2105746865272522, + "learning_rate": 1.404439370749313e-05, + "loss": 0.029, + "step": 21536 + }, + { + "epoch": 0.43076, + "grad_norm": 0.2679453492164612, + "learning_rate": 1.4043116694698098e-05, + "loss": 0.0263, + "step": 21538 + }, + { + "epoch": 0.4308, + "grad_norm": 0.16425028443336487, + "learning_rate": 1.4041839603080423e-05, + "loss": 0.0038, + "step": 21540 + }, + { + "epoch": 0.43084, + "grad_norm": 3.7010293006896973, + "learning_rate": 1.4040562432665006e-05, + "loss": 0.2499, + "step": 21542 + }, + { + "epoch": 0.43088, + "grad_norm": 0.6654846668243408, + "learning_rate": 1.4039285183476744e-05, + "loss": 0.1528, + "step": 21544 + }, + { + "epoch": 0.43092, + "grad_norm": 0.03247804939746857, + "learning_rate": 1.4038007855540537e-05, + "loss": 0.0089, + "step": 21546 + }, + { + "epoch": 0.43096, + "grad_norm": 0.16093920171260834, + "learning_rate": 1.4036730448881288e-05, + "loss": 0.0066, + "step": 21548 + }, + { + "epoch": 0.431, + "grad_norm": 6.680233955383301, + "learning_rate": 1.4035452963523903e-05, + "loss": 0.3502, + "step": 21550 + }, + { + "epoch": 0.43104, + "grad_norm": 3.06365966796875, + "learning_rate": 1.4034175399493281e-05, + "loss": 0.0664, + "step": 21552 + }, + { + "epoch": 0.43108, + "grad_norm": 1.2866508960723877, + "learning_rate": 1.4032897756814334e-05, + "loss": 0.0515, + "step": 21554 + }, + { + "epoch": 0.43112, + "grad_norm": 1.0550140142440796, + "learning_rate": 1.403162003551197e-05, + "loss": 0.0281, + "step": 21556 + }, + { + "epoch": 0.43116, + "grad_norm": 0.013443294912576675, + "learning_rate": 1.4030342235611098e-05, + "loss": 0.0006, + "step": 21558 + }, + { + "epoch": 0.4312, + "grad_norm": 8.286821365356445, + "learning_rate": 1.4029064357136628e-05, + "loss": 0.3893, + "step": 21560 + }, + { + "epoch": 0.43124, + "grad_norm": 0.11926181614398956, + "learning_rate": 1.4027786400113471e-05, + "loss": 0.0237, + "step": 21562 + }, + { + "epoch": 0.43128, + "grad_norm": 0.0899956077337265, + "learning_rate": 1.402650836456655e-05, + "loss": 0.0038, + "step": 21564 + }, + { + "epoch": 0.43132, + "grad_norm": 4.722187519073486, + "learning_rate": 1.4025230250520776e-05, + "loss": 0.1982, + "step": 21566 + }, + { + "epoch": 0.43136, + "grad_norm": 3.149765968322754, + "learning_rate": 1.4023952058001062e-05, + "loss": 0.2342, + "step": 21568 + }, + { + "epoch": 0.4314, + "grad_norm": 1.9933454990386963, + "learning_rate": 1.4022673787032333e-05, + "loss": 0.2938, + "step": 21570 + }, + { + "epoch": 0.43144, + "grad_norm": 1.6827183961868286, + "learning_rate": 1.4021395437639505e-05, + "loss": 0.0325, + "step": 21572 + }, + { + "epoch": 0.43148, + "grad_norm": 0.6374794840812683, + "learning_rate": 1.4020117009847506e-05, + "loss": 0.0796, + "step": 21574 + }, + { + "epoch": 0.43152, + "grad_norm": 0.8888494968414307, + "learning_rate": 1.4018838503681253e-05, + "loss": 0.0288, + "step": 21576 + }, + { + "epoch": 0.43156, + "grad_norm": 0.0413009375333786, + "learning_rate": 1.4017559919165676e-05, + "loss": 0.014, + "step": 21578 + }, + { + "epoch": 0.4316, + "grad_norm": 0.47750765085220337, + "learning_rate": 1.4016281256325702e-05, + "loss": 0.0098, + "step": 21580 + }, + { + "epoch": 0.43164, + "grad_norm": 3.692173480987549, + "learning_rate": 1.4015002515186253e-05, + "loss": 0.0811, + "step": 21582 + }, + { + "epoch": 0.43168, + "grad_norm": 2.4005062580108643, + "learning_rate": 1.4013723695772266e-05, + "loss": 0.0736, + "step": 21584 + }, + { + "epoch": 0.43172, + "grad_norm": 2.0776844024658203, + "learning_rate": 1.4012444798108668e-05, + "loss": 0.0887, + "step": 21586 + }, + { + "epoch": 0.43176, + "grad_norm": 7.132383346557617, + "learning_rate": 1.4011165822220391e-05, + "loss": 0.2358, + "step": 21588 + }, + { + "epoch": 0.4318, + "grad_norm": 2.8121018409729004, + "learning_rate": 1.4009886768132375e-05, + "loss": 0.0489, + "step": 21590 + }, + { + "epoch": 0.43184, + "grad_norm": 0.520871639251709, + "learning_rate": 1.400860763586955e-05, + "loss": 0.0281, + "step": 21592 + }, + { + "epoch": 0.43188, + "grad_norm": 0.07862299680709839, + "learning_rate": 1.4007328425456854e-05, + "loss": 0.0024, + "step": 21594 + }, + { + "epoch": 0.43192, + "grad_norm": 0.11784262955188751, + "learning_rate": 1.4006049136919229e-05, + "loss": 0.0056, + "step": 21596 + }, + { + "epoch": 0.43196, + "grad_norm": 0.7969817519187927, + "learning_rate": 1.4004769770281615e-05, + "loss": 0.0307, + "step": 21598 + }, + { + "epoch": 0.432, + "grad_norm": 0.08439784497022629, + "learning_rate": 1.4003490325568953e-05, + "loss": 0.0107, + "step": 21600 + }, + { + "epoch": 0.43204, + "grad_norm": 3.6620404720306396, + "learning_rate": 1.4002210802806183e-05, + "loss": 0.0827, + "step": 21602 + }, + { + "epoch": 0.43208, + "grad_norm": 1.0752053260803223, + "learning_rate": 1.4000931202018252e-05, + "loss": 0.0241, + "step": 21604 + }, + { + "epoch": 0.43212, + "grad_norm": 0.663324236869812, + "learning_rate": 1.399965152323011e-05, + "loss": 0.1149, + "step": 21606 + }, + { + "epoch": 0.43216, + "grad_norm": 0.2588200271129608, + "learning_rate": 1.3998371766466702e-05, + "loss": 0.0082, + "step": 21608 + }, + { + "epoch": 0.4322, + "grad_norm": 0.22532868385314941, + "learning_rate": 1.3997091931752978e-05, + "loss": 0.0046, + "step": 21610 + }, + { + "epoch": 0.43224, + "grad_norm": 0.03244852274656296, + "learning_rate": 1.3995812019113887e-05, + "loss": 0.0476, + "step": 21612 + }, + { + "epoch": 0.43228, + "grad_norm": 1.5414983034133911, + "learning_rate": 1.3994532028574386e-05, + "loss": 0.0218, + "step": 21614 + }, + { + "epoch": 0.43232, + "grad_norm": 0.015462299808859825, + "learning_rate": 1.3993251960159425e-05, + "loss": 0.0046, + "step": 21616 + }, + { + "epoch": 0.43236, + "grad_norm": 0.39385098218917847, + "learning_rate": 1.3991971813893964e-05, + "loss": 0.1451, + "step": 21618 + }, + { + "epoch": 0.4324, + "grad_norm": 0.9816085696220398, + "learning_rate": 1.3990691589802955e-05, + "loss": 0.0124, + "step": 21620 + }, + { + "epoch": 0.43244, + "grad_norm": 0.8502940535545349, + "learning_rate": 1.3989411287911356e-05, + "loss": 0.3714, + "step": 21622 + }, + { + "epoch": 0.43248, + "grad_norm": 7.140644550323486, + "learning_rate": 1.3988130908244135e-05, + "loss": 0.2366, + "step": 21624 + }, + { + "epoch": 0.43252, + "grad_norm": 0.09884797036647797, + "learning_rate": 1.3986850450826245e-05, + "loss": 0.0096, + "step": 21626 + }, + { + "epoch": 0.43256, + "grad_norm": 0.1344882696866989, + "learning_rate": 1.3985569915682656e-05, + "loss": 0.1402, + "step": 21628 + }, + { + "epoch": 0.4326, + "grad_norm": 4.148534297943115, + "learning_rate": 1.3984289302838327e-05, + "loss": 0.0732, + "step": 21630 + }, + { + "epoch": 0.43264, + "grad_norm": 0.05394592508673668, + "learning_rate": 1.398300861231823e-05, + "loss": 0.0723, + "step": 21632 + }, + { + "epoch": 0.43268, + "grad_norm": 0.0835237130522728, + "learning_rate": 1.3981727844147329e-05, + "loss": 0.0052, + "step": 21634 + }, + { + "epoch": 0.43272, + "grad_norm": 1.7417535781860352, + "learning_rate": 1.3980446998350589e-05, + "loss": 0.0638, + "step": 21636 + }, + { + "epoch": 0.43276, + "grad_norm": 0.41585931181907654, + "learning_rate": 1.397916607495299e-05, + "loss": 0.047, + "step": 21638 + }, + { + "epoch": 0.4328, + "grad_norm": 0.271838903427124, + "learning_rate": 1.39778850739795e-05, + "loss": 0.0054, + "step": 21640 + }, + { + "epoch": 0.43284, + "grad_norm": 1.4800443649291992, + "learning_rate": 1.3976603995455089e-05, + "loss": 0.057, + "step": 21642 + }, + { + "epoch": 0.43288, + "grad_norm": 0.14725527167320251, + "learning_rate": 1.397532283940474e-05, + "loss": 0.0034, + "step": 21644 + }, + { + "epoch": 0.43292, + "grad_norm": 0.08027645945549011, + "learning_rate": 1.3974041605853422e-05, + "loss": 0.0483, + "step": 21646 + }, + { + "epoch": 0.43296, + "grad_norm": 0.21222716569900513, + "learning_rate": 1.3972760294826118e-05, + "loss": 0.0691, + "step": 21648 + }, + { + "epoch": 0.433, + "grad_norm": 7.29835844039917, + "learning_rate": 1.3971478906347806e-05, + "loss": 0.1627, + "step": 21650 + }, + { + "epoch": 0.43304, + "grad_norm": 0.478442907333374, + "learning_rate": 1.3970197440443472e-05, + "loss": 0.0858, + "step": 21652 + }, + { + "epoch": 0.43308, + "grad_norm": 1.6674721240997314, + "learning_rate": 1.396891589713809e-05, + "loss": 0.1424, + "step": 21654 + }, + { + "epoch": 0.43312, + "grad_norm": 3.2271454334259033, + "learning_rate": 1.3967634276456654e-05, + "loss": 0.106, + "step": 21656 + }, + { + "epoch": 0.43316, + "grad_norm": 8.452455520629883, + "learning_rate": 1.396635257842414e-05, + "loss": 0.2506, + "step": 21658 + }, + { + "epoch": 0.4332, + "grad_norm": 0.00470671895891428, + "learning_rate": 1.3965070803065543e-05, + "loss": 0.1759, + "step": 21660 + }, + { + "epoch": 0.43324, + "grad_norm": 0.44993576407432556, + "learning_rate": 1.3963788950405848e-05, + "loss": 0.1571, + "step": 21662 + }, + { + "epoch": 0.43328, + "grad_norm": 0.11128101497888565, + "learning_rate": 1.396250702047005e-05, + "loss": 0.0073, + "step": 21664 + }, + { + "epoch": 0.43332, + "grad_norm": 0.29007357358932495, + "learning_rate": 1.3961225013283135e-05, + "loss": 0.1557, + "step": 21666 + }, + { + "epoch": 0.43336, + "grad_norm": 0.8326806426048279, + "learning_rate": 1.3959942928870101e-05, + "loss": 0.0673, + "step": 21668 + }, + { + "epoch": 0.4334, + "grad_norm": 6.014547348022461, + "learning_rate": 1.3958660767255938e-05, + "loss": 0.1755, + "step": 21670 + }, + { + "epoch": 0.43344, + "grad_norm": 6.149803161621094, + "learning_rate": 1.3957378528465647e-05, + "loss": 0.1521, + "step": 21672 + }, + { + "epoch": 0.43348, + "grad_norm": 0.24626590311527252, + "learning_rate": 1.3956096212524223e-05, + "loss": 0.0042, + "step": 21674 + }, + { + "epoch": 0.43352, + "grad_norm": 0.7691135406494141, + "learning_rate": 1.395481381945667e-05, + "loss": 0.0119, + "step": 21676 + }, + { + "epoch": 0.43356, + "grad_norm": 1.2245210409164429, + "learning_rate": 1.395353134928798e-05, + "loss": 0.0223, + "step": 21678 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4574774205684662, + "learning_rate": 1.3952248802043166e-05, + "loss": 0.1718, + "step": 21680 + }, + { + "epoch": 0.43364, + "grad_norm": 0.32321181893348694, + "learning_rate": 1.3950966177747226e-05, + "loss": 0.0083, + "step": 21682 + }, + { + "epoch": 0.43368, + "grad_norm": 0.08553195744752884, + "learning_rate": 1.3949683476425162e-05, + "loss": 0.0231, + "step": 21684 + }, + { + "epoch": 0.43372, + "grad_norm": 3.5292680263519287, + "learning_rate": 1.3948400698101992e-05, + "loss": 0.1066, + "step": 21686 + }, + { + "epoch": 0.43376, + "grad_norm": 1.4458038806915283, + "learning_rate": 1.3947117842802713e-05, + "loss": 0.0287, + "step": 21688 + }, + { + "epoch": 0.4338, + "grad_norm": 7.6593499183654785, + "learning_rate": 1.394583491055234e-05, + "loss": 0.3366, + "step": 21690 + }, + { + "epoch": 0.43384, + "grad_norm": 0.40094199776649475, + "learning_rate": 1.3944551901375884e-05, + "loss": 0.0087, + "step": 21692 + }, + { + "epoch": 0.43388, + "grad_norm": 0.05792125314474106, + "learning_rate": 1.3943268815298358e-05, + "loss": 0.0023, + "step": 21694 + }, + { + "epoch": 0.43392, + "grad_norm": 0.4084220826625824, + "learning_rate": 1.3941985652344778e-05, + "loss": 0.0065, + "step": 21696 + }, + { + "epoch": 0.43396, + "grad_norm": 0.8941878080368042, + "learning_rate": 1.3940702412540157e-05, + "loss": 0.0164, + "step": 21698 + }, + { + "epoch": 0.434, + "grad_norm": 0.14905564486980438, + "learning_rate": 1.3939419095909513e-05, + "loss": 0.0072, + "step": 21700 + }, + { + "epoch": 0.43404, + "grad_norm": 8.198543548583984, + "learning_rate": 1.3938135702477866e-05, + "loss": 0.2081, + "step": 21702 + }, + { + "epoch": 0.43408, + "grad_norm": 0.0958990678191185, + "learning_rate": 1.3936852232270236e-05, + "loss": 0.0032, + "step": 21704 + }, + { + "epoch": 0.43412, + "grad_norm": 7.002901077270508, + "learning_rate": 1.3935568685311647e-05, + "loss": 0.6713, + "step": 21706 + }, + { + "epoch": 0.43416, + "grad_norm": 9.001626014709473, + "learning_rate": 1.3934285061627116e-05, + "loss": 0.3679, + "step": 21708 + }, + { + "epoch": 0.4342, + "grad_norm": 5.277889728546143, + "learning_rate": 1.3933001361241674e-05, + "loss": 0.1329, + "step": 21710 + }, + { + "epoch": 0.43424, + "grad_norm": 0.13269397616386414, + "learning_rate": 1.3931717584180349e-05, + "loss": 0.0214, + "step": 21712 + }, + { + "epoch": 0.43428, + "grad_norm": 0.5051240921020508, + "learning_rate": 1.3930433730468161e-05, + "loss": 0.0305, + "step": 21714 + }, + { + "epoch": 0.43432, + "grad_norm": 7.153587341308594, + "learning_rate": 1.3929149800130144e-05, + "loss": 0.3022, + "step": 21716 + }, + { + "epoch": 0.43436, + "grad_norm": 3.5675699710845947, + "learning_rate": 1.392786579319133e-05, + "loss": 0.4156, + "step": 21718 + }, + { + "epoch": 0.4344, + "grad_norm": 0.15269440412521362, + "learning_rate": 1.3926581709676752e-05, + "loss": 0.0031, + "step": 21720 + }, + { + "epoch": 0.43444, + "grad_norm": 2.2694220542907715, + "learning_rate": 1.392529754961144e-05, + "loss": 0.0806, + "step": 21722 + }, + { + "epoch": 0.43448, + "grad_norm": 0.766481339931488, + "learning_rate": 1.3924013313020432e-05, + "loss": 0.0189, + "step": 21724 + }, + { + "epoch": 0.43452, + "grad_norm": 0.836694061756134, + "learning_rate": 1.3922728999928765e-05, + "loss": 0.6438, + "step": 21726 + }, + { + "epoch": 0.43456, + "grad_norm": 5.987422943115234, + "learning_rate": 1.3921444610361475e-05, + "loss": 0.1434, + "step": 21728 + }, + { + "epoch": 0.4346, + "grad_norm": 1.3467838764190674, + "learning_rate": 1.3920160144343604e-05, + "loss": 0.0321, + "step": 21730 + }, + { + "epoch": 0.43464, + "grad_norm": 1.2610383033752441, + "learning_rate": 1.3918875601900194e-05, + "loss": 0.0385, + "step": 21732 + }, + { + "epoch": 0.43468, + "grad_norm": 0.6688830256462097, + "learning_rate": 1.3917590983056284e-05, + "loss": 0.0588, + "step": 21734 + }, + { + "epoch": 0.43472, + "grad_norm": 0.3022245764732361, + "learning_rate": 1.3916306287836924e-05, + "loss": 0.0646, + "step": 21736 + }, + { + "epoch": 0.43476, + "grad_norm": 1.160797357559204, + "learning_rate": 1.3915021516267158e-05, + "loss": 0.0285, + "step": 21738 + }, + { + "epoch": 0.4348, + "grad_norm": 0.6420384645462036, + "learning_rate": 1.3913736668372027e-05, + "loss": 0.0174, + "step": 21740 + }, + { + "epoch": 0.43484, + "grad_norm": 0.040739789605140686, + "learning_rate": 1.3912451744176585e-05, + "loss": 0.0014, + "step": 21742 + }, + { + "epoch": 0.43488, + "grad_norm": 0.2851472795009613, + "learning_rate": 1.3911166743705883e-05, + "loss": 0.0116, + "step": 21744 + }, + { + "epoch": 0.43492, + "grad_norm": 2.3602027893066406, + "learning_rate": 1.3909881666984975e-05, + "loss": 0.0627, + "step": 21746 + }, + { + "epoch": 0.43496, + "grad_norm": 0.17945975065231323, + "learning_rate": 1.3908596514038908e-05, + "loss": 0.0034, + "step": 21748 + }, + { + "epoch": 0.435, + "grad_norm": 0.8111706376075745, + "learning_rate": 1.3907311284892737e-05, + "loss": 0.0172, + "step": 21750 + }, + { + "epoch": 0.43504, + "grad_norm": 0.05050947889685631, + "learning_rate": 1.3906025979571525e-05, + "loss": 0.0199, + "step": 21752 + }, + { + "epoch": 0.43508, + "grad_norm": 1.6318714618682861, + "learning_rate": 1.3904740598100325e-05, + "loss": 0.2197, + "step": 21754 + }, + { + "epoch": 0.43512, + "grad_norm": 0.9612879753112793, + "learning_rate": 1.39034551405042e-05, + "loss": 0.0173, + "step": 21756 + }, + { + "epoch": 0.43516, + "grad_norm": 2.178402900695801, + "learning_rate": 1.39021696068082e-05, + "loss": 0.0613, + "step": 21758 + }, + { + "epoch": 0.4352, + "grad_norm": 2.1915512084960938, + "learning_rate": 1.3900883997037398e-05, + "loss": 0.0376, + "step": 21760 + }, + { + "epoch": 0.43524, + "grad_norm": 0.5786200165748596, + "learning_rate": 1.3899598311216855e-05, + "loss": 0.1989, + "step": 21762 + }, + { + "epoch": 0.43528, + "grad_norm": 0.11308538913726807, + "learning_rate": 1.3898312549371636e-05, + "loss": 0.0548, + "step": 21764 + }, + { + "epoch": 0.43532, + "grad_norm": 0.2586745619773865, + "learning_rate": 1.3897026711526804e-05, + "loss": 0.0063, + "step": 21766 + }, + { + "epoch": 0.43536, + "grad_norm": 3.6141326427459717, + "learning_rate": 1.3895740797707431e-05, + "loss": 0.0657, + "step": 21768 + }, + { + "epoch": 0.4354, + "grad_norm": 0.06646331399679184, + "learning_rate": 1.3894454807938587e-05, + "loss": 0.0014, + "step": 21770 + }, + { + "epoch": 0.43544, + "grad_norm": 0.23824086785316467, + "learning_rate": 1.3893168742245343e-05, + "loss": 0.0108, + "step": 21772 + }, + { + "epoch": 0.43548, + "grad_norm": 0.0879829004406929, + "learning_rate": 1.3891882600652765e-05, + "loss": 0.013, + "step": 21774 + }, + { + "epoch": 0.43552, + "grad_norm": 0.4631655514240265, + "learning_rate": 1.3890596383185934e-05, + "loss": 0.184, + "step": 21776 + }, + { + "epoch": 0.43556, + "grad_norm": 2.878438711166382, + "learning_rate": 1.3889310089869925e-05, + "loss": 0.0471, + "step": 21778 + }, + { + "epoch": 0.4356, + "grad_norm": 0.011387491598725319, + "learning_rate": 1.388802372072981e-05, + "loss": 0.0106, + "step": 21780 + }, + { + "epoch": 0.43564, + "grad_norm": 8.3805513381958, + "learning_rate": 1.3886737275790677e-05, + "loss": 0.2842, + "step": 21782 + }, + { + "epoch": 0.43568, + "grad_norm": 3.852874279022217, + "learning_rate": 1.3885450755077595e-05, + "loss": 0.0665, + "step": 21784 + }, + { + "epoch": 0.43572, + "grad_norm": 0.5800528526306152, + "learning_rate": 1.3884164158615652e-05, + "loss": 0.2773, + "step": 21786 + }, + { + "epoch": 0.43576, + "grad_norm": 7.950834274291992, + "learning_rate": 1.3882877486429926e-05, + "loss": 0.4578, + "step": 21788 + }, + { + "epoch": 0.4358, + "grad_norm": 0.6548236608505249, + "learning_rate": 1.3881590738545508e-05, + "loss": 0.019, + "step": 21790 + }, + { + "epoch": 0.43584, + "grad_norm": 0.1941821277141571, + "learning_rate": 1.3880303914987478e-05, + "loss": 0.0818, + "step": 21792 + }, + { + "epoch": 0.43588, + "grad_norm": 3.9743740558624268, + "learning_rate": 1.3879017015780929e-05, + "loss": 0.1291, + "step": 21794 + }, + { + "epoch": 0.43592, + "grad_norm": 0.08799915760755539, + "learning_rate": 1.3877730040950943e-05, + "loss": 0.0019, + "step": 21796 + }, + { + "epoch": 0.43596, + "grad_norm": 5.911798477172852, + "learning_rate": 1.3876442990522612e-05, + "loss": 0.0928, + "step": 21798 + }, + { + "epoch": 0.436, + "grad_norm": 12.067059516906738, + "learning_rate": 1.3875155864521031e-05, + "loss": 0.4928, + "step": 21800 + }, + { + "epoch": 0.43604, + "grad_norm": 0.0459207259118557, + "learning_rate": 1.3873868662971292e-05, + "loss": 0.0022, + "step": 21802 + }, + { + "epoch": 0.43608, + "grad_norm": 0.035365816205739975, + "learning_rate": 1.3872581385898486e-05, + "loss": 0.0882, + "step": 21804 + }, + { + "epoch": 0.43612, + "grad_norm": 0.3708675801753998, + "learning_rate": 1.3871294033327718e-05, + "loss": 0.0918, + "step": 21806 + }, + { + "epoch": 0.43616, + "grad_norm": 0.12501436471939087, + "learning_rate": 1.3870006605284077e-05, + "loss": 0.0121, + "step": 21808 + }, + { + "epoch": 0.4362, + "grad_norm": 0.5882879495620728, + "learning_rate": 1.3868719101792664e-05, + "loss": 0.014, + "step": 21810 + }, + { + "epoch": 0.43624, + "grad_norm": 1.2193182706832886, + "learning_rate": 1.386743152287858e-05, + "loss": 0.027, + "step": 21812 + }, + { + "epoch": 0.43628, + "grad_norm": 0.9006592631340027, + "learning_rate": 1.3866143868566929e-05, + "loss": 0.0201, + "step": 21814 + }, + { + "epoch": 0.43632, + "grad_norm": 1.793985366821289, + "learning_rate": 1.386485613888281e-05, + "loss": 0.0455, + "step": 21816 + }, + { + "epoch": 0.43636, + "grad_norm": 0.2008230984210968, + "learning_rate": 1.3863568333851334e-05, + "loss": 0.0084, + "step": 21818 + }, + { + "epoch": 0.4364, + "grad_norm": 0.3567597568035126, + "learning_rate": 1.3862280453497601e-05, + "loss": 0.0112, + "step": 21820 + }, + { + "epoch": 0.43644, + "grad_norm": 0.09280122816562653, + "learning_rate": 1.3860992497846723e-05, + "loss": 0.0106, + "step": 21822 + }, + { + "epoch": 0.43648, + "grad_norm": 0.28987154364585876, + "learning_rate": 1.3859704466923812e-05, + "loss": 0.0148, + "step": 21824 + }, + { + "epoch": 0.43652, + "grad_norm": 0.6342856287956238, + "learning_rate": 1.3858416360753973e-05, + "loss": 0.2437, + "step": 21826 + }, + { + "epoch": 0.43656, + "grad_norm": 0.09779674559831619, + "learning_rate": 1.385712817936232e-05, + "loss": 0.0027, + "step": 21828 + }, + { + "epoch": 0.4366, + "grad_norm": 4.4563212394714355, + "learning_rate": 1.3855839922773968e-05, + "loss": 0.073, + "step": 21830 + }, + { + "epoch": 0.43664, + "grad_norm": 0.039782628417015076, + "learning_rate": 1.3854551591014029e-05, + "loss": 0.0097, + "step": 21832 + }, + { + "epoch": 0.43668, + "grad_norm": 0.43498721718788147, + "learning_rate": 1.3853263184107624e-05, + "loss": 0.0616, + "step": 21834 + }, + { + "epoch": 0.43672, + "grad_norm": 0.0446365624666214, + "learning_rate": 1.3851974702079871e-05, + "loss": 0.2514, + "step": 21836 + }, + { + "epoch": 0.43676, + "grad_norm": 1.4965838193893433, + "learning_rate": 1.3850686144955887e-05, + "loss": 0.029, + "step": 21838 + }, + { + "epoch": 0.4368, + "grad_norm": 4.479378700256348, + "learning_rate": 1.3849397512760797e-05, + "loss": 0.1006, + "step": 21840 + }, + { + "epoch": 0.43684, + "grad_norm": 0.05002667382359505, + "learning_rate": 1.3848108805519716e-05, + "loss": 0.0801, + "step": 21842 + }, + { + "epoch": 0.43688, + "grad_norm": 1.3610866069793701, + "learning_rate": 1.3846820023257777e-05, + "loss": 0.0219, + "step": 21844 + }, + { + "epoch": 0.43692, + "grad_norm": 1.1453207731246948, + "learning_rate": 1.3845531166000097e-05, + "loss": 0.0191, + "step": 21846 + }, + { + "epoch": 0.43696, + "grad_norm": 0.5640309453010559, + "learning_rate": 1.384424223377181e-05, + "loss": 0.0214, + "step": 21848 + }, + { + "epoch": 0.437, + "grad_norm": 0.17144815623760223, + "learning_rate": 1.3842953226598036e-05, + "loss": 0.3758, + "step": 21850 + }, + { + "epoch": 0.43704, + "grad_norm": 0.061355721205472946, + "learning_rate": 1.3841664144503919e-05, + "loss": 0.0199, + "step": 21852 + }, + { + "epoch": 0.43708, + "grad_norm": 1.1758824586868286, + "learning_rate": 1.3840374987514576e-05, + "loss": 0.0261, + "step": 21854 + }, + { + "epoch": 0.43712, + "grad_norm": 0.8748311996459961, + "learning_rate": 1.3839085755655146e-05, + "loss": 0.0157, + "step": 21856 + }, + { + "epoch": 0.43716, + "grad_norm": 3.4473369121551514, + "learning_rate": 1.3837796448950765e-05, + "loss": 0.0452, + "step": 21858 + }, + { + "epoch": 0.4372, + "grad_norm": 0.04685530066490173, + "learning_rate": 1.3836507067426565e-05, + "loss": 0.0025, + "step": 21860 + }, + { + "epoch": 0.43724, + "grad_norm": 1.9481794834136963, + "learning_rate": 1.3835217611107686e-05, + "loss": 0.4193, + "step": 21862 + }, + { + "epoch": 0.43728, + "grad_norm": 7.963339328765869, + "learning_rate": 1.3833928080019262e-05, + "loss": 0.1406, + "step": 21864 + }, + { + "epoch": 0.43732, + "grad_norm": 0.332195907831192, + "learning_rate": 1.3832638474186438e-05, + "loss": 0.0191, + "step": 21866 + }, + { + "epoch": 0.43736, + "grad_norm": 0.5163422226905823, + "learning_rate": 1.3831348793634354e-05, + "loss": 0.1239, + "step": 21868 + }, + { + "epoch": 0.4374, + "grad_norm": 0.4803784191608429, + "learning_rate": 1.3830059038388153e-05, + "loss": 0.0083, + "step": 21870 + }, + { + "epoch": 0.43744, + "grad_norm": 0.19090314209461212, + "learning_rate": 1.3828769208472976e-05, + "loss": 0.0081, + "step": 21872 + }, + { + "epoch": 0.43748, + "grad_norm": 6.583742141723633, + "learning_rate": 1.3827479303913978e-05, + "loss": 0.189, + "step": 21874 + }, + { + "epoch": 0.43752, + "grad_norm": 10.325713157653809, + "learning_rate": 1.3826189324736294e-05, + "loss": 0.7277, + "step": 21876 + }, + { + "epoch": 0.43756, + "grad_norm": 7.287636756896973, + "learning_rate": 1.3824899270965087e-05, + "loss": 0.0821, + "step": 21878 + }, + { + "epoch": 0.4376, + "grad_norm": 3.555633306503296, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.0771, + "step": 21880 + }, + { + "epoch": 0.43764, + "grad_norm": 0.25082236528396606, + "learning_rate": 1.382231893974267e-05, + "loss": 0.0034, + "step": 21882 + }, + { + "epoch": 0.43768, + "grad_norm": 3.6382205486297607, + "learning_rate": 1.3821028662341776e-05, + "loss": 0.0798, + "step": 21884 + }, + { + "epoch": 0.43772, + "grad_norm": 0.1806647926568985, + "learning_rate": 1.3819738310447956e-05, + "loss": 0.0046, + "step": 21886 + }, + { + "epoch": 0.43776, + "grad_norm": 2.758450984954834, + "learning_rate": 1.3818447884086375e-05, + "loss": 0.0621, + "step": 21888 + }, + { + "epoch": 0.4378, + "grad_norm": 10.490538597106934, + "learning_rate": 1.3817157383282184e-05, + "loss": 0.3054, + "step": 21890 + }, + { + "epoch": 0.43784, + "grad_norm": 0.1030542254447937, + "learning_rate": 1.3815866808060548e-05, + "loss": 0.0319, + "step": 21892 + }, + { + "epoch": 0.43788, + "grad_norm": 0.4868571162223816, + "learning_rate": 1.3814576158446623e-05, + "loss": 0.0405, + "step": 21894 + }, + { + "epoch": 0.43792, + "grad_norm": 0.16732919216156006, + "learning_rate": 1.3813285434465572e-05, + "loss": 0.0239, + "step": 21896 + }, + { + "epoch": 0.43796, + "grad_norm": 1.426781415939331, + "learning_rate": 1.3811994636142556e-05, + "loss": 0.0176, + "step": 21898 + }, + { + "epoch": 0.438, + "grad_norm": 0.6091141700744629, + "learning_rate": 1.3810703763502744e-05, + "loss": 0.0316, + "step": 21900 + }, + { + "epoch": 0.43804, + "grad_norm": 2.3138210773468018, + "learning_rate": 1.3809412816571298e-05, + "loss": 0.047, + "step": 21902 + }, + { + "epoch": 0.43808, + "grad_norm": 5.329185962677002, + "learning_rate": 1.380812179537339e-05, + "loss": 0.1165, + "step": 21904 + }, + { + "epoch": 0.43812, + "grad_norm": 3.4736196994781494, + "learning_rate": 1.3806830699934183e-05, + "loss": 0.3945, + "step": 21906 + }, + { + "epoch": 0.43816, + "grad_norm": 0.17808333039283752, + "learning_rate": 1.3805539530278858e-05, + "loss": 0.016, + "step": 21908 + }, + { + "epoch": 0.4382, + "grad_norm": 0.04077162966132164, + "learning_rate": 1.3804248286432577e-05, + "loss": 0.0064, + "step": 21910 + }, + { + "epoch": 0.43824, + "grad_norm": 6.417949676513672, + "learning_rate": 1.380295696842052e-05, + "loss": 0.1849, + "step": 21912 + }, + { + "epoch": 0.43828, + "grad_norm": 0.03370688855648041, + "learning_rate": 1.3801665576267856e-05, + "loss": 0.0099, + "step": 21914 + }, + { + "epoch": 0.43832, + "grad_norm": 3.7570676803588867, + "learning_rate": 1.3800374109999766e-05, + "loss": 0.0816, + "step": 21916 + }, + { + "epoch": 0.43836, + "grad_norm": 4.640753269195557, + "learning_rate": 1.3799082569641425e-05, + "loss": 0.0902, + "step": 21918 + }, + { + "epoch": 0.4384, + "grad_norm": 1.0571603775024414, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.0205, + "step": 21920 + }, + { + "epoch": 0.43844, + "grad_norm": 0.711433470249176, + "learning_rate": 1.379649926675471e-05, + "loss": 0.039, + "step": 21922 + }, + { + "epoch": 0.43848, + "grad_norm": 5.714783668518066, + "learning_rate": 1.3795207504276702e-05, + "loss": 0.1578, + "step": 21924 + }, + { + "epoch": 0.43852, + "grad_norm": 1.3160879611968994, + "learning_rate": 1.3793915667809171e-05, + "loss": 0.0319, + "step": 21926 + }, + { + "epoch": 0.43856, + "grad_norm": 3.3807337284088135, + "learning_rate": 1.3792623757377297e-05, + "loss": 0.0721, + "step": 21928 + }, + { + "epoch": 0.4386, + "grad_norm": 7.959221363067627, + "learning_rate": 1.3791331773006272e-05, + "loss": 0.1932, + "step": 21930 + }, + { + "epoch": 0.43864, + "grad_norm": 0.05442311614751816, + "learning_rate": 1.3790039714721282e-05, + "loss": 0.0089, + "step": 21932 + }, + { + "epoch": 0.43868, + "grad_norm": 0.07380429655313492, + "learning_rate": 1.3788747582547517e-05, + "loss": 0.004, + "step": 21934 + }, + { + "epoch": 0.43872, + "grad_norm": 1.656934380531311, + "learning_rate": 1.3787455376510167e-05, + "loss": 0.1768, + "step": 21936 + }, + { + "epoch": 0.43876, + "grad_norm": 0.7802779078483582, + "learning_rate": 1.3786163096634424e-05, + "loss": 0.0422, + "step": 21938 + }, + { + "epoch": 0.4388, + "grad_norm": 4.160977363586426, + "learning_rate": 1.3784870742945482e-05, + "loss": 0.0997, + "step": 21940 + }, + { + "epoch": 0.43884, + "grad_norm": 0.379193514585495, + "learning_rate": 1.3783578315468537e-05, + "loss": 0.0069, + "step": 21942 + }, + { + "epoch": 0.43888, + "grad_norm": 5.2355499267578125, + "learning_rate": 1.3782285814228783e-05, + "loss": 0.3053, + "step": 21944 + }, + { + "epoch": 0.43892, + "grad_norm": 6.081615447998047, + "learning_rate": 1.3780993239251425e-05, + "loss": 0.1971, + "step": 21946 + }, + { + "epoch": 0.43896, + "grad_norm": 0.7994007468223572, + "learning_rate": 1.3779700590561653e-05, + "loss": 0.0283, + "step": 21948 + }, + { + "epoch": 0.439, + "grad_norm": 9.508234024047852, + "learning_rate": 1.3778407868184674e-05, + "loss": 0.359, + "step": 21950 + }, + { + "epoch": 0.43904, + "grad_norm": 0.0341409407556057, + "learning_rate": 1.3777115072145686e-05, + "loss": 0.0023, + "step": 21952 + }, + { + "epoch": 0.43908, + "grad_norm": 1.6025769710540771, + "learning_rate": 1.3775822202469895e-05, + "loss": 0.027, + "step": 21954 + }, + { + "epoch": 0.43912, + "grad_norm": 0.23223210871219635, + "learning_rate": 1.3774529259182508e-05, + "loss": 0.0173, + "step": 21956 + }, + { + "epoch": 0.43916, + "grad_norm": 0.2867533564567566, + "learning_rate": 1.3773236242308729e-05, + "loss": 0.0161, + "step": 21958 + }, + { + "epoch": 0.4392, + "grad_norm": 0.010575160384178162, + "learning_rate": 1.3771943151873768e-05, + "loss": 0.0963, + "step": 21960 + }, + { + "epoch": 0.43924, + "grad_norm": 0.09005050361156464, + "learning_rate": 1.3770649987902833e-05, + "loss": 0.0037, + "step": 21962 + }, + { + "epoch": 0.43928, + "grad_norm": 0.02397659234702587, + "learning_rate": 1.3769356750421135e-05, + "loss": 0.1383, + "step": 21964 + }, + { + "epoch": 0.43932, + "grad_norm": 8.075858116149902, + "learning_rate": 1.3768063439453886e-05, + "loss": 0.4106, + "step": 21966 + }, + { + "epoch": 0.43936, + "grad_norm": 0.46485695242881775, + "learning_rate": 1.3766770055026302e-05, + "loss": 0.0078, + "step": 21968 + }, + { + "epoch": 0.4394, + "grad_norm": 0.9396723508834839, + "learning_rate": 1.3765476597163595e-05, + "loss": 0.0229, + "step": 21970 + }, + { + "epoch": 0.43944, + "grad_norm": 1.0576955080032349, + "learning_rate": 1.3764183065890982e-05, + "loss": 0.0208, + "step": 21972 + }, + { + "epoch": 0.43948, + "grad_norm": 0.1935000717639923, + "learning_rate": 1.3762889461233683e-05, + "loss": 0.0068, + "step": 21974 + }, + { + "epoch": 0.43952, + "grad_norm": 0.13666649162769318, + "learning_rate": 1.3761595783216916e-05, + "loss": 0.0092, + "step": 21976 + }, + { + "epoch": 0.43956, + "grad_norm": 0.1040896326303482, + "learning_rate": 1.3760302031865904e-05, + "loss": 0.002, + "step": 21978 + }, + { + "epoch": 0.4396, + "grad_norm": 0.6441869139671326, + "learning_rate": 1.3759008207205869e-05, + "loss": 0.0106, + "step": 21980 + }, + { + "epoch": 0.43964, + "grad_norm": 4.732868194580078, + "learning_rate": 1.3757714309262033e-05, + "loss": 0.0855, + "step": 21982 + }, + { + "epoch": 0.43968, + "grad_norm": 8.96866512298584, + "learning_rate": 1.3756420338059622e-05, + "loss": 0.3374, + "step": 21984 + }, + { + "epoch": 0.43972, + "grad_norm": 0.11279227584600449, + "learning_rate": 1.3755126293623862e-05, + "loss": 0.003, + "step": 21986 + }, + { + "epoch": 0.43976, + "grad_norm": 2.5251636505126953, + "learning_rate": 1.375383217597998e-05, + "loss": 0.0883, + "step": 21988 + }, + { + "epoch": 0.4398, + "grad_norm": 1.6845054626464844, + "learning_rate": 1.375253798515321e-05, + "loss": 0.0321, + "step": 21990 + }, + { + "epoch": 0.43984, + "grad_norm": 0.16152961552143097, + "learning_rate": 1.3751243721168778e-05, + "loss": 0.0037, + "step": 21992 + }, + { + "epoch": 0.43988, + "grad_norm": 0.5077049136161804, + "learning_rate": 1.3749949384051919e-05, + "loss": 0.0083, + "step": 21994 + }, + { + "epoch": 0.43992, + "grad_norm": 0.39719781279563904, + "learning_rate": 1.3748654973827866e-05, + "loss": 0.0113, + "step": 21996 + }, + { + "epoch": 0.43996, + "grad_norm": 2.8250787258148193, + "learning_rate": 1.3747360490521857e-05, + "loss": 0.0381, + "step": 21998 + }, + { + "epoch": 0.44, + "grad_norm": 2.1998422145843506, + "learning_rate": 1.3746065934159123e-05, + "loss": 0.0396, + "step": 22000 + }, + { + "epoch": 0.44004, + "grad_norm": 0.20964863896369934, + "learning_rate": 1.3744771304764904e-05, + "loss": 0.5127, + "step": 22002 + }, + { + "epoch": 0.44008, + "grad_norm": 0.0017498733941465616, + "learning_rate": 1.3743476602364442e-05, + "loss": 0.0149, + "step": 22004 + }, + { + "epoch": 0.44012, + "grad_norm": 4.037510395050049, + "learning_rate": 1.374218182698298e-05, + "loss": 0.0779, + "step": 22006 + }, + { + "epoch": 0.44016, + "grad_norm": 0.06948761641979218, + "learning_rate": 1.374088697864575e-05, + "loss": 0.0027, + "step": 22008 + }, + { + "epoch": 0.4402, + "grad_norm": 0.8306567072868347, + "learning_rate": 1.3739592057378005e-05, + "loss": 0.3317, + "step": 22010 + }, + { + "epoch": 0.44024, + "grad_norm": 1.1217856407165527, + "learning_rate": 1.3738297063204989e-05, + "loss": 0.2622, + "step": 22012 + }, + { + "epoch": 0.44028, + "grad_norm": 0.38303372263908386, + "learning_rate": 1.3737001996151946e-05, + "loss": 0.0161, + "step": 22014 + }, + { + "epoch": 0.44032, + "grad_norm": 0.14913024008274078, + "learning_rate": 1.3735706856244127e-05, + "loss": 0.0066, + "step": 22016 + }, + { + "epoch": 0.44036, + "grad_norm": 0.013138970360159874, + "learning_rate": 1.3734411643506778e-05, + "loss": 0.0011, + "step": 22018 + }, + { + "epoch": 0.4404, + "grad_norm": 0.6656855940818787, + "learning_rate": 1.373311635796515e-05, + "loss": 0.0136, + "step": 22020 + }, + { + "epoch": 0.44044, + "grad_norm": 0.3214743733406067, + "learning_rate": 1.37318209996445e-05, + "loss": 0.0576, + "step": 22022 + }, + { + "epoch": 0.44048, + "grad_norm": 0.7134259343147278, + "learning_rate": 1.3730525568570075e-05, + "loss": 0.3144, + "step": 22024 + }, + { + "epoch": 0.44052, + "grad_norm": 0.2677977383136749, + "learning_rate": 1.3729230064767134e-05, + "loss": 0.0233, + "step": 22026 + }, + { + "epoch": 0.44056, + "grad_norm": 0.013845080509781837, + "learning_rate": 1.3727934488260934e-05, + "loss": 0.0046, + "step": 22028 + }, + { + "epoch": 0.4406, + "grad_norm": 0.9421234726905823, + "learning_rate": 1.3726638839076732e-05, + "loss": 0.1632, + "step": 22030 + }, + { + "epoch": 0.44064, + "grad_norm": 0.0659535825252533, + "learning_rate": 1.3725343117239788e-05, + "loss": 0.006, + "step": 22032 + }, + { + "epoch": 0.44068, + "grad_norm": 3.569798707962036, + "learning_rate": 1.372404732277536e-05, + "loss": 0.0724, + "step": 22034 + }, + { + "epoch": 0.44072, + "grad_norm": 0.1058008000254631, + "learning_rate": 1.3722751455708712e-05, + "loss": 0.0436, + "step": 22036 + }, + { + "epoch": 0.44076, + "grad_norm": 0.11046282202005386, + "learning_rate": 1.372145551606511e-05, + "loss": 0.0018, + "step": 22038 + }, + { + "epoch": 0.4408, + "grad_norm": 3.8142507076263428, + "learning_rate": 1.3720159503869816e-05, + "loss": 0.072, + "step": 22040 + }, + { + "epoch": 0.44084, + "grad_norm": 0.04750274121761322, + "learning_rate": 1.3718863419148097e-05, + "loss": 0.01, + "step": 22042 + }, + { + "epoch": 0.44088, + "grad_norm": 0.30417048931121826, + "learning_rate": 1.371756726192522e-05, + "loss": 0.0108, + "step": 22044 + }, + { + "epoch": 0.44092, + "grad_norm": 0.6705209016799927, + "learning_rate": 1.3716271032226453e-05, + "loss": 0.0106, + "step": 22046 + }, + { + "epoch": 0.44096, + "grad_norm": 0.07671104371547699, + "learning_rate": 1.3714974730077074e-05, + "loss": 0.0078, + "step": 22048 + }, + { + "epoch": 0.441, + "grad_norm": 4.544712066650391, + "learning_rate": 1.371367835550235e-05, + "loss": 0.0867, + "step": 22050 + }, + { + "epoch": 0.44104, + "grad_norm": 0.03699319437146187, + "learning_rate": 1.3712381908527552e-05, + "loss": 0.0055, + "step": 22052 + }, + { + "epoch": 0.44108, + "grad_norm": 0.07326368242502213, + "learning_rate": 1.3711085389177958e-05, + "loss": 0.2197, + "step": 22054 + }, + { + "epoch": 0.44112, + "grad_norm": 0.06111101433634758, + "learning_rate": 1.3709788797478844e-05, + "loss": 0.0885, + "step": 22056 + }, + { + "epoch": 0.44116, + "grad_norm": 8.69036865234375, + "learning_rate": 1.3708492133455486e-05, + "loss": 0.2856, + "step": 22058 + }, + { + "epoch": 0.4412, + "grad_norm": 0.037652671337127686, + "learning_rate": 1.3707195397133165e-05, + "loss": 0.0008, + "step": 22060 + }, + { + "epoch": 0.44124, + "grad_norm": 1.551648497581482, + "learning_rate": 1.3705898588537164e-05, + "loss": 0.0539, + "step": 22062 + }, + { + "epoch": 0.44128, + "grad_norm": 1.8103842735290527, + "learning_rate": 1.3704601707692762e-05, + "loss": 0.3248, + "step": 22064 + }, + { + "epoch": 0.44132, + "grad_norm": 0.3787986934185028, + "learning_rate": 1.3703304754625241e-05, + "loss": 0.0078, + "step": 22066 + }, + { + "epoch": 0.44136, + "grad_norm": 10.335736274719238, + "learning_rate": 1.3702007729359885e-05, + "loss": 0.4711, + "step": 22068 + }, + { + "epoch": 0.4414, + "grad_norm": 2.2633233070373535, + "learning_rate": 1.3700710631921984e-05, + "loss": 0.1067, + "step": 22070 + }, + { + "epoch": 0.44144, + "grad_norm": 1.4669080972671509, + "learning_rate": 1.3699413462336826e-05, + "loss": 0.035, + "step": 22072 + }, + { + "epoch": 0.44148, + "grad_norm": 0.27060583233833313, + "learning_rate": 1.3698116220629696e-05, + "loss": 0.0084, + "step": 22074 + }, + { + "epoch": 0.44152, + "grad_norm": 0.4120027720928192, + "learning_rate": 1.3696818906825886e-05, + "loss": 0.0156, + "step": 22076 + }, + { + "epoch": 0.44156, + "grad_norm": 4.866480827331543, + "learning_rate": 1.3695521520950687e-05, + "loss": 0.0959, + "step": 22078 + }, + { + "epoch": 0.4416, + "grad_norm": 10.54121208190918, + "learning_rate": 1.3694224063029396e-05, + "loss": 0.2723, + "step": 22080 + }, + { + "epoch": 0.44164, + "grad_norm": 0.011630935594439507, + "learning_rate": 1.3692926533087306e-05, + "loss": 0.0008, + "step": 22082 + }, + { + "epoch": 0.44168, + "grad_norm": 0.17522622644901276, + "learning_rate": 1.369162893114971e-05, + "loss": 0.0091, + "step": 22084 + }, + { + "epoch": 0.44172, + "grad_norm": 0.03678895905613899, + "learning_rate": 1.3690331257241907e-05, + "loss": 0.0111, + "step": 22086 + }, + { + "epoch": 0.44176, + "grad_norm": 2.085881471633911, + "learning_rate": 1.3689033511389199e-05, + "loss": 0.0274, + "step": 22088 + }, + { + "epoch": 0.4418, + "grad_norm": 9.018424034118652, + "learning_rate": 1.3687735693616876e-05, + "loss": 0.2071, + "step": 22090 + }, + { + "epoch": 0.44184, + "grad_norm": 0.7894377708435059, + "learning_rate": 1.3686437803950254e-05, + "loss": 0.0129, + "step": 22092 + }, + { + "epoch": 0.44188, + "grad_norm": 0.3216244876384735, + "learning_rate": 1.3685139842414628e-05, + "loss": 0.0058, + "step": 22094 + }, + { + "epoch": 0.44192, + "grad_norm": 3.375828742980957, + "learning_rate": 1.3683841809035303e-05, + "loss": 0.0847, + "step": 22096 + }, + { + "epoch": 0.44196, + "grad_norm": 0.02898956649005413, + "learning_rate": 1.3682543703837583e-05, + "loss": 0.0153, + "step": 22098 + }, + { + "epoch": 0.442, + "grad_norm": 0.5103554725646973, + "learning_rate": 1.3681245526846782e-05, + "loss": 0.1436, + "step": 22100 + }, + { + "epoch": 0.44204, + "grad_norm": 5.07358455657959, + "learning_rate": 1.3679947278088202e-05, + "loss": 0.0905, + "step": 22102 + }, + { + "epoch": 0.44208, + "grad_norm": 0.09821854531764984, + "learning_rate": 1.3678648957587155e-05, + "loss": 0.0232, + "step": 22104 + }, + { + "epoch": 0.44212, + "grad_norm": 0.45767292380332947, + "learning_rate": 1.3677350565368955e-05, + "loss": 0.0073, + "step": 22106 + }, + { + "epoch": 0.44216, + "grad_norm": 0.5723925232887268, + "learning_rate": 1.3676052101458907e-05, + "loss": 0.1581, + "step": 22108 + }, + { + "epoch": 0.4422, + "grad_norm": 0.1452585607767105, + "learning_rate": 1.3674753565882336e-05, + "loss": 0.2058, + "step": 22110 + }, + { + "epoch": 0.44224, + "grad_norm": 0.19196239113807678, + "learning_rate": 1.3673454958664549e-05, + "loss": 0.0125, + "step": 22112 + }, + { + "epoch": 0.44228, + "grad_norm": 0.04162432253360748, + "learning_rate": 1.3672156279830869e-05, + "loss": 0.6907, + "step": 22114 + }, + { + "epoch": 0.44232, + "grad_norm": 1.145473599433899, + "learning_rate": 1.3670857529406609e-05, + "loss": 0.0959, + "step": 22116 + }, + { + "epoch": 0.44236, + "grad_norm": 0.012021801434457302, + "learning_rate": 1.3669558707417095e-05, + "loss": 0.0026, + "step": 22118 + }, + { + "epoch": 0.4424, + "grad_norm": 0.3740341365337372, + "learning_rate": 1.3668259813887644e-05, + "loss": 0.015, + "step": 22120 + }, + { + "epoch": 0.44244, + "grad_norm": 0.03164314851164818, + "learning_rate": 1.3666960848843577e-05, + "loss": 0.1762, + "step": 22122 + }, + { + "epoch": 0.44248, + "grad_norm": 9.273025512695312, + "learning_rate": 1.3665661812310221e-05, + "loss": 0.5713, + "step": 22124 + }, + { + "epoch": 0.44252, + "grad_norm": 0.21377308666706085, + "learning_rate": 1.3664362704312902e-05, + "loss": 0.0411, + "step": 22126 + }, + { + "epoch": 0.44256, + "grad_norm": 0.6587578058242798, + "learning_rate": 1.3663063524876944e-05, + "loss": 0.016, + "step": 22128 + }, + { + "epoch": 0.4426, + "grad_norm": 4.964693546295166, + "learning_rate": 1.3661764274027678e-05, + "loss": 0.3994, + "step": 22130 + }, + { + "epoch": 0.44264, + "grad_norm": 1.0900167226791382, + "learning_rate": 1.366046495179043e-05, + "loss": 0.1086, + "step": 22132 + }, + { + "epoch": 0.44268, + "grad_norm": 0.534684419631958, + "learning_rate": 1.3659165558190539e-05, + "loss": 0.0161, + "step": 22134 + }, + { + "epoch": 0.44272, + "grad_norm": 0.17799623310565948, + "learning_rate": 1.3657866093253327e-05, + "loss": 0.0412, + "step": 22136 + }, + { + "epoch": 0.44276, + "grad_norm": 0.5307139754295349, + "learning_rate": 1.3656566557004134e-05, + "loss": 0.026, + "step": 22138 + }, + { + "epoch": 0.4428, + "grad_norm": 0.13514617085456848, + "learning_rate": 1.365526694946829e-05, + "loss": 0.0497, + "step": 22140 + }, + { + "epoch": 0.44284, + "grad_norm": 1.8316967487335205, + "learning_rate": 1.3653967270671138e-05, + "loss": 0.0771, + "step": 22142 + }, + { + "epoch": 0.44288, + "grad_norm": 0.4635218381881714, + "learning_rate": 1.3652667520638012e-05, + "loss": 0.0227, + "step": 22144 + }, + { + "epoch": 0.44292, + "grad_norm": 8.819931030273438, + "learning_rate": 1.3651367699394253e-05, + "loss": 0.4787, + "step": 22146 + }, + { + "epoch": 0.44296, + "grad_norm": 4.072523593902588, + "learning_rate": 1.3650067806965202e-05, + "loss": 0.0703, + "step": 22148 + }, + { + "epoch": 0.443, + "grad_norm": 0.23171766102313995, + "learning_rate": 1.3648767843376196e-05, + "loss": 0.1788, + "step": 22150 + }, + { + "epoch": 0.44304, + "grad_norm": 0.04267463460564613, + "learning_rate": 1.3647467808652586e-05, + "loss": 0.0309, + "step": 22152 + }, + { + "epoch": 0.44308, + "grad_norm": 0.778616726398468, + "learning_rate": 1.3646167702819714e-05, + "loss": 0.0223, + "step": 22154 + }, + { + "epoch": 0.44312, + "grad_norm": 4.206186294555664, + "learning_rate": 1.3644867525902923e-05, + "loss": 0.1291, + "step": 22156 + }, + { + "epoch": 0.44316, + "grad_norm": 5.7118821144104, + "learning_rate": 1.3643567277927564e-05, + "loss": 0.168, + "step": 22158 + }, + { + "epoch": 0.4432, + "grad_norm": 0.10125558078289032, + "learning_rate": 1.3642266958918985e-05, + "loss": 0.0489, + "step": 22160 + }, + { + "epoch": 0.44324, + "grad_norm": 3.2517571449279785, + "learning_rate": 1.3640966568902534e-05, + "loss": 0.1294, + "step": 22162 + }, + { + "epoch": 0.44328, + "grad_norm": 5.6968584060668945, + "learning_rate": 1.3639666107903566e-05, + "loss": 0.0976, + "step": 22164 + }, + { + "epoch": 0.44332, + "grad_norm": 0.3794493079185486, + "learning_rate": 1.3638365575947434e-05, + "loss": 0.0099, + "step": 22166 + }, + { + "epoch": 0.44336, + "grad_norm": 1.808602213859558, + "learning_rate": 1.3637064973059495e-05, + "loss": 0.0349, + "step": 22168 + }, + { + "epoch": 0.4434, + "grad_norm": 1.2034085988998413, + "learning_rate": 1.36357642992651e-05, + "loss": 0.0252, + "step": 22170 + }, + { + "epoch": 0.44344, + "grad_norm": 1.1613339185714722, + "learning_rate": 1.3634463554589608e-05, + "loss": 0.0279, + "step": 22172 + }, + { + "epoch": 0.44348, + "grad_norm": 4.968079090118408, + "learning_rate": 1.3633162739058377e-05, + "loss": 0.0979, + "step": 22174 + }, + { + "epoch": 0.44352, + "grad_norm": 0.052560027688741684, + "learning_rate": 1.3631861852696768e-05, + "loss": 0.0139, + "step": 22176 + }, + { + "epoch": 0.44356, + "grad_norm": 2.892549753189087, + "learning_rate": 1.3630560895530146e-05, + "loss": 0.0637, + "step": 22178 + }, + { + "epoch": 0.4436, + "grad_norm": 5.993508815765381, + "learning_rate": 1.3629259867583864e-05, + "loss": 0.3256, + "step": 22180 + }, + { + "epoch": 0.44364, + "grad_norm": 0.9036456942558289, + "learning_rate": 1.3627958768883296e-05, + "loss": 0.0254, + "step": 22182 + }, + { + "epoch": 0.44368, + "grad_norm": 0.09063595533370972, + "learning_rate": 1.3626657599453803e-05, + "loss": 0.0094, + "step": 22184 + }, + { + "epoch": 0.44372, + "grad_norm": 1.7135627269744873, + "learning_rate": 1.3625356359320754e-05, + "loss": 0.0242, + "step": 22186 + }, + { + "epoch": 0.44376, + "grad_norm": 2.3486053943634033, + "learning_rate": 1.3624055048509517e-05, + "loss": 0.0228, + "step": 22188 + }, + { + "epoch": 0.4438, + "grad_norm": 6.249374866485596, + "learning_rate": 1.3622753667045459e-05, + "loss": 0.4863, + "step": 22190 + }, + { + "epoch": 0.44384, + "grad_norm": 0.29792940616607666, + "learning_rate": 1.3621452214953954e-05, + "loss": 0.0703, + "step": 22192 + }, + { + "epoch": 0.44388, + "grad_norm": 0.3997501730918884, + "learning_rate": 1.3620150692260374e-05, + "loss": 0.1571, + "step": 22194 + }, + { + "epoch": 0.44392, + "grad_norm": 0.08235745877027512, + "learning_rate": 1.361884909899009e-05, + "loss": 0.0202, + "step": 22196 + }, + { + "epoch": 0.44396, + "grad_norm": 0.03951987624168396, + "learning_rate": 1.361754743516848e-05, + "loss": 0.0018, + "step": 22198 + }, + { + "epoch": 0.444, + "grad_norm": 0.5953903794288635, + "learning_rate": 1.3616245700820922e-05, + "loss": 0.2402, + "step": 22200 + }, + { + "epoch": 0.44404, + "grad_norm": 1.290582299232483, + "learning_rate": 1.3614943895972795e-05, + "loss": 0.0387, + "step": 22202 + }, + { + "epoch": 0.44408, + "grad_norm": 0.004560151137411594, + "learning_rate": 1.3613642020649475e-05, + "loss": 0.0012, + "step": 22204 + }, + { + "epoch": 0.44412, + "grad_norm": 0.2286054939031601, + "learning_rate": 1.361234007487634e-05, + "loss": 0.0567, + "step": 22206 + }, + { + "epoch": 0.44416, + "grad_norm": 0.19514571130275726, + "learning_rate": 1.3611038058678776e-05, + "loss": 0.0039, + "step": 22208 + }, + { + "epoch": 0.4442, + "grad_norm": 1.4990348815917969, + "learning_rate": 1.3609735972082168e-05, + "loss": 0.0428, + "step": 22210 + }, + { + "epoch": 0.44424, + "grad_norm": 0.13674625754356384, + "learning_rate": 1.3608433815111896e-05, + "loss": 0.0481, + "step": 22212 + }, + { + "epoch": 0.44428, + "grad_norm": 0.06633152812719345, + "learning_rate": 1.3607131587793352e-05, + "loss": 0.0037, + "step": 22214 + }, + { + "epoch": 0.44432, + "grad_norm": 0.11309555172920227, + "learning_rate": 1.3605829290151917e-05, + "loss": 0.0149, + "step": 22216 + }, + { + "epoch": 0.44436, + "grad_norm": 1.3019574880599976, + "learning_rate": 1.3604526922212989e-05, + "loss": 0.0262, + "step": 22218 + }, + { + "epoch": 0.4444, + "grad_norm": 0.20580363273620605, + "learning_rate": 1.3603224484001949e-05, + "loss": 0.0691, + "step": 22220 + }, + { + "epoch": 0.44444, + "grad_norm": 0.25676608085632324, + "learning_rate": 1.3601921975544192e-05, + "loss": 0.0144, + "step": 22222 + }, + { + "epoch": 0.44448, + "grad_norm": 0.19006872177124023, + "learning_rate": 1.3600619396865114e-05, + "loss": 0.1538, + "step": 22224 + }, + { + "epoch": 0.44452, + "grad_norm": 0.3314298987388611, + "learning_rate": 1.3599316747990107e-05, + "loss": 0.0433, + "step": 22226 + }, + { + "epoch": 0.44456, + "grad_norm": 4.640695571899414, + "learning_rate": 1.3598014028944567e-05, + "loss": 0.061, + "step": 22228 + }, + { + "epoch": 0.4446, + "grad_norm": 0.03250709921121597, + "learning_rate": 1.3596711239753889e-05, + "loss": 0.0242, + "step": 22230 + }, + { + "epoch": 0.44464, + "grad_norm": 2.7566699981689453, + "learning_rate": 1.3595408380443473e-05, + "loss": 0.0421, + "step": 22232 + }, + { + "epoch": 0.44468, + "grad_norm": 0.288393497467041, + "learning_rate": 1.3594105451038723e-05, + "loss": 0.0096, + "step": 22234 + }, + { + "epoch": 0.44472, + "grad_norm": 1.8727314472198486, + "learning_rate": 1.3592802451565035e-05, + "loss": 0.1175, + "step": 22236 + }, + { + "epoch": 0.44476, + "grad_norm": 1.2293200492858887, + "learning_rate": 1.3591499382047815e-05, + "loss": 0.0236, + "step": 22238 + }, + { + "epoch": 0.4448, + "grad_norm": 0.03415056690573692, + "learning_rate": 1.3590196242512463e-05, + "loss": 0.0342, + "step": 22240 + }, + { + "epoch": 0.44484, + "grad_norm": 0.2329597920179367, + "learning_rate": 1.358889303298439e-05, + "loss": 0.0081, + "step": 22242 + }, + { + "epoch": 0.44488, + "grad_norm": 0.23514150083065033, + "learning_rate": 1.3587589753488999e-05, + "loss": 0.005, + "step": 22244 + }, + { + "epoch": 0.44492, + "grad_norm": 0.3872077763080597, + "learning_rate": 1.3586286404051696e-05, + "loss": 0.1451, + "step": 22246 + }, + { + "epoch": 0.44496, + "grad_norm": 0.027852654457092285, + "learning_rate": 1.3584982984697894e-05, + "loss": 0.0048, + "step": 22248 + }, + { + "epoch": 0.445, + "grad_norm": 0.14147837460041046, + "learning_rate": 1.3583679495453e-05, + "loss": 0.1288, + "step": 22250 + }, + { + "epoch": 0.44504, + "grad_norm": 3.9015283584594727, + "learning_rate": 1.3582375936342438e-05, + "loss": 0.4069, + "step": 22252 + }, + { + "epoch": 0.44508, + "grad_norm": 0.33804479241371155, + "learning_rate": 1.3581072307391607e-05, + "loss": 0.0574, + "step": 22254 + }, + { + "epoch": 0.44512, + "grad_norm": 1.1713398694992065, + "learning_rate": 1.357976860862593e-05, + "loss": 0.0239, + "step": 22256 + }, + { + "epoch": 0.44516, + "grad_norm": 0.01981295645236969, + "learning_rate": 1.357846484007082e-05, + "loss": 0.0012, + "step": 22258 + }, + { + "epoch": 0.4452, + "grad_norm": 0.015103516168892384, + "learning_rate": 1.3577161001751696e-05, + "loss": 0.0021, + "step": 22260 + }, + { + "epoch": 0.44524, + "grad_norm": 0.09153374284505844, + "learning_rate": 1.3575857093693974e-05, + "loss": 0.0597, + "step": 22262 + }, + { + "epoch": 0.44528, + "grad_norm": 2.9966578483581543, + "learning_rate": 1.357455311592308e-05, + "loss": 0.0534, + "step": 22264 + }, + { + "epoch": 0.44532, + "grad_norm": 1.7195632457733154, + "learning_rate": 1.357324906846443e-05, + "loss": 0.04, + "step": 22266 + }, + { + "epoch": 0.44536, + "grad_norm": 5.865781784057617, + "learning_rate": 1.3571944951343452e-05, + "loss": 0.2838, + "step": 22268 + }, + { + "epoch": 0.4454, + "grad_norm": 0.04826553165912628, + "learning_rate": 1.3570640764585567e-05, + "loss": 0.7114, + "step": 22270 + }, + { + "epoch": 0.44544, + "grad_norm": 0.43066295981407166, + "learning_rate": 1.3569336508216205e-05, + "loss": 0.0072, + "step": 22272 + }, + { + "epoch": 0.44548, + "grad_norm": 0.18336810171604156, + "learning_rate": 1.3568032182260786e-05, + "loss": 0.3388, + "step": 22274 + }, + { + "epoch": 0.44552, + "grad_norm": 0.04040505364537239, + "learning_rate": 1.3566727786744744e-05, + "loss": 0.1387, + "step": 22276 + }, + { + "epoch": 0.44556, + "grad_norm": 8.681761741638184, + "learning_rate": 1.356542332169351e-05, + "loss": 0.269, + "step": 22278 + }, + { + "epoch": 0.4456, + "grad_norm": 0.09065070003271103, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.0079, + "step": 22280 + }, + { + "epoch": 0.44564, + "grad_norm": 2.127603530883789, + "learning_rate": 1.3562814183087178e-05, + "loss": 0.05, + "step": 22282 + }, + { + "epoch": 0.44568, + "grad_norm": 0.20709025859832764, + "learning_rate": 1.3561509509582953e-05, + "loss": 0.1413, + "step": 22284 + }, + { + "epoch": 0.44572, + "grad_norm": 9.360221862792969, + "learning_rate": 1.3560204766645262e-05, + "loss": 0.3187, + "step": 22286 + }, + { + "epoch": 0.44576, + "grad_norm": 0.6289005875587463, + "learning_rate": 1.3558899954299548e-05, + "loss": 0.0107, + "step": 22288 + }, + { + "epoch": 0.4458, + "grad_norm": 0.34491831064224243, + "learning_rate": 1.355759507257125e-05, + "loss": 0.0485, + "step": 22290 + }, + { + "epoch": 0.44584, + "grad_norm": 0.40562617778778076, + "learning_rate": 1.3556290121485804e-05, + "loss": 0.0213, + "step": 22292 + }, + { + "epoch": 0.44588, + "grad_norm": 0.09612096101045609, + "learning_rate": 1.355498510106865e-05, + "loss": 0.0047, + "step": 22294 + }, + { + "epoch": 0.44592, + "grad_norm": 0.5384599566459656, + "learning_rate": 1.355368001134523e-05, + "loss": 0.0155, + "step": 22296 + }, + { + "epoch": 0.44596, + "grad_norm": 0.7314568161964417, + "learning_rate": 1.3552374852340987e-05, + "loss": 0.0119, + "step": 22298 + }, + { + "epoch": 0.446, + "grad_norm": 7.085252285003662, + "learning_rate": 1.3551069624081372e-05, + "loss": 0.2722, + "step": 22300 + }, + { + "epoch": 0.44604, + "grad_norm": 0.2850324809551239, + "learning_rate": 1.3549764326591825e-05, + "loss": 0.0189, + "step": 22302 + }, + { + "epoch": 0.44608, + "grad_norm": 1.326620101928711, + "learning_rate": 1.3548458959897793e-05, + "loss": 0.1535, + "step": 22304 + }, + { + "epoch": 0.44612, + "grad_norm": 0.2122139036655426, + "learning_rate": 1.354715352402473e-05, + "loss": 0.0752, + "step": 22306 + }, + { + "epoch": 0.44616, + "grad_norm": 11.154052734375, + "learning_rate": 1.354584801899808e-05, + "loss": 0.4514, + "step": 22308 + }, + { + "epoch": 0.4462, + "grad_norm": 4.992071151733398, + "learning_rate": 1.3544542444843298e-05, + "loss": 0.1794, + "step": 22310 + }, + { + "epoch": 0.44624, + "grad_norm": 0.03439154848456383, + "learning_rate": 1.3543236801585838e-05, + "loss": 0.0123, + "step": 22312 + }, + { + "epoch": 0.44628, + "grad_norm": 0.04648863896727562, + "learning_rate": 1.3541931089251148e-05, + "loss": 0.0078, + "step": 22314 + }, + { + "epoch": 0.44632, + "grad_norm": 0.08277220278978348, + "learning_rate": 1.3540625307864693e-05, + "loss": 0.0258, + "step": 22316 + }, + { + "epoch": 0.44636, + "grad_norm": 0.02463538385927677, + "learning_rate": 1.353931945745192e-05, + "loss": 0.0012, + "step": 22318 + }, + { + "epoch": 0.4464, + "grad_norm": 0.8079087138175964, + "learning_rate": 1.3538013538038295e-05, + "loss": 0.0152, + "step": 22320 + }, + { + "epoch": 0.44644, + "grad_norm": 0.05716508999466896, + "learning_rate": 1.3536707549649272e-05, + "loss": 0.003, + "step": 22322 + }, + { + "epoch": 0.44648, + "grad_norm": 4.026297569274902, + "learning_rate": 1.3535401492310318e-05, + "loss": 0.0721, + "step": 22324 + }, + { + "epoch": 0.44652, + "grad_norm": 0.10320478677749634, + "learning_rate": 1.3534095366046894e-05, + "loss": 0.028, + "step": 22326 + }, + { + "epoch": 0.44656, + "grad_norm": 2.8793888092041016, + "learning_rate": 1.3532789170884455e-05, + "loss": 0.0565, + "step": 22328 + }, + { + "epoch": 0.4466, + "grad_norm": 0.16204951703548431, + "learning_rate": 1.3531482906848474e-05, + "loss": 0.2526, + "step": 22330 + }, + { + "epoch": 0.44664, + "grad_norm": 0.08807549625635147, + "learning_rate": 1.3530176573964418e-05, + "loss": 0.0032, + "step": 22332 + }, + { + "epoch": 0.44668, + "grad_norm": 0.3876563310623169, + "learning_rate": 1.3528870172257751e-05, + "loss": 0.0357, + "step": 22334 + }, + { + "epoch": 0.44672, + "grad_norm": 0.061199869960546494, + "learning_rate": 1.3527563701753943e-05, + "loss": 0.0015, + "step": 22336 + }, + { + "epoch": 0.44676, + "grad_norm": 2.446772336959839, + "learning_rate": 1.3526257162478462e-05, + "loss": 0.0404, + "step": 22338 + }, + { + "epoch": 0.4468, + "grad_norm": 0.12628202140331268, + "learning_rate": 1.3524950554456786e-05, + "loss": 0.0208, + "step": 22340 + }, + { + "epoch": 0.44684, + "grad_norm": 11.800382614135742, + "learning_rate": 1.3523643877714383e-05, + "loss": 0.4212, + "step": 22342 + }, + { + "epoch": 0.44688, + "grad_norm": 0.04353706166148186, + "learning_rate": 1.3522337132276731e-05, + "loss": 0.0017, + "step": 22344 + }, + { + "epoch": 0.44692, + "grad_norm": 1.643513560295105, + "learning_rate": 1.3521030318169299e-05, + "loss": 0.0311, + "step": 22346 + }, + { + "epoch": 0.44696, + "grad_norm": 2.290158271789551, + "learning_rate": 1.351972343541757e-05, + "loss": 0.0482, + "step": 22348 + }, + { + "epoch": 0.447, + "grad_norm": 0.25944170355796814, + "learning_rate": 1.3518416484047018e-05, + "loss": 0.0076, + "step": 22350 + }, + { + "epoch": 0.44704, + "grad_norm": 8.511293411254883, + "learning_rate": 1.3517109464083129e-05, + "loss": 0.2181, + "step": 22352 + }, + { + "epoch": 0.44708, + "grad_norm": 0.12174885720014572, + "learning_rate": 1.3515802375551379e-05, + "loss": 0.004, + "step": 22354 + }, + { + "epoch": 0.44712, + "grad_norm": 8.891485214233398, + "learning_rate": 1.351449521847725e-05, + "loss": 0.3899, + "step": 22356 + }, + { + "epoch": 0.44716, + "grad_norm": 0.32153528928756714, + "learning_rate": 1.351318799288623e-05, + "loss": 0.0065, + "step": 22358 + }, + { + "epoch": 0.4472, + "grad_norm": 1.0665245056152344, + "learning_rate": 1.3511880698803801e-05, + "loss": 0.0138, + "step": 22360 + }, + { + "epoch": 0.44724, + "grad_norm": 0.1873953491449356, + "learning_rate": 1.3510573336255448e-05, + "loss": 0.0038, + "step": 22362 + }, + { + "epoch": 0.44728, + "grad_norm": 0.6201633810997009, + "learning_rate": 1.3509265905266661e-05, + "loss": 0.0089, + "step": 22364 + }, + { + "epoch": 0.44732, + "grad_norm": 4.366800308227539, + "learning_rate": 1.3507958405862932e-05, + "loss": 0.1228, + "step": 22366 + }, + { + "epoch": 0.44736, + "grad_norm": 6.1833086013793945, + "learning_rate": 1.3506650838069742e-05, + "loss": 0.1281, + "step": 22368 + }, + { + "epoch": 0.4474, + "grad_norm": 7.795065879821777, + "learning_rate": 1.350534320191259e-05, + "loss": 0.6264, + "step": 22370 + }, + { + "epoch": 0.44744, + "grad_norm": 0.6562732458114624, + "learning_rate": 1.350403549741697e-05, + "loss": 0.0117, + "step": 22372 + }, + { + "epoch": 0.44748, + "grad_norm": 0.5605940818786621, + "learning_rate": 1.3502727724608373e-05, + "loss": 0.3824, + "step": 22374 + }, + { + "epoch": 0.44752, + "grad_norm": 0.022456957027316093, + "learning_rate": 1.3501419883512294e-05, + "loss": 0.065, + "step": 22376 + }, + { + "epoch": 0.44756, + "grad_norm": 1.0072202682495117, + "learning_rate": 1.3500111974154236e-05, + "loss": 0.424, + "step": 22378 + }, + { + "epoch": 0.4476, + "grad_norm": 0.13462594151496887, + "learning_rate": 1.349880399655969e-05, + "loss": 0.3033, + "step": 22380 + }, + { + "epoch": 0.44764, + "grad_norm": 0.1496659517288208, + "learning_rate": 1.349749595075416e-05, + "loss": 0.0053, + "step": 22382 + }, + { + "epoch": 0.44768, + "grad_norm": 8.549918174743652, + "learning_rate": 1.3496187836763144e-05, + "loss": 0.1822, + "step": 22384 + }, + { + "epoch": 0.44772, + "grad_norm": 0.8722949028015137, + "learning_rate": 1.3494879654612145e-05, + "loss": 0.0546, + "step": 22386 + }, + { + "epoch": 0.44776, + "grad_norm": 3.16166353225708, + "learning_rate": 1.3493571404326671e-05, + "loss": 0.0834, + "step": 22388 + }, + { + "epoch": 0.4478, + "grad_norm": 0.32196831703186035, + "learning_rate": 1.3492263085932224e-05, + "loss": 0.2402, + "step": 22390 + }, + { + "epoch": 0.44784, + "grad_norm": 1.0929127931594849, + "learning_rate": 1.3490954699454308e-05, + "loss": 0.0527, + "step": 22392 + }, + { + "epoch": 0.44788, + "grad_norm": 2.369367837905884, + "learning_rate": 1.3489646244918434e-05, + "loss": 0.0946, + "step": 22394 + }, + { + "epoch": 0.44792, + "grad_norm": 0.7660874128341675, + "learning_rate": 1.3488337722350111e-05, + "loss": 0.0253, + "step": 22396 + }, + { + "epoch": 0.44796, + "grad_norm": 4.533940315246582, + "learning_rate": 1.3487029131774848e-05, + "loss": 0.1566, + "step": 22398 + }, + { + "epoch": 0.448, + "grad_norm": 4.911184310913086, + "learning_rate": 1.3485720473218153e-05, + "loss": 0.1851, + "step": 22400 + }, + { + "epoch": 0.44804, + "grad_norm": 2.0255463123321533, + "learning_rate": 1.3484411746705546e-05, + "loss": 0.0471, + "step": 22402 + }, + { + "epoch": 0.44808, + "grad_norm": 0.6635950803756714, + "learning_rate": 1.3483102952262537e-05, + "loss": 0.0182, + "step": 22404 + }, + { + "epoch": 0.44812, + "grad_norm": 0.034267179667949677, + "learning_rate": 1.3481794089914643e-05, + "loss": 0.0056, + "step": 22406 + }, + { + "epoch": 0.44816, + "grad_norm": 0.6712360382080078, + "learning_rate": 1.3480485159687382e-05, + "loss": 0.0168, + "step": 22408 + }, + { + "epoch": 0.4482, + "grad_norm": 0.037604354321956635, + "learning_rate": 1.3479176161606269e-05, + "loss": 0.005, + "step": 22410 + }, + { + "epoch": 0.44824, + "grad_norm": 0.19922903180122375, + "learning_rate": 1.3477867095696829e-05, + "loss": 0.0112, + "step": 22412 + }, + { + "epoch": 0.44828, + "grad_norm": 1.0391736030578613, + "learning_rate": 1.3476557961984578e-05, + "loss": 0.0376, + "step": 22414 + }, + { + "epoch": 0.44832, + "grad_norm": 0.11735395342111588, + "learning_rate": 1.3475248760495037e-05, + "loss": 0.0055, + "step": 22416 + }, + { + "epoch": 0.44836, + "grad_norm": 1.3976548910140991, + "learning_rate": 1.3473939491253734e-05, + "loss": 0.0359, + "step": 22418 + }, + { + "epoch": 0.4484, + "grad_norm": 0.06455808877944946, + "learning_rate": 1.347263015428619e-05, + "loss": 0.0018, + "step": 22420 + }, + { + "epoch": 0.44844, + "grad_norm": 0.04595983028411865, + "learning_rate": 1.3471320749617938e-05, + "loss": 0.0081, + "step": 22422 + }, + { + "epoch": 0.44848, + "grad_norm": 3.3582515716552734, + "learning_rate": 1.3470011277274497e-05, + "loss": 0.0425, + "step": 22424 + }, + { + "epoch": 0.44852, + "grad_norm": 11.605416297912598, + "learning_rate": 1.34687017372814e-05, + "loss": 0.7613, + "step": 22426 + }, + { + "epoch": 0.44856, + "grad_norm": 0.18317227065563202, + "learning_rate": 1.3467392129664179e-05, + "loss": 0.4906, + "step": 22428 + }, + { + "epoch": 0.4486, + "grad_norm": 1.47052001953125, + "learning_rate": 1.3466082454448364e-05, + "loss": 0.0642, + "step": 22430 + }, + { + "epoch": 0.44864, + "grad_norm": 0.046959247440099716, + "learning_rate": 1.3464772711659485e-05, + "loss": 0.0221, + "step": 22432 + }, + { + "epoch": 0.44868, + "grad_norm": 1.9251179695129395, + "learning_rate": 1.3463462901323077e-05, + "loss": 0.331, + "step": 22434 + }, + { + "epoch": 0.44872, + "grad_norm": 0.967789888381958, + "learning_rate": 1.3462153023464675e-05, + "loss": 0.184, + "step": 22436 + }, + { + "epoch": 0.44876, + "grad_norm": 8.923356056213379, + "learning_rate": 1.3460843078109822e-05, + "loss": 0.4117, + "step": 22438 + }, + { + "epoch": 0.4488, + "grad_norm": 6.929161071777344, + "learning_rate": 1.3459533065284049e-05, + "loss": 0.2339, + "step": 22440 + }, + { + "epoch": 0.44884, + "grad_norm": 0.04858009144663811, + "learning_rate": 1.34582229850129e-05, + "loss": 0.0026, + "step": 22442 + }, + { + "epoch": 0.44888, + "grad_norm": 0.05251335725188255, + "learning_rate": 1.3456912837321909e-05, + "loss": 0.0177, + "step": 22444 + }, + { + "epoch": 0.44892, + "grad_norm": 1.57291579246521, + "learning_rate": 1.3455602622236628e-05, + "loss": 0.0365, + "step": 22446 + }, + { + "epoch": 0.44896, + "grad_norm": 5.099557876586914, + "learning_rate": 1.3454292339782594e-05, + "loss": 0.0895, + "step": 22448 + }, + { + "epoch": 0.449, + "grad_norm": 0.06707976758480072, + "learning_rate": 1.3452981989985347e-05, + "loss": 0.5693, + "step": 22450 + }, + { + "epoch": 0.44904, + "grad_norm": 0.5521982908248901, + "learning_rate": 1.3451671572870443e-05, + "loss": 0.0094, + "step": 22452 + }, + { + "epoch": 0.44908, + "grad_norm": 2.491488456726074, + "learning_rate": 1.3450361088463422e-05, + "loss": 0.0574, + "step": 22454 + }, + { + "epoch": 0.44912, + "grad_norm": 3.053924322128296, + "learning_rate": 1.3449050536789839e-05, + "loss": 0.3708, + "step": 22456 + }, + { + "epoch": 0.44916, + "grad_norm": 5.438982009887695, + "learning_rate": 1.3447739917875237e-05, + "loss": 0.1875, + "step": 22458 + }, + { + "epoch": 0.4492, + "grad_norm": 0.6890442371368408, + "learning_rate": 1.344642923174517e-05, + "loss": 0.1189, + "step": 22460 + }, + { + "epoch": 0.44924, + "grad_norm": 7.806150913238525, + "learning_rate": 1.3445118478425195e-05, + "loss": 0.2852, + "step": 22462 + }, + { + "epoch": 0.44928, + "grad_norm": 0.20833025872707367, + "learning_rate": 1.3443807657940859e-05, + "loss": 0.0047, + "step": 22464 + }, + { + "epoch": 0.44932, + "grad_norm": 2.883152723312378, + "learning_rate": 1.3442496770317718e-05, + "loss": 0.1546, + "step": 22466 + }, + { + "epoch": 0.44936, + "grad_norm": 0.20095235109329224, + "learning_rate": 1.3441185815581333e-05, + "loss": 0.1534, + "step": 22468 + }, + { + "epoch": 0.4494, + "grad_norm": 5.960371971130371, + "learning_rate": 1.3439874793757255e-05, + "loss": 0.2364, + "step": 22470 + }, + { + "epoch": 0.44944, + "grad_norm": 0.19958671927452087, + "learning_rate": 1.3438563704871053e-05, + "loss": 0.0203, + "step": 22472 + }, + { + "epoch": 0.44948, + "grad_norm": 3.885439872741699, + "learning_rate": 1.3437252548948277e-05, + "loss": 0.053, + "step": 22474 + }, + { + "epoch": 0.44952, + "grad_norm": 1.0013411045074463, + "learning_rate": 1.343594132601449e-05, + "loss": 0.3786, + "step": 22476 + }, + { + "epoch": 0.44956, + "grad_norm": 0.24294398725032806, + "learning_rate": 1.3434630036095263e-05, + "loss": 0.0207, + "step": 22478 + }, + { + "epoch": 0.4496, + "grad_norm": 0.05002877861261368, + "learning_rate": 1.3433318679216154e-05, + "loss": 0.2199, + "step": 22480 + }, + { + "epoch": 0.44964, + "grad_norm": 0.11976473778486252, + "learning_rate": 1.343200725540273e-05, + "loss": 0.01, + "step": 22482 + }, + { + "epoch": 0.44968, + "grad_norm": 3.107151985168457, + "learning_rate": 1.3430695764680557e-05, + "loss": 0.1302, + "step": 22484 + }, + { + "epoch": 0.44972, + "grad_norm": 1.8162059783935547, + "learning_rate": 1.3429384207075202e-05, + "loss": 0.0867, + "step": 22486 + }, + { + "epoch": 0.44976, + "grad_norm": 1.8745672702789307, + "learning_rate": 1.3428072582612242e-05, + "loss": 0.059, + "step": 22488 + }, + { + "epoch": 0.4498, + "grad_norm": 0.05579938739538193, + "learning_rate": 1.3426760891317236e-05, + "loss": 0.0048, + "step": 22490 + }, + { + "epoch": 0.44984, + "grad_norm": 2.433908700942993, + "learning_rate": 1.3425449133215764e-05, + "loss": 0.2057, + "step": 22492 + }, + { + "epoch": 0.44988, + "grad_norm": 0.4931296408176422, + "learning_rate": 1.3424137308333398e-05, + "loss": 0.0145, + "step": 22494 + }, + { + "epoch": 0.44992, + "grad_norm": 0.3764399588108063, + "learning_rate": 1.3422825416695713e-05, + "loss": 0.0321, + "step": 22496 + }, + { + "epoch": 0.44996, + "grad_norm": 0.13419531285762787, + "learning_rate": 1.3421513458328285e-05, + "loss": 0.0095, + "step": 22498 + }, + { + "epoch": 0.45, + "grad_norm": 2.572575569152832, + "learning_rate": 1.342020143325669e-05, + "loss": 0.3655, + "step": 22500 + }, + { + "epoch": 0.45004, + "grad_norm": 2.167342185974121, + "learning_rate": 1.3418889341506504e-05, + "loss": 0.1105, + "step": 22502 + }, + { + "epoch": 0.45008, + "grad_norm": 0.40076929330825806, + "learning_rate": 1.3417577183103316e-05, + "loss": 0.02, + "step": 22504 + }, + { + "epoch": 0.45012, + "grad_norm": 0.27474889159202576, + "learning_rate": 1.3416264958072697e-05, + "loss": 0.0105, + "step": 22506 + }, + { + "epoch": 0.45016, + "grad_norm": 7.137170314788818, + "learning_rate": 1.3414952666440232e-05, + "loss": 0.2123, + "step": 22508 + }, + { + "epoch": 0.4502, + "grad_norm": 5.459310531616211, + "learning_rate": 1.3413640308231511e-05, + "loss": 0.2927, + "step": 22510 + }, + { + "epoch": 0.45024, + "grad_norm": 0.11864382773637772, + "learning_rate": 1.3412327883472114e-05, + "loss": 0.0446, + "step": 22512 + }, + { + "epoch": 0.45028, + "grad_norm": 1.0370675325393677, + "learning_rate": 1.3411015392187627e-05, + "loss": 0.3051, + "step": 22514 + }, + { + "epoch": 0.45032, + "grad_norm": 0.1380472183227539, + "learning_rate": 1.3409702834403642e-05, + "loss": 0.0136, + "step": 22516 + }, + { + "epoch": 0.45036, + "grad_norm": 0.12099252641201019, + "learning_rate": 1.3408390210145741e-05, + "loss": 0.1186, + "step": 22518 + }, + { + "epoch": 0.4504, + "grad_norm": 0.5273206830024719, + "learning_rate": 1.340707751943952e-05, + "loss": 0.1757, + "step": 22520 + }, + { + "epoch": 0.45044, + "grad_norm": 2.3943631649017334, + "learning_rate": 1.3405764762310567e-05, + "loss": 0.1187, + "step": 22522 + }, + { + "epoch": 0.45048, + "grad_norm": 0.08067600429058075, + "learning_rate": 1.3404451938784475e-05, + "loss": 0.0355, + "step": 22524 + }, + { + "epoch": 0.45052, + "grad_norm": 1.0204651355743408, + "learning_rate": 1.3403139048886842e-05, + "loss": 0.0232, + "step": 22526 + }, + { + "epoch": 0.45056, + "grad_norm": 6.034824848175049, + "learning_rate": 1.3401826092643262e-05, + "loss": 0.3138, + "step": 22528 + }, + { + "epoch": 0.4506, + "grad_norm": 5.2472243309021, + "learning_rate": 1.340051307007933e-05, + "loss": 0.1262, + "step": 22530 + }, + { + "epoch": 0.45064, + "grad_norm": 0.12080294638872147, + "learning_rate": 1.3399199981220648e-05, + "loss": 0.47, + "step": 22532 + }, + { + "epoch": 0.45068, + "grad_norm": 1.8742226362228394, + "learning_rate": 1.3397886826092808e-05, + "loss": 0.0457, + "step": 22534 + }, + { + "epoch": 0.45072, + "grad_norm": 1.8921071290969849, + "learning_rate": 1.3396573604721416e-05, + "loss": 0.1836, + "step": 22536 + }, + { + "epoch": 0.45076, + "grad_norm": 7.303708076477051, + "learning_rate": 1.3395260317132076e-05, + "loss": 0.2242, + "step": 22538 + }, + { + "epoch": 0.4508, + "grad_norm": 1.9891000986099243, + "learning_rate": 1.3393946963350381e-05, + "loss": 0.0429, + "step": 22540 + }, + { + "epoch": 0.45084, + "grad_norm": 4.609589576721191, + "learning_rate": 1.3392633543401949e-05, + "loss": 0.143, + "step": 22542 + }, + { + "epoch": 0.45088, + "grad_norm": 6.519754886627197, + "learning_rate": 1.3391320057312378e-05, + "loss": 0.2209, + "step": 22544 + }, + { + "epoch": 0.45092, + "grad_norm": 0.1707395762205124, + "learning_rate": 1.3390006505107277e-05, + "loss": 0.02, + "step": 22546 + }, + { + "epoch": 0.45096, + "grad_norm": 1.5573171377182007, + "learning_rate": 1.3388692886812251e-05, + "loss": 0.0381, + "step": 22548 + }, + { + "epoch": 0.451, + "grad_norm": 0.24640503525733948, + "learning_rate": 1.3387379202452917e-05, + "loss": 0.0115, + "step": 22550 + }, + { + "epoch": 0.45104, + "grad_norm": 4.0118727684021, + "learning_rate": 1.3386065452054877e-05, + "loss": 0.095, + "step": 22552 + }, + { + "epoch": 0.45108, + "grad_norm": 1.6638370752334595, + "learning_rate": 1.3384751635643751e-05, + "loss": 0.0674, + "step": 22554 + }, + { + "epoch": 0.45112, + "grad_norm": 0.7110278606414795, + "learning_rate": 1.3383437753245147e-05, + "loss": 0.0554, + "step": 22556 + }, + { + "epoch": 0.45116, + "grad_norm": 0.19641894102096558, + "learning_rate": 1.3382123804884684e-05, + "loss": 0.0279, + "step": 22558 + }, + { + "epoch": 0.4512, + "grad_norm": 0.33556127548217773, + "learning_rate": 1.3380809790587975e-05, + "loss": 0.0096, + "step": 22560 + }, + { + "epoch": 0.45124, + "grad_norm": 5.194820880889893, + "learning_rate": 1.3379495710380638e-05, + "loss": 0.1776, + "step": 22562 + }, + { + "epoch": 0.45128, + "grad_norm": 1.0399174690246582, + "learning_rate": 1.3378181564288292e-05, + "loss": 0.0758, + "step": 22564 + }, + { + "epoch": 0.45132, + "grad_norm": 1.925079107284546, + "learning_rate": 1.337686735233656e-05, + "loss": 0.0534, + "step": 22566 + }, + { + "epoch": 0.45136, + "grad_norm": 6.099506378173828, + "learning_rate": 1.337555307455106e-05, + "loss": 0.2282, + "step": 22568 + }, + { + "epoch": 0.4514, + "grad_norm": 2.071758985519409, + "learning_rate": 1.3374238730957414e-05, + "loss": 0.1339, + "step": 22570 + }, + { + "epoch": 0.45144, + "grad_norm": 0.8707184791564941, + "learning_rate": 1.3372924321581247e-05, + "loss": 0.2208, + "step": 22572 + }, + { + "epoch": 0.45148, + "grad_norm": 1.0557018518447876, + "learning_rate": 1.3371609846448182e-05, + "loss": 0.0269, + "step": 22574 + }, + { + "epoch": 0.45152, + "grad_norm": 8.624567031860352, + "learning_rate": 1.3370295305583847e-05, + "loss": 0.5693, + "step": 22576 + }, + { + "epoch": 0.45156, + "grad_norm": 0.43817681074142456, + "learning_rate": 1.3368980699013874e-05, + "loss": 0.0177, + "step": 22578 + }, + { + "epoch": 0.4516, + "grad_norm": 2.897361993789673, + "learning_rate": 1.3367666026763884e-05, + "loss": 0.272, + "step": 22580 + }, + { + "epoch": 0.45164, + "grad_norm": 1.448103427886963, + "learning_rate": 1.3366351288859511e-05, + "loss": 0.0364, + "step": 22582 + }, + { + "epoch": 0.45168, + "grad_norm": 6.473248481750488, + "learning_rate": 1.336503648532639e-05, + "loss": 0.3158, + "step": 22584 + }, + { + "epoch": 0.45172, + "grad_norm": 1.5573062896728516, + "learning_rate": 1.3363721616190148e-05, + "loss": 0.0466, + "step": 22586 + }, + { + "epoch": 0.45176, + "grad_norm": 1.7023497819900513, + "learning_rate": 1.3362406681476423e-05, + "loss": 0.0504, + "step": 22588 + }, + { + "epoch": 0.4518, + "grad_norm": 1.9570659399032593, + "learning_rate": 1.3361091681210846e-05, + "loss": 0.0504, + "step": 22590 + }, + { + "epoch": 0.45184, + "grad_norm": 1.2977790832519531, + "learning_rate": 1.3359776615419055e-05, + "loss": 0.0507, + "step": 22592 + }, + { + "epoch": 0.45188, + "grad_norm": 1.2055326700210571, + "learning_rate": 1.3358461484126695e-05, + "loss": 0.0528, + "step": 22594 + }, + { + "epoch": 0.45192, + "grad_norm": 1.2086156606674194, + "learning_rate": 1.3357146287359392e-05, + "loss": 0.0457, + "step": 22596 + }, + { + "epoch": 0.45196, + "grad_norm": 6.727536201477051, + "learning_rate": 1.3355831025142797e-05, + "loss": 0.3628, + "step": 22598 + }, + { + "epoch": 0.452, + "grad_norm": 0.7935797572135925, + "learning_rate": 1.3354515697502552e-05, + "loss": 0.0318, + "step": 22600 + }, + { + "epoch": 0.45204, + "grad_norm": 0.48964452743530273, + "learning_rate": 1.3353200304464293e-05, + "loss": 0.0405, + "step": 22602 + }, + { + "epoch": 0.45208, + "grad_norm": 1.3407950401306152, + "learning_rate": 1.3351884846053668e-05, + "loss": 0.1325, + "step": 22604 + }, + { + "epoch": 0.45212, + "grad_norm": 7.557590007781982, + "learning_rate": 1.3350569322296323e-05, + "loss": 0.7531, + "step": 22606 + }, + { + "epoch": 0.45216, + "grad_norm": 3.1752941608428955, + "learning_rate": 1.3349253733217902e-05, + "loss": 0.1363, + "step": 22608 + }, + { + "epoch": 0.4522, + "grad_norm": 0.2801438868045807, + "learning_rate": 1.3347938078844058e-05, + "loss": 0.0115, + "step": 22610 + }, + { + "epoch": 0.45224, + "grad_norm": 0.1916799694299698, + "learning_rate": 1.3346622359200436e-05, + "loss": 0.0674, + "step": 22612 + }, + { + "epoch": 0.45228, + "grad_norm": 1.0710551738739014, + "learning_rate": 1.334530657431269e-05, + "loss": 0.0401, + "step": 22614 + }, + { + "epoch": 0.45232, + "grad_norm": 0.053716789931058884, + "learning_rate": 1.3343990724206465e-05, + "loss": 0.0254, + "step": 22616 + }, + { + "epoch": 0.45236, + "grad_norm": 5.725371837615967, + "learning_rate": 1.3342674808907426e-05, + "loss": 0.2167, + "step": 22618 + }, + { + "epoch": 0.4524, + "grad_norm": 6.994811534881592, + "learning_rate": 1.3341358828441217e-05, + "loss": 0.4168, + "step": 22620 + }, + { + "epoch": 0.45244, + "grad_norm": 0.14819569885730743, + "learning_rate": 1.3340042782833497e-05, + "loss": 0.0067, + "step": 22622 + }, + { + "epoch": 0.45248, + "grad_norm": 2.0543951988220215, + "learning_rate": 1.3338726672109923e-05, + "loss": 0.0807, + "step": 22624 + }, + { + "epoch": 0.45252, + "grad_norm": 1.613221287727356, + "learning_rate": 1.3337410496296158e-05, + "loss": 0.0754, + "step": 22626 + }, + { + "epoch": 0.45256, + "grad_norm": 0.3494924306869507, + "learning_rate": 1.3336094255417853e-05, + "loss": 0.0097, + "step": 22628 + }, + { + "epoch": 0.4526, + "grad_norm": 5.085944175720215, + "learning_rate": 1.3334777949500673e-05, + "loss": 0.1316, + "step": 22630 + }, + { + "epoch": 0.45264, + "grad_norm": 1.3971079587936401, + "learning_rate": 1.333346157857028e-05, + "loss": 0.0605, + "step": 22632 + }, + { + "epoch": 0.45268, + "grad_norm": 0.34035030007362366, + "learning_rate": 1.333214514265234e-05, + "loss": 0.0278, + "step": 22634 + }, + { + "epoch": 0.45272, + "grad_norm": 4.389297008514404, + "learning_rate": 1.3330828641772511e-05, + "loss": 0.0864, + "step": 22636 + }, + { + "epoch": 0.45276, + "grad_norm": 0.2893334627151489, + "learning_rate": 1.3329512075956466e-05, + "loss": 0.0113, + "step": 22638 + }, + { + "epoch": 0.4528, + "grad_norm": 0.2289775162935257, + "learning_rate": 1.3328195445229869e-05, + "loss": 0.0228, + "step": 22640 + }, + { + "epoch": 0.45284, + "grad_norm": 0.3772680461406708, + "learning_rate": 1.3326878749618388e-05, + "loss": 0.1572, + "step": 22642 + }, + { + "epoch": 0.45288, + "grad_norm": 0.3961712121963501, + "learning_rate": 1.3325561989147691e-05, + "loss": 0.0553, + "step": 22644 + }, + { + "epoch": 0.45292, + "grad_norm": 1.134604811668396, + "learning_rate": 1.3324245163843452e-05, + "loss": 0.0309, + "step": 22646 + }, + { + "epoch": 0.45296, + "grad_norm": 2.9433186054229736, + "learning_rate": 1.3322928273731342e-05, + "loss": 0.0533, + "step": 22648 + }, + { + "epoch": 0.453, + "grad_norm": 0.3215423822402954, + "learning_rate": 1.3321611318837033e-05, + "loss": 0.1121, + "step": 22650 + }, + { + "epoch": 0.45304, + "grad_norm": 0.08179072290658951, + "learning_rate": 1.3320294299186203e-05, + "loss": 0.0066, + "step": 22652 + }, + { + "epoch": 0.45308, + "grad_norm": 0.18208244442939758, + "learning_rate": 1.331897721480453e-05, + "loss": 0.009, + "step": 22654 + }, + { + "epoch": 0.45312, + "grad_norm": 1.3357722759246826, + "learning_rate": 1.3317660065717682e-05, + "loss": 0.0234, + "step": 22656 + }, + { + "epoch": 0.45316, + "grad_norm": 0.3479057848453522, + "learning_rate": 1.3316342851951345e-05, + "loss": 0.0092, + "step": 22658 + }, + { + "epoch": 0.4532, + "grad_norm": 0.21715214848518372, + "learning_rate": 1.3315025573531198e-05, + "loss": 0.0509, + "step": 22660 + }, + { + "epoch": 0.45324, + "grad_norm": 7.726456642150879, + "learning_rate": 1.3313708230482916e-05, + "loss": 0.269, + "step": 22662 + }, + { + "epoch": 0.45328, + "grad_norm": 1.7848354578018188, + "learning_rate": 1.3312390822832188e-05, + "loss": 0.0515, + "step": 22664 + }, + { + "epoch": 0.45332, + "grad_norm": 7.435822010040283, + "learning_rate": 1.3311073350604697e-05, + "loss": 0.6549, + "step": 22666 + }, + { + "epoch": 0.45336, + "grad_norm": 0.07086294889450073, + "learning_rate": 1.3309755813826128e-05, + "loss": 0.0227, + "step": 22668 + }, + { + "epoch": 0.4534, + "grad_norm": 0.1385602504014969, + "learning_rate": 1.3308438212522164e-05, + "loss": 0.04, + "step": 22670 + }, + { + "epoch": 0.45344, + "grad_norm": 7.723602294921875, + "learning_rate": 1.3307120546718493e-05, + "loss": 0.3049, + "step": 22672 + }, + { + "epoch": 0.45348, + "grad_norm": 0.2989422380924225, + "learning_rate": 1.3305802816440805e-05, + "loss": 0.0096, + "step": 22674 + }, + { + "epoch": 0.45352, + "grad_norm": 0.3192601203918457, + "learning_rate": 1.330448502171479e-05, + "loss": 0.2241, + "step": 22676 + }, + { + "epoch": 0.45356, + "grad_norm": 0.39944571256637573, + "learning_rate": 1.3303167162566136e-05, + "loss": 0.0647, + "step": 22678 + }, + { + "epoch": 0.4536, + "grad_norm": 3.618532180786133, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.1793, + "step": 22680 + }, + { + "epoch": 0.45364, + "grad_norm": 0.13563181459903717, + "learning_rate": 1.3300531251103687e-05, + "loss": 0.0303, + "step": 22682 + }, + { + "epoch": 0.45368, + "grad_norm": 0.4700915515422821, + "learning_rate": 1.3299213198841286e-05, + "loss": 0.125, + "step": 22684 + }, + { + "epoch": 0.45372, + "grad_norm": 2.6087915897369385, + "learning_rate": 1.3297895082259022e-05, + "loss": 0.0622, + "step": 22686 + }, + { + "epoch": 0.45376, + "grad_norm": 0.5771138668060303, + "learning_rate": 1.3296576901382596e-05, + "loss": 0.0131, + "step": 22688 + }, + { + "epoch": 0.4538, + "grad_norm": 2.8451788425445557, + "learning_rate": 1.3295258656237703e-05, + "loss": 0.0769, + "step": 22690 + }, + { + "epoch": 0.45384, + "grad_norm": 3.1669135093688965, + "learning_rate": 1.3293940346850052e-05, + "loss": 0.0977, + "step": 22692 + }, + { + "epoch": 0.45388, + "grad_norm": 2.4522321224212646, + "learning_rate": 1.3292621973245335e-05, + "loss": 0.071, + "step": 22694 + }, + { + "epoch": 0.45392, + "grad_norm": 2.067753553390503, + "learning_rate": 1.3291303535449256e-05, + "loss": 0.0587, + "step": 22696 + }, + { + "epoch": 0.45396, + "grad_norm": 0.804047167301178, + "learning_rate": 1.3289985033487521e-05, + "loss": 0.0437, + "step": 22698 + }, + { + "epoch": 0.454, + "grad_norm": 0.21835486590862274, + "learning_rate": 1.3288666467385834e-05, + "loss": 0.01, + "step": 22700 + }, + { + "epoch": 0.45404, + "grad_norm": 0.25091588497161865, + "learning_rate": 1.3287347837169902e-05, + "loss": 0.015, + "step": 22702 + }, + { + "epoch": 0.45408, + "grad_norm": 3.927424192428589, + "learning_rate": 1.328602914286543e-05, + "loss": 0.0811, + "step": 22704 + }, + { + "epoch": 0.45412, + "grad_norm": 1.7222589254379272, + "learning_rate": 1.328471038449813e-05, + "loss": 0.0901, + "step": 22706 + }, + { + "epoch": 0.45416, + "grad_norm": 0.4581039845943451, + "learning_rate": 1.3283391562093711e-05, + "loss": 0.1055, + "step": 22708 + }, + { + "epoch": 0.4542, + "grad_norm": 0.5675275921821594, + "learning_rate": 1.328207267567788e-05, + "loss": 0.0121, + "step": 22710 + }, + { + "epoch": 0.45424, + "grad_norm": 1.1409437656402588, + "learning_rate": 1.3280753725276352e-05, + "loss": 0.0206, + "step": 22712 + }, + { + "epoch": 0.45428, + "grad_norm": 0.38246989250183105, + "learning_rate": 1.327943471091484e-05, + "loss": 0.0187, + "step": 22714 + }, + { + "epoch": 0.45432, + "grad_norm": 0.5675737857818604, + "learning_rate": 1.3278115632619064e-05, + "loss": 0.0419, + "step": 22716 + }, + { + "epoch": 0.45436, + "grad_norm": 2.7451772689819336, + "learning_rate": 1.3276796490414735e-05, + "loss": 0.052, + "step": 22718 + }, + { + "epoch": 0.4544, + "grad_norm": 0.007984990254044533, + "learning_rate": 1.327547728432757e-05, + "loss": 0.0015, + "step": 22720 + }, + { + "epoch": 0.45444, + "grad_norm": 0.598864734172821, + "learning_rate": 1.3274158014383291e-05, + "loss": 0.0331, + "step": 22722 + }, + { + "epoch": 0.45448, + "grad_norm": 4.692266464233398, + "learning_rate": 1.3272838680607616e-05, + "loss": 0.1086, + "step": 22724 + }, + { + "epoch": 0.45452, + "grad_norm": 0.14803127944469452, + "learning_rate": 1.3271519283026264e-05, + "loss": 0.006, + "step": 22726 + }, + { + "epoch": 0.45456, + "grad_norm": 0.17922534048557281, + "learning_rate": 1.327019982166496e-05, + "loss": 0.0046, + "step": 22728 + }, + { + "epoch": 0.4546, + "grad_norm": 2.08524227142334, + "learning_rate": 1.3268880296549424e-05, + "loss": 0.0606, + "step": 22730 + }, + { + "epoch": 0.45464, + "grad_norm": 1.8054505586624146, + "learning_rate": 1.326756070770539e-05, + "loss": 0.4031, + "step": 22732 + }, + { + "epoch": 0.45468, + "grad_norm": 0.38337117433547974, + "learning_rate": 1.3266241055158573e-05, + "loss": 0.0061, + "step": 22734 + }, + { + "epoch": 0.45472, + "grad_norm": 0.7259480953216553, + "learning_rate": 1.3264921338934706e-05, + "loss": 0.0132, + "step": 22736 + }, + { + "epoch": 0.45476, + "grad_norm": 5.104259967803955, + "learning_rate": 1.3263601559059518e-05, + "loss": 0.1996, + "step": 22738 + }, + { + "epoch": 0.4548, + "grad_norm": 0.10957562923431396, + "learning_rate": 1.3262281715558736e-05, + "loss": 0.2214, + "step": 22740 + }, + { + "epoch": 0.45484, + "grad_norm": 0.13272184133529663, + "learning_rate": 1.3260961808458093e-05, + "loss": 0.0103, + "step": 22742 + }, + { + "epoch": 0.45488, + "grad_norm": 0.6905316114425659, + "learning_rate": 1.3259641837783322e-05, + "loss": 0.047, + "step": 22744 + }, + { + "epoch": 0.45492, + "grad_norm": 0.17149414122104645, + "learning_rate": 1.3258321803560152e-05, + "loss": 0.0609, + "step": 22746 + }, + { + "epoch": 0.45496, + "grad_norm": 0.11602599918842316, + "learning_rate": 1.3257001705814323e-05, + "loss": 0.0035, + "step": 22748 + }, + { + "epoch": 0.455, + "grad_norm": 0.9685229659080505, + "learning_rate": 1.3255681544571568e-05, + "loss": 0.0162, + "step": 22750 + }, + { + "epoch": 0.45504, + "grad_norm": 0.10660627484321594, + "learning_rate": 1.3254361319857625e-05, + "loss": 0.0026, + "step": 22752 + }, + { + "epoch": 0.45508, + "grad_norm": 0.6363770365715027, + "learning_rate": 1.3253041031698233e-05, + "loss": 0.3664, + "step": 22754 + }, + { + "epoch": 0.45512, + "grad_norm": 0.033832285553216934, + "learning_rate": 1.3251720680119134e-05, + "loss": 0.0025, + "step": 22756 + }, + { + "epoch": 0.45516, + "grad_norm": 0.12583456933498383, + "learning_rate": 1.3250400265146064e-05, + "loss": 0.1292, + "step": 22758 + }, + { + "epoch": 0.4552, + "grad_norm": 0.060983527451753616, + "learning_rate": 1.3249079786804765e-05, + "loss": 0.0247, + "step": 22760 + }, + { + "epoch": 0.45524, + "grad_norm": 6.765350341796875, + "learning_rate": 1.3247759245120986e-05, + "loss": 0.3257, + "step": 22762 + }, + { + "epoch": 0.45528, + "grad_norm": 0.778579831123352, + "learning_rate": 1.3246438640120467e-05, + "loss": 0.0155, + "step": 22764 + }, + { + "epoch": 0.45532, + "grad_norm": 0.09979141503572464, + "learning_rate": 1.3245117971828955e-05, + "loss": 0.1518, + "step": 22766 + }, + { + "epoch": 0.45536, + "grad_norm": 8.671530723571777, + "learning_rate": 1.32437972402722e-05, + "loss": 0.1567, + "step": 22768 + }, + { + "epoch": 0.4554, + "grad_norm": 5.265048980712891, + "learning_rate": 1.3242476445475945e-05, + "loss": 0.1165, + "step": 22770 + }, + { + "epoch": 0.45544, + "grad_norm": 0.8577294945716858, + "learning_rate": 1.3241155587465943e-05, + "loss": 0.0834, + "step": 22772 + }, + { + "epoch": 0.45548, + "grad_norm": 3.7234413623809814, + "learning_rate": 1.3239834666267945e-05, + "loss": 0.0949, + "step": 22774 + }, + { + "epoch": 0.45552, + "grad_norm": 3.527254104614258, + "learning_rate": 1.3238513681907704e-05, + "loss": 0.068, + "step": 22776 + }, + { + "epoch": 0.45556, + "grad_norm": 4.95729398727417, + "learning_rate": 1.323719263441097e-05, + "loss": 0.2022, + "step": 22778 + }, + { + "epoch": 0.4556, + "grad_norm": 1.0419620275497437, + "learning_rate": 1.3235871523803496e-05, + "loss": 0.0253, + "step": 22780 + }, + { + "epoch": 0.45564, + "grad_norm": 0.23202700912952423, + "learning_rate": 1.3234550350111043e-05, + "loss": 0.0571, + "step": 22782 + }, + { + "epoch": 0.45568, + "grad_norm": 0.2940210998058319, + "learning_rate": 1.3233229113359368e-05, + "loss": 0.0332, + "step": 22784 + }, + { + "epoch": 0.45572, + "grad_norm": 0.9276112914085388, + "learning_rate": 1.3231907813574223e-05, + "loss": 0.0172, + "step": 22786 + }, + { + "epoch": 0.45576, + "grad_norm": 0.011755533516407013, + "learning_rate": 1.3230586450781372e-05, + "loss": 0.0474, + "step": 22788 + }, + { + "epoch": 0.4558, + "grad_norm": 0.23806554079055786, + "learning_rate": 1.3229265025006577e-05, + "loss": 0.0208, + "step": 22790 + }, + { + "epoch": 0.45584, + "grad_norm": 1.2886513471603394, + "learning_rate": 1.3227943536275599e-05, + "loss": 0.0428, + "step": 22792 + }, + { + "epoch": 0.45588, + "grad_norm": 0.14162510633468628, + "learning_rate": 1.3226621984614194e-05, + "loss": 0.0611, + "step": 22794 + }, + { + "epoch": 0.45592, + "grad_norm": 4.072484970092773, + "learning_rate": 1.3225300370048136e-05, + "loss": 0.5649, + "step": 22796 + }, + { + "epoch": 0.45596, + "grad_norm": 0.03377193212509155, + "learning_rate": 1.3223978692603188e-05, + "loss": 0.0018, + "step": 22798 + }, + { + "epoch": 0.456, + "grad_norm": 0.29331669211387634, + "learning_rate": 1.3222656952305113e-05, + "loss": 0.0066, + "step": 22800 + }, + { + "epoch": 0.45604, + "grad_norm": 1.406451940536499, + "learning_rate": 1.3221335149179682e-05, + "loss": 0.0284, + "step": 22802 + }, + { + "epoch": 0.45608, + "grad_norm": 0.07747183740139008, + "learning_rate": 1.3220013283252664e-05, + "loss": 0.0391, + "step": 22804 + }, + { + "epoch": 0.45612, + "grad_norm": 0.2449193000793457, + "learning_rate": 1.3218691354549831e-05, + "loss": 0.0062, + "step": 22806 + }, + { + "epoch": 0.45616, + "grad_norm": 0.1049896776676178, + "learning_rate": 1.321736936309695e-05, + "loss": 0.0083, + "step": 22808 + }, + { + "epoch": 0.4562, + "grad_norm": 0.299650102853775, + "learning_rate": 1.32160473089198e-05, + "loss": 0.0075, + "step": 22810 + }, + { + "epoch": 0.45624, + "grad_norm": 0.37499842047691345, + "learning_rate": 1.3214725192044149e-05, + "loss": 0.1036, + "step": 22812 + }, + { + "epoch": 0.45628, + "grad_norm": 3.150299549102783, + "learning_rate": 1.3213403012495778e-05, + "loss": 0.0732, + "step": 22814 + }, + { + "epoch": 0.45632, + "grad_norm": 0.9382277727127075, + "learning_rate": 1.321208077030046e-05, + "loss": 0.0366, + "step": 22816 + }, + { + "epoch": 0.45636, + "grad_norm": 1.7500793933868408, + "learning_rate": 1.3210758465483972e-05, + "loss": 0.0463, + "step": 22818 + }, + { + "epoch": 0.4564, + "grad_norm": 0.5860172510147095, + "learning_rate": 1.3209436098072095e-05, + "loss": 0.0143, + "step": 22820 + }, + { + "epoch": 0.45644, + "grad_norm": 0.27389851212501526, + "learning_rate": 1.3208113668090612e-05, + "loss": 0.0096, + "step": 22822 + }, + { + "epoch": 0.45648, + "grad_norm": 0.37061458826065063, + "learning_rate": 1.3206791175565299e-05, + "loss": 0.0359, + "step": 22824 + }, + { + "epoch": 0.45652, + "grad_norm": 5.85990047454834, + "learning_rate": 1.3205468620521941e-05, + "loss": 0.1757, + "step": 22826 + }, + { + "epoch": 0.45656, + "grad_norm": 0.5246058702468872, + "learning_rate": 1.3204146002986322e-05, + "loss": 0.034, + "step": 22828 + }, + { + "epoch": 0.4566, + "grad_norm": 0.12209940701723099, + "learning_rate": 1.3202823322984228e-05, + "loss": 0.002, + "step": 22830 + }, + { + "epoch": 0.45664, + "grad_norm": 0.2716951072216034, + "learning_rate": 1.3201500580541444e-05, + "loss": 0.0041, + "step": 22832 + }, + { + "epoch": 0.45668, + "grad_norm": 0.07735347002744675, + "learning_rate": 1.3200177775683759e-05, + "loss": 0.0314, + "step": 22834 + }, + { + "epoch": 0.45672, + "grad_norm": 0.1749098151922226, + "learning_rate": 1.3198854908436958e-05, + "loss": 0.0096, + "step": 22836 + }, + { + "epoch": 0.45676, + "grad_norm": 0.019628409296274185, + "learning_rate": 1.3197531978826837e-05, + "loss": 0.0108, + "step": 22838 + }, + { + "epoch": 0.4568, + "grad_norm": 0.889190137386322, + "learning_rate": 1.319620898687918e-05, + "loss": 0.116, + "step": 22840 + }, + { + "epoch": 0.45684, + "grad_norm": 1.6888517141342163, + "learning_rate": 1.3194885932619786e-05, + "loss": 0.0603, + "step": 22842 + }, + { + "epoch": 0.45688, + "grad_norm": 0.019964104518294334, + "learning_rate": 1.3193562816074445e-05, + "loss": 0.0216, + "step": 22844 + }, + { + "epoch": 0.45692, + "grad_norm": 0.39921414852142334, + "learning_rate": 1.3192239637268955e-05, + "loss": 0.0101, + "step": 22846 + }, + { + "epoch": 0.45696, + "grad_norm": 0.007172862999141216, + "learning_rate": 1.3190916396229107e-05, + "loss": 0.0008, + "step": 22848 + }, + { + "epoch": 0.457, + "grad_norm": 0.45555007457733154, + "learning_rate": 1.3189593092980701e-05, + "loss": 0.0111, + "step": 22850 + }, + { + "epoch": 0.45704, + "grad_norm": 0.016645358875393867, + "learning_rate": 1.3188269727549537e-05, + "loss": 0.0153, + "step": 22852 + }, + { + "epoch": 0.45708, + "grad_norm": 0.058810506016016006, + "learning_rate": 1.3186946299961414e-05, + "loss": 0.0536, + "step": 22854 + }, + { + "epoch": 0.45712, + "grad_norm": 0.16988052427768707, + "learning_rate": 1.3185622810242129e-05, + "loss": 0.004, + "step": 22856 + }, + { + "epoch": 0.45716, + "grad_norm": 0.25887688994407654, + "learning_rate": 1.3184299258417488e-05, + "loss": 0.0038, + "step": 22858 + }, + { + "epoch": 0.4572, + "grad_norm": 0.04046948626637459, + "learning_rate": 1.3182975644513296e-05, + "loss": 0.0478, + "step": 22860 + }, + { + "epoch": 0.45724, + "grad_norm": 1.1642305850982666, + "learning_rate": 1.3181651968555354e-05, + "loss": 0.0209, + "step": 22862 + }, + { + "epoch": 0.45728, + "grad_norm": 0.07696034014225006, + "learning_rate": 1.3180328230569468e-05, + "loss": 0.003, + "step": 22864 + }, + { + "epoch": 0.45732, + "grad_norm": 0.052664320915937424, + "learning_rate": 1.3179004430581446e-05, + "loss": 0.0066, + "step": 22866 + }, + { + "epoch": 0.45736, + "grad_norm": 1.7295891046524048, + "learning_rate": 1.3177680568617096e-05, + "loss": 0.0218, + "step": 22868 + }, + { + "epoch": 0.4574, + "grad_norm": 0.1670520305633545, + "learning_rate": 1.3176356644702225e-05, + "loss": 0.0042, + "step": 22870 + }, + { + "epoch": 0.45744, + "grad_norm": 0.0028416512068361044, + "learning_rate": 1.3175032658862653e-05, + "loss": 0.0014, + "step": 22872 + }, + { + "epoch": 0.45748, + "grad_norm": 0.6106926202774048, + "learning_rate": 1.3173708611124179e-05, + "loss": 0.108, + "step": 22874 + }, + { + "epoch": 0.45752, + "grad_norm": 5.243770122528076, + "learning_rate": 1.3172384501512623e-05, + "loss": 0.0956, + "step": 22876 + }, + { + "epoch": 0.45756, + "grad_norm": 2.0241918563842773, + "learning_rate": 1.3171060330053798e-05, + "loss": 0.0485, + "step": 22878 + }, + { + "epoch": 0.4576, + "grad_norm": 0.04347739741206169, + "learning_rate": 1.316973609677352e-05, + "loss": 0.0196, + "step": 22880 + }, + { + "epoch": 0.45764, + "grad_norm": 0.1211620643734932, + "learning_rate": 1.3168411801697602e-05, + "loss": 0.0022, + "step": 22882 + }, + { + "epoch": 0.45768, + "grad_norm": 0.5613121390342712, + "learning_rate": 1.3167087444851867e-05, + "loss": 0.045, + "step": 22884 + }, + { + "epoch": 0.45772, + "grad_norm": 2.126831531524658, + "learning_rate": 1.316576302626213e-05, + "loss": 0.0247, + "step": 22886 + }, + { + "epoch": 0.45776, + "grad_norm": 5.7207536697387695, + "learning_rate": 1.3164438545954217e-05, + "loss": 0.1159, + "step": 22888 + }, + { + "epoch": 0.4578, + "grad_norm": 0.23298974335193634, + "learning_rate": 1.316311400395394e-05, + "loss": 0.005, + "step": 22890 + }, + { + "epoch": 0.45784, + "grad_norm": 0.4906231164932251, + "learning_rate": 1.316178940028713e-05, + "loss": 0.0059, + "step": 22892 + }, + { + "epoch": 0.45788, + "grad_norm": 0.029720092192292213, + "learning_rate": 1.316046473497961e-05, + "loss": 0.0009, + "step": 22894 + }, + { + "epoch": 0.45792, + "grad_norm": 8.320427894592285, + "learning_rate": 1.3159140008057203e-05, + "loss": 0.1385, + "step": 22896 + }, + { + "epoch": 0.45796, + "grad_norm": 0.15769709646701813, + "learning_rate": 1.3157815219545731e-05, + "loss": 0.0025, + "step": 22898 + }, + { + "epoch": 0.458, + "grad_norm": 0.010138146579265594, + "learning_rate": 1.3156490369471026e-05, + "loss": 0.0002, + "step": 22900 + }, + { + "epoch": 0.45804, + "grad_norm": 0.27897292375564575, + "learning_rate": 1.3155165457858916e-05, + "loss": 0.0035, + "step": 22902 + }, + { + "epoch": 0.45808, + "grad_norm": 1.6096746921539307, + "learning_rate": 1.3153840484735235e-05, + "loss": 0.0202, + "step": 22904 + }, + { + "epoch": 0.45812, + "grad_norm": 12.349888801574707, + "learning_rate": 1.3152515450125806e-05, + "loss": 0.3388, + "step": 22906 + }, + { + "epoch": 0.45816, + "grad_norm": 0.019182393327355385, + "learning_rate": 1.3151190354056467e-05, + "loss": 0.0121, + "step": 22908 + }, + { + "epoch": 0.4582, + "grad_norm": 0.2604212760925293, + "learning_rate": 1.3149865196553049e-05, + "loss": 0.0063, + "step": 22910 + }, + { + "epoch": 0.45824, + "grad_norm": 1.8420590162277222, + "learning_rate": 1.3148539977641389e-05, + "loss": 0.0111, + "step": 22912 + }, + { + "epoch": 0.45828, + "grad_norm": 1.3529107570648193, + "learning_rate": 1.3147214697347321e-05, + "loss": 0.0275, + "step": 22914 + }, + { + "epoch": 0.45832, + "grad_norm": 0.1565800905227661, + "learning_rate": 1.3145889355696679e-05, + "loss": 0.1075, + "step": 22916 + }, + { + "epoch": 0.45836, + "grad_norm": 0.05333136022090912, + "learning_rate": 1.3144563952715305e-05, + "loss": 0.0013, + "step": 22918 + }, + { + "epoch": 0.4584, + "grad_norm": 0.3101954162120819, + "learning_rate": 1.3143238488429042e-05, + "loss": 0.0056, + "step": 22920 + }, + { + "epoch": 0.45844, + "grad_norm": 8.913131713867188, + "learning_rate": 1.3141912962863723e-05, + "loss": 0.2714, + "step": 22922 + }, + { + "epoch": 0.45848, + "grad_norm": 0.012534413486719131, + "learning_rate": 1.3140587376045193e-05, + "loss": 0.0011, + "step": 22924 + }, + { + "epoch": 0.45852, + "grad_norm": 0.7254354953765869, + "learning_rate": 1.3139261727999295e-05, + "loss": 0.125, + "step": 22926 + }, + { + "epoch": 0.45856, + "grad_norm": 0.013286568224430084, + "learning_rate": 1.3137936018751876e-05, + "loss": 0.0058, + "step": 22928 + }, + { + "epoch": 0.4586, + "grad_norm": 17.02678680419922, + "learning_rate": 1.3136610248328779e-05, + "loss": 0.177, + "step": 22930 + }, + { + "epoch": 0.45864, + "grad_norm": 0.19181397557258606, + "learning_rate": 1.3135284416755849e-05, + "loss": 0.1517, + "step": 22932 + }, + { + "epoch": 0.45868, + "grad_norm": 13.339981079101562, + "learning_rate": 1.3133958524058934e-05, + "loss": 0.4565, + "step": 22934 + }, + { + "epoch": 0.45872, + "grad_norm": 6.718850135803223, + "learning_rate": 1.3132632570263886e-05, + "loss": 0.1066, + "step": 22936 + }, + { + "epoch": 0.45876, + "grad_norm": 0.022561179473996162, + "learning_rate": 1.3131306555396553e-05, + "loss": 0.0011, + "step": 22938 + }, + { + "epoch": 0.4588, + "grad_norm": 0.9420211911201477, + "learning_rate": 1.3129980479482783e-05, + "loss": 0.017, + "step": 22940 + }, + { + "epoch": 0.45884, + "grad_norm": 20.565645217895508, + "learning_rate": 1.3128654342548434e-05, + "loss": 0.3678, + "step": 22942 + }, + { + "epoch": 0.45888, + "grad_norm": 0.2854865491390228, + "learning_rate": 1.312732814461936e-05, + "loss": 0.0053, + "step": 22944 + }, + { + "epoch": 0.45892, + "grad_norm": 1.0938246250152588, + "learning_rate": 1.3126001885721413e-05, + "loss": 0.0135, + "step": 22946 + }, + { + "epoch": 0.45896, + "grad_norm": 0.0073933969251811504, + "learning_rate": 1.3124675565880448e-05, + "loss": 0.001, + "step": 22948 + }, + { + "epoch": 0.459, + "grad_norm": 0.023856626823544502, + "learning_rate": 1.3123349185122328e-05, + "loss": 0.0011, + "step": 22950 + }, + { + "epoch": 0.45904, + "grad_norm": 0.004724459256976843, + "learning_rate": 1.3122022743472904e-05, + "loss": 0.0718, + "step": 22952 + }, + { + "epoch": 0.45908, + "grad_norm": 7.067189693450928, + "learning_rate": 1.3120696240958041e-05, + "loss": 0.1503, + "step": 22954 + }, + { + "epoch": 0.45912, + "grad_norm": 0.4250018000602722, + "learning_rate": 1.3119369677603597e-05, + "loss": 0.007, + "step": 22956 + }, + { + "epoch": 0.45916, + "grad_norm": 0.4185434579849243, + "learning_rate": 1.3118043053435434e-05, + "loss": 0.0041, + "step": 22958 + }, + { + "epoch": 0.4592, + "grad_norm": 6.95473051071167, + "learning_rate": 1.3116716368479418e-05, + "loss": 0.2423, + "step": 22960 + }, + { + "epoch": 0.45924, + "grad_norm": 4.6257219314575195, + "learning_rate": 1.3115389622761415e-05, + "loss": 0.1059, + "step": 22962 + }, + { + "epoch": 0.45928, + "grad_norm": 0.6660208106040955, + "learning_rate": 1.3114062816307284e-05, + "loss": 0.0135, + "step": 22964 + }, + { + "epoch": 0.45932, + "grad_norm": 0.008658917620778084, + "learning_rate": 1.3112735949142896e-05, + "loss": 0.0105, + "step": 22966 + }, + { + "epoch": 0.45936, + "grad_norm": 0.5046286582946777, + "learning_rate": 1.3111409021294118e-05, + "loss": 0.0117, + "step": 22968 + }, + { + "epoch": 0.4594, + "grad_norm": 0.0606674961745739, + "learning_rate": 1.311008203278682e-05, + "loss": 0.0033, + "step": 22970 + }, + { + "epoch": 0.45944, + "grad_norm": 2.6149966716766357, + "learning_rate": 1.3108754983646868e-05, + "loss": 0.0489, + "step": 22972 + }, + { + "epoch": 0.45948, + "grad_norm": 0.05696773901581764, + "learning_rate": 1.3107427873900141e-05, + "loss": 0.0222, + "step": 22974 + }, + { + "epoch": 0.45952, + "grad_norm": 1.8103243112564087, + "learning_rate": 1.3106100703572505e-05, + "loss": 0.1756, + "step": 22976 + }, + { + "epoch": 0.45956, + "grad_norm": 0.14144860208034515, + "learning_rate": 1.310477347268984e-05, + "loss": 0.0053, + "step": 22978 + }, + { + "epoch": 0.4596, + "grad_norm": 0.005865506362169981, + "learning_rate": 1.3103446181278015e-05, + "loss": 0.0191, + "step": 22980 + }, + { + "epoch": 0.45964, + "grad_norm": 0.05544571205973625, + "learning_rate": 1.310211882936291e-05, + "loss": 0.0012, + "step": 22982 + }, + { + "epoch": 0.45968, + "grad_norm": 0.04259861633181572, + "learning_rate": 1.31007914169704e-05, + "loss": 0.0055, + "step": 22984 + }, + { + "epoch": 0.45972, + "grad_norm": 0.24133358895778656, + "learning_rate": 1.3099463944126366e-05, + "loss": 0.0393, + "step": 22986 + }, + { + "epoch": 0.45976, + "grad_norm": 0.011466724798083305, + "learning_rate": 1.3098136410856688e-05, + "loss": 0.7726, + "step": 22988 + }, + { + "epoch": 0.4598, + "grad_norm": 0.22719086706638336, + "learning_rate": 1.3096808817187243e-05, + "loss": 0.0032, + "step": 22990 + }, + { + "epoch": 0.45984, + "grad_norm": 3.967308759689331, + "learning_rate": 1.3095481163143917e-05, + "loss": 0.0524, + "step": 22992 + }, + { + "epoch": 0.45988, + "grad_norm": 0.036984749138355255, + "learning_rate": 1.3094153448752593e-05, + "loss": 0.0023, + "step": 22994 + }, + { + "epoch": 0.45992, + "grad_norm": 4.870099067687988, + "learning_rate": 1.3092825674039154e-05, + "loss": 0.065, + "step": 22996 + }, + { + "epoch": 0.45996, + "grad_norm": 1.343288779258728, + "learning_rate": 1.3091497839029485e-05, + "loss": 0.0322, + "step": 22998 + }, + { + "epoch": 0.46, + "grad_norm": 3.4142444133758545, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.0736, + "step": 23000 + }, + { + "epoch": 0.46004, + "grad_norm": 0.0499454028904438, + "learning_rate": 1.3088841988225011e-05, + "loss": 0.1505, + "step": 23002 + }, + { + "epoch": 0.46008, + "grad_norm": 0.24810077250003815, + "learning_rate": 1.3087513972481985e-05, + "loss": 0.0048, + "step": 23004 + }, + { + "epoch": 0.46012, + "grad_norm": 0.040719449520111084, + "learning_rate": 1.308618589654628e-05, + "loss": 0.0274, + "step": 23006 + }, + { + "epoch": 0.46016, + "grad_norm": 3.9761178493499756, + "learning_rate": 1.3084857760443793e-05, + "loss": 0.401, + "step": 23008 + }, + { + "epoch": 0.4602, + "grad_norm": 0.24434122443199158, + "learning_rate": 1.3083529564200417e-05, + "loss": 0.0195, + "step": 23010 + }, + { + "epoch": 0.46024, + "grad_norm": 2.0178940296173096, + "learning_rate": 1.3082201307842044e-05, + "loss": 0.0402, + "step": 23012 + }, + { + "epoch": 0.46028, + "grad_norm": 0.012104739435017109, + "learning_rate": 1.3080872991394569e-05, + "loss": 0.0006, + "step": 23014 + }, + { + "epoch": 0.46032, + "grad_norm": 0.05365459620952606, + "learning_rate": 1.3079544614883891e-05, + "loss": 0.0137, + "step": 23016 + }, + { + "epoch": 0.46036, + "grad_norm": 0.11487829685211182, + "learning_rate": 1.3078216178335906e-05, + "loss": 0.066, + "step": 23018 + }, + { + "epoch": 0.4604, + "grad_norm": 6.467010021209717, + "learning_rate": 1.3076887681776509e-05, + "loss": 0.118, + "step": 23020 + }, + { + "epoch": 0.46044, + "grad_norm": 0.3801230192184448, + "learning_rate": 1.3075559125231602e-05, + "loss": 0.0115, + "step": 23022 + }, + { + "epoch": 0.46048, + "grad_norm": 0.041423384100198746, + "learning_rate": 1.3074230508727088e-05, + "loss": 0.0138, + "step": 23024 + }, + { + "epoch": 0.46052, + "grad_norm": 0.38852792978286743, + "learning_rate": 1.307290183228887e-05, + "loss": 0.0351, + "step": 23026 + }, + { + "epoch": 0.46056, + "grad_norm": 0.11396226286888123, + "learning_rate": 1.3071573095942842e-05, + "loss": 0.0023, + "step": 23028 + }, + { + "epoch": 0.4606, + "grad_norm": 0.01546976063400507, + "learning_rate": 1.307024429971492e-05, + "loss": 0.0042, + "step": 23030 + }, + { + "epoch": 0.46064, + "grad_norm": 0.3905719220638275, + "learning_rate": 1.3068915443631005e-05, + "loss": 0.0113, + "step": 23032 + }, + { + "epoch": 0.46068, + "grad_norm": 0.40270286798477173, + "learning_rate": 1.3067586527717002e-05, + "loss": 0.0142, + "step": 23034 + }, + { + "epoch": 0.46072, + "grad_norm": 0.560402512550354, + "learning_rate": 1.3066257551998822e-05, + "loss": 0.0083, + "step": 23036 + }, + { + "epoch": 0.46076, + "grad_norm": 0.6545971632003784, + "learning_rate": 1.306492851650237e-05, + "loss": 0.0121, + "step": 23038 + }, + { + "epoch": 0.4608, + "grad_norm": 0.027944326400756836, + "learning_rate": 1.306359942125356e-05, + "loss": 0.0039, + "step": 23040 + }, + { + "epoch": 0.46084, + "grad_norm": 3.0347976684570312, + "learning_rate": 1.3062270266278303e-05, + "loss": 0.0423, + "step": 23042 + }, + { + "epoch": 0.46088, + "grad_norm": 0.6979380249977112, + "learning_rate": 1.3060941051602509e-05, + "loss": 0.0093, + "step": 23044 + }, + { + "epoch": 0.46092, + "grad_norm": 0.05571307614445686, + "learning_rate": 1.3059611777252092e-05, + "loss": 0.003, + "step": 23046 + }, + { + "epoch": 0.46096, + "grad_norm": 0.18731382489204407, + "learning_rate": 1.305828244325297e-05, + "loss": 0.7133, + "step": 23048 + }, + { + "epoch": 0.461, + "grad_norm": 0.13998675346374512, + "learning_rate": 1.3056953049631059e-05, + "loss": 0.0182, + "step": 23050 + }, + { + "epoch": 0.46104, + "grad_norm": 9.686752319335938, + "learning_rate": 1.3055623596412272e-05, + "loss": 0.1417, + "step": 23052 + }, + { + "epoch": 0.46108, + "grad_norm": 0.15204210579395294, + "learning_rate": 1.3054294083622531e-05, + "loss": 0.0071, + "step": 23054 + }, + { + "epoch": 0.46112, + "grad_norm": 6.098876476287842, + "learning_rate": 1.3052964511287754e-05, + "loss": 0.1718, + "step": 23056 + }, + { + "epoch": 0.46116, + "grad_norm": 2.9855117797851562, + "learning_rate": 1.3051634879433864e-05, + "loss": 0.0457, + "step": 23058 + }, + { + "epoch": 0.4612, + "grad_norm": 0.13409411907196045, + "learning_rate": 1.3050305188086778e-05, + "loss": 0.0038, + "step": 23060 + }, + { + "epoch": 0.46124, + "grad_norm": 1.1712781190872192, + "learning_rate": 1.3048975437272419e-05, + "loss": 0.0148, + "step": 23062 + }, + { + "epoch": 0.46128, + "grad_norm": 2.551997661590576, + "learning_rate": 1.3047645627016719e-05, + "loss": 0.0501, + "step": 23064 + }, + { + "epoch": 0.46132, + "grad_norm": 0.5481998324394226, + "learning_rate": 1.3046315757345597e-05, + "loss": 0.0176, + "step": 23066 + }, + { + "epoch": 0.46136, + "grad_norm": 0.14785589277744293, + "learning_rate": 1.3044985828284981e-05, + "loss": 0.0024, + "step": 23068 + }, + { + "epoch": 0.4614, + "grad_norm": 0.08972799777984619, + "learning_rate": 1.3043655839860803e-05, + "loss": 0.0307, + "step": 23070 + }, + { + "epoch": 0.46144, + "grad_norm": 2.9687156677246094, + "learning_rate": 1.3042325792098982e-05, + "loss": 0.0446, + "step": 23072 + }, + { + "epoch": 0.46148, + "grad_norm": 0.6325868368148804, + "learning_rate": 1.3040995685025456e-05, + "loss": 0.025, + "step": 23074 + }, + { + "epoch": 0.46152, + "grad_norm": 7.520946502685547, + "learning_rate": 1.3039665518666152e-05, + "loss": 0.2253, + "step": 23076 + }, + { + "epoch": 0.46156, + "grad_norm": 3.7466237545013428, + "learning_rate": 1.3038335293047002e-05, + "loss": 0.3087, + "step": 23078 + }, + { + "epoch": 0.4616, + "grad_norm": 0.23217208683490753, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.0219, + "step": 23080 + }, + { + "epoch": 0.46164, + "grad_norm": 0.0699751079082489, + "learning_rate": 1.303567466413291e-05, + "loss": 0.1908, + "step": 23082 + }, + { + "epoch": 0.46168, + "grad_norm": 0.2623482346534729, + "learning_rate": 1.3034344260889836e-05, + "loss": 0.0218, + "step": 23084 + }, + { + "epoch": 0.46172, + "grad_norm": 0.13875682651996613, + "learning_rate": 1.3033013798490657e-05, + "loss": 0.0058, + "step": 23086 + }, + { + "epoch": 0.46176, + "grad_norm": 3.1339683532714844, + "learning_rate": 1.3031683276961311e-05, + "loss": 0.0512, + "step": 23088 + }, + { + "epoch": 0.4618, + "grad_norm": 0.058798108249902725, + "learning_rate": 1.3030352696327741e-05, + "loss": 0.0176, + "step": 23090 + }, + { + "epoch": 0.46184, + "grad_norm": 0.11755158007144928, + "learning_rate": 1.3029022056615886e-05, + "loss": 0.002, + "step": 23092 + }, + { + "epoch": 0.46188, + "grad_norm": 0.003513603936880827, + "learning_rate": 1.3027691357851684e-05, + "loss": 0.1379, + "step": 23094 + }, + { + "epoch": 0.46192, + "grad_norm": 0.15285128355026245, + "learning_rate": 1.3026360600061081e-05, + "loss": 0.0083, + "step": 23096 + }, + { + "epoch": 0.46196, + "grad_norm": 2.287078619003296, + "learning_rate": 1.3025029783270017e-05, + "loss": 0.0367, + "step": 23098 + }, + { + "epoch": 0.462, + "grad_norm": 0.04101434350013733, + "learning_rate": 1.3023698907504447e-05, + "loss": 0.0028, + "step": 23100 + }, + { + "epoch": 0.46204, + "grad_norm": 0.9978191256523132, + "learning_rate": 1.3022367972790307e-05, + "loss": 0.099, + "step": 23102 + }, + { + "epoch": 0.46208, + "grad_norm": 0.8069206476211548, + "learning_rate": 1.3021036979153548e-05, + "loss": 0.0109, + "step": 23104 + }, + { + "epoch": 0.46212, + "grad_norm": 2.5835628509521484, + "learning_rate": 1.3019705926620114e-05, + "loss": 0.077, + "step": 23106 + }, + { + "epoch": 0.46216, + "grad_norm": 1.5700677633285522, + "learning_rate": 1.3018374815215962e-05, + "loss": 0.0221, + "step": 23108 + }, + { + "epoch": 0.4622, + "grad_norm": 2.8525218963623047, + "learning_rate": 1.3017043644967036e-05, + "loss": 0.0488, + "step": 23110 + }, + { + "epoch": 0.46224, + "grad_norm": 0.03627539798617363, + "learning_rate": 1.3015712415899294e-05, + "loss": 0.0013, + "step": 23112 + }, + { + "epoch": 0.46228, + "grad_norm": 0.5256879925727844, + "learning_rate": 1.3014381128038683e-05, + "loss": 0.0077, + "step": 23114 + }, + { + "epoch": 0.46232, + "grad_norm": 0.14086785912513733, + "learning_rate": 1.3013049781411161e-05, + "loss": 0.2863, + "step": 23116 + }, + { + "epoch": 0.46236, + "grad_norm": 2.5363495349884033, + "learning_rate": 1.3011718376042683e-05, + "loss": 0.0459, + "step": 23118 + }, + { + "epoch": 0.4624, + "grad_norm": 1.8759194612503052, + "learning_rate": 1.3010386911959207e-05, + "loss": 0.0285, + "step": 23120 + }, + { + "epoch": 0.46244, + "grad_norm": 0.9415001273155212, + "learning_rate": 1.3009055389186685e-05, + "loss": 1.0344, + "step": 23122 + }, + { + "epoch": 0.46248, + "grad_norm": 12.423998832702637, + "learning_rate": 1.300772380775108e-05, + "loss": 0.355, + "step": 23124 + }, + { + "epoch": 0.46252, + "grad_norm": 0.03549017384648323, + "learning_rate": 1.300639216767835e-05, + "loss": 0.0017, + "step": 23126 + }, + { + "epoch": 0.46256, + "grad_norm": 0.6928879618644714, + "learning_rate": 1.3005060468994458e-05, + "loss": 0.0101, + "step": 23128 + }, + { + "epoch": 0.4626, + "grad_norm": 1.0771560668945312, + "learning_rate": 1.3003728711725364e-05, + "loss": 0.0192, + "step": 23130 + }, + { + "epoch": 0.46264, + "grad_norm": 1.3362617492675781, + "learning_rate": 1.3002396895897033e-05, + "loss": 0.0619, + "step": 23132 + }, + { + "epoch": 0.46268, + "grad_norm": 0.01528435479849577, + "learning_rate": 1.3001065021535429e-05, + "loss": 0.0023, + "step": 23134 + }, + { + "epoch": 0.46272, + "grad_norm": 6.99247932434082, + "learning_rate": 1.2999733088666515e-05, + "loss": 0.2348, + "step": 23136 + }, + { + "epoch": 0.46276, + "grad_norm": 0.050660233944654465, + "learning_rate": 1.2998401097316264e-05, + "loss": 0.1273, + "step": 23138 + }, + { + "epoch": 0.4628, + "grad_norm": 0.11047706007957458, + "learning_rate": 1.299706904751064e-05, + "loss": 0.0539, + "step": 23140 + }, + { + "epoch": 0.46284, + "grad_norm": 0.10410135239362717, + "learning_rate": 1.299573693927561e-05, + "loss": 0.0317, + "step": 23142 + }, + { + "epoch": 0.46288, + "grad_norm": 0.04335760697722435, + "learning_rate": 1.2994404772637145e-05, + "loss": 0.2352, + "step": 23144 + }, + { + "epoch": 0.46292, + "grad_norm": 0.6697125434875488, + "learning_rate": 1.2993072547621218e-05, + "loss": 0.0887, + "step": 23146 + }, + { + "epoch": 0.46296, + "grad_norm": 0.033777523785829544, + "learning_rate": 1.2991740264253802e-05, + "loss": 0.0062, + "step": 23148 + }, + { + "epoch": 0.463, + "grad_norm": 5.654421806335449, + "learning_rate": 1.2990407922560869e-05, + "loss": 0.1441, + "step": 23150 + }, + { + "epoch": 0.46304, + "grad_norm": 5.812587261199951, + "learning_rate": 1.2989075522568394e-05, + "loss": 0.2761, + "step": 23152 + }, + { + "epoch": 0.46308, + "grad_norm": 5.309936046600342, + "learning_rate": 1.2987743064302352e-05, + "loss": 0.1947, + "step": 23154 + }, + { + "epoch": 0.46312, + "grad_norm": 0.3308638632297516, + "learning_rate": 1.2986410547788722e-05, + "loss": 0.0074, + "step": 23156 + }, + { + "epoch": 0.46316, + "grad_norm": 5.839651584625244, + "learning_rate": 1.298507797305348e-05, + "loss": 0.1664, + "step": 23158 + }, + { + "epoch": 0.4632, + "grad_norm": 1.52165687084198, + "learning_rate": 1.2983745340122604e-05, + "loss": 0.0319, + "step": 23160 + }, + { + "epoch": 0.46324, + "grad_norm": 0.20282498002052307, + "learning_rate": 1.2982412649022078e-05, + "loss": 0.0082, + "step": 23162 + }, + { + "epoch": 0.46328, + "grad_norm": 1.6357498168945312, + "learning_rate": 1.2981079899777885e-05, + "loss": 0.0312, + "step": 23164 + }, + { + "epoch": 0.46332, + "grad_norm": 3.608379364013672, + "learning_rate": 1.2979747092416e-05, + "loss": 0.0898, + "step": 23166 + }, + { + "epoch": 0.46336, + "grad_norm": 0.06677204370498657, + "learning_rate": 1.2978414226962413e-05, + "loss": 0.039, + "step": 23168 + }, + { + "epoch": 0.4634, + "grad_norm": 0.37117645144462585, + "learning_rate": 1.2977081303443107e-05, + "loss": 0.0269, + "step": 23170 + }, + { + "epoch": 0.46344, + "grad_norm": 0.12302859127521515, + "learning_rate": 1.297574832188407e-05, + "loss": 0.0986, + "step": 23172 + }, + { + "epoch": 0.46348, + "grad_norm": 0.2604866325855255, + "learning_rate": 1.2974415282311287e-05, + "loss": 0.0104, + "step": 23174 + }, + { + "epoch": 0.46352, + "grad_norm": 1.6402561664581299, + "learning_rate": 1.2973082184750747e-05, + "loss": 0.027, + "step": 23176 + }, + { + "epoch": 0.46356, + "grad_norm": 2.093235731124878, + "learning_rate": 1.2971749029228438e-05, + "loss": 0.0527, + "step": 23178 + }, + { + "epoch": 0.4636, + "grad_norm": 2.1155974864959717, + "learning_rate": 1.297041581577035e-05, + "loss": 0.1359, + "step": 23180 + }, + { + "epoch": 0.46364, + "grad_norm": 0.21299943327903748, + "learning_rate": 1.2969082544402478e-05, + "loss": 0.0056, + "step": 23182 + }, + { + "epoch": 0.46368, + "grad_norm": 7.040535926818848, + "learning_rate": 1.2967749215150814e-05, + "loss": 0.1414, + "step": 23184 + }, + { + "epoch": 0.46372, + "grad_norm": 0.28384363651275635, + "learning_rate": 1.296641582804135e-05, + "loss": 0.0134, + "step": 23186 + }, + { + "epoch": 0.46376, + "grad_norm": 0.11902350187301636, + "learning_rate": 1.2965082383100084e-05, + "loss": 0.0032, + "step": 23188 + }, + { + "epoch": 0.4638, + "grad_norm": 0.0035110407043248415, + "learning_rate": 1.2963748880353011e-05, + "loss": 0.0073, + "step": 23190 + }, + { + "epoch": 0.46384, + "grad_norm": 0.3111509382724762, + "learning_rate": 1.2962415319826125e-05, + "loss": 0.0103, + "step": 23192 + }, + { + "epoch": 0.46388, + "grad_norm": 12.255789756774902, + "learning_rate": 1.2961081701545428e-05, + "loss": 0.65, + "step": 23194 + }, + { + "epoch": 0.46392, + "grad_norm": 0.10547096282243729, + "learning_rate": 1.2959748025536919e-05, + "loss": 0.0546, + "step": 23196 + }, + { + "epoch": 0.46396, + "grad_norm": 0.7159829139709473, + "learning_rate": 1.29584142918266e-05, + "loss": 0.0293, + "step": 23198 + }, + { + "epoch": 0.464, + "grad_norm": 0.7210692763328552, + "learning_rate": 1.2957080500440469e-05, + "loss": 0.074, + "step": 23200 + }, + { + "epoch": 0.46404, + "grad_norm": 1.1430697441101074, + "learning_rate": 1.295574665140453e-05, + "loss": 0.0262, + "step": 23202 + }, + { + "epoch": 0.46408, + "grad_norm": 0.3324704170227051, + "learning_rate": 1.2954412744744791e-05, + "loss": 0.0043, + "step": 23204 + }, + { + "epoch": 0.46412, + "grad_norm": 0.08645473420619965, + "learning_rate": 1.2953078780487253e-05, + "loss": 0.032, + "step": 23206 + }, + { + "epoch": 0.46416, + "grad_norm": 1.3793597221374512, + "learning_rate": 1.2951744758657928e-05, + "loss": 0.0223, + "step": 23208 + }, + { + "epoch": 0.4642, + "grad_norm": 0.038133520632982254, + "learning_rate": 1.2950410679282815e-05, + "loss": 0.0307, + "step": 23210 + }, + { + "epoch": 0.46424, + "grad_norm": 0.4635067284107208, + "learning_rate": 1.2949076542387926e-05, + "loss": 0.0088, + "step": 23212 + }, + { + "epoch": 0.46428, + "grad_norm": 1.1223888397216797, + "learning_rate": 1.2947742347999275e-05, + "loss": 0.2339, + "step": 23214 + }, + { + "epoch": 0.46432, + "grad_norm": 0.10840708762407303, + "learning_rate": 1.2946408096142866e-05, + "loss": 0.2695, + "step": 23216 + }, + { + "epoch": 0.46436, + "grad_norm": 13.425800323486328, + "learning_rate": 1.2945073786844718e-05, + "loss": 0.4138, + "step": 23218 + }, + { + "epoch": 0.4644, + "grad_norm": 0.4430961012840271, + "learning_rate": 1.2943739420130837e-05, + "loss": 0.0073, + "step": 23220 + }, + { + "epoch": 0.46444, + "grad_norm": 1.0978928804397583, + "learning_rate": 1.2942404996027243e-05, + "loss": 0.0296, + "step": 23222 + }, + { + "epoch": 0.46448, + "grad_norm": 0.0463353767991066, + "learning_rate": 1.2941070514559947e-05, + "loss": 0.0009, + "step": 23224 + }, + { + "epoch": 0.46452, + "grad_norm": 0.031491659581661224, + "learning_rate": 1.2939735975754969e-05, + "loss": 0.0017, + "step": 23226 + }, + { + "epoch": 0.46456, + "grad_norm": 9.893491744995117, + "learning_rate": 1.2938401379638321e-05, + "loss": 0.3597, + "step": 23228 + }, + { + "epoch": 0.4646, + "grad_norm": 1.777672290802002, + "learning_rate": 1.2937066726236029e-05, + "loss": 0.0507, + "step": 23230 + }, + { + "epoch": 0.46464, + "grad_norm": 1.1400187015533447, + "learning_rate": 1.2935732015574107e-05, + "loss": 0.022, + "step": 23232 + }, + { + "epoch": 0.46468, + "grad_norm": 0.1102994978427887, + "learning_rate": 1.2934397247678576e-05, + "loss": 0.0028, + "step": 23234 + }, + { + "epoch": 0.46472, + "grad_norm": 0.02626592479646206, + "learning_rate": 1.2933062422575465e-05, + "loss": 0.0034, + "step": 23236 + }, + { + "epoch": 0.46476, + "grad_norm": 0.3708360195159912, + "learning_rate": 1.2931727540290788e-05, + "loss": 0.0375, + "step": 23238 + }, + { + "epoch": 0.4648, + "grad_norm": 7.177817344665527, + "learning_rate": 1.2930392600850574e-05, + "loss": 0.2272, + "step": 23240 + }, + { + "epoch": 0.46484, + "grad_norm": 0.8423925638198853, + "learning_rate": 1.292905760428085e-05, + "loss": 0.0177, + "step": 23242 + }, + { + "epoch": 0.46488, + "grad_norm": 0.45136502385139465, + "learning_rate": 1.2927722550607638e-05, + "loss": 0.0122, + "step": 23244 + }, + { + "epoch": 0.46492, + "grad_norm": 0.07407118380069733, + "learning_rate": 1.2926387439856969e-05, + "loss": 0.7322, + "step": 23246 + }, + { + "epoch": 0.46496, + "grad_norm": 8.258886337280273, + "learning_rate": 1.292505227205487e-05, + "loss": 0.2068, + "step": 23248 + }, + { + "epoch": 0.465, + "grad_norm": 3.650693893432617, + "learning_rate": 1.2923717047227368e-05, + "loss": 0.106, + "step": 23250 + }, + { + "epoch": 0.46504, + "grad_norm": 0.20776966214179993, + "learning_rate": 1.2922381765400501e-05, + "loss": 0.0304, + "step": 23252 + }, + { + "epoch": 0.46508, + "grad_norm": 0.03891460597515106, + "learning_rate": 1.2921046426600295e-05, + "loss": 0.1764, + "step": 23254 + }, + { + "epoch": 0.46512, + "grad_norm": 5.910741329193115, + "learning_rate": 1.2919711030852783e-05, + "loss": 0.1079, + "step": 23256 + }, + { + "epoch": 0.46516, + "grad_norm": 3.2490181922912598, + "learning_rate": 1.2918375578184005e-05, + "loss": 0.0596, + "step": 23258 + }, + { + "epoch": 0.4652, + "grad_norm": 0.22738969326019287, + "learning_rate": 1.291704006861999e-05, + "loss": 0.0107, + "step": 23260 + }, + { + "epoch": 0.46524, + "grad_norm": 1.6866412162780762, + "learning_rate": 1.2915704502186781e-05, + "loss": 0.0361, + "step": 23262 + }, + { + "epoch": 0.46528, + "grad_norm": 5.046910285949707, + "learning_rate": 1.2914368878910407e-05, + "loss": 0.2833, + "step": 23264 + }, + { + "epoch": 0.46532, + "grad_norm": 2.8457372188568115, + "learning_rate": 1.2913033198816914e-05, + "loss": 0.0388, + "step": 23266 + }, + { + "epoch": 0.46536, + "grad_norm": 0.17293722927570343, + "learning_rate": 1.2911697461932335e-05, + "loss": 0.2536, + "step": 23268 + }, + { + "epoch": 0.4654, + "grad_norm": 0.7502626180648804, + "learning_rate": 1.2910361668282718e-05, + "loss": 0.033, + "step": 23270 + }, + { + "epoch": 0.46544, + "grad_norm": 3.2902987003326416, + "learning_rate": 1.2909025817894104e-05, + "loss": 0.0492, + "step": 23272 + }, + { + "epoch": 0.46548, + "grad_norm": 2.0009021759033203, + "learning_rate": 1.290768991079253e-05, + "loss": 0.044, + "step": 23274 + }, + { + "epoch": 0.46552, + "grad_norm": 2.840958833694458, + "learning_rate": 1.2906353947004049e-05, + "loss": 0.0428, + "step": 23276 + }, + { + "epoch": 0.46556, + "grad_norm": 5.291861057281494, + "learning_rate": 1.2905017926554697e-05, + "loss": 0.0911, + "step": 23278 + }, + { + "epoch": 0.4656, + "grad_norm": 0.1288905143737793, + "learning_rate": 1.2903681849470528e-05, + "loss": 0.0448, + "step": 23280 + }, + { + "epoch": 0.46564, + "grad_norm": 0.06638153642416, + "learning_rate": 1.2902345715777584e-05, + "loss": 0.2849, + "step": 23282 + }, + { + "epoch": 0.46568, + "grad_norm": 0.31178638339042664, + "learning_rate": 1.2901009525501917e-05, + "loss": 0.1113, + "step": 23284 + }, + { + "epoch": 0.46572, + "grad_norm": 0.44206637144088745, + "learning_rate": 1.2899673278669576e-05, + "loss": 0.0083, + "step": 23286 + }, + { + "epoch": 0.46576, + "grad_norm": 0.010382477194070816, + "learning_rate": 1.289833697530661e-05, + "loss": 0.0015, + "step": 23288 + }, + { + "epoch": 0.4658, + "grad_norm": 0.07661782205104828, + "learning_rate": 1.2897000615439075e-05, + "loss": 0.013, + "step": 23290 + }, + { + "epoch": 0.46584, + "grad_norm": 0.3530619740486145, + "learning_rate": 1.2895664199093023e-05, + "loss": 0.0062, + "step": 23292 + }, + { + "epoch": 0.46588, + "grad_norm": 0.1443473994731903, + "learning_rate": 1.2894327726294503e-05, + "loss": 0.0024, + "step": 23294 + }, + { + "epoch": 0.46592, + "grad_norm": 1.1344654560089111, + "learning_rate": 1.2892991197069575e-05, + "loss": 0.1063, + "step": 23296 + }, + { + "epoch": 0.46596, + "grad_norm": 0.04572427272796631, + "learning_rate": 1.2891654611444295e-05, + "loss": 0.0017, + "step": 23298 + }, + { + "epoch": 0.466, + "grad_norm": 3.100804567337036, + "learning_rate": 1.2890317969444716e-05, + "loss": 0.0442, + "step": 23300 + }, + { + "epoch": 0.46604, + "grad_norm": 0.2006092071533203, + "learning_rate": 1.2888981271096902e-05, + "loss": 0.0107, + "step": 23302 + }, + { + "epoch": 0.46608, + "grad_norm": 0.2644574046134949, + "learning_rate": 1.2887644516426913e-05, + "loss": 0.0417, + "step": 23304 + }, + { + "epoch": 0.46612, + "grad_norm": 0.2063608318567276, + "learning_rate": 1.2886307705460807e-05, + "loss": 0.0029, + "step": 23306 + }, + { + "epoch": 0.46616, + "grad_norm": 0.04582366719841957, + "learning_rate": 1.2884970838224644e-05, + "loss": 0.0012, + "step": 23308 + }, + { + "epoch": 0.4662, + "grad_norm": 0.08100147545337677, + "learning_rate": 1.2883633914744493e-05, + "loss": 0.0026, + "step": 23310 + }, + { + "epoch": 0.46624, + "grad_norm": 1.2166075706481934, + "learning_rate": 1.2882296935046412e-05, + "loss": 0.02, + "step": 23312 + }, + { + "epoch": 0.46628, + "grad_norm": 9.483664512634277, + "learning_rate": 1.2880959899156468e-05, + "loss": 0.361, + "step": 23314 + }, + { + "epoch": 0.46632, + "grad_norm": 1.2525614500045776, + "learning_rate": 1.2879622807100729e-05, + "loss": 0.0161, + "step": 23316 + }, + { + "epoch": 0.46636, + "grad_norm": 0.02976984903216362, + "learning_rate": 1.2878285658905258e-05, + "loss": 0.0008, + "step": 23318 + }, + { + "epoch": 0.4664, + "grad_norm": 9.100519180297852, + "learning_rate": 1.287694845459613e-05, + "loss": 0.238, + "step": 23320 + }, + { + "epoch": 0.46644, + "grad_norm": 1.0037606954574585, + "learning_rate": 1.2875611194199407e-05, + "loss": 0.0151, + "step": 23322 + }, + { + "epoch": 0.46648, + "grad_norm": 10.804069519042969, + "learning_rate": 1.2874273877741165e-05, + "loss": 0.3015, + "step": 23324 + }, + { + "epoch": 0.46652, + "grad_norm": 0.472030907869339, + "learning_rate": 1.2872936505247476e-05, + "loss": 0.0087, + "step": 23326 + }, + { + "epoch": 0.46656, + "grad_norm": 9.42579174041748, + "learning_rate": 1.2871599076744405e-05, + "loss": 0.3455, + "step": 23328 + }, + { + "epoch": 0.4666, + "grad_norm": 12.832179069519043, + "learning_rate": 1.2870261592258038e-05, + "loss": 0.6565, + "step": 23330 + }, + { + "epoch": 0.46664, + "grad_norm": 8.11945915222168, + "learning_rate": 1.286892405181444e-05, + "loss": 0.1562, + "step": 23332 + }, + { + "epoch": 0.46668, + "grad_norm": 0.3427315354347229, + "learning_rate": 1.286758645543969e-05, + "loss": 0.0056, + "step": 23334 + }, + { + "epoch": 0.46672, + "grad_norm": 0.2128485143184662, + "learning_rate": 1.2866248803159871e-05, + "loss": 0.0055, + "step": 23336 + }, + { + "epoch": 0.46676, + "grad_norm": 0.36540448665618896, + "learning_rate": 1.286491109500105e-05, + "loss": 0.0088, + "step": 23338 + }, + { + "epoch": 0.4668, + "grad_norm": 0.13256210088729858, + "learning_rate": 1.2863573330989315e-05, + "loss": 0.0041, + "step": 23340 + }, + { + "epoch": 0.46684, + "grad_norm": 0.15809954702854156, + "learning_rate": 1.2862235511150742e-05, + "loss": 0.0031, + "step": 23342 + }, + { + "epoch": 0.46688, + "grad_norm": 14.813865661621094, + "learning_rate": 1.2860897635511416e-05, + "loss": 0.4568, + "step": 23344 + }, + { + "epoch": 0.46692, + "grad_norm": 3.4385087490081787, + "learning_rate": 1.2859559704097417e-05, + "loss": 0.1118, + "step": 23346 + }, + { + "epoch": 0.46696, + "grad_norm": 2.2498342990875244, + "learning_rate": 1.2858221716934828e-05, + "loss": 0.0985, + "step": 23348 + }, + { + "epoch": 0.467, + "grad_norm": 0.620111346244812, + "learning_rate": 1.2856883674049736e-05, + "loss": 0.0137, + "step": 23350 + }, + { + "epoch": 0.46704, + "grad_norm": 1.0745933055877686, + "learning_rate": 1.2855545575468229e-05, + "loss": 0.0529, + "step": 23352 + }, + { + "epoch": 0.46708, + "grad_norm": 0.24348184466362, + "learning_rate": 1.2854207421216387e-05, + "loss": 0.0119, + "step": 23354 + }, + { + "epoch": 0.46712, + "grad_norm": 1.0990605354309082, + "learning_rate": 1.2852869211320303e-05, + "loss": 0.2694, + "step": 23356 + }, + { + "epoch": 0.46716, + "grad_norm": 1.9293162822723389, + "learning_rate": 1.2851530945806063e-05, + "loss": 0.0428, + "step": 23358 + }, + { + "epoch": 0.4672, + "grad_norm": 2.3007495403289795, + "learning_rate": 1.2850192624699762e-05, + "loss": 0.0456, + "step": 23360 + }, + { + "epoch": 0.46724, + "grad_norm": 0.119877390563488, + "learning_rate": 1.2848854248027487e-05, + "loss": 0.0166, + "step": 23362 + }, + { + "epoch": 0.46728, + "grad_norm": 0.13423989713191986, + "learning_rate": 1.2847515815815334e-05, + "loss": 0.0134, + "step": 23364 + }, + { + "epoch": 0.46732, + "grad_norm": 4.650007724761963, + "learning_rate": 1.2846177328089392e-05, + "loss": 0.1113, + "step": 23366 + }, + { + "epoch": 0.46736, + "grad_norm": 0.06850983202457428, + "learning_rate": 1.2844838784875759e-05, + "loss": 0.0314, + "step": 23368 + }, + { + "epoch": 0.4674, + "grad_norm": 0.7167955636978149, + "learning_rate": 1.2843500186200529e-05, + "loss": 0.0157, + "step": 23370 + }, + { + "epoch": 0.46744, + "grad_norm": 6.009052276611328, + "learning_rate": 1.2842161532089796e-05, + "loss": 0.1414, + "step": 23372 + }, + { + "epoch": 0.46748, + "grad_norm": 0.5077647566795349, + "learning_rate": 1.2840822822569663e-05, + "loss": 0.0121, + "step": 23374 + }, + { + "epoch": 0.46752, + "grad_norm": 0.12338085472583771, + "learning_rate": 1.2839484057666228e-05, + "loss": 0.0052, + "step": 23376 + }, + { + "epoch": 0.46756, + "grad_norm": 7.1527838706970215, + "learning_rate": 1.2838145237405588e-05, + "loss": 0.1711, + "step": 23378 + }, + { + "epoch": 0.4676, + "grad_norm": 0.22065600752830505, + "learning_rate": 1.2836806361813846e-05, + "loss": 0.2076, + "step": 23380 + }, + { + "epoch": 0.46764, + "grad_norm": 1.4535212516784668, + "learning_rate": 1.2835467430917103e-05, + "loss": 0.0205, + "step": 23382 + }, + { + "epoch": 0.46768, + "grad_norm": 0.22947511076927185, + "learning_rate": 1.283412844474146e-05, + "loss": 0.0178, + "step": 23384 + }, + { + "epoch": 0.46772, + "grad_norm": 1.128913402557373, + "learning_rate": 1.2832789403313029e-05, + "loss": 0.0267, + "step": 23386 + }, + { + "epoch": 0.46776, + "grad_norm": 0.7041154503822327, + "learning_rate": 1.2831450306657906e-05, + "loss": 0.0135, + "step": 23388 + }, + { + "epoch": 0.4678, + "grad_norm": 0.19774992763996124, + "learning_rate": 1.2830111154802203e-05, + "loss": 0.0034, + "step": 23390 + }, + { + "epoch": 0.46784, + "grad_norm": 0.24018090963363647, + "learning_rate": 1.2828771947772025e-05, + "loss": 0.005, + "step": 23392 + }, + { + "epoch": 0.46788, + "grad_norm": 0.10654724389314651, + "learning_rate": 1.2827432685593483e-05, + "loss": 0.0079, + "step": 23394 + }, + { + "epoch": 0.46792, + "grad_norm": 0.04782687872648239, + "learning_rate": 1.2826093368292687e-05, + "loss": 0.0018, + "step": 23396 + }, + { + "epoch": 0.46796, + "grad_norm": 0.10599657893180847, + "learning_rate": 1.2824753995895743e-05, + "loss": 0.0028, + "step": 23398 + }, + { + "epoch": 0.468, + "grad_norm": 0.012996667996048927, + "learning_rate": 1.2823414568428767e-05, + "loss": 0.2347, + "step": 23400 + }, + { + "epoch": 0.46804, + "grad_norm": 0.022246479988098145, + "learning_rate": 1.282207508591787e-05, + "loss": 0.0475, + "step": 23402 + }, + { + "epoch": 0.46808, + "grad_norm": 9.639910697937012, + "learning_rate": 1.2820735548389166e-05, + "loss": 0.4905, + "step": 23404 + }, + { + "epoch": 0.46812, + "grad_norm": 0.10066027194261551, + "learning_rate": 1.2819395955868767e-05, + "loss": 0.0164, + "step": 23406 + }, + { + "epoch": 0.46816, + "grad_norm": 0.31906628608703613, + "learning_rate": 1.2818056308382795e-05, + "loss": 0.0843, + "step": 23408 + }, + { + "epoch": 0.4682, + "grad_norm": 0.5322705507278442, + "learning_rate": 1.2816716605957366e-05, + "loss": 0.0105, + "step": 23410 + }, + { + "epoch": 0.46824, + "grad_norm": 2.4871647357940674, + "learning_rate": 1.2815376848618595e-05, + "loss": 0.04, + "step": 23412 + }, + { + "epoch": 0.46828, + "grad_norm": 2.743516445159912, + "learning_rate": 1.2814037036392605e-05, + "loss": 0.3362, + "step": 23414 + }, + { + "epoch": 0.46832, + "grad_norm": 0.3856973946094513, + "learning_rate": 1.2812697169305512e-05, + "loss": 0.0081, + "step": 23416 + }, + { + "epoch": 0.46836, + "grad_norm": 1.5544463396072388, + "learning_rate": 1.2811357247383443e-05, + "loss": 0.0232, + "step": 23418 + }, + { + "epoch": 0.4684, + "grad_norm": 0.19762647151947021, + "learning_rate": 1.2810017270652513e-05, + "loss": 0.0109, + "step": 23420 + }, + { + "epoch": 0.46844, + "grad_norm": 0.5101973414421082, + "learning_rate": 1.280867723913885e-05, + "loss": 0.3092, + "step": 23422 + }, + { + "epoch": 0.46848, + "grad_norm": 6.558561325073242, + "learning_rate": 1.2807337152868578e-05, + "loss": 0.1429, + "step": 23424 + }, + { + "epoch": 0.46852, + "grad_norm": 2.7506446838378906, + "learning_rate": 1.2805997011867825e-05, + "loss": 0.0347, + "step": 23426 + }, + { + "epoch": 0.46856, + "grad_norm": 0.13016492128372192, + "learning_rate": 1.2804656816162715e-05, + "loss": 0.0098, + "step": 23428 + }, + { + "epoch": 0.4686, + "grad_norm": 0.8748236894607544, + "learning_rate": 1.2803316565779378e-05, + "loss": 0.051, + "step": 23430 + }, + { + "epoch": 0.46864, + "grad_norm": 0.2981139123439789, + "learning_rate": 1.2801976260743937e-05, + "loss": 0.0527, + "step": 23432 + }, + { + "epoch": 0.46868, + "grad_norm": 8.992562294006348, + "learning_rate": 1.280063590108253e-05, + "loss": 0.3391, + "step": 23434 + }, + { + "epoch": 0.46872, + "grad_norm": 0.09334604442119598, + "learning_rate": 1.2799295486821281e-05, + "loss": 0.0032, + "step": 23436 + }, + { + "epoch": 0.46876, + "grad_norm": 2.8799455165863037, + "learning_rate": 1.2797955017986328e-05, + "loss": 0.0883, + "step": 23438 + }, + { + "epoch": 0.4688, + "grad_norm": 0.12806819379329681, + "learning_rate": 1.27966144946038e-05, + "loss": 0.0047, + "step": 23440 + }, + { + "epoch": 0.46884, + "grad_norm": 1.2272675037384033, + "learning_rate": 1.2795273916699836e-05, + "loss": 0.0457, + "step": 23442 + }, + { + "epoch": 0.46888, + "grad_norm": 0.44575607776641846, + "learning_rate": 1.2793933284300564e-05, + "loss": 0.024, + "step": 23444 + }, + { + "epoch": 0.46892, + "grad_norm": 0.5016786456108093, + "learning_rate": 1.2792592597432123e-05, + "loss": 0.1337, + "step": 23446 + }, + { + "epoch": 0.46896, + "grad_norm": 7.934219837188721, + "learning_rate": 1.2791251856120655e-05, + "loss": 0.5815, + "step": 23448 + }, + { + "epoch": 0.469, + "grad_norm": 0.4206535816192627, + "learning_rate": 1.2789911060392295e-05, + "loss": 0.008, + "step": 23450 + }, + { + "epoch": 0.46904, + "grad_norm": 0.781876802444458, + "learning_rate": 1.278857021027318e-05, + "loss": 0.0157, + "step": 23452 + }, + { + "epoch": 0.46908, + "grad_norm": 0.2135469764471054, + "learning_rate": 1.2787229305789454e-05, + "loss": 0.0059, + "step": 23454 + }, + { + "epoch": 0.46912, + "grad_norm": 0.3117426037788391, + "learning_rate": 1.278588834696726e-05, + "loss": 0.0313, + "step": 23456 + }, + { + "epoch": 0.46916, + "grad_norm": 2.6613495349884033, + "learning_rate": 1.2784547333832738e-05, + "loss": 0.2228, + "step": 23458 + }, + { + "epoch": 0.4692, + "grad_norm": 2.313399076461792, + "learning_rate": 1.278320626641203e-05, + "loss": 0.2882, + "step": 23460 + }, + { + "epoch": 0.46924, + "grad_norm": 1.5928893089294434, + "learning_rate": 1.2781865144731283e-05, + "loss": 0.1326, + "step": 23462 + }, + { + "epoch": 0.46928, + "grad_norm": 5.957620620727539, + "learning_rate": 1.2780523968816644e-05, + "loss": 0.1174, + "step": 23464 + }, + { + "epoch": 0.46932, + "grad_norm": 0.2859576642513275, + "learning_rate": 1.2779182738694257e-05, + "loss": 0.1423, + "step": 23466 + }, + { + "epoch": 0.46936, + "grad_norm": 0.16352693736553192, + "learning_rate": 1.2777841454390276e-05, + "loss": 0.0077, + "step": 23468 + }, + { + "epoch": 0.4694, + "grad_norm": 0.6968619227409363, + "learning_rate": 1.2776500115930842e-05, + "loss": 0.0163, + "step": 23470 + }, + { + "epoch": 0.46944, + "grad_norm": 4.52216100692749, + "learning_rate": 1.2775158723342108e-05, + "loss": 0.0925, + "step": 23472 + }, + { + "epoch": 0.46948, + "grad_norm": 0.1803295761346817, + "learning_rate": 1.2773817276650228e-05, + "loss": 0.0136, + "step": 23474 + }, + { + "epoch": 0.46952, + "grad_norm": 0.2776285707950592, + "learning_rate": 1.2772475775881353e-05, + "loss": 0.0556, + "step": 23476 + }, + { + "epoch": 0.46956, + "grad_norm": 3.1826963424682617, + "learning_rate": 1.2771134221061632e-05, + "loss": 0.2368, + "step": 23478 + }, + { + "epoch": 0.4696, + "grad_norm": 1.1680593490600586, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.3006, + "step": 23480 + }, + { + "epoch": 0.46964, + "grad_norm": 0.3023163378238678, + "learning_rate": 1.2768450949374285e-05, + "loss": 0.0229, + "step": 23482 + }, + { + "epoch": 0.46968, + "grad_norm": 5.942921161651611, + "learning_rate": 1.276710923255897e-05, + "loss": 0.0978, + "step": 23484 + }, + { + "epoch": 0.46972, + "grad_norm": 0.21421153843402863, + "learning_rate": 1.2765767461797433e-05, + "loss": 0.0408, + "step": 23486 + }, + { + "epoch": 0.46976, + "grad_norm": 0.02955026924610138, + "learning_rate": 1.2764425637115836e-05, + "loss": 0.0009, + "step": 23488 + }, + { + "epoch": 0.4698, + "grad_norm": 0.5344889760017395, + "learning_rate": 1.2763083758540337e-05, + "loss": 0.0165, + "step": 23490 + }, + { + "epoch": 0.46984, + "grad_norm": 1.3777315616607666, + "learning_rate": 1.27617418260971e-05, + "loss": 0.064, + "step": 23492 + }, + { + "epoch": 0.46988, + "grad_norm": 0.5639392733573914, + "learning_rate": 1.2760399839812282e-05, + "loss": 0.1709, + "step": 23494 + }, + { + "epoch": 0.46992, + "grad_norm": 0.1260940134525299, + "learning_rate": 1.275905779971205e-05, + "loss": 0.0037, + "step": 23496 + }, + { + "epoch": 0.46996, + "grad_norm": 0.10517417639493942, + "learning_rate": 1.2757715705822566e-05, + "loss": 0.0394, + "step": 23498 + }, + { + "epoch": 0.47, + "grad_norm": 3.7686665058135986, + "learning_rate": 1.2756373558169992e-05, + "loss": 0.0798, + "step": 23500 + }, + { + "epoch": 0.47004, + "grad_norm": 2.9853034019470215, + "learning_rate": 1.27550313567805e-05, + "loss": 0.0691, + "step": 23502 + }, + { + "epoch": 0.47008, + "grad_norm": 0.10408655554056168, + "learning_rate": 1.2753689101680252e-05, + "loss": 0.0011, + "step": 23504 + }, + { + "epoch": 0.47012, + "grad_norm": 0.1417813003063202, + "learning_rate": 1.2752346792895413e-05, + "loss": 0.0023, + "step": 23506 + }, + { + "epoch": 0.47016, + "grad_norm": 2.847541332244873, + "learning_rate": 1.2751004430452164e-05, + "loss": 0.048, + "step": 23508 + }, + { + "epoch": 0.4702, + "grad_norm": 0.011720442213118076, + "learning_rate": 1.2749662014376662e-05, + "loss": 0.0003, + "step": 23510 + }, + { + "epoch": 0.47024, + "grad_norm": 1.1164864301681519, + "learning_rate": 1.2748319544695082e-05, + "loss": 0.017, + "step": 23512 + }, + { + "epoch": 0.47028, + "grad_norm": 10.151074409484863, + "learning_rate": 1.27469770214336e-05, + "loss": 0.1898, + "step": 23514 + }, + { + "epoch": 0.47032, + "grad_norm": 5.806954860687256, + "learning_rate": 1.2745634444618389e-05, + "loss": 0.1398, + "step": 23516 + }, + { + "epoch": 0.47036, + "grad_norm": 2.1932497024536133, + "learning_rate": 1.274429181427562e-05, + "loss": 0.0377, + "step": 23518 + }, + { + "epoch": 0.4704, + "grad_norm": 0.12536126375198364, + "learning_rate": 1.2742949130431468e-05, + "loss": 0.0039, + "step": 23520 + }, + { + "epoch": 0.47044, + "grad_norm": 0.6606065630912781, + "learning_rate": 1.2741606393112109e-05, + "loss": 0.0087, + "step": 23522 + }, + { + "epoch": 0.47048, + "grad_norm": 0.09036335349082947, + "learning_rate": 1.2740263602343726e-05, + "loss": 0.1393, + "step": 23524 + }, + { + "epoch": 0.47052, + "grad_norm": 9.56493854522705, + "learning_rate": 1.2738920758152488e-05, + "loss": 0.1762, + "step": 23526 + }, + { + "epoch": 0.47056, + "grad_norm": 1.7119274139404297, + "learning_rate": 1.2737577860564583e-05, + "loss": 0.1007, + "step": 23528 + }, + { + "epoch": 0.4706, + "grad_norm": 0.6305935382843018, + "learning_rate": 1.2736234909606186e-05, + "loss": 0.4565, + "step": 23530 + }, + { + "epoch": 0.47064, + "grad_norm": 0.5219892263412476, + "learning_rate": 1.2734891905303484e-05, + "loss": 0.0136, + "step": 23532 + }, + { + "epoch": 0.47068, + "grad_norm": 9.247647285461426, + "learning_rate": 1.2733548847682653e-05, + "loss": 0.1164, + "step": 23534 + }, + { + "epoch": 0.47072, + "grad_norm": 1.6315861940383911, + "learning_rate": 1.2732205736769885e-05, + "loss": 0.0194, + "step": 23536 + }, + { + "epoch": 0.47076, + "grad_norm": 1.4097293615341187, + "learning_rate": 1.2730862572591353e-05, + "loss": 0.0211, + "step": 23538 + }, + { + "epoch": 0.4708, + "grad_norm": 0.05567552149295807, + "learning_rate": 1.2729519355173254e-05, + "loss": 0.0021, + "step": 23540 + }, + { + "epoch": 0.47084, + "grad_norm": 0.23996679484844208, + "learning_rate": 1.2728176084541768e-05, + "loss": 0.0149, + "step": 23542 + }, + { + "epoch": 0.47088, + "grad_norm": 0.7040707468986511, + "learning_rate": 1.2726832760723084e-05, + "loss": 0.0728, + "step": 23544 + }, + { + "epoch": 0.47092, + "grad_norm": 1.1942986249923706, + "learning_rate": 1.2725489383743393e-05, + "loss": 0.0242, + "step": 23546 + }, + { + "epoch": 0.47096, + "grad_norm": 0.5411656498908997, + "learning_rate": 1.2724145953628883e-05, + "loss": 0.0114, + "step": 23548 + }, + { + "epoch": 0.471, + "grad_norm": 0.7068665623664856, + "learning_rate": 1.2722802470405744e-05, + "loss": 0.0124, + "step": 23550 + }, + { + "epoch": 0.47104, + "grad_norm": 0.6337742209434509, + "learning_rate": 1.2721458934100172e-05, + "loss": 0.0123, + "step": 23552 + }, + { + "epoch": 0.47108, + "grad_norm": 0.13842423260211945, + "learning_rate": 1.2720115344738354e-05, + "loss": 0.0663, + "step": 23554 + }, + { + "epoch": 0.47112, + "grad_norm": 0.8281781077384949, + "learning_rate": 1.2718771702346489e-05, + "loss": 0.0141, + "step": 23556 + }, + { + "epoch": 0.47116, + "grad_norm": 0.18641123175621033, + "learning_rate": 1.2717428006950767e-05, + "loss": 0.0052, + "step": 23558 + }, + { + "epoch": 0.4712, + "grad_norm": 2.840365409851074, + "learning_rate": 1.2716084258577388e-05, + "loss": 0.0361, + "step": 23560 + }, + { + "epoch": 0.47124, + "grad_norm": 0.07420391589403152, + "learning_rate": 1.2714740457252547e-05, + "loss": 0.0021, + "step": 23562 + }, + { + "epoch": 0.47128, + "grad_norm": 0.006524605210870504, + "learning_rate": 1.2713396603002449e-05, + "loss": 0.0031, + "step": 23564 + }, + { + "epoch": 0.47132, + "grad_norm": 0.13531708717346191, + "learning_rate": 1.271205269585328e-05, + "loss": 0.0032, + "step": 23566 + }, + { + "epoch": 0.47136, + "grad_norm": 0.4491540193557739, + "learning_rate": 1.2710708735831252e-05, + "loss": 0.0078, + "step": 23568 + }, + { + "epoch": 0.4714, + "grad_norm": 8.116090774536133, + "learning_rate": 1.270936472296256e-05, + "loss": 0.1323, + "step": 23570 + }, + { + "epoch": 0.47144, + "grad_norm": 4.034780025482178, + "learning_rate": 1.270802065727341e-05, + "loss": 0.0435, + "step": 23572 + }, + { + "epoch": 0.47148, + "grad_norm": 15.192484855651855, + "learning_rate": 1.270667653879e-05, + "loss": 0.4343, + "step": 23574 + }, + { + "epoch": 0.47152, + "grad_norm": 0.08001242578029633, + "learning_rate": 1.2705332367538539e-05, + "loss": 0.0019, + "step": 23576 + }, + { + "epoch": 0.47156, + "grad_norm": 0.10833963006734848, + "learning_rate": 1.270398814354523e-05, + "loss": 0.2362, + "step": 23578 + }, + { + "epoch": 0.4716, + "grad_norm": 7.556113243103027, + "learning_rate": 1.270264386683628e-05, + "loss": 0.1005, + "step": 23580 + }, + { + "epoch": 0.47164, + "grad_norm": 0.17943783104419708, + "learning_rate": 1.2701299537437896e-05, + "loss": 0.0024, + "step": 23582 + }, + { + "epoch": 0.47168, + "grad_norm": 9.21529483795166, + "learning_rate": 1.2699955155376287e-05, + "loss": 0.2048, + "step": 23584 + }, + { + "epoch": 0.47172, + "grad_norm": 3.4593613147735596, + "learning_rate": 1.2698610720677664e-05, + "loss": 0.0655, + "step": 23586 + }, + { + "epoch": 0.47176, + "grad_norm": 0.03525862470269203, + "learning_rate": 1.2697266233368234e-05, + "loss": 0.0245, + "step": 23588 + }, + { + "epoch": 0.4718, + "grad_norm": 0.14275594055652618, + "learning_rate": 1.2695921693474211e-05, + "loss": 0.0044, + "step": 23590 + }, + { + "epoch": 0.47184, + "grad_norm": 0.03240374103188515, + "learning_rate": 1.2694577101021805e-05, + "loss": 0.1629, + "step": 23592 + }, + { + "epoch": 0.47188, + "grad_norm": 0.009285668842494488, + "learning_rate": 1.2693232456037233e-05, + "loss": 0.0026, + "step": 23594 + }, + { + "epoch": 0.47192, + "grad_norm": 0.24841491878032684, + "learning_rate": 1.2691887758546706e-05, + "loss": 0.0123, + "step": 23596 + }, + { + "epoch": 0.47196, + "grad_norm": 7.748345851898193, + "learning_rate": 1.2690543008576443e-05, + "loss": 0.138, + "step": 23598 + }, + { + "epoch": 0.472, + "grad_norm": 0.01859770342707634, + "learning_rate": 1.2689198206152657e-05, + "loss": 0.0108, + "step": 23600 + }, + { + "epoch": 0.47204, + "grad_norm": 15.256397247314453, + "learning_rate": 1.2687853351301569e-05, + "loss": 0.4321, + "step": 23602 + }, + { + "epoch": 0.47208, + "grad_norm": 14.891229629516602, + "learning_rate": 1.2686508444049397e-05, + "loss": 0.4019, + "step": 23604 + }, + { + "epoch": 0.47212, + "grad_norm": 0.025440629571676254, + "learning_rate": 1.268516348442236e-05, + "loss": 0.0038, + "step": 23606 + }, + { + "epoch": 0.47216, + "grad_norm": 1.0725206136703491, + "learning_rate": 1.2683818472446677e-05, + "loss": 0.1689, + "step": 23608 + }, + { + "epoch": 0.4722, + "grad_norm": 4.4425048828125, + "learning_rate": 1.268247340814857e-05, + "loss": 0.0835, + "step": 23610 + }, + { + "epoch": 0.47224, + "grad_norm": 0.3875373899936676, + "learning_rate": 1.2681128291554263e-05, + "loss": 0.0202, + "step": 23612 + }, + { + "epoch": 0.47228, + "grad_norm": 7.7451629638671875, + "learning_rate": 1.2679783122689982e-05, + "loss": 0.1635, + "step": 23614 + }, + { + "epoch": 0.47232, + "grad_norm": 15.229622840881348, + "learning_rate": 1.2678437901581945e-05, + "loss": 0.3847, + "step": 23616 + }, + { + "epoch": 0.47236, + "grad_norm": 0.29059654474258423, + "learning_rate": 1.2677092628256386e-05, + "loss": 0.0084, + "step": 23618 + }, + { + "epoch": 0.4724, + "grad_norm": 1.8465328216552734, + "learning_rate": 1.2675747302739528e-05, + "loss": 0.0206, + "step": 23620 + }, + { + "epoch": 0.47244, + "grad_norm": 1.012139916419983, + "learning_rate": 1.26744019250576e-05, + "loss": 0.0224, + "step": 23622 + }, + { + "epoch": 0.47248, + "grad_norm": 3.6207337379455566, + "learning_rate": 1.2673056495236825e-05, + "loss": 0.0622, + "step": 23624 + }, + { + "epoch": 0.47252, + "grad_norm": 0.014693282544612885, + "learning_rate": 1.2671711013303441e-05, + "loss": 0.127, + "step": 23626 + }, + { + "epoch": 0.47256, + "grad_norm": 0.9001736640930176, + "learning_rate": 1.2670365479283674e-05, + "loss": 0.0137, + "step": 23628 + }, + { + "epoch": 0.4726, + "grad_norm": 1.3141943216323853, + "learning_rate": 1.2669019893203758e-05, + "loss": 0.1431, + "step": 23630 + }, + { + "epoch": 0.47264, + "grad_norm": 0.12340354174375534, + "learning_rate": 1.2667674255089923e-05, + "loss": 0.0604, + "step": 23632 + }, + { + "epoch": 0.47268, + "grad_norm": 0.12639841437339783, + "learning_rate": 1.2666328564968407e-05, + "loss": 0.002, + "step": 23634 + }, + { + "epoch": 0.47272, + "grad_norm": 2.851219892501831, + "learning_rate": 1.2664982822865441e-05, + "loss": 0.0595, + "step": 23636 + }, + { + "epoch": 0.47276, + "grad_norm": 9.588460922241211, + "learning_rate": 1.2663637028807265e-05, + "loss": 0.2297, + "step": 23638 + }, + { + "epoch": 0.4728, + "grad_norm": 3.3348615169525146, + "learning_rate": 1.2662291182820115e-05, + "loss": 0.0771, + "step": 23640 + }, + { + "epoch": 0.47284, + "grad_norm": 0.060948409140110016, + "learning_rate": 1.2660945284930226e-05, + "loss": 0.0091, + "step": 23642 + }, + { + "epoch": 0.47288, + "grad_norm": 2.46496844291687, + "learning_rate": 1.2659599335163836e-05, + "loss": 0.043, + "step": 23644 + }, + { + "epoch": 0.47292, + "grad_norm": 2.773106575012207, + "learning_rate": 1.2658253333547192e-05, + "loss": 0.1837, + "step": 23646 + }, + { + "epoch": 0.47296, + "grad_norm": 0.11077375710010529, + "learning_rate": 1.2656907280106528e-05, + "loss": 0.0019, + "step": 23648 + }, + { + "epoch": 0.473, + "grad_norm": 0.5063719749450684, + "learning_rate": 1.265556117486809e-05, + "loss": 0.0085, + "step": 23650 + }, + { + "epoch": 0.47304, + "grad_norm": 0.7223660349845886, + "learning_rate": 1.2654215017858119e-05, + "loss": 0.0122, + "step": 23652 + }, + { + "epoch": 0.47308, + "grad_norm": 10.433505058288574, + "learning_rate": 1.265286880910286e-05, + "loss": 0.3377, + "step": 23654 + }, + { + "epoch": 0.47312, + "grad_norm": 0.005545246414840221, + "learning_rate": 1.2651522548628559e-05, + "loss": 0.301, + "step": 23656 + }, + { + "epoch": 0.47316, + "grad_norm": 0.4838094711303711, + "learning_rate": 1.2650176236461457e-05, + "loss": 0.008, + "step": 23658 + }, + { + "epoch": 0.4732, + "grad_norm": 0.27463823556900024, + "learning_rate": 1.2648829872627809e-05, + "loss": 0.0513, + "step": 23660 + }, + { + "epoch": 0.47324, + "grad_norm": 0.39382848143577576, + "learning_rate": 1.2647483457153857e-05, + "loss": 0.0278, + "step": 23662 + }, + { + "epoch": 0.47328, + "grad_norm": 0.2843596637248993, + "learning_rate": 1.2646136990065854e-05, + "loss": 0.0051, + "step": 23664 + }, + { + "epoch": 0.47332, + "grad_norm": 1.9458723068237305, + "learning_rate": 1.2644790471390045e-05, + "loss": 0.0882, + "step": 23666 + }, + { + "epoch": 0.47336, + "grad_norm": 0.045301344245672226, + "learning_rate": 1.2643443901152685e-05, + "loss": 0.0045, + "step": 23668 + }, + { + "epoch": 0.4734, + "grad_norm": 0.10080362111330032, + "learning_rate": 1.2642097279380025e-05, + "loss": 0.0253, + "step": 23670 + }, + { + "epoch": 0.47344, + "grad_norm": 1.824454426765442, + "learning_rate": 1.264075060609832e-05, + "loss": 0.0303, + "step": 23672 + }, + { + "epoch": 0.47348, + "grad_norm": 0.4299186170101166, + "learning_rate": 1.2639403881333822e-05, + "loss": 0.5345, + "step": 23674 + }, + { + "epoch": 0.47352, + "grad_norm": 3.0561320781707764, + "learning_rate": 1.2638057105112787e-05, + "loss": 0.0538, + "step": 23676 + }, + { + "epoch": 0.47356, + "grad_norm": 7.61189079284668, + "learning_rate": 1.263671027746147e-05, + "loss": 0.1555, + "step": 23678 + }, + { + "epoch": 0.4736, + "grad_norm": 3.464015245437622, + "learning_rate": 1.263536339840613e-05, + "loss": 0.0728, + "step": 23680 + }, + { + "epoch": 0.47364, + "grad_norm": 11.509221076965332, + "learning_rate": 1.2634016467973018e-05, + "loss": 0.2704, + "step": 23682 + }, + { + "epoch": 0.47368, + "grad_norm": 9.341641426086426, + "learning_rate": 1.2632669486188403e-05, + "loss": 0.203, + "step": 23684 + }, + { + "epoch": 0.47372, + "grad_norm": 0.22303611040115356, + "learning_rate": 1.263132245307854e-05, + "loss": 0.0051, + "step": 23686 + }, + { + "epoch": 0.47376, + "grad_norm": 0.1479979306459427, + "learning_rate": 1.2629975368669694e-05, + "loss": 0.0363, + "step": 23688 + }, + { + "epoch": 0.4738, + "grad_norm": 8.81959342956543, + "learning_rate": 1.2628628232988123e-05, + "loss": 0.3138, + "step": 23690 + }, + { + "epoch": 0.47384, + "grad_norm": 0.7171793580055237, + "learning_rate": 1.262728104606009e-05, + "loss": 0.244, + "step": 23692 + }, + { + "epoch": 0.47388, + "grad_norm": 5.558905124664307, + "learning_rate": 1.2625933807911859e-05, + "loss": 0.072, + "step": 23694 + }, + { + "epoch": 0.47392, + "grad_norm": 0.6509491205215454, + "learning_rate": 1.2624586518569699e-05, + "loss": 0.0141, + "step": 23696 + }, + { + "epoch": 0.47396, + "grad_norm": 3.5871176719665527, + "learning_rate": 1.2623239178059874e-05, + "loss": 0.0719, + "step": 23698 + }, + { + "epoch": 0.474, + "grad_norm": 0.20595665276050568, + "learning_rate": 1.2621891786408648e-05, + "loss": 0.0489, + "step": 23700 + }, + { + "epoch": 0.47404, + "grad_norm": 0.24248579144477844, + "learning_rate": 1.262054434364229e-05, + "loss": 0.0514, + "step": 23702 + }, + { + "epoch": 0.47408, + "grad_norm": 5.957839012145996, + "learning_rate": 1.2619196849787078e-05, + "loss": 0.1079, + "step": 23704 + }, + { + "epoch": 0.47412, + "grad_norm": 1.390537142753601, + "learning_rate": 1.2617849304869272e-05, + "loss": 0.0766, + "step": 23706 + }, + { + "epoch": 0.47416, + "grad_norm": 0.11186898499727249, + "learning_rate": 1.2616501708915145e-05, + "loss": 0.128, + "step": 23708 + }, + { + "epoch": 0.4742, + "grad_norm": 0.01932678185403347, + "learning_rate": 1.261515406195097e-05, + "loss": 0.3189, + "step": 23710 + }, + { + "epoch": 0.47424, + "grad_norm": 0.49027401208877563, + "learning_rate": 1.2613806364003023e-05, + "loss": 0.0443, + "step": 23712 + }, + { + "epoch": 0.47428, + "grad_norm": 14.494041442871094, + "learning_rate": 1.2612458615097571e-05, + "loss": 0.6663, + "step": 23714 + }, + { + "epoch": 0.47432, + "grad_norm": 0.01437304075807333, + "learning_rate": 1.2611110815260895e-05, + "loss": 0.0032, + "step": 23716 + }, + { + "epoch": 0.47436, + "grad_norm": 11.84504222869873, + "learning_rate": 1.2609762964519268e-05, + "loss": 0.3837, + "step": 23718 + }, + { + "epoch": 0.4744, + "grad_norm": 0.7123515605926514, + "learning_rate": 1.2608415062898971e-05, + "loss": 0.0342, + "step": 23720 + }, + { + "epoch": 0.47444, + "grad_norm": 12.142621040344238, + "learning_rate": 1.2607067110426279e-05, + "loss": 0.3386, + "step": 23722 + }, + { + "epoch": 0.47448, + "grad_norm": 0.19503018260002136, + "learning_rate": 1.2605719107127473e-05, + "loss": 0.0029, + "step": 23724 + }, + { + "epoch": 0.47452, + "grad_norm": 11.252654075622559, + "learning_rate": 1.2604371053028826e-05, + "loss": 0.6739, + "step": 23726 + }, + { + "epoch": 0.47456, + "grad_norm": 0.09086796641349792, + "learning_rate": 1.260302294815663e-05, + "loss": 0.0166, + "step": 23728 + }, + { + "epoch": 0.4746, + "grad_norm": 0.2233055979013443, + "learning_rate": 1.2601674792537157e-05, + "loss": 0.18, + "step": 23730 + }, + { + "epoch": 0.47464, + "grad_norm": 7.757842540740967, + "learning_rate": 1.2600326586196696e-05, + "loss": 0.1073, + "step": 23732 + }, + { + "epoch": 0.47468, + "grad_norm": 0.6904750466346741, + "learning_rate": 1.259897832916153e-05, + "loss": 0.2945, + "step": 23734 + }, + { + "epoch": 0.47472, + "grad_norm": 0.2892902195453644, + "learning_rate": 1.2597630021457945e-05, + "loss": 0.0253, + "step": 23736 + }, + { + "epoch": 0.47476, + "grad_norm": 1.926376223564148, + "learning_rate": 1.2596281663112224e-05, + "loss": 0.0401, + "step": 23738 + }, + { + "epoch": 0.4748, + "grad_norm": 8.719735145568848, + "learning_rate": 1.2594933254150654e-05, + "loss": 0.2114, + "step": 23740 + }, + { + "epoch": 0.47484, + "grad_norm": 1.3817473649978638, + "learning_rate": 1.2593584794599528e-05, + "loss": 0.0347, + "step": 23742 + }, + { + "epoch": 0.47488, + "grad_norm": 0.08177917450666428, + "learning_rate": 1.2592236284485129e-05, + "loss": 0.0041, + "step": 23744 + }, + { + "epoch": 0.47492, + "grad_norm": 0.014458193443715572, + "learning_rate": 1.259088772383375e-05, + "loss": 0.0007, + "step": 23746 + }, + { + "epoch": 0.47496, + "grad_norm": 1.7446435689926147, + "learning_rate": 1.2589539112671678e-05, + "loss": 0.0241, + "step": 23748 + }, + { + "epoch": 0.475, + "grad_norm": 7.755382537841797, + "learning_rate": 1.2588190451025209e-05, + "loss": 0.1907, + "step": 23750 + }, + { + "epoch": 0.47504, + "grad_norm": 0.09321743249893188, + "learning_rate": 1.2586841738920637e-05, + "loss": 0.0036, + "step": 23752 + }, + { + "epoch": 0.47508, + "grad_norm": 6.370089054107666, + "learning_rate": 1.258549297638425e-05, + "loss": 0.1231, + "step": 23754 + }, + { + "epoch": 0.47512, + "grad_norm": 0.041543155908584595, + "learning_rate": 1.2584144163442347e-05, + "loss": 0.0035, + "step": 23756 + }, + { + "epoch": 0.47516, + "grad_norm": 0.47291451692581177, + "learning_rate": 1.2582795300121226e-05, + "loss": 0.0367, + "step": 23758 + }, + { + "epoch": 0.4752, + "grad_norm": 0.36255767941474915, + "learning_rate": 1.2581446386447178e-05, + "loss": 0.0085, + "step": 23760 + }, + { + "epoch": 0.47524, + "grad_norm": 0.6766573190689087, + "learning_rate": 1.2580097422446507e-05, + "loss": 0.0107, + "step": 23762 + }, + { + "epoch": 0.47528, + "grad_norm": 0.40601062774658203, + "learning_rate": 1.2578748408145504e-05, + "loss": 0.0114, + "step": 23764 + }, + { + "epoch": 0.47532, + "grad_norm": 0.030356070026755333, + "learning_rate": 1.2577399343570475e-05, + "loss": 0.1903, + "step": 23766 + }, + { + "epoch": 0.47536, + "grad_norm": 0.38310477137565613, + "learning_rate": 1.2576050228747723e-05, + "loss": 0.0318, + "step": 23768 + }, + { + "epoch": 0.4754, + "grad_norm": 0.053774718195199966, + "learning_rate": 1.257470106370354e-05, + "loss": 0.374, + "step": 23770 + }, + { + "epoch": 0.47544, + "grad_norm": 1.3699204921722412, + "learning_rate": 1.2573351848464235e-05, + "loss": 0.5923, + "step": 23772 + }, + { + "epoch": 0.47548, + "grad_norm": 1.5535622835159302, + "learning_rate": 1.2572002583056112e-05, + "loss": 0.0255, + "step": 23774 + }, + { + "epoch": 0.47552, + "grad_norm": 0.40768012404441833, + "learning_rate": 1.2570653267505474e-05, + "loss": 0.0242, + "step": 23776 + }, + { + "epoch": 0.47556, + "grad_norm": 0.28330886363983154, + "learning_rate": 1.256930390183863e-05, + "loss": 0.0104, + "step": 23778 + }, + { + "epoch": 0.4756, + "grad_norm": 9.164889335632324, + "learning_rate": 1.256795448608188e-05, + "loss": 0.2359, + "step": 23780 + }, + { + "epoch": 0.47564, + "grad_norm": 10.7890043258667, + "learning_rate": 1.2566605020261536e-05, + "loss": 0.3987, + "step": 23782 + }, + { + "epoch": 0.47568, + "grad_norm": 4.140585422515869, + "learning_rate": 1.2565255504403905e-05, + "loss": 0.1173, + "step": 23784 + }, + { + "epoch": 0.47572, + "grad_norm": 0.16124527156352997, + "learning_rate": 1.25639059385353e-05, + "loss": 0.0114, + "step": 23786 + }, + { + "epoch": 0.47576, + "grad_norm": 3.0707128047943115, + "learning_rate": 1.2562556322682026e-05, + "loss": 0.0864, + "step": 23788 + }, + { + "epoch": 0.4758, + "grad_norm": 2.304121255874634, + "learning_rate": 1.2561206656870397e-05, + "loss": 0.0865, + "step": 23790 + }, + { + "epoch": 0.47584, + "grad_norm": 2.5866849422454834, + "learning_rate": 1.255985694112673e-05, + "loss": 0.0605, + "step": 23792 + }, + { + "epoch": 0.47588, + "grad_norm": 2.2790579795837402, + "learning_rate": 1.2558507175477328e-05, + "loss": 0.0468, + "step": 23794 + }, + { + "epoch": 0.47592, + "grad_norm": 4.735161304473877, + "learning_rate": 1.2557157359948517e-05, + "loss": 0.7245, + "step": 23796 + }, + { + "epoch": 0.47596, + "grad_norm": 2.6561660766601562, + "learning_rate": 1.2555807494566604e-05, + "loss": 0.0482, + "step": 23798 + }, + { + "epoch": 0.476, + "grad_norm": 0.5667159557342529, + "learning_rate": 1.2554457579357906e-05, + "loss": 0.0127, + "step": 23800 + }, + { + "epoch": 0.47604, + "grad_norm": 0.07455992698669434, + "learning_rate": 1.2553107614348746e-05, + "loss": 0.0146, + "step": 23802 + }, + { + "epoch": 0.47608, + "grad_norm": 0.13926978409290314, + "learning_rate": 1.2551757599565438e-05, + "loss": 0.0036, + "step": 23804 + }, + { + "epoch": 0.47612, + "grad_norm": 0.11268332600593567, + "learning_rate": 1.2550407535034299e-05, + "loss": 0.0209, + "step": 23806 + }, + { + "epoch": 0.47616, + "grad_norm": 0.049102459102869034, + "learning_rate": 1.2549057420781652e-05, + "loss": 0.1165, + "step": 23808 + }, + { + "epoch": 0.4762, + "grad_norm": 1.5012444257736206, + "learning_rate": 1.2547707256833823e-05, + "loss": 0.0377, + "step": 23810 + }, + { + "epoch": 0.47624, + "grad_norm": 1.8888638019561768, + "learning_rate": 1.2546357043217128e-05, + "loss": 0.0362, + "step": 23812 + }, + { + "epoch": 0.47628, + "grad_norm": 0.25110509991645813, + "learning_rate": 1.254500677995789e-05, + "loss": 0.0093, + "step": 23814 + }, + { + "epoch": 0.47632, + "grad_norm": 9.479930877685547, + "learning_rate": 1.2543656467082435e-05, + "loss": 0.8959, + "step": 23816 + }, + { + "epoch": 0.47636, + "grad_norm": 2.318331003189087, + "learning_rate": 1.2542306104617088e-05, + "loss": 0.052, + "step": 23818 + }, + { + "epoch": 0.4764, + "grad_norm": 1.0306791067123413, + "learning_rate": 1.2540955692588173e-05, + "loss": 0.0244, + "step": 23820 + }, + { + "epoch": 0.47644, + "grad_norm": 2.6673953533172607, + "learning_rate": 1.2539605231022019e-05, + "loss": 0.0527, + "step": 23822 + }, + { + "epoch": 0.47648, + "grad_norm": 0.0960010439157486, + "learning_rate": 1.2538254719944953e-05, + "loss": 0.0052, + "step": 23824 + }, + { + "epoch": 0.47652, + "grad_norm": 0.648833692073822, + "learning_rate": 1.2536904159383308e-05, + "loss": 0.1927, + "step": 23826 + }, + { + "epoch": 0.47656, + "grad_norm": 0.03738166764378548, + "learning_rate": 1.2535553549363407e-05, + "loss": 0.0029, + "step": 23828 + }, + { + "epoch": 0.4766, + "grad_norm": 0.39852964878082275, + "learning_rate": 1.2534202889911584e-05, + "loss": 0.0511, + "step": 23830 + }, + { + "epoch": 0.47664, + "grad_norm": 1.5549079179763794, + "learning_rate": 1.2532852181054174e-05, + "loss": 0.1681, + "step": 23832 + }, + { + "epoch": 0.47668, + "grad_norm": 0.10456269234418869, + "learning_rate": 1.2531501422817505e-05, + "loss": 0.0016, + "step": 23834 + }, + { + "epoch": 0.47672, + "grad_norm": 0.23497812449932098, + "learning_rate": 1.2530150615227911e-05, + "loss": 0.016, + "step": 23836 + }, + { + "epoch": 0.47676, + "grad_norm": 0.719590425491333, + "learning_rate": 1.252879975831173e-05, + "loss": 0.0192, + "step": 23838 + }, + { + "epoch": 0.4768, + "grad_norm": 0.3030431270599365, + "learning_rate": 1.2527448852095295e-05, + "loss": 0.0827, + "step": 23840 + }, + { + "epoch": 0.47684, + "grad_norm": 0.42284396290779114, + "learning_rate": 1.2526097896604943e-05, + "loss": 0.0088, + "step": 23842 + }, + { + "epoch": 0.47688, + "grad_norm": 1.6261942386627197, + "learning_rate": 1.2524746891867012e-05, + "loss": 0.0277, + "step": 23844 + }, + { + "epoch": 0.47692, + "grad_norm": 1.5688042640686035, + "learning_rate": 1.2523395837907843e-05, + "loss": 0.0243, + "step": 23846 + }, + { + "epoch": 0.47696, + "grad_norm": 1.4776084423065186, + "learning_rate": 1.2522044734753772e-05, + "loss": 0.1466, + "step": 23848 + }, + { + "epoch": 0.477, + "grad_norm": 12.148561477661133, + "learning_rate": 1.252069358243114e-05, + "loss": 0.3113, + "step": 23850 + }, + { + "epoch": 0.47704, + "grad_norm": 0.05208801105618477, + "learning_rate": 1.2519342380966286e-05, + "loss": 0.0047, + "step": 23852 + }, + { + "epoch": 0.47708, + "grad_norm": 1.369588851928711, + "learning_rate": 1.2517991130385558e-05, + "loss": 0.0956, + "step": 23854 + }, + { + "epoch": 0.47712, + "grad_norm": 0.8950476050376892, + "learning_rate": 1.2516639830715296e-05, + "loss": 0.021, + "step": 23856 + }, + { + "epoch": 0.47716, + "grad_norm": 3.2632977962493896, + "learning_rate": 1.2515288481981846e-05, + "loss": 0.0758, + "step": 23858 + }, + { + "epoch": 0.4772, + "grad_norm": 1.8953782320022583, + "learning_rate": 1.251393708421155e-05, + "loss": 0.0383, + "step": 23860 + }, + { + "epoch": 0.47724, + "grad_norm": 2.7043919563293457, + "learning_rate": 1.2512585637430757e-05, + "loss": 0.0584, + "step": 23862 + }, + { + "epoch": 0.47728, + "grad_norm": 11.818194389343262, + "learning_rate": 1.2511234141665816e-05, + "loss": 0.3927, + "step": 23864 + }, + { + "epoch": 0.47732, + "grad_norm": 1.2826049327850342, + "learning_rate": 1.250988259694307e-05, + "loss": 0.0635, + "step": 23866 + }, + { + "epoch": 0.47736, + "grad_norm": 0.10601619631052017, + "learning_rate": 1.250853100328887e-05, + "loss": 0.0058, + "step": 23868 + }, + { + "epoch": 0.4774, + "grad_norm": 0.11759787797927856, + "learning_rate": 1.2507179360729569e-05, + "loss": 0.1285, + "step": 23870 + }, + { + "epoch": 0.47744, + "grad_norm": 0.20572084188461304, + "learning_rate": 1.2505827669291513e-05, + "loss": 0.0084, + "step": 23872 + }, + { + "epoch": 0.47748, + "grad_norm": 0.5088813304901123, + "learning_rate": 1.250447592900106e-05, + "loss": 0.043, + "step": 23874 + }, + { + "epoch": 0.47752, + "grad_norm": 9.185209274291992, + "learning_rate": 1.2503124139884555e-05, + "loss": 0.3207, + "step": 23876 + }, + { + "epoch": 0.47756, + "grad_norm": 0.04943743720650673, + "learning_rate": 1.2501772301968358e-05, + "loss": 0.0023, + "step": 23878 + }, + { + "epoch": 0.4776, + "grad_norm": 2.511465311050415, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.0495, + "step": 23880 + }, + { + "epoch": 0.47764, + "grad_norm": 0.15890367329120636, + "learning_rate": 1.2499068479842307e-05, + "loss": 0.003, + "step": 23882 + }, + { + "epoch": 0.47768, + "grad_norm": 0.6301324367523193, + "learning_rate": 1.2497716495685159e-05, + "loss": 0.0161, + "step": 23884 + }, + { + "epoch": 0.47772, + "grad_norm": 1.1396852731704712, + "learning_rate": 1.2496364462833743e-05, + "loss": 0.0281, + "step": 23886 + }, + { + "epoch": 0.47776, + "grad_norm": 0.2419501096010208, + "learning_rate": 1.2495012381314418e-05, + "loss": 0.004, + "step": 23888 + }, + { + "epoch": 0.4778, + "grad_norm": 0.03051568567752838, + "learning_rate": 1.249366025115354e-05, + "loss": 0.001, + "step": 23890 + }, + { + "epoch": 0.47784, + "grad_norm": 0.9904881119728088, + "learning_rate": 1.2492308072377477e-05, + "loss": 0.016, + "step": 23892 + }, + { + "epoch": 0.47788, + "grad_norm": 0.2502973675727844, + "learning_rate": 1.249095584501258e-05, + "loss": 0.0055, + "step": 23894 + }, + { + "epoch": 0.47792, + "grad_norm": 0.25716155767440796, + "learning_rate": 1.2489603569085218e-05, + "loss": 0.1538, + "step": 23896 + }, + { + "epoch": 0.47796, + "grad_norm": 1.5211005210876465, + "learning_rate": 1.2488251244621754e-05, + "loss": 0.0537, + "step": 23898 + }, + { + "epoch": 0.478, + "grad_norm": 0.15379619598388672, + "learning_rate": 1.2486898871648552e-05, + "loss": 0.0196, + "step": 23900 + }, + { + "epoch": 0.47804, + "grad_norm": 0.09689789265394211, + "learning_rate": 1.2485546450191972e-05, + "loss": 0.1517, + "step": 23902 + }, + { + "epoch": 0.47808, + "grad_norm": 0.17600277066230774, + "learning_rate": 1.2484193980278385e-05, + "loss": 0.0025, + "step": 23904 + }, + { + "epoch": 0.47812, + "grad_norm": 0.03776726499199867, + "learning_rate": 1.2482841461934158e-05, + "loss": 0.0384, + "step": 23906 + }, + { + "epoch": 0.47816, + "grad_norm": 0.3915739357471466, + "learning_rate": 1.248148889518566e-05, + "loss": 0.0226, + "step": 23908 + }, + { + "epoch": 0.4782, + "grad_norm": 0.047994811087846756, + "learning_rate": 1.2480136280059256e-05, + "loss": 0.0018, + "step": 23910 + }, + { + "epoch": 0.47824, + "grad_norm": 0.09059856832027435, + "learning_rate": 1.247878361658132e-05, + "loss": 0.0049, + "step": 23912 + }, + { + "epoch": 0.47828, + "grad_norm": 0.10245191305875778, + "learning_rate": 1.2477430904778223e-05, + "loss": 0.0736, + "step": 23914 + }, + { + "epoch": 0.47832, + "grad_norm": 1.9738801717758179, + "learning_rate": 1.2476078144676333e-05, + "loss": 0.0387, + "step": 23916 + }, + { + "epoch": 0.47836, + "grad_norm": 1.172239065170288, + "learning_rate": 1.2474725336302025e-05, + "loss": 0.0199, + "step": 23918 + }, + { + "epoch": 0.4784, + "grad_norm": 2.4396989345550537, + "learning_rate": 1.2473372479681671e-05, + "loss": 0.0354, + "step": 23920 + }, + { + "epoch": 0.47844, + "grad_norm": 0.3730975389480591, + "learning_rate": 1.2472019574841648e-05, + "loss": 0.5742, + "step": 23922 + }, + { + "epoch": 0.47848, + "grad_norm": 0.08682174235582352, + "learning_rate": 1.2470666621808331e-05, + "loss": 0.0022, + "step": 23924 + }, + { + "epoch": 0.47852, + "grad_norm": 0.49109742045402527, + "learning_rate": 1.2469313620608096e-05, + "loss": 0.0144, + "step": 23926 + }, + { + "epoch": 0.47856, + "grad_norm": 0.23772776126861572, + "learning_rate": 1.2467960571267321e-05, + "loss": 0.0045, + "step": 23928 + }, + { + "epoch": 0.4786, + "grad_norm": 3.8223989009857178, + "learning_rate": 1.2466607473812386e-05, + "loss": 0.8042, + "step": 23930 + }, + { + "epoch": 0.47864, + "grad_norm": 10.258096694946289, + "learning_rate": 1.2465254328269665e-05, + "loss": 0.9991, + "step": 23932 + }, + { + "epoch": 0.47868, + "grad_norm": 0.3578335642814636, + "learning_rate": 1.2463901134665545e-05, + "loss": 0.0773, + "step": 23934 + }, + { + "epoch": 0.47872, + "grad_norm": 0.9871477484703064, + "learning_rate": 1.2462547893026403e-05, + "loss": 0.0407, + "step": 23936 + }, + { + "epoch": 0.47876, + "grad_norm": 0.08479070663452148, + "learning_rate": 1.246119460337862e-05, + "loss": 0.0095, + "step": 23938 + }, + { + "epoch": 0.4788, + "grad_norm": 0.054877132177352905, + "learning_rate": 1.2459841265748582e-05, + "loss": 0.0128, + "step": 23940 + }, + { + "epoch": 0.47884, + "grad_norm": 1.4807058572769165, + "learning_rate": 1.2458487880162671e-05, + "loss": 0.0527, + "step": 23942 + }, + { + "epoch": 0.47888, + "grad_norm": 3.141028881072998, + "learning_rate": 1.2457134446647274e-05, + "loss": 0.0691, + "step": 23944 + }, + { + "epoch": 0.47892, + "grad_norm": 0.23962147533893585, + "learning_rate": 1.2455780965228776e-05, + "loss": 0.0123, + "step": 23946 + }, + { + "epoch": 0.47896, + "grad_norm": 3.960911512374878, + "learning_rate": 1.2454427435933564e-05, + "loss": 0.0981, + "step": 23948 + }, + { + "epoch": 0.479, + "grad_norm": 0.35881999135017395, + "learning_rate": 1.2453073858788027e-05, + "loss": 0.079, + "step": 23950 + }, + { + "epoch": 0.47904, + "grad_norm": 1.5158425569534302, + "learning_rate": 1.2451720233818552e-05, + "loss": 0.0418, + "step": 23952 + }, + { + "epoch": 0.47908, + "grad_norm": 0.028333058580756187, + "learning_rate": 1.2450366561051526e-05, + "loss": 0.2197, + "step": 23954 + }, + { + "epoch": 0.47912, + "grad_norm": 0.6689972281455994, + "learning_rate": 1.2449012840513347e-05, + "loss": 0.0148, + "step": 23956 + }, + { + "epoch": 0.47916, + "grad_norm": 0.2365189790725708, + "learning_rate": 1.2447659072230396e-05, + "loss": 0.0172, + "step": 23958 + }, + { + "epoch": 0.4792, + "grad_norm": 0.16032665967941284, + "learning_rate": 1.2446305256229074e-05, + "loss": 0.012, + "step": 23960 + }, + { + "epoch": 0.47924, + "grad_norm": 0.2286798655986786, + "learning_rate": 1.244495139253577e-05, + "loss": 0.0163, + "step": 23962 + }, + { + "epoch": 0.47928, + "grad_norm": 4.658543109893799, + "learning_rate": 1.2443597481176884e-05, + "loss": 0.1479, + "step": 23964 + }, + { + "epoch": 0.47932, + "grad_norm": 4.156376838684082, + "learning_rate": 1.2442243522178804e-05, + "loss": 0.1013, + "step": 23966 + }, + { + "epoch": 0.47936, + "grad_norm": 0.47341296076774597, + "learning_rate": 1.2440889515567931e-05, + "loss": 0.0116, + "step": 23968 + }, + { + "epoch": 0.4794, + "grad_norm": 0.4731936454772949, + "learning_rate": 1.2439535461370658e-05, + "loss": 0.1035, + "step": 23970 + }, + { + "epoch": 0.47944, + "grad_norm": 3.851405620574951, + "learning_rate": 1.2438181359613388e-05, + "loss": 0.3455, + "step": 23972 + }, + { + "epoch": 0.47948, + "grad_norm": 0.4036383628845215, + "learning_rate": 1.2436827210322516e-05, + "loss": 0.0098, + "step": 23974 + }, + { + "epoch": 0.47952, + "grad_norm": 1.1785892248153687, + "learning_rate": 1.2435473013524439e-05, + "loss": 0.0452, + "step": 23976 + }, + { + "epoch": 0.47956, + "grad_norm": 6.150143146514893, + "learning_rate": 1.2434118769245564e-05, + "loss": 0.2534, + "step": 23978 + }, + { + "epoch": 0.4796, + "grad_norm": 0.4725285470485687, + "learning_rate": 1.2432764477512294e-05, + "loss": 0.0386, + "step": 23980 + }, + { + "epoch": 0.47964, + "grad_norm": 0.2755596339702606, + "learning_rate": 1.2431410138351026e-05, + "loss": 0.3049, + "step": 23982 + }, + { + "epoch": 0.47968, + "grad_norm": 0.09865692257881165, + "learning_rate": 1.2430055751788163e-05, + "loss": 0.004, + "step": 23984 + }, + { + "epoch": 0.47972, + "grad_norm": 2.1980340480804443, + "learning_rate": 1.2428701317850116e-05, + "loss": 0.0549, + "step": 23986 + }, + { + "epoch": 0.47976, + "grad_norm": 1.209535837173462, + "learning_rate": 1.2427346836563286e-05, + "loss": 0.0279, + "step": 23988 + }, + { + "epoch": 0.4798, + "grad_norm": 7.314139366149902, + "learning_rate": 1.2425992307954075e-05, + "loss": 0.3084, + "step": 23990 + }, + { + "epoch": 0.47984, + "grad_norm": 5.297550201416016, + "learning_rate": 1.2424637732048898e-05, + "loss": 0.1671, + "step": 23992 + }, + { + "epoch": 0.47988, + "grad_norm": 1.2782483100891113, + "learning_rate": 1.242328310887416e-05, + "loss": 0.0345, + "step": 23994 + }, + { + "epoch": 0.47992, + "grad_norm": 0.2041507512331009, + "learning_rate": 1.242192843845627e-05, + "loss": 0.0057, + "step": 23996 + }, + { + "epoch": 0.47996, + "grad_norm": 0.16653397679328918, + "learning_rate": 1.242057372082164e-05, + "loss": 0.0043, + "step": 23998 + }, + { + "epoch": 0.48, + "grad_norm": 0.08936497569084167, + "learning_rate": 1.2419218955996677e-05, + "loss": 0.0485, + "step": 24000 + }, + { + "epoch": 0.48004, + "grad_norm": 0.451909601688385, + "learning_rate": 1.2417864144007798e-05, + "loss": 0.0123, + "step": 24002 + }, + { + "epoch": 0.48008, + "grad_norm": 0.3473186194896698, + "learning_rate": 1.241650928488141e-05, + "loss": 0.0169, + "step": 24004 + }, + { + "epoch": 0.48012, + "grad_norm": 0.9574404954910278, + "learning_rate": 1.241515437864393e-05, + "loss": 0.0222, + "step": 24006 + }, + { + "epoch": 0.48016, + "grad_norm": 0.16866886615753174, + "learning_rate": 1.2413799425321774e-05, + "loss": 0.0144, + "step": 24008 + }, + { + "epoch": 0.4802, + "grad_norm": 0.1050657257437706, + "learning_rate": 1.241244442494135e-05, + "loss": 0.0041, + "step": 24010 + }, + { + "epoch": 0.48024, + "grad_norm": 0.27624213695526123, + "learning_rate": 1.2411089377529083e-05, + "loss": 0.0081, + "step": 24012 + }, + { + "epoch": 0.48028, + "grad_norm": 0.14220629632472992, + "learning_rate": 1.240973428311139e-05, + "loss": 0.0292, + "step": 24014 + }, + { + "epoch": 0.48032, + "grad_norm": 1.2060632705688477, + "learning_rate": 1.2408379141714684e-05, + "loss": 0.0508, + "step": 24016 + }, + { + "epoch": 0.48036, + "grad_norm": 2.6006650924682617, + "learning_rate": 1.2407023953365387e-05, + "loss": 0.4755, + "step": 24018 + }, + { + "epoch": 0.4804, + "grad_norm": 0.037414513528347015, + "learning_rate": 1.2405668718089918e-05, + "loss": 0.0197, + "step": 24020 + }, + { + "epoch": 0.48044, + "grad_norm": 0.3920677900314331, + "learning_rate": 1.2404313435914701e-05, + "loss": 0.1239, + "step": 24022 + }, + { + "epoch": 0.48048, + "grad_norm": 0.691098153591156, + "learning_rate": 1.2402958106866154e-05, + "loss": 0.0169, + "step": 24024 + }, + { + "epoch": 0.48052, + "grad_norm": 3.630009651184082, + "learning_rate": 1.2401602730970701e-05, + "loss": 0.2862, + "step": 24026 + }, + { + "epoch": 0.48056, + "grad_norm": 0.38661473989486694, + "learning_rate": 1.2400247308254767e-05, + "loss": 0.02, + "step": 24028 + }, + { + "epoch": 0.4806, + "grad_norm": 2.599837064743042, + "learning_rate": 1.2398891838744777e-05, + "loss": 0.0459, + "step": 24030 + }, + { + "epoch": 0.48064, + "grad_norm": 0.033815838396549225, + "learning_rate": 1.2397536322467154e-05, + "loss": 0.0967, + "step": 24032 + }, + { + "epoch": 0.48068, + "grad_norm": 7.9446797370910645, + "learning_rate": 1.2396180759448326e-05, + "loss": 0.1901, + "step": 24034 + }, + { + "epoch": 0.48072, + "grad_norm": 0.3865413963794708, + "learning_rate": 1.2394825149714722e-05, + "loss": 0.0083, + "step": 24036 + }, + { + "epoch": 0.48076, + "grad_norm": 12.738469123840332, + "learning_rate": 1.2393469493292769e-05, + "loss": 0.5383, + "step": 24038 + }, + { + "epoch": 0.4808, + "grad_norm": 7.770951747894287, + "learning_rate": 1.2392113790208895e-05, + "loss": 0.291, + "step": 24040 + }, + { + "epoch": 0.48084, + "grad_norm": 0.031019236892461777, + "learning_rate": 1.239075804048953e-05, + "loss": 0.0028, + "step": 24042 + }, + { + "epoch": 0.48088, + "grad_norm": 0.15995000302791595, + "learning_rate": 1.2389402244161107e-05, + "loss": 0.0079, + "step": 24044 + }, + { + "epoch": 0.48092, + "grad_norm": 0.054766375571489334, + "learning_rate": 1.2388046401250058e-05, + "loss": 0.0045, + "step": 24046 + }, + { + "epoch": 0.48096, + "grad_norm": 0.04799220338463783, + "learning_rate": 1.2386690511782815e-05, + "loss": 0.0048, + "step": 24048 + }, + { + "epoch": 0.481, + "grad_norm": 2.0233724117279053, + "learning_rate": 1.238533457578581e-05, + "loss": 0.0562, + "step": 24050 + }, + { + "epoch": 0.48104, + "grad_norm": 2.2650461196899414, + "learning_rate": 1.2383978593285482e-05, + "loss": 0.0772, + "step": 24052 + }, + { + "epoch": 0.48108, + "grad_norm": 1.4731786251068115, + "learning_rate": 1.2382622564308261e-05, + "loss": 0.0418, + "step": 24054 + }, + { + "epoch": 0.48112, + "grad_norm": 1.2829450368881226, + "learning_rate": 1.238126648888059e-05, + "loss": 0.0285, + "step": 24056 + }, + { + "epoch": 0.48116, + "grad_norm": 1.8755429983139038, + "learning_rate": 1.2379910367028897e-05, + "loss": 0.0344, + "step": 24058 + }, + { + "epoch": 0.4812, + "grad_norm": 0.141145721077919, + "learning_rate": 1.2378554198779632e-05, + "loss": 0.1289, + "step": 24060 + }, + { + "epoch": 0.48124, + "grad_norm": 0.5357638597488403, + "learning_rate": 1.2377197984159225e-05, + "loss": 0.0157, + "step": 24062 + }, + { + "epoch": 0.48128, + "grad_norm": 0.14999203383922577, + "learning_rate": 1.2375841723194121e-05, + "loss": 0.024, + "step": 24064 + }, + { + "epoch": 0.48132, + "grad_norm": 0.8514152765274048, + "learning_rate": 1.237448541591076e-05, + "loss": 0.0431, + "step": 24066 + }, + { + "epoch": 0.48136, + "grad_norm": 0.6749513745307922, + "learning_rate": 1.237312906233558e-05, + "loss": 0.6016, + "step": 24068 + }, + { + "epoch": 0.4814, + "grad_norm": 0.3429635763168335, + "learning_rate": 1.2371772662495031e-05, + "loss": 0.0115, + "step": 24070 + }, + { + "epoch": 0.48144, + "grad_norm": 0.37725478410720825, + "learning_rate": 1.2370416216415552e-05, + "loss": 0.0107, + "step": 24072 + }, + { + "epoch": 0.48148, + "grad_norm": 0.2209049016237259, + "learning_rate": 1.2369059724123589e-05, + "loss": 0.0063, + "step": 24074 + }, + { + "epoch": 0.48152, + "grad_norm": 2.6241860389709473, + "learning_rate": 1.2367703185645585e-05, + "loss": 0.487, + "step": 24076 + }, + { + "epoch": 0.48156, + "grad_norm": 6.648057460784912, + "learning_rate": 1.236634660100799e-05, + "loss": 0.1851, + "step": 24078 + }, + { + "epoch": 0.4816, + "grad_norm": 0.10151161998510361, + "learning_rate": 1.236498997023725e-05, + "loss": 0.0315, + "step": 24080 + }, + { + "epoch": 0.48164, + "grad_norm": 1.1242314577102661, + "learning_rate": 1.2363633293359809e-05, + "loss": 0.0233, + "step": 24082 + }, + { + "epoch": 0.48168, + "grad_norm": 0.9760063886642456, + "learning_rate": 1.2362276570402124e-05, + "loss": 0.0457, + "step": 24084 + }, + { + "epoch": 0.48172, + "grad_norm": 3.0152156352996826, + "learning_rate": 1.236091980139064e-05, + "loss": 0.0711, + "step": 24086 + }, + { + "epoch": 0.48176, + "grad_norm": 1.4853832721710205, + "learning_rate": 1.2359562986351812e-05, + "loss": 0.034, + "step": 24088 + }, + { + "epoch": 0.4818, + "grad_norm": 0.29999038577079773, + "learning_rate": 1.2358206125312085e-05, + "loss": 0.1824, + "step": 24090 + }, + { + "epoch": 0.48184, + "grad_norm": 0.15039001405239105, + "learning_rate": 1.2356849218297915e-05, + "loss": 0.0067, + "step": 24092 + }, + { + "epoch": 0.48188, + "grad_norm": 0.39101067185401917, + "learning_rate": 1.235549226533576e-05, + "loss": 0.0966, + "step": 24094 + }, + { + "epoch": 0.48192, + "grad_norm": 1.0610731840133667, + "learning_rate": 1.2354135266452067e-05, + "loss": 0.043, + "step": 24096 + }, + { + "epoch": 0.48196, + "grad_norm": 5.415040493011475, + "learning_rate": 1.2352778221673294e-05, + "loss": 0.1397, + "step": 24098 + }, + { + "epoch": 0.482, + "grad_norm": 0.11728936433792114, + "learning_rate": 1.23514211310259e-05, + "loss": 0.1922, + "step": 24100 + }, + { + "epoch": 0.48204, + "grad_norm": 0.538981020450592, + "learning_rate": 1.2350063994536341e-05, + "loss": 0.0122, + "step": 24102 + }, + { + "epoch": 0.48208, + "grad_norm": 0.16263000667095184, + "learning_rate": 1.2348706812231075e-05, + "loss": 0.1786, + "step": 24104 + }, + { + "epoch": 0.48212, + "grad_norm": 0.42220398783683777, + "learning_rate": 1.234734958413656e-05, + "loss": 0.0287, + "step": 24106 + }, + { + "epoch": 0.48216, + "grad_norm": 0.9096614718437195, + "learning_rate": 1.2345992310279258e-05, + "loss": 0.0796, + "step": 24108 + }, + { + "epoch": 0.4822, + "grad_norm": 2.342851161956787, + "learning_rate": 1.2344634990685624e-05, + "loss": 0.0661, + "step": 24110 + }, + { + "epoch": 0.48224, + "grad_norm": 5.424790382385254, + "learning_rate": 1.234327762538213e-05, + "loss": 0.1698, + "step": 24112 + }, + { + "epoch": 0.48228, + "grad_norm": 0.11737619340419769, + "learning_rate": 1.2341920214395226e-05, + "loss": 0.0358, + "step": 24114 + }, + { + "epoch": 0.48232, + "grad_norm": 0.20739144086837769, + "learning_rate": 1.2340562757751385e-05, + "loss": 0.0075, + "step": 24116 + }, + { + "epoch": 0.48236, + "grad_norm": 0.8579668998718262, + "learning_rate": 1.2339205255477066e-05, + "loss": 0.0884, + "step": 24118 + }, + { + "epoch": 0.4824, + "grad_norm": 1.2178311347961426, + "learning_rate": 1.2337847707598738e-05, + "loss": 0.0224, + "step": 24120 + }, + { + "epoch": 0.48244, + "grad_norm": 1.5658973455429077, + "learning_rate": 1.2336490114142867e-05, + "loss": 0.0311, + "step": 24122 + }, + { + "epoch": 0.48248, + "grad_norm": 0.40778791904449463, + "learning_rate": 1.2335132475135914e-05, + "loss": 0.0205, + "step": 24124 + }, + { + "epoch": 0.48252, + "grad_norm": 0.1155146062374115, + "learning_rate": 1.2333774790604354e-05, + "loss": 0.2523, + "step": 24126 + }, + { + "epoch": 0.48256, + "grad_norm": 0.6472220420837402, + "learning_rate": 1.2332417060574654e-05, + "loss": 0.025, + "step": 24128 + }, + { + "epoch": 0.4826, + "grad_norm": 0.14676879346370697, + "learning_rate": 1.233105928507328e-05, + "loss": 0.0069, + "step": 24130 + }, + { + "epoch": 0.48264, + "grad_norm": 3.119403123855591, + "learning_rate": 1.2329701464126704e-05, + "loss": 0.0569, + "step": 24132 + }, + { + "epoch": 0.48268, + "grad_norm": 0.8259975910186768, + "learning_rate": 1.2328343597761401e-05, + "loss": 0.0147, + "step": 24134 + }, + { + "epoch": 0.48272, + "grad_norm": 0.9646003246307373, + "learning_rate": 1.232698568600384e-05, + "loss": 0.0429, + "step": 24136 + }, + { + "epoch": 0.48276, + "grad_norm": 0.0397605262696743, + "learning_rate": 1.2325627728880494e-05, + "loss": 0.0038, + "step": 24138 + }, + { + "epoch": 0.4828, + "grad_norm": 0.10623092949390411, + "learning_rate": 1.2324269726417841e-05, + "loss": 0.0189, + "step": 24140 + }, + { + "epoch": 0.48284, + "grad_norm": 0.8401734232902527, + "learning_rate": 1.232291167864235e-05, + "loss": 0.0184, + "step": 24142 + }, + { + "epoch": 0.48288, + "grad_norm": 0.3687879145145416, + "learning_rate": 1.2321553585580504e-05, + "loss": 0.2113, + "step": 24144 + }, + { + "epoch": 0.48292, + "grad_norm": 5.128537178039551, + "learning_rate": 1.2320195447258774e-05, + "loss": 0.1188, + "step": 24146 + }, + { + "epoch": 0.48296, + "grad_norm": 4.525920391082764, + "learning_rate": 1.2318837263703636e-05, + "loss": 0.0957, + "step": 24148 + }, + { + "epoch": 0.483, + "grad_norm": 0.5826332569122314, + "learning_rate": 1.2317479034941572e-05, + "loss": 0.0114, + "step": 24150 + }, + { + "epoch": 0.48304, + "grad_norm": 6.17272424697876, + "learning_rate": 1.2316120760999066e-05, + "loss": 0.4863, + "step": 24152 + }, + { + "epoch": 0.48308, + "grad_norm": 1.9829837083816528, + "learning_rate": 1.231476244190259e-05, + "loss": 0.0485, + "step": 24154 + }, + { + "epoch": 0.48312, + "grad_norm": 0.18168485164642334, + "learning_rate": 1.2313404077678631e-05, + "loss": 0.0066, + "step": 24156 + }, + { + "epoch": 0.48316, + "grad_norm": 0.18660737574100494, + "learning_rate": 1.2312045668353664e-05, + "loss": 0.0416, + "step": 24158 + }, + { + "epoch": 0.4832, + "grad_norm": 0.9801319241523743, + "learning_rate": 1.2310687213954182e-05, + "loss": 0.0322, + "step": 24160 + }, + { + "epoch": 0.48324, + "grad_norm": 1.8259955644607544, + "learning_rate": 1.230932871450666e-05, + "loss": 0.0224, + "step": 24162 + }, + { + "epoch": 0.48328, + "grad_norm": 0.43891921639442444, + "learning_rate": 1.2307970170037584e-05, + "loss": 0.0096, + "step": 24164 + }, + { + "epoch": 0.48332, + "grad_norm": 0.515741229057312, + "learning_rate": 1.2306611580573441e-05, + "loss": 0.0128, + "step": 24166 + }, + { + "epoch": 0.48336, + "grad_norm": 0.2282395362854004, + "learning_rate": 1.2305252946140723e-05, + "loss": 0.0048, + "step": 24168 + }, + { + "epoch": 0.4834, + "grad_norm": 7.5883989334106445, + "learning_rate": 1.2303894266765908e-05, + "loss": 0.2902, + "step": 24170 + }, + { + "epoch": 0.48344, + "grad_norm": 2.1857845783233643, + "learning_rate": 1.2302535542475487e-05, + "loss": 0.045, + "step": 24172 + }, + { + "epoch": 0.48348, + "grad_norm": 0.8016509413719177, + "learning_rate": 1.2301176773295954e-05, + "loss": 0.025, + "step": 24174 + }, + { + "epoch": 0.48352, + "grad_norm": 3.744248151779175, + "learning_rate": 1.2299817959253795e-05, + "loss": 0.13, + "step": 24176 + }, + { + "epoch": 0.48356, + "grad_norm": 2.6565704345703125, + "learning_rate": 1.2298459100375497e-05, + "loss": 0.0456, + "step": 24178 + }, + { + "epoch": 0.4836, + "grad_norm": 0.08808040618896484, + "learning_rate": 1.2297100196687557e-05, + "loss": 0.1171, + "step": 24180 + }, + { + "epoch": 0.48364, + "grad_norm": 10.509963035583496, + "learning_rate": 1.2295741248216466e-05, + "loss": 0.3219, + "step": 24182 + }, + { + "epoch": 0.48368, + "grad_norm": 0.05954721197485924, + "learning_rate": 1.229438225498872e-05, + "loss": 0.0074, + "step": 24184 + }, + { + "epoch": 0.48372, + "grad_norm": 0.2369040846824646, + "learning_rate": 1.2293023217030809e-05, + "loss": 0.018, + "step": 24186 + }, + { + "epoch": 0.48376, + "grad_norm": 0.07229789346456528, + "learning_rate": 1.2291664134369229e-05, + "loss": 0.0016, + "step": 24188 + }, + { + "epoch": 0.4838, + "grad_norm": 0.08884967863559723, + "learning_rate": 1.2290305007030479e-05, + "loss": 0.0027, + "step": 24190 + }, + { + "epoch": 0.48384, + "grad_norm": 0.3661576211452484, + "learning_rate": 1.2288945835041051e-05, + "loss": 0.0074, + "step": 24192 + }, + { + "epoch": 0.48388, + "grad_norm": 0.7331153154373169, + "learning_rate": 1.2287586618427448e-05, + "loss": 0.2622, + "step": 24194 + }, + { + "epoch": 0.48392, + "grad_norm": 1.1205540895462036, + "learning_rate": 1.2286227357216162e-05, + "loss": 0.0239, + "step": 24196 + }, + { + "epoch": 0.48396, + "grad_norm": 0.04825017973780632, + "learning_rate": 1.22848680514337e-05, + "loss": 0.1066, + "step": 24198 + }, + { + "epoch": 0.484, + "grad_norm": 0.3297734260559082, + "learning_rate": 1.2283508701106559e-05, + "loss": 0.0277, + "step": 24200 + }, + { + "epoch": 0.48404, + "grad_norm": 4.710222244262695, + "learning_rate": 1.228214930626124e-05, + "loss": 0.1681, + "step": 24202 + }, + { + "epoch": 0.48408, + "grad_norm": 0.913475513458252, + "learning_rate": 1.2280789866924243e-05, + "loss": 0.0213, + "step": 24204 + }, + { + "epoch": 0.48412, + "grad_norm": 1.0847779512405396, + "learning_rate": 1.2279430383122076e-05, + "loss": 0.0542, + "step": 24206 + }, + { + "epoch": 0.48416, + "grad_norm": 0.20021666586399078, + "learning_rate": 1.2278070854881241e-05, + "loss": 0.0074, + "step": 24208 + }, + { + "epoch": 0.4842, + "grad_norm": 9.426711082458496, + "learning_rate": 1.2276711282228241e-05, + "loss": 0.4, + "step": 24210 + }, + { + "epoch": 0.48424, + "grad_norm": 0.30246567726135254, + "learning_rate": 1.2275351665189583e-05, + "loss": 0.0621, + "step": 24212 + }, + { + "epoch": 0.48428, + "grad_norm": 0.3422253131866455, + "learning_rate": 1.227399200379177e-05, + "loss": 0.0076, + "step": 24214 + }, + { + "epoch": 0.48432, + "grad_norm": 0.6009932160377502, + "learning_rate": 1.2272632298061313e-05, + "loss": 0.0161, + "step": 24216 + }, + { + "epoch": 0.48436, + "grad_norm": 5.583820819854736, + "learning_rate": 1.2271272548024722e-05, + "loss": 0.1849, + "step": 24218 + }, + { + "epoch": 0.4844, + "grad_norm": 0.3121090233325958, + "learning_rate": 1.2269912753708502e-05, + "loss": 0.1682, + "step": 24220 + }, + { + "epoch": 0.48444, + "grad_norm": 0.015154270455241203, + "learning_rate": 1.2268552915139162e-05, + "loss": 0.001, + "step": 24222 + }, + { + "epoch": 0.48448, + "grad_norm": 0.022837447002530098, + "learning_rate": 1.2267193032343219e-05, + "loss": 0.0076, + "step": 24224 + }, + { + "epoch": 0.48452, + "grad_norm": 0.18976718187332153, + "learning_rate": 1.2265833105347176e-05, + "loss": 0.007, + "step": 24226 + }, + { + "epoch": 0.48456, + "grad_norm": 0.5247077941894531, + "learning_rate": 1.2264473134177554e-05, + "loss": 0.0136, + "step": 24228 + }, + { + "epoch": 0.4846, + "grad_norm": 7.68178129196167, + "learning_rate": 1.226311311886086e-05, + "loss": 0.1769, + "step": 24230 + }, + { + "epoch": 0.48464, + "grad_norm": 1.031031608581543, + "learning_rate": 1.2261753059423611e-05, + "loss": 0.021, + "step": 24232 + }, + { + "epoch": 0.48468, + "grad_norm": 3.869450569152832, + "learning_rate": 1.2260392955892324e-05, + "loss": 0.2421, + "step": 24234 + }, + { + "epoch": 0.48472, + "grad_norm": 4.572662830352783, + "learning_rate": 1.2259032808293509e-05, + "loss": 0.1452, + "step": 24236 + }, + { + "epoch": 0.48476, + "grad_norm": 0.22000019252300262, + "learning_rate": 1.2257672616653689e-05, + "loss": 0.0063, + "step": 24238 + }, + { + "epoch": 0.4848, + "grad_norm": 0.11602582037448883, + "learning_rate": 1.2256312380999376e-05, + "loss": 0.0046, + "step": 24240 + }, + { + "epoch": 0.48484, + "grad_norm": 0.404530793428421, + "learning_rate": 1.2254952101357097e-05, + "loss": 0.0087, + "step": 24242 + }, + { + "epoch": 0.48488, + "grad_norm": 0.15674683451652527, + "learning_rate": 1.2253591777753361e-05, + "loss": 0.0052, + "step": 24244 + }, + { + "epoch": 0.48492, + "grad_norm": 3.207733154296875, + "learning_rate": 1.2252231410214694e-05, + "loss": 0.0718, + "step": 24246 + }, + { + "epoch": 0.48496, + "grad_norm": 0.14624859392642975, + "learning_rate": 1.2250870998767617e-05, + "loss": 0.1287, + "step": 24248 + }, + { + "epoch": 0.485, + "grad_norm": 0.3365086317062378, + "learning_rate": 1.2249510543438652e-05, + "loss": 0.0442, + "step": 24250 + }, + { + "epoch": 0.48504, + "grad_norm": 11.143074035644531, + "learning_rate": 1.224815004425432e-05, + "loss": 0.2741, + "step": 24252 + }, + { + "epoch": 0.48508, + "grad_norm": 0.713740885257721, + "learning_rate": 1.2246789501241143e-05, + "loss": 0.0168, + "step": 24254 + }, + { + "epoch": 0.48512, + "grad_norm": 0.24880319833755493, + "learning_rate": 1.224542891442565e-05, + "loss": 0.0054, + "step": 24256 + }, + { + "epoch": 0.48516, + "grad_norm": 7.97175931930542, + "learning_rate": 1.2244068283834365e-05, + "loss": 0.2361, + "step": 24258 + }, + { + "epoch": 0.4852, + "grad_norm": 12.452173233032227, + "learning_rate": 1.2242707609493814e-05, + "loss": 0.4153, + "step": 24260 + }, + { + "epoch": 0.48524, + "grad_norm": 0.36035674810409546, + "learning_rate": 1.2241346891430523e-05, + "loss": 0.0155, + "step": 24262 + }, + { + "epoch": 0.48528, + "grad_norm": 0.20292232930660248, + "learning_rate": 1.2239986129671018e-05, + "loss": 0.0061, + "step": 24264 + }, + { + "epoch": 0.48532, + "grad_norm": 0.23454992473125458, + "learning_rate": 1.2238625324241832e-05, + "loss": 0.2881, + "step": 24266 + }, + { + "epoch": 0.48536, + "grad_norm": 0.06513172388076782, + "learning_rate": 1.2237264475169495e-05, + "loss": 0.0181, + "step": 24268 + }, + { + "epoch": 0.4854, + "grad_norm": 0.9181752800941467, + "learning_rate": 1.223590358248053e-05, + "loss": 0.0906, + "step": 24270 + }, + { + "epoch": 0.48544, + "grad_norm": 6.486910343170166, + "learning_rate": 1.2234542646201477e-05, + "loss": 0.1864, + "step": 24272 + }, + { + "epoch": 0.48548, + "grad_norm": 2.857639789581299, + "learning_rate": 1.2233181666358864e-05, + "loss": 0.1239, + "step": 24274 + }, + { + "epoch": 0.48552, + "grad_norm": 0.43988749384880066, + "learning_rate": 1.2231820642979226e-05, + "loss": 0.0574, + "step": 24276 + }, + { + "epoch": 0.48556, + "grad_norm": 0.49034321308135986, + "learning_rate": 1.2230459576089095e-05, + "loss": 0.0542, + "step": 24278 + }, + { + "epoch": 0.4856, + "grad_norm": 2.2066900730133057, + "learning_rate": 1.2229098465715005e-05, + "loss": 0.0486, + "step": 24280 + }, + { + "epoch": 0.48564, + "grad_norm": 0.043127287179231644, + "learning_rate": 1.2227737311883494e-05, + "loss": 0.0021, + "step": 24282 + }, + { + "epoch": 0.48568, + "grad_norm": 0.015685031190514565, + "learning_rate": 1.2226376114621096e-05, + "loss": 0.0794, + "step": 24284 + }, + { + "epoch": 0.48572, + "grad_norm": 6.411545753479004, + "learning_rate": 1.222501487395435e-05, + "loss": 0.1951, + "step": 24286 + }, + { + "epoch": 0.48576, + "grad_norm": 0.2397252470254898, + "learning_rate": 1.2223653589909792e-05, + "loss": 0.0228, + "step": 24288 + }, + { + "epoch": 0.4858, + "grad_norm": 0.2729826867580414, + "learning_rate": 1.2222292262513967e-05, + "loss": 0.0284, + "step": 24290 + }, + { + "epoch": 0.48584, + "grad_norm": 0.08914900571107864, + "learning_rate": 1.2220930891793406e-05, + "loss": 0.0119, + "step": 24292 + }, + { + "epoch": 0.48588, + "grad_norm": 3.0063610076904297, + "learning_rate": 1.2219569477774654e-05, + "loss": 0.1544, + "step": 24294 + }, + { + "epoch": 0.48592, + "grad_norm": 0.4421555995941162, + "learning_rate": 1.2218208020484255e-05, + "loss": 0.0108, + "step": 24296 + }, + { + "epoch": 0.48596, + "grad_norm": 0.3303496241569519, + "learning_rate": 1.221684651994875e-05, + "loss": 0.0095, + "step": 24298 + }, + { + "epoch": 0.486, + "grad_norm": 0.2120480090379715, + "learning_rate": 1.2215484976194675e-05, + "loss": 0.0144, + "step": 24300 + }, + { + "epoch": 0.48604, + "grad_norm": 2.369288682937622, + "learning_rate": 1.2214123389248582e-05, + "loss": 0.0799, + "step": 24302 + }, + { + "epoch": 0.48608, + "grad_norm": 0.1301039159297943, + "learning_rate": 1.2212761759137014e-05, + "loss": 0.0665, + "step": 24304 + }, + { + "epoch": 0.48612, + "grad_norm": 4.566817283630371, + "learning_rate": 1.2211400085886516e-05, + "loss": 0.2139, + "step": 24306 + }, + { + "epoch": 0.48616, + "grad_norm": 0.23187963664531708, + "learning_rate": 1.2210038369523636e-05, + "loss": 0.0049, + "step": 24308 + }, + { + "epoch": 0.4862, + "grad_norm": 0.6520898938179016, + "learning_rate": 1.220867661007492e-05, + "loss": 0.1592, + "step": 24310 + }, + { + "epoch": 0.48624, + "grad_norm": 0.18267185986042023, + "learning_rate": 1.2207314807566917e-05, + "loss": 0.0041, + "step": 24312 + }, + { + "epoch": 0.48628, + "grad_norm": 0.1041250228881836, + "learning_rate": 1.2205952962026172e-05, + "loss": 0.0203, + "step": 24314 + }, + { + "epoch": 0.48632, + "grad_norm": 3.875145435333252, + "learning_rate": 1.2204591073479245e-05, + "loss": 0.0697, + "step": 24316 + }, + { + "epoch": 0.48636, + "grad_norm": 0.013121387921273708, + "learning_rate": 1.2203229141952672e-05, + "loss": 0.0018, + "step": 24318 + }, + { + "epoch": 0.4864, + "grad_norm": 4.553174018859863, + "learning_rate": 1.2201867167473015e-05, + "loss": 0.0896, + "step": 24320 + }, + { + "epoch": 0.48644, + "grad_norm": 0.44139885902404785, + "learning_rate": 1.2200505150066826e-05, + "loss": 0.0089, + "step": 24322 + }, + { + "epoch": 0.48648, + "grad_norm": 0.3616592288017273, + "learning_rate": 1.2199143089760656e-05, + "loss": 0.0051, + "step": 24324 + }, + { + "epoch": 0.48652, + "grad_norm": 0.21129731833934784, + "learning_rate": 1.2197780986581058e-05, + "loss": 0.0065, + "step": 24326 + }, + { + "epoch": 0.48656, + "grad_norm": 2.5041418075561523, + "learning_rate": 1.2196418840554588e-05, + "loss": 0.0398, + "step": 24328 + }, + { + "epoch": 0.4866, + "grad_norm": 0.781056821346283, + "learning_rate": 1.2195056651707806e-05, + "loss": 0.0136, + "step": 24330 + }, + { + "epoch": 0.48664, + "grad_norm": 0.7563095688819885, + "learning_rate": 1.219369442006726e-05, + "loss": 0.0124, + "step": 24332 + }, + { + "epoch": 0.48668, + "grad_norm": 0.07835564017295837, + "learning_rate": 1.2192332145659514e-05, + "loss": 0.0117, + "step": 24334 + }, + { + "epoch": 0.48672, + "grad_norm": 0.13339215517044067, + "learning_rate": 1.2190969828511123e-05, + "loss": 0.4312, + "step": 24336 + }, + { + "epoch": 0.48676, + "grad_norm": 8.416287422180176, + "learning_rate": 1.2189607468648647e-05, + "loss": 0.2519, + "step": 24338 + }, + { + "epoch": 0.4868, + "grad_norm": 0.9465008974075317, + "learning_rate": 1.2188245066098647e-05, + "loss": 0.0849, + "step": 24340 + }, + { + "epoch": 0.48684, + "grad_norm": 0.4107397794723511, + "learning_rate": 1.2186882620887684e-05, + "loss": 0.2894, + "step": 24342 + }, + { + "epoch": 0.48688, + "grad_norm": 9.155985832214355, + "learning_rate": 1.2185520133042315e-05, + "loss": 0.3101, + "step": 24344 + }, + { + "epoch": 0.48692, + "grad_norm": 0.49887901544570923, + "learning_rate": 1.2184157602589108e-05, + "loss": 0.0109, + "step": 24346 + }, + { + "epoch": 0.48696, + "grad_norm": 6.023498058319092, + "learning_rate": 1.2182795029554625e-05, + "loss": 0.0898, + "step": 24348 + }, + { + "epoch": 0.487, + "grad_norm": 0.20464615523815155, + "learning_rate": 1.2181432413965428e-05, + "loss": 0.0092, + "step": 24350 + }, + { + "epoch": 0.48704, + "grad_norm": 0.42251724004745483, + "learning_rate": 1.2180069755848083e-05, + "loss": 0.0108, + "step": 24352 + }, + { + "epoch": 0.48708, + "grad_norm": 1.1225874423980713, + "learning_rate": 1.2178707055229154e-05, + "loss": 0.1263, + "step": 24354 + }, + { + "epoch": 0.48712, + "grad_norm": 0.9588225483894348, + "learning_rate": 1.2177344312135214e-05, + "loss": 0.0159, + "step": 24356 + }, + { + "epoch": 0.48716, + "grad_norm": 4.517892837524414, + "learning_rate": 1.2175981526592819e-05, + "loss": 0.1096, + "step": 24358 + }, + { + "epoch": 0.4872, + "grad_norm": 0.17417430877685547, + "learning_rate": 1.217461869862855e-05, + "loss": 0.003, + "step": 24360 + }, + { + "epoch": 0.48724, + "grad_norm": 0.09235970675945282, + "learning_rate": 1.2173255828268965e-05, + "loss": 0.0111, + "step": 24362 + }, + { + "epoch": 0.48728, + "grad_norm": 0.2385251224040985, + "learning_rate": 1.2171892915540643e-05, + "loss": 0.0904, + "step": 24364 + }, + { + "epoch": 0.48732, + "grad_norm": 0.11591198295354843, + "learning_rate": 1.217052996047015e-05, + "loss": 0.0136, + "step": 24366 + }, + { + "epoch": 0.48736, + "grad_norm": 2.025345802307129, + "learning_rate": 1.2169166963084056e-05, + "loss": 0.0427, + "step": 24368 + }, + { + "epoch": 0.4874, + "grad_norm": 0.015240730717778206, + "learning_rate": 1.2167803923408935e-05, + "loss": 0.0042, + "step": 24370 + }, + { + "epoch": 0.48744, + "grad_norm": 0.7905471324920654, + "learning_rate": 1.2166440841471361e-05, + "loss": 0.0173, + "step": 24372 + }, + { + "epoch": 0.48748, + "grad_norm": 0.3366212248802185, + "learning_rate": 1.2165077717297908e-05, + "loss": 0.0628, + "step": 24374 + }, + { + "epoch": 0.48752, + "grad_norm": 2.06046199798584, + "learning_rate": 1.2163714550915149e-05, + "loss": 0.2094, + "step": 24376 + }, + { + "epoch": 0.48756, + "grad_norm": 0.23866654932498932, + "learning_rate": 1.2162351342349662e-05, + "loss": 0.0083, + "step": 24378 + }, + { + "epoch": 0.4876, + "grad_norm": 0.8936452865600586, + "learning_rate": 1.2160988091628023e-05, + "loss": 0.1433, + "step": 24380 + }, + { + "epoch": 0.48764, + "grad_norm": 0.9661118388175964, + "learning_rate": 1.2159624798776808e-05, + "loss": 0.0183, + "step": 24382 + }, + { + "epoch": 0.48768, + "grad_norm": 0.027736777439713478, + "learning_rate": 1.2158261463822596e-05, + "loss": 0.0045, + "step": 24384 + }, + { + "epoch": 0.48772, + "grad_norm": 0.1312670260667801, + "learning_rate": 1.2156898086791964e-05, + "loss": 0.004, + "step": 24386 + }, + { + "epoch": 0.48776, + "grad_norm": 0.0284518264234066, + "learning_rate": 1.2155534667711495e-05, + "loss": 0.0797, + "step": 24388 + }, + { + "epoch": 0.4878, + "grad_norm": 2.828599691390991, + "learning_rate": 1.2154171206607765e-05, + "loss": 0.0423, + "step": 24390 + }, + { + "epoch": 0.48784, + "grad_norm": 0.3728087842464447, + "learning_rate": 1.215280770350736e-05, + "loss": 0.0332, + "step": 24392 + }, + { + "epoch": 0.48788, + "grad_norm": 0.2870666980743408, + "learning_rate": 1.2151444158436863e-05, + "loss": 0.0047, + "step": 24394 + }, + { + "epoch": 0.48792, + "grad_norm": 7.065379619598389, + "learning_rate": 1.2150080571422854e-05, + "loss": 0.119, + "step": 24396 + }, + { + "epoch": 0.48796, + "grad_norm": 0.03815269470214844, + "learning_rate": 1.2148716942491914e-05, + "loss": 0.0015, + "step": 24398 + }, + { + "epoch": 0.488, + "grad_norm": 0.7965930104255676, + "learning_rate": 1.2147353271670634e-05, + "loss": 0.0121, + "step": 24400 + }, + { + "epoch": 0.48804, + "grad_norm": 0.1263447403907776, + "learning_rate": 1.2145989558985596e-05, + "loss": 0.0151, + "step": 24402 + }, + { + "epoch": 0.48808, + "grad_norm": 0.047066375613212585, + "learning_rate": 1.2144625804463384e-05, + "loss": 0.0013, + "step": 24404 + }, + { + "epoch": 0.48812, + "grad_norm": 8.62158203125, + "learning_rate": 1.2143262008130592e-05, + "loss": 0.3597, + "step": 24406 + }, + { + "epoch": 0.48816, + "grad_norm": 0.1562364250421524, + "learning_rate": 1.21418981700138e-05, + "loss": 0.0025, + "step": 24408 + }, + { + "epoch": 0.4882, + "grad_norm": 6.818633556365967, + "learning_rate": 1.2140534290139601e-05, + "loss": 0.1136, + "step": 24410 + }, + { + "epoch": 0.48824, + "grad_norm": 1.3495508432388306, + "learning_rate": 1.2139170368534584e-05, + "loss": 0.0247, + "step": 24412 + }, + { + "epoch": 0.48828, + "grad_norm": 0.18188434839248657, + "learning_rate": 1.2137806405225343e-05, + "loss": 0.3391, + "step": 24414 + }, + { + "epoch": 0.48832, + "grad_norm": 0.4216897785663605, + "learning_rate": 1.2136442400238464e-05, + "loss": 0.009, + "step": 24416 + }, + { + "epoch": 0.48836, + "grad_norm": 1.1265438795089722, + "learning_rate": 1.2135078353600538e-05, + "loss": 0.0425, + "step": 24418 + }, + { + "epoch": 0.4884, + "grad_norm": 0.09929484128952026, + "learning_rate": 1.2133714265338162e-05, + "loss": 0.0056, + "step": 24420 + }, + { + "epoch": 0.48844, + "grad_norm": 0.18727919459342957, + "learning_rate": 1.2132350135477928e-05, + "loss": 0.0195, + "step": 24422 + }, + { + "epoch": 0.48848, + "grad_norm": 0.5227280855178833, + "learning_rate": 1.2130985964046429e-05, + "loss": 0.0204, + "step": 24424 + }, + { + "epoch": 0.48852, + "grad_norm": 0.8282238841056824, + "learning_rate": 1.2129621751070261e-05, + "loss": 0.0132, + "step": 24426 + }, + { + "epoch": 0.48856, + "grad_norm": 6.777761936187744, + "learning_rate": 1.2128257496576021e-05, + "loss": 0.1173, + "step": 24428 + }, + { + "epoch": 0.4886, + "grad_norm": 0.7241013646125793, + "learning_rate": 1.2126893200590309e-05, + "loss": 0.0374, + "step": 24430 + }, + { + "epoch": 0.48864, + "grad_norm": 0.4039786458015442, + "learning_rate": 1.2125528863139715e-05, + "loss": 0.006, + "step": 24432 + }, + { + "epoch": 0.48868, + "grad_norm": 0.27261045575141907, + "learning_rate": 1.2124164484250845e-05, + "loss": 0.0065, + "step": 24434 + }, + { + "epoch": 0.48872, + "grad_norm": 0.5981525182723999, + "learning_rate": 1.2122800063950293e-05, + "loss": 0.0199, + "step": 24436 + }, + { + "epoch": 0.48876, + "grad_norm": 5.4145097732543945, + "learning_rate": 1.2121435602264659e-05, + "loss": 0.2981, + "step": 24438 + }, + { + "epoch": 0.4888, + "grad_norm": 0.3925323188304901, + "learning_rate": 1.212007109922055e-05, + "loss": 0.0059, + "step": 24440 + }, + { + "epoch": 0.48884, + "grad_norm": 3.505307674407959, + "learning_rate": 1.2118706554844559e-05, + "loss": 0.0535, + "step": 24442 + }, + { + "epoch": 0.48888, + "grad_norm": 0.18300913274288177, + "learning_rate": 1.2117341969163295e-05, + "loss": 0.0295, + "step": 24444 + }, + { + "epoch": 0.48892, + "grad_norm": 3.135765552520752, + "learning_rate": 1.2115977342203358e-05, + "loss": 0.0489, + "step": 24446 + }, + { + "epoch": 0.48896, + "grad_norm": 1.422950267791748, + "learning_rate": 1.2114612673991354e-05, + "loss": 0.0794, + "step": 24448 + }, + { + "epoch": 0.489, + "grad_norm": 0.21562430262565613, + "learning_rate": 1.211324796455389e-05, + "loss": 0.2386, + "step": 24450 + }, + { + "epoch": 0.48904, + "grad_norm": 0.4756023585796356, + "learning_rate": 1.2111883213917563e-05, + "loss": 0.0057, + "step": 24452 + }, + { + "epoch": 0.48908, + "grad_norm": 1.16567063331604, + "learning_rate": 1.2110518422108992e-05, + "loss": 0.0204, + "step": 24454 + }, + { + "epoch": 0.48912, + "grad_norm": 2.364558696746826, + "learning_rate": 1.2109153589154773e-05, + "loss": 0.1093, + "step": 24456 + }, + { + "epoch": 0.48916, + "grad_norm": 8.675752639770508, + "learning_rate": 1.2107788715081521e-05, + "loss": 0.275, + "step": 24458 + }, + { + "epoch": 0.4892, + "grad_norm": 0.4576083719730377, + "learning_rate": 1.2106423799915841e-05, + "loss": 0.0213, + "step": 24460 + }, + { + "epoch": 0.48924, + "grad_norm": 1.6863102912902832, + "learning_rate": 1.2105058843684347e-05, + "loss": 0.0249, + "step": 24462 + }, + { + "epoch": 0.48928, + "grad_norm": 0.24939841032028198, + "learning_rate": 1.2103693846413647e-05, + "loss": 0.0109, + "step": 24464 + }, + { + "epoch": 0.48932, + "grad_norm": 12.926040649414062, + "learning_rate": 1.210232880813035e-05, + "loss": 0.6214, + "step": 24466 + }, + { + "epoch": 0.48936, + "grad_norm": 0.1446399986743927, + "learning_rate": 1.2100963728861072e-05, + "loss": 0.0047, + "step": 24468 + }, + { + "epoch": 0.4894, + "grad_norm": 0.1398218870162964, + "learning_rate": 1.2099598608632427e-05, + "loss": 0.0601, + "step": 24470 + }, + { + "epoch": 0.48944, + "grad_norm": 0.5002564191818237, + "learning_rate": 1.2098233447471024e-05, + "loss": 0.0205, + "step": 24472 + }, + { + "epoch": 0.48948, + "grad_norm": 2.6560864448547363, + "learning_rate": 1.2096868245403477e-05, + "loss": 0.1187, + "step": 24474 + }, + { + "epoch": 0.48952, + "grad_norm": 0.13807247579097748, + "learning_rate": 1.2095503002456405e-05, + "loss": 0.0285, + "step": 24476 + }, + { + "epoch": 0.48956, + "grad_norm": 0.8119270205497742, + "learning_rate": 1.2094137718656427e-05, + "loss": 0.0109, + "step": 24478 + }, + { + "epoch": 0.4896, + "grad_norm": 0.003991511184722185, + "learning_rate": 1.2092772394030153e-05, + "loss": 0.0035, + "step": 24480 + }, + { + "epoch": 0.48964, + "grad_norm": 1.1798136234283447, + "learning_rate": 1.2091407028604203e-05, + "loss": 0.0317, + "step": 24482 + }, + { + "epoch": 0.48968, + "grad_norm": 0.7156407833099365, + "learning_rate": 1.20900416224052e-05, + "loss": 0.0443, + "step": 24484 + }, + { + "epoch": 0.48972, + "grad_norm": 0.0389837771654129, + "learning_rate": 1.2088676175459756e-05, + "loss": 0.0051, + "step": 24486 + }, + { + "epoch": 0.48976, + "grad_norm": 0.9852121472358704, + "learning_rate": 1.2087310687794498e-05, + "loss": 0.0134, + "step": 24488 + }, + { + "epoch": 0.4898, + "grad_norm": 0.8697068095207214, + "learning_rate": 1.208594515943604e-05, + "loss": 0.0418, + "step": 24490 + }, + { + "epoch": 0.48984, + "grad_norm": 1.2187045812606812, + "learning_rate": 1.2084579590411008e-05, + "loss": 0.0127, + "step": 24492 + }, + { + "epoch": 0.48988, + "grad_norm": 2.4876246452331543, + "learning_rate": 1.2083213980746024e-05, + "loss": 0.0344, + "step": 24494 + }, + { + "epoch": 0.48992, + "grad_norm": 0.05811166390776634, + "learning_rate": 1.2081848330467712e-05, + "loss": 0.008, + "step": 24496 + }, + { + "epoch": 0.48996, + "grad_norm": 8.686592102050781, + "learning_rate": 1.2080482639602693e-05, + "loss": 0.0649, + "step": 24498 + }, + { + "epoch": 0.49, + "grad_norm": 13.110339164733887, + "learning_rate": 1.2079116908177592e-05, + "loss": 0.5489, + "step": 24500 + }, + { + "epoch": 0.49004, + "grad_norm": 9.348589897155762, + "learning_rate": 1.2077751136219043e-05, + "loss": 0.2858, + "step": 24502 + }, + { + "epoch": 0.49008, + "grad_norm": 2.883085250854492, + "learning_rate": 1.2076385323753664e-05, + "loss": 0.0685, + "step": 24504 + }, + { + "epoch": 0.49012, + "grad_norm": 3.3569438457489014, + "learning_rate": 1.207501947080808e-05, + "loss": 0.1169, + "step": 24506 + }, + { + "epoch": 0.49016, + "grad_norm": 7.937385082244873, + "learning_rate": 1.2073653577408925e-05, + "loss": 0.223, + "step": 24508 + }, + { + "epoch": 0.4902, + "grad_norm": 0.48262977600097656, + "learning_rate": 1.2072287643582825e-05, + "loss": 0.0069, + "step": 24510 + }, + { + "epoch": 0.49024, + "grad_norm": 0.019169878214597702, + "learning_rate": 1.2070921669356415e-05, + "loss": 0.0648, + "step": 24512 + }, + { + "epoch": 0.49028, + "grad_norm": 0.2672705352306366, + "learning_rate": 1.2069555654756315e-05, + "loss": 0.0313, + "step": 24514 + }, + { + "epoch": 0.49032, + "grad_norm": 0.5158534049987793, + "learning_rate": 1.2068189599809165e-05, + "loss": 0.0095, + "step": 24516 + }, + { + "epoch": 0.49036, + "grad_norm": 0.44646763801574707, + "learning_rate": 1.2066823504541596e-05, + "loss": 0.0076, + "step": 24518 + }, + { + "epoch": 0.4904, + "grad_norm": 1.6133357286453247, + "learning_rate": 1.2065457368980236e-05, + "loss": 0.033, + "step": 24520 + }, + { + "epoch": 0.49044, + "grad_norm": 0.08128403127193451, + "learning_rate": 1.2064091193151724e-05, + "loss": 0.0113, + "step": 24522 + }, + { + "epoch": 0.49048, + "grad_norm": 0.030795246362686157, + "learning_rate": 1.2062724977082687e-05, + "loss": 0.0009, + "step": 24524 + }, + { + "epoch": 0.49052, + "grad_norm": 1.3742413520812988, + "learning_rate": 1.2061358720799766e-05, + "loss": 0.0198, + "step": 24526 + }, + { + "epoch": 0.49056, + "grad_norm": 4.324734687805176, + "learning_rate": 1.2059992424329598e-05, + "loss": 0.0736, + "step": 24528 + }, + { + "epoch": 0.4906, + "grad_norm": 1.0275579690933228, + "learning_rate": 1.2058626087698814e-05, + "loss": 0.0196, + "step": 24530 + }, + { + "epoch": 0.49064, + "grad_norm": 0.04896090179681778, + "learning_rate": 1.2057259710934055e-05, + "loss": 0.0087, + "step": 24532 + }, + { + "epoch": 0.49068, + "grad_norm": 1.8072025775909424, + "learning_rate": 1.205589329406196e-05, + "loss": 0.027, + "step": 24534 + }, + { + "epoch": 0.49072, + "grad_norm": 0.9527117609977722, + "learning_rate": 1.205452683710917e-05, + "loss": 0.0286, + "step": 24536 + }, + { + "epoch": 0.49076, + "grad_norm": 0.011108504608273506, + "learning_rate": 1.2053160340102316e-05, + "loss": 0.0032, + "step": 24538 + }, + { + "epoch": 0.4908, + "grad_norm": 2.2983505725860596, + "learning_rate": 1.2051793803068046e-05, + "loss": 0.0497, + "step": 24540 + }, + { + "epoch": 0.49084, + "grad_norm": 0.005336278583854437, + "learning_rate": 1.2050427226033001e-05, + "loss": 0.0045, + "step": 24542 + }, + { + "epoch": 0.49088, + "grad_norm": 0.208055779337883, + "learning_rate": 1.2049060609023821e-05, + "loss": 0.5709, + "step": 24544 + }, + { + "epoch": 0.49092, + "grad_norm": 1.9117363691329956, + "learning_rate": 1.2047693952067148e-05, + "loss": 0.0402, + "step": 24546 + }, + { + "epoch": 0.49096, + "grad_norm": 0.13609579205513, + "learning_rate": 1.2046327255189627e-05, + "loss": 0.0028, + "step": 24548 + }, + { + "epoch": 0.491, + "grad_norm": 0.5002719163894653, + "learning_rate": 1.2044960518417902e-05, + "loss": 0.0086, + "step": 24550 + }, + { + "epoch": 0.49104, + "grad_norm": 0.345497727394104, + "learning_rate": 1.2043593741778623e-05, + "loss": 0.0049, + "step": 24552 + }, + { + "epoch": 0.49108, + "grad_norm": 8.695642471313477, + "learning_rate": 1.204222692529843e-05, + "loss": 0.2098, + "step": 24554 + }, + { + "epoch": 0.49112, + "grad_norm": 0.07750652730464935, + "learning_rate": 1.2040860069003972e-05, + "loss": 0.0047, + "step": 24556 + }, + { + "epoch": 0.49116, + "grad_norm": 0.2933429479598999, + "learning_rate": 1.2039493172921894e-05, + "loss": 0.0148, + "step": 24558 + }, + { + "epoch": 0.4912, + "grad_norm": 0.9464128613471985, + "learning_rate": 1.203812623707885e-05, + "loss": 0.0161, + "step": 24560 + }, + { + "epoch": 0.49124, + "grad_norm": 0.5900248885154724, + "learning_rate": 1.2036759261501482e-05, + "loss": 0.0073, + "step": 24562 + }, + { + "epoch": 0.49128, + "grad_norm": 20.60508155822754, + "learning_rate": 1.2035392246216447e-05, + "loss": 0.5073, + "step": 24564 + }, + { + "epoch": 0.49132, + "grad_norm": 2.752948760986328, + "learning_rate": 1.2034025191250387e-05, + "loss": 0.0425, + "step": 24566 + }, + { + "epoch": 0.49136, + "grad_norm": 13.253288269042969, + "learning_rate": 1.2032658096629965e-05, + "loss": 0.7321, + "step": 24568 + }, + { + "epoch": 0.4914, + "grad_norm": 17.016056060791016, + "learning_rate": 1.2031290962381823e-05, + "loss": 0.4712, + "step": 24570 + }, + { + "epoch": 0.49144, + "grad_norm": 0.22997727990150452, + "learning_rate": 1.202992378853262e-05, + "loss": 0.0032, + "step": 24572 + }, + { + "epoch": 0.49148, + "grad_norm": 0.7054104804992676, + "learning_rate": 1.2028556575109006e-05, + "loss": 0.0221, + "step": 24574 + }, + { + "epoch": 0.49152, + "grad_norm": 1.032226800918579, + "learning_rate": 1.202718932213764e-05, + "loss": 0.0234, + "step": 24576 + }, + { + "epoch": 0.49156, + "grad_norm": 0.20521147549152374, + "learning_rate": 1.2025822029645172e-05, + "loss": 0.0204, + "step": 24578 + }, + { + "epoch": 0.4916, + "grad_norm": 0.05613086000084877, + "learning_rate": 1.202445469765826e-05, + "loss": 0.0091, + "step": 24580 + }, + { + "epoch": 0.49164, + "grad_norm": 0.9897121787071228, + "learning_rate": 1.2023087326203562e-05, + "loss": 0.0362, + "step": 24582 + }, + { + "epoch": 0.49168, + "grad_norm": 0.15718965232372284, + "learning_rate": 1.2021719915307737e-05, + "loss": 0.0033, + "step": 24584 + }, + { + "epoch": 0.49172, + "grad_norm": 0.8275176286697388, + "learning_rate": 1.2020352464997439e-05, + "loss": 0.0221, + "step": 24586 + }, + { + "epoch": 0.49176, + "grad_norm": 0.19055643677711487, + "learning_rate": 1.2018984975299332e-05, + "loss": 0.1291, + "step": 24588 + }, + { + "epoch": 0.4918, + "grad_norm": 8.952033042907715, + "learning_rate": 1.201761744624007e-05, + "loss": 0.723, + "step": 24590 + }, + { + "epoch": 0.49184, + "grad_norm": 0.15755002200603485, + "learning_rate": 1.2016249877846323e-05, + "loss": 0.0071, + "step": 24592 + }, + { + "epoch": 0.49188, + "grad_norm": 0.07744963467121124, + "learning_rate": 1.2014882270144741e-05, + "loss": 0.0051, + "step": 24594 + }, + { + "epoch": 0.49192, + "grad_norm": 1.9567334651947021, + "learning_rate": 1.2013514623161993e-05, + "loss": 0.0313, + "step": 24596 + }, + { + "epoch": 0.49196, + "grad_norm": 6.749203205108643, + "learning_rate": 1.201214693692474e-05, + "loss": 0.2272, + "step": 24598 + }, + { + "epoch": 0.492, + "grad_norm": 0.7741773724555969, + "learning_rate": 1.2010779211459649e-05, + "loss": 0.0145, + "step": 24600 + }, + { + "epoch": 0.49204, + "grad_norm": 0.25222450494766235, + "learning_rate": 1.200941144679338e-05, + "loss": 0.0511, + "step": 24602 + }, + { + "epoch": 0.49208, + "grad_norm": 0.16970255970954895, + "learning_rate": 1.20080436429526e-05, + "loss": 0.0336, + "step": 24604 + }, + { + "epoch": 0.49212, + "grad_norm": 0.17650048434734344, + "learning_rate": 1.2006675799963978e-05, + "loss": 0.0047, + "step": 24606 + }, + { + "epoch": 0.49216, + "grad_norm": 0.13423971831798553, + "learning_rate": 1.200530791785418e-05, + "loss": 0.0667, + "step": 24608 + }, + { + "epoch": 0.4922, + "grad_norm": 1.4170149564743042, + "learning_rate": 1.2003939996649864e-05, + "loss": 0.0273, + "step": 24610 + }, + { + "epoch": 0.49224, + "grad_norm": 1.7385914325714111, + "learning_rate": 1.2002572036377711e-05, + "loss": 0.13, + "step": 24612 + }, + { + "epoch": 0.49228, + "grad_norm": 2.7895803451538086, + "learning_rate": 1.2001204037064385e-05, + "loss": 0.0846, + "step": 24614 + }, + { + "epoch": 0.49232, + "grad_norm": 7.406790733337402, + "learning_rate": 1.1999835998736556e-05, + "loss": 0.2568, + "step": 24616 + }, + { + "epoch": 0.49236, + "grad_norm": 0.2861374616622925, + "learning_rate": 1.1998467921420894e-05, + "loss": 0.0054, + "step": 24618 + }, + { + "epoch": 0.4924, + "grad_norm": 0.06005101278424263, + "learning_rate": 1.1997099805144071e-05, + "loss": 0.1065, + "step": 24620 + }, + { + "epoch": 0.49244, + "grad_norm": 0.3076300323009491, + "learning_rate": 1.199573164993276e-05, + "loss": 0.0052, + "step": 24622 + }, + { + "epoch": 0.49248, + "grad_norm": 0.12440301477909088, + "learning_rate": 1.1994363455813632e-05, + "loss": 0.0066, + "step": 24624 + }, + { + "epoch": 0.49252, + "grad_norm": 0.39566922187805176, + "learning_rate": 1.1992995222813363e-05, + "loss": 0.0165, + "step": 24626 + }, + { + "epoch": 0.49256, + "grad_norm": 0.7715207934379578, + "learning_rate": 1.1991626950958626e-05, + "loss": 0.0125, + "step": 24628 + }, + { + "epoch": 0.4926, + "grad_norm": 0.37720057368278503, + "learning_rate": 1.1990258640276094e-05, + "loss": 0.0241, + "step": 24630 + }, + { + "epoch": 0.49264, + "grad_norm": 3.3960046768188477, + "learning_rate": 1.1988890290792446e-05, + "loss": 0.0767, + "step": 24632 + }, + { + "epoch": 0.49268, + "grad_norm": 0.2754705846309662, + "learning_rate": 1.198752190253436e-05, + "loss": 0.0112, + "step": 24634 + }, + { + "epoch": 0.49272, + "grad_norm": 0.37032392621040344, + "learning_rate": 1.1986153475528508e-05, + "loss": 0.0134, + "step": 24636 + }, + { + "epoch": 0.49276, + "grad_norm": 0.27704620361328125, + "learning_rate": 1.1984785009801572e-05, + "loss": 0.0239, + "step": 24638 + }, + { + "epoch": 0.4928, + "grad_norm": 0.6407403349876404, + "learning_rate": 1.1983416505380234e-05, + "loss": 0.016, + "step": 24640 + }, + { + "epoch": 0.49284, + "grad_norm": 0.20436358451843262, + "learning_rate": 1.198204796229117e-05, + "loss": 0.0046, + "step": 24642 + }, + { + "epoch": 0.49288, + "grad_norm": 0.7733310461044312, + "learning_rate": 1.1980679380561056e-05, + "loss": 0.0194, + "step": 24644 + }, + { + "epoch": 0.49292, + "grad_norm": 0.40539997816085815, + "learning_rate": 1.1979310760216581e-05, + "loss": 0.0178, + "step": 24646 + }, + { + "epoch": 0.49296, + "grad_norm": 4.198763847351074, + "learning_rate": 1.1977942101284421e-05, + "loss": 0.13, + "step": 24648 + }, + { + "epoch": 0.493, + "grad_norm": 0.5321587920188904, + "learning_rate": 1.1976573403791263e-05, + "loss": 0.007, + "step": 24650 + }, + { + "epoch": 0.49304, + "grad_norm": 3.619006633758545, + "learning_rate": 1.1975204667763788e-05, + "loss": 0.0859, + "step": 24652 + }, + { + "epoch": 0.49308, + "grad_norm": 0.598909854888916, + "learning_rate": 1.1973835893228682e-05, + "loss": 0.0076, + "step": 24654 + }, + { + "epoch": 0.49312, + "grad_norm": 0.48754119873046875, + "learning_rate": 1.197246708021263e-05, + "loss": 0.2418, + "step": 24656 + }, + { + "epoch": 0.49316, + "grad_norm": 0.9066795706748962, + "learning_rate": 1.1971098228742315e-05, + "loss": 0.0132, + "step": 24658 + }, + { + "epoch": 0.4932, + "grad_norm": 0.1834554225206375, + "learning_rate": 1.1969729338844429e-05, + "loss": 0.0063, + "step": 24660 + }, + { + "epoch": 0.49324, + "grad_norm": 0.08148279041051865, + "learning_rate": 1.1968360410545652e-05, + "loss": 0.0104, + "step": 24662 + }, + { + "epoch": 0.49328, + "grad_norm": 0.15060171484947205, + "learning_rate": 1.1966991443872674e-05, + "loss": 0.003, + "step": 24664 + }, + { + "epoch": 0.49332, + "grad_norm": 0.45107030868530273, + "learning_rate": 1.196562243885219e-05, + "loss": 0.007, + "step": 24666 + }, + { + "epoch": 0.49336, + "grad_norm": 0.22329773008823395, + "learning_rate": 1.1964253395510879e-05, + "loss": 0.0044, + "step": 24668 + }, + { + "epoch": 0.4934, + "grad_norm": 0.07275186479091644, + "learning_rate": 1.196288431387544e-05, + "loss": 0.0042, + "step": 24670 + }, + { + "epoch": 0.49344, + "grad_norm": 0.20322397351264954, + "learning_rate": 1.1961515193972559e-05, + "loss": 0.1927, + "step": 24672 + }, + { + "epoch": 0.49348, + "grad_norm": 0.25028127431869507, + "learning_rate": 1.1960146035828933e-05, + "loss": 0.0412, + "step": 24674 + }, + { + "epoch": 0.49352, + "grad_norm": 0.9445393085479736, + "learning_rate": 1.195877683947125e-05, + "loss": 0.0222, + "step": 24676 + }, + { + "epoch": 0.49356, + "grad_norm": 1.6491223573684692, + "learning_rate": 1.1957407604926202e-05, + "loss": 0.0279, + "step": 24678 + }, + { + "epoch": 0.4936, + "grad_norm": 0.03139188140630722, + "learning_rate": 1.1956038332220484e-05, + "loss": 0.0005, + "step": 24680 + }, + { + "epoch": 0.49364, + "grad_norm": 7.579476356506348, + "learning_rate": 1.1954669021380793e-05, + "loss": 0.6612, + "step": 24682 + }, + { + "epoch": 0.49368, + "grad_norm": 1.096871018409729, + "learning_rate": 1.1953299672433824e-05, + "loss": 0.0141, + "step": 24684 + }, + { + "epoch": 0.49372, + "grad_norm": 2.796138286590576, + "learning_rate": 1.195193028540627e-05, + "loss": 0.0312, + "step": 24686 + }, + { + "epoch": 0.49376, + "grad_norm": 0.27665820717811584, + "learning_rate": 1.1950560860324832e-05, + "loss": 0.0063, + "step": 24688 + }, + { + "epoch": 0.4938, + "grad_norm": 0.2872874140739441, + "learning_rate": 1.1949191397216207e-05, + "loss": 0.0832, + "step": 24690 + }, + { + "epoch": 0.49384, + "grad_norm": 0.34378206729888916, + "learning_rate": 1.194782189610709e-05, + "loss": 0.1671, + "step": 24692 + }, + { + "epoch": 0.49388, + "grad_norm": 0.03522568941116333, + "learning_rate": 1.1946452357024186e-05, + "loss": 0.0024, + "step": 24694 + }, + { + "epoch": 0.49392, + "grad_norm": 0.03487389162182808, + "learning_rate": 1.1945082779994187e-05, + "loss": 0.0061, + "step": 24696 + }, + { + "epoch": 0.49396, + "grad_norm": 1.714577078819275, + "learning_rate": 1.19437131650438e-05, + "loss": 0.064, + "step": 24698 + }, + { + "epoch": 0.494, + "grad_norm": 0.043229859322309494, + "learning_rate": 1.194234351219972e-05, + "loss": 0.0012, + "step": 24700 + }, + { + "epoch": 0.49404, + "grad_norm": 0.03480292484164238, + "learning_rate": 1.1940973821488657e-05, + "loss": 0.004, + "step": 24702 + }, + { + "epoch": 0.49408, + "grad_norm": 0.1363033652305603, + "learning_rate": 1.193960409293731e-05, + "loss": 0.0666, + "step": 24704 + }, + { + "epoch": 0.49412, + "grad_norm": 9.771759033203125, + "learning_rate": 1.1938234326572382e-05, + "loss": 0.8944, + "step": 24706 + }, + { + "epoch": 0.49416, + "grad_norm": 0.07669425755739212, + "learning_rate": 1.193686452242058e-05, + "loss": 0.0021, + "step": 24708 + }, + { + "epoch": 0.4942, + "grad_norm": 1.3092851638793945, + "learning_rate": 1.1935494680508606e-05, + "loss": 0.0272, + "step": 24710 + }, + { + "epoch": 0.49424, + "grad_norm": 0.08759189397096634, + "learning_rate": 1.1934124800863166e-05, + "loss": 0.22, + "step": 24712 + }, + { + "epoch": 0.49428, + "grad_norm": 0.1273440420627594, + "learning_rate": 1.1932754883510964e-05, + "loss": 0.089, + "step": 24714 + }, + { + "epoch": 0.49432, + "grad_norm": 9.934099197387695, + "learning_rate": 1.1931384928478716e-05, + "loss": 0.2589, + "step": 24716 + }, + { + "epoch": 0.49436, + "grad_norm": 0.553743839263916, + "learning_rate": 1.1930014935793122e-05, + "loss": 0.0294, + "step": 24718 + }, + { + "epoch": 0.4944, + "grad_norm": 0.06734142452478409, + "learning_rate": 1.192864490548089e-05, + "loss": 0.003, + "step": 24720 + }, + { + "epoch": 0.49444, + "grad_norm": 8.23748779296875, + "learning_rate": 1.1927274837568737e-05, + "loss": 0.1635, + "step": 24722 + }, + { + "epoch": 0.49448, + "grad_norm": 0.6793308258056641, + "learning_rate": 1.192590473208337e-05, + "loss": 0.0187, + "step": 24724 + }, + { + "epoch": 0.49452, + "grad_norm": 0.22711844742298126, + "learning_rate": 1.1924534589051494e-05, + "loss": 0.0051, + "step": 24726 + }, + { + "epoch": 0.49456, + "grad_norm": 0.6140071153640747, + "learning_rate": 1.192316440849983e-05, + "loss": 0.0405, + "step": 24728 + }, + { + "epoch": 0.4946, + "grad_norm": 0.049897123128175735, + "learning_rate": 1.1921794190455082e-05, + "loss": 0.0386, + "step": 24730 + }, + { + "epoch": 0.49464, + "grad_norm": 6.5064592361450195, + "learning_rate": 1.1920423934943969e-05, + "loss": 0.1217, + "step": 24732 + }, + { + "epoch": 0.49468, + "grad_norm": 0.11346384882926941, + "learning_rate": 1.1919053641993204e-05, + "loss": 0.0021, + "step": 24734 + }, + { + "epoch": 0.49472, + "grad_norm": 0.11488459259271622, + "learning_rate": 1.1917683311629497e-05, + "loss": 0.0036, + "step": 24736 + }, + { + "epoch": 0.49476, + "grad_norm": 0.02679680474102497, + "learning_rate": 1.1916312943879566e-05, + "loss": 0.0062, + "step": 24738 + }, + { + "epoch": 0.4948, + "grad_norm": 2.9875648021698, + "learning_rate": 1.191494253877013e-05, + "loss": 0.0452, + "step": 24740 + }, + { + "epoch": 0.49484, + "grad_norm": 0.637257993221283, + "learning_rate": 1.1913572096327903e-05, + "loss": 0.0967, + "step": 24742 + }, + { + "epoch": 0.49488, + "grad_norm": 1.419529676437378, + "learning_rate": 1.1912201616579604e-05, + "loss": 0.0228, + "step": 24744 + }, + { + "epoch": 0.49492, + "grad_norm": 7.297929763793945, + "learning_rate": 1.191083109955195e-05, + "loss": 0.1555, + "step": 24746 + }, + { + "epoch": 0.49496, + "grad_norm": 0.28033342957496643, + "learning_rate": 1.1909460545271658e-05, + "loss": 0.0061, + "step": 24748 + }, + { + "epoch": 0.495, + "grad_norm": 0.05965856462717056, + "learning_rate": 1.190808995376545e-05, + "loss": 0.0081, + "step": 24750 + }, + { + "epoch": 0.49504, + "grad_norm": 0.07968581467866898, + "learning_rate": 1.1906719325060045e-05, + "loss": 0.0055, + "step": 24752 + }, + { + "epoch": 0.49508, + "grad_norm": 2.664159059524536, + "learning_rate": 1.1905348659182165e-05, + "loss": 0.0676, + "step": 24754 + }, + { + "epoch": 0.49512, + "grad_norm": 1.1286035776138306, + "learning_rate": 1.1903977956158537e-05, + "loss": 0.1205, + "step": 24756 + }, + { + "epoch": 0.49516, + "grad_norm": 0.00962778553366661, + "learning_rate": 1.1902607216015873e-05, + "loss": 0.0018, + "step": 24758 + }, + { + "epoch": 0.4952, + "grad_norm": 0.06146226078271866, + "learning_rate": 1.1901236438780902e-05, + "loss": 0.004, + "step": 24760 + }, + { + "epoch": 0.49524, + "grad_norm": 2.4121997356414795, + "learning_rate": 1.1899865624480352e-05, + "loss": 0.0594, + "step": 24762 + }, + { + "epoch": 0.49528, + "grad_norm": 6.534864902496338, + "learning_rate": 1.1898494773140942e-05, + "loss": 0.1506, + "step": 24764 + }, + { + "epoch": 0.49532, + "grad_norm": 0.9975444674491882, + "learning_rate": 1.1897123884789396e-05, + "loss": 0.0163, + "step": 24766 + }, + { + "epoch": 0.49536, + "grad_norm": 0.1996578723192215, + "learning_rate": 1.1895752959452445e-05, + "loss": 0.0069, + "step": 24768 + }, + { + "epoch": 0.4954, + "grad_norm": 0.09919610619544983, + "learning_rate": 1.1894381997156814e-05, + "loss": 0.0184, + "step": 24770 + }, + { + "epoch": 0.49544, + "grad_norm": 0.393038809299469, + "learning_rate": 1.1893010997929233e-05, + "loss": 0.0527, + "step": 24772 + }, + { + "epoch": 0.49548, + "grad_norm": 0.023849084973335266, + "learning_rate": 1.1891639961796425e-05, + "loss": 0.006, + "step": 24774 + }, + { + "epoch": 0.49552, + "grad_norm": 0.028694193810224533, + "learning_rate": 1.1890268888785122e-05, + "loss": 0.0086, + "step": 24776 + }, + { + "epoch": 0.49556, + "grad_norm": 3.0673716068267822, + "learning_rate": 1.1888897778922055e-05, + "loss": 0.0538, + "step": 24778 + }, + { + "epoch": 0.4956, + "grad_norm": 0.031365055590867996, + "learning_rate": 1.1887526632233954e-05, + "loss": 0.0137, + "step": 24780 + }, + { + "epoch": 0.49564, + "grad_norm": 0.11896321177482605, + "learning_rate": 1.1886155448747549e-05, + "loss": 0.0134, + "step": 24782 + }, + { + "epoch": 0.49568, + "grad_norm": 2.6118416786193848, + "learning_rate": 1.1884784228489572e-05, + "loss": 0.0396, + "step": 24784 + }, + { + "epoch": 0.49572, + "grad_norm": 1.3734396696090698, + "learning_rate": 1.1883412971486754e-05, + "loss": 0.0282, + "step": 24786 + }, + { + "epoch": 0.49576, + "grad_norm": 0.11194130778312683, + "learning_rate": 1.1882041677765835e-05, + "loss": 0.0065, + "step": 24788 + }, + { + "epoch": 0.4958, + "grad_norm": 4.569367408752441, + "learning_rate": 1.188067034735354e-05, + "loss": 0.6805, + "step": 24790 + }, + { + "epoch": 0.49584, + "grad_norm": 0.0313790999352932, + "learning_rate": 1.187929898027661e-05, + "loss": 0.0039, + "step": 24792 + }, + { + "epoch": 0.49588, + "grad_norm": 1.0830788612365723, + "learning_rate": 1.1877927576561779e-05, + "loss": 0.0254, + "step": 24794 + }, + { + "epoch": 0.49592, + "grad_norm": 0.7699664831161499, + "learning_rate": 1.1876556136235787e-05, + "loss": 0.0131, + "step": 24796 + }, + { + "epoch": 0.49596, + "grad_norm": 1.096179723739624, + "learning_rate": 1.1875184659325363e-05, + "loss": 0.0674, + "step": 24798 + }, + { + "epoch": 0.496, + "grad_norm": 0.5803694725036621, + "learning_rate": 1.187381314585725e-05, + "loss": 0.0097, + "step": 24800 + }, + { + "epoch": 0.49604, + "grad_norm": 0.8489663600921631, + "learning_rate": 1.1872441595858182e-05, + "loss": 0.0487, + "step": 24802 + }, + { + "epoch": 0.49608, + "grad_norm": 0.16951017081737518, + "learning_rate": 1.1871070009354903e-05, + "loss": 0.0081, + "step": 24804 + }, + { + "epoch": 0.49612, + "grad_norm": 0.6756897568702698, + "learning_rate": 1.186969838637415e-05, + "loss": 0.0075, + "step": 24806 + }, + { + "epoch": 0.49616, + "grad_norm": 0.14937390387058258, + "learning_rate": 1.1868326726942666e-05, + "loss": 0.007, + "step": 24808 + }, + { + "epoch": 0.4962, + "grad_norm": 2.4513065814971924, + "learning_rate": 1.186695503108719e-05, + "loss": 0.1384, + "step": 24810 + }, + { + "epoch": 0.49624, + "grad_norm": 0.018400341272354126, + "learning_rate": 1.1865583298834466e-05, + "loss": 0.0014, + "step": 24812 + }, + { + "epoch": 0.49628, + "grad_norm": 0.0713537186384201, + "learning_rate": 1.1864211530211236e-05, + "loss": 0.0022, + "step": 24814 + }, + { + "epoch": 0.49632, + "grad_norm": 0.44687461853027344, + "learning_rate": 1.1862839725244237e-05, + "loss": 0.3788, + "step": 24816 + }, + { + "epoch": 0.49636, + "grad_norm": 1.6915563344955444, + "learning_rate": 1.1861467883960222e-05, + "loss": 0.0317, + "step": 24818 + }, + { + "epoch": 0.4964, + "grad_norm": 1.9154590368270874, + "learning_rate": 1.186009600638593e-05, + "loss": 0.0602, + "step": 24820 + }, + { + "epoch": 0.49644, + "grad_norm": 0.8310080170631409, + "learning_rate": 1.1858724092548114e-05, + "loss": 0.0128, + "step": 24822 + }, + { + "epoch": 0.49648, + "grad_norm": 0.38720205426216125, + "learning_rate": 1.1857352142473511e-05, + "loss": 0.0095, + "step": 24824 + }, + { + "epoch": 0.49652, + "grad_norm": 3.8085241317749023, + "learning_rate": 1.185598015618887e-05, + "loss": 0.0526, + "step": 24826 + }, + { + "epoch": 0.49656, + "grad_norm": 0.10286778211593628, + "learning_rate": 1.1854608133720942e-05, + "loss": 0.0022, + "step": 24828 + }, + { + "epoch": 0.4966, + "grad_norm": 0.31015950441360474, + "learning_rate": 1.1853236075096474e-05, + "loss": 0.011, + "step": 24830 + }, + { + "epoch": 0.49664, + "grad_norm": 0.2185518592596054, + "learning_rate": 1.1851863980342218e-05, + "loss": 0.0036, + "step": 24832 + }, + { + "epoch": 0.49668, + "grad_norm": 0.03029787726700306, + "learning_rate": 1.1850491849484916e-05, + "loss": 0.0721, + "step": 24834 + }, + { + "epoch": 0.49672, + "grad_norm": 3.462186336517334, + "learning_rate": 1.1849119682551323e-05, + "loss": 0.0456, + "step": 24836 + }, + { + "epoch": 0.49676, + "grad_norm": 0.1644420623779297, + "learning_rate": 1.1847747479568193e-05, + "loss": 0.0439, + "step": 24838 + }, + { + "epoch": 0.4968, + "grad_norm": 0.7057328820228577, + "learning_rate": 1.184637524056227e-05, + "loss": 0.012, + "step": 24840 + }, + { + "epoch": 0.49684, + "grad_norm": 0.14904402196407318, + "learning_rate": 1.1845002965560312e-05, + "loss": 0.0168, + "step": 24842 + }, + { + "epoch": 0.49688, + "grad_norm": 7.162698268890381, + "learning_rate": 1.1843630654589074e-05, + "loss": 0.1905, + "step": 24844 + }, + { + "epoch": 0.49692, + "grad_norm": 0.12181685119867325, + "learning_rate": 1.1842258307675308e-05, + "loss": 0.0733, + "step": 24846 + }, + { + "epoch": 0.49696, + "grad_norm": 0.0046715461649000645, + "learning_rate": 1.1840885924845766e-05, + "loss": 0.0006, + "step": 24848 + }, + { + "epoch": 0.497, + "grad_norm": 0.04185083135962486, + "learning_rate": 1.1839513506127202e-05, + "loss": 0.0245, + "step": 24850 + }, + { + "epoch": 0.49704, + "grad_norm": 0.7482472658157349, + "learning_rate": 1.183814105154638e-05, + "loss": 0.0132, + "step": 24852 + }, + { + "epoch": 0.49708, + "grad_norm": 0.009386198595166206, + "learning_rate": 1.183676856113005e-05, + "loss": 0.2506, + "step": 24854 + }, + { + "epoch": 0.49712, + "grad_norm": 0.0838630422949791, + "learning_rate": 1.1835396034904968e-05, + "loss": 0.0658, + "step": 24856 + }, + { + "epoch": 0.49716, + "grad_norm": 6.221941947937012, + "learning_rate": 1.18340234728979e-05, + "loss": 0.1169, + "step": 24858 + }, + { + "epoch": 0.4972, + "grad_norm": 1.9362729787826538, + "learning_rate": 1.1832650875135599e-05, + "loss": 0.0287, + "step": 24860 + }, + { + "epoch": 0.49724, + "grad_norm": 0.12412931025028229, + "learning_rate": 1.1831278241644827e-05, + "loss": 0.0204, + "step": 24862 + }, + { + "epoch": 0.49728, + "grad_norm": 0.5518790483474731, + "learning_rate": 1.1829905572452339e-05, + "loss": 0.0102, + "step": 24864 + }, + { + "epoch": 0.49732, + "grad_norm": 2.8708202838897705, + "learning_rate": 1.1828532867584904e-05, + "loss": 0.2511, + "step": 24866 + }, + { + "epoch": 0.49736, + "grad_norm": 0.19570839405059814, + "learning_rate": 1.1827160127069277e-05, + "loss": 0.0095, + "step": 24868 + }, + { + "epoch": 0.4974, + "grad_norm": 0.14784298837184906, + "learning_rate": 1.1825787350932224e-05, + "loss": 0.0126, + "step": 24870 + }, + { + "epoch": 0.49744, + "grad_norm": 0.001384496339596808, + "learning_rate": 1.1824414539200505e-05, + "loss": 0.0008, + "step": 24872 + }, + { + "epoch": 0.49748, + "grad_norm": 0.046612393110990524, + "learning_rate": 1.1823041691900885e-05, + "loss": 0.001, + "step": 24874 + }, + { + "epoch": 0.49752, + "grad_norm": 0.01906108669936657, + "learning_rate": 1.1821668809060128e-05, + "loss": 0.0009, + "step": 24876 + }, + { + "epoch": 0.49756, + "grad_norm": 0.05962609127163887, + "learning_rate": 1.1820295890705003e-05, + "loss": 0.0246, + "step": 24878 + }, + { + "epoch": 0.4976, + "grad_norm": 1.0364419221878052, + "learning_rate": 1.181892293686227e-05, + "loss": 0.0094, + "step": 24880 + }, + { + "epoch": 0.49764, + "grad_norm": 0.09909071028232574, + "learning_rate": 1.18175499475587e-05, + "loss": 0.0038, + "step": 24882 + }, + { + "epoch": 0.49768, + "grad_norm": 0.09210160374641418, + "learning_rate": 1.1816176922821057e-05, + "loss": 0.0017, + "step": 24884 + }, + { + "epoch": 0.49772, + "grad_norm": 2.774003744125366, + "learning_rate": 1.181480386267611e-05, + "loss": 0.0393, + "step": 24886 + }, + { + "epoch": 0.49776, + "grad_norm": 10.2894287109375, + "learning_rate": 1.1813430767150625e-05, + "loss": 0.1064, + "step": 24888 + }, + { + "epoch": 0.4978, + "grad_norm": 0.13015110790729523, + "learning_rate": 1.1812057636271374e-05, + "loss": 0.0032, + "step": 24890 + }, + { + "epoch": 0.49784, + "grad_norm": 8.64266586303711, + "learning_rate": 1.1810684470065128e-05, + "loss": 0.14, + "step": 24892 + }, + { + "epoch": 0.49788, + "grad_norm": 0.5477207899093628, + "learning_rate": 1.1809311268558656e-05, + "loss": 0.0239, + "step": 24894 + }, + { + "epoch": 0.49792, + "grad_norm": 14.214123725891113, + "learning_rate": 1.180793803177873e-05, + "loss": 0.4929, + "step": 24896 + }, + { + "epoch": 0.49796, + "grad_norm": 0.03539009392261505, + "learning_rate": 1.1806564759752119e-05, + "loss": 0.0309, + "step": 24898 + }, + { + "epoch": 0.498, + "grad_norm": 3.824571132659912, + "learning_rate": 1.1805191452505602e-05, + "loss": 0.0303, + "step": 24900 + }, + { + "epoch": 0.49804, + "grad_norm": 0.270616352558136, + "learning_rate": 1.1803818110065948e-05, + "loss": 0.3959, + "step": 24902 + }, + { + "epoch": 0.49808, + "grad_norm": 0.21298854053020477, + "learning_rate": 1.1802444732459928e-05, + "loss": 0.0505, + "step": 24904 + }, + { + "epoch": 0.49812, + "grad_norm": 0.2053719162940979, + "learning_rate": 1.180107131971432e-05, + "loss": 0.0021, + "step": 24906 + }, + { + "epoch": 0.49816, + "grad_norm": 0.08820559084415436, + "learning_rate": 1.17996978718559e-05, + "loss": 0.0223, + "step": 24908 + }, + { + "epoch": 0.4982, + "grad_norm": 0.6839241981506348, + "learning_rate": 1.1798324388911445e-05, + "loss": 0.0087, + "step": 24910 + }, + { + "epoch": 0.49824, + "grad_norm": 0.05737980082631111, + "learning_rate": 1.1796950870907727e-05, + "loss": 0.0023, + "step": 24912 + }, + { + "epoch": 0.49828, + "grad_norm": 0.01383074838668108, + "learning_rate": 1.1795577317871527e-05, + "loss": 0.0005, + "step": 24914 + }, + { + "epoch": 0.49832, + "grad_norm": 2.81005859375, + "learning_rate": 1.1794203729829628e-05, + "loss": 0.0421, + "step": 24916 + }, + { + "epoch": 0.49836, + "grad_norm": 2.271932601928711, + "learning_rate": 1.1792830106808798e-05, + "loss": 0.0344, + "step": 24918 + }, + { + "epoch": 0.4984, + "grad_norm": 1.5053937435150146, + "learning_rate": 1.1791456448835825e-05, + "loss": 0.0085, + "step": 24920 + }, + { + "epoch": 0.49844, + "grad_norm": 0.008526270277798176, + "learning_rate": 1.1790082755937484e-05, + "loss": 0.0028, + "step": 24922 + }, + { + "epoch": 0.49848, + "grad_norm": 0.8289796113967896, + "learning_rate": 1.178870902814056e-05, + "loss": 0.0084, + "step": 24924 + }, + { + "epoch": 0.49852, + "grad_norm": 0.010116233490407467, + "learning_rate": 1.178733526547183e-05, + "loss": 0.0012, + "step": 24926 + }, + { + "epoch": 0.49856, + "grad_norm": 2.9889614582061768, + "learning_rate": 1.1785961467958082e-05, + "loss": 0.5407, + "step": 24928 + }, + { + "epoch": 0.4986, + "grad_norm": 0.09043505042791367, + "learning_rate": 1.1784587635626095e-05, + "loss": 0.0017, + "step": 24930 + }, + { + "epoch": 0.49864, + "grad_norm": 0.7953193783760071, + "learning_rate": 1.1783213768502652e-05, + "loss": 0.0404, + "step": 24932 + }, + { + "epoch": 0.49868, + "grad_norm": 1.1704776287078857, + "learning_rate": 1.178183986661454e-05, + "loss": 0.0928, + "step": 24934 + }, + { + "epoch": 0.49872, + "grad_norm": 0.05824824422597885, + "learning_rate": 1.1780465929988543e-05, + "loss": 0.0013, + "step": 24936 + }, + { + "epoch": 0.49876, + "grad_norm": 0.7841659784317017, + "learning_rate": 1.1779091958651445e-05, + "loss": 0.0097, + "step": 24938 + }, + { + "epoch": 0.4988, + "grad_norm": 0.010520660318434238, + "learning_rate": 1.1777717952630033e-05, + "loss": 0.3365, + "step": 24940 + }, + { + "epoch": 0.49884, + "grad_norm": 0.056659579277038574, + "learning_rate": 1.1776343911951091e-05, + "loss": 0.0059, + "step": 24942 + }, + { + "epoch": 0.49888, + "grad_norm": 0.006611093413084745, + "learning_rate": 1.1774969836641417e-05, + "loss": 0.002, + "step": 24944 + }, + { + "epoch": 0.49892, + "grad_norm": 15.947021484375, + "learning_rate": 1.1773595726727787e-05, + "loss": 0.5893, + "step": 24946 + }, + { + "epoch": 0.49896, + "grad_norm": 0.011277156881988049, + "learning_rate": 1.1772221582236995e-05, + "loss": 0.001, + "step": 24948 + }, + { + "epoch": 0.499, + "grad_norm": 0.6427209973335266, + "learning_rate": 1.1770847403195836e-05, + "loss": 0.0093, + "step": 24950 + }, + { + "epoch": 0.49904, + "grad_norm": 0.11532298475503922, + "learning_rate": 1.176947318963109e-05, + "loss": 0.0121, + "step": 24952 + }, + { + "epoch": 0.49908, + "grad_norm": 0.039296235889196396, + "learning_rate": 1.1768098941569557e-05, + "loss": 0.0476, + "step": 24954 + }, + { + "epoch": 0.49912, + "grad_norm": 0.339981347322464, + "learning_rate": 1.1766724659038021e-05, + "loss": 0.0152, + "step": 24956 + }, + { + "epoch": 0.49916, + "grad_norm": 0.4375641644001007, + "learning_rate": 1.1765350342063279e-05, + "loss": 0.01, + "step": 24958 + }, + { + "epoch": 0.4992, + "grad_norm": 0.0572979561984539, + "learning_rate": 1.1763975990672125e-05, + "loss": 0.0972, + "step": 24960 + }, + { + "epoch": 0.49924, + "grad_norm": 0.9337266683578491, + "learning_rate": 1.176260160489135e-05, + "loss": 0.0105, + "step": 24962 + }, + { + "epoch": 0.49928, + "grad_norm": 1.137778878211975, + "learning_rate": 1.1761227184747748e-05, + "loss": 0.0775, + "step": 24964 + }, + { + "epoch": 0.49932, + "grad_norm": 1.004963755607605, + "learning_rate": 1.1759852730268118e-05, + "loss": 0.0126, + "step": 24966 + }, + { + "epoch": 0.49936, + "grad_norm": 0.4429681599140167, + "learning_rate": 1.1758478241479252e-05, + "loss": 0.0089, + "step": 24968 + }, + { + "epoch": 0.4994, + "grad_norm": 0.056526727974414825, + "learning_rate": 1.1757103718407948e-05, + "loss": 0.0019, + "step": 24970 + }, + { + "epoch": 0.49944, + "grad_norm": 2.8215880393981934, + "learning_rate": 1.1755729161081e-05, + "loss": 0.0537, + "step": 24972 + }, + { + "epoch": 0.49948, + "grad_norm": 0.09910590946674347, + "learning_rate": 1.175435456952521e-05, + "loss": 0.0026, + "step": 24974 + }, + { + "epoch": 0.49952, + "grad_norm": 0.5798914432525635, + "learning_rate": 1.1752979943767376e-05, + "loss": 0.0094, + "step": 24976 + }, + { + "epoch": 0.49956, + "grad_norm": 0.23002998530864716, + "learning_rate": 1.1751605283834291e-05, + "loss": 0.0043, + "step": 24978 + }, + { + "epoch": 0.4996, + "grad_norm": 0.9089416861534119, + "learning_rate": 1.1750230589752763e-05, + "loss": 0.017, + "step": 24980 + }, + { + "epoch": 0.49964, + "grad_norm": 0.2845090329647064, + "learning_rate": 1.1748855861549585e-05, + "loss": 0.2089, + "step": 24982 + }, + { + "epoch": 0.49968, + "grad_norm": 0.08011695742607117, + "learning_rate": 1.1747481099251565e-05, + "loss": 0.0016, + "step": 24984 + }, + { + "epoch": 0.49972, + "grad_norm": 5.495361328125, + "learning_rate": 1.1746106302885498e-05, + "loss": 0.1022, + "step": 24986 + }, + { + "epoch": 0.49976, + "grad_norm": 0.1885821521282196, + "learning_rate": 1.1744731472478194e-05, + "loss": 0.0045, + "step": 24988 + }, + { + "epoch": 0.4998, + "grad_norm": 0.22529256343841553, + "learning_rate": 1.1743356608056448e-05, + "loss": 0.3216, + "step": 24990 + }, + { + "epoch": 0.49984, + "grad_norm": 0.031750280410051346, + "learning_rate": 1.1741981709647073e-05, + "loss": 0.0529, + "step": 24992 + }, + { + "epoch": 0.49988, + "grad_norm": 0.7024204730987549, + "learning_rate": 1.1740606777276862e-05, + "loss": 0.0808, + "step": 24994 + }, + { + "epoch": 0.49992, + "grad_norm": 0.9295951128005981, + "learning_rate": 1.1739231810972626e-05, + "loss": 0.0153, + "step": 24996 + }, + { + "epoch": 0.49996, + "grad_norm": 3.4697675704956055, + "learning_rate": 1.173785681076117e-05, + "loss": 0.0436, + "step": 24998 + }, + { + "epoch": 0.5, + "grad_norm": 0.7673600316047668, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.0108, + "step": 25000 + }, + { + "epoch": 0.50004, + "grad_norm": 0.5324963927268982, + "learning_rate": 1.1735106708723831e-05, + "loss": 0.0099, + "step": 25002 + }, + { + "epoch": 0.50008, + "grad_norm": 0.11528488248586655, + "learning_rate": 1.1733731606951559e-05, + "loss": 0.0044, + "step": 25004 + }, + { + "epoch": 0.50012, + "grad_norm": 1.6757924556732178, + "learning_rate": 1.1732356471379296e-05, + "loss": 0.0213, + "step": 25006 + }, + { + "epoch": 0.50016, + "grad_norm": 0.9979416131973267, + "learning_rate": 1.1730981302033854e-05, + "loss": 0.0151, + "step": 25008 + }, + { + "epoch": 0.5002, + "grad_norm": 0.05595788732171059, + "learning_rate": 1.1729606098942039e-05, + "loss": 0.0031, + "step": 25010 + }, + { + "epoch": 0.50024, + "grad_norm": 0.1369212418794632, + "learning_rate": 1.172823086213066e-05, + "loss": 0.0048, + "step": 25012 + }, + { + "epoch": 0.50028, + "grad_norm": 0.010449003428220749, + "learning_rate": 1.1726855591626531e-05, + "loss": 0.0013, + "step": 25014 + }, + { + "epoch": 0.50032, + "grad_norm": 0.17273491621017456, + "learning_rate": 1.1725480287456467e-05, + "loss": 0.0037, + "step": 25016 + }, + { + "epoch": 0.50036, + "grad_norm": 2.2191247940063477, + "learning_rate": 1.1724104949647275e-05, + "loss": 0.0339, + "step": 25018 + }, + { + "epoch": 0.5004, + "grad_norm": 0.051663823425769806, + "learning_rate": 1.1722729578225769e-05, + "loss": 0.0008, + "step": 25020 + }, + { + "epoch": 0.50044, + "grad_norm": 0.08623900264501572, + "learning_rate": 1.1721354173218761e-05, + "loss": 0.0009, + "step": 25022 + }, + { + "epoch": 0.50048, + "grad_norm": 0.034593332558870316, + "learning_rate": 1.171997873465307e-05, + "loss": 0.0014, + "step": 25024 + }, + { + "epoch": 0.50052, + "grad_norm": 0.5157285332679749, + "learning_rate": 1.1718603262555507e-05, + "loss": 0.0076, + "step": 25026 + }, + { + "epoch": 0.50056, + "grad_norm": 0.3140588402748108, + "learning_rate": 1.171722775695289e-05, + "loss": 0.0046, + "step": 25028 + }, + { + "epoch": 0.5006, + "grad_norm": 0.5516940951347351, + "learning_rate": 1.171585221787203e-05, + "loss": 0.0085, + "step": 25030 + }, + { + "epoch": 0.50064, + "grad_norm": 0.17428430914878845, + "learning_rate": 1.171447664533975e-05, + "loss": 0.0024, + "step": 25032 + }, + { + "epoch": 0.50068, + "grad_norm": 0.03148328885436058, + "learning_rate": 1.1713101039382865e-05, + "loss": 0.0012, + "step": 25034 + }, + { + "epoch": 0.50072, + "grad_norm": 0.5026605129241943, + "learning_rate": 1.1711725400028192e-05, + "loss": 0.0079, + "step": 25036 + }, + { + "epoch": 0.50076, + "grad_norm": 12.07751178741455, + "learning_rate": 1.1710349727302558e-05, + "loss": 0.2192, + "step": 25038 + }, + { + "epoch": 0.5008, + "grad_norm": 0.09440100938081741, + "learning_rate": 1.1708974021232768e-05, + "loss": 0.0019, + "step": 25040 + }, + { + "epoch": 0.50084, + "grad_norm": 0.005217694211751223, + "learning_rate": 1.1707598281845654e-05, + "loss": 0.0104, + "step": 25042 + }, + { + "epoch": 0.50088, + "grad_norm": 0.1963692158460617, + "learning_rate": 1.1706222509168029e-05, + "loss": 0.0109, + "step": 25044 + }, + { + "epoch": 0.50092, + "grad_norm": 1.9424309730529785, + "learning_rate": 1.1704846703226718e-05, + "loss": 0.0282, + "step": 25046 + }, + { + "epoch": 0.50096, + "grad_norm": 0.03388219699263573, + "learning_rate": 1.170347086404854e-05, + "loss": 0.0172, + "step": 25048 + }, + { + "epoch": 0.501, + "grad_norm": 0.021040042862296104, + "learning_rate": 1.1702094991660326e-05, + "loss": 0.0304, + "step": 25050 + }, + { + "epoch": 0.50104, + "grad_norm": 0.19272065162658691, + "learning_rate": 1.1700719086088891e-05, + "loss": 0.0063, + "step": 25052 + }, + { + "epoch": 0.50108, + "grad_norm": 0.07135728001594543, + "learning_rate": 1.1699343147361064e-05, + "loss": 0.0015, + "step": 25054 + }, + { + "epoch": 0.50112, + "grad_norm": 15.157230377197266, + "learning_rate": 1.1697967175503667e-05, + "loss": 0.3064, + "step": 25056 + }, + { + "epoch": 0.50116, + "grad_norm": 0.37177708745002747, + "learning_rate": 1.1696591170543527e-05, + "loss": 0.005, + "step": 25058 + }, + { + "epoch": 0.5012, + "grad_norm": 0.23939736187458038, + "learning_rate": 1.1695215132507465e-05, + "loss": 0.0044, + "step": 25060 + }, + { + "epoch": 0.50124, + "grad_norm": 1.3709754943847656, + "learning_rate": 1.1693839061422311e-05, + "loss": 0.0156, + "step": 25062 + }, + { + "epoch": 0.50128, + "grad_norm": 1.9068236351013184, + "learning_rate": 1.1692462957314893e-05, + "loss": 0.036, + "step": 25064 + }, + { + "epoch": 0.50132, + "grad_norm": 0.14745588600635529, + "learning_rate": 1.1691086820212043e-05, + "loss": 0.0021, + "step": 25066 + }, + { + "epoch": 0.50136, + "grad_norm": 11.957047462463379, + "learning_rate": 1.168971065014058e-05, + "loss": 0.469, + "step": 25068 + }, + { + "epoch": 0.5014, + "grad_norm": 6.013798713684082, + "learning_rate": 1.1688334447127338e-05, + "loss": 0.1073, + "step": 25070 + }, + { + "epoch": 0.50144, + "grad_norm": 0.050131939351558685, + "learning_rate": 1.1686958211199151e-05, + "loss": 0.3014, + "step": 25072 + }, + { + "epoch": 0.50148, + "grad_norm": 4.35910701751709, + "learning_rate": 1.1685581942382845e-05, + "loss": 0.0647, + "step": 25074 + }, + { + "epoch": 0.50152, + "grad_norm": 0.15674009919166565, + "learning_rate": 1.1684205640705246e-05, + "loss": 0.0046, + "step": 25076 + }, + { + "epoch": 0.50156, + "grad_norm": 0.007940207608044147, + "learning_rate": 1.1682829306193193e-05, + "loss": 0.0169, + "step": 25078 + }, + { + "epoch": 0.5016, + "grad_norm": 0.2047659158706665, + "learning_rate": 1.1681452938873516e-05, + "loss": 0.0297, + "step": 25080 + }, + { + "epoch": 0.50164, + "grad_norm": 0.3373105823993683, + "learning_rate": 1.1680076538773051e-05, + "loss": 0.0224, + "step": 25082 + }, + { + "epoch": 0.50168, + "grad_norm": 0.05565967783331871, + "learning_rate": 1.1678700105918626e-05, + "loss": 0.0139, + "step": 25084 + }, + { + "epoch": 0.50172, + "grad_norm": 0.15395581722259521, + "learning_rate": 1.1677323640337078e-05, + "loss": 0.0046, + "step": 25086 + }, + { + "epoch": 0.50176, + "grad_norm": 14.429585456848145, + "learning_rate": 1.1675947142055241e-05, + "loss": 0.4536, + "step": 25088 + }, + { + "epoch": 0.5018, + "grad_norm": 0.0005247334484010935, + "learning_rate": 1.1674570611099956e-05, + "loss": 0.0027, + "step": 25090 + }, + { + "epoch": 0.50184, + "grad_norm": 0.0759979858994484, + "learning_rate": 1.1673194047498056e-05, + "loss": 0.0592, + "step": 25092 + }, + { + "epoch": 0.50188, + "grad_norm": 15.317930221557617, + "learning_rate": 1.1671817451276371e-05, + "loss": 0.8804, + "step": 25094 + }, + { + "epoch": 0.50192, + "grad_norm": 0.01914581097662449, + "learning_rate": 1.1670440822461747e-05, + "loss": 0.0154, + "step": 25096 + }, + { + "epoch": 0.50196, + "grad_norm": 0.062423091381788254, + "learning_rate": 1.166906416108102e-05, + "loss": 0.0016, + "step": 25098 + }, + { + "epoch": 0.502, + "grad_norm": 5.502828598022461, + "learning_rate": 1.1667687467161025e-05, + "loss": 0.1412, + "step": 25100 + }, + { + "epoch": 0.50204, + "grad_norm": 5.101687431335449, + "learning_rate": 1.1666310740728604e-05, + "loss": 0.0977, + "step": 25102 + }, + { + "epoch": 0.50208, + "grad_norm": 0.10996166616678238, + "learning_rate": 1.1664933981810598e-05, + "loss": 0.0356, + "step": 25104 + }, + { + "epoch": 0.50212, + "grad_norm": 5.470623016357422, + "learning_rate": 1.1663557190433849e-05, + "loss": 0.5109, + "step": 25106 + }, + { + "epoch": 0.50216, + "grad_norm": 0.09406164288520813, + "learning_rate": 1.1662180366625198e-05, + "loss": 0.0028, + "step": 25108 + }, + { + "epoch": 0.5022, + "grad_norm": 0.13287439942359924, + "learning_rate": 1.166080351041148e-05, + "loss": 0.0113, + "step": 25110 + }, + { + "epoch": 0.50224, + "grad_norm": 0.018199678510427475, + "learning_rate": 1.1659426621819543e-05, + "loss": 0.027, + "step": 25112 + }, + { + "epoch": 0.50228, + "grad_norm": 0.14128318428993225, + "learning_rate": 1.1658049700876233e-05, + "loss": 0.0047, + "step": 25114 + }, + { + "epoch": 0.50232, + "grad_norm": 0.6291810274124146, + "learning_rate": 1.1656672747608389e-05, + "loss": 0.0123, + "step": 25116 + }, + { + "epoch": 0.50236, + "grad_norm": 0.2559276521205902, + "learning_rate": 1.1655295762042855e-05, + "loss": 0.0047, + "step": 25118 + }, + { + "epoch": 0.5024, + "grad_norm": 11.125782012939453, + "learning_rate": 1.1653918744206478e-05, + "loss": 0.6101, + "step": 25120 + }, + { + "epoch": 0.50244, + "grad_norm": 6.500679969787598, + "learning_rate": 1.1652541694126107e-05, + "loss": 0.2612, + "step": 25122 + }, + { + "epoch": 0.50248, + "grad_norm": 0.3036309778690338, + "learning_rate": 1.165116461182858e-05, + "loss": 0.0107, + "step": 25124 + }, + { + "epoch": 0.50252, + "grad_norm": 0.34774404764175415, + "learning_rate": 1.1649787497340754e-05, + "loss": 0.0101, + "step": 25126 + }, + { + "epoch": 0.50256, + "grad_norm": 0.8745576739311218, + "learning_rate": 1.164841035068947e-05, + "loss": 0.0173, + "step": 25128 + }, + { + "epoch": 0.5026, + "grad_norm": 0.12647293508052826, + "learning_rate": 1.1647033171901573e-05, + "loss": 0.0398, + "step": 25130 + }, + { + "epoch": 0.50264, + "grad_norm": 0.40029987692832947, + "learning_rate": 1.1645655961003923e-05, + "loss": 0.0094, + "step": 25132 + }, + { + "epoch": 0.50268, + "grad_norm": 0.24211648106575012, + "learning_rate": 1.164427871802336e-05, + "loss": 0.1802, + "step": 25134 + }, + { + "epoch": 0.50272, + "grad_norm": 1.0957788228988647, + "learning_rate": 1.1642901442986733e-05, + "loss": 0.0537, + "step": 25136 + }, + { + "epoch": 0.50276, + "grad_norm": 0.043154191225767136, + "learning_rate": 1.1641524135920901e-05, + "loss": 0.08, + "step": 25138 + }, + { + "epoch": 0.5028, + "grad_norm": 8.530190467834473, + "learning_rate": 1.1640146796852711e-05, + "loss": 0.3102, + "step": 25140 + }, + { + "epoch": 0.50284, + "grad_norm": 0.5792354345321655, + "learning_rate": 1.1638769425809015e-05, + "loss": 0.0142, + "step": 25142 + }, + { + "epoch": 0.50288, + "grad_norm": 2.0130674839019775, + "learning_rate": 1.1637392022816665e-05, + "loss": 0.0421, + "step": 25144 + }, + { + "epoch": 0.50292, + "grad_norm": 0.7200837135314941, + "learning_rate": 1.163601458790251e-05, + "loss": 0.0177, + "step": 25146 + }, + { + "epoch": 0.50296, + "grad_norm": 1.7879152297973633, + "learning_rate": 1.1634637121093416e-05, + "loss": 0.0377, + "step": 25148 + }, + { + "epoch": 0.503, + "grad_norm": 0.37126484513282776, + "learning_rate": 1.1633259622416224e-05, + "loss": 0.093, + "step": 25150 + }, + { + "epoch": 0.50304, + "grad_norm": 0.5943799614906311, + "learning_rate": 1.1631882091897796e-05, + "loss": 0.0896, + "step": 25152 + }, + { + "epoch": 0.50308, + "grad_norm": 0.18804502487182617, + "learning_rate": 1.1630504529564986e-05, + "loss": 0.024, + "step": 25154 + }, + { + "epoch": 0.50312, + "grad_norm": 0.10634545236825943, + "learning_rate": 1.1629126935444655e-05, + "loss": 0.0117, + "step": 25156 + }, + { + "epoch": 0.50316, + "grad_norm": 0.22084057331085205, + "learning_rate": 1.162774930956365e-05, + "loss": 0.1196, + "step": 25158 + }, + { + "epoch": 0.5032, + "grad_norm": 3.256941318511963, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.052, + "step": 25160 + }, + { + "epoch": 0.50324, + "grad_norm": 18.071847915649414, + "learning_rate": 1.1624993962627072e-05, + "loss": 0.9223, + "step": 25162 + }, + { + "epoch": 0.50328, + "grad_norm": 1.977400779724121, + "learning_rate": 1.1623616241625214e-05, + "loss": 0.0295, + "step": 25164 + }, + { + "epoch": 0.50332, + "grad_norm": 0.039854615926742554, + "learning_rate": 1.1622238488970117e-05, + "loss": 0.0071, + "step": 25166 + }, + { + "epoch": 0.50336, + "grad_norm": 2.807111978530884, + "learning_rate": 1.1620860704688649e-05, + "loss": 0.0514, + "step": 25168 + }, + { + "epoch": 0.5034, + "grad_norm": 0.12574706971645355, + "learning_rate": 1.1619482888807662e-05, + "loss": 0.0077, + "step": 25170 + }, + { + "epoch": 0.50344, + "grad_norm": 11.261214256286621, + "learning_rate": 1.1618105041354029e-05, + "loss": 0.4899, + "step": 25172 + }, + { + "epoch": 0.50348, + "grad_norm": 2.496298313140869, + "learning_rate": 1.1616727162354602e-05, + "loss": 0.058, + "step": 25174 + }, + { + "epoch": 0.50352, + "grad_norm": 0.6593449115753174, + "learning_rate": 1.161534925183625e-05, + "loss": 0.0162, + "step": 25176 + }, + { + "epoch": 0.50356, + "grad_norm": 3.71480655670166, + "learning_rate": 1.161397130982583e-05, + "loss": 0.0677, + "step": 25178 + }, + { + "epoch": 0.5036, + "grad_norm": 5.96329402923584, + "learning_rate": 1.1612593336350209e-05, + "loss": 0.1544, + "step": 25180 + }, + { + "epoch": 0.50364, + "grad_norm": 0.06973904371261597, + "learning_rate": 1.1611215331436248e-05, + "loss": 0.0047, + "step": 25182 + }, + { + "epoch": 0.50368, + "grad_norm": 2.9700870513916016, + "learning_rate": 1.1609837295110815e-05, + "loss": 0.1849, + "step": 25184 + }, + { + "epoch": 0.50372, + "grad_norm": 0.9202972054481506, + "learning_rate": 1.1608459227400777e-05, + "loss": 0.0174, + "step": 25186 + }, + { + "epoch": 0.50376, + "grad_norm": 0.29646962881088257, + "learning_rate": 1.1607081128333e-05, + "loss": 0.2085, + "step": 25188 + }, + { + "epoch": 0.5038, + "grad_norm": 1.954544186592102, + "learning_rate": 1.1605702997934345e-05, + "loss": 0.0569, + "step": 25190 + }, + { + "epoch": 0.50384, + "grad_norm": 2.8031022548675537, + "learning_rate": 1.1604324836231684e-05, + "loss": 0.2254, + "step": 25192 + }, + { + "epoch": 0.50388, + "grad_norm": 0.5423191785812378, + "learning_rate": 1.1602946643251888e-05, + "loss": 0.0175, + "step": 25194 + }, + { + "epoch": 0.50392, + "grad_norm": 0.16626229882240295, + "learning_rate": 1.160156841902182e-05, + "loss": 0.3576, + "step": 25196 + }, + { + "epoch": 0.50396, + "grad_norm": 2.6061251163482666, + "learning_rate": 1.1600190163568353e-05, + "loss": 0.061, + "step": 25198 + }, + { + "epoch": 0.504, + "grad_norm": 2.1176254749298096, + "learning_rate": 1.159881187691835e-05, + "loss": 0.6099, + "step": 25200 + }, + { + "epoch": 0.50404, + "grad_norm": 3.3320696353912354, + "learning_rate": 1.1597433559098688e-05, + "loss": 0.0661, + "step": 25202 + }, + { + "epoch": 0.50408, + "grad_norm": 5.049382209777832, + "learning_rate": 1.1596055210136239e-05, + "loss": 0.1343, + "step": 25204 + }, + { + "epoch": 0.50412, + "grad_norm": 0.3757965862751007, + "learning_rate": 1.1594676830057869e-05, + "loss": 0.0277, + "step": 25206 + }, + { + "epoch": 0.50416, + "grad_norm": 0.5141261219978333, + "learning_rate": 1.1593298418890455e-05, + "loss": 0.0241, + "step": 25208 + }, + { + "epoch": 0.5042, + "grad_norm": 0.5391250252723694, + "learning_rate": 1.1591919976660867e-05, + "loss": 0.0174, + "step": 25210 + }, + { + "epoch": 0.50424, + "grad_norm": 0.4264516830444336, + "learning_rate": 1.1590541503395983e-05, + "loss": 0.0103, + "step": 25212 + }, + { + "epoch": 0.50428, + "grad_norm": 0.04121111333370209, + "learning_rate": 1.158916299912267e-05, + "loss": 0.0021, + "step": 25214 + }, + { + "epoch": 0.50432, + "grad_norm": 5.517562389373779, + "learning_rate": 1.1587784463867806e-05, + "loss": 0.1327, + "step": 25216 + }, + { + "epoch": 0.50436, + "grad_norm": 3.390841245651245, + "learning_rate": 1.1586405897658267e-05, + "loss": 0.1438, + "step": 25218 + }, + { + "epoch": 0.5044, + "grad_norm": 0.32203903794288635, + "learning_rate": 1.158502730052093e-05, + "loss": 0.0082, + "step": 25220 + }, + { + "epoch": 0.50444, + "grad_norm": 6.563533306121826, + "learning_rate": 1.1583648672482667e-05, + "loss": 0.2612, + "step": 25222 + }, + { + "epoch": 0.50448, + "grad_norm": 0.27425417304039, + "learning_rate": 1.1582270013570358e-05, + "loss": 0.0755, + "step": 25224 + }, + { + "epoch": 0.50452, + "grad_norm": 0.9122912883758545, + "learning_rate": 1.1580891323810882e-05, + "loss": 0.0184, + "step": 25226 + }, + { + "epoch": 0.50456, + "grad_norm": 0.0827673003077507, + "learning_rate": 1.1579512603231116e-05, + "loss": 0.0087, + "step": 25228 + }, + { + "epoch": 0.5046, + "grad_norm": 0.8201323747634888, + "learning_rate": 1.157813385185794e-05, + "loss": 0.02, + "step": 25230 + }, + { + "epoch": 0.50464, + "grad_norm": 6.0892510414123535, + "learning_rate": 1.1576755069718229e-05, + "loss": 0.1139, + "step": 25232 + }, + { + "epoch": 0.50468, + "grad_norm": 1.5245158672332764, + "learning_rate": 1.1575376256838865e-05, + "loss": 0.0307, + "step": 25234 + }, + { + "epoch": 0.50472, + "grad_norm": 6.260934829711914, + "learning_rate": 1.157399741324673e-05, + "loss": 0.1668, + "step": 25236 + }, + { + "epoch": 0.50476, + "grad_norm": 0.687454879283905, + "learning_rate": 1.157261853896871e-05, + "loss": 0.0221, + "step": 25238 + }, + { + "epoch": 0.5048, + "grad_norm": 0.22535787522792816, + "learning_rate": 1.157123963403168e-05, + "loss": 0.051, + "step": 25240 + }, + { + "epoch": 0.50484, + "grad_norm": 2.887383460998535, + "learning_rate": 1.1569860698462521e-05, + "loss": 0.0537, + "step": 25242 + }, + { + "epoch": 0.50488, + "grad_norm": 0.10927522927522659, + "learning_rate": 1.1568481732288124e-05, + "loss": 0.0098, + "step": 25244 + }, + { + "epoch": 0.50492, + "grad_norm": 0.2788447439670563, + "learning_rate": 1.1567102735535366e-05, + "loss": 0.0064, + "step": 25246 + }, + { + "epoch": 0.50496, + "grad_norm": 7.861612319946289, + "learning_rate": 1.1565723708231133e-05, + "loss": 0.192, + "step": 25248 + }, + { + "epoch": 0.505, + "grad_norm": 1.5365667343139648, + "learning_rate": 1.156434465040231e-05, + "loss": 0.025, + "step": 25250 + }, + { + "epoch": 0.50504, + "grad_norm": 1.8588340282440186, + "learning_rate": 1.1562965562075783e-05, + "loss": 0.0365, + "step": 25252 + }, + { + "epoch": 0.50508, + "grad_norm": 0.4920038878917694, + "learning_rate": 1.1561586443278438e-05, + "loss": 0.0142, + "step": 25254 + }, + { + "epoch": 0.50512, + "grad_norm": 3.3696892261505127, + "learning_rate": 1.1560207294037163e-05, + "loss": 0.075, + "step": 25256 + }, + { + "epoch": 0.50516, + "grad_norm": 0.20995618402957916, + "learning_rate": 1.155882811437884e-05, + "loss": 0.1084, + "step": 25258 + }, + { + "epoch": 0.5052, + "grad_norm": 0.13455362617969513, + "learning_rate": 1.1557448904330362e-05, + "loss": 0.04, + "step": 25260 + }, + { + "epoch": 0.50524, + "grad_norm": 0.1898173689842224, + "learning_rate": 1.1556069663918616e-05, + "loss": 0.012, + "step": 25262 + }, + { + "epoch": 0.50528, + "grad_norm": 2.792837142944336, + "learning_rate": 1.1554690393170494e-05, + "loss": 0.0825, + "step": 25264 + }, + { + "epoch": 0.50532, + "grad_norm": 1.5661802291870117, + "learning_rate": 1.1553311092112878e-05, + "loss": 0.0274, + "step": 25266 + }, + { + "epoch": 0.50536, + "grad_norm": 0.17757633328437805, + "learning_rate": 1.155193176077266e-05, + "loss": 0.0069, + "step": 25268 + }, + { + "epoch": 0.5054, + "grad_norm": 0.19369198381900787, + "learning_rate": 1.155055239917674e-05, + "loss": 0.0051, + "step": 25270 + }, + { + "epoch": 0.50544, + "grad_norm": 0.12627707421779633, + "learning_rate": 1.1549173007352e-05, + "loss": 0.0023, + "step": 25272 + }, + { + "epoch": 0.50548, + "grad_norm": 7.171467304229736, + "learning_rate": 1.1547793585325332e-05, + "loss": 0.5225, + "step": 25274 + }, + { + "epoch": 0.50552, + "grad_norm": 0.11632611602544785, + "learning_rate": 1.1546414133123633e-05, + "loss": 0.0208, + "step": 25276 + }, + { + "epoch": 0.50556, + "grad_norm": 0.0914120152592659, + "learning_rate": 1.1545034650773796e-05, + "loss": 0.2054, + "step": 25278 + }, + { + "epoch": 0.5056, + "grad_norm": 0.06831289082765579, + "learning_rate": 1.1543655138302714e-05, + "loss": 0.0726, + "step": 25280 + }, + { + "epoch": 0.50564, + "grad_norm": 0.9251918792724609, + "learning_rate": 1.1542275595737277e-05, + "loss": 0.0209, + "step": 25282 + }, + { + "epoch": 0.50568, + "grad_norm": 4.217491149902344, + "learning_rate": 1.1540896023104385e-05, + "loss": 0.0827, + "step": 25284 + }, + { + "epoch": 0.50572, + "grad_norm": 1.8891899585723877, + "learning_rate": 1.1539516420430931e-05, + "loss": 0.031, + "step": 25286 + }, + { + "epoch": 0.50576, + "grad_norm": 0.384598046541214, + "learning_rate": 1.1538136787743813e-05, + "loss": 0.0111, + "step": 25288 + }, + { + "epoch": 0.5058, + "grad_norm": 0.07719040662050247, + "learning_rate": 1.1536757125069924e-05, + "loss": 0.0045, + "step": 25290 + }, + { + "epoch": 0.50584, + "grad_norm": 0.08056877553462982, + "learning_rate": 1.1535377432436166e-05, + "loss": 0.0162, + "step": 25292 + }, + { + "epoch": 0.50588, + "grad_norm": 0.12566564977169037, + "learning_rate": 1.1533997709869433e-05, + "loss": 0.0063, + "step": 25294 + }, + { + "epoch": 0.50592, + "grad_norm": 9.627388954162598, + "learning_rate": 1.1532617957396626e-05, + "loss": 0.2003, + "step": 25296 + }, + { + "epoch": 0.50596, + "grad_norm": 8.082025527954102, + "learning_rate": 1.1531238175044644e-05, + "loss": 0.2059, + "step": 25298 + }, + { + "epoch": 0.506, + "grad_norm": 0.6336140632629395, + "learning_rate": 1.1529858362840383e-05, + "loss": 0.0127, + "step": 25300 + }, + { + "epoch": 0.50604, + "grad_norm": 0.03874058276414871, + "learning_rate": 1.1528478520810747e-05, + "loss": 0.0033, + "step": 25302 + }, + { + "epoch": 0.50608, + "grad_norm": 1.3259133100509644, + "learning_rate": 1.1527098648982634e-05, + "loss": 0.0203, + "step": 25304 + }, + { + "epoch": 0.50612, + "grad_norm": 0.1187184676527977, + "learning_rate": 1.1525718747382946e-05, + "loss": 0.006, + "step": 25306 + }, + { + "epoch": 0.50616, + "grad_norm": 0.147477388381958, + "learning_rate": 1.1524338816038587e-05, + "loss": 0.0075, + "step": 25308 + }, + { + "epoch": 0.5062, + "grad_norm": 7.021744251251221, + "learning_rate": 1.1522958854976458e-05, + "loss": 0.3418, + "step": 25310 + }, + { + "epoch": 0.50624, + "grad_norm": 0.1625882238149643, + "learning_rate": 1.1521578864223462e-05, + "loss": 0.0048, + "step": 25312 + }, + { + "epoch": 0.50628, + "grad_norm": 0.44576114416122437, + "learning_rate": 1.1520198843806505e-05, + "loss": 0.0116, + "step": 25314 + }, + { + "epoch": 0.50632, + "grad_norm": 0.29520732164382935, + "learning_rate": 1.1518818793752484e-05, + "loss": 0.0079, + "step": 25316 + }, + { + "epoch": 0.50636, + "grad_norm": 0.003797600045800209, + "learning_rate": 1.1517438714088314e-05, + "loss": 0.0027, + "step": 25318 + }, + { + "epoch": 0.5064, + "grad_norm": 0.2405017465353012, + "learning_rate": 1.1516058604840891e-05, + "loss": 0.022, + "step": 25320 + }, + { + "epoch": 0.50644, + "grad_norm": 2.146263360977173, + "learning_rate": 1.1514678466037126e-05, + "loss": 0.055, + "step": 25322 + }, + { + "epoch": 0.50648, + "grad_norm": 0.11288882046937943, + "learning_rate": 1.1513298297703924e-05, + "loss": 0.0026, + "step": 25324 + }, + { + "epoch": 0.50652, + "grad_norm": 0.022259708493947983, + "learning_rate": 1.1511918099868193e-05, + "loss": 0.0271, + "step": 25326 + }, + { + "epoch": 0.50656, + "grad_norm": 4.264219284057617, + "learning_rate": 1.1510537872556842e-05, + "loss": 0.2859, + "step": 25328 + }, + { + "epoch": 0.5066, + "grad_norm": 8.206864356994629, + "learning_rate": 1.1509157615796775e-05, + "loss": 0.2272, + "step": 25330 + }, + { + "epoch": 0.50664, + "grad_norm": 7.286279678344727, + "learning_rate": 1.1507777329614905e-05, + "loss": 0.5092, + "step": 25332 + }, + { + "epoch": 0.50668, + "grad_norm": 0.25741711258888245, + "learning_rate": 1.1506397014038137e-05, + "loss": 0.0157, + "step": 25334 + }, + { + "epoch": 0.50672, + "grad_norm": 0.010019033215939999, + "learning_rate": 1.150501666909339e-05, + "loss": 0.0052, + "step": 25336 + }, + { + "epoch": 0.50676, + "grad_norm": 0.24400882422924042, + "learning_rate": 1.150363629480756e-05, + "loss": 0.0048, + "step": 25338 + }, + { + "epoch": 0.5068, + "grad_norm": 0.17696748673915863, + "learning_rate": 1.1502255891207572e-05, + "loss": 0.0064, + "step": 25340 + }, + { + "epoch": 0.50684, + "grad_norm": 0.5514408349990845, + "learning_rate": 1.150087545832033e-05, + "loss": 0.0113, + "step": 25342 + }, + { + "epoch": 0.50688, + "grad_norm": 0.06737691909074783, + "learning_rate": 1.1499494996172748e-05, + "loss": 0.0432, + "step": 25344 + }, + { + "epoch": 0.50692, + "grad_norm": 0.12299779057502747, + "learning_rate": 1.149811450479174e-05, + "loss": 0.0045, + "step": 25346 + }, + { + "epoch": 0.50696, + "grad_norm": 0.17790544033050537, + "learning_rate": 1.149673398420422e-05, + "loss": 0.0059, + "step": 25348 + }, + { + "epoch": 0.507, + "grad_norm": 7.9182024002075195, + "learning_rate": 1.1495353434437098e-05, + "loss": 0.1802, + "step": 25350 + }, + { + "epoch": 0.50704, + "grad_norm": 0.0842953473329544, + "learning_rate": 1.149397285551729e-05, + "loss": 0.139, + "step": 25352 + }, + { + "epoch": 0.50708, + "grad_norm": 0.6564203500747681, + "learning_rate": 1.1492592247471715e-05, + "loss": 0.0254, + "step": 25354 + }, + { + "epoch": 0.50712, + "grad_norm": 1.4241111278533936, + "learning_rate": 1.1491211610327282e-05, + "loss": 0.0247, + "step": 25356 + }, + { + "epoch": 0.50716, + "grad_norm": 5.365379810333252, + "learning_rate": 1.1489830944110914e-05, + "loss": 0.093, + "step": 25358 + }, + { + "epoch": 0.5072, + "grad_norm": 0.060174886137247086, + "learning_rate": 1.1488450248849523e-05, + "loss": 0.0144, + "step": 25360 + }, + { + "epoch": 0.50724, + "grad_norm": 0.08914550393819809, + "learning_rate": 1.1487069524570029e-05, + "loss": 0.0112, + "step": 25362 + }, + { + "epoch": 0.50728, + "grad_norm": 2.9307737350463867, + "learning_rate": 1.1485688771299348e-05, + "loss": 0.1791, + "step": 25364 + }, + { + "epoch": 0.50732, + "grad_norm": 0.3022654950618744, + "learning_rate": 1.1484307989064401e-05, + "loss": 0.0095, + "step": 25366 + }, + { + "epoch": 0.50736, + "grad_norm": 0.18087898194789886, + "learning_rate": 1.1482927177892107e-05, + "loss": 0.1293, + "step": 25368 + }, + { + "epoch": 0.5074, + "grad_norm": 1.4875408411026, + "learning_rate": 1.1481546337809381e-05, + "loss": 0.0383, + "step": 25370 + }, + { + "epoch": 0.50744, + "grad_norm": 0.11822689324617386, + "learning_rate": 1.1480165468843148e-05, + "loss": 0.017, + "step": 25372 + }, + { + "epoch": 0.50748, + "grad_norm": 1.6375008821487427, + "learning_rate": 1.1478784571020325e-05, + "loss": 0.0312, + "step": 25374 + }, + { + "epoch": 0.50752, + "grad_norm": 6.97691535949707, + "learning_rate": 1.1477403644367839e-05, + "loss": 0.7251, + "step": 25376 + }, + { + "epoch": 0.50756, + "grad_norm": 3.225081205368042, + "learning_rate": 1.1476022688912604e-05, + "loss": 0.0635, + "step": 25378 + }, + { + "epoch": 0.5076, + "grad_norm": 9.10552978515625, + "learning_rate": 1.1474641704681551e-05, + "loss": 0.3983, + "step": 25380 + }, + { + "epoch": 0.50764, + "grad_norm": 10.239252090454102, + "learning_rate": 1.1473260691701597e-05, + "loss": 0.3052, + "step": 25382 + }, + { + "epoch": 0.50768, + "grad_norm": 0.33059510588645935, + "learning_rate": 1.1471879649999667e-05, + "loss": 0.0161, + "step": 25384 + }, + { + "epoch": 0.50772, + "grad_norm": 0.5657035112380981, + "learning_rate": 1.1470498579602687e-05, + "loss": 0.0145, + "step": 25386 + }, + { + "epoch": 0.50776, + "grad_norm": 0.16977669298648834, + "learning_rate": 1.146911748053758e-05, + "loss": 0.0447, + "step": 25388 + }, + { + "epoch": 0.5078, + "grad_norm": 0.3758429288864136, + "learning_rate": 1.1467736352831266e-05, + "loss": 0.0084, + "step": 25390 + }, + { + "epoch": 0.50784, + "grad_norm": 0.9118502736091614, + "learning_rate": 1.1466355196510683e-05, + "loss": 0.0905, + "step": 25392 + }, + { + "epoch": 0.50788, + "grad_norm": 0.16047999262809753, + "learning_rate": 1.1464974011602747e-05, + "loss": 0.0495, + "step": 25394 + }, + { + "epoch": 0.50792, + "grad_norm": 0.5050129294395447, + "learning_rate": 1.1463592798134389e-05, + "loss": 0.0103, + "step": 25396 + }, + { + "epoch": 0.50796, + "grad_norm": 0.33647748827934265, + "learning_rate": 1.1462211556132536e-05, + "loss": 0.0471, + "step": 25398 + }, + { + "epoch": 0.508, + "grad_norm": 0.48659747838974, + "learning_rate": 1.1460830285624119e-05, + "loss": 0.0131, + "step": 25400 + }, + { + "epoch": 0.50804, + "grad_norm": 5.457479476928711, + "learning_rate": 1.1459448986636061e-05, + "loss": 0.2783, + "step": 25402 + }, + { + "epoch": 0.50808, + "grad_norm": 0.14508816599845886, + "learning_rate": 1.1458067659195291e-05, + "loss": 0.0368, + "step": 25404 + }, + { + "epoch": 0.50812, + "grad_norm": 0.19442638754844666, + "learning_rate": 1.1456686303328743e-05, + "loss": 0.0047, + "step": 25406 + }, + { + "epoch": 0.50816, + "grad_norm": 0.047886453568935394, + "learning_rate": 1.1455304919063348e-05, + "loss": 0.0158, + "step": 25408 + }, + { + "epoch": 0.5082, + "grad_norm": 0.09358753263950348, + "learning_rate": 1.1453923506426032e-05, + "loss": 0.0187, + "step": 25410 + }, + { + "epoch": 0.50824, + "grad_norm": 0.5068225860595703, + "learning_rate": 1.1452542065443728e-05, + "loss": 0.0147, + "step": 25412 + }, + { + "epoch": 0.50828, + "grad_norm": 4.016885757446289, + "learning_rate": 1.145116059614337e-05, + "loss": 0.0773, + "step": 25414 + }, + { + "epoch": 0.50832, + "grad_norm": 0.252331405878067, + "learning_rate": 1.144977909855189e-05, + "loss": 0.0058, + "step": 25416 + }, + { + "epoch": 0.50836, + "grad_norm": 0.43279820680618286, + "learning_rate": 1.144839757269622e-05, + "loss": 0.0071, + "step": 25418 + }, + { + "epoch": 0.5084, + "grad_norm": 0.27570468187332153, + "learning_rate": 1.1447016018603293e-05, + "loss": 0.0136, + "step": 25420 + }, + { + "epoch": 0.50844, + "grad_norm": 0.4368174970149994, + "learning_rate": 1.1445634436300041e-05, + "loss": 0.0076, + "step": 25422 + }, + { + "epoch": 0.50848, + "grad_norm": 1.095167875289917, + "learning_rate": 1.1444252825813407e-05, + "loss": 0.0305, + "step": 25424 + }, + { + "epoch": 0.50852, + "grad_norm": 0.07374872267246246, + "learning_rate": 1.1442871187170316e-05, + "loss": 0.1637, + "step": 25426 + }, + { + "epoch": 0.50856, + "grad_norm": 0.012351618148386478, + "learning_rate": 1.144148952039771e-05, + "loss": 0.0214, + "step": 25428 + }, + { + "epoch": 0.5086, + "grad_norm": 0.2655238211154938, + "learning_rate": 1.1440107825522522e-05, + "loss": 0.0182, + "step": 25430 + }, + { + "epoch": 0.50864, + "grad_norm": 0.12543976306915283, + "learning_rate": 1.1438726102571694e-05, + "loss": 0.0111, + "step": 25432 + }, + { + "epoch": 0.50868, + "grad_norm": 10.636273384094238, + "learning_rate": 1.1437344351572157e-05, + "loss": 0.4175, + "step": 25434 + }, + { + "epoch": 0.50872, + "grad_norm": 0.7419483661651611, + "learning_rate": 1.1435962572550853e-05, + "loss": 0.0129, + "step": 25436 + }, + { + "epoch": 0.50876, + "grad_norm": 0.0782681256532669, + "learning_rate": 1.143458076553472e-05, + "loss": 0.0025, + "step": 25438 + }, + { + "epoch": 0.5088, + "grad_norm": 0.2825784981250763, + "learning_rate": 1.1433198930550694e-05, + "loss": 0.0428, + "step": 25440 + }, + { + "epoch": 0.50884, + "grad_norm": 0.6156742572784424, + "learning_rate": 1.1431817067625719e-05, + "loss": 0.081, + "step": 25442 + }, + { + "epoch": 0.50888, + "grad_norm": 0.08964650332927704, + "learning_rate": 1.1430435176786734e-05, + "loss": 0.0117, + "step": 25444 + }, + { + "epoch": 0.50892, + "grad_norm": 11.795438766479492, + "learning_rate": 1.1429053258060676e-05, + "loss": 0.592, + "step": 25446 + }, + { + "epoch": 0.50896, + "grad_norm": 0.03138967230916023, + "learning_rate": 1.1427671311474489e-05, + "loss": 0.004, + "step": 25448 + }, + { + "epoch": 0.509, + "grad_norm": 1.9993648529052734, + "learning_rate": 1.1426289337055119e-05, + "loss": 0.0336, + "step": 25450 + }, + { + "epoch": 0.50904, + "grad_norm": 7.721449375152588, + "learning_rate": 1.14249073348295e-05, + "loss": 0.2561, + "step": 25452 + }, + { + "epoch": 0.50908, + "grad_norm": 10.34350299835205, + "learning_rate": 1.1423525304824581e-05, + "loss": 0.3051, + "step": 25454 + }, + { + "epoch": 0.50912, + "grad_norm": 0.32112669944763184, + "learning_rate": 1.1422143247067301e-05, + "loss": 0.0089, + "step": 25456 + }, + { + "epoch": 0.50916, + "grad_norm": 0.192326158285141, + "learning_rate": 1.142076116158461e-05, + "loss": 0.0216, + "step": 25458 + }, + { + "epoch": 0.5092, + "grad_norm": 0.23236465454101562, + "learning_rate": 1.1419379048403446e-05, + "loss": 0.0065, + "step": 25460 + }, + { + "epoch": 0.50924, + "grad_norm": 1.1044167280197144, + "learning_rate": 1.1417996907550757e-05, + "loss": 0.0341, + "step": 25462 + }, + { + "epoch": 0.50928, + "grad_norm": 0.37920793890953064, + "learning_rate": 1.1416614739053487e-05, + "loss": 0.0079, + "step": 25464 + }, + { + "epoch": 0.50932, + "grad_norm": 0.2823484241962433, + "learning_rate": 1.1415232542938586e-05, + "loss": 0.0762, + "step": 25466 + }, + { + "epoch": 0.50936, + "grad_norm": 8.807631492614746, + "learning_rate": 1.1413850319232995e-05, + "loss": 0.4502, + "step": 25468 + }, + { + "epoch": 0.5094, + "grad_norm": 0.1380489617586136, + "learning_rate": 1.141246806796367e-05, + "loss": 0.074, + "step": 25470 + }, + { + "epoch": 0.50944, + "grad_norm": 2.6684842109680176, + "learning_rate": 1.1411085789157546e-05, + "loss": 0.0554, + "step": 25472 + }, + { + "epoch": 0.50948, + "grad_norm": 0.18576960265636444, + "learning_rate": 1.1409703482841584e-05, + "loss": 0.0144, + "step": 25474 + }, + { + "epoch": 0.50952, + "grad_norm": 0.4050545394420624, + "learning_rate": 1.1408321149042721e-05, + "loss": 0.0091, + "step": 25476 + }, + { + "epoch": 0.50956, + "grad_norm": 1.0810153484344482, + "learning_rate": 1.1406938787787914e-05, + "loss": 0.1948, + "step": 25478 + }, + { + "epoch": 0.5096, + "grad_norm": 1.5166752338409424, + "learning_rate": 1.140555639910411e-05, + "loss": 0.0389, + "step": 25480 + }, + { + "epoch": 0.50964, + "grad_norm": 0.19263191521167755, + "learning_rate": 1.1404173983018266e-05, + "loss": 0.0501, + "step": 25482 + }, + { + "epoch": 0.50968, + "grad_norm": 0.04033713415265083, + "learning_rate": 1.140279153955732e-05, + "loss": 0.0078, + "step": 25484 + }, + { + "epoch": 0.50972, + "grad_norm": 5.9230427742004395, + "learning_rate": 1.1401409068748235e-05, + "loss": 0.1162, + "step": 25486 + }, + { + "epoch": 0.50976, + "grad_norm": 0.5399721264839172, + "learning_rate": 1.1400026570617959e-05, + "loss": 0.0552, + "step": 25488 + }, + { + "epoch": 0.5098, + "grad_norm": 0.04495928809046745, + "learning_rate": 1.1398644045193443e-05, + "loss": 0.0801, + "step": 25490 + }, + { + "epoch": 0.50984, + "grad_norm": 0.029662998393177986, + "learning_rate": 1.139726149250164e-05, + "loss": 0.0036, + "step": 25492 + }, + { + "epoch": 0.50988, + "grad_norm": 1.050740122795105, + "learning_rate": 1.1395878912569506e-05, + "loss": 0.0421, + "step": 25494 + }, + { + "epoch": 0.50992, + "grad_norm": 4.082086563110352, + "learning_rate": 1.1394496305423994e-05, + "loss": 0.0904, + "step": 25496 + }, + { + "epoch": 0.50996, + "grad_norm": 0.036015696823596954, + "learning_rate": 1.139311367109206e-05, + "loss": 0.0381, + "step": 25498 + }, + { + "epoch": 0.51, + "grad_norm": 0.11697985976934433, + "learning_rate": 1.1391731009600655e-05, + "loss": 0.1286, + "step": 25500 + }, + { + "epoch": 0.51004, + "grad_norm": 0.46057844161987305, + "learning_rate": 1.1390348320976738e-05, + "loss": 0.0261, + "step": 25502 + }, + { + "epoch": 0.51008, + "grad_norm": 6.3799052238464355, + "learning_rate": 1.1388965605247269e-05, + "loss": 0.1434, + "step": 25504 + }, + { + "epoch": 0.51012, + "grad_norm": 7.034229278564453, + "learning_rate": 1.1387582862439198e-05, + "loss": 0.2535, + "step": 25506 + }, + { + "epoch": 0.51016, + "grad_norm": 0.3921196460723877, + "learning_rate": 1.1386200092579483e-05, + "loss": 0.0103, + "step": 25508 + }, + { + "epoch": 0.5102, + "grad_norm": 0.2107616513967514, + "learning_rate": 1.1384817295695083e-05, + "loss": 0.0041, + "step": 25510 + }, + { + "epoch": 0.51024, + "grad_norm": 1.1024750471115112, + "learning_rate": 1.1383434471812957e-05, + "loss": 0.0207, + "step": 25512 + }, + { + "epoch": 0.51028, + "grad_norm": 0.20728036761283875, + "learning_rate": 1.1382051620960067e-05, + "loss": 0.007, + "step": 25514 + }, + { + "epoch": 0.51032, + "grad_norm": 0.2393142431974411, + "learning_rate": 1.1380668743163366e-05, + "loss": 0.0194, + "step": 25516 + }, + { + "epoch": 0.51036, + "grad_norm": 1.455000400543213, + "learning_rate": 1.1379285838449817e-05, + "loss": 0.0239, + "step": 25518 + }, + { + "epoch": 0.5104, + "grad_norm": 0.15666437149047852, + "learning_rate": 1.137790290684638e-05, + "loss": 0.3565, + "step": 25520 + }, + { + "epoch": 0.51044, + "grad_norm": 0.2779100835323334, + "learning_rate": 1.137651994838002e-05, + "loss": 0.0069, + "step": 25522 + }, + { + "epoch": 0.51048, + "grad_norm": 0.07333400100469589, + "learning_rate": 1.1375136963077695e-05, + "loss": 0.0026, + "step": 25524 + }, + { + "epoch": 0.51052, + "grad_norm": 0.04894920811057091, + "learning_rate": 1.1373753950966364e-05, + "loss": 0.0014, + "step": 25526 + }, + { + "epoch": 0.51056, + "grad_norm": 0.339486688375473, + "learning_rate": 1.1372370912072991e-05, + "loss": 0.6144, + "step": 25528 + }, + { + "epoch": 0.5106, + "grad_norm": 0.3342268466949463, + "learning_rate": 1.1370987846424547e-05, + "loss": 0.0149, + "step": 25530 + }, + { + "epoch": 0.51064, + "grad_norm": 0.1767198145389557, + "learning_rate": 1.1369604754047982e-05, + "loss": 0.0554, + "step": 25532 + }, + { + "epoch": 0.51068, + "grad_norm": 0.19618569314479828, + "learning_rate": 1.136822163497027e-05, + "loss": 0.034, + "step": 25534 + }, + { + "epoch": 0.51072, + "grad_norm": 3.112215757369995, + "learning_rate": 1.136683848921837e-05, + "loss": 0.0479, + "step": 25536 + }, + { + "epoch": 0.51076, + "grad_norm": 0.2865694463253021, + "learning_rate": 1.1365455316819255e-05, + "loss": 0.0172, + "step": 25538 + }, + { + "epoch": 0.5108, + "grad_norm": 0.724200427532196, + "learning_rate": 1.1364072117799884e-05, + "loss": 0.0372, + "step": 25540 + }, + { + "epoch": 0.51084, + "grad_norm": 1.161482810974121, + "learning_rate": 1.1362688892187226e-05, + "loss": 0.0401, + "step": 25542 + }, + { + "epoch": 0.51088, + "grad_norm": 0.6193623542785645, + "learning_rate": 1.1361305640008245e-05, + "loss": 0.013, + "step": 25544 + }, + { + "epoch": 0.51092, + "grad_norm": 0.6415063142776489, + "learning_rate": 1.1359922361289908e-05, + "loss": 0.0991, + "step": 25546 + }, + { + "epoch": 0.51096, + "grad_norm": 0.5347509384155273, + "learning_rate": 1.1358539056059186e-05, + "loss": 0.011, + "step": 25548 + }, + { + "epoch": 0.511, + "grad_norm": 0.29506468772888184, + "learning_rate": 1.1357155724343046e-05, + "loss": 0.0099, + "step": 25550 + }, + { + "epoch": 0.51104, + "grad_norm": 1.9432384967803955, + "learning_rate": 1.1355772366168454e-05, + "loss": 0.121, + "step": 25552 + }, + { + "epoch": 0.51108, + "grad_norm": 0.35685214400291443, + "learning_rate": 1.1354388981562384e-05, + "loss": 0.0232, + "step": 25554 + }, + { + "epoch": 0.51112, + "grad_norm": 0.4276372790336609, + "learning_rate": 1.1353005570551803e-05, + "loss": 0.034, + "step": 25556 + }, + { + "epoch": 0.51116, + "grad_norm": 4.075132369995117, + "learning_rate": 1.1351622133163684e-05, + "loss": 0.0605, + "step": 25558 + }, + { + "epoch": 0.5112, + "grad_norm": 0.6879112124443054, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.0388, + "step": 25560 + }, + { + "epoch": 0.51124, + "grad_norm": 6.461786270141602, + "learning_rate": 1.1348855179362705e-05, + "loss": 0.2843, + "step": 25562 + }, + { + "epoch": 0.51128, + "grad_norm": 0.4164138734340668, + "learning_rate": 1.1347471663003791e-05, + "loss": 0.01, + "step": 25564 + }, + { + "epoch": 0.51132, + "grad_norm": 0.005491485353559256, + "learning_rate": 1.1346088120375224e-05, + "loss": 0.002, + "step": 25566 + }, + { + "epoch": 0.51136, + "grad_norm": 3.111361265182495, + "learning_rate": 1.1344704551503973e-05, + "loss": 0.0771, + "step": 25568 + }, + { + "epoch": 0.5114, + "grad_norm": 0.3379030227661133, + "learning_rate": 1.1343320956417015e-05, + "loss": 0.1314, + "step": 25570 + }, + { + "epoch": 0.51144, + "grad_norm": 0.023498469963669777, + "learning_rate": 1.1341937335141327e-05, + "loss": 0.0527, + "step": 25572 + }, + { + "epoch": 0.51148, + "grad_norm": 7.833118438720703, + "learning_rate": 1.1340553687703877e-05, + "loss": 0.3452, + "step": 25574 + }, + { + "epoch": 0.51152, + "grad_norm": 1.4201871156692505, + "learning_rate": 1.1339170014131645e-05, + "loss": 0.1201, + "step": 25576 + }, + { + "epoch": 0.51156, + "grad_norm": 0.4157942533493042, + "learning_rate": 1.13377863144516e-05, + "loss": 0.0185, + "step": 25578 + }, + { + "epoch": 0.5116, + "grad_norm": 0.3595886528491974, + "learning_rate": 1.1336402588690727e-05, + "loss": 0.0073, + "step": 25580 + }, + { + "epoch": 0.51164, + "grad_norm": 6.682578086853027, + "learning_rate": 1.1335018836875992e-05, + "loss": 0.1011, + "step": 25582 + }, + { + "epoch": 0.51168, + "grad_norm": 0.9781031012535095, + "learning_rate": 1.133363505903438e-05, + "loss": 0.4855, + "step": 25584 + }, + { + "epoch": 0.51172, + "grad_norm": 0.45664048194885254, + "learning_rate": 1.1332251255192866e-05, + "loss": 0.0797, + "step": 25586 + }, + { + "epoch": 0.51176, + "grad_norm": 3.455428123474121, + "learning_rate": 1.1330867425378428e-05, + "loss": 0.1136, + "step": 25588 + }, + { + "epoch": 0.5118, + "grad_norm": 0.4038996696472168, + "learning_rate": 1.1329483569618045e-05, + "loss": 0.013, + "step": 25590 + }, + { + "epoch": 0.51184, + "grad_norm": 0.11936644464731216, + "learning_rate": 1.1328099687938696e-05, + "loss": 0.0073, + "step": 25592 + }, + { + "epoch": 0.51188, + "grad_norm": 0.07849162071943283, + "learning_rate": 1.1326715780367359e-05, + "loss": 0.0018, + "step": 25594 + }, + { + "epoch": 0.51192, + "grad_norm": 0.37361571192741394, + "learning_rate": 1.1325331846931017e-05, + "loss": 0.0054, + "step": 25596 + }, + { + "epoch": 0.51196, + "grad_norm": 0.7207719683647156, + "learning_rate": 1.1323947887656643e-05, + "loss": 0.1398, + "step": 25598 + }, + { + "epoch": 0.512, + "grad_norm": 3.5348684787750244, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.0621, + "step": 25600 + }, + { + "epoch": 0.51204, + "grad_norm": 8.543627738952637, + "learning_rate": 1.1321179891701743e-05, + "loss": 0.4144, + "step": 25602 + }, + { + "epoch": 0.51208, + "grad_norm": 0.13598193228244781, + "learning_rate": 1.1319795855075182e-05, + "loss": 0.0036, + "step": 25604 + }, + { + "epoch": 0.51212, + "grad_norm": 0.037671349942684174, + "learning_rate": 1.1318411792718518e-05, + "loss": 0.0057, + "step": 25606 + }, + { + "epoch": 0.51216, + "grad_norm": 4.520040512084961, + "learning_rate": 1.131702770465874e-05, + "loss": 0.1142, + "step": 25608 + }, + { + "epoch": 0.5122, + "grad_norm": 10.340364456176758, + "learning_rate": 1.1315643590922827e-05, + "loss": 0.7108, + "step": 25610 + }, + { + "epoch": 0.51224, + "grad_norm": 0.1523045152425766, + "learning_rate": 1.1314259451537766e-05, + "loss": 0.0039, + "step": 25612 + }, + { + "epoch": 0.51228, + "grad_norm": 0.1489941030740738, + "learning_rate": 1.1312875286530537e-05, + "loss": 0.0045, + "step": 25614 + }, + { + "epoch": 0.51232, + "grad_norm": 0.7030998468399048, + "learning_rate": 1.1311491095928133e-05, + "loss": 0.0153, + "step": 25616 + }, + { + "epoch": 0.51236, + "grad_norm": 0.0992182046175003, + "learning_rate": 1.1310106879757532e-05, + "loss": 0.2521, + "step": 25618 + }, + { + "epoch": 0.5124, + "grad_norm": 5.835574150085449, + "learning_rate": 1.1308722638045724e-05, + "loss": 0.2013, + "step": 25620 + }, + { + "epoch": 0.51244, + "grad_norm": 8.144350051879883, + "learning_rate": 1.1307338370819694e-05, + "loss": 0.2773, + "step": 25622 + }, + { + "epoch": 0.51248, + "grad_norm": 0.4679543972015381, + "learning_rate": 1.1305954078106427e-05, + "loss": 0.0084, + "step": 25624 + }, + { + "epoch": 0.51252, + "grad_norm": 0.9074338674545288, + "learning_rate": 1.1304569759932917e-05, + "loss": 0.0234, + "step": 25626 + }, + { + "epoch": 0.51256, + "grad_norm": 2.7980546951293945, + "learning_rate": 1.1303185416326148e-05, + "loss": 0.0538, + "step": 25628 + }, + { + "epoch": 0.5126, + "grad_norm": 0.4935486912727356, + "learning_rate": 1.1301801047313106e-05, + "loss": 0.0121, + "step": 25630 + }, + { + "epoch": 0.51264, + "grad_norm": 0.08430974185466766, + "learning_rate": 1.1300416652920783e-05, + "loss": 0.0037, + "step": 25632 + }, + { + "epoch": 0.51268, + "grad_norm": 0.3377213478088379, + "learning_rate": 1.1299032233176165e-05, + "loss": 0.0206, + "step": 25634 + }, + { + "epoch": 0.51272, + "grad_norm": 2.3545849323272705, + "learning_rate": 1.1297647788106251e-05, + "loss": 0.0939, + "step": 25636 + }, + { + "epoch": 0.51276, + "grad_norm": 3.8547048568725586, + "learning_rate": 1.1296263317738021e-05, + "loss": 0.09, + "step": 25638 + }, + { + "epoch": 0.5128, + "grad_norm": 0.09125081449747086, + "learning_rate": 1.129487882209847e-05, + "loss": 0.0161, + "step": 25640 + }, + { + "epoch": 0.51284, + "grad_norm": 3.8152694702148438, + "learning_rate": 1.1293494301214594e-05, + "loss": 0.152, + "step": 25642 + }, + { + "epoch": 0.51288, + "grad_norm": 0.0606033056974411, + "learning_rate": 1.1292109755113377e-05, + "loss": 0.0431, + "step": 25644 + }, + { + "epoch": 0.51292, + "grad_norm": 0.8936419486999512, + "learning_rate": 1.1290725183821816e-05, + "loss": 0.0943, + "step": 25646 + }, + { + "epoch": 0.51296, + "grad_norm": 0.12374190986156464, + "learning_rate": 1.1289340587366901e-05, + "loss": 0.0148, + "step": 25648 + }, + { + "epoch": 0.513, + "grad_norm": 3.581334114074707, + "learning_rate": 1.128795596577563e-05, + "loss": 0.096, + "step": 25650 + }, + { + "epoch": 0.51304, + "grad_norm": 0.18994610011577606, + "learning_rate": 1.1286571319074992e-05, + "loss": 0.0125, + "step": 25652 + }, + { + "epoch": 0.51308, + "grad_norm": 1.4075530767440796, + "learning_rate": 1.1285186647291985e-05, + "loss": 0.0737, + "step": 25654 + }, + { + "epoch": 0.51312, + "grad_norm": 6.926531791687012, + "learning_rate": 1.1283801950453603e-05, + "loss": 0.2014, + "step": 25656 + }, + { + "epoch": 0.51316, + "grad_norm": 0.1409662663936615, + "learning_rate": 1.1282417228586842e-05, + "loss": 0.003, + "step": 25658 + }, + { + "epoch": 0.5132, + "grad_norm": 6.277205944061279, + "learning_rate": 1.1281032481718696e-05, + "loss": 0.208, + "step": 25660 + }, + { + "epoch": 0.51324, + "grad_norm": 0.042475625872612, + "learning_rate": 1.1279647709876165e-05, + "loss": 0.0057, + "step": 25662 + }, + { + "epoch": 0.51328, + "grad_norm": 1.595754623413086, + "learning_rate": 1.1278262913086238e-05, + "loss": 0.0382, + "step": 25664 + }, + { + "epoch": 0.51332, + "grad_norm": 0.150416299700737, + "learning_rate": 1.127687809137592e-05, + "loss": 0.0544, + "step": 25666 + }, + { + "epoch": 0.51336, + "grad_norm": 1.2813128232955933, + "learning_rate": 1.1275493244772205e-05, + "loss": 0.0304, + "step": 25668 + }, + { + "epoch": 0.5134, + "grad_norm": 0.19371671974658966, + "learning_rate": 1.1274108373302095e-05, + "loss": 0.0088, + "step": 25670 + }, + { + "epoch": 0.51344, + "grad_norm": 0.6340610980987549, + "learning_rate": 1.1272723476992582e-05, + "loss": 0.0132, + "step": 25672 + }, + { + "epoch": 0.51348, + "grad_norm": 0.09800193458795547, + "learning_rate": 1.1271338555870673e-05, + "loss": 0.0661, + "step": 25674 + }, + { + "epoch": 0.51352, + "grad_norm": 0.23842956125736237, + "learning_rate": 1.1269953609963364e-05, + "loss": 0.0042, + "step": 25676 + }, + { + "epoch": 0.51356, + "grad_norm": 20.990129470825195, + "learning_rate": 1.1268568639297654e-05, + "loss": 0.2364, + "step": 25678 + }, + { + "epoch": 0.5136, + "grad_norm": 0.6501579880714417, + "learning_rate": 1.1267183643900548e-05, + "loss": 0.0674, + "step": 25680 + }, + { + "epoch": 0.51364, + "grad_norm": 1.7088394165039062, + "learning_rate": 1.1265798623799043e-05, + "loss": 0.331, + "step": 25682 + }, + { + "epoch": 0.51368, + "grad_norm": 0.34526389837265015, + "learning_rate": 1.1264413579020141e-05, + "loss": 0.0051, + "step": 25684 + }, + { + "epoch": 0.51372, + "grad_norm": 1.5854307413101196, + "learning_rate": 1.1263028509590847e-05, + "loss": 0.0419, + "step": 25686 + }, + { + "epoch": 0.51376, + "grad_norm": 0.30952274799346924, + "learning_rate": 1.1261643415538162e-05, + "loss": 0.0154, + "step": 25688 + }, + { + "epoch": 0.5138, + "grad_norm": 1.0876747369766235, + "learning_rate": 1.1260258296889086e-05, + "loss": 0.0316, + "step": 25690 + }, + { + "epoch": 0.51384, + "grad_norm": 1.4097256660461426, + "learning_rate": 1.125887315367063e-05, + "loss": 0.0361, + "step": 25692 + }, + { + "epoch": 0.51388, + "grad_norm": 0.05195668339729309, + "learning_rate": 1.1257487985909792e-05, + "loss": 0.0025, + "step": 25694 + }, + { + "epoch": 0.51392, + "grad_norm": 0.7030275464057922, + "learning_rate": 1.1256102793633579e-05, + "loss": 0.0161, + "step": 25696 + }, + { + "epoch": 0.51396, + "grad_norm": 0.10923147201538086, + "learning_rate": 1.1254717576868994e-05, + "loss": 0.0065, + "step": 25698 + }, + { + "epoch": 0.514, + "grad_norm": 10.174997329711914, + "learning_rate": 1.1253332335643043e-05, + "loss": 0.6509, + "step": 25700 + }, + { + "epoch": 0.51404, + "grad_norm": 3.226008892059326, + "learning_rate": 1.1251947069982736e-05, + "loss": 0.0505, + "step": 25702 + }, + { + "epoch": 0.51408, + "grad_norm": 0.5094591379165649, + "learning_rate": 1.1250561779915075e-05, + "loss": 0.026, + "step": 25704 + }, + { + "epoch": 0.51412, + "grad_norm": 0.24614500999450684, + "learning_rate": 1.1249176465467066e-05, + "loss": 0.0042, + "step": 25706 + }, + { + "epoch": 0.51416, + "grad_norm": 0.023976480588316917, + "learning_rate": 1.124779112666572e-05, + "loss": 0.0018, + "step": 25708 + }, + { + "epoch": 0.5142, + "grad_norm": 5.029190540313721, + "learning_rate": 1.1246405763538047e-05, + "loss": 0.2815, + "step": 25710 + }, + { + "epoch": 0.51424, + "grad_norm": 0.11072373390197754, + "learning_rate": 1.1245020376111046e-05, + "loss": 0.0187, + "step": 25712 + }, + { + "epoch": 0.51428, + "grad_norm": 5.092423915863037, + "learning_rate": 1.1243634964411736e-05, + "loss": 0.1144, + "step": 25714 + }, + { + "epoch": 0.51432, + "grad_norm": 0.3981836438179016, + "learning_rate": 1.1242249528467122e-05, + "loss": 0.0113, + "step": 25716 + }, + { + "epoch": 0.51436, + "grad_norm": 1.4504181146621704, + "learning_rate": 1.1240864068304214e-05, + "loss": 0.0385, + "step": 25718 + }, + { + "epoch": 0.5144, + "grad_norm": 0.16960874199867249, + "learning_rate": 1.1239478583950019e-05, + "loss": 0.0027, + "step": 25720 + }, + { + "epoch": 0.51444, + "grad_norm": 0.11348660290241241, + "learning_rate": 1.1238093075431553e-05, + "loss": 0.0067, + "step": 25722 + }, + { + "epoch": 0.51448, + "grad_norm": 0.23958458006381989, + "learning_rate": 1.1236707542775824e-05, + "loss": 0.0924, + "step": 25724 + }, + { + "epoch": 0.51452, + "grad_norm": 0.038472022861242294, + "learning_rate": 1.1235321986009846e-05, + "loss": 0.0651, + "step": 25726 + }, + { + "epoch": 0.51456, + "grad_norm": 0.49248456954956055, + "learning_rate": 1.123393640516063e-05, + "loss": 0.0076, + "step": 25728 + }, + { + "epoch": 0.5146, + "grad_norm": 0.06172837316989899, + "learning_rate": 1.1232550800255188e-05, + "loss": 0.0037, + "step": 25730 + }, + { + "epoch": 0.51464, + "grad_norm": 0.05947710573673248, + "learning_rate": 1.1231165171320533e-05, + "loss": 0.0017, + "step": 25732 + }, + { + "epoch": 0.51468, + "grad_norm": 1.702142596244812, + "learning_rate": 1.122977951838368e-05, + "loss": 0.0316, + "step": 25734 + }, + { + "epoch": 0.51472, + "grad_norm": 1.5186893939971924, + "learning_rate": 1.1228393841471644e-05, + "loss": 0.0197, + "step": 25736 + }, + { + "epoch": 0.51476, + "grad_norm": 0.8490279316902161, + "learning_rate": 1.1227008140611435e-05, + "loss": 0.0293, + "step": 25738 + }, + { + "epoch": 0.5148, + "grad_norm": 0.03051161766052246, + "learning_rate": 1.1225622415830068e-05, + "loss": 0.0121, + "step": 25740 + }, + { + "epoch": 0.51484, + "grad_norm": 1.5436677932739258, + "learning_rate": 1.1224236667154566e-05, + "loss": 0.0674, + "step": 25742 + }, + { + "epoch": 0.51488, + "grad_norm": 4.959597110748291, + "learning_rate": 1.1222850894611939e-05, + "loss": 0.1715, + "step": 25744 + }, + { + "epoch": 0.51492, + "grad_norm": 0.02311425656080246, + "learning_rate": 1.1221465098229205e-05, + "loss": 0.003, + "step": 25746 + }, + { + "epoch": 0.51496, + "grad_norm": 0.05162786692380905, + "learning_rate": 1.1220079278033378e-05, + "loss": 0.1633, + "step": 25748 + }, + { + "epoch": 0.515, + "grad_norm": 0.1205090805888176, + "learning_rate": 1.1218693434051475e-05, + "loss": 0.0538, + "step": 25750 + }, + { + "epoch": 0.51504, + "grad_norm": 0.22523051500320435, + "learning_rate": 1.121730756631052e-05, + "loss": 0.0495, + "step": 25752 + }, + { + "epoch": 0.51508, + "grad_norm": 0.021891148760914803, + "learning_rate": 1.1215921674837527e-05, + "loss": 0.1628, + "step": 25754 + }, + { + "epoch": 0.51512, + "grad_norm": 3.704132556915283, + "learning_rate": 1.1214535759659513e-05, + "loss": 0.0726, + "step": 25756 + }, + { + "epoch": 0.51516, + "grad_norm": 3.5538268089294434, + "learning_rate": 1.12131498208035e-05, + "loss": 0.0606, + "step": 25758 + }, + { + "epoch": 0.5152, + "grad_norm": 8.029019355773926, + "learning_rate": 1.1211763858296507e-05, + "loss": 0.2026, + "step": 25760 + }, + { + "epoch": 0.51524, + "grad_norm": 0.24857933819293976, + "learning_rate": 1.1210377872165552e-05, + "loss": 0.0038, + "step": 25762 + }, + { + "epoch": 0.51528, + "grad_norm": 1.4012573957443237, + "learning_rate": 1.1208991862437662e-05, + "loss": 0.0392, + "step": 25764 + }, + { + "epoch": 0.51532, + "grad_norm": 0.48687753081321716, + "learning_rate": 1.1207605829139848e-05, + "loss": 0.0302, + "step": 25766 + }, + { + "epoch": 0.51536, + "grad_norm": 0.158638596534729, + "learning_rate": 1.120621977229914e-05, + "loss": 0.2213, + "step": 25768 + }, + { + "epoch": 0.5154, + "grad_norm": 0.29217395186424255, + "learning_rate": 1.1204833691942553e-05, + "loss": 0.2718, + "step": 25770 + }, + { + "epoch": 0.51544, + "grad_norm": 0.05971800908446312, + "learning_rate": 1.1203447588097115e-05, + "loss": 0.001, + "step": 25772 + }, + { + "epoch": 0.51548, + "grad_norm": 0.30691543221473694, + "learning_rate": 1.1202061460789846e-05, + "loss": 0.0108, + "step": 25774 + }, + { + "epoch": 0.51552, + "grad_norm": 3.750365734100342, + "learning_rate": 1.1200675310047773e-05, + "loss": 0.0563, + "step": 25776 + }, + { + "epoch": 0.51556, + "grad_norm": 1.3344318866729736, + "learning_rate": 1.1199289135897913e-05, + "loss": 0.3552, + "step": 25778 + }, + { + "epoch": 0.5156, + "grad_norm": 0.015987439081072807, + "learning_rate": 1.1197902938367297e-05, + "loss": 0.0015, + "step": 25780 + }, + { + "epoch": 0.51564, + "grad_norm": 2.08910870552063, + "learning_rate": 1.1196516717482943e-05, + "loss": 0.0737, + "step": 25782 + }, + { + "epoch": 0.51568, + "grad_norm": 2.7584617137908936, + "learning_rate": 1.1195130473271885e-05, + "loss": 0.0364, + "step": 25784 + }, + { + "epoch": 0.51572, + "grad_norm": 0.057149291038513184, + "learning_rate": 1.1193744205761136e-05, + "loss": 0.0156, + "step": 25786 + }, + { + "epoch": 0.51576, + "grad_norm": 0.13517898321151733, + "learning_rate": 1.1192357914977735e-05, + "loss": 0.0026, + "step": 25788 + }, + { + "epoch": 0.5158, + "grad_norm": 0.17334163188934326, + "learning_rate": 1.11909716009487e-05, + "loss": 0.0062, + "step": 25790 + }, + { + "epoch": 0.51584, + "grad_norm": 0.543694019317627, + "learning_rate": 1.118958526370106e-05, + "loss": 0.1335, + "step": 25792 + }, + { + "epoch": 0.51588, + "grad_norm": 0.029715074226260185, + "learning_rate": 1.1188198903261842e-05, + "loss": 0.0473, + "step": 25794 + }, + { + "epoch": 0.51592, + "grad_norm": 7.990741729736328, + "learning_rate": 1.1186812519658076e-05, + "loss": 0.1768, + "step": 25796 + }, + { + "epoch": 0.51596, + "grad_norm": 0.3478615880012512, + "learning_rate": 1.118542611291679e-05, + "loss": 0.5337, + "step": 25798 + }, + { + "epoch": 0.516, + "grad_norm": 0.9623531103134155, + "learning_rate": 1.1184039683065014e-05, + "loss": 0.0576, + "step": 25800 + }, + { + "epoch": 0.51604, + "grad_norm": 0.7966538071632385, + "learning_rate": 1.118265323012977e-05, + "loss": 0.048, + "step": 25802 + }, + { + "epoch": 0.51608, + "grad_norm": 0.34366026520729065, + "learning_rate": 1.1181266754138091e-05, + "loss": 0.0421, + "step": 25804 + }, + { + "epoch": 0.51612, + "grad_norm": 1.0151318311691284, + "learning_rate": 1.1179880255117013e-05, + "loss": 0.0152, + "step": 25806 + }, + { + "epoch": 0.51616, + "grad_norm": 1.408479928970337, + "learning_rate": 1.117849373309356e-05, + "loss": 0.2229, + "step": 25808 + }, + { + "epoch": 0.5162, + "grad_norm": 0.4819766879081726, + "learning_rate": 1.1177107188094765e-05, + "loss": 0.0091, + "step": 25810 + }, + { + "epoch": 0.51624, + "grad_norm": 7.182643413543701, + "learning_rate": 1.117572062014766e-05, + "loss": 0.2763, + "step": 25812 + }, + { + "epoch": 0.51628, + "grad_norm": 0.05230380967259407, + "learning_rate": 1.1174334029279276e-05, + "loss": 0.008, + "step": 25814 + }, + { + "epoch": 0.51632, + "grad_norm": 0.6880418658256531, + "learning_rate": 1.1172947415516643e-05, + "loss": 0.0525, + "step": 25816 + }, + { + "epoch": 0.51636, + "grad_norm": 0.5059282183647156, + "learning_rate": 1.1171560778886803e-05, + "loss": 0.0089, + "step": 25818 + }, + { + "epoch": 0.5164, + "grad_norm": 0.6490634083747864, + "learning_rate": 1.1170174119416778e-05, + "loss": 0.3627, + "step": 25820 + }, + { + "epoch": 0.51644, + "grad_norm": 0.023799702525138855, + "learning_rate": 1.1168787437133604e-05, + "loss": 0.0136, + "step": 25822 + }, + { + "epoch": 0.51648, + "grad_norm": 0.047376591712236404, + "learning_rate": 1.1167400732064319e-05, + "loss": 0.0479, + "step": 25824 + }, + { + "epoch": 0.51652, + "grad_norm": 10.151888847351074, + "learning_rate": 1.1166014004235957e-05, + "loss": 0.5073, + "step": 25826 + }, + { + "epoch": 0.51656, + "grad_norm": 4.394889831542969, + "learning_rate": 1.1164627253675551e-05, + "loss": 0.0632, + "step": 25828 + }, + { + "epoch": 0.5166, + "grad_norm": 9.502250671386719, + "learning_rate": 1.1163240480410136e-05, + "loss": 0.3936, + "step": 25830 + }, + { + "epoch": 0.51664, + "grad_norm": 0.635400652885437, + "learning_rate": 1.116185368446675e-05, + "loss": 0.173, + "step": 25832 + }, + { + "epoch": 0.51668, + "grad_norm": 10.015397071838379, + "learning_rate": 1.116046686587243e-05, + "loss": 0.2859, + "step": 25834 + }, + { + "epoch": 0.51672, + "grad_norm": 1.1152065992355347, + "learning_rate": 1.1159080024654207e-05, + "loss": 0.0174, + "step": 25836 + }, + { + "epoch": 0.51676, + "grad_norm": 11.905075073242188, + "learning_rate": 1.1157693160839125e-05, + "loss": 0.3604, + "step": 25838 + }, + { + "epoch": 0.5168, + "grad_norm": 0.9701817631721497, + "learning_rate": 1.1156306274454218e-05, + "loss": 0.0137, + "step": 25840 + }, + { + "epoch": 0.51684, + "grad_norm": 0.565684974193573, + "learning_rate": 1.1154919365526524e-05, + "loss": 0.0117, + "step": 25842 + }, + { + "epoch": 0.51688, + "grad_norm": 0.021617436781525612, + "learning_rate": 1.1153532434083083e-05, + "loss": 0.0216, + "step": 25844 + }, + { + "epoch": 0.51692, + "grad_norm": 12.50875473022461, + "learning_rate": 1.1152145480150934e-05, + "loss": 0.3258, + "step": 25846 + }, + { + "epoch": 0.51696, + "grad_norm": 0.8959413170814514, + "learning_rate": 1.1150758503757116e-05, + "loss": 0.0142, + "step": 25848 + }, + { + "epoch": 0.517, + "grad_norm": 0.07044629007577896, + "learning_rate": 1.1149371504928667e-05, + "loss": 0.0018, + "step": 25850 + }, + { + "epoch": 0.51704, + "grad_norm": 0.36847472190856934, + "learning_rate": 1.114798448369263e-05, + "loss": 0.0076, + "step": 25852 + }, + { + "epoch": 0.51708, + "grad_norm": 0.7002848982810974, + "learning_rate": 1.1146597440076043e-05, + "loss": 0.248, + "step": 25854 + }, + { + "epoch": 0.51712, + "grad_norm": 0.02057218737900257, + "learning_rate": 1.114521037410595e-05, + "loss": 0.2044, + "step": 25856 + }, + { + "epoch": 0.51716, + "grad_norm": 0.2863292098045349, + "learning_rate": 1.1143823285809392e-05, + "loss": 0.3776, + "step": 25858 + }, + { + "epoch": 0.5172, + "grad_norm": 4.9816365242004395, + "learning_rate": 1.1142436175213409e-05, + "loss": 0.0861, + "step": 25860 + }, + { + "epoch": 0.51724, + "grad_norm": 2.195424795150757, + "learning_rate": 1.1141049042345045e-05, + "loss": 0.0434, + "step": 25862 + }, + { + "epoch": 0.51728, + "grad_norm": 0.1417004019021988, + "learning_rate": 1.113966188723134e-05, + "loss": 0.0443, + "step": 25864 + }, + { + "epoch": 0.51732, + "grad_norm": 1.4168384075164795, + "learning_rate": 1.1138274709899343e-05, + "loss": 0.0269, + "step": 25866 + }, + { + "epoch": 0.51736, + "grad_norm": 0.3090262711048126, + "learning_rate": 1.1136887510376095e-05, + "loss": 0.0081, + "step": 25868 + }, + { + "epoch": 0.5174, + "grad_norm": 0.5161823630332947, + "learning_rate": 1.1135500288688636e-05, + "loss": 0.0137, + "step": 25870 + }, + { + "epoch": 0.51744, + "grad_norm": 0.3961809575557709, + "learning_rate": 1.1134113044864016e-05, + "loss": 0.0187, + "step": 25872 + }, + { + "epoch": 0.51748, + "grad_norm": 4.144495487213135, + "learning_rate": 1.113272577892928e-05, + "loss": 0.06, + "step": 25874 + }, + { + "epoch": 0.51752, + "grad_norm": 13.49681282043457, + "learning_rate": 1.113133849091147e-05, + "loss": 0.6913, + "step": 25876 + }, + { + "epoch": 0.51756, + "grad_norm": 0.4188225567340851, + "learning_rate": 1.1129951180837632e-05, + "loss": 0.197, + "step": 25878 + }, + { + "epoch": 0.5176, + "grad_norm": 0.02809799462556839, + "learning_rate": 1.1128563848734817e-05, + "loss": 0.0028, + "step": 25880 + }, + { + "epoch": 0.51764, + "grad_norm": 0.3608945608139038, + "learning_rate": 1.112717649463007e-05, + "loss": 0.1443, + "step": 25882 + }, + { + "epoch": 0.51768, + "grad_norm": 0.2458459436893463, + "learning_rate": 1.1125789118550432e-05, + "loss": 0.2232, + "step": 25884 + }, + { + "epoch": 0.51772, + "grad_norm": 0.9170827865600586, + "learning_rate": 1.1124401720522962e-05, + "loss": 0.0142, + "step": 25886 + }, + { + "epoch": 0.51776, + "grad_norm": 0.11373819410800934, + "learning_rate": 1.1123014300574695e-05, + "loss": 0.0147, + "step": 25888 + }, + { + "epoch": 0.5178, + "grad_norm": 0.6127526760101318, + "learning_rate": 1.112162685873269e-05, + "loss": 0.0112, + "step": 25890 + }, + { + "epoch": 0.51784, + "grad_norm": 0.25882488489151, + "learning_rate": 1.112023939502399e-05, + "loss": 0.005, + "step": 25892 + }, + { + "epoch": 0.51788, + "grad_norm": 4.0532546043396, + "learning_rate": 1.1118851909475647e-05, + "loss": 0.0692, + "step": 25894 + }, + { + "epoch": 0.51792, + "grad_norm": 0.9964383244514465, + "learning_rate": 1.111746440211471e-05, + "loss": 0.0151, + "step": 25896 + }, + { + "epoch": 0.51796, + "grad_norm": 1.1634153127670288, + "learning_rate": 1.1116076872968232e-05, + "loss": 0.021, + "step": 25898 + }, + { + "epoch": 0.518, + "grad_norm": 0.5160863995552063, + "learning_rate": 1.1114689322063255e-05, + "loss": 0.0122, + "step": 25900 + }, + { + "epoch": 0.51804, + "grad_norm": 0.9320271015167236, + "learning_rate": 1.1113301749426842e-05, + "loss": 0.1669, + "step": 25902 + }, + { + "epoch": 0.51808, + "grad_norm": 6.216498851776123, + "learning_rate": 1.1111914155086032e-05, + "loss": 0.1061, + "step": 25904 + }, + { + "epoch": 0.51812, + "grad_norm": 0.05672333389520645, + "learning_rate": 1.111052653906789e-05, + "loss": 0.003, + "step": 25906 + }, + { + "epoch": 0.51816, + "grad_norm": 0.2110857516527176, + "learning_rate": 1.1109138901399454e-05, + "loss": 0.0049, + "step": 25908 + }, + { + "epoch": 0.5182, + "grad_norm": 0.18534882366657257, + "learning_rate": 1.1107751242107786e-05, + "loss": 0.0088, + "step": 25910 + }, + { + "epoch": 0.51824, + "grad_norm": 1.5441750288009644, + "learning_rate": 1.110636356121994e-05, + "loss": 0.036, + "step": 25912 + }, + { + "epoch": 0.51828, + "grad_norm": 0.11577341705560684, + "learning_rate": 1.1104975858762967e-05, + "loss": 0.0361, + "step": 25914 + }, + { + "epoch": 0.51832, + "grad_norm": 1.981433629989624, + "learning_rate": 1.1103588134763918e-05, + "loss": 0.0372, + "step": 25916 + }, + { + "epoch": 0.51836, + "grad_norm": 2.856367588043213, + "learning_rate": 1.110220038924985e-05, + "loss": 0.1316, + "step": 25918 + }, + { + "epoch": 0.5184, + "grad_norm": 1.06260085105896, + "learning_rate": 1.1100812622247823e-05, + "loss": 0.0232, + "step": 25920 + }, + { + "epoch": 0.51844, + "grad_norm": 3.560757875442505, + "learning_rate": 1.1099424833784884e-05, + "loss": 0.0539, + "step": 25922 + }, + { + "epoch": 0.51848, + "grad_norm": 0.3369103670120239, + "learning_rate": 1.1098037023888092e-05, + "loss": 0.0222, + "step": 25924 + }, + { + "epoch": 0.51852, + "grad_norm": 0.22837160527706146, + "learning_rate": 1.1096649192584501e-05, + "loss": 0.0616, + "step": 25926 + }, + { + "epoch": 0.51856, + "grad_norm": 0.15883350372314453, + "learning_rate": 1.109526133990117e-05, + "loss": 0.0153, + "step": 25928 + }, + { + "epoch": 0.5186, + "grad_norm": 2.933267116546631, + "learning_rate": 1.1093873465865156e-05, + "loss": 0.0658, + "step": 25930 + }, + { + "epoch": 0.51864, + "grad_norm": 2.8515655994415283, + "learning_rate": 1.1092485570503517e-05, + "loss": 0.0602, + "step": 25932 + }, + { + "epoch": 0.51868, + "grad_norm": 1.0879662036895752, + "learning_rate": 1.1091097653843305e-05, + "loss": 0.0262, + "step": 25934 + }, + { + "epoch": 0.51872, + "grad_norm": 5.877429962158203, + "learning_rate": 1.108970971591159e-05, + "loss": 0.1395, + "step": 25936 + }, + { + "epoch": 0.51876, + "grad_norm": 3.8534233570098877, + "learning_rate": 1.1088321756735418e-05, + "loss": 0.081, + "step": 25938 + }, + { + "epoch": 0.5188, + "grad_norm": 0.016764894127845764, + "learning_rate": 1.1086933776341853e-05, + "loss": 0.0014, + "step": 25940 + }, + { + "epoch": 0.51884, + "grad_norm": 0.24612091481685638, + "learning_rate": 1.1085545774757956e-05, + "loss": 0.0249, + "step": 25942 + }, + { + "epoch": 0.51888, + "grad_norm": 0.02421277016401291, + "learning_rate": 1.1084157752010782e-05, + "loss": 0.1629, + "step": 25944 + }, + { + "epoch": 0.51892, + "grad_norm": 2.4265241622924805, + "learning_rate": 1.10827697081274e-05, + "loss": 0.0623, + "step": 25946 + }, + { + "epoch": 0.51896, + "grad_norm": 0.6444478034973145, + "learning_rate": 1.1081381643134861e-05, + "loss": 0.081, + "step": 25948 + }, + { + "epoch": 0.519, + "grad_norm": 8.038579940795898, + "learning_rate": 1.1079993557060228e-05, + "loss": 0.2023, + "step": 25950 + }, + { + "epoch": 0.51904, + "grad_norm": 1.180158257484436, + "learning_rate": 1.1078605449930569e-05, + "loss": 0.191, + "step": 25952 + }, + { + "epoch": 0.51908, + "grad_norm": 0.08138783276081085, + "learning_rate": 1.1077217321772942e-05, + "loss": 0.0026, + "step": 25954 + }, + { + "epoch": 0.51912, + "grad_norm": 1.1543784141540527, + "learning_rate": 1.1075829172614407e-05, + "loss": 0.0196, + "step": 25956 + }, + { + "epoch": 0.51916, + "grad_norm": 0.4315066635608673, + "learning_rate": 1.1074441002482027e-05, + "loss": 0.0767, + "step": 25958 + }, + { + "epoch": 0.5192, + "grad_norm": 0.04612398147583008, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.022, + "step": 25960 + }, + { + "epoch": 0.51924, + "grad_norm": 7.328697204589844, + "learning_rate": 1.1071664599403991e-05, + "loss": 0.1394, + "step": 25962 + }, + { + "epoch": 0.51928, + "grad_norm": 0.570597231388092, + "learning_rate": 1.1070276366512463e-05, + "loss": 0.015, + "step": 25964 + }, + { + "epoch": 0.51932, + "grad_norm": 1.0916519165039062, + "learning_rate": 1.1068888112755343e-05, + "loss": 0.0137, + "step": 25966 + }, + { + "epoch": 0.51936, + "grad_norm": 0.038479845970869064, + "learning_rate": 1.10674998381597e-05, + "loss": 0.1269, + "step": 25968 + }, + { + "epoch": 0.5194, + "grad_norm": 0.4722753167152405, + "learning_rate": 1.10661115427526e-05, + "loss": 0.0081, + "step": 25970 + }, + { + "epoch": 0.51944, + "grad_norm": 0.013431471772491932, + "learning_rate": 1.1064723226561107e-05, + "loss": 0.002, + "step": 25972 + }, + { + "epoch": 0.51948, + "grad_norm": 0.10042447596788406, + "learning_rate": 1.1063334889612285e-05, + "loss": 0.0042, + "step": 25974 + }, + { + "epoch": 0.51952, + "grad_norm": 0.02001630701124668, + "learning_rate": 1.10619465319332e-05, + "loss": 0.3735, + "step": 25976 + }, + { + "epoch": 0.51956, + "grad_norm": 5.873444557189941, + "learning_rate": 1.1060558153550923e-05, + "loss": 0.3164, + "step": 25978 + }, + { + "epoch": 0.5196, + "grad_norm": 1.6501132249832153, + "learning_rate": 1.105916975449252e-05, + "loss": 0.0362, + "step": 25980 + }, + { + "epoch": 0.51964, + "grad_norm": 0.021483445540070534, + "learning_rate": 1.1057781334785054e-05, + "loss": 0.2509, + "step": 25982 + }, + { + "epoch": 0.51968, + "grad_norm": 5.736363410949707, + "learning_rate": 1.1056392894455596e-05, + "loss": 0.1068, + "step": 25984 + }, + { + "epoch": 0.51972, + "grad_norm": 0.034441027790308, + "learning_rate": 1.1055004433531218e-05, + "loss": 0.0722, + "step": 25986 + }, + { + "epoch": 0.51976, + "grad_norm": 2.434174060821533, + "learning_rate": 1.1053615952038984e-05, + "loss": 0.0441, + "step": 25988 + }, + { + "epoch": 0.5198, + "grad_norm": 0.13118958473205566, + "learning_rate": 1.1052227450005968e-05, + "loss": 0.005, + "step": 25990 + }, + { + "epoch": 0.51984, + "grad_norm": 0.2126355767250061, + "learning_rate": 1.105083892745923e-05, + "loss": 0.0557, + "step": 25992 + }, + { + "epoch": 0.51988, + "grad_norm": 0.24468356370925903, + "learning_rate": 1.1049450384425849e-05, + "loss": 0.0066, + "step": 25994 + }, + { + "epoch": 0.51992, + "grad_norm": 0.29817578196525574, + "learning_rate": 1.1048061820932894e-05, + "loss": 0.0313, + "step": 25996 + }, + { + "epoch": 0.51996, + "grad_norm": 0.9318892955780029, + "learning_rate": 1.104667323700743e-05, + "loss": 0.0198, + "step": 25998 + }, + { + "epoch": 0.52, + "grad_norm": 0.014441663399338722, + "learning_rate": 1.1045284632676535e-05, + "loss": 0.0015, + "step": 26000 + }, + { + "epoch": 0.52004, + "grad_norm": 0.09310849010944366, + "learning_rate": 1.1043896007967278e-05, + "loss": 0.0126, + "step": 26002 + }, + { + "epoch": 0.52008, + "grad_norm": 0.42830878496170044, + "learning_rate": 1.1042507362906732e-05, + "loss": 0.0083, + "step": 26004 + }, + { + "epoch": 0.52012, + "grad_norm": 0.25970399379730225, + "learning_rate": 1.1041118697521968e-05, + "loss": 0.0078, + "step": 26006 + }, + { + "epoch": 0.52016, + "grad_norm": 0.042612235993146896, + "learning_rate": 1.1039730011840058e-05, + "loss": 0.0477, + "step": 26008 + }, + { + "epoch": 0.5202, + "grad_norm": 0.06052291765809059, + "learning_rate": 1.1038341305888074e-05, + "loss": 0.0129, + "step": 26010 + }, + { + "epoch": 0.52024, + "grad_norm": 0.6352026462554932, + "learning_rate": 1.1036952579693096e-05, + "loss": 0.0209, + "step": 26012 + }, + { + "epoch": 0.52028, + "grad_norm": 12.470151901245117, + "learning_rate": 1.1035563833282191e-05, + "loss": 0.5713, + "step": 26014 + }, + { + "epoch": 0.52032, + "grad_norm": 0.7025101780891418, + "learning_rate": 1.1034175066682435e-05, + "loss": 0.0177, + "step": 26016 + }, + { + "epoch": 0.52036, + "grad_norm": 0.11829572170972824, + "learning_rate": 1.1032786279920905e-05, + "loss": 0.0032, + "step": 26018 + }, + { + "epoch": 0.5204, + "grad_norm": 9.629467010498047, + "learning_rate": 1.1031397473024674e-05, + "loss": 0.674, + "step": 26020 + }, + { + "epoch": 0.52044, + "grad_norm": 0.7409912347793579, + "learning_rate": 1.103000864602082e-05, + "loss": 0.0139, + "step": 26022 + }, + { + "epoch": 0.52048, + "grad_norm": 0.061340637505054474, + "learning_rate": 1.1028619798936418e-05, + "loss": 0.003, + "step": 26024 + }, + { + "epoch": 0.52052, + "grad_norm": 5.554502010345459, + "learning_rate": 1.102723093179854e-05, + "loss": 0.1008, + "step": 26026 + }, + { + "epoch": 0.52056, + "grad_norm": 0.12371434271335602, + "learning_rate": 1.102584204463427e-05, + "loss": 0.023, + "step": 26028 + }, + { + "epoch": 0.5206, + "grad_norm": 0.0318363755941391, + "learning_rate": 1.1024453137470677e-05, + "loss": 0.0011, + "step": 26030 + }, + { + "epoch": 0.52064, + "grad_norm": 0.07998578995466232, + "learning_rate": 1.1023064210334845e-05, + "loss": 0.0041, + "step": 26032 + }, + { + "epoch": 0.52068, + "grad_norm": 0.018337303772568703, + "learning_rate": 1.1021675263253845e-05, + "loss": 0.0021, + "step": 26034 + }, + { + "epoch": 0.52072, + "grad_norm": 0.1000121533870697, + "learning_rate": 1.1020286296254768e-05, + "loss": 0.0041, + "step": 26036 + }, + { + "epoch": 0.52076, + "grad_norm": 0.2100212723016739, + "learning_rate": 1.1018897309364678e-05, + "loss": 0.007, + "step": 26038 + }, + { + "epoch": 0.5208, + "grad_norm": 0.05176030844449997, + "learning_rate": 1.1017508302610665e-05, + "loss": 0.0199, + "step": 26040 + }, + { + "epoch": 0.52084, + "grad_norm": 0.11371594667434692, + "learning_rate": 1.10161192760198e-05, + "loss": 0.0356, + "step": 26042 + }, + { + "epoch": 0.52088, + "grad_norm": 0.9420391321182251, + "learning_rate": 1.101473022961917e-05, + "loss": 0.0169, + "step": 26044 + }, + { + "epoch": 0.52092, + "grad_norm": 0.04001902416348457, + "learning_rate": 1.101334116343585e-05, + "loss": 0.0063, + "step": 26046 + }, + { + "epoch": 0.52096, + "grad_norm": 1.1805957555770874, + "learning_rate": 1.1011952077496919e-05, + "loss": 0.0185, + "step": 26048 + }, + { + "epoch": 0.521, + "grad_norm": 0.038524508476257324, + "learning_rate": 1.1010562971829464e-05, + "loss": 0.0014, + "step": 26050 + }, + { + "epoch": 0.52104, + "grad_norm": 1.5351862907409668, + "learning_rate": 1.1009173846460565e-05, + "loss": 0.0313, + "step": 26052 + }, + { + "epoch": 0.52108, + "grad_norm": 1.2220206260681152, + "learning_rate": 1.1007784701417302e-05, + "loss": 0.0165, + "step": 26054 + }, + { + "epoch": 0.52112, + "grad_norm": 0.6160152554512024, + "learning_rate": 1.1006395536726755e-05, + "loss": 0.0382, + "step": 26056 + }, + { + "epoch": 0.52116, + "grad_norm": 0.23948512971401215, + "learning_rate": 1.1005006352416012e-05, + "loss": 0.023, + "step": 26058 + }, + { + "epoch": 0.5212, + "grad_norm": 0.19126282632350922, + "learning_rate": 1.1003617148512149e-05, + "loss": 0.016, + "step": 26060 + }, + { + "epoch": 0.52124, + "grad_norm": 0.11858147382736206, + "learning_rate": 1.1002227925042257e-05, + "loss": 0.0063, + "step": 26062 + }, + { + "epoch": 0.52128, + "grad_norm": 0.703114926815033, + "learning_rate": 1.1000838682033415e-05, + "loss": 0.01, + "step": 26064 + }, + { + "epoch": 0.52132, + "grad_norm": 0.6413401365280151, + "learning_rate": 1.0999449419512704e-05, + "loss": 0.1262, + "step": 26066 + }, + { + "epoch": 0.52136, + "grad_norm": 8.310418128967285, + "learning_rate": 1.0998060137507213e-05, + "loss": 0.1262, + "step": 26068 + }, + { + "epoch": 0.5214, + "grad_norm": 1.3642995357513428, + "learning_rate": 1.099667083604403e-05, + "loss": 0.0174, + "step": 26070 + }, + { + "epoch": 0.52144, + "grad_norm": 0.6409021615982056, + "learning_rate": 1.0995281515150232e-05, + "loss": 0.0115, + "step": 26072 + }, + { + "epoch": 0.52148, + "grad_norm": 0.49941933155059814, + "learning_rate": 1.0993892174852911e-05, + "loss": 0.0189, + "step": 26074 + }, + { + "epoch": 0.52152, + "grad_norm": 11.575167655944824, + "learning_rate": 1.0992502815179149e-05, + "loss": 0.5103, + "step": 26076 + }, + { + "epoch": 0.52156, + "grad_norm": 0.3606087267398834, + "learning_rate": 1.0991113436156034e-05, + "loss": 0.0107, + "step": 26078 + }, + { + "epoch": 0.5216, + "grad_norm": 0.22306901216506958, + "learning_rate": 1.0989724037810651e-05, + "loss": 0.0039, + "step": 26080 + }, + { + "epoch": 0.52164, + "grad_norm": 0.11550795286893845, + "learning_rate": 1.098833462017009e-05, + "loss": 0.0059, + "step": 26082 + }, + { + "epoch": 0.52168, + "grad_norm": 0.17104019224643707, + "learning_rate": 1.0986945183261434e-05, + "loss": 0.0037, + "step": 26084 + }, + { + "epoch": 0.52172, + "grad_norm": 2.5958001613616943, + "learning_rate": 1.0985555727111778e-05, + "loss": 0.0566, + "step": 26086 + }, + { + "epoch": 0.52176, + "grad_norm": 0.31045886874198914, + "learning_rate": 1.0984166251748203e-05, + "loss": 0.0117, + "step": 26088 + }, + { + "epoch": 0.5218, + "grad_norm": 0.14619766175746918, + "learning_rate": 1.0982776757197799e-05, + "loss": 0.0101, + "step": 26090 + }, + { + "epoch": 0.52184, + "grad_norm": 0.9940387010574341, + "learning_rate": 1.0981387243487658e-05, + "loss": 0.016, + "step": 26092 + }, + { + "epoch": 0.52188, + "grad_norm": 2.809271812438965, + "learning_rate": 1.097999771064487e-05, + "loss": 0.0445, + "step": 26094 + }, + { + "epoch": 0.52192, + "grad_norm": 0.02163929119706154, + "learning_rate": 1.0978608158696517e-05, + "loss": 0.0272, + "step": 26096 + }, + { + "epoch": 0.52196, + "grad_norm": 0.6801024675369263, + "learning_rate": 1.0977218587669695e-05, + "loss": 0.0134, + "step": 26098 + }, + { + "epoch": 0.522, + "grad_norm": 0.17566877603530884, + "learning_rate": 1.0975828997591496e-05, + "loss": 0.0048, + "step": 26100 + }, + { + "epoch": 0.52204, + "grad_norm": 0.10146034508943558, + "learning_rate": 1.097443938848901e-05, + "loss": 0.0058, + "step": 26102 + }, + { + "epoch": 0.52208, + "grad_norm": 5.072085857391357, + "learning_rate": 1.097304976038932e-05, + "loss": 0.1208, + "step": 26104 + }, + { + "epoch": 0.52212, + "grad_norm": 0.31616485118865967, + "learning_rate": 1.0971660113319526e-05, + "loss": 0.0254, + "step": 26106 + }, + { + "epoch": 0.52216, + "grad_norm": 1.3601008653640747, + "learning_rate": 1.0970270447306724e-05, + "loss": 0.0358, + "step": 26108 + }, + { + "epoch": 0.5222, + "grad_norm": 8.03628158569336, + "learning_rate": 1.0968880762377994e-05, + "loss": 0.0996, + "step": 26110 + }, + { + "epoch": 0.52224, + "grad_norm": 0.07300654798746109, + "learning_rate": 1.0967491058560438e-05, + "loss": 0.0178, + "step": 26112 + }, + { + "epoch": 0.52228, + "grad_norm": 9.91533088684082, + "learning_rate": 1.0966101335881141e-05, + "loss": 0.2671, + "step": 26114 + }, + { + "epoch": 0.52232, + "grad_norm": 0.027400368824601173, + "learning_rate": 1.0964711594367204e-05, + "loss": 0.0068, + "step": 26116 + }, + { + "epoch": 0.52236, + "grad_norm": 0.10972347110509872, + "learning_rate": 1.096332183404572e-05, + "loss": 0.0023, + "step": 26118 + }, + { + "epoch": 0.5224, + "grad_norm": 0.08463925868272781, + "learning_rate": 1.0961932054943778e-05, + "loss": 0.0062, + "step": 26120 + }, + { + "epoch": 0.52244, + "grad_norm": 0.133084237575531, + "learning_rate": 1.0960542257088475e-05, + "loss": 0.0038, + "step": 26122 + }, + { + "epoch": 0.52248, + "grad_norm": 0.15937654674053192, + "learning_rate": 1.0959152440506906e-05, + "loss": 0.0065, + "step": 26124 + }, + { + "epoch": 0.52252, + "grad_norm": 0.042700447142124176, + "learning_rate": 1.095776260522617e-05, + "loss": 0.0427, + "step": 26126 + }, + { + "epoch": 0.52256, + "grad_norm": 4.9012556076049805, + "learning_rate": 1.0956372751273356e-05, + "loss": 0.1069, + "step": 26128 + }, + { + "epoch": 0.5226, + "grad_norm": 0.06259570270776749, + "learning_rate": 1.0954982878675564e-05, + "loss": 0.1065, + "step": 26130 + }, + { + "epoch": 0.52264, + "grad_norm": 0.009936410933732986, + "learning_rate": 1.0953592987459886e-05, + "loss": 0.0083, + "step": 26132 + }, + { + "epoch": 0.52268, + "grad_norm": 0.12107209861278534, + "learning_rate": 1.0952203077653427e-05, + "loss": 0.0018, + "step": 26134 + }, + { + "epoch": 0.52272, + "grad_norm": 0.06402108073234558, + "learning_rate": 1.0950813149283274e-05, + "loss": 0.0142, + "step": 26136 + }, + { + "epoch": 0.52276, + "grad_norm": 0.07016150653362274, + "learning_rate": 1.0949423202376532e-05, + "loss": 0.0199, + "step": 26138 + }, + { + "epoch": 0.5228, + "grad_norm": 0.013958358205854893, + "learning_rate": 1.0948033236960294e-05, + "loss": 0.0011, + "step": 26140 + }, + { + "epoch": 0.52284, + "grad_norm": 4.97059440612793, + "learning_rate": 1.0946643253061661e-05, + "loss": 0.1012, + "step": 26142 + }, + { + "epoch": 0.52288, + "grad_norm": 0.93296217918396, + "learning_rate": 1.094525325070773e-05, + "loss": 0.0151, + "step": 26144 + }, + { + "epoch": 0.52292, + "grad_norm": 0.014128506183624268, + "learning_rate": 1.0943863229925604e-05, + "loss": 0.1058, + "step": 26146 + }, + { + "epoch": 0.52296, + "grad_norm": 0.1157107800245285, + "learning_rate": 1.0942473190742374e-05, + "loss": 0.1398, + "step": 26148 + }, + { + "epoch": 0.523, + "grad_norm": 0.06672809273004532, + "learning_rate": 1.0941083133185146e-05, + "loss": 0.0019, + "step": 26150 + }, + { + "epoch": 0.52304, + "grad_norm": 11.727568626403809, + "learning_rate": 1.0939693057281015e-05, + "loss": 0.5078, + "step": 26152 + }, + { + "epoch": 0.52308, + "grad_norm": 0.04880139231681824, + "learning_rate": 1.0938302963057086e-05, + "loss": 0.0063, + "step": 26154 + }, + { + "epoch": 0.52312, + "grad_norm": 0.016731398180127144, + "learning_rate": 1.0936912850540458e-05, + "loss": 0.0052, + "step": 26156 + }, + { + "epoch": 0.52316, + "grad_norm": 0.04476553574204445, + "learning_rate": 1.0935522719758231e-05, + "loss": 0.022, + "step": 26158 + }, + { + "epoch": 0.5232, + "grad_norm": 1.2988169193267822, + "learning_rate": 1.0934132570737508e-05, + "loss": 0.0317, + "step": 26160 + }, + { + "epoch": 0.52324, + "grad_norm": 0.025210587307810783, + "learning_rate": 1.0932742403505389e-05, + "loss": 0.0048, + "step": 26162 + }, + { + "epoch": 0.52328, + "grad_norm": 0.3374262750148773, + "learning_rate": 1.0931352218088975e-05, + "loss": 0.0085, + "step": 26164 + }, + { + "epoch": 0.52332, + "grad_norm": 0.4092826545238495, + "learning_rate": 1.0929962014515371e-05, + "loss": 0.0866, + "step": 26166 + }, + { + "epoch": 0.52336, + "grad_norm": 0.38938599824905396, + "learning_rate": 1.092857179281168e-05, + "loss": 0.0055, + "step": 26168 + }, + { + "epoch": 0.5234, + "grad_norm": 0.23700933158397675, + "learning_rate": 1.0927181553005001e-05, + "loss": 0.0103, + "step": 26170 + }, + { + "epoch": 0.52344, + "grad_norm": 0.08299708366394043, + "learning_rate": 1.0925791295122444e-05, + "loss": 0.0046, + "step": 26172 + }, + { + "epoch": 0.52348, + "grad_norm": 0.04405497387051582, + "learning_rate": 1.0924401019191106e-05, + "loss": 0.0011, + "step": 26174 + }, + { + "epoch": 0.52352, + "grad_norm": 0.1644066870212555, + "learning_rate": 1.0923010725238096e-05, + "loss": 0.0056, + "step": 26176 + }, + { + "epoch": 0.52356, + "grad_norm": 0.04175623133778572, + "learning_rate": 1.0921620413290516e-05, + "loss": 0.004, + "step": 26178 + }, + { + "epoch": 0.5236, + "grad_norm": 0.47247812151908875, + "learning_rate": 1.0920230083375474e-05, + "loss": 0.0084, + "step": 26180 + }, + { + "epoch": 0.52364, + "grad_norm": 0.007618095725774765, + "learning_rate": 1.0918839735520068e-05, + "loss": 0.0471, + "step": 26182 + }, + { + "epoch": 0.52368, + "grad_norm": 0.42814624309539795, + "learning_rate": 1.0917449369751414e-05, + "loss": 0.0204, + "step": 26184 + }, + { + "epoch": 0.52372, + "grad_norm": 0.007168436422944069, + "learning_rate": 1.0916058986096606e-05, + "loss": 0.1056, + "step": 26186 + }, + { + "epoch": 0.52376, + "grad_norm": 0.2943163812160492, + "learning_rate": 1.091466858458276e-05, + "loss": 0.1202, + "step": 26188 + }, + { + "epoch": 0.5238, + "grad_norm": 1.5392895936965942, + "learning_rate": 1.0913278165236977e-05, + "loss": 0.0229, + "step": 26190 + }, + { + "epoch": 0.52384, + "grad_norm": 0.26896724104881287, + "learning_rate": 1.091188772808637e-05, + "loss": 0.0063, + "step": 26192 + }, + { + "epoch": 0.52388, + "grad_norm": 10.595925331115723, + "learning_rate": 1.0910497273158039e-05, + "loss": 0.2358, + "step": 26194 + }, + { + "epoch": 0.52392, + "grad_norm": 0.03860298916697502, + "learning_rate": 1.0909106800479097e-05, + "loss": 0.0012, + "step": 26196 + }, + { + "epoch": 0.52396, + "grad_norm": 0.026900725439190865, + "learning_rate": 1.0907716310076649e-05, + "loss": 0.0005, + "step": 26198 + }, + { + "epoch": 0.524, + "grad_norm": 0.017514415085315704, + "learning_rate": 1.0906325801977804e-05, + "loss": 0.0026, + "step": 26200 + }, + { + "epoch": 0.52404, + "grad_norm": 0.00835768599063158, + "learning_rate": 1.0904935276209669e-05, + "loss": 0.0016, + "step": 26202 + }, + { + "epoch": 0.52408, + "grad_norm": 1.0366884469985962, + "learning_rate": 1.0903544732799357e-05, + "loss": 0.015, + "step": 26204 + }, + { + "epoch": 0.52412, + "grad_norm": 9.16769027709961, + "learning_rate": 1.0902154171773973e-05, + "loss": 0.1707, + "step": 26206 + }, + { + "epoch": 0.52416, + "grad_norm": 0.02833292819559574, + "learning_rate": 1.0900763593160633e-05, + "loss": 0.0068, + "step": 26208 + }, + { + "epoch": 0.5242, + "grad_norm": 0.19560281932353973, + "learning_rate": 1.0899372996986439e-05, + "loss": 0.0033, + "step": 26210 + }, + { + "epoch": 0.52424, + "grad_norm": 0.5548677444458008, + "learning_rate": 1.0897982383278508e-05, + "loss": 0.0117, + "step": 26212 + }, + { + "epoch": 0.52428, + "grad_norm": 5.15410852432251, + "learning_rate": 1.0896591752063947e-05, + "loss": 0.0749, + "step": 26214 + }, + { + "epoch": 0.52432, + "grad_norm": 0.1342317759990692, + "learning_rate": 1.0895201103369873e-05, + "loss": 0.0282, + "step": 26216 + }, + { + "epoch": 0.52436, + "grad_norm": 3.3781795501708984, + "learning_rate": 1.0893810437223387e-05, + "loss": 0.0381, + "step": 26218 + }, + { + "epoch": 0.5244, + "grad_norm": 1.954777479171753, + "learning_rate": 1.0892419753651606e-05, + "loss": 0.024, + "step": 26220 + }, + { + "epoch": 0.52444, + "grad_norm": 0.14995774626731873, + "learning_rate": 1.0891029052681643e-05, + "loss": 0.0035, + "step": 26222 + }, + { + "epoch": 0.52448, + "grad_norm": 0.01453237235546112, + "learning_rate": 1.0889638334340612e-05, + "loss": 0.0215, + "step": 26224 + }, + { + "epoch": 0.52452, + "grad_norm": 0.8067145347595215, + "learning_rate": 1.0888247598655622e-05, + "loss": 0.0107, + "step": 26226 + }, + { + "epoch": 0.52456, + "grad_norm": 3.284372091293335, + "learning_rate": 1.0886856845653787e-05, + "loss": 0.0356, + "step": 26228 + }, + { + "epoch": 0.5246, + "grad_norm": 0.9654093384742737, + "learning_rate": 1.0885466075362224e-05, + "loss": 0.0131, + "step": 26230 + }, + { + "epoch": 0.52464, + "grad_norm": 0.07409268617630005, + "learning_rate": 1.0884075287808044e-05, + "loss": 0.0017, + "step": 26232 + }, + { + "epoch": 0.52468, + "grad_norm": 0.028415987268090248, + "learning_rate": 1.0882684483018357e-05, + "loss": 0.0171, + "step": 26234 + }, + { + "epoch": 0.52472, + "grad_norm": 9.520540237426758, + "learning_rate": 1.0881293661020285e-05, + "loss": 0.2283, + "step": 26236 + }, + { + "epoch": 0.52476, + "grad_norm": 0.08391518145799637, + "learning_rate": 1.0879902821840938e-05, + "loss": 0.0349, + "step": 26238 + }, + { + "epoch": 0.5248, + "grad_norm": 0.29951584339141846, + "learning_rate": 1.0878511965507435e-05, + "loss": 0.0039, + "step": 26240 + }, + { + "epoch": 0.52484, + "grad_norm": 0.11463232338428497, + "learning_rate": 1.0877121092046884e-05, + "loss": 0.0019, + "step": 26242 + }, + { + "epoch": 0.52488, + "grad_norm": 0.3911224603652954, + "learning_rate": 1.087573020148641e-05, + "loss": 0.005, + "step": 26244 + }, + { + "epoch": 0.52492, + "grad_norm": 0.09719792008399963, + "learning_rate": 1.0874339293853123e-05, + "loss": 0.0026, + "step": 26246 + }, + { + "epoch": 0.52496, + "grad_norm": 0.05296667665243149, + "learning_rate": 1.0872948369174142e-05, + "loss": 0.0008, + "step": 26248 + }, + { + "epoch": 0.525, + "grad_norm": 0.002242497866973281, + "learning_rate": 1.0871557427476585e-05, + "loss": 0.0792, + "step": 26250 + }, + { + "epoch": 0.52504, + "grad_norm": 0.028402231633663177, + "learning_rate": 1.0870166468787564e-05, + "loss": 0.0016, + "step": 26252 + }, + { + "epoch": 0.52508, + "grad_norm": 8.420186042785645, + "learning_rate": 1.08687754931342e-05, + "loss": 0.1555, + "step": 26254 + }, + { + "epoch": 0.52512, + "grad_norm": 0.007738684304058552, + "learning_rate": 1.0867384500543612e-05, + "loss": 0.0011, + "step": 26256 + }, + { + "epoch": 0.52516, + "grad_norm": 0.010374592617154121, + "learning_rate": 1.0865993491042916e-05, + "loss": 0.0793, + "step": 26258 + }, + { + "epoch": 0.5252, + "grad_norm": 0.03511568903923035, + "learning_rate": 1.086460246465923e-05, + "loss": 0.0009, + "step": 26260 + }, + { + "epoch": 0.52524, + "grad_norm": 7.864818096160889, + "learning_rate": 1.0863211421419675e-05, + "loss": 0.2859, + "step": 26262 + }, + { + "epoch": 0.52528, + "grad_norm": 4.706964015960693, + "learning_rate": 1.086182036135137e-05, + "loss": 0.0813, + "step": 26264 + }, + { + "epoch": 0.52532, + "grad_norm": 0.4651036262512207, + "learning_rate": 1.0860429284481433e-05, + "loss": 0.0849, + "step": 26266 + }, + { + "epoch": 0.52536, + "grad_norm": 0.06071778014302254, + "learning_rate": 1.0859038190836983e-05, + "loss": 0.0477, + "step": 26268 + }, + { + "epoch": 0.5254, + "grad_norm": 0.017190856859087944, + "learning_rate": 1.085764708044514e-05, + "loss": 0.0012, + "step": 26270 + }, + { + "epoch": 0.52544, + "grad_norm": 0.25179553031921387, + "learning_rate": 1.0856255953333026e-05, + "loss": 0.0044, + "step": 26272 + }, + { + "epoch": 0.52548, + "grad_norm": 0.00962358620017767, + "learning_rate": 1.0854864809527765e-05, + "loss": 0.0004, + "step": 26274 + }, + { + "epoch": 0.52552, + "grad_norm": 8.208951950073242, + "learning_rate": 1.0853473649056472e-05, + "loss": 0.1428, + "step": 26276 + }, + { + "epoch": 0.52556, + "grad_norm": 0.33479753136634827, + "learning_rate": 1.0852082471946268e-05, + "loss": 0.3761, + "step": 26278 + }, + { + "epoch": 0.5256, + "grad_norm": 0.10518090426921844, + "learning_rate": 1.0850691278224282e-05, + "loss": 0.039, + "step": 26280 + }, + { + "epoch": 0.52564, + "grad_norm": 0.5871152281761169, + "learning_rate": 1.0849300067917629e-05, + "loss": 0.0092, + "step": 26282 + }, + { + "epoch": 0.52568, + "grad_norm": 0.057262569665908813, + "learning_rate": 1.0847908841053437e-05, + "loss": 0.0529, + "step": 26284 + }, + { + "epoch": 0.52572, + "grad_norm": 0.09842804819345474, + "learning_rate": 1.0846517597658821e-05, + "loss": 0.0279, + "step": 26286 + }, + { + "epoch": 0.52576, + "grad_norm": 0.08697588741779327, + "learning_rate": 1.0845126337760912e-05, + "loss": 0.0034, + "step": 26288 + }, + { + "epoch": 0.5258, + "grad_norm": 0.2203957438468933, + "learning_rate": 1.0843735061386829e-05, + "loss": 0.0044, + "step": 26290 + }, + { + "epoch": 0.52584, + "grad_norm": 14.87230110168457, + "learning_rate": 1.0842343768563697e-05, + "loss": 0.67, + "step": 26292 + }, + { + "epoch": 0.52588, + "grad_norm": 0.0690964087843895, + "learning_rate": 1.0840952459318637e-05, + "loss": 0.0015, + "step": 26294 + }, + { + "epoch": 0.52592, + "grad_norm": 0.04879305511713028, + "learning_rate": 1.0839561133678777e-05, + "loss": 0.0155, + "step": 26296 + }, + { + "epoch": 0.52596, + "grad_norm": 3.1164352893829346, + "learning_rate": 1.0838169791671242e-05, + "loss": 0.0434, + "step": 26298 + }, + { + "epoch": 0.526, + "grad_norm": 1.5434929132461548, + "learning_rate": 1.083677843332316e-05, + "loss": 0.0182, + "step": 26300 + }, + { + "epoch": 0.52604, + "grad_norm": 0.03156234323978424, + "learning_rate": 1.0835387058661644e-05, + "loss": 0.0136, + "step": 26302 + }, + { + "epoch": 0.52608, + "grad_norm": 0.5140184164047241, + "learning_rate": 1.0833995667713827e-05, + "loss": 0.0139, + "step": 26304 + }, + { + "epoch": 0.52612, + "grad_norm": 0.032616276293992996, + "learning_rate": 1.0832604260506841e-05, + "loss": 0.0217, + "step": 26306 + }, + { + "epoch": 0.52616, + "grad_norm": 0.717605471611023, + "learning_rate": 1.0831212837067801e-05, + "loss": 0.1054, + "step": 26308 + }, + { + "epoch": 0.5262, + "grad_norm": 0.010107713751494884, + "learning_rate": 1.082982139742384e-05, + "loss": 0.0149, + "step": 26310 + }, + { + "epoch": 0.52624, + "grad_norm": 0.886735200881958, + "learning_rate": 1.0828429941602082e-05, + "loss": 0.0307, + "step": 26312 + }, + { + "epoch": 0.52628, + "grad_norm": 0.24077743291854858, + "learning_rate": 1.0827038469629663e-05, + "loss": 0.0372, + "step": 26314 + }, + { + "epoch": 0.52632, + "grad_norm": 3.2137351036071777, + "learning_rate": 1.0825646981533698e-05, + "loss": 0.0381, + "step": 26316 + }, + { + "epoch": 0.52636, + "grad_norm": 1.0360592603683472, + "learning_rate": 1.082425547734132e-05, + "loss": 0.0154, + "step": 26318 + }, + { + "epoch": 0.5264, + "grad_norm": 0.08556565642356873, + "learning_rate": 1.0822863957079657e-05, + "loss": 0.0033, + "step": 26320 + }, + { + "epoch": 0.52644, + "grad_norm": 19.61138153076172, + "learning_rate": 1.082147242077584e-05, + "loss": 1.1043, + "step": 26322 + }, + { + "epoch": 0.52648, + "grad_norm": 0.03372828662395477, + "learning_rate": 1.0820080868456993e-05, + "loss": 0.0193, + "step": 26324 + }, + { + "epoch": 0.52652, + "grad_norm": 8.442540168762207, + "learning_rate": 1.0818689300150247e-05, + "loss": 0.1368, + "step": 26326 + }, + { + "epoch": 0.52656, + "grad_norm": 0.33420446515083313, + "learning_rate": 1.081729771588273e-05, + "loss": 0.005, + "step": 26328 + }, + { + "epoch": 0.5266, + "grad_norm": 0.09586931765079498, + "learning_rate": 1.0815906115681579e-05, + "loss": 0.0014, + "step": 26330 + }, + { + "epoch": 0.52664, + "grad_norm": 0.022487666457891464, + "learning_rate": 1.0814514499573911e-05, + "loss": 0.0095, + "step": 26332 + }, + { + "epoch": 0.52668, + "grad_norm": 0.9529041051864624, + "learning_rate": 1.081312286758687e-05, + "loss": 0.0156, + "step": 26334 + }, + { + "epoch": 0.52672, + "grad_norm": 0.27156752347946167, + "learning_rate": 1.0811731219747577e-05, + "loss": 0.0203, + "step": 26336 + }, + { + "epoch": 0.52676, + "grad_norm": 15.363471031188965, + "learning_rate": 1.0810339556083166e-05, + "loss": 0.8202, + "step": 26338 + }, + { + "epoch": 0.5268, + "grad_norm": 0.1954812854528427, + "learning_rate": 1.0808947876620768e-05, + "loss": 0.0452, + "step": 26340 + }, + { + "epoch": 0.52684, + "grad_norm": 1.6898951530456543, + "learning_rate": 1.0807556181387516e-05, + "loss": 0.0276, + "step": 26342 + }, + { + "epoch": 0.52688, + "grad_norm": 7.627818584442139, + "learning_rate": 1.0806164470410538e-05, + "loss": 0.1512, + "step": 26344 + }, + { + "epoch": 0.52692, + "grad_norm": 0.5104392170906067, + "learning_rate": 1.0804772743716971e-05, + "loss": 0.0114, + "step": 26346 + }, + { + "epoch": 0.52696, + "grad_norm": 1.3532130718231201, + "learning_rate": 1.0803381001333943e-05, + "loss": 0.0191, + "step": 26348 + }, + { + "epoch": 0.527, + "grad_norm": 0.18195775151252747, + "learning_rate": 1.0801989243288588e-05, + "loss": 0.0099, + "step": 26350 + }, + { + "epoch": 0.52704, + "grad_norm": 7.580629348754883, + "learning_rate": 1.0800597469608046e-05, + "loss": 0.1649, + "step": 26352 + }, + { + "epoch": 0.52708, + "grad_norm": 0.02780088782310486, + "learning_rate": 1.079920568031944e-05, + "loss": 0.0085, + "step": 26354 + }, + { + "epoch": 0.52712, + "grad_norm": 0.46728482842445374, + "learning_rate": 1.0797813875449907e-05, + "loss": 0.006, + "step": 26356 + }, + { + "epoch": 0.52716, + "grad_norm": 0.4120773673057556, + "learning_rate": 1.0796422055026581e-05, + "loss": 0.112, + "step": 26358 + }, + { + "epoch": 0.5272, + "grad_norm": 0.6524427533149719, + "learning_rate": 1.07950302190766e-05, + "loss": 0.0162, + "step": 26360 + }, + { + "epoch": 0.52724, + "grad_norm": 3.7401599884033203, + "learning_rate": 1.0793638367627093e-05, + "loss": 0.078, + "step": 26362 + }, + { + "epoch": 0.52728, + "grad_norm": 0.7036227583885193, + "learning_rate": 1.07922465007052e-05, + "loss": 0.1054, + "step": 26364 + }, + { + "epoch": 0.52732, + "grad_norm": 0.13618990778923035, + "learning_rate": 1.0790854618338049e-05, + "loss": 0.0025, + "step": 26366 + }, + { + "epoch": 0.52736, + "grad_norm": 0.3211521804332733, + "learning_rate": 1.0789462720552786e-05, + "loss": 0.0089, + "step": 26368 + }, + { + "epoch": 0.5274, + "grad_norm": 0.256818562746048, + "learning_rate": 1.0788070807376536e-05, + "loss": 0.0063, + "step": 26370 + }, + { + "epoch": 0.52744, + "grad_norm": 0.3716055154800415, + "learning_rate": 1.0786678878836444e-05, + "loss": 0.4189, + "step": 26372 + }, + { + "epoch": 0.52748, + "grad_norm": 2.216856002807617, + "learning_rate": 1.078528693495964e-05, + "loss": 0.066, + "step": 26374 + }, + { + "epoch": 0.52752, + "grad_norm": 0.13265135884284973, + "learning_rate": 1.0783894975773262e-05, + "loss": 0.0735, + "step": 26376 + }, + { + "epoch": 0.52756, + "grad_norm": 0.7771785855293274, + "learning_rate": 1.0782503001304448e-05, + "loss": 0.0117, + "step": 26378 + }, + { + "epoch": 0.5276, + "grad_norm": 0.8260683417320251, + "learning_rate": 1.0781111011580336e-05, + "loss": 0.0162, + "step": 26380 + }, + { + "epoch": 0.52764, + "grad_norm": 3.2448954582214355, + "learning_rate": 1.0779719006628061e-05, + "loss": 0.0674, + "step": 26382 + }, + { + "epoch": 0.52768, + "grad_norm": 0.6177045702934265, + "learning_rate": 1.0778326986474765e-05, + "loss": 0.0174, + "step": 26384 + }, + { + "epoch": 0.52772, + "grad_norm": 1.6135854721069336, + "learning_rate": 1.0776934951147584e-05, + "loss": 0.0318, + "step": 26386 + }, + { + "epoch": 0.52776, + "grad_norm": 0.049044832587242126, + "learning_rate": 1.0775542900673657e-05, + "loss": 0.0072, + "step": 26388 + }, + { + "epoch": 0.5278, + "grad_norm": 1.2994493246078491, + "learning_rate": 1.0774150835080119e-05, + "loss": 0.0214, + "step": 26390 + }, + { + "epoch": 0.52784, + "grad_norm": 0.6323918700218201, + "learning_rate": 1.0772758754394112e-05, + "loss": 0.023, + "step": 26392 + }, + { + "epoch": 0.52788, + "grad_norm": 0.11425930261611938, + "learning_rate": 1.0771366658642774e-05, + "loss": 0.0024, + "step": 26394 + }, + { + "epoch": 0.52792, + "grad_norm": 3.0206239223480225, + "learning_rate": 1.076997454785325e-05, + "loss": 0.0531, + "step": 26396 + }, + { + "epoch": 0.52796, + "grad_norm": 11.435927391052246, + "learning_rate": 1.0768582422052673e-05, + "loss": 0.712, + "step": 26398 + }, + { + "epoch": 0.528, + "grad_norm": 0.5166125893592834, + "learning_rate": 1.0767190281268187e-05, + "loss": 0.0173, + "step": 26400 + }, + { + "epoch": 0.52804, + "grad_norm": 0.3527916669845581, + "learning_rate": 1.0765798125526931e-05, + "loss": 0.0071, + "step": 26402 + }, + { + "epoch": 0.52808, + "grad_norm": 0.26271936297416687, + "learning_rate": 1.076440595485605e-05, + "loss": 0.0367, + "step": 26404 + }, + { + "epoch": 0.52812, + "grad_norm": 11.756441116333008, + "learning_rate": 1.0763013769282677e-05, + "loss": 0.3005, + "step": 26406 + }, + { + "epoch": 0.52816, + "grad_norm": 0.28375333547592163, + "learning_rate": 1.0761621568833957e-05, + "loss": 0.2395, + "step": 26408 + }, + { + "epoch": 0.5282, + "grad_norm": 0.21554477512836456, + "learning_rate": 1.0760229353537032e-05, + "loss": 0.0066, + "step": 26410 + }, + { + "epoch": 0.52824, + "grad_norm": 3.6381330490112305, + "learning_rate": 1.0758837123419049e-05, + "loss": 0.0674, + "step": 26412 + }, + { + "epoch": 0.52828, + "grad_norm": 2.4321682453155518, + "learning_rate": 1.0757444878507142e-05, + "loss": 0.0343, + "step": 26414 + }, + { + "epoch": 0.52832, + "grad_norm": 0.01334977988153696, + "learning_rate": 1.0756052618828458e-05, + "loss": 0.0084, + "step": 26416 + }, + { + "epoch": 0.52836, + "grad_norm": 4.09330940246582, + "learning_rate": 1.0754660344410137e-05, + "loss": 0.071, + "step": 26418 + }, + { + "epoch": 0.5284, + "grad_norm": 0.399078905582428, + "learning_rate": 1.0753268055279328e-05, + "loss": 0.0149, + "step": 26420 + }, + { + "epoch": 0.52844, + "grad_norm": 0.7425819039344788, + "learning_rate": 1.075187575146317e-05, + "loss": 0.0168, + "step": 26422 + }, + { + "epoch": 0.52848, + "grad_norm": 2.061309337615967, + "learning_rate": 1.0750483432988806e-05, + "loss": 0.0569, + "step": 26424 + }, + { + "epoch": 0.52852, + "grad_norm": 0.10315658897161484, + "learning_rate": 1.074909109988338e-05, + "loss": 0.0888, + "step": 26426 + }, + { + "epoch": 0.52856, + "grad_norm": 6.8898701667785645, + "learning_rate": 1.074769875217404e-05, + "loss": 0.2144, + "step": 26428 + }, + { + "epoch": 0.5286, + "grad_norm": 1.8378477096557617, + "learning_rate": 1.0746306389887924e-05, + "loss": 0.1681, + "step": 26430 + }, + { + "epoch": 0.52864, + "grad_norm": 0.3611679971218109, + "learning_rate": 1.074491401305218e-05, + "loss": 0.0143, + "step": 26432 + }, + { + "epoch": 0.52868, + "grad_norm": 0.32979917526245117, + "learning_rate": 1.0743521621693957e-05, + "loss": 0.0156, + "step": 26434 + }, + { + "epoch": 0.52872, + "grad_norm": 0.10907693952322006, + "learning_rate": 1.07421292158404e-05, + "loss": 0.0123, + "step": 26436 + }, + { + "epoch": 0.52876, + "grad_norm": 0.2340276539325714, + "learning_rate": 1.0740736795518649e-05, + "loss": 0.0257, + "step": 26438 + }, + { + "epoch": 0.5288, + "grad_norm": 2.351132392883301, + "learning_rate": 1.0739344360755853e-05, + "loss": 0.0604, + "step": 26440 + }, + { + "epoch": 0.52884, + "grad_norm": 0.040218476206064224, + "learning_rate": 1.0737951911579154e-05, + "loss": 0.0038, + "step": 26442 + }, + { + "epoch": 0.52888, + "grad_norm": 0.0651169940829277, + "learning_rate": 1.0736559448015708e-05, + "loss": 0.003, + "step": 26444 + }, + { + "epoch": 0.52892, + "grad_norm": 0.9831897616386414, + "learning_rate": 1.0735166970092654e-05, + "loss": 0.0156, + "step": 26446 + }, + { + "epoch": 0.52896, + "grad_norm": 2.5601940155029297, + "learning_rate": 1.0733774477837141e-05, + "loss": 0.0409, + "step": 26448 + }, + { + "epoch": 0.529, + "grad_norm": 0.005807260517030954, + "learning_rate": 1.0732381971276318e-05, + "loss": 0.0058, + "step": 26450 + }, + { + "epoch": 0.52904, + "grad_norm": 0.011485474184155464, + "learning_rate": 1.0730989450437333e-05, + "loss": 0.0003, + "step": 26452 + }, + { + "epoch": 0.52908, + "grad_norm": 1.73483407497406, + "learning_rate": 1.0729596915347329e-05, + "loss": 0.0299, + "step": 26454 + }, + { + "epoch": 0.52912, + "grad_norm": 0.2379186749458313, + "learning_rate": 1.072820436603346e-05, + "loss": 0.0037, + "step": 26456 + }, + { + "epoch": 0.52916, + "grad_norm": 2.901157855987549, + "learning_rate": 1.0726811802522869e-05, + "loss": 0.0421, + "step": 26458 + }, + { + "epoch": 0.5292, + "grad_norm": 0.7324485182762146, + "learning_rate": 1.072541922484271e-05, + "loss": 0.0189, + "step": 26460 + }, + { + "epoch": 0.52924, + "grad_norm": 0.9174254536628723, + "learning_rate": 1.0724026633020131e-05, + "loss": 0.0181, + "step": 26462 + }, + { + "epoch": 0.52928, + "grad_norm": 0.0055763572454452515, + "learning_rate": 1.0722634027082277e-05, + "loss": 0.0045, + "step": 26464 + }, + { + "epoch": 0.52932, + "grad_norm": 0.03846409171819687, + "learning_rate": 1.07212414070563e-05, + "loss": 0.0529, + "step": 26466 + }, + { + "epoch": 0.52936, + "grad_norm": 0.023970862850546837, + "learning_rate": 1.0719848772969352e-05, + "loss": 0.0094, + "step": 26468 + }, + { + "epoch": 0.5294, + "grad_norm": 2.0667366981506348, + "learning_rate": 1.0718456124848584e-05, + "loss": 0.0282, + "step": 26470 + }, + { + "epoch": 0.52944, + "grad_norm": 0.1498708724975586, + "learning_rate": 1.0717063462721143e-05, + "loss": 0.0051, + "step": 26472 + }, + { + "epoch": 0.52948, + "grad_norm": 2.8295836448669434, + "learning_rate": 1.0715670786614178e-05, + "loss": 0.0727, + "step": 26474 + }, + { + "epoch": 0.52952, + "grad_norm": 7.482932090759277, + "learning_rate": 1.0714278096554844e-05, + "loss": 0.1112, + "step": 26476 + }, + { + "epoch": 0.52956, + "grad_norm": 0.34512877464294434, + "learning_rate": 1.0712885392570292e-05, + "loss": 0.0085, + "step": 26478 + }, + { + "epoch": 0.5296, + "grad_norm": 0.34749987721443176, + "learning_rate": 1.071149267468767e-05, + "loss": 0.0633, + "step": 26480 + }, + { + "epoch": 0.52964, + "grad_norm": 0.41208019852638245, + "learning_rate": 1.0710099942934133e-05, + "loss": 0.0769, + "step": 26482 + }, + { + "epoch": 0.52968, + "grad_norm": 0.013913094066083431, + "learning_rate": 1.0708707197336832e-05, + "loss": 0.0024, + "step": 26484 + }, + { + "epoch": 0.52972, + "grad_norm": 0.06713767349720001, + "learning_rate": 1.070731443792292e-05, + "loss": 0.1275, + "step": 26486 + }, + { + "epoch": 0.52976, + "grad_norm": 0.03626180812716484, + "learning_rate": 1.0705921664719547e-05, + "loss": 0.0015, + "step": 26488 + }, + { + "epoch": 0.5298, + "grad_norm": 0.0687965601682663, + "learning_rate": 1.070452887775387e-05, + "loss": 0.022, + "step": 26490 + }, + { + "epoch": 0.52984, + "grad_norm": 0.16281326115131378, + "learning_rate": 1.070313607705304e-05, + "loss": 0.004, + "step": 26492 + }, + { + "epoch": 0.52988, + "grad_norm": 0.03125195577740669, + "learning_rate": 1.0701743262644211e-05, + "loss": 0.0086, + "step": 26494 + }, + { + "epoch": 0.52992, + "grad_norm": 12.888455390930176, + "learning_rate": 1.0700350434554531e-05, + "loss": 0.0967, + "step": 26496 + }, + { + "epoch": 0.52996, + "grad_norm": 0.05132137984037399, + "learning_rate": 1.0698957592811161e-05, + "loss": 0.0009, + "step": 26498 + }, + { + "epoch": 0.53, + "grad_norm": 0.02674899250268936, + "learning_rate": 1.0697564737441254e-05, + "loss": 0.0019, + "step": 26500 + }, + { + "epoch": 0.53004, + "grad_norm": 0.3030546307563782, + "learning_rate": 1.0696171868471963e-05, + "loss": 0.0036, + "step": 26502 + }, + { + "epoch": 0.53008, + "grad_norm": 0.2044653445482254, + "learning_rate": 1.0694778985930443e-05, + "loss": 0.0018, + "step": 26504 + }, + { + "epoch": 0.53012, + "grad_norm": 1.546480417251587, + "learning_rate": 1.069338608984385e-05, + "loss": 0.0299, + "step": 26506 + }, + { + "epoch": 0.53016, + "grad_norm": 0.02706979587674141, + "learning_rate": 1.0691993180239336e-05, + "loss": 0.0024, + "step": 26508 + }, + { + "epoch": 0.5302, + "grad_norm": 1.3045746088027954, + "learning_rate": 1.0690600257144062e-05, + "loss": 0.476, + "step": 26510 + }, + { + "epoch": 0.53024, + "grad_norm": 0.25934961438179016, + "learning_rate": 1.0689207320585176e-05, + "loss": 0.023, + "step": 26512 + }, + { + "epoch": 0.53028, + "grad_norm": 0.013513011857867241, + "learning_rate": 1.068781437058984e-05, + "loss": 0.0051, + "step": 26514 + }, + { + "epoch": 0.53032, + "grad_norm": 0.27069371938705444, + "learning_rate": 1.0686421407185208e-05, + "loss": 0.0024, + "step": 26516 + }, + { + "epoch": 0.53036, + "grad_norm": 6.271989822387695, + "learning_rate": 1.068502843039844e-05, + "loss": 0.2905, + "step": 26518 + }, + { + "epoch": 0.5304, + "grad_norm": 14.566640853881836, + "learning_rate": 1.0683635440256689e-05, + "loss": 0.5899, + "step": 26520 + }, + { + "epoch": 0.53044, + "grad_norm": 11.460806846618652, + "learning_rate": 1.068224243678711e-05, + "loss": 0.5295, + "step": 26522 + }, + { + "epoch": 0.53048, + "grad_norm": 0.03495955839753151, + "learning_rate": 1.0680849420016867e-05, + "loss": 0.2045, + "step": 26524 + }, + { + "epoch": 0.53052, + "grad_norm": 0.2293742448091507, + "learning_rate": 1.0679456389973118e-05, + "loss": 0.0027, + "step": 26526 + }, + { + "epoch": 0.53056, + "grad_norm": 0.17644667625427246, + "learning_rate": 1.067806334668301e-05, + "loss": 0.003, + "step": 26528 + }, + { + "epoch": 0.5306, + "grad_norm": 0.37689825892448425, + "learning_rate": 1.067667029017371e-05, + "loss": 0.0276, + "step": 26530 + }, + { + "epoch": 0.53064, + "grad_norm": 0.009736418724060059, + "learning_rate": 1.0675277220472373e-05, + "loss": 0.0012, + "step": 26532 + }, + { + "epoch": 0.53068, + "grad_norm": 0.010020258836448193, + "learning_rate": 1.0673884137606161e-05, + "loss": 0.015, + "step": 26534 + }, + { + "epoch": 0.53072, + "grad_norm": 1.0507210493087769, + "learning_rate": 1.067249104160223e-05, + "loss": 0.0185, + "step": 26536 + }, + { + "epoch": 0.53076, + "grad_norm": 0.9027954339981079, + "learning_rate": 1.067109793248774e-05, + "loss": 0.0455, + "step": 26538 + }, + { + "epoch": 0.5308, + "grad_norm": 0.08287934958934784, + "learning_rate": 1.0669704810289852e-05, + "loss": 0.0084, + "step": 26540 + }, + { + "epoch": 0.53084, + "grad_norm": 0.3925636410713196, + "learning_rate": 1.066831167503572e-05, + "loss": 0.01, + "step": 26542 + }, + { + "epoch": 0.53088, + "grad_norm": 0.07106564193964005, + "learning_rate": 1.0666918526752513e-05, + "loss": 0.0073, + "step": 26544 + }, + { + "epoch": 0.53092, + "grad_norm": 0.06914015859365463, + "learning_rate": 1.0665525365467381e-05, + "loss": 0.0058, + "step": 26546 + }, + { + "epoch": 0.53096, + "grad_norm": 1.1524646282196045, + "learning_rate": 1.066413219120749e-05, + "loss": 0.022, + "step": 26548 + }, + { + "epoch": 0.531, + "grad_norm": 3.9653890132904053, + "learning_rate": 1.0662739004000005e-05, + "loss": 0.0808, + "step": 26550 + }, + { + "epoch": 0.53104, + "grad_norm": 0.3720242381095886, + "learning_rate": 1.0661345803872076e-05, + "loss": 0.0077, + "step": 26552 + }, + { + "epoch": 0.53108, + "grad_norm": 14.408124923706055, + "learning_rate": 1.0659952590850873e-05, + "loss": 0.4085, + "step": 26554 + }, + { + "epoch": 0.53112, + "grad_norm": 0.059734344482421875, + "learning_rate": 1.0658559364963552e-05, + "loss": 0.0058, + "step": 26556 + }, + { + "epoch": 0.53116, + "grad_norm": 1.7070225477218628, + "learning_rate": 1.0657166126237281e-05, + "loss": 0.1646, + "step": 26558 + }, + { + "epoch": 0.5312, + "grad_norm": 8.584222793579102, + "learning_rate": 1.0655772874699217e-05, + "loss": 0.2528, + "step": 26560 + }, + { + "epoch": 0.53124, + "grad_norm": 0.5128980875015259, + "learning_rate": 1.0654379610376523e-05, + "loss": 0.0068, + "step": 26562 + }, + { + "epoch": 0.53128, + "grad_norm": 0.04522757604718208, + "learning_rate": 1.0652986333296358e-05, + "loss": 0.0009, + "step": 26564 + }, + { + "epoch": 0.53132, + "grad_norm": 2.719454288482666, + "learning_rate": 1.0651593043485893e-05, + "loss": 0.1128, + "step": 26566 + }, + { + "epoch": 0.53136, + "grad_norm": 0.15036891400814056, + "learning_rate": 1.0650199740972285e-05, + "loss": 0.0208, + "step": 26568 + }, + { + "epoch": 0.5314, + "grad_norm": 3.767780303955078, + "learning_rate": 1.0648806425782697e-05, + "loss": 0.0477, + "step": 26570 + }, + { + "epoch": 0.53144, + "grad_norm": 0.11330600082874298, + "learning_rate": 1.0647413097944293e-05, + "loss": 0.0205, + "step": 26572 + }, + { + "epoch": 0.53148, + "grad_norm": 0.5198065638542175, + "learning_rate": 1.0646019757484238e-05, + "loss": 0.0112, + "step": 26574 + }, + { + "epoch": 0.53152, + "grad_norm": 0.162441685795784, + "learning_rate": 1.0644626404429697e-05, + "loss": 0.1406, + "step": 26576 + }, + { + "epoch": 0.53156, + "grad_norm": 0.4099635183811188, + "learning_rate": 1.0643233038807831e-05, + "loss": 0.0081, + "step": 26578 + }, + { + "epoch": 0.5316, + "grad_norm": 0.11459595710039139, + "learning_rate": 1.0641839660645806e-05, + "loss": 0.0136, + "step": 26580 + }, + { + "epoch": 0.53164, + "grad_norm": 0.37849757075309753, + "learning_rate": 1.0640446269970786e-05, + "loss": 0.0072, + "step": 26582 + }, + { + "epoch": 0.53168, + "grad_norm": 0.2680104374885559, + "learning_rate": 1.0639052866809938e-05, + "loss": 0.0112, + "step": 26584 + }, + { + "epoch": 0.53172, + "grad_norm": 0.011925021186470985, + "learning_rate": 1.0637659451190423e-05, + "loss": 0.4494, + "step": 26586 + }, + { + "epoch": 0.53176, + "grad_norm": 0.26280340552330017, + "learning_rate": 1.0636266023139407e-05, + "loss": 0.2221, + "step": 26588 + }, + { + "epoch": 0.5318, + "grad_norm": 0.022689417004585266, + "learning_rate": 1.0634872582684062e-05, + "loss": 0.0076, + "step": 26590 + }, + { + "epoch": 0.53184, + "grad_norm": 0.10928899794816971, + "learning_rate": 1.0633479129851547e-05, + "loss": 0.0031, + "step": 26592 + }, + { + "epoch": 0.53188, + "grad_norm": 12.793343544006348, + "learning_rate": 1.0632085664669032e-05, + "loss": 0.812, + "step": 26594 + }, + { + "epoch": 0.53192, + "grad_norm": 1.3972402811050415, + "learning_rate": 1.0630692187163681e-05, + "loss": 0.1223, + "step": 26596 + }, + { + "epoch": 0.53196, + "grad_norm": 0.27634763717651367, + "learning_rate": 1.0629298697362657e-05, + "loss": 0.0704, + "step": 26598 + }, + { + "epoch": 0.532, + "grad_norm": 1.0445234775543213, + "learning_rate": 1.0627905195293135e-05, + "loss": 0.0254, + "step": 26600 + }, + { + "epoch": 0.53204, + "grad_norm": 0.048153504729270935, + "learning_rate": 1.0626511680982276e-05, + "loss": 0.0026, + "step": 26602 + }, + { + "epoch": 0.53208, + "grad_norm": 0.2607126235961914, + "learning_rate": 1.0625118154457251e-05, + "loss": 0.0045, + "step": 26604 + }, + { + "epoch": 0.53212, + "grad_norm": 0.6603890657424927, + "learning_rate": 1.0623724615745223e-05, + "loss": 0.0329, + "step": 26606 + }, + { + "epoch": 0.53216, + "grad_norm": 1.1724435091018677, + "learning_rate": 1.0622331064873365e-05, + "loss": 0.0173, + "step": 26608 + }, + { + "epoch": 0.5322, + "grad_norm": 7.153045654296875, + "learning_rate": 1.0620937501868842e-05, + "loss": 0.5228, + "step": 26610 + }, + { + "epoch": 0.53224, + "grad_norm": 1.534719705581665, + "learning_rate": 1.0619543926758824e-05, + "loss": 0.0417, + "step": 26612 + }, + { + "epoch": 0.53228, + "grad_norm": 0.13035470247268677, + "learning_rate": 1.0618150339570476e-05, + "loss": 0.0038, + "step": 26614 + }, + { + "epoch": 0.53232, + "grad_norm": 0.12394361943006516, + "learning_rate": 1.0616756740330971e-05, + "loss": 0.0541, + "step": 26616 + }, + { + "epoch": 0.53236, + "grad_norm": 1.542615532875061, + "learning_rate": 1.0615363129067473e-05, + "loss": 0.3734, + "step": 26618 + }, + { + "epoch": 0.5324, + "grad_norm": 1.1763298511505127, + "learning_rate": 1.0613969505807157e-05, + "loss": 0.0321, + "step": 26620 + }, + { + "epoch": 0.53244, + "grad_norm": 0.1407637745141983, + "learning_rate": 1.0612575870577186e-05, + "loss": 0.0264, + "step": 26622 + }, + { + "epoch": 0.53248, + "grad_norm": 2.873258590698242, + "learning_rate": 1.0611182223404737e-05, + "loss": 0.0497, + "step": 26624 + }, + { + "epoch": 0.53252, + "grad_norm": 2.724318027496338, + "learning_rate": 1.0609788564316972e-05, + "loss": 0.0433, + "step": 26626 + }, + { + "epoch": 0.53256, + "grad_norm": 3.6219093799591064, + "learning_rate": 1.0608394893341067e-05, + "loss": 0.075, + "step": 26628 + }, + { + "epoch": 0.5326, + "grad_norm": 5.235140323638916, + "learning_rate": 1.060700121050419e-05, + "loss": 0.3419, + "step": 26630 + }, + { + "epoch": 0.53264, + "grad_norm": 0.48189932107925415, + "learning_rate": 1.0605607515833513e-05, + "loss": 0.0136, + "step": 26632 + }, + { + "epoch": 0.53268, + "grad_norm": 5.106363773345947, + "learning_rate": 1.0604213809356205e-05, + "loss": 0.084, + "step": 26634 + }, + { + "epoch": 0.53272, + "grad_norm": 0.38182878494262695, + "learning_rate": 1.0602820091099438e-05, + "loss": 0.0239, + "step": 26636 + }, + { + "epoch": 0.53276, + "grad_norm": 1.2527564764022827, + "learning_rate": 1.0601426361090378e-05, + "loss": 0.0249, + "step": 26638 + }, + { + "epoch": 0.5328, + "grad_norm": 1.0470901727676392, + "learning_rate": 1.0600032619356208e-05, + "loss": 0.036, + "step": 26640 + }, + { + "epoch": 0.53284, + "grad_norm": 1.0703178644180298, + "learning_rate": 1.0598638865924092e-05, + "loss": 0.0361, + "step": 26642 + }, + { + "epoch": 0.53288, + "grad_norm": 0.8338672518730164, + "learning_rate": 1.05972451008212e-05, + "loss": 0.0137, + "step": 26644 + }, + { + "epoch": 0.53292, + "grad_norm": 0.5128210186958313, + "learning_rate": 1.0595851324074711e-05, + "loss": 0.0294, + "step": 26646 + }, + { + "epoch": 0.53296, + "grad_norm": 0.10662674903869629, + "learning_rate": 1.0594457535711795e-05, + "loss": 0.0073, + "step": 26648 + }, + { + "epoch": 0.533, + "grad_norm": 0.07116353511810303, + "learning_rate": 1.0593063735759619e-05, + "loss": 0.0159, + "step": 26650 + }, + { + "epoch": 0.53304, + "grad_norm": 0.39650198817253113, + "learning_rate": 1.059166992424536e-05, + "loss": 0.0087, + "step": 26652 + }, + { + "epoch": 0.53308, + "grad_norm": 0.2064025104045868, + "learning_rate": 1.0590276101196194e-05, + "loss": 0.0144, + "step": 26654 + }, + { + "epoch": 0.53312, + "grad_norm": 1.118259310722351, + "learning_rate": 1.0588882266639289e-05, + "loss": 0.0159, + "step": 26656 + }, + { + "epoch": 0.53316, + "grad_norm": 0.09745229035615921, + "learning_rate": 1.0587488420601822e-05, + "loss": 0.0129, + "step": 26658 + }, + { + "epoch": 0.5332, + "grad_norm": 0.6743201613426208, + "learning_rate": 1.0586094563110965e-05, + "loss": 0.015, + "step": 26660 + }, + { + "epoch": 0.53324, + "grad_norm": 0.270235538482666, + "learning_rate": 1.0584700694193898e-05, + "loss": 0.0043, + "step": 26662 + }, + { + "epoch": 0.53328, + "grad_norm": 0.5399250984191895, + "learning_rate": 1.0583306813877784e-05, + "loss": 0.0111, + "step": 26664 + }, + { + "epoch": 0.53332, + "grad_norm": 0.7938815951347351, + "learning_rate": 1.0581912922189805e-05, + "loss": 0.1079, + "step": 26666 + }, + { + "epoch": 0.53336, + "grad_norm": 1.8007539510726929, + "learning_rate": 1.0580519019157134e-05, + "loss": 0.0662, + "step": 26668 + }, + { + "epoch": 0.5334, + "grad_norm": 0.5355725288391113, + "learning_rate": 1.0579125104806944e-05, + "loss": 0.0311, + "step": 26670 + }, + { + "epoch": 0.53344, + "grad_norm": 1.7203783988952637, + "learning_rate": 1.0577731179166415e-05, + "loss": 0.1033, + "step": 26672 + }, + { + "epoch": 0.53348, + "grad_norm": 0.23350279033184052, + "learning_rate": 1.0576337242262717e-05, + "loss": 0.007, + "step": 26674 + }, + { + "epoch": 0.53352, + "grad_norm": 2.981083869934082, + "learning_rate": 1.057494329412303e-05, + "loss": 0.0476, + "step": 26676 + }, + { + "epoch": 0.53356, + "grad_norm": 0.6918087005615234, + "learning_rate": 1.0573549334774524e-05, + "loss": 0.0138, + "step": 26678 + }, + { + "epoch": 0.5336, + "grad_norm": 0.0070442636497318745, + "learning_rate": 1.0572155364244383e-05, + "loss": 0.0006, + "step": 26680 + }, + { + "epoch": 0.53364, + "grad_norm": 13.985828399658203, + "learning_rate": 1.0570761382559778e-05, + "loss": 0.2346, + "step": 26682 + }, + { + "epoch": 0.53368, + "grad_norm": 0.03996047005057335, + "learning_rate": 1.0569367389747883e-05, + "loss": 0.0475, + "step": 26684 + }, + { + "epoch": 0.53372, + "grad_norm": 3.8288960456848145, + "learning_rate": 1.056797338583588e-05, + "loss": 0.0691, + "step": 26686 + }, + { + "epoch": 0.53376, + "grad_norm": 0.8162854909896851, + "learning_rate": 1.056657937085094e-05, + "loss": 0.0154, + "step": 26688 + }, + { + "epoch": 0.5338, + "grad_norm": 0.022087369114160538, + "learning_rate": 1.0565185344820248e-05, + "loss": 0.0034, + "step": 26690 + }, + { + "epoch": 0.53384, + "grad_norm": 1.0199159383773804, + "learning_rate": 1.0563791307770972e-05, + "loss": 0.0468, + "step": 26692 + }, + { + "epoch": 0.53388, + "grad_norm": 0.33072197437286377, + "learning_rate": 1.0562397259730298e-05, + "loss": 0.166, + "step": 26694 + }, + { + "epoch": 0.53392, + "grad_norm": 0.14873859286308289, + "learning_rate": 1.05610032007254e-05, + "loss": 0.0087, + "step": 26696 + }, + { + "epoch": 0.53396, + "grad_norm": 0.13721659779548645, + "learning_rate": 1.0559609130783455e-05, + "loss": 0.0113, + "step": 26698 + }, + { + "epoch": 0.534, + "grad_norm": 0.1838867962360382, + "learning_rate": 1.055821504993164e-05, + "loss": 0.0042, + "step": 26700 + }, + { + "epoch": 0.53404, + "grad_norm": 1.739628791809082, + "learning_rate": 1.0556820958197136e-05, + "loss": 0.0361, + "step": 26702 + }, + { + "epoch": 0.53408, + "grad_norm": 0.4831330180168152, + "learning_rate": 1.055542685560712e-05, + "loss": 0.0085, + "step": 26704 + }, + { + "epoch": 0.53412, + "grad_norm": 0.03182966634631157, + "learning_rate": 1.0554032742188776e-05, + "loss": 0.0474, + "step": 26706 + }, + { + "epoch": 0.53416, + "grad_norm": 0.1673239916563034, + "learning_rate": 1.0552638617969274e-05, + "loss": 0.0056, + "step": 26708 + }, + { + "epoch": 0.5342, + "grad_norm": 0.016697289422154427, + "learning_rate": 1.0551244482975798e-05, + "loss": 0.0019, + "step": 26710 + }, + { + "epoch": 0.53424, + "grad_norm": 0.16891339421272278, + "learning_rate": 1.0549850337235526e-05, + "loss": 0.0048, + "step": 26712 + }, + { + "epoch": 0.53428, + "grad_norm": 0.4695425033569336, + "learning_rate": 1.0548456180775642e-05, + "loss": 0.0063, + "step": 26714 + }, + { + "epoch": 0.53432, + "grad_norm": 0.25559601187705994, + "learning_rate": 1.0547062013623321e-05, + "loss": 0.0147, + "step": 26716 + }, + { + "epoch": 0.53436, + "grad_norm": 0.43617576360702515, + "learning_rate": 1.0545667835805743e-05, + "loss": 0.0083, + "step": 26718 + }, + { + "epoch": 0.5344, + "grad_norm": 0.6374733448028564, + "learning_rate": 1.0544273647350091e-05, + "loss": 0.0085, + "step": 26720 + }, + { + "epoch": 0.53444, + "grad_norm": 0.8706669211387634, + "learning_rate": 1.0542879448283547e-05, + "loss": 0.702, + "step": 26722 + }, + { + "epoch": 0.53448, + "grad_norm": 0.09425900131464005, + "learning_rate": 1.0541485238633284e-05, + "loss": 0.0033, + "step": 26724 + }, + { + "epoch": 0.53452, + "grad_norm": 0.9422333836555481, + "learning_rate": 1.0540091018426488e-05, + "loss": 0.025, + "step": 26726 + }, + { + "epoch": 0.53456, + "grad_norm": 0.08172561228275299, + "learning_rate": 1.0538696787690341e-05, + "loss": 0.0052, + "step": 26728 + }, + { + "epoch": 0.5346, + "grad_norm": 0.02842836081981659, + "learning_rate": 1.0537302546452022e-05, + "loss": 0.0136, + "step": 26730 + }, + { + "epoch": 0.53464, + "grad_norm": 1.5398122072219849, + "learning_rate": 1.0535908294738718e-05, + "loss": 0.0427, + "step": 26732 + }, + { + "epoch": 0.53468, + "grad_norm": 3.2734169960021973, + "learning_rate": 1.0534514032577599e-05, + "loss": 0.0708, + "step": 26734 + }, + { + "epoch": 0.53472, + "grad_norm": 2.9789044857025146, + "learning_rate": 1.0533119759995856e-05, + "loss": 0.2093, + "step": 26736 + }, + { + "epoch": 0.53476, + "grad_norm": 13.619478225708008, + "learning_rate": 1.053172547702067e-05, + "loss": 0.4519, + "step": 26738 + }, + { + "epoch": 0.5348, + "grad_norm": 0.04206617921590805, + "learning_rate": 1.053033118367922e-05, + "loss": 0.0381, + "step": 26740 + }, + { + "epoch": 0.53484, + "grad_norm": 0.08738017827272415, + "learning_rate": 1.052893687999869e-05, + "loss": 0.0105, + "step": 26742 + }, + { + "epoch": 0.53488, + "grad_norm": 3.4443109035491943, + "learning_rate": 1.0527542566006266e-05, + "loss": 0.0553, + "step": 26744 + }, + { + "epoch": 0.53492, + "grad_norm": 2.575432777404785, + "learning_rate": 1.0526148241729128e-05, + "loss": 0.0385, + "step": 26746 + }, + { + "epoch": 0.53496, + "grad_norm": 10.418098449707031, + "learning_rate": 1.0524753907194456e-05, + "loss": 0.8126, + "step": 26748 + }, + { + "epoch": 0.535, + "grad_norm": 3.0101332664489746, + "learning_rate": 1.0523359562429441e-05, + "loss": 0.032, + "step": 26750 + }, + { + "epoch": 0.53504, + "grad_norm": 0.7773890495300293, + "learning_rate": 1.0521965207461258e-05, + "loss": 0.0936, + "step": 26752 + }, + { + "epoch": 0.53508, + "grad_norm": 0.04510436952114105, + "learning_rate": 1.0520570842317097e-05, + "loss": 0.0246, + "step": 26754 + }, + { + "epoch": 0.53512, + "grad_norm": 0.05654945969581604, + "learning_rate": 1.051917646702414e-05, + "loss": 0.0011, + "step": 26756 + }, + { + "epoch": 0.53516, + "grad_norm": 0.22508074343204498, + "learning_rate": 1.0517782081609566e-05, + "loss": 0.0118, + "step": 26758 + }, + { + "epoch": 0.5352, + "grad_norm": 0.9879879355430603, + "learning_rate": 1.0516387686100566e-05, + "loss": 0.1275, + "step": 26760 + }, + { + "epoch": 0.53524, + "grad_norm": 2.5072786808013916, + "learning_rate": 1.0514993280524323e-05, + "loss": 0.0285, + "step": 26762 + }, + { + "epoch": 0.53528, + "grad_norm": 0.7367064952850342, + "learning_rate": 1.051359886490802e-05, + "loss": 0.0739, + "step": 26764 + }, + { + "epoch": 0.53532, + "grad_norm": 0.19331993162631989, + "learning_rate": 1.0512204439278845e-05, + "loss": 0.0038, + "step": 26766 + }, + { + "epoch": 0.53536, + "grad_norm": 0.6092647910118103, + "learning_rate": 1.051081000366398e-05, + "loss": 0.0131, + "step": 26768 + }, + { + "epoch": 0.5354, + "grad_norm": 0.07854962348937988, + "learning_rate": 1.050941555809061e-05, + "loss": 0.0056, + "step": 26770 + }, + { + "epoch": 0.53544, + "grad_norm": 0.09461840242147446, + "learning_rate": 1.0508021102585921e-05, + "loss": 0.0536, + "step": 26772 + }, + { + "epoch": 0.53548, + "grad_norm": 0.22532686591148376, + "learning_rate": 1.05066266371771e-05, + "loss": 0.0091, + "step": 26774 + }, + { + "epoch": 0.53552, + "grad_norm": 0.285540372133255, + "learning_rate": 1.050523216189133e-05, + "loss": 0.046, + "step": 26776 + }, + { + "epoch": 0.53556, + "grad_norm": 0.07825616747140884, + "learning_rate": 1.05038376767558e-05, + "loss": 0.0051, + "step": 26778 + }, + { + "epoch": 0.5356, + "grad_norm": 0.5147652626037598, + "learning_rate": 1.0502443181797696e-05, + "loss": 0.0168, + "step": 26780 + }, + { + "epoch": 0.53564, + "grad_norm": 0.29202404618263245, + "learning_rate": 1.0501048677044203e-05, + "loss": 0.0081, + "step": 26782 + }, + { + "epoch": 0.53568, + "grad_norm": 4.922876834869385, + "learning_rate": 1.049965416252251e-05, + "loss": 0.2056, + "step": 26784 + }, + { + "epoch": 0.53572, + "grad_norm": 0.21585983037948608, + "learning_rate": 1.0498259638259797e-05, + "loss": 0.0452, + "step": 26786 + }, + { + "epoch": 0.53576, + "grad_norm": 0.011350870132446289, + "learning_rate": 1.0496865104283263e-05, + "loss": 0.0005, + "step": 26788 + }, + { + "epoch": 0.5358, + "grad_norm": 0.4773101806640625, + "learning_rate": 1.0495470560620082e-05, + "loss": 0.0097, + "step": 26790 + }, + { + "epoch": 0.53584, + "grad_norm": 5.556761741638184, + "learning_rate": 1.0494076007297449e-05, + "loss": 0.1608, + "step": 26792 + }, + { + "epoch": 0.53588, + "grad_norm": 0.0576479509472847, + "learning_rate": 1.0492681444342549e-05, + "loss": 0.0724, + "step": 26794 + }, + { + "epoch": 0.53592, + "grad_norm": 0.024008596315979958, + "learning_rate": 1.0491286871782573e-05, + "loss": 0.034, + "step": 26796 + }, + { + "epoch": 0.53596, + "grad_norm": 0.9200670719146729, + "learning_rate": 1.0489892289644703e-05, + "loss": 0.0282, + "step": 26798 + }, + { + "epoch": 0.536, + "grad_norm": 0.07679222524166107, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.0047, + "step": 26800 + }, + { + "epoch": 0.53604, + "grad_norm": 0.05182759836316109, + "learning_rate": 1.0487103096744049e-05, + "loss": 0.0125, + "step": 26802 + }, + { + "epoch": 0.53608, + "grad_norm": 0.15791594982147217, + "learning_rate": 1.0485708486035642e-05, + "loss": 0.0544, + "step": 26804 + }, + { + "epoch": 0.53612, + "grad_norm": 3.4111361503601074, + "learning_rate": 1.0484313865858093e-05, + "loss": 0.0438, + "step": 26806 + }, + { + "epoch": 0.53616, + "grad_norm": 0.0504157580435276, + "learning_rate": 1.0482919236238598e-05, + "loss": 0.0027, + "step": 26808 + }, + { + "epoch": 0.5362, + "grad_norm": 1.0512336492538452, + "learning_rate": 1.0481524597204342e-05, + "loss": 0.0356, + "step": 26810 + }, + { + "epoch": 0.53624, + "grad_norm": 0.1932576596736908, + "learning_rate": 1.0480129948782518e-05, + "loss": 0.0029, + "step": 26812 + }, + { + "epoch": 0.53628, + "grad_norm": 0.4421952962875366, + "learning_rate": 1.0478735291000313e-05, + "loss": 0.0103, + "step": 26814 + }, + { + "epoch": 0.53632, + "grad_norm": 0.39671656489372253, + "learning_rate": 1.0477340623884917e-05, + "loss": 0.1308, + "step": 26816 + }, + { + "epoch": 0.53636, + "grad_norm": 0.5671156644821167, + "learning_rate": 1.0475945947463522e-05, + "loss": 0.0065, + "step": 26818 + }, + { + "epoch": 0.5364, + "grad_norm": 0.0068065812811255455, + "learning_rate": 1.0474551261763315e-05, + "loss": 0.0028, + "step": 26820 + }, + { + "epoch": 0.53644, + "grad_norm": 0.02359149232506752, + "learning_rate": 1.0473156566811485e-05, + "loss": 0.038, + "step": 26822 + }, + { + "epoch": 0.53648, + "grad_norm": 0.05548625439405441, + "learning_rate": 1.0471761862635223e-05, + "loss": 0.0125, + "step": 26824 + }, + { + "epoch": 0.53652, + "grad_norm": 0.05280667543411255, + "learning_rate": 1.0470367149261722e-05, + "loss": 0.0428, + "step": 26826 + }, + { + "epoch": 0.53656, + "grad_norm": 1.2277755737304688, + "learning_rate": 1.0468972426718171e-05, + "loss": 0.0143, + "step": 26828 + }, + { + "epoch": 0.5366, + "grad_norm": 0.1072806790471077, + "learning_rate": 1.0467577695031763e-05, + "loss": 0.0021, + "step": 26830 + }, + { + "epoch": 0.53664, + "grad_norm": 2.5208253860473633, + "learning_rate": 1.0466182954229684e-05, + "loss": 0.0313, + "step": 26832 + }, + { + "epoch": 0.53668, + "grad_norm": 0.1986348032951355, + "learning_rate": 1.0464788204339132e-05, + "loss": 0.0065, + "step": 26834 + }, + { + "epoch": 0.53672, + "grad_norm": 0.03794122114777565, + "learning_rate": 1.0463393445387291e-05, + "loss": 0.0013, + "step": 26836 + }, + { + "epoch": 0.53676, + "grad_norm": 7.0868940353393555, + "learning_rate": 1.0461998677401359e-05, + "loss": 0.2082, + "step": 26838 + }, + { + "epoch": 0.5368, + "grad_norm": 0.01382212620228529, + "learning_rate": 1.0460603900408523e-05, + "loss": 0.0052, + "step": 26840 + }, + { + "epoch": 0.53684, + "grad_norm": 0.17006628215312958, + "learning_rate": 1.0459209114435977e-05, + "loss": 0.0046, + "step": 26842 + }, + { + "epoch": 0.53688, + "grad_norm": 0.08929680287837982, + "learning_rate": 1.0457814319510914e-05, + "loss": 0.0481, + "step": 26844 + }, + { + "epoch": 0.53692, + "grad_norm": 0.12657152116298676, + "learning_rate": 1.0456419515660525e-05, + "loss": 0.0038, + "step": 26846 + }, + { + "epoch": 0.53696, + "grad_norm": 4.013470649719238, + "learning_rate": 1.0455024702912002e-05, + "loss": 0.6353, + "step": 26848 + }, + { + "epoch": 0.537, + "grad_norm": 0.13649001717567444, + "learning_rate": 1.0453629881292537e-05, + "loss": 0.002, + "step": 26850 + }, + { + "epoch": 0.53704, + "grad_norm": 0.32332271337509155, + "learning_rate": 1.0452235050829326e-05, + "loss": 0.0069, + "step": 26852 + }, + { + "epoch": 0.53708, + "grad_norm": 0.01091464702039957, + "learning_rate": 1.0450840211549563e-05, + "loss": 0.0005, + "step": 26854 + }, + { + "epoch": 0.53712, + "grad_norm": 0.05819209665060043, + "learning_rate": 1.0449445363480433e-05, + "loss": 0.5691, + "step": 26856 + }, + { + "epoch": 0.53716, + "grad_norm": 0.1847330778837204, + "learning_rate": 1.0448050506649134e-05, + "loss": 0.1921, + "step": 26858 + }, + { + "epoch": 0.5372, + "grad_norm": 3.2470703125, + "learning_rate": 1.0446655641082864e-05, + "loss": 0.0401, + "step": 26860 + }, + { + "epoch": 0.53724, + "grad_norm": 0.08175677806138992, + "learning_rate": 1.0445260766808807e-05, + "loss": 0.0015, + "step": 26862 + }, + { + "epoch": 0.53728, + "grad_norm": 0.02038196101784706, + "learning_rate": 1.0443865883854165e-05, + "loss": 0.0106, + "step": 26864 + }, + { + "epoch": 0.53732, + "grad_norm": 0.04918184503912926, + "learning_rate": 1.0442470992246128e-05, + "loss": 0.0009, + "step": 26866 + }, + { + "epoch": 0.53736, + "grad_norm": 0.7314780354499817, + "learning_rate": 1.0441076092011893e-05, + "loss": 0.01, + "step": 26868 + }, + { + "epoch": 0.5374, + "grad_norm": 0.019521113485097885, + "learning_rate": 1.043968118317865e-05, + "loss": 0.0719, + "step": 26870 + }, + { + "epoch": 0.53744, + "grad_norm": 15.077009201049805, + "learning_rate": 1.0438286265773599e-05, + "loss": 0.5685, + "step": 26872 + }, + { + "epoch": 0.53748, + "grad_norm": 2.6114540100097656, + "learning_rate": 1.0436891339823928e-05, + "loss": 0.0753, + "step": 26874 + }, + { + "epoch": 0.53752, + "grad_norm": 0.05800892785191536, + "learning_rate": 1.043549640535684e-05, + "loss": 0.0013, + "step": 26876 + }, + { + "epoch": 0.53756, + "grad_norm": 0.09249979257583618, + "learning_rate": 1.0434101462399521e-05, + "loss": 0.0023, + "step": 26878 + }, + { + "epoch": 0.5376, + "grad_norm": 0.06464552134275436, + "learning_rate": 1.0432706510979172e-05, + "loss": 0.0089, + "step": 26880 + }, + { + "epoch": 0.53764, + "grad_norm": 0.028194110840559006, + "learning_rate": 1.0431311551122986e-05, + "loss": 0.1059, + "step": 26882 + }, + { + "epoch": 0.53768, + "grad_norm": 0.06307654082775116, + "learning_rate": 1.0429916582858159e-05, + "loss": 0.0008, + "step": 26884 + }, + { + "epoch": 0.53772, + "grad_norm": 0.49621468782424927, + "learning_rate": 1.042852160621189e-05, + "loss": 0.0595, + "step": 26886 + }, + { + "epoch": 0.53776, + "grad_norm": 0.25208669900894165, + "learning_rate": 1.0427126621211372e-05, + "loss": 0.0141, + "step": 26888 + }, + { + "epoch": 0.5378, + "grad_norm": 1.3454445600509644, + "learning_rate": 1.0425731627883798e-05, + "loss": 0.021, + "step": 26890 + }, + { + "epoch": 0.53784, + "grad_norm": 0.993198812007904, + "learning_rate": 1.0424336626256365e-05, + "loss": 0.0136, + "step": 26892 + }, + { + "epoch": 0.53788, + "grad_norm": 11.32917594909668, + "learning_rate": 1.0422941616356274e-05, + "loss": 0.2067, + "step": 26894 + }, + { + "epoch": 0.53792, + "grad_norm": 14.412635803222656, + "learning_rate": 1.0421546598210716e-05, + "loss": 0.3235, + "step": 26896 + }, + { + "epoch": 0.53796, + "grad_norm": 0.12481661885976791, + "learning_rate": 1.0420151571846893e-05, + "loss": 0.0734, + "step": 26898 + }, + { + "epoch": 0.538, + "grad_norm": 11.817384719848633, + "learning_rate": 1.0418756537291996e-05, + "loss": 0.3216, + "step": 26900 + }, + { + "epoch": 0.53804, + "grad_norm": 14.205918312072754, + "learning_rate": 1.0417361494573229e-05, + "loss": 1.1421, + "step": 26902 + }, + { + "epoch": 0.53808, + "grad_norm": 13.620315551757812, + "learning_rate": 1.041596644371778e-05, + "loss": 1.3395, + "step": 26904 + }, + { + "epoch": 0.53812, + "grad_norm": 0.422826886177063, + "learning_rate": 1.0414571384752854e-05, + "loss": 0.0054, + "step": 26906 + }, + { + "epoch": 0.53816, + "grad_norm": 3.1883087158203125, + "learning_rate": 1.0413176317705645e-05, + "loss": 0.0487, + "step": 26908 + }, + { + "epoch": 0.5382, + "grad_norm": 2.3425989151000977, + "learning_rate": 1.0411781242603352e-05, + "loss": 0.0404, + "step": 26910 + }, + { + "epoch": 0.53824, + "grad_norm": 1.1288100481033325, + "learning_rate": 1.0410386159473168e-05, + "loss": 0.0155, + "step": 26912 + }, + { + "epoch": 0.53828, + "grad_norm": 5.951961040496826, + "learning_rate": 1.0408991068342296e-05, + "loss": 0.1274, + "step": 26914 + }, + { + "epoch": 0.53832, + "grad_norm": 1.2351343631744385, + "learning_rate": 1.0407595969237931e-05, + "loss": 0.3373, + "step": 26916 + }, + { + "epoch": 0.53836, + "grad_norm": 0.12393581122159958, + "learning_rate": 1.0406200862187277e-05, + "loss": 0.006, + "step": 26918 + }, + { + "epoch": 0.5384, + "grad_norm": 0.09643212705850601, + "learning_rate": 1.0404805747217525e-05, + "loss": 0.0163, + "step": 26920 + }, + { + "epoch": 0.53844, + "grad_norm": 3.237812042236328, + "learning_rate": 1.0403410624355878e-05, + "loss": 0.0736, + "step": 26922 + }, + { + "epoch": 0.53848, + "grad_norm": 1.832435965538025, + "learning_rate": 1.0402015493629533e-05, + "loss": 0.0409, + "step": 26924 + }, + { + "epoch": 0.53852, + "grad_norm": 2.081012725830078, + "learning_rate": 1.0400620355065691e-05, + "loss": 0.0673, + "step": 26926 + }, + { + "epoch": 0.53856, + "grad_norm": 0.03153998777270317, + "learning_rate": 1.0399225208691546e-05, + "loss": 0.1161, + "step": 26928 + }, + { + "epoch": 0.5386, + "grad_norm": 9.90538215637207, + "learning_rate": 1.03978300545343e-05, + "loss": 0.4556, + "step": 26930 + }, + { + "epoch": 0.53864, + "grad_norm": 0.4740760028362274, + "learning_rate": 1.0396434892621152e-05, + "loss": 0.3823, + "step": 26932 + }, + { + "epoch": 0.53868, + "grad_norm": 0.31176042556762695, + "learning_rate": 1.0395039722979303e-05, + "loss": 0.1669, + "step": 26934 + }, + { + "epoch": 0.53872, + "grad_norm": 0.6280246376991272, + "learning_rate": 1.039364454563595e-05, + "loss": 0.0125, + "step": 26936 + }, + { + "epoch": 0.53876, + "grad_norm": 0.6490773558616638, + "learning_rate": 1.0392249360618296e-05, + "loss": 0.2144, + "step": 26938 + }, + { + "epoch": 0.5388, + "grad_norm": 2.8015286922454834, + "learning_rate": 1.0390854167953537e-05, + "loss": 0.061, + "step": 26940 + }, + { + "epoch": 0.53884, + "grad_norm": 0.4719173014163971, + "learning_rate": 1.0389458967668877e-05, + "loss": 0.0221, + "step": 26942 + }, + { + "epoch": 0.53888, + "grad_norm": 0.19596363604068756, + "learning_rate": 1.0388063759791512e-05, + "loss": 0.0364, + "step": 26944 + }, + { + "epoch": 0.53892, + "grad_norm": 0.3093459904193878, + "learning_rate": 1.0386668544348642e-05, + "loss": 0.1106, + "step": 26946 + }, + { + "epoch": 0.53896, + "grad_norm": 0.24900685250759125, + "learning_rate": 1.0385273321367473e-05, + "loss": 0.0054, + "step": 26948 + }, + { + "epoch": 0.539, + "grad_norm": 0.7571417093276978, + "learning_rate": 1.03838780908752e-05, + "loss": 0.0345, + "step": 26950 + }, + { + "epoch": 0.53904, + "grad_norm": 0.7199618220329285, + "learning_rate": 1.0382482852899027e-05, + "loss": 0.0134, + "step": 26952 + }, + { + "epoch": 0.53908, + "grad_norm": 0.04063424840569496, + "learning_rate": 1.0381087607466154e-05, + "loss": 0.0011, + "step": 26954 + }, + { + "epoch": 0.53912, + "grad_norm": 0.6411146521568298, + "learning_rate": 1.0379692354603784e-05, + "loss": 0.0141, + "step": 26956 + }, + { + "epoch": 0.53916, + "grad_norm": 0.9855934381484985, + "learning_rate": 1.0378297094339116e-05, + "loss": 0.0611, + "step": 26958 + }, + { + "epoch": 0.5392, + "grad_norm": 0.5699477195739746, + "learning_rate": 1.0376901826699349e-05, + "loss": 0.0225, + "step": 26960 + }, + { + "epoch": 0.53924, + "grad_norm": 0.16809101402759552, + "learning_rate": 1.0375506551711685e-05, + "loss": 0.0029, + "step": 26962 + }, + { + "epoch": 0.53928, + "grad_norm": 0.7330561876296997, + "learning_rate": 1.0374111269403328e-05, + "loss": 0.1862, + "step": 26964 + }, + { + "epoch": 0.53932, + "grad_norm": 4.518948078155518, + "learning_rate": 1.0372715979801485e-05, + "loss": 0.1091, + "step": 26966 + }, + { + "epoch": 0.53936, + "grad_norm": 9.75075912475586, + "learning_rate": 1.0371320682933346e-05, + "loss": 0.2881, + "step": 26968 + }, + { + "epoch": 0.5394, + "grad_norm": 1.1619610786437988, + "learning_rate": 1.036992537882612e-05, + "loss": 0.0356, + "step": 26970 + }, + { + "epoch": 0.53944, + "grad_norm": 0.10807998478412628, + "learning_rate": 1.0368530067507013e-05, + "loss": 0.2523, + "step": 26972 + }, + { + "epoch": 0.53948, + "grad_norm": 0.041611094027757645, + "learning_rate": 1.0367134749003216e-05, + "loss": 0.0531, + "step": 26974 + }, + { + "epoch": 0.53952, + "grad_norm": 7.0103302001953125, + "learning_rate": 1.0365739423341942e-05, + "loss": 0.3667, + "step": 26976 + }, + { + "epoch": 0.53956, + "grad_norm": 1.655321717262268, + "learning_rate": 1.0364344090550389e-05, + "loss": 0.4376, + "step": 26978 + }, + { + "epoch": 0.5396, + "grad_norm": 0.12761345505714417, + "learning_rate": 1.036294875065576e-05, + "loss": 0.004, + "step": 26980 + }, + { + "epoch": 0.53964, + "grad_norm": 9.105834007263184, + "learning_rate": 1.0361553403685258e-05, + "loss": 0.2844, + "step": 26982 + }, + { + "epoch": 0.53968, + "grad_norm": 0.47837772965431213, + "learning_rate": 1.0360158049666086e-05, + "loss": 0.0196, + "step": 26984 + }, + { + "epoch": 0.53972, + "grad_norm": 0.1427871137857437, + "learning_rate": 1.0358762688625447e-05, + "loss": 0.0088, + "step": 26986 + }, + { + "epoch": 0.53976, + "grad_norm": 0.4857753813266754, + "learning_rate": 1.0357367320590546e-05, + "loss": 0.0429, + "step": 26988 + }, + { + "epoch": 0.5398, + "grad_norm": 0.15699543058872223, + "learning_rate": 1.0355971945588586e-05, + "loss": 0.0552, + "step": 26990 + }, + { + "epoch": 0.53984, + "grad_norm": 0.18718573451042175, + "learning_rate": 1.035457656364677e-05, + "loss": 0.0066, + "step": 26992 + }, + { + "epoch": 0.53988, + "grad_norm": 0.19360795617103577, + "learning_rate": 1.03531811747923e-05, + "loss": 0.0228, + "step": 26994 + }, + { + "epoch": 0.53992, + "grad_norm": 1.8113163709640503, + "learning_rate": 1.0351785779052379e-05, + "loss": 0.0317, + "step": 26996 + }, + { + "epoch": 0.53996, + "grad_norm": 0.09536489099264145, + "learning_rate": 1.0350390376454216e-05, + "loss": 0.0049, + "step": 26998 + }, + { + "epoch": 0.54, + "grad_norm": 1.8036309480667114, + "learning_rate": 1.0348994967025012e-05, + "loss": 0.0514, + "step": 27000 + }, + { + "epoch": 0.54004, + "grad_norm": 0.33894258737564087, + "learning_rate": 1.0347599550791969e-05, + "loss": 0.0095, + "step": 27002 + }, + { + "epoch": 0.54008, + "grad_norm": 3.712040662765503, + "learning_rate": 1.0346204127782296e-05, + "loss": 0.0992, + "step": 27004 + }, + { + "epoch": 0.54012, + "grad_norm": 0.03873537480831146, + "learning_rate": 1.0344808698023197e-05, + "loss": 0.0588, + "step": 27006 + }, + { + "epoch": 0.54016, + "grad_norm": 0.390178382396698, + "learning_rate": 1.034341326154187e-05, + "loss": 0.0117, + "step": 27008 + }, + { + "epoch": 0.5402, + "grad_norm": 3.958425998687744, + "learning_rate": 1.034201781836553e-05, + "loss": 0.1225, + "step": 27010 + }, + { + "epoch": 0.54024, + "grad_norm": 0.33827072381973267, + "learning_rate": 1.0340622368521373e-05, + "loss": 0.1563, + "step": 27012 + }, + { + "epoch": 0.54028, + "grad_norm": 1.5615297555923462, + "learning_rate": 1.0339226912036609e-05, + "loss": 0.0709, + "step": 27014 + }, + { + "epoch": 0.54032, + "grad_norm": 10.850151062011719, + "learning_rate": 1.033783144893844e-05, + "loss": 0.2891, + "step": 27016 + }, + { + "epoch": 0.54036, + "grad_norm": 0.7685954570770264, + "learning_rate": 1.0336435979254074e-05, + "loss": 0.5598, + "step": 27018 + }, + { + "epoch": 0.5404, + "grad_norm": 2.0856435298919678, + "learning_rate": 1.0335040503010715e-05, + "loss": 0.0714, + "step": 27020 + }, + { + "epoch": 0.54044, + "grad_norm": 1.5451316833496094, + "learning_rate": 1.033364502023557e-05, + "loss": 0.2096, + "step": 27022 + }, + { + "epoch": 0.54048, + "grad_norm": 0.023893553763628006, + "learning_rate": 1.0332249530955845e-05, + "loss": 0.0048, + "step": 27024 + }, + { + "epoch": 0.54052, + "grad_norm": 1.481833815574646, + "learning_rate": 1.0330854035198743e-05, + "loss": 0.0951, + "step": 27026 + }, + { + "epoch": 0.54056, + "grad_norm": 1.4732143878936768, + "learning_rate": 1.032945853299147e-05, + "loss": 0.1302, + "step": 27028 + }, + { + "epoch": 0.5406, + "grad_norm": 0.3572692573070526, + "learning_rate": 1.0328063024361232e-05, + "loss": 0.0239, + "step": 27030 + }, + { + "epoch": 0.54064, + "grad_norm": 0.24485038220882416, + "learning_rate": 1.032666750933524e-05, + "loss": 0.0319, + "step": 27032 + }, + { + "epoch": 0.54068, + "grad_norm": 0.23289477825164795, + "learning_rate": 1.0325271987940695e-05, + "loss": 0.0563, + "step": 27034 + }, + { + "epoch": 0.54072, + "grad_norm": 0.553931713104248, + "learning_rate": 1.0323876460204805e-05, + "loss": 0.0429, + "step": 27036 + }, + { + "epoch": 0.54076, + "grad_norm": 0.0713791474699974, + "learning_rate": 1.0322480926154774e-05, + "loss": 0.0086, + "step": 27038 + }, + { + "epoch": 0.5408, + "grad_norm": 0.38116654753685, + "learning_rate": 1.0321085385817818e-05, + "loss": 0.0151, + "step": 27040 + }, + { + "epoch": 0.54084, + "grad_norm": 0.3474423885345459, + "learning_rate": 1.0319689839221132e-05, + "loss": 0.0214, + "step": 27042 + }, + { + "epoch": 0.54088, + "grad_norm": 3.488801956176758, + "learning_rate": 1.031829428639193e-05, + "loss": 0.0919, + "step": 27044 + }, + { + "epoch": 0.54092, + "grad_norm": 0.08675214648246765, + "learning_rate": 1.0316898727357416e-05, + "loss": 0.0164, + "step": 27046 + }, + { + "epoch": 0.54096, + "grad_norm": 0.06958648562431335, + "learning_rate": 1.03155031621448e-05, + "loss": 0.0436, + "step": 27048 + }, + { + "epoch": 0.541, + "grad_norm": 3.03411602973938, + "learning_rate": 1.0314107590781284e-05, + "loss": 0.0601, + "step": 27050 + }, + { + "epoch": 0.54104, + "grad_norm": 0.08416260033845901, + "learning_rate": 1.0312712013294082e-05, + "loss": 0.0088, + "step": 27052 + }, + { + "epoch": 0.54108, + "grad_norm": 0.4267222583293915, + "learning_rate": 1.0311316429710395e-05, + "loss": 0.0081, + "step": 27054 + }, + { + "epoch": 0.54112, + "grad_norm": 0.2567064166069031, + "learning_rate": 1.0309920840057437e-05, + "loss": 0.0117, + "step": 27056 + }, + { + "epoch": 0.54116, + "grad_norm": 3.424328565597534, + "learning_rate": 1.030852524436241e-05, + "loss": 0.0955, + "step": 27058 + }, + { + "epoch": 0.5412, + "grad_norm": 4.190125942230225, + "learning_rate": 1.030712964265253e-05, + "loss": 0.0767, + "step": 27060 + }, + { + "epoch": 0.54124, + "grad_norm": 4.921823978424072, + "learning_rate": 1.0305734034954993e-05, + "loss": 0.1029, + "step": 27062 + }, + { + "epoch": 0.54128, + "grad_norm": 0.6272897720336914, + "learning_rate": 1.0304338421297018e-05, + "loss": 0.0321, + "step": 27064 + }, + { + "epoch": 0.54132, + "grad_norm": 3.5116331577301025, + "learning_rate": 1.0302942801705806e-05, + "loss": 0.1186, + "step": 27066 + }, + { + "epoch": 0.54136, + "grad_norm": 0.15841183066368103, + "learning_rate": 1.0301547176208569e-05, + "loss": 0.005, + "step": 27068 + }, + { + "epoch": 0.5414, + "grad_norm": 0.9805967807769775, + "learning_rate": 1.0300151544832513e-05, + "loss": 0.0382, + "step": 27070 + }, + { + "epoch": 0.54144, + "grad_norm": 0.877571702003479, + "learning_rate": 1.0298755907604852e-05, + "loss": 0.0345, + "step": 27072 + }, + { + "epoch": 0.54148, + "grad_norm": 0.29760053753852844, + "learning_rate": 1.0297360264552787e-05, + "loss": 0.0084, + "step": 27074 + }, + { + "epoch": 0.54152, + "grad_norm": 4.111742973327637, + "learning_rate": 1.0295964615703532e-05, + "loss": 0.0659, + "step": 27076 + }, + { + "epoch": 0.54156, + "grad_norm": 1.636993646621704, + "learning_rate": 1.0294568961084298e-05, + "loss": 0.0299, + "step": 27078 + }, + { + "epoch": 0.5416, + "grad_norm": 0.23189543187618256, + "learning_rate": 1.0293173300722286e-05, + "loss": 0.0109, + "step": 27080 + }, + { + "epoch": 0.54164, + "grad_norm": 0.742379367351532, + "learning_rate": 1.029177763464471e-05, + "loss": 0.0128, + "step": 27082 + }, + { + "epoch": 0.54168, + "grad_norm": 0.2575025260448456, + "learning_rate": 1.0290381962878779e-05, + "loss": 0.1299, + "step": 27084 + }, + { + "epoch": 0.54172, + "grad_norm": 0.028026310727000237, + "learning_rate": 1.0288986285451702e-05, + "loss": 0.0016, + "step": 27086 + }, + { + "epoch": 0.54176, + "grad_norm": 0.1983443647623062, + "learning_rate": 1.0287590602390686e-05, + "loss": 0.0224, + "step": 27088 + }, + { + "epoch": 0.5418, + "grad_norm": 0.24549387395381927, + "learning_rate": 1.0286194913722948e-05, + "loss": 0.004, + "step": 27090 + }, + { + "epoch": 0.54184, + "grad_norm": 0.48601388931274414, + "learning_rate": 1.0284799219475692e-05, + "loss": 0.0091, + "step": 27092 + }, + { + "epoch": 0.54188, + "grad_norm": 0.15503400564193726, + "learning_rate": 1.028340351967613e-05, + "loss": 0.0046, + "step": 27094 + }, + { + "epoch": 0.54192, + "grad_norm": 0.03004273772239685, + "learning_rate": 1.0282007814351467e-05, + "loss": 0.0025, + "step": 27096 + }, + { + "epoch": 0.54196, + "grad_norm": 0.5641649961471558, + "learning_rate": 1.0280612103528918e-05, + "loss": 0.0405, + "step": 27098 + }, + { + "epoch": 0.542, + "grad_norm": 0.9370047450065613, + "learning_rate": 1.0279216387235691e-05, + "loss": 0.0525, + "step": 27100 + }, + { + "epoch": 0.54204, + "grad_norm": 0.04926246777176857, + "learning_rate": 1.0277820665498997e-05, + "loss": 0.2676, + "step": 27102 + }, + { + "epoch": 0.54208, + "grad_norm": 4.274174690246582, + "learning_rate": 1.0276424938346044e-05, + "loss": 0.1113, + "step": 27104 + }, + { + "epoch": 0.54212, + "grad_norm": 0.045342907309532166, + "learning_rate": 1.0275029205804048e-05, + "loss": 0.0111, + "step": 27106 + }, + { + "epoch": 0.54216, + "grad_norm": 0.6830602884292603, + "learning_rate": 1.0273633467900215e-05, + "loss": 0.0103, + "step": 27108 + }, + { + "epoch": 0.5422, + "grad_norm": 0.24211256206035614, + "learning_rate": 1.0272237724661753e-05, + "loss": 0.0049, + "step": 27110 + }, + { + "epoch": 0.54224, + "grad_norm": 0.02118796296417713, + "learning_rate": 1.0270841976115882e-05, + "loss": 0.0023, + "step": 27112 + }, + { + "epoch": 0.54228, + "grad_norm": 0.36675748229026794, + "learning_rate": 1.0269446222289807e-05, + "loss": 0.0108, + "step": 27114 + }, + { + "epoch": 0.54232, + "grad_norm": 0.2980620861053467, + "learning_rate": 1.0268050463210737e-05, + "loss": 0.0094, + "step": 27116 + }, + { + "epoch": 0.54236, + "grad_norm": 0.23340721428394318, + "learning_rate": 1.0266654698905882e-05, + "loss": 0.0054, + "step": 27118 + }, + { + "epoch": 0.5424, + "grad_norm": 0.21352998912334442, + "learning_rate": 1.026525892940246e-05, + "loss": 0.0751, + "step": 27120 + }, + { + "epoch": 0.54244, + "grad_norm": 0.3078934848308563, + "learning_rate": 1.026386315472768e-05, + "loss": 0.0072, + "step": 27122 + }, + { + "epoch": 0.54248, + "grad_norm": 1.5694891214370728, + "learning_rate": 1.026246737490875e-05, + "loss": 0.3847, + "step": 27124 + }, + { + "epoch": 0.54252, + "grad_norm": 0.30287981033325195, + "learning_rate": 1.0261071589972883e-05, + "loss": 0.0045, + "step": 27126 + }, + { + "epoch": 0.54256, + "grad_norm": 2.196620225906372, + "learning_rate": 1.0259675799947293e-05, + "loss": 0.1262, + "step": 27128 + }, + { + "epoch": 0.5426, + "grad_norm": 0.009001532569527626, + "learning_rate": 1.0258280004859189e-05, + "loss": 0.0026, + "step": 27130 + }, + { + "epoch": 0.54264, + "grad_norm": 0.3074134290218353, + "learning_rate": 1.0256884204735783e-05, + "loss": 0.0052, + "step": 27132 + }, + { + "epoch": 0.54268, + "grad_norm": 2.4177393913269043, + "learning_rate": 1.0255488399604285e-05, + "loss": 0.035, + "step": 27134 + }, + { + "epoch": 0.54272, + "grad_norm": 0.04773857071995735, + "learning_rate": 1.025409258949191e-05, + "loss": 0.0089, + "step": 27136 + }, + { + "epoch": 0.54276, + "grad_norm": 0.2259228527545929, + "learning_rate": 1.025269677442587e-05, + "loss": 0.0088, + "step": 27138 + }, + { + "epoch": 0.5428, + "grad_norm": 0.2157992720603943, + "learning_rate": 1.0251300954433377e-05, + "loss": 0.0067, + "step": 27140 + }, + { + "epoch": 0.54284, + "grad_norm": 0.18575741350650787, + "learning_rate": 1.024990512954164e-05, + "loss": 0.0035, + "step": 27142 + }, + { + "epoch": 0.54288, + "grad_norm": 0.037247803062200546, + "learning_rate": 1.0248509299777875e-05, + "loss": 0.0033, + "step": 27144 + }, + { + "epoch": 0.54292, + "grad_norm": 0.44183817505836487, + "learning_rate": 1.0247113465169296e-05, + "loss": 0.0079, + "step": 27146 + }, + { + "epoch": 0.54296, + "grad_norm": 0.2869946360588074, + "learning_rate": 1.024571762574311e-05, + "loss": 0.0836, + "step": 27148 + }, + { + "epoch": 0.543, + "grad_norm": 7.434078216552734, + "learning_rate": 1.0244321781526533e-05, + "loss": 0.1807, + "step": 27150 + }, + { + "epoch": 0.54304, + "grad_norm": 0.12843844294548035, + "learning_rate": 1.0242925932546776e-05, + "loss": 0.021, + "step": 27152 + }, + { + "epoch": 0.54308, + "grad_norm": 0.06719369441270828, + "learning_rate": 1.0241530078831056e-05, + "loss": 0.0159, + "step": 27154 + }, + { + "epoch": 0.54312, + "grad_norm": 0.4231438934803009, + "learning_rate": 1.0240134220406579e-05, + "loss": 0.007, + "step": 27156 + }, + { + "epoch": 0.54316, + "grad_norm": 1.6186177730560303, + "learning_rate": 1.0238738357300564e-05, + "loss": 0.0388, + "step": 27158 + }, + { + "epoch": 0.5432, + "grad_norm": 6.74821138381958, + "learning_rate": 1.0237342489540221e-05, + "loss": 0.1667, + "step": 27160 + }, + { + "epoch": 0.54324, + "grad_norm": 5.554576396942139, + "learning_rate": 1.0235946617152766e-05, + "loss": 0.117, + "step": 27162 + }, + { + "epoch": 0.54328, + "grad_norm": 0.12885163724422455, + "learning_rate": 1.0234550740165408e-05, + "loss": 0.0036, + "step": 27164 + }, + { + "epoch": 0.54332, + "grad_norm": 0.45921921730041504, + "learning_rate": 1.0233154858605363e-05, + "loss": 0.009, + "step": 27166 + }, + { + "epoch": 0.54336, + "grad_norm": 0.13794144988059998, + "learning_rate": 1.0231758972499844e-05, + "loss": 0.003, + "step": 27168 + }, + { + "epoch": 0.5434, + "grad_norm": 6.1237616539001465, + "learning_rate": 1.0230363081876065e-05, + "loss": 0.1057, + "step": 27170 + }, + { + "epoch": 0.54344, + "grad_norm": 1.7540812492370605, + "learning_rate": 1.0228967186761239e-05, + "loss": 0.0286, + "step": 27172 + }, + { + "epoch": 0.54348, + "grad_norm": 5.797973155975342, + "learning_rate": 1.0227571287182578e-05, + "loss": 0.1057, + "step": 27174 + }, + { + "epoch": 0.54352, + "grad_norm": 0.15003328025341034, + "learning_rate": 1.0226175383167299e-05, + "loss": 0.1779, + "step": 27176 + }, + { + "epoch": 0.54356, + "grad_norm": 0.008264578878879547, + "learning_rate": 1.0224779474742614e-05, + "loss": 0.0011, + "step": 27178 + }, + { + "epoch": 0.5436, + "grad_norm": 0.549720048904419, + "learning_rate": 1.0223383561935738e-05, + "loss": 0.0095, + "step": 27180 + }, + { + "epoch": 0.54364, + "grad_norm": 0.03396516665816307, + "learning_rate": 1.0221987644773886e-05, + "loss": 0.0018, + "step": 27182 + }, + { + "epoch": 0.54368, + "grad_norm": 0.36986809968948364, + "learning_rate": 1.0220591723284268e-05, + "loss": 0.006, + "step": 27184 + }, + { + "epoch": 0.54372, + "grad_norm": 0.6337360143661499, + "learning_rate": 1.0219195797494103e-05, + "loss": 0.0096, + "step": 27186 + }, + { + "epoch": 0.54376, + "grad_norm": 0.08721047639846802, + "learning_rate": 1.0217799867430603e-05, + "loss": 0.018, + "step": 27188 + }, + { + "epoch": 0.5438, + "grad_norm": 2.274035930633545, + "learning_rate": 1.0216403933120979e-05, + "loss": 0.0355, + "step": 27190 + }, + { + "epoch": 0.54384, + "grad_norm": 0.4108924865722656, + "learning_rate": 1.021500799459245e-05, + "loss": 0.0176, + "step": 27192 + }, + { + "epoch": 0.54388, + "grad_norm": 0.43853914737701416, + "learning_rate": 1.0213612051872231e-05, + "loss": 0.0064, + "step": 27194 + }, + { + "epoch": 0.54392, + "grad_norm": 0.18094418942928314, + "learning_rate": 1.0212216104987536e-05, + "loss": 0.009, + "step": 27196 + }, + { + "epoch": 0.54396, + "grad_norm": 0.47008398175239563, + "learning_rate": 1.0210820153965577e-05, + "loss": 0.0848, + "step": 27198 + }, + { + "epoch": 0.544, + "grad_norm": 4.326330184936523, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.1063, + "step": 27200 + }, + { + "epoch": 0.54404, + "grad_norm": 0.06836646795272827, + "learning_rate": 1.0208028239618732e-05, + "loss": 0.0043, + "step": 27202 + }, + { + "epoch": 0.54408, + "grad_norm": 10.941441535949707, + "learning_rate": 1.0206632276348278e-05, + "loss": 0.268, + "step": 27204 + }, + { + "epoch": 0.54412, + "grad_norm": 1.471898078918457, + "learning_rate": 1.0205236309049417e-05, + "loss": 0.022, + "step": 27206 + }, + { + "epoch": 0.54416, + "grad_norm": 0.16075968742370605, + "learning_rate": 1.0203840337749371e-05, + "loss": 0.0032, + "step": 27208 + }, + { + "epoch": 0.5442, + "grad_norm": 0.24625827372074127, + "learning_rate": 1.0202444362475352e-05, + "loss": 0.0372, + "step": 27210 + }, + { + "epoch": 0.54424, + "grad_norm": 0.3929317593574524, + "learning_rate": 1.0201048383254578e-05, + "loss": 0.0111, + "step": 27212 + }, + { + "epoch": 0.54428, + "grad_norm": 4.805552959442139, + "learning_rate": 1.019965240011426e-05, + "loss": 0.1005, + "step": 27214 + }, + { + "epoch": 0.54432, + "grad_norm": 0.06613735854625702, + "learning_rate": 1.0198256413081617e-05, + "loss": 0.0727, + "step": 27216 + }, + { + "epoch": 0.54436, + "grad_norm": 0.10192946344614029, + "learning_rate": 1.0196860422183862e-05, + "loss": 0.0063, + "step": 27218 + }, + { + "epoch": 0.5444, + "grad_norm": 0.12793593108654022, + "learning_rate": 1.0195464427448213e-05, + "loss": 0.0229, + "step": 27220 + }, + { + "epoch": 0.54444, + "grad_norm": 0.07373897731304169, + "learning_rate": 1.0194068428901882e-05, + "loss": 0.0027, + "step": 27222 + }, + { + "epoch": 0.54448, + "grad_norm": 0.07925815135240555, + "learning_rate": 1.0192672426572087e-05, + "loss": 0.0029, + "step": 27224 + }, + { + "epoch": 0.54452, + "grad_norm": 0.5501324534416199, + "learning_rate": 1.0191276420486047e-05, + "loss": 0.0773, + "step": 27226 + }, + { + "epoch": 0.54456, + "grad_norm": 0.25313642621040344, + "learning_rate": 1.0189880410670974e-05, + "loss": 0.0149, + "step": 27228 + }, + { + "epoch": 0.5446, + "grad_norm": 0.8686872124671936, + "learning_rate": 1.0188484397154083e-05, + "loss": 0.5399, + "step": 27230 + }, + { + "epoch": 0.54464, + "grad_norm": 0.12196207791566849, + "learning_rate": 1.0187088379962593e-05, + "loss": 0.0019, + "step": 27232 + }, + { + "epoch": 0.54468, + "grad_norm": 0.0033821614924818277, + "learning_rate": 1.0185692359123717e-05, + "loss": 0.0104, + "step": 27234 + }, + { + "epoch": 0.54472, + "grad_norm": 0.031718458980321884, + "learning_rate": 1.0184296334664677e-05, + "loss": 0.0054, + "step": 27236 + }, + { + "epoch": 0.54476, + "grad_norm": 0.1188858300447464, + "learning_rate": 1.018290030661268e-05, + "loss": 0.0026, + "step": 27238 + }, + { + "epoch": 0.5448, + "grad_norm": 0.42435112595558167, + "learning_rate": 1.0181504274994949e-05, + "loss": 0.0064, + "step": 27240 + }, + { + "epoch": 0.54484, + "grad_norm": 1.4208176136016846, + "learning_rate": 1.0180108239838698e-05, + "loss": 0.0159, + "step": 27242 + }, + { + "epoch": 0.54488, + "grad_norm": 2.7152764797210693, + "learning_rate": 1.0178712201171147e-05, + "loss": 0.062, + "step": 27244 + }, + { + "epoch": 0.54492, + "grad_norm": 4.366142749786377, + "learning_rate": 1.0177316159019508e-05, + "loss": 0.1175, + "step": 27246 + }, + { + "epoch": 0.54496, + "grad_norm": 0.06299091875553131, + "learning_rate": 1.0175920113410998e-05, + "loss": 0.2676, + "step": 27248 + }, + { + "epoch": 0.545, + "grad_norm": 1.1219062805175781, + "learning_rate": 1.0174524064372837e-05, + "loss": 0.0359, + "step": 27250 + }, + { + "epoch": 0.54504, + "grad_norm": 2.609570264816284, + "learning_rate": 1.017312801193224e-05, + "loss": 0.0274, + "step": 27252 + }, + { + "epoch": 0.54508, + "grad_norm": 0.04521966353058815, + "learning_rate": 1.017173195611642e-05, + "loss": 0.0005, + "step": 27254 + }, + { + "epoch": 0.54512, + "grad_norm": 0.10649070143699646, + "learning_rate": 1.0170335896952599e-05, + "loss": 0.0017, + "step": 27256 + }, + { + "epoch": 0.54516, + "grad_norm": 0.7401922345161438, + "learning_rate": 1.016893983446799e-05, + "loss": 0.0089, + "step": 27258 + }, + { + "epoch": 0.5452, + "grad_norm": 0.0033348107244819403, + "learning_rate": 1.0167543768689816e-05, + "loss": 0.0422, + "step": 27260 + }, + { + "epoch": 0.54524, + "grad_norm": 3.531507968902588, + "learning_rate": 1.0166147699645285e-05, + "loss": 0.0622, + "step": 27262 + }, + { + "epoch": 0.54528, + "grad_norm": 0.014988877810537815, + "learning_rate": 1.0164751627361623e-05, + "loss": 0.0005, + "step": 27264 + }, + { + "epoch": 0.54532, + "grad_norm": 2.8219640254974365, + "learning_rate": 1.0163355551866043e-05, + "loss": 0.0388, + "step": 27266 + }, + { + "epoch": 0.54536, + "grad_norm": 6.308206558227539, + "learning_rate": 1.016195947318576e-05, + "loss": 0.1385, + "step": 27268 + }, + { + "epoch": 0.5454, + "grad_norm": 0.1850086897611618, + "learning_rate": 1.0160563391347998e-05, + "loss": 0.0036, + "step": 27270 + }, + { + "epoch": 0.54544, + "grad_norm": 0.017400259152054787, + "learning_rate": 1.0159167306379964e-05, + "loss": 0.0059, + "step": 27272 + }, + { + "epoch": 0.54548, + "grad_norm": 0.013216017745435238, + "learning_rate": 1.0157771218308885e-05, + "loss": 0.0032, + "step": 27274 + }, + { + "epoch": 0.54552, + "grad_norm": 0.3324415683746338, + "learning_rate": 1.0156375127161976e-05, + "loss": 0.0089, + "step": 27276 + }, + { + "epoch": 0.54556, + "grad_norm": 1.8168026208877563, + "learning_rate": 1.015497903296645e-05, + "loss": 0.4376, + "step": 27278 + }, + { + "epoch": 0.5456, + "grad_norm": 0.4428303837776184, + "learning_rate": 1.0153582935749531e-05, + "loss": 0.0058, + "step": 27280 + }, + { + "epoch": 0.54564, + "grad_norm": 0.030530376359820366, + "learning_rate": 1.0152186835538431e-05, + "loss": 0.0086, + "step": 27282 + }, + { + "epoch": 0.54568, + "grad_norm": 0.3100105822086334, + "learning_rate": 1.0150790732360372e-05, + "loss": 0.0094, + "step": 27284 + }, + { + "epoch": 0.54572, + "grad_norm": 0.5219314694404602, + "learning_rate": 1.0149394626242573e-05, + "loss": 0.0138, + "step": 27286 + }, + { + "epoch": 0.54576, + "grad_norm": 0.18200355768203735, + "learning_rate": 1.0147998517212242e-05, + "loss": 0.0033, + "step": 27288 + }, + { + "epoch": 0.5458, + "grad_norm": 0.6633211374282837, + "learning_rate": 1.0146602405296608e-05, + "loss": 0.2786, + "step": 27290 + }, + { + "epoch": 0.54584, + "grad_norm": 0.04681920260190964, + "learning_rate": 1.0145206290522884e-05, + "loss": 0.0078, + "step": 27292 + }, + { + "epoch": 0.54588, + "grad_norm": 0.4597190320491791, + "learning_rate": 1.014381017291829e-05, + "loss": 0.0176, + "step": 27294 + }, + { + "epoch": 0.54592, + "grad_norm": 2.292119264602661, + "learning_rate": 1.0142414052510039e-05, + "loss": 0.0329, + "step": 27296 + }, + { + "epoch": 0.54596, + "grad_norm": 0.03962758928537369, + "learning_rate": 1.0141017929325354e-05, + "loss": 0.0013, + "step": 27298 + }, + { + "epoch": 0.546, + "grad_norm": 0.2898590564727783, + "learning_rate": 1.0139621803391454e-05, + "loss": 0.0046, + "step": 27300 + }, + { + "epoch": 0.54604, + "grad_norm": 0.04193664342164993, + "learning_rate": 1.0138225674735554e-05, + "loss": 0.0139, + "step": 27302 + }, + { + "epoch": 0.54608, + "grad_norm": 0.01423686370253563, + "learning_rate": 1.0136829543384873e-05, + "loss": 0.0084, + "step": 27304 + }, + { + "epoch": 0.54612, + "grad_norm": 0.06268034875392914, + "learning_rate": 1.013543340936663e-05, + "loss": 0.0034, + "step": 27306 + }, + { + "epoch": 0.54616, + "grad_norm": 0.004920764360576868, + "learning_rate": 1.0134037272708042e-05, + "loss": 0.0118, + "step": 27308 + }, + { + "epoch": 0.5462, + "grad_norm": 3.161939859390259, + "learning_rate": 1.013264113343633e-05, + "loss": 0.031, + "step": 27310 + }, + { + "epoch": 0.54624, + "grad_norm": 0.06511238217353821, + "learning_rate": 1.0131244991578709e-05, + "loss": 0.1274, + "step": 27312 + }, + { + "epoch": 0.54628, + "grad_norm": 8.261418342590332, + "learning_rate": 1.01298488471624e-05, + "loss": 0.1266, + "step": 27314 + }, + { + "epoch": 0.54632, + "grad_norm": 0.04934012144804001, + "learning_rate": 1.0128452700214619e-05, + "loss": 0.0043, + "step": 27316 + }, + { + "epoch": 0.54636, + "grad_norm": 0.6835076808929443, + "learning_rate": 1.012705655076259e-05, + "loss": 0.0135, + "step": 27318 + }, + { + "epoch": 0.5464, + "grad_norm": 0.47263842821121216, + "learning_rate": 1.0125660398833528e-05, + "loss": 0.0061, + "step": 27320 + }, + { + "epoch": 0.54644, + "grad_norm": 0.014340730383992195, + "learning_rate": 1.0124264244454651e-05, + "loss": 0.0008, + "step": 27322 + }, + { + "epoch": 0.54648, + "grad_norm": 1.7879259586334229, + "learning_rate": 1.0122868087653177e-05, + "loss": 0.0227, + "step": 27324 + }, + { + "epoch": 0.54652, + "grad_norm": 0.23972240090370178, + "learning_rate": 1.0121471928456327e-05, + "loss": 0.004, + "step": 27326 + }, + { + "epoch": 0.54656, + "grad_norm": 0.033365581184625626, + "learning_rate": 1.012007576689132e-05, + "loss": 0.072, + "step": 27328 + }, + { + "epoch": 0.5466, + "grad_norm": 10.557865142822266, + "learning_rate": 1.0118679602985373e-05, + "loss": 0.4418, + "step": 27330 + }, + { + "epoch": 0.54664, + "grad_norm": 10.986113548278809, + "learning_rate": 1.0117283436765706e-05, + "loss": 0.3585, + "step": 27332 + }, + { + "epoch": 0.54668, + "grad_norm": 0.04446001350879669, + "learning_rate": 1.0115887268259541e-05, + "loss": 0.0087, + "step": 27334 + }, + { + "epoch": 0.54672, + "grad_norm": 0.017744531854987144, + "learning_rate": 1.011449109749409e-05, + "loss": 0.0151, + "step": 27336 + }, + { + "epoch": 0.54676, + "grad_norm": 0.7034832835197449, + "learning_rate": 1.011309492449658e-05, + "loss": 0.0125, + "step": 27338 + }, + { + "epoch": 0.5468, + "grad_norm": 0.15131209790706635, + "learning_rate": 1.0111698749294223e-05, + "loss": 0.0056, + "step": 27340 + }, + { + "epoch": 0.54684, + "grad_norm": 0.4439794719219208, + "learning_rate": 1.0110302571914243e-05, + "loss": 0.0083, + "step": 27342 + }, + { + "epoch": 0.54688, + "grad_norm": 0.022544462233781815, + "learning_rate": 1.0108906392383858e-05, + "loss": 0.0649, + "step": 27344 + }, + { + "epoch": 0.54692, + "grad_norm": 16.166032791137695, + "learning_rate": 1.0107510210730284e-05, + "loss": 0.3234, + "step": 27346 + }, + { + "epoch": 0.54696, + "grad_norm": 0.09564830362796783, + "learning_rate": 1.0106114026980745e-05, + "loss": 0.0012, + "step": 27348 + }, + { + "epoch": 0.547, + "grad_norm": 0.044392526149749756, + "learning_rate": 1.010471784116246e-05, + "loss": 0.1902, + "step": 27350 + }, + { + "epoch": 0.54704, + "grad_norm": 0.06674259901046753, + "learning_rate": 1.0103321653302644e-05, + "loss": 0.0014, + "step": 27352 + }, + { + "epoch": 0.54708, + "grad_norm": 5.363626480102539, + "learning_rate": 1.0101925463428521e-05, + "loss": 0.1173, + "step": 27354 + }, + { + "epoch": 0.54712, + "grad_norm": 0.11553719639778137, + "learning_rate": 1.0100529271567308e-05, + "loss": 0.0975, + "step": 27356 + }, + { + "epoch": 0.54716, + "grad_norm": 3.1834158897399902, + "learning_rate": 1.0099133077746227e-05, + "loss": 0.0355, + "step": 27358 + }, + { + "epoch": 0.5472, + "grad_norm": 0.3144366443157196, + "learning_rate": 1.0097736881992492e-05, + "loss": 0.0058, + "step": 27360 + }, + { + "epoch": 0.54724, + "grad_norm": 0.08310475200414658, + "learning_rate": 1.0096340684333329e-05, + "loss": 0.0094, + "step": 27362 + }, + { + "epoch": 0.54728, + "grad_norm": 0.021636726334691048, + "learning_rate": 1.0094944484795952e-05, + "loss": 0.0423, + "step": 27364 + }, + { + "epoch": 0.54732, + "grad_norm": 0.003261483972892165, + "learning_rate": 1.0093548283407585e-05, + "loss": 0.0525, + "step": 27366 + }, + { + "epoch": 0.54736, + "grad_norm": 0.01996905542910099, + "learning_rate": 1.0092152080195445e-05, + "loss": 0.0047, + "step": 27368 + }, + { + "epoch": 0.5474, + "grad_norm": 0.1632719188928604, + "learning_rate": 1.0090755875186752e-05, + "loss": 0.0547, + "step": 27370 + }, + { + "epoch": 0.54744, + "grad_norm": 3.93467116355896, + "learning_rate": 1.0089359668408732e-05, + "loss": 0.0984, + "step": 27372 + }, + { + "epoch": 0.54748, + "grad_norm": 8.605527877807617, + "learning_rate": 1.0087963459888596e-05, + "loss": 0.1163, + "step": 27374 + }, + { + "epoch": 0.54752, + "grad_norm": 8.628336906433105, + "learning_rate": 1.0086567249653567e-05, + "loss": 0.1679, + "step": 27376 + }, + { + "epoch": 0.54756, + "grad_norm": 0.0722033753991127, + "learning_rate": 1.0085171037730863e-05, + "loss": 0.0014, + "step": 27378 + }, + { + "epoch": 0.5476, + "grad_norm": 0.04335404559969902, + "learning_rate": 1.0083774824147707e-05, + "loss": 0.011, + "step": 27380 + }, + { + "epoch": 0.54764, + "grad_norm": 0.007563580758869648, + "learning_rate": 1.008237860893132e-05, + "loss": 0.0009, + "step": 27382 + }, + { + "epoch": 0.54768, + "grad_norm": 0.12607663869857788, + "learning_rate": 1.0080982392108915e-05, + "loss": 0.0043, + "step": 27384 + }, + { + "epoch": 0.54772, + "grad_norm": 0.009282215498387814, + "learning_rate": 1.0079586173707719e-05, + "loss": 0.0105, + "step": 27386 + }, + { + "epoch": 0.54776, + "grad_norm": 1.6827800273895264, + "learning_rate": 1.0078189953754951e-05, + "loss": 0.0265, + "step": 27388 + }, + { + "epoch": 0.5478, + "grad_norm": 0.12199721485376358, + "learning_rate": 1.007679373227783e-05, + "loss": 0.0121, + "step": 27390 + }, + { + "epoch": 0.54784, + "grad_norm": 0.012153131887316704, + "learning_rate": 1.0075397509303573e-05, + "loss": 0.0065, + "step": 27392 + }, + { + "epoch": 0.54788, + "grad_norm": 0.017807921394705772, + "learning_rate": 1.0074001284859403e-05, + "loss": 0.0007, + "step": 27394 + }, + { + "epoch": 0.54792, + "grad_norm": 0.0258171446621418, + "learning_rate": 1.007260505897254e-05, + "loss": 0.0007, + "step": 27396 + }, + { + "epoch": 0.54796, + "grad_norm": 0.18653331696987152, + "learning_rate": 1.0071208831670204e-05, + "loss": 0.0213, + "step": 27398 + }, + { + "epoch": 0.548, + "grad_norm": 0.020504863932728767, + "learning_rate": 1.0069812602979617e-05, + "loss": 0.138, + "step": 27400 + }, + { + "epoch": 0.54804, + "grad_norm": 0.13594962656497955, + "learning_rate": 1.0068416372927995e-05, + "loss": 0.0048, + "step": 27402 + }, + { + "epoch": 0.54808, + "grad_norm": 0.6774048209190369, + "learning_rate": 1.006702014154256e-05, + "loss": 0.0151, + "step": 27404 + }, + { + "epoch": 0.54812, + "grad_norm": 8.22294807434082, + "learning_rate": 1.0065623908850537e-05, + "loss": 0.1271, + "step": 27406 + }, + { + "epoch": 0.54816, + "grad_norm": 0.3035842478275299, + "learning_rate": 1.0064227674879142e-05, + "loss": 0.0116, + "step": 27408 + }, + { + "epoch": 0.5482, + "grad_norm": 6.9651336669921875, + "learning_rate": 1.0062831439655591e-05, + "loss": 0.0841, + "step": 27410 + }, + { + "epoch": 0.54824, + "grad_norm": 0.2887144088745117, + "learning_rate": 1.0061435203207111e-05, + "loss": 0.0116, + "step": 27412 + }, + { + "epoch": 0.54828, + "grad_norm": 3.4512972831726074, + "learning_rate": 1.0060038965560918e-05, + "loss": 0.3784, + "step": 27414 + }, + { + "epoch": 0.54832, + "grad_norm": 3.1914448738098145, + "learning_rate": 1.0058642726744239e-05, + "loss": 0.0444, + "step": 27416 + }, + { + "epoch": 0.54836, + "grad_norm": 2.3946011066436768, + "learning_rate": 1.0057246486784287e-05, + "loss": 0.0301, + "step": 27418 + }, + { + "epoch": 0.5484, + "grad_norm": 0.2971828877925873, + "learning_rate": 1.0055850245708283e-05, + "loss": 0.003, + "step": 27420 + }, + { + "epoch": 0.54844, + "grad_norm": 6.3284759521484375, + "learning_rate": 1.0054454003543455e-05, + "loss": 0.1168, + "step": 27422 + }, + { + "epoch": 0.54848, + "grad_norm": 0.32511961460113525, + "learning_rate": 1.0053057760317015e-05, + "loss": 0.0063, + "step": 27424 + }, + { + "epoch": 0.54852, + "grad_norm": 0.19690358638763428, + "learning_rate": 1.0051661516056186e-05, + "loss": 0.0029, + "step": 27426 + }, + { + "epoch": 0.54856, + "grad_norm": 0.23112039268016815, + "learning_rate": 1.005026527078819e-05, + "loss": 0.0051, + "step": 27428 + }, + { + "epoch": 0.5486, + "grad_norm": 0.8665205240249634, + "learning_rate": 1.0048869024540247e-05, + "loss": 0.1078, + "step": 27430 + }, + { + "epoch": 0.54864, + "grad_norm": 0.011771835386753082, + "learning_rate": 1.0047472777339578e-05, + "loss": 0.0029, + "step": 27432 + }, + { + "epoch": 0.54868, + "grad_norm": 0.0981072410941124, + "learning_rate": 1.00460765292134e-05, + "loss": 0.0022, + "step": 27434 + }, + { + "epoch": 0.54872, + "grad_norm": 0.08027248829603195, + "learning_rate": 1.0044680280188939e-05, + "loss": 0.0346, + "step": 27436 + }, + { + "epoch": 0.54876, + "grad_norm": 11.357316970825195, + "learning_rate": 1.004328403029341e-05, + "loss": 0.2816, + "step": 27438 + }, + { + "epoch": 0.5488, + "grad_norm": 0.002985490020364523, + "learning_rate": 1.0041887779554041e-05, + "loss": 0.0118, + "step": 27440 + }, + { + "epoch": 0.54884, + "grad_norm": 0.06928359717130661, + "learning_rate": 1.0040491527998047e-05, + "loss": 0.001, + "step": 27442 + }, + { + "epoch": 0.54888, + "grad_norm": 0.33910104632377625, + "learning_rate": 1.0039095275652646e-05, + "loss": 0.0084, + "step": 27444 + }, + { + "epoch": 0.54892, + "grad_norm": 0.3221791684627533, + "learning_rate": 1.0037699022545064e-05, + "loss": 0.0056, + "step": 27446 + }, + { + "epoch": 0.54896, + "grad_norm": 0.037854522466659546, + "learning_rate": 1.0036302768702523e-05, + "loss": 0.0271, + "step": 27448 + }, + { + "epoch": 0.549, + "grad_norm": 0.48376527428627014, + "learning_rate": 1.0034906514152239e-05, + "loss": 0.0145, + "step": 27450 + }, + { + "epoch": 0.54904, + "grad_norm": 0.04000244662165642, + "learning_rate": 1.0033510258921433e-05, + "loss": 0.001, + "step": 27452 + }, + { + "epoch": 0.54908, + "grad_norm": 0.6253162622451782, + "learning_rate": 1.0032114003037328e-05, + "loss": 0.1802, + "step": 27454 + }, + { + "epoch": 0.54912, + "grad_norm": 0.08619693666696548, + "learning_rate": 1.0030717746527147e-05, + "loss": 0.0059, + "step": 27456 + }, + { + "epoch": 0.54916, + "grad_norm": 0.0381433442234993, + "learning_rate": 1.0029321489418107e-05, + "loss": 0.0009, + "step": 27458 + }, + { + "epoch": 0.5492, + "grad_norm": 0.19115276634693146, + "learning_rate": 1.0027925231737428e-05, + "loss": 0.0026, + "step": 27460 + }, + { + "epoch": 0.54924, + "grad_norm": 0.006166035309433937, + "learning_rate": 1.002652897351233e-05, + "loss": 0.0168, + "step": 27462 + }, + { + "epoch": 0.54928, + "grad_norm": 0.008441977202892303, + "learning_rate": 1.0025132714770041e-05, + "loss": 0.0073, + "step": 27464 + }, + { + "epoch": 0.54932, + "grad_norm": 0.036100003868341446, + "learning_rate": 1.0023736455537772e-05, + "loss": 0.0015, + "step": 27466 + }, + { + "epoch": 0.54936, + "grad_norm": 0.049622103571891785, + "learning_rate": 1.002234019584275e-05, + "loss": 0.0008, + "step": 27468 + }, + { + "epoch": 0.5494, + "grad_norm": 3.2394235134124756, + "learning_rate": 1.0020943935712193e-05, + "loss": 0.039, + "step": 27470 + }, + { + "epoch": 0.54944, + "grad_norm": 0.03403151407837868, + "learning_rate": 1.0019547675173326e-05, + "loss": 0.0006, + "step": 27472 + }, + { + "epoch": 0.54948, + "grad_norm": 0.44242462515830994, + "learning_rate": 1.0018151414253368e-05, + "loss": 0.0047, + "step": 27474 + }, + { + "epoch": 0.54952, + "grad_norm": 0.043694883584976196, + "learning_rate": 1.0016755152979538e-05, + "loss": 0.0015, + "step": 27476 + }, + { + "epoch": 0.54956, + "grad_norm": 0.14547115564346313, + "learning_rate": 1.0015358891379055e-05, + "loss": 0.0017, + "step": 27478 + }, + { + "epoch": 0.5496, + "grad_norm": 0.03444121032953262, + "learning_rate": 1.0013962629479145e-05, + "loss": 0.0034, + "step": 27480 + }, + { + "epoch": 0.54964, + "grad_norm": 0.026572335511446, + "learning_rate": 1.0012566367307027e-05, + "loss": 0.0008, + "step": 27482 + }, + { + "epoch": 0.54968, + "grad_norm": 0.09029684960842133, + "learning_rate": 1.0011170104889917e-05, + "loss": 0.0017, + "step": 27484 + }, + { + "epoch": 0.54972, + "grad_norm": 3.5786988735198975, + "learning_rate": 1.0009773842255043e-05, + "loss": 0.0471, + "step": 27486 + }, + { + "epoch": 0.54976, + "grad_norm": 0.03407740592956543, + "learning_rate": 1.0008377579429623e-05, + "loss": 0.0044, + "step": 27488 + }, + { + "epoch": 0.5498, + "grad_norm": 9.894259452819824, + "learning_rate": 1.0006981316440876e-05, + "loss": 0.2046, + "step": 27490 + }, + { + "epoch": 0.54984, + "grad_norm": 2.1463491916656494, + "learning_rate": 1.000558505331603e-05, + "loss": 0.0224, + "step": 27492 + }, + { + "epoch": 0.54988, + "grad_norm": 0.0028436570428311825, + "learning_rate": 1.0004188790082294e-05, + "loss": 0.0003, + "step": 27494 + }, + { + "epoch": 0.54992, + "grad_norm": 0.0811770036816597, + "learning_rate": 1.00027925267669e-05, + "loss": 0.001, + "step": 27496 + }, + { + "epoch": 0.54996, + "grad_norm": 2.009540319442749, + "learning_rate": 1.0001396263397061e-05, + "loss": 0.0297, + "step": 27498 + }, + { + "epoch": 0.55, + "grad_norm": 0.07667175680398941, + "learning_rate": 1e-05, + "loss": 0.0345, + "step": 27500 + }, + { + "epoch": 0.55004, + "grad_norm": 0.17055034637451172, + "learning_rate": 9.998603736602944e-06, + "loss": 0.0069, + "step": 27502 + }, + { + "epoch": 0.55008, + "grad_norm": 13.325889587402344, + "learning_rate": 9.997207473233104e-06, + "loss": 0.7693, + "step": 27504 + }, + { + "epoch": 0.55012, + "grad_norm": 0.00202170480042696, + "learning_rate": 9.995811209917709e-06, + "loss": 0.0149, + "step": 27506 + }, + { + "epoch": 0.55016, + "grad_norm": 10.201229095458984, + "learning_rate": 9.994414946683975e-06, + "loss": 0.3188, + "step": 27508 + }, + { + "epoch": 0.5502, + "grad_norm": 0.0359174944460392, + "learning_rate": 9.993018683559126e-06, + "loss": 0.0061, + "step": 27510 + }, + { + "epoch": 0.55024, + "grad_norm": 7.918325424194336, + "learning_rate": 9.99162242057038e-06, + "loss": 0.1742, + "step": 27512 + }, + { + "epoch": 0.55028, + "grad_norm": 0.2900846302509308, + "learning_rate": 9.990226157744959e-06, + "loss": 0.0057, + "step": 27514 + }, + { + "epoch": 0.55032, + "grad_norm": 0.9697043299674988, + "learning_rate": 9.988829895110086e-06, + "loss": 0.0105, + "step": 27516 + }, + { + "epoch": 0.55036, + "grad_norm": 0.039890188723802567, + "learning_rate": 9.98743363269298e-06, + "loss": 0.0021, + "step": 27518 + }, + { + "epoch": 0.5504, + "grad_norm": 0.25116580724716187, + "learning_rate": 9.986037370520856e-06, + "loss": 0.0278, + "step": 27520 + }, + { + "epoch": 0.55044, + "grad_norm": 9.85426139831543, + "learning_rate": 9.984641108620949e-06, + "loss": 0.1508, + "step": 27522 + }, + { + "epoch": 0.55048, + "grad_norm": 2.7490766048431396, + "learning_rate": 9.983244847020465e-06, + "loss": 0.0427, + "step": 27524 + }, + { + "epoch": 0.55052, + "grad_norm": 0.3366789221763611, + "learning_rate": 9.981848585746637e-06, + "loss": 0.0049, + "step": 27526 + }, + { + "epoch": 0.55056, + "grad_norm": 0.1536809206008911, + "learning_rate": 9.980452324826675e-06, + "loss": 0.0076, + "step": 27528 + }, + { + "epoch": 0.5506, + "grad_norm": 0.06535954028367996, + "learning_rate": 9.979056064287807e-06, + "loss": 0.0022, + "step": 27530 + }, + { + "epoch": 0.55064, + "grad_norm": 0.03130906820297241, + "learning_rate": 9.977659804157253e-06, + "loss": 0.0966, + "step": 27532 + }, + { + "epoch": 0.55068, + "grad_norm": 0.13720397651195526, + "learning_rate": 9.976263544462233e-06, + "loss": 0.0018, + "step": 27534 + }, + { + "epoch": 0.55072, + "grad_norm": 0.0013926130486652255, + "learning_rate": 9.974867285229962e-06, + "loss": 0.0007, + "step": 27536 + }, + { + "epoch": 0.55076, + "grad_norm": 0.28439193964004517, + "learning_rate": 9.973471026487672e-06, + "loss": 0.0074, + "step": 27538 + }, + { + "epoch": 0.5508, + "grad_norm": 0.12803678214550018, + "learning_rate": 9.972074768262576e-06, + "loss": 0.0168, + "step": 27540 + }, + { + "epoch": 0.55084, + "grad_norm": 0.08513852208852768, + "learning_rate": 9.970678510581897e-06, + "loss": 0.0198, + "step": 27542 + }, + { + "epoch": 0.55088, + "grad_norm": 0.5518543124198914, + "learning_rate": 9.969282253472856e-06, + "loss": 0.0081, + "step": 27544 + }, + { + "epoch": 0.55092, + "grad_norm": 0.17410436272621155, + "learning_rate": 9.967885996962672e-06, + "loss": 0.1081, + "step": 27546 + }, + { + "epoch": 0.55096, + "grad_norm": 0.1718490868806839, + "learning_rate": 9.966489741078568e-06, + "loss": 0.0047, + "step": 27548 + }, + { + "epoch": 0.551, + "grad_norm": 0.3818737864494324, + "learning_rate": 9.965093485847766e-06, + "loss": 0.0053, + "step": 27550 + }, + { + "epoch": 0.55104, + "grad_norm": 0.18781933188438416, + "learning_rate": 9.963697231297479e-06, + "loss": 0.0267, + "step": 27552 + }, + { + "epoch": 0.55108, + "grad_norm": 14.613036155700684, + "learning_rate": 9.962300977454938e-06, + "loss": 1.2202, + "step": 27554 + }, + { + "epoch": 0.55112, + "grad_norm": 9.55982780456543, + "learning_rate": 9.960904724347353e-06, + "loss": 0.1061, + "step": 27556 + }, + { + "epoch": 0.55116, + "grad_norm": 4.139070510864258, + "learning_rate": 9.959508472001956e-06, + "loss": 0.0908, + "step": 27558 + }, + { + "epoch": 0.5512, + "grad_norm": 0.11027640104293823, + "learning_rate": 9.958112220445964e-06, + "loss": 0.0038, + "step": 27560 + }, + { + "epoch": 0.55124, + "grad_norm": 0.38571906089782715, + "learning_rate": 9.956715969706591e-06, + "loss": 0.0083, + "step": 27562 + }, + { + "epoch": 0.55128, + "grad_norm": 3.8429503440856934, + "learning_rate": 9.955319719811065e-06, + "loss": 0.0916, + "step": 27564 + }, + { + "epoch": 0.55132, + "grad_norm": 0.9504136443138123, + "learning_rate": 9.953923470786605e-06, + "loss": 0.5598, + "step": 27566 + }, + { + "epoch": 0.55136, + "grad_norm": 0.1857927143573761, + "learning_rate": 9.952527222660426e-06, + "loss": 0.0031, + "step": 27568 + }, + { + "epoch": 0.5514, + "grad_norm": 1.2926034927368164, + "learning_rate": 9.951130975459758e-06, + "loss": 0.0192, + "step": 27570 + }, + { + "epoch": 0.55144, + "grad_norm": 0.21432380378246307, + "learning_rate": 9.949734729211811e-06, + "loss": 0.0295, + "step": 27572 + }, + { + "epoch": 0.55148, + "grad_norm": 6.829582214355469, + "learning_rate": 9.948338483943816e-06, + "loss": 0.34, + "step": 27574 + }, + { + "epoch": 0.55152, + "grad_norm": 0.5625303983688354, + "learning_rate": 9.94694223968299e-06, + "loss": 0.0235, + "step": 27576 + }, + { + "epoch": 0.55156, + "grad_norm": 0.2766787111759186, + "learning_rate": 9.945545996456549e-06, + "loss": 0.0156, + "step": 27578 + }, + { + "epoch": 0.5516, + "grad_norm": 0.317290335893631, + "learning_rate": 9.944149754291719e-06, + "loss": 0.005, + "step": 27580 + }, + { + "epoch": 0.55164, + "grad_norm": 0.45682206749916077, + "learning_rate": 9.94275351321572e-06, + "loss": 0.028, + "step": 27582 + }, + { + "epoch": 0.55168, + "grad_norm": 0.4453709125518799, + "learning_rate": 9.941357273255765e-06, + "loss": 0.0711, + "step": 27584 + }, + { + "epoch": 0.55172, + "grad_norm": 0.0726189985871315, + "learning_rate": 9.939961034439084e-06, + "loss": 0.0034, + "step": 27586 + }, + { + "epoch": 0.55176, + "grad_norm": 0.09943961352109909, + "learning_rate": 9.93856479679289e-06, + "loss": 0.0026, + "step": 27588 + }, + { + "epoch": 0.5518, + "grad_norm": 0.21390722692012787, + "learning_rate": 9.937168560344412e-06, + "loss": 0.0123, + "step": 27590 + }, + { + "epoch": 0.55184, + "grad_norm": 0.04006095975637436, + "learning_rate": 9.935772325120863e-06, + "loss": 0.0137, + "step": 27592 + }, + { + "epoch": 0.55188, + "grad_norm": 5.600834369659424, + "learning_rate": 9.934376091149466e-06, + "loss": 0.2569, + "step": 27594 + }, + { + "epoch": 0.55192, + "grad_norm": 0.26382920145988464, + "learning_rate": 9.932979858457442e-06, + "loss": 0.0078, + "step": 27596 + }, + { + "epoch": 0.55196, + "grad_norm": 0.24437756836414337, + "learning_rate": 9.931583627072008e-06, + "loss": 0.0081, + "step": 27598 + }, + { + "epoch": 0.552, + "grad_norm": 0.23513302206993103, + "learning_rate": 9.930187397020385e-06, + "loss": 0.0104, + "step": 27600 + }, + { + "epoch": 0.55204, + "grad_norm": 0.7561574578285217, + "learning_rate": 9.928791168329798e-06, + "loss": 0.0174, + "step": 27602 + }, + { + "epoch": 0.55208, + "grad_norm": 8.996896743774414, + "learning_rate": 9.927394941027461e-06, + "loss": 0.1522, + "step": 27604 + }, + { + "epoch": 0.55212, + "grad_norm": 0.059313345700502396, + "learning_rate": 9.925998715140599e-06, + "loss": 0.0114, + "step": 27606 + }, + { + "epoch": 0.55216, + "grad_norm": 1.416508674621582, + "learning_rate": 9.92460249069643e-06, + "loss": 0.0237, + "step": 27608 + }, + { + "epoch": 0.5522, + "grad_norm": 0.0651056244969368, + "learning_rate": 9.923206267722173e-06, + "loss": 0.0476, + "step": 27610 + }, + { + "epoch": 0.55224, + "grad_norm": 0.06001109629869461, + "learning_rate": 9.921810046245052e-06, + "loss": 0.0007, + "step": 27612 + }, + { + "epoch": 0.55228, + "grad_norm": 1.0513113737106323, + "learning_rate": 9.920413826292281e-06, + "loss": 0.0486, + "step": 27614 + }, + { + "epoch": 0.55232, + "grad_norm": 0.2517978847026825, + "learning_rate": 9.919017607891087e-06, + "loss": 0.0046, + "step": 27616 + }, + { + "epoch": 0.55236, + "grad_norm": 0.25240519642829895, + "learning_rate": 9.917621391068687e-06, + "loss": 0.0065, + "step": 27618 + }, + { + "epoch": 0.5524, + "grad_norm": 0.05283946543931961, + "learning_rate": 9.916225175852295e-06, + "loss": 0.0054, + "step": 27620 + }, + { + "epoch": 0.55244, + "grad_norm": 0.29530519247055054, + "learning_rate": 9.914828962269139e-06, + "loss": 0.2549, + "step": 27622 + }, + { + "epoch": 0.55248, + "grad_norm": 0.27795419096946716, + "learning_rate": 9.913432750346438e-06, + "loss": 0.0058, + "step": 27624 + }, + { + "epoch": 0.55252, + "grad_norm": 0.09765683859586716, + "learning_rate": 9.912036540111405e-06, + "loss": 0.0031, + "step": 27626 + }, + { + "epoch": 0.55256, + "grad_norm": 0.00946139171719551, + "learning_rate": 9.910640331591272e-06, + "loss": 0.0134, + "step": 27628 + }, + { + "epoch": 0.5526, + "grad_norm": 0.8164198398590088, + "learning_rate": 9.909244124813246e-06, + "loss": 0.019, + "step": 27630 + }, + { + "epoch": 0.55264, + "grad_norm": 0.7800242304801941, + "learning_rate": 9.907847919804557e-06, + "loss": 0.3302, + "step": 27632 + }, + { + "epoch": 0.55268, + "grad_norm": 0.08243805170059204, + "learning_rate": 9.90645171659242e-06, + "loss": 0.3378, + "step": 27634 + }, + { + "epoch": 0.55272, + "grad_norm": 0.21612322330474854, + "learning_rate": 9.90505551520405e-06, + "loss": 0.011, + "step": 27636 + }, + { + "epoch": 0.55276, + "grad_norm": 1.0669845342636108, + "learning_rate": 9.903659315666675e-06, + "loss": 0.0206, + "step": 27638 + }, + { + "epoch": 0.5528, + "grad_norm": 0.5960061550140381, + "learning_rate": 9.902263118007513e-06, + "loss": 0.0103, + "step": 27640 + }, + { + "epoch": 0.55284, + "grad_norm": 0.2351382076740265, + "learning_rate": 9.900866922253777e-06, + "loss": 0.0273, + "step": 27642 + }, + { + "epoch": 0.55288, + "grad_norm": 0.10907915979623795, + "learning_rate": 9.899470728432695e-06, + "loss": 0.0019, + "step": 27644 + }, + { + "epoch": 0.55292, + "grad_norm": 2.508169412612915, + "learning_rate": 9.898074536571482e-06, + "loss": 0.0527, + "step": 27646 + }, + { + "epoch": 0.55296, + "grad_norm": 0.6012316942214966, + "learning_rate": 9.89667834669736e-06, + "loss": 0.0094, + "step": 27648 + }, + { + "epoch": 0.553, + "grad_norm": 0.3078933358192444, + "learning_rate": 9.895282158837545e-06, + "loss": 0.0269, + "step": 27650 + }, + { + "epoch": 0.55304, + "grad_norm": 0.1605868637561798, + "learning_rate": 9.893885973019257e-06, + "loss": 0.0042, + "step": 27652 + }, + { + "epoch": 0.55308, + "grad_norm": 3.963712215423584, + "learning_rate": 9.892489789269719e-06, + "loss": 0.2926, + "step": 27654 + }, + { + "epoch": 0.55312, + "grad_norm": 0.07757361978292465, + "learning_rate": 9.891093607616147e-06, + "loss": 0.0075, + "step": 27656 + }, + { + "epoch": 0.55316, + "grad_norm": 0.8136953115463257, + "learning_rate": 9.889697428085759e-06, + "loss": 0.1005, + "step": 27658 + }, + { + "epoch": 0.5532, + "grad_norm": 3.1040847301483154, + "learning_rate": 9.88830125070578e-06, + "loss": 0.0434, + "step": 27660 + }, + { + "epoch": 0.55324, + "grad_norm": 0.032721735537052155, + "learning_rate": 9.886905075503423e-06, + "loss": 0.0016, + "step": 27662 + }, + { + "epoch": 0.55328, + "grad_norm": 0.433323472738266, + "learning_rate": 9.885508902505913e-06, + "loss": 0.0047, + "step": 27664 + }, + { + "epoch": 0.55332, + "grad_norm": 0.09007762372493744, + "learning_rate": 9.884112731740462e-06, + "loss": 0.0015, + "step": 27666 + }, + { + "epoch": 0.55336, + "grad_norm": 0.006099425721913576, + "learning_rate": 9.882716563234296e-06, + "loss": 0.0003, + "step": 27668 + }, + { + "epoch": 0.5534, + "grad_norm": 0.1495029330253601, + "learning_rate": 9.88132039701463e-06, + "loss": 0.2856, + "step": 27670 + }, + { + "epoch": 0.55344, + "grad_norm": 0.13580183684825897, + "learning_rate": 9.879924233108686e-06, + "loss": 0.004, + "step": 27672 + }, + { + "epoch": 0.55348, + "grad_norm": 7.014049530029297, + "learning_rate": 9.878528071543674e-06, + "loss": 0.1276, + "step": 27674 + }, + { + "epoch": 0.55352, + "grad_norm": 0.1677582710981369, + "learning_rate": 9.877131912346827e-06, + "loss": 0.0027, + "step": 27676 + }, + { + "epoch": 0.55356, + "grad_norm": 0.3914656639099121, + "learning_rate": 9.87573575554535e-06, + "loss": 0.0063, + "step": 27678 + }, + { + "epoch": 0.5536, + "grad_norm": 0.049463022500276566, + "learning_rate": 9.874339601166474e-06, + "loss": 0.0023, + "step": 27680 + }, + { + "epoch": 0.55364, + "grad_norm": 0.0753476619720459, + "learning_rate": 9.872943449237413e-06, + "loss": 0.0024, + "step": 27682 + }, + { + "epoch": 0.55368, + "grad_norm": 0.003053525695577264, + "learning_rate": 9.87154729978538e-06, + "loss": 0.1265, + "step": 27684 + }, + { + "epoch": 0.55372, + "grad_norm": 0.012832383625209332, + "learning_rate": 9.870151152837604e-06, + "loss": 0.0046, + "step": 27686 + }, + { + "epoch": 0.55376, + "grad_norm": 0.024933794513344765, + "learning_rate": 9.868755008421296e-06, + "loss": 0.038, + "step": 27688 + }, + { + "epoch": 0.5538, + "grad_norm": 12.498543739318848, + "learning_rate": 9.867358866563674e-06, + "loss": 0.3823, + "step": 27690 + }, + { + "epoch": 0.55384, + "grad_norm": 0.9905427694320679, + "learning_rate": 9.865962727291961e-06, + "loss": 0.0157, + "step": 27692 + }, + { + "epoch": 0.55388, + "grad_norm": 0.7272503972053528, + "learning_rate": 9.864566590633371e-06, + "loss": 0.1078, + "step": 27694 + }, + { + "epoch": 0.55392, + "grad_norm": 1.465700626373291, + "learning_rate": 9.863170456615128e-06, + "loss": 0.0192, + "step": 27696 + }, + { + "epoch": 0.55396, + "grad_norm": 0.07881804555654526, + "learning_rate": 9.86177432526445e-06, + "loss": 0.0657, + "step": 27698 + }, + { + "epoch": 0.554, + "grad_norm": 8.025162696838379, + "learning_rate": 9.860378196608549e-06, + "loss": 0.1992, + "step": 27700 + }, + { + "epoch": 0.55404, + "grad_norm": 2.721102237701416, + "learning_rate": 9.85898207067465e-06, + "loss": 0.0713, + "step": 27702 + }, + { + "epoch": 0.55408, + "grad_norm": 0.22787438333034515, + "learning_rate": 9.857585947489966e-06, + "loss": 0.0028, + "step": 27704 + }, + { + "epoch": 0.55412, + "grad_norm": 2.82332444190979, + "learning_rate": 9.856189827081713e-06, + "loss": 0.0678, + "step": 27706 + }, + { + "epoch": 0.55416, + "grad_norm": 0.008079352788627148, + "learning_rate": 9.85479370947712e-06, + "loss": 0.0009, + "step": 27708 + }, + { + "epoch": 0.5542, + "grad_norm": 0.27983203530311584, + "learning_rate": 9.853397594703394e-06, + "loss": 0.0084, + "step": 27710 + }, + { + "epoch": 0.55424, + "grad_norm": 2.8296101093292236, + "learning_rate": 9.85200148278776e-06, + "loss": 0.1128, + "step": 27712 + }, + { + "epoch": 0.55428, + "grad_norm": 0.11699584126472473, + "learning_rate": 9.850605373757434e-06, + "loss": 0.1517, + "step": 27714 + }, + { + "epoch": 0.55432, + "grad_norm": 0.36213919520378113, + "learning_rate": 9.84920926763963e-06, + "loss": 0.0044, + "step": 27716 + }, + { + "epoch": 0.55436, + "grad_norm": 0.09329619258642197, + "learning_rate": 9.847813164461572e-06, + "loss": 0.0888, + "step": 27718 + }, + { + "epoch": 0.5544, + "grad_norm": 0.7968976497650146, + "learning_rate": 9.84641706425047e-06, + "loss": 0.0895, + "step": 27720 + }, + { + "epoch": 0.55444, + "grad_norm": 0.9311356544494629, + "learning_rate": 9.845020967033551e-06, + "loss": 0.0105, + "step": 27722 + }, + { + "epoch": 0.55448, + "grad_norm": 0.5992615818977356, + "learning_rate": 9.843624872838029e-06, + "loss": 0.2409, + "step": 27724 + }, + { + "epoch": 0.55452, + "grad_norm": 0.6014813780784607, + "learning_rate": 9.842228781691115e-06, + "loss": 0.0075, + "step": 27726 + }, + { + "epoch": 0.55456, + "grad_norm": 0.021701080724596977, + "learning_rate": 9.840832693620037e-06, + "loss": 0.0037, + "step": 27728 + }, + { + "epoch": 0.5546, + "grad_norm": 0.013848645612597466, + "learning_rate": 9.839436608652007e-06, + "loss": 0.0004, + "step": 27730 + }, + { + "epoch": 0.55464, + "grad_norm": 0.0006705937557853758, + "learning_rate": 9.838040526814241e-06, + "loss": 0.0118, + "step": 27732 + }, + { + "epoch": 0.55468, + "grad_norm": 0.04235807806253433, + "learning_rate": 9.83664444813396e-06, + "loss": 0.0056, + "step": 27734 + }, + { + "epoch": 0.55472, + "grad_norm": 2.2906301021575928, + "learning_rate": 9.835248372638379e-06, + "loss": 0.1894, + "step": 27736 + }, + { + "epoch": 0.55476, + "grad_norm": 1.6637910604476929, + "learning_rate": 9.833852300354716e-06, + "loss": 0.0286, + "step": 27738 + }, + { + "epoch": 0.5548, + "grad_norm": 4.706565856933594, + "learning_rate": 9.832456231310189e-06, + "loss": 0.0881, + "step": 27740 + }, + { + "epoch": 0.55484, + "grad_norm": 0.04350293427705765, + "learning_rate": 9.83106016553201e-06, + "loss": 0.0062, + "step": 27742 + }, + { + "epoch": 0.55488, + "grad_norm": 0.09358122199773788, + "learning_rate": 9.829664103047404e-06, + "loss": 0.0012, + "step": 27744 + }, + { + "epoch": 0.55492, + "grad_norm": 0.5511211156845093, + "learning_rate": 9.828268043883584e-06, + "loss": 0.1566, + "step": 27746 + }, + { + "epoch": 0.55496, + "grad_norm": 0.6073222756385803, + "learning_rate": 9.826871988067763e-06, + "loss": 0.0216, + "step": 27748 + }, + { + "epoch": 0.555, + "grad_norm": 0.43576663732528687, + "learning_rate": 9.825475935627165e-06, + "loss": 0.0107, + "step": 27750 + }, + { + "epoch": 0.55504, + "grad_norm": 0.09857188910245895, + "learning_rate": 9.824079886589004e-06, + "loss": 0.0254, + "step": 27752 + }, + { + "epoch": 0.55508, + "grad_norm": 0.0038349914830178022, + "learning_rate": 9.822683840980496e-06, + "loss": 0.015, + "step": 27754 + }, + { + "epoch": 0.55512, + "grad_norm": 0.1761569231748581, + "learning_rate": 9.821287798828858e-06, + "loss": 0.0024, + "step": 27756 + }, + { + "epoch": 0.55516, + "grad_norm": 0.6028778553009033, + "learning_rate": 9.819891760161302e-06, + "loss": 0.0074, + "step": 27758 + }, + { + "epoch": 0.5552, + "grad_norm": 0.3083222806453705, + "learning_rate": 9.818495725005053e-06, + "loss": 0.0112, + "step": 27760 + }, + { + "epoch": 0.55524, + "grad_norm": 0.5766793489456177, + "learning_rate": 9.817099693387324e-06, + "loss": 0.01, + "step": 27762 + }, + { + "epoch": 0.55528, + "grad_norm": 0.40994375944137573, + "learning_rate": 9.815703665335327e-06, + "loss": 0.0069, + "step": 27764 + }, + { + "epoch": 0.55532, + "grad_norm": 0.012059430591762066, + "learning_rate": 9.814307640876284e-06, + "loss": 0.0003, + "step": 27766 + }, + { + "epoch": 0.55536, + "grad_norm": 0.012128137983381748, + "learning_rate": 9.81291162003741e-06, + "loss": 0.0001, + "step": 27768 + }, + { + "epoch": 0.5554, + "grad_norm": 0.0004434961883816868, + "learning_rate": 9.81151560284592e-06, + "loss": 0.0213, + "step": 27770 + }, + { + "epoch": 0.55544, + "grad_norm": 0.0006106240325607359, + "learning_rate": 9.810119589329031e-06, + "loss": 0.0192, + "step": 27772 + }, + { + "epoch": 0.55548, + "grad_norm": 0.9939858913421631, + "learning_rate": 9.808723579513955e-06, + "loss": 0.0171, + "step": 27774 + }, + { + "epoch": 0.55552, + "grad_norm": 1.1986949443817139, + "learning_rate": 9.807327573427914e-06, + "loss": 0.0118, + "step": 27776 + }, + { + "epoch": 0.55556, + "grad_norm": 5.44025993347168, + "learning_rate": 9.805931571098123e-06, + "loss": 0.1174, + "step": 27778 + }, + { + "epoch": 0.5556, + "grad_norm": 0.008697547018527985, + "learning_rate": 9.80453557255179e-06, + "loss": 0.0215, + "step": 27780 + }, + { + "epoch": 0.55564, + "grad_norm": 0.3671952188014984, + "learning_rate": 9.803139577816142e-06, + "loss": 0.0045, + "step": 27782 + }, + { + "epoch": 0.55568, + "grad_norm": 1.0792585611343384, + "learning_rate": 9.801743586918386e-06, + "loss": 0.0173, + "step": 27784 + }, + { + "epoch": 0.55572, + "grad_norm": 0.0046522426418960094, + "learning_rate": 9.800347599885745e-06, + "loss": 0.0065, + "step": 27786 + }, + { + "epoch": 0.55576, + "grad_norm": 0.1355116069316864, + "learning_rate": 9.798951616745427e-06, + "loss": 0.0047, + "step": 27788 + }, + { + "epoch": 0.5558, + "grad_norm": 0.08097584545612335, + "learning_rate": 9.79755563752465e-06, + "loss": 0.0019, + "step": 27790 + }, + { + "epoch": 0.55584, + "grad_norm": 0.9992024302482605, + "learning_rate": 9.796159662250632e-06, + "loss": 0.0132, + "step": 27792 + }, + { + "epoch": 0.55588, + "grad_norm": 0.6516185402870178, + "learning_rate": 9.794763690950588e-06, + "loss": 0.779, + "step": 27794 + }, + { + "epoch": 0.55592, + "grad_norm": 0.04964638873934746, + "learning_rate": 9.793367723651726e-06, + "loss": 0.0007, + "step": 27796 + }, + { + "epoch": 0.55596, + "grad_norm": 0.0006640268256887794, + "learning_rate": 9.791971760381271e-06, + "loss": 0.0024, + "step": 27798 + }, + { + "epoch": 0.556, + "grad_norm": 3.5991814136505127, + "learning_rate": 9.790575801166432e-06, + "loss": 0.0482, + "step": 27800 + }, + { + "epoch": 0.55604, + "grad_norm": 0.006409541238099337, + "learning_rate": 9.789179846034424e-06, + "loss": 0.0004, + "step": 27802 + }, + { + "epoch": 0.55608, + "grad_norm": 1.3521472215652466, + "learning_rate": 9.787783895012468e-06, + "loss": 0.2029, + "step": 27804 + }, + { + "epoch": 0.55612, + "grad_norm": 7.778560638427734, + "learning_rate": 9.78638794812777e-06, + "loss": 0.0883, + "step": 27806 + }, + { + "epoch": 0.55616, + "grad_norm": 8.262565612792969, + "learning_rate": 9.784992005407553e-06, + "loss": 0.0493, + "step": 27808 + }, + { + "epoch": 0.5562, + "grad_norm": 16.589704513549805, + "learning_rate": 9.783596066879023e-06, + "loss": 2.0187, + "step": 27810 + }, + { + "epoch": 0.55624, + "grad_norm": 0.17991311848163605, + "learning_rate": 9.782200132569402e-06, + "loss": 1.1029, + "step": 27812 + }, + { + "epoch": 0.55628, + "grad_norm": 4.495822429656982, + "learning_rate": 9.780804202505902e-06, + "loss": 0.0512, + "step": 27814 + }, + { + "epoch": 0.55632, + "grad_norm": 12.092491149902344, + "learning_rate": 9.779408276715733e-06, + "loss": 0.1772, + "step": 27816 + }, + { + "epoch": 0.55636, + "grad_norm": 4.1191301345825195, + "learning_rate": 9.778012355226117e-06, + "loss": 0.0421, + "step": 27818 + }, + { + "epoch": 0.5564, + "grad_norm": 0.8931537866592407, + "learning_rate": 9.776616438064265e-06, + "loss": 0.0238, + "step": 27820 + }, + { + "epoch": 0.55644, + "grad_norm": 4.560133457183838, + "learning_rate": 9.775220525257388e-06, + "loss": 0.4499, + "step": 27822 + }, + { + "epoch": 0.55648, + "grad_norm": 0.6696871519088745, + "learning_rate": 9.773824616832706e-06, + "loss": 0.1988, + "step": 27824 + }, + { + "epoch": 0.55652, + "grad_norm": 0.06957507878541946, + "learning_rate": 9.772428712817424e-06, + "loss": 0.0018, + "step": 27826 + }, + { + "epoch": 0.55656, + "grad_norm": 0.013362167403101921, + "learning_rate": 9.771032813238766e-06, + "loss": 0.0025, + "step": 27828 + }, + { + "epoch": 0.5566, + "grad_norm": 0.5778622627258301, + "learning_rate": 9.76963691812394e-06, + "loss": 0.0105, + "step": 27830 + }, + { + "epoch": 0.55664, + "grad_norm": 0.06285127997398376, + "learning_rate": 9.768241027500157e-06, + "loss": 0.0014, + "step": 27832 + }, + { + "epoch": 0.55668, + "grad_norm": 8.215449333190918, + "learning_rate": 9.76684514139464e-06, + "loss": 0.1896, + "step": 27834 + }, + { + "epoch": 0.55672, + "grad_norm": 0.08578488975763321, + "learning_rate": 9.765449259834596e-06, + "loss": 0.0054, + "step": 27836 + }, + { + "epoch": 0.55676, + "grad_norm": 0.025802334770560265, + "learning_rate": 9.764053382847239e-06, + "loss": 0.0097, + "step": 27838 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3122371733188629, + "learning_rate": 9.762657510459784e-06, + "loss": 0.0198, + "step": 27840 + }, + { + "epoch": 0.55684, + "grad_norm": 0.13446548581123352, + "learning_rate": 9.761261642699436e-06, + "loss": 0.0017, + "step": 27842 + }, + { + "epoch": 0.55688, + "grad_norm": 0.9780333042144775, + "learning_rate": 9.759865779593423e-06, + "loss": 0.0266, + "step": 27844 + }, + { + "epoch": 0.55692, + "grad_norm": 1.625132441520691, + "learning_rate": 9.758469921168949e-06, + "loss": 0.0269, + "step": 27846 + }, + { + "epoch": 0.55696, + "grad_norm": 0.15297484397888184, + "learning_rate": 9.757074067453224e-06, + "loss": 0.0256, + "step": 27848 + }, + { + "epoch": 0.557, + "grad_norm": 0.03902287408709526, + "learning_rate": 9.75567821847347e-06, + "loss": 0.0062, + "step": 27850 + }, + { + "epoch": 0.55704, + "grad_norm": 0.11382202804088593, + "learning_rate": 9.754282374256893e-06, + "loss": 0.0052, + "step": 27852 + }, + { + "epoch": 0.55708, + "grad_norm": 1.3527569770812988, + "learning_rate": 9.752886534830707e-06, + "loss": 0.0215, + "step": 27854 + }, + { + "epoch": 0.55712, + "grad_norm": 0.15838785469532013, + "learning_rate": 9.751490700222128e-06, + "loss": 0.006, + "step": 27856 + }, + { + "epoch": 0.55716, + "grad_norm": 0.08286938071250916, + "learning_rate": 9.75009487045836e-06, + "loss": 0.0016, + "step": 27858 + }, + { + "epoch": 0.5572, + "grad_norm": 0.002188511658459902, + "learning_rate": 9.748699045566626e-06, + "loss": 0.0021, + "step": 27860 + }, + { + "epoch": 0.55724, + "grad_norm": 0.03978179022669792, + "learning_rate": 9.747303225574134e-06, + "loss": 0.0036, + "step": 27862 + }, + { + "epoch": 0.55728, + "grad_norm": 0.8980177044868469, + "learning_rate": 9.74590741050809e-06, + "loss": 0.0134, + "step": 27864 + }, + { + "epoch": 0.55732, + "grad_norm": 0.023520490154623985, + "learning_rate": 9.744511600395718e-06, + "loss": 0.0085, + "step": 27866 + }, + { + "epoch": 0.55736, + "grad_norm": 0.0344727486371994, + "learning_rate": 9.743115795264222e-06, + "loss": 0.0192, + "step": 27868 + }, + { + "epoch": 0.5574, + "grad_norm": 2.460998773574829, + "learning_rate": 9.741719995140814e-06, + "loss": 0.0276, + "step": 27870 + }, + { + "epoch": 0.55744, + "grad_norm": 0.06928756833076477, + "learning_rate": 9.74032420005271e-06, + "loss": 0.0039, + "step": 27872 + }, + { + "epoch": 0.55748, + "grad_norm": 0.27635639905929565, + "learning_rate": 9.738928410027119e-06, + "loss": 0.0126, + "step": 27874 + }, + { + "epoch": 0.55752, + "grad_norm": 0.05948524922132492, + "learning_rate": 9.737532625091254e-06, + "loss": 0.0014, + "step": 27876 + }, + { + "epoch": 0.55756, + "grad_norm": 0.8727840185165405, + "learning_rate": 9.736136845272325e-06, + "loss": 0.0207, + "step": 27878 + }, + { + "epoch": 0.5576, + "grad_norm": 10.062636375427246, + "learning_rate": 9.73474107059754e-06, + "loss": 0.1733, + "step": 27880 + }, + { + "epoch": 0.55764, + "grad_norm": 0.17335271835327148, + "learning_rate": 9.73334530109412e-06, + "loss": 0.0113, + "step": 27882 + }, + { + "epoch": 0.55768, + "grad_norm": 0.11623515188694, + "learning_rate": 9.73194953678927e-06, + "loss": 0.0015, + "step": 27884 + }, + { + "epoch": 0.55772, + "grad_norm": 9.368587493896484, + "learning_rate": 9.730553777710196e-06, + "loss": 0.127, + "step": 27886 + }, + { + "epoch": 0.55776, + "grad_norm": 0.6483526825904846, + "learning_rate": 9.729158023884122e-06, + "loss": 0.0199, + "step": 27888 + }, + { + "epoch": 0.5578, + "grad_norm": 4.937281131744385, + "learning_rate": 9.727762275338246e-06, + "loss": 0.0749, + "step": 27890 + }, + { + "epoch": 0.55784, + "grad_norm": 14.8868989944458, + "learning_rate": 9.72636653209979e-06, + "loss": 0.3407, + "step": 27892 + }, + { + "epoch": 0.55788, + "grad_norm": 0.0392831452190876, + "learning_rate": 9.724970794195957e-06, + "loss": 0.0032, + "step": 27894 + }, + { + "epoch": 0.55792, + "grad_norm": 0.06026112288236618, + "learning_rate": 9.723575061653957e-06, + "loss": 0.0044, + "step": 27896 + }, + { + "epoch": 0.55796, + "grad_norm": 0.3608343005180359, + "learning_rate": 9.722179334501008e-06, + "loss": 0.0054, + "step": 27898 + }, + { + "epoch": 0.558, + "grad_norm": 0.2614648938179016, + "learning_rate": 9.720783612764314e-06, + "loss": 0.0163, + "step": 27900 + }, + { + "epoch": 0.55804, + "grad_norm": 0.30727818608283997, + "learning_rate": 9.719387896471084e-06, + "loss": 0.0074, + "step": 27902 + }, + { + "epoch": 0.55808, + "grad_norm": 0.07595589756965637, + "learning_rate": 9.717992185648537e-06, + "loss": 0.0013, + "step": 27904 + }, + { + "epoch": 0.55812, + "grad_norm": 0.16658702492713928, + "learning_rate": 9.716596480323875e-06, + "loss": 0.002, + "step": 27906 + }, + { + "epoch": 0.55816, + "grad_norm": 1.1799055337905884, + "learning_rate": 9.715200780524311e-06, + "loss": 0.0316, + "step": 27908 + }, + { + "epoch": 0.5582, + "grad_norm": 0.06891180574893951, + "learning_rate": 9.713805086277055e-06, + "loss": 0.0178, + "step": 27910 + }, + { + "epoch": 0.55824, + "grad_norm": 0.1100345030426979, + "learning_rate": 9.712409397609312e-06, + "loss": 0.0018, + "step": 27912 + }, + { + "epoch": 0.55828, + "grad_norm": 3.2146759033203125, + "learning_rate": 9.7110137145483e-06, + "loss": 0.1798, + "step": 27914 + }, + { + "epoch": 0.55832, + "grad_norm": 0.8063428997993469, + "learning_rate": 9.709618037121223e-06, + "loss": 0.0098, + "step": 27916 + }, + { + "epoch": 0.55836, + "grad_norm": 0.2913118898868561, + "learning_rate": 9.708222365355292e-06, + "loss": 0.0036, + "step": 27918 + }, + { + "epoch": 0.5584, + "grad_norm": 0.20612582564353943, + "learning_rate": 9.706826699277719e-06, + "loss": 0.3754, + "step": 27920 + }, + { + "epoch": 0.55844, + "grad_norm": 0.4143584668636322, + "learning_rate": 9.705431038915707e-06, + "loss": 0.0077, + "step": 27922 + }, + { + "epoch": 0.55848, + "grad_norm": 0.09916268289089203, + "learning_rate": 9.704035384296471e-06, + "loss": 0.0102, + "step": 27924 + }, + { + "epoch": 0.55852, + "grad_norm": 0.16190136969089508, + "learning_rate": 9.702639735447215e-06, + "loss": 0.0037, + "step": 27926 + }, + { + "epoch": 0.55856, + "grad_norm": 2.729367256164551, + "learning_rate": 9.701244092395152e-06, + "loss": 0.0796, + "step": 27928 + }, + { + "epoch": 0.5586, + "grad_norm": 0.8335127234458923, + "learning_rate": 9.699848455167489e-06, + "loss": 0.0084, + "step": 27930 + }, + { + "epoch": 0.55864, + "grad_norm": 0.3058096766471863, + "learning_rate": 9.698452823791433e-06, + "loss": 0.006, + "step": 27932 + }, + { + "epoch": 0.55868, + "grad_norm": 9.389960289001465, + "learning_rate": 9.697057198294196e-06, + "loss": 0.5325, + "step": 27934 + }, + { + "epoch": 0.55872, + "grad_norm": 16.99538803100586, + "learning_rate": 9.695661578702987e-06, + "loss": 0.8823, + "step": 27936 + }, + { + "epoch": 0.55876, + "grad_norm": 0.028203124180436134, + "learning_rate": 9.694265965045007e-06, + "loss": 0.002, + "step": 27938 + }, + { + "epoch": 0.5588, + "grad_norm": 0.03898847848176956, + "learning_rate": 9.692870357347474e-06, + "loss": 0.0008, + "step": 27940 + }, + { + "epoch": 0.55884, + "grad_norm": 0.1262374222278595, + "learning_rate": 9.691474755637592e-06, + "loss": 0.0018, + "step": 27942 + }, + { + "epoch": 0.55888, + "grad_norm": 3.5666403770446777, + "learning_rate": 9.690079159942566e-06, + "loss": 0.0719, + "step": 27944 + }, + { + "epoch": 0.55892, + "grad_norm": 0.26233044266700745, + "learning_rate": 9.688683570289607e-06, + "loss": 0.0036, + "step": 27946 + }, + { + "epoch": 0.55896, + "grad_norm": 0.013490596786141396, + "learning_rate": 9.68728798670592e-06, + "loss": 0.0005, + "step": 27948 + }, + { + "epoch": 0.559, + "grad_norm": 0.08990497887134552, + "learning_rate": 9.685892409218718e-06, + "loss": 0.0043, + "step": 27950 + }, + { + "epoch": 0.55904, + "grad_norm": 2.1161022186279297, + "learning_rate": 9.684496837855206e-06, + "loss": 0.0282, + "step": 27952 + }, + { + "epoch": 0.55908, + "grad_norm": 0.020155848935246468, + "learning_rate": 9.683101272642584e-06, + "loss": 0.0047, + "step": 27954 + }, + { + "epoch": 0.55912, + "grad_norm": 0.11466172337532043, + "learning_rate": 9.681705713608073e-06, + "loss": 0.0146, + "step": 27956 + }, + { + "epoch": 0.55916, + "grad_norm": 0.2921643853187561, + "learning_rate": 9.680310160778871e-06, + "loss": 0.0035, + "step": 27958 + }, + { + "epoch": 0.5592, + "grad_norm": 0.06234792247414589, + "learning_rate": 9.678914614182185e-06, + "loss": 0.0801, + "step": 27960 + }, + { + "epoch": 0.55924, + "grad_norm": 0.010948210023343563, + "learning_rate": 9.677519073845227e-06, + "loss": 0.0094, + "step": 27962 + }, + { + "epoch": 0.55928, + "grad_norm": 0.5774330496788025, + "learning_rate": 9.676123539795197e-06, + "loss": 0.0448, + "step": 27964 + }, + { + "epoch": 0.55932, + "grad_norm": 0.22203844785690308, + "learning_rate": 9.674728012059309e-06, + "loss": 0.4134, + "step": 27966 + }, + { + "epoch": 0.55936, + "grad_norm": 5.237298488616943, + "learning_rate": 9.673332490664765e-06, + "loss": 0.0787, + "step": 27968 + }, + { + "epoch": 0.5594, + "grad_norm": 1.1646064519882202, + "learning_rate": 9.671936975638768e-06, + "loss": 0.0305, + "step": 27970 + }, + { + "epoch": 0.55944, + "grad_norm": 0.016720907762646675, + "learning_rate": 9.670541467008534e-06, + "loss": 0.0007, + "step": 27972 + }, + { + "epoch": 0.55948, + "grad_norm": 0.17345932126045227, + "learning_rate": 9.669145964801262e-06, + "loss": 0.0054, + "step": 27974 + }, + { + "epoch": 0.55952, + "grad_norm": 0.19992846250534058, + "learning_rate": 9.66775046904416e-06, + "loss": 0.0298, + "step": 27976 + }, + { + "epoch": 0.55956, + "grad_norm": 7.357767105102539, + "learning_rate": 9.666354979764433e-06, + "loss": 0.1906, + "step": 27978 + }, + { + "epoch": 0.5596, + "grad_norm": 0.243364155292511, + "learning_rate": 9.664959496989286e-06, + "loss": 0.0069, + "step": 27980 + }, + { + "epoch": 0.55964, + "grad_norm": 1.2284674644470215, + "learning_rate": 9.663564020745927e-06, + "loss": 0.0151, + "step": 27982 + }, + { + "epoch": 0.55968, + "grad_norm": 0.04106824845075607, + "learning_rate": 9.662168551061564e-06, + "loss": 0.0012, + "step": 27984 + }, + { + "epoch": 0.55972, + "grad_norm": 0.029306307435035706, + "learning_rate": 9.660773087963393e-06, + "loss": 0.0076, + "step": 27986 + }, + { + "epoch": 0.55976, + "grad_norm": 0.38954854011535645, + "learning_rate": 9.65937763147863e-06, + "loss": 0.1807, + "step": 27988 + }, + { + "epoch": 0.5598, + "grad_norm": 0.07660063356161118, + "learning_rate": 9.657982181634476e-06, + "loss": 0.0141, + "step": 27990 + }, + { + "epoch": 0.55984, + "grad_norm": 0.1729685515165329, + "learning_rate": 9.656586738458133e-06, + "loss": 0.0025, + "step": 27992 + }, + { + "epoch": 0.55988, + "grad_norm": 9.516131401062012, + "learning_rate": 9.655191301976808e-06, + "loss": 0.1959, + "step": 27994 + }, + { + "epoch": 0.55992, + "grad_norm": 2.3472156524658203, + "learning_rate": 9.653795872217705e-06, + "loss": 0.0294, + "step": 27996 + }, + { + "epoch": 0.55996, + "grad_norm": 0.923475980758667, + "learning_rate": 9.652400449208034e-06, + "loss": 0.0101, + "step": 27998 + }, + { + "epoch": 0.56, + "grad_norm": 3.064631223678589, + "learning_rate": 9.651005032974994e-06, + "loss": 0.0407, + "step": 28000 + }, + { + "epoch": 0.56004, + "grad_norm": 0.0060301367193460464, + "learning_rate": 9.649609623545785e-06, + "loss": 0.0082, + "step": 28002 + }, + { + "epoch": 0.56008, + "grad_norm": 1.0289080142974854, + "learning_rate": 9.648214220947623e-06, + "loss": 0.0131, + "step": 28004 + }, + { + "epoch": 0.56012, + "grad_norm": 0.8580365180969238, + "learning_rate": 9.646818825207705e-06, + "loss": 0.0136, + "step": 28006 + }, + { + "epoch": 0.56016, + "grad_norm": 0.033235084265470505, + "learning_rate": 9.645423436353233e-06, + "loss": 0.0085, + "step": 28008 + }, + { + "epoch": 0.5602, + "grad_norm": 0.09387192130088806, + "learning_rate": 9.644028054411416e-06, + "loss": 0.0202, + "step": 28010 + }, + { + "epoch": 0.56024, + "grad_norm": 0.09294787794351578, + "learning_rate": 9.642632679409454e-06, + "loss": 0.0533, + "step": 28012 + }, + { + "epoch": 0.56028, + "grad_norm": 8.336915969848633, + "learning_rate": 9.641237311374554e-06, + "loss": 0.2534, + "step": 28014 + }, + { + "epoch": 0.56032, + "grad_norm": 0.25126969814300537, + "learning_rate": 9.639841950333919e-06, + "loss": 0.1655, + "step": 28016 + }, + { + "epoch": 0.56036, + "grad_norm": 0.09446287900209427, + "learning_rate": 9.638446596314744e-06, + "loss": 0.0249, + "step": 28018 + }, + { + "epoch": 0.5604, + "grad_norm": 1.3293390274047852, + "learning_rate": 9.637051249344244e-06, + "loss": 0.0215, + "step": 28020 + }, + { + "epoch": 0.56044, + "grad_norm": 2.5190415382385254, + "learning_rate": 9.635655909449611e-06, + "loss": 0.0553, + "step": 28022 + }, + { + "epoch": 0.56048, + "grad_norm": 7.917520523071289, + "learning_rate": 9.63426057665806e-06, + "loss": 0.138, + "step": 28024 + }, + { + "epoch": 0.56052, + "grad_norm": 0.14260348677635193, + "learning_rate": 9.632865250996787e-06, + "loss": 0.0037, + "step": 28026 + }, + { + "epoch": 0.56056, + "grad_norm": 1.6844749450683594, + "learning_rate": 9.631469932492992e-06, + "loss": 0.0213, + "step": 28028 + }, + { + "epoch": 0.5606, + "grad_norm": 0.028179774060845375, + "learning_rate": 9.630074621173882e-06, + "loss": 0.0242, + "step": 28030 + }, + { + "epoch": 0.56064, + "grad_norm": 0.003600007388740778, + "learning_rate": 9.628679317066659e-06, + "loss": 0.1896, + "step": 28032 + }, + { + "epoch": 0.56068, + "grad_norm": 5.6524200439453125, + "learning_rate": 9.62728402019852e-06, + "loss": 0.1068, + "step": 28034 + }, + { + "epoch": 0.56072, + "grad_norm": 0.5889772772789001, + "learning_rate": 9.625888730596674e-06, + "loss": 0.6178, + "step": 28036 + }, + { + "epoch": 0.56076, + "grad_norm": 1.345757007598877, + "learning_rate": 9.624493448288316e-06, + "loss": 0.0168, + "step": 28038 + }, + { + "epoch": 0.5608, + "grad_norm": 0.2329307645559311, + "learning_rate": 9.623098173300655e-06, + "loss": 0.004, + "step": 28040 + }, + { + "epoch": 0.56084, + "grad_norm": 0.007047729101032019, + "learning_rate": 9.62170290566089e-06, + "loss": 0.0011, + "step": 28042 + }, + { + "epoch": 0.56088, + "grad_norm": 0.14910395443439484, + "learning_rate": 9.62030764539622e-06, + "loss": 0.0044, + "step": 28044 + }, + { + "epoch": 0.56092, + "grad_norm": 0.07507620006799698, + "learning_rate": 9.61891239253385e-06, + "loss": 0.0156, + "step": 28046 + }, + { + "epoch": 0.56096, + "grad_norm": 0.14714406430721283, + "learning_rate": 9.617517147100977e-06, + "loss": 0.0173, + "step": 28048 + }, + { + "epoch": 0.561, + "grad_norm": 0.12323746830224991, + "learning_rate": 9.616121909124801e-06, + "loss": 0.0019, + "step": 28050 + }, + { + "epoch": 0.56104, + "grad_norm": 1.3413605690002441, + "learning_rate": 9.61472667863253e-06, + "loss": 0.0298, + "step": 28052 + }, + { + "epoch": 0.56108, + "grad_norm": 0.059414491057395935, + "learning_rate": 9.613331455651358e-06, + "loss": 0.0024, + "step": 28054 + }, + { + "epoch": 0.56112, + "grad_norm": 0.3268600106239319, + "learning_rate": 9.611936240208492e-06, + "loss": 0.0118, + "step": 28056 + }, + { + "epoch": 0.56116, + "grad_norm": 0.11073578149080276, + "learning_rate": 9.610541032331128e-06, + "loss": 0.0051, + "step": 28058 + }, + { + "epoch": 0.5612, + "grad_norm": 0.04547901451587677, + "learning_rate": 9.609145832046465e-06, + "loss": 0.007, + "step": 28060 + }, + { + "epoch": 0.56124, + "grad_norm": 12.102242469787598, + "learning_rate": 9.607750639381707e-06, + "loss": 0.6904, + "step": 28062 + }, + { + "epoch": 0.56128, + "grad_norm": 0.1789897084236145, + "learning_rate": 9.606355454364052e-06, + "loss": 0.0054, + "step": 28064 + }, + { + "epoch": 0.56132, + "grad_norm": 3.7873220443725586, + "learning_rate": 9.6049602770207e-06, + "loss": 0.0479, + "step": 28066 + }, + { + "epoch": 0.56136, + "grad_norm": 1.010331630706787, + "learning_rate": 9.603565107378851e-06, + "loss": 0.0402, + "step": 28068 + }, + { + "epoch": 0.5614, + "grad_norm": 0.03692992031574249, + "learning_rate": 9.602169945465702e-06, + "loss": 0.0024, + "step": 28070 + }, + { + "epoch": 0.56144, + "grad_norm": 0.1853279173374176, + "learning_rate": 9.600774791308457e-06, + "loss": 0.0271, + "step": 28072 + }, + { + "epoch": 0.56148, + "grad_norm": 0.07865256071090698, + "learning_rate": 9.599379644934314e-06, + "loss": 0.0157, + "step": 28074 + }, + { + "epoch": 0.56152, + "grad_norm": 0.20033441483974457, + "learning_rate": 9.597984506370467e-06, + "loss": 0.0159, + "step": 28076 + }, + { + "epoch": 0.56156, + "grad_norm": 1.2601325511932373, + "learning_rate": 9.596589375644124e-06, + "loss": 0.025, + "step": 28078 + }, + { + "epoch": 0.5616, + "grad_norm": 0.042887575924396515, + "learning_rate": 9.595194252782476e-06, + "loss": 0.0153, + "step": 28080 + }, + { + "epoch": 0.56164, + "grad_norm": 0.2292271852493286, + "learning_rate": 9.593799137812727e-06, + "loss": 0.0052, + "step": 28082 + }, + { + "epoch": 0.56168, + "grad_norm": 0.4187135696411133, + "learning_rate": 9.59240403076207e-06, + "loss": 0.0074, + "step": 28084 + }, + { + "epoch": 0.56172, + "grad_norm": 0.06290174275636673, + "learning_rate": 9.591008931657705e-06, + "loss": 0.0021, + "step": 28086 + }, + { + "epoch": 0.56176, + "grad_norm": 0.5193558931350708, + "learning_rate": 9.589613840526834e-06, + "loss": 0.0103, + "step": 28088 + }, + { + "epoch": 0.5618, + "grad_norm": 0.2940395772457123, + "learning_rate": 9.588218757396655e-06, + "loss": 0.2234, + "step": 28090 + }, + { + "epoch": 0.56184, + "grad_norm": 0.306414932012558, + "learning_rate": 9.586823682294357e-06, + "loss": 0.0213, + "step": 28092 + }, + { + "epoch": 0.56188, + "grad_norm": 1.5131957530975342, + "learning_rate": 9.585428615247148e-06, + "loss": 0.0243, + "step": 28094 + }, + { + "epoch": 0.56192, + "grad_norm": 1.3899258375167847, + "learning_rate": 9.584033556282222e-06, + "loss": 0.0257, + "step": 28096 + }, + { + "epoch": 0.56196, + "grad_norm": 0.7709293961524963, + "learning_rate": 9.582638505426776e-06, + "loss": 0.012, + "step": 28098 + }, + { + "epoch": 0.562, + "grad_norm": 0.22048762440681458, + "learning_rate": 9.581243462708007e-06, + "loss": 0.0404, + "step": 28100 + }, + { + "epoch": 0.56204, + "grad_norm": 0.07482000440359116, + "learning_rate": 9.579848428153107e-06, + "loss": 0.0085, + "step": 28102 + }, + { + "epoch": 0.56208, + "grad_norm": 0.9564638137817383, + "learning_rate": 9.578453401789285e-06, + "loss": 0.1204, + "step": 28104 + }, + { + "epoch": 0.56212, + "grad_norm": 0.18601533770561218, + "learning_rate": 9.577058383643731e-06, + "loss": 0.0033, + "step": 28106 + }, + { + "epoch": 0.56216, + "grad_norm": 0.008574767038226128, + "learning_rate": 9.575663373743636e-06, + "loss": 0.0016, + "step": 28108 + }, + { + "epoch": 0.5622, + "grad_norm": 0.4828856885433197, + "learning_rate": 9.574268372116205e-06, + "loss": 0.0121, + "step": 28110 + }, + { + "epoch": 0.56224, + "grad_norm": 0.021926559507846832, + "learning_rate": 9.572873378788633e-06, + "loss": 0.0048, + "step": 28112 + }, + { + "epoch": 0.56228, + "grad_norm": 0.12970907986164093, + "learning_rate": 9.571478393788112e-06, + "loss": 0.0038, + "step": 28114 + }, + { + "epoch": 0.56232, + "grad_norm": 0.27953603863716125, + "learning_rate": 9.570083417141843e-06, + "loss": 0.0059, + "step": 28116 + }, + { + "epoch": 0.56236, + "grad_norm": 1.6512665748596191, + "learning_rate": 9.568688448877015e-06, + "loss": 0.0468, + "step": 28118 + }, + { + "epoch": 0.5624, + "grad_norm": 1.0006870031356812, + "learning_rate": 9.567293489020831e-06, + "loss": 0.0129, + "step": 28120 + }, + { + "epoch": 0.56244, + "grad_norm": 0.3323831260204315, + "learning_rate": 9.565898537600484e-06, + "loss": 0.0091, + "step": 28122 + }, + { + "epoch": 0.56248, + "grad_norm": 0.12180417031049728, + "learning_rate": 9.564503594643163e-06, + "loss": 0.0118, + "step": 28124 + }, + { + "epoch": 0.56252, + "grad_norm": 2.178788423538208, + "learning_rate": 9.563108660176075e-06, + "loss": 0.476, + "step": 28126 + }, + { + "epoch": 0.56256, + "grad_norm": 0.19712476432323456, + "learning_rate": 9.561713734226405e-06, + "loss": 0.0407, + "step": 28128 + }, + { + "epoch": 0.5626, + "grad_norm": 0.14703622460365295, + "learning_rate": 9.560318816821354e-06, + "loss": 0.0076, + "step": 28130 + }, + { + "epoch": 0.56264, + "grad_norm": 1.3873039484024048, + "learning_rate": 9.55892390798811e-06, + "loss": 0.1689, + "step": 28132 + }, + { + "epoch": 0.56268, + "grad_norm": 1.7973332405090332, + "learning_rate": 9.557529007753874e-06, + "loss": 0.0417, + "step": 28134 + }, + { + "epoch": 0.56272, + "grad_norm": 0.010082075372338295, + "learning_rate": 9.556134116145838e-06, + "loss": 0.0021, + "step": 28136 + }, + { + "epoch": 0.56276, + "grad_norm": 0.3435744345188141, + "learning_rate": 9.554739233191198e-06, + "loss": 0.0065, + "step": 28138 + }, + { + "epoch": 0.5628, + "grad_norm": 0.0363922119140625, + "learning_rate": 9.553344358917141e-06, + "loss": 0.0425, + "step": 28140 + }, + { + "epoch": 0.56284, + "grad_norm": 0.06063404306769371, + "learning_rate": 9.551949493350869e-06, + "loss": 0.0038, + "step": 28142 + }, + { + "epoch": 0.56288, + "grad_norm": 0.06169329211115837, + "learning_rate": 9.550554636519568e-06, + "loss": 0.001, + "step": 28144 + }, + { + "epoch": 0.56292, + "grad_norm": 0.030089087784290314, + "learning_rate": 9.54915978845044e-06, + "loss": 0.0965, + "step": 28146 + }, + { + "epoch": 0.56296, + "grad_norm": 0.029353126883506775, + "learning_rate": 9.547764949170676e-06, + "loss": 0.0008, + "step": 28148 + }, + { + "epoch": 0.563, + "grad_norm": 0.01829596608877182, + "learning_rate": 9.546370118707463e-06, + "loss": 0.0004, + "step": 28150 + }, + { + "epoch": 0.56304, + "grad_norm": 0.0573263019323349, + "learning_rate": 9.544975297088e-06, + "loss": 0.0037, + "step": 28152 + }, + { + "epoch": 0.56308, + "grad_norm": 0.025941206142306328, + "learning_rate": 9.54358048433948e-06, + "loss": 0.0242, + "step": 28154 + }, + { + "epoch": 0.56312, + "grad_norm": 8.525765419006348, + "learning_rate": 9.542185680489087e-06, + "loss": 0.1225, + "step": 28156 + }, + { + "epoch": 0.56316, + "grad_norm": 0.18976950645446777, + "learning_rate": 9.540790885564025e-06, + "loss": 0.0034, + "step": 28158 + }, + { + "epoch": 0.5632, + "grad_norm": 0.07840930670499802, + "learning_rate": 9.539396099591477e-06, + "loss": 0.0432, + "step": 28160 + }, + { + "epoch": 0.56324, + "grad_norm": 0.486000120639801, + "learning_rate": 9.538001322598644e-06, + "loss": 0.0261, + "step": 28162 + }, + { + "epoch": 0.56328, + "grad_norm": 11.118730545043945, + "learning_rate": 9.536606554612712e-06, + "loss": 0.3548, + "step": 28164 + }, + { + "epoch": 0.56332, + "grad_norm": 2.99885630607605, + "learning_rate": 9.535211795660872e-06, + "loss": 0.0476, + "step": 28166 + }, + { + "epoch": 0.56336, + "grad_norm": 0.020622195675969124, + "learning_rate": 9.53381704577032e-06, + "loss": 0.0004, + "step": 28168 + }, + { + "epoch": 0.5634, + "grad_norm": 0.1989106833934784, + "learning_rate": 9.532422304968243e-06, + "loss": 0.0049, + "step": 28170 + }, + { + "epoch": 0.56344, + "grad_norm": 0.6044005155563354, + "learning_rate": 9.53102757328183e-06, + "loss": 0.2929, + "step": 28172 + }, + { + "epoch": 0.56348, + "grad_norm": 0.11110671609640121, + "learning_rate": 9.529632850738281e-06, + "loss": 0.0019, + "step": 28174 + }, + { + "epoch": 0.56352, + "grad_norm": 0.05198169872164726, + "learning_rate": 9.528238137364778e-06, + "loss": 0.0054, + "step": 28176 + }, + { + "epoch": 0.56356, + "grad_norm": 0.3785227835178375, + "learning_rate": 9.52684343318852e-06, + "loss": 0.0077, + "step": 28178 + }, + { + "epoch": 0.5636, + "grad_norm": 0.11121510714292526, + "learning_rate": 9.525448738236691e-06, + "loss": 0.0064, + "step": 28180 + }, + { + "epoch": 0.56364, + "grad_norm": 0.9995467662811279, + "learning_rate": 9.524054052536482e-06, + "loss": 0.0252, + "step": 28182 + }, + { + "epoch": 0.56368, + "grad_norm": 0.04430152103304863, + "learning_rate": 9.522659376115086e-06, + "loss": 0.0009, + "step": 28184 + }, + { + "epoch": 0.56372, + "grad_norm": 5.814423561096191, + "learning_rate": 9.521264708999692e-06, + "loss": 0.1343, + "step": 28186 + }, + { + "epoch": 0.56376, + "grad_norm": 0.012110461480915546, + "learning_rate": 9.519870051217483e-06, + "loss": 0.0004, + "step": 28188 + }, + { + "epoch": 0.5638, + "grad_norm": 0.6013237833976746, + "learning_rate": 9.518475402795661e-06, + "loss": 0.0083, + "step": 28190 + }, + { + "epoch": 0.56384, + "grad_norm": 0.08650899678468704, + "learning_rate": 9.517080763761404e-06, + "loss": 0.0028, + "step": 28192 + }, + { + "epoch": 0.56388, + "grad_norm": 0.01995227299630642, + "learning_rate": 9.51568613414191e-06, + "loss": 0.0005, + "step": 28194 + }, + { + "epoch": 0.56392, + "grad_norm": 2.8014907836914062, + "learning_rate": 9.514291513964365e-06, + "loss": 0.0525, + "step": 28196 + }, + { + "epoch": 0.56396, + "grad_norm": 0.023219365626573563, + "learning_rate": 9.512896903255954e-06, + "loss": 0.0006, + "step": 28198 + }, + { + "epoch": 0.564, + "grad_norm": 0.007133812643587589, + "learning_rate": 9.511502302043867e-06, + "loss": 0.0025, + "step": 28200 + }, + { + "epoch": 0.56404, + "grad_norm": 1.4305708408355713, + "learning_rate": 9.510107710355299e-06, + "loss": 0.0216, + "step": 28202 + }, + { + "epoch": 0.56408, + "grad_norm": 0.44853419065475464, + "learning_rate": 9.50871312821743e-06, + "loss": 0.0196, + "step": 28204 + }, + { + "epoch": 0.56412, + "grad_norm": 0.3856588900089264, + "learning_rate": 9.507318555657454e-06, + "loss": 0.0075, + "step": 28206 + }, + { + "epoch": 0.56416, + "grad_norm": 0.5895951986312866, + "learning_rate": 9.505923992702553e-06, + "loss": 0.0117, + "step": 28208 + }, + { + "epoch": 0.5642, + "grad_norm": 0.09981171041727066, + "learning_rate": 9.504529439379921e-06, + "loss": 0.0037, + "step": 28210 + }, + { + "epoch": 0.56424, + "grad_norm": 10.915987014770508, + "learning_rate": 9.503134895716744e-06, + "loss": 1.038, + "step": 28212 + }, + { + "epoch": 0.56428, + "grad_norm": 0.002492685802280903, + "learning_rate": 9.501740361740201e-06, + "loss": 0.0031, + "step": 28214 + }, + { + "epoch": 0.56432, + "grad_norm": 0.7053931951522827, + "learning_rate": 9.500345837477493e-06, + "loss": 0.0848, + "step": 28216 + }, + { + "epoch": 0.56436, + "grad_norm": 0.02431447058916092, + "learning_rate": 9.4989513229558e-06, + "loss": 0.0018, + "step": 28218 + }, + { + "epoch": 0.5644, + "grad_norm": 0.24176663160324097, + "learning_rate": 9.497556818202306e-06, + "loss": 0.0331, + "step": 28220 + }, + { + "epoch": 0.56444, + "grad_norm": 0.2054692804813385, + "learning_rate": 9.496162323244202e-06, + "loss": 0.0026, + "step": 28222 + }, + { + "epoch": 0.56448, + "grad_norm": 0.1164097785949707, + "learning_rate": 9.49476783810867e-06, + "loss": 0.0021, + "step": 28224 + }, + { + "epoch": 0.56452, + "grad_norm": 0.11546561121940613, + "learning_rate": 9.493373362822903e-06, + "loss": 0.0121, + "step": 28226 + }, + { + "epoch": 0.56456, + "grad_norm": 0.021723726764321327, + "learning_rate": 9.491978897414084e-06, + "loss": 0.027, + "step": 28228 + }, + { + "epoch": 0.5646, + "grad_norm": 0.06611990183591843, + "learning_rate": 9.490584441909392e-06, + "loss": 0.0197, + "step": 28230 + }, + { + "epoch": 0.56464, + "grad_norm": 1.0894352197647095, + "learning_rate": 9.489189996336023e-06, + "loss": 0.0266, + "step": 28232 + }, + { + "epoch": 0.56468, + "grad_norm": 0.23572991788387299, + "learning_rate": 9.487795560721156e-06, + "loss": 0.3765, + "step": 28234 + }, + { + "epoch": 0.56472, + "grad_norm": 0.07017603516578674, + "learning_rate": 9.486401135091981e-06, + "loss": 0.0015, + "step": 28236 + }, + { + "epoch": 0.56476, + "grad_norm": 0.35189127922058105, + "learning_rate": 9.485006719475682e-06, + "loss": 0.0046, + "step": 28238 + }, + { + "epoch": 0.5648, + "grad_norm": 0.010611198842525482, + "learning_rate": 9.483612313899436e-06, + "loss": 0.0026, + "step": 28240 + }, + { + "epoch": 0.56484, + "grad_norm": 0.07492072880268097, + "learning_rate": 9.482217918390437e-06, + "loss": 0.016, + "step": 28242 + }, + { + "epoch": 0.56488, + "grad_norm": 0.021082434803247452, + "learning_rate": 9.480823532975867e-06, + "loss": 0.0007, + "step": 28244 + }, + { + "epoch": 0.56492, + "grad_norm": 0.03063320554792881, + "learning_rate": 9.479429157682905e-06, + "loss": 0.0021, + "step": 28246 + }, + { + "epoch": 0.56496, + "grad_norm": 1.3374626636505127, + "learning_rate": 9.478034792538745e-06, + "loss": 0.0151, + "step": 28248 + }, + { + "epoch": 0.565, + "grad_norm": 2.665524959564209, + "learning_rate": 9.476640437570562e-06, + "loss": 0.0527, + "step": 28250 + }, + { + "epoch": 0.56504, + "grad_norm": 0.0473046600818634, + "learning_rate": 9.475246092805547e-06, + "loss": 0.0652, + "step": 28252 + }, + { + "epoch": 0.56508, + "grad_norm": 3.2645516395568848, + "learning_rate": 9.473851758270877e-06, + "loss": 0.0734, + "step": 28254 + }, + { + "epoch": 0.56512, + "grad_norm": 0.16001230478286743, + "learning_rate": 9.472457433993736e-06, + "loss": 0.036, + "step": 28256 + }, + { + "epoch": 0.56516, + "grad_norm": 0.13267189264297485, + "learning_rate": 9.471063120001313e-06, + "loss": 0.002, + "step": 28258 + }, + { + "epoch": 0.5652, + "grad_norm": 0.29664328694343567, + "learning_rate": 9.469668816320785e-06, + "loss": 0.0098, + "step": 28260 + }, + { + "epoch": 0.56524, + "grad_norm": 0.10977911949157715, + "learning_rate": 9.468274522979334e-06, + "loss": 0.0121, + "step": 28262 + }, + { + "epoch": 0.56528, + "grad_norm": 1.4720417261123657, + "learning_rate": 9.466880240004149e-06, + "loss": 0.0239, + "step": 28264 + }, + { + "epoch": 0.56532, + "grad_norm": 0.023519722744822502, + "learning_rate": 9.465485967422404e-06, + "loss": 0.0012, + "step": 28266 + }, + { + "epoch": 0.56536, + "grad_norm": 0.08906776458024979, + "learning_rate": 9.464091705261287e-06, + "loss": 0.0014, + "step": 28268 + }, + { + "epoch": 0.5654, + "grad_norm": 0.004061401356011629, + "learning_rate": 9.46269745354798e-06, + "loss": 0.0421, + "step": 28270 + }, + { + "epoch": 0.56544, + "grad_norm": 0.01288817822933197, + "learning_rate": 9.46130321230966e-06, + "loss": 0.0586, + "step": 28272 + }, + { + "epoch": 0.56548, + "grad_norm": 0.09373671561479568, + "learning_rate": 9.459908981573515e-06, + "loss": 0.1278, + "step": 28274 + }, + { + "epoch": 0.56552, + "grad_norm": 0.7754537463188171, + "learning_rate": 9.458514761366721e-06, + "loss": 0.0538, + "step": 28276 + }, + { + "epoch": 0.56556, + "grad_norm": 0.28161293268203735, + "learning_rate": 9.457120551716456e-06, + "loss": 0.1936, + "step": 28278 + }, + { + "epoch": 0.5656, + "grad_norm": 0.024497080594301224, + "learning_rate": 9.45572635264991e-06, + "loss": 0.0027, + "step": 28280 + }, + { + "epoch": 0.56564, + "grad_norm": 0.06572854518890381, + "learning_rate": 9.454332164194255e-06, + "loss": 0.0011, + "step": 28282 + }, + { + "epoch": 0.56568, + "grad_norm": 0.11392145603895187, + "learning_rate": 9.45293798637668e-06, + "loss": 0.0026, + "step": 28284 + }, + { + "epoch": 0.56572, + "grad_norm": 0.005495155230164528, + "learning_rate": 9.45154381922436e-06, + "loss": 0.0213, + "step": 28286 + }, + { + "epoch": 0.56576, + "grad_norm": 0.014011748135089874, + "learning_rate": 9.450149662764474e-06, + "loss": 0.6088, + "step": 28288 + }, + { + "epoch": 0.5658, + "grad_norm": 0.6378273963928223, + "learning_rate": 9.448755517024207e-06, + "loss": 0.0653, + "step": 28290 + }, + { + "epoch": 0.56584, + "grad_norm": 0.020406188443303108, + "learning_rate": 9.447361382030731e-06, + "loss": 0.0004, + "step": 28292 + }, + { + "epoch": 0.56588, + "grad_norm": 0.051727838814258575, + "learning_rate": 9.445967257811229e-06, + "loss": 0.0174, + "step": 28294 + }, + { + "epoch": 0.56592, + "grad_norm": 0.04515519365668297, + "learning_rate": 9.444573144392882e-06, + "loss": 0.0023, + "step": 28296 + }, + { + "epoch": 0.56596, + "grad_norm": 0.09763102978467941, + "learning_rate": 9.443179041802865e-06, + "loss": 0.0026, + "step": 28298 + }, + { + "epoch": 0.566, + "grad_norm": 0.25552889704704285, + "learning_rate": 9.441784950068362e-06, + "loss": 0.0042, + "step": 28300 + }, + { + "epoch": 0.56604, + "grad_norm": 11.419285774230957, + "learning_rate": 9.44039086921655e-06, + "loss": 0.302, + "step": 28302 + }, + { + "epoch": 0.56608, + "grad_norm": 0.14480353891849518, + "learning_rate": 9.438996799274604e-06, + "loss": 0.0086, + "step": 28304 + }, + { + "epoch": 0.56612, + "grad_norm": 0.9236003756523132, + "learning_rate": 9.437602740269706e-06, + "loss": 0.0128, + "step": 28306 + }, + { + "epoch": 0.56616, + "grad_norm": 0.052394378930330276, + "learning_rate": 9.436208692229031e-06, + "loss": 0.0014, + "step": 28308 + }, + { + "epoch": 0.5662, + "grad_norm": 0.025236796587705612, + "learning_rate": 9.434814655179756e-06, + "loss": 0.0027, + "step": 28310 + }, + { + "epoch": 0.56624, + "grad_norm": 1.7090152502059937, + "learning_rate": 9.433420629149063e-06, + "loss": 0.0457, + "step": 28312 + }, + { + "epoch": 0.56628, + "grad_norm": 0.2730117738246918, + "learning_rate": 9.432026614164122e-06, + "loss": 0.1532, + "step": 28314 + }, + { + "epoch": 0.56632, + "grad_norm": 0.211952805519104, + "learning_rate": 9.430632610252119e-06, + "loss": 0.0036, + "step": 28316 + }, + { + "epoch": 0.56636, + "grad_norm": 0.24282465875148773, + "learning_rate": 9.429238617440227e-06, + "loss": 0.0269, + "step": 28318 + }, + { + "epoch": 0.5664, + "grad_norm": 1.959204912185669, + "learning_rate": 9.42784463575562e-06, + "loss": 0.0314, + "step": 28320 + }, + { + "epoch": 0.56644, + "grad_norm": 0.5950428247451782, + "learning_rate": 9.426450665225479e-06, + "loss": 0.0083, + "step": 28322 + }, + { + "epoch": 0.56648, + "grad_norm": 0.00948792602866888, + "learning_rate": 9.425056705876974e-06, + "loss": 0.0058, + "step": 28324 + }, + { + "epoch": 0.56652, + "grad_norm": 0.09059219807386398, + "learning_rate": 9.423662757737286e-06, + "loss": 0.0022, + "step": 28326 + }, + { + "epoch": 0.56656, + "grad_norm": 0.27598652243614197, + "learning_rate": 9.422268820833588e-06, + "loss": 0.0152, + "step": 28328 + }, + { + "epoch": 0.5666, + "grad_norm": 1.1832337379455566, + "learning_rate": 9.420874895193056e-06, + "loss": 0.0176, + "step": 28330 + }, + { + "epoch": 0.56664, + "grad_norm": 4.906232833862305, + "learning_rate": 9.41948098084287e-06, + "loss": 0.1106, + "step": 28332 + }, + { + "epoch": 0.56668, + "grad_norm": 0.034737780690193176, + "learning_rate": 9.4180870778102e-06, + "loss": 0.0007, + "step": 28334 + }, + { + "epoch": 0.56672, + "grad_norm": 2.0798752307891846, + "learning_rate": 9.416693186122217e-06, + "loss": 0.032, + "step": 28336 + }, + { + "epoch": 0.56676, + "grad_norm": 0.5762849450111389, + "learning_rate": 9.415299305806107e-06, + "loss": 0.0146, + "step": 28338 + }, + { + "epoch": 0.5668, + "grad_norm": 3.657274007797241, + "learning_rate": 9.413905436889035e-06, + "loss": 0.0601, + "step": 28340 + }, + { + "epoch": 0.56684, + "grad_norm": 3.2246251106262207, + "learning_rate": 9.41251157939818e-06, + "loss": 0.0554, + "step": 28342 + }, + { + "epoch": 0.56688, + "grad_norm": 1.0210002660751343, + "learning_rate": 9.411117733360714e-06, + "loss": 0.0152, + "step": 28344 + }, + { + "epoch": 0.56692, + "grad_norm": 0.0326303131878376, + "learning_rate": 9.409723898803808e-06, + "loss": 0.0014, + "step": 28346 + }, + { + "epoch": 0.56696, + "grad_norm": 0.08367007225751877, + "learning_rate": 9.408330075754642e-06, + "loss": 0.3929, + "step": 28348 + }, + { + "epoch": 0.567, + "grad_norm": 7.019062042236328, + "learning_rate": 9.406936264240386e-06, + "loss": 0.1314, + "step": 28350 + }, + { + "epoch": 0.56704, + "grad_norm": 2.9681236743927, + "learning_rate": 9.405542464288208e-06, + "loss": 0.0493, + "step": 28352 + }, + { + "epoch": 0.56708, + "grad_norm": 1.6203856468200684, + "learning_rate": 9.40414867592529e-06, + "loss": 0.0218, + "step": 28354 + }, + { + "epoch": 0.56712, + "grad_norm": 0.02039504237473011, + "learning_rate": 9.4027548991788e-06, + "loss": 0.0015, + "step": 28356 + }, + { + "epoch": 0.56716, + "grad_norm": 0.046185240149497986, + "learning_rate": 9.401361134075911e-06, + "loss": 0.0089, + "step": 28358 + }, + { + "epoch": 0.5672, + "grad_norm": 0.08438848704099655, + "learning_rate": 9.399967380643795e-06, + "loss": 0.0068, + "step": 28360 + }, + { + "epoch": 0.56724, + "grad_norm": 0.2865337133407593, + "learning_rate": 9.39857363890962e-06, + "loss": 0.0273, + "step": 28362 + }, + { + "epoch": 0.56728, + "grad_norm": 0.14515264332294464, + "learning_rate": 9.397179908900567e-06, + "loss": 0.0491, + "step": 28364 + }, + { + "epoch": 0.56732, + "grad_norm": 4.572182655334473, + "learning_rate": 9.3957861906438e-06, + "loss": 0.1023, + "step": 28366 + }, + { + "epoch": 0.56736, + "grad_norm": 0.18607258796691895, + "learning_rate": 9.39439248416649e-06, + "loss": 0.0071, + "step": 28368 + }, + { + "epoch": 0.5674, + "grad_norm": 0.06674949824810028, + "learning_rate": 9.392998789495813e-06, + "loss": 0.0347, + "step": 28370 + }, + { + "epoch": 0.56744, + "grad_norm": 0.0822567567229271, + "learning_rate": 9.391605106658935e-06, + "loss": 0.0226, + "step": 28372 + }, + { + "epoch": 0.56748, + "grad_norm": 0.20905061066150665, + "learning_rate": 9.390211435683032e-06, + "loss": 0.0363, + "step": 28374 + }, + { + "epoch": 0.56752, + "grad_norm": 0.0090031074360013, + "learning_rate": 9.38881777659527e-06, + "loss": 0.002, + "step": 28376 + }, + { + "epoch": 0.56756, + "grad_norm": 0.17082449793815613, + "learning_rate": 9.387424129422816e-06, + "loss": 0.0062, + "step": 28378 + }, + { + "epoch": 0.5676, + "grad_norm": 0.2668822407722473, + "learning_rate": 9.386030494192847e-06, + "loss": 0.0056, + "step": 28380 + }, + { + "epoch": 0.56764, + "grad_norm": 0.018237493932247162, + "learning_rate": 9.38463687093253e-06, + "loss": 0.0041, + "step": 28382 + }, + { + "epoch": 0.56768, + "grad_norm": 8.036276817321777, + "learning_rate": 9.383243259669032e-06, + "loss": 0.1169, + "step": 28384 + }, + { + "epoch": 0.56772, + "grad_norm": 0.1226920485496521, + "learning_rate": 9.381849660429527e-06, + "loss": 0.0027, + "step": 28386 + }, + { + "epoch": 0.56776, + "grad_norm": 0.906539261341095, + "learning_rate": 9.38045607324118e-06, + "loss": 0.0131, + "step": 28388 + }, + { + "epoch": 0.5678, + "grad_norm": 0.07044260948896408, + "learning_rate": 9.379062498131161e-06, + "loss": 0.0019, + "step": 28390 + }, + { + "epoch": 0.56784, + "grad_norm": 0.026711275801062584, + "learning_rate": 9.377668935126636e-06, + "loss": 0.0192, + "step": 28392 + }, + { + "epoch": 0.56788, + "grad_norm": 0.6670635342597961, + "learning_rate": 9.376275384254777e-06, + "loss": 0.0394, + "step": 28394 + }, + { + "epoch": 0.56792, + "grad_norm": 1.040916919708252, + "learning_rate": 9.374881845542752e-06, + "loss": 0.0117, + "step": 28396 + }, + { + "epoch": 0.56796, + "grad_norm": 0.022619228810071945, + "learning_rate": 9.373488319017727e-06, + "loss": 0.0241, + "step": 28398 + }, + { + "epoch": 0.568, + "grad_norm": 0.018735740333795547, + "learning_rate": 9.372094804706867e-06, + "loss": 0.0021, + "step": 28400 + }, + { + "epoch": 0.56804, + "grad_norm": 0.06694286316633224, + "learning_rate": 9.370701302637345e-06, + "loss": 0.0114, + "step": 28402 + }, + { + "epoch": 0.56808, + "grad_norm": 0.45961180329322815, + "learning_rate": 9.36930781283632e-06, + "loss": 0.0106, + "step": 28404 + }, + { + "epoch": 0.56812, + "grad_norm": 0.010538251139223576, + "learning_rate": 9.36791433533097e-06, + "loss": 0.0215, + "step": 28406 + }, + { + "epoch": 0.56816, + "grad_norm": 0.05770508572459221, + "learning_rate": 9.366520870148455e-06, + "loss": 0.0021, + "step": 28408 + }, + { + "epoch": 0.5682, + "grad_norm": 5.4650468826293945, + "learning_rate": 9.36512741731594e-06, + "loss": 0.065, + "step": 28410 + }, + { + "epoch": 0.56824, + "grad_norm": 0.12817339599132538, + "learning_rate": 9.363733976860594e-06, + "loss": 0.0071, + "step": 28412 + }, + { + "epoch": 0.56828, + "grad_norm": 0.00063539162511006, + "learning_rate": 9.362340548809582e-06, + "loss": 0.1378, + "step": 28414 + }, + { + "epoch": 0.56832, + "grad_norm": 0.14695312082767487, + "learning_rate": 9.360947133190065e-06, + "loss": 0.3561, + "step": 28416 + }, + { + "epoch": 0.56836, + "grad_norm": 0.11721087247133255, + "learning_rate": 9.359553730029217e-06, + "loss": 0.0102, + "step": 28418 + }, + { + "epoch": 0.5684, + "grad_norm": 0.026254547759890556, + "learning_rate": 9.358160339354194e-06, + "loss": 0.0054, + "step": 28420 + }, + { + "epoch": 0.56844, + "grad_norm": 0.09471387416124344, + "learning_rate": 9.35676696119217e-06, + "loss": 0.02, + "step": 28422 + }, + { + "epoch": 0.56848, + "grad_norm": 0.4904533922672272, + "learning_rate": 9.355373595570307e-06, + "loss": 0.0113, + "step": 28424 + }, + { + "epoch": 0.56852, + "grad_norm": 0.027951253578066826, + "learning_rate": 9.353980242515764e-06, + "loss": 0.2507, + "step": 28426 + }, + { + "epoch": 0.56856, + "grad_norm": 1.783734679222107, + "learning_rate": 9.35258690205571e-06, + "loss": 0.021, + "step": 28428 + }, + { + "epoch": 0.5686, + "grad_norm": 0.06649097800254822, + "learning_rate": 9.351193574217305e-06, + "loss": 0.0039, + "step": 28430 + }, + { + "epoch": 0.56864, + "grad_norm": 0.006856963504105806, + "learning_rate": 9.34980025902772e-06, + "loss": 0.002, + "step": 28432 + }, + { + "epoch": 0.56868, + "grad_norm": 2.428537368774414, + "learning_rate": 9.34840695651411e-06, + "loss": 0.0378, + "step": 28434 + }, + { + "epoch": 0.56872, + "grad_norm": 0.05525505170226097, + "learning_rate": 9.347013666703642e-06, + "loss": 0.0042, + "step": 28436 + }, + { + "epoch": 0.56876, + "grad_norm": 1.926811933517456, + "learning_rate": 9.34562038962348e-06, + "loss": 0.0192, + "step": 28438 + }, + { + "epoch": 0.5688, + "grad_norm": 0.5147356986999512, + "learning_rate": 9.344227125300788e-06, + "loss": 0.0068, + "step": 28440 + }, + { + "epoch": 0.56884, + "grad_norm": 0.12774626910686493, + "learning_rate": 9.342833873762722e-06, + "loss": 0.0023, + "step": 28442 + }, + { + "epoch": 0.56888, + "grad_norm": 0.621437132358551, + "learning_rate": 9.341440635036451e-06, + "loss": 0.0113, + "step": 28444 + }, + { + "epoch": 0.56892, + "grad_norm": 0.06797731667757034, + "learning_rate": 9.340047409149128e-06, + "loss": 0.0883, + "step": 28446 + }, + { + "epoch": 0.56896, + "grad_norm": 0.010279195383191109, + "learning_rate": 9.338654196127926e-06, + "loss": 0.0014, + "step": 28448 + }, + { + "epoch": 0.569, + "grad_norm": 1.5996688604354858, + "learning_rate": 9.337260996000002e-06, + "loss": 0.1003, + "step": 28450 + }, + { + "epoch": 0.56904, + "grad_norm": 9.188528060913086, + "learning_rate": 9.33586780879251e-06, + "loss": 0.1066, + "step": 28452 + }, + { + "epoch": 0.56908, + "grad_norm": 0.2251138836145401, + "learning_rate": 9.334474634532622e-06, + "loss": 0.0032, + "step": 28454 + }, + { + "epoch": 0.56912, + "grad_norm": 0.013592283241450787, + "learning_rate": 9.333081473247493e-06, + "loss": 0.0003, + "step": 28456 + }, + { + "epoch": 0.56916, + "grad_norm": 0.08405624330043793, + "learning_rate": 9.331688324964281e-06, + "loss": 0.002, + "step": 28458 + }, + { + "epoch": 0.5692, + "grad_norm": 5.54563045501709, + "learning_rate": 9.330295189710153e-06, + "loss": 0.0977, + "step": 28460 + }, + { + "epoch": 0.56924, + "grad_norm": 0.1361892968416214, + "learning_rate": 9.328902067512262e-06, + "loss": 0.2683, + "step": 28462 + }, + { + "epoch": 0.56928, + "grad_norm": 1.6002477407455444, + "learning_rate": 9.327508958397773e-06, + "loss": 0.3713, + "step": 28464 + }, + { + "epoch": 0.56932, + "grad_norm": 0.04347389563918114, + "learning_rate": 9.326115862393844e-06, + "loss": 0.0009, + "step": 28466 + }, + { + "epoch": 0.56936, + "grad_norm": 0.014182020910084248, + "learning_rate": 9.324722779527629e-06, + "loss": 0.0269, + "step": 28468 + }, + { + "epoch": 0.5694, + "grad_norm": 1.8587437868118286, + "learning_rate": 9.323329709826294e-06, + "loss": 0.0215, + "step": 28470 + }, + { + "epoch": 0.56944, + "grad_norm": 1.005372166633606, + "learning_rate": 9.321936653316995e-06, + "loss": 0.0218, + "step": 28472 + }, + { + "epoch": 0.56948, + "grad_norm": 0.14193716645240784, + "learning_rate": 9.320543610026887e-06, + "loss": 0.0016, + "step": 28474 + }, + { + "epoch": 0.56952, + "grad_norm": 6.477029323577881, + "learning_rate": 9.319150579983134e-06, + "loss": 0.1212, + "step": 28476 + }, + { + "epoch": 0.56956, + "grad_norm": 4.331614017486572, + "learning_rate": 9.31775756321289e-06, + "loss": 0.0582, + "step": 28478 + }, + { + "epoch": 0.5696, + "grad_norm": 13.788094520568848, + "learning_rate": 9.316364559743315e-06, + "loss": 0.6296, + "step": 28480 + }, + { + "epoch": 0.56964, + "grad_norm": 1.9945893287658691, + "learning_rate": 9.314971569601565e-06, + "loss": 0.1404, + "step": 28482 + }, + { + "epoch": 0.56968, + "grad_norm": 4.544174671173096, + "learning_rate": 9.313578592814792e-06, + "loss": 0.0648, + "step": 28484 + }, + { + "epoch": 0.56972, + "grad_norm": 0.02139720879495144, + "learning_rate": 9.312185629410164e-06, + "loss": 0.0004, + "step": 28486 + }, + { + "epoch": 0.56976, + "grad_norm": 0.0072884089313447475, + "learning_rate": 9.310792679414828e-06, + "loss": 0.0583, + "step": 28488 + }, + { + "epoch": 0.5698, + "grad_norm": 12.204236030578613, + "learning_rate": 9.309399742855943e-06, + "loss": 0.6913, + "step": 28490 + }, + { + "epoch": 0.56984, + "grad_norm": 0.006829811725765467, + "learning_rate": 9.308006819760666e-06, + "loss": 0.015, + "step": 28492 + }, + { + "epoch": 0.56988, + "grad_norm": 0.0013014940777793527, + "learning_rate": 9.306613910156154e-06, + "loss": 0.0024, + "step": 28494 + }, + { + "epoch": 0.56992, + "grad_norm": 0.39925721287727356, + "learning_rate": 9.30522101406956e-06, + "loss": 0.0583, + "step": 28496 + }, + { + "epoch": 0.56996, + "grad_norm": 0.26210832595825195, + "learning_rate": 9.303828131528042e-06, + "loss": 0.5112, + "step": 28498 + }, + { + "epoch": 0.57, + "grad_norm": 12.84286117553711, + "learning_rate": 9.302435262558748e-06, + "loss": 0.3219, + "step": 28500 + }, + { + "epoch": 0.57004, + "grad_norm": 0.0119844451546669, + "learning_rate": 9.301042407188842e-06, + "loss": 0.0717, + "step": 28502 + }, + { + "epoch": 0.57008, + "grad_norm": 1.0213990211486816, + "learning_rate": 9.299649565445474e-06, + "loss": 0.0218, + "step": 28504 + }, + { + "epoch": 0.57012, + "grad_norm": 0.032424211502075195, + "learning_rate": 9.298256737355792e-06, + "loss": 0.0006, + "step": 28506 + }, + { + "epoch": 0.57016, + "grad_norm": 1.0209506750106812, + "learning_rate": 9.296863922946963e-06, + "loss": 0.0139, + "step": 28508 + }, + { + "epoch": 0.5702, + "grad_norm": 0.09637520462274551, + "learning_rate": 9.295471122246131e-06, + "loss": 0.0042, + "step": 28510 + }, + { + "epoch": 0.57024, + "grad_norm": 0.18658089637756348, + "learning_rate": 9.294078335280455e-06, + "loss": 0.0088, + "step": 28512 + }, + { + "epoch": 0.57028, + "grad_norm": 1.3136833906173706, + "learning_rate": 9.292685562077085e-06, + "loss": 0.0452, + "step": 28514 + }, + { + "epoch": 0.57032, + "grad_norm": 2.4591000080108643, + "learning_rate": 9.29129280266317e-06, + "loss": 0.0427, + "step": 28516 + }, + { + "epoch": 0.57036, + "grad_norm": 0.13084082305431366, + "learning_rate": 9.28990005706587e-06, + "loss": 0.026, + "step": 28518 + }, + { + "epoch": 0.5704, + "grad_norm": 2.5678250789642334, + "learning_rate": 9.288507325312334e-06, + "loss": 0.0487, + "step": 28520 + }, + { + "epoch": 0.57044, + "grad_norm": 0.042868562042713165, + "learning_rate": 9.28711460742971e-06, + "loss": 0.003, + "step": 28522 + }, + { + "epoch": 0.57048, + "grad_norm": 0.08183278143405914, + "learning_rate": 9.28572190344516e-06, + "loss": 0.0073, + "step": 28524 + }, + { + "epoch": 0.57052, + "grad_norm": 2.136808395385742, + "learning_rate": 9.284329213385824e-06, + "loss": 0.1324, + "step": 28526 + }, + { + "epoch": 0.57056, + "grad_norm": 0.03242453560233116, + "learning_rate": 9.28293653727886e-06, + "loss": 0.0009, + "step": 28528 + }, + { + "epoch": 0.5706, + "grad_norm": 0.10139863938093185, + "learning_rate": 9.281543875151419e-06, + "loss": 0.007, + "step": 28530 + }, + { + "epoch": 0.57064, + "grad_norm": 0.052191849797964096, + "learning_rate": 9.280151227030648e-06, + "loss": 0.0306, + "step": 28532 + }, + { + "epoch": 0.57068, + "grad_norm": 0.18590615689754486, + "learning_rate": 9.278758592943702e-06, + "loss": 0.2058, + "step": 28534 + }, + { + "epoch": 0.57072, + "grad_norm": 2.021144390106201, + "learning_rate": 9.277365972917725e-06, + "loss": 0.0292, + "step": 28536 + }, + { + "epoch": 0.57076, + "grad_norm": 0.04875257983803749, + "learning_rate": 9.275973366979874e-06, + "loss": 0.0086, + "step": 28538 + }, + { + "epoch": 0.5708, + "grad_norm": 0.23222970962524414, + "learning_rate": 9.274580775157294e-06, + "loss": 0.0412, + "step": 28540 + }, + { + "epoch": 0.57084, + "grad_norm": 0.08104540407657623, + "learning_rate": 9.273188197477131e-06, + "loss": 0.006, + "step": 28542 + }, + { + "epoch": 0.57088, + "grad_norm": 2.9331178665161133, + "learning_rate": 9.271795633966544e-06, + "loss": 0.0475, + "step": 28544 + }, + { + "epoch": 0.57092, + "grad_norm": 7.467319011688232, + "learning_rate": 9.270403084652674e-06, + "loss": 0.1287, + "step": 28546 + }, + { + "epoch": 0.57096, + "grad_norm": 9.926913261413574, + "learning_rate": 9.269010549562672e-06, + "loss": 0.285, + "step": 28548 + }, + { + "epoch": 0.571, + "grad_norm": 4.595130920410156, + "learning_rate": 9.267618028723687e-06, + "loss": 0.0725, + "step": 28550 + }, + { + "epoch": 0.57104, + "grad_norm": 0.014695197343826294, + "learning_rate": 9.26622552216286e-06, + "loss": 0.0005, + "step": 28552 + }, + { + "epoch": 0.57108, + "grad_norm": 0.08807073533535004, + "learning_rate": 9.26483302990735e-06, + "loss": 0.0017, + "step": 28554 + }, + { + "epoch": 0.57112, + "grad_norm": 0.23694035410881042, + "learning_rate": 9.263440551984297e-06, + "loss": 0.0096, + "step": 28556 + }, + { + "epoch": 0.57116, + "grad_norm": 0.05271292105317116, + "learning_rate": 9.262048088420844e-06, + "loss": 0.1385, + "step": 28558 + }, + { + "epoch": 0.5712, + "grad_norm": 0.13322323560714722, + "learning_rate": 9.260655639244152e-06, + "loss": 0.0187, + "step": 28560 + }, + { + "epoch": 0.57124, + "grad_norm": 0.38606879115104675, + "learning_rate": 9.259263204481356e-06, + "loss": 0.0433, + "step": 28562 + }, + { + "epoch": 0.57128, + "grad_norm": 0.026042243465781212, + "learning_rate": 9.257870784159604e-06, + "loss": 0.0022, + "step": 28564 + }, + { + "epoch": 0.57132, + "grad_norm": 0.9154963493347168, + "learning_rate": 9.256478378306046e-06, + "loss": 0.0118, + "step": 28566 + }, + { + "epoch": 0.57136, + "grad_norm": 0.15726254880428314, + "learning_rate": 9.255085986947817e-06, + "loss": 0.002, + "step": 28568 + }, + { + "epoch": 0.5714, + "grad_norm": 0.5192418098449707, + "learning_rate": 9.253693610112079e-06, + "loss": 0.0196, + "step": 28570 + }, + { + "epoch": 0.57144, + "grad_norm": 0.12463173270225525, + "learning_rate": 9.252301247825966e-06, + "loss": 0.0075, + "step": 28572 + }, + { + "epoch": 0.57148, + "grad_norm": 0.11380500346422195, + "learning_rate": 9.250908900116622e-06, + "loss": 0.0036, + "step": 28574 + }, + { + "epoch": 0.57152, + "grad_norm": 0.06668450683355331, + "learning_rate": 9.249516567011198e-06, + "loss": 0.0025, + "step": 28576 + }, + { + "epoch": 0.57156, + "grad_norm": 0.8035642504692078, + "learning_rate": 9.248124248536835e-06, + "loss": 0.0094, + "step": 28578 + }, + { + "epoch": 0.5716, + "grad_norm": 0.9614832997322083, + "learning_rate": 9.246731944720675e-06, + "loss": 0.2495, + "step": 28580 + }, + { + "epoch": 0.57164, + "grad_norm": 6.656185150146484, + "learning_rate": 9.245339655589865e-06, + "loss": 0.1503, + "step": 28582 + }, + { + "epoch": 0.57168, + "grad_norm": 1.7503300905227661, + "learning_rate": 9.243947381171543e-06, + "loss": 0.1738, + "step": 28584 + }, + { + "epoch": 0.57172, + "grad_norm": 1.0973345041275024, + "learning_rate": 9.24255512149286e-06, + "loss": 0.0387, + "step": 28586 + }, + { + "epoch": 0.57176, + "grad_norm": 0.13803432881832123, + "learning_rate": 9.241162876580956e-06, + "loss": 0.004, + "step": 28588 + }, + { + "epoch": 0.5718, + "grad_norm": 0.0016592812025919557, + "learning_rate": 9.239770646462968e-06, + "loss": 0.0001, + "step": 28590 + }, + { + "epoch": 0.57184, + "grad_norm": 0.038178086280822754, + "learning_rate": 9.238378431166046e-06, + "loss": 0.0381, + "step": 28592 + }, + { + "epoch": 0.57188, + "grad_norm": 0.7120166420936584, + "learning_rate": 9.23698623071733e-06, + "loss": 0.1851, + "step": 28594 + }, + { + "epoch": 0.57192, + "grad_norm": 21.966787338256836, + "learning_rate": 9.235594045143955e-06, + "loss": 0.7618, + "step": 28596 + }, + { + "epoch": 0.57196, + "grad_norm": 0.01999671757221222, + "learning_rate": 9.23420187447307e-06, + "loss": 0.0059, + "step": 28598 + }, + { + "epoch": 0.572, + "grad_norm": 2.399156093597412, + "learning_rate": 9.232809718731815e-06, + "loss": 0.0279, + "step": 28600 + }, + { + "epoch": 0.57204, + "grad_norm": 0.9010326266288757, + "learning_rate": 9.231417577947328e-06, + "loss": 0.012, + "step": 28602 + }, + { + "epoch": 0.57208, + "grad_norm": 3.9018237590789795, + "learning_rate": 9.230025452146753e-06, + "loss": 0.3251, + "step": 28604 + }, + { + "epoch": 0.57212, + "grad_norm": 0.17444856464862823, + "learning_rate": 9.228633341357224e-06, + "loss": 0.0023, + "step": 28606 + }, + { + "epoch": 0.57216, + "grad_norm": 0.43566420674324036, + "learning_rate": 9.22724124560589e-06, + "loss": 0.0106, + "step": 28608 + }, + { + "epoch": 0.5722, + "grad_norm": 9.823724746704102, + "learning_rate": 9.225849164919886e-06, + "loss": 0.5889, + "step": 28610 + }, + { + "epoch": 0.57224, + "grad_norm": 0.017586462199687958, + "learning_rate": 9.224457099326346e-06, + "loss": 0.017, + "step": 28612 + }, + { + "epoch": 0.57228, + "grad_norm": 0.02451055310666561, + "learning_rate": 9.22306504885242e-06, + "loss": 0.0136, + "step": 28614 + }, + { + "epoch": 0.57232, + "grad_norm": 0.06768772751092911, + "learning_rate": 9.221673013525235e-06, + "loss": 0.005, + "step": 28616 + }, + { + "epoch": 0.57236, + "grad_norm": 1.585445761680603, + "learning_rate": 9.22028099337194e-06, + "loss": 0.0191, + "step": 28618 + }, + { + "epoch": 0.5724, + "grad_norm": 0.03676792234182358, + "learning_rate": 9.218888988419668e-06, + "loss": 0.0796, + "step": 28620 + }, + { + "epoch": 0.57244, + "grad_norm": 0.23804523050785065, + "learning_rate": 9.217496998695554e-06, + "loss": 0.0035, + "step": 28622 + }, + { + "epoch": 0.57248, + "grad_norm": 1.369307041168213, + "learning_rate": 9.216105024226742e-06, + "loss": 0.0316, + "step": 28624 + }, + { + "epoch": 0.57252, + "grad_norm": 0.39868152141571045, + "learning_rate": 9.214713065040366e-06, + "loss": 0.1106, + "step": 28626 + }, + { + "epoch": 0.57256, + "grad_norm": 0.675058901309967, + "learning_rate": 9.213321121163558e-06, + "loss": 0.0112, + "step": 28628 + }, + { + "epoch": 0.5726, + "grad_norm": 0.21677647531032562, + "learning_rate": 9.211929192623466e-06, + "loss": 0.0099, + "step": 28630 + }, + { + "epoch": 0.57264, + "grad_norm": 0.5883644223213196, + "learning_rate": 9.210537279447217e-06, + "loss": 0.0097, + "step": 28632 + }, + { + "epoch": 0.57268, + "grad_norm": 0.41028064489364624, + "learning_rate": 9.209145381661953e-06, + "loss": 0.0088, + "step": 28634 + }, + { + "epoch": 0.57272, + "grad_norm": 0.2951124310493469, + "learning_rate": 9.207753499294807e-06, + "loss": 0.0079, + "step": 28636 + }, + { + "epoch": 0.57276, + "grad_norm": 1.5279455184936523, + "learning_rate": 9.206361632372908e-06, + "loss": 0.0187, + "step": 28638 + }, + { + "epoch": 0.5728, + "grad_norm": 3.789426803588867, + "learning_rate": 9.204969780923404e-06, + "loss": 0.0646, + "step": 28640 + }, + { + "epoch": 0.57284, + "grad_norm": 0.024475565180182457, + "learning_rate": 9.203577944973419e-06, + "loss": 0.0022, + "step": 28642 + }, + { + "epoch": 0.57288, + "grad_norm": 0.1534118503332138, + "learning_rate": 9.202186124550096e-06, + "loss": 0.0138, + "step": 28644 + }, + { + "epoch": 0.57292, + "grad_norm": 0.03606635332107544, + "learning_rate": 9.200794319680565e-06, + "loss": 0.0055, + "step": 28646 + }, + { + "epoch": 0.57296, + "grad_norm": 11.46073055267334, + "learning_rate": 9.199402530391958e-06, + "loss": 0.3208, + "step": 28648 + }, + { + "epoch": 0.573, + "grad_norm": 2.587067127227783, + "learning_rate": 9.198010756711413e-06, + "loss": 0.0431, + "step": 28650 + }, + { + "epoch": 0.57304, + "grad_norm": 12.251541137695312, + "learning_rate": 9.19661899866606e-06, + "loss": 0.3005, + "step": 28652 + }, + { + "epoch": 0.57308, + "grad_norm": 0.11833800375461578, + "learning_rate": 9.195227256283032e-06, + "loss": 0.0094, + "step": 28654 + }, + { + "epoch": 0.57312, + "grad_norm": 0.2416994720697403, + "learning_rate": 9.193835529589465e-06, + "loss": 0.0031, + "step": 28656 + }, + { + "epoch": 0.57316, + "grad_norm": 6.357607364654541, + "learning_rate": 9.192443818612486e-06, + "loss": 0.1324, + "step": 28658 + }, + { + "epoch": 0.5732, + "grad_norm": 0.08492617309093475, + "learning_rate": 9.191052123379234e-06, + "loss": 0.0014, + "step": 28660 + }, + { + "epoch": 0.57324, + "grad_norm": 19.467220306396484, + "learning_rate": 9.189660443916839e-06, + "loss": 0.4692, + "step": 28662 + }, + { + "epoch": 0.57328, + "grad_norm": 0.252763032913208, + "learning_rate": 9.188268780252426e-06, + "loss": 0.1, + "step": 28664 + }, + { + "epoch": 0.57332, + "grad_norm": 0.6328834891319275, + "learning_rate": 9.186877132413134e-06, + "loss": 0.1053, + "step": 28666 + }, + { + "epoch": 0.57336, + "grad_norm": 0.28794991970062256, + "learning_rate": 9.18548550042609e-06, + "loss": 0.0152, + "step": 28668 + }, + { + "epoch": 0.5734, + "grad_norm": 0.055643852800130844, + "learning_rate": 9.184093884318426e-06, + "loss": 0.0008, + "step": 28670 + }, + { + "epoch": 0.57344, + "grad_norm": 0.35371121764183044, + "learning_rate": 9.182702284117273e-06, + "loss": 0.0065, + "step": 28672 + }, + { + "epoch": 0.57348, + "grad_norm": 1.124647617340088, + "learning_rate": 9.181310699849755e-06, + "loss": 0.0764, + "step": 28674 + }, + { + "epoch": 0.57352, + "grad_norm": 0.35822394490242004, + "learning_rate": 9.17991913154301e-06, + "loss": 0.0084, + "step": 28676 + }, + { + "epoch": 0.57356, + "grad_norm": 0.08139649033546448, + "learning_rate": 9.178527579224166e-06, + "loss": 0.0163, + "step": 28678 + }, + { + "epoch": 0.5736, + "grad_norm": 0.29279518127441406, + "learning_rate": 9.177136042920344e-06, + "loss": 0.005, + "step": 28680 + }, + { + "epoch": 0.57364, + "grad_norm": 3.330169439315796, + "learning_rate": 9.175744522658683e-06, + "loss": 0.0558, + "step": 28682 + }, + { + "epoch": 0.57368, + "grad_norm": 0.0739416629076004, + "learning_rate": 9.174353018466307e-06, + "loss": 0.0017, + "step": 28684 + }, + { + "epoch": 0.57372, + "grad_norm": 0.26596373319625854, + "learning_rate": 9.17296153037034e-06, + "loss": 0.0089, + "step": 28686 + }, + { + "epoch": 0.57376, + "grad_norm": 0.0252067893743515, + "learning_rate": 9.171570058397919e-06, + "loss": 0.0016, + "step": 28688 + }, + { + "epoch": 0.5738, + "grad_norm": 0.28486377000808716, + "learning_rate": 9.170178602576161e-06, + "loss": 0.0071, + "step": 28690 + }, + { + "epoch": 0.57384, + "grad_norm": 0.027166252955794334, + "learning_rate": 9.1687871629322e-06, + "loss": 0.001, + "step": 28692 + }, + { + "epoch": 0.57388, + "grad_norm": 0.3525671064853668, + "learning_rate": 9.167395739493164e-06, + "loss": 0.0061, + "step": 28694 + }, + { + "epoch": 0.57392, + "grad_norm": 0.02544800564646721, + "learning_rate": 9.166004332286173e-06, + "loss": 0.355, + "step": 28696 + }, + { + "epoch": 0.57396, + "grad_norm": 0.054171372205019, + "learning_rate": 9.16461294133836e-06, + "loss": 0.0196, + "step": 28698 + }, + { + "epoch": 0.574, + "grad_norm": 5.279478549957275, + "learning_rate": 9.163221566676847e-06, + "loss": 0.0939, + "step": 28700 + }, + { + "epoch": 0.57404, + "grad_norm": 1.7585911750793457, + "learning_rate": 9.16183020832876e-06, + "loss": 0.0341, + "step": 28702 + }, + { + "epoch": 0.57408, + "grad_norm": 0.06051582470536232, + "learning_rate": 9.160438866321226e-06, + "loss": 0.0309, + "step": 28704 + }, + { + "epoch": 0.57412, + "grad_norm": 0.00640004500746727, + "learning_rate": 9.159047540681363e-06, + "loss": 0.0009, + "step": 28706 + }, + { + "epoch": 0.57416, + "grad_norm": 0.008407481014728546, + "learning_rate": 9.157656231436306e-06, + "loss": 0.0169, + "step": 28708 + }, + { + "epoch": 0.5742, + "grad_norm": 0.0213144663721323, + "learning_rate": 9.156264938613176e-06, + "loss": 0.0053, + "step": 28710 + }, + { + "epoch": 0.57424, + "grad_norm": 14.045548439025879, + "learning_rate": 9.15487366223909e-06, + "loss": 0.401, + "step": 28712 + }, + { + "epoch": 0.57428, + "grad_norm": 0.017843466252088547, + "learning_rate": 9.15348240234118e-06, + "loss": 0.0003, + "step": 28714 + }, + { + "epoch": 0.57432, + "grad_norm": 0.008490671403706074, + "learning_rate": 9.152091158946568e-06, + "loss": 0.002, + "step": 28716 + }, + { + "epoch": 0.57436, + "grad_norm": 2.486069679260254, + "learning_rate": 9.150699932082374e-06, + "loss": 0.0421, + "step": 28718 + }, + { + "epoch": 0.5744, + "grad_norm": 0.002049290807917714, + "learning_rate": 9.14930872177572e-06, + "loss": 0.0019, + "step": 28720 + }, + { + "epoch": 0.57444, + "grad_norm": 0.48116764426231384, + "learning_rate": 9.147917528053732e-06, + "loss": 0.0067, + "step": 28722 + }, + { + "epoch": 0.57448, + "grad_norm": 0.0756707638502121, + "learning_rate": 9.146526350943532e-06, + "loss": 0.1164, + "step": 28724 + }, + { + "epoch": 0.57452, + "grad_norm": 5.109095096588135, + "learning_rate": 9.14513519047224e-06, + "loss": 0.1296, + "step": 28726 + }, + { + "epoch": 0.57456, + "grad_norm": 0.36388689279556274, + "learning_rate": 9.143744046666974e-06, + "loss": 0.3794, + "step": 28728 + }, + { + "epoch": 0.5746, + "grad_norm": 14.177210807800293, + "learning_rate": 9.142352919554862e-06, + "loss": 0.9408, + "step": 28730 + }, + { + "epoch": 0.57464, + "grad_norm": 0.13333897292613983, + "learning_rate": 9.140961809163022e-06, + "loss": 0.0085, + "step": 28732 + }, + { + "epoch": 0.57468, + "grad_norm": 0.004771328531205654, + "learning_rate": 9.139570715518569e-06, + "loss": 0.1056, + "step": 28734 + }, + { + "epoch": 0.57472, + "grad_norm": 0.2678373157978058, + "learning_rate": 9.138179638648633e-06, + "loss": 0.0617, + "step": 28736 + }, + { + "epoch": 0.57476, + "grad_norm": 0.1290665566921234, + "learning_rate": 9.136788578580326e-06, + "loss": 0.0025, + "step": 28738 + }, + { + "epoch": 0.5748, + "grad_norm": 0.008556300774216652, + "learning_rate": 9.135397535340773e-06, + "loss": 0.0133, + "step": 28740 + }, + { + "epoch": 0.57484, + "grad_norm": 0.037612173706293106, + "learning_rate": 9.13400650895709e-06, + "loss": 0.0016, + "step": 28742 + }, + { + "epoch": 0.57488, + "grad_norm": 1.0267890691757202, + "learning_rate": 9.13261549945639e-06, + "loss": 0.0174, + "step": 28744 + }, + { + "epoch": 0.57492, + "grad_norm": 0.13473203778266907, + "learning_rate": 9.131224506865804e-06, + "loss": 0.003, + "step": 28746 + }, + { + "epoch": 0.57496, + "grad_norm": 1.0428699254989624, + "learning_rate": 9.129833531212436e-06, + "loss": 0.0191, + "step": 28748 + }, + { + "epoch": 0.575, + "grad_norm": 0.06670401245355606, + "learning_rate": 9.128442572523418e-06, + "loss": 0.0075, + "step": 28750 + }, + { + "epoch": 0.57504, + "grad_norm": 0.19051960110664368, + "learning_rate": 9.12705163082586e-06, + "loss": 0.0037, + "step": 28752 + }, + { + "epoch": 0.57508, + "grad_norm": 6.542418003082275, + "learning_rate": 9.125660706146879e-06, + "loss": 0.1406, + "step": 28754 + }, + { + "epoch": 0.57512, + "grad_norm": 0.4215536117553711, + "learning_rate": 9.124269798513594e-06, + "loss": 0.3091, + "step": 28756 + }, + { + "epoch": 0.57516, + "grad_norm": 0.5113170742988586, + "learning_rate": 9.12287890795312e-06, + "loss": 0.0092, + "step": 28758 + }, + { + "epoch": 0.5752, + "grad_norm": 9.531103134155273, + "learning_rate": 9.121488034492569e-06, + "loss": 0.7757, + "step": 28760 + }, + { + "epoch": 0.57524, + "grad_norm": 0.09500984102487564, + "learning_rate": 9.120097178159065e-06, + "loss": 0.0069, + "step": 28762 + }, + { + "epoch": 0.57528, + "grad_norm": 0.018233561888337135, + "learning_rate": 9.118706338979715e-06, + "loss": 0.0011, + "step": 28764 + }, + { + "epoch": 0.57532, + "grad_norm": 0.02588677778840065, + "learning_rate": 9.117315516981644e-06, + "loss": 0.0049, + "step": 28766 + }, + { + "epoch": 0.57536, + "grad_norm": 0.9714416265487671, + "learning_rate": 9.11592471219196e-06, + "loss": 0.0206, + "step": 28768 + }, + { + "epoch": 0.5754, + "grad_norm": 4.182343482971191, + "learning_rate": 9.114533924637778e-06, + "loss": 0.4791, + "step": 28770 + }, + { + "epoch": 0.57544, + "grad_norm": 0.22992634773254395, + "learning_rate": 9.113143154346215e-06, + "loss": 0.0107, + "step": 28772 + }, + { + "epoch": 0.57548, + "grad_norm": 2.077782392501831, + "learning_rate": 9.111752401344383e-06, + "loss": 0.0544, + "step": 28774 + }, + { + "epoch": 0.57552, + "grad_norm": 0.10085693746805191, + "learning_rate": 9.11036166565939e-06, + "loss": 0.0046, + "step": 28776 + }, + { + "epoch": 0.57556, + "grad_norm": 0.3292076885700226, + "learning_rate": 9.10897094731836e-06, + "loss": 0.0065, + "step": 28778 + }, + { + "epoch": 0.5756, + "grad_norm": 2.292893409729004, + "learning_rate": 9.107580246348395e-06, + "loss": 0.0654, + "step": 28780 + }, + { + "epoch": 0.57564, + "grad_norm": 0.007027735933661461, + "learning_rate": 9.106189562776618e-06, + "loss": 0.0058, + "step": 28782 + }, + { + "epoch": 0.57568, + "grad_norm": 7.190089225769043, + "learning_rate": 9.104798896630134e-06, + "loss": 0.2679, + "step": 28784 + }, + { + "epoch": 0.57572, + "grad_norm": 0.1812330186367035, + "learning_rate": 9.103408247936054e-06, + "loss": 0.1299, + "step": 28786 + }, + { + "epoch": 0.57576, + "grad_norm": 0.2787942886352539, + "learning_rate": 9.102017616721494e-06, + "loss": 0.0074, + "step": 28788 + }, + { + "epoch": 0.5758, + "grad_norm": 0.3302738070487976, + "learning_rate": 9.100627003013563e-06, + "loss": 0.0099, + "step": 28790 + }, + { + "epoch": 0.57584, + "grad_norm": 0.25595706701278687, + "learning_rate": 9.099236406839368e-06, + "loss": 0.0084, + "step": 28792 + }, + { + "epoch": 0.57588, + "grad_norm": 3.657064914703369, + "learning_rate": 9.097845828226028e-06, + "loss": 0.0633, + "step": 28794 + }, + { + "epoch": 0.57592, + "grad_norm": 0.7761194109916687, + "learning_rate": 9.096455267200643e-06, + "loss": 0.1908, + "step": 28796 + }, + { + "epoch": 0.57596, + "grad_norm": 0.46186670660972595, + "learning_rate": 9.095064723790333e-06, + "loss": 0.058, + "step": 28798 + }, + { + "epoch": 0.576, + "grad_norm": 0.5022237300872803, + "learning_rate": 9.093674198022201e-06, + "loss": 0.012, + "step": 28800 + }, + { + "epoch": 0.57604, + "grad_norm": 0.2178589552640915, + "learning_rate": 9.092283689923353e-06, + "loss": 0.0111, + "step": 28802 + }, + { + "epoch": 0.57608, + "grad_norm": 0.3388141095638275, + "learning_rate": 9.090893199520907e-06, + "loss": 0.0073, + "step": 28804 + }, + { + "epoch": 0.57612, + "grad_norm": 0.08790047466754913, + "learning_rate": 9.089502726841963e-06, + "loss": 0.0049, + "step": 28806 + }, + { + "epoch": 0.57616, + "grad_norm": 0.08198133856058121, + "learning_rate": 9.088112271913632e-06, + "loss": 0.0037, + "step": 28808 + }, + { + "epoch": 0.5762, + "grad_norm": 2.7692043781280518, + "learning_rate": 9.086721834763024e-06, + "loss": 0.0728, + "step": 28810 + }, + { + "epoch": 0.57624, + "grad_norm": 0.13684777915477753, + "learning_rate": 9.08533141541724e-06, + "loss": 0.0263, + "step": 28812 + }, + { + "epoch": 0.57628, + "grad_norm": 0.16033555567264557, + "learning_rate": 9.083941013903395e-06, + "loss": 0.0119, + "step": 28814 + }, + { + "epoch": 0.57632, + "grad_norm": 0.9644231200218201, + "learning_rate": 9.082550630248591e-06, + "loss": 0.1567, + "step": 28816 + }, + { + "epoch": 0.57636, + "grad_norm": 0.5333106517791748, + "learning_rate": 9.081160264479932e-06, + "loss": 0.0231, + "step": 28818 + }, + { + "epoch": 0.5764, + "grad_norm": 1.8151469230651855, + "learning_rate": 9.07976991662453e-06, + "loss": 0.0307, + "step": 28820 + }, + { + "epoch": 0.57644, + "grad_norm": 2.0032291412353516, + "learning_rate": 9.078379586709487e-06, + "loss": 0.0982, + "step": 28822 + }, + { + "epoch": 0.57648, + "grad_norm": 7.046091556549072, + "learning_rate": 9.076989274761906e-06, + "loss": 0.1548, + "step": 28824 + }, + { + "epoch": 0.57652, + "grad_norm": 0.1422433704137802, + "learning_rate": 9.075598980808897e-06, + "loss": 0.0088, + "step": 28826 + }, + { + "epoch": 0.57656, + "grad_norm": 0.27753421664237976, + "learning_rate": 9.074208704877558e-06, + "loss": 0.0067, + "step": 28828 + }, + { + "epoch": 0.5766, + "grad_norm": 0.8514907956123352, + "learning_rate": 9.072818446995e-06, + "loss": 0.0488, + "step": 28830 + }, + { + "epoch": 0.57664, + "grad_norm": 0.526959240436554, + "learning_rate": 9.071428207188324e-06, + "loss": 0.0113, + "step": 28832 + }, + { + "epoch": 0.57668, + "grad_norm": 0.2176792174577713, + "learning_rate": 9.07003798548463e-06, + "loss": 0.0043, + "step": 28834 + }, + { + "epoch": 0.57672, + "grad_norm": 11.484294891357422, + "learning_rate": 9.068647781911028e-06, + "loss": 0.45, + "step": 28836 + }, + { + "epoch": 0.57676, + "grad_norm": 0.16974198818206787, + "learning_rate": 9.067257596494616e-06, + "loss": 0.0364, + "step": 28838 + }, + { + "epoch": 0.5768, + "grad_norm": 0.923835813999176, + "learning_rate": 9.065867429262497e-06, + "loss": 0.2512, + "step": 28840 + }, + { + "epoch": 0.57684, + "grad_norm": 0.10757440328598022, + "learning_rate": 9.064477280241774e-06, + "loss": 0.0038, + "step": 28842 + }, + { + "epoch": 0.57688, + "grad_norm": 0.13244915008544922, + "learning_rate": 9.063087149459544e-06, + "loss": 0.6107, + "step": 28844 + }, + { + "epoch": 0.57692, + "grad_norm": 1.5840784311294556, + "learning_rate": 9.061697036942917e-06, + "loss": 0.0382, + "step": 28846 + }, + { + "epoch": 0.57696, + "grad_norm": 0.044432371854782104, + "learning_rate": 9.06030694271899e-06, + "loss": 0.0022, + "step": 28848 + }, + { + "epoch": 0.577, + "grad_norm": 0.1322735995054245, + "learning_rate": 9.058916866814857e-06, + "loss": 0.0072, + "step": 28850 + }, + { + "epoch": 0.57704, + "grad_norm": 1.570481300354004, + "learning_rate": 9.05752680925763e-06, + "loss": 0.0354, + "step": 28852 + }, + { + "epoch": 0.57708, + "grad_norm": 0.46304380893707275, + "learning_rate": 9.0561367700744e-06, + "loss": 0.0409, + "step": 28854 + }, + { + "epoch": 0.57712, + "grad_norm": 0.5993260145187378, + "learning_rate": 9.054746749292271e-06, + "loss": 0.0136, + "step": 28856 + }, + { + "epoch": 0.57716, + "grad_norm": 0.5935038328170776, + "learning_rate": 9.05335674693834e-06, + "loss": 0.0113, + "step": 28858 + }, + { + "epoch": 0.5772, + "grad_norm": 5.2791972160339355, + "learning_rate": 9.051966763039706e-06, + "loss": 0.1328, + "step": 28860 + }, + { + "epoch": 0.57724, + "grad_norm": 0.8299260139465332, + "learning_rate": 9.050576797623471e-06, + "loss": 0.0416, + "step": 28862 + }, + { + "epoch": 0.57728, + "grad_norm": 0.1760714203119278, + "learning_rate": 9.049186850716729e-06, + "loss": 0.0052, + "step": 28864 + }, + { + "epoch": 0.57732, + "grad_norm": 1.6594963073730469, + "learning_rate": 9.047796922346576e-06, + "loss": 0.0275, + "step": 28866 + }, + { + "epoch": 0.57736, + "grad_norm": 0.24639929831027985, + "learning_rate": 9.046407012540116e-06, + "loss": 0.0071, + "step": 28868 + }, + { + "epoch": 0.5774, + "grad_norm": 0.22968684136867523, + "learning_rate": 9.045017121324438e-06, + "loss": 0.0072, + "step": 28870 + }, + { + "epoch": 0.57744, + "grad_norm": 1.1718319654464722, + "learning_rate": 9.043627248726646e-06, + "loss": 0.0125, + "step": 28872 + }, + { + "epoch": 0.57748, + "grad_norm": 0.3512219190597534, + "learning_rate": 9.042237394773834e-06, + "loss": 0.0128, + "step": 28874 + }, + { + "epoch": 0.57752, + "grad_norm": 0.0596662312746048, + "learning_rate": 9.040847559493094e-06, + "loss": 0.0431, + "step": 28876 + }, + { + "epoch": 0.57756, + "grad_norm": 0.9375609755516052, + "learning_rate": 9.039457742911528e-06, + "loss": 0.0838, + "step": 28878 + }, + { + "epoch": 0.5776, + "grad_norm": 0.10480885207653046, + "learning_rate": 9.038067945056229e-06, + "loss": 0.0023, + "step": 28880 + }, + { + "epoch": 0.57764, + "grad_norm": 0.09894931316375732, + "learning_rate": 9.036678165954284e-06, + "loss": 0.0036, + "step": 28882 + }, + { + "epoch": 0.57768, + "grad_norm": 0.22727520763874054, + "learning_rate": 9.035288405632798e-06, + "loss": 0.0452, + "step": 28884 + }, + { + "epoch": 0.57772, + "grad_norm": 0.13012072443962097, + "learning_rate": 9.033898664118858e-06, + "loss": 0.003, + "step": 28886 + }, + { + "epoch": 0.57776, + "grad_norm": 0.02951199747622013, + "learning_rate": 9.032508941439566e-06, + "loss": 0.0137, + "step": 28888 + }, + { + "epoch": 0.5778, + "grad_norm": 0.39260903000831604, + "learning_rate": 9.031119237622011e-06, + "loss": 0.0074, + "step": 28890 + }, + { + "epoch": 0.57784, + "grad_norm": 0.6972404718399048, + "learning_rate": 9.029729552693281e-06, + "loss": 0.0142, + "step": 28892 + }, + { + "epoch": 0.57788, + "grad_norm": 2.7768046855926514, + "learning_rate": 9.028339886680475e-06, + "loss": 0.0475, + "step": 28894 + }, + { + "epoch": 0.57792, + "grad_norm": 0.1589418351650238, + "learning_rate": 9.026950239610684e-06, + "loss": 0.0051, + "step": 28896 + }, + { + "epoch": 0.57796, + "grad_norm": 0.37328195571899414, + "learning_rate": 9.025560611510995e-06, + "loss": 0.0067, + "step": 28898 + }, + { + "epoch": 0.578, + "grad_norm": 0.05882475525140762, + "learning_rate": 9.024171002408507e-06, + "loss": 0.0012, + "step": 28900 + }, + { + "epoch": 0.57804, + "grad_norm": 9.362600326538086, + "learning_rate": 9.022781412330305e-06, + "loss": 0.2693, + "step": 28902 + }, + { + "epoch": 0.57808, + "grad_norm": 1.2293438911437988, + "learning_rate": 9.021391841303484e-06, + "loss": 0.0241, + "step": 28904 + }, + { + "epoch": 0.57812, + "grad_norm": 0.9405255317687988, + "learning_rate": 9.020002289355136e-06, + "loss": 0.0172, + "step": 28906 + }, + { + "epoch": 0.57816, + "grad_norm": 1.8404614925384521, + "learning_rate": 9.018612756512344e-06, + "loss": 0.035, + "step": 28908 + }, + { + "epoch": 0.5782, + "grad_norm": 1.6309444904327393, + "learning_rate": 9.017223242802205e-06, + "loss": 0.0354, + "step": 28910 + }, + { + "epoch": 0.57824, + "grad_norm": 0.051668670028448105, + "learning_rate": 9.015833748251804e-06, + "loss": 0.1767, + "step": 28912 + }, + { + "epoch": 0.57828, + "grad_norm": 0.0882217288017273, + "learning_rate": 9.014444272888225e-06, + "loss": 0.2203, + "step": 28914 + }, + { + "epoch": 0.57832, + "grad_norm": 0.022572193294763565, + "learning_rate": 9.01305481673857e-06, + "loss": 0.0097, + "step": 28916 + }, + { + "epoch": 0.57836, + "grad_norm": 0.1760086864233017, + "learning_rate": 9.011665379829912e-06, + "loss": 0.0165, + "step": 28918 + }, + { + "epoch": 0.5784, + "grad_norm": 0.22773176431655884, + "learning_rate": 9.01027596218935e-06, + "loss": 0.0098, + "step": 28920 + }, + { + "epoch": 0.57844, + "grad_norm": 1.500815987586975, + "learning_rate": 9.00888656384397e-06, + "loss": 0.0489, + "step": 28922 + }, + { + "epoch": 0.57848, + "grad_norm": 0.05083772912621498, + "learning_rate": 9.007497184820854e-06, + "loss": 0.0477, + "step": 28924 + }, + { + "epoch": 0.57852, + "grad_norm": 1.0453532934188843, + "learning_rate": 9.00610782514709e-06, + "loss": 0.0304, + "step": 28926 + }, + { + "epoch": 0.57856, + "grad_norm": 0.11185531318187714, + "learning_rate": 9.00471848484977e-06, + "loss": 0.0031, + "step": 28928 + }, + { + "epoch": 0.5786, + "grad_norm": 0.04304559528827667, + "learning_rate": 9.003329163955973e-06, + "loss": 0.0089, + "step": 28930 + }, + { + "epoch": 0.57864, + "grad_norm": 0.5998311042785645, + "learning_rate": 9.001939862492789e-06, + "loss": 0.0469, + "step": 28932 + }, + { + "epoch": 0.57868, + "grad_norm": 0.24100877344608307, + "learning_rate": 9.000550580487296e-06, + "loss": 0.0044, + "step": 28934 + }, + { + "epoch": 0.57872, + "grad_norm": 2.282188892364502, + "learning_rate": 8.999161317966589e-06, + "loss": 0.0358, + "step": 28936 + }, + { + "epoch": 0.57876, + "grad_norm": 0.2772686779499054, + "learning_rate": 8.997772074957748e-06, + "loss": 0.005, + "step": 28938 + }, + { + "epoch": 0.5788, + "grad_norm": 9.211976051330566, + "learning_rate": 8.996382851487851e-06, + "loss": 0.3375, + "step": 28940 + }, + { + "epoch": 0.57884, + "grad_norm": 0.5519323348999023, + "learning_rate": 8.994993647583992e-06, + "loss": 0.0249, + "step": 28942 + }, + { + "epoch": 0.57888, + "grad_norm": 0.18863257765769958, + "learning_rate": 8.993604463273248e-06, + "loss": 0.0053, + "step": 28944 + }, + { + "epoch": 0.57892, + "grad_norm": 1.2882295846939087, + "learning_rate": 8.992215298582701e-06, + "loss": 0.0269, + "step": 28946 + }, + { + "epoch": 0.57896, + "grad_norm": 0.6059069037437439, + "learning_rate": 8.99082615353944e-06, + "loss": 0.0321, + "step": 28948 + }, + { + "epoch": 0.579, + "grad_norm": 0.05282720550894737, + "learning_rate": 8.989437028170537e-06, + "loss": 0.0019, + "step": 28950 + }, + { + "epoch": 0.57904, + "grad_norm": 1.04810631275177, + "learning_rate": 8.988047922503084e-06, + "loss": 0.0172, + "step": 28952 + }, + { + "epoch": 0.57908, + "grad_norm": 1.8451000452041626, + "learning_rate": 8.986658836564157e-06, + "loss": 0.04, + "step": 28954 + }, + { + "epoch": 0.57912, + "grad_norm": 0.4922449290752411, + "learning_rate": 8.985269770380833e-06, + "loss": 0.0119, + "step": 28956 + }, + { + "epoch": 0.57916, + "grad_norm": 0.7227178812026978, + "learning_rate": 8.983880723980203e-06, + "loss": 0.0236, + "step": 28958 + }, + { + "epoch": 0.5792, + "grad_norm": 0.06262659281492233, + "learning_rate": 8.982491697389339e-06, + "loss": 0.0058, + "step": 28960 + }, + { + "epoch": 0.57924, + "grad_norm": 0.013476785272359848, + "learning_rate": 8.981102690635324e-06, + "loss": 0.0151, + "step": 28962 + }, + { + "epoch": 0.57928, + "grad_norm": 0.33121445775032043, + "learning_rate": 8.979713703745237e-06, + "loss": 0.0071, + "step": 28964 + }, + { + "epoch": 0.57932, + "grad_norm": 0.05780190974473953, + "learning_rate": 8.978324736746153e-06, + "loss": 0.0039, + "step": 28966 + }, + { + "epoch": 0.57936, + "grad_norm": 0.06075397506356239, + "learning_rate": 8.97693578966516e-06, + "loss": 0.0031, + "step": 28968 + }, + { + "epoch": 0.5794, + "grad_norm": 1.3773740530014038, + "learning_rate": 8.975546862529328e-06, + "loss": 0.0175, + "step": 28970 + }, + { + "epoch": 0.57944, + "grad_norm": 0.07913611084222794, + "learning_rate": 8.974157955365734e-06, + "loss": 0.0536, + "step": 28972 + }, + { + "epoch": 0.57948, + "grad_norm": 0.13548429310321808, + "learning_rate": 8.972769068201463e-06, + "loss": 0.008, + "step": 28974 + }, + { + "epoch": 0.57952, + "grad_norm": 3.3624346256256104, + "learning_rate": 8.971380201063586e-06, + "loss": 0.0587, + "step": 28976 + }, + { + "epoch": 0.57956, + "grad_norm": 0.7139538526535034, + "learning_rate": 8.969991353979184e-06, + "loss": 0.0686, + "step": 28978 + }, + { + "epoch": 0.5796, + "grad_norm": 0.13247138261795044, + "learning_rate": 8.968602526975329e-06, + "loss": 0.0071, + "step": 28980 + }, + { + "epoch": 0.57964, + "grad_norm": 0.17342950403690338, + "learning_rate": 8.967213720079097e-06, + "loss": 0.0043, + "step": 28982 + }, + { + "epoch": 0.57968, + "grad_norm": 0.039687275886535645, + "learning_rate": 8.965824933317568e-06, + "loss": 0.0063, + "step": 28984 + }, + { + "epoch": 0.57972, + "grad_norm": 11.010661125183105, + "learning_rate": 8.964436166717814e-06, + "loss": 0.2755, + "step": 28986 + }, + { + "epoch": 0.57976, + "grad_norm": 0.8309107422828674, + "learning_rate": 8.963047420306907e-06, + "loss": 0.0655, + "step": 28988 + }, + { + "epoch": 0.5798, + "grad_norm": 0.06251782923936844, + "learning_rate": 8.961658694111929e-06, + "loss": 0.0068, + "step": 28990 + }, + { + "epoch": 0.57984, + "grad_norm": 0.22682258486747742, + "learning_rate": 8.960269988159946e-06, + "loss": 0.0059, + "step": 28992 + }, + { + "epoch": 0.57988, + "grad_norm": 0.2843988835811615, + "learning_rate": 8.958881302478035e-06, + "loss": 0.0045, + "step": 28994 + }, + { + "epoch": 0.57992, + "grad_norm": 1.9669157266616821, + "learning_rate": 8.957492637093272e-06, + "loss": 0.0367, + "step": 28996 + }, + { + "epoch": 0.57996, + "grad_norm": 0.8940660357475281, + "learning_rate": 8.956103992032723e-06, + "loss": 0.025, + "step": 28998 + }, + { + "epoch": 0.58, + "grad_norm": 6.876120567321777, + "learning_rate": 8.954715367323468e-06, + "loss": 0.1148, + "step": 29000 + }, + { + "epoch": 0.58004, + "grad_norm": 11.571383476257324, + "learning_rate": 8.953326762992573e-06, + "loss": 0.3376, + "step": 29002 + }, + { + "epoch": 0.58008, + "grad_norm": 1.0429880619049072, + "learning_rate": 8.951938179067109e-06, + "loss": 0.0159, + "step": 29004 + }, + { + "epoch": 0.58012, + "grad_norm": 0.005419196095317602, + "learning_rate": 8.950549615574155e-06, + "loss": 0.0036, + "step": 29006 + }, + { + "epoch": 0.58016, + "grad_norm": 0.18839721381664276, + "learning_rate": 8.94916107254077e-06, + "loss": 0.0188, + "step": 29008 + }, + { + "epoch": 0.5802, + "grad_norm": 0.1404939740896225, + "learning_rate": 8.947772549994037e-06, + "loss": 0.0892, + "step": 29010 + }, + { + "epoch": 0.58024, + "grad_norm": 0.15334473550319672, + "learning_rate": 8.946384047961017e-06, + "loss": 0.0033, + "step": 29012 + }, + { + "epoch": 0.58028, + "grad_norm": 0.061746746301651, + "learning_rate": 8.944995566468784e-06, + "loss": 0.0066, + "step": 29014 + }, + { + "epoch": 0.58032, + "grad_norm": 0.022621240466833115, + "learning_rate": 8.943607105544406e-06, + "loss": 0.0005, + "step": 29016 + }, + { + "epoch": 0.58036, + "grad_norm": 0.2459660917520523, + "learning_rate": 8.94221866521495e-06, + "loss": 0.0273, + "step": 29018 + }, + { + "epoch": 0.5804, + "grad_norm": 3.414381265640259, + "learning_rate": 8.940830245507483e-06, + "loss": 0.0537, + "step": 29020 + }, + { + "epoch": 0.58044, + "grad_norm": 0.07356253266334534, + "learning_rate": 8.939441846449079e-06, + "loss": 0.0025, + "step": 29022 + }, + { + "epoch": 0.58048, + "grad_norm": 0.5427801012992859, + "learning_rate": 8.9380534680668e-06, + "loss": 0.0099, + "step": 29024 + }, + { + "epoch": 0.58052, + "grad_norm": 0.10496728867292404, + "learning_rate": 8.936665110387719e-06, + "loss": 0.0059, + "step": 29026 + }, + { + "epoch": 0.58056, + "grad_norm": 0.12712039053440094, + "learning_rate": 8.935276773438896e-06, + "loss": 0.338, + "step": 29028 + }, + { + "epoch": 0.5806, + "grad_norm": 1.2992701530456543, + "learning_rate": 8.933888457247402e-06, + "loss": 0.0183, + "step": 29030 + }, + { + "epoch": 0.58064, + "grad_norm": 0.01481008343398571, + "learning_rate": 8.932500161840303e-06, + "loss": 0.0008, + "step": 29032 + }, + { + "epoch": 0.58068, + "grad_norm": 0.12351672351360321, + "learning_rate": 8.93111188724466e-06, + "loss": 0.0287, + "step": 29034 + }, + { + "epoch": 0.58072, + "grad_norm": 0.060862623155117035, + "learning_rate": 8.92972363348754e-06, + "loss": 0.0022, + "step": 29036 + }, + { + "epoch": 0.58076, + "grad_norm": 0.23873074352741241, + "learning_rate": 8.928335400596012e-06, + "loss": 0.0562, + "step": 29038 + }, + { + "epoch": 0.5808, + "grad_norm": 0.011109225451946259, + "learning_rate": 8.926947188597133e-06, + "loss": 0.0046, + "step": 29040 + }, + { + "epoch": 0.58084, + "grad_norm": 3.011868953704834, + "learning_rate": 8.925558997517976e-06, + "loss": 0.0526, + "step": 29042 + }, + { + "epoch": 0.58088, + "grad_norm": 3.415602445602417, + "learning_rate": 8.9241708273856e-06, + "loss": 0.1294, + "step": 29044 + }, + { + "epoch": 0.58092, + "grad_norm": 0.11216510087251663, + "learning_rate": 8.922782678227063e-06, + "loss": 0.0047, + "step": 29046 + }, + { + "epoch": 0.58096, + "grad_norm": 0.009844589978456497, + "learning_rate": 8.921394550069434e-06, + "loss": 0.0009, + "step": 29048 + }, + { + "epoch": 0.581, + "grad_norm": 0.002285466995090246, + "learning_rate": 8.920006442939772e-06, + "loss": 0.7519, + "step": 29050 + }, + { + "epoch": 0.58104, + "grad_norm": 0.01000229548662901, + "learning_rate": 8.918618356865142e-06, + "loss": 0.0647, + "step": 29052 + }, + { + "epoch": 0.58108, + "grad_norm": 0.011961586773395538, + "learning_rate": 8.917230291872606e-06, + "loss": 0.0019, + "step": 29054 + }, + { + "epoch": 0.58112, + "grad_norm": 0.7307564616203308, + "learning_rate": 8.915842247989217e-06, + "loss": 0.0129, + "step": 29056 + }, + { + "epoch": 0.58116, + "grad_norm": 0.299642950296402, + "learning_rate": 8.914454225242048e-06, + "loss": 0.0052, + "step": 29058 + }, + { + "epoch": 0.5812, + "grad_norm": 0.35100308060646057, + "learning_rate": 8.913066223658152e-06, + "loss": 0.0065, + "step": 29060 + }, + { + "epoch": 0.58124, + "grad_norm": 8.542476654052734, + "learning_rate": 8.911678243264584e-06, + "loss": 0.1959, + "step": 29062 + }, + { + "epoch": 0.58128, + "grad_norm": 0.09699425101280212, + "learning_rate": 8.910290284088414e-06, + "loss": 0.0134, + "step": 29064 + }, + { + "epoch": 0.58132, + "grad_norm": 0.005970369093120098, + "learning_rate": 8.908902346156695e-06, + "loss": 0.0104, + "step": 29066 + }, + { + "epoch": 0.58136, + "grad_norm": 0.02619067206978798, + "learning_rate": 8.907514429496486e-06, + "loss": 0.0013, + "step": 29068 + }, + { + "epoch": 0.5814, + "grad_norm": 0.34410297870635986, + "learning_rate": 8.906126534134849e-06, + "loss": 0.1113, + "step": 29070 + }, + { + "epoch": 0.58144, + "grad_norm": 1.2571543455123901, + "learning_rate": 8.90473866009883e-06, + "loss": 0.0177, + "step": 29072 + }, + { + "epoch": 0.58148, + "grad_norm": 0.7793738842010498, + "learning_rate": 8.903350807415502e-06, + "loss": 0.0176, + "step": 29074 + }, + { + "epoch": 0.58152, + "grad_norm": 0.2656927704811096, + "learning_rate": 8.901962976111913e-06, + "loss": 0.0042, + "step": 29076 + }, + { + "epoch": 0.58156, + "grad_norm": 0.0705445259809494, + "learning_rate": 8.900575166215119e-06, + "loss": 0.004, + "step": 29078 + }, + { + "epoch": 0.5816, + "grad_norm": 0.3923485279083252, + "learning_rate": 8.89918737775218e-06, + "loss": 0.0631, + "step": 29080 + }, + { + "epoch": 0.58164, + "grad_norm": 0.41888484358787537, + "learning_rate": 8.897799610750149e-06, + "loss": 0.0114, + "step": 29082 + }, + { + "epoch": 0.58168, + "grad_norm": 0.7849786281585693, + "learning_rate": 8.896411865236084e-06, + "loss": 0.0198, + "step": 29084 + }, + { + "epoch": 0.58172, + "grad_norm": 0.019936662167310715, + "learning_rate": 8.895024141237038e-06, + "loss": 0.0019, + "step": 29086 + }, + { + "epoch": 0.58176, + "grad_norm": 0.09103147685527802, + "learning_rate": 8.89363643878006e-06, + "loss": 0.0025, + "step": 29088 + }, + { + "epoch": 0.5818, + "grad_norm": 10.062845230102539, + "learning_rate": 8.892248757892215e-06, + "loss": 0.2512, + "step": 29090 + }, + { + "epoch": 0.58184, + "grad_norm": 0.030152102932333946, + "learning_rate": 8.890861098600549e-06, + "loss": 0.0068, + "step": 29092 + }, + { + "epoch": 0.58188, + "grad_norm": 0.008707122877240181, + "learning_rate": 8.889473460932114e-06, + "loss": 0.0025, + "step": 29094 + }, + { + "epoch": 0.58192, + "grad_norm": 2.098313331604004, + "learning_rate": 8.88808584491397e-06, + "loss": 0.0395, + "step": 29096 + }, + { + "epoch": 0.58196, + "grad_norm": 3.343618154525757, + "learning_rate": 8.886698250573163e-06, + "loss": 0.075, + "step": 29098 + }, + { + "epoch": 0.582, + "grad_norm": 0.14493553340435028, + "learning_rate": 8.885310677936746e-06, + "loss": 0.0031, + "step": 29100 + }, + { + "epoch": 0.58204, + "grad_norm": 1.4063767194747925, + "learning_rate": 8.883923127031775e-06, + "loss": 0.0282, + "step": 29102 + }, + { + "epoch": 0.58208, + "grad_norm": 7.795495986938477, + "learning_rate": 8.882535597885291e-06, + "loss": 0.0883, + "step": 29104 + }, + { + "epoch": 0.58212, + "grad_norm": 0.0960172489285469, + "learning_rate": 8.881148090524356e-06, + "loss": 0.0049, + "step": 29106 + }, + { + "epoch": 0.58216, + "grad_norm": 0.04118557274341583, + "learning_rate": 8.879760604976015e-06, + "loss": 0.0007, + "step": 29108 + }, + { + "epoch": 0.5822, + "grad_norm": 0.07684961706399918, + "learning_rate": 8.878373141267312e-06, + "loss": 0.0014, + "step": 29110 + }, + { + "epoch": 0.58224, + "grad_norm": 0.01585264503955841, + "learning_rate": 8.876985699425307e-06, + "loss": 0.0005, + "step": 29112 + }, + { + "epoch": 0.58228, + "grad_norm": 7.562989234924316, + "learning_rate": 8.875598279477042e-06, + "loss": 0.1499, + "step": 29114 + }, + { + "epoch": 0.58232, + "grad_norm": 0.10835274308919907, + "learning_rate": 8.87421088144957e-06, + "loss": 0.0028, + "step": 29116 + }, + { + "epoch": 0.58236, + "grad_norm": 0.007672865409404039, + "learning_rate": 8.872823505369935e-06, + "loss": 0.0001, + "step": 29118 + }, + { + "epoch": 0.5824, + "grad_norm": 0.16108813881874084, + "learning_rate": 8.871436151265183e-06, + "loss": 0.0601, + "step": 29120 + }, + { + "epoch": 0.58244, + "grad_norm": 4.054848670959473, + "learning_rate": 8.87004881916237e-06, + "loss": 0.0659, + "step": 29122 + }, + { + "epoch": 0.58248, + "grad_norm": 5.821801662445068, + "learning_rate": 8.868661509088535e-06, + "loss": 0.1184, + "step": 29124 + }, + { + "epoch": 0.58252, + "grad_norm": 0.032916802912950516, + "learning_rate": 8.867274221070722e-06, + "loss": 0.1628, + "step": 29126 + }, + { + "epoch": 0.58256, + "grad_norm": 0.009411095641553402, + "learning_rate": 8.865886955135986e-06, + "loss": 0.0013, + "step": 29128 + }, + { + "epoch": 0.5826, + "grad_norm": 0.14701451361179352, + "learning_rate": 8.864499711311362e-06, + "loss": 0.0136, + "step": 29130 + }, + { + "epoch": 0.58264, + "grad_norm": 0.0864851251244545, + "learning_rate": 8.863112489623908e-06, + "loss": 0.0971, + "step": 29132 + }, + { + "epoch": 0.58268, + "grad_norm": 0.1964845359325409, + "learning_rate": 8.86172529010066e-06, + "loss": 0.0042, + "step": 29134 + }, + { + "epoch": 0.58272, + "grad_norm": 0.05141604319214821, + "learning_rate": 8.86033811276866e-06, + "loss": 0.0008, + "step": 29136 + }, + { + "epoch": 0.58276, + "grad_norm": 0.07267799973487854, + "learning_rate": 8.85895095765496e-06, + "loss": 0.0026, + "step": 29138 + }, + { + "epoch": 0.5828, + "grad_norm": 0.28642335534095764, + "learning_rate": 8.857563824786598e-06, + "loss": 0.0071, + "step": 29140 + }, + { + "epoch": 0.58284, + "grad_norm": 6.784119129180908, + "learning_rate": 8.856176714190611e-06, + "loss": 0.2428, + "step": 29142 + }, + { + "epoch": 0.58288, + "grad_norm": 1.438897728919983, + "learning_rate": 8.854789625894053e-06, + "loss": 0.022, + "step": 29144 + }, + { + "epoch": 0.58292, + "grad_norm": 5.454882621765137, + "learning_rate": 8.853402559923957e-06, + "loss": 0.085, + "step": 29146 + }, + { + "epoch": 0.58296, + "grad_norm": 0.35781925916671753, + "learning_rate": 8.852015516307371e-06, + "loss": 0.0049, + "step": 29148 + }, + { + "epoch": 0.583, + "grad_norm": 0.3111158311367035, + "learning_rate": 8.850628495071336e-06, + "loss": 0.0039, + "step": 29150 + }, + { + "epoch": 0.58304, + "grad_norm": 0.004758504219353199, + "learning_rate": 8.849241496242886e-06, + "loss": 0.0045, + "step": 29152 + }, + { + "epoch": 0.58308, + "grad_norm": 2.460970878601074, + "learning_rate": 8.84785451984907e-06, + "loss": 0.0294, + "step": 29154 + }, + { + "epoch": 0.58312, + "grad_norm": 0.03129517287015915, + "learning_rate": 8.846467565916917e-06, + "loss": 0.0272, + "step": 29156 + }, + { + "epoch": 0.58316, + "grad_norm": 0.13022944331169128, + "learning_rate": 8.845080634473478e-06, + "loss": 0.0154, + "step": 29158 + }, + { + "epoch": 0.5832, + "grad_norm": 0.012424208223819733, + "learning_rate": 8.843693725545787e-06, + "loss": 0.1759, + "step": 29160 + }, + { + "epoch": 0.58324, + "grad_norm": 0.9914032220840454, + "learning_rate": 8.842306839160877e-06, + "loss": 0.0104, + "step": 29162 + }, + { + "epoch": 0.58328, + "grad_norm": 2.1684365272521973, + "learning_rate": 8.840919975345795e-06, + "loss": 0.0356, + "step": 29164 + }, + { + "epoch": 0.58332, + "grad_norm": 0.1611567735671997, + "learning_rate": 8.839533134127577e-06, + "loss": 0.0039, + "step": 29166 + }, + { + "epoch": 0.58336, + "grad_norm": 0.04861815273761749, + "learning_rate": 8.838146315533253e-06, + "loss": 0.0034, + "step": 29168 + }, + { + "epoch": 0.5834, + "grad_norm": 0.353142112493515, + "learning_rate": 8.836759519589869e-06, + "loss": 0.0054, + "step": 29170 + }, + { + "epoch": 0.58344, + "grad_norm": 1.413556456565857, + "learning_rate": 8.83537274632445e-06, + "loss": 0.0144, + "step": 29172 + }, + { + "epoch": 0.58348, + "grad_norm": 0.004638876300305128, + "learning_rate": 8.833985995764046e-06, + "loss": 0.0017, + "step": 29174 + }, + { + "epoch": 0.58352, + "grad_norm": 1.4216763973236084, + "learning_rate": 8.832599267935684e-06, + "loss": 0.0157, + "step": 29176 + }, + { + "epoch": 0.58356, + "grad_norm": 0.14157208800315857, + "learning_rate": 8.831212562866398e-06, + "loss": 0.0018, + "step": 29178 + }, + { + "epoch": 0.5836, + "grad_norm": 0.3505954146385193, + "learning_rate": 8.829825880583228e-06, + "loss": 0.0101, + "step": 29180 + }, + { + "epoch": 0.58364, + "grad_norm": 0.9480804800987244, + "learning_rate": 8.828439221113202e-06, + "loss": 0.0147, + "step": 29182 + }, + { + "epoch": 0.58368, + "grad_norm": 0.03851030766963959, + "learning_rate": 8.827052584483358e-06, + "loss": 0.0012, + "step": 29184 + }, + { + "epoch": 0.58372, + "grad_norm": 0.469452440738678, + "learning_rate": 8.825665970720727e-06, + "loss": 0.0069, + "step": 29186 + }, + { + "epoch": 0.58376, + "grad_norm": 0.06513720005750656, + "learning_rate": 8.824279379852342e-06, + "loss": 0.0017, + "step": 29188 + }, + { + "epoch": 0.5838, + "grad_norm": 0.351838618516922, + "learning_rate": 8.822892811905237e-06, + "loss": 0.0055, + "step": 29190 + }, + { + "epoch": 0.58384, + "grad_norm": 4.397757053375244, + "learning_rate": 8.821506266906444e-06, + "loss": 0.0813, + "step": 29192 + }, + { + "epoch": 0.58388, + "grad_norm": 0.42043861746788025, + "learning_rate": 8.820119744882989e-06, + "loss": 0.0051, + "step": 29194 + }, + { + "epoch": 0.58392, + "grad_norm": 0.0244932621717453, + "learning_rate": 8.81873324586191e-06, + "loss": 0.0004, + "step": 29196 + }, + { + "epoch": 0.58396, + "grad_norm": 0.5020100474357605, + "learning_rate": 8.817346769870235e-06, + "loss": 0.0477, + "step": 29198 + }, + { + "epoch": 0.584, + "grad_norm": 0.29257044196128845, + "learning_rate": 8.815960316934991e-06, + "loss": 0.0143, + "step": 29200 + }, + { + "epoch": 0.58404, + "grad_norm": 2.1873748302459717, + "learning_rate": 8.814573887083213e-06, + "loss": 0.0339, + "step": 29202 + }, + { + "epoch": 0.58408, + "grad_norm": 0.1519250124692917, + "learning_rate": 8.813187480341925e-06, + "loss": 0.0052, + "step": 29204 + }, + { + "epoch": 0.58412, + "grad_norm": 0.012719781138002872, + "learning_rate": 8.811801096738161e-06, + "loss": 0.0029, + "step": 29206 + }, + { + "epoch": 0.58416, + "grad_norm": 0.1004280149936676, + "learning_rate": 8.810414736298944e-06, + "loss": 0.0023, + "step": 29208 + }, + { + "epoch": 0.5842, + "grad_norm": 0.3225383162498474, + "learning_rate": 8.809028399051302e-06, + "loss": 0.0148, + "step": 29210 + }, + { + "epoch": 0.58424, + "grad_norm": 0.22077825665473938, + "learning_rate": 8.80764208502227e-06, + "loss": 0.0032, + "step": 29212 + }, + { + "epoch": 0.58428, + "grad_norm": 0.06507759541273117, + "learning_rate": 8.806255794238867e-06, + "loss": 0.0014, + "step": 29214 + }, + { + "epoch": 0.58432, + "grad_norm": 1.0550217628479004, + "learning_rate": 8.804869526728119e-06, + "loss": 0.0199, + "step": 29216 + }, + { + "epoch": 0.58436, + "grad_norm": 0.13435354828834534, + "learning_rate": 8.803483282517058e-06, + "loss": 0.0021, + "step": 29218 + }, + { + "epoch": 0.5844, + "grad_norm": 0.0014454451156780124, + "learning_rate": 8.802097061632706e-06, + "loss": 0.0024, + "step": 29220 + }, + { + "epoch": 0.58444, + "grad_norm": 0.2736039459705353, + "learning_rate": 8.800710864102089e-06, + "loss": 0.004, + "step": 29222 + }, + { + "epoch": 0.58448, + "grad_norm": 0.11437831819057465, + "learning_rate": 8.799324689952232e-06, + "loss": 0.0034, + "step": 29224 + }, + { + "epoch": 0.58452, + "grad_norm": 0.042022954672575, + "learning_rate": 8.797938539210154e-06, + "loss": 0.0383, + "step": 29226 + }, + { + "epoch": 0.58456, + "grad_norm": 2.2121989727020264, + "learning_rate": 8.796552411902886e-06, + "loss": 0.0282, + "step": 29228 + }, + { + "epoch": 0.5846, + "grad_norm": 0.4022773802280426, + "learning_rate": 8.79516630805745e-06, + "loss": 0.0049, + "step": 29230 + }, + { + "epoch": 0.58464, + "grad_norm": 0.020980576053261757, + "learning_rate": 8.793780227700863e-06, + "loss": 0.0019, + "step": 29232 + }, + { + "epoch": 0.58468, + "grad_norm": 0.5544033646583557, + "learning_rate": 8.792394170860155e-06, + "loss": 0.006, + "step": 29234 + }, + { + "epoch": 0.58472, + "grad_norm": 0.17046919465065002, + "learning_rate": 8.791008137562341e-06, + "loss": 0.0033, + "step": 29236 + }, + { + "epoch": 0.58476, + "grad_norm": 0.05363912507891655, + "learning_rate": 8.78962212783445e-06, + "loss": 0.0014, + "step": 29238 + }, + { + "epoch": 0.5848, + "grad_norm": 0.40739405155181885, + "learning_rate": 8.788236141703498e-06, + "loss": 0.0071, + "step": 29240 + }, + { + "epoch": 0.58484, + "grad_norm": 0.22406534850597382, + "learning_rate": 8.786850179196502e-06, + "loss": 0.0264, + "step": 29242 + }, + { + "epoch": 0.58488, + "grad_norm": 0.6350603699684143, + "learning_rate": 8.78546424034049e-06, + "loss": 0.0098, + "step": 29244 + }, + { + "epoch": 0.58492, + "grad_norm": 0.08132784068584442, + "learning_rate": 8.784078325162478e-06, + "loss": 0.0092, + "step": 29246 + }, + { + "epoch": 0.58496, + "grad_norm": 6.728363037109375, + "learning_rate": 8.782692433689481e-06, + "loss": 0.0795, + "step": 29248 + }, + { + "epoch": 0.585, + "grad_norm": 0.07252570241689682, + "learning_rate": 8.781306565948528e-06, + "loss": 0.0726, + "step": 29250 + }, + { + "epoch": 0.58504, + "grad_norm": 0.030157286673784256, + "learning_rate": 8.779920721966625e-06, + "loss": 0.0045, + "step": 29252 + }, + { + "epoch": 0.58508, + "grad_norm": 0.003970475867390633, + "learning_rate": 8.778534901770798e-06, + "loss": 0.0083, + "step": 29254 + }, + { + "epoch": 0.58512, + "grad_norm": 0.22210215032100677, + "learning_rate": 8.777149105388065e-06, + "loss": 0.0054, + "step": 29256 + }, + { + "epoch": 0.58516, + "grad_norm": 0.29850324988365173, + "learning_rate": 8.775763332845436e-06, + "loss": 0.0047, + "step": 29258 + }, + { + "epoch": 0.5852, + "grad_norm": 0.27650758624076843, + "learning_rate": 8.774377584169934e-06, + "loss": 0.0448, + "step": 29260 + }, + { + "epoch": 0.58524, + "grad_norm": 0.04848780483007431, + "learning_rate": 8.772991859388567e-06, + "loss": 0.0023, + "step": 29262 + }, + { + "epoch": 0.58528, + "grad_norm": 0.6560940742492676, + "learning_rate": 8.77160615852836e-06, + "loss": 0.0069, + "step": 29264 + }, + { + "epoch": 0.58532, + "grad_norm": 0.07681524008512497, + "learning_rate": 8.770220481616323e-06, + "loss": 0.0012, + "step": 29266 + }, + { + "epoch": 0.58536, + "grad_norm": 0.006877340376377106, + "learning_rate": 8.768834828679467e-06, + "loss": 0.0009, + "step": 29268 + }, + { + "epoch": 0.5854, + "grad_norm": 6.233239650726318, + "learning_rate": 8.767449199744813e-06, + "loss": 0.3246, + "step": 29270 + }, + { + "epoch": 0.58544, + "grad_norm": 0.021337462589144707, + "learning_rate": 8.766063594839373e-06, + "loss": 0.2043, + "step": 29272 + }, + { + "epoch": 0.58548, + "grad_norm": 0.5204716324806213, + "learning_rate": 8.764678013990156e-06, + "loss": 0.0087, + "step": 29274 + }, + { + "epoch": 0.58552, + "grad_norm": 0.5628295540809631, + "learning_rate": 8.76329245722418e-06, + "loss": 0.0065, + "step": 29276 + }, + { + "epoch": 0.58556, + "grad_norm": 0.004453560337424278, + "learning_rate": 8.761906924568449e-06, + "loss": 0.0008, + "step": 29278 + }, + { + "epoch": 0.5856, + "grad_norm": 0.2838994264602661, + "learning_rate": 8.760521416049983e-06, + "loss": 0.0042, + "step": 29280 + }, + { + "epoch": 0.58564, + "grad_norm": 0.1384422481060028, + "learning_rate": 8.759135931695792e-06, + "loss": 0.0108, + "step": 29282 + }, + { + "epoch": 0.58568, + "grad_norm": 0.45106741786003113, + "learning_rate": 8.75775047153288e-06, + "loss": 0.0064, + "step": 29284 + }, + { + "epoch": 0.58572, + "grad_norm": 0.06828857958316803, + "learning_rate": 8.756365035588266e-06, + "loss": 0.0009, + "step": 29286 + }, + { + "epoch": 0.58576, + "grad_norm": 0.21587787568569183, + "learning_rate": 8.754979623888955e-06, + "loss": 0.0199, + "step": 29288 + }, + { + "epoch": 0.5858, + "grad_norm": 0.06551580131053925, + "learning_rate": 8.753594236461957e-06, + "loss": 0.001, + "step": 29290 + }, + { + "epoch": 0.58584, + "grad_norm": 0.04720674455165863, + "learning_rate": 8.752208873334283e-06, + "loss": 0.0005, + "step": 29292 + }, + { + "epoch": 0.58588, + "grad_norm": 0.3290921151638031, + "learning_rate": 8.750823534532936e-06, + "loss": 0.3053, + "step": 29294 + }, + { + "epoch": 0.58592, + "grad_norm": 0.29444435238838196, + "learning_rate": 8.74943822008493e-06, + "loss": 0.0225, + "step": 29296 + }, + { + "epoch": 0.58596, + "grad_norm": 0.038704268634319305, + "learning_rate": 8.748052930017269e-06, + "loss": 0.0172, + "step": 29298 + }, + { + "epoch": 0.586, + "grad_norm": 16.087528228759766, + "learning_rate": 8.746667664356957e-06, + "loss": 0.6557, + "step": 29300 + }, + { + "epoch": 0.58604, + "grad_norm": 0.0675344169139862, + "learning_rate": 8.745282423131009e-06, + "loss": 0.0072, + "step": 29302 + }, + { + "epoch": 0.58608, + "grad_norm": 0.03068799152970314, + "learning_rate": 8.743897206366426e-06, + "loss": 0.0085, + "step": 29304 + }, + { + "epoch": 0.58612, + "grad_norm": 0.9695281386375427, + "learning_rate": 8.742512014090211e-06, + "loss": 0.0143, + "step": 29306 + }, + { + "epoch": 0.58616, + "grad_norm": 0.30647796392440796, + "learning_rate": 8.741126846329374e-06, + "loss": 0.003, + "step": 29308 + }, + { + "epoch": 0.5862, + "grad_norm": 0.07300916314125061, + "learning_rate": 8.739741703110914e-06, + "loss": 0.0046, + "step": 29310 + }, + { + "epoch": 0.58624, + "grad_norm": 7.545339107513428, + "learning_rate": 8.738356584461841e-06, + "loss": 0.1082, + "step": 29312 + }, + { + "epoch": 0.58628, + "grad_norm": 0.015503862872719765, + "learning_rate": 8.736971490409158e-06, + "loss": 0.267, + "step": 29314 + }, + { + "epoch": 0.58632, + "grad_norm": 0.03128153830766678, + "learning_rate": 8.73558642097986e-06, + "loss": 0.0022, + "step": 29316 + }, + { + "epoch": 0.58636, + "grad_norm": 0.22921864688396454, + "learning_rate": 8.73420137620096e-06, + "loss": 0.0033, + "step": 29318 + }, + { + "epoch": 0.5864, + "grad_norm": 1.6708685159683228, + "learning_rate": 8.732816356099455e-06, + "loss": 0.0253, + "step": 29320 + }, + { + "epoch": 0.58644, + "grad_norm": 0.005444944370537996, + "learning_rate": 8.731431360702349e-06, + "loss": 0.0023, + "step": 29322 + }, + { + "epoch": 0.58648, + "grad_norm": 0.36375653743743896, + "learning_rate": 8.730046390036638e-06, + "loss": 0.0251, + "step": 29324 + }, + { + "epoch": 0.58652, + "grad_norm": 0.22736486792564392, + "learning_rate": 8.728661444129329e-06, + "loss": 0.0103, + "step": 29326 + }, + { + "epoch": 0.58656, + "grad_norm": 6.868948936462402, + "learning_rate": 8.72727652300742e-06, + "loss": 0.0888, + "step": 29328 + }, + { + "epoch": 0.5866, + "grad_norm": 0.02393249049782753, + "learning_rate": 8.725891626697912e-06, + "loss": 0.0042, + "step": 29330 + }, + { + "epoch": 0.58664, + "grad_norm": 9.73531723022461, + "learning_rate": 8.724506755227797e-06, + "loss": 0.1387, + "step": 29332 + }, + { + "epoch": 0.58668, + "grad_norm": 0.08607261627912521, + "learning_rate": 8.723121908624084e-06, + "loss": 0.0016, + "step": 29334 + }, + { + "epoch": 0.58672, + "grad_norm": 0.05481287091970444, + "learning_rate": 8.721737086913766e-06, + "loss": 0.0051, + "step": 29336 + }, + { + "epoch": 0.58676, + "grad_norm": 0.02416832000017166, + "learning_rate": 8.720352290123839e-06, + "loss": 0.5886, + "step": 29338 + }, + { + "epoch": 0.5868, + "grad_norm": 0.755462110042572, + "learning_rate": 8.718967518281307e-06, + "loss": 0.0118, + "step": 29340 + }, + { + "epoch": 0.58684, + "grad_norm": 0.3699451684951782, + "learning_rate": 8.71758277141316e-06, + "loss": 0.0058, + "step": 29342 + }, + { + "epoch": 0.58688, + "grad_norm": 0.058676332235336304, + "learning_rate": 8.716198049546399e-06, + "loss": 0.0009, + "step": 29344 + }, + { + "epoch": 0.58692, + "grad_norm": 0.20006011426448822, + "learning_rate": 8.714813352708018e-06, + "loss": 0.4711, + "step": 29346 + }, + { + "epoch": 0.58696, + "grad_norm": 0.005863044876605272, + "learning_rate": 8.71342868092501e-06, + "loss": 0.005, + "step": 29348 + }, + { + "epoch": 0.587, + "grad_norm": 0.08536366373300552, + "learning_rate": 8.712044034224374e-06, + "loss": 0.0055, + "step": 29350 + }, + { + "epoch": 0.58704, + "grad_norm": 0.029304426163434982, + "learning_rate": 8.710659412633104e-06, + "loss": 0.0042, + "step": 29352 + }, + { + "epoch": 0.58708, + "grad_norm": 0.007394412998110056, + "learning_rate": 8.709274816178187e-06, + "loss": 0.0337, + "step": 29354 + }, + { + "epoch": 0.58712, + "grad_norm": 1.7672417163848877, + "learning_rate": 8.707890244886628e-06, + "loss": 0.0292, + "step": 29356 + }, + { + "epoch": 0.58716, + "grad_norm": 14.659585952758789, + "learning_rate": 8.706505698785411e-06, + "loss": 0.469, + "step": 29358 + }, + { + "epoch": 0.5872, + "grad_norm": 0.3857487440109253, + "learning_rate": 8.705121177901532e-06, + "loss": 0.0051, + "step": 29360 + }, + { + "epoch": 0.58724, + "grad_norm": 0.24700355529785156, + "learning_rate": 8.703736682261984e-06, + "loss": 0.0673, + "step": 29362 + }, + { + "epoch": 0.58728, + "grad_norm": 1.7089180946350098, + "learning_rate": 8.702352211893752e-06, + "loss": 0.0277, + "step": 29364 + }, + { + "epoch": 0.58732, + "grad_norm": 0.16367825865745544, + "learning_rate": 8.700967766823836e-06, + "loss": 0.0128, + "step": 29366 + }, + { + "epoch": 0.58736, + "grad_norm": 0.24755752086639404, + "learning_rate": 8.699583347079218e-06, + "loss": 0.0075, + "step": 29368 + }, + { + "epoch": 0.5874, + "grad_norm": 0.01419824454933405, + "learning_rate": 8.698198952686896e-06, + "loss": 0.0241, + "step": 29370 + }, + { + "epoch": 0.58744, + "grad_norm": 0.10120553523302078, + "learning_rate": 8.696814583673856e-06, + "loss": 0.3022, + "step": 29372 + }, + { + "epoch": 0.58748, + "grad_norm": 0.03875740244984627, + "learning_rate": 8.695430240067086e-06, + "loss": 0.0069, + "step": 29374 + }, + { + "epoch": 0.58752, + "grad_norm": 0.11313897371292114, + "learning_rate": 8.694045921893575e-06, + "loss": 0.0146, + "step": 29376 + }, + { + "epoch": 0.58756, + "grad_norm": 0.7191026210784912, + "learning_rate": 8.692661629180312e-06, + "loss": 0.0101, + "step": 29378 + }, + { + "epoch": 0.5876, + "grad_norm": 6.167153358459473, + "learning_rate": 8.69127736195428e-06, + "loss": 0.0892, + "step": 29380 + }, + { + "epoch": 0.58764, + "grad_norm": 0.013203679583966732, + "learning_rate": 8.689893120242472e-06, + "loss": 0.0029, + "step": 29382 + }, + { + "epoch": 0.58768, + "grad_norm": 0.018309490755200386, + "learning_rate": 8.688508904071868e-06, + "loss": 0.0033, + "step": 29384 + }, + { + "epoch": 0.58772, + "grad_norm": 0.01491500623524189, + "learning_rate": 8.687124713469464e-06, + "loss": 0.015, + "step": 29386 + }, + { + "epoch": 0.58776, + "grad_norm": 0.022171640768647194, + "learning_rate": 8.68574054846224e-06, + "loss": 0.0012, + "step": 29388 + }, + { + "epoch": 0.5878, + "grad_norm": 0.035225801169872284, + "learning_rate": 8.684356409077177e-06, + "loss": 0.0217, + "step": 29390 + }, + { + "epoch": 0.58784, + "grad_norm": 0.11222666501998901, + "learning_rate": 8.682972295341262e-06, + "loss": 0.0066, + "step": 29392 + }, + { + "epoch": 0.58788, + "grad_norm": 0.020729731768369675, + "learning_rate": 8.681588207281485e-06, + "loss": 0.0005, + "step": 29394 + }, + { + "epoch": 0.58792, + "grad_norm": 0.3343951404094696, + "learning_rate": 8.680204144924821e-06, + "loss": 0.0049, + "step": 29396 + }, + { + "epoch": 0.58796, + "grad_norm": 9.943642616271973, + "learning_rate": 8.67882010829826e-06, + "loss": 0.2671, + "step": 29398 + }, + { + "epoch": 0.588, + "grad_norm": 0.41041433811187744, + "learning_rate": 8.677436097428775e-06, + "loss": 0.0095, + "step": 29400 + }, + { + "epoch": 0.58804, + "grad_norm": 0.1479770839214325, + "learning_rate": 8.676052112343359e-06, + "loss": 0.0058, + "step": 29402 + }, + { + "epoch": 0.58808, + "grad_norm": 6.914095401763916, + "learning_rate": 8.67466815306899e-06, + "loss": 0.1174, + "step": 29404 + }, + { + "epoch": 0.58812, + "grad_norm": 0.10125455260276794, + "learning_rate": 8.673284219632643e-06, + "loss": 0.0031, + "step": 29406 + }, + { + "epoch": 0.58816, + "grad_norm": 0.30268394947052, + "learning_rate": 8.671900312061306e-06, + "loss": 0.0101, + "step": 29408 + }, + { + "epoch": 0.5882, + "grad_norm": 10.483762741088867, + "learning_rate": 8.670516430381958e-06, + "loss": 0.319, + "step": 29410 + }, + { + "epoch": 0.58824, + "grad_norm": 0.28025710582733154, + "learning_rate": 8.669132574621573e-06, + "loss": 0.0407, + "step": 29412 + }, + { + "epoch": 0.58828, + "grad_norm": 1.5236310958862305, + "learning_rate": 8.667748744807138e-06, + "loss": 0.0169, + "step": 29414 + }, + { + "epoch": 0.58832, + "grad_norm": 2.6180615425109863, + "learning_rate": 8.66636494096562e-06, + "loss": 0.0336, + "step": 29416 + }, + { + "epoch": 0.58836, + "grad_norm": 10.720961570739746, + "learning_rate": 8.66498116312401e-06, + "loss": 0.3016, + "step": 29418 + }, + { + "epoch": 0.5884, + "grad_norm": 0.006771945394575596, + "learning_rate": 8.663597411309278e-06, + "loss": 0.0133, + "step": 29420 + }, + { + "epoch": 0.58844, + "grad_norm": 0.12595859169960022, + "learning_rate": 8.6622136855484e-06, + "loss": 0.0038, + "step": 29422 + }, + { + "epoch": 0.58848, + "grad_norm": 0.6946057081222534, + "learning_rate": 8.66082998586836e-06, + "loss": 0.0145, + "step": 29424 + }, + { + "epoch": 0.58852, + "grad_norm": 0.7445206642150879, + "learning_rate": 8.659446312296128e-06, + "loss": 0.3456, + "step": 29426 + }, + { + "epoch": 0.58856, + "grad_norm": 0.61057448387146, + "learning_rate": 8.658062664858677e-06, + "loss": 0.0122, + "step": 29428 + }, + { + "epoch": 0.5886, + "grad_norm": 1.70108163356781, + "learning_rate": 8.656679043582986e-06, + "loss": 0.0263, + "step": 29430 + }, + { + "epoch": 0.58864, + "grad_norm": 0.05678033456206322, + "learning_rate": 8.655295448496029e-06, + "loss": 0.0008, + "step": 29432 + }, + { + "epoch": 0.58868, + "grad_norm": 0.12308619171380997, + "learning_rate": 8.65391187962478e-06, + "loss": 0.0017, + "step": 29434 + }, + { + "epoch": 0.58872, + "grad_norm": 0.39642518758773804, + "learning_rate": 8.652528336996212e-06, + "loss": 0.0065, + "step": 29436 + }, + { + "epoch": 0.58876, + "grad_norm": 0.21838103234767914, + "learning_rate": 8.651144820637296e-06, + "loss": 0.0066, + "step": 29438 + }, + { + "epoch": 0.5888, + "grad_norm": 0.21413034200668335, + "learning_rate": 8.649761330575009e-06, + "loss": 0.0036, + "step": 29440 + }, + { + "epoch": 0.58884, + "grad_norm": 0.7119531035423279, + "learning_rate": 8.64837786683632e-06, + "loss": 0.0156, + "step": 29442 + }, + { + "epoch": 0.58888, + "grad_norm": 0.42569541931152344, + "learning_rate": 8.646994429448199e-06, + "loss": 0.0083, + "step": 29444 + }, + { + "epoch": 0.58892, + "grad_norm": 0.05527850240468979, + "learning_rate": 8.64561101843762e-06, + "loss": 0.0022, + "step": 29446 + }, + { + "epoch": 0.58896, + "grad_norm": 0.06598035991191864, + "learning_rate": 8.644227633831547e-06, + "loss": 0.0073, + "step": 29448 + }, + { + "epoch": 0.589, + "grad_norm": 1.8691895008087158, + "learning_rate": 8.642844275656957e-06, + "loss": 0.0269, + "step": 29450 + }, + { + "epoch": 0.58904, + "grad_norm": 0.07497288286685944, + "learning_rate": 8.641460943940819e-06, + "loss": 0.0345, + "step": 29452 + }, + { + "epoch": 0.58908, + "grad_norm": 0.09816735237836838, + "learning_rate": 8.640077638710094e-06, + "loss": 0.1511, + "step": 29454 + }, + { + "epoch": 0.58912, + "grad_norm": 0.7875702381134033, + "learning_rate": 8.63869435999176e-06, + "loss": 0.0122, + "step": 29456 + }, + { + "epoch": 0.58916, + "grad_norm": 0.020880578085780144, + "learning_rate": 8.63731110781278e-06, + "loss": 0.2507, + "step": 29458 + }, + { + "epoch": 0.5892, + "grad_norm": 0.223989799618721, + "learning_rate": 8.635927882200117e-06, + "loss": 0.0026, + "step": 29460 + }, + { + "epoch": 0.58924, + "grad_norm": 0.0314764641225338, + "learning_rate": 8.634544683180746e-06, + "loss": 0.0018, + "step": 29462 + }, + { + "epoch": 0.58928, + "grad_norm": 0.18544571101665497, + "learning_rate": 8.633161510781629e-06, + "loss": 0.0024, + "step": 29464 + }, + { + "epoch": 0.58932, + "grad_norm": 1.1994588375091553, + "learning_rate": 8.631778365029734e-06, + "loss": 0.1206, + "step": 29466 + }, + { + "epoch": 0.58936, + "grad_norm": 0.8591035008430481, + "learning_rate": 8.630395245952023e-06, + "loss": 0.0094, + "step": 29468 + }, + { + "epoch": 0.5894, + "grad_norm": 1.1710474491119385, + "learning_rate": 8.629012153575458e-06, + "loss": 0.014, + "step": 29470 + }, + { + "epoch": 0.58944, + "grad_norm": 0.13480274379253387, + "learning_rate": 8.62762908792701e-06, + "loss": 0.0022, + "step": 29472 + }, + { + "epoch": 0.58948, + "grad_norm": 0.616683304309845, + "learning_rate": 8.626246049033638e-06, + "loss": 0.0109, + "step": 29474 + }, + { + "epoch": 0.58952, + "grad_norm": 1.8761776685714722, + "learning_rate": 8.624863036922308e-06, + "loss": 0.0307, + "step": 29476 + }, + { + "epoch": 0.58956, + "grad_norm": 12.503318786621094, + "learning_rate": 8.623480051619983e-06, + "loss": 0.0873, + "step": 29478 + }, + { + "epoch": 0.5896, + "grad_norm": 0.007251795381307602, + "learning_rate": 8.62209709315362e-06, + "loss": 0.0003, + "step": 29480 + }, + { + "epoch": 0.58964, + "grad_norm": 0.16990624368190765, + "learning_rate": 8.620714161550185e-06, + "loss": 0.0028, + "step": 29482 + }, + { + "epoch": 0.58968, + "grad_norm": 8.780206680297852, + "learning_rate": 8.61933125683664e-06, + "loss": 0.209, + "step": 29484 + }, + { + "epoch": 0.58972, + "grad_norm": 0.05129164829850197, + "learning_rate": 8.617948379039936e-06, + "loss": 0.0024, + "step": 29486 + }, + { + "epoch": 0.58976, + "grad_norm": 2.392899513244629, + "learning_rate": 8.616565528187046e-06, + "loss": 0.0637, + "step": 29488 + }, + { + "epoch": 0.5898, + "grad_norm": 0.015195751562714577, + "learning_rate": 8.615182704304918e-06, + "loss": 0.0004, + "step": 29490 + }, + { + "epoch": 0.58984, + "grad_norm": 0.12688684463500977, + "learning_rate": 8.61379990742052e-06, + "loss": 0.51, + "step": 29492 + }, + { + "epoch": 0.58988, + "grad_norm": 3.1831672191619873, + "learning_rate": 8.612417137560808e-06, + "loss": 0.0384, + "step": 29494 + }, + { + "epoch": 0.58992, + "grad_norm": 0.7258736491203308, + "learning_rate": 8.611034394752735e-06, + "loss": 0.0164, + "step": 29496 + }, + { + "epoch": 0.58996, + "grad_norm": 0.22636626660823822, + "learning_rate": 8.609651679023265e-06, + "loss": 0.0055, + "step": 29498 + }, + { + "epoch": 0.59, + "grad_norm": 0.04373423010110855, + "learning_rate": 8.60826899039935e-06, + "loss": 0.0009, + "step": 29500 + }, + { + "epoch": 0.59004, + "grad_norm": 0.09976312518119812, + "learning_rate": 8.606886328907942e-06, + "loss": 0.0016, + "step": 29502 + }, + { + "epoch": 0.59008, + "grad_norm": 0.3110026717185974, + "learning_rate": 8.60550369457601e-06, + "loss": 0.0054, + "step": 29504 + }, + { + "epoch": 0.59012, + "grad_norm": 0.2112628072500229, + "learning_rate": 8.604121087430494e-06, + "loss": 0.0038, + "step": 29506 + }, + { + "epoch": 0.59016, + "grad_norm": 0.03780724108219147, + "learning_rate": 8.602738507498361e-06, + "loss": 0.0017, + "step": 29508 + }, + { + "epoch": 0.5902, + "grad_norm": 0.06581848859786987, + "learning_rate": 8.601355954806562e-06, + "loss": 0.0175, + "step": 29510 + }, + { + "epoch": 0.59024, + "grad_norm": 0.03551261126995087, + "learning_rate": 8.599973429382045e-06, + "loss": 0.0013, + "step": 29512 + }, + { + "epoch": 0.59028, + "grad_norm": 0.6106751561164856, + "learning_rate": 8.598590931251768e-06, + "loss": 0.0373, + "step": 29514 + }, + { + "epoch": 0.59032, + "grad_norm": 0.07549088448286057, + "learning_rate": 8.597208460442682e-06, + "loss": 0.0478, + "step": 29516 + }, + { + "epoch": 0.59036, + "grad_norm": 12.0219087600708, + "learning_rate": 8.59582601698174e-06, + "loss": 0.3374, + "step": 29518 + }, + { + "epoch": 0.5904, + "grad_norm": 0.23217789828777313, + "learning_rate": 8.594443600895892e-06, + "loss": 0.0065, + "step": 29520 + }, + { + "epoch": 0.59044, + "grad_norm": 0.08428551256656647, + "learning_rate": 8.593061212212086e-06, + "loss": 0.0016, + "step": 29522 + }, + { + "epoch": 0.59048, + "grad_norm": 0.26195028424263, + "learning_rate": 8.591678850957282e-06, + "loss": 0.0069, + "step": 29524 + }, + { + "epoch": 0.59052, + "grad_norm": 0.1161496564745903, + "learning_rate": 8.590296517158423e-06, + "loss": 0.4313, + "step": 29526 + }, + { + "epoch": 0.59056, + "grad_norm": 0.20973610877990723, + "learning_rate": 8.588914210842455e-06, + "loss": 0.0031, + "step": 29528 + }, + { + "epoch": 0.5906, + "grad_norm": 1.0923898220062256, + "learning_rate": 8.587531932036334e-06, + "loss": 0.0151, + "step": 29530 + }, + { + "epoch": 0.59064, + "grad_norm": 2.474292755126953, + "learning_rate": 8.586149680767008e-06, + "loss": 0.0286, + "step": 29532 + }, + { + "epoch": 0.59068, + "grad_norm": 0.005621998570859432, + "learning_rate": 8.584767457061417e-06, + "loss": 0.0012, + "step": 29534 + }, + { + "epoch": 0.59072, + "grad_norm": 0.03607236221432686, + "learning_rate": 8.583385260946516e-06, + "loss": 0.0078, + "step": 29536 + }, + { + "epoch": 0.59076, + "grad_norm": 0.10005678236484528, + "learning_rate": 8.582003092449245e-06, + "loss": 0.0533, + "step": 29538 + }, + { + "epoch": 0.5908, + "grad_norm": 0.03642386198043823, + "learning_rate": 8.580620951596556e-06, + "loss": 0.0009, + "step": 29540 + }, + { + "epoch": 0.59084, + "grad_norm": 0.028721176087856293, + "learning_rate": 8.579238838415396e-06, + "loss": 0.0341, + "step": 29542 + }, + { + "epoch": 0.59088, + "grad_norm": 0.006531063932925463, + "learning_rate": 8.577856752932699e-06, + "loss": 0.0005, + "step": 29544 + }, + { + "epoch": 0.59092, + "grad_norm": 1.1070122718811035, + "learning_rate": 8.576474695175422e-06, + "loss": 0.1204, + "step": 29546 + }, + { + "epoch": 0.59096, + "grad_norm": 0.05344233289361, + "learning_rate": 8.575092665170503e-06, + "loss": 0.0032, + "step": 29548 + }, + { + "epoch": 0.591, + "grad_norm": 0.17269538342952728, + "learning_rate": 8.573710662944884e-06, + "loss": 0.003, + "step": 29550 + }, + { + "epoch": 0.59104, + "grad_norm": 0.4900006353855133, + "learning_rate": 8.572328688525513e-06, + "loss": 0.0085, + "step": 29552 + }, + { + "epoch": 0.59108, + "grad_norm": 1.6697988510131836, + "learning_rate": 8.570946741939325e-06, + "loss": 0.0469, + "step": 29554 + }, + { + "epoch": 0.59112, + "grad_norm": 0.7754396796226501, + "learning_rate": 8.56956482321327e-06, + "loss": 0.0105, + "step": 29556 + }, + { + "epoch": 0.59116, + "grad_norm": 1.5385688543319702, + "learning_rate": 8.568182932374284e-06, + "loss": 0.014, + "step": 29558 + }, + { + "epoch": 0.5912, + "grad_norm": 0.2440924197435379, + "learning_rate": 8.566801069449307e-06, + "loss": 0.0028, + "step": 29560 + }, + { + "epoch": 0.59124, + "grad_norm": 12.613072395324707, + "learning_rate": 8.565419234465284e-06, + "loss": 0.5285, + "step": 29562 + }, + { + "epoch": 0.59128, + "grad_norm": 8.599262237548828, + "learning_rate": 8.564037427449152e-06, + "loss": 0.1616, + "step": 29564 + }, + { + "epoch": 0.59132, + "grad_norm": 0.7231384515762329, + "learning_rate": 8.562655648427847e-06, + "loss": 0.0148, + "step": 29566 + }, + { + "epoch": 0.59136, + "grad_norm": 0.050117433071136475, + "learning_rate": 8.56127389742831e-06, + "loss": 0.001, + "step": 29568 + }, + { + "epoch": 0.5914, + "grad_norm": 0.09138685464859009, + "learning_rate": 8.559892174477478e-06, + "loss": 0.0015, + "step": 29570 + }, + { + "epoch": 0.59144, + "grad_norm": 0.07273485511541367, + "learning_rate": 8.558510479602293e-06, + "loss": 0.7942, + "step": 29572 + }, + { + "epoch": 0.59148, + "grad_norm": 0.41292163729667664, + "learning_rate": 8.557128812829689e-06, + "loss": 0.0051, + "step": 29574 + }, + { + "epoch": 0.59152, + "grad_norm": 0.40779417753219604, + "learning_rate": 8.555747174186597e-06, + "loss": 0.0091, + "step": 29576 + }, + { + "epoch": 0.59156, + "grad_norm": 0.07531381398439407, + "learning_rate": 8.55436556369996e-06, + "loss": 0.014, + "step": 29578 + }, + { + "epoch": 0.5916, + "grad_norm": 0.10028036683797836, + "learning_rate": 8.552983981396709e-06, + "loss": 0.0017, + "step": 29580 + }, + { + "epoch": 0.59164, + "grad_norm": 0.27066388726234436, + "learning_rate": 8.551602427303785e-06, + "loss": 0.0035, + "step": 29582 + }, + { + "epoch": 0.59168, + "grad_norm": 0.22972093522548676, + "learning_rate": 8.550220901448112e-06, + "loss": 0.0028, + "step": 29584 + }, + { + "epoch": 0.59172, + "grad_norm": 0.09465501457452774, + "learning_rate": 8.54883940385663e-06, + "loss": 0.0023, + "step": 29586 + }, + { + "epoch": 0.59176, + "grad_norm": 1.2450675964355469, + "learning_rate": 8.547457934556274e-06, + "loss": 0.0202, + "step": 29588 + }, + { + "epoch": 0.5918, + "grad_norm": 0.48208504915237427, + "learning_rate": 8.546076493573973e-06, + "loss": 0.0357, + "step": 29590 + }, + { + "epoch": 0.59184, + "grad_norm": 0.0009340498363599181, + "learning_rate": 8.544695080936654e-06, + "loss": 0.0006, + "step": 29592 + }, + { + "epoch": 0.59188, + "grad_norm": 0.5890219807624817, + "learning_rate": 8.543313696671259e-06, + "loss": 0.0093, + "step": 29594 + }, + { + "epoch": 0.59192, + "grad_norm": 0.5934277176856995, + "learning_rate": 8.541932340804709e-06, + "loss": 0.0074, + "step": 29596 + }, + { + "epoch": 0.59196, + "grad_norm": 0.04322314262390137, + "learning_rate": 8.540551013363942e-06, + "loss": 0.0652, + "step": 29598 + }, + { + "epoch": 0.592, + "grad_norm": 1.7086842060089111, + "learning_rate": 8.539169714375885e-06, + "loss": 0.0356, + "step": 29600 + }, + { + "epoch": 0.59204, + "grad_norm": 0.04891900718212128, + "learning_rate": 8.537788443867465e-06, + "loss": 0.0011, + "step": 29602 + }, + { + "epoch": 0.59208, + "grad_norm": 0.21994523704051971, + "learning_rate": 8.536407201865613e-06, + "loss": 0.0131, + "step": 29604 + }, + { + "epoch": 0.59212, + "grad_norm": 3.067214250564575, + "learning_rate": 8.535025988397258e-06, + "loss": 0.0542, + "step": 29606 + }, + { + "epoch": 0.59216, + "grad_norm": 2.0830676555633545, + "learning_rate": 8.533644803489319e-06, + "loss": 0.034, + "step": 29608 + }, + { + "epoch": 0.5922, + "grad_norm": 3.2084193229675293, + "learning_rate": 8.532263647168735e-06, + "loss": 0.0482, + "step": 29610 + }, + { + "epoch": 0.59224, + "grad_norm": 0.04643159359693527, + "learning_rate": 8.530882519462422e-06, + "loss": 0.0011, + "step": 29612 + }, + { + "epoch": 0.59228, + "grad_norm": 1.1861649751663208, + "learning_rate": 8.529501420397315e-06, + "loss": 0.0141, + "step": 29614 + }, + { + "epoch": 0.59232, + "grad_norm": 0.05424069985747337, + "learning_rate": 8.528120350000336e-06, + "loss": 0.251, + "step": 29616 + }, + { + "epoch": 0.59236, + "grad_norm": 0.11830441653728485, + "learning_rate": 8.526739308298406e-06, + "loss": 0.0033, + "step": 29618 + }, + { + "epoch": 0.5924, + "grad_norm": 0.08997233957052231, + "learning_rate": 8.525358295318454e-06, + "loss": 0.0127, + "step": 29620 + }, + { + "epoch": 0.59244, + "grad_norm": 0.5073150992393494, + "learning_rate": 8.523977311087399e-06, + "loss": 0.0061, + "step": 29622 + }, + { + "epoch": 0.59248, + "grad_norm": 1.2451092004776, + "learning_rate": 8.522596355632164e-06, + "loss": 0.017, + "step": 29624 + }, + { + "epoch": 0.59252, + "grad_norm": 0.1040751114487648, + "learning_rate": 8.521215428979679e-06, + "loss": 0.0253, + "step": 29626 + }, + { + "epoch": 0.59256, + "grad_norm": 0.1307157725095749, + "learning_rate": 8.519834531156854e-06, + "loss": 0.002, + "step": 29628 + }, + { + "epoch": 0.5926, + "grad_norm": 0.11678148806095123, + "learning_rate": 8.518453662190622e-06, + "loss": 0.0599, + "step": 29630 + }, + { + "epoch": 0.59264, + "grad_norm": 0.19273856282234192, + "learning_rate": 8.517072822107898e-06, + "loss": 0.0033, + "step": 29632 + }, + { + "epoch": 0.59268, + "grad_norm": 0.5048583149909973, + "learning_rate": 8.5156920109356e-06, + "loss": 0.049, + "step": 29634 + }, + { + "epoch": 0.59272, + "grad_norm": 0.01700177602469921, + "learning_rate": 8.514311228700655e-06, + "loss": 0.0007, + "step": 29636 + }, + { + "epoch": 0.59276, + "grad_norm": 4.0047688484191895, + "learning_rate": 8.512930475429978e-06, + "loss": 0.046, + "step": 29638 + }, + { + "epoch": 0.5928, + "grad_norm": 0.006990041583776474, + "learning_rate": 8.511549751150478e-06, + "loss": 0.0003, + "step": 29640 + }, + { + "epoch": 0.59284, + "grad_norm": 0.010473258793354034, + "learning_rate": 8.510169055889091e-06, + "loss": 0.0011, + "step": 29642 + }, + { + "epoch": 0.59288, + "grad_norm": 0.006811855360865593, + "learning_rate": 8.50878838967272e-06, + "loss": 0.0007, + "step": 29644 + }, + { + "epoch": 0.59292, + "grad_norm": 0.18271081149578094, + "learning_rate": 8.507407752528289e-06, + "loss": 0.0053, + "step": 29646 + }, + { + "epoch": 0.59296, + "grad_norm": 4.486298561096191, + "learning_rate": 8.506027144482713e-06, + "loss": 0.0733, + "step": 29648 + }, + { + "epoch": 0.593, + "grad_norm": 0.2905728816986084, + "learning_rate": 8.504646565562907e-06, + "loss": 0.0224, + "step": 29650 + }, + { + "epoch": 0.59304, + "grad_norm": 0.04207836464047432, + "learning_rate": 8.503266015795783e-06, + "loss": 0.0048, + "step": 29652 + }, + { + "epoch": 0.59308, + "grad_norm": 0.49115100502967834, + "learning_rate": 8.501885495208263e-06, + "loss": 0.0173, + "step": 29654 + }, + { + "epoch": 0.59312, + "grad_norm": 0.03868307173252106, + "learning_rate": 8.500505003827254e-06, + "loss": 0.0008, + "step": 29656 + }, + { + "epoch": 0.59316, + "grad_norm": 0.002161839045584202, + "learning_rate": 8.499124541679674e-06, + "loss": 0.0058, + "step": 29658 + }, + { + "epoch": 0.5932, + "grad_norm": 0.09863412380218506, + "learning_rate": 8.49774410879243e-06, + "loss": 0.0021, + "step": 29660 + }, + { + "epoch": 0.59324, + "grad_norm": 9.415827662451193e-05, + "learning_rate": 8.496363705192441e-06, + "loss": 0.0073, + "step": 29662 + }, + { + "epoch": 0.59328, + "grad_norm": 12.903849601745605, + "learning_rate": 8.494983330906617e-06, + "loss": 0.1758, + "step": 29664 + }, + { + "epoch": 0.59332, + "grad_norm": 0.6558539867401123, + "learning_rate": 8.493602985961861e-06, + "loss": 0.0114, + "step": 29666 + }, + { + "epoch": 0.59336, + "grad_norm": 0.6997925639152527, + "learning_rate": 8.492222670385097e-06, + "loss": 0.0093, + "step": 29668 + }, + { + "epoch": 0.5934, + "grad_norm": 0.07838628441095352, + "learning_rate": 8.490842384203227e-06, + "loss": 0.01, + "step": 29670 + }, + { + "epoch": 0.59344, + "grad_norm": 0.14655491709709167, + "learning_rate": 8.489462127443163e-06, + "loss": 0.0034, + "step": 29672 + }, + { + "epoch": 0.59348, + "grad_norm": 0.1429615616798401, + "learning_rate": 8.48808190013181e-06, + "loss": 0.0812, + "step": 29674 + }, + { + "epoch": 0.59352, + "grad_norm": 3.928591251373291, + "learning_rate": 8.486701702296078e-06, + "loss": 0.0597, + "step": 29676 + }, + { + "epoch": 0.59356, + "grad_norm": 0.0023664848413318396, + "learning_rate": 8.485321533962877e-06, + "loss": 0.0168, + "step": 29678 + }, + { + "epoch": 0.5936, + "grad_norm": 0.07569558918476105, + "learning_rate": 8.483941395159114e-06, + "loss": 0.0014, + "step": 29680 + }, + { + "epoch": 0.59364, + "grad_norm": 0.004453673958778381, + "learning_rate": 8.48256128591169e-06, + "loss": 0.0171, + "step": 29682 + }, + { + "epoch": 0.59368, + "grad_norm": 0.07181800156831741, + "learning_rate": 8.481181206247517e-06, + "loss": 0.0013, + "step": 29684 + }, + { + "epoch": 0.59372, + "grad_norm": 3.6916558742523193, + "learning_rate": 8.4798011561935e-06, + "loss": 0.0487, + "step": 29686 + }, + { + "epoch": 0.59376, + "grad_norm": 0.3016667068004608, + "learning_rate": 8.47842113577654e-06, + "loss": 0.7769, + "step": 29688 + }, + { + "epoch": 0.5938, + "grad_norm": 0.028274578973650932, + "learning_rate": 8.477041145023546e-06, + "loss": 0.0011, + "step": 29690 + }, + { + "epoch": 0.59384, + "grad_norm": 0.1687939167022705, + "learning_rate": 8.475661183961415e-06, + "loss": 0.1079, + "step": 29692 + }, + { + "epoch": 0.59388, + "grad_norm": 12.198387145996094, + "learning_rate": 8.474281252617056e-06, + "loss": 0.4693, + "step": 29694 + }, + { + "epoch": 0.59392, + "grad_norm": 10.484233856201172, + "learning_rate": 8.472901351017371e-06, + "loss": 0.1636, + "step": 29696 + }, + { + "epoch": 0.59396, + "grad_norm": 11.850851058959961, + "learning_rate": 8.471521479189255e-06, + "loss": 0.4113, + "step": 29698 + }, + { + "epoch": 0.594, + "grad_norm": 1.3147449493408203, + "learning_rate": 8.47014163715962e-06, + "loss": 0.0154, + "step": 29700 + }, + { + "epoch": 0.59404, + "grad_norm": 0.554701566696167, + "learning_rate": 8.46876182495536e-06, + "loss": 0.0068, + "step": 29702 + }, + { + "epoch": 0.59408, + "grad_norm": 0.010147090069949627, + "learning_rate": 8.467382042603377e-06, + "loss": 0.0052, + "step": 29704 + }, + { + "epoch": 0.59412, + "grad_norm": 0.05870923027396202, + "learning_rate": 8.466002290130572e-06, + "loss": 0.003, + "step": 29706 + }, + { + "epoch": 0.59416, + "grad_norm": 0.26755738258361816, + "learning_rate": 8.464622567563836e-06, + "loss": 0.0036, + "step": 29708 + }, + { + "epoch": 0.5942, + "grad_norm": 0.0658513531088829, + "learning_rate": 8.46324287493008e-06, + "loss": 0.0724, + "step": 29710 + }, + { + "epoch": 0.59424, + "grad_norm": 0.28718483448028564, + "learning_rate": 8.461863212256192e-06, + "loss": 0.0088, + "step": 29712 + }, + { + "epoch": 0.59428, + "grad_norm": 10.547172546386719, + "learning_rate": 8.46048357956907e-06, + "loss": 0.3027, + "step": 29714 + }, + { + "epoch": 0.59432, + "grad_norm": 0.07442519813776016, + "learning_rate": 8.459103976895619e-06, + "loss": 0.0042, + "step": 29716 + }, + { + "epoch": 0.59436, + "grad_norm": 3.44236159324646, + "learning_rate": 8.457724404262725e-06, + "loss": 0.2022, + "step": 29718 + }, + { + "epoch": 0.5944, + "grad_norm": 0.14170297980308533, + "learning_rate": 8.45634486169729e-06, + "loss": 0.0093, + "step": 29720 + }, + { + "epoch": 0.59444, + "grad_norm": 0.588088870048523, + "learning_rate": 8.454965349226206e-06, + "loss": 0.4179, + "step": 29722 + }, + { + "epoch": 0.59448, + "grad_norm": 0.3594462275505066, + "learning_rate": 8.453585866876367e-06, + "loss": 0.0074, + "step": 29724 + }, + { + "epoch": 0.59452, + "grad_norm": 0.08008204400539398, + "learning_rate": 8.452206414674671e-06, + "loss": 0.1065, + "step": 29726 + }, + { + "epoch": 0.59456, + "grad_norm": 0.2445383369922638, + "learning_rate": 8.450826992648006e-06, + "loss": 0.0074, + "step": 29728 + }, + { + "epoch": 0.5946, + "grad_norm": 0.04395482316613197, + "learning_rate": 8.449447600823262e-06, + "loss": 0.0123, + "step": 29730 + }, + { + "epoch": 0.59464, + "grad_norm": 0.3643282353878021, + "learning_rate": 8.448068239227341e-06, + "loss": 0.0085, + "step": 29732 + }, + { + "epoch": 0.59468, + "grad_norm": 6.322700500488281, + "learning_rate": 8.446688907887123e-06, + "loss": 0.1076, + "step": 29734 + }, + { + "epoch": 0.59472, + "grad_norm": 0.3959144055843353, + "learning_rate": 8.445309606829511e-06, + "loss": 0.0091, + "step": 29736 + }, + { + "epoch": 0.59476, + "grad_norm": 1.234972357749939, + "learning_rate": 8.443930336081387e-06, + "loss": 0.0196, + "step": 29738 + }, + { + "epoch": 0.5948, + "grad_norm": 1.328403353691101, + "learning_rate": 8.44255109566964e-06, + "loss": 0.0241, + "step": 29740 + }, + { + "epoch": 0.59484, + "grad_norm": 0.09883800148963928, + "learning_rate": 8.441171885621163e-06, + "loss": 0.0059, + "step": 29742 + }, + { + "epoch": 0.59488, + "grad_norm": 0.1543036550283432, + "learning_rate": 8.439792705962844e-06, + "loss": 0.0037, + "step": 29744 + }, + { + "epoch": 0.59492, + "grad_norm": 0.02391796000301838, + "learning_rate": 8.438413556721564e-06, + "loss": 0.0026, + "step": 29746 + }, + { + "epoch": 0.59496, + "grad_norm": 1.997132420539856, + "learning_rate": 8.437034437924222e-06, + "loss": 0.0296, + "step": 29748 + }, + { + "epoch": 0.595, + "grad_norm": 1.2042282819747925, + "learning_rate": 8.43565534959769e-06, + "loss": 0.0191, + "step": 29750 + }, + { + "epoch": 0.59504, + "grad_norm": 0.09623416513204575, + "learning_rate": 8.43427629176887e-06, + "loss": 0.0091, + "step": 29752 + }, + { + "epoch": 0.59508, + "grad_norm": 0.04597938060760498, + "learning_rate": 8.432897264464639e-06, + "loss": 0.0014, + "step": 29754 + }, + { + "epoch": 0.59512, + "grad_norm": 0.07319008558988571, + "learning_rate": 8.43151826771188e-06, + "loss": 0.0059, + "step": 29756 + }, + { + "epoch": 0.59516, + "grad_norm": 0.19936437904834747, + "learning_rate": 8.43013930153748e-06, + "loss": 0.0065, + "step": 29758 + }, + { + "epoch": 0.5952, + "grad_norm": 0.2398824542760849, + "learning_rate": 8.428760365968327e-06, + "loss": 0.0073, + "step": 29760 + }, + { + "epoch": 0.59524, + "grad_norm": 0.0655774474143982, + "learning_rate": 8.427381461031291e-06, + "loss": 0.0113, + "step": 29762 + }, + { + "epoch": 0.59528, + "grad_norm": 0.21942001581192017, + "learning_rate": 8.42600258675327e-06, + "loss": 0.0074, + "step": 29764 + }, + { + "epoch": 0.59532, + "grad_norm": 0.7035091519355774, + "learning_rate": 8.424623743161136e-06, + "loss": 0.0113, + "step": 29766 + }, + { + "epoch": 0.59536, + "grad_norm": 0.09619949758052826, + "learning_rate": 8.423244930281774e-06, + "loss": 0.0033, + "step": 29768 + }, + { + "epoch": 0.5954, + "grad_norm": 0.12052619457244873, + "learning_rate": 8.421866148142066e-06, + "loss": 0.0038, + "step": 29770 + }, + { + "epoch": 0.59544, + "grad_norm": 3.378112554550171, + "learning_rate": 8.420487396768887e-06, + "loss": 0.158, + "step": 29772 + }, + { + "epoch": 0.59548, + "grad_norm": 3.015493392944336, + "learning_rate": 8.419108676189121e-06, + "loss": 0.0609, + "step": 29774 + }, + { + "epoch": 0.59552, + "grad_norm": 0.3161105215549469, + "learning_rate": 8.417729986429642e-06, + "loss": 0.0106, + "step": 29776 + }, + { + "epoch": 0.59556, + "grad_norm": 0.13945914804935455, + "learning_rate": 8.416351327517336e-06, + "loss": 0.0034, + "step": 29778 + }, + { + "epoch": 0.5956, + "grad_norm": 0.4278077781200409, + "learning_rate": 8.414972699479076e-06, + "loss": 0.004, + "step": 29780 + }, + { + "epoch": 0.59564, + "grad_norm": 0.4876065254211426, + "learning_rate": 8.413594102341735e-06, + "loss": 0.0075, + "step": 29782 + }, + { + "epoch": 0.59568, + "grad_norm": 0.004188814666122198, + "learning_rate": 8.412215536132197e-06, + "loss": 0.0004, + "step": 29784 + }, + { + "epoch": 0.59572, + "grad_norm": 0.08250216394662857, + "learning_rate": 8.410837000877335e-06, + "loss": 0.0011, + "step": 29786 + }, + { + "epoch": 0.59576, + "grad_norm": 0.014267437160015106, + "learning_rate": 8.409458496604019e-06, + "loss": 0.1897, + "step": 29788 + }, + { + "epoch": 0.5958, + "grad_norm": 0.12471506744623184, + "learning_rate": 8.408080023339134e-06, + "loss": 0.0021, + "step": 29790 + }, + { + "epoch": 0.59584, + "grad_norm": 0.1854293793439865, + "learning_rate": 8.406701581109547e-06, + "loss": 0.0074, + "step": 29792 + }, + { + "epoch": 0.59588, + "grad_norm": 0.7587574124336243, + "learning_rate": 8.405323169942133e-06, + "loss": 0.0132, + "step": 29794 + }, + { + "epoch": 0.59592, + "grad_norm": 0.32321205735206604, + "learning_rate": 8.403944789863766e-06, + "loss": 0.0167, + "step": 29796 + }, + { + "epoch": 0.59596, + "grad_norm": 6.729931831359863, + "learning_rate": 8.402566440901312e-06, + "loss": 0.1261, + "step": 29798 + }, + { + "epoch": 0.596, + "grad_norm": 0.02082648314535618, + "learning_rate": 8.401188123081653e-06, + "loss": 0.001, + "step": 29800 + }, + { + "epoch": 0.59604, + "grad_norm": 0.5194290280342102, + "learning_rate": 8.399809836431654e-06, + "loss": 0.0058, + "step": 29802 + }, + { + "epoch": 0.59608, + "grad_norm": 0.05654335767030716, + "learning_rate": 8.398431580978181e-06, + "loss": 0.0008, + "step": 29804 + }, + { + "epoch": 0.59612, + "grad_norm": 0.17170588672161102, + "learning_rate": 8.397053356748115e-06, + "loss": 0.0985, + "step": 29806 + }, + { + "epoch": 0.59616, + "grad_norm": 0.6287344694137573, + "learning_rate": 8.395675163768316e-06, + "loss": 0.0122, + "step": 29808 + }, + { + "epoch": 0.5962, + "grad_norm": 0.11846911907196045, + "learning_rate": 8.394297002065658e-06, + "loss": 0.0039, + "step": 29810 + }, + { + "epoch": 0.59624, + "grad_norm": 0.012574316002428532, + "learning_rate": 8.392918871667006e-06, + "loss": 0.0005, + "step": 29812 + }, + { + "epoch": 0.59628, + "grad_norm": 0.08763652294874191, + "learning_rate": 8.391540772599224e-06, + "loss": 0.0013, + "step": 29814 + }, + { + "epoch": 0.59632, + "grad_norm": 0.03848515450954437, + "learning_rate": 8.390162704889186e-06, + "loss": 0.0006, + "step": 29816 + }, + { + "epoch": 0.59636, + "grad_norm": 0.00893150083720684, + "learning_rate": 8.388784668563757e-06, + "loss": 0.0032, + "step": 29818 + }, + { + "epoch": 0.5964, + "grad_norm": 0.03951844573020935, + "learning_rate": 8.387406663649796e-06, + "loss": 0.0044, + "step": 29820 + }, + { + "epoch": 0.59644, + "grad_norm": 0.015027003362774849, + "learning_rate": 8.386028690174176e-06, + "loss": 0.0009, + "step": 29822 + }, + { + "epoch": 0.59648, + "grad_norm": 0.12700532376766205, + "learning_rate": 8.384650748163754e-06, + "loss": 0.0016, + "step": 29824 + }, + { + "epoch": 0.59652, + "grad_norm": 0.07538685947656631, + "learning_rate": 8.3832728376454e-06, + "loss": 0.0018, + "step": 29826 + }, + { + "epoch": 0.59656, + "grad_norm": 0.20017577707767487, + "learning_rate": 8.381894958645976e-06, + "loss": 0.0025, + "step": 29828 + }, + { + "epoch": 0.5966, + "grad_norm": 0.047809626907110214, + "learning_rate": 8.380517111192336e-06, + "loss": 0.0009, + "step": 29830 + }, + { + "epoch": 0.59664, + "grad_norm": 0.043080270290374756, + "learning_rate": 8.379139295311355e-06, + "loss": 0.0009, + "step": 29832 + }, + { + "epoch": 0.59668, + "grad_norm": 0.01112732570618391, + "learning_rate": 8.377761511029888e-06, + "loss": 0.001, + "step": 29834 + }, + { + "epoch": 0.59672, + "grad_norm": 0.008582981303334236, + "learning_rate": 8.37638375837479e-06, + "loss": 0.0017, + "step": 29836 + }, + { + "epoch": 0.59676, + "grad_norm": 0.09549646824598312, + "learning_rate": 8.375006037372932e-06, + "loss": 0.0015, + "step": 29838 + }, + { + "epoch": 0.5968, + "grad_norm": 0.09510376304388046, + "learning_rate": 8.373628348051165e-06, + "loss": 0.0017, + "step": 29840 + }, + { + "epoch": 0.59684, + "grad_norm": 0.10605904459953308, + "learning_rate": 8.372250690436353e-06, + "loss": 0.0115, + "step": 29842 + }, + { + "epoch": 0.59688, + "grad_norm": 0.19817869365215302, + "learning_rate": 8.37087306455535e-06, + "loss": 0.0191, + "step": 29844 + }, + { + "epoch": 0.59692, + "grad_norm": 0.5197306871414185, + "learning_rate": 8.369495470435014e-06, + "loss": 0.0057, + "step": 29846 + }, + { + "epoch": 0.59696, + "grad_norm": 0.01713869906961918, + "learning_rate": 8.368117908102207e-06, + "loss": 0.001, + "step": 29848 + }, + { + "epoch": 0.597, + "grad_norm": 0.043021466583013535, + "learning_rate": 8.366740377583781e-06, + "loss": 0.0032, + "step": 29850 + }, + { + "epoch": 0.59704, + "grad_norm": 0.5223146080970764, + "learning_rate": 8.365362878906588e-06, + "loss": 0.0527, + "step": 29852 + }, + { + "epoch": 0.59708, + "grad_norm": 0.2905948758125305, + "learning_rate": 8.363985412097491e-06, + "loss": 0.0072, + "step": 29854 + }, + { + "epoch": 0.59712, + "grad_norm": 0.06876882910728455, + "learning_rate": 8.362607977183338e-06, + "loss": 0.0219, + "step": 29856 + }, + { + "epoch": 0.59716, + "grad_norm": 3.1686458587646484, + "learning_rate": 8.361230574190988e-06, + "loss": 0.866, + "step": 29858 + }, + { + "epoch": 0.5972, + "grad_norm": 12.61536693572998, + "learning_rate": 8.35985320314729e-06, + "loss": 0.5283, + "step": 29860 + }, + { + "epoch": 0.59724, + "grad_norm": 2.8275129795074463, + "learning_rate": 8.3584758640791e-06, + "loss": 0.0473, + "step": 29862 + }, + { + "epoch": 0.59728, + "grad_norm": 0.08792532980442047, + "learning_rate": 8.357098557013269e-06, + "loss": 0.016, + "step": 29864 + }, + { + "epoch": 0.59732, + "grad_norm": 6.712808609008789, + "learning_rate": 8.355721281976646e-06, + "loss": 0.1758, + "step": 29866 + }, + { + "epoch": 0.59736, + "grad_norm": 0.849624752998352, + "learning_rate": 8.35434403899608e-06, + "loss": 0.0209, + "step": 29868 + }, + { + "epoch": 0.5974, + "grad_norm": 4.193872451782227, + "learning_rate": 8.352966828098428e-06, + "loss": 0.4629, + "step": 29870 + }, + { + "epoch": 0.59744, + "grad_norm": 0.022380635142326355, + "learning_rate": 8.351589649310532e-06, + "loss": 0.0528, + "step": 29872 + }, + { + "epoch": 0.59748, + "grad_norm": 0.05445457622408867, + "learning_rate": 8.350212502659249e-06, + "loss": 0.0035, + "step": 29874 + }, + { + "epoch": 0.59752, + "grad_norm": 0.044524986296892166, + "learning_rate": 8.348835388171421e-06, + "loss": 0.0007, + "step": 29876 + }, + { + "epoch": 0.59756, + "grad_norm": 0.033269189298152924, + "learning_rate": 8.347458305873897e-06, + "loss": 0.0586, + "step": 29878 + }, + { + "epoch": 0.5976, + "grad_norm": 0.039992060512304306, + "learning_rate": 8.346081255793524e-06, + "loss": 0.0037, + "step": 29880 + }, + { + "epoch": 0.59764, + "grad_norm": 0.9082234501838684, + "learning_rate": 8.344704237957146e-06, + "loss": 0.0131, + "step": 29882 + }, + { + "epoch": 0.59768, + "grad_norm": 0.019944051280617714, + "learning_rate": 8.343327252391616e-06, + "loss": 0.0303, + "step": 29884 + }, + { + "epoch": 0.59772, + "grad_norm": 0.09714611619710922, + "learning_rate": 8.341950299123772e-06, + "loss": 0.0023, + "step": 29886 + }, + { + "epoch": 0.59776, + "grad_norm": 14.000515937805176, + "learning_rate": 8.340573378180457e-06, + "loss": 0.67, + "step": 29888 + }, + { + "epoch": 0.5978, + "grad_norm": 0.12927015125751495, + "learning_rate": 8.339196489588522e-06, + "loss": 0.0035, + "step": 29890 + }, + { + "epoch": 0.59784, + "grad_norm": 0.02781982161104679, + "learning_rate": 8.337819633374809e-06, + "loss": 0.002, + "step": 29892 + }, + { + "epoch": 0.59788, + "grad_norm": 0.010186122730374336, + "learning_rate": 8.336442809566153e-06, + "loss": 0.0073, + "step": 29894 + }, + { + "epoch": 0.59792, + "grad_norm": 0.103156678378582, + "learning_rate": 8.335066018189404e-06, + "loss": 0.0058, + "step": 29896 + }, + { + "epoch": 0.59796, + "grad_norm": 4.5638041496276855, + "learning_rate": 8.333689259271396e-06, + "loss": 0.0655, + "step": 29898 + }, + { + "epoch": 0.598, + "grad_norm": 0.02173244208097458, + "learning_rate": 8.332312532838978e-06, + "loss": 0.0021, + "step": 29900 + }, + { + "epoch": 0.59804, + "grad_norm": 0.4008151590824127, + "learning_rate": 8.330935838918986e-06, + "loss": 0.0168, + "step": 29902 + }, + { + "epoch": 0.59808, + "grad_norm": 0.21552854776382446, + "learning_rate": 8.329559177538255e-06, + "loss": 0.0103, + "step": 29904 + }, + { + "epoch": 0.59812, + "grad_norm": 0.04593251273036003, + "learning_rate": 8.32818254872363e-06, + "loss": 0.0195, + "step": 29906 + }, + { + "epoch": 0.59816, + "grad_norm": 6.643938064575195, + "learning_rate": 8.326805952501951e-06, + "loss": 0.1005, + "step": 29908 + }, + { + "epoch": 0.5982, + "grad_norm": 0.27445927262306213, + "learning_rate": 8.325429388900046e-06, + "loss": 0.0098, + "step": 29910 + }, + { + "epoch": 0.59824, + "grad_norm": 0.1942368596792221, + "learning_rate": 8.32405285794476e-06, + "loss": 0.2064, + "step": 29912 + }, + { + "epoch": 0.59828, + "grad_norm": 2.55977463722229, + "learning_rate": 8.322676359662922e-06, + "loss": 0.0289, + "step": 29914 + }, + { + "epoch": 0.59832, + "grad_norm": 0.01899183727800846, + "learning_rate": 8.321299894081377e-06, + "loss": 0.0339, + "step": 29916 + }, + { + "epoch": 0.59836, + "grad_norm": 0.02422706037759781, + "learning_rate": 8.319923461226956e-06, + "loss": 0.0004, + "step": 29918 + }, + { + "epoch": 0.5984, + "grad_norm": 0.31536903977394104, + "learning_rate": 8.318547061126485e-06, + "loss": 0.0045, + "step": 29920 + }, + { + "epoch": 0.59844, + "grad_norm": 0.19084873795509338, + "learning_rate": 8.31717069380681e-06, + "loss": 0.0055, + "step": 29922 + }, + { + "epoch": 0.59848, + "grad_norm": 5.518856525421143, + "learning_rate": 8.315794359294759e-06, + "loss": 0.0685, + "step": 29924 + }, + { + "epoch": 0.59852, + "grad_norm": 0.11453299224376678, + "learning_rate": 8.31441805761716e-06, + "loss": 0.0161, + "step": 29926 + }, + { + "epoch": 0.59856, + "grad_norm": 0.017163531854748726, + "learning_rate": 8.313041788800852e-06, + "loss": 0.0052, + "step": 29928 + }, + { + "epoch": 0.5986, + "grad_norm": 5.269928455352783, + "learning_rate": 8.311665552872662e-06, + "loss": 0.072, + "step": 29930 + }, + { + "epoch": 0.59864, + "grad_norm": 0.057147376239299774, + "learning_rate": 8.310289349859421e-06, + "loss": 0.0881, + "step": 29932 + }, + { + "epoch": 0.59868, + "grad_norm": 0.04384542629122734, + "learning_rate": 8.308913179787962e-06, + "loss": 0.0427, + "step": 29934 + }, + { + "epoch": 0.59872, + "grad_norm": 0.0036618609447032213, + "learning_rate": 8.307537042685105e-06, + "loss": 0.0019, + "step": 29936 + }, + { + "epoch": 0.59876, + "grad_norm": 0.09829560667276382, + "learning_rate": 8.30616093857769e-06, + "loss": 0.0015, + "step": 29938 + }, + { + "epoch": 0.5988, + "grad_norm": 0.23029106855392456, + "learning_rate": 8.30478486749254e-06, + "loss": 0.0051, + "step": 29940 + }, + { + "epoch": 0.59884, + "grad_norm": 0.043722789734601974, + "learning_rate": 8.303408829456476e-06, + "loss": 0.003, + "step": 29942 + }, + { + "epoch": 0.59888, + "grad_norm": 1.264432430267334, + "learning_rate": 8.302032824496335e-06, + "loss": 0.0154, + "step": 29944 + }, + { + "epoch": 0.59892, + "grad_norm": 0.03951273858547211, + "learning_rate": 8.300656852638938e-06, + "loss": 0.0018, + "step": 29946 + }, + { + "epoch": 0.59896, + "grad_norm": 0.16087055206298828, + "learning_rate": 8.29928091391111e-06, + "loss": 0.0025, + "step": 29948 + }, + { + "epoch": 0.599, + "grad_norm": 0.20028574764728546, + "learning_rate": 8.297905008339677e-06, + "loss": 0.0123, + "step": 29950 + }, + { + "epoch": 0.59904, + "grad_norm": 0.4836605191230774, + "learning_rate": 8.296529135951458e-06, + "loss": 0.0115, + "step": 29952 + }, + { + "epoch": 0.59908, + "grad_norm": 5.338066577911377, + "learning_rate": 8.295153296773287e-06, + "loss": 0.0927, + "step": 29954 + }, + { + "epoch": 0.59912, + "grad_norm": 0.5828953981399536, + "learning_rate": 8.293777490831976e-06, + "loss": 0.0046, + "step": 29956 + }, + { + "epoch": 0.59916, + "grad_norm": 0.040318481624126434, + "learning_rate": 8.29240171815435e-06, + "loss": 0.0004, + "step": 29958 + }, + { + "epoch": 0.5992, + "grad_norm": 0.14034731686115265, + "learning_rate": 8.291025978767236e-06, + "loss": 0.002, + "step": 29960 + }, + { + "epoch": 0.59924, + "grad_norm": 4.34777307510376, + "learning_rate": 8.289650272697447e-06, + "loss": 0.0464, + "step": 29962 + }, + { + "epoch": 0.59928, + "grad_norm": 0.01770027168095112, + "learning_rate": 8.288274599971809e-06, + "loss": 0.0032, + "step": 29964 + }, + { + "epoch": 0.59932, + "grad_norm": 0.018984071910381317, + "learning_rate": 8.286898960617138e-06, + "loss": 0.0006, + "step": 29966 + }, + { + "epoch": 0.59936, + "grad_norm": 0.10116297751665115, + "learning_rate": 8.285523354660249e-06, + "loss": 0.0046, + "step": 29968 + }, + { + "epoch": 0.5994, + "grad_norm": 3.0599732398986816, + "learning_rate": 8.284147782127971e-06, + "loss": 0.0309, + "step": 29970 + }, + { + "epoch": 0.59944, + "grad_norm": 0.7652086019515991, + "learning_rate": 8.282772243047115e-06, + "loss": 0.0128, + "step": 29972 + }, + { + "epoch": 0.59948, + "grad_norm": 3.514219045639038, + "learning_rate": 8.281396737444494e-06, + "loss": 0.0526, + "step": 29974 + }, + { + "epoch": 0.59952, + "grad_norm": 0.4322128891944885, + "learning_rate": 8.280021265346933e-06, + "loss": 0.0045, + "step": 29976 + }, + { + "epoch": 0.59956, + "grad_norm": 0.24766989052295685, + "learning_rate": 8.27864582678124e-06, + "loss": 0.0058, + "step": 29978 + }, + { + "epoch": 0.5996, + "grad_norm": 0.03401138633489609, + "learning_rate": 8.277270421774234e-06, + "loss": 0.0007, + "step": 29980 + }, + { + "epoch": 0.59964, + "grad_norm": 0.17247138917446136, + "learning_rate": 8.275895050352729e-06, + "loss": 0.0262, + "step": 29982 + }, + { + "epoch": 0.59968, + "grad_norm": 3.9883008003234863, + "learning_rate": 8.274519712543535e-06, + "loss": 0.0532, + "step": 29984 + }, + { + "epoch": 0.59972, + "grad_norm": 0.1343778371810913, + "learning_rate": 8.27314440837347e-06, + "loss": 0.0106, + "step": 29986 + }, + { + "epoch": 0.59976, + "grad_norm": 0.01745651662349701, + "learning_rate": 8.27176913786934e-06, + "loss": 0.0059, + "step": 29988 + }, + { + "epoch": 0.5998, + "grad_norm": 0.6962364315986633, + "learning_rate": 8.270393901057964e-06, + "loss": 0.0094, + "step": 29990 + }, + { + "epoch": 0.59984, + "grad_norm": 0.12524311244487762, + "learning_rate": 8.26901869796615e-06, + "loss": 0.0043, + "step": 29992 + }, + { + "epoch": 0.59988, + "grad_norm": 0.010182525962591171, + "learning_rate": 8.267643528620704e-06, + "loss": 0.0008, + "step": 29994 + }, + { + "epoch": 0.59992, + "grad_norm": 0.19074301421642303, + "learning_rate": 8.266268393048443e-06, + "loss": 0.0057, + "step": 29996 + }, + { + "epoch": 0.59996, + "grad_norm": 0.001413557562045753, + "learning_rate": 8.264893291276172e-06, + "loss": 0.005, + "step": 29998 + }, + { + "epoch": 0.6, + "grad_norm": 0.592971920967102, + "learning_rate": 8.263518223330698e-06, + "loss": 0.0083, + "step": 30000 + }, + { + "epoch": 0.60004, + "grad_norm": 0.029195476323366165, + "learning_rate": 8.262143189238832e-06, + "loss": 0.006, + "step": 30002 + }, + { + "epoch": 0.60008, + "grad_norm": 0.12349887192249298, + "learning_rate": 8.260768189027376e-06, + "loss": 0.0015, + "step": 30004 + }, + { + "epoch": 0.60012, + "grad_norm": 0.3000310957431793, + "learning_rate": 8.259393222723141e-06, + "loss": 0.0051, + "step": 30006 + }, + { + "epoch": 0.60016, + "grad_norm": 0.0016719391569495201, + "learning_rate": 8.258018290352934e-06, + "loss": 0.3732, + "step": 30008 + }, + { + "epoch": 0.6002, + "grad_norm": 0.16787534952163696, + "learning_rate": 8.25664339194355e-06, + "loss": 0.0021, + "step": 30010 + }, + { + "epoch": 0.60024, + "grad_norm": 0.0008227517828345299, + "learning_rate": 8.255268527521807e-06, + "loss": 0.0006, + "step": 30012 + }, + { + "epoch": 0.60028, + "grad_norm": 0.01012949924916029, + "learning_rate": 8.253893697114504e-06, + "loss": 0.0002, + "step": 30014 + }, + { + "epoch": 0.60032, + "grad_norm": 0.6776123642921448, + "learning_rate": 8.252518900748438e-06, + "loss": 0.021, + "step": 30016 + }, + { + "epoch": 0.60036, + "grad_norm": 0.8290019035339355, + "learning_rate": 8.251144138450417e-06, + "loss": 0.0121, + "step": 30018 + }, + { + "epoch": 0.6004, + "grad_norm": 1.0841697454452515, + "learning_rate": 8.249769410247239e-06, + "loss": 0.0193, + "step": 30020 + }, + { + "epoch": 0.60044, + "grad_norm": 0.15798786282539368, + "learning_rate": 8.24839471616571e-06, + "loss": 0.0025, + "step": 30022 + }, + { + "epoch": 0.60048, + "grad_norm": 1.9906407594680786, + "learning_rate": 8.24702005623263e-06, + "loss": 0.0306, + "step": 30024 + }, + { + "epoch": 0.60052, + "grad_norm": 0.010168171487748623, + "learning_rate": 8.245645430474791e-06, + "loss": 0.0036, + "step": 30026 + }, + { + "epoch": 0.60056, + "grad_norm": 0.05872655659914017, + "learning_rate": 8.244270838919001e-06, + "loss": 0.0013, + "step": 30028 + }, + { + "epoch": 0.6006, + "grad_norm": 0.6501550674438477, + "learning_rate": 8.242896281592057e-06, + "loss": 0.0372, + "step": 30030 + }, + { + "epoch": 0.60064, + "grad_norm": 0.04125295206904411, + "learning_rate": 8.24152175852075e-06, + "loss": 0.0027, + "step": 30032 + }, + { + "epoch": 0.60068, + "grad_norm": 0.07928876578807831, + "learning_rate": 8.240147269731887e-06, + "loss": 0.0249, + "step": 30034 + }, + { + "epoch": 0.60072, + "grad_norm": 1.669029712677002, + "learning_rate": 8.238772815252252e-06, + "loss": 0.0192, + "step": 30036 + }, + { + "epoch": 0.60076, + "grad_norm": 0.012801926583051682, + "learning_rate": 8.237398395108653e-06, + "loss": 0.0015, + "step": 30038 + }, + { + "epoch": 0.6008, + "grad_norm": 0.00646992726251483, + "learning_rate": 8.236024009327879e-06, + "loss": 0.1897, + "step": 30040 + }, + { + "epoch": 0.60084, + "grad_norm": 0.08375166356563568, + "learning_rate": 8.234649657936723e-06, + "loss": 0.0023, + "step": 30042 + }, + { + "epoch": 0.60088, + "grad_norm": 0.09563011676073074, + "learning_rate": 8.23327534096198e-06, + "loss": 0.0038, + "step": 30044 + }, + { + "epoch": 0.60092, + "grad_norm": 0.17295952141284943, + "learning_rate": 8.23190105843045e-06, + "loss": 0.0023, + "step": 30046 + }, + { + "epoch": 0.60096, + "grad_norm": 0.0032808396499603987, + "learning_rate": 8.230526810368912e-06, + "loss": 0.0002, + "step": 30048 + }, + { + "epoch": 0.601, + "grad_norm": 0.046266425400972366, + "learning_rate": 8.22915259680417e-06, + "loss": 0.0044, + "step": 30050 + }, + { + "epoch": 0.60104, + "grad_norm": 0.15123960375785828, + "learning_rate": 8.227778417763005e-06, + "loss": 0.0028, + "step": 30052 + }, + { + "epoch": 0.60108, + "grad_norm": 0.12583403289318085, + "learning_rate": 8.226404273272216e-06, + "loss": 0.054, + "step": 30054 + }, + { + "epoch": 0.60112, + "grad_norm": 0.4299061596393585, + "learning_rate": 8.22503016335859e-06, + "loss": 0.0058, + "step": 30056 + }, + { + "epoch": 0.60116, + "grad_norm": 0.052774786949157715, + "learning_rate": 8.223656088048907e-06, + "loss": 0.0018, + "step": 30058 + }, + { + "epoch": 0.6012, + "grad_norm": 0.2387007772922516, + "learning_rate": 8.222282047369972e-06, + "loss": 0.0292, + "step": 30060 + }, + { + "epoch": 0.60124, + "grad_norm": 0.7416079640388489, + "learning_rate": 8.22090804134856e-06, + "loss": 0.012, + "step": 30062 + }, + { + "epoch": 0.60128, + "grad_norm": 0.04532059654593468, + "learning_rate": 8.219534070011459e-06, + "loss": 0.0019, + "step": 30064 + }, + { + "epoch": 0.60132, + "grad_norm": 4.378178119659424, + "learning_rate": 8.218160133385462e-06, + "loss": 0.0424, + "step": 30066 + }, + { + "epoch": 0.60136, + "grad_norm": 0.243277907371521, + "learning_rate": 8.21678623149735e-06, + "loss": 1.3355, + "step": 30068 + }, + { + "epoch": 0.6014, + "grad_norm": 0.012190325185656548, + "learning_rate": 8.215412364373908e-06, + "loss": 0.0106, + "step": 30070 + }, + { + "epoch": 0.60144, + "grad_norm": 0.1339280754327774, + "learning_rate": 8.214038532041921e-06, + "loss": 0.0017, + "step": 30072 + }, + { + "epoch": 0.60148, + "grad_norm": 0.5342984795570374, + "learning_rate": 8.21266473452817e-06, + "loss": 0.006, + "step": 30074 + }, + { + "epoch": 0.60152, + "grad_norm": 0.008449085988104343, + "learning_rate": 8.211290971859442e-06, + "loss": 0.0001, + "step": 30076 + }, + { + "epoch": 0.60156, + "grad_norm": 0.11327645927667618, + "learning_rate": 8.209917244062519e-06, + "loss": 0.0078, + "step": 30078 + }, + { + "epoch": 0.6016, + "grad_norm": 3.6598095893859863, + "learning_rate": 8.208543551164178e-06, + "loss": 0.0427, + "step": 30080 + }, + { + "epoch": 0.60164, + "grad_norm": 0.04206288978457451, + "learning_rate": 8.207169893191205e-06, + "loss": 0.0015, + "step": 30082 + }, + { + "epoch": 0.60168, + "grad_norm": 0.8718621134757996, + "learning_rate": 8.205796270170377e-06, + "loss": 0.0131, + "step": 30084 + }, + { + "epoch": 0.60172, + "grad_norm": 0.03728761151432991, + "learning_rate": 8.204422682128474e-06, + "loss": 0.0108, + "step": 30086 + }, + { + "epoch": 0.60176, + "grad_norm": 0.04786170274019241, + "learning_rate": 8.203049129092278e-06, + "loss": 0.0056, + "step": 30088 + }, + { + "epoch": 0.6018, + "grad_norm": 0.19472382962703705, + "learning_rate": 8.201675611088558e-06, + "loss": 0.0678, + "step": 30090 + }, + { + "epoch": 0.60184, + "grad_norm": 0.28382423520088196, + "learning_rate": 8.200302128144105e-06, + "loss": 0.0375, + "step": 30092 + }, + { + "epoch": 0.60188, + "grad_norm": 0.3828771412372589, + "learning_rate": 8.19892868028568e-06, + "loss": 0.0063, + "step": 30094 + }, + { + "epoch": 0.60192, + "grad_norm": 0.21681611239910126, + "learning_rate": 8.197555267540076e-06, + "loss": 0.0081, + "step": 30096 + }, + { + "epoch": 0.60196, + "grad_norm": 0.5747266411781311, + "learning_rate": 8.196181889934059e-06, + "loss": 0.0095, + "step": 30098 + }, + { + "epoch": 0.602, + "grad_norm": 0.010593094862997532, + "learning_rate": 8.194808547494401e-06, + "loss": 0.001, + "step": 30100 + }, + { + "epoch": 0.60204, + "grad_norm": 0.14979159832000732, + "learning_rate": 8.193435240247883e-06, + "loss": 0.0321, + "step": 30102 + }, + { + "epoch": 0.60208, + "grad_norm": 0.035458602011203766, + "learning_rate": 8.192061968221275e-06, + "loss": 0.0006, + "step": 30104 + }, + { + "epoch": 0.60212, + "grad_norm": 0.015367195010185242, + "learning_rate": 8.190688731441346e-06, + "loss": 0.0005, + "step": 30106 + }, + { + "epoch": 0.60216, + "grad_norm": 0.08713119477033615, + "learning_rate": 8.189315529934875e-06, + "loss": 0.0535, + "step": 30108 + }, + { + "epoch": 0.6022, + "grad_norm": 0.844511866569519, + "learning_rate": 8.187942363728626e-06, + "loss": 0.0105, + "step": 30110 + }, + { + "epoch": 0.60224, + "grad_norm": 2.7793140411376953, + "learning_rate": 8.186569232849377e-06, + "loss": 0.0352, + "step": 30112 + }, + { + "epoch": 0.60228, + "grad_norm": 0.6045851111412048, + "learning_rate": 8.185196137323897e-06, + "loss": 0.0282, + "step": 30114 + }, + { + "epoch": 0.60232, + "grad_norm": 0.2657265365123749, + "learning_rate": 8.183823077178948e-06, + "loss": 0.0327, + "step": 30116 + }, + { + "epoch": 0.60236, + "grad_norm": 0.003605189034715295, + "learning_rate": 8.182450052441304e-06, + "loss": 0.0004, + "step": 30118 + }, + { + "epoch": 0.6024, + "grad_norm": 0.013093413785099983, + "learning_rate": 8.181077063137733e-06, + "loss": 0.0648, + "step": 30120 + }, + { + "epoch": 0.60244, + "grad_norm": 2.0957062244415283, + "learning_rate": 8.179704109295e-06, + "loss": 0.0451, + "step": 30122 + }, + { + "epoch": 0.60248, + "grad_norm": 0.5395042896270752, + "learning_rate": 8.178331190939874e-06, + "loss": 0.0087, + "step": 30124 + }, + { + "epoch": 0.60252, + "grad_norm": 0.00538515904918313, + "learning_rate": 8.176958308099117e-06, + "loss": 0.0012, + "step": 30126 + }, + { + "epoch": 0.60256, + "grad_norm": 0.08338706940412521, + "learning_rate": 8.175585460799499e-06, + "loss": 0.002, + "step": 30128 + }, + { + "epoch": 0.6026, + "grad_norm": 0.2577911615371704, + "learning_rate": 8.174212649067781e-06, + "loss": 0.0051, + "step": 30130 + }, + { + "epoch": 0.60264, + "grad_norm": 0.11214730143547058, + "learning_rate": 8.172839872930725e-06, + "loss": 0.0163, + "step": 30132 + }, + { + "epoch": 0.60268, + "grad_norm": 0.023349234834313393, + "learning_rate": 8.1714671324151e-06, + "loss": 0.0014, + "step": 30134 + }, + { + "epoch": 0.60272, + "grad_norm": 3.7288169860839844, + "learning_rate": 8.170094427547663e-06, + "loss": 0.0349, + "step": 30136 + }, + { + "epoch": 0.60276, + "grad_norm": 0.276470810174942, + "learning_rate": 8.168721758355177e-06, + "loss": 0.0037, + "step": 30138 + }, + { + "epoch": 0.6028, + "grad_norm": 0.018818790093064308, + "learning_rate": 8.167349124864406e-06, + "loss": 0.176, + "step": 30140 + }, + { + "epoch": 0.60284, + "grad_norm": 4.466943740844727, + "learning_rate": 8.1659765271021e-06, + "loss": 0.0341, + "step": 30142 + }, + { + "epoch": 0.60288, + "grad_norm": 0.2574562132358551, + "learning_rate": 8.164603965095033e-06, + "loss": 0.0037, + "step": 30144 + }, + { + "epoch": 0.60292, + "grad_norm": 0.035599030554294586, + "learning_rate": 8.163231438869955e-06, + "loss": 0.0076, + "step": 30146 + }, + { + "epoch": 0.60296, + "grad_norm": 0.007641094736754894, + "learning_rate": 8.161858948453622e-06, + "loss": 0.0133, + "step": 30148 + }, + { + "epoch": 0.603, + "grad_norm": 0.05461018905043602, + "learning_rate": 8.1604864938728e-06, + "loss": 0.0009, + "step": 30150 + }, + { + "epoch": 0.60304, + "grad_norm": 0.058837272226810455, + "learning_rate": 8.15911407515424e-06, + "loss": 0.0722, + "step": 30152 + }, + { + "epoch": 0.60308, + "grad_norm": 0.034269850701093674, + "learning_rate": 8.157741692324697e-06, + "loss": 0.0006, + "step": 30154 + }, + { + "epoch": 0.60312, + "grad_norm": 0.0841434895992279, + "learning_rate": 8.15636934541093e-06, + "loss": 0.0015, + "step": 30156 + }, + { + "epoch": 0.60316, + "grad_norm": 0.04717846214771271, + "learning_rate": 8.154997034439688e-06, + "loss": 0.0017, + "step": 30158 + }, + { + "epoch": 0.6032, + "grad_norm": 0.068609818816185, + "learning_rate": 8.153624759437733e-06, + "loss": 0.0008, + "step": 30160 + }, + { + "epoch": 0.60324, + "grad_norm": 0.0456969328224659, + "learning_rate": 8.152252520431812e-06, + "loss": 0.0078, + "step": 30162 + }, + { + "epoch": 0.60328, + "grad_norm": 0.17097772657871246, + "learning_rate": 8.150880317448677e-06, + "loss": 0.0025, + "step": 30164 + }, + { + "epoch": 0.60332, + "grad_norm": 0.5035855770111084, + "learning_rate": 8.149508150515086e-06, + "loss": 0.005, + "step": 30166 + }, + { + "epoch": 0.60336, + "grad_norm": 0.7542459964752197, + "learning_rate": 8.148136019657787e-06, + "loss": 0.0097, + "step": 30168 + }, + { + "epoch": 0.6034, + "grad_norm": 1.1630561351776123, + "learning_rate": 8.146763924903527e-06, + "loss": 0.0122, + "step": 30170 + }, + { + "epoch": 0.60344, + "grad_norm": 0.21626624464988708, + "learning_rate": 8.14539186627906e-06, + "loss": 0.0547, + "step": 30172 + }, + { + "epoch": 0.60348, + "grad_norm": 0.7324731945991516, + "learning_rate": 8.144019843811131e-06, + "loss": 0.0076, + "step": 30174 + }, + { + "epoch": 0.60352, + "grad_norm": 0.5409705638885498, + "learning_rate": 8.142647857526494e-06, + "loss": 0.0075, + "step": 30176 + }, + { + "epoch": 0.60356, + "grad_norm": 0.017631063237786293, + "learning_rate": 8.141275907451891e-06, + "loss": 0.0012, + "step": 30178 + }, + { + "epoch": 0.6036, + "grad_norm": 0.5844282507896423, + "learning_rate": 8.139903993614069e-06, + "loss": 0.0065, + "step": 30180 + }, + { + "epoch": 0.60364, + "grad_norm": 0.007872704416513443, + "learning_rate": 8.138532116039781e-06, + "loss": 0.0008, + "step": 30182 + }, + { + "epoch": 0.60368, + "grad_norm": 0.13630345463752747, + "learning_rate": 8.137160274755765e-06, + "loss": 0.0027, + "step": 30184 + }, + { + "epoch": 0.60372, + "grad_norm": 0.20998160541057587, + "learning_rate": 8.135788469788769e-06, + "loss": 0.0033, + "step": 30186 + }, + { + "epoch": 0.60376, + "grad_norm": 0.0445094034075737, + "learning_rate": 8.134416701165537e-06, + "loss": 0.0011, + "step": 30188 + }, + { + "epoch": 0.6038, + "grad_norm": 7.365537166595459, + "learning_rate": 8.133044968912811e-06, + "loss": 0.1345, + "step": 30190 + }, + { + "epoch": 0.60384, + "grad_norm": 0.0039907339960336685, + "learning_rate": 8.131673273057337e-06, + "loss": 0.0006, + "step": 30192 + }, + { + "epoch": 0.60388, + "grad_norm": 0.008950859308242798, + "learning_rate": 8.130301613625853e-06, + "loss": 0.0073, + "step": 30194 + }, + { + "epoch": 0.60392, + "grad_norm": 0.0478266142308712, + "learning_rate": 8.128929990645099e-06, + "loss": 0.001, + "step": 30196 + }, + { + "epoch": 0.60396, + "grad_norm": 0.42339736223220825, + "learning_rate": 8.127558404141821e-06, + "loss": 0.004, + "step": 30198 + }, + { + "epoch": 0.604, + "grad_norm": 0.30022671818733215, + "learning_rate": 8.126186854142752e-06, + "loss": 0.0027, + "step": 30200 + }, + { + "epoch": 0.60404, + "grad_norm": 0.14839670062065125, + "learning_rate": 8.12481534067464e-06, + "loss": 0.0015, + "step": 30202 + }, + { + "epoch": 0.60408, + "grad_norm": 1.9808988571166992, + "learning_rate": 8.123443863764218e-06, + "loss": 0.014, + "step": 30204 + }, + { + "epoch": 0.60412, + "grad_norm": 0.0016335586551576853, + "learning_rate": 8.122072423438221e-06, + "loss": 0.0001, + "step": 30206 + }, + { + "epoch": 0.60416, + "grad_norm": 0.011198671534657478, + "learning_rate": 8.120701019723392e-06, + "loss": 0.0105, + "step": 30208 + }, + { + "epoch": 0.6042, + "grad_norm": 0.022668106481432915, + "learning_rate": 8.119329652646463e-06, + "loss": 0.002, + "step": 30210 + }, + { + "epoch": 0.60424, + "grad_norm": 0.052180562168359756, + "learning_rate": 8.117958322234168e-06, + "loss": 0.0008, + "step": 30212 + }, + { + "epoch": 0.60428, + "grad_norm": 4.284718990325928, + "learning_rate": 8.116587028513249e-06, + "loss": 0.0587, + "step": 30214 + }, + { + "epoch": 0.60432, + "grad_norm": 0.6376302242279053, + "learning_rate": 8.11521577151043e-06, + "loss": 0.0082, + "step": 30216 + }, + { + "epoch": 0.60436, + "grad_norm": 0.051784683018922806, + "learning_rate": 8.113844551252454e-06, + "loss": 0.127, + "step": 30218 + }, + { + "epoch": 0.6044, + "grad_norm": 0.17497703433036804, + "learning_rate": 8.112473367766051e-06, + "loss": 0.0022, + "step": 30220 + }, + { + "epoch": 0.60444, + "grad_norm": 0.32574179768562317, + "learning_rate": 8.111102221077947e-06, + "loss": 0.0027, + "step": 30222 + }, + { + "epoch": 0.60448, + "grad_norm": 0.03888704255223274, + "learning_rate": 8.109731111214882e-06, + "loss": 0.0012, + "step": 30224 + }, + { + "epoch": 0.60452, + "grad_norm": 0.0212875846773386, + "learning_rate": 8.10836003820358e-06, + "loss": 0.0047, + "step": 30226 + }, + { + "epoch": 0.60456, + "grad_norm": 0.022390129044651985, + "learning_rate": 8.10698900207077e-06, + "loss": 0.0012, + "step": 30228 + }, + { + "epoch": 0.6046, + "grad_norm": 0.06971040368080139, + "learning_rate": 8.10561800284319e-06, + "loss": 0.0008, + "step": 30230 + }, + { + "epoch": 0.60464, + "grad_norm": 12.223097801208496, + "learning_rate": 8.104247040547555e-06, + "loss": 0.1218, + "step": 30232 + }, + { + "epoch": 0.60468, + "grad_norm": 2.8321175575256348, + "learning_rate": 8.102876115210607e-06, + "loss": 0.0244, + "step": 30234 + }, + { + "epoch": 0.60472, + "grad_norm": 0.3695628345012665, + "learning_rate": 8.101505226859063e-06, + "loss": 0.0037, + "step": 30236 + }, + { + "epoch": 0.60476, + "grad_norm": 0.4487791955471039, + "learning_rate": 8.100134375519652e-06, + "loss": 0.0052, + "step": 30238 + }, + { + "epoch": 0.6048, + "grad_norm": 0.014707839116454124, + "learning_rate": 8.098763561219101e-06, + "loss": 0.0046, + "step": 30240 + }, + { + "epoch": 0.60484, + "grad_norm": 0.013935609720647335, + "learning_rate": 8.09739278398413e-06, + "loss": 0.0008, + "step": 30242 + }, + { + "epoch": 0.60488, + "grad_norm": 0.026638079434633255, + "learning_rate": 8.096022043841467e-06, + "loss": 0.0009, + "step": 30244 + }, + { + "epoch": 0.60492, + "grad_norm": 0.04540257528424263, + "learning_rate": 8.094651340817836e-06, + "loss": 0.0425, + "step": 30246 + }, + { + "epoch": 0.60496, + "grad_norm": 0.9490401148796082, + "learning_rate": 8.093280674939955e-06, + "loss": 0.0093, + "step": 30248 + }, + { + "epoch": 0.605, + "grad_norm": 0.08673414587974548, + "learning_rate": 8.091910046234552e-06, + "loss": 0.006, + "step": 30250 + }, + { + "epoch": 0.60504, + "grad_norm": 0.23532728850841522, + "learning_rate": 8.090539454728347e-06, + "loss": 0.0035, + "step": 30252 + }, + { + "epoch": 0.60508, + "grad_norm": 0.0022726610768586397, + "learning_rate": 8.089168900448052e-06, + "loss": 0.0019, + "step": 30254 + }, + { + "epoch": 0.60512, + "grad_norm": 0.007865861058235168, + "learning_rate": 8.087798383420397e-06, + "loss": 0.0035, + "step": 30256 + }, + { + "epoch": 0.60516, + "grad_norm": 21.118133544921875, + "learning_rate": 8.086427903672099e-06, + "loss": 0.4497, + "step": 30258 + }, + { + "epoch": 0.6052, + "grad_norm": 0.6657847166061401, + "learning_rate": 8.08505746122987e-06, + "loss": 0.0065, + "step": 30260 + }, + { + "epoch": 0.60524, + "grad_norm": 0.005645287688821554, + "learning_rate": 8.083687056120437e-06, + "loss": 0.0337, + "step": 30262 + }, + { + "epoch": 0.60528, + "grad_norm": 16.784461975097656, + "learning_rate": 8.082316688370504e-06, + "loss": 0.1545, + "step": 30264 + }, + { + "epoch": 0.60532, + "grad_norm": 0.010502091608941555, + "learning_rate": 8.0809463580068e-06, + "loss": 0.0003, + "step": 30266 + }, + { + "epoch": 0.60536, + "grad_norm": 14.36788272857666, + "learning_rate": 8.079576065056034e-06, + "loss": 0.1899, + "step": 30268 + }, + { + "epoch": 0.6054, + "grad_norm": 0.05072566866874695, + "learning_rate": 8.078205809544918e-06, + "loss": 0.0005, + "step": 30270 + }, + { + "epoch": 0.60544, + "grad_norm": 0.0035322927869856358, + "learning_rate": 8.076835591500172e-06, + "loss": 0.03, + "step": 30272 + }, + { + "epoch": 0.60548, + "grad_norm": 11.162628173828125, + "learning_rate": 8.075465410948509e-06, + "loss": 0.2053, + "step": 30274 + }, + { + "epoch": 0.60552, + "grad_norm": 0.13662396371364594, + "learning_rate": 8.074095267916634e-06, + "loss": 0.0019, + "step": 30276 + }, + { + "epoch": 0.60556, + "grad_norm": 0.36048388481140137, + "learning_rate": 8.072725162431266e-06, + "loss": 0.0247, + "step": 30278 + }, + { + "epoch": 0.6056, + "grad_norm": 0.6206938028335571, + "learning_rate": 8.07135509451911e-06, + "loss": 0.0066, + "step": 30280 + }, + { + "epoch": 0.60564, + "grad_norm": 0.04421544820070267, + "learning_rate": 8.069985064206883e-06, + "loss": 0.0076, + "step": 30282 + }, + { + "epoch": 0.60568, + "grad_norm": 0.6677579283714294, + "learning_rate": 8.068615071521289e-06, + "loss": 0.0106, + "step": 30284 + }, + { + "epoch": 0.60572, + "grad_norm": 0.044440340250730515, + "learning_rate": 8.067245116489037e-06, + "loss": 0.0085, + "step": 30286 + }, + { + "epoch": 0.60576, + "grad_norm": 0.026976294815540314, + "learning_rate": 8.06587519913684e-06, + "loss": 0.024, + "step": 30288 + }, + { + "epoch": 0.6058, + "grad_norm": 0.041315365582704544, + "learning_rate": 8.064505319491398e-06, + "loss": 0.0021, + "step": 30290 + }, + { + "epoch": 0.60584, + "grad_norm": 0.23420897126197815, + "learning_rate": 8.063135477579425e-06, + "loss": 0.0042, + "step": 30292 + }, + { + "epoch": 0.60588, + "grad_norm": 0.018875690177083015, + "learning_rate": 8.061765673427621e-06, + "loss": 0.0074, + "step": 30294 + }, + { + "epoch": 0.60592, + "grad_norm": 1.4634774923324585, + "learning_rate": 8.060395907062692e-06, + "loss": 0.0168, + "step": 30296 + }, + { + "epoch": 0.60596, + "grad_norm": 6.645504474639893, + "learning_rate": 8.059026178511346e-06, + "loss": 0.0722, + "step": 30298 + }, + { + "epoch": 0.606, + "grad_norm": 0.058591101318597794, + "learning_rate": 8.057656487800283e-06, + "loss": 0.0086, + "step": 30300 + }, + { + "epoch": 0.60604, + "grad_norm": 0.01316917035728693, + "learning_rate": 8.056286834956203e-06, + "loss": 0.0719, + "step": 30302 + }, + { + "epoch": 0.60608, + "grad_norm": 1.664507269859314, + "learning_rate": 8.054917220005817e-06, + "loss": 0.0169, + "step": 30304 + }, + { + "epoch": 0.60612, + "grad_norm": 0.0673666000366211, + "learning_rate": 8.053547642975819e-06, + "loss": 0.0008, + "step": 30306 + }, + { + "epoch": 0.60616, + "grad_norm": 0.2066110074520111, + "learning_rate": 8.052178103892913e-06, + "loss": 0.0058, + "step": 30308 + }, + { + "epoch": 0.6062, + "grad_norm": 0.05179663375020027, + "learning_rate": 8.050808602783797e-06, + "loss": 0.001, + "step": 30310 + }, + { + "epoch": 0.60624, + "grad_norm": 0.2870140075683594, + "learning_rate": 8.049439139675168e-06, + "loss": 0.0021, + "step": 30312 + }, + { + "epoch": 0.60628, + "grad_norm": 0.017953509464859962, + "learning_rate": 8.048069714593732e-06, + "loss": 0.0005, + "step": 30314 + }, + { + "epoch": 0.60632, + "grad_norm": 0.08262341469526291, + "learning_rate": 8.046700327566181e-06, + "loss": 0.0009, + "step": 30316 + }, + { + "epoch": 0.60636, + "grad_norm": 0.12762287259101868, + "learning_rate": 8.045330978619208e-06, + "loss": 0.002, + "step": 30318 + }, + { + "epoch": 0.6064, + "grad_norm": 0.2842397391796112, + "learning_rate": 8.04396166777952e-06, + "loss": 0.0033, + "step": 30320 + }, + { + "epoch": 0.60644, + "grad_norm": 0.06642533838748932, + "learning_rate": 8.0425923950738e-06, + "loss": 0.0111, + "step": 30322 + }, + { + "epoch": 0.60648, + "grad_norm": 0.0490313395857811, + "learning_rate": 8.041223160528754e-06, + "loss": 0.0006, + "step": 30324 + }, + { + "epoch": 0.60652, + "grad_norm": 0.3975965976715088, + "learning_rate": 8.039853964171072e-06, + "loss": 0.0147, + "step": 30326 + }, + { + "epoch": 0.60656, + "grad_norm": 0.2548873722553253, + "learning_rate": 8.038484806027443e-06, + "loss": 0.0036, + "step": 30328 + }, + { + "epoch": 0.6066, + "grad_norm": 0.0021599214524030685, + "learning_rate": 8.037115686124564e-06, + "loss": 0.0003, + "step": 30330 + }, + { + "epoch": 0.60664, + "grad_norm": 0.3634200990200043, + "learning_rate": 8.035746604489126e-06, + "loss": 0.0046, + "step": 30332 + }, + { + "epoch": 0.60668, + "grad_norm": 0.0024862110149115324, + "learning_rate": 8.034377561147814e-06, + "loss": 0.0024, + "step": 30334 + }, + { + "epoch": 0.60672, + "grad_norm": 0.07534155249595642, + "learning_rate": 8.033008556127328e-06, + "loss": 0.0072, + "step": 30336 + }, + { + "epoch": 0.60676, + "grad_norm": 0.2421099990606308, + "learning_rate": 8.031639589454349e-06, + "loss": 0.0018, + "step": 30338 + }, + { + "epoch": 0.6068, + "grad_norm": 0.003981968387961388, + "learning_rate": 8.030270661155575e-06, + "loss": 0.0035, + "step": 30340 + }, + { + "epoch": 0.60684, + "grad_norm": 0.0009586130036041141, + "learning_rate": 8.028901771257686e-06, + "loss": 0.0057, + "step": 30342 + }, + { + "epoch": 0.60688, + "grad_norm": 0.4813041687011719, + "learning_rate": 8.027532919787372e-06, + "loss": 0.0056, + "step": 30344 + }, + { + "epoch": 0.60692, + "grad_norm": 0.001919611357152462, + "learning_rate": 8.026164106771321e-06, + "loss": 0.0013, + "step": 30346 + }, + { + "epoch": 0.60696, + "grad_norm": 0.004707758780568838, + "learning_rate": 8.024795332236217e-06, + "loss": 0.0001, + "step": 30348 + }, + { + "epoch": 0.607, + "grad_norm": 0.0002849553420674056, + "learning_rate": 8.023426596208739e-06, + "loss": 0.0035, + "step": 30350 + }, + { + "epoch": 0.60704, + "grad_norm": 12.225000381469727, + "learning_rate": 8.022057898715582e-06, + "loss": 0.1663, + "step": 30352 + }, + { + "epoch": 0.60708, + "grad_norm": 0.003114251885563135, + "learning_rate": 8.02068923978342e-06, + "loss": 0.0007, + "step": 30354 + }, + { + "epoch": 0.60712, + "grad_norm": 0.057641640305519104, + "learning_rate": 8.019320619438946e-06, + "loss": 0.001, + "step": 30356 + }, + { + "epoch": 0.60716, + "grad_norm": 3.751983165740967, + "learning_rate": 8.017952037708837e-06, + "loss": 0.3306, + "step": 30358 + }, + { + "epoch": 0.6072, + "grad_norm": 0.1392090618610382, + "learning_rate": 8.016583494619769e-06, + "loss": 0.008, + "step": 30360 + }, + { + "epoch": 0.60724, + "grad_norm": 0.11151229590177536, + "learning_rate": 8.01521499019843e-06, + "loss": 0.0043, + "step": 30362 + }, + { + "epoch": 0.60728, + "grad_norm": 0.036570120602846146, + "learning_rate": 8.013846524471495e-06, + "loss": 0.0005, + "step": 30364 + }, + { + "epoch": 0.60732, + "grad_norm": 0.05996578931808472, + "learning_rate": 8.012478097465643e-06, + "loss": 0.1632, + "step": 30366 + }, + { + "epoch": 0.60736, + "grad_norm": 0.11677764356136322, + "learning_rate": 8.011109709207558e-06, + "loss": 0.0015, + "step": 30368 + }, + { + "epoch": 0.6074, + "grad_norm": 0.011930148117244244, + "learning_rate": 8.009741359723906e-06, + "loss": 0.0001, + "step": 30370 + }, + { + "epoch": 0.60744, + "grad_norm": 0.03631271794438362, + "learning_rate": 8.008373049041379e-06, + "loss": 0.0078, + "step": 30372 + }, + { + "epoch": 0.60748, + "grad_norm": 0.08273430168628693, + "learning_rate": 8.007004777186642e-06, + "loss": 0.0009, + "step": 30374 + }, + { + "epoch": 0.60752, + "grad_norm": 0.07624933123588562, + "learning_rate": 8.00563654418637e-06, + "loss": 0.0007, + "step": 30376 + }, + { + "epoch": 0.60756, + "grad_norm": 0.17366832494735718, + "learning_rate": 8.004268350067241e-06, + "loss": 0.0029, + "step": 30378 + }, + { + "epoch": 0.6076, + "grad_norm": 12.637579917907715, + "learning_rate": 8.00290019485593e-06, + "loss": 0.1654, + "step": 30380 + }, + { + "epoch": 0.60764, + "grad_norm": 4.730615139007568, + "learning_rate": 8.001532078579108e-06, + "loss": 0.0734, + "step": 30382 + }, + { + "epoch": 0.60768, + "grad_norm": 0.7776033282279968, + "learning_rate": 8.000164001263448e-06, + "loss": 0.0045, + "step": 30384 + }, + { + "epoch": 0.60772, + "grad_norm": 0.04445186257362366, + "learning_rate": 7.998795962935616e-06, + "loss": 0.0016, + "step": 30386 + }, + { + "epoch": 0.60776, + "grad_norm": 0.06714071333408356, + "learning_rate": 7.99742796362229e-06, + "loss": 0.001, + "step": 30388 + }, + { + "epoch": 0.6078, + "grad_norm": 0.03936715051531792, + "learning_rate": 7.996060003350139e-06, + "loss": 0.0043, + "step": 30390 + }, + { + "epoch": 0.60784, + "grad_norm": 0.030684150755405426, + "learning_rate": 7.994692082145824e-06, + "loss": 0.0007, + "step": 30392 + }, + { + "epoch": 0.60788, + "grad_norm": 0.027361026033759117, + "learning_rate": 7.993324200036025e-06, + "loss": 0.0007, + "step": 30394 + }, + { + "epoch": 0.60792, + "grad_norm": 0.16888506710529327, + "learning_rate": 7.9919563570474e-06, + "loss": 0.004, + "step": 30396 + }, + { + "epoch": 0.60796, + "grad_norm": 0.006589732598513365, + "learning_rate": 7.990588553206623e-06, + "loss": 0.0004, + "step": 30398 + }, + { + "epoch": 0.608, + "grad_norm": 7.258841514587402, + "learning_rate": 7.989220788540356e-06, + "loss": 0.0585, + "step": 30400 + }, + { + "epoch": 0.60804, + "grad_norm": 0.029832612723112106, + "learning_rate": 7.987853063075262e-06, + "loss": 0.0016, + "step": 30402 + }, + { + "epoch": 0.60808, + "grad_norm": 0.05532959848642349, + "learning_rate": 7.986485376838012e-06, + "loss": 0.4112, + "step": 30404 + }, + { + "epoch": 0.60812, + "grad_norm": 0.025396879762411118, + "learning_rate": 7.985117729855266e-06, + "loss": 0.0002, + "step": 30406 + }, + { + "epoch": 0.60816, + "grad_norm": 0.14456498622894287, + "learning_rate": 7.983750122153682e-06, + "loss": 0.002, + "step": 30408 + }, + { + "epoch": 0.6082, + "grad_norm": 0.06910812854766846, + "learning_rate": 7.982382553759931e-06, + "loss": 0.0722, + "step": 30410 + }, + { + "epoch": 0.60824, + "grad_norm": 0.03701549023389816, + "learning_rate": 7.981015024700671e-06, + "loss": 0.0026, + "step": 30412 + }, + { + "epoch": 0.60828, + "grad_norm": 0.08387093991041183, + "learning_rate": 7.979647535002563e-06, + "loss": 0.0654, + "step": 30414 + }, + { + "epoch": 0.60832, + "grad_norm": 0.0693984255194664, + "learning_rate": 7.978280084692268e-06, + "loss": 0.0009, + "step": 30416 + }, + { + "epoch": 0.60836, + "grad_norm": 2.187084674835205, + "learning_rate": 7.976912673796438e-06, + "loss": 0.0318, + "step": 30418 + }, + { + "epoch": 0.6084, + "grad_norm": 0.13785311579704285, + "learning_rate": 7.975545302341743e-06, + "loss": 0.0063, + "step": 30420 + }, + { + "epoch": 0.60844, + "grad_norm": 0.02167799510061741, + "learning_rate": 7.974177970354833e-06, + "loss": 0.0007, + "step": 30422 + }, + { + "epoch": 0.60848, + "grad_norm": 0.007388543803244829, + "learning_rate": 7.972810677862363e-06, + "loss": 0.0003, + "step": 30424 + }, + { + "epoch": 0.60852, + "grad_norm": 0.11040625721216202, + "learning_rate": 7.971443424890997e-06, + "loss": 0.431, + "step": 30426 + }, + { + "epoch": 0.60856, + "grad_norm": 0.0009215960744768381, + "learning_rate": 7.970076211467382e-06, + "loss": 0.0003, + "step": 30428 + }, + { + "epoch": 0.6086, + "grad_norm": 3.3263542652130127, + "learning_rate": 7.96870903761818e-06, + "loss": 0.0342, + "step": 30430 + }, + { + "epoch": 0.60864, + "grad_norm": 0.08685833215713501, + "learning_rate": 7.96734190337004e-06, + "loss": 0.0016, + "step": 30432 + }, + { + "epoch": 0.60868, + "grad_norm": 0.19165925681591034, + "learning_rate": 7.965974808749613e-06, + "loss": 0.002, + "step": 30434 + }, + { + "epoch": 0.60872, + "grad_norm": 5.4016852378845215, + "learning_rate": 7.964607753783557e-06, + "loss": 0.0721, + "step": 30436 + }, + { + "epoch": 0.60876, + "grad_norm": 0.09817271679639816, + "learning_rate": 7.963240738498523e-06, + "loss": 0.001, + "step": 30438 + }, + { + "epoch": 0.6088, + "grad_norm": 0.014293916523456573, + "learning_rate": 7.961873762921153e-06, + "loss": 0.0003, + "step": 30440 + }, + { + "epoch": 0.60884, + "grad_norm": 4.341486930847168, + "learning_rate": 7.960506827078109e-06, + "loss": 0.0478, + "step": 30442 + }, + { + "epoch": 0.60888, + "grad_norm": 4.067984580993652, + "learning_rate": 7.959139930996032e-06, + "loss": 0.0431, + "step": 30444 + }, + { + "epoch": 0.60892, + "grad_norm": 0.03537396714091301, + "learning_rate": 7.957773074701572e-06, + "loss": 0.0585, + "step": 30446 + }, + { + "epoch": 0.60896, + "grad_norm": 0.012551154009997845, + "learning_rate": 7.956406258221379e-06, + "loss": 0.0002, + "step": 30448 + }, + { + "epoch": 0.609, + "grad_norm": 0.0024516526609659195, + "learning_rate": 7.955039481582098e-06, + "loss": 0.0791, + "step": 30450 + }, + { + "epoch": 0.60904, + "grad_norm": 0.0462043359875679, + "learning_rate": 7.953672744810375e-06, + "loss": 0.0055, + "step": 30452 + }, + { + "epoch": 0.60908, + "grad_norm": 1.0491124391555786, + "learning_rate": 7.952306047932856e-06, + "loss": 0.0114, + "step": 30454 + }, + { + "epoch": 0.60912, + "grad_norm": 16.807998657226562, + "learning_rate": 7.950939390976182e-06, + "loss": 0.1688, + "step": 30456 + }, + { + "epoch": 0.60916, + "grad_norm": 0.03177264332771301, + "learning_rate": 7.949572773967002e-06, + "loss": 0.0009, + "step": 30458 + }, + { + "epoch": 0.6092, + "grad_norm": 0.0016921627102419734, + "learning_rate": 7.948206196931953e-06, + "loss": 0.005, + "step": 30460 + }, + { + "epoch": 0.60924, + "grad_norm": 0.021376457065343857, + "learning_rate": 7.946839659897685e-06, + "loss": 0.0003, + "step": 30462 + }, + { + "epoch": 0.60928, + "grad_norm": 0.049090854823589325, + "learning_rate": 7.945473162890836e-06, + "loss": 0.0012, + "step": 30464 + }, + { + "epoch": 0.60932, + "grad_norm": 0.11134711652994156, + "learning_rate": 7.94410670593804e-06, + "loss": 0.0085, + "step": 30466 + }, + { + "epoch": 0.60936, + "grad_norm": 9.090014457702637, + "learning_rate": 7.942740289065947e-06, + "loss": 0.098, + "step": 30468 + }, + { + "epoch": 0.6094, + "grad_norm": 2.3859965801239014, + "learning_rate": 7.94137391230119e-06, + "loss": 0.0304, + "step": 30470 + }, + { + "epoch": 0.60944, + "grad_norm": 0.07401503622531891, + "learning_rate": 7.940007575670406e-06, + "loss": 0.0006, + "step": 30472 + }, + { + "epoch": 0.60948, + "grad_norm": 0.10264304280281067, + "learning_rate": 7.938641279200236e-06, + "loss": 0.003, + "step": 30474 + }, + { + "epoch": 0.60952, + "grad_norm": 0.15376870334148407, + "learning_rate": 7.937275022917314e-06, + "loss": 0.0029, + "step": 30476 + }, + { + "epoch": 0.60956, + "grad_norm": 0.008542210794985294, + "learning_rate": 7.935908806848281e-06, + "loss": 0.0004, + "step": 30478 + }, + { + "epoch": 0.6096, + "grad_norm": 0.025714246556162834, + "learning_rate": 7.934542631019767e-06, + "loss": 0.0004, + "step": 30480 + }, + { + "epoch": 0.60964, + "grad_norm": 0.22464706003665924, + "learning_rate": 7.933176495458407e-06, + "loss": 0.0034, + "step": 30482 + }, + { + "epoch": 0.60968, + "grad_norm": 0.022455761209130287, + "learning_rate": 7.931810400190838e-06, + "loss": 0.0003, + "step": 30484 + }, + { + "epoch": 0.60972, + "grad_norm": 1.5480966567993164, + "learning_rate": 7.930444345243689e-06, + "loss": 0.0194, + "step": 30486 + }, + { + "epoch": 0.60976, + "grad_norm": 11.446585655212402, + "learning_rate": 7.92907833064359e-06, + "loss": 0.2216, + "step": 30488 + }, + { + "epoch": 0.6098, + "grad_norm": 0.036661647260189056, + "learning_rate": 7.927712356417176e-06, + "loss": 0.0719, + "step": 30490 + }, + { + "epoch": 0.60984, + "grad_norm": 0.12237220257520676, + "learning_rate": 7.926346422591076e-06, + "loss": 0.0011, + "step": 30492 + }, + { + "epoch": 0.60988, + "grad_norm": 0.0003056424029637128, + "learning_rate": 7.924980529191923e-06, + "loss": 0.0002, + "step": 30494 + }, + { + "epoch": 0.60992, + "grad_norm": 0.08117714524269104, + "learning_rate": 7.923614676246341e-06, + "loss": 0.001, + "step": 30496 + }, + { + "epoch": 0.60996, + "grad_norm": 0.000326941953971982, + "learning_rate": 7.92224886378096e-06, + "loss": 0.0239, + "step": 30498 + }, + { + "epoch": 0.61, + "grad_norm": 3.461974859237671, + "learning_rate": 7.92088309182241e-06, + "loss": 0.1152, + "step": 30500 + }, + { + "epoch": 0.61004, + "grad_norm": 1.5821267366409302, + "learning_rate": 7.919517360397309e-06, + "loss": 0.0134, + "step": 30502 + }, + { + "epoch": 0.61008, + "grad_norm": 0.018190154805779457, + "learning_rate": 7.918151669532293e-06, + "loss": 0.0584, + "step": 30504 + }, + { + "epoch": 0.61012, + "grad_norm": 0.022082800045609474, + "learning_rate": 7.91678601925398e-06, + "loss": 0.0002, + "step": 30506 + }, + { + "epoch": 0.61016, + "grad_norm": 5.782828330993652, + "learning_rate": 7.915420409588994e-06, + "loss": 0.047, + "step": 30508 + }, + { + "epoch": 0.6102, + "grad_norm": 0.31486913561820984, + "learning_rate": 7.914054840563962e-06, + "loss": 0.0037, + "step": 30510 + }, + { + "epoch": 0.61024, + "grad_norm": 5.5287275314331055, + "learning_rate": 7.912689312205509e-06, + "loss": 0.0836, + "step": 30512 + }, + { + "epoch": 0.61028, + "grad_norm": 0.05756918340921402, + "learning_rate": 7.911323824540246e-06, + "loss": 0.0011, + "step": 30514 + }, + { + "epoch": 0.61032, + "grad_norm": 0.25883859395980835, + "learning_rate": 7.909958377594803e-06, + "loss": 0.0026, + "step": 30516 + }, + { + "epoch": 0.61036, + "grad_norm": 7.734786033630371, + "learning_rate": 7.908592971395797e-06, + "loss": 0.0878, + "step": 30518 + }, + { + "epoch": 0.6104, + "grad_norm": 10.993239402770996, + "learning_rate": 7.907227605969849e-06, + "loss": 0.1277, + "step": 30520 + }, + { + "epoch": 0.61044, + "grad_norm": 0.3380585312843323, + "learning_rate": 7.905862281343578e-06, + "loss": 0.0034, + "step": 30522 + }, + { + "epoch": 0.61048, + "grad_norm": 0.006308411713689566, + "learning_rate": 7.904496997543595e-06, + "loss": 0.0001, + "step": 30524 + }, + { + "epoch": 0.61052, + "grad_norm": 0.3079102039337158, + "learning_rate": 7.903131754596525e-06, + "loss": 0.0026, + "step": 30526 + }, + { + "epoch": 0.61056, + "grad_norm": 0.0009201092179864645, + "learning_rate": 7.901766552528981e-06, + "loss": 0.0027, + "step": 30528 + }, + { + "epoch": 0.6106, + "grad_norm": 0.06186520680785179, + "learning_rate": 7.900401391367576e-06, + "loss": 0.0035, + "step": 30530 + }, + { + "epoch": 0.61064, + "grad_norm": 0.004622861742973328, + "learning_rate": 7.89903627113893e-06, + "loss": 0.0001, + "step": 30532 + }, + { + "epoch": 0.61068, + "grad_norm": 0.03938038647174835, + "learning_rate": 7.897671191869652e-06, + "loss": 0.0424, + "step": 30534 + }, + { + "epoch": 0.61072, + "grad_norm": 0.0177161768078804, + "learning_rate": 7.896306153586358e-06, + "loss": 0.019, + "step": 30536 + }, + { + "epoch": 0.61076, + "grad_norm": 0.5093832612037659, + "learning_rate": 7.894941156315658e-06, + "loss": 0.0086, + "step": 30538 + }, + { + "epoch": 0.6108, + "grad_norm": 0.006063362583518028, + "learning_rate": 7.89357620008416e-06, + "loss": 0.0001, + "step": 30540 + }, + { + "epoch": 0.61084, + "grad_norm": 0.4108628034591675, + "learning_rate": 7.892211284918482e-06, + "loss": 0.0035, + "step": 30542 + }, + { + "epoch": 0.61088, + "grad_norm": 0.010814635083079338, + "learning_rate": 7.890846410845232e-06, + "loss": 0.0002, + "step": 30544 + }, + { + "epoch": 0.61092, + "grad_norm": 0.14566229283809662, + "learning_rate": 7.889481577891011e-06, + "loss": 0.0013, + "step": 30546 + }, + { + "epoch": 0.61096, + "grad_norm": 15.887885093688965, + "learning_rate": 7.888116786082439e-06, + "loss": 0.301, + "step": 30548 + }, + { + "epoch": 0.611, + "grad_norm": 0.003912921529263258, + "learning_rate": 7.886752035446116e-06, + "loss": 0.0014, + "step": 30550 + }, + { + "epoch": 0.61104, + "grad_norm": 0.0021677440963685513, + "learning_rate": 7.885387326008648e-06, + "loss": 0.0422, + "step": 30552 + }, + { + "epoch": 0.61108, + "grad_norm": 0.009734582155942917, + "learning_rate": 7.884022657796647e-06, + "loss": 0.0045, + "step": 30554 + }, + { + "epoch": 0.61112, + "grad_norm": 0.010685116052627563, + "learning_rate": 7.882658030836707e-06, + "loss": 0.0015, + "step": 30556 + }, + { + "epoch": 0.61116, + "grad_norm": 0.03592381626367569, + "learning_rate": 7.881293445155444e-06, + "loss": 0.0048, + "step": 30558 + }, + { + "epoch": 0.6112, + "grad_norm": 0.007010181900113821, + "learning_rate": 7.879928900779457e-06, + "loss": 0.0002, + "step": 30560 + }, + { + "epoch": 0.61124, + "grad_norm": 0.00770664494484663, + "learning_rate": 7.878564397735343e-06, + "loss": 0.0002, + "step": 30562 + }, + { + "epoch": 0.61128, + "grad_norm": 0.04501604661345482, + "learning_rate": 7.87719993604971e-06, + "loss": 0.0022, + "step": 30564 + }, + { + "epoch": 0.61132, + "grad_norm": 0.49132290482521057, + "learning_rate": 7.875835515749159e-06, + "loss": 0.0075, + "step": 30566 + }, + { + "epoch": 0.61136, + "grad_norm": 0.10299468785524368, + "learning_rate": 7.874471136860287e-06, + "loss": 0.0052, + "step": 30568 + }, + { + "epoch": 0.6114, + "grad_norm": 0.005446136463433504, + "learning_rate": 7.873106799409696e-06, + "loss": 0.0004, + "step": 30570 + }, + { + "epoch": 0.61144, + "grad_norm": 0.06657605618238449, + "learning_rate": 7.871742503423977e-06, + "loss": 0.0014, + "step": 30572 + }, + { + "epoch": 0.61148, + "grad_norm": 0.00021010191994719207, + "learning_rate": 7.87037824892974e-06, + "loss": 0.0001, + "step": 30574 + }, + { + "epoch": 0.61152, + "grad_norm": 0.005655916407704353, + "learning_rate": 7.869014035953576e-06, + "loss": 0.0715, + "step": 30576 + }, + { + "epoch": 0.61156, + "grad_norm": 0.007140064146369696, + "learning_rate": 7.867649864522075e-06, + "loss": 0.0001, + "step": 30578 + }, + { + "epoch": 0.6116, + "grad_norm": 0.04715336859226227, + "learning_rate": 7.866285734661842e-06, + "loss": 0.0067, + "step": 30580 + }, + { + "epoch": 0.61164, + "grad_norm": 0.05126124620437622, + "learning_rate": 7.864921646399464e-06, + "loss": 0.0049, + "step": 30582 + }, + { + "epoch": 0.61168, + "grad_norm": 10.331167221069336, + "learning_rate": 7.863557599761539e-06, + "loss": 0.0819, + "step": 30584 + }, + { + "epoch": 0.61172, + "grad_norm": 0.7827118635177612, + "learning_rate": 7.86219359477466e-06, + "loss": 1.3218, + "step": 30586 + }, + { + "epoch": 0.61176, + "grad_norm": 4.640404224395752, + "learning_rate": 7.860829631465417e-06, + "loss": 0.043, + "step": 30588 + }, + { + "epoch": 0.6118, + "grad_norm": 0.048608191311359406, + "learning_rate": 7.8594657098604e-06, + "loss": 0.0015, + "step": 30590 + }, + { + "epoch": 0.61184, + "grad_norm": 0.23543749749660492, + "learning_rate": 7.858101829986204e-06, + "loss": 0.0043, + "step": 30592 + }, + { + "epoch": 0.61188, + "grad_norm": 0.004526949021965265, + "learning_rate": 7.856737991869412e-06, + "loss": 0.0003, + "step": 30594 + }, + { + "epoch": 0.61192, + "grad_norm": 0.18905574083328247, + "learning_rate": 7.855374195536617e-06, + "loss": 0.0321, + "step": 30596 + }, + { + "epoch": 0.61196, + "grad_norm": 0.3787979781627655, + "learning_rate": 7.854010441014406e-06, + "loss": 0.0188, + "step": 30598 + }, + { + "epoch": 0.612, + "grad_norm": 0.02387390471994877, + "learning_rate": 7.852646728329368e-06, + "loss": 0.0007, + "step": 30600 + }, + { + "epoch": 0.61204, + "grad_norm": 0.019283097237348557, + "learning_rate": 7.85128305750809e-06, + "loss": 0.0021, + "step": 30602 + }, + { + "epoch": 0.61208, + "grad_norm": 0.018329957500100136, + "learning_rate": 7.84991942857715e-06, + "loss": 0.0094, + "step": 30604 + }, + { + "epoch": 0.61212, + "grad_norm": 0.11068323999643326, + "learning_rate": 7.848555841563142e-06, + "loss": 0.0028, + "step": 30606 + }, + { + "epoch": 0.61216, + "grad_norm": 0.018834663555026054, + "learning_rate": 7.84719229649264e-06, + "loss": 0.0017, + "step": 30608 + }, + { + "epoch": 0.6122, + "grad_norm": 0.12407799065113068, + "learning_rate": 7.845828793392236e-06, + "loss": 0.0018, + "step": 30610 + }, + { + "epoch": 0.61224, + "grad_norm": 0.6944671273231506, + "learning_rate": 7.84446533228851e-06, + "loss": 0.0084, + "step": 30612 + }, + { + "epoch": 0.61228, + "grad_norm": 0.009047173894941807, + "learning_rate": 7.843101913208037e-06, + "loss": 0.004, + "step": 30614 + }, + { + "epoch": 0.61232, + "grad_norm": 13.094088554382324, + "learning_rate": 7.841738536177408e-06, + "loss": 0.1498, + "step": 30616 + }, + { + "epoch": 0.61236, + "grad_norm": 0.05847708135843277, + "learning_rate": 7.840375201223197e-06, + "loss": 0.0006, + "step": 30618 + }, + { + "epoch": 0.6124, + "grad_norm": 0.06317579746246338, + "learning_rate": 7.83901190837198e-06, + "loss": 0.0045, + "step": 30620 + }, + { + "epoch": 0.61244, + "grad_norm": 0.04079360142350197, + "learning_rate": 7.837648657650342e-06, + "loss": 0.0009, + "step": 30622 + }, + { + "epoch": 0.61248, + "grad_norm": 0.02222428470849991, + "learning_rate": 7.836285449084853e-06, + "loss": 0.0106, + "step": 30624 + }, + { + "epoch": 0.61252, + "grad_norm": 0.055912237614393234, + "learning_rate": 7.834922282702095e-06, + "loss": 0.1061, + "step": 30626 + }, + { + "epoch": 0.61256, + "grad_norm": 0.003607069607824087, + "learning_rate": 7.833559158528644e-06, + "loss": 0.0002, + "step": 30628 + }, + { + "epoch": 0.6126, + "grad_norm": 0.11169768124818802, + "learning_rate": 7.832196076591067e-06, + "loss": 0.0007, + "step": 30630 + }, + { + "epoch": 0.61264, + "grad_norm": 0.03081376664340496, + "learning_rate": 7.830833036915947e-06, + "loss": 0.0003, + "step": 30632 + }, + { + "epoch": 0.61268, + "grad_norm": 0.007126562763005495, + "learning_rate": 7.829470039529857e-06, + "loss": 0.0239, + "step": 30634 + }, + { + "epoch": 0.61272, + "grad_norm": 0.10203002393245697, + "learning_rate": 7.82810708445936e-06, + "loss": 0.0009, + "step": 30636 + }, + { + "epoch": 0.61276, + "grad_norm": 0.10175178945064545, + "learning_rate": 7.826744171731039e-06, + "loss": 0.0012, + "step": 30638 + }, + { + "epoch": 0.6128, + "grad_norm": 0.03891914710402489, + "learning_rate": 7.825381301371452e-06, + "loss": 0.0005, + "step": 30640 + }, + { + "epoch": 0.61284, + "grad_norm": 0.07578583806753159, + "learning_rate": 7.824018473407183e-06, + "loss": 0.005, + "step": 30642 + }, + { + "epoch": 0.61288, + "grad_norm": 0.07979124039411545, + "learning_rate": 7.822655687864793e-06, + "loss": 0.0309, + "step": 30644 + }, + { + "epoch": 0.61292, + "grad_norm": 7.491034507751465, + "learning_rate": 7.821292944770846e-06, + "loss": 0.1379, + "step": 30646 + }, + { + "epoch": 0.61296, + "grad_norm": 0.8762968182563782, + "learning_rate": 7.81993024415192e-06, + "loss": 0.0099, + "step": 30648 + }, + { + "epoch": 0.613, + "grad_norm": 0.27937954664230347, + "learning_rate": 7.818567586034578e-06, + "loss": 0.0094, + "step": 30650 + }, + { + "epoch": 0.61304, + "grad_norm": 0.005576529074460268, + "learning_rate": 7.817204970445376e-06, + "loss": 0.0016, + "step": 30652 + }, + { + "epoch": 0.61308, + "grad_norm": 0.0021144093479961157, + "learning_rate": 7.815842397410895e-06, + "loss": 0.0168, + "step": 30654 + }, + { + "epoch": 0.61312, + "grad_norm": 0.6080222725868225, + "learning_rate": 7.814479866957687e-06, + "loss": 0.0051, + "step": 30656 + }, + { + "epoch": 0.61316, + "grad_norm": 0.10938484221696854, + "learning_rate": 7.813117379112321e-06, + "loss": 0.549, + "step": 30658 + }, + { + "epoch": 0.6132, + "grad_norm": 0.424824982881546, + "learning_rate": 7.811754933901358e-06, + "loss": 0.0036, + "step": 30660 + }, + { + "epoch": 0.61324, + "grad_norm": 2.8351004123687744, + "learning_rate": 7.810392531351354e-06, + "loss": 0.0256, + "step": 30662 + }, + { + "epoch": 0.61328, + "grad_norm": 0.0015582280466333032, + "learning_rate": 7.80903017148888e-06, + "loss": 0.0003, + "step": 30664 + }, + { + "epoch": 0.61332, + "grad_norm": 0.8490119576454163, + "learning_rate": 7.80766785434049e-06, + "loss": 0.0074, + "step": 30666 + }, + { + "epoch": 0.61336, + "grad_norm": 0.09390805661678314, + "learning_rate": 7.806305579932741e-06, + "loss": 0.0726, + "step": 30668 + }, + { + "epoch": 0.6134, + "grad_norm": 0.9333593249320984, + "learning_rate": 7.804943348292197e-06, + "loss": 0.01, + "step": 30670 + }, + { + "epoch": 0.61344, + "grad_norm": 0.002398656914010644, + "learning_rate": 7.803581159445412e-06, + "loss": 0.0006, + "step": 30672 + }, + { + "epoch": 0.61348, + "grad_norm": 0.1480758935213089, + "learning_rate": 7.802219013418943e-06, + "loss": 0.0017, + "step": 30674 + }, + { + "epoch": 0.61352, + "grad_norm": 0.015542572364211082, + "learning_rate": 7.800856910239349e-06, + "loss": 0.0003, + "step": 30676 + }, + { + "epoch": 0.61356, + "grad_norm": 0.021498829126358032, + "learning_rate": 7.799494849933175e-06, + "loss": 0.0004, + "step": 30678 + }, + { + "epoch": 0.6136, + "grad_norm": 0.1082838848233223, + "learning_rate": 7.798132832526986e-06, + "loss": 0.0883, + "step": 30680 + }, + { + "epoch": 0.61364, + "grad_norm": 0.03678959980607033, + "learning_rate": 7.796770858047333e-06, + "loss": 0.0004, + "step": 30682 + }, + { + "epoch": 0.61368, + "grad_norm": 1.2390350103378296, + "learning_rate": 7.79540892652076e-06, + "loss": 0.0129, + "step": 30684 + }, + { + "epoch": 0.61372, + "grad_norm": 0.02833748608827591, + "learning_rate": 7.79404703797383e-06, + "loss": 0.0003, + "step": 30686 + }, + { + "epoch": 0.61376, + "grad_norm": 0.05232476443052292, + "learning_rate": 7.792685192433087e-06, + "loss": 0.1267, + "step": 30688 + }, + { + "epoch": 0.6138, + "grad_norm": 3.738515853881836, + "learning_rate": 7.791323389925084e-06, + "loss": 0.0213, + "step": 30690 + }, + { + "epoch": 0.61384, + "grad_norm": 0.046514954417943954, + "learning_rate": 7.789961630476369e-06, + "loss": 0.0029, + "step": 30692 + }, + { + "epoch": 0.61388, + "grad_norm": 0.051338210701942444, + "learning_rate": 7.788599914113485e-06, + "loss": 0.0009, + "step": 30694 + }, + { + "epoch": 0.61392, + "grad_norm": 0.008418879471719265, + "learning_rate": 7.787238240862988e-06, + "loss": 0.8346, + "step": 30696 + }, + { + "epoch": 0.61396, + "grad_norm": 0.002001651097089052, + "learning_rate": 7.785876610751423e-06, + "loss": 0.0006, + "step": 30698 + }, + { + "epoch": 0.614, + "grad_norm": 0.019197192043066025, + "learning_rate": 7.784515023805328e-06, + "loss": 0.0012, + "step": 30700 + }, + { + "epoch": 0.61404, + "grad_norm": 0.0015036441618576646, + "learning_rate": 7.783153480051256e-06, + "loss": 0.0011, + "step": 30702 + }, + { + "epoch": 0.61408, + "grad_norm": 0.0475848913192749, + "learning_rate": 7.781791979515749e-06, + "loss": 0.0009, + "step": 30704 + }, + { + "epoch": 0.61412, + "grad_norm": 0.34486162662506104, + "learning_rate": 7.780430522225348e-06, + "loss": 0.0045, + "step": 30706 + }, + { + "epoch": 0.61416, + "grad_norm": 0.14882132411003113, + "learning_rate": 7.779069108206597e-06, + "loss": 0.0022, + "step": 30708 + }, + { + "epoch": 0.6142, + "grad_norm": 0.02729039266705513, + "learning_rate": 7.777707737486036e-06, + "loss": 0.0003, + "step": 30710 + }, + { + "epoch": 0.61424, + "grad_norm": 0.5011493563652039, + "learning_rate": 7.77634641009021e-06, + "loss": 0.0041, + "step": 30712 + }, + { + "epoch": 0.61428, + "grad_norm": 0.03091845102608204, + "learning_rate": 7.774985126045651e-06, + "loss": 0.0004, + "step": 30714 + }, + { + "epoch": 0.61432, + "grad_norm": 11.767280578613281, + "learning_rate": 7.773623885378905e-06, + "loss": 0.1058, + "step": 30716 + }, + { + "epoch": 0.61436, + "grad_norm": 0.4254560172557831, + "learning_rate": 7.77226268811651e-06, + "loss": 0.0504, + "step": 30718 + }, + { + "epoch": 0.6144, + "grad_norm": 0.09302908927202225, + "learning_rate": 7.770901534284996e-06, + "loss": 0.0011, + "step": 30720 + }, + { + "epoch": 0.61444, + "grad_norm": 8.098690032958984, + "learning_rate": 7.769540423910908e-06, + "loss": 0.0891, + "step": 30722 + }, + { + "epoch": 0.61448, + "grad_norm": 0.001983341993764043, + "learning_rate": 7.768179357020778e-06, + "loss": 0.0011, + "step": 30724 + }, + { + "epoch": 0.61452, + "grad_norm": 0.053783614188432693, + "learning_rate": 7.766818333641137e-06, + "loss": 0.0012, + "step": 30726 + }, + { + "epoch": 0.61456, + "grad_norm": 0.007310414686799049, + "learning_rate": 7.765457353798526e-06, + "loss": 0.0003, + "step": 30728 + }, + { + "epoch": 0.6146, + "grad_norm": 8.996715545654297, + "learning_rate": 7.76409641751947e-06, + "loss": 0.1266, + "step": 30730 + }, + { + "epoch": 0.61464, + "grad_norm": 0.014404531568288803, + "learning_rate": 7.76273552483051e-06, + "loss": 0.0002, + "step": 30732 + }, + { + "epoch": 0.61468, + "grad_norm": 0.13819999992847443, + "learning_rate": 7.761374675758171e-06, + "loss": 0.0015, + "step": 30734 + }, + { + "epoch": 0.61472, + "grad_norm": 0.09391556680202484, + "learning_rate": 7.760013870328982e-06, + "loss": 0.0009, + "step": 30736 + }, + { + "epoch": 0.61476, + "grad_norm": 0.03130458667874336, + "learning_rate": 7.75865310856948e-06, + "loss": 0.0009, + "step": 30738 + }, + { + "epoch": 0.6148, + "grad_norm": 3.058893918991089, + "learning_rate": 7.757292390506191e-06, + "loss": 0.0213, + "step": 30740 + }, + { + "epoch": 0.61484, + "grad_norm": 0.028217436745762825, + "learning_rate": 7.755931716165636e-06, + "loss": 0.0216, + "step": 30742 + }, + { + "epoch": 0.61488, + "grad_norm": 7.497964382171631, + "learning_rate": 7.754571085574352e-06, + "loss": 0.336, + "step": 30744 + }, + { + "epoch": 0.61492, + "grad_norm": 0.004102681297808886, + "learning_rate": 7.753210498758857e-06, + "loss": 0.0582, + "step": 30746 + }, + { + "epoch": 0.61496, + "grad_norm": 0.018118273466825485, + "learning_rate": 7.751849955745684e-06, + "loss": 0.0008, + "step": 30748 + }, + { + "epoch": 0.615, + "grad_norm": 0.02413778565824032, + "learning_rate": 7.750489456561351e-06, + "loss": 0.0095, + "step": 30750 + }, + { + "epoch": 0.61504, + "grad_norm": 0.24996493756771088, + "learning_rate": 7.749129001232383e-06, + "loss": 0.0039, + "step": 30752 + }, + { + "epoch": 0.61508, + "grad_norm": 1.6928073167800903, + "learning_rate": 7.747768589785307e-06, + "loss": 0.0107, + "step": 30754 + }, + { + "epoch": 0.61512, + "grad_norm": 0.08807302266359329, + "learning_rate": 7.746408222246642e-06, + "loss": 0.0012, + "step": 30756 + }, + { + "epoch": 0.61516, + "grad_norm": 1.4551990032196045, + "learning_rate": 7.745047898642908e-06, + "loss": 0.0155, + "step": 30758 + }, + { + "epoch": 0.6152, + "grad_norm": 0.02548418939113617, + "learning_rate": 7.743687619000625e-06, + "loss": 0.0005, + "step": 30760 + }, + { + "epoch": 0.61524, + "grad_norm": 3.5418918132781982, + "learning_rate": 7.742327383346313e-06, + "loss": 0.0314, + "step": 30762 + }, + { + "epoch": 0.61528, + "grad_norm": 0.0042381188832223415, + "learning_rate": 7.740967191706493e-06, + "loss": 0.001, + "step": 30764 + }, + { + "epoch": 0.61532, + "grad_norm": 0.004918028134852648, + "learning_rate": 7.739607044107681e-06, + "loss": 0.0004, + "step": 30766 + }, + { + "epoch": 0.61536, + "grad_norm": 0.7788521647453308, + "learning_rate": 7.73824694057639e-06, + "loss": 0.0066, + "step": 30768 + }, + { + "epoch": 0.6154, + "grad_norm": 5.086558818817139, + "learning_rate": 7.736886881139143e-06, + "loss": 0.0795, + "step": 30770 + }, + { + "epoch": 0.61544, + "grad_norm": 0.0469384528696537, + "learning_rate": 7.73552686582245e-06, + "loss": 0.0011, + "step": 30772 + }, + { + "epoch": 0.61548, + "grad_norm": 0.0054907361045479774, + "learning_rate": 7.734166894652826e-06, + "loss": 0.0004, + "step": 30774 + }, + { + "epoch": 0.61552, + "grad_norm": 0.37001341581344604, + "learning_rate": 7.732806967656785e-06, + "loss": 0.0327, + "step": 30776 + }, + { + "epoch": 0.61556, + "grad_norm": 0.01002547424286604, + "learning_rate": 7.73144708486084e-06, + "loss": 0.0105, + "step": 30778 + }, + { + "epoch": 0.6156, + "grad_norm": 0.02897241897881031, + "learning_rate": 7.730087246291503e-06, + "loss": 0.0012, + "step": 30780 + }, + { + "epoch": 0.61564, + "grad_norm": 0.014857331290841103, + "learning_rate": 7.728727451975283e-06, + "loss": 0.0001, + "step": 30782 + }, + { + "epoch": 0.61568, + "grad_norm": 0.2015204280614853, + "learning_rate": 7.727367701938687e-06, + "loss": 0.0098, + "step": 30784 + }, + { + "epoch": 0.61572, + "grad_norm": 0.005855706054717302, + "learning_rate": 7.726007996208234e-06, + "loss": 0.0001, + "step": 30786 + }, + { + "epoch": 0.61576, + "grad_norm": 0.013119998387992382, + "learning_rate": 7.724648334810423e-06, + "loss": 0.1057, + "step": 30788 + }, + { + "epoch": 0.6158, + "grad_norm": 0.023067915812134743, + "learning_rate": 7.72328871777176e-06, + "loss": 0.0036, + "step": 30790 + }, + { + "epoch": 0.61584, + "grad_norm": 0.02125619538128376, + "learning_rate": 7.721929145118762e-06, + "loss": 0.0015, + "step": 30792 + }, + { + "epoch": 0.61588, + "grad_norm": 1.9175662994384766, + "learning_rate": 7.720569616877924e-06, + "loss": 0.0146, + "step": 30794 + }, + { + "epoch": 0.61592, + "grad_norm": 0.04570529982447624, + "learning_rate": 7.719210133075759e-06, + "loss": 0.0008, + "step": 30796 + }, + { + "epoch": 0.61596, + "grad_norm": 0.00023096823133528233, + "learning_rate": 7.717850693738765e-06, + "loss": 0.0005, + "step": 30798 + }, + { + "epoch": 0.616, + "grad_norm": 0.04575316980481148, + "learning_rate": 7.716491298893443e-06, + "loss": 0.0005, + "step": 30800 + }, + { + "epoch": 0.61604, + "grad_norm": 0.003664563177153468, + "learning_rate": 7.715131948566304e-06, + "loss": 0.0002, + "step": 30802 + }, + { + "epoch": 0.61608, + "grad_norm": 0.054589007049798965, + "learning_rate": 7.713772642783838e-06, + "loss": 0.034, + "step": 30804 + }, + { + "epoch": 0.61612, + "grad_norm": 0.01629713922739029, + "learning_rate": 7.712413381572556e-06, + "loss": 0.0006, + "step": 30806 + }, + { + "epoch": 0.61616, + "grad_norm": 0.09964984655380249, + "learning_rate": 7.711054164958954e-06, + "loss": 0.0047, + "step": 30808 + }, + { + "epoch": 0.6162, + "grad_norm": 0.4837248623371124, + "learning_rate": 7.709694992969525e-06, + "loss": 0.0047, + "step": 30810 + }, + { + "epoch": 0.61624, + "grad_norm": 1.691934585571289, + "learning_rate": 7.708335865630775e-06, + "loss": 0.0134, + "step": 30812 + }, + { + "epoch": 0.61628, + "grad_norm": 0.021580029278993607, + "learning_rate": 7.706976782969198e-06, + "loss": 0.0033, + "step": 30814 + }, + { + "epoch": 0.61632, + "grad_norm": 0.014436133205890656, + "learning_rate": 7.705617745011284e-06, + "loss": 0.0006, + "step": 30816 + }, + { + "epoch": 0.61636, + "grad_norm": 0.8098128437995911, + "learning_rate": 7.704258751783537e-06, + "loss": 0.0073, + "step": 30818 + }, + { + "epoch": 0.6164, + "grad_norm": 0.002296419581398368, + "learning_rate": 7.702899803312443e-06, + "loss": 0.0001, + "step": 30820 + }, + { + "epoch": 0.61644, + "grad_norm": 0.05674051493406296, + "learning_rate": 7.701540899624505e-06, + "loss": 0.4889, + "step": 30822 + }, + { + "epoch": 0.61648, + "grad_norm": 0.6779454946517944, + "learning_rate": 7.70018204074621e-06, + "loss": 0.0056, + "step": 30824 + }, + { + "epoch": 0.61652, + "grad_norm": 15.569342613220215, + "learning_rate": 7.698823226704048e-06, + "loss": 0.4111, + "step": 30826 + }, + { + "epoch": 0.61656, + "grad_norm": 0.012770972214639187, + "learning_rate": 7.697464457524515e-06, + "loss": 0.0002, + "step": 30828 + }, + { + "epoch": 0.6166, + "grad_norm": 0.6806149482727051, + "learning_rate": 7.696105733234099e-06, + "loss": 0.0046, + "step": 30830 + }, + { + "epoch": 0.61664, + "grad_norm": 0.0024663018994033337, + "learning_rate": 7.69474705385928e-06, + "loss": 0.0001, + "step": 30832 + }, + { + "epoch": 0.61668, + "grad_norm": 0.2748136520385742, + "learning_rate": 7.69338841942656e-06, + "loss": 0.0039, + "step": 30834 + }, + { + "epoch": 0.61672, + "grad_norm": 0.05508193001151085, + "learning_rate": 7.692029829962417e-06, + "loss": 0.0009, + "step": 30836 + }, + { + "epoch": 0.61676, + "grad_norm": 0.016941804438829422, + "learning_rate": 7.690671285493344e-06, + "loss": 0.0058, + "step": 30838 + }, + { + "epoch": 0.6168, + "grad_norm": 0.7791293263435364, + "learning_rate": 7.689312786045823e-06, + "loss": 0.0085, + "step": 30840 + }, + { + "epoch": 0.61684, + "grad_norm": 0.010879590176045895, + "learning_rate": 7.687954331646337e-06, + "loss": 0.0004, + "step": 30842 + }, + { + "epoch": 0.61688, + "grad_norm": 0.3199196457862854, + "learning_rate": 7.686595922321372e-06, + "loss": 0.0034, + "step": 30844 + }, + { + "epoch": 0.61692, + "grad_norm": 0.0778098925948143, + "learning_rate": 7.685237558097412e-06, + "loss": 0.0653, + "step": 30846 + }, + { + "epoch": 0.61696, + "grad_norm": 0.3571108281612396, + "learning_rate": 7.683879239000936e-06, + "loss": 0.0042, + "step": 30848 + }, + { + "epoch": 0.617, + "grad_norm": 0.0027699992060661316, + "learning_rate": 7.68252096505843e-06, + "loss": 0.0149, + "step": 30850 + }, + { + "epoch": 0.61704, + "grad_norm": 0.05009458214044571, + "learning_rate": 7.681162736296364e-06, + "loss": 0.5687, + "step": 30852 + }, + { + "epoch": 0.61708, + "grad_norm": 0.013008958660066128, + "learning_rate": 7.67980455274123e-06, + "loss": 0.0002, + "step": 30854 + }, + { + "epoch": 0.61712, + "grad_norm": 0.023421961814165115, + "learning_rate": 7.6784464144195e-06, + "loss": 0.0016, + "step": 30856 + }, + { + "epoch": 0.61716, + "grad_norm": 3.485891342163086, + "learning_rate": 7.677088321357648e-06, + "loss": 0.2196, + "step": 30858 + }, + { + "epoch": 0.6172, + "grad_norm": 0.049520574510097504, + "learning_rate": 7.67573027358216e-06, + "loss": 0.0005, + "step": 30860 + }, + { + "epoch": 0.61724, + "grad_norm": 0.08457546681165695, + "learning_rate": 7.674372271119508e-06, + "loss": 0.0175, + "step": 30862 + }, + { + "epoch": 0.61728, + "grad_norm": 0.025286365300416946, + "learning_rate": 7.673014313996161e-06, + "loss": 0.0005, + "step": 30864 + }, + { + "epoch": 0.61732, + "grad_norm": 0.4059581160545349, + "learning_rate": 7.671656402238604e-06, + "loss": 0.0035, + "step": 30866 + }, + { + "epoch": 0.61736, + "grad_norm": 22.314308166503906, + "learning_rate": 7.670298535873298e-06, + "loss": 0.2271, + "step": 30868 + }, + { + "epoch": 0.6174, + "grad_norm": 0.30459198355674744, + "learning_rate": 7.668940714926724e-06, + "loss": 0.0036, + "step": 30870 + }, + { + "epoch": 0.61744, + "grad_norm": 0.0412948876619339, + "learning_rate": 7.667582939425351e-06, + "loss": 0.0011, + "step": 30872 + }, + { + "epoch": 0.61748, + "grad_norm": 0.028231091797351837, + "learning_rate": 7.666225209395648e-06, + "loss": 0.001, + "step": 30874 + }, + { + "epoch": 0.61752, + "grad_norm": 0.7136555314064026, + "learning_rate": 7.664867524864088e-06, + "loss": 0.0067, + "step": 30876 + }, + { + "epoch": 0.61756, + "grad_norm": 0.922121524810791, + "learning_rate": 7.66350988585714e-06, + "loss": 0.0131, + "step": 30878 + }, + { + "epoch": 0.6176, + "grad_norm": 0.09966624528169632, + "learning_rate": 7.662152292401265e-06, + "loss": 0.0033, + "step": 30880 + }, + { + "epoch": 0.61764, + "grad_norm": 3.5858850479125977, + "learning_rate": 7.660794744522938e-06, + "loss": 0.0307, + "step": 30882 + }, + { + "epoch": 0.61768, + "grad_norm": 0.002886170754209161, + "learning_rate": 7.659437242248617e-06, + "loss": 0.0133, + "step": 30884 + }, + { + "epoch": 0.61772, + "grad_norm": 0.09298573434352875, + "learning_rate": 7.658079785604776e-06, + "loss": 0.1385, + "step": 30886 + }, + { + "epoch": 0.61776, + "grad_norm": 0.1351897418498993, + "learning_rate": 7.656722374617876e-06, + "loss": 0.0046, + "step": 30888 + }, + { + "epoch": 0.6178, + "grad_norm": 0.6605213284492493, + "learning_rate": 7.655365009314375e-06, + "loss": 0.0284, + "step": 30890 + }, + { + "epoch": 0.61784, + "grad_norm": 9.75413703918457, + "learning_rate": 7.654007689720746e-06, + "loss": 0.2507, + "step": 30892 + }, + { + "epoch": 0.61788, + "grad_norm": 0.10200540721416473, + "learning_rate": 7.652650415863443e-06, + "loss": 0.0015, + "step": 30894 + }, + { + "epoch": 0.61792, + "grad_norm": 0.04953261837363243, + "learning_rate": 7.651293187768927e-06, + "loss": 0.4305, + "step": 30896 + }, + { + "epoch": 0.61796, + "grad_norm": 0.004517833702266216, + "learning_rate": 7.649936005463662e-06, + "loss": 0.0001, + "step": 30898 + }, + { + "epoch": 0.618, + "grad_norm": 0.33340659737586975, + "learning_rate": 7.6485788689741e-06, + "loss": 0.0035, + "step": 30900 + }, + { + "epoch": 0.61804, + "grad_norm": 0.06935939937829971, + "learning_rate": 7.647221778326708e-06, + "loss": 0.0008, + "step": 30902 + }, + { + "epoch": 0.61808, + "grad_norm": 0.014343783259391785, + "learning_rate": 7.645864733547938e-06, + "loss": 0.0007, + "step": 30904 + }, + { + "epoch": 0.61812, + "grad_norm": 0.37137898802757263, + "learning_rate": 7.644507734664244e-06, + "loss": 0.0176, + "step": 30906 + }, + { + "epoch": 0.61816, + "grad_norm": 0.09416356682777405, + "learning_rate": 7.643150781702087e-06, + "loss": 0.0066, + "step": 30908 + }, + { + "epoch": 0.6182, + "grad_norm": 0.30363762378692627, + "learning_rate": 7.641793874687918e-06, + "loss": 0.0445, + "step": 30910 + }, + { + "epoch": 0.61824, + "grad_norm": 0.01165077742189169, + "learning_rate": 7.640437013648191e-06, + "loss": 0.876, + "step": 30912 + }, + { + "epoch": 0.61828, + "grad_norm": 0.23707714676856995, + "learning_rate": 7.639080198609363e-06, + "loss": 0.0111, + "step": 30914 + }, + { + "epoch": 0.61832, + "grad_norm": 0.013839238323271275, + "learning_rate": 7.637723429597878e-06, + "loss": 0.0134, + "step": 30916 + }, + { + "epoch": 0.61836, + "grad_norm": 0.6706051826477051, + "learning_rate": 7.636366706640193e-06, + "loss": 0.0102, + "step": 30918 + }, + { + "epoch": 0.6184, + "grad_norm": 0.9622282981872559, + "learning_rate": 7.635010029762755e-06, + "loss": 0.0089, + "step": 30920 + }, + { + "epoch": 0.61844, + "grad_norm": 0.3764236271381378, + "learning_rate": 7.633653398992013e-06, + "loss": 0.004, + "step": 30922 + }, + { + "epoch": 0.61848, + "grad_norm": 0.15521249175071716, + "learning_rate": 7.632296814354418e-06, + "loss": 0.151, + "step": 30924 + }, + { + "epoch": 0.61852, + "grad_norm": 0.29471978545188904, + "learning_rate": 7.630940275876413e-06, + "loss": 0.0121, + "step": 30926 + }, + { + "epoch": 0.61856, + "grad_norm": 0.23145373165607452, + "learning_rate": 7.62958378358445e-06, + "loss": 0.0045, + "step": 30928 + }, + { + "epoch": 0.6186, + "grad_norm": 0.002373849041759968, + "learning_rate": 7.628227337504972e-06, + "loss": 0.0239, + "step": 30930 + }, + { + "epoch": 0.61864, + "grad_norm": 0.4711386263370514, + "learning_rate": 7.62687093766442e-06, + "loss": 0.0051, + "step": 30932 + }, + { + "epoch": 0.61868, + "grad_norm": 0.08394095301628113, + "learning_rate": 7.625514584089244e-06, + "loss": 0.0199, + "step": 30934 + }, + { + "epoch": 0.61872, + "grad_norm": 1.111007809638977, + "learning_rate": 7.624158276805883e-06, + "loss": 0.0133, + "step": 30936 + }, + { + "epoch": 0.61876, + "grad_norm": 0.12961189448833466, + "learning_rate": 7.622802015840775e-06, + "loss": 0.002, + "step": 30938 + }, + { + "epoch": 0.6188, + "grad_norm": 0.10581119358539581, + "learning_rate": 7.621445801220372e-06, + "loss": 0.0048, + "step": 30940 + }, + { + "epoch": 0.61884, + "grad_norm": 1.7240737676620483, + "learning_rate": 7.620089632971102e-06, + "loss": 0.025, + "step": 30942 + }, + { + "epoch": 0.61888, + "grad_norm": 0.27604350447654724, + "learning_rate": 7.6187335111194136e-06, + "loss": 0.0035, + "step": 30944 + }, + { + "epoch": 0.61892, + "grad_norm": 4.507542133331299, + "learning_rate": 7.617377435691742e-06, + "loss": 0.0799, + "step": 30946 + }, + { + "epoch": 0.61896, + "grad_norm": 0.6847659945487976, + "learning_rate": 7.616021406714522e-06, + "loss": 0.0085, + "step": 30948 + }, + { + "epoch": 0.619, + "grad_norm": 0.06730266660451889, + "learning_rate": 7.6146654242141935e-06, + "loss": 0.0028, + "step": 30950 + }, + { + "epoch": 0.61904, + "grad_norm": 0.7125744819641113, + "learning_rate": 7.613309488217191e-06, + "loss": 0.0165, + "step": 30952 + }, + { + "epoch": 0.61908, + "grad_norm": 0.0006864942843094468, + "learning_rate": 7.611953598749944e-06, + "loss": 0.3545, + "step": 30954 + }, + { + "epoch": 0.61912, + "grad_norm": 0.15674948692321777, + "learning_rate": 7.6105977558388955e-06, + "loss": 0.0038, + "step": 30956 + }, + { + "epoch": 0.61916, + "grad_norm": 3.7913835048675537, + "learning_rate": 7.6092419595104705e-06, + "loss": 0.047, + "step": 30958 + }, + { + "epoch": 0.6192, + "grad_norm": 0.0007697112741880119, + "learning_rate": 7.6078862097911075e-06, + "loss": 0.0045, + "step": 30960 + }, + { + "epoch": 0.61924, + "grad_norm": 1.3883320093154907, + "learning_rate": 7.606530506707235e-06, + "loss": 0.0401, + "step": 30962 + }, + { + "epoch": 0.61928, + "grad_norm": 0.07566668093204498, + "learning_rate": 7.60517485028528e-06, + "loss": 0.0175, + "step": 30964 + }, + { + "epoch": 0.61932, + "grad_norm": 0.24673788249492645, + "learning_rate": 7.603819240551678e-06, + "loss": 0.0042, + "step": 30966 + }, + { + "epoch": 0.61936, + "grad_norm": 0.2567690908908844, + "learning_rate": 7.60246367753285e-06, + "loss": 0.0034, + "step": 30968 + }, + { + "epoch": 0.6194, + "grad_norm": 0.03302746266126633, + "learning_rate": 7.6011081612552265e-06, + "loss": 0.0023, + "step": 30970 + }, + { + "epoch": 0.61944, + "grad_norm": 0.1019730493426323, + "learning_rate": 7.599752691745237e-06, + "loss": 0.0179, + "step": 30972 + }, + { + "epoch": 0.61948, + "grad_norm": 0.15890946984291077, + "learning_rate": 7.598397269029301e-06, + "loss": 0.0441, + "step": 30974 + }, + { + "epoch": 0.61952, + "grad_norm": 0.03528108075261116, + "learning_rate": 7.5970418931338495e-06, + "loss": 0.0009, + "step": 30976 + }, + { + "epoch": 0.61956, + "grad_norm": 0.0035813050344586372, + "learning_rate": 7.595686564085304e-06, + "loss": 0.0007, + "step": 30978 + }, + { + "epoch": 0.6196, + "grad_norm": 0.06648768484592438, + "learning_rate": 7.594331281910082e-06, + "loss": 0.0016, + "step": 30980 + }, + { + "epoch": 0.61964, + "grad_norm": 0.04314457252621651, + "learning_rate": 7.592976046634616e-06, + "loss": 0.0008, + "step": 30982 + }, + { + "epoch": 0.61968, + "grad_norm": 0.06159967929124832, + "learning_rate": 7.59162085828532e-06, + "loss": 0.0007, + "step": 30984 + }, + { + "epoch": 0.61972, + "grad_norm": 0.032844629138708115, + "learning_rate": 7.590265716888613e-06, + "loss": 0.0004, + "step": 30986 + }, + { + "epoch": 0.61976, + "grad_norm": 1.536316990852356, + "learning_rate": 7.58891062247092e-06, + "loss": 0.0243, + "step": 30988 + }, + { + "epoch": 0.6198, + "grad_norm": 0.7120261192321777, + "learning_rate": 7.58755557505865e-06, + "loss": 0.0087, + "step": 30990 + }, + { + "epoch": 0.61984, + "grad_norm": 0.010984672233462334, + "learning_rate": 7.586200574678231e-06, + "loss": 0.0006, + "step": 30992 + }, + { + "epoch": 0.61988, + "grad_norm": 0.4420309364795685, + "learning_rate": 7.584845621356074e-06, + "loss": 0.0094, + "step": 30994 + }, + { + "epoch": 0.61992, + "grad_norm": 2.827888011932373, + "learning_rate": 7.583490715118591e-06, + "loss": 0.0323, + "step": 30996 + }, + { + "epoch": 0.61996, + "grad_norm": 0.3102402985095978, + "learning_rate": 7.582135855992205e-06, + "loss": 0.0034, + "step": 30998 + }, + { + "epoch": 0.62, + "grad_norm": 0.389903724193573, + "learning_rate": 7.580781044003324e-06, + "loss": 0.0051, + "step": 31000 + }, + { + "epoch": 0.62004, + "grad_norm": 0.4298730194568634, + "learning_rate": 7.579426279178362e-06, + "loss": 0.2708, + "step": 31002 + }, + { + "epoch": 0.62008, + "grad_norm": 6.930491924285889, + "learning_rate": 7.578071561543732e-06, + "loss": 0.1183, + "step": 31004 + }, + { + "epoch": 0.62012, + "grad_norm": 0.39811283349990845, + "learning_rate": 7.57671689112584e-06, + "loss": 0.0184, + "step": 31006 + }, + { + "epoch": 0.62016, + "grad_norm": 0.5465975999832153, + "learning_rate": 7.5753622679511044e-06, + "loss": 0.0058, + "step": 31008 + }, + { + "epoch": 0.6202, + "grad_norm": 0.19249533116817474, + "learning_rate": 7.574007692045928e-06, + "loss": 0.0031, + "step": 31010 + }, + { + "epoch": 0.62024, + "grad_norm": 0.22211775183677673, + "learning_rate": 7.572653163436718e-06, + "loss": 0.0029, + "step": 31012 + }, + { + "epoch": 0.62028, + "grad_norm": 0.0562053918838501, + "learning_rate": 7.571298682149888e-06, + "loss": 0.1063, + "step": 31014 + }, + { + "epoch": 0.62032, + "grad_norm": 0.15118162333965302, + "learning_rate": 7.569944248211838e-06, + "loss": 0.0035, + "step": 31016 + }, + { + "epoch": 0.62036, + "grad_norm": 0.106383316218853, + "learning_rate": 7.568589861648978e-06, + "loss": 0.0061, + "step": 31018 + }, + { + "epoch": 0.6204, + "grad_norm": 0.0017797602340579033, + "learning_rate": 7.5672355224877115e-06, + "loss": 0.0002, + "step": 31020 + }, + { + "epoch": 0.62044, + "grad_norm": 0.20290738344192505, + "learning_rate": 7.565881230754436e-06, + "loss": 0.0058, + "step": 31022 + }, + { + "epoch": 0.62048, + "grad_norm": 0.4846250116825104, + "learning_rate": 7.564526986475563e-06, + "loss": 0.0057, + "step": 31024 + }, + { + "epoch": 0.62052, + "grad_norm": 0.05928203836083412, + "learning_rate": 7.563172789677491e-06, + "loss": 0.0078, + "step": 31026 + }, + { + "epoch": 0.62056, + "grad_norm": 0.006197931244969368, + "learning_rate": 7.5618186403866155e-06, + "loss": 0.0027, + "step": 31028 + }, + { + "epoch": 0.6206, + "grad_norm": 0.033299464732408524, + "learning_rate": 7.560464538629345e-06, + "loss": 0.0174, + "step": 31030 + }, + { + "epoch": 0.62064, + "grad_norm": 0.2095312923192978, + "learning_rate": 7.559110484432073e-06, + "loss": 0.0081, + "step": 31032 + }, + { + "epoch": 0.62068, + "grad_norm": 0.10088931024074554, + "learning_rate": 7.557756477821199e-06, + "loss": 0.0037, + "step": 31034 + }, + { + "epoch": 0.62072, + "grad_norm": 0.21590295433998108, + "learning_rate": 7.55640251882312e-06, + "loss": 0.0022, + "step": 31036 + }, + { + "epoch": 0.62076, + "grad_norm": 3.808238983154297, + "learning_rate": 7.55504860746423e-06, + "loss": 0.0502, + "step": 31038 + }, + { + "epoch": 0.6208, + "grad_norm": 0.005622537806630135, + "learning_rate": 7.553694743770928e-06, + "loss": 0.1759, + "step": 31040 + }, + { + "epoch": 0.62084, + "grad_norm": 2.303985118865967, + "learning_rate": 7.552340927769608e-06, + "loss": 0.0342, + "step": 31042 + }, + { + "epoch": 0.62088, + "grad_norm": 0.01572628691792488, + "learning_rate": 7.550987159486657e-06, + "loss": 0.0041, + "step": 31044 + }, + { + "epoch": 0.62092, + "grad_norm": 0.019247805699706078, + "learning_rate": 7.549633438948476e-06, + "loss": 0.0007, + "step": 31046 + }, + { + "epoch": 0.62096, + "grad_norm": 0.12699012458324432, + "learning_rate": 7.548279766181451e-06, + "loss": 0.0021, + "step": 31048 + }, + { + "epoch": 0.621, + "grad_norm": 0.016951784491539, + "learning_rate": 7.546926141211975e-06, + "loss": 0.0022, + "step": 31050 + }, + { + "epoch": 0.62104, + "grad_norm": 0.001140800304710865, + "learning_rate": 7.545572564066436e-06, + "loss": 0.0003, + "step": 31052 + }, + { + "epoch": 0.62108, + "grad_norm": 4.157416343688965, + "learning_rate": 7.544219034771225e-06, + "loss": 0.045, + "step": 31054 + }, + { + "epoch": 0.62112, + "grad_norm": 0.46072152256965637, + "learning_rate": 7.542865553352728e-06, + "loss": 0.035, + "step": 31056 + }, + { + "epoch": 0.62116, + "grad_norm": 0.9790863990783691, + "learning_rate": 7.541512119837333e-06, + "loss": 0.0155, + "step": 31058 + }, + { + "epoch": 0.6212, + "grad_norm": 0.3963067829608917, + "learning_rate": 7.54015873425142e-06, + "loss": 0.0036, + "step": 31060 + }, + { + "epoch": 0.62124, + "grad_norm": 0.15027640759944916, + "learning_rate": 7.538805396621384e-06, + "loss": 0.0028, + "step": 31062 + }, + { + "epoch": 0.62128, + "grad_norm": 0.10034507513046265, + "learning_rate": 7.5374521069736e-06, + "loss": 0.0012, + "step": 31064 + }, + { + "epoch": 0.62132, + "grad_norm": 0.020767942070961, + "learning_rate": 7.536098865334458e-06, + "loss": 0.0002, + "step": 31066 + }, + { + "epoch": 0.62136, + "grad_norm": 0.07826952636241913, + "learning_rate": 7.534745671730337e-06, + "loss": 0.0009, + "step": 31068 + }, + { + "epoch": 0.6214, + "grad_norm": 0.014354903250932693, + "learning_rate": 7.533392526187617e-06, + "loss": 0.0002, + "step": 31070 + }, + { + "epoch": 0.62144, + "grad_norm": 0.11711080372333527, + "learning_rate": 7.532039428732681e-06, + "loss": 0.0046, + "step": 31072 + }, + { + "epoch": 0.62148, + "grad_norm": 0.08009708672761917, + "learning_rate": 7.530686379391908e-06, + "loss": 0.0009, + "step": 31074 + }, + { + "epoch": 0.62152, + "grad_norm": 0.1266847848892212, + "learning_rate": 7.52933337819167e-06, + "loss": 0.0116, + "step": 31076 + }, + { + "epoch": 0.62156, + "grad_norm": 0.002535708947107196, + "learning_rate": 7.527980425158355e-06, + "loss": 0.0093, + "step": 31078 + }, + { + "epoch": 0.6216, + "grad_norm": 0.16278693079948425, + "learning_rate": 7.526627520318329e-06, + "loss": 0.0028, + "step": 31080 + }, + { + "epoch": 0.62164, + "grad_norm": 0.26781487464904785, + "learning_rate": 7.525274663697978e-06, + "loss": 0.0029, + "step": 31082 + }, + { + "epoch": 0.62168, + "grad_norm": 0.1088237315416336, + "learning_rate": 7.523921855323672e-06, + "loss": 0.0013, + "step": 31084 + }, + { + "epoch": 0.62172, + "grad_norm": 0.20916813611984253, + "learning_rate": 7.522569095221781e-06, + "loss": 0.4127, + "step": 31086 + }, + { + "epoch": 0.62176, + "grad_norm": 0.028380922973155975, + "learning_rate": 7.521216383418683e-06, + "loss": 0.0004, + "step": 31088 + }, + { + "epoch": 0.6218, + "grad_norm": 0.4700152277946472, + "learning_rate": 7.519863719940748e-06, + "loss": 0.0058, + "step": 31090 + }, + { + "epoch": 0.62184, + "grad_norm": 0.17666727304458618, + "learning_rate": 7.518511104814343e-06, + "loss": 0.0024, + "step": 31092 + }, + { + "epoch": 0.62188, + "grad_norm": 0.15574225783348083, + "learning_rate": 7.517158538065845e-06, + "loss": 0.003, + "step": 31094 + }, + { + "epoch": 0.62192, + "grad_norm": 0.22271984815597534, + "learning_rate": 7.515806019721615e-06, + "loss": 0.0017, + "step": 31096 + }, + { + "epoch": 0.62196, + "grad_norm": 0.010996654629707336, + "learning_rate": 7.514453549808032e-06, + "loss": 0.0025, + "step": 31098 + }, + { + "epoch": 0.622, + "grad_norm": 15.7056884765625, + "learning_rate": 7.513101128351454e-06, + "loss": 1.2788, + "step": 31100 + }, + { + "epoch": 0.62204, + "grad_norm": 0.1964249610900879, + "learning_rate": 7.51174875537825e-06, + "loss": 0.0024, + "step": 31102 + }, + { + "epoch": 0.62208, + "grad_norm": 7.494671821594238, + "learning_rate": 7.5103964309147855e-06, + "loss": 0.1396, + "step": 31104 + }, + { + "epoch": 0.62212, + "grad_norm": 0.009905073791742325, + "learning_rate": 7.509044154987423e-06, + "loss": 0.0018, + "step": 31106 + }, + { + "epoch": 0.62216, + "grad_norm": 1.0586129426956177, + "learning_rate": 7.507691927622527e-06, + "loss": 0.0196, + "step": 31108 + }, + { + "epoch": 0.6222, + "grad_norm": 1.1705061197280884, + "learning_rate": 7.506339748846461e-06, + "loss": 0.0156, + "step": 31110 + }, + { + "epoch": 0.62224, + "grad_norm": 0.09493441879749298, + "learning_rate": 7.504987618685582e-06, + "loss": 0.285, + "step": 31112 + }, + { + "epoch": 0.62228, + "grad_norm": 0.0006764042191207409, + "learning_rate": 7.503635537166258e-06, + "loss": 0.0039, + "step": 31114 + }, + { + "epoch": 0.62232, + "grad_norm": 0.08971883356571198, + "learning_rate": 7.502283504314846e-06, + "loss": 0.3193, + "step": 31116 + }, + { + "epoch": 0.62236, + "grad_norm": 0.31529563665390015, + "learning_rate": 7.500931520157697e-06, + "loss": 0.0031, + "step": 31118 + }, + { + "epoch": 0.6224, + "grad_norm": 3.5258233547210693, + "learning_rate": 7.49957958472118e-06, + "loss": 0.0527, + "step": 31120 + }, + { + "epoch": 0.62244, + "grad_norm": 0.039310481399297714, + "learning_rate": 7.4982276980316426e-06, + "loss": 0.0425, + "step": 31122 + }, + { + "epoch": 0.62248, + "grad_norm": 0.5605636239051819, + "learning_rate": 7.496875860115447e-06, + "loss": 0.0144, + "step": 31124 + }, + { + "epoch": 0.62252, + "grad_norm": 0.10991386324167252, + "learning_rate": 7.4955240709989465e-06, + "loss": 0.0016, + "step": 31126 + }, + { + "epoch": 0.62256, + "grad_norm": 0.013063249178230762, + "learning_rate": 7.4941723307084875e-06, + "loss": 0.024, + "step": 31128 + }, + { + "epoch": 0.6226, + "grad_norm": 0.7872563600540161, + "learning_rate": 7.492820639270435e-06, + "loss": 0.0186, + "step": 31130 + }, + { + "epoch": 0.62264, + "grad_norm": 0.6672159433364868, + "learning_rate": 7.491468996711133e-06, + "loss": 0.0111, + "step": 31132 + }, + { + "epoch": 0.62268, + "grad_norm": 0.04978490248322487, + "learning_rate": 7.490117403056931e-06, + "loss": 0.0967, + "step": 31134 + }, + { + "epoch": 0.62272, + "grad_norm": 3.429258108139038, + "learning_rate": 7.488765858334188e-06, + "loss": 0.046, + "step": 31136 + }, + { + "epoch": 0.62276, + "grad_norm": 0.1194697916507721, + "learning_rate": 7.487414362569243e-06, + "loss": 0.0017, + "step": 31138 + }, + { + "epoch": 0.6228, + "grad_norm": 1.1191067695617676, + "learning_rate": 7.486062915788453e-06, + "loss": 0.1188, + "step": 31140 + }, + { + "epoch": 0.62284, + "grad_norm": 0.11091174185276031, + "learning_rate": 7.48471151801816e-06, + "loss": 0.016, + "step": 31142 + }, + { + "epoch": 0.62288, + "grad_norm": 15.308854103088379, + "learning_rate": 7.483360169284706e-06, + "loss": 0.6102, + "step": 31144 + }, + { + "epoch": 0.62292, + "grad_norm": 0.003580351360142231, + "learning_rate": 7.482008869614446e-06, + "loss": 0.0017, + "step": 31146 + }, + { + "epoch": 0.62296, + "grad_norm": 0.009906753897666931, + "learning_rate": 7.480657619033718e-06, + "loss": 0.0013, + "step": 31148 + }, + { + "epoch": 0.623, + "grad_norm": 0.8743605613708496, + "learning_rate": 7.4793064175688635e-06, + "loss": 0.0094, + "step": 31150 + }, + { + "epoch": 0.62304, + "grad_norm": 1.092905044555664, + "learning_rate": 7.4779552652462325e-06, + "loss": 0.0134, + "step": 31152 + }, + { + "epoch": 0.62308, + "grad_norm": 0.013674641959369183, + "learning_rate": 7.47660416209216e-06, + "loss": 0.0004, + "step": 31154 + }, + { + "epoch": 0.62312, + "grad_norm": 0.25882506370544434, + "learning_rate": 7.47525310813299e-06, + "loss": 0.0038, + "step": 31156 + }, + { + "epoch": 0.62316, + "grad_norm": 0.39943620562553406, + "learning_rate": 7.473902103395061e-06, + "loss": 0.0232, + "step": 31158 + }, + { + "epoch": 0.6232, + "grad_norm": 0.051459431648254395, + "learning_rate": 7.472551147904708e-06, + "loss": 0.0039, + "step": 31160 + }, + { + "epoch": 0.62324, + "grad_norm": 0.18015632033348083, + "learning_rate": 7.471200241688274e-06, + "loss": 0.0173, + "step": 31162 + }, + { + "epoch": 0.62328, + "grad_norm": 0.16177304089069366, + "learning_rate": 7.469849384772093e-06, + "loss": 0.0552, + "step": 31164 + }, + { + "epoch": 0.62332, + "grad_norm": 0.277736634016037, + "learning_rate": 7.468498577182498e-06, + "loss": 0.0067, + "step": 31166 + }, + { + "epoch": 0.62336, + "grad_norm": 0.7509629130363464, + "learning_rate": 7.4671478189458305e-06, + "loss": 0.0167, + "step": 31168 + }, + { + "epoch": 0.6234, + "grad_norm": 0.11838442832231522, + "learning_rate": 7.465797110088417e-06, + "loss": 0.0024, + "step": 31170 + }, + { + "epoch": 0.62344, + "grad_norm": 0.0399327352643013, + "learning_rate": 7.464446450636597e-06, + "loss": 0.3735, + "step": 31172 + }, + { + "epoch": 0.62348, + "grad_norm": 0.005077596753835678, + "learning_rate": 7.463095840616697e-06, + "loss": 0.0045, + "step": 31174 + }, + { + "epoch": 0.62352, + "grad_norm": 0.24496281147003174, + "learning_rate": 7.461745280055049e-06, + "loss": 0.0067, + "step": 31176 + }, + { + "epoch": 0.62356, + "grad_norm": 8.386669158935547, + "learning_rate": 7.4603947689779845e-06, + "loss": 0.106, + "step": 31178 + }, + { + "epoch": 0.6236, + "grad_norm": 3.5129411220550537, + "learning_rate": 7.4590443074118325e-06, + "loss": 0.0822, + "step": 31180 + }, + { + "epoch": 0.62364, + "grad_norm": 1.0095124244689941, + "learning_rate": 7.457693895382916e-06, + "loss": 0.0118, + "step": 31182 + }, + { + "epoch": 0.62368, + "grad_norm": 3.692737579345703, + "learning_rate": 7.45634353291757e-06, + "loss": 0.0524, + "step": 31184 + }, + { + "epoch": 0.62372, + "grad_norm": 0.30676695704460144, + "learning_rate": 7.45499322004211e-06, + "loss": 0.0336, + "step": 31186 + }, + { + "epoch": 0.62376, + "grad_norm": 0.04992116242647171, + "learning_rate": 7.453642956782875e-06, + "loss": 0.0063, + "step": 31188 + }, + { + "epoch": 0.6238, + "grad_norm": 0.7834407091140747, + "learning_rate": 7.4522927431661805e-06, + "loss": 0.0084, + "step": 31190 + }, + { + "epoch": 0.62384, + "grad_norm": 0.13057416677474976, + "learning_rate": 7.450942579218348e-06, + "loss": 0.0095, + "step": 31192 + }, + { + "epoch": 0.62388, + "grad_norm": 0.01569710671901703, + "learning_rate": 7.449592464965704e-06, + "loss": 0.0003, + "step": 31194 + }, + { + "epoch": 0.62392, + "grad_norm": 3.060992479324341, + "learning_rate": 7.448242400434569e-06, + "loss": 0.0409, + "step": 31196 + }, + { + "epoch": 0.62396, + "grad_norm": 2.2794458866119385, + "learning_rate": 7.446892385651256e-06, + "loss": 0.093, + "step": 31198 + }, + { + "epoch": 0.624, + "grad_norm": 0.05219431594014168, + "learning_rate": 7.445542420642097e-06, + "loss": 0.008, + "step": 31200 + }, + { + "epoch": 0.62404, + "grad_norm": 0.654983639717102, + "learning_rate": 7.444192505433399e-06, + "loss": 0.4769, + "step": 31202 + }, + { + "epoch": 0.62408, + "grad_norm": 10.103184700012207, + "learning_rate": 7.442842640051487e-06, + "loss": 0.1324, + "step": 31204 + }, + { + "epoch": 0.62412, + "grad_norm": 0.8013262748718262, + "learning_rate": 7.441492824522674e-06, + "loss": 0.0127, + "step": 31206 + }, + { + "epoch": 0.62416, + "grad_norm": 1.0597976446151733, + "learning_rate": 7.440143058873275e-06, + "loss": 0.0138, + "step": 31208 + }, + { + "epoch": 0.6242, + "grad_norm": 0.2812664210796356, + "learning_rate": 7.438793343129605e-06, + "loss": 0.0372, + "step": 31210 + }, + { + "epoch": 0.62424, + "grad_norm": 1.0592083930969238, + "learning_rate": 7.43744367731798e-06, + "loss": 0.0123, + "step": 31212 + }, + { + "epoch": 0.62428, + "grad_norm": 5.940463542938232, + "learning_rate": 7.436094061464704e-06, + "loss": 0.573, + "step": 31214 + }, + { + "epoch": 0.62432, + "grad_norm": 0.014451706781983376, + "learning_rate": 7.434744495596098e-06, + "loss": 0.0119, + "step": 31216 + }, + { + "epoch": 0.62436, + "grad_norm": 0.4617650806903839, + "learning_rate": 7.433394979738465e-06, + "loss": 0.0057, + "step": 31218 + }, + { + "epoch": 0.6244, + "grad_norm": 14.801005363464355, + "learning_rate": 7.432045513918122e-06, + "loss": 0.9719, + "step": 31220 + }, + { + "epoch": 0.62444, + "grad_norm": 0.3942791819572449, + "learning_rate": 7.430696098161377e-06, + "loss": 0.053, + "step": 31222 + }, + { + "epoch": 0.62448, + "grad_norm": 0.004160311073064804, + "learning_rate": 7.429346732494529e-06, + "loss": 0.1265, + "step": 31224 + }, + { + "epoch": 0.62452, + "grad_norm": 0.43840622901916504, + "learning_rate": 7.427997416943893e-06, + "loss": 0.0068, + "step": 31226 + }, + { + "epoch": 0.62456, + "grad_norm": 0.14308203756809235, + "learning_rate": 7.426648151535767e-06, + "loss": 0.0047, + "step": 31228 + }, + { + "epoch": 0.6246, + "grad_norm": 0.35600849986076355, + "learning_rate": 7.4252989362964635e-06, + "loss": 0.0056, + "step": 31230 + }, + { + "epoch": 0.62464, + "grad_norm": 1.7647074460983276, + "learning_rate": 7.423949771252283e-06, + "loss": 0.0269, + "step": 31232 + }, + { + "epoch": 0.62468, + "grad_norm": 0.016198808327317238, + "learning_rate": 7.422600656429525e-06, + "loss": 0.0021, + "step": 31234 + }, + { + "epoch": 0.62472, + "grad_norm": 1.3655314445495605, + "learning_rate": 7.421251591854498e-06, + "loss": 0.0216, + "step": 31236 + }, + { + "epoch": 0.62476, + "grad_norm": 0.021238747984170914, + "learning_rate": 7.419902577553498e-06, + "loss": 0.0008, + "step": 31238 + }, + { + "epoch": 0.6248, + "grad_norm": 0.004898239858448505, + "learning_rate": 7.418553613552824e-06, + "loss": 0.0213, + "step": 31240 + }, + { + "epoch": 0.62484, + "grad_norm": 0.12263903766870499, + "learning_rate": 7.417204699878777e-06, + "loss": 0.0017, + "step": 31242 + }, + { + "epoch": 0.62488, + "grad_norm": 0.028845705091953278, + "learning_rate": 7.4158558365576525e-06, + "loss": 0.0038, + "step": 31244 + }, + { + "epoch": 0.62492, + "grad_norm": 0.034109920263290405, + "learning_rate": 7.414507023615754e-06, + "loss": 0.0153, + "step": 31246 + }, + { + "epoch": 0.62496, + "grad_norm": 0.6218048930168152, + "learning_rate": 7.413158261079369e-06, + "loss": 0.0393, + "step": 31248 + }, + { + "epoch": 0.625, + "grad_norm": 0.12852460145950317, + "learning_rate": 7.411809548974792e-06, + "loss": 0.0056, + "step": 31250 + }, + { + "epoch": 0.62504, + "grad_norm": 0.015783216804265976, + "learning_rate": 7.410460887328326e-06, + "loss": 0.0011, + "step": 31252 + }, + { + "epoch": 0.62508, + "grad_norm": 5.221416473388672, + "learning_rate": 7.409112276166257e-06, + "loss": 0.4191, + "step": 31254 + }, + { + "epoch": 0.62512, + "grad_norm": 0.3159046769142151, + "learning_rate": 7.407763715514873e-06, + "loss": 0.0516, + "step": 31256 + }, + { + "epoch": 0.62516, + "grad_norm": 0.08196666091680527, + "learning_rate": 7.406415205400475e-06, + "loss": 0.0075, + "step": 31258 + }, + { + "epoch": 0.6252, + "grad_norm": 12.138805389404297, + "learning_rate": 7.405066745849347e-06, + "loss": 0.3265, + "step": 31260 + }, + { + "epoch": 0.62524, + "grad_norm": 0.08773002028465271, + "learning_rate": 7.40371833688778e-06, + "loss": 0.0013, + "step": 31262 + }, + { + "epoch": 0.62528, + "grad_norm": 0.001173201366327703, + "learning_rate": 7.4023699785420585e-06, + "loss": 0.0001, + "step": 31264 + }, + { + "epoch": 0.62532, + "grad_norm": 0.1587437391281128, + "learning_rate": 7.401021670838471e-06, + "loss": 0.0601, + "step": 31266 + }, + { + "epoch": 0.62536, + "grad_norm": 0.12430432438850403, + "learning_rate": 7.399673413803305e-06, + "loss": 0.0043, + "step": 31268 + }, + { + "epoch": 0.6254, + "grad_norm": 0.9547127485275269, + "learning_rate": 7.398325207462846e-06, + "loss": 0.0125, + "step": 31270 + }, + { + "epoch": 0.62544, + "grad_norm": 0.21513010561466217, + "learning_rate": 7.396977051843373e-06, + "loss": 0.0676, + "step": 31272 + }, + { + "epoch": 0.62548, + "grad_norm": 0.058136556297540665, + "learning_rate": 7.395628946971176e-06, + "loss": 0.004, + "step": 31274 + }, + { + "epoch": 0.62552, + "grad_norm": 0.1288098394870758, + "learning_rate": 7.394280892872531e-06, + "loss": 0.0023, + "step": 31276 + }, + { + "epoch": 0.62556, + "grad_norm": 3.8783020973205566, + "learning_rate": 7.392932889573725e-06, + "loss": 0.0527, + "step": 31278 + }, + { + "epoch": 0.6256, + "grad_norm": 0.058138247579336166, + "learning_rate": 7.391584937101034e-06, + "loss": 0.0012, + "step": 31280 + }, + { + "epoch": 0.62564, + "grad_norm": 0.08488816767930984, + "learning_rate": 7.390237035480733e-06, + "loss": 0.0018, + "step": 31282 + }, + { + "epoch": 0.62568, + "grad_norm": 0.39730650186538696, + "learning_rate": 7.3888891847391085e-06, + "loss": 0.0062, + "step": 31284 + }, + { + "epoch": 0.62572, + "grad_norm": 0.596539318561554, + "learning_rate": 7.387541384902435e-06, + "loss": 0.0094, + "step": 31286 + }, + { + "epoch": 0.62576, + "grad_norm": 0.011292646639049053, + "learning_rate": 7.386193635996981e-06, + "loss": 0.0065, + "step": 31288 + }, + { + "epoch": 0.6258, + "grad_norm": 0.5311620831489563, + "learning_rate": 7.384845938049033e-06, + "loss": 0.0091, + "step": 31290 + }, + { + "epoch": 0.62584, + "grad_norm": 0.014415093697607517, + "learning_rate": 7.383498291084857e-06, + "loss": 0.0066, + "step": 31292 + }, + { + "epoch": 0.62588, + "grad_norm": 0.08430857956409454, + "learning_rate": 7.382150695130732e-06, + "loss": 0.0156, + "step": 31294 + }, + { + "epoch": 0.62592, + "grad_norm": 0.26926249265670776, + "learning_rate": 7.3808031502129275e-06, + "loss": 0.0045, + "step": 31296 + }, + { + "epoch": 0.62596, + "grad_norm": 0.06867936253547668, + "learning_rate": 7.379455656357708e-06, + "loss": 0.001, + "step": 31298 + }, + { + "epoch": 0.626, + "grad_norm": 0.07374703884124756, + "learning_rate": 7.378108213591355e-06, + "loss": 0.0013, + "step": 31300 + }, + { + "epoch": 0.62604, + "grad_norm": 0.7902337312698364, + "learning_rate": 7.376760821940132e-06, + "loss": 0.0751, + "step": 31302 + }, + { + "epoch": 0.62608, + "grad_norm": 0.9392643570899963, + "learning_rate": 7.375413481430303e-06, + "loss": 0.0158, + "step": 31304 + }, + { + "epoch": 0.62612, + "grad_norm": 0.19996482133865356, + "learning_rate": 7.374066192088143e-06, + "loss": 0.0088, + "step": 31306 + }, + { + "epoch": 0.62616, + "grad_norm": 0.18948613107204437, + "learning_rate": 7.372718953939914e-06, + "loss": 0.0891, + "step": 31308 + }, + { + "epoch": 0.6262, + "grad_norm": 16.796262741088867, + "learning_rate": 7.37137176701188e-06, + "loss": 0.6107, + "step": 31310 + }, + { + "epoch": 0.62624, + "grad_norm": 0.281244695186615, + "learning_rate": 7.370024631330309e-06, + "loss": 0.0341, + "step": 31312 + }, + { + "epoch": 0.62628, + "grad_norm": 0.10830851644277573, + "learning_rate": 7.368677546921461e-06, + "loss": 0.0019, + "step": 31314 + }, + { + "epoch": 0.62632, + "grad_norm": 0.18978168070316315, + "learning_rate": 7.367330513811599e-06, + "loss": 0.0289, + "step": 31316 + }, + { + "epoch": 0.62636, + "grad_norm": 0.028386183083057404, + "learning_rate": 7.365983532026986e-06, + "loss": 0.0023, + "step": 31318 + }, + { + "epoch": 0.6264, + "grad_norm": 0.22271476686000824, + "learning_rate": 7.364636601593875e-06, + "loss": 0.0038, + "step": 31320 + }, + { + "epoch": 0.62644, + "grad_norm": 0.04635992273688316, + "learning_rate": 7.363289722538534e-06, + "loss": 0.0009, + "step": 31322 + }, + { + "epoch": 0.62648, + "grad_norm": 1.4902387857437134, + "learning_rate": 7.361942894887215e-06, + "loss": 0.0265, + "step": 31324 + }, + { + "epoch": 0.62652, + "grad_norm": 0.01839093305170536, + "learning_rate": 7.360596118666179e-06, + "loss": 0.0009, + "step": 31326 + }, + { + "epoch": 0.62656, + "grad_norm": 0.03994540870189667, + "learning_rate": 7.3592493939016815e-06, + "loss": 0.001, + "step": 31328 + }, + { + "epoch": 0.6266, + "grad_norm": 0.7888814210891724, + "learning_rate": 7.357902720619976e-06, + "loss": 0.0103, + "step": 31330 + }, + { + "epoch": 0.62664, + "grad_norm": 0.09743326157331467, + "learning_rate": 7.356556098847318e-06, + "loss": 0.0077, + "step": 31332 + }, + { + "epoch": 0.62668, + "grad_norm": 0.9948057532310486, + "learning_rate": 7.355209528609955e-06, + "loss": 0.0118, + "step": 31334 + }, + { + "epoch": 0.62672, + "grad_norm": 0.982292890548706, + "learning_rate": 7.35386300993415e-06, + "loss": 0.0109, + "step": 31336 + }, + { + "epoch": 0.62676, + "grad_norm": 5.540365695953369, + "learning_rate": 7.3525165428461465e-06, + "loss": 0.0717, + "step": 31338 + }, + { + "epoch": 0.6268, + "grad_norm": 0.01789987087249756, + "learning_rate": 7.351170127372191e-06, + "loss": 0.0037, + "step": 31340 + }, + { + "epoch": 0.62684, + "grad_norm": 0.05664494261145592, + "learning_rate": 7.349823763538544e-06, + "loss": 0.005, + "step": 31342 + }, + { + "epoch": 0.62688, + "grad_norm": 0.07401535660028458, + "learning_rate": 7.348477451371445e-06, + "loss": 0.0015, + "step": 31344 + }, + { + "epoch": 0.62692, + "grad_norm": 0.0543343685567379, + "learning_rate": 7.347131190897144e-06, + "loss": 0.001, + "step": 31346 + }, + { + "epoch": 0.62696, + "grad_norm": 0.2650677263736725, + "learning_rate": 7.345784982141886e-06, + "loss": 0.0032, + "step": 31348 + }, + { + "epoch": 0.627, + "grad_norm": 0.3901927173137665, + "learning_rate": 7.344438825131912e-06, + "loss": 0.0074, + "step": 31350 + }, + { + "epoch": 0.62704, + "grad_norm": 0.06337747722864151, + "learning_rate": 7.3430927198934745e-06, + "loss": 0.3925, + "step": 31352 + }, + { + "epoch": 0.62708, + "grad_norm": 0.044139619916677475, + "learning_rate": 7.341746666452814e-06, + "loss": 0.0016, + "step": 31354 + }, + { + "epoch": 0.62712, + "grad_norm": 0.12306879460811615, + "learning_rate": 7.340400664836165e-06, + "loss": 0.0025, + "step": 31356 + }, + { + "epoch": 0.62716, + "grad_norm": 0.02353224903345108, + "learning_rate": 7.339054715069778e-06, + "loss": 0.0029, + "step": 31358 + }, + { + "epoch": 0.6272, + "grad_norm": 0.5992239117622375, + "learning_rate": 7.33770881717989e-06, + "loss": 0.0069, + "step": 31360 + }, + { + "epoch": 0.62724, + "grad_norm": 0.1084524542093277, + "learning_rate": 7.3363629711927375e-06, + "loss": 0.0042, + "step": 31362 + }, + { + "epoch": 0.62728, + "grad_norm": 1.3938908576965332, + "learning_rate": 7.335017177134562e-06, + "loss": 0.0285, + "step": 31364 + }, + { + "epoch": 0.62732, + "grad_norm": 0.10075213760137558, + "learning_rate": 7.333671435031595e-06, + "loss": 0.0027, + "step": 31366 + }, + { + "epoch": 0.62736, + "grad_norm": 0.12802214920520782, + "learning_rate": 7.33232574491008e-06, + "loss": 0.1768, + "step": 31368 + }, + { + "epoch": 0.6274, + "grad_norm": 0.06431638449430466, + "learning_rate": 7.330980106796247e-06, + "loss": 0.004, + "step": 31370 + }, + { + "epoch": 0.62744, + "grad_norm": 0.7978562116622925, + "learning_rate": 7.329634520716328e-06, + "loss": 0.0307, + "step": 31372 + }, + { + "epoch": 0.62748, + "grad_norm": 0.13564088940620422, + "learning_rate": 7.328288986696562e-06, + "loss": 0.0023, + "step": 31374 + }, + { + "epoch": 0.62752, + "grad_norm": 0.6425479054450989, + "learning_rate": 7.326943504763178e-06, + "loss": 0.0084, + "step": 31376 + }, + { + "epoch": 0.62756, + "grad_norm": 0.04244162142276764, + "learning_rate": 7.325598074942402e-06, + "loss": 0.0719, + "step": 31378 + }, + { + "epoch": 0.6276, + "grad_norm": 0.024953318759799004, + "learning_rate": 7.324252697260475e-06, + "loss": 0.0008, + "step": 31380 + }, + { + "epoch": 0.62764, + "grad_norm": 0.33566489815711975, + "learning_rate": 7.322907371743615e-06, + "loss": 0.0047, + "step": 31382 + }, + { + "epoch": 0.62768, + "grad_norm": 0.32460957765579224, + "learning_rate": 7.3215620984180555e-06, + "loss": 0.3398, + "step": 31384 + }, + { + "epoch": 0.62772, + "grad_norm": 0.721697211265564, + "learning_rate": 7.320216877310023e-06, + "loss": 0.0066, + "step": 31386 + }, + { + "epoch": 0.62776, + "grad_norm": 0.09717179089784622, + "learning_rate": 7.318871708445738e-06, + "loss": 0.0077, + "step": 31388 + }, + { + "epoch": 0.6278, + "grad_norm": 0.9212268590927124, + "learning_rate": 7.3175265918514335e-06, + "loss": 0.0084, + "step": 31390 + }, + { + "epoch": 0.62784, + "grad_norm": 0.27624595165252686, + "learning_rate": 7.3161815275533285e-06, + "loss": 0.0118, + "step": 31392 + }, + { + "epoch": 0.62788, + "grad_norm": 0.06258582323789597, + "learning_rate": 7.314836515577644e-06, + "loss": 0.0078, + "step": 31394 + }, + { + "epoch": 0.62792, + "grad_norm": 0.29024365544319153, + "learning_rate": 7.313491555950606e-06, + "loss": 0.0083, + "step": 31396 + }, + { + "epoch": 0.62796, + "grad_norm": 0.02608044445514679, + "learning_rate": 7.312146648698432e-06, + "loss": 0.0473, + "step": 31398 + }, + { + "epoch": 0.628, + "grad_norm": 0.19011174142360687, + "learning_rate": 7.310801793847344e-06, + "loss": 0.0028, + "step": 31400 + }, + { + "epoch": 0.62804, + "grad_norm": 0.2837914824485779, + "learning_rate": 7.309456991423561e-06, + "loss": 0.0446, + "step": 31402 + }, + { + "epoch": 0.62808, + "grad_norm": 0.059830229729413986, + "learning_rate": 7.308112241453295e-06, + "loss": 0.0058, + "step": 31404 + }, + { + "epoch": 0.62812, + "grad_norm": 0.00048138032434508204, + "learning_rate": 7.30676754396277e-06, + "loss": 0.0003, + "step": 31406 + }, + { + "epoch": 0.62816, + "grad_norm": 0.023831596598029137, + "learning_rate": 7.305422898978198e-06, + "loss": 0.0002, + "step": 31408 + }, + { + "epoch": 0.6282, + "grad_norm": 0.7332186698913574, + "learning_rate": 7.3040783065257906e-06, + "loss": 0.0078, + "step": 31410 + }, + { + "epoch": 0.62824, + "grad_norm": 1.0661134719848633, + "learning_rate": 7.302733766631769e-06, + "loss": 0.0173, + "step": 31412 + }, + { + "epoch": 0.62828, + "grad_norm": 0.2674930989742279, + "learning_rate": 7.301389279322338e-06, + "loss": 0.0047, + "step": 31414 + }, + { + "epoch": 0.62832, + "grad_norm": 0.11742710322141647, + "learning_rate": 7.300044844623715e-06, + "loss": 0.0038, + "step": 31416 + }, + { + "epoch": 0.62836, + "grad_norm": 0.029872102662920952, + "learning_rate": 7.298700462562108e-06, + "loss": 0.0012, + "step": 31418 + }, + { + "epoch": 0.6284, + "grad_norm": 0.2297857701778412, + "learning_rate": 7.297356133163722e-06, + "loss": 0.0038, + "step": 31420 + }, + { + "epoch": 0.62844, + "grad_norm": 0.2964599132537842, + "learning_rate": 7.2960118564547744e-06, + "loss": 0.0198, + "step": 31422 + }, + { + "epoch": 0.62848, + "grad_norm": 9.36132526397705, + "learning_rate": 7.294667632461463e-06, + "loss": 0.267, + "step": 31424 + }, + { + "epoch": 0.62852, + "grad_norm": 6.15860652923584, + "learning_rate": 7.293323461210003e-06, + "loss": 0.0508, + "step": 31426 + }, + { + "epoch": 0.62856, + "grad_norm": 0.023788096383213997, + "learning_rate": 7.291979342726594e-06, + "loss": 0.0017, + "step": 31428 + }, + { + "epoch": 0.6286, + "grad_norm": 0.001219308702275157, + "learning_rate": 7.290635277037442e-06, + "loss": 0.0003, + "step": 31430 + }, + { + "epoch": 0.62864, + "grad_norm": 0.5749883055686951, + "learning_rate": 7.289291264168752e-06, + "loss": 0.0138, + "step": 31432 + }, + { + "epoch": 0.62868, + "grad_norm": 0.07546419650316238, + "learning_rate": 7.287947304146721e-06, + "loss": 0.0027, + "step": 31434 + }, + { + "epoch": 0.62872, + "grad_norm": 0.0865025520324707, + "learning_rate": 7.286603396997555e-06, + "loss": 0.0012, + "step": 31436 + }, + { + "epoch": 0.62876, + "grad_norm": 8.549095153808594, + "learning_rate": 7.2852595427474536e-06, + "loss": 0.1383, + "step": 31438 + }, + { + "epoch": 0.6288, + "grad_norm": 1.5017470121383667, + "learning_rate": 7.283915741422611e-06, + "loss": 0.0382, + "step": 31440 + }, + { + "epoch": 0.62884, + "grad_norm": 0.22447556257247925, + "learning_rate": 7.2825719930492345e-06, + "loss": 0.2687, + "step": 31442 + }, + { + "epoch": 0.62888, + "grad_norm": 0.03375914692878723, + "learning_rate": 7.281228297653515e-06, + "loss": 0.0007, + "step": 31444 + }, + { + "epoch": 0.62892, + "grad_norm": 0.14855024218559265, + "learning_rate": 7.279884655261646e-06, + "loss": 0.0026, + "step": 31446 + }, + { + "epoch": 0.62896, + "grad_norm": 9.608999252319336, + "learning_rate": 7.278541065899831e-06, + "loss": 0.1381, + "step": 31448 + }, + { + "epoch": 0.629, + "grad_norm": 0.044963471591472626, + "learning_rate": 7.277197529594257e-06, + "loss": 0.0029, + "step": 31450 + }, + { + "epoch": 0.62904, + "grad_norm": 0.16623114049434662, + "learning_rate": 7.27585404637112e-06, + "loss": 0.003, + "step": 31452 + }, + { + "epoch": 0.62908, + "grad_norm": 0.18663403391838074, + "learning_rate": 7.27451061625661e-06, + "loss": 0.0035, + "step": 31454 + }, + { + "epoch": 0.62912, + "grad_norm": 0.010136986151337624, + "learning_rate": 7.273167239276916e-06, + "loss": 0.0006, + "step": 31456 + }, + { + "epoch": 0.62916, + "grad_norm": 8.165782928466797, + "learning_rate": 7.271823915458234e-06, + "loss": 0.0971, + "step": 31458 + }, + { + "epoch": 0.6292, + "grad_norm": 14.252657890319824, + "learning_rate": 7.27048064482675e-06, + "loss": 0.2271, + "step": 31460 + }, + { + "epoch": 0.62924, + "grad_norm": 0.10927187651395798, + "learning_rate": 7.269137427408646e-06, + "loss": 0.0027, + "step": 31462 + }, + { + "epoch": 0.62928, + "grad_norm": 0.019518403336405754, + "learning_rate": 7.267794263230119e-06, + "loss": 0.0007, + "step": 31464 + }, + { + "epoch": 0.62932, + "grad_norm": 0.1624830663204193, + "learning_rate": 7.266451152317349e-06, + "loss": 0.0069, + "step": 31466 + }, + { + "epoch": 0.62936, + "grad_norm": 0.34467536211013794, + "learning_rate": 7.265108094696518e-06, + "loss": 0.0045, + "step": 31468 + }, + { + "epoch": 0.6294, + "grad_norm": 0.04180987924337387, + "learning_rate": 7.263765090393817e-06, + "loss": 0.0005, + "step": 31470 + }, + { + "epoch": 0.62944, + "grad_norm": 0.05953501537442207, + "learning_rate": 7.262422139435419e-06, + "loss": 0.0025, + "step": 31472 + }, + { + "epoch": 0.62948, + "grad_norm": 0.29396793246269226, + "learning_rate": 7.261079241847514e-06, + "loss": 0.0336, + "step": 31474 + }, + { + "epoch": 0.62952, + "grad_norm": 0.0676058903336525, + "learning_rate": 7.259736397656281e-06, + "loss": 0.0015, + "step": 31476 + }, + { + "epoch": 0.62956, + "grad_norm": 0.2686992883682251, + "learning_rate": 7.258393606887893e-06, + "loss": 0.0047, + "step": 31478 + }, + { + "epoch": 0.6296, + "grad_norm": 14.161534309387207, + "learning_rate": 7.257050869568536e-06, + "loss": 0.1788, + "step": 31480 + }, + { + "epoch": 0.62964, + "grad_norm": 14.537640571594238, + "learning_rate": 7.255708185724385e-06, + "loss": 0.2528, + "step": 31482 + }, + { + "epoch": 0.62968, + "grad_norm": 0.498505562543869, + "learning_rate": 7.254365555381614e-06, + "loss": 0.0054, + "step": 31484 + }, + { + "epoch": 0.62972, + "grad_norm": 0.07379510998725891, + "learning_rate": 7.253022978566402e-06, + "loss": 0.127, + "step": 31486 + }, + { + "epoch": 0.62976, + "grad_norm": 4.324132442474365, + "learning_rate": 7.251680455304917e-06, + "loss": 0.0721, + "step": 31488 + }, + { + "epoch": 0.6298, + "grad_norm": 2.772010564804077, + "learning_rate": 7.250337985623342e-06, + "loss": 0.0306, + "step": 31490 + }, + { + "epoch": 0.62984, + "grad_norm": 0.22392971813678741, + "learning_rate": 7.248995569547842e-06, + "loss": 0.0081, + "step": 31492 + }, + { + "epoch": 0.62988, + "grad_norm": 0.14174015820026398, + "learning_rate": 7.247653207104586e-06, + "loss": 0.0232, + "step": 31494 + }, + { + "epoch": 0.62992, + "grad_norm": 0.02853500284254551, + "learning_rate": 7.246310898319753e-06, + "loss": 0.0271, + "step": 31496 + }, + { + "epoch": 0.62996, + "grad_norm": 0.2754034101963043, + "learning_rate": 7.244968643219505e-06, + "loss": 0.004, + "step": 31498 + }, + { + "epoch": 0.63, + "grad_norm": 0.20067831873893738, + "learning_rate": 7.243626441830009e-06, + "loss": 0.0031, + "step": 31500 + }, + { + "epoch": 0.63004, + "grad_norm": 0.05918801203370094, + "learning_rate": 7.242284294177437e-06, + "loss": 0.0008, + "step": 31502 + }, + { + "epoch": 0.63008, + "grad_norm": 13.84370231628418, + "learning_rate": 7.240942200287951e-06, + "loss": 0.567, + "step": 31504 + }, + { + "epoch": 0.63012, + "grad_norm": 13.651983261108398, + "learning_rate": 7.23960016018772e-06, + "loss": 0.4112, + "step": 31506 + }, + { + "epoch": 0.63016, + "grad_norm": 4.243030071258545, + "learning_rate": 7.2382581739029045e-06, + "loss": 0.0648, + "step": 31508 + }, + { + "epoch": 0.6302, + "grad_norm": 0.6994690895080566, + "learning_rate": 7.236916241459664e-06, + "loss": 0.0115, + "step": 31510 + }, + { + "epoch": 0.63024, + "grad_norm": 0.015027419663965702, + "learning_rate": 7.235574362884168e-06, + "loss": 0.0066, + "step": 31512 + }, + { + "epoch": 0.63028, + "grad_norm": 0.6325730681419373, + "learning_rate": 7.234232538202572e-06, + "loss": 0.0059, + "step": 31514 + }, + { + "epoch": 0.63032, + "grad_norm": 0.021676095202565193, + "learning_rate": 7.232890767441034e-06, + "loss": 0.0019, + "step": 31516 + }, + { + "epoch": 0.63036, + "grad_norm": 2.4186441898345947, + "learning_rate": 7.231549050625718e-06, + "loss": 0.0345, + "step": 31518 + }, + { + "epoch": 0.6304, + "grad_norm": 0.8618155717849731, + "learning_rate": 7.2302073877827775e-06, + "loss": 0.0137, + "step": 31520 + }, + { + "epoch": 0.63044, + "grad_norm": 0.07507678866386414, + "learning_rate": 7.228865778938371e-06, + "loss": 0.0072, + "step": 31522 + }, + { + "epoch": 0.63048, + "grad_norm": 2.203366279602051, + "learning_rate": 7.227524224118654e-06, + "loss": 0.0342, + "step": 31524 + }, + { + "epoch": 0.63052, + "grad_norm": 0.08635860681533813, + "learning_rate": 7.226182723349774e-06, + "loss": 0.0011, + "step": 31526 + }, + { + "epoch": 0.63056, + "grad_norm": 0.0006599762127734721, + "learning_rate": 7.224841276657895e-06, + "loss": 0.0082, + "step": 31528 + }, + { + "epoch": 0.6306, + "grad_norm": 0.5596723556518555, + "learning_rate": 7.22349988406916e-06, + "loss": 0.0084, + "step": 31530 + }, + { + "epoch": 0.63064, + "grad_norm": 0.161610946059227, + "learning_rate": 7.222158545609727e-06, + "loss": 0.0024, + "step": 31532 + }, + { + "epoch": 0.63068, + "grad_norm": 0.15091070532798767, + "learning_rate": 7.220817261305746e-06, + "loss": 0.0026, + "step": 31534 + }, + { + "epoch": 0.63072, + "grad_norm": 0.14333677291870117, + "learning_rate": 7.219476031183359e-06, + "loss": 0.0022, + "step": 31536 + }, + { + "epoch": 0.63076, + "grad_norm": 1.4679220914840698, + "learning_rate": 7.2181348552687215e-06, + "loss": 0.0249, + "step": 31538 + }, + { + "epoch": 0.6308, + "grad_norm": 0.528102457523346, + "learning_rate": 7.216793733587976e-06, + "loss": 0.0041, + "step": 31540 + }, + { + "epoch": 0.63084, + "grad_norm": 0.5886843800544739, + "learning_rate": 7.215452666167266e-06, + "loss": 0.0099, + "step": 31542 + }, + { + "epoch": 0.63088, + "grad_norm": 18.728178024291992, + "learning_rate": 7.214111653032744e-06, + "loss": 0.4507, + "step": 31544 + }, + { + "epoch": 0.63092, + "grad_norm": 13.221710205078125, + "learning_rate": 7.212770694210545e-06, + "loss": 0.1799, + "step": 31546 + }, + { + "epoch": 0.63096, + "grad_norm": 0.22989244759082794, + "learning_rate": 7.211429789726822e-06, + "loss": 0.0036, + "step": 31548 + }, + { + "epoch": 0.631, + "grad_norm": 0.10839955508708954, + "learning_rate": 7.210088939607709e-06, + "loss": 0.3743, + "step": 31550 + }, + { + "epoch": 0.63104, + "grad_norm": 0.19717639684677124, + "learning_rate": 7.208748143879347e-06, + "loss": 0.0029, + "step": 31552 + }, + { + "epoch": 0.63108, + "grad_norm": 0.020993143320083618, + "learning_rate": 7.20740740256788e-06, + "loss": 0.0019, + "step": 31554 + }, + { + "epoch": 0.63112, + "grad_norm": 5.796292304992676, + "learning_rate": 7.2060667156994425e-06, + "loss": 0.1059, + "step": 31556 + }, + { + "epoch": 0.63116, + "grad_norm": 0.444795161485672, + "learning_rate": 7.204726083300169e-06, + "loss": 0.0307, + "step": 31558 + }, + { + "epoch": 0.6312, + "grad_norm": 0.1254923939704895, + "learning_rate": 7.203385505396203e-06, + "loss": 0.0013, + "step": 31560 + }, + { + "epoch": 0.63124, + "grad_norm": 0.14138726890087128, + "learning_rate": 7.2020449820136725e-06, + "loss": 0.0033, + "step": 31562 + }, + { + "epoch": 0.63128, + "grad_norm": 0.33773505687713623, + "learning_rate": 7.20070451317872e-06, + "loss": 0.0066, + "step": 31564 + }, + { + "epoch": 0.63132, + "grad_norm": 0.1663036197423935, + "learning_rate": 7.199364098917474e-06, + "loss": 0.0975, + "step": 31566 + }, + { + "epoch": 0.63136, + "grad_norm": 3.070606231689453, + "learning_rate": 7.198023739256066e-06, + "loss": 0.0486, + "step": 31568 + }, + { + "epoch": 0.6314, + "grad_norm": 0.08172451704740524, + "learning_rate": 7.196683434220626e-06, + "loss": 0.0092, + "step": 31570 + }, + { + "epoch": 0.63144, + "grad_norm": 0.08254224807024002, + "learning_rate": 7.195343183837289e-06, + "loss": 0.2845, + "step": 31572 + }, + { + "epoch": 0.63148, + "grad_norm": 0.4787577688694, + "learning_rate": 7.194002988132178e-06, + "loss": 0.039, + "step": 31574 + }, + { + "epoch": 0.63152, + "grad_norm": 0.06832042336463928, + "learning_rate": 7.192662847131424e-06, + "loss": 0.0015, + "step": 31576 + }, + { + "epoch": 0.63156, + "grad_norm": 0.0007337635033763945, + "learning_rate": 7.191322760861151e-06, + "loss": 0.0004, + "step": 31578 + }, + { + "epoch": 0.6316, + "grad_norm": 0.24711726605892181, + "learning_rate": 7.189982729347491e-06, + "loss": 0.0035, + "step": 31580 + }, + { + "epoch": 0.63164, + "grad_norm": 0.14460009336471558, + "learning_rate": 7.188642752616564e-06, + "loss": 0.5495, + "step": 31582 + }, + { + "epoch": 0.63168, + "grad_norm": 0.09517991542816162, + "learning_rate": 7.187302830694488e-06, + "loss": 0.0033, + "step": 31584 + }, + { + "epoch": 0.63172, + "grad_norm": 0.5508714318275452, + "learning_rate": 7.185962963607398e-06, + "loss": 0.0077, + "step": 31586 + }, + { + "epoch": 0.63176, + "grad_norm": 0.01606142334640026, + "learning_rate": 7.184623151381407e-06, + "loss": 0.015, + "step": 31588 + }, + { + "epoch": 0.6318, + "grad_norm": 0.05276543274521828, + "learning_rate": 7.1832833940426346e-06, + "loss": 0.2045, + "step": 31590 + }, + { + "epoch": 0.63184, + "grad_norm": 2.3659775257110596, + "learning_rate": 7.181943691617207e-06, + "loss": 0.045, + "step": 31592 + }, + { + "epoch": 0.63188, + "grad_norm": 0.06665129959583282, + "learning_rate": 7.180604044131232e-06, + "loss": 0.0028, + "step": 31594 + }, + { + "epoch": 0.63192, + "grad_norm": 0.2516744136810303, + "learning_rate": 7.179264451610838e-06, + "loss": 0.0041, + "step": 31596 + }, + { + "epoch": 0.63196, + "grad_norm": 12.375970840454102, + "learning_rate": 7.177924914082135e-06, + "loss": 0.2048, + "step": 31598 + }, + { + "epoch": 0.632, + "grad_norm": 0.13394330441951752, + "learning_rate": 7.176585431571235e-06, + "loss": 0.0025, + "step": 31600 + }, + { + "epoch": 0.63204, + "grad_norm": 0.09106019884347916, + "learning_rate": 7.1752460041042595e-06, + "loss": 0.0061, + "step": 31602 + }, + { + "epoch": 0.63208, + "grad_norm": 0.18101708590984344, + "learning_rate": 7.173906631707317e-06, + "loss": 0.0045, + "step": 31604 + }, + { + "epoch": 0.63212, + "grad_norm": 0.14096421003341675, + "learning_rate": 7.172567314406518e-06, + "loss": 0.0019, + "step": 31606 + }, + { + "epoch": 0.63216, + "grad_norm": 8.939845085144043, + "learning_rate": 7.171228052227978e-06, + "loss": 0.1129, + "step": 31608 + }, + { + "epoch": 0.6322, + "grad_norm": 0.19640326499938965, + "learning_rate": 7.169888845197798e-06, + "loss": 0.0026, + "step": 31610 + }, + { + "epoch": 0.63224, + "grad_norm": 0.16590481996536255, + "learning_rate": 7.168549693342096e-06, + "loss": 0.0024, + "step": 31612 + }, + { + "epoch": 0.63228, + "grad_norm": 0.06191745772957802, + "learning_rate": 7.1672105966869756e-06, + "loss": 0.0072, + "step": 31614 + }, + { + "epoch": 0.63232, + "grad_norm": 0.24749061465263367, + "learning_rate": 7.165871555258539e-06, + "loss": 0.0037, + "step": 31616 + }, + { + "epoch": 0.63236, + "grad_norm": 0.09246961027383804, + "learning_rate": 7.1645325690829004e-06, + "loss": 0.0045, + "step": 31618 + }, + { + "epoch": 0.6324, + "grad_norm": 0.0705619603395462, + "learning_rate": 7.163193638186159e-06, + "loss": 0.043, + "step": 31620 + }, + { + "epoch": 0.63244, + "grad_norm": 0.00903823971748352, + "learning_rate": 7.1618547625944155e-06, + "loss": 0.001, + "step": 31622 + }, + { + "epoch": 0.63248, + "grad_norm": 0.013909485191106796, + "learning_rate": 7.160515942333777e-06, + "loss": 0.1159, + "step": 31624 + }, + { + "epoch": 0.63252, + "grad_norm": 10.485234260559082, + "learning_rate": 7.159177177430337e-06, + "loss": 0.1762, + "step": 31626 + }, + { + "epoch": 0.63256, + "grad_norm": 1.1887738704681396, + "learning_rate": 7.157838467910206e-06, + "loss": 0.0155, + "step": 31628 + }, + { + "epoch": 0.6326, + "grad_norm": 0.310630202293396, + "learning_rate": 7.156499813799477e-06, + "loss": 0.0048, + "step": 31630 + }, + { + "epoch": 0.63264, + "grad_norm": 2.947725772857666, + "learning_rate": 7.155161215124243e-06, + "loss": 0.0263, + "step": 31632 + }, + { + "epoch": 0.63268, + "grad_norm": 0.02319677174091339, + "learning_rate": 7.153822671910611e-06, + "loss": 0.0006, + "step": 31634 + }, + { + "epoch": 0.63272, + "grad_norm": 0.21494421362876892, + "learning_rate": 7.152484184184668e-06, + "loss": 0.0034, + "step": 31636 + }, + { + "epoch": 0.63276, + "grad_norm": 0.010097084566950798, + "learning_rate": 7.151145751972515e-06, + "loss": 0.0011, + "step": 31638 + }, + { + "epoch": 0.6328, + "grad_norm": 0.3476366698741913, + "learning_rate": 7.149807375300239e-06, + "loss": 0.0051, + "step": 31640 + }, + { + "epoch": 0.63284, + "grad_norm": 0.15482890605926514, + "learning_rate": 7.1484690541939375e-06, + "loss": 0.067, + "step": 31642 + }, + { + "epoch": 0.63288, + "grad_norm": 0.04776802286505699, + "learning_rate": 7.147130788679702e-06, + "loss": 0.0058, + "step": 31644 + }, + { + "epoch": 0.63292, + "grad_norm": 0.13916659355163574, + "learning_rate": 7.145792578783618e-06, + "loss": 0.0886, + "step": 31646 + }, + { + "epoch": 0.63296, + "grad_norm": 1.1095513105392456, + "learning_rate": 7.144454424531775e-06, + "loss": 0.0208, + "step": 31648 + }, + { + "epoch": 0.633, + "grad_norm": 0.6677342057228088, + "learning_rate": 7.143116325950266e-06, + "loss": 0.0091, + "step": 31650 + }, + { + "epoch": 0.63304, + "grad_norm": 0.06980463117361069, + "learning_rate": 7.141778283065172e-06, + "loss": 0.2352, + "step": 31652 + }, + { + "epoch": 0.63308, + "grad_norm": 0.015999237075448036, + "learning_rate": 7.140440295902584e-06, + "loss": 0.0076, + "step": 31654 + }, + { + "epoch": 0.63312, + "grad_norm": 0.25827306509017944, + "learning_rate": 7.139102364488587e-06, + "loss": 0.0052, + "step": 31656 + }, + { + "epoch": 0.63316, + "grad_norm": 0.7692295908927917, + "learning_rate": 7.137764488849259e-06, + "loss": 0.0111, + "step": 31658 + }, + { + "epoch": 0.6332, + "grad_norm": 0.04545054957270622, + "learning_rate": 7.13642666901069e-06, + "loss": 0.0307, + "step": 31660 + }, + { + "epoch": 0.63324, + "grad_norm": 1.7581311464309692, + "learning_rate": 7.135088904998954e-06, + "loss": 0.0278, + "step": 31662 + }, + { + "epoch": 0.63328, + "grad_norm": 2.272301435470581, + "learning_rate": 7.133751196840133e-06, + "loss": 0.0285, + "step": 31664 + }, + { + "epoch": 0.63332, + "grad_norm": 0.833665668964386, + "learning_rate": 7.132413544560311e-06, + "loss": 0.0469, + "step": 31666 + }, + { + "epoch": 0.63336, + "grad_norm": 0.0009694076143205166, + "learning_rate": 7.131075948185559e-06, + "loss": 0.0025, + "step": 31668 + }, + { + "epoch": 0.6334, + "grad_norm": 1.4096145629882812, + "learning_rate": 7.129738407741964e-06, + "loss": 0.0164, + "step": 31670 + }, + { + "epoch": 0.63344, + "grad_norm": 0.021393265575170517, + "learning_rate": 7.128400923255596e-06, + "loss": 0.006, + "step": 31672 + }, + { + "epoch": 0.63348, + "grad_norm": 0.4287591278553009, + "learning_rate": 7.1270634947525285e-06, + "loss": 0.0066, + "step": 31674 + }, + { + "epoch": 0.63352, + "grad_norm": 0.06315450370311737, + "learning_rate": 7.125726122258838e-06, + "loss": 0.004, + "step": 31676 + }, + { + "epoch": 0.63356, + "grad_norm": 0.34583157300949097, + "learning_rate": 7.124388805800598e-06, + "loss": 0.0095, + "step": 31678 + }, + { + "epoch": 0.6336, + "grad_norm": 0.07624457031488419, + "learning_rate": 7.123051545403874e-06, + "loss": 0.0042, + "step": 31680 + }, + { + "epoch": 0.63364, + "grad_norm": 0.00619817478582263, + "learning_rate": 7.121714341094745e-06, + "loss": 0.0045, + "step": 31682 + }, + { + "epoch": 0.63368, + "grad_norm": 9.17541217803955, + "learning_rate": 7.120377192899273e-06, + "loss": 0.3073, + "step": 31684 + }, + { + "epoch": 0.63372, + "grad_norm": 0.6268200874328613, + "learning_rate": 7.119040100843534e-06, + "loss": 0.7174, + "step": 31686 + }, + { + "epoch": 0.63376, + "grad_norm": 0.1360889971256256, + "learning_rate": 7.117703064953592e-06, + "loss": 0.0152, + "step": 31688 + }, + { + "epoch": 0.6338, + "grad_norm": 0.4228789508342743, + "learning_rate": 7.116366085255511e-06, + "loss": 0.0061, + "step": 31690 + }, + { + "epoch": 0.63384, + "grad_norm": 0.3627934753894806, + "learning_rate": 7.115029161775358e-06, + "loss": 0.0048, + "step": 31692 + }, + { + "epoch": 0.63388, + "grad_norm": 0.18268531560897827, + "learning_rate": 7.113692294539196e-06, + "loss": 0.6103, + "step": 31694 + }, + { + "epoch": 0.63392, + "grad_norm": 10.210945129394531, + "learning_rate": 7.1123554835730875e-06, + "loss": 0.1499, + "step": 31696 + }, + { + "epoch": 0.63396, + "grad_norm": 0.15039928257465363, + "learning_rate": 7.111018728903099e-06, + "loss": 0.004, + "step": 31698 + }, + { + "epoch": 0.634, + "grad_norm": 0.2797171175479889, + "learning_rate": 7.109682030555283e-06, + "loss": 0.009, + "step": 31700 + }, + { + "epoch": 0.63404, + "grad_norm": 0.15972192585468292, + "learning_rate": 7.108345388555709e-06, + "loss": 0.0024, + "step": 31702 + }, + { + "epoch": 0.63408, + "grad_norm": 10.254597663879395, + "learning_rate": 7.107008802930429e-06, + "loss": 0.2515, + "step": 31704 + }, + { + "epoch": 0.63412, + "grad_norm": 0.09472730755805969, + "learning_rate": 7.105672273705501e-06, + "loss": 0.0092, + "step": 31706 + }, + { + "epoch": 0.63416, + "grad_norm": 0.30878251791000366, + "learning_rate": 7.104335800906981e-06, + "loss": 0.0043, + "step": 31708 + }, + { + "epoch": 0.6342, + "grad_norm": 4.758611679077148, + "learning_rate": 7.102999384560927e-06, + "loss": 0.0909, + "step": 31710 + }, + { + "epoch": 0.63424, + "grad_norm": 0.26272955536842346, + "learning_rate": 7.10166302469339e-06, + "loss": 0.003, + "step": 31712 + }, + { + "epoch": 0.63428, + "grad_norm": 0.20598158240318298, + "learning_rate": 7.100326721330429e-06, + "loss": 0.0049, + "step": 31714 + }, + { + "epoch": 0.63432, + "grad_norm": 0.01665376126766205, + "learning_rate": 7.098990474498084e-06, + "loss": 0.0719, + "step": 31716 + }, + { + "epoch": 0.63436, + "grad_norm": 0.090780109167099, + "learning_rate": 7.097654284222419e-06, + "loss": 0.0022, + "step": 31718 + }, + { + "epoch": 0.6344, + "grad_norm": 0.2634499967098236, + "learning_rate": 7.096318150529476e-06, + "loss": 0.0295, + "step": 31720 + }, + { + "epoch": 0.63444, + "grad_norm": 0.6527043581008911, + "learning_rate": 7.094982073445303e-06, + "loss": 0.0175, + "step": 31722 + }, + { + "epoch": 0.63448, + "grad_norm": 0.25526633858680725, + "learning_rate": 7.093646052995955e-06, + "loss": 0.0913, + "step": 31724 + }, + { + "epoch": 0.63452, + "grad_norm": 0.271874338388443, + "learning_rate": 7.092310089207473e-06, + "loss": 0.0141, + "step": 31726 + }, + { + "epoch": 0.63456, + "grad_norm": 0.028099386021494865, + "learning_rate": 7.090974182105899e-06, + "loss": 0.0003, + "step": 31728 + }, + { + "epoch": 0.6346, + "grad_norm": 0.029135851189494133, + "learning_rate": 7.0896383317172845e-06, + "loss": 0.0044, + "step": 31730 + }, + { + "epoch": 0.63464, + "grad_norm": 0.12901762127876282, + "learning_rate": 7.088302538067664e-06, + "loss": 0.0022, + "step": 31732 + }, + { + "epoch": 0.63468, + "grad_norm": 0.023254986852407455, + "learning_rate": 7.086966801183091e-06, + "loss": 0.0024, + "step": 31734 + }, + { + "epoch": 0.63472, + "grad_norm": 0.06598751991987228, + "learning_rate": 7.0856311210895976e-06, + "loss": 0.0033, + "step": 31736 + }, + { + "epoch": 0.63476, + "grad_norm": 0.2835110127925873, + "learning_rate": 7.084295497813223e-06, + "loss": 0.0189, + "step": 31738 + }, + { + "epoch": 0.6348, + "grad_norm": 0.11398360878229141, + "learning_rate": 7.082959931380011e-06, + "loss": 0.0595, + "step": 31740 + }, + { + "epoch": 0.63484, + "grad_norm": 0.4562951326370239, + "learning_rate": 7.081624421815998e-06, + "loss": 0.012, + "step": 31742 + }, + { + "epoch": 0.63488, + "grad_norm": 0.06830459088087082, + "learning_rate": 7.0802889691472185e-06, + "loss": 0.0046, + "step": 31744 + }, + { + "epoch": 0.63492, + "grad_norm": 0.07053519785404205, + "learning_rate": 7.078953573399711e-06, + "loss": 0.0009, + "step": 31746 + }, + { + "epoch": 0.63496, + "grad_norm": 0.07330576330423355, + "learning_rate": 7.077618234599502e-06, + "loss": 0.0018, + "step": 31748 + }, + { + "epoch": 0.635, + "grad_norm": 8.139209747314453, + "learning_rate": 7.076282952772634e-06, + "loss": 0.1532, + "step": 31750 + }, + { + "epoch": 0.63504, + "grad_norm": 0.05888473987579346, + "learning_rate": 7.074947727945136e-06, + "loss": 0.0035, + "step": 31752 + }, + { + "epoch": 0.63508, + "grad_norm": 0.24966531991958618, + "learning_rate": 7.073612560143033e-06, + "loss": 0.0047, + "step": 31754 + }, + { + "epoch": 0.63512, + "grad_norm": 0.09992976486682892, + "learning_rate": 7.072277449392364e-06, + "loss": 0.0013, + "step": 31756 + }, + { + "epoch": 0.63516, + "grad_norm": 0.34277215600013733, + "learning_rate": 7.070942395719151e-06, + "loss": 0.0074, + "step": 31758 + }, + { + "epoch": 0.6352, + "grad_norm": 0.04776770621538162, + "learning_rate": 7.069607399149427e-06, + "loss": 0.0155, + "step": 31760 + }, + { + "epoch": 0.63524, + "grad_norm": 1.75226891040802, + "learning_rate": 7.0682724597092155e-06, + "loss": 0.0207, + "step": 31762 + }, + { + "epoch": 0.63528, + "grad_norm": 0.7807689905166626, + "learning_rate": 7.0669375774245375e-06, + "loss": 0.0163, + "step": 31764 + }, + { + "epoch": 0.63532, + "grad_norm": 0.08848290890455246, + "learning_rate": 7.065602752321426e-06, + "loss": 0.0076, + "step": 31766 + }, + { + "epoch": 0.63536, + "grad_norm": 0.20701779425144196, + "learning_rate": 7.0642679844258985e-06, + "loss": 0.0042, + "step": 31768 + }, + { + "epoch": 0.6354, + "grad_norm": 0.9179302453994751, + "learning_rate": 7.062933273763974e-06, + "loss": 0.0112, + "step": 31770 + }, + { + "epoch": 0.63544, + "grad_norm": 0.10466334968805313, + "learning_rate": 7.061598620361683e-06, + "loss": 0.0022, + "step": 31772 + }, + { + "epoch": 0.63548, + "grad_norm": 0.48697108030319214, + "learning_rate": 7.060264024245036e-06, + "loss": 0.0109, + "step": 31774 + }, + { + "epoch": 0.63552, + "grad_norm": 0.038754142820835114, + "learning_rate": 7.058929485440056e-06, + "loss": 0.0029, + "step": 31776 + }, + { + "epoch": 0.63556, + "grad_norm": 0.02474352903664112, + "learning_rate": 7.057595003972761e-06, + "loss": 0.0009, + "step": 31778 + }, + { + "epoch": 0.6356, + "grad_norm": 0.022978950291872025, + "learning_rate": 7.056260579869165e-06, + "loss": 0.0053, + "step": 31780 + }, + { + "epoch": 0.63564, + "grad_norm": 1.9661635160446167, + "learning_rate": 7.054926213155287e-06, + "loss": 0.0202, + "step": 31782 + }, + { + "epoch": 0.63568, + "grad_norm": 0.08693261444568634, + "learning_rate": 7.053591903857138e-06, + "loss": 0.0592, + "step": 31784 + }, + { + "epoch": 0.63572, + "grad_norm": 2.303571939468384, + "learning_rate": 7.052257652000727e-06, + "loss": 0.1865, + "step": 31786 + }, + { + "epoch": 0.63576, + "grad_norm": 0.3061215281486511, + "learning_rate": 7.050923457612076e-06, + "loss": 0.0139, + "step": 31788 + }, + { + "epoch": 0.6358, + "grad_norm": 0.3483854830265045, + "learning_rate": 7.049589320717186e-06, + "loss": 0.0043, + "step": 31790 + }, + { + "epoch": 0.63584, + "grad_norm": 0.14974091947078705, + "learning_rate": 7.048255241342075e-06, + "loss": 0.0036, + "step": 31792 + }, + { + "epoch": 0.63588, + "grad_norm": 0.08869310468435287, + "learning_rate": 7.046921219512748e-06, + "loss": 0.0028, + "step": 31794 + }, + { + "epoch": 0.63592, + "grad_norm": 0.020766526460647583, + "learning_rate": 7.045587255255211e-06, + "loss": 0.0009, + "step": 31796 + }, + { + "epoch": 0.63596, + "grad_norm": 2.736603260040283, + "learning_rate": 7.044253348595472e-06, + "loss": 0.0328, + "step": 31798 + }, + { + "epoch": 0.636, + "grad_norm": 13.700223922729492, + "learning_rate": 7.042919499559538e-06, + "loss": 0.6294, + "step": 31800 + }, + { + "epoch": 0.63604, + "grad_norm": 0.4872925281524658, + "learning_rate": 7.041585708173404e-06, + "loss": 0.031, + "step": 31802 + }, + { + "epoch": 0.63608, + "grad_norm": 0.09438377618789673, + "learning_rate": 7.040251974463084e-06, + "loss": 0.0016, + "step": 31804 + }, + { + "epoch": 0.63612, + "grad_norm": 0.16316789388656616, + "learning_rate": 7.038918298454573e-06, + "loss": 0.002, + "step": 31806 + }, + { + "epoch": 0.63616, + "grad_norm": 15.184229850769043, + "learning_rate": 7.037584680173877e-06, + "loss": 0.2677, + "step": 31808 + }, + { + "epoch": 0.6362, + "grad_norm": 10.894099235534668, + "learning_rate": 7.036251119646993e-06, + "loss": 0.229, + "step": 31810 + }, + { + "epoch": 0.63624, + "grad_norm": 0.10558629781007767, + "learning_rate": 7.034917616899919e-06, + "loss": 0.0095, + "step": 31812 + }, + { + "epoch": 0.63628, + "grad_norm": 1.1145880222320557, + "learning_rate": 7.0335841719586526e-06, + "loss": 0.0304, + "step": 31814 + }, + { + "epoch": 0.63632, + "grad_norm": 0.15801697969436646, + "learning_rate": 7.032250784849191e-06, + "loss": 0.2215, + "step": 31816 + }, + { + "epoch": 0.63636, + "grad_norm": 0.4043772518634796, + "learning_rate": 7.0309174555975235e-06, + "loss": 0.0081, + "step": 31818 + }, + { + "epoch": 0.6364, + "grad_norm": 4.844185829162598, + "learning_rate": 7.029584184229653e-06, + "loss": 0.0528, + "step": 31820 + }, + { + "epoch": 0.63644, + "grad_norm": 1.9937554597854614, + "learning_rate": 7.028250970771564e-06, + "loss": 0.0199, + "step": 31822 + }, + { + "epoch": 0.63648, + "grad_norm": 0.06903332471847534, + "learning_rate": 7.026917815249257e-06, + "loss": 0.0016, + "step": 31824 + }, + { + "epoch": 0.63652, + "grad_norm": 0.036854639649391174, + "learning_rate": 7.025584717688718e-06, + "loss": 0.0014, + "step": 31826 + }, + { + "epoch": 0.63656, + "grad_norm": 0.0385788157582283, + "learning_rate": 7.0242516781159335e-06, + "loss": 0.0218, + "step": 31828 + }, + { + "epoch": 0.6366, + "grad_norm": 0.34978073835372925, + "learning_rate": 7.022918696556896e-06, + "loss": 0.0059, + "step": 31830 + }, + { + "epoch": 0.63664, + "grad_norm": 0.23750346899032593, + "learning_rate": 7.02158577303759e-06, + "loss": 0.0028, + "step": 31832 + }, + { + "epoch": 0.63668, + "grad_norm": 1.1562350988388062, + "learning_rate": 7.020252907584002e-06, + "loss": 0.0233, + "step": 31834 + }, + { + "epoch": 0.63672, + "grad_norm": 0.05324674770236015, + "learning_rate": 7.018920100222122e-06, + "loss": 0.0219, + "step": 31836 + }, + { + "epoch": 0.63676, + "grad_norm": 0.12472227215766907, + "learning_rate": 7.017587350977923e-06, + "loss": 0.0036, + "step": 31838 + }, + { + "epoch": 0.6368, + "grad_norm": 0.14714384078979492, + "learning_rate": 7.016254659877398e-06, + "loss": 0.3387, + "step": 31840 + }, + { + "epoch": 0.63684, + "grad_norm": 0.08086532354354858, + "learning_rate": 7.014922026946526e-06, + "loss": 0.016, + "step": 31842 + }, + { + "epoch": 0.63688, + "grad_norm": 0.25800228118896484, + "learning_rate": 7.01358945221128e-06, + "loss": 0.0043, + "step": 31844 + }, + { + "epoch": 0.63692, + "grad_norm": 0.4211122691631317, + "learning_rate": 7.012256935697652e-06, + "loss": 0.0058, + "step": 31846 + }, + { + "epoch": 0.63696, + "grad_norm": 0.21847596764564514, + "learning_rate": 7.010924477431609e-06, + "loss": 0.0028, + "step": 31848 + }, + { + "epoch": 0.637, + "grad_norm": 2.3227591514587402, + "learning_rate": 7.009592077439135e-06, + "loss": 0.0245, + "step": 31850 + }, + { + "epoch": 0.63704, + "grad_norm": 5.170724868774414, + "learning_rate": 7.0082597357462035e-06, + "loss": 0.077, + "step": 31852 + }, + { + "epoch": 0.63708, + "grad_norm": 0.45555347204208374, + "learning_rate": 7.006927452378782e-06, + "loss": 0.0093, + "step": 31854 + }, + { + "epoch": 0.63712, + "grad_norm": 0.05284750834107399, + "learning_rate": 7.005595227362858e-06, + "loss": 0.0058, + "step": 31856 + }, + { + "epoch": 0.63716, + "grad_norm": 0.3703402280807495, + "learning_rate": 7.004263060724395e-06, + "loss": 0.0249, + "step": 31858 + }, + { + "epoch": 0.6372, + "grad_norm": 0.06403866410255432, + "learning_rate": 7.002930952489362e-06, + "loss": 0.0012, + "step": 31860 + }, + { + "epoch": 0.63724, + "grad_norm": 0.15859024226665497, + "learning_rate": 7.001598902683737e-06, + "loss": 0.0023, + "step": 31862 + }, + { + "epoch": 0.63728, + "grad_norm": 1.346355676651001, + "learning_rate": 7.000266911333484e-06, + "loss": 0.0139, + "step": 31864 + }, + { + "epoch": 0.63732, + "grad_norm": 0.17525161802768707, + "learning_rate": 6.9989349784645735e-06, + "loss": 0.0028, + "step": 31866 + }, + { + "epoch": 0.63736, + "grad_norm": 0.0022014465648680925, + "learning_rate": 6.997603104102971e-06, + "loss": 0.0011, + "step": 31868 + }, + { + "epoch": 0.6374, + "grad_norm": 0.09355635941028595, + "learning_rate": 6.996271288274636e-06, + "loss": 0.0012, + "step": 31870 + }, + { + "epoch": 0.63744, + "grad_norm": 0.029183300212025642, + "learning_rate": 6.994939531005545e-06, + "loss": 0.001, + "step": 31872 + }, + { + "epoch": 0.63748, + "grad_norm": 0.056891005486249924, + "learning_rate": 6.993607832321654e-06, + "loss": 0.0007, + "step": 31874 + }, + { + "epoch": 0.63752, + "grad_norm": 0.17575150728225708, + "learning_rate": 6.992276192248921e-06, + "loss": 0.0081, + "step": 31876 + }, + { + "epoch": 0.63756, + "grad_norm": 0.02329327166080475, + "learning_rate": 6.990944610813319e-06, + "loss": 0.002, + "step": 31878 + }, + { + "epoch": 0.6376, + "grad_norm": 0.05096748098731041, + "learning_rate": 6.9896130880407965e-06, + "loss": 0.1764, + "step": 31880 + }, + { + "epoch": 0.63764, + "grad_norm": 4.847313404083252, + "learning_rate": 6.98828162395732e-06, + "loss": 0.0654, + "step": 31882 + }, + { + "epoch": 0.63768, + "grad_norm": 1.7379165887832642, + "learning_rate": 6.986950218588842e-06, + "loss": 0.0356, + "step": 31884 + }, + { + "epoch": 0.63772, + "grad_norm": 11.491791725158691, + "learning_rate": 6.985618871961318e-06, + "loss": 0.3187, + "step": 31886 + }, + { + "epoch": 0.63776, + "grad_norm": 20.09892463684082, + "learning_rate": 6.98428758410071e-06, + "loss": 0.7134, + "step": 31888 + }, + { + "epoch": 0.6378, + "grad_norm": 0.008129633031785488, + "learning_rate": 6.982956355032968e-06, + "loss": 0.0338, + "step": 31890 + }, + { + "epoch": 0.63784, + "grad_norm": 0.212479829788208, + "learning_rate": 6.981625184784041e-06, + "loss": 0.2069, + "step": 31892 + }, + { + "epoch": 0.63788, + "grad_norm": 0.14605940878391266, + "learning_rate": 6.98029407337989e-06, + "loss": 0.0604, + "step": 31894 + }, + { + "epoch": 0.63792, + "grad_norm": 0.17446285486221313, + "learning_rate": 6.978963020846457e-06, + "loss": 0.0023, + "step": 31896 + }, + { + "epoch": 0.63796, + "grad_norm": 0.17962205410003662, + "learning_rate": 6.9776320272096976e-06, + "loss": 0.0034, + "step": 31898 + }, + { + "epoch": 0.638, + "grad_norm": 0.4617069959640503, + "learning_rate": 6.976301092495556e-06, + "loss": 0.0365, + "step": 31900 + }, + { + "epoch": 0.63804, + "grad_norm": 3.2209370136260986, + "learning_rate": 6.974970216729981e-06, + "loss": 0.047, + "step": 31902 + }, + { + "epoch": 0.63808, + "grad_norm": 0.178860142827034, + "learning_rate": 6.973639399938923e-06, + "loss": 0.0032, + "step": 31904 + }, + { + "epoch": 0.63812, + "grad_norm": 0.9910588264465332, + "learning_rate": 6.972308642148321e-06, + "loss": 0.0316, + "step": 31906 + }, + { + "epoch": 0.63816, + "grad_norm": 0.11491326987743378, + "learning_rate": 6.9709779433841176e-06, + "loss": 0.0039, + "step": 31908 + }, + { + "epoch": 0.6382, + "grad_norm": 2.7505226135253906, + "learning_rate": 6.969647303672262e-06, + "loss": 0.0884, + "step": 31910 + }, + { + "epoch": 0.63824, + "grad_norm": 0.35021764039993286, + "learning_rate": 6.968316723038689e-06, + "loss": 0.0079, + "step": 31912 + }, + { + "epoch": 0.63828, + "grad_norm": 0.6391382217407227, + "learning_rate": 6.966986201509346e-06, + "loss": 0.0225, + "step": 31914 + }, + { + "epoch": 0.63832, + "grad_norm": 0.8867546319961548, + "learning_rate": 6.965655739110169e-06, + "loss": 0.0131, + "step": 31916 + }, + { + "epoch": 0.63836, + "grad_norm": 0.2090277373790741, + "learning_rate": 6.964325335867093e-06, + "loss": 0.003, + "step": 31918 + }, + { + "epoch": 0.6384, + "grad_norm": 11.058446884155273, + "learning_rate": 6.962994991806059e-06, + "loss": 0.7323, + "step": 31920 + }, + { + "epoch": 0.63844, + "grad_norm": 0.2809205949306488, + "learning_rate": 6.961664706953002e-06, + "loss": 0.0111, + "step": 31922 + }, + { + "epoch": 0.63848, + "grad_norm": 0.25639793276786804, + "learning_rate": 6.9603344813338505e-06, + "loss": 0.0053, + "step": 31924 + }, + { + "epoch": 0.63852, + "grad_norm": 0.05082498490810394, + "learning_rate": 6.959004314974549e-06, + "loss": 0.0041, + "step": 31926 + }, + { + "epoch": 0.63856, + "grad_norm": 0.8294826745986938, + "learning_rate": 6.957674207901018e-06, + "loss": 0.0109, + "step": 31928 + }, + { + "epoch": 0.6386, + "grad_norm": 0.038510579615831375, + "learning_rate": 6.956344160139201e-06, + "loss": 0.0029, + "step": 31930 + }, + { + "epoch": 0.63864, + "grad_norm": 11.096923828125, + "learning_rate": 6.95501417171502e-06, + "loss": 0.1528, + "step": 31932 + }, + { + "epoch": 0.63868, + "grad_norm": 0.05340830236673355, + "learning_rate": 6.9536842426544035e-06, + "loss": 0.0041, + "step": 31934 + }, + { + "epoch": 0.63872, + "grad_norm": 0.0682784840464592, + "learning_rate": 6.952354372983283e-06, + "loss": 0.0157, + "step": 31936 + }, + { + "epoch": 0.63876, + "grad_norm": 0.0503353476524353, + "learning_rate": 6.951024562727584e-06, + "loss": 0.0021, + "step": 31938 + }, + { + "epoch": 0.6388, + "grad_norm": 0.31730127334594727, + "learning_rate": 6.949694811913226e-06, + "loss": 0.0091, + "step": 31940 + }, + { + "epoch": 0.63884, + "grad_norm": 0.029681818559765816, + "learning_rate": 6.948365120566143e-06, + "loss": 0.0025, + "step": 31942 + }, + { + "epoch": 0.63888, + "grad_norm": 0.16900725662708282, + "learning_rate": 6.9470354887122485e-06, + "loss": 0.0063, + "step": 31944 + }, + { + "epoch": 0.63892, + "grad_norm": 0.1775830090045929, + "learning_rate": 6.945705916377472e-06, + "loss": 0.0026, + "step": 31946 + }, + { + "epoch": 0.63896, + "grad_norm": 0.005051844287663698, + "learning_rate": 6.944376403587732e-06, + "loss": 0.0035, + "step": 31948 + }, + { + "epoch": 0.639, + "grad_norm": 0.30118802189826965, + "learning_rate": 6.943046950368944e-06, + "loss": 0.0137, + "step": 31950 + }, + { + "epoch": 0.63904, + "grad_norm": 0.0026517517399042845, + "learning_rate": 6.941717556747033e-06, + "loss": 0.0026, + "step": 31952 + }, + { + "epoch": 0.63908, + "grad_norm": 0.05140574276447296, + "learning_rate": 6.940388222747908e-06, + "loss": 0.003, + "step": 31954 + }, + { + "epoch": 0.63912, + "grad_norm": 0.060657452791929245, + "learning_rate": 6.939058948397495e-06, + "loss": 0.0014, + "step": 31956 + }, + { + "epoch": 0.63916, + "grad_norm": 0.057694897055625916, + "learning_rate": 6.937729733721702e-06, + "loss": 0.0021, + "step": 31958 + }, + { + "epoch": 0.6392, + "grad_norm": 9.222054481506348, + "learning_rate": 6.9364005787464406e-06, + "loss": 0.1632, + "step": 31960 + }, + { + "epoch": 0.63924, + "grad_norm": 0.018954796716570854, + "learning_rate": 6.935071483497633e-06, + "loss": 0.0003, + "step": 31962 + }, + { + "epoch": 0.63928, + "grad_norm": 0.37908267974853516, + "learning_rate": 6.9337424480011825e-06, + "loss": 0.0055, + "step": 31964 + }, + { + "epoch": 0.63932, + "grad_norm": 0.05370325222611427, + "learning_rate": 6.932413472283002e-06, + "loss": 0.0018, + "step": 31966 + }, + { + "epoch": 0.63936, + "grad_norm": 0.27595558762550354, + "learning_rate": 6.931084556368998e-06, + "loss": 0.0048, + "step": 31968 + }, + { + "epoch": 0.6394, + "grad_norm": 0.4113791584968567, + "learning_rate": 6.929755700285082e-06, + "loss": 0.005, + "step": 31970 + }, + { + "epoch": 0.63944, + "grad_norm": 0.1273205727338791, + "learning_rate": 6.928426904057159e-06, + "loss": 0.008, + "step": 31972 + }, + { + "epoch": 0.63948, + "grad_norm": 0.3907381296157837, + "learning_rate": 6.9270981677111375e-06, + "loss": 0.0116, + "step": 31974 + }, + { + "epoch": 0.63952, + "grad_norm": 0.827637791633606, + "learning_rate": 6.925769491272913e-06, + "loss": 0.0184, + "step": 31976 + }, + { + "epoch": 0.63956, + "grad_norm": 0.018291203305125237, + "learning_rate": 6.924440874768401e-06, + "loss": 0.0033, + "step": 31978 + }, + { + "epoch": 0.6396, + "grad_norm": 0.508056640625, + "learning_rate": 6.923112318223497e-06, + "loss": 0.04, + "step": 31980 + }, + { + "epoch": 0.63964, + "grad_norm": 0.07021445780992508, + "learning_rate": 6.9217838216640976e-06, + "loss": 0.0048, + "step": 31982 + }, + { + "epoch": 0.63968, + "grad_norm": 0.1037091314792633, + "learning_rate": 6.920455385116112e-06, + "loss": 0.0021, + "step": 31984 + }, + { + "epoch": 0.63972, + "grad_norm": 0.17846162617206573, + "learning_rate": 6.919127008605432e-06, + "loss": 0.0026, + "step": 31986 + }, + { + "epoch": 0.63976, + "grad_norm": 0.5708821415901184, + "learning_rate": 6.917798692157959e-06, + "loss": 0.0132, + "step": 31988 + }, + { + "epoch": 0.6398, + "grad_norm": 0.23201200366020203, + "learning_rate": 6.9164704357995874e-06, + "loss": 0.0216, + "step": 31990 + }, + { + "epoch": 0.63984, + "grad_norm": 0.15237456560134888, + "learning_rate": 6.915142239556208e-06, + "loss": 0.0085, + "step": 31992 + }, + { + "epoch": 0.63988, + "grad_norm": 0.6184283494949341, + "learning_rate": 6.913814103453723e-06, + "loss": 0.0096, + "step": 31994 + }, + { + "epoch": 0.63992, + "grad_norm": 10.947364807128906, + "learning_rate": 6.912486027518021e-06, + "loss": 0.301, + "step": 31996 + }, + { + "epoch": 0.63996, + "grad_norm": 0.02572430856525898, + "learning_rate": 6.91115801177499e-06, + "loss": 0.0009, + "step": 31998 + }, + { + "epoch": 0.64, + "grad_norm": 0.06973595917224884, + "learning_rate": 6.909830056250527e-06, + "loss": 0.0038, + "step": 32000 + }, + { + "epoch": 0.64004, + "grad_norm": 0.16101861000061035, + "learning_rate": 6.908502160970516e-06, + "loss": 0.0035, + "step": 32002 + }, + { + "epoch": 0.64008, + "grad_norm": 0.30064132809638977, + "learning_rate": 6.907174325960849e-06, + "loss": 0.0504, + "step": 32004 + }, + { + "epoch": 0.64012, + "grad_norm": 0.20710352063179016, + "learning_rate": 6.9058465512474116e-06, + "loss": 0.0667, + "step": 32006 + }, + { + "epoch": 0.64016, + "grad_norm": 0.33833226561546326, + "learning_rate": 6.904518836856084e-06, + "loss": 0.0079, + "step": 32008 + }, + { + "epoch": 0.6402, + "grad_norm": 0.11791830509901047, + "learning_rate": 6.903191182812759e-06, + "loss": 0.0059, + "step": 32010 + }, + { + "epoch": 0.64024, + "grad_norm": 6.4234938621521, + "learning_rate": 6.901863589143317e-06, + "loss": 0.0795, + "step": 32012 + }, + { + "epoch": 0.64028, + "grad_norm": 0.17394407093524933, + "learning_rate": 6.900536055873634e-06, + "loss": 0.0022, + "step": 32014 + }, + { + "epoch": 0.64032, + "grad_norm": 0.04040106385946274, + "learning_rate": 6.8992085830296015e-06, + "loss": 0.0008, + "step": 32016 + }, + { + "epoch": 0.64036, + "grad_norm": 0.239081010222435, + "learning_rate": 6.897881170637093e-06, + "loss": 0.4136, + "step": 32018 + }, + { + "epoch": 0.6404, + "grad_norm": 0.3230477571487427, + "learning_rate": 6.896553818721989e-06, + "loss": 0.0039, + "step": 32020 + }, + { + "epoch": 0.64044, + "grad_norm": 5.5188798904418945, + "learning_rate": 6.895226527310166e-06, + "loss": 0.3138, + "step": 32022 + }, + { + "epoch": 0.64048, + "grad_norm": 0.344985693693161, + "learning_rate": 6.893899296427497e-06, + "loss": 0.0112, + "step": 32024 + }, + { + "epoch": 0.64052, + "grad_norm": 0.09448330104351044, + "learning_rate": 6.892572126099863e-06, + "loss": 0.0016, + "step": 32026 + }, + { + "epoch": 0.64056, + "grad_norm": 0.4432951807975769, + "learning_rate": 6.8912450163531365e-06, + "loss": 0.0646, + "step": 32028 + }, + { + "epoch": 0.6406, + "grad_norm": 0.14938752353191376, + "learning_rate": 6.889917967213184e-06, + "loss": 0.0022, + "step": 32030 + }, + { + "epoch": 0.64064, + "grad_norm": 0.1278923749923706, + "learning_rate": 6.888590978705887e-06, + "loss": 0.0041, + "step": 32032 + }, + { + "epoch": 0.64068, + "grad_norm": 0.38677433133125305, + "learning_rate": 6.8872640508571075e-06, + "loss": 0.0072, + "step": 32034 + }, + { + "epoch": 0.64072, + "grad_norm": 0.4866528809070587, + "learning_rate": 6.885937183692718e-06, + "loss": 0.2087, + "step": 32036 + }, + { + "epoch": 0.64076, + "grad_norm": 0.319130539894104, + "learning_rate": 6.884610377238589e-06, + "loss": 0.0055, + "step": 32038 + }, + { + "epoch": 0.6408, + "grad_norm": 0.07391074299812317, + "learning_rate": 6.883283631520582e-06, + "loss": 0.0019, + "step": 32040 + }, + { + "epoch": 0.64084, + "grad_norm": 0.4733890891075134, + "learning_rate": 6.881956946564568e-06, + "loss": 0.0074, + "step": 32042 + }, + { + "epoch": 0.64088, + "grad_norm": 2.4281833171844482, + "learning_rate": 6.880630322396403e-06, + "loss": 0.0283, + "step": 32044 + }, + { + "epoch": 0.64092, + "grad_norm": 5.087307929992676, + "learning_rate": 6.879303759041962e-06, + "loss": 0.1626, + "step": 32046 + }, + { + "epoch": 0.64096, + "grad_norm": 0.030993325635790825, + "learning_rate": 6.8779772565271e-06, + "loss": 0.0047, + "step": 32048 + }, + { + "epoch": 0.641, + "grad_norm": 0.4456462860107422, + "learning_rate": 6.876650814877675e-06, + "loss": 0.0364, + "step": 32050 + }, + { + "epoch": 0.64104, + "grad_norm": 0.603700578212738, + "learning_rate": 6.875324434119552e-06, + "loss": 0.0084, + "step": 32052 + }, + { + "epoch": 0.64108, + "grad_norm": 0.32716992497444153, + "learning_rate": 6.87399811427859e-06, + "loss": 0.0048, + "step": 32054 + }, + { + "epoch": 0.64112, + "grad_norm": 1.2314534187316895, + "learning_rate": 6.872671855380642e-06, + "loss": 0.0339, + "step": 32056 + }, + { + "epoch": 0.64116, + "grad_norm": 0.436275452375412, + "learning_rate": 6.871345657451569e-06, + "loss": 0.0138, + "step": 32058 + }, + { + "epoch": 0.6412, + "grad_norm": 0.3382951021194458, + "learning_rate": 6.870019520517217e-06, + "loss": 0.0072, + "step": 32060 + }, + { + "epoch": 0.64124, + "grad_norm": 0.9156345129013062, + "learning_rate": 6.868693444603451e-06, + "loss": 0.0641, + "step": 32062 + }, + { + "epoch": 0.64128, + "grad_norm": 2.1581578254699707, + "learning_rate": 6.867367429736119e-06, + "loss": 0.0288, + "step": 32064 + }, + { + "epoch": 0.64132, + "grad_norm": 0.059873975813388824, + "learning_rate": 6.866041475941068e-06, + "loss": 0.0007, + "step": 32066 + }, + { + "epoch": 0.64136, + "grad_norm": 0.24000641703605652, + "learning_rate": 6.864715583244155e-06, + "loss": 0.0033, + "step": 32068 + }, + { + "epoch": 0.6414, + "grad_norm": 0.5594494938850403, + "learning_rate": 6.863389751671225e-06, + "loss": 0.76, + "step": 32070 + }, + { + "epoch": 0.64144, + "grad_norm": 1.116856336593628, + "learning_rate": 6.862063981248126e-06, + "loss": 0.0128, + "step": 32072 + }, + { + "epoch": 0.64148, + "grad_norm": 1.6152119636535645, + "learning_rate": 6.8607382720007066e-06, + "loss": 0.0271, + "step": 32074 + }, + { + "epoch": 0.64152, + "grad_norm": 0.12597328424453735, + "learning_rate": 6.859412623954807e-06, + "loss": 0.0021, + "step": 32076 + }, + { + "epoch": 0.64156, + "grad_norm": 13.934225082397461, + "learning_rate": 6.858087037136281e-06, + "loss": 0.5494, + "step": 32078 + }, + { + "epoch": 0.6416, + "grad_norm": 0.3319699466228485, + "learning_rate": 6.856761511570963e-06, + "loss": 0.0077, + "step": 32080 + }, + { + "epoch": 0.64164, + "grad_norm": 0.07579215615987778, + "learning_rate": 6.855436047284693e-06, + "loss": 0.0044, + "step": 32082 + }, + { + "epoch": 0.64168, + "grad_norm": 0.03917817771434784, + "learning_rate": 6.8541106443033236e-06, + "loss": 0.0061, + "step": 32084 + }, + { + "epoch": 0.64172, + "grad_norm": 2.4408161640167236, + "learning_rate": 6.852785302652685e-06, + "loss": 0.0485, + "step": 32086 + }, + { + "epoch": 0.64176, + "grad_norm": 0.013590146787464619, + "learning_rate": 6.851460022358615e-06, + "loss": 0.001, + "step": 32088 + }, + { + "epoch": 0.6418, + "grad_norm": 1.391179084777832, + "learning_rate": 6.850134803446955e-06, + "loss": 0.2252, + "step": 32090 + }, + { + "epoch": 0.64184, + "grad_norm": 0.6332449913024902, + "learning_rate": 6.848809645943535e-06, + "loss": 0.0105, + "step": 32092 + }, + { + "epoch": 0.64188, + "grad_norm": 0.743232786655426, + "learning_rate": 6.847484549874197e-06, + "loss": 0.0139, + "step": 32094 + }, + { + "epoch": 0.64192, + "grad_norm": 0.3410627245903015, + "learning_rate": 6.846159515264769e-06, + "loss": 0.0062, + "step": 32096 + }, + { + "epoch": 0.64196, + "grad_norm": 0.11710008233785629, + "learning_rate": 6.844834542141084e-06, + "loss": 0.0353, + "step": 32098 + }, + { + "epoch": 0.642, + "grad_norm": 5.455016613006592, + "learning_rate": 6.843509630528977e-06, + "loss": 0.1034, + "step": 32100 + }, + { + "epoch": 0.64204, + "grad_norm": 0.34286507964134216, + "learning_rate": 6.842184780454274e-06, + "loss": 0.0058, + "step": 32102 + }, + { + "epoch": 0.64208, + "grad_norm": 0.012364926747977734, + "learning_rate": 6.840859991942801e-06, + "loss": 0.0136, + "step": 32104 + }, + { + "epoch": 0.64212, + "grad_norm": 0.27808964252471924, + "learning_rate": 6.839535265020393e-06, + "loss": 0.0076, + "step": 32106 + }, + { + "epoch": 0.64216, + "grad_norm": 0.22462518513202667, + "learning_rate": 6.838210599712869e-06, + "loss": 0.0407, + "step": 32108 + }, + { + "epoch": 0.6422, + "grad_norm": 0.783320963382721, + "learning_rate": 6.836885996046061e-06, + "loss": 0.4023, + "step": 32110 + }, + { + "epoch": 0.64224, + "grad_norm": 0.8324812650680542, + "learning_rate": 6.8355614540457885e-06, + "loss": 0.0117, + "step": 32112 + }, + { + "epoch": 0.64228, + "grad_norm": 0.11952342092990875, + "learning_rate": 6.834236973737869e-06, + "loss": 0.0036, + "step": 32114 + }, + { + "epoch": 0.64232, + "grad_norm": 0.11859772354364395, + "learning_rate": 6.832912555148136e-06, + "loss": 0.011, + "step": 32116 + }, + { + "epoch": 0.64236, + "grad_norm": 8.958109855651855, + "learning_rate": 6.8315881983024015e-06, + "loss": 0.1271, + "step": 32118 + }, + { + "epoch": 0.6424, + "grad_norm": 0.05099139362573624, + "learning_rate": 6.830263903226483e-06, + "loss": 0.0012, + "step": 32120 + }, + { + "epoch": 0.64244, + "grad_norm": 0.1529378741979599, + "learning_rate": 6.828939669946205e-06, + "loss": 0.0043, + "step": 32122 + }, + { + "epoch": 0.64248, + "grad_norm": 0.3658531904220581, + "learning_rate": 6.827615498487379e-06, + "loss": 0.0073, + "step": 32124 + }, + { + "epoch": 0.64252, + "grad_norm": 6.076254367828369, + "learning_rate": 6.826291388875825e-06, + "loss": 0.0948, + "step": 32126 + }, + { + "epoch": 0.64256, + "grad_norm": 0.38768643140792847, + "learning_rate": 6.824967341137353e-06, + "loss": 0.0041, + "step": 32128 + }, + { + "epoch": 0.6426, + "grad_norm": 0.09495265036821365, + "learning_rate": 6.823643355297774e-06, + "loss": 0.0974, + "step": 32130 + }, + { + "epoch": 0.64264, + "grad_norm": 0.6679491400718689, + "learning_rate": 6.822319431382907e-06, + "loss": 0.0085, + "step": 32132 + }, + { + "epoch": 0.64268, + "grad_norm": 0.31558331847190857, + "learning_rate": 6.820995569418558e-06, + "loss": 0.0054, + "step": 32134 + }, + { + "epoch": 0.64272, + "grad_norm": 0.24929150938987732, + "learning_rate": 6.819671769430534e-06, + "loss": 0.0043, + "step": 32136 + }, + { + "epoch": 0.64276, + "grad_norm": 0.7778469920158386, + "learning_rate": 6.818348031444651e-06, + "loss": 0.0409, + "step": 32138 + }, + { + "epoch": 0.6428, + "grad_norm": 0.26907435059547424, + "learning_rate": 6.8170243554867065e-06, + "loss": 0.0918, + "step": 32140 + }, + { + "epoch": 0.64284, + "grad_norm": 0.04331982508301735, + "learning_rate": 6.815700741582514e-06, + "loss": 0.0123, + "step": 32142 + }, + { + "epoch": 0.64288, + "grad_norm": 0.1453785002231598, + "learning_rate": 6.814377189757876e-06, + "loss": 0.011, + "step": 32144 + }, + { + "epoch": 0.64292, + "grad_norm": 0.10468646883964539, + "learning_rate": 6.81305370003859e-06, + "loss": 0.018, + "step": 32146 + }, + { + "epoch": 0.64296, + "grad_norm": 0.09122520685195923, + "learning_rate": 6.8117302724504674e-06, + "loss": 0.0085, + "step": 32148 + }, + { + "epoch": 0.643, + "grad_norm": 0.38296860456466675, + "learning_rate": 6.8104069070193e-06, + "loss": 0.0049, + "step": 32150 + }, + { + "epoch": 0.64304, + "grad_norm": 0.10058281570672989, + "learning_rate": 6.809083603770896e-06, + "loss": 0.0011, + "step": 32152 + }, + { + "epoch": 0.64308, + "grad_norm": 0.4103845953941345, + "learning_rate": 6.80776036273105e-06, + "loss": 0.0061, + "step": 32154 + }, + { + "epoch": 0.64312, + "grad_norm": 0.2327689528465271, + "learning_rate": 6.806437183925557e-06, + "loss": 0.0035, + "step": 32156 + }, + { + "epoch": 0.64316, + "grad_norm": 0.3270370662212372, + "learning_rate": 6.805114067380218e-06, + "loss": 0.0066, + "step": 32158 + }, + { + "epoch": 0.6432, + "grad_norm": 0.24136927723884583, + "learning_rate": 6.803791013120822e-06, + "loss": 0.0033, + "step": 32160 + }, + { + "epoch": 0.64324, + "grad_norm": 0.20677773654460907, + "learning_rate": 6.802468021173166e-06, + "loss": 0.0117, + "step": 32162 + }, + { + "epoch": 0.64328, + "grad_norm": 12.079313278198242, + "learning_rate": 6.801145091563045e-06, + "loss": 0.5884, + "step": 32164 + }, + { + "epoch": 0.64332, + "grad_norm": 0.04707193374633789, + "learning_rate": 6.799822224316243e-06, + "loss": 0.0077, + "step": 32166 + }, + { + "epoch": 0.64336, + "grad_norm": 0.020915113389492035, + "learning_rate": 6.798499419458557e-06, + "loss": 0.0004, + "step": 32168 + }, + { + "epoch": 0.6434, + "grad_norm": 0.8364986777305603, + "learning_rate": 6.797176677015775e-06, + "loss": 0.0306, + "step": 32170 + }, + { + "epoch": 0.64344, + "grad_norm": 0.13633333146572113, + "learning_rate": 6.795853997013677e-06, + "loss": 0.0021, + "step": 32172 + }, + { + "epoch": 0.64348, + "grad_norm": 0.04886246845126152, + "learning_rate": 6.79453137947806e-06, + "loss": 0.0015, + "step": 32174 + }, + { + "epoch": 0.64352, + "grad_norm": 0.10721350461244583, + "learning_rate": 6.793208824434705e-06, + "loss": 0.0055, + "step": 32176 + }, + { + "epoch": 0.64356, + "grad_norm": 0.13894210755825043, + "learning_rate": 6.7918863319093905e-06, + "loss": 0.0076, + "step": 32178 + }, + { + "epoch": 0.6436, + "grad_norm": 0.0810784101486206, + "learning_rate": 6.790563901927907e-06, + "loss": 0.0031, + "step": 32180 + }, + { + "epoch": 0.64364, + "grad_norm": 0.053505342453718185, + "learning_rate": 6.789241534516029e-06, + "loss": 0.0015, + "step": 32182 + }, + { + "epoch": 0.64368, + "grad_norm": 7.124048709869385, + "learning_rate": 6.787919229699543e-06, + "loss": 0.1916, + "step": 32184 + }, + { + "epoch": 0.64372, + "grad_norm": 0.030463451519608498, + "learning_rate": 6.786596987504226e-06, + "loss": 0.0014, + "step": 32186 + }, + { + "epoch": 0.64376, + "grad_norm": 0.025050003081560135, + "learning_rate": 6.78527480795585e-06, + "loss": 0.0107, + "step": 32188 + }, + { + "epoch": 0.6438, + "grad_norm": 0.026881398633122444, + "learning_rate": 6.783952691080203e-06, + "loss": 0.001, + "step": 32190 + }, + { + "epoch": 0.64384, + "grad_norm": 0.8623941540718079, + "learning_rate": 6.782630636903054e-06, + "loss": 0.019, + "step": 32192 + }, + { + "epoch": 0.64388, + "grad_norm": 0.027993522584438324, + "learning_rate": 6.781308645450173e-06, + "loss": 0.0272, + "step": 32194 + }, + { + "epoch": 0.64392, + "grad_norm": 8.7447509765625, + "learning_rate": 6.77998671674734e-06, + "loss": 0.2097, + "step": 32196 + }, + { + "epoch": 0.64396, + "grad_norm": 0.24774935841560364, + "learning_rate": 6.77866485082032e-06, + "loss": 0.036, + "step": 32198 + }, + { + "epoch": 0.644, + "grad_norm": 0.040240898728370667, + "learning_rate": 6.777343047694891e-06, + "loss": 0.0008, + "step": 32200 + }, + { + "epoch": 0.64404, + "grad_norm": 0.47879889607429504, + "learning_rate": 6.776021307396818e-06, + "loss": 0.0111, + "step": 32202 + }, + { + "epoch": 0.64408, + "grad_norm": 0.2919880151748657, + "learning_rate": 6.774699629951865e-06, + "loss": 0.0056, + "step": 32204 + }, + { + "epoch": 0.64412, + "grad_norm": 0.03791571408510208, + "learning_rate": 6.773378015385807e-06, + "loss": 0.001, + "step": 32206 + }, + { + "epoch": 0.64416, + "grad_norm": 0.2939360737800598, + "learning_rate": 6.772056463724408e-06, + "loss": 0.0202, + "step": 32208 + }, + { + "epoch": 0.6442, + "grad_norm": 0.9366205930709839, + "learning_rate": 6.770734974993427e-06, + "loss": 0.0281, + "step": 32210 + }, + { + "epoch": 0.64424, + "grad_norm": 0.012764387764036655, + "learning_rate": 6.769413549218632e-06, + "loss": 0.0065, + "step": 32212 + }, + { + "epoch": 0.64428, + "grad_norm": 0.11176375299692154, + "learning_rate": 6.768092186425779e-06, + "loss": 0.0479, + "step": 32214 + }, + { + "epoch": 0.64432, + "grad_norm": 0.23161794245243073, + "learning_rate": 6.766770886640637e-06, + "loss": 0.0138, + "step": 32216 + }, + { + "epoch": 0.64436, + "grad_norm": 0.1321943998336792, + "learning_rate": 6.76544964988896e-06, + "loss": 0.0022, + "step": 32218 + }, + { + "epoch": 0.6444, + "grad_norm": 2.55798602104187, + "learning_rate": 6.764128476196505e-06, + "loss": 0.0342, + "step": 32220 + }, + { + "epoch": 0.64444, + "grad_norm": 0.007022396195679903, + "learning_rate": 6.7628073655890345e-06, + "loss": 0.0012, + "step": 32222 + }, + { + "epoch": 0.64448, + "grad_norm": 0.15550200641155243, + "learning_rate": 6.7614863180923004e-06, + "loss": 0.01, + "step": 32224 + }, + { + "epoch": 0.64452, + "grad_norm": 0.3288436532020569, + "learning_rate": 6.760165333732057e-06, + "loss": 0.0836, + "step": 32226 + }, + { + "epoch": 0.64456, + "grad_norm": 0.014403628185391426, + "learning_rate": 6.758844412534058e-06, + "loss": 0.0005, + "step": 32228 + }, + { + "epoch": 0.6446, + "grad_norm": 0.032309819012880325, + "learning_rate": 6.757523554524056e-06, + "loss": 0.0136, + "step": 32230 + }, + { + "epoch": 0.64464, + "grad_norm": 0.04838347062468529, + "learning_rate": 6.756202759727804e-06, + "loss": 0.0382, + "step": 32232 + }, + { + "epoch": 0.64468, + "grad_norm": 0.03770815208554268, + "learning_rate": 6.754882028171048e-06, + "loss": 0.3552, + "step": 32234 + }, + { + "epoch": 0.64472, + "grad_norm": 0.049472376704216, + "learning_rate": 6.753561359879534e-06, + "loss": 0.4113, + "step": 32236 + }, + { + "epoch": 0.64476, + "grad_norm": 8.428031921386719, + "learning_rate": 6.752240754879017e-06, + "loss": 0.3537, + "step": 32238 + }, + { + "epoch": 0.6448, + "grad_norm": 0.8251115679740906, + "learning_rate": 6.750920213195238e-06, + "loss": 0.0153, + "step": 32240 + }, + { + "epoch": 0.64484, + "grad_norm": 0.5913460850715637, + "learning_rate": 6.749599734853939e-06, + "loss": 0.0059, + "step": 32242 + }, + { + "epoch": 0.64488, + "grad_norm": 0.262238085269928, + "learning_rate": 6.7482793198808705e-06, + "loss": 0.0033, + "step": 32244 + }, + { + "epoch": 0.64492, + "grad_norm": 0.16711513698101044, + "learning_rate": 6.7469589683017675e-06, + "loss": 0.0029, + "step": 32246 + }, + { + "epoch": 0.64496, + "grad_norm": 0.04749559983611107, + "learning_rate": 6.745638680142377e-06, + "loss": 0.0007, + "step": 32248 + }, + { + "epoch": 0.645, + "grad_norm": 0.00511857308447361, + "learning_rate": 6.744318455428436e-06, + "loss": 0.0024, + "step": 32250 + }, + { + "epoch": 0.64504, + "grad_norm": 0.05753978714346886, + "learning_rate": 6.7429982941856785e-06, + "loss": 0.0009, + "step": 32252 + }, + { + "epoch": 0.64508, + "grad_norm": 0.00047036196338012815, + "learning_rate": 6.741678196439852e-06, + "loss": 0.0019, + "step": 32254 + }, + { + "epoch": 0.64512, + "grad_norm": 1.5647906064987183, + "learning_rate": 6.74035816221668e-06, + "loss": 0.0228, + "step": 32256 + }, + { + "epoch": 0.64516, + "grad_norm": 0.03826628625392914, + "learning_rate": 6.739038191541909e-06, + "loss": 0.0024, + "step": 32258 + }, + { + "epoch": 0.6452, + "grad_norm": 0.03477836400270462, + "learning_rate": 6.737718284441267e-06, + "loss": 0.1268, + "step": 32260 + }, + { + "epoch": 0.64524, + "grad_norm": 0.09459693729877472, + "learning_rate": 6.736398440940485e-06, + "loss": 0.0114, + "step": 32262 + }, + { + "epoch": 0.64528, + "grad_norm": 0.07409579306840897, + "learning_rate": 6.7350786610652974e-06, + "loss": 0.004, + "step": 32264 + }, + { + "epoch": 0.64532, + "grad_norm": 0.09443586319684982, + "learning_rate": 6.733758944841431e-06, + "loss": 0.0014, + "step": 32266 + }, + { + "epoch": 0.64536, + "grad_norm": 0.037624724209308624, + "learning_rate": 6.732439292294612e-06, + "loss": 0.0879, + "step": 32268 + }, + { + "epoch": 0.6454, + "grad_norm": 0.004942172206938267, + "learning_rate": 6.731119703450577e-06, + "loss": 0.0239, + "step": 32270 + }, + { + "epoch": 0.64544, + "grad_norm": 0.04893188551068306, + "learning_rate": 6.729800178335042e-06, + "loss": 0.0025, + "step": 32272 + }, + { + "epoch": 0.64548, + "grad_norm": 0.14896070957183838, + "learning_rate": 6.728480716973739e-06, + "loss": 0.0037, + "step": 32274 + }, + { + "epoch": 0.64552, + "grad_norm": 0.03483813628554344, + "learning_rate": 6.727161319392388e-06, + "loss": 0.0014, + "step": 32276 + }, + { + "epoch": 0.64556, + "grad_norm": 0.030963564291596413, + "learning_rate": 6.725841985616712e-06, + "loss": 0.001, + "step": 32278 + }, + { + "epoch": 0.6456, + "grad_norm": 0.143478661775589, + "learning_rate": 6.7245227156724324e-06, + "loss": 0.5504, + "step": 32280 + }, + { + "epoch": 0.64564, + "grad_norm": 0.06486771255731583, + "learning_rate": 6.72320350958527e-06, + "loss": 0.0127, + "step": 32282 + }, + { + "epoch": 0.64568, + "grad_norm": 0.011603477410972118, + "learning_rate": 6.7218843673809375e-06, + "loss": 0.0036, + "step": 32284 + }, + { + "epoch": 0.64572, + "grad_norm": 0.2518036961555481, + "learning_rate": 6.720565289085161e-06, + "loss": 0.003, + "step": 32286 + }, + { + "epoch": 0.64576, + "grad_norm": 0.1623043268918991, + "learning_rate": 6.71924627472365e-06, + "loss": 0.2364, + "step": 32288 + }, + { + "epoch": 0.6458, + "grad_norm": 0.4583805501461029, + "learning_rate": 6.717927324322124e-06, + "loss": 0.0129, + "step": 32290 + }, + { + "epoch": 0.64584, + "grad_norm": 0.1949196755886078, + "learning_rate": 6.716608437906296e-06, + "loss": 0.0046, + "step": 32292 + }, + { + "epoch": 0.64588, + "grad_norm": 0.41805773973464966, + "learning_rate": 6.715289615501875e-06, + "loss": 0.0047, + "step": 32294 + }, + { + "epoch": 0.64592, + "grad_norm": 0.013370327651500702, + "learning_rate": 6.713970857134574e-06, + "loss": 0.0029, + "step": 32296 + }, + { + "epoch": 0.64596, + "grad_norm": 0.00443869736045599, + "learning_rate": 6.7126521628301025e-06, + "loss": 0.0017, + "step": 32298 + }, + { + "epoch": 0.646, + "grad_norm": 0.31110715866088867, + "learning_rate": 6.711333532614168e-06, + "loss": 0.0061, + "step": 32300 + }, + { + "epoch": 0.64604, + "grad_norm": 0.07692176103591919, + "learning_rate": 6.710014966512483e-06, + "loss": 0.0073, + "step": 32302 + }, + { + "epoch": 0.64608, + "grad_norm": 1.8580824136734009, + "learning_rate": 6.708696464550746e-06, + "loss": 0.0213, + "step": 32304 + }, + { + "epoch": 0.64612, + "grad_norm": 0.606857419013977, + "learning_rate": 6.707378026754669e-06, + "loss": 0.0077, + "step": 32306 + }, + { + "epoch": 0.64616, + "grad_norm": 0.2149885594844818, + "learning_rate": 6.706059653149954e-06, + "loss": 0.017, + "step": 32308 + }, + { + "epoch": 0.6462, + "grad_norm": 0.638815701007843, + "learning_rate": 6.704741343762296e-06, + "loss": 0.0069, + "step": 32310 + }, + { + "epoch": 0.64624, + "grad_norm": 0.031950242817401886, + "learning_rate": 6.703423098617407e-06, + "loss": 0.0067, + "step": 32312 + }, + { + "epoch": 0.64628, + "grad_norm": 0.666752815246582, + "learning_rate": 6.7021049177409816e-06, + "loss": 0.0122, + "step": 32314 + }, + { + "epoch": 0.64632, + "grad_norm": 0.06883435696363449, + "learning_rate": 6.700786801158716e-06, + "loss": 0.0023, + "step": 32316 + }, + { + "epoch": 0.64636, + "grad_norm": 0.49343788623809814, + "learning_rate": 6.699468748896314e-06, + "loss": 0.0161, + "step": 32318 + }, + { + "epoch": 0.6464, + "grad_norm": 0.22003430128097534, + "learning_rate": 6.698150760979463e-06, + "loss": 0.0045, + "step": 32320 + }, + { + "epoch": 0.64644, + "grad_norm": 0.08714282512664795, + "learning_rate": 6.696832837433867e-06, + "loss": 0.0025, + "step": 32322 + }, + { + "epoch": 0.64648, + "grad_norm": 0.3214889168739319, + "learning_rate": 6.695514978285216e-06, + "loss": 0.0161, + "step": 32324 + }, + { + "epoch": 0.64652, + "grad_norm": 0.21270200610160828, + "learning_rate": 6.694197183559197e-06, + "loss": 0.0331, + "step": 32326 + }, + { + "epoch": 0.64656, + "grad_norm": 3.1974411010742188, + "learning_rate": 6.692879453281509e-06, + "loss": 0.0082, + "step": 32328 + }, + { + "epoch": 0.6466, + "grad_norm": 0.04198148474097252, + "learning_rate": 6.69156178747784e-06, + "loss": 0.0195, + "step": 32330 + }, + { + "epoch": 0.64664, + "grad_norm": 1.056466817855835, + "learning_rate": 6.6902441861738754e-06, + "loss": 0.0183, + "step": 32332 + }, + { + "epoch": 0.64668, + "grad_norm": 0.47067561745643616, + "learning_rate": 6.688926649395305e-06, + "loss": 0.0059, + "step": 32334 + }, + { + "epoch": 0.64672, + "grad_norm": 0.1292915642261505, + "learning_rate": 6.687609177167811e-06, + "loss": 0.0025, + "step": 32336 + }, + { + "epoch": 0.64676, + "grad_norm": 1.6575909852981567, + "learning_rate": 6.686291769517086e-06, + "loss": 0.0171, + "step": 32338 + }, + { + "epoch": 0.6468, + "grad_norm": 0.8179502487182617, + "learning_rate": 6.684974426468809e-06, + "loss": 0.0117, + "step": 32340 + }, + { + "epoch": 0.64684, + "grad_norm": 0.07027515769004822, + "learning_rate": 6.683657148048657e-06, + "loss": 0.1273, + "step": 32342 + }, + { + "epoch": 0.64688, + "grad_norm": 0.07443919032812119, + "learning_rate": 6.68233993428232e-06, + "loss": 0.0019, + "step": 32344 + }, + { + "epoch": 0.64692, + "grad_norm": 1.4602211713790894, + "learning_rate": 6.681022785195477e-06, + "loss": 0.0204, + "step": 32346 + }, + { + "epoch": 0.64696, + "grad_norm": 0.5940263271331787, + "learning_rate": 6.679705700813799e-06, + "loss": 0.01, + "step": 32348 + }, + { + "epoch": 0.647, + "grad_norm": 0.48532333970069885, + "learning_rate": 6.67838868116297e-06, + "loss": 0.0061, + "step": 32350 + }, + { + "epoch": 0.64704, + "grad_norm": 0.2694094777107239, + "learning_rate": 6.67707172626866e-06, + "loss": 0.1792, + "step": 32352 + }, + { + "epoch": 0.64708, + "grad_norm": 9.589007377624512, + "learning_rate": 6.675754836156552e-06, + "loss": 0.2196, + "step": 32354 + }, + { + "epoch": 0.64712, + "grad_norm": 0.10484237223863602, + "learning_rate": 6.674438010852313e-06, + "loss": 0.0661, + "step": 32356 + }, + { + "epoch": 0.64716, + "grad_norm": 0.18958362936973572, + "learning_rate": 6.673121250381616e-06, + "loss": 0.0033, + "step": 32358 + }, + { + "epoch": 0.6472, + "grad_norm": 0.37038686871528625, + "learning_rate": 6.671804554770135e-06, + "loss": 0.0108, + "step": 32360 + }, + { + "epoch": 0.64724, + "grad_norm": 0.19737105071544647, + "learning_rate": 6.670487924043536e-06, + "loss": 0.0037, + "step": 32362 + }, + { + "epoch": 0.64728, + "grad_norm": 0.45408836007118225, + "learning_rate": 6.669171358227491e-06, + "loss": 0.0049, + "step": 32364 + }, + { + "epoch": 0.64732, + "grad_norm": 0.0172322578728199, + "learning_rate": 6.667854857347664e-06, + "loss": 0.0241, + "step": 32366 + }, + { + "epoch": 0.64736, + "grad_norm": 0.3052011728286743, + "learning_rate": 6.666538421429721e-06, + "loss": 0.0089, + "step": 32368 + }, + { + "epoch": 0.6474, + "grad_norm": 0.009505480527877808, + "learning_rate": 6.6652220504993305e-06, + "loss": 0.0379, + "step": 32370 + }, + { + "epoch": 0.64744, + "grad_norm": 0.021774958819150925, + "learning_rate": 6.663905744582153e-06, + "loss": 0.0003, + "step": 32372 + }, + { + "epoch": 0.64748, + "grad_norm": 0.018501833081245422, + "learning_rate": 6.6625895037038455e-06, + "loss": 0.0007, + "step": 32374 + }, + { + "epoch": 0.64752, + "grad_norm": 0.25982940196990967, + "learning_rate": 6.661273327890079e-06, + "loss": 0.0058, + "step": 32376 + }, + { + "epoch": 0.64756, + "grad_norm": 0.1291324645280838, + "learning_rate": 6.659957217166504e-06, + "loss": 0.5897, + "step": 32378 + }, + { + "epoch": 0.6476, + "grad_norm": 0.5144178867340088, + "learning_rate": 6.658641171558785e-06, + "loss": 0.0055, + "step": 32380 + }, + { + "epoch": 0.64764, + "grad_norm": 0.13759197294712067, + "learning_rate": 6.657325191092578e-06, + "loss": 0.0015, + "step": 32382 + }, + { + "epoch": 0.64768, + "grad_norm": 0.02443574368953705, + "learning_rate": 6.6560092757935345e-06, + "loss": 0.0006, + "step": 32384 + }, + { + "epoch": 0.64772, + "grad_norm": 0.07659760117530823, + "learning_rate": 6.654693425687315e-06, + "loss": 0.0016, + "step": 32386 + }, + { + "epoch": 0.64776, + "grad_norm": 0.8026557564735413, + "learning_rate": 6.653377640799568e-06, + "loss": 0.01, + "step": 32388 + }, + { + "epoch": 0.6478, + "grad_norm": 0.03006008081138134, + "learning_rate": 6.6520619211559435e-06, + "loss": 0.0006, + "step": 32390 + }, + { + "epoch": 0.64784, + "grad_norm": 0.889630138874054, + "learning_rate": 6.6507462667821e-06, + "loss": 0.0137, + "step": 32392 + }, + { + "epoch": 0.64788, + "grad_norm": 0.03706235811114311, + "learning_rate": 6.649430677703677e-06, + "loss": 0.0013, + "step": 32394 + }, + { + "epoch": 0.64792, + "grad_norm": 0.03301604464650154, + "learning_rate": 6.6481151539463325e-06, + "loss": 0.0172, + "step": 32396 + }, + { + "epoch": 0.64796, + "grad_norm": 0.20211422443389893, + "learning_rate": 6.646799695535711e-06, + "loss": 0.3026, + "step": 32398 + }, + { + "epoch": 0.648, + "grad_norm": 0.10521315038204193, + "learning_rate": 6.645484302497452e-06, + "loss": 0.0049, + "step": 32400 + }, + { + "epoch": 0.64804, + "grad_norm": 0.10912155359983444, + "learning_rate": 6.6441689748572034e-06, + "loss": 0.236, + "step": 32402 + }, + { + "epoch": 0.64808, + "grad_norm": 0.002206355333328247, + "learning_rate": 6.642853712640611e-06, + "loss": 0.0524, + "step": 32404 + }, + { + "epoch": 0.64812, + "grad_norm": 10.124850273132324, + "learning_rate": 6.6415385158733095e-06, + "loss": 0.256, + "step": 32406 + }, + { + "epoch": 0.64816, + "grad_norm": 0.8947609066963196, + "learning_rate": 6.640223384580947e-06, + "loss": 0.3848, + "step": 32408 + }, + { + "epoch": 0.6482, + "grad_norm": 0.3996865451335907, + "learning_rate": 6.638908318789156e-06, + "loss": 0.0923, + "step": 32410 + }, + { + "epoch": 0.64824, + "grad_norm": 18.6572208404541, + "learning_rate": 6.637593318523581e-06, + "loss": 0.5785, + "step": 32412 + }, + { + "epoch": 0.64828, + "grad_norm": 0.046544142067432404, + "learning_rate": 6.636278383809855e-06, + "loss": 0.0009, + "step": 32414 + }, + { + "epoch": 0.64832, + "grad_norm": 0.569743275642395, + "learning_rate": 6.6349635146736135e-06, + "loss": 0.1442, + "step": 32416 + }, + { + "epoch": 0.64836, + "grad_norm": 0.051119573414325714, + "learning_rate": 6.633648711140491e-06, + "loss": 0.002, + "step": 32418 + }, + { + "epoch": 0.6484, + "grad_norm": 0.01987524889409542, + "learning_rate": 6.63233397323612e-06, + "loss": 0.0033, + "step": 32420 + }, + { + "epoch": 0.64844, + "grad_norm": 0.45700377225875854, + "learning_rate": 6.63101930098613e-06, + "loss": 0.0071, + "step": 32422 + }, + { + "epoch": 0.64848, + "grad_norm": 0.48314738273620605, + "learning_rate": 6.629704694416155e-06, + "loss": 0.005, + "step": 32424 + }, + { + "epoch": 0.64852, + "grad_norm": 0.688758909702301, + "learning_rate": 6.628390153551819e-06, + "loss": 0.0113, + "step": 32426 + }, + { + "epoch": 0.64856, + "grad_norm": 0.050479792058467865, + "learning_rate": 6.6270756784187575e-06, + "loss": 0.0025, + "step": 32428 + }, + { + "epoch": 0.6486, + "grad_norm": 0.1956561952829361, + "learning_rate": 6.62576126904259e-06, + "loss": 0.004, + "step": 32430 + }, + { + "epoch": 0.64864, + "grad_norm": 0.5447821617126465, + "learning_rate": 6.624446925448944e-06, + "loss": 0.0084, + "step": 32432 + }, + { + "epoch": 0.64868, + "grad_norm": 0.11575870960950851, + "learning_rate": 6.623132647663442e-06, + "loss": 0.013, + "step": 32434 + }, + { + "epoch": 0.64872, + "grad_norm": 0.030080217868089676, + "learning_rate": 6.621818435711709e-06, + "loss": 0.001, + "step": 32436 + }, + { + "epoch": 0.64876, + "grad_norm": 0.17292892932891846, + "learning_rate": 6.620504289619364e-06, + "loss": 0.0043, + "step": 32438 + }, + { + "epoch": 0.6488, + "grad_norm": 0.20301558077335358, + "learning_rate": 6.6191902094120295e-06, + "loss": 0.0743, + "step": 32440 + }, + { + "epoch": 0.64884, + "grad_norm": 7.655066967010498, + "learning_rate": 6.617876195115318e-06, + "loss": 0.1166, + "step": 32442 + }, + { + "epoch": 0.64888, + "grad_norm": 0.5371330380439758, + "learning_rate": 6.616562246754855e-06, + "loss": 0.0205, + "step": 32444 + }, + { + "epoch": 0.64892, + "grad_norm": 0.05590663477778435, + "learning_rate": 6.615248364356254e-06, + "loss": 0.0044, + "step": 32446 + }, + { + "epoch": 0.64896, + "grad_norm": 0.3170464038848877, + "learning_rate": 6.613934547945123e-06, + "loss": 0.0047, + "step": 32448 + }, + { + "epoch": 0.649, + "grad_norm": 0.03621755540370941, + "learning_rate": 6.612620797547087e-06, + "loss": 0.0023, + "step": 32450 + }, + { + "epoch": 0.64904, + "grad_norm": 0.05654457211494446, + "learning_rate": 6.611307113187753e-06, + "loss": 0.0156, + "step": 32452 + }, + { + "epoch": 0.64908, + "grad_norm": 0.009913759306073189, + "learning_rate": 6.609993494892727e-06, + "loss": 0.0004, + "step": 32454 + }, + { + "epoch": 0.64912, + "grad_norm": 0.022697702050209045, + "learning_rate": 6.608679942687626e-06, + "loss": 0.0007, + "step": 32456 + }, + { + "epoch": 0.64916, + "grad_norm": 2.469749689102173, + "learning_rate": 6.607366456598053e-06, + "loss": 0.0273, + "step": 32458 + }, + { + "epoch": 0.6492, + "grad_norm": 3.0359675884246826, + "learning_rate": 6.60605303664962e-06, + "loss": 0.0548, + "step": 32460 + }, + { + "epoch": 0.64924, + "grad_norm": 0.20818358659744263, + "learning_rate": 6.604739682867931e-06, + "loss": 0.2211, + "step": 32462 + }, + { + "epoch": 0.64928, + "grad_norm": 0.124291330575943, + "learning_rate": 6.603426395278585e-06, + "loss": 0.0014, + "step": 32464 + }, + { + "epoch": 0.64932, + "grad_norm": 0.11511200666427612, + "learning_rate": 6.602113173907195e-06, + "loss": 0.0045, + "step": 32466 + }, + { + "epoch": 0.64936, + "grad_norm": 0.058192454278469086, + "learning_rate": 6.600800018779356e-06, + "loss": 0.0798, + "step": 32468 + }, + { + "epoch": 0.6494, + "grad_norm": 0.11254055052995682, + "learning_rate": 6.5994869299206736e-06, + "loss": 0.005, + "step": 32470 + }, + { + "epoch": 0.64944, + "grad_norm": 0.07878486067056656, + "learning_rate": 6.598173907356742e-06, + "loss": 0.0024, + "step": 32472 + }, + { + "epoch": 0.64948, + "grad_norm": 0.23855990171432495, + "learning_rate": 6.596860951113158e-06, + "loss": 0.0042, + "step": 32474 + }, + { + "epoch": 0.64952, + "grad_norm": 0.15481939911842346, + "learning_rate": 6.595548061215526e-06, + "loss": 0.0208, + "step": 32476 + }, + { + "epoch": 0.64956, + "grad_norm": 0.5947263836860657, + "learning_rate": 6.594235237689439e-06, + "loss": 0.0092, + "step": 32478 + }, + { + "epoch": 0.6496, + "grad_norm": 0.04544331133365631, + "learning_rate": 6.5929224805604845e-06, + "loss": 0.0029, + "step": 32480 + }, + { + "epoch": 0.64964, + "grad_norm": 0.16106635332107544, + "learning_rate": 6.591609789854263e-06, + "loss": 0.0101, + "step": 32482 + }, + { + "epoch": 0.64968, + "grad_norm": 0.06686897575855255, + "learning_rate": 6.590297165596362e-06, + "loss": 0.0113, + "step": 32484 + }, + { + "epoch": 0.64972, + "grad_norm": 1.1148886680603027, + "learning_rate": 6.588984607812376e-06, + "loss": 0.0185, + "step": 32486 + }, + { + "epoch": 0.64976, + "grad_norm": 0.23590992391109467, + "learning_rate": 6.58767211652789e-06, + "loss": 0.0065, + "step": 32488 + }, + { + "epoch": 0.6498, + "grad_norm": 0.2041998952627182, + "learning_rate": 6.58635969176849e-06, + "loss": 0.0039, + "step": 32490 + }, + { + "epoch": 0.64984, + "grad_norm": 10.748553276062012, + "learning_rate": 6.585047333559769e-06, + "loss": 0.243, + "step": 32492 + }, + { + "epoch": 0.64988, + "grad_norm": 0.13699521124362946, + "learning_rate": 6.5837350419273085e-06, + "loss": 0.0043, + "step": 32494 + }, + { + "epoch": 0.64992, + "grad_norm": 14.702545166015625, + "learning_rate": 6.582422816896687e-06, + "loss": 0.0487, + "step": 32496 + }, + { + "epoch": 0.64996, + "grad_norm": 2.128756523132324, + "learning_rate": 6.581110658493497e-06, + "loss": 0.0265, + "step": 32498 + }, + { + "epoch": 0.65, + "grad_norm": 0.656535267829895, + "learning_rate": 6.579798566743314e-06, + "loss": 0.0128, + "step": 32500 + }, + { + "epoch": 0.65004, + "grad_norm": 0.3672715723514557, + "learning_rate": 6.578486541671717e-06, + "loss": 0.0038, + "step": 32502 + }, + { + "epoch": 0.65008, + "grad_norm": 1.1085790395736694, + "learning_rate": 6.577174583304289e-06, + "loss": 0.0171, + "step": 32504 + }, + { + "epoch": 0.65012, + "grad_norm": 0.28073811531066895, + "learning_rate": 6.575862691666603e-06, + "loss": 0.0066, + "step": 32506 + }, + { + "epoch": 0.65016, + "grad_norm": 0.8704543709754944, + "learning_rate": 6.574550866784238e-06, + "loss": 0.0082, + "step": 32508 + }, + { + "epoch": 0.6502, + "grad_norm": 0.14309829473495483, + "learning_rate": 6.573239108682769e-06, + "loss": 0.0076, + "step": 32510 + }, + { + "epoch": 0.65024, + "grad_norm": 0.1659983992576599, + "learning_rate": 6.571927417387762e-06, + "loss": 0.0063, + "step": 32512 + }, + { + "epoch": 0.65028, + "grad_norm": 0.06543391942977905, + "learning_rate": 6.570615792924799e-06, + "loss": 0.0014, + "step": 32514 + }, + { + "epoch": 0.65032, + "grad_norm": 0.20309627056121826, + "learning_rate": 6.569304235319443e-06, + "loss": 0.003, + "step": 32516 + }, + { + "epoch": 0.65036, + "grad_norm": 0.6179678440093994, + "learning_rate": 6.567992744597271e-06, + "loss": 0.0163, + "step": 32518 + }, + { + "epoch": 0.6504, + "grad_norm": 0.024490151554346085, + "learning_rate": 6.566681320783849e-06, + "loss": 0.0005, + "step": 32520 + }, + { + "epoch": 0.65044, + "grad_norm": 0.1954810917377472, + "learning_rate": 6.565369963904738e-06, + "loss": 0.0289, + "step": 32522 + }, + { + "epoch": 0.65048, + "grad_norm": 0.45914426445961, + "learning_rate": 6.564058673985512e-06, + "loss": 0.0085, + "step": 32524 + }, + { + "epoch": 0.65052, + "grad_norm": 0.09265535324811935, + "learning_rate": 6.56274745105173e-06, + "loss": 0.0034, + "step": 32526 + }, + { + "epoch": 0.65056, + "grad_norm": 0.2380063533782959, + "learning_rate": 6.561436295128951e-06, + "loss": 0.0037, + "step": 32528 + }, + { + "epoch": 0.6506, + "grad_norm": 0.17134565114974976, + "learning_rate": 6.560125206242746e-06, + "loss": 0.0021, + "step": 32530 + }, + { + "epoch": 0.65064, + "grad_norm": 3.573376417160034, + "learning_rate": 6.558814184418669e-06, + "loss": 0.0341, + "step": 32532 + }, + { + "epoch": 0.65068, + "grad_norm": 0.3246428370475769, + "learning_rate": 6.557503229682283e-06, + "loss": 0.0289, + "step": 32534 + }, + { + "epoch": 0.65072, + "grad_norm": 0.2821413576602936, + "learning_rate": 6.556192342059145e-06, + "loss": 0.0049, + "step": 32536 + }, + { + "epoch": 0.65076, + "grad_norm": 0.07549911737442017, + "learning_rate": 6.554881521574808e-06, + "loss": 0.0011, + "step": 32538 + }, + { + "epoch": 0.6508, + "grad_norm": 1.019721269607544, + "learning_rate": 6.553570768254831e-06, + "loss": 0.0143, + "step": 32540 + }, + { + "epoch": 0.65084, + "grad_norm": 14.828266143798828, + "learning_rate": 6.552260082124767e-06, + "loss": 0.3081, + "step": 32542 + }, + { + "epoch": 0.65088, + "grad_norm": 0.36637502908706665, + "learning_rate": 6.550949463210163e-06, + "loss": 0.0271, + "step": 32544 + }, + { + "epoch": 0.65092, + "grad_norm": 0.07029100507497787, + "learning_rate": 6.54963891153658e-06, + "loss": 0.0801, + "step": 32546 + }, + { + "epoch": 0.65096, + "grad_norm": 0.03793686628341675, + "learning_rate": 6.5483284271295574e-06, + "loss": 0.0095, + "step": 32548 + }, + { + "epoch": 0.651, + "grad_norm": 0.03069189004600048, + "learning_rate": 6.547018010014654e-06, + "loss": 0.002, + "step": 32550 + }, + { + "epoch": 0.65104, + "grad_norm": 1.0638121366500854, + "learning_rate": 6.545707660217413e-06, + "loss": 0.0181, + "step": 32552 + }, + { + "epoch": 0.65108, + "grad_norm": 1.1861242055892944, + "learning_rate": 6.544397377763376e-06, + "loss": 0.0316, + "step": 32554 + }, + { + "epoch": 0.65112, + "grad_norm": 1.4013395309448242, + "learning_rate": 6.543087162678094e-06, + "loss": 0.0661, + "step": 32556 + }, + { + "epoch": 0.65116, + "grad_norm": 0.17645515501499176, + "learning_rate": 6.541777014987103e-06, + "loss": 0.0091, + "step": 32558 + }, + { + "epoch": 0.6512, + "grad_norm": 0.06771417707204819, + "learning_rate": 6.540466934715953e-06, + "loss": 0.0017, + "step": 32560 + }, + { + "epoch": 0.65124, + "grad_norm": 0.19307170808315277, + "learning_rate": 6.5391569218901816e-06, + "loss": 0.1181, + "step": 32562 + }, + { + "epoch": 0.65128, + "grad_norm": 0.835790753364563, + "learning_rate": 6.5378469765353244e-06, + "loss": 0.0155, + "step": 32564 + }, + { + "epoch": 0.65132, + "grad_norm": 4.258443355560303, + "learning_rate": 6.536537098676927e-06, + "loss": 0.0883, + "step": 32566 + }, + { + "epoch": 0.65136, + "grad_norm": 8.692572593688965, + "learning_rate": 6.535227288340522e-06, + "loss": 0.2915, + "step": 32568 + }, + { + "epoch": 0.6514, + "grad_norm": 0.06608561426401138, + "learning_rate": 6.53391754555164e-06, + "loss": 0.0008, + "step": 32570 + }, + { + "epoch": 0.65144, + "grad_norm": 5.836001396179199, + "learning_rate": 6.532607870335824e-06, + "loss": 0.1078, + "step": 32572 + }, + { + "epoch": 0.65148, + "grad_norm": 9.341081619262695, + "learning_rate": 6.531298262718602e-06, + "loss": 0.0988, + "step": 32574 + }, + { + "epoch": 0.65152, + "grad_norm": 0.7622522711753845, + "learning_rate": 6.529988722725506e-06, + "loss": 0.0186, + "step": 32576 + }, + { + "epoch": 0.65156, + "grad_norm": 0.0032373431604355574, + "learning_rate": 6.5286792503820675e-06, + "loss": 0.4299, + "step": 32578 + }, + { + "epoch": 0.6516, + "grad_norm": 0.32812342047691345, + "learning_rate": 6.52736984571381e-06, + "loss": 0.0269, + "step": 32580 + }, + { + "epoch": 0.65164, + "grad_norm": 0.04761631786823273, + "learning_rate": 6.5260605087462695e-06, + "loss": 0.0037, + "step": 32582 + }, + { + "epoch": 0.65168, + "grad_norm": 0.15763039886951447, + "learning_rate": 6.524751239504968e-06, + "loss": 0.0078, + "step": 32584 + }, + { + "epoch": 0.65172, + "grad_norm": 1.3478422164916992, + "learning_rate": 6.5234420380154265e-06, + "loss": 0.0232, + "step": 32586 + }, + { + "epoch": 0.65176, + "grad_norm": 0.0447864830493927, + "learning_rate": 6.522132904303175e-06, + "loss": 0.0425, + "step": 32588 + }, + { + "epoch": 0.6518, + "grad_norm": 0.1464865505695343, + "learning_rate": 6.520823838393732e-06, + "loss": 0.0066, + "step": 32590 + }, + { + "epoch": 0.65184, + "grad_norm": 4.603998184204102, + "learning_rate": 6.519514840312621e-06, + "loss": 0.0502, + "step": 32592 + }, + { + "epoch": 0.65188, + "grad_norm": 0.12079712003469467, + "learning_rate": 6.51820591008536e-06, + "loss": 0.0118, + "step": 32594 + }, + { + "epoch": 0.65192, + "grad_norm": 0.1569289267063141, + "learning_rate": 6.5168970477374635e-06, + "loss": 0.0225, + "step": 32596 + }, + { + "epoch": 0.65196, + "grad_norm": 0.10653073340654373, + "learning_rate": 6.515588253294456e-06, + "loss": 0.2846, + "step": 32598 + }, + { + "epoch": 0.652, + "grad_norm": 0.06416107714176178, + "learning_rate": 6.5142795267818505e-06, + "loss": 0.0026, + "step": 32600 + }, + { + "epoch": 0.65204, + "grad_norm": 0.2661743462085724, + "learning_rate": 6.512970868225156e-06, + "loss": 0.0047, + "step": 32602 + }, + { + "epoch": 0.65208, + "grad_norm": 13.90140438079834, + "learning_rate": 6.511662277649893e-06, + "loss": 0.9183, + "step": 32604 + }, + { + "epoch": 0.65212, + "grad_norm": 0.3376246690750122, + "learning_rate": 6.510353755081568e-06, + "loss": 0.1094, + "step": 32606 + }, + { + "epoch": 0.65216, + "grad_norm": 0.01918553002178669, + "learning_rate": 6.509045300545695e-06, + "loss": 0.002, + "step": 32608 + }, + { + "epoch": 0.6522, + "grad_norm": 0.3737304210662842, + "learning_rate": 6.5077369140677815e-06, + "loss": 0.0058, + "step": 32610 + }, + { + "epoch": 0.65224, + "grad_norm": 0.5591306090354919, + "learning_rate": 6.506428595673331e-06, + "loss": 0.0065, + "step": 32612 + }, + { + "epoch": 0.65228, + "grad_norm": 3.3368382453918457, + "learning_rate": 6.505120345387857e-06, + "loss": 0.0733, + "step": 32614 + }, + { + "epoch": 0.65232, + "grad_norm": 0.15800483524799347, + "learning_rate": 6.503812163236861e-06, + "loss": 0.0233, + "step": 32616 + }, + { + "epoch": 0.65236, + "grad_norm": 0.2148776650428772, + "learning_rate": 6.5025040492458434e-06, + "loss": 0.0068, + "step": 32618 + }, + { + "epoch": 0.6524, + "grad_norm": 0.8530850410461426, + "learning_rate": 6.501196003440313e-06, + "loss": 0.0125, + "step": 32620 + }, + { + "epoch": 0.65244, + "grad_norm": 0.3335830569267273, + "learning_rate": 6.499888025845766e-06, + "loss": 0.0065, + "step": 32622 + }, + { + "epoch": 0.65248, + "grad_norm": 0.4917263388633728, + "learning_rate": 6.498580116487707e-06, + "loss": 0.0088, + "step": 32624 + }, + { + "epoch": 0.65252, + "grad_norm": 0.019136695191264153, + "learning_rate": 6.49727227539163e-06, + "loss": 0.0191, + "step": 32626 + }, + { + "epoch": 0.65256, + "grad_norm": 0.09503002464771271, + "learning_rate": 6.495964502583032e-06, + "loss": 0.0031, + "step": 32628 + }, + { + "epoch": 0.6526, + "grad_norm": 0.46820348501205444, + "learning_rate": 6.494656798087412e-06, + "loss": 0.0647, + "step": 32630 + }, + { + "epoch": 0.65264, + "grad_norm": 0.02362857758998871, + "learning_rate": 6.4933491619302625e-06, + "loss": 0.0037, + "step": 32632 + }, + { + "epoch": 0.65268, + "grad_norm": 0.5792891383171082, + "learning_rate": 6.4920415941370725e-06, + "loss": 0.0917, + "step": 32634 + }, + { + "epoch": 0.65272, + "grad_norm": 0.19566769897937775, + "learning_rate": 6.490734094733342e-06, + "loss": 0.0819, + "step": 32636 + }, + { + "epoch": 0.65276, + "grad_norm": 0.15111024677753448, + "learning_rate": 6.489426663744551e-06, + "loss": 0.0138, + "step": 32638 + }, + { + "epoch": 0.6528, + "grad_norm": 15.644721031188965, + "learning_rate": 6.488119301196201e-06, + "loss": 1.0031, + "step": 32640 + }, + { + "epoch": 0.65284, + "grad_norm": 0.021385278552770615, + "learning_rate": 6.486812007113772e-06, + "loss": 0.0006, + "step": 32642 + }, + { + "epoch": 0.65288, + "grad_norm": 0.1341344714164734, + "learning_rate": 6.485504781522751e-06, + "loss": 0.0019, + "step": 32644 + }, + { + "epoch": 0.65292, + "grad_norm": 0.16886453330516815, + "learning_rate": 6.484197624448625e-06, + "loss": 0.0032, + "step": 32646 + }, + { + "epoch": 0.65296, + "grad_norm": 1.3546996116638184, + "learning_rate": 6.482890535916876e-06, + "loss": 0.0204, + "step": 32648 + }, + { + "epoch": 0.653, + "grad_norm": 0.08774750679731369, + "learning_rate": 6.481583515952983e-06, + "loss": 0.0179, + "step": 32650 + }, + { + "epoch": 0.65304, + "grad_norm": 0.10432785004377365, + "learning_rate": 6.480276564582434e-06, + "loss": 0.0021, + "step": 32652 + }, + { + "epoch": 0.65308, + "grad_norm": 0.06583769619464874, + "learning_rate": 6.478969681830703e-06, + "loss": 0.009, + "step": 32654 + }, + { + "epoch": 0.65312, + "grad_norm": 0.015696030110120773, + "learning_rate": 6.477662867723274e-06, + "loss": 0.0472, + "step": 32656 + }, + { + "epoch": 0.65316, + "grad_norm": 0.27591371536254883, + "learning_rate": 6.47635612228562e-06, + "loss": 0.0041, + "step": 32658 + }, + { + "epoch": 0.6532, + "grad_norm": 0.8626949191093445, + "learning_rate": 6.475049445543215e-06, + "loss": 0.0132, + "step": 32660 + }, + { + "epoch": 0.65324, + "grad_norm": 0.2327229231595993, + "learning_rate": 6.473742837521539e-06, + "loss": 0.0095, + "step": 32662 + }, + { + "epoch": 0.65328, + "grad_norm": 0.19377672672271729, + "learning_rate": 6.472436298246059e-06, + "loss": 0.004, + "step": 32664 + }, + { + "epoch": 0.65332, + "grad_norm": 0.06089696288108826, + "learning_rate": 6.471129827742252e-06, + "loss": 0.0028, + "step": 32666 + }, + { + "epoch": 0.65336, + "grad_norm": 0.1486150473356247, + "learning_rate": 6.469823426035586e-06, + "loss": 0.0077, + "step": 32668 + }, + { + "epoch": 0.6534, + "grad_norm": 3.9483699798583984, + "learning_rate": 6.468517093151525e-06, + "loss": 0.0822, + "step": 32670 + }, + { + "epoch": 0.65344, + "grad_norm": 0.13921602070331573, + "learning_rate": 6.467210829115547e-06, + "loss": 0.0286, + "step": 32672 + }, + { + "epoch": 0.65348, + "grad_norm": 0.5211065411567688, + "learning_rate": 6.465904633953113e-06, + "loss": 0.0106, + "step": 32674 + }, + { + "epoch": 0.65352, + "grad_norm": 0.9719707369804382, + "learning_rate": 6.4645985076896835e-06, + "loss": 0.0212, + "step": 32676 + }, + { + "epoch": 0.65356, + "grad_norm": 0.024474168196320534, + "learning_rate": 6.463292450350729e-06, + "loss": 0.0012, + "step": 32678 + }, + { + "epoch": 0.6536, + "grad_norm": 1.3940963745117188, + "learning_rate": 6.461986461961706e-06, + "loss": 0.0144, + "step": 32680 + }, + { + "epoch": 0.65364, + "grad_norm": 0.019243808463215828, + "learning_rate": 6.460680542548081e-06, + "loss": 0.0019, + "step": 32682 + }, + { + "epoch": 0.65368, + "grad_norm": 0.04674278199672699, + "learning_rate": 6.4593746921353114e-06, + "loss": 0.0056, + "step": 32684 + }, + { + "epoch": 0.65372, + "grad_norm": 0.1553819328546524, + "learning_rate": 6.458068910748852e-06, + "loss": 0.1285, + "step": 32686 + }, + { + "epoch": 0.65376, + "grad_norm": 0.01705465465784073, + "learning_rate": 6.456763198414166e-06, + "loss": 0.0214, + "step": 32688 + }, + { + "epoch": 0.6538, + "grad_norm": 0.07354449480772018, + "learning_rate": 6.455457555156706e-06, + "loss": 0.0162, + "step": 32690 + }, + { + "epoch": 0.65384, + "grad_norm": 0.053570400923490524, + "learning_rate": 6.454151981001924e-06, + "loss": 0.3737, + "step": 32692 + }, + { + "epoch": 0.65388, + "grad_norm": 5.61977481842041, + "learning_rate": 6.452846475975274e-06, + "loss": 0.0996, + "step": 32694 + }, + { + "epoch": 0.65392, + "grad_norm": 0.14265556633472443, + "learning_rate": 6.451541040102207e-06, + "loss": 0.0035, + "step": 32696 + }, + { + "epoch": 0.65396, + "grad_norm": 1.4312294721603394, + "learning_rate": 6.450235673408178e-06, + "loss": 0.0238, + "step": 32698 + }, + { + "epoch": 0.654, + "grad_norm": 0.4056430757045746, + "learning_rate": 6.448930375918632e-06, + "loss": 0.0069, + "step": 32700 + }, + { + "epoch": 0.65404, + "grad_norm": 0.007910965010523796, + "learning_rate": 6.4476251476590126e-06, + "loss": 0.0009, + "step": 32702 + }, + { + "epoch": 0.65408, + "grad_norm": 0.141156867146492, + "learning_rate": 6.446319988654773e-06, + "loss": 0.3029, + "step": 32704 + }, + { + "epoch": 0.65412, + "grad_norm": 2.7240521907806396, + "learning_rate": 6.445014898931356e-06, + "loss": 0.3485, + "step": 32706 + }, + { + "epoch": 0.65416, + "grad_norm": 0.02102692611515522, + "learning_rate": 6.443709878514198e-06, + "loss": 0.0006, + "step": 32708 + }, + { + "epoch": 0.6542, + "grad_norm": 0.07658424973487854, + "learning_rate": 6.442404927428751e-06, + "loss": 0.0015, + "step": 32710 + }, + { + "epoch": 0.65424, + "grad_norm": 0.01395318191498518, + "learning_rate": 6.4411000457004515e-06, + "loss": 0.001, + "step": 32712 + }, + { + "epoch": 0.65428, + "grad_norm": 4.840794086456299, + "learning_rate": 6.43979523335474e-06, + "loss": 0.0435, + "step": 32714 + }, + { + "epoch": 0.65432, + "grad_norm": 0.007745859678834677, + "learning_rate": 6.438490490417053e-06, + "loss": 0.0012, + "step": 32716 + }, + { + "epoch": 0.65436, + "grad_norm": 1.566006064414978, + "learning_rate": 6.437185816912823e-06, + "loss": 0.0225, + "step": 32718 + }, + { + "epoch": 0.6544, + "grad_norm": 0.5857573747634888, + "learning_rate": 6.435881212867494e-06, + "loss": 0.0083, + "step": 32720 + }, + { + "epoch": 0.65444, + "grad_norm": 0.07874596118927002, + "learning_rate": 6.434576678306497e-06, + "loss": 0.0024, + "step": 32722 + }, + { + "epoch": 0.65448, + "grad_norm": 0.4291655719280243, + "learning_rate": 6.433272213255257e-06, + "loss": 0.0081, + "step": 32724 + }, + { + "epoch": 0.65452, + "grad_norm": 5.9162917137146, + "learning_rate": 6.4319678177392175e-06, + "loss": 0.2111, + "step": 32726 + }, + { + "epoch": 0.65456, + "grad_norm": 0.22410213947296143, + "learning_rate": 6.430663491783799e-06, + "loss": 0.0046, + "step": 32728 + }, + { + "epoch": 0.6546, + "grad_norm": 0.38738641142845154, + "learning_rate": 6.4293592354144365e-06, + "loss": 0.0129, + "step": 32730 + }, + { + "epoch": 0.65464, + "grad_norm": 1.0782744884490967, + "learning_rate": 6.428055048656553e-06, + "loss": 0.0161, + "step": 32732 + }, + { + "epoch": 0.65468, + "grad_norm": 0.34177282452583313, + "learning_rate": 6.426750931535571e-06, + "loss": 0.0079, + "step": 32734 + }, + { + "epoch": 0.65472, + "grad_norm": 0.13374514877796173, + "learning_rate": 6.425446884076925e-06, + "loss": 0.002, + "step": 32736 + }, + { + "epoch": 0.65476, + "grad_norm": 0.5757413506507874, + "learning_rate": 6.42414290630603e-06, + "loss": 0.0098, + "step": 32738 + }, + { + "epoch": 0.6548, + "grad_norm": 0.046906281262636185, + "learning_rate": 6.422838998248308e-06, + "loss": 0.0966, + "step": 32740 + }, + { + "epoch": 0.65484, + "grad_norm": 0.04208020865917206, + "learning_rate": 6.4215351599291846e-06, + "loss": 0.0022, + "step": 32742 + }, + { + "epoch": 0.65488, + "grad_norm": 0.2681506276130676, + "learning_rate": 6.4202313913740735e-06, + "loss": 0.0042, + "step": 32744 + }, + { + "epoch": 0.65492, + "grad_norm": 0.04055709391832352, + "learning_rate": 6.418927692608396e-06, + "loss": 0.0017, + "step": 32746 + }, + { + "epoch": 0.65496, + "grad_norm": 0.09819893538951874, + "learning_rate": 6.4176240636575685e-06, + "loss": 0.0142, + "step": 32748 + }, + { + "epoch": 0.655, + "grad_norm": 0.08340027928352356, + "learning_rate": 6.4163205045469975e-06, + "loss": 0.003, + "step": 32750 + }, + { + "epoch": 0.65504, + "grad_norm": 1.158875823020935, + "learning_rate": 6.4150170153021095e-06, + "loss": 0.0151, + "step": 32752 + }, + { + "epoch": 0.65508, + "grad_norm": 0.452300488948822, + "learning_rate": 6.41371359594831e-06, + "loss": 0.0112, + "step": 32754 + }, + { + "epoch": 0.65512, + "grad_norm": 0.07170093804597855, + "learning_rate": 6.412410246511005e-06, + "loss": 0.0009, + "step": 32756 + }, + { + "epoch": 0.65516, + "grad_norm": 0.079286590218544, + "learning_rate": 6.411106967015615e-06, + "loss": 0.0017, + "step": 32758 + }, + { + "epoch": 0.6552, + "grad_norm": 0.20555444061756134, + "learning_rate": 6.409803757487539e-06, + "loss": 0.01, + "step": 32760 + }, + { + "epoch": 0.65524, + "grad_norm": 0.030724655836820602, + "learning_rate": 6.408500617952187e-06, + "loss": 0.0004, + "step": 32762 + }, + { + "epoch": 0.65528, + "grad_norm": 0.308552622795105, + "learning_rate": 6.407197548434968e-06, + "loss": 0.0058, + "step": 32764 + }, + { + "epoch": 0.65532, + "grad_norm": 0.3859323263168335, + "learning_rate": 6.40589454896128e-06, + "loss": 0.0063, + "step": 32766 + }, + { + "epoch": 0.65536, + "grad_norm": 0.12715311348438263, + "learning_rate": 6.404591619556529e-06, + "loss": 0.0048, + "step": 32768 + }, + { + "epoch": 0.6554, + "grad_norm": 0.18611644208431244, + "learning_rate": 6.403288760246112e-06, + "loss": 0.0026, + "step": 32770 + }, + { + "epoch": 0.65544, + "grad_norm": 0.635840892791748, + "learning_rate": 6.401985971055437e-06, + "loss": 0.008, + "step": 32772 + }, + { + "epoch": 0.65548, + "grad_norm": 0.36199653148651123, + "learning_rate": 6.400683252009899e-06, + "loss": 0.0042, + "step": 32774 + }, + { + "epoch": 0.65552, + "grad_norm": 0.2671782076358795, + "learning_rate": 6.399380603134887e-06, + "loss": 0.0037, + "step": 32776 + }, + { + "epoch": 0.65556, + "grad_norm": 0.13258981704711914, + "learning_rate": 6.398078024455809e-06, + "loss": 0.0028, + "step": 32778 + }, + { + "epoch": 0.6556, + "grad_norm": 0.0512712262570858, + "learning_rate": 6.396775515998055e-06, + "loss": 0.0479, + "step": 32780 + }, + { + "epoch": 0.65564, + "grad_norm": 0.6028703451156616, + "learning_rate": 6.395473077787015e-06, + "loss": 0.0114, + "step": 32782 + }, + { + "epoch": 0.65568, + "grad_norm": 0.0006070572999306023, + "learning_rate": 6.394170709848085e-06, + "loss": 0.0007, + "step": 32784 + }, + { + "epoch": 0.65572, + "grad_norm": 0.08403336256742477, + "learning_rate": 6.392868412206649e-06, + "loss": 0.0018, + "step": 32786 + }, + { + "epoch": 0.65576, + "grad_norm": 0.06419689208269119, + "learning_rate": 6.391566184888105e-06, + "loss": 0.0058, + "step": 32788 + }, + { + "epoch": 0.6558, + "grad_norm": 4.993454456329346, + "learning_rate": 6.390264027917836e-06, + "loss": 0.0962, + "step": 32790 + }, + { + "epoch": 0.65584, + "grad_norm": 0.6880948543548584, + "learning_rate": 6.388961941321225e-06, + "loss": 0.1137, + "step": 32792 + }, + { + "epoch": 0.65588, + "grad_norm": 7.407217025756836, + "learning_rate": 6.387659925123663e-06, + "loss": 0.1268, + "step": 32794 + }, + { + "epoch": 0.65592, + "grad_norm": 0.03247537463903427, + "learning_rate": 6.386357979350531e-06, + "loss": 0.0019, + "step": 32796 + }, + { + "epoch": 0.65596, + "grad_norm": 0.10727542638778687, + "learning_rate": 6.385056104027208e-06, + "loss": 0.107, + "step": 32798 + }, + { + "epoch": 0.656, + "grad_norm": 2.6357200145721436, + "learning_rate": 6.383754299179079e-06, + "loss": 0.0344, + "step": 32800 + }, + { + "epoch": 0.65604, + "grad_norm": 0.016515696421265602, + "learning_rate": 6.3824525648315184e-06, + "loss": 0.0012, + "step": 32802 + }, + { + "epoch": 0.65608, + "grad_norm": 0.11206996440887451, + "learning_rate": 6.381150901009912e-06, + "loss": 0.0026, + "step": 32804 + }, + { + "epoch": 0.65612, + "grad_norm": 0.6735295057296753, + "learning_rate": 6.379849307739631e-06, + "loss": 0.0494, + "step": 32806 + }, + { + "epoch": 0.65616, + "grad_norm": 2.050312042236328, + "learning_rate": 6.378547785046047e-06, + "loss": 0.0193, + "step": 32808 + }, + { + "epoch": 0.6562, + "grad_norm": 11.894461631774902, + "learning_rate": 6.377246332954544e-06, + "loss": 0.4108, + "step": 32810 + }, + { + "epoch": 0.65624, + "grad_norm": 1.2639902830123901, + "learning_rate": 6.375944951490488e-06, + "loss": 0.0172, + "step": 32812 + }, + { + "epoch": 0.65628, + "grad_norm": 0.6038187742233276, + "learning_rate": 6.374643640679249e-06, + "loss": 0.0091, + "step": 32814 + }, + { + "epoch": 0.65632, + "grad_norm": 0.00490248017013073, + "learning_rate": 6.373342400546201e-06, + "loss": 0.0035, + "step": 32816 + }, + { + "epoch": 0.65636, + "grad_norm": 0.6161715984344482, + "learning_rate": 6.372041231116705e-06, + "loss": 0.0121, + "step": 32818 + }, + { + "epoch": 0.6564, + "grad_norm": 0.057648442685604095, + "learning_rate": 6.370740132416138e-06, + "loss": 0.0021, + "step": 32820 + }, + { + "epoch": 0.65644, + "grad_norm": 0.1353878378868103, + "learning_rate": 6.369439104469861e-06, + "loss": 0.0015, + "step": 32822 + }, + { + "epoch": 0.65648, + "grad_norm": 0.06765949726104736, + "learning_rate": 6.368138147303233e-06, + "loss": 0.0033, + "step": 32824 + }, + { + "epoch": 0.65652, + "grad_norm": 0.10847935825586319, + "learning_rate": 6.366837260941625e-06, + "loss": 0.0038, + "step": 32826 + }, + { + "epoch": 0.65656, + "grad_norm": 0.23671230673789978, + "learning_rate": 6.365536445410396e-06, + "loss": 0.0058, + "step": 32828 + }, + { + "epoch": 0.6566, + "grad_norm": 0.01848820224404335, + "learning_rate": 6.364235700734903e-06, + "loss": 0.0026, + "step": 32830 + }, + { + "epoch": 0.65664, + "grad_norm": 1.817760944366455, + "learning_rate": 6.362935026940507e-06, + "loss": 0.0271, + "step": 32832 + }, + { + "epoch": 0.65668, + "grad_norm": 0.1497332602739334, + "learning_rate": 6.361634424052565e-06, + "loss": 0.0026, + "step": 32834 + }, + { + "epoch": 0.65672, + "grad_norm": 0.23740048706531525, + "learning_rate": 6.360333892096435e-06, + "loss": 0.0034, + "step": 32836 + }, + { + "epoch": 0.65676, + "grad_norm": 0.39388373494148254, + "learning_rate": 6.35903343109747e-06, + "loss": 0.0053, + "step": 32838 + }, + { + "epoch": 0.6568, + "grad_norm": 0.11209284514188766, + "learning_rate": 6.357733041081018e-06, + "loss": 0.0015, + "step": 32840 + }, + { + "epoch": 0.65684, + "grad_norm": 0.04985019192099571, + "learning_rate": 6.35643272207244e-06, + "loss": 0.0011, + "step": 32842 + }, + { + "epoch": 0.65688, + "grad_norm": 0.0031565381214022636, + "learning_rate": 6.355132474097081e-06, + "loss": 0.0019, + "step": 32844 + }, + { + "epoch": 0.65692, + "grad_norm": 0.05660685524344444, + "learning_rate": 6.353832297180289e-06, + "loss": 0.2676, + "step": 32846 + }, + { + "epoch": 0.65696, + "grad_norm": 0.15137429535388947, + "learning_rate": 6.352532191347416e-06, + "loss": 0.004, + "step": 32848 + }, + { + "epoch": 0.657, + "grad_norm": 1.4465373754501343, + "learning_rate": 6.351232156623803e-06, + "loss": 0.0199, + "step": 32850 + }, + { + "epoch": 0.65704, + "grad_norm": 0.29290011525154114, + "learning_rate": 6.349932193034801e-06, + "loss": 0.0101, + "step": 32852 + }, + { + "epoch": 0.65708, + "grad_norm": 0.04627860337495804, + "learning_rate": 6.34863230060575e-06, + "loss": 0.0042, + "step": 32854 + }, + { + "epoch": 0.65712, + "grad_norm": 0.13726481795310974, + "learning_rate": 6.347332479361987e-06, + "loss": 0.0025, + "step": 32856 + }, + { + "epoch": 0.65716, + "grad_norm": 0.2928997576236725, + "learning_rate": 6.3460327293288634e-06, + "loss": 0.0043, + "step": 32858 + }, + { + "epoch": 0.6572, + "grad_norm": 0.02428307943046093, + "learning_rate": 6.344733050531713e-06, + "loss": 0.1266, + "step": 32860 + }, + { + "epoch": 0.65724, + "grad_norm": 0.7670952081680298, + "learning_rate": 6.343433442995868e-06, + "loss": 0.0093, + "step": 32862 + }, + { + "epoch": 0.65728, + "grad_norm": 0.12020937353372574, + "learning_rate": 6.342133906746676e-06, + "loss": 0.0028, + "step": 32864 + }, + { + "epoch": 0.65732, + "grad_norm": 0.03278132900595665, + "learning_rate": 6.340834441809465e-06, + "loss": 0.0035, + "step": 32866 + }, + { + "epoch": 0.65736, + "grad_norm": 0.08114150166511536, + "learning_rate": 6.3395350482095705e-06, + "loss": 0.0199, + "step": 32868 + }, + { + "epoch": 0.6574, + "grad_norm": 0.06731092929840088, + "learning_rate": 6.338235725972326e-06, + "loss": 0.0039, + "step": 32870 + }, + { + "epoch": 0.65744, + "grad_norm": 0.037790972739458084, + "learning_rate": 6.336936475123057e-06, + "loss": 0.0021, + "step": 32872 + }, + { + "epoch": 0.65748, + "grad_norm": 0.007863866165280342, + "learning_rate": 6.3356372956871015e-06, + "loss": 0.0009, + "step": 32874 + }, + { + "epoch": 0.65752, + "grad_norm": 0.35901322960853577, + "learning_rate": 6.334338187689779e-06, + "loss": 0.0053, + "step": 32876 + }, + { + "epoch": 0.65756, + "grad_norm": 0.12426302582025528, + "learning_rate": 6.333039151156426e-06, + "loss": 0.0055, + "step": 32878 + }, + { + "epoch": 0.6576, + "grad_norm": 0.051954638212919235, + "learning_rate": 6.33174018611236e-06, + "loss": 0.0531, + "step": 32880 + }, + { + "epoch": 0.65764, + "grad_norm": 0.21212628483772278, + "learning_rate": 6.3304412925829084e-06, + "loss": 0.0046, + "step": 32882 + }, + { + "epoch": 0.65768, + "grad_norm": 0.04844426363706589, + "learning_rate": 6.329142470593393e-06, + "loss": 0.0041, + "step": 32884 + }, + { + "epoch": 0.65772, + "grad_norm": 0.07555101811885834, + "learning_rate": 6.327843720169135e-06, + "loss": 0.002, + "step": 32886 + }, + { + "epoch": 0.65776, + "grad_norm": 0.00887142401188612, + "learning_rate": 6.326545041335453e-06, + "loss": 0.0045, + "step": 32888 + }, + { + "epoch": 0.6578, + "grad_norm": 0.2060374766588211, + "learning_rate": 6.325246434117669e-06, + "loss": 0.0058, + "step": 32890 + }, + { + "epoch": 0.65784, + "grad_norm": 0.08834671974182129, + "learning_rate": 6.323947898541093e-06, + "loss": 0.0096, + "step": 32892 + }, + { + "epoch": 0.65788, + "grad_norm": 0.18109402060508728, + "learning_rate": 6.3226494346310495e-06, + "loss": 0.0069, + "step": 32894 + }, + { + "epoch": 0.65792, + "grad_norm": 0.022421469911932945, + "learning_rate": 6.321351042412849e-06, + "loss": 0.0009, + "step": 32896 + }, + { + "epoch": 0.65796, + "grad_norm": 0.08977604657411575, + "learning_rate": 6.320052721911801e-06, + "loss": 0.048, + "step": 32898 + }, + { + "epoch": 0.658, + "grad_norm": 0.038755692541599274, + "learning_rate": 6.318754473153221e-06, + "loss": 0.0011, + "step": 32900 + }, + { + "epoch": 0.65804, + "grad_norm": 0.06414701044559479, + "learning_rate": 6.3174562961624185e-06, + "loss": 0.002, + "step": 32902 + }, + { + "epoch": 0.65808, + "grad_norm": 0.03492606431245804, + "learning_rate": 6.316158190964701e-06, + "loss": 0.0009, + "step": 32904 + }, + { + "epoch": 0.65812, + "grad_norm": 0.53715580701828, + "learning_rate": 6.3148601575853765e-06, + "loss": 0.0156, + "step": 32906 + }, + { + "epoch": 0.65816, + "grad_norm": 0.020749138668179512, + "learning_rate": 6.313562196049748e-06, + "loss": 0.0026, + "step": 32908 + }, + { + "epoch": 0.6582, + "grad_norm": 1.7251853942871094, + "learning_rate": 6.3122643063831245e-06, + "loss": 0.0191, + "step": 32910 + }, + { + "epoch": 0.65824, + "grad_norm": 0.0326460599899292, + "learning_rate": 6.310966488610808e-06, + "loss": 0.0005, + "step": 32912 + }, + { + "epoch": 0.65828, + "grad_norm": 0.12103268504142761, + "learning_rate": 6.309668742758095e-06, + "loss": 0.0059, + "step": 32914 + }, + { + "epoch": 0.65832, + "grad_norm": 1.4952465295791626, + "learning_rate": 6.308371068850294e-06, + "loss": 0.0317, + "step": 32916 + }, + { + "epoch": 0.65836, + "grad_norm": 0.01677023619413376, + "learning_rate": 6.3070734669126986e-06, + "loss": 0.0004, + "step": 32918 + }, + { + "epoch": 0.6584, + "grad_norm": 5.453736782073975, + "learning_rate": 6.305775936970606e-06, + "loss": 0.0794, + "step": 32920 + }, + { + "epoch": 0.65844, + "grad_norm": 0.12866921722888947, + "learning_rate": 6.304478479049317e-06, + "loss": 0.0024, + "step": 32922 + }, + { + "epoch": 0.65848, + "grad_norm": 0.0006165747763589025, + "learning_rate": 6.303181093174115e-06, + "loss": 0.001, + "step": 32924 + }, + { + "epoch": 0.65852, + "grad_norm": 0.03143717348575592, + "learning_rate": 6.301883779370308e-06, + "loss": 0.0527, + "step": 32926 + }, + { + "epoch": 0.65856, + "grad_norm": 0.3895099461078644, + "learning_rate": 6.300586537663179e-06, + "loss": 0.0063, + "step": 32928 + }, + { + "epoch": 0.6586, + "grad_norm": 0.004485605750232935, + "learning_rate": 6.299289368078016e-06, + "loss": 0.004, + "step": 32930 + }, + { + "epoch": 0.65864, + "grad_norm": 0.007521644700318575, + "learning_rate": 6.297992270640116e-06, + "loss": 0.0008, + "step": 32932 + }, + { + "epoch": 0.65868, + "grad_norm": 5.00998067855835, + "learning_rate": 6.2966952453747644e-06, + "loss": 0.0559, + "step": 32934 + }, + { + "epoch": 0.65872, + "grad_norm": 0.04346874728798866, + "learning_rate": 6.295398292307242e-06, + "loss": 0.0016, + "step": 32936 + }, + { + "epoch": 0.65876, + "grad_norm": 0.08109447360038757, + "learning_rate": 6.29410141146284e-06, + "loss": 0.001, + "step": 32938 + }, + { + "epoch": 0.6588, + "grad_norm": 0.03786114603281021, + "learning_rate": 6.292804602866833e-06, + "loss": 0.001, + "step": 32940 + }, + { + "epoch": 0.65884, + "grad_norm": 0.020715942606329918, + "learning_rate": 6.291507866544515e-06, + "loss": 0.0005, + "step": 32942 + }, + { + "epoch": 0.65888, + "grad_norm": 0.6852784156799316, + "learning_rate": 6.290211202521159e-06, + "loss": 0.0092, + "step": 32944 + }, + { + "epoch": 0.65892, + "grad_norm": 0.03294676914811134, + "learning_rate": 6.288914610822043e-06, + "loss": 0.0009, + "step": 32946 + }, + { + "epoch": 0.65896, + "grad_norm": 0.13473834097385406, + "learning_rate": 6.287618091472451e-06, + "loss": 0.0026, + "step": 32948 + }, + { + "epoch": 0.659, + "grad_norm": 0.15472979843616486, + "learning_rate": 6.286321644497655e-06, + "loss": 0.0024, + "step": 32950 + }, + { + "epoch": 0.65904, + "grad_norm": 0.11158885806798935, + "learning_rate": 6.285025269922927e-06, + "loss": 0.0034, + "step": 32952 + }, + { + "epoch": 0.65908, + "grad_norm": 0.47014063596725464, + "learning_rate": 6.283728967773548e-06, + "loss": 0.0048, + "step": 32954 + }, + { + "epoch": 0.65912, + "grad_norm": 0.12209653109312057, + "learning_rate": 6.282432738074782e-06, + "loss": 0.0023, + "step": 32956 + }, + { + "epoch": 0.65916, + "grad_norm": 0.0056997681967914104, + "learning_rate": 6.281136580851907e-06, + "loss": 0.0003, + "step": 32958 + }, + { + "epoch": 0.6592, + "grad_norm": 0.05333265662193298, + "learning_rate": 6.27984049613019e-06, + "loss": 0.0007, + "step": 32960 + }, + { + "epoch": 0.65924, + "grad_norm": 10.69875431060791, + "learning_rate": 6.278544483934892e-06, + "loss": 0.2046, + "step": 32962 + }, + { + "epoch": 0.65928, + "grad_norm": 0.14517396688461304, + "learning_rate": 6.27724854429129e-06, + "loss": 0.0021, + "step": 32964 + }, + { + "epoch": 0.65932, + "grad_norm": 13.047223091125488, + "learning_rate": 6.275952677224644e-06, + "loss": 0.3934, + "step": 32966 + }, + { + "epoch": 0.65936, + "grad_norm": 0.008544554933905602, + "learning_rate": 6.274656882760215e-06, + "loss": 0.0009, + "step": 32968 + }, + { + "epoch": 0.6594, + "grad_norm": 0.03911634534597397, + "learning_rate": 6.273361160923271e-06, + "loss": 0.0527, + "step": 32970 + }, + { + "epoch": 0.65944, + "grad_norm": 5.5382280349731445, + "learning_rate": 6.272065511739067e-06, + "loss": 0.0473, + "step": 32972 + }, + { + "epoch": 0.65948, + "grad_norm": 0.09051526337862015, + "learning_rate": 6.270769935232869e-06, + "loss": 0.0015, + "step": 32974 + }, + { + "epoch": 0.65952, + "grad_norm": 0.20666630566120148, + "learning_rate": 6.269474431429929e-06, + "loss": 0.0024, + "step": 32976 + }, + { + "epoch": 0.65956, + "grad_norm": 0.13723531365394592, + "learning_rate": 6.2681790003555036e-06, + "loss": 0.0021, + "step": 32978 + }, + { + "epoch": 0.6596, + "grad_norm": 0.009264588356018066, + "learning_rate": 6.2668836420348535e-06, + "loss": 0.0007, + "step": 32980 + }, + { + "epoch": 0.65964, + "grad_norm": 0.7582176327705383, + "learning_rate": 6.265588356493224e-06, + "loss": 0.0154, + "step": 32982 + }, + { + "epoch": 0.65968, + "grad_norm": 0.0023514286149293184, + "learning_rate": 6.264293143755875e-06, + "loss": 0.1625, + "step": 32984 + }, + { + "epoch": 0.65972, + "grad_norm": 14.397802352905273, + "learning_rate": 6.262998003848057e-06, + "loss": 0.2197, + "step": 32986 + }, + { + "epoch": 0.65976, + "grad_norm": 0.1201602891087532, + "learning_rate": 6.2617029367950134e-06, + "loss": 0.0017, + "step": 32988 + }, + { + "epoch": 0.6598, + "grad_norm": 1.0667494535446167, + "learning_rate": 6.260407942621998e-06, + "loss": 0.0173, + "step": 32990 + }, + { + "epoch": 0.65984, + "grad_norm": 0.37575092911720276, + "learning_rate": 6.259113021354255e-06, + "loss": 0.0052, + "step": 32992 + }, + { + "epoch": 0.65988, + "grad_norm": 0.1250942051410675, + "learning_rate": 6.257818173017025e-06, + "loss": 0.0036, + "step": 32994 + }, + { + "epoch": 0.65992, + "grad_norm": 0.2299497127532959, + "learning_rate": 6.256523397635561e-06, + "loss": 0.0032, + "step": 32996 + }, + { + "epoch": 0.65996, + "grad_norm": 0.017585840076208115, + "learning_rate": 6.255228695235096e-06, + "loss": 0.0008, + "step": 32998 + }, + { + "epoch": 0.66, + "grad_norm": 0.08259763568639755, + "learning_rate": 6.25393406584088e-06, + "loss": 0.0034, + "step": 33000 + }, + { + "epoch": 0.66004, + "grad_norm": 0.31629839539527893, + "learning_rate": 6.252639509478148e-06, + "loss": 0.0068, + "step": 33002 + }, + { + "epoch": 0.66008, + "grad_norm": 0.13932964205741882, + "learning_rate": 6.251345026172135e-06, + "loss": 0.0016, + "step": 33004 + }, + { + "epoch": 0.66012, + "grad_norm": 0.018446244299411774, + "learning_rate": 6.250050615948085e-06, + "loss": 0.001, + "step": 33006 + }, + { + "epoch": 0.66016, + "grad_norm": 0.025635594502091408, + "learning_rate": 6.2487562788312275e-06, + "loss": 0.065, + "step": 33008 + }, + { + "epoch": 0.6602, + "grad_norm": 0.39457038044929504, + "learning_rate": 6.247462014846793e-06, + "loss": 0.0069, + "step": 33010 + }, + { + "epoch": 0.66024, + "grad_norm": 0.022656744346022606, + "learning_rate": 6.246167824020023e-06, + "loss": 0.0012, + "step": 33012 + }, + { + "epoch": 0.66028, + "grad_norm": 1.3952428102493286, + "learning_rate": 6.24487370637614e-06, + "loss": 0.0261, + "step": 33014 + }, + { + "epoch": 0.66032, + "grad_norm": 0.20217175781726837, + "learning_rate": 6.243579661940381e-06, + "loss": 0.0123, + "step": 33016 + }, + { + "epoch": 0.66036, + "grad_norm": 0.025564029812812805, + "learning_rate": 6.242285690737971e-06, + "loss": 0.0526, + "step": 33018 + }, + { + "epoch": 0.6604, + "grad_norm": 0.1944737434387207, + "learning_rate": 6.240991792794133e-06, + "loss": 0.0051, + "step": 33020 + }, + { + "epoch": 0.66044, + "grad_norm": 0.6617783904075623, + "learning_rate": 6.239697968134099e-06, + "loss": 0.0086, + "step": 33022 + }, + { + "epoch": 0.66048, + "grad_norm": 0.12118110060691833, + "learning_rate": 6.238404216783085e-06, + "loss": 0.0018, + "step": 33024 + }, + { + "epoch": 0.66052, + "grad_norm": 0.01130674034357071, + "learning_rate": 6.237110538766319e-06, + "loss": 0.0065, + "step": 33026 + }, + { + "epoch": 0.66056, + "grad_norm": 0.09960096329450607, + "learning_rate": 6.235816934109023e-06, + "loss": 0.0023, + "step": 33028 + }, + { + "epoch": 0.6606, + "grad_norm": 0.9162008166313171, + "learning_rate": 6.234523402836408e-06, + "loss": 0.0172, + "step": 33030 + }, + { + "epoch": 0.66064, + "grad_norm": 0.114871546626091, + "learning_rate": 6.233229944973702e-06, + "loss": 0.0026, + "step": 33032 + }, + { + "epoch": 0.66068, + "grad_norm": 14.779073715209961, + "learning_rate": 6.231936560546118e-06, + "loss": 0.374, + "step": 33034 + }, + { + "epoch": 0.66072, + "grad_norm": 0.1434658318758011, + "learning_rate": 6.230643249578866e-06, + "loss": 0.0053, + "step": 33036 + }, + { + "epoch": 0.66076, + "grad_norm": 0.02059905230998993, + "learning_rate": 6.2293500120971695e-06, + "loss": 0.0219, + "step": 33038 + }, + { + "epoch": 0.6608, + "grad_norm": 13.034128189086914, + "learning_rate": 6.228056848126236e-06, + "loss": 0.2674, + "step": 33040 + }, + { + "epoch": 0.66084, + "grad_norm": 0.2245660126209259, + "learning_rate": 6.226763757691272e-06, + "loss": 0.0031, + "step": 33042 + }, + { + "epoch": 0.66088, + "grad_norm": 0.04486272856593132, + "learning_rate": 6.225470740817495e-06, + "loss": 0.0012, + "step": 33044 + }, + { + "epoch": 0.66092, + "grad_norm": 0.01830073818564415, + "learning_rate": 6.224177797530104e-06, + "loss": 0.0003, + "step": 33046 + }, + { + "epoch": 0.66096, + "grad_norm": 3.9815659523010254, + "learning_rate": 6.222884927854317e-06, + "loss": 0.069, + "step": 33048 + }, + { + "epoch": 0.661, + "grad_norm": 0.4508057236671448, + "learning_rate": 6.22159213181533e-06, + "loss": 0.0069, + "step": 33050 + }, + { + "epoch": 0.66104, + "grad_norm": 0.006646623834967613, + "learning_rate": 6.220299409438348e-06, + "loss": 0.0189, + "step": 33052 + }, + { + "epoch": 0.66108, + "grad_norm": 0.7981998920440674, + "learning_rate": 6.219006760748578e-06, + "loss": 0.0082, + "step": 33054 + }, + { + "epoch": 0.66112, + "grad_norm": 2.015871286392212, + "learning_rate": 6.2177141857712175e-06, + "loss": 0.0332, + "step": 33056 + }, + { + "epoch": 0.66116, + "grad_norm": 0.24012427031993866, + "learning_rate": 6.2164216845314655e-06, + "loss": 0.0038, + "step": 33058 + }, + { + "epoch": 0.6612, + "grad_norm": 0.009618744254112244, + "learning_rate": 6.2151292570545215e-06, + "loss": 0.0036, + "step": 33060 + }, + { + "epoch": 0.66124, + "grad_norm": 10.291488647460938, + "learning_rate": 6.213836903365578e-06, + "loss": 0.2354, + "step": 33062 + }, + { + "epoch": 0.66128, + "grad_norm": 0.04768673703074455, + "learning_rate": 6.212544623489836e-06, + "loss": 0.0087, + "step": 33064 + }, + { + "epoch": 0.66132, + "grad_norm": 1.1132988929748535, + "learning_rate": 6.2112524174524865e-06, + "loss": 0.0174, + "step": 33066 + }, + { + "epoch": 0.66136, + "grad_norm": 0.09848179668188095, + "learning_rate": 6.20996028527872e-06, + "loss": 0.0024, + "step": 33068 + }, + { + "epoch": 0.6614, + "grad_norm": 11.819485664367676, + "learning_rate": 6.208668226993731e-06, + "loss": 0.1903, + "step": 33070 + }, + { + "epoch": 0.66144, + "grad_norm": 0.015914637595415115, + "learning_rate": 6.2073762426227065e-06, + "loss": 0.0011, + "step": 33072 + }, + { + "epoch": 0.66148, + "grad_norm": 0.0183681882917881, + "learning_rate": 6.206084332190834e-06, + "loss": 0.0875, + "step": 33074 + }, + { + "epoch": 0.66152, + "grad_norm": 0.04581696540117264, + "learning_rate": 6.204792495723302e-06, + "loss": 0.0138, + "step": 33076 + }, + { + "epoch": 0.66156, + "grad_norm": 4.005656719207764, + "learning_rate": 6.20350073324529e-06, + "loss": 0.0352, + "step": 33078 + }, + { + "epoch": 0.6616, + "grad_norm": 1.8710702657699585, + "learning_rate": 6.202209044781991e-06, + "loss": 0.0425, + "step": 33080 + }, + { + "epoch": 0.66164, + "grad_norm": 0.028906872496008873, + "learning_rate": 6.2009174303585805e-06, + "loss": 0.0005, + "step": 33082 + }, + { + "epoch": 0.66168, + "grad_norm": 0.041796427220106125, + "learning_rate": 6.199625890000237e-06, + "loss": 0.116, + "step": 33084 + }, + { + "epoch": 0.66172, + "grad_norm": 0.016744373366236687, + "learning_rate": 6.198334423732148e-06, + "loss": 0.0012, + "step": 33086 + }, + { + "epoch": 0.66176, + "grad_norm": 0.02552068792283535, + "learning_rate": 6.197043031579484e-06, + "loss": 0.0069, + "step": 33088 + }, + { + "epoch": 0.6618, + "grad_norm": 0.0006453068344853818, + "learning_rate": 6.195751713567426e-06, + "loss": 0.0092, + "step": 33090 + }, + { + "epoch": 0.66184, + "grad_norm": 11.069605827331543, + "learning_rate": 6.194460469721144e-06, + "loss": 0.2194, + "step": 33092 + }, + { + "epoch": 0.66188, + "grad_norm": 0.014784876257181168, + "learning_rate": 6.193169300065816e-06, + "loss": 0.0012, + "step": 33094 + }, + { + "epoch": 0.66192, + "grad_norm": 1.0618042945861816, + "learning_rate": 6.191878204626614e-06, + "loss": 0.018, + "step": 33096 + }, + { + "epoch": 0.66196, + "grad_norm": 3.6740970611572266, + "learning_rate": 6.1905871834287065e-06, + "loss": 0.0481, + "step": 33098 + }, + { + "epoch": 0.662, + "grad_norm": 0.027965998277068138, + "learning_rate": 6.18929623649726e-06, + "loss": 0.0003, + "step": 33100 + }, + { + "epoch": 0.66204, + "grad_norm": 0.019201334565877914, + "learning_rate": 6.188005363857448e-06, + "loss": 0.0002, + "step": 33102 + }, + { + "epoch": 0.66208, + "grad_norm": 0.2667655348777771, + "learning_rate": 6.186714565534431e-06, + "loss": 0.007, + "step": 33104 + }, + { + "epoch": 0.66212, + "grad_norm": 0.704729437828064, + "learning_rate": 6.18542384155338e-06, + "loss": 0.0111, + "step": 33106 + }, + { + "epoch": 0.66216, + "grad_norm": 1.0448118448257446, + "learning_rate": 6.184133191939454e-06, + "loss": 0.0118, + "step": 33108 + }, + { + "epoch": 0.6622, + "grad_norm": 0.08497055619955063, + "learning_rate": 6.182842616717817e-06, + "loss": 0.001, + "step": 33110 + }, + { + "epoch": 0.66224, + "grad_norm": 0.0883277952671051, + "learning_rate": 6.181552115913627e-06, + "loss": 0.0069, + "step": 33112 + }, + { + "epoch": 0.66228, + "grad_norm": 8.392080307006836, + "learning_rate": 6.180261689552047e-06, + "loss": 0.1507, + "step": 33114 + }, + { + "epoch": 0.66232, + "grad_norm": 7.713323593139648, + "learning_rate": 6.178971337658226e-06, + "loss": 0.0585, + "step": 33116 + }, + { + "epoch": 0.66236, + "grad_norm": 0.039208751171827316, + "learning_rate": 6.1776810602573325e-06, + "loss": 0.0054, + "step": 33118 + }, + { + "epoch": 0.6624, + "grad_norm": 6.333431720733643, + "learning_rate": 6.176390857374508e-06, + "loss": 0.1061, + "step": 33120 + }, + { + "epoch": 0.66244, + "grad_norm": 0.1254982352256775, + "learning_rate": 6.175100729034917e-06, + "loss": 0.016, + "step": 33122 + }, + { + "epoch": 0.66248, + "grad_norm": 9.492289543151855, + "learning_rate": 6.1738106752637076e-06, + "loss": 0.1383, + "step": 33124 + }, + { + "epoch": 0.66252, + "grad_norm": 0.007525081280618906, + "learning_rate": 6.172520696086026e-06, + "loss": 0.0074, + "step": 33126 + }, + { + "epoch": 0.66256, + "grad_norm": 0.1411527395248413, + "learning_rate": 6.1712307915270255e-06, + "loss": 0.0085, + "step": 33128 + }, + { + "epoch": 0.6626, + "grad_norm": 0.016800081357359886, + "learning_rate": 6.169940961611853e-06, + "loss": 0.0004, + "step": 33130 + }, + { + "epoch": 0.66264, + "grad_norm": 0.13396337628364563, + "learning_rate": 6.168651206365649e-06, + "loss": 0.0047, + "step": 33132 + }, + { + "epoch": 0.66268, + "grad_norm": 0.0042631253600120544, + "learning_rate": 6.1673615258135655e-06, + "loss": 0.0006, + "step": 33134 + }, + { + "epoch": 0.66272, + "grad_norm": 0.3035335838794708, + "learning_rate": 6.166071919980738e-06, + "loss": 0.4146, + "step": 33136 + }, + { + "epoch": 0.66276, + "grad_norm": 3.07562255859375, + "learning_rate": 6.164782388892319e-06, + "loss": 0.0398, + "step": 33138 + }, + { + "epoch": 0.6628, + "grad_norm": 0.38395223021507263, + "learning_rate": 6.1634929325734385e-06, + "loss": 0.0045, + "step": 33140 + }, + { + "epoch": 0.66284, + "grad_norm": 0.029885966330766678, + "learning_rate": 6.162203551049237e-06, + "loss": 0.0031, + "step": 33142 + }, + { + "epoch": 0.66288, + "grad_norm": 0.011470742523670197, + "learning_rate": 6.160914244344858e-06, + "loss": 0.0011, + "step": 33144 + }, + { + "epoch": 0.66292, + "grad_norm": 0.254517525434494, + "learning_rate": 6.159625012485429e-06, + "loss": 0.109, + "step": 33146 + }, + { + "epoch": 0.66296, + "grad_norm": 0.15397219359874725, + "learning_rate": 6.1583358554960845e-06, + "loss": 0.0098, + "step": 33148 + }, + { + "epoch": 0.663, + "grad_norm": 11.2572660446167, + "learning_rate": 6.157046773401964e-06, + "loss": 0.3737, + "step": 33150 + }, + { + "epoch": 0.66304, + "grad_norm": 0.06664712727069855, + "learning_rate": 6.155757766228192e-06, + "loss": 0.0029, + "step": 33152 + }, + { + "epoch": 0.66308, + "grad_norm": 1.9426047801971436, + "learning_rate": 6.154468833999906e-06, + "loss": 0.0984, + "step": 33154 + }, + { + "epoch": 0.66312, + "grad_norm": 0.07675272226333618, + "learning_rate": 6.1531799767422296e-06, + "loss": 0.0021, + "step": 33156 + }, + { + "epoch": 0.66316, + "grad_norm": 0.15560327470302582, + "learning_rate": 6.151891194480286e-06, + "loss": 0.0063, + "step": 33158 + }, + { + "epoch": 0.6632, + "grad_norm": 0.12353220582008362, + "learning_rate": 6.150602487239207e-06, + "loss": 0.006, + "step": 33160 + }, + { + "epoch": 0.66324, + "grad_norm": 0.28217896819114685, + "learning_rate": 6.149313855044116e-06, + "loss": 0.0088, + "step": 33162 + }, + { + "epoch": 0.66328, + "grad_norm": 0.0421840101480484, + "learning_rate": 6.14802529792013e-06, + "loss": 0.0013, + "step": 33164 + }, + { + "epoch": 0.66332, + "grad_norm": 0.02605116181075573, + "learning_rate": 6.146736815892378e-06, + "loss": 0.0006, + "step": 33166 + }, + { + "epoch": 0.66336, + "grad_norm": 0.010916471481323242, + "learning_rate": 6.14544840898597e-06, + "loss": 0.2194, + "step": 33168 + }, + { + "epoch": 0.6634, + "grad_norm": 0.17741845548152924, + "learning_rate": 6.144160077226035e-06, + "loss": 0.0078, + "step": 33170 + }, + { + "epoch": 0.66344, + "grad_norm": 0.2287956178188324, + "learning_rate": 6.1428718206376845e-06, + "loss": 0.004, + "step": 33172 + }, + { + "epoch": 0.66348, + "grad_norm": 0.12854449450969696, + "learning_rate": 6.1415836392460294e-06, + "loss": 0.3383, + "step": 33174 + }, + { + "epoch": 0.66352, + "grad_norm": 0.0014011499006301165, + "learning_rate": 6.1402955330761916e-06, + "loss": 0.0004, + "step": 33176 + }, + { + "epoch": 0.66356, + "grad_norm": 2.4630703926086426, + "learning_rate": 6.139007502153277e-06, + "loss": 0.0218, + "step": 33178 + }, + { + "epoch": 0.6636, + "grad_norm": 0.1047174260020256, + "learning_rate": 6.137719546502401e-06, + "loss": 0.4511, + "step": 33180 + }, + { + "epoch": 0.66364, + "grad_norm": 0.06815418601036072, + "learning_rate": 6.136431666148673e-06, + "loss": 0.0016, + "step": 33182 + }, + { + "epoch": 0.66368, + "grad_norm": 0.1774531602859497, + "learning_rate": 6.135143861117192e-06, + "loss": 0.0026, + "step": 33184 + }, + { + "epoch": 0.66372, + "grad_norm": 1.5646941661834717, + "learning_rate": 6.133856131433076e-06, + "loss": 0.0117, + "step": 33186 + }, + { + "epoch": 0.66376, + "grad_norm": 0.034059640020132065, + "learning_rate": 6.132568477121426e-06, + "loss": 0.0135, + "step": 33188 + }, + { + "epoch": 0.6638, + "grad_norm": 0.0485096238553524, + "learning_rate": 6.131280898207339e-06, + "loss": 0.0343, + "step": 33190 + }, + { + "epoch": 0.66384, + "grad_norm": 1.2392582893371582, + "learning_rate": 6.129993394715928e-06, + "loss": 0.0209, + "step": 33192 + }, + { + "epoch": 0.66388, + "grad_norm": 0.05925335735082626, + "learning_rate": 6.128705966672285e-06, + "loss": 0.0009, + "step": 33194 + }, + { + "epoch": 0.66392, + "grad_norm": 0.12456371635198593, + "learning_rate": 6.127418614101514e-06, + "loss": 0.002, + "step": 33196 + }, + { + "epoch": 0.66396, + "grad_norm": 0.04570312425494194, + "learning_rate": 6.126131337028714e-06, + "loss": 0.0007, + "step": 33198 + }, + { + "epoch": 0.664, + "grad_norm": 1.2808114290237427, + "learning_rate": 6.124844135478971e-06, + "loss": 0.011, + "step": 33200 + }, + { + "epoch": 0.66404, + "grad_norm": 0.3527333736419678, + "learning_rate": 6.123557009477391e-06, + "loss": 0.0043, + "step": 33202 + }, + { + "epoch": 0.66408, + "grad_norm": 0.014178172685205936, + "learning_rate": 6.122269959049063e-06, + "loss": 0.0003, + "step": 33204 + }, + { + "epoch": 0.66412, + "grad_norm": 0.0021437129471451044, + "learning_rate": 6.1209829842190745e-06, + "loss": 0.0006, + "step": 33206 + }, + { + "epoch": 0.66416, + "grad_norm": 0.04731046408414841, + "learning_rate": 6.119696085012525e-06, + "loss": 0.0021, + "step": 33208 + }, + { + "epoch": 0.6642, + "grad_norm": 0.028633620589971542, + "learning_rate": 6.118409261454494e-06, + "loss": 0.0013, + "step": 33210 + }, + { + "epoch": 0.66424, + "grad_norm": 0.1583404392004013, + "learning_rate": 6.117122513570077e-06, + "loss": 0.003, + "step": 33212 + }, + { + "epoch": 0.66428, + "grad_norm": 2.458254337310791, + "learning_rate": 6.115835841384355e-06, + "loss": 0.0303, + "step": 33214 + }, + { + "epoch": 0.66432, + "grad_norm": 0.47066888213157654, + "learning_rate": 6.114549244922408e-06, + "loss": 0.0063, + "step": 33216 + }, + { + "epoch": 0.66436, + "grad_norm": 16.378429412841797, + "learning_rate": 6.1132627242093275e-06, + "loss": 0.4108, + "step": 33218 + }, + { + "epoch": 0.6644, + "grad_norm": 0.8548154830932617, + "learning_rate": 6.1119762792701935e-06, + "loss": 0.0114, + "step": 33220 + }, + { + "epoch": 0.66444, + "grad_norm": 0.5370762944221497, + "learning_rate": 6.110689910130078e-06, + "loss": 0.007, + "step": 33222 + }, + { + "epoch": 0.66448, + "grad_norm": 0.41936632990837097, + "learning_rate": 6.109403616814069e-06, + "loss": 0.0094, + "step": 33224 + }, + { + "epoch": 0.66452, + "grad_norm": 0.08572202175855637, + "learning_rate": 6.108117399347238e-06, + "loss": 0.0028, + "step": 33226 + }, + { + "epoch": 0.66456, + "grad_norm": 0.03578587621450424, + "learning_rate": 6.106831257754661e-06, + "loss": 0.0172, + "step": 33228 + }, + { + "epoch": 0.6646, + "grad_norm": 5.48469352722168, + "learning_rate": 6.1055451920614165e-06, + "loss": 0.1273, + "step": 33230 + }, + { + "epoch": 0.66464, + "grad_norm": 0.37923818826675415, + "learning_rate": 6.1042592022925704e-06, + "loss": 0.0061, + "step": 33232 + }, + { + "epoch": 0.66468, + "grad_norm": 0.3686494827270508, + "learning_rate": 6.1029732884731995e-06, + "loss": 0.0049, + "step": 33234 + }, + { + "epoch": 0.66472, + "grad_norm": 0.1414594203233719, + "learning_rate": 6.101687450628371e-06, + "loss": 0.0017, + "step": 33236 + }, + { + "epoch": 0.66476, + "grad_norm": 0.04995923116803169, + "learning_rate": 6.100401688783147e-06, + "loss": 0.0878, + "step": 33238 + }, + { + "epoch": 0.6648, + "grad_norm": 0.13950374722480774, + "learning_rate": 6.099116002962604e-06, + "loss": 0.0036, + "step": 33240 + }, + { + "epoch": 0.66484, + "grad_norm": 0.09055082499980927, + "learning_rate": 6.0978303931917995e-06, + "loss": 0.0011, + "step": 33242 + }, + { + "epoch": 0.66488, + "grad_norm": 0.35501179099082947, + "learning_rate": 6.096544859495806e-06, + "loss": 0.0171, + "step": 33244 + }, + { + "epoch": 0.66492, + "grad_norm": 0.014337645843625069, + "learning_rate": 6.095259401899678e-06, + "loss": 0.0023, + "step": 33246 + }, + { + "epoch": 0.66496, + "grad_norm": 0.19615207612514496, + "learning_rate": 6.093974020428477e-06, + "loss": 0.003, + "step": 33248 + }, + { + "epoch": 0.665, + "grad_norm": 0.602226197719574, + "learning_rate": 6.092688715107265e-06, + "loss": 0.0103, + "step": 33250 + }, + { + "epoch": 0.66504, + "grad_norm": 1.385561227798462, + "learning_rate": 6.091403485961098e-06, + "loss": 0.0136, + "step": 33252 + }, + { + "epoch": 0.66508, + "grad_norm": 7.130559921264648, + "learning_rate": 6.09011833301503e-06, + "loss": 0.1064, + "step": 33254 + }, + { + "epoch": 0.66512, + "grad_norm": 0.08242812007665634, + "learning_rate": 6.08883325629412e-06, + "loss": 0.0081, + "step": 33256 + }, + { + "epoch": 0.66516, + "grad_norm": 0.05500863119959831, + "learning_rate": 6.087548255823414e-06, + "loss": 0.0019, + "step": 33258 + }, + { + "epoch": 0.6652, + "grad_norm": 6.807837009429932, + "learning_rate": 6.086263331627976e-06, + "loss": 0.09, + "step": 33260 + }, + { + "epoch": 0.66524, + "grad_norm": 0.20004494488239288, + "learning_rate": 6.084978483732849e-06, + "loss": 0.0088, + "step": 33262 + }, + { + "epoch": 0.66528, + "grad_norm": 0.23349931836128235, + "learning_rate": 6.083693712163079e-06, + "loss": 0.0029, + "step": 33264 + }, + { + "epoch": 0.66532, + "grad_norm": 0.19229786098003387, + "learning_rate": 6.082409016943719e-06, + "loss": 0.0287, + "step": 33266 + }, + { + "epoch": 0.66536, + "grad_norm": 0.008571092039346695, + "learning_rate": 6.081124398099812e-06, + "loss": 0.004, + "step": 33268 + }, + { + "epoch": 0.6654, + "grad_norm": 0.7435047030448914, + "learning_rate": 6.079839855656397e-06, + "loss": 0.0112, + "step": 33270 + }, + { + "epoch": 0.66544, + "grad_norm": 0.17726710438728333, + "learning_rate": 6.078555389638528e-06, + "loss": 0.0026, + "step": 33272 + }, + { + "epoch": 0.66548, + "grad_norm": 0.057027481496334076, + "learning_rate": 6.077271000071236e-06, + "loss": 0.0019, + "step": 33274 + }, + { + "epoch": 0.66552, + "grad_norm": 0.3072870075702667, + "learning_rate": 6.075986686979569e-06, + "loss": 0.0029, + "step": 33276 + }, + { + "epoch": 0.66556, + "grad_norm": 0.0187519583851099, + "learning_rate": 6.0747024503885635e-06, + "loss": 0.0015, + "step": 33278 + }, + { + "epoch": 0.6656, + "grad_norm": 0.130766361951828, + "learning_rate": 6.073418290323251e-06, + "loss": 0.0043, + "step": 33280 + }, + { + "epoch": 0.66564, + "grad_norm": 0.041488442569971085, + "learning_rate": 6.072134206808673e-06, + "loss": 0.0005, + "step": 33282 + }, + { + "epoch": 0.66568, + "grad_norm": 0.06218927726149559, + "learning_rate": 6.0708501998698555e-06, + "loss": 0.0011, + "step": 33284 + }, + { + "epoch": 0.66572, + "grad_norm": 0.3351127803325653, + "learning_rate": 6.069566269531842e-06, + "loss": 0.0042, + "step": 33286 + }, + { + "epoch": 0.66576, + "grad_norm": 0.310712605714798, + "learning_rate": 6.068282415819657e-06, + "loss": 0.0054, + "step": 33288 + }, + { + "epoch": 0.6658, + "grad_norm": 0.03489372134208679, + "learning_rate": 6.066998638758326e-06, + "loss": 0.0048, + "step": 33290 + }, + { + "epoch": 0.66584, + "grad_norm": 10.089116096496582, + "learning_rate": 6.065714938372887e-06, + "loss": 0.2052, + "step": 33292 + }, + { + "epoch": 0.66588, + "grad_norm": 7.739749431610107, + "learning_rate": 6.06443131468836e-06, + "loss": 0.0723, + "step": 33294 + }, + { + "epoch": 0.66592, + "grad_norm": 0.0167820006608963, + "learning_rate": 6.063147767729764e-06, + "loss": 0.0215, + "step": 33296 + }, + { + "epoch": 0.66596, + "grad_norm": 3.6306607723236084, + "learning_rate": 6.0618642975221364e-06, + "loss": 0.0379, + "step": 33298 + }, + { + "epoch": 0.666, + "grad_norm": 0.12866607308387756, + "learning_rate": 6.06058090409049e-06, + "loss": 0.0028, + "step": 33300 + }, + { + "epoch": 0.66604, + "grad_norm": 2.2557525634765625, + "learning_rate": 6.059297587459847e-06, + "loss": 0.036, + "step": 33302 + }, + { + "epoch": 0.66608, + "grad_norm": 0.036615367978811264, + "learning_rate": 6.0580143476552265e-06, + "loss": 0.0031, + "step": 33304 + }, + { + "epoch": 0.66612, + "grad_norm": 0.32519251108169556, + "learning_rate": 6.056731184701643e-06, + "loss": 0.0035, + "step": 33306 + }, + { + "epoch": 0.66616, + "grad_norm": 0.052304599434137344, + "learning_rate": 6.055448098624118e-06, + "loss": 0.0017, + "step": 33308 + }, + { + "epoch": 0.6662, + "grad_norm": 0.00022848101798444986, + "learning_rate": 6.054165089447663e-06, + "loss": 0.5082, + "step": 33310 + }, + { + "epoch": 0.66624, + "grad_norm": 0.5102116465568542, + "learning_rate": 6.052882157197289e-06, + "loss": 0.378, + "step": 33312 + }, + { + "epoch": 0.66628, + "grad_norm": 0.8378762006759644, + "learning_rate": 6.051599301898012e-06, + "loss": 0.0515, + "step": 33314 + }, + { + "epoch": 0.66632, + "grad_norm": 5.744176864624023, + "learning_rate": 6.0503165235748375e-06, + "loss": 0.1057, + "step": 33316 + }, + { + "epoch": 0.66636, + "grad_norm": 0.23353669047355652, + "learning_rate": 6.049033822252778e-06, + "loss": 0.0042, + "step": 33318 + }, + { + "epoch": 0.6664, + "grad_norm": 0.07787993550300598, + "learning_rate": 6.047751197956838e-06, + "loss": 0.0115, + "step": 33320 + }, + { + "epoch": 0.66644, + "grad_norm": 0.06999791413545609, + "learning_rate": 6.046468650712019e-06, + "loss": 0.0027, + "step": 33322 + }, + { + "epoch": 0.66648, + "grad_norm": 4.368006229400635, + "learning_rate": 6.045186180543334e-06, + "loss": 0.0587, + "step": 33324 + }, + { + "epoch": 0.66652, + "grad_norm": 5.551947593688965, + "learning_rate": 6.04390378747578e-06, + "loss": 0.0863, + "step": 33326 + }, + { + "epoch": 0.66656, + "grad_norm": 0.02227894589304924, + "learning_rate": 6.042621471534353e-06, + "loss": 0.0066, + "step": 33328 + }, + { + "epoch": 0.6666, + "grad_norm": 0.05079172924160957, + "learning_rate": 6.0413392327440635e-06, + "loss": 0.0038, + "step": 33330 + }, + { + "epoch": 0.66664, + "grad_norm": 0.08534837514162064, + "learning_rate": 6.040057071129901e-06, + "loss": 0.0009, + "step": 33332 + }, + { + "epoch": 0.66668, + "grad_norm": 0.05826630815863609, + "learning_rate": 6.038774986716868e-06, + "loss": 0.0079, + "step": 33334 + }, + { + "epoch": 0.66672, + "grad_norm": 8.175589561462402, + "learning_rate": 6.037492979529955e-06, + "loss": 0.109, + "step": 33336 + }, + { + "epoch": 0.66676, + "grad_norm": 0.00893015693873167, + "learning_rate": 6.036211049594152e-06, + "loss": 0.0035, + "step": 33338 + }, + { + "epoch": 0.6668, + "grad_norm": 0.07733816653490067, + "learning_rate": 6.0349291969344595e-06, + "loss": 0.0013, + "step": 33340 + }, + { + "epoch": 0.66684, + "grad_norm": 0.778854250907898, + "learning_rate": 6.033647421575863e-06, + "loss": 0.0085, + "step": 33342 + }, + { + "epoch": 0.66688, + "grad_norm": 0.060479044914245605, + "learning_rate": 6.03236572354335e-06, + "loss": 0.0025, + "step": 33344 + }, + { + "epoch": 0.66692, + "grad_norm": 0.1485956311225891, + "learning_rate": 6.031084102861913e-06, + "loss": 0.003, + "step": 33346 + }, + { + "epoch": 0.66696, + "grad_norm": 0.16826723515987396, + "learning_rate": 6.029802559556531e-06, + "loss": 0.0078, + "step": 33348 + }, + { + "epoch": 0.667, + "grad_norm": 0.006537174805998802, + "learning_rate": 6.028521093652195e-06, + "loss": 0.5482, + "step": 33350 + }, + { + "epoch": 0.66704, + "grad_norm": 0.4680923819541931, + "learning_rate": 6.027239705173884e-06, + "loss": 0.1682, + "step": 33352 + }, + { + "epoch": 0.66708, + "grad_norm": 0.15323206782341003, + "learning_rate": 6.025958394146581e-06, + "loss": 0.0025, + "step": 33354 + }, + { + "epoch": 0.66712, + "grad_norm": 0.18459594249725342, + "learning_rate": 6.0246771605952645e-06, + "loss": 0.0091, + "step": 33356 + }, + { + "epoch": 0.66716, + "grad_norm": 0.004110946785658598, + "learning_rate": 6.023396004544915e-06, + "loss": 0.0007, + "step": 33358 + }, + { + "epoch": 0.6672, + "grad_norm": 4.199509143829346, + "learning_rate": 6.022114926020504e-06, + "loss": 0.0383, + "step": 33360 + }, + { + "epoch": 0.66724, + "grad_norm": 1.474185585975647, + "learning_rate": 6.020833925047014e-06, + "loss": 0.0191, + "step": 33362 + }, + { + "epoch": 0.66728, + "grad_norm": 3.9030632972717285, + "learning_rate": 6.019553001649412e-06, + "loss": 0.049, + "step": 33364 + }, + { + "epoch": 0.66732, + "grad_norm": 0.4207804203033447, + "learning_rate": 6.0182721558526745e-06, + "loss": 0.004, + "step": 33366 + }, + { + "epoch": 0.66736, + "grad_norm": 1.735830545425415, + "learning_rate": 6.016991387681775e-06, + "loss": 0.0227, + "step": 33368 + }, + { + "epoch": 0.6674, + "grad_norm": 0.039759691804647446, + "learning_rate": 6.015710697161674e-06, + "loss": 0.0035, + "step": 33370 + }, + { + "epoch": 0.66744, + "grad_norm": 1.6696492433547974, + "learning_rate": 6.0144300843173475e-06, + "loss": 0.0198, + "step": 33372 + }, + { + "epoch": 0.66748, + "grad_norm": 2.189854383468628, + "learning_rate": 6.013149549173758e-06, + "loss": 0.0294, + "step": 33374 + }, + { + "epoch": 0.66752, + "grad_norm": 0.010115008801221848, + "learning_rate": 6.011869091755867e-06, + "loss": 0.0058, + "step": 33376 + }, + { + "epoch": 0.66756, + "grad_norm": 0.02130456455051899, + "learning_rate": 6.010588712088646e-06, + "loss": 0.024, + "step": 33378 + }, + { + "epoch": 0.6676, + "grad_norm": 0.1688174605369568, + "learning_rate": 6.009308410197048e-06, + "loss": 0.0021, + "step": 33380 + }, + { + "epoch": 0.66764, + "grad_norm": 1.1951736211776733, + "learning_rate": 6.0080281861060395e-06, + "loss": 0.0149, + "step": 33382 + }, + { + "epoch": 0.66768, + "grad_norm": 0.02096068114042282, + "learning_rate": 6.006748039840578e-06, + "loss": 0.0006, + "step": 33384 + }, + { + "epoch": 0.66772, + "grad_norm": 0.1360967606306076, + "learning_rate": 6.005467971425616e-06, + "loss": 0.002, + "step": 33386 + }, + { + "epoch": 0.66776, + "grad_norm": 0.06588536500930786, + "learning_rate": 6.004187980886116e-06, + "loss": 0.0011, + "step": 33388 + }, + { + "epoch": 0.6678, + "grad_norm": 0.22159886360168457, + "learning_rate": 6.002908068247024e-06, + "loss": 0.0033, + "step": 33390 + }, + { + "epoch": 0.66784, + "grad_norm": 0.0503297857940197, + "learning_rate": 6.001628233533301e-06, + "loss": 0.001, + "step": 33392 + }, + { + "epoch": 0.66788, + "grad_norm": 0.31711718440055847, + "learning_rate": 6.000348476769893e-06, + "loss": 0.0036, + "step": 33394 + }, + { + "epoch": 0.66792, + "grad_norm": 0.42404037714004517, + "learning_rate": 5.9990687979817485e-06, + "loss": 0.0108, + "step": 33396 + }, + { + "epoch": 0.66796, + "grad_norm": 0.06720263510942459, + "learning_rate": 5.99778919719382e-06, + "loss": 0.001, + "step": 33398 + }, + { + "epoch": 0.668, + "grad_norm": 7.900259494781494, + "learning_rate": 5.996509674431053e-06, + "loss": 0.1519, + "step": 33400 + }, + { + "epoch": 0.66804, + "grad_norm": 3.833287477493286, + "learning_rate": 5.995230229718387e-06, + "loss": 0.0583, + "step": 33402 + }, + { + "epoch": 0.66808, + "grad_norm": 0.021361274644732475, + "learning_rate": 5.993950863080773e-06, + "loss": 0.0015, + "step": 33404 + }, + { + "epoch": 0.66812, + "grad_norm": 0.0947735607624054, + "learning_rate": 5.992671574543145e-06, + "loss": 0.0037, + "step": 33406 + }, + { + "epoch": 0.66816, + "grad_norm": 0.009977089241147041, + "learning_rate": 5.991392364130453e-06, + "loss": 0.0014, + "step": 33408 + }, + { + "epoch": 0.6682, + "grad_norm": 2.0275397300720215, + "learning_rate": 5.990113231867629e-06, + "loss": 0.0301, + "step": 33410 + }, + { + "epoch": 0.66824, + "grad_norm": 0.16463474929332733, + "learning_rate": 5.988834177779607e-06, + "loss": 0.0022, + "step": 33412 + }, + { + "epoch": 0.66828, + "grad_norm": 0.08767302334308624, + "learning_rate": 5.987555201891335e-06, + "loss": 0.0018, + "step": 33414 + }, + { + "epoch": 0.66832, + "grad_norm": 0.6554246544837952, + "learning_rate": 5.9862763042277386e-06, + "loss": 0.01, + "step": 33416 + }, + { + "epoch": 0.66836, + "grad_norm": 0.33018553256988525, + "learning_rate": 5.98499748481375e-06, + "loss": 0.0046, + "step": 33418 + }, + { + "epoch": 0.6684, + "grad_norm": 0.13232503831386566, + "learning_rate": 5.983718743674302e-06, + "loss": 0.0021, + "step": 33420 + }, + { + "epoch": 0.66844, + "grad_norm": 0.010576226748526096, + "learning_rate": 5.982440080834324e-06, + "loss": 0.0058, + "step": 33422 + }, + { + "epoch": 0.66848, + "grad_norm": 0.02264762669801712, + "learning_rate": 5.981161496318749e-06, + "loss": 0.0026, + "step": 33424 + }, + { + "epoch": 0.66852, + "grad_norm": 0.0487513542175293, + "learning_rate": 5.9798829901524994e-06, + "loss": 0.0275, + "step": 33426 + }, + { + "epoch": 0.66856, + "grad_norm": 0.07582306861877441, + "learning_rate": 5.978604562360495e-06, + "loss": 0.0023, + "step": 33428 + }, + { + "epoch": 0.6686, + "grad_norm": 0.0501171238720417, + "learning_rate": 5.977326212967671e-06, + "loss": 0.0097, + "step": 33430 + }, + { + "epoch": 0.66864, + "grad_norm": 0.42548367381095886, + "learning_rate": 5.976047941998942e-06, + "loss": 0.0053, + "step": 33432 + }, + { + "epoch": 0.66868, + "grad_norm": 11.83771800994873, + "learning_rate": 5.974769749479227e-06, + "loss": 0.2045, + "step": 33434 + }, + { + "epoch": 0.66872, + "grad_norm": 0.027382897213101387, + "learning_rate": 5.97349163543345e-06, + "loss": 0.0009, + "step": 33436 + }, + { + "epoch": 0.66876, + "grad_norm": 2.5073301792144775, + "learning_rate": 5.972213599886527e-06, + "loss": 0.0239, + "step": 33438 + }, + { + "epoch": 0.6688, + "grad_norm": 0.19073130190372467, + "learning_rate": 5.970935642863375e-06, + "loss": 0.0028, + "step": 33440 + }, + { + "epoch": 0.66884, + "grad_norm": 0.11536348611116409, + "learning_rate": 5.969657764388906e-06, + "loss": 0.0025, + "step": 33442 + }, + { + "epoch": 0.66888, + "grad_norm": 0.11292218416929245, + "learning_rate": 5.968379964488032e-06, + "loss": 0.0021, + "step": 33444 + }, + { + "epoch": 0.66892, + "grad_norm": 0.5048941969871521, + "learning_rate": 5.967102243185668e-06, + "loss": 0.007, + "step": 33446 + }, + { + "epoch": 0.66896, + "grad_norm": 0.0037427754141390324, + "learning_rate": 5.965824600506722e-06, + "loss": 0.0024, + "step": 33448 + }, + { + "epoch": 0.669, + "grad_norm": 0.05061539635062218, + "learning_rate": 5.9645470364761e-06, + "loss": 0.0015, + "step": 33450 + }, + { + "epoch": 0.66904, + "grad_norm": 0.019562775269150734, + "learning_rate": 5.963269551118714e-06, + "loss": 0.0009, + "step": 33452 + }, + { + "epoch": 0.66908, + "grad_norm": 0.03714483603835106, + "learning_rate": 5.961992144459464e-06, + "loss": 0.0585, + "step": 33454 + }, + { + "epoch": 0.66912, + "grad_norm": 8.614507675170898, + "learning_rate": 5.960714816523259e-06, + "loss": 0.1267, + "step": 33456 + }, + { + "epoch": 0.66916, + "grad_norm": 20.781251907348633, + "learning_rate": 5.959437567334998e-06, + "loss": 0.9384, + "step": 33458 + }, + { + "epoch": 0.6692, + "grad_norm": 6.636744499206543, + "learning_rate": 5.958160396919577e-06, + "loss": 0.0876, + "step": 33460 + }, + { + "epoch": 0.66924, + "grad_norm": 0.04856095835566521, + "learning_rate": 5.956883305301905e-06, + "loss": 0.0097, + "step": 33462 + }, + { + "epoch": 0.66928, + "grad_norm": 0.18718057870864868, + "learning_rate": 5.955606292506874e-06, + "loss": 0.0025, + "step": 33464 + }, + { + "epoch": 0.66932, + "grad_norm": 0.27670586109161377, + "learning_rate": 5.954329358559376e-06, + "loss": 0.0062, + "step": 33466 + }, + { + "epoch": 0.66936, + "grad_norm": 0.0028297589160501957, + "learning_rate": 5.953052503484316e-06, + "loss": 0.0005, + "step": 33468 + }, + { + "epoch": 0.6694, + "grad_norm": 0.08757247775793076, + "learning_rate": 5.951775727306577e-06, + "loss": 0.0016, + "step": 33470 + }, + { + "epoch": 0.66944, + "grad_norm": 0.028150303289294243, + "learning_rate": 5.950499030051057e-06, + "loss": 0.0003, + "step": 33472 + }, + { + "epoch": 0.66948, + "grad_norm": 0.029008975252509117, + "learning_rate": 5.9492224117426436e-06, + "loss": 0.0096, + "step": 33474 + }, + { + "epoch": 0.66952, + "grad_norm": 0.021586840972304344, + "learning_rate": 5.94794587240622e-06, + "loss": 0.3187, + "step": 33476 + }, + { + "epoch": 0.66956, + "grad_norm": 0.3263280987739563, + "learning_rate": 5.9466694120666816e-06, + "loss": 0.0051, + "step": 33478 + }, + { + "epoch": 0.6696, + "grad_norm": 0.11655394732952118, + "learning_rate": 5.94539303074891e-06, + "loss": 0.002, + "step": 33480 + }, + { + "epoch": 0.66964, + "grad_norm": 1.4854536056518555, + "learning_rate": 5.944116728477784e-06, + "loss": 0.015, + "step": 33482 + }, + { + "epoch": 0.66968, + "grad_norm": 0.2302982360124588, + "learning_rate": 5.942840505278197e-06, + "loss": 0.004, + "step": 33484 + }, + { + "epoch": 0.66972, + "grad_norm": 0.016018209978938103, + "learning_rate": 5.94156436117502e-06, + "loss": 0.0018, + "step": 33486 + }, + { + "epoch": 0.66976, + "grad_norm": 0.25380614399909973, + "learning_rate": 5.940288296193137e-06, + "loss": 0.0173, + "step": 33488 + }, + { + "epoch": 0.6698, + "grad_norm": 3.855494260787964, + "learning_rate": 5.939012310357422e-06, + "loss": 0.0528, + "step": 33490 + }, + { + "epoch": 0.66984, + "grad_norm": 0.17339682579040527, + "learning_rate": 5.937736403692751e-06, + "loss": 0.0037, + "step": 33492 + }, + { + "epoch": 0.66988, + "grad_norm": 0.1563248336315155, + "learning_rate": 5.936460576224004e-06, + "loss": 0.1173, + "step": 33494 + }, + { + "epoch": 0.66992, + "grad_norm": 0.29934704303741455, + "learning_rate": 5.935184827976046e-06, + "loss": 0.0107, + "step": 33496 + }, + { + "epoch": 0.66996, + "grad_norm": 1.513911485671997, + "learning_rate": 5.9339091589737555e-06, + "loss": 0.0168, + "step": 33498 + }, + { + "epoch": 0.67, + "grad_norm": 0.40770795941352844, + "learning_rate": 5.932633569242e-06, + "loss": 0.0061, + "step": 33500 + }, + { + "epoch": 0.67004, + "grad_norm": 0.028204575181007385, + "learning_rate": 5.931358058805643e-06, + "loss": 0.0026, + "step": 33502 + }, + { + "epoch": 0.67008, + "grad_norm": 0.015017407014966011, + "learning_rate": 5.930082627689559e-06, + "loss": 0.0004, + "step": 33504 + }, + { + "epoch": 0.67012, + "grad_norm": 0.11059493571519852, + "learning_rate": 5.92880727591861e-06, + "loss": 0.0054, + "step": 33506 + }, + { + "epoch": 0.67016, + "grad_norm": 2.9434874057769775, + "learning_rate": 5.927532003517658e-06, + "loss": 0.0431, + "step": 33508 + }, + { + "epoch": 0.6702, + "grad_norm": 0.5507172346115112, + "learning_rate": 5.926256810511566e-06, + "loss": 0.0091, + "step": 33510 + }, + { + "epoch": 0.67024, + "grad_norm": 0.014565677382051945, + "learning_rate": 5.924981696925192e-06, + "loss": 0.0041, + "step": 33512 + }, + { + "epoch": 0.67028, + "grad_norm": 0.03450004756450653, + "learning_rate": 5.923706662783402e-06, + "loss": 0.3549, + "step": 33514 + }, + { + "epoch": 0.67032, + "grad_norm": 0.5269293785095215, + "learning_rate": 5.922431708111049e-06, + "loss": 0.0149, + "step": 33516 + }, + { + "epoch": 0.67036, + "grad_norm": 2.327636241912842, + "learning_rate": 5.921156832932985e-06, + "loss": 0.0345, + "step": 33518 + }, + { + "epoch": 0.6704, + "grad_norm": 0.2880159914493561, + "learning_rate": 5.9198820372740726e-06, + "loss": 0.3576, + "step": 33520 + }, + { + "epoch": 0.67044, + "grad_norm": 0.03166310861706734, + "learning_rate": 5.91860732115916e-06, + "loss": 0.0013, + "step": 33522 + }, + { + "epoch": 0.67048, + "grad_norm": 0.1716882884502411, + "learning_rate": 5.917332684613097e-06, + "loss": 0.0036, + "step": 33524 + }, + { + "epoch": 0.67052, + "grad_norm": 0.4562424421310425, + "learning_rate": 5.9160581276607385e-06, + "loss": 0.0063, + "step": 33526 + }, + { + "epoch": 0.67056, + "grad_norm": 0.08670276403427124, + "learning_rate": 5.914783650326925e-06, + "loss": 0.0016, + "step": 33528 + }, + { + "epoch": 0.6706, + "grad_norm": 0.7791656255722046, + "learning_rate": 5.913509252636511e-06, + "loss": 0.0057, + "step": 33530 + }, + { + "epoch": 0.67064, + "grad_norm": 2.2632710933685303, + "learning_rate": 5.912234934614338e-06, + "loss": 0.1564, + "step": 33532 + }, + { + "epoch": 0.67068, + "grad_norm": 0.29847607016563416, + "learning_rate": 5.910960696285245e-06, + "loss": 0.0042, + "step": 33534 + }, + { + "epoch": 0.67072, + "grad_norm": 0.43751201033592224, + "learning_rate": 5.909686537674082e-06, + "loss": 0.01, + "step": 33536 + }, + { + "epoch": 0.67076, + "grad_norm": 0.1552165299654007, + "learning_rate": 5.908412458805688e-06, + "loss": 0.0035, + "step": 33538 + }, + { + "epoch": 0.6708, + "grad_norm": 0.12895900011062622, + "learning_rate": 5.907138459704895e-06, + "loss": 0.0066, + "step": 33540 + }, + { + "epoch": 0.67084, + "grad_norm": 1.9321372509002686, + "learning_rate": 5.905864540396549e-06, + "loss": 0.028, + "step": 33542 + }, + { + "epoch": 0.67088, + "grad_norm": 0.011761555448174477, + "learning_rate": 5.904590700905476e-06, + "loss": 0.0016, + "step": 33544 + }, + { + "epoch": 0.67092, + "grad_norm": 2.2951650619506836, + "learning_rate": 5.903316941256521e-06, + "loss": 0.025, + "step": 33546 + }, + { + "epoch": 0.67096, + "grad_norm": 0.4777732491493225, + "learning_rate": 5.90204326147451e-06, + "loss": 0.0058, + "step": 33548 + }, + { + "epoch": 0.671, + "grad_norm": 11.7274808883667, + "learning_rate": 5.900769661584273e-06, + "loss": 0.3059, + "step": 33550 + }, + { + "epoch": 0.67104, + "grad_norm": 2.1675209999084473, + "learning_rate": 5.899496141610644e-06, + "loss": 0.0304, + "step": 33552 + }, + { + "epoch": 0.67108, + "grad_norm": 0.13040657341480255, + "learning_rate": 5.89822270157845e-06, + "loss": 0.0047, + "step": 33554 + }, + { + "epoch": 0.67112, + "grad_norm": 0.1538267433643341, + "learning_rate": 5.896949341512515e-06, + "loss": 0.0038, + "step": 33556 + }, + { + "epoch": 0.67116, + "grad_norm": 0.16705945134162903, + "learning_rate": 5.895676061437663e-06, + "loss": 0.0026, + "step": 33558 + }, + { + "epoch": 0.6712, + "grad_norm": 0.03911121189594269, + "learning_rate": 5.894402861378721e-06, + "loss": 0.0018, + "step": 33560 + }, + { + "epoch": 0.67124, + "grad_norm": 0.10490576922893524, + "learning_rate": 5.89312974136051e-06, + "loss": 0.0018, + "step": 33562 + }, + { + "epoch": 0.67128, + "grad_norm": 0.20127680897712708, + "learning_rate": 5.891856701407848e-06, + "loss": 0.0037, + "step": 33564 + }, + { + "epoch": 0.67132, + "grad_norm": 0.7214168906211853, + "learning_rate": 5.890583741545552e-06, + "loss": 0.0121, + "step": 33566 + }, + { + "epoch": 0.67136, + "grad_norm": 0.12776872515678406, + "learning_rate": 5.8893108617984454e-06, + "loss": 0.0032, + "step": 33568 + }, + { + "epoch": 0.6714, + "grad_norm": 9.46851921081543, + "learning_rate": 5.88803806219134e-06, + "loss": 0.3373, + "step": 33570 + }, + { + "epoch": 0.67144, + "grad_norm": 0.01197251956909895, + "learning_rate": 5.886765342749046e-06, + "loss": 0.0014, + "step": 33572 + }, + { + "epoch": 0.67148, + "grad_norm": 2.617143154144287, + "learning_rate": 5.885492703496383e-06, + "loss": 0.0409, + "step": 33574 + }, + { + "epoch": 0.67152, + "grad_norm": 1.453140139579773, + "learning_rate": 5.884220144458155e-06, + "loss": 0.0232, + "step": 33576 + }, + { + "epoch": 0.67156, + "grad_norm": 0.21335196495056152, + "learning_rate": 5.882947665659177e-06, + "loss": 0.0028, + "step": 33578 + }, + { + "epoch": 0.6716, + "grad_norm": 0.40099409222602844, + "learning_rate": 5.881675267124254e-06, + "loss": 0.0056, + "step": 33580 + }, + { + "epoch": 0.67164, + "grad_norm": 0.5457550287246704, + "learning_rate": 5.880402948878187e-06, + "loss": 0.0078, + "step": 33582 + }, + { + "epoch": 0.67168, + "grad_norm": 0.1811612844467163, + "learning_rate": 5.879130710945791e-06, + "loss": 0.3754, + "step": 33584 + }, + { + "epoch": 0.67172, + "grad_norm": 0.06746190041303635, + "learning_rate": 5.8778585533518625e-06, + "loss": 0.0143, + "step": 33586 + }, + { + "epoch": 0.67176, + "grad_norm": 0.1389012485742569, + "learning_rate": 5.8765864761212e-06, + "loss": 0.0026, + "step": 33588 + }, + { + "epoch": 0.6718, + "grad_norm": 0.07667150348424911, + "learning_rate": 5.8753144792786095e-06, + "loss": 0.0019, + "step": 33590 + }, + { + "epoch": 0.67184, + "grad_norm": 0.22734367847442627, + "learning_rate": 5.874042562848887e-06, + "loss": 0.0059, + "step": 33592 + }, + { + "epoch": 0.67188, + "grad_norm": 0.5401288866996765, + "learning_rate": 5.872770726856829e-06, + "loss": 0.0096, + "step": 33594 + }, + { + "epoch": 0.67192, + "grad_norm": 0.2020556777715683, + "learning_rate": 5.871498971327232e-06, + "loss": 0.0058, + "step": 33596 + }, + { + "epoch": 0.67196, + "grad_norm": 0.24058352410793304, + "learning_rate": 5.870227296284883e-06, + "loss": 0.0173, + "step": 33598 + }, + { + "epoch": 0.672, + "grad_norm": 0.16682367026805878, + "learning_rate": 5.868955701754584e-06, + "loss": 0.0026, + "step": 33600 + }, + { + "epoch": 0.67204, + "grad_norm": 2.553643226623535, + "learning_rate": 5.867684187761116e-06, + "loss": 0.0316, + "step": 33602 + }, + { + "epoch": 0.67208, + "grad_norm": 0.06593707203865051, + "learning_rate": 5.866412754329276e-06, + "loss": 0.0017, + "step": 33604 + }, + { + "epoch": 0.67212, + "grad_norm": 0.4860255718231201, + "learning_rate": 5.865141401483847e-06, + "loss": 0.0067, + "step": 33606 + }, + { + "epoch": 0.67216, + "grad_norm": 0.009103422053158283, + "learning_rate": 5.863870129249615e-06, + "loss": 0.0057, + "step": 33608 + }, + { + "epoch": 0.6722, + "grad_norm": 0.07642601430416107, + "learning_rate": 5.862598937651364e-06, + "loss": 0.001, + "step": 33610 + }, + { + "epoch": 0.67224, + "grad_norm": 0.21414095163345337, + "learning_rate": 5.8613278267138786e-06, + "loss": 0.0032, + "step": 33612 + }, + { + "epoch": 0.67228, + "grad_norm": 3.5090818405151367, + "learning_rate": 5.860056796461932e-06, + "loss": 0.0339, + "step": 33614 + }, + { + "epoch": 0.67232, + "grad_norm": 0.010533247143030167, + "learning_rate": 5.858785846920316e-06, + "loss": 0.0006, + "step": 33616 + }, + { + "epoch": 0.67236, + "grad_norm": 0.1428423970937729, + "learning_rate": 5.8575149781137965e-06, + "loss": 0.0022, + "step": 33618 + }, + { + "epoch": 0.6724, + "grad_norm": 0.7792277932167053, + "learning_rate": 5.85624419006716e-06, + "loss": 0.0106, + "step": 33620 + }, + { + "epoch": 0.67244, + "grad_norm": 8.652009963989258, + "learning_rate": 5.854973482805175e-06, + "loss": 0.2565, + "step": 33622 + }, + { + "epoch": 0.67248, + "grad_norm": 0.3310551643371582, + "learning_rate": 5.8537028563526145e-06, + "loss": 0.0047, + "step": 33624 + }, + { + "epoch": 0.67252, + "grad_norm": 0.06519640237092972, + "learning_rate": 5.852432310734252e-06, + "loss": 0.0013, + "step": 33626 + }, + { + "epoch": 0.67256, + "grad_norm": 0.03926287218928337, + "learning_rate": 5.851161845974857e-06, + "loss": 0.0024, + "step": 33628 + }, + { + "epoch": 0.6726, + "grad_norm": 0.136404350399971, + "learning_rate": 5.849891462099199e-06, + "loss": 0.2202, + "step": 33630 + }, + { + "epoch": 0.67264, + "grad_norm": 0.17859101295471191, + "learning_rate": 5.848621159132044e-06, + "loss": 0.0506, + "step": 33632 + }, + { + "epoch": 0.67268, + "grad_norm": 1.3872891664505005, + "learning_rate": 5.847350937098153e-06, + "loss": 0.0468, + "step": 33634 + }, + { + "epoch": 0.67272, + "grad_norm": 0.25533512234687805, + "learning_rate": 5.8460807960222965e-06, + "loss": 0.0069, + "step": 33636 + }, + { + "epoch": 0.67276, + "grad_norm": 0.3294685184955597, + "learning_rate": 5.844810735929234e-06, + "loss": 0.0057, + "step": 33638 + }, + { + "epoch": 0.6728, + "grad_norm": 16.947124481201172, + "learning_rate": 5.843540756843722e-06, + "loss": 0.2509, + "step": 33640 + }, + { + "epoch": 0.67284, + "grad_norm": 1.1531957387924194, + "learning_rate": 5.8422708587905264e-06, + "loss": 0.0141, + "step": 33642 + }, + { + "epoch": 0.67288, + "grad_norm": 0.21819400787353516, + "learning_rate": 5.8410010417944e-06, + "loss": 0.0051, + "step": 33644 + }, + { + "epoch": 0.67292, + "grad_norm": 6.251773357391357, + "learning_rate": 5.839731305880098e-06, + "loss": 0.1196, + "step": 33646 + }, + { + "epoch": 0.67296, + "grad_norm": 0.02302783913910389, + "learning_rate": 5.838461651072379e-06, + "loss": 0.0026, + "step": 33648 + }, + { + "epoch": 0.673, + "grad_norm": 13.833105087280273, + "learning_rate": 5.83719207739599e-06, + "loss": 0.5231, + "step": 33650 + }, + { + "epoch": 0.67304, + "grad_norm": 0.27896055579185486, + "learning_rate": 5.835922584875684e-06, + "loss": 0.0041, + "step": 33652 + }, + { + "epoch": 0.67308, + "grad_norm": 1.0422602891921997, + "learning_rate": 5.834653173536212e-06, + "loss": 0.0123, + "step": 33654 + }, + { + "epoch": 0.67312, + "grad_norm": 0.11461999267339706, + "learning_rate": 5.83338384340232e-06, + "loss": 0.0013, + "step": 33656 + }, + { + "epoch": 0.67316, + "grad_norm": 0.4420066475868225, + "learning_rate": 5.83211459449876e-06, + "loss": 0.0052, + "step": 33658 + }, + { + "epoch": 0.6732, + "grad_norm": 1.6314717531204224, + "learning_rate": 5.830845426850268e-06, + "loss": 0.0216, + "step": 33660 + }, + { + "epoch": 0.67324, + "grad_norm": 0.3307351768016815, + "learning_rate": 5.82957634048159e-06, + "loss": 0.0058, + "step": 33662 + }, + { + "epoch": 0.67328, + "grad_norm": 0.12075591087341309, + "learning_rate": 5.828307335417469e-06, + "loss": 0.0031, + "step": 33664 + }, + { + "epoch": 0.67332, + "grad_norm": 0.4491958022117615, + "learning_rate": 5.8270384116826425e-06, + "loss": 0.1314, + "step": 33666 + }, + { + "epoch": 0.67336, + "grad_norm": 0.012807722203433514, + "learning_rate": 5.825769569301852e-06, + "loss": 0.0963, + "step": 33668 + }, + { + "epoch": 0.6734, + "grad_norm": 5.766508102416992, + "learning_rate": 5.824500808299836e-06, + "loss": 0.0982, + "step": 33670 + }, + { + "epoch": 0.67344, + "grad_norm": 0.017519278451800346, + "learning_rate": 5.823232128701324e-06, + "loss": 0.0059, + "step": 33672 + }, + { + "epoch": 0.67348, + "grad_norm": 0.1387035846710205, + "learning_rate": 5.821963530531051e-06, + "loss": 0.002, + "step": 33674 + }, + { + "epoch": 0.67352, + "grad_norm": 0.5000888705253601, + "learning_rate": 5.820695013813753e-06, + "loss": 0.0692, + "step": 33676 + }, + { + "epoch": 0.67356, + "grad_norm": 0.0034701062832027674, + "learning_rate": 5.819426578574151e-06, + "loss": 0.0021, + "step": 33678 + }, + { + "epoch": 0.6736, + "grad_norm": 1.1298859119415283, + "learning_rate": 5.818158224836987e-06, + "loss": 0.0138, + "step": 33680 + }, + { + "epoch": 0.67364, + "grad_norm": 2.091813087463379, + "learning_rate": 5.816889952626974e-06, + "loss": 0.0278, + "step": 33682 + }, + { + "epoch": 0.67368, + "grad_norm": 0.2090715765953064, + "learning_rate": 5.815621761968853e-06, + "loss": 0.0075, + "step": 33684 + }, + { + "epoch": 0.67372, + "grad_norm": 0.39512529969215393, + "learning_rate": 5.814353652887337e-06, + "loss": 0.0077, + "step": 33686 + }, + { + "epoch": 0.67376, + "grad_norm": 0.1766010820865631, + "learning_rate": 5.8130856254071485e-06, + "loss": 0.0025, + "step": 33688 + }, + { + "epoch": 0.6738, + "grad_norm": 1.7910854816436768, + "learning_rate": 5.811817679553018e-06, + "loss": 0.0286, + "step": 33690 + }, + { + "epoch": 0.67384, + "grad_norm": 0.032072149217128754, + "learning_rate": 5.810549815349653e-06, + "loss": 0.0721, + "step": 33692 + }, + { + "epoch": 0.67388, + "grad_norm": 0.4790990650653839, + "learning_rate": 5.809282032821777e-06, + "loss": 0.0056, + "step": 33694 + }, + { + "epoch": 0.67392, + "grad_norm": 1.7971466779708862, + "learning_rate": 5.808014331994104e-06, + "loss": 0.0257, + "step": 33696 + }, + { + "epoch": 0.67396, + "grad_norm": 2.8018133640289307, + "learning_rate": 5.806746712891351e-06, + "loss": 0.5551, + "step": 33698 + }, + { + "epoch": 0.674, + "grad_norm": 0.0639367625117302, + "learning_rate": 5.8054791755382286e-06, + "loss": 0.0025, + "step": 33700 + }, + { + "epoch": 0.67404, + "grad_norm": 0.04812747985124588, + "learning_rate": 5.804211719959454e-06, + "loss": 0.0078, + "step": 33702 + }, + { + "epoch": 0.67408, + "grad_norm": 0.016831904649734497, + "learning_rate": 5.802944346179727e-06, + "loss": 0.0003, + "step": 33704 + }, + { + "epoch": 0.67412, + "grad_norm": 0.2037241905927658, + "learning_rate": 5.801677054223762e-06, + "loss": 0.0034, + "step": 33706 + }, + { + "epoch": 0.67416, + "grad_norm": 0.3897462785243988, + "learning_rate": 5.800409844116264e-06, + "loss": 0.0106, + "step": 33708 + }, + { + "epoch": 0.6742, + "grad_norm": 0.3287089467048645, + "learning_rate": 5.799142715881938e-06, + "loss": 0.0041, + "step": 33710 + }, + { + "epoch": 0.67424, + "grad_norm": 0.12341790646314621, + "learning_rate": 5.797875669545491e-06, + "loss": 0.0029, + "step": 33712 + }, + { + "epoch": 0.67428, + "grad_norm": 4.164330959320068, + "learning_rate": 5.796608705131613e-06, + "loss": 0.0815, + "step": 33714 + }, + { + "epoch": 0.67432, + "grad_norm": 0.09996206313371658, + "learning_rate": 5.795341822665021e-06, + "loss": 0.0016, + "step": 33716 + }, + { + "epoch": 0.67436, + "grad_norm": 0.4511549770832062, + "learning_rate": 5.794075022170402e-06, + "loss": 0.0108, + "step": 33718 + }, + { + "epoch": 0.6744, + "grad_norm": 0.2502342462539673, + "learning_rate": 5.792808303672454e-06, + "loss": 0.0051, + "step": 33720 + }, + { + "epoch": 0.67444, + "grad_norm": 0.11368487030267715, + "learning_rate": 5.791541667195875e-06, + "loss": 0.0014, + "step": 33722 + }, + { + "epoch": 0.67448, + "grad_norm": 5.55974817276001, + "learning_rate": 5.790275112765358e-06, + "loss": 0.0685, + "step": 33724 + }, + { + "epoch": 0.67452, + "grad_norm": 0.10408742725849152, + "learning_rate": 5.7890086404055975e-06, + "loss": 0.003, + "step": 33726 + }, + { + "epoch": 0.67456, + "grad_norm": 0.1416464000940323, + "learning_rate": 5.787742250141279e-06, + "loss": 0.0057, + "step": 33728 + }, + { + "epoch": 0.6746, + "grad_norm": 0.1563337743282318, + "learning_rate": 5.786475941997094e-06, + "loss": 0.0023, + "step": 33730 + }, + { + "epoch": 0.67464, + "grad_norm": 0.04455948993563652, + "learning_rate": 5.785209715997729e-06, + "loss": 0.0273, + "step": 33732 + }, + { + "epoch": 0.67468, + "grad_norm": 0.08659907430410385, + "learning_rate": 5.783943572167876e-06, + "loss": 0.0013, + "step": 33734 + }, + { + "epoch": 0.67472, + "grad_norm": 0.36686787009239197, + "learning_rate": 5.782677510532204e-06, + "loss": 0.0052, + "step": 33736 + }, + { + "epoch": 0.67476, + "grad_norm": 0.2688996195793152, + "learning_rate": 5.781411531115413e-06, + "loss": 0.0032, + "step": 33738 + }, + { + "epoch": 0.6748, + "grad_norm": 0.126085102558136, + "learning_rate": 5.780145633942173e-06, + "loss": 0.0034, + "step": 33740 + }, + { + "epoch": 0.67484, + "grad_norm": 0.3556622564792633, + "learning_rate": 5.7788798190371665e-06, + "loss": 0.0054, + "step": 33742 + }, + { + "epoch": 0.67488, + "grad_norm": 0.16343402862548828, + "learning_rate": 5.777614086425074e-06, + "loss": 0.0033, + "step": 33744 + }, + { + "epoch": 0.67492, + "grad_norm": 0.1346292495727539, + "learning_rate": 5.7763484361305625e-06, + "loss": 0.0021, + "step": 33746 + }, + { + "epoch": 0.67496, + "grad_norm": 0.7795007228851318, + "learning_rate": 5.775082868178319e-06, + "loss": 0.0108, + "step": 33748 + }, + { + "epoch": 0.675, + "grad_norm": 0.011111886240541935, + "learning_rate": 5.773817382593008e-06, + "loss": 0.0003, + "step": 33750 + }, + { + "epoch": 0.67504, + "grad_norm": 0.05305711179971695, + "learning_rate": 5.7725519793993015e-06, + "loss": 0.0041, + "step": 33752 + }, + { + "epoch": 0.67508, + "grad_norm": 0.05196328088641167, + "learning_rate": 5.771286658621871e-06, + "loss": 0.0037, + "step": 33754 + }, + { + "epoch": 0.67512, + "grad_norm": 0.010733092203736305, + "learning_rate": 5.770021420285383e-06, + "loss": 0.0106, + "step": 33756 + }, + { + "epoch": 0.67516, + "grad_norm": 0.18556718528270721, + "learning_rate": 5.76875626441451e-06, + "loss": 0.0025, + "step": 33758 + }, + { + "epoch": 0.6752, + "grad_norm": 0.08214113116264343, + "learning_rate": 5.7674911910339094e-06, + "loss": 0.0054, + "step": 33760 + }, + { + "epoch": 0.67524, + "grad_norm": 1.1060632467269897, + "learning_rate": 5.7662262001682454e-06, + "loss": 0.0173, + "step": 33762 + }, + { + "epoch": 0.67528, + "grad_norm": 0.42288127541542053, + "learning_rate": 5.764961291842181e-06, + "loss": 0.0046, + "step": 33764 + }, + { + "epoch": 0.67532, + "grad_norm": 0.007555618416517973, + "learning_rate": 5.763696466080382e-06, + "loss": 0.0019, + "step": 33766 + }, + { + "epoch": 0.67536, + "grad_norm": 0.0017536119557917118, + "learning_rate": 5.762431722907493e-06, + "loss": 0.0019, + "step": 33768 + }, + { + "epoch": 0.6754, + "grad_norm": 0.8435770273208618, + "learning_rate": 5.761167062348187e-06, + "loss": 0.0082, + "step": 33770 + }, + { + "epoch": 0.67544, + "grad_norm": 0.03862854465842247, + "learning_rate": 5.759902484427107e-06, + "loss": 0.0019, + "step": 33772 + }, + { + "epoch": 0.67548, + "grad_norm": 0.021321583539247513, + "learning_rate": 5.758637989168912e-06, + "loss": 0.0007, + "step": 33774 + }, + { + "epoch": 0.67552, + "grad_norm": 5.015499114990234, + "learning_rate": 5.757373576598255e-06, + "loss": 0.047, + "step": 33776 + }, + { + "epoch": 0.67556, + "grad_norm": 0.0004095426120329648, + "learning_rate": 5.756109246739778e-06, + "loss": 0.0016, + "step": 33778 + }, + { + "epoch": 0.6756, + "grad_norm": 0.20227044820785522, + "learning_rate": 5.754844999618144e-06, + "loss": 0.0025, + "step": 33780 + }, + { + "epoch": 0.67564, + "grad_norm": 0.00042185853817500174, + "learning_rate": 5.753580835257988e-06, + "loss": 0.0, + "step": 33782 + }, + { + "epoch": 0.67568, + "grad_norm": 0.0966842919588089, + "learning_rate": 5.75231675368396e-06, + "loss": 0.0049, + "step": 33784 + }, + { + "epoch": 0.67572, + "grad_norm": 0.32440313696861267, + "learning_rate": 5.751052754920704e-06, + "loss": 0.0046, + "step": 33786 + }, + { + "epoch": 0.67576, + "grad_norm": 0.08497834950685501, + "learning_rate": 5.749788838992862e-06, + "loss": 0.0017, + "step": 33788 + }, + { + "epoch": 0.6758, + "grad_norm": 0.16051408648490906, + "learning_rate": 5.748525005925074e-06, + "loss": 0.0022, + "step": 33790 + }, + { + "epoch": 0.67584, + "grad_norm": 1.339820384979248, + "learning_rate": 5.7472612557419845e-06, + "loss": 0.0149, + "step": 33792 + }, + { + "epoch": 0.67588, + "grad_norm": 0.5854858160018921, + "learning_rate": 5.745997588468222e-06, + "loss": 0.0929, + "step": 33794 + }, + { + "epoch": 0.67592, + "grad_norm": 0.2948962450027466, + "learning_rate": 5.7447340041284275e-06, + "loss": 0.0035, + "step": 33796 + }, + { + "epoch": 0.67596, + "grad_norm": 0.08907529711723328, + "learning_rate": 5.743470502747232e-06, + "loss": 0.0013, + "step": 33798 + }, + { + "epoch": 0.676, + "grad_norm": 0.3660951852798462, + "learning_rate": 5.742207084349274e-06, + "loss": 0.9623, + "step": 33800 + }, + { + "epoch": 0.67604, + "grad_norm": 0.5193997621536255, + "learning_rate": 5.740943748959182e-06, + "loss": 0.0081, + "step": 33802 + }, + { + "epoch": 0.67608, + "grad_norm": 0.008166606537997723, + "learning_rate": 5.739680496601577e-06, + "loss": 0.0006, + "step": 33804 + }, + { + "epoch": 0.67612, + "grad_norm": 0.2314213514328003, + "learning_rate": 5.738417327301102e-06, + "loss": 0.3036, + "step": 33806 + }, + { + "epoch": 0.67616, + "grad_norm": 1.5541338920593262, + "learning_rate": 5.7371542410823725e-06, + "loss": 0.0182, + "step": 33808 + }, + { + "epoch": 0.6762, + "grad_norm": 0.1114191859960556, + "learning_rate": 5.735891237970015e-06, + "loss": 0.0013, + "step": 33810 + }, + { + "epoch": 0.67624, + "grad_norm": 0.03921220824122429, + "learning_rate": 5.734628317988657e-06, + "loss": 0.0153, + "step": 33812 + }, + { + "epoch": 0.67628, + "grad_norm": 1.3685781955718994, + "learning_rate": 5.733365481162907e-06, + "loss": 0.0228, + "step": 33814 + }, + { + "epoch": 0.67632, + "grad_norm": 1.1204941272735596, + "learning_rate": 5.732102727517403e-06, + "loss": 0.0155, + "step": 33816 + }, + { + "epoch": 0.67636, + "grad_norm": 0.024724751710891724, + "learning_rate": 5.73084005707675e-06, + "loss": 0.0014, + "step": 33818 + }, + { + "epoch": 0.6764, + "grad_norm": 0.00485242297872901, + "learning_rate": 5.729577469865566e-06, + "loss": 0.0074, + "step": 33820 + }, + { + "epoch": 0.67644, + "grad_norm": 1.5125396251678467, + "learning_rate": 5.728314965908468e-06, + "loss": 0.3714, + "step": 33822 + }, + { + "epoch": 0.67648, + "grad_norm": 0.4506271481513977, + "learning_rate": 5.727052545230074e-06, + "loss": 0.0057, + "step": 33824 + }, + { + "epoch": 0.67652, + "grad_norm": 0.41962751746177673, + "learning_rate": 5.725790207854987e-06, + "loss": 0.0114, + "step": 33826 + }, + { + "epoch": 0.67656, + "grad_norm": 0.0012246286496520042, + "learning_rate": 5.724527953807819e-06, + "loss": 0.0006, + "step": 33828 + }, + { + "epoch": 0.6766, + "grad_norm": 0.05486495420336723, + "learning_rate": 5.723265783113181e-06, + "loss": 0.0007, + "step": 33830 + }, + { + "epoch": 0.67664, + "grad_norm": 12.854463577270508, + "learning_rate": 5.722003695795677e-06, + "loss": 0.3922, + "step": 33832 + }, + { + "epoch": 0.67668, + "grad_norm": 0.1035250648856163, + "learning_rate": 5.720741691879919e-06, + "loss": 0.0026, + "step": 33834 + }, + { + "epoch": 0.67672, + "grad_norm": 0.0009501204476691782, + "learning_rate": 5.7194797713904945e-06, + "loss": 0.0007, + "step": 33836 + }, + { + "epoch": 0.67676, + "grad_norm": 0.17863386869430542, + "learning_rate": 5.718217934352026e-06, + "loss": 0.0029, + "step": 33838 + }, + { + "epoch": 0.6768, + "grad_norm": 0.4564540386199951, + "learning_rate": 5.716956180789098e-06, + "loss": 0.006, + "step": 33840 + }, + { + "epoch": 0.67684, + "grad_norm": 0.01526276022195816, + "learning_rate": 5.7156945107263155e-06, + "loss": 0.0051, + "step": 33842 + }, + { + "epoch": 0.67688, + "grad_norm": 2.3815736770629883, + "learning_rate": 5.714432924188277e-06, + "loss": 0.0246, + "step": 33844 + }, + { + "epoch": 0.67692, + "grad_norm": 0.4822750985622406, + "learning_rate": 5.713171421199568e-06, + "loss": 0.0137, + "step": 33846 + }, + { + "epoch": 0.67696, + "grad_norm": 0.0019656624644994736, + "learning_rate": 5.711910001784796e-06, + "loss": 0.0027, + "step": 33848 + }, + { + "epoch": 0.677, + "grad_norm": 0.11079718172550201, + "learning_rate": 5.710648665968543e-06, + "loss": 0.0024, + "step": 33850 + }, + { + "epoch": 0.67704, + "grad_norm": 0.025287069380283356, + "learning_rate": 5.709387413775403e-06, + "loss": 0.0006, + "step": 33852 + }, + { + "epoch": 0.67708, + "grad_norm": 0.034796420484781265, + "learning_rate": 5.7081262452299625e-06, + "loss": 0.0044, + "step": 33854 + }, + { + "epoch": 0.67712, + "grad_norm": 0.5395022034645081, + "learning_rate": 5.706865160356816e-06, + "loss": 0.008, + "step": 33856 + }, + { + "epoch": 0.67716, + "grad_norm": 1.066475749015808, + "learning_rate": 5.7056041591805345e-06, + "loss": 0.0182, + "step": 33858 + }, + { + "epoch": 0.6772, + "grad_norm": 0.05823765695095062, + "learning_rate": 5.704343241725719e-06, + "loss": 0.001, + "step": 33860 + }, + { + "epoch": 0.67724, + "grad_norm": 0.034240953624248505, + "learning_rate": 5.70308240801694e-06, + "loss": 0.0076, + "step": 33862 + }, + { + "epoch": 0.67728, + "grad_norm": 0.09522419422864914, + "learning_rate": 5.7018216580787815e-06, + "loss": 0.0013, + "step": 33864 + }, + { + "epoch": 0.67732, + "grad_norm": 0.10540533065795898, + "learning_rate": 5.700560991935826e-06, + "loss": 0.0065, + "step": 33866 + }, + { + "epoch": 0.67736, + "grad_norm": 0.0501268245279789, + "learning_rate": 5.69930040961264e-06, + "loss": 0.003, + "step": 33868 + }, + { + "epoch": 0.6774, + "grad_norm": 0.2197226732969284, + "learning_rate": 5.698039911133816e-06, + "loss": 0.0098, + "step": 33870 + }, + { + "epoch": 0.67744, + "grad_norm": 0.0027001311536878347, + "learning_rate": 5.696779496523913e-06, + "loss": 0.0005, + "step": 33872 + }, + { + "epoch": 0.67748, + "grad_norm": 0.020881664007902145, + "learning_rate": 5.69551916580751e-06, + "loss": 0.0152, + "step": 33874 + }, + { + "epoch": 0.67752, + "grad_norm": 0.04966573789715767, + "learning_rate": 5.694258919009177e-06, + "loss": 0.0012, + "step": 33876 + }, + { + "epoch": 0.67756, + "grad_norm": 0.5088112950325012, + "learning_rate": 5.692998756153483e-06, + "loss": 0.0066, + "step": 33878 + }, + { + "epoch": 0.6776, + "grad_norm": 1.5019111633300781, + "learning_rate": 5.691738677265e-06, + "loss": 0.0168, + "step": 33880 + }, + { + "epoch": 0.67764, + "grad_norm": 0.07739800214767456, + "learning_rate": 5.6904786823682835e-06, + "loss": 0.0018, + "step": 33882 + }, + { + "epoch": 0.67768, + "grad_norm": 0.22796015441417694, + "learning_rate": 5.6892187714879055e-06, + "loss": 0.0035, + "step": 33884 + }, + { + "epoch": 0.67772, + "grad_norm": 1.6471278667449951, + "learning_rate": 5.687958944648426e-06, + "loss": 0.0168, + "step": 33886 + }, + { + "epoch": 0.67776, + "grad_norm": 0.11301126331090927, + "learning_rate": 5.6866992018744125e-06, + "loss": 0.003, + "step": 33888 + }, + { + "epoch": 0.6778, + "grad_norm": 0.05958496034145355, + "learning_rate": 5.685439543190409e-06, + "loss": 0.0022, + "step": 33890 + }, + { + "epoch": 0.67784, + "grad_norm": 0.25636380910873413, + "learning_rate": 5.684179968620992e-06, + "loss": 0.0078, + "step": 33892 + }, + { + "epoch": 0.67788, + "grad_norm": 0.42172011733055115, + "learning_rate": 5.682920478190703e-06, + "loss": 0.0077, + "step": 33894 + }, + { + "epoch": 0.67792, + "grad_norm": 0.5206360220909119, + "learning_rate": 5.681661071924104e-06, + "loss": 0.0162, + "step": 33896 + }, + { + "epoch": 0.67796, + "grad_norm": 0.137943297624588, + "learning_rate": 5.680401749845747e-06, + "loss": 0.0047, + "step": 33898 + }, + { + "epoch": 0.678, + "grad_norm": 0.26926279067993164, + "learning_rate": 5.679142511980176e-06, + "loss": 0.0033, + "step": 33900 + }, + { + "epoch": 0.67804, + "grad_norm": 0.0918063297867775, + "learning_rate": 5.677883358351954e-06, + "loss": 0.0015, + "step": 33902 + }, + { + "epoch": 0.67808, + "grad_norm": 0.5683647394180298, + "learning_rate": 5.676624288985613e-06, + "loss": 0.0069, + "step": 33904 + }, + { + "epoch": 0.67812, + "grad_norm": 0.03720572590827942, + "learning_rate": 5.6753653039057145e-06, + "loss": 0.0016, + "step": 33906 + }, + { + "epoch": 0.67816, + "grad_norm": 0.02452513761818409, + "learning_rate": 5.674106403136792e-06, + "loss": 0.0005, + "step": 33908 + }, + { + "epoch": 0.6782, + "grad_norm": 14.546141624450684, + "learning_rate": 5.672847586703393e-06, + "loss": 0.9804, + "step": 33910 + }, + { + "epoch": 0.67824, + "grad_norm": 0.02021370269358158, + "learning_rate": 5.671588854630061e-06, + "loss": 0.0006, + "step": 33912 + }, + { + "epoch": 0.67828, + "grad_norm": 1.1745613813400269, + "learning_rate": 5.67033020694133e-06, + "loss": 0.014, + "step": 33914 + }, + { + "epoch": 0.67832, + "grad_norm": 0.4114781320095062, + "learning_rate": 5.66907164366174e-06, + "loss": 0.0052, + "step": 33916 + }, + { + "epoch": 0.67836, + "grad_norm": 0.02258186787366867, + "learning_rate": 5.6678131648158275e-06, + "loss": 0.0003, + "step": 33918 + }, + { + "epoch": 0.6784, + "grad_norm": 0.0016349110519513488, + "learning_rate": 5.666554770428129e-06, + "loss": 0.0021, + "step": 33920 + }, + { + "epoch": 0.67844, + "grad_norm": 0.2088354527950287, + "learning_rate": 5.665296460523175e-06, + "loss": 0.0048, + "step": 33922 + }, + { + "epoch": 0.67848, + "grad_norm": 1.565005898475647, + "learning_rate": 5.664038235125503e-06, + "loss": 0.02, + "step": 33924 + }, + { + "epoch": 0.67852, + "grad_norm": 0.014373654499650002, + "learning_rate": 5.66278009425963e-06, + "loss": 0.0002, + "step": 33926 + }, + { + "epoch": 0.67856, + "grad_norm": 0.008761599659919739, + "learning_rate": 5.661522037950099e-06, + "loss": 0.002, + "step": 33928 + }, + { + "epoch": 0.6786, + "grad_norm": 0.017641590908169746, + "learning_rate": 5.660264066221426e-06, + "loss": 0.0003, + "step": 33930 + }, + { + "epoch": 0.67864, + "grad_norm": 0.3611632287502289, + "learning_rate": 5.65900617909814e-06, + "loss": 0.0047, + "step": 33932 + }, + { + "epoch": 0.67868, + "grad_norm": 0.0687982365489006, + "learning_rate": 5.657748376604766e-06, + "loss": 0.031, + "step": 33934 + }, + { + "epoch": 0.67872, + "grad_norm": 0.043861646205186844, + "learning_rate": 5.656490658765817e-06, + "loss": 0.0022, + "step": 33936 + }, + { + "epoch": 0.67876, + "grad_norm": 0.237624391913414, + "learning_rate": 5.655233025605824e-06, + "loss": 0.0049, + "step": 33938 + }, + { + "epoch": 0.6788, + "grad_norm": 0.018079526722431183, + "learning_rate": 5.653975477149298e-06, + "loss": 0.0033, + "step": 33940 + }, + { + "epoch": 0.67884, + "grad_norm": 0.06781204789876938, + "learning_rate": 5.652718013420757e-06, + "loss": 0.001, + "step": 33942 + }, + { + "epoch": 0.67888, + "grad_norm": 0.0067239077761769295, + "learning_rate": 5.651460634444716e-06, + "loss": 0.0019, + "step": 33944 + }, + { + "epoch": 0.67892, + "grad_norm": 0.17621637880802155, + "learning_rate": 5.650203340245695e-06, + "loss": 0.0258, + "step": 33946 + }, + { + "epoch": 0.67896, + "grad_norm": 0.017649414017796516, + "learning_rate": 5.648946130848192e-06, + "loss": 0.0214, + "step": 33948 + }, + { + "epoch": 0.679, + "grad_norm": 0.6027618050575256, + "learning_rate": 5.647689006276727e-06, + "loss": 0.0087, + "step": 33950 + }, + { + "epoch": 0.67904, + "grad_norm": 0.05599893257021904, + "learning_rate": 5.6464319665558035e-06, + "loss": 0.0068, + "step": 33952 + }, + { + "epoch": 0.67908, + "grad_norm": 0.02134905941784382, + "learning_rate": 5.645175011709931e-06, + "loss": 0.0008, + "step": 33954 + }, + { + "epoch": 0.67912, + "grad_norm": 0.00584295904263854, + "learning_rate": 5.643918141763618e-06, + "loss": 0.0014, + "step": 33956 + }, + { + "epoch": 0.67916, + "grad_norm": 0.025318004190921783, + "learning_rate": 5.642661356741355e-06, + "loss": 0.0011, + "step": 33958 + }, + { + "epoch": 0.6792, + "grad_norm": 0.38168227672576904, + "learning_rate": 5.641404656667661e-06, + "loss": 0.0039, + "step": 33960 + }, + { + "epoch": 0.67924, + "grad_norm": 0.0563659742474556, + "learning_rate": 5.640148041567022e-06, + "loss": 0.2842, + "step": 33962 + }, + { + "epoch": 0.67928, + "grad_norm": 0.11263835430145264, + "learning_rate": 5.638891511463942e-06, + "loss": 0.0019, + "step": 33964 + }, + { + "epoch": 0.67932, + "grad_norm": 0.4274801015853882, + "learning_rate": 5.6376350663829215e-06, + "loss": 0.0059, + "step": 33966 + }, + { + "epoch": 0.67936, + "grad_norm": 0.22795990109443665, + "learning_rate": 5.636378706348442e-06, + "loss": 0.0024, + "step": 33968 + }, + { + "epoch": 0.6794, + "grad_norm": 0.18439379334449768, + "learning_rate": 5.6351224313850165e-06, + "loss": 0.0026, + "step": 33970 + }, + { + "epoch": 0.67944, + "grad_norm": 0.08566885441541672, + "learning_rate": 5.633866241517121e-06, + "loss": 0.0067, + "step": 33972 + }, + { + "epoch": 0.67948, + "grad_norm": 0.2379797250032425, + "learning_rate": 5.632610136769251e-06, + "loss": 0.003, + "step": 33974 + }, + { + "epoch": 0.67952, + "grad_norm": 0.012247094884514809, + "learning_rate": 5.631354117165895e-06, + "loss": 0.0073, + "step": 33976 + }, + { + "epoch": 0.67956, + "grad_norm": 0.022207997739315033, + "learning_rate": 5.630098182731543e-06, + "loss": 0.0007, + "step": 33978 + }, + { + "epoch": 0.6796, + "grad_norm": 0.16796040534973145, + "learning_rate": 5.628842333490674e-06, + "loss": 0.002, + "step": 33980 + }, + { + "epoch": 0.67964, + "grad_norm": 0.003479942912235856, + "learning_rate": 5.627586569467773e-06, + "loss": 0.005, + "step": 33982 + }, + { + "epoch": 0.67968, + "grad_norm": 0.01510850340127945, + "learning_rate": 5.626330890687324e-06, + "loss": 0.0003, + "step": 33984 + }, + { + "epoch": 0.67972, + "grad_norm": 0.2849043309688568, + "learning_rate": 5.625075297173805e-06, + "loss": 0.0176, + "step": 33986 + }, + { + "epoch": 0.67976, + "grad_norm": 0.038045480847358704, + "learning_rate": 5.623819788951699e-06, + "loss": 0.0014, + "step": 33988 + }, + { + "epoch": 0.6798, + "grad_norm": 0.0557287335395813, + "learning_rate": 5.622564366045472e-06, + "loss": 0.0088, + "step": 33990 + }, + { + "epoch": 0.67984, + "grad_norm": 0.00899308081716299, + "learning_rate": 5.621309028479615e-06, + "loss": 0.0082, + "step": 33992 + }, + { + "epoch": 0.67988, + "grad_norm": 0.829914927482605, + "learning_rate": 5.620053776278588e-06, + "loss": 0.047, + "step": 33994 + }, + { + "epoch": 0.67992, + "grad_norm": 3.5535998344421387, + "learning_rate": 5.618798609466867e-06, + "loss": 0.0345, + "step": 33996 + }, + { + "epoch": 0.67996, + "grad_norm": 2.4362521171569824, + "learning_rate": 5.617543528068924e-06, + "loss": 0.0384, + "step": 33998 + }, + { + "epoch": 0.68, + "grad_norm": 0.011212198063731194, + "learning_rate": 5.616288532109225e-06, + "loss": 0.0023, + "step": 34000 + }, + { + "epoch": 0.68004, + "grad_norm": 0.1574394851922989, + "learning_rate": 5.6150336216122425e-06, + "loss": 0.0098, + "step": 34002 + }, + { + "epoch": 0.68008, + "grad_norm": 0.5858563780784607, + "learning_rate": 5.613778796602434e-06, + "loss": 0.0122, + "step": 34004 + }, + { + "epoch": 0.68012, + "grad_norm": 0.16316141188144684, + "learning_rate": 5.612524057104265e-06, + "loss": 0.0063, + "step": 34006 + }, + { + "epoch": 0.68016, + "grad_norm": 0.42217326164245605, + "learning_rate": 5.611269403142197e-06, + "loss": 0.008, + "step": 34008 + }, + { + "epoch": 0.6802, + "grad_norm": 0.03841651603579521, + "learning_rate": 5.610014834740694e-06, + "loss": 0.3367, + "step": 34010 + }, + { + "epoch": 0.68024, + "grad_norm": 0.23493005335330963, + "learning_rate": 5.608760351924211e-06, + "loss": 0.0184, + "step": 34012 + }, + { + "epoch": 0.68028, + "grad_norm": 0.1815159022808075, + "learning_rate": 5.607505954717209e-06, + "loss": 0.0031, + "step": 34014 + }, + { + "epoch": 0.68032, + "grad_norm": 0.057721834629774094, + "learning_rate": 5.606251643144136e-06, + "loss": 0.5689, + "step": 34016 + }, + { + "epoch": 0.68036, + "grad_norm": 0.0004167717706877738, + "learning_rate": 5.604997417229449e-06, + "loss": 0.0008, + "step": 34018 + }, + { + "epoch": 0.6804, + "grad_norm": 0.034408267587423325, + "learning_rate": 5.603743276997607e-06, + "loss": 0.0086, + "step": 34020 + }, + { + "epoch": 0.68044, + "grad_norm": 0.2823813557624817, + "learning_rate": 5.602489222473042e-06, + "loss": 0.0103, + "step": 34022 + }, + { + "epoch": 0.68048, + "grad_norm": 0.7635644674301147, + "learning_rate": 5.601235253680221e-06, + "loss": 0.0373, + "step": 34024 + }, + { + "epoch": 0.68052, + "grad_norm": 0.562323272228241, + "learning_rate": 5.599981370643577e-06, + "loss": 0.0059, + "step": 34026 + }, + { + "epoch": 0.68056, + "grad_norm": 3.1265501976013184, + "learning_rate": 5.59872757338757e-06, + "loss": 0.038, + "step": 34028 + }, + { + "epoch": 0.6806, + "grad_norm": 2.361760377883911, + "learning_rate": 5.59747386193663e-06, + "loss": 0.0395, + "step": 34030 + }, + { + "epoch": 0.68064, + "grad_norm": 0.011980934999883175, + "learning_rate": 5.596220236315204e-06, + "loss": 0.0028, + "step": 34032 + }, + { + "epoch": 0.68068, + "grad_norm": 0.002200984861701727, + "learning_rate": 5.594966696547735e-06, + "loss": 0.0421, + "step": 34034 + }, + { + "epoch": 0.68072, + "grad_norm": 0.10850045084953308, + "learning_rate": 5.593713242658655e-06, + "loss": 0.0115, + "step": 34036 + }, + { + "epoch": 0.68076, + "grad_norm": 0.11311683058738708, + "learning_rate": 5.592459874672404e-06, + "loss": 0.0203, + "step": 34038 + }, + { + "epoch": 0.6808, + "grad_norm": 0.1731967329978943, + "learning_rate": 5.591206592613416e-06, + "loss": 0.0041, + "step": 34040 + }, + { + "epoch": 0.68084, + "grad_norm": 0.3507029712200165, + "learning_rate": 5.589953396506125e-06, + "loss": 0.0148, + "step": 34042 + }, + { + "epoch": 0.68088, + "grad_norm": 0.16662293672561646, + "learning_rate": 5.588700286374964e-06, + "loss": 0.0063, + "step": 34044 + }, + { + "epoch": 0.68092, + "grad_norm": 0.010951015166938305, + "learning_rate": 5.587447262244365e-06, + "loss": 0.0215, + "step": 34046 + }, + { + "epoch": 0.68096, + "grad_norm": 0.7533734440803528, + "learning_rate": 5.586194324138749e-06, + "loss": 0.9291, + "step": 34048 + }, + { + "epoch": 0.681, + "grad_norm": 0.004165030084550381, + "learning_rate": 5.584941472082549e-06, + "loss": 0.0035, + "step": 34050 + }, + { + "epoch": 0.68104, + "grad_norm": 0.04835691675543785, + "learning_rate": 5.583688706100188e-06, + "loss": 0.001, + "step": 34052 + }, + { + "epoch": 0.68108, + "grad_norm": 0.03057384490966797, + "learning_rate": 5.582436026216087e-06, + "loss": 0.006, + "step": 34054 + }, + { + "epoch": 0.68112, + "grad_norm": 0.0811537653207779, + "learning_rate": 5.5811834324546755e-06, + "loss": 0.0017, + "step": 34056 + }, + { + "epoch": 0.68116, + "grad_norm": 0.042569566518068314, + "learning_rate": 5.57993092484036e-06, + "loss": 0.007, + "step": 34058 + }, + { + "epoch": 0.6812, + "grad_norm": 0.8476104140281677, + "learning_rate": 5.5786785033975745e-06, + "loss": 0.0101, + "step": 34060 + }, + { + "epoch": 0.68124, + "grad_norm": 0.12081660330295563, + "learning_rate": 5.577426168150724e-06, + "loss": 0.0015, + "step": 34062 + }, + { + "epoch": 0.68128, + "grad_norm": 2.2799251079559326, + "learning_rate": 5.576173919124227e-06, + "loss": 0.048, + "step": 34064 + }, + { + "epoch": 0.68132, + "grad_norm": 0.4178319275379181, + "learning_rate": 5.574921756342497e-06, + "loss": 0.0695, + "step": 34066 + }, + { + "epoch": 0.68136, + "grad_norm": 0.03355199471116066, + "learning_rate": 5.5736696798299495e-06, + "loss": 0.0031, + "step": 34068 + }, + { + "epoch": 0.6814, + "grad_norm": 9.60323715209961, + "learning_rate": 5.572417689610987e-06, + "loss": 0.4861, + "step": 34070 + }, + { + "epoch": 0.68144, + "grad_norm": 10.574545860290527, + "learning_rate": 5.57116578571002e-06, + "loss": 0.1935, + "step": 34072 + }, + { + "epoch": 0.68148, + "grad_norm": 1.1994458436965942, + "learning_rate": 5.569913968151457e-06, + "loss": 0.0149, + "step": 34074 + }, + { + "epoch": 0.68152, + "grad_norm": 0.46550315618515015, + "learning_rate": 5.568662236959702e-06, + "loss": 0.0051, + "step": 34076 + }, + { + "epoch": 0.68156, + "grad_norm": 0.010910150595009327, + "learning_rate": 5.5674105921591614e-06, + "loss": 0.0003, + "step": 34078 + }, + { + "epoch": 0.6816, + "grad_norm": 0.02252117544412613, + "learning_rate": 5.5661590337742255e-06, + "loss": 0.0011, + "step": 34080 + }, + { + "epoch": 0.68164, + "grad_norm": 0.6434391736984253, + "learning_rate": 5.564907561829311e-06, + "loss": 0.0081, + "step": 34082 + }, + { + "epoch": 0.68168, + "grad_norm": 0.1157982349395752, + "learning_rate": 5.5636561763488014e-06, + "loss": 0.0012, + "step": 34084 + }, + { + "epoch": 0.68172, + "grad_norm": 0.18081189692020416, + "learning_rate": 5.562404877357099e-06, + "loss": 0.0037, + "step": 34086 + }, + { + "epoch": 0.68176, + "grad_norm": 0.09146368503570557, + "learning_rate": 5.561153664878603e-06, + "loss": 0.0048, + "step": 34088 + }, + { + "epoch": 0.6818, + "grad_norm": 0.1044728085398674, + "learning_rate": 5.559902538937694e-06, + "loss": 0.0014, + "step": 34090 + }, + { + "epoch": 0.68184, + "grad_norm": 0.2880372107028961, + "learning_rate": 5.5586514995587785e-06, + "loss": 0.0056, + "step": 34092 + }, + { + "epoch": 0.68188, + "grad_norm": 0.5988687872886658, + "learning_rate": 5.557400546766233e-06, + "loss": 0.0095, + "step": 34094 + }, + { + "epoch": 0.68192, + "grad_norm": 0.010754359886050224, + "learning_rate": 5.556149680584453e-06, + "loss": 0.0036, + "step": 34096 + }, + { + "epoch": 0.68196, + "grad_norm": 4.895345687866211, + "learning_rate": 5.554898901037822e-06, + "loss": 0.0779, + "step": 34098 + }, + { + "epoch": 0.682, + "grad_norm": 2.254683256149292, + "learning_rate": 5.553648208150728e-06, + "loss": 0.0327, + "step": 34100 + }, + { + "epoch": 0.68204, + "grad_norm": 0.44873878359794617, + "learning_rate": 5.552397601947549e-06, + "loss": 0.0148, + "step": 34102 + }, + { + "epoch": 0.68208, + "grad_norm": 0.06684210896492004, + "learning_rate": 5.551147082452668e-06, + "loss": 0.0197, + "step": 34104 + }, + { + "epoch": 0.68212, + "grad_norm": 0.11648407578468323, + "learning_rate": 5.549896649690465e-06, + "loss": 0.0021, + "step": 34106 + }, + { + "epoch": 0.68216, + "grad_norm": 0.1275390386581421, + "learning_rate": 5.548646303685319e-06, + "loss": 0.0038, + "step": 34108 + }, + { + "epoch": 0.6822, + "grad_norm": 0.13717076182365417, + "learning_rate": 5.5473960444616085e-06, + "loss": 0.0044, + "step": 34110 + }, + { + "epoch": 0.68224, + "grad_norm": 0.019674621522426605, + "learning_rate": 5.5461458720436956e-06, + "loss": 0.0016, + "step": 34112 + }, + { + "epoch": 0.68228, + "grad_norm": 0.4533429741859436, + "learning_rate": 5.54489578645597e-06, + "loss": 0.0036, + "step": 34114 + }, + { + "epoch": 0.68232, + "grad_norm": 0.022974224761128426, + "learning_rate": 5.543645787722791e-06, + "loss": 0.012, + "step": 34116 + }, + { + "epoch": 0.68236, + "grad_norm": 0.41291409730911255, + "learning_rate": 5.5423958758685315e-06, + "loss": 0.0053, + "step": 34118 + }, + { + "epoch": 0.6824, + "grad_norm": 0.04736128821969032, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.0023, + "step": 34120 + }, + { + "epoch": 0.68244, + "grad_norm": 0.10667528957128525, + "learning_rate": 5.539896312894242e-06, + "loss": 0.0015, + "step": 34122 + }, + { + "epoch": 0.68248, + "grad_norm": 0.5565482378005981, + "learning_rate": 5.538646661822944e-06, + "loss": 0.0059, + "step": 34124 + }, + { + "epoch": 0.68252, + "grad_norm": 0.09072499722242355, + "learning_rate": 5.537397097728024e-06, + "loss": 0.0013, + "step": 34126 + }, + { + "epoch": 0.68256, + "grad_norm": 1.3752261400222778, + "learning_rate": 5.536147620633845e-06, + "loss": 0.0132, + "step": 34128 + }, + { + "epoch": 0.6826, + "grad_norm": 0.004987790249288082, + "learning_rate": 5.534898230564765e-06, + "loss": 0.0003, + "step": 34130 + }, + { + "epoch": 0.68264, + "grad_norm": 0.01779782772064209, + "learning_rate": 5.533648927545144e-06, + "loss": 0.0013, + "step": 34132 + }, + { + "epoch": 0.68268, + "grad_norm": 0.013986971229314804, + "learning_rate": 5.532399711599334e-06, + "loss": 0.0066, + "step": 34134 + }, + { + "epoch": 0.68272, + "grad_norm": 1.6023242473602295, + "learning_rate": 5.531150582751699e-06, + "loss": 0.0247, + "step": 34136 + }, + { + "epoch": 0.68276, + "grad_norm": 0.00513812992721796, + "learning_rate": 5.529901541026579e-06, + "loss": 0.0011, + "step": 34138 + }, + { + "epoch": 0.6828, + "grad_norm": 1.7668133974075317, + "learning_rate": 5.5286525864483285e-06, + "loss": 0.0361, + "step": 34140 + }, + { + "epoch": 0.68284, + "grad_norm": 0.011380034498870373, + "learning_rate": 5.527403719041304e-06, + "loss": 0.0007, + "step": 34142 + }, + { + "epoch": 0.68288, + "grad_norm": 0.20751754939556122, + "learning_rate": 5.526154938829838e-06, + "loss": 0.0051, + "step": 34144 + }, + { + "epoch": 0.68292, + "grad_norm": 0.46685004234313965, + "learning_rate": 5.524906245838291e-06, + "loss": 0.0122, + "step": 34146 + }, + { + "epoch": 0.68296, + "grad_norm": 0.08870386332273483, + "learning_rate": 5.523657640090993e-06, + "loss": 0.0729, + "step": 34148 + }, + { + "epoch": 0.683, + "grad_norm": 0.044387176632881165, + "learning_rate": 5.522409121612304e-06, + "loss": 0.0013, + "step": 34150 + }, + { + "epoch": 0.68304, + "grad_norm": 0.11129982769489288, + "learning_rate": 5.521160690426551e-06, + "loss": 0.5693, + "step": 34152 + }, + { + "epoch": 0.68308, + "grad_norm": 0.22688838839530945, + "learning_rate": 5.519912346558073e-06, + "loss": 0.0041, + "step": 34154 + }, + { + "epoch": 0.68312, + "grad_norm": 0.058646008372306824, + "learning_rate": 5.518664090031216e-06, + "loss": 0.0072, + "step": 34156 + }, + { + "epoch": 0.68316, + "grad_norm": 0.914740800857544, + "learning_rate": 5.517415920870306e-06, + "loss": 0.0109, + "step": 34158 + }, + { + "epoch": 0.6832, + "grad_norm": 1.795693278312683, + "learning_rate": 5.516167839099679e-06, + "loss": 0.0381, + "step": 34160 + }, + { + "epoch": 0.68324, + "grad_norm": 0.12228808552026749, + "learning_rate": 5.514919844743669e-06, + "loss": 0.0023, + "step": 34162 + }, + { + "epoch": 0.68328, + "grad_norm": 0.00027581973699852824, + "learning_rate": 5.5136719378266055e-06, + "loss": 0.1379, + "step": 34164 + }, + { + "epoch": 0.68332, + "grad_norm": 4.138396263122559, + "learning_rate": 5.5124241183728186e-06, + "loss": 0.0532, + "step": 34166 + }, + { + "epoch": 0.68336, + "grad_norm": 0.37327688932418823, + "learning_rate": 5.511176386406637e-06, + "loss": 0.0071, + "step": 34168 + }, + { + "epoch": 0.6834, + "grad_norm": 0.01993618533015251, + "learning_rate": 5.50992874195238e-06, + "loss": 0.0005, + "step": 34170 + }, + { + "epoch": 0.68344, + "grad_norm": 0.03924781084060669, + "learning_rate": 5.508681185034371e-06, + "loss": 0.0304, + "step": 34172 + }, + { + "epoch": 0.68348, + "grad_norm": 0.1819092482328415, + "learning_rate": 5.507433715676941e-06, + "loss": 0.0058, + "step": 34174 + }, + { + "epoch": 0.68352, + "grad_norm": 0.0801297202706337, + "learning_rate": 5.506186333904395e-06, + "loss": 0.0011, + "step": 34176 + }, + { + "epoch": 0.68356, + "grad_norm": 0.17311087250709534, + "learning_rate": 5.504939039741068e-06, + "loss": 0.0024, + "step": 34178 + }, + { + "epoch": 0.6836, + "grad_norm": 5.749317646026611, + "learning_rate": 5.50369183321126e-06, + "loss": 0.0988, + "step": 34180 + }, + { + "epoch": 0.68364, + "grad_norm": 0.4302535057067871, + "learning_rate": 5.502444714339303e-06, + "loss": 0.0035, + "step": 34182 + }, + { + "epoch": 0.68368, + "grad_norm": 0.8310112953186035, + "learning_rate": 5.501197683149497e-06, + "loss": 0.0073, + "step": 34184 + }, + { + "epoch": 0.68372, + "grad_norm": 0.1012859120965004, + "learning_rate": 5.49995073966616e-06, + "loss": 0.0017, + "step": 34186 + }, + { + "epoch": 0.68376, + "grad_norm": 0.4406280219554901, + "learning_rate": 5.498703883913599e-06, + "loss": 0.0089, + "step": 34188 + }, + { + "epoch": 0.6838, + "grad_norm": 0.16960148513317108, + "learning_rate": 5.497457115916127e-06, + "loss": 0.0019, + "step": 34190 + }, + { + "epoch": 0.68384, + "grad_norm": 0.9182130694389343, + "learning_rate": 5.496210435698044e-06, + "loss": 0.0134, + "step": 34192 + }, + { + "epoch": 0.68388, + "grad_norm": 0.06449150294065475, + "learning_rate": 5.494963843283656e-06, + "loss": 0.0012, + "step": 34194 + }, + { + "epoch": 0.68392, + "grad_norm": 0.12110091000795364, + "learning_rate": 5.493717338697268e-06, + "loss": 0.0023, + "step": 34196 + }, + { + "epoch": 0.68396, + "grad_norm": 10.186429023742676, + "learning_rate": 5.49247092196318e-06, + "loss": 0.1499, + "step": 34198 + }, + { + "epoch": 0.684, + "grad_norm": 0.09407088160514832, + "learning_rate": 5.491224593105695e-06, + "loss": 0.0016, + "step": 34200 + }, + { + "epoch": 0.68404, + "grad_norm": 0.02537434548139572, + "learning_rate": 5.489978352149101e-06, + "loss": 0.001, + "step": 34202 + }, + { + "epoch": 0.68408, + "grad_norm": 0.016506880521774292, + "learning_rate": 5.48873219911771e-06, + "loss": 0.0005, + "step": 34204 + }, + { + "epoch": 0.68412, + "grad_norm": 0.15622259676456451, + "learning_rate": 5.487486134035802e-06, + "loss": 0.0292, + "step": 34206 + }, + { + "epoch": 0.68416, + "grad_norm": 0.051878493279218674, + "learning_rate": 5.486240156927674e-06, + "loss": 0.0008, + "step": 34208 + }, + { + "epoch": 0.6842, + "grad_norm": 0.12523899972438812, + "learning_rate": 5.484994267817624e-06, + "loss": 0.0019, + "step": 34210 + }, + { + "epoch": 0.68424, + "grad_norm": 0.07926715165376663, + "learning_rate": 5.4837484667299255e-06, + "loss": 0.0176, + "step": 34212 + }, + { + "epoch": 0.68428, + "grad_norm": 0.4909690022468567, + "learning_rate": 5.482502753688886e-06, + "loss": 0.0048, + "step": 34214 + }, + { + "epoch": 0.68432, + "grad_norm": 0.06820381432771683, + "learning_rate": 5.481257128718775e-06, + "loss": 0.0021, + "step": 34216 + }, + { + "epoch": 0.68436, + "grad_norm": 0.07043137401342392, + "learning_rate": 5.480011591843883e-06, + "loss": 0.0158, + "step": 34218 + }, + { + "epoch": 0.6844, + "grad_norm": 0.1535535454750061, + "learning_rate": 5.478766143088492e-06, + "loss": 0.0021, + "step": 34220 + }, + { + "epoch": 0.68444, + "grad_norm": 0.017150577157735825, + "learning_rate": 5.477520782476884e-06, + "loss": 0.0017, + "step": 34222 + }, + { + "epoch": 0.68448, + "grad_norm": 0.024530556052923203, + "learning_rate": 5.47627551003334e-06, + "loss": 0.0018, + "step": 34224 + }, + { + "epoch": 0.68452, + "grad_norm": 0.04510439559817314, + "learning_rate": 5.475030325782129e-06, + "loss": 0.006, + "step": 34226 + }, + { + "epoch": 0.68456, + "grad_norm": 0.06028197333216667, + "learning_rate": 5.4737852297475325e-06, + "loss": 0.0063, + "step": 34228 + }, + { + "epoch": 0.6846, + "grad_norm": 3.1495583057403564, + "learning_rate": 5.472540221953824e-06, + "loss": 0.0335, + "step": 34230 + }, + { + "epoch": 0.68464, + "grad_norm": 0.03454675152897835, + "learning_rate": 5.471295302425277e-06, + "loss": 0.0011, + "step": 34232 + }, + { + "epoch": 0.68468, + "grad_norm": 0.5698761940002441, + "learning_rate": 5.470050471186152e-06, + "loss": 0.0052, + "step": 34234 + }, + { + "epoch": 0.68472, + "grad_norm": 0.35459327697753906, + "learning_rate": 5.468805728260733e-06, + "loss": 0.0121, + "step": 34236 + }, + { + "epoch": 0.68476, + "grad_norm": 0.036007657647132874, + "learning_rate": 5.467561073673276e-06, + "loss": 0.0312, + "step": 34238 + }, + { + "epoch": 0.6848, + "grad_norm": 2.2337586879730225, + "learning_rate": 5.466316507448049e-06, + "loss": 0.0287, + "step": 34240 + }, + { + "epoch": 0.68484, + "grad_norm": 0.012946453876793385, + "learning_rate": 5.4650720296093196e-06, + "loss": 0.0006, + "step": 34242 + }, + { + "epoch": 0.68488, + "grad_norm": 0.17345821857452393, + "learning_rate": 5.463827640181338e-06, + "loss": 0.0068, + "step": 34244 + }, + { + "epoch": 0.68492, + "grad_norm": 0.06740807741880417, + "learning_rate": 5.46258333918838e-06, + "loss": 0.0008, + "step": 34246 + }, + { + "epoch": 0.68496, + "grad_norm": 0.16850782930850983, + "learning_rate": 5.461339126654691e-06, + "loss": 0.0037, + "step": 34248 + }, + { + "epoch": 0.685, + "grad_norm": 0.03553618863224983, + "learning_rate": 5.460095002604533e-06, + "loss": 0.0136, + "step": 34250 + }, + { + "epoch": 0.68504, + "grad_norm": 0.003464704379439354, + "learning_rate": 5.45885096706216e-06, + "loss": 0.0001, + "step": 34252 + }, + { + "epoch": 0.68508, + "grad_norm": 0.007819012738764286, + "learning_rate": 5.457607020051826e-06, + "loss": 0.0003, + "step": 34254 + }, + { + "epoch": 0.68512, + "grad_norm": 0.06720442324876785, + "learning_rate": 5.456363161597782e-06, + "loss": 0.001, + "step": 34256 + }, + { + "epoch": 0.68516, + "grad_norm": 0.6171526312828064, + "learning_rate": 5.4551193917242795e-06, + "loss": 0.0093, + "step": 34258 + }, + { + "epoch": 0.6852, + "grad_norm": 0.3219982385635376, + "learning_rate": 5.453875710455562e-06, + "loss": 0.0046, + "step": 34260 + }, + { + "epoch": 0.68524, + "grad_norm": 0.2500327229499817, + "learning_rate": 5.452632117815877e-06, + "loss": 0.0049, + "step": 34262 + }, + { + "epoch": 0.68528, + "grad_norm": 0.01917940378189087, + "learning_rate": 5.451388613829475e-06, + "loss": 0.0066, + "step": 34264 + }, + { + "epoch": 0.68532, + "grad_norm": 0.0018621481722220778, + "learning_rate": 5.450145198520585e-06, + "loss": 0.0005, + "step": 34266 + }, + { + "epoch": 0.68536, + "grad_norm": 0.12237013876438141, + "learning_rate": 5.448901871913466e-06, + "loss": 0.0016, + "step": 34268 + }, + { + "epoch": 0.6854, + "grad_norm": 0.3378949761390686, + "learning_rate": 5.447658634032338e-06, + "loss": 0.0049, + "step": 34270 + }, + { + "epoch": 0.68544, + "grad_norm": 5.989847660064697, + "learning_rate": 5.446415484901458e-06, + "loss": 0.0481, + "step": 34272 + }, + { + "epoch": 0.68548, + "grad_norm": 0.25727155804634094, + "learning_rate": 5.445172424545049e-06, + "loss": 0.0069, + "step": 34274 + }, + { + "epoch": 0.68552, + "grad_norm": 0.015954582020640373, + "learning_rate": 5.443929452987347e-06, + "loss": 0.0134, + "step": 34276 + }, + { + "epoch": 0.68556, + "grad_norm": 0.24256551265716553, + "learning_rate": 5.442686570252591e-06, + "loss": 0.0069, + "step": 34278 + }, + { + "epoch": 0.6856, + "grad_norm": 9.255775451660156, + "learning_rate": 5.441443776365003e-06, + "loss": 0.1099, + "step": 34280 + }, + { + "epoch": 0.68564, + "grad_norm": 0.11563736200332642, + "learning_rate": 5.440201071348814e-06, + "loss": 0.0052, + "step": 34282 + }, + { + "epoch": 0.68568, + "grad_norm": 0.05890674889087677, + "learning_rate": 5.438958455228254e-06, + "loss": 0.0021, + "step": 34284 + }, + { + "epoch": 0.68572, + "grad_norm": 0.0023066007997840643, + "learning_rate": 5.437715928027546e-06, + "loss": 0.0021, + "step": 34286 + }, + { + "epoch": 0.68576, + "grad_norm": 0.7328288555145264, + "learning_rate": 5.4364734897709146e-06, + "loss": 0.0864, + "step": 34288 + }, + { + "epoch": 0.6858, + "grad_norm": 0.1167401447892189, + "learning_rate": 5.435231140482588e-06, + "loss": 0.0011, + "step": 34290 + }, + { + "epoch": 0.68584, + "grad_norm": 0.06749004870653152, + "learning_rate": 5.4339888801867745e-06, + "loss": 0.0041, + "step": 34292 + }, + { + "epoch": 0.68588, + "grad_norm": 0.028848325833678246, + "learning_rate": 5.432746708907699e-06, + "loss": 0.0024, + "step": 34294 + }, + { + "epoch": 0.68592, + "grad_norm": 0.17130471765995026, + "learning_rate": 5.431504626669581e-06, + "loss": 0.0035, + "step": 34296 + }, + { + "epoch": 0.68596, + "grad_norm": 3.415788173675537, + "learning_rate": 5.430262633496625e-06, + "loss": 0.0283, + "step": 34298 + }, + { + "epoch": 0.686, + "grad_norm": 0.049865078181028366, + "learning_rate": 5.429020729413062e-06, + "loss": 0.0004, + "step": 34300 + }, + { + "epoch": 0.68604, + "grad_norm": 0.33718615770339966, + "learning_rate": 5.427778914443082e-06, + "loss": 0.0028, + "step": 34302 + }, + { + "epoch": 0.68608, + "grad_norm": 0.19242580235004425, + "learning_rate": 5.426537188610916e-06, + "loss": 0.0019, + "step": 34304 + }, + { + "epoch": 0.68612, + "grad_norm": 0.2126673012971878, + "learning_rate": 5.425295551940757e-06, + "loss": 0.0046, + "step": 34306 + }, + { + "epoch": 0.68616, + "grad_norm": 0.21154406666755676, + "learning_rate": 5.424054004456816e-06, + "loss": 0.0029, + "step": 34308 + }, + { + "epoch": 0.6862, + "grad_norm": 0.2749503254890442, + "learning_rate": 5.4228125461833026e-06, + "loss": 0.0038, + "step": 34310 + }, + { + "epoch": 0.68624, + "grad_norm": 1.9613330364227295, + "learning_rate": 5.421571177144407e-06, + "loss": 0.0204, + "step": 34312 + }, + { + "epoch": 0.68628, + "grad_norm": 0.22654061019420624, + "learning_rate": 5.420329897364347e-06, + "loss": 0.0231, + "step": 34314 + }, + { + "epoch": 0.68632, + "grad_norm": 0.000718388007953763, + "learning_rate": 5.419088706867309e-06, + "loss": 0.0267, + "step": 34316 + }, + { + "epoch": 0.68636, + "grad_norm": 11.129434585571289, + "learning_rate": 5.417847605677496e-06, + "loss": 0.138, + "step": 34318 + }, + { + "epoch": 0.6864, + "grad_norm": 1.2853810787200928, + "learning_rate": 5.416606593819102e-06, + "loss": 0.0152, + "step": 34320 + }, + { + "epoch": 0.68644, + "grad_norm": 0.3444299101829529, + "learning_rate": 5.415365671316326e-06, + "loss": 0.0031, + "step": 34322 + }, + { + "epoch": 0.68648, + "grad_norm": 0.007521247025579214, + "learning_rate": 5.414124838193349e-06, + "loss": 0.0377, + "step": 34324 + }, + { + "epoch": 0.68652, + "grad_norm": 6.068883419036865, + "learning_rate": 5.412884094474375e-06, + "loss": 0.089, + "step": 34326 + }, + { + "epoch": 0.68656, + "grad_norm": 0.33722400665283203, + "learning_rate": 5.411643440183587e-06, + "loss": 0.007, + "step": 34328 + }, + { + "epoch": 0.6866, + "grad_norm": 0.0252050943672657, + "learning_rate": 5.41040287534517e-06, + "loss": 0.0006, + "step": 34330 + }, + { + "epoch": 0.68664, + "grad_norm": 0.06878980994224548, + "learning_rate": 5.409162399983317e-06, + "loss": 0.0006, + "step": 34332 + }, + { + "epoch": 0.68668, + "grad_norm": 0.010296154767274857, + "learning_rate": 5.407922014122198e-06, + "loss": 0.0012, + "step": 34334 + }, + { + "epoch": 0.68672, + "grad_norm": 0.018848933279514313, + "learning_rate": 5.406681717786011e-06, + "loss": 0.0007, + "step": 34336 + }, + { + "epoch": 0.68676, + "grad_norm": 1.2720900774002075, + "learning_rate": 5.405441510998925e-06, + "loss": 0.28, + "step": 34338 + }, + { + "epoch": 0.6868, + "grad_norm": 0.05109095945954323, + "learning_rate": 5.404201393785123e-06, + "loss": 0.0028, + "step": 34340 + }, + { + "epoch": 0.68684, + "grad_norm": 3.724384307861328, + "learning_rate": 5.402961366168779e-06, + "loss": 0.0717, + "step": 34342 + }, + { + "epoch": 0.68688, + "grad_norm": 0.20486332476139069, + "learning_rate": 5.401721428174072e-06, + "loss": 0.5711, + "step": 34344 + }, + { + "epoch": 0.68692, + "grad_norm": 0.060385145246982574, + "learning_rate": 5.4004815798251765e-06, + "loss": 0.003, + "step": 34346 + }, + { + "epoch": 0.68696, + "grad_norm": 0.09213009476661682, + "learning_rate": 5.3992418211462545e-06, + "loss": 0.0052, + "step": 34348 + }, + { + "epoch": 0.687, + "grad_norm": 0.4149014949798584, + "learning_rate": 5.398002152161484e-06, + "loss": 0.0065, + "step": 34350 + }, + { + "epoch": 0.68704, + "grad_norm": 0.021320892497897148, + "learning_rate": 5.396762572895032e-06, + "loss": 0.0014, + "step": 34352 + }, + { + "epoch": 0.68708, + "grad_norm": 8.608973503112793, + "learning_rate": 5.395523083371065e-06, + "loss": 0.1511, + "step": 34354 + }, + { + "epoch": 0.68712, + "grad_norm": 0.02585335075855255, + "learning_rate": 5.394283683613739e-06, + "loss": 0.0014, + "step": 34356 + }, + { + "epoch": 0.68716, + "grad_norm": 0.00593285309150815, + "learning_rate": 5.393044373647231e-06, + "loss": 0.0045, + "step": 34358 + }, + { + "epoch": 0.6872, + "grad_norm": 0.0033711495343595743, + "learning_rate": 5.391805153495693e-06, + "loss": 0.0009, + "step": 34360 + }, + { + "epoch": 0.68724, + "grad_norm": 0.43543747067451477, + "learning_rate": 5.390566023183283e-06, + "loss": 0.0053, + "step": 34362 + }, + { + "epoch": 0.68728, + "grad_norm": 0.07927624136209488, + "learning_rate": 5.389326982734168e-06, + "loss": 0.009, + "step": 34364 + }, + { + "epoch": 0.68732, + "grad_norm": 0.01700541563332081, + "learning_rate": 5.388088032172488e-06, + "loss": 0.0029, + "step": 34366 + }, + { + "epoch": 0.68736, + "grad_norm": 0.016167107969522476, + "learning_rate": 5.386849171522415e-06, + "loss": 0.0215, + "step": 34368 + }, + { + "epoch": 0.6874, + "grad_norm": 0.25255775451660156, + "learning_rate": 5.385610400808088e-06, + "loss": 0.0448, + "step": 34370 + }, + { + "epoch": 0.68744, + "grad_norm": 0.22120831906795502, + "learning_rate": 5.384371720053661e-06, + "loss": 0.0044, + "step": 34372 + }, + { + "epoch": 0.68748, + "grad_norm": 0.7636732459068298, + "learning_rate": 5.383133129283285e-06, + "loss": 0.0112, + "step": 34374 + }, + { + "epoch": 0.68752, + "grad_norm": 0.05387191101908684, + "learning_rate": 5.3818946285211045e-06, + "loss": 1.1464, + "step": 34376 + }, + { + "epoch": 0.68756, + "grad_norm": 0.08519614487886429, + "learning_rate": 5.38065621779127e-06, + "loss": 0.0024, + "step": 34378 + }, + { + "epoch": 0.6876, + "grad_norm": 3.1842997074127197, + "learning_rate": 5.379417897117917e-06, + "loss": 0.0337, + "step": 34380 + }, + { + "epoch": 0.68764, + "grad_norm": 0.1340363621711731, + "learning_rate": 5.37817966652519e-06, + "loss": 0.0105, + "step": 34382 + }, + { + "epoch": 0.68768, + "grad_norm": 1.394234538078308, + "learning_rate": 5.3769415260372315e-06, + "loss": 0.0153, + "step": 34384 + }, + { + "epoch": 0.68772, + "grad_norm": 17.30963706970215, + "learning_rate": 5.375703475678181e-06, + "loss": 0.2509, + "step": 34386 + }, + { + "epoch": 0.68776, + "grad_norm": 0.11635629832744598, + "learning_rate": 5.3744655154721644e-06, + "loss": 0.0019, + "step": 34388 + }, + { + "epoch": 0.6878, + "grad_norm": 0.06850595772266388, + "learning_rate": 5.373227645443332e-06, + "loss": 0.0022, + "step": 34390 + }, + { + "epoch": 0.68784, + "grad_norm": 0.12444016337394714, + "learning_rate": 5.371989865615801e-06, + "loss": 0.9396, + "step": 34392 + }, + { + "epoch": 0.68788, + "grad_norm": 0.12830768525600433, + "learning_rate": 5.370752176013717e-06, + "loss": 0.1774, + "step": 34394 + }, + { + "epoch": 0.68792, + "grad_norm": 0.20950061082839966, + "learning_rate": 5.3695145766611985e-06, + "loss": 0.0042, + "step": 34396 + }, + { + "epoch": 0.68796, + "grad_norm": 0.0784657746553421, + "learning_rate": 5.368277067582379e-06, + "loss": 0.001, + "step": 34398 + }, + { + "epoch": 0.688, + "grad_norm": 0.019133931025862694, + "learning_rate": 5.367039648801386e-06, + "loss": 0.0015, + "step": 34400 + }, + { + "epoch": 0.68804, + "grad_norm": 0.021801285445690155, + "learning_rate": 5.365802320342336e-06, + "loss": 0.0066, + "step": 34402 + }, + { + "epoch": 0.68808, + "grad_norm": 1.1766773462295532, + "learning_rate": 5.364565082229356e-06, + "loss": 0.0165, + "step": 34404 + }, + { + "epoch": 0.68812, + "grad_norm": 0.15862146019935608, + "learning_rate": 5.363327934486565e-06, + "loss": 0.004, + "step": 34406 + }, + { + "epoch": 0.68816, + "grad_norm": 0.016283472999930382, + "learning_rate": 5.362090877138084e-06, + "loss": 0.024, + "step": 34408 + }, + { + "epoch": 0.6882, + "grad_norm": 0.11319513618946075, + "learning_rate": 5.360853910208028e-06, + "loss": 0.0593, + "step": 34410 + }, + { + "epoch": 0.68824, + "grad_norm": 1.7027075290679932, + "learning_rate": 5.359617033720519e-06, + "loss": 0.0229, + "step": 34412 + }, + { + "epoch": 0.68828, + "grad_norm": 0.3652607500553131, + "learning_rate": 5.3583802476996605e-06, + "loss": 0.0051, + "step": 34414 + }, + { + "epoch": 0.68832, + "grad_norm": 0.07576179504394531, + "learning_rate": 5.357143552169568e-06, + "loss": 0.1509, + "step": 34416 + }, + { + "epoch": 0.68836, + "grad_norm": 0.06624989211559296, + "learning_rate": 5.355906947154354e-06, + "loss": 0.0025, + "step": 34418 + }, + { + "epoch": 0.6884, + "grad_norm": 0.04466761648654938, + "learning_rate": 5.354670432678124e-06, + "loss": 0.0006, + "step": 34420 + }, + { + "epoch": 0.68844, + "grad_norm": 1.4282164573669434, + "learning_rate": 5.3534340087649885e-06, + "loss": 0.0216, + "step": 34422 + }, + { + "epoch": 0.68848, + "grad_norm": 1.3152709007263184, + "learning_rate": 5.352197675439041e-06, + "loss": 0.0259, + "step": 34424 + }, + { + "epoch": 0.68852, + "grad_norm": 0.0038489876314997673, + "learning_rate": 5.350961432724403e-06, + "loss": 0.0008, + "step": 34426 + }, + { + "epoch": 0.68856, + "grad_norm": 0.04039355367422104, + "learning_rate": 5.34972528064516e-06, + "loss": 0.0016, + "step": 34428 + }, + { + "epoch": 0.6886, + "grad_norm": 0.026203405112028122, + "learning_rate": 5.348489219225417e-06, + "loss": 0.0009, + "step": 34430 + }, + { + "epoch": 0.68864, + "grad_norm": 0.04150604456663132, + "learning_rate": 5.3472532484892745e-06, + "loss": 0.0013, + "step": 34432 + }, + { + "epoch": 0.68868, + "grad_norm": 6.2908854484558105, + "learning_rate": 5.346017368460819e-06, + "loss": 0.1291, + "step": 34434 + }, + { + "epoch": 0.68872, + "grad_norm": 0.7027806639671326, + "learning_rate": 5.344781579164158e-06, + "loss": 0.0084, + "step": 34436 + }, + { + "epoch": 0.68876, + "grad_norm": 0.16293281316757202, + "learning_rate": 5.343545880623373e-06, + "loss": 0.0026, + "step": 34438 + }, + { + "epoch": 0.6888, + "grad_norm": 0.009602675214409828, + "learning_rate": 5.342310272862558e-06, + "loss": 0.0301, + "step": 34440 + }, + { + "epoch": 0.68884, + "grad_norm": 0.02650151588022709, + "learning_rate": 5.341074755905802e-06, + "loss": 0.0031, + "step": 34442 + }, + { + "epoch": 0.68888, + "grad_norm": 2.0451929569244385, + "learning_rate": 5.339839329777197e-06, + "loss": 0.0215, + "step": 34444 + }, + { + "epoch": 0.68892, + "grad_norm": 0.055877428501844406, + "learning_rate": 5.338603994500818e-06, + "loss": 0.0023, + "step": 34446 + }, + { + "epoch": 0.68896, + "grad_norm": 0.10318000614643097, + "learning_rate": 5.337368750100756e-06, + "loss": 0.0023, + "step": 34448 + }, + { + "epoch": 0.689, + "grad_norm": 0.21265357732772827, + "learning_rate": 5.336133596601089e-06, + "loss": 0.1778, + "step": 34450 + }, + { + "epoch": 0.68904, + "grad_norm": 0.44791021943092346, + "learning_rate": 5.334898534025898e-06, + "loss": 0.0059, + "step": 34452 + }, + { + "epoch": 0.68908, + "grad_norm": 0.015640482306480408, + "learning_rate": 5.3336635623992685e-06, + "loss": 0.0003, + "step": 34454 + }, + { + "epoch": 0.68912, + "grad_norm": 0.04302642494440079, + "learning_rate": 5.332428681745261e-06, + "loss": 0.0005, + "step": 34456 + }, + { + "epoch": 0.68916, + "grad_norm": 0.012326248921453953, + "learning_rate": 5.331193892087967e-06, + "loss": 0.0009, + "step": 34458 + }, + { + "epoch": 0.6892, + "grad_norm": 0.1648857146501541, + "learning_rate": 5.3299591934514485e-06, + "loss": 0.0023, + "step": 34460 + }, + { + "epoch": 0.68924, + "grad_norm": 0.08085698634386063, + "learning_rate": 5.328724585859779e-06, + "loss": 0.0016, + "step": 34462 + }, + { + "epoch": 0.68928, + "grad_norm": 14.04450798034668, + "learning_rate": 5.327490069337029e-06, + "loss": 0.1512, + "step": 34464 + }, + { + "epoch": 0.68932, + "grad_norm": 0.09286914765834808, + "learning_rate": 5.326255643907266e-06, + "loss": 0.0023, + "step": 34466 + }, + { + "epoch": 0.68936, + "grad_norm": 0.011881734244525433, + "learning_rate": 5.325021309594558e-06, + "loss": 0.0003, + "step": 34468 + }, + { + "epoch": 0.6894, + "grad_norm": 3.0575287342071533, + "learning_rate": 5.323787066422964e-06, + "loss": 0.0382, + "step": 34470 + }, + { + "epoch": 0.68944, + "grad_norm": 0.3633778393268585, + "learning_rate": 5.322552914416548e-06, + "loss": 0.1792, + "step": 34472 + }, + { + "epoch": 0.68948, + "grad_norm": 0.02970445714890957, + "learning_rate": 5.3213188535993705e-06, + "loss": 0.0013, + "step": 34474 + }, + { + "epoch": 0.68952, + "grad_norm": 0.4974798262119293, + "learning_rate": 5.320084883995496e-06, + "loss": 0.0051, + "step": 34476 + }, + { + "epoch": 0.68956, + "grad_norm": 0.1732017546892166, + "learning_rate": 5.318851005628968e-06, + "loss": 0.3029, + "step": 34478 + }, + { + "epoch": 0.6896, + "grad_norm": 0.05369787663221359, + "learning_rate": 5.317617218523856e-06, + "loss": 0.0011, + "step": 34480 + }, + { + "epoch": 0.68964, + "grad_norm": 0.06229046359658241, + "learning_rate": 5.316383522704205e-06, + "loss": 0.0073, + "step": 34482 + }, + { + "epoch": 0.68968, + "grad_norm": 0.27181705832481384, + "learning_rate": 5.315149918194067e-06, + "loss": 0.0047, + "step": 34484 + }, + { + "epoch": 0.68972, + "grad_norm": 0.3433868885040283, + "learning_rate": 5.313916405017498e-06, + "loss": 0.0051, + "step": 34486 + }, + { + "epoch": 0.68976, + "grad_norm": 0.16613806784152985, + "learning_rate": 5.312682983198534e-06, + "loss": 0.4513, + "step": 34488 + }, + { + "epoch": 0.6898, + "grad_norm": 4.365116596221924, + "learning_rate": 5.311449652761235e-06, + "loss": 0.0615, + "step": 34490 + }, + { + "epoch": 0.68984, + "grad_norm": 0.44808781147003174, + "learning_rate": 5.310216413729636e-06, + "loss": 0.0061, + "step": 34492 + }, + { + "epoch": 0.68988, + "grad_norm": 0.01568368822336197, + "learning_rate": 5.30898326612778e-06, + "loss": 0.0042, + "step": 34494 + }, + { + "epoch": 0.68992, + "grad_norm": 0.20193378627300262, + "learning_rate": 5.307750209979714e-06, + "loss": 0.0739, + "step": 34496 + }, + { + "epoch": 0.68996, + "grad_norm": 0.0011151389917358756, + "learning_rate": 5.306517245309471e-06, + "loss": 0.0024, + "step": 34498 + }, + { + "epoch": 0.69, + "grad_norm": 0.16601775586605072, + "learning_rate": 5.305284372141095e-06, + "loss": 0.0136, + "step": 34500 + }, + { + "epoch": 0.69004, + "grad_norm": 0.043439991772174835, + "learning_rate": 5.304051590498613e-06, + "loss": 0.0011, + "step": 34502 + }, + { + "epoch": 0.69008, + "grad_norm": 0.17004930973052979, + "learning_rate": 5.302818900406064e-06, + "loss": 0.0093, + "step": 34504 + }, + { + "epoch": 0.69012, + "grad_norm": 0.06004815176129341, + "learning_rate": 5.301586301887478e-06, + "loss": 0.0071, + "step": 34506 + }, + { + "epoch": 0.69016, + "grad_norm": 0.22615765035152435, + "learning_rate": 5.300353794966891e-06, + "loss": 0.0087, + "step": 34508 + }, + { + "epoch": 0.6902, + "grad_norm": 0.046477243304252625, + "learning_rate": 5.299121379668316e-06, + "loss": 0.0086, + "step": 34510 + }, + { + "epoch": 0.69024, + "grad_norm": 0.0806552842259407, + "learning_rate": 5.2978890560157995e-06, + "loss": 0.0009, + "step": 34512 + }, + { + "epoch": 0.69028, + "grad_norm": 0.1709822714328766, + "learning_rate": 5.2966568240333525e-06, + "loss": 0.0076, + "step": 34514 + }, + { + "epoch": 0.69032, + "grad_norm": 0.09115654975175858, + "learning_rate": 5.295424683745002e-06, + "loss": 0.0018, + "step": 34516 + }, + { + "epoch": 0.69036, + "grad_norm": 0.457251638174057, + "learning_rate": 5.29419263517477e-06, + "loss": 0.0128, + "step": 34518 + }, + { + "epoch": 0.6904, + "grad_norm": 0.036721162497997284, + "learning_rate": 5.292960678346674e-06, + "loss": 0.0077, + "step": 34520 + }, + { + "epoch": 0.69044, + "grad_norm": 0.4533739686012268, + "learning_rate": 5.291728813284738e-06, + "loss": 0.0045, + "step": 34522 + }, + { + "epoch": 0.69048, + "grad_norm": 0.07468832284212112, + "learning_rate": 5.290497040012964e-06, + "loss": 0.001, + "step": 34524 + }, + { + "epoch": 0.69052, + "grad_norm": 0.3041543662548065, + "learning_rate": 5.289265358555384e-06, + "loss": 0.0046, + "step": 34526 + }, + { + "epoch": 0.69056, + "grad_norm": 0.5083479285240173, + "learning_rate": 5.2880337689359965e-06, + "loss": 0.0136, + "step": 34528 + }, + { + "epoch": 0.6906, + "grad_norm": 0.29100897908210754, + "learning_rate": 5.286802271178815e-06, + "loss": 0.0103, + "step": 34530 + }, + { + "epoch": 0.69064, + "grad_norm": 0.21994437277317047, + "learning_rate": 5.285570865307852e-06, + "loss": 0.0065, + "step": 34532 + }, + { + "epoch": 0.69068, + "grad_norm": 0.1637793332338333, + "learning_rate": 5.284339551347115e-06, + "loss": 0.0022, + "step": 34534 + }, + { + "epoch": 0.69072, + "grad_norm": 0.18804965913295746, + "learning_rate": 5.283108329320602e-06, + "loss": 0.0026, + "step": 34536 + }, + { + "epoch": 0.69076, + "grad_norm": 0.16930505633354187, + "learning_rate": 5.281877199252321e-06, + "loss": 0.0029, + "step": 34538 + }, + { + "epoch": 0.6908, + "grad_norm": 0.42801979184150696, + "learning_rate": 5.280646161166274e-06, + "loss": 0.0065, + "step": 34540 + }, + { + "epoch": 0.69084, + "grad_norm": 0.039299555122852325, + "learning_rate": 5.279415215086459e-06, + "loss": 0.8143, + "step": 34542 + }, + { + "epoch": 0.69088, + "grad_norm": 0.009038642048835754, + "learning_rate": 5.278184361036877e-06, + "loss": 0.0003, + "step": 34544 + }, + { + "epoch": 0.69092, + "grad_norm": 0.09989436715841293, + "learning_rate": 5.276953599041517e-06, + "loss": 0.0435, + "step": 34546 + }, + { + "epoch": 0.69096, + "grad_norm": 0.047764603048563004, + "learning_rate": 5.275722929124383e-06, + "loss": 0.0012, + "step": 34548 + }, + { + "epoch": 0.691, + "grad_norm": 0.2508624196052551, + "learning_rate": 5.274492351309462e-06, + "loss": 0.0298, + "step": 34550 + }, + { + "epoch": 0.69104, + "grad_norm": 1.0161370038986206, + "learning_rate": 5.273261865620742e-06, + "loss": 0.0171, + "step": 34552 + }, + { + "epoch": 0.69108, + "grad_norm": 0.5158554911613464, + "learning_rate": 5.272031472082222e-06, + "loss": 0.0074, + "step": 34554 + }, + { + "epoch": 0.69112, + "grad_norm": 1.7818008661270142, + "learning_rate": 5.270801170717874e-06, + "loss": 0.0304, + "step": 34556 + }, + { + "epoch": 0.69116, + "grad_norm": 0.44713878631591797, + "learning_rate": 5.2695709615517e-06, + "loss": 0.0065, + "step": 34558 + }, + { + "epoch": 0.6912, + "grad_norm": 3.9585323333740234, + "learning_rate": 5.26834084460767e-06, + "loss": 0.0855, + "step": 34560 + }, + { + "epoch": 0.69124, + "grad_norm": 0.11373496800661087, + "learning_rate": 5.267110819909773e-06, + "loss": 0.0538, + "step": 34562 + }, + { + "epoch": 0.69128, + "grad_norm": 8.428182601928711, + "learning_rate": 5.265880887481987e-06, + "loss": 0.3382, + "step": 34564 + }, + { + "epoch": 0.69132, + "grad_norm": 0.3061128258705139, + "learning_rate": 5.264651047348293e-06, + "loss": 0.0096, + "step": 34566 + }, + { + "epoch": 0.69136, + "grad_norm": 0.22435897588729858, + "learning_rate": 5.263421299532663e-06, + "loss": 0.0052, + "step": 34568 + }, + { + "epoch": 0.6914, + "grad_norm": 0.0034971474669873714, + "learning_rate": 5.262191644059071e-06, + "loss": 0.0003, + "step": 34570 + }, + { + "epoch": 0.69144, + "grad_norm": 12.583556175231934, + "learning_rate": 5.260962080951493e-06, + "loss": 0.3609, + "step": 34572 + }, + { + "epoch": 0.69148, + "grad_norm": 2.1118485927581787, + "learning_rate": 5.2597326102339e-06, + "loss": 0.0284, + "step": 34574 + }, + { + "epoch": 0.69152, + "grad_norm": 0.25067242980003357, + "learning_rate": 5.258503231930263e-06, + "loss": 0.0061, + "step": 34576 + }, + { + "epoch": 0.69156, + "grad_norm": 0.1306159943342209, + "learning_rate": 5.257273946064539e-06, + "loss": 0.008, + "step": 34578 + }, + { + "epoch": 0.6916, + "grad_norm": 1.0859222412109375, + "learning_rate": 5.256044752660709e-06, + "loss": 0.016, + "step": 34580 + }, + { + "epoch": 0.69164, + "grad_norm": 1.8013739585876465, + "learning_rate": 5.254815651742725e-06, + "loss": 0.0302, + "step": 34582 + }, + { + "epoch": 0.69168, + "grad_norm": 0.1610076129436493, + "learning_rate": 5.253586643334554e-06, + "loss": 0.0024, + "step": 34584 + }, + { + "epoch": 0.69172, + "grad_norm": 1.1207622289657593, + "learning_rate": 5.252357727460154e-06, + "loss": 0.017, + "step": 34586 + }, + { + "epoch": 0.69176, + "grad_norm": 0.012464928440749645, + "learning_rate": 5.251128904143483e-06, + "loss": 0.004, + "step": 34588 + }, + { + "epoch": 0.6918, + "grad_norm": 1.0221163034439087, + "learning_rate": 5.2499001734085045e-06, + "loss": 0.0342, + "step": 34590 + }, + { + "epoch": 0.69184, + "grad_norm": 0.8821408152580261, + "learning_rate": 5.248671535279164e-06, + "loss": 0.0586, + "step": 34592 + }, + { + "epoch": 0.69188, + "grad_norm": 0.023338276892900467, + "learning_rate": 5.247442989779417e-06, + "loss": 0.0054, + "step": 34594 + }, + { + "epoch": 0.69192, + "grad_norm": 0.15915335714817047, + "learning_rate": 5.246214536933216e-06, + "loss": 0.0103, + "step": 34596 + }, + { + "epoch": 0.69196, + "grad_norm": 0.04657658934593201, + "learning_rate": 5.244986176764514e-06, + "loss": 0.0009, + "step": 34598 + }, + { + "epoch": 0.692, + "grad_norm": 0.23813889920711517, + "learning_rate": 5.243757909297247e-06, + "loss": 0.0558, + "step": 34600 + }, + { + "epoch": 0.69204, + "grad_norm": 0.1923103928565979, + "learning_rate": 5.242529734555375e-06, + "loss": 0.0026, + "step": 34602 + }, + { + "epoch": 0.69208, + "grad_norm": 0.11740794777870178, + "learning_rate": 5.241301652562833e-06, + "loss": 0.0024, + "step": 34604 + }, + { + "epoch": 0.69212, + "grad_norm": 0.018970176577568054, + "learning_rate": 5.240073663343563e-06, + "loss": 0.0012, + "step": 34606 + }, + { + "epoch": 0.69216, + "grad_norm": 0.5341404676437378, + "learning_rate": 5.238845766921513e-06, + "loss": 0.0065, + "step": 34608 + }, + { + "epoch": 0.6922, + "grad_norm": 0.28426483273506165, + "learning_rate": 5.237617963320608e-06, + "loss": 0.0036, + "step": 34610 + }, + { + "epoch": 0.69224, + "grad_norm": 0.0024180952459573746, + "learning_rate": 5.2363902525648e-06, + "loss": 0.0117, + "step": 34612 + }, + { + "epoch": 0.69228, + "grad_norm": 0.09691325575113297, + "learning_rate": 5.235162634678013e-06, + "loss": 0.0139, + "step": 34614 + }, + { + "epoch": 0.69232, + "grad_norm": 0.0936669185757637, + "learning_rate": 5.233935109684184e-06, + "loss": 0.0019, + "step": 34616 + }, + { + "epoch": 0.69236, + "grad_norm": 0.21987484395503998, + "learning_rate": 5.232707677607243e-06, + "loss": 0.0052, + "step": 34618 + }, + { + "epoch": 0.6924, + "grad_norm": 0.0881354883313179, + "learning_rate": 5.23148033847112e-06, + "loss": 0.0021, + "step": 34620 + }, + { + "epoch": 0.69244, + "grad_norm": 0.8111876845359802, + "learning_rate": 5.230253092299747e-06, + "loss": 0.0143, + "step": 34622 + }, + { + "epoch": 0.69248, + "grad_norm": 0.2744852602481842, + "learning_rate": 5.229025939117043e-06, + "loss": 0.0136, + "step": 34624 + }, + { + "epoch": 0.69252, + "grad_norm": 0.0026694657281041145, + "learning_rate": 5.2277988789469325e-06, + "loss": 0.0013, + "step": 34626 + }, + { + "epoch": 0.69256, + "grad_norm": 0.1503143608570099, + "learning_rate": 5.226571911813341e-06, + "loss": 0.0067, + "step": 34628 + }, + { + "epoch": 0.6926, + "grad_norm": 1.833997368812561, + "learning_rate": 5.225345037740186e-06, + "loss": 0.0589, + "step": 34630 + }, + { + "epoch": 0.69264, + "grad_norm": 0.1589391678571701, + "learning_rate": 5.224118256751389e-06, + "loss": 0.0018, + "step": 34632 + }, + { + "epoch": 0.69268, + "grad_norm": 0.33828112483024597, + "learning_rate": 5.22289156887087e-06, + "loss": 0.0074, + "step": 34634 + }, + { + "epoch": 0.69272, + "grad_norm": 0.15459562838077545, + "learning_rate": 5.221664974122535e-06, + "loss": 0.0019, + "step": 34636 + }, + { + "epoch": 0.69276, + "grad_norm": 0.004356979392468929, + "learning_rate": 5.2204384725303e-06, + "loss": 0.0005, + "step": 34638 + }, + { + "epoch": 0.6928, + "grad_norm": 10.530078887939453, + "learning_rate": 5.219212064118079e-06, + "loss": 0.2339, + "step": 34640 + }, + { + "epoch": 0.69284, + "grad_norm": 0.010165924206376076, + "learning_rate": 5.21798574890978e-06, + "loss": 0.0002, + "step": 34642 + }, + { + "epoch": 0.69288, + "grad_norm": 10.05173397064209, + "learning_rate": 5.216759526929313e-06, + "loss": 0.1259, + "step": 34644 + }, + { + "epoch": 0.69292, + "grad_norm": 0.17284154891967773, + "learning_rate": 5.215533398200576e-06, + "loss": 0.0023, + "step": 34646 + }, + { + "epoch": 0.69296, + "grad_norm": 0.3251740634441376, + "learning_rate": 5.214307362747486e-06, + "loss": 0.0042, + "step": 34648 + }, + { + "epoch": 0.693, + "grad_norm": 0.034255921840667725, + "learning_rate": 5.213081420593933e-06, + "loss": 0.0426, + "step": 34650 + }, + { + "epoch": 0.69304, + "grad_norm": 0.315082311630249, + "learning_rate": 5.211855571763822e-06, + "loss": 0.0034, + "step": 34652 + }, + { + "epoch": 0.69308, + "grad_norm": 0.0065054260194301605, + "learning_rate": 5.2106298162810535e-06, + "loss": 0.0005, + "step": 34654 + }, + { + "epoch": 0.69312, + "grad_norm": 0.8012586236000061, + "learning_rate": 5.2094041541695236e-06, + "loss": 0.007, + "step": 34656 + }, + { + "epoch": 0.69316, + "grad_norm": 0.027783511206507683, + "learning_rate": 5.2081785854531255e-06, + "loss": 0.0005, + "step": 34658 + }, + { + "epoch": 0.6932, + "grad_norm": 0.08520510047674179, + "learning_rate": 5.2069531101557505e-06, + "loss": 0.001, + "step": 34660 + }, + { + "epoch": 0.69324, + "grad_norm": 0.27166247367858887, + "learning_rate": 5.205727728301293e-06, + "loss": 0.5915, + "step": 34662 + }, + { + "epoch": 0.69328, + "grad_norm": 0.00958156120032072, + "learning_rate": 5.204502439913641e-06, + "loss": 0.0094, + "step": 34664 + }, + { + "epoch": 0.69332, + "grad_norm": 0.07082948088645935, + "learning_rate": 5.2032772450166876e-06, + "loss": 0.0033, + "step": 34666 + }, + { + "epoch": 0.69336, + "grad_norm": 1.0422077178955078, + "learning_rate": 5.202052143634306e-06, + "loss": 0.0099, + "step": 34668 + }, + { + "epoch": 0.6934, + "grad_norm": 0.008655696175992489, + "learning_rate": 5.200827135790396e-06, + "loss": 0.0002, + "step": 34670 + }, + { + "epoch": 0.69344, + "grad_norm": 0.09408349543809891, + "learning_rate": 5.199602221508827e-06, + "loss": 0.0028, + "step": 34672 + }, + { + "epoch": 0.69348, + "grad_norm": 13.437262535095215, + "learning_rate": 5.198377400813486e-06, + "loss": 0.2839, + "step": 34674 + }, + { + "epoch": 0.69352, + "grad_norm": 0.20197243988513947, + "learning_rate": 5.197152673728251e-06, + "loss": 0.0042, + "step": 34676 + }, + { + "epoch": 0.69356, + "grad_norm": 0.08757858723402023, + "learning_rate": 5.1959280402769905e-06, + "loss": 0.0023, + "step": 34678 + }, + { + "epoch": 0.6936, + "grad_norm": 0.5328137874603271, + "learning_rate": 5.194703500483593e-06, + "loss": 0.009, + "step": 34680 + }, + { + "epoch": 0.69364, + "grad_norm": 0.5837282538414001, + "learning_rate": 5.193479054371923e-06, + "loss": 0.044, + "step": 34682 + }, + { + "epoch": 0.69368, + "grad_norm": 15.568937301635742, + "learning_rate": 5.192254701965852e-06, + "loss": 0.337, + "step": 34684 + }, + { + "epoch": 0.69372, + "grad_norm": 0.0730186328291893, + "learning_rate": 5.191030443289251e-06, + "loss": 0.005, + "step": 34686 + }, + { + "epoch": 0.69376, + "grad_norm": 0.08694903552532196, + "learning_rate": 5.189806278365992e-06, + "loss": 0.0013, + "step": 34688 + }, + { + "epoch": 0.6938, + "grad_norm": 0.009752245619893074, + "learning_rate": 5.188582207219931e-06, + "loss": 0.0093, + "step": 34690 + }, + { + "epoch": 0.69384, + "grad_norm": 0.04926410689949989, + "learning_rate": 5.187358229874937e-06, + "loss": 0.0077, + "step": 34692 + }, + { + "epoch": 0.69388, + "grad_norm": 0.13818663358688354, + "learning_rate": 5.186134346354872e-06, + "loss": 0.0147, + "step": 34694 + }, + { + "epoch": 0.69392, + "grad_norm": 0.37854933738708496, + "learning_rate": 5.184910556683598e-06, + "loss": 0.0045, + "step": 34696 + }, + { + "epoch": 0.69396, + "grad_norm": 0.16154880821704865, + "learning_rate": 5.1836868608849735e-06, + "loss": 0.0038, + "step": 34698 + }, + { + "epoch": 0.694, + "grad_norm": 2.6416444778442383, + "learning_rate": 5.1824632589828465e-06, + "loss": 0.036, + "step": 34700 + }, + { + "epoch": 0.69404, + "grad_norm": 0.19426329433918, + "learning_rate": 5.181239751001086e-06, + "loss": 0.0036, + "step": 34702 + }, + { + "epoch": 0.69408, + "grad_norm": 0.04490737244486809, + "learning_rate": 5.180016336963533e-06, + "loss": 0.0036, + "step": 34704 + }, + { + "epoch": 0.69412, + "grad_norm": 0.4276507496833801, + "learning_rate": 5.178793016894044e-06, + "loss": 0.0048, + "step": 34706 + }, + { + "epoch": 0.69416, + "grad_norm": 0.05447959899902344, + "learning_rate": 5.17756979081647e-06, + "loss": 0.053, + "step": 34708 + }, + { + "epoch": 0.6942, + "grad_norm": 0.2056310921907425, + "learning_rate": 5.176346658754648e-06, + "loss": 0.49, + "step": 34710 + }, + { + "epoch": 0.69424, + "grad_norm": 0.08223637193441391, + "learning_rate": 5.175123620732441e-06, + "loss": 0.0102, + "step": 34712 + }, + { + "epoch": 0.69428, + "grad_norm": 0.011786160990595818, + "learning_rate": 5.173900676773677e-06, + "loss": 0.0004, + "step": 34714 + }, + { + "epoch": 0.69432, + "grad_norm": 0.09750737994909286, + "learning_rate": 5.172677826902205e-06, + "loss": 0.0531, + "step": 34716 + }, + { + "epoch": 0.69436, + "grad_norm": 0.20648139715194702, + "learning_rate": 5.171455071141863e-06, + "loss": 0.0054, + "step": 34718 + }, + { + "epoch": 0.6944, + "grad_norm": 0.0019581238739192486, + "learning_rate": 5.1702324095164955e-06, + "loss": 0.0004, + "step": 34720 + }, + { + "epoch": 0.69444, + "grad_norm": 6.817371368408203, + "learning_rate": 5.169009842049926e-06, + "loss": 0.106, + "step": 34722 + }, + { + "epoch": 0.69448, + "grad_norm": 0.00781947374343872, + "learning_rate": 5.167787368766003e-06, + "loss": 0.3731, + "step": 34724 + }, + { + "epoch": 0.69452, + "grad_norm": 0.06870592385530472, + "learning_rate": 5.1665649896885494e-06, + "loss": 0.0021, + "step": 34726 + }, + { + "epoch": 0.69456, + "grad_norm": 0.6944168210029602, + "learning_rate": 5.1653427048414e-06, + "loss": 0.0109, + "step": 34728 + }, + { + "epoch": 0.6946, + "grad_norm": 0.743777871131897, + "learning_rate": 5.16412051424839e-06, + "loss": 0.0144, + "step": 34730 + }, + { + "epoch": 0.69464, + "grad_norm": 14.201929092407227, + "learning_rate": 5.1628984179333285e-06, + "loss": 0.3939, + "step": 34732 + }, + { + "epoch": 0.69468, + "grad_norm": 2.5966453552246094, + "learning_rate": 5.161676415920063e-06, + "loss": 0.0401, + "step": 34734 + }, + { + "epoch": 0.69472, + "grad_norm": 0.07213573902845383, + "learning_rate": 5.160454508232398e-06, + "loss": 0.0126, + "step": 34736 + }, + { + "epoch": 0.69476, + "grad_norm": 0.1834849715232849, + "learning_rate": 5.159232694894172e-06, + "loss": 0.0066, + "step": 34738 + }, + { + "epoch": 0.6948, + "grad_norm": 0.16293106973171234, + "learning_rate": 5.158010975929193e-06, + "loss": 0.0051, + "step": 34740 + }, + { + "epoch": 0.69484, + "grad_norm": 0.06328290700912476, + "learning_rate": 5.156789351361282e-06, + "loss": 0.0046, + "step": 34742 + }, + { + "epoch": 0.69488, + "grad_norm": 0.13390015065670013, + "learning_rate": 5.1555678212142615e-06, + "loss": 0.0056, + "step": 34744 + }, + { + "epoch": 0.69492, + "grad_norm": 0.00632708054035902, + "learning_rate": 5.154346385511936e-06, + "loss": 0.0001, + "step": 34746 + }, + { + "epoch": 0.69496, + "grad_norm": 5.78420352935791, + "learning_rate": 5.153125044278122e-06, + "loss": 0.0885, + "step": 34748 + }, + { + "epoch": 0.695, + "grad_norm": 0.020176395773887634, + "learning_rate": 5.151903797536631e-06, + "loss": 0.0015, + "step": 34750 + }, + { + "epoch": 0.69504, + "grad_norm": 0.07855921983718872, + "learning_rate": 5.150682645311271e-06, + "loss": 0.0012, + "step": 34752 + }, + { + "epoch": 0.69508, + "grad_norm": 0.14001545310020447, + "learning_rate": 5.149461587625849e-06, + "loss": 0.0055, + "step": 34754 + }, + { + "epoch": 0.69512, + "grad_norm": 0.7305329442024231, + "learning_rate": 5.148240624504175e-06, + "loss": 0.0134, + "step": 34756 + }, + { + "epoch": 0.69516, + "grad_norm": 0.0702420026063919, + "learning_rate": 5.147019755970044e-06, + "loss": 0.0023, + "step": 34758 + }, + { + "epoch": 0.6952, + "grad_norm": 0.4649130702018738, + "learning_rate": 5.145798982047261e-06, + "loss": 0.0121, + "step": 34760 + }, + { + "epoch": 0.69524, + "grad_norm": 2.235635280609131, + "learning_rate": 5.144578302759631e-06, + "loss": 1.0304, + "step": 34762 + }, + { + "epoch": 0.69528, + "grad_norm": 0.28224319219589233, + "learning_rate": 5.143357718130938e-06, + "loss": 0.0028, + "step": 34764 + }, + { + "epoch": 0.69532, + "grad_norm": 0.07170959562063217, + "learning_rate": 5.142137228184994e-06, + "loss": 0.0013, + "step": 34766 + }, + { + "epoch": 0.69536, + "grad_norm": 0.11535307765007019, + "learning_rate": 5.1409168329455796e-06, + "loss": 0.0014, + "step": 34768 + }, + { + "epoch": 0.6954, + "grad_norm": 0.06655599921941757, + "learning_rate": 5.139696532436499e-06, + "loss": 0.022, + "step": 34770 + }, + { + "epoch": 0.69544, + "grad_norm": 0.09562664479017258, + "learning_rate": 5.1384763266815345e-06, + "loss": 0.0016, + "step": 34772 + }, + { + "epoch": 0.69548, + "grad_norm": 0.2682766020298004, + "learning_rate": 5.137256215704476e-06, + "loss": 0.0053, + "step": 34774 + }, + { + "epoch": 0.69552, + "grad_norm": 0.040076542645692825, + "learning_rate": 5.136036199529115e-06, + "loss": 0.355, + "step": 34776 + }, + { + "epoch": 0.69556, + "grad_norm": 0.07679080963134766, + "learning_rate": 5.134816278179229e-06, + "loss": 0.0036, + "step": 34778 + }, + { + "epoch": 0.6956, + "grad_norm": 0.04559766501188278, + "learning_rate": 5.133596451678603e-06, + "loss": 0.0011, + "step": 34780 + }, + { + "epoch": 0.69564, + "grad_norm": 14.236139297485352, + "learning_rate": 5.1323767200510215e-06, + "loss": 0.962, + "step": 34782 + }, + { + "epoch": 0.69568, + "grad_norm": 0.40900909900665283, + "learning_rate": 5.131157083320259e-06, + "loss": 0.005, + "step": 34784 + }, + { + "epoch": 0.69572, + "grad_norm": 0.6599021553993225, + "learning_rate": 5.1299375415101e-06, + "loss": 0.0067, + "step": 34786 + }, + { + "epoch": 0.69576, + "grad_norm": 3.7725586891174316, + "learning_rate": 5.128718094644316e-06, + "loss": 0.0517, + "step": 34788 + }, + { + "epoch": 0.6958, + "grad_norm": 0.2658238112926483, + "learning_rate": 5.127498742746675e-06, + "loss": 0.0046, + "step": 34790 + }, + { + "epoch": 0.69584, + "grad_norm": 0.22004452347755432, + "learning_rate": 5.126279485840962e-06, + "loss": 0.0051, + "step": 34792 + }, + { + "epoch": 0.69588, + "grad_norm": 0.6519259214401245, + "learning_rate": 5.1250603239509355e-06, + "loss": 0.0059, + "step": 34794 + }, + { + "epoch": 0.69592, + "grad_norm": 0.2872970700263977, + "learning_rate": 5.123841257100368e-06, + "loss": 0.0074, + "step": 34796 + }, + { + "epoch": 0.69596, + "grad_norm": 0.24819782376289368, + "learning_rate": 5.122622285313029e-06, + "loss": 0.0068, + "step": 34798 + }, + { + "epoch": 0.696, + "grad_norm": 1.2865228652954102, + "learning_rate": 5.121403408612672e-06, + "loss": 0.0228, + "step": 34800 + }, + { + "epoch": 0.69604, + "grad_norm": 0.18402352929115295, + "learning_rate": 5.120184627023075e-06, + "loss": 0.2068, + "step": 34802 + }, + { + "epoch": 0.69608, + "grad_norm": 0.06282474845647812, + "learning_rate": 5.118965940567987e-06, + "loss": 0.0385, + "step": 34804 + }, + { + "epoch": 0.69612, + "grad_norm": 0.10301303118467331, + "learning_rate": 5.1177473492711715e-06, + "loss": 0.0032, + "step": 34806 + }, + { + "epoch": 0.69616, + "grad_norm": 0.6240763068199158, + "learning_rate": 5.116528853156384e-06, + "loss": 0.012, + "step": 34808 + }, + { + "epoch": 0.6962, + "grad_norm": 0.03578506410121918, + "learning_rate": 5.115310452247386e-06, + "loss": 0.0137, + "step": 34810 + }, + { + "epoch": 0.69624, + "grad_norm": 0.1444813758134842, + "learning_rate": 5.11409214656792e-06, + "loss": 0.0134, + "step": 34812 + }, + { + "epoch": 0.69628, + "grad_norm": 0.12023904919624329, + "learning_rate": 5.112873936141744e-06, + "loss": 0.0015, + "step": 34814 + }, + { + "epoch": 0.69632, + "grad_norm": 0.561907172203064, + "learning_rate": 5.111655820992607e-06, + "loss": 0.0143, + "step": 34816 + }, + { + "epoch": 0.69636, + "grad_norm": 1.6629489660263062, + "learning_rate": 5.110437801144256e-06, + "loss": 0.0194, + "step": 34818 + }, + { + "epoch": 0.6964, + "grad_norm": 0.057781536132097244, + "learning_rate": 5.109219876620441e-06, + "loss": 0.0029, + "step": 34820 + }, + { + "epoch": 0.69644, + "grad_norm": 8.501346588134766, + "learning_rate": 5.108002047444895e-06, + "loss": 0.1761, + "step": 34822 + }, + { + "epoch": 0.69648, + "grad_norm": 0.025449397042393684, + "learning_rate": 5.106784313641375e-06, + "loss": 0.1159, + "step": 34824 + }, + { + "epoch": 0.69652, + "grad_norm": 0.13668468594551086, + "learning_rate": 5.105566675233611e-06, + "loss": 0.0052, + "step": 34826 + }, + { + "epoch": 0.69656, + "grad_norm": 0.08732996881008148, + "learning_rate": 5.1043491322453434e-06, + "loss": 0.002, + "step": 34828 + }, + { + "epoch": 0.6966, + "grad_norm": 0.08802985399961472, + "learning_rate": 5.103131684700315e-06, + "loss": 0.0031, + "step": 34830 + }, + { + "epoch": 0.69664, + "grad_norm": 0.25515803694725037, + "learning_rate": 5.101914332622247e-06, + "loss": 0.0099, + "step": 34832 + }, + { + "epoch": 0.69668, + "grad_norm": 0.4686034023761749, + "learning_rate": 5.1006970760348886e-06, + "loss": 0.0061, + "step": 34834 + }, + { + "epoch": 0.69672, + "grad_norm": 0.004316556267440319, + "learning_rate": 5.099479914961958e-06, + "loss": 0.0008, + "step": 34836 + }, + { + "epoch": 0.69676, + "grad_norm": 0.022282470017671585, + "learning_rate": 5.098262849427191e-06, + "loss": 0.006, + "step": 34838 + }, + { + "epoch": 0.6968, + "grad_norm": 0.23879440128803253, + "learning_rate": 5.0970458794543135e-06, + "loss": 0.0101, + "step": 34840 + }, + { + "epoch": 0.69684, + "grad_norm": 1.880080223083496, + "learning_rate": 5.09582900506705e-06, + "loss": 0.0263, + "step": 34842 + }, + { + "epoch": 0.69688, + "grad_norm": 0.20683276653289795, + "learning_rate": 5.094612226289128e-06, + "loss": 0.0028, + "step": 34844 + }, + { + "epoch": 0.69692, + "grad_norm": 0.6902167201042175, + "learning_rate": 5.093395543144264e-06, + "loss": 0.0096, + "step": 34846 + }, + { + "epoch": 0.69696, + "grad_norm": 0.13888771831989288, + "learning_rate": 5.09217895565618e-06, + "loss": 0.0098, + "step": 34848 + }, + { + "epoch": 0.697, + "grad_norm": 0.0044683353044092655, + "learning_rate": 5.090962463848592e-06, + "loss": 0.0003, + "step": 34850 + }, + { + "epoch": 0.69704, + "grad_norm": 0.03486279770731926, + "learning_rate": 5.089746067745222e-06, + "loss": 0.0194, + "step": 34852 + }, + { + "epoch": 0.69708, + "grad_norm": 0.023029563948512077, + "learning_rate": 5.088529767369775e-06, + "loss": 0.0059, + "step": 34854 + }, + { + "epoch": 0.69712, + "grad_norm": 0.05417614057660103, + "learning_rate": 5.087313562745975e-06, + "loss": 0.0019, + "step": 34856 + }, + { + "epoch": 0.69716, + "grad_norm": 0.653338611125946, + "learning_rate": 5.08609745389752e-06, + "loss": 0.0128, + "step": 34858 + }, + { + "epoch": 0.6972, + "grad_norm": 0.5612165331840515, + "learning_rate": 5.0848814408481305e-06, + "loss": 0.009, + "step": 34860 + }, + { + "epoch": 0.69724, + "grad_norm": 0.034161489456892014, + "learning_rate": 5.083665523621506e-06, + "loss": 0.0025, + "step": 34862 + }, + { + "epoch": 0.69728, + "grad_norm": 0.10785096883773804, + "learning_rate": 5.082449702241352e-06, + "loss": 0.0016, + "step": 34864 + }, + { + "epoch": 0.69732, + "grad_norm": 0.025512835010886192, + "learning_rate": 5.081233976731377e-06, + "loss": 0.0016, + "step": 34866 + }, + { + "epoch": 0.69736, + "grad_norm": 4.734359264373779, + "learning_rate": 5.080018347115274e-06, + "loss": 0.0594, + "step": 34868 + }, + { + "epoch": 0.6974, + "grad_norm": 0.026613827794790268, + "learning_rate": 5.078802813416746e-06, + "loss": 0.0014, + "step": 34870 + }, + { + "epoch": 0.69744, + "grad_norm": 0.23566681146621704, + "learning_rate": 5.0775873756594905e-06, + "loss": 0.0119, + "step": 34872 + }, + { + "epoch": 0.69748, + "grad_norm": 0.5950816869735718, + "learning_rate": 5.076372033867204e-06, + "loss": 0.0096, + "step": 34874 + }, + { + "epoch": 0.69752, + "grad_norm": 0.40088948607444763, + "learning_rate": 5.07515678806358e-06, + "loss": 0.0312, + "step": 34876 + }, + { + "epoch": 0.69756, + "grad_norm": 0.008152471855282784, + "learning_rate": 5.073941638272313e-06, + "loss": 0.0005, + "step": 34878 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3676146864891052, + "learning_rate": 5.072726584517086e-06, + "loss": 0.0163, + "step": 34880 + }, + { + "epoch": 0.69764, + "grad_norm": 0.07732506841421127, + "learning_rate": 5.0715116268215916e-06, + "loss": 0.0033, + "step": 34882 + }, + { + "epoch": 0.69768, + "grad_norm": 1.6835519075393677, + "learning_rate": 5.070296765209519e-06, + "loss": 0.02, + "step": 34884 + }, + { + "epoch": 0.69772, + "grad_norm": 0.011611531488597393, + "learning_rate": 5.069081999704541e-06, + "loss": 0.0007, + "step": 34886 + }, + { + "epoch": 0.69776, + "grad_norm": 0.048470236361026764, + "learning_rate": 5.067867330330356e-06, + "loss": 0.0021, + "step": 34888 + }, + { + "epoch": 0.6978, + "grad_norm": 0.1633344292640686, + "learning_rate": 5.066652757110628e-06, + "loss": 0.0014, + "step": 34890 + }, + { + "epoch": 0.69784, + "grad_norm": 0.3183739185333252, + "learning_rate": 5.0654382800690524e-06, + "loss": 0.0044, + "step": 34892 + }, + { + "epoch": 0.69788, + "grad_norm": 0.016893157735466957, + "learning_rate": 5.0642238992292945e-06, + "loss": 0.0006, + "step": 34894 + }, + { + "epoch": 0.69792, + "grad_norm": 0.36807358264923096, + "learning_rate": 5.063009614615033e-06, + "loss": 0.0061, + "step": 34896 + }, + { + "epoch": 0.69796, + "grad_norm": 1.6770625114440918, + "learning_rate": 5.061795426249945e-06, + "loss": 0.017, + "step": 34898 + }, + { + "epoch": 0.698, + "grad_norm": 1.5286266803741455, + "learning_rate": 5.060581334157693e-06, + "loss": 0.0213, + "step": 34900 + }, + { + "epoch": 0.69804, + "grad_norm": 0.3804475665092468, + "learning_rate": 5.059367338361952e-06, + "loss": 0.0137, + "step": 34902 + }, + { + "epoch": 0.69808, + "grad_norm": 0.17661802470684052, + "learning_rate": 5.058153438886386e-06, + "loss": 0.0022, + "step": 34904 + }, + { + "epoch": 0.69812, + "grad_norm": 13.93532943725586, + "learning_rate": 5.056939635754665e-06, + "loss": 1.0429, + "step": 34906 + }, + { + "epoch": 0.69816, + "grad_norm": 0.7899149656295776, + "learning_rate": 5.055725928990449e-06, + "loss": 0.0141, + "step": 34908 + }, + { + "epoch": 0.6982, + "grad_norm": 0.09542568773031235, + "learning_rate": 5.054512318617406e-06, + "loss": 0.0097, + "step": 34910 + }, + { + "epoch": 0.69824, + "grad_norm": 0.28724074363708496, + "learning_rate": 5.053298804659188e-06, + "loss": 0.0095, + "step": 34912 + }, + { + "epoch": 0.69828, + "grad_norm": 2.6272099018096924, + "learning_rate": 5.052085387139457e-06, + "loss": 0.0353, + "step": 34914 + }, + { + "epoch": 0.69832, + "grad_norm": 0.34621724486351013, + "learning_rate": 5.050872066081869e-06, + "loss": 0.0065, + "step": 34916 + }, + { + "epoch": 0.69836, + "grad_norm": 0.20958131551742554, + "learning_rate": 5.049658841510077e-06, + "loss": 0.0022, + "step": 34918 + }, + { + "epoch": 0.6984, + "grad_norm": 0.8103027939796448, + "learning_rate": 5.048445713447738e-06, + "loss": 0.0147, + "step": 34920 + }, + { + "epoch": 0.69844, + "grad_norm": 0.09885184466838837, + "learning_rate": 5.047232681918493e-06, + "loss": 0.0091, + "step": 34922 + }, + { + "epoch": 0.69848, + "grad_norm": 0.10323923081159592, + "learning_rate": 5.046019746946003e-06, + "loss": 0.0042, + "step": 34924 + }, + { + "epoch": 0.69852, + "grad_norm": 0.7131811380386353, + "learning_rate": 5.044806908553904e-06, + "loss": 0.0161, + "step": 34926 + }, + { + "epoch": 0.69856, + "grad_norm": 0.004688633140176535, + "learning_rate": 5.043594166765846e-06, + "loss": 0.0045, + "step": 34928 + }, + { + "epoch": 0.6986, + "grad_norm": 0.35690411925315857, + "learning_rate": 5.042381521605473e-06, + "loss": 0.0172, + "step": 34930 + }, + { + "epoch": 0.69864, + "grad_norm": 0.028428301215171814, + "learning_rate": 5.041168973096423e-06, + "loss": 0.0474, + "step": 34932 + }, + { + "epoch": 0.69868, + "grad_norm": 0.04998277500271797, + "learning_rate": 5.0399565212623415e-06, + "loss": 0.0139, + "step": 34934 + }, + { + "epoch": 0.69872, + "grad_norm": 0.15187177062034607, + "learning_rate": 5.038744166126857e-06, + "loss": 0.0021, + "step": 34936 + }, + { + "epoch": 0.69876, + "grad_norm": 0.3245806396007538, + "learning_rate": 5.037531907713609e-06, + "loss": 0.0058, + "step": 34938 + }, + { + "epoch": 0.6988, + "grad_norm": 0.01720963604748249, + "learning_rate": 5.036319746046232e-06, + "loss": 0.0024, + "step": 34940 + }, + { + "epoch": 0.69884, + "grad_norm": 0.0756583958864212, + "learning_rate": 5.03510768114836e-06, + "loss": 0.0065, + "step": 34942 + }, + { + "epoch": 0.69888, + "grad_norm": 1.4666401147842407, + "learning_rate": 5.033895713043613e-06, + "loss": 0.0199, + "step": 34944 + }, + { + "epoch": 0.69892, + "grad_norm": 0.056050702929496765, + "learning_rate": 5.032683841755632e-06, + "loss": 0.0018, + "step": 34946 + }, + { + "epoch": 0.69896, + "grad_norm": 0.01778876781463623, + "learning_rate": 5.031472067308035e-06, + "loss": 0.0013, + "step": 34948 + }, + { + "epoch": 0.699, + "grad_norm": 1.2871317863464355, + "learning_rate": 5.030260389724447e-06, + "loss": 0.0194, + "step": 34950 + }, + { + "epoch": 0.69904, + "grad_norm": 15.299742698669434, + "learning_rate": 5.029048809028496e-06, + "loss": 0.7112, + "step": 34952 + }, + { + "epoch": 0.69908, + "grad_norm": 0.15873320400714874, + "learning_rate": 5.027837325243788e-06, + "loss": 0.0151, + "step": 34954 + }, + { + "epoch": 0.69912, + "grad_norm": 1.6308045387268066, + "learning_rate": 5.026625938393961e-06, + "loss": 0.0337, + "step": 34956 + }, + { + "epoch": 0.69916, + "grad_norm": 0.05372858792543411, + "learning_rate": 5.025414648502617e-06, + "loss": 0.0017, + "step": 34958 + }, + { + "epoch": 0.6992, + "grad_norm": 3.3602418899536133, + "learning_rate": 5.024203455593375e-06, + "loss": 0.0436, + "step": 34960 + }, + { + "epoch": 0.69924, + "grad_norm": 0.3487181067466736, + "learning_rate": 5.022992359689849e-06, + "loss": 0.0055, + "step": 34962 + }, + { + "epoch": 0.69928, + "grad_norm": 0.7088773250579834, + "learning_rate": 5.02178136081565e-06, + "loss": 0.014, + "step": 34964 + }, + { + "epoch": 0.69932, + "grad_norm": 1.1121580600738525, + "learning_rate": 5.02057045899439e-06, + "loss": 0.0122, + "step": 34966 + }, + { + "epoch": 0.69936, + "grad_norm": 0.08688843250274658, + "learning_rate": 5.019359654249667e-06, + "loss": 0.0023, + "step": 34968 + }, + { + "epoch": 0.6994, + "grad_norm": 1.9838581085205078, + "learning_rate": 5.018148946605092e-06, + "loss": 0.0348, + "step": 34970 + }, + { + "epoch": 0.69944, + "grad_norm": 1.046244740486145, + "learning_rate": 5.016938336084269e-06, + "loss": 0.015, + "step": 34972 + }, + { + "epoch": 0.69948, + "grad_norm": 0.27073153853416443, + "learning_rate": 5.015727822710803e-06, + "loss": 0.0059, + "step": 34974 + }, + { + "epoch": 0.69952, + "grad_norm": 0.08976360410451889, + "learning_rate": 5.0145174065082814e-06, + "loss": 0.0025, + "step": 34976 + }, + { + "epoch": 0.69956, + "grad_norm": 0.13632060587406158, + "learning_rate": 5.013307087500317e-06, + "loss": 0.0038, + "step": 34978 + }, + { + "epoch": 0.6996, + "grad_norm": 0.10048286616802216, + "learning_rate": 5.012096865710494e-06, + "loss": 0.0038, + "step": 34980 + }, + { + "epoch": 0.69964, + "grad_norm": 2.7894089221954346, + "learning_rate": 5.010886741162411e-06, + "loss": 0.0378, + "step": 34982 + }, + { + "epoch": 0.69968, + "grad_norm": 0.5153655409812927, + "learning_rate": 5.009676713879658e-06, + "loss": 0.0075, + "step": 34984 + }, + { + "epoch": 0.69972, + "grad_norm": 0.40326711535453796, + "learning_rate": 5.008466783885828e-06, + "loss": 0.0051, + "step": 34986 + }, + { + "epoch": 0.69976, + "grad_norm": 2.5500576496124268, + "learning_rate": 5.007256951204512e-06, + "loss": 0.0295, + "step": 34988 + }, + { + "epoch": 0.6998, + "grad_norm": 0.3045768439769745, + "learning_rate": 5.0060472158592885e-06, + "loss": 0.004, + "step": 34990 + }, + { + "epoch": 0.69984, + "grad_norm": 3.6163170337677, + "learning_rate": 5.004837577873744e-06, + "loss": 0.0508, + "step": 34992 + }, + { + "epoch": 0.69988, + "grad_norm": 0.016492489725351334, + "learning_rate": 5.003628037271464e-06, + "loss": 0.0009, + "step": 34994 + }, + { + "epoch": 0.69992, + "grad_norm": 0.2900027632713318, + "learning_rate": 5.002418594076028e-06, + "loss": 0.0043, + "step": 34996 + }, + { + "epoch": 0.69996, + "grad_norm": 0.1100371852517128, + "learning_rate": 5.0012092483110146e-06, + "loss": 0.0019, + "step": 34998 + }, + { + "epoch": 0.7, + "grad_norm": 0.026852384209632874, + "learning_rate": 5.000000000000003e-06, + "loss": 0.0012, + "step": 35000 + }, + { + "epoch": 0.70004, + "grad_norm": 0.5490977764129639, + "learning_rate": 4.998790849166562e-06, + "loss": 0.0082, + "step": 35002 + }, + { + "epoch": 0.70008, + "grad_norm": 0.22128818929195404, + "learning_rate": 4.997581795834268e-06, + "loss": 0.0074, + "step": 35004 + }, + { + "epoch": 0.70012, + "grad_norm": 0.2817034125328064, + "learning_rate": 4.996372840026697e-06, + "loss": 0.0095, + "step": 35006 + }, + { + "epoch": 0.70016, + "grad_norm": 0.005273847375065088, + "learning_rate": 4.995163981767405e-06, + "loss": 0.0268, + "step": 35008 + }, + { + "epoch": 0.7002, + "grad_norm": 0.03902825713157654, + "learning_rate": 4.993955221079976e-06, + "loss": 0.0273, + "step": 35010 + }, + { + "epoch": 0.70024, + "grad_norm": 0.03711535781621933, + "learning_rate": 4.99274655798796e-06, + "loss": 0.6089, + "step": 35012 + }, + { + "epoch": 0.70028, + "grad_norm": 0.022272683680057526, + "learning_rate": 4.991537992514934e-06, + "loss": 0.0042, + "step": 35014 + }, + { + "epoch": 0.70032, + "grad_norm": 0.3838396370410919, + "learning_rate": 4.990329524684451e-06, + "loss": 0.0066, + "step": 35016 + }, + { + "epoch": 0.70036, + "grad_norm": 0.03182246908545494, + "learning_rate": 4.989121154520073e-06, + "loss": 0.0135, + "step": 35018 + }, + { + "epoch": 0.7004, + "grad_norm": 0.2583100199699402, + "learning_rate": 4.98791288204536e-06, + "loss": 0.0034, + "step": 35020 + }, + { + "epoch": 0.70044, + "grad_norm": 0.3741409182548523, + "learning_rate": 4.986704707283864e-06, + "loss": 0.004, + "step": 35022 + }, + { + "epoch": 0.70048, + "grad_norm": 0.024866094812750816, + "learning_rate": 4.98549663025914e-06, + "loss": 0.0004, + "step": 35024 + }, + { + "epoch": 0.70052, + "grad_norm": 0.002509101526811719, + "learning_rate": 4.984288650994741e-06, + "loss": 0.005, + "step": 35026 + }, + { + "epoch": 0.70056, + "grad_norm": 0.14872156083583832, + "learning_rate": 4.9830807695142166e-06, + "loss": 0.1283, + "step": 35028 + }, + { + "epoch": 0.7006, + "grad_norm": 3.7465946674346924, + "learning_rate": 4.981872985841115e-06, + "loss": 0.0713, + "step": 35030 + }, + { + "epoch": 0.70064, + "grad_norm": 0.20653124153614044, + "learning_rate": 4.980665299998988e-06, + "loss": 0.0042, + "step": 35032 + }, + { + "epoch": 0.70068, + "grad_norm": 0.5591579079627991, + "learning_rate": 4.979457712011371e-06, + "loss": 0.0174, + "step": 35034 + }, + { + "epoch": 0.70072, + "grad_norm": 0.023349126800894737, + "learning_rate": 4.9782502219018106e-06, + "loss": 0.0719, + "step": 35036 + }, + { + "epoch": 0.70076, + "grad_norm": 0.26608192920684814, + "learning_rate": 4.977042829693846e-06, + "loss": 0.0116, + "step": 35038 + }, + { + "epoch": 0.7008, + "grad_norm": 0.18989738821983337, + "learning_rate": 4.97583553541102e-06, + "loss": 0.0025, + "step": 35040 + }, + { + "epoch": 0.70084, + "grad_norm": 0.23122861981391907, + "learning_rate": 4.974628339076869e-06, + "loss": 0.007, + "step": 35042 + }, + { + "epoch": 0.70088, + "grad_norm": 0.03545524552464485, + "learning_rate": 4.973421240714918e-06, + "loss": 0.0719, + "step": 35044 + }, + { + "epoch": 0.70092, + "grad_norm": 0.12483721226453781, + "learning_rate": 4.972214240348717e-06, + "loss": 0.0164, + "step": 35046 + }, + { + "epoch": 0.70096, + "grad_norm": 3.9283668994903564, + "learning_rate": 4.9710073380017835e-06, + "loss": 0.0562, + "step": 35048 + }, + { + "epoch": 0.701, + "grad_norm": 0.12819740176200867, + "learning_rate": 4.96980053369765e-06, + "loss": 0.0027, + "step": 35050 + }, + { + "epoch": 0.70104, + "grad_norm": 1.2172681093215942, + "learning_rate": 4.968593827459846e-06, + "loss": 0.0161, + "step": 35052 + }, + { + "epoch": 0.70108, + "grad_norm": 0.17114228010177612, + "learning_rate": 4.967387219311894e-06, + "loss": 0.0023, + "step": 35054 + }, + { + "epoch": 0.70112, + "grad_norm": 0.17071810364723206, + "learning_rate": 4.966180709277324e-06, + "loss": 0.0049, + "step": 35056 + }, + { + "epoch": 0.70116, + "grad_norm": 0.2256973832845688, + "learning_rate": 4.964974297379649e-06, + "loss": 0.0032, + "step": 35058 + }, + { + "epoch": 0.7012, + "grad_norm": 3.0116453170776367, + "learning_rate": 4.9637679836423926e-06, + "loss": 0.0605, + "step": 35060 + }, + { + "epoch": 0.70124, + "grad_norm": 0.766916036605835, + "learning_rate": 4.962561768089071e-06, + "loss": 0.0083, + "step": 35062 + }, + { + "epoch": 0.70128, + "grad_norm": 4.5019683837890625, + "learning_rate": 4.961355650743206e-06, + "loss": 0.078, + "step": 35064 + }, + { + "epoch": 0.70132, + "grad_norm": 0.06877569109201431, + "learning_rate": 4.9601496316283e-06, + "loss": 0.008, + "step": 35066 + }, + { + "epoch": 0.70136, + "grad_norm": 0.11129564046859741, + "learning_rate": 4.958943710767877e-06, + "loss": 0.0069, + "step": 35068 + }, + { + "epoch": 0.7014, + "grad_norm": 0.055575352162122726, + "learning_rate": 4.957737888185439e-06, + "loss": 0.0653, + "step": 35070 + }, + { + "epoch": 0.70144, + "grad_norm": 0.39997684955596924, + "learning_rate": 4.956532163904496e-06, + "loss": 0.0154, + "step": 35072 + }, + { + "epoch": 0.70148, + "grad_norm": 0.05197107419371605, + "learning_rate": 4.955326537948561e-06, + "loss": 0.0021, + "step": 35074 + }, + { + "epoch": 0.70152, + "grad_norm": 0.016439005732536316, + "learning_rate": 4.954121010341122e-06, + "loss": 0.072, + "step": 35076 + }, + { + "epoch": 0.70156, + "grad_norm": 0.7126488089561462, + "learning_rate": 4.9529155811057005e-06, + "loss": 0.0285, + "step": 35078 + }, + { + "epoch": 0.7016, + "grad_norm": 0.5910236835479736, + "learning_rate": 4.951710250265785e-06, + "loss": 0.0189, + "step": 35080 + }, + { + "epoch": 0.70164, + "grad_norm": 0.04013931006193161, + "learning_rate": 4.950505017844876e-06, + "loss": 0.0137, + "step": 35082 + }, + { + "epoch": 0.70168, + "grad_norm": 0.032994914799928665, + "learning_rate": 4.949299883866472e-06, + "loss": 0.0719, + "step": 35084 + }, + { + "epoch": 0.70172, + "grad_norm": 0.061938490718603134, + "learning_rate": 4.948094848354067e-06, + "loss": 0.0011, + "step": 35086 + }, + { + "epoch": 0.70176, + "grad_norm": 5.4292988777160645, + "learning_rate": 4.946889911331157e-06, + "loss": 0.0531, + "step": 35088 + }, + { + "epoch": 0.7018, + "grad_norm": 0.02231276035308838, + "learning_rate": 4.945685072821227e-06, + "loss": 0.012, + "step": 35090 + }, + { + "epoch": 0.70184, + "grad_norm": 0.13172116875648499, + "learning_rate": 4.944480332847767e-06, + "loss": 0.0036, + "step": 35092 + }, + { + "epoch": 0.70188, + "grad_norm": 0.10894032567739487, + "learning_rate": 4.943275691434268e-06, + "loss": 0.0015, + "step": 35094 + }, + { + "epoch": 0.70192, + "grad_norm": 0.051083341240882874, + "learning_rate": 4.942071148604215e-06, + "loss": 0.0967, + "step": 35096 + }, + { + "epoch": 0.70196, + "grad_norm": 6.242901802062988, + "learning_rate": 4.940866704381082e-06, + "loss": 0.0654, + "step": 35098 + }, + { + "epoch": 0.702, + "grad_norm": 1.4029980897903442, + "learning_rate": 4.939662358788364e-06, + "loss": 0.0241, + "step": 35100 + }, + { + "epoch": 0.70204, + "grad_norm": 10.801342964172363, + "learning_rate": 4.9384581118495304e-06, + "loss": 0.1328, + "step": 35102 + }, + { + "epoch": 0.70208, + "grad_norm": 0.3300490975379944, + "learning_rate": 4.937253963588061e-06, + "loss": 0.0039, + "step": 35104 + }, + { + "epoch": 0.70212, + "grad_norm": 0.012844405137002468, + "learning_rate": 4.936049914027433e-06, + "loss": 0.0106, + "step": 35106 + }, + { + "epoch": 0.70216, + "grad_norm": 16.031221389770508, + "learning_rate": 4.934845963191119e-06, + "loss": 0.3036, + "step": 35108 + }, + { + "epoch": 0.7022, + "grad_norm": 0.08386287093162537, + "learning_rate": 4.933642111102595e-06, + "loss": 0.0033, + "step": 35110 + }, + { + "epoch": 0.70224, + "grad_norm": 5.865134239196777, + "learning_rate": 4.93243835778532e-06, + "loss": 0.0666, + "step": 35112 + }, + { + "epoch": 0.70228, + "grad_norm": 0.3355964124202728, + "learning_rate": 4.9312347032627705e-06, + "loss": 0.0065, + "step": 35114 + }, + { + "epoch": 0.70232, + "grad_norm": 4.07538366317749, + "learning_rate": 4.93003114755841e-06, + "loss": 0.0605, + "step": 35116 + }, + { + "epoch": 0.70236, + "grad_norm": 0.5718857645988464, + "learning_rate": 4.928827690695701e-06, + "loss": 0.0067, + "step": 35118 + }, + { + "epoch": 0.7024, + "grad_norm": 0.06708572059869766, + "learning_rate": 4.927624332698109e-06, + "loss": 0.0022, + "step": 35120 + }, + { + "epoch": 0.70244, + "grad_norm": 0.373503714799881, + "learning_rate": 4.926421073589094e-06, + "loss": 0.0089, + "step": 35122 + }, + { + "epoch": 0.70248, + "grad_norm": 0.10995486378669739, + "learning_rate": 4.925217913392109e-06, + "loss": 0.0042, + "step": 35124 + }, + { + "epoch": 0.70252, + "grad_norm": 0.07717972248792648, + "learning_rate": 4.924014852130614e-06, + "loss": 0.0013, + "step": 35126 + }, + { + "epoch": 0.70256, + "grad_norm": 0.04187637194991112, + "learning_rate": 4.922811889828066e-06, + "loss": 0.0039, + "step": 35128 + }, + { + "epoch": 0.7026, + "grad_norm": 0.21963021159172058, + "learning_rate": 4.921609026507907e-06, + "loss": 0.0025, + "step": 35130 + }, + { + "epoch": 0.70264, + "grad_norm": 0.04314108565449715, + "learning_rate": 4.9204062621936025e-06, + "loss": 0.0012, + "step": 35132 + }, + { + "epoch": 0.70268, + "grad_norm": 0.16166113317012787, + "learning_rate": 4.919203596908584e-06, + "loss": 0.0057, + "step": 35134 + }, + { + "epoch": 0.70272, + "grad_norm": 0.022256284952163696, + "learning_rate": 4.918001030676316e-06, + "loss": 0.0034, + "step": 35136 + }, + { + "epoch": 0.70276, + "grad_norm": 0.018688850104808807, + "learning_rate": 4.91679856352023e-06, + "loss": 0.0047, + "step": 35138 + }, + { + "epoch": 0.7028, + "grad_norm": 0.03029695525765419, + "learning_rate": 4.915596195463773e-06, + "loss": 0.0006, + "step": 35140 + }, + { + "epoch": 0.70284, + "grad_norm": 0.1406441032886505, + "learning_rate": 4.91439392653039e-06, + "loss": 0.0018, + "step": 35142 + }, + { + "epoch": 0.70288, + "grad_norm": 0.028638266026973724, + "learning_rate": 4.913191756743507e-06, + "loss": 0.0084, + "step": 35144 + }, + { + "epoch": 0.70292, + "grad_norm": 0.07062693685293198, + "learning_rate": 4.911989686126578e-06, + "loss": 0.5289, + "step": 35146 + }, + { + "epoch": 0.70296, + "grad_norm": 0.1697697937488556, + "learning_rate": 4.910787714703026e-06, + "loss": 0.0032, + "step": 35148 + }, + { + "epoch": 0.703, + "grad_norm": 0.07983212172985077, + "learning_rate": 4.909585842496287e-06, + "loss": 0.0029, + "step": 35150 + }, + { + "epoch": 0.70304, + "grad_norm": 0.07388291507959366, + "learning_rate": 4.908384069529794e-06, + "loss": 0.0011, + "step": 35152 + }, + { + "epoch": 0.70308, + "grad_norm": 0.054822757840156555, + "learning_rate": 4.907182395826977e-06, + "loss": 0.0012, + "step": 35154 + }, + { + "epoch": 0.70312, + "grad_norm": 0.06117987632751465, + "learning_rate": 4.905980821411258e-06, + "loss": 0.0011, + "step": 35156 + }, + { + "epoch": 0.70316, + "grad_norm": 0.06628439575433731, + "learning_rate": 4.9047793463060655e-06, + "loss": 0.0021, + "step": 35158 + }, + { + "epoch": 0.7032, + "grad_norm": 0.9521624445915222, + "learning_rate": 4.903577970534823e-06, + "loss": 0.0125, + "step": 35160 + }, + { + "epoch": 0.70324, + "grad_norm": 0.0011041597463190556, + "learning_rate": 4.9023766941209514e-06, + "loss": 0.0336, + "step": 35162 + }, + { + "epoch": 0.70328, + "grad_norm": 15.566515922546387, + "learning_rate": 4.901175517087875e-06, + "loss": 0.2357, + "step": 35164 + }, + { + "epoch": 0.70332, + "grad_norm": 11.93229866027832, + "learning_rate": 4.8999744394589985e-06, + "loss": 0.4751, + "step": 35166 + }, + { + "epoch": 0.70336, + "grad_norm": 0.19963745772838593, + "learning_rate": 4.8987734612577544e-06, + "loss": 0.0054, + "step": 35168 + }, + { + "epoch": 0.7034, + "grad_norm": 0.07329483330249786, + "learning_rate": 4.897572582507544e-06, + "loss": 0.0018, + "step": 35170 + }, + { + "epoch": 0.70344, + "grad_norm": 0.030023543164134026, + "learning_rate": 4.896371803231783e-06, + "loss": 0.0038, + "step": 35172 + }, + { + "epoch": 0.70348, + "grad_norm": 2.6368234157562256, + "learning_rate": 4.89517112345388e-06, + "loss": 0.0332, + "step": 35174 + }, + { + "epoch": 0.70352, + "grad_norm": 0.028183577582240105, + "learning_rate": 4.8939705431972465e-06, + "loss": 0.0005, + "step": 35176 + }, + { + "epoch": 0.70356, + "grad_norm": 0.23270584642887115, + "learning_rate": 4.892770062485288e-06, + "loss": 0.0035, + "step": 35178 + }, + { + "epoch": 0.7036, + "grad_norm": 0.4060174226760864, + "learning_rate": 4.891569681341403e-06, + "loss": 0.006, + "step": 35180 + }, + { + "epoch": 0.70364, + "grad_norm": 0.5370384454727173, + "learning_rate": 4.890369399788997e-06, + "loss": 0.0064, + "step": 35182 + }, + { + "epoch": 0.70368, + "grad_norm": 14.192737579345703, + "learning_rate": 4.889169217851471e-06, + "loss": 0.5126, + "step": 35184 + }, + { + "epoch": 0.70372, + "grad_norm": 0.06499318033456802, + "learning_rate": 4.887969135552225e-06, + "loss": 0.0276, + "step": 35186 + }, + { + "epoch": 0.70376, + "grad_norm": 0.06553060561418533, + "learning_rate": 4.886769152914645e-06, + "loss": 0.002, + "step": 35188 + }, + { + "epoch": 0.7038, + "grad_norm": 0.008890131488442421, + "learning_rate": 4.885569269962142e-06, + "loss": 0.0118, + "step": 35190 + }, + { + "epoch": 0.70384, + "grad_norm": 1.4942965507507324, + "learning_rate": 4.884369486718093e-06, + "loss": 0.0148, + "step": 35192 + }, + { + "epoch": 0.70388, + "grad_norm": 0.18673214316368103, + "learning_rate": 4.883169803205896e-06, + "loss": 0.0019, + "step": 35194 + }, + { + "epoch": 0.70392, + "grad_norm": 0.7996389865875244, + "learning_rate": 4.8819702194489415e-06, + "loss": 0.0119, + "step": 35196 + }, + { + "epoch": 0.70396, + "grad_norm": 0.09115879982709885, + "learning_rate": 4.880770735470606e-06, + "loss": 0.0018, + "step": 35198 + }, + { + "epoch": 0.704, + "grad_norm": 16.5026798248291, + "learning_rate": 4.879571351294287e-06, + "loss": 0.4337, + "step": 35200 + }, + { + "epoch": 0.70404, + "grad_norm": 0.03649982810020447, + "learning_rate": 4.8783720669433565e-06, + "loss": 0.0008, + "step": 35202 + }, + { + "epoch": 0.70408, + "grad_norm": 0.024300837889313698, + "learning_rate": 4.8771728824412e-06, + "loss": 0.0011, + "step": 35204 + }, + { + "epoch": 0.70412, + "grad_norm": 0.01941288821399212, + "learning_rate": 4.875973797811195e-06, + "loss": 0.0019, + "step": 35206 + }, + { + "epoch": 0.70416, + "grad_norm": 0.08106648921966553, + "learning_rate": 4.87477481307672e-06, + "loss": 0.0014, + "step": 35208 + }, + { + "epoch": 0.7042, + "grad_norm": 0.3461911082267761, + "learning_rate": 4.873575928261151e-06, + "loss": 0.0048, + "step": 35210 + }, + { + "epoch": 0.70424, + "grad_norm": 0.06464198231697083, + "learning_rate": 4.872377143387856e-06, + "loss": 0.0033, + "step": 35212 + }, + { + "epoch": 0.70428, + "grad_norm": 0.687208890914917, + "learning_rate": 4.871178458480209e-06, + "loss": 0.0095, + "step": 35214 + }, + { + "epoch": 0.70432, + "grad_norm": 0.036339059472084045, + "learning_rate": 4.869979873561577e-06, + "loss": 0.0012, + "step": 35216 + }, + { + "epoch": 0.70436, + "grad_norm": 0.02859339490532875, + "learning_rate": 4.868781388655332e-06, + "loss": 0.0016, + "step": 35218 + }, + { + "epoch": 0.7044, + "grad_norm": 7.062947750091553, + "learning_rate": 4.8675830037848295e-06, + "loss": 0.109, + "step": 35220 + }, + { + "epoch": 0.70444, + "grad_norm": 0.04288202151656151, + "learning_rate": 4.866384718973446e-06, + "loss": 0.0009, + "step": 35222 + }, + { + "epoch": 0.70448, + "grad_norm": 0.4737221300601959, + "learning_rate": 4.86518653424453e-06, + "loss": 0.0136, + "step": 35224 + }, + { + "epoch": 0.70452, + "grad_norm": 0.09811866283416748, + "learning_rate": 4.863988449621447e-06, + "loss": 0.0388, + "step": 35226 + }, + { + "epoch": 0.70456, + "grad_norm": 0.5892497897148132, + "learning_rate": 4.862790465127558e-06, + "loss": 0.0099, + "step": 35228 + }, + { + "epoch": 0.7046, + "grad_norm": 1.3564786911010742, + "learning_rate": 4.861592580786205e-06, + "loss": 0.0306, + "step": 35230 + }, + { + "epoch": 0.70464, + "grad_norm": 0.036615967750549316, + "learning_rate": 4.8603947966207585e-06, + "loss": 0.0423, + "step": 35232 + }, + { + "epoch": 0.70468, + "grad_norm": 0.02094743587076664, + "learning_rate": 4.859197112654557e-06, + "loss": 0.0009, + "step": 35234 + }, + { + "epoch": 0.70472, + "grad_norm": 0.3974197506904602, + "learning_rate": 4.857999528910955e-06, + "loss": 0.0061, + "step": 35236 + }, + { + "epoch": 0.70476, + "grad_norm": 4.518623352050781, + "learning_rate": 4.856802045413299e-06, + "loss": 0.0534, + "step": 35238 + }, + { + "epoch": 0.7048, + "grad_norm": 0.5281798839569092, + "learning_rate": 4.855604662184935e-06, + "loss": 0.0073, + "step": 35240 + }, + { + "epoch": 0.70484, + "grad_norm": 0.12622293829917908, + "learning_rate": 4.854407379249211e-06, + "loss": 0.0023, + "step": 35242 + }, + { + "epoch": 0.70488, + "grad_norm": 0.10393939167261124, + "learning_rate": 4.8532101966294595e-06, + "loss": 0.0131, + "step": 35244 + }, + { + "epoch": 0.70492, + "grad_norm": 0.22669559717178345, + "learning_rate": 4.852013114349026e-06, + "loss": 0.0091, + "step": 35246 + }, + { + "epoch": 0.70496, + "grad_norm": 0.04184245318174362, + "learning_rate": 4.850816132431246e-06, + "loss": 0.0018, + "step": 35248 + }, + { + "epoch": 0.705, + "grad_norm": 0.06482013314962387, + "learning_rate": 4.849619250899458e-06, + "loss": 0.002, + "step": 35250 + }, + { + "epoch": 0.70504, + "grad_norm": 0.2460731416940689, + "learning_rate": 4.848422469776994e-06, + "loss": 0.0031, + "step": 35252 + }, + { + "epoch": 0.70508, + "grad_norm": 0.008783942088484764, + "learning_rate": 4.847225789087189e-06, + "loss": 0.0016, + "step": 35254 + }, + { + "epoch": 0.70512, + "grad_norm": 0.22987668216228485, + "learning_rate": 4.846029208853364e-06, + "loss": 0.004, + "step": 35256 + }, + { + "epoch": 0.70516, + "grad_norm": 0.11076600104570389, + "learning_rate": 4.844832729098858e-06, + "loss": 0.0022, + "step": 35258 + }, + { + "epoch": 0.7052, + "grad_norm": 0.7910118699073792, + "learning_rate": 4.843636349846991e-06, + "loss": 0.0105, + "step": 35260 + }, + { + "epoch": 0.70524, + "grad_norm": 3.545034170150757, + "learning_rate": 4.8424400711210865e-06, + "loss": 0.0308, + "step": 35262 + }, + { + "epoch": 0.70528, + "grad_norm": 0.36021122336387634, + "learning_rate": 4.841243892944471e-06, + "loss": 0.1424, + "step": 35264 + }, + { + "epoch": 0.70532, + "grad_norm": 0.19484597444534302, + "learning_rate": 4.840047815340456e-06, + "loss": 0.0109, + "step": 35266 + }, + { + "epoch": 0.70536, + "grad_norm": 0.010027873329818249, + "learning_rate": 4.838851838332373e-06, + "loss": 0.0008, + "step": 35268 + }, + { + "epoch": 0.7054, + "grad_norm": 0.0534285269677639, + "learning_rate": 4.837655961943526e-06, + "loss": 0.001, + "step": 35270 + }, + { + "epoch": 0.70544, + "grad_norm": 0.13873173296451569, + "learning_rate": 4.836460186197234e-06, + "loss": 0.0186, + "step": 35272 + }, + { + "epoch": 0.70548, + "grad_norm": 0.060440488159656525, + "learning_rate": 4.835264511116808e-06, + "loss": 0.3194, + "step": 35274 + }, + { + "epoch": 0.70552, + "grad_norm": 0.23729641735553741, + "learning_rate": 4.834068936725564e-06, + "loss": 0.0062, + "step": 35276 + }, + { + "epoch": 0.70556, + "grad_norm": 0.03961078077554703, + "learning_rate": 4.832873463046802e-06, + "loss": 0.001, + "step": 35278 + }, + { + "epoch": 0.7056, + "grad_norm": 0.0007473393925465643, + "learning_rate": 4.831678090103832e-06, + "loss": 0.0009, + "step": 35280 + }, + { + "epoch": 0.70564, + "grad_norm": 0.2187644988298416, + "learning_rate": 4.8304828179199595e-06, + "loss": 0.0112, + "step": 35282 + }, + { + "epoch": 0.70568, + "grad_norm": 0.690470814704895, + "learning_rate": 4.829287646518485e-06, + "loss": 0.0086, + "step": 35284 + }, + { + "epoch": 0.70572, + "grad_norm": 0.059947676956653595, + "learning_rate": 4.828092575922712e-06, + "loss": 0.2369, + "step": 35286 + }, + { + "epoch": 0.70576, + "grad_norm": 1.2652790546417236, + "learning_rate": 4.8268976061559315e-06, + "loss": 0.0272, + "step": 35288 + }, + { + "epoch": 0.7058, + "grad_norm": 0.3126930594444275, + "learning_rate": 4.825702737241452e-06, + "loss": 0.0069, + "step": 35290 + }, + { + "epoch": 0.70584, + "grad_norm": 0.04620177671313286, + "learning_rate": 4.824507969202558e-06, + "loss": 0.0018, + "step": 35292 + }, + { + "epoch": 0.70588, + "grad_norm": 0.6288846135139465, + "learning_rate": 4.823313302062547e-06, + "loss": 0.0072, + "step": 35294 + }, + { + "epoch": 0.70592, + "grad_norm": 0.03710521385073662, + "learning_rate": 4.822118735844709e-06, + "loss": 0.0006, + "step": 35296 + }, + { + "epoch": 0.70596, + "grad_norm": 0.09351963549852371, + "learning_rate": 4.820924270572327e-06, + "loss": 0.0042, + "step": 35298 + }, + { + "epoch": 0.706, + "grad_norm": 0.10109462589025497, + "learning_rate": 4.8197299062687e-06, + "loss": 0.0313, + "step": 35300 + }, + { + "epoch": 0.70604, + "grad_norm": 0.2333042472600937, + "learning_rate": 4.8185356429571015e-06, + "loss": 0.0031, + "step": 35302 + }, + { + "epoch": 0.70608, + "grad_norm": 7.178792476654053, + "learning_rate": 4.817341480660818e-06, + "loss": 0.1515, + "step": 35304 + }, + { + "epoch": 0.70612, + "grad_norm": 0.005293331574648619, + "learning_rate": 4.81614741940313e-06, + "loss": 0.2669, + "step": 35306 + }, + { + "epoch": 0.70616, + "grad_norm": 0.043056439608335495, + "learning_rate": 4.814953459207322e-06, + "loss": 0.0009, + "step": 35308 + }, + { + "epoch": 0.7062, + "grad_norm": 0.24058572947978973, + "learning_rate": 4.813759600096661e-06, + "loss": 0.0029, + "step": 35310 + }, + { + "epoch": 0.70624, + "grad_norm": 0.12123788148164749, + "learning_rate": 4.812565842094426e-06, + "loss": 0.0036, + "step": 35312 + }, + { + "epoch": 0.70628, + "grad_norm": 0.007648860570043325, + "learning_rate": 4.811372185223892e-06, + "loss": 0.0003, + "step": 35314 + }, + { + "epoch": 0.70632, + "grad_norm": 0.024419190362095833, + "learning_rate": 4.810178629508329e-06, + "loss": 0.0008, + "step": 35316 + }, + { + "epoch": 0.70636, + "grad_norm": 0.4248585104942322, + "learning_rate": 4.808985174971007e-06, + "loss": 0.0091, + "step": 35318 + }, + { + "epoch": 0.7064, + "grad_norm": 0.053773876279592514, + "learning_rate": 4.807791821635186e-06, + "loss": 0.0007, + "step": 35320 + }, + { + "epoch": 0.70644, + "grad_norm": 0.24675941467285156, + "learning_rate": 4.806598569524142e-06, + "loss": 0.0045, + "step": 35322 + }, + { + "epoch": 0.70648, + "grad_norm": 0.16888296604156494, + "learning_rate": 4.805405418661131e-06, + "loss": 0.0088, + "step": 35324 + }, + { + "epoch": 0.70652, + "grad_norm": 0.2271803617477417, + "learning_rate": 4.804212369069415e-06, + "loss": 0.0123, + "step": 35326 + }, + { + "epoch": 0.70656, + "grad_norm": 0.12149889022111893, + "learning_rate": 4.803019420772254e-06, + "loss": 0.0038, + "step": 35328 + }, + { + "epoch": 0.7066, + "grad_norm": 5.226046562194824, + "learning_rate": 4.801826573792905e-06, + "loss": 0.059, + "step": 35330 + }, + { + "epoch": 0.70664, + "grad_norm": 4.464857578277588, + "learning_rate": 4.8006338281546264e-06, + "loss": 0.07, + "step": 35332 + }, + { + "epoch": 0.70668, + "grad_norm": 0.024756502360105515, + "learning_rate": 4.7994411838806645e-06, + "loss": 0.001, + "step": 35334 + }, + { + "epoch": 0.70672, + "grad_norm": 3.482351064682007, + "learning_rate": 4.798248640994274e-06, + "loss": 0.0553, + "step": 35336 + }, + { + "epoch": 0.70676, + "grad_norm": 0.24501711130142212, + "learning_rate": 4.797056199518705e-06, + "loss": 0.0054, + "step": 35338 + }, + { + "epoch": 0.7068, + "grad_norm": 0.003266796935349703, + "learning_rate": 4.795863859477207e-06, + "loss": 0.2668, + "step": 35340 + }, + { + "epoch": 0.70684, + "grad_norm": 0.10977339744567871, + "learning_rate": 4.794671620893016e-06, + "loss": 0.0019, + "step": 35342 + }, + { + "epoch": 0.70688, + "grad_norm": 0.01727784425020218, + "learning_rate": 4.793479483789387e-06, + "loss": 0.0011, + "step": 35344 + }, + { + "epoch": 0.70692, + "grad_norm": 0.030893227085471153, + "learning_rate": 4.792287448189554e-06, + "loss": 0.0013, + "step": 35346 + }, + { + "epoch": 0.70696, + "grad_norm": 0.2486249953508377, + "learning_rate": 4.791095514116758e-06, + "loss": 0.0042, + "step": 35348 + }, + { + "epoch": 0.707, + "grad_norm": 0.23870886862277985, + "learning_rate": 4.78990368159424e-06, + "loss": 0.0051, + "step": 35350 + }, + { + "epoch": 0.70704, + "grad_norm": 0.1337326020002365, + "learning_rate": 4.788711950645225e-06, + "loss": 0.0026, + "step": 35352 + }, + { + "epoch": 0.70708, + "grad_norm": 0.39503079652786255, + "learning_rate": 4.787520321292961e-06, + "loss": 0.0077, + "step": 35354 + }, + { + "epoch": 0.70712, + "grad_norm": 0.5140219330787659, + "learning_rate": 4.786328793560664e-06, + "loss": 0.0196, + "step": 35356 + }, + { + "epoch": 0.70716, + "grad_norm": 0.6509701609611511, + "learning_rate": 4.7851373674715795e-06, + "loss": 0.0098, + "step": 35358 + }, + { + "epoch": 0.7072, + "grad_norm": 0.02931361086666584, + "learning_rate": 4.783946043048922e-06, + "loss": 0.0008, + "step": 35360 + }, + { + "epoch": 0.70724, + "grad_norm": 1.1569797992706299, + "learning_rate": 4.782754820315922e-06, + "loss": 0.0335, + "step": 35362 + }, + { + "epoch": 0.70728, + "grad_norm": 0.15549808740615845, + "learning_rate": 4.781563699295808e-06, + "loss": 0.0044, + "step": 35364 + }, + { + "epoch": 0.70732, + "grad_norm": 0.03651542216539383, + "learning_rate": 4.780372680011791e-06, + "loss": 0.0004, + "step": 35366 + }, + { + "epoch": 0.70736, + "grad_norm": 0.1681773066520691, + "learning_rate": 4.779181762487096e-06, + "loss": 0.0138, + "step": 35368 + }, + { + "epoch": 0.7074, + "grad_norm": 0.08006273210048676, + "learning_rate": 4.7779909467449416e-06, + "loss": 0.0431, + "step": 35370 + }, + { + "epoch": 0.70744, + "grad_norm": 0.07807572185993195, + "learning_rate": 4.776800232808542e-06, + "loss": 0.0057, + "step": 35372 + }, + { + "epoch": 0.70748, + "grad_norm": 0.16669100522994995, + "learning_rate": 4.77560962070111e-06, + "loss": 0.0032, + "step": 35374 + }, + { + "epoch": 0.70752, + "grad_norm": 0.12714888155460358, + "learning_rate": 4.774419110445862e-06, + "loss": 0.0028, + "step": 35376 + }, + { + "epoch": 0.70756, + "grad_norm": 0.005153198726475239, + "learning_rate": 4.7732287020659996e-06, + "loss": 0.0019, + "step": 35378 + }, + { + "epoch": 0.7076, + "grad_norm": 0.9324168562889099, + "learning_rate": 4.772038395584735e-06, + "loss": 0.0185, + "step": 35380 + }, + { + "epoch": 0.70764, + "grad_norm": 0.3536270260810852, + "learning_rate": 4.770848191025274e-06, + "loss": 0.0912, + "step": 35382 + }, + { + "epoch": 0.70768, + "grad_norm": 0.022434279322624207, + "learning_rate": 4.769658088410819e-06, + "loss": 0.0015, + "step": 35384 + }, + { + "epoch": 0.70772, + "grad_norm": 0.05564047396183014, + "learning_rate": 4.768468087764576e-06, + "loss": 0.0046, + "step": 35386 + }, + { + "epoch": 0.70776, + "grad_norm": 2.577486991882324, + "learning_rate": 4.7672781891097344e-06, + "loss": 0.0307, + "step": 35388 + }, + { + "epoch": 0.7078, + "grad_norm": 0.0006658771308138967, + "learning_rate": 4.7660883924695055e-06, + "loss": 0.0015, + "step": 35390 + }, + { + "epoch": 0.70784, + "grad_norm": 0.04266093298792839, + "learning_rate": 4.764898697867074e-06, + "loss": 0.0153, + "step": 35392 + }, + { + "epoch": 0.70788, + "grad_norm": 14.393653869628906, + "learning_rate": 4.763709105325638e-06, + "loss": 1.1044, + "step": 35394 + }, + { + "epoch": 0.70792, + "grad_norm": 0.026501160115003586, + "learning_rate": 4.762519614868388e-06, + "loss": 0.0005, + "step": 35396 + }, + { + "epoch": 0.70796, + "grad_norm": 0.00545933423563838, + "learning_rate": 4.7613302265185195e-06, + "loss": 0.0003, + "step": 35398 + }, + { + "epoch": 0.708, + "grad_norm": 0.4665370583534241, + "learning_rate": 4.76014094029921e-06, + "loss": 0.0045, + "step": 35400 + }, + { + "epoch": 0.70804, + "grad_norm": 0.46756526827812195, + "learning_rate": 4.758951756233652e-06, + "loss": 0.0076, + "step": 35402 + }, + { + "epoch": 0.70808, + "grad_norm": 0.7609041929244995, + "learning_rate": 4.7577626743450265e-06, + "loss": 0.0122, + "step": 35404 + }, + { + "epoch": 0.70812, + "grad_norm": 0.14993195235729218, + "learning_rate": 4.7565736946565174e-06, + "loss": 0.0739, + "step": 35406 + }, + { + "epoch": 0.70816, + "grad_norm": 1.2285717725753784, + "learning_rate": 4.755384817191307e-06, + "loss": 0.024, + "step": 35408 + }, + { + "epoch": 0.7082, + "grad_norm": 0.5861390829086304, + "learning_rate": 4.754196041972563e-06, + "loss": 0.0144, + "step": 35410 + }, + { + "epoch": 0.70824, + "grad_norm": 0.3444001376628876, + "learning_rate": 4.753007369023475e-06, + "loss": 0.012, + "step": 35412 + }, + { + "epoch": 0.70828, + "grad_norm": 0.0028369324281811714, + "learning_rate": 4.751818798367206e-06, + "loss": 0.0057, + "step": 35414 + }, + { + "epoch": 0.70832, + "grad_norm": 0.143353670835495, + "learning_rate": 4.750630330026931e-06, + "loss": 0.0021, + "step": 35416 + }, + { + "epoch": 0.70836, + "grad_norm": 0.027155231684446335, + "learning_rate": 4.7494419640258235e-06, + "loss": 0.0047, + "step": 35418 + }, + { + "epoch": 0.7084, + "grad_norm": 0.011539890430867672, + "learning_rate": 4.7482537003870425e-06, + "loss": 0.0003, + "step": 35420 + }, + { + "epoch": 0.70844, + "grad_norm": 0.707912027835846, + "learning_rate": 4.747065539133765e-06, + "loss": 0.0081, + "step": 35422 + }, + { + "epoch": 0.70848, + "grad_norm": 1.1144344806671143, + "learning_rate": 4.745877480289146e-06, + "loss": 0.0138, + "step": 35424 + }, + { + "epoch": 0.70852, + "grad_norm": 0.059675250202417374, + "learning_rate": 4.744689523876351e-06, + "loss": 0.0014, + "step": 35426 + }, + { + "epoch": 0.70856, + "grad_norm": 0.4044031500816345, + "learning_rate": 4.743501669918539e-06, + "loss": 0.0193, + "step": 35428 + }, + { + "epoch": 0.7086, + "grad_norm": 0.31474408507347107, + "learning_rate": 4.7423139184388725e-06, + "loss": 0.0034, + "step": 35430 + }, + { + "epoch": 0.70864, + "grad_norm": 0.048821356147527695, + "learning_rate": 4.7411262694604985e-06, + "loss": 0.0007, + "step": 35432 + }, + { + "epoch": 0.70868, + "grad_norm": 0.224624902009964, + "learning_rate": 4.739938723006576e-06, + "loss": 0.0368, + "step": 35434 + }, + { + "epoch": 0.70872, + "grad_norm": 0.06759592145681381, + "learning_rate": 4.738751279100254e-06, + "loss": 0.0011, + "step": 35436 + }, + { + "epoch": 0.70876, + "grad_norm": 0.25093913078308105, + "learning_rate": 4.737563937764686e-06, + "loss": 0.0027, + "step": 35438 + }, + { + "epoch": 0.7088, + "grad_norm": 0.6107013821601868, + "learning_rate": 4.736376699023023e-06, + "loss": 0.0108, + "step": 35440 + }, + { + "epoch": 0.70884, + "grad_norm": 0.07565418630838394, + "learning_rate": 4.735189562898398e-06, + "loss": 0.0429, + "step": 35442 + }, + { + "epoch": 0.70888, + "grad_norm": 0.10187634825706482, + "learning_rate": 4.7340025294139705e-06, + "loss": 0.002, + "step": 35444 + }, + { + "epoch": 0.70892, + "grad_norm": 0.11990930885076523, + "learning_rate": 4.732815598592871e-06, + "loss": 0.0017, + "step": 35446 + }, + { + "epoch": 0.70896, + "grad_norm": 7.89234733581543, + "learning_rate": 4.731628770458243e-06, + "loss": 0.1632, + "step": 35448 + }, + { + "epoch": 0.709, + "grad_norm": 0.4513358175754547, + "learning_rate": 4.7304420450332244e-06, + "loss": 0.0062, + "step": 35450 + }, + { + "epoch": 0.70904, + "grad_norm": 3.1982407569885254, + "learning_rate": 4.729255422340951e-06, + "loss": 0.0478, + "step": 35452 + }, + { + "epoch": 0.70908, + "grad_norm": 0.056754764169454575, + "learning_rate": 4.728068902404561e-06, + "loss": 0.0026, + "step": 35454 + }, + { + "epoch": 0.70912, + "grad_norm": 0.19330832362174988, + "learning_rate": 4.726882485247177e-06, + "loss": 0.0061, + "step": 35456 + }, + { + "epoch": 0.70916, + "grad_norm": 1.0289379358291626, + "learning_rate": 4.725696170891936e-06, + "loss": 0.0493, + "step": 35458 + }, + { + "epoch": 0.7092, + "grad_norm": 14.5425386428833, + "learning_rate": 4.724509959361961e-06, + "loss": 0.4689, + "step": 35460 + }, + { + "epoch": 0.70924, + "grad_norm": 0.10164958238601685, + "learning_rate": 4.723323850680383e-06, + "loss": 0.0016, + "step": 35462 + }, + { + "epoch": 0.70928, + "grad_norm": 0.02818375825881958, + "learning_rate": 4.72213784487032e-06, + "loss": 0.0008, + "step": 35464 + }, + { + "epoch": 0.70932, + "grad_norm": 1.0332599878311157, + "learning_rate": 4.720951941954903e-06, + "loss": 0.2621, + "step": 35466 + }, + { + "epoch": 0.70936, + "grad_norm": 1.5236332416534424, + "learning_rate": 4.719766141957241e-06, + "loss": 0.0171, + "step": 35468 + }, + { + "epoch": 0.7094, + "grad_norm": 1.162231206893921, + "learning_rate": 4.718580444900457e-06, + "loss": 0.0293, + "step": 35470 + }, + { + "epoch": 0.70944, + "grad_norm": 0.0058043356984853745, + "learning_rate": 4.717394850807669e-06, + "loss": 0.0169, + "step": 35472 + }, + { + "epoch": 0.70948, + "grad_norm": 0.02499859407544136, + "learning_rate": 4.716209359701982e-06, + "loss": 0.0022, + "step": 35474 + }, + { + "epoch": 0.70952, + "grad_norm": 0.07787273824214935, + "learning_rate": 4.715023971606521e-06, + "loss": 0.0018, + "step": 35476 + }, + { + "epoch": 0.70956, + "grad_norm": 0.09103851020336151, + "learning_rate": 4.713838686544381e-06, + "loss": 0.0055, + "step": 35478 + }, + { + "epoch": 0.7096, + "grad_norm": 11.300434112548828, + "learning_rate": 4.712653504538684e-06, + "loss": 0.2379, + "step": 35480 + }, + { + "epoch": 0.70964, + "grad_norm": 0.1701025366783142, + "learning_rate": 4.711468425612526e-06, + "loss": 0.0045, + "step": 35482 + }, + { + "epoch": 0.70968, + "grad_norm": 14.170312881469727, + "learning_rate": 4.710283449789014e-06, + "loss": 0.234, + "step": 35484 + }, + { + "epoch": 0.70972, + "grad_norm": 1.1236366033554077, + "learning_rate": 4.709098577091252e-06, + "loss": 0.0163, + "step": 35486 + }, + { + "epoch": 0.70976, + "grad_norm": 0.012309937737882137, + "learning_rate": 4.707913807542335e-06, + "loss": 0.0018, + "step": 35488 + }, + { + "epoch": 0.7098, + "grad_norm": 0.13643988966941833, + "learning_rate": 4.706729141165362e-06, + "loss": 0.0122, + "step": 35490 + }, + { + "epoch": 0.70984, + "grad_norm": 0.051057249307632446, + "learning_rate": 4.705544577983429e-06, + "loss": 0.0009, + "step": 35492 + }, + { + "epoch": 0.70988, + "grad_norm": 0.0868530198931694, + "learning_rate": 4.704360118019631e-06, + "loss": 0.0037, + "step": 35494 + }, + { + "epoch": 0.70992, + "grad_norm": 0.07746024429798126, + "learning_rate": 4.7031757612970585e-06, + "loss": 0.003, + "step": 35496 + }, + { + "epoch": 0.70996, + "grad_norm": 0.01003172155469656, + "learning_rate": 4.7019915078388056e-06, + "loss": 0.0025, + "step": 35498 + }, + { + "epoch": 0.71, + "grad_norm": 0.012234585359692574, + "learning_rate": 4.700807357667953e-06, + "loss": 0.0032, + "step": 35500 + }, + { + "epoch": 0.71004, + "grad_norm": 0.4323903024196625, + "learning_rate": 4.699623310807587e-06, + "loss": 0.0047, + "step": 35502 + }, + { + "epoch": 0.71008, + "grad_norm": 0.1841270625591278, + "learning_rate": 4.698439367280795e-06, + "loss": 0.0025, + "step": 35504 + }, + { + "epoch": 0.71012, + "grad_norm": 0.6370208263397217, + "learning_rate": 4.6972555271106564e-06, + "loss": 0.0116, + "step": 35506 + }, + { + "epoch": 0.71016, + "grad_norm": 0.09062057733535767, + "learning_rate": 4.696071790320256e-06, + "loss": 0.0016, + "step": 35508 + }, + { + "epoch": 0.7102, + "grad_norm": 0.018607405945658684, + "learning_rate": 4.694888156932657e-06, + "loss": 0.0041, + "step": 35510 + }, + { + "epoch": 0.71024, + "grad_norm": 0.031716786324977875, + "learning_rate": 4.693704626970954e-06, + "loss": 0.003, + "step": 35512 + }, + { + "epoch": 0.71028, + "grad_norm": 0.6038597822189331, + "learning_rate": 4.692521200458208e-06, + "loss": 0.0058, + "step": 35514 + }, + { + "epoch": 0.71032, + "grad_norm": 0.10488325357437134, + "learning_rate": 4.691337877417492e-06, + "loss": 0.0035, + "step": 35516 + }, + { + "epoch": 0.71036, + "grad_norm": 0.01234135776758194, + "learning_rate": 4.690154657871878e-06, + "loss": 0.0029, + "step": 35518 + }, + { + "epoch": 0.7104, + "grad_norm": 0.050831202417612076, + "learning_rate": 4.688971541844436e-06, + "loss": 0.005, + "step": 35520 + }, + { + "epoch": 0.71044, + "grad_norm": 0.2264208197593689, + "learning_rate": 4.687788529358225e-06, + "loss": 0.0048, + "step": 35522 + }, + { + "epoch": 0.71048, + "grad_norm": 3.951321840286255, + "learning_rate": 4.686605620436311e-06, + "loss": 0.0382, + "step": 35524 + }, + { + "epoch": 0.71052, + "grad_norm": 0.9351179003715515, + "learning_rate": 4.685422815101755e-06, + "loss": 0.0124, + "step": 35526 + }, + { + "epoch": 0.71056, + "grad_norm": 0.09930499643087387, + "learning_rate": 4.684240113377619e-06, + "loss": 0.0013, + "step": 35528 + }, + { + "epoch": 0.7106, + "grad_norm": 2.238816261291504, + "learning_rate": 4.6830575152869615e-06, + "loss": 0.0278, + "step": 35530 + }, + { + "epoch": 0.71064, + "grad_norm": 0.007874507457017899, + "learning_rate": 4.681875020852829e-06, + "loss": 0.0118, + "step": 35532 + }, + { + "epoch": 0.71068, + "grad_norm": 0.635307788848877, + "learning_rate": 4.680692630098287e-06, + "loss": 0.0058, + "step": 35534 + }, + { + "epoch": 0.71072, + "grad_norm": 0.2008236050605774, + "learning_rate": 4.679510343046377e-06, + "loss": 0.0033, + "step": 35536 + }, + { + "epoch": 0.71076, + "grad_norm": 0.16370591521263123, + "learning_rate": 4.6783281597201524e-06, + "loss": 0.0017, + "step": 35538 + }, + { + "epoch": 0.7108, + "grad_norm": 2.8953185081481934, + "learning_rate": 4.677146080142664e-06, + "loss": 0.0301, + "step": 35540 + }, + { + "epoch": 0.71084, + "grad_norm": 0.0122796930372715, + "learning_rate": 4.6759641043369455e-06, + "loss": 0.0011, + "step": 35542 + }, + { + "epoch": 0.71088, + "grad_norm": 0.027162300422787666, + "learning_rate": 4.674782232326055e-06, + "loss": 0.0007, + "step": 35544 + }, + { + "epoch": 0.71092, + "grad_norm": 14.248711585998535, + "learning_rate": 4.6736004641330236e-06, + "loss": 0.4886, + "step": 35546 + }, + { + "epoch": 0.71096, + "grad_norm": 0.027185970917344093, + "learning_rate": 4.672418799780892e-06, + "loss": 0.0007, + "step": 35548 + }, + { + "epoch": 0.711, + "grad_norm": 0.6379719376564026, + "learning_rate": 4.671237239292699e-06, + "loss": 0.0076, + "step": 35550 + }, + { + "epoch": 0.71104, + "grad_norm": 0.008741975761950016, + "learning_rate": 4.670055782691481e-06, + "loss": 0.004, + "step": 35552 + }, + { + "epoch": 0.71108, + "grad_norm": 0.288970947265625, + "learning_rate": 4.668874430000272e-06, + "loss": 0.0822, + "step": 35554 + }, + { + "epoch": 0.71112, + "grad_norm": 0.3411048948764801, + "learning_rate": 4.667693181242098e-06, + "loss": 0.0098, + "step": 35556 + }, + { + "epoch": 0.71116, + "grad_norm": 0.08018364012241364, + "learning_rate": 4.66651203643999e-06, + "loss": 0.0015, + "step": 35558 + }, + { + "epoch": 0.7112, + "grad_norm": 0.02246985211968422, + "learning_rate": 4.6653309956169745e-06, + "loss": 0.0012, + "step": 35560 + }, + { + "epoch": 0.71124, + "grad_norm": 0.09385038167238235, + "learning_rate": 4.664150058796083e-06, + "loss": 0.0027, + "step": 35562 + }, + { + "epoch": 0.71128, + "grad_norm": 0.0029411790892481804, + "learning_rate": 4.6629692260003245e-06, + "loss": 0.0012, + "step": 35564 + }, + { + "epoch": 0.71132, + "grad_norm": 0.14983917772769928, + "learning_rate": 4.661788497252737e-06, + "loss": 0.0028, + "step": 35566 + }, + { + "epoch": 0.71136, + "grad_norm": 0.015421691350638866, + "learning_rate": 4.660607872576327e-06, + "loss": 0.0084, + "step": 35568 + }, + { + "epoch": 0.7114, + "grad_norm": 0.3482932150363922, + "learning_rate": 4.659427351994116e-06, + "loss": 0.0036, + "step": 35570 + }, + { + "epoch": 0.71144, + "grad_norm": 0.026959314942359924, + "learning_rate": 4.658246935529118e-06, + "loss": 0.0135, + "step": 35572 + }, + { + "epoch": 0.71148, + "grad_norm": 0.19424711167812347, + "learning_rate": 4.657066623204346e-06, + "loss": 0.0153, + "step": 35574 + }, + { + "epoch": 0.71152, + "grad_norm": 0.05918389558792114, + "learning_rate": 4.655886415042814e-06, + "loss": 0.0015, + "step": 35576 + }, + { + "epoch": 0.71156, + "grad_norm": 0.033403120934963226, + "learning_rate": 4.654706311067525e-06, + "loss": 0.0076, + "step": 35578 + }, + { + "epoch": 0.7116, + "grad_norm": 15.485438346862793, + "learning_rate": 4.6535263113014885e-06, + "loss": 0.7131, + "step": 35580 + }, + { + "epoch": 0.71164, + "grad_norm": 0.02871272899210453, + "learning_rate": 4.652346415767708e-06, + "loss": 0.0012, + "step": 35582 + }, + { + "epoch": 0.71168, + "grad_norm": 1.3247753381729126, + "learning_rate": 4.6511666244891875e-06, + "loss": 0.0173, + "step": 35584 + }, + { + "epoch": 0.71172, + "grad_norm": 6.323796272277832, + "learning_rate": 4.6499869374889285e-06, + "loss": 0.0792, + "step": 35586 + }, + { + "epoch": 0.71176, + "grad_norm": 0.15521489083766937, + "learning_rate": 4.6488073547899316e-06, + "loss": 0.0059, + "step": 35588 + }, + { + "epoch": 0.7118, + "grad_norm": 1.0113967657089233, + "learning_rate": 4.647627876415186e-06, + "loss": 0.0119, + "step": 35590 + }, + { + "epoch": 0.71184, + "grad_norm": 0.8716335296630859, + "learning_rate": 4.6464485023876904e-06, + "loss": 0.0121, + "step": 35592 + }, + { + "epoch": 0.71188, + "grad_norm": 0.1476646214723587, + "learning_rate": 4.645269232730443e-06, + "loss": 0.0016, + "step": 35594 + }, + { + "epoch": 0.71192, + "grad_norm": 0.2879174053668976, + "learning_rate": 4.644090067466419e-06, + "loss": 0.0079, + "step": 35596 + }, + { + "epoch": 0.71196, + "grad_norm": 0.053737200796604156, + "learning_rate": 4.642911006618626e-06, + "loss": 0.0006, + "step": 35598 + }, + { + "epoch": 0.712, + "grad_norm": 16.694303512573242, + "learning_rate": 4.641732050210032e-06, + "loss": 0.4432, + "step": 35600 + }, + { + "epoch": 0.71204, + "grad_norm": 0.02010623924434185, + "learning_rate": 4.640553198263639e-06, + "loss": 0.0006, + "step": 35602 + }, + { + "epoch": 0.71208, + "grad_norm": 0.010639224201440811, + "learning_rate": 4.639374450802415e-06, + "loss": 0.0009, + "step": 35604 + }, + { + "epoch": 0.71212, + "grad_norm": 0.04851124808192253, + "learning_rate": 4.6381958078493475e-06, + "loss": 0.0018, + "step": 35606 + }, + { + "epoch": 0.71216, + "grad_norm": 0.3937026560306549, + "learning_rate": 4.637017269427416e-06, + "loss": 0.0065, + "step": 35608 + }, + { + "epoch": 0.7122, + "grad_norm": 0.3023659586906433, + "learning_rate": 4.635838835559591e-06, + "loss": 0.004, + "step": 35610 + }, + { + "epoch": 0.71224, + "grad_norm": 0.0011877071810886264, + "learning_rate": 4.634660506268849e-06, + "loss": 0.4687, + "step": 35612 + }, + { + "epoch": 0.71228, + "grad_norm": 0.1279020458459854, + "learning_rate": 4.633482281578163e-06, + "loss": 0.0019, + "step": 35614 + }, + { + "epoch": 0.71232, + "grad_norm": 0.09234695136547089, + "learning_rate": 4.632304161510502e-06, + "loss": 0.0019, + "step": 35616 + }, + { + "epoch": 0.71236, + "grad_norm": 8.249895095825195, + "learning_rate": 4.631126146088837e-06, + "loss": 0.1285, + "step": 35618 + }, + { + "epoch": 0.7124, + "grad_norm": 0.27787402272224426, + "learning_rate": 4.629948235336133e-06, + "loss": 0.0033, + "step": 35620 + }, + { + "epoch": 0.71244, + "grad_norm": 1.958892583847046, + "learning_rate": 4.628770429275351e-06, + "loss": 0.0269, + "step": 35622 + }, + { + "epoch": 0.71248, + "grad_norm": 0.06252941489219666, + "learning_rate": 4.627592727929454e-06, + "loss": 0.0018, + "step": 35624 + }, + { + "epoch": 0.71252, + "grad_norm": 0.028398072347044945, + "learning_rate": 4.626415131321407e-06, + "loss": 0.0014, + "step": 35626 + }, + { + "epoch": 0.71256, + "grad_norm": 0.17122183740139008, + "learning_rate": 4.625237639474155e-06, + "loss": 0.1643, + "step": 35628 + }, + { + "epoch": 0.7126, + "grad_norm": 0.19637827575206757, + "learning_rate": 4.62406025241067e-06, + "loss": 0.0025, + "step": 35630 + }, + { + "epoch": 0.71264, + "grad_norm": 0.5537214875221252, + "learning_rate": 4.6228829701538916e-06, + "loss": 0.012, + "step": 35632 + }, + { + "epoch": 0.71268, + "grad_norm": 11.544634819030762, + "learning_rate": 4.621705792726784e-06, + "loss": 0.3581, + "step": 35634 + }, + { + "epoch": 0.71272, + "grad_norm": 4.814572334289551, + "learning_rate": 4.620528720152289e-06, + "loss": 0.0812, + "step": 35636 + }, + { + "epoch": 0.71276, + "grad_norm": 0.005476151593029499, + "learning_rate": 4.619351752453354e-06, + "loss": 0.0011, + "step": 35638 + }, + { + "epoch": 0.7128, + "grad_norm": 0.0501590259373188, + "learning_rate": 4.618174889652928e-06, + "loss": 0.005, + "step": 35640 + }, + { + "epoch": 0.71284, + "grad_norm": 0.24855394661426544, + "learning_rate": 4.616998131773956e-06, + "loss": 0.0033, + "step": 35642 + }, + { + "epoch": 0.71288, + "grad_norm": 0.05276808887720108, + "learning_rate": 4.615821478839375e-06, + "loss": 0.0011, + "step": 35644 + }, + { + "epoch": 0.71292, + "grad_norm": 0.013254664838314056, + "learning_rate": 4.614644930872125e-06, + "loss": 0.0169, + "step": 35646 + }, + { + "epoch": 0.71296, + "grad_norm": 0.08163206279277802, + "learning_rate": 4.613468487895144e-06, + "loss": 0.0018, + "step": 35648 + }, + { + "epoch": 0.713, + "grad_norm": 0.041182275861501694, + "learning_rate": 4.612292149931369e-06, + "loss": 0.0795, + "step": 35650 + }, + { + "epoch": 0.71304, + "grad_norm": 8.044835090637207, + "learning_rate": 4.611115917003737e-06, + "loss": 0.1275, + "step": 35652 + }, + { + "epoch": 0.71308, + "grad_norm": 0.1630486696958542, + "learning_rate": 4.609939789135165e-06, + "loss": 0.0134, + "step": 35654 + }, + { + "epoch": 0.71312, + "grad_norm": 0.1449144035577774, + "learning_rate": 4.6087637663486005e-06, + "loss": 0.0073, + "step": 35656 + }, + { + "epoch": 0.71316, + "grad_norm": 0.03431127965450287, + "learning_rate": 4.607587848666958e-06, + "loss": 0.0172, + "step": 35658 + }, + { + "epoch": 0.7132, + "grad_norm": 0.7151272892951965, + "learning_rate": 4.606412036113166e-06, + "loss": 0.0072, + "step": 35660 + }, + { + "epoch": 0.71324, + "grad_norm": 0.16757547855377197, + "learning_rate": 4.605236328710153e-06, + "loss": 0.0025, + "step": 35662 + }, + { + "epoch": 0.71328, + "grad_norm": 0.11734584718942642, + "learning_rate": 4.6040607264808264e-06, + "loss": 0.0349, + "step": 35664 + }, + { + "epoch": 0.71332, + "grad_norm": 0.07663137465715408, + "learning_rate": 4.602885229448123e-06, + "loss": 0.0016, + "step": 35666 + }, + { + "epoch": 0.71336, + "grad_norm": 0.18107986450195312, + "learning_rate": 4.6017098376349456e-06, + "loss": 0.0093, + "step": 35668 + }, + { + "epoch": 0.7134, + "grad_norm": 0.34505605697631836, + "learning_rate": 4.600534551064215e-06, + "loss": 0.0056, + "step": 35670 + }, + { + "epoch": 0.71344, + "grad_norm": 0.017245732247829437, + "learning_rate": 4.599359369758842e-06, + "loss": 0.002, + "step": 35672 + }, + { + "epoch": 0.71348, + "grad_norm": 0.011895371600985527, + "learning_rate": 4.59818429374174e-06, + "loss": 0.0005, + "step": 35674 + }, + { + "epoch": 0.71352, + "grad_norm": 0.5477983355522156, + "learning_rate": 4.5970093230358196e-06, + "loss": 0.007, + "step": 35676 + }, + { + "epoch": 0.71356, + "grad_norm": 0.6626917719841003, + "learning_rate": 4.59583445766398e-06, + "loss": 0.0148, + "step": 35678 + }, + { + "epoch": 0.7136, + "grad_norm": 0.00267635565251112, + "learning_rate": 4.59465969764913e-06, + "loss": 0.0, + "step": 35680 + }, + { + "epoch": 0.71364, + "grad_norm": 0.20478494465351105, + "learning_rate": 4.593485043014172e-06, + "loss": 0.0113, + "step": 35682 + }, + { + "epoch": 0.71368, + "grad_norm": 1.6113684177398682, + "learning_rate": 4.5923104937820095e-06, + "loss": 0.0261, + "step": 35684 + }, + { + "epoch": 0.71372, + "grad_norm": 0.47069305181503296, + "learning_rate": 4.591136049975532e-06, + "loss": 0.0091, + "step": 35686 + }, + { + "epoch": 0.71376, + "grad_norm": 0.022337688133120537, + "learning_rate": 4.589961711617649e-06, + "loss": 0.0017, + "step": 35688 + }, + { + "epoch": 0.7138, + "grad_norm": 0.22155870497226715, + "learning_rate": 4.588787478731242e-06, + "loss": 0.0077, + "step": 35690 + }, + { + "epoch": 0.71384, + "grad_norm": 0.017086291685700417, + "learning_rate": 4.587613351339212e-06, + "loss": 0.0003, + "step": 35692 + }, + { + "epoch": 0.71388, + "grad_norm": 0.07473401725292206, + "learning_rate": 4.586439329464448e-06, + "loss": 0.0013, + "step": 35694 + }, + { + "epoch": 0.71392, + "grad_norm": 0.41083306074142456, + "learning_rate": 4.585265413129829e-06, + "loss": 0.0071, + "step": 35696 + }, + { + "epoch": 0.71396, + "grad_norm": 0.003964456729590893, + "learning_rate": 4.584091602358257e-06, + "loss": 0.0017, + "step": 35698 + }, + { + "epoch": 0.714, + "grad_norm": 0.0011219048174098134, + "learning_rate": 4.582917897172603e-06, + "loss": 0.0421, + "step": 35700 + }, + { + "epoch": 0.71404, + "grad_norm": 0.4141826927661896, + "learning_rate": 4.581744297595754e-06, + "loss": 0.0074, + "step": 35702 + }, + { + "epoch": 0.71408, + "grad_norm": 0.003777065547183156, + "learning_rate": 4.580570803650589e-06, + "loss": 0.0012, + "step": 35704 + }, + { + "epoch": 0.71412, + "grad_norm": 0.8901287913322449, + "learning_rate": 4.579397415359987e-06, + "loss": 0.0143, + "step": 35706 + }, + { + "epoch": 0.71416, + "grad_norm": 17.139480590820312, + "learning_rate": 4.578224132746822e-06, + "loss": 0.7525, + "step": 35708 + }, + { + "epoch": 0.7142, + "grad_norm": 0.006555816624313593, + "learning_rate": 4.577050955833972e-06, + "loss": 0.0002, + "step": 35710 + }, + { + "epoch": 0.71424, + "grad_norm": 0.04376785457134247, + "learning_rate": 4.575877884644303e-06, + "loss": 0.0016, + "step": 35712 + }, + { + "epoch": 0.71428, + "grad_norm": 0.05041707679629326, + "learning_rate": 4.574704919200688e-06, + "loss": 0.0045, + "step": 35714 + }, + { + "epoch": 0.71432, + "grad_norm": 0.06144984811544418, + "learning_rate": 4.5735320595259955e-06, + "loss": 0.0017, + "step": 35716 + }, + { + "epoch": 0.71436, + "grad_norm": 0.272312730550766, + "learning_rate": 4.572359305643084e-06, + "loss": 0.0027, + "step": 35718 + }, + { + "epoch": 0.7144, + "grad_norm": 0.0735294371843338, + "learning_rate": 4.571186657574828e-06, + "loss": 0.0025, + "step": 35720 + }, + { + "epoch": 0.71444, + "grad_norm": 0.31047335267066956, + "learning_rate": 4.570014115344076e-06, + "loss": 0.0036, + "step": 35722 + }, + { + "epoch": 0.71448, + "grad_norm": 0.2591038644313812, + "learning_rate": 4.568841678973701e-06, + "loss": 0.0041, + "step": 35724 + }, + { + "epoch": 0.71452, + "grad_norm": 2.1112220287323, + "learning_rate": 4.567669348486551e-06, + "loss": 0.4537, + "step": 35726 + }, + { + "epoch": 0.71456, + "grad_norm": 0.31209009885787964, + "learning_rate": 4.566497123905483e-06, + "loss": 0.0111, + "step": 35728 + }, + { + "epoch": 0.7146, + "grad_norm": 0.36937013268470764, + "learning_rate": 4.565325005253356e-06, + "loss": 0.0089, + "step": 35730 + }, + { + "epoch": 0.71464, + "grad_norm": 0.20335978269577026, + "learning_rate": 4.564152992553012e-06, + "loss": 0.0037, + "step": 35732 + }, + { + "epoch": 0.71468, + "grad_norm": 0.04368205741047859, + "learning_rate": 4.562981085827303e-06, + "loss": 0.0008, + "step": 35734 + }, + { + "epoch": 0.71472, + "grad_norm": 0.49849626421928406, + "learning_rate": 4.561809285099077e-06, + "loss": 0.007, + "step": 35736 + }, + { + "epoch": 0.71476, + "grad_norm": 0.16324937343597412, + "learning_rate": 4.560637590391178e-06, + "loss": 0.0051, + "step": 35738 + }, + { + "epoch": 0.7148, + "grad_norm": 0.04402920603752136, + "learning_rate": 4.559466001726451e-06, + "loss": 0.0008, + "step": 35740 + }, + { + "epoch": 0.71484, + "grad_norm": 0.245769202709198, + "learning_rate": 4.558294519127739e-06, + "loss": 0.0087, + "step": 35742 + }, + { + "epoch": 0.71488, + "grad_norm": 0.11059848964214325, + "learning_rate": 4.557123142617873e-06, + "loss": 0.012, + "step": 35744 + }, + { + "epoch": 0.71492, + "grad_norm": 0.21064667403697968, + "learning_rate": 4.555951872219692e-06, + "loss": 0.0027, + "step": 35746 + }, + { + "epoch": 0.71496, + "grad_norm": 0.8337798118591309, + "learning_rate": 4.554780707956038e-06, + "loss": 0.0161, + "step": 35748 + }, + { + "epoch": 0.715, + "grad_norm": 1.857798457145691, + "learning_rate": 4.5536096498497295e-06, + "loss": 0.0239, + "step": 35750 + }, + { + "epoch": 0.71504, + "grad_norm": 0.06492837518453598, + "learning_rate": 4.552438697923611e-06, + "loss": 0.0023, + "step": 35752 + }, + { + "epoch": 0.71508, + "grad_norm": 0.2630350887775421, + "learning_rate": 4.551267852200498e-06, + "loss": 0.0035, + "step": 35754 + }, + { + "epoch": 0.71512, + "grad_norm": 0.04097409173846245, + "learning_rate": 4.550097112703231e-06, + "loss": 0.0096, + "step": 35756 + }, + { + "epoch": 0.71516, + "grad_norm": 3.1358940601348877, + "learning_rate": 4.548926479454623e-06, + "loss": 0.0455, + "step": 35758 + }, + { + "epoch": 0.7152, + "grad_norm": 0.03721105307340622, + "learning_rate": 4.5477559524775e-06, + "loss": 0.001, + "step": 35760 + }, + { + "epoch": 0.71524, + "grad_norm": 0.008456829935312271, + "learning_rate": 4.546585531794685e-06, + "loss": 0.0065, + "step": 35762 + }, + { + "epoch": 0.71528, + "grad_norm": 0.1697263866662979, + "learning_rate": 4.545415217428986e-06, + "loss": 0.0026, + "step": 35764 + }, + { + "epoch": 0.71532, + "grad_norm": 0.1191292330622673, + "learning_rate": 4.544245009403231e-06, + "loss": 0.005, + "step": 35766 + }, + { + "epoch": 0.71536, + "grad_norm": 0.027673665434122086, + "learning_rate": 4.543074907740226e-06, + "loss": 0.0033, + "step": 35768 + }, + { + "epoch": 0.7154, + "grad_norm": 0.40844038128852844, + "learning_rate": 4.541904912462785e-06, + "loss": 0.0056, + "step": 35770 + }, + { + "epoch": 0.71544, + "grad_norm": 0.02858986333012581, + "learning_rate": 4.540735023593718e-06, + "loss": 0.0006, + "step": 35772 + }, + { + "epoch": 0.71548, + "grad_norm": 0.015690771862864494, + "learning_rate": 4.5395652411558356e-06, + "loss": 0.0003, + "step": 35774 + }, + { + "epoch": 0.71552, + "grad_norm": 0.7862305641174316, + "learning_rate": 4.538395565171936e-06, + "loss": 0.0105, + "step": 35776 + }, + { + "epoch": 0.71556, + "grad_norm": 1.5734740495681763, + "learning_rate": 4.537225995664827e-06, + "loss": 0.045, + "step": 35778 + }, + { + "epoch": 0.7156, + "grad_norm": 0.04010339081287384, + "learning_rate": 4.53605653265731e-06, + "loss": 0.0008, + "step": 35780 + }, + { + "epoch": 0.71564, + "grad_norm": 0.04091395065188408, + "learning_rate": 4.534887176172184e-06, + "loss": 0.0021, + "step": 35782 + }, + { + "epoch": 0.71568, + "grad_norm": 0.045659635215997696, + "learning_rate": 4.533717926232249e-06, + "loss": 0.0028, + "step": 35784 + }, + { + "epoch": 0.71572, + "grad_norm": 5.173661231994629, + "learning_rate": 4.532548782860291e-06, + "loss": 0.051, + "step": 35786 + }, + { + "epoch": 0.71576, + "grad_norm": 0.04398990795016289, + "learning_rate": 4.531379746079115e-06, + "loss": 0.0013, + "step": 35788 + }, + { + "epoch": 0.7158, + "grad_norm": 0.057687196880578995, + "learning_rate": 4.530210815911504e-06, + "loss": 0.0033, + "step": 35790 + }, + { + "epoch": 0.71584, + "grad_norm": 0.5006396174430847, + "learning_rate": 4.529041992380248e-06, + "loss": 0.0052, + "step": 35792 + }, + { + "epoch": 0.71588, + "grad_norm": 0.011304776184260845, + "learning_rate": 4.527873275508137e-06, + "loss": 0.0003, + "step": 35794 + }, + { + "epoch": 0.71592, + "grad_norm": 1.5508036613464355, + "learning_rate": 4.526704665317952e-06, + "loss": 0.0194, + "step": 35796 + }, + { + "epoch": 0.71596, + "grad_norm": 0.0201110802590847, + "learning_rate": 4.525536161832482e-06, + "loss": 0.0003, + "step": 35798 + }, + { + "epoch": 0.716, + "grad_norm": 0.05554316192865372, + "learning_rate": 4.524367765074499e-06, + "loss": 0.0037, + "step": 35800 + }, + { + "epoch": 0.71604, + "grad_norm": 0.7913910150527954, + "learning_rate": 4.523199475066785e-06, + "loss": 0.011, + "step": 35802 + }, + { + "epoch": 0.71608, + "grad_norm": 0.05671700835227966, + "learning_rate": 4.522031291832118e-06, + "loss": 0.0009, + "step": 35804 + }, + { + "epoch": 0.71612, + "grad_norm": 0.04169537499547005, + "learning_rate": 4.520863215393274e-06, + "loss": 0.0006, + "step": 35806 + }, + { + "epoch": 0.71616, + "grad_norm": 0.049886927008628845, + "learning_rate": 4.5196952457730155e-06, + "loss": 0.0007, + "step": 35808 + }, + { + "epoch": 0.7162, + "grad_norm": 0.4045501947402954, + "learning_rate": 4.518527382994127e-06, + "loss": 0.0125, + "step": 35810 + }, + { + "epoch": 0.71624, + "grad_norm": 0.05530283600091934, + "learning_rate": 4.517359627079365e-06, + "loss": 0.0013, + "step": 35812 + }, + { + "epoch": 0.71628, + "grad_norm": 0.21899105608463287, + "learning_rate": 4.516191978051499e-06, + "loss": 0.003, + "step": 35814 + }, + { + "epoch": 0.71632, + "grad_norm": 0.5158188939094543, + "learning_rate": 4.5150244359333e-06, + "loss": 0.0064, + "step": 35816 + }, + { + "epoch": 0.71636, + "grad_norm": 1.2764500379562378, + "learning_rate": 4.513857000747513e-06, + "loss": 0.0118, + "step": 35818 + }, + { + "epoch": 0.7164, + "grad_norm": 2.8312923908233643, + "learning_rate": 4.512689672516918e-06, + "loss": 0.0353, + "step": 35820 + }, + { + "epoch": 0.71644, + "grad_norm": 0.04815095663070679, + "learning_rate": 4.511522451264258e-06, + "loss": 0.2046, + "step": 35822 + }, + { + "epoch": 0.71648, + "grad_norm": 0.16266728937625885, + "learning_rate": 4.510355337012294e-06, + "loss": 0.0019, + "step": 35824 + }, + { + "epoch": 0.71652, + "grad_norm": 0.017872437834739685, + "learning_rate": 4.509188329783779e-06, + "loss": 0.0005, + "step": 35826 + }, + { + "epoch": 0.71656, + "grad_norm": 0.01102125272154808, + "learning_rate": 4.508021429601463e-06, + "loss": 0.0093, + "step": 35828 + }, + { + "epoch": 0.7166, + "grad_norm": 0.29978978633880615, + "learning_rate": 4.506854636488103e-06, + "loss": 0.0112, + "step": 35830 + }, + { + "epoch": 0.71664, + "grad_norm": 0.11824386566877365, + "learning_rate": 4.505687950466434e-06, + "loss": 0.0019, + "step": 35832 + }, + { + "epoch": 0.71668, + "grad_norm": 0.5553231835365295, + "learning_rate": 4.504521371559207e-06, + "loss": 0.008, + "step": 35834 + }, + { + "epoch": 0.71672, + "grad_norm": 11.647028923034668, + "learning_rate": 4.503354899789166e-06, + "loss": 0.1522, + "step": 35836 + }, + { + "epoch": 0.71676, + "grad_norm": 0.06951744109392166, + "learning_rate": 4.5021885351790554e-06, + "loss": 0.001, + "step": 35838 + }, + { + "epoch": 0.7168, + "grad_norm": 1.6694493293762207, + "learning_rate": 4.501022277751602e-06, + "loss": 0.3373, + "step": 35840 + }, + { + "epoch": 0.71684, + "grad_norm": 5.921212196350098, + "learning_rate": 4.499856127529557e-06, + "loss": 0.0878, + "step": 35842 + }, + { + "epoch": 0.71688, + "grad_norm": 0.32345834374427795, + "learning_rate": 4.498690084535645e-06, + "loss": 0.0046, + "step": 35844 + }, + { + "epoch": 0.71692, + "grad_norm": 0.04020804911851883, + "learning_rate": 4.497524148792602e-06, + "loss": 0.0011, + "step": 35846 + }, + { + "epoch": 0.71696, + "grad_norm": 0.17438650131225586, + "learning_rate": 4.4963583203231594e-06, + "loss": 0.0167, + "step": 35848 + }, + { + "epoch": 0.717, + "grad_norm": 0.1504935920238495, + "learning_rate": 4.495192599150045e-06, + "loss": 0.0284, + "step": 35850 + }, + { + "epoch": 0.71704, + "grad_norm": 0.15746933221817017, + "learning_rate": 4.494026985295988e-06, + "loss": 0.0019, + "step": 35852 + }, + { + "epoch": 0.71708, + "grad_norm": 0.6221498847007751, + "learning_rate": 4.4928614787837046e-06, + "loss": 0.0072, + "step": 35854 + }, + { + "epoch": 0.71712, + "grad_norm": 0.06186533719301224, + "learning_rate": 4.491696079635923e-06, + "loss": 0.0589, + "step": 35856 + }, + { + "epoch": 0.71716, + "grad_norm": 0.05898040160536766, + "learning_rate": 4.490530787875362e-06, + "loss": 0.0217, + "step": 35858 + }, + { + "epoch": 0.7172, + "grad_norm": 0.20171163976192474, + "learning_rate": 4.48936560352474e-06, + "loss": 0.0024, + "step": 35860 + }, + { + "epoch": 0.71724, + "grad_norm": 0.00365906092338264, + "learning_rate": 4.488200526606773e-06, + "loss": 0.0117, + "step": 35862 + }, + { + "epoch": 0.71728, + "grad_norm": 10.296208381652832, + "learning_rate": 4.487035557144178e-06, + "loss": 0.1273, + "step": 35864 + }, + { + "epoch": 0.71732, + "grad_norm": 0.2753579020500183, + "learning_rate": 4.4858706951596596e-06, + "loss": 0.0194, + "step": 35866 + }, + { + "epoch": 0.71736, + "grad_norm": 0.29169297218322754, + "learning_rate": 4.484705940675929e-06, + "loss": 0.0057, + "step": 35868 + }, + { + "epoch": 0.7174, + "grad_norm": 2.7735843658447266, + "learning_rate": 4.483541293715699e-06, + "loss": 0.036, + "step": 35870 + }, + { + "epoch": 0.71744, + "grad_norm": 0.003130729775875807, + "learning_rate": 4.482376754301668e-06, + "loss": 0.0003, + "step": 35872 + }, + { + "epoch": 0.71748, + "grad_norm": 0.10301042348146439, + "learning_rate": 4.481212322456548e-06, + "loss": 0.0014, + "step": 35874 + }, + { + "epoch": 0.71752, + "grad_norm": 0.208500936627388, + "learning_rate": 4.480047998203029e-06, + "loss": 0.0032, + "step": 35876 + }, + { + "epoch": 0.71756, + "grad_norm": 0.28557056188583374, + "learning_rate": 4.478883781563822e-06, + "loss": 0.0046, + "step": 35878 + }, + { + "epoch": 0.7176, + "grad_norm": 0.016378650441765785, + "learning_rate": 4.477719672561615e-06, + "loss": 0.0003, + "step": 35880 + }, + { + "epoch": 0.71764, + "grad_norm": 15.912352561950684, + "learning_rate": 4.476555671219105e-06, + "loss": 0.2271, + "step": 35882 + }, + { + "epoch": 0.71768, + "grad_norm": 7.73808479309082, + "learning_rate": 4.475391777558991e-06, + "loss": 0.1201, + "step": 35884 + }, + { + "epoch": 0.71772, + "grad_norm": 0.03925720974802971, + "learning_rate": 4.47422799160395e-06, + "loss": 0.0014, + "step": 35886 + }, + { + "epoch": 0.71776, + "grad_norm": 0.4739120900630951, + "learning_rate": 4.4730643133766874e-06, + "loss": 0.0057, + "step": 35888 + }, + { + "epoch": 0.7178, + "grad_norm": 0.12643268704414368, + "learning_rate": 4.471900742899876e-06, + "loss": 0.0149, + "step": 35890 + }, + { + "epoch": 0.71784, + "grad_norm": 10.801959991455078, + "learning_rate": 4.4707372801962075e-06, + "loss": 0.1004, + "step": 35892 + }, + { + "epoch": 0.71788, + "grad_norm": 0.008273271843791008, + "learning_rate": 4.469573925288362e-06, + "loss": 0.0045, + "step": 35894 + }, + { + "epoch": 0.71792, + "grad_norm": 0.020040031522512436, + "learning_rate": 4.468410678199023e-06, + "loss": 0.0012, + "step": 35896 + }, + { + "epoch": 0.71796, + "grad_norm": 1.7131773233413696, + "learning_rate": 4.467247538950863e-06, + "loss": 0.1023, + "step": 35898 + }, + { + "epoch": 0.718, + "grad_norm": 0.2093469649553299, + "learning_rate": 4.46608450756656e-06, + "loss": 0.0031, + "step": 35900 + }, + { + "epoch": 0.71804, + "grad_norm": 0.06926066428422928, + "learning_rate": 4.46492158406879e-06, + "loss": 0.0014, + "step": 35902 + }, + { + "epoch": 0.71808, + "grad_norm": 0.03821684792637825, + "learning_rate": 4.463758768480222e-06, + "loss": 0.0054, + "step": 35904 + }, + { + "epoch": 0.71812, + "grad_norm": 0.07885520160198212, + "learning_rate": 4.462596060823531e-06, + "loss": 0.0049, + "step": 35906 + }, + { + "epoch": 0.71816, + "grad_norm": 0.39290744066238403, + "learning_rate": 4.461433461121374e-06, + "loss": 0.0129, + "step": 35908 + }, + { + "epoch": 0.7182, + "grad_norm": 0.272808313369751, + "learning_rate": 4.4602709693964296e-06, + "loss": 0.0032, + "step": 35910 + }, + { + "epoch": 0.71824, + "grad_norm": 0.0479113943874836, + "learning_rate": 4.459108585671351e-06, + "loss": 0.0018, + "step": 35912 + }, + { + "epoch": 0.71828, + "grad_norm": 0.297632098197937, + "learning_rate": 4.4579463099688035e-06, + "loss": 0.0061, + "step": 35914 + }, + { + "epoch": 0.71832, + "grad_norm": 0.14468397200107574, + "learning_rate": 4.456784142311445e-06, + "loss": 0.002, + "step": 35916 + }, + { + "epoch": 0.71836, + "grad_norm": 0.1913263350725174, + "learning_rate": 4.455622082721934e-06, + "loss": 0.0026, + "step": 35918 + }, + { + "epoch": 0.7184, + "grad_norm": 0.028052877634763718, + "learning_rate": 4.4544601312229295e-06, + "loss": 0.002, + "step": 35920 + }, + { + "epoch": 0.71844, + "grad_norm": 0.023752253502607346, + "learning_rate": 4.453298287837076e-06, + "loss": 0.001, + "step": 35922 + }, + { + "epoch": 0.71848, + "grad_norm": 0.0261703934520483, + "learning_rate": 4.4521365525870274e-06, + "loss": 0.0004, + "step": 35924 + }, + { + "epoch": 0.71852, + "grad_norm": 0.0718899518251419, + "learning_rate": 4.450974925495433e-06, + "loss": 0.0048, + "step": 35926 + }, + { + "epoch": 0.71856, + "grad_norm": 0.15391510725021362, + "learning_rate": 4.449813406584943e-06, + "loss": 0.0017, + "step": 35928 + }, + { + "epoch": 0.7186, + "grad_norm": 0.20192356407642365, + "learning_rate": 4.44865199587819e-06, + "loss": 0.0024, + "step": 35930 + }, + { + "epoch": 0.71864, + "grad_norm": 0.13417261838912964, + "learning_rate": 4.447490693397834e-06, + "loss": 0.0061, + "step": 35932 + }, + { + "epoch": 0.71868, + "grad_norm": 3.404914617538452, + "learning_rate": 4.446329499166499e-06, + "loss": 0.3882, + "step": 35934 + }, + { + "epoch": 0.71872, + "grad_norm": 10.409333229064941, + "learning_rate": 4.445168413206832e-06, + "loss": 0.1532, + "step": 35936 + }, + { + "epoch": 0.71876, + "grad_norm": 0.3159799873828888, + "learning_rate": 4.4440074355414685e-06, + "loss": 0.0052, + "step": 35938 + }, + { + "epoch": 0.7188, + "grad_norm": 0.01219028141349554, + "learning_rate": 4.442846566193034e-06, + "loss": 0.0046, + "step": 35940 + }, + { + "epoch": 0.71884, + "grad_norm": 0.037856824696063995, + "learning_rate": 4.441685805184174e-06, + "loss": 0.0006, + "step": 35942 + }, + { + "epoch": 0.71888, + "grad_norm": 0.044569239020347595, + "learning_rate": 4.440525152537507e-06, + "loss": 0.0015, + "step": 35944 + }, + { + "epoch": 0.71892, + "grad_norm": 0.8599595427513123, + "learning_rate": 4.439364608275665e-06, + "loss": 0.0073, + "step": 35946 + }, + { + "epoch": 0.71896, + "grad_norm": 0.08764898777008057, + "learning_rate": 4.438204172421271e-06, + "loss": 0.0011, + "step": 35948 + }, + { + "epoch": 0.719, + "grad_norm": 1.0507032871246338, + "learning_rate": 4.437043844996952e-06, + "loss": 0.0138, + "step": 35950 + }, + { + "epoch": 0.71904, + "grad_norm": 11.188591003417969, + "learning_rate": 4.43588362602533e-06, + "loss": 0.2517, + "step": 35952 + }, + { + "epoch": 0.71908, + "grad_norm": 0.6768333315849304, + "learning_rate": 4.434723515529018e-06, + "loss": 0.0106, + "step": 35954 + }, + { + "epoch": 0.71912, + "grad_norm": 0.02211754582822323, + "learning_rate": 4.433563513530635e-06, + "loss": 0.4301, + "step": 35956 + }, + { + "epoch": 0.71916, + "grad_norm": 0.17140555381774902, + "learning_rate": 4.432403620052799e-06, + "loss": 0.0322, + "step": 35958 + }, + { + "epoch": 0.7192, + "grad_norm": 0.31434571743011475, + "learning_rate": 4.4312438351181246e-06, + "loss": 0.05, + "step": 35960 + }, + { + "epoch": 0.71924, + "grad_norm": 0.029241563752293587, + "learning_rate": 4.43008415874921e-06, + "loss": 0.0009, + "step": 35962 + }, + { + "epoch": 0.71928, + "grad_norm": 6.058168411254883, + "learning_rate": 4.428924590968679e-06, + "loss": 0.0722, + "step": 35964 + }, + { + "epoch": 0.71932, + "grad_norm": 0.21339915692806244, + "learning_rate": 4.427765131799129e-06, + "loss": 0.0024, + "step": 35966 + }, + { + "epoch": 0.71936, + "grad_norm": 0.052867140620946884, + "learning_rate": 4.426605781263166e-06, + "loss": 0.0009, + "step": 35968 + }, + { + "epoch": 0.7194, + "grad_norm": 3.6610026359558105, + "learning_rate": 4.425446539383394e-06, + "loss": 0.0527, + "step": 35970 + }, + { + "epoch": 0.71944, + "grad_norm": 0.27237024903297424, + "learning_rate": 4.424287406182409e-06, + "loss": 0.0065, + "step": 35972 + }, + { + "epoch": 0.71948, + "grad_norm": 0.3287094235420227, + "learning_rate": 4.423128381682817e-06, + "loss": 0.0029, + "step": 35974 + }, + { + "epoch": 0.71952, + "grad_norm": 0.18565978109836578, + "learning_rate": 4.421969465907201e-06, + "loss": 0.0035, + "step": 35976 + }, + { + "epoch": 0.71956, + "grad_norm": 0.016825182363390923, + "learning_rate": 4.4208106588781695e-06, + "loss": 0.0052, + "step": 35978 + }, + { + "epoch": 0.7196, + "grad_norm": 0.05184777081012726, + "learning_rate": 4.419651960618302e-06, + "loss": 0.0043, + "step": 35980 + }, + { + "epoch": 0.71964, + "grad_norm": 0.004592935089021921, + "learning_rate": 4.418493371150193e-06, + "loss": 0.0716, + "step": 35982 + }, + { + "epoch": 0.71968, + "grad_norm": 1.156174898147583, + "learning_rate": 4.41733489049643e-06, + "loss": 0.0149, + "step": 35984 + }, + { + "epoch": 0.71972, + "grad_norm": 0.0033629806712269783, + "learning_rate": 4.4161765186796e-06, + "loss": 0.0149, + "step": 35986 + }, + { + "epoch": 0.71976, + "grad_norm": 0.07284032553434372, + "learning_rate": 4.41501825572228e-06, + "loss": 0.0309, + "step": 35988 + }, + { + "epoch": 0.7198, + "grad_norm": 0.03793435916304588, + "learning_rate": 4.413860101647055e-06, + "loss": 0.0006, + "step": 35990 + }, + { + "epoch": 0.71984, + "grad_norm": 0.004840437322854996, + "learning_rate": 4.412702056476503e-06, + "loss": 0.0009, + "step": 35992 + }, + { + "epoch": 0.71988, + "grad_norm": 0.06400870531797409, + "learning_rate": 4.411544120233201e-06, + "loss": 0.0031, + "step": 35994 + }, + { + "epoch": 0.71992, + "grad_norm": 0.03510703891515732, + "learning_rate": 4.410386292939727e-06, + "loss": 0.0005, + "step": 35996 + }, + { + "epoch": 0.71996, + "grad_norm": 0.17143741250038147, + "learning_rate": 4.409228574618642e-06, + "loss": 0.0023, + "step": 35998 + }, + { + "epoch": 0.72, + "grad_norm": 0.04256359115242958, + "learning_rate": 4.408070965292534e-06, + "loss": 0.0019, + "step": 36000 + }, + { + "epoch": 0.72004, + "grad_norm": 0.013988692313432693, + "learning_rate": 4.406913464983955e-06, + "loss": 0.0006, + "step": 36002 + }, + { + "epoch": 0.72008, + "grad_norm": 0.1428169161081314, + "learning_rate": 4.405756073715479e-06, + "loss": 0.0018, + "step": 36004 + }, + { + "epoch": 0.72012, + "grad_norm": 0.0410158596932888, + "learning_rate": 4.404598791509672e-06, + "loss": 0.0006, + "step": 36006 + }, + { + "epoch": 0.72016, + "grad_norm": 0.9169901609420776, + "learning_rate": 4.403441618389084e-06, + "loss": 0.01, + "step": 36008 + }, + { + "epoch": 0.7202, + "grad_norm": 5.656311988830566, + "learning_rate": 4.402284554376292e-06, + "loss": 0.0429, + "step": 36010 + }, + { + "epoch": 0.72024, + "grad_norm": 0.37295910716056824, + "learning_rate": 4.401127599493839e-06, + "loss": 0.0035, + "step": 36012 + }, + { + "epoch": 0.72028, + "grad_norm": 2.1135575771331787, + "learning_rate": 4.399970753764287e-06, + "loss": 0.0186, + "step": 36014 + }, + { + "epoch": 0.72032, + "grad_norm": 0.002745369216427207, + "learning_rate": 4.3988140172101875e-06, + "loss": 0.0001, + "step": 36016 + }, + { + "epoch": 0.72036, + "grad_norm": 0.7475999593734741, + "learning_rate": 4.397657389854096e-06, + "loss": 0.0128, + "step": 36018 + }, + { + "epoch": 0.7204, + "grad_norm": 12.850484848022461, + "learning_rate": 4.3965008717185555e-06, + "loss": 0.1915, + "step": 36020 + }, + { + "epoch": 0.72044, + "grad_norm": 0.09373635053634644, + "learning_rate": 4.395344462826115e-06, + "loss": 0.0013, + "step": 36022 + }, + { + "epoch": 0.72048, + "grad_norm": 0.029542161151766777, + "learning_rate": 4.3941881631993184e-06, + "loss": 0.0012, + "step": 36024 + }, + { + "epoch": 0.72052, + "grad_norm": 0.11684658378362656, + "learning_rate": 4.393031972860712e-06, + "loss": 0.007, + "step": 36026 + }, + { + "epoch": 0.72056, + "grad_norm": 0.024301232770085335, + "learning_rate": 4.391875891832836e-06, + "loss": 0.0052, + "step": 36028 + }, + { + "epoch": 0.7206, + "grad_norm": 0.7791779637336731, + "learning_rate": 4.39071992013822e-06, + "loss": 0.0093, + "step": 36030 + }, + { + "epoch": 0.72064, + "grad_norm": 0.25875550508499146, + "learning_rate": 4.389564057799413e-06, + "loss": 0.0085, + "step": 36032 + }, + { + "epoch": 0.72068, + "grad_norm": 0.08306992799043655, + "learning_rate": 4.388408304838941e-06, + "loss": 0.0023, + "step": 36034 + }, + { + "epoch": 0.72072, + "grad_norm": 0.3983538746833801, + "learning_rate": 4.3872526612793375e-06, + "loss": 0.0161, + "step": 36036 + }, + { + "epoch": 0.72076, + "grad_norm": 0.23614732921123505, + "learning_rate": 4.386097127143133e-06, + "loss": 0.0029, + "step": 36038 + }, + { + "epoch": 0.7208, + "grad_norm": 0.08229189366102219, + "learning_rate": 4.384941702452856e-06, + "loss": 0.0016, + "step": 36040 + }, + { + "epoch": 0.72084, + "grad_norm": 0.019187767058610916, + "learning_rate": 4.3837863872310334e-06, + "loss": 0.0026, + "step": 36042 + }, + { + "epoch": 0.72088, + "grad_norm": 0.2567756474018097, + "learning_rate": 4.3826311815001845e-06, + "loss": 0.0026, + "step": 36044 + }, + { + "epoch": 0.72092, + "grad_norm": 0.36548396944999695, + "learning_rate": 4.381476085282832e-06, + "loss": 0.0041, + "step": 36046 + }, + { + "epoch": 0.72096, + "grad_norm": 0.0019759417045861483, + "learning_rate": 4.380321098601496e-06, + "loss": 0.0001, + "step": 36048 + }, + { + "epoch": 0.721, + "grad_norm": 0.014667623676359653, + "learning_rate": 4.379166221478697e-06, + "loss": 0.001, + "step": 36050 + }, + { + "epoch": 0.72104, + "grad_norm": 0.008683623746037483, + "learning_rate": 4.378011453936939e-06, + "loss": 0.0003, + "step": 36052 + }, + { + "epoch": 0.72108, + "grad_norm": 0.03899886831641197, + "learning_rate": 4.37685679599875e-06, + "loss": 0.0016, + "step": 36054 + }, + { + "epoch": 0.72112, + "grad_norm": 1.1409915685653687, + "learning_rate": 4.375702247686628e-06, + "loss": 0.099, + "step": 36056 + }, + { + "epoch": 0.72116, + "grad_norm": 0.0032470638398081064, + "learning_rate": 4.374547809023088e-06, + "loss": 0.0, + "step": 36058 + }, + { + "epoch": 0.7212, + "grad_norm": 0.001804051105864346, + "learning_rate": 4.373393480030637e-06, + "loss": 0.0027, + "step": 36060 + }, + { + "epoch": 0.72124, + "grad_norm": 0.18788032233715057, + "learning_rate": 4.372239260731769e-06, + "loss": 0.0027, + "step": 36062 + }, + { + "epoch": 0.72128, + "grad_norm": 0.23962090909481049, + "learning_rate": 4.371085151149002e-06, + "loss": 0.0029, + "step": 36064 + }, + { + "epoch": 0.72132, + "grad_norm": 0.08172918856143951, + "learning_rate": 4.369931151304824e-06, + "loss": 0.0037, + "step": 36066 + }, + { + "epoch": 0.72136, + "grad_norm": 0.5683065056800842, + "learning_rate": 4.368777261221737e-06, + "loss": 0.0064, + "step": 36068 + }, + { + "epoch": 0.7214, + "grad_norm": 0.018867097795009613, + "learning_rate": 4.367623480922236e-06, + "loss": 0.0012, + "step": 36070 + }, + { + "epoch": 0.72144, + "grad_norm": 0.012214038521051407, + "learning_rate": 4.366469810428816e-06, + "loss": 0.0213, + "step": 36072 + }, + { + "epoch": 0.72148, + "grad_norm": 0.2017282396554947, + "learning_rate": 4.36531624976397e-06, + "loss": 0.0038, + "step": 36074 + }, + { + "epoch": 0.72152, + "grad_norm": 0.03271455690264702, + "learning_rate": 4.364162798950181e-06, + "loss": 0.0004, + "step": 36076 + }, + { + "epoch": 0.72156, + "grad_norm": 0.39483609795570374, + "learning_rate": 4.363009458009941e-06, + "loss": 0.0032, + "step": 36078 + }, + { + "epoch": 0.7216, + "grad_norm": 0.2306974232196808, + "learning_rate": 4.361856226965733e-06, + "loss": 0.0024, + "step": 36080 + }, + { + "epoch": 0.72164, + "grad_norm": 0.06212737411260605, + "learning_rate": 4.360703105840041e-06, + "loss": 0.0006, + "step": 36082 + }, + { + "epoch": 0.72168, + "grad_norm": 0.01524145994335413, + "learning_rate": 4.359550094655344e-06, + "loss": 0.0007, + "step": 36084 + }, + { + "epoch": 0.72172, + "grad_norm": 0.01441393792629242, + "learning_rate": 4.358397193434126e-06, + "loss": 0.0002, + "step": 36086 + }, + { + "epoch": 0.72176, + "grad_norm": 0.14166679978370667, + "learning_rate": 4.357244402198856e-06, + "loss": 0.0022, + "step": 36088 + }, + { + "epoch": 0.7218, + "grad_norm": 0.3050740361213684, + "learning_rate": 4.356091720972011e-06, + "loss": 0.0054, + "step": 36090 + }, + { + "epoch": 0.72184, + "grad_norm": 0.052828237414360046, + "learning_rate": 4.354939149776068e-06, + "loss": 0.0016, + "step": 36092 + }, + { + "epoch": 0.72188, + "grad_norm": 0.022324735298752785, + "learning_rate": 4.353786688633485e-06, + "loss": 0.0047, + "step": 36094 + }, + { + "epoch": 0.72192, + "grad_norm": 0.04275790974497795, + "learning_rate": 4.352634337566744e-06, + "loss": 0.0009, + "step": 36096 + }, + { + "epoch": 0.72196, + "grad_norm": 0.016912119463086128, + "learning_rate": 4.3514820965982975e-06, + "loss": 0.0006, + "step": 36098 + }, + { + "epoch": 0.722, + "grad_norm": 0.08396325260400772, + "learning_rate": 4.350329965750622e-06, + "loss": 0.0017, + "step": 36100 + }, + { + "epoch": 0.72204, + "grad_norm": 0.02682013437151909, + "learning_rate": 4.349177945046169e-06, + "loss": 0.0047, + "step": 36102 + }, + { + "epoch": 0.72208, + "grad_norm": 0.3171871304512024, + "learning_rate": 4.348026034507402e-06, + "loss": 0.0037, + "step": 36104 + }, + { + "epoch": 0.72212, + "grad_norm": 0.026643341407179832, + "learning_rate": 4.3468742341567775e-06, + "loss": 0.0003, + "step": 36106 + }, + { + "epoch": 0.72216, + "grad_norm": 0.017665943130850792, + "learning_rate": 4.345722544016753e-06, + "loss": 0.0025, + "step": 36108 + }, + { + "epoch": 0.7222, + "grad_norm": 14.213312149047852, + "learning_rate": 4.344570964109775e-06, + "loss": 0.2045, + "step": 36110 + }, + { + "epoch": 0.72224, + "grad_norm": 0.011292390525341034, + "learning_rate": 4.343419494458298e-06, + "loss": 0.001, + "step": 36112 + }, + { + "epoch": 0.72228, + "grad_norm": 0.12433134019374847, + "learning_rate": 4.342268135084769e-06, + "loss": 0.0012, + "step": 36114 + }, + { + "epoch": 0.72232, + "grad_norm": 0.04358460009098053, + "learning_rate": 4.341116886011636e-06, + "loss": 0.0034, + "step": 36116 + }, + { + "epoch": 0.72236, + "grad_norm": 0.003148618619889021, + "learning_rate": 4.339965747261347e-06, + "loss": 0.0072, + "step": 36118 + }, + { + "epoch": 0.7224, + "grad_norm": 12.63055419921875, + "learning_rate": 4.338814718856333e-06, + "loss": 0.1501, + "step": 36120 + }, + { + "epoch": 0.72244, + "grad_norm": 0.023153942078351974, + "learning_rate": 4.337663800819046e-06, + "loss": 0.0023, + "step": 36122 + }, + { + "epoch": 0.72248, + "grad_norm": 0.6754043698310852, + "learning_rate": 4.336512993171916e-06, + "loss": 0.0082, + "step": 36124 + }, + { + "epoch": 0.72252, + "grad_norm": 0.23813621699810028, + "learning_rate": 4.335362295937381e-06, + "loss": 0.0044, + "step": 36126 + }, + { + "epoch": 0.72256, + "grad_norm": 9.846617698669434, + "learning_rate": 4.3342117091378774e-06, + "loss": 0.2555, + "step": 36128 + }, + { + "epoch": 0.7226, + "grad_norm": 0.011435150168836117, + "learning_rate": 4.3330612327958265e-06, + "loss": 0.0029, + "step": 36130 + }, + { + "epoch": 0.72264, + "grad_norm": 0.027149928733706474, + "learning_rate": 4.331910866933673e-06, + "loss": 0.001, + "step": 36132 + }, + { + "epoch": 0.72268, + "grad_norm": 0.09199237823486328, + "learning_rate": 4.33076061157383e-06, + "loss": 0.0011, + "step": 36134 + }, + { + "epoch": 0.72272, + "grad_norm": 0.08758186548948288, + "learning_rate": 4.329610466738728e-06, + "loss": 0.0011, + "step": 36136 + }, + { + "epoch": 0.72276, + "grad_norm": 12.492571830749512, + "learning_rate": 4.32846043245079e-06, + "loss": 0.1762, + "step": 36138 + }, + { + "epoch": 0.7228, + "grad_norm": 0.4151621460914612, + "learning_rate": 4.3273105087324375e-06, + "loss": 0.0045, + "step": 36140 + }, + { + "epoch": 0.72284, + "grad_norm": 0.8958083391189575, + "learning_rate": 4.3261606956060855e-06, + "loss": 0.0054, + "step": 36142 + }, + { + "epoch": 0.72288, + "grad_norm": 17.050697326660156, + "learning_rate": 4.325010993094151e-06, + "loss": 0.2512, + "step": 36144 + }, + { + "epoch": 0.72292, + "grad_norm": 0.06486823409795761, + "learning_rate": 4.323861401219048e-06, + "loss": 0.0015, + "step": 36146 + }, + { + "epoch": 0.72296, + "grad_norm": 0.01764885149896145, + "learning_rate": 4.322711920003191e-06, + "loss": 0.0007, + "step": 36148 + }, + { + "epoch": 0.723, + "grad_norm": 0.02646748535335064, + "learning_rate": 4.321562549468991e-06, + "loss": 0.0003, + "step": 36150 + }, + { + "epoch": 0.72304, + "grad_norm": 0.003847447456791997, + "learning_rate": 4.320413289638844e-06, + "loss": 0.0583, + "step": 36152 + }, + { + "epoch": 0.72308, + "grad_norm": 0.01128915511071682, + "learning_rate": 4.3192641405351734e-06, + "loss": 0.0002, + "step": 36154 + }, + { + "epoch": 0.72312, + "grad_norm": 0.01226562075316906, + "learning_rate": 4.318115102180368e-06, + "loss": 0.001, + "step": 36156 + }, + { + "epoch": 0.72316, + "grad_norm": 9.372428894042969, + "learning_rate": 4.316966174596834e-06, + "loss": 0.106, + "step": 36158 + }, + { + "epoch": 0.7232, + "grad_norm": 0.004310740157961845, + "learning_rate": 4.315817357806974e-06, + "loss": 0.0002, + "step": 36160 + }, + { + "epoch": 0.72324, + "grad_norm": 0.4312596917152405, + "learning_rate": 4.314668651833174e-06, + "loss": 0.0047, + "step": 36162 + }, + { + "epoch": 0.72328, + "grad_norm": 0.02979811653494835, + "learning_rate": 4.313520056697843e-06, + "loss": 0.0013, + "step": 36164 + }, + { + "epoch": 0.72332, + "grad_norm": 0.11521825939416885, + "learning_rate": 4.312371572423362e-06, + "loss": 0.0012, + "step": 36166 + }, + { + "epoch": 0.72336, + "grad_norm": 0.194438174366951, + "learning_rate": 4.311223199032125e-06, + "loss": 0.0029, + "step": 36168 + }, + { + "epoch": 0.7234, + "grad_norm": 0.019067103043198586, + "learning_rate": 4.310074936546521e-06, + "loss": 0.0074, + "step": 36170 + }, + { + "epoch": 0.72344, + "grad_norm": 0.015227299183607101, + "learning_rate": 4.308926784988936e-06, + "loss": 0.0338, + "step": 36172 + }, + { + "epoch": 0.72348, + "grad_norm": 0.019881892949342728, + "learning_rate": 4.307778744381753e-06, + "loss": 0.0046, + "step": 36174 + }, + { + "epoch": 0.72352, + "grad_norm": 0.12256456166505814, + "learning_rate": 4.306630814747358e-06, + "loss": 0.0038, + "step": 36176 + }, + { + "epoch": 0.72356, + "grad_norm": 0.004903374705463648, + "learning_rate": 4.3054829961081215e-06, + "loss": 0.0015, + "step": 36178 + }, + { + "epoch": 0.7236, + "grad_norm": 0.06049567833542824, + "learning_rate": 4.304335288486426e-06, + "loss": 0.0218, + "step": 36180 + }, + { + "epoch": 0.72364, + "grad_norm": 0.004148684907704592, + "learning_rate": 4.303187691904651e-06, + "loss": 0.0006, + "step": 36182 + }, + { + "epoch": 0.72368, + "grad_norm": 0.38116949796676636, + "learning_rate": 4.302040206385158e-06, + "loss": 0.012, + "step": 36184 + }, + { + "epoch": 0.72372, + "grad_norm": 0.030345745384693146, + "learning_rate": 4.300892831950329e-06, + "loss": 0.0027, + "step": 36186 + }, + { + "epoch": 0.72376, + "grad_norm": 0.3794231116771698, + "learning_rate": 4.299745568622524e-06, + "loss": 0.0043, + "step": 36188 + }, + { + "epoch": 0.7238, + "grad_norm": 0.09265456348657608, + "learning_rate": 4.29859841642412e-06, + "loss": 0.0019, + "step": 36190 + }, + { + "epoch": 0.72384, + "grad_norm": 0.08455861359834671, + "learning_rate": 4.297451375377471e-06, + "loss": 0.0047, + "step": 36192 + }, + { + "epoch": 0.72388, + "grad_norm": 0.015279523096978664, + "learning_rate": 4.296304445504945e-06, + "loss": 0.0009, + "step": 36194 + }, + { + "epoch": 0.72392, + "grad_norm": 0.06276126950979233, + "learning_rate": 4.295157626828903e-06, + "loss": 0.001, + "step": 36196 + }, + { + "epoch": 0.72396, + "grad_norm": 0.03430837392807007, + "learning_rate": 4.294010919371696e-06, + "loss": 0.017, + "step": 36198 + }, + { + "epoch": 0.724, + "grad_norm": 0.49767065048217773, + "learning_rate": 4.292864323155684e-06, + "loss": 0.0046, + "step": 36200 + }, + { + "epoch": 0.72404, + "grad_norm": 0.2243293821811676, + "learning_rate": 4.291717838203221e-06, + "loss": 0.0025, + "step": 36202 + }, + { + "epoch": 0.72408, + "grad_norm": 0.02650454454123974, + "learning_rate": 4.290571464536657e-06, + "loss": 0.0005, + "step": 36204 + }, + { + "epoch": 0.72412, + "grad_norm": 0.07710383087396622, + "learning_rate": 4.289425202178341e-06, + "loss": 0.0012, + "step": 36206 + }, + { + "epoch": 0.72416, + "grad_norm": 0.02595711499452591, + "learning_rate": 4.288279051150626e-06, + "loss": 0.0019, + "step": 36208 + }, + { + "epoch": 0.7242, + "grad_norm": 0.08365406095981598, + "learning_rate": 4.287133011475847e-06, + "loss": 0.0011, + "step": 36210 + }, + { + "epoch": 0.72424, + "grad_norm": 0.14076781272888184, + "learning_rate": 4.2859870831763505e-06, + "loss": 0.0014, + "step": 36212 + }, + { + "epoch": 0.72428, + "grad_norm": 0.013473015278577805, + "learning_rate": 4.284841266274482e-06, + "loss": 0.0036, + "step": 36214 + }, + { + "epoch": 0.72432, + "grad_norm": 0.039292216300964355, + "learning_rate": 4.283695560792568e-06, + "loss": 0.0006, + "step": 36216 + }, + { + "epoch": 0.72436, + "grad_norm": 0.0057335966266691685, + "learning_rate": 4.282549966752958e-06, + "loss": 0.0013, + "step": 36218 + }, + { + "epoch": 0.7244, + "grad_norm": 0.22827404737472534, + "learning_rate": 4.281404484177974e-06, + "loss": 0.0021, + "step": 36220 + }, + { + "epoch": 0.72444, + "grad_norm": 0.1553879976272583, + "learning_rate": 4.2802591130899605e-06, + "loss": 0.0019, + "step": 36222 + }, + { + "epoch": 0.72448, + "grad_norm": 0.08805286884307861, + "learning_rate": 4.279113853511237e-06, + "loss": 0.0023, + "step": 36224 + }, + { + "epoch": 0.72452, + "grad_norm": 0.614973247051239, + "learning_rate": 4.277968705464135e-06, + "loss": 0.0082, + "step": 36226 + }, + { + "epoch": 0.72456, + "grad_norm": 0.00045946621685288846, + "learning_rate": 4.276823668970981e-06, + "loss": 0.0961, + "step": 36228 + }, + { + "epoch": 0.7246, + "grad_norm": 0.32507139444351196, + "learning_rate": 4.275678744054094e-06, + "loss": 0.0027, + "step": 36230 + }, + { + "epoch": 0.72464, + "grad_norm": 0.03253946825861931, + "learning_rate": 4.274533930735796e-06, + "loss": 0.0003, + "step": 36232 + }, + { + "epoch": 0.72468, + "grad_norm": 0.15972496569156647, + "learning_rate": 4.273389229038406e-06, + "loss": 0.0031, + "step": 36234 + }, + { + "epoch": 0.72472, + "grad_norm": 0.004296302795410156, + "learning_rate": 4.272244638984243e-06, + "loss": 0.0024, + "step": 36236 + }, + { + "epoch": 0.72476, + "grad_norm": 0.126490518450737, + "learning_rate": 4.2711001605956185e-06, + "loss": 0.0015, + "step": 36238 + }, + { + "epoch": 0.7248, + "grad_norm": 0.03658302128314972, + "learning_rate": 4.26995579389485e-06, + "loss": 0.0023, + "step": 36240 + }, + { + "epoch": 0.72484, + "grad_norm": 9.268577575683594, + "learning_rate": 4.2688115389042354e-06, + "loss": 0.1502, + "step": 36242 + }, + { + "epoch": 0.72488, + "grad_norm": 7.02674674987793, + "learning_rate": 4.2676673956460975e-06, + "loss": 0.0964, + "step": 36244 + }, + { + "epoch": 0.72492, + "grad_norm": 0.0022018146701157093, + "learning_rate": 4.266523364142732e-06, + "loss": 0.005, + "step": 36246 + }, + { + "epoch": 0.72496, + "grad_norm": 0.01902182213962078, + "learning_rate": 4.265379444416446e-06, + "loss": 0.0046, + "step": 36248 + }, + { + "epoch": 0.725, + "grad_norm": 0.8587008714675903, + "learning_rate": 4.264235636489542e-06, + "loss": 0.0106, + "step": 36250 + }, + { + "epoch": 0.72504, + "grad_norm": 0.021054094657301903, + "learning_rate": 4.263091940384311e-06, + "loss": 0.0105, + "step": 36252 + }, + { + "epoch": 0.72508, + "grad_norm": 0.007931679487228394, + "learning_rate": 4.261948356123063e-06, + "loss": 0.0082, + "step": 36254 + }, + { + "epoch": 0.72512, + "grad_norm": 0.013314995914697647, + "learning_rate": 4.260804883728082e-06, + "loss": 0.0006, + "step": 36256 + }, + { + "epoch": 0.72516, + "grad_norm": 1.4881452322006226, + "learning_rate": 4.259661523221666e-06, + "loss": 0.0162, + "step": 36258 + }, + { + "epoch": 0.7252, + "grad_norm": 0.06552489101886749, + "learning_rate": 4.258518274626103e-06, + "loss": 0.0021, + "step": 36260 + }, + { + "epoch": 0.72524, + "grad_norm": 0.4606167674064636, + "learning_rate": 4.257375137963686e-06, + "loss": 0.0048, + "step": 36262 + }, + { + "epoch": 0.72528, + "grad_norm": 0.09019752591848373, + "learning_rate": 4.256232113256693e-06, + "loss": 0.0012, + "step": 36264 + }, + { + "epoch": 0.72532, + "grad_norm": 0.0068406471982598305, + "learning_rate": 4.255089200527413e-06, + "loss": 0.0092, + "step": 36266 + }, + { + "epoch": 0.72536, + "grad_norm": 0.07192987948656082, + "learning_rate": 4.253946399798126e-06, + "loss": 0.0019, + "step": 36268 + }, + { + "epoch": 0.7254, + "grad_norm": 0.029013054445385933, + "learning_rate": 4.2528037110911126e-06, + "loss": 0.0003, + "step": 36270 + }, + { + "epoch": 0.72544, + "grad_norm": 0.2533039450645447, + "learning_rate": 4.251661134428653e-06, + "loss": 0.0045, + "step": 36272 + }, + { + "epoch": 0.72548, + "grad_norm": 0.40428149700164795, + "learning_rate": 4.250518669833011e-06, + "loss": 0.0042, + "step": 36274 + }, + { + "epoch": 0.72552, + "grad_norm": 0.002867602277547121, + "learning_rate": 4.249376317326477e-06, + "loss": 0.0004, + "step": 36276 + }, + { + "epoch": 0.72556, + "grad_norm": 4.84197473526001, + "learning_rate": 4.2482340769313066e-06, + "loss": 0.0378, + "step": 36278 + }, + { + "epoch": 0.7256, + "grad_norm": 12.536681175231934, + "learning_rate": 4.247091948669775e-06, + "loss": 0.2044, + "step": 36280 + }, + { + "epoch": 0.72564, + "grad_norm": 12.049700736999512, + "learning_rate": 4.245949932564151e-06, + "loss": 0.1625, + "step": 36282 + }, + { + "epoch": 0.72568, + "grad_norm": 0.08241265267133713, + "learning_rate": 4.244808028636689e-06, + "loss": 0.0007, + "step": 36284 + }, + { + "epoch": 0.72572, + "grad_norm": 0.0071081966161727905, + "learning_rate": 4.243666236909664e-06, + "loss": 0.0001, + "step": 36286 + }, + { + "epoch": 0.72576, + "grad_norm": 0.0029363289941102266, + "learning_rate": 4.242524557405325e-06, + "loss": 0.0002, + "step": 36288 + }, + { + "epoch": 0.7258, + "grad_norm": 13.457371711730957, + "learning_rate": 4.2413829901459345e-06, + "loss": 0.2672, + "step": 36290 + }, + { + "epoch": 0.72584, + "grad_norm": 0.19779185950756073, + "learning_rate": 4.2402415351537476e-06, + "loss": 0.002, + "step": 36292 + }, + { + "epoch": 0.72588, + "grad_norm": 0.02787073887884617, + "learning_rate": 4.239100192451017e-06, + "loss": 0.0006, + "step": 36294 + }, + { + "epoch": 0.72592, + "grad_norm": 0.021851293742656708, + "learning_rate": 4.237958962059998e-06, + "loss": 0.0134, + "step": 36296 + }, + { + "epoch": 0.72596, + "grad_norm": 0.2859400510787964, + "learning_rate": 4.236817844002931e-06, + "loss": 0.008, + "step": 36298 + }, + { + "epoch": 0.726, + "grad_norm": 0.02223866805434227, + "learning_rate": 4.235676838302069e-06, + "loss": 0.0014, + "step": 36300 + }, + { + "epoch": 0.72604, + "grad_norm": 0.00275521632283926, + "learning_rate": 4.234535944979653e-06, + "loss": 0.0004, + "step": 36302 + }, + { + "epoch": 0.72608, + "grad_norm": 0.0993347242474556, + "learning_rate": 4.233395164057932e-06, + "loss": 0.0011, + "step": 36304 + }, + { + "epoch": 0.72612, + "grad_norm": 0.014470826834440231, + "learning_rate": 4.232254495559132e-06, + "loss": 0.0002, + "step": 36306 + }, + { + "epoch": 0.72616, + "grad_norm": 0.012519203126430511, + "learning_rate": 4.231113939505509e-06, + "loss": 0.0002, + "step": 36308 + }, + { + "epoch": 0.7262, + "grad_norm": 0.010599869303405285, + "learning_rate": 4.229973495919286e-06, + "loss": 0.0005, + "step": 36310 + }, + { + "epoch": 0.72624, + "grad_norm": 0.06406386941671371, + "learning_rate": 4.2288331648227e-06, + "loss": 0.3737, + "step": 36312 + }, + { + "epoch": 0.72628, + "grad_norm": 0.07474435120820999, + "learning_rate": 4.227692946237982e-06, + "loss": 0.008, + "step": 36314 + }, + { + "epoch": 0.72632, + "grad_norm": 0.018879350274801254, + "learning_rate": 4.226552840187362e-06, + "loss": 0.0003, + "step": 36316 + }, + { + "epoch": 0.72636, + "grad_norm": 0.15596069395542145, + "learning_rate": 4.225412846693071e-06, + "loss": 0.0067, + "step": 36318 + }, + { + "epoch": 0.7264, + "grad_norm": 0.8615396618843079, + "learning_rate": 4.224272965777326e-06, + "loss": 0.0173, + "step": 36320 + }, + { + "epoch": 0.72644, + "grad_norm": 0.00875829253345728, + "learning_rate": 4.223133197462352e-06, + "loss": 0.0189, + "step": 36322 + }, + { + "epoch": 0.72648, + "grad_norm": 0.037944115698337555, + "learning_rate": 4.221993541770372e-06, + "loss": 0.0027, + "step": 36324 + }, + { + "epoch": 0.72652, + "grad_norm": 0.17824649810791016, + "learning_rate": 4.2208539987236015e-06, + "loss": 0.0021, + "step": 36326 + }, + { + "epoch": 0.72656, + "grad_norm": 0.23479311168193817, + "learning_rate": 4.219714568344258e-06, + "loss": 0.0029, + "step": 36328 + }, + { + "epoch": 0.7266, + "grad_norm": 2.1019883155822754, + "learning_rate": 4.218575250654559e-06, + "loss": 0.0255, + "step": 36330 + }, + { + "epoch": 0.72664, + "grad_norm": 0.005333161447197199, + "learning_rate": 4.217436045676707e-06, + "loss": 0.0002, + "step": 36332 + }, + { + "epoch": 0.72668, + "grad_norm": 0.009489958174526691, + "learning_rate": 4.216296953432917e-06, + "loss": 0.0022, + "step": 36334 + }, + { + "epoch": 0.72672, + "grad_norm": 0.06573789566755295, + "learning_rate": 4.2151579739453995e-06, + "loss": 0.0006, + "step": 36336 + }, + { + "epoch": 0.72676, + "grad_norm": 0.0030418739188462496, + "learning_rate": 4.214019107236348e-06, + "loss": 0.0377, + "step": 36338 + }, + { + "epoch": 0.7268, + "grad_norm": 0.6088114380836487, + "learning_rate": 4.21288035332798e-06, + "loss": 0.0072, + "step": 36340 + }, + { + "epoch": 0.72684, + "grad_norm": 0.7282350063323975, + "learning_rate": 4.211741712242482e-06, + "loss": 0.7462, + "step": 36342 + }, + { + "epoch": 0.72688, + "grad_norm": 0.0027097281999886036, + "learning_rate": 4.2106031840020656e-06, + "loss": 0.0002, + "step": 36344 + }, + { + "epoch": 0.72692, + "grad_norm": 0.0131676709279418, + "learning_rate": 4.209464768628918e-06, + "loss": 0.0002, + "step": 36346 + }, + { + "epoch": 0.72696, + "grad_norm": 14.349725723266602, + "learning_rate": 4.208326466145236e-06, + "loss": 0.3197, + "step": 36348 + }, + { + "epoch": 0.727, + "grad_norm": 12.074787139892578, + "learning_rate": 4.207188276573214e-06, + "loss": 0.4742, + "step": 36350 + }, + { + "epoch": 0.72704, + "grad_norm": 0.13893453776836395, + "learning_rate": 4.206050199935037e-06, + "loss": 0.0026, + "step": 36352 + }, + { + "epoch": 0.72708, + "grad_norm": 3.172316312789917, + "learning_rate": 4.204912236252893e-06, + "loss": 0.0273, + "step": 36354 + }, + { + "epoch": 0.72712, + "grad_norm": 0.0785023644566536, + "learning_rate": 4.203774385548969e-06, + "loss": 0.001, + "step": 36356 + }, + { + "epoch": 0.72716, + "grad_norm": 0.24675513803958893, + "learning_rate": 4.202636647845446e-06, + "loss": 0.0037, + "step": 36358 + }, + { + "epoch": 0.7272, + "grad_norm": 0.14348670840263367, + "learning_rate": 4.201499023164508e-06, + "loss": 0.0014, + "step": 36360 + }, + { + "epoch": 0.72724, + "grad_norm": 0.05475310981273651, + "learning_rate": 4.2003615115283346e-06, + "loss": 0.0011, + "step": 36362 + }, + { + "epoch": 0.72728, + "grad_norm": 0.03823680803179741, + "learning_rate": 4.199224112959096e-06, + "loss": 0.0005, + "step": 36364 + }, + { + "epoch": 0.72732, + "grad_norm": 0.20389661192893982, + "learning_rate": 4.19808682747897e-06, + "loss": 0.0026, + "step": 36366 + }, + { + "epoch": 0.72736, + "grad_norm": 0.035306189209222794, + "learning_rate": 4.196949655110128e-06, + "loss": 0.0007, + "step": 36368 + }, + { + "epoch": 0.7274, + "grad_norm": 0.06519696116447449, + "learning_rate": 4.19581259587474e-06, + "loss": 0.0013, + "step": 36370 + }, + { + "epoch": 0.72744, + "grad_norm": 0.14268752932548523, + "learning_rate": 4.194675649794977e-06, + "loss": 0.0022, + "step": 36372 + }, + { + "epoch": 0.72748, + "grad_norm": 0.0751107856631279, + "learning_rate": 4.193538816892995e-06, + "loss": 0.0014, + "step": 36374 + }, + { + "epoch": 0.72752, + "grad_norm": 0.6332207322120667, + "learning_rate": 4.192402097190969e-06, + "loss": 0.0069, + "step": 36376 + }, + { + "epoch": 0.72756, + "grad_norm": 0.18100537359714508, + "learning_rate": 4.19126549071105e-06, + "loss": 0.009, + "step": 36378 + }, + { + "epoch": 0.7276, + "grad_norm": 1.0427311658859253, + "learning_rate": 4.190128997475402e-06, + "loss": 0.0176, + "step": 36380 + }, + { + "epoch": 0.72764, + "grad_norm": 0.0199296772480011, + "learning_rate": 4.18899261750618e-06, + "loss": 0.0003, + "step": 36382 + }, + { + "epoch": 0.72768, + "grad_norm": 0.059375472366809845, + "learning_rate": 4.187856350825539e-06, + "loss": 0.0007, + "step": 36384 + }, + { + "epoch": 0.72772, + "grad_norm": 16.595745086669922, + "learning_rate": 4.186720197455634e-06, + "loss": 0.9387, + "step": 36386 + }, + { + "epoch": 0.72776, + "grad_norm": 3.77359938621521, + "learning_rate": 4.185584157418608e-06, + "loss": 0.0524, + "step": 36388 + }, + { + "epoch": 0.7278, + "grad_norm": 17.40479278564453, + "learning_rate": 4.184448230736613e-06, + "loss": 0.8984, + "step": 36390 + }, + { + "epoch": 0.72784, + "grad_norm": 0.05260748416185379, + "learning_rate": 4.183312417431793e-06, + "loss": 0.0097, + "step": 36392 + }, + { + "epoch": 0.72788, + "grad_norm": 0.03297363966703415, + "learning_rate": 4.182176717526295e-06, + "loss": 0.4301, + "step": 36394 + }, + { + "epoch": 0.72792, + "grad_norm": 0.6852316856384277, + "learning_rate": 4.181041131042251e-06, + "loss": 0.0064, + "step": 36396 + }, + { + "epoch": 0.72796, + "grad_norm": 1.5092096328735352, + "learning_rate": 4.179905658001813e-06, + "loss": 0.0197, + "step": 36398 + }, + { + "epoch": 0.728, + "grad_norm": 0.25186023116111755, + "learning_rate": 4.178770298427107e-06, + "loss": 0.0051, + "step": 36400 + }, + { + "epoch": 0.72804, + "grad_norm": 2.3065528869628906, + "learning_rate": 4.177635052340271e-06, + "loss": 0.0319, + "step": 36402 + }, + { + "epoch": 0.72808, + "grad_norm": 0.09081540256738663, + "learning_rate": 4.17649991976344e-06, + "loss": 0.0024, + "step": 36404 + }, + { + "epoch": 0.72812, + "grad_norm": 0.28765514492988586, + "learning_rate": 4.175364900718735e-06, + "loss": 0.0032, + "step": 36406 + }, + { + "epoch": 0.72816, + "grad_norm": 0.017920250073075294, + "learning_rate": 4.174229995228297e-06, + "loss": 0.0002, + "step": 36408 + }, + { + "epoch": 0.7282, + "grad_norm": 0.06242061033844948, + "learning_rate": 4.173095203314241e-06, + "loss": 0.0025, + "step": 36410 + }, + { + "epoch": 0.72824, + "grad_norm": 0.02696925587952137, + "learning_rate": 4.171960524998694e-06, + "loss": 0.0028, + "step": 36412 + }, + { + "epoch": 0.72828, + "grad_norm": 13.453140258789062, + "learning_rate": 4.170825960303776e-06, + "loss": 0.2096, + "step": 36414 + }, + { + "epoch": 0.72832, + "grad_norm": 0.06808508932590485, + "learning_rate": 4.169691509251607e-06, + "loss": 0.0031, + "step": 36416 + }, + { + "epoch": 0.72836, + "grad_norm": 0.006327158305794001, + "learning_rate": 4.168557171864309e-06, + "loss": 0.1379, + "step": 36418 + }, + { + "epoch": 0.7284, + "grad_norm": 1.2700203657150269, + "learning_rate": 4.167422948163986e-06, + "loss": 0.0135, + "step": 36420 + }, + { + "epoch": 0.72844, + "grad_norm": 0.08335117995738983, + "learning_rate": 4.166288838172756e-06, + "loss": 0.0011, + "step": 36422 + }, + { + "epoch": 0.72848, + "grad_norm": 0.08938663452863693, + "learning_rate": 4.165154841912728e-06, + "loss": 0.0015, + "step": 36424 + }, + { + "epoch": 0.72852, + "grad_norm": 0.04497845470905304, + "learning_rate": 4.164020959406016e-06, + "loss": 0.0037, + "step": 36426 + }, + { + "epoch": 0.72856, + "grad_norm": 0.10321192443370819, + "learning_rate": 4.162887190674711e-06, + "loss": 0.0013, + "step": 36428 + }, + { + "epoch": 0.7286, + "grad_norm": 1.9155797958374023, + "learning_rate": 4.161753535740932e-06, + "loss": 0.0681, + "step": 36430 + }, + { + "epoch": 0.72864, + "grad_norm": 0.01797253079712391, + "learning_rate": 4.160619994626771e-06, + "loss": 0.0017, + "step": 36432 + }, + { + "epoch": 0.72868, + "grad_norm": 0.3727301359176636, + "learning_rate": 4.15948656735433e-06, + "loss": 0.0043, + "step": 36434 + }, + { + "epoch": 0.72872, + "grad_norm": 10.466442108154297, + "learning_rate": 4.158353253945704e-06, + "loss": 0.1757, + "step": 36436 + }, + { + "epoch": 0.72876, + "grad_norm": 0.020218592137098312, + "learning_rate": 4.157220054422989e-06, + "loss": 0.0033, + "step": 36438 + }, + { + "epoch": 0.7288, + "grad_norm": 0.6997669339179993, + "learning_rate": 4.15608696880828e-06, + "loss": 0.008, + "step": 36440 + }, + { + "epoch": 0.72884, + "grad_norm": 0.3335988223552704, + "learning_rate": 4.154953997123663e-06, + "loss": 0.0108, + "step": 36442 + }, + { + "epoch": 0.72888, + "grad_norm": 14.771251678466797, + "learning_rate": 4.1538211393912255e-06, + "loss": 0.6089, + "step": 36444 + }, + { + "epoch": 0.72892, + "grad_norm": 0.05036545917391777, + "learning_rate": 4.152688395633054e-06, + "loss": 0.0009, + "step": 36446 + }, + { + "epoch": 0.72896, + "grad_norm": 0.20177066326141357, + "learning_rate": 4.151555765871235e-06, + "loss": 0.006, + "step": 36448 + }, + { + "epoch": 0.729, + "grad_norm": 0.055642373859882355, + "learning_rate": 4.150423250127846e-06, + "loss": 0.001, + "step": 36450 + }, + { + "epoch": 0.72904, + "grad_norm": 0.1748485416173935, + "learning_rate": 4.149290848424969e-06, + "loss": 0.0022, + "step": 36452 + }, + { + "epoch": 0.72908, + "grad_norm": 0.03999490290880203, + "learning_rate": 4.148158560784678e-06, + "loss": 0.0028, + "step": 36454 + }, + { + "epoch": 0.72912, + "grad_norm": 0.04596409946680069, + "learning_rate": 4.147026387229047e-06, + "loss": 0.0041, + "step": 36456 + }, + { + "epoch": 0.72916, + "grad_norm": 0.44225507974624634, + "learning_rate": 4.1458943277801545e-06, + "loss": 0.0142, + "step": 36458 + }, + { + "epoch": 0.7292, + "grad_norm": 9.275355339050293, + "learning_rate": 4.144762382460059e-06, + "loss": 0.1173, + "step": 36460 + }, + { + "epoch": 0.72924, + "grad_norm": 0.22289355099201202, + "learning_rate": 4.1436305512908415e-06, + "loss": 0.0038, + "step": 36462 + }, + { + "epoch": 0.72928, + "grad_norm": 5.402467250823975, + "learning_rate": 4.142498834294555e-06, + "loss": 0.0936, + "step": 36464 + }, + { + "epoch": 0.72932, + "grad_norm": 12.018261909484863, + "learning_rate": 4.1413672314932765e-06, + "loss": 0.2388, + "step": 36466 + }, + { + "epoch": 0.72936, + "grad_norm": 0.11795645207166672, + "learning_rate": 4.140235742909057e-06, + "loss": 0.0433, + "step": 36468 + }, + { + "epoch": 0.7294, + "grad_norm": 0.5238744020462036, + "learning_rate": 4.1391043685639576e-06, + "loss": 0.0094, + "step": 36470 + }, + { + "epoch": 0.72944, + "grad_norm": 3.6503937244415283, + "learning_rate": 4.137973108480039e-06, + "loss": 0.0595, + "step": 36472 + }, + { + "epoch": 0.72948, + "grad_norm": 0.034979529678821564, + "learning_rate": 4.1368419626793495e-06, + "loss": 0.0014, + "step": 36474 + }, + { + "epoch": 0.72952, + "grad_norm": 0.04218033701181412, + "learning_rate": 4.135710931183945e-06, + "loss": 0.001, + "step": 36476 + }, + { + "epoch": 0.72956, + "grad_norm": 0.12001907825469971, + "learning_rate": 4.134580014015875e-06, + "loss": 0.0021, + "step": 36478 + }, + { + "epoch": 0.7296, + "grad_norm": 0.6132646799087524, + "learning_rate": 4.133449211197188e-06, + "loss": 0.0077, + "step": 36480 + }, + { + "epoch": 0.72964, + "grad_norm": 0.2748970687389374, + "learning_rate": 4.132318522749928e-06, + "loss": 0.0037, + "step": 36482 + }, + { + "epoch": 0.72968, + "grad_norm": 0.42647042870521545, + "learning_rate": 4.131187948696143e-06, + "loss": 0.005, + "step": 36484 + }, + { + "epoch": 0.72972, + "grad_norm": 0.16856083273887634, + "learning_rate": 4.130057489057866e-06, + "loss": 0.003, + "step": 36486 + }, + { + "epoch": 0.72976, + "grad_norm": 0.007131832651793957, + "learning_rate": 4.128927143857141e-06, + "loss": 0.0149, + "step": 36488 + }, + { + "epoch": 0.7298, + "grad_norm": 0.25579383969306946, + "learning_rate": 4.127796913116004e-06, + "loss": 0.0078, + "step": 36490 + }, + { + "epoch": 0.72984, + "grad_norm": 0.6888216137886047, + "learning_rate": 4.12666679685649e-06, + "loss": 0.0093, + "step": 36492 + }, + { + "epoch": 0.72988, + "grad_norm": 0.4995808005332947, + "learning_rate": 4.125536795100633e-06, + "loss": 0.0085, + "step": 36494 + }, + { + "epoch": 0.72992, + "grad_norm": 0.06901977211236954, + "learning_rate": 4.124406907870454e-06, + "loss": 0.0019, + "step": 36496 + }, + { + "epoch": 0.72996, + "grad_norm": 6.3128485679626465, + "learning_rate": 4.123277135187995e-06, + "loss": 0.0908, + "step": 36498 + }, + { + "epoch": 0.73, + "grad_norm": 0.2795031666755676, + "learning_rate": 4.12214747707527e-06, + "loss": 0.0028, + "step": 36500 + }, + { + "epoch": 0.73004, + "grad_norm": 5.239640712738037, + "learning_rate": 4.121017933554306e-06, + "loss": 0.0655, + "step": 36502 + }, + { + "epoch": 0.73008, + "grad_norm": 0.22023038566112518, + "learning_rate": 4.119888504647124e-06, + "loss": 0.0357, + "step": 36504 + }, + { + "epoch": 0.73012, + "grad_norm": 0.11073442548513412, + "learning_rate": 4.118759190375742e-06, + "loss": 0.0024, + "step": 36506 + }, + { + "epoch": 0.73016, + "grad_norm": 0.3187287747859955, + "learning_rate": 4.117629990762182e-06, + "loss": 0.0043, + "step": 36508 + }, + { + "epoch": 0.7302, + "grad_norm": 0.0716082751750946, + "learning_rate": 4.1165009058284496e-06, + "loss": 0.0088, + "step": 36510 + }, + { + "epoch": 0.73024, + "grad_norm": 1.1204296350479126, + "learning_rate": 4.115371935596562e-06, + "loss": 0.5597, + "step": 36512 + }, + { + "epoch": 0.73028, + "grad_norm": 0.0894218385219574, + "learning_rate": 4.114243080088527e-06, + "loss": 0.0013, + "step": 36514 + }, + { + "epoch": 0.73032, + "grad_norm": 1.4001100063323975, + "learning_rate": 4.113114339326358e-06, + "loss": 0.0292, + "step": 36516 + }, + { + "epoch": 0.73036, + "grad_norm": 0.11834389716386795, + "learning_rate": 4.111985713332047e-06, + "loss": 0.0033, + "step": 36518 + }, + { + "epoch": 0.7304, + "grad_norm": 1.489597201347351, + "learning_rate": 4.110857202127615e-06, + "loss": 0.0179, + "step": 36520 + }, + { + "epoch": 0.73044, + "grad_norm": 0.007349444553256035, + "learning_rate": 4.109728805735049e-06, + "loss": 0.0117, + "step": 36522 + }, + { + "epoch": 0.73048, + "grad_norm": 1.3002409934997559, + "learning_rate": 4.108600524176353e-06, + "loss": 0.0285, + "step": 36524 + }, + { + "epoch": 0.73052, + "grad_norm": 0.1865215003490448, + "learning_rate": 4.107472357473525e-06, + "loss": 0.0982, + "step": 36526 + }, + { + "epoch": 0.73056, + "grad_norm": 5.497201442718506, + "learning_rate": 4.106344305648552e-06, + "loss": 0.0673, + "step": 36528 + }, + { + "epoch": 0.7306, + "grad_norm": 0.5089060664176941, + "learning_rate": 4.105216368723437e-06, + "loss": 0.0061, + "step": 36530 + }, + { + "epoch": 0.73064, + "grad_norm": 0.49338918924331665, + "learning_rate": 4.104088546720161e-06, + "loss": 0.0254, + "step": 36532 + }, + { + "epoch": 0.73068, + "grad_norm": 0.38894665241241455, + "learning_rate": 4.102960839660713e-06, + "loss": 0.0056, + "step": 36534 + }, + { + "epoch": 0.73072, + "grad_norm": 0.29430443048477173, + "learning_rate": 4.1018332475670795e-06, + "loss": 0.0096, + "step": 36536 + }, + { + "epoch": 0.73076, + "grad_norm": 0.37714749574661255, + "learning_rate": 4.100705770461245e-06, + "loss": 0.0212, + "step": 36538 + }, + { + "epoch": 0.7308, + "grad_norm": 0.007208532188087702, + "learning_rate": 4.099578408365192e-06, + "loss": 0.2837, + "step": 36540 + }, + { + "epoch": 0.73084, + "grad_norm": 0.03943221643567085, + "learning_rate": 4.098451161300891e-06, + "loss": 0.0013, + "step": 36542 + }, + { + "epoch": 0.73088, + "grad_norm": 0.05705014243721962, + "learning_rate": 4.097324029290323e-06, + "loss": 0.0009, + "step": 36544 + }, + { + "epoch": 0.73092, + "grad_norm": 1.0421491861343384, + "learning_rate": 4.096197012355462e-06, + "loss": 0.0185, + "step": 36546 + }, + { + "epoch": 0.73096, + "grad_norm": 8.006577491760254, + "learning_rate": 4.0950701105182835e-06, + "loss": 0.1062, + "step": 36548 + }, + { + "epoch": 0.731, + "grad_norm": 0.11827359348535538, + "learning_rate": 4.093943323800746e-06, + "loss": 0.0019, + "step": 36550 + }, + { + "epoch": 0.73104, + "grad_norm": 0.04483804106712341, + "learning_rate": 4.092816652224831e-06, + "loss": 0.0007, + "step": 36552 + }, + { + "epoch": 0.73108, + "grad_norm": 0.01960039883852005, + "learning_rate": 4.091690095812493e-06, + "loss": 0.0066, + "step": 36554 + }, + { + "epoch": 0.73112, + "grad_norm": 0.29531386494636536, + "learning_rate": 4.090563654585699e-06, + "loss": 0.0051, + "step": 36556 + }, + { + "epoch": 0.73116, + "grad_norm": 0.02468755468726158, + "learning_rate": 4.089437328566413e-06, + "loss": 0.0033, + "step": 36558 + }, + { + "epoch": 0.7312, + "grad_norm": 0.1501748412847519, + "learning_rate": 4.08831111777658e-06, + "loss": 0.0136, + "step": 36560 + }, + { + "epoch": 0.73124, + "grad_norm": 0.24982596933841705, + "learning_rate": 4.0871850222381735e-06, + "loss": 0.0051, + "step": 36562 + }, + { + "epoch": 0.73128, + "grad_norm": 4.022255897521973, + "learning_rate": 4.086059041973136e-06, + "loss": 0.0573, + "step": 36564 + }, + { + "epoch": 0.73132, + "grad_norm": 0.04989243298768997, + "learning_rate": 4.084933177003423e-06, + "loss": 0.0008, + "step": 36566 + }, + { + "epoch": 0.73136, + "grad_norm": 1.3820081949234009, + "learning_rate": 4.0838074273509805e-06, + "loss": 0.0182, + "step": 36568 + }, + { + "epoch": 0.7314, + "grad_norm": 2.1947884559631348, + "learning_rate": 4.08268179303776e-06, + "loss": 0.0282, + "step": 36570 + }, + { + "epoch": 0.73144, + "grad_norm": 0.13435980677604675, + "learning_rate": 4.081556274085704e-06, + "loss": 0.0033, + "step": 36572 + }, + { + "epoch": 0.73148, + "grad_norm": 0.056391723453998566, + "learning_rate": 4.08043087051676e-06, + "loss": 0.0039, + "step": 36574 + }, + { + "epoch": 0.73152, + "grad_norm": 0.020113935694098473, + "learning_rate": 4.079305582352858e-06, + "loss": 0.0013, + "step": 36576 + }, + { + "epoch": 0.73156, + "grad_norm": 0.45230820775032043, + "learning_rate": 4.0781804096159435e-06, + "loss": 0.0089, + "step": 36578 + }, + { + "epoch": 0.7316, + "grad_norm": 0.021878225728869438, + "learning_rate": 4.0770553523279535e-06, + "loss": 0.0014, + "step": 36580 + }, + { + "epoch": 0.73164, + "grad_norm": 0.047866713255643845, + "learning_rate": 4.075930410510812e-06, + "loss": 0.0014, + "step": 36582 + }, + { + "epoch": 0.73168, + "grad_norm": 0.0862593874335289, + "learning_rate": 4.074805584186464e-06, + "loss": 0.0025, + "step": 36584 + }, + { + "epoch": 0.73172, + "grad_norm": 0.019508883357048035, + "learning_rate": 4.073680873376824e-06, + "loss": 0.0023, + "step": 36586 + }, + { + "epoch": 0.73176, + "grad_norm": 0.0028324536979198456, + "learning_rate": 4.072556278103834e-06, + "loss": 0.001, + "step": 36588 + }, + { + "epoch": 0.7318, + "grad_norm": 0.13257217407226562, + "learning_rate": 4.071431798389408e-06, + "loss": 0.164, + "step": 36590 + }, + { + "epoch": 0.73184, + "grad_norm": 0.016421010717749596, + "learning_rate": 4.0703074342554705e-06, + "loss": 0.0004, + "step": 36592 + }, + { + "epoch": 0.73188, + "grad_norm": 0.06982171535491943, + "learning_rate": 4.0691831857239474e-06, + "loss": 0.0089, + "step": 36594 + }, + { + "epoch": 0.73192, + "grad_norm": 0.016464507207274437, + "learning_rate": 4.068059052816743e-06, + "loss": 0.0025, + "step": 36596 + }, + { + "epoch": 0.73196, + "grad_norm": 0.03703216090798378, + "learning_rate": 4.0669350355557876e-06, + "loss": 0.0044, + "step": 36598 + }, + { + "epoch": 0.732, + "grad_norm": 0.05035844445228577, + "learning_rate": 4.065811133962987e-06, + "loss": 0.0246, + "step": 36600 + }, + { + "epoch": 0.73204, + "grad_norm": 0.2636191248893738, + "learning_rate": 4.0646873480602525e-06, + "loss": 0.0112, + "step": 36602 + }, + { + "epoch": 0.73208, + "grad_norm": 0.09716726094484329, + "learning_rate": 4.0635636778694935e-06, + "loss": 0.0388, + "step": 36604 + }, + { + "epoch": 0.73212, + "grad_norm": 0.1922415941953659, + "learning_rate": 4.06244012341262e-06, + "loss": 0.0025, + "step": 36606 + }, + { + "epoch": 0.73216, + "grad_norm": 0.17619144916534424, + "learning_rate": 4.06131668471153e-06, + "loss": 0.0186, + "step": 36608 + }, + { + "epoch": 0.7322, + "grad_norm": 1.0322282314300537, + "learning_rate": 4.06019336178813e-06, + "loss": 0.0133, + "step": 36610 + }, + { + "epoch": 0.73224, + "grad_norm": 0.0065015689469873905, + "learning_rate": 4.059070154664317e-06, + "loss": 0.0092, + "step": 36612 + }, + { + "epoch": 0.73228, + "grad_norm": 0.3692048192024231, + "learning_rate": 4.0579470633619896e-06, + "loss": 0.0182, + "step": 36614 + }, + { + "epoch": 0.73232, + "grad_norm": 0.12055213749408722, + "learning_rate": 4.056824087903048e-06, + "loss": 0.0015, + "step": 36616 + }, + { + "epoch": 0.73236, + "grad_norm": 0.07934419810771942, + "learning_rate": 4.055701228309372e-06, + "loss": 0.0008, + "step": 36618 + }, + { + "epoch": 0.7324, + "grad_norm": 0.2746652662754059, + "learning_rate": 4.05457848460287e-06, + "loss": 0.0045, + "step": 36620 + }, + { + "epoch": 0.73244, + "grad_norm": 0.15530522167682648, + "learning_rate": 4.053455856805416e-06, + "loss": 0.0022, + "step": 36622 + }, + { + "epoch": 0.73248, + "grad_norm": 0.7238077521324158, + "learning_rate": 4.052333344938902e-06, + "loss": 0.0105, + "step": 36624 + }, + { + "epoch": 0.73252, + "grad_norm": 0.007046421058475971, + "learning_rate": 4.051210949025216e-06, + "loss": 0.0057, + "step": 36626 + }, + { + "epoch": 0.73256, + "grad_norm": 0.28595349192619324, + "learning_rate": 4.050088669086227e-06, + "loss": 0.0046, + "step": 36628 + }, + { + "epoch": 0.7326, + "grad_norm": 0.9284149408340454, + "learning_rate": 4.048966505143831e-06, + "loss": 0.0173, + "step": 36630 + }, + { + "epoch": 0.73264, + "grad_norm": 0.04088468477129936, + "learning_rate": 4.0478444572198925e-06, + "loss": 0.1505, + "step": 36632 + }, + { + "epoch": 0.73268, + "grad_norm": 0.7909045219421387, + "learning_rate": 4.046722525336291e-06, + "loss": 0.0122, + "step": 36634 + }, + { + "epoch": 0.73272, + "grad_norm": 0.31055933237075806, + "learning_rate": 4.0456007095148976e-06, + "loss": 0.0077, + "step": 36636 + }, + { + "epoch": 0.73276, + "grad_norm": 0.22581855952739716, + "learning_rate": 4.0444790097775885e-06, + "loss": 0.0031, + "step": 36638 + }, + { + "epoch": 0.7328, + "grad_norm": 0.006373594515025616, + "learning_rate": 4.04335742614622e-06, + "loss": 0.0874, + "step": 36640 + }, + { + "epoch": 0.73284, + "grad_norm": 0.08096469193696976, + "learning_rate": 4.042235958642672e-06, + "loss": 0.0724, + "step": 36642 + }, + { + "epoch": 0.73288, + "grad_norm": 0.9870432615280151, + "learning_rate": 4.041114607288799e-06, + "loss": 0.0104, + "step": 36644 + }, + { + "epoch": 0.73292, + "grad_norm": 0.017834220081567764, + "learning_rate": 4.039993372106462e-06, + "loss": 0.1158, + "step": 36646 + }, + { + "epoch": 0.73296, + "grad_norm": 0.030705435201525688, + "learning_rate": 4.038872253117529e-06, + "loss": 0.0215, + "step": 36648 + }, + { + "epoch": 0.733, + "grad_norm": 0.12248971313238144, + "learning_rate": 4.037751250343841e-06, + "loss": 0.0015, + "step": 36650 + }, + { + "epoch": 0.73304, + "grad_norm": 0.08712133020162582, + "learning_rate": 4.03663036380727e-06, + "loss": 0.0024, + "step": 36652 + }, + { + "epoch": 0.73308, + "grad_norm": 0.031768474727869034, + "learning_rate": 4.035509593529657e-06, + "loss": 0.0108, + "step": 36654 + }, + { + "epoch": 0.73312, + "grad_norm": 0.03850848227739334, + "learning_rate": 4.034388939532855e-06, + "loss": 0.0022, + "step": 36656 + }, + { + "epoch": 0.73316, + "grad_norm": 0.859061598777771, + "learning_rate": 4.033268401838712e-06, + "loss": 0.0102, + "step": 36658 + }, + { + "epoch": 0.7332, + "grad_norm": 0.18880419433116913, + "learning_rate": 4.032147980469072e-06, + "loss": 0.0023, + "step": 36660 + }, + { + "epoch": 0.73324, + "grad_norm": 0.12928073108196259, + "learning_rate": 4.031027675445785e-06, + "loss": 0.0098, + "step": 36662 + }, + { + "epoch": 0.73328, + "grad_norm": 0.041382402181625366, + "learning_rate": 4.029907486790682e-06, + "loss": 0.0009, + "step": 36664 + }, + { + "epoch": 0.73332, + "grad_norm": 0.05399594083428383, + "learning_rate": 4.0287874145256066e-06, + "loss": 0.0006, + "step": 36666 + }, + { + "epoch": 0.73336, + "grad_norm": 0.07707379758358002, + "learning_rate": 4.027667458672394e-06, + "loss": 0.0015, + "step": 36668 + }, + { + "epoch": 0.7334, + "grad_norm": 0.06053093820810318, + "learning_rate": 4.026547619252883e-06, + "loss": 0.0021, + "step": 36670 + }, + { + "epoch": 0.73344, + "grad_norm": 5.8013787269592285, + "learning_rate": 4.025427896288895e-06, + "loss": 0.0806, + "step": 36672 + }, + { + "epoch": 0.73348, + "grad_norm": 0.04886800795793533, + "learning_rate": 4.024308289802272e-06, + "loss": 0.0033, + "step": 36674 + }, + { + "epoch": 0.73352, + "grad_norm": 0.011124156415462494, + "learning_rate": 4.023188799814833e-06, + "loss": 0.0005, + "step": 36676 + }, + { + "epoch": 0.73356, + "grad_norm": 0.17093032598495483, + "learning_rate": 4.022069426348405e-06, + "loss": 0.002, + "step": 36678 + }, + { + "epoch": 0.7336, + "grad_norm": 0.0401415154337883, + "learning_rate": 4.020950169424815e-06, + "loss": 0.0004, + "step": 36680 + }, + { + "epoch": 0.73364, + "grad_norm": 0.007856769487261772, + "learning_rate": 4.019831029065873e-06, + "loss": 0.0001, + "step": 36682 + }, + { + "epoch": 0.73368, + "grad_norm": 0.08782428503036499, + "learning_rate": 4.018712005293409e-06, + "loss": 0.0012, + "step": 36684 + }, + { + "epoch": 0.73372, + "grad_norm": 0.05822417512536049, + "learning_rate": 4.017593098129229e-06, + "loss": 0.003, + "step": 36686 + }, + { + "epoch": 0.73376, + "grad_norm": 0.10701453685760498, + "learning_rate": 4.016474307595157e-06, + "loss": 0.0013, + "step": 36688 + }, + { + "epoch": 0.7338, + "grad_norm": 0.12227456271648407, + "learning_rate": 4.015355633712996e-06, + "loss": 0.0021, + "step": 36690 + }, + { + "epoch": 0.73384, + "grad_norm": 0.01358648668974638, + "learning_rate": 4.014237076504558e-06, + "loss": 0.4108, + "step": 36692 + }, + { + "epoch": 0.73388, + "grad_norm": 1.1387336254119873, + "learning_rate": 4.013118635991653e-06, + "loss": 0.0097, + "step": 36694 + }, + { + "epoch": 0.73392, + "grad_norm": 0.011114876717329025, + "learning_rate": 4.012000312196079e-06, + "loss": 0.0014, + "step": 36696 + }, + { + "epoch": 0.73396, + "grad_norm": 0.02584102377295494, + "learning_rate": 4.010882105139643e-06, + "loss": 0.0075, + "step": 36698 + }, + { + "epoch": 0.734, + "grad_norm": 0.06320419907569885, + "learning_rate": 4.009764014844143e-06, + "loss": 0.0173, + "step": 36700 + }, + { + "epoch": 0.73404, + "grad_norm": 0.1629435271024704, + "learning_rate": 4.008646041331377e-06, + "loss": 0.0184, + "step": 36702 + }, + { + "epoch": 0.73408, + "grad_norm": 0.11662520468235016, + "learning_rate": 4.007528184623141e-06, + "loss": 0.002, + "step": 36704 + }, + { + "epoch": 0.73412, + "grad_norm": 0.006103876046836376, + "learning_rate": 4.006410444741232e-06, + "loss": 0.0002, + "step": 36706 + }, + { + "epoch": 0.73416, + "grad_norm": 0.8420383334159851, + "learning_rate": 4.005292821707431e-06, + "loss": 0.0093, + "step": 36708 + }, + { + "epoch": 0.7342, + "grad_norm": 0.06103360652923584, + "learning_rate": 4.004175315543538e-06, + "loss": 0.0055, + "step": 36710 + }, + { + "epoch": 0.73424, + "grad_norm": 2.4666919708251953, + "learning_rate": 4.003057926271331e-06, + "loss": 0.0312, + "step": 36712 + }, + { + "epoch": 0.73428, + "grad_norm": 0.0473816841840744, + "learning_rate": 4.001940653912598e-06, + "loss": 0.0014, + "step": 36714 + }, + { + "epoch": 0.73432, + "grad_norm": 0.00726566044613719, + "learning_rate": 4.000823498489123e-06, + "loss": 0.0001, + "step": 36716 + }, + { + "epoch": 0.73436, + "grad_norm": 0.02724340744316578, + "learning_rate": 3.999706460022675e-06, + "loss": 0.0017, + "step": 36718 + }, + { + "epoch": 0.7344, + "grad_norm": 0.12861202657222748, + "learning_rate": 3.998589538535046e-06, + "loss": 0.0037, + "step": 36720 + }, + { + "epoch": 0.73444, + "grad_norm": 0.2024446576833725, + "learning_rate": 3.997472734048001e-06, + "loss": 0.0043, + "step": 36722 + }, + { + "epoch": 0.73448, + "grad_norm": 0.2279440313577652, + "learning_rate": 3.996356046583314e-06, + "loss": 0.0026, + "step": 36724 + }, + { + "epoch": 0.73452, + "grad_norm": 1.4563530683517456, + "learning_rate": 3.995239476162758e-06, + "loss": 0.0133, + "step": 36726 + }, + { + "epoch": 0.73456, + "grad_norm": 0.3103003203868866, + "learning_rate": 3.994123022808103e-06, + "loss": 0.0041, + "step": 36728 + }, + { + "epoch": 0.7346, + "grad_norm": 0.024268843233585358, + "learning_rate": 3.993006686541108e-06, + "loss": 0.0052, + "step": 36730 + }, + { + "epoch": 0.73464, + "grad_norm": 1.3556392192840576, + "learning_rate": 3.99189046738354e-06, + "loss": 0.019, + "step": 36732 + }, + { + "epoch": 0.73468, + "grad_norm": 12.395588874816895, + "learning_rate": 3.990774365357161e-06, + "loss": 0.3531, + "step": 36734 + }, + { + "epoch": 0.73472, + "grad_norm": 0.010941626504063606, + "learning_rate": 3.98965838048373e-06, + "loss": 0.0002, + "step": 36736 + }, + { + "epoch": 0.73476, + "grad_norm": 0.0008844839176163077, + "learning_rate": 3.988542512785006e-06, + "loss": 0.1156, + "step": 36738 + }, + { + "epoch": 0.7348, + "grad_norm": 0.3153142035007477, + "learning_rate": 3.987426762282733e-06, + "loss": 0.0035, + "step": 36740 + }, + { + "epoch": 0.73484, + "grad_norm": 0.003669999772682786, + "learning_rate": 3.986311128998679e-06, + "loss": 0.0003, + "step": 36742 + }, + { + "epoch": 0.73488, + "grad_norm": 0.027709627524018288, + "learning_rate": 3.985195612954581e-06, + "loss": 0.0003, + "step": 36744 + }, + { + "epoch": 0.73492, + "grad_norm": 0.041419290006160736, + "learning_rate": 3.984080214172191e-06, + "loss": 0.0008, + "step": 36746 + }, + { + "epoch": 0.73496, + "grad_norm": 0.20826968550682068, + "learning_rate": 3.982964932673259e-06, + "loss": 0.003, + "step": 36748 + }, + { + "epoch": 0.735, + "grad_norm": 0.002127312822267413, + "learning_rate": 3.981849768479516e-06, + "loss": 0.0002, + "step": 36750 + }, + { + "epoch": 0.73504, + "grad_norm": 0.05681801959872246, + "learning_rate": 3.980734721612718e-06, + "loss": 0.0078, + "step": 36752 + }, + { + "epoch": 0.73508, + "grad_norm": 0.1325502246618271, + "learning_rate": 3.97961979209459e-06, + "loss": 0.0013, + "step": 36754 + }, + { + "epoch": 0.73512, + "grad_norm": 1.2574840784072876, + "learning_rate": 3.978504979946876e-06, + "loss": 0.0196, + "step": 36756 + }, + { + "epoch": 0.73516, + "grad_norm": 1.2581146955490112, + "learning_rate": 3.977390285191306e-06, + "loss": 0.5798, + "step": 36758 + }, + { + "epoch": 0.7352, + "grad_norm": 0.23633025586605072, + "learning_rate": 3.976275707849616e-06, + "loss": 0.0053, + "step": 36760 + }, + { + "epoch": 0.73524, + "grad_norm": 0.07852756977081299, + "learning_rate": 3.97516124794353e-06, + "loss": 0.0221, + "step": 36762 + }, + { + "epoch": 0.73528, + "grad_norm": 0.11917464435100555, + "learning_rate": 3.974046905494777e-06, + "loss": 0.0013, + "step": 36764 + }, + { + "epoch": 0.73532, + "grad_norm": 0.005790765397250652, + "learning_rate": 3.972932680525082e-06, + "loss": 0.0022, + "step": 36766 + }, + { + "epoch": 0.73536, + "grad_norm": 0.039293091744184494, + "learning_rate": 3.971818573056168e-06, + "loss": 0.0017, + "step": 36768 + }, + { + "epoch": 0.7354, + "grad_norm": 1.1223692893981934, + "learning_rate": 3.970704583109755e-06, + "loss": 0.0151, + "step": 36770 + }, + { + "epoch": 0.73544, + "grad_norm": 0.04355599731206894, + "learning_rate": 3.969590710707556e-06, + "loss": 0.0017, + "step": 36772 + }, + { + "epoch": 0.73548, + "grad_norm": 2.935800313949585, + "learning_rate": 3.968476955871298e-06, + "loss": 0.0538, + "step": 36774 + }, + { + "epoch": 0.73552, + "grad_norm": 0.0009012670489028096, + "learning_rate": 3.967363318622682e-06, + "loss": 0.0035, + "step": 36776 + }, + { + "epoch": 0.73556, + "grad_norm": 0.2653495967388153, + "learning_rate": 3.9662497989834246e-06, + "loss": 0.0029, + "step": 36778 + }, + { + "epoch": 0.7356, + "grad_norm": 0.332628071308136, + "learning_rate": 3.965136396975235e-06, + "loss": 0.0034, + "step": 36780 + }, + { + "epoch": 0.73564, + "grad_norm": 0.15634587407112122, + "learning_rate": 3.964023112619816e-06, + "loss": 0.0029, + "step": 36782 + }, + { + "epoch": 0.73568, + "grad_norm": 0.26463425159454346, + "learning_rate": 3.962909945938879e-06, + "loss": 0.004, + "step": 36784 + }, + { + "epoch": 0.73572, + "grad_norm": 0.34495308995246887, + "learning_rate": 3.961796896954115e-06, + "loss": 0.0043, + "step": 36786 + }, + { + "epoch": 0.73576, + "grad_norm": 0.05172456428408623, + "learning_rate": 3.960683965687232e-06, + "loss": 0.0012, + "step": 36788 + }, + { + "epoch": 0.7358, + "grad_norm": 0.2651236653327942, + "learning_rate": 3.959571152159922e-06, + "loss": 0.0037, + "step": 36790 + }, + { + "epoch": 0.73584, + "grad_norm": 0.019558671861886978, + "learning_rate": 3.958458456393884e-06, + "loss": 0.0007, + "step": 36792 + }, + { + "epoch": 0.73588, + "grad_norm": 0.0006411226349882782, + "learning_rate": 3.957345878410808e-06, + "loss": 0.0092, + "step": 36794 + }, + { + "epoch": 0.73592, + "grad_norm": 0.07148347795009613, + "learning_rate": 3.956233418232389e-06, + "loss": 0.0013, + "step": 36796 + }, + { + "epoch": 0.73596, + "grad_norm": 0.022970881313085556, + "learning_rate": 3.955121075880307e-06, + "loss": 0.003, + "step": 36798 + }, + { + "epoch": 0.736, + "grad_norm": 0.04396575316786766, + "learning_rate": 3.954008851376252e-06, + "loss": 0.0013, + "step": 36800 + }, + { + "epoch": 0.73604, + "grad_norm": 0.07922078669071198, + "learning_rate": 3.952896744741911e-06, + "loss": 0.0014, + "step": 36802 + }, + { + "epoch": 0.73608, + "grad_norm": 0.023551687598228455, + "learning_rate": 3.951784755998954e-06, + "loss": 0.0003, + "step": 36804 + }, + { + "epoch": 0.73612, + "grad_norm": 3.235146999359131, + "learning_rate": 3.950672885169074e-06, + "loss": 0.034, + "step": 36806 + }, + { + "epoch": 0.73616, + "grad_norm": 0.379334419965744, + "learning_rate": 3.9495611322739325e-06, + "loss": 0.0038, + "step": 36808 + }, + { + "epoch": 0.7362, + "grad_norm": 0.2731438875198364, + "learning_rate": 3.94844949733522e-06, + "loss": 0.0025, + "step": 36810 + }, + { + "epoch": 0.73624, + "grad_norm": 0.11518782377243042, + "learning_rate": 3.947337980374596e-06, + "loss": 0.002, + "step": 36812 + }, + { + "epoch": 0.73628, + "grad_norm": 1.2635796070098877, + "learning_rate": 3.946226581413734e-06, + "loss": 0.0122, + "step": 36814 + }, + { + "epoch": 0.73632, + "grad_norm": 3.145796060562134, + "learning_rate": 3.945115300474306e-06, + "loss": 0.0387, + "step": 36816 + }, + { + "epoch": 0.73636, + "grad_norm": 0.033823270350694656, + "learning_rate": 3.944004137577968e-06, + "loss": 0.0003, + "step": 36818 + }, + { + "epoch": 0.7364, + "grad_norm": 0.023426493629813194, + "learning_rate": 3.942893092746387e-06, + "loss": 0.0007, + "step": 36820 + }, + { + "epoch": 0.73644, + "grad_norm": 0.09796269237995148, + "learning_rate": 3.941782166001225e-06, + "loss": 0.0022, + "step": 36822 + }, + { + "epoch": 0.73648, + "grad_norm": 0.055425483733415604, + "learning_rate": 3.940671357364137e-06, + "loss": 0.0013, + "step": 36824 + }, + { + "epoch": 0.73652, + "grad_norm": 22.566268920898438, + "learning_rate": 3.939560666856781e-06, + "loss": 0.3072, + "step": 36826 + }, + { + "epoch": 0.73656, + "grad_norm": 0.00530384574085474, + "learning_rate": 3.938450094500814e-06, + "loss": 0.0006, + "step": 36828 + }, + { + "epoch": 0.7366, + "grad_norm": 0.007812377996742725, + "learning_rate": 3.937339640317879e-06, + "loss": 0.0011, + "step": 36830 + }, + { + "epoch": 0.73664, + "grad_norm": 0.04200005903840065, + "learning_rate": 3.93622930432963e-06, + "loss": 0.0034, + "step": 36832 + }, + { + "epoch": 0.73668, + "grad_norm": 4.314445972442627, + "learning_rate": 3.935119086557712e-06, + "loss": 0.0632, + "step": 36834 + }, + { + "epoch": 0.73672, + "grad_norm": 0.4472809433937073, + "learning_rate": 3.934008987023768e-06, + "loss": 0.008, + "step": 36836 + }, + { + "epoch": 0.73676, + "grad_norm": 0.2224574238061905, + "learning_rate": 3.932899005749448e-06, + "loss": 0.0024, + "step": 36838 + }, + { + "epoch": 0.7368, + "grad_norm": 14.674774169921875, + "learning_rate": 3.931789142756377e-06, + "loss": 0.2506, + "step": 36840 + }, + { + "epoch": 0.73684, + "grad_norm": 2.5518674850463867, + "learning_rate": 3.930679398066209e-06, + "loss": 0.0456, + "step": 36842 + }, + { + "epoch": 0.73688, + "grad_norm": 0.5926717519760132, + "learning_rate": 3.929569771700565e-06, + "loss": 0.0137, + "step": 36844 + }, + { + "epoch": 0.73692, + "grad_norm": 0.017315898090600967, + "learning_rate": 3.928460263681086e-06, + "loss": 0.0008, + "step": 36846 + }, + { + "epoch": 0.73696, + "grad_norm": 0.11780912429094315, + "learning_rate": 3.927350874029397e-06, + "loss": 0.0012, + "step": 36848 + }, + { + "epoch": 0.737, + "grad_norm": 0.1441420614719391, + "learning_rate": 3.9262416027671354e-06, + "loss": 0.0042, + "step": 36850 + }, + { + "epoch": 0.73704, + "grad_norm": 0.0360276959836483, + "learning_rate": 3.925132449915917e-06, + "loss": 0.0009, + "step": 36852 + }, + { + "epoch": 0.73708, + "grad_norm": 0.010684586130082607, + "learning_rate": 3.9240234154973675e-06, + "loss": 0.0301, + "step": 36854 + }, + { + "epoch": 0.73712, + "grad_norm": 0.015758737921714783, + "learning_rate": 3.922914499533111e-06, + "loss": 0.0012, + "step": 36856 + }, + { + "epoch": 0.73716, + "grad_norm": 0.08075561374425888, + "learning_rate": 3.921805702044764e-06, + "loss": 0.0029, + "step": 36858 + }, + { + "epoch": 0.7372, + "grad_norm": 0.01293183397501707, + "learning_rate": 3.920697023053949e-06, + "loss": 0.0009, + "step": 36860 + }, + { + "epoch": 0.73724, + "grad_norm": 0.03594984486699104, + "learning_rate": 3.919588462582268e-06, + "loss": 0.034, + "step": 36862 + }, + { + "epoch": 0.73728, + "grad_norm": 0.01561798993498087, + "learning_rate": 3.918480020651346e-06, + "loss": 0.0016, + "step": 36864 + }, + { + "epoch": 0.73732, + "grad_norm": 0.17551450431346893, + "learning_rate": 3.917371697282785e-06, + "loss": 0.0071, + "step": 36866 + }, + { + "epoch": 0.73736, + "grad_norm": 2.864158868789673, + "learning_rate": 3.916263492498194e-06, + "loss": 0.048, + "step": 36868 + }, + { + "epoch": 0.7374, + "grad_norm": 0.003465390531346202, + "learning_rate": 3.915155406319181e-06, + "loss": 0.0002, + "step": 36870 + }, + { + "epoch": 0.73744, + "grad_norm": 0.7269207835197449, + "learning_rate": 3.91404743876734e-06, + "loss": 0.0127, + "step": 36872 + }, + { + "epoch": 0.73748, + "grad_norm": 0.07237668335437775, + "learning_rate": 3.912939589864283e-06, + "loss": 0.0723, + "step": 36874 + }, + { + "epoch": 0.73752, + "grad_norm": 0.0008467034203931689, + "learning_rate": 3.9118318596316e-06, + "loss": 0.0006, + "step": 36876 + }, + { + "epoch": 0.73756, + "grad_norm": 0.08162243664264679, + "learning_rate": 3.91072424809089e-06, + "loss": 0.0026, + "step": 36878 + }, + { + "epoch": 0.7376, + "grad_norm": 0.03963766619563103, + "learning_rate": 3.9096167552637454e-06, + "loss": 0.0015, + "step": 36880 + }, + { + "epoch": 0.73764, + "grad_norm": 0.05009407922625542, + "learning_rate": 3.90850938117176e-06, + "loss": 0.0011, + "step": 36882 + }, + { + "epoch": 0.73768, + "grad_norm": 0.0721624419093132, + "learning_rate": 3.907402125836518e-06, + "loss": 0.0009, + "step": 36884 + }, + { + "epoch": 0.73772, + "grad_norm": 3.5284712314605713, + "learning_rate": 3.906294989279608e-06, + "loss": 0.0386, + "step": 36886 + }, + { + "epoch": 0.73776, + "grad_norm": 0.1391194760799408, + "learning_rate": 3.905187971522615e-06, + "loss": 0.0393, + "step": 36888 + }, + { + "epoch": 0.7378, + "grad_norm": 0.12201707065105438, + "learning_rate": 3.90408107258712e-06, + "loss": 0.002, + "step": 36890 + }, + { + "epoch": 0.73784, + "grad_norm": 0.042391251772642136, + "learning_rate": 3.9029742924947054e-06, + "loss": 0.0005, + "step": 36892 + }, + { + "epoch": 0.73788, + "grad_norm": 0.014411268755793571, + "learning_rate": 3.901867631266939e-06, + "loss": 0.0032, + "step": 36894 + }, + { + "epoch": 0.73792, + "grad_norm": 0.05813998728990555, + "learning_rate": 3.900761088925411e-06, + "loss": 0.0039, + "step": 36896 + }, + { + "epoch": 0.73796, + "grad_norm": 14.163265228271484, + "learning_rate": 3.899654665491681e-06, + "loss": 0.251, + "step": 36898 + }, + { + "epoch": 0.738, + "grad_norm": 0.015950674191117287, + "learning_rate": 3.898548360987325e-06, + "loss": 0.0007, + "step": 36900 + }, + { + "epoch": 0.73804, + "grad_norm": 14.347043991088867, + "learning_rate": 3.897442175433909e-06, + "loss": 0.1897, + "step": 36902 + }, + { + "epoch": 0.73808, + "grad_norm": 0.010440831072628498, + "learning_rate": 3.896336108852999e-06, + "loss": 0.0003, + "step": 36904 + }, + { + "epoch": 0.73812, + "grad_norm": 0.011844846419990063, + "learning_rate": 3.895230161266163e-06, + "loss": 0.0006, + "step": 36906 + }, + { + "epoch": 0.73816, + "grad_norm": 0.00840231217443943, + "learning_rate": 3.894124332694956e-06, + "loss": 0.0073, + "step": 36908 + }, + { + "epoch": 0.7382, + "grad_norm": 0.0023939800448715687, + "learning_rate": 3.893018623160938e-06, + "loss": 0.0001, + "step": 36910 + }, + { + "epoch": 0.73824, + "grad_norm": 0.09234428405761719, + "learning_rate": 3.8919130326856645e-06, + "loss": 0.0019, + "step": 36912 + }, + { + "epoch": 0.73828, + "grad_norm": 0.17759446799755096, + "learning_rate": 3.890807561290692e-06, + "loss": 0.0028, + "step": 36914 + }, + { + "epoch": 0.73832, + "grad_norm": 0.056679047644138336, + "learning_rate": 3.8897022089975725e-06, + "loss": 0.0007, + "step": 36916 + }, + { + "epoch": 0.73836, + "grad_norm": 0.013613870367407799, + "learning_rate": 3.888596975827856e-06, + "loss": 0.0014, + "step": 36918 + }, + { + "epoch": 0.7384, + "grad_norm": 0.012537484988570213, + "learning_rate": 3.887491861803085e-06, + "loss": 0.0169, + "step": 36920 + }, + { + "epoch": 0.73844, + "grad_norm": 0.6455032825469971, + "learning_rate": 3.886386866944807e-06, + "loss": 0.0065, + "step": 36922 + }, + { + "epoch": 0.73848, + "grad_norm": 2.8605873584747314, + "learning_rate": 3.885281991274568e-06, + "loss": 0.0221, + "step": 36924 + }, + { + "epoch": 0.73852, + "grad_norm": 0.01856083609163761, + "learning_rate": 3.884177234813897e-06, + "loss": 0.0007, + "step": 36926 + }, + { + "epoch": 0.73856, + "grad_norm": 0.021003179252147675, + "learning_rate": 3.8830725975843474e-06, + "loss": 0.0066, + "step": 36928 + }, + { + "epoch": 0.7386, + "grad_norm": 0.06528956443071365, + "learning_rate": 3.88196807960744e-06, + "loss": 0.0064, + "step": 36930 + }, + { + "epoch": 0.73864, + "grad_norm": 0.3114141821861267, + "learning_rate": 3.880863680904721e-06, + "loss": 0.0075, + "step": 36932 + }, + { + "epoch": 0.73868, + "grad_norm": 0.011164918541908264, + "learning_rate": 3.879759401497712e-06, + "loss": 0.0036, + "step": 36934 + }, + { + "epoch": 0.73872, + "grad_norm": 0.07461822032928467, + "learning_rate": 3.878655241407943e-06, + "loss": 0.0012, + "step": 36936 + }, + { + "epoch": 0.73876, + "grad_norm": 0.2144385129213333, + "learning_rate": 3.877551200656946e-06, + "loss": 0.0048, + "step": 36938 + }, + { + "epoch": 0.7388, + "grad_norm": 0.2659952938556671, + "learning_rate": 3.876447279266238e-06, + "loss": 0.0066, + "step": 36940 + }, + { + "epoch": 0.73884, + "grad_norm": 0.10574542731046677, + "learning_rate": 3.8753434772573405e-06, + "loss": 0.0066, + "step": 36942 + }, + { + "epoch": 0.73888, + "grad_norm": 1.2907629013061523, + "learning_rate": 3.874239794651778e-06, + "loss": 0.0159, + "step": 36944 + }, + { + "epoch": 0.73892, + "grad_norm": 0.4333820641040802, + "learning_rate": 3.873136231471062e-06, + "loss": 0.0048, + "step": 36946 + }, + { + "epoch": 0.73896, + "grad_norm": 0.06124432384967804, + "learning_rate": 3.87203278773671e-06, + "loss": 0.0007, + "step": 36948 + }, + { + "epoch": 0.739, + "grad_norm": 0.10596076399087906, + "learning_rate": 3.8709294634702374e-06, + "loss": 0.0177, + "step": 36950 + }, + { + "epoch": 0.73904, + "grad_norm": 0.09338468313217163, + "learning_rate": 3.8698262586931465e-06, + "loss": 0.0013, + "step": 36952 + }, + { + "epoch": 0.73908, + "grad_norm": 0.10474805533885956, + "learning_rate": 3.8687231734269495e-06, + "loss": 0.0011, + "step": 36954 + }, + { + "epoch": 0.73912, + "grad_norm": 0.037933383136987686, + "learning_rate": 3.86762020769315e-06, + "loss": 0.0474, + "step": 36956 + }, + { + "epoch": 0.73916, + "grad_norm": 0.00819697231054306, + "learning_rate": 3.866517361513254e-06, + "loss": 0.0028, + "step": 36958 + }, + { + "epoch": 0.7392, + "grad_norm": 0.036881718784570694, + "learning_rate": 3.86541463490876e-06, + "loss": 0.0031, + "step": 36960 + }, + { + "epoch": 0.73924, + "grad_norm": 0.04137767478823662, + "learning_rate": 3.86431202790116e-06, + "loss": 0.0012, + "step": 36962 + }, + { + "epoch": 0.73928, + "grad_norm": 0.3380156457424164, + "learning_rate": 3.863209540511964e-06, + "loss": 0.0042, + "step": 36964 + }, + { + "epoch": 0.73932, + "grad_norm": 0.159647136926651, + "learning_rate": 3.862107172762653e-06, + "loss": 0.0024, + "step": 36966 + }, + { + "epoch": 0.73936, + "grad_norm": 0.01804082654416561, + "learning_rate": 3.861004924674724e-06, + "loss": 0.0046, + "step": 36968 + }, + { + "epoch": 0.7394, + "grad_norm": 19.546220779418945, + "learning_rate": 3.859902796269664e-06, + "loss": 0.3011, + "step": 36970 + }, + { + "epoch": 0.73944, + "grad_norm": 0.017992934212088585, + "learning_rate": 3.858800787568961e-06, + "loss": 0.0029, + "step": 36972 + }, + { + "epoch": 0.73948, + "grad_norm": 0.6673968434333801, + "learning_rate": 3.8576988985940974e-06, + "loss": 0.0364, + "step": 36974 + }, + { + "epoch": 0.73952, + "grad_norm": 1.0233354568481445, + "learning_rate": 3.856597129366556e-06, + "loss": 0.0156, + "step": 36976 + }, + { + "epoch": 0.73956, + "grad_norm": 12.294342994689941, + "learning_rate": 3.855495479907816e-06, + "loss": 0.2812, + "step": 36978 + }, + { + "epoch": 0.7396, + "grad_norm": 0.012027356773614883, + "learning_rate": 3.854393950239356e-06, + "loss": 0.0007, + "step": 36980 + }, + { + "epoch": 0.73964, + "grad_norm": 0.2664031982421875, + "learning_rate": 3.853292540382652e-06, + "loss": 0.044, + "step": 36982 + }, + { + "epoch": 0.73968, + "grad_norm": 0.006112410221248865, + "learning_rate": 3.852191250359168e-06, + "loss": 0.0009, + "step": 36984 + }, + { + "epoch": 0.73972, + "grad_norm": 0.019428232684731483, + "learning_rate": 3.8510900801903875e-06, + "loss": 0.0014, + "step": 36986 + }, + { + "epoch": 0.73976, + "grad_norm": 0.17096617817878723, + "learning_rate": 3.849989029897768e-06, + "loss": 0.0029, + "step": 36988 + }, + { + "epoch": 0.7398, + "grad_norm": 0.06348001956939697, + "learning_rate": 3.848888099502779e-06, + "loss": 0.0009, + "step": 36990 + }, + { + "epoch": 0.73984, + "grad_norm": 0.021814914420247078, + "learning_rate": 3.847787289026886e-06, + "loss": 0.0003, + "step": 36992 + }, + { + "epoch": 0.73988, + "grad_norm": 3.854985237121582, + "learning_rate": 3.8466865984915415e-06, + "loss": 0.0534, + "step": 36994 + }, + { + "epoch": 0.73992, + "grad_norm": 0.2574734091758728, + "learning_rate": 3.845586027918215e-06, + "loss": 0.0029, + "step": 36996 + }, + { + "epoch": 0.73996, + "grad_norm": 0.15419773757457733, + "learning_rate": 3.844485577328355e-06, + "loss": 0.0312, + "step": 36998 + }, + { + "epoch": 0.74, + "grad_norm": 0.014632870443165302, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.0036, + "step": 37000 + }, + { + "epoch": 0.74004, + "grad_norm": 0.2613840103149414, + "learning_rate": 3.842285036184855e-06, + "loss": 0.0028, + "step": 37002 + }, + { + "epoch": 0.74008, + "grad_norm": 0.17807398736476898, + "learning_rate": 3.841184945674114e-06, + "loss": 0.0733, + "step": 37004 + }, + { + "epoch": 0.74012, + "grad_norm": 0.10689588636159897, + "learning_rate": 3.840084975232649e-06, + "loss": 0.0145, + "step": 37006 + }, + { + "epoch": 0.74016, + "grad_norm": 1.1840450763702393, + "learning_rate": 3.838985124881894e-06, + "loss": 0.0109, + "step": 37008 + }, + { + "epoch": 0.7402, + "grad_norm": 0.6925340890884399, + "learning_rate": 3.8378853946432956e-06, + "loss": 0.008, + "step": 37010 + }, + { + "epoch": 0.74024, + "grad_norm": 0.1182493269443512, + "learning_rate": 3.836785784538295e-06, + "loss": 0.0015, + "step": 37012 + }, + { + "epoch": 0.74028, + "grad_norm": 0.14288491010665894, + "learning_rate": 3.835686294588332e-06, + "loss": 0.0019, + "step": 37014 + }, + { + "epoch": 0.74032, + "grad_norm": 0.05030154064297676, + "learning_rate": 3.8345869248148305e-06, + "loss": 0.0035, + "step": 37016 + }, + { + "epoch": 0.74036, + "grad_norm": 8.863909721374512, + "learning_rate": 3.833487675239238e-06, + "loss": 0.1112, + "step": 37018 + }, + { + "epoch": 0.7404, + "grad_norm": 0.03357205167412758, + "learning_rate": 3.832388545882975e-06, + "loss": 0.0005, + "step": 37020 + }, + { + "epoch": 0.74044, + "grad_norm": 0.0006922458996996284, + "learning_rate": 3.831289536767473e-06, + "loss": 0.0009, + "step": 37022 + }, + { + "epoch": 0.74048, + "grad_norm": 0.6634942889213562, + "learning_rate": 3.830190647914157e-06, + "loss": 0.0067, + "step": 37024 + }, + { + "epoch": 0.74052, + "grad_norm": 0.6131115555763245, + "learning_rate": 3.8290918793444495e-06, + "loss": 0.0245, + "step": 37026 + }, + { + "epoch": 0.74056, + "grad_norm": 0.1566074937582016, + "learning_rate": 3.827993231079779e-06, + "loss": 0.0043, + "step": 37028 + }, + { + "epoch": 0.7406, + "grad_norm": 0.6094681620597839, + "learning_rate": 3.826894703141552e-06, + "loss": 0.0074, + "step": 37030 + }, + { + "epoch": 0.74064, + "grad_norm": 0.37059906125068665, + "learning_rate": 3.825796295551192e-06, + "loss": 0.5923, + "step": 37032 + }, + { + "epoch": 0.74068, + "grad_norm": 0.04346172511577606, + "learning_rate": 3.824698008330111e-06, + "loss": 0.0023, + "step": 37034 + }, + { + "epoch": 0.74072, + "grad_norm": 0.0443565808236599, + "learning_rate": 3.823599841499722e-06, + "loss": 0.0008, + "step": 37036 + }, + { + "epoch": 0.74076, + "grad_norm": 0.05461861938238144, + "learning_rate": 3.822501795081435e-06, + "loss": 0.0014, + "step": 37038 + }, + { + "epoch": 0.7408, + "grad_norm": 0.517964243888855, + "learning_rate": 3.821403869096658e-06, + "loss": 0.0045, + "step": 37040 + }, + { + "epoch": 0.74084, + "grad_norm": 0.5206900238990784, + "learning_rate": 3.820306063566791e-06, + "loss": 0.0083, + "step": 37042 + }, + { + "epoch": 0.74088, + "grad_norm": 0.054449278861284256, + "learning_rate": 3.819208378513238e-06, + "loss": 0.001, + "step": 37044 + }, + { + "epoch": 0.74092, + "grad_norm": 0.5296640396118164, + "learning_rate": 3.818110813957404e-06, + "loss": 0.0131, + "step": 37046 + }, + { + "epoch": 0.74096, + "grad_norm": 0.023139867931604385, + "learning_rate": 3.817013369920675e-06, + "loss": 0.0007, + "step": 37048 + }, + { + "epoch": 0.741, + "grad_norm": 0.11450035870075226, + "learning_rate": 3.81591604642446e-06, + "loss": 0.0018, + "step": 37050 + }, + { + "epoch": 0.74104, + "grad_norm": 0.043797582387924194, + "learning_rate": 3.81481884349014e-06, + "loss": 0.0062, + "step": 37052 + }, + { + "epoch": 0.74108, + "grad_norm": 0.012780791148543358, + "learning_rate": 3.8137217611391176e-06, + "loss": 0.0016, + "step": 37054 + }, + { + "epoch": 0.74112, + "grad_norm": 0.011874282732605934, + "learning_rate": 3.81262479939277e-06, + "loss": 0.0045, + "step": 37056 + }, + { + "epoch": 0.74116, + "grad_norm": 8.356731414794922, + "learning_rate": 3.8115279582724874e-06, + "loss": 0.0796, + "step": 37058 + }, + { + "epoch": 0.7412, + "grad_norm": 0.033971693366765976, + "learning_rate": 3.810431237799657e-06, + "loss": 0.0035, + "step": 37060 + }, + { + "epoch": 0.74124, + "grad_norm": 0.00014759314944967628, + "learning_rate": 3.809334637995653e-06, + "loss": 0.0008, + "step": 37062 + }, + { + "epoch": 0.74128, + "grad_norm": 10.092412948608398, + "learning_rate": 3.8082381588818574e-06, + "loss": 0.1791, + "step": 37064 + }, + { + "epoch": 0.74132, + "grad_norm": 0.011808822862803936, + "learning_rate": 3.8071418004796468e-06, + "loss": 0.0058, + "step": 37066 + }, + { + "epoch": 0.74136, + "grad_norm": 0.16648198664188385, + "learning_rate": 3.806045562810394e-06, + "loss": 0.0118, + "step": 37068 + }, + { + "epoch": 0.7414, + "grad_norm": 0.45787709951400757, + "learning_rate": 3.804949445895473e-06, + "loss": 0.0036, + "step": 37070 + }, + { + "epoch": 0.74144, + "grad_norm": 1.377711534500122, + "learning_rate": 3.8038534497562553e-06, + "loss": 0.0165, + "step": 37072 + }, + { + "epoch": 0.74148, + "grad_norm": 0.0061847432516515255, + "learning_rate": 3.802757574414101e-06, + "loss": 0.0149, + "step": 37074 + }, + { + "epoch": 0.74152, + "grad_norm": 0.10662473738193512, + "learning_rate": 3.801661819890378e-06, + "loss": 0.0034, + "step": 37076 + }, + { + "epoch": 0.74156, + "grad_norm": 0.008786111138761044, + "learning_rate": 3.8005661862064525e-06, + "loss": 0.0014, + "step": 37078 + }, + { + "epoch": 0.7416, + "grad_norm": 0.11345613747835159, + "learning_rate": 3.7994706733836738e-06, + "loss": 0.1907, + "step": 37080 + }, + { + "epoch": 0.74164, + "grad_norm": 0.13876090943813324, + "learning_rate": 3.7983752814434136e-06, + "loss": 0.0019, + "step": 37082 + }, + { + "epoch": 0.74168, + "grad_norm": 0.0015076813288033009, + "learning_rate": 3.797280010407014e-06, + "loss": 0.0044, + "step": 37084 + }, + { + "epoch": 0.74172, + "grad_norm": 0.030128709971904755, + "learning_rate": 3.7961848602958397e-06, + "loss": 0.0021, + "step": 37086 + }, + { + "epoch": 0.74176, + "grad_norm": 0.7665963768959045, + "learning_rate": 3.7950898311312325e-06, + "loss": 0.0089, + "step": 37088 + }, + { + "epoch": 0.7418, + "grad_norm": 0.13990731537342072, + "learning_rate": 3.793994922934544e-06, + "loss": 0.0019, + "step": 37090 + }, + { + "epoch": 0.74184, + "grad_norm": 0.0943998321890831, + "learning_rate": 3.792900135727122e-06, + "loss": 0.0011, + "step": 37092 + }, + { + "epoch": 0.74188, + "grad_norm": 0.015601537190377712, + "learning_rate": 3.7918054695303054e-06, + "loss": 0.0169, + "step": 37094 + }, + { + "epoch": 0.74192, + "grad_norm": 0.0022477535530924797, + "learning_rate": 3.790710924365436e-06, + "loss": 0.0005, + "step": 37096 + }, + { + "epoch": 0.74196, + "grad_norm": 0.25091472268104553, + "learning_rate": 3.7896165002538543e-06, + "loss": 0.0021, + "step": 37098 + }, + { + "epoch": 0.742, + "grad_norm": 0.2108025848865509, + "learning_rate": 3.7885221972168974e-06, + "loss": 0.0025, + "step": 37100 + }, + { + "epoch": 0.74204, + "grad_norm": 0.055564116686582565, + "learning_rate": 3.787428015275897e-06, + "loss": 0.0005, + "step": 37102 + }, + { + "epoch": 0.74208, + "grad_norm": 0.2777080833911896, + "learning_rate": 3.786333954452189e-06, + "loss": 0.0036, + "step": 37104 + }, + { + "epoch": 0.74212, + "grad_norm": 0.0012838824186474085, + "learning_rate": 3.7852400147670932e-06, + "loss": 0.0004, + "step": 37106 + }, + { + "epoch": 0.74216, + "grad_norm": 0.49313414096832275, + "learning_rate": 3.78414619624195e-06, + "loss": 0.005, + "step": 37108 + }, + { + "epoch": 0.7422, + "grad_norm": 2.9069297313690186, + "learning_rate": 3.783052498898073e-06, + "loss": 0.0337, + "step": 37110 + }, + { + "epoch": 0.74224, + "grad_norm": 0.3530168831348419, + "learning_rate": 3.7819589227567886e-06, + "loss": 0.0046, + "step": 37112 + }, + { + "epoch": 0.74228, + "grad_norm": 0.46879592537879944, + "learning_rate": 3.7808654678394197e-06, + "loss": 0.0101, + "step": 37114 + }, + { + "epoch": 0.74232, + "grad_norm": 0.00634798314422369, + "learning_rate": 3.7797721341672735e-06, + "loss": 0.0006, + "step": 37116 + }, + { + "epoch": 0.74236, + "grad_norm": 0.03593284264206886, + "learning_rate": 3.7786789217616783e-06, + "loss": 0.0026, + "step": 37118 + }, + { + "epoch": 0.7424, + "grad_norm": 1.5916515588760376, + "learning_rate": 3.7775858306439374e-06, + "loss": 0.0107, + "step": 37120 + }, + { + "epoch": 0.74244, + "grad_norm": 0.029705604538321495, + "learning_rate": 3.7764928608353635e-06, + "loss": 0.0008, + "step": 37122 + }, + { + "epoch": 0.74248, + "grad_norm": 1.092126727104187, + "learning_rate": 3.7754000123572666e-06, + "loss": 0.0481, + "step": 37124 + }, + { + "epoch": 0.74252, + "grad_norm": 0.007162534631788731, + "learning_rate": 3.7743072852309504e-06, + "loss": 0.0001, + "step": 37126 + }, + { + "epoch": 0.74256, + "grad_norm": 0.24561016261577606, + "learning_rate": 3.7732146794777225e-06, + "loss": 0.3203, + "step": 37128 + }, + { + "epoch": 0.7426, + "grad_norm": 0.15024785697460175, + "learning_rate": 3.772122195118877e-06, + "loss": 0.0069, + "step": 37130 + }, + { + "epoch": 0.74264, + "grad_norm": 0.08640042692422867, + "learning_rate": 3.7710298321757142e-06, + "loss": 0.0054, + "step": 37132 + }, + { + "epoch": 0.74268, + "grad_norm": 0.0009193642763420939, + "learning_rate": 3.7699375906695333e-06, + "loss": 0.0035, + "step": 37134 + }, + { + "epoch": 0.74272, + "grad_norm": 1.982927918434143, + "learning_rate": 3.7688454706216292e-06, + "loss": 0.0505, + "step": 37136 + }, + { + "epoch": 0.74276, + "grad_norm": 1.5510486364364624, + "learning_rate": 3.767753472053285e-06, + "loss": 0.015, + "step": 37138 + }, + { + "epoch": 0.7428, + "grad_norm": 0.003934736829251051, + "learning_rate": 3.766661594985801e-06, + "loss": 0.0012, + "step": 37140 + }, + { + "epoch": 0.74284, + "grad_norm": 0.502759575843811, + "learning_rate": 3.765569839440455e-06, + "loss": 0.0039, + "step": 37142 + }, + { + "epoch": 0.74288, + "grad_norm": 0.06746059656143188, + "learning_rate": 3.7644782054385354e-06, + "loss": 0.0275, + "step": 37144 + }, + { + "epoch": 0.74292, + "grad_norm": 2.4788577556610107, + "learning_rate": 3.763386693001326e-06, + "loss": 0.0217, + "step": 37146 + }, + { + "epoch": 0.74296, + "grad_norm": 0.044798847287893295, + "learning_rate": 3.7622953021500973e-06, + "loss": 0.0084, + "step": 37148 + }, + { + "epoch": 0.743, + "grad_norm": 0.00025423927581869066, + "learning_rate": 3.7612040329061405e-06, + "loss": 0.0015, + "step": 37150 + }, + { + "epoch": 0.74304, + "grad_norm": 0.10324070602655411, + "learning_rate": 3.760112885290719e-06, + "loss": 0.0017, + "step": 37152 + }, + { + "epoch": 0.74308, + "grad_norm": 18.251020431518555, + "learning_rate": 3.759021859325109e-06, + "loss": 0.533, + "step": 37154 + }, + { + "epoch": 0.74312, + "grad_norm": 3.096472978591919, + "learning_rate": 3.757930955030582e-06, + "loss": 0.0292, + "step": 37156 + }, + { + "epoch": 0.74316, + "grad_norm": 0.06896321475505829, + "learning_rate": 3.756840172428404e-06, + "loss": 0.001, + "step": 37158 + }, + { + "epoch": 0.7432, + "grad_norm": 0.3386240005493164, + "learning_rate": 3.7557495115398446e-06, + "loss": 0.0331, + "step": 37160 + }, + { + "epoch": 0.74324, + "grad_norm": 0.005600091069936752, + "learning_rate": 3.7546589723861594e-06, + "loss": 0.0149, + "step": 37162 + }, + { + "epoch": 0.74328, + "grad_norm": 0.02306092530488968, + "learning_rate": 3.7535685549886137e-06, + "loss": 0.0015, + "step": 37164 + }, + { + "epoch": 0.74332, + "grad_norm": 0.013635436072945595, + "learning_rate": 3.752478259368464e-06, + "loss": 0.002, + "step": 37166 + }, + { + "epoch": 0.74336, + "grad_norm": 0.0017647088970988989, + "learning_rate": 3.7513880855469708e-06, + "loss": 0.0006, + "step": 37168 + }, + { + "epoch": 0.7434, + "grad_norm": 0.7868532538414001, + "learning_rate": 3.7502980335453777e-06, + "loss": 0.0098, + "step": 37170 + }, + { + "epoch": 0.74344, + "grad_norm": 0.016499247401952744, + "learning_rate": 3.749208103384948e-06, + "loss": 0.0002, + "step": 37172 + }, + { + "epoch": 0.74348, + "grad_norm": 0.3183657228946686, + "learning_rate": 3.7481182950869176e-06, + "loss": 0.0021, + "step": 37174 + }, + { + "epoch": 0.74352, + "grad_norm": 0.024492286145687103, + "learning_rate": 3.747028608672547e-06, + "loss": 0.0003, + "step": 37176 + }, + { + "epoch": 0.74356, + "grad_norm": 0.0007659525726921856, + "learning_rate": 3.7459390441630693e-06, + "loss": 0.1264, + "step": 37178 + }, + { + "epoch": 0.7436, + "grad_norm": 0.20539714395999908, + "learning_rate": 3.7448496015797296e-06, + "loss": 0.0031, + "step": 37180 + }, + { + "epoch": 0.74364, + "grad_norm": 0.03434257209300995, + "learning_rate": 3.7437602809437713e-06, + "loss": 0.0003, + "step": 37182 + }, + { + "epoch": 0.74368, + "grad_norm": 0.08047926425933838, + "learning_rate": 3.7426710822764234e-06, + "loss": 0.0308, + "step": 37184 + }, + { + "epoch": 0.74372, + "grad_norm": 0.2913214862346649, + "learning_rate": 3.741582005598924e-06, + "loss": 0.0032, + "step": 37186 + }, + { + "epoch": 0.74376, + "grad_norm": 0.014682562090456486, + "learning_rate": 3.7404930509325054e-06, + "loss": 0.0003, + "step": 37188 + }, + { + "epoch": 0.7438, + "grad_norm": 0.4748970866203308, + "learning_rate": 3.7394042182983983e-06, + "loss": 0.005, + "step": 37190 + }, + { + "epoch": 0.74384, + "grad_norm": 0.02698575146496296, + "learning_rate": 3.738315507717828e-06, + "loss": 0.0013, + "step": 37192 + }, + { + "epoch": 0.74388, + "grad_norm": 0.001016833819448948, + "learning_rate": 3.7372269192120245e-06, + "loss": 0.004, + "step": 37194 + }, + { + "epoch": 0.74392, + "grad_norm": 0.07212288677692413, + "learning_rate": 3.7361384528022027e-06, + "loss": 0.2044, + "step": 37196 + }, + { + "epoch": 0.74396, + "grad_norm": 0.042950138449668884, + "learning_rate": 3.735050108509588e-06, + "loss": 0.0004, + "step": 37198 + }, + { + "epoch": 0.744, + "grad_norm": 0.0545138381421566, + "learning_rate": 3.7339618863553983e-06, + "loss": 0.0026, + "step": 37200 + }, + { + "epoch": 0.74404, + "grad_norm": 12.325044631958008, + "learning_rate": 3.7328737863608422e-06, + "loss": 0.1528, + "step": 37202 + }, + { + "epoch": 0.74408, + "grad_norm": 0.0018318616785109043, + "learning_rate": 3.731785808547145e-06, + "loss": 0.0072, + "step": 37204 + }, + { + "epoch": 0.74412, + "grad_norm": 0.4658692181110382, + "learning_rate": 3.7306979529355025e-06, + "loss": 0.0032, + "step": 37206 + }, + { + "epoch": 0.74416, + "grad_norm": 0.1311798095703125, + "learning_rate": 3.7296102195471394e-06, + "loss": 0.1509, + "step": 37208 + }, + { + "epoch": 0.7442, + "grad_norm": 0.2475178986787796, + "learning_rate": 3.728522608403249e-06, + "loss": 0.0022, + "step": 37210 + }, + { + "epoch": 0.74424, + "grad_norm": 1.9699668884277344, + "learning_rate": 3.727435119525039e-06, + "loss": 0.0168, + "step": 37212 + }, + { + "epoch": 0.74428, + "grad_norm": 0.03035101480782032, + "learning_rate": 3.7263477529337143e-06, + "loss": 0.138, + "step": 37214 + }, + { + "epoch": 0.74432, + "grad_norm": 0.22442203760147095, + "learning_rate": 3.7252605086504633e-06, + "loss": 0.0113, + "step": 37216 + }, + { + "epoch": 0.74436, + "grad_norm": 0.030860206112265587, + "learning_rate": 3.724173386696496e-06, + "loss": 0.0003, + "step": 37218 + }, + { + "epoch": 0.7444, + "grad_norm": 18.171430587768555, + "learning_rate": 3.723086387092997e-06, + "loss": 0.2515, + "step": 37220 + }, + { + "epoch": 0.74444, + "grad_norm": 0.0018290199805051088, + "learning_rate": 3.7219995098611584e-06, + "loss": 0.001, + "step": 37222 + }, + { + "epoch": 0.74448, + "grad_norm": 0.0008730236440896988, + "learning_rate": 3.7209127550221737e-06, + "loss": 0.0044, + "step": 37224 + }, + { + "epoch": 0.74452, + "grad_norm": 4.274622917175293, + "learning_rate": 3.71982612259723e-06, + "loss": 0.0505, + "step": 37226 + }, + { + "epoch": 0.74456, + "grad_norm": 0.01574675925076008, + "learning_rate": 3.718739612607505e-06, + "loss": 0.0001, + "step": 37228 + }, + { + "epoch": 0.7446, + "grad_norm": 0.06102366745471954, + "learning_rate": 3.7176532250741857e-06, + "loss": 0.0005, + "step": 37230 + }, + { + "epoch": 0.74464, + "grad_norm": 22.0614013671875, + "learning_rate": 3.716566960018452e-06, + "loss": 0.2042, + "step": 37232 + }, + { + "epoch": 0.74468, + "grad_norm": 0.012235701084136963, + "learning_rate": 3.715480817461479e-06, + "loss": 0.0002, + "step": 37234 + }, + { + "epoch": 0.74472, + "grad_norm": 0.031367745250463486, + "learning_rate": 3.7143947974244466e-06, + "loss": 0.001, + "step": 37236 + }, + { + "epoch": 0.74476, + "grad_norm": 0.058858346194028854, + "learning_rate": 3.7133088999285174e-06, + "loss": 0.0006, + "step": 37238 + }, + { + "epoch": 0.7448, + "grad_norm": 0.054843854159116745, + "learning_rate": 3.7122231249948747e-06, + "loss": 0.0016, + "step": 37240 + }, + { + "epoch": 0.74484, + "grad_norm": 0.2842358648777008, + "learning_rate": 3.711137472644676e-06, + "loss": 0.0025, + "step": 37242 + }, + { + "epoch": 0.74488, + "grad_norm": 0.14249858260154724, + "learning_rate": 3.710051942899089e-06, + "loss": 0.0054, + "step": 37244 + }, + { + "epoch": 0.74492, + "grad_norm": 1.1833070516586304, + "learning_rate": 3.708966535779278e-06, + "loss": 0.0113, + "step": 37246 + }, + { + "epoch": 0.74496, + "grad_norm": 4.5456953048706055, + "learning_rate": 3.7078812513064044e-06, + "loss": 0.0431, + "step": 37248 + }, + { + "epoch": 0.745, + "grad_norm": 0.03031904250383377, + "learning_rate": 3.7067960895016277e-06, + "loss": 0.0964, + "step": 37250 + }, + { + "epoch": 0.74504, + "grad_norm": 0.14770863950252533, + "learning_rate": 3.7057110503860984e-06, + "loss": 0.0028, + "step": 37252 + }, + { + "epoch": 0.74508, + "grad_norm": 0.07017827033996582, + "learning_rate": 3.704626133980972e-06, + "loss": 0.0014, + "step": 37254 + }, + { + "epoch": 0.74512, + "grad_norm": 0.0002878528612200171, + "learning_rate": 3.7035413403073995e-06, + "loss": 0.0014, + "step": 37256 + }, + { + "epoch": 0.74516, + "grad_norm": 0.9112129211425781, + "learning_rate": 3.702456669386535e-06, + "loss": 0.0057, + "step": 37258 + }, + { + "epoch": 0.7452, + "grad_norm": 0.015457157045602798, + "learning_rate": 3.7013721212395128e-06, + "loss": 0.0001, + "step": 37260 + }, + { + "epoch": 0.74524, + "grad_norm": 0.15641577541828156, + "learning_rate": 3.70028769588749e-06, + "loss": 0.0023, + "step": 37262 + }, + { + "epoch": 0.74528, + "grad_norm": 0.02528415247797966, + "learning_rate": 3.699203393351599e-06, + "loss": 0.0005, + "step": 37264 + }, + { + "epoch": 0.74532, + "grad_norm": 0.017068738117814064, + "learning_rate": 3.698119213652982e-06, + "loss": 0.0065, + "step": 37266 + }, + { + "epoch": 0.74536, + "grad_norm": 0.00459112785756588, + "learning_rate": 3.697035156812778e-06, + "loss": 0.0024, + "step": 37268 + }, + { + "epoch": 0.7454, + "grad_norm": 0.13196201622486115, + "learning_rate": 3.6959512228521123e-06, + "loss": 0.0432, + "step": 37270 + }, + { + "epoch": 0.74544, + "grad_norm": 0.21912981569766998, + "learning_rate": 3.694867411792129e-06, + "loss": 0.0025, + "step": 37272 + }, + { + "epoch": 0.74548, + "grad_norm": 0.2784512937068939, + "learning_rate": 3.6937837236539497e-06, + "loss": 0.0032, + "step": 37274 + }, + { + "epoch": 0.74552, + "grad_norm": 0.04292463883757591, + "learning_rate": 3.692700158458702e-06, + "loss": 0.0007, + "step": 37276 + }, + { + "epoch": 0.74556, + "grad_norm": 0.0020774647127836943, + "learning_rate": 3.691616716227513e-06, + "loss": 0.0001, + "step": 37278 + }, + { + "epoch": 0.7456, + "grad_norm": 0.004868479911237955, + "learning_rate": 3.6905333969815038e-06, + "loss": 0.0012, + "step": 37280 + }, + { + "epoch": 0.74564, + "grad_norm": 0.031343623995780945, + "learning_rate": 3.6894502007417965e-06, + "loss": 0.0016, + "step": 37282 + }, + { + "epoch": 0.74568, + "grad_norm": 0.14283506572246552, + "learning_rate": 3.688367127529504e-06, + "loss": 0.0017, + "step": 37284 + }, + { + "epoch": 0.74572, + "grad_norm": 0.009802279062569141, + "learning_rate": 3.687284177365744e-06, + "loss": 0.0003, + "step": 37286 + }, + { + "epoch": 0.74576, + "grad_norm": 2.982943534851074, + "learning_rate": 3.6862013502716286e-06, + "loss": 0.0226, + "step": 37288 + }, + { + "epoch": 0.7458, + "grad_norm": 0.004722327925264835, + "learning_rate": 3.685118646268272e-06, + "loss": 0.0001, + "step": 37290 + }, + { + "epoch": 0.74584, + "grad_norm": 0.007412976585328579, + "learning_rate": 3.6840360653767713e-06, + "loss": 0.0007, + "step": 37292 + }, + { + "epoch": 0.74588, + "grad_norm": 0.3129328191280365, + "learning_rate": 3.682953607618247e-06, + "loss": 0.0035, + "step": 37294 + }, + { + "epoch": 0.74592, + "grad_norm": 20.152799606323242, + "learning_rate": 3.6818712730137894e-06, + "loss": 0.5684, + "step": 37296 + }, + { + "epoch": 0.74596, + "grad_norm": 0.1373240351676941, + "learning_rate": 3.6807890615845053e-06, + "loss": 0.0015, + "step": 37298 + }, + { + "epoch": 0.746, + "grad_norm": 0.023103700950741768, + "learning_rate": 3.679706973351491e-06, + "loss": 0.0006, + "step": 37300 + }, + { + "epoch": 0.74604, + "grad_norm": 0.24267642199993134, + "learning_rate": 3.6786250083358442e-06, + "loss": 0.0031, + "step": 37302 + }, + { + "epoch": 0.74608, + "grad_norm": 1.271437644958496, + "learning_rate": 3.67754316655866e-06, + "loss": 0.0117, + "step": 37304 + }, + { + "epoch": 0.74612, + "grad_norm": 0.4328921437263489, + "learning_rate": 3.67646144804102e-06, + "loss": 0.006, + "step": 37306 + }, + { + "epoch": 0.74616, + "grad_norm": 0.11545508354902267, + "learning_rate": 3.6753798528040276e-06, + "loss": 0.0044, + "step": 37308 + }, + { + "epoch": 0.7462, + "grad_norm": 0.07203338295221329, + "learning_rate": 3.674298380868756e-06, + "loss": 0.002, + "step": 37310 + }, + { + "epoch": 0.74624, + "grad_norm": 2.655513286590576, + "learning_rate": 3.6732170322562954e-06, + "loss": 0.0302, + "step": 37312 + }, + { + "epoch": 0.74628, + "grad_norm": 0.16931743919849396, + "learning_rate": 3.6721358069877256e-06, + "loss": 0.0022, + "step": 37314 + }, + { + "epoch": 0.74632, + "grad_norm": 0.5986286401748657, + "learning_rate": 3.6710547050841284e-06, + "loss": 0.035, + "step": 37316 + }, + { + "epoch": 0.74636, + "grad_norm": 0.028884688392281532, + "learning_rate": 3.669973726566576e-06, + "loss": 0.0016, + "step": 37318 + }, + { + "epoch": 0.7464, + "grad_norm": 0.009295453317463398, + "learning_rate": 3.6688928714561444e-06, + "loss": 0.0025, + "step": 37320 + }, + { + "epoch": 0.74644, + "grad_norm": 0.161848247051239, + "learning_rate": 3.667812139773905e-06, + "loss": 0.0025, + "step": 37322 + }, + { + "epoch": 0.74648, + "grad_norm": 0.3035411834716797, + "learning_rate": 3.666731531540929e-06, + "loss": 0.0047, + "step": 37324 + }, + { + "epoch": 0.74652, + "grad_norm": 0.050743669271469116, + "learning_rate": 3.6656510467782856e-06, + "loss": 0.0015, + "step": 37326 + }, + { + "epoch": 0.74656, + "grad_norm": 0.04393339902162552, + "learning_rate": 3.6645706855070296e-06, + "loss": 0.0008, + "step": 37328 + }, + { + "epoch": 0.7466, + "grad_norm": 0.053769078105688095, + "learning_rate": 3.663490447748236e-06, + "loss": 0.0062, + "step": 37330 + }, + { + "epoch": 0.74664, + "grad_norm": 0.6661229133605957, + "learning_rate": 3.6624103335229543e-06, + "loss": 0.012, + "step": 37332 + }, + { + "epoch": 0.74668, + "grad_norm": 0.13869120180606842, + "learning_rate": 3.6613303428522474e-06, + "loss": 0.0017, + "step": 37334 + }, + { + "epoch": 0.74672, + "grad_norm": 0.07582858949899673, + "learning_rate": 3.660250475757171e-06, + "loss": 0.0009, + "step": 37336 + }, + { + "epoch": 0.74676, + "grad_norm": 0.008409420028328896, + "learning_rate": 3.65917073225877e-06, + "loss": 0.0013, + "step": 37338 + }, + { + "epoch": 0.7468, + "grad_norm": 0.009114986285567284, + "learning_rate": 3.658091112378106e-06, + "loss": 0.0009, + "step": 37340 + }, + { + "epoch": 0.74684, + "grad_norm": 0.21094000339508057, + "learning_rate": 3.657011616136218e-06, + "loss": 0.0025, + "step": 37342 + }, + { + "epoch": 0.74688, + "grad_norm": 0.012277348898351192, + "learning_rate": 3.6559322435541533e-06, + "loss": 0.0026, + "step": 37344 + }, + { + "epoch": 0.74692, + "grad_norm": 0.0031363291200250387, + "learning_rate": 3.6548529946529564e-06, + "loss": 0.0057, + "step": 37346 + }, + { + "epoch": 0.74696, + "grad_norm": 0.001197865349240601, + "learning_rate": 3.6537738694536696e-06, + "loss": 0.0013, + "step": 37348 + }, + { + "epoch": 0.747, + "grad_norm": 0.164690762758255, + "learning_rate": 3.6526948679773256e-06, + "loss": 0.0014, + "step": 37350 + }, + { + "epoch": 0.74704, + "grad_norm": 0.006463205441832542, + "learning_rate": 3.6516159902449633e-06, + "loss": 0.0057, + "step": 37352 + }, + { + "epoch": 0.74708, + "grad_norm": 8.254701614379883, + "learning_rate": 3.650537236277616e-06, + "loss": 0.0875, + "step": 37354 + }, + { + "epoch": 0.74712, + "grad_norm": 0.0002514323277864605, + "learning_rate": 3.6494586060963135e-06, + "loss": 0.0001, + "step": 37356 + }, + { + "epoch": 0.74716, + "grad_norm": 0.004027370363473892, + "learning_rate": 3.6483800997220875e-06, + "loss": 0.0001, + "step": 37358 + }, + { + "epoch": 0.7472, + "grad_norm": 8.573799133300781, + "learning_rate": 3.6473017171759563e-06, + "loss": 0.0968, + "step": 37360 + }, + { + "epoch": 0.74724, + "grad_norm": 0.0033357664942741394, + "learning_rate": 3.6462234584789545e-06, + "loss": 0.0001, + "step": 37362 + }, + { + "epoch": 0.74728, + "grad_norm": 0.057454682886600494, + "learning_rate": 3.645145323652094e-06, + "loss": 0.0023, + "step": 37364 + }, + { + "epoch": 0.74732, + "grad_norm": 0.004082403611391783, + "learning_rate": 3.644067312716397e-06, + "loss": 0.0064, + "step": 37366 + }, + { + "epoch": 0.74736, + "grad_norm": 0.008747431449592113, + "learning_rate": 3.642989425692881e-06, + "loss": 0.0022, + "step": 37368 + }, + { + "epoch": 0.7474, + "grad_norm": 0.0898641049861908, + "learning_rate": 3.6419116626025585e-06, + "loss": 0.0014, + "step": 37370 + }, + { + "epoch": 0.74744, + "grad_norm": 0.011344103142619133, + "learning_rate": 3.640834023466445e-06, + "loss": 0.0045, + "step": 37372 + }, + { + "epoch": 0.74748, + "grad_norm": 0.8914843797683716, + "learning_rate": 3.639756508305543e-06, + "loss": 0.0083, + "step": 37374 + }, + { + "epoch": 0.74752, + "grad_norm": 0.016102246940135956, + "learning_rate": 3.638679117140862e-06, + "loss": 0.0003, + "step": 37376 + }, + { + "epoch": 0.74756, + "grad_norm": 0.0021381275728344917, + "learning_rate": 3.637601849993406e-06, + "loss": 0.0003, + "step": 37378 + }, + { + "epoch": 0.7476, + "grad_norm": 0.016579722985625267, + "learning_rate": 3.636524706884181e-06, + "loss": 0.0015, + "step": 37380 + }, + { + "epoch": 0.74764, + "grad_norm": 0.03265412524342537, + "learning_rate": 3.635447687834177e-06, + "loss": 0.0012, + "step": 37382 + }, + { + "epoch": 0.74768, + "grad_norm": 0.05582933500409126, + "learning_rate": 3.6343707928644033e-06, + "loss": 0.004, + "step": 37384 + }, + { + "epoch": 0.74772, + "grad_norm": 0.08266349881887436, + "learning_rate": 3.633294021995846e-06, + "loss": 0.001, + "step": 37386 + }, + { + "epoch": 0.74776, + "grad_norm": 0.16027139127254486, + "learning_rate": 3.6322173752494984e-06, + "loss": 0.0047, + "step": 37388 + }, + { + "epoch": 0.7478, + "grad_norm": 1.30055832862854, + "learning_rate": 3.6311408526463554e-06, + "loss": 0.016, + "step": 37390 + }, + { + "epoch": 0.74784, + "grad_norm": 0.20078888535499573, + "learning_rate": 3.6300644542073924e-06, + "loss": 0.2359, + "step": 37392 + }, + { + "epoch": 0.74788, + "grad_norm": 0.0036534194368869066, + "learning_rate": 3.62898817995361e-06, + "loss": 0.0004, + "step": 37394 + }, + { + "epoch": 0.74792, + "grad_norm": 0.48302194476127625, + "learning_rate": 3.62791202990598e-06, + "loss": 0.0051, + "step": 37396 + }, + { + "epoch": 0.74796, + "grad_norm": 0.04991447180509567, + "learning_rate": 3.6268360040854846e-06, + "loss": 0.0006, + "step": 37398 + }, + { + "epoch": 0.748, + "grad_norm": 0.026949701830744743, + "learning_rate": 3.625760102513103e-06, + "loss": 0.017, + "step": 37400 + }, + { + "epoch": 0.74804, + "grad_norm": 0.017148064449429512, + "learning_rate": 3.6246843252098096e-06, + "loss": 0.0011, + "step": 37402 + }, + { + "epoch": 0.74808, + "grad_norm": 0.0032952805049717426, + "learning_rate": 3.623608672196581e-06, + "loss": 0.0002, + "step": 37404 + }, + { + "epoch": 0.74812, + "grad_norm": 0.02289682999253273, + "learning_rate": 3.622533143494381e-06, + "loss": 0.0009, + "step": 37406 + }, + { + "epoch": 0.74816, + "grad_norm": 5.513171672821045, + "learning_rate": 3.621457739124181e-06, + "loss": 0.0378, + "step": 37408 + }, + { + "epoch": 0.7482, + "grad_norm": 0.00015009334310889244, + "learning_rate": 3.620382459106946e-06, + "loss": 0.0006, + "step": 37410 + }, + { + "epoch": 0.74824, + "grad_norm": 0.04816358536481857, + "learning_rate": 3.6193073034636393e-06, + "loss": 0.0038, + "step": 37412 + }, + { + "epoch": 0.74828, + "grad_norm": 0.0025727213360369205, + "learning_rate": 3.618232272215222e-06, + "loss": 0.0057, + "step": 37414 + }, + { + "epoch": 0.74832, + "grad_norm": 0.03416856378316879, + "learning_rate": 3.617157365382654e-06, + "loss": 0.9384, + "step": 37416 + }, + { + "epoch": 0.74836, + "grad_norm": 0.014262857846915722, + "learning_rate": 3.616082582986887e-06, + "loss": 0.0003, + "step": 37418 + }, + { + "epoch": 0.7484, + "grad_norm": 1.260925054550171, + "learning_rate": 3.615007925048878e-06, + "loss": 0.0384, + "step": 37420 + }, + { + "epoch": 0.74844, + "grad_norm": 0.03281322121620178, + "learning_rate": 3.6139333915895747e-06, + "loss": 0.0017, + "step": 37422 + }, + { + "epoch": 0.74848, + "grad_norm": 0.18149933218955994, + "learning_rate": 3.612858982629929e-06, + "loss": 0.0023, + "step": 37424 + }, + { + "epoch": 0.74852, + "grad_norm": 0.008338173851370811, + "learning_rate": 3.6117846981908887e-06, + "loss": 0.0025, + "step": 37426 + }, + { + "epoch": 0.74856, + "grad_norm": 0.03440070152282715, + "learning_rate": 3.610710538293387e-06, + "loss": 0.0016, + "step": 37428 + }, + { + "epoch": 0.7486, + "grad_norm": 0.265352725982666, + "learning_rate": 3.6096365029583803e-06, + "loss": 0.0036, + "step": 37430 + }, + { + "epoch": 0.74864, + "grad_norm": 0.004637456499040127, + "learning_rate": 3.6085625922067966e-06, + "loss": 0.0002, + "step": 37432 + }, + { + "epoch": 0.74868, + "grad_norm": 0.06725190579891205, + "learning_rate": 3.6074888060595757e-06, + "loss": 0.0014, + "step": 37434 + }, + { + "epoch": 0.74872, + "grad_norm": 1.9399330615997314, + "learning_rate": 3.606415144537652e-06, + "loss": 0.1323, + "step": 37436 + }, + { + "epoch": 0.74876, + "grad_norm": 0.014423329383134842, + "learning_rate": 3.6053416076619595e-06, + "loss": 0.0008, + "step": 37438 + }, + { + "epoch": 0.7488, + "grad_norm": 1.4135528802871704, + "learning_rate": 3.604268195453421e-06, + "loss": 0.0077, + "step": 37440 + }, + { + "epoch": 0.74884, + "grad_norm": 13.786764144897461, + "learning_rate": 3.6031949079329677e-06, + "loss": 0.1905, + "step": 37442 + }, + { + "epoch": 0.74888, + "grad_norm": 0.19520150125026703, + "learning_rate": 3.602121745121522e-06, + "loss": 0.002, + "step": 37444 + }, + { + "epoch": 0.74892, + "grad_norm": 0.3947261869907379, + "learning_rate": 3.6010487070400067e-06, + "loss": 0.0508, + "step": 37446 + }, + { + "epoch": 0.74896, + "grad_norm": 0.08135099709033966, + "learning_rate": 3.599975793709345e-06, + "loss": 0.0007, + "step": 37448 + }, + { + "epoch": 0.749, + "grad_norm": 0.5357251167297363, + "learning_rate": 3.598903005150444e-06, + "loss": 0.0071, + "step": 37450 + }, + { + "epoch": 0.74904, + "grad_norm": 0.08957285434007645, + "learning_rate": 3.597830341384231e-06, + "loss": 0.0012, + "step": 37452 + }, + { + "epoch": 0.74908, + "grad_norm": 0.1274961233139038, + "learning_rate": 3.5967578024316074e-06, + "loss": 0.0013, + "step": 37454 + }, + { + "epoch": 0.74912, + "grad_norm": 0.009100174531340599, + "learning_rate": 3.595685388313487e-06, + "loss": 0.001, + "step": 37456 + }, + { + "epoch": 0.74916, + "grad_norm": 0.01955522783100605, + "learning_rate": 3.594613099050781e-06, + "loss": 0.0007, + "step": 37458 + }, + { + "epoch": 0.7492, + "grad_norm": 0.060405801981687546, + "learning_rate": 3.5935409346643835e-06, + "loss": 0.0039, + "step": 37460 + }, + { + "epoch": 0.74924, + "grad_norm": 0.292317658662796, + "learning_rate": 3.5924688951752107e-06, + "loss": 0.0028, + "step": 37462 + }, + { + "epoch": 0.74928, + "grad_norm": 0.00928858108818531, + "learning_rate": 3.5913969806041514e-06, + "loss": 0.0006, + "step": 37464 + }, + { + "epoch": 0.74932, + "grad_norm": 0.09187527000904083, + "learning_rate": 3.5903251909721082e-06, + "loss": 0.0031, + "step": 37466 + }, + { + "epoch": 0.74936, + "grad_norm": 0.0818590298295021, + "learning_rate": 3.589253526299975e-06, + "loss": 0.0012, + "step": 37468 + }, + { + "epoch": 0.7494, + "grad_norm": 0.18510222434997559, + "learning_rate": 3.5881819866086485e-06, + "loss": 0.003, + "step": 37470 + }, + { + "epoch": 0.74944, + "grad_norm": 0.029806938022375107, + "learning_rate": 3.5871105719190115e-06, + "loss": 0.0015, + "step": 37472 + }, + { + "epoch": 0.74948, + "grad_norm": 0.008557148277759552, + "learning_rate": 3.5860392822519563e-06, + "loss": 0.0001, + "step": 37474 + }, + { + "epoch": 0.74952, + "grad_norm": 0.004856890998780727, + "learning_rate": 3.5849681176283657e-06, + "loss": 0.0003, + "step": 37476 + }, + { + "epoch": 0.74956, + "grad_norm": 0.3222333788871765, + "learning_rate": 3.5838970780691262e-06, + "loss": 0.0179, + "step": 37478 + }, + { + "epoch": 0.7496, + "grad_norm": 0.07267600297927856, + "learning_rate": 3.582826163595119e-06, + "loss": 0.0009, + "step": 37480 + }, + { + "epoch": 0.74964, + "grad_norm": 0.07631855458021164, + "learning_rate": 3.5817553742272126e-06, + "loss": 0.1764, + "step": 37482 + }, + { + "epoch": 0.74968, + "grad_norm": 13.292688369750977, + "learning_rate": 3.5806847099862964e-06, + "loss": 0.3203, + "step": 37484 + }, + { + "epoch": 0.74972, + "grad_norm": 0.010700932703912258, + "learning_rate": 3.579614170893233e-06, + "loss": 0.0017, + "step": 37486 + }, + { + "epoch": 0.74976, + "grad_norm": 12.748937606811523, + "learning_rate": 3.5785437569688965e-06, + "loss": 0.1768, + "step": 37488 + }, + { + "epoch": 0.7498, + "grad_norm": 0.22698724269866943, + "learning_rate": 3.5774734682341563e-06, + "loss": 0.0038, + "step": 37490 + }, + { + "epoch": 0.74984, + "grad_norm": 0.11003106087446213, + "learning_rate": 3.5764033047098766e-06, + "loss": 0.0011, + "step": 37492 + }, + { + "epoch": 0.74988, + "grad_norm": 0.10892976820468903, + "learning_rate": 3.575333266416926e-06, + "loss": 0.4698, + "step": 37494 + }, + { + "epoch": 0.74992, + "grad_norm": 0.1434694528579712, + "learning_rate": 3.574263353376156e-06, + "loss": 0.0027, + "step": 37496 + }, + { + "epoch": 0.74996, + "grad_norm": 0.569466769695282, + "learning_rate": 3.5731935656084316e-06, + "loss": 0.0046, + "step": 37498 + }, + { + "epoch": 0.75, + "grad_norm": 0.36763373017311096, + "learning_rate": 3.5721239031346067e-06, + "loss": 0.0037, + "step": 37500 + }, + { + "epoch": 0.75004, + "grad_norm": 0.01607009768486023, + "learning_rate": 3.5710543659755392e-06, + "loss": 0.0006, + "step": 37502 + }, + { + "epoch": 0.75008, + "grad_norm": 0.20837071537971497, + "learning_rate": 3.5699849541520703e-06, + "loss": 0.0024, + "step": 37504 + }, + { + "epoch": 0.75012, + "grad_norm": 0.03410451486706734, + "learning_rate": 3.568915667685061e-06, + "loss": 0.2507, + "step": 37506 + }, + { + "epoch": 0.75016, + "grad_norm": 2.9355170726776123, + "learning_rate": 3.5678465065953493e-06, + "loss": 0.0239, + "step": 37508 + }, + { + "epoch": 0.7502, + "grad_norm": 1.4717992544174194, + "learning_rate": 3.5667774709037804e-06, + "loss": 0.0198, + "step": 37510 + }, + { + "epoch": 0.75024, + "grad_norm": 17.434707641601562, + "learning_rate": 3.5657085606312015e-06, + "loss": 0.4493, + "step": 37512 + }, + { + "epoch": 0.75028, + "grad_norm": 0.036864012479782104, + "learning_rate": 3.5646397757984384e-06, + "loss": 0.0007, + "step": 37514 + }, + { + "epoch": 0.75032, + "grad_norm": 0.05561945214867592, + "learning_rate": 3.563571116426344e-06, + "loss": 0.0006, + "step": 37516 + }, + { + "epoch": 0.75036, + "grad_norm": 0.016456007957458496, + "learning_rate": 3.5625025825357374e-06, + "loss": 0.0009, + "step": 37518 + }, + { + "epoch": 0.7504, + "grad_norm": 0.05419864505529404, + "learning_rate": 3.5614341741474633e-06, + "loss": 0.0013, + "step": 37520 + }, + { + "epoch": 0.75044, + "grad_norm": 0.029927335679531097, + "learning_rate": 3.5603658912823426e-06, + "loss": 0.0007, + "step": 37522 + }, + { + "epoch": 0.75048, + "grad_norm": 1.3230620622634888, + "learning_rate": 3.559297733961202e-06, + "loss": 0.0163, + "step": 37524 + }, + { + "epoch": 0.75052, + "grad_norm": 0.019686074927449226, + "learning_rate": 3.558229702204874e-06, + "loss": 0.0874, + "step": 37526 + }, + { + "epoch": 0.75056, + "grad_norm": 0.07020935416221619, + "learning_rate": 3.5571617960341696e-06, + "loss": 0.001, + "step": 37528 + }, + { + "epoch": 0.7506, + "grad_norm": 0.06424737721681595, + "learning_rate": 3.5560940154699133e-06, + "loss": 0.0024, + "step": 37530 + }, + { + "epoch": 0.75064, + "grad_norm": 0.05206729844212532, + "learning_rate": 3.5550263605329215e-06, + "loss": 0.072, + "step": 37532 + }, + { + "epoch": 0.75068, + "grad_norm": 10.584309577941895, + "learning_rate": 3.5539588312440086e-06, + "loss": 0.1273, + "step": 37534 + }, + { + "epoch": 0.75072, + "grad_norm": 0.5519432425498962, + "learning_rate": 3.552891427623988e-06, + "loss": 0.0059, + "step": 37536 + }, + { + "epoch": 0.75076, + "grad_norm": 0.03731250762939453, + "learning_rate": 3.551824149693671e-06, + "loss": 0.001, + "step": 37538 + }, + { + "epoch": 0.7508, + "grad_norm": 0.00551648112013936, + "learning_rate": 3.5507569974738575e-06, + "loss": 0.0149, + "step": 37540 + }, + { + "epoch": 0.75084, + "grad_norm": 0.024900078773498535, + "learning_rate": 3.5496899709853584e-06, + "loss": 0.0004, + "step": 37542 + }, + { + "epoch": 0.75088, + "grad_norm": 0.01107246894389391, + "learning_rate": 3.5486230702489767e-06, + "loss": 0.0005, + "step": 37544 + }, + { + "epoch": 0.75092, + "grad_norm": 0.04443622753024101, + "learning_rate": 3.5475562952855023e-06, + "loss": 0.0005, + "step": 37546 + }, + { + "epoch": 0.75096, + "grad_norm": 0.12037836760282516, + "learning_rate": 3.5464896461157485e-06, + "loss": 0.0021, + "step": 37548 + }, + { + "epoch": 0.751, + "grad_norm": 0.04870326444506645, + "learning_rate": 3.545423122760493e-06, + "loss": 0.001, + "step": 37550 + }, + { + "epoch": 0.75104, + "grad_norm": 0.059736695140600204, + "learning_rate": 3.544356725240544e-06, + "loss": 0.0011, + "step": 37552 + }, + { + "epoch": 0.75108, + "grad_norm": 0.26507800817489624, + "learning_rate": 3.543290453576681e-06, + "loss": 0.0022, + "step": 37554 + }, + { + "epoch": 0.75112, + "grad_norm": 0.07509078085422516, + "learning_rate": 3.5422243077896947e-06, + "loss": 0.0035, + "step": 37556 + }, + { + "epoch": 0.75116, + "grad_norm": 0.12354589998722076, + "learning_rate": 3.5411582879003703e-06, + "loss": 0.1278, + "step": 37558 + }, + { + "epoch": 0.7512, + "grad_norm": 0.1549459993839264, + "learning_rate": 3.540092393929494e-06, + "loss": 0.0091, + "step": 37560 + }, + { + "epoch": 0.75124, + "grad_norm": 0.077096126973629, + "learning_rate": 3.539026625897838e-06, + "loss": 0.0052, + "step": 37562 + }, + { + "epoch": 0.75128, + "grad_norm": 0.08839257061481476, + "learning_rate": 3.537960983826184e-06, + "loss": 0.2844, + "step": 37564 + }, + { + "epoch": 0.75132, + "grad_norm": 0.008728528395295143, + "learning_rate": 3.5368954677353086e-06, + "loss": 0.0003, + "step": 37566 + }, + { + "epoch": 0.75136, + "grad_norm": 0.01001665461808443, + "learning_rate": 3.535830077645983e-06, + "loss": 0.0058, + "step": 37568 + }, + { + "epoch": 0.7514, + "grad_norm": 0.06541306525468826, + "learning_rate": 3.5347648135789823e-06, + "loss": 0.004, + "step": 37570 + }, + { + "epoch": 0.75144, + "grad_norm": 0.022064857184886932, + "learning_rate": 3.5336996755550623e-06, + "loss": 0.0011, + "step": 37572 + }, + { + "epoch": 0.75148, + "grad_norm": 2.724846124649048, + "learning_rate": 3.5326346635950024e-06, + "loss": 0.0157, + "step": 37574 + }, + { + "epoch": 0.75152, + "grad_norm": 0.018042657524347305, + "learning_rate": 3.531569777719558e-06, + "loss": 0.0042, + "step": 37576 + }, + { + "epoch": 0.75156, + "grad_norm": 0.1663220226764679, + "learning_rate": 3.5305050179494894e-06, + "loss": 0.0021, + "step": 37578 + }, + { + "epoch": 0.7516, + "grad_norm": 0.012451454997062683, + "learning_rate": 3.5294403843055604e-06, + "loss": 0.0014, + "step": 37580 + }, + { + "epoch": 0.75164, + "grad_norm": 0.016757024452090263, + "learning_rate": 3.5283758768085142e-06, + "loss": 0.1499, + "step": 37582 + }, + { + "epoch": 0.75168, + "grad_norm": 0.2092302292585373, + "learning_rate": 3.52731149547912e-06, + "loss": 0.0021, + "step": 37584 + }, + { + "epoch": 0.75172, + "grad_norm": 0.018922051414847374, + "learning_rate": 3.5262472403381166e-06, + "loss": 0.0094, + "step": 37586 + }, + { + "epoch": 0.75176, + "grad_norm": 0.3788010776042938, + "learning_rate": 3.5251831114062573e-06, + "loss": 0.0057, + "step": 37588 + }, + { + "epoch": 0.7518, + "grad_norm": 0.26667970418930054, + "learning_rate": 3.524119108704286e-06, + "loss": 0.0186, + "step": 37590 + }, + { + "epoch": 0.75184, + "grad_norm": 0.07944825291633606, + "learning_rate": 3.52305523225295e-06, + "loss": 0.0009, + "step": 37592 + }, + { + "epoch": 0.75188, + "grad_norm": 0.27763310074806213, + "learning_rate": 3.5219914820729826e-06, + "loss": 0.003, + "step": 37594 + }, + { + "epoch": 0.75192, + "grad_norm": 0.0307918731123209, + "learning_rate": 3.520927858185128e-06, + "loss": 0.0649, + "step": 37596 + }, + { + "epoch": 0.75196, + "grad_norm": 0.08398518711328506, + "learning_rate": 3.519864360610119e-06, + "loss": 0.0014, + "step": 37598 + }, + { + "epoch": 0.752, + "grad_norm": 0.01907552033662796, + "learning_rate": 3.5188009893686916e-06, + "loss": 0.0018, + "step": 37600 + }, + { + "epoch": 0.75204, + "grad_norm": 0.6536014080047607, + "learning_rate": 3.5177377444815784e-06, + "loss": 0.0066, + "step": 37602 + }, + { + "epoch": 0.75208, + "grad_norm": 0.12128688395023346, + "learning_rate": 3.5166746259694986e-06, + "loss": 0.0015, + "step": 37604 + }, + { + "epoch": 0.75212, + "grad_norm": 0.007816554978489876, + "learning_rate": 3.515611633853191e-06, + "loss": 0.0015, + "step": 37606 + }, + { + "epoch": 0.75216, + "grad_norm": 1.5737613439559937, + "learning_rate": 3.514548768153371e-06, + "loss": 0.0261, + "step": 37608 + }, + { + "epoch": 0.7522, + "grad_norm": 0.4248290956020355, + "learning_rate": 3.5134860288907602e-06, + "loss": 0.0042, + "step": 37610 + }, + { + "epoch": 0.75224, + "grad_norm": 0.2559052109718323, + "learning_rate": 3.5124234160860825e-06, + "loss": 0.0034, + "step": 37612 + }, + { + "epoch": 0.75228, + "grad_norm": 0.14981725811958313, + "learning_rate": 3.511360929760044e-06, + "loss": 0.0166, + "step": 37614 + }, + { + "epoch": 0.75232, + "grad_norm": 0.16798259317874908, + "learning_rate": 3.5102985699333713e-06, + "loss": 0.0056, + "step": 37616 + }, + { + "epoch": 0.75236, + "grad_norm": 0.02065044455230236, + "learning_rate": 3.5092363366267657e-06, + "loss": 0.0009, + "step": 37618 + }, + { + "epoch": 0.7524, + "grad_norm": 0.225711390376091, + "learning_rate": 3.50817422986094e-06, + "loss": 0.0021, + "step": 37620 + }, + { + "epoch": 0.75244, + "grad_norm": 0.018767032772302628, + "learning_rate": 3.507112249656599e-06, + "loss": 0.0014, + "step": 37622 + }, + { + "epoch": 0.75248, + "grad_norm": 0.210261732339859, + "learning_rate": 3.5060503960344482e-06, + "loss": 0.0047, + "step": 37624 + }, + { + "epoch": 0.75252, + "grad_norm": 0.08115861564874649, + "learning_rate": 3.5049886690151913e-06, + "loss": 0.0014, + "step": 37626 + }, + { + "epoch": 0.75256, + "grad_norm": 0.04769701138138771, + "learning_rate": 3.5039270686195203e-06, + "loss": 0.0005, + "step": 37628 + }, + { + "epoch": 0.7526, + "grad_norm": 0.41187626123428345, + "learning_rate": 3.502865594868136e-06, + "loss": 0.0055, + "step": 37630 + }, + { + "epoch": 0.75264, + "grad_norm": 0.06407048553228378, + "learning_rate": 3.5018042477817317e-06, + "loss": 0.4692, + "step": 37632 + }, + { + "epoch": 0.75268, + "grad_norm": 0.021547725424170494, + "learning_rate": 3.5007430273810027e-06, + "loss": 0.0007, + "step": 37634 + }, + { + "epoch": 0.75272, + "grad_norm": 0.2667396068572998, + "learning_rate": 3.499681933686627e-06, + "loss": 0.0038, + "step": 37636 + }, + { + "epoch": 0.75276, + "grad_norm": 0.11022435873746872, + "learning_rate": 3.498620966719306e-06, + "loss": 0.0013, + "step": 37638 + }, + { + "epoch": 0.7528, + "grad_norm": 0.819058895111084, + "learning_rate": 3.4975601264997094e-06, + "loss": 0.0119, + "step": 37640 + }, + { + "epoch": 0.75284, + "grad_norm": 0.20541009306907654, + "learning_rate": 3.496499413048533e-06, + "loss": 0.002, + "step": 37642 + }, + { + "epoch": 0.75288, + "grad_norm": 1.1173121929168701, + "learning_rate": 3.4954388263864457e-06, + "loss": 0.0118, + "step": 37644 + }, + { + "epoch": 0.75292, + "grad_norm": 0.534986138343811, + "learning_rate": 3.494378366534126e-06, + "loss": 0.0101, + "step": 37646 + }, + { + "epoch": 0.75296, + "grad_norm": 0.043722003698349, + "learning_rate": 3.493318033512254e-06, + "loss": 0.0061, + "step": 37648 + }, + { + "epoch": 0.753, + "grad_norm": 0.07309567183256149, + "learning_rate": 3.492257827341492e-06, + "loss": 0.0009, + "step": 37650 + }, + { + "epoch": 0.75304, + "grad_norm": 0.019366998225450516, + "learning_rate": 3.491197748042515e-06, + "loss": 0.0003, + "step": 37652 + }, + { + "epoch": 0.75308, + "grad_norm": 0.011826679110527039, + "learning_rate": 3.4901377956359894e-06, + "loss": 0.0002, + "step": 37654 + }, + { + "epoch": 0.75312, + "grad_norm": 0.004611657466739416, + "learning_rate": 3.489077970142578e-06, + "loss": 0.0002, + "step": 37656 + }, + { + "epoch": 0.75316, + "grad_norm": 0.018150242045521736, + "learning_rate": 3.4880182715829435e-06, + "loss": 0.0029, + "step": 37658 + }, + { + "epoch": 0.7532, + "grad_norm": 0.2004113346338272, + "learning_rate": 3.4869586999777492e-06, + "loss": 0.0024, + "step": 37660 + }, + { + "epoch": 0.75324, + "grad_norm": 7.453114032745361, + "learning_rate": 3.4858992553476447e-06, + "loss": 0.1158, + "step": 37662 + }, + { + "epoch": 0.75328, + "grad_norm": 0.07080120593309402, + "learning_rate": 3.484839937713288e-06, + "loss": 0.0139, + "step": 37664 + }, + { + "epoch": 0.75332, + "grad_norm": 0.02360295131802559, + "learning_rate": 3.4837807470953346e-06, + "loss": 0.0015, + "step": 37666 + }, + { + "epoch": 0.75336, + "grad_norm": 0.009472462348639965, + "learning_rate": 3.4827216835144227e-06, + "loss": 0.0002, + "step": 37668 + }, + { + "epoch": 0.7534, + "grad_norm": 7.591722011566162, + "learning_rate": 3.4816627469912147e-06, + "loss": 0.0717, + "step": 37670 + }, + { + "epoch": 0.75344, + "grad_norm": 0.20510584115982056, + "learning_rate": 3.480603937546341e-06, + "loss": 0.0096, + "step": 37672 + }, + { + "epoch": 0.75348, + "grad_norm": 0.022112006321549416, + "learning_rate": 3.479545255200455e-06, + "loss": 0.0241, + "step": 37674 + }, + { + "epoch": 0.75352, + "grad_norm": 11.296287536621094, + "learning_rate": 3.4784866999741882e-06, + "loss": 0.1781, + "step": 37676 + }, + { + "epoch": 0.75356, + "grad_norm": 0.02334410324692726, + "learning_rate": 3.4774282718881815e-06, + "loss": 0.0003, + "step": 37678 + }, + { + "epoch": 0.7536, + "grad_norm": 0.036330465227365494, + "learning_rate": 3.476369970963072e-06, + "loss": 0.0008, + "step": 37680 + }, + { + "epoch": 0.75364, + "grad_norm": 0.033654242753982544, + "learning_rate": 3.4753117972194848e-06, + "loss": 0.0007, + "step": 37682 + }, + { + "epoch": 0.75368, + "grad_norm": 0.10544562339782715, + "learning_rate": 3.474253750678053e-06, + "loss": 0.0011, + "step": 37684 + }, + { + "epoch": 0.75372, + "grad_norm": 0.058982979506254196, + "learning_rate": 3.473195831359404e-06, + "loss": 0.0009, + "step": 37686 + }, + { + "epoch": 0.75376, + "grad_norm": 1.1746914386749268, + "learning_rate": 3.4721380392841632e-06, + "loss": 0.0118, + "step": 37688 + }, + { + "epoch": 0.7538, + "grad_norm": 0.02703983709216118, + "learning_rate": 3.4710803744729517e-06, + "loss": 0.0084, + "step": 37690 + }, + { + "epoch": 0.75384, + "grad_norm": 0.03333725035190582, + "learning_rate": 3.470022836946393e-06, + "loss": 0.0005, + "step": 37692 + }, + { + "epoch": 0.75388, + "grad_norm": 0.03565731272101402, + "learning_rate": 3.4689654267250974e-06, + "loss": 0.2839, + "step": 37694 + }, + { + "epoch": 0.75392, + "grad_norm": 0.03274323791265488, + "learning_rate": 3.4679081438296835e-06, + "loss": 0.0006, + "step": 37696 + }, + { + "epoch": 0.75396, + "grad_norm": 0.0271530169993639, + "learning_rate": 3.466850988280762e-06, + "loss": 0.0005, + "step": 37698 + }, + { + "epoch": 0.754, + "grad_norm": 0.13534125685691833, + "learning_rate": 3.4657939600989453e-06, + "loss": 0.0013, + "step": 37700 + }, + { + "epoch": 0.75404, + "grad_norm": 0.2608644962310791, + "learning_rate": 3.4647370593048424e-06, + "loss": 0.0038, + "step": 37702 + }, + { + "epoch": 0.75408, + "grad_norm": 15.736099243164062, + "learning_rate": 3.463680285919049e-06, + "loss": 0.2684, + "step": 37704 + }, + { + "epoch": 0.75412, + "grad_norm": 0.05075777694582939, + "learning_rate": 3.4626236399621794e-06, + "loss": 0.0137, + "step": 37706 + }, + { + "epoch": 0.75416, + "grad_norm": 0.01429790910333395, + "learning_rate": 3.461567121454825e-06, + "loss": 0.0013, + "step": 37708 + }, + { + "epoch": 0.7542, + "grad_norm": 0.04104471579194069, + "learning_rate": 3.4605107304175855e-06, + "loss": 0.001, + "step": 37710 + }, + { + "epoch": 0.75424, + "grad_norm": 0.11226858198642731, + "learning_rate": 3.459454466871056e-06, + "loss": 0.0021, + "step": 37712 + }, + { + "epoch": 0.75428, + "grad_norm": 0.40213149785995483, + "learning_rate": 3.4583983308358325e-06, + "loss": 0.006, + "step": 37714 + }, + { + "epoch": 0.75432, + "grad_norm": 0.5027270317077637, + "learning_rate": 3.4573423223324976e-06, + "loss": 0.0083, + "step": 37716 + }, + { + "epoch": 0.75436, + "grad_norm": 0.022348303347826004, + "learning_rate": 3.4562864413816422e-06, + "loss": 0.0003, + "step": 37718 + }, + { + "epoch": 0.7544, + "grad_norm": 0.04763978347182274, + "learning_rate": 3.455230688003852e-06, + "loss": 0.0014, + "step": 37720 + }, + { + "epoch": 0.75444, + "grad_norm": 0.8898167014122009, + "learning_rate": 3.45417506221971e-06, + "loss": 0.0076, + "step": 37722 + }, + { + "epoch": 0.75448, + "grad_norm": 0.0514877550303936, + "learning_rate": 3.4531195640497973e-06, + "loss": 0.0008, + "step": 37724 + }, + { + "epoch": 0.75452, + "grad_norm": 0.01918642967939377, + "learning_rate": 3.4520641935146826e-06, + "loss": 0.0005, + "step": 37726 + }, + { + "epoch": 0.75456, + "grad_norm": 0.0013052395079284906, + "learning_rate": 3.4510089506349555e-06, + "loss": 0.0092, + "step": 37728 + }, + { + "epoch": 0.7546, + "grad_norm": 0.03265800699591637, + "learning_rate": 3.4499538354311757e-06, + "loss": 0.0017, + "step": 37730 + }, + { + "epoch": 0.75464, + "grad_norm": 0.5528931021690369, + "learning_rate": 3.4488988479239184e-06, + "loss": 0.0138, + "step": 37732 + }, + { + "epoch": 0.75468, + "grad_norm": 0.0265800878405571, + "learning_rate": 3.4478439881337543e-06, + "loss": 0.0007, + "step": 37734 + }, + { + "epoch": 0.75472, + "grad_norm": 0.22989849746227264, + "learning_rate": 3.4467892560812387e-06, + "loss": 0.0066, + "step": 37736 + }, + { + "epoch": 0.75476, + "grad_norm": 1.7048195600509644, + "learning_rate": 3.445734651786946e-06, + "loss": 0.0154, + "step": 37738 + }, + { + "epoch": 0.7548, + "grad_norm": 0.025723639875650406, + "learning_rate": 3.4446801752714287e-06, + "loss": 0.0019, + "step": 37740 + }, + { + "epoch": 0.75484, + "grad_norm": 0.01067932229489088, + "learning_rate": 3.4436258265552447e-06, + "loss": 0.0011, + "step": 37742 + }, + { + "epoch": 0.75488, + "grad_norm": 0.05555712431669235, + "learning_rate": 3.442571605658952e-06, + "loss": 0.0005, + "step": 37744 + }, + { + "epoch": 0.75492, + "grad_norm": 0.32628747820854187, + "learning_rate": 3.441517512603101e-06, + "loss": 0.0027, + "step": 37746 + }, + { + "epoch": 0.75496, + "grad_norm": 0.028635190799832344, + "learning_rate": 3.440463547408247e-06, + "loss": 0.0009, + "step": 37748 + }, + { + "epoch": 0.755, + "grad_norm": 0.0028046369552612305, + "learning_rate": 3.4394097100949286e-06, + "loss": 0.0004, + "step": 37750 + }, + { + "epoch": 0.75504, + "grad_norm": 0.07181168347597122, + "learning_rate": 3.438356000683697e-06, + "loss": 0.088, + "step": 37752 + }, + { + "epoch": 0.75508, + "grad_norm": 0.005849338136613369, + "learning_rate": 3.437302419195092e-06, + "loss": 0.0022, + "step": 37754 + }, + { + "epoch": 0.75512, + "grad_norm": 0.0001956079067895189, + "learning_rate": 3.436248965649659e-06, + "loss": 0.0006, + "step": 37756 + }, + { + "epoch": 0.75516, + "grad_norm": 0.020633477717638016, + "learning_rate": 3.4351956400679244e-06, + "loss": 0.0011, + "step": 37758 + }, + { + "epoch": 0.7552, + "grad_norm": 0.2181508094072342, + "learning_rate": 3.4341424424704373e-06, + "loss": 0.0025, + "step": 37760 + }, + { + "epoch": 0.75524, + "grad_norm": 0.007980355061590672, + "learning_rate": 3.4330893728777216e-06, + "loss": 0.0013, + "step": 37762 + }, + { + "epoch": 0.75528, + "grad_norm": 0.015341482125222683, + "learning_rate": 3.4320364313103084e-06, + "loss": 1.1046, + "step": 37764 + }, + { + "epoch": 0.75532, + "grad_norm": 0.02082805335521698, + "learning_rate": 3.4309836177887258e-06, + "loss": 0.0008, + "step": 37766 + }, + { + "epoch": 0.75536, + "grad_norm": 0.019232559949159622, + "learning_rate": 3.4299309323335005e-06, + "loss": 0.0014, + "step": 37768 + }, + { + "epoch": 0.7554, + "grad_norm": 0.02576390653848648, + "learning_rate": 3.4288783749651568e-06, + "loss": 0.0008, + "step": 37770 + }, + { + "epoch": 0.75544, + "grad_norm": 0.10780924558639526, + "learning_rate": 3.4278259457042097e-06, + "loss": 0.0018, + "step": 37772 + }, + { + "epoch": 0.75548, + "grad_norm": 0.19525595009326935, + "learning_rate": 3.4267736445711785e-06, + "loss": 0.0018, + "step": 37774 + }, + { + "epoch": 0.75552, + "grad_norm": 0.06062036007642746, + "learning_rate": 3.4257214715865806e-06, + "loss": 0.0028, + "step": 37776 + }, + { + "epoch": 0.75556, + "grad_norm": 0.019506610929965973, + "learning_rate": 3.4246694267709256e-06, + "loss": 0.0018, + "step": 37778 + }, + { + "epoch": 0.7556, + "grad_norm": 0.03478178754448891, + "learning_rate": 3.4236175101447265e-06, + "loss": 0.0014, + "step": 37780 + }, + { + "epoch": 0.75564, + "grad_norm": 0.11721889674663544, + "learning_rate": 3.4225657217284925e-06, + "loss": 0.0016, + "step": 37782 + }, + { + "epoch": 0.75568, + "grad_norm": 0.3100990951061249, + "learning_rate": 3.4215140615427235e-06, + "loss": 0.0041, + "step": 37784 + }, + { + "epoch": 0.75572, + "grad_norm": 0.05872030556201935, + "learning_rate": 3.420462529607923e-06, + "loss": 0.0011, + "step": 37786 + }, + { + "epoch": 0.75576, + "grad_norm": 0.15060240030288696, + "learning_rate": 3.4194111259445983e-06, + "loss": 0.0228, + "step": 37788 + }, + { + "epoch": 0.7558, + "grad_norm": 0.01963307149708271, + "learning_rate": 3.418359850573234e-06, + "loss": 0.0378, + "step": 37790 + }, + { + "epoch": 0.75584, + "grad_norm": 1.1676629781723022, + "learning_rate": 3.4173087035143394e-06, + "loss": 0.0117, + "step": 37792 + }, + { + "epoch": 0.75588, + "grad_norm": 7.65149450302124, + "learning_rate": 3.4162576847883933e-06, + "loss": 0.1643, + "step": 37794 + }, + { + "epoch": 0.75592, + "grad_norm": 0.01996101811528206, + "learning_rate": 3.4152067944159006e-06, + "loss": 0.0003, + "step": 37796 + }, + { + "epoch": 0.75596, + "grad_norm": 0.01856582798063755, + "learning_rate": 3.4141560324173382e-06, + "loss": 0.0214, + "step": 37798 + }, + { + "epoch": 0.756, + "grad_norm": 0.1228942722082138, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.0029, + "step": 37800 + }, + { + "epoch": 0.75604, + "grad_norm": 0.19425423443317413, + "learning_rate": 3.4120548936239563e-06, + "loss": 0.004, + "step": 37802 + }, + { + "epoch": 0.75608, + "grad_norm": 0.029815413057804108, + "learning_rate": 3.4110045168700954e-06, + "loss": 0.0074, + "step": 37804 + }, + { + "epoch": 0.75612, + "grad_norm": 2.8042142391204834, + "learning_rate": 3.409954268572094e-06, + "loss": 0.0337, + "step": 37806 + }, + { + "epoch": 0.75616, + "grad_norm": 0.8528339266777039, + "learning_rate": 3.408904148750427e-06, + "loss": 0.0092, + "step": 37808 + }, + { + "epoch": 0.7562, + "grad_norm": 0.023027701303362846, + "learning_rate": 3.4078541574255664e-06, + "loss": 0.6292, + "step": 37810 + }, + { + "epoch": 0.75624, + "grad_norm": 0.10593923926353455, + "learning_rate": 3.4068042946179836e-06, + "loss": 0.0053, + "step": 37812 + }, + { + "epoch": 0.75628, + "grad_norm": 0.14386524260044098, + "learning_rate": 3.405754560348148e-06, + "loss": 0.0015, + "step": 37814 + }, + { + "epoch": 0.75632, + "grad_norm": 0.008973979391157627, + "learning_rate": 3.4047049546365196e-06, + "loss": 0.0582, + "step": 37816 + }, + { + "epoch": 0.75636, + "grad_norm": 1.2304470539093018, + "learning_rate": 3.4036554775035635e-06, + "loss": 0.0116, + "step": 37818 + }, + { + "epoch": 0.7564, + "grad_norm": 0.22234004735946655, + "learning_rate": 3.4026061289697397e-06, + "loss": 0.0024, + "step": 37820 + }, + { + "epoch": 0.75644, + "grad_norm": 0.059225499629974365, + "learning_rate": 3.4015569090555057e-06, + "loss": 0.0015, + "step": 37822 + }, + { + "epoch": 0.75648, + "grad_norm": 0.08063013851642609, + "learning_rate": 3.400507817781321e-06, + "loss": 0.0012, + "step": 37824 + }, + { + "epoch": 0.75652, + "grad_norm": 0.41283807158470154, + "learning_rate": 3.3994588551676267e-06, + "loss": 0.0041, + "step": 37826 + }, + { + "epoch": 0.75656, + "grad_norm": 0.03233475983142853, + "learning_rate": 3.398410021234887e-06, + "loss": 0.0006, + "step": 37828 + }, + { + "epoch": 0.7566, + "grad_norm": 0.09040983021259308, + "learning_rate": 3.397361316003539e-06, + "loss": 0.0016, + "step": 37830 + }, + { + "epoch": 0.75664, + "grad_norm": 0.023189887404441833, + "learning_rate": 3.396312739494032e-06, + "loss": 0.0033, + "step": 37832 + }, + { + "epoch": 0.75668, + "grad_norm": 6.305114269256592, + "learning_rate": 3.3952642917268073e-06, + "loss": 0.1058, + "step": 37834 + }, + { + "epoch": 0.75672, + "grad_norm": 0.009225829504430294, + "learning_rate": 3.394215972722307e-06, + "loss": 0.4493, + "step": 37836 + }, + { + "epoch": 0.75676, + "grad_norm": 0.017582150176167488, + "learning_rate": 3.393167782500969e-06, + "loss": 0.0023, + "step": 37838 + }, + { + "epoch": 0.7568, + "grad_norm": 0.04843564331531525, + "learning_rate": 3.3921197210832235e-06, + "loss": 0.065, + "step": 37840 + }, + { + "epoch": 0.75684, + "grad_norm": 0.6916446685791016, + "learning_rate": 3.391071788489506e-06, + "loss": 0.0095, + "step": 37842 + }, + { + "epoch": 0.75688, + "grad_norm": 0.19248461723327637, + "learning_rate": 3.390023984740247e-06, + "loss": 0.0042, + "step": 37844 + }, + { + "epoch": 0.75692, + "grad_norm": 0.20719636976718903, + "learning_rate": 3.388976309855877e-06, + "loss": 0.0031, + "step": 37846 + }, + { + "epoch": 0.75696, + "grad_norm": 0.7151374220848083, + "learning_rate": 3.3879287638568103e-06, + "loss": 0.0067, + "step": 37848 + }, + { + "epoch": 0.757, + "grad_norm": 0.5736889243125916, + "learning_rate": 3.3868813467634833e-06, + "loss": 0.0046, + "step": 37850 + }, + { + "epoch": 0.75704, + "grad_norm": 0.2566472589969635, + "learning_rate": 3.3858340585963056e-06, + "loss": 0.0112, + "step": 37852 + }, + { + "epoch": 0.75708, + "grad_norm": 0.08630825579166412, + "learning_rate": 3.384786899375697e-06, + "loss": 0.1272, + "step": 37854 + }, + { + "epoch": 0.75712, + "grad_norm": 0.23339253664016724, + "learning_rate": 3.383739869122079e-06, + "loss": 0.0027, + "step": 37856 + }, + { + "epoch": 0.75716, + "grad_norm": 0.024858085438609123, + "learning_rate": 3.382692967855851e-06, + "loss": 0.0004, + "step": 37858 + }, + { + "epoch": 0.7572, + "grad_norm": 0.10614167153835297, + "learning_rate": 3.381646195597437e-06, + "loss": 0.003, + "step": 37860 + }, + { + "epoch": 0.75724, + "grad_norm": 0.12934891879558563, + "learning_rate": 3.380599552367234e-06, + "loss": 0.0017, + "step": 37862 + }, + { + "epoch": 0.75728, + "grad_norm": 0.004434429574757814, + "learning_rate": 3.3795530381856513e-06, + "loss": 0.0132, + "step": 37864 + }, + { + "epoch": 0.75732, + "grad_norm": 1.1210802793502808, + "learning_rate": 3.378506653073089e-06, + "loss": 0.0153, + "step": 37866 + }, + { + "epoch": 0.75736, + "grad_norm": 0.29757413268089294, + "learning_rate": 3.377460397049951e-06, + "loss": 0.0085, + "step": 37868 + }, + { + "epoch": 0.7574, + "grad_norm": 0.27420714497566223, + "learning_rate": 3.376414270136633e-06, + "loss": 0.0033, + "step": 37870 + }, + { + "epoch": 0.75744, + "grad_norm": 0.12479912489652634, + "learning_rate": 3.3753682723535274e-06, + "loss": 0.0026, + "step": 37872 + }, + { + "epoch": 0.75748, + "grad_norm": 0.3299117684364319, + "learning_rate": 3.3743224037210263e-06, + "loss": 0.0041, + "step": 37874 + }, + { + "epoch": 0.75752, + "grad_norm": 0.13038913905620575, + "learning_rate": 3.373276664259523e-06, + "loss": 0.0658, + "step": 37876 + }, + { + "epoch": 0.75756, + "grad_norm": 0.0072821457870304585, + "learning_rate": 3.3722310539894043e-06, + "loss": 0.0149, + "step": 37878 + }, + { + "epoch": 0.7576, + "grad_norm": 0.16172359883785248, + "learning_rate": 3.3711855729310482e-06, + "loss": 0.0023, + "step": 37880 + }, + { + "epoch": 0.75764, + "grad_norm": 0.12890025973320007, + "learning_rate": 3.3701402211048473e-06, + "loss": 0.0017, + "step": 37882 + }, + { + "epoch": 0.75768, + "grad_norm": 0.058602817356586456, + "learning_rate": 3.369094998531174e-06, + "loss": 0.0069, + "step": 37884 + }, + { + "epoch": 0.75772, + "grad_norm": 0.007134957239031792, + "learning_rate": 3.3680499052304072e-06, + "loss": 0.0001, + "step": 37886 + }, + { + "epoch": 0.75776, + "grad_norm": 0.1337442398071289, + "learning_rate": 3.367004941222921e-06, + "loss": 0.0021, + "step": 37888 + }, + { + "epoch": 0.7578, + "grad_norm": 0.7287259697914124, + "learning_rate": 3.3659601065290893e-06, + "loss": 0.032, + "step": 37890 + }, + { + "epoch": 0.75784, + "grad_norm": 0.018411332741379738, + "learning_rate": 3.3649154011692842e-06, + "loss": 0.0028, + "step": 37892 + }, + { + "epoch": 0.75788, + "grad_norm": 0.5869236588478088, + "learning_rate": 3.3638708251638653e-06, + "loss": 0.0062, + "step": 37894 + }, + { + "epoch": 0.75792, + "grad_norm": 0.015927491709589958, + "learning_rate": 3.3628263785332017e-06, + "loss": 0.0002, + "step": 37896 + }, + { + "epoch": 0.75796, + "grad_norm": 6.067494869232178, + "learning_rate": 3.3617820612976548e-06, + "loss": 0.072, + "step": 37898 + }, + { + "epoch": 0.758, + "grad_norm": 0.024173516780138016, + "learning_rate": 3.360737873477584e-06, + "loss": 0.0008, + "step": 37900 + }, + { + "epoch": 0.75804, + "grad_norm": 0.2619019150733948, + "learning_rate": 3.3596938150933467e-06, + "loss": 0.0058, + "step": 37902 + }, + { + "epoch": 0.75808, + "grad_norm": 0.15568631887435913, + "learning_rate": 3.3586498861653004e-06, + "loss": 0.0146, + "step": 37904 + }, + { + "epoch": 0.75812, + "grad_norm": 0.16190020740032196, + "learning_rate": 3.3576060867137905e-06, + "loss": 0.0027, + "step": 37906 + }, + { + "epoch": 0.75816, + "grad_norm": 13.240020751953125, + "learning_rate": 3.356562416759169e-06, + "loss": 0.1542, + "step": 37908 + }, + { + "epoch": 0.7582, + "grad_norm": 0.010815966874361038, + "learning_rate": 3.355518876321787e-06, + "loss": 0.0016, + "step": 37910 + }, + { + "epoch": 0.75824, + "grad_norm": 0.022545399144291878, + "learning_rate": 3.354475465421979e-06, + "loss": 0.0066, + "step": 37912 + }, + { + "epoch": 0.75828, + "grad_norm": 0.08183301240205765, + "learning_rate": 3.3534321840801e-06, + "loss": 0.0101, + "step": 37914 + }, + { + "epoch": 0.75832, + "grad_norm": 1.3828105926513672, + "learning_rate": 3.3523890323164764e-06, + "loss": 0.0136, + "step": 37916 + }, + { + "epoch": 0.75836, + "grad_norm": 0.027178987860679626, + "learning_rate": 3.351346010151456e-06, + "loss": 0.0009, + "step": 37918 + }, + { + "epoch": 0.7584, + "grad_norm": 4.979388236999512, + "learning_rate": 3.3503031176053657e-06, + "loss": 0.0748, + "step": 37920 + }, + { + "epoch": 0.75844, + "grad_norm": 0.01757543906569481, + "learning_rate": 3.3492603546985403e-06, + "loss": 0.0025, + "step": 37922 + }, + { + "epoch": 0.75848, + "grad_norm": 0.052860844880342484, + "learning_rate": 3.3482177214513112e-06, + "loss": 0.0007, + "step": 37924 + }, + { + "epoch": 0.75852, + "grad_norm": 0.011921670287847519, + "learning_rate": 3.3471752178839946e-06, + "loss": 0.0002, + "step": 37926 + }, + { + "epoch": 0.75856, + "grad_norm": 0.020400943234562874, + "learning_rate": 3.3461328440169294e-06, + "loss": 0.0073, + "step": 37928 + }, + { + "epoch": 0.7586, + "grad_norm": 0.13615351915359497, + "learning_rate": 3.3450905998704274e-06, + "loss": 0.0069, + "step": 37930 + }, + { + "epoch": 0.75864, + "grad_norm": 0.25266149640083313, + "learning_rate": 3.344048485464809e-06, + "loss": 0.0054, + "step": 37932 + }, + { + "epoch": 0.75868, + "grad_norm": 0.5642973184585571, + "learning_rate": 3.3430065008203936e-06, + "loss": 0.0065, + "step": 37934 + }, + { + "epoch": 0.75872, + "grad_norm": 0.2827344238758087, + "learning_rate": 3.341964645957496e-06, + "loss": 0.0111, + "step": 37936 + }, + { + "epoch": 0.75876, + "grad_norm": 3.424948215484619, + "learning_rate": 3.3409229208964224e-06, + "loss": 0.0355, + "step": 37938 + }, + { + "epoch": 0.7588, + "grad_norm": 0.02248912677168846, + "learning_rate": 3.3398813256574847e-06, + "loss": 0.0052, + "step": 37940 + }, + { + "epoch": 0.75884, + "grad_norm": 1.027138590812683, + "learning_rate": 3.3388398602609894e-06, + "loss": 0.5773, + "step": 37942 + }, + { + "epoch": 0.75888, + "grad_norm": 0.4078832268714905, + "learning_rate": 3.3377985247272402e-06, + "loss": 0.0035, + "step": 37944 + }, + { + "epoch": 0.75892, + "grad_norm": 0.757798433303833, + "learning_rate": 3.3367573190765432e-06, + "loss": 0.0176, + "step": 37946 + }, + { + "epoch": 0.75896, + "grad_norm": 0.6944248080253601, + "learning_rate": 3.3357162433291856e-06, + "loss": 0.0084, + "step": 37948 + }, + { + "epoch": 0.759, + "grad_norm": 1.815329670906067, + "learning_rate": 3.3346752975054763e-06, + "loss": 0.019, + "step": 37950 + }, + { + "epoch": 0.75904, + "grad_norm": 0.05238744243979454, + "learning_rate": 3.3336344816257017e-06, + "loss": 0.0008, + "step": 37952 + }, + { + "epoch": 0.75908, + "grad_norm": 2.718933343887329, + "learning_rate": 3.332593795710154e-06, + "loss": 0.0284, + "step": 37954 + }, + { + "epoch": 0.75912, + "grad_norm": 0.09604766964912415, + "learning_rate": 3.331553239779124e-06, + "loss": 0.0031, + "step": 37956 + }, + { + "epoch": 0.75916, + "grad_norm": 0.6662356853485107, + "learning_rate": 3.330512813852895e-06, + "loss": 0.0074, + "step": 37958 + }, + { + "epoch": 0.7592, + "grad_norm": 0.37962260842323303, + "learning_rate": 3.3294725179517573e-06, + "loss": 0.0042, + "step": 37960 + }, + { + "epoch": 0.75924, + "grad_norm": 0.026549633592367172, + "learning_rate": 3.3284323520959826e-06, + "loss": 0.0007, + "step": 37962 + }, + { + "epoch": 0.75928, + "grad_norm": 0.027462337166070938, + "learning_rate": 3.3273923163058553e-06, + "loss": 0.0004, + "step": 37964 + }, + { + "epoch": 0.75932, + "grad_norm": 0.04164035990834236, + "learning_rate": 3.3263524106016488e-06, + "loss": 0.0019, + "step": 37966 + }, + { + "epoch": 0.75936, + "grad_norm": 0.4877092242240906, + "learning_rate": 3.3253126350036413e-06, + "loss": 0.0046, + "step": 37968 + }, + { + "epoch": 0.7594, + "grad_norm": 0.038816843181848526, + "learning_rate": 3.3242729895320945e-06, + "loss": 0.0011, + "step": 37970 + }, + { + "epoch": 0.75944, + "grad_norm": 0.027156328782439232, + "learning_rate": 3.323233474207289e-06, + "loss": 0.0005, + "step": 37972 + }, + { + "epoch": 0.75948, + "grad_norm": 0.11187222599983215, + "learning_rate": 3.322194089049481e-06, + "loss": 0.0015, + "step": 37974 + }, + { + "epoch": 0.75952, + "grad_norm": 0.16906657814979553, + "learning_rate": 3.3211548340789366e-06, + "loss": 0.3934, + "step": 37976 + }, + { + "epoch": 0.75956, + "grad_norm": 0.20199136435985565, + "learning_rate": 3.3201157093159207e-06, + "loss": 0.0019, + "step": 37978 + }, + { + "epoch": 0.7596, + "grad_norm": 0.12275883555412292, + "learning_rate": 3.3190767147806825e-06, + "loss": 0.004, + "step": 37980 + }, + { + "epoch": 0.75964, + "grad_norm": 0.39049479365348816, + "learning_rate": 3.3180378504934896e-06, + "loss": 0.0045, + "step": 37982 + }, + { + "epoch": 0.75968, + "grad_norm": 0.003426542039960623, + "learning_rate": 3.3169991164745853e-06, + "loss": 0.0002, + "step": 37984 + }, + { + "epoch": 0.75972, + "grad_norm": 0.008939338847994804, + "learning_rate": 3.315960512744223e-06, + "loss": 0.0005, + "step": 37986 + }, + { + "epoch": 0.75976, + "grad_norm": 0.14211013913154602, + "learning_rate": 3.314922039322652e-06, + "loss": 0.0015, + "step": 37988 + }, + { + "epoch": 0.7598, + "grad_norm": 0.8052306175231934, + "learning_rate": 3.3138836962301192e-06, + "loss": 0.0092, + "step": 37990 + }, + { + "epoch": 0.75984, + "grad_norm": 0.018211303278803825, + "learning_rate": 3.312845483486867e-06, + "loss": 0.0002, + "step": 37992 + }, + { + "epoch": 0.75988, + "grad_norm": 0.02815418131649494, + "learning_rate": 3.311807401113133e-06, + "loss": 0.0004, + "step": 37994 + }, + { + "epoch": 0.75992, + "grad_norm": 0.06916926801204681, + "learning_rate": 3.3107694491291574e-06, + "loss": 0.0009, + "step": 37996 + }, + { + "epoch": 0.75996, + "grad_norm": 0.12923665344715118, + "learning_rate": 3.3097316275551737e-06, + "loss": 0.0036, + "step": 37998 + }, + { + "epoch": 0.76, + "grad_norm": 0.036393798887729645, + "learning_rate": 3.308693936411421e-06, + "loss": 0.0014, + "step": 38000 + }, + { + "epoch": 0.76004, + "grad_norm": 0.22926615178585052, + "learning_rate": 3.307656375718118e-06, + "loss": 0.0033, + "step": 38002 + }, + { + "epoch": 0.76008, + "grad_norm": 0.46222516894340515, + "learning_rate": 3.3066189454955056e-06, + "loss": 0.2234, + "step": 38004 + }, + { + "epoch": 0.76012, + "grad_norm": 0.04463130980730057, + "learning_rate": 3.3055816457638e-06, + "loss": 0.0005, + "step": 38006 + }, + { + "epoch": 0.76016, + "grad_norm": 0.3130010664463043, + "learning_rate": 3.3045444765432266e-06, + "loss": 0.0057, + "step": 38008 + }, + { + "epoch": 0.7602, + "grad_norm": 0.0025086437817662954, + "learning_rate": 3.3035074378540087e-06, + "loss": 0.0064, + "step": 38010 + }, + { + "epoch": 0.76024, + "grad_norm": 0.07789559662342072, + "learning_rate": 3.302470529716354e-06, + "loss": 0.3194, + "step": 38012 + }, + { + "epoch": 0.76028, + "grad_norm": 1.3343604803085327, + "learning_rate": 3.3014337521504914e-06, + "loss": 0.0249, + "step": 38014 + }, + { + "epoch": 0.76032, + "grad_norm": 0.14574602246284485, + "learning_rate": 3.300397105176624e-06, + "loss": 0.2679, + "step": 38016 + }, + { + "epoch": 0.76036, + "grad_norm": 0.03683604300022125, + "learning_rate": 3.299360588814963e-06, + "loss": 0.0005, + "step": 38018 + }, + { + "epoch": 0.7604, + "grad_norm": 4.16314697265625, + "learning_rate": 3.2983242030857177e-06, + "loss": 0.0479, + "step": 38020 + }, + { + "epoch": 0.76044, + "grad_norm": 0.06020946800708771, + "learning_rate": 3.2972879480090926e-06, + "loss": 0.0006, + "step": 38022 + }, + { + "epoch": 0.76048, + "grad_norm": 0.030932592228055, + "learning_rate": 3.296251823605289e-06, + "loss": 0.0095, + "step": 38024 + }, + { + "epoch": 0.76052, + "grad_norm": 0.1477384716272354, + "learning_rate": 3.2952158298945104e-06, + "loss": 0.0063, + "step": 38026 + }, + { + "epoch": 0.76056, + "grad_norm": 0.0640091821551323, + "learning_rate": 3.2941799668969487e-06, + "loss": 0.0038, + "step": 38028 + }, + { + "epoch": 0.7606, + "grad_norm": 0.04328611493110657, + "learning_rate": 3.2931442346328e-06, + "loss": 0.0008, + "step": 38030 + }, + { + "epoch": 0.76064, + "grad_norm": 0.17659327387809753, + "learning_rate": 3.2921086331222583e-06, + "loss": 0.0026, + "step": 38032 + }, + { + "epoch": 0.76068, + "grad_norm": 0.016959037631750107, + "learning_rate": 3.2910731623855118e-06, + "loss": 0.0021, + "step": 38034 + }, + { + "epoch": 0.76072, + "grad_norm": 0.34261026978492737, + "learning_rate": 3.290037822442752e-06, + "loss": 0.0039, + "step": 38036 + }, + { + "epoch": 0.76076, + "grad_norm": 1.0297460556030273, + "learning_rate": 3.2890026133141516e-06, + "loss": 0.0161, + "step": 38038 + }, + { + "epoch": 0.7608, + "grad_norm": 0.010961159132421017, + "learning_rate": 3.287967535019908e-06, + "loss": 0.0001, + "step": 38040 + }, + { + "epoch": 0.76084, + "grad_norm": 0.8283347487449646, + "learning_rate": 3.2869325875801883e-06, + "loss": 0.022, + "step": 38042 + }, + { + "epoch": 0.76088, + "grad_norm": 0.6271109580993652, + "learning_rate": 3.2858977710151753e-06, + "loss": 0.0078, + "step": 38044 + }, + { + "epoch": 0.76092, + "grad_norm": 0.2739318907260895, + "learning_rate": 3.2848630853450447e-06, + "loss": 0.0032, + "step": 38046 + }, + { + "epoch": 0.76096, + "grad_norm": 0.3373001515865326, + "learning_rate": 3.2838285305899586e-06, + "loss": 0.0026, + "step": 38048 + }, + { + "epoch": 0.761, + "grad_norm": 0.09628437459468842, + "learning_rate": 3.2827941067700996e-06, + "loss": 0.0726, + "step": 38050 + }, + { + "epoch": 0.76104, + "grad_norm": 0.001925322925671935, + "learning_rate": 3.2817598139056238e-06, + "loss": 0.0024, + "step": 38052 + }, + { + "epoch": 0.76108, + "grad_norm": 3.226872444152832, + "learning_rate": 3.2807256520167006e-06, + "loss": 0.034, + "step": 38054 + }, + { + "epoch": 0.76112, + "grad_norm": 6.076913356781006, + "learning_rate": 3.27969162112349e-06, + "loss": 0.0762, + "step": 38056 + }, + { + "epoch": 0.76116, + "grad_norm": 0.39425379037857056, + "learning_rate": 3.2786577212461536e-06, + "loss": 0.0045, + "step": 38058 + }, + { + "epoch": 0.7612, + "grad_norm": 0.02576332725584507, + "learning_rate": 3.2776239524048426e-06, + "loss": 0.0004, + "step": 38060 + }, + { + "epoch": 0.76124, + "grad_norm": 0.008264167234301567, + "learning_rate": 3.276590314619713e-06, + "loss": 0.0082, + "step": 38062 + }, + { + "epoch": 0.76128, + "grad_norm": 0.20710551738739014, + "learning_rate": 3.2755568079109167e-06, + "loss": 0.0026, + "step": 38064 + }, + { + "epoch": 0.76132, + "grad_norm": 2.169475555419922, + "learning_rate": 3.274523432298603e-06, + "loss": 0.0213, + "step": 38066 + }, + { + "epoch": 0.76136, + "grad_norm": 0.11752032488584518, + "learning_rate": 3.27349018780292e-06, + "loss": 0.0013, + "step": 38068 + }, + { + "epoch": 0.7614, + "grad_norm": 0.020917793735861778, + "learning_rate": 3.272457074444003e-06, + "loss": 0.0007, + "step": 38070 + }, + { + "epoch": 0.76144, + "grad_norm": 0.36339762806892395, + "learning_rate": 3.271424092242005e-06, + "loss": 0.1302, + "step": 38072 + }, + { + "epoch": 0.76148, + "grad_norm": 0.634331464767456, + "learning_rate": 3.2703912412170547e-06, + "loss": 0.007, + "step": 38074 + }, + { + "epoch": 0.76152, + "grad_norm": 12.356786727905273, + "learning_rate": 3.269358521389293e-06, + "loss": 0.0973, + "step": 38076 + }, + { + "epoch": 0.76156, + "grad_norm": 0.2443520724773407, + "learning_rate": 3.268325932778854e-06, + "loss": 0.0028, + "step": 38078 + }, + { + "epoch": 0.7616, + "grad_norm": 1.7280997037887573, + "learning_rate": 3.2672934754058615e-06, + "loss": 0.015, + "step": 38080 + }, + { + "epoch": 0.76164, + "grad_norm": 0.2294825315475464, + "learning_rate": 3.266261149290455e-06, + "loss": 0.0026, + "step": 38082 + }, + { + "epoch": 0.76168, + "grad_norm": 0.009764998219907284, + "learning_rate": 3.2652289544527504e-06, + "loss": 0.0002, + "step": 38084 + }, + { + "epoch": 0.76172, + "grad_norm": 0.27901870012283325, + "learning_rate": 3.2641968909128753e-06, + "loss": 0.0031, + "step": 38086 + }, + { + "epoch": 0.76176, + "grad_norm": 0.013169911690056324, + "learning_rate": 3.2631649586909486e-06, + "loss": 0.0005, + "step": 38088 + }, + { + "epoch": 0.7618, + "grad_norm": 0.0058658914640545845, + "learning_rate": 3.2621331578070936e-06, + "loss": 0.1055, + "step": 38090 + }, + { + "epoch": 0.76184, + "grad_norm": 0.06920450180768967, + "learning_rate": 3.261101488281415e-06, + "loss": 0.0044, + "step": 38092 + }, + { + "epoch": 0.76188, + "grad_norm": 0.000351660477463156, + "learning_rate": 3.2600699501340383e-06, + "loss": 0.2189, + "step": 38094 + }, + { + "epoch": 0.76192, + "grad_norm": 0.3661144971847534, + "learning_rate": 3.2590385433850648e-06, + "loss": 0.004, + "step": 38096 + }, + { + "epoch": 0.76196, + "grad_norm": 0.006966846529394388, + "learning_rate": 3.258007268054606e-06, + "loss": 0.0001, + "step": 38098 + }, + { + "epoch": 0.762, + "grad_norm": 0.014927344396710396, + "learning_rate": 3.2569761241627694e-06, + "loss": 0.0003, + "step": 38100 + }, + { + "epoch": 0.76204, + "grad_norm": 0.5066494345664978, + "learning_rate": 3.255945111729648e-06, + "loss": 0.0061, + "step": 38102 + }, + { + "epoch": 0.76208, + "grad_norm": 0.03050728514790535, + "learning_rate": 3.2549142307753554e-06, + "loss": 0.0151, + "step": 38104 + }, + { + "epoch": 0.76212, + "grad_norm": 4.58661413192749, + "learning_rate": 3.2538834813199794e-06, + "loss": 0.7374, + "step": 38106 + }, + { + "epoch": 0.76216, + "grad_norm": 0.010528475977480412, + "learning_rate": 3.2528528633836174e-06, + "loss": 0.0007, + "step": 38108 + }, + { + "epoch": 0.7622, + "grad_norm": 0.0458671897649765, + "learning_rate": 3.2518223769863633e-06, + "loss": 0.0008, + "step": 38110 + }, + { + "epoch": 0.76224, + "grad_norm": 0.04198329895734787, + "learning_rate": 3.2507920221483057e-06, + "loss": 0.0122, + "step": 38112 + }, + { + "epoch": 0.76228, + "grad_norm": 0.03158620744943619, + "learning_rate": 3.2497617988895357e-06, + "loss": 0.0005, + "step": 38114 + }, + { + "epoch": 0.76232, + "grad_norm": 0.056460391730070114, + "learning_rate": 3.2487317072301315e-06, + "loss": 0.0006, + "step": 38116 + }, + { + "epoch": 0.76236, + "grad_norm": 0.37599000334739685, + "learning_rate": 3.2477017471901783e-06, + "loss": 0.0045, + "step": 38118 + }, + { + "epoch": 0.7624, + "grad_norm": 0.0017696607392281294, + "learning_rate": 3.2466719187897555e-06, + "loss": 0.0027, + "step": 38120 + }, + { + "epoch": 0.76244, + "grad_norm": 0.11873491108417511, + "learning_rate": 3.245642222048945e-06, + "loss": 0.0013, + "step": 38122 + }, + { + "epoch": 0.76248, + "grad_norm": 0.055883169174194336, + "learning_rate": 3.244612656987809e-06, + "loss": 0.0015, + "step": 38124 + }, + { + "epoch": 0.76252, + "grad_norm": 0.00963860284537077, + "learning_rate": 3.243583223626434e-06, + "loss": 0.0016, + "step": 38126 + }, + { + "epoch": 0.76256, + "grad_norm": 0.23148077726364136, + "learning_rate": 3.24255392198488e-06, + "loss": 0.002, + "step": 38128 + }, + { + "epoch": 0.7626, + "grad_norm": 0.20209915935993195, + "learning_rate": 3.241524752083215e-06, + "loss": 0.0034, + "step": 38130 + }, + { + "epoch": 0.76264, + "grad_norm": 0.00859333761036396, + "learning_rate": 3.240495713941508e-06, + "loss": 0.0008, + "step": 38132 + }, + { + "epoch": 0.76268, + "grad_norm": 0.061015717685222626, + "learning_rate": 3.239466807579812e-06, + "loss": 0.0013, + "step": 38134 + }, + { + "epoch": 0.76272, + "grad_norm": 5.599708557128906, + "learning_rate": 3.2384380330181964e-06, + "loss": 0.0531, + "step": 38136 + }, + { + "epoch": 0.76276, + "grad_norm": 0.1349974274635315, + "learning_rate": 3.237409390276706e-06, + "loss": 0.0033, + "step": 38138 + }, + { + "epoch": 0.7628, + "grad_norm": 0.0020822174847126007, + "learning_rate": 3.2363808793754082e-06, + "loss": 0.0027, + "step": 38140 + }, + { + "epoch": 0.76284, + "grad_norm": 0.2232138067483902, + "learning_rate": 3.2353525003343434e-06, + "loss": 0.0048, + "step": 38142 + }, + { + "epoch": 0.76288, + "grad_norm": 0.01039174199104309, + "learning_rate": 3.234324253173564e-06, + "loss": 0.0002, + "step": 38144 + }, + { + "epoch": 0.76292, + "grad_norm": 0.004977206699550152, + "learning_rate": 3.23329613791312e-06, + "loss": 0.0073, + "step": 38146 + }, + { + "epoch": 0.76296, + "grad_norm": 0.08874630182981491, + "learning_rate": 3.2322681545730494e-06, + "loss": 0.0021, + "step": 38148 + }, + { + "epoch": 0.763, + "grad_norm": 0.21431881189346313, + "learning_rate": 3.2312403031733943e-06, + "loss": 0.0091, + "step": 38150 + }, + { + "epoch": 0.76304, + "grad_norm": 0.042682912200689316, + "learning_rate": 3.2302125837341946e-06, + "loss": 0.0009, + "step": 38152 + }, + { + "epoch": 0.76308, + "grad_norm": 0.005111386999487877, + "learning_rate": 3.229184996275485e-06, + "loss": 0.0017, + "step": 38154 + }, + { + "epoch": 0.76312, + "grad_norm": 0.3948865234851837, + "learning_rate": 3.2281575408173004e-06, + "loss": 0.0044, + "step": 38156 + }, + { + "epoch": 0.76316, + "grad_norm": 1.1042969226837158, + "learning_rate": 3.2271302173796747e-06, + "loss": 0.0125, + "step": 38158 + }, + { + "epoch": 0.7632, + "grad_norm": 0.0027815077919512987, + "learning_rate": 3.2261030259826287e-06, + "loss": 0.0001, + "step": 38160 + }, + { + "epoch": 0.76324, + "grad_norm": 5.931808948516846, + "learning_rate": 3.225075966646192e-06, + "loss": 0.0544, + "step": 38162 + }, + { + "epoch": 0.76328, + "grad_norm": 0.20901651680469513, + "learning_rate": 3.2240490393903866e-06, + "loss": 0.0045, + "step": 38164 + }, + { + "epoch": 0.76332, + "grad_norm": 0.014986636117100716, + "learning_rate": 3.2230222442352343e-06, + "loss": 0.0004, + "step": 38166 + }, + { + "epoch": 0.76336, + "grad_norm": 0.015686657279729843, + "learning_rate": 3.2219955812007555e-06, + "loss": 0.0005, + "step": 38168 + }, + { + "epoch": 0.7634, + "grad_norm": 0.1048240214586258, + "learning_rate": 3.2209690503069545e-06, + "loss": 0.0011, + "step": 38170 + }, + { + "epoch": 0.76344, + "grad_norm": 0.06147996708750725, + "learning_rate": 3.2199426515738607e-06, + "loss": 0.0027, + "step": 38172 + }, + { + "epoch": 0.76348, + "grad_norm": 0.01812787540256977, + "learning_rate": 3.2189163850214713e-06, + "loss": 0.0018, + "step": 38174 + }, + { + "epoch": 0.76352, + "grad_norm": 0.013807956129312515, + "learning_rate": 3.2178902506697974e-06, + "loss": 0.047, + "step": 38176 + }, + { + "epoch": 0.76356, + "grad_norm": 0.046989694237709045, + "learning_rate": 3.216864248538846e-06, + "loss": 0.0424, + "step": 38178 + }, + { + "epoch": 0.7636, + "grad_norm": 0.06516039371490479, + "learning_rate": 3.2158383786486204e-06, + "loss": 0.0009, + "step": 38180 + }, + { + "epoch": 0.76364, + "grad_norm": 0.012541557662189007, + "learning_rate": 3.214812641019115e-06, + "loss": 0.0008, + "step": 38182 + }, + { + "epoch": 0.76368, + "grad_norm": 2.966162919998169, + "learning_rate": 3.2137870356703314e-06, + "loss": 0.0377, + "step": 38184 + }, + { + "epoch": 0.76372, + "grad_norm": 0.010710958391427994, + "learning_rate": 3.212761562622263e-06, + "loss": 0.1157, + "step": 38186 + }, + { + "epoch": 0.76376, + "grad_norm": 0.2538725733757019, + "learning_rate": 3.2117362218949023e-06, + "loss": 0.0032, + "step": 38188 + }, + { + "epoch": 0.7638, + "grad_norm": 0.13734710216522217, + "learning_rate": 3.210711013508242e-06, + "loss": 0.0016, + "step": 38190 + }, + { + "epoch": 0.76384, + "grad_norm": 0.17254279553890228, + "learning_rate": 3.2096859374822587e-06, + "loss": 0.0017, + "step": 38192 + }, + { + "epoch": 0.76388, + "grad_norm": 0.8502935171127319, + "learning_rate": 3.2086609938369504e-06, + "loss": 0.0105, + "step": 38194 + }, + { + "epoch": 0.76392, + "grad_norm": 4.712854862213135, + "learning_rate": 3.207636182592291e-06, + "loss": 0.0722, + "step": 38196 + }, + { + "epoch": 0.76396, + "grad_norm": 0.029078328981995583, + "learning_rate": 3.2066115037682587e-06, + "loss": 0.0004, + "step": 38198 + }, + { + "epoch": 0.764, + "grad_norm": 0.06609377264976501, + "learning_rate": 3.2055869573848374e-06, + "loss": 0.0021, + "step": 38200 + }, + { + "epoch": 0.76404, + "grad_norm": 0.006746009457856417, + "learning_rate": 3.20456254346199e-06, + "loss": 0.0006, + "step": 38202 + }, + { + "epoch": 0.76408, + "grad_norm": 0.964792788028717, + "learning_rate": 3.2035382620197e-06, + "loss": 0.0147, + "step": 38204 + }, + { + "epoch": 0.76412, + "grad_norm": 0.026453807950019836, + "learning_rate": 3.202514113077928e-06, + "loss": 0.0005, + "step": 38206 + }, + { + "epoch": 0.76416, + "grad_norm": 0.05995936319231987, + "learning_rate": 3.201490096656643e-06, + "loss": 0.0012, + "step": 38208 + }, + { + "epoch": 0.7642, + "grad_norm": 0.13300469517707825, + "learning_rate": 3.200466212775808e-06, + "loss": 0.0013, + "step": 38210 + }, + { + "epoch": 0.76424, + "grad_norm": 8.663825988769531, + "learning_rate": 3.1994424614553886e-06, + "loss": 0.1494, + "step": 38212 + }, + { + "epoch": 0.76428, + "grad_norm": 0.09251787513494492, + "learning_rate": 3.1984188427153364e-06, + "loss": 0.0041, + "step": 38214 + }, + { + "epoch": 0.76432, + "grad_norm": 0.022783329710364342, + "learning_rate": 3.19739535657561e-06, + "loss": 0.0042, + "step": 38216 + }, + { + "epoch": 0.76436, + "grad_norm": 0.03716212511062622, + "learning_rate": 3.196372003056164e-06, + "loss": 0.0007, + "step": 38218 + }, + { + "epoch": 0.7644, + "grad_norm": 0.3513886034488678, + "learning_rate": 3.195348782176948e-06, + "loss": 0.0046, + "step": 38220 + }, + { + "epoch": 0.76444, + "grad_norm": 0.030687304213643074, + "learning_rate": 3.1943256939579138e-06, + "loss": 0.0004, + "step": 38222 + }, + { + "epoch": 0.76448, + "grad_norm": 0.02986607700586319, + "learning_rate": 3.1933027384189984e-06, + "loss": 0.0034, + "step": 38224 + }, + { + "epoch": 0.76452, + "grad_norm": 0.0033848388120532036, + "learning_rate": 3.1922799155801566e-06, + "loss": 0.0002, + "step": 38226 + }, + { + "epoch": 0.76456, + "grad_norm": 0.0038776013534516096, + "learning_rate": 3.1912572254613183e-06, + "loss": 0.0002, + "step": 38228 + }, + { + "epoch": 0.7646, + "grad_norm": 0.043406981974840164, + "learning_rate": 3.190234668082427e-06, + "loss": 0.0012, + "step": 38230 + }, + { + "epoch": 0.76464, + "grad_norm": 13.535776138305664, + "learning_rate": 3.1892122434634163e-06, + "loss": 0.4884, + "step": 38232 + }, + { + "epoch": 0.76468, + "grad_norm": 0.35494762659072876, + "learning_rate": 3.18818995162422e-06, + "loss": 0.0032, + "step": 38234 + }, + { + "epoch": 0.76472, + "grad_norm": 2.295006036758423, + "learning_rate": 3.1871677925847697e-06, + "loss": 0.024, + "step": 38236 + }, + { + "epoch": 0.76476, + "grad_norm": 0.10248809307813644, + "learning_rate": 3.1861457663649887e-06, + "loss": 0.0019, + "step": 38238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.051414210349321365, + "learning_rate": 3.1851238729848033e-06, + "loss": 0.0006, + "step": 38240 + }, + { + "epoch": 0.76484, + "grad_norm": 0.381330668926239, + "learning_rate": 3.184102112464137e-06, + "loss": 0.0054, + "step": 38242 + }, + { + "epoch": 0.76488, + "grad_norm": 0.009433358907699585, + "learning_rate": 3.183080484822909e-06, + "loss": 0.0005, + "step": 38244 + }, + { + "epoch": 0.76492, + "grad_norm": 0.17848044633865356, + "learning_rate": 3.182058990081036e-06, + "loss": 0.0022, + "step": 38246 + }, + { + "epoch": 0.76496, + "grad_norm": 1.298609972000122, + "learning_rate": 3.1810376282584363e-06, + "loss": 0.0176, + "step": 38248 + }, + { + "epoch": 0.765, + "grad_norm": 0.13306830823421478, + "learning_rate": 3.1800163993750166e-06, + "loss": 0.0016, + "step": 38250 + }, + { + "epoch": 0.76504, + "grad_norm": 0.025282662361860275, + "learning_rate": 3.178995303450687e-06, + "loss": 0.0004, + "step": 38252 + }, + { + "epoch": 0.76508, + "grad_norm": 0.0027042068541049957, + "learning_rate": 3.1779743405053598e-06, + "loss": 0.0024, + "step": 38254 + }, + { + "epoch": 0.76512, + "grad_norm": 0.03359360247850418, + "learning_rate": 3.1769535105589278e-06, + "loss": 0.0007, + "step": 38256 + }, + { + "epoch": 0.76516, + "grad_norm": 9.797099113464355, + "learning_rate": 3.175932813631307e-06, + "loss": 0.1634, + "step": 38258 + }, + { + "epoch": 0.7652, + "grad_norm": 0.0986252948641777, + "learning_rate": 3.174912249742382e-06, + "loss": 0.0016, + "step": 38260 + }, + { + "epoch": 0.76524, + "grad_norm": 0.024219820275902748, + "learning_rate": 3.173891818912064e-06, + "loss": 0.0013, + "step": 38262 + }, + { + "epoch": 0.76528, + "grad_norm": 0.08333341777324677, + "learning_rate": 3.172871521160236e-06, + "loss": 0.0305, + "step": 38264 + }, + { + "epoch": 0.76532, + "grad_norm": 1.1334205865859985, + "learning_rate": 3.171851356506792e-06, + "loss": 0.013, + "step": 38266 + }, + { + "epoch": 0.76536, + "grad_norm": 0.052993208169937134, + "learning_rate": 3.1708313249716238e-06, + "loss": 0.001, + "step": 38268 + }, + { + "epoch": 0.7654, + "grad_norm": 0.08819083124399185, + "learning_rate": 3.1698114265746126e-06, + "loss": 0.0019, + "step": 38270 + }, + { + "epoch": 0.76544, + "grad_norm": 0.4169895052909851, + "learning_rate": 3.1687916613356438e-06, + "loss": 0.0256, + "step": 38272 + }, + { + "epoch": 0.76548, + "grad_norm": 1.550552248954773, + "learning_rate": 3.1677720292745984e-06, + "loss": 0.019, + "step": 38274 + }, + { + "epoch": 0.76552, + "grad_norm": 0.05158895626664162, + "learning_rate": 3.1667525304113546e-06, + "loss": 0.0796, + "step": 38276 + }, + { + "epoch": 0.76556, + "grad_norm": 0.10064905881881714, + "learning_rate": 3.1657331647657886e-06, + "loss": 0.2845, + "step": 38278 + }, + { + "epoch": 0.7656, + "grad_norm": 0.1487428992986679, + "learning_rate": 3.164713932357776e-06, + "loss": 0.0014, + "step": 38280 + }, + { + "epoch": 0.76564, + "grad_norm": 1.2032116651535034, + "learning_rate": 3.1636948332071814e-06, + "loss": 0.0098, + "step": 38282 + }, + { + "epoch": 0.76568, + "grad_norm": 0.05330892279744148, + "learning_rate": 3.1626758673338754e-06, + "loss": 0.0011, + "step": 38284 + }, + { + "epoch": 0.76572, + "grad_norm": 3.7320778369903564, + "learning_rate": 3.1616570347577234e-06, + "loss": 0.0426, + "step": 38286 + }, + { + "epoch": 0.76576, + "grad_norm": 0.4292931854724884, + "learning_rate": 3.1606383354985883e-06, + "loss": 0.3047, + "step": 38288 + }, + { + "epoch": 0.7658, + "grad_norm": 0.31757354736328125, + "learning_rate": 3.159619769576333e-06, + "loss": 0.0034, + "step": 38290 + }, + { + "epoch": 0.76584, + "grad_norm": 0.006792178377509117, + "learning_rate": 3.158601337010806e-06, + "loss": 0.0001, + "step": 38292 + }, + { + "epoch": 0.76588, + "grad_norm": 0.0011405008845031261, + "learning_rate": 3.157583037821874e-06, + "loss": 0.0002, + "step": 38294 + }, + { + "epoch": 0.76592, + "grad_norm": 0.026992663741111755, + "learning_rate": 3.1565648720293794e-06, + "loss": 0.0014, + "step": 38296 + }, + { + "epoch": 0.76596, + "grad_norm": 0.002351111965253949, + "learning_rate": 3.1555468396531775e-06, + "loss": 0.0001, + "step": 38298 + }, + { + "epoch": 0.766, + "grad_norm": 0.025956150144338608, + "learning_rate": 3.1545289407131128e-06, + "loss": 0.0016, + "step": 38300 + }, + { + "epoch": 0.76604, + "grad_norm": 0.009047850966453552, + "learning_rate": 3.153511175229034e-06, + "loss": 0.0003, + "step": 38302 + }, + { + "epoch": 0.76608, + "grad_norm": 0.21501843631267548, + "learning_rate": 3.152493543220777e-06, + "loss": 0.0023, + "step": 38304 + }, + { + "epoch": 0.76612, + "grad_norm": 1.1708879470825195, + "learning_rate": 3.151476044708183e-06, + "loss": 0.0198, + "step": 38306 + }, + { + "epoch": 0.76616, + "grad_norm": 0.6724026203155518, + "learning_rate": 3.150458679711089e-06, + "loss": 0.0076, + "step": 38308 + }, + { + "epoch": 0.7662, + "grad_norm": 0.1630762219429016, + "learning_rate": 3.149441448249331e-06, + "loss": 0.0167, + "step": 38310 + }, + { + "epoch": 0.76624, + "grad_norm": 0.24435150623321533, + "learning_rate": 3.1484243503427415e-06, + "loss": 0.0087, + "step": 38312 + }, + { + "epoch": 0.76628, + "grad_norm": 12.239954948425293, + "learning_rate": 3.14740738601114e-06, + "loss": 0.1762, + "step": 38314 + }, + { + "epoch": 0.76632, + "grad_norm": 0.018335822969675064, + "learning_rate": 3.1463905552743656e-06, + "loss": 0.0007, + "step": 38316 + }, + { + "epoch": 0.76636, + "grad_norm": 0.7067418694496155, + "learning_rate": 3.145373858152232e-06, + "loss": 0.012, + "step": 38318 + }, + { + "epoch": 0.7664, + "grad_norm": 2.0350797176361084, + "learning_rate": 3.144357294664565e-06, + "loss": 0.0151, + "step": 38320 + }, + { + "epoch": 0.76644, + "grad_norm": 0.027305232360959053, + "learning_rate": 3.143340864831186e-06, + "loss": 0.0004, + "step": 38322 + }, + { + "epoch": 0.76648, + "grad_norm": 0.5046447515487671, + "learning_rate": 3.1423245686718984e-06, + "loss": 0.0055, + "step": 38324 + }, + { + "epoch": 0.76652, + "grad_norm": 1.575073480606079, + "learning_rate": 3.1413084062065326e-06, + "loss": 0.0189, + "step": 38326 + }, + { + "epoch": 0.76656, + "grad_norm": 0.022180328145623207, + "learning_rate": 3.1402923774548855e-06, + "loss": 0.0006, + "step": 38328 + }, + { + "epoch": 0.7666, + "grad_norm": 0.14214982092380524, + "learning_rate": 3.1392764824367706e-06, + "loss": 0.0025, + "step": 38330 + }, + { + "epoch": 0.76664, + "grad_norm": 0.18468771874904633, + "learning_rate": 3.138260721171993e-06, + "loss": 0.002, + "step": 38332 + }, + { + "epoch": 0.76668, + "grad_norm": 0.13301491737365723, + "learning_rate": 3.137245093680359e-06, + "loss": 0.0015, + "step": 38334 + }, + { + "epoch": 0.76672, + "grad_norm": 0.01978948898613453, + "learning_rate": 3.1362295999816617e-06, + "loss": 0.0004, + "step": 38336 + }, + { + "epoch": 0.76676, + "grad_norm": 11.079514503479004, + "learning_rate": 3.135214240095701e-06, + "loss": 0.1404, + "step": 38338 + }, + { + "epoch": 0.7668, + "grad_norm": 0.06863008439540863, + "learning_rate": 3.134199014042274e-06, + "loss": 0.0011, + "step": 38340 + }, + { + "epoch": 0.76684, + "grad_norm": 1.6699700355529785, + "learning_rate": 3.133183921841172e-06, + "loss": 0.016, + "step": 38342 + }, + { + "epoch": 0.76688, + "grad_norm": 4.037399768829346, + "learning_rate": 3.1321689635121877e-06, + "loss": 0.047, + "step": 38344 + }, + { + "epoch": 0.76692, + "grad_norm": 0.19408760964870453, + "learning_rate": 3.1311541390750987e-06, + "loss": 0.0023, + "step": 38346 + }, + { + "epoch": 0.76696, + "grad_norm": 0.5334054827690125, + "learning_rate": 3.1301394485497026e-06, + "loss": 0.0062, + "step": 38348 + }, + { + "epoch": 0.767, + "grad_norm": 0.009648832492530346, + "learning_rate": 3.1291248919557717e-06, + "loss": 0.001, + "step": 38350 + }, + { + "epoch": 0.76704, + "grad_norm": 0.008105745539069176, + "learning_rate": 3.1281104693130883e-06, + "loss": 0.0017, + "step": 38352 + }, + { + "epoch": 0.76708, + "grad_norm": 0.26402783393859863, + "learning_rate": 3.1270961806414278e-06, + "loss": 0.0054, + "step": 38354 + }, + { + "epoch": 0.76712, + "grad_norm": 1.0463756322860718, + "learning_rate": 3.1260820259605675e-06, + "loss": 0.0133, + "step": 38356 + }, + { + "epoch": 0.76716, + "grad_norm": 0.22317110002040863, + "learning_rate": 3.1250680052902793e-06, + "loss": 0.003, + "step": 38358 + }, + { + "epoch": 0.7672, + "grad_norm": 0.25883957743644714, + "learning_rate": 3.124054118650327e-06, + "loss": 0.0045, + "step": 38360 + }, + { + "epoch": 0.76724, + "grad_norm": 0.09485504031181335, + "learning_rate": 3.1230403660604804e-06, + "loss": 0.0018, + "step": 38362 + }, + { + "epoch": 0.76728, + "grad_norm": 0.10647350549697876, + "learning_rate": 3.1220267475405007e-06, + "loss": 0.151, + "step": 38364 + }, + { + "epoch": 0.76732, + "grad_norm": 0.015268945135176182, + "learning_rate": 3.1210132631101518e-06, + "loss": 0.0083, + "step": 38366 + }, + { + "epoch": 0.76736, + "grad_norm": 9.368019104003906, + "learning_rate": 3.119999912789191e-06, + "loss": 0.1268, + "step": 38368 + }, + { + "epoch": 0.7674, + "grad_norm": 0.008393548429012299, + "learning_rate": 3.118986696597377e-06, + "loss": 0.0015, + "step": 38370 + }, + { + "epoch": 0.76744, + "grad_norm": 0.3448280990123749, + "learning_rate": 3.1179736145544557e-06, + "loss": 0.0046, + "step": 38372 + }, + { + "epoch": 0.76748, + "grad_norm": 0.14526373147964478, + "learning_rate": 3.1169606666801834e-06, + "loss": 0.0039, + "step": 38374 + }, + { + "epoch": 0.76752, + "grad_norm": 0.9980112910270691, + "learning_rate": 3.1159478529943087e-06, + "loss": 0.0111, + "step": 38376 + }, + { + "epoch": 0.76756, + "grad_norm": 0.014904039911925793, + "learning_rate": 3.114935173516569e-06, + "loss": 0.0007, + "step": 38378 + }, + { + "epoch": 0.7676, + "grad_norm": 0.29509660601615906, + "learning_rate": 3.113922628266718e-06, + "loss": 0.0022, + "step": 38380 + }, + { + "epoch": 0.76764, + "grad_norm": 0.003502664854750037, + "learning_rate": 3.1129102172644856e-06, + "loss": 0.0003, + "step": 38382 + }, + { + "epoch": 0.76768, + "grad_norm": 1.9211207628250122, + "learning_rate": 3.111897940529619e-06, + "loss": 0.0189, + "step": 38384 + }, + { + "epoch": 0.76772, + "grad_norm": 0.008010782301425934, + "learning_rate": 3.1108857980818463e-06, + "loss": 0.0073, + "step": 38386 + }, + { + "epoch": 0.76776, + "grad_norm": 0.9576330184936523, + "learning_rate": 3.1098737899409013e-06, + "loss": 0.0105, + "step": 38388 + }, + { + "epoch": 0.7678, + "grad_norm": 0.1282198578119278, + "learning_rate": 3.108861916126518e-06, + "loss": 0.0021, + "step": 38390 + }, + { + "epoch": 0.76784, + "grad_norm": 0.8339408040046692, + "learning_rate": 3.1078501766584157e-06, + "loss": 0.0107, + "step": 38392 + }, + { + "epoch": 0.76788, + "grad_norm": 0.031176695600152016, + "learning_rate": 3.1068385715563222e-06, + "loss": 0.0004, + "step": 38394 + }, + { + "epoch": 0.76792, + "grad_norm": 0.018359461799263954, + "learning_rate": 3.10582710083996e-06, + "loss": 0.0004, + "step": 38396 + }, + { + "epoch": 0.76796, + "grad_norm": 5.334664821624756, + "learning_rate": 3.104815764529048e-06, + "loss": 0.0584, + "step": 38398 + }, + { + "epoch": 0.768, + "grad_norm": 0.13287582993507385, + "learning_rate": 3.103804562643302e-06, + "loss": 0.0014, + "step": 38400 + }, + { + "epoch": 0.76804, + "grad_norm": 0.05482267960906029, + "learning_rate": 3.1027934952024396e-06, + "loss": 0.0053, + "step": 38402 + }, + { + "epoch": 0.76808, + "grad_norm": 0.010372234508395195, + "learning_rate": 3.1017825622261664e-06, + "loss": 0.0006, + "step": 38404 + }, + { + "epoch": 0.76812, + "grad_norm": 0.015217731706798077, + "learning_rate": 3.1007717637341927e-06, + "loss": 0.0005, + "step": 38406 + }, + { + "epoch": 0.76816, + "grad_norm": 0.07071471959352493, + "learning_rate": 3.0997610997462292e-06, + "loss": 0.0013, + "step": 38408 + }, + { + "epoch": 0.7682, + "grad_norm": 0.01656685769557953, + "learning_rate": 3.0987505702819687e-06, + "loss": 0.0065, + "step": 38410 + }, + { + "epoch": 0.76824, + "grad_norm": 0.2198263555765152, + "learning_rate": 3.097740175361126e-06, + "loss": 0.002, + "step": 38412 + }, + { + "epoch": 0.76828, + "grad_norm": 0.07066398113965988, + "learning_rate": 3.0967299150033857e-06, + "loss": 0.0008, + "step": 38414 + }, + { + "epoch": 0.76832, + "grad_norm": 0.3183133602142334, + "learning_rate": 3.0957197892284553e-06, + "loss": 0.0052, + "step": 38416 + }, + { + "epoch": 0.76836, + "grad_norm": 2.612422227859497, + "learning_rate": 3.0947097980560193e-06, + "loss": 0.1173, + "step": 38418 + }, + { + "epoch": 0.7684, + "grad_norm": 0.04917977377772331, + "learning_rate": 3.0936999415057712e-06, + "loss": 0.0005, + "step": 38420 + }, + { + "epoch": 0.76844, + "grad_norm": 0.026956690475344658, + "learning_rate": 3.0926902195973986e-06, + "loss": 0.0004, + "step": 38422 + }, + { + "epoch": 0.76848, + "grad_norm": 0.10634249448776245, + "learning_rate": 3.0916806323505887e-06, + "loss": 0.0012, + "step": 38424 + }, + { + "epoch": 0.76852, + "grad_norm": 0.01666395366191864, + "learning_rate": 3.0906711797850188e-06, + "loss": 0.0003, + "step": 38426 + }, + { + "epoch": 0.76856, + "grad_norm": 0.35669127106666565, + "learning_rate": 3.089661861920371e-06, + "loss": 0.0126, + "step": 38428 + }, + { + "epoch": 0.7686, + "grad_norm": 0.5447810888290405, + "learning_rate": 3.0886526787763237e-06, + "loss": 0.0053, + "step": 38430 + }, + { + "epoch": 0.76864, + "grad_norm": 3.277651786804199, + "learning_rate": 3.08764363037255e-06, + "loss": 0.0289, + "step": 38432 + }, + { + "epoch": 0.76868, + "grad_norm": 0.49103233218193054, + "learning_rate": 3.0866347167287257e-06, + "loss": 0.0065, + "step": 38434 + }, + { + "epoch": 0.76872, + "grad_norm": 0.11007943749427795, + "learning_rate": 3.0856259378645103e-06, + "loss": 0.0011, + "step": 38436 + }, + { + "epoch": 0.76876, + "grad_norm": 0.03501405194401741, + "learning_rate": 3.0846172937995835e-06, + "loss": 0.0003, + "step": 38438 + }, + { + "epoch": 0.7688, + "grad_norm": 0.38833704590797424, + "learning_rate": 3.0836087845536e-06, + "loss": 0.0039, + "step": 38440 + }, + { + "epoch": 0.76884, + "grad_norm": 0.017685476690530777, + "learning_rate": 3.0826004101462237e-06, + "loss": 0.0025, + "step": 38442 + }, + { + "epoch": 0.76888, + "grad_norm": 0.0059349872171878815, + "learning_rate": 3.0815921705971163e-06, + "loss": 0.0005, + "step": 38444 + }, + { + "epoch": 0.76892, + "grad_norm": 0.06859871000051498, + "learning_rate": 3.0805840659259255e-06, + "loss": 0.1162, + "step": 38446 + }, + { + "epoch": 0.76896, + "grad_norm": 0.03970867767930031, + "learning_rate": 3.079576096152318e-06, + "loss": 0.0021, + "step": 38448 + }, + { + "epoch": 0.769, + "grad_norm": 2.7842161655426025, + "learning_rate": 3.0785682612959334e-06, + "loss": 0.0397, + "step": 38450 + }, + { + "epoch": 0.76904, + "grad_norm": 0.03639841079711914, + "learning_rate": 3.0775605613764235e-06, + "loss": 0.0047, + "step": 38452 + }, + { + "epoch": 0.76908, + "grad_norm": 0.004295404069125652, + "learning_rate": 3.0765529964134343e-06, + "loss": 0.001, + "step": 38454 + }, + { + "epoch": 0.76912, + "grad_norm": 0.37324273586273193, + "learning_rate": 3.0755455664266098e-06, + "loss": 0.0053, + "step": 38456 + }, + { + "epoch": 0.76916, + "grad_norm": 0.10530084371566772, + "learning_rate": 3.074538271435592e-06, + "loss": 0.0011, + "step": 38458 + }, + { + "epoch": 0.7692, + "grad_norm": 2.265509843826294, + "learning_rate": 3.073531111460013e-06, + "loss": 0.0226, + "step": 38460 + }, + { + "epoch": 0.76924, + "grad_norm": 0.08033714443445206, + "learning_rate": 3.0725240865195117e-06, + "loss": 0.0025, + "step": 38462 + }, + { + "epoch": 0.76928, + "grad_norm": 1.6090362071990967, + "learning_rate": 3.0715171966337186e-06, + "loss": 0.0233, + "step": 38464 + }, + { + "epoch": 0.76932, + "grad_norm": 0.24778470396995544, + "learning_rate": 3.070510441822269e-06, + "loss": 0.0021, + "step": 38466 + }, + { + "epoch": 0.76936, + "grad_norm": 0.2216847985982895, + "learning_rate": 3.069503822104779e-06, + "loss": 0.0027, + "step": 38468 + }, + { + "epoch": 0.7694, + "grad_norm": 0.1836395263671875, + "learning_rate": 3.0684973375008865e-06, + "loss": 0.0023, + "step": 38470 + }, + { + "epoch": 0.76944, + "grad_norm": 0.00873227696865797, + "learning_rate": 3.067490988030204e-06, + "loss": 0.0005, + "step": 38472 + }, + { + "epoch": 0.76948, + "grad_norm": 0.19133327901363373, + "learning_rate": 3.0664847737123538e-06, + "loss": 0.0035, + "step": 38474 + }, + { + "epoch": 0.76952, + "grad_norm": 1.0238044261932373, + "learning_rate": 3.0654786945669544e-06, + "loss": 0.0116, + "step": 38476 + }, + { + "epoch": 0.76956, + "grad_norm": 0.20874708890914917, + "learning_rate": 3.0644727506136128e-06, + "loss": 0.0028, + "step": 38478 + }, + { + "epoch": 0.7696, + "grad_norm": 0.008450813591480255, + "learning_rate": 3.063466941871952e-06, + "loss": 0.0005, + "step": 38480 + }, + { + "epoch": 0.76964, + "grad_norm": 0.07072997093200684, + "learning_rate": 3.0624612683615717e-06, + "loss": 0.0009, + "step": 38482 + }, + { + "epoch": 0.76968, + "grad_norm": 0.02743016928434372, + "learning_rate": 3.0614557301020797e-06, + "loss": 0.0029, + "step": 38484 + }, + { + "epoch": 0.76972, + "grad_norm": 1.006465196609497, + "learning_rate": 3.0604503271130803e-06, + "loss": 0.0103, + "step": 38486 + }, + { + "epoch": 0.76976, + "grad_norm": 0.047442324459552765, + "learning_rate": 3.0594450594141758e-06, + "loss": 0.0008, + "step": 38488 + }, + { + "epoch": 0.7698, + "grad_norm": 0.00902220606803894, + "learning_rate": 3.058439927024962e-06, + "loss": 0.0006, + "step": 38490 + }, + { + "epoch": 0.76984, + "grad_norm": 0.2208077609539032, + "learning_rate": 3.0574349299650387e-06, + "loss": 0.0021, + "step": 38492 + }, + { + "epoch": 0.76988, + "grad_norm": 0.04425165802240372, + "learning_rate": 3.056430068253993e-06, + "loss": 0.0015, + "step": 38494 + }, + { + "epoch": 0.76992, + "grad_norm": 0.8103742599487305, + "learning_rate": 3.0554253419114175e-06, + "loss": 0.0064, + "step": 38496 + }, + { + "epoch": 0.76996, + "grad_norm": 0.08398205041885376, + "learning_rate": 3.0544207509569033e-06, + "loss": 0.001, + "step": 38498 + }, + { + "epoch": 0.77, + "grad_norm": 0.170162171125412, + "learning_rate": 3.0534162954100264e-06, + "loss": 0.0025, + "step": 38500 + }, + { + "epoch": 0.77004, + "grad_norm": 0.26429370045661926, + "learning_rate": 3.0524119752903812e-06, + "loss": 0.3757, + "step": 38502 + }, + { + "epoch": 0.77008, + "grad_norm": 1.3110814094543457, + "learning_rate": 3.0514077906175355e-06, + "loss": 0.0113, + "step": 38504 + }, + { + "epoch": 0.77012, + "grad_norm": 2.5087573528289795, + "learning_rate": 3.0504037414110786e-06, + "loss": 0.024, + "step": 38506 + }, + { + "epoch": 0.77016, + "grad_norm": 0.021633993834257126, + "learning_rate": 3.0493998276905757e-06, + "loss": 0.0006, + "step": 38508 + }, + { + "epoch": 0.7702, + "grad_norm": 0.014341375790536404, + "learning_rate": 3.0483960494756017e-06, + "loss": 0.0004, + "step": 38510 + }, + { + "epoch": 0.77024, + "grad_norm": 0.04968715459108353, + "learning_rate": 3.0473924067857273e-06, + "loss": 0.001, + "step": 38512 + }, + { + "epoch": 0.77028, + "grad_norm": 0.03766226768493652, + "learning_rate": 3.0463888996405155e-06, + "loss": 0.0004, + "step": 38514 + }, + { + "epoch": 0.77032, + "grad_norm": 14.920443534851074, + "learning_rate": 3.0453855280595326e-06, + "loss": 0.3546, + "step": 38516 + }, + { + "epoch": 0.77036, + "grad_norm": 0.06485830247402191, + "learning_rate": 3.044382292062338e-06, + "loss": 0.0012, + "step": 38518 + }, + { + "epoch": 0.7704, + "grad_norm": 0.10027432441711426, + "learning_rate": 3.043379191668492e-06, + "loss": 0.0014, + "step": 38520 + }, + { + "epoch": 0.77044, + "grad_norm": 0.10994622111320496, + "learning_rate": 3.042376226897551e-06, + "loss": 0.0013, + "step": 38522 + }, + { + "epoch": 0.77048, + "grad_norm": 0.026219233870506287, + "learning_rate": 3.0413733977690697e-06, + "loss": 0.0004, + "step": 38524 + }, + { + "epoch": 0.77052, + "grad_norm": 0.003969974350184202, + "learning_rate": 3.0403707043025934e-06, + "loss": 0.0003, + "step": 38526 + }, + { + "epoch": 0.77056, + "grad_norm": 0.09093854576349258, + "learning_rate": 3.039368146517674e-06, + "loss": 0.003, + "step": 38528 + }, + { + "epoch": 0.7706, + "grad_norm": 0.44171422719955444, + "learning_rate": 3.038365724433858e-06, + "loss": 0.0031, + "step": 38530 + }, + { + "epoch": 0.77064, + "grad_norm": 0.01793062314391136, + "learning_rate": 3.0373634380706807e-06, + "loss": 0.0037, + "step": 38532 + }, + { + "epoch": 0.77068, + "grad_norm": 0.4382476806640625, + "learning_rate": 3.0363612874476943e-06, + "loss": 0.004, + "step": 38534 + }, + { + "epoch": 0.77072, + "grad_norm": 0.0702294260263443, + "learning_rate": 3.0353592725844238e-06, + "loss": 0.0023, + "step": 38536 + }, + { + "epoch": 0.77076, + "grad_norm": 0.05663343146443367, + "learning_rate": 3.0343573935004157e-06, + "loss": 0.0027, + "step": 38538 + }, + { + "epoch": 0.7708, + "grad_norm": 0.09272366017103195, + "learning_rate": 3.033355650215193e-06, + "loss": 0.0017, + "step": 38540 + }, + { + "epoch": 0.77084, + "grad_norm": 0.09799176454544067, + "learning_rate": 3.0323540427482887e-06, + "loss": 0.0008, + "step": 38542 + }, + { + "epoch": 0.77088, + "grad_norm": 0.032311029732227325, + "learning_rate": 3.0313525711192325e-06, + "loss": 0.0005, + "step": 38544 + }, + { + "epoch": 0.77092, + "grad_norm": 0.6719352006912231, + "learning_rate": 3.0303512353475406e-06, + "loss": 0.01, + "step": 38546 + }, + { + "epoch": 0.77096, + "grad_norm": 0.16353604197502136, + "learning_rate": 3.029350035452745e-06, + "loss": 0.0015, + "step": 38548 + }, + { + "epoch": 0.771, + "grad_norm": 0.10467034578323364, + "learning_rate": 3.028348971454356e-06, + "loss": 0.0039, + "step": 38550 + }, + { + "epoch": 0.77104, + "grad_norm": 0.015986202284693718, + "learning_rate": 3.027348043371893e-06, + "loss": 0.0171, + "step": 38552 + }, + { + "epoch": 0.77108, + "grad_norm": 0.002072657458484173, + "learning_rate": 3.026347251224869e-06, + "loss": 0.0002, + "step": 38554 + }, + { + "epoch": 0.77112, + "grad_norm": 0.10213206708431244, + "learning_rate": 3.0253465950328e-06, + "loss": 0.0015, + "step": 38556 + }, + { + "epoch": 0.77116, + "grad_norm": 0.03485596179962158, + "learning_rate": 3.0243460748151833e-06, + "loss": 0.0006, + "step": 38558 + }, + { + "epoch": 0.7712, + "grad_norm": 0.24357295036315918, + "learning_rate": 3.023345690591537e-06, + "loss": 0.0093, + "step": 38560 + }, + { + "epoch": 0.77124, + "grad_norm": 1.0006273984909058, + "learning_rate": 3.0223454423813547e-06, + "loss": 0.0092, + "step": 38562 + }, + { + "epoch": 0.77128, + "grad_norm": 0.04529676213860512, + "learning_rate": 3.021345330204142e-06, + "loss": 0.0172, + "step": 38564 + }, + { + "epoch": 0.77132, + "grad_norm": 0.02009405754506588, + "learning_rate": 3.020345354079397e-06, + "loss": 0.0014, + "step": 38566 + }, + { + "epoch": 0.77136, + "grad_norm": 0.03252434730529785, + "learning_rate": 3.019345514026606e-06, + "loss": 0.0006, + "step": 38568 + }, + { + "epoch": 0.7714, + "grad_norm": 9.682716369628906, + "learning_rate": 3.0183458100652752e-06, + "loss": 0.0873, + "step": 38570 + }, + { + "epoch": 0.77144, + "grad_norm": 0.1846078336238861, + "learning_rate": 3.0173462422148835e-06, + "loss": 0.0017, + "step": 38572 + }, + { + "epoch": 0.77148, + "grad_norm": 0.6890981197357178, + "learning_rate": 3.016346810494922e-06, + "loss": 0.0054, + "step": 38574 + }, + { + "epoch": 0.77152, + "grad_norm": 0.0209378469735384, + "learning_rate": 3.0153475149248744e-06, + "loss": 0.0002, + "step": 38576 + }, + { + "epoch": 0.77156, + "grad_norm": 0.0603577084839344, + "learning_rate": 3.014348355524224e-06, + "loss": 0.0017, + "step": 38578 + }, + { + "epoch": 0.7716, + "grad_norm": 0.024390894919633865, + "learning_rate": 3.013349332312451e-06, + "loss": 0.0135, + "step": 38580 + }, + { + "epoch": 0.77164, + "grad_norm": 0.22260315716266632, + "learning_rate": 3.0123504453090267e-06, + "loss": 0.0024, + "step": 38582 + }, + { + "epoch": 0.77168, + "grad_norm": 0.0021271235309541225, + "learning_rate": 3.0113516945334287e-06, + "loss": 0.0, + "step": 38584 + }, + { + "epoch": 0.77172, + "grad_norm": 4.551032066345215, + "learning_rate": 3.010353080005127e-06, + "loss": 0.0472, + "step": 38586 + }, + { + "epoch": 0.77176, + "grad_norm": 0.0008624649490229785, + "learning_rate": 3.0093546017435936e-06, + "loss": 0.0003, + "step": 38588 + }, + { + "epoch": 0.7718, + "grad_norm": 0.006847964599728584, + "learning_rate": 3.008356259768285e-06, + "loss": 0.0002, + "step": 38590 + }, + { + "epoch": 0.77184, + "grad_norm": 0.014984481036663055, + "learning_rate": 3.0073580540986767e-06, + "loss": 0.0003, + "step": 38592 + }, + { + "epoch": 0.77188, + "grad_norm": 0.04717952385544777, + "learning_rate": 3.0063599847542204e-06, + "loss": 0.0006, + "step": 38594 + }, + { + "epoch": 0.77192, + "grad_norm": 0.023980678990483284, + "learning_rate": 3.005362051754377e-06, + "loss": 0.0004, + "step": 38596 + }, + { + "epoch": 0.77196, + "grad_norm": 0.3274465501308441, + "learning_rate": 3.0043642551186037e-06, + "loss": 0.0032, + "step": 38598 + }, + { + "epoch": 0.772, + "grad_norm": 22.262365341186523, + "learning_rate": 3.003366594866345e-06, + "loss": 1.3426, + "step": 38600 + }, + { + "epoch": 0.77204, + "grad_norm": 0.0726616382598877, + "learning_rate": 3.002369071017064e-06, + "loss": 0.001, + "step": 38602 + }, + { + "epoch": 0.77208, + "grad_norm": 0.0826551616191864, + "learning_rate": 3.001371683590196e-06, + "loss": 0.0013, + "step": 38604 + }, + { + "epoch": 0.77212, + "grad_norm": 0.08101446181535721, + "learning_rate": 3.0003744326051907e-06, + "loss": 0.0022, + "step": 38606 + }, + { + "epoch": 0.77216, + "grad_norm": 0.004916548263281584, + "learning_rate": 2.999377318081489e-06, + "loss": 0.0003, + "step": 38608 + }, + { + "epoch": 0.7722, + "grad_norm": 0.04312318190932274, + "learning_rate": 2.9983803400385313e-06, + "loss": 0.0035, + "step": 38610 + }, + { + "epoch": 0.77224, + "grad_norm": 0.23884664475917816, + "learning_rate": 2.997383498495756e-06, + "loss": 0.0074, + "step": 38612 + }, + { + "epoch": 0.77228, + "grad_norm": 0.01399147417396307, + "learning_rate": 2.996386793472592e-06, + "loss": 0.0002, + "step": 38614 + }, + { + "epoch": 0.77232, + "grad_norm": 0.0377235971391201, + "learning_rate": 2.9953902249884737e-06, + "loss": 0.0008, + "step": 38616 + }, + { + "epoch": 0.77236, + "grad_norm": 0.2085174024105072, + "learning_rate": 2.9943937930628286e-06, + "loss": 0.0015, + "step": 38618 + }, + { + "epoch": 0.7724, + "grad_norm": 11.517739295959473, + "learning_rate": 2.993397497715086e-06, + "loss": 0.1059, + "step": 38620 + }, + { + "epoch": 0.77244, + "grad_norm": 0.05287846922874451, + "learning_rate": 2.992401338964661e-06, + "loss": 0.0063, + "step": 38622 + }, + { + "epoch": 0.77248, + "grad_norm": 0.6762475371360779, + "learning_rate": 2.991405316830985e-06, + "loss": 0.0112, + "step": 38624 + }, + { + "epoch": 0.77252, + "grad_norm": 0.051597293466329575, + "learning_rate": 2.9904094313334652e-06, + "loss": 0.0006, + "step": 38626 + }, + { + "epoch": 0.77256, + "grad_norm": 0.0987963154911995, + "learning_rate": 2.9894136824915277e-06, + "loss": 0.0074, + "step": 38628 + }, + { + "epoch": 0.7726, + "grad_norm": 0.11643895506858826, + "learning_rate": 2.988418070324577e-06, + "loss": 0.0018, + "step": 38630 + }, + { + "epoch": 0.77264, + "grad_norm": 0.7024509906768799, + "learning_rate": 2.987422594852026e-06, + "loss": 0.0092, + "step": 38632 + }, + { + "epoch": 0.77268, + "grad_norm": 0.11738646775484085, + "learning_rate": 2.986427256093285e-06, + "loss": 0.7114, + "step": 38634 + }, + { + "epoch": 0.77272, + "grad_norm": 0.08020079880952835, + "learning_rate": 2.985432054067752e-06, + "loss": 0.0008, + "step": 38636 + }, + { + "epoch": 0.77276, + "grad_norm": 0.0652008205652237, + "learning_rate": 2.9844369887948334e-06, + "loss": 0.0066, + "step": 38638 + }, + { + "epoch": 0.7728, + "grad_norm": 0.005589385516941547, + "learning_rate": 2.983442060293926e-06, + "loss": 0.0004, + "step": 38640 + }, + { + "epoch": 0.77284, + "grad_norm": 0.004270027857273817, + "learning_rate": 2.9824472685844286e-06, + "loss": 0.0001, + "step": 38642 + }, + { + "epoch": 0.77288, + "grad_norm": 0.11357799917459488, + "learning_rate": 2.9814526136857347e-06, + "loss": 0.0058, + "step": 38644 + }, + { + "epoch": 0.77292, + "grad_norm": 0.01418856531381607, + "learning_rate": 2.980458095617239e-06, + "loss": 0.0003, + "step": 38646 + }, + { + "epoch": 0.77296, + "grad_norm": 0.512511134147644, + "learning_rate": 2.979463714398323e-06, + "loss": 0.0053, + "step": 38648 + }, + { + "epoch": 0.773, + "grad_norm": 0.011823141947388649, + "learning_rate": 2.978469470048376e-06, + "loss": 0.0007, + "step": 38650 + }, + { + "epoch": 0.77304, + "grad_norm": 0.9366788864135742, + "learning_rate": 2.9774753625867826e-06, + "loss": 0.0107, + "step": 38652 + }, + { + "epoch": 0.77308, + "grad_norm": 0.16719895601272583, + "learning_rate": 2.976481392032923e-06, + "loss": 0.0037, + "step": 38654 + }, + { + "epoch": 0.77312, + "grad_norm": 0.029971010982990265, + "learning_rate": 2.975487558406176e-06, + "loss": 0.0024, + "step": 38656 + }, + { + "epoch": 0.77316, + "grad_norm": 17.096574783325195, + "learning_rate": 2.97449386172591e-06, + "loss": 0.4199, + "step": 38658 + }, + { + "epoch": 0.7732, + "grad_norm": 0.03790530562400818, + "learning_rate": 2.9735003020115095e-06, + "loss": 0.0012, + "step": 38660 + }, + { + "epoch": 0.77324, + "grad_norm": 0.2242283821105957, + "learning_rate": 2.9725068792823343e-06, + "loss": 0.026, + "step": 38662 + }, + { + "epoch": 0.77328, + "grad_norm": 0.09660671651363373, + "learning_rate": 2.9715135935577567e-06, + "loss": 0.0656, + "step": 38664 + }, + { + "epoch": 0.77332, + "grad_norm": 0.005315043032169342, + "learning_rate": 2.970520444857142e-06, + "loss": 0.002, + "step": 38666 + }, + { + "epoch": 0.77336, + "grad_norm": 0.38772720098495483, + "learning_rate": 2.9695274331998447e-06, + "loss": 0.0109, + "step": 38668 + }, + { + "epoch": 0.7734, + "grad_norm": 0.0067345378920435905, + "learning_rate": 2.968534558605236e-06, + "loss": 0.0007, + "step": 38670 + }, + { + "epoch": 0.77344, + "grad_norm": 0.035484082996845245, + "learning_rate": 2.967541821092662e-06, + "loss": 0.0085, + "step": 38672 + }, + { + "epoch": 0.77348, + "grad_norm": 0.2152896523475647, + "learning_rate": 2.9665492206814818e-06, + "loss": 0.0127, + "step": 38674 + }, + { + "epoch": 0.77352, + "grad_norm": 0.008001788519322872, + "learning_rate": 2.965556757391045e-06, + "loss": 0.0093, + "step": 38676 + }, + { + "epoch": 0.77356, + "grad_norm": 0.019010040909051895, + "learning_rate": 2.964564431240704e-06, + "loss": 0.0005, + "step": 38678 + }, + { + "epoch": 0.7736, + "grad_norm": 0.06581708788871765, + "learning_rate": 2.963572242249799e-06, + "loss": 0.0015, + "step": 38680 + }, + { + "epoch": 0.77364, + "grad_norm": 0.06838617473840714, + "learning_rate": 2.9625801904376748e-06, + "loss": 0.0109, + "step": 38682 + }, + { + "epoch": 0.77368, + "grad_norm": 0.573382556438446, + "learning_rate": 2.9615882758236736e-06, + "loss": 0.0063, + "step": 38684 + }, + { + "epoch": 0.77372, + "grad_norm": 0.03469743952155113, + "learning_rate": 2.9605964984271328e-06, + "loss": 0.0003, + "step": 38686 + }, + { + "epoch": 0.77376, + "grad_norm": 0.5034112334251404, + "learning_rate": 2.9596048582673897e-06, + "loss": 0.0049, + "step": 38688 + }, + { + "epoch": 0.7738, + "grad_norm": 0.02969883196055889, + "learning_rate": 2.9586133553637687e-06, + "loss": 0.0002, + "step": 38690 + }, + { + "epoch": 0.77384, + "grad_norm": 0.0829893946647644, + "learning_rate": 2.9576219897356116e-06, + "loss": 0.004, + "step": 38692 + }, + { + "epoch": 0.77388, + "grad_norm": 1.5883636474609375, + "learning_rate": 2.956630761402237e-06, + "loss": 0.1173, + "step": 38694 + }, + { + "epoch": 0.77392, + "grad_norm": 0.02471546269953251, + "learning_rate": 2.9556396703829705e-06, + "loss": 0.0012, + "step": 38696 + }, + { + "epoch": 0.77396, + "grad_norm": 0.21575988829135895, + "learning_rate": 2.9546487166971362e-06, + "loss": 0.0025, + "step": 38698 + }, + { + "epoch": 0.774, + "grad_norm": 0.007326961029320955, + "learning_rate": 2.953657900364053e-06, + "loss": 0.0002, + "step": 38700 + }, + { + "epoch": 0.77404, + "grad_norm": 0.18610040843486786, + "learning_rate": 2.9526672214030383e-06, + "loss": 0.003, + "step": 38702 + }, + { + "epoch": 0.77408, + "grad_norm": 0.41816678643226624, + "learning_rate": 2.9516766798334028e-06, + "loss": 0.0136, + "step": 38704 + }, + { + "epoch": 0.77412, + "grad_norm": 1.6172184944152832, + "learning_rate": 2.9506862756744583e-06, + "loss": 0.0192, + "step": 38706 + }, + { + "epoch": 0.77416, + "grad_norm": 0.03560103103518486, + "learning_rate": 2.949696008945514e-06, + "loss": 0.001, + "step": 38708 + }, + { + "epoch": 0.7742, + "grad_norm": 0.28021886944770813, + "learning_rate": 2.9487058796658785e-06, + "loss": 0.0053, + "step": 38710 + }, + { + "epoch": 0.77424, + "grad_norm": 0.08521713316440582, + "learning_rate": 2.947715887854847e-06, + "loss": 0.0015, + "step": 38712 + }, + { + "epoch": 0.77428, + "grad_norm": 0.044858694076538086, + "learning_rate": 2.9467260335317304e-06, + "loss": 0.7316, + "step": 38714 + }, + { + "epoch": 0.77432, + "grad_norm": 0.04385141655802727, + "learning_rate": 2.9457363167158183e-06, + "loss": 0.0006, + "step": 38716 + }, + { + "epoch": 0.77436, + "grad_norm": 0.0469413585960865, + "learning_rate": 2.9447467374264082e-06, + "loss": 0.0022, + "step": 38718 + }, + { + "epoch": 0.7744, + "grad_norm": 0.029522700235247612, + "learning_rate": 2.9437572956827965e-06, + "loss": 0.0009, + "step": 38720 + }, + { + "epoch": 0.77444, + "grad_norm": 0.040286336094141006, + "learning_rate": 2.942767991504263e-06, + "loss": 0.0878, + "step": 38722 + }, + { + "epoch": 0.77448, + "grad_norm": 0.29976385831832886, + "learning_rate": 2.9417788249101053e-06, + "loss": 0.004, + "step": 38724 + }, + { + "epoch": 0.77452, + "grad_norm": 0.07820185273885727, + "learning_rate": 2.940789795919602e-06, + "loss": 0.0011, + "step": 38726 + }, + { + "epoch": 0.77456, + "grad_norm": 0.17719072103500366, + "learning_rate": 2.9398009045520348e-06, + "loss": 0.004, + "step": 38728 + }, + { + "epoch": 0.7746, + "grad_norm": 1.5260494947433472, + "learning_rate": 2.938812150826684e-06, + "loss": 0.0171, + "step": 38730 + }, + { + "epoch": 0.77464, + "grad_norm": 0.01627448759973049, + "learning_rate": 2.937823534762827e-06, + "loss": 0.0021, + "step": 38732 + }, + { + "epoch": 0.77468, + "grad_norm": 0.10278894007205963, + "learning_rate": 2.9368350563797377e-06, + "loss": 0.0014, + "step": 38734 + }, + { + "epoch": 0.77472, + "grad_norm": 0.07677490264177322, + "learning_rate": 2.935846715696683e-06, + "loss": 0.0246, + "step": 38736 + }, + { + "epoch": 0.77476, + "grad_norm": 0.030059320852160454, + "learning_rate": 2.934858512732932e-06, + "loss": 0.0007, + "step": 38738 + }, + { + "epoch": 0.7748, + "grad_norm": 0.007174032274633646, + "learning_rate": 2.9338704475077527e-06, + "loss": 0.0064, + "step": 38740 + }, + { + "epoch": 0.77484, + "grad_norm": 0.0041343094781041145, + "learning_rate": 2.932882520040411e-06, + "loss": 0.0002, + "step": 38742 + }, + { + "epoch": 0.77488, + "grad_norm": 0.1730545461177826, + "learning_rate": 2.9318947303501554e-06, + "loss": 0.0977, + "step": 38744 + }, + { + "epoch": 0.77492, + "grad_norm": 0.19142547249794006, + "learning_rate": 2.9309070784562575e-06, + "loss": 0.0042, + "step": 38746 + }, + { + "epoch": 0.77496, + "grad_norm": 0.07554144412279129, + "learning_rate": 2.9299195643779634e-06, + "loss": 0.0013, + "step": 38748 + }, + { + "epoch": 0.775, + "grad_norm": 0.04428773745894432, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.0273, + "step": 38750 + }, + { + "epoch": 0.77504, + "grad_norm": 0.020250236615538597, + "learning_rate": 2.927944949745196e-06, + "loss": 0.3732, + "step": 38752 + }, + { + "epoch": 0.77508, + "grad_norm": 0.27997079491615295, + "learning_rate": 2.9269578492292206e-06, + "loss": 0.0028, + "step": 38754 + }, + { + "epoch": 0.77512, + "grad_norm": 0.1554865539073944, + "learning_rate": 2.9259708866058467e-06, + "loss": 0.0149, + "step": 38756 + }, + { + "epoch": 0.77516, + "grad_norm": 0.15269288420677185, + "learning_rate": 2.924984061894306e-06, + "loss": 0.0015, + "step": 38758 + }, + { + "epoch": 0.7752, + "grad_norm": 14.052945137023926, + "learning_rate": 2.9239973751138495e-06, + "loss": 0.3918, + "step": 38760 + }, + { + "epoch": 0.77524, + "grad_norm": 0.33682528138160706, + "learning_rate": 2.9230108262837043e-06, + "loss": 0.0038, + "step": 38762 + }, + { + "epoch": 0.77528, + "grad_norm": 0.908301830291748, + "learning_rate": 2.922024415423106e-06, + "loss": 0.0143, + "step": 38764 + }, + { + "epoch": 0.77532, + "grad_norm": 0.010327757336199284, + "learning_rate": 2.9210381425512856e-06, + "loss": 0.001, + "step": 38766 + }, + { + "epoch": 0.77536, + "grad_norm": 0.22379617393016815, + "learning_rate": 2.920052007687475e-06, + "loss": 0.0036, + "step": 38768 + }, + { + "epoch": 0.7754, + "grad_norm": 0.01936810277402401, + "learning_rate": 2.919066010850892e-06, + "loss": 0.0004, + "step": 38770 + }, + { + "epoch": 0.77544, + "grad_norm": 0.17369769513607025, + "learning_rate": 2.918080152060763e-06, + "loss": 0.003, + "step": 38772 + }, + { + "epoch": 0.77548, + "grad_norm": 0.2427680790424347, + "learning_rate": 2.917094431336307e-06, + "loss": 0.0024, + "step": 38774 + }, + { + "epoch": 0.77552, + "grad_norm": 0.036310840398073196, + "learning_rate": 2.9161088486967416e-06, + "loss": 0.006, + "step": 38776 + }, + { + "epoch": 0.77556, + "grad_norm": 0.060326337814331055, + "learning_rate": 2.915123404161284e-06, + "loss": 0.0022, + "step": 38778 + }, + { + "epoch": 0.7756, + "grad_norm": 0.02398478053510189, + "learning_rate": 2.9141380977491373e-06, + "loss": 0.2192, + "step": 38780 + }, + { + "epoch": 0.77564, + "grad_norm": 12.335684776306152, + "learning_rate": 2.9131529294795224e-06, + "loss": 0.1757, + "step": 38782 + }, + { + "epoch": 0.77568, + "grad_norm": 0.19374151527881622, + "learning_rate": 2.9121678993716374e-06, + "loss": 0.0093, + "step": 38784 + }, + { + "epoch": 0.77572, + "grad_norm": 0.2819027602672577, + "learning_rate": 2.9111830074446877e-06, + "loss": 0.0021, + "step": 38786 + }, + { + "epoch": 0.77576, + "grad_norm": 6.7607293128967285, + "learning_rate": 2.9101982537178786e-06, + "loss": 0.065, + "step": 38788 + }, + { + "epoch": 0.7758, + "grad_norm": 0.1715596467256546, + "learning_rate": 2.9092136382103976e-06, + "loss": 0.0074, + "step": 38790 + }, + { + "epoch": 0.77584, + "grad_norm": 0.024960223585367203, + "learning_rate": 2.9082291609414537e-06, + "loss": 0.017, + "step": 38792 + }, + { + "epoch": 0.77588, + "grad_norm": 0.1529480516910553, + "learning_rate": 2.9072448219302295e-06, + "loss": 0.048, + "step": 38794 + }, + { + "epoch": 0.77592, + "grad_norm": 0.37073037028312683, + "learning_rate": 2.906260621195921e-06, + "loss": 0.0032, + "step": 38796 + }, + { + "epoch": 0.77596, + "grad_norm": 0.7177188992500305, + "learning_rate": 2.9052765587577127e-06, + "loss": 0.0092, + "step": 38798 + }, + { + "epoch": 0.776, + "grad_norm": 0.06865242123603821, + "learning_rate": 2.9042926346347932e-06, + "loss": 0.003, + "step": 38800 + }, + { + "epoch": 0.77604, + "grad_norm": 0.00435389531776309, + "learning_rate": 2.903308848846339e-06, + "loss": 0.0004, + "step": 38802 + }, + { + "epoch": 0.77608, + "grad_norm": 0.12582194805145264, + "learning_rate": 2.9023252014115335e-06, + "loss": 0.0016, + "step": 38804 + }, + { + "epoch": 0.77612, + "grad_norm": 0.11452308297157288, + "learning_rate": 2.9013416923495507e-06, + "loss": 0.0013, + "step": 38806 + }, + { + "epoch": 0.77616, + "grad_norm": 0.006769176572561264, + "learning_rate": 2.900358321679567e-06, + "loss": 0.0008, + "step": 38808 + }, + { + "epoch": 0.7762, + "grad_norm": 0.05502234399318695, + "learning_rate": 2.8993750894207563e-06, + "loss": 0.0007, + "step": 38810 + }, + { + "epoch": 0.77624, + "grad_norm": 0.007466347888112068, + "learning_rate": 2.898391995592278e-06, + "loss": 0.0035, + "step": 38812 + }, + { + "epoch": 0.77628, + "grad_norm": 0.47268834710121155, + "learning_rate": 2.8974090402133093e-06, + "loss": 0.009, + "step": 38814 + }, + { + "epoch": 0.77632, + "grad_norm": 0.06596298515796661, + "learning_rate": 2.896426223303005e-06, + "loss": 0.002, + "step": 38816 + }, + { + "epoch": 0.77636, + "grad_norm": 0.03331892564892769, + "learning_rate": 2.8954435448805287e-06, + "loss": 0.0084, + "step": 38818 + }, + { + "epoch": 0.7764, + "grad_norm": 0.07803518325090408, + "learning_rate": 2.8944610049650377e-06, + "loss": 0.0011, + "step": 38820 + }, + { + "epoch": 0.77644, + "grad_norm": 0.1266164779663086, + "learning_rate": 2.893478603575689e-06, + "loss": 0.0015, + "step": 38822 + }, + { + "epoch": 0.77648, + "grad_norm": 0.3874717652797699, + "learning_rate": 2.8924963407316354e-06, + "loss": 0.0035, + "step": 38824 + }, + { + "epoch": 0.77652, + "grad_norm": 0.006854180246591568, + "learning_rate": 2.8915142164520226e-06, + "loss": 0.0189, + "step": 38826 + }, + { + "epoch": 0.77656, + "grad_norm": 0.003731965087354183, + "learning_rate": 2.8905322307559993e-06, + "loss": 0.0004, + "step": 38828 + }, + { + "epoch": 0.7766, + "grad_norm": 0.08729197084903717, + "learning_rate": 2.8895503836627105e-06, + "loss": 0.0013, + "step": 38830 + }, + { + "epoch": 0.77664, + "grad_norm": 0.06599102914333344, + "learning_rate": 2.8885686751913013e-06, + "loss": 0.088, + "step": 38832 + }, + { + "epoch": 0.77668, + "grad_norm": 0.0071319579146802425, + "learning_rate": 2.8875871053608994e-06, + "loss": 0.0006, + "step": 38834 + }, + { + "epoch": 0.77672, + "grad_norm": 0.5362113118171692, + "learning_rate": 2.8866056741906558e-06, + "loss": 0.0071, + "step": 38836 + }, + { + "epoch": 0.77676, + "grad_norm": 0.03336801007390022, + "learning_rate": 2.8856243816996943e-06, + "loss": 0.0005, + "step": 38838 + }, + { + "epoch": 0.7768, + "grad_norm": 0.19355961680412292, + "learning_rate": 2.884643227907147e-06, + "loss": 0.0026, + "step": 38840 + }, + { + "epoch": 0.77684, + "grad_norm": 0.2102281153202057, + "learning_rate": 2.883662212832147e-06, + "loss": 0.0017, + "step": 38842 + }, + { + "epoch": 0.77688, + "grad_norm": 0.21954567730426788, + "learning_rate": 2.8826813364938088e-06, + "loss": 0.0021, + "step": 38844 + }, + { + "epoch": 0.77692, + "grad_norm": 0.0014314060099422932, + "learning_rate": 2.88170059891127e-06, + "loss": 0.0003, + "step": 38846 + }, + { + "epoch": 0.77696, + "grad_norm": 0.15874609351158142, + "learning_rate": 2.8807200001036382e-06, + "loss": 0.0038, + "step": 38848 + }, + { + "epoch": 0.777, + "grad_norm": 0.026737332344055176, + "learning_rate": 2.8797395400900362e-06, + "loss": 0.012, + "step": 38850 + }, + { + "epoch": 0.77704, + "grad_norm": 0.03165046498179436, + "learning_rate": 2.8787592188895772e-06, + "loss": 0.0107, + "step": 38852 + }, + { + "epoch": 0.77708, + "grad_norm": 0.0052760569378733635, + "learning_rate": 2.8777790365213733e-06, + "loss": 0.0082, + "step": 38854 + }, + { + "epoch": 0.77712, + "grad_norm": 2.4360299110412598, + "learning_rate": 2.876798993004537e-06, + "loss": 0.0426, + "step": 38856 + }, + { + "epoch": 0.77716, + "grad_norm": 0.016012871637940407, + "learning_rate": 2.8758190883581694e-06, + "loss": 0.0001, + "step": 38858 + }, + { + "epoch": 0.7772, + "grad_norm": 0.0019779836293309927, + "learning_rate": 2.874839322601375e-06, + "loss": 0.0002, + "step": 38860 + }, + { + "epoch": 0.77724, + "grad_norm": 5.052751541137695, + "learning_rate": 2.8738596957532572e-06, + "loss": 0.0652, + "step": 38862 + }, + { + "epoch": 0.77728, + "grad_norm": 6.158420085906982, + "learning_rate": 2.8728802078329133e-06, + "loss": 0.0877, + "step": 38864 + }, + { + "epoch": 0.77732, + "grad_norm": 0.024868350476026535, + "learning_rate": 2.871900858859439e-06, + "loss": 0.0016, + "step": 38866 + }, + { + "epoch": 0.77736, + "grad_norm": 0.01631864346563816, + "learning_rate": 2.8709216488519297e-06, + "loss": 0.0066, + "step": 38868 + }, + { + "epoch": 0.7774, + "grad_norm": 0.004910769406706095, + "learning_rate": 2.869942577829471e-06, + "loss": 0.0015, + "step": 38870 + }, + { + "epoch": 0.77744, + "grad_norm": 19.181659698486328, + "learning_rate": 2.8689636458111523e-06, + "loss": 0.63, + "step": 38872 + }, + { + "epoch": 0.77748, + "grad_norm": 0.03028940036892891, + "learning_rate": 2.8679848528160616e-06, + "loss": 0.0006, + "step": 38874 + }, + { + "epoch": 0.77752, + "grad_norm": 0.005017402116209269, + "learning_rate": 2.8670061988632715e-06, + "loss": 0.0057, + "step": 38876 + }, + { + "epoch": 0.77756, + "grad_norm": 0.14452534914016724, + "learning_rate": 2.866027683971875e-06, + "loss": 0.0029, + "step": 38878 + }, + { + "epoch": 0.7776, + "grad_norm": 20.875904083251953, + "learning_rate": 2.8650493081609344e-06, + "loss": 0.3967, + "step": 38880 + }, + { + "epoch": 0.77764, + "grad_norm": 0.01637001521885395, + "learning_rate": 2.8640710714495378e-06, + "loss": 0.0074, + "step": 38882 + }, + { + "epoch": 0.77768, + "grad_norm": 0.09502430260181427, + "learning_rate": 2.863092973856746e-06, + "loss": 0.0019, + "step": 38884 + }, + { + "epoch": 0.77772, + "grad_norm": 17.137170791625977, + "learning_rate": 2.862115015401631e-06, + "loss": 0.3736, + "step": 38886 + }, + { + "epoch": 0.77776, + "grad_norm": 0.12989096343517303, + "learning_rate": 2.8611371961032587e-06, + "loss": 0.0108, + "step": 38888 + }, + { + "epoch": 0.7778, + "grad_norm": 0.04719027504324913, + "learning_rate": 2.860159515980695e-06, + "loss": 0.001, + "step": 38890 + }, + { + "epoch": 0.77784, + "grad_norm": 0.23227089643478394, + "learning_rate": 2.8591819750529946e-06, + "loss": 0.003, + "step": 38892 + }, + { + "epoch": 0.77788, + "grad_norm": 0.07608193159103394, + "learning_rate": 2.858204573339217e-06, + "loss": 0.0218, + "step": 38894 + }, + { + "epoch": 0.77792, + "grad_norm": 1.1562819480895996, + "learning_rate": 2.8572273108584193e-06, + "loss": 0.0224, + "step": 38896 + }, + { + "epoch": 0.77796, + "grad_norm": 0.0021160575561225414, + "learning_rate": 2.8562501876296524e-06, + "loss": 0.0021, + "step": 38898 + }, + { + "epoch": 0.778, + "grad_norm": 0.00497770681977272, + "learning_rate": 2.855273203671969e-06, + "loss": 0.0004, + "step": 38900 + }, + { + "epoch": 0.77804, + "grad_norm": 0.012961115688085556, + "learning_rate": 2.854296359004406e-06, + "loss": 0.0073, + "step": 38902 + }, + { + "epoch": 0.77808, + "grad_norm": 0.0065575288608670235, + "learning_rate": 2.8533196536460205e-06, + "loss": 0.0001, + "step": 38904 + }, + { + "epoch": 0.77812, + "grad_norm": 0.07669331878423691, + "learning_rate": 2.8523430876158454e-06, + "loss": 0.0017, + "step": 38906 + }, + { + "epoch": 0.77816, + "grad_norm": 0.014547708444297314, + "learning_rate": 2.8513666609329218e-06, + "loss": 0.0004, + "step": 38908 + }, + { + "epoch": 0.7782, + "grad_norm": 0.06669726967811584, + "learning_rate": 2.8503903736162876e-06, + "loss": 0.0007, + "step": 38910 + }, + { + "epoch": 0.77824, + "grad_norm": 0.004590108525007963, + "learning_rate": 2.8494142256849676e-06, + "loss": 0.0003, + "step": 38912 + }, + { + "epoch": 0.77828, + "grad_norm": 0.020014286041259766, + "learning_rate": 2.848438217158006e-06, + "loss": 0.0008, + "step": 38914 + }, + { + "epoch": 0.77832, + "grad_norm": 4.747763156890869, + "learning_rate": 2.8474623480544194e-06, + "loss": 0.0433, + "step": 38916 + }, + { + "epoch": 0.77836, + "grad_norm": 0.08814319223165512, + "learning_rate": 2.8464866183932373e-06, + "loss": 0.0018, + "step": 38918 + }, + { + "epoch": 0.7784, + "grad_norm": 0.3649187982082367, + "learning_rate": 2.8455110281934804e-06, + "loss": 0.0039, + "step": 38920 + }, + { + "epoch": 0.77844, + "grad_norm": 0.14988790452480316, + "learning_rate": 2.8445355774741735e-06, + "loss": 0.0015, + "step": 38922 + }, + { + "epoch": 0.77848, + "grad_norm": 0.3234986960887909, + "learning_rate": 2.843560266254326e-06, + "loss": 0.0029, + "step": 38924 + }, + { + "epoch": 0.77852, + "grad_norm": 1.9553855657577515, + "learning_rate": 2.8425850945529554e-06, + "loss": 0.0239, + "step": 38926 + }, + { + "epoch": 0.77856, + "grad_norm": 1.4843337535858154, + "learning_rate": 2.841610062389073e-06, + "loss": 0.0194, + "step": 38928 + }, + { + "epoch": 0.7786, + "grad_norm": 0.08885427564382553, + "learning_rate": 2.840635169781688e-06, + "loss": 0.002, + "step": 38930 + }, + { + "epoch": 0.77864, + "grad_norm": 0.013465563766658306, + "learning_rate": 2.8396604167498097e-06, + "loss": 0.0015, + "step": 38932 + }, + { + "epoch": 0.77868, + "grad_norm": 0.02533332072198391, + "learning_rate": 2.8386858033124322e-06, + "loss": 0.0003, + "step": 38934 + }, + { + "epoch": 0.77872, + "grad_norm": 0.35531967878341675, + "learning_rate": 2.837711329488567e-06, + "loss": 0.0411, + "step": 38936 + }, + { + "epoch": 0.77876, + "grad_norm": 0.1882704794406891, + "learning_rate": 2.8367369952972047e-06, + "loss": 0.0016, + "step": 38938 + }, + { + "epoch": 0.7788, + "grad_norm": 0.21426482498645782, + "learning_rate": 2.8357628007573412e-06, + "loss": 0.0028, + "step": 38940 + }, + { + "epoch": 0.77884, + "grad_norm": 0.018842389807105064, + "learning_rate": 2.834788745887974e-06, + "loss": 0.0006, + "step": 38942 + }, + { + "epoch": 0.77888, + "grad_norm": 0.022659923881292343, + "learning_rate": 2.8338148307080836e-06, + "loss": 0.0006, + "step": 38944 + }, + { + "epoch": 0.77892, + "grad_norm": 0.046457599848508835, + "learning_rate": 2.832841055236668e-06, + "loss": 0.0013, + "step": 38946 + }, + { + "epoch": 0.77896, + "grad_norm": 0.411052942276001, + "learning_rate": 2.831867419492703e-06, + "loss": 0.0045, + "step": 38948 + }, + { + "epoch": 0.779, + "grad_norm": 0.4686237871646881, + "learning_rate": 2.830893923495173e-06, + "loss": 0.0068, + "step": 38950 + }, + { + "epoch": 0.77904, + "grad_norm": 0.030923165380954742, + "learning_rate": 2.829920567263057e-06, + "loss": 0.0005, + "step": 38952 + }, + { + "epoch": 0.77908, + "grad_norm": 1.3526691198349, + "learning_rate": 2.828947350815334e-06, + "loss": 0.0153, + "step": 38954 + }, + { + "epoch": 0.77912, + "grad_norm": 0.04773758351802826, + "learning_rate": 2.827974274170968e-06, + "loss": 0.0026, + "step": 38956 + }, + { + "epoch": 0.77916, + "grad_norm": 0.171071395277977, + "learning_rate": 2.8270013373489423e-06, + "loss": 0.0026, + "step": 38958 + }, + { + "epoch": 0.7792, + "grad_norm": 0.19420550763607025, + "learning_rate": 2.8260285403682153e-06, + "loss": 0.0019, + "step": 38960 + }, + { + "epoch": 0.77924, + "grad_norm": 0.0994194969534874, + "learning_rate": 2.8250558832477546e-06, + "loss": 0.0023, + "step": 38962 + }, + { + "epoch": 0.77928, + "grad_norm": 0.05704222619533539, + "learning_rate": 2.8240833660065283e-06, + "loss": 0.0005, + "step": 38964 + }, + { + "epoch": 0.77932, + "grad_norm": 0.23639225959777832, + "learning_rate": 2.823110988663483e-06, + "loss": 0.0071, + "step": 38966 + }, + { + "epoch": 0.77936, + "grad_norm": 0.01977512799203396, + "learning_rate": 2.8221387512375907e-06, + "loss": 0.0018, + "step": 38968 + }, + { + "epoch": 0.7794, + "grad_norm": 0.05231205374002457, + "learning_rate": 2.821166653747793e-06, + "loss": 0.0425, + "step": 38970 + }, + { + "epoch": 0.77944, + "grad_norm": 0.0607023760676384, + "learning_rate": 2.8201946962130533e-06, + "loss": 0.0008, + "step": 38972 + }, + { + "epoch": 0.77948, + "grad_norm": 0.0035932695027440786, + "learning_rate": 2.8192228786523113e-06, + "loss": 0.0082, + "step": 38974 + }, + { + "epoch": 0.77952, + "grad_norm": 0.19206243753433228, + "learning_rate": 2.8182512010845166e-06, + "loss": 0.0059, + "step": 38976 + }, + { + "epoch": 0.77956, + "grad_norm": 0.0003795618540607393, + "learning_rate": 2.8172796635286148e-06, + "loss": 0.0007, + "step": 38978 + }, + { + "epoch": 0.7796, + "grad_norm": 0.1992635577917099, + "learning_rate": 2.816308266003541e-06, + "loss": 0.0071, + "step": 38980 + }, + { + "epoch": 0.77964, + "grad_norm": 0.0005003287806175649, + "learning_rate": 2.815337008528236e-06, + "loss": 0.0005, + "step": 38982 + }, + { + "epoch": 0.77968, + "grad_norm": 0.40595439076423645, + "learning_rate": 2.814365891121634e-06, + "loss": 0.0055, + "step": 38984 + }, + { + "epoch": 0.77972, + "grad_norm": 0.13612666726112366, + "learning_rate": 2.8133949138026693e-06, + "loss": 0.0014, + "step": 38986 + }, + { + "epoch": 0.77976, + "grad_norm": 0.012007579207420349, + "learning_rate": 2.812424076590271e-06, + "loss": 0.0001, + "step": 38988 + }, + { + "epoch": 0.7798, + "grad_norm": 0.18807849287986755, + "learning_rate": 2.8114533795033685e-06, + "loss": 0.0023, + "step": 38990 + }, + { + "epoch": 0.77984, + "grad_norm": 0.1263541430234909, + "learning_rate": 2.810482822560879e-06, + "loss": 0.0009, + "step": 38992 + }, + { + "epoch": 0.77988, + "grad_norm": 0.3791816234588623, + "learning_rate": 2.80951240578173e-06, + "loss": 0.0048, + "step": 38994 + }, + { + "epoch": 0.77992, + "grad_norm": 0.2530404031276703, + "learning_rate": 2.8085421291848414e-06, + "loss": 0.0032, + "step": 38996 + }, + { + "epoch": 0.77996, + "grad_norm": 0.1213623508810997, + "learning_rate": 2.8075719927891197e-06, + "loss": 0.0017, + "step": 38998 + }, + { + "epoch": 0.78, + "grad_norm": 0.3028661906719208, + "learning_rate": 2.8066019966134907e-06, + "loss": 0.0031, + "step": 39000 + }, + { + "epoch": 0.78004, + "grad_norm": 0.20016008615493774, + "learning_rate": 2.805632140676854e-06, + "loss": 0.0071, + "step": 39002 + }, + { + "epoch": 0.78008, + "grad_norm": 0.07908754050731659, + "learning_rate": 2.804662424998128e-06, + "loss": 0.0008, + "step": 39004 + }, + { + "epoch": 0.78012, + "grad_norm": 0.6159638166427612, + "learning_rate": 2.80369284959621e-06, + "loss": 0.0049, + "step": 39006 + }, + { + "epoch": 0.78016, + "grad_norm": 0.1567429155111313, + "learning_rate": 2.8027234144900038e-06, + "loss": 0.0025, + "step": 39008 + }, + { + "epoch": 0.7802, + "grad_norm": 0.025780145078897476, + "learning_rate": 2.8017541196984144e-06, + "loss": 0.0012, + "step": 39010 + }, + { + "epoch": 0.78024, + "grad_norm": 0.26150670647621155, + "learning_rate": 2.8007849652403307e-06, + "loss": 0.0031, + "step": 39012 + }, + { + "epoch": 0.78028, + "grad_norm": 0.07056489586830139, + "learning_rate": 2.79981595113465e-06, + "loss": 0.001, + "step": 39014 + }, + { + "epoch": 0.78032, + "grad_norm": 0.056818753480911255, + "learning_rate": 2.798847077400265e-06, + "loss": 0.0014, + "step": 39016 + }, + { + "epoch": 0.78036, + "grad_norm": 0.04888211563229561, + "learning_rate": 2.797878344056063e-06, + "loss": 0.0005, + "step": 39018 + }, + { + "epoch": 0.7804, + "grad_norm": 1.3581510782241821, + "learning_rate": 2.796909751120931e-06, + "loss": 0.0138, + "step": 39020 + }, + { + "epoch": 0.78044, + "grad_norm": 0.0465787909924984, + "learning_rate": 2.7959412986137544e-06, + "loss": 0.0055, + "step": 39022 + }, + { + "epoch": 0.78048, + "grad_norm": 0.011969005689024925, + "learning_rate": 2.794972986553406e-06, + "loss": 0.0007, + "step": 39024 + }, + { + "epoch": 0.78052, + "grad_norm": 0.01163554284721613, + "learning_rate": 2.794004814958774e-06, + "loss": 0.0005, + "step": 39026 + }, + { + "epoch": 0.78056, + "grad_norm": 0.05967660993337631, + "learning_rate": 2.793036783848725e-06, + "loss": 0.0063, + "step": 39028 + }, + { + "epoch": 0.7806, + "grad_norm": 0.11474435776472092, + "learning_rate": 2.7920688932421337e-06, + "loss": 0.0008, + "step": 39030 + }, + { + "epoch": 0.78064, + "grad_norm": 3.79166841506958, + "learning_rate": 2.7911011431578737e-06, + "loss": 0.0303, + "step": 39032 + }, + { + "epoch": 0.78068, + "grad_norm": 0.004252232611179352, + "learning_rate": 2.790133533614803e-06, + "loss": 0.0168, + "step": 39034 + }, + { + "epoch": 0.78072, + "grad_norm": 0.4866752028465271, + "learning_rate": 2.7891660646317964e-06, + "loss": 0.0068, + "step": 39036 + }, + { + "epoch": 0.78076, + "grad_norm": 0.22044123709201813, + "learning_rate": 2.7881987362277065e-06, + "loss": 0.0026, + "step": 39038 + }, + { + "epoch": 0.7808, + "grad_norm": 0.018185056746006012, + "learning_rate": 2.7872315484213954e-06, + "loss": 0.0003, + "step": 39040 + }, + { + "epoch": 0.78084, + "grad_norm": 0.021658092737197876, + "learning_rate": 2.786264501231719e-06, + "loss": 0.0006, + "step": 39042 + }, + { + "epoch": 0.78088, + "grad_norm": 0.4326307475566864, + "learning_rate": 2.785297594677533e-06, + "loss": 0.1664, + "step": 39044 + }, + { + "epoch": 0.78092, + "grad_norm": 0.12979745864868164, + "learning_rate": 2.7843308287776806e-06, + "loss": 0.0017, + "step": 39046 + }, + { + "epoch": 0.78096, + "grad_norm": 0.0721609890460968, + "learning_rate": 2.7833642035510146e-06, + "loss": 0.0013, + "step": 39048 + }, + { + "epoch": 0.781, + "grad_norm": 0.22166629135608673, + "learning_rate": 2.7823977190163788e-06, + "loss": 0.0031, + "step": 39050 + }, + { + "epoch": 0.78104, + "grad_norm": 0.15643392503261566, + "learning_rate": 2.7814313751926146e-06, + "loss": 0.0022, + "step": 39052 + }, + { + "epoch": 0.78108, + "grad_norm": 0.0033675748854875565, + "learning_rate": 2.7804651720985664e-06, + "loss": 0.0003, + "step": 39054 + }, + { + "epoch": 0.78112, + "grad_norm": 0.17063689231872559, + "learning_rate": 2.77949910975306e-06, + "loss": 0.0024, + "step": 39056 + }, + { + "epoch": 0.78116, + "grad_norm": 1.587612509727478, + "learning_rate": 2.778533188174941e-06, + "loss": 0.0171, + "step": 39058 + }, + { + "epoch": 0.7812, + "grad_norm": 0.23417146503925323, + "learning_rate": 2.7775674073830337e-06, + "loss": 0.0031, + "step": 39060 + }, + { + "epoch": 0.78124, + "grad_norm": 0.021396271884441376, + "learning_rate": 2.7766017673961674e-06, + "loss": 0.5084, + "step": 39062 + }, + { + "epoch": 0.78128, + "grad_norm": 0.061715833842754364, + "learning_rate": 2.7756362682331717e-06, + "loss": 0.0048, + "step": 39064 + }, + { + "epoch": 0.78132, + "grad_norm": 0.11283606290817261, + "learning_rate": 2.77467090991286e-06, + "loss": 0.0016, + "step": 39066 + }, + { + "epoch": 0.78136, + "grad_norm": 0.0488479882478714, + "learning_rate": 2.7737056924540653e-06, + "loss": 0.0008, + "step": 39068 + }, + { + "epoch": 0.7814, + "grad_norm": 14.071322441101074, + "learning_rate": 2.7727406158755943e-06, + "loss": 0.1764, + "step": 39070 + }, + { + "epoch": 0.78144, + "grad_norm": 0.1792541742324829, + "learning_rate": 2.771775680196267e-06, + "loss": 0.0028, + "step": 39072 + }, + { + "epoch": 0.78148, + "grad_norm": 3.663688898086548, + "learning_rate": 2.7708108854348936e-06, + "loss": 0.0403, + "step": 39074 + }, + { + "epoch": 0.78152, + "grad_norm": 0.3133929371833801, + "learning_rate": 2.7698462316102837e-06, + "loss": 0.0061, + "step": 39076 + }, + { + "epoch": 0.78156, + "grad_norm": 0.09102638810873032, + "learning_rate": 2.7688817187412477e-06, + "loss": 0.0011, + "step": 39078 + }, + { + "epoch": 0.7816, + "grad_norm": 0.01904316060245037, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.0003, + "step": 39080 + }, + { + "epoch": 0.78164, + "grad_norm": 0.04029123857617378, + "learning_rate": 2.7669531159450904e-06, + "loss": 0.0023, + "step": 39082 + }, + { + "epoch": 0.78168, + "grad_norm": 0.18216487765312195, + "learning_rate": 2.7659890260555722e-06, + "loss": 0.002, + "step": 39084 + }, + { + "epoch": 0.78172, + "grad_norm": 0.10216310620307922, + "learning_rate": 2.7650250771968246e-06, + "loss": 0.0014, + "step": 39086 + }, + { + "epoch": 0.78176, + "grad_norm": 0.05265998840332031, + "learning_rate": 2.7640612693876323e-06, + "loss": 0.0014, + "step": 39088 + }, + { + "epoch": 0.7818, + "grad_norm": 0.1780884712934494, + "learning_rate": 2.763097602646797e-06, + "loss": 0.0028, + "step": 39090 + }, + { + "epoch": 0.78184, + "grad_norm": 2.1495420932769775, + "learning_rate": 2.7621340769930938e-06, + "loss": 0.025, + "step": 39092 + }, + { + "epoch": 0.78188, + "grad_norm": 0.9427950978279114, + "learning_rate": 2.76117069244532e-06, + "loss": 0.0095, + "step": 39094 + }, + { + "epoch": 0.78192, + "grad_norm": 0.015348291955888271, + "learning_rate": 2.7602074490222466e-06, + "loss": 0.0244, + "step": 39096 + }, + { + "epoch": 0.78196, + "grad_norm": 0.03586435690522194, + "learning_rate": 2.7592443467426555e-06, + "loss": 0.0068, + "step": 39098 + }, + { + "epoch": 0.782, + "grad_norm": 0.22310812771320343, + "learning_rate": 2.7582813856253276e-06, + "loss": 0.0033, + "step": 39100 + }, + { + "epoch": 0.78204, + "grad_norm": 0.5792775750160217, + "learning_rate": 2.757318565689029e-06, + "loss": 0.008, + "step": 39102 + }, + { + "epoch": 0.78208, + "grad_norm": 0.015612471848726273, + "learning_rate": 2.7563558869525333e-06, + "loss": 0.0003, + "step": 39104 + }, + { + "epoch": 0.78212, + "grad_norm": 0.0732794925570488, + "learning_rate": 2.75539334943461e-06, + "loss": 0.0089, + "step": 39106 + }, + { + "epoch": 0.78216, + "grad_norm": 0.7557937502861023, + "learning_rate": 2.7544309531540213e-06, + "loss": 0.0094, + "step": 39108 + }, + { + "epoch": 0.7822, + "grad_norm": 0.03657982125878334, + "learning_rate": 2.7534686981295335e-06, + "loss": 0.0003, + "step": 39110 + }, + { + "epoch": 0.78224, + "grad_norm": 0.49926823377609253, + "learning_rate": 2.7525065843799058e-06, + "loss": 0.0043, + "step": 39112 + }, + { + "epoch": 0.78228, + "grad_norm": 0.07027894258499146, + "learning_rate": 2.751544611923891e-06, + "loss": 0.0011, + "step": 39114 + }, + { + "epoch": 0.78232, + "grad_norm": 9.291159629821777, + "learning_rate": 2.7505827807802454e-06, + "loss": 0.2513, + "step": 39116 + }, + { + "epoch": 0.78236, + "grad_norm": 0.4628932774066925, + "learning_rate": 2.7496210909677245e-06, + "loss": 0.0087, + "step": 39118 + }, + { + "epoch": 0.7824, + "grad_norm": 0.36588603258132935, + "learning_rate": 2.7486595425050667e-06, + "loss": 0.004, + "step": 39120 + }, + { + "epoch": 0.78244, + "grad_norm": 0.024388212710618973, + "learning_rate": 2.74769813541103e-06, + "loss": 0.0005, + "step": 39122 + }, + { + "epoch": 0.78248, + "grad_norm": 0.3766483664512634, + "learning_rate": 2.746736869704346e-06, + "loss": 0.0054, + "step": 39124 + }, + { + "epoch": 0.78252, + "grad_norm": 0.435893177986145, + "learning_rate": 2.745775745403768e-06, + "loss": 0.0052, + "step": 39126 + }, + { + "epoch": 0.78256, + "grad_norm": 0.13481605052947998, + "learning_rate": 2.744814762528022e-06, + "loss": 0.0025, + "step": 39128 + }, + { + "epoch": 0.7826, + "grad_norm": 0.017551617696881294, + "learning_rate": 2.7438539210958483e-06, + "loss": 0.0134, + "step": 39130 + }, + { + "epoch": 0.78264, + "grad_norm": 0.053209561854600906, + "learning_rate": 2.7428932211259816e-06, + "loss": 0.0019, + "step": 39132 + }, + { + "epoch": 0.78268, + "grad_norm": 0.016122732311487198, + "learning_rate": 2.7419326626371446e-06, + "loss": 0.1379, + "step": 39134 + }, + { + "epoch": 0.78272, + "grad_norm": 0.004326315131038427, + "learning_rate": 2.740972245648066e-06, + "loss": 0.0012, + "step": 39136 + }, + { + "epoch": 0.78276, + "grad_norm": 0.02765718102455139, + "learning_rate": 2.740011970177472e-06, + "loss": 0.0014, + "step": 39138 + }, + { + "epoch": 0.7828, + "grad_norm": 0.006664380431175232, + "learning_rate": 2.739051836244081e-06, + "loss": 0.0001, + "step": 39140 + }, + { + "epoch": 0.78284, + "grad_norm": 0.770507276058197, + "learning_rate": 2.7380918438666136e-06, + "loss": 0.0796, + "step": 39142 + }, + { + "epoch": 0.78288, + "grad_norm": 0.5402851104736328, + "learning_rate": 2.7371319930637874e-06, + "loss": 0.0071, + "step": 39144 + }, + { + "epoch": 0.78292, + "grad_norm": 0.2130667120218277, + "learning_rate": 2.736172283854308e-06, + "loss": 0.0025, + "step": 39146 + }, + { + "epoch": 0.78296, + "grad_norm": 0.017654655501246452, + "learning_rate": 2.7352127162568907e-06, + "loss": 0.0065, + "step": 39148 + }, + { + "epoch": 0.783, + "grad_norm": 0.45121654868125916, + "learning_rate": 2.7342532902902418e-06, + "loss": 0.0055, + "step": 39150 + }, + { + "epoch": 0.78304, + "grad_norm": 0.15622295439243317, + "learning_rate": 2.733294005973065e-06, + "loss": 0.0018, + "step": 39152 + }, + { + "epoch": 0.78308, + "grad_norm": 0.13220354914665222, + "learning_rate": 2.7323348633240664e-06, + "loss": 0.0011, + "step": 39154 + }, + { + "epoch": 0.78312, + "grad_norm": 0.008399520069360733, + "learning_rate": 2.7313758623619348e-06, + "loss": 0.0012, + "step": 39156 + }, + { + "epoch": 0.78316, + "grad_norm": 1.9920111894607544, + "learning_rate": 2.730417003105379e-06, + "loss": 0.0303, + "step": 39158 + }, + { + "epoch": 0.7832, + "grad_norm": 2.6911799907684326, + "learning_rate": 2.7294582855730835e-06, + "loss": 0.0285, + "step": 39160 + }, + { + "epoch": 0.78324, + "grad_norm": 0.1425781100988388, + "learning_rate": 2.728499709783741e-06, + "loss": 0.0015, + "step": 39162 + }, + { + "epoch": 0.78328, + "grad_norm": 0.04768740013241768, + "learning_rate": 2.7275412757560418e-06, + "loss": 0.0008, + "step": 39164 + }, + { + "epoch": 0.78332, + "grad_norm": 0.03355240821838379, + "learning_rate": 2.7265829835086675e-06, + "loss": 0.0022, + "step": 39166 + }, + { + "epoch": 0.78336, + "grad_norm": 0.4388653635978699, + "learning_rate": 2.7256248330603075e-06, + "loss": 0.0089, + "step": 39168 + }, + { + "epoch": 0.7834, + "grad_norm": 0.001471688854508102, + "learning_rate": 2.7246668244296328e-06, + "loss": 0.0019, + "step": 39170 + }, + { + "epoch": 0.78344, + "grad_norm": 0.09446105360984802, + "learning_rate": 2.723708957635324e-06, + "loss": 0.0023, + "step": 39172 + }, + { + "epoch": 0.78348, + "grad_norm": 0.18683688342571259, + "learning_rate": 2.722751232696055e-06, + "loss": 0.0025, + "step": 39174 + }, + { + "epoch": 0.78352, + "grad_norm": 0.09881992638111115, + "learning_rate": 2.7217936496304996e-06, + "loss": 0.001, + "step": 39176 + }, + { + "epoch": 0.78356, + "grad_norm": 18.22791290283203, + "learning_rate": 2.7208362084573193e-06, + "loss": 0.2193, + "step": 39178 + }, + { + "epoch": 0.7836, + "grad_norm": 0.03749227523803711, + "learning_rate": 2.7198789091951903e-06, + "loss": 0.0012, + "step": 39180 + }, + { + "epoch": 0.78364, + "grad_norm": 0.07916378229856491, + "learning_rate": 2.7189217518627663e-06, + "loss": 0.0009, + "step": 39182 + }, + { + "epoch": 0.78368, + "grad_norm": 0.23798136413097382, + "learning_rate": 2.717964736478712e-06, + "loss": 0.0048, + "step": 39184 + }, + { + "epoch": 0.78372, + "grad_norm": 0.500457227230072, + "learning_rate": 2.717007863061687e-06, + "loss": 0.008, + "step": 39186 + }, + { + "epoch": 0.78376, + "grad_norm": 0.009156513027846813, + "learning_rate": 2.7160511316303374e-06, + "loss": 0.0001, + "step": 39188 + }, + { + "epoch": 0.7838, + "grad_norm": 0.2358282059431076, + "learning_rate": 2.715094542203327e-06, + "loss": 0.0026, + "step": 39190 + }, + { + "epoch": 0.78384, + "grad_norm": 0.08654425293207169, + "learning_rate": 2.7141380947992966e-06, + "loss": 0.0009, + "step": 39192 + }, + { + "epoch": 0.78388, + "grad_norm": 0.3774561285972595, + "learning_rate": 2.713181789436894e-06, + "loss": 0.0039, + "step": 39194 + }, + { + "epoch": 0.78392, + "grad_norm": 0.18953707814216614, + "learning_rate": 2.7122256261347647e-06, + "loss": 0.0019, + "step": 39196 + }, + { + "epoch": 0.78396, + "grad_norm": 1.2650623321533203, + "learning_rate": 2.7112696049115485e-06, + "loss": 0.0142, + "step": 39198 + }, + { + "epoch": 0.784, + "grad_norm": 0.06315165758132935, + "learning_rate": 2.7103137257858867e-06, + "loss": 0.1162, + "step": 39200 + }, + { + "epoch": 0.78404, + "grad_norm": 0.0014731265837326646, + "learning_rate": 2.709357988776409e-06, + "loss": 0.0019, + "step": 39202 + }, + { + "epoch": 0.78408, + "grad_norm": 0.9779066443443298, + "learning_rate": 2.708402393901751e-06, + "loss": 0.0129, + "step": 39204 + }, + { + "epoch": 0.78412, + "grad_norm": 0.04114940017461777, + "learning_rate": 2.7074469411805417e-06, + "loss": 0.0032, + "step": 39206 + }, + { + "epoch": 0.78416, + "grad_norm": 0.022080816328525543, + "learning_rate": 2.7064916306314124e-06, + "loss": 0.0004, + "step": 39208 + }, + { + "epoch": 0.7842, + "grad_norm": 0.007258220110088587, + "learning_rate": 2.7055364622729772e-06, + "loss": 0.1625, + "step": 39210 + }, + { + "epoch": 0.78424, + "grad_norm": 7.373882293701172, + "learning_rate": 2.70458143612387e-06, + "loss": 0.1412, + "step": 39212 + }, + { + "epoch": 0.78428, + "grad_norm": 0.18716566264629364, + "learning_rate": 2.7036265522027018e-06, + "loss": 0.0023, + "step": 39214 + }, + { + "epoch": 0.78432, + "grad_norm": 0.006757650058716536, + "learning_rate": 2.702671810528089e-06, + "loss": 0.1759, + "step": 39216 + }, + { + "epoch": 0.78436, + "grad_norm": 0.0022618495859205723, + "learning_rate": 2.701717211118646e-06, + "loss": 0.0019, + "step": 39218 + }, + { + "epoch": 0.7844, + "grad_norm": 0.47541525959968567, + "learning_rate": 2.7007627539929847e-06, + "loss": 0.0063, + "step": 39220 + }, + { + "epoch": 0.78444, + "grad_norm": 0.0516134649515152, + "learning_rate": 2.6998084391697143e-06, + "loss": 0.0009, + "step": 39222 + }, + { + "epoch": 0.78448, + "grad_norm": 0.11757609993219376, + "learning_rate": 2.6988542666674323e-06, + "loss": 0.0019, + "step": 39224 + }, + { + "epoch": 0.78452, + "grad_norm": 0.0018529180670157075, + "learning_rate": 2.697900236504747e-06, + "loss": 0.2344, + "step": 39226 + }, + { + "epoch": 0.78456, + "grad_norm": 0.21396614611148834, + "learning_rate": 2.6969463487002544e-06, + "loss": 0.0033, + "step": 39228 + }, + { + "epoch": 0.7846, + "grad_norm": 0.02530461736023426, + "learning_rate": 2.6959926032725537e-06, + "loss": 0.0006, + "step": 39230 + }, + { + "epoch": 0.78464, + "grad_norm": 0.1107836663722992, + "learning_rate": 2.6950390002402372e-06, + "loss": 0.002, + "step": 39232 + }, + { + "epoch": 0.78468, + "grad_norm": 0.00045220073661766946, + "learning_rate": 2.694085539621899e-06, + "loss": 0.0027, + "step": 39234 + }, + { + "epoch": 0.78472, + "grad_norm": 0.035989560186862946, + "learning_rate": 2.693132221436122e-06, + "loss": 0.0026, + "step": 39236 + }, + { + "epoch": 0.78476, + "grad_norm": 0.054080430418252945, + "learning_rate": 2.692179045701494e-06, + "loss": 0.0086, + "step": 39238 + }, + { + "epoch": 0.7848, + "grad_norm": 0.01593686453998089, + "learning_rate": 2.6912260124366007e-06, + "loss": 0.0025, + "step": 39240 + }, + { + "epoch": 0.78484, + "grad_norm": 0.01411640364676714, + "learning_rate": 2.6902731216600133e-06, + "loss": 0.0023, + "step": 39242 + }, + { + "epoch": 0.78488, + "grad_norm": 0.5218486189842224, + "learning_rate": 2.6893203733903207e-06, + "loss": 0.0044, + "step": 39244 + }, + { + "epoch": 0.78492, + "grad_norm": 0.00011564776650629938, + "learning_rate": 2.688367767646085e-06, + "loss": 0.0001, + "step": 39246 + }, + { + "epoch": 0.78496, + "grad_norm": 0.017621001228690147, + "learning_rate": 2.6874153044458908e-06, + "loss": 0.0009, + "step": 39248 + }, + { + "epoch": 0.785, + "grad_norm": 0.07083931565284729, + "learning_rate": 2.6864629838082957e-06, + "loss": 0.0006, + "step": 39250 + }, + { + "epoch": 0.78504, + "grad_norm": 0.061931002885103226, + "learning_rate": 2.685510805751871e-06, + "loss": 0.0014, + "step": 39252 + }, + { + "epoch": 0.78508, + "grad_norm": 13.913490295410156, + "learning_rate": 2.6845587702951815e-06, + "loss": 0.2043, + "step": 39254 + }, + { + "epoch": 0.78512, + "grad_norm": 0.03185909986495972, + "learning_rate": 2.6836068774567815e-06, + "loss": 0.0004, + "step": 39256 + }, + { + "epoch": 0.78516, + "grad_norm": 7.033887231955305e-05, + "learning_rate": 2.6826551272552328e-06, + "loss": 0.0002, + "step": 39258 + }, + { + "epoch": 0.7852, + "grad_norm": 0.1913134753704071, + "learning_rate": 2.6817035197090892e-06, + "loss": 0.0232, + "step": 39260 + }, + { + "epoch": 0.78524, + "grad_norm": 0.04541141912341118, + "learning_rate": 2.680752054836904e-06, + "loss": 0.0014, + "step": 39262 + }, + { + "epoch": 0.78528, + "grad_norm": 0.4223106801509857, + "learning_rate": 2.679800732657225e-06, + "loss": 0.0233, + "step": 39264 + }, + { + "epoch": 0.78532, + "grad_norm": 0.8295596241950989, + "learning_rate": 2.6788495531886026e-06, + "loss": 0.0072, + "step": 39266 + }, + { + "epoch": 0.78536, + "grad_norm": 0.0758824348449707, + "learning_rate": 2.677898516449574e-06, + "loss": 0.0022, + "step": 39268 + }, + { + "epoch": 0.7854, + "grad_norm": 0.0010887698736041784, + "learning_rate": 2.676947622458683e-06, + "loss": 0.0009, + "step": 39270 + }, + { + "epoch": 0.78544, + "grad_norm": 0.004298834595829248, + "learning_rate": 2.67599687123447e-06, + "loss": 0.0024, + "step": 39272 + }, + { + "epoch": 0.78548, + "grad_norm": 0.32610705494880676, + "learning_rate": 2.675046262795468e-06, + "loss": 0.0099, + "step": 39274 + }, + { + "epoch": 0.78552, + "grad_norm": 0.6047691106796265, + "learning_rate": 2.6740957971602123e-06, + "loss": 0.0059, + "step": 39276 + }, + { + "epoch": 0.78556, + "grad_norm": 0.041448719799518585, + "learning_rate": 2.6731454743472254e-06, + "loss": 0.0038, + "step": 39278 + }, + { + "epoch": 0.7856, + "grad_norm": 0.6811713576316833, + "learning_rate": 2.672195294375045e-06, + "loss": 0.04, + "step": 39280 + }, + { + "epoch": 0.78564, + "grad_norm": 0.18893486261367798, + "learning_rate": 2.671245257262188e-06, + "loss": 0.0045, + "step": 39282 + }, + { + "epoch": 0.78568, + "grad_norm": 0.015927372500300407, + "learning_rate": 2.6702953630271765e-06, + "loss": 0.0005, + "step": 39284 + }, + { + "epoch": 0.78572, + "grad_norm": 0.20143339037895203, + "learning_rate": 2.669345611688532e-06, + "loss": 0.0063, + "step": 39286 + }, + { + "epoch": 0.78576, + "grad_norm": 0.015645701438188553, + "learning_rate": 2.6683960032647683e-06, + "loss": 0.0058, + "step": 39288 + }, + { + "epoch": 0.7858, + "grad_norm": 0.05618621036410332, + "learning_rate": 2.667446537774402e-06, + "loss": 0.0005, + "step": 39290 + }, + { + "epoch": 0.78584, + "grad_norm": 0.07716099172830582, + "learning_rate": 2.6664972152359368e-06, + "loss": 0.001, + "step": 39292 + }, + { + "epoch": 0.78588, + "grad_norm": 0.009525904431939125, + "learning_rate": 2.6655480356678842e-06, + "loss": 0.0791, + "step": 39294 + }, + { + "epoch": 0.78592, + "grad_norm": 0.26640841364860535, + "learning_rate": 2.664598999088749e-06, + "loss": 0.0034, + "step": 39296 + }, + { + "epoch": 0.78596, + "grad_norm": 0.003909673076122999, + "learning_rate": 2.663650105517036e-06, + "loss": 0.0, + "step": 39298 + }, + { + "epoch": 0.786, + "grad_norm": 0.17681995034217834, + "learning_rate": 2.6627013549712355e-06, + "loss": 0.006, + "step": 39300 + }, + { + "epoch": 0.78604, + "grad_norm": 0.22326600551605225, + "learning_rate": 2.6617527474698546e-06, + "loss": 0.0028, + "step": 39302 + }, + { + "epoch": 0.78608, + "grad_norm": 0.009195733815431595, + "learning_rate": 2.6608042830313796e-06, + "loss": 0.0268, + "step": 39304 + }, + { + "epoch": 0.78612, + "grad_norm": 0.028487777337431908, + "learning_rate": 2.6598559616743034e-06, + "loss": 0.0005, + "step": 39306 + }, + { + "epoch": 0.78616, + "grad_norm": 0.10770770907402039, + "learning_rate": 2.658907783417117e-06, + "loss": 0.001, + "step": 39308 + }, + { + "epoch": 0.7862, + "grad_norm": 0.040831055492162704, + "learning_rate": 2.6579597482782972e-06, + "loss": 0.0022, + "step": 39310 + }, + { + "epoch": 0.78624, + "grad_norm": 0.011215458624064922, + "learning_rate": 2.657011856276338e-06, + "loss": 0.0004, + "step": 39312 + }, + { + "epoch": 0.78628, + "grad_norm": 0.04567578807473183, + "learning_rate": 2.65606410742971e-06, + "loss": 0.0013, + "step": 39314 + }, + { + "epoch": 0.78632, + "grad_norm": 0.03189217299222946, + "learning_rate": 2.6551165017568924e-06, + "loss": 0.0041, + "step": 39316 + }, + { + "epoch": 0.78636, + "grad_norm": 0.06127486005425453, + "learning_rate": 2.654169039276361e-06, + "loss": 0.0012, + "step": 39318 + }, + { + "epoch": 0.7864, + "grad_norm": 0.21439659595489502, + "learning_rate": 2.6532217200065856e-06, + "loss": 0.0028, + "step": 39320 + }, + { + "epoch": 0.78644, + "grad_norm": 0.22628507018089294, + "learning_rate": 2.652274543966038e-06, + "loss": 0.1522, + "step": 39322 + }, + { + "epoch": 0.78648, + "grad_norm": 0.4081178307533264, + "learning_rate": 2.6513275111731783e-06, + "loss": 0.0063, + "step": 39324 + }, + { + "epoch": 0.78652, + "grad_norm": 0.1423233449459076, + "learning_rate": 2.6503806216464725e-06, + "loss": 0.0012, + "step": 39326 + }, + { + "epoch": 0.78656, + "grad_norm": 0.026620492339134216, + "learning_rate": 2.649433875404379e-06, + "loss": 0.0007, + "step": 39328 + }, + { + "epoch": 0.7866, + "grad_norm": 0.07911191135644913, + "learning_rate": 2.648487272465361e-06, + "loss": 0.0023, + "step": 39330 + }, + { + "epoch": 0.78664, + "grad_norm": 0.018147101625800133, + "learning_rate": 2.647540812847862e-06, + "loss": 0.0006, + "step": 39332 + }, + { + "epoch": 0.78668, + "grad_norm": 0.5411960482597351, + "learning_rate": 2.646594496570346e-06, + "loss": 0.0067, + "step": 39334 + }, + { + "epoch": 0.78672, + "grad_norm": 0.8425455093383789, + "learning_rate": 2.645648323651252e-06, + "loss": 0.2418, + "step": 39336 + }, + { + "epoch": 0.78676, + "grad_norm": 0.014903917908668518, + "learning_rate": 2.6447022941090317e-06, + "loss": 0.0013, + "step": 39338 + }, + { + "epoch": 0.7868, + "grad_norm": 2.1138997077941895, + "learning_rate": 2.643756407962127e-06, + "loss": 0.0225, + "step": 39340 + }, + { + "epoch": 0.78684, + "grad_norm": 0.21752475202083588, + "learning_rate": 2.6428106652289775e-06, + "loss": 0.0034, + "step": 39342 + }, + { + "epoch": 0.78688, + "grad_norm": 0.06435326486825943, + "learning_rate": 2.6418650659280253e-06, + "loss": 0.0008, + "step": 39344 + }, + { + "epoch": 0.78692, + "grad_norm": 0.3415642976760864, + "learning_rate": 2.640919610077698e-06, + "loss": 0.0038, + "step": 39346 + }, + { + "epoch": 0.78696, + "grad_norm": 0.04885084927082062, + "learning_rate": 2.6399742976964326e-06, + "loss": 0.0017, + "step": 39348 + }, + { + "epoch": 0.787, + "grad_norm": 0.11537729203701019, + "learning_rate": 2.639029128802657e-06, + "loss": 0.0027, + "step": 39350 + }, + { + "epoch": 0.78704, + "grad_norm": 0.34446585178375244, + "learning_rate": 2.6380841034147986e-06, + "loss": 0.0035, + "step": 39352 + }, + { + "epoch": 0.78708, + "grad_norm": 0.1006234884262085, + "learning_rate": 2.6371392215512804e-06, + "loss": 0.059, + "step": 39354 + }, + { + "epoch": 0.78712, + "grad_norm": 0.36259427666664124, + "learning_rate": 2.6361944832305275e-06, + "loss": 0.0043, + "step": 39356 + }, + { + "epoch": 0.78716, + "grad_norm": 0.6015061736106873, + "learning_rate": 2.63524988847095e-06, + "loss": 0.0053, + "step": 39358 + }, + { + "epoch": 0.7872, + "grad_norm": 0.059759415686130524, + "learning_rate": 2.634305437290968e-06, + "loss": 0.0009, + "step": 39360 + }, + { + "epoch": 0.78724, + "grad_norm": 0.02483806386590004, + "learning_rate": 2.633361129708998e-06, + "loss": 0.0005, + "step": 39362 + }, + { + "epoch": 0.78728, + "grad_norm": 21.311708450317383, + "learning_rate": 2.6324169657434374e-06, + "loss": 0.5893, + "step": 39364 + }, + { + "epoch": 0.78732, + "grad_norm": 0.031703364104032516, + "learning_rate": 2.6314729454127085e-06, + "loss": 0.0004, + "step": 39366 + }, + { + "epoch": 0.78736, + "grad_norm": 0.0437578447163105, + "learning_rate": 2.6305290687352013e-06, + "loss": 0.0009, + "step": 39368 + }, + { + "epoch": 0.7874, + "grad_norm": 0.010134037584066391, + "learning_rate": 2.62958533572933e-06, + "loss": 0.0008, + "step": 39370 + }, + { + "epoch": 0.78744, + "grad_norm": 4.619869232177734, + "learning_rate": 2.6286417464134852e-06, + "loss": 0.0475, + "step": 39372 + }, + { + "epoch": 0.78748, + "grad_norm": 0.04500530660152435, + "learning_rate": 2.627698300806063e-06, + "loss": 0.0014, + "step": 39374 + }, + { + "epoch": 0.78752, + "grad_norm": 2.1951422691345215, + "learning_rate": 2.6267549989254614e-06, + "loss": 0.0252, + "step": 39376 + }, + { + "epoch": 0.78756, + "grad_norm": 0.034841056913137436, + "learning_rate": 2.625811840790061e-06, + "loss": 0.0009, + "step": 39378 + }, + { + "epoch": 0.7876, + "grad_norm": 0.39022326469421387, + "learning_rate": 2.624868826418262e-06, + "loss": 0.0049, + "step": 39380 + }, + { + "epoch": 0.78764, + "grad_norm": 0.044489357620477676, + "learning_rate": 2.623925955828439e-06, + "loss": 0.0137, + "step": 39382 + }, + { + "epoch": 0.78768, + "grad_norm": 0.13557957112789154, + "learning_rate": 2.622983229038977e-06, + "loss": 0.0021, + "step": 39384 + }, + { + "epoch": 0.78772, + "grad_norm": 0.07376018911600113, + "learning_rate": 2.6220406460682547e-06, + "loss": 0.0012, + "step": 39386 + }, + { + "epoch": 0.78776, + "grad_norm": 0.033751193434000015, + "learning_rate": 2.621098206934651e-06, + "loss": 0.0004, + "step": 39388 + }, + { + "epoch": 0.7878, + "grad_norm": 0.04647928103804588, + "learning_rate": 2.6201559116565346e-06, + "loss": 0.0006, + "step": 39390 + }, + { + "epoch": 0.78784, + "grad_norm": 0.052647992968559265, + "learning_rate": 2.619213760252277e-06, + "loss": 0.0008, + "step": 39392 + }, + { + "epoch": 0.78788, + "grad_norm": 0.2738063633441925, + "learning_rate": 2.618271752740248e-06, + "loss": 0.0032, + "step": 39394 + }, + { + "epoch": 0.78792, + "grad_norm": 0.027441076934337616, + "learning_rate": 2.617329889138811e-06, + "loss": 0.0013, + "step": 39396 + }, + { + "epoch": 0.78796, + "grad_norm": 0.10812460631132126, + "learning_rate": 2.6163881694663317e-06, + "loss": 0.0045, + "step": 39398 + }, + { + "epoch": 0.788, + "grad_norm": 0.004418670199811459, + "learning_rate": 2.615446593741161e-06, + "loss": 0.0, + "step": 39400 + }, + { + "epoch": 0.78804, + "grad_norm": 0.8026946187019348, + "learning_rate": 2.6145051619816664e-06, + "loss": 0.0082, + "step": 39402 + }, + { + "epoch": 0.78808, + "grad_norm": 0.005881453398615122, + "learning_rate": 2.613563874206193e-06, + "loss": 0.0002, + "step": 39404 + }, + { + "epoch": 0.78812, + "grad_norm": 0.15254177153110504, + "learning_rate": 2.612622730433094e-06, + "loss": 0.0018, + "step": 39406 + }, + { + "epoch": 0.78816, + "grad_norm": 0.07831470668315887, + "learning_rate": 2.6116817306807218e-06, + "loss": 0.001, + "step": 39408 + }, + { + "epoch": 0.7882, + "grad_norm": 0.1315215826034546, + "learning_rate": 2.6107408749674125e-06, + "loss": 0.0014, + "step": 39410 + }, + { + "epoch": 0.78824, + "grad_norm": 0.1325589418411255, + "learning_rate": 2.609800163311519e-06, + "loss": 0.0014, + "step": 39412 + }, + { + "epoch": 0.78828, + "grad_norm": 0.15687035024166107, + "learning_rate": 2.6088595957313733e-06, + "loss": 0.0034, + "step": 39414 + }, + { + "epoch": 0.78832, + "grad_norm": 0.008012475445866585, + "learning_rate": 2.6079191722453157e-06, + "loss": 0.0002, + "step": 39416 + }, + { + "epoch": 0.78836, + "grad_norm": 0.04206167533993721, + "learning_rate": 2.606978892871679e-06, + "loss": 0.0007, + "step": 39418 + }, + { + "epoch": 0.7884, + "grad_norm": 0.14883816242218018, + "learning_rate": 2.6060387576287983e-06, + "loss": 0.0034, + "step": 39420 + }, + { + "epoch": 0.78844, + "grad_norm": 2.1332216262817383, + "learning_rate": 2.6050987665349926e-06, + "loss": 0.0154, + "step": 39422 + }, + { + "epoch": 0.78848, + "grad_norm": 0.004903148859739304, + "learning_rate": 2.6041589196085993e-06, + "loss": 0.0001, + "step": 39424 + }, + { + "epoch": 0.78852, + "grad_norm": 0.23019324243068695, + "learning_rate": 2.6032192168679315e-06, + "loss": 0.004, + "step": 39426 + }, + { + "epoch": 0.78856, + "grad_norm": 0.004748174455016851, + "learning_rate": 2.602279658331315e-06, + "loss": 0.0421, + "step": 39428 + }, + { + "epoch": 0.7886, + "grad_norm": 0.002187244361266494, + "learning_rate": 2.6013402440170676e-06, + "loss": 0.1377, + "step": 39430 + }, + { + "epoch": 0.78864, + "grad_norm": 0.011004212312400341, + "learning_rate": 2.6004009739434945e-06, + "loss": 0.0012, + "step": 39432 + }, + { + "epoch": 0.78868, + "grad_norm": 0.0353066511452198, + "learning_rate": 2.5994618481289213e-06, + "loss": 0.0013, + "step": 39434 + }, + { + "epoch": 0.78872, + "grad_norm": 0.17417502403259277, + "learning_rate": 2.5985228665916452e-06, + "loss": 0.0085, + "step": 39436 + }, + { + "epoch": 0.78876, + "grad_norm": 0.0126308249309659, + "learning_rate": 2.5975840293499765e-06, + "loss": 0.176, + "step": 39438 + }, + { + "epoch": 0.7888, + "grad_norm": 0.34158048033714294, + "learning_rate": 2.596645336422219e-06, + "loss": 0.007, + "step": 39440 + }, + { + "epoch": 0.78884, + "grad_norm": 3.104515790939331, + "learning_rate": 2.5957067878266717e-06, + "loss": 0.0274, + "step": 39442 + }, + { + "epoch": 0.78888, + "grad_norm": 0.029231196269392967, + "learning_rate": 2.594768383581635e-06, + "loss": 0.027, + "step": 39444 + }, + { + "epoch": 0.78892, + "grad_norm": 0.0847451463341713, + "learning_rate": 2.5938301237054e-06, + "loss": 0.0018, + "step": 39446 + }, + { + "epoch": 0.78896, + "grad_norm": 0.25733113288879395, + "learning_rate": 2.5928920082162577e-06, + "loss": 0.1398, + "step": 39448 + }, + { + "epoch": 0.789, + "grad_norm": 0.5828744769096375, + "learning_rate": 2.5919540371325005e-06, + "loss": 0.0058, + "step": 39450 + }, + { + "epoch": 0.78904, + "grad_norm": 0.22739428281784058, + "learning_rate": 2.5910162104724144e-06, + "loss": 0.0026, + "step": 39452 + }, + { + "epoch": 0.78908, + "grad_norm": 0.2011481374502182, + "learning_rate": 2.590078528254277e-06, + "loss": 0.003, + "step": 39454 + }, + { + "epoch": 0.78912, + "grad_norm": 0.13472618162631989, + "learning_rate": 2.5891409904963793e-06, + "loss": 0.0018, + "step": 39456 + }, + { + "epoch": 0.78916, + "grad_norm": 0.41138994693756104, + "learning_rate": 2.5882035972169907e-06, + "loss": 0.005, + "step": 39458 + }, + { + "epoch": 0.7892, + "grad_norm": 0.1565125733613968, + "learning_rate": 2.5872663484343887e-06, + "loss": 0.0089, + "step": 39460 + }, + { + "epoch": 0.78924, + "grad_norm": 0.6195765733718872, + "learning_rate": 2.5863292441668485e-06, + "loss": 0.0071, + "step": 39462 + }, + { + "epoch": 0.78928, + "grad_norm": 1.66695237159729, + "learning_rate": 2.5853922844326308e-06, + "loss": 0.018, + "step": 39464 + }, + { + "epoch": 0.78932, + "grad_norm": 11.351592063903809, + "learning_rate": 2.5844554692500133e-06, + "loss": 0.1635, + "step": 39466 + }, + { + "epoch": 0.78936, + "grad_norm": 0.1728358268737793, + "learning_rate": 2.5835187986372514e-06, + "loss": 0.0016, + "step": 39468 + }, + { + "epoch": 0.7894, + "grad_norm": 2.3268866539001465, + "learning_rate": 2.5825822726126095e-06, + "loss": 0.0241, + "step": 39470 + }, + { + "epoch": 0.78944, + "grad_norm": 0.04141337051987648, + "learning_rate": 2.5816458911943434e-06, + "loss": 0.0004, + "step": 39472 + }, + { + "epoch": 0.78948, + "grad_norm": 19.774015426635742, + "learning_rate": 2.5807096544007103e-06, + "loss": 0.6086, + "step": 39474 + }, + { + "epoch": 0.78952, + "grad_norm": 0.023861296474933624, + "learning_rate": 2.5797735622499665e-06, + "loss": 0.0012, + "step": 39476 + }, + { + "epoch": 0.78956, + "grad_norm": 0.026804126799106598, + "learning_rate": 2.578837614760353e-06, + "loss": 0.0093, + "step": 39478 + }, + { + "epoch": 0.7896, + "grad_norm": 0.12386640161275864, + "learning_rate": 2.577901811950121e-06, + "loss": 0.0035, + "step": 39480 + }, + { + "epoch": 0.78964, + "grad_norm": 0.18186545372009277, + "learning_rate": 2.5769661538375144e-06, + "loss": 0.0035, + "step": 39482 + }, + { + "epoch": 0.78968, + "grad_norm": 0.01955995336174965, + "learning_rate": 2.5760306404407754e-06, + "loss": 0.0002, + "step": 39484 + }, + { + "epoch": 0.78972, + "grad_norm": 0.015716666355729103, + "learning_rate": 2.5750952717781395e-06, + "loss": 0.0013, + "step": 39486 + }, + { + "epoch": 0.78976, + "grad_norm": 4.601013660430908, + "learning_rate": 2.5741600478678474e-06, + "loss": 0.0718, + "step": 39488 + }, + { + "epoch": 0.7898, + "grad_norm": 0.40487897396087646, + "learning_rate": 2.5732249687281228e-06, + "loss": 0.0074, + "step": 39490 + }, + { + "epoch": 0.78984, + "grad_norm": 1.271293044090271, + "learning_rate": 2.5722900343772075e-06, + "loss": 0.0156, + "step": 39492 + }, + { + "epoch": 0.78988, + "grad_norm": 0.10426841676235199, + "learning_rate": 2.5713552448333177e-06, + "loss": 0.0018, + "step": 39494 + }, + { + "epoch": 0.78992, + "grad_norm": 0.48615923523902893, + "learning_rate": 2.5704206001146825e-06, + "loss": 0.0046, + "step": 39496 + }, + { + "epoch": 0.78996, + "grad_norm": 0.038942232728004456, + "learning_rate": 2.5694861002395254e-06, + "loss": 0.0032, + "step": 39498 + }, + { + "epoch": 0.79, + "grad_norm": 0.07908865809440613, + "learning_rate": 2.5685517452260566e-06, + "loss": 0.0009, + "step": 39500 + }, + { + "epoch": 0.79004, + "grad_norm": 0.49800440669059753, + "learning_rate": 2.5676175350925036e-06, + "loss": 0.006, + "step": 39502 + }, + { + "epoch": 0.79008, + "grad_norm": 0.0006388640031218529, + "learning_rate": 2.56668346985707e-06, + "loss": 0.0003, + "step": 39504 + }, + { + "epoch": 0.79012, + "grad_norm": 2.975513219833374, + "learning_rate": 2.565749549537969e-06, + "loss": 0.0268, + "step": 39506 + }, + { + "epoch": 0.79016, + "grad_norm": 0.04113160818815231, + "learning_rate": 2.5648157741534072e-06, + "loss": 0.0011, + "step": 39508 + }, + { + "epoch": 0.7902, + "grad_norm": 0.04490087181329727, + "learning_rate": 2.5638821437215944e-06, + "loss": 0.0015, + "step": 39510 + }, + { + "epoch": 0.79024, + "grad_norm": 3.7964444160461426, + "learning_rate": 2.5629486582607233e-06, + "loss": 0.0418, + "step": 39512 + }, + { + "epoch": 0.79028, + "grad_norm": 0.0019963777158409357, + "learning_rate": 2.5620153177889974e-06, + "loss": 0.0003, + "step": 39514 + }, + { + "epoch": 0.79032, + "grad_norm": 2.8355939388275146, + "learning_rate": 2.561082122324612e-06, + "loss": 0.0301, + "step": 39516 + }, + { + "epoch": 0.79036, + "grad_norm": 2.418372631072998, + "learning_rate": 2.5601490718857615e-06, + "loss": 0.0171, + "step": 39518 + }, + { + "epoch": 0.7904, + "grad_norm": 0.14985620975494385, + "learning_rate": 2.5592161664906366e-06, + "loss": 0.0535, + "step": 39520 + }, + { + "epoch": 0.79044, + "grad_norm": 0.26526591181755066, + "learning_rate": 2.558283406157418e-06, + "loss": 0.0057, + "step": 39522 + }, + { + "epoch": 0.79048, + "grad_norm": 0.054945144802331924, + "learning_rate": 2.5573507909043016e-06, + "loss": 0.0006, + "step": 39524 + }, + { + "epoch": 0.79052, + "grad_norm": 2.481717109680176, + "learning_rate": 2.556418320749461e-06, + "loss": 0.0255, + "step": 39526 + }, + { + "epoch": 0.79056, + "grad_norm": 22.94109535217285, + "learning_rate": 2.5554859957110766e-06, + "loss": 0.5506, + "step": 39528 + }, + { + "epoch": 0.7906, + "grad_norm": 0.03092464804649353, + "learning_rate": 2.5545538158073278e-06, + "loss": 0.0004, + "step": 39530 + }, + { + "epoch": 0.79064, + "grad_norm": 0.010717403143644333, + "learning_rate": 2.553621781056381e-06, + "loss": 0.0338, + "step": 39532 + }, + { + "epoch": 0.79068, + "grad_norm": 0.25015029311180115, + "learning_rate": 2.5526898914764166e-06, + "loss": 0.0068, + "step": 39534 + }, + { + "epoch": 0.79072, + "grad_norm": 0.01838015206158161, + "learning_rate": 2.5517581470855933e-06, + "loss": 0.0003, + "step": 39536 + }, + { + "epoch": 0.79076, + "grad_norm": 3.2310571670532227, + "learning_rate": 2.55082654790208e-06, + "loss": 0.0268, + "step": 39538 + }, + { + "epoch": 0.7908, + "grad_norm": 0.15003171563148499, + "learning_rate": 2.549895093944039e-06, + "loss": 0.0024, + "step": 39540 + }, + { + "epoch": 0.79084, + "grad_norm": 0.1644691675901413, + "learning_rate": 2.5489637852296314e-06, + "loss": 0.0021, + "step": 39542 + }, + { + "epoch": 0.79088, + "grad_norm": 0.027008000761270523, + "learning_rate": 2.5480326217770067e-06, + "loss": 0.0008, + "step": 39544 + }, + { + "epoch": 0.79092, + "grad_norm": 0.05165939778089523, + "learning_rate": 2.547101603604324e-06, + "loss": 0.4111, + "step": 39546 + }, + { + "epoch": 0.79096, + "grad_norm": 0.020605111494660378, + "learning_rate": 2.5461707307297314e-06, + "loss": 0.0018, + "step": 39548 + }, + { + "epoch": 0.791, + "grad_norm": 0.06227554380893707, + "learning_rate": 2.5452400031713786e-06, + "loss": 0.0005, + "step": 39550 + }, + { + "epoch": 0.79104, + "grad_norm": 0.009172205813229084, + "learning_rate": 2.544309420947413e-06, + "loss": 0.0008, + "step": 39552 + }, + { + "epoch": 0.79108, + "grad_norm": 0.28463613986968994, + "learning_rate": 2.543378984075967e-06, + "loss": 0.0027, + "step": 39554 + }, + { + "epoch": 0.79112, + "grad_norm": 1.4840561151504517, + "learning_rate": 2.5424486925751934e-06, + "loss": 0.0137, + "step": 39556 + }, + { + "epoch": 0.79116, + "grad_norm": 0.0036991422530263662, + "learning_rate": 2.5415185464632185e-06, + "loss": 0.0008, + "step": 39558 + }, + { + "epoch": 0.7912, + "grad_norm": 0.5390657186508179, + "learning_rate": 2.5405885457581793e-06, + "loss": 0.0053, + "step": 39560 + }, + { + "epoch": 0.79124, + "grad_norm": 0.016379453241825104, + "learning_rate": 2.539658690478207e-06, + "loss": 0.0016, + "step": 39562 + }, + { + "epoch": 0.79128, + "grad_norm": 6.765904903411865, + "learning_rate": 2.5387289806414293e-06, + "loss": 0.081, + "step": 39564 + }, + { + "epoch": 0.79132, + "grad_norm": 0.0034275732468813658, + "learning_rate": 2.5377994162659748e-06, + "loss": 0.0002, + "step": 39566 + }, + { + "epoch": 0.79136, + "grad_norm": 0.020400051027536392, + "learning_rate": 2.5368699973699583e-06, + "loss": 0.0015, + "step": 39568 + }, + { + "epoch": 0.7914, + "grad_norm": 26.75891876220703, + "learning_rate": 2.535940723971505e-06, + "loss": 0.5965, + "step": 39570 + }, + { + "epoch": 0.79144, + "grad_norm": 0.04224366694688797, + "learning_rate": 2.535011596088729e-06, + "loss": 0.0003, + "step": 39572 + }, + { + "epoch": 0.79148, + "grad_norm": 0.7476136684417725, + "learning_rate": 2.534082613739749e-06, + "loss": 0.0068, + "step": 39574 + }, + { + "epoch": 0.79152, + "grad_norm": 0.0008735117735341191, + "learning_rate": 2.5331537769426653e-06, + "loss": 0.0039, + "step": 39576 + }, + { + "epoch": 0.79156, + "grad_norm": 0.07576747983694077, + "learning_rate": 2.5322250857155996e-06, + "loss": 0.0011, + "step": 39578 + }, + { + "epoch": 0.7916, + "grad_norm": 0.02301269769668579, + "learning_rate": 2.5312965400766475e-06, + "loss": 0.0047, + "step": 39580 + }, + { + "epoch": 0.79164, + "grad_norm": 0.6995832920074463, + "learning_rate": 2.530368140043914e-06, + "loss": 0.0078, + "step": 39582 + }, + { + "epoch": 0.79168, + "grad_norm": 0.35410264134407043, + "learning_rate": 2.5294398856355017e-06, + "loss": 0.0094, + "step": 39584 + }, + { + "epoch": 0.79172, + "grad_norm": 0.19646292924880981, + "learning_rate": 2.5285117768695e-06, + "loss": 0.0043, + "step": 39586 + }, + { + "epoch": 0.79176, + "grad_norm": 0.0005885930149815977, + "learning_rate": 2.527583813764014e-06, + "loss": 0.0149, + "step": 39588 + }, + { + "epoch": 0.7918, + "grad_norm": 16.468185424804688, + "learning_rate": 2.5266559963371216e-06, + "loss": 0.6265, + "step": 39590 + }, + { + "epoch": 0.79184, + "grad_norm": 0.7523819804191589, + "learning_rate": 2.5257283246069254e-06, + "loss": 0.0372, + "step": 39592 + }, + { + "epoch": 0.79188, + "grad_norm": 0.7505091428756714, + "learning_rate": 2.524800798591499e-06, + "loss": 0.0065, + "step": 39594 + }, + { + "epoch": 0.79192, + "grad_norm": 0.041734449565410614, + "learning_rate": 2.5238734183089307e-06, + "loss": 0.001, + "step": 39596 + }, + { + "epoch": 0.79196, + "grad_norm": 0.10925478488206863, + "learning_rate": 2.5229461837773017e-06, + "loss": 0.0032, + "step": 39598 + }, + { + "epoch": 0.792, + "grad_norm": 0.004407142288982868, + "learning_rate": 2.522019095014683e-06, + "loss": 0.0017, + "step": 39600 + }, + { + "epoch": 0.79204, + "grad_norm": 0.1191752627491951, + "learning_rate": 2.5210921520391518e-06, + "loss": 0.0041, + "step": 39602 + }, + { + "epoch": 0.79208, + "grad_norm": 0.004625073634088039, + "learning_rate": 2.52016535486878e-06, + "loss": 0.0022, + "step": 39604 + }, + { + "epoch": 0.79212, + "grad_norm": 1.5951396226882935, + "learning_rate": 2.5192387035216358e-06, + "loss": 0.0104, + "step": 39606 + }, + { + "epoch": 0.79216, + "grad_norm": 0.3166086971759796, + "learning_rate": 2.518312198015784e-06, + "loss": 0.004, + "step": 39608 + }, + { + "epoch": 0.7922, + "grad_norm": 0.26227691769599915, + "learning_rate": 2.5173858383692906e-06, + "loss": 0.0099, + "step": 39610 + }, + { + "epoch": 0.79224, + "grad_norm": 0.03095373697578907, + "learning_rate": 2.5164596246002093e-06, + "loss": 0.0003, + "step": 39612 + }, + { + "epoch": 0.79228, + "grad_norm": 0.2657248079776764, + "learning_rate": 2.5155335567266014e-06, + "loss": 0.0044, + "step": 39614 + }, + { + "epoch": 0.79232, + "grad_norm": 0.11056269705295563, + "learning_rate": 2.5146076347665195e-06, + "loss": 0.8565, + "step": 39616 + }, + { + "epoch": 0.79236, + "grad_norm": 0.1443827599287033, + "learning_rate": 2.5136818587380154e-06, + "loss": 0.0019, + "step": 39618 + }, + { + "epoch": 0.7924, + "grad_norm": 0.1269202083349228, + "learning_rate": 2.512756228659141e-06, + "loss": 0.0063, + "step": 39620 + }, + { + "epoch": 0.79244, + "grad_norm": 0.3154838979244232, + "learning_rate": 2.5118307445479316e-06, + "loss": 0.007, + "step": 39622 + }, + { + "epoch": 0.79248, + "grad_norm": 0.13815419375896454, + "learning_rate": 2.510905406422445e-06, + "loss": 0.0015, + "step": 39624 + }, + { + "epoch": 0.79252, + "grad_norm": 0.1773916482925415, + "learning_rate": 2.509980214300708e-06, + "loss": 0.0357, + "step": 39626 + }, + { + "epoch": 0.79256, + "grad_norm": 0.008172071538865566, + "learning_rate": 2.5090551682007636e-06, + "loss": 0.0005, + "step": 39628 + }, + { + "epoch": 0.7926, + "grad_norm": 0.03160092607140541, + "learning_rate": 2.5081302681406463e-06, + "loss": 0.0241, + "step": 39630 + }, + { + "epoch": 0.79264, + "grad_norm": 0.10702577978372574, + "learning_rate": 2.5072055141383876e-06, + "loss": 0.0043, + "step": 39632 + }, + { + "epoch": 0.79268, + "grad_norm": 2.0172922611236572, + "learning_rate": 2.506280906212014e-06, + "loss": 0.0225, + "step": 39634 + }, + { + "epoch": 0.79272, + "grad_norm": 0.004886722657829523, + "learning_rate": 2.505356444379551e-06, + "loss": 0.0013, + "step": 39636 + }, + { + "epoch": 0.79276, + "grad_norm": 0.032287728041410446, + "learning_rate": 2.5044321286590223e-06, + "loss": 0.0043, + "step": 39638 + }, + { + "epoch": 0.7928, + "grad_norm": 0.17170090973377228, + "learning_rate": 2.5035079590684496e-06, + "loss": 0.0351, + "step": 39640 + }, + { + "epoch": 0.79284, + "grad_norm": 0.17192474007606506, + "learning_rate": 2.502583935625851e-06, + "loss": 0.0019, + "step": 39642 + }, + { + "epoch": 0.79288, + "grad_norm": 0.1617523431777954, + "learning_rate": 2.5016600583492324e-06, + "loss": 0.0042, + "step": 39644 + }, + { + "epoch": 0.79292, + "grad_norm": 0.003531751921400428, + "learning_rate": 2.5007363272566165e-06, + "loss": 0.0017, + "step": 39646 + }, + { + "epoch": 0.79296, + "grad_norm": 0.5601457357406616, + "learning_rate": 2.4998127423660047e-06, + "loss": 0.0076, + "step": 39648 + }, + { + "epoch": 0.793, + "grad_norm": 2.434377670288086, + "learning_rate": 2.4988893036954045e-06, + "loss": 0.0222, + "step": 39650 + }, + { + "epoch": 0.79304, + "grad_norm": 0.009700160473585129, + "learning_rate": 2.497966011262822e-06, + "loss": 0.0014, + "step": 39652 + }, + { + "epoch": 0.79308, + "grad_norm": 0.017597200348973274, + "learning_rate": 2.49704286508625e-06, + "loss": 0.0066, + "step": 39654 + }, + { + "epoch": 0.79312, + "grad_norm": 0.10559315234422684, + "learning_rate": 2.496119865183695e-06, + "loss": 0.0011, + "step": 39656 + }, + { + "epoch": 0.79316, + "grad_norm": 0.25766521692276, + "learning_rate": 2.495197011573143e-06, + "loss": 0.0028, + "step": 39658 + }, + { + "epoch": 0.7932, + "grad_norm": 8.144111633300781, + "learning_rate": 2.494274304272589e-06, + "loss": 0.1076, + "step": 39660 + }, + { + "epoch": 0.79324, + "grad_norm": 0.26541396975517273, + "learning_rate": 2.493351743300022e-06, + "loss": 0.0081, + "step": 39662 + }, + { + "epoch": 0.79328, + "grad_norm": 0.4830383360385895, + "learning_rate": 2.492429328673431e-06, + "loss": 0.0064, + "step": 39664 + }, + { + "epoch": 0.79332, + "grad_norm": 0.2168295979499817, + "learning_rate": 2.4915070604107915e-06, + "loss": 0.0029, + "step": 39666 + }, + { + "epoch": 0.79336, + "grad_norm": 0.07568003982305527, + "learning_rate": 2.4905849385300886e-06, + "loss": 0.0021, + "step": 39668 + }, + { + "epoch": 0.7934, + "grad_norm": 0.08277338743209839, + "learning_rate": 2.4896629630492974e-06, + "loss": 0.0018, + "step": 39670 + }, + { + "epoch": 0.79344, + "grad_norm": 0.019850904121994972, + "learning_rate": 2.488741133986393e-06, + "loss": 0.0004, + "step": 39672 + }, + { + "epoch": 0.79348, + "grad_norm": 0.2823656499385834, + "learning_rate": 2.4878194513593513e-06, + "loss": 0.0049, + "step": 39674 + }, + { + "epoch": 0.79352, + "grad_norm": 0.03868034854531288, + "learning_rate": 2.4868979151861304e-06, + "loss": 0.0021, + "step": 39676 + }, + { + "epoch": 0.79356, + "grad_norm": 0.874508261680603, + "learning_rate": 2.485976525484709e-06, + "loss": 0.0106, + "step": 39678 + }, + { + "epoch": 0.7936, + "grad_norm": 0.07449622452259064, + "learning_rate": 2.48505528227304e-06, + "loss": 0.1764, + "step": 39680 + }, + { + "epoch": 0.79364, + "grad_norm": 0.07283949851989746, + "learning_rate": 2.4841341855690883e-06, + "loss": 0.0016, + "step": 39682 + }, + { + "epoch": 0.79368, + "grad_norm": 0.030219873413443565, + "learning_rate": 2.4832132353908088e-06, + "loss": 0.0085, + "step": 39684 + }, + { + "epoch": 0.79372, + "grad_norm": 0.0012540770694613457, + "learning_rate": 2.4822924317561583e-06, + "loss": 0.0022, + "step": 39686 + }, + { + "epoch": 0.79376, + "grad_norm": 0.017365461215376854, + "learning_rate": 2.48137177468309e-06, + "loss": 0.0169, + "step": 39688 + }, + { + "epoch": 0.7938, + "grad_norm": 10.707550048828125, + "learning_rate": 2.480451264189546e-06, + "loss": 0.2345, + "step": 39690 + }, + { + "epoch": 0.79384, + "grad_norm": 0.4862927794456482, + "learning_rate": 2.4795309002934774e-06, + "loss": 0.0052, + "step": 39692 + }, + { + "epoch": 0.79388, + "grad_norm": 0.034940849989652634, + "learning_rate": 2.4786106830128243e-06, + "loss": 0.0022, + "step": 39694 + }, + { + "epoch": 0.79392, + "grad_norm": 0.001094528241083026, + "learning_rate": 2.477690612365529e-06, + "loss": 0.0006, + "step": 39696 + }, + { + "epoch": 0.79396, + "grad_norm": 0.8417149186134338, + "learning_rate": 2.4767706883695276e-06, + "loss": 0.6574, + "step": 39698 + }, + { + "epoch": 0.794, + "grad_norm": 2.0819852352142334, + "learning_rate": 2.4758509110427576e-06, + "loss": 0.0191, + "step": 39700 + }, + { + "epoch": 0.79404, + "grad_norm": 0.002212346298620105, + "learning_rate": 2.4749312804031455e-06, + "loss": 0.0005, + "step": 39702 + }, + { + "epoch": 0.79408, + "grad_norm": 0.050415974110364914, + "learning_rate": 2.4740117964686218e-06, + "loss": 0.0154, + "step": 39704 + }, + { + "epoch": 0.79412, + "grad_norm": 0.12236665189266205, + "learning_rate": 2.473092459257116e-06, + "loss": 0.0114, + "step": 39706 + }, + { + "epoch": 0.79416, + "grad_norm": 0.1731536090373993, + "learning_rate": 2.472173268786542e-06, + "loss": 0.0035, + "step": 39708 + }, + { + "epoch": 0.7942, + "grad_norm": 0.028363682329654694, + "learning_rate": 2.4712542250748305e-06, + "loss": 0.0004, + "step": 39710 + }, + { + "epoch": 0.79424, + "grad_norm": 13.163870811462402, + "learning_rate": 2.470335328139889e-06, + "loss": 0.5892, + "step": 39712 + }, + { + "epoch": 0.79428, + "grad_norm": 0.2279006987810135, + "learning_rate": 2.4694165779996428e-06, + "loss": 0.0139, + "step": 39714 + }, + { + "epoch": 0.79432, + "grad_norm": 0.17830593883991241, + "learning_rate": 2.468497974671994e-06, + "loss": 0.0021, + "step": 39716 + }, + { + "epoch": 0.79436, + "grad_norm": 0.21131102740764618, + "learning_rate": 2.467579518174855e-06, + "loss": 0.0024, + "step": 39718 + }, + { + "epoch": 0.7944, + "grad_norm": 0.7048508524894714, + "learning_rate": 2.4666612085261344e-06, + "loss": 0.0457, + "step": 39720 + }, + { + "epoch": 0.79444, + "grad_norm": 3.0535483360290527, + "learning_rate": 2.465743045743728e-06, + "loss": 0.0377, + "step": 39722 + }, + { + "epoch": 0.79448, + "grad_norm": 0.22638782858848572, + "learning_rate": 2.464825029845541e-06, + "loss": 0.0038, + "step": 39724 + }, + { + "epoch": 0.79452, + "grad_norm": 0.3255164921283722, + "learning_rate": 2.463907160849469e-06, + "loss": 0.0047, + "step": 39726 + }, + { + "epoch": 0.79456, + "grad_norm": 0.13139964640140533, + "learning_rate": 2.4629894387734067e-06, + "loss": 0.0022, + "step": 39728 + }, + { + "epoch": 0.7946, + "grad_norm": 0.5604928135871887, + "learning_rate": 2.4620718636352457e-06, + "loss": 0.0646, + "step": 39730 + }, + { + "epoch": 0.79464, + "grad_norm": 0.041911303997039795, + "learning_rate": 2.461154435452877e-06, + "loss": 0.0009, + "step": 39732 + }, + { + "epoch": 0.79468, + "grad_norm": 1.034425973892212, + "learning_rate": 2.4602371542441815e-06, + "loss": 0.0148, + "step": 39734 + }, + { + "epoch": 0.79472, + "grad_norm": 0.035981934517621994, + "learning_rate": 2.4593200200270437e-06, + "loss": 0.001, + "step": 39736 + }, + { + "epoch": 0.79476, + "grad_norm": 0.0051051960326731205, + "learning_rate": 2.458403032819345e-06, + "loss": 0.0051, + "step": 39738 + }, + { + "epoch": 0.7948, + "grad_norm": 0.10412905365228653, + "learning_rate": 2.4574861926389615e-06, + "loss": 0.0008, + "step": 39740 + }, + { + "epoch": 0.79484, + "grad_norm": 1.6722878217697144, + "learning_rate": 2.4565694995037715e-06, + "loss": 0.0196, + "step": 39742 + }, + { + "epoch": 0.79488, + "grad_norm": 0.0020308946259319782, + "learning_rate": 2.4556529534316352e-06, + "loss": 0.0001, + "step": 39744 + }, + { + "epoch": 0.79492, + "grad_norm": 2.2997992038726807, + "learning_rate": 2.4547365544404366e-06, + "loss": 0.0269, + "step": 39746 + }, + { + "epoch": 0.79496, + "grad_norm": 0.013357670977711678, + "learning_rate": 2.453820302548029e-06, + "loss": 0.0002, + "step": 39748 + }, + { + "epoch": 0.795, + "grad_norm": 0.17242096364498138, + "learning_rate": 2.45290419777228e-06, + "loss": 0.0024, + "step": 39750 + }, + { + "epoch": 0.79504, + "grad_norm": 0.04923572391271591, + "learning_rate": 2.4519882401310492e-06, + "loss": 0.0008, + "step": 39752 + }, + { + "epoch": 0.79508, + "grad_norm": 0.017048193141818047, + "learning_rate": 2.4510724296421975e-06, + "loss": 0.0006, + "step": 39754 + }, + { + "epoch": 0.79512, + "grad_norm": 0.012479172088205814, + "learning_rate": 2.4501567663235703e-06, + "loss": 0.0004, + "step": 39756 + }, + { + "epoch": 0.79516, + "grad_norm": 0.057764921337366104, + "learning_rate": 2.4492412501930254e-06, + "loss": 0.0021, + "step": 39758 + }, + { + "epoch": 0.7952, + "grad_norm": 0.17419308423995972, + "learning_rate": 2.4483258812684096e-06, + "loss": 0.0043, + "step": 39760 + }, + { + "epoch": 0.79524, + "grad_norm": 0.004359856713563204, + "learning_rate": 2.4474106595675683e-06, + "loss": 0.0005, + "step": 39762 + }, + { + "epoch": 0.79528, + "grad_norm": 0.06173481419682503, + "learning_rate": 2.446495585108347e-06, + "loss": 0.0027, + "step": 39764 + }, + { + "epoch": 0.79532, + "grad_norm": 0.164476215839386, + "learning_rate": 2.4455806579085783e-06, + "loss": 0.0018, + "step": 39766 + }, + { + "epoch": 0.79536, + "grad_norm": 4.249120235443115, + "learning_rate": 2.444665877986109e-06, + "loss": 0.0423, + "step": 39768 + }, + { + "epoch": 0.7954, + "grad_norm": 0.10828981548547745, + "learning_rate": 2.4437512453587653e-06, + "loss": 0.0017, + "step": 39770 + }, + { + "epoch": 0.79544, + "grad_norm": 0.03568847104907036, + "learning_rate": 2.4428367600443812e-06, + "loss": 0.0054, + "step": 39772 + }, + { + "epoch": 0.79548, + "grad_norm": 0.18404635787010193, + "learning_rate": 2.4419224220607883e-06, + "loss": 0.018, + "step": 39774 + }, + { + "epoch": 0.79552, + "grad_norm": 0.038449693471193314, + "learning_rate": 2.4410082314258034e-06, + "loss": 0.0053, + "step": 39776 + }, + { + "epoch": 0.79556, + "grad_norm": 0.11386524885892868, + "learning_rate": 2.4400941881572604e-06, + "loss": 0.0103, + "step": 39778 + }, + { + "epoch": 0.7956, + "grad_norm": 0.4370831847190857, + "learning_rate": 2.4391802922729703e-06, + "loss": 0.0056, + "step": 39780 + }, + { + "epoch": 0.79564, + "grad_norm": 0.32380032539367676, + "learning_rate": 2.4382665437907526e-06, + "loss": 0.012, + "step": 39782 + }, + { + "epoch": 0.79568, + "grad_norm": 0.00925025250762701, + "learning_rate": 2.4373529427284216e-06, + "loss": 0.0003, + "step": 39784 + }, + { + "epoch": 0.79572, + "grad_norm": 0.008798868395388126, + "learning_rate": 2.436439489103789e-06, + "loss": 0.0001, + "step": 39786 + }, + { + "epoch": 0.79576, + "grad_norm": 0.11356943845748901, + "learning_rate": 2.4355261829346654e-06, + "loss": 0.004, + "step": 39788 + }, + { + "epoch": 0.7958, + "grad_norm": 0.019986804574728012, + "learning_rate": 2.43461302423885e-06, + "loss": 0.0032, + "step": 39790 + }, + { + "epoch": 0.79584, + "grad_norm": 0.14839023351669312, + "learning_rate": 2.4337000130341494e-06, + "loss": 0.0021, + "step": 39792 + }, + { + "epoch": 0.79588, + "grad_norm": 0.24566364288330078, + "learning_rate": 2.4327871493383614e-06, + "loss": 0.0082, + "step": 39794 + }, + { + "epoch": 0.79592, + "grad_norm": 0.045453257858753204, + "learning_rate": 2.431874433169288e-06, + "loss": 0.0018, + "step": 39796 + }, + { + "epoch": 0.79596, + "grad_norm": 0.10747483372688293, + "learning_rate": 2.430961864544712e-06, + "loss": 0.0018, + "step": 39798 + }, + { + "epoch": 0.796, + "grad_norm": 0.04978342726826668, + "learning_rate": 2.4300494434824373e-06, + "loss": 0.0009, + "step": 39800 + }, + { + "epoch": 0.79604, + "grad_norm": 0.07833255082368851, + "learning_rate": 2.4291371700002432e-06, + "loss": 0.0017, + "step": 39802 + }, + { + "epoch": 0.79608, + "grad_norm": 0.00335190468467772, + "learning_rate": 2.428225044115917e-06, + "loss": 0.0004, + "step": 39804 + }, + { + "epoch": 0.79612, + "grad_norm": 0.5321149826049805, + "learning_rate": 2.4273130658472433e-06, + "loss": 0.0079, + "step": 39806 + }, + { + "epoch": 0.79616, + "grad_norm": 0.12877203524112701, + "learning_rate": 2.4264012352119993e-06, + "loss": 0.0015, + "step": 39808 + }, + { + "epoch": 0.7962, + "grad_norm": 0.37519359588623047, + "learning_rate": 2.4254895522279642e-06, + "loss": 0.0331, + "step": 39810 + }, + { + "epoch": 0.79624, + "grad_norm": 0.0631907731294632, + "learning_rate": 2.4245780169129083e-06, + "loss": 0.0029, + "step": 39812 + }, + { + "epoch": 0.79628, + "grad_norm": 0.023644505068659782, + "learning_rate": 2.4236666292846032e-06, + "loss": 0.0015, + "step": 39814 + }, + { + "epoch": 0.79632, + "grad_norm": 0.15809394419193268, + "learning_rate": 2.4227553893608193e-06, + "loss": 0.004, + "step": 39816 + }, + { + "epoch": 0.79636, + "grad_norm": 4.538595199584961, + "learning_rate": 2.4218442971593183e-06, + "loss": 0.0337, + "step": 39818 + }, + { + "epoch": 0.7964, + "grad_norm": 0.10955221205949783, + "learning_rate": 2.420933352697865e-06, + "loss": 0.002, + "step": 39820 + }, + { + "epoch": 0.79644, + "grad_norm": 19.2742977142334, + "learning_rate": 2.4200225559942213e-06, + "loss": 0.3199, + "step": 39822 + }, + { + "epoch": 0.79648, + "grad_norm": 0.046156637370586395, + "learning_rate": 2.4191119070661363e-06, + "loss": 0.0005, + "step": 39824 + }, + { + "epoch": 0.79652, + "grad_norm": 0.05128570273518562, + "learning_rate": 2.4182014059313685e-06, + "loss": 0.0018, + "step": 39826 + }, + { + "epoch": 0.79656, + "grad_norm": 0.07411618530750275, + "learning_rate": 2.4172910526076698e-06, + "loss": 0.8144, + "step": 39828 + }, + { + "epoch": 0.7966, + "grad_norm": 0.051429640501737595, + "learning_rate": 2.4163808471127815e-06, + "loss": 0.0008, + "step": 39830 + }, + { + "epoch": 0.79664, + "grad_norm": 0.012720831669867039, + "learning_rate": 2.4154707894644568e-06, + "loss": 0.0012, + "step": 39832 + }, + { + "epoch": 0.79668, + "grad_norm": 0.36682233214378357, + "learning_rate": 2.41456087968043e-06, + "loss": 0.0505, + "step": 39834 + }, + { + "epoch": 0.79672, + "grad_norm": 0.04950970411300659, + "learning_rate": 2.413651117778448e-06, + "loss": 0.0009, + "step": 39836 + }, + { + "epoch": 0.79676, + "grad_norm": 6.943201065063477, + "learning_rate": 2.4127415037762414e-06, + "loss": 0.0728, + "step": 39838 + }, + { + "epoch": 0.7968, + "grad_norm": 2.140331506729126, + "learning_rate": 2.411832037691545e-06, + "loss": 0.0245, + "step": 39840 + }, + { + "epoch": 0.79684, + "grad_norm": 0.005413950886577368, + "learning_rate": 2.4109227195420926e-06, + "loss": 0.0006, + "step": 39842 + }, + { + "epoch": 0.79688, + "grad_norm": 0.005563961807638407, + "learning_rate": 2.4100135493456065e-06, + "loss": 0.0005, + "step": 39844 + }, + { + "epoch": 0.79692, + "grad_norm": 0.3177870512008667, + "learning_rate": 2.409104527119813e-06, + "loss": 0.0071, + "step": 39846 + }, + { + "epoch": 0.79696, + "grad_norm": 0.21143707633018494, + "learning_rate": 2.408195652882436e-06, + "loss": 0.0024, + "step": 39848 + }, + { + "epoch": 0.797, + "grad_norm": 9.0357027053833, + "learning_rate": 2.407286926651192e-06, + "loss": 0.1498, + "step": 39850 + }, + { + "epoch": 0.79704, + "grad_norm": 0.04015853628516197, + "learning_rate": 2.4063783484437984e-06, + "loss": 0.0008, + "step": 39852 + }, + { + "epoch": 0.79708, + "grad_norm": 0.21850819885730743, + "learning_rate": 2.4054699182779717e-06, + "loss": 0.0025, + "step": 39854 + }, + { + "epoch": 0.79712, + "grad_norm": 0.33421796560287476, + "learning_rate": 2.404561636171416e-06, + "loss": 0.0033, + "step": 39856 + }, + { + "epoch": 0.79716, + "grad_norm": 1.0170960426330566, + "learning_rate": 2.4036535021418417e-06, + "loss": 0.3848, + "step": 39858 + }, + { + "epoch": 0.7972, + "grad_norm": 0.17720212042331696, + "learning_rate": 2.4027455162069567e-06, + "loss": 0.0028, + "step": 39860 + }, + { + "epoch": 0.79724, + "grad_norm": 0.006007204297930002, + "learning_rate": 2.4018376783844523e-06, + "loss": 0.0064, + "step": 39862 + }, + { + "epoch": 0.79728, + "grad_norm": 0.15369926393032074, + "learning_rate": 2.40092998869204e-06, + "loss": 0.0052, + "step": 39864 + }, + { + "epoch": 0.79732, + "grad_norm": 0.02831125818192959, + "learning_rate": 2.400022447147404e-06, + "loss": 0.0005, + "step": 39866 + }, + { + "epoch": 0.79736, + "grad_norm": 0.10041141510009766, + "learning_rate": 2.3991150537682507e-06, + "loss": 0.0015, + "step": 39868 + }, + { + "epoch": 0.7974, + "grad_norm": 12.994394302368164, + "learning_rate": 2.398207808572258e-06, + "loss": 0.2674, + "step": 39870 + }, + { + "epoch": 0.79744, + "grad_norm": 13.793779373168945, + "learning_rate": 2.3973007115771175e-06, + "loss": 0.2347, + "step": 39872 + }, + { + "epoch": 0.79748, + "grad_norm": 0.34205198287963867, + "learning_rate": 2.3963937628005152e-06, + "loss": 0.0033, + "step": 39874 + }, + { + "epoch": 0.79752, + "grad_norm": 1.5365546941757202, + "learning_rate": 2.395486962260133e-06, + "loss": 0.0242, + "step": 39876 + }, + { + "epoch": 0.79756, + "grad_norm": 0.004964130464941263, + "learning_rate": 2.3945803099736444e-06, + "loss": 0.0045, + "step": 39878 + }, + { + "epoch": 0.7976, + "grad_norm": 0.19651570916175842, + "learning_rate": 2.3936738059587284e-06, + "loss": 0.0056, + "step": 39880 + }, + { + "epoch": 0.79764, + "grad_norm": 0.8262583613395691, + "learning_rate": 2.392767450233058e-06, + "loss": 0.0145, + "step": 39882 + }, + { + "epoch": 0.79768, + "grad_norm": 0.5619106888771057, + "learning_rate": 2.3918612428143016e-06, + "loss": 0.0143, + "step": 39884 + }, + { + "epoch": 0.79772, + "grad_norm": 0.11352942883968353, + "learning_rate": 2.3909551837201315e-06, + "loss": 0.0025, + "step": 39886 + }, + { + "epoch": 0.79776, + "grad_norm": 4.473034381866455, + "learning_rate": 2.390049272968201e-06, + "loss": 0.0428, + "step": 39888 + }, + { + "epoch": 0.7978, + "grad_norm": 0.9029903411865234, + "learning_rate": 2.3891435105761838e-06, + "loss": 0.0281, + "step": 39890 + }, + { + "epoch": 0.79784, + "grad_norm": 0.41527149081230164, + "learning_rate": 2.388237896561729e-06, + "loss": 0.0077, + "step": 39892 + }, + { + "epoch": 0.79788, + "grad_norm": 8.682380676269531, + "learning_rate": 2.387332430942495e-06, + "loss": 0.1509, + "step": 39894 + }, + { + "epoch": 0.79792, + "grad_norm": 0.025695987045764923, + "learning_rate": 2.3864271137361383e-06, + "loss": 0.0021, + "step": 39896 + }, + { + "epoch": 0.79796, + "grad_norm": 0.01961393468081951, + "learning_rate": 2.3855219449602985e-06, + "loss": 0.0013, + "step": 39898 + }, + { + "epoch": 0.798, + "grad_norm": 0.4102046489715576, + "learning_rate": 2.3846169246326345e-06, + "loss": 0.0148, + "step": 39900 + }, + { + "epoch": 0.79804, + "grad_norm": 0.04646211490035057, + "learning_rate": 2.3837120527707803e-06, + "loss": 0.0031, + "step": 39902 + }, + { + "epoch": 0.79808, + "grad_norm": 0.10354209691286087, + "learning_rate": 2.3828073293923813e-06, + "loss": 0.013, + "step": 39904 + }, + { + "epoch": 0.79812, + "grad_norm": 0.04325222223997116, + "learning_rate": 2.3819027545150766e-06, + "loss": 0.0029, + "step": 39906 + }, + { + "epoch": 0.79816, + "grad_norm": 0.07214318215847015, + "learning_rate": 2.3809983281564974e-06, + "loss": 0.0016, + "step": 39908 + }, + { + "epoch": 0.7982, + "grad_norm": 0.17680059373378754, + "learning_rate": 2.380094050334283e-06, + "loss": 0.003, + "step": 39910 + }, + { + "epoch": 0.79824, + "grad_norm": 0.09512564539909363, + "learning_rate": 2.379189921066054e-06, + "loss": 0.0007, + "step": 39912 + }, + { + "epoch": 0.79828, + "grad_norm": 0.060219984501600266, + "learning_rate": 2.3782859403694415e-06, + "loss": 0.0012, + "step": 39914 + }, + { + "epoch": 0.79832, + "grad_norm": 0.09361053258180618, + "learning_rate": 2.3773821082620684e-06, + "loss": 0.0081, + "step": 39916 + }, + { + "epoch": 0.79836, + "grad_norm": 0.00684027373790741, + "learning_rate": 2.376478424761558e-06, + "loss": 0.0007, + "step": 39918 + }, + { + "epoch": 0.7984, + "grad_norm": 0.8604049682617188, + "learning_rate": 2.37557488988552e-06, + "loss": 0.0124, + "step": 39920 + }, + { + "epoch": 0.79844, + "grad_norm": 0.9199936985969543, + "learning_rate": 2.374671503651581e-06, + "loss": 0.0101, + "step": 39922 + }, + { + "epoch": 0.79848, + "grad_norm": 0.1339188665151596, + "learning_rate": 2.373768266077344e-06, + "loss": 0.0016, + "step": 39924 + }, + { + "epoch": 0.79852, + "grad_norm": 6.330020427703857, + "learning_rate": 2.37286517718042e-06, + "loss": 0.0589, + "step": 39926 + }, + { + "epoch": 0.79856, + "grad_norm": 0.7259714007377625, + "learning_rate": 2.37196223697842e-06, + "loss": 0.0083, + "step": 39928 + }, + { + "epoch": 0.7986, + "grad_norm": 0.11303292959928513, + "learning_rate": 2.371059445488938e-06, + "loss": 0.0046, + "step": 39930 + }, + { + "epoch": 0.79864, + "grad_norm": 0.03453380614519119, + "learning_rate": 2.370156802729584e-06, + "loss": 0.0022, + "step": 39932 + }, + { + "epoch": 0.79868, + "grad_norm": 0.04915398731827736, + "learning_rate": 2.3692543087179487e-06, + "loss": 0.001, + "step": 39934 + }, + { + "epoch": 0.79872, + "grad_norm": 0.021074078977108, + "learning_rate": 2.36835196347163e-06, + "loss": 0.0015, + "step": 39936 + }, + { + "epoch": 0.79876, + "grad_norm": 0.5587324500083923, + "learning_rate": 2.3674497670082185e-06, + "loss": 0.0061, + "step": 39938 + }, + { + "epoch": 0.7988, + "grad_norm": 0.027091002091765404, + "learning_rate": 2.3665477193453037e-06, + "loss": 0.0004, + "step": 39940 + }, + { + "epoch": 0.79884, + "grad_norm": 0.05497258901596069, + "learning_rate": 2.365645820500473e-06, + "loss": 0.0036, + "step": 39942 + }, + { + "epoch": 0.79888, + "grad_norm": 0.43813857436180115, + "learning_rate": 2.364744070491305e-06, + "loss": 0.0039, + "step": 39944 + }, + { + "epoch": 0.79892, + "grad_norm": 0.021174680441617966, + "learning_rate": 2.363842469335382e-06, + "loss": 0.001, + "step": 39946 + }, + { + "epoch": 0.79896, + "grad_norm": 0.49588513374328613, + "learning_rate": 2.362941017050282e-06, + "loss": 0.0075, + "step": 39948 + }, + { + "epoch": 0.799, + "grad_norm": 0.004356580786406994, + "learning_rate": 2.362039713653581e-06, + "loss": 0.0001, + "step": 39950 + }, + { + "epoch": 0.79904, + "grad_norm": 0.36548537015914917, + "learning_rate": 2.361138559162842e-06, + "loss": 0.0071, + "step": 39952 + }, + { + "epoch": 0.79908, + "grad_norm": 0.14570732414722443, + "learning_rate": 2.3602375535956467e-06, + "loss": 0.0251, + "step": 39954 + }, + { + "epoch": 0.79912, + "grad_norm": 0.35261082649230957, + "learning_rate": 2.3593366969695462e-06, + "loss": 0.0036, + "step": 39956 + }, + { + "epoch": 0.79916, + "grad_norm": 0.0695595070719719, + "learning_rate": 2.358435989302118e-06, + "loss": 0.0014, + "step": 39958 + }, + { + "epoch": 0.7992, + "grad_norm": 0.12021855264902115, + "learning_rate": 2.35753543061091e-06, + "loss": 0.0031, + "step": 39960 + }, + { + "epoch": 0.79924, + "grad_norm": 0.022928807884454727, + "learning_rate": 2.3566350209134835e-06, + "loss": 0.0026, + "step": 39962 + }, + { + "epoch": 0.79928, + "grad_norm": 0.01852254569530487, + "learning_rate": 2.355734760227396e-06, + "loss": 0.0028, + "step": 39964 + }, + { + "epoch": 0.79932, + "grad_norm": 0.032149046659469604, + "learning_rate": 2.354834648570191e-06, + "loss": 0.0008, + "step": 39966 + }, + { + "epoch": 0.79936, + "grad_norm": 2.2159981727600098, + "learning_rate": 2.3539346859594215e-06, + "loss": 0.0293, + "step": 39968 + }, + { + "epoch": 0.7994, + "grad_norm": 0.2931164503097534, + "learning_rate": 2.3530348724126304e-06, + "loss": 0.0031, + "step": 39970 + }, + { + "epoch": 0.79944, + "grad_norm": 0.18443936109542847, + "learning_rate": 2.3521352079473625e-06, + "loss": 0.0041, + "step": 39972 + }, + { + "epoch": 0.79948, + "grad_norm": 0.47186776995658875, + "learning_rate": 2.3512356925811564e-06, + "loss": 0.0053, + "step": 39974 + }, + { + "epoch": 0.79952, + "grad_norm": 0.0017694105627015233, + "learning_rate": 2.3503363263315503e-06, + "loss": 0.0001, + "step": 39976 + }, + { + "epoch": 0.79956, + "grad_norm": 0.1822298765182495, + "learning_rate": 2.349437109216074e-06, + "loss": 0.0018, + "step": 39978 + }, + { + "epoch": 0.7996, + "grad_norm": 0.03421296551823616, + "learning_rate": 2.3485380412522586e-06, + "loss": 0.4303, + "step": 39980 + }, + { + "epoch": 0.79964, + "grad_norm": 1.8337643146514893, + "learning_rate": 2.347639122457638e-06, + "loss": 0.0119, + "step": 39982 + }, + { + "epoch": 0.79968, + "grad_norm": 0.009898738004267216, + "learning_rate": 2.3467403528497246e-06, + "loss": 0.0003, + "step": 39984 + }, + { + "epoch": 0.79972, + "grad_norm": 0.6946495175361633, + "learning_rate": 2.3458417324460546e-06, + "loss": 0.0072, + "step": 39986 + }, + { + "epoch": 0.79976, + "grad_norm": 0.09273622930049896, + "learning_rate": 2.3449432612641353e-06, + "loss": 0.001, + "step": 39988 + }, + { + "epoch": 0.7998, + "grad_norm": 0.14498405158519745, + "learning_rate": 2.3440449393214947e-06, + "loss": 0.0057, + "step": 39990 + }, + { + "epoch": 0.79984, + "grad_norm": 0.2979459762573242, + "learning_rate": 2.3431467666356356e-06, + "loss": 0.0034, + "step": 39992 + }, + { + "epoch": 0.79988, + "grad_norm": 4.610910892486572, + "learning_rate": 2.3422487432240725e-06, + "loss": 0.0421, + "step": 39994 + }, + { + "epoch": 0.79992, + "grad_norm": 0.10318450629711151, + "learning_rate": 2.3413508691043165e-06, + "loss": 0.0066, + "step": 39996 + }, + { + "epoch": 0.79996, + "grad_norm": 0.046142254024744034, + "learning_rate": 2.3404531442938604e-06, + "loss": 0.0035, + "step": 39998 + }, + { + "epoch": 0.8, + "grad_norm": 0.13134565949440002, + "learning_rate": 2.339555568810221e-06, + "loss": 0.0035, + "step": 40000 + }, + { + "epoch": 0.80004, + "grad_norm": 0.04512129724025726, + "learning_rate": 2.3386581426708867e-06, + "loss": 0.0008, + "step": 40002 + }, + { + "epoch": 0.80008, + "grad_norm": 0.054434869438409805, + "learning_rate": 2.337760865893356e-06, + "loss": 0.0021, + "step": 40004 + }, + { + "epoch": 0.80012, + "grad_norm": 0.13070155680179596, + "learning_rate": 2.336863738495122e-06, + "loss": 0.0017, + "step": 40006 + }, + { + "epoch": 0.80016, + "grad_norm": 0.011462952941656113, + "learning_rate": 2.3359667604936773e-06, + "loss": 0.0133, + "step": 40008 + }, + { + "epoch": 0.8002, + "grad_norm": 0.008151198737323284, + "learning_rate": 2.335069931906503e-06, + "loss": 0.0002, + "step": 40010 + }, + { + "epoch": 0.80024, + "grad_norm": 0.12932425737380981, + "learning_rate": 2.3341732527510874e-06, + "loss": 0.0024, + "step": 40012 + }, + { + "epoch": 0.80028, + "grad_norm": 0.48172977566719055, + "learning_rate": 2.3332767230449107e-06, + "loss": 0.0069, + "step": 40014 + }, + { + "epoch": 0.80032, + "grad_norm": 0.03321640565991402, + "learning_rate": 2.3323803428054516e-06, + "loss": 0.0004, + "step": 40016 + }, + { + "epoch": 0.80036, + "grad_norm": 0.08897783607244492, + "learning_rate": 2.3314841120501876e-06, + "loss": 0.0014, + "step": 40018 + }, + { + "epoch": 0.8004, + "grad_norm": 0.2624671459197998, + "learning_rate": 2.3305880307965834e-06, + "loss": 0.0028, + "step": 40020 + }, + { + "epoch": 0.80044, + "grad_norm": 0.04441326484084129, + "learning_rate": 2.32969209906212e-06, + "loss": 0.0027, + "step": 40022 + }, + { + "epoch": 0.80048, + "grad_norm": 0.02513796091079712, + "learning_rate": 2.3287963168642546e-06, + "loss": 0.0013, + "step": 40024 + }, + { + "epoch": 0.80052, + "grad_norm": 0.0663326233625412, + "learning_rate": 2.3279006842204544e-06, + "loss": 0.0007, + "step": 40026 + }, + { + "epoch": 0.80056, + "grad_norm": 0.11281615495681763, + "learning_rate": 2.3270052011481804e-06, + "loss": 0.0027, + "step": 40028 + }, + { + "epoch": 0.8006, + "grad_norm": 0.004832735285162926, + "learning_rate": 2.3261098676648908e-06, + "loss": 0.0002, + "step": 40030 + }, + { + "epoch": 0.80064, + "grad_norm": 0.1471533179283142, + "learning_rate": 2.325214683788043e-06, + "loss": 0.0023, + "step": 40032 + }, + { + "epoch": 0.80068, + "grad_norm": 0.12875594198703766, + "learning_rate": 2.324319649535083e-06, + "loss": 0.0014, + "step": 40034 + }, + { + "epoch": 0.80072, + "grad_norm": 0.023972878232598305, + "learning_rate": 2.323424764923463e-06, + "loss": 0.0002, + "step": 40036 + }, + { + "epoch": 0.80076, + "grad_norm": 0.562940776348114, + "learning_rate": 2.322530029970629e-06, + "loss": 0.0074, + "step": 40038 + }, + { + "epoch": 0.8008, + "grad_norm": 0.5315657258033752, + "learning_rate": 2.321635444694028e-06, + "loss": 0.008, + "step": 40040 + }, + { + "epoch": 0.80084, + "grad_norm": 1.2772436141967773, + "learning_rate": 2.3207410091110914e-06, + "loss": 0.0161, + "step": 40042 + }, + { + "epoch": 0.80088, + "grad_norm": 1.6653978824615479, + "learning_rate": 2.3198467232392686e-06, + "loss": 0.027, + "step": 40044 + }, + { + "epoch": 0.80092, + "grad_norm": 0.433991938829422, + "learning_rate": 2.318952587095984e-06, + "loss": 0.0063, + "step": 40046 + }, + { + "epoch": 0.80096, + "grad_norm": 0.17032209038734436, + "learning_rate": 2.318058600698674e-06, + "loss": 0.1171, + "step": 40048 + }, + { + "epoch": 0.801, + "grad_norm": 0.2028113752603531, + "learning_rate": 2.317164764064769e-06, + "loss": 0.0043, + "step": 40050 + }, + { + "epoch": 0.80104, + "grad_norm": 0.3987536132335663, + "learning_rate": 2.316271077211687e-06, + "loss": 0.0054, + "step": 40052 + }, + { + "epoch": 0.80108, + "grad_norm": 0.030514230951666832, + "learning_rate": 2.3153775401568613e-06, + "loss": 0.0006, + "step": 40054 + }, + { + "epoch": 0.80112, + "grad_norm": 0.011750804260373116, + "learning_rate": 2.3144841529177044e-06, + "loss": 0.0015, + "step": 40056 + }, + { + "epoch": 0.80116, + "grad_norm": 0.4964043200016022, + "learning_rate": 2.3135909155116354e-06, + "loss": 0.0044, + "step": 40058 + }, + { + "epoch": 0.8012, + "grad_norm": 0.1007123813033104, + "learning_rate": 2.3126978279560687e-06, + "loss": 0.0013, + "step": 40060 + }, + { + "epoch": 0.80124, + "grad_norm": 0.0220542773604393, + "learning_rate": 2.3118048902684165e-06, + "loss": 0.0004, + "step": 40062 + }, + { + "epoch": 0.80128, + "grad_norm": 0.02381562441587448, + "learning_rate": 2.3109121024660873e-06, + "loss": 0.0015, + "step": 40064 + }, + { + "epoch": 0.80132, + "grad_norm": 0.016221145167946815, + "learning_rate": 2.310019464566484e-06, + "loss": 0.0003, + "step": 40066 + }, + { + "epoch": 0.80136, + "grad_norm": 0.019895898178219795, + "learning_rate": 2.3091269765870096e-06, + "loss": 0.0026, + "step": 40068 + }, + { + "epoch": 0.8014, + "grad_norm": 0.5766962170600891, + "learning_rate": 2.308234638545064e-06, + "loss": 0.0071, + "step": 40070 + }, + { + "epoch": 0.80144, + "grad_norm": 0.6526752710342407, + "learning_rate": 2.307342450458048e-06, + "loss": 0.0102, + "step": 40072 + }, + { + "epoch": 0.80148, + "grad_norm": 0.026338893920183182, + "learning_rate": 2.306450412343344e-06, + "loss": 0.0002, + "step": 40074 + }, + { + "epoch": 0.80152, + "grad_norm": 0.12783317267894745, + "learning_rate": 2.3055585242183574e-06, + "loss": 0.0016, + "step": 40076 + }, + { + "epoch": 0.80156, + "grad_norm": 0.13530197739601135, + "learning_rate": 2.3046667861004646e-06, + "loss": 0.0118, + "step": 40078 + }, + { + "epoch": 0.8016, + "grad_norm": 0.12985701858997345, + "learning_rate": 2.3037751980070557e-06, + "loss": 0.0027, + "step": 40080 + }, + { + "epoch": 0.80164, + "grad_norm": 0.05077918991446495, + "learning_rate": 2.302883759955511e-06, + "loss": 0.0799, + "step": 40082 + }, + { + "epoch": 0.80168, + "grad_norm": 0.11756107211112976, + "learning_rate": 2.3019924719632103e-06, + "loss": 0.0386, + "step": 40084 + }, + { + "epoch": 0.80172, + "grad_norm": 1.6036376953125, + "learning_rate": 2.301101334047533e-06, + "loss": 0.0154, + "step": 40086 + }, + { + "epoch": 0.80176, + "grad_norm": 1.489960789680481, + "learning_rate": 2.3002103462258453e-06, + "loss": 0.0167, + "step": 40088 + }, + { + "epoch": 0.8018, + "grad_norm": 0.01951507478952408, + "learning_rate": 2.2993195085155205e-06, + "loss": 0.001, + "step": 40090 + }, + { + "epoch": 0.80184, + "grad_norm": 0.013177023269236088, + "learning_rate": 2.2984288209339278e-06, + "loss": 0.0065, + "step": 40092 + }, + { + "epoch": 0.80188, + "grad_norm": 0.23273134231567383, + "learning_rate": 2.2975382834984294e-06, + "loss": 0.0039, + "step": 40094 + }, + { + "epoch": 0.80192, + "grad_norm": 0.018373379483819008, + "learning_rate": 2.2966478962263885e-06, + "loss": 0.001, + "step": 40096 + }, + { + "epoch": 0.80196, + "grad_norm": 0.20710167288780212, + "learning_rate": 2.295757659135165e-06, + "loss": 0.0064, + "step": 40098 + }, + { + "epoch": 0.802, + "grad_norm": 0.1522323489189148, + "learning_rate": 2.2948675722421086e-06, + "loss": 0.0021, + "step": 40100 + }, + { + "epoch": 0.80204, + "grad_norm": 0.16086913645267487, + "learning_rate": 2.293977635564577e-06, + "loss": 0.0054, + "step": 40102 + }, + { + "epoch": 0.80208, + "grad_norm": 0.18351249396800995, + "learning_rate": 2.2930878491199182e-06, + "loss": 0.0022, + "step": 40104 + }, + { + "epoch": 0.80212, + "grad_norm": 0.25014689564704895, + "learning_rate": 2.2921982129254806e-06, + "loss": 0.0032, + "step": 40106 + }, + { + "epoch": 0.80216, + "grad_norm": 0.014963624998927116, + "learning_rate": 2.291308726998609e-06, + "loss": 0.0012, + "step": 40108 + }, + { + "epoch": 0.8022, + "grad_norm": 0.26527467370033264, + "learning_rate": 2.2904193913566363e-06, + "loss": 0.0034, + "step": 40110 + }, + { + "epoch": 0.80224, + "grad_norm": 0.009125037118792534, + "learning_rate": 2.2895302060169135e-06, + "loss": 0.0962, + "step": 40112 + }, + { + "epoch": 0.80228, + "grad_norm": 0.028168076649308205, + "learning_rate": 2.2886411709967647e-06, + "loss": 0.001, + "step": 40114 + }, + { + "epoch": 0.80232, + "grad_norm": 0.03891298547387123, + "learning_rate": 2.2877522863135283e-06, + "loss": 0.0007, + "step": 40116 + }, + { + "epoch": 0.80236, + "grad_norm": 0.22330407798290253, + "learning_rate": 2.2868635519845328e-06, + "loss": 0.0022, + "step": 40118 + }, + { + "epoch": 0.8024, + "grad_norm": 0.05592722445726395, + "learning_rate": 2.2859749680270983e-06, + "loss": 0.001, + "step": 40120 + }, + { + "epoch": 0.80244, + "grad_norm": 0.005747529678046703, + "learning_rate": 2.2850865344585595e-06, + "loss": 0.0025, + "step": 40122 + }, + { + "epoch": 0.80248, + "grad_norm": 0.052604060620069504, + "learning_rate": 2.2841982512962267e-06, + "loss": 0.0017, + "step": 40124 + }, + { + "epoch": 0.80252, + "grad_norm": 0.15381786227226257, + "learning_rate": 2.2833101185574212e-06, + "loss": 0.0018, + "step": 40126 + }, + { + "epoch": 0.80256, + "grad_norm": 16.16809844970703, + "learning_rate": 2.282422136259459e-06, + "loss": 0.897, + "step": 40128 + }, + { + "epoch": 0.8026, + "grad_norm": 0.05838136747479439, + "learning_rate": 2.2815343044196523e-06, + "loss": 0.0012, + "step": 40130 + }, + { + "epoch": 0.80264, + "grad_norm": 0.3632507920265198, + "learning_rate": 2.280646623055306e-06, + "loss": 0.0043, + "step": 40132 + }, + { + "epoch": 0.80268, + "grad_norm": 1.0277187824249268, + "learning_rate": 2.2797590921837274e-06, + "loss": 0.009, + "step": 40134 + }, + { + "epoch": 0.80272, + "grad_norm": 0.009891382418572903, + "learning_rate": 2.2788717118222203e-06, + "loss": 0.0377, + "step": 40136 + }, + { + "epoch": 0.80276, + "grad_norm": 0.1895674467086792, + "learning_rate": 2.2779844819880847e-06, + "loss": 0.0023, + "step": 40138 + }, + { + "epoch": 0.8028, + "grad_norm": 0.01641271449625492, + "learning_rate": 2.277097402698619e-06, + "loss": 0.0036, + "step": 40140 + }, + { + "epoch": 0.80284, + "grad_norm": 0.04678908362984657, + "learning_rate": 2.27621047397111e-06, + "loss": 0.0075, + "step": 40142 + }, + { + "epoch": 0.80288, + "grad_norm": 0.06505586206912994, + "learning_rate": 2.275323695822861e-06, + "loss": 0.0015, + "step": 40144 + }, + { + "epoch": 0.80292, + "grad_norm": 0.8819131851196289, + "learning_rate": 2.274437068271149e-06, + "loss": 0.0082, + "step": 40146 + }, + { + "epoch": 0.80296, + "grad_norm": 0.012449941597878933, + "learning_rate": 2.2735505913332644e-06, + "loss": 0.0002, + "step": 40148 + }, + { + "epoch": 0.803, + "grad_norm": 0.02305869571864605, + "learning_rate": 2.27266426502649e-06, + "loss": 0.0009, + "step": 40150 + }, + { + "epoch": 0.80304, + "grad_norm": 0.04535587877035141, + "learning_rate": 2.2717780893681028e-06, + "loss": 0.0006, + "step": 40152 + }, + { + "epoch": 0.80308, + "grad_norm": 0.11744172126054764, + "learning_rate": 2.270892064375384e-06, + "loss": 0.0014, + "step": 40154 + }, + { + "epoch": 0.80312, + "grad_norm": 0.012988526374101639, + "learning_rate": 2.2700061900656002e-06, + "loss": 0.0009, + "step": 40156 + }, + { + "epoch": 0.80316, + "grad_norm": 0.08397096395492554, + "learning_rate": 2.269120466456025e-06, + "loss": 0.031, + "step": 40158 + }, + { + "epoch": 0.8032, + "grad_norm": 0.35424184799194336, + "learning_rate": 2.2682348935639274e-06, + "loss": 0.0079, + "step": 40160 + }, + { + "epoch": 0.80324, + "grad_norm": 0.060457952320575714, + "learning_rate": 2.2673494714065736e-06, + "loss": 0.19, + "step": 40162 + }, + { + "epoch": 0.80328, + "grad_norm": 0.3153594136238098, + "learning_rate": 2.266464200001217e-06, + "loss": 0.007, + "step": 40164 + }, + { + "epoch": 0.80332, + "grad_norm": 0.35289737582206726, + "learning_rate": 2.2655790793651288e-06, + "loss": 0.0049, + "step": 40166 + }, + { + "epoch": 0.80336, + "grad_norm": 0.009734692983329296, + "learning_rate": 2.264694109515555e-06, + "loss": 0.1896, + "step": 40168 + }, + { + "epoch": 0.8034, + "grad_norm": 1.7853021621704102, + "learning_rate": 2.2638092904697516e-06, + "loss": 0.0193, + "step": 40170 + }, + { + "epoch": 0.80344, + "grad_norm": 0.022112293168902397, + "learning_rate": 2.262924622244973e-06, + "loss": 0.0053, + "step": 40172 + }, + { + "epoch": 0.80348, + "grad_norm": 0.08892888575792313, + "learning_rate": 2.262040104858455e-06, + "loss": 0.0014, + "step": 40174 + }, + { + "epoch": 0.80352, + "grad_norm": 1.95011568069458, + "learning_rate": 2.261155738327455e-06, + "loss": 0.0217, + "step": 40176 + }, + { + "epoch": 0.80356, + "grad_norm": 0.3482917845249176, + "learning_rate": 2.2602715226692064e-06, + "loss": 0.0034, + "step": 40178 + }, + { + "epoch": 0.8036, + "grad_norm": 0.06757109612226486, + "learning_rate": 2.259387457900948e-06, + "loss": 0.0012, + "step": 40180 + }, + { + "epoch": 0.80364, + "grad_norm": 0.005201201420277357, + "learning_rate": 2.2585035440399162e-06, + "loss": 0.0064, + "step": 40182 + }, + { + "epoch": 0.80368, + "grad_norm": 0.005801192484796047, + "learning_rate": 2.257619781103344e-06, + "loss": 0.0045, + "step": 40184 + }, + { + "epoch": 0.80372, + "grad_norm": 0.005555194336920977, + "learning_rate": 2.256736169108463e-06, + "loss": 0.001, + "step": 40186 + }, + { + "epoch": 0.80376, + "grad_norm": 0.010084817185997963, + "learning_rate": 2.2558527080724936e-06, + "loss": 0.0002, + "step": 40188 + }, + { + "epoch": 0.8038, + "grad_norm": 0.005978851113468409, + "learning_rate": 2.254969398012663e-06, + "loss": 0.0001, + "step": 40190 + }, + { + "epoch": 0.80384, + "grad_norm": 0.05861804634332657, + "learning_rate": 2.2540862389461917e-06, + "loss": 0.0011, + "step": 40192 + }, + { + "epoch": 0.80388, + "grad_norm": 0.06354363262653351, + "learning_rate": 2.2532032308902985e-06, + "loss": 0.0019, + "step": 40194 + }, + { + "epoch": 0.80392, + "grad_norm": 9.286205022362992e-05, + "learning_rate": 2.2523203738621924e-06, + "loss": 0.0009, + "step": 40196 + }, + { + "epoch": 0.80396, + "grad_norm": 2.1823158264160156, + "learning_rate": 2.251437667879094e-06, + "loss": 0.0257, + "step": 40198 + }, + { + "epoch": 0.804, + "grad_norm": 0.0420895479619503, + "learning_rate": 2.2505551129582047e-06, + "loss": 0.0014, + "step": 40200 + }, + { + "epoch": 0.80404, + "grad_norm": 0.24990971386432648, + "learning_rate": 2.249672709116735e-06, + "loss": 0.003, + "step": 40202 + }, + { + "epoch": 0.80408, + "grad_norm": 0.026755979284644127, + "learning_rate": 2.2487904563718844e-06, + "loss": 0.0025, + "step": 40204 + }, + { + "epoch": 0.80412, + "grad_norm": 0.002051054500043392, + "learning_rate": 2.247908354740854e-06, + "loss": 0.0002, + "step": 40206 + }, + { + "epoch": 0.80416, + "grad_norm": 0.08352583646774292, + "learning_rate": 2.2470264042408452e-06, + "loss": 0.0009, + "step": 40208 + }, + { + "epoch": 0.8042, + "grad_norm": 0.18195784091949463, + "learning_rate": 2.2461446048890424e-06, + "loss": 0.0018, + "step": 40210 + }, + { + "epoch": 0.80424, + "grad_norm": 0.16938389837741852, + "learning_rate": 2.2452629567026475e-06, + "loss": 0.0022, + "step": 40212 + }, + { + "epoch": 0.80428, + "grad_norm": 0.1710880547761917, + "learning_rate": 2.244381459698841e-06, + "loss": 0.1774, + "step": 40214 + }, + { + "epoch": 0.80432, + "grad_norm": 0.7287099361419678, + "learning_rate": 2.2435001138948108e-06, + "loss": 0.012, + "step": 40216 + }, + { + "epoch": 0.80436, + "grad_norm": 0.0032963543199002743, + "learning_rate": 2.24261891930774e-06, + "loss": 0.0019, + "step": 40218 + }, + { + "epoch": 0.8044, + "grad_norm": 0.06786112487316132, + "learning_rate": 2.241737875954808e-06, + "loss": 0.0016, + "step": 40220 + }, + { + "epoch": 0.80444, + "grad_norm": 7.835666656494141, + "learning_rate": 2.240856983853189e-06, + "loss": 0.0822, + "step": 40222 + }, + { + "epoch": 0.80448, + "grad_norm": 0.027506565675139427, + "learning_rate": 2.239976243020058e-06, + "loss": 0.0171, + "step": 40224 + }, + { + "epoch": 0.80452, + "grad_norm": 0.12509535253047943, + "learning_rate": 2.2390956534725848e-06, + "loss": 0.1166, + "step": 40226 + }, + { + "epoch": 0.80456, + "grad_norm": 0.291996568441391, + "learning_rate": 2.238215215227937e-06, + "loss": 0.0028, + "step": 40228 + }, + { + "epoch": 0.8046, + "grad_norm": 0.10495610535144806, + "learning_rate": 2.237334928303283e-06, + "loss": 0.0014, + "step": 40230 + }, + { + "epoch": 0.80464, + "grad_norm": 0.0009505589841865003, + "learning_rate": 2.2364547927157755e-06, + "loss": 0.0149, + "step": 40232 + }, + { + "epoch": 0.80468, + "grad_norm": 0.02085408754646778, + "learning_rate": 2.2355748084825847e-06, + "loss": 0.0007, + "step": 40234 + }, + { + "epoch": 0.80472, + "grad_norm": 0.030658261850476265, + "learning_rate": 2.234694975620857e-06, + "loss": 0.0008, + "step": 40236 + }, + { + "epoch": 0.80476, + "grad_norm": 0.005571441259235144, + "learning_rate": 2.2338152941477487e-06, + "loss": 0.0031, + "step": 40238 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4561085104942322, + "learning_rate": 2.2329357640804118e-06, + "loss": 0.0052, + "step": 40240 + }, + { + "epoch": 0.80484, + "grad_norm": 0.024251218885183334, + "learning_rate": 2.2320563854359855e-06, + "loss": 0.0003, + "step": 40242 + }, + { + "epoch": 0.80488, + "grad_norm": 0.16869929432868958, + "learning_rate": 2.2311771582316257e-06, + "loss": 0.0018, + "step": 40244 + }, + { + "epoch": 0.80492, + "grad_norm": 0.050253238528966904, + "learning_rate": 2.230298082484463e-06, + "loss": 0.0024, + "step": 40246 + }, + { + "epoch": 0.80496, + "grad_norm": 2.9044337272644043, + "learning_rate": 2.2294191582116397e-06, + "loss": 0.027, + "step": 40248 + }, + { + "epoch": 0.805, + "grad_norm": 0.0512562170624733, + "learning_rate": 2.2285403854302912e-06, + "loss": 0.0012, + "step": 40250 + }, + { + "epoch": 0.80504, + "grad_norm": 0.6366552710533142, + "learning_rate": 2.2276617641575503e-06, + "loss": 0.0069, + "step": 40252 + }, + { + "epoch": 0.80508, + "grad_norm": 0.007484450004994869, + "learning_rate": 2.226783294410544e-06, + "loss": 0.004, + "step": 40254 + }, + { + "epoch": 0.80512, + "grad_norm": 0.014951524324715137, + "learning_rate": 2.2259049762063978e-06, + "loss": 0.0014, + "step": 40256 + }, + { + "epoch": 0.80516, + "grad_norm": 0.30017492175102234, + "learning_rate": 2.2250268095622364e-06, + "loss": 0.0036, + "step": 40258 + }, + { + "epoch": 0.8052, + "grad_norm": 0.06985579431056976, + "learning_rate": 2.22414879449518e-06, + "loss": 0.0041, + "step": 40260 + }, + { + "epoch": 0.80524, + "grad_norm": 0.4527776837348938, + "learning_rate": 2.22327093102235e-06, + "loss": 0.0047, + "step": 40262 + }, + { + "epoch": 0.80528, + "grad_norm": 11.09804630279541, + "learning_rate": 2.222393219160851e-06, + "loss": 0.1627, + "step": 40264 + }, + { + "epoch": 0.80532, + "grad_norm": 0.09245135635137558, + "learning_rate": 2.2215156589278052e-06, + "loss": 0.0008, + "step": 40266 + }, + { + "epoch": 0.80536, + "grad_norm": 0.14225447177886963, + "learning_rate": 2.2206382503403146e-06, + "loss": 0.4313, + "step": 40268 + }, + { + "epoch": 0.8054, + "grad_norm": 0.030719313770532608, + "learning_rate": 2.219760993415485e-06, + "loss": 0.0135, + "step": 40270 + }, + { + "epoch": 0.80544, + "grad_norm": 0.6257684230804443, + "learning_rate": 2.218883888170421e-06, + "loss": 0.0076, + "step": 40272 + }, + { + "epoch": 0.80548, + "grad_norm": 0.12106110155582428, + "learning_rate": 2.218006934622222e-06, + "loss": 0.0037, + "step": 40274 + }, + { + "epoch": 0.80552, + "grad_norm": 0.607171356678009, + "learning_rate": 2.2171301327879867e-06, + "loss": 0.0082, + "step": 40276 + }, + { + "epoch": 0.80556, + "grad_norm": 0.08168953657150269, + "learning_rate": 2.2162534826848036e-06, + "loss": 0.0044, + "step": 40278 + }, + { + "epoch": 0.8056, + "grad_norm": 0.23762273788452148, + "learning_rate": 2.215376984329767e-06, + "loss": 0.0069, + "step": 40280 + }, + { + "epoch": 0.80564, + "grad_norm": 0.3356786370277405, + "learning_rate": 2.2145006377399623e-06, + "loss": 0.0043, + "step": 40282 + }, + { + "epoch": 0.80568, + "grad_norm": 0.03092443197965622, + "learning_rate": 2.2136244429324793e-06, + "loss": 0.0095, + "step": 40284 + }, + { + "epoch": 0.80572, + "grad_norm": 0.05482104420661926, + "learning_rate": 2.2127483999243915e-06, + "loss": 0.0006, + "step": 40286 + }, + { + "epoch": 0.80576, + "grad_norm": 0.09449297934770584, + "learning_rate": 2.2118725087327887e-06, + "loss": 0.0014, + "step": 40288 + }, + { + "epoch": 0.8058, + "grad_norm": 0.0009977365843951702, + "learning_rate": 2.210996769374737e-06, + "loss": 0.0003, + "step": 40290 + }, + { + "epoch": 0.80584, + "grad_norm": 0.011430549435317516, + "learning_rate": 2.210121181867314e-06, + "loss": 0.0025, + "step": 40292 + }, + { + "epoch": 0.80588, + "grad_norm": 0.09107626974582672, + "learning_rate": 2.2092457462275906e-06, + "loss": 0.0009, + "step": 40294 + }, + { + "epoch": 0.80592, + "grad_norm": 0.017016427591443062, + "learning_rate": 2.208370462472629e-06, + "loss": 0.0011, + "step": 40296 + }, + { + "epoch": 0.80596, + "grad_norm": 11.951042175292969, + "learning_rate": 2.2074953306195002e-06, + "loss": 0.4426, + "step": 40298 + }, + { + "epoch": 0.806, + "grad_norm": 0.1546524316072464, + "learning_rate": 2.206620350685257e-06, + "loss": 0.0023, + "step": 40300 + }, + { + "epoch": 0.80604, + "grad_norm": 0.09400918334722519, + "learning_rate": 2.2057455226869684e-06, + "loss": 0.0057, + "step": 40302 + }, + { + "epoch": 0.80608, + "grad_norm": 0.5747314095497131, + "learning_rate": 2.20487084664168e-06, + "loss": 0.0302, + "step": 40304 + }, + { + "epoch": 0.80612, + "grad_norm": 0.3786856532096863, + "learning_rate": 2.2039963225664486e-06, + "loss": 0.0041, + "step": 40306 + }, + { + "epoch": 0.80616, + "grad_norm": 0.1334161013364792, + "learning_rate": 2.203121950478324e-06, + "loss": 0.0076, + "step": 40308 + }, + { + "epoch": 0.8062, + "grad_norm": 0.03224088251590729, + "learning_rate": 2.202247730394349e-06, + "loss": 0.0018, + "step": 40310 + }, + { + "epoch": 0.80624, + "grad_norm": 0.01328386552631855, + "learning_rate": 2.2013736623315695e-06, + "loss": 0.0005, + "step": 40312 + }, + { + "epoch": 0.80628, + "grad_norm": 0.002915566088631749, + "learning_rate": 2.2004997463070245e-06, + "loss": 0.0081, + "step": 40314 + }, + { + "epoch": 0.80632, + "grad_norm": 0.042690083384513855, + "learning_rate": 2.1996259823377532e-06, + "loss": 0.0012, + "step": 40316 + }, + { + "epoch": 0.80636, + "grad_norm": 0.00011479188106022775, + "learning_rate": 2.1987523704407886e-06, + "loss": 0.0001, + "step": 40318 + }, + { + "epoch": 0.8064, + "grad_norm": 0.5908628702163696, + "learning_rate": 2.1978789106331666e-06, + "loss": 0.01, + "step": 40320 + }, + { + "epoch": 0.80644, + "grad_norm": 0.06026080995798111, + "learning_rate": 2.1970056029319087e-06, + "loss": 0.0056, + "step": 40322 + }, + { + "epoch": 0.80648, + "grad_norm": 0.007949216291308403, + "learning_rate": 2.1961324473540434e-06, + "loss": 0.0007, + "step": 40324 + }, + { + "epoch": 0.80652, + "grad_norm": 0.1636258363723755, + "learning_rate": 2.1952594439165976e-06, + "loss": 0.0033, + "step": 40326 + }, + { + "epoch": 0.80656, + "grad_norm": 0.11460039764642715, + "learning_rate": 2.19438659263658e-06, + "loss": 0.0015, + "step": 40328 + }, + { + "epoch": 0.8066, + "grad_norm": 0.046263761818408966, + "learning_rate": 2.1935138935310208e-06, + "loss": 0.1161, + "step": 40330 + }, + { + "epoch": 0.80664, + "grad_norm": 0.03816003352403641, + "learning_rate": 2.1926413466169214e-06, + "loss": 0.0272, + "step": 40332 + }, + { + "epoch": 0.80668, + "grad_norm": 0.003467412432655692, + "learning_rate": 2.191768951911305e-06, + "loss": 0.0003, + "step": 40334 + }, + { + "epoch": 0.80672, + "grad_norm": 0.002777245594188571, + "learning_rate": 2.1908967094311693e-06, + "loss": 0.0, + "step": 40336 + }, + { + "epoch": 0.80676, + "grad_norm": 0.0004041787760797888, + "learning_rate": 2.1900246191935227e-06, + "loss": 0.0, + "step": 40338 + }, + { + "epoch": 0.8068, + "grad_norm": 0.001845690654590726, + "learning_rate": 2.1891526812153674e-06, + "loss": 0.0001, + "step": 40340 + }, + { + "epoch": 0.80684, + "grad_norm": 0.3205163776874542, + "learning_rate": 2.188280895513705e-06, + "loss": 0.0036, + "step": 40342 + }, + { + "epoch": 0.80688, + "grad_norm": 0.045968059450387955, + "learning_rate": 2.187409262105524e-06, + "loss": 0.0006, + "step": 40344 + }, + { + "epoch": 0.80692, + "grad_norm": 0.03642125055193901, + "learning_rate": 2.1865377810078227e-06, + "loss": 0.0003, + "step": 40346 + }, + { + "epoch": 0.80696, + "grad_norm": 0.018021630123257637, + "learning_rate": 2.1856664522375893e-06, + "loss": 0.0026, + "step": 40348 + }, + { + "epoch": 0.807, + "grad_norm": 0.7058955430984497, + "learning_rate": 2.1847952758118118e-06, + "loss": 0.0106, + "step": 40350 + }, + { + "epoch": 0.80704, + "grad_norm": 0.947364330291748, + "learning_rate": 2.183924251747477e-06, + "loss": 0.0111, + "step": 40352 + }, + { + "epoch": 0.80708, + "grad_norm": 0.0008255366701632738, + "learning_rate": 2.183053380061557e-06, + "loss": 0.0, + "step": 40354 + }, + { + "epoch": 0.80712, + "grad_norm": 0.08279673755168915, + "learning_rate": 2.1821826607710416e-06, + "loss": 0.0123, + "step": 40356 + }, + { + "epoch": 0.80716, + "grad_norm": 0.2542854845523834, + "learning_rate": 2.1813120938928976e-06, + "loss": 0.0116, + "step": 40358 + }, + { + "epoch": 0.8072, + "grad_norm": 0.8167082667350769, + "learning_rate": 2.1804416794441e-06, + "loss": 0.0072, + "step": 40360 + }, + { + "epoch": 0.80724, + "grad_norm": 0.08823342621326447, + "learning_rate": 2.1795714174416195e-06, + "loss": 0.001, + "step": 40362 + }, + { + "epoch": 0.80728, + "grad_norm": 0.26685622334480286, + "learning_rate": 2.1787013079024143e-06, + "loss": 0.0041, + "step": 40364 + }, + { + "epoch": 0.80732, + "grad_norm": 0.16566993296146393, + "learning_rate": 2.177831350843461e-06, + "loss": 0.0057, + "step": 40366 + }, + { + "epoch": 0.80736, + "grad_norm": 0.17289046943187714, + "learning_rate": 2.1769615462817085e-06, + "loss": 0.012, + "step": 40368 + }, + { + "epoch": 0.8074, + "grad_norm": 0.08162564784288406, + "learning_rate": 2.1760918942341193e-06, + "loss": 0.0015, + "step": 40370 + }, + { + "epoch": 0.80744, + "grad_norm": 0.22555983066558838, + "learning_rate": 2.1752223947176466e-06, + "loss": 0.0893, + "step": 40372 + }, + { + "epoch": 0.80748, + "grad_norm": 0.31406062841415405, + "learning_rate": 2.1743530477492438e-06, + "loss": 0.0033, + "step": 40374 + }, + { + "epoch": 0.80752, + "grad_norm": 0.1278936266899109, + "learning_rate": 2.1734838533458548e-06, + "loss": 0.0012, + "step": 40376 + }, + { + "epoch": 0.80756, + "grad_norm": 0.006611743476241827, + "learning_rate": 2.172614811524427e-06, + "loss": 0.1625, + "step": 40378 + }, + { + "epoch": 0.8076, + "grad_norm": 0.010130767710506916, + "learning_rate": 2.171745922301903e-06, + "loss": 0.0005, + "step": 40380 + }, + { + "epoch": 0.80764, + "grad_norm": 0.03080233372747898, + "learning_rate": 2.170877185695224e-06, + "loss": 0.003, + "step": 40382 + }, + { + "epoch": 0.80768, + "grad_norm": 0.8940938115119934, + "learning_rate": 2.170008601721326e-06, + "loss": 0.0123, + "step": 40384 + }, + { + "epoch": 0.80772, + "grad_norm": 0.09731322526931763, + "learning_rate": 2.1691401703971373e-06, + "loss": 0.0249, + "step": 40386 + }, + { + "epoch": 0.80776, + "grad_norm": 0.11140146106481552, + "learning_rate": 2.1682718917395973e-06, + "loss": 0.0016, + "step": 40388 + }, + { + "epoch": 0.8078, + "grad_norm": 0.043508097529411316, + "learning_rate": 2.1674037657656265e-06, + "loss": 0.0006, + "step": 40390 + }, + { + "epoch": 0.80784, + "grad_norm": 0.00578870065510273, + "learning_rate": 2.1665357924921517e-06, + "loss": 0.6493, + "step": 40392 + }, + { + "epoch": 0.80788, + "grad_norm": 0.7127072215080261, + "learning_rate": 2.1656679719360974e-06, + "loss": 0.0102, + "step": 40394 + }, + { + "epoch": 0.80792, + "grad_norm": 0.02163851447403431, + "learning_rate": 2.164800304114374e-06, + "loss": 0.0006, + "step": 40396 + }, + { + "epoch": 0.80796, + "grad_norm": 0.02526041865348816, + "learning_rate": 2.1639327890439076e-06, + "loss": 0.0134, + "step": 40398 + }, + { + "epoch": 0.808, + "grad_norm": 0.05041968822479248, + "learning_rate": 2.163065426741603e-06, + "loss": 0.001, + "step": 40400 + }, + { + "epoch": 0.80804, + "grad_norm": 1.8405448198318481, + "learning_rate": 2.1621982172243727e-06, + "loss": 0.0153, + "step": 40402 + }, + { + "epoch": 0.80808, + "grad_norm": 0.2935488224029541, + "learning_rate": 2.1613311605091235e-06, + "loss": 0.003, + "step": 40404 + }, + { + "epoch": 0.80812, + "grad_norm": 0.08707653731107712, + "learning_rate": 2.1604642566127587e-06, + "loss": 0.0029, + "step": 40406 + }, + { + "epoch": 0.80816, + "grad_norm": 0.04528637230396271, + "learning_rate": 2.15959750555218e-06, + "loss": 0.0077, + "step": 40408 + }, + { + "epoch": 0.8082, + "grad_norm": 0.02977386862039566, + "learning_rate": 2.1587309073442865e-06, + "loss": 0.0021, + "step": 40410 + }, + { + "epoch": 0.80824, + "grad_norm": 0.007704003714025021, + "learning_rate": 2.1578644620059684e-06, + "loss": 0.004, + "step": 40412 + }, + { + "epoch": 0.80828, + "grad_norm": 2.488788604736328, + "learning_rate": 2.15699816955412e-06, + "loss": 0.0213, + "step": 40414 + }, + { + "epoch": 0.80832, + "grad_norm": 6.555973052978516, + "learning_rate": 2.1561320300056323e-06, + "loss": 0.0792, + "step": 40416 + }, + { + "epoch": 0.80836, + "grad_norm": 0.02055458351969719, + "learning_rate": 2.155266043377384e-06, + "loss": 0.0026, + "step": 40418 + }, + { + "epoch": 0.8084, + "grad_norm": 5.037328720092773, + "learning_rate": 2.154400209686268e-06, + "loss": 0.0648, + "step": 40420 + }, + { + "epoch": 0.80844, + "grad_norm": 0.07439775764942169, + "learning_rate": 2.1535345289491547e-06, + "loss": 0.001, + "step": 40422 + }, + { + "epoch": 0.80848, + "grad_norm": 0.2501787841320038, + "learning_rate": 2.1526690011829297e-06, + "loss": 0.0023, + "step": 40424 + }, + { + "epoch": 0.80852, + "grad_norm": 2.3680527210235596, + "learning_rate": 2.1518036264044605e-06, + "loss": 0.0311, + "step": 40426 + }, + { + "epoch": 0.80856, + "grad_norm": 0.6160816550254822, + "learning_rate": 2.1509384046306203e-06, + "loss": 0.0054, + "step": 40428 + }, + { + "epoch": 0.8086, + "grad_norm": 0.23439404368400574, + "learning_rate": 2.1500733358782786e-06, + "loss": 0.0021, + "step": 40430 + }, + { + "epoch": 0.80864, + "grad_norm": 0.1330760419368744, + "learning_rate": 2.149208420164297e-06, + "loss": 0.0037, + "step": 40432 + }, + { + "epoch": 0.80868, + "grad_norm": 0.10891197621822357, + "learning_rate": 2.148343657505538e-06, + "loss": 0.0047, + "step": 40434 + }, + { + "epoch": 0.80872, + "grad_norm": 0.02064662054181099, + "learning_rate": 2.1474790479188636e-06, + "loss": 0.0014, + "step": 40436 + }, + { + "epoch": 0.80876, + "grad_norm": 0.8022426962852478, + "learning_rate": 2.1466145914211266e-06, + "loss": 0.009, + "step": 40438 + }, + { + "epoch": 0.8088, + "grad_norm": 0.19690141081809998, + "learning_rate": 2.1457502880291815e-06, + "loss": 0.0017, + "step": 40440 + }, + { + "epoch": 0.80884, + "grad_norm": 0.010142671875655651, + "learning_rate": 2.1448861377598805e-06, + "loss": 0.0008, + "step": 40442 + }, + { + "epoch": 0.80888, + "grad_norm": 0.4775330722332001, + "learning_rate": 2.1440221406300654e-06, + "loss": 0.0045, + "step": 40444 + }, + { + "epoch": 0.80892, + "grad_norm": 0.16789130866527557, + "learning_rate": 2.1431582966565844e-06, + "loss": 0.0024, + "step": 40446 + }, + { + "epoch": 0.80896, + "grad_norm": 0.4949643909931183, + "learning_rate": 2.1422946058562788e-06, + "loss": 0.1427, + "step": 40448 + }, + { + "epoch": 0.809, + "grad_norm": 0.005725356284528971, + "learning_rate": 2.1414310682459805e-06, + "loss": 0.0082, + "step": 40450 + }, + { + "epoch": 0.80904, + "grad_norm": 0.06839549541473389, + "learning_rate": 2.140567683842535e-06, + "loss": 0.8559, + "step": 40452 + }, + { + "epoch": 0.80908, + "grad_norm": 0.008638526313006878, + "learning_rate": 2.139704452662763e-06, + "loss": 0.1157, + "step": 40454 + }, + { + "epoch": 0.80912, + "grad_norm": 0.15231876075267792, + "learning_rate": 2.138841374723506e-06, + "loss": 0.0014, + "step": 40456 + }, + { + "epoch": 0.80916, + "grad_norm": 0.07050865143537521, + "learning_rate": 2.137978450041579e-06, + "loss": 0.0529, + "step": 40458 + }, + { + "epoch": 0.8092, + "grad_norm": 0.02993226796388626, + "learning_rate": 2.1371156786338108e-06, + "loss": 0.0016, + "step": 40460 + }, + { + "epoch": 0.80924, + "grad_norm": 3.7888309955596924, + "learning_rate": 2.1362530605170227e-06, + "loss": 0.0349, + "step": 40462 + }, + { + "epoch": 0.80928, + "grad_norm": 0.6972890496253967, + "learning_rate": 2.135390595708028e-06, + "loss": 0.0091, + "step": 40464 + }, + { + "epoch": 0.80932, + "grad_norm": 0.389166921377182, + "learning_rate": 2.1345282842236427e-06, + "loss": 0.1538, + "step": 40466 + }, + { + "epoch": 0.80936, + "grad_norm": 0.20050878822803497, + "learning_rate": 2.1336661260806767e-06, + "loss": 0.0087, + "step": 40468 + }, + { + "epoch": 0.8094, + "grad_norm": 0.012153765186667442, + "learning_rate": 2.1328041212959403e-06, + "loss": 0.0045, + "step": 40470 + }, + { + "epoch": 0.80944, + "grad_norm": 0.016142386943101883, + "learning_rate": 2.131942269886237e-06, + "loss": 0.0005, + "step": 40472 + }, + { + "epoch": 0.80948, + "grad_norm": 4.452688694000244, + "learning_rate": 2.1310805718683737e-06, + "loss": 0.0422, + "step": 40474 + }, + { + "epoch": 0.80952, + "grad_norm": 0.20412415266036987, + "learning_rate": 2.1302190272591426e-06, + "loss": 0.0019, + "step": 40476 + }, + { + "epoch": 0.80956, + "grad_norm": 1.439944863319397, + "learning_rate": 2.129357636075343e-06, + "loss": 0.0151, + "step": 40478 + }, + { + "epoch": 0.8096, + "grad_norm": 0.004573892802000046, + "learning_rate": 2.128496398333768e-06, + "loss": 0.0021, + "step": 40480 + }, + { + "epoch": 0.80964, + "grad_norm": 14.690239906311035, + "learning_rate": 2.1276353140512097e-06, + "loss": 0.9386, + "step": 40482 + }, + { + "epoch": 0.80968, + "grad_norm": 0.027442054823040962, + "learning_rate": 2.1267743832444555e-06, + "loss": 0.0004, + "step": 40484 + }, + { + "epoch": 0.80972, + "grad_norm": 8.816210746765137, + "learning_rate": 2.125913605930283e-06, + "loss": 0.1059, + "step": 40486 + }, + { + "epoch": 0.80976, + "grad_norm": 0.7678293585777283, + "learning_rate": 2.125052982125484e-06, + "loss": 0.0502, + "step": 40488 + }, + { + "epoch": 0.8098, + "grad_norm": 0.07384584844112396, + "learning_rate": 2.1241925118468288e-06, + "loss": 0.0013, + "step": 40490 + }, + { + "epoch": 0.80984, + "grad_norm": 0.8035688996315002, + "learning_rate": 2.1233321951110952e-06, + "loss": 0.01, + "step": 40492 + }, + { + "epoch": 0.80988, + "grad_norm": 3.571643590927124, + "learning_rate": 2.1224720319350556e-06, + "loss": 0.0447, + "step": 40494 + }, + { + "epoch": 0.80992, + "grad_norm": 0.19850754737854004, + "learning_rate": 2.1216120223354817e-06, + "loss": 0.0017, + "step": 40496 + }, + { + "epoch": 0.80996, + "grad_norm": 0.10055943578481674, + "learning_rate": 2.1207521663291354e-06, + "loss": 0.0023, + "step": 40498 + }, + { + "epoch": 0.81, + "grad_norm": 0.07117875665426254, + "learning_rate": 2.119892463932781e-06, + "loss": 0.0063, + "step": 40500 + }, + { + "epoch": 0.81004, + "grad_norm": 0.037794873118400574, + "learning_rate": 2.1190329151631818e-06, + "loss": 0.0007, + "step": 40502 + }, + { + "epoch": 0.81008, + "grad_norm": 0.48601919412612915, + "learning_rate": 2.1181735200370924e-06, + "loss": 0.0123, + "step": 40504 + }, + { + "epoch": 0.81012, + "grad_norm": 0.2680770754814148, + "learning_rate": 2.1173142785712707e-06, + "loss": 0.0035, + "step": 40506 + }, + { + "epoch": 0.81016, + "grad_norm": 0.09019983559846878, + "learning_rate": 2.1164551907824592e-06, + "loss": 0.0026, + "step": 40508 + }, + { + "epoch": 0.8102, + "grad_norm": 0.08437006175518036, + "learning_rate": 2.115596256687419e-06, + "loss": 0.0007, + "step": 40510 + }, + { + "epoch": 0.81024, + "grad_norm": 0.25641778111457825, + "learning_rate": 2.114737476302886e-06, + "loss": 0.0027, + "step": 40512 + }, + { + "epoch": 0.81028, + "grad_norm": 0.02684866078197956, + "learning_rate": 2.113878849645605e-06, + "loss": 0.0007, + "step": 40514 + }, + { + "epoch": 0.81032, + "grad_norm": 0.297895610332489, + "learning_rate": 2.11302037673232e-06, + "loss": 0.0037, + "step": 40516 + }, + { + "epoch": 0.81036, + "grad_norm": 0.005651023704558611, + "learning_rate": 2.1121620575797566e-06, + "loss": 0.0045, + "step": 40518 + }, + { + "epoch": 0.8104, + "grad_norm": 0.03349275887012482, + "learning_rate": 2.1113038922046603e-06, + "loss": 0.0171, + "step": 40520 + }, + { + "epoch": 0.81044, + "grad_norm": 0.017004869878292084, + "learning_rate": 2.110445880623754e-06, + "loss": 0.0422, + "step": 40522 + }, + { + "epoch": 0.81048, + "grad_norm": 0.004231296479701996, + "learning_rate": 2.1095880228537656e-06, + "loss": 0.0003, + "step": 40524 + }, + { + "epoch": 0.81052, + "grad_norm": 0.00157267227768898, + "learning_rate": 2.1087303189114226e-06, + "loss": 0.0027, + "step": 40526 + }, + { + "epoch": 0.81056, + "grad_norm": 0.18693196773529053, + "learning_rate": 2.1078727688134435e-06, + "loss": 0.0045, + "step": 40528 + }, + { + "epoch": 0.8106, + "grad_norm": 0.032838888466358185, + "learning_rate": 2.107015372576552e-06, + "loss": 0.0016, + "step": 40530 + }, + { + "epoch": 0.81064, + "grad_norm": 0.027103880420327187, + "learning_rate": 2.106158130217456e-06, + "loss": 0.0008, + "step": 40532 + }, + { + "epoch": 0.81068, + "grad_norm": 0.009139073081314564, + "learning_rate": 2.1053010417528707e-06, + "loss": 0.0007, + "step": 40534 + }, + { + "epoch": 0.81072, + "grad_norm": 0.4293757677078247, + "learning_rate": 2.104444107199507e-06, + "loss": 0.0078, + "step": 40536 + }, + { + "epoch": 0.81076, + "grad_norm": 0.6344034075737, + "learning_rate": 2.1035873265740724e-06, + "loss": 0.0058, + "step": 40538 + }, + { + "epoch": 0.8108, + "grad_norm": 0.27882376313209534, + "learning_rate": 2.102730699893263e-06, + "loss": 0.0062, + "step": 40540 + }, + { + "epoch": 0.81084, + "grad_norm": 2.623703956604004, + "learning_rate": 2.101874227173789e-06, + "loss": 0.0302, + "step": 40542 + }, + { + "epoch": 0.81088, + "grad_norm": 0.09019487351179123, + "learning_rate": 2.1010179084323413e-06, + "loss": 0.0027, + "step": 40544 + }, + { + "epoch": 0.81092, + "grad_norm": 0.008643079549074173, + "learning_rate": 2.100161743685616e-06, + "loss": 0.001, + "step": 40546 + }, + { + "epoch": 0.81096, + "grad_norm": 0.010010127909481525, + "learning_rate": 2.0993057329503053e-06, + "loss": 0.0024, + "step": 40548 + }, + { + "epoch": 0.811, + "grad_norm": 0.0006858505657874048, + "learning_rate": 2.098449876243096e-06, + "loss": 0.0104, + "step": 40550 + }, + { + "epoch": 0.81104, + "grad_norm": 0.11011673510074615, + "learning_rate": 2.097594173580677e-06, + "loss": 0.0034, + "step": 40552 + }, + { + "epoch": 0.81108, + "grad_norm": 0.02177254483103752, + "learning_rate": 2.0967386249797263e-06, + "loss": 0.0015, + "step": 40554 + }, + { + "epoch": 0.81112, + "grad_norm": 0.054311543703079224, + "learning_rate": 2.0958832304569244e-06, + "loss": 0.0009, + "step": 40556 + }, + { + "epoch": 0.81116, + "grad_norm": 0.14207173883914948, + "learning_rate": 2.095027990028948e-06, + "loss": 0.0013, + "step": 40558 + }, + { + "epoch": 0.8112, + "grad_norm": 0.06543494760990143, + "learning_rate": 2.09417290371247e-06, + "loss": 0.0008, + "step": 40560 + }, + { + "epoch": 0.81124, + "grad_norm": 0.012727347202599049, + "learning_rate": 2.0933179715241626e-06, + "loss": 0.0001, + "step": 40562 + }, + { + "epoch": 0.81128, + "grad_norm": 0.09654092788696289, + "learning_rate": 2.092463193480695e-06, + "loss": 0.0023, + "step": 40564 + }, + { + "epoch": 0.81132, + "grad_norm": 0.00632754061371088, + "learning_rate": 2.0916085695987253e-06, + "loss": 0.0064, + "step": 40566 + }, + { + "epoch": 0.81136, + "grad_norm": 0.3254680037498474, + "learning_rate": 2.090754099894917e-06, + "loss": 0.4333, + "step": 40568 + }, + { + "epoch": 0.8114, + "grad_norm": 0.0032421008218079805, + "learning_rate": 2.0898997843859338e-06, + "loss": 0.0001, + "step": 40570 + }, + { + "epoch": 0.81144, + "grad_norm": 0.3108966648578644, + "learning_rate": 2.0890456230884203e-06, + "loss": 0.0051, + "step": 40572 + }, + { + "epoch": 0.81148, + "grad_norm": 1.6928155422210693, + "learning_rate": 2.0881916160190407e-06, + "loss": 0.0294, + "step": 40574 + }, + { + "epoch": 0.81152, + "grad_norm": 0.09137169271707535, + "learning_rate": 2.0873377631944336e-06, + "loss": 0.0022, + "step": 40576 + }, + { + "epoch": 0.81156, + "grad_norm": 1.3979445695877075, + "learning_rate": 2.086484064631257e-06, + "loss": 0.0163, + "step": 40578 + }, + { + "epoch": 0.8116, + "grad_norm": 0.053712762892246246, + "learning_rate": 2.0856305203461436e-06, + "loss": 0.0006, + "step": 40580 + }, + { + "epoch": 0.81164, + "grad_norm": 0.0037222662940621376, + "learning_rate": 2.0847771303557375e-06, + "loss": 0.001, + "step": 40582 + }, + { + "epoch": 0.81168, + "grad_norm": 0.13785342872142792, + "learning_rate": 2.0839238946766804e-06, + "loss": 0.0013, + "step": 40584 + }, + { + "epoch": 0.81172, + "grad_norm": 0.16865018010139465, + "learning_rate": 2.0830708133255985e-06, + "loss": 0.0031, + "step": 40586 + }, + { + "epoch": 0.81176, + "grad_norm": 0.3748343288898468, + "learning_rate": 2.0822178863191277e-06, + "loss": 0.0039, + "step": 40588 + }, + { + "epoch": 0.8118, + "grad_norm": 0.16766943037509918, + "learning_rate": 2.0813651136738957e-06, + "loss": 0.0017, + "step": 40590 + }, + { + "epoch": 0.81184, + "grad_norm": 0.04102865234017372, + "learning_rate": 2.080512495406527e-06, + "loss": 0.0009, + "step": 40592 + }, + { + "epoch": 0.81188, + "grad_norm": 0.15267407894134521, + "learning_rate": 2.0796600315336445e-06, + "loss": 0.0037, + "step": 40594 + }, + { + "epoch": 0.81192, + "grad_norm": 5.0213189125061035, + "learning_rate": 2.0788077220718705e-06, + "loss": 0.0698, + "step": 40596 + }, + { + "epoch": 0.81196, + "grad_norm": 0.2584049105644226, + "learning_rate": 2.077955567037815e-06, + "loss": 0.0046, + "step": 40598 + }, + { + "epoch": 0.812, + "grad_norm": 0.09437257796525955, + "learning_rate": 2.0771035664480944e-06, + "loss": 0.0048, + "step": 40600 + }, + { + "epoch": 0.81204, + "grad_norm": 0.429264098405838, + "learning_rate": 2.076251720319319e-06, + "loss": 0.0065, + "step": 40602 + }, + { + "epoch": 0.81208, + "grad_norm": 0.914793074131012, + "learning_rate": 2.075400028668094e-06, + "loss": 0.0449, + "step": 40604 + }, + { + "epoch": 0.81212, + "grad_norm": 0.20142754912376404, + "learning_rate": 2.0745484915110303e-06, + "loss": 0.0018, + "step": 40606 + }, + { + "epoch": 0.81216, + "grad_norm": 16.03992462158203, + "learning_rate": 2.073697108864717e-06, + "loss": 0.7934, + "step": 40608 + }, + { + "epoch": 0.8122, + "grad_norm": 0.14591632783412933, + "learning_rate": 2.072845880745766e-06, + "loss": 0.0033, + "step": 40610 + }, + { + "epoch": 0.81224, + "grad_norm": 0.6283569931983948, + "learning_rate": 2.071994807170763e-06, + "loss": 0.0058, + "step": 40612 + }, + { + "epoch": 0.81228, + "grad_norm": 0.46926674246788025, + "learning_rate": 2.0711438881563016e-06, + "loss": 0.0054, + "step": 40614 + }, + { + "epoch": 0.81232, + "grad_norm": 5.276253700256348, + "learning_rate": 2.070293123718974e-06, + "loss": 0.0652, + "step": 40616 + }, + { + "epoch": 0.81236, + "grad_norm": 0.008177300915122032, + "learning_rate": 2.069442513875363e-06, + "loss": 0.0002, + "step": 40618 + }, + { + "epoch": 0.8124, + "grad_norm": 0.011355983093380928, + "learning_rate": 2.0685920586420562e-06, + "loss": 0.0045, + "step": 40620 + }, + { + "epoch": 0.81244, + "grad_norm": 0.03607631102204323, + "learning_rate": 2.0677417580356272e-06, + "loss": 0.0003, + "step": 40622 + }, + { + "epoch": 0.81248, + "grad_norm": 0.22921434044837952, + "learning_rate": 2.0668916120726578e-06, + "loss": 0.0605, + "step": 40624 + }, + { + "epoch": 0.81252, + "grad_norm": 0.16180680692195892, + "learning_rate": 2.06604162076972e-06, + "loss": 0.0144, + "step": 40626 + }, + { + "epoch": 0.81256, + "grad_norm": 0.015692416578531265, + "learning_rate": 2.065191784143388e-06, + "loss": 0.0006, + "step": 40628 + }, + { + "epoch": 0.8126, + "grad_norm": 0.11389127373695374, + "learning_rate": 2.0643421022102216e-06, + "loss": 0.0015, + "step": 40630 + }, + { + "epoch": 0.81264, + "grad_norm": 0.27522391080856323, + "learning_rate": 2.0634925749867974e-06, + "loss": 0.0035, + "step": 40632 + }, + { + "epoch": 0.81268, + "grad_norm": 2.3677821159362793, + "learning_rate": 2.0626432024896682e-06, + "loss": 0.0306, + "step": 40634 + }, + { + "epoch": 0.81272, + "grad_norm": 0.09725001454353333, + "learning_rate": 2.0617939847353975e-06, + "loss": 0.0054, + "step": 40636 + }, + { + "epoch": 0.81276, + "grad_norm": 0.03332533687353134, + "learning_rate": 2.060944921740541e-06, + "loss": 0.0021, + "step": 40638 + }, + { + "epoch": 0.8128, + "grad_norm": 0.03052567131817341, + "learning_rate": 2.0600960135216463e-06, + "loss": 0.0004, + "step": 40640 + }, + { + "epoch": 0.81284, + "grad_norm": 0.023449163883924484, + "learning_rate": 2.0592472600952718e-06, + "loss": 0.0004, + "step": 40642 + }, + { + "epoch": 0.81288, + "grad_norm": 0.05120355263352394, + "learning_rate": 2.0583986614779584e-06, + "loss": 0.0004, + "step": 40644 + }, + { + "epoch": 0.81292, + "grad_norm": 0.05853865668177605, + "learning_rate": 2.0575502176862514e-06, + "loss": 0.0007, + "step": 40646 + }, + { + "epoch": 0.81296, + "grad_norm": 0.010120519436895847, + "learning_rate": 2.056701928736693e-06, + "loss": 0.0035, + "step": 40648 + }, + { + "epoch": 0.813, + "grad_norm": 0.0008616396808065474, + "learning_rate": 2.0558537946458177e-06, + "loss": 0.0001, + "step": 40650 + }, + { + "epoch": 0.81304, + "grad_norm": 0.9077181220054626, + "learning_rate": 2.0550058154301676e-06, + "loss": 0.0097, + "step": 40652 + }, + { + "epoch": 0.81308, + "grad_norm": 0.05369074270129204, + "learning_rate": 2.0541579911062657e-06, + "loss": 0.0026, + "step": 40654 + }, + { + "epoch": 0.81312, + "grad_norm": 0.05324035882949829, + "learning_rate": 2.0533103216906448e-06, + "loss": 0.0006, + "step": 40656 + }, + { + "epoch": 0.81316, + "grad_norm": 0.08677050471305847, + "learning_rate": 2.05246280719983e-06, + "loss": 0.0015, + "step": 40658 + }, + { + "epoch": 0.8132, + "grad_norm": 0.2471955418586731, + "learning_rate": 2.051615447650347e-06, + "loss": 0.0981, + "step": 40660 + }, + { + "epoch": 0.81324, + "grad_norm": 0.27136683464050293, + "learning_rate": 2.0507682430587083e-06, + "loss": 0.0068, + "step": 40662 + }, + { + "epoch": 0.81328, + "grad_norm": 0.5802980065345764, + "learning_rate": 2.0499211934414407e-06, + "loss": 0.0128, + "step": 40664 + }, + { + "epoch": 0.81332, + "grad_norm": 13.221457481384277, + "learning_rate": 2.049074298815049e-06, + "loss": 0.2054, + "step": 40666 + }, + { + "epoch": 0.81336, + "grad_norm": 0.2092617303133011, + "learning_rate": 2.0482275591960477e-06, + "loss": 0.003, + "step": 40668 + }, + { + "epoch": 0.8134, + "grad_norm": 0.011510376818478107, + "learning_rate": 2.0473809746009444e-06, + "loss": 0.0011, + "step": 40670 + }, + { + "epoch": 0.81344, + "grad_norm": 0.4891480505466461, + "learning_rate": 2.0465345450462426e-06, + "loss": 0.0351, + "step": 40672 + }, + { + "epoch": 0.81348, + "grad_norm": 0.10528818517923355, + "learning_rate": 2.0456882705484472e-06, + "loss": 0.0025, + "step": 40674 + }, + { + "epoch": 0.81352, + "grad_norm": 0.1091013178229332, + "learning_rate": 2.0448421511240514e-06, + "loss": 0.0019, + "step": 40676 + }, + { + "epoch": 0.81356, + "grad_norm": 0.6912553310394287, + "learning_rate": 2.0439961867895543e-06, + "loss": 0.0077, + "step": 40678 + }, + { + "epoch": 0.8136, + "grad_norm": 16.87244415283203, + "learning_rate": 2.0431503775614457e-06, + "loss": 0.2742, + "step": 40680 + }, + { + "epoch": 0.81364, + "grad_norm": 0.06349025666713715, + "learning_rate": 2.042304723456219e-06, + "loss": 0.0007, + "step": 40682 + }, + { + "epoch": 0.81368, + "grad_norm": 0.09576846659183502, + "learning_rate": 2.0414592244903577e-06, + "loss": 0.001, + "step": 40684 + }, + { + "epoch": 0.81372, + "grad_norm": 0.0006581289926543832, + "learning_rate": 2.0406138806803487e-06, + "loss": 0.0015, + "step": 40686 + }, + { + "epoch": 0.81376, + "grad_norm": 0.03576042130589485, + "learning_rate": 2.0397686920426675e-06, + "loss": 0.0031, + "step": 40688 + }, + { + "epoch": 0.8138, + "grad_norm": 0.58111572265625, + "learning_rate": 2.0389236585937944e-06, + "loss": 0.0068, + "step": 40690 + }, + { + "epoch": 0.81384, + "grad_norm": 0.12701262533664703, + "learning_rate": 2.0380787803502046e-06, + "loss": 0.139, + "step": 40692 + }, + { + "epoch": 0.81388, + "grad_norm": 0.4111558496952057, + "learning_rate": 2.0372340573283633e-06, + "loss": 0.0111, + "step": 40694 + }, + { + "epoch": 0.81392, + "grad_norm": 0.26933297514915466, + "learning_rate": 2.036389489544749e-06, + "loss": 0.0027, + "step": 40696 + }, + { + "epoch": 0.81396, + "grad_norm": 0.01390150934457779, + "learning_rate": 2.035545077015817e-06, + "loss": 0.0013, + "step": 40698 + }, + { + "epoch": 0.814, + "grad_norm": 0.2559211850166321, + "learning_rate": 2.0347008197580376e-06, + "loss": 0.0294, + "step": 40700 + }, + { + "epoch": 0.81404, + "grad_norm": 0.053499579429626465, + "learning_rate": 2.0338567177878653e-06, + "loss": 0.0006, + "step": 40702 + }, + { + "epoch": 0.81408, + "grad_norm": 0.25552284717559814, + "learning_rate": 2.0330127711217573e-06, + "loss": 0.0042, + "step": 40704 + }, + { + "epoch": 0.81412, + "grad_norm": 0.07310500741004944, + "learning_rate": 2.032168979776169e-06, + "loss": 0.0099, + "step": 40706 + }, + { + "epoch": 0.81416, + "grad_norm": 0.9763017892837524, + "learning_rate": 2.0313253437675474e-06, + "loss": 0.0081, + "step": 40708 + }, + { + "epoch": 0.8142, + "grad_norm": 11.685737609863281, + "learning_rate": 2.0304818631123393e-06, + "loss": 0.8978, + "step": 40710 + }, + { + "epoch": 0.81424, + "grad_norm": 0.2104143351316452, + "learning_rate": 2.02963853782699e-06, + "loss": 0.0102, + "step": 40712 + }, + { + "epoch": 0.81428, + "grad_norm": 0.09812401980161667, + "learning_rate": 2.0287953679279426e-06, + "loss": 0.0103, + "step": 40714 + }, + { + "epoch": 0.81432, + "grad_norm": 0.04423553869128227, + "learning_rate": 2.0279523534316316e-06, + "loss": 0.1161, + "step": 40716 + }, + { + "epoch": 0.81436, + "grad_norm": 0.3140316605567932, + "learning_rate": 2.0271094943544977e-06, + "loss": 0.0051, + "step": 40718 + }, + { + "epoch": 0.8144, + "grad_norm": 0.20815882086753845, + "learning_rate": 2.026266790712965e-06, + "loss": 0.0037, + "step": 40720 + }, + { + "epoch": 0.81444, + "grad_norm": 0.19933761656284332, + "learning_rate": 2.025424242523467e-06, + "loss": 0.0051, + "step": 40722 + }, + { + "epoch": 0.81448, + "grad_norm": 0.03156723082065582, + "learning_rate": 2.0245818498024296e-06, + "loss": 0.0005, + "step": 40724 + }, + { + "epoch": 0.81452, + "grad_norm": 0.4402037560939789, + "learning_rate": 2.0237396125662745e-06, + "loss": 0.0036, + "step": 40726 + }, + { + "epoch": 0.81456, + "grad_norm": 0.006759822368621826, + "learning_rate": 2.0228975308314245e-06, + "loss": 0.0009, + "step": 40728 + }, + { + "epoch": 0.8146, + "grad_norm": 0.2181251347064972, + "learning_rate": 2.022055604614289e-06, + "loss": 0.0024, + "step": 40730 + }, + { + "epoch": 0.81464, + "grad_norm": 0.3709140419960022, + "learning_rate": 2.0212138339312927e-06, + "loss": 0.004, + "step": 40732 + }, + { + "epoch": 0.81468, + "grad_norm": 0.0683482363820076, + "learning_rate": 2.0203722187988363e-06, + "loss": 0.0011, + "step": 40734 + }, + { + "epoch": 0.81472, + "grad_norm": 0.04216763377189636, + "learning_rate": 2.0195307592333324e-06, + "loss": 0.0015, + "step": 40736 + }, + { + "epoch": 0.81476, + "grad_norm": 0.025313979014754295, + "learning_rate": 2.018689455251186e-06, + "loss": 0.0018, + "step": 40738 + }, + { + "epoch": 0.8148, + "grad_norm": 0.4416086971759796, + "learning_rate": 2.017848306868797e-06, + "loss": 0.0059, + "step": 40740 + }, + { + "epoch": 0.81484, + "grad_norm": 0.03583068400621414, + "learning_rate": 2.017007314102567e-06, + "loss": 0.0005, + "step": 40742 + }, + { + "epoch": 0.81488, + "grad_norm": 0.14959701895713806, + "learning_rate": 2.0161664769688873e-06, + "loss": 0.0058, + "step": 40744 + }, + { + "epoch": 0.81492, + "grad_norm": 0.1175818145275116, + "learning_rate": 2.0153257954841523e-06, + "loss": 0.0026, + "step": 40746 + }, + { + "epoch": 0.81496, + "grad_norm": 0.02232537232339382, + "learning_rate": 2.014485269664751e-06, + "loss": 0.0024, + "step": 40748 + }, + { + "epoch": 0.815, + "grad_norm": 0.003607056336477399, + "learning_rate": 2.013644899527074e-06, + "loss": 0.0003, + "step": 40750 + }, + { + "epoch": 0.81504, + "grad_norm": 0.011632237583398819, + "learning_rate": 2.012804685087496e-06, + "loss": 0.0028, + "step": 40752 + }, + { + "epoch": 0.81508, + "grad_norm": 0.056009866297245026, + "learning_rate": 2.011964626362407e-06, + "loss": 0.0009, + "step": 40754 + }, + { + "epoch": 0.81512, + "grad_norm": 0.31128910183906555, + "learning_rate": 2.0111247233681787e-06, + "loss": 0.0025, + "step": 40756 + }, + { + "epoch": 0.81516, + "grad_norm": 0.08888448774814606, + "learning_rate": 2.010284976121186e-06, + "loss": 0.0012, + "step": 40758 + }, + { + "epoch": 0.8152, + "grad_norm": 0.030834799632430077, + "learning_rate": 2.009445384637805e-06, + "loss": 0.0011, + "step": 40760 + }, + { + "epoch": 0.81524, + "grad_norm": 0.1894257366657257, + "learning_rate": 2.0086059489343935e-06, + "loss": 0.0021, + "step": 40762 + }, + { + "epoch": 0.81528, + "grad_norm": 0.39706555008888245, + "learning_rate": 2.007766669027329e-06, + "loss": 0.0038, + "step": 40764 + }, + { + "epoch": 0.81532, + "grad_norm": 0.49156326055526733, + "learning_rate": 2.0069275449329663e-06, + "loss": 0.0063, + "step": 40766 + }, + { + "epoch": 0.81536, + "grad_norm": 0.03557628393173218, + "learning_rate": 2.006088576667665e-06, + "loss": 0.0121, + "step": 40768 + }, + { + "epoch": 0.8154, + "grad_norm": 0.1477237343788147, + "learning_rate": 2.005249764247783e-06, + "loss": 0.0018, + "step": 40770 + }, + { + "epoch": 0.81544, + "grad_norm": 0.0513736717402935, + "learning_rate": 2.004411107689672e-06, + "loss": 0.0011, + "step": 40772 + }, + { + "epoch": 0.81548, + "grad_norm": 0.09754109382629395, + "learning_rate": 2.0035726070096863e-06, + "loss": 0.0011, + "step": 40774 + }, + { + "epoch": 0.81552, + "grad_norm": 0.0022657180670648813, + "learning_rate": 2.0027342622241663e-06, + "loss": 0.0019, + "step": 40776 + }, + { + "epoch": 0.81556, + "grad_norm": 0.04496057704091072, + "learning_rate": 2.00189607334946e-06, + "loss": 0.0017, + "step": 40778 + }, + { + "epoch": 0.8156, + "grad_norm": 0.05359011888504028, + "learning_rate": 2.0010580404019066e-06, + "loss": 0.0078, + "step": 40780 + }, + { + "epoch": 0.81564, + "grad_norm": 0.06614382565021515, + "learning_rate": 2.000220163397848e-06, + "loss": 0.0009, + "step": 40782 + }, + { + "epoch": 0.81568, + "grad_norm": 0.04561622440814972, + "learning_rate": 1.99938244235361e-06, + "loss": 0.0014, + "step": 40784 + }, + { + "epoch": 0.81572, + "grad_norm": 0.04069332778453827, + "learning_rate": 1.9985448772855353e-06, + "loss": 0.0018, + "step": 40786 + }, + { + "epoch": 0.81576, + "grad_norm": 0.0028802051674574614, + "learning_rate": 1.9977074682099464e-06, + "loss": 0.0045, + "step": 40788 + }, + { + "epoch": 0.8158, + "grad_norm": 0.5483225584030151, + "learning_rate": 1.9968702151431697e-06, + "loss": 0.0063, + "step": 40790 + }, + { + "epoch": 0.81584, + "grad_norm": 0.5149176716804504, + "learning_rate": 1.99603311810153e-06, + "loss": 0.0059, + "step": 40792 + }, + { + "epoch": 0.81588, + "grad_norm": 0.1316489279270172, + "learning_rate": 1.9951961771013415e-06, + "loss": 0.0018, + "step": 40794 + }, + { + "epoch": 0.81592, + "grad_norm": 0.003816647455096245, + "learning_rate": 1.9943593921589288e-06, + "loss": 0.0003, + "step": 40796 + }, + { + "epoch": 0.81596, + "grad_norm": 0.7465727925300598, + "learning_rate": 1.9935227632905994e-06, + "loss": 0.0091, + "step": 40798 + }, + { + "epoch": 0.816, + "grad_norm": 0.37256091833114624, + "learning_rate": 1.9926862905126663e-06, + "loss": 0.0029, + "step": 40800 + }, + { + "epoch": 0.81604, + "grad_norm": 0.10950668901205063, + "learning_rate": 1.9918499738414354e-06, + "loss": 0.0113, + "step": 40802 + }, + { + "epoch": 0.81608, + "grad_norm": 15.576366424560547, + "learning_rate": 1.991013813293212e-06, + "loss": 0.2675, + "step": 40804 + }, + { + "epoch": 0.81612, + "grad_norm": 0.8826540112495422, + "learning_rate": 1.990177808884298e-06, + "loss": 0.0089, + "step": 40806 + }, + { + "epoch": 0.81616, + "grad_norm": 0.003983111120760441, + "learning_rate": 1.989341960630995e-06, + "loss": 0.0004, + "step": 40808 + }, + { + "epoch": 0.8162, + "grad_norm": 0.01873781718313694, + "learning_rate": 1.9885062685495905e-06, + "loss": 0.0041, + "step": 40810 + }, + { + "epoch": 0.81624, + "grad_norm": 0.01537269912660122, + "learning_rate": 1.9876707326563817e-06, + "loss": 0.002, + "step": 40812 + }, + { + "epoch": 0.81628, + "grad_norm": 0.010543499141931534, + "learning_rate": 1.98683535296766e-06, + "loss": 0.0001, + "step": 40814 + }, + { + "epoch": 0.81632, + "grad_norm": 0.23216700553894043, + "learning_rate": 1.986000129499702e-06, + "loss": 0.0021, + "step": 40816 + }, + { + "epoch": 0.81636, + "grad_norm": 0.402722030878067, + "learning_rate": 1.9851650622688035e-06, + "loss": 0.0044, + "step": 40818 + }, + { + "epoch": 0.8164, + "grad_norm": 0.051537588238716125, + "learning_rate": 1.984330151291233e-06, + "loss": 0.0005, + "step": 40820 + }, + { + "epoch": 0.81644, + "grad_norm": 0.29424211382865906, + "learning_rate": 1.9834953965832783e-06, + "loss": 0.0027, + "step": 40822 + }, + { + "epoch": 0.81648, + "grad_norm": 0.02967548929154873, + "learning_rate": 1.9826607981612056e-06, + "loss": 0.0037, + "step": 40824 + }, + { + "epoch": 0.81652, + "grad_norm": 0.013183651491999626, + "learning_rate": 1.9818263560412875e-06, + "loss": 0.0002, + "step": 40826 + }, + { + "epoch": 0.81656, + "grad_norm": 0.05501433089375496, + "learning_rate": 1.9809920702397954e-06, + "loss": 0.0096, + "step": 40828 + }, + { + "epoch": 0.8166, + "grad_norm": 0.0668850988149643, + "learning_rate": 1.9801579407729866e-06, + "loss": 0.0111, + "step": 40830 + }, + { + "epoch": 0.81664, + "grad_norm": 0.07796379178762436, + "learning_rate": 1.979323967657133e-06, + "loss": 0.0306, + "step": 40832 + }, + { + "epoch": 0.81668, + "grad_norm": 0.08698194473981857, + "learning_rate": 1.9784901509084854e-06, + "loss": 0.0023, + "step": 40834 + }, + { + "epoch": 0.81672, + "grad_norm": 0.030284030362963676, + "learning_rate": 1.9776564905433014e-06, + "loss": 0.0005, + "step": 40836 + }, + { + "epoch": 0.81676, + "grad_norm": 0.026446137577295303, + "learning_rate": 1.9768229865778345e-06, + "loss": 0.0053, + "step": 40838 + }, + { + "epoch": 0.8168, + "grad_norm": 0.0008125390158966184, + "learning_rate": 1.9759896390283362e-06, + "loss": 0.0873, + "step": 40840 + }, + { + "epoch": 0.81684, + "grad_norm": 8.64661979675293, + "learning_rate": 1.97515644791105e-06, + "loss": 0.1615, + "step": 40842 + }, + { + "epoch": 0.81688, + "grad_norm": 0.1776813119649887, + "learning_rate": 1.9743234132422184e-06, + "loss": 0.0021, + "step": 40844 + }, + { + "epoch": 0.81692, + "grad_norm": 0.16243389248847961, + "learning_rate": 1.973490535038085e-06, + "loss": 0.0015, + "step": 40846 + }, + { + "epoch": 0.81696, + "grad_norm": 0.013376208953559399, + "learning_rate": 1.9726578133148853e-06, + "loss": 0.0003, + "step": 40848 + }, + { + "epoch": 0.817, + "grad_norm": 0.0325530543923378, + "learning_rate": 1.9718252480888567e-06, + "loss": 0.0008, + "step": 40850 + }, + { + "epoch": 0.81704, + "grad_norm": 0.12074586004018784, + "learning_rate": 1.9709928393762235e-06, + "loss": 0.0012, + "step": 40852 + }, + { + "epoch": 0.81708, + "grad_norm": 0.1491279900074005, + "learning_rate": 1.970160587193224e-06, + "loss": 0.0023, + "step": 40854 + }, + { + "epoch": 0.81712, + "grad_norm": 0.003550633555278182, + "learning_rate": 1.969328491556074e-06, + "loss": 0.0, + "step": 40856 + }, + { + "epoch": 0.81716, + "grad_norm": 0.056780099868774414, + "learning_rate": 1.968496552481001e-06, + "loss": 0.0136, + "step": 40858 + }, + { + "epoch": 0.8172, + "grad_norm": 0.0013429169775918126, + "learning_rate": 1.9676647699842246e-06, + "loss": 0.0104, + "step": 40860 + }, + { + "epoch": 0.81724, + "grad_norm": 0.005373054184019566, + "learning_rate": 1.9668331440819533e-06, + "loss": 0.0002, + "step": 40862 + }, + { + "epoch": 0.81728, + "grad_norm": 0.10843916237354279, + "learning_rate": 1.9660016747904107e-06, + "loss": 0.0019, + "step": 40864 + }, + { + "epoch": 0.81732, + "grad_norm": 0.045420512557029724, + "learning_rate": 1.965170362125799e-06, + "loss": 0.0242, + "step": 40866 + }, + { + "epoch": 0.81736, + "grad_norm": 14.7114839553833, + "learning_rate": 1.9643392061043287e-06, + "loss": 0.3203, + "step": 40868 + }, + { + "epoch": 0.8174, + "grad_norm": 1.9526759386062622, + "learning_rate": 1.963508206742202e-06, + "loss": 0.6324, + "step": 40870 + }, + { + "epoch": 0.81744, + "grad_norm": 0.006216802168637514, + "learning_rate": 1.962677364055623e-06, + "loss": 0.204, + "step": 40872 + }, + { + "epoch": 0.81748, + "grad_norm": 0.19897077977657318, + "learning_rate": 1.9618466780607803e-06, + "loss": 0.0127, + "step": 40874 + }, + { + "epoch": 0.81752, + "grad_norm": 0.1329190731048584, + "learning_rate": 1.961016148773881e-06, + "loss": 0.0019, + "step": 40876 + }, + { + "epoch": 0.81756, + "grad_norm": 0.06982547789812088, + "learning_rate": 1.9601857762111085e-06, + "loss": 0.0012, + "step": 40878 + }, + { + "epoch": 0.8176, + "grad_norm": 0.10934104025363922, + "learning_rate": 1.959355560388654e-06, + "loss": 0.0012, + "step": 40880 + }, + { + "epoch": 0.81764, + "grad_norm": 0.0005781972431577742, + "learning_rate": 1.9585255013227035e-06, + "loss": 0.0092, + "step": 40882 + }, + { + "epoch": 0.81768, + "grad_norm": 0.1638367772102356, + "learning_rate": 1.9576955990294345e-06, + "loss": 0.0066, + "step": 40884 + }, + { + "epoch": 0.81772, + "grad_norm": 0.016679594293236732, + "learning_rate": 1.956865853525034e-06, + "loss": 0.0009, + "step": 40886 + }, + { + "epoch": 0.81776, + "grad_norm": 0.031487852334976196, + "learning_rate": 1.9560362648256736e-06, + "loss": 0.0047, + "step": 40888 + }, + { + "epoch": 0.8178, + "grad_norm": 0.18332651257514954, + "learning_rate": 1.955206832947526e-06, + "loss": 0.0019, + "step": 40890 + }, + { + "epoch": 0.81784, + "grad_norm": 0.001298077986575663, + "learning_rate": 1.9543775579067637e-06, + "loss": 0.0002, + "step": 40892 + }, + { + "epoch": 0.81788, + "grad_norm": 0.30527862906455994, + "learning_rate": 1.9535484397195537e-06, + "loss": 0.0041, + "step": 40894 + }, + { + "epoch": 0.81792, + "grad_norm": 0.010655539110302925, + "learning_rate": 1.952719478402061e-06, + "loss": 0.0022, + "step": 40896 + }, + { + "epoch": 0.81796, + "grad_norm": 0.12108282744884491, + "learning_rate": 1.9518906739704425e-06, + "loss": 0.0018, + "step": 40898 + }, + { + "epoch": 0.818, + "grad_norm": 2.4594817659817636e-05, + "learning_rate": 1.95106202644086e-06, + "loss": 0.0006, + "step": 40900 + }, + { + "epoch": 0.81804, + "grad_norm": 0.01038007065653801, + "learning_rate": 1.9502335358294657e-06, + "loss": 0.0093, + "step": 40902 + }, + { + "epoch": 0.81808, + "grad_norm": 0.04467746987938881, + "learning_rate": 1.9494052021524167e-06, + "loss": 0.0009, + "step": 40904 + }, + { + "epoch": 0.81812, + "grad_norm": 0.20513440668582916, + "learning_rate": 1.948577025425852e-06, + "loss": 0.0085, + "step": 40906 + }, + { + "epoch": 0.81816, + "grad_norm": 0.22719058394432068, + "learning_rate": 1.9477490056659286e-06, + "loss": 0.0032, + "step": 40908 + }, + { + "epoch": 0.8182, + "grad_norm": 0.1316664218902588, + "learning_rate": 1.9469211428887813e-06, + "loss": 0.0034, + "step": 40910 + }, + { + "epoch": 0.81824, + "grad_norm": 0.06897076219320297, + "learning_rate": 1.9460934371105523e-06, + "loss": 0.0037, + "step": 40912 + }, + { + "epoch": 0.81828, + "grad_norm": 0.22802850604057312, + "learning_rate": 1.945265888347382e-06, + "loss": 0.0038, + "step": 40914 + }, + { + "epoch": 0.81832, + "grad_norm": 0.5450577139854431, + "learning_rate": 1.944438496615393e-06, + "loss": 0.0085, + "step": 40916 + }, + { + "epoch": 0.81836, + "grad_norm": 0.40883809328079224, + "learning_rate": 1.943611261930729e-06, + "loss": 0.0035, + "step": 40918 + }, + { + "epoch": 0.8184, + "grad_norm": 0.2208106368780136, + "learning_rate": 1.9427841843095063e-06, + "loss": 0.0018, + "step": 40920 + }, + { + "epoch": 0.81844, + "grad_norm": 0.08570851385593414, + "learning_rate": 1.9419572637678585e-06, + "loss": 0.0011, + "step": 40922 + }, + { + "epoch": 0.81848, + "grad_norm": 0.010981002822518349, + "learning_rate": 1.9411305003219005e-06, + "loss": 0.0035, + "step": 40924 + }, + { + "epoch": 0.81852, + "grad_norm": 0.2621561586856842, + "learning_rate": 1.9403038939877517e-06, + "loss": 0.4325, + "step": 40926 + }, + { + "epoch": 0.81856, + "grad_norm": 0.022838549688458443, + "learning_rate": 1.93947744478153e-06, + "loss": 0.0013, + "step": 40928 + }, + { + "epoch": 0.8186, + "grad_norm": 2.2630395889282227, + "learning_rate": 1.938651152719344e-06, + "loss": 0.017, + "step": 40930 + }, + { + "epoch": 0.81864, + "grad_norm": 0.011226636357605457, + "learning_rate": 1.937825017817303e-06, + "loss": 0.0025, + "step": 40932 + }, + { + "epoch": 0.81868, + "grad_norm": 0.13658025860786438, + "learning_rate": 1.936999040091515e-06, + "loss": 0.0017, + "step": 40934 + }, + { + "epoch": 0.81872, + "grad_norm": 0.10541069507598877, + "learning_rate": 1.9361732195580818e-06, + "loss": 0.0035, + "step": 40936 + }, + { + "epoch": 0.81876, + "grad_norm": 0.31015002727508545, + "learning_rate": 1.9353475562331027e-06, + "loss": 0.0031, + "step": 40938 + }, + { + "epoch": 0.8188, + "grad_norm": 1.7221763134002686, + "learning_rate": 1.934522050132678e-06, + "loss": 0.0246, + "step": 40940 + }, + { + "epoch": 0.81884, + "grad_norm": 0.1039011999964714, + "learning_rate": 1.9336967012728924e-06, + "loss": 0.0035, + "step": 40942 + }, + { + "epoch": 0.81888, + "grad_norm": 0.004118501208722591, + "learning_rate": 1.932871509669848e-06, + "loss": 0.0035, + "step": 40944 + }, + { + "epoch": 0.81892, + "grad_norm": 0.07131004333496094, + "learning_rate": 1.932046475339624e-06, + "loss": 0.001, + "step": 40946 + }, + { + "epoch": 0.81896, + "grad_norm": 0.37509819865226746, + "learning_rate": 1.9312215982983072e-06, + "loss": 0.0102, + "step": 40948 + }, + { + "epoch": 0.819, + "grad_norm": 0.06441409885883331, + "learning_rate": 1.930396878561983e-06, + "loss": 0.0009, + "step": 40950 + }, + { + "epoch": 0.81904, + "grad_norm": 0.3603532910346985, + "learning_rate": 1.9295723161467206e-06, + "loss": 0.0053, + "step": 40952 + }, + { + "epoch": 0.81908, + "grad_norm": 1.0300976037979126, + "learning_rate": 1.928747911068606e-06, + "loss": 0.0108, + "step": 40954 + }, + { + "epoch": 0.81912, + "grad_norm": 0.07639555633068085, + "learning_rate": 1.9279236633437037e-06, + "loss": 0.0176, + "step": 40956 + }, + { + "epoch": 0.81916, + "grad_norm": 0.016573484987020493, + "learning_rate": 1.927099572988085e-06, + "loss": 0.0059, + "step": 40958 + }, + { + "epoch": 0.8192, + "grad_norm": 5.540950775146484, + "learning_rate": 1.9262756400178163e-06, + "loss": 0.0658, + "step": 40960 + }, + { + "epoch": 0.81924, + "grad_norm": 0.19863538444042206, + "learning_rate": 1.925451864448964e-06, + "loss": 0.0049, + "step": 40962 + }, + { + "epoch": 0.81928, + "grad_norm": 0.09863756597042084, + "learning_rate": 1.924628246297581e-06, + "loss": 0.0014, + "step": 40964 + }, + { + "epoch": 0.81932, + "grad_norm": 0.027420764788985252, + "learning_rate": 1.923804785579728e-06, + "loss": 0.0004, + "step": 40966 + }, + { + "epoch": 0.81936, + "grad_norm": 0.028791598975658417, + "learning_rate": 1.92298148231146e-06, + "loss": 0.0047, + "step": 40968 + }, + { + "epoch": 0.8194, + "grad_norm": 0.005344393663108349, + "learning_rate": 1.9221583365088246e-06, + "loss": 0.0001, + "step": 40970 + }, + { + "epoch": 0.81944, + "grad_norm": 0.019762106239795685, + "learning_rate": 1.921335348187875e-06, + "loss": 0.0012, + "step": 40972 + }, + { + "epoch": 0.81948, + "grad_norm": 0.00935053825378418, + "learning_rate": 1.9205125173646465e-06, + "loss": 0.0006, + "step": 40974 + }, + { + "epoch": 0.81952, + "grad_norm": 0.028462912887334824, + "learning_rate": 1.91968984405519e-06, + "loss": 0.0648, + "step": 40976 + }, + { + "epoch": 0.81956, + "grad_norm": 0.21252907812595367, + "learning_rate": 1.918867328275539e-06, + "loss": 0.0021, + "step": 40978 + }, + { + "epoch": 0.8196, + "grad_norm": 0.029670894145965576, + "learning_rate": 1.918044970041729e-06, + "loss": 0.0106, + "step": 40980 + }, + { + "epoch": 0.81964, + "grad_norm": 1.5461894273757935, + "learning_rate": 1.9172227693697963e-06, + "loss": 0.0138, + "step": 40982 + }, + { + "epoch": 0.81968, + "grad_norm": 0.04196453467011452, + "learning_rate": 1.916400726275762e-06, + "loss": 0.0068, + "step": 40984 + }, + { + "epoch": 0.81972, + "grad_norm": 0.02909558266401291, + "learning_rate": 1.9155788407756626e-06, + "loss": 0.0047, + "step": 40986 + }, + { + "epoch": 0.81976, + "grad_norm": 0.6167750954627991, + "learning_rate": 1.9147571128855125e-06, + "loss": 0.0071, + "step": 40988 + }, + { + "epoch": 0.8198, + "grad_norm": 0.0903601422905922, + "learning_rate": 1.9139355426213346e-06, + "loss": 0.0342, + "step": 40990 + }, + { + "epoch": 0.81984, + "grad_norm": 5.681537628173828, + "learning_rate": 1.913114129999147e-06, + "loss": 0.0583, + "step": 40992 + }, + { + "epoch": 0.81988, + "grad_norm": 0.03821408376097679, + "learning_rate": 1.9122928750349655e-06, + "loss": 0.0004, + "step": 40994 + }, + { + "epoch": 0.81992, + "grad_norm": 0.5665810108184814, + "learning_rate": 1.9114717777447955e-06, + "loss": 0.0063, + "step": 40996 + }, + { + "epoch": 0.81996, + "grad_norm": 8.756016731262207, + "learning_rate": 1.9106508381446478e-06, + "loss": 0.0794, + "step": 40998 + }, + { + "epoch": 0.82, + "grad_norm": 0.5230412483215332, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.0071, + "step": 41000 + }, + { + "epoch": 0.82004, + "grad_norm": 0.05294477939605713, + "learning_rate": 1.9090094320784337e-06, + "loss": 0.0024, + "step": 41002 + }, + { + "epoch": 0.82008, + "grad_norm": 0.0537724532186985, + "learning_rate": 1.9081889656443696e-06, + "loss": 0.0008, + "step": 41004 + }, + { + "epoch": 0.82012, + "grad_norm": 0.1878962367773056, + "learning_rate": 1.9073686569643224e-06, + "loss": 0.0289, + "step": 41006 + }, + { + "epoch": 0.82016, + "grad_norm": 0.09286952018737793, + "learning_rate": 1.9065485060542942e-06, + "loss": 0.0016, + "step": 41008 + }, + { + "epoch": 0.8202, + "grad_norm": 1.5399374961853027, + "learning_rate": 1.9057285129302682e-06, + "loss": 0.0149, + "step": 41010 + }, + { + "epoch": 0.82024, + "grad_norm": 0.2006540596485138, + "learning_rate": 1.9049086776082315e-06, + "loss": 0.0029, + "step": 41012 + }, + { + "epoch": 0.82028, + "grad_norm": 0.2570883333683014, + "learning_rate": 1.904089000104168e-06, + "loss": 0.0202, + "step": 41014 + }, + { + "epoch": 0.82032, + "grad_norm": 0.08554255962371826, + "learning_rate": 1.9032694804340568e-06, + "loss": 0.0015, + "step": 41016 + }, + { + "epoch": 0.82036, + "grad_norm": 0.16342398524284363, + "learning_rate": 1.9024501186138788e-06, + "loss": 0.0022, + "step": 41018 + }, + { + "epoch": 0.8204, + "grad_norm": 29.206111907958984, + "learning_rate": 1.9016309146596024e-06, + "loss": 0.4362, + "step": 41020 + }, + { + "epoch": 0.82044, + "grad_norm": 0.030212130397558212, + "learning_rate": 1.9008118685872002e-06, + "loss": 0.0047, + "step": 41022 + }, + { + "epoch": 0.82048, + "grad_norm": 0.07647576928138733, + "learning_rate": 1.8999929804126405e-06, + "loss": 0.0007, + "step": 41024 + }, + { + "epoch": 0.82052, + "grad_norm": 0.03931758180260658, + "learning_rate": 1.8991742501518873e-06, + "loss": 0.0016, + "step": 41026 + }, + { + "epoch": 0.82056, + "grad_norm": 0.23779229819774628, + "learning_rate": 1.8983556778209021e-06, + "loss": 0.0026, + "step": 41028 + }, + { + "epoch": 0.8206, + "grad_norm": 0.021650170907378197, + "learning_rate": 1.8975372634356481e-06, + "loss": 0.0717, + "step": 41030 + }, + { + "epoch": 0.82064, + "grad_norm": 0.022472310811281204, + "learning_rate": 1.8967190070120734e-06, + "loss": 0.0008, + "step": 41032 + }, + { + "epoch": 0.82068, + "grad_norm": 0.07980572432279587, + "learning_rate": 1.895900908566134e-06, + "loss": 0.0074, + "step": 41034 + }, + { + "epoch": 0.82072, + "grad_norm": 0.05370068922638893, + "learning_rate": 1.8950829681137806e-06, + "loss": 0.0008, + "step": 41036 + }, + { + "epoch": 0.82076, + "grad_norm": 9.09918212890625, + "learning_rate": 1.8942651856709516e-06, + "loss": 0.0719, + "step": 41038 + }, + { + "epoch": 0.8208, + "grad_norm": 0.08516528457403183, + "learning_rate": 1.8934475612536019e-06, + "loss": 0.001, + "step": 41040 + }, + { + "epoch": 0.82084, + "grad_norm": 0.19771356880664825, + "learning_rate": 1.892630094877661e-06, + "loss": 0.0019, + "step": 41042 + }, + { + "epoch": 0.82088, + "grad_norm": 0.042416106909513474, + "learning_rate": 1.891812786559074e-06, + "loss": 0.0015, + "step": 41044 + }, + { + "epoch": 0.82092, + "grad_norm": 2.5590803623199463, + "learning_rate": 1.8909956363137694e-06, + "loss": 0.0339, + "step": 41046 + }, + { + "epoch": 0.82096, + "grad_norm": 0.016190877184271812, + "learning_rate": 1.8901786441576798e-06, + "loss": 0.0006, + "step": 41048 + }, + { + "epoch": 0.821, + "grad_norm": 0.46151161193847656, + "learning_rate": 1.8893618101067357e-06, + "loss": 0.0061, + "step": 41050 + }, + { + "epoch": 0.82104, + "grad_norm": 0.19450466334819794, + "learning_rate": 1.8885451341768556e-06, + "loss": 0.003, + "step": 41052 + }, + { + "epoch": 0.82108, + "grad_norm": 0.059783685952425, + "learning_rate": 1.8877286163839636e-06, + "loss": 0.0041, + "step": 41054 + }, + { + "epoch": 0.82112, + "grad_norm": 4.647763729095459, + "learning_rate": 1.8869122567439802e-06, + "loss": 0.0654, + "step": 41056 + }, + { + "epoch": 0.82116, + "grad_norm": 0.2029763013124466, + "learning_rate": 1.8860960552728181e-06, + "loss": 0.0021, + "step": 41058 + }, + { + "epoch": 0.8212, + "grad_norm": 0.0636165589094162, + "learning_rate": 1.8852800119863912e-06, + "loss": 0.0011, + "step": 41060 + }, + { + "epoch": 0.82124, + "grad_norm": 1.713245153427124, + "learning_rate": 1.8844641269006103e-06, + "loss": 0.0367, + "step": 41062 + }, + { + "epoch": 0.82128, + "grad_norm": 0.013762904331088066, + "learning_rate": 1.8836484000313771e-06, + "loss": 0.0005, + "step": 41064 + }, + { + "epoch": 0.82132, + "grad_norm": 0.9232901930809021, + "learning_rate": 1.8828328313945964e-06, + "loss": 0.0099, + "step": 41066 + }, + { + "epoch": 0.82136, + "grad_norm": 0.2086166888475418, + "learning_rate": 1.8820174210061693e-06, + "loss": 0.0037, + "step": 41068 + }, + { + "epoch": 0.8214, + "grad_norm": 0.0618886761367321, + "learning_rate": 1.8812021688819914e-06, + "loss": 0.0042, + "step": 41070 + }, + { + "epoch": 0.82144, + "grad_norm": 0.0054923612624406815, + "learning_rate": 1.8803870750379605e-06, + "loss": 0.0001, + "step": 41072 + }, + { + "epoch": 0.82148, + "grad_norm": 0.04652936011552811, + "learning_rate": 1.879572139489958e-06, + "loss": 0.0005, + "step": 41074 + }, + { + "epoch": 0.82152, + "grad_norm": 0.049122512340545654, + "learning_rate": 1.8787573622538836e-06, + "loss": 0.0011, + "step": 41076 + }, + { + "epoch": 0.82156, + "grad_norm": 0.21063107252120972, + "learning_rate": 1.877942743345612e-06, + "loss": 0.0111, + "step": 41078 + }, + { + "epoch": 0.8216, + "grad_norm": 0.28361016511917114, + "learning_rate": 1.8771282827810278e-06, + "loss": 0.007, + "step": 41080 + }, + { + "epoch": 0.82164, + "grad_norm": 0.18825262784957886, + "learning_rate": 1.87631398057601e-06, + "loss": 0.0019, + "step": 41082 + }, + { + "epoch": 0.82168, + "grad_norm": 1.7844475507736206, + "learning_rate": 1.875499836746436e-06, + "loss": 0.0213, + "step": 41084 + }, + { + "epoch": 0.82172, + "grad_norm": 0.0006250101723708212, + "learning_rate": 1.8746858513081733e-06, + "loss": 0.0, + "step": 41086 + }, + { + "epoch": 0.82176, + "grad_norm": 0.010479368269443512, + "learning_rate": 1.8738720242770926e-06, + "loss": 0.0004, + "step": 41088 + }, + { + "epoch": 0.8218, + "grad_norm": 0.0002823368413373828, + "learning_rate": 1.8730583556690607e-06, + "loss": 0.0034, + "step": 41090 + }, + { + "epoch": 0.82184, + "grad_norm": 0.2946648895740509, + "learning_rate": 1.8722448454999409e-06, + "loss": 0.004, + "step": 41092 + }, + { + "epoch": 0.82188, + "grad_norm": 0.10201944410800934, + "learning_rate": 1.8714314937855938e-06, + "loss": 0.0014, + "step": 41094 + }, + { + "epoch": 0.82192, + "grad_norm": 0.03333418443799019, + "learning_rate": 1.8706183005418699e-06, + "loss": 0.038, + "step": 41096 + }, + { + "epoch": 0.82196, + "grad_norm": 0.0009647415718063712, + "learning_rate": 1.8698052657846323e-06, + "loss": 0.0006, + "step": 41098 + }, + { + "epoch": 0.822, + "grad_norm": 9.894988059997559, + "learning_rate": 1.8689923895297247e-06, + "loss": 0.0963, + "step": 41100 + }, + { + "epoch": 0.82204, + "grad_norm": 12.402405738830566, + "learning_rate": 1.8681796717929956e-06, + "loss": 0.2516, + "step": 41102 + }, + { + "epoch": 0.82208, + "grad_norm": 0.047372594475746155, + "learning_rate": 1.8673671125902937e-06, + "loss": 0.0022, + "step": 41104 + }, + { + "epoch": 0.82212, + "grad_norm": 0.01940496638417244, + "learning_rate": 1.8665547119374516e-06, + "loss": 0.0011, + "step": 41106 + }, + { + "epoch": 0.82216, + "grad_norm": 0.14041896164417267, + "learning_rate": 1.865742469850318e-06, + "loss": 0.0016, + "step": 41108 + }, + { + "epoch": 0.8222, + "grad_norm": 0.04280690848827362, + "learning_rate": 1.86493038634472e-06, + "loss": 0.0003, + "step": 41110 + }, + { + "epoch": 0.82224, + "grad_norm": 0.12583567202091217, + "learning_rate": 1.8641184614364926e-06, + "loss": 0.0047, + "step": 41112 + }, + { + "epoch": 0.82228, + "grad_norm": 0.21954117715358734, + "learning_rate": 1.8633066951414647e-06, + "loss": 0.8983, + "step": 41114 + }, + { + "epoch": 0.82232, + "grad_norm": 0.559812605381012, + "learning_rate": 1.8624950874754644e-06, + "loss": 0.0051, + "step": 41116 + }, + { + "epoch": 0.82236, + "grad_norm": 0.12320636212825775, + "learning_rate": 1.861683638454308e-06, + "loss": 0.006, + "step": 41118 + }, + { + "epoch": 0.8224, + "grad_norm": 14.80784797668457, + "learning_rate": 1.8608723480938207e-06, + "loss": 0.6311, + "step": 41120 + }, + { + "epoch": 0.82244, + "grad_norm": 0.05618329718708992, + "learning_rate": 1.8600612164098165e-06, + "loss": 0.0078, + "step": 41122 + }, + { + "epoch": 0.82248, + "grad_norm": 19.244522094726562, + "learning_rate": 1.8592502434181092e-06, + "loss": 0.7347, + "step": 41124 + }, + { + "epoch": 0.82252, + "grad_norm": 0.1297149658203125, + "learning_rate": 1.858439429134512e-06, + "loss": 0.0013, + "step": 41126 + }, + { + "epoch": 0.82256, + "grad_norm": 0.09779436141252518, + "learning_rate": 1.8576287735748255e-06, + "loss": 0.0019, + "step": 41128 + }, + { + "epoch": 0.8226, + "grad_norm": 0.0031969526316970587, + "learning_rate": 1.8568182767548626e-06, + "loss": 0.0005, + "step": 41130 + }, + { + "epoch": 0.82264, + "grad_norm": 0.26201605796813965, + "learning_rate": 1.8560079386904173e-06, + "loss": 0.0054, + "step": 41132 + }, + { + "epoch": 0.82268, + "grad_norm": 0.018444588407874107, + "learning_rate": 1.8551977593972903e-06, + "loss": 0.0004, + "step": 41134 + }, + { + "epoch": 0.82272, + "grad_norm": 0.048011764883995056, + "learning_rate": 1.8543877388912756e-06, + "loss": 0.0021, + "step": 41136 + }, + { + "epoch": 0.82276, + "grad_norm": 0.015004988759756088, + "learning_rate": 1.8535778771881652e-06, + "loss": 0.0018, + "step": 41138 + }, + { + "epoch": 0.8228, + "grad_norm": 0.0348891019821167, + "learning_rate": 1.8527681743037518e-06, + "loss": 0.0003, + "step": 41140 + }, + { + "epoch": 0.82284, + "grad_norm": 0.04013890027999878, + "learning_rate": 1.851958630253815e-06, + "loss": 0.0005, + "step": 41142 + }, + { + "epoch": 0.82288, + "grad_norm": 0.02937096171081066, + "learning_rate": 1.8511492450541379e-06, + "loss": 0.0033, + "step": 41144 + }, + { + "epoch": 0.82292, + "grad_norm": 0.04895791411399841, + "learning_rate": 1.8503400187205034e-06, + "loss": 0.0085, + "step": 41146 + }, + { + "epoch": 0.82296, + "grad_norm": 0.10563423484563828, + "learning_rate": 1.8495309512686854e-06, + "loss": 0.0022, + "step": 41148 + }, + { + "epoch": 0.823, + "grad_norm": 0.00681409053504467, + "learning_rate": 1.848722042714457e-06, + "loss": 0.0027, + "step": 41150 + }, + { + "epoch": 0.82304, + "grad_norm": 0.08235202729701996, + "learning_rate": 1.847913293073592e-06, + "loss": 0.0008, + "step": 41152 + }, + { + "epoch": 0.82308, + "grad_norm": 0.2403695285320282, + "learning_rate": 1.8471047023618516e-06, + "loss": 0.0023, + "step": 41154 + }, + { + "epoch": 0.82312, + "grad_norm": 0.02979101426899433, + "learning_rate": 1.8462962705950028e-06, + "loss": 0.0006, + "step": 41156 + }, + { + "epoch": 0.82316, + "grad_norm": 0.05746546760201454, + "learning_rate": 1.8454879977888085e-06, + "loss": 0.0012, + "step": 41158 + }, + { + "epoch": 0.8232, + "grad_norm": 11.489385604858398, + "learning_rate": 1.8446798839590186e-06, + "loss": 0.2517, + "step": 41160 + }, + { + "epoch": 0.82324, + "grad_norm": 1.0548641681671143, + "learning_rate": 1.843871929121398e-06, + "loss": 0.0109, + "step": 41162 + }, + { + "epoch": 0.82328, + "grad_norm": 0.011194063350558281, + "learning_rate": 1.8430641332916877e-06, + "loss": 0.0008, + "step": 41164 + }, + { + "epoch": 0.82332, + "grad_norm": 0.2559393048286438, + "learning_rate": 1.8422564964856481e-06, + "loss": 0.0051, + "step": 41166 + }, + { + "epoch": 0.82336, + "grad_norm": 0.031444817781448364, + "learning_rate": 1.8414490187190137e-06, + "loss": 0.0003, + "step": 41168 + }, + { + "epoch": 0.8234, + "grad_norm": 0.005868889857083559, + "learning_rate": 1.8406417000075327e-06, + "loss": 0.005, + "step": 41170 + }, + { + "epoch": 0.82344, + "grad_norm": 1.2962716817855835, + "learning_rate": 1.8398345403669437e-06, + "loss": 0.0149, + "step": 41172 + }, + { + "epoch": 0.82348, + "grad_norm": 0.45652279257774353, + "learning_rate": 1.8390275398129798e-06, + "loss": 0.0057, + "step": 41174 + }, + { + "epoch": 0.82352, + "grad_norm": 0.15597105026245117, + "learning_rate": 1.838220698361375e-06, + "loss": 0.0015, + "step": 41176 + }, + { + "epoch": 0.82356, + "grad_norm": 0.21496395766735077, + "learning_rate": 1.8374140160278598e-06, + "loss": 0.0024, + "step": 41178 + }, + { + "epoch": 0.8236, + "grad_norm": 2.035144090652466, + "learning_rate": 1.8366074928281608e-06, + "loss": 0.0189, + "step": 41180 + }, + { + "epoch": 0.82364, + "grad_norm": 0.6576247215270996, + "learning_rate": 1.835801128778001e-06, + "loss": 0.0086, + "step": 41182 + }, + { + "epoch": 0.82368, + "grad_norm": 0.008973336778581142, + "learning_rate": 1.8349949238931042e-06, + "loss": 0.1157, + "step": 41184 + }, + { + "epoch": 0.82372, + "grad_norm": 0.3750559985637665, + "learning_rate": 1.8341888781891836e-06, + "loss": 0.0036, + "step": 41186 + }, + { + "epoch": 0.82376, + "grad_norm": 0.01680944301187992, + "learning_rate": 1.8333829916819533e-06, + "loss": 0.0002, + "step": 41188 + }, + { + "epoch": 0.8238, + "grad_norm": 0.21145206689834595, + "learning_rate": 1.8325772643871264e-06, + "loss": 0.0026, + "step": 41190 + }, + { + "epoch": 0.82384, + "grad_norm": 0.1842556595802307, + "learning_rate": 1.8317716963204114e-06, + "loss": 0.0059, + "step": 41192 + }, + { + "epoch": 0.82388, + "grad_norm": 0.018060917034745216, + "learning_rate": 1.8309662874975142e-06, + "loss": 0.0002, + "step": 41194 + }, + { + "epoch": 0.82392, + "grad_norm": 0.18297691643238068, + "learning_rate": 1.8301610379341307e-06, + "loss": 0.0102, + "step": 41196 + }, + { + "epoch": 0.82396, + "grad_norm": 0.1058768630027771, + "learning_rate": 1.8293559476459676e-06, + "loss": 0.0011, + "step": 41198 + }, + { + "epoch": 0.824, + "grad_norm": 0.00396683719009161, + "learning_rate": 1.8285510166487154e-06, + "loss": 0.0012, + "step": 41200 + }, + { + "epoch": 0.82404, + "grad_norm": 0.10579708963632584, + "learning_rate": 1.8277462449580675e-06, + "loss": 0.0025, + "step": 41202 + }, + { + "epoch": 0.82408, + "grad_norm": 0.03953680768609047, + "learning_rate": 1.826941632589715e-06, + "loss": 0.0794, + "step": 41204 + }, + { + "epoch": 0.82412, + "grad_norm": 0.045212533324956894, + "learning_rate": 1.8261371795593442e-06, + "loss": 0.001, + "step": 41206 + }, + { + "epoch": 0.82416, + "grad_norm": 0.0435299426317215, + "learning_rate": 1.8253328858826357e-06, + "loss": 0.0005, + "step": 41208 + }, + { + "epoch": 0.8242, + "grad_norm": 0.19549278914928436, + "learning_rate": 1.8245287515752708e-06, + "loss": 0.0125, + "step": 41210 + }, + { + "epoch": 0.82424, + "grad_norm": 0.0024470242206007242, + "learning_rate": 1.8237247766529265e-06, + "loss": 0.0015, + "step": 41212 + }, + { + "epoch": 0.82428, + "grad_norm": 0.24130012094974518, + "learning_rate": 1.822920961131278e-06, + "loss": 0.0046, + "step": 41214 + }, + { + "epoch": 0.82432, + "grad_norm": 0.1120024174451828, + "learning_rate": 1.8221173050259976e-06, + "loss": 0.0077, + "step": 41216 + }, + { + "epoch": 0.82436, + "grad_norm": 0.0021931491792201996, + "learning_rate": 1.821313808352745e-06, + "loss": 0.0009, + "step": 41218 + }, + { + "epoch": 0.8244, + "grad_norm": 2.1036155223846436, + "learning_rate": 1.820510471127196e-06, + "loss": 0.0242, + "step": 41220 + }, + { + "epoch": 0.82444, + "grad_norm": 0.043570101261138916, + "learning_rate": 1.8197072933650028e-06, + "loss": 0.001, + "step": 41222 + }, + { + "epoch": 0.82448, + "grad_norm": 0.029301151633262634, + "learning_rate": 1.8189042750818287e-06, + "loss": 0.0003, + "step": 41224 + }, + { + "epoch": 0.82452, + "grad_norm": 0.12221525609493256, + "learning_rate": 1.8181014162933297e-06, + "loss": 0.0015, + "step": 41226 + }, + { + "epoch": 0.82456, + "grad_norm": 0.14287982881069183, + "learning_rate": 1.8172987170151514e-06, + "loss": 0.0047, + "step": 41228 + }, + { + "epoch": 0.8246, + "grad_norm": 0.04203531891107559, + "learning_rate": 1.816496177262952e-06, + "loss": 0.0006, + "step": 41230 + }, + { + "epoch": 0.82464, + "grad_norm": 0.17000159621238708, + "learning_rate": 1.8156937970523702e-06, + "loss": 0.0017, + "step": 41232 + }, + { + "epoch": 0.82468, + "grad_norm": 0.009493180550634861, + "learning_rate": 1.8148915763990505e-06, + "loss": 0.0002, + "step": 41234 + }, + { + "epoch": 0.82472, + "grad_norm": 0.24444226920604706, + "learning_rate": 1.8140895153186344e-06, + "loss": 0.0032, + "step": 41236 + }, + { + "epoch": 0.82476, + "grad_norm": 0.013660451397299767, + "learning_rate": 1.8132876138267574e-06, + "loss": 0.0022, + "step": 41238 + }, + { + "epoch": 0.8248, + "grad_norm": 0.03459491208195686, + "learning_rate": 1.812485871939056e-06, + "loss": 0.006, + "step": 41240 + }, + { + "epoch": 0.82484, + "grad_norm": 0.7612330317497253, + "learning_rate": 1.811684289671155e-06, + "loss": 0.0089, + "step": 41242 + }, + { + "epoch": 0.82488, + "grad_norm": 0.05424916744232178, + "learning_rate": 1.8108828670386847e-06, + "loss": 0.0007, + "step": 41244 + }, + { + "epoch": 0.82492, + "grad_norm": 0.3895120620727539, + "learning_rate": 1.8100816040572688e-06, + "loss": 0.0038, + "step": 41246 + }, + { + "epoch": 0.82496, + "grad_norm": 0.18534733355045319, + "learning_rate": 1.8092805007425307e-06, + "loss": 0.0021, + "step": 41248 + }, + { + "epoch": 0.825, + "grad_norm": 0.23987974226474762, + "learning_rate": 1.808479557110081e-06, + "loss": 0.0028, + "step": 41250 + }, + { + "epoch": 0.82504, + "grad_norm": 0.011968418955802917, + "learning_rate": 1.807678773175545e-06, + "loss": 0.0002, + "step": 41252 + }, + { + "epoch": 0.82508, + "grad_norm": 0.2193831354379654, + "learning_rate": 1.8068781489545273e-06, + "loss": 0.0021, + "step": 41254 + }, + { + "epoch": 0.82512, + "grad_norm": 0.03218618780374527, + "learning_rate": 1.806077684462637e-06, + "loss": 0.0047, + "step": 41256 + }, + { + "epoch": 0.82516, + "grad_norm": 0.0581137016415596, + "learning_rate": 1.8052773797154833e-06, + "loss": 0.0007, + "step": 41258 + }, + { + "epoch": 0.8252, + "grad_norm": 0.07419241219758987, + "learning_rate": 1.804477234728661e-06, + "loss": 0.0027, + "step": 41260 + }, + { + "epoch": 0.82524, + "grad_norm": 0.14743027091026306, + "learning_rate": 1.8036772495177802e-06, + "loss": 0.004, + "step": 41262 + }, + { + "epoch": 0.82528, + "grad_norm": 0.14164945483207703, + "learning_rate": 1.8028774240984282e-06, + "loss": 0.0017, + "step": 41264 + }, + { + "epoch": 0.82532, + "grad_norm": 0.0710655227303505, + "learning_rate": 1.8020777584862004e-06, + "loss": 0.0013, + "step": 41266 + }, + { + "epoch": 0.82536, + "grad_norm": 0.073117196559906, + "learning_rate": 1.8012782526966877e-06, + "loss": 0.0029, + "step": 41268 + }, + { + "epoch": 0.8254, + "grad_norm": 0.3615918457508087, + "learning_rate": 1.8004789067454763e-06, + "loss": 0.0307, + "step": 41270 + }, + { + "epoch": 0.82544, + "grad_norm": 0.16993743181228638, + "learning_rate": 1.799679720648151e-06, + "loss": 0.0022, + "step": 41272 + }, + { + "epoch": 0.82548, + "grad_norm": 1.6396030187606812, + "learning_rate": 1.798880694420293e-06, + "loss": 0.0198, + "step": 41274 + }, + { + "epoch": 0.82552, + "grad_norm": 0.0038501673843711615, + "learning_rate": 1.7980818280774749e-06, + "loss": 0.0, + "step": 41276 + }, + { + "epoch": 0.82556, + "grad_norm": 4.387983798980713, + "learning_rate": 1.797283121635275e-06, + "loss": 0.0523, + "step": 41278 + }, + { + "epoch": 0.8256, + "grad_norm": 0.013717332854866982, + "learning_rate": 1.7964845751092663e-06, + "loss": 0.0002, + "step": 41280 + }, + { + "epoch": 0.82564, + "grad_norm": 2.34997820854187, + "learning_rate": 1.795686188515009e-06, + "loss": 0.0268, + "step": 41282 + }, + { + "epoch": 0.82568, + "grad_norm": 0.2428501397371292, + "learning_rate": 1.7948879618680781e-06, + "loss": 0.0017, + "step": 41284 + }, + { + "epoch": 0.82572, + "grad_norm": 0.7505718469619751, + "learning_rate": 1.7940898951840257e-06, + "loss": 0.008, + "step": 41286 + }, + { + "epoch": 0.82576, + "grad_norm": 0.02736750617623329, + "learning_rate": 1.7932919884784216e-06, + "loss": 0.001, + "step": 41288 + }, + { + "epoch": 0.8258, + "grad_norm": 0.03409423306584358, + "learning_rate": 1.7924942417668113e-06, + "loss": 0.001, + "step": 41290 + }, + { + "epoch": 0.82584, + "grad_norm": 0.04878401756286621, + "learning_rate": 1.7916966550647518e-06, + "loss": 0.0008, + "step": 41292 + }, + { + "epoch": 0.82588, + "grad_norm": 0.3909296691417694, + "learning_rate": 1.7908992283877946e-06, + "loss": 0.0055, + "step": 41294 + }, + { + "epoch": 0.82592, + "grad_norm": 0.8666507005691528, + "learning_rate": 1.7901019617514815e-06, + "loss": 0.0235, + "step": 41296 + }, + { + "epoch": 0.82596, + "grad_norm": 0.012156969867646694, + "learning_rate": 1.7893048551713566e-06, + "loss": 0.0031, + "step": 41298 + }, + { + "epoch": 0.826, + "grad_norm": 0.041405849158763885, + "learning_rate": 1.7885079086629598e-06, + "loss": 0.0007, + "step": 41300 + }, + { + "epoch": 0.82604, + "grad_norm": 0.0008252176339738071, + "learning_rate": 1.7877111222418298e-06, + "loss": 0.0056, + "step": 41302 + }, + { + "epoch": 0.82608, + "grad_norm": 0.02392655797302723, + "learning_rate": 1.7869144959234996e-06, + "loss": 0.0002, + "step": 41304 + }, + { + "epoch": 0.82612, + "grad_norm": 0.004729226231575012, + "learning_rate": 1.7861180297235025e-06, + "loss": 0.0002, + "step": 41306 + }, + { + "epoch": 0.82616, + "grad_norm": 0.04826056957244873, + "learning_rate": 1.7853217236573594e-06, + "loss": 0.0024, + "step": 41308 + }, + { + "epoch": 0.8262, + "grad_norm": 0.003416006453335285, + "learning_rate": 1.7845255777406e-06, + "loss": 0.0017, + "step": 41310 + }, + { + "epoch": 0.82624, + "grad_norm": 0.056297894567251205, + "learning_rate": 1.7837295919887454e-06, + "loss": 0.0009, + "step": 41312 + }, + { + "epoch": 0.82628, + "grad_norm": 0.017202110961079597, + "learning_rate": 1.7829337664173086e-06, + "loss": 0.0003, + "step": 41314 + }, + { + "epoch": 0.82632, + "grad_norm": 0.2567130923271179, + "learning_rate": 1.7821381010418126e-06, + "loss": 0.0034, + "step": 41316 + }, + { + "epoch": 0.82636, + "grad_norm": 1.1957752704620361, + "learning_rate": 1.7813425958777597e-06, + "loss": 0.0148, + "step": 41318 + }, + { + "epoch": 0.8264, + "grad_norm": 0.033727120608091354, + "learning_rate": 1.7805472509406695e-06, + "loss": 0.0006, + "step": 41320 + }, + { + "epoch": 0.82644, + "grad_norm": 10.999728202819824, + "learning_rate": 1.7797520662460411e-06, + "loss": 0.1632, + "step": 41322 + }, + { + "epoch": 0.82648, + "grad_norm": 0.18408508598804474, + "learning_rate": 1.7789570418093772e-06, + "loss": 0.0051, + "step": 41324 + }, + { + "epoch": 0.82652, + "grad_norm": 0.03336691856384277, + "learning_rate": 1.77816217764618e-06, + "loss": 0.0007, + "step": 41326 + }, + { + "epoch": 0.82656, + "grad_norm": 0.09880004823207855, + "learning_rate": 1.7773674737719426e-06, + "loss": 0.0016, + "step": 41328 + }, + { + "epoch": 0.8266, + "grad_norm": 0.006599291693419218, + "learning_rate": 1.7765729302021596e-06, + "loss": 0.0012, + "step": 41330 + }, + { + "epoch": 0.82664, + "grad_norm": 0.18464449048042297, + "learning_rate": 1.7757785469523203e-06, + "loss": 0.0018, + "step": 41332 + }, + { + "epoch": 0.82668, + "grad_norm": 19.115760803222656, + "learning_rate": 1.7749843240379128e-06, + "loss": 0.4443, + "step": 41334 + }, + { + "epoch": 0.82672, + "grad_norm": 1.7571851015090942, + "learning_rate": 1.77419026147442e-06, + "loss": 0.0149, + "step": 41336 + }, + { + "epoch": 0.82676, + "grad_norm": 0.002642508829012513, + "learning_rate": 1.7733963592773263e-06, + "loss": 0.0001, + "step": 41338 + }, + { + "epoch": 0.8268, + "grad_norm": 6.405312538146973, + "learning_rate": 1.7726026174621004e-06, + "loss": 0.1636, + "step": 41340 + }, + { + "epoch": 0.82684, + "grad_norm": 0.18397657573223114, + "learning_rate": 1.7718090360442275e-06, + "loss": 0.0023, + "step": 41342 + }, + { + "epoch": 0.82688, + "grad_norm": 0.06729254871606827, + "learning_rate": 1.77101561503917e-06, + "loss": 0.0007, + "step": 41344 + }, + { + "epoch": 0.82692, + "grad_norm": 0.009701604023575783, + "learning_rate": 1.7702223544624008e-06, + "loss": 0.0151, + "step": 41346 + }, + { + "epoch": 0.82696, + "grad_norm": 0.02625950239598751, + "learning_rate": 1.7694292543293857e-06, + "loss": 0.0151, + "step": 41348 + }, + { + "epoch": 0.827, + "grad_norm": 0.016862956807017326, + "learning_rate": 1.7686363146555807e-06, + "loss": 0.0012, + "step": 41350 + }, + { + "epoch": 0.82704, + "grad_norm": 0.011372131295502186, + "learning_rate": 1.7678435354564528e-06, + "loss": 0.0011, + "step": 41352 + }, + { + "epoch": 0.82708, + "grad_norm": 0.1617102473974228, + "learning_rate": 1.7670509167474503e-06, + "loss": 0.0047, + "step": 41354 + }, + { + "epoch": 0.82712, + "grad_norm": 0.07647239416837692, + "learning_rate": 1.7662584585440302e-06, + "loss": 0.001, + "step": 41356 + }, + { + "epoch": 0.82716, + "grad_norm": 0.22075513005256653, + "learning_rate": 1.7654661608616398e-06, + "loss": 0.0033, + "step": 41358 + }, + { + "epoch": 0.8272, + "grad_norm": 0.030770564451813698, + "learning_rate": 1.7646740237157256e-06, + "loss": 0.0043, + "step": 41360 + }, + { + "epoch": 0.82724, + "grad_norm": 14.744437217712402, + "learning_rate": 1.7638820471217332e-06, + "loss": 0.3191, + "step": 41362 + }, + { + "epoch": 0.82728, + "grad_norm": 0.07320283353328705, + "learning_rate": 1.7630902310950993e-06, + "loss": 0.0017, + "step": 41364 + }, + { + "epoch": 0.82732, + "grad_norm": 0.024458538740873337, + "learning_rate": 1.7622985756512613e-06, + "loss": 0.0006, + "step": 41366 + }, + { + "epoch": 0.82736, + "grad_norm": 0.04348212108016014, + "learning_rate": 1.7615070808056533e-06, + "loss": 0.0015, + "step": 41368 + }, + { + "epoch": 0.8274, + "grad_norm": 0.1441025286912918, + "learning_rate": 1.760715746573709e-06, + "loss": 0.0011, + "step": 41370 + }, + { + "epoch": 0.82744, + "grad_norm": 0.6506040096282959, + "learning_rate": 1.7599245729708491e-06, + "loss": 0.0072, + "step": 41372 + }, + { + "epoch": 0.82748, + "grad_norm": 0.18470297753810883, + "learning_rate": 1.7591335600125058e-06, + "loss": 0.0022, + "step": 41374 + }, + { + "epoch": 0.82752, + "grad_norm": 2.633981227874756, + "learning_rate": 1.7583427077140947e-06, + "loss": 0.0244, + "step": 41376 + }, + { + "epoch": 0.82756, + "grad_norm": 0.17491614818572998, + "learning_rate": 1.7575520160910341e-06, + "loss": 0.006, + "step": 41378 + }, + { + "epoch": 0.8276, + "grad_norm": 0.7305518388748169, + "learning_rate": 1.7567614851587444e-06, + "loss": 0.0063, + "step": 41380 + }, + { + "epoch": 0.82764, + "grad_norm": 0.11351778358221054, + "learning_rate": 1.7559711149326276e-06, + "loss": 0.0018, + "step": 41382 + }, + { + "epoch": 0.82768, + "grad_norm": 0.011882133781909943, + "learning_rate": 1.7551809054281045e-06, + "loss": 0.0066, + "step": 41384 + }, + { + "epoch": 0.82772, + "grad_norm": 0.03750584274530411, + "learning_rate": 1.7543908566605705e-06, + "loss": 0.0015, + "step": 41386 + }, + { + "epoch": 0.82776, + "grad_norm": 0.1168479323387146, + "learning_rate": 1.7536009686454325e-06, + "loss": 0.0046, + "step": 41388 + }, + { + "epoch": 0.8278, + "grad_norm": 0.04459555074572563, + "learning_rate": 1.7528112413980892e-06, + "loss": 0.0007, + "step": 41390 + }, + { + "epoch": 0.82784, + "grad_norm": 0.8688352108001709, + "learning_rate": 1.7520216749339357e-06, + "loss": 0.0093, + "step": 41392 + }, + { + "epoch": 0.82788, + "grad_norm": 0.16860410571098328, + "learning_rate": 1.7512322692683702e-06, + "loss": 0.0018, + "step": 41394 + }, + { + "epoch": 0.82792, + "grad_norm": 0.019998818635940552, + "learning_rate": 1.7504430244167747e-06, + "loss": 0.0009, + "step": 41396 + }, + { + "epoch": 0.82796, + "grad_norm": 0.09672877192497253, + "learning_rate": 1.74965394039454e-06, + "loss": 0.0028, + "step": 41398 + }, + { + "epoch": 0.828, + "grad_norm": 2.200066089630127, + "learning_rate": 1.7488650172170496e-06, + "loss": 0.0195, + "step": 41400 + }, + { + "epoch": 0.82804, + "grad_norm": 0.1010538712143898, + "learning_rate": 1.7480762548996855e-06, + "loss": 0.0059, + "step": 41402 + }, + { + "epoch": 0.82808, + "grad_norm": 0.11818861961364746, + "learning_rate": 1.7472876534578177e-06, + "loss": 0.0018, + "step": 41404 + }, + { + "epoch": 0.82812, + "grad_norm": 0.1277223527431488, + "learning_rate": 1.7464992129068327e-06, + "loss": 0.0027, + "step": 41406 + }, + { + "epoch": 0.82816, + "grad_norm": 0.004742464050650597, + "learning_rate": 1.7457109332620881e-06, + "loss": 0.0006, + "step": 41408 + }, + { + "epoch": 0.8282, + "grad_norm": 0.003209729678928852, + "learning_rate": 1.744922814538964e-06, + "loss": 0.1377, + "step": 41410 + }, + { + "epoch": 0.82824, + "grad_norm": 0.06591405719518661, + "learning_rate": 1.7441348567528172e-06, + "loss": 0.001, + "step": 41412 + }, + { + "epoch": 0.82828, + "grad_norm": 0.34997087717056274, + "learning_rate": 1.7433470599190117e-06, + "loss": 0.004, + "step": 41414 + }, + { + "epoch": 0.82832, + "grad_norm": 0.10648290067911148, + "learning_rate": 1.7425594240529087e-06, + "loss": 0.0011, + "step": 41416 + }, + { + "epoch": 0.82836, + "grad_norm": 14.789148330688477, + "learning_rate": 1.741771949169858e-06, + "loss": 0.1896, + "step": 41418 + }, + { + "epoch": 0.8284, + "grad_norm": 0.2943202257156372, + "learning_rate": 1.7409846352852144e-06, + "loss": 0.0028, + "step": 41420 + }, + { + "epoch": 0.82844, + "grad_norm": 0.002422054298222065, + "learning_rate": 1.7401974824143286e-06, + "loss": 0.0007, + "step": 41422 + }, + { + "epoch": 0.82848, + "grad_norm": 0.1049223244190216, + "learning_rate": 1.739410490572545e-06, + "loss": 0.001, + "step": 41424 + }, + { + "epoch": 0.82852, + "grad_norm": 0.004844437353312969, + "learning_rate": 1.738623659775206e-06, + "loss": 0.0002, + "step": 41426 + }, + { + "epoch": 0.82856, + "grad_norm": 0.17884503304958344, + "learning_rate": 1.7378369900376557e-06, + "loss": 0.0147, + "step": 41428 + }, + { + "epoch": 0.8286, + "grad_norm": 0.08358876407146454, + "learning_rate": 1.7370504813752232e-06, + "loss": 0.0026, + "step": 41430 + }, + { + "epoch": 0.82864, + "grad_norm": 0.04287567362189293, + "learning_rate": 1.736264133803246e-06, + "loss": 0.0005, + "step": 41432 + }, + { + "epoch": 0.82868, + "grad_norm": 0.02722402662038803, + "learning_rate": 1.7354779473370542e-06, + "loss": 0.0003, + "step": 41434 + }, + { + "epoch": 0.82872, + "grad_norm": 0.4238235652446747, + "learning_rate": 1.7346919219919744e-06, + "loss": 0.0061, + "step": 41436 + }, + { + "epoch": 0.82876, + "grad_norm": 0.08611765503883362, + "learning_rate": 1.7339060577833334e-06, + "loss": 0.001, + "step": 41438 + }, + { + "epoch": 0.8288, + "grad_norm": 0.09598975628614426, + "learning_rate": 1.7331203547264452e-06, + "loss": 0.0531, + "step": 41440 + }, + { + "epoch": 0.82884, + "grad_norm": 0.09712383896112442, + "learning_rate": 1.7323348128366358e-06, + "loss": 0.0017, + "step": 41442 + }, + { + "epoch": 0.82888, + "grad_norm": 2.0771684646606445, + "learning_rate": 1.7315494321292136e-06, + "loss": 0.0315, + "step": 41444 + }, + { + "epoch": 0.82892, + "grad_norm": 0.037800103425979614, + "learning_rate": 1.7307642126194924e-06, + "loss": 0.0061, + "step": 41446 + }, + { + "epoch": 0.82896, + "grad_norm": 0.19393625855445862, + "learning_rate": 1.7299791543227817e-06, + "loss": 0.0051, + "step": 41448 + }, + { + "epoch": 0.829, + "grad_norm": 0.31672731041908264, + "learning_rate": 1.7291942572543806e-06, + "loss": 0.0048, + "step": 41450 + }, + { + "epoch": 0.82904, + "grad_norm": 0.18426457047462463, + "learning_rate": 1.7284095214296015e-06, + "loss": 0.0076, + "step": 41452 + }, + { + "epoch": 0.82908, + "grad_norm": 0.0188264399766922, + "learning_rate": 1.7276249468637329e-06, + "loss": 0.0008, + "step": 41454 + }, + { + "epoch": 0.82912, + "grad_norm": 0.06345869600772858, + "learning_rate": 1.7268405335720762e-06, + "loss": 0.0008, + "step": 41456 + }, + { + "epoch": 0.82916, + "grad_norm": 0.029214832931756973, + "learning_rate": 1.7260562815699223e-06, + "loss": 0.0017, + "step": 41458 + }, + { + "epoch": 0.8292, + "grad_norm": 0.004979033954441547, + "learning_rate": 1.7252721908725633e-06, + "loss": 0.0008, + "step": 41460 + }, + { + "epoch": 0.82924, + "grad_norm": 0.34155139327049255, + "learning_rate": 1.7244882614952808e-06, + "loss": 0.0447, + "step": 41462 + }, + { + "epoch": 0.82928, + "grad_norm": 0.04379443824291229, + "learning_rate": 1.7237044934533598e-06, + "loss": 0.0004, + "step": 41464 + }, + { + "epoch": 0.82932, + "grad_norm": 0.060091789811849594, + "learning_rate": 1.7229208867620817e-06, + "loss": 0.0007, + "step": 41466 + }, + { + "epoch": 0.82936, + "grad_norm": 0.04753374680876732, + "learning_rate": 1.7221374414367208e-06, + "loss": 0.0018, + "step": 41468 + }, + { + "epoch": 0.8294, + "grad_norm": 0.03140515834093094, + "learning_rate": 1.7213541574925551e-06, + "loss": 0.0004, + "step": 41470 + }, + { + "epoch": 0.82944, + "grad_norm": 0.057839568704366684, + "learning_rate": 1.720571034944848e-06, + "loss": 0.065, + "step": 41472 + }, + { + "epoch": 0.82948, + "grad_norm": 1.0210357904434204, + "learning_rate": 1.7197880738088757e-06, + "loss": 0.0086, + "step": 41474 + }, + { + "epoch": 0.82952, + "grad_norm": 0.1387186199426651, + "learning_rate": 1.7190052740998953e-06, + "loss": 0.0026, + "step": 41476 + }, + { + "epoch": 0.82956, + "grad_norm": 0.02129155956208706, + "learning_rate": 1.71822263583317e-06, + "loss": 0.0003, + "step": 41478 + }, + { + "epoch": 0.8296, + "grad_norm": 0.2984957695007324, + "learning_rate": 1.7174401590239587e-06, + "loss": 0.0026, + "step": 41480 + }, + { + "epoch": 0.82964, + "grad_norm": 0.018368136137723923, + "learning_rate": 1.7166578436875147e-06, + "loss": 0.0717, + "step": 41482 + }, + { + "epoch": 0.82968, + "grad_norm": 0.028833860531449318, + "learning_rate": 1.7158756898390939e-06, + "loss": 0.0004, + "step": 41484 + }, + { + "epoch": 0.82972, + "grad_norm": 0.25017601251602173, + "learning_rate": 1.7150936974939392e-06, + "loss": 0.0061, + "step": 41486 + }, + { + "epoch": 0.82976, + "grad_norm": 0.150814950466156, + "learning_rate": 1.7143118666672975e-06, + "loss": 0.0021, + "step": 41488 + }, + { + "epoch": 0.8298, + "grad_norm": 1.0006741285324097, + "learning_rate": 1.7135301973744122e-06, + "loss": 0.0118, + "step": 41490 + }, + { + "epoch": 0.82984, + "grad_norm": 0.09005827456712723, + "learning_rate": 1.7127486896305246e-06, + "loss": 0.0036, + "step": 41492 + }, + { + "epoch": 0.82988, + "grad_norm": 0.00909881666302681, + "learning_rate": 1.7119673434508632e-06, + "loss": 0.0269, + "step": 41494 + }, + { + "epoch": 0.82992, + "grad_norm": 0.131677508354187, + "learning_rate": 1.7111861588506695e-06, + "loss": 0.0018, + "step": 41496 + }, + { + "epoch": 0.82996, + "grad_norm": 0.024832407012581825, + "learning_rate": 1.7104051358451668e-06, + "loss": 0.001, + "step": 41498 + }, + { + "epoch": 0.83, + "grad_norm": 0.001552559551782906, + "learning_rate": 1.709624274449584e-06, + "loss": 0.0005, + "step": 41500 + }, + { + "epoch": 0.83004, + "grad_norm": 0.667356550693512, + "learning_rate": 1.7088435746791454e-06, + "loss": 0.0068, + "step": 41502 + }, + { + "epoch": 0.83008, + "grad_norm": 0.06029976159334183, + "learning_rate": 1.7080630365490658e-06, + "loss": 0.0037, + "step": 41504 + }, + { + "epoch": 0.83012, + "grad_norm": 0.0008612662786617875, + "learning_rate": 1.7072826600745719e-06, + "loss": 0.001, + "step": 41506 + }, + { + "epoch": 0.83016, + "grad_norm": 0.45751237869262695, + "learning_rate": 1.706502445270869e-06, + "loss": 0.0095, + "step": 41508 + }, + { + "epoch": 0.8302, + "grad_norm": 0.023354753851890564, + "learning_rate": 1.7057223921531706e-06, + "loss": 0.0006, + "step": 41510 + }, + { + "epoch": 0.83024, + "grad_norm": 0.14968429505825043, + "learning_rate": 1.7049425007366838e-06, + "loss": 0.0132, + "step": 41512 + }, + { + "epoch": 0.83028, + "grad_norm": 20.93052864074707, + "learning_rate": 1.7041627710366137e-06, + "loss": 0.4886, + "step": 41514 + }, + { + "epoch": 0.83032, + "grad_norm": 0.4868069589138031, + "learning_rate": 1.7033832030681652e-06, + "loss": 0.0074, + "step": 41516 + }, + { + "epoch": 0.83036, + "grad_norm": 0.04123077541589737, + "learning_rate": 1.7026037968465281e-06, + "loss": 0.0005, + "step": 41518 + }, + { + "epoch": 0.8304, + "grad_norm": 1.159233570098877, + "learning_rate": 1.7018245523869038e-06, + "loss": 0.0117, + "step": 41520 + }, + { + "epoch": 0.83044, + "grad_norm": 0.00020901445532217622, + "learning_rate": 1.7010454697044809e-06, + "loss": 0.003, + "step": 41522 + }, + { + "epoch": 0.83048, + "grad_norm": 0.06635459512472153, + "learning_rate": 1.700266548814451e-06, + "loss": 0.0008, + "step": 41524 + }, + { + "epoch": 0.83052, + "grad_norm": 0.16979274153709412, + "learning_rate": 1.6994877897319939e-06, + "loss": 0.0017, + "step": 41526 + }, + { + "epoch": 0.83056, + "grad_norm": 1.0400465726852417, + "learning_rate": 1.6987091924722999e-06, + "loss": 0.0779, + "step": 41528 + }, + { + "epoch": 0.8306, + "grad_norm": 0.649001955986023, + "learning_rate": 1.6979307570505422e-06, + "loss": 0.0096, + "step": 41530 + }, + { + "epoch": 0.83064, + "grad_norm": 0.016842197626829147, + "learning_rate": 1.6971524834818975e-06, + "loss": 0.0011, + "step": 41532 + }, + { + "epoch": 0.83068, + "grad_norm": 0.4616349935531616, + "learning_rate": 1.6963743717815406e-06, + "loss": 0.0052, + "step": 41534 + }, + { + "epoch": 0.83072, + "grad_norm": 0.003958732821047306, + "learning_rate": 1.6955964219646405e-06, + "loss": 0.0, + "step": 41536 + }, + { + "epoch": 0.83076, + "grad_norm": 0.010229837149381638, + "learning_rate": 1.6948186340463656e-06, + "loss": 0.0058, + "step": 41538 + }, + { + "epoch": 0.8308, + "grad_norm": 0.0042479149997234344, + "learning_rate": 1.6940410080418723e-06, + "loss": 0.0001, + "step": 41540 + }, + { + "epoch": 0.83084, + "grad_norm": 0.029673773795366287, + "learning_rate": 1.6932635439663313e-06, + "loss": 0.0339, + "step": 41542 + }, + { + "epoch": 0.83088, + "grad_norm": 0.006906392518430948, + "learning_rate": 1.6924862418348908e-06, + "loss": 0.0002, + "step": 41544 + }, + { + "epoch": 0.83092, + "grad_norm": 0.0044000339694321156, + "learning_rate": 1.6917091016627085e-06, + "loss": 0.0044, + "step": 41546 + }, + { + "epoch": 0.83096, + "grad_norm": 0.07743094116449356, + "learning_rate": 1.6909321234649344e-06, + "loss": 0.0014, + "step": 41548 + }, + { + "epoch": 0.831, + "grad_norm": 0.017827725037932396, + "learning_rate": 1.6901553072567189e-06, + "loss": 0.001, + "step": 41550 + }, + { + "epoch": 0.83104, + "grad_norm": 0.00783897191286087, + "learning_rate": 1.689378653053201e-06, + "loss": 0.0002, + "step": 41552 + }, + { + "epoch": 0.83108, + "grad_norm": 0.014981252141296864, + "learning_rate": 1.6886021608695257e-06, + "loss": 0.001, + "step": 41554 + }, + { + "epoch": 0.83112, + "grad_norm": 0.027863243594765663, + "learning_rate": 1.6878258307208295e-06, + "loss": 0.0022, + "step": 41556 + }, + { + "epoch": 0.83116, + "grad_norm": 0.10077191889286041, + "learning_rate": 1.6870496626222489e-06, + "loss": 0.0012, + "step": 41558 + }, + { + "epoch": 0.8312, + "grad_norm": 0.16478024423122406, + "learning_rate": 1.686273656588917e-06, + "loss": 0.0018, + "step": 41560 + }, + { + "epoch": 0.83124, + "grad_norm": 0.014517510309815407, + "learning_rate": 1.6854978126359556e-06, + "loss": 0.0025, + "step": 41562 + }, + { + "epoch": 0.83128, + "grad_norm": 0.014744896441698074, + "learning_rate": 1.6847221307784988e-06, + "loss": 0.0029, + "step": 41564 + }, + { + "epoch": 0.83132, + "grad_norm": 0.025317253544926643, + "learning_rate": 1.6839466110316639e-06, + "loss": 0.0038, + "step": 41566 + }, + { + "epoch": 0.83136, + "grad_norm": 0.10569531470537186, + "learning_rate": 1.6831712534105705e-06, + "loss": 0.0022, + "step": 41568 + }, + { + "epoch": 0.8314, + "grad_norm": 0.002041344763711095, + "learning_rate": 1.6823960579303378e-06, + "loss": 0.0039, + "step": 41570 + }, + { + "epoch": 0.83144, + "grad_norm": 0.0711650401353836, + "learning_rate": 1.6816210246060715e-06, + "loss": 0.003, + "step": 41572 + }, + { + "epoch": 0.83148, + "grad_norm": 1.5490797758102417, + "learning_rate": 1.6808461534528908e-06, + "loss": 0.0096, + "step": 41574 + }, + { + "epoch": 0.83152, + "grad_norm": 0.010104835964739323, + "learning_rate": 1.6800714444858945e-06, + "loss": 0.0003, + "step": 41576 + }, + { + "epoch": 0.83156, + "grad_norm": 0.033581312745809555, + "learning_rate": 1.6792968977201896e-06, + "loss": 0.0018, + "step": 41578 + }, + { + "epoch": 0.8316, + "grad_norm": 0.04772094264626503, + "learning_rate": 1.6785225131708749e-06, + "loss": 0.001, + "step": 41580 + }, + { + "epoch": 0.83164, + "grad_norm": 0.03382211551070213, + "learning_rate": 1.6777482908530508e-06, + "loss": 0.0005, + "step": 41582 + }, + { + "epoch": 0.83168, + "grad_norm": 1.1218806505203247, + "learning_rate": 1.676974230781805e-06, + "loss": 0.0085, + "step": 41584 + }, + { + "epoch": 0.83172, + "grad_norm": 0.08415393531322479, + "learning_rate": 1.6762003329722321e-06, + "loss": 0.0007, + "step": 41586 + }, + { + "epoch": 0.83176, + "grad_norm": 0.7023165822029114, + "learning_rate": 1.67542659743942e-06, + "loss": 0.0106, + "step": 41588 + }, + { + "epoch": 0.8318, + "grad_norm": 0.2504107356071472, + "learning_rate": 1.6746530241984504e-06, + "loss": 0.0037, + "step": 41590 + }, + { + "epoch": 0.83184, + "grad_norm": 0.009423820301890373, + "learning_rate": 1.6738796132644098e-06, + "loss": 0.0005, + "step": 41592 + }, + { + "epoch": 0.83188, + "grad_norm": 0.07594969123601913, + "learning_rate": 1.6731063646523682e-06, + "loss": 0.0027, + "step": 41594 + }, + { + "epoch": 0.83192, + "grad_norm": 0.03999447822570801, + "learning_rate": 1.6723332783774094e-06, + "loss": 0.0012, + "step": 41596 + }, + { + "epoch": 0.83196, + "grad_norm": 3.3053221702575684, + "learning_rate": 1.6715603544545977e-06, + "loss": 0.057, + "step": 41598 + }, + { + "epoch": 0.832, + "grad_norm": 0.00496123218908906, + "learning_rate": 1.6707875928990059e-06, + "loss": 0.0001, + "step": 41600 + }, + { + "epoch": 0.83204, + "grad_norm": 0.0031315158121287823, + "learning_rate": 1.6700149937256971e-06, + "loss": 0.0004, + "step": 41602 + }, + { + "epoch": 0.83208, + "grad_norm": 0.04898878559470177, + "learning_rate": 1.669242556949735e-06, + "loss": 0.0004, + "step": 41604 + }, + { + "epoch": 0.83212, + "grad_norm": 0.01144388783723116, + "learning_rate": 1.6684702825861798e-06, + "loss": 0.001, + "step": 41606 + }, + { + "epoch": 0.83216, + "grad_norm": 0.5227490067481995, + "learning_rate": 1.667698170650085e-06, + "loss": 0.0063, + "step": 41608 + }, + { + "epoch": 0.8322, + "grad_norm": 0.324901819229126, + "learning_rate": 1.666926221156503e-06, + "loss": 0.0055, + "step": 41610 + }, + { + "epoch": 0.83224, + "grad_norm": 0.019917519763112068, + "learning_rate": 1.6661544341204848e-06, + "loss": 0.0012, + "step": 41612 + }, + { + "epoch": 0.83228, + "grad_norm": 0.23678776621818542, + "learning_rate": 1.665382809557079e-06, + "loss": 0.0025, + "step": 41614 + }, + { + "epoch": 0.83232, + "grad_norm": 0.00024731754092499614, + "learning_rate": 1.6646113474813209e-06, + "loss": 0.0034, + "step": 41616 + }, + { + "epoch": 0.83236, + "grad_norm": 0.005908305291086435, + "learning_rate": 1.6638400479082607e-06, + "loss": 0.0004, + "step": 41618 + }, + { + "epoch": 0.8324, + "grad_norm": 0.025355197489261627, + "learning_rate": 1.6630689108529286e-06, + "loss": 0.0007, + "step": 41620 + }, + { + "epoch": 0.83244, + "grad_norm": 0.4598189890384674, + "learning_rate": 1.6622979363303605e-06, + "loss": 0.0112, + "step": 41622 + }, + { + "epoch": 0.83248, + "grad_norm": 0.07675281912088394, + "learning_rate": 1.6615271243555887e-06, + "loss": 0.0008, + "step": 41624 + }, + { + "epoch": 0.83252, + "grad_norm": 0.0013677689712494612, + "learning_rate": 1.6607564749436334e-06, + "loss": 0.0, + "step": 41626 + }, + { + "epoch": 0.83256, + "grad_norm": 0.01319247204810381, + "learning_rate": 1.6599859881095292e-06, + "loss": 0.2191, + "step": 41628 + }, + { + "epoch": 0.8326, + "grad_norm": 0.3928472101688385, + "learning_rate": 1.6592156638682887e-06, + "loss": 0.0043, + "step": 41630 + }, + { + "epoch": 0.83264, + "grad_norm": 0.021756822243332863, + "learning_rate": 1.6584455022349343e-06, + "loss": 0.0002, + "step": 41632 + }, + { + "epoch": 0.83268, + "grad_norm": 0.31435298919677734, + "learning_rate": 1.6576755032244786e-06, + "loss": 0.004, + "step": 41634 + }, + { + "epoch": 0.83272, + "grad_norm": 0.01788540929555893, + "learning_rate": 1.6569056668519334e-06, + "loss": 0.0105, + "step": 41636 + }, + { + "epoch": 0.83276, + "grad_norm": 3.8299975395202637, + "learning_rate": 1.6561359931323107e-06, + "loss": 0.0527, + "step": 41638 + }, + { + "epoch": 0.8328, + "grad_norm": 0.19850584864616394, + "learning_rate": 1.6553664820806102e-06, + "loss": 0.0018, + "step": 41640 + }, + { + "epoch": 0.83284, + "grad_norm": 0.3782351016998291, + "learning_rate": 1.654597133711836e-06, + "loss": 0.0035, + "step": 41642 + }, + { + "epoch": 0.83288, + "grad_norm": 0.032786983996629715, + "learning_rate": 1.653827948040987e-06, + "loss": 0.0068, + "step": 41644 + }, + { + "epoch": 0.83292, + "grad_norm": 0.0042536137625575066, + "learning_rate": 1.6530589250830597e-06, + "loss": 0.0004, + "step": 41646 + }, + { + "epoch": 0.83296, + "grad_norm": 0.004625913221389055, + "learning_rate": 1.652290064853045e-06, + "loss": 0.0006, + "step": 41648 + }, + { + "epoch": 0.833, + "grad_norm": 0.069521963596344, + "learning_rate": 1.651521367365936e-06, + "loss": 0.0032, + "step": 41650 + }, + { + "epoch": 0.83304, + "grad_norm": 0.5829141736030579, + "learning_rate": 1.650752832636714e-06, + "loss": 0.005, + "step": 41652 + }, + { + "epoch": 0.83308, + "grad_norm": 0.19596222043037415, + "learning_rate": 1.6499844606803627e-06, + "loss": 0.0042, + "step": 41654 + }, + { + "epoch": 0.83312, + "grad_norm": 0.12274515628814697, + "learning_rate": 1.6492162515118648e-06, + "loss": 0.0016, + "step": 41656 + }, + { + "epoch": 0.83316, + "grad_norm": 0.5326573848724365, + "learning_rate": 1.6484482051461947e-06, + "loss": 0.0067, + "step": 41658 + }, + { + "epoch": 0.8332, + "grad_norm": 0.03961324691772461, + "learning_rate": 1.6476803215983295e-06, + "loss": 0.0006, + "step": 41660 + }, + { + "epoch": 0.83324, + "grad_norm": 0.20036302506923676, + "learning_rate": 1.6469126008832303e-06, + "loss": 0.0022, + "step": 41662 + }, + { + "epoch": 0.83328, + "grad_norm": 0.007313919719308615, + "learning_rate": 1.6461450430158766e-06, + "loss": 0.0039, + "step": 41664 + }, + { + "epoch": 0.83332, + "grad_norm": 0.18075868487358093, + "learning_rate": 1.6453776480112227e-06, + "loss": 0.0021, + "step": 41666 + }, + { + "epoch": 0.83336, + "grad_norm": 0.3648218810558319, + "learning_rate": 1.6446104158842334e-06, + "loss": 0.0036, + "step": 41668 + }, + { + "epoch": 0.8334, + "grad_norm": 0.22744634747505188, + "learning_rate": 1.643843346649866e-06, + "loss": 0.0026, + "step": 41670 + }, + { + "epoch": 0.83344, + "grad_norm": 0.0721699595451355, + "learning_rate": 1.6430764403230758e-06, + "loss": 0.001, + "step": 41672 + }, + { + "epoch": 0.83348, + "grad_norm": 0.05064447596669197, + "learning_rate": 1.6423096969188113e-06, + "loss": 0.0008, + "step": 41674 + }, + { + "epoch": 0.83352, + "grad_norm": 0.01889854110777378, + "learning_rate": 1.6415431164520223e-06, + "loss": 0.001, + "step": 41676 + }, + { + "epoch": 0.83356, + "grad_norm": 0.0348614864051342, + "learning_rate": 1.6407766989376528e-06, + "loss": 0.0004, + "step": 41678 + }, + { + "epoch": 0.8336, + "grad_norm": 1.3842675685882568, + "learning_rate": 1.6400104443906463e-06, + "loss": 0.0117, + "step": 41680 + }, + { + "epoch": 0.83364, + "grad_norm": 0.054047584533691406, + "learning_rate": 1.6392443528259417e-06, + "loss": 0.0006, + "step": 41682 + }, + { + "epoch": 0.83368, + "grad_norm": 0.04164738953113556, + "learning_rate": 1.6384784242584672e-06, + "loss": 0.0037, + "step": 41684 + }, + { + "epoch": 0.83372, + "grad_norm": 0.0013345779152587056, + "learning_rate": 1.6377126587031667e-06, + "loss": 0.0002, + "step": 41686 + }, + { + "epoch": 0.83376, + "grad_norm": 0.44261130690574646, + "learning_rate": 1.6369470561749601e-06, + "loss": 0.0056, + "step": 41688 + }, + { + "epoch": 0.8338, + "grad_norm": 0.05125127360224724, + "learning_rate": 1.6361816166887768e-06, + "loss": 0.0009, + "step": 41690 + }, + { + "epoch": 0.83384, + "grad_norm": 0.0033993434626609087, + "learning_rate": 1.6354163402595413e-06, + "loss": 0.0021, + "step": 41692 + }, + { + "epoch": 0.83388, + "grad_norm": 0.11434174329042435, + "learning_rate": 1.6346512269021654e-06, + "loss": 0.0031, + "step": 41694 + }, + { + "epoch": 0.83392, + "grad_norm": 0.010999031364917755, + "learning_rate": 1.6338862766315754e-06, + "loss": 0.001, + "step": 41696 + }, + { + "epoch": 0.83396, + "grad_norm": 0.061122920364141464, + "learning_rate": 1.633121489462678e-06, + "loss": 0.0008, + "step": 41698 + }, + { + "epoch": 0.834, + "grad_norm": 3.7208542823791504, + "learning_rate": 1.6323568654103838e-06, + "loss": 0.2742, + "step": 41700 + }, + { + "epoch": 0.83404, + "grad_norm": 0.031298328191041946, + "learning_rate": 1.6315924044896002e-06, + "loss": 0.0016, + "step": 41702 + }, + { + "epoch": 0.83408, + "grad_norm": 0.0015134196728467941, + "learning_rate": 1.6308281067152342e-06, + "loss": 0.0004, + "step": 41704 + }, + { + "epoch": 0.83412, + "grad_norm": 0.045497827231884, + "learning_rate": 1.6300639721021806e-06, + "loss": 0.0077, + "step": 41706 + }, + { + "epoch": 0.83416, + "grad_norm": 1.2054884433746338, + "learning_rate": 1.629300000665338e-06, + "loss": 0.0126, + "step": 41708 + }, + { + "epoch": 0.8342, + "grad_norm": 0.022089272737503052, + "learning_rate": 1.6285361924196031e-06, + "loss": 0.0026, + "step": 41710 + }, + { + "epoch": 0.83424, + "grad_norm": 0.011918171308934689, + "learning_rate": 1.6277725473798633e-06, + "loss": 0.0009, + "step": 41712 + }, + { + "epoch": 0.83428, + "grad_norm": 0.39403054118156433, + "learning_rate": 1.6270090655610115e-06, + "loss": 0.006, + "step": 41714 + }, + { + "epoch": 0.83432, + "grad_norm": 0.0032382740173488855, + "learning_rate": 1.6262457469779236e-06, + "loss": 0.0006, + "step": 41716 + }, + { + "epoch": 0.83436, + "grad_norm": 0.015813851729035378, + "learning_rate": 1.6254825916454908e-06, + "loss": 0.0008, + "step": 41718 + }, + { + "epoch": 0.8344, + "grad_norm": 7.25081729888916, + "learning_rate": 1.6247195995785836e-06, + "loss": 0.0873, + "step": 41720 + }, + { + "epoch": 0.83444, + "grad_norm": 1.0957704782485962, + "learning_rate": 1.6239567707920801e-06, + "loss": 0.0098, + "step": 41722 + }, + { + "epoch": 0.83448, + "grad_norm": 0.014300738461315632, + "learning_rate": 1.6231941053008516e-06, + "loss": 0.0009, + "step": 41724 + }, + { + "epoch": 0.83452, + "grad_norm": 0.03984789922833443, + "learning_rate": 1.6224316031197663e-06, + "loss": 0.0152, + "step": 41726 + }, + { + "epoch": 0.83456, + "grad_norm": 0.012566915713250637, + "learning_rate": 1.6216692642636934e-06, + "loss": 0.0213, + "step": 41728 + }, + { + "epoch": 0.8346, + "grad_norm": 0.5917866230010986, + "learning_rate": 1.6209070887474876e-06, + "loss": 0.0059, + "step": 41730 + }, + { + "epoch": 0.83464, + "grad_norm": 0.32251667976379395, + "learning_rate": 1.6201450765860127e-06, + "loss": 0.0021, + "step": 41732 + }, + { + "epoch": 0.83468, + "grad_norm": 0.031467635184526443, + "learning_rate": 1.6193832277941247e-06, + "loss": 0.0095, + "step": 41734 + }, + { + "epoch": 0.83472, + "grad_norm": 0.03270554170012474, + "learning_rate": 1.618621542386677e-06, + "loss": 0.0027, + "step": 41736 + }, + { + "epoch": 0.83476, + "grad_norm": 0.19632747769355774, + "learning_rate": 1.6178600203785122e-06, + "loss": 0.0012, + "step": 41738 + }, + { + "epoch": 0.8348, + "grad_norm": 0.0030576190911233425, + "learning_rate": 1.6170986617844864e-06, + "loss": 0.0002, + "step": 41740 + }, + { + "epoch": 0.83484, + "grad_norm": 0.02740154229104519, + "learning_rate": 1.6163374666194365e-06, + "loss": 0.0005, + "step": 41742 + }, + { + "epoch": 0.83488, + "grad_norm": 7.790560245513916, + "learning_rate": 1.6155764348982027e-06, + "loss": 0.08, + "step": 41744 + }, + { + "epoch": 0.83492, + "grad_norm": 0.01051044836640358, + "learning_rate": 1.614815566635627e-06, + "loss": 0.0006, + "step": 41746 + }, + { + "epoch": 0.83496, + "grad_norm": 0.02336907386779785, + "learning_rate": 1.6140548618465323e-06, + "loss": 0.0006, + "step": 41748 + }, + { + "epoch": 0.835, + "grad_norm": 0.22279620170593262, + "learning_rate": 1.6132943205457607e-06, + "loss": 0.0026, + "step": 41750 + }, + { + "epoch": 0.83504, + "grad_norm": 0.12786558270454407, + "learning_rate": 1.61253394274813e-06, + "loss": 0.0025, + "step": 41752 + }, + { + "epoch": 0.83508, + "grad_norm": 0.05411210656166077, + "learning_rate": 1.6117737284684731e-06, + "loss": 0.0016, + "step": 41754 + }, + { + "epoch": 0.83512, + "grad_norm": 0.18721190094947815, + "learning_rate": 1.6110136777216023e-06, + "loss": 0.002, + "step": 41756 + }, + { + "epoch": 0.83516, + "grad_norm": 0.9269645810127258, + "learning_rate": 1.6102537905223391e-06, + "loss": 0.0083, + "step": 41758 + }, + { + "epoch": 0.8352, + "grad_norm": 7.848387718200684, + "learning_rate": 1.6094940668855008e-06, + "loss": 0.1266, + "step": 41760 + }, + { + "epoch": 0.83524, + "grad_norm": 0.07190337777137756, + "learning_rate": 1.608734506825893e-06, + "loss": 0.0032, + "step": 41762 + }, + { + "epoch": 0.83528, + "grad_norm": 0.010028907097876072, + "learning_rate": 1.6079751103583252e-06, + "loss": 0.0006, + "step": 41764 + }, + { + "epoch": 0.83532, + "grad_norm": 0.17640142142772675, + "learning_rate": 1.6072158774976032e-06, + "loss": 0.0038, + "step": 41766 + }, + { + "epoch": 0.83536, + "grad_norm": 0.008002844639122486, + "learning_rate": 1.6064568082585297e-06, + "loss": 0.0007, + "step": 41768 + }, + { + "epoch": 0.8354, + "grad_norm": 0.15482714772224426, + "learning_rate": 1.6056979026559005e-06, + "loss": 0.0066, + "step": 41770 + }, + { + "epoch": 0.83544, + "grad_norm": 0.10940662771463394, + "learning_rate": 1.604939160704516e-06, + "loss": 0.001, + "step": 41772 + }, + { + "epoch": 0.83548, + "grad_norm": 0.08967845886945724, + "learning_rate": 1.6041805824191614e-06, + "loss": 0.001, + "step": 41774 + }, + { + "epoch": 0.83552, + "grad_norm": 22.202791213989258, + "learning_rate": 1.60342216781463e-06, + "loss": 0.4108, + "step": 41776 + }, + { + "epoch": 0.83556, + "grad_norm": 0.06542295962572098, + "learning_rate": 1.6026639169057079e-06, + "loss": 0.001, + "step": 41778 + }, + { + "epoch": 0.8356, + "grad_norm": 0.060229938477277756, + "learning_rate": 1.601905829707171e-06, + "loss": 0.0006, + "step": 41780 + }, + { + "epoch": 0.83564, + "grad_norm": 0.8903540968894958, + "learning_rate": 1.6011479062338088e-06, + "loss": 0.0082, + "step": 41782 + }, + { + "epoch": 0.83568, + "grad_norm": 0.052776336669921875, + "learning_rate": 1.6003901465003868e-06, + "loss": 0.0011, + "step": 41784 + }, + { + "epoch": 0.83572, + "grad_norm": 0.1679103821516037, + "learning_rate": 1.599632550521688e-06, + "loss": 0.002, + "step": 41786 + }, + { + "epoch": 0.83576, + "grad_norm": 0.04761099815368652, + "learning_rate": 1.5988751183124751e-06, + "loss": 0.0076, + "step": 41788 + }, + { + "epoch": 0.8358, + "grad_norm": 1.6097508668899536, + "learning_rate": 1.5981178498875182e-06, + "loss": 0.0117, + "step": 41790 + }, + { + "epoch": 0.83584, + "grad_norm": 0.022763101384043694, + "learning_rate": 1.5973607452615802e-06, + "loss": 0.0009, + "step": 41792 + }, + { + "epoch": 0.83588, + "grad_norm": 0.029255440458655357, + "learning_rate": 1.5966038044494193e-06, + "loss": 0.0003, + "step": 41794 + }, + { + "epoch": 0.83592, + "grad_norm": 0.027386296540498734, + "learning_rate": 1.5958470274657922e-06, + "loss": 0.0004, + "step": 41796 + }, + { + "epoch": 0.83596, + "grad_norm": 0.04001842066645622, + "learning_rate": 1.5950904143254554e-06, + "loss": 0.0015, + "step": 41798 + }, + { + "epoch": 0.836, + "grad_norm": 0.35161104798316956, + "learning_rate": 1.5943339650431578e-06, + "loss": 0.0058, + "step": 41800 + }, + { + "epoch": 0.83604, + "grad_norm": 0.0550137460231781, + "learning_rate": 1.5935776796336466e-06, + "loss": 0.0026, + "step": 41802 + }, + { + "epoch": 0.83608, + "grad_norm": 0.036057621240615845, + "learning_rate": 1.592821558111669e-06, + "loss": 0.0004, + "step": 41804 + }, + { + "epoch": 0.83612, + "grad_norm": 0.030726393684744835, + "learning_rate": 1.5920656004919588e-06, + "loss": 0.0008, + "step": 41806 + }, + { + "epoch": 0.83616, + "grad_norm": 0.008541367016732693, + "learning_rate": 1.5913098067892629e-06, + "loss": 0.0011, + "step": 41808 + }, + { + "epoch": 0.8362, + "grad_norm": 0.006021412555128336, + "learning_rate": 1.5905541770183096e-06, + "loss": 0.0017, + "step": 41810 + }, + { + "epoch": 0.83624, + "grad_norm": 0.07523338496685028, + "learning_rate": 1.5897987111938317e-06, + "loss": 0.003, + "step": 41812 + }, + { + "epoch": 0.83628, + "grad_norm": 0.08588501811027527, + "learning_rate": 1.5890434093305607e-06, + "loss": 0.0276, + "step": 41814 + }, + { + "epoch": 0.83632, + "grad_norm": 0.657433271408081, + "learning_rate": 1.5882882714432136e-06, + "loss": 0.0051, + "step": 41816 + }, + { + "epoch": 0.83636, + "grad_norm": 0.1160077378153801, + "learning_rate": 1.5875332975465218e-06, + "loss": 0.0016, + "step": 41818 + }, + { + "epoch": 0.8364, + "grad_norm": 18.31694984436035, + "learning_rate": 1.5867784876551973e-06, + "loss": 0.3918, + "step": 41820 + }, + { + "epoch": 0.83644, + "grad_norm": 0.0045176912099123, + "learning_rate": 1.5860238417839569e-06, + "loss": 0.0004, + "step": 41822 + }, + { + "epoch": 0.83648, + "grad_norm": 0.016747483983635902, + "learning_rate": 1.5852693599475144e-06, + "loss": 0.001, + "step": 41824 + }, + { + "epoch": 0.83652, + "grad_norm": 0.06436780095100403, + "learning_rate": 1.5845150421605792e-06, + "loss": 0.0012, + "step": 41826 + }, + { + "epoch": 0.83656, + "grad_norm": 0.014823219738900661, + "learning_rate": 1.5837608884378542e-06, + "loss": 0.0002, + "step": 41828 + }, + { + "epoch": 0.8366, + "grad_norm": 0.2533433437347412, + "learning_rate": 1.583006898794044e-06, + "loss": 0.0021, + "step": 41830 + }, + { + "epoch": 0.83664, + "grad_norm": 0.06777861714363098, + "learning_rate": 1.5822530732438468e-06, + "loss": 0.0006, + "step": 41832 + }, + { + "epoch": 0.83668, + "grad_norm": 0.03254950791597366, + "learning_rate": 1.5814994118019612e-06, + "loss": 0.001, + "step": 41834 + }, + { + "epoch": 0.83672, + "grad_norm": 0.009709106758236885, + "learning_rate": 1.5807459144830795e-06, + "loss": 0.001, + "step": 41836 + }, + { + "epoch": 0.83676, + "grad_norm": 0.00058556447038427, + "learning_rate": 1.5799925813018868e-06, + "loss": 0.0015, + "step": 41838 + }, + { + "epoch": 0.8368, + "grad_norm": 0.047370508313179016, + "learning_rate": 1.579239412273078e-06, + "loss": 0.0009, + "step": 41840 + }, + { + "epoch": 0.83684, + "grad_norm": 0.41975101828575134, + "learning_rate": 1.5784864074113304e-06, + "loss": 0.0055, + "step": 41842 + }, + { + "epoch": 0.83688, + "grad_norm": 0.07007408887147903, + "learning_rate": 1.5777335667313254e-06, + "loss": 0.0126, + "step": 41844 + }, + { + "epoch": 0.83692, + "grad_norm": 0.035480111837387085, + "learning_rate": 1.5769808902477423e-06, + "loss": 0.0005, + "step": 41846 + }, + { + "epoch": 0.83696, + "grad_norm": 0.9375584125518799, + "learning_rate": 1.5762283779752496e-06, + "loss": 0.0122, + "step": 41848 + }, + { + "epoch": 0.837, + "grad_norm": 0.43368396162986755, + "learning_rate": 1.5754760299285255e-06, + "loss": 0.0033, + "step": 41850 + }, + { + "epoch": 0.83704, + "grad_norm": 0.0012161793420091271, + "learning_rate": 1.5747238461222313e-06, + "loss": 0.0003, + "step": 41852 + }, + { + "epoch": 0.83708, + "grad_norm": 0.036745671182870865, + "learning_rate": 1.5739718265710336e-06, + "loss": 0.0004, + "step": 41854 + }, + { + "epoch": 0.83712, + "grad_norm": 3.7016453742980957, + "learning_rate": 1.5732199712895935e-06, + "loss": 0.0376, + "step": 41856 + }, + { + "epoch": 0.83716, + "grad_norm": 0.3200599253177643, + "learning_rate": 1.5724682802925685e-06, + "loss": 0.0031, + "step": 41858 + }, + { + "epoch": 0.8372, + "grad_norm": 0.048359621316194534, + "learning_rate": 1.5717167535946142e-06, + "loss": 0.0006, + "step": 41860 + }, + { + "epoch": 0.83724, + "grad_norm": 0.05683423951268196, + "learning_rate": 1.5709653912103795e-06, + "loss": 0.0008, + "step": 41862 + }, + { + "epoch": 0.83728, + "grad_norm": 0.20778173208236694, + "learning_rate": 1.5702141931545144e-06, + "loss": 0.0024, + "step": 41864 + }, + { + "epoch": 0.83732, + "grad_norm": 0.1904417723417282, + "learning_rate": 1.5694631594416631e-06, + "loss": 0.002, + "step": 41866 + }, + { + "epoch": 0.83736, + "grad_norm": 0.027165912091732025, + "learning_rate": 1.5687122900864704e-06, + "loss": 0.0002, + "step": 41868 + }, + { + "epoch": 0.8374, + "grad_norm": 0.02054673805832863, + "learning_rate": 1.5679615851035669e-06, + "loss": 0.001, + "step": 41870 + }, + { + "epoch": 0.83744, + "grad_norm": 0.45893722772598267, + "learning_rate": 1.5672110445075994e-06, + "loss": 0.0035, + "step": 41872 + }, + { + "epoch": 0.83748, + "grad_norm": 0.0014801385113969445, + "learning_rate": 1.5664606683131877e-06, + "loss": 0.0001, + "step": 41874 + }, + { + "epoch": 0.83752, + "grad_norm": 0.06211363151669502, + "learning_rate": 1.5657104565349734e-06, + "loss": 0.001, + "step": 41876 + }, + { + "epoch": 0.83756, + "grad_norm": 0.008274764753878117, + "learning_rate": 1.5649604091875726e-06, + "loss": 0.0011, + "step": 41878 + }, + { + "epoch": 0.8376, + "grad_norm": 2.165008068084717, + "learning_rate": 1.5642105262856122e-06, + "loss": 0.0762, + "step": 41880 + }, + { + "epoch": 0.83764, + "grad_norm": 0.340125173330307, + "learning_rate": 1.563460807843712e-06, + "loss": 0.0034, + "step": 41882 + }, + { + "epoch": 0.83768, + "grad_norm": 0.0015105244237929583, + "learning_rate": 1.5627112538764855e-06, + "loss": 0.0001, + "step": 41884 + }, + { + "epoch": 0.83772, + "grad_norm": 0.04748263582587242, + "learning_rate": 1.5619618643985457e-06, + "loss": 0.0004, + "step": 41886 + }, + { + "epoch": 0.83776, + "grad_norm": 0.21589644253253937, + "learning_rate": 1.561212639424504e-06, + "loss": 0.0037, + "step": 41888 + }, + { + "epoch": 0.8378, + "grad_norm": 0.026859160512685776, + "learning_rate": 1.560463578968967e-06, + "loss": 0.0004, + "step": 41890 + }, + { + "epoch": 0.83784, + "grad_norm": 0.012236645445227623, + "learning_rate": 1.5597146830465371e-06, + "loss": 0.0003, + "step": 41892 + }, + { + "epoch": 0.83788, + "grad_norm": 0.4922836422920227, + "learning_rate": 1.558965951671817e-06, + "loss": 0.0068, + "step": 41894 + }, + { + "epoch": 0.83792, + "grad_norm": 0.034466009587049484, + "learning_rate": 1.5582173848593995e-06, + "loss": 0.0006, + "step": 41896 + }, + { + "epoch": 0.83796, + "grad_norm": 0.034912168979644775, + "learning_rate": 1.5574689826238798e-06, + "loss": 0.0015, + "step": 41898 + }, + { + "epoch": 0.838, + "grad_norm": 0.012513859197497368, + "learning_rate": 1.5567207449798517e-06, + "loss": 0.004, + "step": 41900 + }, + { + "epoch": 0.83804, + "grad_norm": 0.34634554386138916, + "learning_rate": 1.5559726719418944e-06, + "loss": 0.0042, + "step": 41902 + }, + { + "epoch": 0.83808, + "grad_norm": 0.0838359072804451, + "learning_rate": 1.5552247635246021e-06, + "loss": 0.001, + "step": 41904 + }, + { + "epoch": 0.83812, + "grad_norm": 0.0005560569697991014, + "learning_rate": 1.5544770197425462e-06, + "loss": 0.0012, + "step": 41906 + }, + { + "epoch": 0.83816, + "grad_norm": 0.01020293589681387, + "learning_rate": 1.5537294406103132e-06, + "loss": 0.0001, + "step": 41908 + }, + { + "epoch": 0.8382, + "grad_norm": 0.062036965042352676, + "learning_rate": 1.55298202614247e-06, + "loss": 0.0015, + "step": 41910 + }, + { + "epoch": 0.83824, + "grad_norm": 0.07662570476531982, + "learning_rate": 1.5522347763535917e-06, + "loss": 0.0007, + "step": 41912 + }, + { + "epoch": 0.83828, + "grad_norm": 0.4795999825000763, + "learning_rate": 1.551487691258249e-06, + "loss": 0.0055, + "step": 41914 + }, + { + "epoch": 0.83832, + "grad_norm": 0.02043536864221096, + "learning_rate": 1.5507407708709999e-06, + "loss": 0.0066, + "step": 41916 + }, + { + "epoch": 0.83836, + "grad_norm": 0.029689205810427666, + "learning_rate": 1.5499940152064096e-06, + "loss": 0.0014, + "step": 41918 + }, + { + "epoch": 0.8384, + "grad_norm": 0.24606473743915558, + "learning_rate": 1.5492474242790368e-06, + "loss": 0.6923, + "step": 41920 + }, + { + "epoch": 0.83844, + "grad_norm": 0.08366849273443222, + "learning_rate": 1.548500998103435e-06, + "loss": 0.003, + "step": 41922 + }, + { + "epoch": 0.83848, + "grad_norm": 0.03518341854214668, + "learning_rate": 1.547754736694158e-06, + "loss": 0.0015, + "step": 41924 + }, + { + "epoch": 0.83852, + "grad_norm": 0.132132425904274, + "learning_rate": 1.5470086400657568e-06, + "loss": 0.002, + "step": 41926 + }, + { + "epoch": 0.83856, + "grad_norm": 0.04137164354324341, + "learning_rate": 1.546262708232772e-06, + "loss": 0.4888, + "step": 41928 + }, + { + "epoch": 0.8386, + "grad_norm": 0.001504620537161827, + "learning_rate": 1.545516941209747e-06, + "loss": 0.0001, + "step": 41930 + }, + { + "epoch": 0.83864, + "grad_norm": 0.01294475607573986, + "learning_rate": 1.5447713390112218e-06, + "loss": 0.0011, + "step": 41932 + }, + { + "epoch": 0.83868, + "grad_norm": 0.09732021391391754, + "learning_rate": 1.5440259016517334e-06, + "loss": 0.0012, + "step": 41934 + }, + { + "epoch": 0.83872, + "grad_norm": 0.08567433059215546, + "learning_rate": 1.5432806291458146e-06, + "loss": 0.0023, + "step": 41936 + }, + { + "epoch": 0.83876, + "grad_norm": 0.0912812203168869, + "learning_rate": 1.5425355215079896e-06, + "loss": 0.0012, + "step": 41938 + }, + { + "epoch": 0.8388, + "grad_norm": 0.00572650833055377, + "learning_rate": 1.5417905787527943e-06, + "loss": 0.0003, + "step": 41940 + }, + { + "epoch": 0.83884, + "grad_norm": 0.0877695307135582, + "learning_rate": 1.5410458008947426e-06, + "loss": 0.0013, + "step": 41942 + }, + { + "epoch": 0.83888, + "grad_norm": 0.003944962285459042, + "learning_rate": 1.5403011879483587e-06, + "loss": 0.0134, + "step": 41944 + }, + { + "epoch": 0.83892, + "grad_norm": 6.038171768188477, + "learning_rate": 1.5395567399281585e-06, + "loss": 0.0239, + "step": 41946 + }, + { + "epoch": 0.83896, + "grad_norm": 0.16935184597969055, + "learning_rate": 1.5388124568486574e-06, + "loss": 0.0104, + "step": 41948 + }, + { + "epoch": 0.839, + "grad_norm": 0.15804219245910645, + "learning_rate": 1.538068338724361e-06, + "loss": 0.0018, + "step": 41950 + }, + { + "epoch": 0.83904, + "grad_norm": 0.1742805391550064, + "learning_rate": 1.5373243855697795e-06, + "loss": 0.0013, + "step": 41952 + }, + { + "epoch": 0.83908, + "grad_norm": 0.004044805653393269, + "learning_rate": 1.5365805973994153e-06, + "loss": 0.0072, + "step": 41954 + }, + { + "epoch": 0.83912, + "grad_norm": 0.0014637840213254094, + "learning_rate": 1.5358369742277702e-06, + "loss": 0.0002, + "step": 41956 + }, + { + "epoch": 0.83916, + "grad_norm": 0.10732409358024597, + "learning_rate": 1.5350935160693437e-06, + "loss": 0.0013, + "step": 41958 + }, + { + "epoch": 0.8392, + "grad_norm": 0.02348492108285427, + "learning_rate": 1.5343502229386209e-06, + "loss": 0.002, + "step": 41960 + }, + { + "epoch": 0.83924, + "grad_norm": 0.054674867540597916, + "learning_rate": 1.5336070948501046e-06, + "loss": 0.0004, + "step": 41962 + }, + { + "epoch": 0.83928, + "grad_norm": 0.05790136009454727, + "learning_rate": 1.5328641318182747e-06, + "loss": 0.0017, + "step": 41964 + }, + { + "epoch": 0.83932, + "grad_norm": 2.022096633911133, + "learning_rate": 1.532121333857618e-06, + "loss": 0.0228, + "step": 41966 + }, + { + "epoch": 0.83936, + "grad_norm": 0.16574998199939728, + "learning_rate": 1.5313787009826163e-06, + "loss": 0.0021, + "step": 41968 + }, + { + "epoch": 0.8394, + "grad_norm": 0.3541686534881592, + "learning_rate": 1.530636233207743e-06, + "loss": 0.0029, + "step": 41970 + }, + { + "epoch": 0.83944, + "grad_norm": 0.09995470941066742, + "learning_rate": 1.5298939305474814e-06, + "loss": 0.0023, + "step": 41972 + }, + { + "epoch": 0.83948, + "grad_norm": 0.0008390345610678196, + "learning_rate": 1.5291517930162957e-06, + "loss": 0.0004, + "step": 41974 + }, + { + "epoch": 0.83952, + "grad_norm": 0.07451850175857544, + "learning_rate": 1.528409820628658e-06, + "loss": 0.002, + "step": 41976 + }, + { + "epoch": 0.83956, + "grad_norm": 0.020574573427438736, + "learning_rate": 1.5276680133990308e-06, + "loss": 0.0008, + "step": 41978 + }, + { + "epoch": 0.8396, + "grad_norm": 0.06616182625293732, + "learning_rate": 1.526926371341878e-06, + "loss": 0.0013, + "step": 41980 + }, + { + "epoch": 0.83964, + "grad_norm": 0.09232286363840103, + "learning_rate": 1.5261848944716607e-06, + "loss": 0.0013, + "step": 41982 + }, + { + "epoch": 0.83968, + "grad_norm": 0.027501139789819717, + "learning_rate": 1.525443582802828e-06, + "loss": 0.0008, + "step": 41984 + }, + { + "epoch": 0.83972, + "grad_norm": 0.13736987113952637, + "learning_rate": 1.5247024363498364e-06, + "loss": 0.0042, + "step": 41986 + }, + { + "epoch": 0.83976, + "grad_norm": 0.0974268913269043, + "learning_rate": 1.5239614551271343e-06, + "loss": 0.0011, + "step": 41988 + }, + { + "epoch": 0.8398, + "grad_norm": 2.2349936962127686, + "learning_rate": 1.52322063914917e-06, + "loss": 0.0197, + "step": 41990 + }, + { + "epoch": 0.83984, + "grad_norm": 0.01904044859111309, + "learning_rate": 1.5224799884303788e-06, + "loss": 0.0018, + "step": 41992 + }, + { + "epoch": 0.83988, + "grad_norm": 0.003395787440240383, + "learning_rate": 1.521739502985209e-06, + "loss": 0.0001, + "step": 41994 + }, + { + "epoch": 0.83992, + "grad_norm": 0.050641853362321854, + "learning_rate": 1.52099918282809e-06, + "loss": 0.0005, + "step": 41996 + }, + { + "epoch": 0.83996, + "grad_norm": 0.13099783658981323, + "learning_rate": 1.5202590279734575e-06, + "loss": 0.0021, + "step": 41998 + }, + { + "epoch": 0.84, + "grad_norm": 0.25871357321739197, + "learning_rate": 1.5195190384357405e-06, + "loss": 0.0037, + "step": 42000 + }, + { + "epoch": 0.84004, + "grad_norm": 0.19547894597053528, + "learning_rate": 1.5187792142293657e-06, + "loss": 0.0023, + "step": 42002 + }, + { + "epoch": 0.84008, + "grad_norm": 0.033861592411994934, + "learning_rate": 1.51803955536876e-06, + "loss": 0.0006, + "step": 42004 + }, + { + "epoch": 0.84012, + "grad_norm": 0.054250556975603104, + "learning_rate": 1.5173000618683364e-06, + "loss": 0.0012, + "step": 42006 + }, + { + "epoch": 0.84016, + "grad_norm": 0.5058565735816956, + "learning_rate": 1.5165607337425158e-06, + "loss": 0.0047, + "step": 42008 + }, + { + "epoch": 0.8402, + "grad_norm": 0.045827463269233704, + "learning_rate": 1.5158215710057123e-06, + "loss": 0.0014, + "step": 42010 + }, + { + "epoch": 0.84024, + "grad_norm": 0.7137693166732788, + "learning_rate": 1.515082573672334e-06, + "loss": 0.0072, + "step": 42012 + }, + { + "epoch": 0.84028, + "grad_norm": 0.013173762708902359, + "learning_rate": 1.5143437417567896e-06, + "loss": 0.0006, + "step": 42014 + }, + { + "epoch": 0.84032, + "grad_norm": 0.2128884494304657, + "learning_rate": 1.5136050752734844e-06, + "loss": 0.0028, + "step": 42016 + }, + { + "epoch": 0.84036, + "grad_norm": 0.022348571568727493, + "learning_rate": 1.5128665742368154e-06, + "loss": 0.0023, + "step": 42018 + }, + { + "epoch": 0.8404, + "grad_norm": 0.5331457257270813, + "learning_rate": 1.5121282386611823e-06, + "loss": 0.0277, + "step": 42020 + }, + { + "epoch": 0.84044, + "grad_norm": 0.012420360930263996, + "learning_rate": 1.5113900685609817e-06, + "loss": 0.0019, + "step": 42022 + }, + { + "epoch": 0.84048, + "grad_norm": 0.20567841827869415, + "learning_rate": 1.5106520639505972e-06, + "loss": 0.0052, + "step": 42024 + }, + { + "epoch": 0.84052, + "grad_norm": 0.6610877513885498, + "learning_rate": 1.5099142248444254e-06, + "loss": 0.0052, + "step": 42026 + }, + { + "epoch": 0.84056, + "grad_norm": 0.06685037910938263, + "learning_rate": 1.5091765512568425e-06, + "loss": 0.0006, + "step": 42028 + }, + { + "epoch": 0.8406, + "grad_norm": 12.809395790100098, + "learning_rate": 1.5084390432022377e-06, + "loss": 0.2104, + "step": 42030 + }, + { + "epoch": 0.84064, + "grad_norm": 0.022027455270290375, + "learning_rate": 1.5077017006949846e-06, + "loss": 0.0005, + "step": 42032 + }, + { + "epoch": 0.84068, + "grad_norm": 0.04961603507399559, + "learning_rate": 1.5069645237494578e-06, + "loss": 0.0043, + "step": 42034 + }, + { + "epoch": 0.84072, + "grad_norm": 1.7516932487487793, + "learning_rate": 1.506227512380034e-06, + "loss": 0.0194, + "step": 42036 + }, + { + "epoch": 0.84076, + "grad_norm": 2.838703155517578, + "learning_rate": 1.5054906666010737e-06, + "loss": 0.019, + "step": 42038 + }, + { + "epoch": 0.8408, + "grad_norm": 0.2292882800102234, + "learning_rate": 1.5047539864269477e-06, + "loss": 0.1912, + "step": 42040 + }, + { + "epoch": 0.84084, + "grad_norm": 5.836162567138672, + "learning_rate": 1.5040174718720146e-06, + "loss": 0.047, + "step": 42042 + }, + { + "epoch": 0.84088, + "grad_norm": 0.00054070824990049, + "learning_rate": 1.503281122950636e-06, + "loss": 0.0006, + "step": 42044 + }, + { + "epoch": 0.84092, + "grad_norm": 0.061199311167001724, + "learning_rate": 1.5025449396771663e-06, + "loss": 0.0024, + "step": 42046 + }, + { + "epoch": 0.84096, + "grad_norm": 0.059672314673662186, + "learning_rate": 1.5018089220659605e-06, + "loss": 0.0016, + "step": 42048 + }, + { + "epoch": 0.841, + "grad_norm": 0.07796284556388855, + "learning_rate": 1.5010730701313626e-06, + "loss": 0.0013, + "step": 42050 + }, + { + "epoch": 0.84104, + "grad_norm": 0.02680014818906784, + "learning_rate": 1.50033738388772e-06, + "loss": 0.0003, + "step": 42052 + }, + { + "epoch": 0.84108, + "grad_norm": 0.48481371998786926, + "learning_rate": 1.4996018633493758e-06, + "loss": 0.0049, + "step": 42054 + }, + { + "epoch": 0.84112, + "grad_norm": 0.12803924083709717, + "learning_rate": 1.4988665085306698e-06, + "loss": 0.0077, + "step": 42056 + }, + { + "epoch": 0.84116, + "grad_norm": 0.017982034012675285, + "learning_rate": 1.4981313194459414e-06, + "loss": 0.0002, + "step": 42058 + }, + { + "epoch": 0.8412, + "grad_norm": 0.08649703860282898, + "learning_rate": 1.4973962961095135e-06, + "loss": 0.001, + "step": 42060 + }, + { + "epoch": 0.84124, + "grad_norm": 0.498043417930603, + "learning_rate": 1.496661438535728e-06, + "loss": 0.0038, + "step": 42062 + }, + { + "epoch": 0.84128, + "grad_norm": 0.16987791657447815, + "learning_rate": 1.4959267467389039e-06, + "loss": 0.0051, + "step": 42064 + }, + { + "epoch": 0.84132, + "grad_norm": 0.9481579661369324, + "learning_rate": 1.4951922207333648e-06, + "loss": 0.0084, + "step": 42066 + }, + { + "epoch": 0.84136, + "grad_norm": 0.021988701075315475, + "learning_rate": 1.4944578605334326e-06, + "loss": 0.0012, + "step": 42068 + }, + { + "epoch": 0.8414, + "grad_norm": 0.10779404640197754, + "learning_rate": 1.4937236661534227e-06, + "loss": 0.0013, + "step": 42070 + }, + { + "epoch": 0.84144, + "grad_norm": 0.0057437848299741745, + "learning_rate": 1.4929896376076525e-06, + "loss": 0.0001, + "step": 42072 + }, + { + "epoch": 0.84148, + "grad_norm": 0.05561397224664688, + "learning_rate": 1.4922557749104261e-06, + "loss": 0.001, + "step": 42074 + }, + { + "epoch": 0.84152, + "grad_norm": 0.059636376798152924, + "learning_rate": 1.491522078076053e-06, + "loss": 0.0008, + "step": 42076 + }, + { + "epoch": 0.84156, + "grad_norm": 0.06331272423267365, + "learning_rate": 1.4907885471188377e-06, + "loss": 0.0007, + "step": 42078 + }, + { + "epoch": 0.8416, + "grad_norm": 0.01029039453715086, + "learning_rate": 1.490055182053083e-06, + "loss": 0.0011, + "step": 42080 + }, + { + "epoch": 0.84164, + "grad_norm": 0.0036244832444936037, + "learning_rate": 1.4893219828930793e-06, + "loss": 0.0006, + "step": 42082 + }, + { + "epoch": 0.84168, + "grad_norm": 0.041460759937763214, + "learning_rate": 1.4885889496531302e-06, + "loss": 0.0005, + "step": 42084 + }, + { + "epoch": 0.84172, + "grad_norm": 0.03412436321377754, + "learning_rate": 1.4878560823475185e-06, + "loss": 0.0015, + "step": 42086 + }, + { + "epoch": 0.84176, + "grad_norm": 11.237584114074707, + "learning_rate": 1.487123380990535e-06, + "loss": 0.1265, + "step": 42088 + }, + { + "epoch": 0.8418, + "grad_norm": 0.018900714814662933, + "learning_rate": 1.486390845596466e-06, + "loss": 0.9176, + "step": 42090 + }, + { + "epoch": 0.84184, + "grad_norm": 0.01206993032246828, + "learning_rate": 1.4856584761795866e-06, + "loss": 0.0009, + "step": 42092 + }, + { + "epoch": 0.84188, + "grad_norm": 0.22162629663944244, + "learning_rate": 1.4849262727541824e-06, + "loss": 0.0029, + "step": 42094 + }, + { + "epoch": 0.84192, + "grad_norm": 0.617184579372406, + "learning_rate": 1.4841942353345228e-06, + "loss": 0.0068, + "step": 42096 + }, + { + "epoch": 0.84196, + "grad_norm": 0.010759739205241203, + "learning_rate": 1.48346236393488e-06, + "loss": 0.0002, + "step": 42098 + }, + { + "epoch": 0.842, + "grad_norm": 0.005955592263489962, + "learning_rate": 1.4827306585695234e-06, + "loss": 0.0051, + "step": 42100 + }, + { + "epoch": 0.84204, + "grad_norm": 0.13864654302597046, + "learning_rate": 1.4819991192527183e-06, + "loss": 0.0013, + "step": 42102 + }, + { + "epoch": 0.84208, + "grad_norm": 0.11227704584598541, + "learning_rate": 1.481267745998728e-06, + "loss": 0.0015, + "step": 42104 + }, + { + "epoch": 0.84212, + "grad_norm": 0.011737396009266376, + "learning_rate": 1.4805365388218052e-06, + "loss": 0.0002, + "step": 42106 + }, + { + "epoch": 0.84216, + "grad_norm": 0.84186851978302, + "learning_rate": 1.4798054977362098e-06, + "loss": 0.0051, + "step": 42108 + }, + { + "epoch": 0.8422, + "grad_norm": 0.0904688686132431, + "learning_rate": 1.4790746227561925e-06, + "loss": 0.0016, + "step": 42110 + }, + { + "epoch": 0.84224, + "grad_norm": 0.007885754108428955, + "learning_rate": 1.4783439138960042e-06, + "loss": 0.0006, + "step": 42112 + }, + { + "epoch": 0.84228, + "grad_norm": 0.08274056017398834, + "learning_rate": 1.4776133711698836e-06, + "loss": 0.0195, + "step": 42114 + }, + { + "epoch": 0.84232, + "grad_norm": 0.018040455877780914, + "learning_rate": 1.4768829945920838e-06, + "loss": 0.0007, + "step": 42116 + }, + { + "epoch": 0.84236, + "grad_norm": 0.04709245637059212, + "learning_rate": 1.4761527841768342e-06, + "loss": 0.0005, + "step": 42118 + }, + { + "epoch": 0.8424, + "grad_norm": 0.02928125485777855, + "learning_rate": 1.4754227399383758e-06, + "loss": 0.0022, + "step": 42120 + }, + { + "epoch": 0.84244, + "grad_norm": 0.15711072087287903, + "learning_rate": 1.4746928618909396e-06, + "loss": 0.0313, + "step": 42122 + }, + { + "epoch": 0.84248, + "grad_norm": 0.6816355586051941, + "learning_rate": 1.4739631500487539e-06, + "loss": 0.0059, + "step": 42124 + }, + { + "epoch": 0.84252, + "grad_norm": 0.03171147406101227, + "learning_rate": 1.4732336044260498e-06, + "loss": 0.0005, + "step": 42126 + }, + { + "epoch": 0.84256, + "grad_norm": 0.061756979674100876, + "learning_rate": 1.4725042250370436e-06, + "loss": 0.0035, + "step": 42128 + }, + { + "epoch": 0.8426, + "grad_norm": 0.04747011512517929, + "learning_rate": 1.4717750118959583e-06, + "loss": 0.0019, + "step": 42130 + }, + { + "epoch": 0.84264, + "grad_norm": 0.00438065966591239, + "learning_rate": 1.4710459650170094e-06, + "loss": 0.0072, + "step": 42132 + }, + { + "epoch": 0.84268, + "grad_norm": 1.2292536497116089, + "learning_rate": 1.4703170844144099e-06, + "loss": 0.015, + "step": 42134 + }, + { + "epoch": 0.84272, + "grad_norm": 0.04426782578229904, + "learning_rate": 1.4695883701023705e-06, + "loss": 0.0005, + "step": 42136 + }, + { + "epoch": 0.84276, + "grad_norm": 0.08031922578811646, + "learning_rate": 1.4688598220951012e-06, + "loss": 0.0028, + "step": 42138 + }, + { + "epoch": 0.8428, + "grad_norm": 0.04117940738797188, + "learning_rate": 1.468131440406798e-06, + "loss": 0.0004, + "step": 42140 + }, + { + "epoch": 0.84284, + "grad_norm": 0.0021258920896798372, + "learning_rate": 1.4674032250516646e-06, + "loss": 0.005, + "step": 42142 + }, + { + "epoch": 0.84288, + "grad_norm": 0.12068499624729156, + "learning_rate": 1.4666751760439013e-06, + "loss": 0.0055, + "step": 42144 + }, + { + "epoch": 0.84292, + "grad_norm": 0.6974189877510071, + "learning_rate": 1.4659472933976938e-06, + "loss": 0.0079, + "step": 42146 + }, + { + "epoch": 0.84296, + "grad_norm": 0.019404232501983643, + "learning_rate": 1.4652195771272427e-06, + "loss": 0.0011, + "step": 42148 + }, + { + "epoch": 0.843, + "grad_norm": 0.07391136884689331, + "learning_rate": 1.4644920272467245e-06, + "loss": 0.0155, + "step": 42150 + }, + { + "epoch": 0.84304, + "grad_norm": 0.020905403420329094, + "learning_rate": 1.4637646437703335e-06, + "loss": 0.002, + "step": 42152 + }, + { + "epoch": 0.84308, + "grad_norm": 0.17072920501232147, + "learning_rate": 1.4630374267122438e-06, + "loss": 0.0026, + "step": 42154 + }, + { + "epoch": 0.84312, + "grad_norm": 0.040923044085502625, + "learning_rate": 1.4623103760866342e-06, + "loss": 0.0011, + "step": 42156 + }, + { + "epoch": 0.84316, + "grad_norm": 0.06323245167732239, + "learning_rate": 1.461583491907681e-06, + "loss": 0.0031, + "step": 42158 + }, + { + "epoch": 0.8432, + "grad_norm": 3.6554248332977295, + "learning_rate": 1.4608567741895496e-06, + "loss": 0.0274, + "step": 42160 + }, + { + "epoch": 0.84324, + "grad_norm": 0.3016599714756012, + "learning_rate": 1.4601302229464155e-06, + "loss": 0.0054, + "step": 42162 + }, + { + "epoch": 0.84328, + "grad_norm": 0.029589297249913216, + "learning_rate": 1.459403838192438e-06, + "loss": 0.0005, + "step": 42164 + }, + { + "epoch": 0.84332, + "grad_norm": 0.026213286444544792, + "learning_rate": 1.4586776199417784e-06, + "loss": 0.001, + "step": 42166 + }, + { + "epoch": 0.84336, + "grad_norm": 0.0040268851444125175, + "learning_rate": 1.4579515682085977e-06, + "loss": 0.0009, + "step": 42168 + }, + { + "epoch": 0.8434, + "grad_norm": 0.0009396144887432456, + "learning_rate": 1.4572256830070497e-06, + "loss": 0.0064, + "step": 42170 + }, + { + "epoch": 0.84344, + "grad_norm": 0.012923960573971272, + "learning_rate": 1.4564999643512834e-06, + "loss": 0.0002, + "step": 42172 + }, + { + "epoch": 0.84348, + "grad_norm": 0.06305178254842758, + "learning_rate": 1.4557744122554495e-06, + "loss": 0.0011, + "step": 42174 + }, + { + "epoch": 0.84352, + "grad_norm": 0.0249912329018116, + "learning_rate": 1.4550490267336915e-06, + "loss": 0.0019, + "step": 42176 + }, + { + "epoch": 0.84356, + "grad_norm": 0.3560550808906555, + "learning_rate": 1.454323807800152e-06, + "loss": 0.0044, + "step": 42178 + }, + { + "epoch": 0.8436, + "grad_norm": 0.15015578269958496, + "learning_rate": 1.4535987554689712e-06, + "loss": 0.0017, + "step": 42180 + }, + { + "epoch": 0.84364, + "grad_norm": 0.04306351765990257, + "learning_rate": 1.4528738697542789e-06, + "loss": 0.0009, + "step": 42182 + }, + { + "epoch": 0.84368, + "grad_norm": 0.02334895357489586, + "learning_rate": 1.4521491506702157e-06, + "loss": 0.0006, + "step": 42184 + }, + { + "epoch": 0.84372, + "grad_norm": 0.011030231602489948, + "learning_rate": 1.4514245982309027e-06, + "loss": 0.0051, + "step": 42186 + }, + { + "epoch": 0.84376, + "grad_norm": 0.21126849949359894, + "learning_rate": 1.4507002124504676e-06, + "loss": 0.0018, + "step": 42188 + }, + { + "epoch": 0.8438, + "grad_norm": 0.011179660446941853, + "learning_rate": 1.4499759933430347e-06, + "loss": 0.0001, + "step": 42190 + }, + { + "epoch": 0.84384, + "grad_norm": 0.044593628495931625, + "learning_rate": 1.4492519409227213e-06, + "loss": 0.0009, + "step": 42192 + }, + { + "epoch": 0.84388, + "grad_norm": 0.455283522605896, + "learning_rate": 1.4485280552036451e-06, + "loss": 0.0036, + "step": 42194 + }, + { + "epoch": 0.84392, + "grad_norm": 0.035672128200531006, + "learning_rate": 1.447804336199915e-06, + "loss": 0.0012, + "step": 42196 + }, + { + "epoch": 0.84396, + "grad_norm": 0.09565547108650208, + "learning_rate": 1.4470807839256428e-06, + "loss": 0.0023, + "step": 42198 + }, + { + "epoch": 0.844, + "grad_norm": 0.018491128459572792, + "learning_rate": 1.446357398394934e-06, + "loss": 0.0019, + "step": 42200 + }, + { + "epoch": 0.84404, + "grad_norm": 6.982218265533447, + "learning_rate": 1.445634179621893e-06, + "loss": 0.0962, + "step": 42202 + }, + { + "epoch": 0.84408, + "grad_norm": 0.1779394894838333, + "learning_rate": 1.444911127620614e-06, + "loss": 0.0017, + "step": 42204 + }, + { + "epoch": 0.84412, + "grad_norm": 0.02536393515765667, + "learning_rate": 1.444188242405201e-06, + "loss": 0.0059, + "step": 42206 + }, + { + "epoch": 0.84416, + "grad_norm": 0.007283321116119623, + "learning_rate": 1.443465523989741e-06, + "loss": 0.0013, + "step": 42208 + }, + { + "epoch": 0.8442, + "grad_norm": 0.13546182215213776, + "learning_rate": 1.4427429723883256e-06, + "loss": 0.0311, + "step": 42210 + }, + { + "epoch": 0.84424, + "grad_norm": 0.06021787226200104, + "learning_rate": 1.442020587615045e-06, + "loss": 0.0013, + "step": 42212 + }, + { + "epoch": 0.84428, + "grad_norm": 0.035904642194509506, + "learning_rate": 1.4412983696839734e-06, + "loss": 0.0006, + "step": 42214 + }, + { + "epoch": 0.84432, + "grad_norm": 0.03492145240306854, + "learning_rate": 1.4405763186092004e-06, + "loss": 0.0021, + "step": 42216 + }, + { + "epoch": 0.84436, + "grad_norm": 0.007167469710111618, + "learning_rate": 1.4398544344047971e-06, + "loss": 0.0001, + "step": 42218 + }, + { + "epoch": 0.8444, + "grad_norm": 0.22544415295124054, + "learning_rate": 1.439132717084839e-06, + "loss": 0.0037, + "step": 42220 + }, + { + "epoch": 0.84444, + "grad_norm": 0.020487379282712936, + "learning_rate": 1.438411166663396e-06, + "loss": 0.0005, + "step": 42222 + }, + { + "epoch": 0.84448, + "grad_norm": 0.24712017178535461, + "learning_rate": 1.4376897831545345e-06, + "loss": 0.0026, + "step": 42224 + }, + { + "epoch": 0.84452, + "grad_norm": 0.19094155728816986, + "learning_rate": 1.436968566572322e-06, + "loss": 0.0014, + "step": 42226 + }, + { + "epoch": 0.84456, + "grad_norm": 0.06845049560070038, + "learning_rate": 1.4362475169308133e-06, + "loss": 0.1161, + "step": 42228 + }, + { + "epoch": 0.8446, + "grad_norm": 0.02004486322402954, + "learning_rate": 1.4355266342440678e-06, + "loss": 0.0008, + "step": 42230 + }, + { + "epoch": 0.84464, + "grad_norm": 0.12199696153402328, + "learning_rate": 1.43480591852614e-06, + "loss": 0.0085, + "step": 42232 + }, + { + "epoch": 0.84468, + "grad_norm": 0.021642133593559265, + "learning_rate": 1.4340853697910817e-06, + "loss": 0.0002, + "step": 42234 + }, + { + "epoch": 0.84472, + "grad_norm": 19.643287658691406, + "learning_rate": 1.4333649880529365e-06, + "loss": 0.7725, + "step": 42236 + }, + { + "epoch": 0.84476, + "grad_norm": 0.018143733963370323, + "learning_rate": 1.4326447733257542e-06, + "loss": 0.0046, + "step": 42238 + }, + { + "epoch": 0.8448, + "grad_norm": 0.0012407518224790692, + "learning_rate": 1.4319247256235713e-06, + "loss": 0.0021, + "step": 42240 + }, + { + "epoch": 0.84484, + "grad_norm": 0.24232421815395355, + "learning_rate": 1.4312048449604277e-06, + "loss": 0.0032, + "step": 42242 + }, + { + "epoch": 0.84488, + "grad_norm": 0.019311577081680298, + "learning_rate": 1.4304851313503587e-06, + "loss": 0.0006, + "step": 42244 + }, + { + "epoch": 0.84492, + "grad_norm": 0.1839057058095932, + "learning_rate": 1.4297655848073888e-06, + "loss": 0.0021, + "step": 42246 + }, + { + "epoch": 0.84496, + "grad_norm": 0.9923902750015259, + "learning_rate": 1.4290462053455566e-06, + "loss": 0.0092, + "step": 42248 + }, + { + "epoch": 0.845, + "grad_norm": 13.405145645141602, + "learning_rate": 1.4283269929788779e-06, + "loss": 0.303, + "step": 42250 + }, + { + "epoch": 0.84504, + "grad_norm": 0.04709972068667412, + "learning_rate": 1.4276079477213788e-06, + "loss": 0.0003, + "step": 42252 + }, + { + "epoch": 0.84508, + "grad_norm": 0.05828311666846275, + "learning_rate": 1.4268890695870753e-06, + "loss": 0.0019, + "step": 42254 + }, + { + "epoch": 0.84512, + "grad_norm": 0.38163748383522034, + "learning_rate": 1.4261703585899833e-06, + "loss": 0.0034, + "step": 42256 + }, + { + "epoch": 0.84516, + "grad_norm": 0.002736020367592573, + "learning_rate": 1.4254518147441143e-06, + "loss": 0.0003, + "step": 42258 + }, + { + "epoch": 0.8452, + "grad_norm": 0.03637805953621864, + "learning_rate": 1.4247334380634792e-06, + "loss": 0.001, + "step": 42260 + }, + { + "epoch": 0.84524, + "grad_norm": 0.10048086941242218, + "learning_rate": 1.424015228562078e-06, + "loss": 0.0013, + "step": 42262 + }, + { + "epoch": 0.84528, + "grad_norm": 0.2971240282058716, + "learning_rate": 1.4232971862539146e-06, + "loss": 0.7337, + "step": 42264 + }, + { + "epoch": 0.84532, + "grad_norm": 0.025023531168699265, + "learning_rate": 1.4225793111529885e-06, + "loss": 0.0036, + "step": 42266 + }, + { + "epoch": 0.84536, + "grad_norm": 0.0921776220202446, + "learning_rate": 1.4218616032732946e-06, + "loss": 0.001, + "step": 42268 + }, + { + "epoch": 0.8454, + "grad_norm": 0.99650639295578, + "learning_rate": 1.4211440626288286e-06, + "loss": 0.0066, + "step": 42270 + }, + { + "epoch": 0.84544, + "grad_norm": 0.04542176425457001, + "learning_rate": 1.4204266892335695e-06, + "loss": 0.0016, + "step": 42272 + }, + { + "epoch": 0.84548, + "grad_norm": 0.6479612588882446, + "learning_rate": 1.4197094831015145e-06, + "loss": 0.0083, + "step": 42274 + }, + { + "epoch": 0.84552, + "grad_norm": 0.03625049442052841, + "learning_rate": 1.4189924442466384e-06, + "loss": 0.0038, + "step": 42276 + }, + { + "epoch": 0.84556, + "grad_norm": 0.11890588700771332, + "learning_rate": 1.4182755726829222e-06, + "loss": 0.0026, + "step": 42278 + }, + { + "epoch": 0.8456, + "grad_norm": 0.024689659476280212, + "learning_rate": 1.4175588684243447e-06, + "loss": 0.0037, + "step": 42280 + }, + { + "epoch": 0.84564, + "grad_norm": 0.35204431414604187, + "learning_rate": 1.41684233148487e-06, + "loss": 0.0066, + "step": 42282 + }, + { + "epoch": 0.84568, + "grad_norm": 0.29142311215400696, + "learning_rate": 1.4161259618784783e-06, + "loss": 0.0065, + "step": 42284 + }, + { + "epoch": 0.84572, + "grad_norm": 0.49469462037086487, + "learning_rate": 1.4154097596191275e-06, + "loss": 0.0046, + "step": 42286 + }, + { + "epoch": 0.84576, + "grad_norm": 0.3985521197319031, + "learning_rate": 1.4146937247207836e-06, + "loss": 0.0051, + "step": 42288 + }, + { + "epoch": 0.8458, + "grad_norm": 0.0004866792296525091, + "learning_rate": 1.413977857197405e-06, + "loss": 0.0003, + "step": 42290 + }, + { + "epoch": 0.84584, + "grad_norm": 0.05546008050441742, + "learning_rate": 1.4132621570629512e-06, + "loss": 0.0005, + "step": 42292 + }, + { + "epoch": 0.84588, + "grad_norm": 1.196445345878601, + "learning_rate": 1.412546624331369e-06, + "loss": 0.0115, + "step": 42294 + }, + { + "epoch": 0.84592, + "grad_norm": 0.006405708845704794, + "learning_rate": 1.4118312590166128e-06, + "loss": 0.0523, + "step": 42296 + }, + { + "epoch": 0.84596, + "grad_norm": 0.01884870044887066, + "learning_rate": 1.411116061132627e-06, + "loss": 0.0003, + "step": 42298 + }, + { + "epoch": 0.846, + "grad_norm": 0.060663238167762756, + "learning_rate": 1.4104010306933558e-06, + "loss": 0.001, + "step": 42300 + }, + { + "epoch": 0.84604, + "grad_norm": 0.11654330790042877, + "learning_rate": 1.4096861677127415e-06, + "loss": 0.0015, + "step": 42302 + }, + { + "epoch": 0.84608, + "grad_norm": 0.9093104004859924, + "learning_rate": 1.408971472204712e-06, + "loss": 0.0085, + "step": 42304 + }, + { + "epoch": 0.84612, + "grad_norm": 0.0031753124203532934, + "learning_rate": 1.4082569441832128e-06, + "loss": 0.0001, + "step": 42306 + }, + { + "epoch": 0.84616, + "grad_norm": 0.30688154697418213, + "learning_rate": 1.4075425836621636e-06, + "loss": 0.0027, + "step": 42308 + }, + { + "epoch": 0.8462, + "grad_norm": 0.006967737805098295, + "learning_rate": 1.4068283906554969e-06, + "loss": 0.0001, + "step": 42310 + }, + { + "epoch": 0.84624, + "grad_norm": 0.8883883953094482, + "learning_rate": 1.406114365177137e-06, + "loss": 0.2407, + "step": 42312 + }, + { + "epoch": 0.84628, + "grad_norm": 0.01397330965846777, + "learning_rate": 1.4054005072409971e-06, + "loss": 0.0005, + "step": 42314 + }, + { + "epoch": 0.84632, + "grad_norm": 1.059781789779663, + "learning_rate": 1.404686816861004e-06, + "loss": 0.0104, + "step": 42316 + }, + { + "epoch": 0.84636, + "grad_norm": 0.09348633885383606, + "learning_rate": 1.403973294051063e-06, + "loss": 0.0026, + "step": 42318 + }, + { + "epoch": 0.8464, + "grad_norm": 0.7314796447753906, + "learning_rate": 1.40325993882509e-06, + "loss": 0.0113, + "step": 42320 + }, + { + "epoch": 0.84644, + "grad_norm": 0.08974923938512802, + "learning_rate": 1.4025467511969891e-06, + "loss": 0.0061, + "step": 42322 + }, + { + "epoch": 0.84648, + "grad_norm": 0.04548301920294762, + "learning_rate": 1.4018337311806685e-06, + "loss": 0.0007, + "step": 42324 + }, + { + "epoch": 0.84652, + "grad_norm": 0.024703463539481163, + "learning_rate": 1.4011208787900231e-06, + "loss": 0.0005, + "step": 42326 + }, + { + "epoch": 0.84656, + "grad_norm": 0.0842038244009018, + "learning_rate": 1.4004081940389536e-06, + "loss": 0.0013, + "step": 42328 + }, + { + "epoch": 0.8466, + "grad_norm": 0.003173259785398841, + "learning_rate": 1.399695676941354e-06, + "loss": 0.0002, + "step": 42330 + }, + { + "epoch": 0.84664, + "grad_norm": 0.0230412594974041, + "learning_rate": 1.3989833275111141e-06, + "loss": 0.0006, + "step": 42332 + }, + { + "epoch": 0.84668, + "grad_norm": 0.02755957469344139, + "learning_rate": 1.3982711457621245e-06, + "loss": 0.0007, + "step": 42334 + }, + { + "epoch": 0.84672, + "grad_norm": 0.31440556049346924, + "learning_rate": 1.3975591317082638e-06, + "loss": 0.0022, + "step": 42336 + }, + { + "epoch": 0.84676, + "grad_norm": 4.005842208862305, + "learning_rate": 1.3968472853634208e-06, + "loss": 0.0301, + "step": 42338 + }, + { + "epoch": 0.8468, + "grad_norm": 0.5153895616531372, + "learning_rate": 1.3961356067414667e-06, + "loss": 0.0058, + "step": 42340 + }, + { + "epoch": 0.84684, + "grad_norm": 0.3621648848056793, + "learning_rate": 1.3954240958562792e-06, + "loss": 0.0031, + "step": 42342 + }, + { + "epoch": 0.84688, + "grad_norm": 0.00906358752399683, + "learning_rate": 1.3947127527217287e-06, + "loss": 0.0004, + "step": 42344 + }, + { + "epoch": 0.84692, + "grad_norm": 3.484917402267456, + "learning_rate": 1.3940015773516824e-06, + "loss": 0.0423, + "step": 42346 + }, + { + "epoch": 0.84696, + "grad_norm": 0.021495291963219643, + "learning_rate": 1.3932905697600086e-06, + "loss": 0.0648, + "step": 42348 + }, + { + "epoch": 0.847, + "grad_norm": 0.20213721692562103, + "learning_rate": 1.3925797299605649e-06, + "loss": 0.0021, + "step": 42350 + }, + { + "epoch": 0.84704, + "grad_norm": 0.22943587601184845, + "learning_rate": 1.39186905796721e-06, + "loss": 0.006, + "step": 42352 + }, + { + "epoch": 0.84708, + "grad_norm": 0.00019643866107799113, + "learning_rate": 1.3911585537938e-06, + "loss": 0.0002, + "step": 42354 + }, + { + "epoch": 0.84712, + "grad_norm": 0.006742370314896107, + "learning_rate": 1.3904482174541877e-06, + "loss": 0.0017, + "step": 42356 + }, + { + "epoch": 0.84716, + "grad_norm": 0.06658373028039932, + "learning_rate": 1.3897380489622158e-06, + "loss": 0.002, + "step": 42358 + }, + { + "epoch": 0.8472, + "grad_norm": 0.40913593769073486, + "learning_rate": 1.3890280483317375e-06, + "loss": 0.0032, + "step": 42360 + }, + { + "epoch": 0.84724, + "grad_norm": 0.6920329332351685, + "learning_rate": 1.3883182155765895e-06, + "loss": 0.0106, + "step": 42362 + }, + { + "epoch": 0.84728, + "grad_norm": 0.006599453277885914, + "learning_rate": 1.3876085507106108e-06, + "loss": 0.0001, + "step": 42364 + }, + { + "epoch": 0.84732, + "grad_norm": 0.14220081269741058, + "learning_rate": 1.386899053747639e-06, + "loss": 0.0019, + "step": 42366 + }, + { + "epoch": 0.84736, + "grad_norm": 0.09606333076953888, + "learning_rate": 1.386189724701501e-06, + "loss": 0.0014, + "step": 42368 + }, + { + "epoch": 0.8474, + "grad_norm": 0.1115606352686882, + "learning_rate": 1.3854805635860335e-06, + "loss": 0.0012, + "step": 42370 + }, + { + "epoch": 0.84744, + "grad_norm": 0.21616867184638977, + "learning_rate": 1.3847715704150521e-06, + "loss": 0.8379, + "step": 42372 + }, + { + "epoch": 0.84748, + "grad_norm": 0.3311677575111389, + "learning_rate": 1.384062745202389e-06, + "loss": 0.1532, + "step": 42374 + }, + { + "epoch": 0.84752, + "grad_norm": 0.02011648379266262, + "learning_rate": 1.3833540879618567e-06, + "loss": 0.0036, + "step": 42376 + }, + { + "epoch": 0.84756, + "grad_norm": 0.17434296011924744, + "learning_rate": 1.3826455987072729e-06, + "loss": 0.0056, + "step": 42378 + }, + { + "epoch": 0.8476, + "grad_norm": 0.01659093238413334, + "learning_rate": 1.381937277452451e-06, + "loss": 0.0026, + "step": 42380 + }, + { + "epoch": 0.84764, + "grad_norm": 0.1201779916882515, + "learning_rate": 1.3812291242111975e-06, + "loss": 0.001, + "step": 42382 + }, + { + "epoch": 0.84768, + "grad_norm": 0.040475837886333466, + "learning_rate": 1.3805211389973184e-06, + "loss": 0.0077, + "step": 42384 + }, + { + "epoch": 0.84772, + "grad_norm": 0.02831646054983139, + "learning_rate": 1.379813321824618e-06, + "loss": 0.0004, + "step": 42386 + }, + { + "epoch": 0.84776, + "grad_norm": 7.248795986175537, + "learning_rate": 1.3791056727068953e-06, + "loss": 0.0793, + "step": 42388 + }, + { + "epoch": 0.8478, + "grad_norm": 0.006830867845565081, + "learning_rate": 1.3783981916579448e-06, + "loss": 0.0004, + "step": 42390 + }, + { + "epoch": 0.84784, + "grad_norm": 0.032445941120386124, + "learning_rate": 1.3776908786915633e-06, + "loss": 0.0025, + "step": 42392 + }, + { + "epoch": 0.84788, + "grad_norm": 0.05191221088171005, + "learning_rate": 1.3769837338215342e-06, + "loss": 0.0033, + "step": 42394 + }, + { + "epoch": 0.84792, + "grad_norm": 0.011074544861912727, + "learning_rate": 1.3762767570616463e-06, + "loss": 0.0002, + "step": 42396 + }, + { + "epoch": 0.84796, + "grad_norm": 0.05428781360387802, + "learning_rate": 1.3755699484256845e-06, + "loss": 0.0009, + "step": 42398 + }, + { + "epoch": 0.848, + "grad_norm": 0.0023128814063966274, + "learning_rate": 1.3748633079274254e-06, + "loss": 0.0, + "step": 42400 + }, + { + "epoch": 0.84804, + "grad_norm": 0.08082844316959381, + "learning_rate": 1.3741568355806488e-06, + "loss": 0.0071, + "step": 42402 + }, + { + "epoch": 0.84808, + "grad_norm": 0.18446746468544006, + "learning_rate": 1.373450531399122e-06, + "loss": 0.0088, + "step": 42404 + }, + { + "epoch": 0.84812, + "grad_norm": 0.013860723935067654, + "learning_rate": 1.3727443953966223e-06, + "loss": 0.0002, + "step": 42406 + }, + { + "epoch": 0.84816, + "grad_norm": 0.44798460602760315, + "learning_rate": 1.3720384275869103e-06, + "loss": 0.0042, + "step": 42408 + }, + { + "epoch": 0.8482, + "grad_norm": 1.4829143285751343, + "learning_rate": 1.3713326279837502e-06, + "loss": 0.0169, + "step": 42410 + }, + { + "epoch": 0.84824, + "grad_norm": 0.1316135972738266, + "learning_rate": 1.3706269966009033e-06, + "loss": 0.0037, + "step": 42412 + }, + { + "epoch": 0.84828, + "grad_norm": 0.004549400415271521, + "learning_rate": 1.3699215334521287e-06, + "loss": 0.0001, + "step": 42414 + }, + { + "epoch": 0.84832, + "grad_norm": 0.752167284488678, + "learning_rate": 1.3692162385511743e-06, + "loss": 0.0088, + "step": 42416 + }, + { + "epoch": 0.84836, + "grad_norm": 0.1818041354417801, + "learning_rate": 1.3685111119117923e-06, + "loss": 0.0019, + "step": 42418 + }, + { + "epoch": 0.8484, + "grad_norm": 0.005918905138969421, + "learning_rate": 1.3678061535477305e-06, + "loss": 0.0001, + "step": 42420 + }, + { + "epoch": 0.84844, + "grad_norm": 0.20848581194877625, + "learning_rate": 1.3671013634727314e-06, + "loss": 0.002, + "step": 42422 + }, + { + "epoch": 0.84848, + "grad_norm": 0.6069077253341675, + "learning_rate": 1.3663967417005387e-06, + "loss": 0.0079, + "step": 42424 + }, + { + "epoch": 0.84852, + "grad_norm": 0.18151524662971497, + "learning_rate": 1.365692288244881e-06, + "loss": 0.0017, + "step": 42426 + }, + { + "epoch": 0.84856, + "grad_norm": 0.4012829661369324, + "learning_rate": 1.3649880031195029e-06, + "loss": 0.0042, + "step": 42428 + }, + { + "epoch": 0.8486, + "grad_norm": 0.019642408937215805, + "learning_rate": 1.3642838863381258e-06, + "loss": 0.0007, + "step": 42430 + }, + { + "epoch": 0.84864, + "grad_norm": 0.31495755910873413, + "learning_rate": 1.363579937914481e-06, + "loss": 0.0075, + "step": 42432 + }, + { + "epoch": 0.84868, + "grad_norm": 0.11711214482784271, + "learning_rate": 1.362876157862294e-06, + "loss": 0.0013, + "step": 42434 + }, + { + "epoch": 0.84872, + "grad_norm": 0.1421690434217453, + "learning_rate": 1.3621725461952784e-06, + "loss": 0.0118, + "step": 42436 + }, + { + "epoch": 0.84876, + "grad_norm": 0.02610200084745884, + "learning_rate": 1.361469102927161e-06, + "loss": 0.0004, + "step": 42438 + }, + { + "epoch": 0.8488, + "grad_norm": 0.01833287626504898, + "learning_rate": 1.3607658280716474e-06, + "loss": 0.0014, + "step": 42440 + }, + { + "epoch": 0.84884, + "grad_norm": 0.10292219370603561, + "learning_rate": 1.3600627216424521e-06, + "loss": 0.008, + "step": 42442 + }, + { + "epoch": 0.84888, + "grad_norm": 0.27084118127822876, + "learning_rate": 1.3593597836532834e-06, + "loss": 0.0042, + "step": 42444 + }, + { + "epoch": 0.84892, + "grad_norm": 0.000603521941229701, + "learning_rate": 1.3586570141178457e-06, + "loss": 0.0002, + "step": 42446 + }, + { + "epoch": 0.84896, + "grad_norm": 0.11787191033363342, + "learning_rate": 1.3579544130498357e-06, + "loss": 0.002, + "step": 42448 + }, + { + "epoch": 0.849, + "grad_norm": 0.08319979906082153, + "learning_rate": 1.3572519804629537e-06, + "loss": 0.0014, + "step": 42450 + }, + { + "epoch": 0.84904, + "grad_norm": 0.02754344791173935, + "learning_rate": 1.3565497163708941e-06, + "loss": 0.0003, + "step": 42452 + }, + { + "epoch": 0.84908, + "grad_norm": 0.019367529079318047, + "learning_rate": 1.3558476207873484e-06, + "loss": 0.0033, + "step": 42454 + }, + { + "epoch": 0.84912, + "grad_norm": 0.04323722422122955, + "learning_rate": 1.3551456937260055e-06, + "loss": 0.002, + "step": 42456 + }, + { + "epoch": 0.84916, + "grad_norm": 7.672364234924316, + "learning_rate": 1.3544439352005434e-06, + "loss": 0.0719, + "step": 42458 + }, + { + "epoch": 0.8492, + "grad_norm": 0.18069951236248016, + "learning_rate": 1.3537423452246522e-06, + "loss": 0.0071, + "step": 42460 + }, + { + "epoch": 0.84924, + "grad_norm": 0.12118938565254211, + "learning_rate": 1.3530409238120023e-06, + "loss": 0.0591, + "step": 42462 + }, + { + "epoch": 0.84928, + "grad_norm": 0.30169421434402466, + "learning_rate": 1.3523396709762727e-06, + "loss": 0.0017, + "step": 42464 + }, + { + "epoch": 0.84932, + "grad_norm": 0.052037563174963, + "learning_rate": 1.3516385867311333e-06, + "loss": 0.0006, + "step": 42466 + }, + { + "epoch": 0.84936, + "grad_norm": 0.02585579827427864, + "learning_rate": 1.3509376710902512e-06, + "loss": 0.0002, + "step": 42468 + }, + { + "epoch": 0.8494, + "grad_norm": 0.18767531216144562, + "learning_rate": 1.3502369240672941e-06, + "loss": 0.01, + "step": 42470 + }, + { + "epoch": 0.84944, + "grad_norm": 0.0011003839317709208, + "learning_rate": 1.3495363456759202e-06, + "loss": 0.0027, + "step": 42472 + }, + { + "epoch": 0.84948, + "grad_norm": 0.0015685507096350193, + "learning_rate": 1.3488359359297886e-06, + "loss": 0.0003, + "step": 42474 + }, + { + "epoch": 0.84952, + "grad_norm": 1.1569280624389648, + "learning_rate": 1.348135694842554e-06, + "loss": 0.0072, + "step": 42476 + }, + { + "epoch": 0.84956, + "grad_norm": 0.4174829125404358, + "learning_rate": 1.3474356224278684e-06, + "loss": 0.0036, + "step": 42478 + }, + { + "epoch": 0.8496, + "grad_norm": 0.00028587752603925765, + "learning_rate": 1.3467357186993802e-06, + "loss": 0.0009, + "step": 42480 + }, + { + "epoch": 0.84964, + "grad_norm": 0.5380117297172546, + "learning_rate": 1.346035983670736e-06, + "loss": 0.0059, + "step": 42482 + }, + { + "epoch": 0.84968, + "grad_norm": 0.1497165709733963, + "learning_rate": 1.3453364173555738e-06, + "loss": 0.0013, + "step": 42484 + }, + { + "epoch": 0.84972, + "grad_norm": 1.3017634153366089, + "learning_rate": 1.344637019767533e-06, + "loss": 0.0117, + "step": 42486 + }, + { + "epoch": 0.84976, + "grad_norm": 0.07893674075603485, + "learning_rate": 1.3439377909202533e-06, + "loss": 0.0013, + "step": 42488 + }, + { + "epoch": 0.8498, + "grad_norm": 0.008562091737985611, + "learning_rate": 1.3432387308273576e-06, + "loss": 0.0003, + "step": 42490 + }, + { + "epoch": 0.84984, + "grad_norm": 0.8020035028457642, + "learning_rate": 1.3425398395024835e-06, + "loss": 0.0116, + "step": 42492 + }, + { + "epoch": 0.84988, + "grad_norm": 0.013353683054447174, + "learning_rate": 1.3418411169592482e-06, + "loss": 0.0005, + "step": 42494 + }, + { + "epoch": 0.84992, + "grad_norm": 0.0042521473951637745, + "learning_rate": 1.341142563211283e-06, + "loss": 0.0003, + "step": 42496 + }, + { + "epoch": 0.84996, + "grad_norm": 0.02476016990840435, + "learning_rate": 1.3404441782721988e-06, + "loss": 0.0029, + "step": 42498 + }, + { + "epoch": 0.85, + "grad_norm": 0.3046378791332245, + "learning_rate": 1.339745962155613e-06, + "loss": 0.0028, + "step": 42500 + }, + { + "epoch": 0.85004, + "grad_norm": 0.0026918817311525345, + "learning_rate": 1.3390479148751413e-06, + "loss": 0.0035, + "step": 42502 + }, + { + "epoch": 0.85008, + "grad_norm": 0.0421946756541729, + "learning_rate": 1.338350036444387e-06, + "loss": 0.0006, + "step": 42504 + }, + { + "epoch": 0.85012, + "grad_norm": 0.02410723827779293, + "learning_rate": 1.3376523268769582e-06, + "loss": 0.0004, + "step": 42506 + }, + { + "epoch": 0.85016, + "grad_norm": 0.012686993926763535, + "learning_rate": 1.3369547861864563e-06, + "loss": 0.0001, + "step": 42508 + }, + { + "epoch": 0.8502, + "grad_norm": 0.6409704685211182, + "learning_rate": 1.3362574143864816e-06, + "loss": 0.4549, + "step": 42510 + }, + { + "epoch": 0.85024, + "grad_norm": 0.055853866040706635, + "learning_rate": 1.3355602114906285e-06, + "loss": 0.0009, + "step": 42512 + }, + { + "epoch": 0.85028, + "grad_norm": 0.007705458905547857, + "learning_rate": 1.3348631775124931e-06, + "loss": 0.0005, + "step": 42514 + }, + { + "epoch": 0.85032, + "grad_norm": 0.9010357856750488, + "learning_rate": 1.334166312465658e-06, + "loss": 0.475, + "step": 42516 + }, + { + "epoch": 0.85036, + "grad_norm": 0.05061478167772293, + "learning_rate": 1.3334696163637118e-06, + "loss": 0.0006, + "step": 42518 + }, + { + "epoch": 0.8504, + "grad_norm": 0.0007310020155273378, + "learning_rate": 1.3327730892202384e-06, + "loss": 0.0001, + "step": 42520 + }, + { + "epoch": 0.85044, + "grad_norm": 1.4776195287704468, + "learning_rate": 1.3320767310488148e-06, + "loss": 0.0107, + "step": 42522 + }, + { + "epoch": 0.85048, + "grad_norm": 0.1308382749557495, + "learning_rate": 1.33138054186302e-06, + "loss": 0.0023, + "step": 42524 + }, + { + "epoch": 0.85052, + "grad_norm": 0.002355200005695224, + "learning_rate": 1.330684521676421e-06, + "loss": 0.0002, + "step": 42526 + }, + { + "epoch": 0.85056, + "grad_norm": 0.42429110407829285, + "learning_rate": 1.3299886705025944e-06, + "loss": 0.0041, + "step": 42528 + }, + { + "epoch": 0.8506, + "grad_norm": 0.21843938529491425, + "learning_rate": 1.3292929883550998e-06, + "loss": 0.0026, + "step": 42530 + }, + { + "epoch": 0.85064, + "grad_norm": 0.012765023857355118, + "learning_rate": 1.3285974752475027e-06, + "loss": 0.0004, + "step": 42532 + }, + { + "epoch": 0.85068, + "grad_norm": 0.08442762494087219, + "learning_rate": 1.3279021311933626e-06, + "loss": 0.001, + "step": 42534 + }, + { + "epoch": 0.85072, + "grad_norm": 0.07453622668981552, + "learning_rate": 1.3272069562062362e-06, + "loss": 0.0005, + "step": 42536 + }, + { + "epoch": 0.85076, + "grad_norm": 0.0005157595151104033, + "learning_rate": 1.3265119502996738e-06, + "loss": 0.0011, + "step": 42538 + }, + { + "epoch": 0.8508, + "grad_norm": 0.025487463921308517, + "learning_rate": 1.3258171134872267e-06, + "loss": 0.1627, + "step": 42540 + }, + { + "epoch": 0.85084, + "grad_norm": 0.029122892767190933, + "learning_rate": 1.3251224457824397e-06, + "loss": 0.0004, + "step": 42542 + }, + { + "epoch": 0.85088, + "grad_norm": 0.5148288011550903, + "learning_rate": 1.3244279471988575e-06, + "loss": 0.0049, + "step": 42544 + }, + { + "epoch": 0.85092, + "grad_norm": 0.14291180670261383, + "learning_rate": 1.3237336177500214e-06, + "loss": 0.0392, + "step": 42546 + }, + { + "epoch": 0.85096, + "grad_norm": 0.5333674550056458, + "learning_rate": 1.3230394574494598e-06, + "loss": 0.0051, + "step": 42548 + }, + { + "epoch": 0.851, + "grad_norm": 0.31890368461608887, + "learning_rate": 1.322345466310717e-06, + "loss": 0.0038, + "step": 42550 + }, + { + "epoch": 0.85104, + "grad_norm": 0.06867668777704239, + "learning_rate": 1.3216516443473137e-06, + "loss": 0.0012, + "step": 42552 + }, + { + "epoch": 0.85108, + "grad_norm": 0.00591964041814208, + "learning_rate": 1.3209579915727799e-06, + "loss": 0.0008, + "step": 42554 + }, + { + "epoch": 0.85112, + "grad_norm": 0.06838324666023254, + "learning_rate": 1.3202645080006404e-06, + "loss": 0.0005, + "step": 42556 + }, + { + "epoch": 0.85116, + "grad_norm": 1.1246647834777832, + "learning_rate": 1.3195711936444077e-06, + "loss": 0.0073, + "step": 42558 + }, + { + "epoch": 0.8512, + "grad_norm": 0.5090397596359253, + "learning_rate": 1.3188780485176089e-06, + "loss": 0.0071, + "step": 42560 + }, + { + "epoch": 0.85124, + "grad_norm": 0.020561568439006805, + "learning_rate": 1.3181850726337498e-06, + "loss": 0.0005, + "step": 42562 + }, + { + "epoch": 0.85128, + "grad_norm": 1.0020453929901123, + "learning_rate": 1.3174922660063428e-06, + "loss": 0.0083, + "step": 42564 + }, + { + "epoch": 0.85132, + "grad_norm": 0.67638099193573, + "learning_rate": 1.3167996286488926e-06, + "loss": 0.0086, + "step": 42566 + }, + { + "epoch": 0.85136, + "grad_norm": 0.06289058923721313, + "learning_rate": 1.3161071605749076e-06, + "loss": 0.0054, + "step": 42568 + }, + { + "epoch": 0.8514, + "grad_norm": 0.004839160013943911, + "learning_rate": 1.3154148617978813e-06, + "loss": 0.0012, + "step": 42570 + }, + { + "epoch": 0.85144, + "grad_norm": 0.14553849399089813, + "learning_rate": 1.314722732331314e-06, + "loss": 0.0013, + "step": 42572 + }, + { + "epoch": 0.85148, + "grad_norm": 0.067535400390625, + "learning_rate": 1.314030772188698e-06, + "loss": 0.0011, + "step": 42574 + }, + { + "epoch": 0.85152, + "grad_norm": 2.1218628883361816, + "learning_rate": 1.313338981383524e-06, + "loss": 0.0152, + "step": 42576 + }, + { + "epoch": 0.85156, + "grad_norm": 0.0062806433998048306, + "learning_rate": 1.312647359929281e-06, + "loss": 0.0013, + "step": 42578 + }, + { + "epoch": 0.8516, + "grad_norm": 0.19378773868083954, + "learning_rate": 1.3119559078394462e-06, + "loss": 0.0053, + "step": 42580 + }, + { + "epoch": 0.85164, + "grad_norm": 0.40831685066223145, + "learning_rate": 1.3112646251275085e-06, + "loss": 0.0051, + "step": 42582 + }, + { + "epoch": 0.85168, + "grad_norm": 0.36066728830337524, + "learning_rate": 1.3105735118069373e-06, + "loss": 0.0108, + "step": 42584 + }, + { + "epoch": 0.85172, + "grad_norm": 0.18542106449604034, + "learning_rate": 1.3098825678912097e-06, + "loss": 0.0023, + "step": 42586 + }, + { + "epoch": 0.85176, + "grad_norm": 0.024788320064544678, + "learning_rate": 1.3091917933937948e-06, + "loss": 0.0012, + "step": 42588 + }, + { + "epoch": 0.8518, + "grad_norm": 0.5872648358345032, + "learning_rate": 1.3085011883281606e-06, + "loss": 0.0061, + "step": 42590 + }, + { + "epoch": 0.85184, + "grad_norm": 0.34721216559410095, + "learning_rate": 1.307810752707772e-06, + "loss": 0.0054, + "step": 42592 + }, + { + "epoch": 0.85188, + "grad_norm": 0.017006024718284607, + "learning_rate": 1.3071204865460863e-06, + "loss": 0.0025, + "step": 42594 + }, + { + "epoch": 0.85192, + "grad_norm": 2.4457452297210693, + "learning_rate": 1.3064303898565622e-06, + "loss": 0.0233, + "step": 42596 + }, + { + "epoch": 0.85196, + "grad_norm": 0.0022683199495077133, + "learning_rate": 1.3057404626526527e-06, + "loss": 0.0004, + "step": 42598 + }, + { + "epoch": 0.852, + "grad_norm": 8.442403793334961, + "learning_rate": 1.30505070494781e-06, + "loss": 0.0723, + "step": 42600 + }, + { + "epoch": 0.85204, + "grad_norm": 0.010394465178251266, + "learning_rate": 1.3043611167554792e-06, + "loss": 0.0337, + "step": 42602 + }, + { + "epoch": 0.85208, + "grad_norm": 0.0023621656000614166, + "learning_rate": 1.3036716980891084e-06, + "loss": 0.0, + "step": 42604 + }, + { + "epoch": 0.85212, + "grad_norm": 0.009409599006175995, + "learning_rate": 1.3029824489621324e-06, + "loss": 0.1157, + "step": 42606 + }, + { + "epoch": 0.85216, + "grad_norm": 0.07700295001268387, + "learning_rate": 1.3022933693879913e-06, + "loss": 0.0034, + "step": 42608 + }, + { + "epoch": 0.8522, + "grad_norm": 0.00025980634381994605, + "learning_rate": 1.3016044593801202e-06, + "loss": 0.0001, + "step": 42610 + }, + { + "epoch": 0.85224, + "grad_norm": 0.051270175725221634, + "learning_rate": 1.300915718951945e-06, + "loss": 0.0008, + "step": 42612 + }, + { + "epoch": 0.85228, + "grad_norm": 0.010167308151721954, + "learning_rate": 1.3002271481169004e-06, + "loss": 0.0005, + "step": 42614 + }, + { + "epoch": 0.85232, + "grad_norm": 0.5055748224258423, + "learning_rate": 1.2995387468884014e-06, + "loss": 0.0044, + "step": 42616 + }, + { + "epoch": 0.85236, + "grad_norm": 0.14726512134075165, + "learning_rate": 1.298850515279879e-06, + "loss": 0.0015, + "step": 42618 + }, + { + "epoch": 0.8524, + "grad_norm": 0.013174456544220448, + "learning_rate": 1.2981624533047432e-06, + "loss": 0.0073, + "step": 42620 + }, + { + "epoch": 0.85244, + "grad_norm": 0.29353034496307373, + "learning_rate": 1.2974745609764106e-06, + "loss": 0.0022, + "step": 42622 + }, + { + "epoch": 0.85248, + "grad_norm": 0.0923752635717392, + "learning_rate": 1.2967868383082939e-06, + "loss": 0.0012, + "step": 42624 + }, + { + "epoch": 0.85252, + "grad_norm": 0.14649778604507446, + "learning_rate": 1.2960992853137965e-06, + "loss": 0.0146, + "step": 42626 + }, + { + "epoch": 0.85256, + "grad_norm": 0.002327250549569726, + "learning_rate": 1.2954119020063238e-06, + "loss": 0.0, + "step": 42628 + }, + { + "epoch": 0.8526, + "grad_norm": 0.2941616177558899, + "learning_rate": 1.294724688399278e-06, + "loss": 0.008, + "step": 42630 + }, + { + "epoch": 0.85264, + "grad_norm": 0.3162691295146942, + "learning_rate": 1.2940376445060564e-06, + "loss": 0.0027, + "step": 42632 + }, + { + "epoch": 0.85268, + "grad_norm": 0.03918793797492981, + "learning_rate": 1.2933507703400527e-06, + "loss": 0.0004, + "step": 42634 + }, + { + "epoch": 0.85272, + "grad_norm": 0.21762514114379883, + "learning_rate": 1.2926640659146606e-06, + "loss": 0.015, + "step": 42636 + }, + { + "epoch": 0.85276, + "grad_norm": 0.004162197466939688, + "learning_rate": 1.2919775312432636e-06, + "loss": 0.0002, + "step": 42638 + }, + { + "epoch": 0.8528, + "grad_norm": 0.03028460219502449, + "learning_rate": 1.2912911663392468e-06, + "loss": 0.0004, + "step": 42640 + }, + { + "epoch": 0.85284, + "grad_norm": 0.014953208155930042, + "learning_rate": 1.2906049712159952e-06, + "loss": 0.0002, + "step": 42642 + }, + { + "epoch": 0.85288, + "grad_norm": 0.004297535866498947, + "learning_rate": 1.2899189458868788e-06, + "loss": 0.005, + "step": 42644 + }, + { + "epoch": 0.85292, + "grad_norm": 0.03495680168271065, + "learning_rate": 1.2892330903652817e-06, + "loss": 0.0022, + "step": 42646 + }, + { + "epoch": 0.85296, + "grad_norm": 0.26354607939720154, + "learning_rate": 1.2885474046645651e-06, + "loss": 0.0547, + "step": 42648 + }, + { + "epoch": 0.853, + "grad_norm": 0.013574305921792984, + "learning_rate": 1.2878618887981064e-06, + "loss": 0.0051, + "step": 42650 + }, + { + "epoch": 0.85304, + "grad_norm": 23.1075439453125, + "learning_rate": 1.2871765427792637e-06, + "loss": 0.6495, + "step": 42652 + }, + { + "epoch": 0.85308, + "grad_norm": 0.1752166748046875, + "learning_rate": 1.2864913666213996e-06, + "loss": 0.0022, + "step": 42654 + }, + { + "epoch": 0.85312, + "grad_norm": 0.004636054392904043, + "learning_rate": 1.2858063603378724e-06, + "loss": 0.0002, + "step": 42656 + }, + { + "epoch": 0.85316, + "grad_norm": 17.461198806762695, + "learning_rate": 1.285121523942039e-06, + "loss": 0.3187, + "step": 42658 + }, + { + "epoch": 0.8532, + "grad_norm": 0.016772257164120674, + "learning_rate": 1.2844368574472454e-06, + "loss": 0.0133, + "step": 42660 + }, + { + "epoch": 0.85324, + "grad_norm": 0.07307910919189453, + "learning_rate": 1.2837523608668424e-06, + "loss": 0.0013, + "step": 42662 + }, + { + "epoch": 0.85328, + "grad_norm": 0.009614848531782627, + "learning_rate": 1.2830680342141754e-06, + "loss": 0.0001, + "step": 42664 + }, + { + "epoch": 0.85332, + "grad_norm": 0.004936000797897577, + "learning_rate": 1.2823838775025843e-06, + "loss": 0.0582, + "step": 42666 + }, + { + "epoch": 0.85336, + "grad_norm": 0.188114732503891, + "learning_rate": 1.2816998907454093e-06, + "loss": 0.002, + "step": 42668 + }, + { + "epoch": 0.8534, + "grad_norm": 0.05033601075410843, + "learning_rate": 1.2810160739559797e-06, + "loss": 0.0013, + "step": 42670 + }, + { + "epoch": 0.85344, + "grad_norm": 3.9283103942871094, + "learning_rate": 1.2803324271476347e-06, + "loss": 0.0338, + "step": 42672 + }, + { + "epoch": 0.85348, + "grad_norm": 0.03117547184228897, + "learning_rate": 1.2796489503336962e-06, + "loss": 0.001, + "step": 42674 + }, + { + "epoch": 0.85352, + "grad_norm": 1.8088626861572266, + "learning_rate": 1.278965643527491e-06, + "loss": 0.0167, + "step": 42676 + }, + { + "epoch": 0.85356, + "grad_norm": 0.22373829782009125, + "learning_rate": 1.2782825067423422e-06, + "loss": 0.0042, + "step": 42678 + }, + { + "epoch": 0.8536, + "grad_norm": 0.027580907568335533, + "learning_rate": 1.277599539991563e-06, + "loss": 0.003, + "step": 42680 + }, + { + "epoch": 0.85364, + "grad_norm": 0.01786392740905285, + "learning_rate": 1.2769167432884743e-06, + "loss": 0.0032, + "step": 42682 + }, + { + "epoch": 0.85368, + "grad_norm": 0.11814898252487183, + "learning_rate": 1.2762341166463832e-06, + "loss": 0.0013, + "step": 42684 + }, + { + "epoch": 0.85372, + "grad_norm": 0.686008632183075, + "learning_rate": 1.2755516600785988e-06, + "loss": 0.0065, + "step": 42686 + }, + { + "epoch": 0.85376, + "grad_norm": 0.018739039078354836, + "learning_rate": 1.2748693735984275e-06, + "loss": 0.0006, + "step": 42688 + }, + { + "epoch": 0.8538, + "grad_norm": 0.13448715209960938, + "learning_rate": 1.2741872572191684e-06, + "loss": 0.0024, + "step": 42690 + }, + { + "epoch": 0.85384, + "grad_norm": 0.11078233271837234, + "learning_rate": 1.273505310954124e-06, + "loss": 0.0034, + "step": 42692 + }, + { + "epoch": 0.85388, + "grad_norm": 0.06629932671785355, + "learning_rate": 1.272823534816584e-06, + "loss": 0.0426, + "step": 42694 + }, + { + "epoch": 0.85392, + "grad_norm": 0.15575838088989258, + "learning_rate": 1.2721419288198423e-06, + "loss": 0.0015, + "step": 42696 + }, + { + "epoch": 0.85396, + "grad_norm": 0.8298280835151672, + "learning_rate": 1.2714604929771868e-06, + "loss": 0.0074, + "step": 42698 + }, + { + "epoch": 0.854, + "grad_norm": 0.004454044625163078, + "learning_rate": 1.2707792273019049e-06, + "loss": 0.0022, + "step": 42700 + }, + { + "epoch": 0.85404, + "grad_norm": 0.04768560454249382, + "learning_rate": 1.2700981318072724e-06, + "loss": 0.0006, + "step": 42702 + }, + { + "epoch": 0.85408, + "grad_norm": 0.1538013517856598, + "learning_rate": 1.2694172065065746e-06, + "loss": 0.0039, + "step": 42704 + }, + { + "epoch": 0.85412, + "grad_norm": 13.305890083312988, + "learning_rate": 1.2687364514130817e-06, + "loss": 0.2347, + "step": 42706 + }, + { + "epoch": 0.85416, + "grad_norm": 0.0073629822582006454, + "learning_rate": 1.2680558665400667e-06, + "loss": 0.0001, + "step": 42708 + }, + { + "epoch": 0.8542, + "grad_norm": 0.08918716758489609, + "learning_rate": 1.2673754519008008e-06, + "loss": 0.0013, + "step": 42710 + }, + { + "epoch": 0.85424, + "grad_norm": 0.03724024444818497, + "learning_rate": 1.2666952075085414e-06, + "loss": 0.002, + "step": 42712 + }, + { + "epoch": 0.85428, + "grad_norm": 0.04848112538456917, + "learning_rate": 1.266015133376559e-06, + "loss": 0.0015, + "step": 42714 + }, + { + "epoch": 0.85432, + "grad_norm": 0.0002984111779369414, + "learning_rate": 1.2653352295181065e-06, + "loss": 0.0, + "step": 42716 + }, + { + "epoch": 0.85436, + "grad_norm": 0.24002821743488312, + "learning_rate": 1.2646554959464407e-06, + "loss": 0.0048, + "step": 42718 + }, + { + "epoch": 0.8544, + "grad_norm": 0.09972447901964188, + "learning_rate": 1.2639759326748136e-06, + "loss": 0.0025, + "step": 42720 + }, + { + "epoch": 0.85444, + "grad_norm": 0.009860699996352196, + "learning_rate": 1.263296539716473e-06, + "loss": 0.0003, + "step": 42722 + }, + { + "epoch": 0.85448, + "grad_norm": 0.014252918772399426, + "learning_rate": 1.2626173170846633e-06, + "loss": 0.0011, + "step": 42724 + }, + { + "epoch": 0.85452, + "grad_norm": 0.09945983439683914, + "learning_rate": 1.2619382647926304e-06, + "loss": 0.0026, + "step": 42726 + }, + { + "epoch": 0.85456, + "grad_norm": 0.01199052669107914, + "learning_rate": 1.261259382853608e-06, + "loss": 0.0004, + "step": 42728 + }, + { + "epoch": 0.8546, + "grad_norm": 0.05305318161845207, + "learning_rate": 1.2605806712808322e-06, + "loss": 0.0005, + "step": 42730 + }, + { + "epoch": 0.85464, + "grad_norm": 0.0066393776796758175, + "learning_rate": 1.259902130087538e-06, + "loss": 0.0001, + "step": 42732 + }, + { + "epoch": 0.85468, + "grad_norm": 0.017765160650014877, + "learning_rate": 1.259223759286947e-06, + "loss": 0.0004, + "step": 42734 + }, + { + "epoch": 0.85472, + "grad_norm": 1.4292083978652954, + "learning_rate": 1.2585455588922935e-06, + "loss": 0.0149, + "step": 42736 + }, + { + "epoch": 0.85476, + "grad_norm": 0.08232983946800232, + "learning_rate": 1.2578675289167897e-06, + "loss": 0.0009, + "step": 42738 + }, + { + "epoch": 0.8548, + "grad_norm": 0.049945808947086334, + "learning_rate": 1.257189669373664e-06, + "loss": 0.0007, + "step": 42740 + }, + { + "epoch": 0.85484, + "grad_norm": 0.04830142483115196, + "learning_rate": 1.256511980276125e-06, + "loss": 0.0008, + "step": 42742 + }, + { + "epoch": 0.85488, + "grad_norm": 4.952301025390625, + "learning_rate": 1.2558344616373862e-06, + "loss": 0.0384, + "step": 42744 + }, + { + "epoch": 0.85492, + "grad_norm": 0.17827042937278748, + "learning_rate": 1.2551571134706585e-06, + "loss": 0.0019, + "step": 42746 + }, + { + "epoch": 0.85496, + "grad_norm": 0.002838440239429474, + "learning_rate": 1.2544799357891425e-06, + "loss": 0.0001, + "step": 42748 + }, + { + "epoch": 0.855, + "grad_norm": 0.00011732838174793869, + "learning_rate": 1.2538029286060428e-06, + "loss": 0.0009, + "step": 42750 + }, + { + "epoch": 0.85504, + "grad_norm": 0.3500504493713379, + "learning_rate": 1.2531260919345577e-06, + "loss": 0.0069, + "step": 42752 + }, + { + "epoch": 0.85508, + "grad_norm": 0.011399664916098118, + "learning_rate": 1.2524494257878828e-06, + "loss": 0.0011, + "step": 42754 + }, + { + "epoch": 0.85512, + "grad_norm": 0.09151656180620193, + "learning_rate": 1.2517729301792102e-06, + "loss": 0.0013, + "step": 42756 + }, + { + "epoch": 0.85516, + "grad_norm": 0.0018520840676501393, + "learning_rate": 1.25109660512173e-06, + "loss": 0.0003, + "step": 42758 + }, + { + "epoch": 0.8552, + "grad_norm": 0.04243931174278259, + "learning_rate": 1.2504204506286244e-06, + "loss": 0.0005, + "step": 42760 + }, + { + "epoch": 0.85524, + "grad_norm": 0.05072589963674545, + "learning_rate": 1.2497444667130753e-06, + "loss": 0.0004, + "step": 42762 + }, + { + "epoch": 0.85528, + "grad_norm": 0.001527918502688408, + "learning_rate": 1.2490686533882656e-06, + "loss": 0.0013, + "step": 42764 + }, + { + "epoch": 0.85532, + "grad_norm": 0.1450149267911911, + "learning_rate": 1.2483930106673635e-06, + "loss": 0.0014, + "step": 42766 + }, + { + "epoch": 0.85536, + "grad_norm": 0.006702201906591654, + "learning_rate": 1.2477175385635499e-06, + "loss": 0.0133, + "step": 42768 + }, + { + "epoch": 0.8554, + "grad_norm": 0.0016228918684646487, + "learning_rate": 1.2470422370899838e-06, + "loss": 0.0002, + "step": 42770 + }, + { + "epoch": 0.85544, + "grad_norm": 0.008257693611085415, + "learning_rate": 1.2463671062598404e-06, + "loss": 0.0523, + "step": 42772 + }, + { + "epoch": 0.85548, + "grad_norm": 0.2783275544643402, + "learning_rate": 1.2456921460862748e-06, + "loss": 0.0022, + "step": 42774 + }, + { + "epoch": 0.85552, + "grad_norm": 0.11715634167194366, + "learning_rate": 1.2450173565824474e-06, + "loss": 0.0967, + "step": 42776 + }, + { + "epoch": 0.85556, + "grad_norm": 1.1745059490203857, + "learning_rate": 1.244342737761517e-06, + "loss": 0.0104, + "step": 42778 + }, + { + "epoch": 0.8556, + "grad_norm": 0.0686262995004654, + "learning_rate": 1.2436682896366282e-06, + "loss": 0.0007, + "step": 42780 + }, + { + "epoch": 0.85564, + "grad_norm": 0.0013645283179357648, + "learning_rate": 1.2429940122209372e-06, + "loss": 0.0002, + "step": 42782 + }, + { + "epoch": 0.85568, + "grad_norm": 0.05534737929701805, + "learning_rate": 1.242319905527586e-06, + "loss": 0.0009, + "step": 42784 + }, + { + "epoch": 0.85572, + "grad_norm": 0.0669277012348175, + "learning_rate": 1.241645969569716e-06, + "loss": 0.0007, + "step": 42786 + }, + { + "epoch": 0.85576, + "grad_norm": 0.12146365642547607, + "learning_rate": 1.2409722043604665e-06, + "loss": 0.0013, + "step": 42788 + }, + { + "epoch": 0.8558, + "grad_norm": 0.007523350417613983, + "learning_rate": 1.2402986099129765e-06, + "loss": 0.0003, + "step": 42790 + }, + { + "epoch": 0.85584, + "grad_norm": 0.00875544361770153, + "learning_rate": 1.2396251862403696e-06, + "loss": 0.0118, + "step": 42792 + }, + { + "epoch": 0.85588, + "grad_norm": 0.003595782909542322, + "learning_rate": 1.2389519333557853e-06, + "loss": 0.3545, + "step": 42794 + }, + { + "epoch": 0.85592, + "grad_norm": 0.12026180326938629, + "learning_rate": 1.2382788512723398e-06, + "loss": 0.0083, + "step": 42796 + }, + { + "epoch": 0.85596, + "grad_norm": 0.2932863235473633, + "learning_rate": 1.2376059400031605e-06, + "loss": 0.0048, + "step": 42798 + }, + { + "epoch": 0.856, + "grad_norm": 0.025225501507520676, + "learning_rate": 1.2369331995613664e-06, + "loss": 0.0005, + "step": 42800 + }, + { + "epoch": 0.85604, + "grad_norm": 0.007304903585463762, + "learning_rate": 1.2362606299600667e-06, + "loss": 0.0006, + "step": 42802 + }, + { + "epoch": 0.85608, + "grad_norm": 0.03650616109371185, + "learning_rate": 1.2355882312123813e-06, + "loss": 0.0006, + "step": 42804 + }, + { + "epoch": 0.85612, + "grad_norm": 0.022652002051472664, + "learning_rate": 1.2349160033314145e-06, + "loss": 0.0024, + "step": 42806 + }, + { + "epoch": 0.85616, + "grad_norm": 0.011407597921788692, + "learning_rate": 1.2342439463302725e-06, + "loss": 0.0001, + "step": 42808 + }, + { + "epoch": 0.8562, + "grad_norm": 0.003057433757930994, + "learning_rate": 1.233572060222057e-06, + "loss": 0.8759, + "step": 42810 + }, + { + "epoch": 0.85624, + "grad_norm": 0.006773192901164293, + "learning_rate": 1.2329003450198684e-06, + "loss": 0.0001, + "step": 42812 + }, + { + "epoch": 0.85628, + "grad_norm": 0.447542667388916, + "learning_rate": 1.2322288007368034e-06, + "loss": 0.0049, + "step": 42814 + }, + { + "epoch": 0.85632, + "grad_norm": 0.008445605635643005, + "learning_rate": 1.2315574273859487e-06, + "loss": 0.0001, + "step": 42816 + }, + { + "epoch": 0.85636, + "grad_norm": 1.0054519176483154, + "learning_rate": 1.2308862249803965e-06, + "loss": 0.0083, + "step": 42818 + }, + { + "epoch": 0.8564, + "grad_norm": 0.03423657640814781, + "learning_rate": 1.230215193533233e-06, + "loss": 0.0006, + "step": 42820 + }, + { + "epoch": 0.85644, + "grad_norm": 0.017561400309205055, + "learning_rate": 1.2295443330575407e-06, + "loss": 0.1158, + "step": 42822 + }, + { + "epoch": 0.85648, + "grad_norm": 0.054315559566020966, + "learning_rate": 1.228873643566393e-06, + "loss": 0.0006, + "step": 42824 + }, + { + "epoch": 0.85652, + "grad_norm": 0.15308034420013428, + "learning_rate": 1.2282031250728732e-06, + "loss": 0.0016, + "step": 42826 + }, + { + "epoch": 0.85656, + "grad_norm": 0.027865534648299217, + "learning_rate": 1.227532777590047e-06, + "loss": 0.0005, + "step": 42828 + }, + { + "epoch": 0.8566, + "grad_norm": 0.028024716302752495, + "learning_rate": 1.2268626011309858e-06, + "loss": 0.0002, + "step": 42830 + }, + { + "epoch": 0.85664, + "grad_norm": 0.061947982758283615, + "learning_rate": 1.2261925957087584e-06, + "loss": 0.001, + "step": 42832 + }, + { + "epoch": 0.85668, + "grad_norm": 1.3531250953674316, + "learning_rate": 1.2255227613364174e-06, + "loss": 0.0118, + "step": 42834 + }, + { + "epoch": 0.85672, + "grad_norm": 3.3687074184417725, + "learning_rate": 1.2248530980270322e-06, + "loss": 0.0943, + "step": 42836 + }, + { + "epoch": 0.85676, + "grad_norm": 0.07965295761823654, + "learning_rate": 1.224183605793653e-06, + "loss": 0.8144, + "step": 42838 + }, + { + "epoch": 0.8568, + "grad_norm": 0.07379266619682312, + "learning_rate": 1.223514284649331e-06, + "loss": 0.0007, + "step": 42840 + }, + { + "epoch": 0.85684, + "grad_norm": 0.4419170916080475, + "learning_rate": 1.2228451346071168e-06, + "loss": 0.0242, + "step": 42842 + }, + { + "epoch": 0.85688, + "grad_norm": 0.11927718669176102, + "learning_rate": 1.222176155680055e-06, + "loss": 0.0099, + "step": 42844 + }, + { + "epoch": 0.85692, + "grad_norm": 0.051823701709508896, + "learning_rate": 1.2215073478811912e-06, + "loss": 0.0009, + "step": 42846 + }, + { + "epoch": 0.85696, + "grad_norm": 0.20172688364982605, + "learning_rate": 1.2208387112235586e-06, + "loss": 0.0059, + "step": 42848 + }, + { + "epoch": 0.857, + "grad_norm": 0.058411888778209686, + "learning_rate": 1.2201702457201948e-06, + "loss": 0.0013, + "step": 42850 + }, + { + "epoch": 0.85704, + "grad_norm": 0.2987733781337738, + "learning_rate": 1.2195019513841334e-06, + "loss": 0.0048, + "step": 42852 + }, + { + "epoch": 0.85708, + "grad_norm": 0.047184284776449203, + "learning_rate": 1.2188338282284028e-06, + "loss": 0.0005, + "step": 42854 + }, + { + "epoch": 0.85712, + "grad_norm": 0.07542332261800766, + "learning_rate": 1.2181658762660231e-06, + "loss": 0.0306, + "step": 42856 + }, + { + "epoch": 0.85716, + "grad_norm": 0.4889719784259796, + "learning_rate": 1.2174980955100258e-06, + "loss": 0.0062, + "step": 42858 + }, + { + "epoch": 0.8572, + "grad_norm": 0.003689340315759182, + "learning_rate": 1.2168304859734226e-06, + "loss": 0.0006, + "step": 42860 + }, + { + "epoch": 0.85724, + "grad_norm": 0.36480674147605896, + "learning_rate": 1.216163047669231e-06, + "loss": 0.0046, + "step": 42862 + }, + { + "epoch": 0.85728, + "grad_norm": 0.0013843657216057181, + "learning_rate": 1.2154957806104628e-06, + "loss": 0.0001, + "step": 42864 + }, + { + "epoch": 0.85732, + "grad_norm": 1.2487986087799072, + "learning_rate": 1.2148286848101276e-06, + "loss": 0.0176, + "step": 42866 + }, + { + "epoch": 0.85736, + "grad_norm": 0.012982449494302273, + "learning_rate": 1.2141617602812316e-06, + "loss": 0.0036, + "step": 42868 + }, + { + "epoch": 0.8574, + "grad_norm": 0.8124808669090271, + "learning_rate": 1.2134950070367723e-06, + "loss": 0.0319, + "step": 42870 + }, + { + "epoch": 0.85744, + "grad_norm": 0.013316529802978039, + "learning_rate": 1.2128284250897526e-06, + "loss": 0.2191, + "step": 42872 + }, + { + "epoch": 0.85748, + "grad_norm": 1.4602442979812622, + "learning_rate": 1.2121620144531665e-06, + "loss": 0.0151, + "step": 42874 + }, + { + "epoch": 0.85752, + "grad_norm": 0.006247389130294323, + "learning_rate": 1.211495775140007e-06, + "loss": 0.0003, + "step": 42876 + }, + { + "epoch": 0.85756, + "grad_norm": 0.026024335995316505, + "learning_rate": 1.2108297071632603e-06, + "loss": 0.0015, + "step": 42878 + }, + { + "epoch": 0.8576, + "grad_norm": 0.1727014183998108, + "learning_rate": 1.210163810535917e-06, + "loss": 0.002, + "step": 42880 + }, + { + "epoch": 0.85764, + "grad_norm": 3.6229286193847656, + "learning_rate": 1.2094980852709527e-06, + "loss": 0.0381, + "step": 42882 + }, + { + "epoch": 0.85768, + "grad_norm": 0.5145373344421387, + "learning_rate": 1.2088325313813488e-06, + "loss": 0.0057, + "step": 42884 + }, + { + "epoch": 0.85772, + "grad_norm": 0.020993521437048912, + "learning_rate": 1.2081671488800805e-06, + "loss": 0.0003, + "step": 42886 + }, + { + "epoch": 0.85776, + "grad_norm": 2.571669340133667, + "learning_rate": 1.2075019377801212e-06, + "loss": 0.0242, + "step": 42888 + }, + { + "epoch": 0.8578, + "grad_norm": 0.14748020470142365, + "learning_rate": 1.206836898094439e-06, + "loss": 0.0115, + "step": 42890 + }, + { + "epoch": 0.85784, + "grad_norm": 0.4633631408214569, + "learning_rate": 1.2061720298359947e-06, + "loss": 0.0074, + "step": 42892 + }, + { + "epoch": 0.85788, + "grad_norm": 0.7948264479637146, + "learning_rate": 1.205507333017759e-06, + "loss": 0.0085, + "step": 42894 + }, + { + "epoch": 0.85792, + "grad_norm": 0.041722215712070465, + "learning_rate": 1.204842807652683e-06, + "loss": 0.0008, + "step": 42896 + }, + { + "epoch": 0.85796, + "grad_norm": 0.1657480001449585, + "learning_rate": 1.2041784537537237e-06, + "loss": 0.0014, + "step": 42898 + }, + { + "epoch": 0.858, + "grad_norm": 0.06501957774162292, + "learning_rate": 1.2035142713338366e-06, + "loss": 0.0011, + "step": 42900 + }, + { + "epoch": 0.85804, + "grad_norm": 0.28422021865844727, + "learning_rate": 1.2028502604059633e-06, + "loss": 0.0031, + "step": 42902 + }, + { + "epoch": 0.85808, + "grad_norm": 0.05297829955816269, + "learning_rate": 1.202186420983058e-06, + "loss": 0.0153, + "step": 42904 + }, + { + "epoch": 0.85812, + "grad_norm": 0.013020445592701435, + "learning_rate": 1.2015227530780548e-06, + "loss": 0.0012, + "step": 42906 + }, + { + "epoch": 0.85816, + "grad_norm": 0.16296641528606415, + "learning_rate": 1.2008592567038956e-06, + "loss": 0.0017, + "step": 42908 + }, + { + "epoch": 0.8582, + "grad_norm": 0.012599436566233635, + "learning_rate": 1.2001959318735158e-06, + "loss": 0.0022, + "step": 42910 + }, + { + "epoch": 0.85824, + "grad_norm": 0.06816589087247849, + "learning_rate": 1.1995327785998478e-06, + "loss": 0.0799, + "step": 42912 + }, + { + "epoch": 0.85828, + "grad_norm": 0.16056804358959198, + "learning_rate": 1.1988697968958184e-06, + "loss": 0.0012, + "step": 42914 + }, + { + "epoch": 0.85832, + "grad_norm": 0.30355122685432434, + "learning_rate": 1.1982069867743528e-06, + "loss": 0.0023, + "step": 42916 + }, + { + "epoch": 0.85836, + "grad_norm": 0.010708729736506939, + "learning_rate": 1.1975443482483728e-06, + "loss": 0.0002, + "step": 42918 + }, + { + "epoch": 0.8584, + "grad_norm": 0.03182968124747276, + "learning_rate": 1.196881881330798e-06, + "loss": 0.0004, + "step": 42920 + }, + { + "epoch": 0.85844, + "grad_norm": 0.057013530284166336, + "learning_rate": 1.196219586034545e-06, + "loss": 0.0013, + "step": 42922 + }, + { + "epoch": 0.85848, + "grad_norm": 0.2771814167499542, + "learning_rate": 1.1955574623725197e-06, + "loss": 0.0991, + "step": 42924 + }, + { + "epoch": 0.85852, + "grad_norm": 0.1035560593008995, + "learning_rate": 1.1948955103576387e-06, + "loss": 0.0011, + "step": 42926 + }, + { + "epoch": 0.85856, + "grad_norm": 0.05358911678195, + "learning_rate": 1.1942337300028017e-06, + "loss": 0.0009, + "step": 42928 + }, + { + "epoch": 0.8586, + "grad_norm": 0.16084636747837067, + "learning_rate": 1.1935721213209106e-06, + "loss": 0.0012, + "step": 42930 + }, + { + "epoch": 0.85864, + "grad_norm": 1.3129206895828247, + "learning_rate": 1.192910684324865e-06, + "loss": 0.0118, + "step": 42932 + }, + { + "epoch": 0.85868, + "grad_norm": 0.04420515149831772, + "learning_rate": 1.192249419027559e-06, + "loss": 0.0018, + "step": 42934 + }, + { + "epoch": 0.85872, + "grad_norm": 0.02282702550292015, + "learning_rate": 1.191588325441888e-06, + "loss": 0.0026, + "step": 42936 + }, + { + "epoch": 0.85876, + "grad_norm": 0.22049251198768616, + "learning_rate": 1.190927403580736e-06, + "loss": 0.0031, + "step": 42938 + }, + { + "epoch": 0.8588, + "grad_norm": 0.004357991740107536, + "learning_rate": 1.1902666534569884e-06, + "loss": 0.1757, + "step": 42940 + }, + { + "epoch": 0.85884, + "grad_norm": 0.5660353899002075, + "learning_rate": 1.1896060750835292e-06, + "loss": 0.0065, + "step": 42942 + }, + { + "epoch": 0.85888, + "grad_norm": 0.18402594327926636, + "learning_rate": 1.1889456684732358e-06, + "loss": 0.0029, + "step": 42944 + }, + { + "epoch": 0.85892, + "grad_norm": 0.2058754861354828, + "learning_rate": 1.1882854336389804e-06, + "loss": 0.2222, + "step": 42946 + }, + { + "epoch": 0.85896, + "grad_norm": 0.023649035021662712, + "learning_rate": 1.1876253705936403e-06, + "loss": 0.0009, + "step": 42948 + }, + { + "epoch": 0.859, + "grad_norm": 0.35310012102127075, + "learning_rate": 1.1869654793500784e-06, + "loss": 0.0052, + "step": 42950 + }, + { + "epoch": 0.85904, + "grad_norm": 0.06567490845918655, + "learning_rate": 1.1863057599211625e-06, + "loss": 0.0078, + "step": 42952 + }, + { + "epoch": 0.85908, + "grad_norm": 0.02305823750793934, + "learning_rate": 1.1856462123197544e-06, + "loss": 0.0004, + "step": 42954 + }, + { + "epoch": 0.85912, + "grad_norm": 0.4380294978618622, + "learning_rate": 1.184986836558708e-06, + "loss": 0.0317, + "step": 42956 + }, + { + "epoch": 0.85916, + "grad_norm": 0.007007732521742582, + "learning_rate": 1.1843276326508846e-06, + "loss": 0.0001, + "step": 42958 + }, + { + "epoch": 0.8592, + "grad_norm": 1.8960671424865723, + "learning_rate": 1.1836686006091313e-06, + "loss": 0.0214, + "step": 42960 + }, + { + "epoch": 0.85924, + "grad_norm": 0.022355832159519196, + "learning_rate": 1.1830097404462982e-06, + "loss": 0.0029, + "step": 42962 + }, + { + "epoch": 0.85928, + "grad_norm": 0.014271225780248642, + "learning_rate": 1.182351052175229e-06, + "loss": 0.0004, + "step": 42964 + }, + { + "epoch": 0.85932, + "grad_norm": 0.00874114315956831, + "learning_rate": 1.181692535808765e-06, + "loss": 0.0011, + "step": 42966 + }, + { + "epoch": 0.85936, + "grad_norm": 0.10668221861124039, + "learning_rate": 1.1810341913597479e-06, + "loss": 0.0012, + "step": 42968 + }, + { + "epoch": 0.8594, + "grad_norm": 0.04680668190121651, + "learning_rate": 1.1803760188410074e-06, + "loss": 0.0007, + "step": 42970 + }, + { + "epoch": 0.85944, + "grad_norm": 0.04829931631684303, + "learning_rate": 1.1797180182653767e-06, + "loss": 0.0007, + "step": 42972 + }, + { + "epoch": 0.85948, + "grad_norm": 0.0995616689324379, + "learning_rate": 1.1790601896456843e-06, + "loss": 0.0021, + "step": 42974 + }, + { + "epoch": 0.85952, + "grad_norm": 0.8090832233428955, + "learning_rate": 1.1784025329947569e-06, + "loss": 0.0076, + "step": 42976 + }, + { + "epoch": 0.85956, + "grad_norm": 0.2890858054161072, + "learning_rate": 1.1777450483254094e-06, + "loss": 0.3572, + "step": 42978 + }, + { + "epoch": 0.8596, + "grad_norm": 0.04418330639600754, + "learning_rate": 1.1770877356504684e-06, + "loss": 0.0003, + "step": 42980 + }, + { + "epoch": 0.85964, + "grad_norm": 0.4696704149246216, + "learning_rate": 1.1764305949827425e-06, + "loss": 0.0043, + "step": 42982 + }, + { + "epoch": 0.85968, + "grad_norm": 0.024219797924160957, + "learning_rate": 1.175773626335045e-06, + "loss": 0.0019, + "step": 42984 + }, + { + "epoch": 0.85972, + "grad_norm": 0.027151882648468018, + "learning_rate": 1.1751168297201831e-06, + "loss": 0.0003, + "step": 42986 + }, + { + "epoch": 0.85976, + "grad_norm": 0.1342061460018158, + "learning_rate": 1.1744602051509623e-06, + "loss": 0.0024, + "step": 42988 + }, + { + "epoch": 0.8598, + "grad_norm": 0.0610949844121933, + "learning_rate": 1.1738037526401857e-06, + "loss": 0.0016, + "step": 42990 + }, + { + "epoch": 0.85984, + "grad_norm": 0.016145093366503716, + "learning_rate": 1.1731474722006453e-06, + "loss": 0.0005, + "step": 42992 + }, + { + "epoch": 0.85988, + "grad_norm": 0.06541810929775238, + "learning_rate": 1.172491363845143e-06, + "loss": 0.0045, + "step": 42994 + }, + { + "epoch": 0.85992, + "grad_norm": 0.17917989194393158, + "learning_rate": 1.171835427586463e-06, + "loss": 0.0042, + "step": 42996 + }, + { + "epoch": 0.85996, + "grad_norm": 0.018722089007496834, + "learning_rate": 1.1711796634373984e-06, + "loss": 0.0021, + "step": 42998 + }, + { + "epoch": 0.86, + "grad_norm": 0.18915212154388428, + "learning_rate": 1.1705240714107301e-06, + "loss": 0.0019, + "step": 43000 + }, + { + "epoch": 0.86004, + "grad_norm": 0.018431950360536575, + "learning_rate": 1.1698686515192436e-06, + "loss": 0.0008, + "step": 43002 + }, + { + "epoch": 0.86008, + "grad_norm": 0.03506772965192795, + "learning_rate": 1.169213403775712e-06, + "loss": 0.0007, + "step": 43004 + }, + { + "epoch": 0.86012, + "grad_norm": 0.17094090580940247, + "learning_rate": 1.1685583281929102e-06, + "loss": 0.0017, + "step": 43006 + }, + { + "epoch": 0.86016, + "grad_norm": 0.046441830694675446, + "learning_rate": 1.1679034247836118e-06, + "loss": 0.0026, + "step": 43008 + }, + { + "epoch": 0.8602, + "grad_norm": 0.17394110560417175, + "learning_rate": 1.167248693560583e-06, + "loss": 0.0053, + "step": 43010 + }, + { + "epoch": 0.86024, + "grad_norm": 0.03909577801823616, + "learning_rate": 1.1665941345365895e-06, + "loss": 0.0004, + "step": 43012 + }, + { + "epoch": 0.86028, + "grad_norm": 0.0431346669793129, + "learning_rate": 1.1659397477243883e-06, + "loss": 0.0005, + "step": 43014 + }, + { + "epoch": 0.86032, + "grad_norm": 0.01417108066380024, + "learning_rate": 1.165285533136743e-06, + "loss": 0.0119, + "step": 43016 + }, + { + "epoch": 0.86036, + "grad_norm": 0.048383843153715134, + "learning_rate": 1.164631490786403e-06, + "loss": 0.0037, + "step": 43018 + }, + { + "epoch": 0.8604, + "grad_norm": 0.09370710700750351, + "learning_rate": 1.1639776206861197e-06, + "loss": 0.0009, + "step": 43020 + }, + { + "epoch": 0.86044, + "grad_norm": 0.17456623911857605, + "learning_rate": 1.1633239228486447e-06, + "loss": 0.0021, + "step": 43022 + }, + { + "epoch": 0.86048, + "grad_norm": 0.908338189125061, + "learning_rate": 1.162670397286716e-06, + "loss": 0.0093, + "step": 43024 + }, + { + "epoch": 0.86052, + "grad_norm": 0.03977709636092186, + "learning_rate": 1.1620170440130808e-06, + "loss": 0.0007, + "step": 43026 + }, + { + "epoch": 0.86056, + "grad_norm": 0.00021497404668480158, + "learning_rate": 1.1613638630404722e-06, + "loss": 0.0007, + "step": 43028 + }, + { + "epoch": 0.8606, + "grad_norm": 0.12211402505636215, + "learning_rate": 1.1607108543816247e-06, + "loss": 0.0024, + "step": 43030 + }, + { + "epoch": 0.86064, + "grad_norm": 0.28133177757263184, + "learning_rate": 1.1600580180492692e-06, + "loss": 0.0031, + "step": 43032 + }, + { + "epoch": 0.86068, + "grad_norm": 0.03727961331605911, + "learning_rate": 1.1594053540561367e-06, + "loss": 0.0005, + "step": 43034 + }, + { + "epoch": 0.86072, + "grad_norm": 0.529256284236908, + "learning_rate": 1.158752862414947e-06, + "loss": 0.006, + "step": 43036 + }, + { + "epoch": 0.86076, + "grad_norm": 0.7716314792633057, + "learning_rate": 1.1581005431384208e-06, + "loss": 0.0064, + "step": 43038 + }, + { + "epoch": 0.8608, + "grad_norm": 0.07466800510883331, + "learning_rate": 1.1574483962392768e-06, + "loss": 0.0028, + "step": 43040 + }, + { + "epoch": 0.86084, + "grad_norm": 0.29104241728782654, + "learning_rate": 1.1567964217302296e-06, + "loss": 0.0032, + "step": 43042 + }, + { + "epoch": 0.86088, + "grad_norm": 0.08069198578596115, + "learning_rate": 1.1561446196239901e-06, + "loss": 0.0023, + "step": 43044 + }, + { + "epoch": 0.86092, + "grad_norm": 1.327372431755066, + "learning_rate": 1.1554929899332612e-06, + "loss": 0.0084, + "step": 43046 + }, + { + "epoch": 0.86096, + "grad_norm": 0.2513379454612732, + "learning_rate": 1.154841532670753e-06, + "loss": 0.0068, + "step": 43048 + }, + { + "epoch": 0.861, + "grad_norm": 0.005293599329888821, + "learning_rate": 1.1541902478491607e-06, + "loss": 0.0001, + "step": 43050 + }, + { + "epoch": 0.86104, + "grad_norm": 0.10389691591262817, + "learning_rate": 1.1535391354811842e-06, + "loss": 0.0043, + "step": 43052 + }, + { + "epoch": 0.86108, + "grad_norm": 0.02724379673600197, + "learning_rate": 1.1528881955795167e-06, + "loss": 0.0002, + "step": 43054 + }, + { + "epoch": 0.86112, + "grad_norm": 0.0027977966237813234, + "learning_rate": 1.1522374281568493e-06, + "loss": 0.0001, + "step": 43056 + }, + { + "epoch": 0.86116, + "grad_norm": 0.8685436248779297, + "learning_rate": 1.1515868332258695e-06, + "loss": 0.0089, + "step": 43058 + }, + { + "epoch": 0.8612, + "grad_norm": 0.0060243611223995686, + "learning_rate": 1.1509364107992582e-06, + "loss": 0.0035, + "step": 43060 + }, + { + "epoch": 0.86124, + "grad_norm": 0.04637926071882248, + "learning_rate": 1.1502861608896976e-06, + "loss": 0.0007, + "step": 43062 + }, + { + "epoch": 0.86128, + "grad_norm": 0.15886853635311127, + "learning_rate": 1.1496360835098642e-06, + "loss": 0.0018, + "step": 43064 + }, + { + "epoch": 0.86132, + "grad_norm": 0.8725893497467041, + "learning_rate": 1.1489861786724332e-06, + "loss": 0.0107, + "step": 43066 + }, + { + "epoch": 0.86136, + "grad_norm": 0.001298760180361569, + "learning_rate": 1.1483364463900703e-06, + "loss": 0.0019, + "step": 43068 + }, + { + "epoch": 0.8614, + "grad_norm": 0.000671324844006449, + "learning_rate": 1.1476868866754488e-06, + "loss": 0.0057, + "step": 43070 + }, + { + "epoch": 0.86144, + "grad_norm": 0.0029847018886357546, + "learning_rate": 1.147037499541227e-06, + "loss": 0.0168, + "step": 43072 + }, + { + "epoch": 0.86148, + "grad_norm": 0.0014245226047933102, + "learning_rate": 1.1463882850000675e-06, + "loss": 0.0004, + "step": 43074 + }, + { + "epoch": 0.86152, + "grad_norm": 0.5108069777488708, + "learning_rate": 1.1457392430646287e-06, + "loss": 0.0061, + "step": 43076 + }, + { + "epoch": 0.86156, + "grad_norm": 0.009787889197468758, + "learning_rate": 1.1450903737475572e-06, + "loss": 0.0001, + "step": 43078 + }, + { + "epoch": 0.8616, + "grad_norm": 0.0012258567148819566, + "learning_rate": 1.1444416770615118e-06, + "loss": 0.0027, + "step": 43080 + }, + { + "epoch": 0.86164, + "grad_norm": 0.17206253111362457, + "learning_rate": 1.1437931530191338e-06, + "loss": 0.0023, + "step": 43082 + }, + { + "epoch": 0.86168, + "grad_norm": 0.15640889108181, + "learning_rate": 1.143144801633067e-06, + "loss": 0.0029, + "step": 43084 + }, + { + "epoch": 0.86172, + "grad_norm": 0.041807178407907486, + "learning_rate": 1.1424966229159517e-06, + "loss": 0.0009, + "step": 43086 + }, + { + "epoch": 0.86176, + "grad_norm": 0.3004247546195984, + "learning_rate": 1.1418486168804255e-06, + "loss": 0.003, + "step": 43088 + }, + { + "epoch": 0.8618, + "grad_norm": 2.4056990146636963, + "learning_rate": 1.1412007835391237e-06, + "loss": 0.0215, + "step": 43090 + }, + { + "epoch": 0.86184, + "grad_norm": 0.02879805862903595, + "learning_rate": 1.1405531229046707e-06, + "loss": 0.0003, + "step": 43092 + }, + { + "epoch": 0.86188, + "grad_norm": 1.0562382936477661, + "learning_rate": 1.1399056349896953e-06, + "loss": 0.0147, + "step": 43094 + }, + { + "epoch": 0.86192, + "grad_norm": 0.048617344349622726, + "learning_rate": 1.1392583198068208e-06, + "loss": 0.0038, + "step": 43096 + }, + { + "epoch": 0.86196, + "grad_norm": 0.09505458176136017, + "learning_rate": 1.1386111773686681e-06, + "loss": 0.0023, + "step": 43098 + }, + { + "epoch": 0.862, + "grad_norm": 0.046396370977163315, + "learning_rate": 1.1379642076878528e-06, + "loss": 0.0013, + "step": 43100 + }, + { + "epoch": 0.86204, + "grad_norm": 0.3582623600959778, + "learning_rate": 1.1373174107769892e-06, + "loss": 0.0042, + "step": 43102 + }, + { + "epoch": 0.86208, + "grad_norm": 0.5900169014930725, + "learning_rate": 1.1366707866486836e-06, + "loss": 0.0089, + "step": 43104 + }, + { + "epoch": 0.86212, + "grad_norm": 0.009664301760494709, + "learning_rate": 1.1360243353155442e-06, + "loss": 0.0006, + "step": 43106 + }, + { + "epoch": 0.86216, + "grad_norm": 0.04263177886605263, + "learning_rate": 1.1353780567901763e-06, + "loss": 0.0016, + "step": 43108 + }, + { + "epoch": 0.8622, + "grad_norm": 0.2771606147289276, + "learning_rate": 1.1347319510851718e-06, + "loss": 0.0029, + "step": 43110 + }, + { + "epoch": 0.86224, + "grad_norm": 0.04387686774134636, + "learning_rate": 1.1340860182131363e-06, + "loss": 0.0017, + "step": 43112 + }, + { + "epoch": 0.86228, + "grad_norm": 0.10964637249708176, + "learning_rate": 1.1334402581866555e-06, + "loss": 0.001, + "step": 43114 + }, + { + "epoch": 0.86232, + "grad_norm": 0.01284195575863123, + "learning_rate": 1.1327946710183247e-06, + "loss": 0.0004, + "step": 43116 + }, + { + "epoch": 0.86236, + "grad_norm": 0.9679412841796875, + "learning_rate": 1.132149256720726e-06, + "loss": 0.3455, + "step": 43118 + }, + { + "epoch": 0.8624, + "grad_norm": 0.009366140700876713, + "learning_rate": 1.1315040153064416e-06, + "loss": 0.0065, + "step": 43120 + }, + { + "epoch": 0.86244, + "grad_norm": 0.0073425862938165665, + "learning_rate": 1.1308589467880526e-06, + "loss": 0.0044, + "step": 43122 + }, + { + "epoch": 0.86248, + "grad_norm": 0.013277132995426655, + "learning_rate": 1.1302140511781378e-06, + "loss": 0.0045, + "step": 43124 + }, + { + "epoch": 0.86252, + "grad_norm": 0.021053031086921692, + "learning_rate": 1.1295693284892628e-06, + "loss": 0.015, + "step": 43126 + }, + { + "epoch": 0.86256, + "grad_norm": 0.017879702150821686, + "learning_rate": 1.128924778734002e-06, + "loss": 0.0004, + "step": 43128 + }, + { + "epoch": 0.8626, + "grad_norm": 0.009424307383596897, + "learning_rate": 1.1282804019249183e-06, + "loss": 0.0028, + "step": 43130 + }, + { + "epoch": 0.86264, + "grad_norm": 0.07334893196821213, + "learning_rate": 1.1276361980745764e-06, + "loss": 0.0027, + "step": 43132 + }, + { + "epoch": 0.86268, + "grad_norm": 0.07984413206577301, + "learning_rate": 1.1269921671955376e-06, + "loss": 0.0008, + "step": 43134 + }, + { + "epoch": 0.86272, + "grad_norm": 0.08449406176805496, + "learning_rate": 1.1263483093003491e-06, + "loss": 0.0011, + "step": 43136 + }, + { + "epoch": 0.86276, + "grad_norm": 0.08623393625020981, + "learning_rate": 1.1257046244015734e-06, + "loss": 0.0015, + "step": 43138 + }, + { + "epoch": 0.8628, + "grad_norm": 1.1129136085510254, + "learning_rate": 1.1250611125117527e-06, + "loss": 0.0084, + "step": 43140 + }, + { + "epoch": 0.86284, + "grad_norm": 0.12472494691610336, + "learning_rate": 1.1244177736434358e-06, + "loss": 0.0015, + "step": 43142 + }, + { + "epoch": 0.86288, + "grad_norm": 0.06705548614263535, + "learning_rate": 1.123774607809165e-06, + "loss": 0.0115, + "step": 43144 + }, + { + "epoch": 0.86292, + "grad_norm": 0.028960175812244415, + "learning_rate": 1.1231316150214732e-06, + "loss": 0.0067, + "step": 43146 + }, + { + "epoch": 0.86296, + "grad_norm": 0.005832461174577475, + "learning_rate": 1.1224887952929054e-06, + "loss": 0.0044, + "step": 43148 + }, + { + "epoch": 0.863, + "grad_norm": 0.22071801126003265, + "learning_rate": 1.1218461486359878e-06, + "loss": 0.0023, + "step": 43150 + }, + { + "epoch": 0.86304, + "grad_norm": 0.029121048748493195, + "learning_rate": 1.1212036750632493e-06, + "loss": 0.002, + "step": 43152 + }, + { + "epoch": 0.86308, + "grad_norm": 0.1685829758644104, + "learning_rate": 1.1205613745872157e-06, + "loss": 0.0071, + "step": 43154 + }, + { + "epoch": 0.86312, + "grad_norm": 0.019529549404978752, + "learning_rate": 1.1199192472204135e-06, + "loss": 0.0002, + "step": 43156 + }, + { + "epoch": 0.86316, + "grad_norm": 0.05253909155726433, + "learning_rate": 1.1192772929753538e-06, + "loss": 0.0012, + "step": 43158 + }, + { + "epoch": 0.8632, + "grad_norm": 0.05160735920071602, + "learning_rate": 1.1186355118645552e-06, + "loss": 0.0023, + "step": 43160 + }, + { + "epoch": 0.86324, + "grad_norm": 0.15813185274600983, + "learning_rate": 1.1179939039005304e-06, + "loss": 0.0034, + "step": 43162 + }, + { + "epoch": 0.86328, + "grad_norm": 0.16577252745628357, + "learning_rate": 1.117352469095787e-06, + "loss": 0.004, + "step": 43164 + }, + { + "epoch": 0.86332, + "grad_norm": 0.0008812913438305259, + "learning_rate": 1.1167112074628327e-06, + "loss": 0.0003, + "step": 43166 + }, + { + "epoch": 0.86336, + "grad_norm": 3.118175983428955, + "learning_rate": 1.116070119014162e-06, + "loss": 0.0238, + "step": 43168 + }, + { + "epoch": 0.8634, + "grad_norm": 0.008199412375688553, + "learning_rate": 1.1154292037622838e-06, + "loss": 0.0104, + "step": 43170 + }, + { + "epoch": 0.86344, + "grad_norm": 0.027353806421160698, + "learning_rate": 1.1147884617196835e-06, + "loss": 0.0018, + "step": 43172 + }, + { + "epoch": 0.86348, + "grad_norm": 0.01002395711839199, + "learning_rate": 1.114147892898857e-06, + "loss": 0.0009, + "step": 43174 + }, + { + "epoch": 0.86352, + "grad_norm": 0.07879824191331863, + "learning_rate": 1.1135074973122951e-06, + "loss": 0.0009, + "step": 43176 + }, + { + "epoch": 0.86356, + "grad_norm": 0.4531298577785492, + "learning_rate": 1.1128672749724746e-06, + "loss": 0.0083, + "step": 43178 + }, + { + "epoch": 0.8636, + "grad_norm": 0.028877288103103638, + "learning_rate": 1.1122272258918864e-06, + "loss": 0.0012, + "step": 43180 + }, + { + "epoch": 0.86364, + "grad_norm": 0.18402495980262756, + "learning_rate": 1.1115873500830022e-06, + "loss": 0.0016, + "step": 43182 + }, + { + "epoch": 0.86368, + "grad_norm": 0.021094882860779762, + "learning_rate": 1.110947647558298e-06, + "loss": 0.0007, + "step": 43184 + }, + { + "epoch": 0.86372, + "grad_norm": 0.03426814451813698, + "learning_rate": 1.1103081183302467e-06, + "loss": 0.0004, + "step": 43186 + }, + { + "epoch": 0.86376, + "grad_norm": 0.04815448448061943, + "learning_rate": 1.1096687624113168e-06, + "loss": 0.0027, + "step": 43188 + }, + { + "epoch": 0.8638, + "grad_norm": 0.13246822357177734, + "learning_rate": 1.1090295798139672e-06, + "loss": 0.001, + "step": 43190 + }, + { + "epoch": 0.86384, + "grad_norm": 1.1297129392623901, + "learning_rate": 1.1083905705506681e-06, + "loss": 0.0093, + "step": 43192 + }, + { + "epoch": 0.86388, + "grad_norm": 0.00970251951366663, + "learning_rate": 1.1077517346338695e-06, + "loss": 0.0006, + "step": 43194 + }, + { + "epoch": 0.86392, + "grad_norm": 0.0011972148204222322, + "learning_rate": 1.1071130720760292e-06, + "loss": 0.0002, + "step": 43196 + }, + { + "epoch": 0.86396, + "grad_norm": 0.011387806385755539, + "learning_rate": 1.1064745828896006e-06, + "loss": 0.001, + "step": 43198 + }, + { + "epoch": 0.864, + "grad_norm": 0.014942280016839504, + "learning_rate": 1.1058362670870248e-06, + "loss": 0.0134, + "step": 43200 + }, + { + "epoch": 0.86404, + "grad_norm": 0.26334285736083984, + "learning_rate": 1.105198124680753e-06, + "loss": 0.003, + "step": 43202 + }, + { + "epoch": 0.86408, + "grad_norm": 0.018374230712652206, + "learning_rate": 1.1045601556832197e-06, + "loss": 0.0003, + "step": 43204 + }, + { + "epoch": 0.86412, + "grad_norm": 0.0657837763428688, + "learning_rate": 1.1039223601068694e-06, + "loss": 0.0194, + "step": 43206 + }, + { + "epoch": 0.86416, + "grad_norm": 0.026225019246339798, + "learning_rate": 1.103284737964131e-06, + "loss": 0.0005, + "step": 43208 + }, + { + "epoch": 0.8642, + "grad_norm": 0.21709376573562622, + "learning_rate": 1.102647289267438e-06, + "loss": 0.0053, + "step": 43210 + }, + { + "epoch": 0.86424, + "grad_norm": 0.9465600848197937, + "learning_rate": 1.1020100140292166e-06, + "loss": 0.005, + "step": 43212 + }, + { + "epoch": 0.86428, + "grad_norm": 0.11162865161895752, + "learning_rate": 1.101372912261891e-06, + "loss": 0.0013, + "step": 43214 + }, + { + "epoch": 0.86432, + "grad_norm": 0.000834622245747596, + "learning_rate": 1.1007359839778808e-06, + "loss": 0.0001, + "step": 43216 + }, + { + "epoch": 0.86436, + "grad_norm": 0.2725360095500946, + "learning_rate": 1.100099229189604e-06, + "loss": 0.0355, + "step": 43218 + }, + { + "epoch": 0.8644, + "grad_norm": 0.026063023135066032, + "learning_rate": 1.0994626479094749e-06, + "loss": 0.0037, + "step": 43220 + }, + { + "epoch": 0.86444, + "grad_norm": 0.04748744145035744, + "learning_rate": 1.098826240149904e-06, + "loss": 0.0011, + "step": 43222 + }, + { + "epoch": 0.86448, + "grad_norm": 0.009741703979671001, + "learning_rate": 1.0981900059233008e-06, + "loss": 0.0001, + "step": 43224 + }, + { + "epoch": 0.86452, + "grad_norm": 0.0066741653718054295, + "learning_rate": 1.0975539452420636e-06, + "loss": 0.0007, + "step": 43226 + }, + { + "epoch": 0.86456, + "grad_norm": 0.023846052587032318, + "learning_rate": 1.0969180581185957e-06, + "loss": 0.0962, + "step": 43228 + }, + { + "epoch": 0.8646, + "grad_norm": 0.006702915765345097, + "learning_rate": 1.096282344565296e-06, + "loss": 0.0002, + "step": 43230 + }, + { + "epoch": 0.86464, + "grad_norm": 0.38001689314842224, + "learning_rate": 1.0956468045945512e-06, + "loss": 0.0065, + "step": 43232 + }, + { + "epoch": 0.86468, + "grad_norm": 0.3413599133491516, + "learning_rate": 1.0950114382187616e-06, + "loss": 0.0047, + "step": 43234 + }, + { + "epoch": 0.86472, + "grad_norm": 0.0004179319948889315, + "learning_rate": 1.0943762454503036e-06, + "loss": 0.0104, + "step": 43236 + }, + { + "epoch": 0.86476, + "grad_norm": 0.43161851167678833, + "learning_rate": 1.09374122630157e-06, + "loss": 0.0039, + "step": 43238 + }, + { + "epoch": 0.8648, + "grad_norm": 0.13946713507175446, + "learning_rate": 1.093106380784934e-06, + "loss": 0.0014, + "step": 43240 + }, + { + "epoch": 0.86484, + "grad_norm": 0.008591345511376858, + "learning_rate": 1.0924717089127746e-06, + "loss": 0.0001, + "step": 43242 + }, + { + "epoch": 0.86488, + "grad_norm": 0.0014697249280288815, + "learning_rate": 1.0918372106974672e-06, + "loss": 0.0001, + "step": 43244 + }, + { + "epoch": 0.86492, + "grad_norm": 0.013489813543856144, + "learning_rate": 1.091202886151379e-06, + "loss": 0.0003, + "step": 43246 + }, + { + "epoch": 0.86496, + "grad_norm": 0.08827891200780869, + "learning_rate": 1.0905687352868754e-06, + "loss": 0.0247, + "step": 43248 + }, + { + "epoch": 0.865, + "grad_norm": 0.07275061309337616, + "learning_rate": 1.0899347581163222e-06, + "loss": 0.0008, + "step": 43250 + }, + { + "epoch": 0.86504, + "grad_norm": 0.0967876985669136, + "learning_rate": 1.0893009546520773e-06, + "loss": 0.0884, + "step": 43252 + }, + { + "epoch": 0.86508, + "grad_norm": 0.00244124885648489, + "learning_rate": 1.0886673249064984e-06, + "loss": 0.0, + "step": 43254 + }, + { + "epoch": 0.86512, + "grad_norm": 0.1576329618692398, + "learning_rate": 1.0880338688919401e-06, + "loss": 0.0018, + "step": 43256 + }, + { + "epoch": 0.86516, + "grad_norm": 0.03879005089402199, + "learning_rate": 1.0874005866207449e-06, + "loss": 0.0006, + "step": 43258 + }, + { + "epoch": 0.8652, + "grad_norm": 0.03688175603747368, + "learning_rate": 1.0867674781052683e-06, + "loss": 0.0009, + "step": 43260 + }, + { + "epoch": 0.86524, + "grad_norm": 20.99746322631836, + "learning_rate": 1.0861345433578463e-06, + "loss": 0.3555, + "step": 43262 + }, + { + "epoch": 0.86528, + "grad_norm": 1.1349079608917236, + "learning_rate": 1.085501782390821e-06, + "loss": 0.0093, + "step": 43264 + }, + { + "epoch": 0.86532, + "grad_norm": 0.013812865130603313, + "learning_rate": 1.084869195216529e-06, + "loss": 0.0014, + "step": 43266 + }, + { + "epoch": 0.86536, + "grad_norm": 18.507057189941406, + "learning_rate": 1.0842367818472988e-06, + "loss": 1.0212, + "step": 43268 + }, + { + "epoch": 0.8654, + "grad_norm": 0.0859227254986763, + "learning_rate": 1.0836045422954665e-06, + "loss": 0.0009, + "step": 43270 + }, + { + "epoch": 0.86544, + "grad_norm": 0.009744440205395222, + "learning_rate": 1.0829724765733519e-06, + "loss": 0.0005, + "step": 43272 + }, + { + "epoch": 0.86548, + "grad_norm": 0.0074003622867167, + "learning_rate": 1.08234058469328e-06, + "loss": 0.0013, + "step": 43274 + }, + { + "epoch": 0.86552, + "grad_norm": 0.11208202689886093, + "learning_rate": 1.081708866667569e-06, + "loss": 0.0036, + "step": 43276 + }, + { + "epoch": 0.86556, + "grad_norm": 0.044949568808078766, + "learning_rate": 1.081077322508537e-06, + "loss": 0.4303, + "step": 43278 + }, + { + "epoch": 0.8656, + "grad_norm": 0.06451455503702164, + "learning_rate": 1.0804459522284927e-06, + "loss": 0.0007, + "step": 43280 + }, + { + "epoch": 0.86564, + "grad_norm": 0.00815745908766985, + "learning_rate": 1.0798147558397464e-06, + "loss": 0.0003, + "step": 43282 + }, + { + "epoch": 0.86568, + "grad_norm": 0.063560850918293, + "learning_rate": 1.0791837333546052e-06, + "loss": 0.0022, + "step": 43284 + }, + { + "epoch": 0.86572, + "grad_norm": 0.8879965543746948, + "learning_rate": 1.0785528847853689e-06, + "loss": 0.0082, + "step": 43286 + }, + { + "epoch": 0.86576, + "grad_norm": 0.059140123426914215, + "learning_rate": 1.0779222101443388e-06, + "loss": 0.0016, + "step": 43288 + }, + { + "epoch": 0.8658, + "grad_norm": 0.3130803406238556, + "learning_rate": 1.0772917094438052e-06, + "loss": 0.0034, + "step": 43290 + }, + { + "epoch": 0.86584, + "grad_norm": 0.005083165597170591, + "learning_rate": 1.076661382696067e-06, + "loss": 0.0019, + "step": 43292 + }, + { + "epoch": 0.86588, + "grad_norm": 0.005289943423122168, + "learning_rate": 1.0760312299134067e-06, + "loss": 0.0003, + "step": 43294 + }, + { + "epoch": 0.86592, + "grad_norm": 0.052530474960803986, + "learning_rate": 1.0754012511081124e-06, + "loss": 0.0062, + "step": 43296 + }, + { + "epoch": 0.86596, + "grad_norm": 0.036437686532735825, + "learning_rate": 1.0747714462924674e-06, + "loss": 0.0007, + "step": 43298 + }, + { + "epoch": 0.866, + "grad_norm": 0.04143941402435303, + "learning_rate": 1.0741418154787443e-06, + "loss": 0.0016, + "step": 43300 + }, + { + "epoch": 0.86604, + "grad_norm": 0.021626416593790054, + "learning_rate": 1.0735123586792263e-06, + "loss": 0.0017, + "step": 43302 + }, + { + "epoch": 0.86608, + "grad_norm": 0.04555729031562805, + "learning_rate": 1.0728830759061781e-06, + "loss": 0.0007, + "step": 43304 + }, + { + "epoch": 0.86612, + "grad_norm": 2.4375224113464355, + "learning_rate": 1.07225396717187e-06, + "loss": 0.0241, + "step": 43306 + }, + { + "epoch": 0.86616, + "grad_norm": 0.3799538016319275, + "learning_rate": 1.071625032488568e-06, + "loss": 0.0035, + "step": 43308 + }, + { + "epoch": 0.8662, + "grad_norm": 1.0962367057800293, + "learning_rate": 1.0709962718685318e-06, + "loss": 0.0153, + "step": 43310 + }, + { + "epoch": 0.86624, + "grad_norm": 1.07764732837677, + "learning_rate": 1.070367685324022e-06, + "loss": 0.0143, + "step": 43312 + }, + { + "epoch": 0.86628, + "grad_norm": 0.8095245361328125, + "learning_rate": 1.0697392728672896e-06, + "loss": 0.0088, + "step": 43314 + }, + { + "epoch": 0.86632, + "grad_norm": 1.4539953470230103, + "learning_rate": 1.0691110345105881e-06, + "loss": 0.0119, + "step": 43316 + }, + { + "epoch": 0.86636, + "grad_norm": 0.03322431072592735, + "learning_rate": 1.0684829702661647e-06, + "loss": 0.0007, + "step": 43318 + }, + { + "epoch": 0.8664, + "grad_norm": 0.012622406706213951, + "learning_rate": 1.0678550801462662e-06, + "loss": 0.0004, + "step": 43320 + }, + { + "epoch": 0.86644, + "grad_norm": 0.00898487213999033, + "learning_rate": 1.067227364163128e-06, + "loss": 0.0008, + "step": 43322 + }, + { + "epoch": 0.86648, + "grad_norm": 0.011521783657371998, + "learning_rate": 1.0665998223289942e-06, + "loss": 0.0006, + "step": 43324 + }, + { + "epoch": 0.86652, + "grad_norm": 0.009164786897599697, + "learning_rate": 1.0659724546560923e-06, + "loss": 0.0022, + "step": 43326 + }, + { + "epoch": 0.86656, + "grad_norm": 0.14908838272094727, + "learning_rate": 1.0653452611566617e-06, + "loss": 0.0015, + "step": 43328 + }, + { + "epoch": 0.8666, + "grad_norm": 0.17936372756958008, + "learning_rate": 1.0647182418429224e-06, + "loss": 0.0021, + "step": 43330 + }, + { + "epoch": 0.86664, + "grad_norm": 0.1899794042110443, + "learning_rate": 1.0640913967271016e-06, + "loss": 0.0021, + "step": 43332 + }, + { + "epoch": 0.86668, + "grad_norm": 0.03315621614456177, + "learning_rate": 1.0634647258214215e-06, + "loss": 0.0006, + "step": 43334 + }, + { + "epoch": 0.86672, + "grad_norm": 0.025449715554714203, + "learning_rate": 1.0628382291380946e-06, + "loss": 0.0004, + "step": 43336 + }, + { + "epoch": 0.86676, + "grad_norm": 0.808081328868866, + "learning_rate": 1.0622119066893389e-06, + "loss": 0.0082, + "step": 43338 + }, + { + "epoch": 0.8668, + "grad_norm": 0.0008752387366257608, + "learning_rate": 1.0615857584873624e-06, + "loss": 0.001, + "step": 43340 + }, + { + "epoch": 0.86684, + "grad_norm": 0.16299188137054443, + "learning_rate": 1.0609597845443742e-06, + "loss": 0.0024, + "step": 43342 + }, + { + "epoch": 0.86688, + "grad_norm": 1.592153549194336, + "learning_rate": 1.0603339848725768e-06, + "loss": 0.0151, + "step": 43344 + }, + { + "epoch": 0.86692, + "grad_norm": 0.006979885511100292, + "learning_rate": 1.0597083594841729e-06, + "loss": 0.0001, + "step": 43346 + }, + { + "epoch": 0.86696, + "grad_norm": 0.929963231086731, + "learning_rate": 1.0590829083913546e-06, + "loss": 0.0081, + "step": 43348 + }, + { + "epoch": 0.867, + "grad_norm": 0.003512803465127945, + "learning_rate": 1.058457631606319e-06, + "loss": 0.0092, + "step": 43350 + }, + { + "epoch": 0.86704, + "grad_norm": 0.5983216762542725, + "learning_rate": 1.057832529141256e-06, + "loss": 0.0073, + "step": 43352 + }, + { + "epoch": 0.86708, + "grad_norm": 0.09609613567590714, + "learning_rate": 1.0572076010083487e-06, + "loss": 0.0035, + "step": 43354 + }, + { + "epoch": 0.86712, + "grad_norm": 0.02143201418220997, + "learning_rate": 1.056582847219788e-06, + "loss": 0.0009, + "step": 43356 + }, + { + "epoch": 0.86716, + "grad_norm": 0.15509352087974548, + "learning_rate": 1.0559582677877434e-06, + "loss": 0.0071, + "step": 43358 + }, + { + "epoch": 0.8672, + "grad_norm": 0.00222953362390399, + "learning_rate": 1.0553338627244026e-06, + "loss": 0.005, + "step": 43360 + }, + { + "epoch": 0.86724, + "grad_norm": 0.18295037746429443, + "learning_rate": 1.0547096320419303e-06, + "loss": 0.0022, + "step": 43362 + }, + { + "epoch": 0.86728, + "grad_norm": 0.008571012876927853, + "learning_rate": 1.0540855757524993e-06, + "loss": 0.0268, + "step": 43364 + }, + { + "epoch": 0.86732, + "grad_norm": 0.054869260638952255, + "learning_rate": 1.0534616938682774e-06, + "loss": 0.0055, + "step": 43366 + }, + { + "epoch": 0.86736, + "grad_norm": 0.006220203824341297, + "learning_rate": 1.0528379864014238e-06, + "loss": 0.0005, + "step": 43368 + }, + { + "epoch": 0.8674, + "grad_norm": 0.044344447553157806, + "learning_rate": 1.0522144533641e-06, + "loss": 0.006, + "step": 43370 + }, + { + "epoch": 0.86744, + "grad_norm": 0.010272370651364326, + "learning_rate": 1.0515910947684627e-06, + "loss": 0.0002, + "step": 43372 + }, + { + "epoch": 0.86748, + "grad_norm": 0.021338574588298798, + "learning_rate": 1.0509679106266623e-06, + "loss": 0.0119, + "step": 43374 + }, + { + "epoch": 0.86752, + "grad_norm": 0.10060153156518936, + "learning_rate": 1.0503449009508516e-06, + "loss": 0.0014, + "step": 43376 + }, + { + "epoch": 0.86756, + "grad_norm": 0.13831068575382233, + "learning_rate": 1.049722065753176e-06, + "loss": 0.0026, + "step": 43378 + }, + { + "epoch": 0.8676, + "grad_norm": 0.10091646015644073, + "learning_rate": 1.0490994050457748e-06, + "loss": 0.0039, + "step": 43380 + }, + { + "epoch": 0.86764, + "grad_norm": 0.039511967450380325, + "learning_rate": 1.0484769188407883e-06, + "loss": 0.0013, + "step": 43382 + }, + { + "epoch": 0.86768, + "grad_norm": 0.04961691424250603, + "learning_rate": 1.0478546071503525e-06, + "loss": 0.0032, + "step": 43384 + }, + { + "epoch": 0.86772, + "grad_norm": 0.035420648753643036, + "learning_rate": 1.047232469986601e-06, + "loss": 0.0005, + "step": 43386 + }, + { + "epoch": 0.86776, + "grad_norm": 0.10444341599941254, + "learning_rate": 1.046610507361664e-06, + "loss": 0.0012, + "step": 43388 + }, + { + "epoch": 0.8678, + "grad_norm": 0.0005961362039670348, + "learning_rate": 1.0459887192876595e-06, + "loss": 0.0027, + "step": 43390 + }, + { + "epoch": 0.86784, + "grad_norm": 0.007657751906663179, + "learning_rate": 1.04536710577672e-06, + "loss": 0.0008, + "step": 43392 + }, + { + "epoch": 0.86788, + "grad_norm": 0.26181408762931824, + "learning_rate": 1.044745666840956e-06, + "loss": 0.0024, + "step": 43394 + }, + { + "epoch": 0.86792, + "grad_norm": 0.006393331103026867, + "learning_rate": 1.0441244024924858e-06, + "loss": 0.0015, + "step": 43396 + }, + { + "epoch": 0.86796, + "grad_norm": 0.00529452646151185, + "learning_rate": 1.0435033127434213e-06, + "loss": 0.0013, + "step": 43398 + }, + { + "epoch": 0.868, + "grad_norm": 0.019417637959122658, + "learning_rate": 1.042882397605871e-06, + "loss": 0.0074, + "step": 43400 + }, + { + "epoch": 0.86804, + "grad_norm": 0.08746626228094101, + "learning_rate": 1.042261657091942e-06, + "loss": 0.0031, + "step": 43402 + }, + { + "epoch": 0.86808, + "grad_norm": 0.1690121293067932, + "learning_rate": 1.041641091213731e-06, + "loss": 0.0013, + "step": 43404 + }, + { + "epoch": 0.86812, + "grad_norm": 0.07427758723497391, + "learning_rate": 1.0410206999833395e-06, + "loss": 0.0012, + "step": 43406 + }, + { + "epoch": 0.86816, + "grad_norm": 1.927541732788086, + "learning_rate": 1.0404004834128612e-06, + "loss": 0.0118, + "step": 43408 + }, + { + "epoch": 0.8682, + "grad_norm": 0.008457016199827194, + "learning_rate": 1.039780441514391e-06, + "loss": 0.0007, + "step": 43410 + }, + { + "epoch": 0.86824, + "grad_norm": 0.05351409688591957, + "learning_rate": 1.03916057430001e-06, + "loss": 0.0013, + "step": 43412 + }, + { + "epoch": 0.86828, + "grad_norm": 0.008650905452668667, + "learning_rate": 1.038540881781811e-06, + "loss": 0.0008, + "step": 43414 + }, + { + "epoch": 0.86832, + "grad_norm": 0.3804022967815399, + "learning_rate": 1.0379213639718688e-06, + "loss": 0.0063, + "step": 43416 + }, + { + "epoch": 0.86836, + "grad_norm": 0.12664207816123962, + "learning_rate": 1.0373020208822637e-06, + "loss": 0.0034, + "step": 43418 + }, + { + "epoch": 0.8684, + "grad_norm": 0.01537935808300972, + "learning_rate": 1.0366828525250728e-06, + "loss": 0.0377, + "step": 43420 + }, + { + "epoch": 0.86844, + "grad_norm": 0.5880038738250732, + "learning_rate": 1.0360638589123594e-06, + "loss": 0.006, + "step": 43422 + }, + { + "epoch": 0.86848, + "grad_norm": 0.019954100251197815, + "learning_rate": 1.0354450400562e-06, + "loss": 0.0003, + "step": 43424 + }, + { + "epoch": 0.86852, + "grad_norm": 0.046402763575315475, + "learning_rate": 1.0348263959686533e-06, + "loss": 0.0005, + "step": 43426 + }, + { + "epoch": 0.86856, + "grad_norm": 0.008745850063860416, + "learning_rate": 1.034207926661781e-06, + "loss": 0.0006, + "step": 43428 + }, + { + "epoch": 0.8686, + "grad_norm": 0.01640147902071476, + "learning_rate": 1.0335896321476413e-06, + "loss": 0.024, + "step": 43430 + }, + { + "epoch": 0.86864, + "grad_norm": 0.010627501644194126, + "learning_rate": 1.0329715124382878e-06, + "loss": 0.0009, + "step": 43432 + }, + { + "epoch": 0.86868, + "grad_norm": 0.024835413321852684, + "learning_rate": 1.0323535675457741e-06, + "loss": 0.0005, + "step": 43434 + }, + { + "epoch": 0.86872, + "grad_norm": 0.00027176423463970423, + "learning_rate": 1.031735797482142e-06, + "loss": 0.0001, + "step": 43436 + }, + { + "epoch": 0.86876, + "grad_norm": 0.014688162133097649, + "learning_rate": 1.0311182022594368e-06, + "loss": 0.0004, + "step": 43438 + }, + { + "epoch": 0.8688, + "grad_norm": 0.09154660254716873, + "learning_rate": 1.0305007818897006e-06, + "loss": 0.0007, + "step": 43440 + }, + { + "epoch": 0.86884, + "grad_norm": 0.14823219180107117, + "learning_rate": 1.0298835363849713e-06, + "loss": 0.0015, + "step": 43442 + }, + { + "epoch": 0.86888, + "grad_norm": 0.06310434639453888, + "learning_rate": 1.0292664657572771e-06, + "loss": 0.0156, + "step": 43444 + }, + { + "epoch": 0.86892, + "grad_norm": 0.5009521842002869, + "learning_rate": 1.0286495700186549e-06, + "loss": 0.0211, + "step": 43446 + }, + { + "epoch": 0.86896, + "grad_norm": 0.008368448354303837, + "learning_rate": 1.0280328491811265e-06, + "loss": 0.0011, + "step": 43448 + }, + { + "epoch": 0.869, + "grad_norm": 0.3063586950302124, + "learning_rate": 1.0274163032567165e-06, + "loss": 0.0027, + "step": 43450 + }, + { + "epoch": 0.86904, + "grad_norm": 0.12099000066518784, + "learning_rate": 1.026799932257445e-06, + "loss": 0.0037, + "step": 43452 + }, + { + "epoch": 0.86908, + "grad_norm": 0.27462631464004517, + "learning_rate": 1.0261837361953287e-06, + "loss": 0.0036, + "step": 43454 + }, + { + "epoch": 0.86912, + "grad_norm": 0.03988056257367134, + "learning_rate": 1.0255677150823816e-06, + "loss": 0.0043, + "step": 43456 + }, + { + "epoch": 0.86916, + "grad_norm": 0.5391311049461365, + "learning_rate": 1.0249518689306114e-06, + "loss": 0.0057, + "step": 43458 + }, + { + "epoch": 0.8692, + "grad_norm": 0.4135192930698395, + "learning_rate": 1.024336197752025e-06, + "loss": 0.0058, + "step": 43460 + }, + { + "epoch": 0.86924, + "grad_norm": 0.06763981282711029, + "learning_rate": 1.023720701558626e-06, + "loss": 0.0012, + "step": 43462 + }, + { + "epoch": 0.86928, + "grad_norm": 0.5209671258926392, + "learning_rate": 1.0231053803624124e-06, + "loss": 0.0037, + "step": 43464 + }, + { + "epoch": 0.86932, + "grad_norm": 0.10476354509592056, + "learning_rate": 1.0224902341753806e-06, + "loss": 0.0018, + "step": 43466 + }, + { + "epoch": 0.86936, + "grad_norm": 0.20422056317329407, + "learning_rate": 1.0218752630095264e-06, + "loss": 0.0017, + "step": 43468 + }, + { + "epoch": 0.8694, + "grad_norm": 0.007897060364484787, + "learning_rate": 1.0212604668768343e-06, + "loss": 0.2345, + "step": 43470 + }, + { + "epoch": 0.86944, + "grad_norm": 0.457225501537323, + "learning_rate": 1.020645845789291e-06, + "loss": 0.006, + "step": 43472 + }, + { + "epoch": 0.86948, + "grad_norm": 0.4378207325935364, + "learning_rate": 1.020031399758883e-06, + "loss": 0.0045, + "step": 43474 + }, + { + "epoch": 0.86952, + "grad_norm": 0.017708489671349525, + "learning_rate": 1.019417128797582e-06, + "loss": 0.0032, + "step": 43476 + }, + { + "epoch": 0.86956, + "grad_norm": 0.024663297459483147, + "learning_rate": 1.0188030329173704e-06, + "loss": 0.0005, + "step": 43478 + }, + { + "epoch": 0.8696, + "grad_norm": 0.5458707809448242, + "learning_rate": 1.0181891121302145e-06, + "loss": 0.0053, + "step": 43480 + }, + { + "epoch": 0.86964, + "grad_norm": 0.0005892417393624783, + "learning_rate": 1.0175753664480891e-06, + "loss": 0.0002, + "step": 43482 + }, + { + "epoch": 0.86968, + "grad_norm": 0.0005501863779500127, + "learning_rate": 1.0169617958829558e-06, + "loss": 0.0002, + "step": 43484 + }, + { + "epoch": 0.86972, + "grad_norm": 1.255226492881775, + "learning_rate": 1.0163484004467761e-06, + "loss": 0.0095, + "step": 43486 + }, + { + "epoch": 0.86976, + "grad_norm": 0.021252667531371117, + "learning_rate": 1.0157351801515113e-06, + "loss": 0.0007, + "step": 43488 + }, + { + "epoch": 0.8698, + "grad_norm": 0.24488025903701782, + "learning_rate": 1.0151221350091134e-06, + "loss": 0.004, + "step": 43490 + }, + { + "epoch": 0.86984, + "grad_norm": 0.9114441871643066, + "learning_rate": 1.0145092650315346e-06, + "loss": 0.007, + "step": 43492 + }, + { + "epoch": 0.86988, + "grad_norm": 0.009685130789875984, + "learning_rate": 1.0138965702307235e-06, + "loss": 0.0017, + "step": 43494 + }, + { + "epoch": 0.86992, + "grad_norm": 9.54699420928955, + "learning_rate": 1.013284050618626e-06, + "loss": 0.0965, + "step": 43496 + }, + { + "epoch": 0.86996, + "grad_norm": 0.34266602993011475, + "learning_rate": 1.0126717062071812e-06, + "loss": 0.0031, + "step": 43498 + }, + { + "epoch": 0.87, + "grad_norm": 0.3489668071269989, + "learning_rate": 1.012059537008332e-06, + "loss": 0.0033, + "step": 43500 + }, + { + "epoch": 0.87004, + "grad_norm": 0.03646565601229668, + "learning_rate": 1.0114475430340064e-06, + "loss": 0.0004, + "step": 43502 + }, + { + "epoch": 0.87008, + "grad_norm": 0.017134837806224823, + "learning_rate": 1.0108357242961387e-06, + "loss": 0.002, + "step": 43504 + }, + { + "epoch": 0.87012, + "grad_norm": 0.2609270513057709, + "learning_rate": 1.0102240808066566e-06, + "loss": 0.0028, + "step": 43506 + }, + { + "epoch": 0.87016, + "grad_norm": 0.06303129345178604, + "learning_rate": 1.009612612577484e-06, + "loss": 0.0006, + "step": 43508 + }, + { + "epoch": 0.8702, + "grad_norm": 4.2568583488464355, + "learning_rate": 1.009001319620545e-06, + "loss": 0.0388, + "step": 43510 + }, + { + "epoch": 0.87024, + "grad_norm": 0.042873065918684006, + "learning_rate": 1.00839020194775e-06, + "loss": 0.0006, + "step": 43512 + }, + { + "epoch": 0.87028, + "grad_norm": 0.30198049545288086, + "learning_rate": 1.00777925957102e-06, + "loss": 0.0021, + "step": 43514 + }, + { + "epoch": 0.87032, + "grad_norm": 0.17066292464733124, + "learning_rate": 1.0071684925022619e-06, + "loss": 0.0015, + "step": 43516 + }, + { + "epoch": 0.87036, + "grad_norm": 0.09014023095369339, + "learning_rate": 1.0065579007533843e-06, + "loss": 0.0024, + "step": 43518 + }, + { + "epoch": 0.8704, + "grad_norm": 1.1863594055175781, + "learning_rate": 1.0059474843362893e-06, + "loss": 0.0111, + "step": 43520 + }, + { + "epoch": 0.87044, + "grad_norm": 2.0884852409362793, + "learning_rate": 1.0053372432628795e-06, + "loss": 0.024, + "step": 43522 + }, + { + "epoch": 0.87048, + "grad_norm": 0.00032401850330643356, + "learning_rate": 1.0047271775450518e-06, + "loss": 0.0009, + "step": 43524 + }, + { + "epoch": 0.87052, + "grad_norm": 0.6451297998428345, + "learning_rate": 1.0041172871946981e-06, + "loss": 0.0085, + "step": 43526 + }, + { + "epoch": 0.87056, + "grad_norm": 0.1748199760913849, + "learning_rate": 1.0035075722237085e-06, + "loss": 0.0014, + "step": 43528 + }, + { + "epoch": 0.8706, + "grad_norm": 0.01189016830176115, + "learning_rate": 1.0028980326439708e-06, + "loss": 0.0006, + "step": 43530 + }, + { + "epoch": 0.87064, + "grad_norm": 6.974067211151123, + "learning_rate": 1.0022886684673704e-06, + "loss": 0.0875, + "step": 43532 + }, + { + "epoch": 0.87068, + "grad_norm": 0.005183430388569832, + "learning_rate": 1.0016794797057805e-06, + "loss": 0.0149, + "step": 43534 + }, + { + "epoch": 0.87072, + "grad_norm": 0.0022292612120509148, + "learning_rate": 1.0010704663710846e-06, + "loss": 0.0082, + "step": 43536 + }, + { + "epoch": 0.87076, + "grad_norm": 0.4332984983921051, + "learning_rate": 1.0004616284751524e-06, + "loss": 0.0045, + "step": 43538 + }, + { + "epoch": 0.8708, + "grad_norm": 0.018540380522608757, + "learning_rate": 9.99852966029854e-07, + "loss": 0.0003, + "step": 43540 + }, + { + "epoch": 0.87084, + "grad_norm": 0.04917187988758087, + "learning_rate": 9.992444790470567e-07, + "loss": 0.0085, + "step": 43542 + }, + { + "epoch": 0.87088, + "grad_norm": 0.09343062341213226, + "learning_rate": 9.98636167538618e-07, + "loss": 0.0016, + "step": 43544 + }, + { + "epoch": 0.87092, + "grad_norm": 0.08601628988981247, + "learning_rate": 9.980280315164071e-07, + "loss": 0.001, + "step": 43546 + }, + { + "epoch": 0.87096, + "grad_norm": 0.008546549826860428, + "learning_rate": 9.97420070992271e-07, + "loss": 0.0045, + "step": 43548 + }, + { + "epoch": 0.871, + "grad_norm": 0.08153210580348969, + "learning_rate": 9.968122859780648e-07, + "loss": 0.0012, + "step": 43550 + }, + { + "epoch": 0.87104, + "grad_norm": 0.09738915413618088, + "learning_rate": 9.962046764856393e-07, + "loss": 0.0042, + "step": 43552 + }, + { + "epoch": 0.87108, + "grad_norm": 0.21712720394134521, + "learning_rate": 9.95597242526839e-07, + "loss": 0.0021, + "step": 43554 + }, + { + "epoch": 0.87112, + "grad_norm": 0.08552368730306625, + "learning_rate": 9.94989984113508e-07, + "loss": 0.0053, + "step": 43556 + }, + { + "epoch": 0.87116, + "grad_norm": 0.7102864980697632, + "learning_rate": 9.943829012574814e-07, + "loss": 0.0067, + "step": 43558 + }, + { + "epoch": 0.8712, + "grad_norm": 0.2658998668193817, + "learning_rate": 9.93775993970597e-07, + "loss": 0.2534, + "step": 43560 + }, + { + "epoch": 0.87124, + "grad_norm": 0.008058860898017883, + "learning_rate": 9.931692622646861e-07, + "loss": 0.0009, + "step": 43562 + }, + { + "epoch": 0.87128, + "grad_norm": 0.1346932351589203, + "learning_rate": 9.925627061515785e-07, + "loss": 0.002, + "step": 43564 + }, + { + "epoch": 0.87132, + "grad_norm": 0.22435742616653442, + "learning_rate": 9.919563256430952e-07, + "loss": 0.0055, + "step": 43566 + }, + { + "epoch": 0.87136, + "grad_norm": 0.0024404237046837807, + "learning_rate": 9.913501207510657e-07, + "loss": 0.0377, + "step": 43568 + }, + { + "epoch": 0.8714, + "grad_norm": 0.00504287937656045, + "learning_rate": 9.907440914873e-07, + "loss": 0.0005, + "step": 43570 + }, + { + "epoch": 0.87144, + "grad_norm": 0.0020460430532693863, + "learning_rate": 9.901382378636172e-07, + "loss": 0.0064, + "step": 43572 + }, + { + "epoch": 0.87148, + "grad_norm": 0.1898498684167862, + "learning_rate": 9.895325598918305e-07, + "loss": 0.0035, + "step": 43574 + }, + { + "epoch": 0.87152, + "grad_norm": 0.008138156495988369, + "learning_rate": 9.889270575837406e-07, + "loss": 0.0001, + "step": 43576 + }, + { + "epoch": 0.87156, + "grad_norm": 0.0016273453366011381, + "learning_rate": 9.883217309511616e-07, + "loss": 0.0004, + "step": 43578 + }, + { + "epoch": 0.8716, + "grad_norm": 0.01122431829571724, + "learning_rate": 9.877165800058874e-07, + "loss": 0.0012, + "step": 43580 + }, + { + "epoch": 0.87164, + "grad_norm": 0.020623018965125084, + "learning_rate": 9.871116047597185e-07, + "loss": 0.0003, + "step": 43582 + }, + { + "epoch": 0.87168, + "grad_norm": 0.01334068551659584, + "learning_rate": 9.865068052244498e-07, + "loss": 0.0002, + "step": 43584 + }, + { + "epoch": 0.87172, + "grad_norm": 0.1546219289302826, + "learning_rate": 9.859021814118708e-07, + "loss": 0.0016, + "step": 43586 + }, + { + "epoch": 0.87176, + "grad_norm": 0.007973826490342617, + "learning_rate": 9.852977333337687e-07, + "loss": 0.0011, + "step": 43588 + }, + { + "epoch": 0.8718, + "grad_norm": 0.16760793328285217, + "learning_rate": 9.84693461001932e-07, + "loss": 0.0017, + "step": 43590 + }, + { + "epoch": 0.87184, + "grad_norm": 0.04350893199443817, + "learning_rate": 9.840893644281347e-07, + "loss": 0.0006, + "step": 43592 + }, + { + "epoch": 0.87188, + "grad_norm": 0.0023931656032800674, + "learning_rate": 9.83485443624157e-07, + "loss": 0.0006, + "step": 43594 + }, + { + "epoch": 0.87192, + "grad_norm": 0.001559058902785182, + "learning_rate": 9.828816986017742e-07, + "loss": 0.0001, + "step": 43596 + }, + { + "epoch": 0.87196, + "grad_norm": 0.025081925094127655, + "learning_rate": 9.822781293727513e-07, + "loss": 0.0026, + "step": 43598 + }, + { + "epoch": 0.872, + "grad_norm": 0.05438871681690216, + "learning_rate": 9.816747359488632e-07, + "loss": 0.0009, + "step": 43600 + }, + { + "epoch": 0.87204, + "grad_norm": 0.029837746173143387, + "learning_rate": 9.81071518341865e-07, + "loss": 0.1058, + "step": 43602 + }, + { + "epoch": 0.87208, + "grad_norm": 0.02185889147222042, + "learning_rate": 9.804684765635253e-07, + "loss": 0.0041, + "step": 43604 + }, + { + "epoch": 0.87212, + "grad_norm": 13.098525047302246, + "learning_rate": 9.798656106255932e-07, + "loss": 0.1499, + "step": 43606 + }, + { + "epoch": 0.87216, + "grad_norm": 0.05709592252969742, + "learning_rate": 9.792629205398252e-07, + "loss": 0.0013, + "step": 43608 + }, + { + "epoch": 0.8722, + "grad_norm": 9.155546188354492, + "learning_rate": 9.786604063179728e-07, + "loss": 0.0715, + "step": 43610 + }, + { + "epoch": 0.87224, + "grad_norm": 0.2172488570213318, + "learning_rate": 9.780580679717766e-07, + "loss": 0.0015, + "step": 43612 + }, + { + "epoch": 0.87228, + "grad_norm": 0.012093372642993927, + "learning_rate": 9.774559055129873e-07, + "loss": 0.0001, + "step": 43614 + }, + { + "epoch": 0.87232, + "grad_norm": 0.923574686050415, + "learning_rate": 9.768539189533377e-07, + "loss": 0.0126, + "step": 43616 + }, + { + "epoch": 0.87236, + "grad_norm": 0.014230215921998024, + "learning_rate": 9.762521083045661e-07, + "loss": 0.0008, + "step": 43618 + }, + { + "epoch": 0.8724, + "grad_norm": 0.037490371614694595, + "learning_rate": 9.756504735784067e-07, + "loss": 0.0048, + "step": 43620 + }, + { + "epoch": 0.87244, + "grad_norm": 0.005134487058967352, + "learning_rate": 9.750490147865888e-07, + "loss": 0.0001, + "step": 43622 + }, + { + "epoch": 0.87248, + "grad_norm": 0.2371729165315628, + "learning_rate": 9.74447731940834e-07, + "loss": 0.0032, + "step": 43624 + }, + { + "epoch": 0.87252, + "grad_norm": 0.008915345184504986, + "learning_rate": 9.738466250528677e-07, + "loss": 0.0005, + "step": 43626 + }, + { + "epoch": 0.87256, + "grad_norm": 0.03849785402417183, + "learning_rate": 9.732456941344083e-07, + "loss": 0.0076, + "step": 43628 + }, + { + "epoch": 0.8726, + "grad_norm": 0.03795246779918671, + "learning_rate": 9.726449391971716e-07, + "loss": 0.0151, + "step": 43630 + }, + { + "epoch": 0.87264, + "grad_norm": 0.30829551815986633, + "learning_rate": 9.720443602528717e-07, + "loss": 0.0033, + "step": 43632 + }, + { + "epoch": 0.87268, + "grad_norm": 0.0002990520733874291, + "learning_rate": 9.714439573132106e-07, + "loss": 0.0016, + "step": 43634 + }, + { + "epoch": 0.87272, + "grad_norm": 0.25823649764060974, + "learning_rate": 9.70843730389902e-07, + "loss": 0.0028, + "step": 43636 + }, + { + "epoch": 0.87276, + "grad_norm": 0.0020670113153755665, + "learning_rate": 9.702436794946412e-07, + "loss": 0.0002, + "step": 43638 + }, + { + "epoch": 0.8728, + "grad_norm": 0.0016885119257494807, + "learning_rate": 9.696438046391288e-07, + "loss": 0.0002, + "step": 43640 + }, + { + "epoch": 0.87284, + "grad_norm": 0.03916889429092407, + "learning_rate": 9.690441058350607e-07, + "loss": 0.0004, + "step": 43642 + }, + { + "epoch": 0.87288, + "grad_norm": 29.487369537353516, + "learning_rate": 9.684445830941235e-07, + "loss": 0.5153, + "step": 43644 + }, + { + "epoch": 0.87292, + "grad_norm": 0.029085911810398102, + "learning_rate": 9.678452364280145e-07, + "loss": 0.0005, + "step": 43646 + }, + { + "epoch": 0.87296, + "grad_norm": 0.6111557483673096, + "learning_rate": 9.672460658484085e-07, + "loss": 0.0065, + "step": 43648 + }, + { + "epoch": 0.873, + "grad_norm": 5.659285545349121, + "learning_rate": 9.666470713669918e-07, + "loss": 0.0604, + "step": 43650 + }, + { + "epoch": 0.87304, + "grad_norm": 0.018345117568969727, + "learning_rate": 9.660482529954419e-07, + "loss": 0.0004, + "step": 43652 + }, + { + "epoch": 0.87308, + "grad_norm": 0.2608485519886017, + "learning_rate": 9.654496107454335e-07, + "loss": 0.0025, + "step": 43654 + }, + { + "epoch": 0.87312, + "grad_norm": 0.7813897728919983, + "learning_rate": 9.648511446286324e-07, + "loss": 0.0054, + "step": 43656 + }, + { + "epoch": 0.87316, + "grad_norm": 1.5038834810256958, + "learning_rate": 9.642528546567132e-07, + "loss": 0.0134, + "step": 43658 + }, + { + "epoch": 0.8732, + "grad_norm": 6.99662971496582, + "learning_rate": 9.636547408413355e-07, + "loss": 0.0646, + "step": 43660 + }, + { + "epoch": 0.87324, + "grad_norm": 0.01342944335192442, + "learning_rate": 9.630568031941601e-07, + "loss": 0.0001, + "step": 43662 + }, + { + "epoch": 0.87328, + "grad_norm": 0.2815221846103668, + "learning_rate": 9.624590417268475e-07, + "loss": 0.0027, + "step": 43664 + }, + { + "epoch": 0.87332, + "grad_norm": 0.34485751390457153, + "learning_rate": 9.618614564510442e-07, + "loss": 0.0027, + "step": 43666 + }, + { + "epoch": 0.87336, + "grad_norm": 0.09118498116731644, + "learning_rate": 9.612640473784096e-07, + "loss": 0.001, + "step": 43668 + }, + { + "epoch": 0.8734, + "grad_norm": 0.04364308342337608, + "learning_rate": 9.606668145205833e-07, + "loss": 0.0004, + "step": 43670 + }, + { + "epoch": 0.87344, + "grad_norm": 0.6321345567703247, + "learning_rate": 9.600697578892116e-07, + "loss": 0.0054, + "step": 43672 + }, + { + "epoch": 0.87348, + "grad_norm": 0.10029944777488708, + "learning_rate": 9.594728774959328e-07, + "loss": 0.0013, + "step": 43674 + }, + { + "epoch": 0.87352, + "grad_norm": 0.12135826796293259, + "learning_rate": 9.588761733523855e-07, + "loss": 0.0012, + "step": 43676 + }, + { + "epoch": 0.87356, + "grad_norm": 0.0010073194280266762, + "learning_rate": 9.582796454702049e-07, + "loss": 0.0, + "step": 43678 + }, + { + "epoch": 0.8736, + "grad_norm": 0.0008584526949562132, + "learning_rate": 9.576832938610137e-07, + "loss": 0.0001, + "step": 43680 + }, + { + "epoch": 0.87364, + "grad_norm": 0.037943337112665176, + "learning_rate": 9.57087118536444e-07, + "loss": 0.0006, + "step": 43682 + }, + { + "epoch": 0.87368, + "grad_norm": 0.001888508559204638, + "learning_rate": 9.56491119508115e-07, + "loss": 0.0003, + "step": 43684 + }, + { + "epoch": 0.87372, + "grad_norm": 0.05001566559076309, + "learning_rate": 9.558952967876501e-07, + "loss": 0.0055, + "step": 43686 + }, + { + "epoch": 0.87376, + "grad_norm": 0.06551729887723923, + "learning_rate": 9.552996503866585e-07, + "loss": 0.0009, + "step": 43688 + }, + { + "epoch": 0.8738, + "grad_norm": 0.04705207422375679, + "learning_rate": 9.547041803167601e-07, + "loss": 0.0005, + "step": 43690 + }, + { + "epoch": 0.87384, + "grad_norm": 0.7786220908164978, + "learning_rate": 9.541088865895599e-07, + "loss": 0.0331, + "step": 43692 + }, + { + "epoch": 0.87388, + "grad_norm": 0.11896979808807373, + "learning_rate": 9.535137692166629e-07, + "loss": 0.0026, + "step": 43694 + }, + { + "epoch": 0.87392, + "grad_norm": 0.3635362386703491, + "learning_rate": 9.529188282096747e-07, + "loss": 0.008, + "step": 43696 + }, + { + "epoch": 0.87396, + "grad_norm": 0.03588591143488884, + "learning_rate": 9.523240635801889e-07, + "loss": 0.0004, + "step": 43698 + }, + { + "epoch": 0.874, + "grad_norm": 3.775616407394409, + "learning_rate": 9.517294753398066e-07, + "loss": 0.0519, + "step": 43700 + }, + { + "epoch": 0.87404, + "grad_norm": 0.06828639656305313, + "learning_rate": 9.511350635001138e-07, + "loss": 0.001, + "step": 43702 + }, + { + "epoch": 0.87408, + "grad_norm": 1.142211675643921, + "learning_rate": 9.505408280727025e-07, + "loss": 0.0103, + "step": 43704 + }, + { + "epoch": 0.87412, + "grad_norm": 0.066839799284935, + "learning_rate": 9.499467690691566e-07, + "loss": 0.0008, + "step": 43706 + }, + { + "epoch": 0.87416, + "grad_norm": 0.019090691581368446, + "learning_rate": 9.493528865010571e-07, + "loss": 0.0032, + "step": 43708 + }, + { + "epoch": 0.8742, + "grad_norm": 0.1354745328426361, + "learning_rate": 9.487591803799856e-07, + "loss": 0.0014, + "step": 43710 + }, + { + "epoch": 0.87424, + "grad_norm": 0.7381198406219482, + "learning_rate": 9.481656507175119e-07, + "loss": 0.0064, + "step": 43712 + }, + { + "epoch": 0.87428, + "grad_norm": 0.22283439338207245, + "learning_rate": 9.475722975252078e-07, + "loss": 0.0021, + "step": 43714 + }, + { + "epoch": 0.87432, + "grad_norm": 0.0048783873207867146, + "learning_rate": 9.469791208146428e-07, + "loss": 0.0149, + "step": 43716 + }, + { + "epoch": 0.87436, + "grad_norm": 0.05632944405078888, + "learning_rate": 9.463861205973812e-07, + "loss": 0.0077, + "step": 43718 + }, + { + "epoch": 0.8744, + "grad_norm": 0.07456810772418976, + "learning_rate": 9.457932968849826e-07, + "loss": 0.0018, + "step": 43720 + }, + { + "epoch": 0.87444, + "grad_norm": 0.01855863444507122, + "learning_rate": 9.452006496890076e-07, + "loss": 0.0002, + "step": 43722 + }, + { + "epoch": 0.87448, + "grad_norm": 0.005988276097923517, + "learning_rate": 9.446081790210038e-07, + "loss": 0.0004, + "step": 43724 + }, + { + "epoch": 0.87452, + "grad_norm": 0.03425271436572075, + "learning_rate": 9.440158848925296e-07, + "loss": 0.0075, + "step": 43726 + }, + { + "epoch": 0.87456, + "grad_norm": 0.042084045708179474, + "learning_rate": 9.434237673151247e-07, + "loss": 0.0014, + "step": 43728 + }, + { + "epoch": 0.8746, + "grad_norm": 0.023704243823885918, + "learning_rate": 9.428318263003378e-07, + "loss": 0.0065, + "step": 43730 + }, + { + "epoch": 0.87464, + "grad_norm": 0.06621301919221878, + "learning_rate": 9.422400618597083e-07, + "loss": 0.0008, + "step": 43732 + }, + { + "epoch": 0.87468, + "grad_norm": 0.08651405572891235, + "learning_rate": 9.416484740047683e-07, + "loss": 0.0018, + "step": 43734 + }, + { + "epoch": 0.87472, + "grad_norm": 0.27806901931762695, + "learning_rate": 9.410570627470594e-07, + "loss": 0.0053, + "step": 43736 + }, + { + "epoch": 0.87476, + "grad_norm": 0.06395302712917328, + "learning_rate": 9.404658280981049e-07, + "loss": 0.0028, + "step": 43738 + }, + { + "epoch": 0.8748, + "grad_norm": 0.04243403300642967, + "learning_rate": 9.398747700694322e-07, + "loss": 0.0012, + "step": 43740 + }, + { + "epoch": 0.87484, + "grad_norm": 0.057940974831581116, + "learning_rate": 9.392838886725663e-07, + "loss": 0.0007, + "step": 43742 + }, + { + "epoch": 0.87488, + "grad_norm": 0.3553573489189148, + "learning_rate": 9.386931839190272e-07, + "loss": 0.0028, + "step": 43744 + }, + { + "epoch": 0.87492, + "grad_norm": 0.0032442142255604267, + "learning_rate": 9.381026558203277e-07, + "loss": 0.0001, + "step": 43746 + }, + { + "epoch": 0.87496, + "grad_norm": 0.00797975156456232, + "learning_rate": 9.37512304387983e-07, + "loss": 0.0009, + "step": 43748 + }, + { + "epoch": 0.875, + "grad_norm": 0.5925260186195374, + "learning_rate": 9.369221296335007e-07, + "loss": 0.0051, + "step": 43750 + }, + { + "epoch": 0.87504, + "grad_norm": 0.2180008590221405, + "learning_rate": 9.363321315683882e-07, + "loss": 0.0024, + "step": 43752 + }, + { + "epoch": 0.87508, + "grad_norm": 3.209508180618286, + "learning_rate": 9.357423102041485e-07, + "loss": 0.0535, + "step": 43754 + }, + { + "epoch": 0.87512, + "grad_norm": 0.009715937077999115, + "learning_rate": 9.351526655522747e-07, + "loss": 0.0002, + "step": 43756 + }, + { + "epoch": 0.87516, + "grad_norm": 0.026909301057457924, + "learning_rate": 9.345631976242708e-07, + "loss": 0.001, + "step": 43758 + }, + { + "epoch": 0.8752, + "grad_norm": 0.0816129520535469, + "learning_rate": 9.339739064316233e-07, + "loss": 0.0012, + "step": 43760 + }, + { + "epoch": 0.87524, + "grad_norm": 0.005477210506796837, + "learning_rate": 9.333847919858219e-07, + "loss": 0.0082, + "step": 43762 + }, + { + "epoch": 0.87528, + "grad_norm": 0.006796444300562143, + "learning_rate": 9.327958542983528e-07, + "loss": 0.0002, + "step": 43764 + }, + { + "epoch": 0.87532, + "grad_norm": 0.007840147241950035, + "learning_rate": 9.322070933806937e-07, + "loss": 0.0008, + "step": 43766 + }, + { + "epoch": 0.87536, + "grad_norm": 0.08577024191617966, + "learning_rate": 9.316185092443297e-07, + "loss": 0.0009, + "step": 43768 + }, + { + "epoch": 0.8754, + "grad_norm": 0.016243617981672287, + "learning_rate": 9.310301019007284e-07, + "loss": 0.0004, + "step": 43770 + }, + { + "epoch": 0.87544, + "grad_norm": 0.10423444211483002, + "learning_rate": 9.304418713613661e-07, + "loss": 0.0114, + "step": 43772 + }, + { + "epoch": 0.87548, + "grad_norm": 0.36409831047058105, + "learning_rate": 9.29853817637707e-07, + "loss": 0.0036, + "step": 43774 + }, + { + "epoch": 0.87552, + "grad_norm": 0.03906276449561119, + "learning_rate": 9.292659407412208e-07, + "loss": 0.0043, + "step": 43776 + }, + { + "epoch": 0.87556, + "grad_norm": 0.1035260409116745, + "learning_rate": 9.286782406833617e-07, + "loss": 0.0008, + "step": 43778 + }, + { + "epoch": 0.8756, + "grad_norm": 0.00882738083600998, + "learning_rate": 9.280907174755916e-07, + "loss": 0.0003, + "step": 43780 + }, + { + "epoch": 0.87564, + "grad_norm": 0.551414966583252, + "learning_rate": 9.275033711293635e-07, + "loss": 0.005, + "step": 43782 + }, + { + "epoch": 0.87568, + "grad_norm": 0.03833397477865219, + "learning_rate": 9.269162016561273e-07, + "loss": 0.0005, + "step": 43784 + }, + { + "epoch": 0.87572, + "grad_norm": 0.6110978722572327, + "learning_rate": 9.263292090673326e-07, + "loss": 0.0625, + "step": 43786 + }, + { + "epoch": 0.87576, + "grad_norm": 0.49621257185935974, + "learning_rate": 9.25742393374418e-07, + "loss": 0.0046, + "step": 43788 + }, + { + "epoch": 0.8758, + "grad_norm": 0.00026348407845944166, + "learning_rate": 9.251557545888312e-07, + "loss": 0.0005, + "step": 43790 + }, + { + "epoch": 0.87584, + "grad_norm": 1.2136516571044922, + "learning_rate": 9.24569292722004e-07, + "loss": 0.0083, + "step": 43792 + }, + { + "epoch": 0.87588, + "grad_norm": 0.002721572294831276, + "learning_rate": 9.239830077853695e-07, + "loss": 0.0013, + "step": 43794 + }, + { + "epoch": 0.87592, + "grad_norm": 0.552842378616333, + "learning_rate": 9.233968997903586e-07, + "loss": 0.0068, + "step": 43796 + }, + { + "epoch": 0.87596, + "grad_norm": 7.439134120941162, + "learning_rate": 9.228109687483988e-07, + "loss": 0.1056, + "step": 43798 + }, + { + "epoch": 0.876, + "grad_norm": 1.9960905313491821, + "learning_rate": 9.222252146709143e-07, + "loss": 0.015, + "step": 43800 + }, + { + "epoch": 0.87604, + "grad_norm": 0.022165173664689064, + "learning_rate": 9.216396375693215e-07, + "loss": 0.001, + "step": 43802 + }, + { + "epoch": 0.87608, + "grad_norm": 0.8867759108543396, + "learning_rate": 9.210542374550369e-07, + "loss": 0.0099, + "step": 43804 + }, + { + "epoch": 0.87612, + "grad_norm": 0.010662198998034, + "learning_rate": 9.204690143394746e-07, + "loss": 0.0012, + "step": 43806 + }, + { + "epoch": 0.87616, + "grad_norm": 0.017506320029497147, + "learning_rate": 9.198839682340432e-07, + "loss": 0.0011, + "step": 43808 + }, + { + "epoch": 0.8762, + "grad_norm": 0.044271502643823624, + "learning_rate": 9.192990991501483e-07, + "loss": 0.0031, + "step": 43810 + }, + { + "epoch": 0.87624, + "grad_norm": 0.020770227536559105, + "learning_rate": 9.187144070991937e-07, + "loss": 0.0018, + "step": 43812 + }, + { + "epoch": 0.87628, + "grad_norm": 0.0012982584303244948, + "learning_rate": 9.181298920925763e-07, + "loss": 0.0001, + "step": 43814 + }, + { + "epoch": 0.87632, + "grad_norm": 0.047877728939056396, + "learning_rate": 9.175455541416922e-07, + "loss": 0.1382, + "step": 43816 + }, + { + "epoch": 0.87636, + "grad_norm": 0.16616006195545197, + "learning_rate": 9.169613932579357e-07, + "loss": 0.002, + "step": 43818 + }, + { + "epoch": 0.8764, + "grad_norm": 0.26289257407188416, + "learning_rate": 9.16377409452689e-07, + "loss": 0.0035, + "step": 43820 + }, + { + "epoch": 0.87644, + "grad_norm": 0.1045980304479599, + "learning_rate": 9.157936027373449e-07, + "loss": 0.004, + "step": 43822 + }, + { + "epoch": 0.87648, + "grad_norm": 0.10055004060268402, + "learning_rate": 9.152099731232777e-07, + "loss": 0.0016, + "step": 43824 + }, + { + "epoch": 0.87652, + "grad_norm": 2.5491349697113037, + "learning_rate": 9.146265206218729e-07, + "loss": 0.0242, + "step": 43826 + }, + { + "epoch": 0.87656, + "grad_norm": 0.009146971628069878, + "learning_rate": 9.140432452444991e-07, + "loss": 0.001, + "step": 43828 + }, + { + "epoch": 0.8766, + "grad_norm": 0.020778313279151917, + "learning_rate": 9.134601470025306e-07, + "loss": 0.0002, + "step": 43830 + }, + { + "epoch": 0.87664, + "grad_norm": 0.014209726825356483, + "learning_rate": 9.128772259073371e-07, + "loss": 0.0005, + "step": 43832 + }, + { + "epoch": 0.87668, + "grad_norm": 8.031652450561523, + "learning_rate": 9.122944819702772e-07, + "loss": 0.0726, + "step": 43834 + }, + { + "epoch": 0.87672, + "grad_norm": 0.07183204591274261, + "learning_rate": 9.117119152027165e-07, + "loss": 0.0005, + "step": 43836 + }, + { + "epoch": 0.87676, + "grad_norm": 0.009157557971775532, + "learning_rate": 9.111295256160102e-07, + "loss": 0.005, + "step": 43838 + }, + { + "epoch": 0.8768, + "grad_norm": 0.01490337960422039, + "learning_rate": 9.105473132215126e-07, + "loss": 0.0016, + "step": 43840 + }, + { + "epoch": 0.87684, + "grad_norm": 0.4309057295322418, + "learning_rate": 9.099652780305757e-07, + "loss": 0.222, + "step": 43842 + }, + { + "epoch": 0.87688, + "grad_norm": 0.04778898507356644, + "learning_rate": 9.09383420054547e-07, + "loss": 0.0006, + "step": 43844 + }, + { + "epoch": 0.87692, + "grad_norm": 0.09291935712099075, + "learning_rate": 9.088017393047665e-07, + "loss": 0.0011, + "step": 43846 + }, + { + "epoch": 0.87696, + "grad_norm": 0.09773416817188263, + "learning_rate": 9.082202357925774e-07, + "loss": 0.0021, + "step": 43848 + }, + { + "epoch": 0.877, + "grad_norm": 0.002921083476394415, + "learning_rate": 9.076389095293148e-07, + "loss": 0.0035, + "step": 43850 + }, + { + "epoch": 0.87704, + "grad_norm": 0.0026395353488624096, + "learning_rate": 9.070577605263131e-07, + "loss": 0.0019, + "step": 43852 + }, + { + "epoch": 0.87708, + "grad_norm": 0.017851337790489197, + "learning_rate": 9.064767887949033e-07, + "loss": 0.0004, + "step": 43854 + }, + { + "epoch": 0.87712, + "grad_norm": 0.07522232085466385, + "learning_rate": 9.058959943464063e-07, + "loss": 0.0079, + "step": 43856 + }, + { + "epoch": 0.87716, + "grad_norm": 0.607393205165863, + "learning_rate": 9.05315377192153e-07, + "loss": 0.0057, + "step": 43858 + }, + { + "epoch": 0.8772, + "grad_norm": 0.00623669009655714, + "learning_rate": 9.047349373434566e-07, + "loss": 0.0001, + "step": 43860 + }, + { + "epoch": 0.87724, + "grad_norm": 0.013821080327033997, + "learning_rate": 9.041546748116358e-07, + "loss": 0.0013, + "step": 43862 + }, + { + "epoch": 0.87728, + "grad_norm": 0.007692767307162285, + "learning_rate": 9.035745896080017e-07, + "loss": 0.0017, + "step": 43864 + }, + { + "epoch": 0.87732, + "grad_norm": 0.05742158740758896, + "learning_rate": 9.029946817438662e-07, + "loss": 0.0012, + "step": 43866 + }, + { + "epoch": 0.87736, + "grad_norm": 0.18189416825771332, + "learning_rate": 9.024149512305302e-07, + "loss": 0.006, + "step": 43868 + }, + { + "epoch": 0.8774, + "grad_norm": 0.0026600065175443888, + "learning_rate": 9.018353980792993e-07, + "loss": 0.0001, + "step": 43870 + }, + { + "epoch": 0.87744, + "grad_norm": 0.003249053843319416, + "learning_rate": 9.012560223014721e-07, + "loss": 0.0006, + "step": 43872 + }, + { + "epoch": 0.87748, + "grad_norm": 0.002133807400241494, + "learning_rate": 9.006768239083418e-07, + "loss": 0.0004, + "step": 43874 + }, + { + "epoch": 0.87752, + "grad_norm": 0.03130290284752846, + "learning_rate": 9.00097802911204e-07, + "loss": 0.0009, + "step": 43876 + }, + { + "epoch": 0.87756, + "grad_norm": 0.013134486973285675, + "learning_rate": 8.995189593213405e-07, + "loss": 0.002, + "step": 43878 + }, + { + "epoch": 0.8776, + "grad_norm": 0.35154619812965393, + "learning_rate": 8.989402931500434e-07, + "loss": 0.0028, + "step": 43880 + }, + { + "epoch": 0.87764, + "grad_norm": 0.001090727630071342, + "learning_rate": 8.983618044085895e-07, + "loss": 0.0, + "step": 43882 + }, + { + "epoch": 0.87768, + "grad_norm": 0.0565468855202198, + "learning_rate": 8.977834931082585e-07, + "loss": 0.0007, + "step": 43884 + }, + { + "epoch": 0.87772, + "grad_norm": 0.002462368458509445, + "learning_rate": 8.972053592603259e-07, + "loss": 0.0132, + "step": 43886 + }, + { + "epoch": 0.87776, + "grad_norm": 0.002148351399227977, + "learning_rate": 8.96627402876058e-07, + "loss": 0.0002, + "step": 43888 + }, + { + "epoch": 0.8778, + "grad_norm": 0.045785900205373764, + "learning_rate": 8.960496239667282e-07, + "loss": 0.0039, + "step": 43890 + }, + { + "epoch": 0.87784, + "grad_norm": 0.0008818940259516239, + "learning_rate": 8.954720225435964e-07, + "loss": 0.0003, + "step": 43892 + }, + { + "epoch": 0.87788, + "grad_norm": 0.009096289984881878, + "learning_rate": 8.948945986179258e-07, + "loss": 0.001, + "step": 43894 + }, + { + "epoch": 0.87792, + "grad_norm": 0.14445160329341888, + "learning_rate": 8.943173522009718e-07, + "loss": 0.0014, + "step": 43896 + }, + { + "epoch": 0.87796, + "grad_norm": 0.049827273935079575, + "learning_rate": 8.937402833039909e-07, + "loss": 0.0043, + "step": 43898 + }, + { + "epoch": 0.878, + "grad_norm": 0.07038652896881104, + "learning_rate": 8.931633919382299e-07, + "loss": 0.001, + "step": 43900 + }, + { + "epoch": 0.87804, + "grad_norm": 0.14427921175956726, + "learning_rate": 8.925866781149373e-07, + "loss": 0.0018, + "step": 43902 + }, + { + "epoch": 0.87808, + "grad_norm": 24.080162048339844, + "learning_rate": 8.920101418453553e-07, + "loss": 0.4301, + "step": 43904 + }, + { + "epoch": 0.87812, + "grad_norm": 0.043128062039613724, + "learning_rate": 8.914337831407249e-07, + "loss": 0.0004, + "step": 43906 + }, + { + "epoch": 0.87816, + "grad_norm": 3.0104918479919434, + "learning_rate": 8.90857602012285e-07, + "loss": 0.032, + "step": 43908 + }, + { + "epoch": 0.8782, + "grad_norm": 0.03406841307878494, + "learning_rate": 8.902815984712621e-07, + "loss": 0.0007, + "step": 43910 + }, + { + "epoch": 0.87824, + "grad_norm": 0.4098423719406128, + "learning_rate": 8.897057725288916e-07, + "loss": 0.0061, + "step": 43912 + }, + { + "epoch": 0.87828, + "grad_norm": 0.003387621371075511, + "learning_rate": 8.891301241963968e-07, + "loss": 0.0132, + "step": 43914 + }, + { + "epoch": 0.87832, + "grad_norm": 0.07254963368177414, + "learning_rate": 8.885546534850009e-07, + "loss": 0.001, + "step": 43916 + }, + { + "epoch": 0.87836, + "grad_norm": 0.010562547482550144, + "learning_rate": 8.879793604059229e-07, + "loss": 0.0002, + "step": 43918 + }, + { + "epoch": 0.8784, + "grad_norm": 0.12296228110790253, + "learning_rate": 8.874042449703779e-07, + "loss": 0.0008, + "step": 43920 + }, + { + "epoch": 0.87844, + "grad_norm": 0.012030758894979954, + "learning_rate": 8.868293071895806e-07, + "loss": 0.0022, + "step": 43922 + }, + { + "epoch": 0.87848, + "grad_norm": 0.3588441014289856, + "learning_rate": 8.862545470747363e-07, + "loss": 0.0034, + "step": 43924 + }, + { + "epoch": 0.87852, + "grad_norm": 0.6803919076919556, + "learning_rate": 8.856799646370506e-07, + "loss": 0.0067, + "step": 43926 + }, + { + "epoch": 0.87856, + "grad_norm": 0.49041882157325745, + "learning_rate": 8.851055598877279e-07, + "loss": 0.004, + "step": 43928 + }, + { + "epoch": 0.8786, + "grad_norm": 0.017189735546708107, + "learning_rate": 8.845313328379635e-07, + "loss": 0.0012, + "step": 43930 + }, + { + "epoch": 0.87864, + "grad_norm": 0.02326061949133873, + "learning_rate": 8.839572834989551e-07, + "loss": 0.0002, + "step": 43932 + }, + { + "epoch": 0.87868, + "grad_norm": 0.04015637934207916, + "learning_rate": 8.83383411881893e-07, + "loss": 0.0019, + "step": 43934 + }, + { + "epoch": 0.87872, + "grad_norm": 0.10764462500810623, + "learning_rate": 8.828097179979644e-07, + "loss": 0.0009, + "step": 43936 + }, + { + "epoch": 0.87876, + "grad_norm": 0.03206993266940117, + "learning_rate": 8.82236201858353e-07, + "loss": 0.0004, + "step": 43938 + }, + { + "epoch": 0.8788, + "grad_norm": 0.03535008057951927, + "learning_rate": 8.816628634742441e-07, + "loss": 0.1058, + "step": 43940 + }, + { + "epoch": 0.87884, + "grad_norm": 0.027744118124246597, + "learning_rate": 8.810897028568077e-07, + "loss": 0.0005, + "step": 43942 + }, + { + "epoch": 0.87888, + "grad_norm": 0.01543590147048235, + "learning_rate": 8.80516720017226e-07, + "loss": 0.002, + "step": 43944 + }, + { + "epoch": 0.87892, + "grad_norm": 0.01027588453143835, + "learning_rate": 8.799439149666623e-07, + "loss": 0.0003, + "step": 43946 + }, + { + "epoch": 0.87896, + "grad_norm": 0.0075839608907699585, + "learning_rate": 8.793712877162908e-07, + "loss": 0.0011, + "step": 43948 + }, + { + "epoch": 0.879, + "grad_norm": 0.07463163137435913, + "learning_rate": 8.787988382772705e-07, + "loss": 0.0007, + "step": 43950 + }, + { + "epoch": 0.87904, + "grad_norm": 0.3680560290813446, + "learning_rate": 8.782265666607614e-07, + "loss": 0.0032, + "step": 43952 + }, + { + "epoch": 0.87908, + "grad_norm": 0.032967206090688705, + "learning_rate": 8.776544728779246e-07, + "loss": 0.0005, + "step": 43954 + }, + { + "epoch": 0.87912, + "grad_norm": 0.05011490732431412, + "learning_rate": 8.770825569399088e-07, + "loss": 0.0004, + "step": 43956 + }, + { + "epoch": 0.87916, + "grad_norm": 0.004631461575627327, + "learning_rate": 8.765108188578641e-07, + "loss": 0.0003, + "step": 43958 + }, + { + "epoch": 0.8792, + "grad_norm": 0.004491300787776709, + "learning_rate": 8.759392586429394e-07, + "loss": 0.0117, + "step": 43960 + }, + { + "epoch": 0.87924, + "grad_norm": 0.013296466320753098, + "learning_rate": 8.753678763062745e-07, + "loss": 0.0001, + "step": 43962 + }, + { + "epoch": 0.87928, + "grad_norm": 0.16701605916023254, + "learning_rate": 8.747966718590118e-07, + "loss": 0.0023, + "step": 43964 + }, + { + "epoch": 0.87932, + "grad_norm": 0.023298515006899834, + "learning_rate": 8.742256453122877e-07, + "loss": 0.0002, + "step": 43966 + }, + { + "epoch": 0.87936, + "grad_norm": 0.1056765541434288, + "learning_rate": 8.736547966772313e-07, + "loss": 0.0157, + "step": 43968 + }, + { + "epoch": 0.8794, + "grad_norm": 0.0140609759837389, + "learning_rate": 8.730841259649725e-07, + "loss": 0.0007, + "step": 43970 + }, + { + "epoch": 0.87944, + "grad_norm": 0.03261836618185043, + "learning_rate": 8.725136331866379e-07, + "loss": 0.0015, + "step": 43972 + }, + { + "epoch": 0.87948, + "grad_norm": 0.20591101050376892, + "learning_rate": 8.719433183533488e-07, + "loss": 0.002, + "step": 43974 + }, + { + "epoch": 0.87952, + "grad_norm": 0.014566759578883648, + "learning_rate": 8.713731814762261e-07, + "loss": 0.0001, + "step": 43976 + }, + { + "epoch": 0.87956, + "grad_norm": 0.4276575446128845, + "learning_rate": 8.708032225663798e-07, + "loss": 0.1663, + "step": 43978 + }, + { + "epoch": 0.8796, + "grad_norm": 0.3285498321056366, + "learning_rate": 8.702334416349279e-07, + "loss": 0.0024, + "step": 43980 + }, + { + "epoch": 0.87964, + "grad_norm": 1.700506329536438, + "learning_rate": 8.696638386929734e-07, + "loss": 0.0212, + "step": 43982 + }, + { + "epoch": 0.87968, + "grad_norm": 0.015146106481552124, + "learning_rate": 8.690944137516233e-07, + "loss": 0.0007, + "step": 43984 + }, + { + "epoch": 0.87972, + "grad_norm": 0.17281201481819153, + "learning_rate": 8.685251668219785e-07, + "loss": 0.0015, + "step": 43986 + }, + { + "epoch": 0.87976, + "grad_norm": 0.19912032783031464, + "learning_rate": 8.679560979151391e-07, + "loss": 0.0057, + "step": 43988 + }, + { + "epoch": 0.8798, + "grad_norm": 0.022775761783123016, + "learning_rate": 8.67387207042194e-07, + "loss": 0.0004, + "step": 43990 + }, + { + "epoch": 0.87984, + "grad_norm": 0.03035920299589634, + "learning_rate": 8.668184942142388e-07, + "loss": 0.0003, + "step": 43992 + }, + { + "epoch": 0.87988, + "grad_norm": 1.3068311214447021, + "learning_rate": 8.662499594423579e-07, + "loss": 0.0105, + "step": 43994 + }, + { + "epoch": 0.87992, + "grad_norm": 11.3201322555542, + "learning_rate": 8.656816027376369e-07, + "loss": 0.1502, + "step": 43996 + }, + { + "epoch": 0.87996, + "grad_norm": 0.20347654819488525, + "learning_rate": 8.65113424111157e-07, + "loss": 0.0017, + "step": 43998 + }, + { + "epoch": 0.88, + "grad_norm": 0.11725126951932907, + "learning_rate": 8.645454235739903e-07, + "loss": 0.0019, + "step": 44000 + }, + { + "epoch": 0.88004, + "grad_norm": 1.673608422279358, + "learning_rate": 8.63977601137218e-07, + "loss": 0.0144, + "step": 44002 + }, + { + "epoch": 0.88008, + "grad_norm": 0.012434090487658978, + "learning_rate": 8.634099568119036e-07, + "loss": 0.0005, + "step": 44004 + }, + { + "epoch": 0.88012, + "grad_norm": 0.08216436952352524, + "learning_rate": 8.62842490609116e-07, + "loss": 0.0029, + "step": 44006 + }, + { + "epoch": 0.88016, + "grad_norm": 0.01447528786957264, + "learning_rate": 8.622752025399195e-07, + "loss": 0.0001, + "step": 44008 + }, + { + "epoch": 0.8802, + "grad_norm": 0.028773613274097443, + "learning_rate": 8.617080926153698e-07, + "loss": 0.0004, + "step": 44010 + }, + { + "epoch": 0.88024, + "grad_norm": 0.05687430500984192, + "learning_rate": 8.611411608465281e-07, + "loss": 0.0013, + "step": 44012 + }, + { + "epoch": 0.88028, + "grad_norm": 4.625046253204346, + "learning_rate": 8.605744072444433e-07, + "loss": 0.0477, + "step": 44014 + }, + { + "epoch": 0.88032, + "grad_norm": 0.03611121326684952, + "learning_rate": 8.600078318201654e-07, + "loss": 0.0019, + "step": 44016 + }, + { + "epoch": 0.88036, + "grad_norm": 0.04618369787931442, + "learning_rate": 8.594414345847413e-07, + "loss": 0.0023, + "step": 44018 + }, + { + "epoch": 0.8804, + "grad_norm": 0.3966462016105652, + "learning_rate": 8.58875215549212e-07, + "loss": 0.0032, + "step": 44020 + }, + { + "epoch": 0.88044, + "grad_norm": 0.1454640030860901, + "learning_rate": 8.583091747246175e-07, + "loss": 0.0022, + "step": 44022 + }, + { + "epoch": 0.88048, + "grad_norm": 0.3208531439304352, + "learning_rate": 8.577433121219903e-07, + "loss": 0.0043, + "step": 44024 + }, + { + "epoch": 0.88052, + "grad_norm": 0.12616096436977386, + "learning_rate": 8.571776277523647e-07, + "loss": 0.0016, + "step": 44026 + }, + { + "epoch": 0.88056, + "grad_norm": 0.10260511934757233, + "learning_rate": 8.566121216267686e-07, + "loss": 0.0015, + "step": 44028 + }, + { + "epoch": 0.8806, + "grad_norm": 0.04639606922864914, + "learning_rate": 8.560467937562278e-07, + "loss": 0.0006, + "step": 44030 + }, + { + "epoch": 0.88064, + "grad_norm": 0.0005647437064908445, + "learning_rate": 8.554816441517588e-07, + "loss": 0.0005, + "step": 44032 + }, + { + "epoch": 0.88068, + "grad_norm": 0.012398971244692802, + "learning_rate": 8.549166728243863e-07, + "loss": 0.0239, + "step": 44034 + }, + { + "epoch": 0.88072, + "grad_norm": 0.046162933111190796, + "learning_rate": 8.543518797851202e-07, + "loss": 0.001, + "step": 44036 + }, + { + "epoch": 0.88076, + "grad_norm": 0.02945714257657528, + "learning_rate": 8.537872650449719e-07, + "loss": 0.0006, + "step": 44038 + }, + { + "epoch": 0.8808, + "grad_norm": 0.0954863578081131, + "learning_rate": 8.532228286149502e-07, + "loss": 0.0008, + "step": 44040 + }, + { + "epoch": 0.88084, + "grad_norm": 0.013529340736567974, + "learning_rate": 8.526585705060586e-07, + "loss": 0.0011, + "step": 44042 + }, + { + "epoch": 0.88088, + "grad_norm": 0.01588556356728077, + "learning_rate": 8.520944907292994e-07, + "loss": 0.0005, + "step": 44044 + }, + { + "epoch": 0.88092, + "grad_norm": 0.14221398532390594, + "learning_rate": 8.51530589295666e-07, + "loss": 0.0011, + "step": 44046 + }, + { + "epoch": 0.88096, + "grad_norm": 0.00941754225641489, + "learning_rate": 8.509668662161541e-07, + "loss": 0.0007, + "step": 44048 + }, + { + "epoch": 0.881, + "grad_norm": 0.023886719718575478, + "learning_rate": 8.504033215017527e-07, + "loss": 0.0003, + "step": 44050 + }, + { + "epoch": 0.88104, + "grad_norm": 0.0069595458917319775, + "learning_rate": 8.498399551634484e-07, + "loss": 0.0012, + "step": 44052 + }, + { + "epoch": 0.88108, + "grad_norm": 0.018262702971696854, + "learning_rate": 8.49276767212226e-07, + "loss": 0.0002, + "step": 44054 + }, + { + "epoch": 0.88112, + "grad_norm": 0.41678252816200256, + "learning_rate": 8.487137576590665e-07, + "loss": 0.0058, + "step": 44056 + }, + { + "epoch": 0.88116, + "grad_norm": 0.049046002328395844, + "learning_rate": 8.481509265149412e-07, + "loss": 0.0011, + "step": 44058 + }, + { + "epoch": 0.8812, + "grad_norm": 0.029741620644927025, + "learning_rate": 8.475882737908248e-07, + "loss": 0.0006, + "step": 44060 + }, + { + "epoch": 0.88124, + "grad_norm": 0.004407032392919064, + "learning_rate": 8.470257994976893e-07, + "loss": 0.0027, + "step": 44062 + }, + { + "epoch": 0.88128, + "grad_norm": 12.165179252624512, + "learning_rate": 8.464635036464941e-07, + "loss": 0.1307, + "step": 44064 + }, + { + "epoch": 0.88132, + "grad_norm": 0.0952974259853363, + "learning_rate": 8.459013862482091e-07, + "loss": 0.0012, + "step": 44066 + }, + { + "epoch": 0.88136, + "grad_norm": 8.410630226135254, + "learning_rate": 8.453394473137866e-07, + "loss": 0.0834, + "step": 44068 + }, + { + "epoch": 0.8814, + "grad_norm": 0.010035376995801926, + "learning_rate": 8.447776868541879e-07, + "loss": 0.0007, + "step": 44070 + }, + { + "epoch": 0.88144, + "grad_norm": 0.005918523296713829, + "learning_rate": 8.442161048803599e-07, + "loss": 0.0001, + "step": 44072 + }, + { + "epoch": 0.88148, + "grad_norm": 0.04650877043604851, + "learning_rate": 8.436547014032526e-07, + "loss": 0.0016, + "step": 44074 + }, + { + "epoch": 0.88152, + "grad_norm": 0.11270258575677872, + "learning_rate": 8.430934764338117e-07, + "loss": 0.0009, + "step": 44076 + }, + { + "epoch": 0.88156, + "grad_norm": 3.2798619270324707, + "learning_rate": 8.425324299829774e-07, + "loss": 0.0212, + "step": 44078 + }, + { + "epoch": 0.8816, + "grad_norm": 2.328361749649048, + "learning_rate": 8.419715620616875e-07, + "loss": 0.019, + "step": 44080 + }, + { + "epoch": 0.88164, + "grad_norm": 0.007445495575666428, + "learning_rate": 8.414108726808767e-07, + "loss": 0.0001, + "step": 44082 + }, + { + "epoch": 0.88168, + "grad_norm": 0.0003443591413088143, + "learning_rate": 8.408503618514763e-07, + "loss": 0.0008, + "step": 44084 + }, + { + "epoch": 0.88172, + "grad_norm": 0.0017418160568922758, + "learning_rate": 8.40290029584413e-07, + "loss": 0.0035, + "step": 44086 + }, + { + "epoch": 0.88176, + "grad_norm": 23.476764678955078, + "learning_rate": 8.397298758906136e-07, + "loss": 1.0436, + "step": 44088 + }, + { + "epoch": 0.8818, + "grad_norm": 0.0021496466360986233, + "learning_rate": 8.39169900780995e-07, + "loss": 0.0082, + "step": 44090 + }, + { + "epoch": 0.88184, + "grad_norm": 0.7872788906097412, + "learning_rate": 8.38610104266474e-07, + "loss": 0.0072, + "step": 44092 + }, + { + "epoch": 0.88188, + "grad_norm": 0.26327526569366455, + "learning_rate": 8.380504863579685e-07, + "loss": 0.0045, + "step": 44094 + }, + { + "epoch": 0.88192, + "grad_norm": 0.04940564185380936, + "learning_rate": 8.374910470663821e-07, + "loss": 0.0014, + "step": 44096 + }, + { + "epoch": 0.88196, + "grad_norm": 0.0469152070581913, + "learning_rate": 8.369317864026283e-07, + "loss": 0.0005, + "step": 44098 + }, + { + "epoch": 0.882, + "grad_norm": 0.1691005676984787, + "learning_rate": 8.363727043776037e-07, + "loss": 0.0017, + "step": 44100 + }, + { + "epoch": 0.88204, + "grad_norm": 0.06829790025949478, + "learning_rate": 8.358138010022132e-07, + "loss": 0.0098, + "step": 44102 + }, + { + "epoch": 0.88208, + "grad_norm": 0.045613303780555725, + "learning_rate": 8.352550762873502e-07, + "loss": 0.0006, + "step": 44104 + }, + { + "epoch": 0.88212, + "grad_norm": 0.01838703081011772, + "learning_rate": 8.34696530243907e-07, + "loss": 0.019, + "step": 44106 + }, + { + "epoch": 0.88216, + "grad_norm": 0.43290138244628906, + "learning_rate": 8.341381628827761e-07, + "loss": 0.0032, + "step": 44108 + }, + { + "epoch": 0.8822, + "grad_norm": 0.18236400187015533, + "learning_rate": 8.335799742148387e-07, + "loss": 0.004, + "step": 44110 + }, + { + "epoch": 0.88224, + "grad_norm": 0.004834628663957119, + "learning_rate": 8.330219642509785e-07, + "loss": 0.0, + "step": 44112 + }, + { + "epoch": 0.88228, + "grad_norm": 0.41952958703041077, + "learning_rate": 8.324641330020744e-07, + "loss": 0.0045, + "step": 44114 + }, + { + "epoch": 0.88232, + "grad_norm": 0.011648561805486679, + "learning_rate": 8.319064804790022e-07, + "loss": 0.0006, + "step": 44116 + }, + { + "epoch": 0.88236, + "grad_norm": 0.15275880694389343, + "learning_rate": 8.313490066926333e-07, + "loss": 0.0019, + "step": 44118 + }, + { + "epoch": 0.8824, + "grad_norm": 0.01711440458893776, + "learning_rate": 8.307917116538378e-07, + "loss": 0.0003, + "step": 44120 + }, + { + "epoch": 0.88244, + "grad_norm": 0.10171779245138168, + "learning_rate": 8.302345953734748e-07, + "loss": 0.0031, + "step": 44122 + }, + { + "epoch": 0.88248, + "grad_norm": 0.09623062610626221, + "learning_rate": 8.296776578624121e-07, + "loss": 0.0008, + "step": 44124 + }, + { + "epoch": 0.88252, + "grad_norm": 0.0912865698337555, + "learning_rate": 8.291208991315036e-07, + "loss": 0.0017, + "step": 44126 + }, + { + "epoch": 0.88256, + "grad_norm": 0.012225314043462276, + "learning_rate": 8.285643191916048e-07, + "loss": 0.0012, + "step": 44128 + }, + { + "epoch": 0.8826, + "grad_norm": 0.018541166558861732, + "learning_rate": 8.280079180535672e-07, + "loss": 0.0008, + "step": 44130 + }, + { + "epoch": 0.88264, + "grad_norm": 0.22009655833244324, + "learning_rate": 8.274516957282352e-07, + "loss": 0.002, + "step": 44132 + }, + { + "epoch": 0.88268, + "grad_norm": 0.10726459324359894, + "learning_rate": 8.268956522264571e-07, + "loss": 0.0034, + "step": 44134 + }, + { + "epoch": 0.88272, + "grad_norm": 0.007256061304360628, + "learning_rate": 8.263397875590695e-07, + "loss": 0.0001, + "step": 44136 + }, + { + "epoch": 0.88276, + "grad_norm": 0.0006913837278261781, + "learning_rate": 8.257841017369106e-07, + "loss": 0.0002, + "step": 44138 + }, + { + "epoch": 0.8828, + "grad_norm": 0.23605796694755554, + "learning_rate": 8.252285947708139e-07, + "loss": 0.0024, + "step": 44140 + }, + { + "epoch": 0.88284, + "grad_norm": 0.13695186376571655, + "learning_rate": 8.246732666716095e-07, + "loss": 0.0013, + "step": 44142 + }, + { + "epoch": 0.88288, + "grad_norm": 0.24069496989250183, + "learning_rate": 8.241181174501245e-07, + "loss": 0.0058, + "step": 44144 + }, + { + "epoch": 0.88292, + "grad_norm": 0.02688758261501789, + "learning_rate": 8.235631471171801e-07, + "loss": 0.0005, + "step": 44146 + }, + { + "epoch": 0.88296, + "grad_norm": 0.9352176785469055, + "learning_rate": 8.230083556835955e-07, + "loss": 0.417, + "step": 44148 + }, + { + "epoch": 0.883, + "grad_norm": 0.23829235136508942, + "learning_rate": 8.224537431601886e-07, + "loss": 0.0029, + "step": 44150 + }, + { + "epoch": 0.88304, + "grad_norm": 0.04001845791935921, + "learning_rate": 8.218993095577721e-07, + "loss": 0.0015, + "step": 44152 + }, + { + "epoch": 0.88308, + "grad_norm": 0.015104013495147228, + "learning_rate": 8.213450548871505e-07, + "loss": 0.0003, + "step": 44154 + }, + { + "epoch": 0.88312, + "grad_norm": 0.01891651749610901, + "learning_rate": 8.207909791591361e-07, + "loss": 0.0013, + "step": 44156 + }, + { + "epoch": 0.88316, + "grad_norm": 0.014320656657218933, + "learning_rate": 8.202370823845252e-07, + "loss": 0.0004, + "step": 44158 + }, + { + "epoch": 0.8832, + "grad_norm": 0.003776633646339178, + "learning_rate": 8.196833645741187e-07, + "loss": 0.0024, + "step": 44160 + }, + { + "epoch": 0.88324, + "grad_norm": 0.029095306992530823, + "learning_rate": 8.191298257387137e-07, + "loss": 0.0021, + "step": 44162 + }, + { + "epoch": 0.88328, + "grad_norm": 0.024976594373583794, + "learning_rate": 8.185764658890949e-07, + "loss": 0.0026, + "step": 44164 + }, + { + "epoch": 0.88332, + "grad_norm": 0.016292531043291092, + "learning_rate": 8.180232850360581e-07, + "loss": 0.0028, + "step": 44166 + }, + { + "epoch": 0.88336, + "grad_norm": 0.020692916586995125, + "learning_rate": 8.174702831903836e-07, + "loss": 0.001, + "step": 44168 + }, + { + "epoch": 0.8834, + "grad_norm": 0.06869280338287354, + "learning_rate": 8.169174603628538e-07, + "loss": 0.0009, + "step": 44170 + }, + { + "epoch": 0.88344, + "grad_norm": 0.004498591180890799, + "learning_rate": 8.163648165642446e-07, + "loss": 0.0003, + "step": 44172 + }, + { + "epoch": 0.88348, + "grad_norm": 0.26791441440582275, + "learning_rate": 8.158123518053318e-07, + "loss": 0.004, + "step": 44174 + }, + { + "epoch": 0.88352, + "grad_norm": 24.910640716552734, + "learning_rate": 8.152600660968879e-07, + "loss": 0.4494, + "step": 44176 + }, + { + "epoch": 0.88356, + "grad_norm": 0.0007857005693949759, + "learning_rate": 8.147079594496754e-07, + "loss": 0.0581, + "step": 44178 + }, + { + "epoch": 0.8836, + "grad_norm": 0.23494870960712433, + "learning_rate": 8.141560318744601e-07, + "loss": 0.003, + "step": 44180 + }, + { + "epoch": 0.88364, + "grad_norm": 0.0824824795126915, + "learning_rate": 8.136042833820023e-07, + "loss": 0.0017, + "step": 44182 + }, + { + "epoch": 0.88368, + "grad_norm": 0.00029837817419320345, + "learning_rate": 8.130527139830602e-07, + "loss": 0.1897, + "step": 44184 + }, + { + "epoch": 0.88372, + "grad_norm": 0.037946078926324844, + "learning_rate": 8.125013236883816e-07, + "loss": 0.3366, + "step": 44186 + }, + { + "epoch": 0.88376, + "grad_norm": 0.02193094789981842, + "learning_rate": 8.119501125087236e-07, + "loss": 0.0002, + "step": 44188 + }, + { + "epoch": 0.8838, + "grad_norm": 0.019084513187408447, + "learning_rate": 8.113990804548244e-07, + "loss": 0.0016, + "step": 44190 + }, + { + "epoch": 0.88384, + "grad_norm": 0.021706491708755493, + "learning_rate": 8.108482275374352e-07, + "loss": 0.0001, + "step": 44192 + }, + { + "epoch": 0.88388, + "grad_norm": 0.15022088587284088, + "learning_rate": 8.102975537672886e-07, + "loss": 0.0015, + "step": 44194 + }, + { + "epoch": 0.88392, + "grad_norm": 0.1409820169210434, + "learning_rate": 8.097470591551216e-07, + "loss": 0.0433, + "step": 44196 + }, + { + "epoch": 0.88396, + "grad_norm": 0.1996009796857834, + "learning_rate": 8.091967437116699e-07, + "loss": 0.002, + "step": 44198 + }, + { + "epoch": 0.884, + "grad_norm": 0.01514735072851181, + "learning_rate": 8.086466074476562e-07, + "loss": 0.0018, + "step": 44200 + }, + { + "epoch": 0.88404, + "grad_norm": 0.035133108496665955, + "learning_rate": 8.080966503738108e-07, + "loss": 0.0009, + "step": 44202 + }, + { + "epoch": 0.88408, + "grad_norm": 0.2686457335948944, + "learning_rate": 8.075468725008517e-07, + "loss": 0.0032, + "step": 44204 + }, + { + "epoch": 0.88412, + "grad_norm": 0.47768834233283997, + "learning_rate": 8.069972738395004e-07, + "loss": 0.0049, + "step": 44206 + }, + { + "epoch": 0.88416, + "grad_norm": 0.04189668968319893, + "learning_rate": 8.064478544004694e-07, + "loss": 0.0018, + "step": 44208 + }, + { + "epoch": 0.8842, + "grad_norm": 0.055192165076732635, + "learning_rate": 8.058986141944724e-07, + "loss": 0.0086, + "step": 44210 + }, + { + "epoch": 0.88424, + "grad_norm": 0.022885063663125038, + "learning_rate": 8.053495532322142e-07, + "loss": 0.0018, + "step": 44212 + }, + { + "epoch": 0.88428, + "grad_norm": 0.5195564031600952, + "learning_rate": 8.048006715243995e-07, + "loss": 0.007, + "step": 44214 + }, + { + "epoch": 0.88432, + "grad_norm": 0.02961321361362934, + "learning_rate": 8.042519690817319e-07, + "loss": 0.0012, + "step": 44216 + }, + { + "epoch": 0.88436, + "grad_norm": 0.0013349974760785699, + "learning_rate": 8.037034459149018e-07, + "loss": 0.0336, + "step": 44218 + }, + { + "epoch": 0.8844, + "grad_norm": 0.13566432893276215, + "learning_rate": 8.031551020346129e-07, + "loss": 0.0013, + "step": 44220 + }, + { + "epoch": 0.88444, + "grad_norm": 0.012146239168941975, + "learning_rate": 8.026069374515454e-07, + "loss": 0.0005, + "step": 44222 + }, + { + "epoch": 0.88448, + "grad_norm": 0.008905529975891113, + "learning_rate": 8.020589521763944e-07, + "loss": 0.0873, + "step": 44224 + }, + { + "epoch": 0.88452, + "grad_norm": 0.05445248261094093, + "learning_rate": 8.015111462198377e-07, + "loss": 0.0011, + "step": 44226 + }, + { + "epoch": 0.88456, + "grad_norm": 0.012835894711315632, + "learning_rate": 8.00963519592558e-07, + "loss": 0.0004, + "step": 44228 + }, + { + "epoch": 0.8846, + "grad_norm": 0.21553529798984528, + "learning_rate": 8.004160723052312e-07, + "loss": 0.0017, + "step": 44230 + }, + { + "epoch": 0.88464, + "grad_norm": 4.774372100830078, + "learning_rate": 7.998688043685254e-07, + "loss": 0.048, + "step": 44232 + }, + { + "epoch": 0.88468, + "grad_norm": 0.028656408190727234, + "learning_rate": 7.993217157931188e-07, + "loss": 0.0003, + "step": 44234 + }, + { + "epoch": 0.88472, + "grad_norm": 0.06402518600225449, + "learning_rate": 7.987748065896695e-07, + "loss": 0.0018, + "step": 44236 + }, + { + "epoch": 0.88476, + "grad_norm": 0.10101655125617981, + "learning_rate": 7.982280767688422e-07, + "loss": 0.0009, + "step": 44238 + }, + { + "epoch": 0.8848, + "grad_norm": 0.03966158255934715, + "learning_rate": 7.976815263412963e-07, + "loss": 0.0004, + "step": 44240 + }, + { + "epoch": 0.88484, + "grad_norm": 0.03569132834672928, + "learning_rate": 7.971351553176887e-07, + "loss": 0.0003, + "step": 44242 + }, + { + "epoch": 0.88488, + "grad_norm": 0.0756651982665062, + "learning_rate": 7.965889637086677e-07, + "loss": 0.0011, + "step": 44244 + }, + { + "epoch": 0.88492, + "grad_norm": 0.004823034163564444, + "learning_rate": 7.960429515248824e-07, + "loss": 0.0015, + "step": 44246 + }, + { + "epoch": 0.88496, + "grad_norm": 0.040613844990730286, + "learning_rate": 7.954971187769778e-07, + "loss": 0.0014, + "step": 44248 + }, + { + "epoch": 0.885, + "grad_norm": 0.08572753518819809, + "learning_rate": 7.949514654755963e-07, + "loss": 0.0006, + "step": 44250 + }, + { + "epoch": 0.88504, + "grad_norm": 0.04737156257033348, + "learning_rate": 7.944059916313773e-07, + "loss": 0.0005, + "step": 44252 + }, + { + "epoch": 0.88508, + "grad_norm": 0.005594321060925722, + "learning_rate": 7.93860697254949e-07, + "loss": 0.0001, + "step": 44254 + }, + { + "epoch": 0.88512, + "grad_norm": 0.005134099628776312, + "learning_rate": 7.933155823569494e-07, + "loss": 0.0082, + "step": 44256 + }, + { + "epoch": 0.88516, + "grad_norm": 0.03342567756772041, + "learning_rate": 7.927706469480012e-07, + "loss": 0.0003, + "step": 44258 + }, + { + "epoch": 0.8852, + "grad_norm": 0.021733582019805908, + "learning_rate": 7.922258910387282e-07, + "loss": 0.0003, + "step": 44260 + }, + { + "epoch": 0.88524, + "grad_norm": 0.03137430548667908, + "learning_rate": 7.916813146397528e-07, + "loss": 0.0009, + "step": 44262 + }, + { + "epoch": 0.88528, + "grad_norm": 0.13791614770889282, + "learning_rate": 7.911369177616912e-07, + "loss": 0.0013, + "step": 44264 + }, + { + "epoch": 0.88532, + "grad_norm": 0.9400132298469543, + "learning_rate": 7.905927004151582e-07, + "loss": 0.0078, + "step": 44266 + }, + { + "epoch": 0.88536, + "grad_norm": 0.05915524810552597, + "learning_rate": 7.900486626107595e-07, + "loss": 0.0007, + "step": 44268 + }, + { + "epoch": 0.8854, + "grad_norm": 0.2278469353914261, + "learning_rate": 7.895048043591036e-07, + "loss": 0.0023, + "step": 44270 + }, + { + "epoch": 0.88544, + "grad_norm": 0.002130495849996805, + "learning_rate": 7.889611256707941e-07, + "loss": 0.0, + "step": 44272 + }, + { + "epoch": 0.88548, + "grad_norm": 0.042879413813352585, + "learning_rate": 7.884176265564314e-07, + "loss": 0.0006, + "step": 44274 + }, + { + "epoch": 0.88552, + "grad_norm": 0.24809418618679047, + "learning_rate": 7.878743070266049e-07, + "loss": 0.0021, + "step": 44276 + }, + { + "epoch": 0.88556, + "grad_norm": 0.06288658082485199, + "learning_rate": 7.873311670919159e-07, + "loss": 0.0006, + "step": 44278 + }, + { + "epoch": 0.8856, + "grad_norm": 0.14328943192958832, + "learning_rate": 7.867882067629473e-07, + "loss": 0.002, + "step": 44280 + }, + { + "epoch": 0.88564, + "grad_norm": 0.12066241353750229, + "learning_rate": 7.86245426050285e-07, + "loss": 0.0096, + "step": 44282 + }, + { + "epoch": 0.88568, + "grad_norm": 0.023391885682940483, + "learning_rate": 7.857028249645138e-07, + "loss": 0.0005, + "step": 44284 + }, + { + "epoch": 0.88572, + "grad_norm": 0.005522120278328657, + "learning_rate": 7.851604035162064e-07, + "loss": 0.0002, + "step": 44286 + }, + { + "epoch": 0.88576, + "grad_norm": 0.0017887179274111986, + "learning_rate": 7.846181617159454e-07, + "loss": 0.0024, + "step": 44288 + }, + { + "epoch": 0.8858, + "grad_norm": 0.03213481232523918, + "learning_rate": 7.840760995742946e-07, + "loss": 0.0006, + "step": 44290 + }, + { + "epoch": 0.88584, + "grad_norm": 0.012821538373827934, + "learning_rate": 7.835342171018257e-07, + "loss": 0.0003, + "step": 44292 + }, + { + "epoch": 0.88588, + "grad_norm": 0.01965339481830597, + "learning_rate": 7.829925143091021e-07, + "loss": 0.0006, + "step": 44294 + }, + { + "epoch": 0.88592, + "grad_norm": 0.04180469363927841, + "learning_rate": 7.824509912066846e-07, + "loss": 0.0004, + "step": 44296 + }, + { + "epoch": 0.88596, + "grad_norm": 0.032519519329071045, + "learning_rate": 7.819096478051325e-07, + "loss": 0.0002, + "step": 44298 + }, + { + "epoch": 0.886, + "grad_norm": 0.018336454406380653, + "learning_rate": 7.81368484114996e-07, + "loss": 0.002, + "step": 44300 + }, + { + "epoch": 0.88604, + "grad_norm": 0.03582429140806198, + "learning_rate": 7.808275001468258e-07, + "loss": 0.0008, + "step": 44302 + }, + { + "epoch": 0.88608, + "grad_norm": 0.011338025331497192, + "learning_rate": 7.80286695911171e-07, + "loss": 0.0001, + "step": 44304 + }, + { + "epoch": 0.88612, + "grad_norm": 0.18375985324382782, + "learning_rate": 7.797460714185756e-07, + "loss": 0.0021, + "step": 44306 + }, + { + "epoch": 0.88616, + "grad_norm": 0.01937943324446678, + "learning_rate": 7.792056266795733e-07, + "loss": 0.0338, + "step": 44308 + }, + { + "epoch": 0.8862, + "grad_norm": 0.014048554003238678, + "learning_rate": 7.78665361704708e-07, + "loss": 0.0001, + "step": 44310 + }, + { + "epoch": 0.88624, + "grad_norm": 0.025232290849089622, + "learning_rate": 7.781252765045078e-07, + "loss": 0.0084, + "step": 44312 + }, + { + "epoch": 0.88628, + "grad_norm": 0.04953666403889656, + "learning_rate": 7.77585371089502e-07, + "loss": 0.0007, + "step": 44314 + }, + { + "epoch": 0.88632, + "grad_norm": 0.09789388626813889, + "learning_rate": 7.77045645470218e-07, + "loss": 0.0006, + "step": 44316 + }, + { + "epoch": 0.88636, + "grad_norm": 0.02883659489452839, + "learning_rate": 7.765060996571772e-07, + "loss": 0.0004, + "step": 44318 + }, + { + "epoch": 0.8864, + "grad_norm": 0.018273115158081055, + "learning_rate": 7.759667336609011e-07, + "loss": 0.0006, + "step": 44320 + }, + { + "epoch": 0.88644, + "grad_norm": 3.9756312370300293, + "learning_rate": 7.754275474919004e-07, + "loss": 0.0272, + "step": 44322 + }, + { + "epoch": 0.88648, + "grad_norm": 0.15607450902462006, + "learning_rate": 7.748885411606877e-07, + "loss": 0.0025, + "step": 44324 + }, + { + "epoch": 0.88652, + "grad_norm": 0.019607357680797577, + "learning_rate": 7.743497146777734e-07, + "loss": 0.0007, + "step": 44326 + }, + { + "epoch": 0.88656, + "grad_norm": 0.23837536573410034, + "learning_rate": 7.738110680536603e-07, + "loss": 0.0024, + "step": 44328 + }, + { + "epoch": 0.8866, + "grad_norm": 0.072823666036129, + "learning_rate": 7.732726012988512e-07, + "loss": 0.0022, + "step": 44330 + }, + { + "epoch": 0.88664, + "grad_norm": 1.7570098638534546, + "learning_rate": 7.727343144238442e-07, + "loss": 0.0152, + "step": 44332 + }, + { + "epoch": 0.88668, + "grad_norm": 0.030986730009317398, + "learning_rate": 7.721962074391309e-07, + "loss": 0.0021, + "step": 44334 + }, + { + "epoch": 0.88672, + "grad_norm": 0.4972400665283203, + "learning_rate": 7.716582803552031e-07, + "loss": 0.0057, + "step": 44336 + }, + { + "epoch": 0.88676, + "grad_norm": 0.03160589188337326, + "learning_rate": 7.711205331825489e-07, + "loss": 0.0004, + "step": 44338 + }, + { + "epoch": 0.8868, + "grad_norm": 0.26770561933517456, + "learning_rate": 7.7058296593165e-07, + "loss": 0.0026, + "step": 44340 + }, + { + "epoch": 0.88684, + "grad_norm": 16.838592529296875, + "learning_rate": 7.700455786129901e-07, + "loss": 0.3738, + "step": 44342 + }, + { + "epoch": 0.88688, + "grad_norm": 0.02585156261920929, + "learning_rate": 7.695083712370399e-07, + "loss": 0.0004, + "step": 44344 + }, + { + "epoch": 0.88692, + "grad_norm": 0.016926441341638565, + "learning_rate": 7.689713438142799e-07, + "loss": 0.0011, + "step": 44346 + }, + { + "epoch": 0.88696, + "grad_norm": 0.19917736947536469, + "learning_rate": 7.684344963551749e-07, + "loss": 0.0029, + "step": 44348 + }, + { + "epoch": 0.887, + "grad_norm": 0.17762550711631775, + "learning_rate": 7.678978288701911e-07, + "loss": 0.002, + "step": 44350 + }, + { + "epoch": 0.88704, + "grad_norm": 0.023172149434685707, + "learning_rate": 7.673613413697945e-07, + "loss": 0.0005, + "step": 44352 + }, + { + "epoch": 0.88708, + "grad_norm": 0.1453837752342224, + "learning_rate": 7.668250338644379e-07, + "loss": 0.0017, + "step": 44354 + }, + { + "epoch": 0.88712, + "grad_norm": 0.04400624334812164, + "learning_rate": 7.66288906364584e-07, + "loss": 0.0005, + "step": 44356 + }, + { + "epoch": 0.88716, + "grad_norm": 0.03779996931552887, + "learning_rate": 7.657529588806811e-07, + "loss": 0.0003, + "step": 44358 + }, + { + "epoch": 0.8872, + "grad_norm": 0.07266335189342499, + "learning_rate": 7.652171914231777e-07, + "loss": 0.4498, + "step": 44360 + }, + { + "epoch": 0.88724, + "grad_norm": 0.07583773881196976, + "learning_rate": 7.646816040025195e-07, + "loss": 0.0011, + "step": 44362 + }, + { + "epoch": 0.88728, + "grad_norm": 0.14378178119659424, + "learning_rate": 7.641461966291497e-07, + "loss": 0.0037, + "step": 44364 + }, + { + "epoch": 0.88732, + "grad_norm": 0.01375820953398943, + "learning_rate": 7.63610969313503e-07, + "loss": 0.0009, + "step": 44366 + }, + { + "epoch": 0.88736, + "grad_norm": 0.004139971453696489, + "learning_rate": 7.630759220660156e-07, + "loss": 0.0001, + "step": 44368 + }, + { + "epoch": 0.8874, + "grad_norm": 0.04113760590553284, + "learning_rate": 7.62541054897119e-07, + "loss": 0.0008, + "step": 44370 + }, + { + "epoch": 0.88744, + "grad_norm": 0.008197426795959473, + "learning_rate": 7.620063678172407e-07, + "loss": 0.0002, + "step": 44372 + }, + { + "epoch": 0.88748, + "grad_norm": 0.2672041952610016, + "learning_rate": 7.614718608368055e-07, + "loss": 0.0018, + "step": 44374 + }, + { + "epoch": 0.88752, + "grad_norm": 0.0052734240889549255, + "learning_rate": 7.609375339662284e-07, + "loss": 0.0132, + "step": 44376 + }, + { + "epoch": 0.88756, + "grad_norm": 0.0319330096244812, + "learning_rate": 7.604033872159355e-07, + "loss": 0.0004, + "step": 44378 + }, + { + "epoch": 0.8876, + "grad_norm": 1.7030187845230103, + "learning_rate": 7.598694205963331e-07, + "loss": 0.0214, + "step": 44380 + }, + { + "epoch": 0.88764, + "grad_norm": 0.011135113425552845, + "learning_rate": 7.593356341178337e-07, + "loss": 0.0004, + "step": 44382 + }, + { + "epoch": 0.88768, + "grad_norm": 0.15149357914924622, + "learning_rate": 7.588020277908426e-07, + "loss": 0.0013, + "step": 44384 + }, + { + "epoch": 0.88772, + "grad_norm": 0.4272399842739105, + "learning_rate": 7.582686016257646e-07, + "loss": 0.0035, + "step": 44386 + }, + { + "epoch": 0.88776, + "grad_norm": 0.006015623454004526, + "learning_rate": 7.577353556330003e-07, + "loss": 0.0022, + "step": 44388 + }, + { + "epoch": 0.8878, + "grad_norm": 0.23081375658512115, + "learning_rate": 7.572022898229403e-07, + "loss": 0.0024, + "step": 44390 + }, + { + "epoch": 0.88784, + "grad_norm": 0.9417335987091064, + "learning_rate": 7.566694042059808e-07, + "loss": 0.0079, + "step": 44392 + }, + { + "epoch": 0.88788, + "grad_norm": 2.1708922386169434, + "learning_rate": 7.561366987925112e-07, + "loss": 0.0243, + "step": 44394 + }, + { + "epoch": 0.88792, + "grad_norm": 0.4211347699165344, + "learning_rate": 7.556041735929165e-07, + "loss": 0.0048, + "step": 44396 + }, + { + "epoch": 0.88796, + "grad_norm": 0.006833972875028849, + "learning_rate": 7.55071828617574e-07, + "loss": 0.0057, + "step": 44398 + }, + { + "epoch": 0.888, + "grad_norm": 0.08730877935886383, + "learning_rate": 7.545396638768698e-07, + "loss": 0.0015, + "step": 44400 + }, + { + "epoch": 0.88804, + "grad_norm": 0.03382579982280731, + "learning_rate": 7.540076793811724e-07, + "loss": 0.0009, + "step": 44402 + }, + { + "epoch": 0.88808, + "grad_norm": 0.30748823285102844, + "learning_rate": 7.534758751408556e-07, + "loss": 0.0071, + "step": 44404 + }, + { + "epoch": 0.88812, + "grad_norm": 0.014003572054207325, + "learning_rate": 7.529442511662899e-07, + "loss": 0.0004, + "step": 44406 + }, + { + "epoch": 0.88816, + "grad_norm": 0.10022258758544922, + "learning_rate": 7.524128074678316e-07, + "loss": 0.0019, + "step": 44408 + }, + { + "epoch": 0.8882, + "grad_norm": 0.02551341988146305, + "learning_rate": 7.518815440558514e-07, + "loss": 0.0004, + "step": 44410 + }, + { + "epoch": 0.88824, + "grad_norm": 0.01757902465760708, + "learning_rate": 7.513504609406996e-07, + "loss": 0.001, + "step": 44412 + }, + { + "epoch": 0.88828, + "grad_norm": 1.3925269842147827, + "learning_rate": 7.508195581327315e-07, + "loss": 0.0112, + "step": 44414 + }, + { + "epoch": 0.88832, + "grad_norm": 0.3253851532936096, + "learning_rate": 7.502888356422988e-07, + "loss": 0.0027, + "step": 44416 + }, + { + "epoch": 0.88836, + "grad_norm": 0.014464757405221462, + "learning_rate": 7.497582934797465e-07, + "loss": 0.0962, + "step": 44418 + }, + { + "epoch": 0.8884, + "grad_norm": 0.00960595067590475, + "learning_rate": 7.492279316554207e-07, + "loss": 0.0337, + "step": 44420 + }, + { + "epoch": 0.88844, + "grad_norm": 0.000781664508394897, + "learning_rate": 7.486977501796577e-07, + "loss": 0.0005, + "step": 44422 + }, + { + "epoch": 0.88848, + "grad_norm": 0.4779490530490875, + "learning_rate": 7.481677490627948e-07, + "loss": 0.0035, + "step": 44424 + }, + { + "epoch": 0.88852, + "grad_norm": 0.37970373034477234, + "learning_rate": 7.476379283151636e-07, + "loss": 0.0024, + "step": 44426 + }, + { + "epoch": 0.88856, + "grad_norm": 0.46967577934265137, + "learning_rate": 7.471082879470959e-07, + "loss": 0.0036, + "step": 44428 + }, + { + "epoch": 0.8886, + "grad_norm": 0.3096354603767395, + "learning_rate": 7.465788279689156e-07, + "loss": 0.0041, + "step": 44430 + }, + { + "epoch": 0.88864, + "grad_norm": 0.003490525996312499, + "learning_rate": 7.460495483909458e-07, + "loss": 0.0001, + "step": 44432 + }, + { + "epoch": 0.88868, + "grad_norm": 0.003619431983679533, + "learning_rate": 7.455204492235036e-07, + "loss": 0.0012, + "step": 44434 + }, + { + "epoch": 0.88872, + "grad_norm": 0.003963508643209934, + "learning_rate": 7.44991530476904e-07, + "loss": 0.0001, + "step": 44436 + }, + { + "epoch": 0.88876, + "grad_norm": 0.009597375057637691, + "learning_rate": 7.4446279216146e-07, + "loss": 0.0012, + "step": 44438 + }, + { + "epoch": 0.8888, + "grad_norm": 0.0099650789052248, + "learning_rate": 7.439342342874789e-07, + "loss": 0.0014, + "step": 44440 + }, + { + "epoch": 0.88884, + "grad_norm": 0.005538920871913433, + "learning_rate": 7.434058568652669e-07, + "loss": 0.0001, + "step": 44442 + }, + { + "epoch": 0.88888, + "grad_norm": 0.02292831614613533, + "learning_rate": 7.42877659905119e-07, + "loss": 0.0009, + "step": 44444 + }, + { + "epoch": 0.88892, + "grad_norm": 0.007859383709728718, + "learning_rate": 7.423496434173417e-07, + "loss": 0.0005, + "step": 44446 + }, + { + "epoch": 0.88896, + "grad_norm": 0.0361606627702713, + "learning_rate": 7.418218074122219e-07, + "loss": 0.0006, + "step": 44448 + }, + { + "epoch": 0.889, + "grad_norm": 0.058199476450681686, + "learning_rate": 7.412941519000527e-07, + "loss": 0.0036, + "step": 44450 + }, + { + "epoch": 0.88904, + "grad_norm": 0.004839983303099871, + "learning_rate": 7.407666768911204e-07, + "loss": 0.0002, + "step": 44452 + }, + { + "epoch": 0.88908, + "grad_norm": 0.0007009475957602262, + "learning_rate": 7.402393823957099e-07, + "loss": 0.0003, + "step": 44454 + }, + { + "epoch": 0.88912, + "grad_norm": 0.10203355550765991, + "learning_rate": 7.397122684240998e-07, + "loss": 0.0017, + "step": 44456 + }, + { + "epoch": 0.88916, + "grad_norm": 0.032120008021593094, + "learning_rate": 7.391853349865652e-07, + "loss": 0.0003, + "step": 44458 + }, + { + "epoch": 0.8892, + "grad_norm": 0.038807984441518784, + "learning_rate": 7.386585820933812e-07, + "loss": 0.002, + "step": 44460 + }, + { + "epoch": 0.88924, + "grad_norm": 0.4813205301761627, + "learning_rate": 7.38132009754815e-07, + "loss": 0.0051, + "step": 44462 + }, + { + "epoch": 0.88928, + "grad_norm": 0.02349676936864853, + "learning_rate": 7.376056179811363e-07, + "loss": 0.0002, + "step": 44464 + }, + { + "epoch": 0.88932, + "grad_norm": 0.6817000508308411, + "learning_rate": 7.370794067826004e-07, + "loss": 0.0039, + "step": 44466 + }, + { + "epoch": 0.88936, + "grad_norm": 0.03714128956198692, + "learning_rate": 7.365533761694743e-07, + "loss": 0.0016, + "step": 44468 + }, + { + "epoch": 0.8894, + "grad_norm": 0.06331392377614975, + "learning_rate": 7.360275261520078e-07, + "loss": 0.0007, + "step": 44470 + }, + { + "epoch": 0.88944, + "grad_norm": 0.018301066011190414, + "learning_rate": 7.355018567404537e-07, + "loss": 0.0646, + "step": 44472 + }, + { + "epoch": 0.88948, + "grad_norm": 0.0026136813685297966, + "learning_rate": 7.349763679450628e-07, + "loss": 0.0001, + "step": 44474 + }, + { + "epoch": 0.88952, + "grad_norm": 0.08883921056985855, + "learning_rate": 7.344510597760734e-07, + "loss": 0.0023, + "step": 44476 + }, + { + "epoch": 0.88956, + "grad_norm": 0.025253720581531525, + "learning_rate": 7.339259322437342e-07, + "loss": 0.0007, + "step": 44478 + }, + { + "epoch": 0.8896, + "grad_norm": 21.18213653564453, + "learning_rate": 7.334009853582791e-07, + "loss": 1.0454, + "step": 44480 + }, + { + "epoch": 0.88964, + "grad_norm": 0.01550872903317213, + "learning_rate": 7.32876219129941e-07, + "loss": 0.0002, + "step": 44482 + }, + { + "epoch": 0.88968, + "grad_norm": 0.01252374704927206, + "learning_rate": 7.323516335689529e-07, + "loss": 0.0001, + "step": 44484 + }, + { + "epoch": 0.88972, + "grad_norm": 0.06313568353652954, + "learning_rate": 7.318272286855421e-07, + "loss": 0.0063, + "step": 44486 + }, + { + "epoch": 0.88976, + "grad_norm": 0.010730498470366001, + "learning_rate": 7.313030044899305e-07, + "loss": 0.0009, + "step": 44488 + }, + { + "epoch": 0.8898, + "grad_norm": 0.21561527252197266, + "learning_rate": 7.307789609923377e-07, + "loss": 0.0019, + "step": 44490 + }, + { + "epoch": 0.88984, + "grad_norm": 0.011314528062939644, + "learning_rate": 7.30255098202981e-07, + "loss": 0.0001, + "step": 44492 + }, + { + "epoch": 0.88988, + "grad_norm": 0.005841286852955818, + "learning_rate": 7.297314161320746e-07, + "loss": 0.0376, + "step": 44494 + }, + { + "epoch": 0.88992, + "grad_norm": 0.054078079760074615, + "learning_rate": 7.292079147898267e-07, + "loss": 0.0013, + "step": 44496 + }, + { + "epoch": 0.88996, + "grad_norm": 0.041400060057640076, + "learning_rate": 7.286845941864418e-07, + "loss": 0.0008, + "step": 44498 + }, + { + "epoch": 0.89, + "grad_norm": 0.0409855991601944, + "learning_rate": 7.281614543321269e-07, + "loss": 0.0009, + "step": 44500 + }, + { + "epoch": 0.89004, + "grad_norm": 0.061520665884017944, + "learning_rate": 7.276384952370763e-07, + "loss": 0.0005, + "step": 44502 + }, + { + "epoch": 0.89008, + "grad_norm": 0.003602901939302683, + "learning_rate": 7.271157169114862e-07, + "loss": 0.0001, + "step": 44504 + }, + { + "epoch": 0.89012, + "grad_norm": 0.09158299118280411, + "learning_rate": 7.265931193655495e-07, + "loss": 0.0015, + "step": 44506 + }, + { + "epoch": 0.89016, + "grad_norm": 0.004006173461675644, + "learning_rate": 7.260707026094549e-07, + "loss": 0.0028, + "step": 44508 + }, + { + "epoch": 0.8902, + "grad_norm": 0.10598589479923248, + "learning_rate": 7.255484666533874e-07, + "loss": 0.0247, + "step": 44510 + }, + { + "epoch": 0.89024, + "grad_norm": 0.07567408680915833, + "learning_rate": 7.250264115075267e-07, + "loss": 0.0009, + "step": 44512 + }, + { + "epoch": 0.89028, + "grad_norm": 0.05851931869983673, + "learning_rate": 7.245045371820503e-07, + "loss": 0.0007, + "step": 44514 + }, + { + "epoch": 0.89032, + "grad_norm": 0.30075639486312866, + "learning_rate": 7.239828436871332e-07, + "loss": 0.0028, + "step": 44516 + }, + { + "epoch": 0.89036, + "grad_norm": 3.9778220653533936, + "learning_rate": 7.234613310329486e-07, + "loss": 0.0531, + "step": 44518 + }, + { + "epoch": 0.8904, + "grad_norm": 0.06422201544046402, + "learning_rate": 7.22939999229657e-07, + "loss": 0.0009, + "step": 44520 + }, + { + "epoch": 0.89044, + "grad_norm": 0.021561255678534508, + "learning_rate": 7.224188482874306e-07, + "loss": 0.0003, + "step": 44522 + }, + { + "epoch": 0.89048, + "grad_norm": 0.059352584183216095, + "learning_rate": 7.218978782164221e-07, + "loss": 0.004, + "step": 44524 + }, + { + "epoch": 0.89052, + "grad_norm": 0.022816967219114304, + "learning_rate": 7.213770890267924e-07, + "loss": 0.0006, + "step": 44526 + }, + { + "epoch": 0.89056, + "grad_norm": 0.6486538648605347, + "learning_rate": 7.208564807286945e-07, + "loss": 0.005, + "step": 44528 + }, + { + "epoch": 0.8906, + "grad_norm": 0.1834724247455597, + "learning_rate": 7.203360533322734e-07, + "loss": 0.0019, + "step": 44530 + }, + { + "epoch": 0.89064, + "grad_norm": 0.024510037153959274, + "learning_rate": 7.198158068476823e-07, + "loss": 0.0003, + "step": 44532 + }, + { + "epoch": 0.89068, + "grad_norm": 0.009121201001107693, + "learning_rate": 7.192957412850554e-07, + "loss": 0.004, + "step": 44534 + }, + { + "epoch": 0.89072, + "grad_norm": 0.05045890808105469, + "learning_rate": 7.187758566545399e-07, + "loss": 0.0005, + "step": 44536 + }, + { + "epoch": 0.89076, + "grad_norm": 0.013680070638656616, + "learning_rate": 7.182561529662657e-07, + "loss": 0.0003, + "step": 44538 + }, + { + "epoch": 0.8908, + "grad_norm": 0.05701548978686333, + "learning_rate": 7.177366302303667e-07, + "loss": 0.0005, + "step": 44540 + }, + { + "epoch": 0.89084, + "grad_norm": 0.23787769675254822, + "learning_rate": 7.172172884569728e-07, + "loss": 0.0024, + "step": 44542 + }, + { + "epoch": 0.89088, + "grad_norm": 0.01757241226732731, + "learning_rate": 7.166981276562046e-07, + "loss": 0.0014, + "step": 44544 + }, + { + "epoch": 0.89092, + "grad_norm": 0.26963692903518677, + "learning_rate": 7.161791478381863e-07, + "loss": 0.0035, + "step": 44546 + }, + { + "epoch": 0.89096, + "grad_norm": 0.02824978716671467, + "learning_rate": 7.156603490130343e-07, + "loss": 0.0003, + "step": 44548 + }, + { + "epoch": 0.891, + "grad_norm": 0.08523894846439362, + "learning_rate": 7.151417311908648e-07, + "loss": 0.0008, + "step": 44550 + }, + { + "epoch": 0.89104, + "grad_norm": 0.002224439289420843, + "learning_rate": 7.146232943817866e-07, + "loss": 0.0001, + "step": 44552 + }, + { + "epoch": 0.89108, + "grad_norm": 0.047148291021585464, + "learning_rate": 7.141050385959092e-07, + "loss": 0.0006, + "step": 44554 + }, + { + "epoch": 0.89112, + "grad_norm": 0.07741163671016693, + "learning_rate": 7.135869638433335e-07, + "loss": 0.0008, + "step": 44556 + }, + { + "epoch": 0.89116, + "grad_norm": 8.740735054016113, + "learning_rate": 7.130690701341614e-07, + "loss": 0.1085, + "step": 44558 + }, + { + "epoch": 0.8912, + "grad_norm": 2.1040239334106445, + "learning_rate": 7.125513574784904e-07, + "loss": 0.0189, + "step": 44560 + }, + { + "epoch": 0.89124, + "grad_norm": 18.4693603515625, + "learning_rate": 7.12033825886409e-07, + "loss": 0.5885, + "step": 44562 + }, + { + "epoch": 0.89128, + "grad_norm": 0.014462070539593697, + "learning_rate": 7.115164753680126e-07, + "loss": 0.0016, + "step": 44564 + }, + { + "epoch": 0.89132, + "grad_norm": 0.17748849093914032, + "learning_rate": 7.10999305933382e-07, + "loss": 0.0035, + "step": 44566 + }, + { + "epoch": 0.89136, + "grad_norm": 0.010067224502563477, + "learning_rate": 7.104823175926045e-07, + "loss": 0.0028, + "step": 44568 + }, + { + "epoch": 0.8914, + "grad_norm": 3.3789427280426025, + "learning_rate": 7.099655103557557e-07, + "loss": 0.025, + "step": 44570 + }, + { + "epoch": 0.89144, + "grad_norm": 0.011230187490582466, + "learning_rate": 7.094488842329128e-07, + "loss": 0.0057, + "step": 44572 + }, + { + "epoch": 0.89148, + "grad_norm": 0.0017498828237876296, + "learning_rate": 7.089324392341457e-07, + "loss": 0.0004, + "step": 44574 + }, + { + "epoch": 0.89152, + "grad_norm": 0.09575331956148148, + "learning_rate": 7.084161753695263e-07, + "loss": 0.0059, + "step": 44576 + }, + { + "epoch": 0.89156, + "grad_norm": 0.42442232370376587, + "learning_rate": 7.079000926491164e-07, + "loss": 0.0058, + "step": 44578 + }, + { + "epoch": 0.8916, + "grad_norm": 0.014580945484340191, + "learning_rate": 7.073841910829771e-07, + "loss": 0.0018, + "step": 44580 + }, + { + "epoch": 0.89164, + "grad_norm": 0.030710140243172646, + "learning_rate": 7.068684706811668e-07, + "loss": 0.0059, + "step": 44582 + }, + { + "epoch": 0.89168, + "grad_norm": 0.038628265261650085, + "learning_rate": 7.06352931453741e-07, + "loss": 0.0004, + "step": 44584 + }, + { + "epoch": 0.89172, + "grad_norm": 0.1993909776210785, + "learning_rate": 7.058375734107503e-07, + "loss": 0.0021, + "step": 44586 + }, + { + "epoch": 0.89176, + "grad_norm": 2.9095969200134277, + "learning_rate": 7.053223965622391e-07, + "loss": 0.0247, + "step": 44588 + }, + { + "epoch": 0.8918, + "grad_norm": 0.018873613327741623, + "learning_rate": 7.048074009182548e-07, + "loss": 0.0003, + "step": 44590 + }, + { + "epoch": 0.89184, + "grad_norm": 0.10746650397777557, + "learning_rate": 7.042925864888351e-07, + "loss": 0.0011, + "step": 44592 + }, + { + "epoch": 0.89188, + "grad_norm": 0.0044318912550807, + "learning_rate": 7.037779532840161e-07, + "loss": 0.0009, + "step": 44594 + }, + { + "epoch": 0.89192, + "grad_norm": 0.04050038754940033, + "learning_rate": 7.032635013138344e-07, + "loss": 0.0006, + "step": 44596 + }, + { + "epoch": 0.89196, + "grad_norm": 0.010619411244988441, + "learning_rate": 7.027492305883144e-07, + "loss": 0.0003, + "step": 44598 + }, + { + "epoch": 0.892, + "grad_norm": 0.014721899293363094, + "learning_rate": 7.022351411174866e-07, + "loss": 0.0013, + "step": 44600 + }, + { + "epoch": 0.89204, + "grad_norm": 0.07305271923542023, + "learning_rate": 7.01721232911371e-07, + "loss": 0.0381, + "step": 44602 + }, + { + "epoch": 0.89208, + "grad_norm": 0.05422572046518326, + "learning_rate": 7.012075059799861e-07, + "loss": 0.0009, + "step": 44604 + }, + { + "epoch": 0.89212, + "grad_norm": 0.07619806379079819, + "learning_rate": 7.006939603333485e-07, + "loss": 0.0007, + "step": 44606 + }, + { + "epoch": 0.89216, + "grad_norm": 0.050802163779735565, + "learning_rate": 7.001805959814712e-07, + "loss": 0.0009, + "step": 44608 + }, + { + "epoch": 0.8922, + "grad_norm": 0.01967443712055683, + "learning_rate": 6.996674129343606e-07, + "loss": 0.0002, + "step": 44610 + }, + { + "epoch": 0.89224, + "grad_norm": 0.0014469365123659372, + "learning_rate": 6.99154411202021e-07, + "loss": 0.0008, + "step": 44612 + }, + { + "epoch": 0.89228, + "grad_norm": 0.04471229016780853, + "learning_rate": 6.986415907944544e-07, + "loss": 0.002, + "step": 44614 + }, + { + "epoch": 0.89232, + "grad_norm": 0.3937211036682129, + "learning_rate": 6.981289517216583e-07, + "loss": 0.0047, + "step": 44616 + }, + { + "epoch": 0.89236, + "grad_norm": 16.826095581054688, + "learning_rate": 6.976164939936292e-07, + "loss": 0.3921, + "step": 44618 + }, + { + "epoch": 0.8924, + "grad_norm": 0.0029519633390009403, + "learning_rate": 6.971042176203535e-07, + "loss": 0.0031, + "step": 44620 + }, + { + "epoch": 0.89244, + "grad_norm": 0.1715921312570572, + "learning_rate": 6.965921226118222e-07, + "loss": 0.0011, + "step": 44622 + }, + { + "epoch": 0.89248, + "grad_norm": 0.08439244329929352, + "learning_rate": 6.960802089780172e-07, + "loss": 0.001, + "step": 44624 + }, + { + "epoch": 0.89252, + "grad_norm": 0.08319562673568726, + "learning_rate": 6.955684767289172e-07, + "loss": 0.0023, + "step": 44626 + }, + { + "epoch": 0.89256, + "grad_norm": 0.005530118942260742, + "learning_rate": 6.950569258745022e-07, + "loss": 0.0007, + "step": 44628 + }, + { + "epoch": 0.8926, + "grad_norm": 20.64242935180664, + "learning_rate": 6.945455564247394e-07, + "loss": 0.1758, + "step": 44630 + }, + { + "epoch": 0.89264, + "grad_norm": 0.18412551283836365, + "learning_rate": 6.940343683896044e-07, + "loss": 0.002, + "step": 44632 + }, + { + "epoch": 0.89268, + "grad_norm": 0.1814943552017212, + "learning_rate": 6.935233617790604e-07, + "loss": 0.0031, + "step": 44634 + }, + { + "epoch": 0.89272, + "grad_norm": 0.01998683251440525, + "learning_rate": 6.930125366030682e-07, + "loss": 0.0046, + "step": 44636 + }, + { + "epoch": 0.89276, + "grad_norm": 5.882521152496338, + "learning_rate": 6.925018928715888e-07, + "loss": 0.0582, + "step": 44638 + }, + { + "epoch": 0.8928, + "grad_norm": 0.006078273989260197, + "learning_rate": 6.919914305945774e-07, + "loss": 0.0001, + "step": 44640 + }, + { + "epoch": 0.89284, + "grad_norm": 0.010277091525495052, + "learning_rate": 6.91481149781984e-07, + "loss": 0.0006, + "step": 44642 + }, + { + "epoch": 0.89288, + "grad_norm": 0.4548700451850891, + "learning_rate": 6.909710504437606e-07, + "loss": 0.0054, + "step": 44644 + }, + { + "epoch": 0.89292, + "grad_norm": 0.020567571744322777, + "learning_rate": 6.904611325898469e-07, + "loss": 0.0005, + "step": 44646 + }, + { + "epoch": 0.89296, + "grad_norm": 0.05919228494167328, + "learning_rate": 6.899513962301862e-07, + "loss": 0.0011, + "step": 44648 + }, + { + "epoch": 0.893, + "grad_norm": 0.09780355542898178, + "learning_rate": 6.894418413747183e-07, + "loss": 0.0018, + "step": 44650 + }, + { + "epoch": 0.89304, + "grad_norm": 0.007936269044876099, + "learning_rate": 6.88932468033372e-07, + "loss": 0.0001, + "step": 44652 + }, + { + "epoch": 0.89308, + "grad_norm": 0.39876487851142883, + "learning_rate": 6.884232762160836e-07, + "loss": 0.0041, + "step": 44654 + }, + { + "epoch": 0.89312, + "grad_norm": 0.0022569114807993174, + "learning_rate": 6.879142659327743e-07, + "loss": 0.0004, + "step": 44656 + }, + { + "epoch": 0.89316, + "grad_norm": 5.6677199609111995e-05, + "learning_rate": 6.874054371933736e-07, + "loss": 0.0103, + "step": 44658 + }, + { + "epoch": 0.8932, + "grad_norm": 0.04989553242921829, + "learning_rate": 6.868967900077972e-07, + "loss": 0.0005, + "step": 44660 + }, + { + "epoch": 0.89324, + "grad_norm": 0.03515206277370453, + "learning_rate": 6.863883243859626e-07, + "loss": 0.0004, + "step": 44662 + }, + { + "epoch": 0.89328, + "grad_norm": 4.299713134765625, + "learning_rate": 6.858800403377841e-07, + "loss": 0.0472, + "step": 44664 + }, + { + "epoch": 0.89332, + "grad_norm": 0.0004967203130945563, + "learning_rate": 6.853719378731672e-07, + "loss": 0.0021, + "step": 44666 + }, + { + "epoch": 0.89336, + "grad_norm": 0.04836329072713852, + "learning_rate": 6.848640170020204e-07, + "loss": 0.0006, + "step": 44668 + }, + { + "epoch": 0.8934, + "grad_norm": 0.26452144980430603, + "learning_rate": 6.84356277734245e-07, + "loss": 0.0016, + "step": 44670 + }, + { + "epoch": 0.89344, + "grad_norm": 0.03744921088218689, + "learning_rate": 6.838487200797406e-07, + "loss": 0.0024, + "step": 44672 + }, + { + "epoch": 0.89348, + "grad_norm": 0.0749809741973877, + "learning_rate": 6.833413440484016e-07, + "loss": 0.0018, + "step": 44674 + }, + { + "epoch": 0.89352, + "grad_norm": 0.013375180773437023, + "learning_rate": 6.828341496501211e-07, + "loss": 0.0032, + "step": 44676 + }, + { + "epoch": 0.89356, + "grad_norm": 0.03201187029480934, + "learning_rate": 6.823271368947837e-07, + "loss": 0.0003, + "step": 44678 + }, + { + "epoch": 0.8936, + "grad_norm": 0.086241215467453, + "learning_rate": 6.818203057922756e-07, + "loss": 0.0007, + "step": 44680 + }, + { + "epoch": 0.89364, + "grad_norm": 0.36784425377845764, + "learning_rate": 6.813136563524803e-07, + "loss": 0.0027, + "step": 44682 + }, + { + "epoch": 0.89368, + "grad_norm": 0.0216384194791317, + "learning_rate": 6.808071885852696e-07, + "loss": 0.0012, + "step": 44684 + }, + { + "epoch": 0.89372, + "grad_norm": 19.10209846496582, + "learning_rate": 6.803009025005236e-07, + "loss": 0.2216, + "step": 44686 + }, + { + "epoch": 0.89376, + "grad_norm": 0.003907834179699421, + "learning_rate": 6.797947981081065e-07, + "loss": 0.0003, + "step": 44688 + }, + { + "epoch": 0.8938, + "grad_norm": 0.005743320565670729, + "learning_rate": 6.792888754178906e-07, + "loss": 0.0035, + "step": 44690 + }, + { + "epoch": 0.89384, + "grad_norm": 0.04578418284654617, + "learning_rate": 6.787831344397355e-07, + "loss": 0.0012, + "step": 44692 + }, + { + "epoch": 0.89388, + "grad_norm": 0.3902113139629364, + "learning_rate": 6.782775751835025e-07, + "loss": 0.0039, + "step": 44694 + }, + { + "epoch": 0.89392, + "grad_norm": 0.12458081543445587, + "learning_rate": 6.77772197659049e-07, + "loss": 0.0012, + "step": 44696 + }, + { + "epoch": 0.89396, + "grad_norm": 0.01250387728214264, + "learning_rate": 6.772670018762239e-07, + "loss": 0.0016, + "step": 44698 + }, + { + "epoch": 0.894, + "grad_norm": 0.005770309828221798, + "learning_rate": 6.767619878448783e-07, + "loss": 0.0002, + "step": 44700 + }, + { + "epoch": 0.89404, + "grad_norm": 0.015095357783138752, + "learning_rate": 6.762571555748587e-07, + "loss": 0.0011, + "step": 44702 + }, + { + "epoch": 0.89408, + "grad_norm": 0.005122168455272913, + "learning_rate": 6.757525050760049e-07, + "loss": 0.0039, + "step": 44704 + }, + { + "epoch": 0.89412, + "grad_norm": 0.7519627213478088, + "learning_rate": 6.752480363581559e-07, + "loss": 0.0064, + "step": 44706 + }, + { + "epoch": 0.89416, + "grad_norm": 0.017729023471474648, + "learning_rate": 6.747437494311504e-07, + "loss": 0.0006, + "step": 44708 + }, + { + "epoch": 0.8942, + "grad_norm": 0.022741930559277534, + "learning_rate": 6.742396443048138e-07, + "loss": 0.0003, + "step": 44710 + }, + { + "epoch": 0.89424, + "grad_norm": 0.03955937922000885, + "learning_rate": 6.737357209889772e-07, + "loss": 0.0007, + "step": 44712 + }, + { + "epoch": 0.89428, + "grad_norm": 5.607554912567139, + "learning_rate": 6.732319794934628e-07, + "loss": 0.0589, + "step": 44714 + }, + { + "epoch": 0.89432, + "grad_norm": 0.14230075478553772, + "learning_rate": 6.727284198280937e-07, + "loss": 0.0026, + "step": 44716 + }, + { + "epoch": 0.89436, + "grad_norm": 0.0022889748215675354, + "learning_rate": 6.722250420026876e-07, + "loss": 0.0001, + "step": 44718 + }, + { + "epoch": 0.8944, + "grad_norm": 0.25436124205589294, + "learning_rate": 6.717218460270536e-07, + "loss": 0.0025, + "step": 44720 + }, + { + "epoch": 0.89444, + "grad_norm": 0.0073343683034181595, + "learning_rate": 6.71218831911008e-07, + "loss": 0.0017, + "step": 44722 + }, + { + "epoch": 0.89448, + "grad_norm": 1.0400623083114624, + "learning_rate": 6.70715999664352e-07, + "loss": 0.011, + "step": 44724 + }, + { + "epoch": 0.89452, + "grad_norm": 0.10544668883085251, + "learning_rate": 6.70213349296891e-07, + "loss": 0.2199, + "step": 44726 + }, + { + "epoch": 0.89456, + "grad_norm": 0.049328628927469254, + "learning_rate": 6.697108808184239e-07, + "loss": 0.0062, + "step": 44728 + }, + { + "epoch": 0.8946, + "grad_norm": 0.10436397790908813, + "learning_rate": 6.692085942387483e-07, + "loss": 0.0007, + "step": 44730 + }, + { + "epoch": 0.89464, + "grad_norm": 0.020987149327993393, + "learning_rate": 6.687064895676532e-07, + "loss": 0.0083, + "step": 44732 + }, + { + "epoch": 0.89468, + "grad_norm": 0.03462234139442444, + "learning_rate": 6.682045668149295e-07, + "loss": 0.0012, + "step": 44734 + }, + { + "epoch": 0.89472, + "grad_norm": 0.03596135973930359, + "learning_rate": 6.677028259903618e-07, + "loss": 0.0014, + "step": 44736 + }, + { + "epoch": 0.89476, + "grad_norm": 0.014644475653767586, + "learning_rate": 6.67201267103732e-07, + "loss": 0.0031, + "step": 44738 + }, + { + "epoch": 0.8948, + "grad_norm": 0.0037447495851665735, + "learning_rate": 6.666998901648203e-07, + "loss": 0.0035, + "step": 44740 + }, + { + "epoch": 0.89484, + "grad_norm": 0.015952525660395622, + "learning_rate": 6.661986951833965e-07, + "loss": 0.001, + "step": 44742 + }, + { + "epoch": 0.89488, + "grad_norm": 0.006744462065398693, + "learning_rate": 6.656976821692374e-07, + "loss": 0.0002, + "step": 44744 + }, + { + "epoch": 0.89492, + "grad_norm": 0.8009361028671265, + "learning_rate": 6.65196851132105e-07, + "loss": 0.0096, + "step": 44746 + }, + { + "epoch": 0.89496, + "grad_norm": 0.010931103490293026, + "learning_rate": 6.646962020817671e-07, + "loss": 0.0002, + "step": 44748 + }, + { + "epoch": 0.895, + "grad_norm": 0.002540052402764559, + "learning_rate": 6.641957350279838e-07, + "loss": 0.0008, + "step": 44750 + }, + { + "epoch": 0.89504, + "grad_norm": 0.3361905813217163, + "learning_rate": 6.636954499805082e-07, + "loss": 0.0042, + "step": 44752 + }, + { + "epoch": 0.89508, + "grad_norm": 0.07193204760551453, + "learning_rate": 6.631953469490993e-07, + "loss": 0.0008, + "step": 44754 + }, + { + "epoch": 0.89512, + "grad_norm": 0.058054227381944656, + "learning_rate": 6.626954259435025e-07, + "loss": 0.0028, + "step": 44756 + }, + { + "epoch": 0.89516, + "grad_norm": 1.3760216236114502, + "learning_rate": 6.621956869734659e-07, + "loss": 0.0132, + "step": 44758 + }, + { + "epoch": 0.8952, + "grad_norm": 0.000994816073216498, + "learning_rate": 6.616961300487323e-07, + "loss": 0.0051, + "step": 44760 + }, + { + "epoch": 0.89524, + "grad_norm": 0.004135424736887217, + "learning_rate": 6.611967551790399e-07, + "loss": 0.0003, + "step": 44762 + }, + { + "epoch": 0.89528, + "grad_norm": 0.025705251842737198, + "learning_rate": 6.606975623741252e-07, + "loss": 0.0133, + "step": 44764 + }, + { + "epoch": 0.89532, + "grad_norm": 0.08358591049909592, + "learning_rate": 6.601985516437193e-07, + "loss": 0.0009, + "step": 44766 + }, + { + "epoch": 0.89536, + "grad_norm": 19.525815963745117, + "learning_rate": 6.596997229975511e-07, + "loss": 1.3577, + "step": 44768 + }, + { + "epoch": 0.8954, + "grad_norm": 0.006677338387817144, + "learning_rate": 6.592010764453449e-07, + "loss": 0.0039, + "step": 44770 + }, + { + "epoch": 0.89544, + "grad_norm": 0.0009164889925159514, + "learning_rate": 6.587026119968242e-07, + "loss": 0.0003, + "step": 44772 + }, + { + "epoch": 0.89548, + "grad_norm": 0.4581005871295929, + "learning_rate": 6.582043296617025e-07, + "loss": 0.0051, + "step": 44774 + }, + { + "epoch": 0.89552, + "grad_norm": 0.034186895936727524, + "learning_rate": 6.577062294497005e-07, + "loss": 0.0003, + "step": 44776 + }, + { + "epoch": 0.89556, + "grad_norm": 0.010417202487587929, + "learning_rate": 6.57208311370523e-07, + "loss": 0.0016, + "step": 44778 + }, + { + "epoch": 0.8956, + "grad_norm": 0.01771383173763752, + "learning_rate": 6.567105754338798e-07, + "loss": 0.0003, + "step": 44780 + }, + { + "epoch": 0.89564, + "grad_norm": 0.0007955326582305133, + "learning_rate": 6.562130216494744e-07, + "loss": 0.0005, + "step": 44782 + }, + { + "epoch": 0.89568, + "grad_norm": 0.295370876789093, + "learning_rate": 6.557156500270057e-07, + "loss": 0.0357, + "step": 44784 + }, + { + "epoch": 0.89572, + "grad_norm": 5.653387069702148, + "learning_rate": 6.552184605761736e-07, + "loss": 0.065, + "step": 44786 + }, + { + "epoch": 0.89576, + "grad_norm": 0.16234131157398224, + "learning_rate": 6.547214533066671e-07, + "loss": 0.0016, + "step": 44788 + }, + { + "epoch": 0.8958, + "grad_norm": 0.11883711069822311, + "learning_rate": 6.542246282281772e-07, + "loss": 0.0021, + "step": 44790 + }, + { + "epoch": 0.89584, + "grad_norm": 0.0008100278791971505, + "learning_rate": 6.537279853503886e-07, + "loss": 0.0002, + "step": 44792 + }, + { + "epoch": 0.89588, + "grad_norm": 0.026568682864308357, + "learning_rate": 6.532315246829857e-07, + "loss": 0.0004, + "step": 44794 + }, + { + "epoch": 0.89592, + "grad_norm": 0.06908904016017914, + "learning_rate": 6.527352462356462e-07, + "loss": 0.0013, + "step": 44796 + }, + { + "epoch": 0.89596, + "grad_norm": 0.02977694384753704, + "learning_rate": 6.522391500180469e-07, + "loss": 0.0008, + "step": 44798 + }, + { + "epoch": 0.896, + "grad_norm": 0.013954325579106808, + "learning_rate": 6.517432360398556e-07, + "loss": 0.001, + "step": 44800 + }, + { + "epoch": 0.89604, + "grad_norm": 0.11561136692762375, + "learning_rate": 6.512475043107435e-07, + "loss": 0.0015, + "step": 44802 + }, + { + "epoch": 0.89608, + "grad_norm": 0.03132690116763115, + "learning_rate": 6.507519548403762e-07, + "loss": 0.0005, + "step": 44804 + }, + { + "epoch": 0.89612, + "grad_norm": 0.032950036227703094, + "learning_rate": 6.502565876384093e-07, + "loss": 0.0009, + "step": 44806 + }, + { + "epoch": 0.89616, + "grad_norm": 0.007698066998273134, + "learning_rate": 6.497614027145072e-07, + "loss": 0.0001, + "step": 44808 + }, + { + "epoch": 0.8962, + "grad_norm": 0.09598388522863388, + "learning_rate": 6.492664000783166e-07, + "loss": 0.0007, + "step": 44810 + }, + { + "epoch": 0.89624, + "grad_norm": 0.32325029373168945, + "learning_rate": 6.487715797394955e-07, + "loss": 0.0027, + "step": 44812 + }, + { + "epoch": 0.89628, + "grad_norm": 0.03646111115813255, + "learning_rate": 6.482769417076851e-07, + "loss": 0.0012, + "step": 44814 + }, + { + "epoch": 0.89632, + "grad_norm": 0.01919885352253914, + "learning_rate": 6.477824859925297e-07, + "loss": 0.015, + "step": 44816 + }, + { + "epoch": 0.89636, + "grad_norm": 0.2685765326023102, + "learning_rate": 6.472882126036717e-07, + "loss": 0.002, + "step": 44818 + }, + { + "epoch": 0.8964, + "grad_norm": 0.0009849801426753402, + "learning_rate": 6.467941215507434e-07, + "loss": 0.0001, + "step": 44820 + }, + { + "epoch": 0.89644, + "grad_norm": 0.12929247319698334, + "learning_rate": 6.463002128433782e-07, + "loss": 0.0038, + "step": 44822 + }, + { + "epoch": 0.89648, + "grad_norm": 0.02844635769724846, + "learning_rate": 6.458064864912073e-07, + "loss": 0.0004, + "step": 44824 + }, + { + "epoch": 0.89652, + "grad_norm": 0.03100392036139965, + "learning_rate": 6.45312942503854e-07, + "loss": 0.0003, + "step": 44826 + }, + { + "epoch": 0.89656, + "grad_norm": 0.04217042401432991, + "learning_rate": 6.448195808909419e-07, + "loss": 0.0006, + "step": 44828 + }, + { + "epoch": 0.8966, + "grad_norm": 0.05025571212172508, + "learning_rate": 6.443264016620887e-07, + "loss": 0.0007, + "step": 44830 + }, + { + "epoch": 0.89664, + "grad_norm": 1.6701449155807495, + "learning_rate": 6.438334048269079e-07, + "loss": 0.01, + "step": 44832 + }, + { + "epoch": 0.89668, + "grad_norm": 0.09893934428691864, + "learning_rate": 6.433405903950118e-07, + "loss": 0.009, + "step": 44834 + }, + { + "epoch": 0.89672, + "grad_norm": 0.003154935548081994, + "learning_rate": 6.428479583760083e-07, + "loss": 0.301, + "step": 44836 + }, + { + "epoch": 0.89676, + "grad_norm": 0.026605406776070595, + "learning_rate": 6.423555087795019e-07, + "loss": 0.3365, + "step": 44838 + }, + { + "epoch": 0.8968, + "grad_norm": 0.07042668759822845, + "learning_rate": 6.418632416150927e-07, + "loss": 0.0051, + "step": 44840 + }, + { + "epoch": 0.89684, + "grad_norm": 0.5926299691200256, + "learning_rate": 6.413711568923752e-07, + "loss": 0.0073, + "step": 44842 + }, + { + "epoch": 0.89688, + "grad_norm": 0.2549257278442383, + "learning_rate": 6.408792546209475e-07, + "loss": 0.0018, + "step": 44844 + }, + { + "epoch": 0.89692, + "grad_norm": 0.000861183856613934, + "learning_rate": 6.40387534810396e-07, + "loss": 0.0024, + "step": 44846 + }, + { + "epoch": 0.89696, + "grad_norm": 0.04019809886813164, + "learning_rate": 6.39895997470309e-07, + "loss": 0.0013, + "step": 44848 + }, + { + "epoch": 0.897, + "grad_norm": 0.06146037578582764, + "learning_rate": 6.394046426102673e-07, + "loss": 0.001, + "step": 44850 + }, + { + "epoch": 0.89704, + "grad_norm": 0.06140908971428871, + "learning_rate": 6.389134702398514e-07, + "loss": 0.0007, + "step": 44852 + }, + { + "epoch": 0.89708, + "grad_norm": 17.17371940612793, + "learning_rate": 6.384224803686389e-07, + "loss": 0.4498, + "step": 44854 + }, + { + "epoch": 0.89712, + "grad_norm": 0.012577199377119541, + "learning_rate": 6.379316730061979e-07, + "loss": 0.0009, + "step": 44856 + }, + { + "epoch": 0.89716, + "grad_norm": 0.42855778336524963, + "learning_rate": 6.374410481620986e-07, + "loss": 0.0046, + "step": 44858 + }, + { + "epoch": 0.8972, + "grad_norm": 0.1194814145565033, + "learning_rate": 6.369506058459063e-07, + "loss": 0.0013, + "step": 44860 + }, + { + "epoch": 0.89724, + "grad_norm": 0.6858583092689514, + "learning_rate": 6.364603460671837e-07, + "loss": 0.7157, + "step": 44862 + }, + { + "epoch": 0.89728, + "grad_norm": 0.0012339944951236248, + "learning_rate": 6.359702688354852e-07, + "loss": 0.0, + "step": 44864 + }, + { + "epoch": 0.89732, + "grad_norm": 0.009829523041844368, + "learning_rate": 6.354803741603699e-07, + "loss": 0.004, + "step": 44866 + }, + { + "epoch": 0.89736, + "grad_norm": 0.011131517589092255, + "learning_rate": 6.349906620513835e-07, + "loss": 0.0003, + "step": 44868 + }, + { + "epoch": 0.8974, + "grad_norm": 0.608616054058075, + "learning_rate": 6.345011325180772e-07, + "loss": 0.0047, + "step": 44870 + }, + { + "epoch": 0.89744, + "grad_norm": 0.007527011446654797, + "learning_rate": 6.340117855699934e-07, + "loss": 0.0213, + "step": 44872 + }, + { + "epoch": 0.89748, + "grad_norm": 0.009595931507647038, + "learning_rate": 6.33522621216669e-07, + "loss": 0.0004, + "step": 44874 + }, + { + "epoch": 0.89752, + "grad_norm": 0.04256172478199005, + "learning_rate": 6.330336394676462e-07, + "loss": 0.0011, + "step": 44876 + }, + { + "epoch": 0.89756, + "grad_norm": 0.029734767973423004, + "learning_rate": 6.32544840332453e-07, + "loss": 0.0014, + "step": 44878 + }, + { + "epoch": 0.8976, + "grad_norm": 0.185505673289299, + "learning_rate": 6.320562238206218e-07, + "loss": 0.3382, + "step": 44880 + }, + { + "epoch": 0.89764, + "grad_norm": 0.04947748780250549, + "learning_rate": 6.315677899416772e-07, + "loss": 0.0005, + "step": 44882 + }, + { + "epoch": 0.89768, + "grad_norm": 0.8240656852722168, + "learning_rate": 6.310795387051416e-07, + "loss": 0.0092, + "step": 44884 + }, + { + "epoch": 0.89772, + "grad_norm": 0.011586945503950119, + "learning_rate": 6.305914701205351e-07, + "loss": 0.0001, + "step": 44886 + }, + { + "epoch": 0.89776, + "grad_norm": 6.706545352935791, + "learning_rate": 6.301035841973702e-07, + "loss": 0.0647, + "step": 44888 + }, + { + "epoch": 0.8978, + "grad_norm": 0.08861039578914642, + "learning_rate": 6.296158809451602e-07, + "loss": 0.0014, + "step": 44890 + }, + { + "epoch": 0.89784, + "grad_norm": 0.1326577514410019, + "learning_rate": 6.291283603734121e-07, + "loss": 0.0024, + "step": 44892 + }, + { + "epoch": 0.89788, + "grad_norm": 3.820193290710449, + "learning_rate": 6.28641022491634e-07, + "loss": 0.0485, + "step": 44894 + }, + { + "epoch": 0.89792, + "grad_norm": 0.042620256543159485, + "learning_rate": 6.281538673093191e-07, + "loss": 0.0005, + "step": 44896 + }, + { + "epoch": 0.89796, + "grad_norm": 0.16850373148918152, + "learning_rate": 6.276668948359743e-07, + "loss": 0.0017, + "step": 44898 + }, + { + "epoch": 0.898, + "grad_norm": 4.302546977996826, + "learning_rate": 6.271801050810856e-07, + "loss": 0.0447, + "step": 44900 + }, + { + "epoch": 0.89804, + "grad_norm": 0.03717673197388649, + "learning_rate": 6.266934980541472e-07, + "loss": 0.0006, + "step": 44902 + }, + { + "epoch": 0.89808, + "grad_norm": 0.059048671275377274, + "learning_rate": 6.262070737646441e-07, + "loss": 0.001, + "step": 44904 + }, + { + "epoch": 0.89812, + "grad_norm": 9.105271339416504, + "learning_rate": 6.257208322220598e-07, + "loss": 0.0717, + "step": 44906 + }, + { + "epoch": 0.89816, + "grad_norm": 0.06660954654216766, + "learning_rate": 6.252347734358766e-07, + "loss": 0.0009, + "step": 44908 + }, + { + "epoch": 0.8982, + "grad_norm": 0.010787594132125378, + "learning_rate": 6.247488974155657e-07, + "loss": 0.0022, + "step": 44910 + }, + { + "epoch": 0.89824, + "grad_norm": 1.0495699644088745, + "learning_rate": 6.24263204170602e-07, + "loss": 0.0133, + "step": 44912 + }, + { + "epoch": 0.89828, + "grad_norm": 0.019193271175026894, + "learning_rate": 6.237776937104534e-07, + "loss": 0.0006, + "step": 44914 + }, + { + "epoch": 0.89832, + "grad_norm": 0.1667785346508026, + "learning_rate": 6.232923660445866e-07, + "loss": 0.0023, + "step": 44916 + }, + { + "epoch": 0.89836, + "grad_norm": 0.010009566321969032, + "learning_rate": 6.22807221182462e-07, + "loss": 0.0003, + "step": 44918 + }, + { + "epoch": 0.8984, + "grad_norm": 0.023225590586662292, + "learning_rate": 6.223222591335409e-07, + "loss": 0.0007, + "step": 44920 + }, + { + "epoch": 0.89844, + "grad_norm": 4.655019283294678, + "learning_rate": 6.218374799072735e-07, + "loss": 0.0546, + "step": 44922 + }, + { + "epoch": 0.89848, + "grad_norm": 0.012426775880157948, + "learning_rate": 6.213528835131122e-07, + "loss": 0.0007, + "step": 44924 + }, + { + "epoch": 0.89852, + "grad_norm": 0.049313630908727646, + "learning_rate": 6.208684699605061e-07, + "loss": 0.0005, + "step": 44926 + }, + { + "epoch": 0.89856, + "grad_norm": 0.009870368987321854, + "learning_rate": 6.203842392588954e-07, + "loss": 0.0213, + "step": 44928 + }, + { + "epoch": 0.8986, + "grad_norm": 0.01365737710148096, + "learning_rate": 6.199001914177261e-07, + "loss": 0.0012, + "step": 44930 + }, + { + "epoch": 0.89864, + "grad_norm": 0.03010387346148491, + "learning_rate": 6.194163264464282e-07, + "loss": 0.0003, + "step": 44932 + }, + { + "epoch": 0.89868, + "grad_norm": 0.1401572823524475, + "learning_rate": 6.189326443544418e-07, + "loss": 0.004, + "step": 44934 + }, + { + "epoch": 0.89872, + "grad_norm": 0.01544036902487278, + "learning_rate": 6.18449145151192e-07, + "loss": 0.0016, + "step": 44936 + }, + { + "epoch": 0.89876, + "grad_norm": 0.013179364614188671, + "learning_rate": 6.179658288461054e-07, + "loss": 0.047, + "step": 44938 + }, + { + "epoch": 0.8988, + "grad_norm": 0.02594858966767788, + "learning_rate": 6.174826954486069e-07, + "loss": 0.0003, + "step": 44940 + }, + { + "epoch": 0.89884, + "grad_norm": 0.007438040338456631, + "learning_rate": 6.16999744968112e-07, + "loss": 0.0025, + "step": 44942 + }, + { + "epoch": 0.89888, + "grad_norm": 0.05135654658079147, + "learning_rate": 6.165169774140379e-07, + "loss": 0.0005, + "step": 44944 + }, + { + "epoch": 0.89892, + "grad_norm": 0.00826714001595974, + "learning_rate": 6.160343927957968e-07, + "loss": 0.0007, + "step": 44946 + }, + { + "epoch": 0.89896, + "grad_norm": 0.01761603355407715, + "learning_rate": 6.155519911227958e-07, + "loss": 0.0005, + "step": 44948 + }, + { + "epoch": 0.899, + "grad_norm": 0.9122256636619568, + "learning_rate": 6.150697724044407e-07, + "loss": 0.0142, + "step": 44950 + }, + { + "epoch": 0.89904, + "grad_norm": 0.0028938129544258118, + "learning_rate": 6.145877366501329e-07, + "loss": 0.0064, + "step": 44952 + }, + { + "epoch": 0.89908, + "grad_norm": 0.5486725568771362, + "learning_rate": 6.141058838692681e-07, + "loss": 0.0051, + "step": 44954 + }, + { + "epoch": 0.89912, + "grad_norm": 0.014340662397444248, + "learning_rate": 6.136242140712412e-07, + "loss": 0.0002, + "step": 44956 + }, + { + "epoch": 0.89916, + "grad_norm": 0.48588988184928894, + "learning_rate": 6.131427272654422e-07, + "loss": 0.0037, + "step": 44958 + }, + { + "epoch": 0.8992, + "grad_norm": 0.011696835048496723, + "learning_rate": 6.126614234612593e-07, + "loss": 0.0003, + "step": 44960 + }, + { + "epoch": 0.89924, + "grad_norm": 0.027682475745677948, + "learning_rate": 6.121803026680761e-07, + "loss": 0.0007, + "step": 44962 + }, + { + "epoch": 0.89928, + "grad_norm": 0.01279602013528347, + "learning_rate": 6.116993648952674e-07, + "loss": 0.0002, + "step": 44964 + }, + { + "epoch": 0.89932, + "grad_norm": 0.020381445065140724, + "learning_rate": 6.112186101522166e-07, + "loss": 0.0004, + "step": 44966 + }, + { + "epoch": 0.89936, + "grad_norm": 0.32801613211631775, + "learning_rate": 6.107380384482909e-07, + "loss": 0.0026, + "step": 44968 + }, + { + "epoch": 0.8994, + "grad_norm": 0.09239122271537781, + "learning_rate": 6.102576497928614e-07, + "loss": 0.0007, + "step": 44970 + }, + { + "epoch": 0.89944, + "grad_norm": 0.249105766415596, + "learning_rate": 6.097774441952931e-07, + "loss": 0.0025, + "step": 44972 + }, + { + "epoch": 0.89948, + "grad_norm": 0.17446917295455933, + "learning_rate": 6.092974216649472e-07, + "loss": 0.0019, + "step": 44974 + }, + { + "epoch": 0.89952, + "grad_norm": 0.03790745511651039, + "learning_rate": 6.088175822111853e-07, + "loss": 0.0005, + "step": 44976 + }, + { + "epoch": 0.89956, + "grad_norm": 0.14387328922748566, + "learning_rate": 6.083379258433575e-07, + "loss": 0.0011, + "step": 44978 + }, + { + "epoch": 0.8996, + "grad_norm": 0.019801251590251923, + "learning_rate": 6.078584525708175e-07, + "loss": 0.0007, + "step": 44980 + }, + { + "epoch": 0.89964, + "grad_norm": 0.00945192202925682, + "learning_rate": 6.073791624029112e-07, + "loss": 0.0004, + "step": 44982 + }, + { + "epoch": 0.89968, + "grad_norm": 0.032504595816135406, + "learning_rate": 6.069000553489867e-07, + "loss": 0.0037, + "step": 44984 + }, + { + "epoch": 0.89972, + "grad_norm": 4.418294906616211, + "learning_rate": 6.064211314183777e-07, + "loss": 0.0337, + "step": 44986 + }, + { + "epoch": 0.89976, + "grad_norm": 0.029839687049388885, + "learning_rate": 6.059423906204276e-07, + "loss": 0.0005, + "step": 44988 + }, + { + "epoch": 0.8998, + "grad_norm": 0.21556536853313446, + "learning_rate": 6.054638329644658e-07, + "loss": 0.0018, + "step": 44990 + }, + { + "epoch": 0.89984, + "grad_norm": 0.2679544687271118, + "learning_rate": 6.049854584598236e-07, + "loss": 0.0029, + "step": 44992 + }, + { + "epoch": 0.89988, + "grad_norm": 0.04230131581425667, + "learning_rate": 6.04507267115828e-07, + "loss": 0.0015, + "step": 44994 + }, + { + "epoch": 0.89992, + "grad_norm": 0.6382015943527222, + "learning_rate": 6.040292589417973e-07, + "loss": 0.0057, + "step": 44996 + }, + { + "epoch": 0.89996, + "grad_norm": 0.12170760333538055, + "learning_rate": 6.03551433947056e-07, + "loss": 0.0017, + "step": 44998 + }, + { + "epoch": 0.9, + "grad_norm": 0.12054393440485, + "learning_rate": 6.030737921409169e-07, + "loss": 0.0013, + "step": 45000 + }, + { + "epoch": 0.90004, + "grad_norm": 0.04718780517578125, + "learning_rate": 6.025963335326912e-07, + "loss": 0.0005, + "step": 45002 + }, + { + "epoch": 0.90008, + "grad_norm": 0.019814854487776756, + "learning_rate": 6.021190581316882e-07, + "loss": 0.0007, + "step": 45004 + }, + { + "epoch": 0.90012, + "grad_norm": 0.0015663199592381716, + "learning_rate": 6.016419659472128e-07, + "loss": 0.0019, + "step": 45006 + }, + { + "epoch": 0.90016, + "grad_norm": 0.02561948634684086, + "learning_rate": 6.011650569885674e-07, + "loss": 0.0041, + "step": 45008 + }, + { + "epoch": 0.9002, + "grad_norm": 0.027875298634171486, + "learning_rate": 6.006883312650458e-07, + "loss": 0.0003, + "step": 45010 + }, + { + "epoch": 0.90024, + "grad_norm": 0.17961335182189941, + "learning_rate": 6.002117887859449e-07, + "loss": 0.0024, + "step": 45012 + }, + { + "epoch": 0.90028, + "grad_norm": 5.064502716064453, + "learning_rate": 5.99735429560554e-07, + "loss": 0.0425, + "step": 45014 + }, + { + "epoch": 0.90032, + "grad_norm": 0.07780703902244568, + "learning_rate": 5.992592535981634e-07, + "loss": 0.0015, + "step": 45016 + }, + { + "epoch": 0.90036, + "grad_norm": 0.13905197381973267, + "learning_rate": 5.987832609080491e-07, + "loss": 0.0012, + "step": 45018 + }, + { + "epoch": 0.9004, + "grad_norm": 0.4212152361869812, + "learning_rate": 5.98307451499498e-07, + "loss": 0.0039, + "step": 45020 + }, + { + "epoch": 0.90044, + "grad_norm": 0.004785614088177681, + "learning_rate": 5.978318253817828e-07, + "loss": 0.0001, + "step": 45022 + }, + { + "epoch": 0.90048, + "grad_norm": 0.10213963687419891, + "learning_rate": 5.97356382564177e-07, + "loss": 0.001, + "step": 45024 + }, + { + "epoch": 0.90052, + "grad_norm": 0.07170584052801132, + "learning_rate": 5.968811230559501e-07, + "loss": 0.0006, + "step": 45026 + }, + { + "epoch": 0.90056, + "grad_norm": 0.06672817468643188, + "learning_rate": 5.964060468663635e-07, + "loss": 0.0044, + "step": 45028 + }, + { + "epoch": 0.9006, + "grad_norm": 0.05738427862524986, + "learning_rate": 5.959311540046863e-07, + "loss": 0.0011, + "step": 45030 + }, + { + "epoch": 0.90064, + "grad_norm": 0.1366671323776245, + "learning_rate": 5.954564444801703e-07, + "loss": 0.0018, + "step": 45032 + }, + { + "epoch": 0.90068, + "grad_norm": 0.013457215391099453, + "learning_rate": 5.949819183020733e-07, + "loss": 0.0002, + "step": 45034 + }, + { + "epoch": 0.90072, + "grad_norm": 0.028068946674466133, + "learning_rate": 5.945075754796459e-07, + "loss": 0.0005, + "step": 45036 + }, + { + "epoch": 0.90076, + "grad_norm": 0.0009184819646179676, + "learning_rate": 5.94033416022135e-07, + "loss": 0.0002, + "step": 45038 + }, + { + "epoch": 0.9008, + "grad_norm": 0.05178483948111534, + "learning_rate": 5.935594399387856e-07, + "loss": 0.0043, + "step": 45040 + }, + { + "epoch": 0.90084, + "grad_norm": 0.13210612535476685, + "learning_rate": 5.930856472388391e-07, + "loss": 0.0022, + "step": 45042 + }, + { + "epoch": 0.90088, + "grad_norm": 0.02732941508293152, + "learning_rate": 5.926120379315292e-07, + "loss": 0.0074, + "step": 45044 + }, + { + "epoch": 0.90092, + "grad_norm": 0.018289851024746895, + "learning_rate": 5.921386120260919e-07, + "loss": 0.0002, + "step": 45046 + }, + { + "epoch": 0.90096, + "grad_norm": 0.5296049118041992, + "learning_rate": 5.916653695317542e-07, + "loss": 0.0058, + "step": 45048 + }, + { + "epoch": 0.901, + "grad_norm": 0.06687520444393158, + "learning_rate": 5.911923104577455e-07, + "loss": 0.0007, + "step": 45050 + }, + { + "epoch": 0.90104, + "grad_norm": 0.0046949926763772964, + "learning_rate": 5.907194348132872e-07, + "loss": 0.0003, + "step": 45052 + }, + { + "epoch": 0.90108, + "grad_norm": 0.0025087001267820597, + "learning_rate": 5.902467426075942e-07, + "loss": 0.0001, + "step": 45054 + }, + { + "epoch": 0.90112, + "grad_norm": 0.03404439613223076, + "learning_rate": 5.897742338498902e-07, + "loss": 0.0009, + "step": 45056 + }, + { + "epoch": 0.90116, + "grad_norm": 0.16494128108024597, + "learning_rate": 5.89301908549379e-07, + "loss": 0.0016, + "step": 45058 + }, + { + "epoch": 0.9012, + "grad_norm": 0.962709367275238, + "learning_rate": 5.888297667152731e-07, + "loss": 0.0066, + "step": 45060 + }, + { + "epoch": 0.90124, + "grad_norm": 0.006136162672191858, + "learning_rate": 5.883578083567776e-07, + "loss": 0.0104, + "step": 45062 + }, + { + "epoch": 0.90128, + "grad_norm": 0.0192639771848917, + "learning_rate": 5.878860334830883e-07, + "loss": 0.0002, + "step": 45064 + }, + { + "epoch": 0.90132, + "grad_norm": 0.2631482183933258, + "learning_rate": 5.874144421034101e-07, + "loss": 0.0027, + "step": 45066 + }, + { + "epoch": 0.90136, + "grad_norm": 0.02012268267571926, + "learning_rate": 5.869430342269311e-07, + "loss": 0.0041, + "step": 45068 + }, + { + "epoch": 0.9014, + "grad_norm": 0.05633966624736786, + "learning_rate": 5.864718098628441e-07, + "loss": 0.0008, + "step": 45070 + }, + { + "epoch": 0.90144, + "grad_norm": 0.022714857012033463, + "learning_rate": 5.860007690203362e-07, + "loss": 0.0008, + "step": 45072 + }, + { + "epoch": 0.90148, + "grad_norm": 0.09004299342632294, + "learning_rate": 5.85529911708591e-07, + "loss": 0.0035, + "step": 45074 + }, + { + "epoch": 0.90152, + "grad_norm": 0.20271410048007965, + "learning_rate": 5.850592379367847e-07, + "loss": 0.0021, + "step": 45076 + }, + { + "epoch": 0.90156, + "grad_norm": 0.07364513725042343, + "learning_rate": 5.845887477140966e-07, + "loss": 0.0009, + "step": 45078 + }, + { + "epoch": 0.9016, + "grad_norm": 0.02267405577003956, + "learning_rate": 5.841184410496992e-07, + "loss": 0.002, + "step": 45080 + }, + { + "epoch": 0.90164, + "grad_norm": 0.07002232223749161, + "learning_rate": 5.836483179527586e-07, + "loss": 0.0011, + "step": 45082 + }, + { + "epoch": 0.90168, + "grad_norm": 0.030612943693995476, + "learning_rate": 5.831783784324441e-07, + "loss": 0.0008, + "step": 45084 + }, + { + "epoch": 0.90172, + "grad_norm": 0.06985320150852203, + "learning_rate": 5.827086224979128e-07, + "loss": 0.0007, + "step": 45086 + }, + { + "epoch": 0.90176, + "grad_norm": 0.01042187213897705, + "learning_rate": 5.822390501583275e-07, + "loss": 0.0001, + "step": 45088 + }, + { + "epoch": 0.9018, + "grad_norm": 0.017817698419094086, + "learning_rate": 5.817696614228396e-07, + "loss": 0.0003, + "step": 45090 + }, + { + "epoch": 0.90184, + "grad_norm": 0.06333111971616745, + "learning_rate": 5.813004563006009e-07, + "loss": 0.0008, + "step": 45092 + }, + { + "epoch": 0.90188, + "grad_norm": 0.013691233471035957, + "learning_rate": 5.808314348007593e-07, + "loss": 0.004, + "step": 45094 + }, + { + "epoch": 0.90192, + "grad_norm": 0.1045929491519928, + "learning_rate": 5.803625969324567e-07, + "loss": 0.0013, + "step": 45096 + }, + { + "epoch": 0.90196, + "grad_norm": 0.09212920069694519, + "learning_rate": 5.798939427048366e-07, + "loss": 0.3373, + "step": 45098 + }, + { + "epoch": 0.902, + "grad_norm": 0.13997343182563782, + "learning_rate": 5.794254721270331e-07, + "loss": 0.0023, + "step": 45100 + }, + { + "epoch": 0.90204, + "grad_norm": 0.009151789359748363, + "learning_rate": 5.789571852081799e-07, + "loss": 0.0011, + "step": 45102 + }, + { + "epoch": 0.90208, + "grad_norm": 0.01025193277746439, + "learning_rate": 5.784890819574074e-07, + "loss": 0.0009, + "step": 45104 + }, + { + "epoch": 0.90212, + "grad_norm": 0.010657844133675098, + "learning_rate": 5.780211623838405e-07, + "loss": 0.0005, + "step": 45106 + }, + { + "epoch": 0.90216, + "grad_norm": 0.007640262134373188, + "learning_rate": 5.775534264965999e-07, + "loss": 0.0005, + "step": 45108 + }, + { + "epoch": 0.9022, + "grad_norm": 0.014299812726676464, + "learning_rate": 5.770858743048091e-07, + "loss": 0.0423, + "step": 45110 + }, + { + "epoch": 0.90224, + "grad_norm": 0.0030748890712857246, + "learning_rate": 5.766185058175799e-07, + "loss": 0.0002, + "step": 45112 + }, + { + "epoch": 0.90228, + "grad_norm": 0.006092643365263939, + "learning_rate": 5.761513210440229e-07, + "loss": 0.0003, + "step": 45114 + }, + { + "epoch": 0.90232, + "grad_norm": 0.049064088612794876, + "learning_rate": 5.756843199932504e-07, + "loss": 0.0023, + "step": 45116 + }, + { + "epoch": 0.90236, + "grad_norm": 0.3959454894065857, + "learning_rate": 5.752175026743601e-07, + "loss": 0.0031, + "step": 45118 + }, + { + "epoch": 0.9024, + "grad_norm": 0.24303847551345825, + "learning_rate": 5.747508690964599e-07, + "loss": 0.0021, + "step": 45120 + }, + { + "epoch": 0.90244, + "grad_norm": 0.03516751527786255, + "learning_rate": 5.742844192686425e-07, + "loss": 0.001, + "step": 45122 + }, + { + "epoch": 0.90248, + "grad_norm": 0.012552163563668728, + "learning_rate": 5.738181532000031e-07, + "loss": 0.0022, + "step": 45124 + }, + { + "epoch": 0.90252, + "grad_norm": 0.021838538348674774, + "learning_rate": 5.733520708996321e-07, + "loss": 0.0004, + "step": 45126 + }, + { + "epoch": 0.90256, + "grad_norm": 0.3540797829627991, + "learning_rate": 5.728861723766155e-07, + "loss": 0.0024, + "step": 45128 + }, + { + "epoch": 0.9026, + "grad_norm": 0.2526926100254059, + "learning_rate": 5.724204576400372e-07, + "loss": 0.0026, + "step": 45130 + }, + { + "epoch": 0.90264, + "grad_norm": 0.020515119656920433, + "learning_rate": 5.719549266989755e-07, + "loss": 0.0013, + "step": 45132 + }, + { + "epoch": 0.90268, + "grad_norm": 0.022726403549313545, + "learning_rate": 5.714895795625053e-07, + "loss": 0.0009, + "step": 45134 + }, + { + "epoch": 0.90272, + "grad_norm": 0.0710725337266922, + "learning_rate": 5.710244162397005e-07, + "loss": 0.0014, + "step": 45136 + }, + { + "epoch": 0.90276, + "grad_norm": 0.08318418264389038, + "learning_rate": 5.705594367396294e-07, + "loss": 0.0009, + "step": 45138 + }, + { + "epoch": 0.9028, + "grad_norm": 0.03812505677342415, + "learning_rate": 5.700946410713548e-07, + "loss": 0.0004, + "step": 45140 + }, + { + "epoch": 0.90284, + "grad_norm": 0.006985220592468977, + "learning_rate": 5.696300292439427e-07, + "loss": 0.0006, + "step": 45142 + }, + { + "epoch": 0.90288, + "grad_norm": 0.2771201729774475, + "learning_rate": 5.691656012664471e-07, + "loss": 0.2066, + "step": 45144 + }, + { + "epoch": 0.90292, + "grad_norm": 0.018758023157715797, + "learning_rate": 5.687013571479228e-07, + "loss": 0.0014, + "step": 45146 + }, + { + "epoch": 0.90296, + "grad_norm": 0.012910385616123676, + "learning_rate": 5.682372968974237e-07, + "loss": 0.0004, + "step": 45148 + }, + { + "epoch": 0.903, + "grad_norm": 4.148768424987793, + "learning_rate": 5.677734205239904e-07, + "loss": 0.0377, + "step": 45150 + }, + { + "epoch": 0.90304, + "grad_norm": 0.029712097719311714, + "learning_rate": 5.673097280366735e-07, + "loss": 0.0585, + "step": 45152 + }, + { + "epoch": 0.90308, + "grad_norm": 0.002705417573451996, + "learning_rate": 5.668462194445068e-07, + "loss": 0.0024, + "step": 45154 + }, + { + "epoch": 0.90312, + "grad_norm": 0.03688277676701546, + "learning_rate": 5.66382894756532e-07, + "loss": 0.0017, + "step": 45156 + }, + { + "epoch": 0.90316, + "grad_norm": 2.8983256816864014, + "learning_rate": 5.659197539817785e-07, + "loss": 0.0273, + "step": 45158 + }, + { + "epoch": 0.9032, + "grad_norm": 0.14094147086143494, + "learning_rate": 5.654567971292757e-07, + "loss": 0.0014, + "step": 45160 + }, + { + "epoch": 0.90324, + "grad_norm": 0.041168637573719025, + "learning_rate": 5.649940242080521e-07, + "loss": 0.0013, + "step": 45162 + }, + { + "epoch": 0.90328, + "grad_norm": 0.0005456319195218384, + "learning_rate": 5.645314352271258e-07, + "loss": 0.0003, + "step": 45164 + }, + { + "epoch": 0.90332, + "grad_norm": 0.10522683709859848, + "learning_rate": 5.640690301955165e-07, + "loss": 0.0019, + "step": 45166 + }, + { + "epoch": 0.90336, + "grad_norm": 0.014756415970623493, + "learning_rate": 5.636068091222402e-07, + "loss": 0.0003, + "step": 45168 + }, + { + "epoch": 0.9034, + "grad_norm": 0.014168836176395416, + "learning_rate": 5.631447720163074e-07, + "loss": 0.0002, + "step": 45170 + }, + { + "epoch": 0.90344, + "grad_norm": 0.024150429293513298, + "learning_rate": 5.626829188867245e-07, + "loss": 0.0003, + "step": 45172 + }, + { + "epoch": 0.90348, + "grad_norm": 0.0020742116030305624, + "learning_rate": 5.622212497424994e-07, + "loss": 0.0017, + "step": 45174 + }, + { + "epoch": 0.90352, + "grad_norm": 0.012119121849536896, + "learning_rate": 5.617597645926265e-07, + "loss": 0.0007, + "step": 45176 + }, + { + "epoch": 0.90356, + "grad_norm": 0.0007794807897880673, + "learning_rate": 5.612984634461093e-07, + "loss": 0.0006, + "step": 45178 + }, + { + "epoch": 0.9036, + "grad_norm": 1.8904142379760742, + "learning_rate": 5.608373463119354e-07, + "loss": 0.0168, + "step": 45180 + }, + { + "epoch": 0.90364, + "grad_norm": 0.10193649679422379, + "learning_rate": 5.603764131990985e-07, + "loss": 0.0009, + "step": 45182 + }, + { + "epoch": 0.90368, + "grad_norm": 0.27103671431541443, + "learning_rate": 5.599156641165837e-07, + "loss": 0.0032, + "step": 45184 + }, + { + "epoch": 0.90372, + "grad_norm": 0.2148228883743286, + "learning_rate": 5.594550990733705e-07, + "loss": 0.0026, + "step": 45186 + }, + { + "epoch": 0.90376, + "grad_norm": 0.04874979332089424, + "learning_rate": 5.589947180784439e-07, + "loss": 0.0719, + "step": 45188 + }, + { + "epoch": 0.9038, + "grad_norm": 1.2401912212371826, + "learning_rate": 5.585345211407734e-07, + "loss": 0.0057, + "step": 45190 + }, + { + "epoch": 0.90384, + "grad_norm": 0.01569538563489914, + "learning_rate": 5.58074508269334e-07, + "loss": 0.0025, + "step": 45192 + }, + { + "epoch": 0.90388, + "grad_norm": 0.04288516566157341, + "learning_rate": 5.57614679473093e-07, + "loss": 0.0014, + "step": 45194 + }, + { + "epoch": 0.90392, + "grad_norm": 0.09379430115222931, + "learning_rate": 5.571550347610166e-07, + "loss": 0.0008, + "step": 45196 + }, + { + "epoch": 0.90396, + "grad_norm": 0.005520825739949942, + "learning_rate": 5.566955741420621e-07, + "loss": 0.0003, + "step": 45198 + }, + { + "epoch": 0.904, + "grad_norm": 0.05380573123693466, + "learning_rate": 5.562362976251901e-07, + "loss": 0.0013, + "step": 45200 + }, + { + "epoch": 0.90404, + "grad_norm": 0.15741732716560364, + "learning_rate": 5.557772052193533e-07, + "loss": 0.002, + "step": 45202 + }, + { + "epoch": 0.90408, + "grad_norm": 0.29199251532554626, + "learning_rate": 5.553182969335013e-07, + "loss": 0.0143, + "step": 45204 + }, + { + "epoch": 0.90412, + "grad_norm": 0.003913238178938627, + "learning_rate": 5.548595727765838e-07, + "loss": 0.0002, + "step": 45206 + }, + { + "epoch": 0.90416, + "grad_norm": 0.14455632865428925, + "learning_rate": 5.544010327575377e-07, + "loss": 0.0018, + "step": 45208 + }, + { + "epoch": 0.9042, + "grad_norm": 11.166241645812988, + "learning_rate": 5.539426768853107e-07, + "loss": 0.1895, + "step": 45210 + }, + { + "epoch": 0.90424, + "grad_norm": 0.18036220967769623, + "learning_rate": 5.53484505168832e-07, + "loss": 0.0016, + "step": 45212 + }, + { + "epoch": 0.90428, + "grad_norm": 15.253926277160645, + "learning_rate": 5.530265176170368e-07, + "loss": 0.4498, + "step": 45214 + }, + { + "epoch": 0.90432, + "grad_norm": 0.34053441882133484, + "learning_rate": 5.525687142388547e-07, + "loss": 0.0036, + "step": 45216 + }, + { + "epoch": 0.90436, + "grad_norm": 0.9080893993377686, + "learning_rate": 5.521110950432051e-07, + "loss": 0.0084, + "step": 45218 + }, + { + "epoch": 0.9044, + "grad_norm": 0.19835388660430908, + "learning_rate": 5.516536600390188e-07, + "loss": 0.0028, + "step": 45220 + }, + { + "epoch": 0.90444, + "grad_norm": 0.4699271023273468, + "learning_rate": 5.511964092352062e-07, + "loss": 0.0043, + "step": 45222 + }, + { + "epoch": 0.90448, + "grad_norm": 0.029632894322276115, + "learning_rate": 5.507393426406837e-07, + "loss": 0.0004, + "step": 45224 + }, + { + "epoch": 0.90452, + "grad_norm": 12.534945487976074, + "learning_rate": 5.502824602643631e-07, + "loss": 0.2365, + "step": 45226 + }, + { + "epoch": 0.90456, + "grad_norm": 0.04983288049697876, + "learning_rate": 5.498257621151526e-07, + "loss": 0.251, + "step": 45228 + }, + { + "epoch": 0.9046, + "grad_norm": 17.700916290283203, + "learning_rate": 5.49369248201953e-07, + "loss": 1.334, + "step": 45230 + }, + { + "epoch": 0.90464, + "grad_norm": 0.0802575871348381, + "learning_rate": 5.489129185336651e-07, + "loss": 0.0007, + "step": 45232 + }, + { + "epoch": 0.90468, + "grad_norm": 0.368643194437027, + "learning_rate": 5.484567731191848e-07, + "loss": 0.0037, + "step": 45234 + }, + { + "epoch": 0.90472, + "grad_norm": 0.0853601023554802, + "learning_rate": 5.480008119674062e-07, + "loss": 0.0198, + "step": 45236 + }, + { + "epoch": 0.90476, + "grad_norm": 0.01697823405265808, + "learning_rate": 5.475450350872202e-07, + "loss": 0.0006, + "step": 45238 + }, + { + "epoch": 0.9048, + "grad_norm": 0.013546928763389587, + "learning_rate": 5.470894424875062e-07, + "loss": 0.0009, + "step": 45240 + }, + { + "epoch": 0.90484, + "grad_norm": 0.007528768852353096, + "learning_rate": 5.466340341771536e-07, + "loss": 0.0015, + "step": 45242 + }, + { + "epoch": 0.90488, + "grad_norm": 0.04801836237311363, + "learning_rate": 5.461788101650356e-07, + "loss": 0.3922, + "step": 45244 + }, + { + "epoch": 0.90492, + "grad_norm": 0.05928632616996765, + "learning_rate": 5.457237704600294e-07, + "loss": 0.0007, + "step": 45246 + }, + { + "epoch": 0.90496, + "grad_norm": 0.044097281992435455, + "learning_rate": 5.452689150710055e-07, + "loss": 0.0025, + "step": 45248 + }, + { + "epoch": 0.905, + "grad_norm": 0.01993567682802677, + "learning_rate": 5.448142440068316e-07, + "loss": 0.0015, + "step": 45250 + }, + { + "epoch": 0.90504, + "grad_norm": 0.006663935258984566, + "learning_rate": 5.443597572763737e-07, + "loss": 0.0132, + "step": 45252 + }, + { + "epoch": 0.90508, + "grad_norm": 0.05013173073530197, + "learning_rate": 5.439054548884881e-07, + "loss": 0.0011, + "step": 45254 + }, + { + "epoch": 0.90512, + "grad_norm": 0.056963082402944565, + "learning_rate": 5.434513368520344e-07, + "loss": 0.0005, + "step": 45256 + }, + { + "epoch": 0.90516, + "grad_norm": 0.4905813932418823, + "learning_rate": 5.429974031758656e-07, + "loss": 0.0084, + "step": 45258 + }, + { + "epoch": 0.9052, + "grad_norm": 0.01896822266280651, + "learning_rate": 5.425436538688322e-07, + "loss": 0.0002, + "step": 45260 + }, + { + "epoch": 0.90524, + "grad_norm": 0.08281061053276062, + "learning_rate": 5.420900889397773e-07, + "loss": 0.001, + "step": 45262 + }, + { + "epoch": 0.90528, + "grad_norm": 0.020308880135416985, + "learning_rate": 5.416367083975482e-07, + "loss": 0.0023, + "step": 45264 + }, + { + "epoch": 0.90532, + "grad_norm": 0.0070083243772387505, + "learning_rate": 5.411835122509789e-07, + "loss": 0.0001, + "step": 45266 + }, + { + "epoch": 0.90536, + "grad_norm": 0.027893880382180214, + "learning_rate": 5.40730500508907e-07, + "loss": 0.0024, + "step": 45268 + }, + { + "epoch": 0.9054, + "grad_norm": 0.03947487473487854, + "learning_rate": 5.402776731801662e-07, + "loss": 0.001, + "step": 45270 + }, + { + "epoch": 0.90544, + "grad_norm": 0.03769966587424278, + "learning_rate": 5.398250302735786e-07, + "loss": 0.0011, + "step": 45272 + }, + { + "epoch": 0.90548, + "grad_norm": 0.0006238453206606209, + "learning_rate": 5.393725717979747e-07, + "loss": 0.0581, + "step": 45274 + }, + { + "epoch": 0.90552, + "grad_norm": 0.02152138575911522, + "learning_rate": 5.38920297762171e-07, + "loss": 0.0003, + "step": 45276 + }, + { + "epoch": 0.90556, + "grad_norm": 8.116745948791504, + "learning_rate": 5.384682081749903e-07, + "loss": 0.0645, + "step": 45278 + }, + { + "epoch": 0.9056, + "grad_norm": 0.13837175071239471, + "learning_rate": 5.380163030452412e-07, + "loss": 0.002, + "step": 45280 + }, + { + "epoch": 0.90564, + "grad_norm": 0.001265303697437048, + "learning_rate": 5.375645823817355e-07, + "loss": 0.1498, + "step": 45282 + }, + { + "epoch": 0.90568, + "grad_norm": 0.8057765960693359, + "learning_rate": 5.371130461932816e-07, + "loss": 0.0081, + "step": 45284 + }, + { + "epoch": 0.90572, + "grad_norm": 0.05478113889694214, + "learning_rate": 5.366616944886793e-07, + "loss": 0.0005, + "step": 45286 + }, + { + "epoch": 0.90576, + "grad_norm": 0.018273593857884407, + "learning_rate": 5.362105272767282e-07, + "loss": 0.0003, + "step": 45288 + }, + { + "epoch": 0.9058, + "grad_norm": 0.008823666721582413, + "learning_rate": 5.357595445662267e-07, + "loss": 0.0002, + "step": 45290 + }, + { + "epoch": 0.90584, + "grad_norm": 0.00917169451713562, + "learning_rate": 5.353087463659646e-07, + "loss": 0.001, + "step": 45292 + }, + { + "epoch": 0.90588, + "grad_norm": 0.031178442761301994, + "learning_rate": 5.348581326847313e-07, + "loss": 0.0009, + "step": 45294 + }, + { + "epoch": 0.90592, + "grad_norm": 0.07381512969732285, + "learning_rate": 5.344077035313133e-07, + "loss": 0.0012, + "step": 45296 + }, + { + "epoch": 0.90596, + "grad_norm": 0.005549580790102482, + "learning_rate": 5.339574589144891e-07, + "loss": 0.0006, + "step": 45298 + }, + { + "epoch": 0.906, + "grad_norm": 0.03602112457156181, + "learning_rate": 5.335073988430373e-07, + "loss": 0.0004, + "step": 45300 + }, + { + "epoch": 0.90604, + "grad_norm": 0.007998595014214516, + "learning_rate": 5.330575233257318e-07, + "loss": 0.0001, + "step": 45302 + }, + { + "epoch": 0.90608, + "grad_norm": 0.006342852953821421, + "learning_rate": 5.326078323713446e-07, + "loss": 0.0022, + "step": 45304 + }, + { + "epoch": 0.90612, + "grad_norm": 0.02323007769882679, + "learning_rate": 5.321583259886432e-07, + "loss": 0.0002, + "step": 45306 + }, + { + "epoch": 0.90616, + "grad_norm": 0.07543856650590897, + "learning_rate": 5.31709004186387e-07, + "loss": 0.0032, + "step": 45308 + }, + { + "epoch": 0.9062, + "grad_norm": 0.00028721580747514963, + "learning_rate": 5.312598669733404e-07, + "loss": 0.001, + "step": 45310 + }, + { + "epoch": 0.90624, + "grad_norm": 0.21756680309772491, + "learning_rate": 5.308109143582574e-07, + "loss": 0.0028, + "step": 45312 + }, + { + "epoch": 0.90628, + "grad_norm": 0.0060144527815282345, + "learning_rate": 5.303621463498898e-07, + "loss": 0.0008, + "step": 45314 + }, + { + "epoch": 0.90632, + "grad_norm": 8.149044036865234, + "learning_rate": 5.299135629569874e-07, + "loss": 0.1378, + "step": 45316 + }, + { + "epoch": 0.90636, + "grad_norm": 0.004664205946028233, + "learning_rate": 5.294651641882976e-07, + "loss": 0.0003, + "step": 45318 + }, + { + "epoch": 0.9064, + "grad_norm": 0.09309462457895279, + "learning_rate": 5.290169500525577e-07, + "loss": 0.0011, + "step": 45320 + }, + { + "epoch": 0.90644, + "grad_norm": 0.002292429096996784, + "learning_rate": 5.285689205585088e-07, + "loss": 0.0004, + "step": 45322 + }, + { + "epoch": 0.90648, + "grad_norm": 0.11947732418775558, + "learning_rate": 5.281210757148858e-07, + "loss": 0.0035, + "step": 45324 + }, + { + "epoch": 0.90652, + "grad_norm": 0.032931551337242126, + "learning_rate": 5.276734155304187e-07, + "loss": 0.0017, + "step": 45326 + }, + { + "epoch": 0.90656, + "grad_norm": 0.0059158033691346645, + "learning_rate": 5.272259400138357e-07, + "loss": 0.0104, + "step": 45328 + }, + { + "epoch": 0.9066, + "grad_norm": 0.07223809510469437, + "learning_rate": 5.267786491738569e-07, + "loss": 0.0035, + "step": 45330 + }, + { + "epoch": 0.90664, + "grad_norm": 0.007132494356483221, + "learning_rate": 5.263315430192096e-07, + "loss": 0.0104, + "step": 45332 + }, + { + "epoch": 0.90668, + "grad_norm": 0.0016877719899639487, + "learning_rate": 5.258846215586044e-07, + "loss": 0.0002, + "step": 45334 + }, + { + "epoch": 0.90672, + "grad_norm": 0.14235541224479675, + "learning_rate": 5.254378848007557e-07, + "loss": 0.001, + "step": 45336 + }, + { + "epoch": 0.90676, + "grad_norm": 0.06439365446567535, + "learning_rate": 5.249913327543743e-07, + "loss": 0.0007, + "step": 45338 + }, + { + "epoch": 0.9068, + "grad_norm": 0.018364954739809036, + "learning_rate": 5.245449654281632e-07, + "loss": 0.0002, + "step": 45340 + }, + { + "epoch": 0.90684, + "grad_norm": 0.0005587567575275898, + "learning_rate": 5.240987828308275e-07, + "loss": 0.0015, + "step": 45342 + }, + { + "epoch": 0.90688, + "grad_norm": 1.0886609554290771, + "learning_rate": 5.236527849710648e-07, + "loss": 0.0114, + "step": 45344 + }, + { + "epoch": 0.90692, + "grad_norm": 0.08441917598247528, + "learning_rate": 5.232069718575694e-07, + "loss": 0.0014, + "step": 45346 + }, + { + "epoch": 0.90696, + "grad_norm": 0.014857497066259384, + "learning_rate": 5.22761343499032e-07, + "loss": 0.0002, + "step": 45348 + }, + { + "epoch": 0.907, + "grad_norm": 0.019023830071091652, + "learning_rate": 5.223158999041444e-07, + "loss": 0.0002, + "step": 45350 + }, + { + "epoch": 0.90704, + "grad_norm": 0.0625775009393692, + "learning_rate": 5.218706410815855e-07, + "loss": 0.001, + "step": 45352 + }, + { + "epoch": 0.90708, + "grad_norm": 0.24624212086200714, + "learning_rate": 5.214255670400381e-07, + "loss": 0.0025, + "step": 45354 + }, + { + "epoch": 0.90712, + "grad_norm": 0.05490069463849068, + "learning_rate": 5.209806777881798e-07, + "loss": 0.002, + "step": 45356 + }, + { + "epoch": 0.90716, + "grad_norm": 0.13301189243793488, + "learning_rate": 5.205359733346826e-07, + "loss": 0.0012, + "step": 45358 + }, + { + "epoch": 0.9072, + "grad_norm": 0.09869492053985596, + "learning_rate": 5.200914536882184e-07, + "loss": 0.0026, + "step": 45360 + }, + { + "epoch": 0.90724, + "grad_norm": 0.001238993019796908, + "learning_rate": 5.196471188574492e-07, + "loss": 0.0001, + "step": 45362 + }, + { + "epoch": 0.90728, + "grad_norm": 0.7395303845405579, + "learning_rate": 5.192029688510436e-07, + "loss": 0.0073, + "step": 45364 + }, + { + "epoch": 0.90732, + "grad_norm": 0.0167100690305233, + "learning_rate": 5.187590036776547e-07, + "loss": 0.0002, + "step": 45366 + }, + { + "epoch": 0.90736, + "grad_norm": 0.7913087606430054, + "learning_rate": 5.18315223345941e-07, + "loss": 0.0072, + "step": 45368 + }, + { + "epoch": 0.9074, + "grad_norm": 0.021883202716708183, + "learning_rate": 5.178716278645534e-07, + "loss": 0.0003, + "step": 45370 + }, + { + "epoch": 0.90744, + "grad_norm": 0.021443571895360947, + "learning_rate": 5.174282172421396e-07, + "loss": 0.0004, + "step": 45372 + }, + { + "epoch": 0.90748, + "grad_norm": 0.0009776366641744971, + "learning_rate": 5.16984991487347e-07, + "loss": 0.0001, + "step": 45374 + }, + { + "epoch": 0.90752, + "grad_norm": 0.0068003954365849495, + "learning_rate": 5.165419506088121e-07, + "loss": 0.0005, + "step": 45376 + }, + { + "epoch": 0.90756, + "grad_norm": 0.0742018073797226, + "learning_rate": 5.160990946151756e-07, + "loss": 0.0007, + "step": 45378 + }, + { + "epoch": 0.9076, + "grad_norm": 0.17352591454982758, + "learning_rate": 5.156564235150686e-07, + "loss": 0.0017, + "step": 45380 + }, + { + "epoch": 0.90764, + "grad_norm": 0.009153527207672596, + "learning_rate": 5.152139373171228e-07, + "loss": 0.0022, + "step": 45382 + }, + { + "epoch": 0.90768, + "grad_norm": 0.19579005241394043, + "learning_rate": 5.147716360299637e-07, + "loss": 0.0018, + "step": 45384 + }, + { + "epoch": 0.90772, + "grad_norm": 0.04573840647935867, + "learning_rate": 5.143295196622178e-07, + "loss": 0.0037, + "step": 45386 + }, + { + "epoch": 0.90776, + "grad_norm": 0.10193298012018204, + "learning_rate": 5.13887588222498e-07, + "loss": 0.0009, + "step": 45388 + }, + { + "epoch": 0.9078, + "grad_norm": 0.7238231301307678, + "learning_rate": 5.134458417194255e-07, + "loss": 0.0081, + "step": 45390 + }, + { + "epoch": 0.90784, + "grad_norm": 0.20290589332580566, + "learning_rate": 5.130042801616109e-07, + "loss": 0.0024, + "step": 45392 + }, + { + "epoch": 0.90788, + "grad_norm": 0.011198448948562145, + "learning_rate": 5.125629035576585e-07, + "loss": 0.0002, + "step": 45394 + }, + { + "epoch": 0.90792, + "grad_norm": 0.022550975903868675, + "learning_rate": 5.121217119161803e-07, + "loss": 0.0005, + "step": 45396 + }, + { + "epoch": 0.90796, + "grad_norm": 0.025777017697691917, + "learning_rate": 5.116807052457717e-07, + "loss": 0.0009, + "step": 45398 + }, + { + "epoch": 0.908, + "grad_norm": 2.1268274784088135, + "learning_rate": 5.112398835550348e-07, + "loss": 0.0356, + "step": 45400 + }, + { + "epoch": 0.90804, + "grad_norm": 1.5609270334243774, + "learning_rate": 5.107992468525602e-07, + "loss": 0.022, + "step": 45402 + }, + { + "epoch": 0.90808, + "grad_norm": 5.098311901092529, + "learning_rate": 5.103587951469401e-07, + "loss": 0.0582, + "step": 45404 + }, + { + "epoch": 0.90812, + "grad_norm": 8.396093368530273, + "learning_rate": 5.099185284467623e-07, + "loss": 0.129, + "step": 45406 + }, + { + "epoch": 0.90816, + "grad_norm": 0.00563378119841218, + "learning_rate": 5.094784467606062e-07, + "loss": 0.0006, + "step": 45408 + }, + { + "epoch": 0.9082, + "grad_norm": 0.02411573752760887, + "learning_rate": 5.090385500970551e-07, + "loss": 0.0004, + "step": 45410 + }, + { + "epoch": 0.90824, + "grad_norm": 0.04512020945549011, + "learning_rate": 5.085988384646823e-07, + "loss": 0.0054, + "step": 45412 + }, + { + "epoch": 0.90828, + "grad_norm": 0.002413407200947404, + "learning_rate": 5.081593118720629e-07, + "loss": 0.0004, + "step": 45414 + }, + { + "epoch": 0.90832, + "grad_norm": 0.17701028287410736, + "learning_rate": 5.077199703277636e-07, + "loss": 0.0019, + "step": 45416 + }, + { + "epoch": 0.90836, + "grad_norm": 0.0701277032494545, + "learning_rate": 5.072808138403529e-07, + "loss": 0.0013, + "step": 45418 + }, + { + "epoch": 0.9084, + "grad_norm": 0.007291205693036318, + "learning_rate": 5.068418424183874e-07, + "loss": 0.0009, + "step": 45420 + }, + { + "epoch": 0.90844, + "grad_norm": 0.09251400828361511, + "learning_rate": 5.06403056070428e-07, + "loss": 0.0011, + "step": 45422 + }, + { + "epoch": 0.90848, + "grad_norm": 1.410827875137329, + "learning_rate": 5.059644548050302e-07, + "loss": 1.4091, + "step": 45424 + }, + { + "epoch": 0.90852, + "grad_norm": 0.11761964112520218, + "learning_rate": 5.055260386307403e-07, + "loss": 0.0018, + "step": 45426 + }, + { + "epoch": 0.90856, + "grad_norm": 0.0036830215249210596, + "learning_rate": 5.050878075561116e-07, + "loss": 0.0001, + "step": 45428 + }, + { + "epoch": 0.9086, + "grad_norm": 0.3217577338218689, + "learning_rate": 5.046497615896806e-07, + "loss": 0.0039, + "step": 45430 + }, + { + "epoch": 0.90864, + "grad_norm": 1.4575003385543823, + "learning_rate": 5.042119007399948e-07, + "loss": 0.0132, + "step": 45432 + }, + { + "epoch": 0.90868, + "grad_norm": 0.017084626480937004, + "learning_rate": 5.037742250155852e-07, + "loss": 0.0302, + "step": 45434 + }, + { + "epoch": 0.90872, + "grad_norm": 0.020886410027742386, + "learning_rate": 5.033367344249851e-07, + "loss": 0.0036, + "step": 45436 + }, + { + "epoch": 0.90876, + "grad_norm": 0.8149697780609131, + "learning_rate": 5.028994289767253e-07, + "loss": 0.0084, + "step": 45438 + }, + { + "epoch": 0.9088, + "grad_norm": 0.013998621143400669, + "learning_rate": 5.024623086793323e-07, + "loss": 0.0003, + "step": 45440 + }, + { + "epoch": 0.90884, + "grad_norm": 0.05642670392990112, + "learning_rate": 5.020253735413249e-07, + "loss": 0.0006, + "step": 45442 + }, + { + "epoch": 0.90888, + "grad_norm": 3.9766974449157715, + "learning_rate": 5.01588623571222e-07, + "loss": 0.3044, + "step": 45444 + }, + { + "epoch": 0.90892, + "grad_norm": 0.039553601294755936, + "learning_rate": 5.011520587775398e-07, + "loss": 0.0007, + "step": 45446 + }, + { + "epoch": 0.90896, + "grad_norm": 0.009209729731082916, + "learning_rate": 5.007156791687872e-07, + "loss": 0.0012, + "step": 45448 + }, + { + "epoch": 0.909, + "grad_norm": 14.306232452392578, + "learning_rate": 5.002794847534765e-07, + "loss": 0.204, + "step": 45450 + }, + { + "epoch": 0.90904, + "grad_norm": 0.10204005986452103, + "learning_rate": 4.998434755401038e-07, + "loss": 0.0049, + "step": 45452 + }, + { + "epoch": 0.90908, + "grad_norm": 0.49234774708747864, + "learning_rate": 4.99407651537176e-07, + "loss": 0.0167, + "step": 45454 + }, + { + "epoch": 0.90912, + "grad_norm": 0.1758583039045334, + "learning_rate": 4.989720127531872e-07, + "loss": 0.0028, + "step": 45456 + }, + { + "epoch": 0.90916, + "grad_norm": 0.017334559932351112, + "learning_rate": 4.985365591966296e-07, + "loss": 0.0003, + "step": 45458 + }, + { + "epoch": 0.9092, + "grad_norm": 0.0008692457340657711, + "learning_rate": 4.981012908759941e-07, + "loss": 0.0002, + "step": 45460 + }, + { + "epoch": 0.90924, + "grad_norm": 0.044519610702991486, + "learning_rate": 4.97666207799764e-07, + "loss": 0.0019, + "step": 45462 + }, + { + "epoch": 0.90928, + "grad_norm": 0.4534904956817627, + "learning_rate": 4.97231309976427e-07, + "loss": 0.0035, + "step": 45464 + }, + { + "epoch": 0.90932, + "grad_norm": 0.010844903066754341, + "learning_rate": 4.967965974144551e-07, + "loss": 0.0082, + "step": 45466 + }, + { + "epoch": 0.90936, + "grad_norm": 0.11869558691978455, + "learning_rate": 4.963620701223271e-07, + "loss": 0.0016, + "step": 45468 + }, + { + "epoch": 0.9094, + "grad_norm": 0.01963881030678749, + "learning_rate": 4.959277281085128e-07, + "loss": 0.0014, + "step": 45470 + }, + { + "epoch": 0.90944, + "grad_norm": 0.07853793352842331, + "learning_rate": 4.954935713814802e-07, + "loss": 0.0006, + "step": 45472 + }, + { + "epoch": 0.90948, + "grad_norm": 0.019091609865427017, + "learning_rate": 4.950595999496955e-07, + "loss": 0.002, + "step": 45474 + }, + { + "epoch": 0.90952, + "grad_norm": 0.007316984236240387, + "learning_rate": 4.946258138216153e-07, + "loss": 0.0003, + "step": 45476 + }, + { + "epoch": 0.90956, + "grad_norm": 0.38160473108291626, + "learning_rate": 4.941922130056998e-07, + "loss": 0.004, + "step": 45478 + }, + { + "epoch": 0.9096, + "grad_norm": 0.609358012676239, + "learning_rate": 4.937587975103997e-07, + "loss": 0.0064, + "step": 45480 + }, + { + "epoch": 0.90964, + "grad_norm": 0.17270773649215698, + "learning_rate": 4.933255673441683e-07, + "loss": 0.0031, + "step": 45482 + }, + { + "epoch": 0.90968, + "grad_norm": 0.002217050176113844, + "learning_rate": 4.928925225154468e-07, + "loss": 0.0003, + "step": 45484 + }, + { + "epoch": 0.90972, + "grad_norm": 0.06843740493059158, + "learning_rate": 4.924596630326817e-07, + "loss": 0.0018, + "step": 45486 + }, + { + "epoch": 0.90976, + "grad_norm": 0.23149941861629486, + "learning_rate": 4.920269889043094e-07, + "loss": 0.0025, + "step": 45488 + }, + { + "epoch": 0.9098, + "grad_norm": 0.2709139585494995, + "learning_rate": 4.915945001387668e-07, + "loss": 0.0048, + "step": 45490 + }, + { + "epoch": 0.90984, + "grad_norm": 0.0013649997999891639, + "learning_rate": 4.911621967444858e-07, + "loss": 0.0002, + "step": 45492 + }, + { + "epoch": 0.90988, + "grad_norm": 5.6811089515686035, + "learning_rate": 4.907300787298896e-07, + "loss": 0.0523, + "step": 45494 + }, + { + "epoch": 0.90992, + "grad_norm": 2.4204676151275635, + "learning_rate": 4.902981461034095e-07, + "loss": 0.0214, + "step": 45496 + }, + { + "epoch": 0.90996, + "grad_norm": 0.024129534140229225, + "learning_rate": 4.89866398873462e-07, + "loss": 0.0005, + "step": 45498 + }, + { + "epoch": 0.91, + "grad_norm": 0.0035975612699985504, + "learning_rate": 4.894348370484648e-07, + "loss": 0.0004, + "step": 45500 + }, + { + "epoch": 0.91004, + "grad_norm": 0.0007562413811683655, + "learning_rate": 4.890034606368321e-07, + "loss": 0.0, + "step": 45502 + }, + { + "epoch": 0.91008, + "grad_norm": 1.050803542137146, + "learning_rate": 4.88572269646973e-07, + "loss": 0.0134, + "step": 45504 + }, + { + "epoch": 0.91012, + "grad_norm": 0.005314392503350973, + "learning_rate": 4.881412640872941e-07, + "loss": 0.0004, + "step": 45506 + }, + { + "epoch": 0.91016, + "grad_norm": 0.001912024337798357, + "learning_rate": 4.877104439662006e-07, + "loss": 0.0002, + "step": 45508 + }, + { + "epoch": 0.9102, + "grad_norm": 0.03345148637890816, + "learning_rate": 4.872798092920871e-07, + "loss": 0.0005, + "step": 45510 + }, + { + "epoch": 0.91024, + "grad_norm": 0.015057292766869068, + "learning_rate": 4.868493600733515e-07, + "loss": 0.0058, + "step": 45512 + }, + { + "epoch": 0.91028, + "grad_norm": 0.014493574388325214, + "learning_rate": 4.864190963183868e-07, + "loss": 0.0083, + "step": 45514 + }, + { + "epoch": 0.91032, + "grad_norm": 0.18531756103038788, + "learning_rate": 4.859890180355776e-07, + "loss": 0.0031, + "step": 45516 + }, + { + "epoch": 0.91036, + "grad_norm": 1.0877020359039307, + "learning_rate": 4.855591252333125e-07, + "loss": 0.0118, + "step": 45518 + }, + { + "epoch": 0.9104, + "grad_norm": 0.08404012024402618, + "learning_rate": 4.851294179199673e-07, + "loss": 0.0011, + "step": 45520 + }, + { + "epoch": 0.91044, + "grad_norm": 0.01179815735667944, + "learning_rate": 4.846998961039262e-07, + "loss": 0.0003, + "step": 45522 + }, + { + "epoch": 0.91048, + "grad_norm": 0.0036745714023709297, + "learning_rate": 4.842705597935582e-07, + "loss": 0.001, + "step": 45524 + }, + { + "epoch": 0.91052, + "grad_norm": 0.026993626728653908, + "learning_rate": 4.838414089972343e-07, + "loss": 0.0005, + "step": 45526 + }, + { + "epoch": 0.91056, + "grad_norm": 0.0018055589171126485, + "learning_rate": 4.834124437233234e-07, + "loss": 0.0, + "step": 45528 + }, + { + "epoch": 0.9106, + "grad_norm": 0.009600551798939705, + "learning_rate": 4.829836639801844e-07, + "loss": 0.0011, + "step": 45530 + }, + { + "epoch": 0.91064, + "grad_norm": 0.07823600620031357, + "learning_rate": 4.825550697761794e-07, + "loss": 0.0008, + "step": 45532 + }, + { + "epoch": 0.91068, + "grad_norm": 0.020070483908057213, + "learning_rate": 4.821266611196618e-07, + "loss": 0.0016, + "step": 45534 + }, + { + "epoch": 0.91072, + "grad_norm": 0.016154177486896515, + "learning_rate": 4.81698438018986e-07, + "loss": 0.0018, + "step": 45536 + }, + { + "epoch": 0.91076, + "grad_norm": 0.008086562156677246, + "learning_rate": 4.812704004824998e-07, + "loss": 0.0017, + "step": 45538 + }, + { + "epoch": 0.9108, + "grad_norm": 0.024917790666222572, + "learning_rate": 4.808425485185486e-07, + "loss": 0.1265, + "step": 45540 + }, + { + "epoch": 0.91084, + "grad_norm": 0.1585368812084198, + "learning_rate": 4.804148821354715e-07, + "loss": 0.0017, + "step": 45542 + }, + { + "epoch": 0.91088, + "grad_norm": 0.024872751906514168, + "learning_rate": 4.799874013416072e-07, + "loss": 0.0074, + "step": 45544 + }, + { + "epoch": 0.91092, + "grad_norm": 0.06435281783342361, + "learning_rate": 4.795601061452903e-07, + "loss": 0.0011, + "step": 45546 + }, + { + "epoch": 0.91096, + "grad_norm": 0.6181953549385071, + "learning_rate": 4.791329965548486e-07, + "loss": 0.005, + "step": 45548 + }, + { + "epoch": 0.911, + "grad_norm": 0.1183336079120636, + "learning_rate": 4.787060725786141e-07, + "loss": 0.0021, + "step": 45550 + }, + { + "epoch": 0.91104, + "grad_norm": 0.03640351444482803, + "learning_rate": 4.782793342249037e-07, + "loss": 0.0042, + "step": 45552 + }, + { + "epoch": 0.91108, + "grad_norm": 2.178490400314331, + "learning_rate": 4.778527815020418e-07, + "loss": 0.0247, + "step": 45554 + }, + { + "epoch": 0.91112, + "grad_norm": 1.7265369892120361, + "learning_rate": 4.774264144183416e-07, + "loss": 0.0156, + "step": 45556 + }, + { + "epoch": 0.91116, + "grad_norm": 0.07742492109537125, + "learning_rate": 4.770002329821155e-07, + "loss": 0.0013, + "step": 45558 + }, + { + "epoch": 0.9112, + "grad_norm": 0.09603019058704376, + "learning_rate": 4.765742372016735e-07, + "loss": 0.0013, + "step": 45560 + }, + { + "epoch": 0.91124, + "grad_norm": 0.01110874954611063, + "learning_rate": 4.761484270853178e-07, + "loss": 0.0008, + "step": 45562 + }, + { + "epoch": 0.91128, + "grad_norm": 0.023722775280475616, + "learning_rate": 4.7572280264135405e-07, + "loss": 0.0033, + "step": 45564 + }, + { + "epoch": 0.91132, + "grad_norm": 0.004278185777366161, + "learning_rate": 4.752973638780767e-07, + "loss": 0.0, + "step": 45566 + }, + { + "epoch": 0.91136, + "grad_norm": 0.058697089552879333, + "learning_rate": 4.7487211080378127e-07, + "loss": 0.0007, + "step": 45568 + }, + { + "epoch": 0.9114, + "grad_norm": 0.03618575632572174, + "learning_rate": 4.7444704342675673e-07, + "loss": 0.0048, + "step": 45570 + }, + { + "epoch": 0.91144, + "grad_norm": 0.07416889071464539, + "learning_rate": 4.7402216175529315e-07, + "loss": 0.0011, + "step": 45572 + }, + { + "epoch": 0.91148, + "grad_norm": 0.01939447596669197, + "learning_rate": 4.735974657976694e-07, + "loss": 0.0002, + "step": 45574 + }, + { + "epoch": 0.91152, + "grad_norm": 0.0050792936235666275, + "learning_rate": 4.7317295556217e-07, + "loss": 0.0003, + "step": 45576 + }, + { + "epoch": 0.91156, + "grad_norm": 0.35267579555511475, + "learning_rate": 4.7274863105706726e-07, + "loss": 0.0051, + "step": 45578 + }, + { + "epoch": 0.9116, + "grad_norm": 0.07717102020978928, + "learning_rate": 4.723244922906356e-07, + "loss": 0.0025, + "step": 45580 + }, + { + "epoch": 0.91164, + "grad_norm": 0.026585914194583893, + "learning_rate": 4.719005392711429e-07, + "loss": 0.0033, + "step": 45582 + }, + { + "epoch": 0.91168, + "grad_norm": 0.016396932303905487, + "learning_rate": 4.714767720068536e-07, + "loss": 0.0008, + "step": 45584 + }, + { + "epoch": 0.91172, + "grad_norm": 0.5562829375267029, + "learning_rate": 4.7105319050603114e-07, + "loss": 0.004, + "step": 45586 + }, + { + "epoch": 0.91176, + "grad_norm": 0.8467613458633423, + "learning_rate": 4.7062979477693226e-07, + "loss": 0.0064, + "step": 45588 + }, + { + "epoch": 0.9118, + "grad_norm": 0.06621099263429642, + "learning_rate": 4.702065848278126e-07, + "loss": 0.0006, + "step": 45590 + }, + { + "epoch": 0.91184, + "grad_norm": 0.02524716593325138, + "learning_rate": 4.6978356066691987e-07, + "loss": 0.1761, + "step": 45592 + }, + { + "epoch": 0.91188, + "grad_norm": 1.3254956007003784, + "learning_rate": 4.693607223025043e-07, + "loss": 0.0151, + "step": 45594 + }, + { + "epoch": 0.91192, + "grad_norm": 0.047865405678749084, + "learning_rate": 4.689380697428092e-07, + "loss": 0.005, + "step": 45596 + }, + { + "epoch": 0.91196, + "grad_norm": 0.007613718044012785, + "learning_rate": 4.6851560299607247e-07, + "loss": 0.0001, + "step": 45598 + }, + { + "epoch": 0.912, + "grad_norm": 0.010219083167612553, + "learning_rate": 4.6809332207053083e-07, + "loss": 0.0002, + "step": 45600 + }, + { + "epoch": 0.91204, + "grad_norm": 0.030519481748342514, + "learning_rate": 4.676712269744166e-07, + "loss": 0.0003, + "step": 45602 + }, + { + "epoch": 0.91208, + "grad_norm": 0.4623255133628845, + "learning_rate": 4.672493177159609e-07, + "loss": 0.0206, + "step": 45604 + }, + { + "epoch": 0.91212, + "grad_norm": 0.07685395330190659, + "learning_rate": 4.668275943033851e-07, + "loss": 0.001, + "step": 45606 + }, + { + "epoch": 0.91216, + "grad_norm": 0.006082004867494106, + "learning_rate": 4.664060567449169e-07, + "loss": 0.0002, + "step": 45608 + }, + { + "epoch": 0.9122, + "grad_norm": 0.007091447710990906, + "learning_rate": 4.659847050487687e-07, + "loss": 0.0006, + "step": 45610 + }, + { + "epoch": 0.91224, + "grad_norm": 0.441413551568985, + "learning_rate": 4.655635392231572e-07, + "loss": 0.0041, + "step": 45612 + }, + { + "epoch": 0.91228, + "grad_norm": 0.1580965518951416, + "learning_rate": 4.651425592762948e-07, + "loss": 0.0036, + "step": 45614 + }, + { + "epoch": 0.91232, + "grad_norm": 0.10012742131948471, + "learning_rate": 4.6472176521638377e-07, + "loss": 0.0017, + "step": 45616 + }, + { + "epoch": 0.91236, + "grad_norm": 0.18976612389087677, + "learning_rate": 4.6430115705163424e-07, + "loss": 0.0024, + "step": 45618 + }, + { + "epoch": 0.9124, + "grad_norm": 1.7313634157180786, + "learning_rate": 4.638807347902408e-07, + "loss": 0.0215, + "step": 45620 + }, + { + "epoch": 0.91244, + "grad_norm": 0.03253389522433281, + "learning_rate": 4.634604984404023e-07, + "loss": 0.0042, + "step": 45622 + }, + { + "epoch": 0.91248, + "grad_norm": 0.038686975836753845, + "learning_rate": 4.6304044801031123e-07, + "loss": 0.0005, + "step": 45624 + }, + { + "epoch": 0.91252, + "grad_norm": 0.004635222721844912, + "learning_rate": 4.6262058350815766e-07, + "loss": 0.0, + "step": 45626 + }, + { + "epoch": 0.91256, + "grad_norm": 0.02133043482899666, + "learning_rate": 4.622009049421261e-07, + "loss": 0.0015, + "step": 45628 + }, + { + "epoch": 0.9126, + "grad_norm": 0.01671716384589672, + "learning_rate": 4.6178141232039676e-07, + "loss": 0.0002, + "step": 45630 + }, + { + "epoch": 0.91264, + "grad_norm": 0.45831993222236633, + "learning_rate": 4.613621056511508e-07, + "loss": 0.0067, + "step": 45632 + }, + { + "epoch": 0.91268, + "grad_norm": 0.007265306543558836, + "learning_rate": 4.609429849425595e-07, + "loss": 0.0024, + "step": 45634 + }, + { + "epoch": 0.91272, + "grad_norm": 0.04142669215798378, + "learning_rate": 4.6052405020279856e-07, + "loss": 0.0005, + "step": 45636 + }, + { + "epoch": 0.91276, + "grad_norm": 0.00901731662452221, + "learning_rate": 4.601053014400292e-07, + "loss": 0.0001, + "step": 45638 + }, + { + "epoch": 0.9128, + "grad_norm": 21.05182456970215, + "learning_rate": 4.596867386624215e-07, + "loss": 0.8349, + "step": 45640 + }, + { + "epoch": 0.91284, + "grad_norm": 0.019556045532226562, + "learning_rate": 4.5926836187812905e-07, + "loss": 0.0001, + "step": 45642 + }, + { + "epoch": 0.91288, + "grad_norm": 0.020928578451275826, + "learning_rate": 4.588501710953153e-07, + "loss": 0.0004, + "step": 45644 + }, + { + "epoch": 0.91292, + "grad_norm": 0.04590655490756035, + "learning_rate": 4.5843216632212804e-07, + "loss": 0.0012, + "step": 45646 + }, + { + "epoch": 0.91296, + "grad_norm": 0.030268749222159386, + "learning_rate": 4.580143475667176e-07, + "loss": 0.0066, + "step": 45648 + }, + { + "epoch": 0.913, + "grad_norm": 0.138286292552948, + "learning_rate": 4.575967148372318e-07, + "loss": 0.0027, + "step": 45650 + }, + { + "epoch": 0.91304, + "grad_norm": 0.1781550645828247, + "learning_rate": 4.571792681418097e-07, + "loss": 0.0028, + "step": 45652 + }, + { + "epoch": 0.91308, + "grad_norm": 0.1462535411119461, + "learning_rate": 4.5676200748859036e-07, + "loss": 0.0047, + "step": 45654 + }, + { + "epoch": 0.91312, + "grad_norm": 0.03916283696889877, + "learning_rate": 4.5634493288570944e-07, + "loss": 0.0005, + "step": 45656 + }, + { + "epoch": 0.91316, + "grad_norm": 0.12317857891321182, + "learning_rate": 4.5592804434129613e-07, + "loss": 0.0021, + "step": 45658 + }, + { + "epoch": 0.9132, + "grad_norm": 0.11090940982103348, + "learning_rate": 4.5551134186348045e-07, + "loss": 0.0016, + "step": 45660 + }, + { + "epoch": 0.91324, + "grad_norm": 0.014522609300911427, + "learning_rate": 4.55094825460386e-07, + "loss": 0.0014, + "step": 45662 + }, + { + "epoch": 0.91328, + "grad_norm": 0.0006729814340360463, + "learning_rate": 4.5467849514013063e-07, + "loss": 0.0004, + "step": 45664 + }, + { + "epoch": 0.91332, + "grad_norm": 0.03220541030168533, + "learning_rate": 4.5426235091083236e-07, + "loss": 0.0025, + "step": 45666 + }, + { + "epoch": 0.91336, + "grad_norm": 0.05177832022309303, + "learning_rate": 4.538463927806036e-07, + "loss": 0.0005, + "step": 45668 + }, + { + "epoch": 0.9134, + "grad_norm": 0.023404328152537346, + "learning_rate": 4.534306207575545e-07, + "loss": 0.0033, + "step": 45670 + }, + { + "epoch": 0.91344, + "grad_norm": 0.033049389719963074, + "learning_rate": 4.5301503484979083e-07, + "loss": 0.0011, + "step": 45672 + }, + { + "epoch": 0.91348, + "grad_norm": 0.030476344749331474, + "learning_rate": 4.525996350654127e-07, + "loss": 0.012, + "step": 45674 + }, + { + "epoch": 0.91352, + "grad_norm": 0.09380701929330826, + "learning_rate": 4.5218442141252263e-07, + "loss": 0.6093, + "step": 45676 + }, + { + "epoch": 0.91356, + "grad_norm": 0.0379021055996418, + "learning_rate": 4.517693938992107e-07, + "loss": 0.0008, + "step": 45678 + }, + { + "epoch": 0.9136, + "grad_norm": 0.12923119962215424, + "learning_rate": 4.5135455253357053e-07, + "loss": 0.0037, + "step": 45680 + }, + { + "epoch": 0.91364, + "grad_norm": 0.17233781516551971, + "learning_rate": 4.5093989732369114e-07, + "loss": 0.3021, + "step": 45682 + }, + { + "epoch": 0.91368, + "grad_norm": 0.07991618663072586, + "learning_rate": 4.5052542827765055e-07, + "loss": 0.0124, + "step": 45684 + }, + { + "epoch": 0.91372, + "grad_norm": 0.08085878938436508, + "learning_rate": 4.5011114540353674e-07, + "loss": 0.0021, + "step": 45686 + }, + { + "epoch": 0.91376, + "grad_norm": 0.05671398714184761, + "learning_rate": 4.496970487094221e-07, + "loss": 0.0014, + "step": 45688 + }, + { + "epoch": 0.9138, + "grad_norm": 3.1163136959075928, + "learning_rate": 4.492831382033791e-07, + "loss": 0.034, + "step": 45690 + }, + { + "epoch": 0.91384, + "grad_norm": 0.052032019942998886, + "learning_rate": 4.4886941389347906e-07, + "loss": 0.0028, + "step": 45692 + }, + { + "epoch": 0.91388, + "grad_norm": 0.016421576961874962, + "learning_rate": 4.484558757877888e-07, + "loss": 0.0151, + "step": 45694 + }, + { + "epoch": 0.91392, + "grad_norm": 0.12248552590608597, + "learning_rate": 4.4804252389436645e-07, + "loss": 0.9184, + "step": 45696 + }, + { + "epoch": 0.91396, + "grad_norm": 0.0732320100069046, + "learning_rate": 4.4762935822127316e-07, + "loss": 0.0007, + "step": 45698 + }, + { + "epoch": 0.914, + "grad_norm": 0.42603200674057007, + "learning_rate": 4.4721637877656377e-07, + "loss": 0.0044, + "step": 45700 + }, + { + "epoch": 0.91404, + "grad_norm": 0.520805835723877, + "learning_rate": 4.468035855682884e-07, + "loss": 0.0173, + "step": 45702 + }, + { + "epoch": 0.91408, + "grad_norm": 0.10667315125465393, + "learning_rate": 4.463909786044973e-07, + "loss": 0.0016, + "step": 45704 + }, + { + "epoch": 0.91412, + "grad_norm": 0.03865272179245949, + "learning_rate": 4.4597855789322963e-07, + "loss": 0.0005, + "step": 45706 + }, + { + "epoch": 0.91416, + "grad_norm": 0.008268434554338455, + "learning_rate": 4.4556632344253225e-07, + "loss": 0.0003, + "step": 45708 + }, + { + "epoch": 0.9142, + "grad_norm": 5.635179042816162, + "learning_rate": 4.451542752604365e-07, + "loss": 0.0583, + "step": 45710 + }, + { + "epoch": 0.91424, + "grad_norm": 1.0054491758346558, + "learning_rate": 4.447424133549771e-07, + "loss": 0.0094, + "step": 45712 + }, + { + "epoch": 0.91428, + "grad_norm": 0.007673680782318115, + "learning_rate": 4.443307377341832e-07, + "loss": 0.0003, + "step": 45714 + }, + { + "epoch": 0.91432, + "grad_norm": 0.28264012932777405, + "learning_rate": 4.4391924840608167e-07, + "loss": 0.0032, + "step": 45716 + }, + { + "epoch": 0.91436, + "grad_norm": 0.2133650928735733, + "learning_rate": 4.4350794537869614e-07, + "loss": 0.0017, + "step": 45718 + }, + { + "epoch": 0.9144, + "grad_norm": 0.0042638350278139114, + "learning_rate": 4.4309682866004124e-07, + "loss": 0.0004, + "step": 45720 + }, + { + "epoch": 0.91444, + "grad_norm": 0.01543044950813055, + "learning_rate": 4.426858982581339e-07, + "loss": 0.0005, + "step": 45722 + }, + { + "epoch": 0.91448, + "grad_norm": 0.016161395236849785, + "learning_rate": 4.4227515418098554e-07, + "loss": 0.0009, + "step": 45724 + }, + { + "epoch": 0.91452, + "grad_norm": 0.039571017026901245, + "learning_rate": 4.4186459643660526e-07, + "loss": 0.0034, + "step": 45726 + }, + { + "epoch": 0.91456, + "grad_norm": 0.11531388014554977, + "learning_rate": 4.414542250329923e-07, + "loss": 0.0032, + "step": 45728 + }, + { + "epoch": 0.9146, + "grad_norm": 0.0033759032376110554, + "learning_rate": 4.4104403997815346e-07, + "loss": 0.0189, + "step": 45730 + }, + { + "epoch": 0.91464, + "grad_norm": 0.5047092437744141, + "learning_rate": 4.4063404128008133e-07, + "loss": 0.006, + "step": 45732 + }, + { + "epoch": 0.91468, + "grad_norm": 0.013742920942604542, + "learning_rate": 4.4022422894676953e-07, + "loss": 0.0002, + "step": 45734 + }, + { + "epoch": 0.91472, + "grad_norm": 0.07432834804058075, + "learning_rate": 4.398146029862094e-07, + "loss": 0.0381, + "step": 45736 + }, + { + "epoch": 0.91476, + "grad_norm": 0.022235097363591194, + "learning_rate": 4.394051634063823e-07, + "loss": 0.0011, + "step": 45738 + }, + { + "epoch": 0.9148, + "grad_norm": 0.12750479578971863, + "learning_rate": 4.3899591021527743e-07, + "loss": 0.5493, + "step": 45740 + }, + { + "epoch": 0.91484, + "grad_norm": 0.00787275843322277, + "learning_rate": 4.385868434208673e-07, + "loss": 0.0001, + "step": 45742 + }, + { + "epoch": 0.91488, + "grad_norm": 0.0020663850009441376, + "learning_rate": 4.3817796303113004e-07, + "loss": 0.0167, + "step": 45744 + }, + { + "epoch": 0.91492, + "grad_norm": 0.02649892307817936, + "learning_rate": 4.377692690540358e-07, + "loss": 0.0065, + "step": 45746 + }, + { + "epoch": 0.91496, + "grad_norm": 0.028146883472800255, + "learning_rate": 4.373607614975528e-07, + "loss": 0.0004, + "step": 45748 + }, + { + "epoch": 0.915, + "grad_norm": 0.07928602397441864, + "learning_rate": 4.3695244036964567e-07, + "loss": 0.0088, + "step": 45750 + }, + { + "epoch": 0.91504, + "grad_norm": 0.04611869156360626, + "learning_rate": 4.3654430567827367e-07, + "loss": 0.0005, + "step": 45752 + }, + { + "epoch": 0.91508, + "grad_norm": 0.23784436285495758, + "learning_rate": 4.3613635743139373e-07, + "loss": 0.0024, + "step": 45754 + }, + { + "epoch": 0.91512, + "grad_norm": 0.0023583294823765755, + "learning_rate": 4.3572859563695944e-07, + "loss": 0.0017, + "step": 45756 + }, + { + "epoch": 0.91516, + "grad_norm": 0.37004873156547546, + "learning_rate": 4.353210203029212e-07, + "loss": 0.0045, + "step": 45758 + }, + { + "epoch": 0.9152, + "grad_norm": 0.024126458913087845, + "learning_rate": 4.349136314372204e-07, + "loss": 0.0006, + "step": 45760 + }, + { + "epoch": 0.91524, + "grad_norm": 0.057348109781742096, + "learning_rate": 4.345064290478063e-07, + "loss": 0.0068, + "step": 45762 + }, + { + "epoch": 0.91528, + "grad_norm": 0.17272500693798065, + "learning_rate": 4.340994131426135e-07, + "loss": 0.0013, + "step": 45764 + }, + { + "epoch": 0.91532, + "grad_norm": 0.03638453036546707, + "learning_rate": 4.336925837295769e-07, + "loss": 0.0007, + "step": 45766 + }, + { + "epoch": 0.91536, + "grad_norm": 20.48628044128418, + "learning_rate": 4.332859408166279e-07, + "loss": 1.2148, + "step": 45768 + }, + { + "epoch": 0.9154, + "grad_norm": 0.2938521206378937, + "learning_rate": 4.3287948441169457e-07, + "loss": 0.0039, + "step": 45770 + }, + { + "epoch": 0.91544, + "grad_norm": 0.007384158205240965, + "learning_rate": 4.3247321452270395e-07, + "loss": 0.0011, + "step": 45772 + }, + { + "epoch": 0.91548, + "grad_norm": 0.07944093644618988, + "learning_rate": 4.3206713115756973e-07, + "loss": 0.0033, + "step": 45774 + }, + { + "epoch": 0.91552, + "grad_norm": 0.30237051844596863, + "learning_rate": 4.316612343242155e-07, + "loss": 0.0058, + "step": 45776 + }, + { + "epoch": 0.91556, + "grad_norm": 0.29010751843452454, + "learning_rate": 4.312555240305505e-07, + "loss": 0.0038, + "step": 45778 + }, + { + "epoch": 0.9156, + "grad_norm": 0.34638193249702454, + "learning_rate": 4.308500002844862e-07, + "loss": 0.003, + "step": 45780 + }, + { + "epoch": 0.91564, + "grad_norm": 0.002195166191086173, + "learning_rate": 4.304446630939263e-07, + "loss": 0.0006, + "step": 45782 + }, + { + "epoch": 0.91568, + "grad_norm": 0.0016053443541750312, + "learning_rate": 4.3003951246677665e-07, + "loss": 0.0009, + "step": 45784 + }, + { + "epoch": 0.91572, + "grad_norm": 0.02574184350669384, + "learning_rate": 4.296345484109321e-07, + "loss": 0.001, + "step": 45786 + }, + { + "epoch": 0.91576, + "grad_norm": 0.033975303173065186, + "learning_rate": 4.292297709342885e-07, + "loss": 0.0004, + "step": 45788 + }, + { + "epoch": 0.9158, + "grad_norm": 0.021741919219493866, + "learning_rate": 4.288251800447385e-07, + "loss": 0.0012, + "step": 45790 + }, + { + "epoch": 0.91584, + "grad_norm": 0.04053414985537529, + "learning_rate": 4.2842077575017015e-07, + "loss": 0.0014, + "step": 45792 + }, + { + "epoch": 0.91588, + "grad_norm": 0.2205086052417755, + "learning_rate": 4.280165580584661e-07, + "loss": 0.0024, + "step": 45794 + }, + { + "epoch": 0.91592, + "grad_norm": 0.050063714385032654, + "learning_rate": 4.2761252697750557e-07, + "loss": 0.0019, + "step": 45796 + }, + { + "epoch": 0.91596, + "grad_norm": 1.2593461275100708, + "learning_rate": 4.2720868251517e-07, + "loss": 0.0266, + "step": 45798 + }, + { + "epoch": 0.916, + "grad_norm": 0.05665284022688866, + "learning_rate": 4.268050246793276e-07, + "loss": 0.0056, + "step": 45800 + }, + { + "epoch": 0.91604, + "grad_norm": 0.15340556204319, + "learning_rate": 4.2640155347784985e-07, + "loss": 0.0017, + "step": 45802 + }, + { + "epoch": 0.91608, + "grad_norm": 0.012025482021272182, + "learning_rate": 4.2599826891860484e-07, + "loss": 0.0213, + "step": 45804 + }, + { + "epoch": 0.91612, + "grad_norm": 0.0020335048902779818, + "learning_rate": 4.255951710094486e-07, + "loss": 0.0004, + "step": 45806 + }, + { + "epoch": 0.91616, + "grad_norm": 0.02044866606593132, + "learning_rate": 4.2519225975824695e-07, + "loss": 0.0006, + "step": 45808 + }, + { + "epoch": 0.9162, + "grad_norm": 3.689035415649414, + "learning_rate": 4.247895351728504e-07, + "loss": 0.0427, + "step": 45810 + }, + { + "epoch": 0.91624, + "grad_norm": 0.003971248399466276, + "learning_rate": 4.243869972611114e-07, + "loss": 0.0003, + "step": 45812 + }, + { + "epoch": 0.91628, + "grad_norm": 0.018061859533190727, + "learning_rate": 4.239846460308783e-07, + "loss": 0.0003, + "step": 45814 + }, + { + "epoch": 0.91632, + "grad_norm": 0.010744895786046982, + "learning_rate": 4.235824814899958e-07, + "loss": 0.0003, + "step": 45816 + }, + { + "epoch": 0.91636, + "grad_norm": 14.371161460876465, + "learning_rate": 4.2318050364630105e-07, + "loss": 0.1277, + "step": 45818 + }, + { + "epoch": 0.9164, + "grad_norm": 0.5646331906318665, + "learning_rate": 4.2277871250763327e-07, + "loss": 0.0058, + "step": 45820 + }, + { + "epoch": 0.91644, + "grad_norm": 0.07749311625957489, + "learning_rate": 4.2237710808182507e-07, + "loss": 0.0013, + "step": 45822 + }, + { + "epoch": 0.91648, + "grad_norm": 0.024758046492934227, + "learning_rate": 4.2197569037670584e-07, + "loss": 0.0003, + "step": 45824 + }, + { + "epoch": 0.91652, + "grad_norm": 0.00478333281353116, + "learning_rate": 4.2157445940010365e-07, + "loss": 0.0004, + "step": 45826 + }, + { + "epoch": 0.91656, + "grad_norm": 0.0044906302355229855, + "learning_rate": 4.2117341515983565e-07, + "loss": 0.0003, + "step": 45828 + }, + { + "epoch": 0.9166, + "grad_norm": 0.1043848767876625, + "learning_rate": 4.207725576637256e-07, + "loss": 0.0026, + "step": 45830 + }, + { + "epoch": 0.91664, + "grad_norm": 0.24415051937103271, + "learning_rate": 4.203718869195861e-07, + "loss": 0.0025, + "step": 45832 + }, + { + "epoch": 0.91668, + "grad_norm": 0.004227447789162397, + "learning_rate": 4.199714029352275e-07, + "loss": 0.0008, + "step": 45834 + }, + { + "epoch": 0.91672, + "grad_norm": 0.008789680898189545, + "learning_rate": 4.1957110571845925e-07, + "loss": 0.0004, + "step": 45836 + }, + { + "epoch": 0.91676, + "grad_norm": 0.23966343700885773, + "learning_rate": 4.1917099527708393e-07, + "loss": 0.0026, + "step": 45838 + }, + { + "epoch": 0.9168, + "grad_norm": 0.04547624662518501, + "learning_rate": 4.1877107161890416e-07, + "loss": 0.0005, + "step": 45840 + }, + { + "epoch": 0.91684, + "grad_norm": 0.07707823067903519, + "learning_rate": 4.1837133475171376e-07, + "loss": 0.0007, + "step": 45842 + }, + { + "epoch": 0.91688, + "grad_norm": 7.147840976715088, + "learning_rate": 4.1797178468330756e-07, + "loss": 0.0525, + "step": 45844 + }, + { + "epoch": 0.91692, + "grad_norm": 0.08570300042629242, + "learning_rate": 4.175724214214749e-07, + "loss": 0.0011, + "step": 45846 + }, + { + "epoch": 0.91696, + "grad_norm": 0.08119997382164001, + "learning_rate": 4.171732449740018e-07, + "loss": 0.0071, + "step": 45848 + }, + { + "epoch": 0.917, + "grad_norm": 0.031224077567458153, + "learning_rate": 4.167742553486676e-07, + "loss": 0.0013, + "step": 45850 + }, + { + "epoch": 0.91704, + "grad_norm": 0.010749456472694874, + "learning_rate": 4.1637545255325486e-07, + "loss": 0.0004, + "step": 45852 + }, + { + "epoch": 0.91708, + "grad_norm": 0.07802475988864899, + "learning_rate": 4.159768365955363e-07, + "loss": 0.0124, + "step": 45854 + }, + { + "epoch": 0.91712, + "grad_norm": 0.005230088252574205, + "learning_rate": 4.155784074832836e-07, + "loss": 0.0003, + "step": 45856 + }, + { + "epoch": 0.91716, + "grad_norm": 0.0026423679664731026, + "learning_rate": 4.1518016522426484e-07, + "loss": 0.0017, + "step": 45858 + }, + { + "epoch": 0.9172, + "grad_norm": 0.05646779388189316, + "learning_rate": 4.1478210982624055e-07, + "loss": 0.0011, + "step": 45860 + }, + { + "epoch": 0.91724, + "grad_norm": 0.001857804716564715, + "learning_rate": 4.1438424129697675e-07, + "loss": 0.0003, + "step": 45862 + }, + { + "epoch": 0.91728, + "grad_norm": 0.013236061669886112, + "learning_rate": 4.139865596442261e-07, + "loss": 0.0007, + "step": 45864 + }, + { + "epoch": 0.91732, + "grad_norm": 0.006350190378725529, + "learning_rate": 4.135890648757435e-07, + "loss": 0.0004, + "step": 45866 + }, + { + "epoch": 0.91736, + "grad_norm": 0.06105455011129379, + "learning_rate": 4.1319175699927603e-07, + "loss": 0.0006, + "step": 45868 + }, + { + "epoch": 0.9174, + "grad_norm": 3.35213303565979, + "learning_rate": 4.1279463602257207e-07, + "loss": 0.0238, + "step": 45870 + }, + { + "epoch": 0.91744, + "grad_norm": 0.012918034568428993, + "learning_rate": 4.1239770195337315e-07, + "loss": 0.0002, + "step": 45872 + }, + { + "epoch": 0.91748, + "grad_norm": 0.07345303893089294, + "learning_rate": 4.1200095479941525e-07, + "loss": 0.0047, + "step": 45874 + }, + { + "epoch": 0.91752, + "grad_norm": 1.9904526472091675, + "learning_rate": 4.116043945684356e-07, + "loss": 0.0252, + "step": 45876 + }, + { + "epoch": 0.91756, + "grad_norm": 0.0007041185162961483, + "learning_rate": 4.112080212681646e-07, + "loss": 0.0003, + "step": 45878 + }, + { + "epoch": 0.9176, + "grad_norm": 0.07169447094202042, + "learning_rate": 4.108118349063306e-07, + "loss": 0.001, + "step": 45880 + }, + { + "epoch": 0.91764, + "grad_norm": 0.04735250025987625, + "learning_rate": 4.1041583549065513e-07, + "loss": 0.0085, + "step": 45882 + }, + { + "epoch": 0.91768, + "grad_norm": 0.16719195246696472, + "learning_rate": 4.100200230288631e-07, + "loss": 0.0018, + "step": 45884 + }, + { + "epoch": 0.91772, + "grad_norm": 0.07024604082107544, + "learning_rate": 4.0962439752866513e-07, + "loss": 0.0081, + "step": 45886 + }, + { + "epoch": 0.91776, + "grad_norm": 0.026847148314118385, + "learning_rate": 4.0922895899777823e-07, + "loss": 0.0005, + "step": 45888 + }, + { + "epoch": 0.9178, + "grad_norm": 0.008259918540716171, + "learning_rate": 4.0883370744390973e-07, + "loss": 0.1156, + "step": 45890 + }, + { + "epoch": 0.91784, + "grad_norm": 0.22398100793361664, + "learning_rate": 4.084386428747655e-07, + "loss": 0.0026, + "step": 45892 + }, + { + "epoch": 0.91788, + "grad_norm": 0.04109858348965645, + "learning_rate": 4.0804376529805066e-07, + "loss": 0.001, + "step": 45894 + }, + { + "epoch": 0.91792, + "grad_norm": 0.002052825875580311, + "learning_rate": 4.0764907472145677e-07, + "loss": 0.0034, + "step": 45896 + }, + { + "epoch": 0.91796, + "grad_norm": 0.2535124123096466, + "learning_rate": 4.0725457115268654e-07, + "loss": 0.0041, + "step": 45898 + }, + { + "epoch": 0.918, + "grad_norm": 0.06350277364253998, + "learning_rate": 4.068602545994249e-07, + "loss": 0.0036, + "step": 45900 + }, + { + "epoch": 0.91804, + "grad_norm": 0.20438151061534882, + "learning_rate": 4.0646612506936245e-07, + "loss": 0.5705, + "step": 45902 + }, + { + "epoch": 0.91808, + "grad_norm": 0.03400660306215286, + "learning_rate": 4.0607218257018077e-07, + "loss": 0.0019, + "step": 45904 + }, + { + "epoch": 0.91812, + "grad_norm": 0.02560708113014698, + "learning_rate": 4.0567842710956374e-07, + "loss": 0.0005, + "step": 45906 + }, + { + "epoch": 0.91816, + "grad_norm": 0.03982136398553848, + "learning_rate": 4.0528485869518295e-07, + "loss": 0.001, + "step": 45908 + }, + { + "epoch": 0.9182, + "grad_norm": 0.3059442639350891, + "learning_rate": 4.0489147733471347e-07, + "loss": 0.0054, + "step": 45910 + }, + { + "epoch": 0.91824, + "grad_norm": 0.03397591412067413, + "learning_rate": 4.044982830358257e-07, + "loss": 0.0003, + "step": 45912 + }, + { + "epoch": 0.91828, + "grad_norm": 0.004802675452083349, + "learning_rate": 4.041052758061825e-07, + "loss": 0.0006, + "step": 45914 + }, + { + "epoch": 0.91832, + "grad_norm": 0.025368118658661842, + "learning_rate": 4.0371245565344994e-07, + "loss": 0.0008, + "step": 45916 + }, + { + "epoch": 0.91836, + "grad_norm": 0.11478802561759949, + "learning_rate": 4.033198225852797e-07, + "loss": 0.0014, + "step": 45918 + }, + { + "epoch": 0.9184, + "grad_norm": 0.03436170890927315, + "learning_rate": 4.0292737660933335e-07, + "loss": 0.1898, + "step": 45920 + }, + { + "epoch": 0.91844, + "grad_norm": 0.02123062126338482, + "learning_rate": 4.0253511773325813e-07, + "loss": 0.0016, + "step": 45922 + }, + { + "epoch": 0.91848, + "grad_norm": 0.14470374584197998, + "learning_rate": 4.0214304596470134e-07, + "loss": 0.0017, + "step": 45924 + }, + { + "epoch": 0.91852, + "grad_norm": 0.5892335772514343, + "learning_rate": 4.017511613113079e-07, + "loss": 0.0058, + "step": 45926 + }, + { + "epoch": 0.91856, + "grad_norm": 0.038561590015888214, + "learning_rate": 4.01359463780715e-07, + "loss": 0.0011, + "step": 45928 + }, + { + "epoch": 0.9186, + "grad_norm": 0.002602650783956051, + "learning_rate": 4.009679533805633e-07, + "loss": 0.0002, + "step": 45930 + }, + { + "epoch": 0.91864, + "grad_norm": 0.01422242820262909, + "learning_rate": 4.0057663011848324e-07, + "loss": 0.0002, + "step": 45932 + }, + { + "epoch": 0.91868, + "grad_norm": 0.40952742099761963, + "learning_rate": 4.001854940021022e-07, + "loss": 0.0036, + "step": 45934 + }, + { + "epoch": 0.91872, + "grad_norm": 0.06578662246465683, + "learning_rate": 3.9979454503904836e-07, + "loss": 0.001, + "step": 45936 + }, + { + "epoch": 0.91876, + "grad_norm": 0.010061727836728096, + "learning_rate": 3.994037832369435e-07, + "loss": 0.0001, + "step": 45938 + }, + { + "epoch": 0.9188, + "grad_norm": 0.003866961458697915, + "learning_rate": 3.990132086034026e-07, + "loss": 0.0002, + "step": 45940 + }, + { + "epoch": 0.91884, + "grad_norm": 0.0654391348361969, + "learning_rate": 3.98622821146043e-07, + "loss": 0.0011, + "step": 45942 + }, + { + "epoch": 0.91888, + "grad_norm": 0.02480776607990265, + "learning_rate": 3.9823262087247407e-07, + "loss": 0.0003, + "step": 45944 + }, + { + "epoch": 0.91892, + "grad_norm": 0.007490887772291899, + "learning_rate": 3.9784260779030413e-07, + "loss": 0.0004, + "step": 45946 + }, + { + "epoch": 0.91896, + "grad_norm": 0.054156988859176636, + "learning_rate": 3.974527819071372e-07, + "loss": 0.0018, + "step": 45948 + }, + { + "epoch": 0.919, + "grad_norm": 0.0581778958439827, + "learning_rate": 3.9706314323056936e-07, + "loss": 0.0474, + "step": 45950 + }, + { + "epoch": 0.91904, + "grad_norm": 0.0642884224653244, + "learning_rate": 3.9667369176820124e-07, + "loss": 0.002, + "step": 45952 + }, + { + "epoch": 0.91908, + "grad_norm": 0.036759331822395325, + "learning_rate": 3.962844275276234e-07, + "loss": 0.0012, + "step": 45954 + }, + { + "epoch": 0.91912, + "grad_norm": 0.0050952802412211895, + "learning_rate": 3.9589535051642425e-07, + "loss": 0.0002, + "step": 45956 + }, + { + "epoch": 0.91916, + "grad_norm": 0.011782840825617313, + "learning_rate": 3.95506460742191e-07, + "loss": 0.0051, + "step": 45958 + }, + { + "epoch": 0.9192, + "grad_norm": 0.13369207084178925, + "learning_rate": 3.9511775821250206e-07, + "loss": 0.002, + "step": 45960 + }, + { + "epoch": 0.91924, + "grad_norm": 0.1684371680021286, + "learning_rate": 3.947292429349403e-07, + "loss": 0.0029, + "step": 45962 + }, + { + "epoch": 0.91928, + "grad_norm": 0.23935022950172424, + "learning_rate": 3.9434091491707516e-07, + "loss": 0.0022, + "step": 45964 + }, + { + "epoch": 0.91932, + "grad_norm": 0.07203440368175507, + "learning_rate": 3.9395277416647835e-07, + "loss": 0.0008, + "step": 45966 + }, + { + "epoch": 0.91936, + "grad_norm": 0.12072347104549408, + "learning_rate": 3.935648206907194e-07, + "loss": 0.0178, + "step": 45968 + }, + { + "epoch": 0.9194, + "grad_norm": 0.5319276452064514, + "learning_rate": 3.931770544973601e-07, + "loss": 0.0046, + "step": 45970 + }, + { + "epoch": 0.91944, + "grad_norm": 0.04236568883061409, + "learning_rate": 3.927894755939576e-07, + "loss": 0.0024, + "step": 45972 + }, + { + "epoch": 0.91948, + "grad_norm": 0.0993051677942276, + "learning_rate": 3.9240208398807376e-07, + "loss": 0.0011, + "step": 45974 + }, + { + "epoch": 0.91952, + "grad_norm": 0.06417974829673767, + "learning_rate": 3.9201487968725584e-07, + "loss": 0.0124, + "step": 45976 + }, + { + "epoch": 0.91956, + "grad_norm": 0.031124215573072433, + "learning_rate": 3.916278626990544e-07, + "loss": 0.0018, + "step": 45978 + }, + { + "epoch": 0.9196, + "grad_norm": 0.04320380464196205, + "learning_rate": 3.912410330310157e-07, + "loss": 0.0007, + "step": 45980 + }, + { + "epoch": 0.91964, + "grad_norm": 0.043846480548381805, + "learning_rate": 3.90854390690677e-07, + "loss": 0.0005, + "step": 45982 + }, + { + "epoch": 0.91968, + "grad_norm": 0.3585543930530548, + "learning_rate": 3.904679356855823e-07, + "loss": 0.0032, + "step": 45984 + }, + { + "epoch": 0.91972, + "grad_norm": 0.0020582994911819696, + "learning_rate": 3.9008166802325997e-07, + "loss": 0.0001, + "step": 45986 + }, + { + "epoch": 0.91976, + "grad_norm": 0.08348649740219116, + "learning_rate": 3.896955877112452e-07, + "loss": 0.0015, + "step": 45988 + }, + { + "epoch": 0.9198, + "grad_norm": 3.8879575729370117, + "learning_rate": 3.8930969475706183e-07, + "loss": 0.0658, + "step": 45990 + }, + { + "epoch": 0.91984, + "grad_norm": 0.21692995727062225, + "learning_rate": 3.889239891682328e-07, + "loss": 0.0026, + "step": 45992 + }, + { + "epoch": 0.91988, + "grad_norm": 0.18367139995098114, + "learning_rate": 3.8853847095228105e-07, + "loss": 0.0013, + "step": 45994 + }, + { + "epoch": 0.91992, + "grad_norm": 0.016626635566353798, + "learning_rate": 3.8815314011671934e-07, + "loss": 0.0045, + "step": 45996 + }, + { + "epoch": 0.91996, + "grad_norm": 0.5453198552131653, + "learning_rate": 3.877679966690595e-07, + "loss": 0.0077, + "step": 45998 + }, + { + "epoch": 0.92, + "grad_norm": 0.04965073615312576, + "learning_rate": 3.8738304061681107e-07, + "loss": 0.0007, + "step": 46000 + }, + { + "epoch": 0.92004, + "grad_norm": 0.020392416045069695, + "learning_rate": 3.869982719674803e-07, + "loss": 0.0011, + "step": 46002 + }, + { + "epoch": 0.92008, + "grad_norm": 0.0189571063965559, + "learning_rate": 3.866136907285667e-07, + "loss": 0.0005, + "step": 46004 + }, + { + "epoch": 0.92012, + "grad_norm": 0.13701175153255463, + "learning_rate": 3.8622929690756983e-07, + "loss": 0.0013, + "step": 46006 + }, + { + "epoch": 0.92016, + "grad_norm": 0.024177588522434235, + "learning_rate": 3.858450905119815e-07, + "loss": 0.0012, + "step": 46008 + }, + { + "epoch": 0.9202, + "grad_norm": 0.08610852062702179, + "learning_rate": 3.854610715492924e-07, + "loss": 0.0057, + "step": 46010 + }, + { + "epoch": 0.92024, + "grad_norm": 0.1178896501660347, + "learning_rate": 3.8507724002699086e-07, + "loss": 0.0092, + "step": 46012 + }, + { + "epoch": 0.92028, + "grad_norm": 0.25748252868652344, + "learning_rate": 3.8469359595255663e-07, + "loss": 0.0017, + "step": 46014 + }, + { + "epoch": 0.92032, + "grad_norm": 0.12073256820440292, + "learning_rate": 3.843101393334725e-07, + "loss": 0.0014, + "step": 46016 + }, + { + "epoch": 0.92036, + "grad_norm": 0.0010615263599902391, + "learning_rate": 3.839268701772103e-07, + "loss": 0.0002, + "step": 46018 + }, + { + "epoch": 0.9204, + "grad_norm": 0.035454161465168, + "learning_rate": 3.835437884912474e-07, + "loss": 0.002, + "step": 46020 + }, + { + "epoch": 0.92044, + "grad_norm": 0.11138821393251419, + "learning_rate": 3.8316089428304894e-07, + "loss": 0.0013, + "step": 46022 + }, + { + "epoch": 0.92048, + "grad_norm": 0.03485461696982384, + "learning_rate": 3.827781875600789e-07, + "loss": 0.0526, + "step": 46024 + }, + { + "epoch": 0.92052, + "grad_norm": 0.0006212678272277117, + "learning_rate": 3.8239566832980026e-07, + "loss": 0.0002, + "step": 46026 + }, + { + "epoch": 0.92056, + "grad_norm": 0.03748089447617531, + "learning_rate": 3.820133365996692e-07, + "loss": 0.0007, + "step": 46028 + }, + { + "epoch": 0.9206, + "grad_norm": 0.6817750930786133, + "learning_rate": 3.8163119237713877e-07, + "loss": 0.0058, + "step": 46030 + }, + { + "epoch": 0.92064, + "grad_norm": 0.020115520805120468, + "learning_rate": 3.812492356696607e-07, + "loss": 0.0003, + "step": 46032 + }, + { + "epoch": 0.92068, + "grad_norm": 12.541598320007324, + "learning_rate": 3.808674664846812e-07, + "loss": 1.3135, + "step": 46034 + }, + { + "epoch": 0.92072, + "grad_norm": 3.158984899520874, + "learning_rate": 3.8048588482964224e-07, + "loss": 0.3784, + "step": 46036 + }, + { + "epoch": 0.92076, + "grad_norm": 0.022719522938132286, + "learning_rate": 3.801044907119844e-07, + "loss": 0.0004, + "step": 46038 + }, + { + "epoch": 0.9208, + "grad_norm": 0.017893420532345772, + "learning_rate": 3.7972328413914074e-07, + "loss": 0.2346, + "step": 46040 + }, + { + "epoch": 0.92084, + "grad_norm": 0.8384968638420105, + "learning_rate": 3.793422651185463e-07, + "loss": 0.0064, + "step": 46042 + }, + { + "epoch": 0.92088, + "grad_norm": 0.08551355451345444, + "learning_rate": 3.789614336576264e-07, + "loss": 0.0011, + "step": 46044 + }, + { + "epoch": 0.92092, + "grad_norm": 0.029380494728684425, + "learning_rate": 3.785807897638061e-07, + "loss": 0.0005, + "step": 46046 + }, + { + "epoch": 0.92096, + "grad_norm": 0.01932702213525772, + "learning_rate": 3.7820033344450836e-07, + "loss": 0.0026, + "step": 46048 + }, + { + "epoch": 0.921, + "grad_norm": 0.6527878642082214, + "learning_rate": 3.7782006470714614e-07, + "loss": 0.0089, + "step": 46050 + }, + { + "epoch": 0.92104, + "grad_norm": 0.10394315421581268, + "learning_rate": 3.77439983559138e-07, + "loss": 0.0031, + "step": 46052 + }, + { + "epoch": 0.92108, + "grad_norm": 0.24264055490493774, + "learning_rate": 3.7706009000789024e-07, + "loss": 0.0029, + "step": 46054 + }, + { + "epoch": 0.92112, + "grad_norm": 0.03322688117623329, + "learning_rate": 3.766803840608102e-07, + "loss": 0.0074, + "step": 46056 + }, + { + "epoch": 0.92116, + "grad_norm": 0.09484600275754929, + "learning_rate": 3.763008657253009e-07, + "loss": 0.0019, + "step": 46058 + }, + { + "epoch": 0.9212, + "grad_norm": 0.054110292345285416, + "learning_rate": 3.759215350087619e-07, + "loss": 0.0005, + "step": 46060 + }, + { + "epoch": 0.92124, + "grad_norm": 0.045226603746414185, + "learning_rate": 3.755423919185863e-07, + "loss": 0.0025, + "step": 46062 + }, + { + "epoch": 0.92128, + "grad_norm": 0.07574599981307983, + "learning_rate": 3.751634364621659e-07, + "loss": 0.002, + "step": 46064 + }, + { + "epoch": 0.92132, + "grad_norm": 0.5396245718002319, + "learning_rate": 3.7478466864689036e-07, + "loss": 0.0058, + "step": 46066 + }, + { + "epoch": 0.92136, + "grad_norm": 0.0008893092162907124, + "learning_rate": 3.744060884801426e-07, + "loss": 0.0019, + "step": 46068 + }, + { + "epoch": 0.9214, + "grad_norm": 0.004073928575962782, + "learning_rate": 3.7402769596930567e-07, + "loss": 0.0002, + "step": 46070 + }, + { + "epoch": 0.92144, + "grad_norm": 0.027670923620462418, + "learning_rate": 3.736494911217514e-07, + "loss": 0.0006, + "step": 46072 + }, + { + "epoch": 0.92148, + "grad_norm": 0.2048349678516388, + "learning_rate": 3.7327147394485954e-07, + "loss": 0.0016, + "step": 46074 + }, + { + "epoch": 0.92152, + "grad_norm": 0.03332135081291199, + "learning_rate": 3.7289364444599406e-07, + "loss": 0.1059, + "step": 46076 + }, + { + "epoch": 0.92156, + "grad_norm": 0.014702389016747475, + "learning_rate": 3.725160026325247e-07, + "loss": 0.0013, + "step": 46078 + }, + { + "epoch": 0.9216, + "grad_norm": 0.02492348477244377, + "learning_rate": 3.721385485118123e-07, + "loss": 0.0003, + "step": 46080 + }, + { + "epoch": 0.92164, + "grad_norm": 0.17042456567287445, + "learning_rate": 3.717612820912142e-07, + "loss": 0.0019, + "step": 46082 + }, + { + "epoch": 0.92168, + "grad_norm": 0.3437637388706207, + "learning_rate": 3.71384203378089e-07, + "loss": 0.0036, + "step": 46084 + }, + { + "epoch": 0.92172, + "grad_norm": 0.010597768239676952, + "learning_rate": 3.7100731237978414e-07, + "loss": 0.0003, + "step": 46086 + }, + { + "epoch": 0.92176, + "grad_norm": 0.005580546800047159, + "learning_rate": 3.7063060910365043e-07, + "loss": 0.0004, + "step": 46088 + }, + { + "epoch": 0.9218, + "grad_norm": 0.13274219632148743, + "learning_rate": 3.7025409355702977e-07, + "loss": 0.0015, + "step": 46090 + }, + { + "epoch": 0.92184, + "grad_norm": 2.4070355892181396, + "learning_rate": 3.6987776574726297e-07, + "loss": 0.0191, + "step": 46092 + }, + { + "epoch": 0.92188, + "grad_norm": 0.10727208852767944, + "learning_rate": 3.6950162568168855e-07, + "loss": 0.0431, + "step": 46094 + }, + { + "epoch": 0.92192, + "grad_norm": 0.015618831850588322, + "learning_rate": 3.691256733676363e-07, + "loss": 0.0018, + "step": 46096 + }, + { + "epoch": 0.92196, + "grad_norm": 0.14582398533821106, + "learning_rate": 3.68749908812438e-07, + "loss": 0.0019, + "step": 46098 + }, + { + "epoch": 0.922, + "grad_norm": 0.019040226936340332, + "learning_rate": 3.68374332023419e-07, + "loss": 0.0006, + "step": 46100 + }, + { + "epoch": 0.92204, + "grad_norm": 0.03876359388232231, + "learning_rate": 3.6799894300790117e-07, + "loss": 0.0011, + "step": 46102 + }, + { + "epoch": 0.92208, + "grad_norm": 0.0005412101745605469, + "learning_rate": 3.676237417732009e-07, + "loss": 0.0007, + "step": 46104 + }, + { + "epoch": 0.92212, + "grad_norm": 0.007566587999463081, + "learning_rate": 3.6724872832663793e-07, + "loss": 0.0003, + "step": 46106 + }, + { + "epoch": 0.92216, + "grad_norm": 0.23689444363117218, + "learning_rate": 3.668739026755175e-07, + "loss": 0.0024, + "step": 46108 + }, + { + "epoch": 0.9222, + "grad_norm": 1.1080619096755981, + "learning_rate": 3.664992648271526e-07, + "loss": 0.0125, + "step": 46110 + }, + { + "epoch": 0.92224, + "grad_norm": 0.05615372583270073, + "learning_rate": 3.661248147888419e-07, + "loss": 0.0014, + "step": 46112 + }, + { + "epoch": 0.92228, + "grad_norm": 0.0004897756734862924, + "learning_rate": 3.6575055256788836e-07, + "loss": 0.0044, + "step": 46114 + }, + { + "epoch": 0.92232, + "grad_norm": 0.015014316886663437, + "learning_rate": 3.653764781715896e-07, + "loss": 0.0003, + "step": 46116 + }, + { + "epoch": 0.92236, + "grad_norm": 0.393184632062912, + "learning_rate": 3.650025916072342e-07, + "loss": 0.004, + "step": 46118 + }, + { + "epoch": 0.9224, + "grad_norm": 0.718443751335144, + "learning_rate": 3.646288928821151e-07, + "loss": 0.0125, + "step": 46120 + }, + { + "epoch": 0.92244, + "grad_norm": 0.026583366096019745, + "learning_rate": 3.642553820035155e-07, + "loss": 0.0029, + "step": 46122 + }, + { + "epoch": 0.92248, + "grad_norm": 0.8774346709251404, + "learning_rate": 3.638820589787184e-07, + "loss": 0.0072, + "step": 46124 + }, + { + "epoch": 0.92252, + "grad_norm": 2.342155694961548, + "learning_rate": 3.6350892381500135e-07, + "loss": 0.0197, + "step": 46126 + }, + { + "epoch": 0.92256, + "grad_norm": 0.1262385994195938, + "learning_rate": 3.631359765196407e-07, + "loss": 0.0035, + "step": 46128 + }, + { + "epoch": 0.9226, + "grad_norm": 0.07077689468860626, + "learning_rate": 3.627632170999029e-07, + "loss": 0.0023, + "step": 46130 + }, + { + "epoch": 0.92264, + "grad_norm": 0.07839836925268173, + "learning_rate": 3.623906455630599e-07, + "loss": 0.0008, + "step": 46132 + }, + { + "epoch": 0.92268, + "grad_norm": 0.5930638313293457, + "learning_rate": 3.6201826191637255e-07, + "loss": 0.0066, + "step": 46134 + }, + { + "epoch": 0.92272, + "grad_norm": 0.1595178246498108, + "learning_rate": 3.616460661670995e-07, + "loss": 0.0022, + "step": 46136 + }, + { + "epoch": 0.92276, + "grad_norm": 1.6235812902450562, + "learning_rate": 3.612740583225005e-07, + "loss": 0.015, + "step": 46138 + }, + { + "epoch": 0.9228, + "grad_norm": 0.014698355458676815, + "learning_rate": 3.609022383898242e-07, + "loss": 0.0016, + "step": 46140 + }, + { + "epoch": 0.92284, + "grad_norm": 0.010234924033284187, + "learning_rate": 3.6053060637632365e-07, + "loss": 0.0008, + "step": 46142 + }, + { + "epoch": 0.92288, + "grad_norm": 3.2251322269439697, + "learning_rate": 3.6015916228924083e-07, + "loss": 0.0378, + "step": 46144 + }, + { + "epoch": 0.92292, + "grad_norm": 0.8520952463150024, + "learning_rate": 3.597879061358167e-07, + "loss": 0.0145, + "step": 46146 + }, + { + "epoch": 0.92296, + "grad_norm": 0.0005503965076059103, + "learning_rate": 3.5941683792329316e-07, + "loss": 0.0008, + "step": 46148 + }, + { + "epoch": 0.923, + "grad_norm": 0.04108404740691185, + "learning_rate": 3.590459576589e-07, + "loss": 0.0055, + "step": 46150 + }, + { + "epoch": 0.92304, + "grad_norm": 0.1181265264749527, + "learning_rate": 3.586752653498693e-07, + "loss": 0.0014, + "step": 46152 + }, + { + "epoch": 0.92308, + "grad_norm": 0.024476433172822, + "learning_rate": 3.5830476100342734e-07, + "loss": 0.0011, + "step": 46154 + }, + { + "epoch": 0.92312, + "grad_norm": 15.198389053344727, + "learning_rate": 3.579344446267985e-07, + "loss": 0.6088, + "step": 46156 + }, + { + "epoch": 0.92316, + "grad_norm": 0.1250234842300415, + "learning_rate": 3.5756431622720136e-07, + "loss": 0.0096, + "step": 46158 + }, + { + "epoch": 0.9232, + "grad_norm": 0.15404841303825378, + "learning_rate": 3.571943758118546e-07, + "loss": 0.0022, + "step": 46160 + }, + { + "epoch": 0.92324, + "grad_norm": 0.1919483095407486, + "learning_rate": 3.5682462338796574e-07, + "loss": 0.0022, + "step": 46162 + }, + { + "epoch": 0.92328, + "grad_norm": 0.09478466212749481, + "learning_rate": 3.5645505896274577e-07, + "loss": 0.0008, + "step": 46164 + }, + { + "epoch": 0.92332, + "grad_norm": 0.19983425736427307, + "learning_rate": 3.560856825433989e-07, + "loss": 0.003, + "step": 46166 + }, + { + "epoch": 0.92336, + "grad_norm": 0.08474752306938171, + "learning_rate": 3.5571649413712606e-07, + "loss": 0.0016, + "step": 46168 + }, + { + "epoch": 0.9234, + "grad_norm": 0.12556816637516022, + "learning_rate": 3.553474937511281e-07, + "loss": 0.0014, + "step": 46170 + }, + { + "epoch": 0.92344, + "grad_norm": 7.289741039276123, + "learning_rate": 3.549786813925926e-07, + "loss": 0.1383, + "step": 46172 + }, + { + "epoch": 0.92348, + "grad_norm": 0.14714793860912323, + "learning_rate": 3.546100570687161e-07, + "loss": 0.0047, + "step": 46174 + }, + { + "epoch": 0.92352, + "grad_norm": 0.02049958147108555, + "learning_rate": 3.5424162078668167e-07, + "loss": 0.0003, + "step": 46176 + }, + { + "epoch": 0.92356, + "grad_norm": 0.11546533554792404, + "learning_rate": 3.538733725536725e-07, + "loss": 0.0009, + "step": 46178 + }, + { + "epoch": 0.9236, + "grad_norm": 0.057628732174634933, + "learning_rate": 3.5350531237686723e-07, + "loss": 0.0008, + "step": 46180 + }, + { + "epoch": 0.92364, + "grad_norm": 0.005031578708440065, + "learning_rate": 3.5313744026344353e-07, + "loss": 0.0004, + "step": 46182 + }, + { + "epoch": 0.92368, + "grad_norm": 0.5041580200195312, + "learning_rate": 3.527697562205723e-07, + "loss": 0.0053, + "step": 46184 + }, + { + "epoch": 0.92372, + "grad_norm": 0.029566088691353798, + "learning_rate": 3.5240226025542003e-07, + "loss": 0.0008, + "step": 46186 + }, + { + "epoch": 0.92376, + "grad_norm": 3.537062883377075, + "learning_rate": 3.520349523751532e-07, + "loss": 0.0476, + "step": 46188 + }, + { + "epoch": 0.9238, + "grad_norm": 0.037108514457941055, + "learning_rate": 3.516678325869316e-07, + "loss": 0.0006, + "step": 46190 + }, + { + "epoch": 0.92384, + "grad_norm": 0.8485611081123352, + "learning_rate": 3.5130090089791513e-07, + "loss": 0.0118, + "step": 46192 + }, + { + "epoch": 0.92388, + "grad_norm": 0.09171444177627563, + "learning_rate": 3.509341573152514e-07, + "loss": 0.0013, + "step": 46194 + }, + { + "epoch": 0.92392, + "grad_norm": 0.11840806901454926, + "learning_rate": 3.5056760184609684e-07, + "loss": 0.001, + "step": 46196 + }, + { + "epoch": 0.92396, + "grad_norm": 0.18421146273612976, + "learning_rate": 3.5020123449759356e-07, + "loss": 0.0025, + "step": 46198 + }, + { + "epoch": 0.924, + "grad_norm": 0.017751678824424744, + "learning_rate": 3.498350552768859e-07, + "loss": 0.0004, + "step": 46200 + }, + { + "epoch": 0.92404, + "grad_norm": 0.007029278203845024, + "learning_rate": 3.494690641911125e-07, + "loss": 0.0017, + "step": 46202 + }, + { + "epoch": 0.92408, + "grad_norm": 0.06050393730401993, + "learning_rate": 3.4910326124740545e-07, + "loss": 0.0012, + "step": 46204 + }, + { + "epoch": 0.92412, + "grad_norm": 0.4922419786453247, + "learning_rate": 3.487376464529024e-07, + "loss": 0.0065, + "step": 46206 + }, + { + "epoch": 0.92416, + "grad_norm": 0.45089423656463623, + "learning_rate": 3.4837221981472546e-07, + "loss": 0.0054, + "step": 46208 + }, + { + "epoch": 0.9242, + "grad_norm": 0.49682408571243286, + "learning_rate": 3.480069813400022e-07, + "loss": 0.0054, + "step": 46210 + }, + { + "epoch": 0.92424, + "grad_norm": 0.4783375561237335, + "learning_rate": 3.4764193103585144e-07, + "loss": 0.0039, + "step": 46212 + }, + { + "epoch": 0.92428, + "grad_norm": 7.580302715301514, + "learning_rate": 3.4727706890938964e-07, + "loss": 0.1172, + "step": 46214 + }, + { + "epoch": 0.92432, + "grad_norm": 0.0025025722570717335, + "learning_rate": 3.469123949677333e-07, + "loss": 0.0031, + "step": 46216 + }, + { + "epoch": 0.92436, + "grad_norm": 0.011595956049859524, + "learning_rate": 3.46547909217988e-07, + "loss": 0.0007, + "step": 46218 + }, + { + "epoch": 0.9244, + "grad_norm": 0.11487287282943726, + "learning_rate": 3.4618361166726123e-07, + "loss": 0.001, + "step": 46220 + }, + { + "epoch": 0.92444, + "grad_norm": 0.021143576130270958, + "learning_rate": 3.4581950232265513e-07, + "loss": 0.0045, + "step": 46222 + }, + { + "epoch": 0.92448, + "grad_norm": 0.014442713931202888, + "learning_rate": 3.4545558119126966e-07, + "loss": 0.0002, + "step": 46224 + }, + { + "epoch": 0.92452, + "grad_norm": 0.0022191940806806087, + "learning_rate": 3.450918482801957e-07, + "loss": 0.0007, + "step": 46226 + }, + { + "epoch": 0.92456, + "grad_norm": 0.10836409777402878, + "learning_rate": 3.4472830359652875e-07, + "loss": 0.0025, + "step": 46228 + }, + { + "epoch": 0.9246, + "grad_norm": 0.07360844314098358, + "learning_rate": 3.4436494714735313e-07, + "loss": 0.0154, + "step": 46230 + }, + { + "epoch": 0.92464, + "grad_norm": 0.09947749227285385, + "learning_rate": 3.4400177893975426e-07, + "loss": 0.0011, + "step": 46232 + }, + { + "epoch": 0.92468, + "grad_norm": 9.461776733398438, + "learning_rate": 3.436387989808121e-07, + "loss": 0.0802, + "step": 46234 + }, + { + "epoch": 0.92472, + "grad_norm": 0.06947571784257889, + "learning_rate": 3.4327600727760203e-07, + "loss": 0.001, + "step": 46236 + }, + { + "epoch": 0.92476, + "grad_norm": 0.006386901717633009, + "learning_rate": 3.4291340383719953e-07, + "loss": 0.0002, + "step": 46238 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5728170275688171, + "learning_rate": 3.4255098866667114e-07, + "loss": 0.0067, + "step": 46240 + }, + { + "epoch": 0.92484, + "grad_norm": 0.06917519122362137, + "learning_rate": 3.4218876177308345e-07, + "loss": 0.0013, + "step": 46242 + }, + { + "epoch": 0.92488, + "grad_norm": 0.027403458952903748, + "learning_rate": 3.4182672316349754e-07, + "loss": 0.0134, + "step": 46244 + }, + { + "epoch": 0.92492, + "grad_norm": 0.002860757987946272, + "learning_rate": 3.414648728449721e-07, + "loss": 0.0009, + "step": 46246 + }, + { + "epoch": 0.92496, + "grad_norm": 0.0012545902281999588, + "learning_rate": 3.4110321082456154e-07, + "loss": 0.0002, + "step": 46248 + }, + { + "epoch": 0.925, + "grad_norm": 0.023402608931064606, + "learning_rate": 3.4074173710931804e-07, + "loss": 0.0106, + "step": 46250 + }, + { + "epoch": 0.92504, + "grad_norm": 0.06234217435121536, + "learning_rate": 3.4038045170628585e-07, + "loss": 0.0015, + "step": 46252 + }, + { + "epoch": 0.92508, + "grad_norm": 0.2753390669822693, + "learning_rate": 3.4001935462250943e-07, + "loss": 0.0074, + "step": 46254 + }, + { + "epoch": 0.92512, + "grad_norm": 0.08606875687837601, + "learning_rate": 3.3965844586503095e-07, + "loss": 0.0011, + "step": 46256 + }, + { + "epoch": 0.92516, + "grad_norm": 0.02870182693004608, + "learning_rate": 3.3929772544088137e-07, + "loss": 0.0027, + "step": 46258 + }, + { + "epoch": 0.9252, + "grad_norm": 0.059943825006484985, + "learning_rate": 3.3893719335709953e-07, + "loss": 0.011, + "step": 46260 + }, + { + "epoch": 0.92524, + "grad_norm": 0.07946133613586426, + "learning_rate": 3.385768496207087e-07, + "loss": 0.0006, + "step": 46262 + }, + { + "epoch": 0.92528, + "grad_norm": 0.32869893312454224, + "learning_rate": 3.382166942387377e-07, + "loss": 0.0058, + "step": 46264 + }, + { + "epoch": 0.92532, + "grad_norm": 0.0022607475984841585, + "learning_rate": 3.378567272182054e-07, + "loss": 0.0024, + "step": 46266 + }, + { + "epoch": 0.92536, + "grad_norm": 0.046467047184705734, + "learning_rate": 3.3749694856613167e-07, + "loss": 0.0006, + "step": 46268 + }, + { + "epoch": 0.9254, + "grad_norm": 0.1628330647945404, + "learning_rate": 3.3713735828952985e-07, + "loss": 0.0049, + "step": 46270 + }, + { + "epoch": 0.92544, + "grad_norm": 0.12159696966409683, + "learning_rate": 3.3677795639540877e-07, + "loss": 0.0012, + "step": 46272 + }, + { + "epoch": 0.92548, + "grad_norm": 20.10405731201172, + "learning_rate": 3.3641874289077614e-07, + "loss": 0.5116, + "step": 46274 + }, + { + "epoch": 0.92552, + "grad_norm": 0.07831979542970657, + "learning_rate": 3.3605971778263524e-07, + "loss": 0.0015, + "step": 46276 + }, + { + "epoch": 0.92556, + "grad_norm": 0.016888368874788284, + "learning_rate": 3.3570088107798605e-07, + "loss": 0.0003, + "step": 46278 + }, + { + "epoch": 0.9256, + "grad_norm": 0.06491062790155411, + "learning_rate": 3.3534223278382405e-07, + "loss": 0.0008, + "step": 46280 + }, + { + "epoch": 0.92564, + "grad_norm": 0.13102425634860992, + "learning_rate": 3.3498377290714145e-07, + "loss": 0.0014, + "step": 46282 + }, + { + "epoch": 0.92568, + "grad_norm": 0.45188984274864197, + "learning_rate": 3.346255014549249e-07, + "loss": 0.006, + "step": 46284 + }, + { + "epoch": 0.92572, + "grad_norm": 0.2590523958206177, + "learning_rate": 3.3426741843416097e-07, + "loss": 0.0042, + "step": 46286 + }, + { + "epoch": 0.92576, + "grad_norm": 0.0019331182120367885, + "learning_rate": 3.339095238518286e-07, + "loss": 0.0003, + "step": 46288 + }, + { + "epoch": 0.9258, + "grad_norm": 0.026858225464820862, + "learning_rate": 3.3355181771490776e-07, + "loss": 0.0012, + "step": 46290 + }, + { + "epoch": 0.92584, + "grad_norm": 0.02269257791340351, + "learning_rate": 3.3319430003037165e-07, + "loss": 0.0005, + "step": 46292 + }, + { + "epoch": 0.92588, + "grad_norm": 0.031209899112582207, + "learning_rate": 3.3283697080518706e-07, + "loss": 0.0007, + "step": 46294 + }, + { + "epoch": 0.92592, + "grad_norm": 0.058717045933008194, + "learning_rate": 3.32479830046325e-07, + "loss": 0.0007, + "step": 46296 + }, + { + "epoch": 0.92596, + "grad_norm": 0.29124557971954346, + "learning_rate": 3.3212287776074437e-07, + "loss": 0.0032, + "step": 46298 + }, + { + "epoch": 0.926, + "grad_norm": 0.3711288571357727, + "learning_rate": 3.3176611395540625e-07, + "loss": 0.0036, + "step": 46300 + }, + { + "epoch": 0.92604, + "grad_norm": 0.005964586045593023, + "learning_rate": 3.314095386372651e-07, + "loss": 0.0004, + "step": 46302 + }, + { + "epoch": 0.92608, + "grad_norm": 3.9877758026123047, + "learning_rate": 3.31053151813272e-07, + "loss": 0.0528, + "step": 46304 + }, + { + "epoch": 0.92612, + "grad_norm": 0.46252715587615967, + "learning_rate": 3.306969534903781e-07, + "loss": 0.0074, + "step": 46306 + }, + { + "epoch": 0.92616, + "grad_norm": 0.0037873212713748217, + "learning_rate": 3.303409436755234e-07, + "loss": 0.0044, + "step": 46308 + }, + { + "epoch": 0.9262, + "grad_norm": 0.0987103283405304, + "learning_rate": 3.2998512237565005e-07, + "loss": 0.0029, + "step": 46310 + }, + { + "epoch": 0.92624, + "grad_norm": 0.04651351273059845, + "learning_rate": 3.296294895976948e-07, + "loss": 0.0014, + "step": 46312 + }, + { + "epoch": 0.92628, + "grad_norm": 1.0951178073883057, + "learning_rate": 3.2927404534859317e-07, + "loss": 0.5964, + "step": 46314 + }, + { + "epoch": 0.92632, + "grad_norm": 0.7390003800392151, + "learning_rate": 3.289187896352708e-07, + "loss": 0.0065, + "step": 46316 + }, + { + "epoch": 0.92636, + "grad_norm": 0.021618837490677834, + "learning_rate": 3.285637224646565e-07, + "loss": 0.0003, + "step": 46318 + }, + { + "epoch": 0.9264, + "grad_norm": 0.20493987202644348, + "learning_rate": 3.282088438436715e-07, + "loss": 0.0022, + "step": 46320 + }, + { + "epoch": 0.92644, + "grad_norm": 0.01222136989235878, + "learning_rate": 3.2785415377923457e-07, + "loss": 0.0012, + "step": 46322 + }, + { + "epoch": 0.92648, + "grad_norm": 0.343513548374176, + "learning_rate": 3.2749965227826144e-07, + "loss": 0.0028, + "step": 46324 + }, + { + "epoch": 0.92652, + "grad_norm": 0.007292186841368675, + "learning_rate": 3.271453393476598e-07, + "loss": 0.0073, + "step": 46326 + }, + { + "epoch": 0.92656, + "grad_norm": 0.00963062234222889, + "learning_rate": 3.267912149943431e-07, + "loss": 0.0002, + "step": 46328 + }, + { + "epoch": 0.9266, + "grad_norm": 0.006616816855967045, + "learning_rate": 3.2643727922520905e-07, + "loss": 0.0008, + "step": 46330 + }, + { + "epoch": 0.92664, + "grad_norm": 0.4794946312904358, + "learning_rate": 3.260835320471611e-07, + "loss": 0.0041, + "step": 46332 + }, + { + "epoch": 0.92668, + "grad_norm": 0.04531199485063553, + "learning_rate": 3.25729973467096e-07, + "loss": 0.0007, + "step": 46334 + }, + { + "epoch": 0.92672, + "grad_norm": 0.3145814538002014, + "learning_rate": 3.2537660349190483e-07, + "loss": 0.004, + "step": 46336 + }, + { + "epoch": 0.92676, + "grad_norm": 0.3438817262649536, + "learning_rate": 3.250234221284787e-07, + "loss": 0.0028, + "step": 46338 + }, + { + "epoch": 0.9268, + "grad_norm": 0.13649438321590424, + "learning_rate": 3.246704293837011e-07, + "loss": 0.0034, + "step": 46340 + }, + { + "epoch": 0.92684, + "grad_norm": 0.0030964447651058435, + "learning_rate": 3.243176252644542e-07, + "loss": 0.0008, + "step": 46342 + }, + { + "epoch": 0.92688, + "grad_norm": 0.0003583258076105267, + "learning_rate": 3.23965009777617e-07, + "loss": 0.001, + "step": 46344 + }, + { + "epoch": 0.92692, + "grad_norm": 0.0010267728939652443, + "learning_rate": 3.236125829300651e-07, + "loss": 0.0004, + "step": 46346 + }, + { + "epoch": 0.92696, + "grad_norm": 0.11201495677232742, + "learning_rate": 3.232603447286653e-07, + "loss": 0.002, + "step": 46348 + }, + { + "epoch": 0.927, + "grad_norm": 0.015615635551512241, + "learning_rate": 3.2290829518028867e-07, + "loss": 0.3731, + "step": 46350 + }, + { + "epoch": 0.92704, + "grad_norm": 0.014533006586134434, + "learning_rate": 3.2255643429179526e-07, + "loss": 0.0002, + "step": 46352 + }, + { + "epoch": 0.92708, + "grad_norm": 0.023927723988890648, + "learning_rate": 3.222047620700475e-07, + "loss": 0.0005, + "step": 46354 + }, + { + "epoch": 0.92712, + "grad_norm": 0.2116091549396515, + "learning_rate": 3.2185327852189974e-07, + "loss": 0.004, + "step": 46356 + }, + { + "epoch": 0.92716, + "grad_norm": 0.24467086791992188, + "learning_rate": 3.215019836542055e-07, + "loss": 0.008, + "step": 46358 + }, + { + "epoch": 0.9272, + "grad_norm": 0.2045249491930008, + "learning_rate": 3.211508774738137e-07, + "loss": 0.0025, + "step": 46360 + }, + { + "epoch": 0.92724, + "grad_norm": 0.2488834708929062, + "learning_rate": 3.2079995998756775e-07, + "loss": 0.0099, + "step": 46362 + }, + { + "epoch": 0.92728, + "grad_norm": 0.016162993386387825, + "learning_rate": 3.2044923120230996e-07, + "loss": 0.0134, + "step": 46364 + }, + { + "epoch": 0.92732, + "grad_norm": 0.021963533014059067, + "learning_rate": 3.2009869112487714e-07, + "loss": 0.3011, + "step": 46366 + }, + { + "epoch": 0.92736, + "grad_norm": 0.03951166570186615, + "learning_rate": 3.1974833976210376e-07, + "loss": 0.0009, + "step": 46368 + }, + { + "epoch": 0.9274, + "grad_norm": 0.021226955577731133, + "learning_rate": 3.19398177120821e-07, + "loss": 0.0046, + "step": 46370 + }, + { + "epoch": 0.92744, + "grad_norm": 0.05220678076148033, + "learning_rate": 3.1904820320785567e-07, + "loss": 0.0006, + "step": 46372 + }, + { + "epoch": 0.92748, + "grad_norm": 0.02072591707110405, + "learning_rate": 3.186984180300279e-07, + "loss": 0.0019, + "step": 46374 + }, + { + "epoch": 0.92752, + "grad_norm": 0.23487120866775513, + "learning_rate": 3.1834882159415883e-07, + "loss": 0.0022, + "step": 46376 + }, + { + "epoch": 0.92756, + "grad_norm": 0.02070455066859722, + "learning_rate": 3.1799941390706525e-07, + "loss": 0.0003, + "step": 46378 + }, + { + "epoch": 0.9276, + "grad_norm": 2.032776117324829, + "learning_rate": 3.1765019497555617e-07, + "loss": 0.0213, + "step": 46380 + }, + { + "epoch": 0.92764, + "grad_norm": 0.07202746719121933, + "learning_rate": 3.173011648064428e-07, + "loss": 0.0007, + "step": 46382 + }, + { + "epoch": 0.92768, + "grad_norm": 0.010795900598168373, + "learning_rate": 3.1695232340652637e-07, + "loss": 0.0002, + "step": 46384 + }, + { + "epoch": 0.92772, + "grad_norm": 0.01705450564622879, + "learning_rate": 3.166036707826115e-07, + "loss": 0.0003, + "step": 46386 + }, + { + "epoch": 0.92776, + "grad_norm": 0.010308584198355675, + "learning_rate": 3.162552069414926e-07, + "loss": 0.0016, + "step": 46388 + }, + { + "epoch": 0.9278, + "grad_norm": 0.04324901103973389, + "learning_rate": 3.1590693188996324e-07, + "loss": 0.0006, + "step": 46390 + }, + { + "epoch": 0.92784, + "grad_norm": 0.05843216925859451, + "learning_rate": 3.1555884563481577e-07, + "loss": 0.0012, + "step": 46392 + }, + { + "epoch": 0.92788, + "grad_norm": 3.7606377601623535, + "learning_rate": 3.152109481828325e-07, + "loss": 0.9095, + "step": 46394 + }, + { + "epoch": 0.92792, + "grad_norm": 0.10030464828014374, + "learning_rate": 3.148632395407991e-07, + "loss": 0.0017, + "step": 46396 + }, + { + "epoch": 0.92796, + "grad_norm": 0.1909165382385254, + "learning_rate": 3.1451571971549246e-07, + "loss": 0.0015, + "step": 46398 + }, + { + "epoch": 0.928, + "grad_norm": 0.9527091979980469, + "learning_rate": 3.1416838871368925e-07, + "loss": 0.0075, + "step": 46400 + }, + { + "epoch": 0.92804, + "grad_norm": 0.927218496799469, + "learning_rate": 3.1382124654215864e-07, + "loss": 0.0106, + "step": 46402 + }, + { + "epoch": 0.92808, + "grad_norm": 0.01814902573823929, + "learning_rate": 3.1347429320767064e-07, + "loss": 0.0009, + "step": 46404 + }, + { + "epoch": 0.92812, + "grad_norm": 0.04464906081557274, + "learning_rate": 3.131275287169877e-07, + "loss": 0.0008, + "step": 46406 + }, + { + "epoch": 0.92816, + "grad_norm": 0.10114946216344833, + "learning_rate": 3.127809530768711e-07, + "loss": 0.0014, + "step": 46408 + }, + { + "epoch": 0.9282, + "grad_norm": 0.07661156356334686, + "learning_rate": 3.1243456629407644e-07, + "loss": 0.0012, + "step": 46410 + }, + { + "epoch": 0.92824, + "grad_norm": 0.10275425016880035, + "learning_rate": 3.120883683753584e-07, + "loss": 0.0014, + "step": 46412 + }, + { + "epoch": 0.92828, + "grad_norm": 0.32570645213127136, + "learning_rate": 3.117423593274649e-07, + "loss": 0.033, + "step": 46414 + }, + { + "epoch": 0.92832, + "grad_norm": 0.05533728376030922, + "learning_rate": 3.1139653915714053e-07, + "loss": 0.0076, + "step": 46416 + }, + { + "epoch": 0.92836, + "grad_norm": 0.08931943774223328, + "learning_rate": 3.11050907871131e-07, + "loss": 0.0016, + "step": 46418 + }, + { + "epoch": 0.9284, + "grad_norm": 1.1324278116226196, + "learning_rate": 3.10705465476171e-07, + "loss": 0.0127, + "step": 46420 + }, + { + "epoch": 0.92844, + "grad_norm": 0.4795975089073181, + "learning_rate": 3.1036021197899613e-07, + "loss": 0.0126, + "step": 46422 + }, + { + "epoch": 0.92848, + "grad_norm": 0.03335003927350044, + "learning_rate": 3.1001514738633775e-07, + "loss": 0.0006, + "step": 46424 + }, + { + "epoch": 0.92852, + "grad_norm": 0.2958142161369324, + "learning_rate": 3.096702717049227e-07, + "loss": 0.0019, + "step": 46426 + }, + { + "epoch": 0.92856, + "grad_norm": 0.03961130604147911, + "learning_rate": 3.093255849414756e-07, + "loss": 0.0005, + "step": 46428 + }, + { + "epoch": 0.9286, + "grad_norm": 0.4908190369606018, + "learning_rate": 3.0898108710271437e-07, + "loss": 0.0059, + "step": 46430 + }, + { + "epoch": 0.92864, + "grad_norm": 0.1213863268494606, + "learning_rate": 3.0863677819535587e-07, + "loss": 0.0277, + "step": 46432 + }, + { + "epoch": 0.92868, + "grad_norm": 0.00408953009173274, + "learning_rate": 3.0829265822611367e-07, + "loss": 0.0031, + "step": 46434 + }, + { + "epoch": 0.92872, + "grad_norm": 0.05670106038451195, + "learning_rate": 3.079487272016957e-07, + "loss": 0.0005, + "step": 46436 + }, + { + "epoch": 0.92876, + "grad_norm": 0.5348190069198608, + "learning_rate": 3.0760498512880545e-07, + "loss": 0.0069, + "step": 46438 + }, + { + "epoch": 0.9288, + "grad_norm": 0.0044934446923434734, + "learning_rate": 3.072614320141487e-07, + "loss": 0.0004, + "step": 46440 + }, + { + "epoch": 0.92884, + "grad_norm": 0.0332091823220253, + "learning_rate": 3.06918067864419e-07, + "loss": 0.0006, + "step": 46442 + }, + { + "epoch": 0.92888, + "grad_norm": 0.06603484600782394, + "learning_rate": 3.06574892686311e-07, + "loss": 0.0009, + "step": 46444 + }, + { + "epoch": 0.92892, + "grad_norm": 0.7820624709129333, + "learning_rate": 3.0623190648651824e-07, + "loss": 0.0088, + "step": 46446 + }, + { + "epoch": 0.92896, + "grad_norm": 0.167524054646492, + "learning_rate": 3.0588910927172313e-07, + "loss": 0.0049, + "step": 46448 + }, + { + "epoch": 0.929, + "grad_norm": 0.06547926366329193, + "learning_rate": 3.0554650104861137e-07, + "loss": 0.0014, + "step": 46450 + }, + { + "epoch": 0.92904, + "grad_norm": 0.19434323906898499, + "learning_rate": 3.052040818238622e-07, + "loss": 0.6307, + "step": 46452 + }, + { + "epoch": 0.92908, + "grad_norm": 0.009476535022258759, + "learning_rate": 3.0486185160415015e-07, + "loss": 0.0301, + "step": 46454 + }, + { + "epoch": 0.92912, + "grad_norm": 0.03710077330470085, + "learning_rate": 3.0451981039614664e-07, + "loss": 0.0068, + "step": 46456 + }, + { + "epoch": 0.92916, + "grad_norm": 0.3132037818431854, + "learning_rate": 3.0417795820652184e-07, + "loss": 0.0144, + "step": 46458 + }, + { + "epoch": 0.9292, + "grad_norm": 0.020187024027109146, + "learning_rate": 3.0383629504194047e-07, + "loss": 0.0011, + "step": 46460 + }, + { + "epoch": 0.92924, + "grad_norm": 0.02501506358385086, + "learning_rate": 3.034948209090616e-07, + "loss": 0.0004, + "step": 46462 + }, + { + "epoch": 0.92928, + "grad_norm": 0.04516242817044258, + "learning_rate": 3.031535358145432e-07, + "loss": 0.0136, + "step": 46464 + }, + { + "epoch": 0.92932, + "grad_norm": 0.31906428933143616, + "learning_rate": 3.0281243976503894e-07, + "loss": 0.0035, + "step": 46466 + }, + { + "epoch": 0.92936, + "grad_norm": 0.436533123254776, + "learning_rate": 3.0247153276720007e-07, + "loss": 0.0043, + "step": 46468 + }, + { + "epoch": 0.9294, + "grad_norm": 0.21519462764263153, + "learning_rate": 3.0213081482766803e-07, + "loss": 0.0019, + "step": 46470 + }, + { + "epoch": 0.92944, + "grad_norm": 0.012629467062652111, + "learning_rate": 3.017902859530919e-07, + "loss": 0.0016, + "step": 46472 + }, + { + "epoch": 0.92948, + "grad_norm": 3.2418220043182373, + "learning_rate": 3.0144994615010526e-07, + "loss": 0.0301, + "step": 46474 + }, + { + "epoch": 0.92952, + "grad_norm": 0.012300434522330761, + "learning_rate": 3.0110979542534503e-07, + "loss": 0.0014, + "step": 46476 + }, + { + "epoch": 0.92956, + "grad_norm": 0.008328976109623909, + "learning_rate": 3.007698337854448e-07, + "loss": 0.6903, + "step": 46478 + }, + { + "epoch": 0.9296, + "grad_norm": 0.23492804169654846, + "learning_rate": 3.00430061237027e-07, + "loss": 0.0023, + "step": 46480 + }, + { + "epoch": 0.92964, + "grad_norm": 0.04157400131225586, + "learning_rate": 3.0009047778672083e-07, + "loss": 0.0005, + "step": 46482 + }, + { + "epoch": 0.92968, + "grad_norm": 0.21173754334449768, + "learning_rate": 2.997510834411443e-07, + "loss": 0.0052, + "step": 46484 + }, + { + "epoch": 0.92972, + "grad_norm": 27.628141403198242, + "learning_rate": 2.994118782069133e-07, + "loss": 0.5481, + "step": 46486 + }, + { + "epoch": 0.92976, + "grad_norm": 0.0006969412788748741, + "learning_rate": 2.990728620906436e-07, + "loss": 0.0005, + "step": 46488 + }, + { + "epoch": 0.9298, + "grad_norm": 19.548465728759766, + "learning_rate": 2.987340350989421e-07, + "loss": 0.8552, + "step": 46490 + }, + { + "epoch": 0.92984, + "grad_norm": 0.10247638821601868, + "learning_rate": 2.983953972384157e-07, + "loss": 0.0013, + "step": 46492 + }, + { + "epoch": 0.92988, + "grad_norm": 0.36915457248687744, + "learning_rate": 2.980569485156648e-07, + "loss": 0.0027, + "step": 46494 + }, + { + "epoch": 0.92992, + "grad_norm": 0.028785523027181625, + "learning_rate": 2.977186889372885e-07, + "loss": 0.0059, + "step": 46496 + }, + { + "epoch": 0.92996, + "grad_norm": 0.10748118907213211, + "learning_rate": 2.9738061850988154e-07, + "loss": 0.0038, + "step": 46498 + }, + { + "epoch": 0.93, + "grad_norm": 0.012369571253657341, + "learning_rate": 2.970427372400353e-07, + "loss": 0.0073, + "step": 46500 + }, + { + "epoch": 0.93004, + "grad_norm": 0.10865344852209091, + "learning_rate": 2.967050451343356e-07, + "loss": 0.0012, + "step": 46502 + }, + { + "epoch": 0.93008, + "grad_norm": 1.5243234634399414, + "learning_rate": 2.9636754219936835e-07, + "loss": 0.0157, + "step": 46504 + }, + { + "epoch": 0.93012, + "grad_norm": 0.01808112859725952, + "learning_rate": 2.960302284417094e-07, + "loss": 0.0793, + "step": 46506 + }, + { + "epoch": 0.93016, + "grad_norm": 0.0883224755525589, + "learning_rate": 2.9569310386793894e-07, + "loss": 0.0017, + "step": 46508 + }, + { + "epoch": 0.9302, + "grad_norm": 0.015687353909015656, + "learning_rate": 2.9535616848462624e-07, + "loss": 0.0006, + "step": 46510 + }, + { + "epoch": 0.93024, + "grad_norm": 0.06594481319189072, + "learning_rate": 2.950194222983416e-07, + "loss": 0.0016, + "step": 46512 + }, + { + "epoch": 0.93028, + "grad_norm": 0.15648865699768066, + "learning_rate": 2.946828653156508e-07, + "loss": 0.0022, + "step": 46514 + }, + { + "epoch": 0.93032, + "grad_norm": 0.5040339231491089, + "learning_rate": 2.9434649754311206e-07, + "loss": 0.006, + "step": 46516 + }, + { + "epoch": 0.93036, + "grad_norm": 0.03377876430749893, + "learning_rate": 2.9401031898728674e-07, + "loss": 0.0004, + "step": 46518 + }, + { + "epoch": 0.9304, + "grad_norm": 0.046463385224342346, + "learning_rate": 2.936743296547273e-07, + "loss": 0.0009, + "step": 46520 + }, + { + "epoch": 0.93044, + "grad_norm": 0.020969701930880547, + "learning_rate": 2.9333852955198306e-07, + "loss": 0.0007, + "step": 46522 + }, + { + "epoch": 0.93048, + "grad_norm": 0.4853673577308655, + "learning_rate": 2.9300291868560206e-07, + "loss": 0.0073, + "step": 46524 + }, + { + "epoch": 0.93052, + "grad_norm": 0.291946679353714, + "learning_rate": 2.926674970621268e-07, + "loss": 0.3378, + "step": 46526 + }, + { + "epoch": 0.93056, + "grad_norm": 0.014315925538539886, + "learning_rate": 2.923322646880966e-07, + "loss": 0.0001, + "step": 46528 + }, + { + "epoch": 0.9306, + "grad_norm": 0.04477554187178612, + "learning_rate": 2.919972215700462e-07, + "loss": 0.0012, + "step": 46530 + }, + { + "epoch": 0.93064, + "grad_norm": 0.0005692692939192057, + "learning_rate": 2.9166236771450804e-07, + "loss": 0.0212, + "step": 46532 + }, + { + "epoch": 0.93068, + "grad_norm": 3.8984007835388184, + "learning_rate": 2.9132770312800926e-07, + "loss": 0.0317, + "step": 46534 + }, + { + "epoch": 0.93072, + "grad_norm": 0.01916678249835968, + "learning_rate": 2.9099322781707794e-07, + "loss": 0.002, + "step": 46536 + }, + { + "epoch": 0.93076, + "grad_norm": 0.045305319130420685, + "learning_rate": 2.9065894178822886e-07, + "loss": 0.0028, + "step": 46538 + }, + { + "epoch": 0.9308, + "grad_norm": 0.12223788350820541, + "learning_rate": 2.9032484504798454e-07, + "loss": 0.0014, + "step": 46540 + }, + { + "epoch": 0.93084, + "grad_norm": 0.5815977454185486, + "learning_rate": 2.8999093760285646e-07, + "loss": 0.0044, + "step": 46542 + }, + { + "epoch": 0.93088, + "grad_norm": 0.023348478600382805, + "learning_rate": 2.896572194593528e-07, + "loss": 0.0011, + "step": 46544 + }, + { + "epoch": 0.93092, + "grad_norm": 0.06225133314728737, + "learning_rate": 2.893236906239827e-07, + "loss": 0.001, + "step": 46546 + }, + { + "epoch": 0.93096, + "grad_norm": 0.07239591330289841, + "learning_rate": 2.8899035110324326e-07, + "loss": 0.0022, + "step": 46548 + }, + { + "epoch": 0.931, + "grad_norm": 0.031556081026792526, + "learning_rate": 2.8865720090364037e-07, + "loss": 0.0004, + "step": 46550 + }, + { + "epoch": 0.93104, + "grad_norm": 0.002425007987767458, + "learning_rate": 2.8832424003166326e-07, + "loss": 0.0004, + "step": 46552 + }, + { + "epoch": 0.93108, + "grad_norm": 10.667125701904297, + "learning_rate": 2.8799146849380454e-07, + "loss": 0.2847, + "step": 46554 + }, + { + "epoch": 0.93112, + "grad_norm": 0.05952005833387375, + "learning_rate": 2.876588862965524e-07, + "loss": 0.0078, + "step": 46556 + }, + { + "epoch": 0.93116, + "grad_norm": 0.6706293821334839, + "learning_rate": 2.873264934463915e-07, + "loss": 0.0082, + "step": 46558 + }, + { + "epoch": 0.9312, + "grad_norm": 0.004008987452834845, + "learning_rate": 2.8699428994980017e-07, + "loss": 0.0008, + "step": 46560 + }, + { + "epoch": 0.93124, + "grad_norm": 0.021603941917419434, + "learning_rate": 2.8666227581325644e-07, + "loss": 0.0003, + "step": 46562 + }, + { + "epoch": 0.93128, + "grad_norm": 0.09198226779699326, + "learning_rate": 2.8633045104323187e-07, + "loss": 0.0048, + "step": 46564 + }, + { + "epoch": 0.93132, + "grad_norm": 0.00456645805388689, + "learning_rate": 2.859988156461957e-07, + "loss": 0.0001, + "step": 46566 + }, + { + "epoch": 0.93136, + "grad_norm": 0.6916640996932983, + "learning_rate": 2.8566736962861494e-07, + "loss": 0.0071, + "step": 46568 + }, + { + "epoch": 0.9314, + "grad_norm": 0.007118504494428635, + "learning_rate": 2.8533611299694784e-07, + "loss": 0.0002, + "step": 46570 + }, + { + "epoch": 0.93144, + "grad_norm": 0.5805028676986694, + "learning_rate": 2.8500504575765695e-07, + "loss": 0.0042, + "step": 46572 + }, + { + "epoch": 0.93148, + "grad_norm": 0.009575504809617996, + "learning_rate": 2.8467416791719385e-07, + "loss": 0.0012, + "step": 46574 + }, + { + "epoch": 0.93152, + "grad_norm": 0.0192607119679451, + "learning_rate": 2.843434794820088e-07, + "loss": 0.0007, + "step": 46576 + }, + { + "epoch": 0.93156, + "grad_norm": 0.9554929733276367, + "learning_rate": 2.840129804585501e-07, + "loss": 0.0104, + "step": 46578 + }, + { + "epoch": 0.9316, + "grad_norm": 0.05563617870211601, + "learning_rate": 2.836826708532603e-07, + "loss": 0.0005, + "step": 46580 + }, + { + "epoch": 0.93164, + "grad_norm": 0.5351448655128479, + "learning_rate": 2.8335255067257983e-07, + "loss": 0.0067, + "step": 46582 + }, + { + "epoch": 0.93168, + "grad_norm": 1.2730791568756104, + "learning_rate": 2.8302261992294354e-07, + "loss": 0.1772, + "step": 46584 + }, + { + "epoch": 0.93172, + "grad_norm": 0.0764676183462143, + "learning_rate": 2.826928786107841e-07, + "loss": 0.0015, + "step": 46586 + }, + { + "epoch": 0.93176, + "grad_norm": 0.24869553744792938, + "learning_rate": 2.8236332674252855e-07, + "loss": 0.0035, + "step": 46588 + }, + { + "epoch": 0.9318, + "grad_norm": 0.007134598214179277, + "learning_rate": 2.8203396432460507e-07, + "loss": 0.1157, + "step": 46590 + }, + { + "epoch": 0.93184, + "grad_norm": 0.13922119140625, + "learning_rate": 2.8170479136342965e-07, + "loss": 0.0017, + "step": 46592 + }, + { + "epoch": 0.93188, + "grad_norm": 0.4079894423484802, + "learning_rate": 2.81375807865425e-07, + "loss": 0.0041, + "step": 46594 + }, + { + "epoch": 0.93192, + "grad_norm": 0.06117340549826622, + "learning_rate": 2.810470138370014e-07, + "loss": 0.0008, + "step": 46596 + }, + { + "epoch": 0.93196, + "grad_norm": 0.1361958086490631, + "learning_rate": 2.807184092845705e-07, + "loss": 0.0009, + "step": 46598 + }, + { + "epoch": 0.932, + "grad_norm": 0.1932877153158188, + "learning_rate": 2.8038999421453827e-07, + "loss": 0.0017, + "step": 46600 + }, + { + "epoch": 0.93204, + "grad_norm": 0.026970071718096733, + "learning_rate": 2.8006176863330625e-07, + "loss": 0.0094, + "step": 46602 + }, + { + "epoch": 0.93208, + "grad_norm": 0.06681429594755173, + "learning_rate": 2.797337325472749e-07, + "loss": 0.0009, + "step": 46604 + }, + { + "epoch": 0.93212, + "grad_norm": 0.7644580006599426, + "learning_rate": 2.794058859628368e-07, + "loss": 0.0077, + "step": 46606 + }, + { + "epoch": 0.93216, + "grad_norm": 0.11663782596588135, + "learning_rate": 2.79078228886388e-07, + "loss": 0.0014, + "step": 46608 + }, + { + "epoch": 0.9322, + "grad_norm": 0.18108022212982178, + "learning_rate": 2.7875076132431344e-07, + "loss": 0.0016, + "step": 46610 + }, + { + "epoch": 0.93224, + "grad_norm": 0.06274441629648209, + "learning_rate": 2.7842348328299684e-07, + "loss": 0.0015, + "step": 46612 + }, + { + "epoch": 0.93228, + "grad_norm": 0.0010040528140962124, + "learning_rate": 2.7809639476882087e-07, + "loss": 0.0013, + "step": 46614 + }, + { + "epoch": 0.93232, + "grad_norm": 0.16248725354671478, + "learning_rate": 2.777694957881594e-07, + "loss": 0.0018, + "step": 46616 + }, + { + "epoch": 0.93236, + "grad_norm": 0.008884708397090435, + "learning_rate": 2.774427863473861e-07, + "loss": 0.0001, + "step": 46618 + }, + { + "epoch": 0.9324, + "grad_norm": 0.005828348454087973, + "learning_rate": 2.771162664528726e-07, + "loss": 0.0015, + "step": 46620 + }, + { + "epoch": 0.93244, + "grad_norm": 7.520783424377441, + "learning_rate": 2.767899361109827e-07, + "loss": 0.0726, + "step": 46622 + }, + { + "epoch": 0.93248, + "grad_norm": 0.7071046829223633, + "learning_rate": 2.764637953280791e-07, + "loss": 0.0066, + "step": 46624 + }, + { + "epoch": 0.93252, + "grad_norm": 0.44971227645874023, + "learning_rate": 2.7613784411052e-07, + "loss": 0.0053, + "step": 46626 + }, + { + "epoch": 0.93256, + "grad_norm": 0.003249574452638626, + "learning_rate": 2.758120824646593e-07, + "loss": 0.001, + "step": 46628 + }, + { + "epoch": 0.9326, + "grad_norm": 1.372573971748352, + "learning_rate": 2.7548651039684847e-07, + "loss": 0.0175, + "step": 46630 + }, + { + "epoch": 0.93264, + "grad_norm": 0.17409998178482056, + "learning_rate": 2.7516112791343477e-07, + "loss": 0.0016, + "step": 46632 + }, + { + "epoch": 0.93268, + "grad_norm": 0.05198587104678154, + "learning_rate": 2.748359350207619e-07, + "loss": 0.0068, + "step": 46634 + }, + { + "epoch": 0.93272, + "grad_norm": 0.10921543091535568, + "learning_rate": 2.745109317251693e-07, + "loss": 0.0011, + "step": 46636 + }, + { + "epoch": 0.93276, + "grad_norm": 0.05636519938707352, + "learning_rate": 2.741861180329919e-07, + "loss": 0.0014, + "step": 46638 + }, + { + "epoch": 0.9328, + "grad_norm": 0.06521883606910706, + "learning_rate": 2.7386149395056463e-07, + "loss": 0.0011, + "step": 46640 + }, + { + "epoch": 0.93284, + "grad_norm": 0.055687714368104935, + "learning_rate": 2.735370594842146e-07, + "loss": 0.0008, + "step": 46642 + }, + { + "epoch": 0.93288, + "grad_norm": 0.030482683330774307, + "learning_rate": 2.7321281464026684e-07, + "loss": 0.0004, + "step": 46644 + }, + { + "epoch": 0.93292, + "grad_norm": 0.1832575798034668, + "learning_rate": 2.728887594250429e-07, + "loss": 0.0018, + "step": 46646 + }, + { + "epoch": 0.93296, + "grad_norm": 0.27964186668395996, + "learning_rate": 2.725648938448622e-07, + "loss": 0.2528, + "step": 46648 + }, + { + "epoch": 0.933, + "grad_norm": 0.4276241362094879, + "learning_rate": 2.7224121790603517e-07, + "loss": 0.0071, + "step": 46650 + }, + { + "epoch": 0.93304, + "grad_norm": 0.011206112802028656, + "learning_rate": 2.7191773161487354e-07, + "loss": 1.0213, + "step": 46652 + }, + { + "epoch": 0.93308, + "grad_norm": 0.06083129718899727, + "learning_rate": 2.7159443497768445e-07, + "loss": 0.0099, + "step": 46654 + }, + { + "epoch": 0.93312, + "grad_norm": 0.17745183408260345, + "learning_rate": 2.712713280007706e-07, + "loss": 0.0031, + "step": 46656 + }, + { + "epoch": 0.93316, + "grad_norm": 0.012027120217680931, + "learning_rate": 2.709484106904314e-07, + "loss": 0.0015, + "step": 46658 + }, + { + "epoch": 0.9332, + "grad_norm": 0.7049444913864136, + "learning_rate": 2.7062568305295967e-07, + "loss": 0.0035, + "step": 46660 + }, + { + "epoch": 0.93324, + "grad_norm": 0.2093285769224167, + "learning_rate": 2.703031450946514e-07, + "loss": 0.0024, + "step": 46662 + }, + { + "epoch": 0.93328, + "grad_norm": 0.7393057346343994, + "learning_rate": 2.699807968217916e-07, + "loss": 0.0095, + "step": 46664 + }, + { + "epoch": 0.93332, + "grad_norm": 0.06025678291916847, + "learning_rate": 2.696586382406663e-07, + "loss": 0.0007, + "step": 46666 + }, + { + "epoch": 0.93336, + "grad_norm": 0.10829240828752518, + "learning_rate": 2.69336669357555e-07, + "loss": 0.2849, + "step": 46668 + }, + { + "epoch": 0.9334, + "grad_norm": 0.009453226812183857, + "learning_rate": 2.6901489017873375e-07, + "loss": 0.0132, + "step": 46670 + }, + { + "epoch": 0.93344, + "grad_norm": 0.07589362561702728, + "learning_rate": 2.686933007104797e-07, + "loss": 0.0019, + "step": 46672 + }, + { + "epoch": 0.93348, + "grad_norm": 0.06853535026311874, + "learning_rate": 2.6837190095905795e-07, + "loss": 0.0031, + "step": 46674 + }, + { + "epoch": 0.93352, + "grad_norm": 0.815249502658844, + "learning_rate": 2.6805069093073567e-07, + "loss": 0.0103, + "step": 46676 + }, + { + "epoch": 0.93356, + "grad_norm": 0.014215490780770779, + "learning_rate": 2.6772967063177667e-07, + "loss": 0.0022, + "step": 46678 + }, + { + "epoch": 0.9336, + "grad_norm": 0.005186168011277914, + "learning_rate": 2.6740884006843826e-07, + "loss": 0.0072, + "step": 46680 + }, + { + "epoch": 0.93364, + "grad_norm": 0.10706274211406708, + "learning_rate": 2.670881992469754e-07, + "loss": 0.0011, + "step": 46682 + }, + { + "epoch": 0.93368, + "grad_norm": 0.2736431360244751, + "learning_rate": 2.667677481736375e-07, + "loss": 0.0046, + "step": 46684 + }, + { + "epoch": 0.93372, + "grad_norm": 0.14401178061962128, + "learning_rate": 2.66447486854674e-07, + "loss": 0.0017, + "step": 46686 + }, + { + "epoch": 0.93376, + "grad_norm": 0.14766114950180054, + "learning_rate": 2.661274152963278e-07, + "loss": 0.0043, + "step": 46688 + }, + { + "epoch": 0.9338, + "grad_norm": 0.03460688889026642, + "learning_rate": 2.6580753350484044e-07, + "loss": 0.0028, + "step": 46690 + }, + { + "epoch": 0.93384, + "grad_norm": 0.027290398254990578, + "learning_rate": 2.654878414864448e-07, + "loss": 0.0003, + "step": 46692 + }, + { + "epoch": 0.93388, + "grad_norm": 0.02638443559408188, + "learning_rate": 2.6516833924737697e-07, + "loss": 0.0016, + "step": 46694 + }, + { + "epoch": 0.93392, + "grad_norm": 0.02406461536884308, + "learning_rate": 2.6484902679386305e-07, + "loss": 0.001, + "step": 46696 + }, + { + "epoch": 0.93396, + "grad_norm": 0.011796439997851849, + "learning_rate": 2.645299041321292e-07, + "loss": 0.0008, + "step": 46698 + }, + { + "epoch": 0.934, + "grad_norm": 0.08157648891210556, + "learning_rate": 2.6421097126839714e-07, + "loss": 0.0052, + "step": 46700 + }, + { + "epoch": 0.93404, + "grad_norm": 0.044787175953388214, + "learning_rate": 2.6389222820888515e-07, + "loss": 0.8764, + "step": 46702 + }, + { + "epoch": 0.93408, + "grad_norm": 0.18157599866390228, + "learning_rate": 2.6357367495980724e-07, + "loss": 0.003, + "step": 46704 + }, + { + "epoch": 0.93412, + "grad_norm": 0.08215895295143127, + "learning_rate": 2.6325531152737174e-07, + "loss": 0.0011, + "step": 46706 + }, + { + "epoch": 0.93416, + "grad_norm": 0.04304298013448715, + "learning_rate": 2.62937137917787e-07, + "loss": 0.0108, + "step": 46708 + }, + { + "epoch": 0.9342, + "grad_norm": 8.461516380310059, + "learning_rate": 2.626191541372558e-07, + "loss": 0.1273, + "step": 46710 + }, + { + "epoch": 0.93424, + "grad_norm": 0.009758836589753628, + "learning_rate": 2.623013601919777e-07, + "loss": 0.0025, + "step": 46712 + }, + { + "epoch": 0.93428, + "grad_norm": 0.17823709547519684, + "learning_rate": 2.6198375608814774e-07, + "loss": 0.0022, + "step": 46714 + }, + { + "epoch": 0.93432, + "grad_norm": 0.009912054985761642, + "learning_rate": 2.6166634183195873e-07, + "loss": 0.0013, + "step": 46716 + }, + { + "epoch": 0.93436, + "grad_norm": 0.1478135883808136, + "learning_rate": 2.613491174295968e-07, + "loss": 0.0037, + "step": 46718 + }, + { + "epoch": 0.9344, + "grad_norm": 0.007020802702754736, + "learning_rate": 2.6103208288724815e-07, + "loss": 0.0005, + "step": 46720 + }, + { + "epoch": 0.93444, + "grad_norm": 0.011134977452456951, + "learning_rate": 2.607152382110933e-07, + "loss": 0.0011, + "step": 46722 + }, + { + "epoch": 0.93448, + "grad_norm": 0.0019539312925189734, + "learning_rate": 2.603985834073075e-07, + "loss": 0.0006, + "step": 46724 + }, + { + "epoch": 0.93452, + "grad_norm": 0.5618862509727478, + "learning_rate": 2.600821184820679e-07, + "loss": 0.0057, + "step": 46726 + }, + { + "epoch": 0.93456, + "grad_norm": 0.2361438274383545, + "learning_rate": 2.5976584344153955e-07, + "loss": 0.0045, + "step": 46728 + }, + { + "epoch": 0.9346, + "grad_norm": 0.25537732243537903, + "learning_rate": 2.59449758291892e-07, + "loss": 0.0029, + "step": 46730 + }, + { + "epoch": 0.93464, + "grad_norm": 0.004805430769920349, + "learning_rate": 2.591338630392859e-07, + "loss": 0.0001, + "step": 46732 + }, + { + "epoch": 0.93468, + "grad_norm": 0.06108749657869339, + "learning_rate": 2.588181576898807e-07, + "loss": 0.0009, + "step": 46734 + }, + { + "epoch": 0.93472, + "grad_norm": 0.1357797384262085, + "learning_rate": 2.5850264224983046e-07, + "loss": 0.0028, + "step": 46736 + }, + { + "epoch": 0.93476, + "grad_norm": 0.9378530383110046, + "learning_rate": 2.5818731672528574e-07, + "loss": 0.0066, + "step": 46738 + }, + { + "epoch": 0.9348, + "grad_norm": 0.00924795214086771, + "learning_rate": 2.57872181122395e-07, + "loss": 0.0022, + "step": 46740 + }, + { + "epoch": 0.93484, + "grad_norm": 0.09744500368833542, + "learning_rate": 2.575572354473022e-07, + "loss": 0.0009, + "step": 46742 + }, + { + "epoch": 0.93488, + "grad_norm": 0.15073241293430328, + "learning_rate": 2.572424797061457e-07, + "loss": 0.0019, + "step": 46744 + }, + { + "epoch": 0.93492, + "grad_norm": 0.0112692816182971, + "learning_rate": 2.56927913905064e-07, + "loss": 0.0011, + "step": 46746 + }, + { + "epoch": 0.93496, + "grad_norm": 0.3922451436519623, + "learning_rate": 2.5661353805018886e-07, + "loss": 0.0025, + "step": 46748 + }, + { + "epoch": 0.935, + "grad_norm": 0.0740840956568718, + "learning_rate": 2.5629935214764866e-07, + "loss": 0.0025, + "step": 46750 + }, + { + "epoch": 0.93504, + "grad_norm": 0.03230872377753258, + "learning_rate": 2.559853562035686e-07, + "loss": 0.0003, + "step": 46752 + }, + { + "epoch": 0.93508, + "grad_norm": 0.016413060948252678, + "learning_rate": 2.556715502240703e-07, + "loss": 0.0134, + "step": 46754 + }, + { + "epoch": 0.93512, + "grad_norm": 0.022929400205612183, + "learning_rate": 2.553579342152723e-07, + "loss": 0.0008, + "step": 46756 + }, + { + "epoch": 0.93516, + "grad_norm": 0.4087289273738861, + "learning_rate": 2.5504450818328865e-07, + "loss": 0.0029, + "step": 46758 + }, + { + "epoch": 0.9352, + "grad_norm": 0.018151836469769478, + "learning_rate": 2.547312721342277e-07, + "loss": 0.0028, + "step": 46760 + }, + { + "epoch": 0.93524, + "grad_norm": 0.04928950220346451, + "learning_rate": 2.544182260742001e-07, + "loss": 0.0005, + "step": 46762 + }, + { + "epoch": 0.93528, + "grad_norm": 0.027983451262116432, + "learning_rate": 2.541053700093055e-07, + "loss": 0.8554, + "step": 46764 + }, + { + "epoch": 0.93532, + "grad_norm": 0.03591723367571831, + "learning_rate": 2.537927039456445e-07, + "loss": 0.0008, + "step": 46766 + }, + { + "epoch": 0.93536, + "grad_norm": 1.6240695714950562, + "learning_rate": 2.5348022788931227e-07, + "loss": 0.015, + "step": 46768 + }, + { + "epoch": 0.9354, + "grad_norm": 0.015412972308695316, + "learning_rate": 2.5316794184640056e-07, + "loss": 0.002, + "step": 46770 + }, + { + "epoch": 0.93544, + "grad_norm": 0.22595302760601044, + "learning_rate": 2.5285584582299793e-07, + "loss": 0.0015, + "step": 46772 + }, + { + "epoch": 0.93548, + "grad_norm": 0.33722400665283203, + "learning_rate": 2.5254393982518945e-07, + "loss": 0.0041, + "step": 46774 + }, + { + "epoch": 0.93552, + "grad_norm": 0.03740725666284561, + "learning_rate": 2.5223222385905467e-07, + "loss": 0.0023, + "step": 46776 + }, + { + "epoch": 0.93556, + "grad_norm": 0.18866153061389923, + "learning_rate": 2.51920697930671e-07, + "loss": 0.0033, + "step": 46778 + }, + { + "epoch": 0.9356, + "grad_norm": 0.022283509373664856, + "learning_rate": 2.516093620461124e-07, + "loss": 0.0094, + "step": 46780 + }, + { + "epoch": 0.93564, + "grad_norm": 0.025310812518000603, + "learning_rate": 2.5129821621144634e-07, + "loss": 0.0017, + "step": 46782 + }, + { + "epoch": 0.93568, + "grad_norm": 0.1864365041255951, + "learning_rate": 2.509872604327435e-07, + "loss": 0.0017, + "step": 46784 + }, + { + "epoch": 0.93572, + "grad_norm": 0.036642033606767654, + "learning_rate": 2.5067649471606117e-07, + "loss": 0.0034, + "step": 46786 + }, + { + "epoch": 0.93576, + "grad_norm": 0.044314030557870865, + "learning_rate": 2.503659190674601e-07, + "loss": 0.0021, + "step": 46788 + }, + { + "epoch": 0.9358, + "grad_norm": 0.2031516283750534, + "learning_rate": 2.500555334929955e-07, + "loss": 0.0138, + "step": 46790 + }, + { + "epoch": 0.93584, + "grad_norm": 0.29209673404693604, + "learning_rate": 2.497453379987169e-07, + "loss": 0.0027, + "step": 46792 + }, + { + "epoch": 0.93588, + "grad_norm": 0.45235320925712585, + "learning_rate": 2.49435332590674e-07, + "loss": 0.0049, + "step": 46794 + }, + { + "epoch": 0.93592, + "grad_norm": 0.9690679311752319, + "learning_rate": 2.491255172749085e-07, + "loss": 0.0093, + "step": 46796 + }, + { + "epoch": 0.93596, + "grad_norm": 0.0778161808848381, + "learning_rate": 2.488158920574613e-07, + "loss": 0.0021, + "step": 46798 + }, + { + "epoch": 0.936, + "grad_norm": 0.40542832016944885, + "learning_rate": 2.4850645694436736e-07, + "loss": 0.0054, + "step": 46800 + }, + { + "epoch": 0.93604, + "grad_norm": 1.738165259361267, + "learning_rate": 2.4819721194166205e-07, + "loss": 0.015, + "step": 46802 + }, + { + "epoch": 0.93608, + "grad_norm": 0.35908088088035583, + "learning_rate": 2.478881570553726e-07, + "loss": 0.0125, + "step": 46804 + }, + { + "epoch": 0.93612, + "grad_norm": 0.026089271530508995, + "learning_rate": 2.4757929229152325e-07, + "loss": 0.0014, + "step": 46806 + }, + { + "epoch": 0.93616, + "grad_norm": 0.17172613739967346, + "learning_rate": 2.472706176561368e-07, + "loss": 0.0019, + "step": 46808 + }, + { + "epoch": 0.9362, + "grad_norm": 0.018785584717988968, + "learning_rate": 2.4696213315523074e-07, + "loss": 0.0003, + "step": 46810 + }, + { + "epoch": 0.93624, + "grad_norm": 0.4665956497192383, + "learning_rate": 2.466538387948203e-07, + "loss": 0.0053, + "step": 46812 + }, + { + "epoch": 0.93628, + "grad_norm": 0.006476337090134621, + "learning_rate": 2.4634573458091173e-07, + "loss": 0.0582, + "step": 46814 + }, + { + "epoch": 0.93632, + "grad_norm": 0.014218726195394993, + "learning_rate": 2.4603782051951795e-07, + "loss": 0.0022, + "step": 46816 + }, + { + "epoch": 0.93636, + "grad_norm": 0.20912258327007294, + "learning_rate": 2.457300966166365e-07, + "loss": 0.002, + "step": 46818 + }, + { + "epoch": 0.9364, + "grad_norm": 0.1584133803844452, + "learning_rate": 2.4542256287826915e-07, + "loss": 0.0018, + "step": 46820 + }, + { + "epoch": 0.93644, + "grad_norm": 0.0926467627286911, + "learning_rate": 2.4511521931041114e-07, + "loss": 0.0011, + "step": 46822 + }, + { + "epoch": 0.93648, + "grad_norm": 0.21056625247001648, + "learning_rate": 2.4480806591905325e-07, + "loss": 0.0017, + "step": 46824 + }, + { + "epoch": 0.93652, + "grad_norm": 0.003014845307916403, + "learning_rate": 2.445011027101862e-07, + "loss": 0.0002, + "step": 46826 + }, + { + "epoch": 0.93656, + "grad_norm": 0.24934056401252747, + "learning_rate": 2.4419432968979086e-07, + "loss": 0.0045, + "step": 46828 + }, + { + "epoch": 0.9366, + "grad_norm": 0.22146303951740265, + "learning_rate": 2.4388774686385007e-07, + "loss": 0.0026, + "step": 46830 + }, + { + "epoch": 0.93664, + "grad_norm": 0.14694035053253174, + "learning_rate": 2.435813542383403e-07, + "loss": 0.0017, + "step": 46832 + }, + { + "epoch": 0.93668, + "grad_norm": 0.04883898049592972, + "learning_rate": 2.432751518192356e-07, + "loss": 0.0036, + "step": 46834 + }, + { + "epoch": 0.93672, + "grad_norm": 1.0896224975585938, + "learning_rate": 2.4296913961250446e-07, + "loss": 0.0095, + "step": 46836 + }, + { + "epoch": 0.93676, + "grad_norm": 0.008405135944485664, + "learning_rate": 2.4266331762411446e-07, + "loss": 0.0073, + "step": 46838 + }, + { + "epoch": 0.9368, + "grad_norm": 1.9789398908615112, + "learning_rate": 2.423576858600252e-07, + "loss": 0.0228, + "step": 46840 + }, + { + "epoch": 0.93684, + "grad_norm": 0.2512096166610718, + "learning_rate": 2.420522443261963e-07, + "loss": 0.0025, + "step": 46842 + }, + { + "epoch": 0.93688, + "grad_norm": 0.437448650598526, + "learning_rate": 2.417469930285843e-07, + "loss": 0.0037, + "step": 46844 + }, + { + "epoch": 0.93692, + "grad_norm": 0.28868523240089417, + "learning_rate": 2.414419319731354e-07, + "loss": 0.0087, + "step": 46846 + }, + { + "epoch": 0.93696, + "grad_norm": 0.038380227982997894, + "learning_rate": 2.411370611658026e-07, + "loss": 0.0009, + "step": 46848 + }, + { + "epoch": 0.937, + "grad_norm": 1.5728596448898315, + "learning_rate": 2.4083238061252565e-07, + "loss": 0.0118, + "step": 46850 + }, + { + "epoch": 0.93704, + "grad_norm": 0.7614489793777466, + "learning_rate": 2.4052789031924764e-07, + "loss": 0.0084, + "step": 46852 + }, + { + "epoch": 0.93708, + "grad_norm": 0.003409582655876875, + "learning_rate": 2.4022359029190144e-07, + "loss": 0.0009, + "step": 46854 + }, + { + "epoch": 0.93712, + "grad_norm": 1.726511836051941, + "learning_rate": 2.399194805364213e-07, + "loss": 0.019, + "step": 46856 + }, + { + "epoch": 0.93716, + "grad_norm": 0.10112373530864716, + "learning_rate": 2.396155610587358e-07, + "loss": 0.0019, + "step": 46858 + }, + { + "epoch": 0.9372, + "grad_norm": 0.01739409752190113, + "learning_rate": 2.3931183186477026e-07, + "loss": 0.001, + "step": 46860 + }, + { + "epoch": 0.93724, + "grad_norm": 0.008299008943140507, + "learning_rate": 2.390082929604454e-07, + "loss": 0.0002, + "step": 46862 + }, + { + "epoch": 0.93728, + "grad_norm": 0.5275524258613586, + "learning_rate": 2.387049443516787e-07, + "loss": 0.005, + "step": 46864 + }, + { + "epoch": 0.93732, + "grad_norm": 0.32124191522598267, + "learning_rate": 2.3840178604438435e-07, + "loss": 0.0029, + "step": 46866 + }, + { + "epoch": 0.93736, + "grad_norm": 0.06162375211715698, + "learning_rate": 2.3809881804447322e-07, + "loss": 0.0014, + "step": 46868 + }, + { + "epoch": 0.9374, + "grad_norm": 0.004958147183060646, + "learning_rate": 2.3779604035785277e-07, + "loss": 0.0004, + "step": 46870 + }, + { + "epoch": 0.93744, + "grad_norm": 1.077763319015503, + "learning_rate": 2.374934529904227e-07, + "loss": 0.0173, + "step": 46872 + }, + { + "epoch": 0.93748, + "grad_norm": 0.031096940860152245, + "learning_rate": 2.3719105594808388e-07, + "loss": 0.0009, + "step": 46874 + }, + { + "epoch": 0.93752, + "grad_norm": 0.2753640413284302, + "learning_rate": 2.3688884923673272e-07, + "loss": 0.0041, + "step": 46876 + }, + { + "epoch": 0.93756, + "grad_norm": 0.16470910608768463, + "learning_rate": 2.3658683286225892e-07, + "loss": 0.0129, + "step": 46878 + }, + { + "epoch": 0.9376, + "grad_norm": 0.09339933842420578, + "learning_rate": 2.3628500683055222e-07, + "loss": 0.0053, + "step": 46880 + }, + { + "epoch": 0.93764, + "grad_norm": 0.3967657685279846, + "learning_rate": 2.359833711474946e-07, + "loss": 0.0038, + "step": 46882 + }, + { + "epoch": 0.93768, + "grad_norm": 3.2769997119903564, + "learning_rate": 2.356819258189691e-07, + "loss": 0.0493, + "step": 46884 + }, + { + "epoch": 0.93772, + "grad_norm": 2.366994857788086, + "learning_rate": 2.3538067085085103e-07, + "loss": 0.0268, + "step": 46886 + }, + { + "epoch": 0.93776, + "grad_norm": 0.5648670792579651, + "learning_rate": 2.350796062490146e-07, + "loss": 0.0041, + "step": 46888 + }, + { + "epoch": 0.9378, + "grad_norm": 0.059789568185806274, + "learning_rate": 2.3477873201932733e-07, + "loss": 0.0013, + "step": 46890 + }, + { + "epoch": 0.93784, + "grad_norm": 0.09133592993021011, + "learning_rate": 2.3447804816765784e-07, + "loss": 0.001, + "step": 46892 + }, + { + "epoch": 0.93788, + "grad_norm": 0.022504059597849846, + "learning_rate": 2.3417755469986591e-07, + "loss": 0.0029, + "step": 46894 + }, + { + "epoch": 0.93792, + "grad_norm": 0.007445306982845068, + "learning_rate": 2.3387725162180906e-07, + "loss": 0.0001, + "step": 46896 + }, + { + "epoch": 0.93796, + "grad_norm": 0.09141288697719574, + "learning_rate": 2.3357713893934487e-07, + "loss": 0.0025, + "step": 46898 + }, + { + "epoch": 0.938, + "grad_norm": 0.06623678654432297, + "learning_rate": 2.332772166583208e-07, + "loss": 0.0024, + "step": 46900 + }, + { + "epoch": 0.93804, + "grad_norm": 0.008727355860173702, + "learning_rate": 2.3297748478458782e-07, + "loss": 0.0001, + "step": 46902 + }, + { + "epoch": 0.93808, + "grad_norm": 0.38208848237991333, + "learning_rate": 2.3267794332398454e-07, + "loss": 0.0038, + "step": 46904 + }, + { + "epoch": 0.93812, + "grad_norm": 0.01551750861108303, + "learning_rate": 2.3237859228235515e-07, + "loss": 0.0083, + "step": 46906 + }, + { + "epoch": 0.93816, + "grad_norm": 0.043425124138593674, + "learning_rate": 2.3207943166553282e-07, + "loss": 0.0007, + "step": 46908 + }, + { + "epoch": 0.9382, + "grad_norm": 0.0014627272030338645, + "learning_rate": 2.3178046147935173e-07, + "loss": 0.0267, + "step": 46910 + }, + { + "epoch": 0.93824, + "grad_norm": 0.054567936807870865, + "learning_rate": 2.3148168172963946e-07, + "loss": 0.0005, + "step": 46912 + }, + { + "epoch": 0.93828, + "grad_norm": 0.01625237613916397, + "learning_rate": 2.3118309242221914e-07, + "loss": 0.015, + "step": 46914 + }, + { + "epoch": 0.93832, + "grad_norm": 0.01658705808222294, + "learning_rate": 2.308846935629161e-07, + "loss": 0.0002, + "step": 46916 + }, + { + "epoch": 0.93836, + "grad_norm": 0.12415102869272232, + "learning_rate": 2.3058648515754456e-07, + "loss": 0.0026, + "step": 46918 + }, + { + "epoch": 0.9384, + "grad_norm": 0.009545561857521534, + "learning_rate": 2.3028846721191878e-07, + "loss": 0.0005, + "step": 46920 + }, + { + "epoch": 0.93844, + "grad_norm": 0.21118275821208954, + "learning_rate": 2.2999063973184965e-07, + "loss": 0.0031, + "step": 46922 + }, + { + "epoch": 0.93848, + "grad_norm": 0.07561810314655304, + "learning_rate": 2.2969300272314254e-07, + "loss": 0.0046, + "step": 46924 + }, + { + "epoch": 0.93852, + "grad_norm": 0.122555211186409, + "learning_rate": 2.293955561916017e-07, + "loss": 0.0019, + "step": 46926 + }, + { + "epoch": 0.93856, + "grad_norm": 1.3659523725509644, + "learning_rate": 2.2909830014302359e-07, + "loss": 0.02, + "step": 46928 + }, + { + "epoch": 0.9386, + "grad_norm": 0.057879023253917694, + "learning_rate": 2.288012345832047e-07, + "loss": 0.0009, + "step": 46930 + }, + { + "epoch": 0.93864, + "grad_norm": 0.8544695377349854, + "learning_rate": 2.2850435951793592e-07, + "loss": 0.0073, + "step": 46932 + }, + { + "epoch": 0.93868, + "grad_norm": 0.8794114589691162, + "learning_rate": 2.28207674953006e-07, + "loss": 0.0075, + "step": 46934 + }, + { + "epoch": 0.93872, + "grad_norm": 0.030490171164274216, + "learning_rate": 2.2791118089419696e-07, + "loss": 0.0019, + "step": 46936 + }, + { + "epoch": 0.93876, + "grad_norm": 0.14048044383525848, + "learning_rate": 2.2761487734729192e-07, + "loss": 0.0016, + "step": 46938 + }, + { + "epoch": 0.9388, + "grad_norm": 0.02321486547589302, + "learning_rate": 2.273187643180652e-07, + "loss": 0.0005, + "step": 46940 + }, + { + "epoch": 0.93884, + "grad_norm": 1.0099537372589111, + "learning_rate": 2.2702284181229106e-07, + "loss": 0.0098, + "step": 46942 + }, + { + "epoch": 0.93888, + "grad_norm": 0.15203922986984253, + "learning_rate": 2.267271098357393e-07, + "loss": 0.0021, + "step": 46944 + }, + { + "epoch": 0.93892, + "grad_norm": 2.1518261432647705, + "learning_rate": 2.264315683941709e-07, + "loss": 0.0271, + "step": 46946 + }, + { + "epoch": 0.93896, + "grad_norm": 0.014911146834492683, + "learning_rate": 2.2613621749335347e-07, + "loss": 0.001, + "step": 46948 + }, + { + "epoch": 0.939, + "grad_norm": 0.008593350648880005, + "learning_rate": 2.2584105713904126e-07, + "loss": 0.0002, + "step": 46950 + }, + { + "epoch": 0.93904, + "grad_norm": 0.09294839948415756, + "learning_rate": 2.2554608733699079e-07, + "loss": 0.0014, + "step": 46952 + }, + { + "epoch": 0.93908, + "grad_norm": 0.060208193957805634, + "learning_rate": 2.252513080929497e-07, + "loss": 0.004, + "step": 46954 + }, + { + "epoch": 0.93912, + "grad_norm": 0.038976311683654785, + "learning_rate": 2.2495671941266784e-07, + "loss": 0.0037, + "step": 46956 + }, + { + "epoch": 0.93916, + "grad_norm": 0.021100327372550964, + "learning_rate": 2.2466232130188726e-07, + "loss": 0.0003, + "step": 46958 + }, + { + "epoch": 0.9392, + "grad_norm": 0.0022076833993196487, + "learning_rate": 2.2436811376634893e-07, + "loss": 0.0008, + "step": 46960 + }, + { + "epoch": 0.93924, + "grad_norm": 0.01377617847174406, + "learning_rate": 2.2407409681178494e-07, + "loss": 0.0008, + "step": 46962 + }, + { + "epoch": 0.93928, + "grad_norm": 0.06084601953625679, + "learning_rate": 2.2378027044392958e-07, + "loss": 0.0014, + "step": 46964 + }, + { + "epoch": 0.93932, + "grad_norm": 0.05589309334754944, + "learning_rate": 2.2348663466851272e-07, + "loss": 0.0173, + "step": 46966 + }, + { + "epoch": 0.93936, + "grad_norm": 0.150962233543396, + "learning_rate": 2.2319318949125423e-07, + "loss": 0.0105, + "step": 46968 + }, + { + "epoch": 0.9394, + "grad_norm": 0.0040695806965231895, + "learning_rate": 2.2289993491788065e-07, + "loss": 0.0031, + "step": 46970 + }, + { + "epoch": 0.93944, + "grad_norm": 0.2700886130332947, + "learning_rate": 2.2260687095410515e-07, + "loss": 0.0019, + "step": 46972 + }, + { + "epoch": 0.93948, + "grad_norm": 0.06861413270235062, + "learning_rate": 2.223139976056432e-07, + "loss": 0.0023, + "step": 46974 + }, + { + "epoch": 0.93952, + "grad_norm": 0.026361793279647827, + "learning_rate": 2.2202131487820244e-07, + "loss": 0.0006, + "step": 46976 + }, + { + "epoch": 0.93956, + "grad_norm": 0.21626979112625122, + "learning_rate": 2.2172882277749162e-07, + "loss": 0.0022, + "step": 46978 + }, + { + "epoch": 0.9396, + "grad_norm": 0.02945089340209961, + "learning_rate": 2.214365213092118e-07, + "loss": 0.0074, + "step": 46980 + }, + { + "epoch": 0.93964, + "grad_norm": 0.03077578730881214, + "learning_rate": 2.2114441047906054e-07, + "loss": 0.0003, + "step": 46982 + }, + { + "epoch": 0.93968, + "grad_norm": 0.08092815428972244, + "learning_rate": 2.2085249029273337e-07, + "loss": 0.0063, + "step": 46984 + }, + { + "epoch": 0.93972, + "grad_norm": 0.1735752373933792, + "learning_rate": 2.2056076075592125e-07, + "loss": 0.0063, + "step": 46986 + }, + { + "epoch": 0.93976, + "grad_norm": 0.07814636081457138, + "learning_rate": 2.202692218743119e-07, + "loss": 0.008, + "step": 46988 + }, + { + "epoch": 0.9398, + "grad_norm": 0.8288397789001465, + "learning_rate": 2.1997787365358958e-07, + "loss": 0.0086, + "step": 46990 + }, + { + "epoch": 0.93984, + "grad_norm": 0.1957339346408844, + "learning_rate": 2.1968671609943426e-07, + "loss": 0.0027, + "step": 46992 + }, + { + "epoch": 0.93988, + "grad_norm": 0.029398499056696892, + "learning_rate": 2.1939574921752026e-07, + "loss": 0.0015, + "step": 46994 + }, + { + "epoch": 0.93992, + "grad_norm": 0.10905970633029938, + "learning_rate": 2.1910497301352308e-07, + "loss": 0.0012, + "step": 46996 + }, + { + "epoch": 0.93996, + "grad_norm": 0.28966575860977173, + "learning_rate": 2.1881438749310925e-07, + "loss": 0.0028, + "step": 46998 + }, + { + "epoch": 0.94, + "grad_norm": 0.019403424113988876, + "learning_rate": 2.1852399266194312e-07, + "loss": 0.001, + "step": 47000 + }, + { + "epoch": 0.94004, + "grad_norm": 0.05166294798254967, + "learning_rate": 2.182337885256902e-07, + "loss": 0.0049, + "step": 47002 + }, + { + "epoch": 0.94008, + "grad_norm": 0.05394904315471649, + "learning_rate": 2.1794377509000375e-07, + "loss": 0.0017, + "step": 47004 + }, + { + "epoch": 0.94012, + "grad_norm": 0.030473314225673676, + "learning_rate": 2.1765395236054032e-07, + "loss": 0.0013, + "step": 47006 + }, + { + "epoch": 0.94016, + "grad_norm": 0.13439656794071198, + "learning_rate": 2.1736432034294875e-07, + "loss": 0.0017, + "step": 47008 + }, + { + "epoch": 0.9402, + "grad_norm": 0.00998743437230587, + "learning_rate": 2.1707487904287672e-07, + "loss": 0.0057, + "step": 47010 + }, + { + "epoch": 0.94024, + "grad_norm": 0.02085278369486332, + "learning_rate": 2.167856284659675e-07, + "loss": 0.0004, + "step": 47012 + }, + { + "epoch": 0.94028, + "grad_norm": 0.06944169104099274, + "learning_rate": 2.1649656861785662e-07, + "loss": 0.0005, + "step": 47014 + }, + { + "epoch": 0.94032, + "grad_norm": 0.0009318895172327757, + "learning_rate": 2.1620769950418508e-07, + "loss": 0.0148, + "step": 47016 + }, + { + "epoch": 0.94036, + "grad_norm": 0.01570315659046173, + "learning_rate": 2.1591902113057949e-07, + "loss": 0.0007, + "step": 47018 + }, + { + "epoch": 0.9404, + "grad_norm": 0.3180919289588928, + "learning_rate": 2.1563053350266983e-07, + "loss": 0.0028, + "step": 47020 + }, + { + "epoch": 0.94044, + "grad_norm": 0.24191495776176453, + "learning_rate": 2.1534223662608044e-07, + "loss": 0.0017, + "step": 47022 + }, + { + "epoch": 0.94048, + "grad_norm": 0.01899532973766327, + "learning_rate": 2.1505413050643243e-07, + "loss": 0.0016, + "step": 47024 + }, + { + "epoch": 0.94052, + "grad_norm": 0.039526261389255524, + "learning_rate": 2.1476621514934127e-07, + "loss": 0.0019, + "step": 47026 + }, + { + "epoch": 0.94056, + "grad_norm": 0.11550043523311615, + "learning_rate": 2.1447849056042024e-07, + "loss": 0.0014, + "step": 47028 + }, + { + "epoch": 0.9406, + "grad_norm": 1.6020522117614746, + "learning_rate": 2.1419095674527934e-07, + "loss": 0.0133, + "step": 47030 + }, + { + "epoch": 0.94064, + "grad_norm": 0.3033439517021179, + "learning_rate": 2.1390361370952295e-07, + "loss": 0.0023, + "step": 47032 + }, + { + "epoch": 0.94068, + "grad_norm": 0.44741740822792053, + "learning_rate": 2.1361646145875548e-07, + "loss": 0.0053, + "step": 47034 + }, + { + "epoch": 0.94072, + "grad_norm": 0.3111584484577179, + "learning_rate": 2.1332949999857132e-07, + "loss": 0.004, + "step": 47036 + }, + { + "epoch": 0.94076, + "grad_norm": 0.008286155760288239, + "learning_rate": 2.1304272933456937e-07, + "loss": 0.005, + "step": 47038 + }, + { + "epoch": 0.9408, + "grad_norm": 0.008917824365198612, + "learning_rate": 2.1275614947233624e-07, + "loss": 0.0001, + "step": 47040 + }, + { + "epoch": 0.94084, + "grad_norm": 0.0412081740796566, + "learning_rate": 2.1246976041746193e-07, + "loss": 0.0005, + "step": 47042 + }, + { + "epoch": 0.94088, + "grad_norm": 0.011846023611724377, + "learning_rate": 2.121835621755275e-07, + "loss": 0.0002, + "step": 47044 + }, + { + "epoch": 0.94092, + "grad_norm": 0.24371202290058136, + "learning_rate": 2.1189755475211404e-07, + "loss": 0.0027, + "step": 47046 + }, + { + "epoch": 0.94096, + "grad_norm": 0.008000660687685013, + "learning_rate": 2.1161173815279824e-07, + "loss": 0.001, + "step": 47048 + }, + { + "epoch": 0.941, + "grad_norm": 0.02179463766515255, + "learning_rate": 2.1132611238315004e-07, + "loss": 0.0005, + "step": 47050 + }, + { + "epoch": 0.94104, + "grad_norm": 0.042330965399742126, + "learning_rate": 2.1104067744873835e-07, + "loss": 0.0004, + "step": 47052 + }, + { + "epoch": 0.94108, + "grad_norm": 0.13142991065979004, + "learning_rate": 2.1075543335512872e-07, + "loss": 0.0485, + "step": 47054 + }, + { + "epoch": 0.94112, + "grad_norm": 0.008610725402832031, + "learning_rate": 2.1047038010788334e-07, + "loss": 0.0002, + "step": 47056 + }, + { + "epoch": 0.94116, + "grad_norm": 0.5717132091522217, + "learning_rate": 2.1018551771255558e-07, + "loss": 0.0067, + "step": 47058 + }, + { + "epoch": 0.9412, + "grad_norm": 0.022669006139039993, + "learning_rate": 2.0990084617470207e-07, + "loss": 0.0003, + "step": 47060 + }, + { + "epoch": 0.94124, + "grad_norm": 0.04155811667442322, + "learning_rate": 2.0961636549987175e-07, + "loss": 0.0004, + "step": 47062 + }, + { + "epoch": 0.94128, + "grad_norm": 0.020106153562664986, + "learning_rate": 2.0933207569361124e-07, + "loss": 0.0011, + "step": 47064 + }, + { + "epoch": 0.94132, + "grad_norm": 0.09419609606266022, + "learning_rate": 2.0904797676146282e-07, + "loss": 0.0014, + "step": 47066 + }, + { + "epoch": 0.94136, + "grad_norm": 0.06640073657035828, + "learning_rate": 2.0876406870896315e-07, + "loss": 0.002, + "step": 47068 + }, + { + "epoch": 0.9414, + "grad_norm": 0.32559478282928467, + "learning_rate": 2.0848035154165113e-07, + "loss": 0.0035, + "step": 47070 + }, + { + "epoch": 0.94144, + "grad_norm": 0.16931436955928802, + "learning_rate": 2.0819682526505347e-07, + "loss": 0.0017, + "step": 47072 + }, + { + "epoch": 0.94148, + "grad_norm": 0.0019871117547154427, + "learning_rate": 2.079134898847013e-07, + "loss": 0.0008, + "step": 47074 + }, + { + "epoch": 0.94152, + "grad_norm": 0.2407904416322708, + "learning_rate": 2.076303454061157e-07, + "loss": 0.0027, + "step": 47076 + }, + { + "epoch": 0.94156, + "grad_norm": 15.482378959655762, + "learning_rate": 2.07347391834819e-07, + "loss": 0.4499, + "step": 47078 + }, + { + "epoch": 0.9416, + "grad_norm": 0.5148418545722961, + "learning_rate": 2.0706462917632676e-07, + "loss": 0.0238, + "step": 47080 + }, + { + "epoch": 0.94164, + "grad_norm": 0.07515294849872589, + "learning_rate": 2.067820574361501e-07, + "loss": 0.0069, + "step": 47082 + }, + { + "epoch": 0.94168, + "grad_norm": 0.013327146880328655, + "learning_rate": 2.064996766198002e-07, + "loss": 0.0006, + "step": 47084 + }, + { + "epoch": 0.94172, + "grad_norm": 0.1758868247270584, + "learning_rate": 2.0621748673278042e-07, + "loss": 0.0021, + "step": 47086 + }, + { + "epoch": 0.94176, + "grad_norm": 0.1870318502187729, + "learning_rate": 2.0593548778059414e-07, + "loss": 0.0029, + "step": 47088 + }, + { + "epoch": 0.9418, + "grad_norm": 0.033628880977630615, + "learning_rate": 2.0565367976873584e-07, + "loss": 0.0009, + "step": 47090 + }, + { + "epoch": 0.94184, + "grad_norm": 0.006453545764088631, + "learning_rate": 2.0537206270270338e-07, + "loss": 0.0002, + "step": 47092 + }, + { + "epoch": 0.94188, + "grad_norm": 4.585812091827393, + "learning_rate": 2.0509063658798457e-07, + "loss": 0.0488, + "step": 47094 + }, + { + "epoch": 0.94192, + "grad_norm": 18.67816734313965, + "learning_rate": 2.0480940143006612e-07, + "loss": 0.666, + "step": 47096 + }, + { + "epoch": 0.94196, + "grad_norm": 0.014409041963517666, + "learning_rate": 2.0452835723443142e-07, + "loss": 0.0006, + "step": 47098 + }, + { + "epoch": 0.942, + "grad_norm": 0.005920633673667908, + "learning_rate": 2.0424750400655947e-07, + "loss": 0.0003, + "step": 47100 + }, + { + "epoch": 0.94204, + "grad_norm": 0.12909628450870514, + "learning_rate": 2.0396684175192583e-07, + "loss": 0.0015, + "step": 47102 + }, + { + "epoch": 0.94208, + "grad_norm": 6.804061558796093e-05, + "learning_rate": 2.036863704760017e-07, + "loss": 0.0011, + "step": 47104 + }, + { + "epoch": 0.94212, + "grad_norm": 0.05968155711889267, + "learning_rate": 2.0340609018425605e-07, + "loss": 0.0036, + "step": 47106 + }, + { + "epoch": 0.94216, + "grad_norm": 0.06024816632270813, + "learning_rate": 2.0312600088215119e-07, + "loss": 0.0005, + "step": 47108 + }, + { + "epoch": 0.9422, + "grad_norm": 0.2366897016763687, + "learning_rate": 2.0284610257514936e-07, + "loss": 0.0021, + "step": 47110 + }, + { + "epoch": 0.94224, + "grad_norm": 0.0011153091909363866, + "learning_rate": 2.0256639526870624e-07, + "loss": 0.003, + "step": 47112 + }, + { + "epoch": 0.94228, + "grad_norm": 0.17817461490631104, + "learning_rate": 2.0228687896827637e-07, + "loss": 0.0025, + "step": 47114 + }, + { + "epoch": 0.94232, + "grad_norm": 0.052317261695861816, + "learning_rate": 2.0200755367930646e-07, + "loss": 0.007, + "step": 47116 + }, + { + "epoch": 0.94236, + "grad_norm": 0.8696808815002441, + "learning_rate": 2.0172841940724442e-07, + "loss": 0.0093, + "step": 47118 + }, + { + "epoch": 0.9424, + "grad_norm": 0.04541279375553131, + "learning_rate": 2.014494761575314e-07, + "loss": 0.0031, + "step": 47120 + }, + { + "epoch": 0.94244, + "grad_norm": 0.004784082528203726, + "learning_rate": 2.0117072393560533e-07, + "loss": 0.0057, + "step": 47122 + }, + { + "epoch": 0.94248, + "grad_norm": 0.10037579387426376, + "learning_rate": 2.0089216274690182e-07, + "loss": 0.0013, + "step": 47124 + }, + { + "epoch": 0.94252, + "grad_norm": 0.07006374001502991, + "learning_rate": 2.0061379259684764e-07, + "loss": 0.0089, + "step": 47126 + }, + { + "epoch": 0.94256, + "grad_norm": 0.00748651335015893, + "learning_rate": 2.0033561349087627e-07, + "loss": 0.0028, + "step": 47128 + }, + { + "epoch": 0.9426, + "grad_norm": 0.011859873309731483, + "learning_rate": 2.0005762543440444e-07, + "loss": 0.0045, + "step": 47130 + }, + { + "epoch": 0.94264, + "grad_norm": 0.05311233922839165, + "learning_rate": 1.9977982843285559e-07, + "loss": 0.0006, + "step": 47132 + }, + { + "epoch": 0.94268, + "grad_norm": 0.35341379046440125, + "learning_rate": 1.9950222249164542e-07, + "loss": 0.0032, + "step": 47134 + }, + { + "epoch": 0.94272, + "grad_norm": 0.08082398027181625, + "learning_rate": 1.9922480761618402e-07, + "loss": 0.0016, + "step": 47136 + }, + { + "epoch": 0.94276, + "grad_norm": 0.11405527591705322, + "learning_rate": 1.9894758381188152e-07, + "loss": 0.0014, + "step": 47138 + }, + { + "epoch": 0.9428, + "grad_norm": 0.00888507068157196, + "learning_rate": 1.9867055108414023e-07, + "loss": 0.0028, + "step": 47140 + }, + { + "epoch": 0.94284, + "grad_norm": 0.13319425284862518, + "learning_rate": 1.9839370943836366e-07, + "loss": 0.0017, + "step": 47142 + }, + { + "epoch": 0.94288, + "grad_norm": 0.01282294001430273, + "learning_rate": 1.9811705887994748e-07, + "loss": 0.0378, + "step": 47144 + }, + { + "epoch": 0.94292, + "grad_norm": 0.07805546373128891, + "learning_rate": 1.9784059941428734e-07, + "loss": 0.0024, + "step": 47146 + }, + { + "epoch": 0.94296, + "grad_norm": 0.03182089328765869, + "learning_rate": 1.9756433104677008e-07, + "loss": 0.0022, + "step": 47148 + }, + { + "epoch": 0.943, + "grad_norm": 0.035585712641477585, + "learning_rate": 1.9728825378278248e-07, + "loss": 0.0048, + "step": 47150 + }, + { + "epoch": 0.94304, + "grad_norm": 0.6290593147277832, + "learning_rate": 1.9701236762770692e-07, + "loss": 0.0083, + "step": 47152 + }, + { + "epoch": 0.94308, + "grad_norm": 0.007634189911186695, + "learning_rate": 1.9673667258692242e-07, + "loss": 0.0003, + "step": 47154 + }, + { + "epoch": 0.94312, + "grad_norm": 0.03200457617640495, + "learning_rate": 1.9646116866580355e-07, + "loss": 0.0011, + "step": 47156 + }, + { + "epoch": 0.94316, + "grad_norm": 1.2491776943206787, + "learning_rate": 1.961858558697205e-07, + "loss": 0.0171, + "step": 47158 + }, + { + "epoch": 0.9432, + "grad_norm": 1.5762735605239868, + "learning_rate": 1.9591073420404338e-07, + "loss": 0.0171, + "step": 47160 + }, + { + "epoch": 0.94324, + "grad_norm": 0.03235949948430061, + "learning_rate": 1.9563580367413348e-07, + "loss": 0.0011, + "step": 47162 + }, + { + "epoch": 0.94328, + "grad_norm": 6.925677735125646e-05, + "learning_rate": 1.9536106428535097e-07, + "loss": 0.0021, + "step": 47164 + }, + { + "epoch": 0.94332, + "grad_norm": 0.3884568214416504, + "learning_rate": 1.950865160430515e-07, + "loss": 0.0039, + "step": 47166 + }, + { + "epoch": 0.94336, + "grad_norm": 0.08089280128479004, + "learning_rate": 1.9481215895258866e-07, + "loss": 0.011, + "step": 47168 + }, + { + "epoch": 0.9434, + "grad_norm": 0.06916222721338272, + "learning_rate": 1.9453799301931253e-07, + "loss": 0.0056, + "step": 47170 + }, + { + "epoch": 0.94344, + "grad_norm": 0.008253815583884716, + "learning_rate": 1.9426401824856445e-07, + "loss": 0.0009, + "step": 47172 + }, + { + "epoch": 0.94348, + "grad_norm": 0.002670716494321823, + "learning_rate": 1.939902346456879e-07, + "loss": 0.0004, + "step": 47174 + }, + { + "epoch": 0.94352, + "grad_norm": 0.2226211279630661, + "learning_rate": 1.937166422160208e-07, + "loss": 0.0025, + "step": 47176 + }, + { + "epoch": 0.94356, + "grad_norm": 1.501672625541687, + "learning_rate": 1.9344324096489675e-07, + "loss": 0.5597, + "step": 47178 + }, + { + "epoch": 0.9436, + "grad_norm": 0.0802399218082428, + "learning_rate": 1.9317003089764365e-07, + "loss": 0.0033, + "step": 47180 + }, + { + "epoch": 0.94364, + "grad_norm": 0.4294843077659607, + "learning_rate": 1.9289701201959166e-07, + "loss": 0.0035, + "step": 47182 + }, + { + "epoch": 0.94368, + "grad_norm": 0.019899826496839523, + "learning_rate": 1.92624184336061e-07, + "loss": 0.0005, + "step": 47184 + }, + { + "epoch": 0.94372, + "grad_norm": 0.2165437936782837, + "learning_rate": 1.9235154785237076e-07, + "loss": 0.0027, + "step": 47186 + }, + { + "epoch": 0.94376, + "grad_norm": 0.20066285133361816, + "learning_rate": 1.9207910257383667e-07, + "loss": 0.0028, + "step": 47188 + }, + { + "epoch": 0.9438, + "grad_norm": 0.02259049378335476, + "learning_rate": 1.9180684850576893e-07, + "loss": 0.0004, + "step": 47190 + }, + { + "epoch": 0.94384, + "grad_norm": 0.057561151683330536, + "learning_rate": 1.9153478565347773e-07, + "loss": 0.0017, + "step": 47192 + }, + { + "epoch": 0.94388, + "grad_norm": 0.06801827251911163, + "learning_rate": 1.9126291402226438e-07, + "loss": 0.0032, + "step": 47194 + }, + { + "epoch": 0.94392, + "grad_norm": 0.007185365539044142, + "learning_rate": 1.9099123361743132e-07, + "loss": 0.0004, + "step": 47196 + }, + { + "epoch": 0.94396, + "grad_norm": 0.8512943387031555, + "learning_rate": 1.907197444442732e-07, + "loss": 0.0061, + "step": 47198 + }, + { + "epoch": 0.944, + "grad_norm": 0.009970106184482574, + "learning_rate": 1.9044844650808468e-07, + "loss": 0.0012, + "step": 47200 + }, + { + "epoch": 0.94404, + "grad_norm": 0.0040230052545666695, + "learning_rate": 1.9017733981415376e-07, + "loss": 0.0001, + "step": 47202 + }, + { + "epoch": 0.94408, + "grad_norm": 0.3028808534145355, + "learning_rate": 1.8990642436776618e-07, + "loss": 0.0043, + "step": 47204 + }, + { + "epoch": 0.94412, + "grad_norm": 0.5037416815757751, + "learning_rate": 1.896357001742033e-07, + "loss": 0.0035, + "step": 47206 + }, + { + "epoch": 0.94416, + "grad_norm": 0.06738677620887756, + "learning_rate": 1.893651672387431e-07, + "loss": 0.0968, + "step": 47208 + }, + { + "epoch": 0.9442, + "grad_norm": 0.10459358245134354, + "learning_rate": 1.8909482556666026e-07, + "loss": 0.0014, + "step": 47210 + }, + { + "epoch": 0.94424, + "grad_norm": 0.024509793147444725, + "learning_rate": 1.8882467516322278e-07, + "loss": 0.0003, + "step": 47212 + }, + { + "epoch": 0.94428, + "grad_norm": 0.21303795278072357, + "learning_rate": 1.8855471603370202e-07, + "loss": 0.002, + "step": 47214 + }, + { + "epoch": 0.94432, + "grad_norm": 0.06424427777528763, + "learning_rate": 1.8828494818335708e-07, + "loss": 0.0006, + "step": 47216 + }, + { + "epoch": 0.94436, + "grad_norm": 0.5596404075622559, + "learning_rate": 1.8801537161744822e-07, + "loss": 0.0052, + "step": 47218 + }, + { + "epoch": 0.9444, + "grad_norm": 0.03170083835721016, + "learning_rate": 1.877459863412323e-07, + "loss": 0.0027, + "step": 47220 + }, + { + "epoch": 0.94444, + "grad_norm": 0.1874547153711319, + "learning_rate": 1.8747679235995853e-07, + "loss": 0.0029, + "step": 47222 + }, + { + "epoch": 0.94448, + "grad_norm": 0.08224158734083176, + "learning_rate": 1.872077896788782e-07, + "loss": 0.0018, + "step": 47224 + }, + { + "epoch": 0.94452, + "grad_norm": 0.060370221734046936, + "learning_rate": 1.8693897830323272e-07, + "loss": 0.0008, + "step": 47226 + }, + { + "epoch": 0.94456, + "grad_norm": 0.0003176628379151225, + "learning_rate": 1.8667035823826562e-07, + "loss": 0.0007, + "step": 47228 + }, + { + "epoch": 0.9446, + "grad_norm": 0.05056934803724289, + "learning_rate": 1.8640192948921053e-07, + "loss": 0.0006, + "step": 47230 + }, + { + "epoch": 0.94464, + "grad_norm": 0.026570850983262062, + "learning_rate": 1.8613369206130217e-07, + "loss": 0.0007, + "step": 47232 + }, + { + "epoch": 0.94468, + "grad_norm": 0.1297617405653, + "learning_rate": 1.8586564595977074e-07, + "loss": 0.0179, + "step": 47234 + }, + { + "epoch": 0.94472, + "grad_norm": 18.607025146484375, + "learning_rate": 1.85597791189841e-07, + "loss": 0.7563, + "step": 47236 + }, + { + "epoch": 0.94476, + "grad_norm": 0.0007856097654439509, + "learning_rate": 1.8533012775673541e-07, + "loss": 0.0015, + "step": 47238 + }, + { + "epoch": 0.9448, + "grad_norm": 0.027439825236797333, + "learning_rate": 1.8506265566567095e-07, + "loss": 0.0006, + "step": 47240 + }, + { + "epoch": 0.94484, + "grad_norm": 0.1806919425725937, + "learning_rate": 1.847953749218645e-07, + "loss": 0.002, + "step": 47242 + }, + { + "epoch": 0.94488, + "grad_norm": 0.24144704639911652, + "learning_rate": 1.8452828553052416e-07, + "loss": 0.0159, + "step": 47244 + }, + { + "epoch": 0.94492, + "grad_norm": 0.18935135006904602, + "learning_rate": 1.8426138749685906e-07, + "loss": 0.0021, + "step": 47246 + }, + { + "epoch": 0.94496, + "grad_norm": 0.10690954327583313, + "learning_rate": 1.8399468082607063e-07, + "loss": 0.0018, + "step": 47248 + }, + { + "epoch": 0.945, + "grad_norm": 0.09643878042697906, + "learning_rate": 1.8372816552336025e-07, + "loss": 0.0009, + "step": 47250 + }, + { + "epoch": 0.94504, + "grad_norm": 0.11700485646724701, + "learning_rate": 1.8346184159392376e-07, + "loss": 0.0011, + "step": 47252 + }, + { + "epoch": 0.94508, + "grad_norm": 0.35971397161483765, + "learning_rate": 1.8319570904295148e-07, + "loss": 0.0041, + "step": 47254 + }, + { + "epoch": 0.94512, + "grad_norm": 0.01977221854031086, + "learning_rate": 1.8292976787563366e-07, + "loss": 0.0003, + "step": 47256 + }, + { + "epoch": 0.94516, + "grad_norm": 0.10402785986661911, + "learning_rate": 1.8266401809715284e-07, + "loss": 0.0013, + "step": 47258 + }, + { + "epoch": 0.9452, + "grad_norm": 2.0265045166015625, + "learning_rate": 1.8239845971269266e-07, + "loss": 0.0238, + "step": 47260 + }, + { + "epoch": 0.94524, + "grad_norm": 0.14688381552696228, + "learning_rate": 1.8213309272742895e-07, + "loss": 0.0031, + "step": 47262 + }, + { + "epoch": 0.94528, + "grad_norm": 0.16933728754520416, + "learning_rate": 1.8186791714653428e-07, + "loss": 0.0023, + "step": 47264 + }, + { + "epoch": 0.94532, + "grad_norm": 0.02113826386630535, + "learning_rate": 1.8160293297518007e-07, + "loss": 0.0002, + "step": 47266 + }, + { + "epoch": 0.94536, + "grad_norm": 0.5024039149284363, + "learning_rate": 1.8133814021853102e-07, + "loss": 0.0041, + "step": 47268 + }, + { + "epoch": 0.9454, + "grad_norm": 0.3423639237880707, + "learning_rate": 1.8107353888175083e-07, + "loss": 0.0127, + "step": 47270 + }, + { + "epoch": 0.94544, + "grad_norm": 0.025139546021819115, + "learning_rate": 1.8080912896999537e-07, + "loss": 0.0006, + "step": 47272 + }, + { + "epoch": 0.94548, + "grad_norm": 0.4366994798183441, + "learning_rate": 1.805449104884227e-07, + "loss": 0.0044, + "step": 47274 + }, + { + "epoch": 0.94552, + "grad_norm": 0.5252172350883484, + "learning_rate": 1.8028088344218097e-07, + "loss": 0.0054, + "step": 47276 + }, + { + "epoch": 0.94556, + "grad_norm": 0.059428758919239044, + "learning_rate": 1.800170478364216e-07, + "loss": 0.0062, + "step": 47278 + }, + { + "epoch": 0.9456, + "grad_norm": 0.013740652240812778, + "learning_rate": 1.7975340367628269e-07, + "loss": 0.0005, + "step": 47280 + }, + { + "epoch": 0.94564, + "grad_norm": 0.10798018425703049, + "learning_rate": 1.7948995096690791e-07, + "loss": 0.0043, + "step": 47282 + }, + { + "epoch": 0.94568, + "grad_norm": 0.0801984891295433, + "learning_rate": 1.7922668971343314e-07, + "loss": 0.002, + "step": 47284 + }, + { + "epoch": 0.94572, + "grad_norm": 0.05829716473817825, + "learning_rate": 1.7896361992098986e-07, + "loss": 1.0633, + "step": 47286 + }, + { + "epoch": 0.94576, + "grad_norm": 0.01249612495303154, + "learning_rate": 1.787007415947062e-07, + "loss": 0.0002, + "step": 47288 + }, + { + "epoch": 0.9458, + "grad_norm": 0.06235992908477783, + "learning_rate": 1.7843805473970798e-07, + "loss": 0.0071, + "step": 47290 + }, + { + "epoch": 0.94584, + "grad_norm": 0.07234952598810196, + "learning_rate": 1.7817555936111787e-07, + "loss": 0.0139, + "step": 47292 + }, + { + "epoch": 0.94588, + "grad_norm": 0.04718619957566261, + "learning_rate": 1.7791325546405059e-07, + "loss": 0.0007, + "step": 47294 + }, + { + "epoch": 0.94592, + "grad_norm": 0.02862536534667015, + "learning_rate": 1.7765114305362208e-07, + "loss": 0.0005, + "step": 47296 + }, + { + "epoch": 0.94596, + "grad_norm": 0.18452225625514984, + "learning_rate": 1.773892221349405e-07, + "loss": 0.0122, + "step": 47298 + }, + { + "epoch": 0.946, + "grad_norm": 0.013295751065015793, + "learning_rate": 1.7712749271311392e-07, + "loss": 0.0646, + "step": 47300 + }, + { + "epoch": 0.94604, + "grad_norm": 0.032251857221126556, + "learning_rate": 1.7686595479324276e-07, + "loss": 0.0009, + "step": 47302 + }, + { + "epoch": 0.94608, + "grad_norm": 0.01750182919204235, + "learning_rate": 1.7660460838042848e-07, + "loss": 0.0029, + "step": 47304 + }, + { + "epoch": 0.94612, + "grad_norm": 0.01376225333660841, + "learning_rate": 1.7634345347976368e-07, + "loss": 0.001, + "step": 47306 + }, + { + "epoch": 0.94616, + "grad_norm": 0.02515323832631111, + "learning_rate": 1.7608249009634092e-07, + "loss": 0.0014, + "step": 47308 + }, + { + "epoch": 0.9462, + "grad_norm": 0.014190065674483776, + "learning_rate": 1.758217182352495e-07, + "loss": 0.0003, + "step": 47310 + }, + { + "epoch": 0.94624, + "grad_norm": 0.258876234292984, + "learning_rate": 1.755611379015687e-07, + "loss": 0.0026, + "step": 47312 + }, + { + "epoch": 0.94628, + "grad_norm": 0.09119043499231339, + "learning_rate": 1.753007491003833e-07, + "loss": 0.0025, + "step": 47314 + }, + { + "epoch": 0.94632, + "grad_norm": 0.2274535894393921, + "learning_rate": 1.7504055183676704e-07, + "loss": 0.0022, + "step": 47316 + }, + { + "epoch": 0.94636, + "grad_norm": 0.14517317712306976, + "learning_rate": 1.7478054611579364e-07, + "loss": 0.0019, + "step": 47318 + }, + { + "epoch": 0.9464, + "grad_norm": 0.015991175547242165, + "learning_rate": 1.7452073194253237e-07, + "loss": 0.0018, + "step": 47320 + }, + { + "epoch": 0.94644, + "grad_norm": 0.00010706041939556599, + "learning_rate": 1.7426110932204698e-07, + "loss": 0.0008, + "step": 47322 + }, + { + "epoch": 0.94648, + "grad_norm": 0.14022912085056305, + "learning_rate": 1.7400167825940116e-07, + "loss": 0.0015, + "step": 47324 + }, + { + "epoch": 0.94652, + "grad_norm": 13.350934028625488, + "learning_rate": 1.737424387596509e-07, + "loss": 0.2669, + "step": 47326 + }, + { + "epoch": 0.94656, + "grad_norm": 0.15578292310237885, + "learning_rate": 1.7348339082785104e-07, + "loss": 0.0024, + "step": 47328 + }, + { + "epoch": 0.9466, + "grad_norm": 0.08320269733667374, + "learning_rate": 1.7322453446905084e-07, + "loss": 0.0016, + "step": 47330 + }, + { + "epoch": 0.94664, + "grad_norm": 0.023320956155657768, + "learning_rate": 1.7296586968829743e-07, + "loss": 0.0009, + "step": 47332 + }, + { + "epoch": 0.94668, + "grad_norm": 0.2586158215999603, + "learning_rate": 1.727073964906334e-07, + "loss": 0.0042, + "step": 47334 + }, + { + "epoch": 0.94672, + "grad_norm": 0.4488958418369293, + "learning_rate": 1.7244911488109916e-07, + "loss": 0.0057, + "step": 47336 + }, + { + "epoch": 0.94676, + "grad_norm": 0.01979895681142807, + "learning_rate": 1.721910248647285e-07, + "loss": 0.0003, + "step": 47338 + }, + { + "epoch": 0.9468, + "grad_norm": 0.27914726734161377, + "learning_rate": 1.719331264465529e-07, + "loss": 0.0033, + "step": 47340 + }, + { + "epoch": 0.94684, + "grad_norm": 0.38326185941696167, + "learning_rate": 1.7167541963160284e-07, + "loss": 0.0038, + "step": 47342 + }, + { + "epoch": 0.94688, + "grad_norm": 0.1361127644777298, + "learning_rate": 1.714179044248976e-07, + "loss": 0.0015, + "step": 47344 + }, + { + "epoch": 0.94692, + "grad_norm": 0.045397017151117325, + "learning_rate": 1.7116058083146315e-07, + "loss": 0.0136, + "step": 47346 + }, + { + "epoch": 0.94696, + "grad_norm": 0.1051873043179512, + "learning_rate": 1.7090344885631104e-07, + "loss": 0.0011, + "step": 47348 + }, + { + "epoch": 0.947, + "grad_norm": 3.3246078491210938, + "learning_rate": 1.706465085044584e-07, + "loss": 0.0317, + "step": 47350 + }, + { + "epoch": 0.94704, + "grad_norm": 0.07137434929609299, + "learning_rate": 1.703897597809112e-07, + "loss": 0.0035, + "step": 47352 + }, + { + "epoch": 0.94708, + "grad_norm": 0.07484451681375504, + "learning_rate": 1.7013320269067657e-07, + "loss": 0.0008, + "step": 47354 + }, + { + "epoch": 0.94712, + "grad_norm": 0.04626674950122833, + "learning_rate": 1.6987683723875603e-07, + "loss": 0.0012, + "step": 47356 + }, + { + "epoch": 0.94716, + "grad_norm": 0.3994136154651642, + "learning_rate": 1.6962066343014782e-07, + "loss": 0.0054, + "step": 47358 + }, + { + "epoch": 0.9472, + "grad_norm": 0.007898240350186825, + "learning_rate": 1.6936468126984573e-07, + "loss": 0.0002, + "step": 47360 + }, + { + "epoch": 0.94724, + "grad_norm": 0.04872648045420647, + "learning_rate": 1.691088907628402e-07, + "loss": 0.0018, + "step": 47362 + }, + { + "epoch": 0.94728, + "grad_norm": 0.0682872086763382, + "learning_rate": 1.6885329191411725e-07, + "loss": 0.0014, + "step": 47364 + }, + { + "epoch": 0.94732, + "grad_norm": 0.28360500931739807, + "learning_rate": 1.6859788472866178e-07, + "loss": 0.0397, + "step": 47366 + }, + { + "epoch": 0.94736, + "grad_norm": 0.8529021143913269, + "learning_rate": 1.6834266921145315e-07, + "loss": 0.0106, + "step": 47368 + }, + { + "epoch": 0.9474, + "grad_norm": 0.02125098928809166, + "learning_rate": 1.680876453674629e-07, + "loss": 0.0002, + "step": 47370 + }, + { + "epoch": 0.94744, + "grad_norm": 0.03325355052947998, + "learning_rate": 1.6783281320166822e-07, + "loss": 0.0003, + "step": 47372 + }, + { + "epoch": 0.94748, + "grad_norm": 4.313797473907471, + "learning_rate": 1.67578172719034e-07, + "loss": 0.0473, + "step": 47374 + }, + { + "epoch": 0.94752, + "grad_norm": 0.6306561827659607, + "learning_rate": 1.6732372392452622e-07, + "loss": 0.0073, + "step": 47376 + }, + { + "epoch": 0.94756, + "grad_norm": 0.302408367395401, + "learning_rate": 1.6706946682310431e-07, + "loss": 0.0066, + "step": 47378 + }, + { + "epoch": 0.9476, + "grad_norm": 0.02840528078377247, + "learning_rate": 1.668154014197243e-07, + "loss": 0.0215, + "step": 47380 + }, + { + "epoch": 0.94764, + "grad_norm": 0.6455199718475342, + "learning_rate": 1.665615277193411e-07, + "loss": 0.0077, + "step": 47382 + }, + { + "epoch": 0.94768, + "grad_norm": 0.23363418877124786, + "learning_rate": 1.6630784572690406e-07, + "loss": 0.003, + "step": 47384 + }, + { + "epoch": 0.94772, + "grad_norm": 0.01773117668926716, + "learning_rate": 1.6605435544735815e-07, + "loss": 0.0003, + "step": 47386 + }, + { + "epoch": 0.94776, + "grad_norm": 0.019604496657848358, + "learning_rate": 1.6580105688564495e-07, + "loss": 0.0526, + "step": 47388 + }, + { + "epoch": 0.9478, + "grad_norm": 0.05119842290878296, + "learning_rate": 1.6554795004670389e-07, + "loss": 0.0006, + "step": 47390 + }, + { + "epoch": 0.94784, + "grad_norm": 0.048610106110572815, + "learning_rate": 1.6529503493546762e-07, + "loss": 0.001, + "step": 47392 + }, + { + "epoch": 0.94788, + "grad_norm": 0.0761517658829689, + "learning_rate": 1.650423115568689e-07, + "loss": 0.0009, + "step": 47394 + }, + { + "epoch": 0.94792, + "grad_norm": 0.5294376611709595, + "learning_rate": 1.6478977991583267e-07, + "loss": 0.0057, + "step": 47396 + }, + { + "epoch": 0.94796, + "grad_norm": 0.053483814001083374, + "learning_rate": 1.645374400172839e-07, + "loss": 0.0006, + "step": 47398 + }, + { + "epoch": 0.948, + "grad_norm": 0.11196155846118927, + "learning_rate": 1.6428529186614195e-07, + "loss": 0.0161, + "step": 47400 + }, + { + "epoch": 0.94804, + "grad_norm": 0.010840656235814095, + "learning_rate": 1.6403333546731959e-07, + "loss": 0.0002, + "step": 47402 + }, + { + "epoch": 0.94808, + "grad_norm": 0.03202573210000992, + "learning_rate": 1.63781570825734e-07, + "loss": 0.0004, + "step": 47404 + }, + { + "epoch": 0.94812, + "grad_norm": 0.47060784697532654, + "learning_rate": 1.635299979462901e-07, + "loss": 0.0042, + "step": 47406 + }, + { + "epoch": 0.94816, + "grad_norm": 0.034605320543050766, + "learning_rate": 1.6327861683389179e-07, + "loss": 0.0014, + "step": 47408 + }, + { + "epoch": 0.9482, + "grad_norm": 0.04111749306321144, + "learning_rate": 1.6302742749344292e-07, + "loss": 0.0023, + "step": 47410 + }, + { + "epoch": 0.94824, + "grad_norm": 0.6350134015083313, + "learning_rate": 1.627764299298362e-07, + "loss": 0.0094, + "step": 47412 + }, + { + "epoch": 0.94828, + "grad_norm": 0.05317813530564308, + "learning_rate": 1.6252562414797e-07, + "loss": 0.0007, + "step": 47414 + }, + { + "epoch": 0.94832, + "grad_norm": 0.17114660143852234, + "learning_rate": 1.622750101527304e-07, + "loss": 0.0017, + "step": 47416 + }, + { + "epoch": 0.94836, + "grad_norm": 0.009251728653907776, + "learning_rate": 1.6202458794900455e-07, + "loss": 0.0004, + "step": 47418 + }, + { + "epoch": 0.9484, + "grad_norm": 0.33395177125930786, + "learning_rate": 1.6177435754167413e-07, + "loss": 0.0041, + "step": 47420 + }, + { + "epoch": 0.94844, + "grad_norm": 0.013645654544234276, + "learning_rate": 1.6152431893561748e-07, + "loss": 0.0016, + "step": 47422 + }, + { + "epoch": 0.94848, + "grad_norm": 0.3315942585468292, + "learning_rate": 1.6127447213570956e-07, + "loss": 0.0094, + "step": 47424 + }, + { + "epoch": 0.94852, + "grad_norm": 0.034972138702869415, + "learning_rate": 1.6102481714682204e-07, + "loss": 0.003, + "step": 47426 + }, + { + "epoch": 0.94856, + "grad_norm": 0.011837017722427845, + "learning_rate": 1.6077535397381993e-07, + "loss": 0.0057, + "step": 47428 + }, + { + "epoch": 0.9486, + "grad_norm": 0.007121220231056213, + "learning_rate": 1.605260826215682e-07, + "loss": 0.0008, + "step": 47430 + }, + { + "epoch": 0.94864, + "grad_norm": 0.5735270380973816, + "learning_rate": 1.6027700309492633e-07, + "loss": 0.0038, + "step": 47432 + }, + { + "epoch": 0.94868, + "grad_norm": 19.97915267944336, + "learning_rate": 1.6002811539874929e-07, + "loss": 0.373, + "step": 47434 + }, + { + "epoch": 0.94872, + "grad_norm": 0.12743684649467468, + "learning_rate": 1.597794195378921e-07, + "loss": 0.0016, + "step": 47436 + }, + { + "epoch": 0.94876, + "grad_norm": 0.17061008512973785, + "learning_rate": 1.5953091551719867e-07, + "loss": 0.0016, + "step": 47438 + }, + { + "epoch": 0.9488, + "grad_norm": 0.1924816370010376, + "learning_rate": 1.5928260334151847e-07, + "loss": 0.0016, + "step": 47440 + }, + { + "epoch": 0.94884, + "grad_norm": 0.2314843088388443, + "learning_rate": 1.590344830156887e-07, + "loss": 0.0031, + "step": 47442 + }, + { + "epoch": 0.94888, + "grad_norm": 0.11706498265266418, + "learning_rate": 1.5878655454454883e-07, + "loss": 0.003, + "step": 47444 + }, + { + "epoch": 0.94892, + "grad_norm": 0.1924884170293808, + "learning_rate": 1.5853881793293168e-07, + "loss": 0.0014, + "step": 47446 + }, + { + "epoch": 0.94896, + "grad_norm": 0.650091826915741, + "learning_rate": 1.5829127318566783e-07, + "loss": 0.0046, + "step": 47448 + }, + { + "epoch": 0.949, + "grad_norm": 2.242896318435669, + "learning_rate": 1.580439203075812e-07, + "loss": 0.0217, + "step": 47450 + }, + { + "epoch": 0.94904, + "grad_norm": 0.010663025081157684, + "learning_rate": 1.5779675930349568e-07, + "loss": 0.0004, + "step": 47452 + }, + { + "epoch": 0.94908, + "grad_norm": 6.000804424285889, + "learning_rate": 1.5754979017822858e-07, + "loss": 0.0587, + "step": 47454 + }, + { + "epoch": 0.94912, + "grad_norm": 4.18230676651001, + "learning_rate": 1.573030129365971e-07, + "loss": 0.0406, + "step": 47456 + }, + { + "epoch": 0.94916, + "grad_norm": 0.08656811714172363, + "learning_rate": 1.5705642758341076e-07, + "loss": 0.0019, + "step": 47458 + }, + { + "epoch": 0.9492, + "grad_norm": 11.920201301574707, + "learning_rate": 1.5681003412347573e-07, + "loss": 0.0792, + "step": 47460 + }, + { + "epoch": 0.94924, + "grad_norm": 0.09214754402637482, + "learning_rate": 1.5656383256159703e-07, + "loss": 0.0006, + "step": 47462 + }, + { + "epoch": 0.94928, + "grad_norm": 0.11280041188001633, + "learning_rate": 1.5631782290257525e-07, + "loss": 0.0011, + "step": 47464 + }, + { + "epoch": 0.94932, + "grad_norm": 0.19474361836910248, + "learning_rate": 1.5607200515120323e-07, + "loss": 0.0035, + "step": 47466 + }, + { + "epoch": 0.94936, + "grad_norm": 0.09247445315122604, + "learning_rate": 1.5582637931227718e-07, + "loss": 0.0023, + "step": 47468 + }, + { + "epoch": 0.9494, + "grad_norm": 7.473809719085693, + "learning_rate": 1.555809453905821e-07, + "loss": 0.0737, + "step": 47470 + }, + { + "epoch": 0.94944, + "grad_norm": 0.06058323010802269, + "learning_rate": 1.5533570339090752e-07, + "loss": 0.001, + "step": 47472 + }, + { + "epoch": 0.94948, + "grad_norm": 0.08108265697956085, + "learning_rate": 1.5509065331802964e-07, + "loss": 0.0019, + "step": 47474 + }, + { + "epoch": 0.94952, + "grad_norm": 0.011581103317439556, + "learning_rate": 1.5484579517672904e-07, + "loss": 0.0002, + "step": 47476 + }, + { + "epoch": 0.94956, + "grad_norm": 0.03389595076441765, + "learning_rate": 1.546011289717786e-07, + "loss": 0.0019, + "step": 47478 + }, + { + "epoch": 0.9496, + "grad_norm": 0.13205844163894653, + "learning_rate": 1.543566547079467e-07, + "loss": 0.0014, + "step": 47480 + }, + { + "epoch": 0.94964, + "grad_norm": 0.036283209919929504, + "learning_rate": 1.5411237239000065e-07, + "loss": 0.0004, + "step": 47482 + }, + { + "epoch": 0.94968, + "grad_norm": 0.015098853968083858, + "learning_rate": 1.5386828202270332e-07, + "loss": 0.0013, + "step": 47484 + }, + { + "epoch": 0.94972, + "grad_norm": 0.006312292534857988, + "learning_rate": 1.536243836108131e-07, + "loss": 0.0022, + "step": 47486 + }, + { + "epoch": 0.94976, + "grad_norm": 0.2524687349796295, + "learning_rate": 1.533806771590851e-07, + "loss": 0.0033, + "step": 47488 + }, + { + "epoch": 0.9498, + "grad_norm": 0.9690833687782288, + "learning_rate": 1.5313716267226997e-07, + "loss": 0.0092, + "step": 47490 + }, + { + "epoch": 0.94984, + "grad_norm": 2.2862305641174316, + "learning_rate": 1.52893840155115e-07, + "loss": 0.0212, + "step": 47492 + }, + { + "epoch": 0.94988, + "grad_norm": 0.16216495633125305, + "learning_rate": 1.5265070961236529e-07, + "loss": 0.0015, + "step": 47494 + }, + { + "epoch": 0.94992, + "grad_norm": 0.04648507758975029, + "learning_rate": 1.5240777104875814e-07, + "loss": 0.002, + "step": 47496 + }, + { + "epoch": 0.94996, + "grad_norm": 0.011663718149065971, + "learning_rate": 1.5216502446903202e-07, + "loss": 0.0422, + "step": 47498 + }, + { + "epoch": 0.95, + "grad_norm": 0.07961133867502213, + "learning_rate": 1.519224698779198e-07, + "loss": 0.001, + "step": 47500 + }, + { + "epoch": 0.95004, + "grad_norm": 0.00040672245086170733, + "learning_rate": 1.5168010728014771e-07, + "loss": 0.0021, + "step": 47502 + }, + { + "epoch": 0.95008, + "grad_norm": 5.5826461903052405e-05, + "learning_rate": 1.5143793668044305e-07, + "loss": 0.0001, + "step": 47504 + }, + { + "epoch": 0.95012, + "grad_norm": 0.20529115200042725, + "learning_rate": 1.511959580835265e-07, + "loss": 0.0026, + "step": 47506 + }, + { + "epoch": 0.95016, + "grad_norm": 0.8732017278671265, + "learning_rate": 1.509541714941143e-07, + "loss": 0.0088, + "step": 47508 + }, + { + "epoch": 0.9502, + "grad_norm": 0.005487771704792976, + "learning_rate": 1.5071257691692153e-07, + "loss": 0.0002, + "step": 47510 + }, + { + "epoch": 0.95024, + "grad_norm": 0.028997257351875305, + "learning_rate": 1.504711743566578e-07, + "loss": 0.0004, + "step": 47512 + }, + { + "epoch": 0.95028, + "grad_norm": 0.013694792985916138, + "learning_rate": 1.5022996381802934e-07, + "loss": 0.0005, + "step": 47514 + }, + { + "epoch": 0.95032, + "grad_norm": 0.15488757193088531, + "learning_rate": 1.4998894530573905e-07, + "loss": 0.0015, + "step": 47516 + }, + { + "epoch": 0.95036, + "grad_norm": 0.25786709785461426, + "learning_rate": 1.4974811882448536e-07, + "loss": 0.0071, + "step": 47518 + }, + { + "epoch": 0.9504, + "grad_norm": 0.010023198090493679, + "learning_rate": 1.4950748437896235e-07, + "loss": 0.0002, + "step": 47520 + }, + { + "epoch": 0.95044, + "grad_norm": 0.37195885181427, + "learning_rate": 1.4926704197386398e-07, + "loss": 0.0041, + "step": 47522 + }, + { + "epoch": 0.95048, + "grad_norm": 1.4147111177444458, + "learning_rate": 1.4902679161387435e-07, + "loss": 0.0161, + "step": 47524 + }, + { + "epoch": 0.95052, + "grad_norm": 0.029283301904797554, + "learning_rate": 1.4878673330368077e-07, + "loss": 0.0004, + "step": 47526 + }, + { + "epoch": 0.95056, + "grad_norm": 0.09072189033031464, + "learning_rate": 1.4854686704796063e-07, + "loss": 0.0031, + "step": 47528 + }, + { + "epoch": 0.9506, + "grad_norm": 0.0062412056140601635, + "learning_rate": 1.483071928513913e-07, + "loss": 0.0001, + "step": 47530 + }, + { + "epoch": 0.95064, + "grad_norm": 0.03224409744143486, + "learning_rate": 1.4806771071864568e-07, + "loss": 0.0059, + "step": 47532 + }, + { + "epoch": 0.95068, + "grad_norm": 0.00280937971547246, + "learning_rate": 1.478284206543912e-07, + "loss": 0.0002, + "step": 47534 + }, + { + "epoch": 0.95072, + "grad_norm": 0.0016535649774596095, + "learning_rate": 1.4758932266329518e-07, + "loss": 0.0031, + "step": 47536 + }, + { + "epoch": 0.95076, + "grad_norm": 0.14273861050605774, + "learning_rate": 1.4735041675001616e-07, + "loss": 0.0012, + "step": 47538 + }, + { + "epoch": 0.9508, + "grad_norm": 0.013353690505027771, + "learning_rate": 1.4711170291921485e-07, + "loss": 0.0007, + "step": 47540 + }, + { + "epoch": 0.95084, + "grad_norm": 0.007812637835741043, + "learning_rate": 1.4687318117554196e-07, + "loss": 0.0005, + "step": 47542 + }, + { + "epoch": 0.95088, + "grad_norm": 0.016192298382520676, + "learning_rate": 1.4663485152364933e-07, + "loss": 0.0004, + "step": 47544 + }, + { + "epoch": 0.95092, + "grad_norm": 0.02926718443632126, + "learning_rate": 1.4639671396818434e-07, + "loss": 0.0029, + "step": 47546 + }, + { + "epoch": 0.95096, + "grad_norm": 0.004772381391376257, + "learning_rate": 1.4615876851378775e-07, + "loss": 0.0019, + "step": 47548 + }, + { + "epoch": 0.951, + "grad_norm": 0.004160948097705841, + "learning_rate": 1.4592101516509916e-07, + "loss": 0.001, + "step": 47550 + }, + { + "epoch": 0.95104, + "grad_norm": 0.00274408096447587, + "learning_rate": 1.4568345392675375e-07, + "loss": 0.0024, + "step": 47552 + }, + { + "epoch": 0.95108, + "grad_norm": 0.021635623648762703, + "learning_rate": 1.454460848033823e-07, + "loss": 0.0005, + "step": 47554 + }, + { + "epoch": 0.95112, + "grad_norm": 0.03837146610021591, + "learning_rate": 1.4520890779961217e-07, + "loss": 0.0014, + "step": 47556 + }, + { + "epoch": 0.95116, + "grad_norm": 0.012633263133466244, + "learning_rate": 1.4497192292006968e-07, + "loss": 0.0018, + "step": 47558 + }, + { + "epoch": 0.9512, + "grad_norm": 0.1911843866109848, + "learning_rate": 1.4473513016937223e-07, + "loss": 0.0025, + "step": 47560 + }, + { + "epoch": 0.95124, + "grad_norm": 0.034709565341472626, + "learning_rate": 1.4449852955213728e-07, + "loss": 0.001, + "step": 47562 + }, + { + "epoch": 0.95128, + "grad_norm": 0.020028015598654747, + "learning_rate": 1.4426212107297778e-07, + "loss": 0.0005, + "step": 47564 + }, + { + "epoch": 0.95132, + "grad_norm": 0.23346304893493652, + "learning_rate": 1.4402590473650223e-07, + "loss": 0.0026, + "step": 47566 + }, + { + "epoch": 0.95136, + "grad_norm": 0.036882296204566956, + "learning_rate": 1.437898805473159e-07, + "loss": 0.0023, + "step": 47568 + }, + { + "epoch": 0.9514, + "grad_norm": 0.02505774050951004, + "learning_rate": 1.4355404851001953e-07, + "loss": 0.0024, + "step": 47570 + }, + { + "epoch": 0.95144, + "grad_norm": 0.05527157709002495, + "learning_rate": 1.4331840862921274e-07, + "loss": 0.0006, + "step": 47572 + }, + { + "epoch": 0.95148, + "grad_norm": 0.03731367737054825, + "learning_rate": 1.4308296090948638e-07, + "loss": 0.0004, + "step": 47574 + }, + { + "epoch": 0.95152, + "grad_norm": 0.31431183218955994, + "learning_rate": 1.4284770535543334e-07, + "loss": 0.0996, + "step": 47576 + }, + { + "epoch": 0.95156, + "grad_norm": 0.07281405478715897, + "learning_rate": 1.4261264197163893e-07, + "loss": 0.0016, + "step": 47578 + }, + { + "epoch": 0.9516, + "grad_norm": 0.1268392950296402, + "learning_rate": 1.4237777076268723e-07, + "loss": 0.0063, + "step": 47580 + }, + { + "epoch": 0.95164, + "grad_norm": 0.029913559556007385, + "learning_rate": 1.4214309173315455e-07, + "loss": 0.0095, + "step": 47582 + }, + { + "epoch": 0.95168, + "grad_norm": 0.0946434810757637, + "learning_rate": 1.4190860488761726e-07, + "loss": 0.0051, + "step": 47584 + }, + { + "epoch": 0.95172, + "grad_norm": 0.9236512780189514, + "learning_rate": 1.4167431023064838e-07, + "loss": 0.0065, + "step": 47586 + }, + { + "epoch": 0.95176, + "grad_norm": 0.04443792253732681, + "learning_rate": 1.4144020776681311e-07, + "loss": 0.0009, + "step": 47588 + }, + { + "epoch": 0.9518, + "grad_norm": 0.004967071581631899, + "learning_rate": 1.4120629750067672e-07, + "loss": 0.0104, + "step": 47590 + }, + { + "epoch": 0.95184, + "grad_norm": 0.03585207462310791, + "learning_rate": 1.4097257943679889e-07, + "loss": 0.0013, + "step": 47592 + }, + { + "epoch": 0.95188, + "grad_norm": 0.08884910494089127, + "learning_rate": 1.4073905357973704e-07, + "loss": 0.0012, + "step": 47594 + }, + { + "epoch": 0.95192, + "grad_norm": 0.012209041975438595, + "learning_rate": 1.4050571993404315e-07, + "loss": 0.0003, + "step": 47596 + }, + { + "epoch": 0.95196, + "grad_norm": 0.48757848143577576, + "learning_rate": 1.402725785042658e-07, + "loss": 0.0033, + "step": 47598 + }, + { + "epoch": 0.952, + "grad_norm": 0.0698639303445816, + "learning_rate": 1.400396292949513e-07, + "loss": 0.0016, + "step": 47600 + }, + { + "epoch": 0.95204, + "grad_norm": 0.0015222500078380108, + "learning_rate": 1.398068723106405e-07, + "loss": 0.0004, + "step": 47602 + }, + { + "epoch": 0.95208, + "grad_norm": 0.13106457889080048, + "learning_rate": 1.3957430755586977e-07, + "loss": 0.0011, + "step": 47604 + }, + { + "epoch": 0.95212, + "grad_norm": 0.0020825553219765425, + "learning_rate": 1.3934193503517546e-07, + "loss": 0.0024, + "step": 47606 + }, + { + "epoch": 0.95216, + "grad_norm": 0.13265039026737213, + "learning_rate": 1.3910975475308618e-07, + "loss": 0.0016, + "step": 47608 + }, + { + "epoch": 0.9522, + "grad_norm": 0.07884472608566284, + "learning_rate": 1.3887776671412943e-07, + "loss": 0.0011, + "step": 47610 + }, + { + "epoch": 0.95224, + "grad_norm": 0.03963545709848404, + "learning_rate": 1.3864597092282827e-07, + "loss": 0.0017, + "step": 47612 + }, + { + "epoch": 0.95228, + "grad_norm": 1.3719971179962158, + "learning_rate": 1.3841436738370017e-07, + "loss": 0.0128, + "step": 47614 + }, + { + "epoch": 0.95232, + "grad_norm": 0.10832107067108154, + "learning_rate": 1.3818295610126042e-07, + "loss": 0.0021, + "step": 47616 + }, + { + "epoch": 0.95236, + "grad_norm": 0.009406141936779022, + "learning_rate": 1.3795173708002097e-07, + "loss": 0.0012, + "step": 47618 + }, + { + "epoch": 0.9524, + "grad_norm": 0.09007204324007034, + "learning_rate": 1.377207103244904e-07, + "loss": 0.0032, + "step": 47620 + }, + { + "epoch": 0.95244, + "grad_norm": 0.0063188630156219006, + "learning_rate": 1.3748987583917295e-07, + "loss": 0.0012, + "step": 47622 + }, + { + "epoch": 0.95248, + "grad_norm": 0.034082770347595215, + "learning_rate": 1.3725923362856607e-07, + "loss": 0.0042, + "step": 47624 + }, + { + "epoch": 0.95252, + "grad_norm": 0.05009734258055687, + "learning_rate": 1.3702878369716953e-07, + "loss": 0.0008, + "step": 47626 + }, + { + "epoch": 0.95256, + "grad_norm": 1.0154556035995483, + "learning_rate": 1.367985260494742e-07, + "loss": 0.0087, + "step": 47628 + }, + { + "epoch": 0.9526, + "grad_norm": 0.13343389332294464, + "learning_rate": 1.3656846068996976e-07, + "loss": 0.0015, + "step": 47630 + }, + { + "epoch": 0.95264, + "grad_norm": 0.0029471307061612606, + "learning_rate": 1.3633858762314046e-07, + "loss": 0.0092, + "step": 47632 + }, + { + "epoch": 0.95268, + "grad_norm": 0.008984026499092579, + "learning_rate": 1.3610890685346933e-07, + "loss": 0.0015, + "step": 47634 + }, + { + "epoch": 0.95272, + "grad_norm": 0.12813584506511688, + "learning_rate": 1.3587941838543284e-07, + "loss": 0.0198, + "step": 47636 + }, + { + "epoch": 0.95276, + "grad_norm": 0.060467105358839035, + "learning_rate": 1.3565012222350626e-07, + "loss": 0.0007, + "step": 47638 + }, + { + "epoch": 0.9528, + "grad_norm": 0.012355606071650982, + "learning_rate": 1.3542101837215826e-07, + "loss": 0.0011, + "step": 47640 + }, + { + "epoch": 0.95284, + "grad_norm": 10.333474159240723, + "learning_rate": 1.3519210683585637e-07, + "loss": 0.1773, + "step": 47642 + }, + { + "epoch": 0.95288, + "grad_norm": 10.471550941467285, + "learning_rate": 1.3496338761906368e-07, + "loss": 0.0734, + "step": 47644 + }, + { + "epoch": 0.95292, + "grad_norm": 0.013890771195292473, + "learning_rate": 1.3473486072623664e-07, + "loss": 0.0007, + "step": 47646 + }, + { + "epoch": 0.95296, + "grad_norm": 0.014589031226933002, + "learning_rate": 1.3450652616183502e-07, + "loss": 0.0011, + "step": 47648 + }, + { + "epoch": 0.953, + "grad_norm": 0.1772274225950241, + "learning_rate": 1.3427838393030634e-07, + "loss": 0.0016, + "step": 47650 + }, + { + "epoch": 0.95304, + "grad_norm": 0.061156176030635834, + "learning_rate": 1.340504340360993e-07, + "loss": 0.001, + "step": 47652 + }, + { + "epoch": 0.95308, + "grad_norm": 0.08674213290214539, + "learning_rate": 1.3382267648366032e-07, + "loss": 0.0014, + "step": 47654 + }, + { + "epoch": 0.95312, + "grad_norm": 0.2862124741077423, + "learning_rate": 1.3359511127742585e-07, + "loss": 0.0019, + "step": 47656 + }, + { + "epoch": 0.95316, + "grad_norm": 0.24869488179683685, + "learning_rate": 1.333677384218346e-07, + "loss": 0.0029, + "step": 47658 + }, + { + "epoch": 0.9532, + "grad_norm": 0.0010749456705525517, + "learning_rate": 1.3314055792131964e-07, + "loss": 0.0006, + "step": 47660 + }, + { + "epoch": 0.95324, + "grad_norm": 0.11973924934864044, + "learning_rate": 1.3291356978030856e-07, + "loss": 0.0013, + "step": 47662 + }, + { + "epoch": 0.95328, + "grad_norm": 0.01601281762123108, + "learning_rate": 1.3268677400322783e-07, + "loss": 0.0002, + "step": 47664 + }, + { + "epoch": 0.95332, + "grad_norm": 0.05543931946158409, + "learning_rate": 1.3246017059449835e-07, + "loss": 0.0077, + "step": 47666 + }, + { + "epoch": 0.95336, + "grad_norm": 1.7817575931549072, + "learning_rate": 1.3223375955853878e-07, + "loss": 0.0167, + "step": 47668 + }, + { + "epoch": 0.9534, + "grad_norm": 0.020976394414901733, + "learning_rate": 1.320075408997612e-07, + "loss": 0.0016, + "step": 47670 + }, + { + "epoch": 0.95344, + "grad_norm": 0.03542126342654228, + "learning_rate": 1.317815146225776e-07, + "loss": 0.0009, + "step": 47672 + }, + { + "epoch": 0.95348, + "grad_norm": 20.1268253326416, + "learning_rate": 1.3155568073139336e-07, + "loss": 0.7525, + "step": 47674 + }, + { + "epoch": 0.95352, + "grad_norm": 0.0012958855368196964, + "learning_rate": 1.3133003923061272e-07, + "loss": 0.0004, + "step": 47676 + }, + { + "epoch": 0.95356, + "grad_norm": 0.03357825428247452, + "learning_rate": 1.3110459012463217e-07, + "loss": 0.0007, + "step": 47678 + }, + { + "epoch": 0.9536, + "grad_norm": 5.609711647033691, + "learning_rate": 1.308793334178493e-07, + "loss": 0.0528, + "step": 47680 + }, + { + "epoch": 0.95364, + "grad_norm": 0.121212899684906, + "learning_rate": 1.3065426911465506e-07, + "loss": 0.0127, + "step": 47682 + }, + { + "epoch": 0.95368, + "grad_norm": 0.016928743571043015, + "learning_rate": 1.3042939721943703e-07, + "loss": 0.0005, + "step": 47684 + }, + { + "epoch": 0.95372, + "grad_norm": 0.027954481542110443, + "learning_rate": 1.302047177365784e-07, + "loss": 0.0019, + "step": 47686 + }, + { + "epoch": 0.95376, + "grad_norm": 0.19244910776615143, + "learning_rate": 1.2998023067046005e-07, + "loss": 0.0029, + "step": 47688 + }, + { + "epoch": 0.9538, + "grad_norm": 0.49656936526298523, + "learning_rate": 1.2975593602545966e-07, + "loss": 0.0058, + "step": 47690 + }, + { + "epoch": 0.95384, + "grad_norm": 0.04983087256550789, + "learning_rate": 1.2953183380594814e-07, + "loss": 0.004, + "step": 47692 + }, + { + "epoch": 0.95388, + "grad_norm": 0.14167392253875732, + "learning_rate": 1.2930792401629532e-07, + "loss": 0.0018, + "step": 47694 + }, + { + "epoch": 0.95392, + "grad_norm": 0.45608580112457275, + "learning_rate": 1.290842066608655e-07, + "loss": 0.003, + "step": 47696 + }, + { + "epoch": 0.95396, + "grad_norm": 0.1390668749809265, + "learning_rate": 1.288606817440219e-07, + "loss": 0.0035, + "step": 47698 + }, + { + "epoch": 0.954, + "grad_norm": 0.06648256629705429, + "learning_rate": 1.2863734927012094e-07, + "loss": 0.0006, + "step": 47700 + }, + { + "epoch": 0.95404, + "grad_norm": 0.02029835619032383, + "learning_rate": 1.28414209243517e-07, + "loss": 0.0014, + "step": 47702 + }, + { + "epoch": 0.95408, + "grad_norm": 0.8291375637054443, + "learning_rate": 1.28191261668561e-07, + "loss": 0.0083, + "step": 47704 + }, + { + "epoch": 0.95412, + "grad_norm": 0.008693240582942963, + "learning_rate": 1.2796850654959726e-07, + "loss": 0.0019, + "step": 47706 + }, + { + "epoch": 0.95416, + "grad_norm": 1.4024646282196045, + "learning_rate": 1.2774594389097116e-07, + "loss": 0.0106, + "step": 47708 + }, + { + "epoch": 0.9542, + "grad_norm": 0.01217613648623228, + "learning_rate": 1.275235736970193e-07, + "loss": 0.0051, + "step": 47710 + }, + { + "epoch": 0.95424, + "grad_norm": 0.039487238973379135, + "learning_rate": 1.2730139597207924e-07, + "loss": 0.0034, + "step": 47712 + }, + { + "epoch": 0.95428, + "grad_norm": 9.419697761535645, + "learning_rate": 1.270794107204809e-07, + "loss": 0.116, + "step": 47714 + }, + { + "epoch": 0.95432, + "grad_norm": 0.13129736483097076, + "learning_rate": 1.2685761794655193e-07, + "loss": 0.0021, + "step": 47716 + }, + { + "epoch": 0.95436, + "grad_norm": 0.3721875250339508, + "learning_rate": 1.2663601765461775e-07, + "loss": 0.004, + "step": 47718 + }, + { + "epoch": 0.9544, + "grad_norm": 0.02698362246155739, + "learning_rate": 1.26414609848996e-07, + "loss": 0.1266, + "step": 47720 + }, + { + "epoch": 0.95444, + "grad_norm": 0.0821719840168953, + "learning_rate": 1.261933945340066e-07, + "loss": 0.0027, + "step": 47722 + }, + { + "epoch": 0.95448, + "grad_norm": 0.027408864349126816, + "learning_rate": 1.2597237171395937e-07, + "loss": 0.0023, + "step": 47724 + }, + { + "epoch": 0.95452, + "grad_norm": 0.19037604331970215, + "learning_rate": 1.2575154139316427e-07, + "loss": 0.0024, + "step": 47726 + }, + { + "epoch": 0.95456, + "grad_norm": 0.030065927654504776, + "learning_rate": 1.2553090357592667e-07, + "loss": 0.0005, + "step": 47728 + }, + { + "epoch": 0.9546, + "grad_norm": 0.3487330675125122, + "learning_rate": 1.2531045826654652e-07, + "loss": 0.0031, + "step": 47730 + }, + { + "epoch": 0.95464, + "grad_norm": 0.036020200699567795, + "learning_rate": 1.2509020546932372e-07, + "loss": 0.0004, + "step": 47732 + }, + { + "epoch": 0.95468, + "grad_norm": 11.162447929382324, + "learning_rate": 1.2487014518855255e-07, + "loss": 0.1057, + "step": 47734 + }, + { + "epoch": 0.95472, + "grad_norm": 1.0512439012527466, + "learning_rate": 1.2465027742852076e-07, + "loss": 0.0131, + "step": 47736 + }, + { + "epoch": 0.95476, + "grad_norm": 0.07034733146429062, + "learning_rate": 1.24430602193516e-07, + "loss": 0.0016, + "step": 47738 + }, + { + "epoch": 0.9548, + "grad_norm": 2.9260776042938232, + "learning_rate": 1.242111194878215e-07, + "loss": 0.0604, + "step": 47740 + }, + { + "epoch": 0.95484, + "grad_norm": 0.036305058747529984, + "learning_rate": 1.2399182931571497e-07, + "loss": 0.0004, + "step": 47742 + }, + { + "epoch": 0.95488, + "grad_norm": 0.07929883897304535, + "learning_rate": 1.2377273168147296e-07, + "loss": 0.0034, + "step": 47744 + }, + { + "epoch": 0.95492, + "grad_norm": 3.696978807449341, + "learning_rate": 1.2355382658936433e-07, + "loss": 0.0447, + "step": 47746 + }, + { + "epoch": 0.95496, + "grad_norm": 0.008886225521564484, + "learning_rate": 1.2333511404366116e-07, + "loss": 0.0008, + "step": 47748 + }, + { + "epoch": 0.955, + "grad_norm": 0.1564522534608841, + "learning_rate": 1.231165940486234e-07, + "loss": 0.002, + "step": 47750 + }, + { + "epoch": 0.95504, + "grad_norm": 0.10038420557975769, + "learning_rate": 1.2289826660851213e-07, + "loss": 0.0247, + "step": 47752 + }, + { + "epoch": 0.95508, + "grad_norm": 0.049488142132759094, + "learning_rate": 1.2268013172758498e-07, + "loss": 0.0049, + "step": 47754 + }, + { + "epoch": 0.95512, + "grad_norm": 0.1267862319946289, + "learning_rate": 1.22462189410093e-07, + "loss": 0.0015, + "step": 47756 + }, + { + "epoch": 0.95516, + "grad_norm": 1.8252888917922974, + "learning_rate": 1.2224443966028733e-07, + "loss": 0.0418, + "step": 47758 + }, + { + "epoch": 0.9552, + "grad_norm": 0.1097579374909401, + "learning_rate": 1.2202688248241113e-07, + "loss": 0.0017, + "step": 47760 + }, + { + "epoch": 0.95524, + "grad_norm": 0.25241973996162415, + "learning_rate": 1.218095178807066e-07, + "loss": 0.0037, + "step": 47762 + }, + { + "epoch": 0.95528, + "grad_norm": 0.5723252296447754, + "learning_rate": 1.2159234585941038e-07, + "loss": 0.006, + "step": 47764 + }, + { + "epoch": 0.95532, + "grad_norm": 0.02954300306737423, + "learning_rate": 1.2137536642275795e-07, + "loss": 0.0119, + "step": 47766 + }, + { + "epoch": 0.95536, + "grad_norm": 0.03970440477132797, + "learning_rate": 1.2115857957497813e-07, + "loss": 0.0004, + "step": 47768 + }, + { + "epoch": 0.9554, + "grad_norm": 0.03394264727830887, + "learning_rate": 1.2094198532029754e-07, + "loss": 0.0009, + "step": 47770 + }, + { + "epoch": 0.95544, + "grad_norm": 0.22003817558288574, + "learning_rate": 1.207255836629395e-07, + "loss": 0.0022, + "step": 47772 + }, + { + "epoch": 0.95548, + "grad_norm": 7.496278285980225, + "learning_rate": 1.2050937460712287e-07, + "loss": 0.1064, + "step": 47774 + }, + { + "epoch": 0.95552, + "grad_norm": 0.00568317249417305, + "learning_rate": 1.20293358157062e-07, + "loss": 0.0001, + "step": 47776 + }, + { + "epoch": 0.95556, + "grad_norm": 0.07514892518520355, + "learning_rate": 1.200775343169669e-07, + "loss": 0.0035, + "step": 47778 + }, + { + "epoch": 0.9556, + "grad_norm": 0.024093423038721085, + "learning_rate": 1.1986190309104861e-07, + "loss": 0.001, + "step": 47780 + }, + { + "epoch": 0.95564, + "grad_norm": 1.228920578956604, + "learning_rate": 1.1964646448350936e-07, + "loss": 0.0118, + "step": 47782 + }, + { + "epoch": 0.95568, + "grad_norm": 0.3598453104496002, + "learning_rate": 1.19431218498548e-07, + "loss": 0.0071, + "step": 47784 + }, + { + "epoch": 0.95572, + "grad_norm": 0.8179587721824646, + "learning_rate": 1.1921616514036227e-07, + "loss": 0.0091, + "step": 47786 + }, + { + "epoch": 0.95576, + "grad_norm": 0.11895644664764404, + "learning_rate": 1.1900130441314328e-07, + "loss": 0.0013, + "step": 47788 + }, + { + "epoch": 0.9558, + "grad_norm": 0.02909894473850727, + "learning_rate": 1.1878663632108322e-07, + "loss": 0.0009, + "step": 47790 + }, + { + "epoch": 0.95584, + "grad_norm": 0.006887041963636875, + "learning_rate": 1.185721608683632e-07, + "loss": 0.0011, + "step": 47792 + }, + { + "epoch": 0.95588, + "grad_norm": 0.004338342696428299, + "learning_rate": 1.1835787805916654e-07, + "loss": 0.0002, + "step": 47794 + }, + { + "epoch": 0.95592, + "grad_norm": 0.07337003201246262, + "learning_rate": 1.1814378789767101e-07, + "loss": 0.0014, + "step": 47796 + }, + { + "epoch": 0.95596, + "grad_norm": 0.05541984364390373, + "learning_rate": 1.1792989038804992e-07, + "loss": 0.0005, + "step": 47798 + }, + { + "epoch": 0.956, + "grad_norm": 3.369847536087036, + "learning_rate": 1.1771618553447217e-07, + "loss": 0.0478, + "step": 47800 + }, + { + "epoch": 0.95604, + "grad_norm": 0.011047654785215855, + "learning_rate": 1.1750267334110555e-07, + "loss": 0.002, + "step": 47802 + }, + { + "epoch": 0.95608, + "grad_norm": 0.3579522967338562, + "learning_rate": 1.1728935381211226e-07, + "loss": 0.0045, + "step": 47804 + }, + { + "epoch": 0.95612, + "grad_norm": 0.05775494500994682, + "learning_rate": 1.1707622695165121e-07, + "loss": 0.0017, + "step": 47806 + }, + { + "epoch": 0.95616, + "grad_norm": 0.028005918487906456, + "learning_rate": 1.1686329276387798e-07, + "loss": 0.0051, + "step": 47808 + }, + { + "epoch": 0.9562, + "grad_norm": 0.07969773560762405, + "learning_rate": 1.1665055125294033e-07, + "loss": 0.0008, + "step": 47810 + }, + { + "epoch": 0.95624, + "grad_norm": 0.06647253781557083, + "learning_rate": 1.1643800242299164e-07, + "loss": 0.0009, + "step": 47812 + }, + { + "epoch": 0.95628, + "grad_norm": 0.14611977338790894, + "learning_rate": 1.1622564627817079e-07, + "loss": 0.002, + "step": 47814 + }, + { + "epoch": 0.95632, + "grad_norm": 0.26871705055236816, + "learning_rate": 1.1601348282261893e-07, + "loss": 0.0027, + "step": 47816 + }, + { + "epoch": 0.95636, + "grad_norm": 0.01296564657241106, + "learning_rate": 1.1580151206047385e-07, + "loss": 0.0057, + "step": 47818 + }, + { + "epoch": 0.9564, + "grad_norm": 0.19015581905841827, + "learning_rate": 1.1558973399586671e-07, + "loss": 0.0032, + "step": 47820 + }, + { + "epoch": 0.95644, + "grad_norm": 0.034347280859947205, + "learning_rate": 1.153781486329264e-07, + "loss": 0.0038, + "step": 47822 + }, + { + "epoch": 0.95648, + "grad_norm": 0.399747371673584, + "learning_rate": 1.1516675597577964e-07, + "loss": 0.0042, + "step": 47824 + }, + { + "epoch": 0.95652, + "grad_norm": 0.14344224333763123, + "learning_rate": 1.1495555602854425e-07, + "loss": 0.0025, + "step": 47826 + }, + { + "epoch": 0.95656, + "grad_norm": 0.006634004879742861, + "learning_rate": 1.1474454879533914e-07, + "loss": 0.0001, + "step": 47828 + }, + { + "epoch": 0.9566, + "grad_norm": 0.3003668785095215, + "learning_rate": 1.1453373428027992e-07, + "loss": 0.0112, + "step": 47830 + }, + { + "epoch": 0.95664, + "grad_norm": 0.010908480733633041, + "learning_rate": 1.143231124874733e-07, + "loss": 0.001, + "step": 47832 + }, + { + "epoch": 0.95668, + "grad_norm": 0.01104507502168417, + "learning_rate": 1.1411268342102822e-07, + "loss": 0.0003, + "step": 47834 + }, + { + "epoch": 0.95672, + "grad_norm": 0.025691324844956398, + "learning_rate": 1.1390244708504583e-07, + "loss": 0.0005, + "step": 47836 + }, + { + "epoch": 0.95676, + "grad_norm": 0.019461283460259438, + "learning_rate": 1.136924034836251e-07, + "loss": 0.0036, + "step": 47838 + }, + { + "epoch": 0.9568, + "grad_norm": 0.005068236496299505, + "learning_rate": 1.134825526208605e-07, + "loss": 0.0057, + "step": 47840 + }, + { + "epoch": 0.95684, + "grad_norm": 0.00142060371581465, + "learning_rate": 1.1327289450084322e-07, + "loss": 0.003, + "step": 47842 + }, + { + "epoch": 0.95688, + "grad_norm": 0.08173862099647522, + "learning_rate": 1.130634291276611e-07, + "loss": 0.0088, + "step": 47844 + }, + { + "epoch": 0.95692, + "grad_norm": 0.10900827497243881, + "learning_rate": 1.1285415650539644e-07, + "loss": 0.0015, + "step": 47846 + }, + { + "epoch": 0.95696, + "grad_norm": 0.043924447149038315, + "learning_rate": 1.1264507663813262e-07, + "loss": 0.0009, + "step": 47848 + }, + { + "epoch": 0.957, + "grad_norm": 0.05400928109884262, + "learning_rate": 1.1243618952994195e-07, + "loss": 0.0011, + "step": 47850 + }, + { + "epoch": 0.95704, + "grad_norm": 0.6402652263641357, + "learning_rate": 1.1222749518489784e-07, + "loss": 0.1307, + "step": 47852 + }, + { + "epoch": 0.95708, + "grad_norm": 0.005053660366684198, + "learning_rate": 1.1201899360706925e-07, + "loss": 0.0003, + "step": 47854 + }, + { + "epoch": 0.95712, + "grad_norm": 2.4764764308929443, + "learning_rate": 1.1181068480052292e-07, + "loss": 0.0213, + "step": 47856 + }, + { + "epoch": 0.95716, + "grad_norm": 0.0076633840799331665, + "learning_rate": 1.1160256876931674e-07, + "loss": 0.0001, + "step": 47858 + }, + { + "epoch": 0.9572, + "grad_norm": 2.8436293601989746, + "learning_rate": 1.1139464551750857e-07, + "loss": 0.0301, + "step": 47860 + }, + { + "epoch": 0.95724, + "grad_norm": 0.0140982149168849, + "learning_rate": 1.1118691504915402e-07, + "loss": 0.0014, + "step": 47862 + }, + { + "epoch": 0.95728, + "grad_norm": 0.0009521651081740856, + "learning_rate": 1.10979377368301e-07, + "loss": 0.0027, + "step": 47864 + }, + { + "epoch": 0.95732, + "grad_norm": 0.01270050648599863, + "learning_rate": 1.1077203247899737e-07, + "loss": 0.0001, + "step": 47866 + }, + { + "epoch": 0.95736, + "grad_norm": 0.08537761121988297, + "learning_rate": 1.1056488038528212e-07, + "loss": 0.0006, + "step": 47868 + }, + { + "epoch": 0.9574, + "grad_norm": 0.277962327003479, + "learning_rate": 1.1035792109119758e-07, + "loss": 0.0032, + "step": 47870 + }, + { + "epoch": 0.95744, + "grad_norm": 0.18858391046524048, + "learning_rate": 1.1015115460077718e-07, + "loss": 0.0014, + "step": 47872 + }, + { + "epoch": 0.95748, + "grad_norm": 0.06343396008014679, + "learning_rate": 1.0994458091805104e-07, + "loss": 0.0007, + "step": 47874 + }, + { + "epoch": 0.95752, + "grad_norm": 0.00395262660458684, + "learning_rate": 1.0973820004704705e-07, + "loss": 0.0005, + "step": 47876 + }, + { + "epoch": 0.95756, + "grad_norm": 0.031079431995749474, + "learning_rate": 1.0953201199178753e-07, + "loss": 0.0014, + "step": 47878 + }, + { + "epoch": 0.9576, + "grad_norm": 0.06447862833738327, + "learning_rate": 1.0932601675629595e-07, + "loss": 0.0017, + "step": 47880 + }, + { + "epoch": 0.95764, + "grad_norm": 0.22151164710521698, + "learning_rate": 1.0912021434458353e-07, + "loss": 0.0032, + "step": 47882 + }, + { + "epoch": 0.95768, + "grad_norm": 0.17254649102687836, + "learning_rate": 1.0891460476066596e-07, + "loss": 0.003, + "step": 47884 + }, + { + "epoch": 0.95772, + "grad_norm": 0.008925323374569416, + "learning_rate": 1.0870918800855002e-07, + "loss": 0.0072, + "step": 47886 + }, + { + "epoch": 0.95776, + "grad_norm": 0.05700865387916565, + "learning_rate": 1.085039640922414e-07, + "loss": 0.0026, + "step": 47888 + }, + { + "epoch": 0.9578, + "grad_norm": 0.45987173914909363, + "learning_rate": 1.0829893301573913e-07, + "loss": 0.0035, + "step": 47890 + }, + { + "epoch": 0.95784, + "grad_norm": 0.3283904194831848, + "learning_rate": 1.0809409478304223e-07, + "loss": 0.0032, + "step": 47892 + }, + { + "epoch": 0.95788, + "grad_norm": 0.5136334896087646, + "learning_rate": 1.0788944939814417e-07, + "loss": 0.0088, + "step": 47894 + }, + { + "epoch": 0.95792, + "grad_norm": 1.171811580657959, + "learning_rate": 1.0768499686503397e-07, + "loss": 0.0106, + "step": 47896 + }, + { + "epoch": 0.95796, + "grad_norm": 0.008914442732930183, + "learning_rate": 1.0748073718769736e-07, + "loss": 0.0028, + "step": 47898 + }, + { + "epoch": 0.958, + "grad_norm": 0.10284782201051712, + "learning_rate": 1.0727667037011668e-07, + "loss": 0.0014, + "step": 47900 + }, + { + "epoch": 0.95804, + "grad_norm": 0.078354611992836, + "learning_rate": 1.0707279641627099e-07, + "loss": 0.0013, + "step": 47902 + }, + { + "epoch": 0.95808, + "grad_norm": 0.369334876537323, + "learning_rate": 1.0686911533013377e-07, + "loss": 0.005, + "step": 47904 + }, + { + "epoch": 0.95812, + "grad_norm": 0.6451904773712158, + "learning_rate": 1.066656271156774e-07, + "loss": 0.0056, + "step": 47906 + }, + { + "epoch": 0.95816, + "grad_norm": 0.27156245708465576, + "learning_rate": 1.0646233177686649e-07, + "loss": 0.0025, + "step": 47908 + }, + { + "epoch": 0.9582, + "grad_norm": 0.021569758653640747, + "learning_rate": 1.0625922931766786e-07, + "loss": 0.0023, + "step": 47910 + }, + { + "epoch": 0.95824, + "grad_norm": 0.028434591367840767, + "learning_rate": 1.0605631974203834e-07, + "loss": 0.0085, + "step": 47912 + }, + { + "epoch": 0.95828, + "grad_norm": 0.07877251505851746, + "learning_rate": 1.0585360305393478e-07, + "loss": 0.0013, + "step": 47914 + }, + { + "epoch": 0.95832, + "grad_norm": 0.025532353669404984, + "learning_rate": 1.0565107925730845e-07, + "loss": 0.0047, + "step": 47916 + }, + { + "epoch": 0.95836, + "grad_norm": 0.007163981907069683, + "learning_rate": 1.054487483561084e-07, + "loss": 0.0007, + "step": 47918 + }, + { + "epoch": 0.9584, + "grad_norm": 0.049060553312301636, + "learning_rate": 1.052466103542793e-07, + "loss": 0.0012, + "step": 47920 + }, + { + "epoch": 0.95844, + "grad_norm": 0.4202289283275604, + "learning_rate": 1.050446652557613e-07, + "loss": 0.0046, + "step": 47922 + }, + { + "epoch": 0.95848, + "grad_norm": 0.2311088889837265, + "learning_rate": 1.0484291306449346e-07, + "loss": 0.0029, + "step": 47924 + }, + { + "epoch": 0.95852, + "grad_norm": 0.07226838171482086, + "learning_rate": 1.0464135378440599e-07, + "loss": 0.0021, + "step": 47926 + }, + { + "epoch": 0.95856, + "grad_norm": 0.0017817517509683967, + "learning_rate": 1.0443998741942907e-07, + "loss": 0.0006, + "step": 47928 + }, + { + "epoch": 0.9586, + "grad_norm": 0.0465933196246624, + "learning_rate": 1.0423881397349067e-07, + "loss": 0.0024, + "step": 47930 + }, + { + "epoch": 0.95864, + "grad_norm": 0.05307987332344055, + "learning_rate": 1.0403783345050989e-07, + "loss": 0.0122, + "step": 47932 + }, + { + "epoch": 0.95868, + "grad_norm": 0.02254272624850273, + "learning_rate": 1.0383704585440802e-07, + "loss": 0.0012, + "step": 47934 + }, + { + "epoch": 0.95872, + "grad_norm": 0.01257575023919344, + "learning_rate": 1.0363645118909637e-07, + "loss": 0.0016, + "step": 47936 + }, + { + "epoch": 0.95876, + "grad_norm": 0.015794754028320312, + "learning_rate": 1.034360494584874e-07, + "loss": 0.0003, + "step": 47938 + }, + { + "epoch": 0.9588, + "grad_norm": 0.021988172084093094, + "learning_rate": 1.0323584066648795e-07, + "loss": 0.0023, + "step": 47940 + }, + { + "epoch": 0.95884, + "grad_norm": 0.28693437576293945, + "learning_rate": 1.0303582481700159e-07, + "loss": 0.0058, + "step": 47942 + }, + { + "epoch": 0.95888, + "grad_norm": 0.7298724055290222, + "learning_rate": 1.028360019139274e-07, + "loss": 0.0081, + "step": 47944 + }, + { + "epoch": 0.95892, + "grad_norm": 0.12045908719301224, + "learning_rate": 1.0263637196115894e-07, + "loss": 0.0013, + "step": 47946 + }, + { + "epoch": 0.95896, + "grad_norm": 0.0630020871758461, + "learning_rate": 1.0243693496259088e-07, + "loss": 0.0008, + "step": 47948 + }, + { + "epoch": 0.959, + "grad_norm": 0.009496806189417839, + "learning_rate": 1.0223769092211012e-07, + "loss": 0.0004, + "step": 47950 + }, + { + "epoch": 0.95904, + "grad_norm": 0.10319336503744125, + "learning_rate": 1.020386398436024e-07, + "loss": 0.0013, + "step": 47952 + }, + { + "epoch": 0.95908, + "grad_norm": 0.47049593925476074, + "learning_rate": 1.0183978173094578e-07, + "loss": 0.005, + "step": 47954 + }, + { + "epoch": 0.95912, + "grad_norm": 0.0354476273059845, + "learning_rate": 1.0164111658801934e-07, + "loss": 0.0034, + "step": 47956 + }, + { + "epoch": 0.95916, + "grad_norm": 16.786104202270508, + "learning_rate": 1.0144264441869444e-07, + "loss": 1.1462, + "step": 47958 + }, + { + "epoch": 0.9592, + "grad_norm": 0.059118978679180145, + "learning_rate": 1.0124436522684244e-07, + "loss": 0.0005, + "step": 47960 + }, + { + "epoch": 0.95924, + "grad_norm": 0.03724191337823868, + "learning_rate": 1.0104627901632803e-07, + "loss": 0.0007, + "step": 47962 + }, + { + "epoch": 0.95928, + "grad_norm": 0.3736352324485779, + "learning_rate": 1.0084838579101142e-07, + "loss": 0.0037, + "step": 47964 + }, + { + "epoch": 0.95932, + "grad_norm": 0.14797890186309814, + "learning_rate": 1.0065068555475287e-07, + "loss": 0.0021, + "step": 47966 + }, + { + "epoch": 0.95936, + "grad_norm": 0.015793772414326668, + "learning_rate": 1.0045317831140489e-07, + "loss": 0.0002, + "step": 47968 + }, + { + "epoch": 0.9594, + "grad_norm": 0.04605020210146904, + "learning_rate": 1.002558640648199e-07, + "loss": 0.0039, + "step": 47970 + }, + { + "epoch": 0.95944, + "grad_norm": 0.06391613930463791, + "learning_rate": 1.0005874281884265e-07, + "loss": 0.0006, + "step": 47972 + }, + { + "epoch": 0.95948, + "grad_norm": 0.025520093739032745, + "learning_rate": 9.98618145773178e-08, + "loss": 0.0023, + "step": 47974 + }, + { + "epoch": 0.95952, + "grad_norm": 0.02224813587963581, + "learning_rate": 9.966507934408343e-08, + "loss": 0.0003, + "step": 47976 + }, + { + "epoch": 0.95956, + "grad_norm": 0.027906855568289757, + "learning_rate": 9.946853712297533e-08, + "loss": 0.0007, + "step": 47978 + }, + { + "epoch": 0.9596, + "grad_norm": 0.02318762242794037, + "learning_rate": 9.9272187917826e-08, + "loss": 0.0002, + "step": 47980 + }, + { + "epoch": 0.95964, + "grad_norm": 0.9132853150367737, + "learning_rate": 9.907603173246127e-08, + "loss": 0.0073, + "step": 47982 + }, + { + "epoch": 0.95968, + "grad_norm": 0.3209895193576813, + "learning_rate": 9.888006857070698e-08, + "loss": 0.0031, + "step": 47984 + }, + { + "epoch": 0.95972, + "grad_norm": 1.5231512784957886, + "learning_rate": 9.868429843638339e-08, + "loss": 0.0154, + "step": 47986 + }, + { + "epoch": 0.95976, + "grad_norm": 0.2570863366127014, + "learning_rate": 9.848872133330745e-08, + "loss": 0.0042, + "step": 47988 + }, + { + "epoch": 0.9598, + "grad_norm": 0.5104730725288391, + "learning_rate": 9.829333726529056e-08, + "loss": 0.0052, + "step": 47990 + }, + { + "epoch": 0.95984, + "grad_norm": 0.03195055574178696, + "learning_rate": 9.809814623614411e-08, + "loss": 0.0023, + "step": 47992 + }, + { + "epoch": 0.95988, + "grad_norm": 0.008835788816213608, + "learning_rate": 9.790314824967173e-08, + "loss": 0.0002, + "step": 47994 + }, + { + "epoch": 0.95992, + "grad_norm": 0.18718047440052032, + "learning_rate": 9.770834330967483e-08, + "loss": 0.002, + "step": 47996 + }, + { + "epoch": 0.95996, + "grad_norm": 0.0075706304050982, + "learning_rate": 9.751373141995257e-08, + "loss": 0.0001, + "step": 47998 + }, + { + "epoch": 0.96, + "grad_norm": 0.025152016431093216, + "learning_rate": 9.731931258429638e-08, + "loss": 0.0017, + "step": 48000 + }, + { + "epoch": 0.96004, + "grad_norm": 0.06900133192539215, + "learning_rate": 9.712508680650102e-08, + "loss": 0.0021, + "step": 48002 + }, + { + "epoch": 0.96008, + "grad_norm": 0.06944279372692108, + "learning_rate": 9.693105409034897e-08, + "loss": 0.0005, + "step": 48004 + }, + { + "epoch": 0.96012, + "grad_norm": 0.18215644359588623, + "learning_rate": 9.673721443962391e-08, + "loss": 0.0098, + "step": 48006 + }, + { + "epoch": 0.96016, + "grad_norm": 0.14919188618659973, + "learning_rate": 9.654356785810614e-08, + "loss": 0.0013, + "step": 48008 + }, + { + "epoch": 0.9602, + "grad_norm": 0.14778894186019897, + "learning_rate": 9.635011434957153e-08, + "loss": 0.0026, + "step": 48010 + }, + { + "epoch": 0.96024, + "grad_norm": 0.005875328090041876, + "learning_rate": 9.615685391778818e-08, + "loss": 0.0002, + "step": 48012 + }, + { + "epoch": 0.96028, + "grad_norm": 0.016491426154971123, + "learning_rate": 9.596378656652638e-08, + "loss": 0.0002, + "step": 48014 + }, + { + "epoch": 0.96032, + "grad_norm": 0.15236394107341766, + "learning_rate": 9.577091229954982e-08, + "loss": 0.0038, + "step": 48016 + }, + { + "epoch": 0.96036, + "grad_norm": 2.37264347076416, + "learning_rate": 9.55782311206177e-08, + "loss": 0.0194, + "step": 48018 + }, + { + "epoch": 0.9604, + "grad_norm": 0.16286678612232208, + "learning_rate": 9.538574303348813e-08, + "loss": 0.0017, + "step": 48020 + }, + { + "epoch": 0.96044, + "grad_norm": 0.007842309772968292, + "learning_rate": 9.519344804191144e-08, + "loss": 0.0064, + "step": 48022 + }, + { + "epoch": 0.96048, + "grad_norm": 0.5598406791687012, + "learning_rate": 9.500134614963908e-08, + "loss": 0.0059, + "step": 48024 + }, + { + "epoch": 0.96052, + "grad_norm": 0.048954322934150696, + "learning_rate": 9.48094373604147e-08, + "loss": 0.0009, + "step": 48026 + }, + { + "epoch": 0.96056, + "grad_norm": 0.007398990914225578, + "learning_rate": 9.461772167797978e-08, + "loss": 0.0008, + "step": 48028 + }, + { + "epoch": 0.9606, + "grad_norm": 0.0015857063699513674, + "learning_rate": 9.442619910607131e-08, + "loss": 0.0081, + "step": 48030 + }, + { + "epoch": 0.96064, + "grad_norm": 0.0459505058825016, + "learning_rate": 9.42348696484241e-08, + "loss": 0.0031, + "step": 48032 + }, + { + "epoch": 0.96068, + "grad_norm": 0.23037832975387573, + "learning_rate": 9.404373330876849e-08, + "loss": 0.0191, + "step": 48034 + }, + { + "epoch": 0.96072, + "grad_norm": 0.3453786075115204, + "learning_rate": 9.385279009083037e-08, + "loss": 0.0039, + "step": 48036 + }, + { + "epoch": 0.96076, + "grad_norm": 1.1328771114349365, + "learning_rate": 9.366203999833123e-08, + "loss": 0.0104, + "step": 48038 + }, + { + "epoch": 0.9608, + "grad_norm": 0.016370652243494987, + "learning_rate": 9.347148303499143e-08, + "loss": 0.0012, + "step": 48040 + }, + { + "epoch": 0.96084, + "grad_norm": 0.01368305180221796, + "learning_rate": 9.328111920452465e-08, + "loss": 0.0004, + "step": 48042 + }, + { + "epoch": 0.96088, + "grad_norm": 0.13718406856060028, + "learning_rate": 9.309094851064238e-08, + "loss": 0.0726, + "step": 48044 + }, + { + "epoch": 0.96092, + "grad_norm": 0.031577467918395996, + "learning_rate": 9.290097095705386e-08, + "loss": 0.0003, + "step": 48046 + }, + { + "epoch": 0.96096, + "grad_norm": 0.04009409621357918, + "learning_rate": 9.271118654746058e-08, + "loss": 0.0015, + "step": 48048 + }, + { + "epoch": 0.961, + "grad_norm": 12.801275253295898, + "learning_rate": 9.252159528556404e-08, + "loss": 0.2416, + "step": 48050 + }, + { + "epoch": 0.96104, + "grad_norm": 0.004922761116176844, + "learning_rate": 9.233219717506015e-08, + "loss": 0.0002, + "step": 48052 + }, + { + "epoch": 0.96108, + "grad_norm": 0.1038641482591629, + "learning_rate": 9.214299221963929e-08, + "loss": 0.0045, + "step": 48054 + }, + { + "epoch": 0.96112, + "grad_norm": 0.05939005687832832, + "learning_rate": 9.195398042299298e-08, + "loss": 0.0011, + "step": 48056 + }, + { + "epoch": 0.96116, + "grad_norm": 0.0011179705616086721, + "learning_rate": 9.176516178880268e-08, + "loss": 0.0027, + "step": 48058 + }, + { + "epoch": 0.9612, + "grad_norm": 0.03727985918521881, + "learning_rate": 9.157653632075435e-08, + "loss": 0.001, + "step": 48060 + }, + { + "epoch": 0.96124, + "grad_norm": 0.13672153651714325, + "learning_rate": 9.138810402252174e-08, + "loss": 0.0018, + "step": 48062 + }, + { + "epoch": 0.96128, + "grad_norm": 0.01365495752543211, + "learning_rate": 9.119986489777855e-08, + "loss": 0.0005, + "step": 48064 + }, + { + "epoch": 0.96132, + "grad_norm": 0.22841371595859528, + "learning_rate": 9.101181895019629e-08, + "loss": 0.0031, + "step": 48066 + }, + { + "epoch": 0.96136, + "grad_norm": 0.38016316294670105, + "learning_rate": 9.082396618343981e-08, + "loss": 0.0054, + "step": 48068 + }, + { + "epoch": 0.9614, + "grad_norm": 0.03325197473168373, + "learning_rate": 9.063630660117172e-08, + "loss": 0.0004, + "step": 48070 + }, + { + "epoch": 0.96144, + "grad_norm": 0.017844699323177338, + "learning_rate": 9.044884020705025e-08, + "loss": 0.0014, + "step": 48072 + }, + { + "epoch": 0.96148, + "grad_norm": 0.0069037433713674545, + "learning_rate": 9.026156700473021e-08, + "loss": 0.0002, + "step": 48074 + }, + { + "epoch": 0.96152, + "grad_norm": 0.01620158739387989, + "learning_rate": 9.007448699786314e-08, + "loss": 0.0007, + "step": 48076 + }, + { + "epoch": 0.96156, + "grad_norm": 0.027850888669490814, + "learning_rate": 8.988760019009723e-08, + "loss": 0.0084, + "step": 48078 + }, + { + "epoch": 0.9616, + "grad_norm": 0.18464383482933044, + "learning_rate": 8.970090658507291e-08, + "loss": 0.0037, + "step": 48080 + }, + { + "epoch": 0.96164, + "grad_norm": 1.153607726097107, + "learning_rate": 8.951440618643171e-08, + "loss": 0.0191, + "step": 48082 + }, + { + "epoch": 0.96168, + "grad_norm": 0.054379746317863464, + "learning_rate": 8.932809899780959e-08, + "loss": 0.0007, + "step": 48084 + }, + { + "epoch": 0.96172, + "grad_norm": 0.008123165927827358, + "learning_rate": 8.914198502283921e-08, + "loss": 0.005, + "step": 48086 + }, + { + "epoch": 0.96176, + "grad_norm": 0.028303347527980804, + "learning_rate": 8.895606426514769e-08, + "loss": 0.0003, + "step": 48088 + }, + { + "epoch": 0.9618, + "grad_norm": 0.16210870444774628, + "learning_rate": 8.877033672835988e-08, + "loss": 0.0015, + "step": 48090 + }, + { + "epoch": 0.96184, + "grad_norm": 0.6582755446434021, + "learning_rate": 8.858480241609846e-08, + "loss": 0.0094, + "step": 48092 + }, + { + "epoch": 0.96188, + "grad_norm": 0.0861993208527565, + "learning_rate": 8.839946133197719e-08, + "loss": 0.0058, + "step": 48094 + }, + { + "epoch": 0.96192, + "grad_norm": 0.0673251822590828, + "learning_rate": 8.821431347961207e-08, + "loss": 0.0044, + "step": 48096 + }, + { + "epoch": 0.96196, + "grad_norm": 0.04886326193809509, + "learning_rate": 8.802935886261244e-08, + "loss": 0.001, + "step": 48098 + }, + { + "epoch": 0.962, + "grad_norm": 0.019978880882263184, + "learning_rate": 8.784459748458318e-08, + "loss": 0.0015, + "step": 48100 + }, + { + "epoch": 0.96204, + "grad_norm": 0.21815012395381927, + "learning_rate": 8.766002934912588e-08, + "loss": 0.004, + "step": 48102 + }, + { + "epoch": 0.96208, + "grad_norm": 0.0629066601395607, + "learning_rate": 8.747565445983985e-08, + "loss": 0.0024, + "step": 48104 + }, + { + "epoch": 0.96212, + "grad_norm": 0.024839991703629494, + "learning_rate": 8.729147282032002e-08, + "loss": 0.0018, + "step": 48106 + }, + { + "epoch": 0.96216, + "grad_norm": 0.009181762114167213, + "learning_rate": 8.710748443415573e-08, + "loss": 0.0007, + "step": 48108 + }, + { + "epoch": 0.9622, + "grad_norm": 0.24056333303451538, + "learning_rate": 8.692368930493522e-08, + "loss": 0.0031, + "step": 48110 + }, + { + "epoch": 0.96224, + "grad_norm": 0.11017266660928726, + "learning_rate": 8.674008743623897e-08, + "loss": 0.002, + "step": 48112 + }, + { + "epoch": 0.96228, + "grad_norm": 0.0044027226977050304, + "learning_rate": 8.655667883165076e-08, + "loss": 0.0001, + "step": 48114 + }, + { + "epoch": 0.96232, + "grad_norm": 0.05720845237374306, + "learning_rate": 8.637346349474218e-08, + "loss": 0.001, + "step": 48116 + }, + { + "epoch": 0.96236, + "grad_norm": 0.059314146637916565, + "learning_rate": 8.619044142908817e-08, + "loss": 0.0019, + "step": 48118 + }, + { + "epoch": 0.9624, + "grad_norm": 0.033589549362659454, + "learning_rate": 8.600761263825475e-08, + "loss": 0.0003, + "step": 48120 + }, + { + "epoch": 0.96244, + "grad_norm": 0.00669095991179347, + "learning_rate": 8.582497712580684e-08, + "loss": 0.002, + "step": 48122 + }, + { + "epoch": 0.96248, + "grad_norm": 0.047799285501241684, + "learning_rate": 8.564253489530494e-08, + "loss": 0.0007, + "step": 48124 + }, + { + "epoch": 0.96252, + "grad_norm": 0.006433264352381229, + "learning_rate": 8.546028595030731e-08, + "loss": 0.0002, + "step": 48126 + }, + { + "epoch": 0.96256, + "grad_norm": 0.5238192081451416, + "learning_rate": 8.527823029436444e-08, + "loss": 0.0053, + "step": 48128 + }, + { + "epoch": 0.9626, + "grad_norm": 0.03979776054620743, + "learning_rate": 8.509636793102683e-08, + "loss": 0.0007, + "step": 48130 + }, + { + "epoch": 0.96264, + "grad_norm": 0.10031469166278839, + "learning_rate": 8.491469886384162e-08, + "loss": 0.0055, + "step": 48132 + }, + { + "epoch": 0.96268, + "grad_norm": 0.01621917635202408, + "learning_rate": 8.473322309634823e-08, + "loss": 0.0017, + "step": 48134 + }, + { + "epoch": 0.96272, + "grad_norm": 0.1492583006620407, + "learning_rate": 8.455194063208494e-08, + "loss": 0.0014, + "step": 48136 + }, + { + "epoch": 0.96276, + "grad_norm": 0.017529014497995377, + "learning_rate": 8.437085147458668e-08, + "loss": 0.0004, + "step": 48138 + }, + { + "epoch": 0.9628, + "grad_norm": 0.00013202897389419377, + "learning_rate": 8.418995562738286e-08, + "loss": 0.0002, + "step": 48140 + }, + { + "epoch": 0.96284, + "grad_norm": 0.03827584534883499, + "learning_rate": 8.400925309400177e-08, + "loss": 0.0004, + "step": 48142 + }, + { + "epoch": 0.96288, + "grad_norm": 0.04431719332933426, + "learning_rate": 8.382874387796503e-08, + "loss": 0.0031, + "step": 48144 + }, + { + "epoch": 0.96292, + "grad_norm": 0.304193377494812, + "learning_rate": 8.364842798279205e-08, + "loss": 0.0071, + "step": 48146 + }, + { + "epoch": 0.96296, + "grad_norm": 0.1805160790681839, + "learning_rate": 8.34683054119989e-08, + "loss": 0.0034, + "step": 48148 + }, + { + "epoch": 0.963, + "grad_norm": 0.08590460568666458, + "learning_rate": 8.328837616909612e-08, + "loss": 0.0004, + "step": 48150 + }, + { + "epoch": 0.96304, + "grad_norm": 0.1544213443994522, + "learning_rate": 8.310864025759091e-08, + "loss": 0.0025, + "step": 48152 + }, + { + "epoch": 0.96308, + "grad_norm": 3.4857513904571533, + "learning_rate": 8.292909768098934e-08, + "loss": 0.0339, + "step": 48154 + }, + { + "epoch": 0.96312, + "grad_norm": 3.8822920322418213, + "learning_rate": 8.274974844279082e-08, + "loss": 0.0403, + "step": 48156 + }, + { + "epoch": 0.96316, + "grad_norm": 0.006233731750398874, + "learning_rate": 8.257059254649146e-08, + "loss": 0.0082, + "step": 48158 + }, + { + "epoch": 0.9632, + "grad_norm": 0.09028542786836624, + "learning_rate": 8.239162999558403e-08, + "loss": 0.0012, + "step": 48160 + }, + { + "epoch": 0.96324, + "grad_norm": 0.01279196236282587, + "learning_rate": 8.221286079355795e-08, + "loss": 0.0004, + "step": 48162 + }, + { + "epoch": 0.96328, + "grad_norm": 0.2604065239429474, + "learning_rate": 8.20342849438982e-08, + "loss": 0.0037, + "step": 48164 + }, + { + "epoch": 0.96332, + "grad_norm": 0.060395125299692154, + "learning_rate": 8.185590245008646e-08, + "loss": 0.0008, + "step": 48166 + }, + { + "epoch": 0.96336, + "grad_norm": 0.1362524926662445, + "learning_rate": 8.167771331559993e-08, + "loss": 0.1637, + "step": 48168 + }, + { + "epoch": 0.9634, + "grad_norm": 1.686852216720581, + "learning_rate": 8.149971754391251e-08, + "loss": 0.0122, + "step": 48170 + }, + { + "epoch": 0.96344, + "grad_norm": 0.004646810237318277, + "learning_rate": 8.132191513849363e-08, + "loss": 0.0003, + "step": 48172 + }, + { + "epoch": 0.96348, + "grad_norm": 4.745859622955322, + "learning_rate": 8.114430610281277e-08, + "loss": 0.0541, + "step": 48174 + }, + { + "epoch": 0.96352, + "grad_norm": 0.0005508260219357908, + "learning_rate": 8.096689044032712e-08, + "loss": 0.0001, + "step": 48176 + }, + { + "epoch": 0.96356, + "grad_norm": 0.21588779985904694, + "learning_rate": 8.078966815450062e-08, + "loss": 0.0054, + "step": 48178 + }, + { + "epoch": 0.9636, + "grad_norm": 0.0009098941227421165, + "learning_rate": 8.061263924878604e-08, + "loss": 0.0006, + "step": 48180 + }, + { + "epoch": 0.96364, + "grad_norm": 0.1459631621837616, + "learning_rate": 8.043580372663507e-08, + "loss": 0.0029, + "step": 48182 + }, + { + "epoch": 0.96368, + "grad_norm": 0.19455532729625702, + "learning_rate": 8.025916159149494e-08, + "loss": 0.0021, + "step": 48184 + }, + { + "epoch": 0.96372, + "grad_norm": 0.001916185487061739, + "learning_rate": 8.008271284680846e-08, + "loss": 0.0019, + "step": 48186 + }, + { + "epoch": 0.96376, + "grad_norm": 0.06009786203503609, + "learning_rate": 7.990645749601844e-08, + "loss": 0.0015, + "step": 48188 + }, + { + "epoch": 0.9638, + "grad_norm": 0.04454736411571503, + "learning_rate": 7.973039554255768e-08, + "loss": 0.0006, + "step": 48190 + }, + { + "epoch": 0.96384, + "grad_norm": 0.2943691611289978, + "learning_rate": 7.955452698986122e-08, + "loss": 0.207, + "step": 48192 + }, + { + "epoch": 0.96388, + "grad_norm": 0.352139413356781, + "learning_rate": 7.937885184135629e-08, + "loss": 0.0026, + "step": 48194 + }, + { + "epoch": 0.96392, + "grad_norm": 0.005705563351511955, + "learning_rate": 7.920337010046686e-08, + "loss": 0.0104, + "step": 48196 + }, + { + "epoch": 0.96396, + "grad_norm": 0.19967016577720642, + "learning_rate": 7.902808177061683e-08, + "loss": 0.0036, + "step": 48198 + }, + { + "epoch": 0.964, + "grad_norm": 0.056916892528533936, + "learning_rate": 7.885298685522235e-08, + "loss": 0.0005, + "step": 48200 + }, + { + "epoch": 0.96404, + "grad_norm": 0.07887697964906693, + "learning_rate": 7.867808535769516e-08, + "loss": 0.0024, + "step": 48202 + }, + { + "epoch": 0.96408, + "grad_norm": 0.2749974727630615, + "learning_rate": 7.850337728144696e-08, + "loss": 0.0028, + "step": 48204 + }, + { + "epoch": 0.96412, + "grad_norm": 0.07795609533786774, + "learning_rate": 7.832886262988393e-08, + "loss": 0.0012, + "step": 48206 + }, + { + "epoch": 0.96416, + "grad_norm": 0.04529675468802452, + "learning_rate": 7.815454140640777e-08, + "loss": 0.0473, + "step": 48208 + }, + { + "epoch": 0.9642, + "grad_norm": 20.682071685791016, + "learning_rate": 7.798041361441688e-08, + "loss": 0.338, + "step": 48210 + }, + { + "epoch": 0.96424, + "grad_norm": 0.0301359873265028, + "learning_rate": 7.780647925730523e-08, + "loss": 0.0004, + "step": 48212 + }, + { + "epoch": 0.96428, + "grad_norm": 0.05761253461241722, + "learning_rate": 7.763273833846562e-08, + "loss": 0.0007, + "step": 48214 + }, + { + "epoch": 0.96432, + "grad_norm": 2.7676329612731934, + "learning_rate": 7.745919086128317e-08, + "loss": 0.0319, + "step": 48216 + }, + { + "epoch": 0.96436, + "grad_norm": 0.008806812576949596, + "learning_rate": 7.728583682914292e-08, + "loss": 0.0024, + "step": 48218 + }, + { + "epoch": 0.9644, + "grad_norm": 1.4186772108078003, + "learning_rate": 7.71126762454233e-08, + "loss": 0.0172, + "step": 48220 + }, + { + "epoch": 0.96444, + "grad_norm": 0.004771501291543245, + "learning_rate": 7.693970911350157e-08, + "loss": 0.0003, + "step": 48222 + }, + { + "epoch": 0.96448, + "grad_norm": 0.08912143111228943, + "learning_rate": 7.67669354367484e-08, + "loss": 0.0022, + "step": 48224 + }, + { + "epoch": 0.96452, + "grad_norm": 0.026358235627412796, + "learning_rate": 7.65943552185322e-08, + "loss": 0.0004, + "step": 48226 + }, + { + "epoch": 0.96456, + "grad_norm": 0.21804285049438477, + "learning_rate": 7.642196846221917e-08, + "loss": 0.015, + "step": 48228 + }, + { + "epoch": 0.9646, + "grad_norm": 0.06258933246135712, + "learning_rate": 7.624977517116772e-08, + "loss": 0.0878, + "step": 48230 + }, + { + "epoch": 0.96464, + "grad_norm": 0.038098592311143875, + "learning_rate": 7.607777534873739e-08, + "loss": 0.0013, + "step": 48232 + }, + { + "epoch": 0.96468, + "grad_norm": 0.356330007314682, + "learning_rate": 7.590596899827773e-08, + "loss": 0.0094, + "step": 48234 + }, + { + "epoch": 0.96472, + "grad_norm": 0.6069625020027161, + "learning_rate": 7.573435612314272e-08, + "loss": 0.0098, + "step": 48236 + }, + { + "epoch": 0.96476, + "grad_norm": 8.661138534545898, + "learning_rate": 7.556293672667414e-08, + "loss": 0.0896, + "step": 48238 + }, + { + "epoch": 0.9648, + "grad_norm": 0.014461798593401909, + "learning_rate": 7.539171081221597e-08, + "loss": 0.0002, + "step": 48240 + }, + { + "epoch": 0.96484, + "grad_norm": 1.018939733505249, + "learning_rate": 7.522067838310665e-08, + "loss": 0.0109, + "step": 48242 + }, + { + "epoch": 0.96488, + "grad_norm": 0.06260515004396439, + "learning_rate": 7.504983944267907e-08, + "loss": 0.0063, + "step": 48244 + }, + { + "epoch": 0.96492, + "grad_norm": 0.02021077647805214, + "learning_rate": 7.487919399426503e-08, + "loss": 0.0005, + "step": 48246 + }, + { + "epoch": 0.96496, + "grad_norm": 0.0013882460771128535, + "learning_rate": 7.470874204119072e-08, + "loss": 0.0001, + "step": 48248 + }, + { + "epoch": 0.965, + "grad_norm": 0.09804227948188782, + "learning_rate": 7.453848358678018e-08, + "loss": 0.0011, + "step": 48250 + }, + { + "epoch": 0.96504, + "grad_norm": 0.005875001661479473, + "learning_rate": 7.436841863435073e-08, + "loss": 0.0008, + "step": 48252 + }, + { + "epoch": 0.96508, + "grad_norm": 0.13261699676513672, + "learning_rate": 7.419854718721863e-08, + "loss": 0.0021, + "step": 48254 + }, + { + "epoch": 0.96512, + "grad_norm": 0.026551855728030205, + "learning_rate": 7.40288692486979e-08, + "loss": 0.0004, + "step": 48256 + }, + { + "epoch": 0.96516, + "grad_norm": 0.08987367153167725, + "learning_rate": 7.385938482209365e-08, + "loss": 0.001, + "step": 48258 + }, + { + "epoch": 0.9652, + "grad_norm": 0.24437960982322693, + "learning_rate": 7.369009391070992e-08, + "loss": 0.0031, + "step": 48260 + }, + { + "epoch": 0.96524, + "grad_norm": 0.020660411566495895, + "learning_rate": 7.35209965178496e-08, + "loss": 0.0003, + "step": 48262 + }, + { + "epoch": 0.96528, + "grad_norm": 0.019673917442560196, + "learning_rate": 7.335209264680787e-08, + "loss": 0.0036, + "step": 48264 + }, + { + "epoch": 0.96532, + "grad_norm": 0.126413956284523, + "learning_rate": 7.318338230087652e-08, + "loss": 0.0015, + "step": 48266 + }, + { + "epoch": 0.96536, + "grad_norm": 0.00829426757991314, + "learning_rate": 7.301486548334736e-08, + "loss": 0.0006, + "step": 48268 + }, + { + "epoch": 0.9654, + "grad_norm": 0.1271773725748062, + "learning_rate": 7.284654219750332e-08, + "loss": 0.0019, + "step": 48270 + }, + { + "epoch": 0.96544, + "grad_norm": 0.047970857471227646, + "learning_rate": 7.267841244662622e-08, + "loss": 0.0006, + "step": 48272 + }, + { + "epoch": 0.96548, + "grad_norm": 0.019139111042022705, + "learning_rate": 7.251047623399454e-08, + "loss": 0.0012, + "step": 48274 + }, + { + "epoch": 0.96552, + "grad_norm": 0.012908004224300385, + "learning_rate": 7.234273356288346e-08, + "loss": 0.0133, + "step": 48276 + }, + { + "epoch": 0.96556, + "grad_norm": 0.0029577361419796944, + "learning_rate": 7.217518443656146e-08, + "loss": 0.0007, + "step": 48278 + }, + { + "epoch": 0.9656, + "grad_norm": 0.003951725084334612, + "learning_rate": 7.200782885829482e-08, + "loss": 0.0, + "step": 48280 + }, + { + "epoch": 0.96564, + "grad_norm": 0.3347410559654236, + "learning_rate": 7.18406668313465e-08, + "loss": 0.0046, + "step": 48282 + }, + { + "epoch": 0.96568, + "grad_norm": 0.08716471493244171, + "learning_rate": 7.167369835897608e-08, + "loss": 0.4695, + "step": 48284 + }, + { + "epoch": 0.96572, + "grad_norm": 0.010467887856066227, + "learning_rate": 7.150692344443877e-08, + "loss": 0.0019, + "step": 48286 + }, + { + "epoch": 0.96576, + "grad_norm": 0.10214535146951675, + "learning_rate": 7.134034209098529e-08, + "loss": 0.0012, + "step": 48288 + }, + { + "epoch": 0.9658, + "grad_norm": 0.026706375181674957, + "learning_rate": 7.117395430186414e-08, + "loss": 0.0004, + "step": 48290 + }, + { + "epoch": 0.96584, + "grad_norm": 0.0012797758681699634, + "learning_rate": 7.100776008031717e-08, + "loss": 0.0031, + "step": 48292 + }, + { + "epoch": 0.96588, + "grad_norm": 0.007516988553106785, + "learning_rate": 7.084175942958626e-08, + "loss": 0.0001, + "step": 48294 + }, + { + "epoch": 0.96592, + "grad_norm": 0.04317539557814598, + "learning_rate": 7.067595235290769e-08, + "loss": 0.0005, + "step": 48296 + }, + { + "epoch": 0.96596, + "grad_norm": 0.09864909201860428, + "learning_rate": 7.051033885351222e-08, + "loss": 0.0011, + "step": 48298 + }, + { + "epoch": 0.966, + "grad_norm": 0.06788335740566254, + "learning_rate": 7.034491893463059e-08, + "loss": 0.0008, + "step": 48300 + }, + { + "epoch": 0.96604, + "grad_norm": 0.02146550640463829, + "learning_rate": 7.017969259948576e-08, + "loss": 0.0006, + "step": 48302 + }, + { + "epoch": 0.96608, + "grad_norm": 0.08900021761655807, + "learning_rate": 7.001465985130185e-08, + "loss": 0.002, + "step": 48304 + }, + { + "epoch": 0.96612, + "grad_norm": 0.010081817395985126, + "learning_rate": 6.984982069329405e-08, + "loss": 0.0007, + "step": 48306 + }, + { + "epoch": 0.96616, + "grad_norm": 0.006025494076311588, + "learning_rate": 6.968517512867535e-08, + "loss": 0.0007, + "step": 48308 + }, + { + "epoch": 0.9662, + "grad_norm": 0.03562102094292641, + "learning_rate": 6.95207231606576e-08, + "loss": 0.0011, + "step": 48310 + }, + { + "epoch": 0.96624, + "grad_norm": 0.03282199800014496, + "learning_rate": 6.935646479244496e-08, + "loss": 0.0003, + "step": 48312 + }, + { + "epoch": 0.96628, + "grad_norm": 0.3076525628566742, + "learning_rate": 6.919240002724148e-08, + "loss": 0.0033, + "step": 48314 + }, + { + "epoch": 0.96632, + "grad_norm": 0.015398677438497543, + "learning_rate": 6.902852886824462e-08, + "loss": 0.0058, + "step": 48316 + }, + { + "epoch": 0.96636, + "grad_norm": 0.008621045388281345, + "learning_rate": 6.886485131864962e-08, + "loss": 0.0035, + "step": 48318 + }, + { + "epoch": 0.9664, + "grad_norm": 0.12458070367574692, + "learning_rate": 6.870136738164612e-08, + "loss": 0.0013, + "step": 48320 + }, + { + "epoch": 0.96644, + "grad_norm": 0.4192633628845215, + "learning_rate": 6.853807706042381e-08, + "loss": 0.0038, + "step": 48322 + }, + { + "epoch": 0.96648, + "grad_norm": 0.12204118072986603, + "learning_rate": 6.83749803581646e-08, + "loss": 0.0019, + "step": 48324 + }, + { + "epoch": 0.96652, + "grad_norm": 0.05774744600057602, + "learning_rate": 6.821207727804812e-08, + "loss": 0.0009, + "step": 48326 + }, + { + "epoch": 0.96656, + "grad_norm": 0.11640504002571106, + "learning_rate": 6.804936782325077e-08, + "loss": 0.1765, + "step": 48328 + }, + { + "epoch": 0.9666, + "grad_norm": 0.4980456829071045, + "learning_rate": 6.788685199694222e-08, + "loss": 0.0084, + "step": 48330 + }, + { + "epoch": 0.96664, + "grad_norm": 0.01621166244149208, + "learning_rate": 6.772452980229549e-08, + "loss": 0.0006, + "step": 48332 + }, + { + "epoch": 0.96668, + "grad_norm": 1.2706185579299927, + "learning_rate": 6.756240124247138e-08, + "loss": 0.013, + "step": 48334 + }, + { + "epoch": 0.96672, + "grad_norm": 0.010569312609732151, + "learning_rate": 6.740046632063179e-08, + "loss": 0.0022, + "step": 48336 + }, + { + "epoch": 0.96676, + "grad_norm": 0.002786114811897278, + "learning_rate": 6.723872503993422e-08, + "loss": 0.0017, + "step": 48338 + }, + { + "epoch": 0.9668, + "grad_norm": 0.015127060003578663, + "learning_rate": 6.707717740353059e-08, + "loss": 0.0029, + "step": 48340 + }, + { + "epoch": 0.96684, + "grad_norm": 0.09479676932096481, + "learning_rate": 6.69158234145728e-08, + "loss": 0.001, + "step": 48342 + }, + { + "epoch": 0.96688, + "grad_norm": 0.06918515264987946, + "learning_rate": 6.675466307620282e-08, + "loss": 0.0037, + "step": 48344 + }, + { + "epoch": 0.96692, + "grad_norm": 0.4319515824317932, + "learning_rate": 6.659369639156588e-08, + "loss": 0.0044, + "step": 48346 + }, + { + "epoch": 0.96696, + "grad_norm": 0.007202039938420057, + "learning_rate": 6.643292336379836e-08, + "loss": 0.0004, + "step": 48348 + }, + { + "epoch": 0.967, + "grad_norm": 0.05044082924723625, + "learning_rate": 6.627234399603554e-08, + "loss": 0.0011, + "step": 48350 + }, + { + "epoch": 0.96704, + "grad_norm": 0.5685361623764038, + "learning_rate": 6.611195829140826e-08, + "loss": 0.044, + "step": 48352 + }, + { + "epoch": 0.96708, + "grad_norm": 0.06030876934528351, + "learning_rate": 6.595176625304178e-08, + "loss": 0.0006, + "step": 48354 + }, + { + "epoch": 0.96712, + "grad_norm": 0.027710871770977974, + "learning_rate": 6.579176788406028e-08, + "loss": 0.0005, + "step": 48356 + }, + { + "epoch": 0.96716, + "grad_norm": 0.008888917975127697, + "learning_rate": 6.563196318758235e-08, + "loss": 0.0031, + "step": 48358 + }, + { + "epoch": 0.9672, + "grad_norm": 0.3848625123500824, + "learning_rate": 6.547235216672443e-08, + "loss": 0.0043, + "step": 48360 + }, + { + "epoch": 0.96724, + "grad_norm": 0.5754768252372742, + "learning_rate": 6.531293482459733e-08, + "loss": 0.01, + "step": 48362 + }, + { + "epoch": 0.96728, + "grad_norm": 0.05809011682868004, + "learning_rate": 6.515371116430969e-08, + "loss": 0.0015, + "step": 48364 + }, + { + "epoch": 0.96732, + "grad_norm": 0.0779058188199997, + "learning_rate": 6.499468118896457e-08, + "loss": 0.001, + "step": 48366 + }, + { + "epoch": 0.96736, + "grad_norm": 0.2705867290496826, + "learning_rate": 6.483584490166284e-08, + "loss": 0.0027, + "step": 48368 + }, + { + "epoch": 0.9674, + "grad_norm": 0.5074098110198975, + "learning_rate": 6.4677202305502e-08, + "loss": 0.0113, + "step": 48370 + }, + { + "epoch": 0.96744, + "grad_norm": 0.0002523253206163645, + "learning_rate": 6.451875340357405e-08, + "loss": 0.0009, + "step": 48372 + }, + { + "epoch": 0.96748, + "grad_norm": 0.01641915738582611, + "learning_rate": 6.436049819896761e-08, + "loss": 0.0003, + "step": 48374 + }, + { + "epoch": 0.96752, + "grad_norm": 0.26790693402290344, + "learning_rate": 6.420243669476911e-08, + "loss": 0.0031, + "step": 48376 + }, + { + "epoch": 0.96756, + "grad_norm": 0.0743890330195427, + "learning_rate": 6.40445688940594e-08, + "loss": 0.0031, + "step": 48378 + }, + { + "epoch": 0.9676, + "grad_norm": 0.1104246973991394, + "learning_rate": 6.388689479991606e-08, + "loss": 0.0143, + "step": 48380 + }, + { + "epoch": 0.96764, + "grad_norm": 0.01472836546599865, + "learning_rate": 6.372941441541325e-08, + "loss": 0.002, + "step": 48382 + }, + { + "epoch": 0.96768, + "grad_norm": 0.004298057407140732, + "learning_rate": 6.357212774362077e-08, + "loss": 0.0002, + "step": 48384 + }, + { + "epoch": 0.96772, + "grad_norm": 0.07999508082866669, + "learning_rate": 6.341503478760613e-08, + "loss": 0.0016, + "step": 48386 + }, + { + "epoch": 0.96776, + "grad_norm": 0.0005587530904449522, + "learning_rate": 6.325813555042915e-08, + "loss": 0.0003, + "step": 48388 + }, + { + "epoch": 0.9678, + "grad_norm": 0.00884082168340683, + "learning_rate": 6.310143003515179e-08, + "loss": 0.0017, + "step": 48390 + }, + { + "epoch": 0.96784, + "grad_norm": 0.0016793693648651242, + "learning_rate": 6.294491824482829e-08, + "loss": 0.0017, + "step": 48392 + }, + { + "epoch": 0.96788, + "grad_norm": 0.06724883615970612, + "learning_rate": 6.278860018250954e-08, + "loss": 0.0006, + "step": 48394 + }, + { + "epoch": 0.96792, + "grad_norm": 0.28269749879837036, + "learning_rate": 6.26324758512431e-08, + "loss": 0.0028, + "step": 48396 + }, + { + "epoch": 0.96796, + "grad_norm": 0.1820680946111679, + "learning_rate": 6.247654525407099e-08, + "loss": 0.0021, + "step": 48398 + }, + { + "epoch": 0.968, + "grad_norm": 0.02380252443253994, + "learning_rate": 6.232080839403631e-08, + "loss": 0.0005, + "step": 48400 + }, + { + "epoch": 0.96804, + "grad_norm": 0.0019377931021153927, + "learning_rate": 6.216526527417332e-08, + "loss": 0.0004, + "step": 48402 + }, + { + "epoch": 0.96808, + "grad_norm": 0.009628630243241787, + "learning_rate": 6.200991589751515e-08, + "loss": 0.0003, + "step": 48404 + }, + { + "epoch": 0.96812, + "grad_norm": 0.1346512883901596, + "learning_rate": 6.185476026708936e-08, + "loss": 0.0018, + "step": 48406 + }, + { + "epoch": 0.96816, + "grad_norm": 0.013976692222058773, + "learning_rate": 6.169979838592244e-08, + "loss": 0.0013, + "step": 48408 + }, + { + "epoch": 0.9682, + "grad_norm": 0.061443183571100235, + "learning_rate": 6.154503025703418e-08, + "loss": 0.0014, + "step": 48410 + }, + { + "epoch": 0.96824, + "grad_norm": 0.01856265403330326, + "learning_rate": 6.139045588344217e-08, + "loss": 0.0021, + "step": 48412 + }, + { + "epoch": 0.96828, + "grad_norm": 0.13503749668598175, + "learning_rate": 6.123607526816067e-08, + "loss": 0.003, + "step": 48414 + }, + { + "epoch": 0.96832, + "grad_norm": 0.0029329080134630203, + "learning_rate": 6.108188841419727e-08, + "loss": 0.0027, + "step": 48416 + }, + { + "epoch": 0.96836, + "grad_norm": 0.0015610884875059128, + "learning_rate": 6.092789532456068e-08, + "loss": 0.0001, + "step": 48418 + }, + { + "epoch": 0.9684, + "grad_norm": 0.001562241232022643, + "learning_rate": 6.07740960022507e-08, + "loss": 0.0004, + "step": 48420 + }, + { + "epoch": 0.96844, + "grad_norm": 0.009767686016857624, + "learning_rate": 6.062049045026719e-08, + "loss": 0.0001, + "step": 48422 + }, + { + "epoch": 0.96848, + "grad_norm": 0.03682475537061691, + "learning_rate": 6.046707867160439e-08, + "loss": 0.0005, + "step": 48424 + }, + { + "epoch": 0.96852, + "grad_norm": 0.10673223435878754, + "learning_rate": 6.031386066925327e-08, + "loss": 0.0012, + "step": 48426 + }, + { + "epoch": 0.96856, + "grad_norm": 0.10460526496171951, + "learning_rate": 6.01608364462003e-08, + "loss": 0.0037, + "step": 48428 + }, + { + "epoch": 0.9686, + "grad_norm": 0.018763763830065727, + "learning_rate": 6.000800600542977e-08, + "loss": 0.0003, + "step": 48430 + }, + { + "epoch": 0.96864, + "grad_norm": 0.21523867547512054, + "learning_rate": 5.985536934992042e-08, + "loss": 0.002, + "step": 48432 + }, + { + "epoch": 0.96868, + "grad_norm": 0.372031033039093, + "learning_rate": 5.970292648264876e-08, + "loss": 0.0028, + "step": 48434 + }, + { + "epoch": 0.96872, + "grad_norm": 0.07547970861196518, + "learning_rate": 5.95506774065846e-08, + "loss": 0.0079, + "step": 48436 + }, + { + "epoch": 0.96876, + "grad_norm": 0.006409336347132921, + "learning_rate": 5.939862212469893e-08, + "loss": 0.0004, + "step": 48438 + }, + { + "epoch": 0.9688, + "grad_norm": 0.02713572047650814, + "learning_rate": 5.9246760639953824e-08, + "loss": 0.0011, + "step": 48440 + }, + { + "epoch": 0.96884, + "grad_norm": 16.548274993896484, + "learning_rate": 5.909509295531246e-08, + "loss": 0.7553, + "step": 48442 + }, + { + "epoch": 0.96888, + "grad_norm": 0.0011687714140862226, + "learning_rate": 5.894361907372914e-08, + "loss": 0.0001, + "step": 48444 + }, + { + "epoch": 0.96892, + "grad_norm": 0.9028334617614746, + "learning_rate": 5.8792338998158173e-08, + "loss": 0.0092, + "step": 48446 + }, + { + "epoch": 0.96896, + "grad_norm": 0.626731812953949, + "learning_rate": 5.8641252731548306e-08, + "loss": 0.0062, + "step": 48448 + }, + { + "epoch": 0.969, + "grad_norm": 0.3500480055809021, + "learning_rate": 5.849036027684607e-08, + "loss": 0.0035, + "step": 48450 + }, + { + "epoch": 0.96904, + "grad_norm": 0.19859802722930908, + "learning_rate": 5.833966163699245e-08, + "loss": 0.0186, + "step": 48452 + }, + { + "epoch": 0.96908, + "grad_norm": 0.022976849228143692, + "learning_rate": 5.8189156814925094e-08, + "loss": 0.0032, + "step": 48454 + }, + { + "epoch": 0.96912, + "grad_norm": 0.13976672291755676, + "learning_rate": 5.8038845813578324e-08, + "loss": 0.0021, + "step": 48456 + }, + { + "epoch": 0.96916, + "grad_norm": 0.027049144729971886, + "learning_rate": 5.788872863588313e-08, + "loss": 0.0011, + "step": 48458 + }, + { + "epoch": 0.9692, + "grad_norm": 0.05602727457880974, + "learning_rate": 5.7738805284764945e-08, + "loss": 0.0049, + "step": 48460 + }, + { + "epoch": 0.96924, + "grad_norm": 0.3876785337924957, + "learning_rate": 5.758907576314809e-08, + "loss": 0.0148, + "step": 48462 + }, + { + "epoch": 0.96928, + "grad_norm": 0.05902092158794403, + "learning_rate": 5.743954007395136e-08, + "loss": 0.0109, + "step": 48464 + }, + { + "epoch": 0.96932, + "grad_norm": 0.1370006948709488, + "learning_rate": 5.729019822008908e-08, + "loss": 0.0011, + "step": 48466 + }, + { + "epoch": 0.96936, + "grad_norm": 0.019646907225251198, + "learning_rate": 5.714105020447336e-08, + "loss": 0.0006, + "step": 48468 + }, + { + "epoch": 0.9694, + "grad_norm": 0.28057289123535156, + "learning_rate": 5.699209603001077e-08, + "loss": 0.0029, + "step": 48470 + }, + { + "epoch": 0.96944, + "grad_norm": 0.04615844413638115, + "learning_rate": 5.6843335699607874e-08, + "loss": 0.0009, + "step": 48472 + }, + { + "epoch": 0.96948, + "grad_norm": 0.0031669153831899166, + "learning_rate": 5.6694769216162347e-08, + "loss": 0.0001, + "step": 48474 + }, + { + "epoch": 0.96952, + "grad_norm": 0.003281723242253065, + "learning_rate": 5.654639658257188e-08, + "loss": 0.0003, + "step": 48476 + }, + { + "epoch": 0.96956, + "grad_norm": 0.07786745578050613, + "learning_rate": 5.63982178017275e-08, + "loss": 0.0018, + "step": 48478 + }, + { + "epoch": 0.9696, + "grad_norm": 0.2635781466960907, + "learning_rate": 5.625023287652021e-08, + "loss": 0.0033, + "step": 48480 + }, + { + "epoch": 0.96964, + "grad_norm": 21.38875389099121, + "learning_rate": 5.610244180983438e-08, + "loss": 0.373, + "step": 48482 + }, + { + "epoch": 0.96968, + "grad_norm": 0.05107639729976654, + "learning_rate": 5.595484460454992e-08, + "loss": 0.0966, + "step": 48484 + }, + { + "epoch": 0.96972, + "grad_norm": 0.31309008598327637, + "learning_rate": 5.580744126354565e-08, + "loss": 0.0024, + "step": 48486 + }, + { + "epoch": 0.96976, + "grad_norm": 0.1076425090432167, + "learning_rate": 5.566023178969482e-08, + "loss": 0.0028, + "step": 48488 + }, + { + "epoch": 0.9698, + "grad_norm": 0.2230367660522461, + "learning_rate": 5.5513216185867356e-08, + "loss": 0.0029, + "step": 48490 + }, + { + "epoch": 0.96984, + "grad_norm": 0.14661994576454163, + "learning_rate": 5.536639445492986e-08, + "loss": 0.0097, + "step": 48492 + }, + { + "epoch": 0.96988, + "grad_norm": 0.17128470540046692, + "learning_rate": 5.521976659974448e-08, + "loss": 0.0016, + "step": 48494 + }, + { + "epoch": 0.96992, + "grad_norm": 0.80520099401474, + "learning_rate": 5.507333262316894e-08, + "loss": 0.0106, + "step": 48496 + }, + { + "epoch": 0.96996, + "grad_norm": 0.01076373178511858, + "learning_rate": 5.492709252805983e-08, + "loss": 0.0004, + "step": 48498 + }, + { + "epoch": 0.97, + "grad_norm": 0.22714824974536896, + "learning_rate": 5.4781046317267103e-08, + "loss": 0.002, + "step": 48500 + }, + { + "epoch": 0.97004, + "grad_norm": 0.043554168194532394, + "learning_rate": 5.463519399363737e-08, + "loss": 0.0004, + "step": 48502 + }, + { + "epoch": 0.97008, + "grad_norm": 0.08378026634454727, + "learning_rate": 5.4489535560015013e-08, + "loss": 0.0013, + "step": 48504 + }, + { + "epoch": 0.97012, + "grad_norm": 0.01619790494441986, + "learning_rate": 5.4344071019238884e-08, + "loss": 0.0005, + "step": 48506 + }, + { + "epoch": 0.97016, + "grad_norm": 0.07597445696592331, + "learning_rate": 5.419880037414671e-08, + "loss": 0.0012, + "step": 48508 + }, + { + "epoch": 0.9702, + "grad_norm": 0.0008781516808085144, + "learning_rate": 5.4053723627567336e-08, + "loss": 0.0001, + "step": 48510 + }, + { + "epoch": 0.97024, + "grad_norm": 0.02750439941883087, + "learning_rate": 5.3908840782332936e-08, + "loss": 0.0047, + "step": 48512 + }, + { + "epoch": 0.97028, + "grad_norm": 0.47132083773612976, + "learning_rate": 5.376415184126571e-08, + "loss": 0.0057, + "step": 48514 + }, + { + "epoch": 0.97032, + "grad_norm": 0.02056579478085041, + "learning_rate": 5.3619656807186727e-08, + "loss": 0.0003, + "step": 48516 + }, + { + "epoch": 0.97036, + "grad_norm": 0.007378938142210245, + "learning_rate": 5.3475355682913734e-08, + "loss": 0.0031, + "step": 48518 + }, + { + "epoch": 0.9704, + "grad_norm": 0.001036174944601953, + "learning_rate": 5.3331248471258926e-08, + "loss": 0.0, + "step": 48520 + }, + { + "epoch": 0.97044, + "grad_norm": 0.05777682736515999, + "learning_rate": 5.3187335175033386e-08, + "loss": 0.0006, + "step": 48522 + }, + { + "epoch": 0.97048, + "grad_norm": 0.027522418648004532, + "learning_rate": 5.304361579704043e-08, + "loss": 0.0169, + "step": 48524 + }, + { + "epoch": 0.97052, + "grad_norm": 0.004239134956151247, + "learning_rate": 5.2900090340084476e-08, + "loss": 0.0168, + "step": 48526 + }, + { + "epoch": 0.97056, + "grad_norm": 0.006081786938011646, + "learning_rate": 5.275675880696107e-08, + "loss": 0.0002, + "step": 48528 + }, + { + "epoch": 0.9706, + "grad_norm": 9.463527385378256e-05, + "learning_rate": 5.261362120046687e-08, + "loss": 0.0015, + "step": 48530 + }, + { + "epoch": 0.97064, + "grad_norm": 0.35971367359161377, + "learning_rate": 5.2470677523390744e-08, + "loss": 0.0041, + "step": 48532 + }, + { + "epoch": 0.97068, + "grad_norm": 0.048197414726018906, + "learning_rate": 5.232792777852047e-08, + "loss": 0.0007, + "step": 48534 + }, + { + "epoch": 0.97072, + "grad_norm": 0.09409575909376144, + "learning_rate": 5.218537196863827e-08, + "loss": 0.0022, + "step": 48536 + }, + { + "epoch": 0.97076, + "grad_norm": 0.5108703970909119, + "learning_rate": 5.2043010096524124e-08, + "loss": 0.005, + "step": 48538 + }, + { + "epoch": 0.9708, + "grad_norm": 0.3461368978023529, + "learning_rate": 5.190084216495361e-08, + "loss": 0.0054, + "step": 48540 + }, + { + "epoch": 0.97084, + "grad_norm": 0.0047412351705133915, + "learning_rate": 5.175886817669673e-08, + "loss": 0.219, + "step": 48542 + }, + { + "epoch": 0.97088, + "grad_norm": 0.5315918326377869, + "learning_rate": 5.161708813452348e-08, + "loss": 0.0063, + "step": 48544 + }, + { + "epoch": 0.97092, + "grad_norm": 0.19215360283851624, + "learning_rate": 5.147550204119611e-08, + "loss": 0.003, + "step": 48546 + }, + { + "epoch": 0.97096, + "grad_norm": 0.003232579445466399, + "learning_rate": 5.133410989947463e-08, + "loss": 0.0001, + "step": 48548 + }, + { + "epoch": 0.971, + "grad_norm": 0.04332568868994713, + "learning_rate": 5.119291171211793e-08, + "loss": 0.0007, + "step": 48550 + }, + { + "epoch": 0.97104, + "grad_norm": 0.09609533101320267, + "learning_rate": 5.105190748187716e-08, + "loss": 0.0009, + "step": 48552 + }, + { + "epoch": 0.97108, + "grad_norm": 0.09438776224851608, + "learning_rate": 5.091109721150233e-08, + "loss": 0.0037, + "step": 48554 + }, + { + "epoch": 0.97112, + "grad_norm": 0.0497434139251709, + "learning_rate": 5.0770480903735707e-08, + "loss": 0.0007, + "step": 48556 + }, + { + "epoch": 0.97116, + "grad_norm": 0.0446191243827343, + "learning_rate": 5.063005856132286e-08, + "loss": 0.0006, + "step": 48558 + }, + { + "epoch": 0.9712, + "grad_norm": 0.11227778345346451, + "learning_rate": 5.048983018699827e-08, + "loss": 0.0014, + "step": 48560 + }, + { + "epoch": 0.97124, + "grad_norm": 0.3834603726863861, + "learning_rate": 5.0349795783497524e-08, + "loss": 0.0042, + "step": 48562 + }, + { + "epoch": 0.97128, + "grad_norm": 0.005181239452213049, + "learning_rate": 5.0209955353549554e-08, + "loss": 0.0019, + "step": 48564 + }, + { + "epoch": 0.97132, + "grad_norm": 0.1758437305688858, + "learning_rate": 5.007030889988107e-08, + "loss": 0.0021, + "step": 48566 + }, + { + "epoch": 0.97136, + "grad_norm": 0.09695606678724289, + "learning_rate": 4.993085642521434e-08, + "loss": 0.0028, + "step": 48568 + }, + { + "epoch": 0.9714, + "grad_norm": 0.038561247289180756, + "learning_rate": 4.979159793226718e-08, + "loss": 0.0011, + "step": 48570 + }, + { + "epoch": 0.97144, + "grad_norm": 0.03940099477767944, + "learning_rate": 4.965253342375742e-08, + "loss": 0.0005, + "step": 48572 + }, + { + "epoch": 0.97148, + "grad_norm": 0.028597384691238403, + "learning_rate": 4.9513662902392904e-08, + "loss": 0.0006, + "step": 48574 + }, + { + "epoch": 0.97152, + "grad_norm": 0.5156111121177673, + "learning_rate": 4.937498637088367e-08, + "loss": 0.0061, + "step": 48576 + }, + { + "epoch": 0.97156, + "grad_norm": 0.09214311838150024, + "learning_rate": 4.92365038319298e-08, + "loss": 0.001, + "step": 48578 + }, + { + "epoch": 0.9716, + "grad_norm": 0.0032542417757213116, + "learning_rate": 4.9098215288235776e-08, + "loss": 0.0003, + "step": 48580 + }, + { + "epoch": 0.97164, + "grad_norm": 0.011179485358297825, + "learning_rate": 4.89601207424939e-08, + "loss": 0.0093, + "step": 48582 + }, + { + "epoch": 0.97168, + "grad_norm": 0.01343468576669693, + "learning_rate": 4.882222019739758e-08, + "loss": 0.0003, + "step": 48584 + }, + { + "epoch": 0.97172, + "grad_norm": 0.15407727658748627, + "learning_rate": 4.868451365563576e-08, + "loss": 0.0036, + "step": 48586 + }, + { + "epoch": 0.97176, + "grad_norm": 0.047489993274211884, + "learning_rate": 4.854700111989186e-08, + "loss": 0.0029, + "step": 48588 + }, + { + "epoch": 0.9718, + "grad_norm": 0.014070612378418446, + "learning_rate": 4.840968259284817e-08, + "loss": 0.0002, + "step": 48590 + }, + { + "epoch": 0.97184, + "grad_norm": 0.024244198575615883, + "learning_rate": 4.827255807718145e-08, + "loss": 0.0005, + "step": 48592 + }, + { + "epoch": 0.97188, + "grad_norm": 0.009666653349995613, + "learning_rate": 4.8135627575564e-08, + "loss": 0.0073, + "step": 48594 + }, + { + "epoch": 0.97192, + "grad_norm": 0.008654600009322166, + "learning_rate": 4.799889109066591e-08, + "loss": 0.0008, + "step": 48596 + }, + { + "epoch": 0.97196, + "grad_norm": 0.01348558347672224, + "learning_rate": 4.786234862515393e-08, + "loss": 0.0005, + "step": 48598 + }, + { + "epoch": 0.972, + "grad_norm": 0.12291279435157776, + "learning_rate": 4.772600018168816e-08, + "loss": 0.0012, + "step": 48600 + }, + { + "epoch": 0.97204, + "grad_norm": 0.02463330142199993, + "learning_rate": 4.758984576292869e-08, + "loss": 0.0008, + "step": 48602 + }, + { + "epoch": 0.97208, + "grad_norm": 0.09919515997171402, + "learning_rate": 4.745388537152895e-08, + "loss": 0.0066, + "step": 48604 + }, + { + "epoch": 0.97212, + "grad_norm": 0.13543665409088135, + "learning_rate": 4.731811901013905e-08, + "loss": 0.0057, + "step": 48606 + }, + { + "epoch": 0.97216, + "grad_norm": 0.4384033977985382, + "learning_rate": 4.718254668140687e-08, + "loss": 0.0049, + "step": 48608 + }, + { + "epoch": 0.9722, + "grad_norm": 6.492677688598633, + "learning_rate": 4.704716838797363e-08, + "loss": 0.071, + "step": 48610 + }, + { + "epoch": 0.97224, + "grad_norm": 0.033020686358213425, + "learning_rate": 4.6911984132481656e-08, + "loss": 0.0005, + "step": 48612 + }, + { + "epoch": 0.97228, + "grad_norm": 0.026595665141940117, + "learning_rate": 4.6776993917564405e-08, + "loss": 0.0006, + "step": 48614 + }, + { + "epoch": 0.97232, + "grad_norm": 0.005248531699180603, + "learning_rate": 4.6642197745854215e-08, + "loss": 0.0004, + "step": 48616 + }, + { + "epoch": 0.97236, + "grad_norm": 0.020215587690472603, + "learning_rate": 4.6507595619977866e-08, + "loss": 0.0018, + "step": 48618 + }, + { + "epoch": 0.9724, + "grad_norm": 0.03342439606785774, + "learning_rate": 4.6373187542561036e-08, + "loss": 0.0005, + "step": 48620 + }, + { + "epoch": 0.97244, + "grad_norm": 2.4480748176574707, + "learning_rate": 4.623897351622275e-08, + "loss": 0.022, + "step": 48622 + }, + { + "epoch": 0.97248, + "grad_norm": 1.96736478805542, + "learning_rate": 4.610495354358091e-08, + "loss": 0.0201, + "step": 48624 + }, + { + "epoch": 0.97252, + "grad_norm": 8.193500518798828, + "learning_rate": 4.5971127627246756e-08, + "loss": 0.1057, + "step": 48626 + }, + { + "epoch": 0.97256, + "grad_norm": 0.06553591042757034, + "learning_rate": 4.583749576983043e-08, + "loss": 0.0044, + "step": 48628 + }, + { + "epoch": 0.9726, + "grad_norm": 0.08795527368783951, + "learning_rate": 4.570405797393762e-08, + "loss": 0.0008, + "step": 48630 + }, + { + "epoch": 0.97264, + "grad_norm": 0.1763114333152771, + "learning_rate": 4.557081424216847e-08, + "loss": 0.0016, + "step": 48632 + }, + { + "epoch": 0.97268, + "grad_norm": 4.753526210784912, + "learning_rate": 4.54377645771209e-08, + "loss": 0.0423, + "step": 48634 + }, + { + "epoch": 0.97272, + "grad_norm": 0.02878309227526188, + "learning_rate": 4.5304908981389503e-08, + "loss": 0.1159, + "step": 48636 + }, + { + "epoch": 0.97276, + "grad_norm": 0.7183654308319092, + "learning_rate": 4.5172247457563325e-08, + "loss": 0.0116, + "step": 48638 + }, + { + "epoch": 0.9728, + "grad_norm": 0.06017201021313667, + "learning_rate": 4.503978000823028e-08, + "loss": 0.0026, + "step": 48640 + }, + { + "epoch": 0.97284, + "grad_norm": 0.0007072960725054145, + "learning_rate": 4.4907506635970544e-08, + "loss": 0.0008, + "step": 48642 + }, + { + "epoch": 0.97288, + "grad_norm": 0.07651177793741226, + "learning_rate": 4.4775427343365375e-08, + "loss": 0.0009, + "step": 48644 + }, + { + "epoch": 0.97292, + "grad_norm": 0.020598934963345528, + "learning_rate": 4.4643542132988274e-08, + "loss": 0.0016, + "step": 48646 + }, + { + "epoch": 0.97296, + "grad_norm": 0.1179739236831665, + "learning_rate": 4.451185100741051e-08, + "loss": 0.02, + "step": 48648 + }, + { + "epoch": 0.973, + "grad_norm": 0.08329502493143082, + "learning_rate": 4.438035396920004e-08, + "loss": 0.0009, + "step": 48650 + }, + { + "epoch": 0.97304, + "grad_norm": 15.593476295471191, + "learning_rate": 4.424905102091925e-08, + "loss": 0.2509, + "step": 48652 + }, + { + "epoch": 0.97308, + "grad_norm": 0.010700470767915249, + "learning_rate": 4.411794216512944e-08, + "loss": 0.0011, + "step": 48654 + }, + { + "epoch": 0.97312, + "grad_norm": 0.008971086703240871, + "learning_rate": 4.3987027404386315e-08, + "loss": 0.0014, + "step": 48656 + }, + { + "epoch": 0.97316, + "grad_norm": 0.007622810080647469, + "learning_rate": 4.3856306741241196e-08, + "loss": 0.0017, + "step": 48658 + }, + { + "epoch": 0.9732, + "grad_norm": 0.43310850858688354, + "learning_rate": 4.3725780178243135e-08, + "loss": 0.0037, + "step": 48660 + }, + { + "epoch": 0.97324, + "grad_norm": 0.0734124705195427, + "learning_rate": 4.3595447717936776e-08, + "loss": 0.0012, + "step": 48662 + }, + { + "epoch": 0.97328, + "grad_norm": 0.29923656582832336, + "learning_rate": 4.346530936286342e-08, + "loss": 0.1082, + "step": 48664 + }, + { + "epoch": 0.97332, + "grad_norm": 0.1283990740776062, + "learning_rate": 4.3335365115559915e-08, + "loss": 0.0021, + "step": 48666 + }, + { + "epoch": 0.97336, + "grad_norm": 0.23940208554267883, + "learning_rate": 4.320561497855869e-08, + "loss": 0.0212, + "step": 48668 + }, + { + "epoch": 0.9734, + "grad_norm": 1.237518072128296, + "learning_rate": 4.3076058954391045e-08, + "loss": 0.0153, + "step": 48670 + }, + { + "epoch": 0.97344, + "grad_norm": 0.09636841714382172, + "learning_rate": 4.2946697045580524e-08, + "loss": 0.0018, + "step": 48672 + }, + { + "epoch": 0.97348, + "grad_norm": 0.12802955508232117, + "learning_rate": 4.2817529254651766e-08, + "loss": 0.0018, + "step": 48674 + }, + { + "epoch": 0.97352, + "grad_norm": 0.22684061527252197, + "learning_rate": 4.268855558412055e-08, + "loss": 0.0031, + "step": 48676 + }, + { + "epoch": 0.97356, + "grad_norm": 0.06320494413375854, + "learning_rate": 4.2559776036502633e-08, + "loss": 0.0009, + "step": 48678 + }, + { + "epoch": 0.9736, + "grad_norm": 0.002651833463460207, + "learning_rate": 4.2431190614309334e-08, + "loss": 0.0072, + "step": 48680 + }, + { + "epoch": 0.97364, + "grad_norm": 0.048251181840896606, + "learning_rate": 4.230279932004533e-08, + "loss": 0.0011, + "step": 48682 + }, + { + "epoch": 0.97368, + "grad_norm": 23.52515983581543, + "learning_rate": 4.217460215621527e-08, + "loss": 0.253, + "step": 48684 + }, + { + "epoch": 0.97372, + "grad_norm": 12.376188278198242, + "learning_rate": 4.204659912531717e-08, + "loss": 0.1055, + "step": 48686 + }, + { + "epoch": 0.97376, + "grad_norm": 0.10409151017665863, + "learning_rate": 4.191879022984791e-08, + "loss": 0.0043, + "step": 48688 + }, + { + "epoch": 0.9738, + "grad_norm": 0.5673985481262207, + "learning_rate": 4.179117547229883e-08, + "loss": 0.0047, + "step": 48690 + }, + { + "epoch": 0.97384, + "grad_norm": 0.07965242862701416, + "learning_rate": 4.166375485515795e-08, + "loss": 0.0007, + "step": 48692 + }, + { + "epoch": 0.97388, + "grad_norm": 0.04733224958181381, + "learning_rate": 4.153652838090772e-08, + "loss": 0.0011, + "step": 48694 + }, + { + "epoch": 0.97392, + "grad_norm": 0.0034440425224602222, + "learning_rate": 4.1409496052030597e-08, + "loss": 0.0005, + "step": 48696 + }, + { + "epoch": 0.97396, + "grad_norm": 0.046437039971351624, + "learning_rate": 4.1282657871003496e-08, + "loss": 0.0009, + "step": 48698 + }, + { + "epoch": 0.974, + "grad_norm": 0.046198002994060516, + "learning_rate": 4.115601384029666e-08, + "loss": 0.0029, + "step": 48700 + }, + { + "epoch": 0.97404, + "grad_norm": 0.04890226572751999, + "learning_rate": 4.102956396238034e-08, + "loss": 0.0005, + "step": 48702 + }, + { + "epoch": 0.97408, + "grad_norm": 0.014005514793097973, + "learning_rate": 4.090330823972033e-08, + "loss": 0.0006, + "step": 48704 + }, + { + "epoch": 0.97412, + "grad_norm": 0.003116979729384184, + "learning_rate": 4.0777246674778005e-08, + "loss": 0.0002, + "step": 48706 + }, + { + "epoch": 0.97416, + "grad_norm": 1.2329689264297485, + "learning_rate": 4.0651379270010284e-08, + "loss": 0.0125, + "step": 48708 + }, + { + "epoch": 0.9742, + "grad_norm": 0.026058724150061607, + "learning_rate": 4.052570602787076e-08, + "loss": 0.0017, + "step": 48710 + }, + { + "epoch": 0.97424, + "grad_norm": 0.03924069181084633, + "learning_rate": 4.040022695081192e-08, + "loss": 0.0008, + "step": 48712 + }, + { + "epoch": 0.97428, + "grad_norm": 0.0034058811143040657, + "learning_rate": 4.027494204127624e-08, + "loss": 0.0001, + "step": 48714 + }, + { + "epoch": 0.97432, + "grad_norm": 0.034372150897979736, + "learning_rate": 4.014985130170845e-08, + "loss": 0.0152, + "step": 48716 + }, + { + "epoch": 0.97436, + "grad_norm": 0.14310982823371887, + "learning_rate": 4.002495473454771e-08, + "loss": 0.0032, + "step": 48718 + }, + { + "epoch": 0.9744, + "grad_norm": 0.008325617760419846, + "learning_rate": 3.990025234222872e-08, + "loss": 0.0004, + "step": 48720 + }, + { + "epoch": 0.97444, + "grad_norm": 0.5572325587272644, + "learning_rate": 3.977574412718066e-08, + "loss": 0.0052, + "step": 48722 + }, + { + "epoch": 0.97448, + "grad_norm": 0.008054961450397968, + "learning_rate": 3.965143009183381e-08, + "loss": 0.0005, + "step": 48724 + }, + { + "epoch": 0.97452, + "grad_norm": 0.06834327429533005, + "learning_rate": 3.9527310238608453e-08, + "loss": 0.0013, + "step": 48726 + }, + { + "epoch": 0.97456, + "grad_norm": 0.2127111405134201, + "learning_rate": 3.9403384569927093e-08, + "loss": 0.002, + "step": 48728 + }, + { + "epoch": 0.9746, + "grad_norm": 0.016171084716916084, + "learning_rate": 3.927965308820558e-08, + "loss": 0.0005, + "step": 48730 + }, + { + "epoch": 0.97464, + "grad_norm": 0.1733284890651703, + "learning_rate": 3.915611579585421e-08, + "loss": 0.0035, + "step": 48732 + }, + { + "epoch": 0.97468, + "grad_norm": 0.021707385778427124, + "learning_rate": 3.903277269528327e-08, + "loss": 0.0004, + "step": 48734 + }, + { + "epoch": 0.97472, + "grad_norm": 0.8144947290420532, + "learning_rate": 3.89096237888964e-08, + "loss": 0.0077, + "step": 48736 + }, + { + "epoch": 0.97476, + "grad_norm": 0.07265906035900116, + "learning_rate": 3.878666907909501e-08, + "loss": 0.0018, + "step": 48738 + }, + { + "epoch": 0.9748, + "grad_norm": 0.018452664837241173, + "learning_rate": 3.866390856827495e-08, + "loss": 0.0018, + "step": 48740 + }, + { + "epoch": 0.97484, + "grad_norm": 0.0743502676486969, + "learning_rate": 3.854134225883099e-08, + "loss": 0.0025, + "step": 48742 + }, + { + "epoch": 0.97488, + "grad_norm": 0.02248181588947773, + "learning_rate": 3.8418970153153435e-08, + "loss": 0.0006, + "step": 48744 + }, + { + "epoch": 0.97492, + "grad_norm": 3.964041233062744, + "learning_rate": 3.8296792253624814e-08, + "loss": 0.0444, + "step": 48746 + }, + { + "epoch": 0.97496, + "grad_norm": 0.0398852676153183, + "learning_rate": 3.81748085626299e-08, + "loss": 0.0009, + "step": 48748 + }, + { + "epoch": 0.975, + "grad_norm": 0.008970139548182487, + "learning_rate": 3.805301908254455e-08, + "loss": 0.0008, + "step": 48750 + }, + { + "epoch": 0.97504, + "grad_norm": 0.03903377056121826, + "learning_rate": 3.793142381574577e-08, + "loss": 0.0004, + "step": 48752 + }, + { + "epoch": 0.97508, + "grad_norm": 0.007080165669322014, + "learning_rate": 3.781002276460166e-08, + "loss": 0.0009, + "step": 48754 + }, + { + "epoch": 0.97512, + "grad_norm": 0.21785423159599304, + "learning_rate": 3.768881593148033e-08, + "loss": 0.0023, + "step": 48756 + }, + { + "epoch": 0.97516, + "grad_norm": 0.14349888265132904, + "learning_rate": 3.7567803318744324e-08, + "loss": 0.0031, + "step": 48758 + }, + { + "epoch": 0.9752, + "grad_norm": 0.002689550630748272, + "learning_rate": 3.7446984928753984e-08, + "loss": 0.0, + "step": 48760 + }, + { + "epoch": 0.97524, + "grad_norm": 0.14264211058616638, + "learning_rate": 3.7326360763862976e-08, + "loss": 0.0022, + "step": 48762 + }, + { + "epoch": 0.97528, + "grad_norm": 0.21273578703403473, + "learning_rate": 3.720593082642276e-08, + "loss": 0.006, + "step": 48764 + }, + { + "epoch": 0.97532, + "grad_norm": 0.029437709599733353, + "learning_rate": 3.708569511878368e-08, + "loss": 0.0005, + "step": 48766 + }, + { + "epoch": 0.97536, + "grad_norm": 0.32328933477401733, + "learning_rate": 3.696565364328719e-08, + "loss": 0.003, + "step": 48768 + }, + { + "epoch": 0.9754, + "grad_norm": 1.3165234327316284, + "learning_rate": 3.684580640227586e-08, + "loss": 0.0095, + "step": 48770 + }, + { + "epoch": 0.97544, + "grad_norm": 0.08946775645017624, + "learning_rate": 3.672615339808339e-08, + "loss": 0.001, + "step": 48772 + }, + { + "epoch": 0.97548, + "grad_norm": 0.15678007900714874, + "learning_rate": 3.660669463304456e-08, + "loss": 0.0012, + "step": 48774 + }, + { + "epoch": 0.97552, + "grad_norm": 0.05541422963142395, + "learning_rate": 3.648743010948863e-08, + "loss": 0.0012, + "step": 48776 + }, + { + "epoch": 0.97556, + "grad_norm": 0.09351546317338943, + "learning_rate": 3.6368359829738186e-08, + "loss": 0.0008, + "step": 48778 + }, + { + "epoch": 0.9756, + "grad_norm": 1.6018102169036865, + "learning_rate": 3.6249483796116924e-08, + "loss": 0.0169, + "step": 48780 + }, + { + "epoch": 0.97564, + "grad_norm": 0.05003476142883301, + "learning_rate": 3.6130802010941876e-08, + "loss": 0.0004, + "step": 48782 + }, + { + "epoch": 0.97568, + "grad_norm": 0.01453381311148405, + "learning_rate": 3.601231447652675e-08, + "loss": 0.0011, + "step": 48784 + }, + { + "epoch": 0.97572, + "grad_norm": 0.09184141457080841, + "learning_rate": 3.5894021195180816e-08, + "loss": 0.0009, + "step": 48786 + }, + { + "epoch": 0.97576, + "grad_norm": 1.9055118560791016, + "learning_rate": 3.577592216921111e-08, + "loss": 0.016, + "step": 48788 + }, + { + "epoch": 0.9758, + "grad_norm": 0.17009112238883972, + "learning_rate": 3.565801740092023e-08, + "loss": 0.0016, + "step": 48790 + }, + { + "epoch": 0.97584, + "grad_norm": 0.3108646273612976, + "learning_rate": 3.5540306892605236e-08, + "loss": 0.4134, + "step": 48792 + }, + { + "epoch": 0.97588, + "grad_norm": 12.735353469848633, + "learning_rate": 3.5422790646563176e-08, + "loss": 0.1387, + "step": 48794 + }, + { + "epoch": 0.97592, + "grad_norm": 0.021321957930922508, + "learning_rate": 3.530546866508222e-08, + "loss": 0.0378, + "step": 48796 + }, + { + "epoch": 0.97596, + "grad_norm": 12.318086624145508, + "learning_rate": 3.518834095045276e-08, + "loss": 0.6344, + "step": 48798 + }, + { + "epoch": 0.976, + "grad_norm": 0.21558524668216705, + "learning_rate": 3.50714075049563e-08, + "loss": 0.0054, + "step": 48800 + }, + { + "epoch": 0.97604, + "grad_norm": 0.0347406342625618, + "learning_rate": 3.495466833087324e-08, + "loss": 0.0019, + "step": 48802 + }, + { + "epoch": 0.97608, + "grad_norm": 0.09274883568286896, + "learning_rate": 3.483812343047954e-08, + "loss": 0.0015, + "step": 48804 + }, + { + "epoch": 0.97612, + "grad_norm": 0.0009515942074358463, + "learning_rate": 3.4721772806046715e-08, + "loss": 0.0001, + "step": 48806 + }, + { + "epoch": 0.97616, + "grad_norm": 0.3115265667438507, + "learning_rate": 3.460561645984295e-08, + "loss": 0.0035, + "step": 48808 + }, + { + "epoch": 0.9762, + "grad_norm": 0.031223302707076073, + "learning_rate": 3.4489654394134206e-08, + "loss": 0.0095, + "step": 48810 + }, + { + "epoch": 0.97624, + "grad_norm": 0.008489124476909637, + "learning_rate": 3.43738866111798e-08, + "loss": 0.0015, + "step": 48812 + }, + { + "epoch": 0.97628, + "grad_norm": 0.17061173915863037, + "learning_rate": 3.4258313113236796e-08, + "loss": 0.0015, + "step": 48814 + }, + { + "epoch": 0.97632, + "grad_norm": 0.04600680619478226, + "learning_rate": 3.414293390255896e-08, + "loss": 0.0009, + "step": 48816 + }, + { + "epoch": 0.97636, + "grad_norm": 0.16794651746749878, + "learning_rate": 3.4027748981395604e-08, + "loss": 0.0015, + "step": 48818 + }, + { + "epoch": 0.9764, + "grad_norm": 0.6120791435241699, + "learning_rate": 3.391275835199159e-08, + "loss": 0.0085, + "step": 48820 + }, + { + "epoch": 0.97644, + "grad_norm": 0.005341395270079374, + "learning_rate": 3.379796201658958e-08, + "loss": 0.0168, + "step": 48822 + }, + { + "epoch": 0.97648, + "grad_norm": 0.29353034496307373, + "learning_rate": 3.3683359977426665e-08, + "loss": 0.0035, + "step": 48824 + }, + { + "epoch": 0.97652, + "grad_norm": 0.04124412685632706, + "learning_rate": 3.356895223673884e-08, + "loss": 0.0035, + "step": 48826 + }, + { + "epoch": 0.97656, + "grad_norm": 0.1581372320652008, + "learning_rate": 3.3454738796754316e-08, + "loss": 0.0038, + "step": 48828 + }, + { + "epoch": 0.9766, + "grad_norm": 0.08551901578903198, + "learning_rate": 3.3340719659701315e-08, + "loss": 0.0009, + "step": 48830 + }, + { + "epoch": 0.97664, + "grad_norm": 0.08307669311761856, + "learning_rate": 3.322689482780139e-08, + "loss": 0.0011, + "step": 48832 + }, + { + "epoch": 0.97668, + "grad_norm": 0.14928796887397766, + "learning_rate": 3.31132643032761e-08, + "loss": 0.0026, + "step": 48834 + }, + { + "epoch": 0.97672, + "grad_norm": 0.0084481630474329, + "learning_rate": 3.2999828088337016e-08, + "loss": 0.0002, + "step": 48836 + }, + { + "epoch": 0.97676, + "grad_norm": 0.1611432284116745, + "learning_rate": 3.288658618519902e-08, + "loss": 0.0021, + "step": 48838 + }, + { + "epoch": 0.9768, + "grad_norm": 0.014853106811642647, + "learning_rate": 3.2773538596068134e-08, + "loss": 0.0003, + "step": 48840 + }, + { + "epoch": 0.97684, + "grad_norm": 0.06095391511917114, + "learning_rate": 3.2660685323148144e-08, + "loss": 0.0007, + "step": 48842 + }, + { + "epoch": 0.97688, + "grad_norm": 0.19768227636814117, + "learning_rate": 3.2548026368639515e-08, + "loss": 0.002, + "step": 48844 + }, + { + "epoch": 0.97692, + "grad_norm": 0.015965906903147697, + "learning_rate": 3.243556173473938e-08, + "loss": 0.1265, + "step": 48846 + }, + { + "epoch": 0.97696, + "grad_norm": 0.06744110584259033, + "learning_rate": 3.232329142363933e-08, + "loss": 0.0016, + "step": 48848 + }, + { + "epoch": 0.977, + "grad_norm": 0.05595477297902107, + "learning_rate": 3.22112154375287e-08, + "loss": 0.0063, + "step": 48850 + }, + { + "epoch": 0.97704, + "grad_norm": 0.10534892231225967, + "learning_rate": 3.2099333778591314e-08, + "loss": 0.0083, + "step": 48852 + }, + { + "epoch": 0.97708, + "grad_norm": 0.07439931482076645, + "learning_rate": 3.198764644900987e-08, + "loss": 0.0024, + "step": 48854 + }, + { + "epoch": 0.97712, + "grad_norm": 0.24347089231014252, + "learning_rate": 3.187615345096151e-08, + "loss": 0.0019, + "step": 48856 + }, + { + "epoch": 0.97716, + "grad_norm": 0.012165741063654423, + "learning_rate": 3.176485478661895e-08, + "loss": 0.0014, + "step": 48858 + }, + { + "epoch": 0.9772, + "grad_norm": 0.02857128158211708, + "learning_rate": 3.165375045815266e-08, + "loss": 0.0014, + "step": 48860 + }, + { + "epoch": 0.97724, + "grad_norm": 0.04184829071164131, + "learning_rate": 3.154284046772871e-08, + "loss": 0.0006, + "step": 48862 + }, + { + "epoch": 0.97728, + "grad_norm": 0.08509792387485504, + "learning_rate": 3.143212481750868e-08, + "loss": 0.0009, + "step": 48864 + }, + { + "epoch": 0.97732, + "grad_norm": 0.34324875473976135, + "learning_rate": 3.1321603509651965e-08, + "loss": 0.0047, + "step": 48866 + }, + { + "epoch": 0.97736, + "grad_norm": 0.01018314529210329, + "learning_rate": 3.12112765463124e-08, + "loss": 0.0051, + "step": 48868 + }, + { + "epoch": 0.9774, + "grad_norm": 1.9360768795013428, + "learning_rate": 3.110114392964159e-08, + "loss": 0.0176, + "step": 48870 + }, + { + "epoch": 0.97744, + "grad_norm": 0.00285937893204391, + "learning_rate": 3.099120566178671e-08, + "loss": 0.0007, + "step": 48872 + }, + { + "epoch": 0.97748, + "grad_norm": 0.012002572417259216, + "learning_rate": 3.0881461744890483e-08, + "loss": 0.0007, + "step": 48874 + }, + { + "epoch": 0.97752, + "grad_norm": 0.012685767374932766, + "learning_rate": 3.077191218109232e-08, + "loss": 0.0001, + "step": 48876 + }, + { + "epoch": 0.97756, + "grad_norm": 0.5952199697494507, + "learning_rate": 3.066255697252829e-08, + "loss": 0.0055, + "step": 48878 + }, + { + "epoch": 0.9776, + "grad_norm": 0.05423248931765556, + "learning_rate": 3.0553396121330015e-08, + "loss": 0.0006, + "step": 48880 + }, + { + "epoch": 0.97764, + "grad_norm": 0.04888250678777695, + "learning_rate": 3.0444429629625794e-08, + "loss": 0.0009, + "step": 48882 + }, + { + "epoch": 0.97768, + "grad_norm": 0.048364993184804916, + "learning_rate": 3.0335657499541705e-08, + "loss": 0.0035, + "step": 48884 + }, + { + "epoch": 0.97772, + "grad_norm": 0.049933407455682755, + "learning_rate": 3.022707973319494e-08, + "loss": 0.0012, + "step": 48886 + }, + { + "epoch": 0.97776, + "grad_norm": 0.3206560015678406, + "learning_rate": 3.011869633270492e-08, + "loss": 0.0069, + "step": 48888 + }, + { + "epoch": 0.9778, + "grad_norm": 0.05873824283480644, + "learning_rate": 3.001050730018218e-08, + "loss": 0.0008, + "step": 48890 + }, + { + "epoch": 0.97784, + "grad_norm": 0.08080413192510605, + "learning_rate": 2.990251263773947e-08, + "loss": 0.0011, + "step": 48892 + }, + { + "epoch": 0.97788, + "grad_norm": 0.09082574397325516, + "learning_rate": 2.9794712347479547e-08, + "loss": 0.0008, + "step": 48894 + }, + { + "epoch": 0.97792, + "grad_norm": 1.0644159317016602, + "learning_rate": 2.9687106431504074e-08, + "loss": 0.0083, + "step": 48896 + }, + { + "epoch": 0.97796, + "grad_norm": 0.02255820855498314, + "learning_rate": 2.957969489191248e-08, + "loss": 0.0007, + "step": 48898 + }, + { + "epoch": 0.978, + "grad_norm": 1.2225679159164429, + "learning_rate": 2.947247773079753e-08, + "loss": 0.1109, + "step": 48900 + }, + { + "epoch": 0.97804, + "grad_norm": 0.07400946319103241, + "learning_rate": 2.9365454950249783e-08, + "loss": 0.001, + "step": 48902 + }, + { + "epoch": 0.97808, + "grad_norm": 0.018368378281593323, + "learning_rate": 2.925862655235534e-08, + "loss": 0.0026, + "step": 48904 + }, + { + "epoch": 0.97812, + "grad_norm": 0.03924131765961647, + "learning_rate": 2.9151992539198092e-08, + "loss": 0.0018, + "step": 48906 + }, + { + "epoch": 0.97816, + "grad_norm": 0.010574966669082642, + "learning_rate": 2.904555291285527e-08, + "loss": 0.0002, + "step": 48908 + }, + { + "epoch": 0.9782, + "grad_norm": 2.4039313793182373, + "learning_rate": 2.8939307675402983e-08, + "loss": 0.0268, + "step": 48910 + }, + { + "epoch": 0.97824, + "grad_norm": 0.053206879645586014, + "learning_rate": 2.8833256828912914e-08, + "loss": 0.0007, + "step": 48912 + }, + { + "epoch": 0.97828, + "grad_norm": 0.018465397879481316, + "learning_rate": 2.8727400375451187e-08, + "loss": 0.0002, + "step": 48914 + }, + { + "epoch": 0.97832, + "grad_norm": 0.005951170343905687, + "learning_rate": 2.8621738317082816e-08, + "loss": 0.0003, + "step": 48916 + }, + { + "epoch": 0.97836, + "grad_norm": 0.1479027420282364, + "learning_rate": 2.851627065586726e-08, + "loss": 0.0017, + "step": 48918 + }, + { + "epoch": 0.9784, + "grad_norm": 0.6928149461746216, + "learning_rate": 2.8410997393860663e-08, + "loss": 0.008, + "step": 48920 + }, + { + "epoch": 0.97844, + "grad_norm": 20.884695053100586, + "learning_rate": 2.8305918533114708e-08, + "loss": 0.3918, + "step": 48922 + }, + { + "epoch": 0.97848, + "grad_norm": 0.2162555754184723, + "learning_rate": 2.8201034075679978e-08, + "loss": 0.0294, + "step": 48924 + }, + { + "epoch": 0.97852, + "grad_norm": 2.1886463165283203, + "learning_rate": 2.8096344023599287e-08, + "loss": 0.0305, + "step": 48926 + }, + { + "epoch": 0.97856, + "grad_norm": 0.29658830165863037, + "learning_rate": 2.7991848378914334e-08, + "loss": 0.0026, + "step": 48928 + }, + { + "epoch": 0.9786, + "grad_norm": 0.25323471426963806, + "learning_rate": 2.7887547143662375e-08, + "loss": 0.0036, + "step": 48930 + }, + { + "epoch": 0.97864, + "grad_norm": 0.17954866588115692, + "learning_rate": 2.778344031987623e-08, + "loss": 0.0026, + "step": 48932 + }, + { + "epoch": 0.97868, + "grad_norm": 0.06380099803209305, + "learning_rate": 2.767952790958539e-08, + "loss": 0.0006, + "step": 48934 + }, + { + "epoch": 0.97872, + "grad_norm": 0.8585436344146729, + "learning_rate": 2.757580991481712e-08, + "loss": 0.0085, + "step": 48936 + }, + { + "epoch": 0.97876, + "grad_norm": 0.0007736994884908199, + "learning_rate": 2.7472286337592023e-08, + "loss": 0.0072, + "step": 48938 + }, + { + "epoch": 0.9788, + "grad_norm": 0.01719604805111885, + "learning_rate": 2.7368957179929602e-08, + "loss": 0.0002, + "step": 48940 + }, + { + "epoch": 0.97884, + "grad_norm": 0.054781366139650345, + "learning_rate": 2.7265822443843792e-08, + "loss": 0.0021, + "step": 48942 + }, + { + "epoch": 0.97888, + "grad_norm": 0.19164644181728363, + "learning_rate": 2.7162882131344104e-08, + "loss": 0.0026, + "step": 48944 + }, + { + "epoch": 0.97892, + "grad_norm": 0.014440788887441158, + "learning_rate": 2.7060136244438928e-08, + "loss": 0.0028, + "step": 48946 + }, + { + "epoch": 0.97896, + "grad_norm": 0.08239882439374924, + "learning_rate": 2.6957584785131106e-08, + "loss": 0.0046, + "step": 48948 + }, + { + "epoch": 0.979, + "grad_norm": 0.04707146808505058, + "learning_rate": 2.6855227755419046e-08, + "loss": 0.0009, + "step": 48950 + }, + { + "epoch": 0.97904, + "grad_norm": 0.22219102084636688, + "learning_rate": 2.6753065157298917e-08, + "loss": 0.004, + "step": 48952 + }, + { + "epoch": 0.97908, + "grad_norm": 0.02011255919933319, + "learning_rate": 2.665109699276247e-08, + "loss": 0.0008, + "step": 48954 + }, + { + "epoch": 0.97912, + "grad_norm": 0.002195677487179637, + "learning_rate": 2.654932326379811e-08, + "loss": 0.0872, + "step": 48956 + }, + { + "epoch": 0.97916, + "grad_norm": 0.10943592339754105, + "learning_rate": 2.6447743972388695e-08, + "loss": 0.0063, + "step": 48958 + }, + { + "epoch": 0.9792, + "grad_norm": 0.007190809119492769, + "learning_rate": 2.6346359120514863e-08, + "loss": 0.0007, + "step": 48960 + }, + { + "epoch": 0.97924, + "grad_norm": 17.01584243774414, + "learning_rate": 2.6245168710153924e-08, + "loss": 0.8148, + "step": 48962 + }, + { + "epoch": 0.97928, + "grad_norm": 0.06739337742328644, + "learning_rate": 2.614417274327874e-08, + "loss": 0.0008, + "step": 48964 + }, + { + "epoch": 0.97932, + "grad_norm": 0.024812912568449974, + "learning_rate": 2.6043371221857738e-08, + "loss": 0.0017, + "step": 48966 + }, + { + "epoch": 0.97936, + "grad_norm": 0.005001641344279051, + "learning_rate": 2.5942764147856014e-08, + "loss": 0.005, + "step": 48968 + }, + { + "epoch": 0.9794, + "grad_norm": 0.20961511135101318, + "learning_rate": 2.584235152323422e-08, + "loss": 0.0048, + "step": 48970 + }, + { + "epoch": 0.97944, + "grad_norm": 0.031234245747327805, + "learning_rate": 2.5742133349951903e-08, + "loss": 0.0005, + "step": 48972 + }, + { + "epoch": 0.97948, + "grad_norm": 0.015665296465158463, + "learning_rate": 2.5642109629961942e-08, + "loss": 0.0008, + "step": 48974 + }, + { + "epoch": 0.97952, + "grad_norm": 0.02887054719030857, + "learning_rate": 2.554228036521278e-08, + "loss": 0.0008, + "step": 48976 + }, + { + "epoch": 0.97956, + "grad_norm": 0.3917269706726074, + "learning_rate": 2.5442645557653965e-08, + "loss": 0.0044, + "step": 48978 + }, + { + "epoch": 0.9796, + "grad_norm": 0.04330277442932129, + "learning_rate": 2.5343205209225062e-08, + "loss": 0.0006, + "step": 48980 + }, + { + "epoch": 0.97964, + "grad_norm": 0.03408755734562874, + "learning_rate": 2.524395932186563e-08, + "loss": 0.0004, + "step": 48982 + }, + { + "epoch": 0.97968, + "grad_norm": 0.43366822600364685, + "learning_rate": 2.5144907897510785e-08, + "loss": 0.0046, + "step": 48984 + }, + { + "epoch": 0.97972, + "grad_norm": 0.05369458720088005, + "learning_rate": 2.5046050938090095e-08, + "loss": 0.0027, + "step": 48986 + }, + { + "epoch": 0.97976, + "grad_norm": 0.010436337441205978, + "learning_rate": 2.4947388445533126e-08, + "loss": 0.0189, + "step": 48988 + }, + { + "epoch": 0.9798, + "grad_norm": 0.18482163548469543, + "learning_rate": 2.484892042176279e-08, + "loss": 0.0022, + "step": 48990 + }, + { + "epoch": 0.97984, + "grad_norm": 0.03218565136194229, + "learning_rate": 2.4750646868697546e-08, + "loss": 0.0004, + "step": 48992 + }, + { + "epoch": 0.97988, + "grad_norm": 0.6860380172729492, + "learning_rate": 2.4652567788254757e-08, + "loss": 0.0056, + "step": 48994 + }, + { + "epoch": 0.97992, + "grad_norm": 0.03642759099602699, + "learning_rate": 2.4554683182345106e-08, + "loss": 0.0024, + "step": 48996 + }, + { + "epoch": 0.97996, + "grad_norm": 0.3985912501811981, + "learning_rate": 2.4456993052878185e-08, + "loss": 0.0038, + "step": 48998 + }, + { + "epoch": 0.98, + "grad_norm": 0.007053168956190348, + "learning_rate": 2.4359497401758026e-08, + "loss": 0.0021, + "step": 49000 + }, + { + "epoch": 0.98004, + "grad_norm": 2.5796239376068115, + "learning_rate": 2.4262196230884216e-08, + "loss": 0.0252, + "step": 49002 + }, + { + "epoch": 0.98008, + "grad_norm": 0.0922170951962471, + "learning_rate": 2.416508954215524e-08, + "loss": 0.0012, + "step": 49004 + }, + { + "epoch": 0.98012, + "grad_norm": 0.2340092658996582, + "learning_rate": 2.4068177337465137e-08, + "loss": 0.0019, + "step": 49006 + }, + { + "epoch": 0.98016, + "grad_norm": 0.14472484588623047, + "learning_rate": 2.3971459618700176e-08, + "loss": 0.3023, + "step": 49008 + }, + { + "epoch": 0.9802, + "grad_norm": 0.08012693375349045, + "learning_rate": 2.3874936387747738e-08, + "loss": 0.0018, + "step": 49010 + }, + { + "epoch": 0.98024, + "grad_norm": 0.754603385925293, + "learning_rate": 2.377860764648965e-08, + "loss": 0.0086, + "step": 49012 + }, + { + "epoch": 0.98028, + "grad_norm": 0.02747746743261814, + "learning_rate": 2.3682473396804405e-08, + "loss": 0.0004, + "step": 49014 + }, + { + "epoch": 0.98032, + "grad_norm": 0.022150151431560516, + "learning_rate": 2.3586533640564955e-08, + "loss": 0.0005, + "step": 49016 + }, + { + "epoch": 0.98036, + "grad_norm": 2.7053072452545166, + "learning_rate": 2.3490788379642027e-08, + "loss": 0.0269, + "step": 49018 + }, + { + "epoch": 0.9804, + "grad_norm": 0.19659340381622314, + "learning_rate": 2.339523761590301e-08, + "loss": 0.0024, + "step": 49020 + }, + { + "epoch": 0.98044, + "grad_norm": 0.021633578464388847, + "learning_rate": 2.3299881351209753e-08, + "loss": 0.0016, + "step": 49022 + }, + { + "epoch": 0.98048, + "grad_norm": 0.12705226242542267, + "learning_rate": 2.3204719587421877e-08, + "loss": 0.0078, + "step": 49024 + }, + { + "epoch": 0.98052, + "grad_norm": 0.02472815103828907, + "learning_rate": 2.3109752326394564e-08, + "loss": 0.0005, + "step": 49026 + }, + { + "epoch": 0.98056, + "grad_norm": 0.024631354957818985, + "learning_rate": 2.3014979569978556e-08, + "loss": 0.0007, + "step": 49028 + }, + { + "epoch": 0.9806, + "grad_norm": 0.05684038996696472, + "learning_rate": 2.292040132002238e-08, + "loss": 0.0011, + "step": 49030 + }, + { + "epoch": 0.98064, + "grad_norm": 0.0276411734521389, + "learning_rate": 2.2826017578368996e-08, + "loss": 0.0002, + "step": 49032 + }, + { + "epoch": 0.98068, + "grad_norm": 0.010915798135101795, + "learning_rate": 2.2731828346859164e-08, + "loss": 0.0031, + "step": 49034 + }, + { + "epoch": 0.98072, + "grad_norm": 0.027339816093444824, + "learning_rate": 2.2637833627329188e-08, + "loss": 0.0134, + "step": 49036 + }, + { + "epoch": 0.98076, + "grad_norm": 0.008645149879157543, + "learning_rate": 2.2544033421612043e-08, + "loss": 0.0003, + "step": 49038 + }, + { + "epoch": 0.9808, + "grad_norm": 0.007886305451393127, + "learning_rate": 2.2450427731534052e-08, + "loss": 0.0001, + "step": 49040 + }, + { + "epoch": 0.98084, + "grad_norm": 0.045375093817710876, + "learning_rate": 2.235701655892375e-08, + "loss": 0.0171, + "step": 49042 + }, + { + "epoch": 0.98088, + "grad_norm": 0.012355121783912182, + "learning_rate": 2.226379990559857e-08, + "loss": 0.004, + "step": 49044 + }, + { + "epoch": 0.98092, + "grad_norm": 0.0349663570523262, + "learning_rate": 2.217077777337817e-08, + "loss": 0.0006, + "step": 49046 + }, + { + "epoch": 0.98096, + "grad_norm": 0.09024751931428909, + "learning_rate": 2.2077950164075544e-08, + "loss": 0.0014, + "step": 49048 + }, + { + "epoch": 0.981, + "grad_norm": 0.06259354948997498, + "learning_rate": 2.1985317079500358e-08, + "loss": 0.0008, + "step": 49050 + }, + { + "epoch": 0.98104, + "grad_norm": 0.14340932667255402, + "learning_rate": 2.1892878521457828e-08, + "loss": 0.0013, + "step": 49052 + }, + { + "epoch": 0.98108, + "grad_norm": 0.0764143168926239, + "learning_rate": 2.180063449175096e-08, + "loss": 0.0007, + "step": 49054 + }, + { + "epoch": 0.98112, + "grad_norm": 0.07842046767473221, + "learning_rate": 2.1708584992178315e-08, + "loss": 0.0023, + "step": 49056 + }, + { + "epoch": 0.98116, + "grad_norm": 0.03948910906910896, + "learning_rate": 2.1616730024532907e-08, + "loss": 0.0005, + "step": 49058 + }, + { + "epoch": 0.9812, + "grad_norm": 0.36697569489479065, + "learning_rate": 2.152506959060774e-08, + "loss": 0.0073, + "step": 49060 + }, + { + "epoch": 0.98124, + "grad_norm": 0.009639622643589973, + "learning_rate": 2.1433603692188053e-08, + "loss": 0.0002, + "step": 49062 + }, + { + "epoch": 0.98128, + "grad_norm": 0.001541878911666572, + "learning_rate": 2.134233233105798e-08, + "loss": 0.0011, + "step": 49064 + }, + { + "epoch": 0.98132, + "grad_norm": 0.008211609907448292, + "learning_rate": 2.1251255508996095e-08, + "loss": 0.0002, + "step": 49066 + }, + { + "epoch": 0.98136, + "grad_norm": 0.04616617038846016, + "learning_rate": 2.1160373227777644e-08, + "loss": 0.3368, + "step": 49068 + }, + { + "epoch": 0.9814, + "grad_norm": 0.05450156331062317, + "learning_rate": 2.1069685489176762e-08, + "loss": 0.0011, + "step": 49070 + }, + { + "epoch": 0.98144, + "grad_norm": 0.060707081109285355, + "learning_rate": 2.0979192294958707e-08, + "loss": 0.0011, + "step": 49072 + }, + { + "epoch": 0.98148, + "grad_norm": 0.1290896087884903, + "learning_rate": 2.088889364688984e-08, + "loss": 0.0007, + "step": 49074 + }, + { + "epoch": 0.98152, + "grad_norm": 0.49311673641204834, + "learning_rate": 2.079878954672876e-08, + "loss": 0.0036, + "step": 49076 + }, + { + "epoch": 0.98156, + "grad_norm": 0.043581075966358185, + "learning_rate": 2.0708879996232945e-08, + "loss": 0.0034, + "step": 49078 + }, + { + "epoch": 0.9816, + "grad_norm": 0.002059312304481864, + "learning_rate": 2.061916499715544e-08, + "loss": 0.0007, + "step": 49080 + }, + { + "epoch": 0.98164, + "grad_norm": 0.011514224112033844, + "learning_rate": 2.052964455124484e-08, + "loss": 0.0239, + "step": 49082 + }, + { + "epoch": 0.98168, + "grad_norm": 0.038465335965156555, + "learning_rate": 2.0440318660246428e-08, + "loss": 0.0008, + "step": 49084 + }, + { + "epoch": 0.98172, + "grad_norm": 0.04076235368847847, + "learning_rate": 2.0351187325902132e-08, + "loss": 0.0006, + "step": 49086 + }, + { + "epoch": 0.98176, + "grad_norm": 0.02100212313234806, + "learning_rate": 2.026225054994946e-08, + "loss": 0.0003, + "step": 49088 + }, + { + "epoch": 0.9818, + "grad_norm": 3.3017170429229736, + "learning_rate": 2.017350833412146e-08, + "loss": 0.0383, + "step": 49090 + }, + { + "epoch": 0.98184, + "grad_norm": 0.0006470308289863169, + "learning_rate": 2.008496068015009e-08, + "loss": 0.0189, + "step": 49092 + }, + { + "epoch": 0.98188, + "grad_norm": 1.5994144678115845, + "learning_rate": 1.999660758975952e-08, + "loss": 0.0151, + "step": 49094 + }, + { + "epoch": 0.98192, + "grad_norm": 0.0037868358194828033, + "learning_rate": 1.990844906467393e-08, + "loss": 0.0377, + "step": 49096 + }, + { + "epoch": 0.98196, + "grad_norm": 0.043190550059080124, + "learning_rate": 1.982048510660972e-08, + "loss": 0.0019, + "step": 49098 + }, + { + "epoch": 0.982, + "grad_norm": 0.006877920124679804, + "learning_rate": 1.973271571728441e-08, + "loss": 0.0031, + "step": 49100 + }, + { + "epoch": 0.98204, + "grad_norm": 0.030519740656018257, + "learning_rate": 1.9645140898407745e-08, + "loss": 0.0009, + "step": 49102 + }, + { + "epoch": 0.98208, + "grad_norm": 0.09514974802732468, + "learning_rate": 1.955776065168724e-08, + "loss": 0.0016, + "step": 49104 + }, + { + "epoch": 0.98212, + "grad_norm": 0.07069840282201767, + "learning_rate": 1.9470574978827094e-08, + "loss": 0.0018, + "step": 49106 + }, + { + "epoch": 0.98216, + "grad_norm": 0.008407256565988064, + "learning_rate": 1.9383583881524836e-08, + "loss": 0.0003, + "step": 49108 + }, + { + "epoch": 0.9822, + "grad_norm": 0.025875983759760857, + "learning_rate": 1.929678736148022e-08, + "loss": 0.0004, + "step": 49110 + }, + { + "epoch": 0.98224, + "grad_norm": 0.037715550512075424, + "learning_rate": 1.9210185420381887e-08, + "loss": 0.0004, + "step": 49112 + }, + { + "epoch": 0.98228, + "grad_norm": 0.046083223074674606, + "learning_rate": 1.9123778059918497e-08, + "loss": 0.0003, + "step": 49114 + }, + { + "epoch": 0.98232, + "grad_norm": 0.03297983855009079, + "learning_rate": 1.903756528177647e-08, + "loss": 0.0007, + "step": 49116 + }, + { + "epoch": 0.98236, + "grad_norm": 0.6786656379699707, + "learning_rate": 1.895154708763669e-08, + "loss": 0.0078, + "step": 49118 + }, + { + "epoch": 0.9824, + "grad_norm": 0.04279199242591858, + "learning_rate": 1.886572347917337e-08, + "loss": 0.001, + "step": 49120 + }, + { + "epoch": 0.98244, + "grad_norm": 0.24973741173744202, + "learning_rate": 1.8780094458062948e-08, + "loss": 0.0025, + "step": 49122 + }, + { + "epoch": 0.98248, + "grad_norm": 0.07411574572324753, + "learning_rate": 1.8694660025971867e-08, + "loss": 0.0008, + "step": 49124 + }, + { + "epoch": 0.98252, + "grad_norm": 0.24602653086185455, + "learning_rate": 1.8609420184567685e-08, + "loss": 0.0087, + "step": 49126 + }, + { + "epoch": 0.98256, + "grad_norm": 0.0008042985573410988, + "learning_rate": 1.8524374935512402e-08, + "loss": 0.0013, + "step": 49128 + }, + { + "epoch": 0.9826, + "grad_norm": 0.0008536680834367871, + "learning_rate": 1.8439524280462474e-08, + "loss": 0.0, + "step": 49130 + }, + { + "epoch": 0.98264, + "grad_norm": 0.041905440390110016, + "learning_rate": 1.835486822107213e-08, + "loss": 0.0011, + "step": 49132 + }, + { + "epoch": 0.98268, + "grad_norm": 0.0666172057390213, + "learning_rate": 1.8270406758993386e-08, + "loss": 0.0028, + "step": 49134 + }, + { + "epoch": 0.98272, + "grad_norm": 0.07417710870504379, + "learning_rate": 1.81861398958727e-08, + "loss": 0.0013, + "step": 49136 + }, + { + "epoch": 0.98276, + "grad_norm": 0.09467674046754837, + "learning_rate": 1.8102067633350983e-08, + "loss": 0.0017, + "step": 49138 + }, + { + "epoch": 0.9828, + "grad_norm": 0.006991939153522253, + "learning_rate": 1.8018189973069144e-08, + "loss": 0.0001, + "step": 49140 + }, + { + "epoch": 0.98284, + "grad_norm": 0.07940565049648285, + "learning_rate": 1.793450691666143e-08, + "loss": 0.0015, + "step": 49142 + }, + { + "epoch": 0.98288, + "grad_norm": 0.1067904531955719, + "learning_rate": 1.7851018465759874e-08, + "loss": 0.0013, + "step": 49144 + }, + { + "epoch": 0.98292, + "grad_norm": 0.05783621594309807, + "learning_rate": 1.7767724621990946e-08, + "loss": 0.0006, + "step": 49146 + }, + { + "epoch": 0.98296, + "grad_norm": 0.1135101243853569, + "learning_rate": 1.7684625386981126e-08, + "loss": 0.0024, + "step": 49148 + }, + { + "epoch": 0.983, + "grad_norm": 0.011635428294539452, + "learning_rate": 1.7601720762346895e-08, + "loss": 0.0012, + "step": 49150 + }, + { + "epoch": 0.98304, + "grad_norm": 9.351855278015137, + "learning_rate": 1.7519010749708077e-08, + "loss": 0.1089, + "step": 49152 + }, + { + "epoch": 0.98308, + "grad_norm": 0.8030880093574524, + "learning_rate": 1.7436495350674486e-08, + "loss": 0.0073, + "step": 49154 + }, + { + "epoch": 0.98312, + "grad_norm": 1.3110449314117432, + "learning_rate": 1.7354174566855952e-08, + "loss": 0.0161, + "step": 49156 + }, + { + "epoch": 0.98316, + "grad_norm": 0.010097893886268139, + "learning_rate": 1.7272048399857854e-08, + "loss": 0.0001, + "step": 49158 + }, + { + "epoch": 0.9832, + "grad_norm": 0.024287233129143715, + "learning_rate": 1.7190116851280024e-08, + "loss": 0.0004, + "step": 49160 + }, + { + "epoch": 0.98324, + "grad_norm": 0.01984052173793316, + "learning_rate": 1.7108379922720074e-08, + "loss": 0.0003, + "step": 49162 + }, + { + "epoch": 0.98328, + "grad_norm": 0.003705260343849659, + "learning_rate": 1.7026837615772285e-08, + "loss": 0.0006, + "step": 49164 + }, + { + "epoch": 0.98332, + "grad_norm": 0.028017397969961166, + "learning_rate": 1.6945489932025382e-08, + "loss": 0.0005, + "step": 49166 + }, + { + "epoch": 0.98336, + "grad_norm": 1.0653213262557983, + "learning_rate": 1.686433687306588e-08, + "loss": 0.0092, + "step": 49168 + }, + { + "epoch": 0.9834, + "grad_norm": 0.01570475473999977, + "learning_rate": 1.678337844047695e-08, + "loss": 0.004, + "step": 49170 + }, + { + "epoch": 0.98344, + "grad_norm": 0.5375080108642578, + "learning_rate": 1.6702614635834004e-08, + "loss": 0.0037, + "step": 49172 + }, + { + "epoch": 0.98348, + "grad_norm": 2.0649964809417725, + "learning_rate": 1.6622045460714663e-08, + "loss": 0.0255, + "step": 49174 + }, + { + "epoch": 0.98352, + "grad_norm": 0.11423545330762863, + "learning_rate": 1.654167091668768e-08, + "loss": 0.8559, + "step": 49176 + }, + { + "epoch": 0.98356, + "grad_norm": 0.0011990293860435486, + "learning_rate": 1.6461491005320686e-08, + "loss": 0.0004, + "step": 49178 + }, + { + "epoch": 0.9836, + "grad_norm": 0.10357651859521866, + "learning_rate": 1.6381505728176872e-08, + "loss": 0.0043, + "step": 49180 + }, + { + "epoch": 0.98364, + "grad_norm": 0.06174065172672272, + "learning_rate": 1.6301715086816107e-08, + "loss": 0.0009, + "step": 49182 + }, + { + "epoch": 0.98368, + "grad_norm": 0.05007588118314743, + "learning_rate": 1.62221190827927e-08, + "loss": 0.0216, + "step": 49184 + }, + { + "epoch": 0.98372, + "grad_norm": 0.001514178584329784, + "learning_rate": 1.614271771765874e-08, + "loss": 0.0013, + "step": 49186 + }, + { + "epoch": 0.98376, + "grad_norm": 0.0656992718577385, + "learning_rate": 1.6063510992962993e-08, + "loss": 0.0007, + "step": 49188 + }, + { + "epoch": 0.9838, + "grad_norm": 0.04882548004388809, + "learning_rate": 1.5984498910249778e-08, + "loss": 0.0122, + "step": 49190 + }, + { + "epoch": 0.98384, + "grad_norm": 0.08588024973869324, + "learning_rate": 1.5905681471057867e-08, + "loss": 0.0079, + "step": 49192 + }, + { + "epoch": 0.98388, + "grad_norm": 0.21772849559783936, + "learning_rate": 1.5827058676926022e-08, + "loss": 0.0019, + "step": 49194 + }, + { + "epoch": 0.98392, + "grad_norm": 0.23511254787445068, + "learning_rate": 1.5748630529385244e-08, + "loss": 0.0027, + "step": 49196 + }, + { + "epoch": 0.98396, + "grad_norm": 0.5469135642051697, + "learning_rate": 1.567039702996431e-08, + "loss": 0.0045, + "step": 49198 + }, + { + "epoch": 0.984, + "grad_norm": 0.0035750586539506912, + "learning_rate": 1.5592358180189782e-08, + "loss": 0.0002, + "step": 49200 + }, + { + "epoch": 0.98404, + "grad_norm": 0.11875851452350616, + "learning_rate": 1.5514513981582656e-08, + "loss": 0.0015, + "step": 49202 + }, + { + "epoch": 0.98408, + "grad_norm": 0.8784301280975342, + "learning_rate": 1.5436864435660614e-08, + "loss": 0.0081, + "step": 49204 + }, + { + "epoch": 0.98412, + "grad_norm": 0.10039620846509933, + "learning_rate": 1.5359409543937996e-08, + "loss": 0.0013, + "step": 49206 + }, + { + "epoch": 0.98416, + "grad_norm": 0.023218804970383644, + "learning_rate": 1.5282149307922488e-08, + "loss": 0.0046, + "step": 49208 + }, + { + "epoch": 0.9842, + "grad_norm": 0.029735775664448738, + "learning_rate": 1.5205083729122883e-08, + "loss": 0.001, + "step": 49210 + }, + { + "epoch": 0.98424, + "grad_norm": 0.06558876484632492, + "learning_rate": 1.512821280904131e-08, + "loss": 0.0138, + "step": 49212 + }, + { + "epoch": 0.98428, + "grad_norm": 0.001040369737893343, + "learning_rate": 1.505153654917435e-08, + "loss": 0.0012, + "step": 49214 + }, + { + "epoch": 0.98432, + "grad_norm": 0.8881053328514099, + "learning_rate": 1.4975054951019696e-08, + "loss": 0.009, + "step": 49216 + }, + { + "epoch": 0.98436, + "grad_norm": 0.6107960939407349, + "learning_rate": 1.4898768016066156e-08, + "loss": 0.0071, + "step": 49218 + }, + { + "epoch": 0.9844, + "grad_norm": 0.8039907813072205, + "learning_rate": 1.482267574580143e-08, + "loss": 0.0145, + "step": 49220 + }, + { + "epoch": 0.98444, + "grad_norm": 0.008625146932899952, + "learning_rate": 1.4746778141709883e-08, + "loss": 0.0001, + "step": 49222 + }, + { + "epoch": 0.98448, + "grad_norm": 0.004236192442476749, + "learning_rate": 1.4671075205271445e-08, + "loss": 0.0003, + "step": 49224 + }, + { + "epoch": 0.98452, + "grad_norm": 0.06525560468435287, + "learning_rate": 1.4595566937960493e-08, + "loss": 0.0032, + "step": 49226 + }, + { + "epoch": 0.98456, + "grad_norm": 0.24539422988891602, + "learning_rate": 1.452025334124918e-08, + "loss": 0.0024, + "step": 49228 + }, + { + "epoch": 0.9846, + "grad_norm": 0.03478451445698738, + "learning_rate": 1.4445134416607442e-08, + "loss": 0.0016, + "step": 49230 + }, + { + "epoch": 0.98464, + "grad_norm": 0.11509694159030914, + "learning_rate": 1.4370210165497444e-08, + "loss": 0.0018, + "step": 49232 + }, + { + "epoch": 0.98468, + "grad_norm": 0.03829604759812355, + "learning_rate": 1.429548058938246e-08, + "loss": 0.0007, + "step": 49234 + }, + { + "epoch": 0.98472, + "grad_norm": 0.37992849946022034, + "learning_rate": 1.422094568971688e-08, + "loss": 0.004, + "step": 49236 + }, + { + "epoch": 0.98476, + "grad_norm": 0.7407736778259277, + "learning_rate": 1.4146605467956208e-08, + "loss": 0.0092, + "step": 49238 + }, + { + "epoch": 0.9848, + "grad_norm": 0.029384171590209007, + "learning_rate": 1.4072459925548176e-08, + "loss": 0.0058, + "step": 49240 + }, + { + "epoch": 0.98484, + "grad_norm": 0.0913633182644844, + "learning_rate": 1.3998509063938293e-08, + "loss": 0.0006, + "step": 49242 + }, + { + "epoch": 0.98488, + "grad_norm": 0.5226402282714844, + "learning_rate": 1.3924752884568737e-08, + "loss": 0.0059, + "step": 49244 + }, + { + "epoch": 0.98492, + "grad_norm": 0.13949064910411835, + "learning_rate": 1.385119138887725e-08, + "loss": 0.0658, + "step": 49246 + }, + { + "epoch": 0.98496, + "grad_norm": 0.19658692181110382, + "learning_rate": 1.3777824578298238e-08, + "loss": 0.0025, + "step": 49248 + }, + { + "epoch": 0.985, + "grad_norm": 0.11924660950899124, + "learning_rate": 1.370465245426167e-08, + "loss": 0.0014, + "step": 49250 + }, + { + "epoch": 0.98504, + "grad_norm": 0.0698394849896431, + "learning_rate": 1.363167501819418e-08, + "loss": 0.0006, + "step": 49252 + }, + { + "epoch": 0.98508, + "grad_norm": 0.18875961005687714, + "learning_rate": 1.3558892271517964e-08, + "loss": 0.0045, + "step": 49254 + }, + { + "epoch": 0.98512, + "grad_norm": 0.0016341005684807897, + "learning_rate": 1.3486304215653e-08, + "loss": 0.0017, + "step": 49256 + }, + { + "epoch": 0.98516, + "grad_norm": 0.12297838181257248, + "learning_rate": 1.3413910852013712e-08, + "loss": 0.0017, + "step": 49258 + }, + { + "epoch": 0.9852, + "grad_norm": 0.011584047228097916, + "learning_rate": 1.3341712182012301e-08, + "loss": 0.0004, + "step": 49260 + }, + { + "epoch": 0.98524, + "grad_norm": 0.006972517818212509, + "learning_rate": 1.326970820705542e-08, + "loss": 0.0716, + "step": 49262 + }, + { + "epoch": 0.98528, + "grad_norm": 0.009300255216658115, + "learning_rate": 1.3197898928546393e-08, + "loss": 0.0051, + "step": 49264 + }, + { + "epoch": 0.98532, + "grad_norm": 13.337095260620117, + "learning_rate": 1.312628434788743e-08, + "loss": 0.6699, + "step": 49266 + }, + { + "epoch": 0.98536, + "grad_norm": 0.058554790914058685, + "learning_rate": 1.3054864466471862e-08, + "loss": 0.0008, + "step": 49268 + }, + { + "epoch": 0.9854, + "grad_norm": 0.2644476294517517, + "learning_rate": 1.2983639285693018e-08, + "loss": 0.003, + "step": 49270 + }, + { + "epoch": 0.98544, + "grad_norm": 0.04285217449069023, + "learning_rate": 1.2912608806940896e-08, + "loss": 0.0011, + "step": 49272 + }, + { + "epoch": 0.98548, + "grad_norm": 0.04903688654303551, + "learning_rate": 1.2841773031598836e-08, + "loss": 0.0039, + "step": 49274 + }, + { + "epoch": 0.98552, + "grad_norm": 0.343993216753006, + "learning_rate": 1.2771131961047956e-08, + "loss": 0.0037, + "step": 49276 + }, + { + "epoch": 0.98556, + "grad_norm": 0.045244988054037094, + "learning_rate": 1.270068559666493e-08, + "loss": 0.001, + "step": 49278 + }, + { + "epoch": 0.9856, + "grad_norm": 0.0023113738279789686, + "learning_rate": 1.2630433939825326e-08, + "loss": 0.0001, + "step": 49280 + }, + { + "epoch": 0.98564, + "grad_norm": 7.461952209472656, + "learning_rate": 1.2560376991895828e-08, + "loss": 0.0645, + "step": 49282 + }, + { + "epoch": 0.98568, + "grad_norm": 0.5100427865982056, + "learning_rate": 1.2490514754244231e-08, + "loss": 0.006, + "step": 49284 + }, + { + "epoch": 0.98572, + "grad_norm": 0.18509627878665924, + "learning_rate": 1.242084722823278e-08, + "loss": 0.0023, + "step": 49286 + }, + { + "epoch": 0.98576, + "grad_norm": 0.02116151712834835, + "learning_rate": 1.2351374415218164e-08, + "loss": 0.0004, + "step": 49288 + }, + { + "epoch": 0.9858, + "grad_norm": 0.0009781464468687773, + "learning_rate": 1.2282096316554858e-08, + "loss": 0.0, + "step": 49290 + }, + { + "epoch": 0.98584, + "grad_norm": 0.01129746064543724, + "learning_rate": 1.2213012933595114e-08, + "loss": 0.0002, + "step": 49292 + }, + { + "epoch": 0.98588, + "grad_norm": 0.021268067881464958, + "learning_rate": 1.2144124267684521e-08, + "loss": 0.0003, + "step": 49294 + }, + { + "epoch": 0.98592, + "grad_norm": 0.04828823730349541, + "learning_rate": 1.2075430320166449e-08, + "loss": 0.0031, + "step": 49296 + }, + { + "epoch": 0.98596, + "grad_norm": 0.10422065109014511, + "learning_rate": 1.2006931092378716e-08, + "loss": 0.0013, + "step": 49298 + }, + { + "epoch": 0.986, + "grad_norm": 0.010382793843746185, + "learning_rate": 1.1938626585660252e-08, + "loss": 0.0009, + "step": 49300 + }, + { + "epoch": 0.98604, + "grad_norm": 0.3391743004322052, + "learning_rate": 1.1870516801338884e-08, + "loss": 0.0047, + "step": 49302 + }, + { + "epoch": 0.98608, + "grad_norm": 0.14299550652503967, + "learning_rate": 1.1802601740744657e-08, + "loss": 0.0026, + "step": 49304 + }, + { + "epoch": 0.98612, + "grad_norm": 1.7324053049087524, + "learning_rate": 1.173488140519985e-08, + "loss": 0.0215, + "step": 49306 + }, + { + "epoch": 0.98616, + "grad_norm": 0.009998207911849022, + "learning_rate": 1.1667355796026736e-08, + "loss": 0.0007, + "step": 49308 + }, + { + "epoch": 0.9862, + "grad_norm": 0.022671151906251907, + "learning_rate": 1.1600024914540931e-08, + "loss": 0.0084, + "step": 49310 + }, + { + "epoch": 0.98624, + "grad_norm": 0.15419170260429382, + "learning_rate": 1.1532888762054716e-08, + "loss": 0.0015, + "step": 49312 + }, + { + "epoch": 0.98628, + "grad_norm": 0.4557327628135681, + "learning_rate": 1.1465947339878159e-08, + "loss": 0.0053, + "step": 49314 + }, + { + "epoch": 0.98632, + "grad_norm": 0.0077108112163841724, + "learning_rate": 1.1399200649314656e-08, + "loss": 0.0092, + "step": 49316 + }, + { + "epoch": 0.98636, + "grad_norm": 0.026284553110599518, + "learning_rate": 1.1332648691666503e-08, + "loss": 0.1158, + "step": 49318 + }, + { + "epoch": 0.9864, + "grad_norm": 0.04664325714111328, + "learning_rate": 1.126629146822933e-08, + "loss": 0.0005, + "step": 49320 + }, + { + "epoch": 0.98644, + "grad_norm": 0.17876432836055756, + "learning_rate": 1.1200128980299874e-08, + "loss": 0.0032, + "step": 49322 + }, + { + "epoch": 0.98648, + "grad_norm": 14.977853775024414, + "learning_rate": 1.1134161229167107e-08, + "loss": 0.9799, + "step": 49324 + }, + { + "epoch": 0.98652, + "grad_norm": 0.02262495458126068, + "learning_rate": 1.1068388216115556e-08, + "loss": 0.0014, + "step": 49326 + }, + { + "epoch": 0.98656, + "grad_norm": 0.17570903897285461, + "learning_rate": 1.1002809942428638e-08, + "loss": 0.0071, + "step": 49328 + }, + { + "epoch": 0.9866, + "grad_norm": 0.07064253091812134, + "learning_rate": 1.0937426409384223e-08, + "loss": 0.0006, + "step": 49330 + }, + { + "epoch": 0.98664, + "grad_norm": 0.2501499652862549, + "learning_rate": 1.0872237618256842e-08, + "loss": 0.0029, + "step": 49332 + }, + { + "epoch": 0.98668, + "grad_norm": 0.2383454591035843, + "learning_rate": 1.0807243570318814e-08, + "loss": 0.0018, + "step": 49334 + }, + { + "epoch": 0.98672, + "grad_norm": 0.010101689957082272, + "learning_rate": 1.0742444266836904e-08, + "loss": 0.0001, + "step": 49336 + }, + { + "epoch": 0.98676, + "grad_norm": 0.03822507709264755, + "learning_rate": 1.0677839709072323e-08, + "loss": 0.0041, + "step": 49338 + }, + { + "epoch": 0.9868, + "grad_norm": 0.5066724419593811, + "learning_rate": 1.0613429898287397e-08, + "loss": 0.0765, + "step": 49340 + }, + { + "epoch": 0.98684, + "grad_norm": 0.2101847380399704, + "learning_rate": 1.0549214835736677e-08, + "loss": 0.0068, + "step": 49342 + }, + { + "epoch": 0.98688, + "grad_norm": 0.05971361696720123, + "learning_rate": 1.0485194522671383e-08, + "loss": 0.005, + "step": 49344 + }, + { + "epoch": 0.98692, + "grad_norm": 0.03190398961305618, + "learning_rate": 1.0421368960339406e-08, + "loss": 0.0005, + "step": 49346 + }, + { + "epoch": 0.98696, + "grad_norm": 0.006771876476705074, + "learning_rate": 1.0357738149986419e-08, + "loss": 0.0003, + "step": 49348 + }, + { + "epoch": 0.987, + "grad_norm": 0.01524572167545557, + "learning_rate": 1.0294302092853647e-08, + "loss": 0.0004, + "step": 49350 + }, + { + "epoch": 0.98704, + "grad_norm": 0.06105587258934975, + "learning_rate": 1.0231060790174552e-08, + "loss": 0.2843, + "step": 49352 + }, + { + "epoch": 0.98708, + "grad_norm": 0.13921289145946503, + "learning_rate": 1.016801424318481e-08, + "loss": 0.0016, + "step": 49354 + }, + { + "epoch": 0.98712, + "grad_norm": 0.7201807498931885, + "learning_rate": 1.0105162453113438e-08, + "loss": 0.0477, + "step": 49356 + }, + { + "epoch": 0.98716, + "grad_norm": 0.16136455535888672, + "learning_rate": 1.00425054211839e-08, + "loss": 0.0021, + "step": 49358 + }, + { + "epoch": 0.9872, + "grad_norm": 0.04920452833175659, + "learning_rate": 9.980043148619668e-09, + "loss": 0.0008, + "step": 49360 + }, + { + "epoch": 0.98724, + "grad_norm": 0.029441652819514275, + "learning_rate": 9.91777563663754e-09, + "loss": 0.0013, + "step": 49362 + }, + { + "epoch": 0.98728, + "grad_norm": 0.0024905807804316282, + "learning_rate": 9.85570288645099e-09, + "loss": 0.0003, + "step": 49364 + }, + { + "epoch": 0.98732, + "grad_norm": 0.31505391001701355, + "learning_rate": 9.793824899271276e-09, + "loss": 0.0029, + "step": 49366 + }, + { + "epoch": 0.98736, + "grad_norm": 18.531490325927734, + "learning_rate": 9.732141676304097e-09, + "loss": 0.1642, + "step": 49368 + }, + { + "epoch": 0.9874, + "grad_norm": 0.018112868070602417, + "learning_rate": 9.670653218752935e-09, + "loss": 0.2191, + "step": 49370 + }, + { + "epoch": 0.98744, + "grad_norm": 0.03615109995007515, + "learning_rate": 9.60935952781461e-09, + "loss": 0.0008, + "step": 49372 + }, + { + "epoch": 0.98748, + "grad_norm": 0.022913740947842598, + "learning_rate": 9.548260604685944e-09, + "loss": 0.0002, + "step": 49374 + }, + { + "epoch": 0.98752, + "grad_norm": 0.2619551718235016, + "learning_rate": 9.487356450557094e-09, + "loss": 0.0033, + "step": 49376 + }, + { + "epoch": 0.98756, + "grad_norm": 0.015656257048249245, + "learning_rate": 9.426647066616002e-09, + "loss": 0.0009, + "step": 49378 + }, + { + "epoch": 0.9876, + "grad_norm": 0.03781168907880783, + "learning_rate": 9.366132454046162e-09, + "loss": 0.0009, + "step": 49380 + }, + { + "epoch": 0.98764, + "grad_norm": 0.044954389333724976, + "learning_rate": 9.305812614026633e-09, + "loss": 0.0039, + "step": 49382 + }, + { + "epoch": 0.98768, + "grad_norm": 0.05970839783549309, + "learning_rate": 9.245687547733139e-09, + "loss": 0.0014, + "step": 49384 + }, + { + "epoch": 0.98772, + "grad_norm": 0.01447313278913498, + "learning_rate": 9.18575725633919e-09, + "loss": 0.0007, + "step": 49386 + }, + { + "epoch": 0.98776, + "grad_norm": 0.014263913035392761, + "learning_rate": 9.126021741012737e-09, + "loss": 0.0032, + "step": 49388 + }, + { + "epoch": 0.9878, + "grad_norm": 0.028548620641231537, + "learning_rate": 9.066481002918403e-09, + "loss": 0.0015, + "step": 49390 + }, + { + "epoch": 0.98784, + "grad_norm": 0.2720244228839874, + "learning_rate": 9.007135043216375e-09, + "loss": 0.0032, + "step": 49392 + }, + { + "epoch": 0.98788, + "grad_norm": 0.28798186779022217, + "learning_rate": 8.947983863063504e-09, + "loss": 0.0047, + "step": 49394 + }, + { + "epoch": 0.98792, + "grad_norm": 0.01718197576701641, + "learning_rate": 8.889027463614419e-09, + "loss": 0.0009, + "step": 49396 + }, + { + "epoch": 0.98796, + "grad_norm": 0.009740750305354595, + "learning_rate": 8.830265846015984e-09, + "loss": 0.001, + "step": 49398 + }, + { + "epoch": 0.988, + "grad_norm": 0.01727389171719551, + "learning_rate": 8.771699011416169e-09, + "loss": 0.0005, + "step": 49400 + }, + { + "epoch": 0.98804, + "grad_norm": 0.06111575663089752, + "learning_rate": 8.713326960956281e-09, + "loss": 0.001, + "step": 49402 + }, + { + "epoch": 0.98808, + "grad_norm": 0.00905335322022438, + "learning_rate": 8.65514969577319e-09, + "loss": 0.0004, + "step": 49404 + }, + { + "epoch": 0.98812, + "grad_norm": 0.009067347273230553, + "learning_rate": 8.597167217002656e-09, + "loss": 0.0012, + "step": 49406 + }, + { + "epoch": 0.98816, + "grad_norm": 0.34386059641838074, + "learning_rate": 8.539379525773771e-09, + "loss": 0.0035, + "step": 49408 + }, + { + "epoch": 0.9882, + "grad_norm": 0.3140876889228821, + "learning_rate": 8.481786623214527e-09, + "loss": 0.004, + "step": 49410 + }, + { + "epoch": 0.98824, + "grad_norm": 0.03480858728289604, + "learning_rate": 8.424388510445137e-09, + "loss": 0.0048, + "step": 49412 + }, + { + "epoch": 0.98828, + "grad_norm": 0.059879232197999954, + "learning_rate": 8.367185188588034e-09, + "loss": 0.0025, + "step": 49414 + }, + { + "epoch": 0.98832, + "grad_norm": 2.5300915241241455, + "learning_rate": 8.310176658755664e-09, + "loss": 0.0262, + "step": 49416 + }, + { + "epoch": 0.98836, + "grad_norm": 0.501985490322113, + "learning_rate": 8.253362922060471e-09, + "loss": 0.0057, + "step": 49418 + }, + { + "epoch": 0.9884, + "grad_norm": 0.05211947113275528, + "learning_rate": 8.196743979610455e-09, + "loss": 0.0042, + "step": 49420 + }, + { + "epoch": 0.98844, + "grad_norm": 0.12998846173286438, + "learning_rate": 8.14031983250918e-09, + "loss": 0.0012, + "step": 49422 + }, + { + "epoch": 0.98848, + "grad_norm": 0.261477530002594, + "learning_rate": 8.084090481855766e-09, + "loss": 0.0029, + "step": 49424 + }, + { + "epoch": 0.98852, + "grad_norm": 0.05771410092711449, + "learning_rate": 8.028055928747114e-09, + "loss": 0.0033, + "step": 49426 + }, + { + "epoch": 0.98856, + "grad_norm": 0.016408927738666534, + "learning_rate": 7.972216174276792e-09, + "loss": 0.002, + "step": 49428 + }, + { + "epoch": 0.9886, + "grad_norm": 0.05493808537721634, + "learning_rate": 7.916571219531711e-09, + "loss": 0.0023, + "step": 49430 + }, + { + "epoch": 0.98864, + "grad_norm": 0.3034878075122833, + "learning_rate": 7.861121065597665e-09, + "loss": 0.0045, + "step": 49432 + }, + { + "epoch": 0.98868, + "grad_norm": 0.39145714044570923, + "learning_rate": 7.805865713554905e-09, + "loss": 0.0141, + "step": 49434 + }, + { + "epoch": 0.98872, + "grad_norm": 0.2922143042087555, + "learning_rate": 7.750805164481457e-09, + "loss": 0.0022, + "step": 49436 + }, + { + "epoch": 0.98876, + "grad_norm": 0.13155506551265717, + "learning_rate": 7.695939419450904e-09, + "loss": 0.0012, + "step": 49438 + }, + { + "epoch": 0.9888, + "grad_norm": 0.10509524494409561, + "learning_rate": 7.641268479531283e-09, + "loss": 0.0009, + "step": 49440 + }, + { + "epoch": 0.98884, + "grad_norm": 0.08596666902303696, + "learning_rate": 7.586792345790628e-09, + "loss": 0.0023, + "step": 49442 + }, + { + "epoch": 0.98888, + "grad_norm": 0.0414569266140461, + "learning_rate": 7.532511019289202e-09, + "loss": 0.0004, + "step": 49444 + }, + { + "epoch": 0.98892, + "grad_norm": 0.07585777342319489, + "learning_rate": 7.478424501086156e-09, + "loss": 0.0013, + "step": 49446 + }, + { + "epoch": 0.98896, + "grad_norm": 0.2271891087293625, + "learning_rate": 7.424532792235095e-09, + "loss": 0.0022, + "step": 49448 + }, + { + "epoch": 0.989, + "grad_norm": 0.09668578207492828, + "learning_rate": 7.370835893788508e-09, + "loss": 0.0018, + "step": 49450 + }, + { + "epoch": 0.98904, + "grad_norm": 0.1430421769618988, + "learning_rate": 7.3173338067911156e-09, + "loss": 0.0038, + "step": 49452 + }, + { + "epoch": 0.98908, + "grad_norm": 0.09118765592575073, + "learning_rate": 7.264026532286528e-09, + "loss": 0.0011, + "step": 49454 + }, + { + "epoch": 0.98912, + "grad_norm": 0.00376721378415823, + "learning_rate": 7.210914071315022e-09, + "loss": 0.0028, + "step": 49456 + }, + { + "epoch": 0.98916, + "grad_norm": 0.10124620050191879, + "learning_rate": 7.157996424911329e-09, + "loss": 0.0023, + "step": 49458 + }, + { + "epoch": 0.9892, + "grad_norm": 0.01668650656938553, + "learning_rate": 7.105273594107953e-09, + "loss": 0.0007, + "step": 49460 + }, + { + "epoch": 0.98924, + "grad_norm": 0.5475424528121948, + "learning_rate": 7.052745579930742e-09, + "loss": 0.0054, + "step": 49462 + }, + { + "epoch": 0.98928, + "grad_norm": 0.027628403156995773, + "learning_rate": 7.0004123834055415e-09, + "loss": 0.0004, + "step": 49464 + }, + { + "epoch": 0.98932, + "grad_norm": 0.12973150610923767, + "learning_rate": 6.948274005551536e-09, + "loss": 0.0027, + "step": 49466 + }, + { + "epoch": 0.98936, + "grad_norm": 0.011081020347774029, + "learning_rate": 6.896330447386801e-09, + "loss": 0.0032, + "step": 49468 + }, + { + "epoch": 0.9894, + "grad_norm": 0.8779584765434265, + "learning_rate": 6.844581709921639e-09, + "loss": 0.0086, + "step": 49470 + }, + { + "epoch": 0.98944, + "grad_norm": 0.03462686762213707, + "learning_rate": 6.7930277941663515e-09, + "loss": 0.0007, + "step": 49472 + }, + { + "epoch": 0.98948, + "grad_norm": 0.035797443240880966, + "learning_rate": 6.741668701126802e-09, + "loss": 0.0006, + "step": 49474 + }, + { + "epoch": 0.98952, + "grad_norm": 7.848355744499713e-05, + "learning_rate": 6.690504431802191e-09, + "loss": 0.0006, + "step": 49476 + }, + { + "epoch": 0.98956, + "grad_norm": 0.1721765547990799, + "learning_rate": 6.63953498719172e-09, + "loss": 0.0025, + "step": 49478 + }, + { + "epoch": 0.9896, + "grad_norm": 0.002452990273013711, + "learning_rate": 6.588760368287928e-09, + "loss": 0.0004, + "step": 49480 + }, + { + "epoch": 0.98964, + "grad_norm": 0.15873056650161743, + "learning_rate": 6.538180576082242e-09, + "loss": 0.0043, + "step": 49482 + }, + { + "epoch": 0.98968, + "grad_norm": 0.1446545273065567, + "learning_rate": 6.487795611558323e-09, + "loss": 0.0015, + "step": 49484 + }, + { + "epoch": 0.98972, + "grad_norm": 0.08031139522790909, + "learning_rate": 6.437605475700936e-09, + "loss": 0.0013, + "step": 49486 + }, + { + "epoch": 0.98976, + "grad_norm": 0.012858430854976177, + "learning_rate": 6.3876101694870795e-09, + "loss": 0.0002, + "step": 49488 + }, + { + "epoch": 0.9898, + "grad_norm": 0.19437052309513092, + "learning_rate": 6.3378096938915276e-09, + "loss": 0.002, + "step": 49490 + }, + { + "epoch": 0.98984, + "grad_norm": 0.05952255427837372, + "learning_rate": 6.288204049885727e-09, + "loss": 0.0011, + "step": 49492 + }, + { + "epoch": 0.98988, + "grad_norm": 0.27937641739845276, + "learning_rate": 6.23879323843668e-09, + "loss": 0.005, + "step": 49494 + }, + { + "epoch": 0.98992, + "grad_norm": 0.007400699891149998, + "learning_rate": 6.189577260508062e-09, + "loss": 0.0001, + "step": 49496 + }, + { + "epoch": 0.98996, + "grad_norm": 0.03504485264420509, + "learning_rate": 6.140556117057994e-09, + "loss": 0.0028, + "step": 49498 + }, + { + "epoch": 0.99, + "grad_norm": 0.038067255169153214, + "learning_rate": 6.091729809042379e-09, + "loss": 0.0013, + "step": 49500 + }, + { + "epoch": 0.99004, + "grad_norm": 0.026826879009604454, + "learning_rate": 6.0430983374148985e-09, + "loss": 0.0006, + "step": 49502 + }, + { + "epoch": 0.99008, + "grad_norm": 0.14221642911434174, + "learning_rate": 5.994661703121463e-09, + "loss": 0.0017, + "step": 49504 + }, + { + "epoch": 0.99012, + "grad_norm": 0.18701569736003876, + "learning_rate": 5.946419907107981e-09, + "loss": 0.0019, + "step": 49506 + }, + { + "epoch": 0.99016, + "grad_norm": 0.022396966814994812, + "learning_rate": 5.898372950314812e-09, + "loss": 0.0003, + "step": 49508 + }, + { + "epoch": 0.9902, + "grad_norm": 0.0824577733874321, + "learning_rate": 5.850520833676765e-09, + "loss": 0.0027, + "step": 49510 + }, + { + "epoch": 0.99024, + "grad_norm": 0.014530436135828495, + "learning_rate": 5.802863558128646e-09, + "loss": 0.0073, + "step": 49512 + }, + { + "epoch": 0.99028, + "grad_norm": 0.029854722321033478, + "learning_rate": 5.755401124599713e-09, + "loss": 0.0012, + "step": 49514 + }, + { + "epoch": 0.99032, + "grad_norm": 15.858756065368652, + "learning_rate": 5.708133534014781e-09, + "loss": 0.2354, + "step": 49516 + }, + { + "epoch": 0.99036, + "grad_norm": 0.07473654299974442, + "learning_rate": 5.661060787294226e-09, + "loss": 0.0013, + "step": 49518 + }, + { + "epoch": 0.9904, + "grad_norm": 0.004620464984327555, + "learning_rate": 5.614182885357311e-09, + "loss": 0.0002, + "step": 49520 + }, + { + "epoch": 0.99044, + "grad_norm": 0.07055880129337311, + "learning_rate": 5.56749982911664e-09, + "loss": 0.0009, + "step": 49522 + }, + { + "epoch": 0.99048, + "grad_norm": 0.26413044333457947, + "learning_rate": 5.521011619483707e-09, + "loss": 0.0021, + "step": 49524 + }, + { + "epoch": 0.99052, + "grad_norm": 0.0038288107607513666, + "learning_rate": 5.474718257364453e-09, + "loss": 0.0003, + "step": 49526 + }, + { + "epoch": 0.99056, + "grad_norm": 0.0365980789065361, + "learning_rate": 5.428619743660379e-09, + "loss": 0.0005, + "step": 49528 + }, + { + "epoch": 0.9906, + "grad_norm": 0.8285037279129028, + "learning_rate": 5.382716079271877e-09, + "loss": 0.0078, + "step": 49530 + }, + { + "epoch": 0.99064, + "grad_norm": 0.2473219931125641, + "learning_rate": 5.337007265091565e-09, + "loss": 0.0036, + "step": 49532 + }, + { + "epoch": 0.99068, + "grad_norm": 0.08593659847974777, + "learning_rate": 5.291493302013173e-09, + "loss": 0.0008, + "step": 49534 + }, + { + "epoch": 0.99072, + "grad_norm": 0.006306661292910576, + "learning_rate": 5.246174190921549e-09, + "loss": 0.0013, + "step": 49536 + }, + { + "epoch": 0.99076, + "grad_norm": 0.1662210077047348, + "learning_rate": 5.201049932702651e-09, + "loss": 0.0017, + "step": 49538 + }, + { + "epoch": 0.9908, + "grad_norm": 0.01483256183564663, + "learning_rate": 5.156120528233555e-09, + "loss": 0.0045, + "step": 49540 + }, + { + "epoch": 0.99084, + "grad_norm": 0.6616607308387756, + "learning_rate": 5.111385978392447e-09, + "loss": 0.0072, + "step": 49542 + }, + { + "epoch": 0.99088, + "grad_norm": 3.512376070022583, + "learning_rate": 5.0668462840508525e-09, + "loss": 0.0318, + "step": 49544 + }, + { + "epoch": 0.99092, + "grad_norm": 0.15886227786540985, + "learning_rate": 5.022501446075856e-09, + "loss": 0.0021, + "step": 49546 + }, + { + "epoch": 0.99096, + "grad_norm": 0.031170697882771492, + "learning_rate": 4.978351465333431e-09, + "loss": 0.0003, + "step": 49548 + }, + { + "epoch": 0.991, + "grad_norm": 0.024432217702269554, + "learning_rate": 4.9343963426840006e-09, + "loss": 0.0017, + "step": 49550 + }, + { + "epoch": 0.99104, + "grad_norm": 0.15759535133838654, + "learning_rate": 4.890636078984656e-09, + "loss": 0.0029, + "step": 49552 + }, + { + "epoch": 0.99108, + "grad_norm": 0.09190522879362106, + "learning_rate": 4.84707067508805e-09, + "loss": 0.0126, + "step": 49554 + }, + { + "epoch": 0.99112, + "grad_norm": 0.03553001582622528, + "learning_rate": 4.803700131843503e-09, + "loss": 0.0215, + "step": 49556 + }, + { + "epoch": 0.99116, + "grad_norm": 0.05565456673502922, + "learning_rate": 4.760524450095894e-09, + "loss": 0.001, + "step": 49558 + }, + { + "epoch": 0.9912, + "grad_norm": 0.0034977616742253304, + "learning_rate": 4.717543630688992e-09, + "loss": 0.0035, + "step": 49560 + }, + { + "epoch": 0.99124, + "grad_norm": 0.07462748140096664, + "learning_rate": 4.674757674458796e-09, + "loss": 0.0015, + "step": 49562 + }, + { + "epoch": 0.99128, + "grad_norm": 0.14434771239757538, + "learning_rate": 4.632166582240194e-09, + "loss": 0.0025, + "step": 49564 + }, + { + "epoch": 0.99132, + "grad_norm": 0.018197186291217804, + "learning_rate": 4.589770354863632e-09, + "loss": 0.0002, + "step": 49566 + }, + { + "epoch": 0.99136, + "grad_norm": 0.5657923817634583, + "learning_rate": 4.5475689931551155e-09, + "loss": 0.005, + "step": 49568 + }, + { + "epoch": 0.9914, + "grad_norm": 0.02176397293806076, + "learning_rate": 4.505562497938431e-09, + "loss": 0.0002, + "step": 49570 + }, + { + "epoch": 0.99144, + "grad_norm": 0.11459486186504364, + "learning_rate": 4.463750870031813e-09, + "loss": 0.0011, + "step": 49572 + }, + { + "epoch": 0.99148, + "grad_norm": 0.1053018793463707, + "learning_rate": 4.422134110249054e-09, + "loss": 0.0018, + "step": 49574 + }, + { + "epoch": 0.99152, + "grad_norm": 0.01705525815486908, + "learning_rate": 4.380712219403949e-09, + "loss": 0.0003, + "step": 49576 + }, + { + "epoch": 0.99156, + "grad_norm": 0.09483542293310165, + "learning_rate": 4.339485198303628e-09, + "loss": 0.001, + "step": 49578 + }, + { + "epoch": 0.9916, + "grad_norm": 0.048087552189826965, + "learning_rate": 4.298453047749674e-09, + "loss": 0.0009, + "step": 49580 + }, + { + "epoch": 0.99164, + "grad_norm": 0.1306835114955902, + "learning_rate": 4.257615768544776e-09, + "loss": 0.0052, + "step": 49582 + }, + { + "epoch": 0.99168, + "grad_norm": 0.02074175328016281, + "learning_rate": 4.2169733614827455e-09, + "loss": 0.0015, + "step": 49584 + }, + { + "epoch": 0.99172, + "grad_norm": 0.00497670192271471, + "learning_rate": 4.176525827358502e-09, + "loss": 0.0007, + "step": 49586 + }, + { + "epoch": 0.99176, + "grad_norm": 0.0104172108694911, + "learning_rate": 4.136273166958083e-09, + "loss": 0.0005, + "step": 49588 + }, + { + "epoch": 0.9918, + "grad_norm": 0.011418584734201431, + "learning_rate": 4.096215381066415e-09, + "loss": 0.0007, + "step": 49590 + }, + { + "epoch": 0.99184, + "grad_norm": 6.419322490692139, + "learning_rate": 4.056352470466207e-09, + "loss": 0.0657, + "step": 49592 + }, + { + "epoch": 0.99188, + "grad_norm": 5.64932107925415, + "learning_rate": 4.016684435933504e-09, + "loss": 0.0561, + "step": 49594 + }, + { + "epoch": 0.99192, + "grad_norm": 0.01813998632133007, + "learning_rate": 3.9772112782421325e-09, + "loss": 0.0002, + "step": 49596 + }, + { + "epoch": 0.99196, + "grad_norm": 0.23258094489574432, + "learning_rate": 3.937932998161476e-09, + "loss": 0.011, + "step": 49598 + }, + { + "epoch": 0.992, + "grad_norm": 0.002287456300109625, + "learning_rate": 3.898849596456477e-09, + "loss": 0.0007, + "step": 49600 + }, + { + "epoch": 0.99204, + "grad_norm": 0.19486334919929504, + "learning_rate": 3.8599610738898615e-09, + "loss": 0.0018, + "step": 49602 + }, + { + "epoch": 0.99208, + "grad_norm": 0.1705116331577301, + "learning_rate": 3.8212674312199106e-09, + "loss": 0.0014, + "step": 49604 + }, + { + "epoch": 0.99212, + "grad_norm": 0.2078579217195511, + "learning_rate": 3.782768669200465e-09, + "loss": 0.0017, + "step": 49606 + }, + { + "epoch": 0.99216, + "grad_norm": 0.07259746640920639, + "learning_rate": 3.744464788582036e-09, + "loss": 0.0009, + "step": 49608 + }, + { + "epoch": 0.9922, + "grad_norm": 0.022585885599255562, + "learning_rate": 3.7063557901129144e-09, + "loss": 0.0105, + "step": 49610 + }, + { + "epoch": 0.99224, + "grad_norm": 0.012055144645273685, + "learning_rate": 3.668441674533618e-09, + "loss": 0.0018, + "step": 49612 + }, + { + "epoch": 0.99228, + "grad_norm": 0.030721476301550865, + "learning_rate": 3.6307224425846666e-09, + "loss": 0.0005, + "step": 49614 + }, + { + "epoch": 0.99232, + "grad_norm": 0.14801083505153656, + "learning_rate": 3.5931980950021373e-09, + "loss": 0.0044, + "step": 49616 + }, + { + "epoch": 0.99236, + "grad_norm": 1.4109501838684082, + "learning_rate": 3.5558686325154467e-09, + "loss": 0.012, + "step": 49618 + }, + { + "epoch": 0.9924, + "grad_norm": 0.05032746121287346, + "learning_rate": 3.518734055855122e-09, + "loss": 0.0025, + "step": 49620 + }, + { + "epoch": 0.99244, + "grad_norm": 0.4687305688858032, + "learning_rate": 3.481794365742808e-09, + "loss": 0.0058, + "step": 49622 + }, + { + "epoch": 0.99248, + "grad_norm": 0.25009939074516296, + "learning_rate": 3.445049562899039e-09, + "loss": 0.0027, + "step": 49624 + }, + { + "epoch": 0.99252, + "grad_norm": 0.01573486067354679, + "learning_rate": 3.4084996480410194e-09, + "loss": 0.0018, + "step": 49626 + }, + { + "epoch": 0.99256, + "grad_norm": 0.012398999184370041, + "learning_rate": 3.372144621880402e-09, + "loss": 0.0105, + "step": 49628 + }, + { + "epoch": 0.9926, + "grad_norm": 0.4814050793647766, + "learning_rate": 3.3359844851277302e-09, + "loss": 0.0057, + "step": 49630 + }, + { + "epoch": 0.99264, + "grad_norm": 0.010691494680941105, + "learning_rate": 3.300019238485774e-09, + "loss": 0.0009, + "step": 49632 + }, + { + "epoch": 0.99268, + "grad_norm": 0.09470219910144806, + "learning_rate": 3.264248882656196e-09, + "loss": 0.0011, + "step": 49634 + }, + { + "epoch": 0.99272, + "grad_norm": 0.027445780113339424, + "learning_rate": 3.228673418337325e-09, + "loss": 0.0014, + "step": 49636 + }, + { + "epoch": 0.99276, + "grad_norm": 0.10140612721443176, + "learning_rate": 3.19329284622194e-09, + "loss": 0.0013, + "step": 49638 + }, + { + "epoch": 0.9928, + "grad_norm": 0.003962620161473751, + "learning_rate": 3.1581071670006013e-09, + "loss": 0.4884, + "step": 49640 + }, + { + "epoch": 0.99284, + "grad_norm": 0.19662028551101685, + "learning_rate": 3.1231163813583152e-09, + "loss": 0.0017, + "step": 49642 + }, + { + "epoch": 0.99288, + "grad_norm": 0.0940132737159729, + "learning_rate": 3.0883204899767594e-09, + "loss": 0.0012, + "step": 49644 + }, + { + "epoch": 0.99292, + "grad_norm": 0.011976227164268494, + "learning_rate": 3.0537194935365e-09, + "loss": 0.0005, + "step": 49646 + }, + { + "epoch": 0.99296, + "grad_norm": 0.025938453152775764, + "learning_rate": 3.019313392709222e-09, + "loss": 0.0014, + "step": 49648 + }, + { + "epoch": 0.993, + "grad_norm": 0.9926908016204834, + "learning_rate": 2.9851021881688314e-09, + "loss": 0.0084, + "step": 49650 + }, + { + "epoch": 0.99304, + "grad_norm": 0.08836106210947037, + "learning_rate": 2.951085880580351e-09, + "loss": 0.0009, + "step": 49652 + }, + { + "epoch": 0.99308, + "grad_norm": 0.03056834079325199, + "learning_rate": 2.9172644706065845e-09, + "loss": 0.0022, + "step": 49654 + }, + { + "epoch": 0.99312, + "grad_norm": 0.002525219228118658, + "learning_rate": 2.8836379589081143e-09, + "loss": 0.0004, + "step": 49656 + }, + { + "epoch": 0.99316, + "grad_norm": 0.12685967981815338, + "learning_rate": 2.850206346139972e-09, + "loss": 0.002, + "step": 49658 + }, + { + "epoch": 0.9932, + "grad_norm": 0.1714051216840744, + "learning_rate": 2.8169696329527484e-09, + "loss": 0.0063, + "step": 49660 + }, + { + "epoch": 0.99324, + "grad_norm": 0.6021934747695923, + "learning_rate": 2.7839278199970343e-09, + "loss": 0.0038, + "step": 49662 + }, + { + "epoch": 0.99328, + "grad_norm": 0.2543030083179474, + "learning_rate": 2.7510809079145384e-09, + "loss": 0.0018, + "step": 49664 + }, + { + "epoch": 0.99332, + "grad_norm": 0.03911743313074112, + "learning_rate": 2.7184288973480798e-09, + "loss": 0.0004, + "step": 49666 + }, + { + "epoch": 0.99336, + "grad_norm": 0.01726524718105793, + "learning_rate": 2.685971788931596e-09, + "loss": 0.0036, + "step": 49668 + }, + { + "epoch": 0.9934, + "grad_norm": 0.023222360759973526, + "learning_rate": 2.6537095832990247e-09, + "loss": 0.0052, + "step": 49670 + }, + { + "epoch": 0.99344, + "grad_norm": 0.007494242396205664, + "learning_rate": 2.6216422810798614e-09, + "loss": 0.0022, + "step": 49672 + }, + { + "epoch": 0.99348, + "grad_norm": 0.03093784488737583, + "learning_rate": 2.5897698828991623e-09, + "loss": 0.0043, + "step": 49674 + }, + { + "epoch": 0.99352, + "grad_norm": 18.876792907714844, + "learning_rate": 2.5580923893775424e-09, + "loss": 0.5692, + "step": 49676 + }, + { + "epoch": 0.99356, + "grad_norm": 0.0006662766681984067, + "learning_rate": 2.5266098011322847e-09, + "loss": 0.0003, + "step": 49678 + }, + { + "epoch": 0.9936, + "grad_norm": 0.0461544543504715, + "learning_rate": 2.495322118778454e-09, + "loss": 0.0013, + "step": 49680 + }, + { + "epoch": 0.99364, + "grad_norm": 0.020487666130065918, + "learning_rate": 2.4642293429255615e-09, + "loss": 0.0058, + "step": 49682 + }, + { + "epoch": 0.99368, + "grad_norm": 0.24326568841934204, + "learning_rate": 2.4333314741797896e-09, + "loss": 0.0028, + "step": 49684 + }, + { + "epoch": 0.99372, + "grad_norm": 0.318471759557724, + "learning_rate": 2.402628513141769e-09, + "loss": 0.0032, + "step": 49686 + }, + { + "epoch": 0.99376, + "grad_norm": 0.1979094296693802, + "learning_rate": 2.3721204604132407e-09, + "loss": 0.0021, + "step": 49688 + }, + { + "epoch": 0.9938, + "grad_norm": 0.011484864167869091, + "learning_rate": 2.341807316587064e-09, + "loss": 0.6087, + "step": 49690 + }, + { + "epoch": 0.99384, + "grad_norm": 2.618332624435425, + "learning_rate": 2.311689082253876e-09, + "loss": 0.0268, + "step": 49692 + }, + { + "epoch": 0.99388, + "grad_norm": 0.004202450159937143, + "learning_rate": 2.281765758002097e-09, + "loss": 0.0002, + "step": 49694 + }, + { + "epoch": 0.99392, + "grad_norm": 0.01749313995242119, + "learning_rate": 2.2520373444134823e-09, + "loss": 0.0018, + "step": 49696 + }, + { + "epoch": 0.99396, + "grad_norm": 0.01305398065596819, + "learning_rate": 2.22250384206979e-09, + "loss": 0.0011, + "step": 49698 + }, + { + "epoch": 0.994, + "grad_norm": 0.0050053671002388, + "learning_rate": 2.193165251545004e-09, + "loss": 0.0001, + "step": 49700 + }, + { + "epoch": 0.99404, + "grad_norm": 0.003960936795920134, + "learning_rate": 2.164021573412001e-09, + "loss": 0.0004, + "step": 49702 + }, + { + "epoch": 0.99408, + "grad_norm": 0.9244872331619263, + "learning_rate": 2.1350728082381033e-09, + "loss": 0.013, + "step": 49704 + }, + { + "epoch": 0.99412, + "grad_norm": 0.04395931959152222, + "learning_rate": 2.1063189565884157e-09, + "loss": 0.0068, + "step": 49706 + }, + { + "epoch": 0.99416, + "grad_norm": 1.1104239225387573, + "learning_rate": 2.0777600190236003e-09, + "loss": 0.0136, + "step": 49708 + }, + { + "epoch": 0.9942, + "grad_norm": 0.01656469516456127, + "learning_rate": 2.049395996099879e-09, + "loss": 0.0003, + "step": 49710 + }, + { + "epoch": 0.99424, + "grad_norm": 0.14355158805847168, + "learning_rate": 2.0212268883701424e-09, + "loss": 0.0075, + "step": 49712 + }, + { + "epoch": 0.99428, + "grad_norm": 0.8654983043670654, + "learning_rate": 1.993252696385062e-09, + "loss": 0.0094, + "step": 49714 + }, + { + "epoch": 0.99432, + "grad_norm": 1.6006407737731934, + "learning_rate": 1.9654734206875357e-09, + "loss": 0.0216, + "step": 49716 + }, + { + "epoch": 0.99436, + "grad_norm": 0.586148202419281, + "learning_rate": 1.937889061821574e-09, + "loss": 0.0051, + "step": 49718 + }, + { + "epoch": 0.9944, + "grad_norm": 0.009549097158014774, + "learning_rate": 1.910499620322304e-09, + "loss": 0.0001, + "step": 49720 + }, + { + "epoch": 0.99444, + "grad_norm": 0.935295820236206, + "learning_rate": 1.883305096727073e-09, + "loss": 0.012, + "step": 49722 + }, + { + "epoch": 0.99448, + "grad_norm": 0.026942351832985878, + "learning_rate": 1.8563054915632373e-09, + "loss": 0.0005, + "step": 49724 + }, + { + "epoch": 0.99452, + "grad_norm": 0.20153410732746124, + "learning_rate": 1.8295008053581532e-09, + "loss": 0.0016, + "step": 49726 + }, + { + "epoch": 0.99456, + "grad_norm": 0.05758533254265785, + "learning_rate": 1.802891038633625e-09, + "loss": 0.0011, + "step": 49728 + }, + { + "epoch": 0.9946, + "grad_norm": 0.031055662781000137, + "learning_rate": 1.776476191910348e-09, + "loss": 0.0004, + "step": 49730 + }, + { + "epoch": 0.99464, + "grad_norm": 0.00986973661929369, + "learning_rate": 1.7502562657012446e-09, + "loss": 0.0001, + "step": 49732 + }, + { + "epoch": 0.99468, + "grad_norm": 17.941482543945312, + "learning_rate": 1.7242312605181277e-09, + "loss": 0.3366, + "step": 49734 + }, + { + "epoch": 0.99472, + "grad_norm": 0.020310208201408386, + "learning_rate": 1.6984011768694797e-09, + "loss": 0.0338, + "step": 49736 + }, + { + "epoch": 0.99476, + "grad_norm": 0.006593160796910524, + "learning_rate": 1.6727660152571212e-09, + "loss": 0.0015, + "step": 49738 + }, + { + "epoch": 0.9948, + "grad_norm": 0.11405951529741287, + "learning_rate": 1.647325776182873e-09, + "loss": 0.0015, + "step": 49740 + }, + { + "epoch": 0.99484, + "grad_norm": 0.08894351869821548, + "learning_rate": 1.6220804601407847e-09, + "loss": 0.0044, + "step": 49742 + }, + { + "epoch": 0.99488, + "grad_norm": 0.43350592255592346, + "learning_rate": 1.5970300676237948e-09, + "loss": 0.0053, + "step": 49744 + }, + { + "epoch": 0.99492, + "grad_norm": 0.21032293140888214, + "learning_rate": 1.5721745991204018e-09, + "loss": 0.0024, + "step": 49746 + }, + { + "epoch": 0.99496, + "grad_norm": 0.03183349594473839, + "learning_rate": 1.5475140551146629e-09, + "loss": 0.0008, + "step": 49748 + }, + { + "epoch": 0.995, + "grad_norm": 0.0045123533345758915, + "learning_rate": 1.5230484360873043e-09, + "loss": 0.0009, + "step": 49750 + }, + { + "epoch": 0.99504, + "grad_norm": 0.013740968890488148, + "learning_rate": 1.4987777425168326e-09, + "loss": 0.4688, + "step": 49752 + }, + { + "epoch": 0.99508, + "grad_norm": 0.010911144316196442, + "learning_rate": 1.4747019748739822e-09, + "loss": 0.0002, + "step": 49754 + }, + { + "epoch": 0.99512, + "grad_norm": 0.0007454887381754816, + "learning_rate": 1.4508211336305977e-09, + "loss": 0.204, + "step": 49756 + }, + { + "epoch": 0.99516, + "grad_norm": 0.018255963921546936, + "learning_rate": 1.4271352192507525e-09, + "loss": 0.0004, + "step": 49758 + }, + { + "epoch": 0.9952, + "grad_norm": 0.037380192428827286, + "learning_rate": 1.4036442321962995e-09, + "loss": 0.0018, + "step": 49760 + }, + { + "epoch": 0.99524, + "grad_norm": 0.004499451257288456, + "learning_rate": 1.3803481729246503e-09, + "loss": 0.0006, + "step": 49762 + }, + { + "epoch": 0.99528, + "grad_norm": 0.07512569427490234, + "learning_rate": 1.3572470418921069e-09, + "loss": 0.0014, + "step": 49764 + }, + { + "epoch": 0.99532, + "grad_norm": 4.91817045211792, + "learning_rate": 1.334340839547199e-09, + "loss": 0.0429, + "step": 49766 + }, + { + "epoch": 0.99536, + "grad_norm": 0.23431816697120667, + "learning_rate": 1.3116295663362366e-09, + "loss": 0.0021, + "step": 49768 + }, + { + "epoch": 0.9954, + "grad_norm": 0.12962688505649567, + "learning_rate": 1.2891132227033087e-09, + "loss": 0.0016, + "step": 49770 + }, + { + "epoch": 0.99544, + "grad_norm": 0.19428099691867828, + "learning_rate": 1.2667918090869536e-09, + "loss": 0.0025, + "step": 49772 + }, + { + "epoch": 0.99548, + "grad_norm": 0.0044585298746824265, + "learning_rate": 1.2446653259223785e-09, + "loss": 0.0104, + "step": 49774 + }, + { + "epoch": 0.99552, + "grad_norm": 0.17253538966178894, + "learning_rate": 1.22273377364035e-09, + "loss": 0.0069, + "step": 49776 + }, + { + "epoch": 0.99556, + "grad_norm": 0.018051613122224808, + "learning_rate": 1.200997152668304e-09, + "loss": 0.0008, + "step": 49778 + }, + { + "epoch": 0.9956, + "grad_norm": 0.20632478594779968, + "learning_rate": 1.1794554634314558e-09, + "loss": 0.0017, + "step": 49780 + }, + { + "epoch": 0.99564, + "grad_norm": 16.397951126098633, + "learning_rate": 1.1581087063483598e-09, + "loss": 0.176, + "step": 49782 + }, + { + "epoch": 0.99568, + "grad_norm": 0.011169926263391972, + "learning_rate": 1.1369568818364596e-09, + "loss": 0.0015, + "step": 49784 + }, + { + "epoch": 0.99572, + "grad_norm": 16.985187530517578, + "learning_rate": 1.1159999903065377e-09, + "loss": 0.3365, + "step": 49786 + }, + { + "epoch": 0.99576, + "grad_norm": 0.004668087232857943, + "learning_rate": 1.0952380321682666e-09, + "loss": 0.0004, + "step": 49788 + }, + { + "epoch": 0.9958, + "grad_norm": 0.07695416361093521, + "learning_rate": 1.0746710078257673e-09, + "loss": 0.0008, + "step": 49790 + }, + { + "epoch": 0.99584, + "grad_norm": 0.0249258354306221, + "learning_rate": 1.0542989176809403e-09, + "loss": 0.0014, + "step": 49792 + }, + { + "epoch": 0.99588, + "grad_norm": 0.045817017555236816, + "learning_rate": 1.0341217621301358e-09, + "loss": 0.0008, + "step": 49794 + }, + { + "epoch": 0.99592, + "grad_norm": 1.1788856983184814, + "learning_rate": 1.0141395415674826e-09, + "loss": 0.2306, + "step": 49796 + }, + { + "epoch": 0.99596, + "grad_norm": 0.049064815044403076, + "learning_rate": 9.94352256381559e-10, + "loss": 0.0014, + "step": 49798 + }, + { + "epoch": 0.996, + "grad_norm": 0.45307600498199463, + "learning_rate": 9.74759906957612e-10, + "loss": 0.0121, + "step": 49800 + }, + { + "epoch": 0.99604, + "grad_norm": 0.2075360268354416, + "learning_rate": 9.55362493679779e-10, + "loss": 0.0037, + "step": 49802 + }, + { + "epoch": 0.99608, + "grad_norm": 0.03664705529808998, + "learning_rate": 9.361600169244255e-10, + "loss": 0.0003, + "step": 49804 + }, + { + "epoch": 0.99612, + "grad_norm": 0.0052748373709619045, + "learning_rate": 9.171524770668072e-10, + "loss": 0.0001, + "step": 49806 + }, + { + "epoch": 0.99616, + "grad_norm": 0.594349205493927, + "learning_rate": 8.98339874476628e-10, + "loss": 0.0056, + "step": 49808 + }, + { + "epoch": 0.9962, + "grad_norm": 0.043669018894433975, + "learning_rate": 8.797222095224822e-10, + "loss": 0.0017, + "step": 49810 + }, + { + "epoch": 0.99624, + "grad_norm": 0.1632157862186432, + "learning_rate": 8.612994825651921e-10, + "loss": 0.0015, + "step": 49812 + }, + { + "epoch": 0.99628, + "grad_norm": 0.0033794413320720196, + "learning_rate": 8.4307169396447e-10, + "loss": 0.0027, + "step": 49814 + }, + { + "epoch": 0.99632, + "grad_norm": 0.008723455481231213, + "learning_rate": 8.250388440755874e-10, + "loss": 0.0002, + "step": 49816 + }, + { + "epoch": 0.99636, + "grad_norm": 0.10590913891792297, + "learning_rate": 8.072009332515951e-10, + "loss": 0.0012, + "step": 49818 + }, + { + "epoch": 0.9964, + "grad_norm": 0.1268715262413025, + "learning_rate": 7.895579618388827e-10, + "loss": 0.003, + "step": 49820 + }, + { + "epoch": 0.99644, + "grad_norm": 0.47606194019317627, + "learning_rate": 7.721099301816193e-10, + "loss": 0.0054, + "step": 49822 + }, + { + "epoch": 0.99648, + "grad_norm": 0.043390966951847076, + "learning_rate": 7.548568386195332e-10, + "loss": 0.0006, + "step": 49824 + }, + { + "epoch": 0.99652, + "grad_norm": 0.06540705263614655, + "learning_rate": 7.377986874901321e-10, + "loss": 0.0008, + "step": 49826 + }, + { + "epoch": 0.99656, + "grad_norm": 0.0027852889616042376, + "learning_rate": 7.20935477125373e-10, + "loss": 0.0015, + "step": 49828 + }, + { + "epoch": 0.9966, + "grad_norm": 0.017771774902939796, + "learning_rate": 7.042672078527712e-10, + "loss": 0.0005, + "step": 49830 + }, + { + "epoch": 0.99664, + "grad_norm": 0.23302620649337769, + "learning_rate": 6.877938799998429e-10, + "loss": 0.0051, + "step": 49832 + }, + { + "epoch": 0.99668, + "grad_norm": 0.1982937902212143, + "learning_rate": 6.715154938863322e-10, + "loss": 0.0063, + "step": 49834 + }, + { + "epoch": 0.99672, + "grad_norm": 0.006093168631196022, + "learning_rate": 6.554320498286527e-10, + "loss": 0.0005, + "step": 49836 + }, + { + "epoch": 0.99676, + "grad_norm": 0.04415091127157211, + "learning_rate": 6.395435481421075e-10, + "loss": 0.0008, + "step": 49838 + }, + { + "epoch": 0.9968, + "grad_norm": 0.048255447298288345, + "learning_rate": 6.238499891353389e-10, + "loss": 0.0017, + "step": 49840 + }, + { + "epoch": 0.99684, + "grad_norm": 0.0529785230755806, + "learning_rate": 6.083513731147683e-10, + "loss": 0.0012, + "step": 49842 + }, + { + "epoch": 0.99688, + "grad_norm": 0.009394826367497444, + "learning_rate": 5.930477003823765e-10, + "loss": 0.0791, + "step": 49844 + }, + { + "epoch": 0.99692, + "grad_norm": 0.038316477090120316, + "learning_rate": 5.779389712357031e-10, + "loss": 0.0007, + "step": 49846 + }, + { + "epoch": 0.99696, + "grad_norm": 0.32203418016433716, + "learning_rate": 5.630251859711777e-10, + "loss": 0.0046, + "step": 49848 + }, + { + "epoch": 0.997, + "grad_norm": 0.034254640340805054, + "learning_rate": 5.483063448785686e-10, + "loss": 0.0005, + "step": 49850 + }, + { + "epoch": 0.99704, + "grad_norm": 0.20303314924240112, + "learning_rate": 5.337824482443132e-10, + "loss": 0.0021, + "step": 49852 + }, + { + "epoch": 0.99708, + "grad_norm": 0.050196487456560135, + "learning_rate": 5.194534963526288e-10, + "loss": 0.0038, + "step": 49854 + }, + { + "epoch": 0.99712, + "grad_norm": 0.06333783268928528, + "learning_rate": 5.053194894821811e-10, + "loss": 0.0007, + "step": 49856 + }, + { + "epoch": 0.99716, + "grad_norm": 0.07727035135030746, + "learning_rate": 4.913804279083057e-10, + "loss": 0.0023, + "step": 49858 + }, + { + "epoch": 0.9972, + "grad_norm": 0.17590539157390594, + "learning_rate": 4.77636311903007e-10, + "loss": 0.0018, + "step": 49860 + }, + { + "epoch": 0.99724, + "grad_norm": 0.009407839737832546, + "learning_rate": 4.6408714173495904e-10, + "loss": 0.0004, + "step": 49862 + }, + { + "epoch": 0.99728, + "grad_norm": 0.028790557757019997, + "learning_rate": 4.5073291766617456e-10, + "loss": 0.0005, + "step": 49864 + }, + { + "epoch": 0.99732, + "grad_norm": 0.040706489235162735, + "learning_rate": 4.3757363995977633e-10, + "loss": 0.0009, + "step": 49866 + }, + { + "epoch": 0.99736, + "grad_norm": 0.5423567891120911, + "learning_rate": 4.246093088711156e-10, + "loss": 0.0091, + "step": 49868 + }, + { + "epoch": 0.9974, + "grad_norm": 0.04080836847424507, + "learning_rate": 4.118399246522131e-10, + "loss": 0.0019, + "step": 49870 + }, + { + "epoch": 0.99744, + "grad_norm": 0.1956353336572647, + "learning_rate": 3.9926548755286896e-10, + "loss": 0.0029, + "step": 49872 + }, + { + "epoch": 0.99748, + "grad_norm": 0.01612791046500206, + "learning_rate": 3.868859978173323e-10, + "loss": 0.0009, + "step": 49874 + }, + { + "epoch": 0.99752, + "grad_norm": 0.09498188644647598, + "learning_rate": 3.747014556876316e-10, + "loss": 0.0011, + "step": 49876 + }, + { + "epoch": 0.99756, + "grad_norm": 0.0407729372382164, + "learning_rate": 3.627118614024649e-10, + "loss": 0.002, + "step": 49878 + }, + { + "epoch": 0.9976, + "grad_norm": 0.20248456299304962, + "learning_rate": 3.509172151938689e-10, + "loss": 0.0023, + "step": 49880 + }, + { + "epoch": 0.99764, + "grad_norm": 1.6802953481674194, + "learning_rate": 3.3931751729165965e-10, + "loss": 0.0149, + "step": 49882 + }, + { + "epoch": 0.99768, + "grad_norm": 0.006165670230984688, + "learning_rate": 3.2791276792343286e-10, + "loss": 0.0005, + "step": 49884 + }, + { + "epoch": 0.99772, + "grad_norm": 0.08167996257543564, + "learning_rate": 3.1670296731012295e-10, + "loss": 0.0022, + "step": 49886 + }, + { + "epoch": 0.99776, + "grad_norm": 0.3772916793823242, + "learning_rate": 3.056881156715541e-10, + "loss": 0.006, + "step": 49888 + }, + { + "epoch": 0.9978, + "grad_norm": 0.25369781255722046, + "learning_rate": 2.948682132208891e-10, + "loss": 0.0029, + "step": 49890 + }, + { + "epoch": 0.99784, + "grad_norm": 0.010188985615968704, + "learning_rate": 2.8424326017018056e-10, + "loss": 0.0006, + "step": 49892 + }, + { + "epoch": 0.99788, + "grad_norm": 0.1177404373884201, + "learning_rate": 2.738132567270402e-10, + "loss": 0.0012, + "step": 49894 + }, + { + "epoch": 0.99792, + "grad_norm": 0.37446850538253784, + "learning_rate": 2.6357820309352855e-10, + "loss": 0.0042, + "step": 49896 + }, + { + "epoch": 0.99796, + "grad_norm": 0.010272244922816753, + "learning_rate": 2.5353809947059603e-10, + "loss": 0.0006, + "step": 49898 + }, + { + "epoch": 0.998, + "grad_norm": 2.892843246459961, + "learning_rate": 2.436929460525317e-10, + "loss": 0.0337, + "step": 49900 + }, + { + "epoch": 0.99804, + "grad_norm": 0.02396099455654621, + "learning_rate": 2.3404274303140405e-10, + "loss": 0.0004, + "step": 49902 + }, + { + "epoch": 0.99808, + "grad_norm": 0.004423920530825853, + "learning_rate": 2.245874905959511e-10, + "loss": 0.0092, + "step": 49904 + }, + { + "epoch": 0.99812, + "grad_norm": 0.6486548781394958, + "learning_rate": 2.1532718893158e-10, + "loss": 0.0051, + "step": 49906 + }, + { + "epoch": 0.99816, + "grad_norm": 0.28087836503982544, + "learning_rate": 2.0626183821592649e-10, + "loss": 0.0034, + "step": 49908 + }, + { + "epoch": 0.9982, + "grad_norm": 0.04491516947746277, + "learning_rate": 1.9739143862884668e-10, + "loss": 0.0013, + "step": 49910 + }, + { + "epoch": 0.99824, + "grad_norm": 0.012324302457273006, + "learning_rate": 1.8871599034131493e-10, + "loss": 0.002, + "step": 49912 + }, + { + "epoch": 0.99828, + "grad_norm": 14.253555297851562, + "learning_rate": 1.8023549352319535e-10, + "loss": 0.1898, + "step": 49914 + }, + { + "epoch": 0.99832, + "grad_norm": 0.015699142590165138, + "learning_rate": 1.7194994833880096e-10, + "loss": 0.0007, + "step": 49916 + }, + { + "epoch": 0.99836, + "grad_norm": 0.12340181320905685, + "learning_rate": 1.6385935495133455e-10, + "loss": 0.0017, + "step": 49918 + }, + { + "epoch": 0.9984, + "grad_norm": 0.003330583916977048, + "learning_rate": 1.559637135173375e-10, + "loss": 0.0009, + "step": 49920 + }, + { + "epoch": 0.99844, + "grad_norm": 0.09821150451898575, + "learning_rate": 1.4826302419224114e-10, + "loss": 0.0023, + "step": 49922 + }, + { + "epoch": 0.99848, + "grad_norm": 1.0132322311401367, + "learning_rate": 1.4075728712370507e-10, + "loss": 0.0093, + "step": 49924 + }, + { + "epoch": 0.99852, + "grad_norm": 0.05250365659594536, + "learning_rate": 1.3344650245938894e-10, + "loss": 0.0097, + "step": 49926 + }, + { + "epoch": 0.99856, + "grad_norm": 19.07329559326172, + "learning_rate": 1.2633067034251156e-10, + "loss": 1.0014, + "step": 49928 + }, + { + "epoch": 0.9986, + "grad_norm": 0.018801378086209297, + "learning_rate": 1.1940979091074056e-10, + "loss": 0.0005, + "step": 49930 + }, + { + "epoch": 0.99864, + "grad_norm": 0.19928258657455444, + "learning_rate": 1.1268386429952316e-10, + "loss": 0.0019, + "step": 49932 + }, + { + "epoch": 0.99868, + "grad_norm": 0.233146533370018, + "learning_rate": 1.0615289063986567e-10, + "loss": 0.0031, + "step": 49934 + }, + { + "epoch": 0.99872, + "grad_norm": 0.5521958470344543, + "learning_rate": 9.981687005833351e-11, + "loss": 0.0046, + "step": 49936 + }, + { + "epoch": 0.99876, + "grad_norm": 0.31755322217941284, + "learning_rate": 9.36758026803819e-11, + "loss": 0.0054, + "step": 49938 + }, + { + "epoch": 0.9988, + "grad_norm": 2.13240122795105, + "learning_rate": 8.772968862369447e-11, + "loss": 0.0217, + "step": 49940 + }, + { + "epoch": 0.99884, + "grad_norm": 0.3590889573097229, + "learning_rate": 8.197852800484463e-11, + "loss": 0.0041, + "step": 49942 + }, + { + "epoch": 0.99888, + "grad_norm": 0.010365858674049377, + "learning_rate": 7.642232093596491e-11, + "loss": 0.0003, + "step": 49944 + }, + { + "epoch": 0.99892, + "grad_norm": 1.4123692512512207, + "learning_rate": 7.10610675269674e-11, + "loss": 0.019, + "step": 49946 + }, + { + "epoch": 0.99896, + "grad_norm": 0.02599950321018696, + "learning_rate": 6.58947678799926e-11, + "loss": 0.0019, + "step": 49948 + }, + { + "epoch": 0.999, + "grad_norm": 0.007110028062015772, + "learning_rate": 6.092342209607083e-11, + "loss": 0.0006, + "step": 49950 + }, + { + "epoch": 0.99904, + "grad_norm": 0.13648353517055511, + "learning_rate": 5.6147030274011914e-11, + "loss": 0.002, + "step": 49952 + }, + { + "epoch": 0.99908, + "grad_norm": 0.012289400212466717, + "learning_rate": 5.1565592504854156e-11, + "loss": 0.0022, + "step": 49954 + }, + { + "epoch": 0.99912, + "grad_norm": 0.033137835562229156, + "learning_rate": 4.7179108878525614e-11, + "loss": 0.0004, + "step": 49956 + }, + { + "epoch": 0.99916, + "grad_norm": 0.13582828640937805, + "learning_rate": 4.2987579481623685e-11, + "loss": 0.002, + "step": 49958 + }, + { + "epoch": 0.9992, + "grad_norm": 0.0584644116461277, + "learning_rate": 3.899100439408443e-11, + "loss": 0.0009, + "step": 49960 + }, + { + "epoch": 0.99924, + "grad_norm": 0.00031333829974755645, + "learning_rate": 3.51893836958439e-11, + "loss": 0.0003, + "step": 49962 + }, + { + "epoch": 0.99928, + "grad_norm": 0.0004144512349739671, + "learning_rate": 3.15827174590666e-11, + "loss": 0.0004, + "step": 49964 + }, + { + "epoch": 0.99932, + "grad_norm": 0.7310954928398132, + "learning_rate": 2.81710057548068e-11, + "loss": 0.0061, + "step": 49966 + }, + { + "epoch": 0.99936, + "grad_norm": 0.09573674201965332, + "learning_rate": 2.4954248650788103e-11, + "loss": 0.0016, + "step": 49968 + }, + { + "epoch": 0.9994, + "grad_norm": 7.981333255767822, + "learning_rate": 2.1932446206962556e-11, + "loss": 0.0878, + "step": 49970 + }, + { + "epoch": 0.99944, + "grad_norm": 0.08364388346672058, + "learning_rate": 1.910559848550264e-11, + "loss": 0.0019, + "step": 49972 + }, + { + "epoch": 0.99948, + "grad_norm": 0.0059067485854029655, + "learning_rate": 1.6473705538588847e-11, + "loss": 0.0064, + "step": 49974 + }, + { + "epoch": 0.99952, + "grad_norm": 0.030860206112265587, + "learning_rate": 1.4036767419511876e-11, + "loss": 0.0005, + "step": 49976 + }, + { + "epoch": 0.99956, + "grad_norm": 0.005260678939521313, + "learning_rate": 1.179478417601132e-11, + "loss": 0.0001, + "step": 49978 + }, + { + "epoch": 0.9996, + "grad_norm": 0.010125854052603245, + "learning_rate": 9.74775584916543e-12, + "loss": 0.0301, + "step": 49980 + }, + { + "epoch": 0.99964, + "grad_norm": 0.043903838843107224, + "learning_rate": 7.895682482272904e-12, + "loss": 0.0016, + "step": 49982 + }, + { + "epoch": 0.99968, + "grad_norm": 0.012056859210133553, + "learning_rate": 6.238564108640432e-12, + "loss": 0.0004, + "step": 49984 + }, + { + "epoch": 0.99972, + "grad_norm": 0.011114319786429405, + "learning_rate": 4.776400761574706e-12, + "loss": 0.0002, + "step": 49986 + }, + { + "epoch": 0.99976, + "grad_norm": 0.33909502625465393, + "learning_rate": 3.509192471051748e-12, + "loss": 0.0246, + "step": 49988 + }, + { + "epoch": 0.9998, + "grad_norm": 0.03315838426351547, + "learning_rate": 2.4369392592760166e-12, + "loss": 0.0006, + "step": 49990 + }, + { + "epoch": 0.99984, + "grad_norm": 0.02098158933222294, + "learning_rate": 1.5596411484519736e-12, + "loss": 0.1158, + "step": 49992 + }, + { + "epoch": 0.99988, + "grad_norm": 0.04275734722614288, + "learning_rate": 8.77298156343187e-13, + "loss": 0.0242, + "step": 49994 + }, + { + "epoch": 0.99992, + "grad_norm": 0.018231211230158806, + "learning_rate": 3.899102951621103e-13, + "loss": 0.0058, + "step": 49996 + }, + { + "epoch": 0.99996, + "grad_norm": 0.014792344532907009, + "learning_rate": 9.747757379052758e-14, + "loss": 0.0011, + "step": 49998 + }, + { + "epoch": 1.0, + "grad_norm": 0.014886812306940556, + "learning_rate": 0.0, + "loss": 0.019, + "step": 50000 + }, + { + "epoch": 1.0, + "step": 50000, + "total_flos": 3.415247159296e+17, + "train_loss": 0.10827204646465427, + "train_runtime": 60882.1061, + "train_samples_per_second": 0.821, + "train_steps_per_second": 0.821 + } + ], + "logging_steps": 2, + "max_steps": 50000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 3.415247159296e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c86c27d2704e60e0875ed5c50bb4744dbcbc86d3 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a220f2622e4467978e759336799edca764bbd09e2bda41725d9be86d42838bb5 +size 3837841200 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/global_step50000/mp_rank_00_model_states.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/global_step50000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c15b956c218b09aedb36360c493f8e9122ee25f --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/global_step50000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46fbc560c215150b1123261e526cde1080c6522a0a531a02db6bc8a7fd7acce7 +size 639989420 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/latest b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/latest new file mode 100644 index 0000000000000000000000000000000000000000..59a558c0ee32a055bd8fc16bf0d99a5950c59ac1 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/latest @@ -0,0 +1 @@ +global_step50000 \ No newline at end of file diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/scheduler.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..284cef13f91d0223f323fd8e41cb55952cac2215 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb5449a7d1157ebe24a62914ad5f086f572edadbd4a057fb9316d37abb636e18 +size 1064 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/zero_to_fp32.py b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..e93cb1c95f15c1474642edb1978714075361bc04 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/client_0/zero_to_fp32.py @@ -0,0 +1,758 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: + shared_tensor = state_dict[converted_tensors[tensor_id]] + state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + state_dict[name] = tensor.contiguous() + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in shard_state_dict: + del state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..4daef721ccbdb198f5f2ec5ea68d0af2d704c04f --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario17_new_10000_nosampling_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c68335f836c7cd5f8784c0fb8ca4d659ae0d306962e8d139647e8fd471b8d1 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b0a5a43d87d5e8204a4f3963af54f27c8798ce25 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/0_trainer_state.json @@ -0,0 +1,35032 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002, + "grad_norm": 7.891052722930908, + "learning_rate": 4e-08, + "loss": 0.3058, + "step": 2 + }, + { + "epoch": 0.0004, + "grad_norm": 2.4686591625213623, + "learning_rate": 8e-08, + "loss": 0.0834, + "step": 4 + }, + { + "epoch": 0.0006, + "grad_norm": 1.5330673456192017, + "learning_rate": 1.2000000000000002e-07, + "loss": 0.0277, + "step": 6 + }, + { + "epoch": 0.0008, + "grad_norm": 2.3548452854156494, + "learning_rate": 1.6e-07, + "loss": 0.1368, + "step": 8 + }, + { + "epoch": 0.001, + "grad_norm": 1.5155742168426514, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.1672, + "step": 10 + }, + { + "epoch": 0.0012, + "grad_norm": 2.4463071823120117, + "learning_rate": 2.4000000000000003e-07, + "loss": 0.1324, + "step": 12 + }, + { + "epoch": 0.0014, + "grad_norm": 3.1462676525115967, + "learning_rate": 2.8e-07, + "loss": 0.1363, + "step": 14 + }, + { + "epoch": 0.0016, + "grad_norm": 2.0467514991760254, + "learning_rate": 3.2e-07, + "loss": 0.11, + "step": 16 + }, + { + "epoch": 0.0018, + "grad_norm": 2.1256260871887207, + "learning_rate": 3.6e-07, + "loss": 0.0794, + "step": 18 + }, + { + "epoch": 0.002, + "grad_norm": 3.02065372467041, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.1648, + "step": 20 + }, + { + "epoch": 0.0022, + "grad_norm": 1.5983836650848389, + "learning_rate": 4.4e-07, + "loss": 0.0839, + "step": 22 + }, + { + "epoch": 0.0024, + "grad_norm": 3.4472038745880127, + "learning_rate": 4.800000000000001e-07, + "loss": 0.2054, + "step": 24 + }, + { + "epoch": 0.0026, + "grad_norm": 1.29579758644104, + "learning_rate": 5.2e-07, + "loss": 0.0833, + "step": 26 + }, + { + "epoch": 0.0028, + "grad_norm": 4.378201961517334, + "learning_rate": 5.6e-07, + "loss": 0.3132, + "step": 28 + }, + { + "epoch": 0.003, + "grad_norm": 2.2574729919433594, + "learning_rate": 6.000000000000001e-07, + "loss": 0.1946, + "step": 30 + }, + { + "epoch": 0.0032, + "grad_norm": 2.5200417041778564, + "learning_rate": 6.4e-07, + "loss": 0.1183, + "step": 32 + }, + { + "epoch": 0.0034, + "grad_norm": 5.305577754974365, + "learning_rate": 6.800000000000001e-07, + "loss": 0.3102, + "step": 34 + }, + { + "epoch": 0.0036, + "grad_norm": 1.732666254043579, + "learning_rate": 7.2e-07, + "loss": 0.216, + "step": 36 + }, + { + "epoch": 0.0038, + "grad_norm": 0.9977903962135315, + "learning_rate": 7.6e-07, + "loss": 0.0279, + "step": 38 + }, + { + "epoch": 0.004, + "grad_norm": 3.6954915523529053, + "learning_rate": 8.000000000000001e-07, + "loss": 0.2231, + "step": 40 + }, + { + "epoch": 0.0042, + "grad_norm": 4.067526340484619, + "learning_rate": 8.400000000000001e-07, + "loss": 0.2338, + "step": 42 + }, + { + "epoch": 0.0044, + "grad_norm": 4.16857385635376, + "learning_rate": 8.8e-07, + "loss": 0.1114, + "step": 44 + }, + { + "epoch": 0.0046, + "grad_norm": 0.5818256139755249, + "learning_rate": 9.200000000000001e-07, + "loss": 0.0434, + "step": 46 + }, + { + "epoch": 0.0048, + "grad_norm": 1.6243209838867188, + "learning_rate": 9.600000000000001e-07, + "loss": 0.0698, + "step": 48 + }, + { + "epoch": 0.005, + "grad_norm": 1.4156688451766968, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.1311, + "step": 50 + }, + { + "epoch": 0.0052, + "grad_norm": 2.7777271270751953, + "learning_rate": 1.04e-06, + "loss": 0.0676, + "step": 52 + }, + { + "epoch": 0.0054, + "grad_norm": 3.915895700454712, + "learning_rate": 1.08e-06, + "loss": 0.2122, + "step": 54 + }, + { + "epoch": 0.0056, + "grad_norm": 3.3363256454467773, + "learning_rate": 1.12e-06, + "loss": 0.1203, + "step": 56 + }, + { + "epoch": 0.0058, + "grad_norm": 2.696258306503296, + "learning_rate": 1.1600000000000001e-06, + "loss": 0.3414, + "step": 58 + }, + { + "epoch": 0.006, + "grad_norm": 5.9033522605896, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.2479, + "step": 60 + }, + { + "epoch": 0.0062, + "grad_norm": 3.224116563796997, + "learning_rate": 1.2400000000000002e-06, + "loss": 0.0653, + "step": 62 + }, + { + "epoch": 0.0064, + "grad_norm": 1.128644585609436, + "learning_rate": 1.28e-06, + "loss": 0.076, + "step": 64 + }, + { + "epoch": 0.0066, + "grad_norm": 4.540005683898926, + "learning_rate": 1.32e-06, + "loss": 0.1979, + "step": 66 + }, + { + "epoch": 0.0068, + "grad_norm": 3.2932956218719482, + "learning_rate": 1.3600000000000001e-06, + "loss": 0.225, + "step": 68 + }, + { + "epoch": 0.007, + "grad_norm": 1.7632191181182861, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0935, + "step": 70 + }, + { + "epoch": 0.0072, + "grad_norm": 3.8703603744506836, + "learning_rate": 1.44e-06, + "loss": 0.1339, + "step": 72 + }, + { + "epoch": 0.0074, + "grad_norm": 2.3640084266662598, + "learning_rate": 1.48e-06, + "loss": 0.1275, + "step": 74 + }, + { + "epoch": 0.0076, + "grad_norm": 4.302120208740234, + "learning_rate": 1.52e-06, + "loss": 0.194, + "step": 76 + }, + { + "epoch": 0.0078, + "grad_norm": 3.9767751693725586, + "learning_rate": 1.56e-06, + "loss": 0.1561, + "step": 78 + }, + { + "epoch": 0.008, + "grad_norm": 1.9820116758346558, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0417, + "step": 80 + }, + { + "epoch": 0.0082, + "grad_norm": 2.136436939239502, + "learning_rate": 1.6400000000000002e-06, + "loss": 0.0648, + "step": 82 + }, + { + "epoch": 0.0084, + "grad_norm": 1.4634268283843994, + "learning_rate": 1.6800000000000002e-06, + "loss": 0.1274, + "step": 84 + }, + { + "epoch": 0.0086, + "grad_norm": 10.614486694335938, + "learning_rate": 1.72e-06, + "loss": 0.3536, + "step": 86 + }, + { + "epoch": 0.0088, + "grad_norm": 5.8255615234375, + "learning_rate": 1.76e-06, + "loss": 0.2085, + "step": 88 + }, + { + "epoch": 0.009, + "grad_norm": 1.5749590396881104, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0712, + "step": 90 + }, + { + "epoch": 0.0092, + "grad_norm": 7.172019004821777, + "learning_rate": 1.8400000000000002e-06, + "loss": 0.1763, + "step": 92 + }, + { + "epoch": 0.0094, + "grad_norm": 1.460688829421997, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.0469, + "step": 94 + }, + { + "epoch": 0.0096, + "grad_norm": 11.736851692199707, + "learning_rate": 1.9200000000000003e-06, + "loss": 0.306, + "step": 96 + }, + { + "epoch": 0.0098, + "grad_norm": 3.9747514724731445, + "learning_rate": 1.9600000000000003e-06, + "loss": 0.092, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 9.097468376159668, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.1885, + "step": 100 + }, + { + "epoch": 0.0102, + "grad_norm": 3.6342320442199707, + "learning_rate": 2.04e-06, + "loss": 0.1509, + "step": 102 + }, + { + "epoch": 0.0104, + "grad_norm": 2.3103630542755127, + "learning_rate": 2.08e-06, + "loss": 0.0976, + "step": 104 + }, + { + "epoch": 0.0106, + "grad_norm": 6.172794818878174, + "learning_rate": 2.12e-06, + "loss": 0.1036, + "step": 106 + }, + { + "epoch": 0.0108, + "grad_norm": 2.819502830505371, + "learning_rate": 2.16e-06, + "loss": 0.093, + "step": 108 + }, + { + "epoch": 0.011, + "grad_norm": 2.2891812324523926, + "learning_rate": 2.2e-06, + "loss": 0.1373, + "step": 110 + }, + { + "epoch": 0.0112, + "grad_norm": 1.1214839220046997, + "learning_rate": 2.24e-06, + "loss": 0.0207, + "step": 112 + }, + { + "epoch": 0.0114, + "grad_norm": 6.288011074066162, + "learning_rate": 2.28e-06, + "loss": 0.1657, + "step": 114 + }, + { + "epoch": 0.0116, + "grad_norm": 0.2250189185142517, + "learning_rate": 2.3200000000000002e-06, + "loss": 0.0278, + "step": 116 + }, + { + "epoch": 0.0118, + "grad_norm": 7.994297981262207, + "learning_rate": 2.3600000000000003e-06, + "loss": 0.2061, + "step": 118 + }, + { + "epoch": 0.012, + "grad_norm": 1.2017138004302979, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0399, + "step": 120 + }, + { + "epoch": 0.0122, + "grad_norm": 3.6338677406311035, + "learning_rate": 2.4400000000000004e-06, + "loss": 0.0962, + "step": 122 + }, + { + "epoch": 0.0124, + "grad_norm": 1.3988661766052246, + "learning_rate": 2.4800000000000004e-06, + "loss": 0.1388, + "step": 124 + }, + { + "epoch": 0.0126, + "grad_norm": 3.1045339107513428, + "learning_rate": 2.52e-06, + "loss": 0.1314, + "step": 126 + }, + { + "epoch": 0.0128, + "grad_norm": 1.2294882535934448, + "learning_rate": 2.56e-06, + "loss": 0.0861, + "step": 128 + }, + { + "epoch": 0.013, + "grad_norm": 5.410033702850342, + "learning_rate": 2.6e-06, + "loss": 0.1347, + "step": 130 + }, + { + "epoch": 0.0132, + "grad_norm": 3.0339627265930176, + "learning_rate": 2.64e-06, + "loss": 0.0566, + "step": 132 + }, + { + "epoch": 0.0134, + "grad_norm": 6.3241987228393555, + "learning_rate": 2.68e-06, + "loss": 0.1871, + "step": 134 + }, + { + "epoch": 0.0136, + "grad_norm": 2.325749158859253, + "learning_rate": 2.7200000000000002e-06, + "loss": 0.0325, + "step": 136 + }, + { + "epoch": 0.0138, + "grad_norm": 2.4841835498809814, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.067, + "step": 138 + }, + { + "epoch": 0.014, + "grad_norm": 5.0882568359375, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0725, + "step": 140 + }, + { + "epoch": 0.0142, + "grad_norm": 9.139763832092285, + "learning_rate": 2.84e-06, + "loss": 0.2027, + "step": 142 + }, + { + "epoch": 0.0144, + "grad_norm": 3.976593494415283, + "learning_rate": 2.88e-06, + "loss": 0.1228, + "step": 144 + }, + { + "epoch": 0.0146, + "grad_norm": 2.708190441131592, + "learning_rate": 2.92e-06, + "loss": 0.0416, + "step": 146 + }, + { + "epoch": 0.0148, + "grad_norm": 13.530736923217773, + "learning_rate": 2.96e-06, + "loss": 0.3111, + "step": 148 + }, + { + "epoch": 0.015, + "grad_norm": 0.7364652156829834, + "learning_rate": 3e-06, + "loss": 0.0593, + "step": 150 + }, + { + "epoch": 0.0152, + "grad_norm": 1.4686942100524902, + "learning_rate": 3.04e-06, + "loss": 0.2107, + "step": 152 + }, + { + "epoch": 0.0154, + "grad_norm": 1.294390320777893, + "learning_rate": 3.08e-06, + "loss": 0.0483, + "step": 154 + }, + { + "epoch": 0.0156, + "grad_norm": 1.4581637382507324, + "learning_rate": 3.12e-06, + "loss": 0.0505, + "step": 156 + }, + { + "epoch": 0.0158, + "grad_norm": 9.68393611907959, + "learning_rate": 3.1600000000000002e-06, + "loss": 0.2252, + "step": 158 + }, + { + "epoch": 0.016, + "grad_norm": 9.05947208404541, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0966, + "step": 160 + }, + { + "epoch": 0.0162, + "grad_norm": 6.091263771057129, + "learning_rate": 3.2400000000000003e-06, + "loss": 0.0786, + "step": 162 + }, + { + "epoch": 0.0164, + "grad_norm": 4.266232490539551, + "learning_rate": 3.2800000000000004e-06, + "loss": 0.0591, + "step": 164 + }, + { + "epoch": 0.0166, + "grad_norm": 4.026641368865967, + "learning_rate": 3.3200000000000004e-06, + "loss": 0.1561, + "step": 166 + }, + { + "epoch": 0.0168, + "grad_norm": 0.1592453420162201, + "learning_rate": 3.3600000000000004e-06, + "loss": 0.0618, + "step": 168 + }, + { + "epoch": 0.017, + "grad_norm": 1.3939039707183838, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0575, + "step": 170 + }, + { + "epoch": 0.0172, + "grad_norm": 1.0217359066009521, + "learning_rate": 3.44e-06, + "loss": 0.0479, + "step": 172 + }, + { + "epoch": 0.0174, + "grad_norm": 3.400691509246826, + "learning_rate": 3.48e-06, + "loss": 0.0658, + "step": 174 + }, + { + "epoch": 0.0176, + "grad_norm": 5.411176681518555, + "learning_rate": 3.52e-06, + "loss": 0.1328, + "step": 176 + }, + { + "epoch": 0.0178, + "grad_norm": 4.289204120635986, + "learning_rate": 3.5600000000000002e-06, + "loss": 0.1019, + "step": 178 + }, + { + "epoch": 0.018, + "grad_norm": 5.8218255043029785, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.1575, + "step": 180 + }, + { + "epoch": 0.0182, + "grad_norm": 3.4071547985076904, + "learning_rate": 3.6400000000000003e-06, + "loss": 0.551, + "step": 182 + }, + { + "epoch": 0.0184, + "grad_norm": 4.8909173011779785, + "learning_rate": 3.6800000000000003e-06, + "loss": 0.1577, + "step": 184 + }, + { + "epoch": 0.0186, + "grad_norm": 1.7209688425064087, + "learning_rate": 3.7200000000000004e-06, + "loss": 0.1719, + "step": 186 + }, + { + "epoch": 0.0188, + "grad_norm": 1.1896027326583862, + "learning_rate": 3.7600000000000004e-06, + "loss": 0.0138, + "step": 188 + }, + { + "epoch": 0.019, + "grad_norm": 1.213350534439087, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0286, + "step": 190 + }, + { + "epoch": 0.0192, + "grad_norm": 2.3885979652404785, + "learning_rate": 3.8400000000000005e-06, + "loss": 0.0558, + "step": 192 + }, + { + "epoch": 0.0194, + "grad_norm": 2.479022979736328, + "learning_rate": 3.88e-06, + "loss": 0.0734, + "step": 194 + }, + { + "epoch": 0.0196, + "grad_norm": 3.997546911239624, + "learning_rate": 3.920000000000001e-06, + "loss": 0.0874, + "step": 196 + }, + { + "epoch": 0.0198, + "grad_norm": 1.5055571794509888, + "learning_rate": 3.96e-06, + "loss": 0.0151, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 0.4214957654476166, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0091, + "step": 200 + }, + { + "epoch": 0.0202, + "grad_norm": 0.5224543213844299, + "learning_rate": 4.04e-06, + "loss": 0.0111, + "step": 202 + }, + { + "epoch": 0.0204, + "grad_norm": 0.7501801252365112, + "learning_rate": 4.08e-06, + "loss": 0.1264, + "step": 204 + }, + { + "epoch": 0.0206, + "grad_norm": 0.5432851910591125, + "learning_rate": 4.12e-06, + "loss": 0.2344, + "step": 206 + }, + { + "epoch": 0.0208, + "grad_norm": 0.6007727384567261, + "learning_rate": 4.16e-06, + "loss": 0.0222, + "step": 208 + }, + { + "epoch": 0.021, + "grad_norm": 7.078505039215088, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.3001, + "step": 210 + }, + { + "epoch": 0.0212, + "grad_norm": 10.34742546081543, + "learning_rate": 4.24e-06, + "loss": 0.2582, + "step": 212 + }, + { + "epoch": 0.0214, + "grad_norm": 2.5187301635742188, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.0379, + "step": 214 + }, + { + "epoch": 0.0216, + "grad_norm": 0.7150712609291077, + "learning_rate": 4.32e-06, + "loss": 0.0081, + "step": 216 + }, + { + "epoch": 0.0218, + "grad_norm": 9.740117073059082, + "learning_rate": 4.360000000000001e-06, + "loss": 0.1544, + "step": 218 + }, + { + "epoch": 0.022, + "grad_norm": 0.3307875394821167, + "learning_rate": 4.4e-06, + "loss": 0.022, + "step": 220 + }, + { + "epoch": 0.0222, + "grad_norm": 0.05461273342370987, + "learning_rate": 4.440000000000001e-06, + "loss": 0.0617, + "step": 222 + }, + { + "epoch": 0.0224, + "grad_norm": 2.2222883701324463, + "learning_rate": 4.48e-06, + "loss": 0.0542, + "step": 224 + }, + { + "epoch": 0.0226, + "grad_norm": 4.866563320159912, + "learning_rate": 4.520000000000001e-06, + "loss": 0.0637, + "step": 226 + }, + { + "epoch": 0.0228, + "grad_norm": 5.642423152923584, + "learning_rate": 4.56e-06, + "loss": 0.0561, + "step": 228 + }, + { + "epoch": 0.023, + "grad_norm": 11.454272270202637, + "learning_rate": 4.600000000000001e-06, + "loss": 0.3841, + "step": 230 + }, + { + "epoch": 0.0232, + "grad_norm": 10.527713775634766, + "learning_rate": 4.6400000000000005e-06, + "loss": 0.1268, + "step": 232 + }, + { + "epoch": 0.0234, + "grad_norm": 0.6880031228065491, + "learning_rate": 4.680000000000001e-06, + "loss": 0.0078, + "step": 234 + }, + { + "epoch": 0.0236, + "grad_norm": 0.2188214659690857, + "learning_rate": 4.7200000000000005e-06, + "loss": 0.0054, + "step": 236 + }, + { + "epoch": 0.0238, + "grad_norm": 3.4803402423858643, + "learning_rate": 4.76e-06, + "loss": 0.1396, + "step": 238 + }, + { + "epoch": 0.024, + "grad_norm": 0.13678428530693054, + "learning_rate": 4.800000000000001e-06, + "loss": 0.1317, + "step": 240 + }, + { + "epoch": 0.0242, + "grad_norm": 0.3744032084941864, + "learning_rate": 4.84e-06, + "loss": 0.0072, + "step": 242 + }, + { + "epoch": 0.0244, + "grad_norm": 0.3565969467163086, + "learning_rate": 4.880000000000001e-06, + "loss": 0.1532, + "step": 244 + }, + { + "epoch": 0.0246, + "grad_norm": 0.9765319228172302, + "learning_rate": 4.92e-06, + "loss": 0.0609, + "step": 246 + }, + { + "epoch": 0.0248, + "grad_norm": 4.889562606811523, + "learning_rate": 4.960000000000001e-06, + "loss": 0.0576, + "step": 248 + }, + { + "epoch": 0.025, + "grad_norm": 9.00096321105957, + "learning_rate": 5e-06, + "loss": 0.1882, + "step": 250 + }, + { + "epoch": 0.0252, + "grad_norm": 2.581041097640991, + "learning_rate": 5.04e-06, + "loss": 0.1149, + "step": 252 + }, + { + "epoch": 0.0254, + "grad_norm": 6.577972888946533, + "learning_rate": 5.0800000000000005e-06, + "loss": 0.119, + "step": 254 + }, + { + "epoch": 0.0256, + "grad_norm": 5.057807922363281, + "learning_rate": 5.12e-06, + "loss": 0.0773, + "step": 256 + }, + { + "epoch": 0.0258, + "grad_norm": 0.5239856839179993, + "learning_rate": 5.1600000000000006e-06, + "loss": 0.0206, + "step": 258 + }, + { + "epoch": 0.026, + "grad_norm": 6.262078285217285, + "learning_rate": 5.2e-06, + "loss": 0.1045, + "step": 260 + }, + { + "epoch": 0.0262, + "grad_norm": 2.622756004333496, + "learning_rate": 5.240000000000001e-06, + "loss": 0.048, + "step": 262 + }, + { + "epoch": 0.0264, + "grad_norm": 0.15171781182289124, + "learning_rate": 5.28e-06, + "loss": 0.122, + "step": 264 + }, + { + "epoch": 0.0266, + "grad_norm": 0.13422168791294098, + "learning_rate": 5.320000000000001e-06, + "loss": 0.3531, + "step": 266 + }, + { + "epoch": 0.0268, + "grad_norm": 5.99261474609375, + "learning_rate": 5.36e-06, + "loss": 0.1609, + "step": 268 + }, + { + "epoch": 0.027, + "grad_norm": 2.575153350830078, + "learning_rate": 5.400000000000001e-06, + "loss": 0.0302, + "step": 270 + }, + { + "epoch": 0.0272, + "grad_norm": 5.9588942527771, + "learning_rate": 5.4400000000000004e-06, + "loss": 0.1191, + "step": 272 + }, + { + "epoch": 0.0274, + "grad_norm": 0.4209674596786499, + "learning_rate": 5.480000000000001e-06, + "loss": 0.0295, + "step": 274 + }, + { + "epoch": 0.0276, + "grad_norm": 0.8670297861099243, + "learning_rate": 5.5200000000000005e-06, + "loss": 0.0315, + "step": 276 + }, + { + "epoch": 0.0278, + "grad_norm": 0.45448747277259827, + "learning_rate": 5.560000000000001e-06, + "loss": 0.017, + "step": 278 + }, + { + "epoch": 0.028, + "grad_norm": 0.35916391015052795, + "learning_rate": 5.600000000000001e-06, + "loss": 0.009, + "step": 280 + }, + { + "epoch": 0.0282, + "grad_norm": 3.5158276557922363, + "learning_rate": 5.64e-06, + "loss": 0.0412, + "step": 282 + }, + { + "epoch": 0.0284, + "grad_norm": 5.774011135101318, + "learning_rate": 5.68e-06, + "loss": 0.1169, + "step": 284 + }, + { + "epoch": 0.0286, + "grad_norm": 0.22655341029167175, + "learning_rate": 5.72e-06, + "loss": 0.1148, + "step": 286 + }, + { + "epoch": 0.0288, + "grad_norm": 0.2625415325164795, + "learning_rate": 5.76e-06, + "loss": 0.0149, + "step": 288 + }, + { + "epoch": 0.029, + "grad_norm": 0.166410431265831, + "learning_rate": 5.8e-06, + "loss": 0.0285, + "step": 290 + }, + { + "epoch": 0.0292, + "grad_norm": 3.158982276916504, + "learning_rate": 5.84e-06, + "loss": 0.0977, + "step": 292 + }, + { + "epoch": 0.0294, + "grad_norm": 12.927409172058105, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.3901, + "step": 294 + }, + { + "epoch": 0.0296, + "grad_norm": 2.5911247730255127, + "learning_rate": 5.92e-06, + "loss": 0.0484, + "step": 296 + }, + { + "epoch": 0.0298, + "grad_norm": 2.250699758529663, + "learning_rate": 5.9600000000000005e-06, + "loss": 0.0517, + "step": 298 + }, + { + "epoch": 0.03, + "grad_norm": 6.005315780639648, + "learning_rate": 6e-06, + "loss": 0.08, + "step": 300 + }, + { + "epoch": 0.0302, + "grad_norm": 1.1021876335144043, + "learning_rate": 6.040000000000001e-06, + "loss": 0.0403, + "step": 302 + }, + { + "epoch": 0.0304, + "grad_norm": 3.2484521865844727, + "learning_rate": 6.08e-06, + "loss": 0.13, + "step": 304 + }, + { + "epoch": 0.0306, + "grad_norm": 0.051231734454631805, + "learning_rate": 6.120000000000001e-06, + "loss": 0.0079, + "step": 306 + }, + { + "epoch": 0.0308, + "grad_norm": 0.05670686438679695, + "learning_rate": 6.16e-06, + "loss": 0.0284, + "step": 308 + }, + { + "epoch": 0.031, + "grad_norm": 0.44017884135246277, + "learning_rate": 6.200000000000001e-06, + "loss": 0.0324, + "step": 310 + }, + { + "epoch": 0.0312, + "grad_norm": 0.6401331424713135, + "learning_rate": 6.24e-06, + "loss": 0.1786, + "step": 312 + }, + { + "epoch": 0.0314, + "grad_norm": 0.6487441062927246, + "learning_rate": 6.280000000000001e-06, + "loss": 0.0206, + "step": 314 + }, + { + "epoch": 0.0316, + "grad_norm": 2.3416686058044434, + "learning_rate": 6.3200000000000005e-06, + "loss": 0.0576, + "step": 316 + }, + { + "epoch": 0.0318, + "grad_norm": 0.5750493407249451, + "learning_rate": 6.360000000000001e-06, + "loss": 0.0959, + "step": 318 + }, + { + "epoch": 0.032, + "grad_norm": 3.0366621017456055, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.0398, + "step": 320 + }, + { + "epoch": 0.0322, + "grad_norm": 1.8224941492080688, + "learning_rate": 6.440000000000001e-06, + "loss": 0.0257, + "step": 322 + }, + { + "epoch": 0.0324, + "grad_norm": 0.3713086247444153, + "learning_rate": 6.480000000000001e-06, + "loss": 0.0446, + "step": 324 + }, + { + "epoch": 0.0326, + "grad_norm": 4.842414855957031, + "learning_rate": 6.520000000000001e-06, + "loss": 0.0581, + "step": 326 + }, + { + "epoch": 0.0328, + "grad_norm": 0.6216129064559937, + "learning_rate": 6.560000000000001e-06, + "loss": 0.0037, + "step": 328 + }, + { + "epoch": 0.033, + "grad_norm": 2.05338191986084, + "learning_rate": 6.600000000000001e-06, + "loss": 0.022, + "step": 330 + }, + { + "epoch": 0.0332, + "grad_norm": 1.3908491134643555, + "learning_rate": 6.640000000000001e-06, + "loss": 0.0237, + "step": 332 + }, + { + "epoch": 0.0334, + "grad_norm": 5.522376537322998, + "learning_rate": 6.680000000000001e-06, + "loss": 0.03, + "step": 334 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5217986702919006, + "learning_rate": 6.720000000000001e-06, + "loss": 0.0037, + "step": 336 + }, + { + "epoch": 0.0338, + "grad_norm": 9.795446395874023, + "learning_rate": 6.760000000000001e-06, + "loss": 0.1296, + "step": 338 + }, + { + "epoch": 0.034, + "grad_norm": 1.3551901578903198, + "learning_rate": 6.800000000000001e-06, + "loss": 0.0261, + "step": 340 + }, + { + "epoch": 0.0342, + "grad_norm": 0.19913724064826965, + "learning_rate": 6.8400000000000014e-06, + "loss": 0.0953, + "step": 342 + }, + { + "epoch": 0.0344, + "grad_norm": 20.22366714477539, + "learning_rate": 6.88e-06, + "loss": 0.3472, + "step": 344 + }, + { + "epoch": 0.0346, + "grad_norm": 10.937487602233887, + "learning_rate": 6.92e-06, + "loss": 0.082, + "step": 346 + }, + { + "epoch": 0.0348, + "grad_norm": 1.9515151977539062, + "learning_rate": 6.96e-06, + "loss": 0.0383, + "step": 348 + }, + { + "epoch": 0.035, + "grad_norm": 8.523832321166992, + "learning_rate": 7e-06, + "loss": 0.0771, + "step": 350 + }, + { + "epoch": 0.0352, + "grad_norm": 0.6160967946052551, + "learning_rate": 7.04e-06, + "loss": 0.0095, + "step": 352 + }, + { + "epoch": 0.0354, + "grad_norm": 19.58991241455078, + "learning_rate": 7.08e-06, + "loss": 0.4814, + "step": 354 + }, + { + "epoch": 0.0356, + "grad_norm": 0.20071199536323547, + "learning_rate": 7.1200000000000004e-06, + "loss": 0.0033, + "step": 356 + }, + { + "epoch": 0.0358, + "grad_norm": 5.302737712860107, + "learning_rate": 7.16e-06, + "loss": 0.104, + "step": 358 + }, + { + "epoch": 0.036, + "grad_norm": 8.249676704406738, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.2806, + "step": 360 + }, + { + "epoch": 0.0362, + "grad_norm": 10.859025001525879, + "learning_rate": 7.24e-06, + "loss": 0.2145, + "step": 362 + }, + { + "epoch": 0.0364, + "grad_norm": 4.3820343017578125, + "learning_rate": 7.280000000000001e-06, + "loss": 0.0566, + "step": 364 + }, + { + "epoch": 0.0366, + "grad_norm": 5.49815034866333, + "learning_rate": 7.32e-06, + "loss": 0.1103, + "step": 366 + }, + { + "epoch": 0.0368, + "grad_norm": 5.8264689445495605, + "learning_rate": 7.360000000000001e-06, + "loss": 0.1113, + "step": 368 + }, + { + "epoch": 0.037, + "grad_norm": 1.5490245819091797, + "learning_rate": 7.4e-06, + "loss": 0.0287, + "step": 370 + }, + { + "epoch": 0.0372, + "grad_norm": 3.684230089187622, + "learning_rate": 7.440000000000001e-06, + "loss": 0.0539, + "step": 372 + }, + { + "epoch": 0.0374, + "grad_norm": 2.9347400665283203, + "learning_rate": 7.48e-06, + "loss": 0.0992, + "step": 374 + }, + { + "epoch": 0.0376, + "grad_norm": 0.7671056389808655, + "learning_rate": 7.520000000000001e-06, + "loss": 0.0442, + "step": 376 + }, + { + "epoch": 0.0378, + "grad_norm": 2.8931450843811035, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.2516, + "step": 378 + }, + { + "epoch": 0.038, + "grad_norm": 1.317975401878357, + "learning_rate": 7.600000000000001e-06, + "loss": 0.0487, + "step": 380 + }, + { + "epoch": 0.0382, + "grad_norm": 0.5621810555458069, + "learning_rate": 7.640000000000001e-06, + "loss": 0.0624, + "step": 382 + }, + { + "epoch": 0.0384, + "grad_norm": 2.544818878173828, + "learning_rate": 7.680000000000001e-06, + "loss": 0.047, + "step": 384 + }, + { + "epoch": 0.0386, + "grad_norm": 6.256683826446533, + "learning_rate": 7.72e-06, + "loss": 0.1198, + "step": 386 + }, + { + "epoch": 0.0388, + "grad_norm": 8.151278495788574, + "learning_rate": 7.76e-06, + "loss": 0.131, + "step": 388 + }, + { + "epoch": 0.039, + "grad_norm": 3.0273163318634033, + "learning_rate": 7.800000000000002e-06, + "loss": 0.0572, + "step": 390 + }, + { + "epoch": 0.0392, + "grad_norm": 0.1967913806438446, + "learning_rate": 7.840000000000001e-06, + "loss": 0.0107, + "step": 392 + }, + { + "epoch": 0.0394, + "grad_norm": 0.8853193521499634, + "learning_rate": 7.88e-06, + "loss": 0.1075, + "step": 394 + }, + { + "epoch": 0.0396, + "grad_norm": 6.4745588302612305, + "learning_rate": 7.92e-06, + "loss": 0.105, + "step": 396 + }, + { + "epoch": 0.0398, + "grad_norm": 4.3995466232299805, + "learning_rate": 7.960000000000002e-06, + "loss": 0.2222, + "step": 398 + }, + { + "epoch": 0.04, + "grad_norm": 0.810116171836853, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0378, + "step": 400 + }, + { + "epoch": 0.0402, + "grad_norm": 0.6136965751647949, + "learning_rate": 8.040000000000001e-06, + "loss": 0.0103, + "step": 402 + }, + { + "epoch": 0.0404, + "grad_norm": 0.1753395050764084, + "learning_rate": 8.08e-06, + "loss": 0.0032, + "step": 404 + }, + { + "epoch": 0.0406, + "grad_norm": 2.924393892288208, + "learning_rate": 8.120000000000002e-06, + "loss": 0.0502, + "step": 406 + }, + { + "epoch": 0.0408, + "grad_norm": 5.497615337371826, + "learning_rate": 8.16e-06, + "loss": 0.0549, + "step": 408 + }, + { + "epoch": 0.041, + "grad_norm": 0.16068236529827118, + "learning_rate": 8.2e-06, + "loss": 0.0038, + "step": 410 + }, + { + "epoch": 0.0412, + "grad_norm": 7.15321683883667, + "learning_rate": 8.24e-06, + "loss": 0.0883, + "step": 412 + }, + { + "epoch": 0.0414, + "grad_norm": 1.6140506267547607, + "learning_rate": 8.28e-06, + "loss": 0.0508, + "step": 414 + }, + { + "epoch": 0.0416, + "grad_norm": 5.315743923187256, + "learning_rate": 8.32e-06, + "loss": 0.0922, + "step": 416 + }, + { + "epoch": 0.0418, + "grad_norm": 4.189519882202148, + "learning_rate": 8.36e-06, + "loss": 0.0252, + "step": 418 + }, + { + "epoch": 0.042, + "grad_norm": 1.6238949298858643, + "learning_rate": 8.400000000000001e-06, + "loss": 0.1712, + "step": 420 + }, + { + "epoch": 0.0422, + "grad_norm": 3.877796173095703, + "learning_rate": 8.44e-06, + "loss": 0.0746, + "step": 422 + }, + { + "epoch": 0.0424, + "grad_norm": 3.496936082839966, + "learning_rate": 8.48e-06, + "loss": 0.1276, + "step": 424 + }, + { + "epoch": 0.0426, + "grad_norm": 0.6668111085891724, + "learning_rate": 8.52e-06, + "loss": 0.0037, + "step": 426 + }, + { + "epoch": 0.0428, + "grad_norm": 1.0620479583740234, + "learning_rate": 8.560000000000001e-06, + "loss": 0.1001, + "step": 428 + }, + { + "epoch": 0.043, + "grad_norm": 5.599069595336914, + "learning_rate": 8.6e-06, + "loss": 0.0711, + "step": 430 + }, + { + "epoch": 0.0432, + "grad_norm": 0.3877107799053192, + "learning_rate": 8.64e-06, + "loss": 0.007, + "step": 432 + }, + { + "epoch": 0.0434, + "grad_norm": 0.15291278064250946, + "learning_rate": 8.68e-06, + "loss": 0.002, + "step": 434 + }, + { + "epoch": 0.0436, + "grad_norm": 0.7974997162818909, + "learning_rate": 8.720000000000001e-06, + "loss": 0.0301, + "step": 436 + }, + { + "epoch": 0.0438, + "grad_norm": 1.735231876373291, + "learning_rate": 8.76e-06, + "loss": 0.0591, + "step": 438 + }, + { + "epoch": 0.044, + "grad_norm": 1.9598917961120605, + "learning_rate": 8.8e-06, + "loss": 0.0376, + "step": 440 + }, + { + "epoch": 0.0442, + "grad_norm": 0.34212180972099304, + "learning_rate": 8.84e-06, + "loss": 0.0051, + "step": 442 + }, + { + "epoch": 0.0444, + "grad_norm": 0.5759320259094238, + "learning_rate": 8.880000000000001e-06, + "loss": 0.036, + "step": 444 + }, + { + "epoch": 0.0446, + "grad_norm": 1.1490092277526855, + "learning_rate": 8.920000000000001e-06, + "loss": 0.0159, + "step": 446 + }, + { + "epoch": 0.0448, + "grad_norm": 0.2773312032222748, + "learning_rate": 8.96e-06, + "loss": 0.0486, + "step": 448 + }, + { + "epoch": 0.045, + "grad_norm": 8.514900207519531, + "learning_rate": 9e-06, + "loss": 0.0674, + "step": 450 + }, + { + "epoch": 0.0452, + "grad_norm": 0.30805352330207825, + "learning_rate": 9.040000000000002e-06, + "loss": 0.0162, + "step": 452 + }, + { + "epoch": 0.0454, + "grad_norm": 2.3758792877197266, + "learning_rate": 9.080000000000001e-06, + "loss": 0.0435, + "step": 454 + }, + { + "epoch": 0.0456, + "grad_norm": 0.032780300825834274, + "learning_rate": 9.12e-06, + "loss": 0.0436, + "step": 456 + }, + { + "epoch": 0.0458, + "grad_norm": 5.317171573638916, + "learning_rate": 9.16e-06, + "loss": 0.03, + "step": 458 + }, + { + "epoch": 0.046, + "grad_norm": 2.995816707611084, + "learning_rate": 9.200000000000002e-06, + "loss": 0.0259, + "step": 460 + }, + { + "epoch": 0.0462, + "grad_norm": 3.447620153427124, + "learning_rate": 9.240000000000001e-06, + "loss": 0.052, + "step": 462 + }, + { + "epoch": 0.0464, + "grad_norm": 0.026386817917227745, + "learning_rate": 9.280000000000001e-06, + "loss": 0.0013, + "step": 464 + }, + { + "epoch": 0.0466, + "grad_norm": 0.3765723407268524, + "learning_rate": 9.32e-06, + "loss": 0.0019, + "step": 466 + }, + { + "epoch": 0.0468, + "grad_norm": 2.415679931640625, + "learning_rate": 9.360000000000002e-06, + "loss": 0.0081, + "step": 468 + }, + { + "epoch": 0.047, + "grad_norm": 0.7193875312805176, + "learning_rate": 9.4e-06, + "loss": 0.0881, + "step": 470 + }, + { + "epoch": 0.0472, + "grad_norm": 0.3809020519256592, + "learning_rate": 9.440000000000001e-06, + "loss": 0.0132, + "step": 472 + }, + { + "epoch": 0.0474, + "grad_norm": 50.25654220581055, + "learning_rate": 9.48e-06, + "loss": 0.5608, + "step": 474 + }, + { + "epoch": 0.0476, + "grad_norm": 16.293516159057617, + "learning_rate": 9.52e-06, + "loss": 0.4694, + "step": 476 + }, + { + "epoch": 0.0478, + "grad_norm": 1.6846319437026978, + "learning_rate": 9.56e-06, + "loss": 0.0167, + "step": 478 + }, + { + "epoch": 0.048, + "grad_norm": 0.07287383824586868, + "learning_rate": 9.600000000000001e-06, + "loss": 0.0017, + "step": 480 + }, + { + "epoch": 0.0482, + "grad_norm": 13.16028118133545, + "learning_rate": 9.640000000000001e-06, + "loss": 0.1237, + "step": 482 + }, + { + "epoch": 0.0484, + "grad_norm": 0.07811762392520905, + "learning_rate": 9.68e-06, + "loss": 0.3412, + "step": 484 + }, + { + "epoch": 0.0486, + "grad_norm": 1.126184105873108, + "learning_rate": 9.72e-06, + "loss": 0.0111, + "step": 486 + }, + { + "epoch": 0.0488, + "grad_norm": 1.2375993728637695, + "learning_rate": 9.760000000000001e-06, + "loss": 0.024, + "step": 488 + }, + { + "epoch": 0.049, + "grad_norm": 1.6575353145599365, + "learning_rate": 9.800000000000001e-06, + "loss": 0.0249, + "step": 490 + }, + { + "epoch": 0.0492, + "grad_norm": 2.348123788833618, + "learning_rate": 9.84e-06, + "loss": 0.0472, + "step": 492 + }, + { + "epoch": 0.0494, + "grad_norm": 0.5226090550422668, + "learning_rate": 9.88e-06, + "loss": 0.0086, + "step": 494 + }, + { + "epoch": 0.0496, + "grad_norm": 5.408533573150635, + "learning_rate": 9.920000000000002e-06, + "loss": 0.1022, + "step": 496 + }, + { + "epoch": 0.0498, + "grad_norm": 5.715994358062744, + "learning_rate": 9.960000000000001e-06, + "loss": 0.1085, + "step": 498 + }, + { + "epoch": 0.05, + "grad_norm": 2.252636432647705, + "learning_rate": 1e-05, + "loss": 0.0506, + "step": 500 + }, + { + "epoch": 0.0502, + "grad_norm": 2.805305004119873, + "learning_rate": 1.004e-05, + "loss": 0.1445, + "step": 502 + }, + { + "epoch": 0.0504, + "grad_norm": 5.076096534729004, + "learning_rate": 1.008e-05, + "loss": 0.1057, + "step": 504 + }, + { + "epoch": 0.0506, + "grad_norm": 0.7857909798622131, + "learning_rate": 1.0120000000000001e-05, + "loss": 0.0154, + "step": 506 + }, + { + "epoch": 0.0508, + "grad_norm": 3.2602732181549072, + "learning_rate": 1.0160000000000001e-05, + "loss": 0.0406, + "step": 508 + }, + { + "epoch": 0.051, + "grad_norm": 0.38033732771873474, + "learning_rate": 1.02e-05, + "loss": 0.015, + "step": 510 + }, + { + "epoch": 0.0512, + "grad_norm": 1.0226104259490967, + "learning_rate": 1.024e-05, + "loss": 0.0952, + "step": 512 + }, + { + "epoch": 0.0514, + "grad_norm": 6.626057147979736, + "learning_rate": 1.0280000000000002e-05, + "loss": 0.1468, + "step": 514 + }, + { + "epoch": 0.0516, + "grad_norm": 4.810810565948486, + "learning_rate": 1.0320000000000001e-05, + "loss": 0.1167, + "step": 516 + }, + { + "epoch": 0.0518, + "grad_norm": 0.3220195472240448, + "learning_rate": 1.036e-05, + "loss": 0.187, + "step": 518 + }, + { + "epoch": 0.052, + "grad_norm": 0.13600780069828033, + "learning_rate": 1.04e-05, + "loss": 0.0277, + "step": 520 + }, + { + "epoch": 0.0522, + "grad_norm": 0.8904654383659363, + "learning_rate": 1.0440000000000002e-05, + "loss": 0.0318, + "step": 522 + }, + { + "epoch": 0.0524, + "grad_norm": 0.6135651469230652, + "learning_rate": 1.0480000000000001e-05, + "loss": 0.0626, + "step": 524 + }, + { + "epoch": 0.0526, + "grad_norm": 3.5439252853393555, + "learning_rate": 1.0520000000000001e-05, + "loss": 0.0473, + "step": 526 + }, + { + "epoch": 0.0528, + "grad_norm": 3.3210060596466064, + "learning_rate": 1.056e-05, + "loss": 0.0804, + "step": 528 + }, + { + "epoch": 0.053, + "grad_norm": 3.9398391246795654, + "learning_rate": 1.0600000000000002e-05, + "loss": 0.0517, + "step": 530 + }, + { + "epoch": 0.0532, + "grad_norm": 0.19761674106121063, + "learning_rate": 1.0640000000000001e-05, + "loss": 0.0047, + "step": 532 + }, + { + "epoch": 0.0534, + "grad_norm": 1.3000322580337524, + "learning_rate": 1.0680000000000001e-05, + "loss": 0.0399, + "step": 534 + }, + { + "epoch": 0.0536, + "grad_norm": 8.660439491271973, + "learning_rate": 1.072e-05, + "loss": 0.0831, + "step": 536 + }, + { + "epoch": 0.0538, + "grad_norm": 0.2834869623184204, + "learning_rate": 1.0760000000000002e-05, + "loss": 0.0306, + "step": 538 + }, + { + "epoch": 0.054, + "grad_norm": 0.6934462785720825, + "learning_rate": 1.0800000000000002e-05, + "loss": 0.068, + "step": 540 + }, + { + "epoch": 0.0542, + "grad_norm": 0.08366841077804565, + "learning_rate": 1.0840000000000001e-05, + "loss": 0.0059, + "step": 542 + }, + { + "epoch": 0.0544, + "grad_norm": 0.2585448920726776, + "learning_rate": 1.0880000000000001e-05, + "loss": 0.0032, + "step": 544 + }, + { + "epoch": 0.0546, + "grad_norm": 0.7899631857872009, + "learning_rate": 1.0920000000000002e-05, + "loss": 0.3116, + "step": 546 + }, + { + "epoch": 0.0548, + "grad_norm": 7.943477153778076, + "learning_rate": 1.0960000000000002e-05, + "loss": 0.1583, + "step": 548 + }, + { + "epoch": 0.055, + "grad_norm": 0.03509616479277611, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.0673, + "step": 550 + }, + { + "epoch": 0.0552, + "grad_norm": 7.288227081298828, + "learning_rate": 1.1040000000000001e-05, + "loss": 0.0941, + "step": 552 + }, + { + "epoch": 0.0554, + "grad_norm": 2.8598203659057617, + "learning_rate": 1.1080000000000002e-05, + "loss": 0.0426, + "step": 554 + }, + { + "epoch": 0.0556, + "grad_norm": 1.6013342142105103, + "learning_rate": 1.1120000000000002e-05, + "loss": 0.1923, + "step": 556 + }, + { + "epoch": 0.0558, + "grad_norm": 0.12508748471736908, + "learning_rate": 1.1160000000000002e-05, + "loss": 0.0021, + "step": 558 + }, + { + "epoch": 0.056, + "grad_norm": 19.09836196899414, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.22, + "step": 560 + }, + { + "epoch": 0.0562, + "grad_norm": 0.5086609125137329, + "learning_rate": 1.1240000000000002e-05, + "loss": 0.0071, + "step": 562 + }, + { + "epoch": 0.0564, + "grad_norm": 1.445392370223999, + "learning_rate": 1.128e-05, + "loss": 0.1547, + "step": 564 + }, + { + "epoch": 0.0566, + "grad_norm": 0.08431223779916763, + "learning_rate": 1.132e-05, + "loss": 0.0877, + "step": 566 + }, + { + "epoch": 0.0568, + "grad_norm": 0.7562392354011536, + "learning_rate": 1.136e-05, + "loss": 0.0201, + "step": 568 + }, + { + "epoch": 0.057, + "grad_norm": 0.4318715035915375, + "learning_rate": 1.14e-05, + "loss": 0.0636, + "step": 570 + }, + { + "epoch": 0.0572, + "grad_norm": 0.6561262607574463, + "learning_rate": 1.144e-05, + "loss": 0.0103, + "step": 572 + }, + { + "epoch": 0.0574, + "grad_norm": 0.5552188158035278, + "learning_rate": 1.148e-05, + "loss": 0.005, + "step": 574 + }, + { + "epoch": 0.0576, + "grad_norm": 0.8549359440803528, + "learning_rate": 1.152e-05, + "loss": 0.0844, + "step": 576 + }, + { + "epoch": 0.0578, + "grad_norm": 0.8166272640228271, + "learning_rate": 1.156e-05, + "loss": 0.0071, + "step": 578 + }, + { + "epoch": 0.058, + "grad_norm": 0.44811683893203735, + "learning_rate": 1.16e-05, + "loss": 0.0056, + "step": 580 + }, + { + "epoch": 0.0582, + "grad_norm": 0.2967444956302643, + "learning_rate": 1.164e-05, + "loss": 0.0035, + "step": 582 + }, + { + "epoch": 0.0584, + "grad_norm": 2.5751090049743652, + "learning_rate": 1.168e-05, + "loss": 0.0203, + "step": 584 + }, + { + "epoch": 0.0586, + "grad_norm": 8.234184265136719, + "learning_rate": 1.172e-05, + "loss": 0.1066, + "step": 586 + }, + { + "epoch": 0.0588, + "grad_norm": 0.13364800810813904, + "learning_rate": 1.1760000000000001e-05, + "loss": 0.011, + "step": 588 + }, + { + "epoch": 0.059, + "grad_norm": 0.027422282844781876, + "learning_rate": 1.18e-05, + "loss": 0.0046, + "step": 590 + }, + { + "epoch": 0.0592, + "grad_norm": 0.031063809990882874, + "learning_rate": 1.184e-05, + "loss": 0.0029, + "step": 592 + }, + { + "epoch": 0.0594, + "grad_norm": 7.534713268280029, + "learning_rate": 1.188e-05, + "loss": 0.1994, + "step": 594 + }, + { + "epoch": 0.0596, + "grad_norm": 4.395976543426514, + "learning_rate": 1.1920000000000001e-05, + "loss": 0.0454, + "step": 596 + }, + { + "epoch": 0.0598, + "grad_norm": 2.4455931186676025, + "learning_rate": 1.196e-05, + "loss": 0.0677, + "step": 598 + }, + { + "epoch": 0.06, + "grad_norm": 0.1233295351266861, + "learning_rate": 1.2e-05, + "loss": 0.0026, + "step": 600 + }, + { + "epoch": 0.0602, + "grad_norm": 0.3310345709323883, + "learning_rate": 1.204e-05, + "loss": 0.0101, + "step": 602 + }, + { + "epoch": 0.0604, + "grad_norm": 10.374943733215332, + "learning_rate": 1.2080000000000001e-05, + "loss": 0.1502, + "step": 604 + }, + { + "epoch": 0.0606, + "grad_norm": 4.758152484893799, + "learning_rate": 1.2120000000000001e-05, + "loss": 0.059, + "step": 606 + }, + { + "epoch": 0.0608, + "grad_norm": 1.1679095029830933, + "learning_rate": 1.216e-05, + "loss": 0.0243, + "step": 608 + }, + { + "epoch": 0.061, + "grad_norm": 0.44120439887046814, + "learning_rate": 1.22e-05, + "loss": 0.0735, + "step": 610 + }, + { + "epoch": 0.0612, + "grad_norm": 6.496969699859619, + "learning_rate": 1.2240000000000001e-05, + "loss": 0.203, + "step": 612 + }, + { + "epoch": 0.0614, + "grad_norm": 0.19022636115550995, + "learning_rate": 1.2280000000000001e-05, + "loss": 0.0038, + "step": 614 + }, + { + "epoch": 0.0616, + "grad_norm": 4.954254150390625, + "learning_rate": 1.232e-05, + "loss": 0.1246, + "step": 616 + }, + { + "epoch": 0.0618, + "grad_norm": 16.531105041503906, + "learning_rate": 1.236e-05, + "loss": 0.2032, + "step": 618 + }, + { + "epoch": 0.062, + "grad_norm": 13.358026504516602, + "learning_rate": 1.2400000000000002e-05, + "loss": 0.2662, + "step": 620 + }, + { + "epoch": 0.0622, + "grad_norm": 3.175753355026245, + "learning_rate": 1.2440000000000001e-05, + "loss": 0.2142, + "step": 622 + }, + { + "epoch": 0.0624, + "grad_norm": 10.631996154785156, + "learning_rate": 1.248e-05, + "loss": 0.0738, + "step": 624 + }, + { + "epoch": 0.0626, + "grad_norm": 0.12158078700304031, + "learning_rate": 1.252e-05, + "loss": 0.0026, + "step": 626 + }, + { + "epoch": 0.0628, + "grad_norm": 4.100497245788574, + "learning_rate": 1.2560000000000002e-05, + "loss": 0.164, + "step": 628 + }, + { + "epoch": 0.063, + "grad_norm": 4.666815280914307, + "learning_rate": 1.2600000000000001e-05, + "loss": 0.1705, + "step": 630 + }, + { + "epoch": 0.0632, + "grad_norm": 0.06823929399251938, + "learning_rate": 1.2640000000000001e-05, + "loss": 0.0089, + "step": 632 + }, + { + "epoch": 0.0634, + "grad_norm": 0.19273613393306732, + "learning_rate": 1.268e-05, + "loss": 0.004, + "step": 634 + }, + { + "epoch": 0.0636, + "grad_norm": 20.104347229003906, + "learning_rate": 1.2720000000000002e-05, + "loss": 0.1582, + "step": 636 + }, + { + "epoch": 0.0638, + "grad_norm": 1.4931129217147827, + "learning_rate": 1.2760000000000001e-05, + "loss": 0.1148, + "step": 638 + }, + { + "epoch": 0.064, + "grad_norm": 1.9569244384765625, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.0977, + "step": 640 + }, + { + "epoch": 0.0642, + "grad_norm": 2.0437886714935303, + "learning_rate": 1.284e-05, + "loss": 0.2767, + "step": 642 + }, + { + "epoch": 0.0644, + "grad_norm": 0.025619642809033394, + "learning_rate": 1.2880000000000002e-05, + "loss": 0.0143, + "step": 644 + }, + { + "epoch": 0.0646, + "grad_norm": 1.4934087991714478, + "learning_rate": 1.2920000000000002e-05, + "loss": 0.0445, + "step": 646 + }, + { + "epoch": 0.0648, + "grad_norm": 0.041106488555669785, + "learning_rate": 1.2960000000000001e-05, + "loss": 0.0531, + "step": 648 + }, + { + "epoch": 0.065, + "grad_norm": 0.17218433320522308, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.0121, + "step": 650 + }, + { + "epoch": 0.0652, + "grad_norm": 3.6348907947540283, + "learning_rate": 1.3040000000000002e-05, + "loss": 0.0369, + "step": 652 + }, + { + "epoch": 0.0654, + "grad_norm": 7.699862003326416, + "learning_rate": 1.3080000000000002e-05, + "loss": 0.1224, + "step": 654 + }, + { + "epoch": 0.0656, + "grad_norm": 13.808294296264648, + "learning_rate": 1.3120000000000001e-05, + "loss": 0.246, + "step": 656 + }, + { + "epoch": 0.0658, + "grad_norm": 2.471940755844116, + "learning_rate": 1.3160000000000001e-05, + "loss": 0.055, + "step": 658 + }, + { + "epoch": 0.066, + "grad_norm": 3.0965073108673096, + "learning_rate": 1.3200000000000002e-05, + "loss": 0.1273, + "step": 660 + }, + { + "epoch": 0.0662, + "grad_norm": 4.218968391418457, + "learning_rate": 1.3240000000000002e-05, + "loss": 0.0634, + "step": 662 + }, + { + "epoch": 0.0664, + "grad_norm": 0.3047191798686981, + "learning_rate": 1.3280000000000002e-05, + "loss": 0.0064, + "step": 664 + }, + { + "epoch": 0.0666, + "grad_norm": 0.9290439486503601, + "learning_rate": 1.3320000000000001e-05, + "loss": 0.0215, + "step": 666 + }, + { + "epoch": 0.0668, + "grad_norm": 8.676103591918945, + "learning_rate": 1.3360000000000003e-05, + "loss": 0.1204, + "step": 668 + }, + { + "epoch": 0.067, + "grad_norm": 0.09511227160692215, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.0068, + "step": 670 + }, + { + "epoch": 0.0672, + "grad_norm": 4.673782825469971, + "learning_rate": 1.3440000000000002e-05, + "loss": 0.1193, + "step": 672 + }, + { + "epoch": 0.0674, + "grad_norm": 0.022720495238900185, + "learning_rate": 1.3480000000000001e-05, + "loss": 0.0032, + "step": 674 + }, + { + "epoch": 0.0676, + "grad_norm": 0.4234844744205475, + "learning_rate": 1.3520000000000003e-05, + "loss": 0.0053, + "step": 676 + }, + { + "epoch": 0.0678, + "grad_norm": 0.5365747809410095, + "learning_rate": 1.3560000000000002e-05, + "loss": 0.0048, + "step": 678 + }, + { + "epoch": 0.068, + "grad_norm": 0.24364806711673737, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.011, + "step": 680 + }, + { + "epoch": 0.0682, + "grad_norm": 0.08427201211452484, + "learning_rate": 1.3640000000000002e-05, + "loss": 0.003, + "step": 682 + }, + { + "epoch": 0.0684, + "grad_norm": 0.4336061179637909, + "learning_rate": 1.3680000000000003e-05, + "loss": 0.0052, + "step": 684 + }, + { + "epoch": 0.0686, + "grad_norm": 0.11636728048324585, + "learning_rate": 1.3720000000000002e-05, + "loss": 0.0021, + "step": 686 + }, + { + "epoch": 0.0688, + "grad_norm": 0.06528569757938385, + "learning_rate": 1.376e-05, + "loss": 0.0061, + "step": 688 + }, + { + "epoch": 0.069, + "grad_norm": 0.44940832257270813, + "learning_rate": 1.38e-05, + "loss": 0.007, + "step": 690 + }, + { + "epoch": 0.0692, + "grad_norm": 0.04947667941451073, + "learning_rate": 1.384e-05, + "loss": 0.0011, + "step": 692 + }, + { + "epoch": 0.0694, + "grad_norm": 0.036093901842832565, + "learning_rate": 1.3880000000000001e-05, + "loss": 0.0009, + "step": 694 + }, + { + "epoch": 0.0696, + "grad_norm": 0.2991200387477875, + "learning_rate": 1.392e-05, + "loss": 0.0252, + "step": 696 + }, + { + "epoch": 0.0698, + "grad_norm": 0.03702569752931595, + "learning_rate": 1.396e-05, + "loss": 0.0018, + "step": 698 + }, + { + "epoch": 0.07, + "grad_norm": 0.15200470387935638, + "learning_rate": 1.4e-05, + "loss": 0.0026, + "step": 700 + }, + { + "epoch": 0.0702, + "grad_norm": 0.1665496677160263, + "learning_rate": 1.4040000000000001e-05, + "loss": 0.004, + "step": 702 + }, + { + "epoch": 0.0704, + "grad_norm": 7.192049026489258, + "learning_rate": 1.408e-05, + "loss": 0.0606, + "step": 704 + }, + { + "epoch": 0.0706, + "grad_norm": 0.07649670541286469, + "learning_rate": 1.412e-05, + "loss": 0.004, + "step": 706 + }, + { + "epoch": 0.0708, + "grad_norm": 0.14424924552440643, + "learning_rate": 1.416e-05, + "loss": 0.0052, + "step": 708 + }, + { + "epoch": 0.071, + "grad_norm": 0.13287456333637238, + "learning_rate": 1.4200000000000001e-05, + "loss": 0.0015, + "step": 710 + }, + { + "epoch": 0.0712, + "grad_norm": 0.2644335925579071, + "learning_rate": 1.4240000000000001e-05, + "loss": 0.0027, + "step": 712 + }, + { + "epoch": 0.0714, + "grad_norm": 0.14322231709957123, + "learning_rate": 1.428e-05, + "loss": 0.0054, + "step": 714 + }, + { + "epoch": 0.0716, + "grad_norm": 0.22239793837070465, + "learning_rate": 1.432e-05, + "loss": 0.0011, + "step": 716 + }, + { + "epoch": 0.0718, + "grad_norm": 5.70852518081665, + "learning_rate": 1.4360000000000001e-05, + "loss": 0.31, + "step": 718 + }, + { + "epoch": 0.072, + "grad_norm": 0.10798366367816925, + "learning_rate": 1.4400000000000001e-05, + "loss": 0.0026, + "step": 720 + }, + { + "epoch": 0.0722, + "grad_norm": 0.41496142745018005, + "learning_rate": 1.444e-05, + "loss": 0.013, + "step": 722 + }, + { + "epoch": 0.0724, + "grad_norm": 0.49521708488464355, + "learning_rate": 1.448e-05, + "loss": 0.061, + "step": 724 + }, + { + "epoch": 0.0726, + "grad_norm": 23.348159790039062, + "learning_rate": 1.4520000000000002e-05, + "loss": 0.2532, + "step": 726 + }, + { + "epoch": 0.0728, + "grad_norm": 0.016146836802363396, + "learning_rate": 1.4560000000000001e-05, + "loss": 0.2006, + "step": 728 + }, + { + "epoch": 0.073, + "grad_norm": 7.464707851409912, + "learning_rate": 1.46e-05, + "loss": 0.1189, + "step": 730 + }, + { + "epoch": 0.0732, + "grad_norm": 0.04762694612145424, + "learning_rate": 1.464e-05, + "loss": 0.0013, + "step": 732 + }, + { + "epoch": 0.0734, + "grad_norm": 0.032708119601011276, + "learning_rate": 1.4680000000000002e-05, + "loss": 0.0038, + "step": 734 + }, + { + "epoch": 0.0736, + "grad_norm": 0.2535775303840637, + "learning_rate": 1.4720000000000001e-05, + "loss": 0.3931, + "step": 736 + }, + { + "epoch": 0.0738, + "grad_norm": 0.12570743262767792, + "learning_rate": 1.4760000000000001e-05, + "loss": 0.2173, + "step": 738 + }, + { + "epoch": 0.074, + "grad_norm": 0.728451669216156, + "learning_rate": 1.48e-05, + "loss": 0.0099, + "step": 740 + }, + { + "epoch": 0.0742, + "grad_norm": 1.6588236093521118, + "learning_rate": 1.4840000000000002e-05, + "loss": 0.1078, + "step": 742 + }, + { + "epoch": 0.0744, + "grad_norm": 1.2144207954406738, + "learning_rate": 1.4880000000000002e-05, + "loss": 0.027, + "step": 744 + }, + { + "epoch": 0.0746, + "grad_norm": 3.520796537399292, + "learning_rate": 1.4920000000000001e-05, + "loss": 0.0665, + "step": 746 + }, + { + "epoch": 0.0748, + "grad_norm": 0.8078785538673401, + "learning_rate": 1.496e-05, + "loss": 0.1188, + "step": 748 + }, + { + "epoch": 0.075, + "grad_norm": 0.462663471698761, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.0062, + "step": 750 + }, + { + "epoch": 0.0752, + "grad_norm": 0.35456106066703796, + "learning_rate": 1.5040000000000002e-05, + "loss": 0.0841, + "step": 752 + }, + { + "epoch": 0.0754, + "grad_norm": 0.09146533906459808, + "learning_rate": 1.5080000000000001e-05, + "loss": 0.0045, + "step": 754 + }, + { + "epoch": 0.0756, + "grad_norm": 0.055356353521347046, + "learning_rate": 1.5120000000000001e-05, + "loss": 0.0064, + "step": 756 + }, + { + "epoch": 0.0758, + "grad_norm": 0.1287112832069397, + "learning_rate": 1.516e-05, + "loss": 0.0031, + "step": 758 + }, + { + "epoch": 0.076, + "grad_norm": 0.14357025921344757, + "learning_rate": 1.5200000000000002e-05, + "loss": 0.0105, + "step": 760 + }, + { + "epoch": 0.0762, + "grad_norm": 0.16768306493759155, + "learning_rate": 1.5240000000000001e-05, + "loss": 0.0048, + "step": 762 + }, + { + "epoch": 0.0764, + "grad_norm": 0.7932330369949341, + "learning_rate": 1.5280000000000003e-05, + "loss": 0.0165, + "step": 764 + }, + { + "epoch": 0.0766, + "grad_norm": 4.531347751617432, + "learning_rate": 1.5320000000000002e-05, + "loss": 0.093, + "step": 766 + }, + { + "epoch": 0.0768, + "grad_norm": 1.74014151096344, + "learning_rate": 1.5360000000000002e-05, + "loss": 0.0524, + "step": 768 + }, + { + "epoch": 0.077, + "grad_norm": 0.17875352501869202, + "learning_rate": 1.54e-05, + "loss": 0.0092, + "step": 770 + }, + { + "epoch": 0.0772, + "grad_norm": 3.6613104343414307, + "learning_rate": 1.544e-05, + "loss": 0.2412, + "step": 772 + }, + { + "epoch": 0.0774, + "grad_norm": 0.14558683335781097, + "learning_rate": 1.548e-05, + "loss": 0.0082, + "step": 774 + }, + { + "epoch": 0.0776, + "grad_norm": 0.12894275784492493, + "learning_rate": 1.552e-05, + "loss": 0.0624, + "step": 776 + }, + { + "epoch": 0.0778, + "grad_norm": 2.3371500968933105, + "learning_rate": 1.556e-05, + "loss": 0.0637, + "step": 778 + }, + { + "epoch": 0.078, + "grad_norm": 0.019597765058279037, + "learning_rate": 1.5600000000000003e-05, + "loss": 0.0103, + "step": 780 + }, + { + "epoch": 0.0782, + "grad_norm": 0.5953422784805298, + "learning_rate": 1.5640000000000003e-05, + "loss": 0.0043, + "step": 782 + }, + { + "epoch": 0.0784, + "grad_norm": 6.380403518676758, + "learning_rate": 1.5680000000000002e-05, + "loss": 0.0928, + "step": 784 + }, + { + "epoch": 0.0786, + "grad_norm": 1.3575915098190308, + "learning_rate": 1.5720000000000002e-05, + "loss": 0.0872, + "step": 786 + }, + { + "epoch": 0.0788, + "grad_norm": 1.4853304624557495, + "learning_rate": 1.576e-05, + "loss": 0.024, + "step": 788 + }, + { + "epoch": 0.079, + "grad_norm": 0.6447727680206299, + "learning_rate": 1.58e-05, + "loss": 0.012, + "step": 790 + }, + { + "epoch": 0.0792, + "grad_norm": 0.45425736904144287, + "learning_rate": 1.584e-05, + "loss": 0.0195, + "step": 792 + }, + { + "epoch": 0.0794, + "grad_norm": 1.145123839378357, + "learning_rate": 1.588e-05, + "loss": 0.0321, + "step": 794 + }, + { + "epoch": 0.0796, + "grad_norm": 0.47205641865730286, + "learning_rate": 1.5920000000000003e-05, + "loss": 0.0053, + "step": 796 + }, + { + "epoch": 0.0798, + "grad_norm": 2.8216500282287598, + "learning_rate": 1.5960000000000003e-05, + "loss": 0.1908, + "step": 798 + }, + { + "epoch": 0.08, + "grad_norm": 0.3376106321811676, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0052, + "step": 800 + }, + { + "epoch": 0.0802, + "grad_norm": 0.0245286263525486, + "learning_rate": 1.6040000000000002e-05, + "loss": 0.0009, + "step": 802 + }, + { + "epoch": 0.0804, + "grad_norm": 0.2712961435317993, + "learning_rate": 1.6080000000000002e-05, + "loss": 0.0331, + "step": 804 + }, + { + "epoch": 0.0806, + "grad_norm": 0.3325439989566803, + "learning_rate": 1.612e-05, + "loss": 0.0137, + "step": 806 + }, + { + "epoch": 0.0808, + "grad_norm": 0.11627910286188126, + "learning_rate": 1.616e-05, + "loss": 0.3011, + "step": 808 + }, + { + "epoch": 0.081, + "grad_norm": 0.14348295331001282, + "learning_rate": 1.62e-05, + "loss": 0.0025, + "step": 810 + }, + { + "epoch": 0.0812, + "grad_norm": 0.224946528673172, + "learning_rate": 1.6240000000000004e-05, + "loss": 0.0203, + "step": 812 + }, + { + "epoch": 0.0814, + "grad_norm": 2.125964403152466, + "learning_rate": 1.628e-05, + "loss": 0.0503, + "step": 814 + }, + { + "epoch": 0.0816, + "grad_norm": 13.63076400756836, + "learning_rate": 1.632e-05, + "loss": 0.3814, + "step": 816 + }, + { + "epoch": 0.0818, + "grad_norm": 12.862025260925293, + "learning_rate": 1.636e-05, + "loss": 0.1625, + "step": 818 + }, + { + "epoch": 0.082, + "grad_norm": 0.140502467751503, + "learning_rate": 1.64e-05, + "loss": 0.0052, + "step": 820 + }, + { + "epoch": 0.0822, + "grad_norm": 0.06871003657579422, + "learning_rate": 1.6440000000000002e-05, + "loss": 0.0391, + "step": 822 + }, + { + "epoch": 0.0824, + "grad_norm": 4.500019073486328, + "learning_rate": 1.648e-05, + "loss": 0.0518, + "step": 824 + }, + { + "epoch": 0.0826, + "grad_norm": 0.13748720288276672, + "learning_rate": 1.652e-05, + "loss": 0.005, + "step": 826 + }, + { + "epoch": 0.0828, + "grad_norm": 0.23885299265384674, + "learning_rate": 1.656e-05, + "loss": 0.1087, + "step": 828 + }, + { + "epoch": 0.083, + "grad_norm": 0.4058366119861603, + "learning_rate": 1.66e-05, + "loss": 0.0073, + "step": 830 + }, + { + "epoch": 0.0832, + "grad_norm": 0.20017540454864502, + "learning_rate": 1.664e-05, + "loss": 0.2906, + "step": 832 + }, + { + "epoch": 0.0834, + "grad_norm": 0.9447119235992432, + "learning_rate": 1.668e-05, + "loss": 0.0163, + "step": 834 + }, + { + "epoch": 0.0836, + "grad_norm": 5.156620025634766, + "learning_rate": 1.672e-05, + "loss": 0.1006, + "step": 836 + }, + { + "epoch": 0.0838, + "grad_norm": 0.36408019065856934, + "learning_rate": 1.6760000000000002e-05, + "loss": 0.0073, + "step": 838 + }, + { + "epoch": 0.084, + "grad_norm": 0.02338819019496441, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.0062, + "step": 840 + }, + { + "epoch": 0.0842, + "grad_norm": 0.17009037733078003, + "learning_rate": 1.684e-05, + "loss": 0.0239, + "step": 842 + }, + { + "epoch": 0.0844, + "grad_norm": 0.3014303743839264, + "learning_rate": 1.688e-05, + "loss": 0.0073, + "step": 844 + }, + { + "epoch": 0.0846, + "grad_norm": 0.7396109104156494, + "learning_rate": 1.692e-05, + "loss": 0.0095, + "step": 846 + }, + { + "epoch": 0.0848, + "grad_norm": 0.03792131692171097, + "learning_rate": 1.696e-05, + "loss": 0.0023, + "step": 848 + }, + { + "epoch": 0.085, + "grad_norm": 0.3130897283554077, + "learning_rate": 1.7e-05, + "loss": 0.0066, + "step": 850 + }, + { + "epoch": 0.0852, + "grad_norm": 0.2763802111148834, + "learning_rate": 1.704e-05, + "loss": 0.0033, + "step": 852 + }, + { + "epoch": 0.0854, + "grad_norm": 1.027415156364441, + "learning_rate": 1.7080000000000002e-05, + "loss": 0.0434, + "step": 854 + }, + { + "epoch": 0.0856, + "grad_norm": 0.5856494307518005, + "learning_rate": 1.7120000000000002e-05, + "loss": 0.0114, + "step": 856 + }, + { + "epoch": 0.0858, + "grad_norm": 0.36967721581459045, + "learning_rate": 1.7160000000000002e-05, + "loss": 0.0129, + "step": 858 + }, + { + "epoch": 0.086, + "grad_norm": 0.055420853197574615, + "learning_rate": 1.72e-05, + "loss": 0.0011, + "step": 860 + }, + { + "epoch": 0.0862, + "grad_norm": 0.1655367612838745, + "learning_rate": 1.724e-05, + "loss": 0.0648, + "step": 862 + }, + { + "epoch": 0.0864, + "grad_norm": 11.015512466430664, + "learning_rate": 1.728e-05, + "loss": 0.1935, + "step": 864 + }, + { + "epoch": 0.0866, + "grad_norm": 1.0146524906158447, + "learning_rate": 1.732e-05, + "loss": 0.0564, + "step": 866 + }, + { + "epoch": 0.0868, + "grad_norm": 1.0265992879867554, + "learning_rate": 1.736e-05, + "loss": 0.0084, + "step": 868 + }, + { + "epoch": 0.087, + "grad_norm": 0.9410724639892578, + "learning_rate": 1.7400000000000003e-05, + "loss": 0.1034, + "step": 870 + }, + { + "epoch": 0.0872, + "grad_norm": 0.25905174016952515, + "learning_rate": 1.7440000000000002e-05, + "loss": 0.0032, + "step": 872 + }, + { + "epoch": 0.0874, + "grad_norm": 0.2699975073337555, + "learning_rate": 1.7480000000000002e-05, + "loss": 0.0025, + "step": 874 + }, + { + "epoch": 0.0876, + "grad_norm": 0.06897442042827606, + "learning_rate": 1.752e-05, + "loss": 0.0288, + "step": 876 + }, + { + "epoch": 0.0878, + "grad_norm": 0.17766621708869934, + "learning_rate": 1.756e-05, + "loss": 0.0065, + "step": 878 + }, + { + "epoch": 0.088, + "grad_norm": 0.2617378532886505, + "learning_rate": 1.76e-05, + "loss": 0.0039, + "step": 880 + }, + { + "epoch": 0.0882, + "grad_norm": 0.0959143191576004, + "learning_rate": 1.764e-05, + "loss": 0.0042, + "step": 882 + }, + { + "epoch": 0.0884, + "grad_norm": 0.6153528690338135, + "learning_rate": 1.768e-05, + "loss": 0.0084, + "step": 884 + }, + { + "epoch": 0.0886, + "grad_norm": 16.4722900390625, + "learning_rate": 1.7720000000000003e-05, + "loss": 0.1227, + "step": 886 + }, + { + "epoch": 0.0888, + "grad_norm": 0.02755027823150158, + "learning_rate": 1.7760000000000003e-05, + "loss": 0.001, + "step": 888 + }, + { + "epoch": 0.089, + "grad_norm": 0.010769467800855637, + "learning_rate": 1.7800000000000002e-05, + "loss": 0.0023, + "step": 890 + }, + { + "epoch": 0.0892, + "grad_norm": 0.8854973912239075, + "learning_rate": 1.7840000000000002e-05, + "loss": 0.0079, + "step": 892 + }, + { + "epoch": 0.0894, + "grad_norm": 2.2318098545074463, + "learning_rate": 1.788e-05, + "loss": 0.0171, + "step": 894 + }, + { + "epoch": 0.0896, + "grad_norm": 1.6873263120651245, + "learning_rate": 1.792e-05, + "loss": 0.0169, + "step": 896 + }, + { + "epoch": 0.0898, + "grad_norm": 0.26432839035987854, + "learning_rate": 1.796e-05, + "loss": 0.0092, + "step": 898 + }, + { + "epoch": 0.09, + "grad_norm": 1.766089677810669, + "learning_rate": 1.8e-05, + "loss": 0.0211, + "step": 900 + }, + { + "epoch": 0.0902, + "grad_norm": 0.02442696876823902, + "learning_rate": 1.8040000000000003e-05, + "loss": 0.0363, + "step": 902 + }, + { + "epoch": 0.0904, + "grad_norm": 5.68058967590332, + "learning_rate": 1.8080000000000003e-05, + "loss": 0.0808, + "step": 904 + }, + { + "epoch": 0.0906, + "grad_norm": 0.22869311273097992, + "learning_rate": 1.8120000000000003e-05, + "loss": 0.0715, + "step": 906 + }, + { + "epoch": 0.0908, + "grad_norm": 0.026124386116862297, + "learning_rate": 1.8160000000000002e-05, + "loss": 0.0667, + "step": 908 + }, + { + "epoch": 0.091, + "grad_norm": 0.10206609964370728, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.0019, + "step": 910 + }, + { + "epoch": 0.0912, + "grad_norm": 0.1673208475112915, + "learning_rate": 1.824e-05, + "loss": 0.0091, + "step": 912 + }, + { + "epoch": 0.0914, + "grad_norm": 0.7121985554695129, + "learning_rate": 1.828e-05, + "loss": 0.0067, + "step": 914 + }, + { + "epoch": 0.0916, + "grad_norm": 0.019836019724607468, + "learning_rate": 1.832e-05, + "loss": 0.0402, + "step": 916 + }, + { + "epoch": 0.0918, + "grad_norm": 0.003126104362308979, + "learning_rate": 1.8360000000000004e-05, + "loss": 0.0011, + "step": 918 + }, + { + "epoch": 0.092, + "grad_norm": 0.18643835186958313, + "learning_rate": 1.8400000000000003e-05, + "loss": 0.1437, + "step": 920 + }, + { + "epoch": 0.0922, + "grad_norm": 0.026968825608491898, + "learning_rate": 1.8440000000000003e-05, + "loss": 0.0006, + "step": 922 + }, + { + "epoch": 0.0924, + "grad_norm": 6.744277477264404, + "learning_rate": 1.8480000000000003e-05, + "loss": 0.0409, + "step": 924 + }, + { + "epoch": 0.0926, + "grad_norm": 0.03759264573454857, + "learning_rate": 1.8520000000000002e-05, + "loss": 0.0006, + "step": 926 + }, + { + "epoch": 0.0928, + "grad_norm": 15.648412704467773, + "learning_rate": 1.8560000000000002e-05, + "loss": 0.1382, + "step": 928 + }, + { + "epoch": 0.093, + "grad_norm": 0.9888555407524109, + "learning_rate": 1.86e-05, + "loss": 0.011, + "step": 930 + }, + { + "epoch": 0.0932, + "grad_norm": 0.038130708038806915, + "learning_rate": 1.864e-05, + "loss": 0.0008, + "step": 932 + }, + { + "epoch": 0.0934, + "grad_norm": 0.0070160808973014355, + "learning_rate": 1.8680000000000004e-05, + "loss": 0.0015, + "step": 934 + }, + { + "epoch": 0.0936, + "grad_norm": 0.04739934578537941, + "learning_rate": 1.8720000000000004e-05, + "loss": 0.0019, + "step": 936 + }, + { + "epoch": 0.0938, + "grad_norm": 0.0046441154554486275, + "learning_rate": 1.876e-05, + "loss": 0.0008, + "step": 938 + }, + { + "epoch": 0.094, + "grad_norm": 18.411754608154297, + "learning_rate": 1.88e-05, + "loss": 0.4128, + "step": 940 + }, + { + "epoch": 0.0942, + "grad_norm": 1.4371658563613892, + "learning_rate": 1.884e-05, + "loss": 0.1021, + "step": 942 + }, + { + "epoch": 0.0944, + "grad_norm": 3.600908041000366, + "learning_rate": 1.8880000000000002e-05, + "loss": 0.0496, + "step": 944 + }, + { + "epoch": 0.0946, + "grad_norm": 5.968558311462402, + "learning_rate": 1.8920000000000002e-05, + "loss": 0.1582, + "step": 946 + }, + { + "epoch": 0.0948, + "grad_norm": 7.8968586921691895, + "learning_rate": 1.896e-05, + "loss": 0.3353, + "step": 948 + }, + { + "epoch": 0.095, + "grad_norm": 7.85651159286499, + "learning_rate": 1.9e-05, + "loss": 0.0873, + "step": 950 + }, + { + "epoch": 0.0952, + "grad_norm": 0.8258829712867737, + "learning_rate": 1.904e-05, + "loss": 0.0103, + "step": 952 + }, + { + "epoch": 0.0954, + "grad_norm": 0.03309421241283417, + "learning_rate": 1.908e-05, + "loss": 0.0724, + "step": 954 + }, + { + "epoch": 0.0956, + "grad_norm": 0.04674983024597168, + "learning_rate": 1.912e-05, + "loss": 0.005, + "step": 956 + }, + { + "epoch": 0.0958, + "grad_norm": 0.6511566638946533, + "learning_rate": 1.916e-05, + "loss": 0.1066, + "step": 958 + }, + { + "epoch": 0.096, + "grad_norm": 0.03737279027700424, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.0249, + "step": 960 + }, + { + "epoch": 0.0962, + "grad_norm": 0.3685421049594879, + "learning_rate": 1.9240000000000002e-05, + "loss": 0.0075, + "step": 962 + }, + { + "epoch": 0.0964, + "grad_norm": 0.04106299579143524, + "learning_rate": 1.9280000000000002e-05, + "loss": 0.001, + "step": 964 + }, + { + "epoch": 0.0966, + "grad_norm": 0.310161828994751, + "learning_rate": 1.932e-05, + "loss": 0.0047, + "step": 966 + }, + { + "epoch": 0.0968, + "grad_norm": 0.9552341103553772, + "learning_rate": 1.936e-05, + "loss": 0.0496, + "step": 968 + }, + { + "epoch": 0.097, + "grad_norm": 4.5019402503967285, + "learning_rate": 1.94e-05, + "loss": 0.0578, + "step": 970 + }, + { + "epoch": 0.0972, + "grad_norm": 1.6543866395950317, + "learning_rate": 1.944e-05, + "loss": 0.0427, + "step": 972 + }, + { + "epoch": 0.0974, + "grad_norm": 0.09543387591838837, + "learning_rate": 1.948e-05, + "loss": 0.0127, + "step": 974 + }, + { + "epoch": 0.0976, + "grad_norm": 1.4422893524169922, + "learning_rate": 1.9520000000000003e-05, + "loss": 0.0224, + "step": 976 + }, + { + "epoch": 0.0978, + "grad_norm": 1.0803011655807495, + "learning_rate": 1.9560000000000002e-05, + "loss": 0.0314, + "step": 978 + }, + { + "epoch": 0.098, + "grad_norm": 0.45399999618530273, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.0287, + "step": 980 + }, + { + "epoch": 0.0982, + "grad_norm": 0.06116114556789398, + "learning_rate": 1.9640000000000002e-05, + "loss": 0.006, + "step": 982 + }, + { + "epoch": 0.0984, + "grad_norm": 0.03305156156420708, + "learning_rate": 1.968e-05, + "loss": 0.0014, + "step": 984 + }, + { + "epoch": 0.0986, + "grad_norm": 6.56599760055542, + "learning_rate": 1.972e-05, + "loss": 0.0603, + "step": 986 + }, + { + "epoch": 0.0988, + "grad_norm": 0.17988504469394684, + "learning_rate": 1.976e-05, + "loss": 0.0824, + "step": 988 + }, + { + "epoch": 0.099, + "grad_norm": 0.037596091628074646, + "learning_rate": 1.98e-05, + "loss": 0.0005, + "step": 990 + }, + { + "epoch": 0.0992, + "grad_norm": 0.05098281800746918, + "learning_rate": 1.9840000000000003e-05, + "loss": 0.012, + "step": 992 + }, + { + "epoch": 0.0994, + "grad_norm": 5.900199890136719, + "learning_rate": 1.9880000000000003e-05, + "loss": 0.0694, + "step": 994 + }, + { + "epoch": 0.0996, + "grad_norm": 0.5512561798095703, + "learning_rate": 1.9920000000000002e-05, + "loss": 0.0066, + "step": 996 + }, + { + "epoch": 0.0998, + "grad_norm": 0.005231560207903385, + "learning_rate": 1.9960000000000002e-05, + "loss": 0.0008, + "step": 998 + }, + { + "epoch": 0.1, + "grad_norm": 13.596817016601562, + "learning_rate": 2e-05, + "loss": 0.0784, + "step": 1000 + }, + { + "epoch": 0.1002, + "grad_norm": 0.15206997096538544, + "learning_rate": 1.9999997563060744e-05, + "loss": 0.0019, + "step": 1002 + }, + { + "epoch": 0.1004, + "grad_norm": 0.11161712557077408, + "learning_rate": 1.9999990252244153e-05, + "loss": 0.0785, + "step": 1004 + }, + { + "epoch": 0.1006, + "grad_norm": 0.049802348017692566, + "learning_rate": 1.9999978067553796e-05, + "loss": 0.0013, + "step": 1006 + }, + { + "epoch": 0.1008, + "grad_norm": 0.050102267414331436, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.0012, + "step": 1008 + }, + { + "epoch": 0.101, + "grad_norm": 0.14536163210868835, + "learning_rate": 1.9999939076577906e-05, + "loss": 0.0019, + "step": 1010 + }, + { + "epoch": 0.1012, + "grad_norm": 0.6797391772270203, + "learning_rate": 1.9999912270311376e-05, + "loss": 0.0242, + "step": 1012 + }, + { + "epoch": 0.1014, + "grad_norm": 0.7126854658126831, + "learning_rate": 1.999988059020909e-05, + "loss": 0.0052, + "step": 1014 + }, + { + "epoch": 0.1016, + "grad_norm": 0.008834795095026493, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.0005, + "step": 1016 + }, + { + "epoch": 0.1018, + "grad_norm": 6.238382339477539, + "learning_rate": 1.999980260856137e-05, + "loss": 0.1402, + "step": 1018 + }, + { + "epoch": 0.102, + "grad_norm": 0.02619868703186512, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.0023, + "step": 1020 + }, + { + "epoch": 0.1022, + "grad_norm": 1.3335634469985962, + "learning_rate": 1.999970513178678e-05, + "loss": 0.0112, + "step": 1022 + }, + { + "epoch": 0.1024, + "grad_norm": 0.6505542397499084, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.088, + "step": 1024 + }, + { + "epoch": 0.1026, + "grad_norm": 1.9595348834991455, + "learning_rate": 1.999958816007535e-05, + "loss": 0.017, + "step": 1026 + }, + { + "epoch": 0.1028, + "grad_norm": 0.11649677902460098, + "learning_rate": 1.99995223636881e-05, + "loss": 0.0029, + "step": 1028 + }, + { + "epoch": 0.103, + "grad_norm": 1.7934468984603882, + "learning_rate": 1.9999451693655125e-05, + "loss": 0.0117, + "step": 1030 + }, + { + "epoch": 0.1032, + "grad_norm": 0.017943479120731354, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.0007, + "step": 1032 + }, + { + "epoch": 0.1034, + "grad_norm": 6.529239177703857, + "learning_rate": 1.9999295732792146e-05, + "loss": 0.0768, + "step": 1034 + }, + { + "epoch": 0.1036, + "grad_norm": 0.6557533740997314, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.0103, + "step": 1036 + }, + { + "epoch": 0.1038, + "grad_norm": 0.018608463928103447, + "learning_rate": 1.9999120277790477e-05, + "loss": 0.0123, + "step": 1038 + }, + { + "epoch": 0.104, + "grad_norm": 0.14448994398117065, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.0034, + "step": 1040 + }, + { + "epoch": 0.1042, + "grad_norm": 0.1843729168176651, + "learning_rate": 1.9998925328992175e-05, + "loss": 0.0024, + "step": 1042 + }, + { + "epoch": 0.1044, + "grad_norm": 0.057048432528972626, + "learning_rate": 1.999882054453657e-05, + "loss": 0.0031, + "step": 1044 + }, + { + "epoch": 0.1046, + "grad_norm": 0.07556223124265671, + "learning_rate": 1.9998710886777298e-05, + "loss": 0.0028, + "step": 1046 + }, + { + "epoch": 0.1048, + "grad_norm": 0.39768025279045105, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.007, + "step": 1048 + }, + { + "epoch": 0.105, + "grad_norm": 0.027762170881032944, + "learning_rate": 1.9998476951563914e-05, + "loss": 0.0353, + "step": 1050 + }, + { + "epoch": 0.1052, + "grad_norm": 0.031105121597647667, + "learning_rate": 1.9998352674223816e-05, + "loss": 0.0041, + "step": 1052 + }, + { + "epoch": 0.1054, + "grad_norm": 0.07511039823293686, + "learning_rate": 1.9998223523808092e-05, + "loss": 0.0939, + "step": 1054 + }, + { + "epoch": 0.1056, + "grad_norm": 2.1802122592926025, + "learning_rate": 1.999808950037968e-05, + "loss": 0.0123, + "step": 1056 + }, + { + "epoch": 0.1058, + "grad_norm": 22.36979103088379, + "learning_rate": 1.99979506040039e-05, + "loss": 0.1622, + "step": 1058 + }, + { + "epoch": 0.106, + "grad_norm": 0.035107679665088654, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.0122, + "step": 1060 + }, + { + "epoch": 0.1062, + "grad_norm": 3.012120246887207, + "learning_rate": 1.9997658192683412e-05, + "loss": 0.0302, + "step": 1062 + }, + { + "epoch": 0.1064, + "grad_norm": 0.10728099197149277, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.0411, + "step": 1064 + }, + { + "epoch": 0.1066, + "grad_norm": 0.40667951107025146, + "learning_rate": 1.9997346290416703e-05, + "loss": 0.0757, + "step": 1066 + }, + { + "epoch": 0.1068, + "grad_norm": 0.4692269265651703, + "learning_rate": 1.999718303036705e-05, + "loss": 0.0773, + "step": 1068 + }, + { + "epoch": 0.107, + "grad_norm": 4.770490646362305, + "learning_rate": 1.9997014897811834e-05, + "loss": 0.0836, + "step": 1070 + }, + { + "epoch": 0.1072, + "grad_norm": 0.025640618056058884, + "learning_rate": 1.9996841892833e-05, + "loss": 0.0007, + "step": 1072 + }, + { + "epoch": 0.1074, + "grad_norm": 0.18866486847400665, + "learning_rate": 1.999666401551487e-05, + "loss": 0.0897, + "step": 1074 + }, + { + "epoch": 0.1076, + "grad_norm": 0.05359744653105736, + "learning_rate": 1.9996481265944146e-05, + "loss": 0.1241, + "step": 1076 + }, + { + "epoch": 0.1078, + "grad_norm": 0.09065702557563782, + "learning_rate": 1.9996293644209886e-05, + "loss": 0.0021, + "step": 1078 + }, + { + "epoch": 0.108, + "grad_norm": 0.033856190741062164, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.1648, + "step": 1080 + }, + { + "epoch": 0.1082, + "grad_norm": 0.3671296238899231, + "learning_rate": 1.9995903784618936e-05, + "loss": 0.0031, + "step": 1082 + }, + { + "epoch": 0.1084, + "grad_norm": 0.11223959177732468, + "learning_rate": 1.9995701546952252e-05, + "loss": 0.004, + "step": 1084 + }, + { + "epoch": 0.1086, + "grad_norm": 3.1531410217285156, + "learning_rate": 1.9995494437502064e-05, + "loss": 0.1758, + "step": 1086 + }, + { + "epoch": 0.1088, + "grad_norm": 3.3224568367004395, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.0426, + "step": 1088 + }, + { + "epoch": 0.109, + "grad_norm": 5.398089408874512, + "learning_rate": 1.9995065603657317e-05, + "loss": 0.1217, + "step": 1090 + }, + { + "epoch": 0.1092, + "grad_norm": 1.1839919090270996, + "learning_rate": 1.999484387947177e-05, + "loss": 0.0161, + "step": 1092 + }, + { + "epoch": 0.1094, + "grad_norm": 0.03067704848945141, + "learning_rate": 1.999461728392073e-05, + "loss": 0.0168, + "step": 1094 + }, + { + "epoch": 0.1096, + "grad_norm": 0.26849377155303955, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.0318, + "step": 1096 + }, + { + "epoch": 0.1098, + "grad_norm": 0.05761175975203514, + "learning_rate": 1.9994149479166324e-05, + "loss": 0.0613, + "step": 1098 + }, + { + "epoch": 0.11, + "grad_norm": 2.381849765777588, + "learning_rate": 1.999390827019096e-05, + "loss": 0.0227, + "step": 1100 + }, + { + "epoch": 0.1102, + "grad_norm": 0.045722026377916336, + "learning_rate": 1.999366219030611e-05, + "loss": 0.0448, + "step": 1102 + }, + { + "epoch": 0.1104, + "grad_norm": 0.2772955894470215, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.0035, + "step": 1104 + }, + { + "epoch": 0.1106, + "grad_norm": 2.4110190868377686, + "learning_rate": 1.999315541829008e-05, + "loss": 0.196, + "step": 1106 + }, + { + "epoch": 0.1108, + "grad_norm": 0.9082090854644775, + "learning_rate": 1.9992894726405894e-05, + "loss": 0.0106, + "step": 1108 + }, + { + "epoch": 0.111, + "grad_norm": 0.3917919099330902, + "learning_rate": 1.999262916410621e-05, + "loss": 0.006, + "step": 1110 + }, + { + "epoch": 0.1112, + "grad_norm": 0.14785261452198029, + "learning_rate": 1.999235873152047e-05, + "loss": 0.0111, + "step": 1112 + }, + { + "epoch": 0.1114, + "grad_norm": 0.11184711754322052, + "learning_rate": 1.999208342878047e-05, + "loss": 0.0022, + "step": 1114 + }, + { + "epoch": 0.1116, + "grad_norm": 7.117114067077637, + "learning_rate": 1.9991803256020393e-05, + "loss": 0.13, + "step": 1116 + }, + { + "epoch": 0.1118, + "grad_norm": 0.39685356616973877, + "learning_rate": 1.9991518213376787e-05, + "loss": 0.0164, + "step": 1118 + }, + { + "epoch": 0.112, + "grad_norm": 0.030006125569343567, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.0038, + "step": 1120 + }, + { + "epoch": 0.1122, + "grad_norm": 17.81867790222168, + "learning_rate": 1.9990933518997086e-05, + "loss": 0.5821, + "step": 1122 + }, + { + "epoch": 0.1124, + "grad_norm": 0.05829422548413277, + "learning_rate": 1.9990633867545956e-05, + "loss": 0.0032, + "step": 1124 + }, + { + "epoch": 0.1126, + "grad_norm": 1.148648977279663, + "learning_rate": 1.999032934678125e-05, + "loss": 0.0125, + "step": 1126 + }, + { + "epoch": 0.1128, + "grad_norm": 2.143779993057251, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.0176, + "step": 1128 + }, + { + "epoch": 0.113, + "grad_norm": 1.600830316543579, + "learning_rate": 1.998970569790715e-05, + "loss": 0.0276, + "step": 1130 + }, + { + "epoch": 0.1132, + "grad_norm": 0.03173547238111496, + "learning_rate": 1.9989386570101716e-05, + "loss": 0.0061, + "step": 1132 + }, + { + "epoch": 0.1134, + "grad_norm": 0.18635587394237518, + "learning_rate": 1.9989062573590618e-05, + "loss": 0.0012, + "step": 1134 + }, + { + "epoch": 0.1136, + "grad_norm": 0.23560801148414612, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.022, + "step": 1136 + }, + { + "epoch": 0.1138, + "grad_norm": 0.5049400329589844, + "learning_rate": 1.998839997508546e-05, + "loss": 0.0086, + "step": 1138 + }, + { + "epoch": 0.114, + "grad_norm": 0.785917341709137, + "learning_rate": 1.9988061373414342e-05, + "loss": 0.0064, + "step": 1140 + }, + { + "epoch": 0.1142, + "grad_norm": 2.8807077407836914, + "learning_rate": 1.9987717903683447e-05, + "loss": 0.0225, + "step": 1142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.18683984875679016, + "learning_rate": 1.998736956606018e-05, + "loss": 0.2759, + "step": 1144 + }, + { + "epoch": 0.1146, + "grad_norm": 0.034841280430555344, + "learning_rate": 1.9987016360714307e-05, + "loss": 0.0009, + "step": 1146 + }, + { + "epoch": 0.1148, + "grad_norm": 0.19769151508808136, + "learning_rate": 1.998665828781799e-05, + "loss": 0.0846, + "step": 1148 + }, + { + "epoch": 0.115, + "grad_norm": 0.25113654136657715, + "learning_rate": 1.9986295347545738e-05, + "loss": 0.0034, + "step": 1150 + }, + { + "epoch": 0.1152, + "grad_norm": 4.652162075042725, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.1006, + "step": 1152 + }, + { + "epoch": 0.1154, + "grad_norm": 0.010491312481462955, + "learning_rate": 1.9985554865583394e-05, + "loss": 0.0003, + "step": 1154 + }, + { + "epoch": 0.1156, + "grad_norm": 2.0099501609802246, + "learning_rate": 1.99851773242542e-05, + "loss": 0.045, + "step": 1156 + }, + { + "epoch": 0.1158, + "grad_norm": 0.12630201876163483, + "learning_rate": 1.9984794916270876e-05, + "loss": 0.01, + "step": 1158 + }, + { + "epoch": 0.116, + "grad_norm": 0.010731992311775684, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.0023, + "step": 1160 + }, + { + "epoch": 0.1162, + "grad_norm": 0.06892300397157669, + "learning_rate": 1.998401550108975e-05, + "loss": 0.0028, + "step": 1162 + }, + { + "epoch": 0.1164, + "grad_norm": 0.6083425879478455, + "learning_rate": 1.9983618494271825e-05, + "loss": 0.0064, + "step": 1164 + }, + { + "epoch": 0.1166, + "grad_norm": 0.023032793775200844, + "learning_rate": 1.9983216621559525e-05, + "loss": 0.0114, + "step": 1166 + }, + { + "epoch": 0.1168, + "grad_norm": 0.03522677719593048, + "learning_rate": 1.998280988314872e-05, + "loss": 0.0179, + "step": 1168 + }, + { + "epoch": 0.117, + "grad_norm": 0.00437203049659729, + "learning_rate": 1.9982398279237657e-05, + "loss": 0.0715, + "step": 1170 + }, + { + "epoch": 0.1172, + "grad_norm": 0.7062327861785889, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.0022, + "step": 1172 + }, + { + "epoch": 0.1174, + "grad_norm": 0.1941969096660614, + "learning_rate": 1.998156047571954e-05, + "loss": 0.0376, + "step": 1174 + }, + { + "epoch": 0.1176, + "grad_norm": 0.4734842777252197, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.0048, + "step": 1176 + }, + { + "epoch": 0.1178, + "grad_norm": 1.0663330554962158, + "learning_rate": 1.9980703212638522e-05, + "loss": 0.0748, + "step": 1178 + }, + { + "epoch": 0.118, + "grad_norm": 0.43177008628845215, + "learning_rate": 1.9980267284282718e-05, + "loss": 0.0046, + "step": 1180 + }, + { + "epoch": 0.1182, + "grad_norm": 0.04091016203165054, + "learning_rate": 1.997982649166588e-05, + "loss": 0.0019, + "step": 1182 + }, + { + "epoch": 0.1184, + "grad_norm": 1.5347044467926025, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.005, + "step": 1184 + }, + { + "epoch": 0.1186, + "grad_norm": 0.7050328850746155, + "learning_rate": 1.9978930314510826e-05, + "loss": 0.0129, + "step": 1186 + }, + { + "epoch": 0.1188, + "grad_norm": 11.314016342163086, + "learning_rate": 1.9978474930409396e-05, + "loss": 0.0336, + "step": 1188 + }, + { + "epoch": 0.119, + "grad_norm": 0.19373276829719543, + "learning_rate": 1.9978014682920503e-05, + "loss": 0.0023, + "step": 1190 + }, + { + "epoch": 0.1192, + "grad_norm": 0.010127636604011059, + "learning_rate": 1.997754957226847e-05, + "loss": 0.0087, + "step": 1192 + }, + { + "epoch": 0.1194, + "grad_norm": 0.002806772943586111, + "learning_rate": 1.9977079598679978e-05, + "loss": 0.1266, + "step": 1194 + }, + { + "epoch": 0.1196, + "grad_norm": 2.620042562484741, + "learning_rate": 1.99766047623841e-05, + "loss": 0.3249, + "step": 1196 + }, + { + "epoch": 0.1198, + "grad_norm": 2.8222479820251465, + "learning_rate": 1.9976125063612254e-05, + "loss": 0.0498, + "step": 1198 + }, + { + "epoch": 0.12, + "grad_norm": 7.3607497215271, + "learning_rate": 1.9975640502598243e-05, + "loss": 0.103, + "step": 1200 + }, + { + "epoch": 0.1202, + "grad_norm": 0.028434379026293755, + "learning_rate": 1.9975151079578238e-05, + "loss": 0.0137, + "step": 1202 + }, + { + "epoch": 0.1204, + "grad_norm": 0.0145788649097085, + "learning_rate": 1.9974656794790777e-05, + "loss": 0.0118, + "step": 1204 + }, + { + "epoch": 0.1206, + "grad_norm": 2.804687738418579, + "learning_rate": 1.9974157648476768e-05, + "loss": 0.07, + "step": 1206 + }, + { + "epoch": 0.1208, + "grad_norm": 0.0058864871971309185, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.0011, + "step": 1208 + }, + { + "epoch": 0.121, + "grad_norm": 0.4885385036468506, + "learning_rate": 1.997314477224458e-05, + "loss": 0.0069, + "step": 1210 + }, + { + "epoch": 0.1212, + "grad_norm": 0.08892624080181122, + "learning_rate": 1.997263104282007e-05, + "loss": 0.0012, + "step": 1212 + }, + { + "epoch": 0.1214, + "grad_norm": 0.2034476399421692, + "learning_rate": 1.997211245285634e-05, + "loss": 0.031, + "step": 1214 + }, + { + "epoch": 0.1216, + "grad_norm": 0.019697565585374832, + "learning_rate": 1.997158900260614e-05, + "loss": 0.0014, + "step": 1216 + }, + { + "epoch": 0.1218, + "grad_norm": 2.9700591564178467, + "learning_rate": 1.99710606923246e-05, + "loss": 0.1406, + "step": 1218 + }, + { + "epoch": 0.122, + "grad_norm": 0.2855655252933502, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.0074, + "step": 1220 + }, + { + "epoch": 0.1222, + "grad_norm": 4.746980667114258, + "learning_rate": 1.996998949269982e-05, + "loss": 0.028, + "step": 1222 + }, + { + "epoch": 0.1224, + "grad_norm": 0.017970973625779152, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.0212, + "step": 1224 + }, + { + "epoch": 0.1226, + "grad_norm": 0.20111936330795288, + "learning_rate": 1.996889885607036e-05, + "loss": 0.0024, + "step": 1226 + }, + { + "epoch": 0.1228, + "grad_norm": 0.062297046184539795, + "learning_rate": 1.9968346249541848e-05, + "loss": 0.0007, + "step": 1228 + }, + { + "epoch": 0.123, + "grad_norm": 2.6733858585357666, + "learning_rate": 1.9967788784562474e-05, + "loss": 0.0316, + "step": 1230 + }, + { + "epoch": 0.1232, + "grad_norm": 9.275200843811035, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.1874, + "step": 1232 + }, + { + "epoch": 0.1234, + "grad_norm": 0.6728002429008484, + "learning_rate": 1.99666592803403e-05, + "loss": 0.0203, + "step": 1234 + }, + { + "epoch": 0.1236, + "grad_norm": 1.0480875968933105, + "learning_rate": 1.996608724164801e-05, + "loss": 0.0376, + "step": 1236 + }, + { + "epoch": 0.1238, + "grad_norm": 0.07690905034542084, + "learning_rate": 1.9965510345605866e-05, + "loss": 0.0015, + "step": 1238 + }, + { + "epoch": 0.124, + "grad_norm": 0.03618597611784935, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.0004, + "step": 1240 + }, + { + "epoch": 0.1242, + "grad_norm": 0.02543410286307335, + "learning_rate": 1.996434198259908e-05, + "loss": 0.214, + "step": 1242 + }, + { + "epoch": 0.1244, + "grad_norm": 4.129934787750244, + "learning_rate": 1.9963750516203887e-05, + "loss": 0.0238, + "step": 1244 + }, + { + "epoch": 0.1246, + "grad_norm": 3.3425614833831787, + "learning_rate": 1.9963154193597728e-05, + "loss": 0.018, + "step": 1246 + }, + { + "epoch": 0.1248, + "grad_norm": 0.6469829082489014, + "learning_rate": 1.996255301507125e-05, + "loss": 0.027, + "step": 1248 + }, + { + "epoch": 0.125, + "grad_norm": 0.02871621772646904, + "learning_rate": 1.9961946980917457e-05, + "loss": 0.0016, + "step": 1250 + }, + { + "epoch": 0.1252, + "grad_norm": 3.0828857421875, + "learning_rate": 1.9961336091431728e-05, + "loss": 0.0269, + "step": 1252 + }, + { + "epoch": 0.1254, + "grad_norm": 0.10735322535037994, + "learning_rate": 1.9960720346911798e-05, + "loss": 0.0195, + "step": 1254 + }, + { + "epoch": 0.1256, + "grad_norm": 0.014837349765002728, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.0018, + "step": 1256 + }, + { + "epoch": 0.1258, + "grad_norm": 3.7286489009857178, + "learning_rate": 1.995947429397213e-05, + "loss": 0.4751, + "step": 1258 + }, + { + "epoch": 0.126, + "grad_norm": 2.4342591762542725, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.4824, + "step": 1260 + }, + { + "epoch": 0.1262, + "grad_norm": 0.49592849612236023, + "learning_rate": 1.9958208824527702e-05, + "loss": 0.0124, + "step": 1262 + }, + { + "epoch": 0.1264, + "grad_norm": 2.0224084854125977, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.2662, + "step": 1264 + }, + { + "epoch": 0.1266, + "grad_norm": 4.23607873916626, + "learning_rate": 1.9956923941045613e-05, + "loss": 0.2974, + "step": 1266 + }, + { + "epoch": 0.1268, + "grad_norm": 4.137272357940674, + "learning_rate": 1.995627421982176e-05, + "loss": 0.1221, + "step": 1268 + }, + { + "epoch": 0.127, + "grad_norm": 0.03634059429168701, + "learning_rate": 1.99556196460308e-05, + "loss": 0.1744, + "step": 1270 + }, + { + "epoch": 0.1272, + "grad_norm": 0.15639591217041016, + "learning_rate": 1.995496021999177e-05, + "loss": 0.0274, + "step": 1272 + }, + { + "epoch": 0.1274, + "grad_norm": 0.028070412576198578, + "learning_rate": 1.9954295942026065e-05, + "loss": 0.0035, + "step": 1274 + }, + { + "epoch": 0.1276, + "grad_norm": 0.045294590294361115, + "learning_rate": 1.995362681245744e-05, + "loss": 0.0317, + "step": 1276 + }, + { + "epoch": 0.1278, + "grad_norm": 3.639981508255005, + "learning_rate": 1.9952952831612027e-05, + "loss": 0.0602, + "step": 1278 + }, + { + "epoch": 0.128, + "grad_norm": 0.1096467673778534, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.0047, + "step": 1280 + }, + { + "epoch": 0.1282, + "grad_norm": 1.2931418418884277, + "learning_rate": 1.9951590317407152e-05, + "loss": 0.0192, + "step": 1282 + }, + { + "epoch": 0.1284, + "grad_norm": 2.0058557987213135, + "learning_rate": 1.9950901784711765e-05, + "loss": 0.0204, + "step": 1284 + }, + { + "epoch": 0.1286, + "grad_norm": 3.0547780990600586, + "learning_rate": 1.9950208402067735e-05, + "loss": 0.032, + "step": 1286 + }, + { + "epoch": 0.1288, + "grad_norm": 1.2890464067459106, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.0279, + "step": 1288 + }, + { + "epoch": 0.129, + "grad_norm": 0.10376125574111938, + "learning_rate": 1.9948807088287884e-05, + "loss": 0.0479, + "step": 1290 + }, + { + "epoch": 0.1292, + "grad_norm": 0.19125308096408844, + "learning_rate": 1.994809915783505e-05, + "loss": 0.0033, + "step": 1292 + }, + { + "epoch": 0.1294, + "grad_norm": 0.1831279844045639, + "learning_rate": 1.9947386378799534e-05, + "loss": 0.0211, + "step": 1294 + }, + { + "epoch": 0.1296, + "grad_norm": 0.052361588925123215, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.139, + "step": 1296 + }, + { + "epoch": 0.1298, + "grad_norm": 0.05507492646574974, + "learning_rate": 1.9945946276372435e-05, + "loss": 0.0342, + "step": 1298 + }, + { + "epoch": 0.13, + "grad_norm": 0.02012588456273079, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.0009, + "step": 1300 + }, + { + "epoch": 0.1302, + "grad_norm": 6.117043972015381, + "learning_rate": 1.9944486783814135e-05, + "loss": 0.0974, + "step": 1302 + }, + { + "epoch": 0.1304, + "grad_norm": 0.02295588329434395, + "learning_rate": 1.994374976712348e-05, + "loss": 0.001, + "step": 1304 + }, + { + "epoch": 0.1306, + "grad_norm": 0.04033515229821205, + "learning_rate": 1.994300790396999e-05, + "loss": 0.0017, + "step": 1306 + }, + { + "epoch": 0.1308, + "grad_norm": 11.651762008666992, + "learning_rate": 1.9942261194715236e-05, + "loss": 0.3024, + "step": 1308 + }, + { + "epoch": 0.131, + "grad_norm": 4.510021686553955, + "learning_rate": 1.9941509639723155e-05, + "loss": 0.0678, + "step": 1310 + }, + { + "epoch": 0.1312, + "grad_norm": 0.16653968393802643, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.007, + "step": 1312 + }, + { + "epoch": 0.1314, + "grad_norm": 7.62673282623291, + "learning_rate": 1.993999199399457e-05, + "loss": 0.2535, + "step": 1314 + }, + { + "epoch": 0.1316, + "grad_norm": 3.1238396167755127, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.0312, + "step": 1316 + }, + { + "epoch": 0.1318, + "grad_norm": 2.193711996078491, + "learning_rate": 1.993845496974297e-05, + "loss": 0.122, + "step": 1318 + }, + { + "epoch": 0.132, + "grad_norm": 1.0461170673370361, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.0064, + "step": 1320 + }, + { + "epoch": 0.1322, + "grad_norm": 6.287940979003906, + "learning_rate": 1.993689856996485e-05, + "loss": 0.0487, + "step": 1322 + }, + { + "epoch": 0.1324, + "grad_norm": 0.9636908769607544, + "learning_rate": 1.9936113105200085e-05, + "loss": 0.0906, + "step": 1324 + }, + { + "epoch": 0.1326, + "grad_norm": 0.7021270990371704, + "learning_rate": 1.99353227976945e-05, + "loss": 0.0203, + "step": 1326 + }, + { + "epoch": 0.1328, + "grad_norm": 0.052062153816223145, + "learning_rate": 1.9934527647833276e-05, + "loss": 0.0081, + "step": 1328 + }, + { + "epoch": 0.133, + "grad_norm": 0.16536319255828857, + "learning_rate": 1.9933727656003964e-05, + "loss": 0.0267, + "step": 1330 + }, + { + "epoch": 0.1332, + "grad_norm": 3.9922337532043457, + "learning_rate": 1.993292282259647e-05, + "loss": 0.0386, + "step": 1332 + }, + { + "epoch": 0.1334, + "grad_norm": 1.616478681564331, + "learning_rate": 1.9932113148003057e-05, + "loss": 0.0287, + "step": 1334 + }, + { + "epoch": 0.1336, + "grad_norm": 0.028906622901558876, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.0121, + "step": 1336 + }, + { + "epoch": 0.1338, + "grad_norm": 2.2539427280426025, + "learning_rate": 1.9930479276839347e-05, + "loss": 0.1497, + "step": 1338 + }, + { + "epoch": 0.134, + "grad_norm": 0.3397001326084137, + "learning_rate": 1.992965508106537e-05, + "loss": 0.0112, + "step": 1340 + }, + { + "epoch": 0.1342, + "grad_norm": 0.0142103536054492, + "learning_rate": 1.9928826045698138e-05, + "loss": 0.0012, + "step": 1342 + }, + { + "epoch": 0.1344, + "grad_norm": 0.06143352389335632, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.0017, + "step": 1344 + }, + { + "epoch": 0.1346, + "grad_norm": 7.655733585357666, + "learning_rate": 1.99271534578025e-05, + "loss": 0.0724, + "step": 1346 + }, + { + "epoch": 0.1348, + "grad_norm": 0.03948098048567772, + "learning_rate": 1.992630990608929e-05, + "loss": 0.0407, + "step": 1348 + }, + { + "epoch": 0.135, + "grad_norm": 0.06326455622911453, + "learning_rate": 1.9925461516413224e-05, + "loss": 0.1956, + "step": 1350 + }, + { + "epoch": 0.1352, + "grad_norm": 0.04573410749435425, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.0112, + "step": 1352 + }, + { + "epoch": 0.1354, + "grad_norm": 8.840446472167969, + "learning_rate": 1.9923750224828833e-05, + "loss": 0.137, + "step": 1354 + }, + { + "epoch": 0.1356, + "grad_norm": 0.39997485280036926, + "learning_rate": 1.992288732375458e-05, + "loss": 0.1451, + "step": 1356 + }, + { + "epoch": 0.1358, + "grad_norm": 0.08093598484992981, + "learning_rate": 1.9922019586385587e-05, + "loss": 0.0197, + "step": 1358 + }, + { + "epoch": 0.136, + "grad_norm": 0.26595836877822876, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.0059, + "step": 1360 + }, + { + "epoch": 0.1362, + "grad_norm": 4.732010364532471, + "learning_rate": 1.9920269604457444e-05, + "loss": 0.1717, + "step": 1362 + }, + { + "epoch": 0.1364, + "grad_norm": 0.7414751648902893, + "learning_rate": 1.9919387360751216e-05, + "loss": 0.0347, + "step": 1364 + }, + { + "epoch": 0.1366, + "grad_norm": 1.034635066986084, + "learning_rate": 1.991850028245609e-05, + "loss": 0.3654, + "step": 1366 + }, + { + "epoch": 0.1368, + "grad_norm": 0.6977332234382629, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.0153, + "step": 1368 + }, + { + "epoch": 0.137, + "grad_norm": 1.4707825183868408, + "learning_rate": 1.9916711623830904e-05, + "loss": 0.0355, + "step": 1370 + }, + { + "epoch": 0.1372, + "grad_norm": 0.4493428170681, + "learning_rate": 1.9915810044372618e-05, + "loss": 0.0179, + "step": 1372 + }, + { + "epoch": 0.1374, + "grad_norm": 0.42118555307388306, + "learning_rate": 1.9914903632068975e-05, + "loss": 0.0076, + "step": 1374 + }, + { + "epoch": 0.1376, + "grad_norm": 0.11252127587795258, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.0074, + "step": 1376 + }, + { + "epoch": 0.1378, + "grad_norm": 0.16665929555892944, + "learning_rate": 1.9913076310695068e-05, + "loss": 0.021, + "step": 1378 + }, + { + "epoch": 0.138, + "grad_norm": 0.2285078763961792, + "learning_rate": 1.991215540251542e-05, + "loss": 0.0148, + "step": 1380 + }, + { + "epoch": 0.1382, + "grad_norm": 3.302819013595581, + "learning_rate": 1.991122966327164e-05, + "loss": 0.0328, + "step": 1382 + }, + { + "epoch": 0.1384, + "grad_norm": 0.7041147947311401, + "learning_rate": 1.991029909341493e-05, + "loss": 0.0096, + "step": 1384 + }, + { + "epoch": 0.1386, + "grad_norm": 1.0595413446426392, + "learning_rate": 1.9909363693398828e-05, + "loss": 0.0115, + "step": 1386 + }, + { + "epoch": 0.1388, + "grad_norm": 0.36925196647644043, + "learning_rate": 1.9908423463679246e-05, + "loss": 0.0822, + "step": 1388 + }, + { + "epoch": 0.139, + "grad_norm": 5.077276706695557, + "learning_rate": 1.9907478404714438e-05, + "loss": 0.1734, + "step": 1390 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5629354119300842, + "learning_rate": 1.990652851696501e-05, + "loss": 0.0108, + "step": 1392 + }, + { + "epoch": 0.1394, + "grad_norm": 2.0771939754486084, + "learning_rate": 1.990557380089393e-05, + "loss": 0.1112, + "step": 1394 + }, + { + "epoch": 0.1396, + "grad_norm": 0.6706334948539734, + "learning_rate": 1.9904614256966514e-05, + "loss": 0.1431, + "step": 1396 + }, + { + "epoch": 0.1398, + "grad_norm": 0.7656484246253967, + "learning_rate": 1.990364988565043e-05, + "loss": 0.0202, + "step": 1398 + }, + { + "epoch": 0.14, + "grad_norm": 0.1375696212053299, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.0132, + "step": 1400 + }, + { + "epoch": 0.1402, + "grad_norm": 0.5605471134185791, + "learning_rate": 1.990170666273471e-05, + "loss": 0.0063, + "step": 1402 + }, + { + "epoch": 0.1404, + "grad_norm": 0.04822947084903717, + "learning_rate": 1.9900727812082177e-05, + "loss": 0.0143, + "step": 1404 + }, + { + "epoch": 0.1406, + "grad_norm": 1.94780695438385, + "learning_rate": 1.989974413593518e-05, + "loss": 0.1986, + "step": 1406 + }, + { + "epoch": 0.1408, + "grad_norm": 0.12060584872961044, + "learning_rate": 1.989875563477316e-05, + "loss": 0.0022, + "step": 1408 + }, + { + "epoch": 0.141, + "grad_norm": 0.12252520024776459, + "learning_rate": 1.989776230907789e-05, + "loss": 0.0033, + "step": 1410 + }, + { + "epoch": 0.1412, + "grad_norm": 0.5971289873123169, + "learning_rate": 1.989676415933351e-05, + "loss": 0.0123, + "step": 1412 + }, + { + "epoch": 0.1414, + "grad_norm": 0.34857481718063354, + "learning_rate": 1.989576118602651e-05, + "loss": 0.1184, + "step": 1414 + }, + { + "epoch": 0.1416, + "grad_norm": 0.3228130340576172, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.0088, + "step": 1416 + }, + { + "epoch": 0.1418, + "grad_norm": 0.3592380881309509, + "learning_rate": 1.9893740770682334e-05, + "loss": 0.0057, + "step": 1418 + }, + { + "epoch": 0.142, + "grad_norm": 0.21173013746738434, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.0525, + "step": 1420 + }, + { + "epoch": 0.1422, + "grad_norm": 1.0427559614181519, + "learning_rate": 1.9891701066984264e-05, + "loss": 0.0216, + "step": 1422 + }, + { + "epoch": 0.1424, + "grad_norm": 0.01450520008802414, + "learning_rate": 1.9890673983243708e-05, + "loss": 0.0626, + "step": 1424 + }, + { + "epoch": 0.1426, + "grad_norm": 0.08278901129961014, + "learning_rate": 1.9889642078908805e-05, + "loss": 0.002, + "step": 1426 + }, + { + "epoch": 0.1428, + "grad_norm": 0.20296621322631836, + "learning_rate": 1.9888605354482494e-05, + "loss": 0.0087, + "step": 1428 + }, + { + "epoch": 0.143, + "grad_norm": 3.787844181060791, + "learning_rate": 1.988756381047006e-05, + "loss": 0.4084, + "step": 1430 + }, + { + "epoch": 0.1432, + "grad_norm": 0.06036250665783882, + "learning_rate": 1.988651744737914e-05, + "loss": 0.0442, + "step": 1432 + }, + { + "epoch": 0.1434, + "grad_norm": 1.0307059288024902, + "learning_rate": 1.9885466265719723e-05, + "loss": 0.0152, + "step": 1434 + }, + { + "epoch": 0.1436, + "grad_norm": 0.15952594578266144, + "learning_rate": 1.9884410266004134e-05, + "loss": 0.0152, + "step": 1436 + }, + { + "epoch": 0.1438, + "grad_norm": 0.09727516770362854, + "learning_rate": 1.988334944874706e-05, + "loss": 0.01, + "step": 1438 + }, + { + "epoch": 0.144, + "grad_norm": 0.1701594591140747, + "learning_rate": 1.988228381446553e-05, + "loss": 0.0073, + "step": 1440 + }, + { + "epoch": 0.1442, + "grad_norm": 0.1255943477153778, + "learning_rate": 1.988121336367892e-05, + "loss": 0.0041, + "step": 1442 + }, + { + "epoch": 0.1444, + "grad_norm": 0.11053822189569473, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.0077, + "step": 1444 + }, + { + "epoch": 0.1446, + "grad_norm": 0.026598094031214714, + "learning_rate": 1.9879058014679704e-05, + "loss": 0.0862, + "step": 1446 + }, + { + "epoch": 0.1448, + "grad_norm": 0.04089822992682457, + "learning_rate": 1.987797311751759e-05, + "loss": 0.0008, + "step": 1448 + }, + { + "epoch": 0.145, + "grad_norm": 0.3417094051837921, + "learning_rate": 1.9876883405951378e-05, + "loss": 0.0048, + "step": 1450 + }, + { + "epoch": 0.1452, + "grad_norm": 0.053567614406347275, + "learning_rate": 1.9875788880512183e-05, + "loss": 0.0014, + "step": 1452 + }, + { + "epoch": 0.1454, + "grad_norm": 0.09175176918506622, + "learning_rate": 1.9874689541733455e-05, + "loss": 0.0192, + "step": 1454 + }, + { + "epoch": 0.1456, + "grad_norm": 0.8265829086303711, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.0104, + "step": 1456 + }, + { + "epoch": 0.1458, + "grad_norm": 0.05232410877943039, + "learning_rate": 1.9872476426302983e-05, + "loss": 0.0025, + "step": 1458 + }, + { + "epoch": 0.146, + "grad_norm": 0.4126329720020294, + "learning_rate": 1.987136265072988e-05, + "loss": 0.0086, + "step": 1460 + }, + { + "epoch": 0.1462, + "grad_norm": 0.02060386724770069, + "learning_rate": 1.987024406397454e-05, + "loss": 0.0018, + "step": 1462 + }, + { + "epoch": 0.1464, + "grad_norm": 0.015154527500271797, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.0007, + "step": 1464 + }, + { + "epoch": 0.1466, + "grad_norm": 0.38272103667259216, + "learning_rate": 1.986799245910024e-05, + "loss": 0.0036, + "step": 1466 + }, + { + "epoch": 0.1468, + "grad_norm": 0.17997866868972778, + "learning_rate": 1.986685944207868e-05, + "loss": 0.1131, + "step": 1468 + }, + { + "epoch": 0.147, + "grad_norm": 2.0380289554595947, + "learning_rate": 1.9865721616069695e-05, + "loss": 0.0347, + "step": 1470 + }, + { + "epoch": 0.1472, + "grad_norm": 0.03604673221707344, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.0011, + "step": 1472 + }, + { + "epoch": 0.1474, + "grad_norm": 0.08283986151218414, + "learning_rate": 1.9863431539310033e-05, + "loss": 0.0031, + "step": 1474 + }, + { + "epoch": 0.1476, + "grad_norm": 0.018679983913898468, + "learning_rate": 1.986227928967551e-05, + "loss": 0.0104, + "step": 1476 + }, + { + "epoch": 0.1478, + "grad_norm": 0.8941382169723511, + "learning_rate": 1.9861122233285873e-05, + "loss": 0.0137, + "step": 1478 + }, + { + "epoch": 0.148, + "grad_norm": 0.11180183291435242, + "learning_rate": 1.985996037070505e-05, + "loss": 0.0075, + "step": 1480 + }, + { + "epoch": 0.1482, + "grad_norm": 16.184219360351562, + "learning_rate": 1.9858793702499322e-05, + "loss": 0.4042, + "step": 1482 + }, + { + "epoch": 0.1484, + "grad_norm": 0.08209467679262161, + "learning_rate": 1.9857622229237315e-05, + "loss": 0.001, + "step": 1484 + }, + { + "epoch": 0.1486, + "grad_norm": 2.3698644638061523, + "learning_rate": 1.9856445951489984e-05, + "loss": 0.0196, + "step": 1486 + }, + { + "epoch": 0.1488, + "grad_norm": 0.04401962831616402, + "learning_rate": 1.985526486983063e-05, + "loss": 0.0802, + "step": 1488 + }, + { + "epoch": 0.149, + "grad_norm": 0.15793541073799133, + "learning_rate": 1.9854078984834904e-05, + "loss": 0.0297, + "step": 1490 + }, + { + "epoch": 0.1492, + "grad_norm": 0.04324957728385925, + "learning_rate": 1.985288829708079e-05, + "loss": 0.0577, + "step": 1492 + }, + { + "epoch": 0.1494, + "grad_norm": 5.518545150756836, + "learning_rate": 1.9851692807148612e-05, + "loss": 0.0821, + "step": 1494 + }, + { + "epoch": 0.1496, + "grad_norm": 0.10561300069093704, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.0057, + "step": 1496 + }, + { + "epoch": 0.1498, + "grad_norm": 1.8455551862716675, + "learning_rate": 1.984928742308308e-05, + "loss": 0.0197, + "step": 1498 + }, + { + "epoch": 0.15, + "grad_norm": 0.832514762878418, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.0133, + "step": 1500 + }, + { + "epoch": 0.1502, + "grad_norm": 0.09570958465337753, + "learning_rate": 1.9846862837327733e-05, + "loss": 0.0223, + "step": 1502 + }, + { + "epoch": 0.1504, + "grad_norm": 1.4203414916992188, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.0206, + "step": 1504 + }, + { + "epoch": 0.1506, + "grad_norm": 0.047811802476644516, + "learning_rate": 1.9844419054609418e-05, + "loss": 0.0005, + "step": 1506 + }, + { + "epoch": 0.1508, + "grad_norm": 0.008102440275251865, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.0119, + "step": 1508 + }, + { + "epoch": 0.151, + "grad_norm": 0.17529189586639404, + "learning_rate": 1.984195607969242e-05, + "loss": 0.0031, + "step": 1510 + }, + { + "epoch": 0.1512, + "grad_norm": 0.014633730053901672, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.0021, + "step": 1512 + }, + { + "epoch": 0.1514, + "grad_norm": 0.1455528289079666, + "learning_rate": 1.9839473917378432e-05, + "loss": 0.0031, + "step": 1514 + }, + { + "epoch": 0.1516, + "grad_norm": 0.7601430416107178, + "learning_rate": 1.983822564245833e-05, + "loss": 0.0071, + "step": 1516 + }, + { + "epoch": 0.1518, + "grad_norm": 0.668778121471405, + "learning_rate": 1.9836972572506557e-05, + "loss": 0.0112, + "step": 1518 + }, + { + "epoch": 0.152, + "grad_norm": 1.6042250394821167, + "learning_rate": 1.983571470813386e-05, + "loss": 0.0093, + "step": 1520 + }, + { + "epoch": 0.1522, + "grad_norm": 2.2383713722229004, + "learning_rate": 1.98344520499533e-05, + "loss": 0.0154, + "step": 1522 + }, + { + "epoch": 0.1524, + "grad_norm": 4.556458950042725, + "learning_rate": 1.983318459858028e-05, + "loss": 0.031, + "step": 1524 + }, + { + "epoch": 0.1526, + "grad_norm": 0.1331334114074707, + "learning_rate": 1.9831912354632537e-05, + "loss": 0.0094, + "step": 1526 + }, + { + "epoch": 0.1528, + "grad_norm": 4.572842121124268, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.1187, + "step": 1528 + }, + { + "epoch": 0.153, + "grad_norm": 0.9818114042282104, + "learning_rate": 1.9829353491495545e-05, + "loss": 0.0643, + "step": 1530 + }, + { + "epoch": 0.1532, + "grad_norm": 0.033077552914619446, + "learning_rate": 1.982806687355345e-05, + "loss": 0.0006, + "step": 1532 + }, + { + "epoch": 0.1534, + "grad_norm": 0.012128075584769249, + "learning_rate": 1.982677546553095e-05, + "loss": 0.0003, + "step": 1534 + }, + { + "epoch": 0.1536, + "grad_norm": 0.6357725858688354, + "learning_rate": 1.982547926805747e-05, + "loss": 0.0085, + "step": 1536 + }, + { + "epoch": 0.1538, + "grad_norm": 0.18377017974853516, + "learning_rate": 1.9824178281764753e-05, + "loss": 0.0024, + "step": 1538 + }, + { + "epoch": 0.154, + "grad_norm": 7.984691143035889, + "learning_rate": 1.982287250728689e-05, + "loss": 0.1201, + "step": 1540 + }, + { + "epoch": 0.1542, + "grad_norm": 0.5691299438476562, + "learning_rate": 1.9821561945260292e-05, + "loss": 0.2641, + "step": 1542 + }, + { + "epoch": 0.1544, + "grad_norm": 0.03305581957101822, + "learning_rate": 1.982024659632372e-05, + "loss": 0.0006, + "step": 1544 + }, + { + "epoch": 0.1546, + "grad_norm": 9.223968505859375, + "learning_rate": 1.9818926461118254e-05, + "loss": 0.1913, + "step": 1546 + }, + { + "epoch": 0.1548, + "grad_norm": 12.118476867675781, + "learning_rate": 1.981760154028731e-05, + "loss": 0.2744, + "step": 1548 + }, + { + "epoch": 0.155, + "grad_norm": 0.038102488964796066, + "learning_rate": 1.9816271834476642e-05, + "loss": 0.0008, + "step": 1550 + }, + { + "epoch": 0.1552, + "grad_norm": 0.4508296251296997, + "learning_rate": 1.981493734433433e-05, + "loss": 0.0084, + "step": 1552 + }, + { + "epoch": 0.1554, + "grad_norm": 0.5790154337882996, + "learning_rate": 1.981359807051079e-05, + "loss": 0.0087, + "step": 1554 + }, + { + "epoch": 0.1556, + "grad_norm": 2.9088401794433594, + "learning_rate": 1.981225401365877e-05, + "loss": 0.1845, + "step": 1556 + }, + { + "epoch": 0.1558, + "grad_norm": 0.0655171126127243, + "learning_rate": 1.981090517443334e-05, + "loss": 0.0175, + "step": 1558 + }, + { + "epoch": 0.156, + "grad_norm": 0.4880290925502777, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.007, + "step": 1560 + }, + { + "epoch": 0.1562, + "grad_norm": 0.2099134474992752, + "learning_rate": 1.9808193151494233e-05, + "loss": 0.0324, + "step": 1562 + }, + { + "epoch": 0.1564, + "grad_norm": 0.0339023619890213, + "learning_rate": 1.9806829969102356e-05, + "loss": 0.0224, + "step": 1564 + }, + { + "epoch": 0.1566, + "grad_norm": 0.25429868698120117, + "learning_rate": 1.9805462006980688e-05, + "loss": 0.0157, + "step": 1566 + }, + { + "epoch": 0.1568, + "grad_norm": 0.24032142758369446, + "learning_rate": 1.980408926579596e-05, + "loss": 0.0142, + "step": 1568 + }, + { + "epoch": 0.157, + "grad_norm": 0.20247194170951843, + "learning_rate": 1.9802711746217222e-05, + "loss": 0.0038, + "step": 1570 + }, + { + "epoch": 0.1572, + "grad_norm": 0.5999649167060852, + "learning_rate": 1.9801329448915863e-05, + "loss": 0.0255, + "step": 1572 + }, + { + "epoch": 0.1574, + "grad_norm": 0.06455890834331512, + "learning_rate": 1.9799942374565597e-05, + "loss": 0.0168, + "step": 1574 + }, + { + "epoch": 0.1576, + "grad_norm": 0.3796960711479187, + "learning_rate": 1.979855052384247e-05, + "loss": 0.4151, + "step": 1576 + }, + { + "epoch": 0.1578, + "grad_norm": 1.1111547946929932, + "learning_rate": 1.9797153897424854e-05, + "loss": 0.0094, + "step": 1578 + }, + { + "epoch": 0.158, + "grad_norm": 0.029557403177022934, + "learning_rate": 1.979575249599344e-05, + "loss": 0.0063, + "step": 1580 + }, + { + "epoch": 0.1582, + "grad_norm": 0.17860916256904602, + "learning_rate": 1.9794346320231265e-05, + "loss": 0.0034, + "step": 1582 + }, + { + "epoch": 0.1584, + "grad_norm": 0.20679481327533722, + "learning_rate": 1.9792935370823676e-05, + "loss": 0.0048, + "step": 1584 + }, + { + "epoch": 0.1586, + "grad_norm": 1.361494541168213, + "learning_rate": 1.9791519648458352e-05, + "loss": 0.0416, + "step": 1586 + }, + { + "epoch": 0.1588, + "grad_norm": 2.585038423538208, + "learning_rate": 1.97900991538253e-05, + "loss": 0.1082, + "step": 1588 + }, + { + "epoch": 0.159, + "grad_norm": 10.644813537597656, + "learning_rate": 1.9788673887616852e-05, + "loss": 0.2784, + "step": 1590 + }, + { + "epoch": 0.1592, + "grad_norm": 0.6961618661880493, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.0617, + "step": 1592 + }, + { + "epoch": 0.1594, + "grad_norm": 2.480100154876709, + "learning_rate": 1.978580904325472e-05, + "loss": 0.0377, + "step": 1594 + }, + { + "epoch": 0.1596, + "grad_norm": 0.8590832948684692, + "learning_rate": 1.9784369466497333e-05, + "loss": 0.0125, + "step": 1596 + }, + { + "epoch": 0.1598, + "grad_norm": 0.33869338035583496, + "learning_rate": 1.9782925120957123e-05, + "loss": 0.0865, + "step": 1598 + }, + { + "epoch": 0.16, + "grad_norm": 0.10754403471946716, + "learning_rate": 1.9781476007338058e-05, + "loss": 0.1365, + "step": 1600 + }, + { + "epoch": 0.1602, + "grad_norm": 0.9042314887046814, + "learning_rate": 1.9780022126346413e-05, + "loss": 0.0078, + "step": 1602 + }, + { + "epoch": 0.1604, + "grad_norm": 0.3064735233783722, + "learning_rate": 1.977856347869079e-05, + "loss": 0.0891, + "step": 1604 + }, + { + "epoch": 0.1606, + "grad_norm": 0.16003231704235077, + "learning_rate": 1.977710006508212e-05, + "loss": 0.3817, + "step": 1606 + }, + { + "epoch": 0.1608, + "grad_norm": 0.47364571690559387, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.0049, + "step": 1608 + }, + { + "epoch": 0.161, + "grad_norm": 0.27074864506721497, + "learning_rate": 1.9774158942860962e-05, + "loss": 0.0047, + "step": 1610 + }, + { + "epoch": 0.1612, + "grad_norm": 4.0374860763549805, + "learning_rate": 1.9772681235681936e-05, + "loss": 0.0351, + "step": 1612 + }, + { + "epoch": 0.1614, + "grad_norm": 0.6387918591499329, + "learning_rate": 1.97711987654168e-05, + "loss": 0.2472, + "step": 1614 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4807264804840088, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.0138, + "step": 1616 + }, + { + "epoch": 0.1618, + "grad_norm": 0.08415462076663971, + "learning_rate": 1.976821953852065e-05, + "loss": 0.0042, + "step": 1618 + }, + { + "epoch": 0.162, + "grad_norm": 6.727123260498047, + "learning_rate": 1.9766722783341682e-05, + "loss": 0.2205, + "step": 1620 + }, + { + "epoch": 0.1622, + "grad_norm": 3.534585475921631, + "learning_rate": 1.9765221267980675e-05, + "loss": 0.1309, + "step": 1622 + }, + { + "epoch": 0.1624, + "grad_norm": 1.0627446174621582, + "learning_rate": 1.976371499316945e-05, + "loss": 0.0301, + "step": 1624 + }, + { + "epoch": 0.1626, + "grad_norm": 5.742453575134277, + "learning_rate": 1.976220395964215e-05, + "loss": 0.1279, + "step": 1626 + }, + { + "epoch": 0.1628, + "grad_norm": 3.841376304626465, + "learning_rate": 1.9760688168135233e-05, + "loss": 0.0802, + "step": 1628 + }, + { + "epoch": 0.163, + "grad_norm": 0.906880259513855, + "learning_rate": 1.9759167619387474e-05, + "loss": 0.0291, + "step": 1630 + }, + { + "epoch": 0.1632, + "grad_norm": 0.057972412556409836, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.0036, + "step": 1632 + }, + { + "epoch": 0.1634, + "grad_norm": 1.701025128364563, + "learning_rate": 1.9756112253136154e-05, + "loss": 0.0666, + "step": 1634 + }, + { + "epoch": 0.1636, + "grad_norm": 0.33582958579063416, + "learning_rate": 1.9754577437121733e-05, + "loss": 0.0497, + "step": 1636 + }, + { + "epoch": 0.1638, + "grad_norm": 4.420130252838135, + "learning_rate": 1.975303786684477e-05, + "loss": 0.3759, + "step": 1638 + }, + { + "epoch": 0.164, + "grad_norm": 0.47684478759765625, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.0646, + "step": 1640 + }, + { + "epoch": 0.1642, + "grad_norm": 4.017007350921631, + "learning_rate": 1.9749944466507007e-05, + "loss": 0.0584, + "step": 1642 + }, + { + "epoch": 0.1644, + "grad_norm": 1.9324309825897217, + "learning_rate": 1.974839063795389e-05, + "loss": 0.041, + "step": 1644 + }, + { + "epoch": 0.1646, + "grad_norm": 0.3472937047481537, + "learning_rate": 1.9746832058153602e-05, + "loss": 0.0442, + "step": 1646 + }, + { + "epoch": 0.1648, + "grad_norm": 0.08370023220777512, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.0021, + "step": 1648 + }, + { + "epoch": 0.165, + "grad_norm": 0.26875799894332886, + "learning_rate": 1.9743700647852356e-05, + "loss": 0.0376, + "step": 1650 + }, + { + "epoch": 0.1652, + "grad_norm": 0.21494188904762268, + "learning_rate": 1.9742127818877605e-05, + "loss": 0.0062, + "step": 1652 + }, + { + "epoch": 0.1654, + "grad_norm": 3.51088285446167, + "learning_rate": 1.974055024170811e-05, + "loss": 0.1287, + "step": 1654 + }, + { + "epoch": 0.1656, + "grad_norm": 1.3254921436309814, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.0552, + "step": 1656 + }, + { + "epoch": 0.1658, + "grad_norm": 1.3793773651123047, + "learning_rate": 1.9737380845862745e-05, + "loss": 0.0271, + "step": 1658 + }, + { + "epoch": 0.166, + "grad_norm": 0.6239833235740662, + "learning_rate": 1.9735789028731603e-05, + "loss": 0.0105, + "step": 1660 + }, + { + "epoch": 0.1662, + "grad_norm": 0.24508033692836761, + "learning_rate": 1.9734192466495162e-05, + "loss": 0.08, + "step": 1662 + }, + { + "epoch": 0.1664, + "grad_norm": 0.020669130608439445, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.0184, + "step": 1664 + }, + { + "epoch": 0.1666, + "grad_norm": 0.14853385090827942, + "learning_rate": 1.9730985109821268e-05, + "loss": 0.0053, + "step": 1666 + }, + { + "epoch": 0.1668, + "grad_norm": 0.025661231949925423, + "learning_rate": 1.972937431694704e-05, + "loss": 0.2529, + "step": 1668 + }, + { + "epoch": 0.167, + "grad_norm": 5.671712398529053, + "learning_rate": 1.972775878209397e-05, + "loss": 0.1606, + "step": 1670 + }, + { + "epoch": 0.1672, + "grad_norm": 0.04642321914434433, + "learning_rate": 1.9726138506049438e-05, + "loss": 0.0047, + "step": 1672 + }, + { + "epoch": 0.1674, + "grad_norm": 11.684229850769043, + "learning_rate": 1.9724513489603153e-05, + "loss": 0.1871, + "step": 1674 + }, + { + "epoch": 0.1676, + "grad_norm": 0.16226550936698914, + "learning_rate": 1.9722883733547128e-05, + "loss": 0.003, + "step": 1676 + }, + { + "epoch": 0.1678, + "grad_norm": 0.007359792478382587, + "learning_rate": 1.9721249238675688e-05, + "loss": 0.0129, + "step": 1678 + }, + { + "epoch": 0.168, + "grad_norm": 0.04819686710834503, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.0014, + "step": 1680 + }, + { + "epoch": 0.1682, + "grad_norm": 1.5777416229248047, + "learning_rate": 1.97179660356754e-05, + "loss": 0.0323, + "step": 1682 + }, + { + "epoch": 0.1684, + "grad_norm": 0.041134439408779144, + "learning_rate": 1.971631732914674e-05, + "loss": 0.0022, + "step": 1684 + }, + { + "epoch": 0.1686, + "grad_norm": 0.13911724090576172, + "learning_rate": 1.9714663887003055e-05, + "loss": 0.0062, + "step": 1686 + }, + { + "epoch": 0.1688, + "grad_norm": 0.11488854140043259, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.0029, + "step": 1688 + }, + { + "epoch": 0.169, + "grad_norm": 0.031515780836343765, + "learning_rate": 1.971134279909636e-05, + "loss": 0.0008, + "step": 1690 + }, + { + "epoch": 0.1692, + "grad_norm": 0.14839695394039154, + "learning_rate": 1.9709675154952017e-05, + "loss": 0.0036, + "step": 1692 + }, + { + "epoch": 0.1694, + "grad_norm": 0.11014973372220993, + "learning_rate": 1.9708002778429957e-05, + "loss": 0.0133, + "step": 1694 + }, + { + "epoch": 0.1696, + "grad_norm": 0.25504204630851746, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.006, + "step": 1696 + }, + { + "epoch": 0.1698, + "grad_norm": 0.14577852189540863, + "learning_rate": 1.9704643831515377e-05, + "loss": 0.0062, + "step": 1698 + }, + { + "epoch": 0.17, + "grad_norm": 0.047967396676540375, + "learning_rate": 1.9702957262759964e-05, + "loss": 0.0062, + "step": 1700 + }, + { + "epoch": 0.1702, + "grad_norm": 0.014396405778825283, + "learning_rate": 1.970126596490106e-05, + "loss": 0.006, + "step": 1702 + }, + { + "epoch": 0.1704, + "grad_norm": 0.5217381119728088, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.015, + "step": 1704 + }, + { + "epoch": 0.1706, + "grad_norm": 0.10545945912599564, + "learning_rate": 1.969786918517233e-05, + "loss": 0.0051, + "step": 1706 + }, + { + "epoch": 0.1708, + "grad_norm": 0.014590825885534286, + "learning_rate": 1.969616370495806e-05, + "loss": 0.0925, + "step": 1708 + }, + { + "epoch": 0.171, + "grad_norm": 0.02237093448638916, + "learning_rate": 1.9694453498951392e-05, + "loss": 0.0006, + "step": 1710 + }, + { + "epoch": 0.1712, + "grad_norm": 0.012718232348561287, + "learning_rate": 1.9692738567985853e-05, + "loss": 0.0006, + "step": 1712 + }, + { + "epoch": 0.1714, + "grad_norm": 1.0510637760162354, + "learning_rate": 1.9691018912897285e-05, + "loss": 0.0169, + "step": 1714 + }, + { + "epoch": 0.1716, + "grad_norm": 0.02472413145005703, + "learning_rate": 1.968929453452383e-05, + "loss": 0.0012, + "step": 1716 + }, + { + "epoch": 0.1718, + "grad_norm": 0.04015633836388588, + "learning_rate": 1.9687565433705926e-05, + "loss": 0.001, + "step": 1718 + }, + { + "epoch": 0.172, + "grad_norm": 0.07887991517782211, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.0017, + "step": 1720 + }, + { + "epoch": 0.1722, + "grad_norm": 0.9725723266601562, + "learning_rate": 1.968409306811004e-05, + "loss": 0.027, + "step": 1722 + }, + { + "epoch": 0.1724, + "grad_norm": 0.21646623313426971, + "learning_rate": 1.9682349805024447e-05, + "loss": 0.0115, + "step": 1724 + }, + { + "epoch": 0.1726, + "grad_norm": 0.01479239109903574, + "learning_rate": 1.968060182287918e-05, + "loss": 0.0006, + "step": 1726 + }, + { + "epoch": 0.1728, + "grad_norm": 0.24675583839416504, + "learning_rate": 1.967884912252619e-05, + "loss": 0.0026, + "step": 1728 + }, + { + "epoch": 0.173, + "grad_norm": 0.09851530939340591, + "learning_rate": 1.9677091704819714e-05, + "loss": 0.0024, + "step": 1730 + }, + { + "epoch": 0.1732, + "grad_norm": 0.020430725067853928, + "learning_rate": 1.96753295706163e-05, + "loss": 0.0006, + "step": 1732 + }, + { + "epoch": 0.1734, + "grad_norm": 8.681171417236328, + "learning_rate": 1.9673562720774792e-05, + "loss": 0.1425, + "step": 1734 + }, + { + "epoch": 0.1736, + "grad_norm": 5.805352687835693, + "learning_rate": 1.967179115615633e-05, + "loss": 0.103, + "step": 1736 + }, + { + "epoch": 0.1738, + "grad_norm": 0.0037315445952117443, + "learning_rate": 1.9670014877624353e-05, + "loss": 0.0119, + "step": 1738 + }, + { + "epoch": 0.174, + "grad_norm": 0.13130338490009308, + "learning_rate": 1.9668233886044597e-05, + "loss": 0.062, + "step": 1740 + }, + { + "epoch": 0.1742, + "grad_norm": 7.849946022033691, + "learning_rate": 1.9666448182285095e-05, + "loss": 0.0385, + "step": 1742 + }, + { + "epoch": 0.1744, + "grad_norm": 0.037035949528217316, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.0016, + "step": 1744 + }, + { + "epoch": 0.1746, + "grad_norm": 0.020151015371084213, + "learning_rate": 1.966286264171047e-05, + "loss": 0.0008, + "step": 1746 + }, + { + "epoch": 0.1748, + "grad_norm": 0.029449328780174255, + "learning_rate": 1.9661062806642903e-05, + "loss": 0.001, + "step": 1748 + }, + { + "epoch": 0.175, + "grad_norm": 0.011587665416300297, + "learning_rate": 1.9659258262890683e-05, + "loss": 0.0006, + "step": 1750 + }, + { + "epoch": 0.1752, + "grad_norm": 8.328936576843262, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.3191, + "step": 1752 + }, + { + "epoch": 0.1754, + "grad_norm": 4.062107563018799, + "learning_rate": 1.9655635052852648e-05, + "loss": 0.0249, + "step": 1754 + }, + { + "epoch": 0.1756, + "grad_norm": 2.1259400844573975, + "learning_rate": 1.965381638833274e-05, + "loss": 0.0588, + "step": 1756 + }, + { + "epoch": 0.1758, + "grad_norm": 0.03820859268307686, + "learning_rate": 1.9651993018660002e-05, + "loss": 0.0508, + "step": 1758 + }, + { + "epoch": 0.176, + "grad_norm": 4.045016765594482, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.0111, + "step": 1760 + }, + { + "epoch": 0.1762, + "grad_norm": 0.03176054358482361, + "learning_rate": 1.9648332167413067e-05, + "loss": 0.0006, + "step": 1762 + }, + { + "epoch": 0.1764, + "grad_norm": 0.007070845924317837, + "learning_rate": 1.9646494687623135e-05, + "loss": 0.0004, + "step": 1764 + }, + { + "epoch": 0.1766, + "grad_norm": 0.016085347160696983, + "learning_rate": 1.9644652506248872e-05, + "loss": 0.0021, + "step": 1766 + }, + { + "epoch": 0.1768, + "grad_norm": 0.007734335958957672, + "learning_rate": 1.964280562418815e-05, + "loss": 0.0007, + "step": 1768 + }, + { + "epoch": 0.177, + "grad_norm": 3.766744613647461, + "learning_rate": 1.96409540423411e-05, + "loss": 0.0139, + "step": 1770 + }, + { + "epoch": 0.1772, + "grad_norm": 0.021023424342274666, + "learning_rate": 1.9639097761610174e-05, + "loss": 0.0012, + "step": 1772 + }, + { + "epoch": 0.1774, + "grad_norm": 0.03179573640227318, + "learning_rate": 1.96372367829001e-05, + "loss": 0.0123, + "step": 1774 + }, + { + "epoch": 0.1776, + "grad_norm": 0.02547609433531761, + "learning_rate": 1.963537110711789e-05, + "loss": 0.001, + "step": 1776 + }, + { + "epoch": 0.1778, + "grad_norm": 0.02194964326918125, + "learning_rate": 1.963350073517285e-05, + "loss": 0.0122, + "step": 1778 + }, + { + "epoch": 0.178, + "grad_norm": 0.06821738928556442, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.0087, + "step": 1780 + }, + { + "epoch": 0.1782, + "grad_norm": 0.12401845306158066, + "learning_rate": 1.9629745906442973e-05, + "loss": 0.0027, + "step": 1782 + }, + { + "epoch": 0.1784, + "grad_norm": 0.2758760154247284, + "learning_rate": 1.962786145148819e-05, + "loss": 0.0025, + "step": 1784 + }, + { + "epoch": 0.1786, + "grad_norm": 0.006753586232662201, + "learning_rate": 1.9625972304030697e-05, + "loss": 0.0007, + "step": 1786 + }, + { + "epoch": 0.1788, + "grad_norm": 0.015575782395899296, + "learning_rate": 1.962407846499124e-05, + "loss": 0.0005, + "step": 1788 + }, + { + "epoch": 0.179, + "grad_norm": 1.6683244705200195, + "learning_rate": 1.9622179935292855e-05, + "loss": 0.0093, + "step": 1790 + }, + { + "epoch": 0.1792, + "grad_norm": 0.04968750476837158, + "learning_rate": 1.962027671586086e-05, + "loss": 0.0009, + "step": 1792 + }, + { + "epoch": 0.1794, + "grad_norm": 1.9229198694229126, + "learning_rate": 1.9618368807622863e-05, + "loss": 0.0224, + "step": 1794 + }, + { + "epoch": 0.1796, + "grad_norm": 0.039036739617586136, + "learning_rate": 1.9616456211508756e-05, + "loss": 0.1002, + "step": 1796 + }, + { + "epoch": 0.1798, + "grad_norm": 0.14403876662254333, + "learning_rate": 1.961453892845071e-05, + "loss": 0.0037, + "step": 1798 + }, + { + "epoch": 0.18, + "grad_norm": 0.18207748234272003, + "learning_rate": 1.961261695938319e-05, + "loss": 0.0017, + "step": 1800 + }, + { + "epoch": 0.1802, + "grad_norm": 0.05407615751028061, + "learning_rate": 1.961069030524294e-05, + "loss": 0.0127, + "step": 1802 + }, + { + "epoch": 0.1804, + "grad_norm": 12.013261795043945, + "learning_rate": 1.9608758966968987e-05, + "loss": 0.6636, + "step": 1804 + }, + { + "epoch": 0.1806, + "grad_norm": 1.0460909605026245, + "learning_rate": 1.9606822945502642e-05, + "loss": 0.1051, + "step": 1806 + }, + { + "epoch": 0.1808, + "grad_norm": 0.031028548255562782, + "learning_rate": 1.96048822417875e-05, + "loss": 0.0008, + "step": 1808 + }, + { + "epoch": 0.181, + "grad_norm": 0.26885655522346497, + "learning_rate": 1.9602936856769432e-05, + "loss": 0.0043, + "step": 1810 + }, + { + "epoch": 0.1812, + "grad_norm": 1.1734826564788818, + "learning_rate": 1.96009867913966e-05, + "loss": 0.0167, + "step": 1812 + }, + { + "epoch": 0.1814, + "grad_norm": 0.21402058005332947, + "learning_rate": 1.9599032046619437e-05, + "loss": 0.0737, + "step": 1814 + }, + { + "epoch": 0.1816, + "grad_norm": 0.18755151331424713, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.0242, + "step": 1816 + }, + { + "epoch": 0.1818, + "grad_norm": 2.8003621101379395, + "learning_rate": 1.959510852266529e-05, + "loss": 0.0351, + "step": 1818 + }, + { + "epoch": 0.182, + "grad_norm": 0.4113674759864807, + "learning_rate": 1.9593139745400575e-05, + "loss": 0.0068, + "step": 1820 + }, + { + "epoch": 0.1822, + "grad_norm": 2.0572762489318848, + "learning_rate": 1.9591166292556093e-05, + "loss": 0.0206, + "step": 1822 + }, + { + "epoch": 0.1824, + "grad_norm": 0.24020570516586304, + "learning_rate": 1.958918816509367e-05, + "loss": 0.007, + "step": 1824 + }, + { + "epoch": 0.1826, + "grad_norm": 0.10435029864311218, + "learning_rate": 1.9587205363977428e-05, + "loss": 0.0071, + "step": 1826 + }, + { + "epoch": 0.1828, + "grad_norm": 0.02752918377518654, + "learning_rate": 1.958521789017376e-05, + "loss": 0.0366, + "step": 1828 + }, + { + "epoch": 0.183, + "grad_norm": 0.01565142348408699, + "learning_rate": 1.9583225744651334e-05, + "loss": 0.0144, + "step": 1830 + }, + { + "epoch": 0.1832, + "grad_norm": 0.38379427790641785, + "learning_rate": 1.95812289283811e-05, + "loss": 0.0064, + "step": 1832 + }, + { + "epoch": 0.1834, + "grad_norm": 1.4675413370132446, + "learning_rate": 1.9579227442336276e-05, + "loss": 0.0167, + "step": 1834 + }, + { + "epoch": 0.1836, + "grad_norm": 5.345584392547607, + "learning_rate": 1.9577221287492368e-05, + "loss": 0.2436, + "step": 1836 + }, + { + "epoch": 0.1838, + "grad_norm": 0.7807623147964478, + "learning_rate": 1.957521046482715e-05, + "loss": 0.014, + "step": 1838 + }, + { + "epoch": 0.184, + "grad_norm": 3.233402729034424, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.0537, + "step": 1840 + }, + { + "epoch": 0.1842, + "grad_norm": 0.8909895420074463, + "learning_rate": 1.9571174819955264e-05, + "loss": 0.0545, + "step": 1842 + }, + { + "epoch": 0.1844, + "grad_norm": 4.533488750457764, + "learning_rate": 1.9569149999715514e-05, + "loss": 0.0337, + "step": 1844 + }, + { + "epoch": 0.1846, + "grad_norm": 0.01374148577451706, + "learning_rate": 1.9567120515588307e-05, + "loss": 0.001, + "step": 1846 + }, + { + "epoch": 0.1848, + "grad_norm": 0.47488805651664734, + "learning_rate": 1.956508636856278e-05, + "loss": 0.0538, + "step": 1848 + }, + { + "epoch": 0.185, + "grad_norm": 0.012145903892815113, + "learning_rate": 1.9563047559630356e-05, + "loss": 0.0008, + "step": 1850 + }, + { + "epoch": 0.1852, + "grad_norm": 0.550186038017273, + "learning_rate": 1.9561004089784726e-05, + "loss": 0.0092, + "step": 1852 + }, + { + "epoch": 0.1854, + "grad_norm": 0.7320125699043274, + "learning_rate": 1.9558955960021847e-05, + "loss": 0.0213, + "step": 1854 + }, + { + "epoch": 0.1856, + "grad_norm": 2.6293041706085205, + "learning_rate": 1.9556903171339963e-05, + "loss": 0.0474, + "step": 1856 + }, + { + "epoch": 0.1858, + "grad_norm": 0.07266008853912354, + "learning_rate": 1.9554845724739565e-05, + "loss": 0.0038, + "step": 1858 + }, + { + "epoch": 0.186, + "grad_norm": 0.1012662798166275, + "learning_rate": 1.9552783621223437e-05, + "loss": 0.0023, + "step": 1860 + }, + { + "epoch": 0.1862, + "grad_norm": 3.5694475173950195, + "learning_rate": 1.9550716861796623e-05, + "loss": 0.1896, + "step": 1862 + }, + { + "epoch": 0.1864, + "grad_norm": 0.027705637738108635, + "learning_rate": 1.9548645447466433e-05, + "loss": 0.0005, + "step": 1864 + }, + { + "epoch": 0.1866, + "grad_norm": 1.817319393157959, + "learning_rate": 1.9546569379242446e-05, + "loss": 0.0287, + "step": 1866 + }, + { + "epoch": 0.1868, + "grad_norm": 0.25596562027931213, + "learning_rate": 1.9544488658136522e-05, + "loss": 0.0034, + "step": 1868 + }, + { + "epoch": 0.187, + "grad_norm": 0.0446191243827343, + "learning_rate": 1.954240328516277e-05, + "loss": 0.024, + "step": 1870 + }, + { + "epoch": 0.1872, + "grad_norm": 0.18289881944656372, + "learning_rate": 1.954031326133758e-05, + "loss": 0.0026, + "step": 1872 + }, + { + "epoch": 0.1874, + "grad_norm": 0.026523396372795105, + "learning_rate": 1.9538218587679605e-05, + "loss": 0.0004, + "step": 1874 + }, + { + "epoch": 0.1876, + "grad_norm": 0.025924570858478546, + "learning_rate": 1.9536119265209763e-05, + "loss": 0.0007, + "step": 1876 + }, + { + "epoch": 0.1878, + "grad_norm": 0.03167114034295082, + "learning_rate": 1.9534015294951235e-05, + "loss": 0.009, + "step": 1878 + }, + { + "epoch": 0.188, + "grad_norm": 9.902305603027344, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.1571, + "step": 1880 + }, + { + "epoch": 0.1882, + "grad_norm": 2.7613272666931152, + "learning_rate": 1.952979341517219e-05, + "loss": 0.033, + "step": 1882 + }, + { + "epoch": 0.1884, + "grad_norm": 0.011075474321842194, + "learning_rate": 1.9527675507709368e-05, + "loss": 0.0227, + "step": 1884 + }, + { + "epoch": 0.1886, + "grad_norm": 0.022259246557950974, + "learning_rate": 1.9525552956573244e-05, + "loss": 0.0006, + "step": 1886 + }, + { + "epoch": 0.1888, + "grad_norm": 0.008662869222462177, + "learning_rate": 1.9523425762798328e-05, + "loss": 0.0013, + "step": 1888 + }, + { + "epoch": 0.189, + "grad_norm": 0.022098258137702942, + "learning_rate": 1.9521293927421388e-05, + "loss": 0.0018, + "step": 1890 + }, + { + "epoch": 0.1892, + "grad_norm": 0.11530886590480804, + "learning_rate": 1.9519157451481453e-05, + "loss": 0.0022, + "step": 1892 + }, + { + "epoch": 0.1894, + "grad_norm": 0.7266804575920105, + "learning_rate": 1.9517016336019817e-05, + "loss": 0.0562, + "step": 1894 + }, + { + "epoch": 0.1896, + "grad_norm": 0.8632596135139465, + "learning_rate": 1.951487058208003e-05, + "loss": 0.0046, + "step": 1896 + }, + { + "epoch": 0.1898, + "grad_norm": 2.1361777782440186, + "learning_rate": 1.9512720190707915e-05, + "loss": 0.0148, + "step": 1898 + }, + { + "epoch": 0.19, + "grad_norm": 0.011426636017858982, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.1343, + "step": 1900 + }, + { + "epoch": 0.1902, + "grad_norm": 0.8813074231147766, + "learning_rate": 1.9508405499861235e-05, + "loss": 0.0566, + "step": 1902 + }, + { + "epoch": 0.1904, + "grad_norm": 0.019410129636526108, + "learning_rate": 1.95062412024896e-05, + "loss": 0.0019, + "step": 1904 + }, + { + "epoch": 0.1906, + "grad_norm": 0.0646788626909256, + "learning_rate": 1.9504072271891486e-05, + "loss": 0.0024, + "step": 1906 + }, + { + "epoch": 0.1908, + "grad_norm": 0.627941370010376, + "learning_rate": 1.950189870912401e-05, + "loss": 0.0123, + "step": 1908 + }, + { + "epoch": 0.191, + "grad_norm": 0.03145056962966919, + "learning_rate": 1.9499720515246524e-05, + "loss": 0.0007, + "step": 1910 + }, + { + "epoch": 0.1912, + "grad_norm": 0.12984590232372284, + "learning_rate": 1.949753769132067e-05, + "loss": 0.0023, + "step": 1912 + }, + { + "epoch": 0.1914, + "grad_norm": 3.517881155014038, + "learning_rate": 1.949535023841032e-05, + "loss": 0.0258, + "step": 1914 + }, + { + "epoch": 0.1916, + "grad_norm": 0.016166003420948982, + "learning_rate": 1.9493158157581617e-05, + "loss": 0.0006, + "step": 1916 + }, + { + "epoch": 0.1918, + "grad_norm": 0.03517754375934601, + "learning_rate": 1.9490961449902946e-05, + "loss": 0.0165, + "step": 1918 + }, + { + "epoch": 0.192, + "grad_norm": 0.028589127585291862, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.0006, + "step": 1920 + }, + { + "epoch": 0.1922, + "grad_norm": 0.01118551567196846, + "learning_rate": 1.9486554158280576e-05, + "loss": 0.0099, + "step": 1922 + }, + { + "epoch": 0.1924, + "grad_norm": 1.2418416738510132, + "learning_rate": 1.9484343576484935e-05, + "loss": 0.0167, + "step": 1924 + }, + { + "epoch": 0.1926, + "grad_norm": 4.116117000579834, + "learning_rate": 1.9482128372135446e-05, + "loss": 0.0608, + "step": 1926 + }, + { + "epoch": 0.1928, + "grad_norm": 0.8087245225906372, + "learning_rate": 1.9479908546311783e-05, + "loss": 0.0205, + "step": 1928 + }, + { + "epoch": 0.193, + "grad_norm": 0.030291816219687462, + "learning_rate": 1.947768410009586e-05, + "loss": 0.0011, + "step": 1930 + }, + { + "epoch": 0.1932, + "grad_norm": 5.965202808380127, + "learning_rate": 1.947545503457184e-05, + "loss": 0.0789, + "step": 1932 + }, + { + "epoch": 0.1934, + "grad_norm": 0.39404237270355225, + "learning_rate": 1.9473221350826145e-05, + "loss": 0.0034, + "step": 1934 + }, + { + "epoch": 0.1936, + "grad_norm": 0.06247009336948395, + "learning_rate": 1.9470983049947446e-05, + "loss": 0.1525, + "step": 1936 + }, + { + "epoch": 0.1938, + "grad_norm": 0.003908657003194094, + "learning_rate": 1.946874013302666e-05, + "loss": 0.0384, + "step": 1938 + }, + { + "epoch": 0.194, + "grad_norm": 0.46678000688552856, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.0083, + "step": 1940 + }, + { + "epoch": 0.1942, + "grad_norm": 18.993093490600586, + "learning_rate": 1.9464240455433775e-05, + "loss": 0.6735, + "step": 1942 + }, + { + "epoch": 0.1944, + "grad_norm": 12.377114295959473, + "learning_rate": 1.946198369695476e-05, + "loss": 0.2567, + "step": 1944 + }, + { + "epoch": 0.1946, + "grad_norm": 0.08415055274963379, + "learning_rate": 1.945972232681984e-05, + "loss": 0.0015, + "step": 1946 + }, + { + "epoch": 0.1948, + "grad_norm": 0.008462129160761833, + "learning_rate": 1.945745634613117e-05, + "loss": 0.0599, + "step": 1948 + }, + { + "epoch": 0.195, + "grad_norm": 1.0364161729812622, + "learning_rate": 1.945518575599317e-05, + "loss": 0.0074, + "step": 1950 + }, + { + "epoch": 0.1952, + "grad_norm": 2.098079204559326, + "learning_rate": 1.9452910557512497e-05, + "loss": 0.0336, + "step": 1952 + }, + { + "epoch": 0.1954, + "grad_norm": 0.02097848616540432, + "learning_rate": 1.945063075179805e-05, + "loss": 0.001, + "step": 1954 + }, + { + "epoch": 0.1956, + "grad_norm": 0.3212715685367584, + "learning_rate": 1.9448346339960984e-05, + "loss": 0.0054, + "step": 1956 + }, + { + "epoch": 0.1958, + "grad_norm": 0.05411422625184059, + "learning_rate": 1.944605732311469e-05, + "loss": 0.0783, + "step": 1958 + }, + { + "epoch": 0.196, + "grad_norm": 6.25040340423584, + "learning_rate": 1.944376370237481e-05, + "loss": 0.1453, + "step": 1960 + }, + { + "epoch": 0.1962, + "grad_norm": 0.201955646276474, + "learning_rate": 1.944146547885923e-05, + "loss": 0.008, + "step": 1962 + }, + { + "epoch": 0.1964, + "grad_norm": 0.7432675361633301, + "learning_rate": 1.9439162653688066e-05, + "loss": 0.0794, + "step": 1964 + }, + { + "epoch": 0.1966, + "grad_norm": 4.184023857116699, + "learning_rate": 1.9436855227983695e-05, + "loss": 0.02, + "step": 1966 + }, + { + "epoch": 0.1968, + "grad_norm": 0.06273335963487625, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.0014, + "step": 1968 + }, + { + "epoch": 0.197, + "grad_norm": 0.05212380737066269, + "learning_rate": 1.943222657947601e-05, + "loss": 0.0015, + "step": 1970 + }, + { + "epoch": 0.1972, + "grad_norm": 0.8884997963905334, + "learning_rate": 1.9429905358928648e-05, + "loss": 0.0103, + "step": 1972 + }, + { + "epoch": 0.1974, + "grad_norm": 0.03661374747753143, + "learning_rate": 1.9427579542359966e-05, + "loss": 0.003, + "step": 1974 + }, + { + "epoch": 0.1976, + "grad_norm": 0.028899747878313065, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.0015, + "step": 1976 + }, + { + "epoch": 0.1978, + "grad_norm": 3.9954328536987305, + "learning_rate": 1.942291412569519e-05, + "loss": 0.1564, + "step": 1978 + }, + { + "epoch": 0.198, + "grad_norm": 0.025423921644687653, + "learning_rate": 1.942057452787297e-05, + "loss": 0.009, + "step": 1980 + }, + { + "epoch": 0.1982, + "grad_norm": 0.06341349333524704, + "learning_rate": 1.9418230338577164e-05, + "loss": 0.0017, + "step": 1982 + }, + { + "epoch": 0.1984, + "grad_norm": 0.22687208652496338, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.0128, + "step": 1984 + }, + { + "epoch": 0.1986, + "grad_norm": 5.926903247833252, + "learning_rate": 1.9413528190137158e-05, + "loss": 0.0627, + "step": 1986 + }, + { + "epoch": 0.1988, + "grad_norm": 0.2503984570503235, + "learning_rate": 1.9411170233284728e-05, + "loss": 0.005, + "step": 1988 + }, + { + "epoch": 0.199, + "grad_norm": 5.226849555969238, + "learning_rate": 1.9408807689542257e-05, + "loss": 0.0947, + "step": 1990 + }, + { + "epoch": 0.1992, + "grad_norm": 2.294002056121826, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.0237, + "step": 1992 + }, + { + "epoch": 0.1994, + "grad_norm": 0.010231749154627323, + "learning_rate": 1.9404068845995317e-05, + "loss": 0.0017, + "step": 1994 + }, + { + "epoch": 0.1996, + "grad_norm": 0.005598574411123991, + "learning_rate": 1.9401692548500504e-05, + "loss": 0.0041, + "step": 1996 + }, + { + "epoch": 0.1998, + "grad_norm": 0.5769960284233093, + "learning_rate": 1.9399311668734957e-05, + "loss": 0.0166, + "step": 1998 + }, + { + "epoch": 0.2, + "grad_norm": 0.13281476497650146, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.0024, + "step": 2000 + }, + { + "epoch": 0.2002, + "grad_norm": 0.1366058737039566, + "learning_rate": 1.9394536167035535e-05, + "loss": 0.0516, + "step": 2002 + }, + { + "epoch": 0.2004, + "grad_norm": 0.18046924471855164, + "learning_rate": 1.9392141547429183e-05, + "loss": 0.002, + "step": 2004 + }, + { + "epoch": 0.2006, + "grad_norm": 2.909897565841675, + "learning_rate": 1.938974235020714e-05, + "loss": 0.0385, + "step": 2006 + }, + { + "epoch": 0.2008, + "grad_norm": 0.009151826612651348, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.0005, + "step": 2008 + }, + { + "epoch": 0.201, + "grad_norm": 0.206527441740036, + "learning_rate": 1.938493022759556e-05, + "loss": 0.0037, + "step": 2010 + }, + { + "epoch": 0.2012, + "grad_norm": 0.1068803071975708, + "learning_rate": 1.9382517304551397e-05, + "loss": 0.0018, + "step": 2012 + }, + { + "epoch": 0.2014, + "grad_norm": 0.023786919191479683, + "learning_rate": 1.9380099808582278e-05, + "loss": 0.0003, + "step": 2014 + }, + { + "epoch": 0.2016, + "grad_norm": 0.02308562956750393, + "learning_rate": 1.937767774086646e-05, + "loss": 0.0761, + "step": 2016 + }, + { + "epoch": 0.2018, + "grad_norm": 0.15342102944850922, + "learning_rate": 1.9375251102584438e-05, + "loss": 0.0012, + "step": 2018 + }, + { + "epoch": 0.202, + "grad_norm": 0.03255116567015648, + "learning_rate": 1.937281989491892e-05, + "loss": 0.0012, + "step": 2020 + }, + { + "epoch": 0.2022, + "grad_norm": 0.0029225016478449106, + "learning_rate": 1.937038411905484e-05, + "loss": 0.0002, + "step": 2022 + }, + { + "epoch": 0.2024, + "grad_norm": 0.0046910485252738, + "learning_rate": 1.936794377617938e-05, + "loss": 0.003, + "step": 2024 + }, + { + "epoch": 0.2026, + "grad_norm": 0.4820229113101959, + "learning_rate": 1.9365498867481926e-05, + "loss": 0.0295, + "step": 2026 + }, + { + "epoch": 0.2028, + "grad_norm": 7.77714729309082, + "learning_rate": 1.9363049394154095e-05, + "loss": 0.0987, + "step": 2028 + }, + { + "epoch": 0.203, + "grad_norm": 10.243021965026855, + "learning_rate": 1.9360595357389735e-05, + "loss": 0.4387, + "step": 2030 + }, + { + "epoch": 0.2032, + "grad_norm": 1.1921589374542236, + "learning_rate": 1.935813675838491e-05, + "loss": 0.0075, + "step": 2032 + }, + { + "epoch": 0.2034, + "grad_norm": 0.0038111598696559668, + "learning_rate": 1.9355673598337916e-05, + "loss": 0.0065, + "step": 2034 + }, + { + "epoch": 0.2036, + "grad_norm": 12.805474281311035, + "learning_rate": 1.935320587844926e-05, + "loss": 0.5568, + "step": 2036 + }, + { + "epoch": 0.2038, + "grad_norm": 0.29013487696647644, + "learning_rate": 1.9350733599921684e-05, + "loss": 0.0048, + "step": 2038 + }, + { + "epoch": 0.204, + "grad_norm": 0.2536258101463318, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.0384, + "step": 2040 + }, + { + "epoch": 0.2042, + "grad_norm": 0.27669623494148254, + "learning_rate": 1.9345775371771826e-05, + "loss": 0.0105, + "step": 2042 + }, + { + "epoch": 0.2044, + "grad_norm": 0.018283938989043236, + "learning_rate": 1.9343289424566122e-05, + "loss": 0.0043, + "step": 2044 + }, + { + "epoch": 0.2046, + "grad_norm": 0.2906957268714905, + "learning_rate": 1.9340798923554657e-05, + "loss": 0.0094, + "step": 2046 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3426998257637024, + "learning_rate": 1.933830386995127e-05, + "loss": 0.0104, + "step": 2048 + }, + { + "epoch": 0.205, + "grad_norm": 0.24148772656917572, + "learning_rate": 1.9335804264972018e-05, + "loss": 0.0045, + "step": 2050 + }, + { + "epoch": 0.2052, + "grad_norm": 0.382171630859375, + "learning_rate": 1.9333300109835182e-05, + "loss": 0.0169, + "step": 2052 + }, + { + "epoch": 0.2054, + "grad_norm": 0.022230731323361397, + "learning_rate": 1.9330791405761254e-05, + "loss": 0.0404, + "step": 2054 + }, + { + "epoch": 0.2056, + "grad_norm": 1.058637261390686, + "learning_rate": 1.9328278153972947e-05, + "loss": 0.0248, + "step": 2056 + }, + { + "epoch": 0.2058, + "grad_norm": 0.6960458159446716, + "learning_rate": 1.932576035569519e-05, + "loss": 0.0152, + "step": 2058 + }, + { + "epoch": 0.206, + "grad_norm": 0.2162676304578781, + "learning_rate": 1.9323238012155125e-05, + "loss": 0.0051, + "step": 2060 + }, + { + "epoch": 0.2062, + "grad_norm": 0.32197609543800354, + "learning_rate": 1.932071112458211e-05, + "loss": 0.0054, + "step": 2062 + }, + { + "epoch": 0.2064, + "grad_norm": 6.887453079223633, + "learning_rate": 1.9318179694207726e-05, + "loss": 0.2863, + "step": 2064 + }, + { + "epoch": 0.2066, + "grad_norm": 0.06752581149339676, + "learning_rate": 1.931564372226576e-05, + "loss": 0.001, + "step": 2066 + }, + { + "epoch": 0.2068, + "grad_norm": 0.29920390248298645, + "learning_rate": 1.9313103209992205e-05, + "loss": 0.0365, + "step": 2068 + }, + { + "epoch": 0.207, + "grad_norm": 0.511427104473114, + "learning_rate": 1.9310558158625286e-05, + "loss": 0.0116, + "step": 2070 + }, + { + "epoch": 0.2072, + "grad_norm": 0.015301091596484184, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.1556, + "step": 2072 + }, + { + "epoch": 0.2074, + "grad_norm": 0.3246140778064728, + "learning_rate": 1.930545444357526e-05, + "loss": 0.0435, + "step": 2074 + }, + { + "epoch": 0.2076, + "grad_norm": 0.1288760006427765, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.0068, + "step": 2076 + }, + { + "epoch": 0.2078, + "grad_norm": 1.1523135900497437, + "learning_rate": 1.9300332587065644e-05, + "loss": 0.0175, + "step": 2078 + }, + { + "epoch": 0.208, + "grad_norm": 11.971757888793945, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.1509, + "step": 2080 + }, + { + "epoch": 0.2082, + "grad_norm": 0.07135136425495148, + "learning_rate": 1.9295192599081747e-05, + "loss": 0.0088, + "step": 2082 + }, + { + "epoch": 0.2084, + "grad_norm": 0.028098061680793762, + "learning_rate": 1.9292615808917027e-05, + "loss": 0.0059, + "step": 2084 + }, + { + "epoch": 0.2086, + "grad_norm": 1.015657901763916, + "learning_rate": 1.9290034489644247e-05, + "loss": 0.0147, + "step": 2086 + }, + { + "epoch": 0.2088, + "grad_norm": 0.01227156538516283, + "learning_rate": 1.9287448642521513e-05, + "loss": 0.0113, + "step": 2088 + }, + { + "epoch": 0.209, + "grad_norm": 0.2829113304615021, + "learning_rate": 1.9284858268809135e-05, + "loss": 0.0053, + "step": 2090 + }, + { + "epoch": 0.2092, + "grad_norm": 0.1125040352344513, + "learning_rate": 1.9282263369769633e-05, + "loss": 0.0031, + "step": 2092 + }, + { + "epoch": 0.2094, + "grad_norm": 0.3446142375469208, + "learning_rate": 1.927966394666773e-05, + "loss": 0.0057, + "step": 2094 + }, + { + "epoch": 0.2096, + "grad_norm": 0.005892605055123568, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.0046, + "step": 2096 + }, + { + "epoch": 0.2098, + "grad_norm": 0.031233420595526695, + "learning_rate": 1.9274451533346617e-05, + "loss": 0.0014, + "step": 2098 + }, + { + "epoch": 0.21, + "grad_norm": 0.787681519985199, + "learning_rate": 1.9271838545667876e-05, + "loss": 0.0124, + "step": 2100 + }, + { + "epoch": 0.2102, + "grad_norm": 0.765907883644104, + "learning_rate": 1.9269221039007666e-05, + "loss": 0.0231, + "step": 2102 + }, + { + "epoch": 0.2104, + "grad_norm": 0.010463012382388115, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.0026, + "step": 2104 + }, + { + "epoch": 0.2106, + "grad_norm": 2.658442497253418, + "learning_rate": 1.9263972473847995e-05, + "loss": 0.0134, + "step": 2106 + }, + { + "epoch": 0.2108, + "grad_norm": 0.5674359798431396, + "learning_rate": 1.9261341417906622e-05, + "loss": 0.0161, + "step": 2108 + }, + { + "epoch": 0.211, + "grad_norm": 1.2547845840454102, + "learning_rate": 1.925870584809995e-05, + "loss": 0.0121, + "step": 2110 + }, + { + "epoch": 0.2112, + "grad_norm": 12.533675193786621, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.1511, + "step": 2112 + }, + { + "epoch": 0.2114, + "grad_norm": 0.02157876268029213, + "learning_rate": 1.9253421172031086e-05, + "loss": 0.0067, + "step": 2114 + }, + { + "epoch": 0.2116, + "grad_norm": 0.29280591011047363, + "learning_rate": 1.925077206834458e-05, + "loss": 0.0043, + "step": 2116 + }, + { + "epoch": 0.2118, + "grad_norm": 6.694315433502197, + "learning_rate": 1.9248118455944153e-05, + "loss": 0.2873, + "step": 2118 + }, + { + "epoch": 0.212, + "grad_norm": 0.02138350158929825, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.0007, + "step": 2120 + }, + { + "epoch": 0.2122, + "grad_norm": 2.2434916496276855, + "learning_rate": 1.924279771017706e-05, + "loss": 0.0165, + "step": 2122 + }, + { + "epoch": 0.2124, + "grad_norm": 0.036864154040813446, + "learning_rate": 1.924013057940367e-05, + "loss": 0.3278, + "step": 2124 + }, + { + "epoch": 0.2126, + "grad_norm": 1.0681812763214111, + "learning_rate": 1.923745894510288e-05, + "loss": 0.0193, + "step": 2126 + }, + { + "epoch": 0.2128, + "grad_norm": 0.008122602477669716, + "learning_rate": 1.9234782808576823e-05, + "loss": 0.0806, + "step": 2128 + }, + { + "epoch": 0.213, + "grad_norm": 0.05375581979751587, + "learning_rate": 1.923210217112981e-05, + "loss": 0.0008, + "step": 2130 + }, + { + "epoch": 0.2132, + "grad_norm": 0.06588897109031677, + "learning_rate": 1.9229417034068352e-05, + "loss": 0.002, + "step": 2132 + }, + { + "epoch": 0.2134, + "grad_norm": 0.18324661254882812, + "learning_rate": 1.922672739870115e-05, + "loss": 0.1468, + "step": 2134 + }, + { + "epoch": 0.2136, + "grad_norm": 0.013792909681797028, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.0022, + "step": 2136 + }, + { + "epoch": 0.2138, + "grad_norm": 0.15121284127235413, + "learning_rate": 1.9221334638295296e-05, + "loss": 0.1203, + "step": 2138 + }, + { + "epoch": 0.214, + "grad_norm": 0.8271176815032959, + "learning_rate": 1.9218631515885007e-05, + "loss": 0.0381, + "step": 2140 + }, + { + "epoch": 0.2142, + "grad_norm": 0.358563095331192, + "learning_rate": 1.921592390042571e-05, + "loss": 0.0085, + "step": 2142 + }, + { + "epoch": 0.2144, + "grad_norm": 0.07152023166418076, + "learning_rate": 1.9213211793237056e-05, + "loss": 0.0071, + "step": 2144 + }, + { + "epoch": 0.2146, + "grad_norm": 1.5384330749511719, + "learning_rate": 1.9210495195640895e-05, + "loss": 0.012, + "step": 2146 + }, + { + "epoch": 0.2148, + "grad_norm": 0.03345411643385887, + "learning_rate": 1.9207774108961273e-05, + "loss": 0.4287, + "step": 2148 + }, + { + "epoch": 0.215, + "grad_norm": 0.5375115275382996, + "learning_rate": 1.9205048534524405e-05, + "loss": 0.0102, + "step": 2150 + }, + { + "epoch": 0.2152, + "grad_norm": 0.6341204047203064, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.1535, + "step": 2152 + }, + { + "epoch": 0.2154, + "grad_norm": 0.1360151171684265, + "learning_rate": 1.9199583927694775e-05, + "loss": 0.0038, + "step": 2154 + }, + { + "epoch": 0.2156, + "grad_norm": 0.8312820196151733, + "learning_rate": 1.9196844897965393e-05, + "loss": 0.0697, + "step": 2156 + }, + { + "epoch": 0.2158, + "grad_norm": 0.016392238438129425, + "learning_rate": 1.919410138580553e-05, + "loss": 0.1419, + "step": 2158 + }, + { + "epoch": 0.216, + "grad_norm": 1.025468349456787, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.076, + "step": 2160 + }, + { + "epoch": 0.2162, + "grad_norm": 0.9956053495407104, + "learning_rate": 1.9188600919545176e-05, + "loss": 0.1893, + "step": 2162 + }, + { + "epoch": 0.2164, + "grad_norm": 2.1126253604888916, + "learning_rate": 1.9185843968125543e-05, + "loss": 0.0599, + "step": 2164 + }, + { + "epoch": 0.2166, + "grad_norm": 7.153873443603516, + "learning_rate": 1.918308253963715e-05, + "loss": 0.218, + "step": 2166 + }, + { + "epoch": 0.2168, + "grad_norm": 0.021131642162799835, + "learning_rate": 1.9180316635425883e-05, + "loss": 0.0024, + "step": 2168 + }, + { + "epoch": 0.217, + "grad_norm": 7.829666614532471, + "learning_rate": 1.9177546256839814e-05, + "loss": 0.3245, + "step": 2170 + }, + { + "epoch": 0.2172, + "grad_norm": 1.9065892696380615, + "learning_rate": 1.9174771405229187e-05, + "loss": 0.2238, + "step": 2172 + }, + { + "epoch": 0.2174, + "grad_norm": 2.316793441772461, + "learning_rate": 1.9171992081946436e-05, + "loss": 0.0681, + "step": 2174 + }, + { + "epoch": 0.2176, + "grad_norm": 0.32732558250427246, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.0235, + "step": 2176 + }, + { + "epoch": 0.2178, + "grad_norm": 10.594958305358887, + "learning_rate": 1.9166420025785165e-05, + "loss": 0.0586, + "step": 2178 + }, + { + "epoch": 0.218, + "grad_norm": 2.2788915634155273, + "learning_rate": 1.9163627295622397e-05, + "loss": 0.1292, + "step": 2180 + }, + { + "epoch": 0.2182, + "grad_norm": 0.20932915806770325, + "learning_rate": 1.9160830099219007e-05, + "loss": 0.0061, + "step": 2182 + }, + { + "epoch": 0.2184, + "grad_norm": 4.895481586456299, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.1553, + "step": 2184 + }, + { + "epoch": 0.2186, + "grad_norm": 1.042946219444275, + "learning_rate": 1.9155222313145817e-05, + "loss": 0.0196, + "step": 2186 + }, + { + "epoch": 0.2188, + "grad_norm": 0.3173183798789978, + "learning_rate": 1.9152411726209176e-05, + "loss": 0.0108, + "step": 2188 + }, + { + "epoch": 0.219, + "grad_norm": 0.3091374337673187, + "learning_rate": 1.914959667849825e-05, + "loss": 0.0102, + "step": 2190 + }, + { + "epoch": 0.2192, + "grad_norm": 0.1928303986787796, + "learning_rate": 1.914677717138505e-05, + "loss": 0.1777, + "step": 2192 + }, + { + "epoch": 0.2194, + "grad_norm": 0.12030305713415146, + "learning_rate": 1.9143953206243778e-05, + "loss": 0.0244, + "step": 2194 + }, + { + "epoch": 0.2196, + "grad_norm": 0.09869807213544846, + "learning_rate": 1.914112478445079e-05, + "loss": 0.023, + "step": 2196 + }, + { + "epoch": 0.2198, + "grad_norm": 0.17673590779304504, + "learning_rate": 1.9138291907384632e-05, + "loss": 0.0053, + "step": 2198 + }, + { + "epoch": 0.22, + "grad_norm": 0.12634213268756866, + "learning_rate": 1.913545457642601e-05, + "loss": 0.0208, + "step": 2200 + }, + { + "epoch": 0.2202, + "grad_norm": 0.42820316553115845, + "learning_rate": 1.9132612792957808e-05, + "loss": 0.0221, + "step": 2202 + }, + { + "epoch": 0.2204, + "grad_norm": 0.15101373195648193, + "learning_rate": 1.9129766558365076e-05, + "loss": 0.0044, + "step": 2204 + }, + { + "epoch": 0.2206, + "grad_norm": 0.7563822269439697, + "learning_rate": 1.912691587403503e-05, + "loss": 0.0715, + "step": 2206 + }, + { + "epoch": 0.2208, + "grad_norm": 1.417348861694336, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.0436, + "step": 2208 + }, + { + "epoch": 0.221, + "grad_norm": 0.678869903087616, + "learning_rate": 1.9121201161722732e-05, + "loss": 0.0648, + "step": 2210 + }, + { + "epoch": 0.2212, + "grad_norm": 0.700285792350769, + "learning_rate": 1.911833713652576e-05, + "loss": 0.0112, + "step": 2212 + }, + { + "epoch": 0.2214, + "grad_norm": 1.3849844932556152, + "learning_rate": 1.9115468667162038e-05, + "loss": 0.0259, + "step": 2214 + }, + { + "epoch": 0.2216, + "grad_norm": 0.9156738519668579, + "learning_rate": 1.9112595755029625e-05, + "loss": 0.0133, + "step": 2216 + }, + { + "epoch": 0.2218, + "grad_norm": 0.41304925084114075, + "learning_rate": 1.9109718401528742e-05, + "loss": 0.0098, + "step": 2218 + }, + { + "epoch": 0.222, + "grad_norm": 0.050658710300922394, + "learning_rate": 1.910683660806177e-05, + "loss": 0.0029, + "step": 2220 + }, + { + "epoch": 0.2222, + "grad_norm": 0.005183200351893902, + "learning_rate": 1.9103950376033276e-05, + "loss": 0.0009, + "step": 2222 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4933754801750183, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.0071, + "step": 2224 + }, + { + "epoch": 0.2226, + "grad_norm": 0.006258076056838036, + "learning_rate": 1.9098164601920702e-05, + "loss": 0.148, + "step": 2226 + }, + { + "epoch": 0.2228, + "grad_norm": 3.0806140899658203, + "learning_rate": 1.9095265062656546e-05, + "loss": 0.0191, + "step": 2228 + }, + { + "epoch": 0.223, + "grad_norm": 0.6800084114074707, + "learning_rate": 1.9092361090470688e-05, + "loss": 0.0591, + "step": 2230 + }, + { + "epoch": 0.2232, + "grad_norm": 0.031130176037549973, + "learning_rate": 1.908945268677849e-05, + "loss": 0.0007, + "step": 2232 + }, + { + "epoch": 0.2234, + "grad_norm": 0.02257359027862549, + "learning_rate": 1.908653985299747e-05, + "loss": 0.0531, + "step": 2234 + }, + { + "epoch": 0.2236, + "grad_norm": 0.07785782963037491, + "learning_rate": 1.9083622590547313e-05, + "loss": 0.0188, + "step": 2236 + }, + { + "epoch": 0.2238, + "grad_norm": 0.44433024525642395, + "learning_rate": 1.9080700900849855e-05, + "loss": 0.0114, + "step": 2238 + }, + { + "epoch": 0.224, + "grad_norm": 0.01809157244861126, + "learning_rate": 1.907777478532909e-05, + "loss": 0.0009, + "step": 2240 + }, + { + "epoch": 0.2242, + "grad_norm": 0.016320038586854935, + "learning_rate": 1.907484424541117e-05, + "loss": 0.0017, + "step": 2242 + }, + { + "epoch": 0.2244, + "grad_norm": 0.03684142604470253, + "learning_rate": 1.907190928252441e-05, + "loss": 0.0266, + "step": 2244 + }, + { + "epoch": 0.2246, + "grad_norm": 1.5787662267684937, + "learning_rate": 1.906896989809927e-05, + "loss": 0.0108, + "step": 2246 + }, + { + "epoch": 0.2248, + "grad_norm": 0.09943072497844696, + "learning_rate": 1.906602609356838e-05, + "loss": 0.0014, + "step": 2248 + }, + { + "epoch": 0.225, + "grad_norm": 0.006743444595485926, + "learning_rate": 1.9063077870366504e-05, + "loss": 0.0018, + "step": 2250 + }, + { + "epoch": 0.2252, + "grad_norm": 0.9977972507476807, + "learning_rate": 1.9060125229930572e-05, + "loss": 0.0108, + "step": 2252 + }, + { + "epoch": 0.2254, + "grad_norm": 0.047536883503198624, + "learning_rate": 1.9057168173699664e-05, + "loss": 0.0107, + "step": 2254 + }, + { + "epoch": 0.2256, + "grad_norm": 0.009802975691854954, + "learning_rate": 1.905420670311502e-05, + "loss": 0.0704, + "step": 2256 + }, + { + "epoch": 0.2258, + "grad_norm": 0.007023252546787262, + "learning_rate": 1.9051240819620018e-05, + "loss": 0.0014, + "step": 2258 + }, + { + "epoch": 0.226, + "grad_norm": 8.470361709594727, + "learning_rate": 1.9048270524660197e-05, + "loss": 0.0881, + "step": 2260 + }, + { + "epoch": 0.2262, + "grad_norm": 8.82681941986084, + "learning_rate": 1.904529581968324e-05, + "loss": 0.1526, + "step": 2262 + }, + { + "epoch": 0.2264, + "grad_norm": 0.6663078665733337, + "learning_rate": 1.9042316706138987e-05, + "loss": 0.007, + "step": 2264 + }, + { + "epoch": 0.2266, + "grad_norm": 0.13293421268463135, + "learning_rate": 1.903933318547942e-05, + "loss": 0.0025, + "step": 2266 + }, + { + "epoch": 0.2268, + "grad_norm": 2.720273733139038, + "learning_rate": 1.9036345259158667e-05, + "loss": 0.0581, + "step": 2268 + }, + { + "epoch": 0.227, + "grad_norm": 0.15201669931411743, + "learning_rate": 1.903335292863301e-05, + "loss": 0.0112, + "step": 2270 + }, + { + "epoch": 0.2272, + "grad_norm": 0.02272774837911129, + "learning_rate": 1.9030356195360875e-05, + "loss": 0.0266, + "step": 2272 + }, + { + "epoch": 0.2274, + "grad_norm": 0.009700953029096127, + "learning_rate": 1.902735506080283e-05, + "loss": 0.0441, + "step": 2274 + }, + { + "epoch": 0.2276, + "grad_norm": 0.014973854646086693, + "learning_rate": 1.9024349526421596e-05, + "loss": 0.0005, + "step": 2276 + }, + { + "epoch": 0.2278, + "grad_norm": 0.008522269316017628, + "learning_rate": 1.902133959368203e-05, + "loss": 0.0078, + "step": 2278 + }, + { + "epoch": 0.228, + "grad_norm": 0.10257984697818756, + "learning_rate": 1.901832526405114e-05, + "loss": 0.0039, + "step": 2280 + }, + { + "epoch": 0.2282, + "grad_norm": 2.330382823944092, + "learning_rate": 1.901530653899807e-05, + "loss": 0.0399, + "step": 2282 + }, + { + "epoch": 0.2284, + "grad_norm": 26.918113708496094, + "learning_rate": 1.9012283419994115e-05, + "loss": 0.2796, + "step": 2284 + }, + { + "epoch": 0.2286, + "grad_norm": 1.1314181089401245, + "learning_rate": 1.9009255908512704e-05, + "loss": 0.0486, + "step": 2286 + }, + { + "epoch": 0.2288, + "grad_norm": 0.041868533939123154, + "learning_rate": 1.9006224006029404e-05, + "loss": 0.0017, + "step": 2288 + }, + { + "epoch": 0.229, + "grad_norm": 0.7161178588867188, + "learning_rate": 1.9003187714021936e-05, + "loss": 0.0097, + "step": 2290 + }, + { + "epoch": 0.2292, + "grad_norm": 0.007341247983276844, + "learning_rate": 1.9000147033970148e-05, + "loss": 0.0714, + "step": 2292 + }, + { + "epoch": 0.2294, + "grad_norm": 0.10849364101886749, + "learning_rate": 1.899710196735603e-05, + "loss": 0.0011, + "step": 2294 + }, + { + "epoch": 0.2296, + "grad_norm": 0.5477367639541626, + "learning_rate": 1.899405251566371e-05, + "loss": 0.0049, + "step": 2296 + }, + { + "epoch": 0.2298, + "grad_norm": 1.9254839420318604, + "learning_rate": 1.8990998680379458e-05, + "loss": 0.0522, + "step": 2298 + }, + { + "epoch": 0.23, + "grad_norm": 0.00316081615164876, + "learning_rate": 1.8987940462991673e-05, + "loss": 0.0074, + "step": 2300 + }, + { + "epoch": 0.2302, + "grad_norm": 0.015676138922572136, + "learning_rate": 1.8984877864990888e-05, + "loss": 0.0007, + "step": 2302 + }, + { + "epoch": 0.2304, + "grad_norm": 2.9232866764068604, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.1584, + "step": 2304 + }, + { + "epoch": 0.2306, + "grad_norm": 0.41468268632888794, + "learning_rate": 1.897873953312317e-05, + "loss": 0.0048, + "step": 2306 + }, + { + "epoch": 0.2308, + "grad_norm": 1.6694873571395874, + "learning_rate": 1.8975663802247978e-05, + "loss": 0.0165, + "step": 2308 + }, + { + "epoch": 0.231, + "grad_norm": 2.215498685836792, + "learning_rate": 1.8972583696743284e-05, + "loss": 0.0199, + "step": 2310 + }, + { + "epoch": 0.2312, + "grad_norm": 0.003365688491612673, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.0311, + "step": 2312 + }, + { + "epoch": 0.2314, + "grad_norm": 0.046891309320926666, + "learning_rate": 1.896641036785236e-05, + "loss": 0.0012, + "step": 2314 + }, + { + "epoch": 0.2316, + "grad_norm": 0.006527389399707317, + "learning_rate": 1.896331714747493e-05, + "loss": 0.0105, + "step": 2316 + }, + { + "epoch": 0.2318, + "grad_norm": 2.046299695968628, + "learning_rate": 1.896021955848561e-05, + "loss": 0.0085, + "step": 2318 + }, + { + "epoch": 0.232, + "grad_norm": 0.01309885736554861, + "learning_rate": 1.895711760239413e-05, + "loss": 0.0214, + "step": 2320 + }, + { + "epoch": 0.2322, + "grad_norm": 0.02459213137626648, + "learning_rate": 1.895401128071234e-05, + "loss": 0.002, + "step": 2322 + }, + { + "epoch": 0.2324, + "grad_norm": 0.03705708682537079, + "learning_rate": 1.8950900594954226e-05, + "loss": 0.001, + "step": 2324 + }, + { + "epoch": 0.2326, + "grad_norm": 0.16528575122356415, + "learning_rate": 1.8947785546635905e-05, + "loss": 0.0016, + "step": 2326 + }, + { + "epoch": 0.2328, + "grad_norm": 0.014888141304254532, + "learning_rate": 1.89446661372756e-05, + "loss": 0.2201, + "step": 2328 + }, + { + "epoch": 0.233, + "grad_norm": 1.7522190809249878, + "learning_rate": 1.8941542368393683e-05, + "loss": 0.231, + "step": 2330 + }, + { + "epoch": 0.2332, + "grad_norm": 0.2812393605709076, + "learning_rate": 1.893841424151264e-05, + "loss": 0.0377, + "step": 2332 + }, + { + "epoch": 0.2334, + "grad_norm": 0.20066316425800323, + "learning_rate": 1.893528175815708e-05, + "loss": 0.003, + "step": 2334 + }, + { + "epoch": 0.2336, + "grad_norm": 0.0927211344242096, + "learning_rate": 1.893214491985374e-05, + "loss": 0.0018, + "step": 2336 + }, + { + "epoch": 0.2338, + "grad_norm": 0.015758885070681572, + "learning_rate": 1.892900372813147e-05, + "loss": 0.0006, + "step": 2338 + }, + { + "epoch": 0.234, + "grad_norm": 0.04744737222790718, + "learning_rate": 1.892585818452126e-05, + "loss": 0.0018, + "step": 2340 + }, + { + "epoch": 0.2342, + "grad_norm": 4.923985004425049, + "learning_rate": 1.8922708290556197e-05, + "loss": 0.0502, + "step": 2342 + }, + { + "epoch": 0.2344, + "grad_norm": 1.2474650144577026, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.0357, + "step": 2344 + }, + { + "epoch": 0.2346, + "grad_norm": 0.004084486979991198, + "learning_rate": 1.8916395457704536e-05, + "loss": 0.0003, + "step": 2346 + }, + { + "epoch": 0.2348, + "grad_norm": 0.10831256955862045, + "learning_rate": 1.8913232521894734e-05, + "loss": 0.0023, + "step": 2348 + }, + { + "epoch": 0.235, + "grad_norm": 0.18655095994472504, + "learning_rate": 1.891006524188368e-05, + "loss": 0.0018, + "step": 2350 + }, + { + "epoch": 0.2352, + "grad_norm": 0.9880556464195251, + "learning_rate": 1.890689361921507e-05, + "loss": 0.0143, + "step": 2352 + }, + { + "epoch": 0.2354, + "grad_norm": 0.023437025025486946, + "learning_rate": 1.8903717655434708e-05, + "loss": 0.0398, + "step": 2354 + }, + { + "epoch": 0.2356, + "grad_norm": 0.31925874948501587, + "learning_rate": 1.8900537352090523e-05, + "loss": 0.0043, + "step": 2356 + }, + { + "epoch": 0.2358, + "grad_norm": 0.03191694989800453, + "learning_rate": 1.8897352710732564e-05, + "loss": 0.0637, + "step": 2358 + }, + { + "epoch": 0.236, + "grad_norm": 0.14477726817131042, + "learning_rate": 1.889416373291298e-05, + "loss": 0.0013, + "step": 2360 + }, + { + "epoch": 0.2362, + "grad_norm": 0.07636052370071411, + "learning_rate": 1.8890970420186035e-05, + "loss": 0.0648, + "step": 2362 + }, + { + "epoch": 0.2364, + "grad_norm": 0.3140102028846741, + "learning_rate": 1.8887772774108116e-05, + "loss": 0.0049, + "step": 2364 + }, + { + "epoch": 0.2366, + "grad_norm": 1.0827863216400146, + "learning_rate": 1.888457079623772e-05, + "loss": 0.0094, + "step": 2366 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3655107319355011, + "learning_rate": 1.8881364488135448e-05, + "loss": 0.0023, + "step": 2368 + }, + { + "epoch": 0.237, + "grad_norm": 0.29985347390174866, + "learning_rate": 1.8878153851364013e-05, + "loss": 0.0078, + "step": 2370 + }, + { + "epoch": 0.2372, + "grad_norm": 0.172471821308136, + "learning_rate": 1.887493888748825e-05, + "loss": 0.0237, + "step": 2372 + }, + { + "epoch": 0.2374, + "grad_norm": 0.004568289499729872, + "learning_rate": 1.8871719598075083e-05, + "loss": 0.0016, + "step": 2374 + }, + { + "epoch": 0.2376, + "grad_norm": 4.679413795471191, + "learning_rate": 1.886849598469356e-05, + "loss": 0.0527, + "step": 2376 + }, + { + "epoch": 0.2378, + "grad_norm": 0.003720531240105629, + "learning_rate": 1.8865268048914828e-05, + "loss": 0.0169, + "step": 2378 + }, + { + "epoch": 0.238, + "grad_norm": 2.204672336578369, + "learning_rate": 1.8862035792312148e-05, + "loss": 0.0192, + "step": 2380 + }, + { + "epoch": 0.2382, + "grad_norm": 0.013307425193488598, + "learning_rate": 1.8858799216460883e-05, + "loss": 0.0002, + "step": 2382 + }, + { + "epoch": 0.2384, + "grad_norm": 0.8099572062492371, + "learning_rate": 1.8855558322938492e-05, + "loss": 0.0068, + "step": 2384 + }, + { + "epoch": 0.2386, + "grad_norm": 0.02402317337691784, + "learning_rate": 1.8852313113324553e-05, + "loss": 0.0088, + "step": 2386 + }, + { + "epoch": 0.2388, + "grad_norm": 0.00947823841124773, + "learning_rate": 1.8849063589200744e-05, + "loss": 0.0011, + "step": 2388 + }, + { + "epoch": 0.239, + "grad_norm": 8.517850875854492, + "learning_rate": 1.884580975215084e-05, + "loss": 0.1211, + "step": 2390 + }, + { + "epoch": 0.2392, + "grad_norm": 0.06760305911302567, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.0005, + "step": 2392 + }, + { + "epoch": 0.2394, + "grad_norm": 0.003327383892610669, + "learning_rate": 1.8839289145618378e-05, + "loss": 0.0267, + "step": 2394 + }, + { + "epoch": 0.2396, + "grad_norm": 0.009006769396364689, + "learning_rate": 1.8836022379313884e-05, + "loss": 0.0545, + "step": 2396 + }, + { + "epoch": 0.2398, + "grad_norm": 0.007364633958786726, + "learning_rate": 1.883275130643942e-05, + "loss": 0.004, + "step": 2398 + }, + { + "epoch": 0.24, + "grad_norm": 0.3914918303489685, + "learning_rate": 1.8829475928589272e-05, + "loss": 0.0041, + "step": 2400 + }, + { + "epoch": 0.2402, + "grad_norm": 2.822472333908081, + "learning_rate": 1.882619624735982e-05, + "loss": 0.1016, + "step": 2402 + }, + { + "epoch": 0.2404, + "grad_norm": 8.486278533935547, + "learning_rate": 1.8822912264349535e-05, + "loss": 0.1279, + "step": 2404 + }, + { + "epoch": 0.2406, + "grad_norm": 0.3313959240913391, + "learning_rate": 1.8819623981158996e-05, + "loss": 0.0031, + "step": 2406 + }, + { + "epoch": 0.2408, + "grad_norm": 0.008212238550186157, + "learning_rate": 1.881633139939087e-05, + "loss": 0.0145, + "step": 2408 + }, + { + "epoch": 0.241, + "grad_norm": 0.013080619275569916, + "learning_rate": 1.8813034520649923e-05, + "loss": 0.0016, + "step": 2410 + }, + { + "epoch": 0.2412, + "grad_norm": 4.607761383056641, + "learning_rate": 1.8809733346543013e-05, + "loss": 0.1372, + "step": 2412 + }, + { + "epoch": 0.2414, + "grad_norm": 2.0770394802093506, + "learning_rate": 1.880642787867909e-05, + "loss": 0.0156, + "step": 2414 + }, + { + "epoch": 0.2416, + "grad_norm": 0.14130425453186035, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.0016, + "step": 2416 + }, + { + "epoch": 0.2418, + "grad_norm": 0.006875086110085249, + "learning_rate": 1.8799804068126487e-05, + "loss": 0.0006, + "step": 2418 + }, + { + "epoch": 0.242, + "grad_norm": 0.0676179975271225, + "learning_rate": 1.879648572866617e-05, + "loss": 0.0081, + "step": 2420 + }, + { + "epoch": 0.2422, + "grad_norm": 0.23715166747570038, + "learning_rate": 1.8793163101905562e-05, + "loss": 0.0047, + "step": 2422 + }, + { + "epoch": 0.2424, + "grad_norm": 0.07062618434429169, + "learning_rate": 1.878983618946409e-05, + "loss": 0.0008, + "step": 2424 + }, + { + "epoch": 0.2426, + "grad_norm": 0.07525460422039032, + "learning_rate": 1.878650499296323e-05, + "loss": 0.0016, + "step": 2426 + }, + { + "epoch": 0.2428, + "grad_norm": 0.4602449834346771, + "learning_rate": 1.878316951402658e-05, + "loss": 0.0043, + "step": 2428 + }, + { + "epoch": 0.243, + "grad_norm": 0.07667896151542664, + "learning_rate": 1.8779829754279806e-05, + "loss": 0.002, + "step": 2430 + }, + { + "epoch": 0.2432, + "grad_norm": 0.0832936242222786, + "learning_rate": 1.8776485715350672e-05, + "loss": 0.0006, + "step": 2432 + }, + { + "epoch": 0.2434, + "grad_norm": 0.011641599237918854, + "learning_rate": 1.8773137398869017e-05, + "loss": 0.0094, + "step": 2434 + }, + { + "epoch": 0.2436, + "grad_norm": 0.20685087144374847, + "learning_rate": 1.8769784806466768e-05, + "loss": 0.0028, + "step": 2436 + }, + { + "epoch": 0.2438, + "grad_norm": 0.1979910284280777, + "learning_rate": 1.8766427939777943e-05, + "loss": 0.5024, + "step": 2438 + }, + { + "epoch": 0.244, + "grad_norm": 0.7747049927711487, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.0116, + "step": 2440 + }, + { + "epoch": 0.2442, + "grad_norm": 0.0012697032652795315, + "learning_rate": 1.8759701390087026e-05, + "loss": 0.0001, + "step": 2442 + }, + { + "epoch": 0.2444, + "grad_norm": 0.009616876021027565, + "learning_rate": 1.8756331710363375e-05, + "loss": 0.008, + "step": 2444 + }, + { + "epoch": 0.2446, + "grad_norm": 0.02790871076285839, + "learning_rate": 1.8752957762910016e-05, + "loss": 0.0007, + "step": 2446 + }, + { + "epoch": 0.2448, + "grad_norm": 3.606736660003662, + "learning_rate": 1.874957954937138e-05, + "loss": 0.0928, + "step": 2448 + }, + { + "epoch": 0.245, + "grad_norm": 0.09963903576135635, + "learning_rate": 1.874619707139396e-05, + "loss": 0.0241, + "step": 2450 + }, + { + "epoch": 0.2452, + "grad_norm": 0.666336178779602, + "learning_rate": 1.8742810330626338e-05, + "loss": 0.0077, + "step": 2452 + }, + { + "epoch": 0.2454, + "grad_norm": 0.4768604040145874, + "learning_rate": 1.873941932871917e-05, + "loss": 0.0041, + "step": 2454 + }, + { + "epoch": 0.2456, + "grad_norm": 0.146087184548378, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.003, + "step": 2456 + }, + { + "epoch": 0.2458, + "grad_norm": 0.1800411194562912, + "learning_rate": 1.8732624548099204e-05, + "loss": 0.0674, + "step": 2458 + }, + { + "epoch": 0.246, + "grad_norm": 0.04004288464784622, + "learning_rate": 1.8729220772698096e-05, + "loss": 0.0458, + "step": 2460 + }, + { + "epoch": 0.2462, + "grad_norm": 0.028937095776200294, + "learning_rate": 1.8725812742780832e-05, + "loss": 0.0011, + "step": 2462 + }, + { + "epoch": 0.2464, + "grad_norm": 0.35365161299705505, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.0288, + "step": 2464 + }, + { + "epoch": 0.2466, + "grad_norm": 0.09459855407476425, + "learning_rate": 1.871898392604402e-05, + "loss": 0.0013, + "step": 2466 + }, + { + "epoch": 0.2468, + "grad_norm": 0.011824218556284904, + "learning_rate": 1.8715563142552758e-05, + "loss": 0.0038, + "step": 2468 + }, + { + "epoch": 0.247, + "grad_norm": 0.9769842028617859, + "learning_rate": 1.8712138111201898e-05, + "loss": 0.0648, + "step": 2470 + }, + { + "epoch": 0.2472, + "grad_norm": 0.008695041760802269, + "learning_rate": 1.8708708833660755e-05, + "loss": 0.0054, + "step": 2472 + }, + { + "epoch": 0.2474, + "grad_norm": 0.11631220579147339, + "learning_rate": 1.8705275311600724e-05, + "loss": 0.0024, + "step": 2474 + }, + { + "epoch": 0.2476, + "grad_norm": 9.470869064331055, + "learning_rate": 1.870183754669526e-05, + "loss": 0.4383, + "step": 2476 + }, + { + "epoch": 0.2478, + "grad_norm": 0.014450752176344395, + "learning_rate": 1.8698395540619883e-05, + "loss": 0.0004, + "step": 2478 + }, + { + "epoch": 0.248, + "grad_norm": 0.013003915548324585, + "learning_rate": 1.869494929505219e-05, + "loss": 0.0004, + "step": 2480 + }, + { + "epoch": 0.2482, + "grad_norm": 12.799243927001953, + "learning_rate": 1.869149881167184e-05, + "loss": 0.2364, + "step": 2482 + }, + { + "epoch": 0.2484, + "grad_norm": 0.028044288977980614, + "learning_rate": 1.8688044092160554e-05, + "loss": 0.0009, + "step": 2484 + }, + { + "epoch": 0.2486, + "grad_norm": 1.208638072013855, + "learning_rate": 1.8684585138202122e-05, + "loss": 0.0602, + "step": 2486 + }, + { + "epoch": 0.2488, + "grad_norm": 0.13599373400211334, + "learning_rate": 1.8681121951482397e-05, + "loss": 0.003, + "step": 2488 + }, + { + "epoch": 0.249, + "grad_norm": 0.5881529450416565, + "learning_rate": 1.8677654533689287e-05, + "loss": 0.0102, + "step": 2490 + }, + { + "epoch": 0.2492, + "grad_norm": 6.612403392791748, + "learning_rate": 1.8674182886512776e-05, + "loss": 0.1643, + "step": 2492 + }, + { + "epoch": 0.2494, + "grad_norm": 0.12871763110160828, + "learning_rate": 1.86707070116449e-05, + "loss": 0.0103, + "step": 2494 + }, + { + "epoch": 0.2496, + "grad_norm": 0.05006370693445206, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.0056, + "step": 2496 + }, + { + "epoch": 0.2498, + "grad_norm": 0.16465511918067932, + "learning_rate": 1.866374258561352e-05, + "loss": 0.0071, + "step": 2498 + }, + { + "epoch": 0.25, + "grad_norm": 0.18987645208835602, + "learning_rate": 1.866025403784439e-05, + "loss": 0.0059, + "step": 2500 + }, + { + "epoch": 0.2502, + "grad_norm": 0.017385145649313927, + "learning_rate": 1.8656761269172645e-05, + "loss": 0.0323, + "step": 2502 + }, + { + "epoch": 0.2504, + "grad_norm": 0.24159257113933563, + "learning_rate": 1.8653264281300622e-05, + "loss": 0.006, + "step": 2504 + }, + { + "epoch": 0.2506, + "grad_norm": 0.01998945325613022, + "learning_rate": 1.864976307593271e-05, + "loss": 0.0355, + "step": 2506 + }, + { + "epoch": 0.2508, + "grad_norm": 0.4201987087726593, + "learning_rate": 1.864625765477535e-05, + "loss": 0.1025, + "step": 2508 + }, + { + "epoch": 0.251, + "grad_norm": 0.01113971322774887, + "learning_rate": 1.864274801953705e-05, + "loss": 0.0007, + "step": 2510 + }, + { + "epoch": 0.2512, + "grad_norm": 1.2167398929595947, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.0158, + "step": 2512 + }, + { + "epoch": 0.2514, + "grad_norm": 0.03170488029718399, + "learning_rate": 1.8635716113661876e-05, + "loss": 0.0108, + "step": 2514 + }, + { + "epoch": 0.2516, + "grad_norm": 0.5664913654327393, + "learning_rate": 1.863219384645227e-05, + "loss": 0.0121, + "step": 2516 + }, + { + "epoch": 0.2518, + "grad_norm": 3.211520195007324, + "learning_rate": 1.862866737201625e-05, + "loss": 0.0533, + "step": 2518 + }, + { + "epoch": 0.252, + "grad_norm": 0.4617428481578827, + "learning_rate": 1.8625136692072577e-05, + "loss": 0.0132, + "step": 2520 + }, + { + "epoch": 0.2522, + "grad_norm": 3.6172094345092773, + "learning_rate": 1.862160180834206e-05, + "loss": 0.0781, + "step": 2522 + }, + { + "epoch": 0.2524, + "grad_norm": 5.696885108947754, + "learning_rate": 1.861806272254755e-05, + "loss": 0.0783, + "step": 2524 + }, + { + "epoch": 0.2526, + "grad_norm": 0.09143863618373871, + "learning_rate": 1.8614519436413968e-05, + "loss": 0.0017, + "step": 2526 + }, + { + "epoch": 0.2528, + "grad_norm": 0.0744725689291954, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.2624, + "step": 2528 + }, + { + "epoch": 0.253, + "grad_norm": 0.013282938860356808, + "learning_rate": 1.860742027003944e-05, + "loss": 0.0169, + "step": 2530 + }, + { + "epoch": 0.2532, + "grad_norm": 0.020966295152902603, + "learning_rate": 1.8603864393258534e-05, + "loss": 0.0008, + "step": 2532 + }, + { + "epoch": 0.2534, + "grad_norm": 6.862277984619141, + "learning_rate": 1.860030432305865e-05, + "loss": 0.0378, + "step": 2534 + }, + { + "epoch": 0.2536, + "grad_norm": 0.07314666360616684, + "learning_rate": 1.8596740061174912e-05, + "loss": 0.0142, + "step": 2536 + }, + { + "epoch": 0.2538, + "grad_norm": 0.25515419244766235, + "learning_rate": 1.8593171609344505e-05, + "loss": 0.0023, + "step": 2538 + }, + { + "epoch": 0.254, + "grad_norm": 0.2179339975118637, + "learning_rate": 1.8589598969306646e-05, + "loss": 0.0038, + "step": 2540 + }, + { + "epoch": 0.2542, + "grad_norm": 9.988442420959473, + "learning_rate": 1.8586022142802597e-05, + "loss": 0.4326, + "step": 2542 + }, + { + "epoch": 0.2544, + "grad_norm": 0.6861976981163025, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.0086, + "step": 2544 + }, + { + "epoch": 0.2546, + "grad_norm": 0.5423780083656311, + "learning_rate": 1.8578855937371176e-05, + "loss": 0.0059, + "step": 2546 + }, + { + "epoch": 0.2548, + "grad_norm": 0.05230497568845749, + "learning_rate": 1.8575266561936526e-05, + "loss": 0.0053, + "step": 2548 + }, + { + "epoch": 0.255, + "grad_norm": 0.14102262258529663, + "learning_rate": 1.8571673007021124e-05, + "loss": 0.006, + "step": 2550 + }, + { + "epoch": 0.2552, + "grad_norm": 0.028064826503396034, + "learning_rate": 1.856807527437643e-05, + "loss": 0.0397, + "step": 2552 + }, + { + "epoch": 0.2554, + "grad_norm": 1.3811519145965576, + "learning_rate": 1.8564473365755936e-05, + "loss": 0.036, + "step": 2554 + }, + { + "epoch": 0.2556, + "grad_norm": 0.11765344440937042, + "learning_rate": 1.8560867282915164e-05, + "loss": 0.0024, + "step": 2556 + }, + { + "epoch": 0.2558, + "grad_norm": 2.8922412395477295, + "learning_rate": 1.8557257027611677e-05, + "loss": 0.1099, + "step": 2558 + }, + { + "epoch": 0.256, + "grad_norm": 0.14319667220115662, + "learning_rate": 1.855364260160507e-05, + "loss": 0.003, + "step": 2560 + }, + { + "epoch": 0.2562, + "grad_norm": 0.553295373916626, + "learning_rate": 1.8550024006656967e-05, + "loss": 0.0129, + "step": 2562 + }, + { + "epoch": 0.2564, + "grad_norm": 3.658304214477539, + "learning_rate": 1.854640124453103e-05, + "loss": 0.0555, + "step": 2564 + }, + { + "epoch": 0.2566, + "grad_norm": 0.2935638725757599, + "learning_rate": 1.8542774316992953e-05, + "loss": 0.0074, + "step": 2566 + }, + { + "epoch": 0.2568, + "grad_norm": 6.764472961425781, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.0971, + "step": 2568 + }, + { + "epoch": 0.257, + "grad_norm": 0.008479473181068897, + "learning_rate": 1.8535507972753275e-05, + "loss": 0.002, + "step": 2570 + }, + { + "epoch": 0.2572, + "grad_norm": 0.2971707582473755, + "learning_rate": 1.8531868559593205e-05, + "loss": 0.0088, + "step": 2572 + }, + { + "epoch": 0.2574, + "grad_norm": 0.020390568301081657, + "learning_rate": 1.8528224988104044e-05, + "loss": 0.0013, + "step": 2574 + }, + { + "epoch": 0.2576, + "grad_norm": 0.3303556442260742, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.0046, + "step": 2576 + }, + { + "epoch": 0.2578, + "grad_norm": 2.2684614658355713, + "learning_rate": 1.8520925377243812e-05, + "loss": 0.0622, + "step": 2578 + }, + { + "epoch": 0.258, + "grad_norm": 0.014609359204769135, + "learning_rate": 1.851726934143048e-05, + "loss": 0.0243, + "step": 2580 + }, + { + "epoch": 0.2582, + "grad_norm": 1.4709503650665283, + "learning_rate": 1.8513609154403535e-05, + "loss": 0.0325, + "step": 2582 + }, + { + "epoch": 0.2584, + "grad_norm": 1.4245136976242065, + "learning_rate": 1.850994481794692e-05, + "loss": 0.0236, + "step": 2584 + }, + { + "epoch": 0.2586, + "grad_norm": 0.03192967176437378, + "learning_rate": 1.850627633384658e-05, + "loss": 0.0023, + "step": 2586 + }, + { + "epoch": 0.2588, + "grad_norm": 0.2730173170566559, + "learning_rate": 1.8502603703890488e-05, + "loss": 0.0049, + "step": 2588 + }, + { + "epoch": 0.259, + "grad_norm": 0.012575945816934109, + "learning_rate": 1.849892692986864e-05, + "loss": 0.0075, + "step": 2590 + }, + { + "epoch": 0.2592, + "grad_norm": 0.023760244250297546, + "learning_rate": 1.8495246013573057e-05, + "loss": 0.001, + "step": 2592 + }, + { + "epoch": 0.2594, + "grad_norm": 0.05033790320158005, + "learning_rate": 1.8491560956797766e-05, + "loss": 0.0017, + "step": 2594 + }, + { + "epoch": 0.2596, + "grad_norm": 0.00511547364294529, + "learning_rate": 1.848787176133882e-05, + "loss": 0.068, + "step": 2596 + }, + { + "epoch": 0.2598, + "grad_norm": 0.14851805567741394, + "learning_rate": 1.848417842899429e-05, + "loss": 0.2025, + "step": 2598 + }, + { + "epoch": 0.26, + "grad_norm": 0.009046013467013836, + "learning_rate": 1.848048096156426e-05, + "loss": 0.001, + "step": 2600 + }, + { + "epoch": 0.2602, + "grad_norm": 0.29887494444847107, + "learning_rate": 1.8476779360850833e-05, + "loss": 0.008, + "step": 2602 + }, + { + "epoch": 0.2604, + "grad_norm": 0.27408722043037415, + "learning_rate": 1.8473073628658123e-05, + "loss": 0.127, + "step": 2604 + }, + { + "epoch": 0.2606, + "grad_norm": 0.04907645285129547, + "learning_rate": 1.8469363766792258e-05, + "loss": 0.0009, + "step": 2606 + }, + { + "epoch": 0.2608, + "grad_norm": 0.012145808897912502, + "learning_rate": 1.8465649777061377e-05, + "loss": 0.006, + "step": 2608 + }, + { + "epoch": 0.261, + "grad_norm": 0.010839198715984821, + "learning_rate": 1.8461931661275642e-05, + "loss": 0.0027, + "step": 2610 + }, + { + "epoch": 0.2612, + "grad_norm": 0.4327690899372101, + "learning_rate": 1.8458209421247208e-05, + "loss": 0.0056, + "step": 2612 + }, + { + "epoch": 0.2614, + "grad_norm": 0.010758395306766033, + "learning_rate": 1.8454483058790254e-05, + "loss": 0.0177, + "step": 2614 + }, + { + "epoch": 0.2616, + "grad_norm": 0.067116379737854, + "learning_rate": 1.8450752575720967e-05, + "loss": 0.0016, + "step": 2616 + }, + { + "epoch": 0.2618, + "grad_norm": 0.017441291362047195, + "learning_rate": 1.844701797385753e-05, + "loss": 0.0007, + "step": 2618 + }, + { + "epoch": 0.262, + "grad_norm": 0.023930616676807404, + "learning_rate": 1.8443279255020153e-05, + "loss": 0.0007, + "step": 2620 + }, + { + "epoch": 0.2622, + "grad_norm": 0.34430786967277527, + "learning_rate": 1.8439536421031035e-05, + "loss": 0.0067, + "step": 2622 + }, + { + "epoch": 0.2624, + "grad_norm": 0.009939122013747692, + "learning_rate": 1.843578947371439e-05, + "loss": 0.0014, + "step": 2624 + }, + { + "epoch": 0.2626, + "grad_norm": 1.0190532207489014, + "learning_rate": 1.8432038414896432e-05, + "loss": 0.0139, + "step": 2626 + }, + { + "epoch": 0.2628, + "grad_norm": 0.025965211912989616, + "learning_rate": 1.842828324640539e-05, + "loss": 0.0011, + "step": 2628 + }, + { + "epoch": 0.263, + "grad_norm": 0.17891834676265717, + "learning_rate": 1.842452397007148e-05, + "loss": 0.0494, + "step": 2630 + }, + { + "epoch": 0.2632, + "grad_norm": 0.14101457595825195, + "learning_rate": 1.8420760587726925e-05, + "loss": 0.0027, + "step": 2632 + }, + { + "epoch": 0.2634, + "grad_norm": 0.03354544937610626, + "learning_rate": 1.8416993101205957e-05, + "loss": 0.0009, + "step": 2634 + }, + { + "epoch": 0.2636, + "grad_norm": 0.21595126390457153, + "learning_rate": 1.8413221512344805e-05, + "loss": 0.0033, + "step": 2636 + }, + { + "epoch": 0.2638, + "grad_norm": 0.01768454536795616, + "learning_rate": 1.8409445822981694e-05, + "loss": 0.0132, + "step": 2638 + }, + { + "epoch": 0.264, + "grad_norm": 1.6864780187606812, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.0626, + "step": 2640 + }, + { + "epoch": 0.2642, + "grad_norm": 8.990567207336426, + "learning_rate": 1.8401882150112485e-05, + "loss": 0.1125, + "step": 2642 + }, + { + "epoch": 0.2644, + "grad_norm": 0.08969954401254654, + "learning_rate": 1.839809417029283e-05, + "loss": 0.0048, + "step": 2644 + }, + { + "epoch": 0.2646, + "grad_norm": 0.13560540974140167, + "learning_rate": 1.8394302097344103e-05, + "loss": 0.0015, + "step": 2646 + }, + { + "epoch": 0.2648, + "grad_norm": 0.021610038354992867, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.0008, + "step": 2648 + }, + { + "epoch": 0.265, + "grad_norm": 0.7256364226341248, + "learning_rate": 1.8386705679454243e-05, + "loss": 0.0181, + "step": 2650 + }, + { + "epoch": 0.2652, + "grad_norm": 6.4516801834106445, + "learning_rate": 1.8382901338215515e-05, + "loss": 0.284, + "step": 2652 + }, + { + "epoch": 0.2654, + "grad_norm": 0.4056808352470398, + "learning_rate": 1.8379092911252515e-05, + "loss": 0.0047, + "step": 2654 + }, + { + "epoch": 0.2656, + "grad_norm": 0.038895171135663986, + "learning_rate": 1.837528040042142e-05, + "loss": 0.0238, + "step": 2656 + }, + { + "epoch": 0.2658, + "grad_norm": 5.717173099517822, + "learning_rate": 1.83714638075804e-05, + "loss": 0.0619, + "step": 2658 + }, + { + "epoch": 0.266, + "grad_norm": 0.14607678353786469, + "learning_rate": 1.836764313458962e-05, + "loss": 0.0051, + "step": 2660 + }, + { + "epoch": 0.2662, + "grad_norm": 0.24222518503665924, + "learning_rate": 1.8363818383311226e-05, + "loss": 0.0027, + "step": 2662 + }, + { + "epoch": 0.2664, + "grad_norm": 0.034252434968948364, + "learning_rate": 1.8359989555609355e-05, + "loss": 0.0262, + "step": 2664 + }, + { + "epoch": 0.2666, + "grad_norm": 4.830218315124512, + "learning_rate": 1.8356156653350138e-05, + "loss": 0.1177, + "step": 2666 + }, + { + "epoch": 0.2668, + "grad_norm": 1.7377582788467407, + "learning_rate": 1.8352319678401677e-05, + "loss": 0.0879, + "step": 2668 + }, + { + "epoch": 0.267, + "grad_norm": 0.6362704634666443, + "learning_rate": 1.8348478632634067e-05, + "loss": 0.0299, + "step": 2670 + }, + { + "epoch": 0.2672, + "grad_norm": 1.0574699640274048, + "learning_rate": 1.834463351791939e-05, + "loss": 0.0064, + "step": 2672 + }, + { + "epoch": 0.2674, + "grad_norm": 0.028000079095363617, + "learning_rate": 1.8340784336131715e-05, + "loss": 0.0008, + "step": 2674 + }, + { + "epoch": 0.2676, + "grad_norm": 0.10095448791980743, + "learning_rate": 1.8336931089147076e-05, + "loss": 0.0021, + "step": 2676 + }, + { + "epoch": 0.2678, + "grad_norm": 0.14685167372226715, + "learning_rate": 1.83330737788435e-05, + "loss": 0.009, + "step": 2678 + }, + { + "epoch": 0.268, + "grad_norm": 0.006706394720822573, + "learning_rate": 1.8329212407100996e-05, + "loss": 0.0003, + "step": 2680 + }, + { + "epoch": 0.2682, + "grad_norm": 20.198530197143555, + "learning_rate": 1.832534697580155e-05, + "loss": 0.166, + "step": 2682 + }, + { + "epoch": 0.2684, + "grad_norm": 0.01745854876935482, + "learning_rate": 1.8321477486829128e-05, + "loss": 0.0012, + "step": 2684 + }, + { + "epoch": 0.2686, + "grad_norm": 3.6448111534118652, + "learning_rate": 1.8317603942069665e-05, + "loss": 0.0863, + "step": 2686 + }, + { + "epoch": 0.2688, + "grad_norm": 0.009269112721085548, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.0007, + "step": 2688 + }, + { + "epoch": 0.269, + "grad_norm": 0.007505328394472599, + "learning_rate": 1.8309844692743283e-05, + "loss": 0.0004, + "step": 2690 + }, + { + "epoch": 0.2692, + "grad_norm": 0.4565262794494629, + "learning_rate": 1.830595899195813e-05, + "loss": 0.0061, + "step": 2692 + }, + { + "epoch": 0.2694, + "grad_norm": 0.07098139077425003, + "learning_rate": 1.830206924294946e-05, + "loss": 0.1557, + "step": 2694 + }, + { + "epoch": 0.2696, + "grad_norm": 0.0034711388871073723, + "learning_rate": 1.82981754476131e-05, + "loss": 0.0291, + "step": 2696 + }, + { + "epoch": 0.2698, + "grad_norm": 0.06259769201278687, + "learning_rate": 1.8294277607846834e-05, + "loss": 0.0012, + "step": 2698 + }, + { + "epoch": 0.27, + "grad_norm": 0.006141963880509138, + "learning_rate": 1.8290375725550417e-05, + "loss": 0.0006, + "step": 2700 + }, + { + "epoch": 0.2702, + "grad_norm": 7.65177059173584, + "learning_rate": 1.828646980262559e-05, + "loss": 0.0852, + "step": 2702 + }, + { + "epoch": 0.2704, + "grad_norm": 0.2841600179672241, + "learning_rate": 1.8282559840976043e-05, + "loss": 0.3265, + "step": 2704 + }, + { + "epoch": 0.2706, + "grad_norm": 1.11445152759552, + "learning_rate": 1.8278645842507448e-05, + "loss": 0.0088, + "step": 2706 + }, + { + "epoch": 0.2708, + "grad_norm": 0.06791538745164871, + "learning_rate": 1.827472780912744e-05, + "loss": 0.083, + "step": 2708 + }, + { + "epoch": 0.271, + "grad_norm": 0.05314825847744942, + "learning_rate": 1.827080574274562e-05, + "loss": 0.003, + "step": 2710 + }, + { + "epoch": 0.2712, + "grad_norm": 0.06297188997268677, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.0086, + "step": 2712 + }, + { + "epoch": 0.2714, + "grad_norm": 2.0160982608795166, + "learning_rate": 1.826294951862478e-05, + "loss": 0.0644, + "step": 2714 + }, + { + "epoch": 0.2716, + "grad_norm": 2.8800697326660156, + "learning_rate": 1.8259015364714786e-05, + "loss": 0.0293, + "step": 2716 + }, + { + "epoch": 0.2718, + "grad_norm": 0.17973102629184723, + "learning_rate": 1.825507718546104e-05, + "loss": 0.0022, + "step": 2718 + }, + { + "epoch": 0.272, + "grad_norm": 8.987637519836426, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.1108, + "step": 2720 + }, + { + "epoch": 0.2722, + "grad_norm": 0.042749110609292984, + "learning_rate": 1.8247188758601912e-05, + "loss": 0.0007, + "step": 2722 + }, + { + "epoch": 0.2724, + "grad_norm": 8.104391098022461, + "learning_rate": 1.824323851484126e-05, + "loss": 0.1879, + "step": 2724 + }, + { + "epoch": 0.2726, + "grad_norm": 0.1954660713672638, + "learning_rate": 1.8239284253426294e-05, + "loss": 0.0034, + "step": 2726 + }, + { + "epoch": 0.2728, + "grad_norm": 0.002123346086591482, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.0002, + "step": 2728 + }, + { + "epoch": 0.273, + "grad_norm": 0.06568791717290878, + "learning_rate": 1.8231363685344422e-05, + "loss": 0.0121, + "step": 2730 + }, + { + "epoch": 0.2732, + "grad_norm": 0.39412832260131836, + "learning_rate": 1.82273973825379e-05, + "loss": 0.2527, + "step": 2732 + }, + { + "epoch": 0.2734, + "grad_norm": 0.0381833016872406, + "learning_rate": 1.8223427069797845e-05, + "loss": 0.0119, + "step": 2734 + }, + { + "epoch": 0.2736, + "grad_norm": 0.3031303882598877, + "learning_rate": 1.8219452749059332e-05, + "loss": 0.1801, + "step": 2736 + }, + { + "epoch": 0.2738, + "grad_norm": 0.03847409784793854, + "learning_rate": 1.8215474422259403e-05, + "loss": 0.0033, + "step": 2738 + }, + { + "epoch": 0.274, + "grad_norm": 1.0125293731689453, + "learning_rate": 1.821149209133704e-05, + "loss": 0.0113, + "step": 2740 + }, + { + "epoch": 0.2742, + "grad_norm": 0.014345673844218254, + "learning_rate": 1.820750575823319e-05, + "loss": 0.011, + "step": 2742 + }, + { + "epoch": 0.2744, + "grad_norm": 2.353058338165283, + "learning_rate": 1.8203515424890738e-05, + "loss": 0.0255, + "step": 2744 + }, + { + "epoch": 0.2746, + "grad_norm": 0.040130615234375, + "learning_rate": 1.8199521093254524e-05, + "loss": 0.0013, + "step": 2746 + }, + { + "epoch": 0.2748, + "grad_norm": 0.3483254611492157, + "learning_rate": 1.819552276527134e-05, + "loss": 0.0084, + "step": 2748 + }, + { + "epoch": 0.275, + "grad_norm": 0.016726382076740265, + "learning_rate": 1.819152044288992e-05, + "loss": 0.0058, + "step": 2750 + }, + { + "epoch": 0.2752, + "grad_norm": 1.2433677911758423, + "learning_rate": 1.8187514128060946e-05, + "loss": 0.0281, + "step": 2752 + }, + { + "epoch": 0.2754, + "grad_norm": 9.612857818603516, + "learning_rate": 1.818350382273705e-05, + "loss": 0.5651, + "step": 2754 + }, + { + "epoch": 0.2756, + "grad_norm": 1.5803875923156738, + "learning_rate": 1.8179489528872808e-05, + "loss": 0.182, + "step": 2756 + }, + { + "epoch": 0.2758, + "grad_norm": 0.01602315343916416, + "learning_rate": 1.817547124842473e-05, + "loss": 0.0012, + "step": 2758 + }, + { + "epoch": 0.276, + "grad_norm": 0.014849991537630558, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.0031, + "step": 2760 + }, + { + "epoch": 0.2762, + "grad_norm": 3.56184720993042, + "learning_rate": 1.8167422735612877e-05, + "loss": 0.1433, + "step": 2762 + }, + { + "epoch": 0.2764, + "grad_norm": 1.5880169868469238, + "learning_rate": 1.816339250717184e-05, + "loss": 0.0211, + "step": 2764 + }, + { + "epoch": 0.2766, + "grad_norm": 5.55112886428833, + "learning_rate": 1.815935829999247e-05, + "loss": 0.1319, + "step": 2766 + }, + { + "epoch": 0.2768, + "grad_norm": 1.8089768886566162, + "learning_rate": 1.8155320116040983e-05, + "loss": 0.0678, + "step": 2768 + }, + { + "epoch": 0.277, + "grad_norm": 0.84001624584198, + "learning_rate": 1.815127795728554e-05, + "loss": 0.0874, + "step": 2770 + }, + { + "epoch": 0.2772, + "grad_norm": 0.21358610689640045, + "learning_rate": 1.814723182569625e-05, + "loss": 0.1355, + "step": 2772 + }, + { + "epoch": 0.2774, + "grad_norm": 1.7645044326782227, + "learning_rate": 1.814318172324514e-05, + "loss": 0.125, + "step": 2774 + }, + { + "epoch": 0.2776, + "grad_norm": 0.03456607460975647, + "learning_rate": 1.8139127651906183e-05, + "loss": 0.0055, + "step": 2776 + }, + { + "epoch": 0.2778, + "grad_norm": 0.11924094706773758, + "learning_rate": 1.813506961365528e-05, + "loss": 0.0062, + "step": 2778 + }, + { + "epoch": 0.278, + "grad_norm": 0.020719969645142555, + "learning_rate": 1.8131007610470278e-05, + "loss": 0.1202, + "step": 2780 + }, + { + "epoch": 0.2782, + "grad_norm": 0.05252492055296898, + "learning_rate": 1.812694164433094e-05, + "loss": 0.0081, + "step": 2782 + }, + { + "epoch": 0.2784, + "grad_norm": 0.11162712424993515, + "learning_rate": 1.812287171721897e-05, + "loss": 0.0278, + "step": 2784 + }, + { + "epoch": 0.2786, + "grad_norm": 0.41010212898254395, + "learning_rate": 1.811879783111801e-05, + "loss": 0.0067, + "step": 2786 + }, + { + "epoch": 0.2788, + "grad_norm": 0.438539057970047, + "learning_rate": 1.8114719988013612e-05, + "loss": 0.0127, + "step": 2788 + }, + { + "epoch": 0.279, + "grad_norm": 0.0798589214682579, + "learning_rate": 1.8110638189893267e-05, + "loss": 0.0041, + "step": 2790 + }, + { + "epoch": 0.2792, + "grad_norm": 0.2505303919315338, + "learning_rate": 1.81065524387464e-05, + "loss": 0.007, + "step": 2792 + }, + { + "epoch": 0.2794, + "grad_norm": 0.2887933552265167, + "learning_rate": 1.8102462736564355e-05, + "loss": 0.0232, + "step": 2794 + }, + { + "epoch": 0.2796, + "grad_norm": 0.09318225830793381, + "learning_rate": 1.80983690853404e-05, + "loss": 0.0828, + "step": 2796 + }, + { + "epoch": 0.2798, + "grad_norm": 0.10834887623786926, + "learning_rate": 1.8094271487069733e-05, + "loss": 0.003, + "step": 2798 + }, + { + "epoch": 0.28, + "grad_norm": 0.015791188925504684, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.0005, + "step": 2800 + }, + { + "epoch": 0.2802, + "grad_norm": 3.965282440185547, + "learning_rate": 1.8086064457378667e-05, + "loss": 0.1173, + "step": 2802 + }, + { + "epoch": 0.2804, + "grad_norm": 0.9248358607292175, + "learning_rate": 1.8081955029958272e-05, + "loss": 0.0344, + "step": 2804 + }, + { + "epoch": 0.2806, + "grad_norm": 0.7074441909790039, + "learning_rate": 1.8077841663491174e-05, + "loss": 0.0115, + "step": 2806 + }, + { + "epoch": 0.2808, + "grad_norm": 0.02141636423766613, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.001, + "step": 2808 + }, + { + "epoch": 0.281, + "grad_norm": 0.4664030373096466, + "learning_rate": 1.806960312143802e-05, + "loss": 0.009, + "step": 2810 + }, + { + "epoch": 0.2812, + "grad_norm": 2.30702543258667, + "learning_rate": 1.8065477949867327e-05, + "loss": 0.0755, + "step": 2812 + }, + { + "epoch": 0.2814, + "grad_norm": 0.3388296663761139, + "learning_rate": 1.806134884728066e-05, + "loss": 0.0054, + "step": 2814 + }, + { + "epoch": 0.2816, + "grad_norm": 1.5970219373703003, + "learning_rate": 1.8057215815690494e-05, + "loss": 0.0146, + "step": 2816 + }, + { + "epoch": 0.2818, + "grad_norm": 0.09882229566574097, + "learning_rate": 1.8053078857111218e-05, + "loss": 0.0015, + "step": 2818 + }, + { + "epoch": 0.282, + "grad_norm": 0.01699494756758213, + "learning_rate": 1.804893797355914e-05, + "loss": 0.0025, + "step": 2820 + }, + { + "epoch": 0.2822, + "grad_norm": 0.09507785737514496, + "learning_rate": 1.8044793167052476e-05, + "loss": 0.0261, + "step": 2822 + }, + { + "epoch": 0.2824, + "grad_norm": 0.010463139042258263, + "learning_rate": 1.8040644439611348e-05, + "loss": 0.024, + "step": 2824 + }, + { + "epoch": 0.2826, + "grad_norm": 1.2571064233779907, + "learning_rate": 1.80364917932578e-05, + "loss": 0.033, + "step": 2826 + }, + { + "epoch": 0.2828, + "grad_norm": 0.025711998343467712, + "learning_rate": 1.803233523001578e-05, + "loss": 0.0011, + "step": 2828 + }, + { + "epoch": 0.283, + "grad_norm": 0.1303061991930008, + "learning_rate": 1.8028174751911147e-05, + "loss": 0.0022, + "step": 2830 + }, + { + "epoch": 0.2832, + "grad_norm": 0.46507424116134644, + "learning_rate": 1.802401036097167e-05, + "loss": 0.0487, + "step": 2832 + }, + { + "epoch": 0.2834, + "grad_norm": 0.010381229221820831, + "learning_rate": 1.801984205922701e-05, + "loss": 0.0235, + "step": 2834 + }, + { + "epoch": 0.2836, + "grad_norm": 0.13466478884220123, + "learning_rate": 1.8015669848708768e-05, + "loss": 0.0037, + "step": 2836 + }, + { + "epoch": 0.2838, + "grad_norm": 4.629215717315674, + "learning_rate": 1.8011493731450412e-05, + "loss": 0.0646, + "step": 2838 + }, + { + "epoch": 0.284, + "grad_norm": 0.01943955570459366, + "learning_rate": 1.8007313709487334e-05, + "loss": 0.1089, + "step": 2840 + }, + { + "epoch": 0.2842, + "grad_norm": 0.09953303635120392, + "learning_rate": 1.8003129784856832e-05, + "loss": 0.005, + "step": 2842 + }, + { + "epoch": 0.2844, + "grad_norm": 0.34914568066596985, + "learning_rate": 1.7998941959598097e-05, + "loss": 0.0025, + "step": 2844 + }, + { + "epoch": 0.2846, + "grad_norm": 0.018800677731633186, + "learning_rate": 1.799475023575222e-05, + "loss": 0.0007, + "step": 2846 + }, + { + "epoch": 0.2848, + "grad_norm": 0.02653719112277031, + "learning_rate": 1.79905546153622e-05, + "loss": 0.0005, + "step": 2848 + }, + { + "epoch": 0.285, + "grad_norm": 0.16337107121944427, + "learning_rate": 1.798635510047293e-05, + "loss": 0.0035, + "step": 2850 + }, + { + "epoch": 0.2852, + "grad_norm": 0.006025433074682951, + "learning_rate": 1.7982151693131206e-05, + "loss": 0.0003, + "step": 2852 + }, + { + "epoch": 0.2854, + "grad_norm": 0.7866569757461548, + "learning_rate": 1.7977944395385713e-05, + "loss": 0.0192, + "step": 2854 + }, + { + "epoch": 0.2856, + "grad_norm": 0.48430755734443665, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.0041, + "step": 2856 + }, + { + "epoch": 0.2858, + "grad_norm": 0.006719428580254316, + "learning_rate": 1.7969518136887664e-05, + "loss": 0.0042, + "step": 2858 + }, + { + "epoch": 0.286, + "grad_norm": 21.938451766967773, + "learning_rate": 1.7965299180241963e-05, + "loss": 0.1239, + "step": 2860 + }, + { + "epoch": 0.2862, + "grad_norm": 0.002685297280550003, + "learning_rate": 1.796107634140621e-05, + "loss": 0.0001, + "step": 2862 + }, + { + "epoch": 0.2864, + "grad_norm": 0.005755813326686621, + "learning_rate": 1.7956849622438554e-05, + "loss": 0.9201, + "step": 2864 + }, + { + "epoch": 0.2866, + "grad_norm": 0.8111469745635986, + "learning_rate": 1.795261902539906e-05, + "loss": 0.0709, + "step": 2866 + }, + { + "epoch": 0.2868, + "grad_norm": 1.7512388229370117, + "learning_rate": 1.794838455234966e-05, + "loss": 0.0183, + "step": 2868 + }, + { + "epoch": 0.287, + "grad_norm": 0.18515585362911224, + "learning_rate": 1.7944146205354182e-05, + "loss": 0.018, + "step": 2870 + }, + { + "epoch": 0.2872, + "grad_norm": 0.023890402168035507, + "learning_rate": 1.7939903986478354e-05, + "loss": 0.0006, + "step": 2872 + }, + { + "epoch": 0.2874, + "grad_norm": 0.011195428669452667, + "learning_rate": 1.793565789778978e-05, + "loss": 0.0009, + "step": 2874 + }, + { + "epoch": 0.2876, + "grad_norm": 2.0449960231781006, + "learning_rate": 1.793140794135795e-05, + "loss": 0.0106, + "step": 2876 + }, + { + "epoch": 0.2878, + "grad_norm": 0.008460085839033127, + "learning_rate": 1.7927154119254234e-05, + "loss": 0.0003, + "step": 2878 + }, + { + "epoch": 0.288, + "grad_norm": 0.03704865276813507, + "learning_rate": 1.792289643355191e-05, + "loss": 0.0011, + "step": 2880 + }, + { + "epoch": 0.2882, + "grad_norm": 0.0191958025097847, + "learning_rate": 1.791863488632611e-05, + "loss": 0.008, + "step": 2882 + }, + { + "epoch": 0.2884, + "grad_norm": 0.08678248524665833, + "learning_rate": 1.7914369479653858e-05, + "loss": 0.003, + "step": 2884 + }, + { + "epoch": 0.2886, + "grad_norm": 1.1532390117645264, + "learning_rate": 1.791010021561407e-05, + "loss": 0.0166, + "step": 2886 + }, + { + "epoch": 0.2888, + "grad_norm": 0.05805323272943497, + "learning_rate": 1.7905827096287532e-05, + "loss": 0.0014, + "step": 2888 + }, + { + "epoch": 0.289, + "grad_norm": 3.1419084072113037, + "learning_rate": 1.7901550123756906e-05, + "loss": 0.0723, + "step": 2890 + }, + { + "epoch": 0.2892, + "grad_norm": 0.010222111828625202, + "learning_rate": 1.789726930010674e-05, + "loss": 0.0003, + "step": 2892 + }, + { + "epoch": 0.2894, + "grad_norm": 0.35604751110076904, + "learning_rate": 1.789298462742345e-05, + "loss": 0.0075, + "step": 2894 + }, + { + "epoch": 0.2896, + "grad_norm": 5.877459526062012, + "learning_rate": 1.7888696107795343e-05, + "loss": 0.1023, + "step": 2896 + }, + { + "epoch": 0.2898, + "grad_norm": 0.027293343096971512, + "learning_rate": 1.7884403743312583e-05, + "loss": 0.0009, + "step": 2898 + }, + { + "epoch": 0.29, + "grad_norm": 0.013623971492052078, + "learning_rate": 1.788010753606722e-05, + "loss": 0.0003, + "step": 2900 + }, + { + "epoch": 0.2902, + "grad_norm": 0.03823085129261017, + "learning_rate": 1.7875807488153173e-05, + "loss": 0.0012, + "step": 2902 + }, + { + "epoch": 0.2904, + "grad_norm": 0.016132116317749023, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.0913, + "step": 2904 + }, + { + "epoch": 0.2906, + "grad_norm": 2.3978123664855957, + "learning_rate": 1.7867195878704062e-05, + "loss": 0.0122, + "step": 2906 + }, + { + "epoch": 0.2908, + "grad_norm": 0.29414743185043335, + "learning_rate": 1.786288432136619e-05, + "loss": 0.0196, + "step": 2908 + }, + { + "epoch": 0.291, + "grad_norm": 0.07482676208019257, + "learning_rate": 1.785856893175402e-05, + "loss": 0.0049, + "step": 2910 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5600738525390625, + "learning_rate": 1.785424971197082e-05, + "loss": 0.007, + "step": 2912 + }, + { + "epoch": 0.2914, + "grad_norm": 9.395570755004883, + "learning_rate": 1.7849926664121726e-05, + "loss": 0.1206, + "step": 2914 + }, + { + "epoch": 0.2916, + "grad_norm": 0.03088049776852131, + "learning_rate": 1.7845599790313735e-05, + "loss": 0.0009, + "step": 2916 + }, + { + "epoch": 0.2918, + "grad_norm": 0.08165595680475235, + "learning_rate": 1.7841269092655714e-05, + "loss": 0.0025, + "step": 2918 + }, + { + "epoch": 0.292, + "grad_norm": 0.27552640438079834, + "learning_rate": 1.78369345732584e-05, + "loss": 0.3024, + "step": 2920 + }, + { + "epoch": 0.2922, + "grad_norm": 0.0037995472084730864, + "learning_rate": 1.7832596234234376e-05, + "loss": 0.0003, + "step": 2922 + }, + { + "epoch": 0.2924, + "grad_norm": 0.3043214976787567, + "learning_rate": 1.78282540776981e-05, + "loss": 0.0025, + "step": 2924 + }, + { + "epoch": 0.2926, + "grad_norm": 0.02880287542939186, + "learning_rate": 1.7823908105765883e-05, + "loss": 0.0024, + "step": 2926 + }, + { + "epoch": 0.2928, + "grad_norm": 0.07615689188241959, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.016, + "step": 2928 + }, + { + "epoch": 0.293, + "grad_norm": 0.03465000540018082, + "learning_rate": 1.781520472418819e-05, + "loss": 0.0009, + "step": 2930 + }, + { + "epoch": 0.2932, + "grad_norm": 0.07078465074300766, + "learning_rate": 1.7810847318784632e-05, + "loss": 0.0025, + "step": 2932 + }, + { + "epoch": 0.2934, + "grad_norm": 0.010036272928118706, + "learning_rate": 1.7806486106468983e-05, + "loss": 0.1128, + "step": 2934 + }, + { + "epoch": 0.2936, + "grad_norm": 0.040595199912786484, + "learning_rate": 1.780212108936684e-05, + "loss": 0.0078, + "step": 2936 + }, + { + "epoch": 0.2938, + "grad_norm": 0.010727690532803535, + "learning_rate": 1.7797752269605654e-05, + "loss": 0.001, + "step": 2938 + }, + { + "epoch": 0.294, + "grad_norm": 3.6340279579162598, + "learning_rate": 1.7793379649314743e-05, + "loss": 0.0724, + "step": 2940 + }, + { + "epoch": 0.2942, + "grad_norm": 0.03119977004826069, + "learning_rate": 1.7789003230625266e-05, + "loss": 0.0205, + "step": 2942 + }, + { + "epoch": 0.2944, + "grad_norm": 0.024382494390010834, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.0014, + "step": 2944 + }, + { + "epoch": 0.2946, + "grad_norm": 0.7623934149742126, + "learning_rate": 1.7780239006584515e-05, + "loss": 0.0183, + "step": 2946 + }, + { + "epoch": 0.2948, + "grad_norm": 1.230650544166565, + "learning_rate": 1.7775851205504823e-05, + "loss": 0.0316, + "step": 2948 + }, + { + "epoch": 0.295, + "grad_norm": 0.050762493163347244, + "learning_rate": 1.777145961456971e-05, + "loss": 0.0024, + "step": 2950 + }, + { + "epoch": 0.2952, + "grad_norm": 0.12976284325122833, + "learning_rate": 1.7767064235919594e-05, + "loss": 0.0029, + "step": 2952 + }, + { + "epoch": 0.2954, + "grad_norm": 0.03910979628562927, + "learning_rate": 1.776266507169672e-05, + "loss": 0.0006, + "step": 2954 + }, + { + "epoch": 0.2956, + "grad_norm": 0.48871737718582153, + "learning_rate": 1.7758262124045195e-05, + "loss": 0.0059, + "step": 2956 + }, + { + "epoch": 0.2958, + "grad_norm": 9.118988990783691, + "learning_rate": 1.775385539511096e-05, + "loss": 0.056, + "step": 2958 + }, + { + "epoch": 0.296, + "grad_norm": 0.010915910825133324, + "learning_rate": 1.7749444887041797e-05, + "loss": 0.0004, + "step": 2960 + }, + { + "epoch": 0.2962, + "grad_norm": 0.6175291538238525, + "learning_rate": 1.7745030601987338e-05, + "loss": 0.0059, + "step": 2962 + }, + { + "epoch": 0.2964, + "grad_norm": 0.6484206914901733, + "learning_rate": 1.7740612542099054e-05, + "loss": 0.0151, + "step": 2964 + }, + { + "epoch": 0.2966, + "grad_norm": 0.010245351120829582, + "learning_rate": 1.773619070953025e-05, + "loss": 0.0005, + "step": 2966 + }, + { + "epoch": 0.2968, + "grad_norm": 0.005198138765990734, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.0004, + "step": 2968 + }, + { + "epoch": 0.297, + "grad_norm": 7.488677978515625, + "learning_rate": 1.7727335734973512e-05, + "loss": 0.0504, + "step": 2970 + }, + { + "epoch": 0.2972, + "grad_norm": 2.869818687438965, + "learning_rate": 1.7722902597301385e-05, + "loss": 0.0824, + "step": 2972 + }, + { + "epoch": 0.2974, + "grad_norm": 0.11220328509807587, + "learning_rate": 1.771846569558035e-05, + "loss": 0.0504, + "step": 2974 + }, + { + "epoch": 0.2976, + "grad_norm": 0.16732782125473022, + "learning_rate": 1.7714025031972904e-05, + "loss": 0.003, + "step": 2976 + }, + { + "epoch": 0.2978, + "grad_norm": 0.2663513720035553, + "learning_rate": 1.7709580608643364e-05, + "loss": 0.0401, + "step": 2978 + }, + { + "epoch": 0.298, + "grad_norm": 0.009158470667898655, + "learning_rate": 1.7705132427757895e-05, + "loss": 0.0006, + "step": 2980 + }, + { + "epoch": 0.2982, + "grad_norm": 0.25305524468421936, + "learning_rate": 1.770068049148448e-05, + "loss": 0.0376, + "step": 2982 + }, + { + "epoch": 0.2984, + "grad_norm": 0.06420150399208069, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.0019, + "step": 2984 + }, + { + "epoch": 0.2986, + "grad_norm": 0.010242782533168793, + "learning_rate": 1.769176536145494e-05, + "loss": 0.0055, + "step": 2986 + }, + { + "epoch": 0.2988, + "grad_norm": 1.413893222808838, + "learning_rate": 1.7687302172043933e-05, + "loss": 0.0605, + "step": 2988 + }, + { + "epoch": 0.299, + "grad_norm": 0.017410963773727417, + "learning_rate": 1.7682835235935236e-05, + "loss": 0.0003, + "step": 2990 + }, + { + "epoch": 0.2992, + "grad_norm": 0.11974873393774033, + "learning_rate": 1.767836455530598e-05, + "loss": 0.01, + "step": 2992 + }, + { + "epoch": 0.2994, + "grad_norm": 0.002127000130712986, + "learning_rate": 1.767389013233511e-05, + "loss": 0.4959, + "step": 2994 + }, + { + "epoch": 0.2996, + "grad_norm": 10.890230178833008, + "learning_rate": 1.7669411969203417e-05, + "loss": 0.2107, + "step": 2996 + }, + { + "epoch": 0.2998, + "grad_norm": 0.14034514129161835, + "learning_rate": 1.76649300680935e-05, + "loss": 0.0015, + "step": 2998 + }, + { + "epoch": 0.3, + "grad_norm": 0.13414713740348816, + "learning_rate": 1.766044443118978e-05, + "loss": 0.0413, + "step": 3000 + }, + { + "epoch": 0.3002, + "grad_norm": 0.20692890882492065, + "learning_rate": 1.7655955060678508e-05, + "loss": 0.0043, + "step": 3002 + }, + { + "epoch": 0.3004, + "grad_norm": 0.00765811325982213, + "learning_rate": 1.7651461958747745e-05, + "loss": 0.0003, + "step": 3004 + }, + { + "epoch": 0.3006, + "grad_norm": 0.05482453107833862, + "learning_rate": 1.7646965127587373e-05, + "loss": 0.0017, + "step": 3006 + }, + { + "epoch": 0.3008, + "grad_norm": 0.31301790475845337, + "learning_rate": 1.764246456938909e-05, + "loss": 0.009, + "step": 3008 + }, + { + "epoch": 0.301, + "grad_norm": 0.10657927393913269, + "learning_rate": 1.7637960286346423e-05, + "loss": 0.0024, + "step": 3010 + }, + { + "epoch": 0.3012, + "grad_norm": 0.05424473062157631, + "learning_rate": 1.76334522806547e-05, + "loss": 0.002, + "step": 3012 + }, + { + "epoch": 0.3014, + "grad_norm": 0.11894019693136215, + "learning_rate": 1.7628940554511064e-05, + "loss": 0.0041, + "step": 3014 + }, + { + "epoch": 0.3016, + "grad_norm": 1.239383339881897, + "learning_rate": 1.762442511011448e-05, + "loss": 0.0336, + "step": 3016 + }, + { + "epoch": 0.3018, + "grad_norm": 0.04522930085659027, + "learning_rate": 1.761990594966572e-05, + "loss": 0.004, + "step": 3018 + }, + { + "epoch": 0.302, + "grad_norm": 0.48217910528182983, + "learning_rate": 1.761538307536737e-05, + "loss": 0.0185, + "step": 3020 + }, + { + "epoch": 0.3022, + "grad_norm": 0.1863677054643631, + "learning_rate": 1.761085648942382e-05, + "loss": 0.0039, + "step": 3022 + }, + { + "epoch": 0.3024, + "grad_norm": 0.8006262183189392, + "learning_rate": 1.7606326194041274e-05, + "loss": 0.0475, + "step": 3024 + }, + { + "epoch": 0.3026, + "grad_norm": 0.9740577340126038, + "learning_rate": 1.760179219142774e-05, + "loss": 0.018, + "step": 3026 + }, + { + "epoch": 0.3028, + "grad_norm": 0.2052193284034729, + "learning_rate": 1.759725448379305e-05, + "loss": 0.0059, + "step": 3028 + }, + { + "epoch": 0.303, + "grad_norm": 0.08983856439590454, + "learning_rate": 1.759271307334881e-05, + "loss": 0.0026, + "step": 3030 + }, + { + "epoch": 0.3032, + "grad_norm": 3.330824851989746, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.0774, + "step": 3032 + }, + { + "epoch": 0.3034, + "grad_norm": 0.6715301275253296, + "learning_rate": 1.7583619152887222e-05, + "loss": 0.0603, + "step": 3034 + }, + { + "epoch": 0.3036, + "grad_norm": 5.981576442718506, + "learning_rate": 1.7579066647302134e-05, + "loss": 0.1907, + "step": 3036 + }, + { + "epoch": 0.3038, + "grad_norm": 0.048778459429740906, + "learning_rate": 1.757451044777204e-05, + "loss": 0.0062, + "step": 3038 + }, + { + "epoch": 0.304, + "grad_norm": 0.15335293114185333, + "learning_rate": 1.7569950556517566e-05, + "loss": 0.0145, + "step": 3040 + }, + { + "epoch": 0.3042, + "grad_norm": 0.14485983550548553, + "learning_rate": 1.756538697576115e-05, + "loss": 0.0101, + "step": 3042 + }, + { + "epoch": 0.3044, + "grad_norm": 0.04363395646214485, + "learning_rate": 1.7560819707727034e-05, + "loss": 0.0017, + "step": 3044 + }, + { + "epoch": 0.3046, + "grad_norm": 0.18354494869709015, + "learning_rate": 1.7556248754641237e-05, + "loss": 0.024, + "step": 3046 + }, + { + "epoch": 0.3048, + "grad_norm": 1.5058907270431519, + "learning_rate": 1.7551674118731592e-05, + "loss": 0.026, + "step": 3048 + }, + { + "epoch": 0.305, + "grad_norm": 0.019125523045659065, + "learning_rate": 1.7547095802227723e-05, + "loss": 0.0298, + "step": 3050 + }, + { + "epoch": 0.3052, + "grad_norm": 0.11163520067930222, + "learning_rate": 1.754251380736104e-05, + "loss": 0.0027, + "step": 3052 + }, + { + "epoch": 0.3054, + "grad_norm": 0.506487250328064, + "learning_rate": 1.7537928136364756e-05, + "loss": 0.0077, + "step": 3054 + }, + { + "epoch": 0.3056, + "grad_norm": 0.5844160318374634, + "learning_rate": 1.7533338791473872e-05, + "loss": 0.0085, + "step": 3056 + }, + { + "epoch": 0.3058, + "grad_norm": 0.12395542860031128, + "learning_rate": 1.7528745774925175e-05, + "loss": 0.0055, + "step": 3058 + }, + { + "epoch": 0.306, + "grad_norm": 0.15788047015666962, + "learning_rate": 1.7524149088957244e-05, + "loss": 0.0024, + "step": 3060 + }, + { + "epoch": 0.3062, + "grad_norm": 0.9070379137992859, + "learning_rate": 1.7519548735810456e-05, + "loss": 0.0929, + "step": 3062 + }, + { + "epoch": 0.3064, + "grad_norm": 0.1830589473247528, + "learning_rate": 1.7514944717726962e-05, + "loss": 0.0038, + "step": 3064 + }, + { + "epoch": 0.3066, + "grad_norm": 6.603653430938721, + "learning_rate": 1.7510337036950703e-05, + "loss": 0.3096, + "step": 3066 + }, + { + "epoch": 0.3068, + "grad_norm": 0.4023731052875519, + "learning_rate": 1.7505725695727414e-05, + "loss": 0.226, + "step": 3068 + }, + { + "epoch": 0.307, + "grad_norm": 0.05441810563206673, + "learning_rate": 1.7501110696304598e-05, + "loss": 0.0053, + "step": 3070 + }, + { + "epoch": 0.3072, + "grad_norm": 0.021053334698081017, + "learning_rate": 1.749649204093155e-05, + "loss": 0.0027, + "step": 3072 + }, + { + "epoch": 0.3074, + "grad_norm": 1.6434757709503174, + "learning_rate": 1.7491869731859353e-05, + "loss": 0.0334, + "step": 3074 + }, + { + "epoch": 0.3076, + "grad_norm": 0.10476858168840408, + "learning_rate": 1.7487243771340862e-05, + "loss": 0.0026, + "step": 3076 + }, + { + "epoch": 0.3078, + "grad_norm": 0.02222847379744053, + "learning_rate": 1.7482614161630714e-05, + "loss": 0.0839, + "step": 3078 + }, + { + "epoch": 0.308, + "grad_norm": 0.009191624820232391, + "learning_rate": 1.747798090498532e-05, + "loss": 0.0029, + "step": 3080 + }, + { + "epoch": 0.3082, + "grad_norm": 0.7185196876525879, + "learning_rate": 1.7473344003662877e-05, + "loss": 0.1027, + "step": 3082 + }, + { + "epoch": 0.3084, + "grad_norm": 0.618700385093689, + "learning_rate": 1.746870345992336e-05, + "loss": 0.0094, + "step": 3084 + }, + { + "epoch": 0.3086, + "grad_norm": 0.15484216809272766, + "learning_rate": 1.7464059276028497e-05, + "loss": 0.0293, + "step": 3086 + }, + { + "epoch": 0.3088, + "grad_norm": 1.191298484802246, + "learning_rate": 1.7459411454241822e-05, + "loss": 0.0482, + "step": 3088 + }, + { + "epoch": 0.309, + "grad_norm": 0.20910286903381348, + "learning_rate": 1.7454759996828622e-05, + "loss": 0.0127, + "step": 3090 + }, + { + "epoch": 0.3092, + "grad_norm": 0.6651049256324768, + "learning_rate": 1.7450104906055963e-05, + "loss": 0.0144, + "step": 3092 + }, + { + "epoch": 0.3094, + "grad_norm": 0.22190743684768677, + "learning_rate": 1.7445446184192674e-05, + "loss": 0.0071, + "step": 3094 + }, + { + "epoch": 0.3096, + "grad_norm": 0.1612924188375473, + "learning_rate": 1.7440783833509366e-05, + "loss": 0.0265, + "step": 3096 + }, + { + "epoch": 0.3098, + "grad_norm": 0.26073792576789856, + "learning_rate": 1.743611785627841e-05, + "loss": 0.0056, + "step": 3098 + }, + { + "epoch": 0.31, + "grad_norm": 0.027563758194446564, + "learning_rate": 1.7431448254773943e-05, + "loss": 0.0186, + "step": 3100 + }, + { + "epoch": 0.3102, + "grad_norm": 0.050092682242393494, + "learning_rate": 1.7426775031271876e-05, + "loss": 0.0251, + "step": 3102 + }, + { + "epoch": 0.3104, + "grad_norm": 0.015051164664328098, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.0207, + "step": 3104 + }, + { + "epoch": 0.3106, + "grad_norm": 0.07332376390695572, + "learning_rate": 1.7417417727387392e-05, + "loss": 0.0013, + "step": 3106 + }, + { + "epoch": 0.3108, + "grad_norm": 0.00795117486268282, + "learning_rate": 1.741273365156561e-05, + "loss": 0.0011, + "step": 3108 + }, + { + "epoch": 0.311, + "grad_norm": 2.9841666221618652, + "learning_rate": 1.74080459628675e-05, + "loss": 0.067, + "step": 3110 + }, + { + "epoch": 0.3112, + "grad_norm": 0.06486763805150986, + "learning_rate": 1.7403354663577782e-05, + "loss": 0.0077, + "step": 3112 + }, + { + "epoch": 0.3114, + "grad_norm": 0.021873516961932182, + "learning_rate": 1.7398659755982937e-05, + "loss": 0.0143, + "step": 3114 + }, + { + "epoch": 0.3116, + "grad_norm": 0.011448029428720474, + "learning_rate": 1.7393961242371203e-05, + "loss": 0.0171, + "step": 3116 + }, + { + "epoch": 0.3118, + "grad_norm": 0.16443422436714172, + "learning_rate": 1.738925912503259e-05, + "loss": 0.004, + "step": 3118 + }, + { + "epoch": 0.312, + "grad_norm": 3.9635562896728516, + "learning_rate": 1.7384553406258842e-05, + "loss": 0.0591, + "step": 3120 + }, + { + "epoch": 0.3122, + "grad_norm": 0.152618408203125, + "learning_rate": 1.737984408834347e-05, + "loss": 0.215, + "step": 3122 + }, + { + "epoch": 0.3124, + "grad_norm": 0.1450982242822647, + "learning_rate": 1.737513117358174e-05, + "loss": 0.0127, + "step": 3124 + }, + { + "epoch": 0.3126, + "grad_norm": 0.04693948104977608, + "learning_rate": 1.7370414664270675e-05, + "loss": 0.0016, + "step": 3126 + }, + { + "epoch": 0.3128, + "grad_norm": 0.04794273525476456, + "learning_rate": 1.7365694562709034e-05, + "loss": 0.001, + "step": 3128 + }, + { + "epoch": 0.313, + "grad_norm": 0.050553470849990845, + "learning_rate": 1.7360970871197347e-05, + "loss": 0.0012, + "step": 3130 + }, + { + "epoch": 0.3132, + "grad_norm": 0.6768764853477478, + "learning_rate": 1.7356243592037876e-05, + "loss": 0.0165, + "step": 3132 + }, + { + "epoch": 0.3134, + "grad_norm": 0.7094630599021912, + "learning_rate": 1.7351512727534645e-05, + "loss": 0.0224, + "step": 3134 + }, + { + "epoch": 0.3136, + "grad_norm": 0.0645882859826088, + "learning_rate": 1.7346778279993417e-05, + "loss": 0.011, + "step": 3136 + }, + { + "epoch": 0.3138, + "grad_norm": 0.8315227031707764, + "learning_rate": 1.7342040251721702e-05, + "loss": 0.0155, + "step": 3138 + }, + { + "epoch": 0.314, + "grad_norm": 0.05002930015325546, + "learning_rate": 1.7337298645028764e-05, + "loss": 0.0009, + "step": 3140 + }, + { + "epoch": 0.3142, + "grad_norm": 0.06902045011520386, + "learning_rate": 1.7332553462225604e-05, + "loss": 0.0034, + "step": 3142 + }, + { + "epoch": 0.3144, + "grad_norm": 1.3589166402816772, + "learning_rate": 1.732780470562496e-05, + "loss": 0.0245, + "step": 3144 + }, + { + "epoch": 0.3146, + "grad_norm": 0.2619578540325165, + "learning_rate": 1.732305237754132e-05, + "loss": 0.0029, + "step": 3146 + }, + { + "epoch": 0.3148, + "grad_norm": 0.006062662694603205, + "learning_rate": 1.7318296480290912e-05, + "loss": 0.0009, + "step": 3148 + }, + { + "epoch": 0.315, + "grad_norm": 0.027885327115654945, + "learning_rate": 1.7313537016191706e-05, + "loss": 0.0035, + "step": 3150 + }, + { + "epoch": 0.3152, + "grad_norm": 0.019586361944675446, + "learning_rate": 1.7308773987563406e-05, + "loss": 0.0008, + "step": 3152 + }, + { + "epoch": 0.3154, + "grad_norm": 0.005444913636893034, + "learning_rate": 1.730400739672745e-05, + "loss": 0.0003, + "step": 3154 + }, + { + "epoch": 0.3156, + "grad_norm": 1.7462350130081177, + "learning_rate": 1.7299237246007018e-05, + "loss": 0.0991, + "step": 3156 + }, + { + "epoch": 0.3158, + "grad_norm": 0.0360061340034008, + "learning_rate": 1.7294463537727026e-05, + "loss": 0.0005, + "step": 3158 + }, + { + "epoch": 0.316, + "grad_norm": 1.8418307304382324, + "learning_rate": 1.7289686274214116e-05, + "loss": 0.028, + "step": 3160 + }, + { + "epoch": 0.3162, + "grad_norm": 0.003376666223630309, + "learning_rate": 1.7284905457796678e-05, + "loss": 0.0041, + "step": 3162 + }, + { + "epoch": 0.3164, + "grad_norm": 0.020619291812181473, + "learning_rate": 1.7280121090804813e-05, + "loss": 0.005, + "step": 3164 + }, + { + "epoch": 0.3166, + "grad_norm": 0.9706044793128967, + "learning_rate": 1.727533317557037e-05, + "loss": 0.0149, + "step": 3166 + }, + { + "epoch": 0.3168, + "grad_norm": 0.0034657225478440523, + "learning_rate": 1.727054171442692e-05, + "loss": 0.0001, + "step": 3168 + }, + { + "epoch": 0.317, + "grad_norm": 7.882265090942383, + "learning_rate": 1.7265746709709762e-05, + "loss": 0.1749, + "step": 3170 + }, + { + "epoch": 0.3172, + "grad_norm": 0.32060983777046204, + "learning_rate": 1.7260948163755918e-05, + "loss": 0.0038, + "step": 3172 + }, + { + "epoch": 0.3174, + "grad_norm": 0.4775964915752411, + "learning_rate": 1.7256146078904153e-05, + "loss": 0.0198, + "step": 3174 + }, + { + "epoch": 0.3176, + "grad_norm": 0.006655476056039333, + "learning_rate": 1.7251340457494934e-05, + "loss": 0.0011, + "step": 3176 + }, + { + "epoch": 0.3178, + "grad_norm": 0.005904811434447765, + "learning_rate": 1.7246531301870467e-05, + "loss": 0.0009, + "step": 3178 + }, + { + "epoch": 0.318, + "grad_norm": 1.4022754430770874, + "learning_rate": 1.7241718614374678e-05, + "loss": 0.0154, + "step": 3180 + }, + { + "epoch": 0.3182, + "grad_norm": 0.09416387975215912, + "learning_rate": 1.7236902397353204e-05, + "loss": 0.0012, + "step": 3182 + }, + { + "epoch": 0.3184, + "grad_norm": 1.969428539276123, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.3784, + "step": 3184 + }, + { + "epoch": 0.3186, + "grad_norm": 0.019654830917716026, + "learning_rate": 1.7227259384124408e-05, + "loss": 0.0003, + "step": 3186 + }, + { + "epoch": 0.3188, + "grad_norm": 0.0019007449736818671, + "learning_rate": 1.722243259261697e-05, + "loss": 0.0874, + "step": 3188 + }, + { + "epoch": 0.319, + "grad_norm": 1.950238823890686, + "learning_rate": 1.7217602280983622e-05, + "loss": 0.0171, + "step": 3190 + }, + { + "epoch": 0.3192, + "grad_norm": 0.18302948772907257, + "learning_rate": 1.721276845157861e-05, + "loss": 0.0025, + "step": 3192 + }, + { + "epoch": 0.3194, + "grad_norm": 5.234598159790039, + "learning_rate": 1.7207931106757867e-05, + "loss": 0.1029, + "step": 3194 + }, + { + "epoch": 0.3196, + "grad_norm": 0.01990848407149315, + "learning_rate": 1.720309024887907e-05, + "loss": 0.0008, + "step": 3196 + }, + { + "epoch": 0.3198, + "grad_norm": 0.024229446426033974, + "learning_rate": 1.719824588030159e-05, + "loss": 0.0511, + "step": 3198 + }, + { + "epoch": 0.32, + "grad_norm": 0.0060981037095189095, + "learning_rate": 1.7193398003386514e-05, + "loss": 0.0004, + "step": 3200 + }, + { + "epoch": 0.3202, + "grad_norm": 4.986056804656982, + "learning_rate": 1.7188546620496634e-05, + "loss": 0.0901, + "step": 3202 + }, + { + "epoch": 0.3204, + "grad_norm": 0.013943852856755257, + "learning_rate": 1.7183691733996463e-05, + "loss": 0.0068, + "step": 3204 + }, + { + "epoch": 0.3206, + "grad_norm": 0.374370813369751, + "learning_rate": 1.7178833346252208e-05, + "loss": 0.0057, + "step": 3206 + }, + { + "epoch": 0.3208, + "grad_norm": 14.131662368774414, + "learning_rate": 1.717397145963179e-05, + "loss": 0.2525, + "step": 3208 + }, + { + "epoch": 0.321, + "grad_norm": 0.004676810931414366, + "learning_rate": 1.716910607650483e-05, + "loss": 0.0044, + "step": 3210 + }, + { + "epoch": 0.3212, + "grad_norm": 0.3059453070163727, + "learning_rate": 1.716423719924266e-05, + "loss": 0.0035, + "step": 3212 + }, + { + "epoch": 0.3214, + "grad_norm": 0.10520552843809128, + "learning_rate": 1.7159364830218312e-05, + "loss": 0.0016, + "step": 3214 + }, + { + "epoch": 0.3216, + "grad_norm": 8.533829689025879, + "learning_rate": 1.715448897180652e-05, + "loss": 0.1769, + "step": 3216 + }, + { + "epoch": 0.3218, + "grad_norm": 3.2465550899505615, + "learning_rate": 1.7149609626383718e-05, + "loss": 0.0898, + "step": 3218 + }, + { + "epoch": 0.322, + "grad_norm": 2.5786564350128174, + "learning_rate": 1.7144726796328034e-05, + "loss": 0.077, + "step": 3220 + }, + { + "epoch": 0.3222, + "grad_norm": 9.064810752868652, + "learning_rate": 1.713984048401931e-05, + "loss": 0.3315, + "step": 3222 + }, + { + "epoch": 0.3224, + "grad_norm": 0.18401555716991425, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.0034, + "step": 3224 + }, + { + "epoch": 0.3226, + "grad_norm": 0.20044183731079102, + "learning_rate": 1.713005742217053e-05, + "loss": 0.0035, + "step": 3226 + }, + { + "epoch": 0.3228, + "grad_norm": 0.3936220109462738, + "learning_rate": 1.7125160677398625e-05, + "loss": 0.0077, + "step": 3228 + }, + { + "epoch": 0.323, + "grad_norm": 0.43855494260787964, + "learning_rate": 1.712026045990997e-05, + "loss": 0.03, + "step": 3230 + }, + { + "epoch": 0.3232, + "grad_norm": 0.010615245439112186, + "learning_rate": 1.7115356772092858e-05, + "loss": 0.0007, + "step": 3232 + }, + { + "epoch": 0.3234, + "grad_norm": 2.0116074085235596, + "learning_rate": 1.711044961633729e-05, + "loss": 0.0332, + "step": 3234 + }, + { + "epoch": 0.3236, + "grad_norm": 0.08622007817029953, + "learning_rate": 1.710553899503496e-05, + "loss": 0.0024, + "step": 3236 + }, + { + "epoch": 0.3238, + "grad_norm": 0.6214799284934998, + "learning_rate": 1.710062491057925e-05, + "loss": 0.0082, + "step": 3238 + }, + { + "epoch": 0.324, + "grad_norm": 0.4321895241737366, + "learning_rate": 1.709570736536521e-05, + "loss": 0.0094, + "step": 3240 + }, + { + "epoch": 0.3242, + "grad_norm": 0.06627032905817032, + "learning_rate": 1.7090786361789602e-05, + "loss": 0.0031, + "step": 3242 + }, + { + "epoch": 0.3244, + "grad_norm": 0.07586369663476944, + "learning_rate": 1.7085861902250864e-05, + "loss": 0.1072, + "step": 3244 + }, + { + "epoch": 0.3246, + "grad_norm": 2.0612521171569824, + "learning_rate": 1.7080933989149112e-05, + "loss": 0.0422, + "step": 3246 + }, + { + "epoch": 0.3248, + "grad_norm": 0.0708855539560318, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.0018, + "step": 3248 + }, + { + "epoch": 0.325, + "grad_norm": 0.006492982618510723, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.0017, + "step": 3250 + }, + { + "epoch": 0.3252, + "grad_norm": 0.12834718823432922, + "learning_rate": 1.706612955249225e-05, + "loss": 0.0027, + "step": 3252 + }, + { + "epoch": 0.3254, + "grad_norm": 0.03708411753177643, + "learning_rate": 1.7061187849173318e-05, + "loss": 0.0054, + "step": 3254 + }, + { + "epoch": 0.3256, + "grad_norm": 0.062188249081373215, + "learning_rate": 1.705624270431721e-05, + "loss": 0.0021, + "step": 3256 + }, + { + "epoch": 0.3258, + "grad_norm": 0.009319106116890907, + "learning_rate": 1.7051294120334126e-05, + "loss": 0.0158, + "step": 3258 + }, + { + "epoch": 0.326, + "grad_norm": 3.1476051807403564, + "learning_rate": 1.7046342099635948e-05, + "loss": 0.0294, + "step": 3260 + }, + { + "epoch": 0.3262, + "grad_norm": 1.2315092086791992, + "learning_rate": 1.704138664463623e-05, + "loss": 0.0211, + "step": 3262 + }, + { + "epoch": 0.3264, + "grad_norm": 1.072055459022522, + "learning_rate": 1.7036427757750205e-05, + "loss": 0.0199, + "step": 3264 + }, + { + "epoch": 0.3266, + "grad_norm": 0.07827428728342056, + "learning_rate": 1.7031465441394766e-05, + "loss": 0.0014, + "step": 3266 + }, + { + "epoch": 0.3268, + "grad_norm": 0.32840678095817566, + "learning_rate": 1.7026499697988496e-05, + "loss": 0.0193, + "step": 3268 + }, + { + "epoch": 0.327, + "grad_norm": 0.004599344450980425, + "learning_rate": 1.7021530529951627e-05, + "loss": 0.0007, + "step": 3270 + }, + { + "epoch": 0.3272, + "grad_norm": 0.1182892695069313, + "learning_rate": 1.7016557939706075e-05, + "loss": 0.0038, + "step": 3272 + }, + { + "epoch": 0.3274, + "grad_norm": 4.302881717681885, + "learning_rate": 1.7011581929675424e-05, + "loss": 0.071, + "step": 3274 + }, + { + "epoch": 0.3276, + "grad_norm": 0.00931827537715435, + "learning_rate": 1.700660250228492e-05, + "loss": 0.159, + "step": 3276 + }, + { + "epoch": 0.3278, + "grad_norm": 3.4438703060150146, + "learning_rate": 1.7001619659961467e-05, + "loss": 0.12, + "step": 3278 + }, + { + "epoch": 0.328, + "grad_norm": 0.0024581076577305794, + "learning_rate": 1.6996633405133656e-05, + "loss": 0.0032, + "step": 3280 + }, + { + "epoch": 0.3282, + "grad_norm": 0.036081183701753616, + "learning_rate": 1.6991643740231714e-05, + "loss": 0.0011, + "step": 3282 + }, + { + "epoch": 0.3284, + "grad_norm": 1.308199167251587, + "learning_rate": 1.6986650667687552e-05, + "loss": 0.0106, + "step": 3284 + }, + { + "epoch": 0.3286, + "grad_norm": 0.04118772968649864, + "learning_rate": 1.698165418993473e-05, + "loss": 0.0042, + "step": 3286 + }, + { + "epoch": 0.3288, + "grad_norm": 0.0034228512085974216, + "learning_rate": 1.6976654309408464e-05, + "loss": 0.0002, + "step": 3288 + }, + { + "epoch": 0.329, + "grad_norm": 0.0022913902066648006, + "learning_rate": 1.697165102854565e-05, + "loss": 0.0117, + "step": 3290 + }, + { + "epoch": 0.3292, + "grad_norm": 0.004312537610530853, + "learning_rate": 1.696664434978481e-05, + "loss": 0.1839, + "step": 3292 + }, + { + "epoch": 0.3294, + "grad_norm": 0.015636850148439407, + "learning_rate": 1.6961634275566147e-05, + "loss": 0.0007, + "step": 3294 + }, + { + "epoch": 0.3296, + "grad_norm": 0.06407980620861053, + "learning_rate": 1.695662080833151e-05, + "loss": 0.0024, + "step": 3296 + }, + { + "epoch": 0.3298, + "grad_norm": 0.043681494891643524, + "learning_rate": 1.69516039505244e-05, + "loss": 0.0025, + "step": 3298 + }, + { + "epoch": 0.33, + "grad_norm": 0.029340870678424835, + "learning_rate": 1.6946583704589973e-05, + "loss": 0.0025, + "step": 3300 + }, + { + "epoch": 0.3302, + "grad_norm": 0.27946147322654724, + "learning_rate": 1.694156007297504e-05, + "loss": 0.0039, + "step": 3302 + }, + { + "epoch": 0.3304, + "grad_norm": 0.05017179623246193, + "learning_rate": 1.693653305812805e-05, + "loss": 0.0009, + "step": 3304 + }, + { + "epoch": 0.3306, + "grad_norm": 0.005709236487746239, + "learning_rate": 1.6931502662499116e-05, + "loss": 0.0002, + "step": 3306 + }, + { + "epoch": 0.3308, + "grad_norm": 0.014585177414119244, + "learning_rate": 1.6926468888539988e-05, + "loss": 0.0004, + "step": 3308 + }, + { + "epoch": 0.331, + "grad_norm": 0.003783398075029254, + "learning_rate": 1.692143173870407e-05, + "loss": 0.0002, + "step": 3310 + }, + { + "epoch": 0.3312, + "grad_norm": 0.008064119145274162, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.0005, + "step": 3312 + }, + { + "epoch": 0.3314, + "grad_norm": 6.366088390350342, + "learning_rate": 1.691134732122368e-05, + "loss": 0.0352, + "step": 3314 + }, + { + "epoch": 0.3316, + "grad_norm": 0.09524200111627579, + "learning_rate": 1.690630005849423e-05, + "loss": 0.0008, + "step": 3316 + }, + { + "epoch": 0.3318, + "grad_norm": 0.005891555920243263, + "learning_rate": 1.6901249429718033e-05, + "loss": 0.0003, + "step": 3318 + }, + { + "epoch": 0.332, + "grad_norm": 0.05611858516931534, + "learning_rate": 1.68961954373567e-05, + "loss": 0.0024, + "step": 3320 + }, + { + "epoch": 0.3322, + "grad_norm": 0.01645403541624546, + "learning_rate": 1.6891138083873486e-05, + "loss": 0.1693, + "step": 3322 + }, + { + "epoch": 0.3324, + "grad_norm": 0.009655345231294632, + "learning_rate": 1.6886077371733285e-05, + "loss": 0.0002, + "step": 3324 + }, + { + "epoch": 0.3326, + "grad_norm": 2.272428274154663, + "learning_rate": 1.688101330340263e-05, + "loss": 0.1088, + "step": 3326 + }, + { + "epoch": 0.3328, + "grad_norm": 0.06794866919517517, + "learning_rate": 1.6875945881349676e-05, + "loss": 0.0276, + "step": 3328 + }, + { + "epoch": 0.333, + "grad_norm": 0.015088796615600586, + "learning_rate": 1.6870875108044233e-05, + "loss": 0.0004, + "step": 3330 + }, + { + "epoch": 0.3332, + "grad_norm": 0.02303614467382431, + "learning_rate": 1.686580098595773e-05, + "loss": 0.0236, + "step": 3332 + }, + { + "epoch": 0.3334, + "grad_norm": 0.0624302439391613, + "learning_rate": 1.6860723517563232e-05, + "loss": 0.0068, + "step": 3334 + }, + { + "epoch": 0.3336, + "grad_norm": 7.455021381378174, + "learning_rate": 1.6855642705335438e-05, + "loss": 0.1597, + "step": 3336 + }, + { + "epoch": 0.3338, + "grad_norm": 0.4509100615978241, + "learning_rate": 1.685055855175067e-05, + "loss": 0.0074, + "step": 3338 + }, + { + "epoch": 0.334, + "grad_norm": 0.01678750291466713, + "learning_rate": 1.684547105928689e-05, + "loss": 0.0155, + "step": 3340 + }, + { + "epoch": 0.3342, + "grad_norm": 0.05036754533648491, + "learning_rate": 1.684038023042367e-05, + "loss": 0.0028, + "step": 3342 + }, + { + "epoch": 0.3344, + "grad_norm": 0.03933846578001976, + "learning_rate": 1.6835286067642228e-05, + "loss": 0.0173, + "step": 3344 + }, + { + "epoch": 0.3346, + "grad_norm": 0.004518347326666117, + "learning_rate": 1.683018857342539e-05, + "loss": 0.1251, + "step": 3346 + }, + { + "epoch": 0.3348, + "grad_norm": 0.23386144638061523, + "learning_rate": 1.6825087750257617e-05, + "loss": 0.0017, + "step": 3348 + }, + { + "epoch": 0.335, + "grad_norm": 4.0202202796936035, + "learning_rate": 1.6819983600624986e-05, + "loss": 0.0377, + "step": 3350 + }, + { + "epoch": 0.3352, + "grad_norm": 1.6617562770843506, + "learning_rate": 1.68148761270152e-05, + "loss": 0.0286, + "step": 3352 + }, + { + "epoch": 0.3354, + "grad_norm": 0.024906061589717865, + "learning_rate": 1.6809765331917576e-05, + "loss": 0.0287, + "step": 3354 + }, + { + "epoch": 0.3356, + "grad_norm": 0.0029040479566901922, + "learning_rate": 1.6804651217823055e-05, + "loss": 0.0003, + "step": 3356 + }, + { + "epoch": 0.3358, + "grad_norm": 0.02794128842651844, + "learning_rate": 1.6799533787224192e-05, + "loss": 0.0005, + "step": 3358 + }, + { + "epoch": 0.336, + "grad_norm": 2.9943435192108154, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.1008, + "step": 3360 + }, + { + "epoch": 0.3362, + "grad_norm": 5.490565776824951, + "learning_rate": 1.6789288986491764e-05, + "loss": 0.2119, + "step": 3362 + }, + { + "epoch": 0.3364, + "grad_norm": 0.0049570901319384575, + "learning_rate": 1.6784161621351384e-05, + "loss": 0.0002, + "step": 3364 + }, + { + "epoch": 0.3366, + "grad_norm": 1.1424684524536133, + "learning_rate": 1.6779030949693044e-05, + "loss": 0.0212, + "step": 3366 + }, + { + "epoch": 0.3368, + "grad_norm": 0.030230073258280754, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.0056, + "step": 3368 + }, + { + "epoch": 0.337, + "grad_norm": 0.011045556515455246, + "learning_rate": 1.6768759696826608e-05, + "loss": 0.0005, + "step": 3370 + }, + { + "epoch": 0.3372, + "grad_norm": 0.02466612122952938, + "learning_rate": 1.6763619120624595e-05, + "loss": 0.0132, + "step": 3372 + }, + { + "epoch": 0.3374, + "grad_norm": 0.012244025245308876, + "learning_rate": 1.6758475247916786e-05, + "loss": 0.0032, + "step": 3374 + }, + { + "epoch": 0.3376, + "grad_norm": 14.923068046569824, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.4058, + "step": 3376 + }, + { + "epoch": 0.3378, + "grad_norm": 3.856356382369995, + "learning_rate": 1.6748177623013638e-05, + "loss": 0.1103, + "step": 3378 + }, + { + "epoch": 0.338, + "grad_norm": 0.023330383002758026, + "learning_rate": 1.6743023875837233e-05, + "loss": 0.0013, + "step": 3380 + }, + { + "epoch": 0.3382, + "grad_norm": 1.0074591636657715, + "learning_rate": 1.6737866842192908e-05, + "loss": 0.035, + "step": 3382 + }, + { + "epoch": 0.3384, + "grad_norm": 0.04795592650771141, + "learning_rate": 1.6732706524594138e-05, + "loss": 0.0029, + "step": 3384 + }, + { + "epoch": 0.3386, + "grad_norm": 0.9997116327285767, + "learning_rate": 1.6727542925556e-05, + "loss": 0.0571, + "step": 3386 + }, + { + "epoch": 0.3388, + "grad_norm": 0.20015831291675568, + "learning_rate": 1.6722376047595163e-05, + "loss": 0.0082, + "step": 3388 + }, + { + "epoch": 0.339, + "grad_norm": 0.46921902894973755, + "learning_rate": 1.6717205893229904e-05, + "loss": 0.0074, + "step": 3390 + }, + { + "epoch": 0.3392, + "grad_norm": 0.04215814918279648, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.0049, + "step": 3392 + }, + { + "epoch": 0.3394, + "grad_norm": 0.5583164095878601, + "learning_rate": 1.6706855765367202e-05, + "loss": 0.0143, + "step": 3394 + }, + { + "epoch": 0.3396, + "grad_norm": 1.49314284324646, + "learning_rate": 1.6701675796914284e-05, + "loss": 0.0391, + "step": 3396 + }, + { + "epoch": 0.3398, + "grad_norm": 0.48825567960739136, + "learning_rate": 1.6696492562145996e-05, + "loss": 0.0219, + "step": 3398 + }, + { + "epoch": 0.34, + "grad_norm": 0.015591224655508995, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.0016, + "step": 3400 + }, + { + "epoch": 0.3402, + "grad_norm": 1.1567492485046387, + "learning_rate": 1.6686116303769884e-05, + "loss": 0.039, + "step": 3402 + }, + { + "epoch": 0.3404, + "grad_norm": 0.06416214257478714, + "learning_rate": 1.668092328521932e-05, + "loss": 0.007, + "step": 3404 + }, + { + "epoch": 0.3406, + "grad_norm": 0.5625594854354858, + "learning_rate": 1.667572701046791e-05, + "loss": 0.0122, + "step": 3406 + }, + { + "epoch": 0.3408, + "grad_norm": 0.8872382640838623, + "learning_rate": 1.6670527482048246e-05, + "loss": 0.181, + "step": 3408 + }, + { + "epoch": 0.341, + "grad_norm": 0.6225885152816772, + "learning_rate": 1.6665324702494524e-05, + "loss": 0.0959, + "step": 3410 + }, + { + "epoch": 0.3412, + "grad_norm": 0.5917980074882507, + "learning_rate": 1.666011867434252e-05, + "loss": 0.2987, + "step": 3412 + }, + { + "epoch": 0.3414, + "grad_norm": 2.6016342639923096, + "learning_rate": 1.6654909400129575e-05, + "loss": 0.075, + "step": 3414 + }, + { + "epoch": 0.3416, + "grad_norm": 0.38774314522743225, + "learning_rate": 1.6649696882394635e-05, + "loss": 0.0082, + "step": 3416 + }, + { + "epoch": 0.3418, + "grad_norm": 0.029687369242310524, + "learning_rate": 1.664448112367822e-05, + "loss": 0.0505, + "step": 3418 + }, + { + "epoch": 0.342, + "grad_norm": 0.44596394896507263, + "learning_rate": 1.6639262126522417e-05, + "loss": 0.0074, + "step": 3420 + }, + { + "epoch": 0.3422, + "grad_norm": 3.634861946105957, + "learning_rate": 1.6634039893470912e-05, + "loss": 0.1411, + "step": 3422 + }, + { + "epoch": 0.3424, + "grad_norm": 0.061530936509370804, + "learning_rate": 1.6628814427068954e-05, + "loss": 0.0647, + "step": 3424 + }, + { + "epoch": 0.3426, + "grad_norm": 0.0406322255730629, + "learning_rate": 1.662358572986337e-05, + "loss": 0.0009, + "step": 3426 + }, + { + "epoch": 0.3428, + "grad_norm": 0.18547497689723969, + "learning_rate": 1.6618353804402567e-05, + "loss": 0.0028, + "step": 3428 + }, + { + "epoch": 0.343, + "grad_norm": 0.11492137610912323, + "learning_rate": 1.661311865323652e-05, + "loss": 0.1143, + "step": 3430 + }, + { + "epoch": 0.3432, + "grad_norm": 0.059278469532728195, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.0171, + "step": 3432 + }, + { + "epoch": 0.3434, + "grad_norm": 0.8687169551849365, + "learning_rate": 1.6602638683996462e-05, + "loss": 0.0109, + "step": 3434 + }, + { + "epoch": 0.3436, + "grad_norm": 0.029279228299856186, + "learning_rate": 1.6597393871030264e-05, + "loss": 0.0012, + "step": 3436 + }, + { + "epoch": 0.3438, + "grad_norm": 5.858804225921631, + "learning_rate": 1.6592145842574433e-05, + "loss": 0.1703, + "step": 3438 + }, + { + "epoch": 0.344, + "grad_norm": 0.37119826674461365, + "learning_rate": 1.6586894601186804e-05, + "loss": 0.0076, + "step": 3440 + }, + { + "epoch": 0.3442, + "grad_norm": 1.6144152879714966, + "learning_rate": 1.6581640149426766e-05, + "loss": 0.1324, + "step": 3442 + }, + { + "epoch": 0.3444, + "grad_norm": 9.847855567932129, + "learning_rate": 1.6576382489855274e-05, + "loss": 0.1746, + "step": 3444 + }, + { + "epoch": 0.3446, + "grad_norm": 0.11904001235961914, + "learning_rate": 1.6571121625034847e-05, + "loss": 0.0169, + "step": 3446 + }, + { + "epoch": 0.3448, + "grad_norm": 0.012749559246003628, + "learning_rate": 1.6565857557529567e-05, + "loss": 0.0013, + "step": 3448 + }, + { + "epoch": 0.345, + "grad_norm": 0.0667443722486496, + "learning_rate": 1.6560590289905074e-05, + "loss": 0.0008, + "step": 3450 + }, + { + "epoch": 0.3452, + "grad_norm": 0.027050312608480453, + "learning_rate": 1.6555319824728577e-05, + "loss": 0.0104, + "step": 3452 + }, + { + "epoch": 0.3454, + "grad_norm": 0.120109923183918, + "learning_rate": 1.6550046164568827e-05, + "loss": 0.0359, + "step": 3454 + }, + { + "epoch": 0.3456, + "grad_norm": 0.1986231803894043, + "learning_rate": 1.654476931199615e-05, + "loss": 0.0659, + "step": 3456 + }, + { + "epoch": 0.3458, + "grad_norm": 0.11120156943798065, + "learning_rate": 1.6539489269582414e-05, + "loss": 0.0047, + "step": 3458 + }, + { + "epoch": 0.346, + "grad_norm": 0.0161144882440567, + "learning_rate": 1.6534206039901057e-05, + "loss": 0.0165, + "step": 3460 + }, + { + "epoch": 0.3462, + "grad_norm": 0.5118119716644287, + "learning_rate": 1.652891962552705e-05, + "loss": 0.0075, + "step": 3462 + }, + { + "epoch": 0.3464, + "grad_norm": 0.08039485663175583, + "learning_rate": 1.652363002903693e-05, + "loss": 0.0052, + "step": 3464 + }, + { + "epoch": 0.3466, + "grad_norm": 0.04041441157460213, + "learning_rate": 1.651833725300879e-05, + "loss": 0.0056, + "step": 3466 + }, + { + "epoch": 0.3468, + "grad_norm": 0.4430837035179138, + "learning_rate": 1.6513041300022253e-05, + "loss": 0.0081, + "step": 3468 + }, + { + "epoch": 0.347, + "grad_norm": 0.015078449621796608, + "learning_rate": 1.650774217265851e-05, + "loss": 0.0026, + "step": 3470 + }, + { + "epoch": 0.3472, + "grad_norm": 0.04156745225191116, + "learning_rate": 1.650243987350029e-05, + "loss": 0.0012, + "step": 3472 + }, + { + "epoch": 0.3474, + "grad_norm": 0.16728971898555756, + "learning_rate": 1.649713440513187e-05, + "loss": 0.0021, + "step": 3474 + }, + { + "epoch": 0.3476, + "grad_norm": 0.37982654571533203, + "learning_rate": 1.649182577013906e-05, + "loss": 0.0033, + "step": 3476 + }, + { + "epoch": 0.3478, + "grad_norm": 0.04140719398856163, + "learning_rate": 1.6486513971109245e-05, + "loss": 0.0007, + "step": 3478 + }, + { + "epoch": 0.348, + "grad_norm": 0.390001118183136, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.0059, + "step": 3480 + }, + { + "epoch": 0.3482, + "grad_norm": 0.3312391936779022, + "learning_rate": 1.6475880891295716e-05, + "loss": 0.0072, + "step": 3482 + }, + { + "epoch": 0.3484, + "grad_norm": 0.02585277147591114, + "learning_rate": 1.6470559615694445e-05, + "loss": 0.0007, + "step": 3484 + }, + { + "epoch": 0.3486, + "grad_norm": 0.02166840247809887, + "learning_rate": 1.6465235186421024e-05, + "loss": 0.001, + "step": 3486 + }, + { + "epoch": 0.3488, + "grad_norm": 0.7614883780479431, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.0474, + "step": 3488 + }, + { + "epoch": 0.349, + "grad_norm": 0.011535302735865116, + "learning_rate": 1.645457687723951e-05, + "loss": 0.0005, + "step": 3490 + }, + { + "epoch": 0.3492, + "grad_norm": 0.531298041343689, + "learning_rate": 1.6449243002526146e-05, + "loss": 0.008, + "step": 3492 + }, + { + "epoch": 0.3494, + "grad_norm": 1.1808953285217285, + "learning_rate": 1.6443905984530092e-05, + "loss": 0.0266, + "step": 3494 + }, + { + "epoch": 0.3496, + "grad_norm": 0.626410186290741, + "learning_rate": 1.643856582585254e-05, + "loss": 0.0608, + "step": 3496 + }, + { + "epoch": 0.3498, + "grad_norm": 0.050834137946367264, + "learning_rate": 1.643322252909622e-05, + "loss": 0.0011, + "step": 3498 + }, + { + "epoch": 0.35, + "grad_norm": 0.08706483989953995, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.0688, + "step": 3500 + }, + { + "epoch": 0.3502, + "grad_norm": 0.04700781777501106, + "learning_rate": 1.6422526531765846e-05, + "loss": 0.0015, + "step": 3502 + }, + { + "epoch": 0.3504, + "grad_norm": 0.5916557908058167, + "learning_rate": 1.6417173836404888e-05, + "loss": 0.0113, + "step": 3504 + }, + { + "epoch": 0.3506, + "grad_norm": 0.007529082708060741, + "learning_rate": 1.6411818013391357e-05, + "loss": 0.02, + "step": 3506 + }, + { + "epoch": 0.3508, + "grad_norm": 1.190001368522644, + "learning_rate": 1.6406459065335616e-05, + "loss": 0.0134, + "step": 3508 + }, + { + "epoch": 0.351, + "grad_norm": 0.7624766826629639, + "learning_rate": 1.6401096994849558e-05, + "loss": 0.0118, + "step": 3510 + }, + { + "epoch": 0.3512, + "grad_norm": 0.09007766842842102, + "learning_rate": 1.6395731804546582e-05, + "loss": 0.0016, + "step": 3512 + }, + { + "epoch": 0.3514, + "grad_norm": 1.4800273180007935, + "learning_rate": 1.639036349704162e-05, + "loss": 0.0126, + "step": 3514 + }, + { + "epoch": 0.3516, + "grad_norm": 0.007910960353910923, + "learning_rate": 1.6384992074951124e-05, + "loss": 0.0003, + "step": 3516 + }, + { + "epoch": 0.3518, + "grad_norm": 0.343647837638855, + "learning_rate": 1.6379617540893056e-05, + "loss": 0.0067, + "step": 3518 + }, + { + "epoch": 0.352, + "grad_norm": 0.0020036513451486826, + "learning_rate": 1.63742398974869e-05, + "loss": 0.0002, + "step": 3520 + }, + { + "epoch": 0.3522, + "grad_norm": 0.10268548876047134, + "learning_rate": 1.636885914735365e-05, + "loss": 0.0012, + "step": 3522 + }, + { + "epoch": 0.3524, + "grad_norm": 0.11562567204236984, + "learning_rate": 1.6363475293115824e-05, + "loss": 0.0259, + "step": 3524 + }, + { + "epoch": 0.3526, + "grad_norm": 0.001223837141878903, + "learning_rate": 1.6358088337397444e-05, + "loss": 0.0036, + "step": 3526 + }, + { + "epoch": 0.3528, + "grad_norm": 0.873450517654419, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.0083, + "step": 3528 + }, + { + "epoch": 0.353, + "grad_norm": 0.005781423766165972, + "learning_rate": 1.6347305132022677e-05, + "loss": 0.0002, + "step": 3530 + }, + { + "epoch": 0.3532, + "grad_norm": 0.02506238967180252, + "learning_rate": 1.6341908887621894e-05, + "loss": 0.102, + "step": 3532 + }, + { + "epoch": 0.3534, + "grad_norm": 0.026499155908823013, + "learning_rate": 1.6336509552251766e-05, + "loss": 0.0008, + "step": 3534 + }, + { + "epoch": 0.3536, + "grad_norm": 0.0028556303586810827, + "learning_rate": 1.6331107128543856e-05, + "loss": 0.0036, + "step": 3536 + }, + { + "epoch": 0.3538, + "grad_norm": 0.0036969115026295185, + "learning_rate": 1.6325701619131246e-05, + "loss": 0.0008, + "step": 3538 + }, + { + "epoch": 0.354, + "grad_norm": 0.09436988830566406, + "learning_rate": 1.632029302664851e-05, + "loss": 0.0081, + "step": 3540 + }, + { + "epoch": 0.3542, + "grad_norm": 0.006512045860290527, + "learning_rate": 1.6314881353731733e-05, + "loss": 0.0003, + "step": 3542 + }, + { + "epoch": 0.3544, + "grad_norm": 0.030937625095248222, + "learning_rate": 1.6309466603018497e-05, + "loss": 0.003, + "step": 3544 + }, + { + "epoch": 0.3546, + "grad_norm": 0.009215878322720528, + "learning_rate": 1.630404877714789e-05, + "loss": 0.0003, + "step": 3546 + }, + { + "epoch": 0.3548, + "grad_norm": 14.354619026184082, + "learning_rate": 1.6298627878760488e-05, + "loss": 0.2644, + "step": 3548 + }, + { + "epoch": 0.355, + "grad_norm": 0.018204383552074432, + "learning_rate": 1.6293203910498375e-05, + "loss": 0.0003, + "step": 3550 + }, + { + "epoch": 0.3552, + "grad_norm": 0.7680230736732483, + "learning_rate": 1.628777687500513e-05, + "loss": 0.0059, + "step": 3552 + }, + { + "epoch": 0.3554, + "grad_norm": 5.155229091644287, + "learning_rate": 1.6282346774925816e-05, + "loss": 0.0332, + "step": 3554 + }, + { + "epoch": 0.3556, + "grad_norm": 0.026679372414946556, + "learning_rate": 1.6276913612907005e-05, + "loss": 0.1053, + "step": 3556 + }, + { + "epoch": 0.3558, + "grad_norm": 0.027998652309179306, + "learning_rate": 1.6271477391596754e-05, + "loss": 0.002, + "step": 3558 + }, + { + "epoch": 0.356, + "grad_norm": 0.017679724842309952, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.0012, + "step": 3560 + }, + { + "epoch": 0.3562, + "grad_norm": 0.0199788436293602, + "learning_rate": 1.6260595781701605e-05, + "loss": 0.0006, + "step": 3562 + }, + { + "epoch": 0.3564, + "grad_norm": 0.020151536911725998, + "learning_rate": 1.6255150398420273e-05, + "loss": 0.0005, + "step": 3564 + }, + { + "epoch": 0.3566, + "grad_norm": 0.7634250521659851, + "learning_rate": 1.6249701966454626e-05, + "loss": 0.0183, + "step": 3566 + }, + { + "epoch": 0.3568, + "grad_norm": 2.214284896850586, + "learning_rate": 1.624425048846016e-05, + "loss": 0.3322, + "step": 3568 + }, + { + "epoch": 0.357, + "grad_norm": 0.09718557447195053, + "learning_rate": 1.6238795967093865e-05, + "loss": 0.0034, + "step": 3570 + }, + { + "epoch": 0.3572, + "grad_norm": 0.0061694420874118805, + "learning_rate": 1.6233338405014204e-05, + "loss": 0.0353, + "step": 3572 + }, + { + "epoch": 0.3574, + "grad_norm": 0.09387592226266861, + "learning_rate": 1.6227877804881126e-05, + "loss": 0.0026, + "step": 3574 + }, + { + "epoch": 0.3576, + "grad_norm": 0.0679655373096466, + "learning_rate": 1.6222414169356066e-05, + "loss": 0.0033, + "step": 3576 + }, + { + "epoch": 0.3578, + "grad_norm": 2.3160040378570557, + "learning_rate": 1.621694750110193e-05, + "loss": 0.0268, + "step": 3578 + }, + { + "epoch": 0.358, + "grad_norm": 0.02434500865638256, + "learning_rate": 1.6211477802783105e-05, + "loss": 0.0007, + "step": 3580 + }, + { + "epoch": 0.3582, + "grad_norm": 1.0480221509933472, + "learning_rate": 1.6206005077065457e-05, + "loss": 0.1203, + "step": 3582 + }, + { + "epoch": 0.3584, + "grad_norm": 0.016781963407993317, + "learning_rate": 1.620052932661633e-05, + "loss": 0.0014, + "step": 3584 + }, + { + "epoch": 0.3586, + "grad_norm": 0.25913119316101074, + "learning_rate": 1.619505055410453e-05, + "loss": 0.0089, + "step": 3586 + }, + { + "epoch": 0.3588, + "grad_norm": 0.4124950170516968, + "learning_rate": 1.618956876220035e-05, + "loss": 0.0057, + "step": 3588 + }, + { + "epoch": 0.359, + "grad_norm": 0.0552266389131546, + "learning_rate": 1.6184083953575543e-05, + "loss": 0.0014, + "step": 3590 + }, + { + "epoch": 0.3592, + "grad_norm": 0.0033946477342396975, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.0016, + "step": 3592 + }, + { + "epoch": 0.3594, + "grad_norm": 0.6171644926071167, + "learning_rate": 1.617310529685845e-05, + "loss": 0.0054, + "step": 3594 + }, + { + "epoch": 0.3596, + "grad_norm": 0.06245751306414604, + "learning_rate": 1.6167611454117027e-05, + "loss": 0.0014, + "step": 3596 + }, + { + "epoch": 0.3598, + "grad_norm": 0.3096908628940582, + "learning_rate": 1.6162114605356704e-05, + "loss": 0.0129, + "step": 3598 + }, + { + "epoch": 0.36, + "grad_norm": 0.1537632942199707, + "learning_rate": 1.6156614753256583e-05, + "loss": 0.0167, + "step": 3600 + }, + { + "epoch": 0.3602, + "grad_norm": 0.6485740542411804, + "learning_rate": 1.6151111900497225e-05, + "loss": 0.0086, + "step": 3602 + }, + { + "epoch": 0.3604, + "grad_norm": 0.10599882900714874, + "learning_rate": 1.6145606049760644e-05, + "loss": 0.005, + "step": 3604 + }, + { + "epoch": 0.3606, + "grad_norm": 0.12418285012245178, + "learning_rate": 1.614009720373034e-05, + "loss": 0.003, + "step": 3606 + }, + { + "epoch": 0.3608, + "grad_norm": 0.01650208979845047, + "learning_rate": 1.6134585365091243e-05, + "loss": 0.0021, + "step": 3608 + }, + { + "epoch": 0.361, + "grad_norm": 0.05889071896672249, + "learning_rate": 1.6129070536529767e-05, + "loss": 0.0022, + "step": 3610 + }, + { + "epoch": 0.3612, + "grad_norm": 0.05440518260002136, + "learning_rate": 1.6123552720733767e-05, + "loss": 0.0031, + "step": 3612 + }, + { + "epoch": 0.3614, + "grad_norm": 0.033205002546310425, + "learning_rate": 1.611803192039256e-05, + "loss": 0.0008, + "step": 3614 + }, + { + "epoch": 0.3616, + "grad_norm": 0.0461873896420002, + "learning_rate": 1.611250813819692e-05, + "loss": 0.0007, + "step": 3616 + }, + { + "epoch": 0.3618, + "grad_norm": 0.06850118935108185, + "learning_rate": 1.6106981376839064e-05, + "loss": 0.0012, + "step": 3618 + }, + { + "epoch": 0.362, + "grad_norm": 0.08129389584064484, + "learning_rate": 1.610145163901268e-05, + "loss": 0.0062, + "step": 3620 + }, + { + "epoch": 0.3622, + "grad_norm": 0.014567009173333645, + "learning_rate": 1.6095918927412883e-05, + "loss": 0.0004, + "step": 3622 + }, + { + "epoch": 0.3624, + "grad_norm": 0.4603691101074219, + "learning_rate": 1.6090383244736256e-05, + "loss": 0.0066, + "step": 3624 + }, + { + "epoch": 0.3626, + "grad_norm": 0.06390134245157242, + "learning_rate": 1.608484459368082e-05, + "loss": 0.0012, + "step": 3626 + }, + { + "epoch": 0.3628, + "grad_norm": 0.014701683074235916, + "learning_rate": 1.6079302976946055e-05, + "loss": 0.0005, + "step": 3628 + }, + { + "epoch": 0.363, + "grad_norm": 0.25187280774116516, + "learning_rate": 1.607375839723287e-05, + "loss": 0.0033, + "step": 3630 + }, + { + "epoch": 0.3632, + "grad_norm": 0.011308551765978336, + "learning_rate": 1.6068210857243625e-05, + "loss": 0.0014, + "step": 3632 + }, + { + "epoch": 0.3634, + "grad_norm": 0.029800329357385635, + "learning_rate": 1.6062660359682124e-05, + "loss": 0.0013, + "step": 3634 + }, + { + "epoch": 0.3636, + "grad_norm": 0.02312925085425377, + "learning_rate": 1.6057106907253617e-05, + "loss": 0.0029, + "step": 3636 + }, + { + "epoch": 0.3638, + "grad_norm": 7.662700653076172, + "learning_rate": 1.605155050266478e-05, + "loss": 0.0756, + "step": 3638 + }, + { + "epoch": 0.364, + "grad_norm": 0.07050482928752899, + "learning_rate": 1.6045991148623752e-05, + "loss": 0.0752, + "step": 3640 + }, + { + "epoch": 0.3642, + "grad_norm": 0.05736454203724861, + "learning_rate": 1.6040428847840078e-05, + "loss": 0.0008, + "step": 3642 + }, + { + "epoch": 0.3644, + "grad_norm": 0.005929313134402037, + "learning_rate": 1.6034863603024768e-05, + "loss": 0.0006, + "step": 3644 + }, + { + "epoch": 0.3646, + "grad_norm": 0.011041248217225075, + "learning_rate": 1.602929541689025e-05, + "loss": 0.0004, + "step": 3646 + }, + { + "epoch": 0.3648, + "grad_norm": 1.299591064453125, + "learning_rate": 1.6023724292150387e-05, + "loss": 0.2381, + "step": 3648 + }, + { + "epoch": 0.365, + "grad_norm": 0.37637174129486084, + "learning_rate": 1.6018150231520486e-05, + "loss": 0.0748, + "step": 3650 + }, + { + "epoch": 0.3652, + "grad_norm": 0.013156242668628693, + "learning_rate": 1.601257323771727e-05, + "loss": 0.0031, + "step": 3652 + }, + { + "epoch": 0.3654, + "grad_norm": 0.2946488857269287, + "learning_rate": 1.6006993313458896e-05, + "loss": 0.0083, + "step": 3654 + }, + { + "epoch": 0.3656, + "grad_norm": 0.04146170988678932, + "learning_rate": 1.6001410461464955e-05, + "loss": 0.0017, + "step": 3656 + }, + { + "epoch": 0.3658, + "grad_norm": 0.003168781055137515, + "learning_rate": 1.5995824684456465e-05, + "loss": 0.0012, + "step": 3658 + }, + { + "epoch": 0.366, + "grad_norm": 0.8301743865013123, + "learning_rate": 1.599023598515586e-05, + "loss": 0.0144, + "step": 3660 + }, + { + "epoch": 0.3662, + "grad_norm": 1.6224066019058228, + "learning_rate": 1.5984644366287007e-05, + "loss": 0.0554, + "step": 3662 + }, + { + "epoch": 0.3664, + "grad_norm": 0.5822380185127258, + "learning_rate": 1.597904983057519e-05, + "loss": 0.0685, + "step": 3664 + }, + { + "epoch": 0.3666, + "grad_norm": 0.019077297300100327, + "learning_rate": 1.5973452380747125e-05, + "loss": 0.0011, + "step": 3666 + }, + { + "epoch": 0.3668, + "grad_norm": 0.01684684306383133, + "learning_rate": 1.596785201953093e-05, + "loss": 0.0003, + "step": 3668 + }, + { + "epoch": 0.367, + "grad_norm": 0.1353202909231186, + "learning_rate": 1.5962248749656158e-05, + "loss": 0.0034, + "step": 3670 + }, + { + "epoch": 0.3672, + "grad_norm": 0.10263316333293915, + "learning_rate": 1.5956642573853784e-05, + "loss": 0.0018, + "step": 3672 + }, + { + "epoch": 0.3674, + "grad_norm": 0.2141703963279724, + "learning_rate": 1.5951033494856174e-05, + "loss": 0.0022, + "step": 3674 + }, + { + "epoch": 0.3676, + "grad_norm": 0.015855850651860237, + "learning_rate": 1.5945421515397135e-05, + "loss": 0.0626, + "step": 3676 + }, + { + "epoch": 0.3678, + "grad_norm": 0.12415570020675659, + "learning_rate": 1.593980663821187e-05, + "loss": 0.0026, + "step": 3678 + }, + { + "epoch": 0.368, + "grad_norm": 4.565486431121826, + "learning_rate": 1.5934188866037017e-05, + "loss": 0.1032, + "step": 3680 + }, + { + "epoch": 0.3682, + "grad_norm": 0.7961174845695496, + "learning_rate": 1.5928568201610593e-05, + "loss": 0.0104, + "step": 3682 + }, + { + "epoch": 0.3684, + "grad_norm": 0.282111257314682, + "learning_rate": 1.592294464767205e-05, + "loss": 0.005, + "step": 3684 + }, + { + "epoch": 0.3686, + "grad_norm": 0.07937809824943542, + "learning_rate": 1.591731820696224e-05, + "loss": 0.0044, + "step": 3686 + }, + { + "epoch": 0.3688, + "grad_norm": 0.007726403418928385, + "learning_rate": 1.591168888222342e-05, + "loss": 0.0013, + "step": 3688 + }, + { + "epoch": 0.369, + "grad_norm": 0.7588459849357605, + "learning_rate": 1.5906056676199256e-05, + "loss": 0.0221, + "step": 3690 + }, + { + "epoch": 0.3692, + "grad_norm": 0.4603768289089203, + "learning_rate": 1.5900421591634813e-05, + "loss": 0.0091, + "step": 3692 + }, + { + "epoch": 0.3694, + "grad_norm": 0.05512309446930885, + "learning_rate": 1.589478363127657e-05, + "loss": 0.0021, + "step": 3694 + }, + { + "epoch": 0.3696, + "grad_norm": 0.18189407885074615, + "learning_rate": 1.5889142797872387e-05, + "loss": 0.0021, + "step": 3696 + }, + { + "epoch": 0.3698, + "grad_norm": 5.235508441925049, + "learning_rate": 1.5883499094171556e-05, + "loss": 0.0846, + "step": 3698 + }, + { + "epoch": 0.37, + "grad_norm": 0.03465460613369942, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.1435, + "step": 3700 + }, + { + "epoch": 0.3702, + "grad_norm": 0.04051093012094498, + "learning_rate": 1.5872203086883996e-05, + "loss": 0.0009, + "step": 3702 + }, + { + "epoch": 0.3704, + "grad_norm": 4.172172546386719, + "learning_rate": 1.5866550788802815e-05, + "loss": 0.1664, + "step": 3704 + }, + { + "epoch": 0.3706, + "grad_norm": 1.1669952869415283, + "learning_rate": 1.5860895631436044e-05, + "loss": 0.0137, + "step": 3706 + }, + { + "epoch": 0.3708, + "grad_norm": 0.02322334423661232, + "learning_rate": 1.5855237617539943e-05, + "loss": 0.0009, + "step": 3708 + }, + { + "epoch": 0.371, + "grad_norm": 3.7106451988220215, + "learning_rate": 1.584957674987216e-05, + "loss": 0.0362, + "step": 3710 + }, + { + "epoch": 0.3712, + "grad_norm": 0.0933113545179367, + "learning_rate": 1.5843913031191722e-05, + "loss": 0.0009, + "step": 3712 + }, + { + "epoch": 0.3714, + "grad_norm": 0.09366972744464874, + "learning_rate": 1.583824646425907e-05, + "loss": 0.0024, + "step": 3714 + }, + { + "epoch": 0.3716, + "grad_norm": 0.11350619792938232, + "learning_rate": 1.5832577051836016e-05, + "loss": 0.0083, + "step": 3716 + }, + { + "epoch": 0.3718, + "grad_norm": 4.781865119934082, + "learning_rate": 1.5826904796685763e-05, + "loss": 0.0711, + "step": 3718 + }, + { + "epoch": 0.372, + "grad_norm": 0.013110661879181862, + "learning_rate": 1.5821229701572897e-05, + "loss": 0.0058, + "step": 3720 + }, + { + "epoch": 0.3722, + "grad_norm": 4.5792622566223145, + "learning_rate": 1.5815551769263387e-05, + "loss": 0.0575, + "step": 3722 + }, + { + "epoch": 0.3724, + "grad_norm": 0.0034454537089914083, + "learning_rate": 1.5809871002524602e-05, + "loss": 0.0164, + "step": 3724 + }, + { + "epoch": 0.3726, + "grad_norm": 0.131917804479599, + "learning_rate": 1.580418740412526e-05, + "loss": 0.0028, + "step": 3726 + }, + { + "epoch": 0.3728, + "grad_norm": 0.047324951738119125, + "learning_rate": 1.5798500976835493e-05, + "loss": 0.0013, + "step": 3728 + }, + { + "epoch": 0.373, + "grad_norm": 1.9808778762817383, + "learning_rate": 1.5792811723426787e-05, + "loss": 0.0421, + "step": 3730 + }, + { + "epoch": 0.3732, + "grad_norm": 1.3721117973327637, + "learning_rate": 1.5787119646672025e-05, + "loss": 0.0298, + "step": 3732 + }, + { + "epoch": 0.3734, + "grad_norm": 1.4042298793792725, + "learning_rate": 1.5781424749345447e-05, + "loss": 0.0568, + "step": 3734 + }, + { + "epoch": 0.3736, + "grad_norm": 0.6713020205497742, + "learning_rate": 1.5775727034222675e-05, + "loss": 0.0133, + "step": 3736 + }, + { + "epoch": 0.3738, + "grad_norm": 0.04910013824701309, + "learning_rate": 1.577002650408072e-05, + "loss": 0.0017, + "step": 3738 + }, + { + "epoch": 0.374, + "grad_norm": 0.13005828857421875, + "learning_rate": 1.5764323161697933e-05, + "loss": 0.0049, + "step": 3740 + }, + { + "epoch": 0.3742, + "grad_norm": 0.034822914749383926, + "learning_rate": 1.5758617009854068e-05, + "loss": 0.0023, + "step": 3742 + }, + { + "epoch": 0.3744, + "grad_norm": 4.641127586364746, + "learning_rate": 1.575290805133023e-05, + "loss": 0.106, + "step": 3744 + }, + { + "epoch": 0.3746, + "grad_norm": 0.030292700976133347, + "learning_rate": 1.5747196288908887e-05, + "loss": 0.1605, + "step": 3746 + }, + { + "epoch": 0.3748, + "grad_norm": 0.032084908336400986, + "learning_rate": 1.57414817253739e-05, + "loss": 0.0184, + "step": 3748 + }, + { + "epoch": 0.375, + "grad_norm": 0.03296246379613876, + "learning_rate": 1.573576436351046e-05, + "loss": 0.0022, + "step": 3750 + }, + { + "epoch": 0.3752, + "grad_norm": 3.1083357334136963, + "learning_rate": 1.5730044206105156e-05, + "loss": 0.0553, + "step": 3752 + }, + { + "epoch": 0.3754, + "grad_norm": 1.0394147634506226, + "learning_rate": 1.572432125594591e-05, + "loss": 0.0153, + "step": 3754 + }, + { + "epoch": 0.3756, + "grad_norm": 0.010168555192649364, + "learning_rate": 1.5718595515822027e-05, + "loss": 0.3446, + "step": 3756 + }, + { + "epoch": 0.3758, + "grad_norm": 0.013337628915905952, + "learning_rate": 1.5712866988524157e-05, + "loss": 0.0008, + "step": 3758 + }, + { + "epoch": 0.376, + "grad_norm": 0.027712909504771233, + "learning_rate": 1.570713567684432e-05, + "loss": 0.0061, + "step": 3760 + }, + { + "epoch": 0.3762, + "grad_norm": 2.446803092956543, + "learning_rate": 1.5701401583575883e-05, + "loss": 0.0365, + "step": 3762 + }, + { + "epoch": 0.3764, + "grad_norm": 0.5644667744636536, + "learning_rate": 1.5695664711513575e-05, + "loss": 0.0158, + "step": 3764 + }, + { + "epoch": 0.3766, + "grad_norm": 0.26387450098991394, + "learning_rate": 1.5689925063453483e-05, + "loss": 0.0334, + "step": 3766 + }, + { + "epoch": 0.3768, + "grad_norm": 2.9615821838378906, + "learning_rate": 1.568418264219303e-05, + "loss": 0.0552, + "step": 3768 + }, + { + "epoch": 0.377, + "grad_norm": 0.6957718729972839, + "learning_rate": 1.5678437450531014e-05, + "loss": 0.008, + "step": 3770 + }, + { + "epoch": 0.3772, + "grad_norm": 0.17936263978481293, + "learning_rate": 1.567268949126757e-05, + "loss": 0.0053, + "step": 3772 + }, + { + "epoch": 0.3774, + "grad_norm": 0.04516501724720001, + "learning_rate": 1.5666938767204173e-05, + "loss": 0.002, + "step": 3774 + }, + { + "epoch": 0.3776, + "grad_norm": 0.465433806180954, + "learning_rate": 1.5661185281143666e-05, + "loss": 0.0063, + "step": 3776 + }, + { + "epoch": 0.3778, + "grad_norm": 0.3608492314815521, + "learning_rate": 1.565542903589023e-05, + "loss": 0.0045, + "step": 3778 + }, + { + "epoch": 0.378, + "grad_norm": 9.13187026977539, + "learning_rate": 1.564967003424938e-05, + "loss": 0.143, + "step": 3780 + }, + { + "epoch": 0.3782, + "grad_norm": 2.5370075702667236, + "learning_rate": 1.5643908279027994e-05, + "loss": 0.0621, + "step": 3782 + }, + { + "epoch": 0.3784, + "grad_norm": 0.007117291446775198, + "learning_rate": 1.5638143773034268e-05, + "loss": 0.0974, + "step": 3784 + }, + { + "epoch": 0.3786, + "grad_norm": 1.766635775566101, + "learning_rate": 1.563237651907777e-05, + "loss": 0.0332, + "step": 3786 + }, + { + "epoch": 0.3788, + "grad_norm": 0.11343780159950256, + "learning_rate": 1.562660651996937e-05, + "loss": 0.0051, + "step": 3788 + }, + { + "epoch": 0.379, + "grad_norm": 4.259795665740967, + "learning_rate": 1.5620833778521306e-05, + "loss": 0.105, + "step": 3790 + }, + { + "epoch": 0.3792, + "grad_norm": 0.14029952883720398, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.0041, + "step": 3792 + }, + { + "epoch": 0.3794, + "grad_norm": 0.404513418674469, + "learning_rate": 1.560928007986178e-05, + "loss": 0.0135, + "step": 3794 + }, + { + "epoch": 0.3796, + "grad_norm": 0.10605732351541519, + "learning_rate": 1.5603499128281447e-05, + "loss": 0.002, + "step": 3796 + }, + { + "epoch": 0.3798, + "grad_norm": 0.19381476938724518, + "learning_rate": 1.5597715445623714e-05, + "loss": 0.0034, + "step": 3798 + }, + { + "epoch": 0.38, + "grad_norm": 0.012709299102425575, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.0015, + "step": 3800 + }, + { + "epoch": 0.3802, + "grad_norm": 0.04493028670549393, + "learning_rate": 1.558613989835295e-05, + "loss": 0.0011, + "step": 3802 + }, + { + "epoch": 0.3804, + "grad_norm": 0.16351035237312317, + "learning_rate": 1.55803480393817e-05, + "loss": 0.0029, + "step": 3804 + }, + { + "epoch": 0.3806, + "grad_norm": 0.013622669503092766, + "learning_rate": 1.5574553460616608e-05, + "loss": 0.0534, + "step": 3806 + }, + { + "epoch": 0.3808, + "grad_norm": 2.428973913192749, + "learning_rate": 1.556875616488188e-05, + "loss": 0.2864, + "step": 3808 + }, + { + "epoch": 0.381, + "grad_norm": 0.3380669355392456, + "learning_rate": 1.556295615500305e-05, + "loss": 0.0101, + "step": 3810 + }, + { + "epoch": 0.3812, + "grad_norm": 0.01798013225197792, + "learning_rate": 1.5557153433806967e-05, + "loss": 0.0009, + "step": 3812 + }, + { + "epoch": 0.3814, + "grad_norm": 0.07259425520896912, + "learning_rate": 1.555134800412181e-05, + "loss": 0.0513, + "step": 3814 + }, + { + "epoch": 0.3816, + "grad_norm": 0.022921714931726456, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.0142, + "step": 3816 + }, + { + "epoch": 0.3818, + "grad_norm": 0.140417218208313, + "learning_rate": 1.5539729030603574e-05, + "loss": 0.0045, + "step": 3818 + }, + { + "epoch": 0.382, + "grad_norm": 5.2961745262146, + "learning_rate": 1.553391549243344e-05, + "loss": 0.0887, + "step": 3820 + }, + { + "epoch": 0.3822, + "grad_norm": 0.9727416038513184, + "learning_rate": 1.5528099257100126e-05, + "loss": 0.0134, + "step": 3822 + }, + { + "epoch": 0.3824, + "grad_norm": 0.018895955756306648, + "learning_rate": 1.5522280327438388e-05, + "loss": 0.5087, + "step": 3824 + }, + { + "epoch": 0.3826, + "grad_norm": 0.008767371065914631, + "learning_rate": 1.5516458706284306e-05, + "loss": 0.0118, + "step": 3826 + }, + { + "epoch": 0.3828, + "grad_norm": 0.009288896806538105, + "learning_rate": 1.5510634396475262e-05, + "loss": 0.007, + "step": 3828 + }, + { + "epoch": 0.383, + "grad_norm": 1.3818188905715942, + "learning_rate": 1.5504807400849957e-05, + "loss": 0.0221, + "step": 3830 + }, + { + "epoch": 0.3832, + "grad_norm": 0.1867048442363739, + "learning_rate": 1.54989777222484e-05, + "loss": 0.1099, + "step": 3832 + }, + { + "epoch": 0.3834, + "grad_norm": 6.3578290939331055, + "learning_rate": 1.54931453635119e-05, + "loss": 0.1395, + "step": 3834 + }, + { + "epoch": 0.3836, + "grad_norm": 0.07468026131391525, + "learning_rate": 1.5487310327483087e-05, + "loss": 0.0013, + "step": 3836 + }, + { + "epoch": 0.3838, + "grad_norm": 1.3556787967681885, + "learning_rate": 1.5481472617005878e-05, + "loss": 0.0255, + "step": 3838 + }, + { + "epoch": 0.384, + "grad_norm": 0.05185524746775627, + "learning_rate": 1.5475632234925505e-05, + "loss": 0.0017, + "step": 3840 + }, + { + "epoch": 0.3842, + "grad_norm": 0.04230990633368492, + "learning_rate": 1.5469789184088498e-05, + "loss": 0.0017, + "step": 3842 + }, + { + "epoch": 0.3844, + "grad_norm": 0.047545116394758224, + "learning_rate": 1.5463943467342694e-05, + "loss": 0.0025, + "step": 3844 + }, + { + "epoch": 0.3846, + "grad_norm": 0.11296418309211731, + "learning_rate": 1.5458095087537216e-05, + "loss": 0.0028, + "step": 3846 + }, + { + "epoch": 0.3848, + "grad_norm": 0.7581992745399475, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.0057, + "step": 3848 + }, + { + "epoch": 0.385, + "grad_norm": 2.148582935333252, + "learning_rate": 1.5446390350150272e-05, + "loss": 0.1476, + "step": 3850 + }, + { + "epoch": 0.3852, + "grad_norm": 0.31833577156066895, + "learning_rate": 1.544053399827355e-05, + "loss": 0.0106, + "step": 3852 + }, + { + "epoch": 0.3854, + "grad_norm": 0.007811460178345442, + "learning_rate": 1.543467499474665e-05, + "loss": 0.0824, + "step": 3854 + }, + { + "epoch": 0.3856, + "grad_norm": 0.37094277143478394, + "learning_rate": 1.5428813342425177e-05, + "loss": 0.0037, + "step": 3856 + }, + { + "epoch": 0.3858, + "grad_norm": 2.1373021602630615, + "learning_rate": 1.542294904416603e-05, + "loss": 0.0457, + "step": 3858 + }, + { + "epoch": 0.386, + "grad_norm": 0.10086699575185776, + "learning_rate": 1.54170821028274e-05, + "loss": 0.0026, + "step": 3860 + }, + { + "epoch": 0.3862, + "grad_norm": 0.16612508893013, + "learning_rate": 1.541121252126876e-05, + "loss": 0.0659, + "step": 3862 + }, + { + "epoch": 0.3864, + "grad_norm": 4.4267706871032715, + "learning_rate": 1.540534030235087e-05, + "loss": 0.1182, + "step": 3864 + }, + { + "epoch": 0.3866, + "grad_norm": 0.07160582393407822, + "learning_rate": 1.5399465448935788e-05, + "loss": 0.0224, + "step": 3866 + }, + { + "epoch": 0.3868, + "grad_norm": 0.21964769065380096, + "learning_rate": 1.5393587963886837e-05, + "loss": 0.0035, + "step": 3868 + }, + { + "epoch": 0.387, + "grad_norm": 0.19739437103271484, + "learning_rate": 1.5387707850068633e-05, + "loss": 0.0491, + "step": 3870 + }, + { + "epoch": 0.3872, + "grad_norm": 0.05676277354359627, + "learning_rate": 1.5381825110347072e-05, + "loss": 0.0177, + "step": 3872 + }, + { + "epoch": 0.3874, + "grad_norm": 0.14836107194423676, + "learning_rate": 1.5375939747589334e-05, + "loss": 0.0089, + "step": 3874 + }, + { + "epoch": 0.3876, + "grad_norm": 0.0585356168448925, + "learning_rate": 1.5370051764663872e-05, + "loss": 0.0013, + "step": 3876 + }, + { + "epoch": 0.3878, + "grad_norm": 0.09393389523029327, + "learning_rate": 1.5364161164440413e-05, + "loss": 0.0161, + "step": 3878 + }, + { + "epoch": 0.388, + "grad_norm": 0.1522544026374817, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.0035, + "step": 3880 + }, + { + "epoch": 0.3882, + "grad_norm": 0.5751826763153076, + "learning_rate": 1.5352372123584816e-05, + "loss": 0.015, + "step": 3882 + }, + { + "epoch": 0.3884, + "grad_norm": 0.7491849064826965, + "learning_rate": 1.5346473688698514e-05, + "loss": 0.0473, + "step": 3884 + }, + { + "epoch": 0.3886, + "grad_norm": 0.008021576330065727, + "learning_rate": 1.5340572648005887e-05, + "loss": 0.0822, + "step": 3886 + }, + { + "epoch": 0.3888, + "grad_norm": 8.71572494506836, + "learning_rate": 1.533466900438303e-05, + "loss": 0.076, + "step": 3888 + }, + { + "epoch": 0.389, + "grad_norm": 0.033293794840574265, + "learning_rate": 1.53287627607073e-05, + "loss": 0.0025, + "step": 3890 + }, + { + "epoch": 0.3892, + "grad_norm": 0.15652146935462952, + "learning_rate": 1.532285391985734e-05, + "loss": 0.0031, + "step": 3892 + }, + { + "epoch": 0.3894, + "grad_norm": 0.31066206097602844, + "learning_rate": 1.5316942484713043e-05, + "loss": 0.006, + "step": 3894 + }, + { + "epoch": 0.3896, + "grad_norm": 0.016399305313825607, + "learning_rate": 1.5311028458155567e-05, + "loss": 0.0011, + "step": 3896 + }, + { + "epoch": 0.3898, + "grad_norm": 0.4905998706817627, + "learning_rate": 1.5305111843067343e-05, + "loss": 0.0133, + "step": 3898 + }, + { + "epoch": 0.39, + "grad_norm": 1.0526819229125977, + "learning_rate": 1.529919264233205e-05, + "loss": 0.0497, + "step": 3900 + }, + { + "epoch": 0.3902, + "grad_norm": 0.743313193321228, + "learning_rate": 1.5293270858834643e-05, + "loss": 0.0056, + "step": 3902 + }, + { + "epoch": 0.3904, + "grad_norm": 0.047929517924785614, + "learning_rate": 1.528734649546132e-05, + "loss": 0.0075, + "step": 3904 + }, + { + "epoch": 0.3906, + "grad_norm": 0.07828183472156525, + "learning_rate": 1.5281419555099547e-05, + "loss": 0.0069, + "step": 3906 + }, + { + "epoch": 0.3908, + "grad_norm": 0.034840699285268784, + "learning_rate": 1.5275490040638038e-05, + "loss": 0.0674, + "step": 3908 + }, + { + "epoch": 0.391, + "grad_norm": 0.4320925176143646, + "learning_rate": 1.5269557954966777e-05, + "loss": 0.0485, + "step": 3910 + }, + { + "epoch": 0.3912, + "grad_norm": 0.08421758562326431, + "learning_rate": 1.526362330097698e-05, + "loss": 0.0017, + "step": 3912 + }, + { + "epoch": 0.3914, + "grad_norm": 0.007263932842761278, + "learning_rate": 1.5257686081561134e-05, + "loss": 0.0012, + "step": 3914 + }, + { + "epoch": 0.3916, + "grad_norm": 0.03403512015938759, + "learning_rate": 1.5251746299612959e-05, + "loss": 0.0027, + "step": 3916 + }, + { + "epoch": 0.3918, + "grad_norm": 0.019348274916410446, + "learning_rate": 1.5245803958027434e-05, + "loss": 0.0005, + "step": 3918 + }, + { + "epoch": 0.392, + "grad_norm": 1.0262651443481445, + "learning_rate": 1.5239859059700794e-05, + "loss": 0.0113, + "step": 3920 + }, + { + "epoch": 0.3922, + "grad_norm": 5.849361419677734, + "learning_rate": 1.5233911607530499e-05, + "loss": 0.2634, + "step": 3922 + }, + { + "epoch": 0.3924, + "grad_norm": 0.10183145850896835, + "learning_rate": 1.5227961604415266e-05, + "loss": 0.1206, + "step": 3924 + }, + { + "epoch": 0.3926, + "grad_norm": 0.009683744981884956, + "learning_rate": 1.5222009053255061e-05, + "loss": 0.0108, + "step": 3926 + }, + { + "epoch": 0.3928, + "grad_norm": 0.020247915759682655, + "learning_rate": 1.5216053956951081e-05, + "loss": 0.0003, + "step": 3928 + }, + { + "epoch": 0.393, + "grad_norm": 0.004691064823418856, + "learning_rate": 1.5210096318405768e-05, + "loss": 0.0267, + "step": 3930 + }, + { + "epoch": 0.3932, + "grad_norm": 0.006928126327693462, + "learning_rate": 1.5204136140522799e-05, + "loss": 0.0002, + "step": 3932 + }, + { + "epoch": 0.3934, + "grad_norm": 0.002671307884156704, + "learning_rate": 1.5198173426207095e-05, + "loss": 0.0001, + "step": 3934 + }, + { + "epoch": 0.3936, + "grad_norm": 0.24183911085128784, + "learning_rate": 1.5192208178364815e-05, + "loss": 0.0099, + "step": 3936 + }, + { + "epoch": 0.3938, + "grad_norm": 0.10515677183866501, + "learning_rate": 1.5186240399903343e-05, + "loss": 0.0026, + "step": 3938 + }, + { + "epoch": 0.394, + "grad_norm": 0.0037104368675500154, + "learning_rate": 1.5180270093731305e-05, + "loss": 0.0015, + "step": 3940 + }, + { + "epoch": 0.3942, + "grad_norm": 0.011255364865064621, + "learning_rate": 1.5174297262758551e-05, + "loss": 0.0004, + "step": 3942 + }, + { + "epoch": 0.3944, + "grad_norm": 0.007747524417936802, + "learning_rate": 1.5168321909896171e-05, + "loss": 0.0002, + "step": 3944 + }, + { + "epoch": 0.3946, + "grad_norm": 0.008599160239100456, + "learning_rate": 1.5162344038056476e-05, + "loss": 0.0002, + "step": 3946 + }, + { + "epoch": 0.3948, + "grad_norm": 0.0024281456135213375, + "learning_rate": 1.5156363650153012e-05, + "loss": 0.0005, + "step": 3948 + }, + { + "epoch": 0.395, + "grad_norm": 0.038005270063877106, + "learning_rate": 1.5150380749100545e-05, + "loss": 0.0097, + "step": 3950 + }, + { + "epoch": 0.3952, + "grad_norm": 0.0042138840071856976, + "learning_rate": 1.5144395337815066e-05, + "loss": 0.0069, + "step": 3952 + }, + { + "epoch": 0.3954, + "grad_norm": 0.14800424873828888, + "learning_rate": 1.5138407419213797e-05, + "loss": 0.0561, + "step": 3954 + }, + { + "epoch": 0.3956, + "grad_norm": 0.012650755234062672, + "learning_rate": 1.5132416996215171e-05, + "loss": 0.0018, + "step": 3956 + }, + { + "epoch": 0.3958, + "grad_norm": 0.09555479139089584, + "learning_rate": 1.5126424071738853e-05, + "loss": 0.0035, + "step": 3958 + }, + { + "epoch": 0.396, + "grad_norm": 0.008416387252509594, + "learning_rate": 1.5120428648705716e-05, + "loss": 0.0004, + "step": 3960 + }, + { + "epoch": 0.3962, + "grad_norm": 0.11297925561666489, + "learning_rate": 1.511443073003786e-05, + "loss": 0.0013, + "step": 3962 + }, + { + "epoch": 0.3964, + "grad_norm": 0.21167872846126556, + "learning_rate": 1.51084303186586e-05, + "loss": 0.0034, + "step": 3964 + }, + { + "epoch": 0.3966, + "grad_norm": 0.007712114602327347, + "learning_rate": 1.510242741749246e-05, + "loss": 0.0002, + "step": 3966 + }, + { + "epoch": 0.3968, + "grad_norm": 0.025546744465827942, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.0004, + "step": 3968 + }, + { + "epoch": 0.397, + "grad_norm": 0.016382060945034027, + "learning_rate": 1.5090414157503715e-05, + "loss": 0.0892, + "step": 3970 + }, + { + "epoch": 0.3972, + "grad_norm": 1.171995997428894, + "learning_rate": 1.508440380453623e-05, + "loss": 0.0271, + "step": 3972 + }, + { + "epoch": 0.3974, + "grad_norm": 0.018101204186677933, + "learning_rate": 1.5078390973492094e-05, + "loss": 0.0265, + "step": 3974 + }, + { + "epoch": 0.3976, + "grad_norm": 0.23239172995090485, + "learning_rate": 1.5072375667301893e-05, + "loss": 0.0023, + "step": 3976 + }, + { + "epoch": 0.3978, + "grad_norm": 0.6607217788696289, + "learning_rate": 1.506635788889741e-05, + "loss": 0.0044, + "step": 3978 + }, + { + "epoch": 0.398, + "grad_norm": 0.11109142005443573, + "learning_rate": 1.5060337641211637e-05, + "loss": 0.0013, + "step": 3980 + }, + { + "epoch": 0.3982, + "grad_norm": 0.06312084197998047, + "learning_rate": 1.5054314927178779e-05, + "loss": 0.0012, + "step": 3982 + }, + { + "epoch": 0.3984, + "grad_norm": 0.004263977520167828, + "learning_rate": 1.504828974973422e-05, + "loss": 0.0005, + "step": 3984 + }, + { + "epoch": 0.3986, + "grad_norm": 0.027037225663661957, + "learning_rate": 1.5042262111814566e-05, + "loss": 0.0973, + "step": 3986 + }, + { + "epoch": 0.3988, + "grad_norm": 1.0940755605697632, + "learning_rate": 1.503623201635761e-05, + "loss": 0.0124, + "step": 3988 + }, + { + "epoch": 0.399, + "grad_norm": 1.0780673027038574, + "learning_rate": 1.5030199466302354e-05, + "loss": 0.0811, + "step": 3990 + }, + { + "epoch": 0.3992, + "grad_norm": 0.03392007201910019, + "learning_rate": 1.5024164464588982e-05, + "loss": 0.0017, + "step": 3992 + }, + { + "epoch": 0.3994, + "grad_norm": 0.07791830599308014, + "learning_rate": 1.5018127014158886e-05, + "loss": 0.0713, + "step": 3994 + }, + { + "epoch": 0.3996, + "grad_norm": 0.2710253596305847, + "learning_rate": 1.5012087117954643e-05, + "loss": 0.0019, + "step": 3996 + }, + { + "epoch": 0.3998, + "grad_norm": 0.0034132369328290224, + "learning_rate": 1.5006044778920028e-05, + "loss": 0.0011, + "step": 3998 + }, + { + "epoch": 0.4, + "grad_norm": 0.0010066486429423094, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.0004, + "step": 4000 + }, + { + "epoch": 0.4002, + "grad_norm": 0.0012097051367163658, + "learning_rate": 1.4993952784140716e-05, + "loss": 0.0076, + "step": 4002 + }, + { + "epoch": 0.4004, + "grad_norm": 7.488657474517822, + "learning_rate": 1.498790313428951e-05, + "loss": 0.151, + "step": 4004 + }, + { + "epoch": 0.4006, + "grad_norm": 0.018615037202835083, + "learning_rate": 1.498185105339491e-05, + "loss": 0.0006, + "step": 4006 + }, + { + "epoch": 0.4008, + "grad_norm": 6.189553737640381, + "learning_rate": 1.4975796544406627e-05, + "loss": 0.1291, + "step": 4008 + }, + { + "epoch": 0.401, + "grad_norm": 0.003375563072040677, + "learning_rate": 1.4969739610275556e-05, + "loss": 0.0002, + "step": 4010 + }, + { + "epoch": 0.4012, + "grad_norm": 0.6401863098144531, + "learning_rate": 1.496368025395377e-05, + "loss": 0.0093, + "step": 4012 + }, + { + "epoch": 0.4014, + "grad_norm": 0.018535539507865906, + "learning_rate": 1.4957618478394529e-05, + "loss": 0.0005, + "step": 4014 + }, + { + "epoch": 0.4016, + "grad_norm": 0.3361450433731079, + "learning_rate": 1.4951554286552266e-05, + "loss": 0.0209, + "step": 4016 + }, + { + "epoch": 0.4018, + "grad_norm": 0.024857625365257263, + "learning_rate": 1.4945487681382597e-05, + "loss": 0.0009, + "step": 4018 + }, + { + "epoch": 0.402, + "grad_norm": 0.08390836417675018, + "learning_rate": 1.493941866584231e-05, + "loss": 0.0018, + "step": 4020 + }, + { + "epoch": 0.4022, + "grad_norm": 0.28712043166160583, + "learning_rate": 1.4933347242889371e-05, + "loss": 0.0061, + "step": 4022 + }, + { + "epoch": 0.4024, + "grad_norm": 0.002709442749619484, + "learning_rate": 1.4927273415482916e-05, + "loss": 0.0064, + "step": 4024 + }, + { + "epoch": 0.4026, + "grad_norm": 0.08692855387926102, + "learning_rate": 1.4921197186583256e-05, + "loss": 0.0016, + "step": 4026 + }, + { + "epoch": 0.4028, + "grad_norm": 0.055103711783885956, + "learning_rate": 1.4915118559151871e-05, + "loss": 0.0161, + "step": 4028 + }, + { + "epoch": 0.403, + "grad_norm": 0.015656063333153725, + "learning_rate": 1.490903753615141e-05, + "loss": 0.0016, + "step": 4030 + }, + { + "epoch": 0.4032, + "grad_norm": 1.0969243049621582, + "learning_rate": 1.4902954120545687e-05, + "loss": 0.0251, + "step": 4032 + }, + { + "epoch": 0.4034, + "grad_norm": 0.014136174693703651, + "learning_rate": 1.4896868315299692e-05, + "loss": 0.0004, + "step": 4034 + }, + { + "epoch": 0.4036, + "grad_norm": 0.10661386698484421, + "learning_rate": 1.4890780123379565e-05, + "loss": 0.0023, + "step": 4036 + }, + { + "epoch": 0.4038, + "grad_norm": 0.27954062819480896, + "learning_rate": 1.488468954775262e-05, + "loss": 0.0049, + "step": 4038 + }, + { + "epoch": 0.404, + "grad_norm": 0.2153235524892807, + "learning_rate": 1.4878596591387329e-05, + "loss": 0.0043, + "step": 4040 + }, + { + "epoch": 0.4042, + "grad_norm": 0.0046199169009923935, + "learning_rate": 1.4872501257253325e-05, + "loss": 0.0024, + "step": 4042 + }, + { + "epoch": 0.4044, + "grad_norm": 1.0823814868927002, + "learning_rate": 1.4866403548321402e-05, + "loss": 0.0317, + "step": 4044 + }, + { + "epoch": 0.4046, + "grad_norm": 0.02915104292333126, + "learning_rate": 1.4860303467563504e-05, + "loss": 0.001, + "step": 4046 + }, + { + "epoch": 0.4048, + "grad_norm": 0.058679159730672836, + "learning_rate": 1.485420101795274e-05, + "loss": 0.0004, + "step": 4048 + }, + { + "epoch": 0.405, + "grad_norm": 0.027454029768705368, + "learning_rate": 1.4848096202463373e-05, + "loss": 0.1633, + "step": 4050 + }, + { + "epoch": 0.4052, + "grad_norm": 0.005620952695608139, + "learning_rate": 1.4841989024070809e-05, + "loss": 0.0059, + "step": 4052 + }, + { + "epoch": 0.4054, + "grad_norm": 6.726145267486572, + "learning_rate": 1.4835879485751617e-05, + "loss": 0.077, + "step": 4054 + }, + { + "epoch": 0.4056, + "grad_norm": 0.09932540357112885, + "learning_rate": 1.4829767590483508e-05, + "loss": 0.0015, + "step": 4056 + }, + { + "epoch": 0.4058, + "grad_norm": 0.0911027118563652, + "learning_rate": 1.4823653341245353e-05, + "loss": 0.0012, + "step": 4058 + }, + { + "epoch": 0.406, + "grad_norm": 0.7479990124702454, + "learning_rate": 1.4817536741017153e-05, + "loss": 0.0108, + "step": 4060 + }, + { + "epoch": 0.4062, + "grad_norm": 0.0024848836474120617, + "learning_rate": 1.4811417792780074e-05, + "loss": 0.0001, + "step": 4062 + }, + { + "epoch": 0.4064, + "grad_norm": 0.43139439821243286, + "learning_rate": 1.4805296499516408e-05, + "loss": 0.0028, + "step": 4064 + }, + { + "epoch": 0.4066, + "grad_norm": 0.0025419145822525024, + "learning_rate": 1.4799172864209607e-05, + "loss": 0.0005, + "step": 4066 + }, + { + "epoch": 0.4068, + "grad_norm": 7.5017991065979, + "learning_rate": 1.4793046889844252e-05, + "loss": 0.078, + "step": 4068 + }, + { + "epoch": 0.407, + "grad_norm": 0.029985904693603516, + "learning_rate": 1.478691857940607e-05, + "loss": 0.0004, + "step": 4070 + }, + { + "epoch": 0.4072, + "grad_norm": 0.09868378937244415, + "learning_rate": 1.4780787935881925e-05, + "loss": 0.0013, + "step": 4072 + }, + { + "epoch": 0.4074, + "grad_norm": 0.10456223040819168, + "learning_rate": 1.4774654962259813e-05, + "loss": 0.0007, + "step": 4074 + }, + { + "epoch": 0.4076, + "grad_norm": 0.009078724309802055, + "learning_rate": 1.4768519661528879e-05, + "loss": 0.0003, + "step": 4076 + }, + { + "epoch": 0.4078, + "grad_norm": 0.06973922252655029, + "learning_rate": 1.4762382036679393e-05, + "loss": 0.0012, + "step": 4078 + }, + { + "epoch": 0.408, + "grad_norm": 1.1182459592819214, + "learning_rate": 1.4756242090702756e-05, + "loss": 0.03, + "step": 4080 + }, + { + "epoch": 0.4082, + "grad_norm": 0.05416171997785568, + "learning_rate": 1.47500998265915e-05, + "loss": 0.0007, + "step": 4082 + }, + { + "epoch": 0.4084, + "grad_norm": 0.17100895941257477, + "learning_rate": 1.4743955247339292e-05, + "loss": 0.0039, + "step": 4084 + }, + { + "epoch": 0.4086, + "grad_norm": 2.7567813396453857, + "learning_rate": 1.4737808355940932e-05, + "loss": 0.0503, + "step": 4086 + }, + { + "epoch": 0.4088, + "grad_norm": 0.010471885092556477, + "learning_rate": 1.4731659155392332e-05, + "loss": 0.0545, + "step": 4088 + }, + { + "epoch": 0.409, + "grad_norm": 0.01819382794201374, + "learning_rate": 1.4725507648690542e-05, + "loss": 0.0043, + "step": 4090 + }, + { + "epoch": 0.4092, + "grad_norm": 1.243837594985962, + "learning_rate": 1.4719353838833729e-05, + "loss": 0.0028, + "step": 4092 + }, + { + "epoch": 0.4094, + "grad_norm": 0.006483010016381741, + "learning_rate": 1.4713197728821185e-05, + "loss": 0.0036, + "step": 4094 + }, + { + "epoch": 0.4096, + "grad_norm": 3.5446197986602783, + "learning_rate": 1.470703932165333e-05, + "loss": 0.1185, + "step": 4096 + }, + { + "epoch": 0.4098, + "grad_norm": 0.009313512593507767, + "learning_rate": 1.4700878620331684e-05, + "loss": 0.0002, + "step": 4098 + }, + { + "epoch": 0.41, + "grad_norm": 0.6181483864784241, + "learning_rate": 1.469471562785891e-05, + "loss": 0.0025, + "step": 4100 + }, + { + "epoch": 0.4102, + "grad_norm": 0.071558378636837, + "learning_rate": 1.468855034723877e-05, + "loss": 0.0006, + "step": 4102 + }, + { + "epoch": 0.4104, + "grad_norm": 0.06560239940881729, + "learning_rate": 1.4682382781476146e-05, + "loss": 0.0006, + "step": 4104 + }, + { + "epoch": 0.4106, + "grad_norm": 0.009676429443061352, + "learning_rate": 1.467621293357704e-05, + "loss": 0.0002, + "step": 4106 + }, + { + "epoch": 0.4108, + "grad_norm": 1.5334439277648926, + "learning_rate": 1.4670040806548555e-05, + "loss": 0.012, + "step": 4108 + }, + { + "epoch": 0.411, + "grad_norm": 0.05009615048766136, + "learning_rate": 1.4663866403398915e-05, + "loss": 0.0012, + "step": 4110 + }, + { + "epoch": 0.4112, + "grad_norm": 0.023047203198075294, + "learning_rate": 1.4657689727137443e-05, + "loss": 0.0073, + "step": 4112 + }, + { + "epoch": 0.4114, + "grad_norm": 0.06641614437103271, + "learning_rate": 1.4651510780774585e-05, + "loss": 0.0004, + "step": 4114 + }, + { + "epoch": 0.4116, + "grad_norm": 0.0064299521036446095, + "learning_rate": 1.464532956732188e-05, + "loss": 0.0002, + "step": 4116 + }, + { + "epoch": 0.4118, + "grad_norm": 4.150933265686035, + "learning_rate": 1.4639146089791972e-05, + "loss": 0.0167, + "step": 4118 + }, + { + "epoch": 0.412, + "grad_norm": 0.004347555339336395, + "learning_rate": 1.463296035119862e-05, + "loss": 0.0022, + "step": 4120 + }, + { + "epoch": 0.4122, + "grad_norm": 0.003759652841836214, + "learning_rate": 1.462677235455667e-05, + "loss": 0.0228, + "step": 4122 + }, + { + "epoch": 0.4124, + "grad_norm": 0.08497020602226257, + "learning_rate": 1.4620582102882088e-05, + "loss": 0.001, + "step": 4124 + }, + { + "epoch": 0.4126, + "grad_norm": 4.555630683898926, + "learning_rate": 1.4614389599191917e-05, + "loss": 0.0583, + "step": 4126 + }, + { + "epoch": 0.4128, + "grad_norm": 0.0018910232465714216, + "learning_rate": 1.4608194846504311e-05, + "loss": 0.0001, + "step": 4128 + }, + { + "epoch": 0.413, + "grad_norm": 0.2823841869831085, + "learning_rate": 1.4601997847838518e-05, + "loss": 0.0038, + "step": 4130 + }, + { + "epoch": 0.4132, + "grad_norm": 0.17895640432834625, + "learning_rate": 1.4595798606214882e-05, + "loss": 0.0016, + "step": 4132 + }, + { + "epoch": 0.4134, + "grad_norm": 0.007653316482901573, + "learning_rate": 1.4589597124654834e-05, + "loss": 0.1435, + "step": 4134 + }, + { + "epoch": 0.4136, + "grad_norm": 0.022800175473093987, + "learning_rate": 1.4583393406180898e-05, + "loss": 0.0023, + "step": 4136 + }, + { + "epoch": 0.4138, + "grad_norm": 0.007024161051958799, + "learning_rate": 1.4577187453816702e-05, + "loss": 0.0001, + "step": 4138 + }, + { + "epoch": 0.414, + "grad_norm": 0.0012554213171824813, + "learning_rate": 1.4570979270586944e-05, + "loss": 0.0095, + "step": 4140 + }, + { + "epoch": 0.4142, + "grad_norm": 0.01026988122612238, + "learning_rate": 1.4564768859517417e-05, + "loss": 0.0005, + "step": 4142 + }, + { + "epoch": 0.4144, + "grad_norm": 0.8671157956123352, + "learning_rate": 1.4558556223635004e-05, + "loss": 0.0126, + "step": 4144 + }, + { + "epoch": 0.4146, + "grad_norm": 0.00039303197991102934, + "learning_rate": 1.455234136596766e-05, + "loss": 0.0015, + "step": 4146 + }, + { + "epoch": 0.4148, + "grad_norm": 0.003586740931496024, + "learning_rate": 1.454612428954444e-05, + "loss": 0.0002, + "step": 4148 + }, + { + "epoch": 0.415, + "grad_norm": 0.05460529401898384, + "learning_rate": 1.4539904997395468e-05, + "loss": 0.0017, + "step": 4150 + }, + { + "epoch": 0.4152, + "grad_norm": 0.47280794382095337, + "learning_rate": 1.4533683492551954e-05, + "loss": 0.0189, + "step": 4152 + }, + { + "epoch": 0.4154, + "grad_norm": 0.00272808107547462, + "learning_rate": 1.452745977804618e-05, + "loss": 0.0001, + "step": 4154 + }, + { + "epoch": 0.4156, + "grad_norm": 0.08435536175966263, + "learning_rate": 1.4521233856911507e-05, + "loss": 0.0008, + "step": 4156 + }, + { + "epoch": 0.4158, + "grad_norm": 0.07618145644664764, + "learning_rate": 1.4515005732182384e-05, + "loss": 0.0008, + "step": 4158 + }, + { + "epoch": 0.416, + "grad_norm": 0.0011928946478292346, + "learning_rate": 1.4508775406894308e-05, + "loss": 0.0001, + "step": 4160 + }, + { + "epoch": 0.4162, + "grad_norm": 0.007583409082144499, + "learning_rate": 1.4502542884083876e-05, + "loss": 0.2167, + "step": 4162 + }, + { + "epoch": 0.4164, + "grad_norm": 0.006976098287850618, + "learning_rate": 1.449630816678874e-05, + "loss": 0.0014, + "step": 4164 + }, + { + "epoch": 0.4166, + "grad_norm": 0.017582811415195465, + "learning_rate": 1.4490071258047625e-05, + "loss": 0.0008, + "step": 4166 + }, + { + "epoch": 0.4168, + "grad_norm": 0.0019234591163694859, + "learning_rate": 1.4483832160900326e-05, + "loss": 0.0006, + "step": 4168 + }, + { + "epoch": 0.417, + "grad_norm": 3.767482280731201, + "learning_rate": 1.4477590878387697e-05, + "loss": 0.0981, + "step": 4170 + }, + { + "epoch": 0.4172, + "grad_norm": 0.011735809035599232, + "learning_rate": 1.4471347413551673e-05, + "loss": 0.0011, + "step": 4172 + }, + { + "epoch": 0.4174, + "grad_norm": 0.13634267449378967, + "learning_rate": 1.4465101769435235e-05, + "loss": 0.0053, + "step": 4174 + }, + { + "epoch": 0.4176, + "grad_norm": 1.8765021562576294, + "learning_rate": 1.4458853949082443e-05, + "loss": 0.0209, + "step": 4176 + }, + { + "epoch": 0.4178, + "grad_norm": 2.12265682220459, + "learning_rate": 1.4452603955538397e-05, + "loss": 0.2331, + "step": 4178 + }, + { + "epoch": 0.418, + "grad_norm": 0.14923697710037231, + "learning_rate": 1.4446351791849276e-05, + "loss": 0.0172, + "step": 4180 + }, + { + "epoch": 0.4182, + "grad_norm": 0.013123412616550922, + "learning_rate": 1.4440097461062308e-05, + "loss": 0.0047, + "step": 4182 + }, + { + "epoch": 0.4184, + "grad_norm": 0.4339183270931244, + "learning_rate": 1.4433840966225772e-05, + "loss": 0.0029, + "step": 4184 + }, + { + "epoch": 0.4186, + "grad_norm": 0.00499812513589859, + "learning_rate": 1.442758231038902e-05, + "loss": 0.0118, + "step": 4186 + }, + { + "epoch": 0.4188, + "grad_norm": 1.8668009042739868, + "learning_rate": 1.4421321496602428e-05, + "loss": 0.0975, + "step": 4188 + }, + { + "epoch": 0.419, + "grad_norm": 0.005973338149487972, + "learning_rate": 1.4415058527917454e-05, + "loss": 0.0004, + "step": 4190 + }, + { + "epoch": 0.4192, + "grad_norm": 0.8828155994415283, + "learning_rate": 1.4408793407386587e-05, + "loss": 0.0148, + "step": 4192 + }, + { + "epoch": 0.4194, + "grad_norm": 0.17874793708324432, + "learning_rate": 1.4402526138063373e-05, + "loss": 0.0088, + "step": 4194 + }, + { + "epoch": 0.4196, + "grad_norm": 0.006637539714574814, + "learning_rate": 1.43962567230024e-05, + "loss": 0.0004, + "step": 4196 + }, + { + "epoch": 0.4198, + "grad_norm": 0.3876243829727173, + "learning_rate": 1.4389985165259308e-05, + "loss": 0.0053, + "step": 4198 + }, + { + "epoch": 0.42, + "grad_norm": 0.03771546110510826, + "learning_rate": 1.4383711467890776e-05, + "loss": 0.0007, + "step": 4200 + }, + { + "epoch": 0.4202, + "grad_norm": 0.16907650232315063, + "learning_rate": 1.4377435633954528e-05, + "loss": 0.0018, + "step": 4202 + }, + { + "epoch": 0.4204, + "grad_norm": 0.011080153286457062, + "learning_rate": 1.437115766650933e-05, + "loss": 0.0182, + "step": 4204 + }, + { + "epoch": 0.4206, + "grad_norm": 0.022120462730526924, + "learning_rate": 1.436487756861499e-05, + "loss": 0.0004, + "step": 4206 + }, + { + "epoch": 0.4208, + "grad_norm": 0.023927027359604836, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.0006, + "step": 4208 + }, + { + "epoch": 0.421, + "grad_norm": 0.28061774373054504, + "learning_rate": 1.4352310993723277e-05, + "loss": 0.0088, + "step": 4210 + }, + { + "epoch": 0.4212, + "grad_norm": 5.390331745147705, + "learning_rate": 1.4346024522850704e-05, + "loss": 0.2817, + "step": 4212 + }, + { + "epoch": 0.4214, + "grad_norm": 0.01118430495262146, + "learning_rate": 1.4339735933778576e-05, + "loss": 0.0004, + "step": 4214 + }, + { + "epoch": 0.4216, + "grad_norm": 0.003243677783757448, + "learning_rate": 1.4333445229571874e-05, + "loss": 0.0003, + "step": 4216 + }, + { + "epoch": 0.4218, + "grad_norm": 0.00445472402498126, + "learning_rate": 1.4327152413296607e-05, + "loss": 0.0005, + "step": 4218 + }, + { + "epoch": 0.422, + "grad_norm": 9.387557983398438, + "learning_rate": 1.4320857488019826e-05, + "loss": 0.084, + "step": 4220 + }, + { + "epoch": 0.4222, + "grad_norm": 0.015622277744114399, + "learning_rate": 1.4314560456809592e-05, + "loss": 0.0004, + "step": 4222 + }, + { + "epoch": 0.4224, + "grad_norm": 0.19367706775665283, + "learning_rate": 1.4308261322735006e-05, + "loss": 0.0024, + "step": 4224 + }, + { + "epoch": 0.4226, + "grad_norm": 0.012830954045057297, + "learning_rate": 1.4301960088866187e-05, + "loss": 0.001, + "step": 4226 + }, + { + "epoch": 0.4228, + "grad_norm": 0.29874253273010254, + "learning_rate": 1.4295656758274283e-05, + "loss": 0.0078, + "step": 4228 + }, + { + "epoch": 0.423, + "grad_norm": 0.41310539841651917, + "learning_rate": 1.4289351334031461e-05, + "loss": 0.0035, + "step": 4230 + }, + { + "epoch": 0.4232, + "grad_norm": 0.11896264553070068, + "learning_rate": 1.4283043819210905e-05, + "loss": 0.0022, + "step": 4232 + }, + { + "epoch": 0.4234, + "grad_norm": 0.0035026641562581062, + "learning_rate": 1.4276734216886823e-05, + "loss": 0.0038, + "step": 4234 + }, + { + "epoch": 0.4236, + "grad_norm": 0.2302071452140808, + "learning_rate": 1.4270422530134433e-05, + "loss": 0.0025, + "step": 4236 + }, + { + "epoch": 0.4238, + "grad_norm": 0.8147134184837341, + "learning_rate": 1.4264108762029989e-05, + "loss": 0.0247, + "step": 4238 + }, + { + "epoch": 0.424, + "grad_norm": 0.02560199610888958, + "learning_rate": 1.4257792915650728e-05, + "loss": 0.0075, + "step": 4240 + }, + { + "epoch": 0.4242, + "grad_norm": 0.08809062838554382, + "learning_rate": 1.4251474994074927e-05, + "loss": 0.0016, + "step": 4242 + }, + { + "epoch": 0.4244, + "grad_norm": 0.07098710536956787, + "learning_rate": 1.424515500038186e-05, + "loss": 0.0013, + "step": 4244 + }, + { + "epoch": 0.4246, + "grad_norm": 0.004130031447857618, + "learning_rate": 1.4238832937651816e-05, + "loss": 0.0007, + "step": 4246 + }, + { + "epoch": 0.4248, + "grad_norm": 1.4861756563186646, + "learning_rate": 1.4232508808966097e-05, + "loss": 0.0635, + "step": 4248 + }, + { + "epoch": 0.425, + "grad_norm": 0.05797583609819412, + "learning_rate": 1.4226182617406996e-05, + "loss": 0.0663, + "step": 4250 + }, + { + "epoch": 0.4252, + "grad_norm": 0.1855604201555252, + "learning_rate": 1.4219854366057831e-05, + "loss": 0.005, + "step": 4252 + }, + { + "epoch": 0.4254, + "grad_norm": 0.004737631883472204, + "learning_rate": 1.421352405800291e-05, + "loss": 0.0094, + "step": 4254 + }, + { + "epoch": 0.4256, + "grad_norm": 0.00786942895501852, + "learning_rate": 1.420719169632755e-05, + "loss": 0.0003, + "step": 4256 + }, + { + "epoch": 0.4258, + "grad_norm": 0.6153473258018494, + "learning_rate": 1.4200857284118067e-05, + "loss": 0.0149, + "step": 4258 + }, + { + "epoch": 0.426, + "grad_norm": 0.017163407057523727, + "learning_rate": 1.4194520824461773e-05, + "loss": 0.0559, + "step": 4260 + }, + { + "epoch": 0.4262, + "grad_norm": 0.04352276399731636, + "learning_rate": 1.4188182320446985e-05, + "loss": 0.0013, + "step": 4262 + }, + { + "epoch": 0.4264, + "grad_norm": 0.00745047302916646, + "learning_rate": 1.4181841775163014e-05, + "loss": 0.0004, + "step": 4264 + }, + { + "epoch": 0.4266, + "grad_norm": 0.3807438313961029, + "learning_rate": 1.4175499191700169e-05, + "loss": 0.0085, + "step": 4266 + }, + { + "epoch": 0.4268, + "grad_norm": 0.021252285689115524, + "learning_rate": 1.4169154573149737e-05, + "loss": 0.0005, + "step": 4268 + }, + { + "epoch": 0.427, + "grad_norm": 1.7833621501922607, + "learning_rate": 1.4162807922604014e-05, + "loss": 0.0233, + "step": 4270 + }, + { + "epoch": 0.4272, + "grad_norm": 0.10315141081809998, + "learning_rate": 1.415645924315628e-05, + "loss": 0.0015, + "step": 4272 + }, + { + "epoch": 0.4274, + "grad_norm": 2.9062981605529785, + "learning_rate": 1.4150108537900805e-05, + "loss": 0.0414, + "step": 4274 + }, + { + "epoch": 0.4276, + "grad_norm": 1.5822625160217285, + "learning_rate": 1.4143755809932843e-05, + "loss": 0.023, + "step": 4276 + }, + { + "epoch": 0.4278, + "grad_norm": 0.016894085332751274, + "learning_rate": 1.4137401062348639e-05, + "loss": 0.0547, + "step": 4278 + }, + { + "epoch": 0.428, + "grad_norm": 0.000790109159424901, + "learning_rate": 1.413104429824542e-05, + "loss": 0.0002, + "step": 4280 + }, + { + "epoch": 0.4282, + "grad_norm": 0.01344808004796505, + "learning_rate": 1.4124685520721393e-05, + "loss": 0.0006, + "step": 4282 + }, + { + "epoch": 0.4284, + "grad_norm": 0.003000301541760564, + "learning_rate": 1.411832473287575e-05, + "loss": 0.0002, + "step": 4284 + }, + { + "epoch": 0.4286, + "grad_norm": 0.5635800361633301, + "learning_rate": 1.4111961937808665e-05, + "loss": 0.3631, + "step": 4286 + }, + { + "epoch": 0.4288, + "grad_norm": 0.058850646018981934, + "learning_rate": 1.4105597138621281e-05, + "loss": 0.0023, + "step": 4288 + }, + { + "epoch": 0.429, + "grad_norm": 0.0026777039747685194, + "learning_rate": 1.4099230338415728e-05, + "loss": 0.1229, + "step": 4290 + }, + { + "epoch": 0.4292, + "grad_norm": 0.011974510736763477, + "learning_rate": 1.4092861540295109e-05, + "loss": 0.0003, + "step": 4292 + }, + { + "epoch": 0.4294, + "grad_norm": 0.005734940525144339, + "learning_rate": 1.4086490747363492e-05, + "loss": 0.0097, + "step": 4294 + }, + { + "epoch": 0.4296, + "grad_norm": 1.4212397336959839, + "learning_rate": 1.4080117962725929e-05, + "loss": 0.0771, + "step": 4296 + }, + { + "epoch": 0.4298, + "grad_norm": 0.1726626306772232, + "learning_rate": 1.4073743189488436e-05, + "loss": 0.0033, + "step": 4298 + }, + { + "epoch": 0.43, + "grad_norm": 0.011654144152998924, + "learning_rate": 1.4067366430758004e-05, + "loss": 0.0005, + "step": 4300 + }, + { + "epoch": 0.4302, + "grad_norm": 0.5136719346046448, + "learning_rate": 1.4060987689642581e-05, + "loss": 0.009, + "step": 4302 + }, + { + "epoch": 0.4304, + "grad_norm": 0.00856290478259325, + "learning_rate": 1.4054606969251095e-05, + "loss": 0.0002, + "step": 4304 + }, + { + "epoch": 0.4306, + "grad_norm": 0.01691259816288948, + "learning_rate": 1.4048224272693426e-05, + "loss": 0.0055, + "step": 4306 + }, + { + "epoch": 0.4308, + "grad_norm": 0.020957162603735924, + "learning_rate": 1.4041839603080423e-05, + "loss": 0.0551, + "step": 4308 + }, + { + "epoch": 0.431, + "grad_norm": 0.29797789454460144, + "learning_rate": 1.4035452963523903e-05, + "loss": 0.0057, + "step": 4310 + }, + { + "epoch": 0.4312, + "grad_norm": 1.1213783025741577, + "learning_rate": 1.4029064357136628e-05, + "loss": 0.0141, + "step": 4312 + }, + { + "epoch": 0.4314, + "grad_norm": 0.8615202903747559, + "learning_rate": 1.4022673787032333e-05, + "loss": 0.0151, + "step": 4314 + }, + { + "epoch": 0.4316, + "grad_norm": 0.009252429008483887, + "learning_rate": 1.4016281256325702e-05, + "loss": 0.0005, + "step": 4316 + }, + { + "epoch": 0.4318, + "grad_norm": 0.5595703721046448, + "learning_rate": 1.4009886768132375e-05, + "loss": 0.007, + "step": 4318 + }, + { + "epoch": 0.432, + "grad_norm": 0.10888846963644028, + "learning_rate": 1.4003490325568953e-05, + "loss": 0.003, + "step": 4320 + }, + { + "epoch": 0.4322, + "grad_norm": 0.08357217162847519, + "learning_rate": 1.3997091931752978e-05, + "loss": 0.0038, + "step": 4322 + }, + { + "epoch": 0.4324, + "grad_norm": 0.026594994589686394, + "learning_rate": 1.3990691589802955e-05, + "loss": 0.0064, + "step": 4324 + }, + { + "epoch": 0.4326, + "grad_norm": 0.011515522375702858, + "learning_rate": 1.3984289302838327e-05, + "loss": 0.0005, + "step": 4326 + }, + { + "epoch": 0.4328, + "grad_norm": 0.00622312119230628, + "learning_rate": 1.39778850739795e-05, + "loss": 0.0179, + "step": 4328 + }, + { + "epoch": 0.433, + "grad_norm": 0.7498053312301636, + "learning_rate": 1.3971478906347806e-05, + "loss": 0.0118, + "step": 4330 + }, + { + "epoch": 0.4332, + "grad_norm": 5.681878089904785, + "learning_rate": 1.3965070803065543e-05, + "loss": 0.0793, + "step": 4332 + }, + { + "epoch": 0.4334, + "grad_norm": 1.1296112537384033, + "learning_rate": 1.3958660767255938e-05, + "loss": 0.0246, + "step": 4334 + }, + { + "epoch": 0.4336, + "grad_norm": 1.5128792524337769, + "learning_rate": 1.3952248802043166e-05, + "loss": 0.0167, + "step": 4336 + }, + { + "epoch": 0.4338, + "grad_norm": 0.07446404546499252, + "learning_rate": 1.394583491055234e-05, + "loss": 0.1127, + "step": 4338 + }, + { + "epoch": 0.434, + "grad_norm": 0.0073697823099792, + "learning_rate": 1.3939419095909513e-05, + "loss": 0.001, + "step": 4340 + }, + { + "epoch": 0.4342, + "grad_norm": 0.0288555808365345, + "learning_rate": 1.3933001361241674e-05, + "loss": 0.0045, + "step": 4342 + }, + { + "epoch": 0.4344, + "grad_norm": 8.936887741088867, + "learning_rate": 1.3926581709676752e-05, + "loss": 0.1076, + "step": 4344 + }, + { + "epoch": 0.4346, + "grad_norm": 0.008612725883722305, + "learning_rate": 1.3920160144343604e-05, + "loss": 0.0918, + "step": 4346 + }, + { + "epoch": 0.4348, + "grad_norm": 0.020289232954382896, + "learning_rate": 1.3913736668372027e-05, + "loss": 0.0005, + "step": 4348 + }, + { + "epoch": 0.435, + "grad_norm": 0.002765468554571271, + "learning_rate": 1.3907311284892737e-05, + "loss": 0.0004, + "step": 4350 + }, + { + "epoch": 0.4352, + "grad_norm": 0.0072141289710998535, + "learning_rate": 1.3900883997037398e-05, + "loss": 0.001, + "step": 4352 + }, + { + "epoch": 0.4354, + "grad_norm": 0.0014934578211978078, + "learning_rate": 1.3894454807938587e-05, + "loss": 0.0003, + "step": 4354 + }, + { + "epoch": 0.4356, + "grad_norm": 0.02268093079328537, + "learning_rate": 1.388802372072981e-05, + "loss": 0.0163, + "step": 4356 + }, + { + "epoch": 0.4358, + "grad_norm": 0.004925106652081013, + "learning_rate": 1.3881590738545508e-05, + "loss": 0.001, + "step": 4358 + }, + { + "epoch": 0.436, + "grad_norm": 4.533937931060791, + "learning_rate": 1.3875155864521031e-05, + "loss": 0.0166, + "step": 4360 + }, + { + "epoch": 0.4362, + "grad_norm": 0.010726350359618664, + "learning_rate": 1.3868719101792664e-05, + "loss": 0.0003, + "step": 4362 + }, + { + "epoch": 0.4364, + "grad_norm": 0.012416677549481392, + "learning_rate": 1.3862280453497601e-05, + "loss": 0.0007, + "step": 4364 + }, + { + "epoch": 0.4366, + "grad_norm": 0.8840945959091187, + "learning_rate": 1.3855839922773968e-05, + "loss": 0.0295, + "step": 4366 + }, + { + "epoch": 0.4368, + "grad_norm": 0.16212370991706848, + "learning_rate": 1.3849397512760797e-05, + "loss": 0.0304, + "step": 4368 + }, + { + "epoch": 0.437, + "grad_norm": 0.0032995198853313923, + "learning_rate": 1.3842953226598036e-05, + "loss": 0.0002, + "step": 4370 + }, + { + "epoch": 0.4372, + "grad_norm": 2.1747775077819824, + "learning_rate": 1.3836507067426565e-05, + "loss": 0.0213, + "step": 4372 + }, + { + "epoch": 0.4374, + "grad_norm": 0.029776230454444885, + "learning_rate": 1.3830059038388153e-05, + "loss": 0.0006, + "step": 4374 + }, + { + "epoch": 0.4376, + "grad_norm": 0.47254031896591187, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.009, + "step": 4376 + }, + { + "epoch": 0.4378, + "grad_norm": 0.015225819312036037, + "learning_rate": 1.3817157383282184e-05, + "loss": 0.3016, + "step": 4378 + }, + { + "epoch": 0.438, + "grad_norm": 0.361844003200531, + "learning_rate": 1.3810703763502744e-05, + "loss": 0.0016, + "step": 4380 + }, + { + "epoch": 0.4382, + "grad_norm": 1.176446795463562, + "learning_rate": 1.3804248286432577e-05, + "loss": 0.0107, + "step": 4382 + }, + { + "epoch": 0.4384, + "grad_norm": 0.00805577915161848, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.0005, + "step": 4384 + }, + { + "epoch": 0.4386, + "grad_norm": 0.0015456798719242215, + "learning_rate": 1.3791331773006272e-05, + "loss": 0.0356, + "step": 4386 + }, + { + "epoch": 0.4388, + "grad_norm": 0.272953599691391, + "learning_rate": 1.3784870742945482e-05, + "loss": 0.0042, + "step": 4388 + }, + { + "epoch": 0.439, + "grad_norm": 0.0010703380685299635, + "learning_rate": 1.3778407868184674e-05, + "loss": 0.0004, + "step": 4390 + }, + { + "epoch": 0.4392, + "grad_norm": 0.00960696954280138, + "learning_rate": 1.3771943151873768e-05, + "loss": 0.6567, + "step": 4392 + }, + { + "epoch": 0.4394, + "grad_norm": 0.005632512737065554, + "learning_rate": 1.3765476597163595e-05, + "loss": 0.0008, + "step": 4394 + }, + { + "epoch": 0.4396, + "grad_norm": 0.04591122642159462, + "learning_rate": 1.3759008207205869e-05, + "loss": 0.0287, + "step": 4396 + }, + { + "epoch": 0.4398, + "grad_norm": 0.12488532066345215, + "learning_rate": 1.375253798515321e-05, + "loss": 0.0156, + "step": 4398 + }, + { + "epoch": 0.44, + "grad_norm": 3.0000522136688232, + "learning_rate": 1.3746065934159123e-05, + "loss": 0.0697, + "step": 4400 + }, + { + "epoch": 0.4402, + "grad_norm": 0.0855686292052269, + "learning_rate": 1.3739592057378005e-05, + "loss": 0.0021, + "step": 4402 + }, + { + "epoch": 0.4404, + "grad_norm": 0.029379313811659813, + "learning_rate": 1.373311635796515e-05, + "loss": 0.0016, + "step": 4404 + }, + { + "epoch": 0.4406, + "grad_norm": 0.13703589141368866, + "learning_rate": 1.3726638839076732e-05, + "loss": 0.0046, + "step": 4406 + }, + { + "epoch": 0.4408, + "grad_norm": 0.18750295042991638, + "learning_rate": 1.3720159503869816e-05, + "loss": 0.0155, + "step": 4408 + }, + { + "epoch": 0.441, + "grad_norm": 0.0955473780632019, + "learning_rate": 1.371367835550235e-05, + "loss": 0.0049, + "step": 4410 + }, + { + "epoch": 0.4412, + "grad_norm": 0.47452670335769653, + "learning_rate": 1.3707195397133165e-05, + "loss": 0.0072, + "step": 4412 + }, + { + "epoch": 0.4414, + "grad_norm": 0.027009958401322365, + "learning_rate": 1.3700710631921984e-05, + "loss": 0.209, + "step": 4414 + }, + { + "epoch": 0.4416, + "grad_norm": 0.026818370446562767, + "learning_rate": 1.3694224063029396e-05, + "loss": 0.0028, + "step": 4416 + }, + { + "epoch": 0.4418, + "grad_norm": 0.07760671526193619, + "learning_rate": 1.3687735693616876e-05, + "loss": 0.0163, + "step": 4418 + }, + { + "epoch": 0.442, + "grad_norm": 0.01585383713245392, + "learning_rate": 1.3681245526846782e-05, + "loss": 0.0011, + "step": 4420 + }, + { + "epoch": 0.4422, + "grad_norm": 0.32313063740730286, + "learning_rate": 1.3674753565882336e-05, + "loss": 0.0026, + "step": 4422 + }, + { + "epoch": 0.4424, + "grad_norm": 0.1177743673324585, + "learning_rate": 1.3668259813887644e-05, + "loss": 0.0024, + "step": 4424 + }, + { + "epoch": 0.4426, + "grad_norm": 0.7799848318099976, + "learning_rate": 1.3661764274027678e-05, + "loss": 0.0668, + "step": 4426 + }, + { + "epoch": 0.4428, + "grad_norm": 0.04007822275161743, + "learning_rate": 1.365526694946829e-05, + "loss": 0.2571, + "step": 4428 + }, + { + "epoch": 0.443, + "grad_norm": 0.6678617596626282, + "learning_rate": 1.3648767843376196e-05, + "loss": 0.1059, + "step": 4430 + }, + { + "epoch": 0.4432, + "grad_norm": 0.039370644837617874, + "learning_rate": 1.3642266958918985e-05, + "loss": 0.005, + "step": 4432 + }, + { + "epoch": 0.4434, + "grad_norm": 0.017719067633152008, + "learning_rate": 1.36357642992651e-05, + "loss": 0.0018, + "step": 4434 + }, + { + "epoch": 0.4436, + "grad_norm": 1.7385810613632202, + "learning_rate": 1.3629259867583864e-05, + "loss": 0.0494, + "step": 4436 + }, + { + "epoch": 0.4438, + "grad_norm": 1.1164069175720215, + "learning_rate": 1.3622753667045459e-05, + "loss": 0.023, + "step": 4438 + }, + { + "epoch": 0.444, + "grad_norm": 0.005505646578967571, + "learning_rate": 1.3616245700820922e-05, + "loss": 0.0015, + "step": 4440 + }, + { + "epoch": 0.4442, + "grad_norm": 1.337144136428833, + "learning_rate": 1.3609735972082168e-05, + "loss": 0.0277, + "step": 4442 + }, + { + "epoch": 0.4444, + "grad_norm": 1.9877222776412964, + "learning_rate": 1.3603224484001949e-05, + "loss": 0.0241, + "step": 4444 + }, + { + "epoch": 0.4446, + "grad_norm": 0.027417417615652084, + "learning_rate": 1.3596711239753889e-05, + "loss": 0.007, + "step": 4446 + }, + { + "epoch": 0.4448, + "grad_norm": 0.016857368871569633, + "learning_rate": 1.3590196242512463e-05, + "loss": 0.0003, + "step": 4448 + }, + { + "epoch": 0.445, + "grad_norm": 0.24232782423496246, + "learning_rate": 1.3583679495453e-05, + "loss": 0.0044, + "step": 4450 + }, + { + "epoch": 0.4452, + "grad_norm": 0.371620774269104, + "learning_rate": 1.3577161001751696e-05, + "loss": 0.0625, + "step": 4452 + }, + { + "epoch": 0.4454, + "grad_norm": 0.05445350706577301, + "learning_rate": 1.3570640764585567e-05, + "loss": 0.0098, + "step": 4454 + }, + { + "epoch": 0.4456, + "grad_norm": 0.012272791005671024, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.0037, + "step": 4456 + }, + { + "epoch": 0.4458, + "grad_norm": 1.526041030883789, + "learning_rate": 1.355759507257125e-05, + "loss": 0.0079, + "step": 4458 + }, + { + "epoch": 0.446, + "grad_norm": 0.10209405422210693, + "learning_rate": 1.3551069624081372e-05, + "loss": 0.041, + "step": 4460 + }, + { + "epoch": 0.4462, + "grad_norm": 0.025909623131155968, + "learning_rate": 1.3544542444843298e-05, + "loss": 0.0006, + "step": 4462 + }, + { + "epoch": 0.4464, + "grad_norm": 0.0032747257500886917, + "learning_rate": 1.3538013538038295e-05, + "loss": 0.006, + "step": 4464 + }, + { + "epoch": 0.4466, + "grad_norm": 0.8688629269599915, + "learning_rate": 1.3531482906848474e-05, + "loss": 0.1126, + "step": 4466 + }, + { + "epoch": 0.4468, + "grad_norm": 0.0030304803512990475, + "learning_rate": 1.3524950554456786e-05, + "loss": 0.0664, + "step": 4468 + }, + { + "epoch": 0.447, + "grad_norm": 6.896958351135254, + "learning_rate": 1.3518416484047018e-05, + "loss": 0.0526, + "step": 4470 + }, + { + "epoch": 0.4472, + "grad_norm": 0.036162689328193665, + "learning_rate": 1.3511880698803801e-05, + "loss": 0.0012, + "step": 4472 + }, + { + "epoch": 0.4474, + "grad_norm": 0.003917496185749769, + "learning_rate": 1.350534320191259e-05, + "loss": 0.0003, + "step": 4474 + }, + { + "epoch": 0.4476, + "grad_norm": 0.005924155469983816, + "learning_rate": 1.349880399655969e-05, + "loss": 0.0057, + "step": 4476 + }, + { + "epoch": 0.4478, + "grad_norm": 0.007985994219779968, + "learning_rate": 1.3492263085932224e-05, + "loss": 0.0026, + "step": 4478 + }, + { + "epoch": 0.448, + "grad_norm": 0.007225338369607925, + "learning_rate": 1.3485720473218153e-05, + "loss": 0.0004, + "step": 4480 + }, + { + "epoch": 0.4482, + "grad_norm": 0.04798229783773422, + "learning_rate": 1.3479176161606269e-05, + "loss": 0.001, + "step": 4482 + }, + { + "epoch": 0.4484, + "grad_norm": 0.18864776194095612, + "learning_rate": 1.347263015428619e-05, + "loss": 0.0028, + "step": 4484 + }, + { + "epoch": 0.4486, + "grad_norm": 0.0026007751002907753, + "learning_rate": 1.3466082454448364e-05, + "loss": 0.0002, + "step": 4486 + }, + { + "epoch": 0.4488, + "grad_norm": 0.3755494952201843, + "learning_rate": 1.3459533065284049e-05, + "loss": 0.0063, + "step": 4488 + }, + { + "epoch": 0.449, + "grad_norm": 0.03243531286716461, + "learning_rate": 1.3452981989985347e-05, + "loss": 0.0063, + "step": 4490 + }, + { + "epoch": 0.4492, + "grad_norm": 0.004570594988763332, + "learning_rate": 1.344642923174517e-05, + "loss": 0.0003, + "step": 4492 + }, + { + "epoch": 0.4494, + "grad_norm": 4.26309871673584, + "learning_rate": 1.3439874793757255e-05, + "loss": 0.2121, + "step": 4494 + }, + { + "epoch": 0.4496, + "grad_norm": 0.2515552043914795, + "learning_rate": 1.3433318679216154e-05, + "loss": 0.2047, + "step": 4496 + }, + { + "epoch": 0.4498, + "grad_norm": 0.061145082116127014, + "learning_rate": 1.3426760891317236e-05, + "loss": 0.0006, + "step": 4498 + }, + { + "epoch": 0.45, + "grad_norm": 4.610527515411377, + "learning_rate": 1.342020143325669e-05, + "loss": 0.2754, + "step": 4500 + }, + { + "epoch": 0.4502, + "grad_norm": 2.2247467041015625, + "learning_rate": 1.3413640308231511e-05, + "loss": 0.052, + "step": 4502 + }, + { + "epoch": 0.4504, + "grad_norm": 0.0026343660429120064, + "learning_rate": 1.340707751943952e-05, + "loss": 0.0022, + "step": 4504 + }, + { + "epoch": 0.4506, + "grad_norm": 0.03276082128286362, + "learning_rate": 1.340051307007933e-05, + "loss": 0.0008, + "step": 4506 + }, + { + "epoch": 0.4508, + "grad_norm": 0.018470998853445053, + "learning_rate": 1.3393946963350381e-05, + "loss": 0.0066, + "step": 4508 + }, + { + "epoch": 0.451, + "grad_norm": 0.010074581950902939, + "learning_rate": 1.3387379202452917e-05, + "loss": 0.0006, + "step": 4510 + }, + { + "epoch": 0.4512, + "grad_norm": 0.8642851710319519, + "learning_rate": 1.3380809790587975e-05, + "loss": 0.0242, + "step": 4512 + }, + { + "epoch": 0.4514, + "grad_norm": 0.025080500170588493, + "learning_rate": 1.3374238730957414e-05, + "loss": 0.2251, + "step": 4514 + }, + { + "epoch": 0.4516, + "grad_norm": 0.005171892233192921, + "learning_rate": 1.3367666026763884e-05, + "loss": 0.0033, + "step": 4516 + }, + { + "epoch": 0.4518, + "grad_norm": 1.725402593612671, + "learning_rate": 1.3361091681210846e-05, + "loss": 0.0217, + "step": 4518 + }, + { + "epoch": 0.452, + "grad_norm": 0.20833410322666168, + "learning_rate": 1.3354515697502552e-05, + "loss": 0.0117, + "step": 4520 + }, + { + "epoch": 0.4522, + "grad_norm": 0.06321299821138382, + "learning_rate": 1.3347938078844058e-05, + "loss": 0.0015, + "step": 4522 + }, + { + "epoch": 0.4524, + "grad_norm": 0.0385909341275692, + "learning_rate": 1.3341358828441217e-05, + "loss": 0.0091, + "step": 4524 + }, + { + "epoch": 0.4526, + "grad_norm": 0.07927745580673218, + "learning_rate": 1.3334777949500673e-05, + "loss": 0.0028, + "step": 4526 + }, + { + "epoch": 0.4528, + "grad_norm": 1.4416955709457397, + "learning_rate": 1.3328195445229869e-05, + "loss": 0.0415, + "step": 4528 + }, + { + "epoch": 0.453, + "grad_norm": 0.008741031400859356, + "learning_rate": 1.3321611318837033e-05, + "loss": 0.0642, + "step": 4530 + }, + { + "epoch": 0.4532, + "grad_norm": 0.007618233095854521, + "learning_rate": 1.3315025573531198e-05, + "loss": 0.0005, + "step": 4532 + }, + { + "epoch": 0.4534, + "grad_norm": 0.04557926952838898, + "learning_rate": 1.3308438212522164e-05, + "loss": 0.0021, + "step": 4534 + }, + { + "epoch": 0.4536, + "grad_norm": 1.0185385942459106, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.0163, + "step": 4536 + }, + { + "epoch": 0.4538, + "grad_norm": 0.011403060518205166, + "learning_rate": 1.3295258656237703e-05, + "loss": 0.0004, + "step": 4538 + }, + { + "epoch": 0.454, + "grad_norm": 0.6815643906593323, + "learning_rate": 1.3288666467385834e-05, + "loss": 0.0157, + "step": 4540 + }, + { + "epoch": 0.4542, + "grad_norm": 0.002167606493458152, + "learning_rate": 1.328207267567788e-05, + "loss": 0.0661, + "step": 4542 + }, + { + "epoch": 0.4544, + "grad_norm": 0.08753622323274612, + "learning_rate": 1.327547728432757e-05, + "loss": 0.0335, + "step": 4544 + }, + { + "epoch": 0.4546, + "grad_norm": 0.011360513046383858, + "learning_rate": 1.3268880296549424e-05, + "loss": 0.0098, + "step": 4546 + }, + { + "epoch": 0.4548, + "grad_norm": 1.5974136590957642, + "learning_rate": 1.3262281715558736e-05, + "loss": 0.0547, + "step": 4548 + }, + { + "epoch": 0.455, + "grad_norm": 0.038656968623399734, + "learning_rate": 1.3255681544571568e-05, + "loss": 0.0464, + "step": 4550 + }, + { + "epoch": 0.4552, + "grad_norm": 0.029907729476690292, + "learning_rate": 1.3249079786804765e-05, + "loss": 0.0103, + "step": 4552 + }, + { + "epoch": 0.4554, + "grad_norm": 0.008109742775559425, + "learning_rate": 1.3242476445475945e-05, + "loss": 0.0004, + "step": 4554 + }, + { + "epoch": 0.4556, + "grad_norm": 0.28778931498527527, + "learning_rate": 1.3235871523803496e-05, + "loss": 0.0027, + "step": 4556 + }, + { + "epoch": 0.4558, + "grad_norm": 0.224399596452713, + "learning_rate": 1.3229265025006577e-05, + "loss": 0.004, + "step": 4558 + }, + { + "epoch": 0.456, + "grad_norm": 0.9410289525985718, + "learning_rate": 1.3222656952305113e-05, + "loss": 0.0295, + "step": 4560 + }, + { + "epoch": 0.4562, + "grad_norm": 0.008287561126053333, + "learning_rate": 1.32160473089198e-05, + "loss": 0.0004, + "step": 4562 + }, + { + "epoch": 0.4564, + "grad_norm": 0.20843489468097687, + "learning_rate": 1.3209436098072095e-05, + "loss": 0.0038, + "step": 4564 + }, + { + "epoch": 0.4566, + "grad_norm": 3.5498220920562744, + "learning_rate": 1.3202823322984228e-05, + "loss": 0.3038, + "step": 4566 + }, + { + "epoch": 0.4568, + "grad_norm": 0.028411030769348145, + "learning_rate": 1.319620898687918e-05, + "loss": 0.0006, + "step": 4568 + }, + { + "epoch": 0.457, + "grad_norm": 0.7635385394096375, + "learning_rate": 1.3189593092980701e-05, + "loss": 0.0246, + "step": 4570 + }, + { + "epoch": 0.4572, + "grad_norm": 0.01396875362843275, + "learning_rate": 1.3182975644513296e-05, + "loss": 0.0071, + "step": 4572 + }, + { + "epoch": 0.4574, + "grad_norm": 0.11150379478931427, + "learning_rate": 1.3176356644702225e-05, + "loss": 0.0251, + "step": 4574 + }, + { + "epoch": 0.4576, + "grad_norm": 0.014401976019144058, + "learning_rate": 1.316973609677352e-05, + "loss": 0.0025, + "step": 4576 + }, + { + "epoch": 0.4578, + "grad_norm": 2.115203619003296, + "learning_rate": 1.316311400395394e-05, + "loss": 0.165, + "step": 4578 + }, + { + "epoch": 0.458, + "grad_norm": 0.4054473340511322, + "learning_rate": 1.3156490369471026e-05, + "loss": 0.0263, + "step": 4580 + }, + { + "epoch": 0.4582, + "grad_norm": 0.14285209774971008, + "learning_rate": 1.3149865196553049e-05, + "loss": 0.0027, + "step": 4582 + }, + { + "epoch": 0.4584, + "grad_norm": 0.4553835391998291, + "learning_rate": 1.3143238488429042e-05, + "loss": 0.0236, + "step": 4584 + }, + { + "epoch": 0.4586, + "grad_norm": 4.270960330963135, + "learning_rate": 1.3136610248328779e-05, + "loss": 0.0752, + "step": 4586 + }, + { + "epoch": 0.4588, + "grad_norm": 0.012890392914414406, + "learning_rate": 1.3129980479482783e-05, + "loss": 0.0073, + "step": 4588 + }, + { + "epoch": 0.459, + "grad_norm": 0.21101470291614532, + "learning_rate": 1.3123349185122328e-05, + "loss": 0.005, + "step": 4590 + }, + { + "epoch": 0.4592, + "grad_norm": 0.6034485101699829, + "learning_rate": 1.3116716368479418e-05, + "loss": 0.0105, + "step": 4592 + }, + { + "epoch": 0.4594, + "grad_norm": 0.21657228469848633, + "learning_rate": 1.311008203278682e-05, + "loss": 0.0085, + "step": 4594 + }, + { + "epoch": 0.4596, + "grad_norm": 0.02357448637485504, + "learning_rate": 1.3103446181278015e-05, + "loss": 0.0007, + "step": 4596 + }, + { + "epoch": 0.4598, + "grad_norm": 0.01017722673714161, + "learning_rate": 1.3096808817187243e-05, + "loss": 0.0004, + "step": 4598 + }, + { + "epoch": 0.46, + "grad_norm": 0.016734778881072998, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.0047, + "step": 4600 + }, + { + "epoch": 0.4602, + "grad_norm": 0.7092567682266235, + "learning_rate": 1.3083529564200417e-05, + "loss": 0.056, + "step": 4602 + }, + { + "epoch": 0.4604, + "grad_norm": 2.8853061199188232, + "learning_rate": 1.3076887681776509e-05, + "loss": 0.1489, + "step": 4604 + }, + { + "epoch": 0.4606, + "grad_norm": 1.0601797103881836, + "learning_rate": 1.307024429971492e-05, + "loss": 0.0198, + "step": 4606 + }, + { + "epoch": 0.4608, + "grad_norm": 0.044642187654972076, + "learning_rate": 1.306359942125356e-05, + "loss": 0.0009, + "step": 4608 + }, + { + "epoch": 0.461, + "grad_norm": 0.5149310827255249, + "learning_rate": 1.3056953049631059e-05, + "loss": 0.0088, + "step": 4610 + }, + { + "epoch": 0.4612, + "grad_norm": 0.06488745659589767, + "learning_rate": 1.3050305188086778e-05, + "loss": 0.0028, + "step": 4612 + }, + { + "epoch": 0.4614, + "grad_norm": 0.06895953416824341, + "learning_rate": 1.3043655839860803e-05, + "loss": 0.0013, + "step": 4614 + }, + { + "epoch": 0.4616, + "grad_norm": 0.0019626871217042208, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.0002, + "step": 4616 + }, + { + "epoch": 0.4618, + "grad_norm": 3.9980180263519287, + "learning_rate": 1.3030352696327741e-05, + "loss": 0.1315, + "step": 4618 + }, + { + "epoch": 0.462, + "grad_norm": 0.0032478254288434982, + "learning_rate": 1.3023698907504447e-05, + "loss": 0.0001, + "step": 4620 + }, + { + "epoch": 0.4622, + "grad_norm": 1.4499030113220215, + "learning_rate": 1.3017043644967036e-05, + "loss": 0.0151, + "step": 4622 + }, + { + "epoch": 0.4624, + "grad_norm": 0.07923920452594757, + "learning_rate": 1.3010386911959207e-05, + "loss": 0.0015, + "step": 4624 + }, + { + "epoch": 0.4626, + "grad_norm": 0.004986742045730352, + "learning_rate": 1.3003728711725364e-05, + "loss": 0.0163, + "step": 4626 + }, + { + "epoch": 0.4628, + "grad_norm": 0.09433140605688095, + "learning_rate": 1.299706904751064e-05, + "loss": 0.0018, + "step": 4628 + }, + { + "epoch": 0.463, + "grad_norm": 0.0020928175654262304, + "learning_rate": 1.2990407922560869e-05, + "loss": 0.0002, + "step": 4630 + }, + { + "epoch": 0.4632, + "grad_norm": 0.12095297873020172, + "learning_rate": 1.2983745340122604e-05, + "loss": 0.0015, + "step": 4632 + }, + { + "epoch": 0.4634, + "grad_norm": 0.15674124658107758, + "learning_rate": 1.2977081303443107e-05, + "loss": 0.0027, + "step": 4634 + }, + { + "epoch": 0.4636, + "grad_norm": 1.4650381803512573, + "learning_rate": 1.297041581577035e-05, + "loss": 0.0771, + "step": 4636 + }, + { + "epoch": 0.4638, + "grad_norm": 0.10285313427448273, + "learning_rate": 1.2963748880353011e-05, + "loss": 0.0012, + "step": 4638 + }, + { + "epoch": 0.464, + "grad_norm": 0.07474137842655182, + "learning_rate": 1.2957080500440469e-05, + "loss": 0.0006, + "step": 4640 + }, + { + "epoch": 0.4642, + "grad_norm": 12.434475898742676, + "learning_rate": 1.2950410679282815e-05, + "loss": 0.0801, + "step": 4642 + }, + { + "epoch": 0.4644, + "grad_norm": 0.002674081362783909, + "learning_rate": 1.2943739420130837e-05, + "loss": 0.0007, + "step": 4644 + }, + { + "epoch": 0.4646, + "grad_norm": 0.03647740185260773, + "learning_rate": 1.2937066726236029e-05, + "loss": 0.0011, + "step": 4646 + }, + { + "epoch": 0.4648, + "grad_norm": 0.01862105168402195, + "learning_rate": 1.2930392600850574e-05, + "loss": 0.0003, + "step": 4648 + }, + { + "epoch": 0.465, + "grad_norm": 0.023201579228043556, + "learning_rate": 1.2923717047227368e-05, + "loss": 0.01, + "step": 4650 + }, + { + "epoch": 0.4652, + "grad_norm": 7.717526912689209, + "learning_rate": 1.291704006861999e-05, + "loss": 0.147, + "step": 4652 + }, + { + "epoch": 0.4654, + "grad_norm": 0.006111873313784599, + "learning_rate": 1.2910361668282718e-05, + "loss": 0.0002, + "step": 4654 + }, + { + "epoch": 0.4656, + "grad_norm": 0.10105755180120468, + "learning_rate": 1.2903681849470528e-05, + "loss": 0.0011, + "step": 4656 + }, + { + "epoch": 0.4658, + "grad_norm": 0.683431088924408, + "learning_rate": 1.2897000615439075e-05, + "loss": 0.0068, + "step": 4658 + }, + { + "epoch": 0.466, + "grad_norm": 0.38173237442970276, + "learning_rate": 1.2890317969444716e-05, + "loss": 0.1446, + "step": 4660 + }, + { + "epoch": 0.4662, + "grad_norm": 1.078770399093628, + "learning_rate": 1.2883633914744493e-05, + "loss": 0.0154, + "step": 4662 + }, + { + "epoch": 0.4664, + "grad_norm": 0.020040517672896385, + "learning_rate": 1.287694845459613e-05, + "loss": 0.0863, + "step": 4664 + }, + { + "epoch": 0.4666, + "grad_norm": 0.11527664214372635, + "learning_rate": 1.2870261592258038e-05, + "loss": 0.0128, + "step": 4666 + }, + { + "epoch": 0.4668, + "grad_norm": 0.007066130638122559, + "learning_rate": 1.2863573330989315e-05, + "loss": 0.0225, + "step": 4668 + }, + { + "epoch": 0.467, + "grad_norm": 0.01157929003238678, + "learning_rate": 1.2856883674049736e-05, + "loss": 0.249, + "step": 4670 + }, + { + "epoch": 0.4672, + "grad_norm": 2.2256672382354736, + "learning_rate": 1.2850192624699762e-05, + "loss": 0.1484, + "step": 4672 + }, + { + "epoch": 0.4674, + "grad_norm": 0.3420431315898895, + "learning_rate": 1.2843500186200529e-05, + "loss": 0.013, + "step": 4674 + }, + { + "epoch": 0.4676, + "grad_norm": 7.132969379425049, + "learning_rate": 1.2836806361813846e-05, + "loss": 0.2039, + "step": 4676 + }, + { + "epoch": 0.4678, + "grad_norm": 0.008713888004422188, + "learning_rate": 1.2830111154802203e-05, + "loss": 0.0005, + "step": 4678 + }, + { + "epoch": 0.468, + "grad_norm": 0.007720759138464928, + "learning_rate": 1.2823414568428767e-05, + "loss": 0.0601, + "step": 4680 + }, + { + "epoch": 0.4682, + "grad_norm": 0.027655847370624542, + "learning_rate": 1.2816716605957366e-05, + "loss": 0.0024, + "step": 4682 + }, + { + "epoch": 0.4684, + "grad_norm": 0.006037370301783085, + "learning_rate": 1.2810017270652513e-05, + "loss": 0.0002, + "step": 4684 + }, + { + "epoch": 0.4686, + "grad_norm": 0.40826937556266785, + "learning_rate": 1.2803316565779378e-05, + "loss": 0.011, + "step": 4686 + }, + { + "epoch": 0.4688, + "grad_norm": 0.05440462380647659, + "learning_rate": 1.27966144946038e-05, + "loss": 0.0056, + "step": 4688 + }, + { + "epoch": 0.469, + "grad_norm": 0.006343033630400896, + "learning_rate": 1.2789911060392295e-05, + "loss": 0.0002, + "step": 4690 + }, + { + "epoch": 0.4692, + "grad_norm": 7.344333648681641, + "learning_rate": 1.278320626641203e-05, + "loss": 0.3648, + "step": 4692 + }, + { + "epoch": 0.4694, + "grad_norm": 0.03792087733745575, + "learning_rate": 1.2776500115930842e-05, + "loss": 0.004, + "step": 4694 + }, + { + "epoch": 0.4696, + "grad_norm": 0.0032740081660449505, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.0006, + "step": 4696 + }, + { + "epoch": 0.4698, + "grad_norm": 0.001952683669514954, + "learning_rate": 1.2763083758540337e-05, + "loss": 0.0162, + "step": 4698 + }, + { + "epoch": 0.47, + "grad_norm": 0.3980342447757721, + "learning_rate": 1.2756373558169992e-05, + "loss": 0.0236, + "step": 4700 + }, + { + "epoch": 0.4702, + "grad_norm": 0.027441302314400673, + "learning_rate": 1.2749662014376662e-05, + "loss": 0.0024, + "step": 4702 + }, + { + "epoch": 0.4704, + "grad_norm": 0.002660147612914443, + "learning_rate": 1.2742949130431468e-05, + "loss": 0.055, + "step": 4704 + }, + { + "epoch": 0.4706, + "grad_norm": 0.018689384683966637, + "learning_rate": 1.2736234909606186e-05, + "loss": 0.0005, + "step": 4706 + }, + { + "epoch": 0.4708, + "grad_norm": 0.00509532680734992, + "learning_rate": 1.2729519355173254e-05, + "loss": 0.0007, + "step": 4708 + }, + { + "epoch": 0.471, + "grad_norm": 0.17001405358314514, + "learning_rate": 1.2722802470405744e-05, + "loss": 0.013, + "step": 4710 + }, + { + "epoch": 0.4712, + "grad_norm": 0.012736499309539795, + "learning_rate": 1.2716084258577388e-05, + "loss": 0.0064, + "step": 4712 + }, + { + "epoch": 0.4714, + "grad_norm": 0.014948666095733643, + "learning_rate": 1.270936472296256e-05, + "loss": 0.0029, + "step": 4714 + }, + { + "epoch": 0.4716, + "grad_norm": 0.9166982173919678, + "learning_rate": 1.270264386683628e-05, + "loss": 0.0056, + "step": 4716 + }, + { + "epoch": 0.4718, + "grad_norm": 0.4435138404369354, + "learning_rate": 1.2695921693474211e-05, + "loss": 0.0085, + "step": 4718 + }, + { + "epoch": 0.472, + "grad_norm": 0.42634761333465576, + "learning_rate": 1.2689198206152657e-05, + "loss": 0.0098, + "step": 4720 + }, + { + "epoch": 0.4722, + "grad_norm": 0.0416552796959877, + "learning_rate": 1.268247340814857e-05, + "loss": 0.0009, + "step": 4722 + }, + { + "epoch": 0.4724, + "grad_norm": 0.5749559998512268, + "learning_rate": 1.2675747302739528e-05, + "loss": 0.0113, + "step": 4724 + }, + { + "epoch": 0.4726, + "grad_norm": 0.6518208384513855, + "learning_rate": 1.2669019893203758e-05, + "loss": 0.0047, + "step": 4726 + }, + { + "epoch": 0.4728, + "grad_norm": 0.059844255447387695, + "learning_rate": 1.2662291182820115e-05, + "loss": 0.0028, + "step": 4728 + }, + { + "epoch": 0.473, + "grad_norm": 0.10520908236503601, + "learning_rate": 1.265556117486809e-05, + "loss": 0.0023, + "step": 4730 + }, + { + "epoch": 0.4732, + "grad_norm": 0.024799227714538574, + "learning_rate": 1.2648829872627809e-05, + "loss": 0.0008, + "step": 4732 + }, + { + "epoch": 0.4734, + "grad_norm": 0.029013769701123238, + "learning_rate": 1.2642097279380025e-05, + "loss": 0.0006, + "step": 4734 + }, + { + "epoch": 0.4736, + "grad_norm": 0.6678987145423889, + "learning_rate": 1.263536339840613e-05, + "loss": 0.0102, + "step": 4736 + }, + { + "epoch": 0.4738, + "grad_norm": 0.04675715044140816, + "learning_rate": 1.2628628232988123e-05, + "loss": 0.0103, + "step": 4738 + }, + { + "epoch": 0.474, + "grad_norm": 0.17973555624485016, + "learning_rate": 1.2621891786408648e-05, + "loss": 0.003, + "step": 4740 + }, + { + "epoch": 0.4742, + "grad_norm": 0.02979743853211403, + "learning_rate": 1.261515406195097e-05, + "loss": 0.0975, + "step": 4742 + }, + { + "epoch": 0.4744, + "grad_norm": 0.3622768521308899, + "learning_rate": 1.2608415062898971e-05, + "loss": 0.0046, + "step": 4744 + }, + { + "epoch": 0.4746, + "grad_norm": 0.016521204262971878, + "learning_rate": 1.2601674792537157e-05, + "loss": 0.0004, + "step": 4746 + }, + { + "epoch": 0.4748, + "grad_norm": 0.04998796433210373, + "learning_rate": 1.2594933254150654e-05, + "loss": 0.0214, + "step": 4748 + }, + { + "epoch": 0.475, + "grad_norm": 0.0882638469338417, + "learning_rate": 1.2588190451025209e-05, + "loss": 0.0015, + "step": 4750 + }, + { + "epoch": 0.4752, + "grad_norm": 0.00765781057998538, + "learning_rate": 1.2581446386447178e-05, + "loss": 0.0011, + "step": 4752 + }, + { + "epoch": 0.4754, + "grad_norm": 0.9342531561851501, + "learning_rate": 1.257470106370354e-05, + "loss": 0.0106, + "step": 4754 + }, + { + "epoch": 0.4756, + "grad_norm": 0.5248991847038269, + "learning_rate": 1.256795448608188e-05, + "loss": 0.0082, + "step": 4756 + }, + { + "epoch": 0.4758, + "grad_norm": 0.1074688732624054, + "learning_rate": 1.2561206656870397e-05, + "loss": 0.0196, + "step": 4758 + }, + { + "epoch": 0.476, + "grad_norm": 0.04217443987727165, + "learning_rate": 1.2554457579357906e-05, + "loss": 0.0009, + "step": 4760 + }, + { + "epoch": 0.4762, + "grad_norm": 0.04064987972378731, + "learning_rate": 1.2547707256833823e-05, + "loss": 0.0019, + "step": 4762 + }, + { + "epoch": 0.4764, + "grad_norm": 0.006909624207764864, + "learning_rate": 1.2540955692588173e-05, + "loss": 0.0004, + "step": 4764 + }, + { + "epoch": 0.4766, + "grad_norm": 0.016788315027952194, + "learning_rate": 1.2534202889911584e-05, + "loss": 0.0007, + "step": 4766 + }, + { + "epoch": 0.4768, + "grad_norm": 0.06506343185901642, + "learning_rate": 1.2527448852095295e-05, + "loss": 0.0039, + "step": 4768 + }, + { + "epoch": 0.477, + "grad_norm": 0.012688920833170414, + "learning_rate": 1.252069358243114e-05, + "loss": 0.0074, + "step": 4770 + }, + { + "epoch": 0.4772, + "grad_norm": 0.032042741775512695, + "learning_rate": 1.251393708421155e-05, + "loss": 0.0636, + "step": 4772 + }, + { + "epoch": 0.4774, + "grad_norm": 0.012870047241449356, + "learning_rate": 1.2507179360729569e-05, + "loss": 0.0295, + "step": 4774 + }, + { + "epoch": 0.4776, + "grad_norm": 0.86605304479599, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.0031, + "step": 4776 + }, + { + "epoch": 0.4778, + "grad_norm": 0.07407654821872711, + "learning_rate": 1.249366025115354e-05, + "loss": 0.0026, + "step": 4778 + }, + { + "epoch": 0.478, + "grad_norm": 0.008723016828298569, + "learning_rate": 1.2486898871648552e-05, + "loss": 0.0003, + "step": 4780 + }, + { + "epoch": 0.4782, + "grad_norm": 0.3270271420478821, + "learning_rate": 1.2480136280059256e-05, + "loss": 0.0018, + "step": 4782 + }, + { + "epoch": 0.4784, + "grad_norm": 0.055202171206474304, + "learning_rate": 1.2473372479681671e-05, + "loss": 0.0039, + "step": 4784 + }, + { + "epoch": 0.4786, + "grad_norm": 1.048992395401001, + "learning_rate": 1.2466607473812386e-05, + "loss": 0.0545, + "step": 4786 + }, + { + "epoch": 0.4788, + "grad_norm": 1.0141496658325195, + "learning_rate": 1.2459841265748582e-05, + "loss": 0.0798, + "step": 4788 + }, + { + "epoch": 0.479, + "grad_norm": 0.001779337297193706, + "learning_rate": 1.2453073858788027e-05, + "loss": 0.0005, + "step": 4790 + }, + { + "epoch": 0.4792, + "grad_norm": 0.008690553717315197, + "learning_rate": 1.2446305256229074e-05, + "loss": 0.0002, + "step": 4792 + }, + { + "epoch": 0.4794, + "grad_norm": 0.05628308281302452, + "learning_rate": 1.2439535461370658e-05, + "loss": 0.0013, + "step": 4794 + }, + { + "epoch": 0.4796, + "grad_norm": 0.002492513507604599, + "learning_rate": 1.2432764477512294e-05, + "loss": 0.0001, + "step": 4796 + }, + { + "epoch": 0.4798, + "grad_norm": 0.04145023226737976, + "learning_rate": 1.2425992307954075e-05, + "loss": 0.0011, + "step": 4798 + }, + { + "epoch": 0.48, + "grad_norm": 0.00400395505130291, + "learning_rate": 1.2419218955996677e-05, + "loss": 0.0002, + "step": 4800 + }, + { + "epoch": 0.4802, + "grad_norm": 0.027546226978302002, + "learning_rate": 1.241244442494135e-05, + "loss": 0.0013, + "step": 4802 + }, + { + "epoch": 0.4804, + "grad_norm": 7.194404602050781, + "learning_rate": 1.2405668718089918e-05, + "loss": 0.1423, + "step": 4804 + }, + { + "epoch": 0.4806, + "grad_norm": 0.030482744798064232, + "learning_rate": 1.2398891838744777e-05, + "loss": 0.0004, + "step": 4806 + }, + { + "epoch": 0.4808, + "grad_norm": 0.09253943711519241, + "learning_rate": 1.2392113790208895e-05, + "loss": 0.001, + "step": 4808 + }, + { + "epoch": 0.481, + "grad_norm": 0.7763925194740295, + "learning_rate": 1.238533457578581e-05, + "loss": 0.0139, + "step": 4810 + }, + { + "epoch": 0.4812, + "grad_norm": 0.13028913736343384, + "learning_rate": 1.2378554198779632e-05, + "loss": 0.0022, + "step": 4812 + }, + { + "epoch": 0.4814, + "grad_norm": 2.9474382400512695, + "learning_rate": 1.2371772662495031e-05, + "loss": 0.0531, + "step": 4814 + }, + { + "epoch": 0.4816, + "grad_norm": 0.2969997823238373, + "learning_rate": 1.236498997023725e-05, + "loss": 0.1551, + "step": 4816 + }, + { + "epoch": 0.4818, + "grad_norm": 0.044091012328863144, + "learning_rate": 1.2358206125312085e-05, + "loss": 0.0009, + "step": 4818 + }, + { + "epoch": 0.482, + "grad_norm": 0.5534703731536865, + "learning_rate": 1.23514211310259e-05, + "loss": 0.0021, + "step": 4820 + }, + { + "epoch": 0.4822, + "grad_norm": 0.03960588574409485, + "learning_rate": 1.2344634990685624e-05, + "loss": 0.0006, + "step": 4822 + }, + { + "epoch": 0.4824, + "grad_norm": 0.04439321905374527, + "learning_rate": 1.2337847707598738e-05, + "loss": 0.0009, + "step": 4824 + }, + { + "epoch": 0.4826, + "grad_norm": 4.758847236633301, + "learning_rate": 1.233105928507328e-05, + "loss": 0.1749, + "step": 4826 + }, + { + "epoch": 0.4828, + "grad_norm": 0.2427894026041031, + "learning_rate": 1.2324269726417841e-05, + "loss": 0.003, + "step": 4828 + }, + { + "epoch": 0.483, + "grad_norm": 0.014088919386267662, + "learning_rate": 1.2317479034941572e-05, + "loss": 0.0018, + "step": 4830 + }, + { + "epoch": 0.4832, + "grad_norm": 0.00970782246440649, + "learning_rate": 1.2310687213954182e-05, + "loss": 0.0015, + "step": 4832 + }, + { + "epoch": 0.4834, + "grad_norm": 0.04705693945288658, + "learning_rate": 1.2303894266765908e-05, + "loss": 0.0008, + "step": 4834 + }, + { + "epoch": 0.4836, + "grad_norm": 0.002432899083942175, + "learning_rate": 1.2297100196687557e-05, + "loss": 0.0045, + "step": 4836 + }, + { + "epoch": 0.4838, + "grad_norm": 0.021786682307720184, + "learning_rate": 1.2290305007030479e-05, + "loss": 0.004, + "step": 4838 + }, + { + "epoch": 0.484, + "grad_norm": 0.1607823222875595, + "learning_rate": 1.2283508701106559e-05, + "loss": 0.0032, + "step": 4840 + }, + { + "epoch": 0.4842, + "grad_norm": 0.003314563538879156, + "learning_rate": 1.2276711282228241e-05, + "loss": 0.0003, + "step": 4842 + }, + { + "epoch": 0.4844, + "grad_norm": 0.00965543556958437, + "learning_rate": 1.2269912753708502e-05, + "loss": 0.0002, + "step": 4844 + }, + { + "epoch": 0.4846, + "grad_norm": 0.006575581151992083, + "learning_rate": 1.226311311886086e-05, + "loss": 0.1199, + "step": 4846 + }, + { + "epoch": 0.4848, + "grad_norm": 0.009454894810914993, + "learning_rate": 1.2256312380999376e-05, + "loss": 0.0025, + "step": 4848 + }, + { + "epoch": 0.485, + "grad_norm": 0.012494003400206566, + "learning_rate": 1.2249510543438652e-05, + "loss": 0.0002, + "step": 4850 + }, + { + "epoch": 0.4852, + "grad_norm": 0.002781290328130126, + "learning_rate": 1.2242707609493814e-05, + "loss": 0.0002, + "step": 4852 + }, + { + "epoch": 0.4854, + "grad_norm": 2.396446704864502, + "learning_rate": 1.223590358248053e-05, + "loss": 0.108, + "step": 4854 + }, + { + "epoch": 0.4856, + "grad_norm": 0.4151923656463623, + "learning_rate": 1.2229098465715005e-05, + "loss": 0.0139, + "step": 4856 + }, + { + "epoch": 0.4858, + "grad_norm": 0.00530809722840786, + "learning_rate": 1.2222292262513967e-05, + "loss": 0.2232, + "step": 4858 + }, + { + "epoch": 0.486, + "grad_norm": 1.2294361591339111, + "learning_rate": 1.2215484976194675e-05, + "loss": 0.0254, + "step": 4860 + }, + { + "epoch": 0.4862, + "grad_norm": 0.5693625807762146, + "learning_rate": 1.220867661007492e-05, + "loss": 0.0095, + "step": 4862 + }, + { + "epoch": 0.4864, + "grad_norm": 0.6662741899490356, + "learning_rate": 1.2201867167473015e-05, + "loss": 0.0295, + "step": 4864 + }, + { + "epoch": 0.4866, + "grad_norm": 0.007327545899897814, + "learning_rate": 1.2195056651707806e-05, + "loss": 0.0005, + "step": 4866 + }, + { + "epoch": 0.4868, + "grad_norm": 0.19217099249362946, + "learning_rate": 1.2188245066098647e-05, + "loss": 0.003, + "step": 4868 + }, + { + "epoch": 0.487, + "grad_norm": 0.02174895629286766, + "learning_rate": 1.2181432413965428e-05, + "loss": 0.0005, + "step": 4870 + }, + { + "epoch": 0.4872, + "grad_norm": 0.02029268629848957, + "learning_rate": 1.217461869862855e-05, + "loss": 0.0886, + "step": 4872 + }, + { + "epoch": 0.4874, + "grad_norm": 0.09311186522245407, + "learning_rate": 1.2167803923408935e-05, + "loss": 0.0014, + "step": 4874 + }, + { + "epoch": 0.4876, + "grad_norm": 0.006149850320070982, + "learning_rate": 1.2160988091628023e-05, + "loss": 0.0252, + "step": 4876 + }, + { + "epoch": 0.4878, + "grad_norm": 3.897334098815918, + "learning_rate": 1.2154171206607765e-05, + "loss": 0.0714, + "step": 4878 + }, + { + "epoch": 0.488, + "grad_norm": 0.004754783120006323, + "learning_rate": 1.2147353271670634e-05, + "loss": 0.0038, + "step": 4880 + }, + { + "epoch": 0.4882, + "grad_norm": 0.006104717962443829, + "learning_rate": 1.2140534290139601e-05, + "loss": 0.0021, + "step": 4882 + }, + { + "epoch": 0.4884, + "grad_norm": 0.040922634303569794, + "learning_rate": 1.2133714265338162e-05, + "loss": 0.0014, + "step": 4884 + }, + { + "epoch": 0.4886, + "grad_norm": 0.9207740426063538, + "learning_rate": 1.2126893200590309e-05, + "loss": 0.0167, + "step": 4886 + }, + { + "epoch": 0.4888, + "grad_norm": 0.4556030035018921, + "learning_rate": 1.212007109922055e-05, + "loss": 0.009, + "step": 4888 + }, + { + "epoch": 0.489, + "grad_norm": 1.64279305934906, + "learning_rate": 1.211324796455389e-05, + "loss": 0.0223, + "step": 4890 + }, + { + "epoch": 0.4892, + "grad_norm": 0.039349909871816635, + "learning_rate": 1.2106423799915841e-05, + "loss": 0.0252, + "step": 4892 + }, + { + "epoch": 0.4894, + "grad_norm": 0.029827672988176346, + "learning_rate": 1.2099598608632427e-05, + "loss": 0.0007, + "step": 4894 + }, + { + "epoch": 0.4896, + "grad_norm": 0.003958471119403839, + "learning_rate": 1.2092772394030153e-05, + "loss": 0.0003, + "step": 4896 + }, + { + "epoch": 0.4898, + "grad_norm": 0.6724847555160522, + "learning_rate": 1.208594515943604e-05, + "loss": 0.0134, + "step": 4898 + }, + { + "epoch": 0.49, + "grad_norm": 5.414525032043457, + "learning_rate": 1.2079116908177592e-05, + "loss": 0.0851, + "step": 4900 + }, + { + "epoch": 0.4902, + "grad_norm": 0.005068269558250904, + "learning_rate": 1.2072287643582825e-05, + "loss": 0.2532, + "step": 4902 + }, + { + "epoch": 0.4904, + "grad_norm": 0.005374759901314974, + "learning_rate": 1.2065457368980236e-05, + "loss": 0.0003, + "step": 4904 + }, + { + "epoch": 0.4906, + "grad_norm": 0.019918736070394516, + "learning_rate": 1.2058626087698814e-05, + "loss": 0.0009, + "step": 4906 + }, + { + "epoch": 0.4908, + "grad_norm": 0.0057598380371928215, + "learning_rate": 1.2051793803068046e-05, + "loss": 0.0013, + "step": 4908 + }, + { + "epoch": 0.491, + "grad_norm": 0.004915578290820122, + "learning_rate": 1.2044960518417902e-05, + "loss": 0.0001, + "step": 4910 + }, + { + "epoch": 0.4912, + "grad_norm": 0.380962610244751, + "learning_rate": 1.203812623707885e-05, + "loss": 0.0036, + "step": 4912 + }, + { + "epoch": 0.4914, + "grad_norm": 0.002852587029337883, + "learning_rate": 1.2031290962381823e-05, + "loss": 0.0004, + "step": 4914 + }, + { + "epoch": 0.4916, + "grad_norm": 0.0033303718082606792, + "learning_rate": 1.202445469765826e-05, + "loss": 0.0464, + "step": 4916 + }, + { + "epoch": 0.4918, + "grad_norm": 0.1755681037902832, + "learning_rate": 1.201761744624007e-05, + "loss": 0.003, + "step": 4918 + }, + { + "epoch": 0.492, + "grad_norm": 0.3475470244884491, + "learning_rate": 1.2010779211459649e-05, + "loss": 0.046, + "step": 4920 + }, + { + "epoch": 0.4922, + "grad_norm": 0.1542423665523529, + "learning_rate": 1.2003939996649864e-05, + "loss": 0.0141, + "step": 4922 + }, + { + "epoch": 0.4924, + "grad_norm": 0.004593782592564821, + "learning_rate": 1.1997099805144071e-05, + "loss": 0.0118, + "step": 4924 + }, + { + "epoch": 0.4926, + "grad_norm": 0.011740049347281456, + "learning_rate": 1.1990258640276094e-05, + "loss": 0.0041, + "step": 4926 + }, + { + "epoch": 0.4928, + "grad_norm": 0.23000331223011017, + "learning_rate": 1.1983416505380234e-05, + "loss": 0.0067, + "step": 4928 + }, + { + "epoch": 0.493, + "grad_norm": 1.9113956689834595, + "learning_rate": 1.1976573403791263e-05, + "loss": 0.0168, + "step": 4930 + }, + { + "epoch": 0.4932, + "grad_norm": 0.6684818267822266, + "learning_rate": 1.1969729338844429e-05, + "loss": 0.1611, + "step": 4932 + }, + { + "epoch": 0.4934, + "grad_norm": 0.00442750146612525, + "learning_rate": 1.196288431387544e-05, + "loss": 0.0001, + "step": 4934 + }, + { + "epoch": 0.4936, + "grad_norm": 0.23943960666656494, + "learning_rate": 1.1956038332220484e-05, + "loss": 0.0023, + "step": 4936 + }, + { + "epoch": 0.4938, + "grad_norm": 0.0022925022058188915, + "learning_rate": 1.1949191397216207e-05, + "loss": 0.0001, + "step": 4938 + }, + { + "epoch": 0.494, + "grad_norm": 0.7065297961235046, + "learning_rate": 1.194234351219972e-05, + "loss": 0.0095, + "step": 4940 + }, + { + "epoch": 0.4942, + "grad_norm": 5.739408493041992, + "learning_rate": 1.1935494680508606e-05, + "loss": 0.0633, + "step": 4942 + }, + { + "epoch": 0.4944, + "grad_norm": 0.11675383150577545, + "learning_rate": 1.192864490548089e-05, + "loss": 0.0015, + "step": 4944 + }, + { + "epoch": 0.4946, + "grad_norm": 0.5026741623878479, + "learning_rate": 1.1921794190455082e-05, + "loss": 0.009, + "step": 4946 + }, + { + "epoch": 0.4948, + "grad_norm": 0.000695785041898489, + "learning_rate": 1.191494253877013e-05, + "loss": 0.0817, + "step": 4948 + }, + { + "epoch": 0.495, + "grad_norm": 0.003841114230453968, + "learning_rate": 1.190808995376545e-05, + "loss": 0.0014, + "step": 4950 + }, + { + "epoch": 0.4952, + "grad_norm": 0.30832067131996155, + "learning_rate": 1.1901236438780902e-05, + "loss": 0.0054, + "step": 4952 + }, + { + "epoch": 0.4954, + "grad_norm": 0.0012745620915666223, + "learning_rate": 1.1894381997156814e-05, + "loss": 0.0002, + "step": 4954 + }, + { + "epoch": 0.4956, + "grad_norm": 0.03574787825345993, + "learning_rate": 1.1887526632233954e-05, + "loss": 0.0007, + "step": 4956 + }, + { + "epoch": 0.4958, + "grad_norm": 0.000626647612079978, + "learning_rate": 1.188067034735354e-05, + "loss": 0.0, + "step": 4958 + }, + { + "epoch": 0.496, + "grad_norm": 0.0029249959625303745, + "learning_rate": 1.187381314585725e-05, + "loss": 0.003, + "step": 4960 + }, + { + "epoch": 0.4962, + "grad_norm": 0.0009499417501501739, + "learning_rate": 1.186695503108719e-05, + "loss": 0.0002, + "step": 4962 + }, + { + "epoch": 0.4964, + "grad_norm": 0.11263551563024521, + "learning_rate": 1.186009600638593e-05, + "loss": 0.0095, + "step": 4964 + }, + { + "epoch": 0.4966, + "grad_norm": 0.0011778415646404028, + "learning_rate": 1.1853236075096474e-05, + "loss": 0.0001, + "step": 4966 + }, + { + "epoch": 0.4968, + "grad_norm": 5.410984992980957, + "learning_rate": 1.184637524056227e-05, + "loss": 0.0991, + "step": 4968 + }, + { + "epoch": 0.497, + "grad_norm": 3.9701294898986816, + "learning_rate": 1.1839513506127202e-05, + "loss": 0.1915, + "step": 4970 + }, + { + "epoch": 0.4972, + "grad_norm": 0.0016444140346720815, + "learning_rate": 1.1832650875135599e-05, + "loss": 0.0, + "step": 4972 + }, + { + "epoch": 0.4974, + "grad_norm": 0.0024806547444313765, + "learning_rate": 1.1825787350932224e-05, + "loss": 0.0001, + "step": 4974 + }, + { + "epoch": 0.4976, + "grad_norm": 0.006723110098391771, + "learning_rate": 1.181892293686227e-05, + "loss": 0.0011, + "step": 4976 + }, + { + "epoch": 0.4978, + "grad_norm": 0.002986724954098463, + "learning_rate": 1.1812057636271374e-05, + "loss": 0.0676, + "step": 4978 + }, + { + "epoch": 0.498, + "grad_norm": 0.00792678166180849, + "learning_rate": 1.1805191452505602e-05, + "loss": 0.0027, + "step": 4980 + }, + { + "epoch": 0.4982, + "grad_norm": 4.030731201171875, + "learning_rate": 1.1798324388911445e-05, + "loss": 0.0425, + "step": 4982 + }, + { + "epoch": 0.4984, + "grad_norm": 0.004671743605285883, + "learning_rate": 1.1791456448835825e-05, + "loss": 0.0001, + "step": 4984 + }, + { + "epoch": 0.4986, + "grad_norm": 0.5931873321533203, + "learning_rate": 1.1784587635626095e-05, + "loss": 0.0073, + "step": 4986 + }, + { + "epoch": 0.4988, + "grad_norm": 0.00902344100177288, + "learning_rate": 1.1777717952630033e-05, + "loss": 0.0082, + "step": 4988 + }, + { + "epoch": 0.499, + "grad_norm": 0.013310415670275688, + "learning_rate": 1.1770847403195836e-05, + "loss": 0.0268, + "step": 4990 + }, + { + "epoch": 0.4992, + "grad_norm": 0.0004276745021343231, + "learning_rate": 1.1763975990672125e-05, + "loss": 0.0001, + "step": 4992 + }, + { + "epoch": 0.4994, + "grad_norm": 0.027563774958252907, + "learning_rate": 1.1757103718407948e-05, + "loss": 0.0003, + "step": 4994 + }, + { + "epoch": 0.4996, + "grad_norm": 0.002570014912635088, + "learning_rate": 1.1750230589752763e-05, + "loss": 0.0001, + "step": 4996 + }, + { + "epoch": 0.4998, + "grad_norm": 0.014624022878706455, + "learning_rate": 1.1743356608056448e-05, + "loss": 0.0002, + "step": 4998 + }, + { + "epoch": 0.5, + "grad_norm": 0.16040463745594025, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.0025, + "step": 5000 + }, + { + "epoch": 0.5002, + "grad_norm": 0.0015741355018690228, + "learning_rate": 1.1729606098942039e-05, + "loss": 0.0, + "step": 5002 + }, + { + "epoch": 0.5004, + "grad_norm": 0.005802201107144356, + "learning_rate": 1.1722729578225769e-05, + "loss": 0.0002, + "step": 5004 + }, + { + "epoch": 0.5006, + "grad_norm": 0.12660819292068481, + "learning_rate": 1.171585221787203e-05, + "loss": 0.0016, + "step": 5006 + }, + { + "epoch": 0.5008, + "grad_norm": 0.0018554242560639977, + "learning_rate": 1.1708974021232768e-05, + "loss": 0.0752, + "step": 5008 + }, + { + "epoch": 0.501, + "grad_norm": 0.004762299358844757, + "learning_rate": 1.1702094991660326e-05, + "loss": 0.0001, + "step": 5010 + }, + { + "epoch": 0.5012, + "grad_norm": 0.0002570402866695076, + "learning_rate": 1.1695215132507465e-05, + "loss": 0.0004, + "step": 5012 + }, + { + "epoch": 0.5014, + "grad_norm": 0.023528771474957466, + "learning_rate": 1.1688334447127338e-05, + "loss": 0.0466, + "step": 5014 + }, + { + "epoch": 0.5016, + "grad_norm": 0.054228276014328, + "learning_rate": 1.1681452938873516e-05, + "loss": 0.0205, + "step": 5016 + }, + { + "epoch": 0.5018, + "grad_norm": 0.0063551440834999084, + "learning_rate": 1.1674570611099956e-05, + "loss": 0.0003, + "step": 5018 + }, + { + "epoch": 0.502, + "grad_norm": 0.0010154942283406854, + "learning_rate": 1.1667687467161025e-05, + "loss": 0.0044, + "step": 5020 + }, + { + "epoch": 0.5022, + "grad_norm": 0.008705521002411842, + "learning_rate": 1.166080351041148e-05, + "loss": 0.0001, + "step": 5022 + }, + { + "epoch": 0.5024, + "grad_norm": 0.0015981526812538505, + "learning_rate": 1.1653918744206478e-05, + "loss": 0.0057, + "step": 5024 + }, + { + "epoch": 0.5026, + "grad_norm": 0.0012694841716438532, + "learning_rate": 1.1647033171901573e-05, + "loss": 0.0007, + "step": 5026 + }, + { + "epoch": 0.5028, + "grad_norm": 0.044417500495910645, + "learning_rate": 1.1640146796852711e-05, + "loss": 0.0004, + "step": 5028 + }, + { + "epoch": 0.503, + "grad_norm": 0.39796027541160583, + "learning_rate": 1.1633259622416224e-05, + "loss": 0.0053, + "step": 5030 + }, + { + "epoch": 0.5032, + "grad_norm": 0.0010648678289726377, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.0003, + "step": 5032 + }, + { + "epoch": 0.5034, + "grad_norm": 0.0021903684828430414, + "learning_rate": 1.1619482888807662e-05, + "loss": 0.0001, + "step": 5034 + }, + { + "epoch": 0.5036, + "grad_norm": 0.11197850853204727, + "learning_rate": 1.1612593336350209e-05, + "loss": 0.0006, + "step": 5036 + }, + { + "epoch": 0.5038, + "grad_norm": 0.006284693256020546, + "learning_rate": 1.1605702997934345e-05, + "loss": 0.0002, + "step": 5038 + }, + { + "epoch": 0.504, + "grad_norm": 0.0033875086810439825, + "learning_rate": 1.159881187691835e-05, + "loss": 0.1308, + "step": 5040 + }, + { + "epoch": 0.5042, + "grad_norm": 0.002710583619773388, + "learning_rate": 1.1591919976660867e-05, + "loss": 0.0036, + "step": 5042 + }, + { + "epoch": 0.5044, + "grad_norm": 0.04057636111974716, + "learning_rate": 1.158502730052093e-05, + "loss": 0.0006, + "step": 5044 + }, + { + "epoch": 0.5046, + "grad_norm": 0.005807689391076565, + "learning_rate": 1.157813385185794e-05, + "loss": 0.0005, + "step": 5046 + }, + { + "epoch": 0.5048, + "grad_norm": 0.003710818709805608, + "learning_rate": 1.157123963403168e-05, + "loss": 0.0001, + "step": 5048 + }, + { + "epoch": 0.505, + "grad_norm": 0.006704575382173061, + "learning_rate": 1.156434465040231e-05, + "loss": 0.0001, + "step": 5050 + }, + { + "epoch": 0.5052, + "grad_norm": 0.0208235215395689, + "learning_rate": 1.1557448904330362e-05, + "loss": 0.0003, + "step": 5052 + }, + { + "epoch": 0.5054, + "grad_norm": 0.004932885989546776, + "learning_rate": 1.155055239917674e-05, + "loss": 0.0203, + "step": 5054 + }, + { + "epoch": 0.5056, + "grad_norm": 0.007799352984875441, + "learning_rate": 1.1543655138302714e-05, + "loss": 0.0001, + "step": 5056 + }, + { + "epoch": 0.5058, + "grad_norm": 0.0017768051475286484, + "learning_rate": 1.1536757125069924e-05, + "loss": 0.0001, + "step": 5058 + }, + { + "epoch": 0.506, + "grad_norm": 0.8444119691848755, + "learning_rate": 1.1529858362840383e-05, + "loss": 0.0118, + "step": 5060 + }, + { + "epoch": 0.5062, + "grad_norm": 0.0011019686935469508, + "learning_rate": 1.1522958854976458e-05, + "loss": 0.0, + "step": 5062 + }, + { + "epoch": 0.5064, + "grad_norm": 0.06644455343484879, + "learning_rate": 1.1516058604840891e-05, + "loss": 0.0011, + "step": 5064 + }, + { + "epoch": 0.5066, + "grad_norm": 17.787294387817383, + "learning_rate": 1.1509157615796775e-05, + "loss": 0.12, + "step": 5066 + }, + { + "epoch": 0.5068, + "grad_norm": 0.0020676737185567617, + "learning_rate": 1.1502255891207572e-05, + "loss": 0.0, + "step": 5068 + }, + { + "epoch": 0.507, + "grad_norm": 0.08391569554805756, + "learning_rate": 1.1495353434437098e-05, + "loss": 0.0984, + "step": 5070 + }, + { + "epoch": 0.5072, + "grad_norm": 0.005483422894030809, + "learning_rate": 1.1488450248849523e-05, + "loss": 0.001, + "step": 5072 + }, + { + "epoch": 0.5074, + "grad_norm": 0.0014438582584261894, + "learning_rate": 1.1481546337809381e-05, + "loss": 0.0001, + "step": 5074 + }, + { + "epoch": 0.5076, + "grad_norm": 0.0015402055578306317, + "learning_rate": 1.1474641704681551e-05, + "loss": 0.0001, + "step": 5076 + }, + { + "epoch": 0.5078, + "grad_norm": 0.5452094078063965, + "learning_rate": 1.1467736352831266e-05, + "loss": 0.0191, + "step": 5078 + }, + { + "epoch": 0.508, + "grad_norm": 0.001232723705470562, + "learning_rate": 1.1460830285624119e-05, + "loss": 0.0002, + "step": 5080 + }, + { + "epoch": 0.5082, + "grad_norm": 0.0008251170511357486, + "learning_rate": 1.1453923506426032e-05, + "loss": 0.0005, + "step": 5082 + }, + { + "epoch": 0.5084, + "grad_norm": 0.0021602152846753597, + "learning_rate": 1.1447016018603293e-05, + "loss": 0.0754, + "step": 5084 + }, + { + "epoch": 0.5086, + "grad_norm": 0.03619348257780075, + "learning_rate": 1.1440107825522522e-05, + "loss": 0.0048, + "step": 5086 + }, + { + "epoch": 0.5088, + "grad_norm": 0.015428325161337852, + "learning_rate": 1.1433198930550694e-05, + "loss": 0.0004, + "step": 5088 + }, + { + "epoch": 0.509, + "grad_norm": 0.01815527305006981, + "learning_rate": 1.1426289337055119e-05, + "loss": 0.0007, + "step": 5090 + }, + { + "epoch": 0.5092, + "grad_norm": 0.09703899919986725, + "learning_rate": 1.1419379048403446e-05, + "loss": 0.0011, + "step": 5092 + }, + { + "epoch": 0.5094, + "grad_norm": 3.087618827819824, + "learning_rate": 1.141246806796367e-05, + "loss": 0.0217, + "step": 5094 + }, + { + "epoch": 0.5096, + "grad_norm": 0.009573054499924183, + "learning_rate": 1.140555639910411e-05, + "loss": 0.1373, + "step": 5096 + }, + { + "epoch": 0.5098, + "grad_norm": 0.05338152125477791, + "learning_rate": 1.1398644045193443e-05, + "loss": 0.0008, + "step": 5098 + }, + { + "epoch": 0.51, + "grad_norm": 0.8303212523460388, + "learning_rate": 1.1391731009600655e-05, + "loss": 0.0165, + "step": 5100 + }, + { + "epoch": 0.5102, + "grad_norm": 0.003168339841067791, + "learning_rate": 1.1384817295695083e-05, + "loss": 0.0003, + "step": 5102 + }, + { + "epoch": 0.5104, + "grad_norm": 0.06483063846826553, + "learning_rate": 1.137790290684638e-05, + "loss": 0.0006, + "step": 5104 + }, + { + "epoch": 0.5106, + "grad_norm": 1.6664884090423584, + "learning_rate": 1.1370987846424547e-05, + "loss": 0.015, + "step": 5106 + }, + { + "epoch": 0.5108, + "grad_norm": 0.054305993020534515, + "learning_rate": 1.1364072117799884e-05, + "loss": 0.003, + "step": 5108 + }, + { + "epoch": 0.511, + "grad_norm": 0.07861971855163574, + "learning_rate": 1.1357155724343046e-05, + "loss": 0.0009, + "step": 5110 + }, + { + "epoch": 0.5112, + "grad_norm": 0.00736458133906126, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.0003, + "step": 5112 + }, + { + "epoch": 0.5114, + "grad_norm": 0.43456703424453735, + "learning_rate": 1.1343320956417015e-05, + "loss": 0.0132, + "step": 5114 + }, + { + "epoch": 0.5116, + "grad_norm": 3.2648847103118896, + "learning_rate": 1.1336402588690727e-05, + "loss": 0.0834, + "step": 5116 + }, + { + "epoch": 0.5118, + "grad_norm": 0.031225070357322693, + "learning_rate": 1.1329483569618045e-05, + "loss": 0.0008, + "step": 5118 + }, + { + "epoch": 0.512, + "grad_norm": 0.012407049536705017, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.0002, + "step": 5120 + }, + { + "epoch": 0.5122, + "grad_norm": 0.003968607634305954, + "learning_rate": 1.1315643590922827e-05, + "loss": 0.0003, + "step": 5122 + }, + { + "epoch": 0.5124, + "grad_norm": 0.038799069821834564, + "learning_rate": 1.1308722638045724e-05, + "loss": 0.4875, + "step": 5124 + }, + { + "epoch": 0.5126, + "grad_norm": 0.0019043533829972148, + "learning_rate": 1.1301801047313106e-05, + "loss": 0.0002, + "step": 5126 + }, + { + "epoch": 0.5128, + "grad_norm": 0.49253496527671814, + "learning_rate": 1.129487882209847e-05, + "loss": 0.0073, + "step": 5128 + }, + { + "epoch": 0.513, + "grad_norm": 8.183236122131348, + "learning_rate": 1.128795596577563e-05, + "loss": 0.216, + "step": 5130 + }, + { + "epoch": 0.5132, + "grad_norm": 0.7820392847061157, + "learning_rate": 1.1281032481718696e-05, + "loss": 0.0055, + "step": 5132 + }, + { + "epoch": 0.5134, + "grad_norm": 0.09559250622987747, + "learning_rate": 1.1274108373302095e-05, + "loss": 0.0016, + "step": 5134 + }, + { + "epoch": 0.5136, + "grad_norm": 0.017955780029296875, + "learning_rate": 1.1267183643900548e-05, + "loss": 0.0007, + "step": 5136 + }, + { + "epoch": 0.5138, + "grad_norm": 0.1303221881389618, + "learning_rate": 1.1260258296889086e-05, + "loss": 0.0024, + "step": 5138 + }, + { + "epoch": 0.514, + "grad_norm": 0.021032750606536865, + "learning_rate": 1.1253332335643043e-05, + "loss": 0.0107, + "step": 5140 + }, + { + "epoch": 0.5142, + "grad_norm": 0.030578672885894775, + "learning_rate": 1.1246405763538047e-05, + "loss": 0.0007, + "step": 5142 + }, + { + "epoch": 0.5144, + "grad_norm": 0.0031001942697912455, + "learning_rate": 1.1239478583950019e-05, + "loss": 0.002, + "step": 5144 + }, + { + "epoch": 0.5146, + "grad_norm": 0.0059151374734938145, + "learning_rate": 1.1232550800255188e-05, + "loss": 0.0003, + "step": 5146 + }, + { + "epoch": 0.5148, + "grad_norm": 0.02061804011464119, + "learning_rate": 1.1225622415830068e-05, + "loss": 0.0004, + "step": 5148 + }, + { + "epoch": 0.515, + "grad_norm": 0.7972946763038635, + "learning_rate": 1.1218693434051475e-05, + "loss": 0.0199, + "step": 5150 + }, + { + "epoch": 0.5152, + "grad_norm": 0.8366619348526001, + "learning_rate": 1.1211763858296507e-05, + "loss": 0.0303, + "step": 5152 + }, + { + "epoch": 0.5154, + "grad_norm": 0.5487963557243347, + "learning_rate": 1.1204833691942553e-05, + "loss": 0.0069, + "step": 5154 + }, + { + "epoch": 0.5156, + "grad_norm": 0.07850831747055054, + "learning_rate": 1.1197902938367297e-05, + "loss": 0.0025, + "step": 5156 + }, + { + "epoch": 0.5158, + "grad_norm": 0.016984989866614342, + "learning_rate": 1.11909716009487e-05, + "loss": 0.0005, + "step": 5158 + }, + { + "epoch": 0.516, + "grad_norm": 0.6517826318740845, + "learning_rate": 1.1184039683065014e-05, + "loss": 0.0404, + "step": 5160 + }, + { + "epoch": 0.5162, + "grad_norm": 0.009693140164017677, + "learning_rate": 1.1177107188094765e-05, + "loss": 0.0018, + "step": 5162 + }, + { + "epoch": 0.5164, + "grad_norm": 6.085196495056152, + "learning_rate": 1.1170174119416778e-05, + "loss": 0.0314, + "step": 5164 + }, + { + "epoch": 0.5166, + "grad_norm": 0.0757184848189354, + "learning_rate": 1.1163240480410136e-05, + "loss": 0.0013, + "step": 5166 + }, + { + "epoch": 0.5168, + "grad_norm": 0.1107916459441185, + "learning_rate": 1.1156306274454218e-05, + "loss": 0.0021, + "step": 5168 + }, + { + "epoch": 0.517, + "grad_norm": 4.454555034637451, + "learning_rate": 1.1149371504928667e-05, + "loss": 0.0322, + "step": 5170 + }, + { + "epoch": 0.5172, + "grad_norm": 6.210215091705322, + "learning_rate": 1.1142436175213409e-05, + "loss": 0.3457, + "step": 5172 + }, + { + "epoch": 0.5174, + "grad_norm": 4.068458557128906, + "learning_rate": 1.1135500288688636e-05, + "loss": 0.0239, + "step": 5174 + }, + { + "epoch": 0.5176, + "grad_norm": 3.2004318237304688, + "learning_rate": 1.1128563848734817e-05, + "loss": 0.1019, + "step": 5176 + }, + { + "epoch": 0.5178, + "grad_norm": 0.0052442350424826145, + "learning_rate": 1.112162685873269e-05, + "loss": 0.0002, + "step": 5178 + }, + { + "epoch": 0.518, + "grad_norm": 0.5888645052909851, + "learning_rate": 1.1114689322063255e-05, + "loss": 0.0135, + "step": 5180 + }, + { + "epoch": 0.5182, + "grad_norm": 0.08077683299779892, + "learning_rate": 1.1107751242107786e-05, + "loss": 0.0016, + "step": 5182 + }, + { + "epoch": 0.5184, + "grad_norm": 0.0035193776711821556, + "learning_rate": 1.1100812622247823e-05, + "loss": 0.0003, + "step": 5184 + }, + { + "epoch": 0.5186, + "grad_norm": 4.683455944061279, + "learning_rate": 1.1093873465865156e-05, + "loss": 0.2858, + "step": 5186 + }, + { + "epoch": 0.5188, + "grad_norm": 0.00700599979609251, + "learning_rate": 1.1086933776341853e-05, + "loss": 0.0004, + "step": 5188 + }, + { + "epoch": 0.519, + "grad_norm": 0.007628573104739189, + "learning_rate": 1.1079993557060228e-05, + "loss": 0.0184, + "step": 5190 + }, + { + "epoch": 0.5192, + "grad_norm": 0.010340808890759945, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.0022, + "step": 5192 + }, + { + "epoch": 0.5194, + "grad_norm": 0.018958045169711113, + "learning_rate": 1.10661115427526e-05, + "loss": 0.0062, + "step": 5194 + }, + { + "epoch": 0.5196, + "grad_norm": 0.0619320347905159, + "learning_rate": 1.105916975449252e-05, + "loss": 0.01, + "step": 5196 + }, + { + "epoch": 0.5198, + "grad_norm": 0.09355242550373077, + "learning_rate": 1.1052227450005968e-05, + "loss": 0.0031, + "step": 5198 + }, + { + "epoch": 0.52, + "grad_norm": 0.06753471493721008, + "learning_rate": 1.1045284632676535e-05, + "loss": 0.0164, + "step": 5200 + }, + { + "epoch": 0.5202, + "grad_norm": 0.01477564126253128, + "learning_rate": 1.1038341305888074e-05, + "loss": 0.0005, + "step": 5202 + }, + { + "epoch": 0.5204, + "grad_norm": 0.6878687739372253, + "learning_rate": 1.1031397473024674e-05, + "loss": 0.0128, + "step": 5204 + }, + { + "epoch": 0.5206, + "grad_norm": 0.11110419034957886, + "learning_rate": 1.1024453137470677e-05, + "loss": 0.3437, + "step": 5206 + }, + { + "epoch": 0.5208, + "grad_norm": 1.3259077072143555, + "learning_rate": 1.1017508302610665e-05, + "loss": 0.0606, + "step": 5208 + }, + { + "epoch": 0.521, + "grad_norm": 0.11398515105247498, + "learning_rate": 1.1010562971829464e-05, + "loss": 0.0082, + "step": 5210 + }, + { + "epoch": 0.5212, + "grad_norm": 0.0793682336807251, + "learning_rate": 1.1003617148512149e-05, + "loss": 0.0084, + "step": 5212 + }, + { + "epoch": 0.5214, + "grad_norm": 2.2939064502716064, + "learning_rate": 1.099667083604403e-05, + "loss": 0.0899, + "step": 5214 + }, + { + "epoch": 0.5216, + "grad_norm": 0.09216311573982239, + "learning_rate": 1.0989724037810651e-05, + "loss": 0.0021, + "step": 5216 + }, + { + "epoch": 0.5218, + "grad_norm": 0.03347010537981987, + "learning_rate": 1.0982776757197799e-05, + "loss": 0.0048, + "step": 5218 + }, + { + "epoch": 0.522, + "grad_norm": 0.019893767312169075, + "learning_rate": 1.0975828997591496e-05, + "loss": 0.014, + "step": 5220 + }, + { + "epoch": 0.5222, + "grad_norm": 0.083654023706913, + "learning_rate": 1.0968880762377994e-05, + "loss": 0.0082, + "step": 5222 + }, + { + "epoch": 0.5224, + "grad_norm": 0.22295142710208893, + "learning_rate": 1.0961932054943778e-05, + "loss": 0.0055, + "step": 5224 + }, + { + "epoch": 0.5226, + "grad_norm": 1.2872710227966309, + "learning_rate": 1.0954982878675564e-05, + "loss": 0.0374, + "step": 5226 + }, + { + "epoch": 0.5228, + "grad_norm": 1.6100529432296753, + "learning_rate": 1.0948033236960294e-05, + "loss": 0.0338, + "step": 5228 + }, + { + "epoch": 0.523, + "grad_norm": 0.34258604049682617, + "learning_rate": 1.0941083133185146e-05, + "loss": 0.0024, + "step": 5230 + }, + { + "epoch": 0.5232, + "grad_norm": 0.008143546059727669, + "learning_rate": 1.0934132570737508e-05, + "loss": 0.0005, + "step": 5232 + }, + { + "epoch": 0.5234, + "grad_norm": 0.009332441724836826, + "learning_rate": 1.0927181553005001e-05, + "loss": 0.0015, + "step": 5234 + }, + { + "epoch": 0.5236, + "grad_norm": 0.015302395448088646, + "learning_rate": 1.0920230083375474e-05, + "loss": 0.0049, + "step": 5236 + }, + { + "epoch": 0.5238, + "grad_norm": 0.20676903426647186, + "learning_rate": 1.0913278165236977e-05, + "loss": 0.005, + "step": 5238 + }, + { + "epoch": 0.524, + "grad_norm": 1.2031056880950928, + "learning_rate": 1.0906325801977804e-05, + "loss": 0.0122, + "step": 5240 + }, + { + "epoch": 0.5242, + "grad_norm": 0.007145308889448643, + "learning_rate": 1.0899372996986439e-05, + "loss": 0.0004, + "step": 5242 + }, + { + "epoch": 0.5244, + "grad_norm": 0.04492818936705589, + "learning_rate": 1.0892419753651606e-05, + "loss": 0.0336, + "step": 5244 + }, + { + "epoch": 0.5246, + "grad_norm": 0.007078573107719421, + "learning_rate": 1.0885466075362224e-05, + "loss": 0.0015, + "step": 5246 + }, + { + "epoch": 0.5248, + "grad_norm": 0.08759498596191406, + "learning_rate": 1.0878511965507435e-05, + "loss": 0.0018, + "step": 5248 + }, + { + "epoch": 0.525, + "grad_norm": 0.048651181161403656, + "learning_rate": 1.0871557427476585e-05, + "loss": 0.0032, + "step": 5250 + }, + { + "epoch": 0.5252, + "grad_norm": 2.9239840507507324, + "learning_rate": 1.086460246465923e-05, + "loss": 0.1217, + "step": 5252 + }, + { + "epoch": 0.5254, + "grad_norm": 0.2373381406068802, + "learning_rate": 1.085764708044514e-05, + "loss": 0.0056, + "step": 5254 + }, + { + "epoch": 0.5256, + "grad_norm": 0.1338191032409668, + "learning_rate": 1.0850691278224282e-05, + "loss": 0.0364, + "step": 5256 + }, + { + "epoch": 0.5258, + "grad_norm": 0.01867036521434784, + "learning_rate": 1.0843735061386829e-05, + "loss": 0.0005, + "step": 5258 + }, + { + "epoch": 0.526, + "grad_norm": 3.1767046451568604, + "learning_rate": 1.083677843332316e-05, + "loss": 0.1706, + "step": 5260 + }, + { + "epoch": 0.5262, + "grad_norm": 0.004554878454655409, + "learning_rate": 1.082982139742384e-05, + "loss": 0.0006, + "step": 5262 + }, + { + "epoch": 0.5264, + "grad_norm": 0.01163015328347683, + "learning_rate": 1.0822863957079657e-05, + "loss": 0.0006, + "step": 5264 + }, + { + "epoch": 0.5266, + "grad_norm": 0.02894357405602932, + "learning_rate": 1.0815906115681579e-05, + "loss": 0.0006, + "step": 5266 + }, + { + "epoch": 0.5268, + "grad_norm": 0.0028580210637301207, + "learning_rate": 1.0808947876620768e-05, + "loss": 0.0009, + "step": 5268 + }, + { + "epoch": 0.527, + "grad_norm": 0.029090994969010353, + "learning_rate": 1.0801989243288588e-05, + "loss": 0.0048, + "step": 5270 + }, + { + "epoch": 0.5272, + "grad_norm": 1.7279040813446045, + "learning_rate": 1.07950302190766e-05, + "loss": 0.0054, + "step": 5272 + }, + { + "epoch": 0.5274, + "grad_norm": 0.003043987089768052, + "learning_rate": 1.0788070807376536e-05, + "loss": 0.0055, + "step": 5274 + }, + { + "epoch": 0.5276, + "grad_norm": 2.206516981124878, + "learning_rate": 1.0781111011580336e-05, + "loss": 0.0913, + "step": 5276 + }, + { + "epoch": 0.5278, + "grad_norm": 0.02334114909172058, + "learning_rate": 1.0774150835080119e-05, + "loss": 0.0015, + "step": 5278 + }, + { + "epoch": 0.528, + "grad_norm": 0.042423345148563385, + "learning_rate": 1.0767190281268187e-05, + "loss": 0.0012, + "step": 5280 + }, + { + "epoch": 0.5282, + "grad_norm": 0.009121214970946312, + "learning_rate": 1.0760229353537032e-05, + "loss": 0.0004, + "step": 5282 + }, + { + "epoch": 0.5284, + "grad_norm": 0.6815376281738281, + "learning_rate": 1.0753268055279328e-05, + "loss": 0.0075, + "step": 5284 + }, + { + "epoch": 0.5286, + "grad_norm": 0.004675368312746286, + "learning_rate": 1.0746306389887924e-05, + "loss": 0.0021, + "step": 5286 + }, + { + "epoch": 0.5288, + "grad_norm": 0.04359656572341919, + "learning_rate": 1.0739344360755853e-05, + "loss": 0.2965, + "step": 5288 + }, + { + "epoch": 0.529, + "grad_norm": 0.11382051557302475, + "learning_rate": 1.0732381971276318e-05, + "loss": 0.0021, + "step": 5290 + }, + { + "epoch": 0.5292, + "grad_norm": 0.005684115458279848, + "learning_rate": 1.072541922484271e-05, + "loss": 0.0008, + "step": 5292 + }, + { + "epoch": 0.5294, + "grad_norm": 0.013828211463987827, + "learning_rate": 1.0718456124848584e-05, + "loss": 0.0003, + "step": 5294 + }, + { + "epoch": 0.5296, + "grad_norm": 0.12612712383270264, + "learning_rate": 1.071149267468767e-05, + "loss": 0.0019, + "step": 5296 + }, + { + "epoch": 0.5298, + "grad_norm": 0.0234049204736948, + "learning_rate": 1.070452887775387e-05, + "loss": 0.0012, + "step": 5298 + }, + { + "epoch": 0.53, + "grad_norm": 0.031237905845046043, + "learning_rate": 1.0697564737441254e-05, + "loss": 0.0009, + "step": 5300 + }, + { + "epoch": 0.5302, + "grad_norm": 0.005027621053159237, + "learning_rate": 1.0690600257144062e-05, + "loss": 0.005, + "step": 5302 + }, + { + "epoch": 0.5304, + "grad_norm": 0.007525821682065725, + "learning_rate": 1.0683635440256689e-05, + "loss": 0.0006, + "step": 5304 + }, + { + "epoch": 0.5306, + "grad_norm": 1.504065990447998, + "learning_rate": 1.067667029017371e-05, + "loss": 0.0286, + "step": 5306 + }, + { + "epoch": 0.5308, + "grad_norm": 0.0064430758357048035, + "learning_rate": 1.0669704810289852e-05, + "loss": 0.0009, + "step": 5308 + }, + { + "epoch": 0.531, + "grad_norm": 1.4708762168884277, + "learning_rate": 1.0662739004000005e-05, + "loss": 0.0266, + "step": 5310 + }, + { + "epoch": 0.5312, + "grad_norm": 0.18051497638225555, + "learning_rate": 1.0655772874699217e-05, + "loss": 0.0026, + "step": 5312 + }, + { + "epoch": 0.5314, + "grad_norm": 0.10870801657438278, + "learning_rate": 1.0648806425782697e-05, + "loss": 0.002, + "step": 5314 + }, + { + "epoch": 0.5316, + "grad_norm": 0.06935898214578629, + "learning_rate": 1.0641839660645806e-05, + "loss": 0.0031, + "step": 5316 + }, + { + "epoch": 0.5318, + "grad_norm": 11.955612182617188, + "learning_rate": 1.0634872582684062e-05, + "loss": 0.0399, + "step": 5318 + }, + { + "epoch": 0.532, + "grad_norm": 0.09979400783777237, + "learning_rate": 1.0627905195293135e-05, + "loss": 0.0015, + "step": 5320 + }, + { + "epoch": 0.5322, + "grad_norm": 0.030835749581456184, + "learning_rate": 1.0620937501868842e-05, + "loss": 0.0006, + "step": 5322 + }, + { + "epoch": 0.5324, + "grad_norm": 0.005574041977524757, + "learning_rate": 1.0613969505807157e-05, + "loss": 0.0279, + "step": 5324 + }, + { + "epoch": 0.5326, + "grad_norm": 0.030256735160946846, + "learning_rate": 1.060700121050419e-05, + "loss": 0.0008, + "step": 5326 + }, + { + "epoch": 0.5328, + "grad_norm": 0.005091114901006222, + "learning_rate": 1.0600032619356208e-05, + "loss": 0.0007, + "step": 5328 + }, + { + "epoch": 0.533, + "grad_norm": 0.02230304479598999, + "learning_rate": 1.0593063735759619e-05, + "loss": 0.0007, + "step": 5330 + }, + { + "epoch": 0.5332, + "grad_norm": 0.053279757499694824, + "learning_rate": 1.0586094563110965e-05, + "loss": 0.0058, + "step": 5332 + }, + { + "epoch": 0.5334, + "grad_norm": 3.3807308673858643, + "learning_rate": 1.0579125104806944e-05, + "loss": 0.0565, + "step": 5334 + }, + { + "epoch": 0.5336, + "grad_norm": 0.009292911738157272, + "learning_rate": 1.0572155364244383e-05, + "loss": 0.1632, + "step": 5336 + }, + { + "epoch": 0.5338, + "grad_norm": 0.15712110698223114, + "learning_rate": 1.0565185344820248e-05, + "loss": 0.001, + "step": 5338 + }, + { + "epoch": 0.534, + "grad_norm": 0.0035791730042546988, + "learning_rate": 1.055821504993164e-05, + "loss": 0.0461, + "step": 5340 + }, + { + "epoch": 0.5342, + "grad_norm": 0.009456896223127842, + "learning_rate": 1.0551244482975798e-05, + "loss": 0.0145, + "step": 5342 + }, + { + "epoch": 0.5344, + "grad_norm": 0.002932540373876691, + "learning_rate": 1.0544273647350091e-05, + "loss": 0.0004, + "step": 5344 + }, + { + "epoch": 0.5346, + "grad_norm": 0.023604964837431908, + "learning_rate": 1.0537302546452022e-05, + "loss": 0.002, + "step": 5346 + }, + { + "epoch": 0.5348, + "grad_norm": 16.595191955566406, + "learning_rate": 1.053033118367922e-05, + "loss": 0.2277, + "step": 5348 + }, + { + "epoch": 0.535, + "grad_norm": 0.03939419984817505, + "learning_rate": 1.0523359562429441e-05, + "loss": 0.009, + "step": 5350 + }, + { + "epoch": 0.5352, + "grad_norm": 0.02852332592010498, + "learning_rate": 1.0516387686100566e-05, + "loss": 0.0011, + "step": 5352 + }, + { + "epoch": 0.5354, + "grad_norm": 0.17980004847049713, + "learning_rate": 1.050941555809061e-05, + "loss": 0.0027, + "step": 5354 + }, + { + "epoch": 0.5356, + "grad_norm": 0.018359750509262085, + "learning_rate": 1.0502443181797696e-05, + "loss": 0.0006, + "step": 5356 + }, + { + "epoch": 0.5358, + "grad_norm": 0.007397879846394062, + "learning_rate": 1.0495470560620082e-05, + "loss": 0.0003, + "step": 5358 + }, + { + "epoch": 0.536, + "grad_norm": 0.0038733186665922403, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.0004, + "step": 5360 + }, + { + "epoch": 0.5362, + "grad_norm": 0.003698385087773204, + "learning_rate": 1.0481524597204342e-05, + "loss": 0.0007, + "step": 5362 + }, + { + "epoch": 0.5364, + "grad_norm": 0.0016194775234907866, + "learning_rate": 1.0474551261763315e-05, + "loss": 0.0001, + "step": 5364 + }, + { + "epoch": 0.5366, + "grad_norm": 2.31986665725708, + "learning_rate": 1.0467577695031763e-05, + "loss": 0.1087, + "step": 5366 + }, + { + "epoch": 0.5368, + "grad_norm": 0.0006677248748019338, + "learning_rate": 1.0460603900408523e-05, + "loss": 0.0003, + "step": 5368 + }, + { + "epoch": 0.537, + "grad_norm": 0.151048481464386, + "learning_rate": 1.0453629881292537e-05, + "loss": 0.0009, + "step": 5370 + }, + { + "epoch": 0.5372, + "grad_norm": 0.05879317969083786, + "learning_rate": 1.0446655641082864e-05, + "loss": 0.0009, + "step": 5372 + }, + { + "epoch": 0.5374, + "grad_norm": 0.9527776837348938, + "learning_rate": 1.043968118317865e-05, + "loss": 0.0105, + "step": 5374 + }, + { + "epoch": 0.5376, + "grad_norm": 0.003949533216655254, + "learning_rate": 1.0432706510979172e-05, + "loss": 0.0008, + "step": 5376 + }, + { + "epoch": 0.5378, + "grad_norm": 0.00881669856607914, + "learning_rate": 1.0425731627883798e-05, + "loss": 0.0254, + "step": 5378 + }, + { + "epoch": 0.538, + "grad_norm": 0.005804808810353279, + "learning_rate": 1.0418756537291996e-05, + "loss": 0.0066, + "step": 5380 + }, + { + "epoch": 0.5382, + "grad_norm": 0.11764631420373917, + "learning_rate": 1.0411781242603352e-05, + "loss": 0.0017, + "step": 5382 + }, + { + "epoch": 0.5384, + "grad_norm": 0.05647899582982063, + "learning_rate": 1.0404805747217525e-05, + "loss": 0.0006, + "step": 5384 + }, + { + "epoch": 0.5386, + "grad_norm": 0.0009773897472769022, + "learning_rate": 1.03978300545343e-05, + "loss": 0.0152, + "step": 5386 + }, + { + "epoch": 0.5388, + "grad_norm": 0.0036057832185178995, + "learning_rate": 1.0390854167953537e-05, + "loss": 0.0017, + "step": 5388 + }, + { + "epoch": 0.539, + "grad_norm": 0.017616206780076027, + "learning_rate": 1.03838780908752e-05, + "loss": 0.0003, + "step": 5390 + }, + { + "epoch": 0.5392, + "grad_norm": 0.0008860017987899482, + "learning_rate": 1.0376901826699349e-05, + "loss": 0.0073, + "step": 5392 + }, + { + "epoch": 0.5394, + "grad_norm": 0.00622866814956069, + "learning_rate": 1.036992537882612e-05, + "loss": 0.0002, + "step": 5394 + }, + { + "epoch": 0.5396, + "grad_norm": 0.015628648921847343, + "learning_rate": 1.036294875065576e-05, + "loss": 0.0003, + "step": 5396 + }, + { + "epoch": 0.5398, + "grad_norm": 0.004503821022808552, + "learning_rate": 1.0355971945588586e-05, + "loss": 0.0002, + "step": 5398 + }, + { + "epoch": 0.54, + "grad_norm": 0.027697235345840454, + "learning_rate": 1.0348994967025012e-05, + "loss": 0.0009, + "step": 5400 + }, + { + "epoch": 0.5402, + "grad_norm": 0.004421606194227934, + "learning_rate": 1.034201781836553e-05, + "loss": 0.0003, + "step": 5402 + }, + { + "epoch": 0.5404, + "grad_norm": 0.1917393058538437, + "learning_rate": 1.0335040503010715e-05, + "loss": 0.0015, + "step": 5404 + }, + { + "epoch": 0.5406, + "grad_norm": 0.03175688162446022, + "learning_rate": 1.0328063024361232e-05, + "loss": 0.0072, + "step": 5406 + }, + { + "epoch": 0.5408, + "grad_norm": 0.06283185631036758, + "learning_rate": 1.0321085385817818e-05, + "loss": 0.0004, + "step": 5408 + }, + { + "epoch": 0.541, + "grad_norm": 9.332277297973633, + "learning_rate": 1.0314107590781284e-05, + "loss": 0.0505, + "step": 5410 + }, + { + "epoch": 0.5412, + "grad_norm": 0.0027318752836436033, + "learning_rate": 1.030712964265253e-05, + "loss": 0.0889, + "step": 5412 + }, + { + "epoch": 0.5414, + "grad_norm": 0.02466205134987831, + "learning_rate": 1.0300151544832513e-05, + "loss": 0.0003, + "step": 5414 + }, + { + "epoch": 0.5416, + "grad_norm": 0.07158800214529037, + "learning_rate": 1.0293173300722286e-05, + "loss": 0.0011, + "step": 5416 + }, + { + "epoch": 0.5418, + "grad_norm": 0.38409423828125, + "learning_rate": 1.0286194913722948e-05, + "loss": 0.011, + "step": 5418 + }, + { + "epoch": 0.542, + "grad_norm": 0.6029567122459412, + "learning_rate": 1.0279216387235691e-05, + "loss": 0.0083, + "step": 5420 + }, + { + "epoch": 0.5422, + "grad_norm": 0.011244487017393112, + "learning_rate": 1.0272237724661753e-05, + "loss": 0.0003, + "step": 5422 + }, + { + "epoch": 0.5424, + "grad_norm": 0.9919204115867615, + "learning_rate": 1.026525892940246e-05, + "loss": 0.0232, + "step": 5424 + }, + { + "epoch": 0.5426, + "grad_norm": 0.01761072501540184, + "learning_rate": 1.0258280004859189e-05, + "loss": 0.0002, + "step": 5426 + }, + { + "epoch": 0.5428, + "grad_norm": 0.009653969667851925, + "learning_rate": 1.0251300954433377e-05, + "loss": 0.0001, + "step": 5428 + }, + { + "epoch": 0.543, + "grad_norm": 0.005797072779387236, + "learning_rate": 1.0244321781526533e-05, + "loss": 0.0002, + "step": 5430 + }, + { + "epoch": 0.5432, + "grad_norm": 0.006223728414624929, + "learning_rate": 1.0237342489540221e-05, + "loss": 0.003, + "step": 5432 + }, + { + "epoch": 0.5434, + "grad_norm": 0.000808560405857861, + "learning_rate": 1.0230363081876065e-05, + "loss": 0.0036, + "step": 5434 + }, + { + "epoch": 0.5436, + "grad_norm": 0.009006867185235023, + "learning_rate": 1.0223383561935738e-05, + "loss": 0.0001, + "step": 5436 + }, + { + "epoch": 0.5438, + "grad_norm": 0.0020682441536337137, + "learning_rate": 1.0216403933120979e-05, + "loss": 0.0001, + "step": 5438 + }, + { + "epoch": 0.544, + "grad_norm": 0.33109214901924133, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.0025, + "step": 5440 + }, + { + "epoch": 0.5442, + "grad_norm": 0.010915069840848446, + "learning_rate": 1.0202444362475352e-05, + "loss": 0.0016, + "step": 5442 + }, + { + "epoch": 0.5444, + "grad_norm": 0.05338164046406746, + "learning_rate": 1.0195464427448213e-05, + "loss": 0.0025, + "step": 5444 + }, + { + "epoch": 0.5446, + "grad_norm": 0.05724494159221649, + "learning_rate": 1.0188484397154083e-05, + "loss": 0.0004, + "step": 5446 + }, + { + "epoch": 0.5448, + "grad_norm": 0.00046551506966352463, + "learning_rate": 1.0181504274994949e-05, + "loss": 0.0016, + "step": 5448 + }, + { + "epoch": 0.545, + "grad_norm": 0.001311804517172277, + "learning_rate": 1.0174524064372837e-05, + "loss": 0.0001, + "step": 5450 + }, + { + "epoch": 0.5452, + "grad_norm": 0.048440154641866684, + "learning_rate": 1.0167543768689816e-05, + "loss": 0.0329, + "step": 5452 + }, + { + "epoch": 0.5454, + "grad_norm": 0.002095951000228524, + "learning_rate": 1.0160563391347998e-05, + "loss": 0.0001, + "step": 5454 + }, + { + "epoch": 0.5456, + "grad_norm": 0.015501032583415508, + "learning_rate": 1.0153582935749531e-05, + "loss": 0.0007, + "step": 5456 + }, + { + "epoch": 0.5458, + "grad_norm": 0.0734148845076561, + "learning_rate": 1.0146602405296608e-05, + "loss": 0.0008, + "step": 5458 + }, + { + "epoch": 0.546, + "grad_norm": 0.03041154146194458, + "learning_rate": 1.0139621803391454e-05, + "loss": 0.0004, + "step": 5460 + }, + { + "epoch": 0.5462, + "grad_norm": 0.0039372872561216354, + "learning_rate": 1.013264113343633e-05, + "loss": 0.0001, + "step": 5462 + }, + { + "epoch": 0.5464, + "grad_norm": 2.318969964981079, + "learning_rate": 1.0125660398833528e-05, + "loss": 0.0552, + "step": 5464 + }, + { + "epoch": 0.5466, + "grad_norm": 0.03819714114069939, + "learning_rate": 1.0118679602985373e-05, + "loss": 0.0005, + "step": 5466 + }, + { + "epoch": 0.5468, + "grad_norm": 0.008496999740600586, + "learning_rate": 1.0111698749294223e-05, + "loss": 0.0001, + "step": 5468 + }, + { + "epoch": 0.547, + "grad_norm": 0.0036377273499965668, + "learning_rate": 1.010471784116246e-05, + "loss": 0.0001, + "step": 5470 + }, + { + "epoch": 0.5472, + "grad_norm": 0.011280832812190056, + "learning_rate": 1.0097736881992492e-05, + "loss": 0.0069, + "step": 5472 + }, + { + "epoch": 0.5474, + "grad_norm": 0.04893023520708084, + "learning_rate": 1.0090755875186752e-05, + "loss": 0.0007, + "step": 5474 + }, + { + "epoch": 0.5476, + "grad_norm": 0.0010557884816080332, + "learning_rate": 1.0083774824147707e-05, + "loss": 0.0003, + "step": 5476 + }, + { + "epoch": 0.5478, + "grad_norm": 0.0009004581370390952, + "learning_rate": 1.007679373227783e-05, + "loss": 0.0, + "step": 5478 + }, + { + "epoch": 0.548, + "grad_norm": 0.0008254596032202244, + "learning_rate": 1.0069812602979617e-05, + "loss": 0.0, + "step": 5480 + }, + { + "epoch": 0.5482, + "grad_norm": 0.02797691337764263, + "learning_rate": 1.0062831439655591e-05, + "loss": 0.0193, + "step": 5482 + }, + { + "epoch": 0.5484, + "grad_norm": 4.234598636627197, + "learning_rate": 1.0055850245708283e-05, + "loss": 0.0923, + "step": 5484 + }, + { + "epoch": 0.5486, + "grad_norm": 1.715204119682312, + "learning_rate": 1.0048869024540247e-05, + "loss": 0.0181, + "step": 5486 + }, + { + "epoch": 0.5488, + "grad_norm": 0.003946878481656313, + "learning_rate": 1.0041887779554041e-05, + "loss": 0.0006, + "step": 5488 + }, + { + "epoch": 0.549, + "grad_norm": 12.337197303771973, + "learning_rate": 1.0034906514152239e-05, + "loss": 0.108, + "step": 5490 + }, + { + "epoch": 0.5492, + "grad_norm": 0.03938230872154236, + "learning_rate": 1.0027925231737428e-05, + "loss": 0.0005, + "step": 5492 + }, + { + "epoch": 0.5494, + "grad_norm": 0.11202102899551392, + "learning_rate": 1.0020943935712193e-05, + "loss": 0.2002, + "step": 5494 + }, + { + "epoch": 0.5496, + "grad_norm": 3.2084200382232666, + "learning_rate": 1.0013962629479145e-05, + "loss": 0.0363, + "step": 5496 + }, + { + "epoch": 0.5498, + "grad_norm": 0.001675883075222373, + "learning_rate": 1.0006981316440876e-05, + "loss": 0.0001, + "step": 5498 + }, + { + "epoch": 0.55, + "grad_norm": 0.19754436612129211, + "learning_rate": 1e-05, + "loss": 0.004, + "step": 5500 + }, + { + "epoch": 0.5502, + "grad_norm": 0.0013347043422982097, + "learning_rate": 9.993018683559126e-06, + "loss": 0.0002, + "step": 5502 + }, + { + "epoch": 0.5504, + "grad_norm": 0.004689590074121952, + "learning_rate": 9.986037370520856e-06, + "loss": 0.0002, + "step": 5504 + }, + { + "epoch": 0.5506, + "grad_norm": 0.002834759186953306, + "learning_rate": 9.979056064287807e-06, + "loss": 0.0001, + "step": 5506 + }, + { + "epoch": 0.5508, + "grad_norm": 0.004042088985443115, + "learning_rate": 9.972074768262576e-06, + "loss": 0.0001, + "step": 5508 + }, + { + "epoch": 0.551, + "grad_norm": 0.208099365234375, + "learning_rate": 9.965093485847766e-06, + "loss": 0.0038, + "step": 5510 + }, + { + "epoch": 0.5512, + "grad_norm": 0.07564619183540344, + "learning_rate": 9.958112220445964e-06, + "loss": 0.0009, + "step": 5512 + }, + { + "epoch": 0.5514, + "grad_norm": 0.0341436043381691, + "learning_rate": 9.951130975459758e-06, + "loss": 0.0007, + "step": 5514 + }, + { + "epoch": 0.5516, + "grad_norm": 0.02134198136627674, + "learning_rate": 9.944149754291719e-06, + "loss": 0.0004, + "step": 5516 + }, + { + "epoch": 0.5518, + "grad_norm": 0.0275783222168684, + "learning_rate": 9.937168560344412e-06, + "loss": 0.0004, + "step": 5518 + }, + { + "epoch": 0.552, + "grad_norm": 8.529264450073242, + "learning_rate": 9.930187397020385e-06, + "loss": 0.5845, + "step": 5520 + }, + { + "epoch": 0.5522, + "grad_norm": 0.0037587126716971397, + "learning_rate": 9.923206267722173e-06, + "loss": 0.0024, + "step": 5522 + }, + { + "epoch": 0.5524, + "grad_norm": 0.0109561737626791, + "learning_rate": 9.916225175852295e-06, + "loss": 0.0012, + "step": 5524 + }, + { + "epoch": 0.5526, + "grad_norm": 0.021112458780407906, + "learning_rate": 9.909244124813246e-06, + "loss": 0.0436, + "step": 5526 + }, + { + "epoch": 0.5528, + "grad_norm": 0.004106454085558653, + "learning_rate": 9.902263118007513e-06, + "loss": 0.0002, + "step": 5528 + }, + { + "epoch": 0.553, + "grad_norm": 0.093162402510643, + "learning_rate": 9.895282158837545e-06, + "loss": 0.0012, + "step": 5530 + }, + { + "epoch": 0.5532, + "grad_norm": 2.184117078781128, + "learning_rate": 9.88830125070578e-06, + "loss": 0.0382, + "step": 5532 + }, + { + "epoch": 0.5534, + "grad_norm": 0.1474815458059311, + "learning_rate": 9.88132039701463e-06, + "loss": 0.0065, + "step": 5534 + }, + { + "epoch": 0.5536, + "grad_norm": 0.006423664279282093, + "learning_rate": 9.874339601166474e-06, + "loss": 0.1257, + "step": 5536 + }, + { + "epoch": 0.5538, + "grad_norm": 0.0034974035806953907, + "learning_rate": 9.867358866563674e-06, + "loss": 0.0026, + "step": 5538 + }, + { + "epoch": 0.554, + "grad_norm": 0.02070695161819458, + "learning_rate": 9.860378196608549e-06, + "loss": 0.001, + "step": 5540 + }, + { + "epoch": 0.5542, + "grad_norm": 0.14843086898326874, + "learning_rate": 9.853397594703394e-06, + "loss": 0.0073, + "step": 5542 + }, + { + "epoch": 0.5544, + "grad_norm": 0.008832412771880627, + "learning_rate": 9.84641706425047e-06, + "loss": 0.0006, + "step": 5544 + }, + { + "epoch": 0.5546, + "grad_norm": 0.12601199746131897, + "learning_rate": 9.839436608652007e-06, + "loss": 0.0261, + "step": 5546 + }, + { + "epoch": 0.5548, + "grad_norm": 0.02399817481637001, + "learning_rate": 9.832456231310189e-06, + "loss": 0.0006, + "step": 5548 + }, + { + "epoch": 0.555, + "grad_norm": 0.03433481603860855, + "learning_rate": 9.825475935627165e-06, + "loss": 0.0009, + "step": 5550 + }, + { + "epoch": 0.5552, + "grad_norm": 0.043457016348838806, + "learning_rate": 9.818495725005053e-06, + "loss": 0.0025, + "step": 5552 + }, + { + "epoch": 0.5554, + "grad_norm": 3.5949180126190186, + "learning_rate": 9.81151560284592e-06, + "loss": 0.0562, + "step": 5554 + }, + { + "epoch": 0.5556, + "grad_norm": 0.00793186854571104, + "learning_rate": 9.80453557255179e-06, + "loss": 0.0373, + "step": 5556 + }, + { + "epoch": 0.5558, + "grad_norm": 0.09078215062618256, + "learning_rate": 9.79755563752465e-06, + "loss": 0.002, + "step": 5558 + }, + { + "epoch": 0.556, + "grad_norm": 0.018207378685474396, + "learning_rate": 9.790575801166432e-06, + "loss": 0.048, + "step": 5560 + }, + { + "epoch": 0.5562, + "grad_norm": 0.07096519321203232, + "learning_rate": 9.783596066879023e-06, + "loss": 0.0015, + "step": 5562 + }, + { + "epoch": 0.5564, + "grad_norm": 1.5099375247955322, + "learning_rate": 9.776616438064265e-06, + "loss": 0.0195, + "step": 5564 + }, + { + "epoch": 0.5566, + "grad_norm": 0.0023095242213457823, + "learning_rate": 9.76963691812394e-06, + "loss": 0.0015, + "step": 5566 + }, + { + "epoch": 0.5568, + "grad_norm": 0.15602193772792816, + "learning_rate": 9.762657510459784e-06, + "loss": 0.0045, + "step": 5568 + }, + { + "epoch": 0.557, + "grad_norm": 0.7204335927963257, + "learning_rate": 9.75567821847347e-06, + "loss": 0.0224, + "step": 5570 + }, + { + "epoch": 0.5572, + "grad_norm": 0.12045953422784805, + "learning_rate": 9.748699045566626e-06, + "loss": 0.0032, + "step": 5572 + }, + { + "epoch": 0.5574, + "grad_norm": 1.5617765188217163, + "learning_rate": 9.741719995140814e-06, + "loss": 0.0517, + "step": 5574 + }, + { + "epoch": 0.5576, + "grad_norm": 0.007147267926484346, + "learning_rate": 9.73474107059754e-06, + "loss": 0.0026, + "step": 5576 + }, + { + "epoch": 0.5578, + "grad_norm": 0.10315397381782532, + "learning_rate": 9.727762275338246e-06, + "loss": 0.0039, + "step": 5578 + }, + { + "epoch": 0.558, + "grad_norm": 0.03552604466676712, + "learning_rate": 9.720783612764314e-06, + "loss": 0.0141, + "step": 5580 + }, + { + "epoch": 0.5582, + "grad_norm": 0.02789045125246048, + "learning_rate": 9.713805086277055e-06, + "loss": 0.0005, + "step": 5582 + }, + { + "epoch": 0.5584, + "grad_norm": 0.8716373443603516, + "learning_rate": 9.706826699277719e-06, + "loss": 0.0052, + "step": 5584 + }, + { + "epoch": 0.5586, + "grad_norm": 0.07595672458410263, + "learning_rate": 9.699848455167489e-06, + "loss": 0.012, + "step": 5586 + }, + { + "epoch": 0.5588, + "grad_norm": 14.249016761779785, + "learning_rate": 9.692870357347474e-06, + "loss": 0.3416, + "step": 5588 + }, + { + "epoch": 0.559, + "grad_norm": 0.016022566705942154, + "learning_rate": 9.685892409218718e-06, + "loss": 0.0013, + "step": 5590 + }, + { + "epoch": 0.5592, + "grad_norm": 0.056700363755226135, + "learning_rate": 9.678914614182185e-06, + "loss": 0.001, + "step": 5592 + }, + { + "epoch": 0.5594, + "grad_norm": 0.0696277990937233, + "learning_rate": 9.671936975638768e-06, + "loss": 0.0044, + "step": 5594 + }, + { + "epoch": 0.5596, + "grad_norm": 0.01540061179548502, + "learning_rate": 9.664959496989286e-06, + "loss": 0.0004, + "step": 5596 + }, + { + "epoch": 0.5598, + "grad_norm": 2.471543788909912, + "learning_rate": 9.657982181634476e-06, + "loss": 0.2031, + "step": 5598 + }, + { + "epoch": 0.56, + "grad_norm": 0.10763819515705109, + "learning_rate": 9.651005032974994e-06, + "loss": 0.0023, + "step": 5600 + }, + { + "epoch": 0.5602, + "grad_norm": 4.534322738647461, + "learning_rate": 9.644028054411416e-06, + "loss": 0.2537, + "step": 5602 + }, + { + "epoch": 0.5604, + "grad_norm": 0.2531780004501343, + "learning_rate": 9.637051249344244e-06, + "loss": 0.0148, + "step": 5604 + }, + { + "epoch": 0.5606, + "grad_norm": 0.10188718140125275, + "learning_rate": 9.630074621173882e-06, + "loss": 0.0028, + "step": 5606 + }, + { + "epoch": 0.5608, + "grad_norm": 0.2696961462497711, + "learning_rate": 9.623098173300655e-06, + "loss": 0.0045, + "step": 5608 + }, + { + "epoch": 0.561, + "grad_norm": 0.12542051076889038, + "learning_rate": 9.616121909124801e-06, + "loss": 0.0286, + "step": 5610 + }, + { + "epoch": 0.5612, + "grad_norm": 0.37251508235931396, + "learning_rate": 9.609145832046465e-06, + "loss": 0.0057, + "step": 5612 + }, + { + "epoch": 0.5614, + "grad_norm": 0.049126531928777695, + "learning_rate": 9.602169945465702e-06, + "loss": 0.0034, + "step": 5614 + }, + { + "epoch": 0.5616, + "grad_norm": 0.03408662974834442, + "learning_rate": 9.595194252782476e-06, + "loss": 0.0008, + "step": 5616 + }, + { + "epoch": 0.5618, + "grad_norm": 0.011194417253136635, + "learning_rate": 9.588218757396655e-06, + "loss": 0.0011, + "step": 5618 + }, + { + "epoch": 0.562, + "grad_norm": 0.0089698676019907, + "learning_rate": 9.581243462708007e-06, + "loss": 0.0005, + "step": 5620 + }, + { + "epoch": 0.5622, + "grad_norm": 1.9367331266403198, + "learning_rate": 9.574268372116205e-06, + "loss": 0.0413, + "step": 5622 + }, + { + "epoch": 0.5624, + "grad_norm": 2.409564256668091, + "learning_rate": 9.567293489020831e-06, + "loss": 0.1533, + "step": 5624 + }, + { + "epoch": 0.5626, + "grad_norm": 0.053089503198862076, + "learning_rate": 9.560318816821354e-06, + "loss": 0.0124, + "step": 5626 + }, + { + "epoch": 0.5628, + "grad_norm": 0.005421430338174105, + "learning_rate": 9.553344358917141e-06, + "loss": 0.0022, + "step": 5628 + }, + { + "epoch": 0.563, + "grad_norm": 0.004718018230050802, + "learning_rate": 9.546370118707463e-06, + "loss": 0.002, + "step": 5630 + }, + { + "epoch": 0.5632, + "grad_norm": 0.045248810201883316, + "learning_rate": 9.539396099591477e-06, + "loss": 0.0035, + "step": 5632 + }, + { + "epoch": 0.5634, + "grad_norm": 0.5779730081558228, + "learning_rate": 9.532422304968243e-06, + "loss": 0.0187, + "step": 5634 + }, + { + "epoch": 0.5636, + "grad_norm": 0.03181467950344086, + "learning_rate": 9.525448738236691e-06, + "loss": 0.0011, + "step": 5636 + }, + { + "epoch": 0.5638, + "grad_norm": 0.93709397315979, + "learning_rate": 9.518475402795661e-06, + "loss": 0.0165, + "step": 5638 + }, + { + "epoch": 0.564, + "grad_norm": 0.051886703819036484, + "learning_rate": 9.511502302043867e-06, + "loss": 0.0112, + "step": 5640 + }, + { + "epoch": 0.5642, + "grad_norm": 0.003035301109775901, + "learning_rate": 9.504529439379921e-06, + "loss": 0.0006, + "step": 5642 + }, + { + "epoch": 0.5644, + "grad_norm": 0.0413944348692894, + "learning_rate": 9.497556818202306e-06, + "loss": 0.0006, + "step": 5644 + }, + { + "epoch": 0.5646, + "grad_norm": 0.013973523862659931, + "learning_rate": 9.490584441909392e-06, + "loss": 0.0043, + "step": 5646 + }, + { + "epoch": 0.5648, + "grad_norm": 0.07701071351766586, + "learning_rate": 9.483612313899436e-06, + "loss": 0.0018, + "step": 5648 + }, + { + "epoch": 0.565, + "grad_norm": 0.004722742363810539, + "learning_rate": 9.476640437570562e-06, + "loss": 0.0002, + "step": 5650 + }, + { + "epoch": 0.5652, + "grad_norm": 0.05006446689367294, + "learning_rate": 9.469668816320785e-06, + "loss": 0.0057, + "step": 5652 + }, + { + "epoch": 0.5654, + "grad_norm": 0.0033187023364007473, + "learning_rate": 9.46269745354798e-06, + "loss": 0.0276, + "step": 5654 + }, + { + "epoch": 0.5656, + "grad_norm": 0.05654537305235863, + "learning_rate": 9.45572635264991e-06, + "loss": 0.0028, + "step": 5656 + }, + { + "epoch": 0.5658, + "grad_norm": 0.009696856141090393, + "learning_rate": 9.448755517024207e-06, + "loss": 0.0003, + "step": 5658 + }, + { + "epoch": 0.566, + "grad_norm": 0.021220305934548378, + "learning_rate": 9.441784950068362e-06, + "loss": 0.0004, + "step": 5660 + }, + { + "epoch": 0.5662, + "grad_norm": 0.007797572296112776, + "learning_rate": 9.434814655179756e-06, + "loss": 0.0015, + "step": 5662 + }, + { + "epoch": 0.5664, + "grad_norm": 0.015251967124640942, + "learning_rate": 9.42784463575562e-06, + "loss": 0.0026, + "step": 5664 + }, + { + "epoch": 0.5666, + "grad_norm": 0.04089045897126198, + "learning_rate": 9.420874895193056e-06, + "loss": 0.002, + "step": 5666 + }, + { + "epoch": 0.5668, + "grad_norm": 0.07888978719711304, + "learning_rate": 9.413905436889035e-06, + "loss": 0.0011, + "step": 5668 + }, + { + "epoch": 0.567, + "grad_norm": 1.219332218170166, + "learning_rate": 9.406936264240386e-06, + "loss": 0.0226, + "step": 5670 + }, + { + "epoch": 0.5672, + "grad_norm": 0.010707270354032516, + "learning_rate": 9.399967380643795e-06, + "loss": 0.001, + "step": 5672 + }, + { + "epoch": 0.5674, + "grad_norm": 0.01889883354306221, + "learning_rate": 9.392998789495813e-06, + "loss": 0.0024, + "step": 5674 + }, + { + "epoch": 0.5676, + "grad_norm": 12.52667236328125, + "learning_rate": 9.386030494192847e-06, + "loss": 0.071, + "step": 5676 + }, + { + "epoch": 0.5678, + "grad_norm": 1.1745764017105103, + "learning_rate": 9.379062498131161e-06, + "loss": 0.0323, + "step": 5678 + }, + { + "epoch": 0.568, + "grad_norm": 0.0052489652298390865, + "learning_rate": 9.372094804706867e-06, + "loss": 0.0002, + "step": 5680 + }, + { + "epoch": 0.5682, + "grad_norm": 2.0030677318573, + "learning_rate": 9.36512741731594e-06, + "loss": 0.0164, + "step": 5682 + }, + { + "epoch": 0.5684, + "grad_norm": 0.05206075310707092, + "learning_rate": 9.358160339354194e-06, + "loss": 0.0012, + "step": 5684 + }, + { + "epoch": 0.5686, + "grad_norm": 0.2692386507987976, + "learning_rate": 9.351193574217305e-06, + "loss": 0.0025, + "step": 5686 + }, + { + "epoch": 0.5688, + "grad_norm": 1.5064018964767456, + "learning_rate": 9.344227125300788e-06, + "loss": 0.0393, + "step": 5688 + }, + { + "epoch": 0.569, + "grad_norm": 0.002799364272505045, + "learning_rate": 9.337260996000002e-06, + "loss": 0.0002, + "step": 5690 + }, + { + "epoch": 0.5692, + "grad_norm": 4.173019886016846, + "learning_rate": 9.330295189710153e-06, + "loss": 0.0442, + "step": 5692 + }, + { + "epoch": 0.5694, + "grad_norm": 0.01426468975841999, + "learning_rate": 9.323329709826294e-06, + "loss": 0.0003, + "step": 5694 + }, + { + "epoch": 0.5696, + "grad_norm": 0.04507346451282501, + "learning_rate": 9.316364559743315e-06, + "loss": 0.0017, + "step": 5696 + }, + { + "epoch": 0.5698, + "grad_norm": 0.004786056932061911, + "learning_rate": 9.309399742855943e-06, + "loss": 0.0002, + "step": 5698 + }, + { + "epoch": 0.57, + "grad_norm": 0.02857264131307602, + "learning_rate": 9.302435262558748e-06, + "loss": 0.0004, + "step": 5700 + }, + { + "epoch": 0.5702, + "grad_norm": 0.019242098554968834, + "learning_rate": 9.295471122246131e-06, + "loss": 0.0005, + "step": 5702 + }, + { + "epoch": 0.5704, + "grad_norm": 0.016486436128616333, + "learning_rate": 9.288507325312334e-06, + "loss": 0.0005, + "step": 5704 + }, + { + "epoch": 0.5706, + "grad_norm": 0.012178613804280758, + "learning_rate": 9.281543875151419e-06, + "loss": 0.0003, + "step": 5706 + }, + { + "epoch": 0.5708, + "grad_norm": 0.004035234451293945, + "learning_rate": 9.274580775157294e-06, + "loss": 0.0012, + "step": 5708 + }, + { + "epoch": 0.571, + "grad_norm": 4.219400882720947, + "learning_rate": 9.267618028723687e-06, + "loss": 0.03, + "step": 5710 + }, + { + "epoch": 0.5712, + "grad_norm": 0.023604754358530045, + "learning_rate": 9.260655639244152e-06, + "loss": 0.0004, + "step": 5712 + }, + { + "epoch": 0.5714, + "grad_norm": 0.008218868635594845, + "learning_rate": 9.253693610112079e-06, + "loss": 0.0001, + "step": 5714 + }, + { + "epoch": 0.5716, + "grad_norm": 1.4292826652526855, + "learning_rate": 9.246731944720675e-06, + "loss": 0.0454, + "step": 5716 + }, + { + "epoch": 0.5718, + "grad_norm": 0.6374015212059021, + "learning_rate": 9.239770646462968e-06, + "loss": 0.0037, + "step": 5718 + }, + { + "epoch": 0.572, + "grad_norm": 0.22700774669647217, + "learning_rate": 9.232809718731815e-06, + "loss": 0.0024, + "step": 5720 + }, + { + "epoch": 0.5722, + "grad_norm": 0.02398046851158142, + "learning_rate": 9.225849164919886e-06, + "loss": 0.0018, + "step": 5722 + }, + { + "epoch": 0.5724, + "grad_norm": 0.005302689038217068, + "learning_rate": 9.218888988419668e-06, + "loss": 0.0002, + "step": 5724 + }, + { + "epoch": 0.5726, + "grad_norm": 0.026787390932440758, + "learning_rate": 9.211929192623466e-06, + "loss": 0.0004, + "step": 5726 + }, + { + "epoch": 0.5728, + "grad_norm": 0.005451446399092674, + "learning_rate": 9.204969780923404e-06, + "loss": 0.0018, + "step": 5728 + }, + { + "epoch": 0.573, + "grad_norm": 2.1429078578948975, + "learning_rate": 9.198010756711413e-06, + "loss": 0.0103, + "step": 5730 + }, + { + "epoch": 0.5732, + "grad_norm": 0.05866081640124321, + "learning_rate": 9.191052123379234e-06, + "loss": 0.0004, + "step": 5732 + }, + { + "epoch": 0.5734, + "grad_norm": 0.0014736796729266644, + "learning_rate": 9.184093884318426e-06, + "loss": 0.0004, + "step": 5734 + }, + { + "epoch": 0.5736, + "grad_norm": 0.0007752690580673516, + "learning_rate": 9.177136042920344e-06, + "loss": 0.0, + "step": 5736 + }, + { + "epoch": 0.5738, + "grad_norm": 0.032862622290849686, + "learning_rate": 9.170178602576161e-06, + "loss": 0.0014, + "step": 5738 + }, + { + "epoch": 0.574, + "grad_norm": 0.022066950798034668, + "learning_rate": 9.163221566676847e-06, + "loss": 0.0003, + "step": 5740 + }, + { + "epoch": 0.5742, + "grad_norm": 2.1261720657348633, + "learning_rate": 9.156264938613176e-06, + "loss": 0.0326, + "step": 5742 + }, + { + "epoch": 0.5744, + "grad_norm": 0.026734760031104088, + "learning_rate": 9.14930872177572e-06, + "loss": 0.0002, + "step": 5744 + }, + { + "epoch": 0.5746, + "grad_norm": 2.43144154548645, + "learning_rate": 9.142352919554862e-06, + "loss": 0.0543, + "step": 5746 + }, + { + "epoch": 0.5748, + "grad_norm": 4.606100559234619, + "learning_rate": 9.135397535340773e-06, + "loss": 0.0635, + "step": 5748 + }, + { + "epoch": 0.575, + "grad_norm": 0.07358632236719131, + "learning_rate": 9.128442572523418e-06, + "loss": 0.0004, + "step": 5750 + }, + { + "epoch": 0.5752, + "grad_norm": 0.009963885881006718, + "learning_rate": 9.121488034492569e-06, + "loss": 0.0002, + "step": 5752 + }, + { + "epoch": 0.5754, + "grad_norm": 0.09452220052480698, + "learning_rate": 9.114533924637778e-06, + "loss": 0.0012, + "step": 5754 + }, + { + "epoch": 0.5756, + "grad_norm": 0.004401827231049538, + "learning_rate": 9.107580246348395e-06, + "loss": 0.0001, + "step": 5756 + }, + { + "epoch": 0.5758, + "grad_norm": 0.0013822222827002406, + "learning_rate": 9.100627003013563e-06, + "loss": 0.0005, + "step": 5758 + }, + { + "epoch": 0.576, + "grad_norm": 0.0048095160163939, + "learning_rate": 9.093674198022201e-06, + "loss": 0.0001, + "step": 5760 + }, + { + "epoch": 0.5762, + "grad_norm": 0.4330982565879822, + "learning_rate": 9.086721834763024e-06, + "loss": 0.0038, + "step": 5762 + }, + { + "epoch": 0.5764, + "grad_norm": 0.0014385252725332975, + "learning_rate": 9.07976991662453e-06, + "loss": 0.0003, + "step": 5764 + }, + { + "epoch": 0.5766, + "grad_norm": 4.225450038909912, + "learning_rate": 9.072818446995e-06, + "loss": 0.0404, + "step": 5766 + }, + { + "epoch": 0.5768, + "grad_norm": 0.15428045392036438, + "learning_rate": 9.065867429262497e-06, + "loss": 0.0021, + "step": 5768 + }, + { + "epoch": 0.577, + "grad_norm": 0.11834679543972015, + "learning_rate": 9.058916866814857e-06, + "loss": 0.0009, + "step": 5770 + }, + { + "epoch": 0.5772, + "grad_norm": 0.0019458065507933497, + "learning_rate": 9.051966763039706e-06, + "loss": 0.0435, + "step": 5772 + }, + { + "epoch": 0.5774, + "grad_norm": 0.0013528342824429274, + "learning_rate": 9.045017121324438e-06, + "loss": 0.0002, + "step": 5774 + }, + { + "epoch": 0.5776, + "grad_norm": 0.0008934661746025085, + "learning_rate": 9.038067945056229e-06, + "loss": 0.0, + "step": 5776 + }, + { + "epoch": 0.5778, + "grad_norm": 0.01822054386138916, + "learning_rate": 9.031119237622011e-06, + "loss": 0.0003, + "step": 5778 + }, + { + "epoch": 0.578, + "grad_norm": 0.014758003875613213, + "learning_rate": 9.024171002408507e-06, + "loss": 0.0002, + "step": 5780 + }, + { + "epoch": 0.5782, + "grad_norm": 0.08650072664022446, + "learning_rate": 9.017223242802205e-06, + "loss": 0.0385, + "step": 5782 + }, + { + "epoch": 0.5784, + "grad_norm": 0.0012293404433876276, + "learning_rate": 9.01027596218935e-06, + "loss": 0.0002, + "step": 5784 + }, + { + "epoch": 0.5786, + "grad_norm": 0.0060580577701330185, + "learning_rate": 9.003329163955973e-06, + "loss": 0.0388, + "step": 5786 + }, + { + "epoch": 0.5788, + "grad_norm": 0.00046603221562691033, + "learning_rate": 8.996382851487851e-06, + "loss": 0.0001, + "step": 5788 + }, + { + "epoch": 0.579, + "grad_norm": 0.0025162629317492247, + "learning_rate": 8.989437028170537e-06, + "loss": 0.0001, + "step": 5790 + }, + { + "epoch": 0.5792, + "grad_norm": 0.021162012591958046, + "learning_rate": 8.982491697389339e-06, + "loss": 0.0002, + "step": 5792 + }, + { + "epoch": 0.5794, + "grad_norm": 0.008044324815273285, + "learning_rate": 8.975546862529328e-06, + "loss": 0.0366, + "step": 5794 + }, + { + "epoch": 0.5796, + "grad_norm": 0.01782507635653019, + "learning_rate": 8.968602526975329e-06, + "loss": 0.0003, + "step": 5796 + }, + { + "epoch": 0.5798, + "grad_norm": 0.005040779709815979, + "learning_rate": 8.961658694111929e-06, + "loss": 0.0007, + "step": 5798 + }, + { + "epoch": 0.58, + "grad_norm": 0.0017778860637918115, + "learning_rate": 8.954715367323468e-06, + "loss": 0.3753, + "step": 5800 + }, + { + "epoch": 0.5802, + "grad_norm": 0.003766189096495509, + "learning_rate": 8.947772549994037e-06, + "loss": 0.0003, + "step": 5802 + }, + { + "epoch": 0.5804, + "grad_norm": 0.02651449851691723, + "learning_rate": 8.940830245507483e-06, + "loss": 0.0341, + "step": 5804 + }, + { + "epoch": 0.5806, + "grad_norm": 0.0227938462048769, + "learning_rate": 8.933888457247402e-06, + "loss": 0.0488, + "step": 5806 + }, + { + "epoch": 0.5808, + "grad_norm": 0.009680415503680706, + "learning_rate": 8.926947188597133e-06, + "loss": 0.0003, + "step": 5808 + }, + { + "epoch": 0.581, + "grad_norm": 0.17737530171871185, + "learning_rate": 8.920006442939772e-06, + "loss": 0.0031, + "step": 5810 + }, + { + "epoch": 0.5812, + "grad_norm": 0.030742216855287552, + "learning_rate": 8.913066223658152e-06, + "loss": 0.0024, + "step": 5812 + }, + { + "epoch": 0.5814, + "grad_norm": 0.009170843288302422, + "learning_rate": 8.906126534134849e-06, + "loss": 0.0011, + "step": 5814 + }, + { + "epoch": 0.5816, + "grad_norm": 0.030965084210038185, + "learning_rate": 8.89918737775218e-06, + "loss": 0.0072, + "step": 5816 + }, + { + "epoch": 0.5818, + "grad_norm": 11.167587280273438, + "learning_rate": 8.892248757892215e-06, + "loss": 0.252, + "step": 5818 + }, + { + "epoch": 0.582, + "grad_norm": 0.02400728315114975, + "learning_rate": 8.885310677936746e-06, + "loss": 0.0013, + "step": 5820 + }, + { + "epoch": 0.5822, + "grad_norm": 0.017089052125811577, + "learning_rate": 8.878373141267312e-06, + "loss": 0.0027, + "step": 5822 + }, + { + "epoch": 0.5824, + "grad_norm": 0.13953670859336853, + "learning_rate": 8.871436151265183e-06, + "loss": 0.0655, + "step": 5824 + }, + { + "epoch": 0.5826, + "grad_norm": 0.3469284772872925, + "learning_rate": 8.864499711311362e-06, + "loss": 0.0042, + "step": 5826 + }, + { + "epoch": 0.5828, + "grad_norm": 0.011598905548453331, + "learning_rate": 8.857563824786598e-06, + "loss": 0.0008, + "step": 5828 + }, + { + "epoch": 0.583, + "grad_norm": 0.8659214973449707, + "learning_rate": 8.850628495071336e-06, + "loss": 0.0108, + "step": 5830 + }, + { + "epoch": 0.5832, + "grad_norm": 0.011860637925565243, + "learning_rate": 8.843693725545787e-06, + "loss": 0.0009, + "step": 5832 + }, + { + "epoch": 0.5834, + "grad_norm": 0.013463614508509636, + "learning_rate": 8.836759519589869e-06, + "loss": 0.0004, + "step": 5834 + }, + { + "epoch": 0.5836, + "grad_norm": 2.133589029312134, + "learning_rate": 8.829825880583228e-06, + "loss": 0.0734, + "step": 5836 + }, + { + "epoch": 0.5838, + "grad_norm": 0.0055556753650307655, + "learning_rate": 8.822892811905237e-06, + "loss": 0.001, + "step": 5838 + }, + { + "epoch": 0.584, + "grad_norm": 3.836643695831299, + "learning_rate": 8.815960316934991e-06, + "loss": 0.0451, + "step": 5840 + }, + { + "epoch": 0.5842, + "grad_norm": 0.3377346694469452, + "learning_rate": 8.809028399051302e-06, + "loss": 0.0083, + "step": 5842 + }, + { + "epoch": 0.5844, + "grad_norm": 0.869687557220459, + "learning_rate": 8.802097061632706e-06, + "loss": 0.0112, + "step": 5844 + }, + { + "epoch": 0.5846, + "grad_norm": 0.2313605099916458, + "learning_rate": 8.79516630805745e-06, + "loss": 0.0026, + "step": 5846 + }, + { + "epoch": 0.5848, + "grad_norm": 0.20361725986003876, + "learning_rate": 8.788236141703498e-06, + "loss": 0.0222, + "step": 5848 + }, + { + "epoch": 0.585, + "grad_norm": 0.015002727508544922, + "learning_rate": 8.781306565948528e-06, + "loss": 0.0002, + "step": 5850 + }, + { + "epoch": 0.5852, + "grad_norm": 0.04638330638408661, + "learning_rate": 8.774377584169934e-06, + "loss": 0.0007, + "step": 5852 + }, + { + "epoch": 0.5854, + "grad_norm": 0.025802521035075188, + "learning_rate": 8.767449199744813e-06, + "loss": 0.0043, + "step": 5854 + }, + { + "epoch": 0.5856, + "grad_norm": 0.16406133770942688, + "learning_rate": 8.760521416049983e-06, + "loss": 0.0085, + "step": 5856 + }, + { + "epoch": 0.5858, + "grad_norm": 1.70779287815094, + "learning_rate": 8.753594236461957e-06, + "loss": 0.0416, + "step": 5858 + }, + { + "epoch": 0.586, + "grad_norm": 0.006545050535351038, + "learning_rate": 8.746667664356957e-06, + "loss": 0.0026, + "step": 5860 + }, + { + "epoch": 0.5862, + "grad_norm": 0.7421078681945801, + "learning_rate": 8.739741703110914e-06, + "loss": 0.0085, + "step": 5862 + }, + { + "epoch": 0.5864, + "grad_norm": 0.020199665799736977, + "learning_rate": 8.732816356099455e-06, + "loss": 0.007, + "step": 5864 + }, + { + "epoch": 0.5866, + "grad_norm": 0.07701466977596283, + "learning_rate": 8.725891626697912e-06, + "loss": 0.0018, + "step": 5866 + }, + { + "epoch": 0.5868, + "grad_norm": 0.17428062856197357, + "learning_rate": 8.718967518281307e-06, + "loss": 0.0029, + "step": 5868 + }, + { + "epoch": 0.587, + "grad_norm": 0.04930286109447479, + "learning_rate": 8.712044034224374e-06, + "loss": 0.0009, + "step": 5870 + }, + { + "epoch": 0.5872, + "grad_norm": 0.3213080167770386, + "learning_rate": 8.705121177901532e-06, + "loss": 0.0052, + "step": 5872 + }, + { + "epoch": 0.5874, + "grad_norm": 0.07222215831279755, + "learning_rate": 8.698198952686896e-06, + "loss": 0.0013, + "step": 5874 + }, + { + "epoch": 0.5876, + "grad_norm": 0.008136420510709286, + "learning_rate": 8.69127736195428e-06, + "loss": 0.2069, + "step": 5876 + }, + { + "epoch": 0.5878, + "grad_norm": 0.02528064139187336, + "learning_rate": 8.684356409077177e-06, + "loss": 0.0003, + "step": 5878 + }, + { + "epoch": 0.588, + "grad_norm": 0.012340657413005829, + "learning_rate": 8.677436097428775e-06, + "loss": 0.0003, + "step": 5880 + }, + { + "epoch": 0.5882, + "grad_norm": 0.0071798344142735004, + "learning_rate": 8.670516430381958e-06, + "loss": 0.0004, + "step": 5882 + }, + { + "epoch": 0.5884, + "grad_norm": 0.008808487094938755, + "learning_rate": 8.663597411309278e-06, + "loss": 0.0002, + "step": 5884 + }, + { + "epoch": 0.5886, + "grad_norm": 0.0046999575570225716, + "learning_rate": 8.656679043582986e-06, + "loss": 0.0003, + "step": 5886 + }, + { + "epoch": 0.5888, + "grad_norm": 0.009526312351226807, + "learning_rate": 8.649761330575009e-06, + "loss": 0.0002, + "step": 5888 + }, + { + "epoch": 0.589, + "grad_norm": 0.004400074947625399, + "learning_rate": 8.642844275656957e-06, + "loss": 0.0196, + "step": 5890 + }, + { + "epoch": 0.5892, + "grad_norm": 0.004190748557448387, + "learning_rate": 8.635927882200117e-06, + "loss": 0.0024, + "step": 5892 + }, + { + "epoch": 0.5894, + "grad_norm": 0.00404746737331152, + "learning_rate": 8.629012153575458e-06, + "loss": 0.0003, + "step": 5894 + }, + { + "epoch": 0.5896, + "grad_norm": 0.04544049873948097, + "learning_rate": 8.62209709315362e-06, + "loss": 0.0014, + "step": 5896 + }, + { + "epoch": 0.5898, + "grad_norm": 0.7943575978279114, + "learning_rate": 8.615182704304918e-06, + "loss": 0.0159, + "step": 5898 + }, + { + "epoch": 0.59, + "grad_norm": 0.0022741418797522783, + "learning_rate": 8.60826899039935e-06, + "loss": 0.0004, + "step": 5900 + }, + { + "epoch": 0.5902, + "grad_norm": 0.002504084026440978, + "learning_rate": 8.601355954806562e-06, + "loss": 0.0085, + "step": 5902 + }, + { + "epoch": 0.5904, + "grad_norm": 0.021921195089817047, + "learning_rate": 8.594443600895892e-06, + "loss": 0.0005, + "step": 5904 + }, + { + "epoch": 0.5906, + "grad_norm": 0.06554477661848068, + "learning_rate": 8.587531932036334e-06, + "loss": 0.0027, + "step": 5906 + }, + { + "epoch": 0.5908, + "grad_norm": 0.004686989355832338, + "learning_rate": 8.580620951596556e-06, + "loss": 0.0002, + "step": 5908 + }, + { + "epoch": 0.591, + "grad_norm": 0.0050573647022247314, + "learning_rate": 8.573710662944884e-06, + "loss": 0.0004, + "step": 5910 + }, + { + "epoch": 0.5912, + "grad_norm": 0.007796574849635363, + "learning_rate": 8.566801069449307e-06, + "loss": 0.0004, + "step": 5912 + }, + { + "epoch": 0.5914, + "grad_norm": 0.016195226460695267, + "learning_rate": 8.559892174477478e-06, + "loss": 0.0007, + "step": 5914 + }, + { + "epoch": 0.5916, + "grad_norm": 0.03886979818344116, + "learning_rate": 8.552983981396709e-06, + "loss": 0.0008, + "step": 5916 + }, + { + "epoch": 0.5918, + "grad_norm": 0.25255274772644043, + "learning_rate": 8.546076493573973e-06, + "loss": 0.0055, + "step": 5918 + }, + { + "epoch": 0.592, + "grad_norm": 1.9871923923492432, + "learning_rate": 8.539169714375885e-06, + "loss": 0.0218, + "step": 5920 + }, + { + "epoch": 0.5922, + "grad_norm": 0.01864420808851719, + "learning_rate": 8.532263647168735e-06, + "loss": 0.0003, + "step": 5922 + }, + { + "epoch": 0.5924, + "grad_norm": 0.10901441425085068, + "learning_rate": 8.525358295318454e-06, + "loss": 0.0018, + "step": 5924 + }, + { + "epoch": 0.5926, + "grad_norm": 0.07360785454511642, + "learning_rate": 8.518453662190622e-06, + "loss": 0.0008, + "step": 5926 + }, + { + "epoch": 0.5928, + "grad_norm": 0.008668484166264534, + "learning_rate": 8.511549751150478e-06, + "loss": 0.0002, + "step": 5928 + }, + { + "epoch": 0.593, + "grad_norm": 0.012917548418045044, + "learning_rate": 8.504646565562907e-06, + "loss": 0.0041, + "step": 5930 + }, + { + "epoch": 0.5932, + "grad_norm": 0.049521759152412415, + "learning_rate": 8.49774410879243e-06, + "loss": 0.0006, + "step": 5932 + }, + { + "epoch": 0.5934, + "grad_norm": 1.0640344619750977, + "learning_rate": 8.490842384203227e-06, + "loss": 0.014, + "step": 5934 + }, + { + "epoch": 0.5936, + "grad_norm": 0.005043534561991692, + "learning_rate": 8.483941395159114e-06, + "loss": 0.0012, + "step": 5936 + }, + { + "epoch": 0.5938, + "grad_norm": 3.716042995452881, + "learning_rate": 8.477041145023546e-06, + "loss": 0.1969, + "step": 5938 + }, + { + "epoch": 0.594, + "grad_norm": 0.008530069142580032, + "learning_rate": 8.47014163715962e-06, + "loss": 0.0002, + "step": 5940 + }, + { + "epoch": 0.5942, + "grad_norm": 0.009578175842761993, + "learning_rate": 8.46324287493008e-06, + "loss": 0.0002, + "step": 5942 + }, + { + "epoch": 0.5944, + "grad_norm": 0.008831110782921314, + "learning_rate": 8.45634486169729e-06, + "loss": 0.1016, + "step": 5944 + }, + { + "epoch": 0.5946, + "grad_norm": 0.00342177739366889, + "learning_rate": 8.449447600823262e-06, + "loss": 0.0003, + "step": 5946 + }, + { + "epoch": 0.5948, + "grad_norm": 0.00250125490128994, + "learning_rate": 8.44255109566964e-06, + "loss": 0.007, + "step": 5948 + }, + { + "epoch": 0.595, + "grad_norm": 0.006562127731740475, + "learning_rate": 8.43565534959769e-06, + "loss": 0.0006, + "step": 5950 + }, + { + "epoch": 0.5952, + "grad_norm": 0.016193782910704613, + "learning_rate": 8.428760365968327e-06, + "loss": 0.0003, + "step": 5952 + }, + { + "epoch": 0.5954, + "grad_norm": 1.4744502305984497, + "learning_rate": 8.421866148142066e-06, + "loss": 0.0276, + "step": 5954 + }, + { + "epoch": 0.5956, + "grad_norm": 0.008368766866624355, + "learning_rate": 8.414972699479076e-06, + "loss": 0.0006, + "step": 5956 + }, + { + "epoch": 0.5958, + "grad_norm": 0.08297697454690933, + "learning_rate": 8.408080023339134e-06, + "loss": 0.0142, + "step": 5958 + }, + { + "epoch": 0.596, + "grad_norm": 0.060819111764431, + "learning_rate": 8.401188123081653e-06, + "loss": 0.0011, + "step": 5960 + }, + { + "epoch": 0.5962, + "grad_norm": 0.0011768273543566465, + "learning_rate": 8.394297002065658e-06, + "loss": 0.0002, + "step": 5962 + }, + { + "epoch": 0.5964, + "grad_norm": 9.326791763305664, + "learning_rate": 8.387406663649796e-06, + "loss": 0.1663, + "step": 5964 + }, + { + "epoch": 0.5966, + "grad_norm": 0.020116111263632774, + "learning_rate": 8.380517111192336e-06, + "loss": 0.0004, + "step": 5966 + }, + { + "epoch": 0.5968, + "grad_norm": 0.00828049797564745, + "learning_rate": 8.373628348051165e-06, + "loss": 0.0002, + "step": 5968 + }, + { + "epoch": 0.597, + "grad_norm": 0.004319984465837479, + "learning_rate": 8.366740377583781e-06, + "loss": 0.0006, + "step": 5970 + }, + { + "epoch": 0.5972, + "grad_norm": 5.248754024505615, + "learning_rate": 8.35985320314729e-06, + "loss": 0.5006, + "step": 5972 + }, + { + "epoch": 0.5974, + "grad_norm": 0.09317602217197418, + "learning_rate": 8.352966828098428e-06, + "loss": 0.0017, + "step": 5974 + }, + { + "epoch": 0.5976, + "grad_norm": 7.860866546630859, + "learning_rate": 8.346081255793524e-06, + "loss": 0.2264, + "step": 5976 + }, + { + "epoch": 0.5978, + "grad_norm": 0.006181064061820507, + "learning_rate": 8.339196489588522e-06, + "loss": 0.0023, + "step": 5978 + }, + { + "epoch": 0.598, + "grad_norm": 0.01497645489871502, + "learning_rate": 8.332312532838978e-06, + "loss": 0.0003, + "step": 5980 + }, + { + "epoch": 0.5982, + "grad_norm": 0.08517112582921982, + "learning_rate": 8.325429388900046e-06, + "loss": 0.0028, + "step": 5982 + }, + { + "epoch": 0.5984, + "grad_norm": 0.00567723298445344, + "learning_rate": 8.318547061126485e-06, + "loss": 0.0003, + "step": 5984 + }, + { + "epoch": 0.5986, + "grad_norm": 0.006761125288903713, + "learning_rate": 8.311665552872662e-06, + "loss": 0.0003, + "step": 5986 + }, + { + "epoch": 0.5988, + "grad_norm": 0.07192026823759079, + "learning_rate": 8.30478486749254e-06, + "loss": 0.007, + "step": 5988 + }, + { + "epoch": 0.599, + "grad_norm": 0.02466716431081295, + "learning_rate": 8.297905008339677e-06, + "loss": 0.0006, + "step": 5990 + }, + { + "epoch": 0.5992, + "grad_norm": 0.006783660035580397, + "learning_rate": 8.291025978767236e-06, + "loss": 0.0012, + "step": 5992 + }, + { + "epoch": 0.5994, + "grad_norm": 0.06668759882450104, + "learning_rate": 8.284147782127971e-06, + "loss": 0.0034, + "step": 5994 + }, + { + "epoch": 0.5996, + "grad_norm": 0.3380028307437897, + "learning_rate": 8.277270421774234e-06, + "loss": 0.008, + "step": 5996 + }, + { + "epoch": 0.5998, + "grad_norm": 3.6517951488494873, + "learning_rate": 8.270393901057964e-06, + "loss": 0.0526, + "step": 5998 + }, + { + "epoch": 0.6, + "grad_norm": 1.6406141519546509, + "learning_rate": 8.263518223330698e-06, + "loss": 0.0464, + "step": 6000 + }, + { + "epoch": 0.6002, + "grad_norm": 0.01079369056969881, + "learning_rate": 8.25664339194355e-06, + "loss": 0.0003, + "step": 6002 + }, + { + "epoch": 0.6004, + "grad_norm": 0.23735855519771576, + "learning_rate": 8.249769410247239e-06, + "loss": 0.0043, + "step": 6004 + }, + { + "epoch": 0.6006, + "grad_norm": 0.030489902943372726, + "learning_rate": 8.242896281592057e-06, + "loss": 0.0012, + "step": 6006 + }, + { + "epoch": 0.6008, + "grad_norm": 0.13907794654369354, + "learning_rate": 8.236024009327879e-06, + "loss": 0.0027, + "step": 6008 + }, + { + "epoch": 0.601, + "grad_norm": 0.09063569456338882, + "learning_rate": 8.22915259680417e-06, + "loss": 0.0028, + "step": 6010 + }, + { + "epoch": 0.6012, + "grad_norm": 0.03285244107246399, + "learning_rate": 8.222282047369972e-06, + "loss": 0.0143, + "step": 6012 + }, + { + "epoch": 0.6014, + "grad_norm": 0.006735119502991438, + "learning_rate": 8.215412364373908e-06, + "loss": 0.0007, + "step": 6014 + }, + { + "epoch": 0.6016, + "grad_norm": 0.680019736289978, + "learning_rate": 8.208543551164178e-06, + "loss": 0.0179, + "step": 6016 + }, + { + "epoch": 0.6018, + "grad_norm": 0.7417156100273132, + "learning_rate": 8.201675611088558e-06, + "loss": 0.0123, + "step": 6018 + }, + { + "epoch": 0.602, + "grad_norm": 0.0142035111784935, + "learning_rate": 8.194808547494401e-06, + "loss": 0.0004, + "step": 6020 + }, + { + "epoch": 0.6022, + "grad_norm": 0.15673121809959412, + "learning_rate": 8.187942363728626e-06, + "loss": 0.0034, + "step": 6022 + }, + { + "epoch": 0.6024, + "grad_norm": 0.3712215721607208, + "learning_rate": 8.181077063137733e-06, + "loss": 0.0223, + "step": 6024 + }, + { + "epoch": 0.6026, + "grad_norm": 0.030768418684601784, + "learning_rate": 8.174212649067781e-06, + "loss": 0.0007, + "step": 6026 + }, + { + "epoch": 0.6028, + "grad_norm": 0.1611310839653015, + "learning_rate": 8.167349124864406e-06, + "loss": 0.0033, + "step": 6028 + }, + { + "epoch": 0.603, + "grad_norm": 0.016511669382452965, + "learning_rate": 8.1604864938728e-06, + "loss": 0.0005, + "step": 6030 + }, + { + "epoch": 0.6032, + "grad_norm": 0.006587593350559473, + "learning_rate": 8.153624759437733e-06, + "loss": 0.0007, + "step": 6032 + }, + { + "epoch": 0.6034, + "grad_norm": 0.01333674043416977, + "learning_rate": 8.146763924903527e-06, + "loss": 0.0005, + "step": 6034 + }, + { + "epoch": 0.6036, + "grad_norm": 0.024993792176246643, + "learning_rate": 8.139903993614069e-06, + "loss": 0.0022, + "step": 6036 + }, + { + "epoch": 0.6038, + "grad_norm": 0.005766687449067831, + "learning_rate": 8.133044968912811e-06, + "loss": 0.0013, + "step": 6038 + }, + { + "epoch": 0.604, + "grad_norm": 0.004925284534692764, + "learning_rate": 8.126186854142752e-06, + "loss": 0.0024, + "step": 6040 + }, + { + "epoch": 0.6042, + "grad_norm": 0.09375402331352234, + "learning_rate": 8.119329652646463e-06, + "loss": 0.0021, + "step": 6042 + }, + { + "epoch": 0.6044, + "grad_norm": 0.008857821114361286, + "learning_rate": 8.112473367766051e-06, + "loss": 0.0008, + "step": 6044 + }, + { + "epoch": 0.6046, + "grad_norm": 0.04930859059095383, + "learning_rate": 8.10561800284319e-06, + "loss": 0.041, + "step": 6046 + }, + { + "epoch": 0.6048, + "grad_norm": 6.532694339752197, + "learning_rate": 8.098763561219101e-06, + "loss": 0.1552, + "step": 6048 + }, + { + "epoch": 0.605, + "grad_norm": 0.0033741428051143885, + "learning_rate": 8.091910046234552e-06, + "loss": 0.0008, + "step": 6050 + }, + { + "epoch": 0.6052, + "grad_norm": 0.0026470657903701067, + "learning_rate": 8.08505746122987e-06, + "loss": 0.0668, + "step": 6052 + }, + { + "epoch": 0.6054, + "grad_norm": 0.0033008528407663107, + "learning_rate": 8.078205809544918e-06, + "loss": 0.0032, + "step": 6054 + }, + { + "epoch": 0.6056, + "grad_norm": 5.3650126457214355, + "learning_rate": 8.07135509451911e-06, + "loss": 0.0945, + "step": 6056 + }, + { + "epoch": 0.6058, + "grad_norm": 0.2547042965888977, + "learning_rate": 8.064505319491398e-06, + "loss": 0.0037, + "step": 6058 + }, + { + "epoch": 0.606, + "grad_norm": 0.04769492149353027, + "learning_rate": 8.057656487800283e-06, + "loss": 0.001, + "step": 6060 + }, + { + "epoch": 0.6062, + "grad_norm": 0.04581143707036972, + "learning_rate": 8.050808602783797e-06, + "loss": 0.0006, + "step": 6062 + }, + { + "epoch": 0.6064, + "grad_norm": 5.054426193237305, + "learning_rate": 8.04396166777952e-06, + "loss": 0.2404, + "step": 6064 + }, + { + "epoch": 0.6066, + "grad_norm": 0.10857391357421875, + "learning_rate": 8.037115686124564e-06, + "loss": 0.0021, + "step": 6066 + }, + { + "epoch": 0.6068, + "grad_norm": 0.10616010427474976, + "learning_rate": 8.030270661155575e-06, + "loss": 0.0015, + "step": 6068 + }, + { + "epoch": 0.607, + "grad_norm": 0.008366181515157223, + "learning_rate": 8.023426596208739e-06, + "loss": 0.0007, + "step": 6070 + }, + { + "epoch": 0.6072, + "grad_norm": 0.003918754868209362, + "learning_rate": 8.016583494619769e-06, + "loss": 0.0012, + "step": 6072 + }, + { + "epoch": 0.6074, + "grad_norm": 0.0247996486723423, + "learning_rate": 8.009741359723906e-06, + "loss": 0.0023, + "step": 6074 + }, + { + "epoch": 0.6076, + "grad_norm": 0.010689574293792248, + "learning_rate": 8.00290019485593e-06, + "loss": 0.0004, + "step": 6076 + }, + { + "epoch": 0.6078, + "grad_norm": 2.391011953353882, + "learning_rate": 7.996060003350139e-06, + "loss": 0.0351, + "step": 6078 + }, + { + "epoch": 0.608, + "grad_norm": 0.11735475808382034, + "learning_rate": 7.989220788540356e-06, + "loss": 0.0028, + "step": 6080 + }, + { + "epoch": 0.6082, + "grad_norm": 0.005976101849228144, + "learning_rate": 7.982382553759931e-06, + "loss": 0.0017, + "step": 6082 + }, + { + "epoch": 0.6084, + "grad_norm": 0.18079832196235657, + "learning_rate": 7.975545302341743e-06, + "loss": 0.0177, + "step": 6084 + }, + { + "epoch": 0.6086, + "grad_norm": 0.05345518887042999, + "learning_rate": 7.96870903761818e-06, + "loss": 0.0123, + "step": 6086 + }, + { + "epoch": 0.6088, + "grad_norm": 0.005460211541503668, + "learning_rate": 7.961873762921153e-06, + "loss": 0.0004, + "step": 6088 + }, + { + "epoch": 0.609, + "grad_norm": 0.013262820430099964, + "learning_rate": 7.955039481582098e-06, + "loss": 0.0003, + "step": 6090 + }, + { + "epoch": 0.6092, + "grad_norm": 0.2672465443611145, + "learning_rate": 7.948206196931953e-06, + "loss": 0.005, + "step": 6092 + }, + { + "epoch": 0.6094, + "grad_norm": 0.510901927947998, + "learning_rate": 7.94137391230119e-06, + "loss": 0.0157, + "step": 6094 + }, + { + "epoch": 0.6096, + "grad_norm": 0.13884001970291138, + "learning_rate": 7.934542631019767e-06, + "loss": 0.0015, + "step": 6096 + }, + { + "epoch": 0.6098, + "grad_norm": 0.0022263049613684416, + "learning_rate": 7.927712356417176e-06, + "loss": 0.0002, + "step": 6098 + }, + { + "epoch": 0.61, + "grad_norm": 0.07120132446289062, + "learning_rate": 7.92088309182241e-06, + "loss": 0.0011, + "step": 6100 + }, + { + "epoch": 0.6102, + "grad_norm": 0.05944075435400009, + "learning_rate": 7.914054840563962e-06, + "loss": 0.0012, + "step": 6102 + }, + { + "epoch": 0.6104, + "grad_norm": 1.5682899951934814, + "learning_rate": 7.907227605969849e-06, + "loss": 0.0258, + "step": 6104 + }, + { + "epoch": 0.6106, + "grad_norm": 0.022935714572668076, + "learning_rate": 7.900401391367576e-06, + "loss": 0.0005, + "step": 6106 + }, + { + "epoch": 0.6108, + "grad_norm": 0.0007354081608355045, + "learning_rate": 7.89357620008416e-06, + "loss": 0.0003, + "step": 6108 + }, + { + "epoch": 0.611, + "grad_norm": 0.007869753055274487, + "learning_rate": 7.886752035446116e-06, + "loss": 0.0496, + "step": 6110 + }, + { + "epoch": 0.6112, + "grad_norm": 0.011022109538316727, + "learning_rate": 7.879928900779457e-06, + "loss": 0.0007, + "step": 6112 + }, + { + "epoch": 0.6114, + "grad_norm": 0.7571703195571899, + "learning_rate": 7.873106799409696e-06, + "loss": 0.037, + "step": 6114 + }, + { + "epoch": 0.6116, + "grad_norm": 0.015296105295419693, + "learning_rate": 7.866285734661842e-06, + "loss": 0.0003, + "step": 6116 + }, + { + "epoch": 0.6118, + "grad_norm": 0.6294286251068115, + "learning_rate": 7.8594657098604e-06, + "loss": 0.0115, + "step": 6118 + }, + { + "epoch": 0.612, + "grad_norm": 0.015128479339182377, + "learning_rate": 7.852646728329368e-06, + "loss": 0.0028, + "step": 6120 + }, + { + "epoch": 0.6122, + "grad_norm": 0.008681886829435825, + "learning_rate": 7.845828793392236e-06, + "loss": 0.0004, + "step": 6122 + }, + { + "epoch": 0.6124, + "grad_norm": 0.017843838781118393, + "learning_rate": 7.83901190837198e-06, + "loss": 0.0003, + "step": 6124 + }, + { + "epoch": 0.6126, + "grad_norm": 0.047331303358078, + "learning_rate": 7.832196076591067e-06, + "loss": 0.0189, + "step": 6126 + }, + { + "epoch": 0.6128, + "grad_norm": 0.0312684029340744, + "learning_rate": 7.825381301371452e-06, + "loss": 0.0008, + "step": 6128 + }, + { + "epoch": 0.613, + "grad_norm": 0.014667468145489693, + "learning_rate": 7.818567586034578e-06, + "loss": 0.0002, + "step": 6130 + }, + { + "epoch": 0.6132, + "grad_norm": 0.09258867055177689, + "learning_rate": 7.811754933901358e-06, + "loss": 0.0017, + "step": 6132 + }, + { + "epoch": 0.6134, + "grad_norm": 0.04239170625805855, + "learning_rate": 7.804943348292197e-06, + "loss": 0.001, + "step": 6134 + }, + { + "epoch": 0.6136, + "grad_norm": 0.01511534396559, + "learning_rate": 7.798132832526986e-06, + "loss": 0.0011, + "step": 6136 + }, + { + "epoch": 0.6138, + "grad_norm": 0.006626872345805168, + "learning_rate": 7.791323389925084e-06, + "loss": 0.0003, + "step": 6138 + }, + { + "epoch": 0.614, + "grad_norm": 0.017680486664175987, + "learning_rate": 7.784515023805328e-06, + "loss": 0.0075, + "step": 6140 + }, + { + "epoch": 0.6142, + "grad_norm": 0.011893153190612793, + "learning_rate": 7.777707737486036e-06, + "loss": 0.0005, + "step": 6142 + }, + { + "epoch": 0.6144, + "grad_norm": 0.5383596420288086, + "learning_rate": 7.770901534284996e-06, + "loss": 0.0045, + "step": 6144 + }, + { + "epoch": 0.6146, + "grad_norm": 0.008556616492569447, + "learning_rate": 7.76409641751947e-06, + "loss": 0.0007, + "step": 6146 + }, + { + "epoch": 0.6148, + "grad_norm": 5.93360710144043, + "learning_rate": 7.757292390506191e-06, + "loss": 0.2529, + "step": 6148 + }, + { + "epoch": 0.615, + "grad_norm": 0.023564834147691727, + "learning_rate": 7.750489456561351e-06, + "loss": 0.0044, + "step": 6150 + }, + { + "epoch": 0.6152, + "grad_norm": 0.37154054641723633, + "learning_rate": 7.743687619000625e-06, + "loss": 0.0034, + "step": 6152 + }, + { + "epoch": 0.6154, + "grad_norm": 0.003975634463131428, + "learning_rate": 7.736886881139143e-06, + "loss": 0.0024, + "step": 6154 + }, + { + "epoch": 0.6156, + "grad_norm": 0.07548788189888, + "learning_rate": 7.730087246291503e-06, + "loss": 0.0132, + "step": 6156 + }, + { + "epoch": 0.6158, + "grad_norm": 0.010431443341076374, + "learning_rate": 7.72328871777176e-06, + "loss": 0.0013, + "step": 6158 + }, + { + "epoch": 0.616, + "grad_norm": 0.011982022784650326, + "learning_rate": 7.716491298893443e-06, + "loss": 0.0004, + "step": 6160 + }, + { + "epoch": 0.6162, + "grad_norm": 0.006679946091026068, + "learning_rate": 7.709694992969525e-06, + "loss": 0.0412, + "step": 6162 + }, + { + "epoch": 0.6164, + "grad_norm": 0.0022611997555941343, + "learning_rate": 7.702899803312443e-06, + "loss": 0.0001, + "step": 6164 + }, + { + "epoch": 0.6166, + "grad_norm": 0.024874437600374222, + "learning_rate": 7.696105733234099e-06, + "loss": 0.0018, + "step": 6166 + }, + { + "epoch": 0.6168, + "grad_norm": 0.01948913186788559, + "learning_rate": 7.689312786045823e-06, + "loss": 0.0004, + "step": 6168 + }, + { + "epoch": 0.617, + "grad_norm": 0.0830395296216011, + "learning_rate": 7.68252096505843e-06, + "loss": 0.0507, + "step": 6170 + }, + { + "epoch": 0.6172, + "grad_norm": 0.002193339169025421, + "learning_rate": 7.67573027358216e-06, + "loss": 0.0136, + "step": 6172 + }, + { + "epoch": 0.6174, + "grad_norm": 0.009014488197863102, + "learning_rate": 7.668940714926724e-06, + "loss": 0.0002, + "step": 6174 + }, + { + "epoch": 0.6176, + "grad_norm": 0.8518444299697876, + "learning_rate": 7.662152292401265e-06, + "loss": 0.0115, + "step": 6176 + }, + { + "epoch": 0.6178, + "grad_norm": 0.014260556548833847, + "learning_rate": 7.655365009314375e-06, + "loss": 0.0003, + "step": 6178 + }, + { + "epoch": 0.618, + "grad_norm": 0.0023976487573236227, + "learning_rate": 7.6485788689741e-06, + "loss": 0.0815, + "step": 6180 + }, + { + "epoch": 0.6182, + "grad_norm": 0.041796404868364334, + "learning_rate": 7.641793874687918e-06, + "loss": 0.0006, + "step": 6182 + }, + { + "epoch": 0.6184, + "grad_norm": 0.020484618842601776, + "learning_rate": 7.635010029762755e-06, + "loss": 0.0005, + "step": 6184 + }, + { + "epoch": 0.6186, + "grad_norm": 0.15321038663387299, + "learning_rate": 7.628227337504972e-06, + "loss": 0.001, + "step": 6186 + }, + { + "epoch": 0.6188, + "grad_norm": 6.54535436630249, + "learning_rate": 7.621445801220372e-06, + "loss": 0.0994, + "step": 6188 + }, + { + "epoch": 0.619, + "grad_norm": 1.0204079151153564, + "learning_rate": 7.6146654242141935e-06, + "loss": 0.0102, + "step": 6190 + }, + { + "epoch": 0.6192, + "grad_norm": 0.002034426899626851, + "learning_rate": 7.6078862097911075e-06, + "loss": 0.0036, + "step": 6192 + }, + { + "epoch": 0.6194, + "grad_norm": 0.016782769933342934, + "learning_rate": 7.6011081612552265e-06, + "loss": 0.0003, + "step": 6194 + }, + { + "epoch": 0.6196, + "grad_norm": 0.0009998665191233158, + "learning_rate": 7.594331281910082e-06, + "loss": 0.0001, + "step": 6196 + }, + { + "epoch": 0.6198, + "grad_norm": 0.004941287916153669, + "learning_rate": 7.58755557505865e-06, + "loss": 0.0009, + "step": 6198 + }, + { + "epoch": 0.62, + "grad_norm": 0.008052692748606205, + "learning_rate": 7.580781044003324e-06, + "loss": 0.095, + "step": 6200 + }, + { + "epoch": 0.6202, + "grad_norm": 0.0679377019405365, + "learning_rate": 7.574007692045928e-06, + "loss": 0.0011, + "step": 6202 + }, + { + "epoch": 0.6204, + "grad_norm": 1.5647019147872925, + "learning_rate": 7.5672355224877115e-06, + "loss": 0.0243, + "step": 6204 + }, + { + "epoch": 0.6206, + "grad_norm": 0.00719265453517437, + "learning_rate": 7.560464538629345e-06, + "loss": 0.0011, + "step": 6206 + }, + { + "epoch": 0.6208, + "grad_norm": 0.001768990303389728, + "learning_rate": 7.553694743770928e-06, + "loss": 0.0007, + "step": 6208 + }, + { + "epoch": 0.621, + "grad_norm": 0.6860306262969971, + "learning_rate": 7.546926141211975e-06, + "loss": 0.0134, + "step": 6210 + }, + { + "epoch": 0.6212, + "grad_norm": 0.034884221851825714, + "learning_rate": 7.54015873425142e-06, + "loss": 0.0008, + "step": 6212 + }, + { + "epoch": 0.6214, + "grad_norm": 0.14713671803474426, + "learning_rate": 7.533392526187617e-06, + "loss": 0.0039, + "step": 6214 + }, + { + "epoch": 0.6216, + "grad_norm": 0.007341773249208927, + "learning_rate": 7.526627520318329e-06, + "loss": 0.0014, + "step": 6216 + }, + { + "epoch": 0.6218, + "grad_norm": 0.011202127672731876, + "learning_rate": 7.519863719940748e-06, + "loss": 0.0004, + "step": 6218 + }, + { + "epoch": 0.622, + "grad_norm": 0.6739444732666016, + "learning_rate": 7.513101128351454e-06, + "loss": 0.0068, + "step": 6220 + }, + { + "epoch": 0.6222, + "grad_norm": 0.003230503061786294, + "learning_rate": 7.506339748846461e-06, + "loss": 0.0002, + "step": 6222 + }, + { + "epoch": 0.6224, + "grad_norm": 0.6761338114738464, + "learning_rate": 7.49957958472118e-06, + "loss": 0.1074, + "step": 6224 + }, + { + "epoch": 0.6226, + "grad_norm": 0.06029623746871948, + "learning_rate": 7.492820639270435e-06, + "loss": 0.0662, + "step": 6226 + }, + { + "epoch": 0.6228, + "grad_norm": 0.638403058052063, + "learning_rate": 7.486062915788453e-06, + "loss": 0.0099, + "step": 6228 + }, + { + "epoch": 0.623, + "grad_norm": 2.3008432388305664, + "learning_rate": 7.4793064175688635e-06, + "loss": 0.0373, + "step": 6230 + }, + { + "epoch": 0.6232, + "grad_norm": 0.014586096629500389, + "learning_rate": 7.472551147904708e-06, + "loss": 0.0003, + "step": 6232 + }, + { + "epoch": 0.6234, + "grad_norm": 0.005662992130964994, + "learning_rate": 7.465797110088417e-06, + "loss": 0.0001, + "step": 6234 + }, + { + "epoch": 0.6236, + "grad_norm": 0.9062532186508179, + "learning_rate": 7.4590443074118325e-06, + "loss": 0.0155, + "step": 6236 + }, + { + "epoch": 0.6238, + "grad_norm": 0.03111886978149414, + "learning_rate": 7.4522927431661805e-06, + "loss": 0.0006, + "step": 6238 + }, + { + "epoch": 0.624, + "grad_norm": 1.2385196685791016, + "learning_rate": 7.445542420642097e-06, + "loss": 0.0428, + "step": 6240 + }, + { + "epoch": 0.6242, + "grad_norm": 0.0061036753468215466, + "learning_rate": 7.438793343129605e-06, + "loss": 0.0004, + "step": 6242 + }, + { + "epoch": 0.6244, + "grad_norm": 0.022637875750660896, + "learning_rate": 7.432045513918122e-06, + "loss": 0.0005, + "step": 6244 + }, + { + "epoch": 0.6246, + "grad_norm": 0.0246578436344862, + "learning_rate": 7.4252989362964635e-06, + "loss": 0.0005, + "step": 6246 + }, + { + "epoch": 0.6248, + "grad_norm": 0.0047506652772426605, + "learning_rate": 7.418553613552824e-06, + "loss": 0.0014, + "step": 6248 + }, + { + "epoch": 0.625, + "grad_norm": 0.007656025234609842, + "learning_rate": 7.411809548974792e-06, + "loss": 0.0019, + "step": 6250 + }, + { + "epoch": 0.6252, + "grad_norm": 0.006880595814436674, + "learning_rate": 7.405066745849347e-06, + "loss": 0.0533, + "step": 6252 + }, + { + "epoch": 0.6254, + "grad_norm": 0.005934405606240034, + "learning_rate": 7.398325207462846e-06, + "loss": 0.0002, + "step": 6254 + }, + { + "epoch": 0.6256, + "grad_norm": 0.044368527829647064, + "learning_rate": 7.391584937101034e-06, + "loss": 0.0007, + "step": 6256 + }, + { + "epoch": 0.6258, + "grad_norm": 0.002319119405001402, + "learning_rate": 7.384845938049033e-06, + "loss": 0.0001, + "step": 6258 + }, + { + "epoch": 0.626, + "grad_norm": 0.004030182957649231, + "learning_rate": 7.378108213591355e-06, + "loss": 0.0008, + "step": 6260 + }, + { + "epoch": 0.6262, + "grad_norm": 1.1277096271514893, + "learning_rate": 7.37137176701188e-06, + "loss": 0.0134, + "step": 6262 + }, + { + "epoch": 0.6264, + "grad_norm": 0.052292127162218094, + "learning_rate": 7.364636601593875e-06, + "loss": 0.0006, + "step": 6264 + }, + { + "epoch": 0.6266, + "grad_norm": 2.0921053886413574, + "learning_rate": 7.357902720619976e-06, + "loss": 0.0319, + "step": 6266 + }, + { + "epoch": 0.6268, + "grad_norm": 0.0026325113140046597, + "learning_rate": 7.351170127372191e-06, + "loss": 0.003, + "step": 6268 + }, + { + "epoch": 0.627, + "grad_norm": 0.0010028316173702478, + "learning_rate": 7.344438825131912e-06, + "loss": 0.0005, + "step": 6270 + }, + { + "epoch": 0.6272, + "grad_norm": 0.004341122694313526, + "learning_rate": 7.33770881717989e-06, + "loss": 0.0001, + "step": 6272 + }, + { + "epoch": 0.6274, + "grad_norm": 0.0022686964366585016, + "learning_rate": 7.330980106796247e-06, + "loss": 0.0001, + "step": 6274 + }, + { + "epoch": 0.6276, + "grad_norm": 0.005014703143388033, + "learning_rate": 7.324252697260475e-06, + "loss": 0.0004, + "step": 6276 + }, + { + "epoch": 0.6278, + "grad_norm": 0.042383987456560135, + "learning_rate": 7.3175265918514335e-06, + "loss": 0.0005, + "step": 6278 + }, + { + "epoch": 0.628, + "grad_norm": 0.006554562132805586, + "learning_rate": 7.310801793847344e-06, + "loss": 0.0002, + "step": 6280 + }, + { + "epoch": 0.6282, + "grad_norm": 0.004061536397784948, + "learning_rate": 7.3040783065257906e-06, + "loss": 0.002, + "step": 6282 + }, + { + "epoch": 0.6284, + "grad_norm": 0.6545504331588745, + "learning_rate": 7.297356133163722e-06, + "loss": 0.0052, + "step": 6284 + }, + { + "epoch": 0.6286, + "grad_norm": 0.005832873750478029, + "learning_rate": 7.290635277037442e-06, + "loss": 0.0075, + "step": 6286 + }, + { + "epoch": 0.6288, + "grad_norm": 0.11077403277158737, + "learning_rate": 7.283915741422611e-06, + "loss": 0.0009, + "step": 6288 + }, + { + "epoch": 0.629, + "grad_norm": 0.001866760547272861, + "learning_rate": 7.277197529594257e-06, + "loss": 0.0004, + "step": 6290 + }, + { + "epoch": 0.6292, + "grad_norm": 0.005811986513435841, + "learning_rate": 7.27048064482675e-06, + "loss": 0.0001, + "step": 6292 + }, + { + "epoch": 0.6294, + "grad_norm": 0.014345050789415836, + "learning_rate": 7.263765090393817e-06, + "loss": 0.0007, + "step": 6294 + }, + { + "epoch": 0.6296, + "grad_norm": 0.002272570738568902, + "learning_rate": 7.257050869568536e-06, + "loss": 0.0005, + "step": 6296 + }, + { + "epoch": 0.6298, + "grad_norm": 0.009031481109559536, + "learning_rate": 7.250337985623342e-06, + "loss": 0.0044, + "step": 6298 + }, + { + "epoch": 0.63, + "grad_norm": 0.10438639670610428, + "learning_rate": 7.243626441830009e-06, + "loss": 0.0011, + "step": 6300 + }, + { + "epoch": 0.6302, + "grad_norm": 0.057148583233356476, + "learning_rate": 7.236916241459664e-06, + "loss": 0.0044, + "step": 6302 + }, + { + "epoch": 0.6304, + "grad_norm": 0.007957752794027328, + "learning_rate": 7.2302073877827775e-06, + "loss": 0.001, + "step": 6304 + }, + { + "epoch": 0.6306, + "grad_norm": 0.1753242164850235, + "learning_rate": 7.22349988406916e-06, + "loss": 0.0077, + "step": 6306 + }, + { + "epoch": 0.6308, + "grad_norm": 0.005395147018134594, + "learning_rate": 7.216793733587976e-06, + "loss": 0.0005, + "step": 6308 + }, + { + "epoch": 0.631, + "grad_norm": 0.482757568359375, + "learning_rate": 7.210088939607709e-06, + "loss": 0.0038, + "step": 6310 + }, + { + "epoch": 0.6312, + "grad_norm": 1.2518128156661987, + "learning_rate": 7.203385505396203e-06, + "loss": 0.0163, + "step": 6312 + }, + { + "epoch": 0.6314, + "grad_norm": 0.00978503841906786, + "learning_rate": 7.196683434220626e-06, + "loss": 0.0002, + "step": 6314 + }, + { + "epoch": 0.6316, + "grad_norm": 0.0004219703550916165, + "learning_rate": 7.189982729347491e-06, + "loss": 0.0215, + "step": 6316 + }, + { + "epoch": 0.6318, + "grad_norm": 0.014390732161700726, + "learning_rate": 7.1832833940426346e-06, + "loss": 0.0002, + "step": 6318 + }, + { + "epoch": 0.632, + "grad_norm": 0.001680375193245709, + "learning_rate": 7.176585431571235e-06, + "loss": 0.0135, + "step": 6320 + }, + { + "epoch": 0.6322, + "grad_norm": 0.004239986184984446, + "learning_rate": 7.169888845197798e-06, + "loss": 0.0004, + "step": 6322 + }, + { + "epoch": 0.6324, + "grad_norm": 0.9539825320243835, + "learning_rate": 7.163193638186159e-06, + "loss": 0.011, + "step": 6324 + }, + { + "epoch": 0.6326, + "grad_norm": 0.007187189534306526, + "learning_rate": 7.156499813799477e-06, + "loss": 0.0001, + "step": 6326 + }, + { + "epoch": 0.6328, + "grad_norm": 0.01990172639489174, + "learning_rate": 7.149807375300239e-06, + "loss": 0.0003, + "step": 6328 + }, + { + "epoch": 0.633, + "grad_norm": 0.005959507077932358, + "learning_rate": 7.143116325950266e-06, + "loss": 0.0346, + "step": 6330 + }, + { + "epoch": 0.6332, + "grad_norm": 2.286057472229004, + "learning_rate": 7.13642666901069e-06, + "loss": 0.0123, + "step": 6332 + }, + { + "epoch": 0.6334, + "grad_norm": 0.00047170661855489016, + "learning_rate": 7.129738407741964e-06, + "loss": 0.0008, + "step": 6334 + }, + { + "epoch": 0.6336, + "grad_norm": 0.02118528261780739, + "learning_rate": 7.123051545403874e-06, + "loss": 0.0007, + "step": 6336 + }, + { + "epoch": 0.6338, + "grad_norm": 0.03379423916339874, + "learning_rate": 7.116366085255511e-06, + "loss": 0.834, + "step": 6338 + }, + { + "epoch": 0.634, + "grad_norm": 0.0007208199822343886, + "learning_rate": 7.109682030555283e-06, + "loss": 0.0005, + "step": 6340 + }, + { + "epoch": 0.6342, + "grad_norm": 0.005072758998721838, + "learning_rate": 7.102999384560927e-06, + "loss": 0.0017, + "step": 6342 + }, + { + "epoch": 0.6344, + "grad_norm": 0.00040756474481895566, + "learning_rate": 7.096318150529476e-06, + "loss": 0.0013, + "step": 6344 + }, + { + "epoch": 0.6346, + "grad_norm": 0.014627089723944664, + "learning_rate": 7.0896383317172845e-06, + "loss": 0.0005, + "step": 6346 + }, + { + "epoch": 0.6348, + "grad_norm": 0.010936909355223179, + "learning_rate": 7.082959931380011e-06, + "loss": 0.0001, + "step": 6348 + }, + { + "epoch": 0.635, + "grad_norm": 16.616674423217773, + "learning_rate": 7.076282952772634e-06, + "loss": 0.1431, + "step": 6350 + }, + { + "epoch": 0.6352, + "grad_norm": 0.014766808599233627, + "learning_rate": 7.069607399149427e-06, + "loss": 0.0013, + "step": 6352 + }, + { + "epoch": 0.6354, + "grad_norm": 1.3039382696151733, + "learning_rate": 7.062933273763974e-06, + "loss": 0.0221, + "step": 6354 + }, + { + "epoch": 0.6356, + "grad_norm": 0.7807362675666809, + "learning_rate": 7.056260579869165e-06, + "loss": 0.0101, + "step": 6356 + }, + { + "epoch": 0.6358, + "grad_norm": 0.017717476934194565, + "learning_rate": 7.049589320717186e-06, + "loss": 0.0256, + "step": 6358 + }, + { + "epoch": 0.636, + "grad_norm": 0.06263952702283859, + "learning_rate": 7.042919499559538e-06, + "loss": 0.003, + "step": 6360 + }, + { + "epoch": 0.6362, + "grad_norm": 0.005028498359024525, + "learning_rate": 7.036251119646993e-06, + "loss": 0.0013, + "step": 6362 + }, + { + "epoch": 0.6364, + "grad_norm": 0.005652015097439289, + "learning_rate": 7.029584184229653e-06, + "loss": 0.0013, + "step": 6364 + }, + { + "epoch": 0.6366, + "grad_norm": 0.02517712488770485, + "learning_rate": 7.022918696556896e-06, + "loss": 0.0005, + "step": 6366 + }, + { + "epoch": 0.6368, + "grad_norm": 0.004406098276376724, + "learning_rate": 7.016254659877398e-06, + "loss": 0.0001, + "step": 6368 + }, + { + "epoch": 0.637, + "grad_norm": 0.028375104069709778, + "learning_rate": 7.009592077439135e-06, + "loss": 0.0011, + "step": 6370 + }, + { + "epoch": 0.6372, + "grad_norm": 0.052013978362083435, + "learning_rate": 7.002930952489362e-06, + "loss": 0.0008, + "step": 6372 + }, + { + "epoch": 0.6374, + "grad_norm": 3.7301740646362305, + "learning_rate": 6.996271288274636e-06, + "loss": 0.0821, + "step": 6374 + }, + { + "epoch": 0.6376, + "grad_norm": 0.005173301324248314, + "learning_rate": 6.9896130880407965e-06, + "loss": 0.0002, + "step": 6376 + }, + { + "epoch": 0.6378, + "grad_norm": 0.10514716058969498, + "learning_rate": 6.982956355032968e-06, + "loss": 0.0006, + "step": 6378 + }, + { + "epoch": 0.638, + "grad_norm": 0.0025836534332484007, + "learning_rate": 6.976301092495556e-06, + "loss": 0.0001, + "step": 6380 + }, + { + "epoch": 0.6382, + "grad_norm": 0.006139283999800682, + "learning_rate": 6.969647303672262e-06, + "loss": 0.0011, + "step": 6382 + }, + { + "epoch": 0.6384, + "grad_norm": 0.05269907787442207, + "learning_rate": 6.962994991806059e-06, + "loss": 0.0009, + "step": 6384 + }, + { + "epoch": 0.6386, + "grad_norm": 0.03301318734884262, + "learning_rate": 6.956344160139201e-06, + "loss": 0.0032, + "step": 6386 + }, + { + "epoch": 0.6388, + "grad_norm": 0.0171293243765831, + "learning_rate": 6.949694811913226e-06, + "loss": 0.0004, + "step": 6388 + }, + { + "epoch": 0.639, + "grad_norm": 1.36240816116333, + "learning_rate": 6.943046950368944e-06, + "loss": 0.0196, + "step": 6390 + }, + { + "epoch": 0.6392, + "grad_norm": 0.09799555689096451, + "learning_rate": 6.9364005787464406e-06, + "loss": 0.0014, + "step": 6392 + }, + { + "epoch": 0.6394, + "grad_norm": 0.020819375291466713, + "learning_rate": 6.929755700285082e-06, + "loss": 0.0129, + "step": 6394 + }, + { + "epoch": 0.6396, + "grad_norm": 2.846161127090454, + "learning_rate": 6.923112318223497e-06, + "loss": 0.0413, + "step": 6396 + }, + { + "epoch": 0.6398, + "grad_norm": 0.224862203001976, + "learning_rate": 6.9164704357995874e-06, + "loss": 0.0035, + "step": 6398 + }, + { + "epoch": 0.64, + "grad_norm": 0.013649526052176952, + "learning_rate": 6.909830056250527e-06, + "loss": 0.0006, + "step": 6400 + }, + { + "epoch": 0.6402, + "grad_norm": 0.02209734171628952, + "learning_rate": 6.903191182812759e-06, + "loss": 0.1172, + "step": 6402 + }, + { + "epoch": 0.6404, + "grad_norm": 0.001727678463794291, + "learning_rate": 6.896553818721989e-06, + "loss": 0.0001, + "step": 6404 + }, + { + "epoch": 0.6406, + "grad_norm": 0.010168460197746754, + "learning_rate": 6.889917967213184e-06, + "loss": 0.0002, + "step": 6406 + }, + { + "epoch": 0.6408, + "grad_norm": 0.04813385009765625, + "learning_rate": 6.883283631520582e-06, + "loss": 0.0006, + "step": 6408 + }, + { + "epoch": 0.641, + "grad_norm": 0.011933675035834312, + "learning_rate": 6.876650814877675e-06, + "loss": 0.0129, + "step": 6410 + }, + { + "epoch": 0.6412, + "grad_norm": 0.008830716833472252, + "learning_rate": 6.870019520517217e-06, + "loss": 0.0002, + "step": 6412 + }, + { + "epoch": 0.6414, + "grad_norm": 0.014586499892175198, + "learning_rate": 6.863389751671225e-06, + "loss": 0.0004, + "step": 6414 + }, + { + "epoch": 0.6416, + "grad_norm": 0.7471632361412048, + "learning_rate": 6.856761511570963e-06, + "loss": 0.0313, + "step": 6416 + }, + { + "epoch": 0.6418, + "grad_norm": 0.12464461475610733, + "learning_rate": 6.850134803446955e-06, + "loss": 0.0022, + "step": 6418 + }, + { + "epoch": 0.642, + "grad_norm": 8.677934646606445, + "learning_rate": 6.843509630528977e-06, + "loss": 0.1895, + "step": 6420 + }, + { + "epoch": 0.6422, + "grad_norm": 0.0017394011374562979, + "learning_rate": 6.836885996046061e-06, + "loss": 0.0003, + "step": 6422 + }, + { + "epoch": 0.6424, + "grad_norm": 0.0029692547395825386, + "learning_rate": 6.830263903226483e-06, + "loss": 0.0039, + "step": 6424 + }, + { + "epoch": 0.6426, + "grad_norm": 0.03625660389661789, + "learning_rate": 6.823643355297774e-06, + "loss": 0.0013, + "step": 6426 + }, + { + "epoch": 0.6428, + "grad_norm": 4.445709228515625, + "learning_rate": 6.8170243554867065e-06, + "loss": 0.0333, + "step": 6428 + }, + { + "epoch": 0.643, + "grad_norm": 0.009923825971782207, + "learning_rate": 6.8104069070193e-06, + "loss": 0.0002, + "step": 6430 + }, + { + "epoch": 0.6432, + "grad_norm": 0.009838100522756577, + "learning_rate": 6.803791013120822e-06, + "loss": 0.0009, + "step": 6432 + }, + { + "epoch": 0.6434, + "grad_norm": 0.0023253029212355614, + "learning_rate": 6.797176677015775e-06, + "loss": 0.0001, + "step": 6434 + }, + { + "epoch": 0.6436, + "grad_norm": 0.05718209967017174, + "learning_rate": 6.790563901927907e-06, + "loss": 0.0269, + "step": 6436 + }, + { + "epoch": 0.6438, + "grad_norm": 0.0016301325522363186, + "learning_rate": 6.783952691080203e-06, + "loss": 0.0002, + "step": 6438 + }, + { + "epoch": 0.644, + "grad_norm": 0.005922373849898577, + "learning_rate": 6.777343047694891e-06, + "loss": 0.0002, + "step": 6440 + }, + { + "epoch": 0.6442, + "grad_norm": 0.004041416570544243, + "learning_rate": 6.770734974993427e-06, + "loss": 0.002, + "step": 6442 + }, + { + "epoch": 0.6444, + "grad_norm": 0.00330925639718771, + "learning_rate": 6.764128476196505e-06, + "loss": 0.0003, + "step": 6444 + }, + { + "epoch": 0.6446, + "grad_norm": 0.006998240482062101, + "learning_rate": 6.757523554524056e-06, + "loss": 0.0004, + "step": 6446 + }, + { + "epoch": 0.6448, + "grad_norm": 0.3217023015022278, + "learning_rate": 6.750920213195238e-06, + "loss": 0.0056, + "step": 6448 + }, + { + "epoch": 0.645, + "grad_norm": 0.010212093591690063, + "learning_rate": 6.744318455428436e-06, + "loss": 0.0026, + "step": 6450 + }, + { + "epoch": 0.6452, + "grad_norm": 0.23843611776828766, + "learning_rate": 6.737718284441267e-06, + "loss": 0.0048, + "step": 6452 + }, + { + "epoch": 0.6454, + "grad_norm": 0.05035796016454697, + "learning_rate": 6.731119703450577e-06, + "loss": 0.0065, + "step": 6454 + }, + { + "epoch": 0.6456, + "grad_norm": 0.025010589510202408, + "learning_rate": 6.7245227156724324e-06, + "loss": 0.0003, + "step": 6456 + }, + { + "epoch": 0.6458, + "grad_norm": 0.009906667284667492, + "learning_rate": 6.717927324322124e-06, + "loss": 0.1335, + "step": 6458 + }, + { + "epoch": 0.646, + "grad_norm": 0.14416621625423431, + "learning_rate": 6.711333532614168e-06, + "loss": 0.0015, + "step": 6460 + }, + { + "epoch": 0.6462, + "grad_norm": 0.1086784228682518, + "learning_rate": 6.704741343762296e-06, + "loss": 0.0019, + "step": 6462 + }, + { + "epoch": 0.6464, + "grad_norm": 0.06236943230032921, + "learning_rate": 6.698150760979463e-06, + "loss": 0.0059, + "step": 6464 + }, + { + "epoch": 0.6466, + "grad_norm": 1.360823154449463, + "learning_rate": 6.69156178747784e-06, + "loss": 0.0051, + "step": 6466 + }, + { + "epoch": 0.6468, + "grad_norm": 0.1133948490023613, + "learning_rate": 6.684974426468809e-06, + "loss": 0.0014, + "step": 6468 + }, + { + "epoch": 0.647, + "grad_norm": 0.19379879534244537, + "learning_rate": 6.67838868116297e-06, + "loss": 0.002, + "step": 6470 + }, + { + "epoch": 0.6472, + "grad_norm": 0.05388861149549484, + "learning_rate": 6.671804554770135e-06, + "loss": 0.0012, + "step": 6472 + }, + { + "epoch": 0.6474, + "grad_norm": 0.005608732812106609, + "learning_rate": 6.6652220504993305e-06, + "loss": 0.0002, + "step": 6474 + }, + { + "epoch": 0.6476, + "grad_norm": 1.3931831121444702, + "learning_rate": 6.658641171558785e-06, + "loss": 0.012, + "step": 6476 + }, + { + "epoch": 0.6478, + "grad_norm": 0.00939975492656231, + "learning_rate": 6.6520619211559435e-06, + "loss": 0.0003, + "step": 6478 + }, + { + "epoch": 0.648, + "grad_norm": 0.005695897154510021, + "learning_rate": 6.645484302497452e-06, + "loss": 0.0002, + "step": 6480 + }, + { + "epoch": 0.6482, + "grad_norm": 4.1043806076049805, + "learning_rate": 6.638908318789156e-06, + "loss": 0.0585, + "step": 6482 + }, + { + "epoch": 0.6484, + "grad_norm": 0.009288030676543713, + "learning_rate": 6.63233397323612e-06, + "loss": 0.0084, + "step": 6484 + }, + { + "epoch": 0.6486, + "grad_norm": 0.006982842460274696, + "learning_rate": 6.62576126904259e-06, + "loss": 0.0004, + "step": 6486 + }, + { + "epoch": 0.6488, + "grad_norm": 0.08902841061353683, + "learning_rate": 6.6191902094120295e-06, + "loss": 0.0011, + "step": 6488 + }, + { + "epoch": 0.649, + "grad_norm": 0.362720787525177, + "learning_rate": 6.612620797547087e-06, + "loss": 0.0029, + "step": 6490 + }, + { + "epoch": 0.6492, + "grad_norm": 0.007656720466911793, + "learning_rate": 6.60605303664962e-06, + "loss": 0.0002, + "step": 6492 + }, + { + "epoch": 0.6494, + "grad_norm": 0.003788657719269395, + "learning_rate": 6.5994869299206736e-06, + "loss": 0.0008, + "step": 6494 + }, + { + "epoch": 0.6496, + "grad_norm": 0.002388895256444812, + "learning_rate": 6.5929224805604845e-06, + "loss": 0.0003, + "step": 6496 + }, + { + "epoch": 0.6498, + "grad_norm": 0.0037714431528002024, + "learning_rate": 6.58635969176849e-06, + "loss": 0.1691, + "step": 6498 + }, + { + "epoch": 0.65, + "grad_norm": 0.0007431897684000432, + "learning_rate": 6.579798566743314e-06, + "loss": 0.0, + "step": 6500 + }, + { + "epoch": 0.6502, + "grad_norm": 0.0006169555126689374, + "learning_rate": 6.573239108682769e-06, + "loss": 0.0071, + "step": 6502 + }, + { + "epoch": 0.6504, + "grad_norm": 0.041306983679533005, + "learning_rate": 6.566681320783849e-06, + "loss": 0.0007, + "step": 6504 + }, + { + "epoch": 0.6506, + "grad_norm": 1.168824553489685, + "learning_rate": 6.560125206242746e-06, + "loss": 0.1079, + "step": 6506 + }, + { + "epoch": 0.6508, + "grad_norm": 0.006744849029928446, + "learning_rate": 6.553570768254831e-06, + "loss": 0.0223, + "step": 6508 + }, + { + "epoch": 0.651, + "grad_norm": 0.005264751147478819, + "learning_rate": 6.547018010014654e-06, + "loss": 0.0001, + "step": 6510 + }, + { + "epoch": 0.6512, + "grad_norm": 0.054750990122556686, + "learning_rate": 6.540466934715953e-06, + "loss": 0.0019, + "step": 6512 + }, + { + "epoch": 0.6514, + "grad_norm": 0.059411171823740005, + "learning_rate": 6.53391754555164e-06, + "loss": 0.001, + "step": 6514 + }, + { + "epoch": 0.6516, + "grad_norm": 0.009196651168167591, + "learning_rate": 6.52736984571381e-06, + "loss": 0.0013, + "step": 6516 + }, + { + "epoch": 0.6518, + "grad_norm": 0.000669675471726805, + "learning_rate": 6.520823838393732e-06, + "loss": 0.0001, + "step": 6518 + }, + { + "epoch": 0.652, + "grad_norm": 4.0156965255737305, + "learning_rate": 6.5142795267818505e-06, + "loss": 0.0741, + "step": 6520 + }, + { + "epoch": 0.6522, + "grad_norm": 0.05136300250887871, + "learning_rate": 6.5077369140677815e-06, + "loss": 0.1301, + "step": 6522 + }, + { + "epoch": 0.6524, + "grad_norm": 0.01594615913927555, + "learning_rate": 6.501196003440313e-06, + "loss": 0.0007, + "step": 6524 + }, + { + "epoch": 0.6526, + "grad_norm": 0.1641697734594345, + "learning_rate": 6.494656798087412e-06, + "loss": 0.0019, + "step": 6526 + }, + { + "epoch": 0.6528, + "grad_norm": 1.9943448305130005, + "learning_rate": 6.488119301196201e-06, + "loss": 0.016, + "step": 6528 + }, + { + "epoch": 0.653, + "grad_norm": 0.24016273021697998, + "learning_rate": 6.481583515952983e-06, + "loss": 0.0037, + "step": 6530 + }, + { + "epoch": 0.6532, + "grad_norm": 0.052789706736803055, + "learning_rate": 6.475049445543215e-06, + "loss": 0.001, + "step": 6532 + }, + { + "epoch": 0.6534, + "grad_norm": 0.16242332756519318, + "learning_rate": 6.468517093151525e-06, + "loss": 0.0026, + "step": 6534 + }, + { + "epoch": 0.6536, + "grad_norm": 0.0051582385785877705, + "learning_rate": 6.461986461961706e-06, + "loss": 0.0002, + "step": 6536 + }, + { + "epoch": 0.6538, + "grad_norm": 0.02050444483757019, + "learning_rate": 6.455457555156706e-06, + "loss": 0.0005, + "step": 6538 + }, + { + "epoch": 0.654, + "grad_norm": 0.014511162415146828, + "learning_rate": 6.448930375918632e-06, + "loss": 0.0032, + "step": 6540 + }, + { + "epoch": 0.6542, + "grad_norm": 0.0059592886827886105, + "learning_rate": 6.442404927428751e-06, + "loss": 0.0006, + "step": 6542 + }, + { + "epoch": 0.6544, + "grad_norm": 0.121202751994133, + "learning_rate": 6.435881212867494e-06, + "loss": 0.0082, + "step": 6544 + }, + { + "epoch": 0.6546, + "grad_norm": 0.7882412672042847, + "learning_rate": 6.4293592354144365e-06, + "loss": 0.0051, + "step": 6546 + }, + { + "epoch": 0.6548, + "grad_norm": 0.005671194288879633, + "learning_rate": 6.422838998248308e-06, + "loss": 0.0006, + "step": 6548 + }, + { + "epoch": 0.655, + "grad_norm": 1.438106656074524, + "learning_rate": 6.4163205045469975e-06, + "loss": 0.0627, + "step": 6550 + }, + { + "epoch": 0.6552, + "grad_norm": 0.0020483327098190784, + "learning_rate": 6.409803757487539e-06, + "loss": 0.0008, + "step": 6552 + }, + { + "epoch": 0.6554, + "grad_norm": 0.47635334730148315, + "learning_rate": 6.403288760246112e-06, + "loss": 0.0166, + "step": 6554 + }, + { + "epoch": 0.6556, + "grad_norm": 0.3435601592063904, + "learning_rate": 6.396775515998055e-06, + "loss": 0.0047, + "step": 6556 + }, + { + "epoch": 0.6558, + "grad_norm": 0.003235279116779566, + "learning_rate": 6.390264027917836e-06, + "loss": 0.0001, + "step": 6558 + }, + { + "epoch": 0.656, + "grad_norm": 0.015407725237309933, + "learning_rate": 6.383754299179079e-06, + "loss": 0.0006, + "step": 6560 + }, + { + "epoch": 0.6562, + "grad_norm": 0.006486639380455017, + "learning_rate": 6.377246332954544e-06, + "loss": 0.0001, + "step": 6562 + }, + { + "epoch": 0.6564, + "grad_norm": 4.17015266418457, + "learning_rate": 6.370740132416138e-06, + "loss": 0.1133, + "step": 6564 + }, + { + "epoch": 0.6566, + "grad_norm": 0.002487716032192111, + "learning_rate": 6.364235700734903e-06, + "loss": 0.001, + "step": 6566 + }, + { + "epoch": 0.6568, + "grad_norm": 0.007318207528442144, + "learning_rate": 6.357733041081018e-06, + "loss": 0.001, + "step": 6568 + }, + { + "epoch": 0.657, + "grad_norm": 0.4330567419528961, + "learning_rate": 6.351232156623803e-06, + "loss": 0.0073, + "step": 6570 + }, + { + "epoch": 0.6572, + "grad_norm": 0.040842488408088684, + "learning_rate": 6.344733050531713e-06, + "loss": 0.0011, + "step": 6572 + }, + { + "epoch": 0.6574, + "grad_norm": 0.03309721499681473, + "learning_rate": 6.338235725972326e-06, + "loss": 0.0008, + "step": 6574 + }, + { + "epoch": 0.6576, + "grad_norm": 0.008517332375049591, + "learning_rate": 6.33174018611236e-06, + "loss": 0.0005, + "step": 6576 + }, + { + "epoch": 0.6578, + "grad_norm": 1.2280744314193726, + "learning_rate": 6.325246434117669e-06, + "loss": 0.0215, + "step": 6578 + }, + { + "epoch": 0.658, + "grad_norm": 0.007296715397387743, + "learning_rate": 6.318754473153221e-06, + "loss": 0.0004, + "step": 6580 + }, + { + "epoch": 0.6582, + "grad_norm": 0.006428585387766361, + "learning_rate": 6.3122643063831245e-06, + "loss": 0.0001, + "step": 6582 + }, + { + "epoch": 0.6584, + "grad_norm": 0.09260617941617966, + "learning_rate": 6.305775936970606e-06, + "loss": 0.0016, + "step": 6584 + }, + { + "epoch": 0.6586, + "grad_norm": 0.008841943927109241, + "learning_rate": 6.299289368078016e-06, + "loss": 0.0006, + "step": 6586 + }, + { + "epoch": 0.6588, + "grad_norm": 0.015550847165286541, + "learning_rate": 6.292804602866833e-06, + "loss": 0.0017, + "step": 6588 + }, + { + "epoch": 0.659, + "grad_norm": 0.007539430633187294, + "learning_rate": 6.286321644497655e-06, + "loss": 0.0002, + "step": 6590 + }, + { + "epoch": 0.6592, + "grad_norm": 0.7977985143661499, + "learning_rate": 6.27984049613019e-06, + "loss": 0.0063, + "step": 6592 + }, + { + "epoch": 0.6594, + "grad_norm": 0.001742080319672823, + "learning_rate": 6.273361160923271e-06, + "loss": 0.0005, + "step": 6594 + }, + { + "epoch": 0.6596, + "grad_norm": 0.0008708966779522598, + "learning_rate": 6.2668836420348535e-06, + "loss": 0.0004, + "step": 6596 + }, + { + "epoch": 0.6598, + "grad_norm": 0.04841049760580063, + "learning_rate": 6.260407942621998e-06, + "loss": 0.0011, + "step": 6598 + }, + { + "epoch": 0.66, + "grad_norm": 0.01063513569533825, + "learning_rate": 6.25393406584088e-06, + "loss": 0.0003, + "step": 6600 + }, + { + "epoch": 0.6602, + "grad_norm": 0.012021947652101517, + "learning_rate": 6.247462014846793e-06, + "loss": 0.0096, + "step": 6602 + }, + { + "epoch": 0.6604, + "grad_norm": 0.02140221744775772, + "learning_rate": 6.240991792794133e-06, + "loss": 0.0002, + "step": 6604 + }, + { + "epoch": 0.6606, + "grad_norm": 0.0017848938005045056, + "learning_rate": 6.234523402836408e-06, + "loss": 0.0001, + "step": 6606 + }, + { + "epoch": 0.6608, + "grad_norm": 0.41270384192466736, + "learning_rate": 6.228056848126236e-06, + "loss": 0.0084, + "step": 6608 + }, + { + "epoch": 0.661, + "grad_norm": 0.010343868285417557, + "learning_rate": 6.22159213181533e-06, + "loss": 0.0002, + "step": 6610 + }, + { + "epoch": 0.6612, + "grad_norm": 0.0011769182747229934, + "learning_rate": 6.2151292570545215e-06, + "loss": 0.0099, + "step": 6612 + }, + { + "epoch": 0.6614, + "grad_norm": 0.762412965297699, + "learning_rate": 6.208668226993731e-06, + "loss": 0.0127, + "step": 6614 + }, + { + "epoch": 0.6616, + "grad_norm": 0.011144968681037426, + "learning_rate": 6.202209044781991e-06, + "loss": 0.0003, + "step": 6616 + }, + { + "epoch": 0.6618, + "grad_norm": 8.074616432189941, + "learning_rate": 6.195751713567426e-06, + "loss": 0.2366, + "step": 6618 + }, + { + "epoch": 0.662, + "grad_norm": 0.013439065776765347, + "learning_rate": 6.18929623649726e-06, + "loss": 0.0002, + "step": 6620 + }, + { + "epoch": 0.6622, + "grad_norm": 0.008270317688584328, + "learning_rate": 6.182842616717817e-06, + "loss": 0.0002, + "step": 6622 + }, + { + "epoch": 0.6624, + "grad_norm": 0.00484053511172533, + "learning_rate": 6.176390857374508e-06, + "loss": 0.0001, + "step": 6624 + }, + { + "epoch": 0.6626, + "grad_norm": 0.25695109367370605, + "learning_rate": 6.169940961611853e-06, + "loss": 0.0026, + "step": 6626 + }, + { + "epoch": 0.6628, + "grad_norm": 0.002077881945297122, + "learning_rate": 6.1634929325734385e-06, + "loss": 0.0002, + "step": 6628 + }, + { + "epoch": 0.663, + "grad_norm": 0.07485402375459671, + "learning_rate": 6.157046773401964e-06, + "loss": 0.001, + "step": 6630 + }, + { + "epoch": 0.6632, + "grad_norm": 2.842996597290039, + "learning_rate": 6.150602487239207e-06, + "loss": 0.0749, + "step": 6632 + }, + { + "epoch": 0.6634, + "grad_norm": 0.001117366598919034, + "learning_rate": 6.144160077226035e-06, + "loss": 0.0004, + "step": 6634 + }, + { + "epoch": 0.6636, + "grad_norm": 12.996416091918945, + "learning_rate": 6.137719546502401e-06, + "loss": 0.1527, + "step": 6636 + }, + { + "epoch": 0.6638, + "grad_norm": 0.23966820538043976, + "learning_rate": 6.131280898207339e-06, + "loss": 0.0309, + "step": 6638 + }, + { + "epoch": 0.664, + "grad_norm": 0.06594846397638321, + "learning_rate": 6.124844135478971e-06, + "loss": 0.0053, + "step": 6640 + }, + { + "epoch": 0.6642, + "grad_norm": 0.03997421637177467, + "learning_rate": 6.118409261454494e-06, + "loss": 0.001, + "step": 6642 + }, + { + "epoch": 0.6644, + "grad_norm": 0.0023589867632836103, + "learning_rate": 6.1119762792701935e-06, + "loss": 0.0023, + "step": 6644 + }, + { + "epoch": 0.6646, + "grad_norm": 3.8594307899475098, + "learning_rate": 6.1055451920614165e-06, + "loss": 0.3075, + "step": 6646 + }, + { + "epoch": 0.6648, + "grad_norm": 0.002702753758057952, + "learning_rate": 6.099116002962604e-06, + "loss": 0.0056, + "step": 6648 + }, + { + "epoch": 0.665, + "grad_norm": 3.041414737701416, + "learning_rate": 6.092688715107265e-06, + "loss": 0.0733, + "step": 6650 + }, + { + "epoch": 0.6652, + "grad_norm": 0.01191772986203432, + "learning_rate": 6.086263331627976e-06, + "loss": 0.0072, + "step": 6652 + }, + { + "epoch": 0.6654, + "grad_norm": 0.5947399139404297, + "learning_rate": 6.079839855656397e-06, + "loss": 0.0081, + "step": 6654 + }, + { + "epoch": 0.6656, + "grad_norm": 1.1530510187149048, + "learning_rate": 6.073418290323251e-06, + "loss": 0.0396, + "step": 6656 + }, + { + "epoch": 0.6658, + "grad_norm": 0.02616964466869831, + "learning_rate": 6.066998638758326e-06, + "loss": 0.0004, + "step": 6658 + }, + { + "epoch": 0.666, + "grad_norm": 0.007014554925262928, + "learning_rate": 6.06058090409049e-06, + "loss": 0.0007, + "step": 6660 + }, + { + "epoch": 0.6662, + "grad_norm": 0.6230804324150085, + "learning_rate": 6.054165089447663e-06, + "loss": 0.0108, + "step": 6662 + }, + { + "epoch": 0.6664, + "grad_norm": 0.011729425750672817, + "learning_rate": 6.047751197956838e-06, + "loss": 0.0005, + "step": 6664 + }, + { + "epoch": 0.6666, + "grad_norm": 0.0330856591463089, + "learning_rate": 6.0413392327440635e-06, + "loss": 0.0067, + "step": 6666 + }, + { + "epoch": 0.6668, + "grad_norm": 0.009387311525642872, + "learning_rate": 6.0349291969344595e-06, + "loss": 0.0003, + "step": 6668 + }, + { + "epoch": 0.667, + "grad_norm": 0.15354883670806885, + "learning_rate": 6.028521093652195e-06, + "loss": 0.0033, + "step": 6670 + }, + { + "epoch": 0.6672, + "grad_norm": 0.16636072099208832, + "learning_rate": 6.022114926020504e-06, + "loss": 0.0029, + "step": 6672 + }, + { + "epoch": 0.6674, + "grad_norm": 0.004837170243263245, + "learning_rate": 6.015710697161674e-06, + "loss": 0.0006, + "step": 6674 + }, + { + "epoch": 0.6676, + "grad_norm": 0.17175152897834778, + "learning_rate": 6.009308410197048e-06, + "loss": 0.0021, + "step": 6676 + }, + { + "epoch": 0.6678, + "grad_norm": 2.6148271560668945, + "learning_rate": 6.002908068247024e-06, + "loss": 0.0568, + "step": 6678 + }, + { + "epoch": 0.668, + "grad_norm": 0.19363540410995483, + "learning_rate": 5.996509674431053e-06, + "loss": 0.0024, + "step": 6680 + }, + { + "epoch": 0.6682, + "grad_norm": 0.011035493575036526, + "learning_rate": 5.990113231867629e-06, + "loss": 0.0002, + "step": 6682 + }, + { + "epoch": 0.6684, + "grad_norm": 0.08786016702651978, + "learning_rate": 5.983718743674302e-06, + "loss": 0.0022, + "step": 6684 + }, + { + "epoch": 0.6686, + "grad_norm": 0.053424444049596786, + "learning_rate": 5.977326212967671e-06, + "loss": 0.0011, + "step": 6686 + }, + { + "epoch": 0.6688, + "grad_norm": 0.13360507786273956, + "learning_rate": 5.970935642863375e-06, + "loss": 0.0011, + "step": 6688 + }, + { + "epoch": 0.669, + "grad_norm": 0.007187319453805685, + "learning_rate": 5.9645470364761e-06, + "loss": 0.0005, + "step": 6690 + }, + { + "epoch": 0.6692, + "grad_norm": 0.0249757282435894, + "learning_rate": 5.958160396919577e-06, + "loss": 0.1853, + "step": 6692 + }, + { + "epoch": 0.6694, + "grad_norm": 0.0016929536359384656, + "learning_rate": 5.951775727306577e-06, + "loss": 0.0023, + "step": 6694 + }, + { + "epoch": 0.6696, + "grad_norm": 0.008336673490703106, + "learning_rate": 5.94539303074891e-06, + "loss": 0.0008, + "step": 6696 + }, + { + "epoch": 0.6698, + "grad_norm": 0.025992054492235184, + "learning_rate": 5.939012310357422e-06, + "loss": 0.0026, + "step": 6698 + }, + { + "epoch": 0.67, + "grad_norm": 0.010268126614391804, + "learning_rate": 5.932633569242e-06, + "loss": 0.0003, + "step": 6700 + }, + { + "epoch": 0.6702, + "grad_norm": 0.016421498730778694, + "learning_rate": 5.926256810511566e-06, + "loss": 0.0004, + "step": 6702 + }, + { + "epoch": 0.6704, + "grad_norm": 0.0041176024824380875, + "learning_rate": 5.9198820372740726e-06, + "loss": 0.0019, + "step": 6704 + }, + { + "epoch": 0.6706, + "grad_norm": 0.0056220171973109245, + "learning_rate": 5.913509252636511e-06, + "loss": 0.0153, + "step": 6706 + }, + { + "epoch": 0.6708, + "grad_norm": 0.003321915864944458, + "learning_rate": 5.907138459704895e-06, + "loss": 0.0001, + "step": 6708 + }, + { + "epoch": 0.671, + "grad_norm": 0.03227420151233673, + "learning_rate": 5.900769661584273e-06, + "loss": 0.0006, + "step": 6710 + }, + { + "epoch": 0.6712, + "grad_norm": 1.243624210357666, + "learning_rate": 5.894402861378721e-06, + "loss": 0.024, + "step": 6712 + }, + { + "epoch": 0.6714, + "grad_norm": 0.7957079410552979, + "learning_rate": 5.88803806219134e-06, + "loss": 0.009, + "step": 6714 + }, + { + "epoch": 0.6716, + "grad_norm": 0.09081167727708817, + "learning_rate": 5.881675267124254e-06, + "loss": 0.0016, + "step": 6716 + }, + { + "epoch": 0.6718, + "grad_norm": 0.15728148818016052, + "learning_rate": 5.8753144792786095e-06, + "loss": 0.0016, + "step": 6718 + }, + { + "epoch": 0.672, + "grad_norm": 0.059372611343860626, + "learning_rate": 5.868955701754584e-06, + "loss": 0.001, + "step": 6720 + }, + { + "epoch": 0.6722, + "grad_norm": 0.031107597053050995, + "learning_rate": 5.862598937651364e-06, + "loss": 0.0015, + "step": 6722 + }, + { + "epoch": 0.6724, + "grad_norm": 0.012427779845893383, + "learning_rate": 5.85624419006716e-06, + "loss": 0.0002, + "step": 6724 + }, + { + "epoch": 0.6726, + "grad_norm": 0.16562356054782867, + "learning_rate": 5.849891462099199e-06, + "loss": 0.0979, + "step": 6726 + }, + { + "epoch": 0.6728, + "grad_norm": 0.29345473647117615, + "learning_rate": 5.843540756843722e-06, + "loss": 0.0028, + "step": 6728 + }, + { + "epoch": 0.673, + "grad_norm": 0.3953923285007477, + "learning_rate": 5.83719207739599e-06, + "loss": 0.019, + "step": 6730 + }, + { + "epoch": 0.6732, + "grad_norm": 2.4097073078155518, + "learning_rate": 5.830845426850268e-06, + "loss": 0.0604, + "step": 6732 + }, + { + "epoch": 0.6734, + "grad_norm": 0.060556214302778244, + "learning_rate": 5.824500808299836e-06, + "loss": 0.0017, + "step": 6734 + }, + { + "epoch": 0.6736, + "grad_norm": 0.11853527277708054, + "learning_rate": 5.818158224836987e-06, + "loss": 0.0018, + "step": 6736 + }, + { + "epoch": 0.6738, + "grad_norm": 0.05164762958884239, + "learning_rate": 5.811817679553018e-06, + "loss": 0.0124, + "step": 6738 + }, + { + "epoch": 0.674, + "grad_norm": 0.02714848890900612, + "learning_rate": 5.8054791755382286e-06, + "loss": 0.0005, + "step": 6740 + }, + { + "epoch": 0.6742, + "grad_norm": 0.01557651162147522, + "learning_rate": 5.799142715881938e-06, + "loss": 0.1649, + "step": 6742 + }, + { + "epoch": 0.6744, + "grad_norm": 0.08352338522672653, + "learning_rate": 5.792808303672454e-06, + "loss": 0.0017, + "step": 6744 + }, + { + "epoch": 0.6746, + "grad_norm": 0.05321921780705452, + "learning_rate": 5.786475941997094e-06, + "loss": 0.0017, + "step": 6746 + }, + { + "epoch": 0.6748, + "grad_norm": 0.00256960466504097, + "learning_rate": 5.780145633942173e-06, + "loss": 0.0005, + "step": 6748 + }, + { + "epoch": 0.675, + "grad_norm": 0.00647959066554904, + "learning_rate": 5.773817382593008e-06, + "loss": 0.0008, + "step": 6750 + }, + { + "epoch": 0.6752, + "grad_norm": 0.24289295077323914, + "learning_rate": 5.7674911910339094e-06, + "loss": 0.0028, + "step": 6752 + }, + { + "epoch": 0.6754, + "grad_norm": 0.05629004165530205, + "learning_rate": 5.761167062348187e-06, + "loss": 0.0981, + "step": 6754 + }, + { + "epoch": 0.6756, + "grad_norm": 0.013790975324809551, + "learning_rate": 5.754844999618144e-06, + "loss": 0.0004, + "step": 6756 + }, + { + "epoch": 0.6758, + "grad_norm": 0.07368209213018417, + "learning_rate": 5.748525005925074e-06, + "loss": 0.0049, + "step": 6758 + }, + { + "epoch": 0.676, + "grad_norm": 0.02065877988934517, + "learning_rate": 5.742207084349274e-06, + "loss": 0.0009, + "step": 6760 + }, + { + "epoch": 0.6762, + "grad_norm": 0.03507755696773529, + "learning_rate": 5.735891237970015e-06, + "loss": 0.0012, + "step": 6762 + }, + { + "epoch": 0.6764, + "grad_norm": 0.09122828394174576, + "learning_rate": 5.729577469865566e-06, + "loss": 0.0014, + "step": 6764 + }, + { + "epoch": 0.6766, + "grad_norm": 0.0724729597568512, + "learning_rate": 5.723265783113181e-06, + "loss": 0.1222, + "step": 6766 + }, + { + "epoch": 0.6768, + "grad_norm": 0.5860071182250977, + "learning_rate": 5.716956180789098e-06, + "loss": 0.0206, + "step": 6768 + }, + { + "epoch": 0.677, + "grad_norm": 0.02372116595506668, + "learning_rate": 5.710648665968543e-06, + "loss": 0.0108, + "step": 6770 + }, + { + "epoch": 0.6772, + "grad_norm": 0.0415017232298851, + "learning_rate": 5.704343241725719e-06, + "loss": 0.002, + "step": 6772 + }, + { + "epoch": 0.6774, + "grad_norm": 0.022251788526773453, + "learning_rate": 5.698039911133816e-06, + "loss": 0.0018, + "step": 6774 + }, + { + "epoch": 0.6776, + "grad_norm": 0.04581568390130997, + "learning_rate": 5.691738677265e-06, + "loss": 0.0036, + "step": 6776 + }, + { + "epoch": 0.6778, + "grad_norm": 0.07199141383171082, + "learning_rate": 5.685439543190409e-06, + "loss": 0.0012, + "step": 6778 + }, + { + "epoch": 0.678, + "grad_norm": 0.0010102344676852226, + "learning_rate": 5.679142511980176e-06, + "loss": 0.0002, + "step": 6780 + }, + { + "epoch": 0.6782, + "grad_norm": 0.008399094454944134, + "learning_rate": 5.672847586703393e-06, + "loss": 0.0002, + "step": 6782 + }, + { + "epoch": 0.6784, + "grad_norm": 7.055850982666016, + "learning_rate": 5.666554770428129e-06, + "loss": 0.0627, + "step": 6784 + }, + { + "epoch": 0.6786, + "grad_norm": 0.3727249503135681, + "learning_rate": 5.660264066221426e-06, + "loss": 0.0027, + "step": 6786 + }, + { + "epoch": 0.6788, + "grad_norm": 1.195907711982727, + "learning_rate": 5.653975477149298e-06, + "loss": 0.0208, + "step": 6788 + }, + { + "epoch": 0.679, + "grad_norm": 1.87263023853302, + "learning_rate": 5.647689006276727e-06, + "loss": 0.0244, + "step": 6790 + }, + { + "epoch": 0.6792, + "grad_norm": 0.009281485341489315, + "learning_rate": 5.641404656667661e-06, + "loss": 0.0003, + "step": 6792 + }, + { + "epoch": 0.6794, + "grad_norm": 0.00837109237909317, + "learning_rate": 5.6351224313850165e-06, + "loss": 0.0009, + "step": 6794 + }, + { + "epoch": 0.6796, + "grad_norm": 0.0029078274965286255, + "learning_rate": 5.628842333490674e-06, + "loss": 0.0007, + "step": 6796 + }, + { + "epoch": 0.6798, + "grad_norm": 0.006026624236255884, + "learning_rate": 5.622564366045472e-06, + "loss": 0.0002, + "step": 6798 + }, + { + "epoch": 0.68, + "grad_norm": 0.11210373789072037, + "learning_rate": 5.616288532109225e-06, + "loss": 0.0036, + "step": 6800 + }, + { + "epoch": 0.6802, + "grad_norm": 0.020680518820881844, + "learning_rate": 5.610014834740694e-06, + "loss": 0.0037, + "step": 6802 + }, + { + "epoch": 0.6804, + "grad_norm": 0.01062803901731968, + "learning_rate": 5.603743276997607e-06, + "loss": 0.05, + "step": 6804 + }, + { + "epoch": 0.6806, + "grad_norm": 8.232890129089355, + "learning_rate": 5.59747386193663e-06, + "loss": 0.1136, + "step": 6806 + }, + { + "epoch": 0.6808, + "grad_norm": 0.0067025842145085335, + "learning_rate": 5.591206592613416e-06, + "loss": 0.0005, + "step": 6808 + }, + { + "epoch": 0.681, + "grad_norm": 0.008817274123430252, + "learning_rate": 5.584941472082549e-06, + "loss": 0.0057, + "step": 6810 + }, + { + "epoch": 0.6812, + "grad_norm": 0.02928899973630905, + "learning_rate": 5.5786785033975745e-06, + "loss": 0.0004, + "step": 6812 + }, + { + "epoch": 0.6814, + "grad_norm": 0.0040833731181919575, + "learning_rate": 5.572417689610987e-06, + "loss": 0.0014, + "step": 6814 + }, + { + "epoch": 0.6816, + "grad_norm": 0.026080012321472168, + "learning_rate": 5.5661590337742255e-06, + "loss": 0.0004, + "step": 6816 + }, + { + "epoch": 0.6818, + "grad_norm": 0.011470077559351921, + "learning_rate": 5.559902538937694e-06, + "loss": 0.0005, + "step": 6818 + }, + { + "epoch": 0.682, + "grad_norm": 0.0030344543047249317, + "learning_rate": 5.553648208150728e-06, + "loss": 0.0001, + "step": 6820 + }, + { + "epoch": 0.6822, + "grad_norm": 0.03141999617218971, + "learning_rate": 5.5473960444616085e-06, + "loss": 0.001, + "step": 6822 + }, + { + "epoch": 0.6824, + "grad_norm": 0.47262027859687805, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.0209, + "step": 6824 + }, + { + "epoch": 0.6826, + "grad_norm": 0.0017993871588259935, + "learning_rate": 5.534898230564765e-06, + "loss": 0.0006, + "step": 6826 + }, + { + "epoch": 0.6828, + "grad_norm": 0.03312842547893524, + "learning_rate": 5.5286525864483285e-06, + "loss": 0.0007, + "step": 6828 + }, + { + "epoch": 0.683, + "grad_norm": 0.03399842604994774, + "learning_rate": 5.522409121612304e-06, + "loss": 0.0005, + "step": 6830 + }, + { + "epoch": 0.6832, + "grad_norm": 0.009786369279026985, + "learning_rate": 5.516167839099679e-06, + "loss": 0.0036, + "step": 6832 + }, + { + "epoch": 0.6834, + "grad_norm": 0.001568136503919959, + "learning_rate": 5.50992874195238e-06, + "loss": 0.0009, + "step": 6834 + }, + { + "epoch": 0.6836, + "grad_norm": 0.00060070282779634, + "learning_rate": 5.50369183321126e-06, + "loss": 0.1453, + "step": 6836 + }, + { + "epoch": 0.6838, + "grad_norm": 0.12824353575706482, + "learning_rate": 5.497457115916127e-06, + "loss": 0.0231, + "step": 6838 + }, + { + "epoch": 0.684, + "grad_norm": 0.00567758921533823, + "learning_rate": 5.491224593105695e-06, + "loss": 0.0053, + "step": 6840 + }, + { + "epoch": 0.6842, + "grad_norm": 0.006748121697455645, + "learning_rate": 5.484994267817624e-06, + "loss": 0.0005, + "step": 6842 + }, + { + "epoch": 0.6844, + "grad_norm": 0.00943030882626772, + "learning_rate": 5.478766143088492e-06, + "loss": 0.0005, + "step": 6844 + }, + { + "epoch": 0.6846, + "grad_norm": 0.01956344209611416, + "learning_rate": 5.472540221953824e-06, + "loss": 0.0145, + "step": 6846 + }, + { + "epoch": 0.6848, + "grad_norm": 1.3382668495178223, + "learning_rate": 5.466316507448049e-06, + "loss": 0.027, + "step": 6848 + }, + { + "epoch": 0.685, + "grad_norm": 0.03267885744571686, + "learning_rate": 5.460095002604533e-06, + "loss": 0.0009, + "step": 6850 + }, + { + "epoch": 0.6852, + "grad_norm": 0.09375008195638657, + "learning_rate": 5.453875710455562e-06, + "loss": 0.0103, + "step": 6852 + }, + { + "epoch": 0.6854, + "grad_norm": 0.02206331305205822, + "learning_rate": 5.447658634032338e-06, + "loss": 0.0007, + "step": 6854 + }, + { + "epoch": 0.6856, + "grad_norm": 0.0011815131874755025, + "learning_rate": 5.441443776365003e-06, + "loss": 0.0043, + "step": 6856 + }, + { + "epoch": 0.6858, + "grad_norm": 0.004387843422591686, + "learning_rate": 5.435231140482588e-06, + "loss": 0.0005, + "step": 6858 + }, + { + "epoch": 0.686, + "grad_norm": 0.061524372547864914, + "learning_rate": 5.429020729413062e-06, + "loss": 0.0012, + "step": 6860 + }, + { + "epoch": 0.6862, + "grad_norm": 0.019283220171928406, + "learning_rate": 5.4228125461833026e-06, + "loss": 0.0008, + "step": 6862 + }, + { + "epoch": 0.6864, + "grad_norm": 0.04242269694805145, + "learning_rate": 5.416606593819102e-06, + "loss": 0.0007, + "step": 6864 + }, + { + "epoch": 0.6866, + "grad_norm": 1.0819131135940552, + "learning_rate": 5.41040287534517e-06, + "loss": 0.0419, + "step": 6866 + }, + { + "epoch": 0.6868, + "grad_norm": 0.040847089141607285, + "learning_rate": 5.404201393785123e-06, + "loss": 0.001, + "step": 6868 + }, + { + "epoch": 0.687, + "grad_norm": 0.00235432805493474, + "learning_rate": 5.398002152161484e-06, + "loss": 0.0001, + "step": 6870 + }, + { + "epoch": 0.6872, + "grad_norm": 0.01538170501589775, + "learning_rate": 5.391805153495693e-06, + "loss": 0.0015, + "step": 6872 + }, + { + "epoch": 0.6874, + "grad_norm": 9.504051208496094, + "learning_rate": 5.385610400808088e-06, + "loss": 0.1901, + "step": 6874 + }, + { + "epoch": 0.6876, + "grad_norm": 2.1570615768432617, + "learning_rate": 5.379417897117917e-06, + "loss": 0.0441, + "step": 6876 + }, + { + "epoch": 0.6878, + "grad_norm": 0.06516844779253006, + "learning_rate": 5.373227645443332e-06, + "loss": 0.0009, + "step": 6878 + }, + { + "epoch": 0.688, + "grad_norm": 0.13168412446975708, + "learning_rate": 5.367039648801386e-06, + "loss": 0.0023, + "step": 6880 + }, + { + "epoch": 0.6882, + "grad_norm": 0.0010874213185161352, + "learning_rate": 5.360853910208028e-06, + "loss": 0.0004, + "step": 6882 + }, + { + "epoch": 0.6884, + "grad_norm": 0.1569470465183258, + "learning_rate": 5.354670432678124e-06, + "loss": 0.0095, + "step": 6884 + }, + { + "epoch": 0.6886, + "grad_norm": 0.16198831796646118, + "learning_rate": 5.348489219225417e-06, + "loss": 0.0026, + "step": 6886 + }, + { + "epoch": 0.6888, + "grad_norm": 3.654862403869629, + "learning_rate": 5.342310272862558e-06, + "loss": 0.0581, + "step": 6888 + }, + { + "epoch": 0.689, + "grad_norm": 0.04695506393909454, + "learning_rate": 5.336133596601089e-06, + "loss": 0.0007, + "step": 6890 + }, + { + "epoch": 0.6892, + "grad_norm": 0.04201231524348259, + "learning_rate": 5.3299591934514485e-06, + "loss": 0.0006, + "step": 6892 + }, + { + "epoch": 0.6894, + "grad_norm": 1.5836433172225952, + "learning_rate": 5.323787066422964e-06, + "loss": 0.0233, + "step": 6894 + }, + { + "epoch": 0.6896, + "grad_norm": 0.011214416474103928, + "learning_rate": 5.317617218523856e-06, + "loss": 0.0003, + "step": 6896 + }, + { + "epoch": 0.6898, + "grad_norm": 0.011503566987812519, + "learning_rate": 5.311449652761235e-06, + "loss": 0.0004, + "step": 6898 + }, + { + "epoch": 0.69, + "grad_norm": 0.2189198136329651, + "learning_rate": 5.305284372141095e-06, + "loss": 0.0146, + "step": 6900 + }, + { + "epoch": 0.6902, + "grad_norm": 0.7174133062362671, + "learning_rate": 5.299121379668316e-06, + "loss": 0.0079, + "step": 6902 + }, + { + "epoch": 0.6904, + "grad_norm": 0.018454063683748245, + "learning_rate": 5.292960678346674e-06, + "loss": 0.0003, + "step": 6904 + }, + { + "epoch": 0.6906, + "grad_norm": 0.005114637780934572, + "learning_rate": 5.286802271178815e-06, + "loss": 0.0002, + "step": 6906 + }, + { + "epoch": 0.6908, + "grad_norm": 0.06506377458572388, + "learning_rate": 5.280646161166274e-06, + "loss": 0.0012, + "step": 6908 + }, + { + "epoch": 0.691, + "grad_norm": 1.7541770935058594, + "learning_rate": 5.274492351309462e-06, + "loss": 0.0558, + "step": 6910 + }, + { + "epoch": 0.6912, + "grad_norm": 0.018325213342905045, + "learning_rate": 5.26834084460767e-06, + "loss": 0.0007, + "step": 6912 + }, + { + "epoch": 0.6914, + "grad_norm": 0.001591245410963893, + "learning_rate": 5.262191644059071e-06, + "loss": 0.0008, + "step": 6914 + }, + { + "epoch": 0.6916, + "grad_norm": 0.008631414733827114, + "learning_rate": 5.256044752660709e-06, + "loss": 0.0006, + "step": 6916 + }, + { + "epoch": 0.6918, + "grad_norm": 0.007518104277551174, + "learning_rate": 5.2499001734085045e-06, + "loss": 0.0002, + "step": 6918 + }, + { + "epoch": 0.692, + "grad_norm": 0.018207140266895294, + "learning_rate": 5.243757909297247e-06, + "loss": 0.1527, + "step": 6920 + }, + { + "epoch": 0.6922, + "grad_norm": 0.027980441227555275, + "learning_rate": 5.237617963320608e-06, + "loss": 0.0012, + "step": 6922 + }, + { + "epoch": 0.6924, + "grad_norm": 0.030276238918304443, + "learning_rate": 5.23148033847112e-06, + "loss": 0.0641, + "step": 6924 + }, + { + "epoch": 0.6926, + "grad_norm": 0.011113560758531094, + "learning_rate": 5.225345037740186e-06, + "loss": 0.0003, + "step": 6926 + }, + { + "epoch": 0.6928, + "grad_norm": 0.005331328138709068, + "learning_rate": 5.219212064118079e-06, + "loss": 0.0008, + "step": 6928 + }, + { + "epoch": 0.693, + "grad_norm": 0.09531562775373459, + "learning_rate": 5.213081420593933e-06, + "loss": 0.0022, + "step": 6930 + }, + { + "epoch": 0.6932, + "grad_norm": 0.06276984512805939, + "learning_rate": 5.2069531101557505e-06, + "loss": 0.0103, + "step": 6932 + }, + { + "epoch": 0.6934, + "grad_norm": 4.787047863006592, + "learning_rate": 5.200827135790396e-06, + "loss": 0.0313, + "step": 6934 + }, + { + "epoch": 0.6936, + "grad_norm": 1.154327392578125, + "learning_rate": 5.194703500483593e-06, + "loss": 0.0398, + "step": 6936 + }, + { + "epoch": 0.6938, + "grad_norm": 0.016400529071688652, + "learning_rate": 5.188582207219931e-06, + "loss": 0.0058, + "step": 6938 + }, + { + "epoch": 0.694, + "grad_norm": 0.024611592292785645, + "learning_rate": 5.1824632589828465e-06, + "loss": 0.0007, + "step": 6940 + }, + { + "epoch": 0.6942, + "grad_norm": 0.018706737086176872, + "learning_rate": 5.176346658754648e-06, + "loss": 0.0016, + "step": 6942 + }, + { + "epoch": 0.6944, + "grad_norm": 0.00897308625280857, + "learning_rate": 5.1702324095164955e-06, + "loss": 0.0003, + "step": 6944 + }, + { + "epoch": 0.6946, + "grad_norm": 0.0034114725422114134, + "learning_rate": 5.16412051424839e-06, + "loss": 0.0003, + "step": 6946 + }, + { + "epoch": 0.6948, + "grad_norm": 0.01823251135647297, + "learning_rate": 5.158010975929193e-06, + "loss": 0.0005, + "step": 6948 + }, + { + "epoch": 0.695, + "grad_norm": 1.6444127559661865, + "learning_rate": 5.151903797536631e-06, + "loss": 0.0327, + "step": 6950 + }, + { + "epoch": 0.6952, + "grad_norm": 0.010535416193306446, + "learning_rate": 5.145798982047261e-06, + "loss": 0.2109, + "step": 6952 + }, + { + "epoch": 0.6954, + "grad_norm": 0.012709192931652069, + "learning_rate": 5.139696532436499e-06, + "loss": 0.0009, + "step": 6954 + }, + { + "epoch": 0.6956, + "grad_norm": 0.005986664444208145, + "learning_rate": 5.133596451678603e-06, + "loss": 0.0073, + "step": 6956 + }, + { + "epoch": 0.6958, + "grad_norm": 0.005827431567013264, + "learning_rate": 5.127498742746675e-06, + "loss": 0.0002, + "step": 6958 + }, + { + "epoch": 0.696, + "grad_norm": 0.0093699274584651, + "learning_rate": 5.121403408612672e-06, + "loss": 0.0102, + "step": 6960 + }, + { + "epoch": 0.6962, + "grad_norm": 0.009417987428605556, + "learning_rate": 5.115310452247386e-06, + "loss": 0.0012, + "step": 6962 + }, + { + "epoch": 0.6964, + "grad_norm": 0.0032461797818541527, + "learning_rate": 5.109219876620441e-06, + "loss": 0.0001, + "step": 6964 + }, + { + "epoch": 0.6966, + "grad_norm": 0.10027706623077393, + "learning_rate": 5.103131684700315e-06, + "loss": 0.0343, + "step": 6966 + }, + { + "epoch": 0.6968, + "grad_norm": 0.029397081583738327, + "learning_rate": 5.0970458794543135e-06, + "loss": 0.0007, + "step": 6968 + }, + { + "epoch": 0.697, + "grad_norm": 0.11807326227426529, + "learning_rate": 5.090962463848592e-06, + "loss": 0.002, + "step": 6970 + }, + { + "epoch": 0.6972, + "grad_norm": 0.02331204153597355, + "learning_rate": 5.0848814408481305e-06, + "loss": 0.0007, + "step": 6972 + }, + { + "epoch": 0.6974, + "grad_norm": 0.006142944563180208, + "learning_rate": 5.078802813416746e-06, + "loss": 0.0013, + "step": 6974 + }, + { + "epoch": 0.6976, + "grad_norm": 8.515286445617676, + "learning_rate": 5.072726584517086e-06, + "loss": 0.0633, + "step": 6976 + }, + { + "epoch": 0.6978, + "grad_norm": 0.0038554775528609753, + "learning_rate": 5.066652757110628e-06, + "loss": 0.0004, + "step": 6978 + }, + { + "epoch": 0.698, + "grad_norm": 0.0054613142274320126, + "learning_rate": 5.060581334157693e-06, + "loss": 0.0242, + "step": 6980 + }, + { + "epoch": 0.6982, + "grad_norm": 0.06631436198949814, + "learning_rate": 5.054512318617406e-06, + "loss": 0.0013, + "step": 6982 + }, + { + "epoch": 0.6984, + "grad_norm": 0.10724499076604843, + "learning_rate": 5.048445713447738e-06, + "loss": 0.0432, + "step": 6984 + }, + { + "epoch": 0.6986, + "grad_norm": 0.0003937301808036864, + "learning_rate": 5.042381521605473e-06, + "loss": 0.0002, + "step": 6986 + }, + { + "epoch": 0.6988, + "grad_norm": 0.20628462731838226, + "learning_rate": 5.036319746046232e-06, + "loss": 0.0047, + "step": 6988 + }, + { + "epoch": 0.699, + "grad_norm": 0.0008893320336937904, + "learning_rate": 5.030260389724447e-06, + "loss": 0.0003, + "step": 6990 + }, + { + "epoch": 0.6992, + "grad_norm": 0.0035525488201528788, + "learning_rate": 5.024203455593375e-06, + "loss": 0.0004, + "step": 6992 + }, + { + "epoch": 0.6994, + "grad_norm": 0.0016102733789011836, + "learning_rate": 5.018148946605092e-06, + "loss": 0.006, + "step": 6994 + }, + { + "epoch": 0.6996, + "grad_norm": 0.010389769449830055, + "learning_rate": 5.012096865710494e-06, + "loss": 0.0003, + "step": 6996 + }, + { + "epoch": 0.6998, + "grad_norm": 0.19826185703277588, + "learning_rate": 5.0060472158592885e-06, + "loss": 0.0021, + "step": 6998 + }, + { + "epoch": 0.7, + "grad_norm": 0.6306955218315125, + "learning_rate": 5.000000000000003e-06, + "loss": 0.0089, + "step": 7000 + }, + { + "epoch": 0.7002, + "grad_norm": 0.22006924450397491, + "learning_rate": 4.993955221079976e-06, + "loss": 0.0081, + "step": 7002 + }, + { + "epoch": 0.7004, + "grad_norm": 0.05375058203935623, + "learning_rate": 4.98791288204536e-06, + "loss": 0.0023, + "step": 7004 + }, + { + "epoch": 0.7006, + "grad_norm": 0.07351608574390411, + "learning_rate": 4.981872985841115e-06, + "loss": 0.0008, + "step": 7006 + }, + { + "epoch": 0.7008, + "grad_norm": 0.5011048913002014, + "learning_rate": 4.97583553541102e-06, + "loss": 0.0051, + "step": 7008 + }, + { + "epoch": 0.701, + "grad_norm": 0.12022681534290314, + "learning_rate": 4.96980053369765e-06, + "loss": 0.0011, + "step": 7010 + }, + { + "epoch": 0.7012, + "grad_norm": 0.0010663126595318317, + "learning_rate": 4.9637679836423926e-06, + "loss": 0.0006, + "step": 7012 + }, + { + "epoch": 0.7014, + "grad_norm": 0.05113821104168892, + "learning_rate": 4.957737888185439e-06, + "loss": 0.0005, + "step": 7014 + }, + { + "epoch": 0.7016, + "grad_norm": 0.7952742576599121, + "learning_rate": 4.951710250265785e-06, + "loss": 0.0068, + "step": 7016 + }, + { + "epoch": 0.7018, + "grad_norm": 0.22491037845611572, + "learning_rate": 4.945685072821227e-06, + "loss": 0.0027, + "step": 7018 + }, + { + "epoch": 0.702, + "grad_norm": 0.02410183660686016, + "learning_rate": 4.939662358788364e-06, + "loss": 0.0426, + "step": 7020 + }, + { + "epoch": 0.7022, + "grad_norm": 0.0007158624939620495, + "learning_rate": 4.933642111102595e-06, + "loss": 0.0002, + "step": 7022 + }, + { + "epoch": 0.7024, + "grad_norm": 0.2799276113510132, + "learning_rate": 4.927624332698109e-06, + "loss": 0.0735, + "step": 7024 + }, + { + "epoch": 0.7026, + "grad_norm": 0.028988728299736977, + "learning_rate": 4.921609026507907e-06, + "loss": 0.0005, + "step": 7026 + }, + { + "epoch": 0.7028, + "grad_norm": 0.01711479015648365, + "learning_rate": 4.915596195463773e-06, + "loss": 0.002, + "step": 7028 + }, + { + "epoch": 0.703, + "grad_norm": 0.5638906955718994, + "learning_rate": 4.909585842496287e-06, + "loss": 0.0077, + "step": 7030 + }, + { + "epoch": 0.7032, + "grad_norm": 0.0008743267389945686, + "learning_rate": 4.903577970534823e-06, + "loss": 0.0491, + "step": 7032 + }, + { + "epoch": 0.7034, + "grad_norm": 3.8941051959991455, + "learning_rate": 4.897572582507544e-06, + "loss": 0.0444, + "step": 7034 + }, + { + "epoch": 0.7036, + "grad_norm": 0.007225708570331335, + "learning_rate": 4.891569681341403e-06, + "loss": 0.0002, + "step": 7036 + }, + { + "epoch": 0.7038, + "grad_norm": 0.002633677562698722, + "learning_rate": 4.885569269962142e-06, + "loss": 0.0255, + "step": 7038 + }, + { + "epoch": 0.704, + "grad_norm": 0.003760006744414568, + "learning_rate": 4.879571351294287e-06, + "loss": 0.0001, + "step": 7040 + }, + { + "epoch": 0.7042, + "grad_norm": 0.12538063526153564, + "learning_rate": 4.873575928261151e-06, + "loss": 0.0018, + "step": 7042 + }, + { + "epoch": 0.7044, + "grad_norm": 0.002150175394490361, + "learning_rate": 4.8675830037848295e-06, + "loss": 0.0, + "step": 7044 + }, + { + "epoch": 0.7046, + "grad_norm": 0.0037344847805798054, + "learning_rate": 4.861592580786205e-06, + "loss": 0.3382, + "step": 7046 + }, + { + "epoch": 0.7048, + "grad_norm": 0.014177968725562096, + "learning_rate": 4.855604662184935e-06, + "loss": 0.0012, + "step": 7048 + }, + { + "epoch": 0.705, + "grad_norm": 3.3478283882141113, + "learning_rate": 4.849619250899458e-06, + "loss": 0.1276, + "step": 7050 + }, + { + "epoch": 0.7052, + "grad_norm": 0.03939853236079216, + "learning_rate": 4.843636349846991e-06, + "loss": 0.0006, + "step": 7052 + }, + { + "epoch": 0.7054, + "grad_norm": 0.1458790898323059, + "learning_rate": 4.837655961943526e-06, + "loss": 0.0015, + "step": 7054 + }, + { + "epoch": 0.7056, + "grad_norm": 0.027901392430067062, + "learning_rate": 4.831678090103832e-06, + "loss": 0.0012, + "step": 7056 + }, + { + "epoch": 0.7058, + "grad_norm": 0.15738597512245178, + "learning_rate": 4.825702737241452e-06, + "loss": 0.0144, + "step": 7058 + }, + { + "epoch": 0.706, + "grad_norm": 0.0038130206521600485, + "learning_rate": 4.8197299062687e-06, + "loss": 0.001, + "step": 7060 + }, + { + "epoch": 0.7062, + "grad_norm": 0.05160106346011162, + "learning_rate": 4.813759600096661e-06, + "loss": 0.0017, + "step": 7062 + }, + { + "epoch": 0.7064, + "grad_norm": 2.2329866886138916, + "learning_rate": 4.807791821635186e-06, + "loss": 0.049, + "step": 7064 + }, + { + "epoch": 0.7066, + "grad_norm": 0.09297562390565872, + "learning_rate": 4.801826573792905e-06, + "loss": 0.002, + "step": 7066 + }, + { + "epoch": 0.7068, + "grad_norm": 0.0023454781621694565, + "learning_rate": 4.795863859477207e-06, + "loss": 0.0009, + "step": 7068 + }, + { + "epoch": 0.707, + "grad_norm": 0.002076120348647237, + "learning_rate": 4.78990368159424e-06, + "loss": 0.0014, + "step": 7070 + }, + { + "epoch": 0.7072, + "grad_norm": 0.0567765012383461, + "learning_rate": 4.783946043048922e-06, + "loss": 0.0419, + "step": 7072 + }, + { + "epoch": 0.7074, + "grad_norm": 4.062353610992432, + "learning_rate": 4.7779909467449416e-06, + "loss": 0.0576, + "step": 7074 + }, + { + "epoch": 0.7076, + "grad_norm": 0.9347897171974182, + "learning_rate": 4.772038395584735e-06, + "loss": 0.0315, + "step": 7076 + }, + { + "epoch": 0.7078, + "grad_norm": 0.017569538205862045, + "learning_rate": 4.7660883924695055e-06, + "loss": 0.0021, + "step": 7078 + }, + { + "epoch": 0.708, + "grad_norm": 0.0005629007355310023, + "learning_rate": 4.76014094029921e-06, + "loss": 0.0001, + "step": 7080 + }, + { + "epoch": 0.7082, + "grad_norm": 0.03154703974723816, + "learning_rate": 4.754196041972563e-06, + "loss": 0.0043, + "step": 7082 + }, + { + "epoch": 0.7084, + "grad_norm": 1.8465126752853394, + "learning_rate": 4.7482537003870425e-06, + "loss": 0.054, + "step": 7084 + }, + { + "epoch": 0.7086, + "grad_norm": 0.00341541669331491, + "learning_rate": 4.7423139184388725e-06, + "loss": 0.0164, + "step": 7086 + }, + { + "epoch": 0.7088, + "grad_norm": 1.0876895189285278, + "learning_rate": 4.736376699023023e-06, + "loss": 0.0216, + "step": 7088 + }, + { + "epoch": 0.709, + "grad_norm": 0.05083809420466423, + "learning_rate": 4.7304420450332244e-06, + "loss": 0.0007, + "step": 7090 + }, + { + "epoch": 0.7092, + "grad_norm": 0.09320921450853348, + "learning_rate": 4.724509959361961e-06, + "loss": 0.0026, + "step": 7092 + }, + { + "epoch": 0.7094, + "grad_norm": 0.0011155757820233703, + "learning_rate": 4.718580444900457e-06, + "loss": 0.0, + "step": 7094 + }, + { + "epoch": 0.7096, + "grad_norm": 0.013048029504716396, + "learning_rate": 4.712653504538684e-06, + "loss": 0.0002, + "step": 7096 + }, + { + "epoch": 0.7098, + "grad_norm": 0.3890015482902527, + "learning_rate": 4.706729141165362e-06, + "loss": 0.0045, + "step": 7098 + }, + { + "epoch": 0.71, + "grad_norm": 0.054237425327301025, + "learning_rate": 4.700807357667953e-06, + "loss": 0.0069, + "step": 7100 + }, + { + "epoch": 0.7102, + "grad_norm": 0.40832552313804626, + "learning_rate": 4.694888156932657e-06, + "loss": 0.0054, + "step": 7102 + }, + { + "epoch": 0.7104, + "grad_norm": 0.7871062159538269, + "learning_rate": 4.688971541844436e-06, + "loss": 0.008, + "step": 7104 + }, + { + "epoch": 0.7106, + "grad_norm": 0.000854452489875257, + "learning_rate": 4.6830575152869615e-06, + "loss": 0.0005, + "step": 7106 + }, + { + "epoch": 0.7108, + "grad_norm": 0.020824009552598, + "learning_rate": 4.677146080142664e-06, + "loss": 0.0004, + "step": 7108 + }, + { + "epoch": 0.711, + "grad_norm": 0.0015482951421290636, + "learning_rate": 4.671237239292699e-06, + "loss": 0.0005, + "step": 7110 + }, + { + "epoch": 0.7112, + "grad_norm": 0.008264116011559963, + "learning_rate": 4.6653309956169745e-06, + "loss": 0.0002, + "step": 7112 + }, + { + "epoch": 0.7114, + "grad_norm": 4.452683925628662, + "learning_rate": 4.659427351994116e-06, + "loss": 0.2895, + "step": 7114 + }, + { + "epoch": 0.7116, + "grad_norm": 0.007685402408242226, + "learning_rate": 4.6535263113014885e-06, + "loss": 0.0085, + "step": 7116 + }, + { + "epoch": 0.7118, + "grad_norm": 0.14094781875610352, + "learning_rate": 4.647627876415186e-06, + "loss": 0.0048, + "step": 7118 + }, + { + "epoch": 0.712, + "grad_norm": 0.24827337265014648, + "learning_rate": 4.641732050210032e-06, + "loss": 0.0033, + "step": 7120 + }, + { + "epoch": 0.7122, + "grad_norm": 0.012852887623012066, + "learning_rate": 4.635838835559591e-06, + "loss": 0.0002, + "step": 7122 + }, + { + "epoch": 0.7124, + "grad_norm": 0.062244608998298645, + "learning_rate": 4.629948235336133e-06, + "loss": 0.0009, + "step": 7124 + }, + { + "epoch": 0.7126, + "grad_norm": 0.005316096358001232, + "learning_rate": 4.62406025241067e-06, + "loss": 0.0005, + "step": 7126 + }, + { + "epoch": 0.7128, + "grad_norm": 0.0017135407542809844, + "learning_rate": 4.618174889652928e-06, + "loss": 0.0018, + "step": 7128 + }, + { + "epoch": 0.713, + "grad_norm": 0.05396157130599022, + "learning_rate": 4.612292149931369e-06, + "loss": 0.0047, + "step": 7130 + }, + { + "epoch": 0.7132, + "grad_norm": 0.19131122529506683, + "learning_rate": 4.606412036113166e-06, + "loss": 0.0022, + "step": 7132 + }, + { + "epoch": 0.7134, + "grad_norm": 0.4607093334197998, + "learning_rate": 4.600534551064215e-06, + "loss": 0.0076, + "step": 7134 + }, + { + "epoch": 0.7136, + "grad_norm": 0.004518249537795782, + "learning_rate": 4.59465969764913e-06, + "loss": 0.0002, + "step": 7136 + }, + { + "epoch": 0.7138, + "grad_norm": 0.006370124407112598, + "learning_rate": 4.588787478731242e-06, + "loss": 0.1066, + "step": 7138 + }, + { + "epoch": 0.714, + "grad_norm": 0.10069084912538528, + "learning_rate": 4.582917897172603e-06, + "loss": 0.0027, + "step": 7140 + }, + { + "epoch": 0.7142, + "grad_norm": 0.42796555161476135, + "learning_rate": 4.577050955833972e-06, + "loss": 0.0062, + "step": 7142 + }, + { + "epoch": 0.7144, + "grad_norm": 0.001995661063119769, + "learning_rate": 4.571186657574828e-06, + "loss": 0.0756, + "step": 7144 + }, + { + "epoch": 0.7146, + "grad_norm": 0.0032681161537766457, + "learning_rate": 4.565325005253356e-06, + "loss": 0.0001, + "step": 7146 + }, + { + "epoch": 0.7148, + "grad_norm": 0.003111182013526559, + "learning_rate": 4.559466001726451e-06, + "loss": 0.0006, + "step": 7148 + }, + { + "epoch": 0.715, + "grad_norm": 0.008597498759627342, + "learning_rate": 4.5536096498497295e-06, + "loss": 0.0002, + "step": 7150 + }, + { + "epoch": 0.7152, + "grad_norm": 2.3959999084472656, + "learning_rate": 4.5477559524775e-06, + "loss": 0.0228, + "step": 7152 + }, + { + "epoch": 0.7154, + "grad_norm": 0.38113638758659363, + "learning_rate": 4.541904912462785e-06, + "loss": 0.0051, + "step": 7154 + }, + { + "epoch": 0.7156, + "grad_norm": 0.07267725467681885, + "learning_rate": 4.53605653265731e-06, + "loss": 0.0012, + "step": 7156 + }, + { + "epoch": 0.7158, + "grad_norm": 0.0770830288529396, + "learning_rate": 4.530210815911504e-06, + "loss": 0.0009, + "step": 7158 + }, + { + "epoch": 0.716, + "grad_norm": 0.007295739371329546, + "learning_rate": 4.524367765074499e-06, + "loss": 0.0007, + "step": 7160 + }, + { + "epoch": 0.7162, + "grad_norm": 0.031691212207078934, + "learning_rate": 4.518527382994127e-06, + "loss": 0.0209, + "step": 7162 + }, + { + "epoch": 0.7164, + "grad_norm": 0.02135957032442093, + "learning_rate": 4.512689672516918e-06, + "loss": 0.0003, + "step": 7164 + }, + { + "epoch": 0.7166, + "grad_norm": 0.5460742115974426, + "learning_rate": 4.506854636488103e-06, + "loss": 0.0052, + "step": 7166 + }, + { + "epoch": 0.7168, + "grad_norm": 0.04639771580696106, + "learning_rate": 4.501022277751602e-06, + "loss": 0.001, + "step": 7168 + }, + { + "epoch": 0.717, + "grad_norm": 0.003160811262205243, + "learning_rate": 4.495192599150045e-06, + "loss": 0.0115, + "step": 7170 + }, + { + "epoch": 0.7172, + "grad_norm": 0.6785265803337097, + "learning_rate": 4.48936560352474e-06, + "loss": 0.0052, + "step": 7172 + }, + { + "epoch": 0.7174, + "grad_norm": 3.931917667388916, + "learning_rate": 4.483541293715699e-06, + "loss": 0.0423, + "step": 7174 + }, + { + "epoch": 0.7176, + "grad_norm": 0.044392701238393784, + "learning_rate": 4.477719672561615e-06, + "loss": 0.0007, + "step": 7176 + }, + { + "epoch": 0.7178, + "grad_norm": 0.059812601655721664, + "learning_rate": 4.471900742899876e-06, + "loss": 0.0028, + "step": 7178 + }, + { + "epoch": 0.718, + "grad_norm": 0.01762833073735237, + "learning_rate": 4.46608450756656e-06, + "loss": 0.0003, + "step": 7180 + }, + { + "epoch": 0.7182, + "grad_norm": 0.003751841839402914, + "learning_rate": 4.4602709693964296e-06, + "loss": 0.0048, + "step": 7182 + }, + { + "epoch": 0.7184, + "grad_norm": 0.01016370952129364, + "learning_rate": 4.4544601312229295e-06, + "loss": 0.0002, + "step": 7184 + }, + { + "epoch": 0.7186, + "grad_norm": 0.09167847037315369, + "learning_rate": 4.44865199587819e-06, + "loss": 0.0014, + "step": 7186 + }, + { + "epoch": 0.7188, + "grad_norm": 0.030100040137767792, + "learning_rate": 4.442846566193034e-06, + "loss": 0.0011, + "step": 7188 + }, + { + "epoch": 0.719, + "grad_norm": 0.008730552159249783, + "learning_rate": 4.437043844996952e-06, + "loss": 0.0281, + "step": 7190 + }, + { + "epoch": 0.7192, + "grad_norm": 0.08410181105136871, + "learning_rate": 4.4312438351181246e-06, + "loss": 0.0015, + "step": 7192 + }, + { + "epoch": 0.7194, + "grad_norm": 0.09827223420143127, + "learning_rate": 4.425446539383394e-06, + "loss": 0.0007, + "step": 7194 + }, + { + "epoch": 0.7196, + "grad_norm": 0.0028765464667230844, + "learning_rate": 4.419651960618302e-06, + "loss": 0.0001, + "step": 7196 + }, + { + "epoch": 0.7198, + "grad_norm": 0.03209836035966873, + "learning_rate": 4.413860101647055e-06, + "loss": 0.0004, + "step": 7198 + }, + { + "epoch": 0.72, + "grad_norm": 0.011527705006301403, + "learning_rate": 4.408070965292534e-06, + "loss": 0.0002, + "step": 7200 + }, + { + "epoch": 0.7202, + "grad_norm": 0.010479127056896687, + "learning_rate": 4.402284554376292e-06, + "loss": 0.0009, + "step": 7202 + }, + { + "epoch": 0.7204, + "grad_norm": 0.003262470941990614, + "learning_rate": 4.3965008717185555e-06, + "loss": 0.0032, + "step": 7204 + }, + { + "epoch": 0.7206, + "grad_norm": 0.009083700366318226, + "learning_rate": 4.39071992013822e-06, + "loss": 0.0004, + "step": 7206 + }, + { + "epoch": 0.7208, + "grad_norm": 0.06803792715072632, + "learning_rate": 4.384941702452856e-06, + "loss": 0.0012, + "step": 7208 + }, + { + "epoch": 0.721, + "grad_norm": 0.005579251330345869, + "learning_rate": 4.379166221478697e-06, + "loss": 0.0001, + "step": 7210 + }, + { + "epoch": 0.7212, + "grad_norm": 0.004025753121823072, + "learning_rate": 4.373393480030637e-06, + "loss": 0.0001, + "step": 7212 + }, + { + "epoch": 0.7214, + "grad_norm": 1.3598376512527466, + "learning_rate": 4.367623480922236e-06, + "loss": 0.0225, + "step": 7214 + }, + { + "epoch": 0.7216, + "grad_norm": 0.0008888101438060403, + "learning_rate": 4.361856226965733e-06, + "loss": 0.0002, + "step": 7216 + }, + { + "epoch": 0.7218, + "grad_norm": 0.0009162494679912925, + "learning_rate": 4.356091720972011e-06, + "loss": 0.0096, + "step": 7218 + }, + { + "epoch": 0.722, + "grad_norm": 0.6826017498970032, + "learning_rate": 4.350329965750622e-06, + "loss": 0.0112, + "step": 7220 + }, + { + "epoch": 0.7222, + "grad_norm": 0.0057380287908017635, + "learning_rate": 4.344570964109775e-06, + "loss": 0.0006, + "step": 7222 + }, + { + "epoch": 0.7224, + "grad_norm": 0.6049830317497253, + "learning_rate": 4.338814718856333e-06, + "loss": 0.0058, + "step": 7224 + }, + { + "epoch": 0.7226, + "grad_norm": 0.0024684439413249493, + "learning_rate": 4.3330612327958265e-06, + "loss": 0.0001, + "step": 7226 + }, + { + "epoch": 0.7228, + "grad_norm": 0.003494675038382411, + "learning_rate": 4.3273105087324375e-06, + "loss": 0.0007, + "step": 7228 + }, + { + "epoch": 0.723, + "grad_norm": 0.36128008365631104, + "learning_rate": 4.321562549468991e-06, + "loss": 0.0028, + "step": 7230 + }, + { + "epoch": 0.7232, + "grad_norm": 0.0051346044056117535, + "learning_rate": 4.315817357806974e-06, + "loss": 0.0544, + "step": 7232 + }, + { + "epoch": 0.7234, + "grad_norm": 0.0033201253972947598, + "learning_rate": 4.310074936546521e-06, + "loss": 0.0001, + "step": 7234 + }, + { + "epoch": 0.7236, + "grad_norm": 0.0034376203548163176, + "learning_rate": 4.304335288486426e-06, + "loss": 0.0009, + "step": 7236 + }, + { + "epoch": 0.7238, + "grad_norm": 0.0007489158306270838, + "learning_rate": 4.29859841642412e-06, + "loss": 0.0003, + "step": 7238 + }, + { + "epoch": 0.724, + "grad_norm": 0.0014194049872457981, + "learning_rate": 4.292864323155684e-06, + "loss": 0.0001, + "step": 7240 + }, + { + "epoch": 0.7242, + "grad_norm": 0.12167397141456604, + "learning_rate": 4.287133011475847e-06, + "loss": 0.0007, + "step": 7242 + }, + { + "epoch": 0.7244, + "grad_norm": 0.019804149866104126, + "learning_rate": 4.281404484177974e-06, + "loss": 0.0001, + "step": 7244 + }, + { + "epoch": 0.7246, + "grad_norm": 0.012201853096485138, + "learning_rate": 4.275678744054094e-06, + "loss": 0.0002, + "step": 7246 + }, + { + "epoch": 0.7248, + "grad_norm": 0.009577571414411068, + "learning_rate": 4.26995579389485e-06, + "loss": 0.0003, + "step": 7248 + }, + { + "epoch": 0.725, + "grad_norm": 0.014701656065881252, + "learning_rate": 4.264235636489542e-06, + "loss": 0.0002, + "step": 7250 + }, + { + "epoch": 0.7252, + "grad_norm": 1.2963123321533203, + "learning_rate": 4.258518274626103e-06, + "loss": 0.0158, + "step": 7252 + }, + { + "epoch": 0.7254, + "grad_norm": 0.00898891780525446, + "learning_rate": 4.2528037110911126e-06, + "loss": 0.0003, + "step": 7254 + }, + { + "epoch": 0.7256, + "grad_norm": 0.007188902236521244, + "learning_rate": 4.247091948669775e-06, + "loss": 0.0003, + "step": 7256 + }, + { + "epoch": 0.7258, + "grad_norm": 0.5577231645584106, + "learning_rate": 4.2413829901459345e-06, + "loss": 0.1153, + "step": 7258 + }, + { + "epoch": 0.726, + "grad_norm": 0.15754017233848572, + "learning_rate": 4.235676838302069e-06, + "loss": 0.0031, + "step": 7260 + }, + { + "epoch": 0.7262, + "grad_norm": 0.019452622160315514, + "learning_rate": 4.229973495919286e-06, + "loss": 0.0004, + "step": 7262 + }, + { + "epoch": 0.7264, + "grad_norm": 0.003342308336868882, + "learning_rate": 4.224272965777326e-06, + "loss": 0.001, + "step": 7264 + }, + { + "epoch": 0.7266, + "grad_norm": 0.3362865149974823, + "learning_rate": 4.218575250654559e-06, + "loss": 0.0051, + "step": 7266 + }, + { + "epoch": 0.7268, + "grad_norm": 0.005193654913455248, + "learning_rate": 4.21288035332798e-06, + "loss": 0.0006, + "step": 7268 + }, + { + "epoch": 0.727, + "grad_norm": 0.016276391223073006, + "learning_rate": 4.207188276573214e-06, + "loss": 0.0002, + "step": 7270 + }, + { + "epoch": 0.7272, + "grad_norm": 0.006719834636896849, + "learning_rate": 4.201499023164508e-06, + "loss": 0.0002, + "step": 7272 + }, + { + "epoch": 0.7274, + "grad_norm": 4.912821292877197, + "learning_rate": 4.19581259587474e-06, + "loss": 0.183, + "step": 7274 + }, + { + "epoch": 0.7276, + "grad_norm": 0.008116974495351315, + "learning_rate": 4.190128997475402e-06, + "loss": 0.0003, + "step": 7276 + }, + { + "epoch": 0.7278, + "grad_norm": 0.00897238776087761, + "learning_rate": 4.184448230736613e-06, + "loss": 0.0009, + "step": 7278 + }, + { + "epoch": 0.728, + "grad_norm": 0.0006961161852814257, + "learning_rate": 4.178770298427107e-06, + "loss": 0.0001, + "step": 7280 + }, + { + "epoch": 0.7282, + "grad_norm": 0.8838779926300049, + "learning_rate": 4.173095203314241e-06, + "loss": 0.0104, + "step": 7282 + }, + { + "epoch": 0.7284, + "grad_norm": 0.019008005037903786, + "learning_rate": 4.167422948163986e-06, + "loss": 0.0003, + "step": 7284 + }, + { + "epoch": 0.7286, + "grad_norm": 0.0005648411461152136, + "learning_rate": 4.161753535740932e-06, + "loss": 0.0, + "step": 7286 + }, + { + "epoch": 0.7288, + "grad_norm": 0.002607224741950631, + "learning_rate": 4.15608696880828e-06, + "loss": 0.0045, + "step": 7288 + }, + { + "epoch": 0.729, + "grad_norm": 0.017235226929187775, + "learning_rate": 4.150423250127846e-06, + "loss": 0.0003, + "step": 7290 + }, + { + "epoch": 0.7292, + "grad_norm": 2.4241840839385986, + "learning_rate": 4.144762382460059e-06, + "loss": 0.0094, + "step": 7292 + }, + { + "epoch": 0.7294, + "grad_norm": 0.002689911751076579, + "learning_rate": 4.1391043685639576e-06, + "loss": 0.0083, + "step": 7294 + }, + { + "epoch": 0.7296, + "grad_norm": 0.0020092676859349012, + "learning_rate": 4.133449211197188e-06, + "loss": 0.0006, + "step": 7296 + }, + { + "epoch": 0.7298, + "grad_norm": 0.002770183840766549, + "learning_rate": 4.127796913116004e-06, + "loss": 0.0007, + "step": 7298 + }, + { + "epoch": 0.73, + "grad_norm": 0.04196953773498535, + "learning_rate": 4.12214747707527e-06, + "loss": 0.019, + "step": 7300 + }, + { + "epoch": 0.7302, + "grad_norm": 0.0008381783263757825, + "learning_rate": 4.1165009058284496e-06, + "loss": 0.0001, + "step": 7302 + }, + { + "epoch": 0.7304, + "grad_norm": 0.002724381862208247, + "learning_rate": 4.110857202127615e-06, + "loss": 0.0053, + "step": 7304 + }, + { + "epoch": 0.7306, + "grad_norm": 0.0031751070637255907, + "learning_rate": 4.105216368723437e-06, + "loss": 0.0199, + "step": 7306 + }, + { + "epoch": 0.7308, + "grad_norm": 0.08941850066184998, + "learning_rate": 4.099578408365192e-06, + "loss": 0.0011, + "step": 7308 + }, + { + "epoch": 0.731, + "grad_norm": 6.171564102172852, + "learning_rate": 4.093943323800746e-06, + "loss": 0.0535, + "step": 7310 + }, + { + "epoch": 0.7312, + "grad_norm": 0.10253046452999115, + "learning_rate": 4.08831111777658e-06, + "loss": 0.001, + "step": 7312 + }, + { + "epoch": 0.7314, + "grad_norm": 0.006578122265636921, + "learning_rate": 4.08268179303776e-06, + "loss": 0.0001, + "step": 7314 + }, + { + "epoch": 0.7316, + "grad_norm": 0.0359555147588253, + "learning_rate": 4.0770553523279535e-06, + "loss": 0.001, + "step": 7316 + }, + { + "epoch": 0.7318, + "grad_norm": 0.026334408670663834, + "learning_rate": 4.071431798389408e-06, + "loss": 0.0003, + "step": 7318 + }, + { + "epoch": 0.732, + "grad_norm": 0.006755968555808067, + "learning_rate": 4.065811133962987e-06, + "loss": 0.0002, + "step": 7320 + }, + { + "epoch": 0.7322, + "grad_norm": 11.6577787399292, + "learning_rate": 4.06019336178813e-06, + "loss": 0.1599, + "step": 7322 + }, + { + "epoch": 0.7324, + "grad_norm": 0.017142869532108307, + "learning_rate": 4.05457848460287e-06, + "loss": 0.0002, + "step": 7324 + }, + { + "epoch": 0.7326, + "grad_norm": 9.04963493347168, + "learning_rate": 4.048966505143831e-06, + "loss": 0.1146, + "step": 7326 + }, + { + "epoch": 0.7328, + "grad_norm": 0.012293504551053047, + "learning_rate": 4.04335742614622e-06, + "loss": 0.0003, + "step": 7328 + }, + { + "epoch": 0.733, + "grad_norm": 0.022796254605054855, + "learning_rate": 4.037751250343841e-06, + "loss": 0.0034, + "step": 7330 + }, + { + "epoch": 0.7332, + "grad_norm": 0.07998229563236237, + "learning_rate": 4.032147980469072e-06, + "loss": 0.0009, + "step": 7332 + }, + { + "epoch": 0.7334, + "grad_norm": 5.401417255401611, + "learning_rate": 4.026547619252883e-06, + "loss": 0.0636, + "step": 7334 + }, + { + "epoch": 0.7336, + "grad_norm": 0.0006325480062514544, + "learning_rate": 4.020950169424815e-06, + "loss": 0.0, + "step": 7336 + }, + { + "epoch": 0.7338, + "grad_norm": 0.0014419344952329993, + "learning_rate": 4.015355633712996e-06, + "loss": 0.0012, + "step": 7338 + }, + { + "epoch": 0.734, + "grad_norm": 0.05532076209783554, + "learning_rate": 4.009764014844143e-06, + "loss": 0.0008, + "step": 7340 + }, + { + "epoch": 0.7342, + "grad_norm": 0.0017274465644732118, + "learning_rate": 4.004175315543538e-06, + "loss": 0.0053, + "step": 7342 + }, + { + "epoch": 0.7344, + "grad_norm": 0.024689000099897385, + "learning_rate": 3.998589538535046e-06, + "loss": 0.0005, + "step": 7344 + }, + { + "epoch": 0.7346, + "grad_norm": 0.020717991515994072, + "learning_rate": 3.993006686541108e-06, + "loss": 0.0007, + "step": 7346 + }, + { + "epoch": 0.7348, + "grad_norm": 0.0008002000395208597, + "learning_rate": 3.987426762282733e-06, + "loss": 0.0001, + "step": 7348 + }, + { + "epoch": 0.735, + "grad_norm": 0.002165192738175392, + "learning_rate": 3.981849768479516e-06, + "loss": 0.0002, + "step": 7350 + }, + { + "epoch": 0.7352, + "grad_norm": 0.008022121153771877, + "learning_rate": 3.976275707849616e-06, + "loss": 0.0012, + "step": 7352 + }, + { + "epoch": 0.7354, + "grad_norm": 0.01749715767800808, + "learning_rate": 3.970704583109755e-06, + "loss": 0.0278, + "step": 7354 + }, + { + "epoch": 0.7356, + "grad_norm": 0.10509452223777771, + "learning_rate": 3.965136396975235e-06, + "loss": 0.0014, + "step": 7356 + }, + { + "epoch": 0.7358, + "grad_norm": 0.012819766066968441, + "learning_rate": 3.959571152159922e-06, + "loss": 0.0599, + "step": 7358 + }, + { + "epoch": 0.736, + "grad_norm": 0.0013439450412988663, + "learning_rate": 3.954008851376252e-06, + "loss": 0.0001, + "step": 7360 + }, + { + "epoch": 0.7362, + "grad_norm": 0.00594932259991765, + "learning_rate": 3.94844949733522e-06, + "loss": 0.0001, + "step": 7362 + }, + { + "epoch": 0.7364, + "grad_norm": 0.008112025447189808, + "learning_rate": 3.942893092746387e-06, + "loss": 0.0041, + "step": 7364 + }, + { + "epoch": 0.7366, + "grad_norm": 0.10816899687051773, + "learning_rate": 3.937339640317879e-06, + "loss": 0.2007, + "step": 7366 + }, + { + "epoch": 0.7368, + "grad_norm": 0.05110194534063339, + "learning_rate": 3.931789142756377e-06, + "loss": 0.0007, + "step": 7368 + }, + { + "epoch": 0.737, + "grad_norm": 0.0044801016338169575, + "learning_rate": 3.9262416027671354e-06, + "loss": 0.0016, + "step": 7370 + }, + { + "epoch": 0.7372, + "grad_norm": 0.015039633959531784, + "learning_rate": 3.920697023053949e-06, + "loss": 0.0005, + "step": 7372 + }, + { + "epoch": 0.7374, + "grad_norm": 0.0011384140234440565, + "learning_rate": 3.915155406319181e-06, + "loss": 0.0371, + "step": 7374 + }, + { + "epoch": 0.7376, + "grad_norm": 0.004042440094053745, + "learning_rate": 3.9096167552637454e-06, + "loss": 0.0889, + "step": 7376 + }, + { + "epoch": 0.7378, + "grad_norm": 0.005818674806505442, + "learning_rate": 3.90408107258712e-06, + "loss": 0.0002, + "step": 7378 + }, + { + "epoch": 0.738, + "grad_norm": 0.194391667842865, + "learning_rate": 3.898548360987325e-06, + "loss": 0.0066, + "step": 7380 + }, + { + "epoch": 0.7382, + "grad_norm": 0.015000996179878712, + "learning_rate": 3.893018623160938e-06, + "loss": 0.0003, + "step": 7382 + }, + { + "epoch": 0.7384, + "grad_norm": 0.3958265483379364, + "learning_rate": 3.887491861803085e-06, + "loss": 0.0045, + "step": 7384 + }, + { + "epoch": 0.7386, + "grad_norm": 0.0010335248662158847, + "learning_rate": 3.88196807960744e-06, + "loss": 0.0001, + "step": 7386 + }, + { + "epoch": 0.7388, + "grad_norm": 0.42004430294036865, + "learning_rate": 3.876447279266238e-06, + "loss": 0.009, + "step": 7388 + }, + { + "epoch": 0.739, + "grad_norm": 0.06539393961429596, + "learning_rate": 3.8709294634702374e-06, + "loss": 0.0009, + "step": 7390 + }, + { + "epoch": 0.7392, + "grad_norm": 0.0014862024690955877, + "learning_rate": 3.86541463490876e-06, + "loss": 0.0001, + "step": 7392 + }, + { + "epoch": 0.7394, + "grad_norm": 0.001257197349332273, + "learning_rate": 3.859902796269664e-06, + "loss": 0.0001, + "step": 7394 + }, + { + "epoch": 0.7396, + "grad_norm": 0.0048982324078679085, + "learning_rate": 3.854393950239356e-06, + "loss": 0.0012, + "step": 7396 + }, + { + "epoch": 0.7398, + "grad_norm": 0.20494438707828522, + "learning_rate": 3.848888099502779e-06, + "loss": 0.0035, + "step": 7398 + }, + { + "epoch": 0.74, + "grad_norm": 0.15450944006443024, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.0008, + "step": 7400 + }, + { + "epoch": 0.7402, + "grad_norm": 0.010694042779505253, + "learning_rate": 3.8378853946432956e-06, + "loss": 0.0003, + "step": 7402 + }, + { + "epoch": 0.7404, + "grad_norm": 0.043746452778577805, + "learning_rate": 3.832388545882975e-06, + "loss": 0.002, + "step": 7404 + }, + { + "epoch": 0.7406, + "grad_norm": 0.003611180931329727, + "learning_rate": 3.826894703141552e-06, + "loss": 0.0002, + "step": 7406 + }, + { + "epoch": 0.7408, + "grad_norm": 0.012803005054593086, + "learning_rate": 3.821403869096658e-06, + "loss": 0.0003, + "step": 7408 + }, + { + "epoch": 0.741, + "grad_norm": 0.006167138461023569, + "learning_rate": 3.81591604642446e-06, + "loss": 0.0004, + "step": 7410 + }, + { + "epoch": 0.7412, + "grad_norm": 1.492699384689331, + "learning_rate": 3.810431237799657e-06, + "loss": 0.0158, + "step": 7412 + }, + { + "epoch": 0.7414, + "grad_norm": 0.004019461572170258, + "learning_rate": 3.804949445895473e-06, + "loss": 0.0002, + "step": 7414 + }, + { + "epoch": 0.7416, + "grad_norm": 0.289786159992218, + "learning_rate": 3.7994706733836738e-06, + "loss": 0.0125, + "step": 7416 + }, + { + "epoch": 0.7418, + "grad_norm": 2.793522596359253, + "learning_rate": 3.793994922934544e-06, + "loss": 0.0158, + "step": 7418 + }, + { + "epoch": 0.742, + "grad_norm": 0.002761397510766983, + "learning_rate": 3.7885221972168974e-06, + "loss": 0.0003, + "step": 7420 + }, + { + "epoch": 0.7422, + "grad_norm": 0.35132578015327454, + "learning_rate": 3.783052498898073e-06, + "loss": 0.0083, + "step": 7422 + }, + { + "epoch": 0.7424, + "grad_norm": 0.0012769472086802125, + "learning_rate": 3.7775858306439374e-06, + "loss": 0.0001, + "step": 7424 + }, + { + "epoch": 0.7426, + "grad_norm": 0.6474995017051697, + "learning_rate": 3.772122195118877e-06, + "loss": 0.0056, + "step": 7426 + }, + { + "epoch": 0.7428, + "grad_norm": 0.04311046749353409, + "learning_rate": 3.766661594985801e-06, + "loss": 0.0005, + "step": 7428 + }, + { + "epoch": 0.743, + "grad_norm": 0.0005660529132001102, + "learning_rate": 3.7612040329061405e-06, + "loss": 0.0069, + "step": 7430 + }, + { + "epoch": 0.7432, + "grad_norm": 0.008577583357691765, + "learning_rate": 3.7557495115398446e-06, + "loss": 0.0011, + "step": 7432 + }, + { + "epoch": 0.7434, + "grad_norm": 0.4056112766265869, + "learning_rate": 3.7502980335453777e-06, + "loss": 0.011, + "step": 7434 + }, + { + "epoch": 0.7436, + "grad_norm": 0.16764940321445465, + "learning_rate": 3.7448496015797296e-06, + "loss": 0.0037, + "step": 7436 + }, + { + "epoch": 0.7438, + "grad_norm": 0.004395970609039068, + "learning_rate": 3.7394042182983983e-06, + "loss": 0.0001, + "step": 7438 + }, + { + "epoch": 0.744, + "grad_norm": 0.10055270045995712, + "learning_rate": 3.7339618863553983e-06, + "loss": 0.0028, + "step": 7440 + }, + { + "epoch": 0.7442, + "grad_norm": 0.002073992509394884, + "learning_rate": 3.728522608403249e-06, + "loss": 0.0003, + "step": 7442 + }, + { + "epoch": 0.7444, + "grad_norm": 0.1359260231256485, + "learning_rate": 3.723086387092997e-06, + "loss": 0.003, + "step": 7444 + }, + { + "epoch": 0.7446, + "grad_norm": 0.015888461843132973, + "learning_rate": 3.7176532250741857e-06, + "loss": 0.0139, + "step": 7446 + }, + { + "epoch": 0.7448, + "grad_norm": 0.0012904377654194832, + "learning_rate": 3.7122231249948747e-06, + "loss": 0.0001, + "step": 7448 + }, + { + "epoch": 0.745, + "grad_norm": 0.17686551809310913, + "learning_rate": 3.7067960895016277e-06, + "loss": 0.0034, + "step": 7450 + }, + { + "epoch": 0.7452, + "grad_norm": 3.0579440593719482, + "learning_rate": 3.7013721212395128e-06, + "loss": 0.041, + "step": 7452 + }, + { + "epoch": 0.7454, + "grad_norm": 0.7421361207962036, + "learning_rate": 3.6959512228521123e-06, + "loss": 0.0085, + "step": 7454 + }, + { + "epoch": 0.7456, + "grad_norm": 0.00039506948087364435, + "learning_rate": 3.6905333969815038e-06, + "loss": 0.0003, + "step": 7456 + }, + { + "epoch": 0.7458, + "grad_norm": 1.4459651708602905, + "learning_rate": 3.685118646268272e-06, + "loss": 0.0488, + "step": 7458 + }, + { + "epoch": 0.746, + "grad_norm": 0.0030328980647027493, + "learning_rate": 3.679706973351491e-06, + "loss": 0.0001, + "step": 7460 + }, + { + "epoch": 0.7462, + "grad_norm": 0.009225788526237011, + "learning_rate": 3.674298380868756e-06, + "loss": 0.006, + "step": 7462 + }, + { + "epoch": 0.7464, + "grad_norm": 5.6268510818481445, + "learning_rate": 3.6688928714561444e-06, + "loss": 0.0958, + "step": 7464 + }, + { + "epoch": 0.7466, + "grad_norm": 0.01312610786408186, + "learning_rate": 3.663490447748236e-06, + "loss": 0.0005, + "step": 7466 + }, + { + "epoch": 0.7468, + "grad_norm": 0.004822693765163422, + "learning_rate": 3.658091112378106e-06, + "loss": 0.0002, + "step": 7468 + }, + { + "epoch": 0.747, + "grad_norm": 0.003706004936248064, + "learning_rate": 3.6526948679773256e-06, + "loss": 0.0001, + "step": 7470 + }, + { + "epoch": 0.7472, + "grad_norm": 0.001979646272957325, + "learning_rate": 3.6473017171759563e-06, + "loss": 0.0001, + "step": 7472 + }, + { + "epoch": 0.7474, + "grad_norm": 1.2524163722991943, + "learning_rate": 3.6419116626025585e-06, + "loss": 0.0443, + "step": 7474 + }, + { + "epoch": 0.7476, + "grad_norm": 0.011209771037101746, + "learning_rate": 3.636524706884181e-06, + "loss": 0.0002, + "step": 7476 + }, + { + "epoch": 0.7478, + "grad_norm": 0.3757669925689697, + "learning_rate": 3.6311408526463554e-06, + "loss": 0.0084, + "step": 7478 + }, + { + "epoch": 0.748, + "grad_norm": 0.007484117988497019, + "learning_rate": 3.625760102513103e-06, + "loss": 0.0001, + "step": 7480 + }, + { + "epoch": 0.7482, + "grad_norm": 0.006481144577264786, + "learning_rate": 3.620382459106946e-06, + "loss": 0.0003, + "step": 7482 + }, + { + "epoch": 0.7484, + "grad_norm": 0.031087132170796394, + "learning_rate": 3.615007925048878e-06, + "loss": 0.0009, + "step": 7484 + }, + { + "epoch": 0.7486, + "grad_norm": 0.0047991094179451466, + "learning_rate": 3.6096365029583803e-06, + "loss": 0.0124, + "step": 7486 + }, + { + "epoch": 0.7488, + "grad_norm": 0.059957969933748245, + "learning_rate": 3.604268195453421e-06, + "loss": 0.0007, + "step": 7488 + }, + { + "epoch": 0.749, + "grad_norm": 1.0074795484542847, + "learning_rate": 3.598903005150444e-06, + "loss": 0.0119, + "step": 7490 + }, + { + "epoch": 0.7492, + "grad_norm": 0.4780077338218689, + "learning_rate": 3.5935409346643835e-06, + "loss": 0.02, + "step": 7492 + }, + { + "epoch": 0.7494, + "grad_norm": 4.021981239318848, + "learning_rate": 3.5881819866086485e-06, + "loss": 0.0928, + "step": 7494 + }, + { + "epoch": 0.7496, + "grad_norm": 0.000982491415925324, + "learning_rate": 3.582826163595119e-06, + "loss": 0.0005, + "step": 7496 + }, + { + "epoch": 0.7498, + "grad_norm": 0.0036135693080723286, + "learning_rate": 3.5774734682341563e-06, + "loss": 0.0001, + "step": 7498 + }, + { + "epoch": 0.75, + "grad_norm": 0.03230786323547363, + "learning_rate": 3.5721239031346067e-06, + "loss": 0.0003, + "step": 7500 + }, + { + "epoch": 0.7502, + "grad_norm": 0.1618584841489792, + "learning_rate": 3.5667774709037804e-06, + "loss": 0.0034, + "step": 7502 + }, + { + "epoch": 0.7504, + "grad_norm": 0.000461128365714103, + "learning_rate": 3.5614341741474633e-06, + "loss": 0.0008, + "step": 7504 + }, + { + "epoch": 0.7506, + "grad_norm": 0.748110294342041, + "learning_rate": 3.5560940154699133e-06, + "loss": 0.0204, + "step": 7506 + }, + { + "epoch": 0.7508, + "grad_norm": 0.01519402489066124, + "learning_rate": 3.5507569974738575e-06, + "loss": 0.0007, + "step": 7508 + }, + { + "epoch": 0.751, + "grad_norm": 0.04003501683473587, + "learning_rate": 3.545423122760493e-06, + "loss": 0.0004, + "step": 7510 + }, + { + "epoch": 0.7512, + "grad_norm": 0.0013931014109402895, + "learning_rate": 3.540092393929494e-06, + "loss": 0.0001, + "step": 7512 + }, + { + "epoch": 0.7514, + "grad_norm": 0.000793785962741822, + "learning_rate": 3.5347648135789823e-06, + "loss": 0.0199, + "step": 7514 + }, + { + "epoch": 0.7516, + "grad_norm": 0.007698449306190014, + "learning_rate": 3.5294403843055604e-06, + "loss": 0.0003, + "step": 7516 + }, + { + "epoch": 0.7518, + "grad_norm": 0.0007050863932818174, + "learning_rate": 3.524119108704286e-06, + "loss": 0.0001, + "step": 7518 + }, + { + "epoch": 0.752, + "grad_norm": 0.02157473750412464, + "learning_rate": 3.5188009893686916e-06, + "loss": 0.0009, + "step": 7520 + }, + { + "epoch": 0.7522, + "grad_norm": 0.0026433274615556, + "learning_rate": 3.5134860288907602e-06, + "loss": 0.0036, + "step": 7522 + }, + { + "epoch": 0.7524, + "grad_norm": 0.001054043648764491, + "learning_rate": 3.50817422986094e-06, + "loss": 0.0008, + "step": 7524 + }, + { + "epoch": 0.7526, + "grad_norm": 0.3365785777568817, + "learning_rate": 3.502865594868136e-06, + "loss": 0.0029, + "step": 7526 + }, + { + "epoch": 0.7528, + "grad_norm": 0.004468355793505907, + "learning_rate": 3.4975601264997094e-06, + "loss": 0.0034, + "step": 7528 + }, + { + "epoch": 0.753, + "grad_norm": 0.006128070876002312, + "learning_rate": 3.492257827341492e-06, + "loss": 0.0005, + "step": 7530 + }, + { + "epoch": 0.7532, + "grad_norm": 0.0004481377836782485, + "learning_rate": 3.4869586999777492e-06, + "loss": 0.0003, + "step": 7532 + }, + { + "epoch": 0.7534, + "grad_norm": 0.0034616102930158377, + "learning_rate": 3.4816627469912147e-06, + "loss": 0.0004, + "step": 7534 + }, + { + "epoch": 0.7536, + "grad_norm": 0.001436813035979867, + "learning_rate": 3.476369970963072e-06, + "loss": 0.0001, + "step": 7536 + }, + { + "epoch": 0.7538, + "grad_norm": 0.007658860180526972, + "learning_rate": 3.4710803744729517e-06, + "loss": 0.0003, + "step": 7538 + }, + { + "epoch": 0.754, + "grad_norm": 0.0016072827856987715, + "learning_rate": 3.4657939600989453e-06, + "loss": 0.0002, + "step": 7540 + }, + { + "epoch": 0.7542, + "grad_norm": 0.0038294177502393723, + "learning_rate": 3.4605107304175855e-06, + "loss": 0.0002, + "step": 7542 + }, + { + "epoch": 0.7544, + "grad_norm": 0.008113127201795578, + "learning_rate": 3.455230688003852e-06, + "loss": 0.0002, + "step": 7544 + }, + { + "epoch": 0.7546, + "grad_norm": 0.010739192366600037, + "learning_rate": 3.4499538354311757e-06, + "loss": 0.0002, + "step": 7546 + }, + { + "epoch": 0.7548, + "grad_norm": 0.03985929861664772, + "learning_rate": 3.4446801752714287e-06, + "loss": 0.0004, + "step": 7548 + }, + { + "epoch": 0.755, + "grad_norm": 0.018192561343312263, + "learning_rate": 3.4394097100949286e-06, + "loss": 0.0003, + "step": 7550 + }, + { + "epoch": 0.7552, + "grad_norm": 0.0005859850789420307, + "learning_rate": 3.4341424424704373e-06, + "loss": 0.0001, + "step": 7552 + }, + { + "epoch": 0.7554, + "grad_norm": 0.003998196683824062, + "learning_rate": 3.4288783749651568e-06, + "loss": 0.0001, + "step": 7554 + }, + { + "epoch": 0.7556, + "grad_norm": 0.2041451334953308, + "learning_rate": 3.4236175101447265e-06, + "loss": 0.0033, + "step": 7556 + }, + { + "epoch": 0.7558, + "grad_norm": 0.4047185778617859, + "learning_rate": 3.418359850573234e-06, + "loss": 0.006, + "step": 7558 + }, + { + "epoch": 0.756, + "grad_norm": 0.0007364086923189461, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.0001, + "step": 7560 + }, + { + "epoch": 0.7562, + "grad_norm": 0.0031552226282656193, + "learning_rate": 3.4078541574255664e-06, + "loss": 0.0001, + "step": 7562 + }, + { + "epoch": 0.7564, + "grad_norm": 0.011115320026874542, + "learning_rate": 3.4026061289697397e-06, + "loss": 0.0004, + "step": 7564 + }, + { + "epoch": 0.7566, + "grad_norm": 0.00717696500942111, + "learning_rate": 3.397361316003539e-06, + "loss": 0.0001, + "step": 7566 + }, + { + "epoch": 0.7568, + "grad_norm": 0.007398365996778011, + "learning_rate": 3.3921197210832235e-06, + "loss": 0.0001, + "step": 7568 + }, + { + "epoch": 0.757, + "grad_norm": 0.00034905580105260015, + "learning_rate": 3.3868813467634833e-06, + "loss": 0.0, + "step": 7570 + }, + { + "epoch": 0.7572, + "grad_norm": 0.002766211051493883, + "learning_rate": 3.381646195597437e-06, + "loss": 0.0001, + "step": 7572 + }, + { + "epoch": 0.7574, + "grad_norm": 0.0030209871474653482, + "learning_rate": 3.376414270136633e-06, + "loss": 0.0001, + "step": 7574 + }, + { + "epoch": 0.7576, + "grad_norm": 0.005247184541076422, + "learning_rate": 3.3711855729310482e-06, + "loss": 0.0001, + "step": 7576 + }, + { + "epoch": 0.7578, + "grad_norm": 0.006143445614725351, + "learning_rate": 3.3659601065290893e-06, + "loss": 0.0005, + "step": 7578 + }, + { + "epoch": 0.758, + "grad_norm": 0.008762378245592117, + "learning_rate": 3.360737873477584e-06, + "loss": 0.0002, + "step": 7580 + }, + { + "epoch": 0.7582, + "grad_norm": 0.07832848280668259, + "learning_rate": 3.355518876321787e-06, + "loss": 0.0008, + "step": 7582 + }, + { + "epoch": 0.7584, + "grad_norm": 0.004128525033593178, + "learning_rate": 3.3503031176053657e-06, + "loss": 0.0005, + "step": 7584 + }, + { + "epoch": 0.7586, + "grad_norm": 0.003643403295427561, + "learning_rate": 3.3450905998704274e-06, + "loss": 0.0001, + "step": 7586 + }, + { + "epoch": 0.7588, + "grad_norm": 0.011518268845975399, + "learning_rate": 3.3398813256574847e-06, + "loss": 0.0058, + "step": 7588 + }, + { + "epoch": 0.759, + "grad_norm": 0.0012711421586573124, + "learning_rate": 3.3346752975054763e-06, + "loss": 0.0, + "step": 7590 + }, + { + "epoch": 0.7592, + "grad_norm": 0.013934291899204254, + "learning_rate": 3.3294725179517573e-06, + "loss": 0.0003, + "step": 7592 + }, + { + "epoch": 0.7594, + "grad_norm": 0.008794787339866161, + "learning_rate": 3.3242729895320945e-06, + "loss": 0.0002, + "step": 7594 + }, + { + "epoch": 0.7596, + "grad_norm": 0.003758070059120655, + "learning_rate": 3.3190767147806825e-06, + "loss": 0.0001, + "step": 7596 + }, + { + "epoch": 0.7598, + "grad_norm": 1.3592554330825806, + "learning_rate": 3.3138836962301192e-06, + "loss": 0.0641, + "step": 7598 + }, + { + "epoch": 0.76, + "grad_norm": 0.0005482393899001181, + "learning_rate": 3.308693936411421e-06, + "loss": 0.0004, + "step": 7600 + }, + { + "epoch": 0.7602, + "grad_norm": 0.002925200155004859, + "learning_rate": 3.3035074378540087e-06, + "loss": 0.0013, + "step": 7602 + }, + { + "epoch": 0.7604, + "grad_norm": 0.006318102590739727, + "learning_rate": 3.2983242030857177e-06, + "loss": 0.0002, + "step": 7604 + }, + { + "epoch": 0.7606, + "grad_norm": 0.05646161362528801, + "learning_rate": 3.2931442346328e-06, + "loss": 0.0031, + "step": 7606 + }, + { + "epoch": 0.7608, + "grad_norm": 0.002847203053534031, + "learning_rate": 3.287967535019908e-06, + "loss": 0.0028, + "step": 7608 + }, + { + "epoch": 0.761, + "grad_norm": 0.004052122589200735, + "learning_rate": 3.2827941067700996e-06, + "loss": 0.0164, + "step": 7610 + }, + { + "epoch": 0.7612, + "grad_norm": 0.03972117602825165, + "learning_rate": 3.2776239524048426e-06, + "loss": 0.0003, + "step": 7612 + }, + { + "epoch": 0.7614, + "grad_norm": 0.23112408816814423, + "learning_rate": 3.272457074444003e-06, + "loss": 0.002, + "step": 7614 + }, + { + "epoch": 0.7616, + "grad_norm": 0.009027528576552868, + "learning_rate": 3.2672934754058615e-06, + "loss": 0.0001, + "step": 7616 + }, + { + "epoch": 0.7618, + "grad_norm": 0.0005579781718552113, + "learning_rate": 3.2621331578070936e-06, + "loss": 0.0001, + "step": 7618 + }, + { + "epoch": 0.762, + "grad_norm": 1.6442835330963135, + "learning_rate": 3.2569761241627694e-06, + "loss": 0.022, + "step": 7620 + }, + { + "epoch": 0.7622, + "grad_norm": 0.00789779331535101, + "learning_rate": 3.2518223769863633e-06, + "loss": 0.0001, + "step": 7622 + }, + { + "epoch": 0.7624, + "grad_norm": 0.20502407848834991, + "learning_rate": 3.2466719187897555e-06, + "loss": 0.0013, + "step": 7624 + }, + { + "epoch": 0.7626, + "grad_norm": 0.007620343007147312, + "learning_rate": 3.241524752083215e-06, + "loss": 0.0007, + "step": 7626 + }, + { + "epoch": 0.7628, + "grad_norm": 0.0024080059956759214, + "learning_rate": 3.2363808793754082e-06, + "loss": 0.0026, + "step": 7628 + }, + { + "epoch": 0.763, + "grad_norm": 0.14045065641403198, + "learning_rate": 3.2312403031733943e-06, + "loss": 0.0012, + "step": 7630 + }, + { + "epoch": 0.7632, + "grad_norm": 0.3473765254020691, + "learning_rate": 3.2261030259826287e-06, + "loss": 0.0031, + "step": 7632 + }, + { + "epoch": 0.7634, + "grad_norm": 0.003005797043442726, + "learning_rate": 3.2209690503069545e-06, + "loss": 0.0057, + "step": 7634 + }, + { + "epoch": 0.7636, + "grad_norm": 0.0007699091802351177, + "learning_rate": 3.2158383786486204e-06, + "loss": 0.0001, + "step": 7636 + }, + { + "epoch": 0.7638, + "grad_norm": 0.002498387126252055, + "learning_rate": 3.210711013508242e-06, + "loss": 0.0, + "step": 7638 + }, + { + "epoch": 0.764, + "grad_norm": 0.002772006904706359, + "learning_rate": 3.2055869573848374e-06, + "loss": 0.0, + "step": 7640 + }, + { + "epoch": 0.7642, + "grad_norm": 0.06965163350105286, + "learning_rate": 3.200466212775808e-06, + "loss": 0.0007, + "step": 7642 + }, + { + "epoch": 0.7644, + "grad_norm": 0.0010534104658290744, + "learning_rate": 3.195348782176948e-06, + "loss": 0.0, + "step": 7644 + }, + { + "epoch": 0.7646, + "grad_norm": 0.1042354628443718, + "learning_rate": 3.190234668082427e-06, + "loss": 0.0246, + "step": 7646 + }, + { + "epoch": 0.7648, + "grad_norm": 0.6670034527778625, + "learning_rate": 3.1851238729848033e-06, + "loss": 0.0104, + "step": 7648 + }, + { + "epoch": 0.765, + "grad_norm": 0.000885169836692512, + "learning_rate": 3.1800163993750166e-06, + "loss": 0.0015, + "step": 7650 + }, + { + "epoch": 0.7652, + "grad_norm": 0.00013544142711907625, + "learning_rate": 3.174912249742382e-06, + "loss": 0.0001, + "step": 7652 + }, + { + "epoch": 0.7654, + "grad_norm": 0.0015116040594875813, + "learning_rate": 3.1698114265746126e-06, + "loss": 0.0, + "step": 7654 + }, + { + "epoch": 0.7656, + "grad_norm": 0.0014152118237689137, + "learning_rate": 3.164713932357776e-06, + "loss": 0.0001, + "step": 7656 + }, + { + "epoch": 0.7658, + "grad_norm": 0.0004955068579874933, + "learning_rate": 3.159619769576333e-06, + "loss": 0.0, + "step": 7658 + }, + { + "epoch": 0.766, + "grad_norm": 0.0016203036066144705, + "learning_rate": 3.1545289407131128e-06, + "loss": 0.0875, + "step": 7660 + }, + { + "epoch": 0.7662, + "grad_norm": 0.010785364545881748, + "learning_rate": 3.149441448249331e-06, + "loss": 0.0002, + "step": 7662 + }, + { + "epoch": 0.7664, + "grad_norm": 0.31781283020973206, + "learning_rate": 3.144357294664565e-06, + "loss": 0.0066, + "step": 7664 + }, + { + "epoch": 0.7666, + "grad_norm": 0.01870672218501568, + "learning_rate": 3.1392764824367706e-06, + "loss": 0.0344, + "step": 7666 + }, + { + "epoch": 0.7668, + "grad_norm": 0.8152282238006592, + "learning_rate": 3.134199014042274e-06, + "loss": 0.0087, + "step": 7668 + }, + { + "epoch": 0.767, + "grad_norm": 0.16172225773334503, + "learning_rate": 3.1291248919557717e-06, + "loss": 0.0028, + "step": 7670 + }, + { + "epoch": 0.7672, + "grad_norm": 3.1708061695098877, + "learning_rate": 3.124054118650327e-06, + "loss": 0.0387, + "step": 7672 + }, + { + "epoch": 0.7674, + "grad_norm": 0.0026065055280923843, + "learning_rate": 3.118986696597377e-06, + "loss": 0.0, + "step": 7674 + }, + { + "epoch": 0.7676, + "grad_norm": 0.0203844103962183, + "learning_rate": 3.113922628266718e-06, + "loss": 0.0004, + "step": 7676 + }, + { + "epoch": 0.7678, + "grad_norm": 0.000649117399007082, + "learning_rate": 3.108861916126518e-06, + "loss": 0.0, + "step": 7678 + }, + { + "epoch": 0.768, + "grad_norm": 0.06669940054416656, + "learning_rate": 3.103804562643302e-06, + "loss": 0.0005, + "step": 7680 + }, + { + "epoch": 0.7682, + "grad_norm": 0.00037250667810440063, + "learning_rate": 3.0987505702819687e-06, + "loss": 0.0025, + "step": 7682 + }, + { + "epoch": 0.7684, + "grad_norm": 0.02775619737803936, + "learning_rate": 3.0936999415057712e-06, + "loss": 0.0004, + "step": 7684 + }, + { + "epoch": 0.7686, + "grad_norm": 1.3705217838287354, + "learning_rate": 3.0886526787763237e-06, + "loss": 0.0086, + "step": 7686 + }, + { + "epoch": 0.7688, + "grad_norm": 0.003046102588996291, + "learning_rate": 3.0836087845536e-06, + "loss": 0.0, + "step": 7688 + }, + { + "epoch": 0.769, + "grad_norm": 0.004934491589665413, + "learning_rate": 3.0785682612959334e-06, + "loss": 0.0001, + "step": 7690 + }, + { + "epoch": 0.7692, + "grad_norm": 0.00021801037655677646, + "learning_rate": 3.073531111460013e-06, + "loss": 0.0002, + "step": 7692 + }, + { + "epoch": 0.7694, + "grad_norm": 0.0015593714779242873, + "learning_rate": 3.0684973375008865e-06, + "loss": 0.0, + "step": 7694 + }, + { + "epoch": 0.7696, + "grad_norm": 0.001237650983966887, + "learning_rate": 3.063466941871952e-06, + "loss": 0.3292, + "step": 7696 + }, + { + "epoch": 0.7698, + "grad_norm": 0.0012703153770416975, + "learning_rate": 3.058439927024962e-06, + "loss": 0.0001, + "step": 7698 + }, + { + "epoch": 0.77, + "grad_norm": 0.008489732630550861, + "learning_rate": 3.0534162954100264e-06, + "loss": 0.1228, + "step": 7700 + }, + { + "epoch": 0.7702, + "grad_norm": 1.738041877746582, + "learning_rate": 3.0483960494756017e-06, + "loss": 0.1417, + "step": 7702 + }, + { + "epoch": 0.7704, + "grad_norm": 0.30618152022361755, + "learning_rate": 3.043379191668492e-06, + "loss": 0.0029, + "step": 7704 + }, + { + "epoch": 0.7706, + "grad_norm": 0.047931063920259476, + "learning_rate": 3.038365724433858e-06, + "loss": 0.0005, + "step": 7706 + }, + { + "epoch": 0.7708, + "grad_norm": 0.15422756969928741, + "learning_rate": 3.033355650215193e-06, + "loss": 0.003, + "step": 7708 + }, + { + "epoch": 0.771, + "grad_norm": 0.3633419871330261, + "learning_rate": 3.028348971454356e-06, + "loss": 0.0078, + "step": 7710 + }, + { + "epoch": 0.7712, + "grad_norm": 0.0008197272545658052, + "learning_rate": 3.023345690591537e-06, + "loss": 0.0102, + "step": 7712 + }, + { + "epoch": 0.7714, + "grad_norm": 0.007479256484657526, + "learning_rate": 3.0183458100652752e-06, + "loss": 0.0002, + "step": 7714 + }, + { + "epoch": 0.7716, + "grad_norm": 2.4325296878814697, + "learning_rate": 3.013349332312451e-06, + "loss": 0.0456, + "step": 7716 + }, + { + "epoch": 0.7718, + "grad_norm": 0.002042904030531645, + "learning_rate": 3.008356259768285e-06, + "loss": 0.0, + "step": 7718 + }, + { + "epoch": 0.772, + "grad_norm": 9.824039459228516, + "learning_rate": 3.003366594866345e-06, + "loss": 0.1098, + "step": 7720 + }, + { + "epoch": 0.7722, + "grad_norm": 0.0006462453166022897, + "learning_rate": 2.9983803400385313e-06, + "loss": 0.0, + "step": 7722 + }, + { + "epoch": 0.7724, + "grad_norm": 0.03780452162027359, + "learning_rate": 2.993397497715086e-06, + "loss": 0.0004, + "step": 7724 + }, + { + "epoch": 0.7726, + "grad_norm": 0.0008563249721191823, + "learning_rate": 2.988418070324577e-06, + "loss": 0.0002, + "step": 7726 + }, + { + "epoch": 0.7728, + "grad_norm": 0.016837237402796745, + "learning_rate": 2.983442060293926e-06, + "loss": 0.0005, + "step": 7728 + }, + { + "epoch": 0.773, + "grad_norm": 2.4375810623168945, + "learning_rate": 2.978469470048376e-06, + "loss": 0.0754, + "step": 7730 + }, + { + "epoch": 0.7732, + "grad_norm": 0.002704768441617489, + "learning_rate": 2.9735003020115095e-06, + "loss": 0.0002, + "step": 7732 + }, + { + "epoch": 0.7734, + "grad_norm": 0.002358660800382495, + "learning_rate": 2.968534558605236e-06, + "loss": 0.0001, + "step": 7734 + }, + { + "epoch": 0.7736, + "grad_norm": 0.0034074652940034866, + "learning_rate": 2.963572242249799e-06, + "loss": 0.0001, + "step": 7736 + }, + { + "epoch": 0.7738, + "grad_norm": 0.2664194405078888, + "learning_rate": 2.9586133553637687e-06, + "loss": 0.0017, + "step": 7738 + }, + { + "epoch": 0.774, + "grad_norm": 6.8220319747924805, + "learning_rate": 2.953657900364053e-06, + "loss": 0.0838, + "step": 7740 + }, + { + "epoch": 0.7742, + "grad_norm": 0.0010056981118395925, + "learning_rate": 2.9487058796658785e-06, + "loss": 0.0103, + "step": 7742 + }, + { + "epoch": 0.7744, + "grad_norm": 0.0013668205356225371, + "learning_rate": 2.9437572956827965e-06, + "loss": 0.0002, + "step": 7744 + }, + { + "epoch": 0.7746, + "grad_norm": 0.18481360375881195, + "learning_rate": 2.938812150826684e-06, + "loss": 0.0017, + "step": 7746 + }, + { + "epoch": 0.7748, + "grad_norm": 1.3070766925811768, + "learning_rate": 2.9338704475077527e-06, + "loss": 0.007, + "step": 7748 + }, + { + "epoch": 0.775, + "grad_norm": 0.00028708463651128113, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.0001, + "step": 7750 + }, + { + "epoch": 0.7752, + "grad_norm": 0.019740071147680283, + "learning_rate": 2.9239973751138495e-06, + "loss": 0.0014, + "step": 7752 + }, + { + "epoch": 0.7754, + "grad_norm": 0.009999471716582775, + "learning_rate": 2.919066010850892e-06, + "loss": 0.0004, + "step": 7754 + }, + { + "epoch": 0.7756, + "grad_norm": 0.0034600805956870317, + "learning_rate": 2.9141380977491373e-06, + "loss": 0.0008, + "step": 7756 + }, + { + "epoch": 0.7758, + "grad_norm": 0.05117269977927208, + "learning_rate": 2.9092136382103976e-06, + "loss": 0.0043, + "step": 7758 + }, + { + "epoch": 0.776, + "grad_norm": 0.10187371075153351, + "learning_rate": 2.9042926346347932e-06, + "loss": 0.0012, + "step": 7760 + }, + { + "epoch": 0.7762, + "grad_norm": 0.003517097095027566, + "learning_rate": 2.8993750894207563e-06, + "loss": 0.0001, + "step": 7762 + }, + { + "epoch": 0.7764, + "grad_norm": 0.6706183552742004, + "learning_rate": 2.8944610049650377e-06, + "loss": 0.0042, + "step": 7764 + }, + { + "epoch": 0.7766, + "grad_norm": 0.0029750410467386246, + "learning_rate": 2.8895503836627105e-06, + "loss": 0.0064, + "step": 7766 + }, + { + "epoch": 0.7768, + "grad_norm": 0.046042077243328094, + "learning_rate": 2.884643227907147e-06, + "loss": 0.0004, + "step": 7768 + }, + { + "epoch": 0.777, + "grad_norm": 0.07346156239509583, + "learning_rate": 2.8797395400900362e-06, + "loss": 0.0008, + "step": 7770 + }, + { + "epoch": 0.7772, + "grad_norm": 0.0005752315046265721, + "learning_rate": 2.874839322601375e-06, + "loss": 0.0, + "step": 7772 + }, + { + "epoch": 0.7774, + "grad_norm": 0.05003635957837105, + "learning_rate": 2.869942577829471e-06, + "loss": 0.0005, + "step": 7774 + }, + { + "epoch": 0.7776, + "grad_norm": 0.03463726490736008, + "learning_rate": 2.8650493081609344e-06, + "loss": 0.0006, + "step": 7776 + }, + { + "epoch": 0.7778, + "grad_norm": 0.00023404900275636464, + "learning_rate": 2.860159515980695e-06, + "loss": 0.0, + "step": 7778 + }, + { + "epoch": 0.778, + "grad_norm": 0.04685758054256439, + "learning_rate": 2.855273203671969e-06, + "loss": 0.0011, + "step": 7780 + }, + { + "epoch": 0.7782, + "grad_norm": 0.005638661794364452, + "learning_rate": 2.8503903736162876e-06, + "loss": 0.0058, + "step": 7782 + }, + { + "epoch": 0.7784, + "grad_norm": 0.008483649231493473, + "learning_rate": 2.8455110281934804e-06, + "loss": 0.0002, + "step": 7784 + }, + { + "epoch": 0.7786, + "grad_norm": 0.0005016900249756873, + "learning_rate": 2.840635169781688e-06, + "loss": 0.0002, + "step": 7786 + }, + { + "epoch": 0.7788, + "grad_norm": 0.047565311193466187, + "learning_rate": 2.8357628007573412e-06, + "loss": 0.0004, + "step": 7788 + }, + { + "epoch": 0.779, + "grad_norm": 0.005029338877648115, + "learning_rate": 2.830893923495173e-06, + "loss": 0.0245, + "step": 7790 + }, + { + "epoch": 0.7792, + "grad_norm": 0.003328521503135562, + "learning_rate": 2.8260285403682153e-06, + "loss": 0.0008, + "step": 7792 + }, + { + "epoch": 0.7794, + "grad_norm": 0.050350505858659744, + "learning_rate": 2.821166653747793e-06, + "loss": 0.0005, + "step": 7794 + }, + { + "epoch": 0.7796, + "grad_norm": 0.0030710643623024225, + "learning_rate": 2.816308266003541e-06, + "loss": 0.0, + "step": 7796 + }, + { + "epoch": 0.7798, + "grad_norm": 0.0013446809025481343, + "learning_rate": 2.8114533795033685e-06, + "loss": 0.0, + "step": 7798 + }, + { + "epoch": 0.78, + "grad_norm": 0.001351590035483241, + "learning_rate": 2.8066019966134907e-06, + "loss": 0.0, + "step": 7800 + }, + { + "epoch": 0.7802, + "grad_norm": 0.0031251541804522276, + "learning_rate": 2.8017541196984144e-06, + "loss": 0.0001, + "step": 7802 + }, + { + "epoch": 0.7804, + "grad_norm": 0.0012660082429647446, + "learning_rate": 2.796909751120931e-06, + "loss": 0.0033, + "step": 7804 + }, + { + "epoch": 0.7806, + "grad_norm": 0.006595611106604338, + "learning_rate": 2.7920688932421337e-06, + "loss": 0.0004, + "step": 7806 + }, + { + "epoch": 0.7808, + "grad_norm": 0.0006254503969103098, + "learning_rate": 2.7872315484213954e-06, + "loss": 0.0028, + "step": 7808 + }, + { + "epoch": 0.781, + "grad_norm": 0.0022046093363314867, + "learning_rate": 2.7823977190163788e-06, + "loss": 0.0245, + "step": 7810 + }, + { + "epoch": 0.7812, + "grad_norm": 0.0011537553509697318, + "learning_rate": 2.7775674073830337e-06, + "loss": 0.0001, + "step": 7812 + }, + { + "epoch": 0.7814, + "grad_norm": 0.0007398512098006904, + "learning_rate": 2.7727406158755943e-06, + "loss": 0.0, + "step": 7814 + }, + { + "epoch": 0.7816, + "grad_norm": 0.43852072954177856, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.0021, + "step": 7816 + }, + { + "epoch": 0.7818, + "grad_norm": 0.2323475033044815, + "learning_rate": 2.763097602646797e-06, + "loss": 0.005, + "step": 7818 + }, + { + "epoch": 0.782, + "grad_norm": 0.004691408481448889, + "learning_rate": 2.7582813856253276e-06, + "loss": 0.0001, + "step": 7820 + }, + { + "epoch": 0.7822, + "grad_norm": 0.0018788755405694246, + "learning_rate": 2.7534686981295335e-06, + "loss": 0.0011, + "step": 7822 + }, + { + "epoch": 0.7824, + "grad_norm": 6.781033515930176, + "learning_rate": 2.7486595425050667e-06, + "loss": 0.0523, + "step": 7824 + }, + { + "epoch": 0.7826, + "grad_norm": 0.0023282228503376245, + "learning_rate": 2.7438539210958483e-06, + "loss": 0.0001, + "step": 7826 + }, + { + "epoch": 0.7828, + "grad_norm": 0.1555267870426178, + "learning_rate": 2.739051836244081e-06, + "loss": 0.0031, + "step": 7828 + }, + { + "epoch": 0.783, + "grad_norm": 0.01900472864508629, + "learning_rate": 2.7342532902902418e-06, + "loss": 0.0004, + "step": 7830 + }, + { + "epoch": 0.7832, + "grad_norm": 0.016838891431689262, + "learning_rate": 2.7294582855730835e-06, + "loss": 0.0003, + "step": 7832 + }, + { + "epoch": 0.7834, + "grad_norm": 0.0019058225443586707, + "learning_rate": 2.7246668244296328e-06, + "loss": 0.0004, + "step": 7834 + }, + { + "epoch": 0.7836, + "grad_norm": 0.02736586518585682, + "learning_rate": 2.7198789091951903e-06, + "loss": 0.0002, + "step": 7836 + }, + { + "epoch": 0.7838, + "grad_norm": 0.009542311541736126, + "learning_rate": 2.715094542203327e-06, + "loss": 0.0357, + "step": 7838 + }, + { + "epoch": 0.784, + "grad_norm": 0.038740094751119614, + "learning_rate": 2.7103137257858867e-06, + "loss": 0.0005, + "step": 7840 + }, + { + "epoch": 0.7842, + "grad_norm": 0.0034680261742323637, + "learning_rate": 2.7055364622729772e-06, + "loss": 0.0001, + "step": 7842 + }, + { + "epoch": 0.7844, + "grad_norm": 3.2327969074249268, + "learning_rate": 2.7007627539929847e-06, + "loss": 0.0252, + "step": 7844 + }, + { + "epoch": 0.7846, + "grad_norm": 0.001702935784123838, + "learning_rate": 2.6959926032725537e-06, + "loss": 0.0007, + "step": 7846 + }, + { + "epoch": 0.7848, + "grad_norm": 0.3049376904964447, + "learning_rate": 2.6912260124366007e-06, + "loss": 0.0029, + "step": 7848 + }, + { + "epoch": 0.785, + "grad_norm": 0.00988093577325344, + "learning_rate": 2.6864629838082957e-06, + "loss": 0.0002, + "step": 7850 + }, + { + "epoch": 0.7852, + "grad_norm": 0.11300921440124512, + "learning_rate": 2.6817035197090892e-06, + "loss": 0.0019, + "step": 7852 + }, + { + "epoch": 0.7854, + "grad_norm": 0.00037352132494561374, + "learning_rate": 2.676947622458683e-06, + "loss": 0.0, + "step": 7854 + }, + { + "epoch": 0.7856, + "grad_norm": 0.6808595061302185, + "learning_rate": 2.672195294375045e-06, + "loss": 0.0069, + "step": 7856 + }, + { + "epoch": 0.7858, + "grad_norm": 0.0016300060087814927, + "learning_rate": 2.667446537774402e-06, + "loss": 0.0001, + "step": 7858 + }, + { + "epoch": 0.786, + "grad_norm": 0.0028399769216775894, + "learning_rate": 2.6627013549712355e-06, + "loss": 0.0001, + "step": 7860 + }, + { + "epoch": 0.7862, + "grad_norm": 0.001870567793957889, + "learning_rate": 2.6579597482782972e-06, + "loss": 0.0, + "step": 7862 + }, + { + "epoch": 0.7864, + "grad_norm": 0.1312119960784912, + "learning_rate": 2.6532217200065856e-06, + "loss": 0.0016, + "step": 7864 + }, + { + "epoch": 0.7866, + "grad_norm": 0.16325609385967255, + "learning_rate": 2.648487272465361e-06, + "loss": 0.0051, + "step": 7866 + }, + { + "epoch": 0.7868, + "grad_norm": 0.006855320185422897, + "learning_rate": 2.643756407962127e-06, + "loss": 0.0001, + "step": 7868 + }, + { + "epoch": 0.787, + "grad_norm": 0.0009626214159652591, + "learning_rate": 2.639029128802657e-06, + "loss": 0.0001, + "step": 7870 + }, + { + "epoch": 0.7872, + "grad_norm": 0.00013821799075230956, + "learning_rate": 2.634305437290968e-06, + "loss": 0.0006, + "step": 7872 + }, + { + "epoch": 0.7874, + "grad_norm": 0.0002959974226541817, + "learning_rate": 2.62958533572933e-06, + "loss": 0.0001, + "step": 7874 + }, + { + "epoch": 0.7876, + "grad_norm": 0.0014763366198167205, + "learning_rate": 2.624868826418262e-06, + "loss": 0.0, + "step": 7876 + }, + { + "epoch": 0.7878, + "grad_norm": 0.001856250804848969, + "learning_rate": 2.6201559116565346e-06, + "loss": 0.0001, + "step": 7878 + }, + { + "epoch": 0.788, + "grad_norm": 0.0010268893092870712, + "learning_rate": 2.615446593741161e-06, + "loss": 0.0001, + "step": 7880 + }, + { + "epoch": 0.7882, + "grad_norm": 0.001384546747431159, + "learning_rate": 2.6107408749674125e-06, + "loss": 0.0004, + "step": 7882 + }, + { + "epoch": 0.7884, + "grad_norm": 0.0035544722341001034, + "learning_rate": 2.6060387576287983e-06, + "loss": 0.0001, + "step": 7884 + }, + { + "epoch": 0.7886, + "grad_norm": 0.002196086337789893, + "learning_rate": 2.6013402440170676e-06, + "loss": 0.0001, + "step": 7886 + }, + { + "epoch": 0.7888, + "grad_norm": 0.0005125862662680447, + "learning_rate": 2.596645336422219e-06, + "loss": 0.0003, + "step": 7888 + }, + { + "epoch": 0.789, + "grad_norm": 0.002530121710151434, + "learning_rate": 2.5919540371325005e-06, + "loss": 0.0354, + "step": 7890 + }, + { + "epoch": 0.7892, + "grad_norm": 0.0025934120640158653, + "learning_rate": 2.5872663484343887e-06, + "loss": 0.0001, + "step": 7892 + }, + { + "epoch": 0.7894, + "grad_norm": 0.014923475682735443, + "learning_rate": 2.5825822726126095e-06, + "loss": 0.0001, + "step": 7894 + }, + { + "epoch": 0.7896, + "grad_norm": 0.0018050548387691379, + "learning_rate": 2.577901811950121e-06, + "loss": 0.0003, + "step": 7896 + }, + { + "epoch": 0.7898, + "grad_norm": 0.25487709045410156, + "learning_rate": 2.5732249687281228e-06, + "loss": 0.0027, + "step": 7898 + }, + { + "epoch": 0.79, + "grad_norm": 0.00028770140488632023, + "learning_rate": 2.5685517452260566e-06, + "loss": 0.0023, + "step": 7900 + }, + { + "epoch": 0.7902, + "grad_norm": 0.003668874269351363, + "learning_rate": 2.5638821437215944e-06, + "loss": 0.0001, + "step": 7902 + }, + { + "epoch": 0.7904, + "grad_norm": 0.001175088807940483, + "learning_rate": 2.5592161664906366e-06, + "loss": 0.0207, + "step": 7904 + }, + { + "epoch": 0.7906, + "grad_norm": 0.653542160987854, + "learning_rate": 2.5545538158073278e-06, + "loss": 0.0145, + "step": 7906 + }, + { + "epoch": 0.7908, + "grad_norm": 0.008460666984319687, + "learning_rate": 2.549895093944039e-06, + "loss": 0.0006, + "step": 7908 + }, + { + "epoch": 0.791, + "grad_norm": 0.49743592739105225, + "learning_rate": 2.5452400031713786e-06, + "loss": 0.0076, + "step": 7910 + }, + { + "epoch": 0.7912, + "grad_norm": 0.0093718022108078, + "learning_rate": 2.5405885457581793e-06, + "loss": 0.0002, + "step": 7912 + }, + { + "epoch": 0.7914, + "grad_norm": 0.00391780212521553, + "learning_rate": 2.535940723971505e-06, + "loss": 0.0011, + "step": 7914 + }, + { + "epoch": 0.7916, + "grad_norm": 0.045487429946660995, + "learning_rate": 2.5312965400766475e-06, + "loss": 0.0003, + "step": 7916 + }, + { + "epoch": 0.7918, + "grad_norm": 0.03980090096592903, + "learning_rate": 2.5266559963371216e-06, + "loss": 0.0108, + "step": 7918 + }, + { + "epoch": 0.792, + "grad_norm": 2.2553303241729736, + "learning_rate": 2.522019095014683e-06, + "loss": 0.0703, + "step": 7920 + }, + { + "epoch": 0.7922, + "grad_norm": 0.0023981567937880754, + "learning_rate": 2.5173858383692906e-06, + "loss": 0.0002, + "step": 7922 + }, + { + "epoch": 0.7924, + "grad_norm": 0.001124276197515428, + "learning_rate": 2.512756228659141e-06, + "loss": 0.0001, + "step": 7924 + }, + { + "epoch": 0.7926, + "grad_norm": 0.0039121550507843494, + "learning_rate": 2.5081302681406463e-06, + "loss": 0.0001, + "step": 7926 + }, + { + "epoch": 0.7928, + "grad_norm": 0.005757820792496204, + "learning_rate": 2.5035079590684496e-06, + "loss": 0.0001, + "step": 7928 + }, + { + "epoch": 0.793, + "grad_norm": 0.0009933533146977425, + "learning_rate": 2.4988893036954045e-06, + "loss": 0.0648, + "step": 7930 + }, + { + "epoch": 0.7932, + "grad_norm": 0.012599149718880653, + "learning_rate": 2.494274304272589e-06, + "loss": 0.0003, + "step": 7932 + }, + { + "epoch": 0.7934, + "grad_norm": 0.002703082514926791, + "learning_rate": 2.4896629630492974e-06, + "loss": 0.0002, + "step": 7934 + }, + { + "epoch": 0.7936, + "grad_norm": 0.00690749054774642, + "learning_rate": 2.48505528227304e-06, + "loss": 0.0057, + "step": 7936 + }, + { + "epoch": 0.7938, + "grad_norm": 0.001647350611165166, + "learning_rate": 2.480451264189546e-06, + "loss": 0.0, + "step": 7938 + }, + { + "epoch": 0.794, + "grad_norm": 0.012128596194088459, + "learning_rate": 2.4758509110427576e-06, + "loss": 0.0001, + "step": 7940 + }, + { + "epoch": 0.7942, + "grad_norm": 0.0003521951730363071, + "learning_rate": 2.4712542250748305e-06, + "loss": 0.0, + "step": 7942 + }, + { + "epoch": 0.7944, + "grad_norm": 0.003325430443510413, + "learning_rate": 2.4666612085261344e-06, + "loss": 0.0002, + "step": 7944 + }, + { + "epoch": 0.7946, + "grad_norm": 0.014263073913753033, + "learning_rate": 2.4620718636352457e-06, + "loss": 0.0002, + "step": 7946 + }, + { + "epoch": 0.7948, + "grad_norm": 0.0008806632249616086, + "learning_rate": 2.4574861926389615e-06, + "loss": 0.0033, + "step": 7948 + }, + { + "epoch": 0.795, + "grad_norm": 0.18881994485855103, + "learning_rate": 2.45290419777228e-06, + "loss": 0.0013, + "step": 7950 + }, + { + "epoch": 0.7952, + "grad_norm": 0.07760565727949142, + "learning_rate": 2.4483258812684096e-06, + "loss": 0.001, + "step": 7952 + }, + { + "epoch": 0.7954, + "grad_norm": 0.018867984414100647, + "learning_rate": 2.4437512453587653e-06, + "loss": 0.0018, + "step": 7954 + }, + { + "epoch": 0.7956, + "grad_norm": 0.5886408686637878, + "learning_rate": 2.4391802922729703e-06, + "loss": 0.0046, + "step": 7956 + }, + { + "epoch": 0.7958, + "grad_norm": 0.7392700910568237, + "learning_rate": 2.43461302423885e-06, + "loss": 0.016, + "step": 7958 + }, + { + "epoch": 0.796, + "grad_norm": 0.002010019961744547, + "learning_rate": 2.4300494434824373e-06, + "loss": 0.0, + "step": 7960 + }, + { + "epoch": 0.7962, + "grad_norm": 0.004633966833353043, + "learning_rate": 2.4254895522279642e-06, + "loss": 0.0001, + "step": 7962 + }, + { + "epoch": 0.7964, + "grad_norm": 0.0018769069574773312, + "learning_rate": 2.420933352697865e-06, + "loss": 0.0, + "step": 7964 + }, + { + "epoch": 0.7966, + "grad_norm": 0.0325138233602047, + "learning_rate": 2.4163808471127815e-06, + "loss": 0.0003, + "step": 7966 + }, + { + "epoch": 0.7968, + "grad_norm": 0.004111265763640404, + "learning_rate": 2.411832037691545e-06, + "loss": 0.0001, + "step": 7968 + }, + { + "epoch": 0.797, + "grad_norm": 0.006470171734690666, + "learning_rate": 2.407286926651192e-06, + "loss": 0.0001, + "step": 7970 + }, + { + "epoch": 0.7972, + "grad_norm": 0.014788406901061535, + "learning_rate": 2.4027455162069567e-06, + "loss": 0.0002, + "step": 7972 + }, + { + "epoch": 0.7974, + "grad_norm": 0.023000968620181084, + "learning_rate": 2.398207808572258e-06, + "loss": 0.0002, + "step": 7974 + }, + { + "epoch": 0.7976, + "grad_norm": 0.0023258591536432505, + "learning_rate": 2.3936738059587284e-06, + "loss": 0.0071, + "step": 7976 + }, + { + "epoch": 0.7978, + "grad_norm": 0.004105078522115946, + "learning_rate": 2.3891435105761838e-06, + "loss": 0.0001, + "step": 7978 + }, + { + "epoch": 0.798, + "grad_norm": 0.3157401382923126, + "learning_rate": 2.3846169246326345e-06, + "loss": 0.0027, + "step": 7980 + }, + { + "epoch": 0.7982, + "grad_norm": 0.020513730123639107, + "learning_rate": 2.380094050334283e-06, + "loss": 0.0004, + "step": 7982 + }, + { + "epoch": 0.7984, + "grad_norm": 0.0008874627528712153, + "learning_rate": 2.37557488988552e-06, + "loss": 0.0, + "step": 7984 + }, + { + "epoch": 0.7986, + "grad_norm": 0.0008233655244112015, + "learning_rate": 2.371059445488938e-06, + "loss": 0.0, + "step": 7986 + }, + { + "epoch": 0.7988, + "grad_norm": 0.00014059925160836428, + "learning_rate": 2.3665477193453037e-06, + "loss": 0.0041, + "step": 7988 + }, + { + "epoch": 0.799, + "grad_norm": 0.001736014150083065, + "learning_rate": 2.362039713653581e-06, + "loss": 0.0075, + "step": 7990 + }, + { + "epoch": 0.7992, + "grad_norm": 0.000442591990577057, + "learning_rate": 2.35753543061091e-06, + "loss": 0.0001, + "step": 7992 + }, + { + "epoch": 0.7994, + "grad_norm": 0.0006349280592985451, + "learning_rate": 2.3530348724126304e-06, + "loss": 0.0, + "step": 7994 + }, + { + "epoch": 0.7996, + "grad_norm": 0.012856575660407543, + "learning_rate": 2.3485380412522586e-06, + "loss": 0.0002, + "step": 7996 + }, + { + "epoch": 0.7998, + "grad_norm": 1.3608111143112183, + "learning_rate": 2.3440449393214947e-06, + "loss": 0.0198, + "step": 7998 + }, + { + "epoch": 0.8, + "grad_norm": 0.0006536359433084726, + "learning_rate": 2.339555568810221e-06, + "loss": 0.0, + "step": 8000 + }, + { + "epoch": 0.8002, + "grad_norm": 0.00275680935010314, + "learning_rate": 2.335069931906503e-06, + "loss": 0.0, + "step": 8002 + }, + { + "epoch": 0.8004, + "grad_norm": 0.32342860102653503, + "learning_rate": 2.3305880307965834e-06, + "loss": 0.0055, + "step": 8004 + }, + { + "epoch": 0.8006, + "grad_norm": 0.001991483848541975, + "learning_rate": 2.3261098676648908e-06, + "loss": 0.0, + "step": 8006 + }, + { + "epoch": 0.8008, + "grad_norm": 0.0096596609801054, + "learning_rate": 2.321635444694028e-06, + "loss": 0.0001, + "step": 8008 + }, + { + "epoch": 0.801, + "grad_norm": 0.0010511979926377535, + "learning_rate": 2.317164764064769e-06, + "loss": 0.0, + "step": 8010 + }, + { + "epoch": 0.8012, + "grad_norm": 6.577668190002441, + "learning_rate": 2.3126978279560687e-06, + "loss": 0.0197, + "step": 8012 + }, + { + "epoch": 0.8014, + "grad_norm": 0.00117375492118299, + "learning_rate": 2.308234638545064e-06, + "loss": 0.0, + "step": 8014 + }, + { + "epoch": 0.8016, + "grad_norm": 1.036012887954712, + "learning_rate": 2.3037751980070557e-06, + "loss": 0.0047, + "step": 8016 + }, + { + "epoch": 0.8018, + "grad_norm": 0.07157180458307266, + "learning_rate": 2.2993195085155205e-06, + "loss": 0.0009, + "step": 8018 + }, + { + "epoch": 0.802, + "grad_norm": 0.0005995839601382613, + "learning_rate": 2.2948675722421086e-06, + "loss": 0.0, + "step": 8020 + }, + { + "epoch": 0.8022, + "grad_norm": 0.004053444601595402, + "learning_rate": 2.2904193913566363e-06, + "loss": 0.0001, + "step": 8022 + }, + { + "epoch": 0.8024, + "grad_norm": 0.00418386934325099, + "learning_rate": 2.2859749680270983e-06, + "loss": 0.0001, + "step": 8024 + }, + { + "epoch": 0.8026, + "grad_norm": 0.05244496092200279, + "learning_rate": 2.2815343044196523e-06, + "loss": 0.0005, + "step": 8026 + }, + { + "epoch": 0.8028, + "grad_norm": 2.53639554977417, + "learning_rate": 2.277097402698619e-06, + "loss": 0.18, + "step": 8028 + }, + { + "epoch": 0.803, + "grad_norm": 0.558329164981842, + "learning_rate": 2.27266426502649e-06, + "loss": 0.0024, + "step": 8030 + }, + { + "epoch": 0.8032, + "grad_norm": 0.08980096131563187, + "learning_rate": 2.2682348935639274e-06, + "loss": 0.0145, + "step": 8032 + }, + { + "epoch": 0.8034, + "grad_norm": 0.002635281067341566, + "learning_rate": 2.2638092904697516e-06, + "loss": 0.0001, + "step": 8034 + }, + { + "epoch": 0.8036, + "grad_norm": 0.00094654003623873, + "learning_rate": 2.259387457900948e-06, + "loss": 0.0001, + "step": 8036 + }, + { + "epoch": 0.8038, + "grad_norm": 0.01265350915491581, + "learning_rate": 2.254969398012663e-06, + "loss": 0.0162, + "step": 8038 + }, + { + "epoch": 0.804, + "grad_norm": 0.0009693863103166223, + "learning_rate": 2.2505551129582047e-06, + "loss": 0.0, + "step": 8040 + }, + { + "epoch": 0.8042, + "grad_norm": 0.0020504258573055267, + "learning_rate": 2.2461446048890424e-06, + "loss": 0.0, + "step": 8042 + }, + { + "epoch": 0.8044, + "grad_norm": 0.0007230918854475021, + "learning_rate": 2.241737875954808e-06, + "loss": 0.0035, + "step": 8044 + }, + { + "epoch": 0.8046, + "grad_norm": 0.0008244227501563728, + "learning_rate": 2.237334928303283e-06, + "loss": 0.0006, + "step": 8046 + }, + { + "epoch": 0.8048, + "grad_norm": 0.02033505029976368, + "learning_rate": 2.2329357640804118e-06, + "loss": 0.0024, + "step": 8048 + }, + { + "epoch": 0.805, + "grad_norm": 0.0003696317144203931, + "learning_rate": 2.2285403854302912e-06, + "loss": 0.0003, + "step": 8050 + }, + { + "epoch": 0.8052, + "grad_norm": 0.08409211039543152, + "learning_rate": 2.22414879449518e-06, + "loss": 0.1069, + "step": 8052 + }, + { + "epoch": 0.8054, + "grad_norm": 0.0023776288144290447, + "learning_rate": 2.219760993415485e-06, + "loss": 0.0001, + "step": 8054 + }, + { + "epoch": 0.8056, + "grad_norm": 0.01859428733587265, + "learning_rate": 2.215376984329767e-06, + "loss": 0.0002, + "step": 8056 + }, + { + "epoch": 0.8058, + "grad_norm": 0.045115891844034195, + "learning_rate": 2.210996769374737e-06, + "loss": 0.0009, + "step": 8058 + }, + { + "epoch": 0.806, + "grad_norm": 4.622617244720459, + "learning_rate": 2.206620350685257e-06, + "loss": 0.1417, + "step": 8060 + }, + { + "epoch": 0.8062, + "grad_norm": 0.5786943435668945, + "learning_rate": 2.202247730394349e-06, + "loss": 0.0066, + "step": 8062 + }, + { + "epoch": 0.8064, + "grad_norm": 0.0009646361577324569, + "learning_rate": 2.1978789106331666e-06, + "loss": 0.0, + "step": 8064 + }, + { + "epoch": 0.8066, + "grad_norm": 0.0032946288120001554, + "learning_rate": 2.1935138935310208e-06, + "loss": 0.0015, + "step": 8066 + }, + { + "epoch": 0.8068, + "grad_norm": 2.37951922416687, + "learning_rate": 2.1891526812153674e-06, + "loss": 0.1098, + "step": 8068 + }, + { + "epoch": 0.807, + "grad_norm": 0.19815804064273834, + "learning_rate": 2.1847952758118118e-06, + "loss": 0.0025, + "step": 8070 + }, + { + "epoch": 0.8072, + "grad_norm": 0.007685777265578508, + "learning_rate": 2.1804416794441e-06, + "loss": 0.0001, + "step": 8072 + }, + { + "epoch": 0.8074, + "grad_norm": 0.05218838155269623, + "learning_rate": 2.1760918942341193e-06, + "loss": 0.0007, + "step": 8074 + }, + { + "epoch": 0.8076, + "grad_norm": 0.2372138798236847, + "learning_rate": 2.171745922301903e-06, + "loss": 0.0031, + "step": 8076 + }, + { + "epoch": 0.8078, + "grad_norm": 0.007340278942137957, + "learning_rate": 2.1674037657656265e-06, + "loss": 0.0002, + "step": 8078 + }, + { + "epoch": 0.808, + "grad_norm": 0.015255317091941833, + "learning_rate": 2.163065426741603e-06, + "loss": 0.0002, + "step": 8080 + }, + { + "epoch": 0.8082, + "grad_norm": 0.00047043466474860907, + "learning_rate": 2.1587309073442865e-06, + "loss": 0.0008, + "step": 8082 + }, + { + "epoch": 0.8084, + "grad_norm": 0.6950182914733887, + "learning_rate": 2.154400209686268e-06, + "loss": 0.0892, + "step": 8084 + }, + { + "epoch": 0.8086, + "grad_norm": 0.0026662403251975775, + "learning_rate": 2.1500733358782786e-06, + "loss": 0.0002, + "step": 8086 + }, + { + "epoch": 0.8088, + "grad_norm": 0.0051225158385932446, + "learning_rate": 2.1457502880291815e-06, + "loss": 0.0001, + "step": 8088 + }, + { + "epoch": 0.809, + "grad_norm": 0.005812905728816986, + "learning_rate": 2.1414310682459805e-06, + "loss": 0.066, + "step": 8090 + }, + { + "epoch": 0.8092, + "grad_norm": 0.9662243127822876, + "learning_rate": 2.1371156786338108e-06, + "loss": 0.012, + "step": 8092 + }, + { + "epoch": 0.8094, + "grad_norm": 0.009828532114624977, + "learning_rate": 2.1328041212959403e-06, + "loss": 0.0004, + "step": 8094 + }, + { + "epoch": 0.8096, + "grad_norm": 0.6043196320533752, + "learning_rate": 2.128496398333768e-06, + "loss": 0.0124, + "step": 8096 + }, + { + "epoch": 0.8098, + "grad_norm": 0.00447899429127574, + "learning_rate": 2.1241925118468288e-06, + "loss": 0.0349, + "step": 8098 + }, + { + "epoch": 0.81, + "grad_norm": 0.0004621342523023486, + "learning_rate": 2.119892463932781e-06, + "loss": 0.0, + "step": 8100 + }, + { + "epoch": 0.8102, + "grad_norm": 0.01376676931977272, + "learning_rate": 2.115596256687419e-06, + "loss": 0.0002, + "step": 8102 + }, + { + "epoch": 0.8104, + "grad_norm": 1.178440809249878, + "learning_rate": 2.1113038922046603e-06, + "loss": 0.0256, + "step": 8104 + }, + { + "epoch": 0.8106, + "grad_norm": 0.0026218537241220474, + "learning_rate": 2.107015372576552e-06, + "loss": 0.0004, + "step": 8106 + }, + { + "epoch": 0.8108, + "grad_norm": 0.17075462639331818, + "learning_rate": 2.102730699893263e-06, + "loss": 0.0025, + "step": 8108 + }, + { + "epoch": 0.811, + "grad_norm": 0.0005340329953469336, + "learning_rate": 2.098449876243096e-06, + "loss": 0.0, + "step": 8110 + }, + { + "epoch": 0.8112, + "grad_norm": 0.3020021617412567, + "learning_rate": 2.09417290371247e-06, + "loss": 0.0047, + "step": 8112 + }, + { + "epoch": 0.8114, + "grad_norm": 1.5801900625228882, + "learning_rate": 2.0898997843859338e-06, + "loss": 0.0387, + "step": 8114 + }, + { + "epoch": 0.8116, + "grad_norm": 0.002621731720864773, + "learning_rate": 2.0856305203461436e-06, + "loss": 0.0387, + "step": 8116 + }, + { + "epoch": 0.8118, + "grad_norm": 0.0063290661200881, + "learning_rate": 2.0813651136738957e-06, + "loss": 0.0002, + "step": 8118 + }, + { + "epoch": 0.812, + "grad_norm": 35.46337890625, + "learning_rate": 2.0771035664480944e-06, + "loss": 0.3148, + "step": 8120 + }, + { + "epoch": 0.8122, + "grad_norm": 0.0009489036165177822, + "learning_rate": 2.072845880745766e-06, + "loss": 0.0002, + "step": 8122 + }, + { + "epoch": 0.8124, + "grad_norm": 0.00014447011926677078, + "learning_rate": 2.0685920586420562e-06, + "loss": 0.0, + "step": 8124 + }, + { + "epoch": 0.8126, + "grad_norm": 0.03991344943642616, + "learning_rate": 2.0643421022102216e-06, + "loss": 0.0005, + "step": 8126 + }, + { + "epoch": 0.8128, + "grad_norm": 0.0005005242419429123, + "learning_rate": 2.0600960135216463e-06, + "loss": 0.0, + "step": 8128 + }, + { + "epoch": 0.813, + "grad_norm": 0.14201578497886658, + "learning_rate": 2.0558537946458177e-06, + "loss": 0.0018, + "step": 8130 + }, + { + "epoch": 0.8132, + "grad_norm": 0.004331748932600021, + "learning_rate": 2.051615447650347e-06, + "loss": 0.0003, + "step": 8132 + }, + { + "epoch": 0.8134, + "grad_norm": 0.003840005025267601, + "learning_rate": 2.0473809746009444e-06, + "loss": 0.0001, + "step": 8134 + }, + { + "epoch": 0.8136, + "grad_norm": 0.19447146356105804, + "learning_rate": 2.0431503775614457e-06, + "loss": 0.0018, + "step": 8136 + }, + { + "epoch": 0.8138, + "grad_norm": 0.0015483963070437312, + "learning_rate": 2.0389236585937944e-06, + "loss": 0.0002, + "step": 8138 + }, + { + "epoch": 0.814, + "grad_norm": 0.002751118503510952, + "learning_rate": 2.0347008197580376e-06, + "loss": 0.0014, + "step": 8140 + }, + { + "epoch": 0.8142, + "grad_norm": 0.008498846553266048, + "learning_rate": 2.0304818631123393e-06, + "loss": 0.0001, + "step": 8142 + }, + { + "epoch": 0.8144, + "grad_norm": 1.1088776588439941, + "learning_rate": 2.026266790712965e-06, + "loss": 0.0182, + "step": 8144 + }, + { + "epoch": 0.8146, + "grad_norm": 0.0030198507010936737, + "learning_rate": 2.022055604614289e-06, + "loss": 0.0163, + "step": 8146 + }, + { + "epoch": 0.8148, + "grad_norm": 0.04501443728804588, + "learning_rate": 2.017848306868797e-06, + "loss": 0.0005, + "step": 8148 + }, + { + "epoch": 0.815, + "grad_norm": 0.001165028428658843, + "learning_rate": 2.013644899527074e-06, + "loss": 0.1797, + "step": 8150 + }, + { + "epoch": 0.8152, + "grad_norm": 0.01457690354436636, + "learning_rate": 2.009445384637805e-06, + "loss": 0.0002, + "step": 8152 + }, + { + "epoch": 0.8154, + "grad_norm": 0.0005602201563306153, + "learning_rate": 2.005249764247783e-06, + "loss": 0.0031, + "step": 8154 + }, + { + "epoch": 0.8156, + "grad_norm": 0.0008515977533534169, + "learning_rate": 2.0010580404019066e-06, + "loss": 0.0001, + "step": 8156 + }, + { + "epoch": 0.8158, + "grad_norm": 0.001227827393449843, + "learning_rate": 1.9968702151431697e-06, + "loss": 0.0002, + "step": 8158 + }, + { + "epoch": 0.816, + "grad_norm": 0.00169298704713583, + "learning_rate": 1.9926862905126663e-06, + "loss": 0.0001, + "step": 8160 + }, + { + "epoch": 0.8162, + "grad_norm": 0.13009344041347504, + "learning_rate": 1.9885062685495905e-06, + "loss": 0.0011, + "step": 8162 + }, + { + "epoch": 0.8164, + "grad_norm": 0.0006246409611776471, + "learning_rate": 1.984330151291233e-06, + "loss": 0.0243, + "step": 8164 + }, + { + "epoch": 0.8166, + "grad_norm": 0.022186707705259323, + "learning_rate": 1.9801579407729866e-06, + "loss": 0.0003, + "step": 8166 + }, + { + "epoch": 0.8168, + "grad_norm": 0.0007052362780086696, + "learning_rate": 1.9759896390283362e-06, + "loss": 0.0019, + "step": 8168 + }, + { + "epoch": 0.817, + "grad_norm": 1.2346068620681763, + "learning_rate": 1.9718252480888567e-06, + "loss": 0.0193, + "step": 8170 + }, + { + "epoch": 0.8172, + "grad_norm": 0.0019988196436315775, + "learning_rate": 1.9676647699842246e-06, + "loss": 0.0001, + "step": 8172 + }, + { + "epoch": 0.8174, + "grad_norm": 0.002442598342895508, + "learning_rate": 1.963508206742202e-06, + "loss": 0.0001, + "step": 8174 + }, + { + "epoch": 0.8176, + "grad_norm": 0.0009460141882300377, + "learning_rate": 1.959355560388654e-06, + "loss": 0.0001, + "step": 8176 + }, + { + "epoch": 0.8178, + "grad_norm": 1.571142315864563, + "learning_rate": 1.955206832947526e-06, + "loss": 0.085, + "step": 8178 + }, + { + "epoch": 0.818, + "grad_norm": 0.01954672485589981, + "learning_rate": 1.95106202644086e-06, + "loss": 0.0002, + "step": 8180 + }, + { + "epoch": 0.8182, + "grad_norm": 0.0006200480274856091, + "learning_rate": 1.9469211428887813e-06, + "loss": 0.0011, + "step": 8182 + }, + { + "epoch": 0.8184, + "grad_norm": 5.33652925491333, + "learning_rate": 1.9427841843095063e-06, + "loss": 0.4084, + "step": 8184 + }, + { + "epoch": 0.8186, + "grad_norm": 0.02207491174340248, + "learning_rate": 1.938651152719344e-06, + "loss": 0.0006, + "step": 8186 + }, + { + "epoch": 0.8188, + "grad_norm": 0.03390936553478241, + "learning_rate": 1.934522050132678e-06, + "loss": 0.0003, + "step": 8188 + }, + { + "epoch": 0.819, + "grad_norm": 0.514678955078125, + "learning_rate": 1.930396878561983e-06, + "loss": 0.004, + "step": 8190 + }, + { + "epoch": 0.8192, + "grad_norm": 0.0030073714442551136, + "learning_rate": 1.9262756400178163e-06, + "loss": 0.0001, + "step": 8192 + }, + { + "epoch": 0.8194, + "grad_norm": 0.002646666020154953, + "learning_rate": 1.9221583365088246e-06, + "loss": 0.0008, + "step": 8194 + }, + { + "epoch": 0.8196, + "grad_norm": 0.0024332536850124598, + "learning_rate": 1.918044970041729e-06, + "loss": 0.0, + "step": 8196 + }, + { + "epoch": 0.8198, + "grad_norm": 0.03876730427145958, + "learning_rate": 1.9139355426213346e-06, + "loss": 0.0065, + "step": 8198 + }, + { + "epoch": 0.82, + "grad_norm": 5.155846118927002, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.1446, + "step": 8200 + }, + { + "epoch": 0.8202, + "grad_norm": 0.010540347546339035, + "learning_rate": 1.9057285129302682e-06, + "loss": 0.0046, + "step": 8202 + }, + { + "epoch": 0.8204, + "grad_norm": 0.1508115828037262, + "learning_rate": 1.9016309146596024e-06, + "loss": 0.0027, + "step": 8204 + }, + { + "epoch": 0.8206, + "grad_norm": 0.0066736615262925625, + "learning_rate": 1.8975372634356481e-06, + "loss": 0.0005, + "step": 8206 + }, + { + "epoch": 0.8208, + "grad_norm": 0.011085465550422668, + "learning_rate": 1.8934475612536019e-06, + "loss": 0.0279, + "step": 8208 + }, + { + "epoch": 0.821, + "grad_norm": 0.007447610143572092, + "learning_rate": 1.8893618101067357e-06, + "loss": 0.0001, + "step": 8210 + }, + { + "epoch": 0.8212, + "grad_norm": 0.8358701467514038, + "learning_rate": 1.8852800119863912e-06, + "loss": 0.0163, + "step": 8212 + }, + { + "epoch": 0.8214, + "grad_norm": 0.04260846972465515, + "learning_rate": 1.8812021688819914e-06, + "loss": 0.0014, + "step": 8214 + }, + { + "epoch": 0.8216, + "grad_norm": 0.0007005864172242582, + "learning_rate": 1.8771282827810278e-06, + "loss": 0.0161, + "step": 8216 + }, + { + "epoch": 0.8218, + "grad_norm": 0.0033226050436496735, + "learning_rate": 1.8730583556690607e-06, + "loss": 0.0001, + "step": 8218 + }, + { + "epoch": 0.822, + "grad_norm": 6.338954925537109, + "learning_rate": 1.8689923895297247e-06, + "loss": 0.1631, + "step": 8220 + }, + { + "epoch": 0.8222, + "grad_norm": 0.0005905752768740058, + "learning_rate": 1.86493038634472e-06, + "loss": 0.0003, + "step": 8222 + }, + { + "epoch": 0.8224, + "grad_norm": 0.06710558384656906, + "learning_rate": 1.8608723480938207e-06, + "loss": 0.0005, + "step": 8224 + }, + { + "epoch": 0.8226, + "grad_norm": 0.008281363174319267, + "learning_rate": 1.8568182767548626e-06, + "loss": 0.0008, + "step": 8226 + }, + { + "epoch": 0.8228, + "grad_norm": 0.019383281469345093, + "learning_rate": 1.8527681743037518e-06, + "loss": 0.0003, + "step": 8228 + }, + { + "epoch": 0.823, + "grad_norm": 0.008433347567915916, + "learning_rate": 1.848722042714457e-06, + "loss": 0.0005, + "step": 8230 + }, + { + "epoch": 0.8232, + "grad_norm": 8.296374320983887, + "learning_rate": 1.8446798839590186e-06, + "loss": 0.1188, + "step": 8232 + }, + { + "epoch": 0.8234, + "grad_norm": 0.002115958835929632, + "learning_rate": 1.8406417000075327e-06, + "loss": 0.003, + "step": 8234 + }, + { + "epoch": 0.8236, + "grad_norm": 0.0013981505762785673, + "learning_rate": 1.8366074928281608e-06, + "loss": 0.0005, + "step": 8236 + }, + { + "epoch": 0.8238, + "grad_norm": 0.028605543076992035, + "learning_rate": 1.8325772643871264e-06, + "loss": 0.0008, + "step": 8238 + }, + { + "epoch": 0.824, + "grad_norm": 0.0222330242395401, + "learning_rate": 1.8285510166487154e-06, + "loss": 0.0003, + "step": 8240 + }, + { + "epoch": 0.8242, + "grad_norm": 0.004416992422193289, + "learning_rate": 1.8245287515752708e-06, + "loss": 0.0002, + "step": 8242 + }, + { + "epoch": 0.8244, + "grad_norm": 0.0006583922659046948, + "learning_rate": 1.820510471127196e-06, + "loss": 0.0007, + "step": 8244 + }, + { + "epoch": 0.8246, + "grad_norm": 0.04283040016889572, + "learning_rate": 1.816496177262952e-06, + "loss": 0.0013, + "step": 8246 + }, + { + "epoch": 0.8248, + "grad_norm": 0.01032990776002407, + "learning_rate": 1.812485871939056e-06, + "loss": 0.0002, + "step": 8248 + }, + { + "epoch": 0.825, + "grad_norm": 0.00046905354247428477, + "learning_rate": 1.808479557110081e-06, + "loss": 0.0003, + "step": 8250 + }, + { + "epoch": 0.8252, + "grad_norm": 2.3315038681030273, + "learning_rate": 1.804477234728661e-06, + "loss": 0.0364, + "step": 8252 + }, + { + "epoch": 0.8254, + "grad_norm": 1.1871633529663086, + "learning_rate": 1.8004789067454763e-06, + "loss": 0.0381, + "step": 8254 + }, + { + "epoch": 0.8256, + "grad_norm": 0.06660787761211395, + "learning_rate": 1.7964845751092663e-06, + "loss": 0.001, + "step": 8256 + }, + { + "epoch": 0.8258, + "grad_norm": 0.002851859200745821, + "learning_rate": 1.7924942417668113e-06, + "loss": 0.0002, + "step": 8258 + }, + { + "epoch": 0.826, + "grad_norm": 0.010668888688087463, + "learning_rate": 1.7885079086629598e-06, + "loss": 0.0002, + "step": 8260 + }, + { + "epoch": 0.8262, + "grad_norm": 1.0064857006072998, + "learning_rate": 1.7845255777406e-06, + "loss": 0.0215, + "step": 8262 + }, + { + "epoch": 0.8264, + "grad_norm": 0.09824581444263458, + "learning_rate": 1.7805472509406695e-06, + "loss": 0.002, + "step": 8264 + }, + { + "epoch": 0.8266, + "grad_norm": 0.003803145606070757, + "learning_rate": 1.7765729302021596e-06, + "loss": 0.0001, + "step": 8266 + }, + { + "epoch": 0.8268, + "grad_norm": 0.06224323809146881, + "learning_rate": 1.7726026174621004e-06, + "loss": 0.0013, + "step": 8268 + }, + { + "epoch": 0.827, + "grad_norm": 2.081850528717041, + "learning_rate": 1.7686363146555807e-06, + "loss": 0.0239, + "step": 8270 + }, + { + "epoch": 0.8272, + "grad_norm": 0.003920484334230423, + "learning_rate": 1.7646740237157256e-06, + "loss": 0.0001, + "step": 8272 + }, + { + "epoch": 0.8274, + "grad_norm": 0.982369601726532, + "learning_rate": 1.760715746573709e-06, + "loss": 0.0068, + "step": 8274 + }, + { + "epoch": 0.8276, + "grad_norm": 0.005258915014564991, + "learning_rate": 1.7567614851587444e-06, + "loss": 0.0007, + "step": 8276 + }, + { + "epoch": 0.8278, + "grad_norm": 0.0019623662810772657, + "learning_rate": 1.7528112413980892e-06, + "loss": 0.0, + "step": 8278 + }, + { + "epoch": 0.828, + "grad_norm": 0.0438787005841732, + "learning_rate": 1.7488650172170496e-06, + "loss": 0.0005, + "step": 8280 + }, + { + "epoch": 0.8282, + "grad_norm": 0.0005212316173128784, + "learning_rate": 1.744922814538964e-06, + "loss": 0.0004, + "step": 8282 + }, + { + "epoch": 0.8284, + "grad_norm": 0.001044464879669249, + "learning_rate": 1.7409846352852144e-06, + "loss": 0.0004, + "step": 8284 + }, + { + "epoch": 0.8286, + "grad_norm": 0.007031246088445187, + "learning_rate": 1.7370504813752232e-06, + "loss": 0.0003, + "step": 8286 + }, + { + "epoch": 0.8288, + "grad_norm": 0.040476106107234955, + "learning_rate": 1.7331203547264452e-06, + "loss": 0.0019, + "step": 8288 + }, + { + "epoch": 0.829, + "grad_norm": 0.07659512013196945, + "learning_rate": 1.7291942572543806e-06, + "loss": 0.0028, + "step": 8290 + }, + { + "epoch": 0.8292, + "grad_norm": 0.038256581872701645, + "learning_rate": 1.7252721908725633e-06, + "loss": 0.0008, + "step": 8292 + }, + { + "epoch": 0.8294, + "grad_norm": 0.3084776699542999, + "learning_rate": 1.7213541574925551e-06, + "loss": 0.0047, + "step": 8294 + }, + { + "epoch": 0.8296, + "grad_norm": 0.00039029610343277454, + "learning_rate": 1.7174401590239587e-06, + "loss": 0.0001, + "step": 8296 + }, + { + "epoch": 0.8298, + "grad_norm": 0.5422524213790894, + "learning_rate": 1.7135301973744122e-06, + "loss": 0.0119, + "step": 8298 + }, + { + "epoch": 0.83, + "grad_norm": 0.02549673616886139, + "learning_rate": 1.709624274449584e-06, + "loss": 0.0004, + "step": 8300 + }, + { + "epoch": 0.8302, + "grad_norm": 6.76388692855835, + "learning_rate": 1.7057223921531706e-06, + "loss": 0.1582, + "step": 8302 + }, + { + "epoch": 0.8304, + "grad_norm": 0.006494888570159674, + "learning_rate": 1.7018245523869038e-06, + "loss": 0.0002, + "step": 8304 + }, + { + "epoch": 0.8306, + "grad_norm": 0.0017917719669640064, + "learning_rate": 1.6979307570505422e-06, + "loss": 0.0, + "step": 8306 + }, + { + "epoch": 0.8308, + "grad_norm": 0.014334027655422688, + "learning_rate": 1.6940410080418723e-06, + "loss": 0.001, + "step": 8308 + }, + { + "epoch": 0.831, + "grad_norm": 0.05651378631591797, + "learning_rate": 1.6901553072567189e-06, + "loss": 0.0013, + "step": 8310 + }, + { + "epoch": 0.8312, + "grad_norm": 0.003714303718879819, + "learning_rate": 1.686273656588917e-06, + "loss": 0.0002, + "step": 8312 + }, + { + "epoch": 0.8314, + "grad_norm": 0.014060968533158302, + "learning_rate": 1.6823960579303378e-06, + "loss": 0.0002, + "step": 8314 + }, + { + "epoch": 0.8316, + "grad_norm": 0.03175993263721466, + "learning_rate": 1.6785225131708749e-06, + "loss": 0.0071, + "step": 8316 + }, + { + "epoch": 0.8318, + "grad_norm": 0.01413018349558115, + "learning_rate": 1.6746530241984504e-06, + "loss": 0.0002, + "step": 8318 + }, + { + "epoch": 0.832, + "grad_norm": 0.007816050201654434, + "learning_rate": 1.6707875928990059e-06, + "loss": 0.0001, + "step": 8320 + }, + { + "epoch": 0.8322, + "grad_norm": 1.7831168174743652, + "learning_rate": 1.666926221156503e-06, + "loss": 0.0121, + "step": 8322 + }, + { + "epoch": 0.8324, + "grad_norm": 0.09984057396650314, + "learning_rate": 1.6630689108529286e-06, + "loss": 0.0008, + "step": 8324 + }, + { + "epoch": 0.8326, + "grad_norm": 0.0005068812752142549, + "learning_rate": 1.6592156638682887e-06, + "loss": 0.0002, + "step": 8326 + }, + { + "epoch": 0.8328, + "grad_norm": 0.02112271822988987, + "learning_rate": 1.6553664820806102e-06, + "loss": 0.0003, + "step": 8328 + }, + { + "epoch": 0.833, + "grad_norm": 0.026459449902176857, + "learning_rate": 1.651521367365936e-06, + "loss": 0.0211, + "step": 8330 + }, + { + "epoch": 0.8332, + "grad_norm": 0.06977059692144394, + "learning_rate": 1.6476803215983295e-06, + "loss": 0.0018, + "step": 8332 + }, + { + "epoch": 0.8334, + "grad_norm": 0.0004803847405128181, + "learning_rate": 1.643843346649866e-06, + "loss": 0.0001, + "step": 8334 + }, + { + "epoch": 0.8336, + "grad_norm": 0.46257665753364563, + "learning_rate": 1.6400104443906463e-06, + "loss": 0.0202, + "step": 8336 + }, + { + "epoch": 0.8338, + "grad_norm": 0.0013329677749425173, + "learning_rate": 1.6361816166887768e-06, + "loss": 0.0, + "step": 8338 + }, + { + "epoch": 0.834, + "grad_norm": 0.9695766568183899, + "learning_rate": 1.6323568654103838e-06, + "loss": 0.021, + "step": 8340 + }, + { + "epoch": 0.8342, + "grad_norm": 0.0027630110271275043, + "learning_rate": 1.6285361924196031e-06, + "loss": 0.0001, + "step": 8342 + }, + { + "epoch": 0.8344, + "grad_norm": 0.00125353061594069, + "learning_rate": 1.6247195995785836e-06, + "loss": 0.0001, + "step": 8344 + }, + { + "epoch": 0.8346, + "grad_norm": 0.006275543477386236, + "learning_rate": 1.6209070887474876e-06, + "loss": 0.0003, + "step": 8346 + }, + { + "epoch": 0.8348, + "grad_norm": 0.0007082067313604057, + "learning_rate": 1.6170986617844864e-06, + "loss": 0.0002, + "step": 8348 + }, + { + "epoch": 0.835, + "grad_norm": 0.0010846474906429648, + "learning_rate": 1.6132943205457607e-06, + "loss": 0.0002, + "step": 8350 + }, + { + "epoch": 0.8352, + "grad_norm": 0.006296102423220873, + "learning_rate": 1.6094940668855008e-06, + "loss": 0.0001, + "step": 8352 + }, + { + "epoch": 0.8354, + "grad_norm": 0.12025680392980576, + "learning_rate": 1.6056979026559005e-06, + "loss": 0.0015, + "step": 8354 + }, + { + "epoch": 0.8356, + "grad_norm": 0.0028710218612104654, + "learning_rate": 1.601905829707171e-06, + "loss": 0.0001, + "step": 8356 + }, + { + "epoch": 0.8358, + "grad_norm": 0.42043349146842957, + "learning_rate": 1.5981178498875182e-06, + "loss": 0.0049, + "step": 8358 + }, + { + "epoch": 0.836, + "grad_norm": 1.4233225584030151, + "learning_rate": 1.5943339650431578e-06, + "loss": 0.0284, + "step": 8360 + }, + { + "epoch": 0.8362, + "grad_norm": 1.5932995080947876, + "learning_rate": 1.5905541770183096e-06, + "loss": 0.0067, + "step": 8362 + }, + { + "epoch": 0.8364, + "grad_norm": 0.04215233400464058, + "learning_rate": 1.5867784876551973e-06, + "loss": 0.0551, + "step": 8364 + }, + { + "epoch": 0.8366, + "grad_norm": 0.009179639630019665, + "learning_rate": 1.583006898794044e-06, + "loss": 0.0259, + "step": 8366 + }, + { + "epoch": 0.8368, + "grad_norm": 0.0005002024117857218, + "learning_rate": 1.579239412273078e-06, + "loss": 0.0012, + "step": 8368 + }, + { + "epoch": 0.837, + "grad_norm": 1.3109058141708374, + "learning_rate": 1.5754760299285255e-06, + "loss": 0.0066, + "step": 8370 + }, + { + "epoch": 0.8372, + "grad_norm": 0.06204157695174217, + "learning_rate": 1.5717167535946142e-06, + "loss": 0.0008, + "step": 8372 + }, + { + "epoch": 0.8374, + "grad_norm": 0.6316210627555847, + "learning_rate": 1.5679615851035669e-06, + "loss": 0.0086, + "step": 8374 + }, + { + "epoch": 0.8376, + "grad_norm": 0.0010320601286366582, + "learning_rate": 1.5642105262856122e-06, + "loss": 0.0001, + "step": 8376 + }, + { + "epoch": 0.8378, + "grad_norm": 0.002577391220256686, + "learning_rate": 1.560463578968967e-06, + "loss": 0.0854, + "step": 8378 + }, + { + "epoch": 0.838, + "grad_norm": 0.0005603298195637763, + "learning_rate": 1.5567207449798517e-06, + "loss": 0.0536, + "step": 8380 + }, + { + "epoch": 0.8382, + "grad_norm": 0.0013683520955964923, + "learning_rate": 1.55298202614247e-06, + "loss": 0.0, + "step": 8382 + }, + { + "epoch": 0.8384, + "grad_norm": 0.006040265318006277, + "learning_rate": 1.5492474242790368e-06, + "loss": 0.0001, + "step": 8384 + }, + { + "epoch": 0.8386, + "grad_norm": 0.00894901528954506, + "learning_rate": 1.545516941209747e-06, + "loss": 0.0363, + "step": 8386 + }, + { + "epoch": 0.8388, + "grad_norm": 0.0023568575270473957, + "learning_rate": 1.5417905787527943e-06, + "loss": 0.0002, + "step": 8388 + }, + { + "epoch": 0.839, + "grad_norm": 0.0019272564677521586, + "learning_rate": 1.538068338724361e-06, + "loss": 0.0002, + "step": 8390 + }, + { + "epoch": 0.8392, + "grad_norm": 1.9709737300872803, + "learning_rate": 1.5343502229386209e-06, + "loss": 0.0943, + "step": 8392 + }, + { + "epoch": 0.8394, + "grad_norm": 0.003979414701461792, + "learning_rate": 1.530636233207743e-06, + "loss": 0.001, + "step": 8394 + }, + { + "epoch": 0.8396, + "grad_norm": 0.4992561340332031, + "learning_rate": 1.526926371341878e-06, + "loss": 0.0032, + "step": 8396 + }, + { + "epoch": 0.8398, + "grad_norm": 0.0013512111036106944, + "learning_rate": 1.52322063914917e-06, + "loss": 0.0, + "step": 8398 + }, + { + "epoch": 0.84, + "grad_norm": 1.6977605819702148, + "learning_rate": 1.5195190384357405e-06, + "loss": 0.0248, + "step": 8400 + }, + { + "epoch": 0.8402, + "grad_norm": 0.726084291934967, + "learning_rate": 1.5158215710057123e-06, + "loss": 0.0136, + "step": 8402 + }, + { + "epoch": 0.8404, + "grad_norm": 0.0037951464764773846, + "learning_rate": 1.5121282386611823e-06, + "loss": 0.0022, + "step": 8404 + }, + { + "epoch": 0.8406, + "grad_norm": 0.04585229605436325, + "learning_rate": 1.5084390432022377e-06, + "loss": 0.0004, + "step": 8406 + }, + { + "epoch": 0.8408, + "grad_norm": 0.7106782793998718, + "learning_rate": 1.5047539864269477e-06, + "loss": 0.0083, + "step": 8408 + }, + { + "epoch": 0.841, + "grad_norm": 0.0007771725067868829, + "learning_rate": 1.5010730701313626e-06, + "loss": 0.0001, + "step": 8410 + }, + { + "epoch": 0.8412, + "grad_norm": 0.0008692361298017204, + "learning_rate": 1.4973962961095135e-06, + "loss": 0.0002, + "step": 8412 + }, + { + "epoch": 0.8414, + "grad_norm": 0.013383302837610245, + "learning_rate": 1.4937236661534227e-06, + "loss": 0.0002, + "step": 8414 + }, + { + "epoch": 0.8416, + "grad_norm": 0.01933656446635723, + "learning_rate": 1.490055182053083e-06, + "loss": 0.0004, + "step": 8416 + }, + { + "epoch": 0.8418, + "grad_norm": 0.002225742908194661, + "learning_rate": 1.486390845596466e-06, + "loss": 0.0001, + "step": 8418 + }, + { + "epoch": 0.842, + "grad_norm": 0.03248829394578934, + "learning_rate": 1.4827306585695234e-06, + "loss": 0.0029, + "step": 8420 + }, + { + "epoch": 0.8422, + "grad_norm": 0.003355711232870817, + "learning_rate": 1.4790746227561925e-06, + "loss": 0.0005, + "step": 8422 + }, + { + "epoch": 0.8424, + "grad_norm": 0.0018070684745907784, + "learning_rate": 1.4754227399383758e-06, + "loss": 0.0001, + "step": 8424 + }, + { + "epoch": 0.8426, + "grad_norm": 0.13436755537986755, + "learning_rate": 1.4717750118959583e-06, + "loss": 0.0017, + "step": 8426 + }, + { + "epoch": 0.8428, + "grad_norm": 0.013132529333233833, + "learning_rate": 1.468131440406798e-06, + "loss": 0.0411, + "step": 8428 + }, + { + "epoch": 0.843, + "grad_norm": 1.5824227333068848, + "learning_rate": 1.4644920272467245e-06, + "loss": 0.0098, + "step": 8430 + }, + { + "epoch": 0.8432, + "grad_norm": 0.0008296039886772633, + "learning_rate": 1.4608567741895496e-06, + "loss": 0.0001, + "step": 8432 + }, + { + "epoch": 0.8434, + "grad_norm": 0.29664742946624756, + "learning_rate": 1.4572256830070497e-06, + "loss": 0.0064, + "step": 8434 + }, + { + "epoch": 0.8436, + "grad_norm": 0.021655384451150894, + "learning_rate": 1.4535987554689712e-06, + "loss": 0.0088, + "step": 8436 + }, + { + "epoch": 0.8438, + "grad_norm": 0.011972544714808464, + "learning_rate": 1.4499759933430347e-06, + "loss": 0.0001, + "step": 8438 + }, + { + "epoch": 0.844, + "grad_norm": 0.009521843865513802, + "learning_rate": 1.446357398394934e-06, + "loss": 0.0003, + "step": 8440 + }, + { + "epoch": 0.8442, + "grad_norm": 0.002768774749711156, + "learning_rate": 1.4427429723883256e-06, + "loss": 0.0001, + "step": 8442 + }, + { + "epoch": 0.8444, + "grad_norm": 0.04966406896710396, + "learning_rate": 1.439132717084839e-06, + "loss": 0.0006, + "step": 8444 + }, + { + "epoch": 0.8446, + "grad_norm": 0.08414862304925919, + "learning_rate": 1.4355266342440678e-06, + "loss": 0.0387, + "step": 8446 + }, + { + "epoch": 0.8448, + "grad_norm": 0.002471685642376542, + "learning_rate": 1.4319247256235713e-06, + "loss": 0.0, + "step": 8448 + }, + { + "epoch": 0.845, + "grad_norm": 0.14206556975841522, + "learning_rate": 1.4283269929788779e-06, + "loss": 0.001, + "step": 8450 + }, + { + "epoch": 0.8452, + "grad_norm": 0.0005137508851476014, + "learning_rate": 1.4247334380634792e-06, + "loss": 0.0002, + "step": 8452 + }, + { + "epoch": 0.8454, + "grad_norm": 0.8197813034057617, + "learning_rate": 1.4211440626288286e-06, + "loss": 0.0154, + "step": 8454 + }, + { + "epoch": 0.8456, + "grad_norm": 0.0015148221282288432, + "learning_rate": 1.4175588684243447e-06, + "loss": 0.0001, + "step": 8456 + }, + { + "epoch": 0.8458, + "grad_norm": 0.005814720410853624, + "learning_rate": 1.413977857197405e-06, + "loss": 0.0091, + "step": 8458 + }, + { + "epoch": 0.846, + "grad_norm": 0.003234460251405835, + "learning_rate": 1.4104010306933558e-06, + "loss": 0.0002, + "step": 8460 + }, + { + "epoch": 0.8462, + "grad_norm": 0.3293299078941345, + "learning_rate": 1.4068283906554969e-06, + "loss": 0.006, + "step": 8462 + }, + { + "epoch": 0.8464, + "grad_norm": 0.004457693547010422, + "learning_rate": 1.40325993882509e-06, + "loss": 0.0001, + "step": 8464 + }, + { + "epoch": 0.8466, + "grad_norm": 1.4460049867630005, + "learning_rate": 1.399695676941354e-06, + "loss": 0.0076, + "step": 8466 + }, + { + "epoch": 0.8468, + "grad_norm": 0.0006537841400131583, + "learning_rate": 1.3961356067414667e-06, + "loss": 0.0001, + "step": 8468 + }, + { + "epoch": 0.847, + "grad_norm": 1.2365604639053345, + "learning_rate": 1.3925797299605649e-06, + "loss": 0.0215, + "step": 8470 + }, + { + "epoch": 0.8472, + "grad_norm": 0.13720521330833435, + "learning_rate": 1.3890280483317375e-06, + "loss": 0.0007, + "step": 8472 + }, + { + "epoch": 0.8474, + "grad_norm": 0.02058267779648304, + "learning_rate": 1.3854805635860335e-06, + "loss": 0.0003, + "step": 8474 + }, + { + "epoch": 0.8476, + "grad_norm": 0.002125914441421628, + "learning_rate": 1.381937277452451e-06, + "loss": 0.0001, + "step": 8476 + }, + { + "epoch": 0.8478, + "grad_norm": 0.08589330315589905, + "learning_rate": 1.3783981916579448e-06, + "loss": 0.001, + "step": 8478 + }, + { + "epoch": 0.848, + "grad_norm": 0.022180544212460518, + "learning_rate": 1.3748633079274254e-06, + "loss": 0.0017, + "step": 8480 + }, + { + "epoch": 0.8482, + "grad_norm": 0.014213486574590206, + "learning_rate": 1.3713326279837502e-06, + "loss": 0.0034, + "step": 8482 + }, + { + "epoch": 0.8484, + "grad_norm": 1.3008003234863281, + "learning_rate": 1.3678061535477305e-06, + "loss": 0.0132, + "step": 8484 + }, + { + "epoch": 0.8486, + "grad_norm": 11.80712890625, + "learning_rate": 1.3642838863381258e-06, + "loss": 0.089, + "step": 8486 + }, + { + "epoch": 0.8488, + "grad_norm": 0.6937155723571777, + "learning_rate": 1.3607658280716474e-06, + "loss": 0.0132, + "step": 8488 + }, + { + "epoch": 0.849, + "grad_norm": 0.0005269335233606398, + "learning_rate": 1.3572519804629537e-06, + "loss": 0.0001, + "step": 8490 + }, + { + "epoch": 0.8492, + "grad_norm": 0.00974121131002903, + "learning_rate": 1.3537423452246522e-06, + "loss": 0.0001, + "step": 8492 + }, + { + "epoch": 0.8494, + "grad_norm": 1.0287929773330688, + "learning_rate": 1.3502369240672941e-06, + "loss": 0.0047, + "step": 8494 + }, + { + "epoch": 0.8496, + "grad_norm": 0.01620929315686226, + "learning_rate": 1.3467357186993802e-06, + "loss": 0.0006, + "step": 8496 + }, + { + "epoch": 0.8498, + "grad_norm": 0.0009320019162259996, + "learning_rate": 1.3432387308273576e-06, + "loss": 0.0001, + "step": 8498 + }, + { + "epoch": 0.85, + "grad_norm": 0.0013240034459158778, + "learning_rate": 1.339745962155613e-06, + "loss": 0.0, + "step": 8500 + }, + { + "epoch": 0.8502, + "grad_norm": 0.3939440846443176, + "learning_rate": 1.3362574143864816e-06, + "loss": 0.0032, + "step": 8502 + }, + { + "epoch": 0.8504, + "grad_norm": 0.7076901793479919, + "learning_rate": 1.3327730892202384e-06, + "loss": 0.0042, + "step": 8504 + }, + { + "epoch": 0.8506, + "grad_norm": 0.004551122896373272, + "learning_rate": 1.3292929883550998e-06, + "loss": 0.0002, + "step": 8506 + }, + { + "epoch": 0.8508, + "grad_norm": 0.00041518316720612347, + "learning_rate": 1.3258171134872267e-06, + "loss": 0.0003, + "step": 8508 + }, + { + "epoch": 0.851, + "grad_norm": 0.0017137882532551885, + "learning_rate": 1.322345466310717e-06, + "loss": 0.0, + "step": 8510 + }, + { + "epoch": 0.8512, + "grad_norm": 0.001326531171798706, + "learning_rate": 1.3188780485176089e-06, + "loss": 0.0006, + "step": 8512 + }, + { + "epoch": 0.8514, + "grad_norm": 0.0017944339197129011, + "learning_rate": 1.3154148617978813e-06, + "loss": 0.0004, + "step": 8514 + }, + { + "epoch": 0.8516, + "grad_norm": 0.0006457054987549782, + "learning_rate": 1.3119559078394462e-06, + "loss": 0.0001, + "step": 8516 + }, + { + "epoch": 0.8518, + "grad_norm": 0.00235418020747602, + "learning_rate": 1.3085011883281606e-06, + "loss": 0.0001, + "step": 8518 + }, + { + "epoch": 0.852, + "grad_norm": 0.07567211985588074, + "learning_rate": 1.30505070494781e-06, + "loss": 0.008, + "step": 8520 + }, + { + "epoch": 0.8522, + "grad_norm": 0.0072976574301719666, + "learning_rate": 1.3016044593801202e-06, + "loss": 0.0001, + "step": 8522 + }, + { + "epoch": 0.8524, + "grad_norm": 0.0004755329864565283, + "learning_rate": 1.2981624533047432e-06, + "loss": 0.0001, + "step": 8524 + }, + { + "epoch": 0.8526, + "grad_norm": 0.006093120202422142, + "learning_rate": 1.294724688399278e-06, + "loss": 0.0002, + "step": 8526 + }, + { + "epoch": 0.8528, + "grad_norm": 0.12347576022148132, + "learning_rate": 1.2912911663392468e-06, + "loss": 0.0011, + "step": 8528 + }, + { + "epoch": 0.853, + "grad_norm": 1.2630481719970703, + "learning_rate": 1.2878618887981064e-06, + "loss": 0.0177, + "step": 8530 + }, + { + "epoch": 0.8532, + "grad_norm": 0.0002681585028767586, + "learning_rate": 1.2844368574472454e-06, + "loss": 0.0009, + "step": 8532 + }, + { + "epoch": 0.8534, + "grad_norm": 0.002412468194961548, + "learning_rate": 1.2810160739559797e-06, + "loss": 0.0001, + "step": 8534 + }, + { + "epoch": 0.8536, + "grad_norm": 0.0014311923878267407, + "learning_rate": 1.277599539991563e-06, + "loss": 0.0, + "step": 8536 + }, + { + "epoch": 0.8538, + "grad_norm": 0.001781281316652894, + "learning_rate": 1.2741872572191684e-06, + "loss": 0.0003, + "step": 8538 + }, + { + "epoch": 0.854, + "grad_norm": 0.0024804221466183662, + "learning_rate": 1.2707792273019049e-06, + "loss": 0.0906, + "step": 8540 + }, + { + "epoch": 0.8542, + "grad_norm": 7.719443965470418e-05, + "learning_rate": 1.2673754519008008e-06, + "loss": 0.0, + "step": 8542 + }, + { + "epoch": 0.8544, + "grad_norm": 0.12835440039634705, + "learning_rate": 1.2639759326748136e-06, + "loss": 0.0008, + "step": 8544 + }, + { + "epoch": 0.8546, + "grad_norm": 0.05008590593934059, + "learning_rate": 1.2605806712808322e-06, + "loss": 0.0007, + "step": 8546 + }, + { + "epoch": 0.8548, + "grad_norm": 0.0010094065219163895, + "learning_rate": 1.257189669373664e-06, + "loss": 0.0008, + "step": 8548 + }, + { + "epoch": 0.855, + "grad_norm": 0.01871616579592228, + "learning_rate": 1.2538029286060428e-06, + "loss": 0.0016, + "step": 8550 + }, + { + "epoch": 0.8552, + "grad_norm": 0.5397655963897705, + "learning_rate": 1.2504204506286244e-06, + "loss": 0.0092, + "step": 8552 + }, + { + "epoch": 0.8554, + "grad_norm": 0.03014391101896763, + "learning_rate": 1.2470422370899838e-06, + "loss": 0.0016, + "step": 8554 + }, + { + "epoch": 0.8556, + "grad_norm": 0.0004949999856762588, + "learning_rate": 1.2436682896366282e-06, + "loss": 0.0, + "step": 8556 + }, + { + "epoch": 0.8558, + "grad_norm": 0.01944199576973915, + "learning_rate": 1.2402986099129765e-06, + "loss": 0.0014, + "step": 8558 + }, + { + "epoch": 0.856, + "grad_norm": 0.00037427921779453754, + "learning_rate": 1.2369331995613664e-06, + "loss": 0.0, + "step": 8560 + }, + { + "epoch": 0.8562, + "grad_norm": 0.0016885543009266257, + "learning_rate": 1.233572060222057e-06, + "loss": 0.0008, + "step": 8562 + }, + { + "epoch": 0.8564, + "grad_norm": 0.0006150964763946831, + "learning_rate": 1.230215193533233e-06, + "loss": 0.0002, + "step": 8564 + }, + { + "epoch": 0.8566, + "grad_norm": 0.0012216279283165932, + "learning_rate": 1.2268626011309858e-06, + "loss": 0.0043, + "step": 8566 + }, + { + "epoch": 0.8568, + "grad_norm": 0.01728738471865654, + "learning_rate": 1.223514284649331e-06, + "loss": 0.0002, + "step": 8568 + }, + { + "epoch": 0.857, + "grad_norm": 0.005035240203142166, + "learning_rate": 1.2201702457201948e-06, + "loss": 0.0001, + "step": 8570 + }, + { + "epoch": 0.8572, + "grad_norm": 0.04047268256545067, + "learning_rate": 1.2168304859734226e-06, + "loss": 0.0005, + "step": 8572 + }, + { + "epoch": 0.8574, + "grad_norm": 0.0005609961226582527, + "learning_rate": 1.2134950070367723e-06, + "loss": 0.0, + "step": 8574 + }, + { + "epoch": 0.8576, + "grad_norm": 0.16825322806835175, + "learning_rate": 1.210163810535917e-06, + "loss": 0.0015, + "step": 8576 + }, + { + "epoch": 0.8578, + "grad_norm": 0.0029374300502240658, + "learning_rate": 1.206836898094439e-06, + "loss": 0.0001, + "step": 8578 + }, + { + "epoch": 0.858, + "grad_norm": 0.010807315818965435, + "learning_rate": 1.2035142713338366e-06, + "loss": 0.0007, + "step": 8580 + }, + { + "epoch": 0.8582, + "grad_norm": 0.009548510424792767, + "learning_rate": 1.2001959318735158e-06, + "loss": 0.0009, + "step": 8582 + }, + { + "epoch": 0.8584, + "grad_norm": 0.04834384471178055, + "learning_rate": 1.196881881330798e-06, + "loss": 0.0005, + "step": 8584 + }, + { + "epoch": 0.8586, + "grad_norm": 0.0067449817433953285, + "learning_rate": 1.1935721213209106e-06, + "loss": 0.0079, + "step": 8586 + }, + { + "epoch": 0.8588, + "grad_norm": 0.009293978102505207, + "learning_rate": 1.1902666534569884e-06, + "loss": 0.0003, + "step": 8588 + }, + { + "epoch": 0.859, + "grad_norm": 0.6056520938873291, + "learning_rate": 1.1869654793500784e-06, + "loss": 0.0043, + "step": 8590 + }, + { + "epoch": 0.8592, + "grad_norm": 0.0025735758244991302, + "learning_rate": 1.1836686006091313e-06, + "loss": 0.0093, + "step": 8592 + }, + { + "epoch": 0.8594, + "grad_norm": 0.03689901530742645, + "learning_rate": 1.1803760188410074e-06, + "loss": 0.0005, + "step": 8594 + }, + { + "epoch": 0.8596, + "grad_norm": 4.918539047241211, + "learning_rate": 1.1770877356504684e-06, + "loss": 0.0662, + "step": 8596 + }, + { + "epoch": 0.8598, + "grad_norm": 1.001694917678833, + "learning_rate": 1.1738037526401857e-06, + "loss": 0.0038, + "step": 8598 + }, + { + "epoch": 0.86, + "grad_norm": 0.0025787760969251394, + "learning_rate": 1.1705240714107301e-06, + "loss": 0.0001, + "step": 8600 + }, + { + "epoch": 0.8602, + "grad_norm": 0.007446447387337685, + "learning_rate": 1.167248693560583e-06, + "loss": 0.0127, + "step": 8602 + }, + { + "epoch": 0.8604, + "grad_norm": 0.23790431022644043, + "learning_rate": 1.1639776206861197e-06, + "loss": 0.0076, + "step": 8604 + }, + { + "epoch": 0.8606, + "grad_norm": 0.013053270056843758, + "learning_rate": 1.1607108543816247e-06, + "loss": 0.0001, + "step": 8606 + }, + { + "epoch": 0.8608, + "grad_norm": 0.0005083503783680499, + "learning_rate": 1.1574483962392768e-06, + "loss": 0.0001, + "step": 8608 + }, + { + "epoch": 0.861, + "grad_norm": 0.014885506592690945, + "learning_rate": 1.1541902478491607e-06, + "loss": 0.0003, + "step": 8610 + }, + { + "epoch": 0.8612, + "grad_norm": 0.001770651899278164, + "learning_rate": 1.1509364107992582e-06, + "loss": 0.0001, + "step": 8612 + }, + { + "epoch": 0.8614, + "grad_norm": 6.115728378295898, + "learning_rate": 1.1476868866754488e-06, + "loss": 0.1265, + "step": 8614 + }, + { + "epoch": 0.8616, + "grad_norm": 0.00017777379252947867, + "learning_rate": 1.1444416770615118e-06, + "loss": 0.0647, + "step": 8616 + }, + { + "epoch": 0.8618, + "grad_norm": 1.20986008644104, + "learning_rate": 1.1412007835391237e-06, + "loss": 0.0069, + "step": 8618 + }, + { + "epoch": 0.862, + "grad_norm": 0.11025630682706833, + "learning_rate": 1.1379642076878528e-06, + "loss": 0.0351, + "step": 8620 + }, + { + "epoch": 0.8622, + "grad_norm": 0.012161720544099808, + "learning_rate": 1.1347319510851718e-06, + "loss": 0.0008, + "step": 8622 + }, + { + "epoch": 0.8624, + "grad_norm": 0.010575959458947182, + "learning_rate": 1.1315040153064416e-06, + "loss": 0.0001, + "step": 8624 + }, + { + "epoch": 0.8626, + "grad_norm": 0.0002598080027382821, + "learning_rate": 1.1282804019249183e-06, + "loss": 0.0, + "step": 8626 + }, + { + "epoch": 0.8628, + "grad_norm": 0.0015926206251606345, + "learning_rate": 1.1250611125117527e-06, + "loss": 0.0, + "step": 8628 + }, + { + "epoch": 0.863, + "grad_norm": 0.0009946086211130023, + "learning_rate": 1.1218461486359878e-06, + "loss": 0.0, + "step": 8630 + }, + { + "epoch": 0.8632, + "grad_norm": 0.011857069097459316, + "learning_rate": 1.1186355118645552e-06, + "loss": 0.0002, + "step": 8632 + }, + { + "epoch": 0.8634, + "grad_norm": 0.049102235585451126, + "learning_rate": 1.1154292037622838e-06, + "loss": 0.0019, + "step": 8634 + }, + { + "epoch": 0.8636, + "grad_norm": 0.022473618388175964, + "learning_rate": 1.1122272258918864e-06, + "loss": 0.0003, + "step": 8636 + }, + { + "epoch": 0.8638, + "grad_norm": 0.03211773559451103, + "learning_rate": 1.1090295798139672e-06, + "loss": 0.0003, + "step": 8638 + }, + { + "epoch": 0.864, + "grad_norm": 0.924582839012146, + "learning_rate": 1.1058362670870248e-06, + "loss": 0.0187, + "step": 8640 + }, + { + "epoch": 0.8642, + "grad_norm": 0.11969007551670074, + "learning_rate": 1.102647289267438e-06, + "loss": 0.0015, + "step": 8642 + }, + { + "epoch": 0.8644, + "grad_norm": 0.016621524468064308, + "learning_rate": 1.0994626479094749e-06, + "loss": 0.0099, + "step": 8644 + }, + { + "epoch": 0.8646, + "grad_norm": 0.0013540590880438685, + "learning_rate": 1.096282344565296e-06, + "loss": 0.0013, + "step": 8646 + }, + { + "epoch": 0.8648, + "grad_norm": 0.005571642890572548, + "learning_rate": 1.093106380784934e-06, + "loss": 0.0005, + "step": 8648 + }, + { + "epoch": 0.865, + "grad_norm": 0.00042671713163144886, + "learning_rate": 1.0899347581163222e-06, + "loss": 0.0001, + "step": 8650 + }, + { + "epoch": 0.8652, + "grad_norm": 0.0011315580923110247, + "learning_rate": 1.0867674781052683e-06, + "loss": 0.0002, + "step": 8652 + }, + { + "epoch": 0.8654, + "grad_norm": 0.0005337001639418304, + "learning_rate": 1.0836045422954665e-06, + "loss": 0.0, + "step": 8654 + }, + { + "epoch": 0.8656, + "grad_norm": 0.010678710415959358, + "learning_rate": 1.0804459522284927e-06, + "loss": 0.0001, + "step": 8656 + }, + { + "epoch": 0.8658, + "grad_norm": 0.18385793268680573, + "learning_rate": 1.0772917094438052e-06, + "loss": 0.0036, + "step": 8658 + }, + { + "epoch": 0.866, + "grad_norm": 0.004739090800285339, + "learning_rate": 1.0741418154787443e-06, + "loss": 0.0002, + "step": 8660 + }, + { + "epoch": 0.8662, + "grad_norm": 0.003034044988453388, + "learning_rate": 1.0709962718685318e-06, + "loss": 0.0351, + "step": 8662 + }, + { + "epoch": 0.8664, + "grad_norm": 0.03608427196741104, + "learning_rate": 1.0678550801462662e-06, + "loss": 0.0069, + "step": 8664 + }, + { + "epoch": 0.8666, + "grad_norm": 0.01470916997641325, + "learning_rate": 1.0647182418429224e-06, + "loss": 0.0003, + "step": 8666 + }, + { + "epoch": 0.8668, + "grad_norm": 0.016879333183169365, + "learning_rate": 1.0615857584873624e-06, + "loss": 0.0003, + "step": 8668 + }, + { + "epoch": 0.867, + "grad_norm": 0.5471417307853699, + "learning_rate": 1.058457631606319e-06, + "loss": 0.0105, + "step": 8670 + }, + { + "epoch": 0.8672, + "grad_norm": 0.006307593081146479, + "learning_rate": 1.0553338627244026e-06, + "loss": 0.0001, + "step": 8672 + }, + { + "epoch": 0.8674, + "grad_norm": 0.031591154634952545, + "learning_rate": 1.0522144533641e-06, + "loss": 0.0003, + "step": 8674 + }, + { + "epoch": 0.8676, + "grad_norm": 0.0018800118705257773, + "learning_rate": 1.0490994050457748e-06, + "loss": 0.0001, + "step": 8676 + }, + { + "epoch": 0.8678, + "grad_norm": 0.003107279073446989, + "learning_rate": 1.0459887192876595e-06, + "loss": 0.0001, + "step": 8678 + }, + { + "epoch": 0.868, + "grad_norm": 0.0005650668754242361, + "learning_rate": 1.042882397605871e-06, + "loss": 0.0001, + "step": 8680 + }, + { + "epoch": 0.8682, + "grad_norm": 0.027822257950901985, + "learning_rate": 1.039780441514391e-06, + "loss": 0.0003, + "step": 8682 + }, + { + "epoch": 0.8684, + "grad_norm": 0.0013806704664602876, + "learning_rate": 1.0366828525250728e-06, + "loss": 0.0007, + "step": 8684 + }, + { + "epoch": 0.8686, + "grad_norm": 0.0069888001307845116, + "learning_rate": 1.0335896321476413e-06, + "loss": 0.0034, + "step": 8686 + }, + { + "epoch": 0.8688, + "grad_norm": 0.01602291874587536, + "learning_rate": 1.0305007818897006e-06, + "loss": 0.0002, + "step": 8688 + }, + { + "epoch": 0.869, + "grad_norm": 0.0019839031156152487, + "learning_rate": 1.0274163032567165e-06, + "loss": 0.0, + "step": 8690 + }, + { + "epoch": 0.8692, + "grad_norm": 0.010594687424600124, + "learning_rate": 1.024336197752025e-06, + "loss": 0.0001, + "step": 8692 + }, + { + "epoch": 0.8694, + "grad_norm": 1.9658867120742798, + "learning_rate": 1.0212604668768343e-06, + "loss": 0.0479, + "step": 8694 + }, + { + "epoch": 0.8696, + "grad_norm": 0.0008130822097882628, + "learning_rate": 1.0181891121302145e-06, + "loss": 0.0002, + "step": 8696 + }, + { + "epoch": 0.8698, + "grad_norm": 0.002105005318298936, + "learning_rate": 1.0151221350091134e-06, + "loss": 0.0, + "step": 8698 + }, + { + "epoch": 0.87, + "grad_norm": 0.0002624909393489361, + "learning_rate": 1.012059537008332e-06, + "loss": 0.0, + "step": 8700 + }, + { + "epoch": 0.8702, + "grad_norm": 3.4873125553131104, + "learning_rate": 1.009001319620545e-06, + "loss": 0.141, + "step": 8702 + }, + { + "epoch": 0.8704, + "grad_norm": 0.0006983196944929659, + "learning_rate": 1.0059474843362893e-06, + "loss": 0.0005, + "step": 8704 + }, + { + "epoch": 0.8706, + "grad_norm": 0.017655204981565475, + "learning_rate": 1.0028980326439708e-06, + "loss": 0.0003, + "step": 8706 + }, + { + "epoch": 0.8708, + "grad_norm": 0.0056636701337993145, + "learning_rate": 9.99852966029854e-07, + "loss": 0.0001, + "step": 8708 + }, + { + "epoch": 0.871, + "grad_norm": 0.039779551327228546, + "learning_rate": 9.968122859780648e-07, + "loss": 0.0024, + "step": 8710 + }, + { + "epoch": 0.8712, + "grad_norm": 0.0011804018868133426, + "learning_rate": 9.93775993970597e-07, + "loss": 0.0, + "step": 8712 + }, + { + "epoch": 0.8714, + "grad_norm": 0.0759008526802063, + "learning_rate": 9.907440914873e-07, + "loss": 0.0026, + "step": 8714 + }, + { + "epoch": 0.8716, + "grad_norm": 0.0023237213026732206, + "learning_rate": 9.877165800058874e-07, + "loss": 0.0076, + "step": 8716 + }, + { + "epoch": 0.8718, + "grad_norm": 0.00020540702098514885, + "learning_rate": 9.84693461001932e-07, + "loss": 0.002, + "step": 8718 + }, + { + "epoch": 0.872, + "grad_norm": 0.006014043465256691, + "learning_rate": 9.816747359488632e-07, + "loss": 0.0002, + "step": 8720 + }, + { + "epoch": 0.8722, + "grad_norm": 0.00871846079826355, + "learning_rate": 9.786604063179728e-07, + "loss": 0.0001, + "step": 8722 + }, + { + "epoch": 0.8724, + "grad_norm": 0.09031897038221359, + "learning_rate": 9.756504735784067e-07, + "loss": 0.9763, + "step": 8724 + }, + { + "epoch": 0.8726, + "grad_norm": 0.0009740254608914256, + "learning_rate": 9.726449391971716e-07, + "loss": 0.0001, + "step": 8726 + }, + { + "epoch": 0.8728, + "grad_norm": 0.3079567849636078, + "learning_rate": 9.696438046391288e-07, + "loss": 0.3676, + "step": 8728 + }, + { + "epoch": 0.873, + "grad_norm": 0.0029431055299937725, + "learning_rate": 9.666470713669918e-07, + "loss": 0.0, + "step": 8730 + }, + { + "epoch": 0.8732, + "grad_norm": 0.8259663581848145, + "learning_rate": 9.636547408413355e-07, + "loss": 0.0302, + "step": 8732 + }, + { + "epoch": 0.8734, + "grad_norm": 0.15342853963375092, + "learning_rate": 9.606668145205833e-07, + "loss": 0.0015, + "step": 8734 + }, + { + "epoch": 0.8736, + "grad_norm": 1.4084707498550415, + "learning_rate": 9.576832938610137e-07, + "loss": 0.0063, + "step": 8736 + }, + { + "epoch": 0.8738, + "grad_norm": 0.00024926275364123285, + "learning_rate": 9.547041803167601e-07, + "loss": 0.0081, + "step": 8738 + }, + { + "epoch": 0.874, + "grad_norm": 0.0025695927906781435, + "learning_rate": 9.517294753398066e-07, + "loss": 0.0002, + "step": 8740 + }, + { + "epoch": 0.8742, + "grad_norm": 0.0016439679311588407, + "learning_rate": 9.487591803799856e-07, + "loss": 0.0993, + "step": 8742 + }, + { + "epoch": 0.8744, + "grad_norm": 0.006956426426768303, + "learning_rate": 9.457932968849826e-07, + "loss": 0.4765, + "step": 8744 + }, + { + "epoch": 0.8746, + "grad_norm": 0.6390620470046997, + "learning_rate": 9.428318263003378e-07, + "loss": 0.0049, + "step": 8746 + }, + { + "epoch": 0.8748, + "grad_norm": 0.003181263105943799, + "learning_rate": 9.398747700694322e-07, + "loss": 0.005, + "step": 8748 + }, + { + "epoch": 0.875, + "grad_norm": 0.09196940809488297, + "learning_rate": 9.369221296335007e-07, + "loss": 0.001, + "step": 8750 + }, + { + "epoch": 0.8752, + "grad_norm": 0.03231183812022209, + "learning_rate": 9.339739064316233e-07, + "loss": 0.0026, + "step": 8752 + }, + { + "epoch": 0.8754, + "grad_norm": 0.001062851632013917, + "learning_rate": 9.310301019007284e-07, + "loss": 0.0, + "step": 8754 + }, + { + "epoch": 0.8756, + "grad_norm": 3.650752067565918, + "learning_rate": 9.280907174755916e-07, + "loss": 0.4266, + "step": 8756 + }, + { + "epoch": 0.8758, + "grad_norm": 0.07900829613208771, + "learning_rate": 9.251557545888312e-07, + "loss": 0.0009, + "step": 8758 + }, + { + "epoch": 0.876, + "grad_norm": 0.00655725970864296, + "learning_rate": 9.222252146709143e-07, + "loss": 0.0001, + "step": 8760 + }, + { + "epoch": 0.8762, + "grad_norm": 0.004223995376378298, + "learning_rate": 9.192990991501483e-07, + "loss": 0.0001, + "step": 8762 + }, + { + "epoch": 0.8764, + "grad_norm": 0.023325800895690918, + "learning_rate": 9.16377409452689e-07, + "loss": 0.0002, + "step": 8764 + }, + { + "epoch": 0.8766, + "grad_norm": 2.692704439163208, + "learning_rate": 9.134601470025306e-07, + "loss": 0.0696, + "step": 8766 + }, + { + "epoch": 0.8768, + "grad_norm": 0.16217485070228577, + "learning_rate": 9.105473132215126e-07, + "loss": 0.0012, + "step": 8768 + }, + { + "epoch": 0.877, + "grad_norm": 0.051316551864147186, + "learning_rate": 9.076389095293148e-07, + "loss": 0.0005, + "step": 8770 + }, + { + "epoch": 0.8772, + "grad_norm": 0.0004509160353336483, + "learning_rate": 9.047349373434566e-07, + "loss": 0.0001, + "step": 8772 + }, + { + "epoch": 0.8774, + "grad_norm": 0.09486281871795654, + "learning_rate": 9.018353980792993e-07, + "loss": 0.0015, + "step": 8774 + }, + { + "epoch": 0.8776, + "grad_norm": 0.0017787051619961858, + "learning_rate": 8.989402931500434e-07, + "loss": 0.0005, + "step": 8776 + }, + { + "epoch": 0.8778, + "grad_norm": 0.0022461269982159138, + "learning_rate": 8.960496239667282e-07, + "loss": 0.0103, + "step": 8778 + }, + { + "epoch": 0.878, + "grad_norm": 0.04450037702918053, + "learning_rate": 8.931633919382299e-07, + "loss": 0.0006, + "step": 8780 + }, + { + "epoch": 0.8782, + "grad_norm": 0.0007876426680013537, + "learning_rate": 8.902815984712621e-07, + "loss": 0.0008, + "step": 8782 + }, + { + "epoch": 0.8784, + "grad_norm": 0.29537996649742126, + "learning_rate": 8.874042449703779e-07, + "loss": 0.0022, + "step": 8784 + }, + { + "epoch": 0.8786, + "grad_norm": 0.0011141860159114003, + "learning_rate": 8.845313328379635e-07, + "loss": 0.0001, + "step": 8786 + }, + { + "epoch": 0.8788, + "grad_norm": 0.008596479892730713, + "learning_rate": 8.816628634742441e-07, + "loss": 0.0002, + "step": 8788 + }, + { + "epoch": 0.879, + "grad_norm": 0.02593809738755226, + "learning_rate": 8.787988382772705e-07, + "loss": 0.0001, + "step": 8790 + }, + { + "epoch": 0.8792, + "grad_norm": 2.0389602184295654, + "learning_rate": 8.759392586429394e-07, + "loss": 0.0461, + "step": 8792 + }, + { + "epoch": 0.8794, + "grad_norm": 0.0019100038334727287, + "learning_rate": 8.730841259649725e-07, + "loss": 0.0009, + "step": 8794 + }, + { + "epoch": 0.8796, + "grad_norm": 0.0022484876681119204, + "learning_rate": 8.702334416349279e-07, + "loss": 0.0153, + "step": 8796 + }, + { + "epoch": 0.8798, + "grad_norm": 0.005952616687864065, + "learning_rate": 8.67387207042194e-07, + "loss": 0.0003, + "step": 8798 + }, + { + "epoch": 0.88, + "grad_norm": 0.0006055427365936339, + "learning_rate": 8.645454235739903e-07, + "loss": 0.0, + "step": 8800 + }, + { + "epoch": 0.8802, + "grad_norm": 0.022218206897377968, + "learning_rate": 8.617080926153698e-07, + "loss": 0.0003, + "step": 8802 + }, + { + "epoch": 0.8804, + "grad_norm": 0.0010121924569830298, + "learning_rate": 8.58875215549212e-07, + "loss": 0.01, + "step": 8804 + }, + { + "epoch": 0.8806, + "grad_norm": 0.05503355711698532, + "learning_rate": 8.560467937562278e-07, + "loss": 0.0006, + "step": 8806 + }, + { + "epoch": 0.8808, + "grad_norm": 0.0008312283316627145, + "learning_rate": 8.532228286149502e-07, + "loss": 0.0003, + "step": 8808 + }, + { + "epoch": 0.881, + "grad_norm": 0.0005935761728323996, + "learning_rate": 8.504033215017527e-07, + "loss": 0.0002, + "step": 8810 + }, + { + "epoch": 0.8812, + "grad_norm": 5.450923442840576, + "learning_rate": 8.475882737908248e-07, + "loss": 0.0327, + "step": 8812 + }, + { + "epoch": 0.8814, + "grad_norm": 0.08706055581569672, + "learning_rate": 8.447776868541879e-07, + "loss": 0.0112, + "step": 8814 + }, + { + "epoch": 0.8816, + "grad_norm": 0.004508425947278738, + "learning_rate": 8.419715620616875e-07, + "loss": 0.0001, + "step": 8816 + }, + { + "epoch": 0.8818, + "grad_norm": 4.468612194061279, + "learning_rate": 8.39169900780995e-07, + "loss": 0.3051, + "step": 8818 + }, + { + "epoch": 0.882, + "grad_norm": 0.6728643774986267, + "learning_rate": 8.363727043776037e-07, + "loss": 0.0051, + "step": 8820 + }, + { + "epoch": 0.8822, + "grad_norm": 0.0013167005963623524, + "learning_rate": 8.335799742148387e-07, + "loss": 0.0047, + "step": 8822 + }, + { + "epoch": 0.8824, + "grad_norm": 0.6244345307350159, + "learning_rate": 8.307917116538378e-07, + "loss": 0.006, + "step": 8824 + }, + { + "epoch": 0.8826, + "grad_norm": 0.00027318342472426593, + "learning_rate": 8.280079180535672e-07, + "loss": 0.0002, + "step": 8826 + }, + { + "epoch": 0.8828, + "grad_norm": 0.0009880070574581623, + "learning_rate": 8.252285947708139e-07, + "loss": 0.0001, + "step": 8828 + }, + { + "epoch": 0.883, + "grad_norm": 0.004479461815208197, + "learning_rate": 8.224537431601886e-07, + "loss": 0.0002, + "step": 8830 + }, + { + "epoch": 0.8832, + "grad_norm": 0.0018130372045561671, + "learning_rate": 8.196833645741187e-07, + "loss": 0.0, + "step": 8832 + }, + { + "epoch": 0.8834, + "grad_norm": 0.012711212038993835, + "learning_rate": 8.169174603628538e-07, + "loss": 0.0002, + "step": 8834 + }, + { + "epoch": 0.8836, + "grad_norm": 0.001498597557656467, + "learning_rate": 8.141560318744601e-07, + "loss": 0.0451, + "step": 8836 + }, + { + "epoch": 0.8838, + "grad_norm": 2.066742181777954, + "learning_rate": 8.113990804548244e-07, + "loss": 0.0253, + "step": 8838 + }, + { + "epoch": 0.884, + "grad_norm": 0.18812337517738342, + "learning_rate": 8.086466074476562e-07, + "loss": 0.0077, + "step": 8840 + }, + { + "epoch": 0.8842, + "grad_norm": 0.0011174757964909077, + "learning_rate": 8.058986141944724e-07, + "loss": 0.244, + "step": 8842 + }, + { + "epoch": 0.8844, + "grad_norm": 0.003231189213693142, + "learning_rate": 8.031551020346129e-07, + "loss": 0.0004, + "step": 8844 + }, + { + "epoch": 0.8846, + "grad_norm": 0.006906331516802311, + "learning_rate": 8.004160723052312e-07, + "loss": 0.0001, + "step": 8846 + }, + { + "epoch": 0.8848, + "grad_norm": 0.02149323932826519, + "learning_rate": 7.976815263412963e-07, + "loss": 0.0005, + "step": 8848 + }, + { + "epoch": 0.885, + "grad_norm": 0.0019702508579939604, + "learning_rate": 7.949514654755963e-07, + "loss": 0.0103, + "step": 8850 + }, + { + "epoch": 0.8852, + "grad_norm": 0.007292153313755989, + "learning_rate": 7.922258910387282e-07, + "loss": 0.0002, + "step": 8852 + }, + { + "epoch": 0.8854, + "grad_norm": 0.0006624742527492344, + "learning_rate": 7.895048043591036e-07, + "loss": 0.0002, + "step": 8854 + }, + { + "epoch": 0.8856, + "grad_norm": 0.0004683614824898541, + "learning_rate": 7.867882067629473e-07, + "loss": 0.0005, + "step": 8856 + }, + { + "epoch": 0.8858, + "grad_norm": 0.043483391404151917, + "learning_rate": 7.840760995742946e-07, + "loss": 0.0012, + "step": 8858 + }, + { + "epoch": 0.886, + "grad_norm": 0.02718784660100937, + "learning_rate": 7.81368484114996e-07, + "loss": 0.0005, + "step": 8860 + }, + { + "epoch": 0.8862, + "grad_norm": 0.01017186138778925, + "learning_rate": 7.78665361704708e-07, + "loss": 0.0053, + "step": 8862 + }, + { + "epoch": 0.8864, + "grad_norm": 0.009047391824424267, + "learning_rate": 7.759667336609011e-07, + "loss": 0.0001, + "step": 8864 + }, + { + "epoch": 0.8866, + "grad_norm": 0.3050302565097809, + "learning_rate": 7.732726012988512e-07, + "loss": 0.0019, + "step": 8866 + }, + { + "epoch": 0.8868, + "grad_norm": 0.5807020664215088, + "learning_rate": 7.7058296593165e-07, + "loss": 0.0281, + "step": 8868 + }, + { + "epoch": 0.887, + "grad_norm": 0.001499394653365016, + "learning_rate": 7.678978288701911e-07, + "loss": 0.0001, + "step": 8870 + }, + { + "epoch": 0.8872, + "grad_norm": 0.02504338137805462, + "learning_rate": 7.652171914231777e-07, + "loss": 0.0003, + "step": 8872 + }, + { + "epoch": 0.8874, + "grad_norm": 0.002462511183694005, + "learning_rate": 7.62541054897119e-07, + "loss": 0.0, + "step": 8874 + }, + { + "epoch": 0.8876, + "grad_norm": 4.901753902435303, + "learning_rate": 7.598694205963331e-07, + "loss": 0.0203, + "step": 8876 + }, + { + "epoch": 0.8878, + "grad_norm": 0.0015968725783750415, + "learning_rate": 7.572022898229403e-07, + "loss": 0.0001, + "step": 8878 + }, + { + "epoch": 0.888, + "grad_norm": 0.02044246718287468, + "learning_rate": 7.545396638768698e-07, + "loss": 0.0003, + "step": 8880 + }, + { + "epoch": 0.8882, + "grad_norm": 0.02866695262491703, + "learning_rate": 7.518815440558514e-07, + "loss": 0.0005, + "step": 8882 + }, + { + "epoch": 0.8884, + "grad_norm": 0.09207144379615784, + "learning_rate": 7.492279316554207e-07, + "loss": 0.0014, + "step": 8884 + }, + { + "epoch": 0.8886, + "grad_norm": 0.04131068289279938, + "learning_rate": 7.465788279689156e-07, + "loss": 0.0006, + "step": 8886 + }, + { + "epoch": 0.8888, + "grad_norm": 0.0006243702373467386, + "learning_rate": 7.439342342874789e-07, + "loss": 0.0005, + "step": 8888 + }, + { + "epoch": 0.889, + "grad_norm": 0.00453182915225625, + "learning_rate": 7.412941519000527e-07, + "loss": 0.0002, + "step": 8890 + }, + { + "epoch": 0.8892, + "grad_norm": 0.00047707732301205397, + "learning_rate": 7.386585820933812e-07, + "loss": 0.0015, + "step": 8892 + }, + { + "epoch": 0.8894, + "grad_norm": 0.05216575041413307, + "learning_rate": 7.360275261520078e-07, + "loss": 0.001, + "step": 8894 + }, + { + "epoch": 0.8896, + "grad_norm": 0.002491111634299159, + "learning_rate": 7.334009853582791e-07, + "loss": 0.0009, + "step": 8896 + }, + { + "epoch": 0.8898, + "grad_norm": 0.12492559105157852, + "learning_rate": 7.307789609923377e-07, + "loss": 0.0011, + "step": 8898 + }, + { + "epoch": 0.89, + "grad_norm": 0.005450807046145201, + "learning_rate": 7.281614543321269e-07, + "loss": 0.0001, + "step": 8900 + }, + { + "epoch": 0.8902, + "grad_norm": 0.001330655300989747, + "learning_rate": 7.255484666533874e-07, + "loss": 0.0005, + "step": 8902 + }, + { + "epoch": 0.8904, + "grad_norm": 0.003524497617036104, + "learning_rate": 7.22939999229657e-07, + "loss": 0.0001, + "step": 8904 + }, + { + "epoch": 0.8906, + "grad_norm": 0.022082306444644928, + "learning_rate": 7.203360533322734e-07, + "loss": 0.0008, + "step": 8906 + }, + { + "epoch": 0.8908, + "grad_norm": 0.05399677902460098, + "learning_rate": 7.177366302303667e-07, + "loss": 0.0008, + "step": 8908 + }, + { + "epoch": 0.891, + "grad_norm": 0.012549402192234993, + "learning_rate": 7.151417311908648e-07, + "loss": 0.0003, + "step": 8910 + }, + { + "epoch": 0.8912, + "grad_norm": 0.5013459920883179, + "learning_rate": 7.125513574784904e-07, + "loss": 0.0084, + "step": 8912 + }, + { + "epoch": 0.8914, + "grad_norm": 0.0019770192448049784, + "learning_rate": 7.099655103557557e-07, + "loss": 0.0003, + "step": 8914 + }, + { + "epoch": 0.8916, + "grad_norm": 0.0012883911840617657, + "learning_rate": 7.073841910829771e-07, + "loss": 0.0016, + "step": 8916 + }, + { + "epoch": 0.8918, + "grad_norm": 0.0028861132450401783, + "learning_rate": 7.048074009182548e-07, + "loss": 0.0001, + "step": 8918 + }, + { + "epoch": 0.892, + "grad_norm": 0.20207184553146362, + "learning_rate": 7.022351411174866e-07, + "loss": 0.0024, + "step": 8920 + }, + { + "epoch": 0.8922, + "grad_norm": 0.02324596419930458, + "learning_rate": 6.996674129343606e-07, + "loss": 0.0011, + "step": 8922 + }, + { + "epoch": 0.8924, + "grad_norm": 0.007026148959994316, + "learning_rate": 6.971042176203535e-07, + "loss": 0.0001, + "step": 8924 + }, + { + "epoch": 0.8926, + "grad_norm": 0.0004916478064842522, + "learning_rate": 6.945455564247394e-07, + "loss": 0.0294, + "step": 8926 + }, + { + "epoch": 0.8928, + "grad_norm": 0.002204000251367688, + "learning_rate": 6.919914305945774e-07, + "loss": 0.0003, + "step": 8928 + }, + { + "epoch": 0.893, + "grad_norm": 0.0022561708465218544, + "learning_rate": 6.894418413747183e-07, + "loss": 0.0012, + "step": 8930 + }, + { + "epoch": 0.8932, + "grad_norm": 1.4391995668411255, + "learning_rate": 6.868967900077972e-07, + "loss": 0.0556, + "step": 8932 + }, + { + "epoch": 0.8934, + "grad_norm": 4.37544584274292, + "learning_rate": 6.84356277734245e-07, + "loss": 0.059, + "step": 8934 + }, + { + "epoch": 0.8936, + "grad_norm": 0.06577788293361664, + "learning_rate": 6.818203057922756e-07, + "loss": 0.0011, + "step": 8936 + }, + { + "epoch": 0.8938, + "grad_norm": 0.0066007161512970924, + "learning_rate": 6.792888754178906e-07, + "loss": 0.0002, + "step": 8938 + }, + { + "epoch": 0.894, + "grad_norm": 0.0022604616824537516, + "learning_rate": 6.767619878448783e-07, + "loss": 0.0001, + "step": 8940 + }, + { + "epoch": 0.8942, + "grad_norm": 0.03202838450670242, + "learning_rate": 6.742396443048138e-07, + "loss": 0.0036, + "step": 8942 + }, + { + "epoch": 0.8944, + "grad_norm": 0.0017581527354195714, + "learning_rate": 6.717218460270536e-07, + "loss": 0.0001, + "step": 8944 + }, + { + "epoch": 0.8946, + "grad_norm": 0.2053954005241394, + "learning_rate": 6.692085942387483e-07, + "loss": 0.0022, + "step": 8946 + }, + { + "epoch": 0.8948, + "grad_norm": 0.016185414046049118, + "learning_rate": 6.666998901648203e-07, + "loss": 0.0005, + "step": 8948 + }, + { + "epoch": 0.895, + "grad_norm": 0.0005539777921512723, + "learning_rate": 6.641957350279838e-07, + "loss": 0.0004, + "step": 8950 + }, + { + "epoch": 0.8952, + "grad_norm": 0.015722928568720818, + "learning_rate": 6.616961300487323e-07, + "loss": 0.0002, + "step": 8952 + }, + { + "epoch": 0.8954, + "grad_norm": 0.01318634208291769, + "learning_rate": 6.592010764453449e-07, + "loss": 0.0003, + "step": 8954 + }, + { + "epoch": 0.8956, + "grad_norm": 0.002893816214054823, + "learning_rate": 6.567105754338798e-07, + "loss": 0.0001, + "step": 8956 + }, + { + "epoch": 0.8958, + "grad_norm": 0.006513205356895924, + "learning_rate": 6.542246282281772e-07, + "loss": 0.0019, + "step": 8958 + }, + { + "epoch": 0.896, + "grad_norm": 0.011136309243738651, + "learning_rate": 6.517432360398556e-07, + "loss": 0.0003, + "step": 8960 + }, + { + "epoch": 0.8962, + "grad_norm": 0.1645621508359909, + "learning_rate": 6.492664000783166e-07, + "loss": 0.0013, + "step": 8962 + }, + { + "epoch": 0.8964, + "grad_norm": 0.011641278862953186, + "learning_rate": 6.467941215507434e-07, + "loss": 0.0003, + "step": 8964 + }, + { + "epoch": 0.8966, + "grad_norm": 0.0005086003220640123, + "learning_rate": 6.443264016620887e-07, + "loss": 0.0001, + "step": 8966 + }, + { + "epoch": 0.8968, + "grad_norm": 0.015787001699209213, + "learning_rate": 6.418632416150927e-07, + "loss": 0.0008, + "step": 8968 + }, + { + "epoch": 0.897, + "grad_norm": 0.025704769417643547, + "learning_rate": 6.394046426102673e-07, + "loss": 0.0004, + "step": 8970 + }, + { + "epoch": 0.8972, + "grad_norm": 0.0033214434515684843, + "learning_rate": 6.369506058459063e-07, + "loss": 0.0002, + "step": 8972 + }, + { + "epoch": 0.8974, + "grad_norm": 2.5959296226501465, + "learning_rate": 6.345011325180772e-07, + "loss": 0.0572, + "step": 8974 + }, + { + "epoch": 0.8976, + "grad_norm": 0.0031330196652561426, + "learning_rate": 6.320562238206218e-07, + "loss": 0.0006, + "step": 8976 + }, + { + "epoch": 0.8978, + "grad_norm": 0.004238041117787361, + "learning_rate": 6.296158809451602e-07, + "loss": 0.055, + "step": 8978 + }, + { + "epoch": 0.898, + "grad_norm": 1.8096587657928467, + "learning_rate": 6.271801050810856e-07, + "loss": 0.0085, + "step": 8980 + }, + { + "epoch": 0.8982, + "grad_norm": 0.0003953804261982441, + "learning_rate": 6.247488974155657e-07, + "loss": 0.0009, + "step": 8982 + }, + { + "epoch": 0.8984, + "grad_norm": 0.6784815788269043, + "learning_rate": 6.223222591335409e-07, + "loss": 0.0135, + "step": 8984 + }, + { + "epoch": 0.8986, + "grad_norm": 0.12522661685943604, + "learning_rate": 6.199001914177261e-07, + "loss": 0.0153, + "step": 8986 + }, + { + "epoch": 0.8988, + "grad_norm": 0.004869851749390364, + "learning_rate": 6.174826954486069e-07, + "loss": 0.0001, + "step": 8988 + }, + { + "epoch": 0.899, + "grad_norm": 0.0001556078641442582, + "learning_rate": 6.150697724044407e-07, + "loss": 0.0, + "step": 8990 + }, + { + "epoch": 0.8992, + "grad_norm": 0.017263950780034065, + "learning_rate": 6.126614234612593e-07, + "loss": 0.0002, + "step": 8992 + }, + { + "epoch": 0.8994, + "grad_norm": 0.0005573662929236889, + "learning_rate": 6.102576497928614e-07, + "loss": 0.0001, + "step": 8994 + }, + { + "epoch": 0.8996, + "grad_norm": 0.015651792287826538, + "learning_rate": 6.078584525708175e-07, + "loss": 0.0003, + "step": 8996 + }, + { + "epoch": 0.8998, + "grad_norm": 0.007567434571683407, + "learning_rate": 6.054638329644658e-07, + "loss": 0.0007, + "step": 8998 + }, + { + "epoch": 0.9, + "grad_norm": 0.015790555626153946, + "learning_rate": 6.030737921409169e-07, + "loss": 0.0003, + "step": 9000 + }, + { + "epoch": 0.9002, + "grad_norm": 0.026151658967137337, + "learning_rate": 6.006883312650458e-07, + "loss": 0.001, + "step": 9002 + }, + { + "epoch": 0.9004, + "grad_norm": 1.700005292892456, + "learning_rate": 5.98307451499498e-07, + "loss": 0.0122, + "step": 9004 + }, + { + "epoch": 0.9006, + "grad_norm": 0.006365341134369373, + "learning_rate": 5.959311540046863e-07, + "loss": 0.0017, + "step": 9006 + }, + { + "epoch": 0.9008, + "grad_norm": 0.001651525148190558, + "learning_rate": 5.935594399387856e-07, + "loss": 0.0, + "step": 9008 + }, + { + "epoch": 0.901, + "grad_norm": 0.005223275627940893, + "learning_rate": 5.911923104577455e-07, + "loss": 0.0157, + "step": 9010 + }, + { + "epoch": 0.9012, + "grad_norm": 0.010104109533131123, + "learning_rate": 5.888297667152731e-07, + "loss": 0.0006, + "step": 9012 + }, + { + "epoch": 0.9014, + "grad_norm": 0.03302191197872162, + "learning_rate": 5.864718098628441e-07, + "loss": 0.0042, + "step": 9014 + }, + { + "epoch": 0.9016, + "grad_norm": 0.0038547227159142494, + "learning_rate": 5.841184410496992e-07, + "loss": 0.0002, + "step": 9016 + }, + { + "epoch": 0.9018, + "grad_norm": 0.007676271256059408, + "learning_rate": 5.817696614228396e-07, + "loss": 0.0001, + "step": 9018 + }, + { + "epoch": 0.902, + "grad_norm": 0.006478148512542248, + "learning_rate": 5.794254721270331e-07, + "loss": 0.001, + "step": 9020 + }, + { + "epoch": 0.9022, + "grad_norm": 0.0009445254690945148, + "learning_rate": 5.770858743048091e-07, + "loss": 0.0002, + "step": 9022 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4705252945423126, + "learning_rate": 5.747508690964599e-07, + "loss": 0.0039, + "step": 9024 + }, + { + "epoch": 0.9026, + "grad_norm": 2.473776340484619, + "learning_rate": 5.724204576400372e-07, + "loss": 0.0379, + "step": 9026 + }, + { + "epoch": 0.9028, + "grad_norm": 0.005857715383172035, + "learning_rate": 5.700946410713548e-07, + "loss": 0.0001, + "step": 9028 + }, + { + "epoch": 0.903, + "grad_norm": 0.0017394099850207567, + "learning_rate": 5.677734205239904e-07, + "loss": 0.0022, + "step": 9030 + }, + { + "epoch": 0.9032, + "grad_norm": 0.06453706324100494, + "learning_rate": 5.654567971292757e-07, + "loss": 0.0009, + "step": 9032 + }, + { + "epoch": 0.9034, + "grad_norm": 0.002694027964025736, + "learning_rate": 5.631447720163074e-07, + "loss": 0.0, + "step": 9034 + }, + { + "epoch": 0.9036, + "grad_norm": 0.0023555713705718517, + "learning_rate": 5.608373463119354e-07, + "loss": 0.1828, + "step": 9036 + }, + { + "epoch": 0.9038, + "grad_norm": 0.0020046683494001627, + "learning_rate": 5.585345211407734e-07, + "loss": 0.0414, + "step": 9038 + }, + { + "epoch": 0.904, + "grad_norm": 0.22666165232658386, + "learning_rate": 5.562362976251901e-07, + "loss": 0.0029, + "step": 9040 + }, + { + "epoch": 0.9042, + "grad_norm": 0.0005367922713048756, + "learning_rate": 5.539426768853107e-07, + "loss": 0.0001, + "step": 9042 + }, + { + "epoch": 0.9044, + "grad_norm": 0.02118678390979767, + "learning_rate": 5.516536600390188e-07, + "loss": 0.0003, + "step": 9044 + }, + { + "epoch": 0.9046, + "grad_norm": 0.00394785962998867, + "learning_rate": 5.49369248201953e-07, + "loss": 0.0002, + "step": 9046 + }, + { + "epoch": 0.9048, + "grad_norm": 0.04220879450440407, + "learning_rate": 5.470894424875062e-07, + "loss": 0.0299, + "step": 9048 + }, + { + "epoch": 0.905, + "grad_norm": 0.004710644483566284, + "learning_rate": 5.448142440068316e-07, + "loss": 0.0001, + "step": 9050 + }, + { + "epoch": 0.9052, + "grad_norm": 0.0068033128045499325, + "learning_rate": 5.425436538688322e-07, + "loss": 0.0003, + "step": 9052 + }, + { + "epoch": 0.9054, + "grad_norm": 0.012676646001636982, + "learning_rate": 5.402776731801662e-07, + "loss": 0.0002, + "step": 9054 + }, + { + "epoch": 0.9056, + "grad_norm": 0.00258189020678401, + "learning_rate": 5.380163030452412e-07, + "loss": 0.0002, + "step": 9056 + }, + { + "epoch": 0.9058, + "grad_norm": 0.0029046060517430305, + "learning_rate": 5.357595445662267e-07, + "loss": 0.0007, + "step": 9058 + }, + { + "epoch": 0.906, + "grad_norm": 0.0008775270543992519, + "learning_rate": 5.335073988430373e-07, + "loss": 0.0033, + "step": 9060 + }, + { + "epoch": 0.9062, + "grad_norm": 0.00044235633686184883, + "learning_rate": 5.312598669733404e-07, + "loss": 0.0004, + "step": 9062 + }, + { + "epoch": 0.9064, + "grad_norm": 0.15758486092090607, + "learning_rate": 5.290169500525577e-07, + "loss": 0.0016, + "step": 9064 + }, + { + "epoch": 0.9066, + "grad_norm": 0.036873627454042435, + "learning_rate": 5.267786491738569e-07, + "loss": 0.0005, + "step": 9066 + }, + { + "epoch": 0.9068, + "grad_norm": 0.0009205452515743673, + "learning_rate": 5.245449654281632e-07, + "loss": 0.0001, + "step": 9068 + }, + { + "epoch": 0.907, + "grad_norm": 1.0824106931686401, + "learning_rate": 5.223158999041444e-07, + "loss": 0.0149, + "step": 9070 + }, + { + "epoch": 0.9072, + "grad_norm": 0.05270678177475929, + "learning_rate": 5.200914536882184e-07, + "loss": 0.0019, + "step": 9072 + }, + { + "epoch": 0.9074, + "grad_norm": 0.007729296572506428, + "learning_rate": 5.178716278645534e-07, + "loss": 0.0067, + "step": 9074 + }, + { + "epoch": 0.9076, + "grad_norm": 0.0006919130682945251, + "learning_rate": 5.156564235150686e-07, + "loss": 0.0, + "step": 9076 + }, + { + "epoch": 0.9078, + "grad_norm": 0.010271912440657616, + "learning_rate": 5.134458417194255e-07, + "loss": 0.0003, + "step": 9078 + }, + { + "epoch": 0.908, + "grad_norm": 0.004584020469337702, + "learning_rate": 5.112398835550348e-07, + "loss": 0.0002, + "step": 9080 + }, + { + "epoch": 0.9082, + "grad_norm": 0.0005196076235733926, + "learning_rate": 5.090385500970551e-07, + "loss": 0.0127, + "step": 9082 + }, + { + "epoch": 0.9084, + "grad_norm": 0.0027549846563488245, + "learning_rate": 5.068418424183874e-07, + "loss": 0.0, + "step": 9084 + }, + { + "epoch": 0.9086, + "grad_norm": 0.0016741121653467417, + "learning_rate": 5.046497615896806e-07, + "loss": 0.0001, + "step": 9086 + }, + { + "epoch": 0.9088, + "grad_norm": 0.14909707009792328, + "learning_rate": 5.024623086793323e-07, + "loss": 0.0033, + "step": 9088 + }, + { + "epoch": 0.909, + "grad_norm": 0.06873803585767746, + "learning_rate": 5.002794847534765e-07, + "loss": 0.0053, + "step": 9090 + }, + { + "epoch": 0.9092, + "grad_norm": 0.004192798864096403, + "learning_rate": 4.981012908759941e-07, + "loss": 0.0003, + "step": 9092 + }, + { + "epoch": 0.9094, + "grad_norm": 0.0381673127412796, + "learning_rate": 4.959277281085128e-07, + "loss": 0.0006, + "step": 9094 + }, + { + "epoch": 0.9096, + "grad_norm": 0.021458936855196953, + "learning_rate": 4.937587975103997e-07, + "loss": 0.0002, + "step": 9096 + }, + { + "epoch": 0.9098, + "grad_norm": 0.014890610240399837, + "learning_rate": 4.915945001387668e-07, + "loss": 0.0006, + "step": 9098 + }, + { + "epoch": 0.91, + "grad_norm": 0.8325202465057373, + "learning_rate": 4.894348370484648e-07, + "loss": 0.0052, + "step": 9100 + }, + { + "epoch": 0.9102, + "grad_norm": 0.0023431112058460712, + "learning_rate": 4.872798092920871e-07, + "loss": 0.0073, + "step": 9102 + }, + { + "epoch": 0.9104, + "grad_norm": 0.412405401468277, + "learning_rate": 4.851294179199673e-07, + "loss": 0.0065, + "step": 9104 + }, + { + "epoch": 0.9106, + "grad_norm": 0.002860542619600892, + "learning_rate": 4.829836639801844e-07, + "loss": 0.0001, + "step": 9106 + }, + { + "epoch": 0.9108, + "grad_norm": 0.006801479030400515, + "learning_rate": 4.808425485185486e-07, + "loss": 0.0005, + "step": 9108 + }, + { + "epoch": 0.911, + "grad_norm": 0.10820533335208893, + "learning_rate": 4.787060725786141e-07, + "loss": 0.0034, + "step": 9110 + }, + { + "epoch": 0.9112, + "grad_norm": 0.001444389228709042, + "learning_rate": 4.765742372016735e-07, + "loss": 0.0012, + "step": 9112 + }, + { + "epoch": 0.9114, + "grad_norm": 0.432973712682724, + "learning_rate": 4.7444704342675673e-07, + "loss": 0.0204, + "step": 9114 + }, + { + "epoch": 0.9116, + "grad_norm": 0.03381405398249626, + "learning_rate": 4.723244922906356e-07, + "loss": 0.0003, + "step": 9116 + }, + { + "epoch": 0.9118, + "grad_norm": 0.04558473080396652, + "learning_rate": 4.702065848278126e-07, + "loss": 0.0058, + "step": 9118 + }, + { + "epoch": 0.912, + "grad_norm": 0.006005990318953991, + "learning_rate": 4.6809332207053083e-07, + "loss": 0.0178, + "step": 9120 + }, + { + "epoch": 0.9122, + "grad_norm": 0.46174949407577515, + "learning_rate": 4.659847050487687e-07, + "loss": 0.0062, + "step": 9122 + }, + { + "epoch": 0.9124, + "grad_norm": 0.006815544795244932, + "learning_rate": 4.638807347902408e-07, + "loss": 0.0001, + "step": 9124 + }, + { + "epoch": 0.9126, + "grad_norm": 0.006291653495281935, + "learning_rate": 4.6178141232039676e-07, + "loss": 0.0001, + "step": 9126 + }, + { + "epoch": 0.9128, + "grad_norm": 0.0008765861857682467, + "learning_rate": 4.596867386624215e-07, + "loss": 0.0, + "step": 9128 + }, + { + "epoch": 0.913, + "grad_norm": 0.0006343265413306653, + "learning_rate": 4.575967148372318e-07, + "loss": 0.0, + "step": 9130 + }, + { + "epoch": 0.9132, + "grad_norm": 3.982851505279541, + "learning_rate": 4.5551134186348045e-07, + "loss": 0.1235, + "step": 9132 + }, + { + "epoch": 0.9134, + "grad_norm": 0.004405516665428877, + "learning_rate": 4.534306207575545e-07, + "loss": 0.0001, + "step": 9134 + }, + { + "epoch": 0.9136, + "grad_norm": 0.005522248800843954, + "learning_rate": 4.5135455253357053e-07, + "loss": 0.0001, + "step": 9136 + }, + { + "epoch": 0.9138, + "grad_norm": 0.008160091005265713, + "learning_rate": 4.492831382033791e-07, + "loss": 0.0713, + "step": 9138 + }, + { + "epoch": 0.914, + "grad_norm": 0.05511947348713875, + "learning_rate": 4.4721637877656377e-07, + "loss": 0.0017, + "step": 9140 + }, + { + "epoch": 0.9142, + "grad_norm": 0.0026717744767665863, + "learning_rate": 4.451542752604365e-07, + "loss": 0.0002, + "step": 9142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.6478389501571655, + "learning_rate": 4.4309682866004124e-07, + "loss": 0.0042, + "step": 9144 + }, + { + "epoch": 0.9146, + "grad_norm": 0.003846412757411599, + "learning_rate": 4.4104403997815346e-07, + "loss": 0.0004, + "step": 9146 + }, + { + "epoch": 0.9148, + "grad_norm": 0.01254007127135992, + "learning_rate": 4.3899591021527743e-07, + "loss": 0.0003, + "step": 9148 + }, + { + "epoch": 0.915, + "grad_norm": 0.012208998203277588, + "learning_rate": 4.3695244036964567e-07, + "loss": 0.0001, + "step": 9150 + }, + { + "epoch": 0.9152, + "grad_norm": 0.0010226896265521646, + "learning_rate": 4.349136314372204e-07, + "loss": 0.0006, + "step": 9152 + }, + { + "epoch": 0.9154, + "grad_norm": 0.0020321242045611143, + "learning_rate": 4.3287948441169457e-07, + "loss": 0.0001, + "step": 9154 + }, + { + "epoch": 0.9156, + "grad_norm": 0.0015954102855175734, + "learning_rate": 4.308500002844862e-07, + "loss": 0.0, + "step": 9156 + }, + { + "epoch": 0.9158, + "grad_norm": 0.0012492898385971785, + "learning_rate": 4.288251800447385e-07, + "loss": 0.0005, + "step": 9158 + }, + { + "epoch": 0.916, + "grad_norm": 0.7175946831703186, + "learning_rate": 4.268050246793276e-07, + "loss": 0.0092, + "step": 9160 + }, + { + "epoch": 0.9162, + "grad_norm": 0.005767956841737032, + "learning_rate": 4.247895351728504e-07, + "loss": 0.0001, + "step": 9162 + }, + { + "epoch": 0.9164, + "grad_norm": 0.0007307982305064797, + "learning_rate": 4.2277871250763327e-07, + "loss": 0.0002, + "step": 9164 + }, + { + "epoch": 0.9166, + "grad_norm": 0.020438876003026962, + "learning_rate": 4.207725576637256e-07, + "loss": 0.0005, + "step": 9166 + }, + { + "epoch": 0.9168, + "grad_norm": 0.05337672680616379, + "learning_rate": 4.1877107161890416e-07, + "loss": 0.0005, + "step": 9168 + }, + { + "epoch": 0.917, + "grad_norm": 0.0030349232256412506, + "learning_rate": 4.167742553486676e-07, + "loss": 0.0039, + "step": 9170 + }, + { + "epoch": 0.9172, + "grad_norm": 0.0009005886968225241, + "learning_rate": 4.1478210982624055e-07, + "loss": 0.0001, + "step": 9172 + }, + { + "epoch": 0.9174, + "grad_norm": 0.007981435395777225, + "learning_rate": 4.1279463602257207e-07, + "loss": 0.0001, + "step": 9174 + }, + { + "epoch": 0.9176, + "grad_norm": 0.0016536259790882468, + "learning_rate": 4.108118349063306e-07, + "loss": 0.0001, + "step": 9176 + }, + { + "epoch": 0.9178, + "grad_norm": 0.0012642795918509364, + "learning_rate": 4.0883370744390973e-07, + "loss": 0.0001, + "step": 9178 + }, + { + "epoch": 0.918, + "grad_norm": 0.0020684448536485434, + "learning_rate": 4.068602545994249e-07, + "loss": 0.0002, + "step": 9180 + }, + { + "epoch": 0.9182, + "grad_norm": 0.002953573828563094, + "learning_rate": 4.0489147733471347e-07, + "loss": 0.0006, + "step": 9182 + }, + { + "epoch": 0.9184, + "grad_norm": 0.009420221671462059, + "learning_rate": 4.0292737660933335e-07, + "loss": 0.0005, + "step": 9184 + }, + { + "epoch": 0.9186, + "grad_norm": 0.15709254145622253, + "learning_rate": 4.009679533805633e-07, + "loss": 0.0019, + "step": 9186 + }, + { + "epoch": 0.9188, + "grad_norm": 0.005296363960951567, + "learning_rate": 3.990132086034026e-07, + "loss": 0.003, + "step": 9188 + }, + { + "epoch": 0.919, + "grad_norm": 0.0012408009497448802, + "learning_rate": 3.9706314323056936e-07, + "loss": 0.0001, + "step": 9190 + }, + { + "epoch": 0.9192, + "grad_norm": 0.02593318559229374, + "learning_rate": 3.9511775821250206e-07, + "loss": 0.0003, + "step": 9192 + }, + { + "epoch": 0.9194, + "grad_norm": 0.005263639148324728, + "learning_rate": 3.931770544973601e-07, + "loss": 0.0001, + "step": 9194 + }, + { + "epoch": 0.9196, + "grad_norm": 0.005043665878474712, + "learning_rate": 3.912410330310157e-07, + "loss": 0.0066, + "step": 9196 + }, + { + "epoch": 0.9198, + "grad_norm": 0.0010395165299996734, + "learning_rate": 3.8930969475706183e-07, + "loss": 0.0002, + "step": 9198 + }, + { + "epoch": 0.92, + "grad_norm": 0.6076346039772034, + "learning_rate": 3.8738304061681107e-07, + "loss": 0.006, + "step": 9200 + }, + { + "epoch": 0.9202, + "grad_norm": 0.0310218408703804, + "learning_rate": 3.854610715492924e-07, + "loss": 0.0003, + "step": 9202 + }, + { + "epoch": 0.9204, + "grad_norm": 0.017640557140111923, + "learning_rate": 3.835437884912474e-07, + "loss": 0.0013, + "step": 9204 + }, + { + "epoch": 0.9206, + "grad_norm": 0.055013056844472885, + "learning_rate": 3.8163119237713877e-07, + "loss": 0.0012, + "step": 9206 + }, + { + "epoch": 0.9208, + "grad_norm": 0.002831836463883519, + "learning_rate": 3.7972328413914074e-07, + "loss": 0.0001, + "step": 9208 + }, + { + "epoch": 0.921, + "grad_norm": 0.005615271162241697, + "learning_rate": 3.7782006470714614e-07, + "loss": 0.0001, + "step": 9210 + }, + { + "epoch": 0.9212, + "grad_norm": 0.007831217721104622, + "learning_rate": 3.759215350087619e-07, + "loss": 0.0002, + "step": 9212 + }, + { + "epoch": 0.9214, + "grad_norm": 0.2775794565677643, + "learning_rate": 3.7402769596930567e-07, + "loss": 0.0026, + "step": 9214 + }, + { + "epoch": 0.9216, + "grad_norm": 0.009692407213151455, + "learning_rate": 3.721385485118123e-07, + "loss": 0.014, + "step": 9216 + }, + { + "epoch": 0.9218, + "grad_norm": 0.003108486533164978, + "learning_rate": 3.7025409355702977e-07, + "loss": 0.0001, + "step": 9218 + }, + { + "epoch": 0.922, + "grad_norm": 0.1662442833185196, + "learning_rate": 3.68374332023419e-07, + "loss": 0.0032, + "step": 9220 + }, + { + "epoch": 0.9222, + "grad_norm": 0.0033893249928951263, + "learning_rate": 3.664992648271526e-07, + "loss": 0.0001, + "step": 9222 + }, + { + "epoch": 0.9224, + "grad_norm": 0.08284036815166473, + "learning_rate": 3.646288928821151e-07, + "loss": 0.0012, + "step": 9224 + }, + { + "epoch": 0.9226, + "grad_norm": 0.5032443404197693, + "learning_rate": 3.627632170999029e-07, + "loss": 0.0341, + "step": 9226 + }, + { + "epoch": 0.9228, + "grad_norm": 0.0009935481939464808, + "learning_rate": 3.609022383898242e-07, + "loss": 0.0028, + "step": 9228 + }, + { + "epoch": 0.923, + "grad_norm": 0.0031193834729492664, + "learning_rate": 3.590459576589e-07, + "loss": 0.0005, + "step": 9230 + }, + { + "epoch": 0.9232, + "grad_norm": 6.312903881072998, + "learning_rate": 3.571943758118546e-07, + "loss": 0.0542, + "step": 9232 + }, + { + "epoch": 0.9234, + "grad_norm": 0.002392771653831005, + "learning_rate": 3.553474937511281e-07, + "loss": 0.2407, + "step": 9234 + }, + { + "epoch": 0.9236, + "grad_norm": 0.019273338839411736, + "learning_rate": 3.5350531237686723e-07, + "loss": 0.0003, + "step": 9236 + }, + { + "epoch": 0.9238, + "grad_norm": 0.004903374705463648, + "learning_rate": 3.516678325869316e-07, + "loss": 0.0001, + "step": 9238 + }, + { + "epoch": 0.924, + "grad_norm": 0.005206699948757887, + "learning_rate": 3.498350552768859e-07, + "loss": 0.0001, + "step": 9240 + }, + { + "epoch": 0.9242, + "grad_norm": 0.14477728307247162, + "learning_rate": 3.480069813400022e-07, + "loss": 0.0023, + "step": 9242 + }, + { + "epoch": 0.9244, + "grad_norm": 0.1340457797050476, + "learning_rate": 3.4618361166726123e-07, + "loss": 0.0017, + "step": 9244 + }, + { + "epoch": 0.9246, + "grad_norm": 0.047222983092069626, + "learning_rate": 3.4436494714735313e-07, + "loss": 0.0005, + "step": 9246 + }, + { + "epoch": 0.9248, + "grad_norm": 0.07752601057291031, + "learning_rate": 3.4255098866667114e-07, + "loss": 0.0014, + "step": 9248 + }, + { + "epoch": 0.925, + "grad_norm": 1.0749626159667969, + "learning_rate": 3.4074173710931804e-07, + "loss": 0.0605, + "step": 9250 + }, + { + "epoch": 0.9252, + "grad_norm": 0.0023080757819116116, + "learning_rate": 3.3893719335709953e-07, + "loss": 0.0022, + "step": 9252 + }, + { + "epoch": 0.9254, + "grad_norm": 0.002027643145993352, + "learning_rate": 3.3713735828952985e-07, + "loss": 0.0118, + "step": 9254 + }, + { + "epoch": 0.9256, + "grad_norm": 0.036269430071115494, + "learning_rate": 3.3534223278382405e-07, + "loss": 0.0005, + "step": 9256 + }, + { + "epoch": 0.9258, + "grad_norm": 0.004972180817276239, + "learning_rate": 3.3355181771490776e-07, + "loss": 0.0001, + "step": 9258 + }, + { + "epoch": 0.926, + "grad_norm": 0.0004680544079747051, + "learning_rate": 3.3176611395540625e-07, + "loss": 0.0001, + "step": 9260 + }, + { + "epoch": 0.9262, + "grad_norm": 0.01292356289923191, + "learning_rate": 3.2998512237565005e-07, + "loss": 0.0002, + "step": 9262 + }, + { + "epoch": 0.9264, + "grad_norm": 0.027166588231921196, + "learning_rate": 3.282088438436715e-07, + "loss": 0.0151, + "step": 9264 + }, + { + "epoch": 0.9266, + "grad_norm": 0.006498547270894051, + "learning_rate": 3.2643727922520905e-07, + "loss": 0.0001, + "step": 9266 + }, + { + "epoch": 0.9268, + "grad_norm": 0.0014640835579484701, + "learning_rate": 3.246704293837011e-07, + "loss": 0.0027, + "step": 9268 + }, + { + "epoch": 0.927, + "grad_norm": 0.0003628222912084311, + "learning_rate": 3.2290829518028867e-07, + "loss": 0.0095, + "step": 9270 + }, + { + "epoch": 0.9272, + "grad_norm": 0.00041137568769045174, + "learning_rate": 3.211508774738137e-07, + "loss": 0.0, + "step": 9272 + }, + { + "epoch": 0.9274, + "grad_norm": 0.1654614806175232, + "learning_rate": 3.19398177120821e-07, + "loss": 0.0018, + "step": 9274 + }, + { + "epoch": 0.9276, + "grad_norm": 0.006245431490242481, + "learning_rate": 3.1765019497555617e-07, + "loss": 0.0005, + "step": 9276 + }, + { + "epoch": 0.9278, + "grad_norm": 0.03237215429544449, + "learning_rate": 3.1590693188996324e-07, + "loss": 0.0006, + "step": 9278 + }, + { + "epoch": 0.928, + "grad_norm": 0.001990276388823986, + "learning_rate": 3.1416838871368925e-07, + "loss": 0.0, + "step": 9280 + }, + { + "epoch": 0.9282, + "grad_norm": 4.570094108581543, + "learning_rate": 3.1243456629407644e-07, + "loss": 0.0642, + "step": 9282 + }, + { + "epoch": 0.9284, + "grad_norm": 0.01608077436685562, + "learning_rate": 3.10705465476171e-07, + "loss": 0.0006, + "step": 9284 + }, + { + "epoch": 0.9286, + "grad_norm": 0.09085498750209808, + "learning_rate": 3.0898108710271437e-07, + "loss": 0.0023, + "step": 9286 + }, + { + "epoch": 0.9288, + "grad_norm": 0.01598670892417431, + "learning_rate": 3.072614320141487e-07, + "loss": 0.0002, + "step": 9288 + }, + { + "epoch": 0.929, + "grad_norm": 0.005314527545124292, + "learning_rate": 3.0554650104861137e-07, + "loss": 0.0001, + "step": 9290 + }, + { + "epoch": 0.9292, + "grad_norm": 0.00790812261402607, + "learning_rate": 3.0383629504194047e-07, + "loss": 0.0001, + "step": 9292 + }, + { + "epoch": 0.9294, + "grad_norm": 3.4601247310638428, + "learning_rate": 3.0213081482766803e-07, + "loss": 0.0659, + "step": 9294 + }, + { + "epoch": 0.9296, + "grad_norm": 0.0004404897044878453, + "learning_rate": 3.00430061237027e-07, + "loss": 0.0004, + "step": 9296 + }, + { + "epoch": 0.9298, + "grad_norm": 0.025838039815425873, + "learning_rate": 2.987340350989421e-07, + "loss": 0.0005, + "step": 9298 + }, + { + "epoch": 0.93, + "grad_norm": 0.0033167453948408365, + "learning_rate": 2.970427372400353e-07, + "loss": 0.002, + "step": 9300 + }, + { + "epoch": 0.9302, + "grad_norm": 1.9716365337371826, + "learning_rate": 2.9535616848462624e-07, + "loss": 0.0103, + "step": 9302 + }, + { + "epoch": 0.9304, + "grad_norm": 0.00519172428175807, + "learning_rate": 2.936743296547273e-07, + "loss": 0.1017, + "step": 9304 + }, + { + "epoch": 0.9306, + "grad_norm": 0.18141263723373413, + "learning_rate": 2.919972215700462e-07, + "loss": 0.0015, + "step": 9306 + }, + { + "epoch": 0.9308, + "grad_norm": 0.04096468538045883, + "learning_rate": 2.9032484504798454e-07, + "loss": 0.0005, + "step": 9308 + }, + { + "epoch": 0.931, + "grad_norm": 0.3931426405906677, + "learning_rate": 2.8865720090364037e-07, + "loss": 0.0061, + "step": 9310 + }, + { + "epoch": 0.9312, + "grad_norm": 0.0005480980616994202, + "learning_rate": 2.8699428994980017e-07, + "loss": 0.0, + "step": 9312 + }, + { + "epoch": 0.9314, + "grad_norm": 0.0004938548081554472, + "learning_rate": 2.8533611299694784e-07, + "loss": 0.0002, + "step": 9314 + }, + { + "epoch": 0.9316, + "grad_norm": 0.5617260932922363, + "learning_rate": 2.836826708532603e-07, + "loss": 0.0059, + "step": 9316 + }, + { + "epoch": 0.9318, + "grad_norm": 0.013287640176713467, + "learning_rate": 2.8203396432460507e-07, + "loss": 0.0003, + "step": 9318 + }, + { + "epoch": 0.932, + "grad_norm": 0.023277277126908302, + "learning_rate": 2.8038999421453827e-07, + "loss": 0.0003, + "step": 9320 + }, + { + "epoch": 0.9322, + "grad_norm": 0.007376612164080143, + "learning_rate": 2.7875076132431344e-07, + "loss": 0.0002, + "step": 9322 + }, + { + "epoch": 0.9324, + "grad_norm": 0.0021899580024182796, + "learning_rate": 2.771162664528726e-07, + "loss": 0.0, + "step": 9324 + }, + { + "epoch": 0.9326, + "grad_norm": 0.2603866159915924, + "learning_rate": 2.7548651039684847e-07, + "loss": 0.0081, + "step": 9326 + }, + { + "epoch": 0.9328, + "grad_norm": 0.26747721433639526, + "learning_rate": 2.7386149395056463e-07, + "loss": 0.0025, + "step": 9328 + }, + { + "epoch": 0.933, + "grad_norm": 0.053543783724308014, + "learning_rate": 2.7224121790603517e-07, + "loss": 0.005, + "step": 9330 + }, + { + "epoch": 0.9332, + "grad_norm": 0.0036784906405955553, + "learning_rate": 2.7062568305295967e-07, + "loss": 0.0002, + "step": 9332 + }, + { + "epoch": 0.9334, + "grad_norm": 0.0009925945196300745, + "learning_rate": 2.6901489017873375e-07, + "loss": 0.0003, + "step": 9334 + }, + { + "epoch": 0.9336, + "grad_norm": 0.004439488984644413, + "learning_rate": 2.6740884006843826e-07, + "loss": 0.0001, + "step": 9336 + }, + { + "epoch": 0.9338, + "grad_norm": 0.007651673164218664, + "learning_rate": 2.6580753350484044e-07, + "loss": 0.0001, + "step": 9338 + }, + { + "epoch": 0.934, + "grad_norm": 0.008723709732294083, + "learning_rate": 2.6421097126839714e-07, + "loss": 0.0001, + "step": 9340 + }, + { + "epoch": 0.9342, + "grad_norm": 0.044701967388391495, + "learning_rate": 2.626191541372558e-07, + "loss": 0.0004, + "step": 9342 + }, + { + "epoch": 0.9344, + "grad_norm": 0.0007819094462320209, + "learning_rate": 2.6103208288724815e-07, + "loss": 0.0, + "step": 9344 + }, + { + "epoch": 0.9346, + "grad_norm": 0.0053397114388644695, + "learning_rate": 2.59449758291892e-07, + "loss": 0.0004, + "step": 9346 + }, + { + "epoch": 0.9348, + "grad_norm": 0.31216931343078613, + "learning_rate": 2.57872181122395e-07, + "loss": 0.0055, + "step": 9348 + }, + { + "epoch": 0.935, + "grad_norm": 0.01422576792538166, + "learning_rate": 2.5629935214764866e-07, + "loss": 0.0004, + "step": 9350 + }, + { + "epoch": 0.9352, + "grad_norm": 0.006394328083842993, + "learning_rate": 2.547312721342277e-07, + "loss": 0.0003, + "step": 9352 + }, + { + "epoch": 0.9354, + "grad_norm": 0.8618606328964233, + "learning_rate": 2.5316794184640056e-07, + "loss": 0.0462, + "step": 9354 + }, + { + "epoch": 0.9356, + "grad_norm": 0.011838570237159729, + "learning_rate": 2.516093620461124e-07, + "loss": 0.0004, + "step": 9356 + }, + { + "epoch": 0.9358, + "grad_norm": 0.0035455210600048304, + "learning_rate": 2.500555334929955e-07, + "loss": 0.0001, + "step": 9358 + }, + { + "epoch": 0.936, + "grad_norm": 0.023324856534600258, + "learning_rate": 2.4850645694436736e-07, + "loss": 0.0004, + "step": 9360 + }, + { + "epoch": 0.9362, + "grad_norm": 0.007477073930203915, + "learning_rate": 2.4696213315523074e-07, + "loss": 0.0008, + "step": 9362 + }, + { + "epoch": 0.9364, + "grad_norm": 0.0005978263216093183, + "learning_rate": 2.4542256287826915e-07, + "loss": 0.0, + "step": 9364 + }, + { + "epoch": 0.9366, + "grad_norm": 0.6314413547515869, + "learning_rate": 2.4388774686385007e-07, + "loss": 0.0049, + "step": 9366 + }, + { + "epoch": 0.9368, + "grad_norm": 0.04643668979406357, + "learning_rate": 2.423576858600252e-07, + "loss": 0.0013, + "step": 9368 + }, + { + "epoch": 0.937, + "grad_norm": 0.014583180658519268, + "learning_rate": 2.4083238061252565e-07, + "loss": 0.0035, + "step": 9370 + }, + { + "epoch": 0.9372, + "grad_norm": 0.18054738640785217, + "learning_rate": 2.3931183186477026e-07, + "loss": 0.0037, + "step": 9372 + }, + { + "epoch": 0.9374, + "grad_norm": 0.0025223020929843187, + "learning_rate": 2.3779604035785277e-07, + "loss": 0.0001, + "step": 9374 + }, + { + "epoch": 0.9376, + "grad_norm": 0.004421973135322332, + "learning_rate": 2.3628500683055222e-07, + "loss": 0.0001, + "step": 9376 + }, + { + "epoch": 0.9378, + "grad_norm": 0.14030815660953522, + "learning_rate": 2.3477873201932733e-07, + "loss": 0.003, + "step": 9378 + }, + { + "epoch": 0.938, + "grad_norm": 0.0037309511099010706, + "learning_rate": 2.332772166583208e-07, + "loss": 0.0455, + "step": 9380 + }, + { + "epoch": 0.9382, + "grad_norm": 0.000732988235540688, + "learning_rate": 2.3178046147935173e-07, + "loss": 0.0021, + "step": 9382 + }, + { + "epoch": 0.9384, + "grad_norm": 0.0012078880099579692, + "learning_rate": 2.3028846721191878e-07, + "loss": 0.0, + "step": 9384 + }, + { + "epoch": 0.9386, + "grad_norm": 0.0022729667834937572, + "learning_rate": 2.288012345832047e-07, + "loss": 0.0001, + "step": 9386 + }, + { + "epoch": 0.9388, + "grad_norm": 0.40486547350883484, + "learning_rate": 2.273187643180652e-07, + "loss": 0.0147, + "step": 9388 + }, + { + "epoch": 0.939, + "grad_norm": 0.7548208832740784, + "learning_rate": 2.2584105713904126e-07, + "loss": 0.0231, + "step": 9390 + }, + { + "epoch": 0.9392, + "grad_norm": 0.5265133380889893, + "learning_rate": 2.2436811376634893e-07, + "loss": 0.011, + "step": 9392 + }, + { + "epoch": 0.9394, + "grad_norm": 0.002443412085995078, + "learning_rate": 2.2289993491788065e-07, + "loss": 0.0002, + "step": 9394 + }, + { + "epoch": 0.9396, + "grad_norm": 0.02507212944328785, + "learning_rate": 2.214365213092118e-07, + "loss": 0.0004, + "step": 9396 + }, + { + "epoch": 0.9398, + "grad_norm": 0.015932569280266762, + "learning_rate": 2.1997787365358958e-07, + "loss": 0.0035, + "step": 9398 + }, + { + "epoch": 0.94, + "grad_norm": 0.0029559119138866663, + "learning_rate": 2.1852399266194312e-07, + "loss": 0.0865, + "step": 9400 + }, + { + "epoch": 0.9402, + "grad_norm": 0.001254458213225007, + "learning_rate": 2.1707487904287672e-07, + "loss": 0.0001, + "step": 9402 + }, + { + "epoch": 0.9404, + "grad_norm": 0.1917531043291092, + "learning_rate": 2.1563053350266983e-07, + "loss": 0.0133, + "step": 9404 + }, + { + "epoch": 0.9406, + "grad_norm": 0.015932146459817886, + "learning_rate": 2.1419095674527934e-07, + "loss": 0.0002, + "step": 9406 + }, + { + "epoch": 0.9408, + "grad_norm": 0.026388367637991905, + "learning_rate": 2.1275614947233624e-07, + "loss": 0.0003, + "step": 9408 + }, + { + "epoch": 0.941, + "grad_norm": 0.00107996363658458, + "learning_rate": 2.1132611238315004e-07, + "loss": 0.0001, + "step": 9410 + }, + { + "epoch": 0.9412, + "grad_norm": 0.0006051200907677412, + "learning_rate": 2.0990084617470207e-07, + "loss": 0.0002, + "step": 9412 + }, + { + "epoch": 0.9414, + "grad_norm": 0.04466747120022774, + "learning_rate": 2.0848035154165113e-07, + "loss": 0.001, + "step": 9414 + }, + { + "epoch": 0.9416, + "grad_norm": 0.0016761892475187778, + "learning_rate": 2.0706462917632676e-07, + "loss": 0.0102, + "step": 9416 + }, + { + "epoch": 0.9418, + "grad_norm": 0.01282675564289093, + "learning_rate": 2.0565367976873584e-07, + "loss": 0.0002, + "step": 9418 + }, + { + "epoch": 0.942, + "grad_norm": 0.004467017948627472, + "learning_rate": 2.0424750400655947e-07, + "loss": 0.0142, + "step": 9420 + }, + { + "epoch": 0.9422, + "grad_norm": 0.001015584566630423, + "learning_rate": 2.0284610257514936e-07, + "loss": 0.0, + "step": 9422 + }, + { + "epoch": 0.9424, + "grad_norm": 0.004995131399482489, + "learning_rate": 2.014494761575314e-07, + "loss": 0.0246, + "step": 9424 + }, + { + "epoch": 0.9426, + "grad_norm": 0.0009096034336835146, + "learning_rate": 2.0005762543440444e-07, + "loss": 0.0022, + "step": 9426 + }, + { + "epoch": 0.9428, + "grad_norm": 0.1864551454782486, + "learning_rate": 1.9867055108414023e-07, + "loss": 0.0668, + "step": 9428 + }, + { + "epoch": 0.943, + "grad_norm": 0.04033835604786873, + "learning_rate": 1.9728825378278248e-07, + "loss": 0.0008, + "step": 9430 + }, + { + "epoch": 0.9432, + "grad_norm": 1.1340100765228271, + "learning_rate": 1.9591073420404338e-07, + "loss": 0.0118, + "step": 9432 + }, + { + "epoch": 0.9434, + "grad_norm": 0.0008874423801898956, + "learning_rate": 1.9453799301931253e-07, + "loss": 0.0, + "step": 9434 + }, + { + "epoch": 0.9436, + "grad_norm": 3.660346031188965, + "learning_rate": 1.9317003089764365e-07, + "loss": 0.0943, + "step": 9436 + }, + { + "epoch": 0.9438, + "grad_norm": 0.0005071136984042823, + "learning_rate": 1.9180684850576893e-07, + "loss": 0.0104, + "step": 9438 + }, + { + "epoch": 0.944, + "grad_norm": 0.0008760679047554731, + "learning_rate": 1.9044844650808468e-07, + "loss": 0.0005, + "step": 9440 + }, + { + "epoch": 0.9442, + "grad_norm": 0.00235182698816061, + "learning_rate": 1.8909482556666026e-07, + "loss": 0.0095, + "step": 9442 + }, + { + "epoch": 0.9444, + "grad_norm": 0.007288962136954069, + "learning_rate": 1.877459863412323e-07, + "loss": 0.0001, + "step": 9444 + }, + { + "epoch": 0.9446, + "grad_norm": 0.001927840057760477, + "learning_rate": 1.8640192948921053e-07, + "loss": 0.0, + "step": 9446 + }, + { + "epoch": 0.9448, + "grad_norm": 0.0008169939392246306, + "learning_rate": 1.8506265566567095e-07, + "loss": 0.0, + "step": 9448 + }, + { + "epoch": 0.945, + "grad_norm": 0.0006360902916640043, + "learning_rate": 1.8372816552336025e-07, + "loss": 0.0, + "step": 9450 + }, + { + "epoch": 0.9452, + "grad_norm": 0.0010668913600966334, + "learning_rate": 1.8239845971269266e-07, + "loss": 0.0614, + "step": 9452 + }, + { + "epoch": 0.9454, + "grad_norm": 0.10900649428367615, + "learning_rate": 1.8107353888175083e-07, + "loss": 0.0015, + "step": 9454 + }, + { + "epoch": 0.9456, + "grad_norm": 0.003889723215252161, + "learning_rate": 1.7975340367628269e-07, + "loss": 0.0001, + "step": 9456 + }, + { + "epoch": 0.9458, + "grad_norm": 0.012136850506067276, + "learning_rate": 1.7843805473970798e-07, + "loss": 0.0001, + "step": 9458 + }, + { + "epoch": 0.946, + "grad_norm": 31.623138427734375, + "learning_rate": 1.7712749271311392e-07, + "loss": 0.1882, + "step": 9460 + }, + { + "epoch": 0.9462, + "grad_norm": 0.03352311626076698, + "learning_rate": 1.758217182352495e-07, + "loss": 0.0002, + "step": 9462 + }, + { + "epoch": 0.9464, + "grad_norm": 0.011460027657449245, + "learning_rate": 1.7452073194253237e-07, + "loss": 0.0003, + "step": 9464 + }, + { + "epoch": 0.9466, + "grad_norm": 0.000567289418540895, + "learning_rate": 1.7322453446905084e-07, + "loss": 0.0087, + "step": 9466 + }, + { + "epoch": 0.9468, + "grad_norm": 0.017951754853129387, + "learning_rate": 1.719331264465529e-07, + "loss": 0.0002, + "step": 9468 + }, + { + "epoch": 0.947, + "grad_norm": 0.10238996148109436, + "learning_rate": 1.706465085044584e-07, + "loss": 0.0013, + "step": 9470 + }, + { + "epoch": 0.9472, + "grad_norm": 0.012017922475934029, + "learning_rate": 1.6936468126984573e-07, + "loss": 0.0002, + "step": 9472 + }, + { + "epoch": 0.9474, + "grad_norm": 0.00073683459777385, + "learning_rate": 1.680876453674629e-07, + "loss": 0.0, + "step": 9474 + }, + { + "epoch": 0.9476, + "grad_norm": 0.008065691217780113, + "learning_rate": 1.668154014197243e-07, + "loss": 0.0006, + "step": 9476 + }, + { + "epoch": 0.9478, + "grad_norm": 2.444490671157837, + "learning_rate": 1.6554795004670389e-07, + "loss": 0.0709, + "step": 9478 + }, + { + "epoch": 0.948, + "grad_norm": 0.0029757532756775618, + "learning_rate": 1.6428529186614195e-07, + "loss": 0.0009, + "step": 9480 + }, + { + "epoch": 0.9482, + "grad_norm": 0.04249466955661774, + "learning_rate": 1.6302742749344292e-07, + "loss": 0.0005, + "step": 9482 + }, + { + "epoch": 0.9484, + "grad_norm": 0.04048140347003937, + "learning_rate": 1.6177435754167413e-07, + "loss": 0.0009, + "step": 9484 + }, + { + "epoch": 0.9486, + "grad_norm": 0.0025531023275107145, + "learning_rate": 1.605260826215682e-07, + "loss": 0.0003, + "step": 9486 + }, + { + "epoch": 0.9488, + "grad_norm": 0.004982620012015104, + "learning_rate": 1.5928260334151847e-07, + "loss": 0.0001, + "step": 9488 + }, + { + "epoch": 0.949, + "grad_norm": 0.006177753675729036, + "learning_rate": 1.580439203075812e-07, + "loss": 0.0002, + "step": 9490 + }, + { + "epoch": 0.9492, + "grad_norm": 0.000622976163867861, + "learning_rate": 1.5681003412347573e-07, + "loss": 0.0007, + "step": 9492 + }, + { + "epoch": 0.9494, + "grad_norm": 0.08549576252698898, + "learning_rate": 1.555809453905821e-07, + "loss": 0.0008, + "step": 9494 + }, + { + "epoch": 0.9496, + "grad_norm": 0.24609702825546265, + "learning_rate": 1.543566547079467e-07, + "loss": 0.0042, + "step": 9496 + }, + { + "epoch": 0.9498, + "grad_norm": 0.048033785074949265, + "learning_rate": 1.5313716267226997e-07, + "loss": 0.0008, + "step": 9498 + }, + { + "epoch": 0.95, + "grad_norm": 0.0009441141155548394, + "learning_rate": 1.519224698779198e-07, + "loss": 0.1928, + "step": 9500 + }, + { + "epoch": 0.9502, + "grad_norm": 0.004390642512589693, + "learning_rate": 1.5071257691692153e-07, + "loss": 0.0271, + "step": 9502 + }, + { + "epoch": 0.9504, + "grad_norm": 0.0009063211618922651, + "learning_rate": 1.4950748437896235e-07, + "loss": 0.0001, + "step": 9504 + }, + { + "epoch": 0.9506, + "grad_norm": 0.026389766484498978, + "learning_rate": 1.483071928513913e-07, + "loss": 0.0005, + "step": 9506 + }, + { + "epoch": 0.9508, + "grad_norm": 0.2349082976579666, + "learning_rate": 1.4711170291921485e-07, + "loss": 0.0015, + "step": 9508 + }, + { + "epoch": 0.951, + "grad_norm": 0.24739402532577515, + "learning_rate": 1.4592101516509916e-07, + "loss": 0.0019, + "step": 9510 + }, + { + "epoch": 0.9512, + "grad_norm": 0.011877845972776413, + "learning_rate": 1.4473513016937223e-07, + "loss": 0.0002, + "step": 9512 + }, + { + "epoch": 0.9514, + "grad_norm": 0.0009022158337756991, + "learning_rate": 1.4355404851001953e-07, + "loss": 0.0318, + "step": 9514 + }, + { + "epoch": 0.9516, + "grad_norm": 0.08477446436882019, + "learning_rate": 1.4237777076268723e-07, + "loss": 0.0012, + "step": 9516 + }, + { + "epoch": 0.9518, + "grad_norm": 0.3093426525592804, + "learning_rate": 1.4120629750067672e-07, + "loss": 0.0049, + "step": 9518 + }, + { + "epoch": 0.952, + "grad_norm": 0.0002375059702899307, + "learning_rate": 1.400396292949513e-07, + "loss": 0.0002, + "step": 9520 + }, + { + "epoch": 0.9522, + "grad_norm": 0.0024085494223982096, + "learning_rate": 1.3887776671412943e-07, + "loss": 0.0027, + "step": 9522 + }, + { + "epoch": 0.9524, + "grad_norm": 1.8135734796524048, + "learning_rate": 1.377207103244904e-07, + "loss": 0.0092, + "step": 9524 + }, + { + "epoch": 0.9526, + "grad_norm": 0.016087913885712624, + "learning_rate": 1.3656846068996976e-07, + "loss": 0.0033, + "step": 9526 + }, + { + "epoch": 0.9528, + "grad_norm": 3.135099411010742, + "learning_rate": 1.3542101837215826e-07, + "loss": 0.0814, + "step": 9528 + }, + { + "epoch": 0.953, + "grad_norm": 0.01986018940806389, + "learning_rate": 1.3427838393030634e-07, + "loss": 0.0004, + "step": 9530 + }, + { + "epoch": 0.9532, + "grad_norm": 0.008074639365077019, + "learning_rate": 1.3314055792131964e-07, + "loss": 0.0168, + "step": 9532 + }, + { + "epoch": 0.9534, + "grad_norm": 0.1766495555639267, + "learning_rate": 1.320075408997612e-07, + "loss": 0.0157, + "step": 9534 + }, + { + "epoch": 0.9536, + "grad_norm": 0.08442287147045135, + "learning_rate": 1.308793334178493e-07, + "loss": 0.0009, + "step": 9536 + }, + { + "epoch": 0.9538, + "grad_norm": 0.001066313125193119, + "learning_rate": 1.2975593602545966e-07, + "loss": 0.0005, + "step": 9538 + }, + { + "epoch": 0.954, + "grad_norm": 0.002294759498909116, + "learning_rate": 1.2863734927012094e-07, + "loss": 0.0001, + "step": 9540 + }, + { + "epoch": 0.9542, + "grad_norm": 1.704738974571228, + "learning_rate": 1.275235736970193e-07, + "loss": 0.0249, + "step": 9542 + }, + { + "epoch": 0.9544, + "grad_norm": 0.04288797080516815, + "learning_rate": 1.26414609848996e-07, + "loss": 0.0007, + "step": 9544 + }, + { + "epoch": 0.9546, + "grad_norm": 4.062255859375, + "learning_rate": 1.2531045826654652e-07, + "loss": 0.0641, + "step": 9546 + }, + { + "epoch": 0.9548, + "grad_norm": 0.0034237587824463844, + "learning_rate": 1.242111194878215e-07, + "loss": 0.0001, + "step": 9548 + }, + { + "epoch": 0.955, + "grad_norm": 0.33842501044273376, + "learning_rate": 1.231165940486234e-07, + "loss": 0.0033, + "step": 9550 + }, + { + "epoch": 0.9552, + "grad_norm": 0.0034394455142319202, + "learning_rate": 1.2202688248241113e-07, + "loss": 0.0022, + "step": 9552 + }, + { + "epoch": 0.9554, + "grad_norm": 0.001072950311936438, + "learning_rate": 1.2094198532029754e-07, + "loss": 0.0001, + "step": 9554 + }, + { + "epoch": 0.9556, + "grad_norm": 0.1431407779455185, + "learning_rate": 1.1986190309104861e-07, + "loss": 0.0009, + "step": 9556 + }, + { + "epoch": 0.9558, + "grad_norm": 0.015973830595612526, + "learning_rate": 1.1878663632108322e-07, + "loss": 0.0002, + "step": 9558 + }, + { + "epoch": 0.956, + "grad_norm": 0.013407652266323566, + "learning_rate": 1.1771618553447217e-07, + "loss": 0.0005, + "step": 9560 + }, + { + "epoch": 0.9562, + "grad_norm": 1.1709768772125244, + "learning_rate": 1.1665055125294033e-07, + "loss": 0.0073, + "step": 9562 + }, + { + "epoch": 0.9564, + "grad_norm": 0.009171949699521065, + "learning_rate": 1.1558973399586671e-07, + "loss": 0.0145, + "step": 9564 + }, + { + "epoch": 0.9566, + "grad_norm": 0.0035213064402341843, + "learning_rate": 1.1453373428027992e-07, + "loss": 0.0117, + "step": 9566 + }, + { + "epoch": 0.9568, + "grad_norm": 1.1558613777160645, + "learning_rate": 1.134825526208605e-07, + "loss": 0.033, + "step": 9568 + }, + { + "epoch": 0.957, + "grad_norm": 2.7924442291259766, + "learning_rate": 1.1243618952994195e-07, + "loss": 0.0589, + "step": 9570 + }, + { + "epoch": 0.9572, + "grad_norm": 3.1730668544769287, + "learning_rate": 1.1139464551750857e-07, + "loss": 0.0614, + "step": 9572 + }, + { + "epoch": 0.9574, + "grad_norm": 0.011718553490936756, + "learning_rate": 1.1035792109119758e-07, + "loss": 0.0002, + "step": 9574 + }, + { + "epoch": 0.9576, + "grad_norm": 0.0009756018407642841, + "learning_rate": 1.0932601675629595e-07, + "loss": 0.0007, + "step": 9576 + }, + { + "epoch": 0.9578, + "grad_norm": 0.002148932544514537, + "learning_rate": 1.0829893301573913e-07, + "loss": 0.0046, + "step": 9578 + }, + { + "epoch": 0.958, + "grad_norm": 0.1613122969865799, + "learning_rate": 1.0727667037011668e-07, + "loss": 0.0029, + "step": 9580 + }, + { + "epoch": 0.9582, + "grad_norm": 0.679431676864624, + "learning_rate": 1.0625922931766786e-07, + "loss": 0.0078, + "step": 9582 + }, + { + "epoch": 0.9584, + "grad_norm": 0.040964752435684204, + "learning_rate": 1.052466103542793e-07, + "loss": 0.001, + "step": 9584 + }, + { + "epoch": 0.9586, + "grad_norm": 0.0032893477473407984, + "learning_rate": 1.0423881397349067e-07, + "loss": 0.0001, + "step": 9586 + }, + { + "epoch": 0.9588, + "grad_norm": 0.011994869448244572, + "learning_rate": 1.0323584066648795e-07, + "loss": 0.0278, + "step": 9588 + }, + { + "epoch": 0.959, + "grad_norm": 0.008879956789314747, + "learning_rate": 1.0223769092211012e-07, + "loss": 0.0002, + "step": 9590 + }, + { + "epoch": 0.9592, + "grad_norm": 0.10344457626342773, + "learning_rate": 1.0124436522684244e-07, + "loss": 0.0156, + "step": 9592 + }, + { + "epoch": 0.9594, + "grad_norm": 0.0033746720291674137, + "learning_rate": 1.002558640648199e-07, + "loss": 0.0002, + "step": 9594 + }, + { + "epoch": 0.9596, + "grad_norm": 0.007138144690543413, + "learning_rate": 9.9272187917826e-08, + "loss": 0.0002, + "step": 9596 + }, + { + "epoch": 0.9598, + "grad_norm": 0.0024950613733381033, + "learning_rate": 9.829333726529056e-08, + "loss": 0.0001, + "step": 9598 + }, + { + "epoch": 0.96, + "grad_norm": 0.05406169220805168, + "learning_rate": 9.731931258429638e-08, + "loss": 0.007, + "step": 9600 + }, + { + "epoch": 0.9602, + "grad_norm": 0.006694332230836153, + "learning_rate": 9.635011434957153e-08, + "loss": 0.0002, + "step": 9602 + }, + { + "epoch": 0.9604, + "grad_norm": 0.0032262636814266443, + "learning_rate": 9.538574303348813e-08, + "loss": 0.0001, + "step": 9604 + }, + { + "epoch": 0.9606, + "grad_norm": 0.0026647774502635, + "learning_rate": 9.442619910607131e-08, + "loss": 0.0001, + "step": 9606 + }, + { + "epoch": 0.9608, + "grad_norm": 0.010079916566610336, + "learning_rate": 9.347148303499143e-08, + "loss": 0.0014, + "step": 9608 + }, + { + "epoch": 0.961, + "grad_norm": 0.00014433097385335714, + "learning_rate": 9.252159528556404e-08, + "loss": 0.0003, + "step": 9610 + }, + { + "epoch": 0.9612, + "grad_norm": 0.00117563980165869, + "learning_rate": 9.157653632075435e-08, + "loss": 0.0002, + "step": 9612 + }, + { + "epoch": 0.9614, + "grad_norm": 0.009321482852101326, + "learning_rate": 9.063630660117172e-08, + "loss": 0.0033, + "step": 9614 + }, + { + "epoch": 0.9616, + "grad_norm": 0.050532303750514984, + "learning_rate": 8.970090658507291e-08, + "loss": 0.011, + "step": 9616 + }, + { + "epoch": 0.9618, + "grad_norm": 0.0025306011084467173, + "learning_rate": 8.877033672835988e-08, + "loss": 0.0007, + "step": 9618 + }, + { + "epoch": 0.962, + "grad_norm": 0.7344707250595093, + "learning_rate": 8.784459748458318e-08, + "loss": 0.0515, + "step": 9620 + }, + { + "epoch": 0.9622, + "grad_norm": 0.0051144822500646114, + "learning_rate": 8.692368930493522e-08, + "loss": 0.0073, + "step": 9622 + }, + { + "epoch": 0.9624, + "grad_norm": 0.0791313499212265, + "learning_rate": 8.600761263825475e-08, + "loss": 0.0049, + "step": 9624 + }, + { + "epoch": 0.9626, + "grad_norm": 0.009289727546274662, + "learning_rate": 8.509636793102683e-08, + "loss": 0.0002, + "step": 9626 + }, + { + "epoch": 0.9628, + "grad_norm": 0.0006142943748272955, + "learning_rate": 8.418995562738286e-08, + "loss": 0.0001, + "step": 9628 + }, + { + "epoch": 0.963, + "grad_norm": 0.004864751826971769, + "learning_rate": 8.328837616909612e-08, + "loss": 0.0003, + "step": 9630 + }, + { + "epoch": 0.9632, + "grad_norm": 0.097641721367836, + "learning_rate": 8.239162999558403e-08, + "loss": 0.0015, + "step": 9632 + }, + { + "epoch": 0.9634, + "grad_norm": 0.0361097976565361, + "learning_rate": 8.149971754391251e-08, + "loss": 0.0006, + "step": 9634 + }, + { + "epoch": 0.9636, + "grad_norm": 0.0021181763149797916, + "learning_rate": 8.061263924878604e-08, + "loss": 0.0002, + "step": 9636 + }, + { + "epoch": 0.9638, + "grad_norm": 0.0024758130311965942, + "learning_rate": 7.973039554255768e-08, + "loss": 0.0021, + "step": 9638 + }, + { + "epoch": 0.964, + "grad_norm": 0.008334488607943058, + "learning_rate": 7.885298685522235e-08, + "loss": 0.0005, + "step": 9640 + }, + { + "epoch": 0.9642, + "grad_norm": 0.003025052137672901, + "learning_rate": 7.798041361441688e-08, + "loss": 0.0688, + "step": 9642 + }, + { + "epoch": 0.9644, + "grad_norm": 0.004908234346657991, + "learning_rate": 7.71126762454233e-08, + "loss": 0.0004, + "step": 9644 + }, + { + "epoch": 0.9646, + "grad_norm": 0.001810424611903727, + "learning_rate": 7.624977517116772e-08, + "loss": 0.0635, + "step": 9646 + }, + { + "epoch": 0.9648, + "grad_norm": 0.0019908135291188955, + "learning_rate": 7.539171081221597e-08, + "loss": 0.0001, + "step": 9648 + }, + { + "epoch": 0.965, + "grad_norm": 0.005871981382369995, + "learning_rate": 7.453848358678018e-08, + "loss": 0.0001, + "step": 9650 + }, + { + "epoch": 0.9652, + "grad_norm": 0.04634464159607887, + "learning_rate": 7.369009391070992e-08, + "loss": 0.0006, + "step": 9652 + }, + { + "epoch": 0.9654, + "grad_norm": 0.0013069580309092999, + "learning_rate": 7.284654219750332e-08, + "loss": 0.0026, + "step": 9654 + }, + { + "epoch": 0.9656, + "grad_norm": 0.012396058067679405, + "learning_rate": 7.200782885829482e-08, + "loss": 0.0003, + "step": 9656 + }, + { + "epoch": 0.9658, + "grad_norm": 0.0062915789894759655, + "learning_rate": 7.117395430186414e-08, + "loss": 0.0002, + "step": 9658 + }, + { + "epoch": 0.966, + "grad_norm": 0.02978573925793171, + "learning_rate": 7.034491893463059e-08, + "loss": 0.0006, + "step": 9660 + }, + { + "epoch": 0.9662, + "grad_norm": 0.04025733470916748, + "learning_rate": 6.95207231606576e-08, + "loss": 0.0004, + "step": 9662 + }, + { + "epoch": 0.9664, + "grad_norm": 0.018713587895035744, + "learning_rate": 6.870136738164612e-08, + "loss": 0.0003, + "step": 9664 + }, + { + "epoch": 0.9666, + "grad_norm": 0.0012101591564714909, + "learning_rate": 6.788685199694222e-08, + "loss": 0.0, + "step": 9666 + }, + { + "epoch": 0.9668, + "grad_norm": 0.013979877345263958, + "learning_rate": 6.707717740353059e-08, + "loss": 0.0002, + "step": 9668 + }, + { + "epoch": 0.967, + "grad_norm": 0.003871943335980177, + "learning_rate": 6.627234399603554e-08, + "loss": 0.0014, + "step": 9670 + }, + { + "epoch": 0.9672, + "grad_norm": 0.001704428461380303, + "learning_rate": 6.547235216672443e-08, + "loss": 0.0007, + "step": 9672 + }, + { + "epoch": 0.9674, + "grad_norm": 0.018707402050495148, + "learning_rate": 6.4677202305502e-08, + "loss": 0.0003, + "step": 9674 + }, + { + "epoch": 0.9676, + "grad_norm": 0.21460489928722382, + "learning_rate": 6.388689479991606e-08, + "loss": 0.0015, + "step": 9676 + }, + { + "epoch": 0.9678, + "grad_norm": 0.03427169844508171, + "learning_rate": 6.310143003515179e-08, + "loss": 0.0003, + "step": 9678 + }, + { + "epoch": 0.968, + "grad_norm": 0.06439043581485748, + "learning_rate": 6.232080839403631e-08, + "loss": 0.0005, + "step": 9680 + }, + { + "epoch": 0.9682, + "grad_norm": 0.046173095703125, + "learning_rate": 6.154503025703418e-08, + "loss": 0.0006, + "step": 9682 + }, + { + "epoch": 0.9684, + "grad_norm": 0.006045571994036436, + "learning_rate": 6.07740960022507e-08, + "loss": 0.0005, + "step": 9684 + }, + { + "epoch": 0.9686, + "grad_norm": 0.000979679636657238, + "learning_rate": 6.000800600542977e-08, + "loss": 0.0008, + "step": 9686 + }, + { + "epoch": 0.9688, + "grad_norm": 0.0030669576954096556, + "learning_rate": 5.9246760639953824e-08, + "loss": 0.0002, + "step": 9688 + }, + { + "epoch": 0.969, + "grad_norm": 0.0015330484602600336, + "learning_rate": 5.849036027684607e-08, + "loss": 0.0007, + "step": 9690 + }, + { + "epoch": 0.9692, + "grad_norm": 0.004015665501356125, + "learning_rate": 5.7738805284764945e-08, + "loss": 0.0007, + "step": 9692 + }, + { + "epoch": 0.9694, + "grad_norm": 0.014422358013689518, + "learning_rate": 5.699209603001077e-08, + "loss": 0.0004, + "step": 9694 + }, + { + "epoch": 0.9696, + "grad_norm": 0.004266129806637764, + "learning_rate": 5.625023287652021e-08, + "loss": 0.0001, + "step": 9696 + }, + { + "epoch": 0.9698, + "grad_norm": 0.008096534758806229, + "learning_rate": 5.5513216185867356e-08, + "loss": 0.0001, + "step": 9698 + }, + { + "epoch": 0.97, + "grad_norm": 0.008068396709859371, + "learning_rate": 5.4781046317267103e-08, + "loss": 0.0018, + "step": 9700 + }, + { + "epoch": 0.9702, + "grad_norm": 0.019442075863480568, + "learning_rate": 5.4053723627567336e-08, + "loss": 0.0068, + "step": 9702 + }, + { + "epoch": 0.9704, + "grad_norm": 0.0015021816361695528, + "learning_rate": 5.3331248471258926e-08, + "loss": 0.0001, + "step": 9704 + }, + { + "epoch": 0.9706, + "grad_norm": 0.01777404174208641, + "learning_rate": 5.261362120046687e-08, + "loss": 0.0003, + "step": 9706 + }, + { + "epoch": 0.9708, + "grad_norm": 0.002592511475086212, + "learning_rate": 5.190084216495361e-08, + "loss": 0.0001, + "step": 9708 + }, + { + "epoch": 0.971, + "grad_norm": 0.0988200232386589, + "learning_rate": 5.119291171211793e-08, + "loss": 0.0015, + "step": 9710 + }, + { + "epoch": 0.9712, + "grad_norm": 0.011560709215700626, + "learning_rate": 5.048983018699827e-08, + "loss": 0.0002, + "step": 9712 + }, + { + "epoch": 0.9714, + "grad_norm": 0.01569121517241001, + "learning_rate": 4.979159793226718e-08, + "loss": 0.0002, + "step": 9714 + }, + { + "epoch": 0.9716, + "grad_norm": 0.0005910595064051449, + "learning_rate": 4.9098215288235776e-08, + "loss": 0.0, + "step": 9716 + }, + { + "epoch": 0.9718, + "grad_norm": 0.0005452656769193709, + "learning_rate": 4.840968259284817e-08, + "loss": 0.0012, + "step": 9718 + }, + { + "epoch": 0.972, + "grad_norm": 0.040690094232559204, + "learning_rate": 4.772600018168816e-08, + "loss": 0.0005, + "step": 9720 + }, + { + "epoch": 0.9722, + "grad_norm": 0.6022855043411255, + "learning_rate": 4.704716838797363e-08, + "loss": 0.02, + "step": 9722 + }, + { + "epoch": 0.9724, + "grad_norm": 0.0004979079822078347, + "learning_rate": 4.6373187542561036e-08, + "loss": 0.0, + "step": 9724 + }, + { + "epoch": 0.9726, + "grad_norm": 0.019431613385677338, + "learning_rate": 4.570405797393762e-08, + "loss": 0.0003, + "step": 9726 + }, + { + "epoch": 0.9728, + "grad_norm": 0.4952191710472107, + "learning_rate": 4.503978000823028e-08, + "loss": 0.0204, + "step": 9728 + }, + { + "epoch": 0.973, + "grad_norm": 0.00437450036406517, + "learning_rate": 4.438035396920004e-08, + "loss": 0.0002, + "step": 9730 + }, + { + "epoch": 0.9732, + "grad_norm": 0.6243838667869568, + "learning_rate": 4.3725780178243135e-08, + "loss": 0.0505, + "step": 9732 + }, + { + "epoch": 0.9734, + "grad_norm": 1.8909474611282349, + "learning_rate": 4.3076058954391045e-08, + "loss": 0.0806, + "step": 9734 + }, + { + "epoch": 0.9736, + "grad_norm": 0.03636506199836731, + "learning_rate": 4.2431190614309334e-08, + "loss": 0.0018, + "step": 9736 + }, + { + "epoch": 0.9738, + "grad_norm": 0.00047098391223698854, + "learning_rate": 4.179117547229883e-08, + "loss": 0.0009, + "step": 9738 + }, + { + "epoch": 0.974, + "grad_norm": 0.005016247276216745, + "learning_rate": 4.115601384029666e-08, + "loss": 0.0005, + "step": 9740 + }, + { + "epoch": 0.9742, + "grad_norm": 0.0016705391462892294, + "learning_rate": 4.052570602787076e-08, + "loss": 0.0, + "step": 9742 + }, + { + "epoch": 0.9744, + "grad_norm": 0.0010169981978833675, + "learning_rate": 3.990025234222872e-08, + "loss": 0.0001, + "step": 9744 + }, + { + "epoch": 0.9746, + "grad_norm": 0.032705601304769516, + "learning_rate": 3.927965308820558e-08, + "loss": 0.0016, + "step": 9746 + }, + { + "epoch": 0.9748, + "grad_norm": 0.0017658134456723928, + "learning_rate": 3.866390856827495e-08, + "loss": 0.0008, + "step": 9748 + }, + { + "epoch": 0.975, + "grad_norm": 0.005930292420089245, + "learning_rate": 3.805301908254455e-08, + "loss": 0.0002, + "step": 9750 + }, + { + "epoch": 0.9752, + "grad_norm": 2.404426336288452, + "learning_rate": 3.7446984928753984e-08, + "loss": 0.0383, + "step": 9752 + }, + { + "epoch": 0.9754, + "grad_norm": 0.06499545276165009, + "learning_rate": 3.684580640227586e-08, + "loss": 0.0009, + "step": 9754 + }, + { + "epoch": 0.9756, + "grad_norm": 0.008937448263168335, + "learning_rate": 3.6249483796116924e-08, + "loss": 0.0003, + "step": 9756 + }, + { + "epoch": 0.9758, + "grad_norm": 0.0042498051188886166, + "learning_rate": 3.565801740092023e-08, + "loss": 0.0001, + "step": 9758 + }, + { + "epoch": 0.976, + "grad_norm": 11.414470672607422, + "learning_rate": 3.50714075049563e-08, + "loss": 0.3497, + "step": 9760 + }, + { + "epoch": 0.9762, + "grad_norm": 0.05076073110103607, + "learning_rate": 3.4489654394134206e-08, + "loss": 0.0006, + "step": 9762 + }, + { + "epoch": 0.9764, + "grad_norm": 0.0009702980751171708, + "learning_rate": 3.391275835199159e-08, + "loss": 0.0002, + "step": 9764 + }, + { + "epoch": 0.9766, + "grad_norm": 0.0014147055335342884, + "learning_rate": 3.3340719659701315e-08, + "loss": 0.0001, + "step": 9766 + }, + { + "epoch": 0.9768, + "grad_norm": 0.02458469569683075, + "learning_rate": 3.2773538596068134e-08, + "loss": 0.0005, + "step": 9768 + }, + { + "epoch": 0.977, + "grad_norm": 2.083968162536621, + "learning_rate": 3.22112154375287e-08, + "loss": 0.0188, + "step": 9770 + }, + { + "epoch": 0.9772, + "grad_norm": 0.0048561193980276585, + "learning_rate": 3.165375045815266e-08, + "loss": 0.0002, + "step": 9772 + }, + { + "epoch": 0.9774, + "grad_norm": 0.0007593539776280522, + "learning_rate": 3.110114392964159e-08, + "loss": 0.0, + "step": 9774 + }, + { + "epoch": 0.9776, + "grad_norm": 0.0005935663357377052, + "learning_rate": 3.0553396121330015e-08, + "loss": 0.0001, + "step": 9776 + }, + { + "epoch": 0.9778, + "grad_norm": 6.604820728302002, + "learning_rate": 3.001050730018218e-08, + "loss": 0.0349, + "step": 9778 + }, + { + "epoch": 0.978, + "grad_norm": 1.3071397542953491, + "learning_rate": 2.947247773079753e-08, + "loss": 0.0204, + "step": 9780 + }, + { + "epoch": 0.9782, + "grad_norm": 0.13879956305027008, + "learning_rate": 2.8939307675402983e-08, + "loss": 0.0016, + "step": 9782 + }, + { + "epoch": 0.9784, + "grad_norm": 0.011085927486419678, + "learning_rate": 2.8410997393860663e-08, + "loss": 0.1089, + "step": 9784 + }, + { + "epoch": 0.9786, + "grad_norm": 0.4285180866718292, + "learning_rate": 2.7887547143662375e-08, + "loss": 0.0061, + "step": 9786 + }, + { + "epoch": 0.9788, + "grad_norm": 0.04123024642467499, + "learning_rate": 2.7368957179929602e-08, + "loss": 0.0004, + "step": 9788 + }, + { + "epoch": 0.979, + "grad_norm": 0.6912966966629028, + "learning_rate": 2.6855227755419046e-08, + "loss": 0.0216, + "step": 9790 + }, + { + "epoch": 0.9792, + "grad_norm": 0.0030163368210196495, + "learning_rate": 2.6346359120514863e-08, + "loss": 0.1554, + "step": 9792 + }, + { + "epoch": 0.9794, + "grad_norm": 0.2920261025428772, + "learning_rate": 2.584235152323422e-08, + "loss": 0.0022, + "step": 9794 + }, + { + "epoch": 0.9796, + "grad_norm": 1.1908093690872192, + "learning_rate": 2.5343205209225062e-08, + "loss": 0.0161, + "step": 9796 + }, + { + "epoch": 0.9798, + "grad_norm": 0.00045356823829934, + "learning_rate": 2.484892042176279e-08, + "loss": 0.0, + "step": 9798 + }, + { + "epoch": 0.98, + "grad_norm": 0.009485905058681965, + "learning_rate": 2.4359497401758026e-08, + "loss": 0.0002, + "step": 9800 + }, + { + "epoch": 0.9802, + "grad_norm": 0.02484193630516529, + "learning_rate": 2.3874936387747738e-08, + "loss": 0.0002, + "step": 9802 + }, + { + "epoch": 0.9804, + "grad_norm": 1.2955589294433594, + "learning_rate": 2.339523761590301e-08, + "loss": 0.0233, + "step": 9804 + }, + { + "epoch": 0.9806, + "grad_norm": 0.001301737385801971, + "learning_rate": 2.292040132002238e-08, + "loss": 0.0001, + "step": 9806 + }, + { + "epoch": 0.9808, + "grad_norm": 0.020048178732395172, + "learning_rate": 2.2450427731534052e-08, + "loss": 0.0006, + "step": 9808 + }, + { + "epoch": 0.981, + "grad_norm": 0.014243993908166885, + "learning_rate": 2.1985317079500358e-08, + "loss": 0.0002, + "step": 9810 + }, + { + "epoch": 0.9812, + "grad_norm": 0.11014243960380554, + "learning_rate": 2.152506959060774e-08, + "loss": 0.0008, + "step": 9812 + }, + { + "epoch": 0.9814, + "grad_norm": 0.023686420172452927, + "learning_rate": 2.1069685489176762e-08, + "loss": 0.0003, + "step": 9814 + }, + { + "epoch": 0.9816, + "grad_norm": 0.059651125222444534, + "learning_rate": 2.061916499715544e-08, + "loss": 0.0032, + "step": 9816 + }, + { + "epoch": 0.9818, + "grad_norm": 0.6046698689460754, + "learning_rate": 2.017350833412146e-08, + "loss": 0.0029, + "step": 9818 + }, + { + "epoch": 0.982, + "grad_norm": 0.04083731025457382, + "learning_rate": 1.973271571728441e-08, + "loss": 0.0007, + "step": 9820 + }, + { + "epoch": 0.9822, + "grad_norm": 0.022510092705488205, + "learning_rate": 1.929678736148022e-08, + "loss": 0.0003, + "step": 9822 + }, + { + "epoch": 0.9824, + "grad_norm": 0.0037700217217206955, + "learning_rate": 1.886572347917337e-08, + "loss": 0.0002, + "step": 9824 + }, + { + "epoch": 0.9826, + "grad_norm": 0.0019860719330608845, + "learning_rate": 1.8439524280462474e-08, + "loss": 0.0002, + "step": 9826 + }, + { + "epoch": 0.9828, + "grad_norm": 0.11852262914180756, + "learning_rate": 1.8018189973069144e-08, + "loss": 0.0014, + "step": 9828 + }, + { + "epoch": 0.983, + "grad_norm": 0.14133097231388092, + "learning_rate": 1.7601720762346895e-08, + "loss": 0.0014, + "step": 9830 + }, + { + "epoch": 0.9832, + "grad_norm": 0.22748076915740967, + "learning_rate": 1.7190116851280024e-08, + "loss": 0.0074, + "step": 9832 + }, + { + "epoch": 0.9834, + "grad_norm": 0.005235692951828241, + "learning_rate": 1.678337844047695e-08, + "loss": 0.0001, + "step": 9834 + }, + { + "epoch": 0.9836, + "grad_norm": 2.0903544425964355, + "learning_rate": 1.6381505728176872e-08, + "loss": 0.0328, + "step": 9836 + }, + { + "epoch": 0.9838, + "grad_norm": 0.033989403396844864, + "learning_rate": 1.5984498910249778e-08, + "loss": 0.0027, + "step": 9838 + }, + { + "epoch": 0.984, + "grad_norm": 0.0031472332775592804, + "learning_rate": 1.5592358180189782e-08, + "loss": 0.0001, + "step": 9840 + }, + { + "epoch": 0.9842, + "grad_norm": 0.0008642724715173244, + "learning_rate": 1.5205083729122883e-08, + "loss": 0.0001, + "step": 9842 + }, + { + "epoch": 0.9844, + "grad_norm": 0.09379525482654572, + "learning_rate": 1.482267574580143e-08, + "loss": 0.0017, + "step": 9844 + }, + { + "epoch": 0.9846, + "grad_norm": 0.013511535711586475, + "learning_rate": 1.4445134416607442e-08, + "loss": 0.0114, + "step": 9846 + }, + { + "epoch": 0.9848, + "grad_norm": 0.054899122565984726, + "learning_rate": 1.4072459925548176e-08, + "loss": 0.0009, + "step": 9848 + }, + { + "epoch": 0.985, + "grad_norm": 0.000500019290484488, + "learning_rate": 1.370465245426167e-08, + "loss": 0.0001, + "step": 9850 + }, + { + "epoch": 0.9852, + "grad_norm": 0.024312395602464676, + "learning_rate": 1.3341712182012301e-08, + "loss": 0.0004, + "step": 9852 + }, + { + "epoch": 0.9854, + "grad_norm": 0.2681405544281006, + "learning_rate": 1.2983639285693018e-08, + "loss": 0.005, + "step": 9854 + }, + { + "epoch": 0.9856, + "grad_norm": 0.010303812101483345, + "learning_rate": 1.2630433939825326e-08, + "loss": 0.0005, + "step": 9856 + }, + { + "epoch": 0.9858, + "grad_norm": 0.0004365990462247282, + "learning_rate": 1.2282096316554858e-08, + "loss": 0.0, + "step": 9858 + }, + { + "epoch": 0.986, + "grad_norm": 0.010925608687102795, + "learning_rate": 1.1938626585660252e-08, + "loss": 0.1507, + "step": 9860 + }, + { + "epoch": 0.9862, + "grad_norm": 0.005408755969256163, + "learning_rate": 1.1600024914540931e-08, + "loss": 0.0002, + "step": 9862 + }, + { + "epoch": 0.9864, + "grad_norm": 0.0005411783931776881, + "learning_rate": 1.126629146822933e-08, + "loss": 0.0049, + "step": 9864 + }, + { + "epoch": 0.9866, + "grad_norm": 0.057588059455156326, + "learning_rate": 1.0937426409384223e-08, + "loss": 0.0713, + "step": 9866 + }, + { + "epoch": 0.9868, + "grad_norm": 0.0026749393437057734, + "learning_rate": 1.0613429898287397e-08, + "loss": 0.0002, + "step": 9868 + }, + { + "epoch": 0.987, + "grad_norm": 0.40004727244377136, + "learning_rate": 1.0294302092853647e-08, + "loss": 0.0062, + "step": 9870 + }, + { + "epoch": 0.9872, + "grad_norm": 1.300053358078003, + "learning_rate": 9.980043148619668e-09, + "loss": 0.0431, + "step": 9872 + }, + { + "epoch": 0.9874, + "grad_norm": 0.004145804326981306, + "learning_rate": 9.670653218752935e-09, + "loss": 0.0008, + "step": 9874 + }, + { + "epoch": 0.9876, + "grad_norm": 0.0005447008879855275, + "learning_rate": 9.366132454046162e-09, + "loss": 0.0001, + "step": 9876 + }, + { + "epoch": 0.9878, + "grad_norm": 0.010229198262095451, + "learning_rate": 9.066481002918403e-09, + "loss": 0.0029, + "step": 9878 + }, + { + "epoch": 0.988, + "grad_norm": 0.002171460073441267, + "learning_rate": 8.771699011416169e-09, + "loss": 0.0, + "step": 9880 + }, + { + "epoch": 0.9882, + "grad_norm": 0.0031350483186542988, + "learning_rate": 8.481786623214527e-09, + "loss": 0.0002, + "step": 9882 + }, + { + "epoch": 0.9884, + "grad_norm": 0.010180605575442314, + "learning_rate": 8.196743979610455e-09, + "loss": 0.0002, + "step": 9884 + }, + { + "epoch": 0.9886, + "grad_norm": 0.006291084457188845, + "learning_rate": 7.916571219531711e-09, + "loss": 0.0001, + "step": 9886 + }, + { + "epoch": 0.9888, + "grad_norm": 0.025077784433960915, + "learning_rate": 7.641268479531283e-09, + "loss": 0.0001, + "step": 9888 + }, + { + "epoch": 0.989, + "grad_norm": 0.08336751163005829, + "learning_rate": 7.370835893788508e-09, + "loss": 0.0017, + "step": 9890 + }, + { + "epoch": 0.9892, + "grad_norm": 0.021895723417401314, + "learning_rate": 7.105273594107953e-09, + "loss": 0.0002, + "step": 9892 + }, + { + "epoch": 0.9894, + "grad_norm": 0.020106522366404533, + "learning_rate": 6.844581709921639e-09, + "loss": 0.0003, + "step": 9894 + }, + { + "epoch": 0.9896, + "grad_norm": 0.0007052246364764869, + "learning_rate": 6.588760368287928e-09, + "loss": 0.0, + "step": 9896 + }, + { + "epoch": 0.9898, + "grad_norm": 0.0012414405355229974, + "learning_rate": 6.3378096938915276e-09, + "loss": 0.0003, + "step": 9898 + }, + { + "epoch": 0.99, + "grad_norm": 0.0053047253750264645, + "learning_rate": 6.091729809042379e-09, + "loss": 0.0008, + "step": 9900 + }, + { + "epoch": 0.9902, + "grad_norm": 0.050614792853593826, + "learning_rate": 5.850520833676765e-09, + "loss": 0.0007, + "step": 9902 + }, + { + "epoch": 0.9904, + "grad_norm": 0.07377874851226807, + "learning_rate": 5.614182885357311e-09, + "loss": 0.0007, + "step": 9904 + }, + { + "epoch": 0.9906, + "grad_norm": 0.009654352441430092, + "learning_rate": 5.382716079271877e-09, + "loss": 0.0003, + "step": 9906 + }, + { + "epoch": 0.9908, + "grad_norm": 1.3750072717666626, + "learning_rate": 5.156120528233555e-09, + "loss": 0.0378, + "step": 9908 + }, + { + "epoch": 0.991, + "grad_norm": 0.004398927092552185, + "learning_rate": 4.9343963426840006e-09, + "loss": 0.0005, + "step": 9910 + }, + { + "epoch": 0.9912, + "grad_norm": 0.0005592371453531086, + "learning_rate": 4.717543630688992e-09, + "loss": 0.0003, + "step": 9912 + }, + { + "epoch": 0.9914, + "grad_norm": 0.001230758149176836, + "learning_rate": 4.505562497938431e-09, + "loss": 0.0446, + "step": 9914 + }, + { + "epoch": 0.9916, + "grad_norm": 0.00954238697886467, + "learning_rate": 4.298453047749674e-09, + "loss": 0.0004, + "step": 9916 + }, + { + "epoch": 0.9918, + "grad_norm": 2.928744316101074, + "learning_rate": 4.096215381066415e-09, + "loss": 0.0546, + "step": 9918 + }, + { + "epoch": 0.992, + "grad_norm": 0.0145756546407938, + "learning_rate": 3.898849596456477e-09, + "loss": 0.0002, + "step": 9920 + }, + { + "epoch": 0.9922, + "grad_norm": 0.006091123912483454, + "learning_rate": 3.7063557901129144e-09, + "loss": 0.0001, + "step": 9922 + }, + { + "epoch": 0.9924, + "grad_norm": 0.0016660407418385148, + "learning_rate": 3.518734055855122e-09, + "loss": 0.0014, + "step": 9924 + }, + { + "epoch": 0.9926, + "grad_norm": 0.00107058126013726, + "learning_rate": 3.3359844851277302e-09, + "loss": 0.0001, + "step": 9926 + }, + { + "epoch": 0.9928, + "grad_norm": 0.037201445549726486, + "learning_rate": 3.1581071670006013e-09, + "loss": 0.0004, + "step": 9928 + }, + { + "epoch": 0.993, + "grad_norm": 1.7036985158920288, + "learning_rate": 2.9851021881688314e-09, + "loss": 0.0314, + "step": 9930 + }, + { + "epoch": 0.9932, + "grad_norm": 0.02274240180850029, + "learning_rate": 2.8169696329527484e-09, + "loss": 0.0009, + "step": 9932 + }, + { + "epoch": 0.9934, + "grad_norm": 0.005747299641370773, + "learning_rate": 2.6537095832990247e-09, + "loss": 0.0001, + "step": 9934 + }, + { + "epoch": 0.9936, + "grad_norm": 0.0055058617144823074, + "learning_rate": 2.495322118778454e-09, + "loss": 0.0002, + "step": 9936 + }, + { + "epoch": 0.9938, + "grad_norm": 0.009398364461958408, + "learning_rate": 2.341807316587064e-09, + "loss": 0.0003, + "step": 9938 + }, + { + "epoch": 0.994, + "grad_norm": 0.0009311933536082506, + "learning_rate": 2.193165251545004e-09, + "loss": 0.0001, + "step": 9940 + }, + { + "epoch": 0.9942, + "grad_norm": 0.008084950968623161, + "learning_rate": 2.049395996099879e-09, + "loss": 0.0003, + "step": 9942 + }, + { + "epoch": 0.9944, + "grad_norm": 0.10257738828659058, + "learning_rate": 1.910499620322304e-09, + "loss": 0.0012, + "step": 9944 + }, + { + "epoch": 0.9946, + "grad_norm": 0.012498859316110611, + "learning_rate": 1.776476191910348e-09, + "loss": 0.0002, + "step": 9946 + }, + { + "epoch": 0.9948, + "grad_norm": 0.0018180054612457752, + "learning_rate": 1.647325776182873e-09, + "loss": 0.1433, + "step": 9948 + }, + { + "epoch": 0.995, + "grad_norm": 0.03316076099872589, + "learning_rate": 1.5230484360873043e-09, + "loss": 0.0005, + "step": 9950 + }, + { + "epoch": 0.9952, + "grad_norm": 0.26783066987991333, + "learning_rate": 1.4036442321962995e-09, + "loss": 0.0061, + "step": 9952 + }, + { + "epoch": 0.9954, + "grad_norm": 0.0009544492932036519, + "learning_rate": 1.2891132227033087e-09, + "loss": 0.0205, + "step": 9954 + }, + { + "epoch": 0.9956, + "grad_norm": 0.0017017232021316886, + "learning_rate": 1.1794554634314558e-09, + "loss": 0.0001, + "step": 9956 + }, + { + "epoch": 0.9958, + "grad_norm": 0.10491269826889038, + "learning_rate": 1.0746710078257673e-09, + "loss": 0.0015, + "step": 9958 + }, + { + "epoch": 0.996, + "grad_norm": 0.00231228768825531, + "learning_rate": 9.74759906957612e-10, + "loss": 0.014, + "step": 9960 + }, + { + "epoch": 0.9962, + "grad_norm": 0.014171341434121132, + "learning_rate": 8.797222095224822e-10, + "loss": 0.0052, + "step": 9962 + }, + { + "epoch": 0.9964, + "grad_norm": 0.027096904814243317, + "learning_rate": 7.895579618388827e-10, + "loss": 0.0008, + "step": 9964 + }, + { + "epoch": 0.9966, + "grad_norm": 0.4172673225402832, + "learning_rate": 7.042672078527712e-10, + "loss": 0.0019, + "step": 9966 + }, + { + "epoch": 0.9968, + "grad_norm": 0.07428724318742752, + "learning_rate": 6.238499891353389e-10, + "loss": 0.0008, + "step": 9968 + }, + { + "epoch": 0.997, + "grad_norm": 0.02402421087026596, + "learning_rate": 5.483063448785686e-10, + "loss": 0.0004, + "step": 9970 + }, + { + "epoch": 0.9972, + "grad_norm": 0.001810584799386561, + "learning_rate": 4.77636311903007e-10, + "loss": 0.0064, + "step": 9972 + }, + { + "epoch": 0.9974, + "grad_norm": 0.023384347558021545, + "learning_rate": 4.118399246522131e-10, + "loss": 0.0003, + "step": 9974 + }, + { + "epoch": 0.9976, + "grad_norm": 3.815945863723755, + "learning_rate": 3.509172151938689e-10, + "loss": 0.0354, + "step": 9976 + }, + { + "epoch": 0.9978, + "grad_norm": 1.9820475578308105, + "learning_rate": 2.948682132208891e-10, + "loss": 0.0263, + "step": 9978 + }, + { + "epoch": 0.998, + "grad_norm": 0.004215274006128311, + "learning_rate": 2.436929460525317e-10, + "loss": 0.0, + "step": 9980 + }, + { + "epoch": 0.9982, + "grad_norm": 0.010364760644733906, + "learning_rate": 1.9739143862884668e-10, + "loss": 0.0002, + "step": 9982 + }, + { + "epoch": 0.9984, + "grad_norm": 0.008931980468332767, + "learning_rate": 1.559637135173375e-10, + "loss": 0.001, + "step": 9984 + }, + { + "epoch": 0.9986, + "grad_norm": 0.02755766548216343, + "learning_rate": 1.1940979091074056e-10, + "loss": 0.0003, + "step": 9986 + }, + { + "epoch": 0.9988, + "grad_norm": 0.003968787379562855, + "learning_rate": 8.772968862369447e-11, + "loss": 0.0013, + "step": 9988 + }, + { + "epoch": 0.999, + "grad_norm": 0.005172291304916143, + "learning_rate": 6.092342209607083e-11, + "loss": 0.0001, + "step": 9990 + }, + { + "epoch": 0.9992, + "grad_norm": 0.13315589725971222, + "learning_rate": 3.899100439408443e-11, + "loss": 0.0009, + "step": 9992 + }, + { + "epoch": 0.9994, + "grad_norm": 3.4757349491119385, + "learning_rate": 2.1932446206962556e-11, + "loss": 0.0473, + "step": 9994 + }, + { + "epoch": 0.9996, + "grad_norm": 0.0218975730240345, + "learning_rate": 9.74775584916543e-12, + "loss": 0.0005, + "step": 9996 + }, + { + "epoch": 0.9998, + "grad_norm": 0.12219374626874924, + "learning_rate": 2.4369392592760166e-12, + "loss": 0.0019, + "step": 9998 + }, + { + "epoch": 1.0, + "grad_norm": 0.0004802099138032645, + "learning_rate": 0.0, + "loss": 0.0133, + "step": 10000 + }, + { + "epoch": 1.0, + "step": 10000, + "total_flos": 8.33401238501458e+16, + "train_loss": 0.03169609568425331, + "train_runtime": 14997.0891, + "train_samples_per_second": 0.667, + "train_steps_per_second": 0.667 + } + ], + "logging_steps": 2, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 8.33401238501458e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48487a3133dbe34686d50efae8f4c558b60be615 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52c2b207932e0f61b36dd9d9db99b7fe55fb77706c202392f7cfe670114c0dd5 +size 3837841200 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/global_step10000/mp_rank_00_model_states.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/global_step10000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1bc6cf5784ca76d2c2656815a387e0c3a5f3ad0 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/global_step10000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0abc47d0875933d43988b6c2d7a5cced803978624733bca6d6a72b61228d629 +size 639989420 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/latest b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/latest new file mode 100644 index 0000000000000000000000000000000000000000..25c776ee3abcad1c4d1e16e8275e4e00984a237c --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/latest @@ -0,0 +1 @@ +global_step10000 \ No newline at end of file diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/scheduler.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..25eb571999231336d0a702eb239de6c45b7611dd --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ec7493baff042202ebbde234df8c237e604474ce9fde178ca98bdf8572092 +size 1064 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/zero_to_fp32.py b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..e93cb1c95f15c1474642edb1978714075361bc04 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/client_0/zero_to_fp32.py @@ -0,0 +1,758 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: + shared_tensor = state_dict[converted_tensors[tensor_id]] + state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + state_dict[name] = tensor.contiguous() + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in shard_state_dict: + del state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b49cd841e7447f75ca1e470f29df74f3e20898e --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario18_new_10000_nosampling_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46fb8d7584d1c76a7c8d7e000e957001c874614c2a3dd7af11bae9a1c49c203c +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9714f8fbdb71d3265d8dc6aae7a109b6baa7a35f --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/0_trainer_state.json @@ -0,0 +1,70032 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001, + "grad_norm": 6.722116470336914, + "learning_rate": 2e-08, + "loss": 1.4467, + "step": 2 + }, + { + "epoch": 0.0002, + "grad_norm": 6.59185791015625, + "learning_rate": 4e-08, + "loss": 1.5076, + "step": 4 + }, + { + "epoch": 0.0003, + "grad_norm": 6.0696120262146, + "learning_rate": 6.000000000000001e-08, + "loss": 1.9249, + "step": 6 + }, + { + "epoch": 0.0004, + "grad_norm": 7.438314437866211, + "learning_rate": 8e-08, + "loss": 1.8913, + "step": 8 + }, + { + "epoch": 0.0005, + "grad_norm": 9.137560844421387, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.3831, + "step": 10 + }, + { + "epoch": 0.0006, + "grad_norm": 4.764645099639893, + "learning_rate": 1.2000000000000002e-07, + "loss": 1.3421, + "step": 12 + }, + { + "epoch": 0.0007, + "grad_norm": 10.489251136779785, + "learning_rate": 1.4e-07, + "loss": 1.4776, + "step": 14 + }, + { + "epoch": 0.0008, + "grad_norm": 7.695075035095215, + "learning_rate": 1.6e-07, + "loss": 2.108, + "step": 16 + }, + { + "epoch": 0.0009, + "grad_norm": 6.208449363708496, + "learning_rate": 1.8e-07, + "loss": 1.5616, + "step": 18 + }, + { + "epoch": 0.001, + "grad_norm": 11.07870101928711, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.7037, + "step": 20 + }, + { + "epoch": 0.0011, + "grad_norm": 10.948630332946777, + "learning_rate": 2.2e-07, + "loss": 2.0427, + "step": 22 + }, + { + "epoch": 0.0012, + "grad_norm": 7.2671990394592285, + "learning_rate": 2.4000000000000003e-07, + "loss": 1.2756, + "step": 24 + }, + { + "epoch": 0.0013, + "grad_norm": 3.0278313159942627, + "learning_rate": 2.6e-07, + "loss": 1.0777, + "step": 26 + }, + { + "epoch": 0.0014, + "grad_norm": 11.63021469116211, + "learning_rate": 2.8e-07, + "loss": 2.129, + "step": 28 + }, + { + "epoch": 0.0015, + "grad_norm": 9.821151733398438, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.9555, + "step": 30 + }, + { + "epoch": 0.0016, + "grad_norm": 7.317845344543457, + "learning_rate": 3.2e-07, + "loss": 1.8536, + "step": 32 + }, + { + "epoch": 0.0017, + "grad_norm": 14.145334243774414, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.919, + "step": 34 + }, + { + "epoch": 0.0018, + "grad_norm": 8.644704818725586, + "learning_rate": 3.6e-07, + "loss": 1.6427, + "step": 36 + }, + { + "epoch": 0.0019, + "grad_norm": 8.156972885131836, + "learning_rate": 3.8e-07, + "loss": 1.7352, + "step": 38 + }, + { + "epoch": 0.002, + "grad_norm": 4.838464260101318, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.0946, + "step": 40 + }, + { + "epoch": 0.0021, + "grad_norm": 9.193564414978027, + "learning_rate": 4.2000000000000006e-07, + "loss": 1.1387, + "step": 42 + }, + { + "epoch": 0.0022, + "grad_norm": 8.509073257446289, + "learning_rate": 4.4e-07, + "loss": 1.7457, + "step": 44 + }, + { + "epoch": 0.0023, + "grad_norm": 5.22540807723999, + "learning_rate": 4.6000000000000004e-07, + "loss": 1.8927, + "step": 46 + }, + { + "epoch": 0.0024, + "grad_norm": 5.877729892730713, + "learning_rate": 4.800000000000001e-07, + "loss": 1.0511, + "step": 48 + }, + { + "epoch": 0.0025, + "grad_norm": 11.879194259643555, + "learning_rate": 5.000000000000001e-07, + "loss": 2.0665, + "step": 50 + }, + { + "epoch": 0.0026, + "grad_norm": 6.148487091064453, + "learning_rate": 5.2e-07, + "loss": 1.5941, + "step": 52 + }, + { + "epoch": 0.0027, + "grad_norm": 7.928617000579834, + "learning_rate": 5.4e-07, + "loss": 1.6603, + "step": 54 + }, + { + "epoch": 0.0028, + "grad_norm": 10.760047912597656, + "learning_rate": 5.6e-07, + "loss": 2.4731, + "step": 56 + }, + { + "epoch": 0.0029, + "grad_norm": 4.9813337326049805, + "learning_rate": 5.800000000000001e-07, + "loss": 1.0821, + "step": 58 + }, + { + "epoch": 0.003, + "grad_norm": 9.314231872558594, + "learning_rate": 6.000000000000001e-07, + "loss": 1.7992, + "step": 60 + }, + { + "epoch": 0.0031, + "grad_norm": 6.395276069641113, + "learning_rate": 6.200000000000001e-07, + "loss": 1.3945, + "step": 62 + }, + { + "epoch": 0.0032, + "grad_norm": 14.916035652160645, + "learning_rate": 6.4e-07, + "loss": 2.251, + "step": 64 + }, + { + "epoch": 0.0033, + "grad_norm": 4.867149829864502, + "learning_rate": 6.6e-07, + "loss": 1.363, + "step": 66 + }, + { + "epoch": 0.0034, + "grad_norm": 4.761037826538086, + "learning_rate": 6.800000000000001e-07, + "loss": 1.504, + "step": 68 + }, + { + "epoch": 0.0035, + "grad_norm": 10.509197235107422, + "learning_rate": 7.000000000000001e-07, + "loss": 1.5833, + "step": 70 + }, + { + "epoch": 0.0036, + "grad_norm": 8.699819564819336, + "learning_rate": 7.2e-07, + "loss": 2.2623, + "step": 72 + }, + { + "epoch": 0.0037, + "grad_norm": 11.462303161621094, + "learning_rate": 7.4e-07, + "loss": 1.4817, + "step": 74 + }, + { + "epoch": 0.0038, + "grad_norm": 4.736894130706787, + "learning_rate": 7.6e-07, + "loss": 1.3709, + "step": 76 + }, + { + "epoch": 0.0039, + "grad_norm": 4.942889213562012, + "learning_rate": 7.8e-07, + "loss": 1.3961, + "step": 78 + }, + { + "epoch": 0.004, + "grad_norm": 6.310662746429443, + "learning_rate": 8.000000000000001e-07, + "loss": 1.5629, + "step": 80 + }, + { + "epoch": 0.0041, + "grad_norm": 3.9962210655212402, + "learning_rate": 8.200000000000001e-07, + "loss": 0.3812, + "step": 82 + }, + { + "epoch": 0.0042, + "grad_norm": 38.576622009277344, + "learning_rate": 8.400000000000001e-07, + "loss": 2.0118, + "step": 84 + }, + { + "epoch": 0.0043, + "grad_norm": 5.966770172119141, + "learning_rate": 8.6e-07, + "loss": 2.1142, + "step": 86 + }, + { + "epoch": 0.0044, + "grad_norm": 9.525362968444824, + "learning_rate": 8.8e-07, + "loss": 1.6367, + "step": 88 + }, + { + "epoch": 0.0045, + "grad_norm": 12.968073844909668, + "learning_rate": 9.000000000000001e-07, + "loss": 2.0267, + "step": 90 + }, + { + "epoch": 0.0046, + "grad_norm": 13.147738456726074, + "learning_rate": 9.200000000000001e-07, + "loss": 1.7129, + "step": 92 + }, + { + "epoch": 0.0047, + "grad_norm": 22.31426239013672, + "learning_rate": 9.400000000000001e-07, + "loss": 1.4718, + "step": 94 + }, + { + "epoch": 0.0048, + "grad_norm": 7.183313846588135, + "learning_rate": 9.600000000000001e-07, + "loss": 1.183, + "step": 96 + }, + { + "epoch": 0.0049, + "grad_norm": 8.38509750366211, + "learning_rate": 9.800000000000001e-07, + "loss": 2.008, + "step": 98 + }, + { + "epoch": 0.005, + "grad_norm": 6.748653888702393, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.9373, + "step": 100 + }, + { + "epoch": 0.0051, + "grad_norm": 5.471179962158203, + "learning_rate": 1.02e-06, + "loss": 2.0228, + "step": 102 + }, + { + "epoch": 0.0052, + "grad_norm": 22.71720314025879, + "learning_rate": 1.04e-06, + "loss": 0.6502, + "step": 104 + }, + { + "epoch": 0.0053, + "grad_norm": 7.394750118255615, + "learning_rate": 1.06e-06, + "loss": 1.3157, + "step": 106 + }, + { + "epoch": 0.0054, + "grad_norm": 10.901415824890137, + "learning_rate": 1.08e-06, + "loss": 1.0879, + "step": 108 + }, + { + "epoch": 0.0055, + "grad_norm": 10.816811561584473, + "learning_rate": 1.1e-06, + "loss": 1.3509, + "step": 110 + }, + { + "epoch": 0.0056, + "grad_norm": 6.019052505493164, + "learning_rate": 1.12e-06, + "loss": 1.0494, + "step": 112 + }, + { + "epoch": 0.0057, + "grad_norm": 4.872289657592773, + "learning_rate": 1.14e-06, + "loss": 1.4334, + "step": 114 + }, + { + "epoch": 0.0058, + "grad_norm": 6.806595325469971, + "learning_rate": 1.1600000000000001e-06, + "loss": 1.4856, + "step": 116 + }, + { + "epoch": 0.0059, + "grad_norm": 6.402238845825195, + "learning_rate": 1.1800000000000001e-06, + "loss": 1.7224, + "step": 118 + }, + { + "epoch": 0.006, + "grad_norm": 4.996392726898193, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.4459, + "step": 120 + }, + { + "epoch": 0.0061, + "grad_norm": 3.3365249633789062, + "learning_rate": 1.2200000000000002e-06, + "loss": 1.0797, + "step": 122 + }, + { + "epoch": 0.0062, + "grad_norm": 11.870892524719238, + "learning_rate": 1.2400000000000002e-06, + "loss": 1.7119, + "step": 124 + }, + { + "epoch": 0.0063, + "grad_norm": 3.633608818054199, + "learning_rate": 1.26e-06, + "loss": 1.7276, + "step": 126 + }, + { + "epoch": 0.0064, + "grad_norm": 6.249979019165039, + "learning_rate": 1.28e-06, + "loss": 1.6752, + "step": 128 + }, + { + "epoch": 0.0065, + "grad_norm": 5.411247730255127, + "learning_rate": 1.3e-06, + "loss": 1.4276, + "step": 130 + }, + { + "epoch": 0.0066, + "grad_norm": 10.018465995788574, + "learning_rate": 1.32e-06, + "loss": 2.0518, + "step": 132 + }, + { + "epoch": 0.0067, + "grad_norm": 7.774552345275879, + "learning_rate": 1.34e-06, + "loss": 1.5164, + "step": 134 + }, + { + "epoch": 0.0068, + "grad_norm": 4.431427001953125, + "learning_rate": 1.3600000000000001e-06, + "loss": 1.7128, + "step": 136 + }, + { + "epoch": 0.0069, + "grad_norm": 18.442142486572266, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.879, + "step": 138 + }, + { + "epoch": 0.007, + "grad_norm": 3.7832610607147217, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.1669, + "step": 140 + }, + { + "epoch": 0.0071, + "grad_norm": 6.167732238769531, + "learning_rate": 1.42e-06, + "loss": 0.8307, + "step": 142 + }, + { + "epoch": 0.0072, + "grad_norm": 23.711885452270508, + "learning_rate": 1.44e-06, + "loss": 2.5909, + "step": 144 + }, + { + "epoch": 0.0073, + "grad_norm": 23.42848777770996, + "learning_rate": 1.46e-06, + "loss": 2.1127, + "step": 146 + }, + { + "epoch": 0.0074, + "grad_norm": 6.957925796508789, + "learning_rate": 1.48e-06, + "loss": 1.1717, + "step": 148 + }, + { + "epoch": 0.0075, + "grad_norm": 7.659520626068115, + "learning_rate": 1.5e-06, + "loss": 0.9896, + "step": 150 + }, + { + "epoch": 0.0076, + "grad_norm": 6.103281021118164, + "learning_rate": 1.52e-06, + "loss": 2.1241, + "step": 152 + }, + { + "epoch": 0.0077, + "grad_norm": 4.733657360076904, + "learning_rate": 1.54e-06, + "loss": 1.9771, + "step": 154 + }, + { + "epoch": 0.0078, + "grad_norm": 7.680426120758057, + "learning_rate": 1.56e-06, + "loss": 0.8261, + "step": 156 + }, + { + "epoch": 0.0079, + "grad_norm": 4.450775146484375, + "learning_rate": 1.5800000000000001e-06, + "loss": 1.0696, + "step": 158 + }, + { + "epoch": 0.008, + "grad_norm": 13.675750732421875, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.573, + "step": 160 + }, + { + "epoch": 0.0081, + "grad_norm": 58.174339294433594, + "learning_rate": 1.6200000000000002e-06, + "loss": 1.9582, + "step": 162 + }, + { + "epoch": 0.0082, + "grad_norm": 22.218017578125, + "learning_rate": 1.6400000000000002e-06, + "loss": 2.1834, + "step": 164 + }, + { + "epoch": 0.0083, + "grad_norm": 3.873138666152954, + "learning_rate": 1.6600000000000002e-06, + "loss": 0.957, + "step": 166 + }, + { + "epoch": 0.0084, + "grad_norm": 6.219630718231201, + "learning_rate": 1.6800000000000002e-06, + "loss": 1.2542, + "step": 168 + }, + { + "epoch": 0.0085, + "grad_norm": 5.211266040802002, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.3436, + "step": 170 + }, + { + "epoch": 0.0086, + "grad_norm": 5.380180835723877, + "learning_rate": 1.72e-06, + "loss": 1.3841, + "step": 172 + }, + { + "epoch": 0.0087, + "grad_norm": 11.519235610961914, + "learning_rate": 1.74e-06, + "loss": 1.0924, + "step": 174 + }, + { + "epoch": 0.0088, + "grad_norm": 5.888484477996826, + "learning_rate": 1.76e-06, + "loss": 1.9876, + "step": 176 + }, + { + "epoch": 0.0089, + "grad_norm": 5.616212844848633, + "learning_rate": 1.7800000000000001e-06, + "loss": 1.8849, + "step": 178 + }, + { + "epoch": 0.009, + "grad_norm": 9.498773574829102, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.8252, + "step": 180 + }, + { + "epoch": 0.0091, + "grad_norm": 9.563142776489258, + "learning_rate": 1.8200000000000002e-06, + "loss": 1.6184, + "step": 182 + }, + { + "epoch": 0.0092, + "grad_norm": 6.5288472175598145, + "learning_rate": 1.8400000000000002e-06, + "loss": 1.609, + "step": 184 + }, + { + "epoch": 0.0093, + "grad_norm": 4.788239002227783, + "learning_rate": 1.8600000000000002e-06, + "loss": 1.1585, + "step": 186 + }, + { + "epoch": 0.0094, + "grad_norm": 6.091237545013428, + "learning_rate": 1.8800000000000002e-06, + "loss": 1.4646, + "step": 188 + }, + { + "epoch": 0.0095, + "grad_norm": 7.6937384605407715, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.1982, + "step": 190 + }, + { + "epoch": 0.0096, + "grad_norm": 12.853693962097168, + "learning_rate": 1.9200000000000003e-06, + "loss": 1.9083, + "step": 192 + }, + { + "epoch": 0.0097, + "grad_norm": 6.350897312164307, + "learning_rate": 1.94e-06, + "loss": 1.0644, + "step": 194 + }, + { + "epoch": 0.0098, + "grad_norm": 12.703025817871094, + "learning_rate": 1.9600000000000003e-06, + "loss": 0.901, + "step": 196 + }, + { + "epoch": 0.0099, + "grad_norm": 9.076923370361328, + "learning_rate": 1.98e-06, + "loss": 1.7538, + "step": 198 + }, + { + "epoch": 0.01, + "grad_norm": 9.356685638427734, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.6516, + "step": 200 + }, + { + "epoch": 0.0101, + "grad_norm": 20.570571899414062, + "learning_rate": 2.02e-06, + "loss": 1.3644, + "step": 202 + }, + { + "epoch": 0.0102, + "grad_norm": 3.941533088684082, + "learning_rate": 2.04e-06, + "loss": 1.2417, + "step": 204 + }, + { + "epoch": 0.0103, + "grad_norm": 5.079562187194824, + "learning_rate": 2.06e-06, + "loss": 1.0266, + "step": 206 + }, + { + "epoch": 0.0104, + "grad_norm": 4.848267078399658, + "learning_rate": 2.08e-06, + "loss": 1.2318, + "step": 208 + }, + { + "epoch": 0.0105, + "grad_norm": 7.622250080108643, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.7981, + "step": 210 + }, + { + "epoch": 0.0106, + "grad_norm": 4.826131343841553, + "learning_rate": 2.12e-06, + "loss": 1.6824, + "step": 212 + }, + { + "epoch": 0.0107, + "grad_norm": 5.336329460144043, + "learning_rate": 2.1400000000000003e-06, + "loss": 1.5293, + "step": 214 + }, + { + "epoch": 0.0108, + "grad_norm": 6.429103851318359, + "learning_rate": 2.16e-06, + "loss": 1.6983, + "step": 216 + }, + { + "epoch": 0.0109, + "grad_norm": 9.482562065124512, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.9429, + "step": 218 + }, + { + "epoch": 0.011, + "grad_norm": 6.3570427894592285, + "learning_rate": 2.2e-06, + "loss": 1.3137, + "step": 220 + }, + { + "epoch": 0.0111, + "grad_norm": 27.207380294799805, + "learning_rate": 2.2200000000000003e-06, + "loss": 0.644, + "step": 222 + }, + { + "epoch": 0.0112, + "grad_norm": 3.4968111515045166, + "learning_rate": 2.24e-06, + "loss": 0.5091, + "step": 224 + }, + { + "epoch": 0.0113, + "grad_norm": 34.037899017333984, + "learning_rate": 2.2600000000000004e-06, + "loss": 2.8512, + "step": 226 + }, + { + "epoch": 0.0114, + "grad_norm": 4.967566013336182, + "learning_rate": 2.28e-06, + "loss": 1.156, + "step": 228 + }, + { + "epoch": 0.0115, + "grad_norm": 12.437668800354004, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.0839, + "step": 230 + }, + { + "epoch": 0.0116, + "grad_norm": 11.843010902404785, + "learning_rate": 2.3200000000000002e-06, + "loss": 1.2498, + "step": 232 + }, + { + "epoch": 0.0117, + "grad_norm": 7.022974014282227, + "learning_rate": 2.3400000000000005e-06, + "loss": 0.8435, + "step": 234 + }, + { + "epoch": 0.0118, + "grad_norm": 6.839353084564209, + "learning_rate": 2.3600000000000003e-06, + "loss": 1.7632, + "step": 236 + }, + { + "epoch": 0.0119, + "grad_norm": 7.099884510040283, + "learning_rate": 2.38e-06, + "loss": 0.8134, + "step": 238 + }, + { + "epoch": 0.012, + "grad_norm": 7.361279487609863, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.7355, + "step": 240 + }, + { + "epoch": 0.0121, + "grad_norm": 5.530487060546875, + "learning_rate": 2.42e-06, + "loss": 1.2372, + "step": 242 + }, + { + "epoch": 0.0122, + "grad_norm": 9.614493370056152, + "learning_rate": 2.4400000000000004e-06, + "loss": 0.9642, + "step": 244 + }, + { + "epoch": 0.0123, + "grad_norm": 4.956291675567627, + "learning_rate": 2.46e-06, + "loss": 1.1808, + "step": 246 + }, + { + "epoch": 0.0124, + "grad_norm": 13.800895690917969, + "learning_rate": 2.4800000000000004e-06, + "loss": 0.8156, + "step": 248 + }, + { + "epoch": 0.0125, + "grad_norm": 5.915309429168701, + "learning_rate": 2.5e-06, + "loss": 1.1592, + "step": 250 + }, + { + "epoch": 0.0126, + "grad_norm": 8.119839668273926, + "learning_rate": 2.52e-06, + "loss": 1.4902, + "step": 252 + }, + { + "epoch": 0.0127, + "grad_norm": 8.106505393981934, + "learning_rate": 2.5400000000000002e-06, + "loss": 0.5298, + "step": 254 + }, + { + "epoch": 0.0128, + "grad_norm": 3.3539164066314697, + "learning_rate": 2.56e-06, + "loss": 0.6365, + "step": 256 + }, + { + "epoch": 0.0129, + "grad_norm": 5.755402565002441, + "learning_rate": 2.5800000000000003e-06, + "loss": 1.2824, + "step": 258 + }, + { + "epoch": 0.013, + "grad_norm": 10.709782600402832, + "learning_rate": 2.6e-06, + "loss": 0.8058, + "step": 260 + }, + { + "epoch": 0.0131, + "grad_norm": 9.427067756652832, + "learning_rate": 2.6200000000000003e-06, + "loss": 0.9425, + "step": 262 + }, + { + "epoch": 0.0132, + "grad_norm": 17.854101181030273, + "learning_rate": 2.64e-06, + "loss": 0.8965, + "step": 264 + }, + { + "epoch": 0.0133, + "grad_norm": 11.431461334228516, + "learning_rate": 2.6600000000000004e-06, + "loss": 1.3146, + "step": 266 + }, + { + "epoch": 0.0134, + "grad_norm": 7.187524318695068, + "learning_rate": 2.68e-06, + "loss": 0.928, + "step": 268 + }, + { + "epoch": 0.0135, + "grad_norm": 12.783543586730957, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.1136, + "step": 270 + }, + { + "epoch": 0.0136, + "grad_norm": 12.117506980895996, + "learning_rate": 2.7200000000000002e-06, + "loss": 1.2816, + "step": 272 + }, + { + "epoch": 0.0137, + "grad_norm": 6.339529991149902, + "learning_rate": 2.7400000000000004e-06, + "loss": 1.9676, + "step": 274 + }, + { + "epoch": 0.0138, + "grad_norm": 8.248180389404297, + "learning_rate": 2.7600000000000003e-06, + "loss": 1.4926, + "step": 276 + }, + { + "epoch": 0.0139, + "grad_norm": 7.3455891609191895, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.7308, + "step": 278 + }, + { + "epoch": 0.014, + "grad_norm": 4.847438335418701, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.0023, + "step": 280 + }, + { + "epoch": 0.0141, + "grad_norm": 12.257122993469238, + "learning_rate": 2.82e-06, + "loss": 1.1475, + "step": 282 + }, + { + "epoch": 0.0142, + "grad_norm": 7.57803201675415, + "learning_rate": 2.84e-06, + "loss": 1.155, + "step": 284 + }, + { + "epoch": 0.0143, + "grad_norm": 8.113622665405273, + "learning_rate": 2.86e-06, + "loss": 0.763, + "step": 286 + }, + { + "epoch": 0.0144, + "grad_norm": 5.788171291351318, + "learning_rate": 2.88e-06, + "loss": 0.9653, + "step": 288 + }, + { + "epoch": 0.0145, + "grad_norm": 6.072340488433838, + "learning_rate": 2.9e-06, + "loss": 1.4727, + "step": 290 + }, + { + "epoch": 0.0146, + "grad_norm": 9.972478866577148, + "learning_rate": 2.92e-06, + "loss": 0.849, + "step": 292 + }, + { + "epoch": 0.0147, + "grad_norm": 9.08491325378418, + "learning_rate": 2.9400000000000002e-06, + "loss": 1.662, + "step": 294 + }, + { + "epoch": 0.0148, + "grad_norm": 10.545151710510254, + "learning_rate": 2.96e-06, + "loss": 1.1428, + "step": 296 + }, + { + "epoch": 0.0149, + "grad_norm": 23.00973892211914, + "learning_rate": 2.9800000000000003e-06, + "loss": 2.3398, + "step": 298 + }, + { + "epoch": 0.015, + "grad_norm": 12.130746841430664, + "learning_rate": 3e-06, + "loss": 1.0533, + "step": 300 + }, + { + "epoch": 0.0151, + "grad_norm": 6.054661750793457, + "learning_rate": 3.0200000000000003e-06, + "loss": 0.8761, + "step": 302 + }, + { + "epoch": 0.0152, + "grad_norm": 12.52880573272705, + "learning_rate": 3.04e-06, + "loss": 1.9947, + "step": 304 + }, + { + "epoch": 0.0153, + "grad_norm": 10.304773330688477, + "learning_rate": 3.0600000000000003e-06, + "loss": 1.6924, + "step": 306 + }, + { + "epoch": 0.0154, + "grad_norm": 10.51093578338623, + "learning_rate": 3.08e-06, + "loss": 1.0099, + "step": 308 + }, + { + "epoch": 0.0155, + "grad_norm": 8.752665519714355, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.2872, + "step": 310 + }, + { + "epoch": 0.0156, + "grad_norm": 8.910305976867676, + "learning_rate": 3.12e-06, + "loss": 1.4038, + "step": 312 + }, + { + "epoch": 0.0157, + "grad_norm": 6.232647895812988, + "learning_rate": 3.1400000000000004e-06, + "loss": 0.9452, + "step": 314 + }, + { + "epoch": 0.0158, + "grad_norm": 6.129457473754883, + "learning_rate": 3.1600000000000002e-06, + "loss": 1.4469, + "step": 316 + }, + { + "epoch": 0.0159, + "grad_norm": 7.6200690269470215, + "learning_rate": 3.1800000000000005e-06, + "loss": 1.1104, + "step": 318 + }, + { + "epoch": 0.016, + "grad_norm": 7.369961738586426, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6276, + "step": 320 + }, + { + "epoch": 0.0161, + "grad_norm": 7.526132106781006, + "learning_rate": 3.2200000000000005e-06, + "loss": 1.1214, + "step": 322 + }, + { + "epoch": 0.0162, + "grad_norm": 7.033531188964844, + "learning_rate": 3.2400000000000003e-06, + "loss": 1.1249, + "step": 324 + }, + { + "epoch": 0.0163, + "grad_norm": 5.38082218170166, + "learning_rate": 3.2600000000000006e-06, + "loss": 1.1197, + "step": 326 + }, + { + "epoch": 0.0164, + "grad_norm": 15.086496353149414, + "learning_rate": 3.2800000000000004e-06, + "loss": 0.7579, + "step": 328 + }, + { + "epoch": 0.0165, + "grad_norm": 8.862452507019043, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.2822, + "step": 330 + }, + { + "epoch": 0.0166, + "grad_norm": 28.40138053894043, + "learning_rate": 3.3200000000000004e-06, + "loss": 2.4472, + "step": 332 + }, + { + "epoch": 0.0167, + "grad_norm": 13.816384315490723, + "learning_rate": 3.3400000000000006e-06, + "loss": 1.5625, + "step": 334 + }, + { + "epoch": 0.0168, + "grad_norm": 9.021915435791016, + "learning_rate": 3.3600000000000004e-06, + "loss": 0.7949, + "step": 336 + }, + { + "epoch": 0.0169, + "grad_norm": 4.971908092498779, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.6226, + "step": 338 + }, + { + "epoch": 0.017, + "grad_norm": 11.621842384338379, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9126, + "step": 340 + }, + { + "epoch": 0.0171, + "grad_norm": 5.7583441734313965, + "learning_rate": 3.4200000000000007e-06, + "loss": 1.348, + "step": 342 + }, + { + "epoch": 0.0172, + "grad_norm": 10.133981704711914, + "learning_rate": 3.44e-06, + "loss": 1.0839, + "step": 344 + }, + { + "epoch": 0.0173, + "grad_norm": 41.400352478027344, + "learning_rate": 3.46e-06, + "loss": 1.7104, + "step": 346 + }, + { + "epoch": 0.0174, + "grad_norm": 9.705741882324219, + "learning_rate": 3.48e-06, + "loss": 1.3744, + "step": 348 + }, + { + "epoch": 0.0175, + "grad_norm": 5.28177547454834, + "learning_rate": 3.5e-06, + "loss": 1.2249, + "step": 350 + }, + { + "epoch": 0.0176, + "grad_norm": 11.901725769042969, + "learning_rate": 3.52e-06, + "loss": 1.6618, + "step": 352 + }, + { + "epoch": 0.0177, + "grad_norm": 5.607026100158691, + "learning_rate": 3.54e-06, + "loss": 0.6614, + "step": 354 + }, + { + "epoch": 0.0178, + "grad_norm": 6.4344658851623535, + "learning_rate": 3.5600000000000002e-06, + "loss": 1.6989, + "step": 356 + }, + { + "epoch": 0.0179, + "grad_norm": 15.286064147949219, + "learning_rate": 3.58e-06, + "loss": 1.6266, + "step": 358 + }, + { + "epoch": 0.018, + "grad_norm": 9.539347648620605, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.4106, + "step": 360 + }, + { + "epoch": 0.0181, + "grad_norm": 5.025078773498535, + "learning_rate": 3.62e-06, + "loss": 0.7202, + "step": 362 + }, + { + "epoch": 0.0182, + "grad_norm": 6.700334548950195, + "learning_rate": 3.6400000000000003e-06, + "loss": 2.1285, + "step": 364 + }, + { + "epoch": 0.0183, + "grad_norm": 5.544528007507324, + "learning_rate": 3.66e-06, + "loss": 0.846, + "step": 366 + }, + { + "epoch": 0.0184, + "grad_norm": 13.229118347167969, + "learning_rate": 3.6800000000000003e-06, + "loss": 1.9301, + "step": 368 + }, + { + "epoch": 0.0185, + "grad_norm": 6.799722671508789, + "learning_rate": 3.7e-06, + "loss": 0.9754, + "step": 370 + }, + { + "epoch": 0.0186, + "grad_norm": 8.992018699645996, + "learning_rate": 3.7200000000000004e-06, + "loss": 1.1862, + "step": 372 + }, + { + "epoch": 0.0187, + "grad_norm": 26.713502883911133, + "learning_rate": 3.74e-06, + "loss": 2.6847, + "step": 374 + }, + { + "epoch": 0.0188, + "grad_norm": 7.295320510864258, + "learning_rate": 3.7600000000000004e-06, + "loss": 1.5353, + "step": 376 + }, + { + "epoch": 0.0189, + "grad_norm": 16.028779983520508, + "learning_rate": 3.7800000000000002e-06, + "loss": 1.559, + "step": 378 + }, + { + "epoch": 0.019, + "grad_norm": 17.779096603393555, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.6329, + "step": 380 + }, + { + "epoch": 0.0191, + "grad_norm": 0.6413224935531616, + "learning_rate": 3.820000000000001e-06, + "loss": 0.742, + "step": 382 + }, + { + "epoch": 0.0192, + "grad_norm": 12.411666870117188, + "learning_rate": 3.8400000000000005e-06, + "loss": 0.3629, + "step": 384 + }, + { + "epoch": 0.0193, + "grad_norm": 6.096014976501465, + "learning_rate": 3.86e-06, + "loss": 1.2026, + "step": 386 + }, + { + "epoch": 0.0194, + "grad_norm": 12.838081359863281, + "learning_rate": 3.88e-06, + "loss": 1.5524, + "step": 388 + }, + { + "epoch": 0.0195, + "grad_norm": 5.8187785148620605, + "learning_rate": 3.900000000000001e-06, + "loss": 1.6047, + "step": 390 + }, + { + "epoch": 0.0196, + "grad_norm": 11.065489768981934, + "learning_rate": 3.920000000000001e-06, + "loss": 1.0967, + "step": 392 + }, + { + "epoch": 0.0197, + "grad_norm": 8.01314640045166, + "learning_rate": 3.94e-06, + "loss": 0.756, + "step": 394 + }, + { + "epoch": 0.0198, + "grad_norm": 7.341931343078613, + "learning_rate": 3.96e-06, + "loss": 1.231, + "step": 396 + }, + { + "epoch": 0.0199, + "grad_norm": 8.558012962341309, + "learning_rate": 3.980000000000001e-06, + "loss": 0.9433, + "step": 398 + }, + { + "epoch": 0.02, + "grad_norm": 3.676767349243164, + "learning_rate": 4.000000000000001e-06, + "loss": 1.2199, + "step": 400 + }, + { + "epoch": 0.0201, + "grad_norm": 13.688055038452148, + "learning_rate": 4.0200000000000005e-06, + "loss": 1.3623, + "step": 402 + }, + { + "epoch": 0.0202, + "grad_norm": 11.50728702545166, + "learning_rate": 4.04e-06, + "loss": 1.0583, + "step": 404 + }, + { + "epoch": 0.0203, + "grad_norm": 8.873517036437988, + "learning_rate": 4.060000000000001e-06, + "loss": 1.7395, + "step": 406 + }, + { + "epoch": 0.0204, + "grad_norm": 6.274494171142578, + "learning_rate": 4.08e-06, + "loss": 1.6023, + "step": 408 + }, + { + "epoch": 0.0205, + "grad_norm": 5.957613945007324, + "learning_rate": 4.1e-06, + "loss": 1.3606, + "step": 410 + }, + { + "epoch": 0.0206, + "grad_norm": 18.59316062927246, + "learning_rate": 4.12e-06, + "loss": 1.5108, + "step": 412 + }, + { + "epoch": 0.0207, + "grad_norm": 7.973069667816162, + "learning_rate": 4.14e-06, + "loss": 1.4772, + "step": 414 + }, + { + "epoch": 0.0208, + "grad_norm": 7.352391719818115, + "learning_rate": 4.16e-06, + "loss": 0.9461, + "step": 416 + }, + { + "epoch": 0.0209, + "grad_norm": 7.954921722412109, + "learning_rate": 4.18e-06, + "loss": 0.9984, + "step": 418 + }, + { + "epoch": 0.021, + "grad_norm": 3.9163498878479004, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.6017, + "step": 420 + }, + { + "epoch": 0.0211, + "grad_norm": 8.084186553955078, + "learning_rate": 4.22e-06, + "loss": 1.5281, + "step": 422 + }, + { + "epoch": 0.0212, + "grad_norm": 6.099013805389404, + "learning_rate": 4.24e-06, + "loss": 1.1283, + "step": 424 + }, + { + "epoch": 0.0213, + "grad_norm": 17.58872413635254, + "learning_rate": 4.26e-06, + "loss": 1.5531, + "step": 426 + }, + { + "epoch": 0.0214, + "grad_norm": 16.349336624145508, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.782, + "step": 428 + }, + { + "epoch": 0.0215, + "grad_norm": 4.779919147491455, + "learning_rate": 4.3e-06, + "loss": 0.7865, + "step": 430 + }, + { + "epoch": 0.0216, + "grad_norm": 11.017399787902832, + "learning_rate": 4.32e-06, + "loss": 1.5758, + "step": 432 + }, + { + "epoch": 0.0217, + "grad_norm": 8.038894653320312, + "learning_rate": 4.34e-06, + "loss": 1.6224, + "step": 434 + }, + { + "epoch": 0.0218, + "grad_norm": 13.095144271850586, + "learning_rate": 4.360000000000001e-06, + "loss": 1.2849, + "step": 436 + }, + { + "epoch": 0.0219, + "grad_norm": 15.031999588012695, + "learning_rate": 4.38e-06, + "loss": 1.6006, + "step": 438 + }, + { + "epoch": 0.022, + "grad_norm": 7.249359607696533, + "learning_rate": 4.4e-06, + "loss": 1.5067, + "step": 440 + }, + { + "epoch": 0.0221, + "grad_norm": 10.482078552246094, + "learning_rate": 4.42e-06, + "loss": 4.529, + "step": 442 + }, + { + "epoch": 0.0222, + "grad_norm": 11.821629524230957, + "learning_rate": 4.440000000000001e-06, + "loss": 1.3312, + "step": 444 + }, + { + "epoch": 0.0223, + "grad_norm": 12.11413860321045, + "learning_rate": 4.4600000000000005e-06, + "loss": 1.182, + "step": 446 + }, + { + "epoch": 0.0224, + "grad_norm": 19.612667083740234, + "learning_rate": 4.48e-06, + "loss": 1.6559, + "step": 448 + }, + { + "epoch": 0.0225, + "grad_norm": 7.073517322540283, + "learning_rate": 4.5e-06, + "loss": 2.4962, + "step": 450 + }, + { + "epoch": 0.0226, + "grad_norm": 5.668583869934082, + "learning_rate": 4.520000000000001e-06, + "loss": 1.0803, + "step": 452 + }, + { + "epoch": 0.0227, + "grad_norm": 8.523815155029297, + "learning_rate": 4.540000000000001e-06, + "loss": 1.1874, + "step": 454 + }, + { + "epoch": 0.0228, + "grad_norm": 8.882444381713867, + "learning_rate": 4.56e-06, + "loss": 1.0706, + "step": 456 + }, + { + "epoch": 0.0229, + "grad_norm": 4.8277268409729, + "learning_rate": 4.58e-06, + "loss": 1.4402, + "step": 458 + }, + { + "epoch": 0.023, + "grad_norm": 5.825888156890869, + "learning_rate": 4.600000000000001e-06, + "loss": 1.2058, + "step": 460 + }, + { + "epoch": 0.0231, + "grad_norm": 6.73979377746582, + "learning_rate": 4.620000000000001e-06, + "loss": 0.7222, + "step": 462 + }, + { + "epoch": 0.0232, + "grad_norm": 13.041484832763672, + "learning_rate": 4.6400000000000005e-06, + "loss": 1.4789, + "step": 464 + }, + { + "epoch": 0.0233, + "grad_norm": 5.967507839202881, + "learning_rate": 4.66e-06, + "loss": 1.2472, + "step": 466 + }, + { + "epoch": 0.0234, + "grad_norm": 9.755995750427246, + "learning_rate": 4.680000000000001e-06, + "loss": 1.1654, + "step": 468 + }, + { + "epoch": 0.0235, + "grad_norm": 6.786223411560059, + "learning_rate": 4.7e-06, + "loss": 1.2485, + "step": 470 + }, + { + "epoch": 0.0236, + "grad_norm": 11.694995880126953, + "learning_rate": 4.7200000000000005e-06, + "loss": 1.4958, + "step": 472 + }, + { + "epoch": 0.0237, + "grad_norm": 6.679696559906006, + "learning_rate": 4.74e-06, + "loss": 0.7143, + "step": 474 + }, + { + "epoch": 0.0238, + "grad_norm": 7.714452743530273, + "learning_rate": 4.76e-06, + "loss": 1.7157, + "step": 476 + }, + { + "epoch": 0.0239, + "grad_norm": 8.948019981384277, + "learning_rate": 4.78e-06, + "loss": 1.283, + "step": 478 + }, + { + "epoch": 0.024, + "grad_norm": 5.995326995849609, + "learning_rate": 4.800000000000001e-06, + "loss": 1.354, + "step": 480 + }, + { + "epoch": 0.0241, + "grad_norm": 6.972031593322754, + "learning_rate": 4.8200000000000004e-06, + "loss": 1.3696, + "step": 482 + }, + { + "epoch": 0.0242, + "grad_norm": 6.997723579406738, + "learning_rate": 4.84e-06, + "loss": 1.4976, + "step": 484 + }, + { + "epoch": 0.0243, + "grad_norm": 8.927756309509277, + "learning_rate": 4.86e-06, + "loss": 1.1605, + "step": 486 + }, + { + "epoch": 0.0244, + "grad_norm": 6.939820289611816, + "learning_rate": 4.880000000000001e-06, + "loss": 1.848, + "step": 488 + }, + { + "epoch": 0.0245, + "grad_norm": 16.713390350341797, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.1161, + "step": 490 + }, + { + "epoch": 0.0246, + "grad_norm": 6.598463535308838, + "learning_rate": 4.92e-06, + "loss": 1.2412, + "step": 492 + }, + { + "epoch": 0.0247, + "grad_norm": 3.242373466491699, + "learning_rate": 4.94e-06, + "loss": 0.8841, + "step": 494 + }, + { + "epoch": 0.0248, + "grad_norm": 7.1952643394470215, + "learning_rate": 4.960000000000001e-06, + "loss": 1.3215, + "step": 496 + }, + { + "epoch": 0.0249, + "grad_norm": 33.82792282104492, + "learning_rate": 4.980000000000001e-06, + "loss": 2.1522, + "step": 498 + }, + { + "epoch": 0.025, + "grad_norm": 2.9614458084106445, + "learning_rate": 5e-06, + "loss": 0.8969, + "step": 500 + }, + { + "epoch": 0.0251, + "grad_norm": 10.754796981811523, + "learning_rate": 5.02e-06, + "loss": 1.1372, + "step": 502 + }, + { + "epoch": 0.0252, + "grad_norm": 3.6312789916992188, + "learning_rate": 5.04e-06, + "loss": 0.3397, + "step": 504 + }, + { + "epoch": 0.0253, + "grad_norm": 6.178099155426025, + "learning_rate": 5.060000000000001e-06, + "loss": 1.1971, + "step": 506 + }, + { + "epoch": 0.0254, + "grad_norm": 6.608092784881592, + "learning_rate": 5.0800000000000005e-06, + "loss": 1.3212, + "step": 508 + }, + { + "epoch": 0.0255, + "grad_norm": 7.020211696624756, + "learning_rate": 5.1e-06, + "loss": 1.1342, + "step": 510 + }, + { + "epoch": 0.0256, + "grad_norm": 3.805220603942871, + "learning_rate": 5.12e-06, + "loss": 1.3933, + "step": 512 + }, + { + "epoch": 0.0257, + "grad_norm": 7.273865222930908, + "learning_rate": 5.140000000000001e-06, + "loss": 0.9579, + "step": 514 + }, + { + "epoch": 0.0258, + "grad_norm": 7.474109649658203, + "learning_rate": 5.1600000000000006e-06, + "loss": 1.1737, + "step": 516 + }, + { + "epoch": 0.0259, + "grad_norm": 6.4864420890808105, + "learning_rate": 5.18e-06, + "loss": 1.1857, + "step": 518 + }, + { + "epoch": 0.026, + "grad_norm": 5.540512561798096, + "learning_rate": 5.2e-06, + "loss": 1.302, + "step": 520 + }, + { + "epoch": 0.0261, + "grad_norm": 17.714309692382812, + "learning_rate": 5.220000000000001e-06, + "loss": 1.1869, + "step": 522 + }, + { + "epoch": 0.0262, + "grad_norm": 9.72828197479248, + "learning_rate": 5.240000000000001e-06, + "loss": 1.6285, + "step": 524 + }, + { + "epoch": 0.0263, + "grad_norm": 4.576055526733398, + "learning_rate": 5.2600000000000005e-06, + "loss": 1.4564, + "step": 526 + }, + { + "epoch": 0.0264, + "grad_norm": 10.650053977966309, + "learning_rate": 5.28e-06, + "loss": 1.151, + "step": 528 + }, + { + "epoch": 0.0265, + "grad_norm": 9.198973655700684, + "learning_rate": 5.300000000000001e-06, + "loss": 1.5071, + "step": 530 + }, + { + "epoch": 0.0266, + "grad_norm": 14.383681297302246, + "learning_rate": 5.320000000000001e-06, + "loss": 1.0972, + "step": 532 + }, + { + "epoch": 0.0267, + "grad_norm": 16.428918838500977, + "learning_rate": 5.3400000000000005e-06, + "loss": 1.3456, + "step": 534 + }, + { + "epoch": 0.0268, + "grad_norm": 25.257295608520508, + "learning_rate": 5.36e-06, + "loss": 1.2413, + "step": 536 + }, + { + "epoch": 0.0269, + "grad_norm": 5.658939838409424, + "learning_rate": 5.380000000000001e-06, + "loss": 1.2484, + "step": 538 + }, + { + "epoch": 0.027, + "grad_norm": 23.65157127380371, + "learning_rate": 5.400000000000001e-06, + "loss": 1.4635, + "step": 540 + }, + { + "epoch": 0.0271, + "grad_norm": 13.021081924438477, + "learning_rate": 5.420000000000001e-06, + "loss": 0.5639, + "step": 542 + }, + { + "epoch": 0.0272, + "grad_norm": 5.461696624755859, + "learning_rate": 5.4400000000000004e-06, + "loss": 1.241, + "step": 544 + }, + { + "epoch": 0.0273, + "grad_norm": 4.397319793701172, + "learning_rate": 5.460000000000001e-06, + "loss": 0.442, + "step": 546 + }, + { + "epoch": 0.0274, + "grad_norm": 12.093127250671387, + "learning_rate": 5.480000000000001e-06, + "loss": 1.2586, + "step": 548 + }, + { + "epoch": 0.0275, + "grad_norm": 12.650091171264648, + "learning_rate": 5.500000000000001e-06, + "loss": 1.2203, + "step": 550 + }, + { + "epoch": 0.0276, + "grad_norm": 9.32306957244873, + "learning_rate": 5.5200000000000005e-06, + "loss": 0.7574, + "step": 552 + }, + { + "epoch": 0.0277, + "grad_norm": 14.518628120422363, + "learning_rate": 5.540000000000001e-06, + "loss": 2.2571, + "step": 554 + }, + { + "epoch": 0.0278, + "grad_norm": 5.79930305480957, + "learning_rate": 5.560000000000001e-06, + "loss": 1.761, + "step": 556 + }, + { + "epoch": 0.0279, + "grad_norm": 15.645671844482422, + "learning_rate": 5.580000000000001e-06, + "loss": 1.3673, + "step": 558 + }, + { + "epoch": 0.028, + "grad_norm": 3.7598376274108887, + "learning_rate": 5.600000000000001e-06, + "loss": 0.7111, + "step": 560 + }, + { + "epoch": 0.0281, + "grad_norm": 13.239677429199219, + "learning_rate": 5.620000000000001e-06, + "loss": 1.292, + "step": 562 + }, + { + "epoch": 0.0282, + "grad_norm": 3.9817440509796143, + "learning_rate": 5.64e-06, + "loss": 0.7303, + "step": 564 + }, + { + "epoch": 0.0283, + "grad_norm": 11.066258430480957, + "learning_rate": 5.66e-06, + "loss": 1.6701, + "step": 566 + }, + { + "epoch": 0.0284, + "grad_norm": 7.764070510864258, + "learning_rate": 5.68e-06, + "loss": 1.1854, + "step": 568 + }, + { + "epoch": 0.0285, + "grad_norm": 20.231374740600586, + "learning_rate": 5.7e-06, + "loss": 1.7067, + "step": 570 + }, + { + "epoch": 0.0286, + "grad_norm": 11.376474380493164, + "learning_rate": 5.72e-06, + "loss": 2.187, + "step": 572 + }, + { + "epoch": 0.0287, + "grad_norm": 10.824491500854492, + "learning_rate": 5.74e-06, + "loss": 1.3206, + "step": 574 + }, + { + "epoch": 0.0288, + "grad_norm": 12.41994857788086, + "learning_rate": 5.76e-06, + "loss": 1.5109, + "step": 576 + }, + { + "epoch": 0.0289, + "grad_norm": 8.523578643798828, + "learning_rate": 5.78e-06, + "loss": 0.8466, + "step": 578 + }, + { + "epoch": 0.029, + "grad_norm": 9.072184562683105, + "learning_rate": 5.8e-06, + "loss": 1.2843, + "step": 580 + }, + { + "epoch": 0.0291, + "grad_norm": 8.727999687194824, + "learning_rate": 5.82e-06, + "loss": 1.8068, + "step": 582 + }, + { + "epoch": 0.0292, + "grad_norm": 8.394844055175781, + "learning_rate": 5.84e-06, + "loss": 1.8053, + "step": 584 + }, + { + "epoch": 0.0293, + "grad_norm": 6.108077526092529, + "learning_rate": 5.86e-06, + "loss": 3.1004, + "step": 586 + }, + { + "epoch": 0.0294, + "grad_norm": 4.474924087524414, + "learning_rate": 5.8800000000000005e-06, + "loss": 1.4132, + "step": 588 + }, + { + "epoch": 0.0295, + "grad_norm": 7.37600564956665, + "learning_rate": 5.9e-06, + "loss": 1.0191, + "step": 590 + }, + { + "epoch": 0.0296, + "grad_norm": 6.244680881500244, + "learning_rate": 5.92e-06, + "loss": 1.4568, + "step": 592 + }, + { + "epoch": 0.0297, + "grad_norm": 12.595189094543457, + "learning_rate": 5.94e-06, + "loss": 0.535, + "step": 594 + }, + { + "epoch": 0.0298, + "grad_norm": 4.573352813720703, + "learning_rate": 5.9600000000000005e-06, + "loss": 0.9191, + "step": 596 + }, + { + "epoch": 0.0299, + "grad_norm": 10.911251068115234, + "learning_rate": 5.98e-06, + "loss": 1.7577, + "step": 598 + }, + { + "epoch": 0.03, + "grad_norm": 4.806443214416504, + "learning_rate": 6e-06, + "loss": 1.1699, + "step": 600 + }, + { + "epoch": 0.0301, + "grad_norm": 5.119315147399902, + "learning_rate": 6.02e-06, + "loss": 1.1427, + "step": 602 + }, + { + "epoch": 0.0302, + "grad_norm": 4.651810646057129, + "learning_rate": 6.040000000000001e-06, + "loss": 1.0717, + "step": 604 + }, + { + "epoch": 0.0303, + "grad_norm": 6.3296685218811035, + "learning_rate": 6.0600000000000004e-06, + "loss": 1.3626, + "step": 606 + }, + { + "epoch": 0.0304, + "grad_norm": 6.287442207336426, + "learning_rate": 6.08e-06, + "loss": 0.9757, + "step": 608 + }, + { + "epoch": 0.0305, + "grad_norm": 5.116812705993652, + "learning_rate": 6.1e-06, + "loss": 1.0724, + "step": 610 + }, + { + "epoch": 0.0306, + "grad_norm": 3.7264227867126465, + "learning_rate": 6.120000000000001e-06, + "loss": 0.9469, + "step": 612 + }, + { + "epoch": 0.0307, + "grad_norm": 5.087530136108398, + "learning_rate": 6.1400000000000005e-06, + "loss": 0.9895, + "step": 614 + }, + { + "epoch": 0.0308, + "grad_norm": 5.696181297302246, + "learning_rate": 6.16e-06, + "loss": 1.3028, + "step": 616 + }, + { + "epoch": 0.0309, + "grad_norm": 4.104210376739502, + "learning_rate": 6.18e-06, + "loss": 0.844, + "step": 618 + }, + { + "epoch": 0.031, + "grad_norm": 19.02704620361328, + "learning_rate": 6.200000000000001e-06, + "loss": 1.1628, + "step": 620 + }, + { + "epoch": 0.0311, + "grad_norm": 5.110633373260498, + "learning_rate": 6.220000000000001e-06, + "loss": 3.447, + "step": 622 + }, + { + "epoch": 0.0312, + "grad_norm": 22.753070831298828, + "learning_rate": 6.24e-06, + "loss": 1.3084, + "step": 624 + }, + { + "epoch": 0.0313, + "grad_norm": 15.797767639160156, + "learning_rate": 6.26e-06, + "loss": 0.4142, + "step": 626 + }, + { + "epoch": 0.0314, + "grad_norm": 11.590780258178711, + "learning_rate": 6.280000000000001e-06, + "loss": 1.3215, + "step": 628 + }, + { + "epoch": 0.0315, + "grad_norm": 13.721484184265137, + "learning_rate": 6.300000000000001e-06, + "loss": 1.5172, + "step": 630 + }, + { + "epoch": 0.0316, + "grad_norm": 9.264723777770996, + "learning_rate": 6.3200000000000005e-06, + "loss": 1.1937, + "step": 632 + }, + { + "epoch": 0.0317, + "grad_norm": 6.040861129760742, + "learning_rate": 6.34e-06, + "loss": 0.9655, + "step": 634 + }, + { + "epoch": 0.0318, + "grad_norm": 11.548483848571777, + "learning_rate": 6.360000000000001e-06, + "loss": 1.6599, + "step": 636 + }, + { + "epoch": 0.0319, + "grad_norm": 9.334105491638184, + "learning_rate": 6.380000000000001e-06, + "loss": 0.612, + "step": 638 + }, + { + "epoch": 0.032, + "grad_norm": 9.61484146118164, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.512, + "step": 640 + }, + { + "epoch": 0.0321, + "grad_norm": 14.151845932006836, + "learning_rate": 6.42e-06, + "loss": 0.908, + "step": 642 + }, + { + "epoch": 0.0322, + "grad_norm": 7.992030143737793, + "learning_rate": 6.440000000000001e-06, + "loss": 1.4064, + "step": 644 + }, + { + "epoch": 0.0323, + "grad_norm": 12.73095989227295, + "learning_rate": 6.460000000000001e-06, + "loss": 1.3909, + "step": 646 + }, + { + "epoch": 0.0324, + "grad_norm": 11.843143463134766, + "learning_rate": 6.480000000000001e-06, + "loss": 1.0298, + "step": 648 + }, + { + "epoch": 0.0325, + "grad_norm": 5.982705593109131, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.6385, + "step": 650 + }, + { + "epoch": 0.0326, + "grad_norm": 12.112163543701172, + "learning_rate": 6.520000000000001e-06, + "loss": 1.4256, + "step": 652 + }, + { + "epoch": 0.0327, + "grad_norm": 7.312185764312744, + "learning_rate": 6.540000000000001e-06, + "loss": 0.8345, + "step": 654 + }, + { + "epoch": 0.0328, + "grad_norm": 3.142749309539795, + "learning_rate": 6.560000000000001e-06, + "loss": 0.9221, + "step": 656 + }, + { + "epoch": 0.0329, + "grad_norm": 8.15288257598877, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.85, + "step": 658 + }, + { + "epoch": 0.033, + "grad_norm": 4.124783039093018, + "learning_rate": 6.600000000000001e-06, + "loss": 0.591, + "step": 660 + }, + { + "epoch": 0.0331, + "grad_norm": 13.599148750305176, + "learning_rate": 6.620000000000001e-06, + "loss": 1.5663, + "step": 662 + }, + { + "epoch": 0.0332, + "grad_norm": 11.211814880371094, + "learning_rate": 6.640000000000001e-06, + "loss": 1.8206, + "step": 664 + }, + { + "epoch": 0.0333, + "grad_norm": 17.080181121826172, + "learning_rate": 6.660000000000001e-06, + "loss": 1.9961, + "step": 666 + }, + { + "epoch": 0.0334, + "grad_norm": 9.77067756652832, + "learning_rate": 6.680000000000001e-06, + "loss": 1.5384, + "step": 668 + }, + { + "epoch": 0.0335, + "grad_norm": 5.493101596832275, + "learning_rate": 6.700000000000001e-06, + "loss": 0.7224, + "step": 670 + }, + { + "epoch": 0.0336, + "grad_norm": 9.874269485473633, + "learning_rate": 6.720000000000001e-06, + "loss": 1.5328, + "step": 672 + }, + { + "epoch": 0.0337, + "grad_norm": 4.794497013092041, + "learning_rate": 6.740000000000001e-06, + "loss": 0.9961, + "step": 674 + }, + { + "epoch": 0.0338, + "grad_norm": 27.30553436279297, + "learning_rate": 6.760000000000001e-06, + "loss": 1.9316, + "step": 676 + }, + { + "epoch": 0.0339, + "grad_norm": 15.157391548156738, + "learning_rate": 6.780000000000001e-06, + "loss": 1.0405, + "step": 678 + }, + { + "epoch": 0.034, + "grad_norm": 7.838613033294678, + "learning_rate": 6.800000000000001e-06, + "loss": 1.1751, + "step": 680 + }, + { + "epoch": 0.0341, + "grad_norm": 13.477240562438965, + "learning_rate": 6.820000000000001e-06, + "loss": 0.9936, + "step": 682 + }, + { + "epoch": 0.0342, + "grad_norm": 24.373586654663086, + "learning_rate": 6.8400000000000014e-06, + "loss": 2.1205, + "step": 684 + }, + { + "epoch": 0.0343, + "grad_norm": 5.127707004547119, + "learning_rate": 6.860000000000001e-06, + "loss": 0.7371, + "step": 686 + }, + { + "epoch": 0.0344, + "grad_norm": 9.504162788391113, + "learning_rate": 6.88e-06, + "loss": 1.2083, + "step": 688 + }, + { + "epoch": 0.0345, + "grad_norm": 13.711008071899414, + "learning_rate": 6.9e-06, + "loss": 1.2185, + "step": 690 + }, + { + "epoch": 0.0346, + "grad_norm": 11.118414878845215, + "learning_rate": 6.92e-06, + "loss": 1.7168, + "step": 692 + }, + { + "epoch": 0.0347, + "grad_norm": 1.7871979475021362, + "learning_rate": 6.9400000000000005e-06, + "loss": 0.6188, + "step": 694 + }, + { + "epoch": 0.0348, + "grad_norm": 6.365922451019287, + "learning_rate": 6.96e-06, + "loss": 1.2344, + "step": 696 + }, + { + "epoch": 0.0349, + "grad_norm": 9.172911643981934, + "learning_rate": 6.98e-06, + "loss": 1.1108, + "step": 698 + }, + { + "epoch": 0.035, + "grad_norm": 8.224485397338867, + "learning_rate": 7e-06, + "loss": 1.6197, + "step": 700 + }, + { + "epoch": 0.0351, + "grad_norm": 19.158857345581055, + "learning_rate": 7.0200000000000006e-06, + "loss": 1.7009, + "step": 702 + }, + { + "epoch": 0.0352, + "grad_norm": 4.4779372215271, + "learning_rate": 7.04e-06, + "loss": 0.5423, + "step": 704 + }, + { + "epoch": 0.0353, + "grad_norm": 3.2915830612182617, + "learning_rate": 7.06e-06, + "loss": 1.0732, + "step": 706 + }, + { + "epoch": 0.0354, + "grad_norm": 5.723826885223389, + "learning_rate": 7.08e-06, + "loss": 0.7792, + "step": 708 + }, + { + "epoch": 0.0355, + "grad_norm": 12.854735374450684, + "learning_rate": 7.100000000000001e-06, + "loss": 0.9664, + "step": 710 + }, + { + "epoch": 0.0356, + "grad_norm": 3.800884962081909, + "learning_rate": 7.1200000000000004e-06, + "loss": 1.034, + "step": 712 + }, + { + "epoch": 0.0357, + "grad_norm": 9.83360767364502, + "learning_rate": 7.14e-06, + "loss": 0.809, + "step": 714 + }, + { + "epoch": 0.0358, + "grad_norm": 9.197179794311523, + "learning_rate": 7.16e-06, + "loss": 1.5367, + "step": 716 + }, + { + "epoch": 0.0359, + "grad_norm": 6.2496724128723145, + "learning_rate": 7.180000000000001e-06, + "loss": 1.3413, + "step": 718 + }, + { + "epoch": 0.036, + "grad_norm": 8.781137466430664, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.7266, + "step": 720 + }, + { + "epoch": 0.0361, + "grad_norm": 8.04161548614502, + "learning_rate": 7.22e-06, + "loss": 1.0251, + "step": 722 + }, + { + "epoch": 0.0362, + "grad_norm": 12.544044494628906, + "learning_rate": 7.24e-06, + "loss": 1.2083, + "step": 724 + }, + { + "epoch": 0.0363, + "grad_norm": 9.307494163513184, + "learning_rate": 7.260000000000001e-06, + "loss": 0.7683, + "step": 726 + }, + { + "epoch": 0.0364, + "grad_norm": 11.053923606872559, + "learning_rate": 7.280000000000001e-06, + "loss": 1.1767, + "step": 728 + }, + { + "epoch": 0.0365, + "grad_norm": 10.243159294128418, + "learning_rate": 7.3e-06, + "loss": 1.4787, + "step": 730 + }, + { + "epoch": 0.0366, + "grad_norm": 13.418939590454102, + "learning_rate": 7.32e-06, + "loss": 1.3036, + "step": 732 + }, + { + "epoch": 0.0367, + "grad_norm": 6.749165058135986, + "learning_rate": 7.340000000000001e-06, + "loss": 1.0913, + "step": 734 + }, + { + "epoch": 0.0368, + "grad_norm": 6.156402587890625, + "learning_rate": 7.360000000000001e-06, + "loss": 1.1262, + "step": 736 + }, + { + "epoch": 0.0369, + "grad_norm": 9.809892654418945, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.5157, + "step": 738 + }, + { + "epoch": 0.037, + "grad_norm": 10.635805130004883, + "learning_rate": 7.4e-06, + "loss": 1.1925, + "step": 740 + }, + { + "epoch": 0.0371, + "grad_norm": 9.014545440673828, + "learning_rate": 7.420000000000001e-06, + "loss": 2.0821, + "step": 742 + }, + { + "epoch": 0.0372, + "grad_norm": 4.825770378112793, + "learning_rate": 7.440000000000001e-06, + "loss": 0.335, + "step": 744 + }, + { + "epoch": 0.0373, + "grad_norm": 4.863062381744385, + "learning_rate": 7.4600000000000006e-06, + "loss": 0.3296, + "step": 746 + }, + { + "epoch": 0.0374, + "grad_norm": 26.39769172668457, + "learning_rate": 7.48e-06, + "loss": 0.6662, + "step": 748 + }, + { + "epoch": 0.0375, + "grad_norm": 8.724063873291016, + "learning_rate": 7.500000000000001e-06, + "loss": 0.8746, + "step": 750 + }, + { + "epoch": 0.0376, + "grad_norm": 10.780169486999512, + "learning_rate": 7.520000000000001e-06, + "loss": 1.4152, + "step": 752 + }, + { + "epoch": 0.0377, + "grad_norm": 6.278567314147949, + "learning_rate": 7.540000000000001e-06, + "loss": 0.8591, + "step": 754 + }, + { + "epoch": 0.0378, + "grad_norm": 10.548782348632812, + "learning_rate": 7.5600000000000005e-06, + "loss": 1.6395, + "step": 756 + }, + { + "epoch": 0.0379, + "grad_norm": 6.304843902587891, + "learning_rate": 7.58e-06, + "loss": 0.7942, + "step": 758 + }, + { + "epoch": 0.038, + "grad_norm": 11.753259658813477, + "learning_rate": 7.600000000000001e-06, + "loss": 0.9437, + "step": 760 + }, + { + "epoch": 0.0381, + "grad_norm": 6.880615234375, + "learning_rate": 7.620000000000001e-06, + "loss": 1.3416, + "step": 762 + }, + { + "epoch": 0.0382, + "grad_norm": 7.959550857543945, + "learning_rate": 7.640000000000001e-06, + "loss": 0.4749, + "step": 764 + }, + { + "epoch": 0.0383, + "grad_norm": 9.332571029663086, + "learning_rate": 7.660000000000001e-06, + "loss": 0.775, + "step": 766 + }, + { + "epoch": 0.0384, + "grad_norm": 7.062748908996582, + "learning_rate": 7.680000000000001e-06, + "loss": 0.8876, + "step": 768 + }, + { + "epoch": 0.0385, + "grad_norm": 28.259037017822266, + "learning_rate": 7.7e-06, + "loss": 1.4088, + "step": 770 + }, + { + "epoch": 0.0386, + "grad_norm": 39.42667770385742, + "learning_rate": 7.72e-06, + "loss": 3.0897, + "step": 772 + }, + { + "epoch": 0.0387, + "grad_norm": 4.962769031524658, + "learning_rate": 7.74e-06, + "loss": 0.965, + "step": 774 + }, + { + "epoch": 0.0388, + "grad_norm": 9.543427467346191, + "learning_rate": 7.76e-06, + "loss": 0.8086, + "step": 776 + }, + { + "epoch": 0.0389, + "grad_norm": 11.88182258605957, + "learning_rate": 7.78e-06, + "loss": 2.5149, + "step": 778 + }, + { + "epoch": 0.039, + "grad_norm": 6.334836006164551, + "learning_rate": 7.800000000000002e-06, + "loss": 1.088, + "step": 780 + }, + { + "epoch": 0.0391, + "grad_norm": 6.286275386810303, + "learning_rate": 7.820000000000001e-06, + "loss": 1.1908, + "step": 782 + }, + { + "epoch": 0.0392, + "grad_norm": 16.75080680847168, + "learning_rate": 7.840000000000001e-06, + "loss": 1.4201, + "step": 784 + }, + { + "epoch": 0.0393, + "grad_norm": 6.567637920379639, + "learning_rate": 7.860000000000001e-06, + "loss": 1.9708, + "step": 786 + }, + { + "epoch": 0.0394, + "grad_norm": 11.87468433380127, + "learning_rate": 7.88e-06, + "loss": 1.4971, + "step": 788 + }, + { + "epoch": 0.0395, + "grad_norm": 6.129155158996582, + "learning_rate": 7.9e-06, + "loss": 1.2361, + "step": 790 + }, + { + "epoch": 0.0396, + "grad_norm": 12.25063419342041, + "learning_rate": 7.92e-06, + "loss": 0.9188, + "step": 792 + }, + { + "epoch": 0.0397, + "grad_norm": 6.518564224243164, + "learning_rate": 7.94e-06, + "loss": 1.5852, + "step": 794 + }, + { + "epoch": 0.0398, + "grad_norm": 5.806295871734619, + "learning_rate": 7.960000000000002e-06, + "loss": 0.2891, + "step": 796 + }, + { + "epoch": 0.0399, + "grad_norm": 6.761224746704102, + "learning_rate": 7.980000000000002e-06, + "loss": 1.003, + "step": 798 + }, + { + "epoch": 0.04, + "grad_norm": 6.914753437042236, + "learning_rate": 8.000000000000001e-06, + "loss": 1.3228, + "step": 800 + }, + { + "epoch": 0.0401, + "grad_norm": 6.201905250549316, + "learning_rate": 8.020000000000001e-06, + "loss": 1.0898, + "step": 802 + }, + { + "epoch": 0.0402, + "grad_norm": 6.105688571929932, + "learning_rate": 8.040000000000001e-06, + "loss": 1.9678, + "step": 804 + }, + { + "epoch": 0.0403, + "grad_norm": 10.549553871154785, + "learning_rate": 8.06e-06, + "loss": 1.6127, + "step": 806 + }, + { + "epoch": 0.0404, + "grad_norm": 8.363807678222656, + "learning_rate": 8.08e-06, + "loss": 0.4236, + "step": 808 + }, + { + "epoch": 0.0405, + "grad_norm": 11.548357963562012, + "learning_rate": 8.1e-06, + "loss": 1.4529, + "step": 810 + }, + { + "epoch": 0.0406, + "grad_norm": 7.282200336456299, + "learning_rate": 8.120000000000002e-06, + "loss": 1.3092, + "step": 812 + }, + { + "epoch": 0.0407, + "grad_norm": 6.200380802154541, + "learning_rate": 8.14e-06, + "loss": 1.4706, + "step": 814 + }, + { + "epoch": 0.0408, + "grad_norm": 4.164087772369385, + "learning_rate": 8.16e-06, + "loss": 1.5453, + "step": 816 + }, + { + "epoch": 0.0409, + "grad_norm": 5.4005045890808105, + "learning_rate": 8.18e-06, + "loss": 1.2143, + "step": 818 + }, + { + "epoch": 0.041, + "grad_norm": 8.297542572021484, + "learning_rate": 8.2e-06, + "loss": 1.0398, + "step": 820 + }, + { + "epoch": 0.0411, + "grad_norm": 7.819747447967529, + "learning_rate": 8.220000000000001e-06, + "loss": 0.6427, + "step": 822 + }, + { + "epoch": 0.0412, + "grad_norm": 6.177331924438477, + "learning_rate": 8.24e-06, + "loss": 0.823, + "step": 824 + }, + { + "epoch": 0.0413, + "grad_norm": 7.742039203643799, + "learning_rate": 8.26e-06, + "loss": 1.4932, + "step": 826 + }, + { + "epoch": 0.0414, + "grad_norm": 5.163416385650635, + "learning_rate": 8.28e-06, + "loss": 1.7458, + "step": 828 + }, + { + "epoch": 0.0415, + "grad_norm": 7.835866928100586, + "learning_rate": 8.3e-06, + "loss": 0.8655, + "step": 830 + }, + { + "epoch": 0.0416, + "grad_norm": 7.003748416900635, + "learning_rate": 8.32e-06, + "loss": 2.1344, + "step": 832 + }, + { + "epoch": 0.0417, + "grad_norm": 2.7499520778656006, + "learning_rate": 8.34e-06, + "loss": 0.9642, + "step": 834 + }, + { + "epoch": 0.0418, + "grad_norm": 6.913974285125732, + "learning_rate": 8.36e-06, + "loss": 1.32, + "step": 836 + }, + { + "epoch": 0.0419, + "grad_norm": 5.829736709594727, + "learning_rate": 8.380000000000001e-06, + "loss": 1.1847, + "step": 838 + }, + { + "epoch": 0.042, + "grad_norm": 9.069207191467285, + "learning_rate": 8.400000000000001e-06, + "loss": 0.926, + "step": 840 + }, + { + "epoch": 0.0421, + "grad_norm": 12.516983985900879, + "learning_rate": 8.42e-06, + "loss": 1.1337, + "step": 842 + }, + { + "epoch": 0.0422, + "grad_norm": 8.753403663635254, + "learning_rate": 8.44e-06, + "loss": 0.8875, + "step": 844 + }, + { + "epoch": 0.0423, + "grad_norm": 8.338911056518555, + "learning_rate": 8.46e-06, + "loss": 1.5464, + "step": 846 + }, + { + "epoch": 0.0424, + "grad_norm": 8.752188682556152, + "learning_rate": 8.48e-06, + "loss": 1.4982, + "step": 848 + }, + { + "epoch": 0.0425, + "grad_norm": 8.076589584350586, + "learning_rate": 8.5e-06, + "loss": 1.2613, + "step": 850 + }, + { + "epoch": 0.0426, + "grad_norm": 7.790770053863525, + "learning_rate": 8.52e-06, + "loss": 0.7892, + "step": 852 + }, + { + "epoch": 0.0427, + "grad_norm": 6.647616863250732, + "learning_rate": 8.540000000000001e-06, + "loss": 1.1131, + "step": 854 + }, + { + "epoch": 0.0428, + "grad_norm": 16.549556732177734, + "learning_rate": 8.560000000000001e-06, + "loss": 2.0449, + "step": 856 + }, + { + "epoch": 0.0429, + "grad_norm": 4.804563045501709, + "learning_rate": 8.580000000000001e-06, + "loss": 0.9306, + "step": 858 + }, + { + "epoch": 0.043, + "grad_norm": 11.127938270568848, + "learning_rate": 8.6e-06, + "loss": 0.9364, + "step": 860 + }, + { + "epoch": 0.0431, + "grad_norm": 10.764187812805176, + "learning_rate": 8.62e-06, + "loss": 1.3672, + "step": 862 + }, + { + "epoch": 0.0432, + "grad_norm": 7.000422954559326, + "learning_rate": 8.64e-06, + "loss": 1.5377, + "step": 864 + }, + { + "epoch": 0.0433, + "grad_norm": 9.697588920593262, + "learning_rate": 8.66e-06, + "loss": 0.4857, + "step": 866 + }, + { + "epoch": 0.0434, + "grad_norm": 3.863271951675415, + "learning_rate": 8.68e-06, + "loss": 1.2653, + "step": 868 + }, + { + "epoch": 0.0435, + "grad_norm": 6.525504112243652, + "learning_rate": 8.700000000000001e-06, + "loss": 0.7381, + "step": 870 + }, + { + "epoch": 0.0436, + "grad_norm": 4.088057041168213, + "learning_rate": 8.720000000000001e-06, + "loss": 1.229, + "step": 872 + }, + { + "epoch": 0.0437, + "grad_norm": 8.730039596557617, + "learning_rate": 8.740000000000001e-06, + "loss": 1.3606, + "step": 874 + }, + { + "epoch": 0.0438, + "grad_norm": 10.253605842590332, + "learning_rate": 8.76e-06, + "loss": 1.4437, + "step": 876 + }, + { + "epoch": 0.0439, + "grad_norm": 9.139019012451172, + "learning_rate": 8.78e-06, + "loss": 1.6377, + "step": 878 + }, + { + "epoch": 0.044, + "grad_norm": 7.622931957244873, + "learning_rate": 8.8e-06, + "loss": 1.2581, + "step": 880 + }, + { + "epoch": 0.0441, + "grad_norm": 9.1115140914917, + "learning_rate": 8.82e-06, + "loss": 1.2643, + "step": 882 + }, + { + "epoch": 0.0442, + "grad_norm": 13.870454788208008, + "learning_rate": 8.84e-06, + "loss": 1.2702, + "step": 884 + }, + { + "epoch": 0.0443, + "grad_norm": 7.685462474822998, + "learning_rate": 8.860000000000002e-06, + "loss": 0.8145, + "step": 886 + }, + { + "epoch": 0.0444, + "grad_norm": 7.611894130706787, + "learning_rate": 8.880000000000001e-06, + "loss": 1.885, + "step": 888 + }, + { + "epoch": 0.0445, + "grad_norm": 10.213020324707031, + "learning_rate": 8.900000000000001e-06, + "loss": 1.183, + "step": 890 + }, + { + "epoch": 0.0446, + "grad_norm": 11.117721557617188, + "learning_rate": 8.920000000000001e-06, + "loss": 0.949, + "step": 892 + }, + { + "epoch": 0.0447, + "grad_norm": 8.329266548156738, + "learning_rate": 8.94e-06, + "loss": 1.1052, + "step": 894 + }, + { + "epoch": 0.0448, + "grad_norm": 17.15361785888672, + "learning_rate": 8.96e-06, + "loss": 1.4662, + "step": 896 + }, + { + "epoch": 0.0449, + "grad_norm": 10.485551834106445, + "learning_rate": 8.98e-06, + "loss": 1.0774, + "step": 898 + }, + { + "epoch": 0.045, + "grad_norm": 13.250389099121094, + "learning_rate": 9e-06, + "loss": 0.9136, + "step": 900 + }, + { + "epoch": 0.0451, + "grad_norm": 4.921250820159912, + "learning_rate": 9.020000000000002e-06, + "loss": 0.6136, + "step": 902 + }, + { + "epoch": 0.0452, + "grad_norm": 27.90711784362793, + "learning_rate": 9.040000000000002e-06, + "loss": 1.6426, + "step": 904 + }, + { + "epoch": 0.0453, + "grad_norm": 9.57217025756836, + "learning_rate": 9.060000000000001e-06, + "loss": 0.8417, + "step": 906 + }, + { + "epoch": 0.0454, + "grad_norm": 14.797211647033691, + "learning_rate": 9.080000000000001e-06, + "loss": 1.2451, + "step": 908 + }, + { + "epoch": 0.0455, + "grad_norm": 6.9856743812561035, + "learning_rate": 9.100000000000001e-06, + "loss": 0.7314, + "step": 910 + }, + { + "epoch": 0.0456, + "grad_norm": 11.038307189941406, + "learning_rate": 9.12e-06, + "loss": 1.0146, + "step": 912 + }, + { + "epoch": 0.0457, + "grad_norm": 5.8561110496521, + "learning_rate": 9.14e-06, + "loss": 1.0373, + "step": 914 + }, + { + "epoch": 0.0458, + "grad_norm": 18.916406631469727, + "learning_rate": 9.16e-06, + "loss": 1.2587, + "step": 916 + }, + { + "epoch": 0.0459, + "grad_norm": 13.945976257324219, + "learning_rate": 9.180000000000002e-06, + "loss": 1.6754, + "step": 918 + }, + { + "epoch": 0.046, + "grad_norm": 10.516154289245605, + "learning_rate": 9.200000000000002e-06, + "loss": 1.9376, + "step": 920 + }, + { + "epoch": 0.0461, + "grad_norm": 5.595226764678955, + "learning_rate": 9.220000000000002e-06, + "loss": 1.0282, + "step": 922 + }, + { + "epoch": 0.0462, + "grad_norm": 15.996903419494629, + "learning_rate": 9.240000000000001e-06, + "loss": 1.2788, + "step": 924 + }, + { + "epoch": 0.0463, + "grad_norm": 6.379086017608643, + "learning_rate": 9.260000000000001e-06, + "loss": 1.6335, + "step": 926 + }, + { + "epoch": 0.0464, + "grad_norm": 7.7937822341918945, + "learning_rate": 9.280000000000001e-06, + "loss": 1.2094, + "step": 928 + }, + { + "epoch": 0.0465, + "grad_norm": 4.825281143188477, + "learning_rate": 9.3e-06, + "loss": 0.8896, + "step": 930 + }, + { + "epoch": 0.0466, + "grad_norm": 7.53037166595459, + "learning_rate": 9.32e-06, + "loss": 1.3233, + "step": 932 + }, + { + "epoch": 0.0467, + "grad_norm": 12.8944730758667, + "learning_rate": 9.340000000000002e-06, + "loss": 1.53, + "step": 934 + }, + { + "epoch": 0.0468, + "grad_norm": 7.816258430480957, + "learning_rate": 9.360000000000002e-06, + "loss": 0.9672, + "step": 936 + }, + { + "epoch": 0.0469, + "grad_norm": 8.701108932495117, + "learning_rate": 9.38e-06, + "loss": 0.7859, + "step": 938 + }, + { + "epoch": 0.047, + "grad_norm": 7.416184425354004, + "learning_rate": 9.4e-06, + "loss": 1.4089, + "step": 940 + }, + { + "epoch": 0.0471, + "grad_norm": 11.333868026733398, + "learning_rate": 9.42e-06, + "loss": 0.4144, + "step": 942 + }, + { + "epoch": 0.0472, + "grad_norm": 13.085240364074707, + "learning_rate": 9.440000000000001e-06, + "loss": 1.4816, + "step": 944 + }, + { + "epoch": 0.0473, + "grad_norm": 8.114256858825684, + "learning_rate": 9.460000000000001e-06, + "loss": 1.4235, + "step": 946 + }, + { + "epoch": 0.0474, + "grad_norm": 15.602758407592773, + "learning_rate": 9.48e-06, + "loss": 0.8787, + "step": 948 + }, + { + "epoch": 0.0475, + "grad_norm": 3.7843024730682373, + "learning_rate": 9.5e-06, + "loss": 1.0338, + "step": 950 + }, + { + "epoch": 0.0476, + "grad_norm": 5.126802444458008, + "learning_rate": 9.52e-06, + "loss": 1.2334, + "step": 952 + }, + { + "epoch": 0.0477, + "grad_norm": 4.332019805908203, + "learning_rate": 9.54e-06, + "loss": 0.5773, + "step": 954 + }, + { + "epoch": 0.0478, + "grad_norm": 6.9492034912109375, + "learning_rate": 9.56e-06, + "loss": 1.087, + "step": 956 + }, + { + "epoch": 0.0479, + "grad_norm": 8.5418062210083, + "learning_rate": 9.58e-06, + "loss": 0.7343, + "step": 958 + }, + { + "epoch": 0.048, + "grad_norm": 13.486412048339844, + "learning_rate": 9.600000000000001e-06, + "loss": 1.1379, + "step": 960 + }, + { + "epoch": 0.0481, + "grad_norm": 5.084202289581299, + "learning_rate": 9.620000000000001e-06, + "loss": 1.6022, + "step": 962 + }, + { + "epoch": 0.0482, + "grad_norm": 4.291140556335449, + "learning_rate": 9.640000000000001e-06, + "loss": 1.1682, + "step": 964 + }, + { + "epoch": 0.0483, + "grad_norm": 5.342339992523193, + "learning_rate": 9.66e-06, + "loss": 0.6561, + "step": 966 + }, + { + "epoch": 0.0484, + "grad_norm": 6.108663082122803, + "learning_rate": 9.68e-06, + "loss": 1.3936, + "step": 968 + }, + { + "epoch": 0.0485, + "grad_norm": 5.865069389343262, + "learning_rate": 9.7e-06, + "loss": 0.9465, + "step": 970 + }, + { + "epoch": 0.0486, + "grad_norm": 11.0661039352417, + "learning_rate": 9.72e-06, + "loss": 0.9014, + "step": 972 + }, + { + "epoch": 0.0487, + "grad_norm": 13.568013191223145, + "learning_rate": 9.74e-06, + "loss": 1.8285, + "step": 974 + }, + { + "epoch": 0.0488, + "grad_norm": 10.661681175231934, + "learning_rate": 9.760000000000001e-06, + "loss": 0.9238, + "step": 976 + }, + { + "epoch": 0.0489, + "grad_norm": 5.342900276184082, + "learning_rate": 9.780000000000001e-06, + "loss": 1.5265, + "step": 978 + }, + { + "epoch": 0.049, + "grad_norm": 8.07319164276123, + "learning_rate": 9.800000000000001e-06, + "loss": 1.2506, + "step": 980 + }, + { + "epoch": 0.0491, + "grad_norm": 3.79182505607605, + "learning_rate": 9.820000000000001e-06, + "loss": 0.7611, + "step": 982 + }, + { + "epoch": 0.0492, + "grad_norm": 8.064897537231445, + "learning_rate": 9.84e-06, + "loss": 1.1223, + "step": 984 + }, + { + "epoch": 0.0493, + "grad_norm": 10.692683219909668, + "learning_rate": 9.86e-06, + "loss": 0.5429, + "step": 986 + }, + { + "epoch": 0.0494, + "grad_norm": 4.974823474884033, + "learning_rate": 9.88e-06, + "loss": 0.7468, + "step": 988 + }, + { + "epoch": 0.0495, + "grad_norm": 6.628968238830566, + "learning_rate": 9.9e-06, + "loss": 0.2551, + "step": 990 + }, + { + "epoch": 0.0496, + "grad_norm": 5.6833062171936035, + "learning_rate": 9.920000000000002e-06, + "loss": 0.7919, + "step": 992 + }, + { + "epoch": 0.0497, + "grad_norm": 6.701145648956299, + "learning_rate": 9.940000000000001e-06, + "loss": 0.7133, + "step": 994 + }, + { + "epoch": 0.0498, + "grad_norm": 20.590471267700195, + "learning_rate": 9.960000000000001e-06, + "loss": 1.4489, + "step": 996 + }, + { + "epoch": 0.0499, + "grad_norm": 5.7228264808654785, + "learning_rate": 9.980000000000001e-06, + "loss": 0.4453, + "step": 998 + }, + { + "epoch": 0.05, + "grad_norm": 8.980693817138672, + "learning_rate": 1e-05, + "loss": 1.3103, + "step": 1000 + }, + { + "epoch": 0.0501, + "grad_norm": 3.4178478717803955, + "learning_rate": 1.002e-05, + "loss": 0.7475, + "step": 1002 + }, + { + "epoch": 0.0502, + "grad_norm": 9.537713050842285, + "learning_rate": 1.004e-05, + "loss": 0.9032, + "step": 1004 + }, + { + "epoch": 0.0503, + "grad_norm": 8.356504440307617, + "learning_rate": 1.006e-05, + "loss": 1.214, + "step": 1006 + }, + { + "epoch": 0.0504, + "grad_norm": 8.353915214538574, + "learning_rate": 1.008e-05, + "loss": 0.6996, + "step": 1008 + }, + { + "epoch": 0.0505, + "grad_norm": 4.530254364013672, + "learning_rate": 1.0100000000000002e-05, + "loss": 1.354, + "step": 1010 + }, + { + "epoch": 0.0506, + "grad_norm": 13.355647087097168, + "learning_rate": 1.0120000000000001e-05, + "loss": 0.7193, + "step": 1012 + }, + { + "epoch": 0.0507, + "grad_norm": 8.11701488494873, + "learning_rate": 1.0140000000000001e-05, + "loss": 1.4941, + "step": 1014 + }, + { + "epoch": 0.0508, + "grad_norm": 3.741929292678833, + "learning_rate": 1.0160000000000001e-05, + "loss": 1.3037, + "step": 1016 + }, + { + "epoch": 0.0509, + "grad_norm": 6.1779327392578125, + "learning_rate": 1.018e-05, + "loss": 1.3255, + "step": 1018 + }, + { + "epoch": 0.051, + "grad_norm": 10.759147644042969, + "learning_rate": 1.02e-05, + "loss": 0.7806, + "step": 1020 + }, + { + "epoch": 0.0511, + "grad_norm": 6.174217224121094, + "learning_rate": 1.022e-05, + "loss": 0.9291, + "step": 1022 + }, + { + "epoch": 0.0512, + "grad_norm": 12.984908103942871, + "learning_rate": 1.024e-05, + "loss": 0.6831, + "step": 1024 + }, + { + "epoch": 0.0513, + "grad_norm": 6.997854232788086, + "learning_rate": 1.0260000000000002e-05, + "loss": 0.5873, + "step": 1026 + }, + { + "epoch": 0.0514, + "grad_norm": 12.01712703704834, + "learning_rate": 1.0280000000000002e-05, + "loss": 0.7543, + "step": 1028 + }, + { + "epoch": 0.0515, + "grad_norm": 5.849665641784668, + "learning_rate": 1.0300000000000001e-05, + "loss": 1.3762, + "step": 1030 + }, + { + "epoch": 0.0516, + "grad_norm": 9.432172775268555, + "learning_rate": 1.0320000000000001e-05, + "loss": 0.6094, + "step": 1032 + }, + { + "epoch": 0.0517, + "grad_norm": 8.276531219482422, + "learning_rate": 1.0340000000000001e-05, + "loss": 1.1422, + "step": 1034 + }, + { + "epoch": 0.0518, + "grad_norm": 8.886845588684082, + "learning_rate": 1.036e-05, + "loss": 1.5311, + "step": 1036 + }, + { + "epoch": 0.0519, + "grad_norm": 15.401714324951172, + "learning_rate": 1.038e-05, + "loss": 1.7084, + "step": 1038 + }, + { + "epoch": 0.052, + "grad_norm": 12.219975471496582, + "learning_rate": 1.04e-05, + "loss": 1.6178, + "step": 1040 + }, + { + "epoch": 0.0521, + "grad_norm": 9.366437911987305, + "learning_rate": 1.0420000000000002e-05, + "loss": 1.3961, + "step": 1042 + }, + { + "epoch": 0.0522, + "grad_norm": 5.551557540893555, + "learning_rate": 1.0440000000000002e-05, + "loss": 0.6485, + "step": 1044 + }, + { + "epoch": 0.0523, + "grad_norm": 16.46091651916504, + "learning_rate": 1.0460000000000001e-05, + "loss": 1.8214, + "step": 1046 + }, + { + "epoch": 0.0524, + "grad_norm": 3.0242676734924316, + "learning_rate": 1.0480000000000001e-05, + "loss": 0.9058, + "step": 1048 + }, + { + "epoch": 0.0525, + "grad_norm": 5.547236919403076, + "learning_rate": 1.0500000000000001e-05, + "loss": 1.0905, + "step": 1050 + }, + { + "epoch": 0.0526, + "grad_norm": 6.098644733428955, + "learning_rate": 1.0520000000000001e-05, + "loss": 0.928, + "step": 1052 + }, + { + "epoch": 0.0527, + "grad_norm": 13.627326965332031, + "learning_rate": 1.054e-05, + "loss": 0.3545, + "step": 1054 + }, + { + "epoch": 0.0528, + "grad_norm": 6.932798385620117, + "learning_rate": 1.056e-05, + "loss": 1.5865, + "step": 1056 + }, + { + "epoch": 0.0529, + "grad_norm": 5.842358112335205, + "learning_rate": 1.0580000000000002e-05, + "loss": 1.3088, + "step": 1058 + }, + { + "epoch": 0.053, + "grad_norm": 8.609539031982422, + "learning_rate": 1.0600000000000002e-05, + "loss": 1.4429, + "step": 1060 + }, + { + "epoch": 0.0531, + "grad_norm": 3.4639532566070557, + "learning_rate": 1.0620000000000002e-05, + "loss": 0.629, + "step": 1062 + }, + { + "epoch": 0.0532, + "grad_norm": 6.137248516082764, + "learning_rate": 1.0640000000000001e-05, + "loss": 0.9989, + "step": 1064 + }, + { + "epoch": 0.0533, + "grad_norm": 6.5123162269592285, + "learning_rate": 1.0660000000000001e-05, + "loss": 2.4505, + "step": 1066 + }, + { + "epoch": 0.0534, + "grad_norm": 5.573459625244141, + "learning_rate": 1.0680000000000001e-05, + "loss": 1.4627, + "step": 1068 + }, + { + "epoch": 0.0535, + "grad_norm": 10.339714050292969, + "learning_rate": 1.0700000000000001e-05, + "loss": 1.3603, + "step": 1070 + }, + { + "epoch": 0.0536, + "grad_norm": 4.260346412658691, + "learning_rate": 1.072e-05, + "loss": 1.519, + "step": 1072 + }, + { + "epoch": 0.0537, + "grad_norm": 11.616281509399414, + "learning_rate": 1.0740000000000002e-05, + "loss": 1.7094, + "step": 1074 + }, + { + "epoch": 0.0538, + "grad_norm": 5.782266616821289, + "learning_rate": 1.0760000000000002e-05, + "loss": 1.2967, + "step": 1076 + }, + { + "epoch": 0.0539, + "grad_norm": 12.521726608276367, + "learning_rate": 1.0780000000000002e-05, + "loss": 1.2565, + "step": 1078 + }, + { + "epoch": 0.054, + "grad_norm": 12.10187816619873, + "learning_rate": 1.0800000000000002e-05, + "loss": 1.7982, + "step": 1080 + }, + { + "epoch": 0.0541, + "grad_norm": 5.242071628570557, + "learning_rate": 1.0820000000000001e-05, + "loss": 1.2021, + "step": 1082 + }, + { + "epoch": 0.0542, + "grad_norm": 4.7590155601501465, + "learning_rate": 1.0840000000000001e-05, + "loss": 0.9488, + "step": 1084 + }, + { + "epoch": 0.0543, + "grad_norm": 5.933817386627197, + "learning_rate": 1.0860000000000001e-05, + "loss": 1.5255, + "step": 1086 + }, + { + "epoch": 0.0544, + "grad_norm": 5.641519069671631, + "learning_rate": 1.0880000000000001e-05, + "loss": 0.6402, + "step": 1088 + }, + { + "epoch": 0.0545, + "grad_norm": 3.456378221511841, + "learning_rate": 1.0900000000000002e-05, + "loss": 1.3599, + "step": 1090 + }, + { + "epoch": 0.0546, + "grad_norm": 19.68791961669922, + "learning_rate": 1.0920000000000002e-05, + "loss": 1.621, + "step": 1092 + }, + { + "epoch": 0.0547, + "grad_norm": 4.37565279006958, + "learning_rate": 1.0940000000000002e-05, + "loss": 0.9476, + "step": 1094 + }, + { + "epoch": 0.0548, + "grad_norm": 4.662720203399658, + "learning_rate": 1.0960000000000002e-05, + "loss": 1.2734, + "step": 1096 + }, + { + "epoch": 0.0549, + "grad_norm": 6.0873847007751465, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.51, + "step": 1098 + }, + { + "epoch": 0.055, + "grad_norm": 9.660544395446777, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.7626, + "step": 1100 + }, + { + "epoch": 0.0551, + "grad_norm": 3.7264022827148438, + "learning_rate": 1.1020000000000001e-05, + "loss": 1.2387, + "step": 1102 + }, + { + "epoch": 0.0552, + "grad_norm": 10.491249084472656, + "learning_rate": 1.1040000000000001e-05, + "loss": 0.5162, + "step": 1104 + }, + { + "epoch": 0.0553, + "grad_norm": 3.2731173038482666, + "learning_rate": 1.1060000000000003e-05, + "loss": 0.4542, + "step": 1106 + }, + { + "epoch": 0.0554, + "grad_norm": 3.0507400035858154, + "learning_rate": 1.1080000000000002e-05, + "loss": 1.1735, + "step": 1108 + }, + { + "epoch": 0.0555, + "grad_norm": 3.8592309951782227, + "learning_rate": 1.1100000000000002e-05, + "loss": 0.9266, + "step": 1110 + }, + { + "epoch": 0.0556, + "grad_norm": 7.010512351989746, + "learning_rate": 1.1120000000000002e-05, + "loss": 0.6259, + "step": 1112 + }, + { + "epoch": 0.0557, + "grad_norm": 8.180741310119629, + "learning_rate": 1.1140000000000002e-05, + "loss": 0.8928, + "step": 1114 + }, + { + "epoch": 0.0558, + "grad_norm": 6.110075950622559, + "learning_rate": 1.1160000000000002e-05, + "loss": 0.7037, + "step": 1116 + }, + { + "epoch": 0.0559, + "grad_norm": 5.6945881843566895, + "learning_rate": 1.1180000000000001e-05, + "loss": 1.1807, + "step": 1118 + }, + { + "epoch": 0.056, + "grad_norm": 5.580513954162598, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.1925, + "step": 1120 + }, + { + "epoch": 0.0561, + "grad_norm": 17.22722816467285, + "learning_rate": 1.1220000000000003e-05, + "loss": 1.6194, + "step": 1122 + }, + { + "epoch": 0.0562, + "grad_norm": 19.820575714111328, + "learning_rate": 1.1240000000000002e-05, + "loss": 1.8445, + "step": 1124 + }, + { + "epoch": 0.0563, + "grad_norm": 10.652508735656738, + "learning_rate": 1.126e-05, + "loss": 0.9157, + "step": 1126 + }, + { + "epoch": 0.0564, + "grad_norm": 2.6193478107452393, + "learning_rate": 1.128e-05, + "loss": 0.465, + "step": 1128 + }, + { + "epoch": 0.0565, + "grad_norm": 12.602704048156738, + "learning_rate": 1.13e-05, + "loss": 1.2237, + "step": 1130 + }, + { + "epoch": 0.0566, + "grad_norm": 8.591469764709473, + "learning_rate": 1.132e-05, + "loss": 1.1851, + "step": 1132 + }, + { + "epoch": 0.0567, + "grad_norm": 8.863839149475098, + "learning_rate": 1.134e-05, + "loss": 1.6116, + "step": 1134 + }, + { + "epoch": 0.0568, + "grad_norm": 11.178498268127441, + "learning_rate": 1.136e-05, + "loss": 1.2251, + "step": 1136 + }, + { + "epoch": 0.0569, + "grad_norm": 6.378483772277832, + "learning_rate": 1.138e-05, + "loss": 1.1187, + "step": 1138 + }, + { + "epoch": 0.057, + "grad_norm": 11.614136695861816, + "learning_rate": 1.14e-05, + "loss": 1.0748, + "step": 1140 + }, + { + "epoch": 0.0571, + "grad_norm": 6.678395748138428, + "learning_rate": 1.142e-05, + "loss": 1.0537, + "step": 1142 + }, + { + "epoch": 0.0572, + "grad_norm": 7.787674903869629, + "learning_rate": 1.144e-05, + "loss": 1.543, + "step": 1144 + }, + { + "epoch": 0.0573, + "grad_norm": 13.141681671142578, + "learning_rate": 1.146e-05, + "loss": 1.3416, + "step": 1146 + }, + { + "epoch": 0.0574, + "grad_norm": 9.410745620727539, + "learning_rate": 1.148e-05, + "loss": 0.9764, + "step": 1148 + }, + { + "epoch": 0.0575, + "grad_norm": 3.8438985347747803, + "learning_rate": 1.15e-05, + "loss": 0.2784, + "step": 1150 + }, + { + "epoch": 0.0576, + "grad_norm": 11.42115592956543, + "learning_rate": 1.152e-05, + "loss": 1.2055, + "step": 1152 + }, + { + "epoch": 0.0577, + "grad_norm": 4.806264400482178, + "learning_rate": 1.154e-05, + "loss": 1.1993, + "step": 1154 + }, + { + "epoch": 0.0578, + "grad_norm": 2.8280930519104004, + "learning_rate": 1.156e-05, + "loss": 0.9028, + "step": 1156 + }, + { + "epoch": 0.0579, + "grad_norm": 5.086123943328857, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.7891, + "step": 1158 + }, + { + "epoch": 0.058, + "grad_norm": 9.256562232971191, + "learning_rate": 1.16e-05, + "loss": 0.9831, + "step": 1160 + }, + { + "epoch": 0.0581, + "grad_norm": 9.671289443969727, + "learning_rate": 1.162e-05, + "loss": 1.0247, + "step": 1162 + }, + { + "epoch": 0.0582, + "grad_norm": 4.576196193695068, + "learning_rate": 1.164e-05, + "loss": 1.0482, + "step": 1164 + }, + { + "epoch": 0.0583, + "grad_norm": 13.727309226989746, + "learning_rate": 1.166e-05, + "loss": 2.1033, + "step": 1166 + }, + { + "epoch": 0.0584, + "grad_norm": 6.551762580871582, + "learning_rate": 1.168e-05, + "loss": 1.6655, + "step": 1168 + }, + { + "epoch": 0.0585, + "grad_norm": 16.33987045288086, + "learning_rate": 1.17e-05, + "loss": 0.7898, + "step": 1170 + }, + { + "epoch": 0.0586, + "grad_norm": 10.016860008239746, + "learning_rate": 1.172e-05, + "loss": 1.0575, + "step": 1172 + }, + { + "epoch": 0.0587, + "grad_norm": 8.295694351196289, + "learning_rate": 1.1740000000000001e-05, + "loss": 1.1231, + "step": 1174 + }, + { + "epoch": 0.0588, + "grad_norm": 10.219819068908691, + "learning_rate": 1.1760000000000001e-05, + "loss": 1.2171, + "step": 1176 + }, + { + "epoch": 0.0589, + "grad_norm": 4.793157577514648, + "learning_rate": 1.178e-05, + "loss": 1.0596, + "step": 1178 + }, + { + "epoch": 0.059, + "grad_norm": 3.8004848957061768, + "learning_rate": 1.18e-05, + "loss": 0.736, + "step": 1180 + }, + { + "epoch": 0.0591, + "grad_norm": 5.411908149719238, + "learning_rate": 1.182e-05, + "loss": 0.6857, + "step": 1182 + }, + { + "epoch": 0.0592, + "grad_norm": 8.674311637878418, + "learning_rate": 1.184e-05, + "loss": 1.2233, + "step": 1184 + }, + { + "epoch": 0.0593, + "grad_norm": 11.885868072509766, + "learning_rate": 1.186e-05, + "loss": 1.1798, + "step": 1186 + }, + { + "epoch": 0.0594, + "grad_norm": 5.908788681030273, + "learning_rate": 1.188e-05, + "loss": 1.49, + "step": 1188 + }, + { + "epoch": 0.0595, + "grad_norm": 6.64235782623291, + "learning_rate": 1.1900000000000001e-05, + "loss": 1.0646, + "step": 1190 + }, + { + "epoch": 0.0596, + "grad_norm": 6.447852611541748, + "learning_rate": 1.1920000000000001e-05, + "loss": 1.4249, + "step": 1192 + }, + { + "epoch": 0.0597, + "grad_norm": 4.674562454223633, + "learning_rate": 1.1940000000000001e-05, + "loss": 1.173, + "step": 1194 + }, + { + "epoch": 0.0598, + "grad_norm": 7.454022407531738, + "learning_rate": 1.196e-05, + "loss": 0.8693, + "step": 1196 + }, + { + "epoch": 0.0599, + "grad_norm": 3.7640023231506348, + "learning_rate": 1.198e-05, + "loss": 1.1414, + "step": 1198 + }, + { + "epoch": 0.06, + "grad_norm": 5.955562591552734, + "learning_rate": 1.2e-05, + "loss": 0.5356, + "step": 1200 + }, + { + "epoch": 0.0601, + "grad_norm": 4.8271918296813965, + "learning_rate": 1.202e-05, + "loss": 1.154, + "step": 1202 + }, + { + "epoch": 0.0602, + "grad_norm": 17.07244873046875, + "learning_rate": 1.204e-05, + "loss": 1.3004, + "step": 1204 + }, + { + "epoch": 0.0603, + "grad_norm": 4.381977081298828, + "learning_rate": 1.2060000000000001e-05, + "loss": 1.4573, + "step": 1206 + }, + { + "epoch": 0.0604, + "grad_norm": 11.909292221069336, + "learning_rate": 1.2080000000000001e-05, + "loss": 0.6956, + "step": 1208 + }, + { + "epoch": 0.0605, + "grad_norm": 12.691996574401855, + "learning_rate": 1.2100000000000001e-05, + "loss": 1.5499, + "step": 1210 + }, + { + "epoch": 0.0606, + "grad_norm": 9.230574607849121, + "learning_rate": 1.2120000000000001e-05, + "loss": 1.876, + "step": 1212 + }, + { + "epoch": 0.0607, + "grad_norm": 11.550982475280762, + "learning_rate": 1.214e-05, + "loss": 1.4147, + "step": 1214 + }, + { + "epoch": 0.0608, + "grad_norm": 4.683970928192139, + "learning_rate": 1.216e-05, + "loss": 1.4949, + "step": 1216 + }, + { + "epoch": 0.0609, + "grad_norm": 6.341169357299805, + "learning_rate": 1.218e-05, + "loss": 1.6455, + "step": 1218 + }, + { + "epoch": 0.061, + "grad_norm": 5.272629737854004, + "learning_rate": 1.22e-05, + "loss": 1.738, + "step": 1220 + }, + { + "epoch": 0.0611, + "grad_norm": 1.7779905796051025, + "learning_rate": 1.2220000000000002e-05, + "loss": 0.5907, + "step": 1222 + }, + { + "epoch": 0.0612, + "grad_norm": 15.234939575195312, + "learning_rate": 1.2240000000000001e-05, + "loss": 1.7178, + "step": 1224 + }, + { + "epoch": 0.0613, + "grad_norm": 8.100945472717285, + "learning_rate": 1.2260000000000001e-05, + "loss": 1.236, + "step": 1226 + }, + { + "epoch": 0.0614, + "grad_norm": 4.357637882232666, + "learning_rate": 1.2280000000000001e-05, + "loss": 0.6428, + "step": 1228 + }, + { + "epoch": 0.0615, + "grad_norm": 9.809642791748047, + "learning_rate": 1.23e-05, + "loss": 0.5627, + "step": 1230 + }, + { + "epoch": 0.0616, + "grad_norm": 23.61371421813965, + "learning_rate": 1.232e-05, + "loss": 1.5347, + "step": 1232 + }, + { + "epoch": 0.0617, + "grad_norm": 3.477745532989502, + "learning_rate": 1.234e-05, + "loss": 1.1552, + "step": 1234 + }, + { + "epoch": 0.0618, + "grad_norm": 4.0273237228393555, + "learning_rate": 1.236e-05, + "loss": 1.6266, + "step": 1236 + }, + { + "epoch": 0.0619, + "grad_norm": 5.3053059577941895, + "learning_rate": 1.2380000000000002e-05, + "loss": 1.4976, + "step": 1238 + }, + { + "epoch": 0.062, + "grad_norm": 3.3822555541992188, + "learning_rate": 1.2400000000000002e-05, + "loss": 0.2363, + "step": 1240 + }, + { + "epoch": 0.0621, + "grad_norm": 5.396374225616455, + "learning_rate": 1.2420000000000001e-05, + "loss": 0.9525, + "step": 1242 + }, + { + "epoch": 0.0622, + "grad_norm": 3.3527379035949707, + "learning_rate": 1.2440000000000001e-05, + "loss": 0.7819, + "step": 1244 + }, + { + "epoch": 0.0623, + "grad_norm": 3.6670312881469727, + "learning_rate": 1.2460000000000001e-05, + "loss": 1.4641, + "step": 1246 + }, + { + "epoch": 0.0624, + "grad_norm": 8.00723648071289, + "learning_rate": 1.248e-05, + "loss": 0.9872, + "step": 1248 + }, + { + "epoch": 0.0625, + "grad_norm": 4.4231157302856445, + "learning_rate": 1.25e-05, + "loss": 1.5235, + "step": 1250 + }, + { + "epoch": 0.0626, + "grad_norm": 18.684049606323242, + "learning_rate": 1.252e-05, + "loss": 1.975, + "step": 1252 + }, + { + "epoch": 0.0627, + "grad_norm": 6.06675910949707, + "learning_rate": 1.254e-05, + "loss": 1.0791, + "step": 1254 + }, + { + "epoch": 0.0628, + "grad_norm": 7.9588823318481445, + "learning_rate": 1.2560000000000002e-05, + "loss": 1.2015, + "step": 1256 + }, + { + "epoch": 0.0629, + "grad_norm": 8.36133098602295, + "learning_rate": 1.2580000000000002e-05, + "loss": 1.8375, + "step": 1258 + }, + { + "epoch": 0.063, + "grad_norm": 4.90884256362915, + "learning_rate": 1.2600000000000001e-05, + "loss": 1.4052, + "step": 1260 + }, + { + "epoch": 0.0631, + "grad_norm": 8.785669326782227, + "learning_rate": 1.2620000000000001e-05, + "loss": 1.0115, + "step": 1262 + }, + { + "epoch": 0.0632, + "grad_norm": 4.349638938903809, + "learning_rate": 1.2640000000000001e-05, + "loss": 0.9398, + "step": 1264 + }, + { + "epoch": 0.0633, + "grad_norm": 3.0450875759124756, + "learning_rate": 1.266e-05, + "loss": 1.183, + "step": 1266 + }, + { + "epoch": 0.0634, + "grad_norm": 3.0999841690063477, + "learning_rate": 1.268e-05, + "loss": 1.1564, + "step": 1268 + }, + { + "epoch": 0.0635, + "grad_norm": 4.75046968460083, + "learning_rate": 1.27e-05, + "loss": 1.1906, + "step": 1270 + }, + { + "epoch": 0.0636, + "grad_norm": 3.026309013366699, + "learning_rate": 1.2720000000000002e-05, + "loss": 1.1548, + "step": 1272 + }, + { + "epoch": 0.0637, + "grad_norm": 5.294475555419922, + "learning_rate": 1.2740000000000002e-05, + "loss": 1.313, + "step": 1274 + }, + { + "epoch": 0.0638, + "grad_norm": 7.354281425476074, + "learning_rate": 1.2760000000000001e-05, + "loss": 1.25, + "step": 1276 + }, + { + "epoch": 0.0639, + "grad_norm": 6.807384014129639, + "learning_rate": 1.2780000000000001e-05, + "loss": 1.5111, + "step": 1278 + }, + { + "epoch": 0.064, + "grad_norm": 12.485224723815918, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.7813, + "step": 1280 + }, + { + "epoch": 0.0641, + "grad_norm": 8.715313911437988, + "learning_rate": 1.2820000000000001e-05, + "loss": 0.8621, + "step": 1282 + }, + { + "epoch": 0.0642, + "grad_norm": 7.909346580505371, + "learning_rate": 1.284e-05, + "loss": 1.8056, + "step": 1284 + }, + { + "epoch": 0.0643, + "grad_norm": 14.64246940612793, + "learning_rate": 1.286e-05, + "loss": 1.1723, + "step": 1286 + }, + { + "epoch": 0.0644, + "grad_norm": 5.437039375305176, + "learning_rate": 1.2880000000000002e-05, + "loss": 1.4106, + "step": 1288 + }, + { + "epoch": 0.0645, + "grad_norm": 7.6248955726623535, + "learning_rate": 1.2900000000000002e-05, + "loss": 0.7768, + "step": 1290 + }, + { + "epoch": 0.0646, + "grad_norm": 1.0457950830459595, + "learning_rate": 1.2920000000000002e-05, + "loss": 0.0883, + "step": 1292 + }, + { + "epoch": 0.0647, + "grad_norm": 3.3968610763549805, + "learning_rate": 1.2940000000000001e-05, + "loss": 1.2317, + "step": 1294 + }, + { + "epoch": 0.0648, + "grad_norm": 7.840994358062744, + "learning_rate": 1.2960000000000001e-05, + "loss": 1.1943, + "step": 1296 + }, + { + "epoch": 0.0649, + "grad_norm": 5.603907585144043, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.5373, + "step": 1298 + }, + { + "epoch": 0.065, + "grad_norm": 9.096872329711914, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.2792, + "step": 1300 + }, + { + "epoch": 0.0651, + "grad_norm": 4.078126430511475, + "learning_rate": 1.302e-05, + "loss": 0.7723, + "step": 1302 + }, + { + "epoch": 0.0652, + "grad_norm": 7.531287670135498, + "learning_rate": 1.3040000000000002e-05, + "loss": 0.8235, + "step": 1304 + }, + { + "epoch": 0.0653, + "grad_norm": 2.9431893825531006, + "learning_rate": 1.3060000000000002e-05, + "loss": 0.8092, + "step": 1306 + }, + { + "epoch": 0.0654, + "grad_norm": 4.836350440979004, + "learning_rate": 1.3080000000000002e-05, + "loss": 1.6298, + "step": 1308 + }, + { + "epoch": 0.0655, + "grad_norm": 6.744271755218506, + "learning_rate": 1.3100000000000002e-05, + "loss": 1.1997, + "step": 1310 + }, + { + "epoch": 0.0656, + "grad_norm": 6.661096096038818, + "learning_rate": 1.3120000000000001e-05, + "loss": 1.2012, + "step": 1312 + }, + { + "epoch": 0.0657, + "grad_norm": 4.1906609535217285, + "learning_rate": 1.3140000000000001e-05, + "loss": 1.4475, + "step": 1314 + }, + { + "epoch": 0.0658, + "grad_norm": 0.8988006711006165, + "learning_rate": 1.3160000000000001e-05, + "loss": 0.8146, + "step": 1316 + }, + { + "epoch": 0.0659, + "grad_norm": 2.8178608417510986, + "learning_rate": 1.3180000000000001e-05, + "loss": 1.1275, + "step": 1318 + }, + { + "epoch": 0.066, + "grad_norm": 6.329268932342529, + "learning_rate": 1.3200000000000002e-05, + "loss": 1.1549, + "step": 1320 + }, + { + "epoch": 0.0661, + "grad_norm": 7.180791854858398, + "learning_rate": 1.3220000000000002e-05, + "loss": 1.1828, + "step": 1322 + }, + { + "epoch": 0.0662, + "grad_norm": 5.990562438964844, + "learning_rate": 1.3240000000000002e-05, + "loss": 1.2222, + "step": 1324 + }, + { + "epoch": 0.0663, + "grad_norm": 8.074301719665527, + "learning_rate": 1.3260000000000002e-05, + "loss": 0.9453, + "step": 1326 + }, + { + "epoch": 0.0664, + "grad_norm": 5.382881164550781, + "learning_rate": 1.3280000000000002e-05, + "loss": 0.6847, + "step": 1328 + }, + { + "epoch": 0.0665, + "grad_norm": 5.8735880851745605, + "learning_rate": 1.3300000000000001e-05, + "loss": 1.0956, + "step": 1330 + }, + { + "epoch": 0.0666, + "grad_norm": 5.41558313369751, + "learning_rate": 1.3320000000000001e-05, + "loss": 0.8872, + "step": 1332 + }, + { + "epoch": 0.0667, + "grad_norm": 4.2317023277282715, + "learning_rate": 1.3340000000000001e-05, + "loss": 0.9667, + "step": 1334 + }, + { + "epoch": 0.0668, + "grad_norm": 4.628069877624512, + "learning_rate": 1.3360000000000003e-05, + "loss": 0.9248, + "step": 1336 + }, + { + "epoch": 0.0669, + "grad_norm": 10.53170394897461, + "learning_rate": 1.3380000000000002e-05, + "loss": 1.442, + "step": 1338 + }, + { + "epoch": 0.067, + "grad_norm": 25.317049026489258, + "learning_rate": 1.3400000000000002e-05, + "loss": 1.4364, + "step": 1340 + }, + { + "epoch": 0.0671, + "grad_norm": 9.617036819458008, + "learning_rate": 1.3420000000000002e-05, + "loss": 1.1011, + "step": 1342 + }, + { + "epoch": 0.0672, + "grad_norm": 0.5644320845603943, + "learning_rate": 1.3440000000000002e-05, + "loss": 0.6402, + "step": 1344 + }, + { + "epoch": 0.0673, + "grad_norm": 3.920187473297119, + "learning_rate": 1.3460000000000002e-05, + "loss": 0.6549, + "step": 1346 + }, + { + "epoch": 0.0674, + "grad_norm": 9.409157752990723, + "learning_rate": 1.3480000000000001e-05, + "loss": 0.3173, + "step": 1348 + }, + { + "epoch": 0.0675, + "grad_norm": 10.54531192779541, + "learning_rate": 1.3500000000000001e-05, + "loss": 1.3915, + "step": 1350 + }, + { + "epoch": 0.0676, + "grad_norm": 5.8932576179504395, + "learning_rate": 1.3520000000000003e-05, + "loss": 2.328, + "step": 1352 + }, + { + "epoch": 0.0677, + "grad_norm": 5.742183685302734, + "learning_rate": 1.3540000000000003e-05, + "loss": 2.0529, + "step": 1354 + }, + { + "epoch": 0.0678, + "grad_norm": 4.316269874572754, + "learning_rate": 1.3560000000000002e-05, + "loss": 1.2957, + "step": 1356 + }, + { + "epoch": 0.0679, + "grad_norm": 6.087003231048584, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.4096, + "step": 1358 + }, + { + "epoch": 0.068, + "grad_norm": 25.204313278198242, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.5525, + "step": 1360 + }, + { + "epoch": 0.0681, + "grad_norm": 3.828338146209717, + "learning_rate": 1.3620000000000002e-05, + "loss": 1.9449, + "step": 1362 + }, + { + "epoch": 0.0682, + "grad_norm": 5.355881690979004, + "learning_rate": 1.3640000000000002e-05, + "loss": 0.4766, + "step": 1364 + }, + { + "epoch": 0.0683, + "grad_norm": 4.019197940826416, + "learning_rate": 1.3660000000000001e-05, + "loss": 1.274, + "step": 1366 + }, + { + "epoch": 0.0684, + "grad_norm": 8.426148414611816, + "learning_rate": 1.3680000000000003e-05, + "loss": 1.2983, + "step": 1368 + }, + { + "epoch": 0.0685, + "grad_norm": 9.510321617126465, + "learning_rate": 1.3700000000000003e-05, + "loss": 0.5466, + "step": 1370 + }, + { + "epoch": 0.0686, + "grad_norm": 8.867439270019531, + "learning_rate": 1.3720000000000002e-05, + "loss": 0.7428, + "step": 1372 + }, + { + "epoch": 0.0687, + "grad_norm": 6.810635566711426, + "learning_rate": 1.3740000000000002e-05, + "loss": 1.5325, + "step": 1374 + }, + { + "epoch": 0.0688, + "grad_norm": 5.1031060218811035, + "learning_rate": 1.376e-05, + "loss": 1.7807, + "step": 1376 + }, + { + "epoch": 0.0689, + "grad_norm": 7.67212438583374, + "learning_rate": 1.378e-05, + "loss": 1.0147, + "step": 1378 + }, + { + "epoch": 0.069, + "grad_norm": 7.103573322296143, + "learning_rate": 1.38e-05, + "loss": 1.1867, + "step": 1380 + }, + { + "epoch": 0.0691, + "grad_norm": 3.3524107933044434, + "learning_rate": 1.382e-05, + "loss": 0.7475, + "step": 1382 + }, + { + "epoch": 0.0692, + "grad_norm": 8.985180854797363, + "learning_rate": 1.384e-05, + "loss": 1.0388, + "step": 1384 + }, + { + "epoch": 0.0693, + "grad_norm": 2.2054636478424072, + "learning_rate": 1.386e-05, + "loss": 0.6959, + "step": 1386 + }, + { + "epoch": 0.0694, + "grad_norm": 4.347651958465576, + "learning_rate": 1.3880000000000001e-05, + "loss": 1.1151, + "step": 1388 + }, + { + "epoch": 0.0695, + "grad_norm": 4.904783725738525, + "learning_rate": 1.39e-05, + "loss": 0.7432, + "step": 1390 + }, + { + "epoch": 0.0696, + "grad_norm": 8.925230979919434, + "learning_rate": 1.392e-05, + "loss": 1.2158, + "step": 1392 + }, + { + "epoch": 0.0697, + "grad_norm": 7.18452262878418, + "learning_rate": 1.394e-05, + "loss": 1.4887, + "step": 1394 + }, + { + "epoch": 0.0698, + "grad_norm": 32.89626693725586, + "learning_rate": 1.396e-05, + "loss": 1.9322, + "step": 1396 + }, + { + "epoch": 0.0699, + "grad_norm": 9.3427734375, + "learning_rate": 1.398e-05, + "loss": 1.1414, + "step": 1398 + }, + { + "epoch": 0.07, + "grad_norm": 28.44440460205078, + "learning_rate": 1.4e-05, + "loss": 1.2576, + "step": 1400 + }, + { + "epoch": 0.0701, + "grad_norm": 12.003947257995605, + "learning_rate": 1.402e-05, + "loss": 1.235, + "step": 1402 + }, + { + "epoch": 0.0702, + "grad_norm": 5.110561847686768, + "learning_rate": 1.4040000000000001e-05, + "loss": 1.6539, + "step": 1404 + }, + { + "epoch": 0.0703, + "grad_norm": 9.51003360748291, + "learning_rate": 1.4060000000000001e-05, + "loss": 1.178, + "step": 1406 + }, + { + "epoch": 0.0704, + "grad_norm": 8.452774047851562, + "learning_rate": 1.408e-05, + "loss": 1.342, + "step": 1408 + }, + { + "epoch": 0.0705, + "grad_norm": 7.109194278717041, + "learning_rate": 1.41e-05, + "loss": 1.1435, + "step": 1410 + }, + { + "epoch": 0.0706, + "grad_norm": 12.102173805236816, + "learning_rate": 1.412e-05, + "loss": 0.4738, + "step": 1412 + }, + { + "epoch": 0.0707, + "grad_norm": 16.124679565429688, + "learning_rate": 1.414e-05, + "loss": 1.8477, + "step": 1414 + }, + { + "epoch": 0.0708, + "grad_norm": 9.500402450561523, + "learning_rate": 1.416e-05, + "loss": 1.0423, + "step": 1416 + }, + { + "epoch": 0.0709, + "grad_norm": 6.512474060058594, + "learning_rate": 1.418e-05, + "loss": 1.4371, + "step": 1418 + }, + { + "epoch": 0.071, + "grad_norm": 5.4758710861206055, + "learning_rate": 1.4200000000000001e-05, + "loss": 1.0897, + "step": 1420 + }, + { + "epoch": 0.0711, + "grad_norm": 7.664000511169434, + "learning_rate": 1.4220000000000001e-05, + "loss": 1.1819, + "step": 1422 + }, + { + "epoch": 0.0712, + "grad_norm": 9.81187915802002, + "learning_rate": 1.4240000000000001e-05, + "loss": 1.6963, + "step": 1424 + }, + { + "epoch": 0.0713, + "grad_norm": 3.147498846054077, + "learning_rate": 1.426e-05, + "loss": 0.9685, + "step": 1426 + }, + { + "epoch": 0.0714, + "grad_norm": 2.132870674133301, + "learning_rate": 1.428e-05, + "loss": 0.5931, + "step": 1428 + }, + { + "epoch": 0.0715, + "grad_norm": 15.546916961669922, + "learning_rate": 1.43e-05, + "loss": 0.9393, + "step": 1430 + }, + { + "epoch": 0.0716, + "grad_norm": 27.764610290527344, + "learning_rate": 1.432e-05, + "loss": 1.008, + "step": 1432 + }, + { + "epoch": 0.0717, + "grad_norm": 9.892973899841309, + "learning_rate": 1.434e-05, + "loss": 1.4722, + "step": 1434 + }, + { + "epoch": 0.0718, + "grad_norm": 13.624576568603516, + "learning_rate": 1.4360000000000001e-05, + "loss": 1.0531, + "step": 1436 + }, + { + "epoch": 0.0719, + "grad_norm": 3.8066534996032715, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.7729, + "step": 1438 + }, + { + "epoch": 0.072, + "grad_norm": 6.129350185394287, + "learning_rate": 1.4400000000000001e-05, + "loss": 1.0242, + "step": 1440 + }, + { + "epoch": 0.0721, + "grad_norm": 11.786977767944336, + "learning_rate": 1.4420000000000001e-05, + "loss": 1.229, + "step": 1442 + }, + { + "epoch": 0.0722, + "grad_norm": 6.982891082763672, + "learning_rate": 1.444e-05, + "loss": 0.9284, + "step": 1444 + }, + { + "epoch": 0.0723, + "grad_norm": 7.703229904174805, + "learning_rate": 1.446e-05, + "loss": 1.5924, + "step": 1446 + }, + { + "epoch": 0.0724, + "grad_norm": 5.820496559143066, + "learning_rate": 1.448e-05, + "loss": 1.5815, + "step": 1448 + }, + { + "epoch": 0.0725, + "grad_norm": 1.5967824459075928, + "learning_rate": 1.45e-05, + "loss": 0.7221, + "step": 1450 + }, + { + "epoch": 0.0726, + "grad_norm": 6.790178298950195, + "learning_rate": 1.4520000000000002e-05, + "loss": 0.8961, + "step": 1452 + }, + { + "epoch": 0.0727, + "grad_norm": 18.988529205322266, + "learning_rate": 1.4540000000000001e-05, + "loss": 1.3786, + "step": 1454 + }, + { + "epoch": 0.0728, + "grad_norm": 7.767969131469727, + "learning_rate": 1.4560000000000001e-05, + "loss": 1.2954, + "step": 1456 + }, + { + "epoch": 0.0729, + "grad_norm": 10.46220874786377, + "learning_rate": 1.4580000000000001e-05, + "loss": 1.0217, + "step": 1458 + }, + { + "epoch": 0.073, + "grad_norm": 5.412830352783203, + "learning_rate": 1.46e-05, + "loss": 1.3571, + "step": 1460 + }, + { + "epoch": 0.0731, + "grad_norm": 5.603935241699219, + "learning_rate": 1.462e-05, + "loss": 1.1356, + "step": 1462 + }, + { + "epoch": 0.0732, + "grad_norm": 5.354146957397461, + "learning_rate": 1.464e-05, + "loss": 1.3713, + "step": 1464 + }, + { + "epoch": 0.0733, + "grad_norm": 24.12149429321289, + "learning_rate": 1.466e-05, + "loss": 2.626, + "step": 1466 + }, + { + "epoch": 0.0734, + "grad_norm": 5.933888912200928, + "learning_rate": 1.4680000000000002e-05, + "loss": 1.3579, + "step": 1468 + }, + { + "epoch": 0.0735, + "grad_norm": 2.0073676109313965, + "learning_rate": 1.4700000000000002e-05, + "loss": 0.1123, + "step": 1470 + }, + { + "epoch": 0.0736, + "grad_norm": 7.07862663269043, + "learning_rate": 1.4720000000000001e-05, + "loss": 1.7297, + "step": 1472 + }, + { + "epoch": 0.0737, + "grad_norm": 14.096522331237793, + "learning_rate": 1.4740000000000001e-05, + "loss": 2.5762, + "step": 1474 + }, + { + "epoch": 0.0738, + "grad_norm": 46.527130126953125, + "learning_rate": 1.4760000000000001e-05, + "loss": 1.472, + "step": 1476 + }, + { + "epoch": 0.0739, + "grad_norm": 1.9647259712219238, + "learning_rate": 1.478e-05, + "loss": 1.5602, + "step": 1478 + }, + { + "epoch": 0.074, + "grad_norm": 7.14523983001709, + "learning_rate": 1.48e-05, + "loss": 1.1067, + "step": 1480 + }, + { + "epoch": 0.0741, + "grad_norm": 10.191791534423828, + "learning_rate": 1.482e-05, + "loss": 1.3686, + "step": 1482 + }, + { + "epoch": 0.0742, + "grad_norm": 10.454906463623047, + "learning_rate": 1.4840000000000002e-05, + "loss": 1.7544, + "step": 1484 + }, + { + "epoch": 0.0743, + "grad_norm": 3.9503419399261475, + "learning_rate": 1.4860000000000002e-05, + "loss": 1.2513, + "step": 1486 + }, + { + "epoch": 0.0744, + "grad_norm": 8.244281768798828, + "learning_rate": 1.4880000000000002e-05, + "loss": 1.6148, + "step": 1488 + }, + { + "epoch": 0.0745, + "grad_norm": 14.929779052734375, + "learning_rate": 1.4900000000000001e-05, + "loss": 1.0532, + "step": 1490 + }, + { + "epoch": 0.0746, + "grad_norm": 5.732949733734131, + "learning_rate": 1.4920000000000001e-05, + "loss": 0.7212, + "step": 1492 + }, + { + "epoch": 0.0747, + "grad_norm": 6.898961067199707, + "learning_rate": 1.4940000000000001e-05, + "loss": 1.4537, + "step": 1494 + }, + { + "epoch": 0.0748, + "grad_norm": 4.438570499420166, + "learning_rate": 1.496e-05, + "loss": 0.5258, + "step": 1496 + }, + { + "epoch": 0.0749, + "grad_norm": 12.411385536193848, + "learning_rate": 1.498e-05, + "loss": 0.7323, + "step": 1498 + }, + { + "epoch": 0.075, + "grad_norm": 7.320066928863525, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.3767, + "step": 1500 + }, + { + "epoch": 0.0751, + "grad_norm": 4.7857279777526855, + "learning_rate": 1.5020000000000002e-05, + "loss": 1.7208, + "step": 1502 + }, + { + "epoch": 0.0752, + "grad_norm": 6.599417686462402, + "learning_rate": 1.5040000000000002e-05, + "loss": 1.2209, + "step": 1504 + }, + { + "epoch": 0.0753, + "grad_norm": 9.805431365966797, + "learning_rate": 1.5060000000000001e-05, + "loss": 1.3166, + "step": 1506 + }, + { + "epoch": 0.0754, + "grad_norm": 8.815316200256348, + "learning_rate": 1.5080000000000001e-05, + "loss": 0.2586, + "step": 1508 + }, + { + "epoch": 0.0755, + "grad_norm": 6.093076229095459, + "learning_rate": 1.5100000000000001e-05, + "loss": 2.5123, + "step": 1510 + }, + { + "epoch": 0.0756, + "grad_norm": 4.655435085296631, + "learning_rate": 1.5120000000000001e-05, + "loss": 1.6781, + "step": 1512 + }, + { + "epoch": 0.0757, + "grad_norm": 39.41606903076172, + "learning_rate": 1.514e-05, + "loss": 2.0758, + "step": 1514 + }, + { + "epoch": 0.0758, + "grad_norm": 3.747539758682251, + "learning_rate": 1.516e-05, + "loss": 0.828, + "step": 1516 + }, + { + "epoch": 0.0759, + "grad_norm": 3.5535902976989746, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.577, + "step": 1518 + }, + { + "epoch": 0.076, + "grad_norm": 4.786956310272217, + "learning_rate": 1.5200000000000002e-05, + "loss": 1.9045, + "step": 1520 + }, + { + "epoch": 0.0761, + "grad_norm": 4.762860298156738, + "learning_rate": 1.5220000000000002e-05, + "loss": 0.7032, + "step": 1522 + }, + { + "epoch": 0.0762, + "grad_norm": 3.188835859298706, + "learning_rate": 1.5240000000000001e-05, + "loss": 1.1837, + "step": 1524 + }, + { + "epoch": 0.0763, + "grad_norm": 4.279704570770264, + "learning_rate": 1.5260000000000003e-05, + "loss": 0.6111, + "step": 1526 + }, + { + "epoch": 0.0764, + "grad_norm": 4.4822540283203125, + "learning_rate": 1.5280000000000003e-05, + "loss": 1.6057, + "step": 1528 + }, + { + "epoch": 0.0765, + "grad_norm": 7.323385715484619, + "learning_rate": 1.5300000000000003e-05, + "loss": 0.8735, + "step": 1530 + }, + { + "epoch": 0.0766, + "grad_norm": 6.732241153717041, + "learning_rate": 1.5320000000000002e-05, + "loss": 1.7098, + "step": 1532 + }, + { + "epoch": 0.0767, + "grad_norm": 2.7791249752044678, + "learning_rate": 1.5340000000000002e-05, + "loss": 1.1935, + "step": 1534 + }, + { + "epoch": 0.0768, + "grad_norm": 3.514354944229126, + "learning_rate": 1.5360000000000002e-05, + "loss": 1.3353, + "step": 1536 + }, + { + "epoch": 0.0769, + "grad_norm": 6.972660541534424, + "learning_rate": 1.5380000000000002e-05, + "loss": 1.1248, + "step": 1538 + }, + { + "epoch": 0.077, + "grad_norm": 5.514278411865234, + "learning_rate": 1.54e-05, + "loss": 1.4866, + "step": 1540 + }, + { + "epoch": 0.0771, + "grad_norm": 8.505427360534668, + "learning_rate": 1.542e-05, + "loss": 1.4854, + "step": 1542 + }, + { + "epoch": 0.0772, + "grad_norm": 0.8456571102142334, + "learning_rate": 1.544e-05, + "loss": 0.6384, + "step": 1544 + }, + { + "epoch": 0.0773, + "grad_norm": 3.089761257171631, + "learning_rate": 1.546e-05, + "loss": 0.6169, + "step": 1546 + }, + { + "epoch": 0.0774, + "grad_norm": 6.673521995544434, + "learning_rate": 1.548e-05, + "loss": 0.784, + "step": 1548 + }, + { + "epoch": 0.0775, + "grad_norm": 6.985649108886719, + "learning_rate": 1.55e-05, + "loss": 1.0077, + "step": 1550 + }, + { + "epoch": 0.0776, + "grad_norm": 6.619007587432861, + "learning_rate": 1.552e-05, + "loss": 1.1551, + "step": 1552 + }, + { + "epoch": 0.0777, + "grad_norm": 4.011566162109375, + "learning_rate": 1.554e-05, + "loss": 0.8417, + "step": 1554 + }, + { + "epoch": 0.0778, + "grad_norm": 5.2276716232299805, + "learning_rate": 1.556e-05, + "loss": 1.527, + "step": 1556 + }, + { + "epoch": 0.0779, + "grad_norm": 3.5621542930603027, + "learning_rate": 1.5580000000000003e-05, + "loss": 1.2094, + "step": 1558 + }, + { + "epoch": 0.078, + "grad_norm": 3.9600889682769775, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.6411, + "step": 1560 + }, + { + "epoch": 0.0781, + "grad_norm": 6.186970233917236, + "learning_rate": 1.5620000000000003e-05, + "loss": 0.8387, + "step": 1562 + }, + { + "epoch": 0.0782, + "grad_norm": 4.962181091308594, + "learning_rate": 1.5640000000000003e-05, + "loss": 0.9643, + "step": 1564 + }, + { + "epoch": 0.0783, + "grad_norm": 4.457250595092773, + "learning_rate": 1.5660000000000003e-05, + "loss": 1.5216, + "step": 1566 + }, + { + "epoch": 0.0784, + "grad_norm": 8.559389114379883, + "learning_rate": 1.5680000000000002e-05, + "loss": 1.0739, + "step": 1568 + }, + { + "epoch": 0.0785, + "grad_norm": 5.561888694763184, + "learning_rate": 1.5700000000000002e-05, + "loss": 0.6644, + "step": 1570 + }, + { + "epoch": 0.0786, + "grad_norm": 6.130020618438721, + "learning_rate": 1.5720000000000002e-05, + "loss": 0.8746, + "step": 1572 + }, + { + "epoch": 0.0787, + "grad_norm": 3.484860420227051, + "learning_rate": 1.5740000000000002e-05, + "loss": 0.666, + "step": 1574 + }, + { + "epoch": 0.0788, + "grad_norm": 3.045590400695801, + "learning_rate": 1.576e-05, + "loss": 1.0619, + "step": 1576 + }, + { + "epoch": 0.0789, + "grad_norm": 6.4036641120910645, + "learning_rate": 1.578e-05, + "loss": 0.8544, + "step": 1578 + }, + { + "epoch": 0.079, + "grad_norm": 5.566048622131348, + "learning_rate": 1.58e-05, + "loss": 1.0429, + "step": 1580 + }, + { + "epoch": 0.0791, + "grad_norm": 6.232741832733154, + "learning_rate": 1.582e-05, + "loss": 1.0851, + "step": 1582 + }, + { + "epoch": 0.0792, + "grad_norm": 6.198548793792725, + "learning_rate": 1.584e-05, + "loss": 1.1469, + "step": 1584 + }, + { + "epoch": 0.0793, + "grad_norm": 5.069294452667236, + "learning_rate": 1.586e-05, + "loss": 0.5196, + "step": 1586 + }, + { + "epoch": 0.0794, + "grad_norm": 5.8471221923828125, + "learning_rate": 1.588e-05, + "loss": 0.7458, + "step": 1588 + }, + { + "epoch": 0.0795, + "grad_norm": 6.009354114532471, + "learning_rate": 1.5900000000000004e-05, + "loss": 1.2436, + "step": 1590 + }, + { + "epoch": 0.0796, + "grad_norm": 4.490111827850342, + "learning_rate": 1.5920000000000003e-05, + "loss": 0.5711, + "step": 1592 + }, + { + "epoch": 0.0797, + "grad_norm": 6.5707831382751465, + "learning_rate": 1.5940000000000003e-05, + "loss": 1.4978, + "step": 1594 + }, + { + "epoch": 0.0798, + "grad_norm": 15.269163131713867, + "learning_rate": 1.5960000000000003e-05, + "loss": 1.3612, + "step": 1596 + }, + { + "epoch": 0.0799, + "grad_norm": 0.832142174243927, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.7232, + "step": 1598 + }, + { + "epoch": 0.08, + "grad_norm": 6.67850923538208, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.1882, + "step": 1600 + }, + { + "epoch": 0.0801, + "grad_norm": 6.903225421905518, + "learning_rate": 1.6020000000000002e-05, + "loss": 1.6177, + "step": 1602 + }, + { + "epoch": 0.0802, + "grad_norm": 3.276470899581909, + "learning_rate": 1.6040000000000002e-05, + "loss": 0.7831, + "step": 1604 + }, + { + "epoch": 0.0803, + "grad_norm": 4.4419660568237305, + "learning_rate": 1.6060000000000002e-05, + "loss": 0.7443, + "step": 1606 + }, + { + "epoch": 0.0804, + "grad_norm": 5.4527387619018555, + "learning_rate": 1.6080000000000002e-05, + "loss": 0.6405, + "step": 1608 + }, + { + "epoch": 0.0805, + "grad_norm": 14.332117080688477, + "learning_rate": 1.6100000000000002e-05, + "loss": 1.8046, + "step": 1610 + }, + { + "epoch": 0.0806, + "grad_norm": 6.156528949737549, + "learning_rate": 1.612e-05, + "loss": 0.8589, + "step": 1612 + }, + { + "epoch": 0.0807, + "grad_norm": 5.229343414306641, + "learning_rate": 1.614e-05, + "loss": 0.7946, + "step": 1614 + }, + { + "epoch": 0.0808, + "grad_norm": 5.473735332489014, + "learning_rate": 1.616e-05, + "loss": 1.0236, + "step": 1616 + }, + { + "epoch": 0.0809, + "grad_norm": 4.288589954376221, + "learning_rate": 1.618e-05, + "loss": 1.1782, + "step": 1618 + }, + { + "epoch": 0.081, + "grad_norm": 12.664408683776855, + "learning_rate": 1.62e-05, + "loss": 2.0657, + "step": 1620 + }, + { + "epoch": 0.0811, + "grad_norm": 5.344147205352783, + "learning_rate": 1.6220000000000004e-05, + "loss": 1.223, + "step": 1622 + }, + { + "epoch": 0.0812, + "grad_norm": 5.020016193389893, + "learning_rate": 1.6240000000000004e-05, + "loss": 1.3683, + "step": 1624 + }, + { + "epoch": 0.0813, + "grad_norm": 6.023993968963623, + "learning_rate": 1.626e-05, + "loss": 1.3277, + "step": 1626 + }, + { + "epoch": 0.0814, + "grad_norm": 9.604656219482422, + "learning_rate": 1.628e-05, + "loss": 1.5686, + "step": 1628 + }, + { + "epoch": 0.0815, + "grad_norm": 23.717288970947266, + "learning_rate": 1.63e-05, + "loss": 2.4441, + "step": 1630 + }, + { + "epoch": 0.0816, + "grad_norm": 13.477996826171875, + "learning_rate": 1.632e-05, + "loss": 1.1314, + "step": 1632 + }, + { + "epoch": 0.0817, + "grad_norm": 9.284826278686523, + "learning_rate": 1.634e-05, + "loss": 0.5121, + "step": 1634 + }, + { + "epoch": 0.0818, + "grad_norm": 3.542259693145752, + "learning_rate": 1.636e-05, + "loss": 0.9671, + "step": 1636 + }, + { + "epoch": 0.0819, + "grad_norm": 3.139369487762451, + "learning_rate": 1.638e-05, + "loss": 0.911, + "step": 1638 + }, + { + "epoch": 0.082, + "grad_norm": 2.65303111076355, + "learning_rate": 1.64e-05, + "loss": 0.639, + "step": 1640 + }, + { + "epoch": 0.0821, + "grad_norm": 3.2198517322540283, + "learning_rate": 1.6420000000000002e-05, + "loss": 1.5071, + "step": 1642 + }, + { + "epoch": 0.0822, + "grad_norm": 6.8495707511901855, + "learning_rate": 1.6440000000000002e-05, + "loss": 1.4739, + "step": 1644 + }, + { + "epoch": 0.0823, + "grad_norm": 45.6915283203125, + "learning_rate": 1.646e-05, + "loss": 1.0808, + "step": 1646 + }, + { + "epoch": 0.0824, + "grad_norm": 7.354249477386475, + "learning_rate": 1.648e-05, + "loss": 0.7287, + "step": 1648 + }, + { + "epoch": 0.0825, + "grad_norm": 6.704108238220215, + "learning_rate": 1.65e-05, + "loss": 1.0735, + "step": 1650 + }, + { + "epoch": 0.0826, + "grad_norm": 14.967645645141602, + "learning_rate": 1.652e-05, + "loss": 1.3308, + "step": 1652 + }, + { + "epoch": 0.0827, + "grad_norm": 11.237373352050781, + "learning_rate": 1.654e-05, + "loss": 0.8731, + "step": 1654 + }, + { + "epoch": 0.0828, + "grad_norm": 8.797039031982422, + "learning_rate": 1.656e-05, + "loss": 1.823, + "step": 1656 + }, + { + "epoch": 0.0829, + "grad_norm": 4.507464408874512, + "learning_rate": 1.658e-05, + "loss": 0.9308, + "step": 1658 + }, + { + "epoch": 0.083, + "grad_norm": 14.249191284179688, + "learning_rate": 1.66e-05, + "loss": 1.7732, + "step": 1660 + }, + { + "epoch": 0.0831, + "grad_norm": 4.250948905944824, + "learning_rate": 1.662e-05, + "loss": 0.557, + "step": 1662 + }, + { + "epoch": 0.0832, + "grad_norm": 2.8616936206817627, + "learning_rate": 1.664e-05, + "loss": 1.2346, + "step": 1664 + }, + { + "epoch": 0.0833, + "grad_norm": 2.836146593093872, + "learning_rate": 1.666e-05, + "loss": 0.6394, + "step": 1666 + }, + { + "epoch": 0.0834, + "grad_norm": 5.6544036865234375, + "learning_rate": 1.668e-05, + "loss": 1.7694, + "step": 1668 + }, + { + "epoch": 0.0835, + "grad_norm": 5.5352253913879395, + "learning_rate": 1.67e-05, + "loss": 1.547, + "step": 1670 + }, + { + "epoch": 0.0836, + "grad_norm": 10.180037498474121, + "learning_rate": 1.672e-05, + "loss": 0.5811, + "step": 1672 + }, + { + "epoch": 0.0837, + "grad_norm": 3.2550601959228516, + "learning_rate": 1.6740000000000002e-05, + "loss": 0.4983, + "step": 1674 + }, + { + "epoch": 0.0838, + "grad_norm": 4.309610366821289, + "learning_rate": 1.6760000000000002e-05, + "loss": 0.622, + "step": 1676 + }, + { + "epoch": 0.0839, + "grad_norm": 7.231729984283447, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.8145, + "step": 1678 + }, + { + "epoch": 0.084, + "grad_norm": 5.543543338775635, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.2271, + "step": 1680 + }, + { + "epoch": 0.0841, + "grad_norm": 6.444944381713867, + "learning_rate": 1.682e-05, + "loss": 1.2366, + "step": 1682 + }, + { + "epoch": 0.0842, + "grad_norm": 9.841185569763184, + "learning_rate": 1.684e-05, + "loss": 1.2164, + "step": 1684 + }, + { + "epoch": 0.0843, + "grad_norm": 7.836580753326416, + "learning_rate": 1.686e-05, + "loss": 0.8447, + "step": 1686 + }, + { + "epoch": 0.0844, + "grad_norm": 9.705449104309082, + "learning_rate": 1.688e-05, + "loss": 1.5204, + "step": 1688 + }, + { + "epoch": 0.0845, + "grad_norm": 1.9366055727005005, + "learning_rate": 1.69e-05, + "loss": 0.6301, + "step": 1690 + }, + { + "epoch": 0.0846, + "grad_norm": 10.24492359161377, + "learning_rate": 1.692e-05, + "loss": 1.1414, + "step": 1692 + }, + { + "epoch": 0.0847, + "grad_norm": 5.299622058868408, + "learning_rate": 1.694e-05, + "loss": 0.7832, + "step": 1694 + }, + { + "epoch": 0.0848, + "grad_norm": 6.796903133392334, + "learning_rate": 1.696e-05, + "loss": 0.9887, + "step": 1696 + }, + { + "epoch": 0.0849, + "grad_norm": 1.03429114818573, + "learning_rate": 1.698e-05, + "loss": 0.8863, + "step": 1698 + }, + { + "epoch": 0.085, + "grad_norm": 19.534107208251953, + "learning_rate": 1.7e-05, + "loss": 1.2648, + "step": 1700 + }, + { + "epoch": 0.0851, + "grad_norm": 5.998940467834473, + "learning_rate": 1.702e-05, + "loss": 0.5759, + "step": 1702 + }, + { + "epoch": 0.0852, + "grad_norm": 0.4458163380622864, + "learning_rate": 1.704e-05, + "loss": 0.2933, + "step": 1704 + }, + { + "epoch": 0.0853, + "grad_norm": 6.0511274337768555, + "learning_rate": 1.7060000000000003e-05, + "loss": 1.4158, + "step": 1706 + }, + { + "epoch": 0.0854, + "grad_norm": 9.737689018249512, + "learning_rate": 1.7080000000000002e-05, + "loss": 1.2685, + "step": 1708 + }, + { + "epoch": 0.0855, + "grad_norm": 4.284607887268066, + "learning_rate": 1.7100000000000002e-05, + "loss": 0.7578, + "step": 1710 + }, + { + "epoch": 0.0856, + "grad_norm": 4.8825201988220215, + "learning_rate": 1.7120000000000002e-05, + "loss": 1.1533, + "step": 1712 + }, + { + "epoch": 0.0857, + "grad_norm": 3.368161678314209, + "learning_rate": 1.7140000000000002e-05, + "loss": 1.635, + "step": 1714 + }, + { + "epoch": 0.0858, + "grad_norm": 2.274350166320801, + "learning_rate": 1.7160000000000002e-05, + "loss": 0.7365, + "step": 1716 + }, + { + "epoch": 0.0859, + "grad_norm": 14.783956527709961, + "learning_rate": 1.718e-05, + "loss": 1.0363, + "step": 1718 + }, + { + "epoch": 0.086, + "grad_norm": 9.687543869018555, + "learning_rate": 1.72e-05, + "loss": 0.9087, + "step": 1720 + }, + { + "epoch": 0.0861, + "grad_norm": 4.544224262237549, + "learning_rate": 1.722e-05, + "loss": 0.9344, + "step": 1722 + }, + { + "epoch": 0.0862, + "grad_norm": 6.332923889160156, + "learning_rate": 1.724e-05, + "loss": 0.5821, + "step": 1724 + }, + { + "epoch": 0.0863, + "grad_norm": 8.802512168884277, + "learning_rate": 1.726e-05, + "loss": 0.834, + "step": 1726 + }, + { + "epoch": 0.0864, + "grad_norm": 14.101449966430664, + "learning_rate": 1.728e-05, + "loss": 2.0976, + "step": 1728 + }, + { + "epoch": 0.0865, + "grad_norm": 5.93363618850708, + "learning_rate": 1.73e-05, + "loss": 1.3464, + "step": 1730 + }, + { + "epoch": 0.0866, + "grad_norm": 4.3302764892578125, + "learning_rate": 1.732e-05, + "loss": 1.3794, + "step": 1732 + }, + { + "epoch": 0.0867, + "grad_norm": 5.948344707489014, + "learning_rate": 1.734e-05, + "loss": 0.7297, + "step": 1734 + }, + { + "epoch": 0.0868, + "grad_norm": 5.426723480224609, + "learning_rate": 1.736e-05, + "loss": 1.2631, + "step": 1736 + }, + { + "epoch": 0.0869, + "grad_norm": 7.150834083557129, + "learning_rate": 1.7380000000000003e-05, + "loss": 1.0231, + "step": 1738 + }, + { + "epoch": 0.087, + "grad_norm": 9.162540435791016, + "learning_rate": 1.7400000000000003e-05, + "loss": 1.0252, + "step": 1740 + }, + { + "epoch": 0.0871, + "grad_norm": 34.091148376464844, + "learning_rate": 1.7420000000000003e-05, + "loss": 1.5199, + "step": 1742 + }, + { + "epoch": 0.0872, + "grad_norm": 2.6562986373901367, + "learning_rate": 1.7440000000000002e-05, + "loss": 0.7726, + "step": 1744 + }, + { + "epoch": 0.0873, + "grad_norm": 4.370255947113037, + "learning_rate": 1.7460000000000002e-05, + "loss": 0.8121, + "step": 1746 + }, + { + "epoch": 0.0874, + "grad_norm": 3.62629771232605, + "learning_rate": 1.7480000000000002e-05, + "loss": 1.1303, + "step": 1748 + }, + { + "epoch": 0.0875, + "grad_norm": 7.243328094482422, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.6101, + "step": 1750 + }, + { + "epoch": 0.0876, + "grad_norm": 4.262324333190918, + "learning_rate": 1.752e-05, + "loss": 1.5866, + "step": 1752 + }, + { + "epoch": 0.0877, + "grad_norm": 4.266479015350342, + "learning_rate": 1.754e-05, + "loss": 0.7793, + "step": 1754 + }, + { + "epoch": 0.0878, + "grad_norm": 3.399143934249878, + "learning_rate": 1.756e-05, + "loss": 1.2661, + "step": 1756 + }, + { + "epoch": 0.0879, + "grad_norm": 2.1558520793914795, + "learning_rate": 1.758e-05, + "loss": 0.2735, + "step": 1758 + }, + { + "epoch": 0.088, + "grad_norm": 4.199793815612793, + "learning_rate": 1.76e-05, + "loss": 0.9939, + "step": 1760 + }, + { + "epoch": 0.0881, + "grad_norm": 9.332736015319824, + "learning_rate": 1.762e-05, + "loss": 1.126, + "step": 1762 + }, + { + "epoch": 0.0882, + "grad_norm": 9.398019790649414, + "learning_rate": 1.764e-05, + "loss": 1.1932, + "step": 1764 + }, + { + "epoch": 0.0883, + "grad_norm": 5.996257781982422, + "learning_rate": 1.766e-05, + "loss": 0.6756, + "step": 1766 + }, + { + "epoch": 0.0884, + "grad_norm": 3.6213271617889404, + "learning_rate": 1.768e-05, + "loss": 1.1293, + "step": 1768 + }, + { + "epoch": 0.0885, + "grad_norm": 5.881955623626709, + "learning_rate": 1.77e-05, + "loss": 0.7709, + "step": 1770 + }, + { + "epoch": 0.0886, + "grad_norm": 17.262649536132812, + "learning_rate": 1.7720000000000003e-05, + "loss": 1.6913, + "step": 1772 + }, + { + "epoch": 0.0887, + "grad_norm": 2.969017267227173, + "learning_rate": 1.7740000000000003e-05, + "loss": 1.0729, + "step": 1774 + }, + { + "epoch": 0.0888, + "grad_norm": 6.495604038238525, + "learning_rate": 1.7760000000000003e-05, + "loss": 1.0538, + "step": 1776 + }, + { + "epoch": 0.0889, + "grad_norm": 4.9988837242126465, + "learning_rate": 1.7780000000000003e-05, + "loss": 1.0354, + "step": 1778 + }, + { + "epoch": 0.089, + "grad_norm": 5.415066719055176, + "learning_rate": 1.7800000000000002e-05, + "loss": 0.9608, + "step": 1780 + }, + { + "epoch": 0.0891, + "grad_norm": 1.6169946193695068, + "learning_rate": 1.7820000000000002e-05, + "loss": 0.5985, + "step": 1782 + }, + { + "epoch": 0.0892, + "grad_norm": 6.858080863952637, + "learning_rate": 1.7840000000000002e-05, + "loss": 1.0027, + "step": 1784 + }, + { + "epoch": 0.0893, + "grad_norm": 0.30530184507369995, + "learning_rate": 1.7860000000000002e-05, + "loss": 0.4139, + "step": 1786 + }, + { + "epoch": 0.0894, + "grad_norm": 2.352024793624878, + "learning_rate": 1.788e-05, + "loss": 0.6588, + "step": 1788 + }, + { + "epoch": 0.0895, + "grad_norm": 6.281435966491699, + "learning_rate": 1.79e-05, + "loss": 1.6035, + "step": 1790 + }, + { + "epoch": 0.0896, + "grad_norm": 5.124031066894531, + "learning_rate": 1.792e-05, + "loss": 1.2272, + "step": 1792 + }, + { + "epoch": 0.0897, + "grad_norm": 4.027751922607422, + "learning_rate": 1.794e-05, + "loss": 0.573, + "step": 1794 + }, + { + "epoch": 0.0898, + "grad_norm": 4.998028755187988, + "learning_rate": 1.796e-05, + "loss": 1.1441, + "step": 1796 + }, + { + "epoch": 0.0899, + "grad_norm": 3.309250593185425, + "learning_rate": 1.798e-05, + "loss": 1.3229, + "step": 1798 + }, + { + "epoch": 0.09, + "grad_norm": 8.782491683959961, + "learning_rate": 1.8e-05, + "loss": 0.662, + "step": 1800 + }, + { + "epoch": 0.0901, + "grad_norm": 3.6818721294403076, + "learning_rate": 1.802e-05, + "loss": 0.9969, + "step": 1802 + }, + { + "epoch": 0.0902, + "grad_norm": 14.847655296325684, + "learning_rate": 1.8040000000000003e-05, + "loss": 1.6097, + "step": 1804 + }, + { + "epoch": 0.0903, + "grad_norm": 8.780677795410156, + "learning_rate": 1.8060000000000003e-05, + "loss": 1.1406, + "step": 1806 + }, + { + "epoch": 0.0904, + "grad_norm": 4.410964488983154, + "learning_rate": 1.8080000000000003e-05, + "loss": 0.9433, + "step": 1808 + }, + { + "epoch": 0.0905, + "grad_norm": 6.927922248840332, + "learning_rate": 1.8100000000000003e-05, + "loss": 1.3179, + "step": 1810 + }, + { + "epoch": 0.0906, + "grad_norm": 14.951257705688477, + "learning_rate": 1.8120000000000003e-05, + "loss": 1.2183, + "step": 1812 + }, + { + "epoch": 0.0907, + "grad_norm": 16.835424423217773, + "learning_rate": 1.8140000000000003e-05, + "loss": 1.011, + "step": 1814 + }, + { + "epoch": 0.0908, + "grad_norm": 2.5623795986175537, + "learning_rate": 1.8160000000000002e-05, + "loss": 0.9126, + "step": 1816 + }, + { + "epoch": 0.0909, + "grad_norm": 3.048211097717285, + "learning_rate": 1.8180000000000002e-05, + "loss": 1.4878, + "step": 1818 + }, + { + "epoch": 0.091, + "grad_norm": 13.521275520324707, + "learning_rate": 1.8200000000000002e-05, + "loss": 1.0626, + "step": 1820 + }, + { + "epoch": 0.0911, + "grad_norm": 10.930695533752441, + "learning_rate": 1.8220000000000002e-05, + "loss": 0.8008, + "step": 1822 + }, + { + "epoch": 0.0912, + "grad_norm": 0.3371714949607849, + "learning_rate": 1.824e-05, + "loss": 0.5639, + "step": 1824 + }, + { + "epoch": 0.0913, + "grad_norm": 14.203812599182129, + "learning_rate": 1.826e-05, + "loss": 1.346, + "step": 1826 + }, + { + "epoch": 0.0914, + "grad_norm": 3.911320686340332, + "learning_rate": 1.828e-05, + "loss": 0.4734, + "step": 1828 + }, + { + "epoch": 0.0915, + "grad_norm": 4.598682403564453, + "learning_rate": 1.83e-05, + "loss": 1.2675, + "step": 1830 + }, + { + "epoch": 0.0916, + "grad_norm": 0.722104549407959, + "learning_rate": 1.832e-05, + "loss": 0.7247, + "step": 1832 + }, + { + "epoch": 0.0917, + "grad_norm": 12.244229316711426, + "learning_rate": 1.834e-05, + "loss": 1.1693, + "step": 1834 + }, + { + "epoch": 0.0918, + "grad_norm": 3.057046413421631, + "learning_rate": 1.8360000000000004e-05, + "loss": 1.3748, + "step": 1836 + }, + { + "epoch": 0.0919, + "grad_norm": 7.486143112182617, + "learning_rate": 1.8380000000000004e-05, + "loss": 1.0866, + "step": 1838 + }, + { + "epoch": 0.092, + "grad_norm": 14.562644958496094, + "learning_rate": 1.8400000000000003e-05, + "loss": 0.9966, + "step": 1840 + }, + { + "epoch": 0.0921, + "grad_norm": 7.560813903808594, + "learning_rate": 1.8420000000000003e-05, + "loss": 1.2113, + "step": 1842 + }, + { + "epoch": 0.0922, + "grad_norm": 5.076328754425049, + "learning_rate": 1.8440000000000003e-05, + "loss": 1.4944, + "step": 1844 + }, + { + "epoch": 0.0923, + "grad_norm": 8.508075714111328, + "learning_rate": 1.8460000000000003e-05, + "loss": 2.9955, + "step": 1846 + }, + { + "epoch": 0.0924, + "grad_norm": 3.322956085205078, + "learning_rate": 1.8480000000000003e-05, + "loss": 1.5473, + "step": 1848 + }, + { + "epoch": 0.0925, + "grad_norm": 6.028435707092285, + "learning_rate": 1.8500000000000002e-05, + "loss": 1.0391, + "step": 1850 + }, + { + "epoch": 0.0926, + "grad_norm": 2.968980312347412, + "learning_rate": 1.8520000000000002e-05, + "loss": 0.189, + "step": 1852 + }, + { + "epoch": 0.0927, + "grad_norm": 11.974676132202148, + "learning_rate": 1.8540000000000002e-05, + "loss": 1.8091, + "step": 1854 + }, + { + "epoch": 0.0928, + "grad_norm": 7.013382911682129, + "learning_rate": 1.8560000000000002e-05, + "loss": 1.7347, + "step": 1856 + }, + { + "epoch": 0.0929, + "grad_norm": 9.187153816223145, + "learning_rate": 1.858e-05, + "loss": 1.4714, + "step": 1858 + }, + { + "epoch": 0.093, + "grad_norm": 4.30838680267334, + "learning_rate": 1.86e-05, + "loss": 1.116, + "step": 1860 + }, + { + "epoch": 0.0931, + "grad_norm": 4.026073932647705, + "learning_rate": 1.862e-05, + "loss": 1.5222, + "step": 1862 + }, + { + "epoch": 0.0932, + "grad_norm": 5.548600196838379, + "learning_rate": 1.864e-05, + "loss": 1.1226, + "step": 1864 + }, + { + "epoch": 0.0933, + "grad_norm": 2.633415699005127, + "learning_rate": 1.866e-05, + "loss": 1.3933, + "step": 1866 + }, + { + "epoch": 0.0934, + "grad_norm": 2.496490478515625, + "learning_rate": 1.8680000000000004e-05, + "loss": 0.4539, + "step": 1868 + }, + { + "epoch": 0.0935, + "grad_norm": 8.710137367248535, + "learning_rate": 1.8700000000000004e-05, + "loss": 1.2389, + "step": 1870 + }, + { + "epoch": 0.0936, + "grad_norm": 4.028244972229004, + "learning_rate": 1.8720000000000004e-05, + "loss": 0.8122, + "step": 1872 + }, + { + "epoch": 0.0937, + "grad_norm": 2.9653894901275635, + "learning_rate": 1.8740000000000004e-05, + "loss": 1.3287, + "step": 1874 + }, + { + "epoch": 0.0938, + "grad_norm": 5.590135097503662, + "learning_rate": 1.876e-05, + "loss": 1.1, + "step": 1876 + }, + { + "epoch": 0.0939, + "grad_norm": 4.508907318115234, + "learning_rate": 1.878e-05, + "loss": 1.2423, + "step": 1878 + }, + { + "epoch": 0.094, + "grad_norm": 4.8375935554504395, + "learning_rate": 1.88e-05, + "loss": 0.9901, + "step": 1880 + }, + { + "epoch": 0.0941, + "grad_norm": 9.069660186767578, + "learning_rate": 1.882e-05, + "loss": 0.9405, + "step": 1882 + }, + { + "epoch": 0.0942, + "grad_norm": 8.112188339233398, + "learning_rate": 1.884e-05, + "loss": 1.408, + "step": 1884 + }, + { + "epoch": 0.0943, + "grad_norm": 3.526505470275879, + "learning_rate": 1.886e-05, + "loss": 1.6134, + "step": 1886 + }, + { + "epoch": 0.0944, + "grad_norm": 3.457552909851074, + "learning_rate": 1.8880000000000002e-05, + "loss": 0.9866, + "step": 1888 + }, + { + "epoch": 0.0945, + "grad_norm": 9.279394149780273, + "learning_rate": 1.8900000000000002e-05, + "loss": 1.1032, + "step": 1890 + }, + { + "epoch": 0.0946, + "grad_norm": 12.204876899719238, + "learning_rate": 1.8920000000000002e-05, + "loss": 0.8021, + "step": 1892 + }, + { + "epoch": 0.0947, + "grad_norm": 2.060600996017456, + "learning_rate": 1.894e-05, + "loss": 0.5606, + "step": 1894 + }, + { + "epoch": 0.0948, + "grad_norm": 6.055602073669434, + "learning_rate": 1.896e-05, + "loss": 0.9039, + "step": 1896 + }, + { + "epoch": 0.0949, + "grad_norm": 34.761539459228516, + "learning_rate": 1.898e-05, + "loss": 2.2622, + "step": 1898 + }, + { + "epoch": 0.095, + "grad_norm": 8.321675300598145, + "learning_rate": 1.9e-05, + "loss": 0.6783, + "step": 1900 + }, + { + "epoch": 0.0951, + "grad_norm": 5.075240612030029, + "learning_rate": 1.902e-05, + "loss": 1.1548, + "step": 1902 + }, + { + "epoch": 0.0952, + "grad_norm": 7.804504871368408, + "learning_rate": 1.904e-05, + "loss": 0.9496, + "step": 1904 + }, + { + "epoch": 0.0953, + "grad_norm": 5.6548566818237305, + "learning_rate": 1.906e-05, + "loss": 1.3082, + "step": 1906 + }, + { + "epoch": 0.0954, + "grad_norm": 3.6705029010772705, + "learning_rate": 1.908e-05, + "loss": 0.428, + "step": 1908 + }, + { + "epoch": 0.0955, + "grad_norm": 9.425215721130371, + "learning_rate": 1.91e-05, + "loss": 1.3122, + "step": 1910 + }, + { + "epoch": 0.0956, + "grad_norm": 5.690339088439941, + "learning_rate": 1.912e-05, + "loss": 1.0647, + "step": 1912 + }, + { + "epoch": 0.0957, + "grad_norm": 3.447035074234009, + "learning_rate": 1.914e-05, + "loss": 1.1348, + "step": 1914 + }, + { + "epoch": 0.0958, + "grad_norm": 2.801941156387329, + "learning_rate": 1.916e-05, + "loss": 1.0077, + "step": 1916 + }, + { + "epoch": 0.0959, + "grad_norm": 4.38978910446167, + "learning_rate": 1.918e-05, + "loss": 1.0676, + "step": 1918 + }, + { + "epoch": 0.096, + "grad_norm": 4.446739673614502, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.078, + "step": 1920 + }, + { + "epoch": 0.0961, + "grad_norm": 2.371164560317993, + "learning_rate": 1.9220000000000002e-05, + "loss": 1.3155, + "step": 1922 + }, + { + "epoch": 0.0962, + "grad_norm": 4.746744155883789, + "learning_rate": 1.9240000000000002e-05, + "loss": 0.9704, + "step": 1924 + }, + { + "epoch": 0.0963, + "grad_norm": 2.719034433364868, + "learning_rate": 1.9260000000000002e-05, + "loss": 1.0352, + "step": 1926 + }, + { + "epoch": 0.0964, + "grad_norm": 5.245693206787109, + "learning_rate": 1.9280000000000002e-05, + "loss": 0.7016, + "step": 1928 + }, + { + "epoch": 0.0965, + "grad_norm": 4.651074409484863, + "learning_rate": 1.93e-05, + "loss": 1.2903, + "step": 1930 + }, + { + "epoch": 0.0966, + "grad_norm": 5.967883586883545, + "learning_rate": 1.932e-05, + "loss": 0.6146, + "step": 1932 + }, + { + "epoch": 0.0967, + "grad_norm": 7.372206687927246, + "learning_rate": 1.934e-05, + "loss": 1.371, + "step": 1934 + }, + { + "epoch": 0.0968, + "grad_norm": 4.903698444366455, + "learning_rate": 1.936e-05, + "loss": 1.0446, + "step": 1936 + }, + { + "epoch": 0.0969, + "grad_norm": 3.082838535308838, + "learning_rate": 1.938e-05, + "loss": 1.2458, + "step": 1938 + }, + { + "epoch": 0.097, + "grad_norm": 0.7963811755180359, + "learning_rate": 1.94e-05, + "loss": 0.496, + "step": 1940 + }, + { + "epoch": 0.0971, + "grad_norm": 6.869136810302734, + "learning_rate": 1.942e-05, + "loss": 1.0903, + "step": 1942 + }, + { + "epoch": 0.0972, + "grad_norm": 15.804500579833984, + "learning_rate": 1.944e-05, + "loss": 1.2119, + "step": 1944 + }, + { + "epoch": 0.0973, + "grad_norm": 5.02974796295166, + "learning_rate": 1.946e-05, + "loss": 1.2541, + "step": 1946 + }, + { + "epoch": 0.0974, + "grad_norm": 15.71091365814209, + "learning_rate": 1.948e-05, + "loss": 0.9498, + "step": 1948 + }, + { + "epoch": 0.0975, + "grad_norm": 5.3837127685546875, + "learning_rate": 1.95e-05, + "loss": 0.3154, + "step": 1950 + }, + { + "epoch": 0.0976, + "grad_norm": 1.1503568887710571, + "learning_rate": 1.9520000000000003e-05, + "loss": 0.833, + "step": 1952 + }, + { + "epoch": 0.0977, + "grad_norm": 3.077718496322632, + "learning_rate": 1.9540000000000003e-05, + "loss": 1.2294, + "step": 1954 + }, + { + "epoch": 0.0978, + "grad_norm": 11.421892166137695, + "learning_rate": 1.9560000000000002e-05, + "loss": 1.7425, + "step": 1956 + }, + { + "epoch": 0.0979, + "grad_norm": 5.130181312561035, + "learning_rate": 1.9580000000000002e-05, + "loss": 1.2019, + "step": 1958 + }, + { + "epoch": 0.098, + "grad_norm": 6.460605144500732, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.5863, + "step": 1960 + }, + { + "epoch": 0.0981, + "grad_norm": 3.9000513553619385, + "learning_rate": 1.9620000000000002e-05, + "loss": 0.969, + "step": 1962 + }, + { + "epoch": 0.0982, + "grad_norm": 10.90976333618164, + "learning_rate": 1.9640000000000002e-05, + "loss": 1.1674, + "step": 1964 + }, + { + "epoch": 0.0983, + "grad_norm": 4.967965602874756, + "learning_rate": 1.966e-05, + "loss": 1.206, + "step": 1966 + }, + { + "epoch": 0.0984, + "grad_norm": 5.328191757202148, + "learning_rate": 1.968e-05, + "loss": 1.0352, + "step": 1968 + }, + { + "epoch": 0.0985, + "grad_norm": 10.202163696289062, + "learning_rate": 1.97e-05, + "loss": 0.5398, + "step": 1970 + }, + { + "epoch": 0.0986, + "grad_norm": 3.5973713397979736, + "learning_rate": 1.972e-05, + "loss": 0.9523, + "step": 1972 + }, + { + "epoch": 0.0987, + "grad_norm": 3.373403549194336, + "learning_rate": 1.974e-05, + "loss": 1.4359, + "step": 1974 + }, + { + "epoch": 0.0988, + "grad_norm": 2.853422164916992, + "learning_rate": 1.976e-05, + "loss": 1.4441, + "step": 1976 + }, + { + "epoch": 0.0989, + "grad_norm": 6.324268341064453, + "learning_rate": 1.978e-05, + "loss": 1.2338, + "step": 1978 + }, + { + "epoch": 0.099, + "grad_norm": 4.778919696807861, + "learning_rate": 1.98e-05, + "loss": 1.5342, + "step": 1980 + }, + { + "epoch": 0.0991, + "grad_norm": 5.240677833557129, + "learning_rate": 1.982e-05, + "loss": 1.1527, + "step": 1982 + }, + { + "epoch": 0.0992, + "grad_norm": 5.845341205596924, + "learning_rate": 1.9840000000000003e-05, + "loss": 0.7062, + "step": 1984 + }, + { + "epoch": 0.0993, + "grad_norm": 5.077467441558838, + "learning_rate": 1.9860000000000003e-05, + "loss": 0.9388, + "step": 1986 + }, + { + "epoch": 0.0994, + "grad_norm": 6.66290283203125, + "learning_rate": 1.9880000000000003e-05, + "loss": 0.988, + "step": 1988 + }, + { + "epoch": 0.0995, + "grad_norm": 6.298457145690918, + "learning_rate": 1.9900000000000003e-05, + "loss": 1.0309, + "step": 1990 + }, + { + "epoch": 0.0996, + "grad_norm": 4.712209224700928, + "learning_rate": 1.9920000000000002e-05, + "loss": 1.342, + "step": 1992 + }, + { + "epoch": 0.0997, + "grad_norm": 10.792159080505371, + "learning_rate": 1.9940000000000002e-05, + "loss": 1.1619, + "step": 1994 + }, + { + "epoch": 0.0998, + "grad_norm": 2.6689975261688232, + "learning_rate": 1.9960000000000002e-05, + "loss": 0.9886, + "step": 1996 + }, + { + "epoch": 0.0999, + "grad_norm": 3.061424732208252, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.976, + "step": 1998 + }, + { + "epoch": 0.1, + "grad_norm": 12.090456008911133, + "learning_rate": 2e-05, + "loss": 1.3007, + "step": 2000 + }, + { + "epoch": 0.1001, + "grad_norm": 3.7869791984558105, + "learning_rate": 1.9999999390765168e-05, + "loss": 0.7142, + "step": 2002 + }, + { + "epoch": 0.1002, + "grad_norm": 3.7019472122192383, + "learning_rate": 1.9999997563060744e-05, + "loss": 0.497, + "step": 2004 + }, + { + "epoch": 0.1003, + "grad_norm": 4.150712013244629, + "learning_rate": 1.9999994516886947e-05, + "loss": 1.2129, + "step": 2006 + }, + { + "epoch": 0.1004, + "grad_norm": 6.999486446380615, + "learning_rate": 1.9999990252244153e-05, + "loss": 1.0957, + "step": 2008 + }, + { + "epoch": 0.1005, + "grad_norm": 6.053045272827148, + "learning_rate": 1.999998476913288e-05, + "loss": 0.9829, + "step": 2010 + }, + { + "epoch": 0.1006, + "grad_norm": 4.081940650939941, + "learning_rate": 1.9999978067553796e-05, + "loss": 1.1124, + "step": 2012 + }, + { + "epoch": 0.1007, + "grad_norm": 2.9579901695251465, + "learning_rate": 1.9999970147507714e-05, + "loss": 0.4165, + "step": 2014 + }, + { + "epoch": 0.1008, + "grad_norm": 1.9643069505691528, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.556, + "step": 2016 + }, + { + "epoch": 0.1009, + "grad_norm": 11.944250106811523, + "learning_rate": 1.9999950652018585e-05, + "loss": 1.2728, + "step": 2018 + }, + { + "epoch": 0.101, + "grad_norm": 15.715225219726562, + "learning_rate": 1.9999939076577906e-05, + "loss": 1.6404, + "step": 2020 + }, + { + "epoch": 0.1011, + "grad_norm": 18.668657302856445, + "learning_rate": 1.9999926282674985e-05, + "loss": 1.9278, + "step": 2022 + }, + { + "epoch": 0.1012, + "grad_norm": 10.032806396484375, + "learning_rate": 1.9999912270311376e-05, + "loss": 0.5904, + "step": 2024 + }, + { + "epoch": 0.1013, + "grad_norm": 3.2620043754577637, + "learning_rate": 1.9999897039488794e-05, + "loss": 0.398, + "step": 2026 + }, + { + "epoch": 0.1014, + "grad_norm": 4.808749675750732, + "learning_rate": 1.999988059020909e-05, + "loss": 1.5725, + "step": 2028 + }, + { + "epoch": 0.1015, + "grad_norm": 7.241910934448242, + "learning_rate": 1.999986292247427e-05, + "loss": 1.8173, + "step": 2030 + }, + { + "epoch": 0.1016, + "grad_norm": 7.789743900299072, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.5972, + "step": 2032 + }, + { + "epoch": 0.1017, + "grad_norm": 6.311699867248535, + "learning_rate": 1.9999823931648036e-05, + "loss": 0.8872, + "step": 2034 + }, + { + "epoch": 0.1018, + "grad_norm": 2.9201772212982178, + "learning_rate": 1.999980260856137e-05, + "loss": 0.6031, + "step": 2036 + }, + { + "epoch": 0.1019, + "grad_norm": 5.625965118408203, + "learning_rate": 1.9999780067029095e-05, + "loss": 0.7215, + "step": 2038 + }, + { + "epoch": 0.102, + "grad_norm": 7.119924068450928, + "learning_rate": 1.9999756307053947e-05, + "loss": 1.4238, + "step": 2040 + }, + { + "epoch": 0.1021, + "grad_norm": 12.68378734588623, + "learning_rate": 1.9999731328638828e-05, + "loss": 1.0629, + "step": 2042 + }, + { + "epoch": 0.1022, + "grad_norm": 2.787313938140869, + "learning_rate": 1.999970513178678e-05, + "loss": 0.6362, + "step": 2044 + }, + { + "epoch": 0.1023, + "grad_norm": 7.80107307434082, + "learning_rate": 1.9999677716500994e-05, + "loss": 1.3612, + "step": 2046 + }, + { + "epoch": 0.1024, + "grad_norm": 3.896831750869751, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.8396, + "step": 2048 + }, + { + "epoch": 0.1025, + "grad_norm": 8.54690933227539, + "learning_rate": 1.9999619230641714e-05, + "loss": 1.4529, + "step": 2050 + }, + { + "epoch": 0.1026, + "grad_norm": 2.662302017211914, + "learning_rate": 1.999958816007535e-05, + "loss": 1.0891, + "step": 2052 + }, + { + "epoch": 0.1027, + "grad_norm": 5.859204292297363, + "learning_rate": 1.99995558710895e-05, + "loss": 1.2567, + "step": 2054 + }, + { + "epoch": 0.1028, + "grad_norm": 19.12455177307129, + "learning_rate": 1.99995223636881e-05, + "loss": 2.1866, + "step": 2056 + }, + { + "epoch": 0.1029, + "grad_norm": 6.770497798919678, + "learning_rate": 1.999948763787523e-05, + "loss": 0.872, + "step": 2058 + }, + { + "epoch": 0.103, + "grad_norm": 5.640937328338623, + "learning_rate": 1.9999451693655125e-05, + "loss": 1.2069, + "step": 2060 + }, + { + "epoch": 0.1031, + "grad_norm": 6.5136637687683105, + "learning_rate": 1.999941453103216e-05, + "loss": 0.7334, + "step": 2062 + }, + { + "epoch": 0.1032, + "grad_norm": 4.797397613525391, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.7819, + "step": 2064 + }, + { + "epoch": 0.1033, + "grad_norm": 3.6518616676330566, + "learning_rate": 1.999933655059592e-05, + "loss": 1.3179, + "step": 2066 + }, + { + "epoch": 0.1034, + "grad_norm": 0.4198700189590454, + "learning_rate": 1.9999295732792146e-05, + "loss": 0.4322, + "step": 2068 + }, + { + "epoch": 0.1035, + "grad_norm": 2.6212356090545654, + "learning_rate": 1.9999253696604522e-05, + "loss": 0.9648, + "step": 2070 + }, + { + "epoch": 0.1036, + "grad_norm": 4.474184036254883, + "learning_rate": 1.9999210442038164e-05, + "loss": 1.4586, + "step": 2072 + }, + { + "epoch": 0.1037, + "grad_norm": 22.656841278076172, + "learning_rate": 1.9999165969098344e-05, + "loss": 1.2245, + "step": 2074 + }, + { + "epoch": 0.1038, + "grad_norm": 7.073273658752441, + "learning_rate": 1.9999120277790477e-05, + "loss": 1.7835, + "step": 2076 + }, + { + "epoch": 0.1039, + "grad_norm": 12.887654304504395, + "learning_rate": 1.9999073368120142e-05, + "loss": 0.9149, + "step": 2078 + }, + { + "epoch": 0.104, + "grad_norm": 8.234147071838379, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.6979, + "step": 2080 + }, + { + "epoch": 0.1041, + "grad_norm": 6.819356918334961, + "learning_rate": 1.999897589371505e-05, + "loss": 0.4731, + "step": 2082 + }, + { + "epoch": 0.1042, + "grad_norm": 5.0821614265441895, + "learning_rate": 1.9998925328992175e-05, + "loss": 1.4853, + "step": 2084 + }, + { + "epoch": 0.1043, + "grad_norm": 9.256534576416016, + "learning_rate": 1.999887354593058e-05, + "loss": 0.5396, + "step": 2086 + }, + { + "epoch": 0.1044, + "grad_norm": 18.78339385986328, + "learning_rate": 1.999882054453657e-05, + "loss": 1.9107, + "step": 2088 + }, + { + "epoch": 0.1045, + "grad_norm": 3.0221211910247803, + "learning_rate": 1.9998766324816606e-05, + "loss": 1.1736, + "step": 2090 + }, + { + "epoch": 0.1046, + "grad_norm": 2.4923462867736816, + "learning_rate": 1.9998710886777298e-05, + "loss": 0.8533, + "step": 2092 + }, + { + "epoch": 0.1047, + "grad_norm": 3.2599658966064453, + "learning_rate": 1.9998654230425396e-05, + "loss": 0.7558, + "step": 2094 + }, + { + "epoch": 0.1048, + "grad_norm": 11.037864685058594, + "learning_rate": 1.9998596355767805e-05, + "loss": 1.0417, + "step": 2096 + }, + { + "epoch": 0.1049, + "grad_norm": 4.306044578552246, + "learning_rate": 1.999853726281158e-05, + "loss": 0.6919, + "step": 2098 + }, + { + "epoch": 0.105, + "grad_norm": 14.13884162902832, + "learning_rate": 1.9998476951563914e-05, + "loss": 1.2985, + "step": 2100 + }, + { + "epoch": 0.1051, + "grad_norm": 4.635307788848877, + "learning_rate": 1.9998415422032163e-05, + "loss": 0.9096, + "step": 2102 + }, + { + "epoch": 0.1052, + "grad_norm": 3.4774439334869385, + "learning_rate": 1.9998352674223816e-05, + "loss": 0.496, + "step": 2104 + }, + { + "epoch": 0.1053, + "grad_norm": 9.526859283447266, + "learning_rate": 1.999828870814653e-05, + "loss": 1.193, + "step": 2106 + }, + { + "epoch": 0.1054, + "grad_norm": 11.767050743103027, + "learning_rate": 1.9998223523808092e-05, + "loss": 1.5524, + "step": 2108 + }, + { + "epoch": 0.1055, + "grad_norm": 3.2314388751983643, + "learning_rate": 1.9998157121216442e-05, + "loss": 0.7628, + "step": 2110 + }, + { + "epoch": 0.1056, + "grad_norm": 4.044699668884277, + "learning_rate": 1.999808950037968e-05, + "loss": 0.9606, + "step": 2112 + }, + { + "epoch": 0.1057, + "grad_norm": 7.136442184448242, + "learning_rate": 1.9998020661306037e-05, + "loss": 1.4872, + "step": 2114 + }, + { + "epoch": 0.1058, + "grad_norm": 7.173998832702637, + "learning_rate": 1.99979506040039e-05, + "loss": 1.2486, + "step": 2116 + }, + { + "epoch": 0.1059, + "grad_norm": 10.983241081237793, + "learning_rate": 1.9997879328481816e-05, + "loss": 0.9657, + "step": 2118 + }, + { + "epoch": 0.106, + "grad_norm": 4.274838447570801, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.6107, + "step": 2120 + }, + { + "epoch": 0.1061, + "grad_norm": 9.501237869262695, + "learning_rate": 1.9997733122812663e-05, + "loss": 1.5223, + "step": 2122 + }, + { + "epoch": 0.1062, + "grad_norm": 5.152162551879883, + "learning_rate": 1.9997658192683412e-05, + "loss": 0.9617, + "step": 2124 + }, + { + "epoch": 0.1063, + "grad_norm": 3.3322579860687256, + "learning_rate": 1.9997582044369843e-05, + "loss": 0.5272, + "step": 2126 + }, + { + "epoch": 0.1064, + "grad_norm": 9.916544914245605, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.7575, + "step": 2128 + }, + { + "epoch": 0.1065, + "grad_norm": 3.7730565071105957, + "learning_rate": 1.9997426093226984e-05, + "loss": 1.0189, + "step": 2130 + }, + { + "epoch": 0.1066, + "grad_norm": 7.0244975090026855, + "learning_rate": 1.9997346290416703e-05, + "loss": 0.7603, + "step": 2132 + }, + { + "epoch": 0.1067, + "grad_norm": 14.411415100097656, + "learning_rate": 1.99972652694601e-05, + "loss": 1.5732, + "step": 2134 + }, + { + "epoch": 0.1068, + "grad_norm": 7.789346218109131, + "learning_rate": 1.999718303036705e-05, + "loss": 1.9374, + "step": 2136 + }, + { + "epoch": 0.1069, + "grad_norm": 8.24602222442627, + "learning_rate": 1.999709957314757e-05, + "loss": 0.8693, + "step": 2138 + }, + { + "epoch": 0.107, + "grad_norm": 2.801384449005127, + "learning_rate": 1.9997014897811834e-05, + "loss": 0.5823, + "step": 2140 + }, + { + "epoch": 0.1071, + "grad_norm": 6.298037528991699, + "learning_rate": 1.9996929004370152e-05, + "loss": 1.2315, + "step": 2142 + }, + { + "epoch": 0.1072, + "grad_norm": 5.116518497467041, + "learning_rate": 1.9996841892833e-05, + "loss": 1.6566, + "step": 2144 + }, + { + "epoch": 0.1073, + "grad_norm": 11.430571556091309, + "learning_rate": 1.9996753563210987e-05, + "loss": 0.5747, + "step": 2146 + }, + { + "epoch": 0.1074, + "grad_norm": 4.066583633422852, + "learning_rate": 1.999666401551487e-05, + "loss": 0.68, + "step": 2148 + }, + { + "epoch": 0.1075, + "grad_norm": 4.550212860107422, + "learning_rate": 1.9996573249755573e-05, + "loss": 0.7578, + "step": 2150 + }, + { + "epoch": 0.1076, + "grad_norm": 1.9346401691436768, + "learning_rate": 1.9996481265944146e-05, + "loss": 1.2955, + "step": 2152 + }, + { + "epoch": 0.1077, + "grad_norm": 9.615238189697266, + "learning_rate": 1.99963880640918e-05, + "loss": 1.6144, + "step": 2154 + }, + { + "epoch": 0.1078, + "grad_norm": 8.151801109313965, + "learning_rate": 1.9996293644209886e-05, + "loss": 1.2779, + "step": 2156 + }, + { + "epoch": 0.1079, + "grad_norm": 3.9254262447357178, + "learning_rate": 1.999619800630992e-05, + "loss": 1.0618, + "step": 2158 + }, + { + "epoch": 0.108, + "grad_norm": 3.1904208660125732, + "learning_rate": 1.9996101150403543e-05, + "loss": 1.4361, + "step": 2160 + }, + { + "epoch": 0.1081, + "grad_norm": 6.321918487548828, + "learning_rate": 1.9996003076502567e-05, + "loss": 1.7042, + "step": 2162 + }, + { + "epoch": 0.1082, + "grad_norm": 1.9248223304748535, + "learning_rate": 1.9995903784618936e-05, + "loss": 0.9029, + "step": 2164 + }, + { + "epoch": 0.1083, + "grad_norm": 6.264876842498779, + "learning_rate": 1.999580327476475e-05, + "loss": 1.349, + "step": 2166 + }, + { + "epoch": 0.1084, + "grad_norm": 20.350175857543945, + "learning_rate": 1.9995701546952252e-05, + "loss": 2.1448, + "step": 2168 + }, + { + "epoch": 0.1085, + "grad_norm": 3.654580593109131, + "learning_rate": 1.9995598601193842e-05, + "loss": 1.0403, + "step": 2170 + }, + { + "epoch": 0.1086, + "grad_norm": 4.36512565612793, + "learning_rate": 1.9995494437502064e-05, + "loss": 0.502, + "step": 2172 + }, + { + "epoch": 0.1087, + "grad_norm": 3.0719449520111084, + "learning_rate": 1.9995389055889607e-05, + "loss": 0.8388, + "step": 2174 + }, + { + "epoch": 0.1088, + "grad_norm": 10.096999168395996, + "learning_rate": 1.9995282456369313e-05, + "loss": 1.4743, + "step": 2176 + }, + { + "epoch": 0.1089, + "grad_norm": 6.288313865661621, + "learning_rate": 1.9995174638954167e-05, + "loss": 1.3979, + "step": 2178 + }, + { + "epoch": 0.109, + "grad_norm": 7.15155029296875, + "learning_rate": 1.9995065603657317e-05, + "loss": 0.7202, + "step": 2180 + }, + { + "epoch": 0.1091, + "grad_norm": 7.592592716217041, + "learning_rate": 1.9994955350492036e-05, + "loss": 1.4646, + "step": 2182 + }, + { + "epoch": 0.1092, + "grad_norm": 6.096820831298828, + "learning_rate": 1.999484387947177e-05, + "loss": 0.8945, + "step": 2184 + }, + { + "epoch": 0.1093, + "grad_norm": 3.718393325805664, + "learning_rate": 1.999473119061009e-05, + "loss": 1.2908, + "step": 2186 + }, + { + "epoch": 0.1094, + "grad_norm": 4.950618743896484, + "learning_rate": 1.999461728392073e-05, + "loss": 0.8924, + "step": 2188 + }, + { + "epoch": 0.1095, + "grad_norm": 4.613070011138916, + "learning_rate": 1.9994502159417576e-05, + "loss": 0.6977, + "step": 2190 + }, + { + "epoch": 0.1096, + "grad_norm": 4.979231834411621, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.247, + "step": 2192 + }, + { + "epoch": 0.1097, + "grad_norm": 3.4652702808380127, + "learning_rate": 1.999426825702612e-05, + "loss": 1.2554, + "step": 2194 + }, + { + "epoch": 0.1098, + "grad_norm": 5.049636363983154, + "learning_rate": 1.9994149479166324e-05, + "loss": 1.1534, + "step": 2196 + }, + { + "epoch": 0.1099, + "grad_norm": 9.367578506469727, + "learning_rate": 1.9994029483549732e-05, + "loss": 1.4041, + "step": 2198 + }, + { + "epoch": 0.11, + "grad_norm": 5.547827243804932, + "learning_rate": 1.999390827019096e-05, + "loss": 0.9457, + "step": 2200 + }, + { + "epoch": 0.1101, + "grad_norm": 9.430879592895508, + "learning_rate": 1.999378583910478e-05, + "loss": 0.5682, + "step": 2202 + }, + { + "epoch": 0.1102, + "grad_norm": 7.12925386428833, + "learning_rate": 1.999366219030611e-05, + "loss": 0.7065, + "step": 2204 + }, + { + "epoch": 0.1103, + "grad_norm": 5.204456806182861, + "learning_rate": 1.9993537323810015e-05, + "loss": 1.4027, + "step": 2206 + }, + { + "epoch": 0.1104, + "grad_norm": 9.55704116821289, + "learning_rate": 1.9993411239631713e-05, + "loss": 1.2171, + "step": 2208 + }, + { + "epoch": 0.1105, + "grad_norm": 3.1763198375701904, + "learning_rate": 1.9993283937786562e-05, + "loss": 1.5133, + "step": 2210 + }, + { + "epoch": 0.1106, + "grad_norm": 12.832616806030273, + "learning_rate": 1.999315541829008e-05, + "loss": 0.6186, + "step": 2212 + }, + { + "epoch": 0.1107, + "grad_norm": 2.162348508834839, + "learning_rate": 1.999302568115792e-05, + "loss": 0.8644, + "step": 2214 + }, + { + "epoch": 0.1108, + "grad_norm": 5.43632173538208, + "learning_rate": 1.9992894726405894e-05, + "loss": 1.0432, + "step": 2216 + }, + { + "epoch": 0.1109, + "grad_norm": 9.1400728225708, + "learning_rate": 1.9992762554049955e-05, + "loss": 0.9307, + "step": 2218 + }, + { + "epoch": 0.111, + "grad_norm": 4.30924654006958, + "learning_rate": 1.999262916410621e-05, + "loss": 1.7031, + "step": 2220 + }, + { + "epoch": 0.1111, + "grad_norm": 3.452582597732544, + "learning_rate": 1.999249455659092e-05, + "loss": 1.2246, + "step": 2222 + }, + { + "epoch": 0.1112, + "grad_norm": 5.672556400299072, + "learning_rate": 1.999235873152047e-05, + "loss": 0.8674, + "step": 2224 + }, + { + "epoch": 0.1113, + "grad_norm": 6.418492794036865, + "learning_rate": 1.999222168891142e-05, + "loss": 1.267, + "step": 2226 + }, + { + "epoch": 0.1114, + "grad_norm": 3.612499952316284, + "learning_rate": 1.999208342878047e-05, + "loss": 0.604, + "step": 2228 + }, + { + "epoch": 0.1115, + "grad_norm": 4.543426513671875, + "learning_rate": 1.9991943951144462e-05, + "loss": 0.6551, + "step": 2230 + }, + { + "epoch": 0.1116, + "grad_norm": 7.4946746826171875, + "learning_rate": 1.9991803256020393e-05, + "loss": 1.1631, + "step": 2232 + }, + { + "epoch": 0.1117, + "grad_norm": 2.8215079307556152, + "learning_rate": 1.9991661343425402e-05, + "loss": 1.3479, + "step": 2234 + }, + { + "epoch": 0.1118, + "grad_norm": 6.819797992706299, + "learning_rate": 1.9991518213376787e-05, + "loss": 1.6273, + "step": 2236 + }, + { + "epoch": 0.1119, + "grad_norm": 3.3870904445648193, + "learning_rate": 1.9991373865891986e-05, + "loss": 0.4838, + "step": 2238 + }, + { + "epoch": 0.112, + "grad_norm": 5.165288925170898, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.7079, + "step": 2240 + }, + { + "epoch": 0.1121, + "grad_norm": 5.924412727355957, + "learning_rate": 1.9991081518684322e-05, + "loss": 1.0628, + "step": 2242 + }, + { + "epoch": 0.1122, + "grad_norm": 4.116950511932373, + "learning_rate": 1.9990933518997086e-05, + "loss": 0.4376, + "step": 2244 + }, + { + "epoch": 0.1123, + "grad_norm": 0.1340176910161972, + "learning_rate": 1.9990784301944902e-05, + "loss": 0.4798, + "step": 2246 + }, + { + "epoch": 0.1124, + "grad_norm": 3.598942279815674, + "learning_rate": 1.9990633867545956e-05, + "loss": 0.9378, + "step": 2248 + }, + { + "epoch": 0.1125, + "grad_norm": 10.040220260620117, + "learning_rate": 1.999048221581858e-05, + "loss": 1.2937, + "step": 2250 + }, + { + "epoch": 0.1126, + "grad_norm": 1.1184306144714355, + "learning_rate": 1.999032934678125e-05, + "loss": 0.216, + "step": 2252 + }, + { + "epoch": 0.1127, + "grad_norm": 4.564691066741943, + "learning_rate": 1.999017526045259e-05, + "loss": 1.0199, + "step": 2254 + }, + { + "epoch": 0.1128, + "grad_norm": 7.264318943023682, + "learning_rate": 1.9990019956851384e-05, + "loss": 1.6547, + "step": 2256 + }, + { + "epoch": 0.1129, + "grad_norm": 3.8167576789855957, + "learning_rate": 1.9989863435996544e-05, + "loss": 1.0003, + "step": 2258 + }, + { + "epoch": 0.113, + "grad_norm": 6.685431957244873, + "learning_rate": 1.998970569790715e-05, + "loss": 1.1852, + "step": 2260 + }, + { + "epoch": 0.1131, + "grad_norm": 4.301551342010498, + "learning_rate": 1.9989546742602416e-05, + "loss": 0.6687, + "step": 2262 + }, + { + "epoch": 0.1132, + "grad_norm": 5.205286979675293, + "learning_rate": 1.9989386570101716e-05, + "loss": 0.5876, + "step": 2264 + }, + { + "epoch": 0.1133, + "grad_norm": 2.6659202575683594, + "learning_rate": 1.998922518042456e-05, + "loss": 1.4555, + "step": 2266 + }, + { + "epoch": 0.1134, + "grad_norm": 27.37856674194336, + "learning_rate": 1.9989062573590618e-05, + "loss": 3.7068, + "step": 2268 + }, + { + "epoch": 0.1135, + "grad_norm": 4.893657207489014, + "learning_rate": 1.9988898749619702e-05, + "loss": 1.7218, + "step": 2270 + }, + { + "epoch": 0.1136, + "grad_norm": 1.0645118951797485, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.5466, + "step": 2272 + }, + { + "epoch": 0.1137, + "grad_norm": 10.216225624084473, + "learning_rate": 1.9988567450346937e-05, + "loss": 0.2358, + "step": 2274 + }, + { + "epoch": 0.1138, + "grad_norm": 3.7788190841674805, + "learning_rate": 1.998839997508546e-05, + "loss": 1.0685, + "step": 2276 + }, + { + "epoch": 0.1139, + "grad_norm": 7.114013195037842, + "learning_rate": 1.9988231282767744e-05, + "loss": 1.5347, + "step": 2278 + }, + { + "epoch": 0.114, + "grad_norm": 30.247486114501953, + "learning_rate": 1.9988061373414342e-05, + "loss": 2.7097, + "step": 2280 + }, + { + "epoch": 0.1141, + "grad_norm": 8.696284294128418, + "learning_rate": 1.998789024704596e-05, + "loss": 1.0593, + "step": 2282 + }, + { + "epoch": 0.1142, + "grad_norm": 0.5502188205718994, + "learning_rate": 1.9987717903683447e-05, + "loss": 0.7877, + "step": 2284 + }, + { + "epoch": 0.1143, + "grad_norm": 4.511127471923828, + "learning_rate": 1.9987544343347802e-05, + "loss": 1.4707, + "step": 2286 + }, + { + "epoch": 0.1144, + "grad_norm": 6.812150478363037, + "learning_rate": 1.998736956606018e-05, + "loss": 0.2066, + "step": 2288 + }, + { + "epoch": 0.1145, + "grad_norm": 7.071362018585205, + "learning_rate": 1.9987193571841865e-05, + "loss": 0.9834, + "step": 2290 + }, + { + "epoch": 0.1146, + "grad_norm": 9.933026313781738, + "learning_rate": 1.9987016360714307e-05, + "loss": 0.7288, + "step": 2292 + }, + { + "epoch": 0.1147, + "grad_norm": 4.7865495681762695, + "learning_rate": 1.9986837932699103e-05, + "loss": 1.2751, + "step": 2294 + }, + { + "epoch": 0.1148, + "grad_norm": 7.016463756561279, + "learning_rate": 1.998665828781799e-05, + "loss": 1.3927, + "step": 2296 + }, + { + "epoch": 0.1149, + "grad_norm": 3.5040836334228516, + "learning_rate": 1.9986477426092856e-05, + "loss": 1.4199, + "step": 2298 + }, + { + "epoch": 0.115, + "grad_norm": 2.956392765045166, + "learning_rate": 1.9986295347545738e-05, + "loss": 1.1088, + "step": 2300 + }, + { + "epoch": 0.1151, + "grad_norm": 3.5350821018218994, + "learning_rate": 1.998611205219883e-05, + "loss": 1.1018, + "step": 2302 + }, + { + "epoch": 0.1152, + "grad_norm": 3.199827194213867, + "learning_rate": 1.9985927540074453e-05, + "loss": 1.0503, + "step": 2304 + }, + { + "epoch": 0.1153, + "grad_norm": 3.526761531829834, + "learning_rate": 1.9985741811195098e-05, + "loss": 0.9056, + "step": 2306 + }, + { + "epoch": 0.1154, + "grad_norm": 6.841362476348877, + "learning_rate": 1.9985554865583394e-05, + "loss": 0.693, + "step": 2308 + }, + { + "epoch": 0.1155, + "grad_norm": 5.575036525726318, + "learning_rate": 1.998536670326212e-05, + "loss": 0.9589, + "step": 2310 + }, + { + "epoch": 0.1156, + "grad_norm": 5.7862958908081055, + "learning_rate": 1.99851773242542e-05, + "loss": 1.2854, + "step": 2312 + }, + { + "epoch": 0.1157, + "grad_norm": 8.18912124633789, + "learning_rate": 1.9984986728582712e-05, + "loss": 1.3922, + "step": 2314 + }, + { + "epoch": 0.1158, + "grad_norm": 4.412510395050049, + "learning_rate": 1.9984794916270876e-05, + "loss": 1.2915, + "step": 2316 + }, + { + "epoch": 0.1159, + "grad_norm": 5.349951267242432, + "learning_rate": 1.998460188734207e-05, + "loss": 0.8255, + "step": 2318 + }, + { + "epoch": 0.116, + "grad_norm": 3.297067403793335, + "learning_rate": 1.9984407641819812e-05, + "loss": 1.2304, + "step": 2320 + }, + { + "epoch": 0.1161, + "grad_norm": 4.224824905395508, + "learning_rate": 1.9984212179727768e-05, + "loss": 1.5923, + "step": 2322 + }, + { + "epoch": 0.1162, + "grad_norm": 2.8163981437683105, + "learning_rate": 1.998401550108975e-05, + "loss": 0.8195, + "step": 2324 + }, + { + "epoch": 0.1163, + "grad_norm": 4.195988178253174, + "learning_rate": 1.9983817605929735e-05, + "loss": 1.383, + "step": 2326 + }, + { + "epoch": 0.1164, + "grad_norm": 11.000553131103516, + "learning_rate": 1.9983618494271825e-05, + "loss": 1.3703, + "step": 2328 + }, + { + "epoch": 0.1165, + "grad_norm": 4.019565582275391, + "learning_rate": 1.9983418166140286e-05, + "loss": 0.9539, + "step": 2330 + }, + { + "epoch": 0.1166, + "grad_norm": 4.0825090408325195, + "learning_rate": 1.9983216621559525e-05, + "loss": 1.4436, + "step": 2332 + }, + { + "epoch": 0.1167, + "grad_norm": 3.6038551330566406, + "learning_rate": 1.99830138605541e-05, + "loss": 0.7867, + "step": 2334 + }, + { + "epoch": 0.1168, + "grad_norm": 9.209206581115723, + "learning_rate": 1.998280988314872e-05, + "loss": 1.3069, + "step": 2336 + }, + { + "epoch": 0.1169, + "grad_norm": 4.7387375831604, + "learning_rate": 1.998260468936824e-05, + "loss": 0.8243, + "step": 2338 + }, + { + "epoch": 0.117, + "grad_norm": 3.7702078819274902, + "learning_rate": 1.9982398279237657e-05, + "loss": 0.8429, + "step": 2340 + }, + { + "epoch": 0.1171, + "grad_norm": 4.91786527633667, + "learning_rate": 1.9982190652782122e-05, + "loss": 0.7127, + "step": 2342 + }, + { + "epoch": 0.1172, + "grad_norm": 5.498336315155029, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.5711, + "step": 2344 + }, + { + "epoch": 0.1173, + "grad_norm": 10.579774856567383, + "learning_rate": 1.998177175099754e-05, + "loss": 0.9192, + "step": 2346 + }, + { + "epoch": 0.1174, + "grad_norm": 6.414841651916504, + "learning_rate": 1.998156047571954e-05, + "loss": 1.0564, + "step": 2348 + }, + { + "epoch": 0.1175, + "grad_norm": 2.970548152923584, + "learning_rate": 1.998134798421867e-05, + "loss": 0.2683, + "step": 2350 + }, + { + "epoch": 0.1176, + "grad_norm": 9.301301956176758, + "learning_rate": 1.9981134276520828e-05, + "loss": 1.2211, + "step": 2352 + }, + { + "epoch": 0.1177, + "grad_norm": 5.71303129196167, + "learning_rate": 1.998091935265205e-05, + "loss": 1.4333, + "step": 2354 + }, + { + "epoch": 0.1178, + "grad_norm": 6.043229103088379, + "learning_rate": 1.9980703212638522e-05, + "loss": 1.2387, + "step": 2356 + }, + { + "epoch": 0.1179, + "grad_norm": 12.5165376663208, + "learning_rate": 1.9980485856506582e-05, + "loss": 0.8061, + "step": 2358 + }, + { + "epoch": 0.118, + "grad_norm": 5.107753753662109, + "learning_rate": 1.9980267284282718e-05, + "loss": 1.2424, + "step": 2360 + }, + { + "epoch": 0.1181, + "grad_norm": 4.857999324798584, + "learning_rate": 1.9980047495993556e-05, + "loss": 0.9198, + "step": 2362 + }, + { + "epoch": 0.1182, + "grad_norm": 0.6260406970977783, + "learning_rate": 1.997982649166588e-05, + "loss": 0.5893, + "step": 2364 + }, + { + "epoch": 0.1183, + "grad_norm": 13.26055908203125, + "learning_rate": 1.9979604271326617e-05, + "loss": 0.9716, + "step": 2366 + }, + { + "epoch": 0.1184, + "grad_norm": 6.819946765899658, + "learning_rate": 1.9979380835002846e-05, + "loss": 1.0027, + "step": 2368 + }, + { + "epoch": 0.1185, + "grad_norm": 7.554774761199951, + "learning_rate": 1.997915618272179e-05, + "loss": 1.0515, + "step": 2370 + }, + { + "epoch": 0.1186, + "grad_norm": 20.577774047851562, + "learning_rate": 1.9978930314510826e-05, + "loss": 1.0807, + "step": 2372 + }, + { + "epoch": 0.1187, + "grad_norm": 5.905737400054932, + "learning_rate": 1.997870323039747e-05, + "loss": 1.7286, + "step": 2374 + }, + { + "epoch": 0.1188, + "grad_norm": 10.300658226013184, + "learning_rate": 1.9978474930409396e-05, + "loss": 1.6344, + "step": 2376 + }, + { + "epoch": 0.1189, + "grad_norm": 5.56805944442749, + "learning_rate": 1.997824541457442e-05, + "loss": 1.176, + "step": 2378 + }, + { + "epoch": 0.119, + "grad_norm": 6.114874839782715, + "learning_rate": 1.9978014682920503e-05, + "loss": 1.0216, + "step": 2380 + }, + { + "epoch": 0.1191, + "grad_norm": 6.011414051055908, + "learning_rate": 1.9977782735475765e-05, + "loss": 1.328, + "step": 2382 + }, + { + "epoch": 0.1192, + "grad_norm": 17.0596866607666, + "learning_rate": 1.997754957226847e-05, + "loss": 1.562, + "step": 2384 + }, + { + "epoch": 0.1193, + "grad_norm": 11.73183536529541, + "learning_rate": 1.9977315193327017e-05, + "loss": 0.989, + "step": 2386 + }, + { + "epoch": 0.1194, + "grad_norm": 8.757466316223145, + "learning_rate": 1.9977079598679978e-05, + "loss": 2.0697, + "step": 2388 + }, + { + "epoch": 0.1195, + "grad_norm": 7.243011474609375, + "learning_rate": 1.9976842788356054e-05, + "loss": 0.45, + "step": 2390 + }, + { + "epoch": 0.1196, + "grad_norm": 3.886674404144287, + "learning_rate": 1.99766047623841e-05, + "loss": 1.9106, + "step": 2392 + }, + { + "epoch": 0.1197, + "grad_norm": 7.188863754272461, + "learning_rate": 1.9976365520793114e-05, + "loss": 1.4716, + "step": 2394 + }, + { + "epoch": 0.1198, + "grad_norm": 5.046743392944336, + "learning_rate": 1.9976125063612254e-05, + "loss": 1.3002, + "step": 2396 + }, + { + "epoch": 0.1199, + "grad_norm": 8.908957481384277, + "learning_rate": 1.9975883390870817e-05, + "loss": 1.3472, + "step": 2398 + }, + { + "epoch": 0.12, + "grad_norm": 4.285129070281982, + "learning_rate": 1.9975640502598243e-05, + "loss": 0.8107, + "step": 2400 + }, + { + "epoch": 0.1201, + "grad_norm": 5.524387836456299, + "learning_rate": 1.997539639882414e-05, + "loss": 2.6343, + "step": 2402 + }, + { + "epoch": 0.1202, + "grad_norm": 3.7614943981170654, + "learning_rate": 1.9975151079578238e-05, + "loss": 1.5212, + "step": 2404 + }, + { + "epoch": 0.1203, + "grad_norm": 3.294126510620117, + "learning_rate": 1.997490454489044e-05, + "loss": 0.7553, + "step": 2406 + }, + { + "epoch": 0.1204, + "grad_norm": 2.893786668777466, + "learning_rate": 1.9974656794790777e-05, + "loss": 1.0615, + "step": 2408 + }, + { + "epoch": 0.1205, + "grad_norm": 7.447188854217529, + "learning_rate": 1.9974407829309442e-05, + "loss": 1.1466, + "step": 2410 + }, + { + "epoch": 0.1206, + "grad_norm": 3.53955078125, + "learning_rate": 1.9974157648476768e-05, + "loss": 0.9637, + "step": 2412 + }, + { + "epoch": 0.1207, + "grad_norm": 3.2533676624298096, + "learning_rate": 1.997390625232324e-05, + "loss": 0.7541, + "step": 2414 + }, + { + "epoch": 0.1208, + "grad_norm": 4.699741363525391, + "learning_rate": 1.9973653640879486e-05, + "loss": 1.2472, + "step": 2416 + }, + { + "epoch": 0.1209, + "grad_norm": 7.104421615600586, + "learning_rate": 1.9973399814176293e-05, + "loss": 0.9721, + "step": 2418 + }, + { + "epoch": 0.121, + "grad_norm": 5.14581823348999, + "learning_rate": 1.997314477224458e-05, + "loss": 1.3716, + "step": 2420 + }, + { + "epoch": 0.1211, + "grad_norm": 3.172131299972534, + "learning_rate": 1.9972888515115433e-05, + "loss": 1.1134, + "step": 2422 + }, + { + "epoch": 0.1212, + "grad_norm": 3.1627140045166016, + "learning_rate": 1.997263104282007e-05, + "loss": 1.2183, + "step": 2424 + }, + { + "epoch": 0.1213, + "grad_norm": 5.582789421081543, + "learning_rate": 1.997237235538987e-05, + "loss": 1.6806, + "step": 2426 + }, + { + "epoch": 0.1214, + "grad_norm": 2.7907042503356934, + "learning_rate": 1.997211245285634e-05, + "loss": 0.3098, + "step": 2428 + }, + { + "epoch": 0.1215, + "grad_norm": 4.569838047027588, + "learning_rate": 1.9971851335251162e-05, + "loss": 1.0165, + "step": 2430 + }, + { + "epoch": 0.1216, + "grad_norm": 4.7274675369262695, + "learning_rate": 1.997158900260614e-05, + "loss": 1.0811, + "step": 2432 + }, + { + "epoch": 0.1217, + "grad_norm": 5.511735439300537, + "learning_rate": 1.997132545495325e-05, + "loss": 1.1552, + "step": 2434 + }, + { + "epoch": 0.1218, + "grad_norm": 19.572017669677734, + "learning_rate": 1.99710606923246e-05, + "loss": 1.4189, + "step": 2436 + }, + { + "epoch": 0.1219, + "grad_norm": 18.904922485351562, + "learning_rate": 1.9970794714752448e-05, + "loss": 1.4151, + "step": 2438 + }, + { + "epoch": 0.122, + "grad_norm": 9.215799331665039, + "learning_rate": 1.9970527522269204e-05, + "loss": 1.5581, + "step": 2440 + }, + { + "epoch": 0.1221, + "grad_norm": 1.0655502080917358, + "learning_rate": 1.9970259114907428e-05, + "loss": 0.3857, + "step": 2442 + }, + { + "epoch": 0.1222, + "grad_norm": 20.88448715209961, + "learning_rate": 1.996998949269982e-05, + "loss": 1.5939, + "step": 2444 + }, + { + "epoch": 0.1223, + "grad_norm": 5.471306800842285, + "learning_rate": 1.9969718655679235e-05, + "loss": 1.7253, + "step": 2446 + }, + { + "epoch": 0.1224, + "grad_norm": 2.959651231765747, + "learning_rate": 1.9969446603878673e-05, + "loss": 1.1839, + "step": 2448 + }, + { + "epoch": 0.1225, + "grad_norm": 3.2930221557617188, + "learning_rate": 1.9969173337331283e-05, + "loss": 1.4165, + "step": 2450 + }, + { + "epoch": 0.1226, + "grad_norm": 5.585376262664795, + "learning_rate": 1.996889885607036e-05, + "loss": 0.8813, + "step": 2452 + }, + { + "epoch": 0.1227, + "grad_norm": 4.125856399536133, + "learning_rate": 1.9968623160129353e-05, + "loss": 1.0359, + "step": 2454 + }, + { + "epoch": 0.1228, + "grad_norm": 11.157347679138184, + "learning_rate": 1.9968346249541848e-05, + "loss": 1.056, + "step": 2456 + }, + { + "epoch": 0.1229, + "grad_norm": 7.516680717468262, + "learning_rate": 1.9968068124341593e-05, + "loss": 0.8415, + "step": 2458 + }, + { + "epoch": 0.123, + "grad_norm": 3.013368844985962, + "learning_rate": 1.9967788784562474e-05, + "loss": 1.1525, + "step": 2460 + }, + { + "epoch": 0.1231, + "grad_norm": 2.302889108657837, + "learning_rate": 1.9967508230238524e-05, + "loss": 0.5225, + "step": 2462 + }, + { + "epoch": 0.1232, + "grad_norm": 9.104236602783203, + "learning_rate": 1.9967226461403934e-05, + "loss": 1.1855, + "step": 2464 + }, + { + "epoch": 0.1233, + "grad_norm": 3.268207311630249, + "learning_rate": 1.996694347809303e-05, + "loss": 1.712, + "step": 2466 + }, + { + "epoch": 0.1234, + "grad_norm": 8.127994537353516, + "learning_rate": 1.99666592803403e-05, + "loss": 1.3474, + "step": 2468 + }, + { + "epoch": 0.1235, + "grad_norm": 7.297097206115723, + "learning_rate": 1.9966373868180367e-05, + "loss": 1.0092, + "step": 2470 + }, + { + "epoch": 0.1236, + "grad_norm": 6.642102241516113, + "learning_rate": 1.996608724164801e-05, + "loss": 1.0733, + "step": 2472 + }, + { + "epoch": 0.1237, + "grad_norm": 5.350168228149414, + "learning_rate": 1.9965799400778154e-05, + "loss": 0.7681, + "step": 2474 + }, + { + "epoch": 0.1238, + "grad_norm": 27.327241897583008, + "learning_rate": 1.9965510345605866e-05, + "loss": 2.2093, + "step": 2476 + }, + { + "epoch": 0.1239, + "grad_norm": 3.020846366882324, + "learning_rate": 1.9965220076166376e-05, + "loss": 1.4332, + "step": 2478 + }, + { + "epoch": 0.124, + "grad_norm": 10.865694999694824, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.993, + "step": 2480 + }, + { + "epoch": 0.1241, + "grad_norm": 7.889163970947266, + "learning_rate": 1.996463589462739e-05, + "loss": 0.9392, + "step": 2482 + }, + { + "epoch": 0.1242, + "grad_norm": 9.48397159576416, + "learning_rate": 1.996434198259908e-05, + "loss": 1.9361, + "step": 2484 + }, + { + "epoch": 0.1243, + "grad_norm": 9.69165325164795, + "learning_rate": 1.9964046856445926e-05, + "loss": 1.2371, + "step": 2486 + }, + { + "epoch": 0.1244, + "grad_norm": 2.814323902130127, + "learning_rate": 1.9963750516203887e-05, + "loss": 0.9327, + "step": 2488 + }, + { + "epoch": 0.1245, + "grad_norm": 4.466917991638184, + "learning_rate": 1.9963452961909065e-05, + "loss": 0.8609, + "step": 2490 + }, + { + "epoch": 0.1246, + "grad_norm": 4.664863586425781, + "learning_rate": 1.9963154193597728e-05, + "loss": 1.4949, + "step": 2492 + }, + { + "epoch": 0.1247, + "grad_norm": 11.637109756469727, + "learning_rate": 1.996285421130627e-05, + "loss": 1.3049, + "step": 2494 + }, + { + "epoch": 0.1248, + "grad_norm": 5.329936981201172, + "learning_rate": 1.996255301507125e-05, + "loss": 1.4083, + "step": 2496 + }, + { + "epoch": 0.1249, + "grad_norm": 3.2768547534942627, + "learning_rate": 1.9962250604929362e-05, + "loss": 0.7587, + "step": 2498 + }, + { + "epoch": 0.125, + "grad_norm": 7.820598602294922, + "learning_rate": 1.9961946980917457e-05, + "loss": 1.6387, + "step": 2500 + }, + { + "epoch": 0.1251, + "grad_norm": 5.81001615524292, + "learning_rate": 1.9961642143072532e-05, + "loss": 0.8818, + "step": 2502 + }, + { + "epoch": 0.1252, + "grad_norm": 3.466228723526001, + "learning_rate": 1.9961336091431728e-05, + "loss": 0.8803, + "step": 2504 + }, + { + "epoch": 0.1253, + "grad_norm": 15.122498512268066, + "learning_rate": 1.9961028826032335e-05, + "loss": 1.4078, + "step": 2506 + }, + { + "epoch": 0.1254, + "grad_norm": 2.620368480682373, + "learning_rate": 1.9960720346911798e-05, + "loss": 1.3986, + "step": 2508 + }, + { + "epoch": 0.1255, + "grad_norm": 4.434682846069336, + "learning_rate": 1.99604106541077e-05, + "loss": 0.9911, + "step": 2510 + }, + { + "epoch": 0.1256, + "grad_norm": 5.564939498901367, + "learning_rate": 1.9960099747657774e-05, + "loss": 1.4047, + "step": 2512 + }, + { + "epoch": 0.1257, + "grad_norm": 16.742172241210938, + "learning_rate": 1.9959787627599907e-05, + "loss": 1.1865, + "step": 2514 + }, + { + "epoch": 0.1258, + "grad_norm": 1.5496330261230469, + "learning_rate": 1.995947429397213e-05, + "loss": 0.1826, + "step": 2516 + }, + { + "epoch": 0.1259, + "grad_norm": 2.492791175842285, + "learning_rate": 1.995915974681262e-05, + "loss": 0.5891, + "step": 2518 + }, + { + "epoch": 0.126, + "grad_norm": 3.8430607318878174, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.8966, + "step": 2520 + }, + { + "epoch": 0.1261, + "grad_norm": 3.3983030319213867, + "learning_rate": 1.995852701205186e-05, + "loss": 0.4936, + "step": 2522 + }, + { + "epoch": 0.1262, + "grad_norm": 5.5888447761535645, + "learning_rate": 1.9958208824527702e-05, + "loss": 0.7232, + "step": 2524 + }, + { + "epoch": 0.1263, + "grad_norm": 7.252414703369141, + "learning_rate": 1.9957889423626006e-05, + "loss": 1.5599, + "step": 2526 + }, + { + "epoch": 0.1264, + "grad_norm": 1.9697691202163696, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.579, + "step": 2528 + }, + { + "epoch": 0.1265, + "grad_norm": 3.227403163909912, + "learning_rate": 1.9957246981845825e-05, + "loss": 0.7598, + "step": 2530 + }, + { + "epoch": 0.1266, + "grad_norm": 3.4866952896118164, + "learning_rate": 1.9956923941045613e-05, + "loss": 0.9016, + "step": 2532 + }, + { + "epoch": 0.1267, + "grad_norm": 4.27594518661499, + "learning_rate": 1.995659968702442e-05, + "loss": 0.6262, + "step": 2534 + }, + { + "epoch": 0.1268, + "grad_norm": 8.840963363647461, + "learning_rate": 1.995627421982176e-05, + "loss": 0.4288, + "step": 2536 + }, + { + "epoch": 0.1269, + "grad_norm": 1.15195631980896, + "learning_rate": 1.9955947539477285e-05, + "loss": 0.2787, + "step": 2538 + }, + { + "epoch": 0.127, + "grad_norm": 5.317230224609375, + "learning_rate": 1.99556196460308e-05, + "loss": 1.6267, + "step": 2540 + }, + { + "epoch": 0.1271, + "grad_norm": 15.514074325561523, + "learning_rate": 1.9955290539522262e-05, + "loss": 1.1494, + "step": 2542 + }, + { + "epoch": 0.1272, + "grad_norm": 13.821056365966797, + "learning_rate": 1.995496021999177e-05, + "loss": 1.6055, + "step": 2544 + }, + { + "epoch": 0.1273, + "grad_norm": 4.52501106262207, + "learning_rate": 1.995462868747957e-05, + "loss": 1.4695, + "step": 2546 + }, + { + "epoch": 0.1274, + "grad_norm": 10.272658348083496, + "learning_rate": 1.9954295942026065e-05, + "loss": 1.7007, + "step": 2548 + }, + { + "epoch": 0.1275, + "grad_norm": 4.978032112121582, + "learning_rate": 1.9953961983671792e-05, + "loss": 0.9653, + "step": 2550 + }, + { + "epoch": 0.1276, + "grad_norm": 11.113698959350586, + "learning_rate": 1.995362681245744e-05, + "loss": 1.453, + "step": 2552 + }, + { + "epoch": 0.1277, + "grad_norm": 3.6236960887908936, + "learning_rate": 1.9953290428423857e-05, + "loss": 1.107, + "step": 2554 + }, + { + "epoch": 0.1278, + "grad_norm": 4.298927307128906, + "learning_rate": 1.9952952831612027e-05, + "loss": 1.3085, + "step": 2556 + }, + { + "epoch": 0.1279, + "grad_norm": 6.730754852294922, + "learning_rate": 1.9952614022063085e-05, + "loss": 1.5286, + "step": 2558 + }, + { + "epoch": 0.128, + "grad_norm": 4.590338230133057, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.9962, + "step": 2560 + }, + { + "epoch": 0.1281, + "grad_norm": 3.0388245582580566, + "learning_rate": 1.9951932764919143e-05, + "loss": 1.1027, + "step": 2562 + }, + { + "epoch": 0.1282, + "grad_norm": 3.278526782989502, + "learning_rate": 1.9951590317407152e-05, + "loss": 1.1077, + "step": 2564 + }, + { + "epoch": 0.1283, + "grad_norm": 5.762153148651123, + "learning_rate": 1.995124665732407e-05, + "loss": 1.3359, + "step": 2566 + }, + { + "epoch": 0.1284, + "grad_norm": 6.123600006103516, + "learning_rate": 1.9950901784711765e-05, + "loss": 2.2009, + "step": 2568 + }, + { + "epoch": 0.1285, + "grad_norm": 6.056911945343018, + "learning_rate": 1.9950555699612265e-05, + "loss": 0.7432, + "step": 2570 + }, + { + "epoch": 0.1286, + "grad_norm": 7.492073059082031, + "learning_rate": 1.9950208402067735e-05, + "loss": 0.8581, + "step": 2572 + }, + { + "epoch": 0.1287, + "grad_norm": 2.505643844604492, + "learning_rate": 1.9949859892120492e-05, + "loss": 0.8178, + "step": 2574 + }, + { + "epoch": 0.1288, + "grad_norm": 3.0937576293945312, + "learning_rate": 1.9949510169813006e-05, + "loss": 1.3499, + "step": 2576 + }, + { + "epoch": 0.1289, + "grad_norm": 3.119563341140747, + "learning_rate": 1.994915923518788e-05, + "loss": 0.4335, + "step": 2578 + }, + { + "epoch": 0.129, + "grad_norm": 7.092994213104248, + "learning_rate": 1.9948807088287884e-05, + "loss": 0.505, + "step": 2580 + }, + { + "epoch": 0.1291, + "grad_norm": 4.967925548553467, + "learning_rate": 1.994845372915592e-05, + "loss": 1.4231, + "step": 2582 + }, + { + "epoch": 0.1292, + "grad_norm": 5.247772693634033, + "learning_rate": 1.994809915783505e-05, + "loss": 1.545, + "step": 2584 + }, + { + "epoch": 0.1293, + "grad_norm": 6.719394207000732, + "learning_rate": 1.9947743374368467e-05, + "loss": 1.1659, + "step": 2586 + }, + { + "epoch": 0.1294, + "grad_norm": 5.380419731140137, + "learning_rate": 1.9947386378799534e-05, + "loss": 0.5625, + "step": 2588 + }, + { + "epoch": 0.1295, + "grad_norm": 1.7742748260498047, + "learning_rate": 1.9947028171171742e-05, + "loss": 1.07, + "step": 2590 + }, + { + "epoch": 0.1296, + "grad_norm": 5.645551681518555, + "learning_rate": 1.9946668751528745e-05, + "loss": 1.5077, + "step": 2592 + }, + { + "epoch": 0.1297, + "grad_norm": 6.4907002449035645, + "learning_rate": 1.9946308119914323e-05, + "loss": 1.1766, + "step": 2594 + }, + { + "epoch": 0.1298, + "grad_norm": 3.7087559700012207, + "learning_rate": 1.9945946276372435e-05, + "loss": 0.826, + "step": 2596 + }, + { + "epoch": 0.1299, + "grad_norm": 5.761462688446045, + "learning_rate": 1.9945583220947156e-05, + "loss": 1.3263, + "step": 2598 + }, + { + "epoch": 0.13, + "grad_norm": 5.205174446105957, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.6394, + "step": 2600 + }, + { + "epoch": 0.1301, + "grad_norm": 1.6314979791641235, + "learning_rate": 1.994485347462355e-05, + "loss": 1.1303, + "step": 2602 + }, + { + "epoch": 0.1302, + "grad_norm": 7.874737739562988, + "learning_rate": 1.9944486783814135e-05, + "loss": 0.8984, + "step": 2604 + }, + { + "epoch": 0.1303, + "grad_norm": 7.737053394317627, + "learning_rate": 1.9944118881299167e-05, + "loss": 1.4694, + "step": 2606 + }, + { + "epoch": 0.1304, + "grad_norm": 6.665176868438721, + "learning_rate": 1.994374976712348e-05, + "loss": 1.3074, + "step": 2608 + }, + { + "epoch": 0.1305, + "grad_norm": 3.287774085998535, + "learning_rate": 1.994337944133205e-05, + "loss": 1.5448, + "step": 2610 + }, + { + "epoch": 0.1306, + "grad_norm": 8.860224723815918, + "learning_rate": 1.994300790396999e-05, + "loss": 1.2621, + "step": 2612 + }, + { + "epoch": 0.1307, + "grad_norm": 4.084372520446777, + "learning_rate": 1.994263515508258e-05, + "loss": 0.2517, + "step": 2614 + }, + { + "epoch": 0.1308, + "grad_norm": 4.275577545166016, + "learning_rate": 1.9942261194715236e-05, + "loss": 1.3829, + "step": 2616 + }, + { + "epoch": 0.1309, + "grad_norm": 7.550717830657959, + "learning_rate": 1.9941886022913523e-05, + "loss": 1.1457, + "step": 2618 + }, + { + "epoch": 0.131, + "grad_norm": 4.732592582702637, + "learning_rate": 1.9941509639723155e-05, + "loss": 0.8966, + "step": 2620 + }, + { + "epoch": 0.1311, + "grad_norm": 0.3357357680797577, + "learning_rate": 1.9941132045189993e-05, + "loss": 0.2699, + "step": 2622 + }, + { + "epoch": 0.1312, + "grad_norm": 3.6357531547546387, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.7582, + "step": 2624 + }, + { + "epoch": 0.1313, + "grad_norm": 6.882838726043701, + "learning_rate": 1.9940373222279473e-05, + "loss": 0.6717, + "step": 2626 + }, + { + "epoch": 0.1314, + "grad_norm": 8.279189109802246, + "learning_rate": 1.993999199399457e-05, + "loss": 1.2919, + "step": 2628 + }, + { + "epoch": 0.1315, + "grad_norm": 8.887821197509766, + "learning_rate": 1.99396095545518e-05, + "loss": 0.8174, + "step": 2630 + }, + { + "epoch": 0.1316, + "grad_norm": 4.103753566741943, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.8165, + "step": 2632 + }, + { + "epoch": 0.1317, + "grad_norm": 3.5220413208007812, + "learning_rate": 1.9938841042379174e-05, + "loss": 0.8991, + "step": 2634 + }, + { + "epoch": 0.1318, + "grad_norm": 8.836214065551758, + "learning_rate": 1.993845496974297e-05, + "loss": 1.1225, + "step": 2636 + }, + { + "epoch": 0.1319, + "grad_norm": 0.3582502603530884, + "learning_rate": 1.9938067686136167e-05, + "loss": 0.5789, + "step": 2638 + }, + { + "epoch": 0.132, + "grad_norm": 0.2662607431411743, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.4723, + "step": 2640 + }, + { + "epoch": 0.1321, + "grad_norm": 4.558809757232666, + "learning_rate": 1.9937289486199696e-05, + "loss": 0.8346, + "step": 2642 + }, + { + "epoch": 0.1322, + "grad_norm": 16.12396812438965, + "learning_rate": 1.993689856996485e-05, + "loss": 1.5549, + "step": 2644 + }, + { + "epoch": 0.1323, + "grad_norm": 6.000970840454102, + "learning_rate": 1.9936506442949054e-05, + "loss": 0.8655, + "step": 2646 + }, + { + "epoch": 0.1324, + "grad_norm": 8.055473327636719, + "learning_rate": 1.9936113105200085e-05, + "loss": 0.9361, + "step": 2648 + }, + { + "epoch": 0.1325, + "grad_norm": 5.028714179992676, + "learning_rate": 1.9935718556765878e-05, + "loss": 1.398, + "step": 2650 + }, + { + "epoch": 0.1326, + "grad_norm": 5.171218395233154, + "learning_rate": 1.99353227976945e-05, + "loss": 0.8624, + "step": 2652 + }, + { + "epoch": 0.1327, + "grad_norm": 5.87075138092041, + "learning_rate": 1.9934925828034174e-05, + "loss": 0.6243, + "step": 2654 + }, + { + "epoch": 0.1328, + "grad_norm": 2.4320201873779297, + "learning_rate": 1.9934527647833276e-05, + "loss": 0.8441, + "step": 2656 + }, + { + "epoch": 0.1329, + "grad_norm": 3.6358680725097656, + "learning_rate": 1.993412825714032e-05, + "loss": 0.9645, + "step": 2658 + }, + { + "epoch": 0.133, + "grad_norm": 8.542685508728027, + "learning_rate": 1.9933727656003964e-05, + "loss": 1.4976, + "step": 2660 + }, + { + "epoch": 0.1331, + "grad_norm": 3.807216167449951, + "learning_rate": 1.993332584447303e-05, + "loss": 0.7068, + "step": 2662 + }, + { + "epoch": 0.1332, + "grad_norm": 3.944674491882324, + "learning_rate": 1.993292282259647e-05, + "loss": 1.2018, + "step": 2664 + }, + { + "epoch": 0.1333, + "grad_norm": 8.424336433410645, + "learning_rate": 1.9932518590423396e-05, + "loss": 1.5493, + "step": 2666 + }, + { + "epoch": 0.1334, + "grad_norm": 9.925811767578125, + "learning_rate": 1.9932113148003057e-05, + "loss": 1.6752, + "step": 2668 + }, + { + "epoch": 0.1335, + "grad_norm": 2.767021656036377, + "learning_rate": 1.9931706495384865e-05, + "loss": 0.8449, + "step": 2670 + }, + { + "epoch": 0.1336, + "grad_norm": 3.846479654312134, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.7966, + "step": 2672 + }, + { + "epoch": 0.1337, + "grad_norm": 2.854428768157959, + "learning_rate": 1.9930889559753235e-05, + "loss": 0.1655, + "step": 2674 + }, + { + "epoch": 0.1338, + "grad_norm": 6.840875625610352, + "learning_rate": 1.9930479276839347e-05, + "loss": 1.312, + "step": 2676 + }, + { + "epoch": 0.1339, + "grad_norm": 2.3326523303985596, + "learning_rate": 1.9930067783926676e-05, + "loss": 1.1889, + "step": 2678 + }, + { + "epoch": 0.134, + "grad_norm": 3.6496403217315674, + "learning_rate": 1.992965508106537e-05, + "loss": 0.3865, + "step": 2680 + }, + { + "epoch": 0.1341, + "grad_norm": 3.7017974853515625, + "learning_rate": 1.9929241168305715e-05, + "loss": 1.4951, + "step": 2682 + }, + { + "epoch": 0.1342, + "grad_norm": 2.321607828140259, + "learning_rate": 1.9928826045698138e-05, + "loss": 0.6705, + "step": 2684 + }, + { + "epoch": 0.1343, + "grad_norm": 7.220295429229736, + "learning_rate": 1.9928409713293226e-05, + "loss": 0.7831, + "step": 2686 + }, + { + "epoch": 0.1344, + "grad_norm": 3.6271328926086426, + "learning_rate": 1.9927992171141707e-05, + "loss": 1.2113, + "step": 2688 + }, + { + "epoch": 0.1345, + "grad_norm": 3.3895790576934814, + "learning_rate": 1.9927573419294456e-05, + "loss": 1.1047, + "step": 2690 + }, + { + "epoch": 0.1346, + "grad_norm": 4.482493877410889, + "learning_rate": 1.99271534578025e-05, + "loss": 0.3746, + "step": 2692 + }, + { + "epoch": 0.1347, + "grad_norm": 2.8534488677978516, + "learning_rate": 1.9926732286717005e-05, + "loss": 1.4188, + "step": 2694 + }, + { + "epoch": 0.1348, + "grad_norm": 4.832450866699219, + "learning_rate": 1.992630990608929e-05, + "loss": 1.3225, + "step": 2696 + }, + { + "epoch": 0.1349, + "grad_norm": 6.858399868011475, + "learning_rate": 1.9925886315970825e-05, + "loss": 1.4697, + "step": 2698 + }, + { + "epoch": 0.135, + "grad_norm": 6.352370738983154, + "learning_rate": 1.9925461516413224e-05, + "loss": 1.5797, + "step": 2700 + }, + { + "epoch": 0.1351, + "grad_norm": 2.8403942584991455, + "learning_rate": 1.992503550746824e-05, + "loss": 0.7948, + "step": 2702 + }, + { + "epoch": 0.1352, + "grad_norm": 4.758930206298828, + "learning_rate": 1.9924608289187786e-05, + "loss": 1.3752, + "step": 2704 + }, + { + "epoch": 0.1353, + "grad_norm": 5.386173725128174, + "learning_rate": 1.9924179861623917e-05, + "loss": 1.1346, + "step": 2706 + }, + { + "epoch": 0.1354, + "grad_norm": 3.001178026199341, + "learning_rate": 1.9923750224828833e-05, + "loss": 1.1336, + "step": 2708 + }, + { + "epoch": 0.1355, + "grad_norm": 5.818534851074219, + "learning_rate": 1.9923319378854888e-05, + "loss": 1.6472, + "step": 2710 + }, + { + "epoch": 0.1356, + "grad_norm": 13.348605155944824, + "learning_rate": 1.992288732375458e-05, + "loss": 1.085, + "step": 2712 + }, + { + "epoch": 0.1357, + "grad_norm": 7.9649128913879395, + "learning_rate": 1.9922454059580543e-05, + "loss": 1.9689, + "step": 2714 + }, + { + "epoch": 0.1358, + "grad_norm": 6.74570369720459, + "learning_rate": 1.9922019586385587e-05, + "loss": 1.7445, + "step": 2716 + }, + { + "epoch": 0.1359, + "grad_norm": 11.406176567077637, + "learning_rate": 1.9921583904222636e-05, + "loss": 1.7703, + "step": 2718 + }, + { + "epoch": 0.136, + "grad_norm": 5.279270648956299, + "learning_rate": 1.9921147013144782e-05, + "loss": 1.6338, + "step": 2720 + }, + { + "epoch": 0.1361, + "grad_norm": 4.91425085067749, + "learning_rate": 1.9920708913205254e-05, + "loss": 0.621, + "step": 2722 + }, + { + "epoch": 0.1362, + "grad_norm": 3.3681092262268066, + "learning_rate": 1.9920269604457444e-05, + "loss": 0.8234, + "step": 2724 + }, + { + "epoch": 0.1363, + "grad_norm": 3.396333694458008, + "learning_rate": 1.9919829086954872e-05, + "loss": 1.0884, + "step": 2726 + }, + { + "epoch": 0.1364, + "grad_norm": 3.93391752243042, + "learning_rate": 1.9919387360751216e-05, + "loss": 1.0878, + "step": 2728 + }, + { + "epoch": 0.1365, + "grad_norm": 8.498316764831543, + "learning_rate": 1.99189444259003e-05, + "loss": 0.7482, + "step": 2730 + }, + { + "epoch": 0.1366, + "grad_norm": 4.045608043670654, + "learning_rate": 1.991850028245609e-05, + "loss": 0.9013, + "step": 2732 + }, + { + "epoch": 0.1367, + "grad_norm": 3.65373158454895, + "learning_rate": 1.991805493047271e-05, + "loss": 0.885, + "step": 2734 + }, + { + "epoch": 0.1368, + "grad_norm": 4.5298848152160645, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.9758, + "step": 2736 + }, + { + "epoch": 0.1369, + "grad_norm": 5.366786003112793, + "learning_rate": 1.9917160601105632e-05, + "loss": 1.4667, + "step": 2738 + }, + { + "epoch": 0.137, + "grad_norm": 5.533284664154053, + "learning_rate": 1.9916711623830904e-05, + "loss": 1.0423, + "step": 2740 + }, + { + "epoch": 0.1371, + "grad_norm": 3.454960346221924, + "learning_rate": 1.9916261438234953e-05, + "loss": 0.9618, + "step": 2742 + }, + { + "epoch": 0.1372, + "grad_norm": 4.7208709716796875, + "learning_rate": 1.9915810044372618e-05, + "loss": 1.2394, + "step": 2744 + }, + { + "epoch": 0.1373, + "grad_norm": 4.721938133239746, + "learning_rate": 1.991535744229891e-05, + "loss": 1.1543, + "step": 2746 + }, + { + "epoch": 0.1374, + "grad_norm": 4.098634243011475, + "learning_rate": 1.9914903632068975e-05, + "loss": 1.5546, + "step": 2748 + }, + { + "epoch": 0.1375, + "grad_norm": 8.684130668640137, + "learning_rate": 1.9914448613738107e-05, + "loss": 0.527, + "step": 2750 + }, + { + "epoch": 0.1376, + "grad_norm": 4.822518348693848, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.9898, + "step": 2752 + }, + { + "epoch": 0.1377, + "grad_norm": 2.9980671405792236, + "learning_rate": 1.991353495299549e-05, + "loss": 0.8194, + "step": 2754 + }, + { + "epoch": 0.1378, + "grad_norm": 3.356388568878174, + "learning_rate": 1.9913076310695068e-05, + "loss": 1.5383, + "step": 2756 + }, + { + "epoch": 0.1379, + "grad_norm": 2.0201382637023926, + "learning_rate": 1.9912616460516364e-05, + "loss": 1.0365, + "step": 2758 + }, + { + "epoch": 0.138, + "grad_norm": 4.472102642059326, + "learning_rate": 1.991215540251542e-05, + "loss": 0.689, + "step": 2760 + }, + { + "epoch": 0.1381, + "grad_norm": 7.665932655334473, + "learning_rate": 1.9911693136748403e-05, + "loss": 1.7081, + "step": 2762 + }, + { + "epoch": 0.1382, + "grad_norm": 6.826009750366211, + "learning_rate": 1.991122966327164e-05, + "loss": 0.9299, + "step": 2764 + }, + { + "epoch": 0.1383, + "grad_norm": 7.781140327453613, + "learning_rate": 1.991076498214161e-05, + "loss": 1.2756, + "step": 2766 + }, + { + "epoch": 0.1384, + "grad_norm": 6.495415210723877, + "learning_rate": 1.991029909341493e-05, + "loss": 1.4673, + "step": 2768 + }, + { + "epoch": 0.1385, + "grad_norm": 4.903921604156494, + "learning_rate": 1.9909831997148363e-05, + "loss": 1.0051, + "step": 2770 + }, + { + "epoch": 0.1386, + "grad_norm": 4.840198516845703, + "learning_rate": 1.9909363693398828e-05, + "loss": 0.7551, + "step": 2772 + }, + { + "epoch": 0.1387, + "grad_norm": 2.352617025375366, + "learning_rate": 1.990889418222339e-05, + "loss": 1.4807, + "step": 2774 + }, + { + "epoch": 0.1388, + "grad_norm": 14.186120986938477, + "learning_rate": 1.9908423463679246e-05, + "loss": 1.5599, + "step": 2776 + }, + { + "epoch": 0.1389, + "grad_norm": 7.893065929412842, + "learning_rate": 1.9907951537823762e-05, + "loss": 0.7168, + "step": 2778 + }, + { + "epoch": 0.139, + "grad_norm": 5.471069812774658, + "learning_rate": 1.9907478404714438e-05, + "loss": 1.3684, + "step": 2780 + }, + { + "epoch": 0.1391, + "grad_norm": 2.461655855178833, + "learning_rate": 1.9907004064408923e-05, + "loss": 1.6439, + "step": 2782 + }, + { + "epoch": 0.1392, + "grad_norm": 12.992602348327637, + "learning_rate": 1.990652851696501e-05, + "loss": 0.6157, + "step": 2784 + }, + { + "epoch": 0.1393, + "grad_norm": 3.787693738937378, + "learning_rate": 1.990605176244065e-05, + "loss": 0.4149, + "step": 2786 + }, + { + "epoch": 0.1394, + "grad_norm": 5.761355400085449, + "learning_rate": 1.990557380089393e-05, + "loss": 0.9493, + "step": 2788 + }, + { + "epoch": 0.1395, + "grad_norm": 3.3799822330474854, + "learning_rate": 1.990509463238309e-05, + "loss": 1.0831, + "step": 2790 + }, + { + "epoch": 0.1396, + "grad_norm": 5.611127853393555, + "learning_rate": 1.9904614256966514e-05, + "loss": 1.3834, + "step": 2792 + }, + { + "epoch": 0.1397, + "grad_norm": 2.0193254947662354, + "learning_rate": 1.9904132674702734e-05, + "loss": 0.7459, + "step": 2794 + }, + { + "epoch": 0.1398, + "grad_norm": 1.029988408088684, + "learning_rate": 1.990364988565043e-05, + "loss": 0.644, + "step": 2796 + }, + { + "epoch": 0.1399, + "grad_norm": 6.708348751068115, + "learning_rate": 1.990316588986843e-05, + "loss": 1.0915, + "step": 2798 + }, + { + "epoch": 0.14, + "grad_norm": 8.6214017868042, + "learning_rate": 1.9902680687415704e-05, + "loss": 2.0406, + "step": 2800 + }, + { + "epoch": 0.1401, + "grad_norm": 9.714062690734863, + "learning_rate": 1.9902194278351375e-05, + "loss": 1.0285, + "step": 2802 + }, + { + "epoch": 0.1402, + "grad_norm": 6.41973876953125, + "learning_rate": 1.990170666273471e-05, + "loss": 1.1682, + "step": 2804 + }, + { + "epoch": 0.1403, + "grad_norm": 3.948748826980591, + "learning_rate": 1.990121784062512e-05, + "loss": 1.2411, + "step": 2806 + }, + { + "epoch": 0.1404, + "grad_norm": 2.2549142837524414, + "learning_rate": 1.9900727812082177e-05, + "loss": 1.7244, + "step": 2808 + }, + { + "epoch": 0.1405, + "grad_norm": 5.670393943786621, + "learning_rate": 1.990023657716558e-05, + "loss": 0.8541, + "step": 2810 + }, + { + "epoch": 0.1406, + "grad_norm": 8.428837776184082, + "learning_rate": 1.989974413593518e-05, + "loss": 1.3844, + "step": 2812 + }, + { + "epoch": 0.1407, + "grad_norm": 2.1426901817321777, + "learning_rate": 1.9899250488450993e-05, + "loss": 0.7533, + "step": 2814 + }, + { + "epoch": 0.1408, + "grad_norm": 12.303706169128418, + "learning_rate": 1.989875563477316e-05, + "loss": 1.5334, + "step": 2816 + }, + { + "epoch": 0.1409, + "grad_norm": 6.709261894226074, + "learning_rate": 1.9898259574961977e-05, + "loss": 1.3423, + "step": 2818 + }, + { + "epoch": 0.141, + "grad_norm": 14.038386344909668, + "learning_rate": 1.989776230907789e-05, + "loss": 1.7748, + "step": 2820 + }, + { + "epoch": 0.1411, + "grad_norm": 2.639826536178589, + "learning_rate": 1.9897263837181492e-05, + "loss": 1.1481, + "step": 2822 + }, + { + "epoch": 0.1412, + "grad_norm": 3.588552236557007, + "learning_rate": 1.989676415933351e-05, + "loss": 1.2008, + "step": 2824 + }, + { + "epoch": 0.1413, + "grad_norm": 10.812625885009766, + "learning_rate": 1.989626327559484e-05, + "loss": 1.4021, + "step": 2826 + }, + { + "epoch": 0.1414, + "grad_norm": 4.876836776733398, + "learning_rate": 1.989576118602651e-05, + "loss": 1.3356, + "step": 2828 + }, + { + "epoch": 0.1415, + "grad_norm": 4.898824214935303, + "learning_rate": 1.9895257890689698e-05, + "loss": 0.5084, + "step": 2830 + }, + { + "epoch": 0.1416, + "grad_norm": 3.1565403938293457, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.5612, + "step": 2832 + }, + { + "epoch": 0.1417, + "grad_norm": 3.8819751739501953, + "learning_rate": 1.9894247682956064e-05, + "loss": 1.0264, + "step": 2834 + }, + { + "epoch": 0.1418, + "grad_norm": 3.7083919048309326, + "learning_rate": 1.9893740770682334e-05, + "loss": 1.0809, + "step": 2836 + }, + { + "epoch": 0.1419, + "grad_norm": 6.970932483673096, + "learning_rate": 1.9893232652886306e-05, + "loss": 0.7686, + "step": 2838 + }, + { + "epoch": 0.142, + "grad_norm": 5.875640869140625, + "learning_rate": 1.9892723329629885e-05, + "loss": 1.0884, + "step": 2840 + }, + { + "epoch": 0.1421, + "grad_norm": 6.656651496887207, + "learning_rate": 1.9892212800975136e-05, + "loss": 1.332, + "step": 2842 + }, + { + "epoch": 0.1422, + "grad_norm": 3.3948071002960205, + "learning_rate": 1.9891701066984264e-05, + "loss": 0.7295, + "step": 2844 + }, + { + "epoch": 0.1423, + "grad_norm": 1.9859538078308105, + "learning_rate": 1.989118812771962e-05, + "loss": 1.0637, + "step": 2846 + }, + { + "epoch": 0.1424, + "grad_norm": 13.198526382446289, + "learning_rate": 1.9890673983243708e-05, + "loss": 1.1239, + "step": 2848 + }, + { + "epoch": 0.1425, + "grad_norm": 9.791665077209473, + "learning_rate": 1.989015863361917e-05, + "loss": 1.3567, + "step": 2850 + }, + { + "epoch": 0.1426, + "grad_norm": 3.388995885848999, + "learning_rate": 1.9889642078908805e-05, + "loss": 0.4161, + "step": 2852 + }, + { + "epoch": 0.1427, + "grad_norm": 4.653116703033447, + "learning_rate": 1.9889124319175548e-05, + "loss": 1.32, + "step": 2854 + }, + { + "epoch": 0.1428, + "grad_norm": 3.042762279510498, + "learning_rate": 1.9888605354482494e-05, + "loss": 2.543, + "step": 2856 + }, + { + "epoch": 0.1429, + "grad_norm": 4.993569850921631, + "learning_rate": 1.9888085184892868e-05, + "loss": 1.019, + "step": 2858 + }, + { + "epoch": 0.143, + "grad_norm": 7.1897053718566895, + "learning_rate": 1.988756381047006e-05, + "loss": 1.541, + "step": 2860 + }, + { + "epoch": 0.1431, + "grad_norm": 5.674741268157959, + "learning_rate": 1.9887041231277593e-05, + "loss": 0.9442, + "step": 2862 + }, + { + "epoch": 0.1432, + "grad_norm": 3.6948630809783936, + "learning_rate": 1.988651744737914e-05, + "loss": 0.8894, + "step": 2864 + }, + { + "epoch": 0.1433, + "grad_norm": 3.4265942573547363, + "learning_rate": 1.9885992458838527e-05, + "loss": 0.7255, + "step": 2866 + }, + { + "epoch": 0.1434, + "grad_norm": 5.066208362579346, + "learning_rate": 1.9885466265719723e-05, + "loss": 1.4431, + "step": 2868 + }, + { + "epoch": 0.1435, + "grad_norm": 13.657660484313965, + "learning_rate": 1.9884938868086836e-05, + "loss": 0.9726, + "step": 2870 + }, + { + "epoch": 0.1436, + "grad_norm": 4.95789909362793, + "learning_rate": 1.9884410266004134e-05, + "loss": 1.1841, + "step": 2872 + }, + { + "epoch": 0.1437, + "grad_norm": 5.063854694366455, + "learning_rate": 1.9883880459536024e-05, + "loss": 1.4742, + "step": 2874 + }, + { + "epoch": 0.1438, + "grad_norm": 1.1481256484985352, + "learning_rate": 1.988334944874706e-05, + "loss": 0.6368, + "step": 2876 + }, + { + "epoch": 0.1439, + "grad_norm": 9.769880294799805, + "learning_rate": 1.988281723370195e-05, + "loss": 0.9884, + "step": 2878 + }, + { + "epoch": 0.144, + "grad_norm": 2.433487892150879, + "learning_rate": 1.988228381446553e-05, + "loss": 1.0009, + "step": 2880 + }, + { + "epoch": 0.1441, + "grad_norm": 6.582160472869873, + "learning_rate": 1.9881749191102807e-05, + "loss": 1.5192, + "step": 2882 + }, + { + "epoch": 0.1442, + "grad_norm": 7.437658309936523, + "learning_rate": 1.988121336367892e-05, + "loss": 1.669, + "step": 2884 + }, + { + "epoch": 0.1443, + "grad_norm": 4.728359699249268, + "learning_rate": 1.9880676332259155e-05, + "loss": 1.563, + "step": 2886 + }, + { + "epoch": 0.1444, + "grad_norm": 2.9585025310516357, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.412, + "step": 2888 + }, + { + "epoch": 0.1445, + "grad_norm": 4.029636383056641, + "learning_rate": 1.9879598657693894e-05, + "loss": 1.4605, + "step": 2890 + }, + { + "epoch": 0.1446, + "grad_norm": 5.20360803604126, + "learning_rate": 1.9879058014679704e-05, + "loss": 1.8135, + "step": 2892 + }, + { + "epoch": 0.1447, + "grad_norm": 10.599518775939941, + "learning_rate": 1.987851616793226e-05, + "loss": 0.8166, + "step": 2894 + }, + { + "epoch": 0.1448, + "grad_norm": 7.597499370574951, + "learning_rate": 1.987797311751759e-05, + "loss": 0.9063, + "step": 2896 + }, + { + "epoch": 0.1449, + "grad_norm": 5.7822041511535645, + "learning_rate": 1.9877428863501857e-05, + "loss": 1.1889, + "step": 2898 + }, + { + "epoch": 0.145, + "grad_norm": 16.402481079101562, + "learning_rate": 1.9876883405951378e-05, + "loss": 1.4302, + "step": 2900 + }, + { + "epoch": 0.1451, + "grad_norm": 4.7846360206604, + "learning_rate": 1.9876336744932616e-05, + "loss": 1.0681, + "step": 2902 + }, + { + "epoch": 0.1452, + "grad_norm": 7.310203552246094, + "learning_rate": 1.9875788880512183e-05, + "loss": 1.431, + "step": 2904 + }, + { + "epoch": 0.1453, + "grad_norm": 9.08122730255127, + "learning_rate": 1.9875239812756826e-05, + "loss": 1.3562, + "step": 2906 + }, + { + "epoch": 0.1454, + "grad_norm": 3.357509136199951, + "learning_rate": 1.9874689541733455e-05, + "loss": 1.2751, + "step": 2908 + }, + { + "epoch": 0.1455, + "grad_norm": 5.065610885620117, + "learning_rate": 1.9874138067509116e-05, + "loss": 0.5562, + "step": 2910 + }, + { + "epoch": 0.1456, + "grad_norm": 10.659037590026855, + "learning_rate": 1.9873585390151003e-05, + "loss": 1.078, + "step": 2912 + }, + { + "epoch": 0.1457, + "grad_norm": 8.350022315979004, + "learning_rate": 1.9873031509726463e-05, + "loss": 0.9392, + "step": 2914 + }, + { + "epoch": 0.1458, + "grad_norm": 2.7491281032562256, + "learning_rate": 1.9872476426302983e-05, + "loss": 0.836, + "step": 2916 + }, + { + "epoch": 0.1459, + "grad_norm": 15.319228172302246, + "learning_rate": 1.9871920139948193e-05, + "loss": 1.4877, + "step": 2918 + }, + { + "epoch": 0.146, + "grad_norm": 7.338231563568115, + "learning_rate": 1.987136265072988e-05, + "loss": 1.7944, + "step": 2920 + }, + { + "epoch": 0.1461, + "grad_norm": 11.576202392578125, + "learning_rate": 1.987080395871597e-05, + "loss": 1.0943, + "step": 2922 + }, + { + "epoch": 0.1462, + "grad_norm": 15.521501541137695, + "learning_rate": 1.987024406397454e-05, + "loss": 1.9646, + "step": 2924 + }, + { + "epoch": 0.1463, + "grad_norm": 7.063384532928467, + "learning_rate": 1.9869682966573814e-05, + "loss": 2.143, + "step": 2926 + }, + { + "epoch": 0.1464, + "grad_norm": 6.120065212249756, + "learning_rate": 1.9869120666582153e-05, + "loss": 1.1733, + "step": 2928 + }, + { + "epoch": 0.1465, + "grad_norm": 3.150235414505005, + "learning_rate": 1.9868557164068073e-05, + "loss": 1.5445, + "step": 2930 + }, + { + "epoch": 0.1466, + "grad_norm": 3.4140143394470215, + "learning_rate": 1.986799245910024e-05, + "loss": 1.8371, + "step": 2932 + }, + { + "epoch": 0.1467, + "grad_norm": 7.480123996734619, + "learning_rate": 1.9867426551747457e-05, + "loss": 1.5541, + "step": 2934 + }, + { + "epoch": 0.1468, + "grad_norm": 1.9238793849945068, + "learning_rate": 1.986685944207868e-05, + "loss": 0.9756, + "step": 2936 + }, + { + "epoch": 0.1469, + "grad_norm": 9.98784351348877, + "learning_rate": 1.9866291130163013e-05, + "loss": 0.4715, + "step": 2938 + }, + { + "epoch": 0.147, + "grad_norm": 2.7469072341918945, + "learning_rate": 1.9865721616069695e-05, + "loss": 0.7141, + "step": 2940 + }, + { + "epoch": 0.1471, + "grad_norm": 3.2060813903808594, + "learning_rate": 1.9865150899868126e-05, + "loss": 1.325, + "step": 2942 + }, + { + "epoch": 0.1472, + "grad_norm": 7.058685302734375, + "learning_rate": 1.9864578981627844e-05, + "loss": 1.5805, + "step": 2944 + }, + { + "epoch": 0.1473, + "grad_norm": 3.064014434814453, + "learning_rate": 1.9864005861418537e-05, + "loss": 0.3358, + "step": 2946 + }, + { + "epoch": 0.1474, + "grad_norm": 8.428108215332031, + "learning_rate": 1.9863431539310033e-05, + "loss": 1.28, + "step": 2948 + }, + { + "epoch": 0.1475, + "grad_norm": 15.353477478027344, + "learning_rate": 1.9862856015372315e-05, + "loss": 1.3504, + "step": 2950 + }, + { + "epoch": 0.1476, + "grad_norm": 6.532548427581787, + "learning_rate": 1.986227928967551e-05, + "loss": 1.5422, + "step": 2952 + }, + { + "epoch": 0.1477, + "grad_norm": 6.068332672119141, + "learning_rate": 1.9861701362289892e-05, + "loss": 1.0251, + "step": 2954 + }, + { + "epoch": 0.1478, + "grad_norm": 8.714967727661133, + "learning_rate": 1.9861122233285873e-05, + "loss": 1.9499, + "step": 2956 + }, + { + "epoch": 0.1479, + "grad_norm": 5.351136684417725, + "learning_rate": 1.9860541902734023e-05, + "loss": 1.286, + "step": 2958 + }, + { + "epoch": 0.148, + "grad_norm": 2.1837174892425537, + "learning_rate": 1.985996037070505e-05, + "loss": 1.1184, + "step": 2960 + }, + { + "epoch": 0.1481, + "grad_norm": 3.524152994155884, + "learning_rate": 1.9859377637269817e-05, + "loss": 1.4027, + "step": 2962 + }, + { + "epoch": 0.1482, + "grad_norm": 5.256674766540527, + "learning_rate": 1.9858793702499322e-05, + "loss": 1.5004, + "step": 2964 + }, + { + "epoch": 0.1483, + "grad_norm": 2.9171342849731445, + "learning_rate": 1.9858208566464726e-05, + "loss": 1.4134, + "step": 2966 + }, + { + "epoch": 0.1484, + "grad_norm": 6.943163871765137, + "learning_rate": 1.9857622229237315e-05, + "loss": 1.5189, + "step": 2968 + }, + { + "epoch": 0.1485, + "grad_norm": 7.146639823913574, + "learning_rate": 1.985703469088854e-05, + "loss": 1.0064, + "step": 2970 + }, + { + "epoch": 0.1486, + "grad_norm": 7.589986801147461, + "learning_rate": 1.9856445951489984e-05, + "loss": 0.6354, + "step": 2972 + }, + { + "epoch": 0.1487, + "grad_norm": 3.1612439155578613, + "learning_rate": 1.9855856011113384e-05, + "loss": 0.6879, + "step": 2974 + }, + { + "epoch": 0.1488, + "grad_norm": 24.11284637451172, + "learning_rate": 1.985526486983063e-05, + "loss": 0.9542, + "step": 2976 + }, + { + "epoch": 0.1489, + "grad_norm": 2.150122880935669, + "learning_rate": 1.9854672527713745e-05, + "loss": 0.9859, + "step": 2978 + }, + { + "epoch": 0.149, + "grad_norm": 2.9944679737091064, + "learning_rate": 1.9854078984834904e-05, + "loss": 1.058, + "step": 2980 + }, + { + "epoch": 0.1491, + "grad_norm": 4.15227746963501, + "learning_rate": 1.985348424126643e-05, + "loss": 0.8917, + "step": 2982 + }, + { + "epoch": 0.1492, + "grad_norm": 3.012723445892334, + "learning_rate": 1.985288829708079e-05, + "loss": 0.6227, + "step": 2984 + }, + { + "epoch": 0.1493, + "grad_norm": 10.842888832092285, + "learning_rate": 1.9852291152350593e-05, + "loss": 1.0863, + "step": 2986 + }, + { + "epoch": 0.1494, + "grad_norm": 4.269872188568115, + "learning_rate": 1.9851692807148612e-05, + "loss": 0.8108, + "step": 2988 + }, + { + "epoch": 0.1495, + "grad_norm": 6.838650226593018, + "learning_rate": 1.985109326154774e-05, + "loss": 2.0592, + "step": 2990 + }, + { + "epoch": 0.1496, + "grad_norm": 1.82005774974823, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.5983, + "step": 2992 + }, + { + "epoch": 0.1497, + "grad_norm": 4.234922885894775, + "learning_rate": 1.9849890569441704e-05, + "loss": 1.0152, + "step": 2994 + }, + { + "epoch": 0.1498, + "grad_norm": 8.511574745178223, + "learning_rate": 1.984928742308308e-05, + "loss": 2.5749, + "step": 2996 + }, + { + "epoch": 0.1499, + "grad_norm": 2.2854249477386475, + "learning_rate": 1.984868307661866e-05, + "loss": 1.2051, + "step": 2998 + }, + { + "epoch": 0.15, + "grad_norm": 2.9013209342956543, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.8479, + "step": 3000 + }, + { + "epoch": 0.1501, + "grad_norm": 2.915924549102783, + "learning_rate": 1.9847470783667128e-05, + "loss": 0.3408, + "step": 3002 + }, + { + "epoch": 0.1502, + "grad_norm": 6.062166690826416, + "learning_rate": 1.9846862837327733e-05, + "loss": 0.6846, + "step": 3004 + }, + { + "epoch": 0.1503, + "grad_norm": 3.1459133625030518, + "learning_rate": 1.9846253691177965e-05, + "loss": 1.0044, + "step": 3006 + }, + { + "epoch": 0.1504, + "grad_norm": 10.034981727600098, + "learning_rate": 1.9845643345292055e-05, + "loss": 1.2907, + "step": 3008 + }, + { + "epoch": 0.1505, + "grad_norm": 3.069570779800415, + "learning_rate": 1.9845031799744367e-05, + "loss": 0.9031, + "step": 3010 + }, + { + "epoch": 0.1506, + "grad_norm": 9.379439353942871, + "learning_rate": 1.9844419054609418e-05, + "loss": 1.5575, + "step": 3012 + }, + { + "epoch": 0.1507, + "grad_norm": 10.447949409484863, + "learning_rate": 1.984380510996187e-05, + "loss": 1.4259, + "step": 3014 + }, + { + "epoch": 0.1508, + "grad_norm": 3.9248545169830322, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.6467, + "step": 3016 + }, + { + "epoch": 0.1509, + "grad_norm": 3.2200443744659424, + "learning_rate": 1.9842573622428346e-05, + "loss": 1.0335, + "step": 3018 + }, + { + "epoch": 0.151, + "grad_norm": 9.898101806640625, + "learning_rate": 1.984195607969242e-05, + "loss": 0.6877, + "step": 3020 + }, + { + "epoch": 0.1511, + "grad_norm": 5.260511875152588, + "learning_rate": 1.9841337337744004e-05, + "loss": 0.445, + "step": 3022 + }, + { + "epoch": 0.1512, + "grad_norm": 2.885741710662842, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.7617, + "step": 3024 + }, + { + "epoch": 0.1513, + "grad_norm": 6.620848655700684, + "learning_rate": 1.9840096256511398e-05, + "loss": 1.2346, + "step": 3026 + }, + { + "epoch": 0.1514, + "grad_norm": 2.267272710800171, + "learning_rate": 1.9839473917378432e-05, + "loss": 0.4633, + "step": 3028 + }, + { + "epoch": 0.1515, + "grad_norm": 12.896461486816406, + "learning_rate": 1.983885037933542e-05, + "loss": 1.7107, + "step": 3030 + }, + { + "epoch": 0.1516, + "grad_norm": 4.712326526641846, + "learning_rate": 1.983822564245833e-05, + "loss": 1.2308, + "step": 3032 + }, + { + "epoch": 0.1517, + "grad_norm": 24.37150764465332, + "learning_rate": 1.9837599706823284e-05, + "loss": 1.2106, + "step": 3034 + }, + { + "epoch": 0.1518, + "grad_norm": 11.452286720275879, + "learning_rate": 1.9836972572506557e-05, + "loss": 2.1904, + "step": 3036 + }, + { + "epoch": 0.1519, + "grad_norm": 6.235820293426514, + "learning_rate": 1.9836344239584566e-05, + "loss": 1.1832, + "step": 3038 + }, + { + "epoch": 0.152, + "grad_norm": 7.315890789031982, + "learning_rate": 1.983571470813386e-05, + "loss": 1.7206, + "step": 3040 + }, + { + "epoch": 0.1521, + "grad_norm": 8.701979637145996, + "learning_rate": 1.9835083978231157e-05, + "loss": 1.2541, + "step": 3042 + }, + { + "epoch": 0.1522, + "grad_norm": 5.142092704772949, + "learning_rate": 1.98344520499533e-05, + "loss": 1.4228, + "step": 3044 + }, + { + "epoch": 0.1523, + "grad_norm": 8.859126091003418, + "learning_rate": 1.9833818923377293e-05, + "loss": 1.4598, + "step": 3046 + }, + { + "epoch": 0.1524, + "grad_norm": 5.900530815124512, + "learning_rate": 1.983318459858028e-05, + "loss": 1.4278, + "step": 3048 + }, + { + "epoch": 0.1525, + "grad_norm": 4.294980525970459, + "learning_rate": 1.983254907563955e-05, + "loss": 0.6869, + "step": 3050 + }, + { + "epoch": 0.1526, + "grad_norm": 4.438492774963379, + "learning_rate": 1.9831912354632537e-05, + "loss": 1.1595, + "step": 3052 + }, + { + "epoch": 0.1527, + "grad_norm": 3.11960506439209, + "learning_rate": 1.983127443563683e-05, + "loss": 1.328, + "step": 3054 + }, + { + "epoch": 0.1528, + "grad_norm": 2.5467445850372314, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.6844, + "step": 3056 + }, + { + "epoch": 0.1529, + "grad_norm": 6.141857147216797, + "learning_rate": 1.9829995003990387e-05, + "loss": 1.0392, + "step": 3058 + }, + { + "epoch": 0.153, + "grad_norm": 4.905157089233398, + "learning_rate": 1.9829353491495545e-05, + "loss": 0.8288, + "step": 3060 + }, + { + "epoch": 0.1531, + "grad_norm": 4.888780117034912, + "learning_rate": 1.9828710781323793e-05, + "loss": 1.3055, + "step": 3062 + }, + { + "epoch": 0.1532, + "grad_norm": 3.371579885482788, + "learning_rate": 1.982806687355345e-05, + "loss": 1.204, + "step": 3064 + }, + { + "epoch": 0.1533, + "grad_norm": 17.134716033935547, + "learning_rate": 1.9827421768262966e-05, + "loss": 1.3249, + "step": 3066 + }, + { + "epoch": 0.1534, + "grad_norm": 11.697425842285156, + "learning_rate": 1.982677546553095e-05, + "loss": 1.2417, + "step": 3068 + }, + { + "epoch": 0.1535, + "grad_norm": 2.6913154125213623, + "learning_rate": 1.9826127965436153e-05, + "loss": 0.6813, + "step": 3070 + }, + { + "epoch": 0.1536, + "grad_norm": 16.327850341796875, + "learning_rate": 1.982547926805747e-05, + "loss": 1.4278, + "step": 3072 + }, + { + "epoch": 0.1537, + "grad_norm": 3.7583916187286377, + "learning_rate": 1.9824829373473943e-05, + "loss": 1.3426, + "step": 3074 + }, + { + "epoch": 0.1538, + "grad_norm": 4.350657939910889, + "learning_rate": 1.9824178281764753e-05, + "loss": 1.3547, + "step": 3076 + }, + { + "epoch": 0.1539, + "grad_norm": 5.944058418273926, + "learning_rate": 1.9823525993009243e-05, + "loss": 0.7956, + "step": 3078 + }, + { + "epoch": 0.154, + "grad_norm": 1.9754207134246826, + "learning_rate": 1.982287250728689e-05, + "loss": 0.9842, + "step": 3080 + }, + { + "epoch": 0.1541, + "grad_norm": 11.43967342376709, + "learning_rate": 1.9822217824677313e-05, + "loss": 1.7649, + "step": 3082 + }, + { + "epoch": 0.1542, + "grad_norm": 3.1667089462280273, + "learning_rate": 1.9821561945260292e-05, + "loss": 0.5281, + "step": 3084 + }, + { + "epoch": 0.1543, + "grad_norm": 5.081853866577148, + "learning_rate": 1.982090486911574e-05, + "loss": 1.4094, + "step": 3086 + }, + { + "epoch": 0.1544, + "grad_norm": 2.805208206176758, + "learning_rate": 1.982024659632372e-05, + "loss": 1.0617, + "step": 3088 + }, + { + "epoch": 0.1545, + "grad_norm": 7.457943439483643, + "learning_rate": 1.981958712696444e-05, + "loss": 1.021, + "step": 3090 + }, + { + "epoch": 0.1546, + "grad_norm": 4.335594654083252, + "learning_rate": 1.9818926461118254e-05, + "loss": 0.4939, + "step": 3092 + }, + { + "epoch": 0.1547, + "grad_norm": 3.4366540908813477, + "learning_rate": 1.981826459886566e-05, + "loss": 1.5144, + "step": 3094 + }, + { + "epoch": 0.1548, + "grad_norm": 9.527952194213867, + "learning_rate": 1.981760154028731e-05, + "loss": 2.3379, + "step": 3096 + }, + { + "epoch": 0.1549, + "grad_norm": 7.4310526847839355, + "learning_rate": 1.9816937285463992e-05, + "loss": 1.6541, + "step": 3098 + }, + { + "epoch": 0.155, + "grad_norm": 3.677168130874634, + "learning_rate": 1.9816271834476642e-05, + "loss": 0.9143, + "step": 3100 + }, + { + "epoch": 0.1551, + "grad_norm": 1.920877456665039, + "learning_rate": 1.9815605187406345e-05, + "loss": 1.1511, + "step": 3102 + }, + { + "epoch": 0.1552, + "grad_norm": 2.192446231842041, + "learning_rate": 1.981493734433433e-05, + "loss": 0.1886, + "step": 3104 + }, + { + "epoch": 0.1553, + "grad_norm": 7.898262023925781, + "learning_rate": 1.9814268305341974e-05, + "loss": 1.8134, + "step": 3106 + }, + { + "epoch": 0.1554, + "grad_norm": 5.5405778884887695, + "learning_rate": 1.981359807051079e-05, + "loss": 1.4748, + "step": 3108 + }, + { + "epoch": 0.1555, + "grad_norm": 13.15390682220459, + "learning_rate": 1.981292663992245e-05, + "loss": 0.9896, + "step": 3110 + }, + { + "epoch": 0.1556, + "grad_norm": 3.3129498958587646, + "learning_rate": 1.981225401365877e-05, + "loss": 1.1993, + "step": 3112 + }, + { + "epoch": 0.1557, + "grad_norm": 2.731900453567505, + "learning_rate": 1.9811580191801697e-05, + "loss": 0.5781, + "step": 3114 + }, + { + "epoch": 0.1558, + "grad_norm": 4.152290344238281, + "learning_rate": 1.981090517443334e-05, + "loss": 1.0684, + "step": 3116 + }, + { + "epoch": 0.1559, + "grad_norm": 3.200712203979492, + "learning_rate": 1.981022896163595e-05, + "loss": 1.0067, + "step": 3118 + }, + { + "epoch": 0.156, + "grad_norm": 9.928376197814941, + "learning_rate": 1.9809551553491918e-05, + "loss": 1.8181, + "step": 3120 + }, + { + "epoch": 0.1561, + "grad_norm": 3.110574722290039, + "learning_rate": 1.9808872950083785e-05, + "loss": 0.8072, + "step": 3122 + }, + { + "epoch": 0.1562, + "grad_norm": 4.855128765106201, + "learning_rate": 1.9808193151494233e-05, + "loss": 1.1156, + "step": 3124 + }, + { + "epoch": 0.1563, + "grad_norm": 8.318655014038086, + "learning_rate": 1.98075121578061e-05, + "loss": 1.3351, + "step": 3126 + }, + { + "epoch": 0.1564, + "grad_norm": 4.765480041503906, + "learning_rate": 1.9806829969102356e-05, + "loss": 1.2757, + "step": 3128 + }, + { + "epoch": 0.1565, + "grad_norm": 8.076260566711426, + "learning_rate": 1.980614658546613e-05, + "loss": 1.4043, + "step": 3130 + }, + { + "epoch": 0.1566, + "grad_norm": 13.17011833190918, + "learning_rate": 1.9805462006980688e-05, + "loss": 1.0859, + "step": 3132 + }, + { + "epoch": 0.1567, + "grad_norm": 19.114961624145508, + "learning_rate": 1.9804776233729446e-05, + "loss": 1.7194, + "step": 3134 + }, + { + "epoch": 0.1568, + "grad_norm": 4.736938953399658, + "learning_rate": 1.980408926579596e-05, + "loss": 0.7879, + "step": 3136 + }, + { + "epoch": 0.1569, + "grad_norm": 8.51400089263916, + "learning_rate": 1.980340110326393e-05, + "loss": 1.0809, + "step": 3138 + }, + { + "epoch": 0.157, + "grad_norm": 1.9036192893981934, + "learning_rate": 1.9802711746217222e-05, + "loss": 1.4714, + "step": 3140 + }, + { + "epoch": 0.1571, + "grad_norm": 4.300196170806885, + "learning_rate": 1.9802021194739815e-05, + "loss": 0.7344, + "step": 3142 + }, + { + "epoch": 0.1572, + "grad_norm": 7.077413082122803, + "learning_rate": 1.9801329448915863e-05, + "loss": 0.9578, + "step": 3144 + }, + { + "epoch": 0.1573, + "grad_norm": 5.866244316101074, + "learning_rate": 1.9800636508829646e-05, + "loss": 1.6404, + "step": 3146 + }, + { + "epoch": 0.1574, + "grad_norm": 2.9871292114257812, + "learning_rate": 1.9799942374565597e-05, + "loss": 1.4633, + "step": 3148 + }, + { + "epoch": 0.1575, + "grad_norm": 2.714390993118286, + "learning_rate": 1.9799247046208297e-05, + "loss": 1.2004, + "step": 3150 + }, + { + "epoch": 0.1576, + "grad_norm": 3.0857858657836914, + "learning_rate": 1.979855052384247e-05, + "loss": 1.5724, + "step": 3152 + }, + { + "epoch": 0.1577, + "grad_norm": 12.700092315673828, + "learning_rate": 1.9797852807552983e-05, + "loss": 1.5872, + "step": 3154 + }, + { + "epoch": 0.1578, + "grad_norm": 4.746480464935303, + "learning_rate": 1.9797153897424854e-05, + "loss": 0.5526, + "step": 3156 + }, + { + "epoch": 0.1579, + "grad_norm": 6.827916145324707, + "learning_rate": 1.9796453793543237e-05, + "loss": 1.3836, + "step": 3158 + }, + { + "epoch": 0.158, + "grad_norm": 10.816061019897461, + "learning_rate": 1.979575249599344e-05, + "loss": 0.5488, + "step": 3160 + }, + { + "epoch": 0.1581, + "grad_norm": 4.69998836517334, + "learning_rate": 1.9795050004860918e-05, + "loss": 0.9951, + "step": 3162 + }, + { + "epoch": 0.1582, + "grad_norm": 2.3380491733551025, + "learning_rate": 1.9794346320231265e-05, + "loss": 1.2103, + "step": 3164 + }, + { + "epoch": 0.1583, + "grad_norm": 3.919517755508423, + "learning_rate": 1.979364144219022e-05, + "loss": 1.035, + "step": 3166 + }, + { + "epoch": 0.1584, + "grad_norm": 3.4496607780456543, + "learning_rate": 1.9792935370823676e-05, + "loss": 0.9864, + "step": 3168 + }, + { + "epoch": 0.1585, + "grad_norm": 11.731117248535156, + "learning_rate": 1.979222810621766e-05, + "loss": 1.3565, + "step": 3170 + }, + { + "epoch": 0.1586, + "grad_norm": 7.468094825744629, + "learning_rate": 1.9791519648458352e-05, + "loss": 1.2607, + "step": 3172 + }, + { + "epoch": 0.1587, + "grad_norm": 5.736685276031494, + "learning_rate": 1.9790809997632076e-05, + "loss": 0.5089, + "step": 3174 + }, + { + "epoch": 0.1588, + "grad_norm": 4.750212669372559, + "learning_rate": 1.97900991538253e-05, + "loss": 1.359, + "step": 3176 + }, + { + "epoch": 0.1589, + "grad_norm": 4.976505756378174, + "learning_rate": 1.9789387117124638e-05, + "loss": 1.0405, + "step": 3178 + }, + { + "epoch": 0.159, + "grad_norm": 7.305429458618164, + "learning_rate": 1.9788673887616852e-05, + "loss": 0.8101, + "step": 3180 + }, + { + "epoch": 0.1591, + "grad_norm": 8.060941696166992, + "learning_rate": 1.9787959465388845e-05, + "loss": 1.6194, + "step": 3182 + }, + { + "epoch": 0.1592, + "grad_norm": 3.366408109664917, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.9825, + "step": 3184 + }, + { + "epoch": 0.1593, + "grad_norm": 7.811579704284668, + "learning_rate": 1.978652704312051e-05, + "loss": 1.2901, + "step": 3186 + }, + { + "epoch": 0.1594, + "grad_norm": 4.202051639556885, + "learning_rate": 1.978580904325472e-05, + "loss": 1.091, + "step": 3188 + }, + { + "epoch": 0.1595, + "grad_norm": 20.421506881713867, + "learning_rate": 1.9785089851017788e-05, + "loss": 0.9387, + "step": 3190 + }, + { + "epoch": 0.1596, + "grad_norm": 2.760284423828125, + "learning_rate": 1.9784369466497333e-05, + "loss": 1.2892, + "step": 3192 + }, + { + "epoch": 0.1597, + "grad_norm": 4.0723042488098145, + "learning_rate": 1.9783647889781138e-05, + "loss": 1.0726, + "step": 3194 + }, + { + "epoch": 0.1598, + "grad_norm": 7.418466091156006, + "learning_rate": 1.9782925120957123e-05, + "loss": 0.7808, + "step": 3196 + }, + { + "epoch": 0.1599, + "grad_norm": 5.683874130249023, + "learning_rate": 1.9782201160113362e-05, + "loss": 1.1197, + "step": 3198 + }, + { + "epoch": 0.16, + "grad_norm": 6.888369083404541, + "learning_rate": 1.9781476007338058e-05, + "loss": 1.3967, + "step": 3200 + }, + { + "epoch": 0.1601, + "grad_norm": 5.84624719619751, + "learning_rate": 1.9780749662719573e-05, + "loss": 0.5797, + "step": 3202 + }, + { + "epoch": 0.1602, + "grad_norm": 5.968034267425537, + "learning_rate": 1.9780022126346413e-05, + "loss": 0.9077, + "step": 3204 + }, + { + "epoch": 0.1603, + "grad_norm": 4.411718845367432, + "learning_rate": 1.977929339830722e-05, + "loss": 1.1967, + "step": 3206 + }, + { + "epoch": 0.1604, + "grad_norm": 3.5853612422943115, + "learning_rate": 1.977856347869079e-05, + "loss": 1.3155, + "step": 3208 + }, + { + "epoch": 0.1605, + "grad_norm": 4.8819661140441895, + "learning_rate": 1.977783236758606e-05, + "loss": 1.3369, + "step": 3210 + }, + { + "epoch": 0.1606, + "grad_norm": 3.516836166381836, + "learning_rate": 1.977710006508212e-05, + "loss": 1.224, + "step": 3212 + }, + { + "epoch": 0.1607, + "grad_norm": 3.7575340270996094, + "learning_rate": 1.9776366571268194e-05, + "loss": 1.0844, + "step": 3214 + }, + { + "epoch": 0.1608, + "grad_norm": 6.084173202514648, + "learning_rate": 1.9775631886233655e-05, + "loss": 1.2592, + "step": 3216 + }, + { + "epoch": 0.1609, + "grad_norm": 3.2876479625701904, + "learning_rate": 1.9774896010068022e-05, + "loss": 1.4962, + "step": 3218 + }, + { + "epoch": 0.161, + "grad_norm": 4.437551498413086, + "learning_rate": 1.9774158942860962e-05, + "loss": 1.3722, + "step": 3220 + }, + { + "epoch": 0.1611, + "grad_norm": 9.307719230651855, + "learning_rate": 1.977342068470228e-05, + "loss": 1.8007, + "step": 3222 + }, + { + "epoch": 0.1612, + "grad_norm": 4.785160541534424, + "learning_rate": 1.9772681235681936e-05, + "loss": 0.8218, + "step": 3224 + }, + { + "epoch": 0.1613, + "grad_norm": 4.417387008666992, + "learning_rate": 1.9771940595890025e-05, + "loss": 1.3214, + "step": 3226 + }, + { + "epoch": 0.1614, + "grad_norm": 7.142463207244873, + "learning_rate": 1.97711987654168e-05, + "loss": 1.2514, + "step": 3228 + }, + { + "epoch": 0.1615, + "grad_norm": 7.9744367599487305, + "learning_rate": 1.977045574435264e-05, + "loss": 1.252, + "step": 3230 + }, + { + "epoch": 0.1616, + "grad_norm": 5.3444013595581055, + "learning_rate": 1.9769711532788083e-05, + "loss": 1.3918, + "step": 3232 + }, + { + "epoch": 0.1617, + "grad_norm": 1.6359069347381592, + "learning_rate": 1.976896613081381e-05, + "loss": 0.3376, + "step": 3234 + }, + { + "epoch": 0.1618, + "grad_norm": 3.140669345855713, + "learning_rate": 1.976821953852065e-05, + "loss": 1.7202, + "step": 3236 + }, + { + "epoch": 0.1619, + "grad_norm": 5.967475414276123, + "learning_rate": 1.976747175599957e-05, + "loss": 1.4509, + "step": 3238 + }, + { + "epoch": 0.162, + "grad_norm": 3.942741870880127, + "learning_rate": 1.9766722783341682e-05, + "loss": 1.0076, + "step": 3240 + }, + { + "epoch": 0.1621, + "grad_norm": 4.776872634887695, + "learning_rate": 1.976597262063825e-05, + "loss": 1.3518, + "step": 3242 + }, + { + "epoch": 0.1622, + "grad_norm": 5.3095808029174805, + "learning_rate": 1.9765221267980675e-05, + "loss": 1.2197, + "step": 3244 + }, + { + "epoch": 0.1623, + "grad_norm": 2.8227086067199707, + "learning_rate": 1.976446872546051e-05, + "loss": 0.5187, + "step": 3246 + }, + { + "epoch": 0.1624, + "grad_norm": 4.953948974609375, + "learning_rate": 1.976371499316945e-05, + "loss": 0.5317, + "step": 3248 + }, + { + "epoch": 0.1625, + "grad_norm": 1.4120123386383057, + "learning_rate": 1.9762960071199334e-05, + "loss": 0.807, + "step": 3250 + }, + { + "epoch": 0.1626, + "grad_norm": 3.466647148132324, + "learning_rate": 1.976220395964215e-05, + "loss": 1.1116, + "step": 3252 + }, + { + "epoch": 0.1627, + "grad_norm": 4.8024678230285645, + "learning_rate": 1.9761446658590024e-05, + "loss": 1.7397, + "step": 3254 + }, + { + "epoch": 0.1628, + "grad_norm": 5.792929649353027, + "learning_rate": 1.9760688168135233e-05, + "loss": 1.0008, + "step": 3256 + }, + { + "epoch": 0.1629, + "grad_norm": 8.59044361114502, + "learning_rate": 1.9759928488370195e-05, + "loss": 1.5331, + "step": 3258 + }, + { + "epoch": 0.163, + "grad_norm": 0.8282305598258972, + "learning_rate": 1.9759167619387474e-05, + "loss": 0.2167, + "step": 3260 + }, + { + "epoch": 0.1631, + "grad_norm": 3.522747278213501, + "learning_rate": 1.9758405561279787e-05, + "loss": 1.3976, + "step": 3262 + }, + { + "epoch": 0.1632, + "grad_norm": 2.9071426391601562, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.6277, + "step": 3264 + }, + { + "epoch": 0.1633, + "grad_norm": 4.742652416229248, + "learning_rate": 1.9756877878061053e-05, + "loss": 1.1857, + "step": 3266 + }, + { + "epoch": 0.1634, + "grad_norm": 5.718259334564209, + "learning_rate": 1.9756112253136154e-05, + "loss": 1.2524, + "step": 3268 + }, + { + "epoch": 0.1635, + "grad_norm": 2.0678813457489014, + "learning_rate": 1.9755345439458566e-05, + "loss": 0.7387, + "step": 3270 + }, + { + "epoch": 0.1636, + "grad_norm": 7.136361122131348, + "learning_rate": 1.9754577437121733e-05, + "loss": 1.076, + "step": 3272 + }, + { + "epoch": 0.1637, + "grad_norm": 4.209559917449951, + "learning_rate": 1.9753808246219226e-05, + "loss": 1.1813, + "step": 3274 + }, + { + "epoch": 0.1638, + "grad_norm": 4.458719730377197, + "learning_rate": 1.975303786684477e-05, + "loss": 0.6695, + "step": 3276 + }, + { + "epoch": 0.1639, + "grad_norm": 2.889893054962158, + "learning_rate": 1.9752266299092234e-05, + "loss": 1.2056, + "step": 3278 + }, + { + "epoch": 0.164, + "grad_norm": 0.699643075466156, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.2413, + "step": 3280 + }, + { + "epoch": 0.1641, + "grad_norm": 2.028191089630127, + "learning_rate": 1.975071959882912e-05, + "loss": 0.5714, + "step": 3282 + }, + { + "epoch": 0.1642, + "grad_norm": 5.169890403747559, + "learning_rate": 1.9749944466507007e-05, + "loss": 0.7186, + "step": 3284 + }, + { + "epoch": 0.1643, + "grad_norm": 0.511881947517395, + "learning_rate": 1.9749168146183734e-05, + "loss": 0.5165, + "step": 3286 + }, + { + "epoch": 0.1644, + "grad_norm": 2.2416250705718994, + "learning_rate": 1.974839063795389e-05, + "loss": 0.7809, + "step": 3288 + }, + { + "epoch": 0.1645, + "grad_norm": 6.9311699867248535, + "learning_rate": 1.974761194191222e-05, + "loss": 1.066, + "step": 3290 + }, + { + "epoch": 0.1646, + "grad_norm": 7.807737827301025, + "learning_rate": 1.9746832058153602e-05, + "loss": 1.2728, + "step": 3292 + }, + { + "epoch": 0.1647, + "grad_norm": 7.705780982971191, + "learning_rate": 1.9746050986773062e-05, + "loss": 1.0991, + "step": 3294 + }, + { + "epoch": 0.1648, + "grad_norm": 5.4895477294921875, + "learning_rate": 1.9745268727865774e-05, + "loss": 1.265, + "step": 3296 + }, + { + "epoch": 0.1649, + "grad_norm": 4.655086040496826, + "learning_rate": 1.974448528152705e-05, + "loss": 1.2003, + "step": 3298 + }, + { + "epoch": 0.165, + "grad_norm": 4.722311496734619, + "learning_rate": 1.9743700647852356e-05, + "loss": 0.5579, + "step": 3300 + }, + { + "epoch": 0.1651, + "grad_norm": 7.795215606689453, + "learning_rate": 1.974291482693729e-05, + "loss": 0.2838, + "step": 3302 + }, + { + "epoch": 0.1652, + "grad_norm": 5.558425426483154, + "learning_rate": 1.9742127818877605e-05, + "loss": 1.0952, + "step": 3304 + }, + { + "epoch": 0.1653, + "grad_norm": 11.643099784851074, + "learning_rate": 1.97413396237692e-05, + "loss": 0.644, + "step": 3306 + }, + { + "epoch": 0.1654, + "grad_norm": 7.950251579284668, + "learning_rate": 1.974055024170811e-05, + "loss": 1.24, + "step": 3308 + }, + { + "epoch": 0.1655, + "grad_norm": 5.895716190338135, + "learning_rate": 1.973975967279052e-05, + "loss": 1.4445, + "step": 3310 + }, + { + "epoch": 0.1656, + "grad_norm": 8.270905494689941, + "learning_rate": 1.9738967917112752e-05, + "loss": 1.025, + "step": 3312 + }, + { + "epoch": 0.1657, + "grad_norm": 6.038565635681152, + "learning_rate": 1.9738174974771288e-05, + "loss": 0.7612, + "step": 3314 + }, + { + "epoch": 0.1658, + "grad_norm": 7.346343040466309, + "learning_rate": 1.9737380845862745e-05, + "loss": 1.3098, + "step": 3316 + }, + { + "epoch": 0.1659, + "grad_norm": 6.179316997528076, + "learning_rate": 1.973658553048388e-05, + "loss": 0.7591, + "step": 3318 + }, + { + "epoch": 0.166, + "grad_norm": 4.599422931671143, + "learning_rate": 1.9735789028731603e-05, + "loss": 0.8948, + "step": 3320 + }, + { + "epoch": 0.1661, + "grad_norm": 3.2620038986206055, + "learning_rate": 1.9734991340702966e-05, + "loss": 0.7414, + "step": 3322 + }, + { + "epoch": 0.1662, + "grad_norm": 6.361931324005127, + "learning_rate": 1.9734192466495162e-05, + "loss": 0.4572, + "step": 3324 + }, + { + "epoch": 0.1663, + "grad_norm": 3.6503727436065674, + "learning_rate": 1.973339240620553e-05, + "loss": 1.1542, + "step": 3326 + }, + { + "epoch": 0.1664, + "grad_norm": 17.456626892089844, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.5204, + "step": 3328 + }, + { + "epoch": 0.1665, + "grad_norm": 16.23494529724121, + "learning_rate": 1.9731788727770885e-05, + "loss": 1.1975, + "step": 3330 + }, + { + "epoch": 0.1666, + "grad_norm": 11.916665077209473, + "learning_rate": 1.9730985109821268e-05, + "loss": 0.8418, + "step": 3332 + }, + { + "epoch": 0.1667, + "grad_norm": 5.965096473693848, + "learning_rate": 1.973018030618063e-05, + "loss": 1.9169, + "step": 3334 + }, + { + "epoch": 0.1668, + "grad_norm": 3.290766477584839, + "learning_rate": 1.972937431694704e-05, + "loss": 0.2119, + "step": 3336 + }, + { + "epoch": 0.1669, + "grad_norm": 6.262549877166748, + "learning_rate": 1.9728567142218705e-05, + "loss": 0.8012, + "step": 3338 + }, + { + "epoch": 0.167, + "grad_norm": 11.51827621459961, + "learning_rate": 1.972775878209397e-05, + "loss": 0.7203, + "step": 3340 + }, + { + "epoch": 0.1671, + "grad_norm": 6.797581672668457, + "learning_rate": 1.9726949236671332e-05, + "loss": 0.6113, + "step": 3342 + }, + { + "epoch": 0.1672, + "grad_norm": 6.213335990905762, + "learning_rate": 1.9726138506049438e-05, + "loss": 0.8286, + "step": 3344 + }, + { + "epoch": 0.1673, + "grad_norm": 4.0807929039001465, + "learning_rate": 1.9725326590327066e-05, + "loss": 0.9598, + "step": 3346 + }, + { + "epoch": 0.1674, + "grad_norm": 2.6338613033294678, + "learning_rate": 1.9724513489603153e-05, + "loss": 0.799, + "step": 3348 + }, + { + "epoch": 0.1675, + "grad_norm": 9.198699951171875, + "learning_rate": 1.9723699203976768e-05, + "loss": 1.9039, + "step": 3350 + }, + { + "epoch": 0.1676, + "grad_norm": 3.7369723320007324, + "learning_rate": 1.9722883733547128e-05, + "loss": 1.1728, + "step": 3352 + }, + { + "epoch": 0.1677, + "grad_norm": 7.213839054107666, + "learning_rate": 1.97220670784136e-05, + "loss": 1.1022, + "step": 3354 + }, + { + "epoch": 0.1678, + "grad_norm": 6.595007419586182, + "learning_rate": 1.9721249238675688e-05, + "loss": 1.1178, + "step": 3356 + }, + { + "epoch": 0.1679, + "grad_norm": 5.359902381896973, + "learning_rate": 1.9720430214433045e-05, + "loss": 0.359, + "step": 3358 + }, + { + "epoch": 0.168, + "grad_norm": 12.179906845092773, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.7409, + "step": 3360 + }, + { + "epoch": 0.1681, + "grad_norm": 3.827298164367676, + "learning_rate": 1.9718788612832886e-05, + "loss": 1.1147, + "step": 3362 + }, + { + "epoch": 0.1682, + "grad_norm": 12.13782787322998, + "learning_rate": 1.97179660356754e-05, + "loss": 1.18, + "step": 3364 + }, + { + "epoch": 0.1683, + "grad_norm": 3.861980438232422, + "learning_rate": 1.9717142274413223e-05, + "loss": 1.088, + "step": 3366 + }, + { + "epoch": 0.1684, + "grad_norm": 3.15468430519104, + "learning_rate": 1.971631732914674e-05, + "loss": 1.2259, + "step": 3368 + }, + { + "epoch": 0.1685, + "grad_norm": 4.506976127624512, + "learning_rate": 1.9715491199976462e-05, + "loss": 0.7112, + "step": 3370 + }, + { + "epoch": 0.1686, + "grad_norm": 8.276803970336914, + "learning_rate": 1.9714663887003055e-05, + "loss": 1.1752, + "step": 3372 + }, + { + "epoch": 0.1687, + "grad_norm": 24.516862869262695, + "learning_rate": 1.9713835390327317e-05, + "loss": 1.2224, + "step": 3374 + }, + { + "epoch": 0.1688, + "grad_norm": 3.134481430053711, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.8148, + "step": 3376 + }, + { + "epoch": 0.1689, + "grad_norm": 5.663099765777588, + "learning_rate": 1.9712174846272806e-05, + "loss": 1.0099, + "step": 3378 + }, + { + "epoch": 0.169, + "grad_norm": 17.75404930114746, + "learning_rate": 1.971134279909636e-05, + "loss": 2.609, + "step": 3380 + }, + { + "epoch": 0.1691, + "grad_norm": 5.314056396484375, + "learning_rate": 1.971050956862226e-05, + "loss": 0.9137, + "step": 3382 + }, + { + "epoch": 0.1692, + "grad_norm": 0.9673358798027039, + "learning_rate": 1.9709675154952017e-05, + "loss": 0.2382, + "step": 3384 + }, + { + "epoch": 0.1693, + "grad_norm": 2.7107348442077637, + "learning_rate": 1.9708839558187313e-05, + "loss": 1.0066, + "step": 3386 + }, + { + "epoch": 0.1694, + "grad_norm": 6.20440673828125, + "learning_rate": 1.9708002778429957e-05, + "loss": 1.7399, + "step": 3388 + }, + { + "epoch": 0.1695, + "grad_norm": 4.507438659667969, + "learning_rate": 1.970716481578191e-05, + "loss": 1.3488, + "step": 3390 + }, + { + "epoch": 0.1696, + "grad_norm": 10.166176795959473, + "learning_rate": 1.9706325670345276e-05, + "loss": 1.4627, + "step": 3392 + }, + { + "epoch": 0.1697, + "grad_norm": 4.498044490814209, + "learning_rate": 1.9705485342222302e-05, + "loss": 0.6106, + "step": 3394 + }, + { + "epoch": 0.1698, + "grad_norm": 2.310035467147827, + "learning_rate": 1.9704643831515377e-05, + "loss": 0.8652, + "step": 3396 + }, + { + "epoch": 0.1699, + "grad_norm": 3.7805655002593994, + "learning_rate": 1.970380113832704e-05, + "loss": 0.7803, + "step": 3398 + }, + { + "epoch": 0.17, + "grad_norm": 4.512269973754883, + "learning_rate": 1.9702957262759964e-05, + "loss": 1.2982, + "step": 3400 + }, + { + "epoch": 0.1701, + "grad_norm": 2.773451328277588, + "learning_rate": 1.9702112204916984e-05, + "loss": 0.6594, + "step": 3402 + }, + { + "epoch": 0.1702, + "grad_norm": 5.150076389312744, + "learning_rate": 1.970126596490106e-05, + "loss": 0.5168, + "step": 3404 + }, + { + "epoch": 0.1703, + "grad_norm": 3.4437360763549805, + "learning_rate": 1.9700418542815306e-05, + "loss": 0.8228, + "step": 3406 + }, + { + "epoch": 0.1704, + "grad_norm": 3.30574893951416, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.5552, + "step": 3408 + }, + { + "epoch": 0.1705, + "grad_norm": 4.136180400848389, + "learning_rate": 1.969872015284747e-05, + "loss": 1.0684, + "step": 3410 + }, + { + "epoch": 0.1706, + "grad_norm": 4.998510360717773, + "learning_rate": 1.969786918517233e-05, + "loss": 0.859, + "step": 3412 + }, + { + "epoch": 0.1707, + "grad_norm": 3.3695359230041504, + "learning_rate": 1.969701703584125e-05, + "loss": 1.6533, + "step": 3414 + }, + { + "epoch": 0.1708, + "grad_norm": 5.341576099395752, + "learning_rate": 1.969616370495806e-05, + "loss": 1.9909, + "step": 3416 + }, + { + "epoch": 0.1709, + "grad_norm": 3.590191125869751, + "learning_rate": 1.9695309192626736e-05, + "loss": 1.3641, + "step": 3418 + }, + { + "epoch": 0.171, + "grad_norm": 6.558607578277588, + "learning_rate": 1.9694453498951392e-05, + "loss": 0.9321, + "step": 3420 + }, + { + "epoch": 0.1711, + "grad_norm": 3.5300028324127197, + "learning_rate": 1.9693596624036294e-05, + "loss": 0.7033, + "step": 3422 + }, + { + "epoch": 0.1712, + "grad_norm": 4.3782806396484375, + "learning_rate": 1.9692738567985853e-05, + "loss": 0.4744, + "step": 3424 + }, + { + "epoch": 0.1713, + "grad_norm": 4.198268413543701, + "learning_rate": 1.9691879330904618e-05, + "loss": 0.7465, + "step": 3426 + }, + { + "epoch": 0.1714, + "grad_norm": 5.230304718017578, + "learning_rate": 1.9691018912897285e-05, + "loss": 1.2495, + "step": 3428 + }, + { + "epoch": 0.1715, + "grad_norm": 2.957623243331909, + "learning_rate": 1.9690157314068696e-05, + "loss": 1.6095, + "step": 3430 + }, + { + "epoch": 0.1716, + "grad_norm": 5.3281097412109375, + "learning_rate": 1.968929453452383e-05, + "loss": 1.3356, + "step": 3432 + }, + { + "epoch": 0.1717, + "grad_norm": 13.442365646362305, + "learning_rate": 1.968843057436782e-05, + "loss": 0.7324, + "step": 3434 + }, + { + "epoch": 0.1718, + "grad_norm": 2.224287509918213, + "learning_rate": 1.9687565433705926e-05, + "loss": 0.6296, + "step": 3436 + }, + { + "epoch": 0.1719, + "grad_norm": 4.796894073486328, + "learning_rate": 1.9686699112643574e-05, + "loss": 1.0804, + "step": 3438 + }, + { + "epoch": 0.172, + "grad_norm": 5.427393436431885, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.9987, + "step": 3440 + }, + { + "epoch": 0.1721, + "grad_norm": 5.048694610595703, + "learning_rate": 1.968496292973985e-05, + "loss": 0.3904, + "step": 3442 + }, + { + "epoch": 0.1722, + "grad_norm": 3.773087501525879, + "learning_rate": 1.968409306811004e-05, + "loss": 1.3469, + "step": 3444 + }, + { + "epoch": 0.1723, + "grad_norm": 6.479730129241943, + "learning_rate": 1.9683222026502856e-05, + "loss": 1.1252, + "step": 3446 + }, + { + "epoch": 0.1724, + "grad_norm": 5.525974750518799, + "learning_rate": 1.9682349805024447e-05, + "loss": 1.2939, + "step": 3448 + }, + { + "epoch": 0.1725, + "grad_norm": 4.211203575134277, + "learning_rate": 1.968147640378108e-05, + "loss": 1.0799, + "step": 3450 + }, + { + "epoch": 0.1726, + "grad_norm": 5.850316047668457, + "learning_rate": 1.968060182287918e-05, + "loss": 0.7109, + "step": 3452 + }, + { + "epoch": 0.1727, + "grad_norm": 29.529144287109375, + "learning_rate": 1.9679726062425314e-05, + "loss": 2.0675, + "step": 3454 + }, + { + "epoch": 0.1728, + "grad_norm": 9.624133110046387, + "learning_rate": 1.967884912252619e-05, + "loss": 1.2509, + "step": 3456 + }, + { + "epoch": 0.1729, + "grad_norm": 3.0761775970458984, + "learning_rate": 1.9677971003288657e-05, + "loss": 0.88, + "step": 3458 + }, + { + "epoch": 0.173, + "grad_norm": 6.632855415344238, + "learning_rate": 1.9677091704819714e-05, + "loss": 1.1849, + "step": 3460 + }, + { + "epoch": 0.1731, + "grad_norm": 3.824172258377075, + "learning_rate": 1.96762112272265e-05, + "loss": 0.7773, + "step": 3462 + }, + { + "epoch": 0.1732, + "grad_norm": 5.251998424530029, + "learning_rate": 1.96753295706163e-05, + "loss": 1.213, + "step": 3464 + }, + { + "epoch": 0.1733, + "grad_norm": 3.798266649246216, + "learning_rate": 1.9674446735096542e-05, + "loss": 0.7352, + "step": 3466 + }, + { + "epoch": 0.1734, + "grad_norm": 7.548175811767578, + "learning_rate": 1.9673562720774792e-05, + "loss": 1.0819, + "step": 3468 + }, + { + "epoch": 0.1735, + "grad_norm": 5.704505920410156, + "learning_rate": 1.967267752775877e-05, + "loss": 1.3436, + "step": 3470 + }, + { + "epoch": 0.1736, + "grad_norm": 3.9181015491485596, + "learning_rate": 1.967179115615633e-05, + "loss": 1.4581, + "step": 3472 + }, + { + "epoch": 0.1737, + "grad_norm": 3.935399293899536, + "learning_rate": 1.9670903606075475e-05, + "loss": 0.5333, + "step": 3474 + }, + { + "epoch": 0.1738, + "grad_norm": 2.416853189468384, + "learning_rate": 1.9670014877624353e-05, + "loss": 1.0983, + "step": 3476 + }, + { + "epoch": 0.1739, + "grad_norm": 8.355942726135254, + "learning_rate": 1.9669124970911245e-05, + "loss": 0.8495, + "step": 3478 + }, + { + "epoch": 0.174, + "grad_norm": 2.9654757976531982, + "learning_rate": 1.9668233886044597e-05, + "loss": 1.1057, + "step": 3480 + }, + { + "epoch": 0.1741, + "grad_norm": 3.517077922821045, + "learning_rate": 1.966734162313297e-05, + "loss": 1.5296, + "step": 3482 + }, + { + "epoch": 0.1742, + "grad_norm": 3.2559874057769775, + "learning_rate": 1.9666448182285095e-05, + "loss": 1.0303, + "step": 3484 + }, + { + "epoch": 0.1743, + "grad_norm": 8.706599235534668, + "learning_rate": 1.9665553563609826e-05, + "loss": 2.1027, + "step": 3486 + }, + { + "epoch": 0.1744, + "grad_norm": 3.7548739910125732, + "learning_rate": 1.9664657767216176e-05, + "loss": 1.5851, + "step": 3488 + }, + { + "epoch": 0.1745, + "grad_norm": 3.117748498916626, + "learning_rate": 1.9663760793213297e-05, + "loss": 0.5984, + "step": 3490 + }, + { + "epoch": 0.1746, + "grad_norm": 3.4326868057250977, + "learning_rate": 1.966286264171047e-05, + "loss": 0.8103, + "step": 3492 + }, + { + "epoch": 0.1747, + "grad_norm": 4.538660049438477, + "learning_rate": 1.966196331281715e-05, + "loss": 1.412, + "step": 3494 + }, + { + "epoch": 0.1748, + "grad_norm": 6.14824914932251, + "learning_rate": 1.9661062806642903e-05, + "loss": 0.9825, + "step": 3496 + }, + { + "epoch": 0.1749, + "grad_norm": 3.931605100631714, + "learning_rate": 1.966016112329746e-05, + "loss": 0.7728, + "step": 3498 + }, + { + "epoch": 0.175, + "grad_norm": 5.976369857788086, + "learning_rate": 1.9659258262890683e-05, + "loss": 0.3962, + "step": 3500 + }, + { + "epoch": 0.1751, + "grad_norm": 5.059367656707764, + "learning_rate": 1.965835422553259e-05, + "loss": 0.7956, + "step": 3502 + }, + { + "epoch": 0.1752, + "grad_norm": 8.21647834777832, + "learning_rate": 1.9657449011333328e-05, + "loss": 1.1508, + "step": 3504 + }, + { + "epoch": 0.1753, + "grad_norm": 6.31795597076416, + "learning_rate": 1.9656542620403203e-05, + "loss": 0.9569, + "step": 3506 + }, + { + "epoch": 0.1754, + "grad_norm": 2.6270463466644287, + "learning_rate": 1.9655635052852648e-05, + "loss": 0.7151, + "step": 3508 + }, + { + "epoch": 0.1755, + "grad_norm": 3.533998966217041, + "learning_rate": 1.9654726308792252e-05, + "loss": 2.5773, + "step": 3510 + }, + { + "epoch": 0.1756, + "grad_norm": 4.51805305480957, + "learning_rate": 1.965381638833274e-05, + "loss": 0.4661, + "step": 3512 + }, + { + "epoch": 0.1757, + "grad_norm": 3.006725788116455, + "learning_rate": 1.9652905291584987e-05, + "loss": 0.6427, + "step": 3514 + }, + { + "epoch": 0.1758, + "grad_norm": 14.856990814208984, + "learning_rate": 1.9651993018660002e-05, + "loss": 1.6172, + "step": 3516 + }, + { + "epoch": 0.1759, + "grad_norm": 6.542018413543701, + "learning_rate": 1.9651079569668944e-05, + "loss": 1.1238, + "step": 3518 + }, + { + "epoch": 0.176, + "grad_norm": 8.760817527770996, + "learning_rate": 1.9650164944723116e-05, + "loss": 1.1319, + "step": 3520 + }, + { + "epoch": 0.1761, + "grad_norm": 2.3782193660736084, + "learning_rate": 1.9649249143933963e-05, + "loss": 0.875, + "step": 3522 + }, + { + "epoch": 0.1762, + "grad_norm": 5.4196624755859375, + "learning_rate": 1.9648332167413067e-05, + "loss": 1.371, + "step": 3524 + }, + { + "epoch": 0.1763, + "grad_norm": 5.494995594024658, + "learning_rate": 1.964741401527217e-05, + "loss": 0.6438, + "step": 3526 + }, + { + "epoch": 0.1764, + "grad_norm": 7.365622043609619, + "learning_rate": 1.9646494687623135e-05, + "loss": 1.215, + "step": 3528 + }, + { + "epoch": 0.1765, + "grad_norm": 7.010932445526123, + "learning_rate": 1.9645574184577982e-05, + "loss": 1.216, + "step": 3530 + }, + { + "epoch": 0.1766, + "grad_norm": 2.779446601867676, + "learning_rate": 1.9644652506248872e-05, + "loss": 0.5824, + "step": 3532 + }, + { + "epoch": 0.1767, + "grad_norm": 12.461880683898926, + "learning_rate": 1.9643729652748115e-05, + "loss": 0.7278, + "step": 3534 + }, + { + "epoch": 0.1768, + "grad_norm": 5.457529544830322, + "learning_rate": 1.964280562418815e-05, + "loss": 0.7968, + "step": 3536 + }, + { + "epoch": 0.1769, + "grad_norm": 1.6134533882141113, + "learning_rate": 1.9641880420681567e-05, + "loss": 1.2781, + "step": 3538 + }, + { + "epoch": 0.177, + "grad_norm": 5.873092174530029, + "learning_rate": 1.96409540423411e-05, + "loss": 1.2432, + "step": 3540 + }, + { + "epoch": 0.1771, + "grad_norm": 2.417600154876709, + "learning_rate": 1.9640026489279633e-05, + "loss": 1.0018, + "step": 3542 + }, + { + "epoch": 0.1772, + "grad_norm": 3.8101229667663574, + "learning_rate": 1.9639097761610174e-05, + "loss": 0.6626, + "step": 3544 + }, + { + "epoch": 0.1773, + "grad_norm": 8.055438041687012, + "learning_rate": 1.9638167859445894e-05, + "loss": 1.8091, + "step": 3546 + }, + { + "epoch": 0.1774, + "grad_norm": 14.192124366760254, + "learning_rate": 1.96372367829001e-05, + "loss": 1.1693, + "step": 3548 + }, + { + "epoch": 0.1775, + "grad_norm": 3.866668462753296, + "learning_rate": 1.963630453208623e-05, + "loss": 1.0038, + "step": 3550 + }, + { + "epoch": 0.1776, + "grad_norm": 4.334395885467529, + "learning_rate": 1.963537110711789e-05, + "loss": 1.0637, + "step": 3552 + }, + { + "epoch": 0.1777, + "grad_norm": 3.522068977355957, + "learning_rate": 1.96344365081088e-05, + "loss": 0.9879, + "step": 3554 + }, + { + "epoch": 0.1778, + "grad_norm": 7.1617584228515625, + "learning_rate": 1.963350073517285e-05, + "loss": 1.1271, + "step": 3556 + }, + { + "epoch": 0.1779, + "grad_norm": 2.9657628536224365, + "learning_rate": 1.9632563788424055e-05, + "loss": 0.3552, + "step": 3558 + }, + { + "epoch": 0.178, + "grad_norm": 7.423388481140137, + "learning_rate": 1.9631625667976584e-05, + "loss": 1.3387, + "step": 3560 + }, + { + "epoch": 0.1781, + "grad_norm": 12.328094482421875, + "learning_rate": 1.9630686373944738e-05, + "loss": 0.7519, + "step": 3562 + }, + { + "epoch": 0.1782, + "grad_norm": 2.5492336750030518, + "learning_rate": 1.9629745906442973e-05, + "loss": 0.7096, + "step": 3564 + }, + { + "epoch": 0.1783, + "grad_norm": 5.000533103942871, + "learning_rate": 1.9628804265585878e-05, + "loss": 1.4253, + "step": 3566 + }, + { + "epoch": 0.1784, + "grad_norm": 8.016119003295898, + "learning_rate": 1.962786145148819e-05, + "loss": 1.5305, + "step": 3568 + }, + { + "epoch": 0.1785, + "grad_norm": 2.876251220703125, + "learning_rate": 1.962691746426479e-05, + "loss": 0.5435, + "step": 3570 + }, + { + "epoch": 0.1786, + "grad_norm": 5.24503755569458, + "learning_rate": 1.9625972304030697e-05, + "loss": 1.0688, + "step": 3572 + }, + { + "epoch": 0.1787, + "grad_norm": 4.877496242523193, + "learning_rate": 1.9625025970901078e-05, + "loss": 1.2078, + "step": 3574 + }, + { + "epoch": 0.1788, + "grad_norm": 4.297965049743652, + "learning_rate": 1.962407846499124e-05, + "loss": 1.2588, + "step": 3576 + }, + { + "epoch": 0.1789, + "grad_norm": 4.7482709884643555, + "learning_rate": 1.9623129786416635e-05, + "loss": 0.9025, + "step": 3578 + }, + { + "epoch": 0.179, + "grad_norm": 4.835597038269043, + "learning_rate": 1.9622179935292855e-05, + "loss": 0.9778, + "step": 3580 + }, + { + "epoch": 0.1791, + "grad_norm": 7.864474773406982, + "learning_rate": 1.9621228911735637e-05, + "loss": 1.3047, + "step": 3582 + }, + { + "epoch": 0.1792, + "grad_norm": 3.750208616256714, + "learning_rate": 1.962027671586086e-05, + "loss": 0.2599, + "step": 3584 + }, + { + "epoch": 0.1793, + "grad_norm": 3.559546947479248, + "learning_rate": 1.961932334778455e-05, + "loss": 1.2862, + "step": 3586 + }, + { + "epoch": 0.1794, + "grad_norm": 4.463189125061035, + "learning_rate": 1.9618368807622863e-05, + "loss": 0.7898, + "step": 3588 + }, + { + "epoch": 0.1795, + "grad_norm": 6.204476356506348, + "learning_rate": 1.9617413095492114e-05, + "loss": 0.9427, + "step": 3590 + }, + { + "epoch": 0.1796, + "grad_norm": 7.522671222686768, + "learning_rate": 1.9616456211508756e-05, + "loss": 1.0428, + "step": 3592 + }, + { + "epoch": 0.1797, + "grad_norm": 2.1397197246551514, + "learning_rate": 1.9615498155789373e-05, + "loss": 1.2268, + "step": 3594 + }, + { + "epoch": 0.1798, + "grad_norm": 15.716395378112793, + "learning_rate": 1.961453892845071e-05, + "loss": 2.4664, + "step": 3596 + }, + { + "epoch": 0.1799, + "grad_norm": 4.866504669189453, + "learning_rate": 1.9613578529609642e-05, + "loss": 0.5379, + "step": 3598 + }, + { + "epoch": 0.18, + "grad_norm": 11.300771713256836, + "learning_rate": 1.961261695938319e-05, + "loss": 1.4512, + "step": 3600 + }, + { + "epoch": 0.1801, + "grad_norm": 4.767409801483154, + "learning_rate": 1.961165421788852e-05, + "loss": 1.4284, + "step": 3602 + }, + { + "epoch": 0.1802, + "grad_norm": 11.65634536743164, + "learning_rate": 1.961069030524294e-05, + "loss": 0.6712, + "step": 3604 + }, + { + "epoch": 0.1803, + "grad_norm": 3.715580463409424, + "learning_rate": 1.9609725221563898e-05, + "loss": 1.5484, + "step": 3606 + }, + { + "epoch": 0.1804, + "grad_norm": 8.684478759765625, + "learning_rate": 1.9608758966968987e-05, + "loss": 1.1309, + "step": 3608 + }, + { + "epoch": 0.1805, + "grad_norm": 6.294127941131592, + "learning_rate": 1.9607791541575944e-05, + "loss": 1.3617, + "step": 3610 + }, + { + "epoch": 0.1806, + "grad_norm": 5.147834777832031, + "learning_rate": 1.9606822945502642e-05, + "loss": 0.9487, + "step": 3612 + }, + { + "epoch": 0.1807, + "grad_norm": 6.61224889755249, + "learning_rate": 1.9605853178867107e-05, + "loss": 0.8412, + "step": 3614 + }, + { + "epoch": 0.1808, + "grad_norm": 6.111036777496338, + "learning_rate": 1.96048822417875e-05, + "loss": 0.6834, + "step": 3616 + }, + { + "epoch": 0.1809, + "grad_norm": 5.059226989746094, + "learning_rate": 1.9603910134382124e-05, + "loss": 1.1857, + "step": 3618 + }, + { + "epoch": 0.181, + "grad_norm": 13.335984230041504, + "learning_rate": 1.9602936856769432e-05, + "loss": 1.5079, + "step": 3620 + }, + { + "epoch": 0.1811, + "grad_norm": 7.632668972015381, + "learning_rate": 1.960196240906801e-05, + "loss": 1.1579, + "step": 3622 + }, + { + "epoch": 0.1812, + "grad_norm": 5.221138000488281, + "learning_rate": 1.96009867913966e-05, + "loss": 0.7553, + "step": 3624 + }, + { + "epoch": 0.1813, + "grad_norm": 5.519922256469727, + "learning_rate": 1.9600010003874067e-05, + "loss": 1.4995, + "step": 3626 + }, + { + "epoch": 0.1814, + "grad_norm": 12.409165382385254, + "learning_rate": 1.9599032046619437e-05, + "loss": 1.2599, + "step": 3628 + }, + { + "epoch": 0.1815, + "grad_norm": 2.2666499614715576, + "learning_rate": 1.959805291975187e-05, + "loss": 1.6225, + "step": 3630 + }, + { + "epoch": 0.1816, + "grad_norm": 2.9312219619750977, + "learning_rate": 1.9597072623390668e-05, + "loss": 1.0779, + "step": 3632 + }, + { + "epoch": 0.1817, + "grad_norm": 3.7696774005889893, + "learning_rate": 1.959609115765528e-05, + "loss": 1.0443, + "step": 3634 + }, + { + "epoch": 0.1818, + "grad_norm": 5.269700527191162, + "learning_rate": 1.959510852266529e-05, + "loss": 1.1438, + "step": 3636 + }, + { + "epoch": 0.1819, + "grad_norm": 7.025585651397705, + "learning_rate": 1.959412471854043e-05, + "loss": 0.7659, + "step": 3638 + }, + { + "epoch": 0.182, + "grad_norm": 7.015717506408691, + "learning_rate": 1.9593139745400575e-05, + "loss": 0.6571, + "step": 3640 + }, + { + "epoch": 0.1821, + "grad_norm": 8.150612831115723, + "learning_rate": 1.9592153603365746e-05, + "loss": 1.5436, + "step": 3642 + }, + { + "epoch": 0.1822, + "grad_norm": 6.609885215759277, + "learning_rate": 1.9591166292556093e-05, + "loss": 0.9391, + "step": 3644 + }, + { + "epoch": 0.1823, + "grad_norm": 7.188622951507568, + "learning_rate": 1.9590177813091918e-05, + "loss": 1.677, + "step": 3646 + }, + { + "epoch": 0.1824, + "grad_norm": 5.56619119644165, + "learning_rate": 1.958918816509367e-05, + "loss": 1.2058, + "step": 3648 + }, + { + "epoch": 0.1825, + "grad_norm": 11.25572681427002, + "learning_rate": 1.958819734868193e-05, + "loss": 0.675, + "step": 3650 + }, + { + "epoch": 0.1826, + "grad_norm": 9.429428100585938, + "learning_rate": 1.9587205363977428e-05, + "loss": 0.6154, + "step": 3652 + }, + { + "epoch": 0.1827, + "grad_norm": 5.83096170425415, + "learning_rate": 1.9586212211101036e-05, + "loss": 0.6118, + "step": 3654 + }, + { + "epoch": 0.1828, + "grad_norm": 12.151270866394043, + "learning_rate": 1.958521789017376e-05, + "loss": 1.1769, + "step": 3656 + }, + { + "epoch": 0.1829, + "grad_norm": 9.613458633422852, + "learning_rate": 1.958422240131676e-05, + "loss": 1.1961, + "step": 3658 + }, + { + "epoch": 0.183, + "grad_norm": 2.723691463470459, + "learning_rate": 1.9583225744651334e-05, + "loss": 1.4039, + "step": 3660 + }, + { + "epoch": 0.1831, + "grad_norm": 10.868335723876953, + "learning_rate": 1.9582227920298916e-05, + "loss": 1.8118, + "step": 3662 + }, + { + "epoch": 0.1832, + "grad_norm": 4.5961456298828125, + "learning_rate": 1.95812289283811e-05, + "loss": 1.3557, + "step": 3664 + }, + { + "epoch": 0.1833, + "grad_norm": 5.636166095733643, + "learning_rate": 1.9580228769019593e-05, + "loss": 0.6416, + "step": 3666 + }, + { + "epoch": 0.1834, + "grad_norm": 6.674808979034424, + "learning_rate": 1.9579227442336276e-05, + "loss": 2.0506, + "step": 3668 + }, + { + "epoch": 0.1835, + "grad_norm": 5.589504241943359, + "learning_rate": 1.957822494845315e-05, + "loss": 0.981, + "step": 3670 + }, + { + "epoch": 0.1836, + "grad_norm": 4.114501476287842, + "learning_rate": 1.9577221287492368e-05, + "loss": 0.9584, + "step": 3672 + }, + { + "epoch": 0.1837, + "grad_norm": 4.022830009460449, + "learning_rate": 1.9576216459576222e-05, + "loss": 0.879, + "step": 3674 + }, + { + "epoch": 0.1838, + "grad_norm": 12.398062705993652, + "learning_rate": 1.957521046482715e-05, + "loss": 1.3128, + "step": 3676 + }, + { + "epoch": 0.1839, + "grad_norm": 7.967578887939453, + "learning_rate": 1.9574203303367728e-05, + "loss": 0.8565, + "step": 3678 + }, + { + "epoch": 0.184, + "grad_norm": 3.555532932281494, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.7845, + "step": 3680 + }, + { + "epoch": 0.1841, + "grad_norm": 6.869021892547607, + "learning_rate": 1.9572185480808848e-05, + "loss": 1.2293, + "step": 3682 + }, + { + "epoch": 0.1842, + "grad_norm": 9.970760345458984, + "learning_rate": 1.9571174819955264e-05, + "loss": 0.7073, + "step": 3684 + }, + { + "epoch": 0.1843, + "grad_norm": 4.284979343414307, + "learning_rate": 1.9570162992883056e-05, + "loss": 0.8173, + "step": 3686 + }, + { + "epoch": 0.1844, + "grad_norm": 22.142621994018555, + "learning_rate": 1.9569149999715514e-05, + "loss": 0.9957, + "step": 3688 + }, + { + "epoch": 0.1845, + "grad_norm": 9.034977912902832, + "learning_rate": 1.956813584057608e-05, + "loss": 1.4411, + "step": 3690 + }, + { + "epoch": 0.1846, + "grad_norm": 6.037625312805176, + "learning_rate": 1.9567120515588307e-05, + "loss": 0.8543, + "step": 3692 + }, + { + "epoch": 0.1847, + "grad_norm": 11.387916564941406, + "learning_rate": 1.9566104024875924e-05, + "loss": 1.5936, + "step": 3694 + }, + { + "epoch": 0.1848, + "grad_norm": 10.5921049118042, + "learning_rate": 1.956508636856278e-05, + "loss": 1.3439, + "step": 3696 + }, + { + "epoch": 0.1849, + "grad_norm": 5.683332920074463, + "learning_rate": 1.9564067546772877e-05, + "loss": 1.1922, + "step": 3698 + }, + { + "epoch": 0.185, + "grad_norm": 17.14014434814453, + "learning_rate": 1.9563047559630356e-05, + "loss": 0.8446, + "step": 3700 + }, + { + "epoch": 0.1851, + "grad_norm": 8.75061321258545, + "learning_rate": 1.9562026407259497e-05, + "loss": 2.1146, + "step": 3702 + }, + { + "epoch": 0.1852, + "grad_norm": 4.486917972564697, + "learning_rate": 1.9561004089784726e-05, + "loss": 0.6552, + "step": 3704 + }, + { + "epoch": 0.1853, + "grad_norm": 6.099549770355225, + "learning_rate": 1.9559980607330607e-05, + "loss": 0.3081, + "step": 3706 + }, + { + "epoch": 0.1854, + "grad_norm": 5.253990650177002, + "learning_rate": 1.9558955960021847e-05, + "loss": 0.8229, + "step": 3708 + }, + { + "epoch": 0.1855, + "grad_norm": 7.3122992515563965, + "learning_rate": 1.9557930147983303e-05, + "loss": 1.0684, + "step": 3710 + }, + { + "epoch": 0.1856, + "grad_norm": 7.367587566375732, + "learning_rate": 1.9556903171339963e-05, + "loss": 1.8183, + "step": 3712 + }, + { + "epoch": 0.1857, + "grad_norm": 5.043776035308838, + "learning_rate": 1.9555875030216957e-05, + "loss": 1.4393, + "step": 3714 + }, + { + "epoch": 0.1858, + "grad_norm": 11.444462776184082, + "learning_rate": 1.9554845724739565e-05, + "loss": 0.8515, + "step": 3716 + }, + { + "epoch": 0.1859, + "grad_norm": 5.339797019958496, + "learning_rate": 1.9553815255033208e-05, + "loss": 1.0871, + "step": 3718 + }, + { + "epoch": 0.186, + "grad_norm": 17.04717254638672, + "learning_rate": 1.9552783621223437e-05, + "loss": 0.8539, + "step": 3720 + }, + { + "epoch": 0.1861, + "grad_norm": 4.5589704513549805, + "learning_rate": 1.9551750823435963e-05, + "loss": 1.8545, + "step": 3722 + }, + { + "epoch": 0.1862, + "grad_norm": 6.257421970367432, + "learning_rate": 1.9550716861796623e-05, + "loss": 0.8838, + "step": 3724 + }, + { + "epoch": 0.1863, + "grad_norm": 2.9653210639953613, + "learning_rate": 1.95496817364314e-05, + "loss": 1.268, + "step": 3726 + }, + { + "epoch": 0.1864, + "grad_norm": 16.991336822509766, + "learning_rate": 1.9548645447466433e-05, + "loss": 0.9271, + "step": 3728 + }, + { + "epoch": 0.1865, + "grad_norm": 3.0153753757476807, + "learning_rate": 1.954760799502798e-05, + "loss": 0.574, + "step": 3730 + }, + { + "epoch": 0.1866, + "grad_norm": 5.852304458618164, + "learning_rate": 1.9546569379242446e-05, + "loss": 1.0166, + "step": 3732 + }, + { + "epoch": 0.1867, + "grad_norm": 1.7926685810089111, + "learning_rate": 1.95455296002364e-05, + "loss": 0.6848, + "step": 3734 + }, + { + "epoch": 0.1868, + "grad_norm": 11.878852844238281, + "learning_rate": 1.9544488658136522e-05, + "loss": 1.093, + "step": 3736 + }, + { + "epoch": 0.1869, + "grad_norm": 3.276874542236328, + "learning_rate": 1.954344655306965e-05, + "loss": 1.0593, + "step": 3738 + }, + { + "epoch": 0.187, + "grad_norm": 8.699581146240234, + "learning_rate": 1.954240328516277e-05, + "loss": 1.6997, + "step": 3740 + }, + { + "epoch": 0.1871, + "grad_norm": 4.3797807693481445, + "learning_rate": 1.9541358854542993e-05, + "loss": 0.662, + "step": 3742 + }, + { + "epoch": 0.1872, + "grad_norm": 5.771872520446777, + "learning_rate": 1.954031326133758e-05, + "loss": 1.1233, + "step": 3744 + }, + { + "epoch": 0.1873, + "grad_norm": 2.6913576126098633, + "learning_rate": 1.9539266505673938e-05, + "loss": 0.7318, + "step": 3746 + }, + { + "epoch": 0.1874, + "grad_norm": 5.108476161956787, + "learning_rate": 1.9538218587679605e-05, + "loss": 0.9354, + "step": 3748 + }, + { + "epoch": 0.1875, + "grad_norm": 7.981147289276123, + "learning_rate": 1.953716950748227e-05, + "loss": 0.9106, + "step": 3750 + }, + { + "epoch": 0.1876, + "grad_norm": 9.044285774230957, + "learning_rate": 1.9536119265209763e-05, + "loss": 0.7175, + "step": 3752 + }, + { + "epoch": 0.1877, + "grad_norm": 11.214157104492188, + "learning_rate": 1.9535067860990046e-05, + "loss": 2.4633, + "step": 3754 + }, + { + "epoch": 0.1878, + "grad_norm": 2.717444896697998, + "learning_rate": 1.9534015294951235e-05, + "loss": 1.5324, + "step": 3756 + }, + { + "epoch": 0.1879, + "grad_norm": 17.50285530090332, + "learning_rate": 1.9532961567221577e-05, + "loss": 1.9567, + "step": 3758 + }, + { + "epoch": 0.188, + "grad_norm": 8.508543014526367, + "learning_rate": 1.9531906677929472e-05, + "loss": 1.6245, + "step": 3760 + }, + { + "epoch": 0.1881, + "grad_norm": 3.4432456493377686, + "learning_rate": 1.953085062720345e-05, + "loss": 0.9881, + "step": 3762 + }, + { + "epoch": 0.1882, + "grad_norm": 5.243241310119629, + "learning_rate": 1.952979341517219e-05, + "loss": 0.9784, + "step": 3764 + }, + { + "epoch": 0.1883, + "grad_norm": 6.175049304962158, + "learning_rate": 1.952873504196451e-05, + "loss": 1.6464, + "step": 3766 + }, + { + "epoch": 0.1884, + "grad_norm": 18.974966049194336, + "learning_rate": 1.9527675507709368e-05, + "loss": 0.5897, + "step": 3768 + }, + { + "epoch": 0.1885, + "grad_norm": 13.893299102783203, + "learning_rate": 1.9526614812535866e-05, + "loss": 1.2935, + "step": 3770 + }, + { + "epoch": 0.1886, + "grad_norm": 6.569645881652832, + "learning_rate": 1.9525552956573244e-05, + "loss": 0.7849, + "step": 3772 + }, + { + "epoch": 0.1887, + "grad_norm": 3.4405324459075928, + "learning_rate": 1.9524489939950892e-05, + "loss": 0.6265, + "step": 3774 + }, + { + "epoch": 0.1888, + "grad_norm": 5.285586833953857, + "learning_rate": 1.9523425762798328e-05, + "loss": 1.1702, + "step": 3776 + }, + { + "epoch": 0.1889, + "grad_norm": 2.553706169128418, + "learning_rate": 1.9522360425245226e-05, + "loss": 1.4972, + "step": 3778 + }, + { + "epoch": 0.189, + "grad_norm": 6.215848922729492, + "learning_rate": 1.9521293927421388e-05, + "loss": 1.5955, + "step": 3780 + }, + { + "epoch": 0.1891, + "grad_norm": 4.375369548797607, + "learning_rate": 1.9520226269456767e-05, + "loss": 0.864, + "step": 3782 + }, + { + "epoch": 0.1892, + "grad_norm": 2.3966686725616455, + "learning_rate": 1.9519157451481453e-05, + "loss": 0.7775, + "step": 3784 + }, + { + "epoch": 0.1893, + "grad_norm": 12.216120719909668, + "learning_rate": 1.951808747362568e-05, + "loss": 0.9709, + "step": 3786 + }, + { + "epoch": 0.1894, + "grad_norm": 4.150874137878418, + "learning_rate": 1.9517016336019817e-05, + "loss": 1.0839, + "step": 3788 + }, + { + "epoch": 0.1895, + "grad_norm": 5.053764820098877, + "learning_rate": 1.9515944038794384e-05, + "loss": 1.0557, + "step": 3790 + }, + { + "epoch": 0.1896, + "grad_norm": 7.356159687042236, + "learning_rate": 1.951487058208003e-05, + "loss": 0.8965, + "step": 3792 + }, + { + "epoch": 0.1897, + "grad_norm": 6.5343427658081055, + "learning_rate": 1.9513795966007563e-05, + "loss": 1.1799, + "step": 3794 + }, + { + "epoch": 0.1898, + "grad_norm": 1.162221908569336, + "learning_rate": 1.9512720190707915e-05, + "loss": 1.2191, + "step": 3796 + }, + { + "epoch": 0.1899, + "grad_norm": 1.9427121877670288, + "learning_rate": 1.9511643256312165e-05, + "loss": 0.9997, + "step": 3798 + }, + { + "epoch": 0.19, + "grad_norm": 7.352926254272461, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.7768, + "step": 3800 + }, + { + "epoch": 0.1901, + "grad_norm": 1.5866447687149048, + "learning_rate": 1.9509485910757393e-05, + "loss": 1.0264, + "step": 3802 + }, + { + "epoch": 0.1902, + "grad_norm": 7.750997066497803, + "learning_rate": 1.9508405499861235e-05, + "loss": 0.997, + "step": 3804 + }, + { + "epoch": 0.1903, + "grad_norm": 2.8155038356781006, + "learning_rate": 1.950732393039471e-05, + "loss": 1.2107, + "step": 3806 + }, + { + "epoch": 0.1904, + "grad_norm": 0.12075606733560562, + "learning_rate": 1.95062412024896e-05, + "loss": 0.642, + "step": 3808 + }, + { + "epoch": 0.1905, + "grad_norm": 4.869328022003174, + "learning_rate": 1.950515731627784e-05, + "loss": 0.3114, + "step": 3810 + }, + { + "epoch": 0.1906, + "grad_norm": 3.206918716430664, + "learning_rate": 1.9504072271891486e-05, + "loss": 0.9029, + "step": 3812 + }, + { + "epoch": 0.1907, + "grad_norm": 3.0728392601013184, + "learning_rate": 1.950298606946276e-05, + "loss": 1.0011, + "step": 3814 + }, + { + "epoch": 0.1908, + "grad_norm": 5.480174541473389, + "learning_rate": 1.950189870912401e-05, + "loss": 1.0054, + "step": 3816 + }, + { + "epoch": 0.1909, + "grad_norm": 2.9480764865875244, + "learning_rate": 1.9500810191007717e-05, + "loss": 1.4937, + "step": 3818 + }, + { + "epoch": 0.191, + "grad_norm": 4.602784633636475, + "learning_rate": 1.9499720515246524e-05, + "loss": 0.9323, + "step": 3820 + }, + { + "epoch": 0.1911, + "grad_norm": 4.5084991455078125, + "learning_rate": 1.9498629681973208e-05, + "loss": 1.1936, + "step": 3822 + }, + { + "epoch": 0.1912, + "grad_norm": 13.928677558898926, + "learning_rate": 1.949753769132067e-05, + "loss": 0.5946, + "step": 3824 + }, + { + "epoch": 0.1913, + "grad_norm": 2.3143789768218994, + "learning_rate": 1.9496444543421975e-05, + "loss": 0.6854, + "step": 3826 + }, + { + "epoch": 0.1914, + "grad_norm": 3.730219602584839, + "learning_rate": 1.949535023841032e-05, + "loss": 1.0649, + "step": 3828 + }, + { + "epoch": 0.1915, + "grad_norm": 3.5458195209503174, + "learning_rate": 1.949425477641904e-05, + "loss": 0.6483, + "step": 3830 + }, + { + "epoch": 0.1916, + "grad_norm": 12.461370468139648, + "learning_rate": 1.9493158157581617e-05, + "loss": 1.1378, + "step": 3832 + }, + { + "epoch": 0.1917, + "grad_norm": 3.185137987136841, + "learning_rate": 1.9492060382031663e-05, + "loss": 1.2425, + "step": 3834 + }, + { + "epoch": 0.1918, + "grad_norm": 2.031921625137329, + "learning_rate": 1.9490961449902946e-05, + "loss": 0.844, + "step": 3836 + }, + { + "epoch": 0.1919, + "grad_norm": 19.389989852905273, + "learning_rate": 1.948986136132937e-05, + "loss": 1.3004, + "step": 3838 + }, + { + "epoch": 0.192, + "grad_norm": 6.4210710525512695, + "learning_rate": 1.9488760116444966e-05, + "loss": 1.5653, + "step": 3840 + }, + { + "epoch": 0.1921, + "grad_norm": 2.178706169128418, + "learning_rate": 1.9487657715383928e-05, + "loss": 1.0336, + "step": 3842 + }, + { + "epoch": 0.1922, + "grad_norm": 4.075501918792725, + "learning_rate": 1.9486554158280576e-05, + "loss": 1.0481, + "step": 3844 + }, + { + "epoch": 0.1923, + "grad_norm": 4.468177795410156, + "learning_rate": 1.9485449445269376e-05, + "loss": 1.0542, + "step": 3846 + }, + { + "epoch": 0.1924, + "grad_norm": 3.501829147338867, + "learning_rate": 1.9484343576484935e-05, + "loss": 1.0232, + "step": 3848 + }, + { + "epoch": 0.1925, + "grad_norm": 2.636869192123413, + "learning_rate": 1.9483236552061996e-05, + "loss": 1.3307, + "step": 3850 + }, + { + "epoch": 0.1926, + "grad_norm": 4.149578094482422, + "learning_rate": 1.9482128372135446e-05, + "loss": 1.0854, + "step": 3852 + }, + { + "epoch": 0.1927, + "grad_norm": 3.0289387702941895, + "learning_rate": 1.948101903684032e-05, + "loss": 0.9924, + "step": 3854 + }, + { + "epoch": 0.1928, + "grad_norm": 2.010291337966919, + "learning_rate": 1.9479908546311783e-05, + "loss": 0.6122, + "step": 3856 + }, + { + "epoch": 0.1929, + "grad_norm": 2.2430531978607178, + "learning_rate": 1.9478796900685145e-05, + "loss": 1.1963, + "step": 3858 + }, + { + "epoch": 0.193, + "grad_norm": 4.704161167144775, + "learning_rate": 1.947768410009586e-05, + "loss": 1.3082, + "step": 3860 + }, + { + "epoch": 0.1931, + "grad_norm": 1.1165249347686768, + "learning_rate": 1.9476570144679513e-05, + "loss": 0.0639, + "step": 3862 + }, + { + "epoch": 0.1932, + "grad_norm": 4.131162643432617, + "learning_rate": 1.947545503457184e-05, + "loss": 1.1089, + "step": 3864 + }, + { + "epoch": 0.1933, + "grad_norm": 7.927311897277832, + "learning_rate": 1.9474338769908712e-05, + "loss": 1.3928, + "step": 3866 + }, + { + "epoch": 0.1934, + "grad_norm": 0.5793778896331787, + "learning_rate": 1.9473221350826145e-05, + "loss": 0.657, + "step": 3868 + }, + { + "epoch": 0.1935, + "grad_norm": 3.113226890563965, + "learning_rate": 1.9472102777460292e-05, + "loss": 1.0141, + "step": 3870 + }, + { + "epoch": 0.1936, + "grad_norm": 4.168039321899414, + "learning_rate": 1.9470983049947446e-05, + "loss": 1.1224, + "step": 3872 + }, + { + "epoch": 0.1937, + "grad_norm": 22.540021896362305, + "learning_rate": 1.9469862168424042e-05, + "loss": 3.4666, + "step": 3874 + }, + { + "epoch": 0.1938, + "grad_norm": 6.84248161315918, + "learning_rate": 1.946874013302666e-05, + "loss": 0.9252, + "step": 3876 + }, + { + "epoch": 0.1939, + "grad_norm": 4.3440022468566895, + "learning_rate": 1.946761694389202e-05, + "loss": 0.6536, + "step": 3878 + }, + { + "epoch": 0.194, + "grad_norm": 2.028390407562256, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.955, + "step": 3880 + }, + { + "epoch": 0.1941, + "grad_norm": 2.8740570545196533, + "learning_rate": 1.9465367104958507e-05, + "loss": 1.2311, + "step": 3882 + }, + { + "epoch": 0.1942, + "grad_norm": 4.962673187255859, + "learning_rate": 1.9464240455433775e-05, + "loss": 0.9572, + "step": 3884 + }, + { + "epoch": 0.1943, + "grad_norm": 11.261040687561035, + "learning_rate": 1.9463112652720055e-05, + "loss": 1.3854, + "step": 3886 + }, + { + "epoch": 0.1944, + "grad_norm": 2.9101791381835938, + "learning_rate": 1.946198369695476e-05, + "loss": 1.4885, + "step": 3888 + }, + { + "epoch": 0.1945, + "grad_norm": 10.259904861450195, + "learning_rate": 1.9460853588275454e-05, + "loss": 1.0329, + "step": 3890 + }, + { + "epoch": 0.1946, + "grad_norm": 4.843194961547852, + "learning_rate": 1.945972232681984e-05, + "loss": 0.8565, + "step": 3892 + }, + { + "epoch": 0.1947, + "grad_norm": 3.9601848125457764, + "learning_rate": 1.9458589912725746e-05, + "loss": 0.4018, + "step": 3894 + }, + { + "epoch": 0.1948, + "grad_norm": 3.0791192054748535, + "learning_rate": 1.945745634613117e-05, + "loss": 0.52, + "step": 3896 + }, + { + "epoch": 0.1949, + "grad_norm": 4.532678127288818, + "learning_rate": 1.9456321627174222e-05, + "loss": 1.264, + "step": 3898 + }, + { + "epoch": 0.195, + "grad_norm": 3.972975254058838, + "learning_rate": 1.945518575599317e-05, + "loss": 0.8141, + "step": 3900 + }, + { + "epoch": 0.1951, + "grad_norm": 3.9735283851623535, + "learning_rate": 1.9454048732726415e-05, + "loss": 0.91, + "step": 3902 + }, + { + "epoch": 0.1952, + "grad_norm": 3.377117156982422, + "learning_rate": 1.9452910557512497e-05, + "loss": 0.7542, + "step": 3904 + }, + { + "epoch": 0.1953, + "grad_norm": 4.239539623260498, + "learning_rate": 1.94517712304901e-05, + "loss": 1.0701, + "step": 3906 + }, + { + "epoch": 0.1954, + "grad_norm": 3.8567001819610596, + "learning_rate": 1.945063075179805e-05, + "loss": 1.3477, + "step": 3908 + }, + { + "epoch": 0.1955, + "grad_norm": 5.794613361358643, + "learning_rate": 1.944948912157531e-05, + "loss": 1.3987, + "step": 3910 + }, + { + "epoch": 0.1956, + "grad_norm": 8.212959289550781, + "learning_rate": 1.9448346339960984e-05, + "loss": 0.9624, + "step": 3912 + }, + { + "epoch": 0.1957, + "grad_norm": 4.268630027770996, + "learning_rate": 1.9447202407094315e-05, + "loss": 1.1955, + "step": 3914 + }, + { + "epoch": 0.1958, + "grad_norm": 1.6028770208358765, + "learning_rate": 1.944605732311469e-05, + "loss": 0.2224, + "step": 3916 + }, + { + "epoch": 0.1959, + "grad_norm": 0.43861475586891174, + "learning_rate": 1.9444911088161636e-05, + "loss": 0.6762, + "step": 3918 + }, + { + "epoch": 0.196, + "grad_norm": 3.46561598777771, + "learning_rate": 1.944376370237481e-05, + "loss": 0.9573, + "step": 3920 + }, + { + "epoch": 0.1961, + "grad_norm": 9.680695533752441, + "learning_rate": 1.944261516589403e-05, + "loss": 1.3486, + "step": 3922 + }, + { + "epoch": 0.1962, + "grad_norm": 5.298436641693115, + "learning_rate": 1.944146547885923e-05, + "loss": 1.2608, + "step": 3924 + }, + { + "epoch": 0.1963, + "grad_norm": 6.703926086425781, + "learning_rate": 1.94403146414105e-05, + "loss": 1.627, + "step": 3926 + }, + { + "epoch": 0.1964, + "grad_norm": 7.985109806060791, + "learning_rate": 1.9439162653688066e-05, + "loss": 0.8937, + "step": 3928 + }, + { + "epoch": 0.1965, + "grad_norm": 1.5500428676605225, + "learning_rate": 1.9438009515832298e-05, + "loss": 0.7775, + "step": 3930 + }, + { + "epoch": 0.1966, + "grad_norm": 3.015667200088501, + "learning_rate": 1.9436855227983695e-05, + "loss": 1.3686, + "step": 3932 + }, + { + "epoch": 0.1967, + "grad_norm": 14.157666206359863, + "learning_rate": 1.943569979028291e-05, + "loss": 2.3627, + "step": 3934 + }, + { + "epoch": 0.1968, + "grad_norm": 2.1042637825012207, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.7565, + "step": 3936 + }, + { + "epoch": 0.1969, + "grad_norm": 3.8239521980285645, + "learning_rate": 1.9433385465888072e-05, + "loss": 1.1744, + "step": 3938 + }, + { + "epoch": 0.197, + "grad_norm": 3.2621114253997803, + "learning_rate": 1.943222657947601e-05, + "loss": 1.1716, + "step": 3940 + }, + { + "epoch": 0.1971, + "grad_norm": 3.8225104808807373, + "learning_rate": 1.9431066543775753e-05, + "loss": 0.543, + "step": 3942 + }, + { + "epoch": 0.1972, + "grad_norm": 3.7532153129577637, + "learning_rate": 1.9429905358928648e-05, + "loss": 0.6522, + "step": 3944 + }, + { + "epoch": 0.1973, + "grad_norm": 6.433023452758789, + "learning_rate": 1.9428743025076177e-05, + "loss": 1.903, + "step": 3946 + }, + { + "epoch": 0.1974, + "grad_norm": 10.251373291015625, + "learning_rate": 1.9427579542359966e-05, + "loss": 0.9631, + "step": 3948 + }, + { + "epoch": 0.1975, + "grad_norm": 4.877993583679199, + "learning_rate": 1.9426414910921785e-05, + "loss": 0.3482, + "step": 3950 + }, + { + "epoch": 0.1976, + "grad_norm": 3.035297393798828, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.59, + "step": 3952 + }, + { + "epoch": 0.1977, + "grad_norm": 2.310696840286255, + "learning_rate": 1.942408220244728e-05, + "loss": 1.1745, + "step": 3954 + }, + { + "epoch": 0.1978, + "grad_norm": 4.3631815910339355, + "learning_rate": 1.942291412569519e-05, + "loss": 0.9275, + "step": 3956 + }, + { + "epoch": 0.1979, + "grad_norm": 4.176278114318848, + "learning_rate": 1.94217449007896e-05, + "loss": 1.5879, + "step": 3958 + }, + { + "epoch": 0.198, + "grad_norm": 7.796482086181641, + "learning_rate": 1.942057452787297e-05, + "loss": 0.9523, + "step": 3960 + }, + { + "epoch": 0.1981, + "grad_norm": 7.390254497528076, + "learning_rate": 1.9419403007087908e-05, + "loss": 1.7611, + "step": 3962 + }, + { + "epoch": 0.1982, + "grad_norm": 3.6845805644989014, + "learning_rate": 1.9418230338577164e-05, + "loss": 1.282, + "step": 3964 + }, + { + "epoch": 0.1983, + "grad_norm": 13.201141357421875, + "learning_rate": 1.941705652248362e-05, + "loss": 1.7468, + "step": 3966 + }, + { + "epoch": 0.1984, + "grad_norm": 1.8022669553756714, + "learning_rate": 1.9415881558950302e-05, + "loss": 1.2501, + "step": 3968 + }, + { + "epoch": 0.1985, + "grad_norm": 5.0633864402771, + "learning_rate": 1.941470544812038e-05, + "loss": 1.2834, + "step": 3970 + }, + { + "epoch": 0.1986, + "grad_norm": 5.236081123352051, + "learning_rate": 1.9413528190137158e-05, + "loss": 2.1329, + "step": 3972 + }, + { + "epoch": 0.1987, + "grad_norm": 4.792832851409912, + "learning_rate": 1.9412349785144076e-05, + "loss": 1.6453, + "step": 3974 + }, + { + "epoch": 0.1988, + "grad_norm": 2.6572139263153076, + "learning_rate": 1.9411170233284728e-05, + "loss": 1.3892, + "step": 3976 + }, + { + "epoch": 0.1989, + "grad_norm": 2.326242208480835, + "learning_rate": 1.9409989534702835e-05, + "loss": 1.0229, + "step": 3978 + }, + { + "epoch": 0.199, + "grad_norm": 12.49612808227539, + "learning_rate": 1.9408807689542257e-05, + "loss": 1.297, + "step": 3980 + }, + { + "epoch": 0.1991, + "grad_norm": 10.618572235107422, + "learning_rate": 1.9407624697947003e-05, + "loss": 1.0052, + "step": 3982 + }, + { + "epoch": 0.1992, + "grad_norm": 3.507004499435425, + "learning_rate": 1.9406440560061214e-05, + "loss": 1.337, + "step": 3984 + }, + { + "epoch": 0.1993, + "grad_norm": 6.024484634399414, + "learning_rate": 1.940525527602918e-05, + "loss": 0.3203, + "step": 3986 + }, + { + "epoch": 0.1994, + "grad_norm": 3.51568603515625, + "learning_rate": 1.9404068845995317e-05, + "loss": 1.4424, + "step": 3988 + }, + { + "epoch": 0.1995, + "grad_norm": 2.5321872234344482, + "learning_rate": 1.940288127010419e-05, + "loss": 0.8651, + "step": 3990 + }, + { + "epoch": 0.1996, + "grad_norm": 7.284533977508545, + "learning_rate": 1.9401692548500504e-05, + "loss": 1.3544, + "step": 3992 + }, + { + "epoch": 0.1997, + "grad_norm": 4.789323806762695, + "learning_rate": 1.94005026813291e-05, + "loss": 1.1415, + "step": 3994 + }, + { + "epoch": 0.1998, + "grad_norm": 2.2945096492767334, + "learning_rate": 1.9399311668734957e-05, + "loss": 1.2077, + "step": 3996 + }, + { + "epoch": 0.1999, + "grad_norm": 2.746645212173462, + "learning_rate": 1.9398119510863197e-05, + "loss": 1.6653, + "step": 3998 + }, + { + "epoch": 0.2, + "grad_norm": 4.2287116050720215, + "learning_rate": 1.9396926207859085e-05, + "loss": 1.0106, + "step": 4000 + }, + { + "epoch": 0.2001, + "grad_norm": 3.306647777557373, + "learning_rate": 1.939573175986802e-05, + "loss": 1.7599, + "step": 4002 + }, + { + "epoch": 0.2002, + "grad_norm": 2.809177875518799, + "learning_rate": 1.9394536167035535e-05, + "loss": 0.7302, + "step": 4004 + }, + { + "epoch": 0.2003, + "grad_norm": 3.9956493377685547, + "learning_rate": 1.9393339429507317e-05, + "loss": 0.4168, + "step": 4006 + }, + { + "epoch": 0.2004, + "grad_norm": 4.626999855041504, + "learning_rate": 1.9392141547429183e-05, + "loss": 0.6977, + "step": 4008 + }, + { + "epoch": 0.2005, + "grad_norm": 5.354297161102295, + "learning_rate": 1.939094252094709e-05, + "loss": 0.5876, + "step": 4010 + }, + { + "epoch": 0.2006, + "grad_norm": 7.442347049713135, + "learning_rate": 1.938974235020714e-05, + "loss": 1.5387, + "step": 4012 + }, + { + "epoch": 0.2007, + "grad_norm": 4.122570037841797, + "learning_rate": 1.9388541035355563e-05, + "loss": 1.7769, + "step": 4014 + }, + { + "epoch": 0.2008, + "grad_norm": 7.008672714233398, + "learning_rate": 1.9387338576538743e-05, + "loss": 1.3756, + "step": 4016 + }, + { + "epoch": 0.2009, + "grad_norm": 4.70452356338501, + "learning_rate": 1.938613497390319e-05, + "loss": 0.8591, + "step": 4018 + }, + { + "epoch": 0.201, + "grad_norm": 7.117315292358398, + "learning_rate": 1.938493022759556e-05, + "loss": 1.2538, + "step": 4020 + }, + { + "epoch": 0.2011, + "grad_norm": 0.837460994720459, + "learning_rate": 1.938372433776265e-05, + "loss": 0.592, + "step": 4022 + }, + { + "epoch": 0.2012, + "grad_norm": 2.698580503463745, + "learning_rate": 1.9382517304551397e-05, + "loss": 0.7271, + "step": 4024 + }, + { + "epoch": 0.2013, + "grad_norm": 3.2207963466644287, + "learning_rate": 1.9381309128108866e-05, + "loss": 1.4184, + "step": 4026 + }, + { + "epoch": 0.2014, + "grad_norm": 3.340899705886841, + "learning_rate": 1.9380099808582278e-05, + "loss": 1.1837, + "step": 4028 + }, + { + "epoch": 0.2015, + "grad_norm": 4.974916934967041, + "learning_rate": 1.937888934611898e-05, + "loss": 0.8552, + "step": 4030 + }, + { + "epoch": 0.2016, + "grad_norm": 6.524670600891113, + "learning_rate": 1.937767774086646e-05, + "loss": 0.7748, + "step": 4032 + }, + { + "epoch": 0.2017, + "grad_norm": 5.55673360824585, + "learning_rate": 1.9376464992972358e-05, + "loss": 1.1337, + "step": 4034 + }, + { + "epoch": 0.2018, + "grad_norm": 2.545783281326294, + "learning_rate": 1.9375251102584438e-05, + "loss": 0.6725, + "step": 4036 + }, + { + "epoch": 0.2019, + "grad_norm": 4.021820068359375, + "learning_rate": 1.937403606985061e-05, + "loss": 0.4652, + "step": 4038 + }, + { + "epoch": 0.202, + "grad_norm": 8.351466178894043, + "learning_rate": 1.937281989491892e-05, + "loss": 1.302, + "step": 4040 + }, + { + "epoch": 0.2021, + "grad_norm": 8.264143943786621, + "learning_rate": 1.9371602577937554e-05, + "loss": 1.0853, + "step": 4042 + }, + { + "epoch": 0.2022, + "grad_norm": 1.8441706895828247, + "learning_rate": 1.937038411905484e-05, + "loss": 0.918, + "step": 4044 + }, + { + "epoch": 0.2023, + "grad_norm": 2.12176251411438, + "learning_rate": 1.936916451841925e-05, + "loss": 1.2088, + "step": 4046 + }, + { + "epoch": 0.2024, + "grad_norm": 2.0184824466705322, + "learning_rate": 1.936794377617938e-05, + "loss": 1.0217, + "step": 4048 + }, + { + "epoch": 0.2025, + "grad_norm": 9.533634185791016, + "learning_rate": 1.9366721892483976e-05, + "loss": 1.0261, + "step": 4050 + }, + { + "epoch": 0.2026, + "grad_norm": 3.6693782806396484, + "learning_rate": 1.9365498867481926e-05, + "loss": 1.1828, + "step": 4052 + }, + { + "epoch": 0.2027, + "grad_norm": 2.5117831230163574, + "learning_rate": 1.9364274701322246e-05, + "loss": 0.771, + "step": 4054 + }, + { + "epoch": 0.2028, + "grad_norm": 4.776710033416748, + "learning_rate": 1.9363049394154095e-05, + "loss": 1.6041, + "step": 4056 + }, + { + "epoch": 0.2029, + "grad_norm": 9.00563907623291, + "learning_rate": 1.936182294612678e-05, + "loss": 1.1165, + "step": 4058 + }, + { + "epoch": 0.203, + "grad_norm": 9.904561996459961, + "learning_rate": 1.9360595357389735e-05, + "loss": 1.001, + "step": 4060 + }, + { + "epoch": 0.2031, + "grad_norm": 2.845752477645874, + "learning_rate": 1.935936662809254e-05, + "loss": 0.798, + "step": 4062 + }, + { + "epoch": 0.2032, + "grad_norm": 6.058747291564941, + "learning_rate": 1.935813675838491e-05, + "loss": 1.0434, + "step": 4064 + }, + { + "epoch": 0.2033, + "grad_norm": 5.158402442932129, + "learning_rate": 1.9356905748416704e-05, + "loss": 0.3471, + "step": 4066 + }, + { + "epoch": 0.2034, + "grad_norm": 21.092275619506836, + "learning_rate": 1.9355673598337916e-05, + "loss": 1.0013, + "step": 4068 + }, + { + "epoch": 0.2035, + "grad_norm": 3.1296143531799316, + "learning_rate": 1.9354440308298676e-05, + "loss": 0.8669, + "step": 4070 + }, + { + "epoch": 0.2036, + "grad_norm": 2.943918466567993, + "learning_rate": 1.935320587844926e-05, + "loss": 0.9164, + "step": 4072 + }, + { + "epoch": 0.2037, + "grad_norm": 3.9457662105560303, + "learning_rate": 1.935197030894008e-05, + "loss": 1.3716, + "step": 4074 + }, + { + "epoch": 0.2038, + "grad_norm": 6.392759323120117, + "learning_rate": 1.9350733599921684e-05, + "loss": 0.5044, + "step": 4076 + }, + { + "epoch": 0.2039, + "grad_norm": 4.394573211669922, + "learning_rate": 1.9349495751544763e-05, + "loss": 1.1979, + "step": 4078 + }, + { + "epoch": 0.204, + "grad_norm": 1.5038195848464966, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.2413, + "step": 4080 + }, + { + "epoch": 0.2041, + "grad_norm": 5.458526611328125, + "learning_rate": 1.9347016637318797e-05, + "loss": 1.1545, + "step": 4082 + }, + { + "epoch": 0.2042, + "grad_norm": 1.9614018201828003, + "learning_rate": 1.9345775371771826e-05, + "loss": 1.3482, + "step": 4084 + }, + { + "epoch": 0.2043, + "grad_norm": 7.090372085571289, + "learning_rate": 1.934453296747047e-05, + "loss": 0.9351, + "step": 4086 + }, + { + "epoch": 0.2044, + "grad_norm": 2.699838161468506, + "learning_rate": 1.9343289424566122e-05, + "loss": 0.7256, + "step": 4088 + }, + { + "epoch": 0.2045, + "grad_norm": 2.767613649368286, + "learning_rate": 1.9342044743210295e-05, + "loss": 0.4637, + "step": 4090 + }, + { + "epoch": 0.2046, + "grad_norm": 6.92013692855835, + "learning_rate": 1.9340798923554657e-05, + "loss": 0.7968, + "step": 4092 + }, + { + "epoch": 0.2047, + "grad_norm": 3.8165340423583984, + "learning_rate": 1.9339551965751e-05, + "loss": 0.7284, + "step": 4094 + }, + { + "epoch": 0.2048, + "grad_norm": 4.432530879974365, + "learning_rate": 1.933830386995127e-05, + "loss": 0.7726, + "step": 4096 + }, + { + "epoch": 0.2049, + "grad_norm": 5.73716926574707, + "learning_rate": 1.9337054636307537e-05, + "loss": 1.4217, + "step": 4098 + }, + { + "epoch": 0.205, + "grad_norm": 6.8467607498168945, + "learning_rate": 1.9335804264972018e-05, + "loss": 0.5576, + "step": 4100 + }, + { + "epoch": 0.2051, + "grad_norm": 5.783648490905762, + "learning_rate": 1.933455275609707e-05, + "loss": 0.8994, + "step": 4102 + }, + { + "epoch": 0.2052, + "grad_norm": 11.302571296691895, + "learning_rate": 1.9333300109835182e-05, + "loss": 1.6785, + "step": 4104 + }, + { + "epoch": 0.2053, + "grad_norm": 6.98345422744751, + "learning_rate": 1.9332046326338985e-05, + "loss": 1.1696, + "step": 4106 + }, + { + "epoch": 0.2054, + "grad_norm": 7.395509243011475, + "learning_rate": 1.9330791405761254e-05, + "loss": 0.537, + "step": 4108 + }, + { + "epoch": 0.2055, + "grad_norm": 5.297239780426025, + "learning_rate": 1.9329535348254893e-05, + "loss": 0.8006, + "step": 4110 + }, + { + "epoch": 0.2056, + "grad_norm": 9.191320419311523, + "learning_rate": 1.9328278153972947e-05, + "loss": 1.6537, + "step": 4112 + }, + { + "epoch": 0.2057, + "grad_norm": 4.524987697601318, + "learning_rate": 1.9327019823068605e-05, + "loss": 1.5515, + "step": 4114 + }, + { + "epoch": 0.2058, + "grad_norm": 5.6511664390563965, + "learning_rate": 1.932576035569519e-05, + "loss": 1.0075, + "step": 4116 + }, + { + "epoch": 0.2059, + "grad_norm": 4.38016939163208, + "learning_rate": 1.932449975200616e-05, + "loss": 1.051, + "step": 4118 + }, + { + "epoch": 0.206, + "grad_norm": 3.5369303226470947, + "learning_rate": 1.9323238012155125e-05, + "loss": 1.915, + "step": 4120 + }, + { + "epoch": 0.2061, + "grad_norm": 1.6082144975662231, + "learning_rate": 1.9321975136295815e-05, + "loss": 0.3538, + "step": 4122 + }, + { + "epoch": 0.2062, + "grad_norm": 4.507069110870361, + "learning_rate": 1.932071112458211e-05, + "loss": 1.0068, + "step": 4124 + }, + { + "epoch": 0.2063, + "grad_norm": 4.048280715942383, + "learning_rate": 1.931944597716803e-05, + "loss": 1.0233, + "step": 4126 + }, + { + "epoch": 0.2064, + "grad_norm": 5.098574638366699, + "learning_rate": 1.9318179694207726e-05, + "loss": 1.1338, + "step": 4128 + }, + { + "epoch": 0.2065, + "grad_norm": 1.9608986377716064, + "learning_rate": 1.931691227585549e-05, + "loss": 0.6045, + "step": 4130 + }, + { + "epoch": 0.2066, + "grad_norm": 6.919966220855713, + "learning_rate": 1.931564372226576e-05, + "loss": 0.3075, + "step": 4132 + }, + { + "epoch": 0.2067, + "grad_norm": 9.792167663574219, + "learning_rate": 1.931437403359309e-05, + "loss": 1.4518, + "step": 4134 + }, + { + "epoch": 0.2068, + "grad_norm": 5.681524753570557, + "learning_rate": 1.9313103209992205e-05, + "loss": 1.0678, + "step": 4136 + }, + { + "epoch": 0.2069, + "grad_norm": 3.81765079498291, + "learning_rate": 1.9311831251617942e-05, + "loss": 0.8885, + "step": 4138 + }, + { + "epoch": 0.207, + "grad_norm": 4.782137393951416, + "learning_rate": 1.9310558158625286e-05, + "loss": 0.9358, + "step": 4140 + }, + { + "epoch": 0.2071, + "grad_norm": 3.6209051609039307, + "learning_rate": 1.930928393116936e-05, + "loss": 1.0005, + "step": 4142 + }, + { + "epoch": 0.2072, + "grad_norm": 12.126083374023438, + "learning_rate": 1.9308008569405424e-05, + "loss": 1.2997, + "step": 4144 + }, + { + "epoch": 0.2073, + "grad_norm": 3.3696022033691406, + "learning_rate": 1.930673207348888e-05, + "loss": 0.6234, + "step": 4146 + }, + { + "epoch": 0.2074, + "grad_norm": 13.742740631103516, + "learning_rate": 1.930545444357526e-05, + "loss": 0.8163, + "step": 4148 + }, + { + "epoch": 0.2075, + "grad_norm": 12.500672340393066, + "learning_rate": 1.9304175679820247e-05, + "loss": 0.6569, + "step": 4150 + }, + { + "epoch": 0.2076, + "grad_norm": 7.072859287261963, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.6591, + "step": 4152 + }, + { + "epoch": 0.2077, + "grad_norm": 17.014738082885742, + "learning_rate": 1.9301614751409416e-05, + "loss": 1.5541, + "step": 4154 + }, + { + "epoch": 0.2078, + "grad_norm": 5.4855451583862305, + "learning_rate": 1.9300332587065644e-05, + "loss": 1.4986, + "step": 4156 + }, + { + "epoch": 0.2079, + "grad_norm": 5.877799987792969, + "learning_rate": 1.9299049289504555e-05, + "loss": 0.8749, + "step": 4158 + }, + { + "epoch": 0.208, + "grad_norm": 4.476706027984619, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.9276, + "step": 4160 + }, + { + "epoch": 0.2081, + "grad_norm": 1.298448085784912, + "learning_rate": 1.9296479295356035e-05, + "loss": 0.5134, + "step": 4162 + }, + { + "epoch": 0.2082, + "grad_norm": 3.920815944671631, + "learning_rate": 1.9295192599081747e-05, + "loss": 1.7085, + "step": 4164 + }, + { + "epoch": 0.2083, + "grad_norm": 3.1415350437164307, + "learning_rate": 1.929390477021644e-05, + "loss": 0.2989, + "step": 4166 + }, + { + "epoch": 0.2084, + "grad_norm": 1.656104564666748, + "learning_rate": 1.9292615808917027e-05, + "loss": 0.6461, + "step": 4168 + }, + { + "epoch": 0.2085, + "grad_norm": 0.8046352863311768, + "learning_rate": 1.9291325715340562e-05, + "loss": 0.5587, + "step": 4170 + }, + { + "epoch": 0.2086, + "grad_norm": 9.637518882751465, + "learning_rate": 1.9290034489644247e-05, + "loss": 0.7683, + "step": 4172 + }, + { + "epoch": 0.2087, + "grad_norm": 3.385108709335327, + "learning_rate": 1.9288742131985408e-05, + "loss": 0.5081, + "step": 4174 + }, + { + "epoch": 0.2088, + "grad_norm": 5.7518391609191895, + "learning_rate": 1.9287448642521513e-05, + "loss": 0.916, + "step": 4176 + }, + { + "epoch": 0.2089, + "grad_norm": 4.639835357666016, + "learning_rate": 1.9286154021410177e-05, + "loss": 0.9792, + "step": 4178 + }, + { + "epoch": 0.209, + "grad_norm": 11.359848976135254, + "learning_rate": 1.9284858268809135e-05, + "loss": 0.3793, + "step": 4180 + }, + { + "epoch": 0.2091, + "grad_norm": 2.807684898376465, + "learning_rate": 1.9283561384876284e-05, + "loss": 0.6262, + "step": 4182 + }, + { + "epoch": 0.2092, + "grad_norm": 10.041668891906738, + "learning_rate": 1.9282263369769633e-05, + "loss": 1.7073, + "step": 4184 + }, + { + "epoch": 0.2093, + "grad_norm": 0.4209049344062805, + "learning_rate": 1.928096422364735e-05, + "loss": 0.2153, + "step": 4186 + }, + { + "epoch": 0.2094, + "grad_norm": 8.394857406616211, + "learning_rate": 1.927966394666773e-05, + "loss": 1.3643, + "step": 4188 + }, + { + "epoch": 0.2095, + "grad_norm": 2.93402361869812, + "learning_rate": 1.92783625389892e-05, + "loss": 0.7477, + "step": 4190 + }, + { + "epoch": 0.2096, + "grad_norm": 7.310904502868652, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.261, + "step": 4192 + }, + { + "epoch": 0.2097, + "grad_norm": 5.5412139892578125, + "learning_rate": 1.9275756332169865e-05, + "loss": 1.533, + "step": 4194 + }, + { + "epoch": 0.2098, + "grad_norm": 12.066088676452637, + "learning_rate": 1.9274451533346617e-05, + "loss": 1.1949, + "step": 4196 + }, + { + "epoch": 0.2099, + "grad_norm": 5.0611677169799805, + "learning_rate": 1.9273145604459577e-05, + "loss": 1.4391, + "step": 4198 + }, + { + "epoch": 0.21, + "grad_norm": 12.228198051452637, + "learning_rate": 1.9271838545667876e-05, + "loss": 1.4859, + "step": 4200 + }, + { + "epoch": 0.2101, + "grad_norm": 8.373042106628418, + "learning_rate": 1.927053035713077e-05, + "loss": 2.0003, + "step": 4202 + }, + { + "epoch": 0.2102, + "grad_norm": 3.0603466033935547, + "learning_rate": 1.9269221039007666e-05, + "loss": 1.4459, + "step": 4204 + }, + { + "epoch": 0.2103, + "grad_norm": 2.9861605167388916, + "learning_rate": 1.926791059145809e-05, + "loss": 1.2608, + "step": 4206 + }, + { + "epoch": 0.2104, + "grad_norm": 21.509201049804688, + "learning_rate": 1.9266599014641724e-05, + "loss": 2.6455, + "step": 4208 + }, + { + "epoch": 0.2105, + "grad_norm": 2.5397844314575195, + "learning_rate": 1.9265286308718374e-05, + "loss": 1.46, + "step": 4210 + }, + { + "epoch": 0.2106, + "grad_norm": 4.630632400512695, + "learning_rate": 1.9263972473847995e-05, + "loss": 1.0712, + "step": 4212 + }, + { + "epoch": 0.2107, + "grad_norm": 9.88843822479248, + "learning_rate": 1.926265751019067e-05, + "loss": 1.0312, + "step": 4214 + }, + { + "epoch": 0.2108, + "grad_norm": 5.868338108062744, + "learning_rate": 1.9261341417906622e-05, + "loss": 0.9632, + "step": 4216 + }, + { + "epoch": 0.2109, + "grad_norm": 13.965494155883789, + "learning_rate": 1.9260024197156216e-05, + "loss": 1.2363, + "step": 4218 + }, + { + "epoch": 0.211, + "grad_norm": 11.52026653289795, + "learning_rate": 1.925870584809995e-05, + "loss": 1.0551, + "step": 4220 + }, + { + "epoch": 0.2111, + "grad_norm": 4.610317707061768, + "learning_rate": 1.925738637089846e-05, + "loss": 1.4211, + "step": 4222 + }, + { + "epoch": 0.2112, + "grad_norm": 4.922701358795166, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.749, + "step": 4224 + }, + { + "epoch": 0.2113, + "grad_norm": 4.189015865325928, + "learning_rate": 1.925474403270305e-05, + "loss": 1.4095, + "step": 4226 + }, + { + "epoch": 0.2114, + "grad_norm": 4.8846235275268555, + "learning_rate": 1.9253421172031086e-05, + "loss": 1.1219, + "step": 4228 + }, + { + "epoch": 0.2115, + "grad_norm": 3.7512803077697754, + "learning_rate": 1.9252097183857822e-05, + "loss": 1.1275, + "step": 4230 + }, + { + "epoch": 0.2116, + "grad_norm": 5.02032995223999, + "learning_rate": 1.925077206834458e-05, + "loss": 1.1131, + "step": 4232 + }, + { + "epoch": 0.2117, + "grad_norm": 14.454684257507324, + "learning_rate": 1.9249445825652825e-05, + "loss": 1.5534, + "step": 4234 + }, + { + "epoch": 0.2118, + "grad_norm": 4.665948867797852, + "learning_rate": 1.9248118455944153e-05, + "loss": 0.7265, + "step": 4236 + }, + { + "epoch": 0.2119, + "grad_norm": 3.1689953804016113, + "learning_rate": 1.9246789959380297e-05, + "loss": 0.5888, + "step": 4238 + }, + { + "epoch": 0.212, + "grad_norm": 13.57048511505127, + "learning_rate": 1.9245460336123136e-05, + "loss": 1.5972, + "step": 4240 + }, + { + "epoch": 0.2121, + "grad_norm": 9.93704891204834, + "learning_rate": 1.924412958633467e-05, + "loss": 0.7649, + "step": 4242 + }, + { + "epoch": 0.2122, + "grad_norm": 7.415918350219727, + "learning_rate": 1.924279771017706e-05, + "loss": 1.4179, + "step": 4244 + }, + { + "epoch": 0.2123, + "grad_norm": 2.760165214538574, + "learning_rate": 1.9241464707812586e-05, + "loss": 0.4784, + "step": 4246 + }, + { + "epoch": 0.2124, + "grad_norm": 10.921284675598145, + "learning_rate": 1.924013057940367e-05, + "loss": 1.3591, + "step": 4248 + }, + { + "epoch": 0.2125, + "grad_norm": 2.806666135787964, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.6808, + "step": 4250 + }, + { + "epoch": 0.2126, + "grad_norm": 3.8230743408203125, + "learning_rate": 1.923745894510288e-05, + "loss": 1.5537, + "step": 4252 + }, + { + "epoch": 0.2127, + "grad_norm": 7.269757270812988, + "learning_rate": 1.9236121439536544e-05, + "loss": 0.7634, + "step": 4254 + }, + { + "epoch": 0.2128, + "grad_norm": 3.416935443878174, + "learning_rate": 1.9234782808576823e-05, + "loss": 0.529, + "step": 4256 + }, + { + "epoch": 0.2129, + "grad_norm": 11.403532981872559, + "learning_rate": 1.9233443052386832e-05, + "loss": 1.4198, + "step": 4258 + }, + { + "epoch": 0.213, + "grad_norm": 3.4465367794036865, + "learning_rate": 1.923210217112981e-05, + "loss": 0.7564, + "step": 4260 + }, + { + "epoch": 0.2131, + "grad_norm": 0.7060465216636658, + "learning_rate": 1.9230760164969146e-05, + "loss": 0.4659, + "step": 4262 + }, + { + "epoch": 0.2132, + "grad_norm": 9.246868133544922, + "learning_rate": 1.9229417034068352e-05, + "loss": 1.3642, + "step": 4264 + }, + { + "epoch": 0.2133, + "grad_norm": 4.419397830963135, + "learning_rate": 1.922807277859109e-05, + "loss": 1.7591, + "step": 4266 + }, + { + "epoch": 0.2134, + "grad_norm": 8.267669677734375, + "learning_rate": 1.922672739870115e-05, + "loss": 1.1099, + "step": 4268 + }, + { + "epoch": 0.2135, + "grad_norm": 7.635485649108887, + "learning_rate": 1.9225380894562466e-05, + "loss": 1.6413, + "step": 4270 + }, + { + "epoch": 0.2136, + "grad_norm": 6.29491662979126, + "learning_rate": 1.9224033266339103e-05, + "loss": 1.1254, + "step": 4272 + }, + { + "epoch": 0.2137, + "grad_norm": 3.967179298400879, + "learning_rate": 1.9222684514195265e-05, + "loss": 1.3124, + "step": 4274 + }, + { + "epoch": 0.2138, + "grad_norm": 4.020313739776611, + "learning_rate": 1.9221334638295296e-05, + "loss": 1.3006, + "step": 4276 + }, + { + "epoch": 0.2139, + "grad_norm": 3.86285138130188, + "learning_rate": 1.9219983638803672e-05, + "loss": 1.0283, + "step": 4278 + }, + { + "epoch": 0.214, + "grad_norm": 3.40582013130188, + "learning_rate": 1.9218631515885007e-05, + "loss": 0.9443, + "step": 4280 + }, + { + "epoch": 0.2141, + "grad_norm": 5.497169494628906, + "learning_rate": 1.9217278269704055e-05, + "loss": 1.0671, + "step": 4282 + }, + { + "epoch": 0.2142, + "grad_norm": 5.279942512512207, + "learning_rate": 1.921592390042571e-05, + "loss": 0.8698, + "step": 4284 + }, + { + "epoch": 0.2143, + "grad_norm": 4.730970859527588, + "learning_rate": 1.9214568408214986e-05, + "loss": 0.9156, + "step": 4286 + }, + { + "epoch": 0.2144, + "grad_norm": 3.392134666442871, + "learning_rate": 1.9213211793237056e-05, + "loss": 1.2896, + "step": 4288 + }, + { + "epoch": 0.2145, + "grad_norm": 12.113791465759277, + "learning_rate": 1.9211854055657216e-05, + "loss": 1.8663, + "step": 4290 + }, + { + "epoch": 0.2146, + "grad_norm": 4.420086860656738, + "learning_rate": 1.9210495195640895e-05, + "loss": 1.0401, + "step": 4292 + }, + { + "epoch": 0.2147, + "grad_norm": 4.6862640380859375, + "learning_rate": 1.920913521335368e-05, + "loss": 1.5802, + "step": 4294 + }, + { + "epoch": 0.2148, + "grad_norm": 4.398149490356445, + "learning_rate": 1.9207774108961273e-05, + "loss": 0.8921, + "step": 4296 + }, + { + "epoch": 0.2149, + "grad_norm": 4.987364292144775, + "learning_rate": 1.920641188262952e-05, + "loss": 1.1553, + "step": 4298 + }, + { + "epoch": 0.215, + "grad_norm": 6.069809913635254, + "learning_rate": 1.9205048534524405e-05, + "loss": 0.3384, + "step": 4300 + }, + { + "epoch": 0.2151, + "grad_norm": 5.399630069732666, + "learning_rate": 1.9203684064812047e-05, + "loss": 1.0813, + "step": 4302 + }, + { + "epoch": 0.2152, + "grad_norm": 6.824250221252441, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.9504, + "step": 4304 + }, + { + "epoch": 0.2153, + "grad_norm": 0.7651668787002563, + "learning_rate": 1.920095176123077e-05, + "loss": 0.572, + "step": 4306 + }, + { + "epoch": 0.2154, + "grad_norm": 1.956542730331421, + "learning_rate": 1.9199583927694775e-05, + "loss": 0.8018, + "step": 4308 + }, + { + "epoch": 0.2155, + "grad_norm": 1.7437810897827148, + "learning_rate": 1.919821497321738e-05, + "loss": 0.766, + "step": 4310 + }, + { + "epoch": 0.2156, + "grad_norm": 4.798074245452881, + "learning_rate": 1.9196844897965393e-05, + "loss": 0.9144, + "step": 4312 + }, + { + "epoch": 0.2157, + "grad_norm": 4.00486421585083, + "learning_rate": 1.9195473702105748e-05, + "loss": 1.043, + "step": 4314 + }, + { + "epoch": 0.2158, + "grad_norm": 3.0013797283172607, + "learning_rate": 1.919410138580553e-05, + "loss": 1.6263, + "step": 4316 + }, + { + "epoch": 0.2159, + "grad_norm": 4.593723773956299, + "learning_rate": 1.9192727949231945e-05, + "loss": 1.101, + "step": 4318 + }, + { + "epoch": 0.216, + "grad_norm": 5.8975629806518555, + "learning_rate": 1.9191353392552346e-05, + "loss": 1.0108, + "step": 4320 + }, + { + "epoch": 0.2161, + "grad_norm": 2.5277230739593506, + "learning_rate": 1.9189977715934214e-05, + "loss": 0.9122, + "step": 4322 + }, + { + "epoch": 0.2162, + "grad_norm": 3.9364230632781982, + "learning_rate": 1.9188600919545176e-05, + "loss": 0.6958, + "step": 4324 + }, + { + "epoch": 0.2163, + "grad_norm": 1.554376482963562, + "learning_rate": 1.9187223003552986e-05, + "loss": 0.6414, + "step": 4326 + }, + { + "epoch": 0.2164, + "grad_norm": 3.563204288482666, + "learning_rate": 1.9185843968125543e-05, + "loss": 0.6492, + "step": 4328 + }, + { + "epoch": 0.2165, + "grad_norm": 2.806635856628418, + "learning_rate": 1.9184463813430874e-05, + "loss": 1.1367, + "step": 4330 + }, + { + "epoch": 0.2166, + "grad_norm": 4.250726222991943, + "learning_rate": 1.918308253963715e-05, + "loss": 0.9333, + "step": 4332 + }, + { + "epoch": 0.2167, + "grad_norm": 3.8235340118408203, + "learning_rate": 1.918170014691267e-05, + "loss": 1.1207, + "step": 4334 + }, + { + "epoch": 0.2168, + "grad_norm": 9.220407485961914, + "learning_rate": 1.9180316635425883e-05, + "loss": 1.4318, + "step": 4336 + }, + { + "epoch": 0.2169, + "grad_norm": 2.979975938796997, + "learning_rate": 1.917893200534536e-05, + "loss": 0.6703, + "step": 4338 + }, + { + "epoch": 0.217, + "grad_norm": 7.020674228668213, + "learning_rate": 1.9177546256839814e-05, + "loss": 1.4201, + "step": 4340 + }, + { + "epoch": 0.2171, + "grad_norm": 9.040447235107422, + "learning_rate": 1.9176159390078095e-05, + "loss": 1.1207, + "step": 4342 + }, + { + "epoch": 0.2172, + "grad_norm": 7.646198272705078, + "learning_rate": 1.9174771405229187e-05, + "loss": 0.9736, + "step": 4344 + }, + { + "epoch": 0.2173, + "grad_norm": 13.356035232543945, + "learning_rate": 1.9173382302462217e-05, + "loss": 1.3672, + "step": 4346 + }, + { + "epoch": 0.2174, + "grad_norm": 11.262768745422363, + "learning_rate": 1.9171992081946436e-05, + "loss": 1.1334, + "step": 4348 + }, + { + "epoch": 0.2175, + "grad_norm": 6.858675479888916, + "learning_rate": 1.917060074385124e-05, + "loss": 0.7772, + "step": 4350 + }, + { + "epoch": 0.2176, + "grad_norm": 9.129745483398438, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.7351, + "step": 4352 + }, + { + "epoch": 0.2177, + "grad_norm": 5.3441338539123535, + "learning_rate": 1.9167814715600872e-05, + "loss": 0.9072, + "step": 4354 + }, + { + "epoch": 0.2178, + "grad_norm": 8.230058670043945, + "learning_rate": 1.9166420025785165e-05, + "loss": 1.3967, + "step": 4356 + }, + { + "epoch": 0.2179, + "grad_norm": 3.8713393211364746, + "learning_rate": 1.916502421906898e-05, + "loss": 1.1656, + "step": 4358 + }, + { + "epoch": 0.218, + "grad_norm": 2.6674866676330566, + "learning_rate": 1.9163627295622397e-05, + "loss": 0.9336, + "step": 4360 + }, + { + "epoch": 0.2181, + "grad_norm": 8.034971237182617, + "learning_rate": 1.9162229255615624e-05, + "loss": 1.0858, + "step": 4362 + }, + { + "epoch": 0.2182, + "grad_norm": 12.77208423614502, + "learning_rate": 1.9160830099219007e-05, + "loss": 0.9337, + "step": 4364 + }, + { + "epoch": 0.2183, + "grad_norm": 7.069989204406738, + "learning_rate": 1.9159429826603032e-05, + "loss": 1.9903, + "step": 4366 + }, + { + "epoch": 0.2184, + "grad_norm": 4.284241676330566, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.7455, + "step": 4368 + }, + { + "epoch": 0.2185, + "grad_norm": 2.678553819656372, + "learning_rate": 1.9156625933395614e-05, + "loss": 0.692, + "step": 4370 + }, + { + "epoch": 0.2186, + "grad_norm": 3.381423234939575, + "learning_rate": 1.9155222313145817e-05, + "loss": 1.1226, + "step": 4372 + }, + { + "epoch": 0.2187, + "grad_norm": 4.321918964385986, + "learning_rate": 1.915381757735995e-05, + "loss": 1.0466, + "step": 4374 + }, + { + "epoch": 0.2188, + "grad_norm": 1.0095785856246948, + "learning_rate": 1.9152411726209176e-05, + "loss": 0.1104, + "step": 4376 + }, + { + "epoch": 0.2189, + "grad_norm": 10.011317253112793, + "learning_rate": 1.91510047598648e-05, + "loss": 0.9496, + "step": 4378 + }, + { + "epoch": 0.219, + "grad_norm": 4.304776668548584, + "learning_rate": 1.914959667849825e-05, + "loss": 1.4988, + "step": 4380 + }, + { + "epoch": 0.2191, + "grad_norm": 5.432332515716553, + "learning_rate": 1.9148187482281097e-05, + "loss": 0.8792, + "step": 4382 + }, + { + "epoch": 0.2192, + "grad_norm": 3.218163251876831, + "learning_rate": 1.914677717138505e-05, + "loss": 1.0096, + "step": 4384 + }, + { + "epoch": 0.2193, + "grad_norm": 4.570298671722412, + "learning_rate": 1.914536574598195e-05, + "loss": 0.8212, + "step": 4386 + }, + { + "epoch": 0.2194, + "grad_norm": 0.9324037432670593, + "learning_rate": 1.9143953206243778e-05, + "loss": 0.4972, + "step": 4388 + }, + { + "epoch": 0.2195, + "grad_norm": 5.960536956787109, + "learning_rate": 1.9142539552342638e-05, + "loss": 1.393, + "step": 4390 + }, + { + "epoch": 0.2196, + "grad_norm": 5.201693534851074, + "learning_rate": 1.914112478445079e-05, + "loss": 1.402, + "step": 4392 + }, + { + "epoch": 0.2197, + "grad_norm": 2.796410083770752, + "learning_rate": 1.913970890274061e-05, + "loss": 1.2301, + "step": 4394 + }, + { + "epoch": 0.2198, + "grad_norm": 3.545417070388794, + "learning_rate": 1.9138291907384632e-05, + "loss": 1.3376, + "step": 4396 + }, + { + "epoch": 0.2199, + "grad_norm": 1.943535327911377, + "learning_rate": 1.91368737985555e-05, + "loss": 0.5062, + "step": 4398 + }, + { + "epoch": 0.22, + "grad_norm": 3.1774179935455322, + "learning_rate": 1.913545457642601e-05, + "loss": 1.4919, + "step": 4400 + }, + { + "epoch": 0.2201, + "grad_norm": 2.7136447429656982, + "learning_rate": 1.913403424116909e-05, + "loss": 0.8355, + "step": 4402 + }, + { + "epoch": 0.2202, + "grad_norm": 4.9000935554504395, + "learning_rate": 1.9132612792957808e-05, + "loss": 0.7989, + "step": 4404 + }, + { + "epoch": 0.2203, + "grad_norm": 5.346667289733887, + "learning_rate": 1.9131190231965356e-05, + "loss": 1.3224, + "step": 4406 + }, + { + "epoch": 0.2204, + "grad_norm": 3.683741331100464, + "learning_rate": 1.9129766558365076e-05, + "loss": 0.7972, + "step": 4408 + }, + { + "epoch": 0.2205, + "grad_norm": 3.409447193145752, + "learning_rate": 1.9128341772330428e-05, + "loss": 0.5544, + "step": 4410 + }, + { + "epoch": 0.2206, + "grad_norm": 10.872645378112793, + "learning_rate": 1.912691587403503e-05, + "loss": 1.843, + "step": 4412 + }, + { + "epoch": 0.2207, + "grad_norm": 7.069666385650635, + "learning_rate": 1.9125488863652614e-05, + "loss": 1.7528, + "step": 4414 + }, + { + "epoch": 0.2208, + "grad_norm": 4.190865993499756, + "learning_rate": 1.9124060741357065e-05, + "loss": 1.6968, + "step": 4416 + }, + { + "epoch": 0.2209, + "grad_norm": 2.097416639328003, + "learning_rate": 1.9122631507322388e-05, + "loss": 0.2332, + "step": 4418 + }, + { + "epoch": 0.221, + "grad_norm": 9.066206932067871, + "learning_rate": 1.9121201161722732e-05, + "loss": 0.6696, + "step": 4420 + }, + { + "epoch": 0.2211, + "grad_norm": 3.1999971866607666, + "learning_rate": 1.9119769704732382e-05, + "loss": 0.3022, + "step": 4422 + }, + { + "epoch": 0.2212, + "grad_norm": 4.745122909545898, + "learning_rate": 1.911833713652576e-05, + "loss": 0.7843, + "step": 4424 + }, + { + "epoch": 0.2213, + "grad_norm": 8.523913383483887, + "learning_rate": 1.9116903457277413e-05, + "loss": 1.3994, + "step": 4426 + }, + { + "epoch": 0.2214, + "grad_norm": 4.8880391120910645, + "learning_rate": 1.9115468667162038e-05, + "loss": 1.703, + "step": 4428 + }, + { + "epoch": 0.2215, + "grad_norm": 3.8832736015319824, + "learning_rate": 1.9114032766354453e-05, + "loss": 0.5178, + "step": 4430 + }, + { + "epoch": 0.2216, + "grad_norm": 6.457009315490723, + "learning_rate": 1.9112595755029625e-05, + "loss": 0.7477, + "step": 4432 + }, + { + "epoch": 0.2217, + "grad_norm": 6.177913665771484, + "learning_rate": 1.9111157633362642e-05, + "loss": 1.1214, + "step": 4434 + }, + { + "epoch": 0.2218, + "grad_norm": 2.768303871154785, + "learning_rate": 1.9109718401528742e-05, + "loss": 0.1969, + "step": 4436 + }, + { + "epoch": 0.2219, + "grad_norm": 3.1550614833831787, + "learning_rate": 1.910827805970328e-05, + "loss": 0.9716, + "step": 4438 + }, + { + "epoch": 0.222, + "grad_norm": 2.7532029151916504, + "learning_rate": 1.910683660806177e-05, + "loss": 1.3255, + "step": 4440 + }, + { + "epoch": 0.2221, + "grad_norm": 6.146064281463623, + "learning_rate": 1.9105394046779846e-05, + "loss": 1.296, + "step": 4442 + }, + { + "epoch": 0.2222, + "grad_norm": 5.719968795776367, + "learning_rate": 1.9103950376033276e-05, + "loss": 1.2163, + "step": 4444 + }, + { + "epoch": 0.2223, + "grad_norm": 28.04139518737793, + "learning_rate": 1.9102505595997965e-05, + "loss": 0.6094, + "step": 4446 + }, + { + "epoch": 0.2224, + "grad_norm": 2.1246485710144043, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.8133, + "step": 4448 + }, + { + "epoch": 0.2225, + "grad_norm": 4.2640910148620605, + "learning_rate": 1.9099612708765432e-05, + "loss": 0.9955, + "step": 4450 + }, + { + "epoch": 0.2226, + "grad_norm": 4.180495262145996, + "learning_rate": 1.9098164601920702e-05, + "loss": 0.8438, + "step": 4452 + }, + { + "epoch": 0.2227, + "grad_norm": 3.6216506958007812, + "learning_rate": 1.9096715386492214e-05, + "loss": 1.2688, + "step": 4454 + }, + { + "epoch": 0.2228, + "grad_norm": 0.2002592235803604, + "learning_rate": 1.9095265062656546e-05, + "loss": 0.0212, + "step": 4456 + }, + { + "epoch": 0.2229, + "grad_norm": 5.850175380706787, + "learning_rate": 1.9093813630590417e-05, + "loss": 1.2408, + "step": 4458 + }, + { + "epoch": 0.223, + "grad_norm": 4.225883960723877, + "learning_rate": 1.9092361090470688e-05, + "loss": 1.2271, + "step": 4460 + }, + { + "epoch": 0.2231, + "grad_norm": 3.2742722034454346, + "learning_rate": 1.9090907442474334e-05, + "loss": 1.416, + "step": 4462 + }, + { + "epoch": 0.2232, + "grad_norm": 4.417510509490967, + "learning_rate": 1.908945268677849e-05, + "loss": 0.4755, + "step": 4464 + }, + { + "epoch": 0.2233, + "grad_norm": 10.340636253356934, + "learning_rate": 1.9087996823560404e-05, + "loss": 1.0327, + "step": 4466 + }, + { + "epoch": 0.2234, + "grad_norm": 6.438962936401367, + "learning_rate": 1.908653985299747e-05, + "loss": 1.3271, + "step": 4468 + }, + { + "epoch": 0.2235, + "grad_norm": 5.539638996124268, + "learning_rate": 1.908508177526722e-05, + "loss": 1.4659, + "step": 4470 + }, + { + "epoch": 0.2236, + "grad_norm": 2.343543767929077, + "learning_rate": 1.9083622590547313e-05, + "loss": 1.1748, + "step": 4472 + }, + { + "epoch": 0.2237, + "grad_norm": 3.7095565795898438, + "learning_rate": 1.9082162299015547e-05, + "loss": 0.3625, + "step": 4474 + }, + { + "epoch": 0.2238, + "grad_norm": 0.9257841110229492, + "learning_rate": 1.9080700900849855e-05, + "loss": 0.3946, + "step": 4476 + }, + { + "epoch": 0.2239, + "grad_norm": 3.813276767730713, + "learning_rate": 1.90792383962283e-05, + "loss": 1.1101, + "step": 4478 + }, + { + "epoch": 0.224, + "grad_norm": 7.6123833656311035, + "learning_rate": 1.907777478532909e-05, + "loss": 1.5896, + "step": 4480 + }, + { + "epoch": 0.2241, + "grad_norm": 8.003311157226562, + "learning_rate": 1.9076310068330554e-05, + "loss": 0.555, + "step": 4482 + }, + { + "epoch": 0.2242, + "grad_norm": 10.442736625671387, + "learning_rate": 1.907484424541117e-05, + "loss": 1.2784, + "step": 4484 + }, + { + "epoch": 0.2243, + "grad_norm": 4.008417129516602, + "learning_rate": 1.9073377316749543e-05, + "loss": 0.7425, + "step": 4486 + }, + { + "epoch": 0.2244, + "grad_norm": 5.851446628570557, + "learning_rate": 1.907190928252441e-05, + "loss": 1.4815, + "step": 4488 + }, + { + "epoch": 0.2245, + "grad_norm": 3.1835885047912598, + "learning_rate": 1.907044014291465e-05, + "loss": 0.5363, + "step": 4490 + }, + { + "epoch": 0.2246, + "grad_norm": 3.826143741607666, + "learning_rate": 1.906896989809927e-05, + "loss": 1.0668, + "step": 4492 + }, + { + "epoch": 0.2247, + "grad_norm": 3.2009689807891846, + "learning_rate": 1.9067498548257425e-05, + "loss": 1.4667, + "step": 4494 + }, + { + "epoch": 0.2248, + "grad_norm": 2.8277747631073, + "learning_rate": 1.906602609356838e-05, + "loss": 1.288, + "step": 4496 + }, + { + "epoch": 0.2249, + "grad_norm": 6.57643985748291, + "learning_rate": 1.9064552534211556e-05, + "loss": 1.0881, + "step": 4498 + }, + { + "epoch": 0.225, + "grad_norm": 8.134333610534668, + "learning_rate": 1.9063077870366504e-05, + "loss": 1.2754, + "step": 4500 + }, + { + "epoch": 0.2251, + "grad_norm": 3.4904563426971436, + "learning_rate": 1.9061602102212898e-05, + "loss": 1.1032, + "step": 4502 + }, + { + "epoch": 0.2252, + "grad_norm": 2.734119415283203, + "learning_rate": 1.9060125229930572e-05, + "loss": 1.495, + "step": 4504 + }, + { + "epoch": 0.2253, + "grad_norm": 5.412782669067383, + "learning_rate": 1.9058647253699462e-05, + "loss": 0.7606, + "step": 4506 + }, + { + "epoch": 0.2254, + "grad_norm": 4.139003753662109, + "learning_rate": 1.9057168173699664e-05, + "loss": 1.0848, + "step": 4508 + }, + { + "epoch": 0.2255, + "grad_norm": 2.971132278442383, + "learning_rate": 1.9055687990111397e-05, + "loss": 0.6826, + "step": 4510 + }, + { + "epoch": 0.2256, + "grad_norm": 2.6348979473114014, + "learning_rate": 1.905420670311502e-05, + "loss": 1.2112, + "step": 4512 + }, + { + "epoch": 0.2257, + "grad_norm": 5.375919342041016, + "learning_rate": 1.9052724312891017e-05, + "loss": 1.0538, + "step": 4514 + }, + { + "epoch": 0.2258, + "grad_norm": 4.431914806365967, + "learning_rate": 1.9051240819620018e-05, + "loss": 0.8403, + "step": 4516 + }, + { + "epoch": 0.2259, + "grad_norm": 5.55906343460083, + "learning_rate": 1.9049756223482777e-05, + "loss": 1.3473, + "step": 4518 + }, + { + "epoch": 0.226, + "grad_norm": 5.951138973236084, + "learning_rate": 1.9048270524660197e-05, + "loss": 1.3681, + "step": 4520 + }, + { + "epoch": 0.2261, + "grad_norm": 5.138930320739746, + "learning_rate": 1.9046783723333298e-05, + "loss": 0.7008, + "step": 4522 + }, + { + "epoch": 0.2262, + "grad_norm": 3.1420912742614746, + "learning_rate": 1.904529581968324e-05, + "loss": 0.8684, + "step": 4524 + }, + { + "epoch": 0.2263, + "grad_norm": 2.584878921508789, + "learning_rate": 1.904380681389133e-05, + "loss": 1.1207, + "step": 4526 + }, + { + "epoch": 0.2264, + "grad_norm": 3.687770366668701, + "learning_rate": 1.9042316706138987e-05, + "loss": 1.0932, + "step": 4528 + }, + { + "epoch": 0.2265, + "grad_norm": 2.863032579421997, + "learning_rate": 1.9040825496607788e-05, + "loss": 1.132, + "step": 4530 + }, + { + "epoch": 0.2266, + "grad_norm": 5.348052501678467, + "learning_rate": 1.903933318547942e-05, + "loss": 1.433, + "step": 4532 + }, + { + "epoch": 0.2267, + "grad_norm": 16.186155319213867, + "learning_rate": 1.903783977293572e-05, + "loss": 1.5723, + "step": 4534 + }, + { + "epoch": 0.2268, + "grad_norm": 2.378138542175293, + "learning_rate": 1.9036345259158667e-05, + "loss": 1.0936, + "step": 4536 + }, + { + "epoch": 0.2269, + "grad_norm": 8.492774963378906, + "learning_rate": 1.903484964433035e-05, + "loss": 1.3149, + "step": 4538 + }, + { + "epoch": 0.227, + "grad_norm": 3.6399784088134766, + "learning_rate": 1.903335292863301e-05, + "loss": 2.1789, + "step": 4540 + }, + { + "epoch": 0.2271, + "grad_norm": 6.386139869689941, + "learning_rate": 1.9031855112249016e-05, + "loss": 0.4453, + "step": 4542 + }, + { + "epoch": 0.2272, + "grad_norm": 2.274204730987549, + "learning_rate": 1.9030356195360875e-05, + "loss": 0.8814, + "step": 4544 + }, + { + "epoch": 0.2273, + "grad_norm": 4.312006950378418, + "learning_rate": 1.9028856178151222e-05, + "loss": 0.9011, + "step": 4546 + }, + { + "epoch": 0.2274, + "grad_norm": 13.200297355651855, + "learning_rate": 1.902735506080283e-05, + "loss": 0.8324, + "step": 4548 + }, + { + "epoch": 0.2275, + "grad_norm": 3.4767088890075684, + "learning_rate": 1.902585284349861e-05, + "loss": 1.4203, + "step": 4550 + }, + { + "epoch": 0.2276, + "grad_norm": 2.880091667175293, + "learning_rate": 1.9024349526421596e-05, + "loss": 1.7245, + "step": 4552 + }, + { + "epoch": 0.2277, + "grad_norm": 4.816240310668945, + "learning_rate": 1.9022845109754965e-05, + "loss": 1.4082, + "step": 4554 + }, + { + "epoch": 0.2278, + "grad_norm": 2.8961410522460938, + "learning_rate": 1.902133959368203e-05, + "loss": 1.116, + "step": 4556 + }, + { + "epoch": 0.2279, + "grad_norm": 6.777097702026367, + "learning_rate": 1.9019832978386227e-05, + "loss": 1.7791, + "step": 4558 + }, + { + "epoch": 0.228, + "grad_norm": 3.7877557277679443, + "learning_rate": 1.901832526405114e-05, + "loss": 1.368, + "step": 4560 + }, + { + "epoch": 0.2281, + "grad_norm": 2.5490524768829346, + "learning_rate": 1.9016816450860474e-05, + "loss": 0.8046, + "step": 4562 + }, + { + "epoch": 0.2282, + "grad_norm": 4.624067306518555, + "learning_rate": 1.901530653899807e-05, + "loss": 0.7345, + "step": 4564 + }, + { + "epoch": 0.2283, + "grad_norm": 3.926989793777466, + "learning_rate": 1.9013795528647913e-05, + "loss": 0.6654, + "step": 4566 + }, + { + "epoch": 0.2284, + "grad_norm": 8.664186477661133, + "learning_rate": 1.9012283419994115e-05, + "loss": 1.3415, + "step": 4568 + }, + { + "epoch": 0.2285, + "grad_norm": 3.5791964530944824, + "learning_rate": 1.9010770213220916e-05, + "loss": 1.1176, + "step": 4570 + }, + { + "epoch": 0.2286, + "grad_norm": 4.07172155380249, + "learning_rate": 1.9009255908512704e-05, + "loss": 0.7154, + "step": 4572 + }, + { + "epoch": 0.2287, + "grad_norm": 2.6065833568573, + "learning_rate": 1.9007740506053983e-05, + "loss": 0.9471, + "step": 4574 + }, + { + "epoch": 0.2288, + "grad_norm": 4.9754862785339355, + "learning_rate": 1.9006224006029404e-05, + "loss": 0.6193, + "step": 4576 + }, + { + "epoch": 0.2289, + "grad_norm": 1.9414689540863037, + "learning_rate": 1.900470640862375e-05, + "loss": 0.6368, + "step": 4578 + }, + { + "epoch": 0.229, + "grad_norm": 3.3403050899505615, + "learning_rate": 1.9003187714021936e-05, + "loss": 0.5624, + "step": 4580 + }, + { + "epoch": 0.2291, + "grad_norm": 3.4491429328918457, + "learning_rate": 1.9001667922409008e-05, + "loss": 0.5363, + "step": 4582 + }, + { + "epoch": 0.2292, + "grad_norm": 4.831780910491943, + "learning_rate": 1.9000147033970148e-05, + "loss": 0.4889, + "step": 4584 + }, + { + "epoch": 0.2293, + "grad_norm": 9.503016471862793, + "learning_rate": 1.8998625048890674e-05, + "loss": 1.1234, + "step": 4586 + }, + { + "epoch": 0.2294, + "grad_norm": 8.582951545715332, + "learning_rate": 1.899710196735603e-05, + "loss": 1.5441, + "step": 4588 + }, + { + "epoch": 0.2295, + "grad_norm": 2.623239755630493, + "learning_rate": 1.8995577789551806e-05, + "loss": 0.8803, + "step": 4590 + }, + { + "epoch": 0.2296, + "grad_norm": 2.94639253616333, + "learning_rate": 1.899405251566371e-05, + "loss": 0.5331, + "step": 4592 + }, + { + "epoch": 0.2297, + "grad_norm": 3.1825342178344727, + "learning_rate": 1.8992526145877603e-05, + "loss": 0.321, + "step": 4594 + }, + { + "epoch": 0.2298, + "grad_norm": 2.534773826599121, + "learning_rate": 1.8990998680379458e-05, + "loss": 1.174, + "step": 4596 + }, + { + "epoch": 0.2299, + "grad_norm": 4.110413551330566, + "learning_rate": 1.89894701193554e-05, + "loss": 1.0111, + "step": 4598 + }, + { + "epoch": 0.23, + "grad_norm": 4.512383937835693, + "learning_rate": 1.8987940462991673e-05, + "loss": 1.318, + "step": 4600 + }, + { + "epoch": 0.2301, + "grad_norm": 1.770237922668457, + "learning_rate": 1.8986409711474665e-05, + "loss": 1.4631, + "step": 4602 + }, + { + "epoch": 0.2302, + "grad_norm": 7.962291240692139, + "learning_rate": 1.8984877864990888e-05, + "loss": 1.4588, + "step": 4604 + }, + { + "epoch": 0.2303, + "grad_norm": 7.373212814331055, + "learning_rate": 1.8983344923727002e-05, + "loss": 1.4575, + "step": 4606 + }, + { + "epoch": 0.2304, + "grad_norm": 6.282426834106445, + "learning_rate": 1.8981810887869784e-05, + "loss": 1.0915, + "step": 4608 + }, + { + "epoch": 0.2305, + "grad_norm": 6.483310222625732, + "learning_rate": 1.8980275757606157e-05, + "loss": 1.2668, + "step": 4610 + }, + { + "epoch": 0.2306, + "grad_norm": 5.906004428863525, + "learning_rate": 1.897873953312317e-05, + "loss": 1.2757, + "step": 4612 + }, + { + "epoch": 0.2307, + "grad_norm": 3.7380828857421875, + "learning_rate": 1.8977202214608002e-05, + "loss": 0.8155, + "step": 4614 + }, + { + "epoch": 0.2308, + "grad_norm": 3.3887650966644287, + "learning_rate": 1.8975663802247978e-05, + "loss": 1.5954, + "step": 4616 + }, + { + "epoch": 0.2309, + "grad_norm": 2.091456890106201, + "learning_rate": 1.8974124296230543e-05, + "loss": 0.8762, + "step": 4618 + }, + { + "epoch": 0.231, + "grad_norm": 2.783294200897217, + "learning_rate": 1.8972583696743284e-05, + "loss": 1.0196, + "step": 4620 + }, + { + "epoch": 0.2311, + "grad_norm": 2.6698200702667236, + "learning_rate": 1.8971042003973923e-05, + "loss": 1.499, + "step": 4622 + }, + { + "epoch": 0.2312, + "grad_norm": 7.0650105476379395, + "learning_rate": 1.8969499218110302e-05, + "loss": 1.3354, + "step": 4624 + }, + { + "epoch": 0.2313, + "grad_norm": 5.410084247589111, + "learning_rate": 1.896795533934041e-05, + "loss": 1.0189, + "step": 4626 + }, + { + "epoch": 0.2314, + "grad_norm": 6.5962700843811035, + "learning_rate": 1.896641036785236e-05, + "loss": 0.5721, + "step": 4628 + }, + { + "epoch": 0.2315, + "grad_norm": 3.01941180229187, + "learning_rate": 1.8964864303834408e-05, + "loss": 1.4849, + "step": 4630 + }, + { + "epoch": 0.2316, + "grad_norm": 7.605664253234863, + "learning_rate": 1.896331714747493e-05, + "loss": 1.2961, + "step": 4632 + }, + { + "epoch": 0.2317, + "grad_norm": 4.428727626800537, + "learning_rate": 1.8961768898962448e-05, + "loss": 0.9437, + "step": 4634 + }, + { + "epoch": 0.2318, + "grad_norm": 5.310479164123535, + "learning_rate": 1.896021955848561e-05, + "loss": 1.2298, + "step": 4636 + }, + { + "epoch": 0.2319, + "grad_norm": 8.421478271484375, + "learning_rate": 1.89586691262332e-05, + "loss": 1.0572, + "step": 4638 + }, + { + "epoch": 0.232, + "grad_norm": 4.86716890335083, + "learning_rate": 1.895711760239413e-05, + "loss": 1.3627, + "step": 4640 + }, + { + "epoch": 0.2321, + "grad_norm": 5.0307135581970215, + "learning_rate": 1.895556498715745e-05, + "loss": 0.8311, + "step": 4642 + }, + { + "epoch": 0.2322, + "grad_norm": 3.904860258102417, + "learning_rate": 1.895401128071234e-05, + "loss": 1.703, + "step": 4644 + }, + { + "epoch": 0.2323, + "grad_norm": 6.6543097496032715, + "learning_rate": 1.8952456483248117e-05, + "loss": 1.1791, + "step": 4646 + }, + { + "epoch": 0.2324, + "grad_norm": 16.24559211730957, + "learning_rate": 1.8950900594954226e-05, + "loss": 0.9217, + "step": 4648 + }, + { + "epoch": 0.2325, + "grad_norm": 7.200473785400391, + "learning_rate": 1.894934361602025e-05, + "loss": 1.0919, + "step": 4650 + }, + { + "epoch": 0.2326, + "grad_norm": 3.7816162109375, + "learning_rate": 1.8947785546635905e-05, + "loss": 1.0262, + "step": 4652 + }, + { + "epoch": 0.2327, + "grad_norm": 2.4506185054779053, + "learning_rate": 1.8946226386991027e-05, + "loss": 1.2634, + "step": 4654 + }, + { + "epoch": 0.2328, + "grad_norm": 4.118920803070068, + "learning_rate": 1.89446661372756e-05, + "loss": 1.1877, + "step": 4656 + }, + { + "epoch": 0.2329, + "grad_norm": 2.163606643676758, + "learning_rate": 1.894310479767974e-05, + "loss": 2.7402, + "step": 4658 + }, + { + "epoch": 0.233, + "grad_norm": 4.419861793518066, + "learning_rate": 1.8941542368393683e-05, + "loss": 1.2399, + "step": 4660 + }, + { + "epoch": 0.2331, + "grad_norm": 2.0644843578338623, + "learning_rate": 1.8939978849607814e-05, + "loss": 0.8594, + "step": 4662 + }, + { + "epoch": 0.2332, + "grad_norm": 8.417153358459473, + "learning_rate": 1.893841424151264e-05, + "loss": 1.6025, + "step": 4664 + }, + { + "epoch": 0.2333, + "grad_norm": 7.248772621154785, + "learning_rate": 1.8936848544298804e-05, + "loss": 1.3242, + "step": 4666 + }, + { + "epoch": 0.2334, + "grad_norm": 4.434897422790527, + "learning_rate": 1.893528175815708e-05, + "loss": 1.2226, + "step": 4668 + }, + { + "epoch": 0.2335, + "grad_norm": 9.020395278930664, + "learning_rate": 1.893371388327838e-05, + "loss": 0.7425, + "step": 4670 + }, + { + "epoch": 0.2336, + "grad_norm": 7.036171913146973, + "learning_rate": 1.893214491985374e-05, + "loss": 1.188, + "step": 4672 + }, + { + "epoch": 0.2337, + "grad_norm": 3.477903366088867, + "learning_rate": 1.8930574868074333e-05, + "loss": 1.8041, + "step": 4674 + }, + { + "epoch": 0.2338, + "grad_norm": 2.103598117828369, + "learning_rate": 1.892900372813147e-05, + "loss": 0.8005, + "step": 4676 + }, + { + "epoch": 0.2339, + "grad_norm": 8.765957832336426, + "learning_rate": 1.8927431500216587e-05, + "loss": 0.7111, + "step": 4678 + }, + { + "epoch": 0.234, + "grad_norm": 2.3676345348358154, + "learning_rate": 1.892585818452126e-05, + "loss": 0.8669, + "step": 4680 + }, + { + "epoch": 0.2341, + "grad_norm": 2.137284994125366, + "learning_rate": 1.892428378123718e-05, + "loss": 1.4827, + "step": 4682 + }, + { + "epoch": 0.2342, + "grad_norm": 8.075108528137207, + "learning_rate": 1.8922708290556197e-05, + "loss": 0.6863, + "step": 4684 + }, + { + "epoch": 0.2343, + "grad_norm": 0.2712802588939667, + "learning_rate": 1.892113171267027e-05, + "loss": 0.1601, + "step": 4686 + }, + { + "epoch": 0.2344, + "grad_norm": 10.465614318847656, + "learning_rate": 1.8919554047771508e-05, + "loss": 1.7062, + "step": 4688 + }, + { + "epoch": 0.2345, + "grad_norm": 11.78783893585205, + "learning_rate": 1.8917975296052143e-05, + "loss": 1.4714, + "step": 4690 + }, + { + "epoch": 0.2346, + "grad_norm": 4.415833950042725, + "learning_rate": 1.8916395457704536e-05, + "loss": 1.3297, + "step": 4692 + }, + { + "epoch": 0.2347, + "grad_norm": 4.447108745574951, + "learning_rate": 1.891481453292119e-05, + "loss": 0.401, + "step": 4694 + }, + { + "epoch": 0.2348, + "grad_norm": 3.9074249267578125, + "learning_rate": 1.8913232521894734e-05, + "loss": 1.5414, + "step": 4696 + }, + { + "epoch": 0.2349, + "grad_norm": 15.652018547058105, + "learning_rate": 1.8911649424817934e-05, + "loss": 1.342, + "step": 4698 + }, + { + "epoch": 0.235, + "grad_norm": 6.872525215148926, + "learning_rate": 1.891006524188368e-05, + "loss": 1.7177, + "step": 4700 + }, + { + "epoch": 0.2351, + "grad_norm": 4.771570682525635, + "learning_rate": 1.8908479973285007e-05, + "loss": 0.4671, + "step": 4702 + }, + { + "epoch": 0.2352, + "grad_norm": 3.7847414016723633, + "learning_rate": 1.890689361921507e-05, + "loss": 0.8103, + "step": 4704 + }, + { + "epoch": 0.2353, + "grad_norm": 9.97509765625, + "learning_rate": 1.890530617986716e-05, + "loss": 0.7554, + "step": 4706 + }, + { + "epoch": 0.2354, + "grad_norm": 10.716634750366211, + "learning_rate": 1.8903717655434708e-05, + "loss": 1.256, + "step": 4708 + }, + { + "epoch": 0.2355, + "grad_norm": 3.605026960372925, + "learning_rate": 1.8902128046111267e-05, + "loss": 1.4195, + "step": 4710 + }, + { + "epoch": 0.2356, + "grad_norm": 4.505685806274414, + "learning_rate": 1.8900537352090523e-05, + "loss": 0.9907, + "step": 4712 + }, + { + "epoch": 0.2357, + "grad_norm": 2.8596718311309814, + "learning_rate": 1.8898945573566306e-05, + "loss": 0.5921, + "step": 4714 + }, + { + "epoch": 0.2358, + "grad_norm": 7.259385585784912, + "learning_rate": 1.8897352710732564e-05, + "loss": 1.0291, + "step": 4716 + }, + { + "epoch": 0.2359, + "grad_norm": 2.7094364166259766, + "learning_rate": 1.8895758763783383e-05, + "loss": 1.4691, + "step": 4718 + }, + { + "epoch": 0.236, + "grad_norm": 5.579313278198242, + "learning_rate": 1.889416373291298e-05, + "loss": 1.4245, + "step": 4720 + }, + { + "epoch": 0.2361, + "grad_norm": 4.919440746307373, + "learning_rate": 1.88925676183157e-05, + "loss": 1.2437, + "step": 4722 + }, + { + "epoch": 0.2362, + "grad_norm": 0.5872755646705627, + "learning_rate": 1.8890970420186035e-05, + "loss": 0.4948, + "step": 4724 + }, + { + "epoch": 0.2363, + "grad_norm": 4.427672863006592, + "learning_rate": 1.888937213871859e-05, + "loss": 0.2358, + "step": 4726 + }, + { + "epoch": 0.2364, + "grad_norm": 2.1155662536621094, + "learning_rate": 1.8887772774108116e-05, + "loss": 0.4713, + "step": 4728 + }, + { + "epoch": 0.2365, + "grad_norm": 2.8324646949768066, + "learning_rate": 1.888617232654949e-05, + "loss": 1.2267, + "step": 4730 + }, + { + "epoch": 0.2366, + "grad_norm": 4.6791582107543945, + "learning_rate": 1.888457079623772e-05, + "loss": 1.184, + "step": 4732 + }, + { + "epoch": 0.2367, + "grad_norm": 3.1124486923217773, + "learning_rate": 1.888296818336795e-05, + "loss": 0.7794, + "step": 4734 + }, + { + "epoch": 0.2368, + "grad_norm": 4.449089050292969, + "learning_rate": 1.8881364488135448e-05, + "loss": 0.6672, + "step": 4736 + }, + { + "epoch": 0.2369, + "grad_norm": 2.0954737663269043, + "learning_rate": 1.8879759710735625e-05, + "loss": 0.2859, + "step": 4738 + }, + { + "epoch": 0.237, + "grad_norm": 3.968837022781372, + "learning_rate": 1.8878153851364013e-05, + "loss": 0.9274, + "step": 4740 + }, + { + "epoch": 0.2371, + "grad_norm": 6.329759120941162, + "learning_rate": 1.887654691021629e-05, + "loss": 0.9021, + "step": 4742 + }, + { + "epoch": 0.2372, + "grad_norm": 13.26930046081543, + "learning_rate": 1.887493888748825e-05, + "loss": 1.4283, + "step": 4744 + }, + { + "epoch": 0.2373, + "grad_norm": 7.41019868850708, + "learning_rate": 1.8873329783375823e-05, + "loss": 0.8684, + "step": 4746 + }, + { + "epoch": 0.2374, + "grad_norm": 11.921358108520508, + "learning_rate": 1.8871719598075083e-05, + "loss": 1.2114, + "step": 4748 + }, + { + "epoch": 0.2375, + "grad_norm": 3.6348118782043457, + "learning_rate": 1.887010833178222e-05, + "loss": 0.795, + "step": 4750 + }, + { + "epoch": 0.2376, + "grad_norm": 2.373884677886963, + "learning_rate": 1.886849598469356e-05, + "loss": 0.6181, + "step": 4752 + }, + { + "epoch": 0.2377, + "grad_norm": 2.682265520095825, + "learning_rate": 1.8866882557005567e-05, + "loss": 0.5281, + "step": 4754 + }, + { + "epoch": 0.2378, + "grad_norm": 11.026163101196289, + "learning_rate": 1.8865268048914828e-05, + "loss": 1.3672, + "step": 4756 + }, + { + "epoch": 0.2379, + "grad_norm": 4.0072021484375, + "learning_rate": 1.886365246061807e-05, + "loss": 0.6359, + "step": 4758 + }, + { + "epoch": 0.238, + "grad_norm": 2.173490285873413, + "learning_rate": 1.8862035792312148e-05, + "loss": 0.513, + "step": 4760 + }, + { + "epoch": 0.2381, + "grad_norm": 5.415887832641602, + "learning_rate": 1.8860418044194048e-05, + "loss": 1.0265, + "step": 4762 + }, + { + "epoch": 0.2382, + "grad_norm": 19.77223014831543, + "learning_rate": 1.8858799216460883e-05, + "loss": 1.4973, + "step": 4764 + }, + { + "epoch": 0.2383, + "grad_norm": 4.627255916595459, + "learning_rate": 1.8857179309309902e-05, + "loss": 0.9198, + "step": 4766 + }, + { + "epoch": 0.2384, + "grad_norm": 6.075071811676025, + "learning_rate": 1.8855558322938492e-05, + "loss": 1.4353, + "step": 4768 + }, + { + "epoch": 0.2385, + "grad_norm": 3.3848061561584473, + "learning_rate": 1.885393625754416e-05, + "loss": 0.9393, + "step": 4770 + }, + { + "epoch": 0.2386, + "grad_norm": 9.788941383361816, + "learning_rate": 1.8852313113324553e-05, + "loss": 1.5419, + "step": 4772 + }, + { + "epoch": 0.2387, + "grad_norm": 5.049810886383057, + "learning_rate": 1.8850688890477446e-05, + "loss": 1.5391, + "step": 4774 + }, + { + "epoch": 0.2388, + "grad_norm": 3.029780149459839, + "learning_rate": 1.8849063589200744e-05, + "loss": 1.1564, + "step": 4776 + }, + { + "epoch": 0.2389, + "grad_norm": 7.05224084854126, + "learning_rate": 1.8847437209692486e-05, + "loss": 0.8034, + "step": 4778 + }, + { + "epoch": 0.239, + "grad_norm": 2.0141520500183105, + "learning_rate": 1.884580975215084e-05, + "loss": 0.8706, + "step": 4780 + }, + { + "epoch": 0.2391, + "grad_norm": 2.808906316757202, + "learning_rate": 1.884418121677411e-05, + "loss": 1.4892, + "step": 4782 + }, + { + "epoch": 0.2392, + "grad_norm": 1.9289047718048096, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.4213, + "step": 4784 + }, + { + "epoch": 0.2393, + "grad_norm": 3.735866069793701, + "learning_rate": 1.884092091330925e-05, + "loss": 0.2276, + "step": 4786 + }, + { + "epoch": 0.2394, + "grad_norm": 2.2672078609466553, + "learning_rate": 1.8839289145618378e-05, + "loss": 0.9393, + "step": 4788 + }, + { + "epoch": 0.2395, + "grad_norm": 9.457794189453125, + "learning_rate": 1.8837656300886937e-05, + "loss": 1.5418, + "step": 4790 + }, + { + "epoch": 0.2396, + "grad_norm": 4.2155303955078125, + "learning_rate": 1.8836022379313884e-05, + "loss": 0.7703, + "step": 4792 + }, + { + "epoch": 0.2397, + "grad_norm": 3.675703763961792, + "learning_rate": 1.8834387381098302e-05, + "loss": 1.3653, + "step": 4794 + }, + { + "epoch": 0.2398, + "grad_norm": 6.794332504272461, + "learning_rate": 1.883275130643942e-05, + "loss": 0.1604, + "step": 4796 + }, + { + "epoch": 0.2399, + "grad_norm": 5.481836795806885, + "learning_rate": 1.883111415553658e-05, + "loss": 1.1978, + "step": 4798 + }, + { + "epoch": 0.24, + "grad_norm": 11.130902290344238, + "learning_rate": 1.8829475928589272e-05, + "loss": 1.4588, + "step": 4800 + }, + { + "epoch": 0.2401, + "grad_norm": 12.686787605285645, + "learning_rate": 1.88278366257971e-05, + "loss": 1.0209, + "step": 4802 + }, + { + "epoch": 0.2402, + "grad_norm": 7.36930513381958, + "learning_rate": 1.882619624735982e-05, + "loss": 1.7046, + "step": 4804 + }, + { + "epoch": 0.2403, + "grad_norm": 12.977546691894531, + "learning_rate": 1.8824554793477294e-05, + "loss": 1.6037, + "step": 4806 + }, + { + "epoch": 0.2404, + "grad_norm": 3.305619716644287, + "learning_rate": 1.8822912264349535e-05, + "loss": 0.3696, + "step": 4808 + }, + { + "epoch": 0.2405, + "grad_norm": 1.9257465600967407, + "learning_rate": 1.882126866017668e-05, + "loss": 0.8095, + "step": 4810 + }, + { + "epoch": 0.2406, + "grad_norm": 4.229753017425537, + "learning_rate": 1.8819623981158996e-05, + "loss": 1.5698, + "step": 4812 + }, + { + "epoch": 0.2407, + "grad_norm": 1.13596773147583, + "learning_rate": 1.8817978227496883e-05, + "loss": 0.6225, + "step": 4814 + }, + { + "epoch": 0.2408, + "grad_norm": 41.739410400390625, + "learning_rate": 1.881633139939087e-05, + "loss": 1.5414, + "step": 4816 + }, + { + "epoch": 0.2409, + "grad_norm": 8.28453254699707, + "learning_rate": 1.8814683497041622e-05, + "loss": 1.5414, + "step": 4818 + }, + { + "epoch": 0.241, + "grad_norm": 2.0293378829956055, + "learning_rate": 1.8813034520649923e-05, + "loss": 1.1842, + "step": 4820 + }, + { + "epoch": 0.2411, + "grad_norm": 2.3429973125457764, + "learning_rate": 1.8811384470416705e-05, + "loss": 0.2339, + "step": 4822 + }, + { + "epoch": 0.2412, + "grad_norm": 3.7826993465423584, + "learning_rate": 1.8809733346543013e-05, + "loss": 0.8108, + "step": 4824 + }, + { + "epoch": 0.2413, + "grad_norm": 2.8194098472595215, + "learning_rate": 1.8808081149230036e-05, + "loss": 1.066, + "step": 4826 + }, + { + "epoch": 0.2414, + "grad_norm": 6.051753044128418, + "learning_rate": 1.880642787867909e-05, + "loss": 1.2546, + "step": 4828 + }, + { + "epoch": 0.2415, + "grad_norm": 7.091906547546387, + "learning_rate": 1.880477353509162e-05, + "loss": 0.676, + "step": 4830 + }, + { + "epoch": 0.2416, + "grad_norm": 4.548340320587158, + "learning_rate": 1.8803118118669203e-05, + "loss": 1.0096, + "step": 4832 + }, + { + "epoch": 0.2417, + "grad_norm": 14.673118591308594, + "learning_rate": 1.8801461629613548e-05, + "loss": 1.2812, + "step": 4834 + }, + { + "epoch": 0.2418, + "grad_norm": 1.6770259141921997, + "learning_rate": 1.8799804068126487e-05, + "loss": 0.4239, + "step": 4836 + }, + { + "epoch": 0.2419, + "grad_norm": 3.112917423248291, + "learning_rate": 1.879814543440999e-05, + "loss": 1.7517, + "step": 4838 + }, + { + "epoch": 0.242, + "grad_norm": 4.053079605102539, + "learning_rate": 1.879648572866617e-05, + "loss": 1.6105, + "step": 4840 + }, + { + "epoch": 0.2421, + "grad_norm": 2.3772683143615723, + "learning_rate": 1.8794824951097237e-05, + "loss": 1.4237, + "step": 4842 + }, + { + "epoch": 0.2422, + "grad_norm": 5.391452312469482, + "learning_rate": 1.8793163101905562e-05, + "loss": 1.7463, + "step": 4844 + }, + { + "epoch": 0.2423, + "grad_norm": 3.151792049407959, + "learning_rate": 1.879150018129364e-05, + "loss": 1.3644, + "step": 4846 + }, + { + "epoch": 0.2424, + "grad_norm": 13.716440200805664, + "learning_rate": 1.878983618946409e-05, + "loss": 1.7854, + "step": 4848 + }, + { + "epoch": 0.2425, + "grad_norm": 2.7373030185699463, + "learning_rate": 1.8788171126619653e-05, + "loss": 1.3287, + "step": 4850 + }, + { + "epoch": 0.2426, + "grad_norm": 0.8795678019523621, + "learning_rate": 1.878650499296323e-05, + "loss": 0.7254, + "step": 4852 + }, + { + "epoch": 0.2427, + "grad_norm": 12.159734725952148, + "learning_rate": 1.8784837788697823e-05, + "loss": 0.7125, + "step": 4854 + }, + { + "epoch": 0.2428, + "grad_norm": 2.58683705329895, + "learning_rate": 1.878316951402658e-05, + "loss": 1.1989, + "step": 4856 + }, + { + "epoch": 0.2429, + "grad_norm": 3.902388095855713, + "learning_rate": 1.8781500169152774e-05, + "loss": 0.8974, + "step": 4858 + }, + { + "epoch": 0.243, + "grad_norm": 4.659647464752197, + "learning_rate": 1.8779829754279806e-05, + "loss": 0.808, + "step": 4860 + }, + { + "epoch": 0.2431, + "grad_norm": 4.144500732421875, + "learning_rate": 1.877815826961122e-05, + "loss": 0.4896, + "step": 4862 + }, + { + "epoch": 0.2432, + "grad_norm": 5.616020202636719, + "learning_rate": 1.8776485715350672e-05, + "loss": 1.5954, + "step": 4864 + }, + { + "epoch": 0.2433, + "grad_norm": 1.5747926235198975, + "learning_rate": 1.877481209170196e-05, + "loss": 1.376, + "step": 4866 + }, + { + "epoch": 0.2434, + "grad_norm": 7.932464599609375, + "learning_rate": 1.8773137398869017e-05, + "loss": 0.9412, + "step": 4868 + }, + { + "epoch": 0.2435, + "grad_norm": 0.5136166214942932, + "learning_rate": 1.877146163705589e-05, + "loss": 0.1676, + "step": 4870 + }, + { + "epoch": 0.2436, + "grad_norm": 2.2956249713897705, + "learning_rate": 1.8769784806466768e-05, + "loss": 0.4452, + "step": 4872 + }, + { + "epoch": 0.2437, + "grad_norm": 6.124969005584717, + "learning_rate": 1.8768106907305973e-05, + "loss": 1.2905, + "step": 4874 + }, + { + "epoch": 0.2438, + "grad_norm": 4.002076625823975, + "learning_rate": 1.8766427939777943e-05, + "loss": 0.9665, + "step": 4876 + }, + { + "epoch": 0.2439, + "grad_norm": 15.130281448364258, + "learning_rate": 1.8764747904087262e-05, + "loss": 0.8392, + "step": 4878 + }, + { + "epoch": 0.244, + "grad_norm": 12.704394340515137, + "learning_rate": 1.8763066800438638e-05, + "loss": 1.2527, + "step": 4880 + }, + { + "epoch": 0.2441, + "grad_norm": 8.527413368225098, + "learning_rate": 1.87613846290369e-05, + "loss": 1.2279, + "step": 4882 + }, + { + "epoch": 0.2442, + "grad_norm": 5.765957355499268, + "learning_rate": 1.8759701390087026e-05, + "loss": 1.3531, + "step": 4884 + }, + { + "epoch": 0.2443, + "grad_norm": 8.653681755065918, + "learning_rate": 1.875801708379411e-05, + "loss": 1.4612, + "step": 4886 + }, + { + "epoch": 0.2444, + "grad_norm": 4.603379726409912, + "learning_rate": 1.8756331710363375e-05, + "loss": 1.2269, + "step": 4888 + }, + { + "epoch": 0.2445, + "grad_norm": 2.668537139892578, + "learning_rate": 1.875464527000018e-05, + "loss": 0.7305, + "step": 4890 + }, + { + "epoch": 0.2446, + "grad_norm": 10.31418514251709, + "learning_rate": 1.8752957762910016e-05, + "loss": 1.9014, + "step": 4892 + }, + { + "epoch": 0.2447, + "grad_norm": 8.6339750289917, + "learning_rate": 1.87512691892985e-05, + "loss": 1.7922, + "step": 4894 + }, + { + "epoch": 0.2448, + "grad_norm": 8.483217239379883, + "learning_rate": 1.874957954937138e-05, + "loss": 1.2334, + "step": 4896 + }, + { + "epoch": 0.2449, + "grad_norm": 4.12829065322876, + "learning_rate": 1.8747888843334528e-05, + "loss": 0.9508, + "step": 4898 + }, + { + "epoch": 0.245, + "grad_norm": 4.021163463592529, + "learning_rate": 1.874619707139396e-05, + "loss": 1.7152, + "step": 4900 + }, + { + "epoch": 0.2451, + "grad_norm": 1.725623607635498, + "learning_rate": 1.874450423375581e-05, + "loss": 0.7644, + "step": 4902 + }, + { + "epoch": 0.2452, + "grad_norm": 3.281590700149536, + "learning_rate": 1.8742810330626338e-05, + "loss": 1.3828, + "step": 4904 + }, + { + "epoch": 0.2453, + "grad_norm": 3.395223379135132, + "learning_rate": 1.874111536221195e-05, + "loss": 1.3231, + "step": 4906 + }, + { + "epoch": 0.2454, + "grad_norm": 3.638993740081787, + "learning_rate": 1.873941932871917e-05, + "loss": 1.166, + "step": 4908 + }, + { + "epoch": 0.2455, + "grad_norm": 2.697174310684204, + "learning_rate": 1.8737722230354654e-05, + "loss": 0.8822, + "step": 4910 + }, + { + "epoch": 0.2456, + "grad_norm": 6.084716320037842, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.8823, + "step": 4912 + }, + { + "epoch": 0.2457, + "grad_norm": 0.871938943862915, + "learning_rate": 1.873432483983769e-05, + "loss": 0.4619, + "step": 4914 + }, + { + "epoch": 0.2458, + "grad_norm": 4.110381126403809, + "learning_rate": 1.8732624548099204e-05, + "loss": 0.7044, + "step": 4916 + }, + { + "epoch": 0.2459, + "grad_norm": 3.001119613647461, + "learning_rate": 1.8730923192316903e-05, + "loss": 0.6926, + "step": 4918 + }, + { + "epoch": 0.246, + "grad_norm": 5.348030090332031, + "learning_rate": 1.8729220772698096e-05, + "loss": 1.0172, + "step": 4920 + }, + { + "epoch": 0.2461, + "grad_norm": 5.895864009857178, + "learning_rate": 1.872751728945022e-05, + "loss": 1.2512, + "step": 4922 + }, + { + "epoch": 0.2462, + "grad_norm": 10.934713363647461, + "learning_rate": 1.8725812742780832e-05, + "loss": 1.1446, + "step": 4924 + }, + { + "epoch": 0.2463, + "grad_norm": 2.406832218170166, + "learning_rate": 1.872410713289763e-05, + "loss": 0.6216, + "step": 4926 + }, + { + "epoch": 0.2464, + "grad_norm": 6.730684280395508, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.523, + "step": 4928 + }, + { + "epoch": 0.2465, + "grad_norm": 7.843965530395508, + "learning_rate": 1.8720692724321207e-05, + "loss": 1.1256, + "step": 4930 + }, + { + "epoch": 0.2466, + "grad_norm": 6.861769676208496, + "learning_rate": 1.871898392604402e-05, + "loss": 1.029, + "step": 4932 + }, + { + "epoch": 0.2467, + "grad_norm": 8.046610832214355, + "learning_rate": 1.8717274065385092e-05, + "loss": 1.6782, + "step": 4934 + }, + { + "epoch": 0.2468, + "grad_norm": 10.603029251098633, + "learning_rate": 1.8715563142552758e-05, + "loss": 1.3419, + "step": 4936 + }, + { + "epoch": 0.2469, + "grad_norm": 18.863662719726562, + "learning_rate": 1.8713851157755495e-05, + "loss": 2.0061, + "step": 4938 + }, + { + "epoch": 0.247, + "grad_norm": 11.460387229919434, + "learning_rate": 1.8712138111201898e-05, + "loss": 1.0389, + "step": 4940 + }, + { + "epoch": 0.2471, + "grad_norm": 3.057176351547241, + "learning_rate": 1.8710424003100698e-05, + "loss": 1.0063, + "step": 4942 + }, + { + "epoch": 0.2472, + "grad_norm": 2.377750873565674, + "learning_rate": 1.8708708833660755e-05, + "loss": 1.1163, + "step": 4944 + }, + { + "epoch": 0.2473, + "grad_norm": 4.293394088745117, + "learning_rate": 1.8706992603091057e-05, + "loss": 1.2361, + "step": 4946 + }, + { + "epoch": 0.2474, + "grad_norm": 3.2973129749298096, + "learning_rate": 1.8705275311600724e-05, + "loss": 1.5419, + "step": 4948 + }, + { + "epoch": 0.2475, + "grad_norm": 5.6204681396484375, + "learning_rate": 1.8703556959398998e-05, + "loss": 0.5422, + "step": 4950 + }, + { + "epoch": 0.2476, + "grad_norm": 2.8142662048339844, + "learning_rate": 1.870183754669526e-05, + "loss": 1.1015, + "step": 4952 + }, + { + "epoch": 0.2477, + "grad_norm": 2.6998958587646484, + "learning_rate": 1.870011707369901e-05, + "loss": 1.3933, + "step": 4954 + }, + { + "epoch": 0.2478, + "grad_norm": 3.769869565963745, + "learning_rate": 1.8698395540619883e-05, + "loss": 0.456, + "step": 4956 + }, + { + "epoch": 0.2479, + "grad_norm": 5.179622650146484, + "learning_rate": 1.8696672947667648e-05, + "loss": 0.988, + "step": 4958 + }, + { + "epoch": 0.248, + "grad_norm": 5.244435787200928, + "learning_rate": 1.869494929505219e-05, + "loss": 1.1515, + "step": 4960 + }, + { + "epoch": 0.2481, + "grad_norm": 11.128602981567383, + "learning_rate": 1.869322458298354e-05, + "loss": 0.7827, + "step": 4962 + }, + { + "epoch": 0.2482, + "grad_norm": 6.174973964691162, + "learning_rate": 1.869149881167184e-05, + "loss": 0.8348, + "step": 4964 + }, + { + "epoch": 0.2483, + "grad_norm": 5.534095764160156, + "learning_rate": 1.8689771981327377e-05, + "loss": 1.0446, + "step": 4966 + }, + { + "epoch": 0.2484, + "grad_norm": 8.152544021606445, + "learning_rate": 1.8688044092160554e-05, + "loss": 1.4425, + "step": 4968 + }, + { + "epoch": 0.2485, + "grad_norm": 8.14814567565918, + "learning_rate": 1.8686315144381914e-05, + "loss": 0.8173, + "step": 4970 + }, + { + "epoch": 0.2486, + "grad_norm": 3.3756182193756104, + "learning_rate": 1.8684585138202122e-05, + "loss": 1.0289, + "step": 4972 + }, + { + "epoch": 0.2487, + "grad_norm": 5.694324493408203, + "learning_rate": 1.8682854073831974e-05, + "loss": 1.0373, + "step": 4974 + }, + { + "epoch": 0.2488, + "grad_norm": 8.755205154418945, + "learning_rate": 1.8681121951482397e-05, + "loss": 1.1288, + "step": 4976 + }, + { + "epoch": 0.2489, + "grad_norm": 4.570581912994385, + "learning_rate": 1.8679388771364438e-05, + "loss": 1.3171, + "step": 4978 + }, + { + "epoch": 0.249, + "grad_norm": 2.867494583129883, + "learning_rate": 1.8677654533689287e-05, + "loss": 1.2185, + "step": 4980 + }, + { + "epoch": 0.2491, + "grad_norm": 2.176805019378662, + "learning_rate": 1.867591923866825e-05, + "loss": 0.2317, + "step": 4982 + }, + { + "epoch": 0.2492, + "grad_norm": 5.572793483734131, + "learning_rate": 1.8674182886512776e-05, + "loss": 0.8702, + "step": 4984 + }, + { + "epoch": 0.2493, + "grad_norm": 5.84930419921875, + "learning_rate": 1.8672445477434428e-05, + "loss": 0.5903, + "step": 4986 + }, + { + "epoch": 0.2494, + "grad_norm": 6.536047458648682, + "learning_rate": 1.86707070116449e-05, + "loss": 1.2626, + "step": 4988 + }, + { + "epoch": 0.2495, + "grad_norm": 3.8931431770324707, + "learning_rate": 1.866896748935603e-05, + "loss": 0.8096, + "step": 4990 + }, + { + "epoch": 0.2496, + "grad_norm": 7.645061016082764, + "learning_rate": 1.8667226910779767e-05, + "loss": 1.2195, + "step": 4992 + }, + { + "epoch": 0.2497, + "grad_norm": 4.809864521026611, + "learning_rate": 1.866548527612819e-05, + "loss": 0.8381, + "step": 4994 + }, + { + "epoch": 0.2498, + "grad_norm": 2.229883909225464, + "learning_rate": 1.866374258561352e-05, + "loss": 1.576, + "step": 4996 + }, + { + "epoch": 0.2499, + "grad_norm": 2.5956242084503174, + "learning_rate": 1.8661998839448096e-05, + "loss": 0.7, + "step": 4998 + }, + { + "epoch": 0.25, + "grad_norm": 1.8476533889770508, + "learning_rate": 1.866025403784439e-05, + "loss": 1.16, + "step": 5000 + }, + { + "epoch": 0.2501, + "grad_norm": 1.8899433612823486, + "learning_rate": 1.8658508181014996e-05, + "loss": 1.0044, + "step": 5002 + }, + { + "epoch": 0.2502, + "grad_norm": 4.4291486740112305, + "learning_rate": 1.8656761269172645e-05, + "loss": 0.9005, + "step": 5004 + }, + { + "epoch": 0.2503, + "grad_norm": 3.633254051208496, + "learning_rate": 1.8655013302530193e-05, + "loss": 1.0008, + "step": 5006 + }, + { + "epoch": 0.2504, + "grad_norm": 3.0917656421661377, + "learning_rate": 1.8653264281300622e-05, + "loss": 0.8022, + "step": 5008 + }, + { + "epoch": 0.2505, + "grad_norm": 2.761812925338745, + "learning_rate": 1.8651514205697046e-05, + "loss": 1.3285, + "step": 5010 + }, + { + "epoch": 0.2506, + "grad_norm": 2.681440830230713, + "learning_rate": 1.864976307593271e-05, + "loss": 0.5635, + "step": 5012 + }, + { + "epoch": 0.2507, + "grad_norm": 3.692549228668213, + "learning_rate": 1.864801089222098e-05, + "loss": 1.0176, + "step": 5014 + }, + { + "epoch": 0.2508, + "grad_norm": 5.582091808319092, + "learning_rate": 1.864625765477535e-05, + "loss": 1.0938, + "step": 5016 + }, + { + "epoch": 0.2509, + "grad_norm": 4.730898857116699, + "learning_rate": 1.8644503363809456e-05, + "loss": 1.1031, + "step": 5018 + }, + { + "epoch": 0.251, + "grad_norm": 4.427846908569336, + "learning_rate": 1.864274801953705e-05, + "loss": 0.8496, + "step": 5020 + }, + { + "epoch": 0.2511, + "grad_norm": 4.82796049118042, + "learning_rate": 1.864099162217201e-05, + "loss": 1.7229, + "step": 5022 + }, + { + "epoch": 0.2512, + "grad_norm": 5.337724208831787, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.8503, + "step": 5024 + }, + { + "epoch": 0.2513, + "grad_norm": 4.939192771911621, + "learning_rate": 1.863747566902022e-05, + "loss": 0.0953, + "step": 5026 + }, + { + "epoch": 0.2514, + "grad_norm": 9.92911148071289, + "learning_rate": 1.8635716113661876e-05, + "loss": 1.38, + "step": 5028 + }, + { + "epoch": 0.2515, + "grad_norm": 1.9879530668258667, + "learning_rate": 1.8633955506067717e-05, + "loss": 1.1784, + "step": 5030 + }, + { + "epoch": 0.2516, + "grad_norm": 1.0967121124267578, + "learning_rate": 1.863219384645227e-05, + "loss": 0.6969, + "step": 5032 + }, + { + "epoch": 0.2517, + "grad_norm": 3.1152560710906982, + "learning_rate": 1.863043113503019e-05, + "loss": 0.6237, + "step": 5034 + }, + { + "epoch": 0.2518, + "grad_norm": 2.0844643115997314, + "learning_rate": 1.862866737201625e-05, + "loss": 0.9572, + "step": 5036 + }, + { + "epoch": 0.2519, + "grad_norm": 0.9475278258323669, + "learning_rate": 1.862690255762537e-05, + "loss": 0.4821, + "step": 5038 + }, + { + "epoch": 0.252, + "grad_norm": 1.3529410362243652, + "learning_rate": 1.8625136692072577e-05, + "loss": 0.6653, + "step": 5040 + }, + { + "epoch": 0.2521, + "grad_norm": 8.07202434539795, + "learning_rate": 1.862336977557304e-05, + "loss": 1.2116, + "step": 5042 + }, + { + "epoch": 0.2522, + "grad_norm": 1.9327176809310913, + "learning_rate": 1.862160180834206e-05, + "loss": 0.4834, + "step": 5044 + }, + { + "epoch": 0.2523, + "grad_norm": 3.207608938217163, + "learning_rate": 1.8619832790595045e-05, + "loss": 0.8764, + "step": 5046 + }, + { + "epoch": 0.2524, + "grad_norm": 4.609564304351807, + "learning_rate": 1.861806272254755e-05, + "loss": 0.8155, + "step": 5048 + }, + { + "epoch": 0.2525, + "grad_norm": 7.206010341644287, + "learning_rate": 1.861629160441526e-05, + "loss": 0.8275, + "step": 5050 + }, + { + "epoch": 0.2526, + "grad_norm": 2.8138530254364014, + "learning_rate": 1.8614519436413968e-05, + "loss": 1.0503, + "step": 5052 + }, + { + "epoch": 0.2527, + "grad_norm": 4.016881465911865, + "learning_rate": 1.861274621875962e-05, + "loss": 1.4535, + "step": 5054 + }, + { + "epoch": 0.2528, + "grad_norm": 5.346767902374268, + "learning_rate": 1.8610971951668265e-05, + "loss": 1.2194, + "step": 5056 + }, + { + "epoch": 0.2529, + "grad_norm": 2.8086953163146973, + "learning_rate": 1.86091966353561e-05, + "loss": 1.2655, + "step": 5058 + }, + { + "epoch": 0.253, + "grad_norm": 3.734388589859009, + "learning_rate": 1.860742027003944e-05, + "loss": 1.0937, + "step": 5060 + }, + { + "epoch": 0.2531, + "grad_norm": 2.7529795169830322, + "learning_rate": 1.8605642855934727e-05, + "loss": 1.4021, + "step": 5062 + }, + { + "epoch": 0.2532, + "grad_norm": 7.373759746551514, + "learning_rate": 1.8603864393258534e-05, + "loss": 1.8031, + "step": 5064 + }, + { + "epoch": 0.2533, + "grad_norm": 4.04988956451416, + "learning_rate": 1.8602084882227568e-05, + "loss": 1.196, + "step": 5066 + }, + { + "epoch": 0.2534, + "grad_norm": 3.2891430854797363, + "learning_rate": 1.860030432305865e-05, + "loss": 0.9425, + "step": 5068 + }, + { + "epoch": 0.2535, + "grad_norm": 4.499414443969727, + "learning_rate": 1.8598522715968736e-05, + "loss": 1.0898, + "step": 5070 + }, + { + "epoch": 0.2536, + "grad_norm": 5.174615859985352, + "learning_rate": 1.8596740061174912e-05, + "loss": 1.1833, + "step": 5072 + }, + { + "epoch": 0.2537, + "grad_norm": 2.6696701049804688, + "learning_rate": 1.859495635889439e-05, + "loss": 0.4564, + "step": 5074 + }, + { + "epoch": 0.2538, + "grad_norm": 3.160609006881714, + "learning_rate": 1.8593171609344505e-05, + "loss": 1.1226, + "step": 5076 + }, + { + "epoch": 0.2539, + "grad_norm": 1.692405104637146, + "learning_rate": 1.8591385812742724e-05, + "loss": 0.8043, + "step": 5078 + }, + { + "epoch": 0.254, + "grad_norm": 5.1667633056640625, + "learning_rate": 1.8589598969306646e-05, + "loss": 1.171, + "step": 5080 + }, + { + "epoch": 0.2541, + "grad_norm": 7.754960536956787, + "learning_rate": 1.8587811079253985e-05, + "loss": 0.9986, + "step": 5082 + }, + { + "epoch": 0.2542, + "grad_norm": 11.08651351928711, + "learning_rate": 1.8586022142802597e-05, + "loss": 0.9011, + "step": 5084 + }, + { + "epoch": 0.2543, + "grad_norm": 24.27686882019043, + "learning_rate": 1.8584232160170452e-05, + "loss": 1.3344, + "step": 5086 + }, + { + "epoch": 0.2544, + "grad_norm": 0.22326301038265228, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.1682, + "step": 5088 + }, + { + "epoch": 0.2545, + "grad_norm": 3.256504774093628, + "learning_rate": 1.858064905723645e-05, + "loss": 0.5104, + "step": 5090 + }, + { + "epoch": 0.2546, + "grad_norm": 3.2844150066375732, + "learning_rate": 1.8578855937371176e-05, + "loss": 1.2563, + "step": 5092 + }, + { + "epoch": 0.2547, + "grad_norm": 3.003678321838379, + "learning_rate": 1.857706177219833e-05, + "loss": 0.4506, + "step": 5094 + }, + { + "epoch": 0.2548, + "grad_norm": 7.528975486755371, + "learning_rate": 1.8575266561936526e-05, + "loss": 1.8109, + "step": 5096 + }, + { + "epoch": 0.2549, + "grad_norm": 9.873383522033691, + "learning_rate": 1.85734703068045e-05, + "loss": 1.2408, + "step": 5098 + }, + { + "epoch": 0.255, + "grad_norm": 11.03995132446289, + "learning_rate": 1.8571673007021124e-05, + "loss": 1.0656, + "step": 5100 + }, + { + "epoch": 0.2551, + "grad_norm": 8.29476261138916, + "learning_rate": 1.8569874662805394e-05, + "loss": 1.6344, + "step": 5102 + }, + { + "epoch": 0.2552, + "grad_norm": 4.3695807456970215, + "learning_rate": 1.856807527437643e-05, + "loss": 1.173, + "step": 5104 + }, + { + "epoch": 0.2553, + "grad_norm": 3.2075018882751465, + "learning_rate": 1.8566274841953485e-05, + "loss": 0.5204, + "step": 5106 + }, + { + "epoch": 0.2554, + "grad_norm": 2.661424398422241, + "learning_rate": 1.8564473365755936e-05, + "loss": 1.3499, + "step": 5108 + }, + { + "epoch": 0.2555, + "grad_norm": 8.154353141784668, + "learning_rate": 1.8562670846003283e-05, + "loss": 1.2883, + "step": 5110 + }, + { + "epoch": 0.2556, + "grad_norm": 3.644918441772461, + "learning_rate": 1.8560867282915164e-05, + "loss": 1.1251, + "step": 5112 + }, + { + "epoch": 0.2557, + "grad_norm": 3.631635904312134, + "learning_rate": 1.855906267671133e-05, + "loss": 0.8665, + "step": 5114 + }, + { + "epoch": 0.2558, + "grad_norm": 2.6284894943237305, + "learning_rate": 1.8557257027611677e-05, + "loss": 0.7075, + "step": 5116 + }, + { + "epoch": 0.2559, + "grad_norm": 8.602035522460938, + "learning_rate": 1.8555450335836206e-05, + "loss": 1.3068, + "step": 5118 + }, + { + "epoch": 0.256, + "grad_norm": 5.44516658782959, + "learning_rate": 1.855364260160507e-05, + "loss": 0.8862, + "step": 5120 + }, + { + "epoch": 0.2561, + "grad_norm": 4.219476222991943, + "learning_rate": 1.8551833825138522e-05, + "loss": 1.3356, + "step": 5122 + }, + { + "epoch": 0.2562, + "grad_norm": 2.6865992546081543, + "learning_rate": 1.8550024006656967e-05, + "loss": 0.652, + "step": 5124 + }, + { + "epoch": 0.2563, + "grad_norm": 5.555624008178711, + "learning_rate": 1.854821314638092e-05, + "loss": 0.9031, + "step": 5126 + }, + { + "epoch": 0.2564, + "grad_norm": 3.5138375759124756, + "learning_rate": 1.854640124453103e-05, + "loss": 1.192, + "step": 5128 + }, + { + "epoch": 0.2565, + "grad_norm": 3.3821327686309814, + "learning_rate": 1.8544588301328077e-05, + "loss": 0.6492, + "step": 5130 + }, + { + "epoch": 0.2566, + "grad_norm": 1.5934169292449951, + "learning_rate": 1.8542774316992953e-05, + "loss": 1.3618, + "step": 5132 + }, + { + "epoch": 0.2567, + "grad_norm": 3.152277708053589, + "learning_rate": 1.8540959291746694e-05, + "loss": 0.7674, + "step": 5134 + }, + { + "epoch": 0.2568, + "grad_norm": 3.1281466484069824, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.7134, + "step": 5136 + }, + { + "epoch": 0.2569, + "grad_norm": 12.027440071105957, + "learning_rate": 1.8537326119405507e-05, + "loss": 1.507, + "step": 5138 + }, + { + "epoch": 0.257, + "grad_norm": 2.6452183723449707, + "learning_rate": 1.8535507972753275e-05, + "loss": 1.2587, + "step": 5140 + }, + { + "epoch": 0.2571, + "grad_norm": 0.16642001271247864, + "learning_rate": 1.853368878607529e-05, + "loss": 0.5822, + "step": 5142 + }, + { + "epoch": 0.2572, + "grad_norm": 5.947177410125732, + "learning_rate": 1.8531868559593205e-05, + "loss": 0.7229, + "step": 5144 + }, + { + "epoch": 0.2573, + "grad_norm": 3.555959939956665, + "learning_rate": 1.8530047293528818e-05, + "loss": 0.9177, + "step": 5146 + }, + { + "epoch": 0.2574, + "grad_norm": 9.145088195800781, + "learning_rate": 1.8528224988104044e-05, + "loss": 0.8067, + "step": 5148 + }, + { + "epoch": 0.2575, + "grad_norm": 6.092648983001709, + "learning_rate": 1.8526401643540924e-05, + "loss": 0.2463, + "step": 5150 + }, + { + "epoch": 0.2576, + "grad_norm": 2.9129953384399414, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.8694, + "step": 5152 + }, + { + "epoch": 0.2577, + "grad_norm": 3.8088817596435547, + "learning_rate": 1.8522751837888448e-05, + "loss": 0.7592, + "step": 5154 + }, + { + "epoch": 0.2578, + "grad_norm": 19.66749382019043, + "learning_rate": 1.8520925377243812e-05, + "loss": 1.5127, + "step": 5156 + }, + { + "epoch": 0.2579, + "grad_norm": 6.12245512008667, + "learning_rate": 1.851909787835026e-05, + "loss": 0.5207, + "step": 5158 + }, + { + "epoch": 0.258, + "grad_norm": 2.6339662075042725, + "learning_rate": 1.851726934143048e-05, + "loss": 0.9852, + "step": 5160 + }, + { + "epoch": 0.2581, + "grad_norm": 11.662116050720215, + "learning_rate": 1.851543976670726e-05, + "loss": 1.5473, + "step": 5162 + }, + { + "epoch": 0.2582, + "grad_norm": 3.087620258331299, + "learning_rate": 1.8513609154403535e-05, + "loss": 0.7655, + "step": 5164 + }, + { + "epoch": 0.2583, + "grad_norm": 6.767923355102539, + "learning_rate": 1.8511777504742364e-05, + "loss": 1.1637, + "step": 5166 + }, + { + "epoch": 0.2584, + "grad_norm": 9.272624969482422, + "learning_rate": 1.850994481794692e-05, + "loss": 1.3915, + "step": 5168 + }, + { + "epoch": 0.2585, + "grad_norm": 5.977593421936035, + "learning_rate": 1.8508111094240516e-05, + "loss": 0.9558, + "step": 5170 + }, + { + "epoch": 0.2586, + "grad_norm": 5.955809593200684, + "learning_rate": 1.850627633384658e-05, + "loss": 1.044, + "step": 5172 + }, + { + "epoch": 0.2587, + "grad_norm": 9.691577911376953, + "learning_rate": 1.850444053698867e-05, + "loss": 1.3851, + "step": 5174 + }, + { + "epoch": 0.2588, + "grad_norm": 8.32938289642334, + "learning_rate": 1.8502603703890488e-05, + "loss": 2.3309, + "step": 5176 + }, + { + "epoch": 0.2589, + "grad_norm": 2.5457990169525146, + "learning_rate": 1.850076583477583e-05, + "loss": 0.703, + "step": 5178 + }, + { + "epoch": 0.259, + "grad_norm": 5.97235107421875, + "learning_rate": 1.849892692986864e-05, + "loss": 1.1236, + "step": 5180 + }, + { + "epoch": 0.2591, + "grad_norm": 2.383470296859741, + "learning_rate": 1.8497086989392986e-05, + "loss": 1.1059, + "step": 5182 + }, + { + "epoch": 0.2592, + "grad_norm": 7.335219383239746, + "learning_rate": 1.8495246013573057e-05, + "loss": 1.2156, + "step": 5184 + }, + { + "epoch": 0.2593, + "grad_norm": 2.722771644592285, + "learning_rate": 1.8493404002633167e-05, + "loss": 1.177, + "step": 5186 + }, + { + "epoch": 0.2594, + "grad_norm": 2.7192680835723877, + "learning_rate": 1.8491560956797766e-05, + "loss": 1.4745, + "step": 5188 + }, + { + "epoch": 0.2595, + "grad_norm": 9.532959938049316, + "learning_rate": 1.8489716876291417e-05, + "loss": 1.5626, + "step": 5190 + }, + { + "epoch": 0.2596, + "grad_norm": 3.41239333152771, + "learning_rate": 1.848787176133882e-05, + "loss": 1.1617, + "step": 5192 + }, + { + "epoch": 0.2597, + "grad_norm": 4.552682876586914, + "learning_rate": 1.8486025612164796e-05, + "loss": 1.0391, + "step": 5194 + }, + { + "epoch": 0.2598, + "grad_norm": 8.161871910095215, + "learning_rate": 1.848417842899429e-05, + "loss": 0.9805, + "step": 5196 + }, + { + "epoch": 0.2599, + "grad_norm": 4.137243270874023, + "learning_rate": 1.8482330212052377e-05, + "loss": 1.5263, + "step": 5198 + }, + { + "epoch": 0.26, + "grad_norm": 11.290227890014648, + "learning_rate": 1.848048096156426e-05, + "loss": 1.1309, + "step": 5200 + }, + { + "epoch": 0.2601, + "grad_norm": 0.14390045404434204, + "learning_rate": 1.8478630677755264e-05, + "loss": 1.0875, + "step": 5202 + }, + { + "epoch": 0.2602, + "grad_norm": 2.455298662185669, + "learning_rate": 1.8476779360850833e-05, + "loss": 0.5943, + "step": 5204 + }, + { + "epoch": 0.2603, + "grad_norm": 2.484677314758301, + "learning_rate": 1.8474927011076554e-05, + "loss": 1.0639, + "step": 5206 + }, + { + "epoch": 0.2604, + "grad_norm": 7.381721496582031, + "learning_rate": 1.8473073628658123e-05, + "loss": 1.7471, + "step": 5208 + }, + { + "epoch": 0.2605, + "grad_norm": 3.852033853530884, + "learning_rate": 1.8471219213821374e-05, + "loss": 1.3441, + "step": 5210 + }, + { + "epoch": 0.2606, + "grad_norm": 1.651127815246582, + "learning_rate": 1.8469363766792258e-05, + "loss": 0.5486, + "step": 5212 + }, + { + "epoch": 0.2607, + "grad_norm": 0.21382173895835876, + "learning_rate": 1.8467507287796857e-05, + "loss": 0.5314, + "step": 5214 + }, + { + "epoch": 0.2608, + "grad_norm": 3.112837314605713, + "learning_rate": 1.8465649777061377e-05, + "loss": 1.119, + "step": 5216 + }, + { + "epoch": 0.2609, + "grad_norm": 3.008004665374756, + "learning_rate": 1.8463791234812152e-05, + "loss": 0.368, + "step": 5218 + }, + { + "epoch": 0.261, + "grad_norm": 5.188312530517578, + "learning_rate": 1.8461931661275642e-05, + "loss": 0.5761, + "step": 5220 + }, + { + "epoch": 0.2611, + "grad_norm": 5.555667877197266, + "learning_rate": 1.8460071056678424e-05, + "loss": 1.8698, + "step": 5222 + }, + { + "epoch": 0.2612, + "grad_norm": 4.005917549133301, + "learning_rate": 1.8458209421247208e-05, + "loss": 0.7536, + "step": 5224 + }, + { + "epoch": 0.2613, + "grad_norm": 0.26645511388778687, + "learning_rate": 1.8456346755208834e-05, + "loss": 0.4312, + "step": 5226 + }, + { + "epoch": 0.2614, + "grad_norm": 2.945828914642334, + "learning_rate": 1.8454483058790254e-05, + "loss": 0.507, + "step": 5228 + }, + { + "epoch": 0.2615, + "grad_norm": 4.233397483825684, + "learning_rate": 1.8452618332218563e-05, + "loss": 0.504, + "step": 5230 + }, + { + "epoch": 0.2616, + "grad_norm": 11.92428970336914, + "learning_rate": 1.8450752575720967e-05, + "loss": 1.435, + "step": 5232 + }, + { + "epoch": 0.2617, + "grad_norm": 16.86953353881836, + "learning_rate": 1.8448885789524802e-05, + "loss": 1.339, + "step": 5234 + }, + { + "epoch": 0.2618, + "grad_norm": 3.1793715953826904, + "learning_rate": 1.844701797385753e-05, + "loss": 0.8011, + "step": 5236 + }, + { + "epoch": 0.2619, + "grad_norm": 13.40442180633545, + "learning_rate": 1.8445149128946744e-05, + "loss": 1.3789, + "step": 5238 + }, + { + "epoch": 0.262, + "grad_norm": 3.5498647689819336, + "learning_rate": 1.8443279255020153e-05, + "loss": 0.8105, + "step": 5240 + }, + { + "epoch": 0.2621, + "grad_norm": 2.480803966522217, + "learning_rate": 1.8441408352305595e-05, + "loss": 0.4726, + "step": 5242 + }, + { + "epoch": 0.2622, + "grad_norm": 4.234928607940674, + "learning_rate": 1.8439536421031035e-05, + "loss": 0.8138, + "step": 5244 + }, + { + "epoch": 0.2623, + "grad_norm": 14.584441184997559, + "learning_rate": 1.8437663461424563e-05, + "loss": 1.4723, + "step": 5246 + }, + { + "epoch": 0.2624, + "grad_norm": 1.4700027704238892, + "learning_rate": 1.843578947371439e-05, + "loss": 0.7125, + "step": 5248 + }, + { + "epoch": 0.2625, + "grad_norm": 4.329739093780518, + "learning_rate": 1.843391445812886e-05, + "loss": 1.154, + "step": 5250 + }, + { + "epoch": 0.2626, + "grad_norm": 4.649634838104248, + "learning_rate": 1.8432038414896432e-05, + "loss": 1.4249, + "step": 5252 + }, + { + "epoch": 0.2627, + "grad_norm": 6.550378322601318, + "learning_rate": 1.8430161344245708e-05, + "loss": 1.2964, + "step": 5254 + }, + { + "epoch": 0.2628, + "grad_norm": 4.713897228240967, + "learning_rate": 1.842828324640539e-05, + "loss": 1.0195, + "step": 5256 + }, + { + "epoch": 0.2629, + "grad_norm": 11.930916786193848, + "learning_rate": 1.8426404121604324e-05, + "loss": 1.3792, + "step": 5258 + }, + { + "epoch": 0.263, + "grad_norm": 2.616177797317505, + "learning_rate": 1.842452397007148e-05, + "loss": 1.4366, + "step": 5260 + }, + { + "epoch": 0.2631, + "grad_norm": 3.4381215572357178, + "learning_rate": 1.842264279203594e-05, + "loss": 0.7287, + "step": 5262 + }, + { + "epoch": 0.2632, + "grad_norm": 5.909813404083252, + "learning_rate": 1.8420760587726925e-05, + "loss": 1.5262, + "step": 5264 + }, + { + "epoch": 0.2633, + "grad_norm": 3.4046285152435303, + "learning_rate": 1.8418877357373776e-05, + "loss": 1.157, + "step": 5266 + }, + { + "epoch": 0.2634, + "grad_norm": 3.0482120513916016, + "learning_rate": 1.8416993101205957e-05, + "loss": 1.3468, + "step": 5268 + }, + { + "epoch": 0.2635, + "grad_norm": 3.0076639652252197, + "learning_rate": 1.8415107819453065e-05, + "loss": 1.0202, + "step": 5270 + }, + { + "epoch": 0.2636, + "grad_norm": 13.04200553894043, + "learning_rate": 1.8413221512344805e-05, + "loss": 1.231, + "step": 5272 + }, + { + "epoch": 0.2637, + "grad_norm": 2.7025017738342285, + "learning_rate": 1.8411334180111027e-05, + "loss": 1.4723, + "step": 5274 + }, + { + "epoch": 0.2638, + "grad_norm": 3.486161708831787, + "learning_rate": 1.8409445822981694e-05, + "loss": 0.9052, + "step": 5276 + }, + { + "epoch": 0.2639, + "grad_norm": 5.492757320404053, + "learning_rate": 1.8407556441186895e-05, + "loss": 0.772, + "step": 5278 + }, + { + "epoch": 0.264, + "grad_norm": 1.6070204973220825, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.1811, + "step": 5280 + }, + { + "epoch": 0.2641, + "grad_norm": 4.169613838195801, + "learning_rate": 1.8403774604521885e-05, + "loss": 1.5719, + "step": 5282 + }, + { + "epoch": 0.2642, + "grad_norm": 2.926241397857666, + "learning_rate": 1.8401882150112485e-05, + "loss": 0.9249, + "step": 5284 + }, + { + "epoch": 0.2643, + "grad_norm": 3.003417491912842, + "learning_rate": 1.8399988671959227e-05, + "loss": 0.7892, + "step": 5286 + }, + { + "epoch": 0.2644, + "grad_norm": 2.597376823425293, + "learning_rate": 1.839809417029283e-05, + "loss": 1.7713, + "step": 5288 + }, + { + "epoch": 0.2645, + "grad_norm": 8.234180450439453, + "learning_rate": 1.8396198645344133e-05, + "loss": 0.4861, + "step": 5290 + }, + { + "epoch": 0.2646, + "grad_norm": 10.357369422912598, + "learning_rate": 1.8394302097344103e-05, + "loss": 1.3448, + "step": 5292 + }, + { + "epoch": 0.2647, + "grad_norm": 2.0996859073638916, + "learning_rate": 1.8392404526523816e-05, + "loss": 0.9743, + "step": 5294 + }, + { + "epoch": 0.2648, + "grad_norm": 6.879647254943848, + "learning_rate": 1.8390505933114503e-05, + "loss": 1.2529, + "step": 5296 + }, + { + "epoch": 0.2649, + "grad_norm": 3.1912221908569336, + "learning_rate": 1.838860631734749e-05, + "loss": 0.8552, + "step": 5298 + }, + { + "epoch": 0.265, + "grad_norm": 3.7050061225891113, + "learning_rate": 1.8386705679454243e-05, + "loss": 0.9372, + "step": 5300 + }, + { + "epoch": 0.2651, + "grad_norm": 3.083828926086426, + "learning_rate": 1.8384804019666348e-05, + "loss": 0.9007, + "step": 5302 + }, + { + "epoch": 0.2652, + "grad_norm": 10.239202499389648, + "learning_rate": 1.8382901338215515e-05, + "loss": 0.4104, + "step": 5304 + }, + { + "epoch": 0.2653, + "grad_norm": 6.221431255340576, + "learning_rate": 1.8380997635333587e-05, + "loss": 1.0264, + "step": 5306 + }, + { + "epoch": 0.2654, + "grad_norm": 5.696591854095459, + "learning_rate": 1.8379092911252515e-05, + "loss": 1.1004, + "step": 5308 + }, + { + "epoch": 0.2655, + "grad_norm": 4.577762126922607, + "learning_rate": 1.837718716620439e-05, + "loss": 1.6001, + "step": 5310 + }, + { + "epoch": 0.2656, + "grad_norm": 16.897348403930664, + "learning_rate": 1.837528040042142e-05, + "loss": 1.8081, + "step": 5312 + }, + { + "epoch": 0.2657, + "grad_norm": 3.850135326385498, + "learning_rate": 1.8373372614135935e-05, + "loss": 1.3513, + "step": 5314 + }, + { + "epoch": 0.2658, + "grad_norm": 3.40458083152771, + "learning_rate": 1.83714638075804e-05, + "loss": 1.4265, + "step": 5316 + }, + { + "epoch": 0.2659, + "grad_norm": 6.923820495605469, + "learning_rate": 1.8369553980987392e-05, + "loss": 1.721, + "step": 5318 + }, + { + "epoch": 0.266, + "grad_norm": 2.201185464859009, + "learning_rate": 1.836764313458962e-05, + "loss": 0.8036, + "step": 5320 + }, + { + "epoch": 0.2661, + "grad_norm": 5.357707977294922, + "learning_rate": 1.8365731268619912e-05, + "loss": 1.1247, + "step": 5322 + }, + { + "epoch": 0.2662, + "grad_norm": 3.0255346298217773, + "learning_rate": 1.8363818383311226e-05, + "loss": 0.7708, + "step": 5324 + }, + { + "epoch": 0.2663, + "grad_norm": 2.8823111057281494, + "learning_rate": 1.836190447889664e-05, + "loss": 0.8455, + "step": 5326 + }, + { + "epoch": 0.2664, + "grad_norm": 8.586832046508789, + "learning_rate": 1.8359989555609355e-05, + "loss": 2.139, + "step": 5328 + }, + { + "epoch": 0.2665, + "grad_norm": 2.093416213989258, + "learning_rate": 1.8358073613682705e-05, + "loss": 1.0162, + "step": 5330 + }, + { + "epoch": 0.2666, + "grad_norm": 11.542142868041992, + "learning_rate": 1.8356156653350138e-05, + "loss": 1.1902, + "step": 5332 + }, + { + "epoch": 0.2667, + "grad_norm": 4.392592906951904, + "learning_rate": 1.8354238674845225e-05, + "loss": 2.5384, + "step": 5334 + }, + { + "epoch": 0.2668, + "grad_norm": 4.519916534423828, + "learning_rate": 1.8352319678401677e-05, + "loss": 0.9599, + "step": 5336 + }, + { + "epoch": 0.2669, + "grad_norm": 2.964209794998169, + "learning_rate": 1.8350399664253307e-05, + "loss": 0.9538, + "step": 5338 + }, + { + "epoch": 0.267, + "grad_norm": 2.1713480949401855, + "learning_rate": 1.8348478632634067e-05, + "loss": 0.8103, + "step": 5340 + }, + { + "epoch": 0.2671, + "grad_norm": 3.885773181915283, + "learning_rate": 1.8346556583778032e-05, + "loss": 0.5951, + "step": 5342 + }, + { + "epoch": 0.2672, + "grad_norm": 6.641365051269531, + "learning_rate": 1.834463351791939e-05, + "loss": 0.8219, + "step": 5344 + }, + { + "epoch": 0.2673, + "grad_norm": 7.9884748458862305, + "learning_rate": 1.8342709435292476e-05, + "loss": 0.7667, + "step": 5346 + }, + { + "epoch": 0.2674, + "grad_norm": 2.3122620582580566, + "learning_rate": 1.8340784336131715e-05, + "loss": 0.3235, + "step": 5348 + }, + { + "epoch": 0.2675, + "grad_norm": 3.1258327960968018, + "learning_rate": 1.8338858220671683e-05, + "loss": 1.0042, + "step": 5350 + }, + { + "epoch": 0.2676, + "grad_norm": 8.227709770202637, + "learning_rate": 1.8336931089147076e-05, + "loss": 0.8316, + "step": 5352 + }, + { + "epoch": 0.2677, + "grad_norm": 1.2619078159332275, + "learning_rate": 1.83350029417927e-05, + "loss": 0.5877, + "step": 5354 + }, + { + "epoch": 0.2678, + "grad_norm": 3.372330904006958, + "learning_rate": 1.83330737788435e-05, + "loss": 0.6116, + "step": 5356 + }, + { + "epoch": 0.2679, + "grad_norm": 6.067659854888916, + "learning_rate": 1.8331143600534534e-05, + "loss": 0.5725, + "step": 5358 + }, + { + "epoch": 0.268, + "grad_norm": 0.6562060117721558, + "learning_rate": 1.8329212407100996e-05, + "loss": 0.4537, + "step": 5360 + }, + { + "epoch": 0.2681, + "grad_norm": 3.453183889389038, + "learning_rate": 1.832728019877819e-05, + "loss": 0.8691, + "step": 5362 + }, + { + "epoch": 0.2682, + "grad_norm": 11.1654634475708, + "learning_rate": 1.832534697580155e-05, + "loss": 1.4079, + "step": 5364 + }, + { + "epoch": 0.2683, + "grad_norm": 3.193570613861084, + "learning_rate": 1.8323412738406638e-05, + "loss": 0.3541, + "step": 5366 + }, + { + "epoch": 0.2684, + "grad_norm": 1.7900837659835815, + "learning_rate": 1.8321477486829128e-05, + "loss": 1.6184, + "step": 5368 + }, + { + "epoch": 0.2685, + "grad_norm": 3.134800434112549, + "learning_rate": 1.8319541221304825e-05, + "loss": 0.9389, + "step": 5370 + }, + { + "epoch": 0.2686, + "grad_norm": 16.955467224121094, + "learning_rate": 1.8317603942069665e-05, + "loss": 1.9114, + "step": 5372 + }, + { + "epoch": 0.2687, + "grad_norm": 6.278289318084717, + "learning_rate": 1.8315665649359692e-05, + "loss": 0.9387, + "step": 5374 + }, + { + "epoch": 0.2688, + "grad_norm": 4.115718364715576, + "learning_rate": 1.8313726343411085e-05, + "loss": 1.2011, + "step": 5376 + }, + { + "epoch": 0.2689, + "grad_norm": 3.314824342727661, + "learning_rate": 1.8311786024460145e-05, + "loss": 1.0649, + "step": 5378 + }, + { + "epoch": 0.269, + "grad_norm": 0.10543922334909439, + "learning_rate": 1.8309844692743283e-05, + "loss": 0.1973, + "step": 5380 + }, + { + "epoch": 0.2691, + "grad_norm": 0.6729246377944946, + "learning_rate": 1.8307902348497056e-05, + "loss": 0.5304, + "step": 5382 + }, + { + "epoch": 0.2692, + "grad_norm": 2.1011691093444824, + "learning_rate": 1.830595899195813e-05, + "loss": 0.7111, + "step": 5384 + }, + { + "epoch": 0.2693, + "grad_norm": 2.7163259983062744, + "learning_rate": 1.830401462336329e-05, + "loss": 0.6528, + "step": 5386 + }, + { + "epoch": 0.2694, + "grad_norm": 11.676880836486816, + "learning_rate": 1.830206924294946e-05, + "loss": 1.7607, + "step": 5388 + }, + { + "epoch": 0.2695, + "grad_norm": 5.557282447814941, + "learning_rate": 1.8300122850953678e-05, + "loss": 1.5223, + "step": 5390 + }, + { + "epoch": 0.2696, + "grad_norm": 2.8938655853271484, + "learning_rate": 1.82981754476131e-05, + "loss": 1.2846, + "step": 5392 + }, + { + "epoch": 0.2697, + "grad_norm": 2.466162919998169, + "learning_rate": 1.8296227033165016e-05, + "loss": 0.9626, + "step": 5394 + }, + { + "epoch": 0.2698, + "grad_norm": 2.6637558937072754, + "learning_rate": 1.8294277607846834e-05, + "loss": 0.2839, + "step": 5396 + }, + { + "epoch": 0.2699, + "grad_norm": 1.8826478719711304, + "learning_rate": 1.8292327171896082e-05, + "loss": 1.1782, + "step": 5398 + }, + { + "epoch": 0.27, + "grad_norm": 3.888155460357666, + "learning_rate": 1.8290375725550417e-05, + "loss": 1.2494, + "step": 5400 + }, + { + "epoch": 0.2701, + "grad_norm": 8.01353645324707, + "learning_rate": 1.828842326904762e-05, + "loss": 1.6252, + "step": 5402 + }, + { + "epoch": 0.2702, + "grad_norm": 16.479122161865234, + "learning_rate": 1.828646980262559e-05, + "loss": 1.2528, + "step": 5404 + }, + { + "epoch": 0.2703, + "grad_norm": 4.081845760345459, + "learning_rate": 1.8284515326522347e-05, + "loss": 0.8402, + "step": 5406 + }, + { + "epoch": 0.2704, + "grad_norm": 3.0767199993133545, + "learning_rate": 1.8282559840976043e-05, + "loss": 1.1319, + "step": 5408 + }, + { + "epoch": 0.2705, + "grad_norm": 3.037815570831299, + "learning_rate": 1.8280603346224945e-05, + "loss": 1.4873, + "step": 5410 + }, + { + "epoch": 0.2706, + "grad_norm": 7.578834533691406, + "learning_rate": 1.8278645842507448e-05, + "loss": 0.4814, + "step": 5412 + }, + { + "epoch": 0.2707, + "grad_norm": 4.941849708557129, + "learning_rate": 1.8276687330062067e-05, + "loss": 1.2561, + "step": 5414 + }, + { + "epoch": 0.2708, + "grad_norm": 7.169830322265625, + "learning_rate": 1.827472780912744e-05, + "loss": 1.119, + "step": 5416 + }, + { + "epoch": 0.2709, + "grad_norm": 9.09522819519043, + "learning_rate": 1.827276727994233e-05, + "loss": 1.7401, + "step": 5418 + }, + { + "epoch": 0.271, + "grad_norm": 4.180849552154541, + "learning_rate": 1.827080574274562e-05, + "loss": 0.7192, + "step": 5420 + }, + { + "epoch": 0.2711, + "grad_norm": 11.707079887390137, + "learning_rate": 1.826884319777632e-05, + "loss": 1.1133, + "step": 5422 + }, + { + "epoch": 0.2712, + "grad_norm": 13.58092975616455, + "learning_rate": 1.8266879645273557e-05, + "loss": 2.09, + "step": 5424 + }, + { + "epoch": 0.2713, + "grad_norm": 16.0809268951416, + "learning_rate": 1.8264915085476585e-05, + "loss": 1.3058, + "step": 5426 + }, + { + "epoch": 0.2714, + "grad_norm": 6.287859916687012, + "learning_rate": 1.826294951862478e-05, + "loss": 1.1249, + "step": 5428 + }, + { + "epoch": 0.2715, + "grad_norm": 8.369025230407715, + "learning_rate": 1.8260982944957638e-05, + "loss": 0.6447, + "step": 5430 + }, + { + "epoch": 0.2716, + "grad_norm": 1.1991387605667114, + "learning_rate": 1.8259015364714786e-05, + "loss": 0.833, + "step": 5432 + }, + { + "epoch": 0.2717, + "grad_norm": 5.20031213760376, + "learning_rate": 1.8257046778135966e-05, + "loss": 1.1945, + "step": 5434 + }, + { + "epoch": 0.2718, + "grad_norm": 6.398199081420898, + "learning_rate": 1.825507718546104e-05, + "loss": 0.7754, + "step": 5436 + }, + { + "epoch": 0.2719, + "grad_norm": 5.190743446350098, + "learning_rate": 1.825310658693e-05, + "loss": 1.5581, + "step": 5438 + }, + { + "epoch": 0.272, + "grad_norm": 11.858542442321777, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.7814, + "step": 5440 + }, + { + "epoch": 0.2721, + "grad_norm": 3.719271659851074, + "learning_rate": 1.824916237326014e-05, + "loss": 0.9616, + "step": 5442 + }, + { + "epoch": 0.2722, + "grad_norm": 5.378784656524658, + "learning_rate": 1.8247188758601912e-05, + "loss": 0.8385, + "step": 5444 + }, + { + "epoch": 0.2723, + "grad_norm": 7.524791717529297, + "learning_rate": 1.8245214139048753e-05, + "loss": 0.2915, + "step": 5446 + }, + { + "epoch": 0.2724, + "grad_norm": 1.3268156051635742, + "learning_rate": 1.824323851484126e-05, + "loss": 0.5753, + "step": 5448 + }, + { + "epoch": 0.2725, + "grad_norm": 5.728011608123779, + "learning_rate": 1.8241261886220155e-05, + "loss": 1.2296, + "step": 5450 + }, + { + "epoch": 0.2726, + "grad_norm": 4.780437469482422, + "learning_rate": 1.8239284253426294e-05, + "loss": 1.3293, + "step": 5452 + }, + { + "epoch": 0.2727, + "grad_norm": 4.297122001647949, + "learning_rate": 1.823730561670064e-05, + "loss": 0.6298, + "step": 5454 + }, + { + "epoch": 0.2728, + "grad_norm": 6.73445987701416, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.6777, + "step": 5456 + }, + { + "epoch": 0.2729, + "grad_norm": 2.8146207332611084, + "learning_rate": 1.8233345332418423e-05, + "loss": 1.2562, + "step": 5458 + }, + { + "epoch": 0.273, + "grad_norm": 3.7951252460479736, + "learning_rate": 1.8231363685344422e-05, + "loss": 0.8952, + "step": 5460 + }, + { + "epoch": 0.2731, + "grad_norm": 2.696418046951294, + "learning_rate": 1.822938103530372e-05, + "loss": 0.4838, + "step": 5462 + }, + { + "epoch": 0.2732, + "grad_norm": 5.417209148406982, + "learning_rate": 1.82273973825379e-05, + "loss": 0.9438, + "step": 5464 + }, + { + "epoch": 0.2733, + "grad_norm": 4.672487258911133, + "learning_rate": 1.8225412727288668e-05, + "loss": 1.4962, + "step": 5466 + }, + { + "epoch": 0.2734, + "grad_norm": 4.951944351196289, + "learning_rate": 1.8223427069797845e-05, + "loss": 0.9671, + "step": 5468 + }, + { + "epoch": 0.2735, + "grad_norm": 0.5010573267936707, + "learning_rate": 1.8221440410307375e-05, + "loss": 0.7742, + "step": 5470 + }, + { + "epoch": 0.2736, + "grad_norm": 4.916182041168213, + "learning_rate": 1.8219452749059332e-05, + "loss": 1.1443, + "step": 5472 + }, + { + "epoch": 0.2737, + "grad_norm": 5.185580730438232, + "learning_rate": 1.8217464086295904e-05, + "loss": 1.5274, + "step": 5474 + }, + { + "epoch": 0.2738, + "grad_norm": 12.130892753601074, + "learning_rate": 1.8215474422259403e-05, + "loss": 1.4102, + "step": 5476 + }, + { + "epoch": 0.2739, + "grad_norm": 4.264741897583008, + "learning_rate": 1.8213483757192263e-05, + "loss": 1.541, + "step": 5478 + }, + { + "epoch": 0.274, + "grad_norm": 7.995655536651611, + "learning_rate": 1.821149209133704e-05, + "loss": 0.2756, + "step": 5480 + }, + { + "epoch": 0.2741, + "grad_norm": 3.2331790924072266, + "learning_rate": 1.8209499424936416e-05, + "loss": 0.5199, + "step": 5482 + }, + { + "epoch": 0.2742, + "grad_norm": 5.553971290588379, + "learning_rate": 1.820750575823319e-05, + "loss": 0.7512, + "step": 5484 + }, + { + "epoch": 0.2743, + "grad_norm": 8.164937973022461, + "learning_rate": 1.8205511091470282e-05, + "loss": 1.048, + "step": 5486 + }, + { + "epoch": 0.2744, + "grad_norm": 12.183558464050293, + "learning_rate": 1.8203515424890738e-05, + "loss": 1.849, + "step": 5488 + }, + { + "epoch": 0.2745, + "grad_norm": 3.524376392364502, + "learning_rate": 1.8201518758737726e-05, + "loss": 0.9338, + "step": 5490 + }, + { + "epoch": 0.2746, + "grad_norm": 2.3841049671173096, + "learning_rate": 1.8199521093254524e-05, + "loss": 0.8097, + "step": 5492 + }, + { + "epoch": 0.2747, + "grad_norm": 15.2372407913208, + "learning_rate": 1.8197522428684554e-05, + "loss": 0.8105, + "step": 5494 + }, + { + "epoch": 0.2748, + "grad_norm": 6.083718299865723, + "learning_rate": 1.819552276527134e-05, + "loss": 1.2215, + "step": 5496 + }, + { + "epoch": 0.2749, + "grad_norm": 5.0349016189575195, + "learning_rate": 1.8193522103258535e-05, + "loss": 0.9622, + "step": 5498 + }, + { + "epoch": 0.275, + "grad_norm": 6.333629131317139, + "learning_rate": 1.819152044288992e-05, + "loss": 1.468, + "step": 5500 + }, + { + "epoch": 0.2751, + "grad_norm": 1.2028391361236572, + "learning_rate": 1.818951778440938e-05, + "loss": 0.822, + "step": 5502 + }, + { + "epoch": 0.2752, + "grad_norm": 3.523817539215088, + "learning_rate": 1.8187514128060946e-05, + "loss": 1.0873, + "step": 5504 + }, + { + "epoch": 0.2753, + "grad_norm": 7.709884166717529, + "learning_rate": 1.818550947408875e-05, + "loss": 0.8391, + "step": 5506 + }, + { + "epoch": 0.2754, + "grad_norm": 2.9305264949798584, + "learning_rate": 1.818350382273705e-05, + "loss": 0.4871, + "step": 5508 + }, + { + "epoch": 0.2755, + "grad_norm": 5.492305278778076, + "learning_rate": 1.8181497174250236e-05, + "loss": 0.8642, + "step": 5510 + }, + { + "epoch": 0.2756, + "grad_norm": 5.851364612579346, + "learning_rate": 1.8179489528872808e-05, + "loss": 1.3353, + "step": 5512 + }, + { + "epoch": 0.2757, + "grad_norm": 0.08833213150501251, + "learning_rate": 1.817748088684939e-05, + "loss": 0.6954, + "step": 5514 + }, + { + "epoch": 0.2758, + "grad_norm": 4.140590667724609, + "learning_rate": 1.817547124842473e-05, + "loss": 1.7325, + "step": 5516 + }, + { + "epoch": 0.2759, + "grad_norm": 9.513383865356445, + "learning_rate": 1.81734606138437e-05, + "loss": 0.6485, + "step": 5518 + }, + { + "epoch": 0.276, + "grad_norm": 9.749388694763184, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.6567, + "step": 5520 + }, + { + "epoch": 0.2761, + "grad_norm": 12.969032287597656, + "learning_rate": 1.8169436357192602e-05, + "loss": 0.7376, + "step": 5522 + }, + { + "epoch": 0.2762, + "grad_norm": 3.033860921859741, + "learning_rate": 1.8167422735612877e-05, + "loss": 0.7296, + "step": 5524 + }, + { + "epoch": 0.2763, + "grad_norm": 2.7427029609680176, + "learning_rate": 1.8165408118857465e-05, + "loss": 1.1464, + "step": 5526 + }, + { + "epoch": 0.2764, + "grad_norm": 3.524777412414551, + "learning_rate": 1.816339250717184e-05, + "loss": 0.9594, + "step": 5528 + }, + { + "epoch": 0.2765, + "grad_norm": 13.789310455322266, + "learning_rate": 1.8161375900801603e-05, + "loss": 1.3498, + "step": 5530 + }, + { + "epoch": 0.2766, + "grad_norm": 3.27237868309021, + "learning_rate": 1.815935829999247e-05, + "loss": 0.7987, + "step": 5532 + }, + { + "epoch": 0.2767, + "grad_norm": 6.730371952056885, + "learning_rate": 1.8157339704990275e-05, + "loss": 1.2388, + "step": 5534 + }, + { + "epoch": 0.2768, + "grad_norm": 6.355111122131348, + "learning_rate": 1.8155320116040983e-05, + "loss": 1.6614, + "step": 5536 + }, + { + "epoch": 0.2769, + "grad_norm": 3.8604958057403564, + "learning_rate": 1.8153299533390672e-05, + "loss": 1.4737, + "step": 5538 + }, + { + "epoch": 0.277, + "grad_norm": 2.231224775314331, + "learning_rate": 1.815127795728554e-05, + "loss": 0.3887, + "step": 5540 + }, + { + "epoch": 0.2771, + "grad_norm": 3.756917953491211, + "learning_rate": 1.814925538797192e-05, + "loss": 0.8564, + "step": 5542 + }, + { + "epoch": 0.2772, + "grad_norm": 6.6450605392456055, + "learning_rate": 1.814723182569625e-05, + "loss": 0.7039, + "step": 5544 + }, + { + "epoch": 0.2773, + "grad_norm": 6.799836158752441, + "learning_rate": 1.8145207270705095e-05, + "loss": 1.5569, + "step": 5546 + }, + { + "epoch": 0.2774, + "grad_norm": 5.923506259918213, + "learning_rate": 1.814318172324514e-05, + "loss": 0.3803, + "step": 5548 + }, + { + "epoch": 0.2775, + "grad_norm": 6.5920209884643555, + "learning_rate": 1.8141155183563195e-05, + "loss": 0.7088, + "step": 5550 + }, + { + "epoch": 0.2776, + "grad_norm": 3.710768938064575, + "learning_rate": 1.8139127651906183e-05, + "loss": 0.7293, + "step": 5552 + }, + { + "epoch": 0.2777, + "grad_norm": 2.751868486404419, + "learning_rate": 1.8137099128521156e-05, + "loss": 1.1898, + "step": 5554 + }, + { + "epoch": 0.2778, + "grad_norm": 3.83080792427063, + "learning_rate": 1.813506961365528e-05, + "loss": 0.5229, + "step": 5556 + }, + { + "epoch": 0.2779, + "grad_norm": 3.2223446369171143, + "learning_rate": 1.813303910755585e-05, + "loss": 0.4654, + "step": 5558 + }, + { + "epoch": 0.278, + "grad_norm": 2.992215156555176, + "learning_rate": 1.8131007610470278e-05, + "loss": 0.7163, + "step": 5560 + }, + { + "epoch": 0.2781, + "grad_norm": 2.8727047443389893, + "learning_rate": 1.8128975122646092e-05, + "loss": 0.9083, + "step": 5562 + }, + { + "epoch": 0.2782, + "grad_norm": 6.551694393157959, + "learning_rate": 1.812694164433094e-05, + "loss": 1.0985, + "step": 5564 + }, + { + "epoch": 0.2783, + "grad_norm": 0.9391130805015564, + "learning_rate": 1.8124907175772604e-05, + "loss": 1.0036, + "step": 5566 + }, + { + "epoch": 0.2784, + "grad_norm": 5.52144718170166, + "learning_rate": 1.812287171721897e-05, + "loss": 0.9458, + "step": 5568 + }, + { + "epoch": 0.2785, + "grad_norm": 1.838525652885437, + "learning_rate": 1.8120835268918063e-05, + "loss": 0.6767, + "step": 5570 + }, + { + "epoch": 0.2786, + "grad_norm": 6.738411903381348, + "learning_rate": 1.811879783111801e-05, + "loss": 0.7962, + "step": 5572 + }, + { + "epoch": 0.2787, + "grad_norm": 5.9889607429504395, + "learning_rate": 1.8116759404067066e-05, + "loss": 0.1185, + "step": 5574 + }, + { + "epoch": 0.2788, + "grad_norm": 9.539549827575684, + "learning_rate": 1.8114719988013612e-05, + "loss": 0.7193, + "step": 5576 + }, + { + "epoch": 0.2789, + "grad_norm": 5.084212779998779, + "learning_rate": 1.8112679583206138e-05, + "loss": 0.7495, + "step": 5578 + }, + { + "epoch": 0.279, + "grad_norm": 5.651949405670166, + "learning_rate": 1.8110638189893267e-05, + "loss": 0.8866, + "step": 5580 + }, + { + "epoch": 0.2791, + "grad_norm": 3.3663339614868164, + "learning_rate": 1.8108595808323736e-05, + "loss": 0.9747, + "step": 5582 + }, + { + "epoch": 0.2792, + "grad_norm": 4.395166873931885, + "learning_rate": 1.81065524387464e-05, + "loss": 1.3709, + "step": 5584 + }, + { + "epoch": 0.2793, + "grad_norm": 4.818139553070068, + "learning_rate": 1.8104508081410242e-05, + "loss": 1.3307, + "step": 5586 + }, + { + "epoch": 0.2794, + "grad_norm": 8.03380298614502, + "learning_rate": 1.8102462736564355e-05, + "loss": 1.6009, + "step": 5588 + }, + { + "epoch": 0.2795, + "grad_norm": 3.4567837715148926, + "learning_rate": 1.8100416404457962e-05, + "loss": 1.1296, + "step": 5590 + }, + { + "epoch": 0.2796, + "grad_norm": 4.848606586456299, + "learning_rate": 1.80983690853404e-05, + "loss": 1.0949, + "step": 5592 + }, + { + "epoch": 0.2797, + "grad_norm": 6.359392166137695, + "learning_rate": 1.809632077946113e-05, + "loss": 0.4157, + "step": 5594 + }, + { + "epoch": 0.2798, + "grad_norm": 4.163013935089111, + "learning_rate": 1.8094271487069733e-05, + "loss": 0.8083, + "step": 5596 + }, + { + "epoch": 0.2799, + "grad_norm": 1.8970526456832886, + "learning_rate": 1.809222120841591e-05, + "loss": 0.3397, + "step": 5598 + }, + { + "epoch": 0.28, + "grad_norm": 5.580599784851074, + "learning_rate": 1.8090169943749477e-05, + "loss": 1.204, + "step": 5600 + }, + { + "epoch": 0.2801, + "grad_norm": 3.147188425064087, + "learning_rate": 1.8088117693320374e-05, + "loss": 0.259, + "step": 5602 + }, + { + "epoch": 0.2802, + "grad_norm": 2.487596273422241, + "learning_rate": 1.8086064457378667e-05, + "loss": 1.1819, + "step": 5604 + }, + { + "epoch": 0.2803, + "grad_norm": 4.295044898986816, + "learning_rate": 1.8084010236174533e-05, + "loss": 1.0275, + "step": 5606 + }, + { + "epoch": 0.2804, + "grad_norm": 6.797756671905518, + "learning_rate": 1.8081955029958272e-05, + "loss": 1.1816, + "step": 5608 + }, + { + "epoch": 0.2805, + "grad_norm": 8.292386054992676, + "learning_rate": 1.8079898838980304e-05, + "loss": 1.2842, + "step": 5610 + }, + { + "epoch": 0.2806, + "grad_norm": 3.26979923248291, + "learning_rate": 1.8077841663491174e-05, + "loss": 1.3363, + "step": 5612 + }, + { + "epoch": 0.2807, + "grad_norm": 3.1397314071655273, + "learning_rate": 1.8075783503741543e-05, + "loss": 0.5694, + "step": 5614 + }, + { + "epoch": 0.2808, + "grad_norm": 1.1218055486679077, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.4851, + "step": 5616 + }, + { + "epoch": 0.2809, + "grad_norm": 4.563277721405029, + "learning_rate": 1.8071664232464005e-05, + "loss": 0.3068, + "step": 5618 + }, + { + "epoch": 0.281, + "grad_norm": 4.597377777099609, + "learning_rate": 1.806960312143802e-05, + "loss": 0.5301, + "step": 5620 + }, + { + "epoch": 0.2811, + "grad_norm": 4.784342288970947, + "learning_rate": 1.8067541027155376e-05, + "loss": 1.1144, + "step": 5622 + }, + { + "epoch": 0.2812, + "grad_norm": 4.637925624847412, + "learning_rate": 1.8065477949867327e-05, + "loss": 0.6618, + "step": 5624 + }, + { + "epoch": 0.2813, + "grad_norm": 2.698469877243042, + "learning_rate": 1.8063413889825254e-05, + "loss": 1.3926, + "step": 5626 + }, + { + "epoch": 0.2814, + "grad_norm": 5.617527484893799, + "learning_rate": 1.806134884728066e-05, + "loss": 1.3616, + "step": 5628 + }, + { + "epoch": 0.2815, + "grad_norm": 4.946309566497803, + "learning_rate": 1.805928282248516e-05, + "loss": 0.7588, + "step": 5630 + }, + { + "epoch": 0.2816, + "grad_norm": 10.59417724609375, + "learning_rate": 1.8057215815690494e-05, + "loss": 1.2335, + "step": 5632 + }, + { + "epoch": 0.2817, + "grad_norm": 2.688074827194214, + "learning_rate": 1.805514782714852e-05, + "loss": 0.9466, + "step": 5634 + }, + { + "epoch": 0.2818, + "grad_norm": 7.416165351867676, + "learning_rate": 1.8053078857111218e-05, + "loss": 1.2693, + "step": 5636 + }, + { + "epoch": 0.2819, + "grad_norm": 12.831111907958984, + "learning_rate": 1.805100890583069e-05, + "loss": 1.5075, + "step": 5638 + }, + { + "epoch": 0.282, + "grad_norm": 3.8207714557647705, + "learning_rate": 1.804893797355914e-05, + "loss": 1.7137, + "step": 5640 + }, + { + "epoch": 0.2821, + "grad_norm": 2.1981141567230225, + "learning_rate": 1.804686606054892e-05, + "loss": 0.6119, + "step": 5642 + }, + { + "epoch": 0.2822, + "grad_norm": 4.761033535003662, + "learning_rate": 1.8044793167052476e-05, + "loss": 0.8391, + "step": 5644 + }, + { + "epoch": 0.2823, + "grad_norm": 2.5771429538726807, + "learning_rate": 1.8042719293322388e-05, + "loss": 0.6666, + "step": 5646 + }, + { + "epoch": 0.2824, + "grad_norm": 4.199549198150635, + "learning_rate": 1.8040644439611348e-05, + "loss": 0.5388, + "step": 5648 + }, + { + "epoch": 0.2825, + "grad_norm": 5.374485969543457, + "learning_rate": 1.8038568606172172e-05, + "loss": 2.1716, + "step": 5650 + }, + { + "epoch": 0.2826, + "grad_norm": 4.377547264099121, + "learning_rate": 1.80364917932578e-05, + "loss": 1.3692, + "step": 5652 + }, + { + "epoch": 0.2827, + "grad_norm": 16.867521286010742, + "learning_rate": 1.8034414001121278e-05, + "loss": 0.9395, + "step": 5654 + }, + { + "epoch": 0.2828, + "grad_norm": 3.223914384841919, + "learning_rate": 1.803233523001578e-05, + "loss": 2.7146, + "step": 5656 + }, + { + "epoch": 0.2829, + "grad_norm": 5.667184352874756, + "learning_rate": 1.8030255480194602e-05, + "loss": 0.8114, + "step": 5658 + }, + { + "epoch": 0.283, + "grad_norm": 3.8983049392700195, + "learning_rate": 1.8028174751911147e-05, + "loss": 0.4261, + "step": 5660 + }, + { + "epoch": 0.2831, + "grad_norm": 4.672042369842529, + "learning_rate": 1.8026093045418955e-05, + "loss": 0.9522, + "step": 5662 + }, + { + "epoch": 0.2832, + "grad_norm": 1.8714582920074463, + "learning_rate": 1.802401036097167e-05, + "loss": 0.5976, + "step": 5664 + }, + { + "epoch": 0.2833, + "grad_norm": 2.8935842514038086, + "learning_rate": 1.8021926698823058e-05, + "loss": 0.5616, + "step": 5666 + }, + { + "epoch": 0.2834, + "grad_norm": 10.683247566223145, + "learning_rate": 1.801984205922701e-05, + "loss": 0.6621, + "step": 5668 + }, + { + "epoch": 0.2835, + "grad_norm": 7.923436164855957, + "learning_rate": 1.801775644243754e-05, + "loss": 1.2958, + "step": 5670 + }, + { + "epoch": 0.2836, + "grad_norm": 3.6616549491882324, + "learning_rate": 1.8015669848708768e-05, + "loss": 0.734, + "step": 5672 + }, + { + "epoch": 0.2837, + "grad_norm": 3.597929000854492, + "learning_rate": 1.8013582278294935e-05, + "loss": 0.6995, + "step": 5674 + }, + { + "epoch": 0.2838, + "grad_norm": 3.7195162773132324, + "learning_rate": 1.8011493731450412e-05, + "loss": 0.6691, + "step": 5676 + }, + { + "epoch": 0.2839, + "grad_norm": 20.398273468017578, + "learning_rate": 1.800940420842968e-05, + "loss": 1.9356, + "step": 5678 + }, + { + "epoch": 0.284, + "grad_norm": 2.27767014503479, + "learning_rate": 1.8007313709487334e-05, + "loss": 1.1089, + "step": 5680 + }, + { + "epoch": 0.2841, + "grad_norm": 2.0233535766601562, + "learning_rate": 1.8005222234878108e-05, + "loss": 0.8963, + "step": 5682 + }, + { + "epoch": 0.2842, + "grad_norm": 5.669861793518066, + "learning_rate": 1.8003129784856832e-05, + "loss": 0.894, + "step": 5684 + }, + { + "epoch": 0.2843, + "grad_norm": 2.587092876434326, + "learning_rate": 1.800103635967847e-05, + "loss": 1.0477, + "step": 5686 + }, + { + "epoch": 0.2844, + "grad_norm": 5.502596378326416, + "learning_rate": 1.7998941959598097e-05, + "loss": 0.6369, + "step": 5688 + }, + { + "epoch": 0.2845, + "grad_norm": 7.252860069274902, + "learning_rate": 1.799684658487091e-05, + "loss": 2.1052, + "step": 5690 + }, + { + "epoch": 0.2846, + "grad_norm": 4.26474142074585, + "learning_rate": 1.799475023575222e-05, + "loss": 0.8014, + "step": 5692 + }, + { + "epoch": 0.2847, + "grad_norm": 3.662581205368042, + "learning_rate": 1.7992652912497464e-05, + "loss": 0.8689, + "step": 5694 + }, + { + "epoch": 0.2848, + "grad_norm": 9.885845184326172, + "learning_rate": 1.79905546153622e-05, + "loss": 2.0209, + "step": 5696 + }, + { + "epoch": 0.2849, + "grad_norm": 6.681225776672363, + "learning_rate": 1.7988455344602093e-05, + "loss": 1.505, + "step": 5698 + }, + { + "epoch": 0.285, + "grad_norm": 8.777656555175781, + "learning_rate": 1.798635510047293e-05, + "loss": 1.9413, + "step": 5700 + }, + { + "epoch": 0.2851, + "grad_norm": 3.418020009994507, + "learning_rate": 1.7984253883230627e-05, + "loss": 0.9533, + "step": 5702 + }, + { + "epoch": 0.2852, + "grad_norm": 3.2113640308380127, + "learning_rate": 1.7982151693131206e-05, + "loss": 1.2924, + "step": 5704 + }, + { + "epoch": 0.2853, + "grad_norm": 8.054898262023926, + "learning_rate": 1.798004853043081e-05, + "loss": 0.9602, + "step": 5706 + }, + { + "epoch": 0.2854, + "grad_norm": 4.307859420776367, + "learning_rate": 1.7977944395385713e-05, + "loss": 1.4073, + "step": 5708 + }, + { + "epoch": 0.2855, + "grad_norm": 1.0132919549942017, + "learning_rate": 1.797583928825229e-05, + "loss": 0.0991, + "step": 5710 + }, + { + "epoch": 0.2856, + "grad_norm": 1.6712323427200317, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.5831, + "step": 5712 + }, + { + "epoch": 0.2857, + "grad_norm": 5.745151519775391, + "learning_rate": 1.7971626158746585e-05, + "loss": 1.0301, + "step": 5714 + }, + { + "epoch": 0.2858, + "grad_norm": 2.8837971687316895, + "learning_rate": 1.7969518136887664e-05, + "loss": 1.6808, + "step": 5716 + }, + { + "epoch": 0.2859, + "grad_norm": 6.795064926147461, + "learning_rate": 1.7967409143967135e-05, + "loss": 0.8634, + "step": 5718 + }, + { + "epoch": 0.286, + "grad_norm": 4.444954872131348, + "learning_rate": 1.7965299180241963e-05, + "loss": 0.938, + "step": 5720 + }, + { + "epoch": 0.2861, + "grad_norm": 2.122982978820801, + "learning_rate": 1.7963188245969255e-05, + "loss": 0.7246, + "step": 5722 + }, + { + "epoch": 0.2862, + "grad_norm": 3.406773328781128, + "learning_rate": 1.796107634140621e-05, + "loss": 1.2301, + "step": 5724 + }, + { + "epoch": 0.2863, + "grad_norm": 18.12196159362793, + "learning_rate": 1.795896346681016e-05, + "loss": 1.4045, + "step": 5726 + }, + { + "epoch": 0.2864, + "grad_norm": 6.424037456512451, + "learning_rate": 1.7956849622438554e-05, + "loss": 1.4766, + "step": 5728 + }, + { + "epoch": 0.2865, + "grad_norm": 4.3992838859558105, + "learning_rate": 1.795473480854896e-05, + "loss": 0.7534, + "step": 5730 + }, + { + "epoch": 0.2866, + "grad_norm": 6.637948036193848, + "learning_rate": 1.795261902539906e-05, + "loss": 1.5148, + "step": 5732 + }, + { + "epoch": 0.2867, + "grad_norm": 4.414114475250244, + "learning_rate": 1.795050227324665e-05, + "loss": 1.5343, + "step": 5734 + }, + { + "epoch": 0.2868, + "grad_norm": 3.944383382797241, + "learning_rate": 1.794838455234966e-05, + "loss": 1.008, + "step": 5736 + }, + { + "epoch": 0.2869, + "grad_norm": 1.2037657499313354, + "learning_rate": 1.7946265862966114e-05, + "loss": 0.5229, + "step": 5738 + }, + { + "epoch": 0.287, + "grad_norm": 2.5317962169647217, + "learning_rate": 1.7944146205354182e-05, + "loss": 0.9602, + "step": 5740 + }, + { + "epoch": 0.2871, + "grad_norm": 3.0666213035583496, + "learning_rate": 1.794202557977213e-05, + "loss": 0.7787, + "step": 5742 + }, + { + "epoch": 0.2872, + "grad_norm": 6.0926923751831055, + "learning_rate": 1.7939903986478354e-05, + "loss": 1.5378, + "step": 5744 + }, + { + "epoch": 0.2873, + "grad_norm": 1.8563010692596436, + "learning_rate": 1.793778142573136e-05, + "loss": 1.5585, + "step": 5746 + }, + { + "epoch": 0.2874, + "grad_norm": 8.783059120178223, + "learning_rate": 1.793565789778978e-05, + "loss": 1.671, + "step": 5748 + }, + { + "epoch": 0.2875, + "grad_norm": 2.0095162391662598, + "learning_rate": 1.7933533402912354e-05, + "loss": 1.2567, + "step": 5750 + }, + { + "epoch": 0.2876, + "grad_norm": 8.833354949951172, + "learning_rate": 1.793140794135795e-05, + "loss": 1.7043, + "step": 5752 + }, + { + "epoch": 0.2877, + "grad_norm": 3.2648355960845947, + "learning_rate": 1.792928151338554e-05, + "loss": 1.1062, + "step": 5754 + }, + { + "epoch": 0.2878, + "grad_norm": 3.9661378860473633, + "learning_rate": 1.7927154119254234e-05, + "loss": 0.7891, + "step": 5756 + }, + { + "epoch": 0.2879, + "grad_norm": 6.672330856323242, + "learning_rate": 1.7925025759223248e-05, + "loss": 0.4329, + "step": 5758 + }, + { + "epoch": 0.288, + "grad_norm": 3.875537872314453, + "learning_rate": 1.792289643355191e-05, + "loss": 1.1465, + "step": 5760 + }, + { + "epoch": 0.2881, + "grad_norm": 2.6105103492736816, + "learning_rate": 1.7920766142499673e-05, + "loss": 0.7623, + "step": 5762 + }, + { + "epoch": 0.2882, + "grad_norm": 2.422969341278076, + "learning_rate": 1.791863488632611e-05, + "loss": 1.0587, + "step": 5764 + }, + { + "epoch": 0.2883, + "grad_norm": 5.21729850769043, + "learning_rate": 1.79165026652909e-05, + "loss": 1.3678, + "step": 5766 + }, + { + "epoch": 0.2884, + "grad_norm": 2.267012119293213, + "learning_rate": 1.7914369479653858e-05, + "loss": 1.3395, + "step": 5768 + }, + { + "epoch": 0.2885, + "grad_norm": 4.8983917236328125, + "learning_rate": 1.7912235329674903e-05, + "loss": 1.1901, + "step": 5770 + }, + { + "epoch": 0.2886, + "grad_norm": 3.8480615615844727, + "learning_rate": 1.791010021561407e-05, + "loss": 0.9777, + "step": 5772 + }, + { + "epoch": 0.2887, + "grad_norm": 1.9276090860366821, + "learning_rate": 1.790796413773152e-05, + "loss": 0.5735, + "step": 5774 + }, + { + "epoch": 0.2888, + "grad_norm": 5.038315773010254, + "learning_rate": 1.7905827096287532e-05, + "loss": 1.4941, + "step": 5776 + }, + { + "epoch": 0.2889, + "grad_norm": 3.5516533851623535, + "learning_rate": 1.790368909154249e-05, + "loss": 0.7201, + "step": 5778 + }, + { + "epoch": 0.289, + "grad_norm": 3.6032676696777344, + "learning_rate": 1.7901550123756906e-05, + "loss": 1.0334, + "step": 5780 + }, + { + "epoch": 0.2891, + "grad_norm": 4.221475124359131, + "learning_rate": 1.7899410193191408e-05, + "loss": 1.2647, + "step": 5782 + }, + { + "epoch": 0.2892, + "grad_norm": 11.793682098388672, + "learning_rate": 1.789726930010674e-05, + "loss": 1.7934, + "step": 5784 + }, + { + "epoch": 0.2893, + "grad_norm": 7.8429179191589355, + "learning_rate": 1.789512744476376e-05, + "loss": 1.3075, + "step": 5786 + }, + { + "epoch": 0.2894, + "grad_norm": 3.2671308517456055, + "learning_rate": 1.789298462742345e-05, + "loss": 1.607, + "step": 5788 + }, + { + "epoch": 0.2895, + "grad_norm": 2.0008039474487305, + "learning_rate": 1.789084084834691e-05, + "loss": 0.8195, + "step": 5790 + }, + { + "epoch": 0.2896, + "grad_norm": 2.567842721939087, + "learning_rate": 1.7888696107795343e-05, + "loss": 1.0952, + "step": 5792 + }, + { + "epoch": 0.2897, + "grad_norm": 9.272130012512207, + "learning_rate": 1.7886550406030084e-05, + "loss": 1.2121, + "step": 5794 + }, + { + "epoch": 0.2898, + "grad_norm": 5.723259925842285, + "learning_rate": 1.7884403743312583e-05, + "loss": 1.3008, + "step": 5796 + }, + { + "epoch": 0.2899, + "grad_norm": 2.7125203609466553, + "learning_rate": 1.78822561199044e-05, + "loss": 0.8577, + "step": 5798 + }, + { + "epoch": 0.29, + "grad_norm": 3.3606042861938477, + "learning_rate": 1.788010753606722e-05, + "loss": 1.38, + "step": 5800 + }, + { + "epoch": 0.2901, + "grad_norm": 4.4447855949401855, + "learning_rate": 1.787795799206284e-05, + "loss": 2.431, + "step": 5802 + }, + { + "epoch": 0.2902, + "grad_norm": 4.203016757965088, + "learning_rate": 1.7875807488153173e-05, + "loss": 0.5141, + "step": 5804 + }, + { + "epoch": 0.2903, + "grad_norm": 2.9073803424835205, + "learning_rate": 1.7873656024600254e-05, + "loss": 1.0608, + "step": 5806 + }, + { + "epoch": 0.2904, + "grad_norm": 3.4756391048431396, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.6066, + "step": 5808 + }, + { + "epoch": 0.2905, + "grad_norm": 8.956949234008789, + "learning_rate": 1.7869350219613375e-05, + "loss": 0.9122, + "step": 5810 + }, + { + "epoch": 0.2906, + "grad_norm": 2.96207857131958, + "learning_rate": 1.7867195878704062e-05, + "loss": 1.3625, + "step": 5812 + }, + { + "epoch": 0.2907, + "grad_norm": 2.708817958831787, + "learning_rate": 1.7865040579200793e-05, + "loss": 1.1436, + "step": 5814 + }, + { + "epoch": 0.2908, + "grad_norm": 7.291810035705566, + "learning_rate": 1.786288432136619e-05, + "loss": 1.1972, + "step": 5816 + }, + { + "epoch": 0.2909, + "grad_norm": 6.009407043457031, + "learning_rate": 1.7860727105462982e-05, + "loss": 1.3049, + "step": 5818 + }, + { + "epoch": 0.291, + "grad_norm": 4.043642997741699, + "learning_rate": 1.785856893175402e-05, + "loss": 0.6663, + "step": 5820 + }, + { + "epoch": 0.2911, + "grad_norm": 17.814199447631836, + "learning_rate": 1.7856409800502272e-05, + "loss": 1.982, + "step": 5822 + }, + { + "epoch": 0.2912, + "grad_norm": 11.327033042907715, + "learning_rate": 1.785424971197082e-05, + "loss": 0.7758, + "step": 5824 + }, + { + "epoch": 0.2913, + "grad_norm": 3.8672754764556885, + "learning_rate": 1.7852088666422865e-05, + "loss": 0.9849, + "step": 5826 + }, + { + "epoch": 0.2914, + "grad_norm": 1.9744263887405396, + "learning_rate": 1.7849926664121726e-05, + "loss": 1.1378, + "step": 5828 + }, + { + "epoch": 0.2915, + "grad_norm": 4.872450351715088, + "learning_rate": 1.784776370533083e-05, + "loss": 1.2346, + "step": 5830 + }, + { + "epoch": 0.2916, + "grad_norm": 4.783740043640137, + "learning_rate": 1.7845599790313735e-05, + "loss": 1.2663, + "step": 5832 + }, + { + "epoch": 0.2917, + "grad_norm": 8.352570533752441, + "learning_rate": 1.7843434919334103e-05, + "loss": 1.6299, + "step": 5834 + }, + { + "epoch": 0.2918, + "grad_norm": 4.493703842163086, + "learning_rate": 1.7841269092655714e-05, + "loss": 1.4401, + "step": 5836 + }, + { + "epoch": 0.2919, + "grad_norm": 1.8875001668930054, + "learning_rate": 1.7839102310542477e-05, + "loss": 0.8558, + "step": 5838 + }, + { + "epoch": 0.292, + "grad_norm": 3.389873743057251, + "learning_rate": 1.78369345732584e-05, + "loss": 1.2842, + "step": 5840 + }, + { + "epoch": 0.2921, + "grad_norm": 2.4556446075439453, + "learning_rate": 1.7834765881067617e-05, + "loss": 1.3071, + "step": 5842 + }, + { + "epoch": 0.2922, + "grad_norm": 2.7654898166656494, + "learning_rate": 1.7832596234234376e-05, + "loss": 1.4236, + "step": 5844 + }, + { + "epoch": 0.2923, + "grad_norm": 2.3594207763671875, + "learning_rate": 1.7830425633023042e-05, + "loss": 0.3214, + "step": 5846 + }, + { + "epoch": 0.2924, + "grad_norm": 6.723749160766602, + "learning_rate": 1.78282540776981e-05, + "loss": 1.3011, + "step": 5848 + }, + { + "epoch": 0.2925, + "grad_norm": 3.3688576221466064, + "learning_rate": 1.782608156852414e-05, + "loss": 1.6961, + "step": 5850 + }, + { + "epoch": 0.2926, + "grad_norm": 3.9943923950195312, + "learning_rate": 1.7823908105765883e-05, + "loss": 0.7104, + "step": 5852 + }, + { + "epoch": 0.2927, + "grad_norm": 3.5762267112731934, + "learning_rate": 1.7821733689688154e-05, + "loss": 0.6924, + "step": 5854 + }, + { + "epoch": 0.2928, + "grad_norm": 4.778193950653076, + "learning_rate": 1.7819558320555902e-05, + "loss": 1.2419, + "step": 5856 + }, + { + "epoch": 0.2929, + "grad_norm": 3.8608970642089844, + "learning_rate": 1.7817381998634187e-05, + "loss": 0.8565, + "step": 5858 + }, + { + "epoch": 0.293, + "grad_norm": 1.639905333518982, + "learning_rate": 1.781520472418819e-05, + "loss": 0.3006, + "step": 5860 + }, + { + "epoch": 0.2931, + "grad_norm": 9.929722785949707, + "learning_rate": 1.78130264974832e-05, + "loss": 1.2515, + "step": 5862 + }, + { + "epoch": 0.2932, + "grad_norm": 3.1671059131622314, + "learning_rate": 1.7810847318784632e-05, + "loss": 1.1232, + "step": 5864 + }, + { + "epoch": 0.2933, + "grad_norm": 6.952856540679932, + "learning_rate": 1.7808667188358014e-05, + "loss": 1.3829, + "step": 5866 + }, + { + "epoch": 0.2934, + "grad_norm": 3.57110857963562, + "learning_rate": 1.7806486106468983e-05, + "loss": 0.6008, + "step": 5868 + }, + { + "epoch": 0.2935, + "grad_norm": 2.309749126434326, + "learning_rate": 1.7804304073383298e-05, + "loss": 0.7034, + "step": 5870 + }, + { + "epoch": 0.2936, + "grad_norm": 4.459983825683594, + "learning_rate": 1.780212108936684e-05, + "loss": 1.2332, + "step": 5872 + }, + { + "epoch": 0.2937, + "grad_norm": 2.400913715362549, + "learning_rate": 1.7799937154685587e-05, + "loss": 1.4163, + "step": 5874 + }, + { + "epoch": 0.2938, + "grad_norm": 2.8357393741607666, + "learning_rate": 1.7797752269605654e-05, + "loss": 0.7995, + "step": 5876 + }, + { + "epoch": 0.2939, + "grad_norm": 5.045809745788574, + "learning_rate": 1.7795566434393257e-05, + "loss": 1.4809, + "step": 5878 + }, + { + "epoch": 0.294, + "grad_norm": 3.345515489578247, + "learning_rate": 1.7793379649314743e-05, + "loss": 0.8645, + "step": 5880 + }, + { + "epoch": 0.2941, + "grad_norm": 2.5569536685943604, + "learning_rate": 1.7791191914636553e-05, + "loss": 1.0022, + "step": 5882 + }, + { + "epoch": 0.2942, + "grad_norm": 9.225686073303223, + "learning_rate": 1.7789003230625266e-05, + "loss": 1.2556, + "step": 5884 + }, + { + "epoch": 0.2943, + "grad_norm": 5.830507755279541, + "learning_rate": 1.778681359754756e-05, + "loss": 1.2178, + "step": 5886 + }, + { + "epoch": 0.2944, + "grad_norm": 4.380746841430664, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.9222, + "step": 5888 + }, + { + "epoch": 0.2945, + "grad_norm": 4.869149208068848, + "learning_rate": 1.778243148526021e-05, + "loss": 0.9694, + "step": 5890 + }, + { + "epoch": 0.2946, + "grad_norm": 15.86425495147705, + "learning_rate": 1.7780239006584515e-05, + "loss": 1.983, + "step": 5892 + }, + { + "epoch": 0.2947, + "grad_norm": 3.595385789871216, + "learning_rate": 1.77780455799103e-05, + "loss": 1.1246, + "step": 5894 + }, + { + "epoch": 0.2948, + "grad_norm": 2.5761802196502686, + "learning_rate": 1.7775851205504823e-05, + "loss": 1.311, + "step": 5896 + }, + { + "epoch": 0.2949, + "grad_norm": 3.827997922897339, + "learning_rate": 1.7773655883635463e-05, + "loss": 1.0925, + "step": 5898 + }, + { + "epoch": 0.295, + "grad_norm": 1.3625086545944214, + "learning_rate": 1.777145961456971e-05, + "loss": 0.6622, + "step": 5900 + }, + { + "epoch": 0.2951, + "grad_norm": 7.003231048583984, + "learning_rate": 1.776926239857518e-05, + "loss": 3.1165, + "step": 5902 + }, + { + "epoch": 0.2952, + "grad_norm": 2.4787869453430176, + "learning_rate": 1.7767064235919594e-05, + "loss": 0.8669, + "step": 5904 + }, + { + "epoch": 0.2953, + "grad_norm": 14.51568603515625, + "learning_rate": 1.7764865126870788e-05, + "loss": 1.1454, + "step": 5906 + }, + { + "epoch": 0.2954, + "grad_norm": 6.608335494995117, + "learning_rate": 1.776266507169672e-05, + "loss": 0.718, + "step": 5908 + }, + { + "epoch": 0.2955, + "grad_norm": 6.729161262512207, + "learning_rate": 1.776046407066546e-05, + "loss": 1.054, + "step": 5910 + }, + { + "epoch": 0.2956, + "grad_norm": 2.428781509399414, + "learning_rate": 1.7758262124045195e-05, + "loss": 0.9645, + "step": 5912 + }, + { + "epoch": 0.2957, + "grad_norm": 2.95548152923584, + "learning_rate": 1.775605923210422e-05, + "loss": 0.423, + "step": 5914 + }, + { + "epoch": 0.2958, + "grad_norm": 3.2156903743743896, + "learning_rate": 1.775385539511096e-05, + "loss": 0.9455, + "step": 5916 + }, + { + "epoch": 0.2959, + "grad_norm": 2.5387721061706543, + "learning_rate": 1.7751650613333936e-05, + "loss": 0.7289, + "step": 5918 + }, + { + "epoch": 0.296, + "grad_norm": 15.134723663330078, + "learning_rate": 1.7749444887041797e-05, + "loss": 0.9856, + "step": 5920 + }, + { + "epoch": 0.2961, + "grad_norm": 4.544389247894287, + "learning_rate": 1.7747238216503308e-05, + "loss": 0.749, + "step": 5922 + }, + { + "epoch": 0.2962, + "grad_norm": 4.377408981323242, + "learning_rate": 1.7745030601987338e-05, + "loss": 0.922, + "step": 5924 + }, + { + "epoch": 0.2963, + "grad_norm": 4.22792387008667, + "learning_rate": 1.7742822043762888e-05, + "loss": 1.0101, + "step": 5926 + }, + { + "epoch": 0.2964, + "grad_norm": 7.208223342895508, + "learning_rate": 1.7740612542099054e-05, + "loss": 0.9794, + "step": 5928 + }, + { + "epoch": 0.2965, + "grad_norm": 6.5307793617248535, + "learning_rate": 1.7738402097265063e-05, + "loss": 0.8069, + "step": 5930 + }, + { + "epoch": 0.2966, + "grad_norm": 0.9253823161125183, + "learning_rate": 1.773619070953025e-05, + "loss": 0.6993, + "step": 5932 + }, + { + "epoch": 0.2967, + "grad_norm": 5.139352798461914, + "learning_rate": 1.7733978379164066e-05, + "loss": 0.6702, + "step": 5934 + }, + { + "epoch": 0.2968, + "grad_norm": 2.9697775840759277, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.2105, + "step": 5936 + }, + { + "epoch": 0.2969, + "grad_norm": 4.801803112030029, + "learning_rate": 1.7729550891615958e-05, + "loss": 1.2987, + "step": 5938 + }, + { + "epoch": 0.297, + "grad_norm": 6.202559947967529, + "learning_rate": 1.7727335734973512e-05, + "loss": 0.8833, + "step": 5940 + }, + { + "epoch": 0.2971, + "grad_norm": 1.8538072109222412, + "learning_rate": 1.7725119636778644e-05, + "loss": 0.5761, + "step": 5942 + }, + { + "epoch": 0.2972, + "grad_norm": 6.960965633392334, + "learning_rate": 1.7722902597301385e-05, + "loss": 1.4153, + "step": 5944 + }, + { + "epoch": 0.2973, + "grad_norm": 3.0340051651000977, + "learning_rate": 1.7720684616811866e-05, + "loss": 1.0033, + "step": 5946 + }, + { + "epoch": 0.2974, + "grad_norm": 4.45806360244751, + "learning_rate": 1.771846569558035e-05, + "loss": 1.1784, + "step": 5948 + }, + { + "epoch": 0.2975, + "grad_norm": 3.2923290729522705, + "learning_rate": 1.7716245833877202e-05, + "loss": 0.6072, + "step": 5950 + }, + { + "epoch": 0.2976, + "grad_norm": 6.095458507537842, + "learning_rate": 1.7714025031972904e-05, + "loss": 1.143, + "step": 5952 + }, + { + "epoch": 0.2977, + "grad_norm": 3.94313383102417, + "learning_rate": 1.771180329013805e-05, + "loss": 0.9186, + "step": 5954 + }, + { + "epoch": 0.2978, + "grad_norm": 12.842588424682617, + "learning_rate": 1.7709580608643364e-05, + "loss": 1.934, + "step": 5956 + }, + { + "epoch": 0.2979, + "grad_norm": 4.732911109924316, + "learning_rate": 1.7707356987759664e-05, + "loss": 0.896, + "step": 5958 + }, + { + "epoch": 0.298, + "grad_norm": 2.557979106903076, + "learning_rate": 1.7705132427757895e-05, + "loss": 1.0624, + "step": 5960 + }, + { + "epoch": 0.2981, + "grad_norm": 3.1975839138031006, + "learning_rate": 1.770290692890911e-05, + "loss": 1.2438, + "step": 5962 + }, + { + "epoch": 0.2982, + "grad_norm": 4.6932454109191895, + "learning_rate": 1.770068049148448e-05, + "loss": 0.9774, + "step": 5964 + }, + { + "epoch": 0.2983, + "grad_norm": 6.300012588500977, + "learning_rate": 1.7698453115755294e-05, + "loss": 0.7122, + "step": 5966 + }, + { + "epoch": 0.2984, + "grad_norm": 7.930154323577881, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.8321, + "step": 5968 + }, + { + "epoch": 0.2985, + "grad_norm": 6.356223106384277, + "learning_rate": 1.7693995550468952e-05, + "loss": 1.1691, + "step": 5970 + }, + { + "epoch": 0.2986, + "grad_norm": 7.255092620849609, + "learning_rate": 1.769176536145494e-05, + "loss": 1.7973, + "step": 5972 + }, + { + "epoch": 0.2987, + "grad_norm": 2.5332753658294678, + "learning_rate": 1.768953423522265e-05, + "loss": 0.7094, + "step": 5974 + }, + { + "epoch": 0.2988, + "grad_norm": 3.8375394344329834, + "learning_rate": 1.7687302172043933e-05, + "loss": 0.9931, + "step": 5976 + }, + { + "epoch": 0.2989, + "grad_norm": 4.832344055175781, + "learning_rate": 1.7685069172190766e-05, + "loss": 0.9674, + "step": 5978 + }, + { + "epoch": 0.299, + "grad_norm": 5.535735130310059, + "learning_rate": 1.7682835235935236e-05, + "loss": 0.4845, + "step": 5980 + }, + { + "epoch": 0.2991, + "grad_norm": 3.5665841102600098, + "learning_rate": 1.7680600363549534e-05, + "loss": 0.6371, + "step": 5982 + }, + { + "epoch": 0.2992, + "grad_norm": 2.8370773792266846, + "learning_rate": 1.767836455530598e-05, + "loss": 0.3889, + "step": 5984 + }, + { + "epoch": 0.2993, + "grad_norm": 5.953402996063232, + "learning_rate": 1.767612781147699e-05, + "loss": 1.0774, + "step": 5986 + }, + { + "epoch": 0.2994, + "grad_norm": 6.668912887573242, + "learning_rate": 1.767389013233511e-05, + "loss": 1.2189, + "step": 5988 + }, + { + "epoch": 0.2995, + "grad_norm": 2.678603410720825, + "learning_rate": 1.7671651518153e-05, + "loss": 2.0183, + "step": 5990 + }, + { + "epoch": 0.2996, + "grad_norm": 4.9709577560424805, + "learning_rate": 1.7669411969203417e-05, + "loss": 1.0435, + "step": 5992 + }, + { + "epoch": 0.2997, + "grad_norm": 6.217055797576904, + "learning_rate": 1.7667171485759253e-05, + "loss": 1.0542, + "step": 5994 + }, + { + "epoch": 0.2998, + "grad_norm": 2.0726773738861084, + "learning_rate": 1.76649300680935e-05, + "loss": 1.2074, + "step": 5996 + }, + { + "epoch": 0.2999, + "grad_norm": 2.282909631729126, + "learning_rate": 1.766268771647927e-05, + "loss": 0.5805, + "step": 5998 + }, + { + "epoch": 0.3, + "grad_norm": 5.96638298034668, + "learning_rate": 1.766044443118978e-05, + "loss": 1.0763, + "step": 6000 + }, + { + "epoch": 0.3001, + "grad_norm": 3.325730800628662, + "learning_rate": 1.7658200212498377e-05, + "loss": 0.7273, + "step": 6002 + }, + { + "epoch": 0.3002, + "grad_norm": 3.009345054626465, + "learning_rate": 1.7655955060678508e-05, + "loss": 0.5421, + "step": 6004 + }, + { + "epoch": 0.3003, + "grad_norm": 4.3436279296875, + "learning_rate": 1.7653708976003738e-05, + "loss": 1.051, + "step": 6006 + }, + { + "epoch": 0.3004, + "grad_norm": 4.592540740966797, + "learning_rate": 1.7651461958747745e-05, + "loss": 1.0706, + "step": 6008 + }, + { + "epoch": 0.3005, + "grad_norm": 9.615518569946289, + "learning_rate": 1.7649214009184323e-05, + "loss": 0.6634, + "step": 6010 + }, + { + "epoch": 0.3006, + "grad_norm": 5.04714298248291, + "learning_rate": 1.7646965127587373e-05, + "loss": 1.0983, + "step": 6012 + }, + { + "epoch": 0.3007, + "grad_norm": 4.8038434982299805, + "learning_rate": 1.764471531423092e-05, + "loss": 1.1795, + "step": 6014 + }, + { + "epoch": 0.3008, + "grad_norm": 6.288747310638428, + "learning_rate": 1.764246456938909e-05, + "loss": 1.2317, + "step": 6016 + }, + { + "epoch": 0.3009, + "grad_norm": 0.7406571507453918, + "learning_rate": 1.7640212893336143e-05, + "loss": 0.1588, + "step": 6018 + }, + { + "epoch": 0.301, + "grad_norm": 11.599528312683105, + "learning_rate": 1.7637960286346423e-05, + "loss": 1.2915, + "step": 6020 + }, + { + "epoch": 0.3011, + "grad_norm": 13.300393104553223, + "learning_rate": 1.7635706748694415e-05, + "loss": 0.8291, + "step": 6022 + }, + { + "epoch": 0.3012, + "grad_norm": 3.77823543548584, + "learning_rate": 1.76334522806547e-05, + "loss": 1.1548, + "step": 6024 + }, + { + "epoch": 0.3013, + "grad_norm": 4.652889728546143, + "learning_rate": 1.7631196882501975e-05, + "loss": 1.1817, + "step": 6026 + }, + { + "epoch": 0.3014, + "grad_norm": 2.629896402359009, + "learning_rate": 1.7628940554511064e-05, + "loss": 0.6537, + "step": 6028 + }, + { + "epoch": 0.3015, + "grad_norm": 7.238277435302734, + "learning_rate": 1.7626683296956885e-05, + "loss": 1.1323, + "step": 6030 + }, + { + "epoch": 0.3016, + "grad_norm": 9.617650032043457, + "learning_rate": 1.762442511011448e-05, + "loss": 0.7917, + "step": 6032 + }, + { + "epoch": 0.3017, + "grad_norm": 2.6847431659698486, + "learning_rate": 1.7622165994259003e-05, + "loss": 1.3023, + "step": 6034 + }, + { + "epoch": 0.3018, + "grad_norm": 1.9260051250457764, + "learning_rate": 1.761990594966572e-05, + "loss": 0.5766, + "step": 6036 + }, + { + "epoch": 0.3019, + "grad_norm": 5.939206123352051, + "learning_rate": 1.761764497661001e-05, + "loss": 1.6178, + "step": 6038 + }, + { + "epoch": 0.302, + "grad_norm": 11.979190826416016, + "learning_rate": 1.761538307536737e-05, + "loss": 1.0708, + "step": 6040 + }, + { + "epoch": 0.3021, + "grad_norm": 4.47234582901001, + "learning_rate": 1.76131202462134e-05, + "loss": 1.3556, + "step": 6042 + }, + { + "epoch": 0.3022, + "grad_norm": 11.958373069763184, + "learning_rate": 1.761085648942382e-05, + "loss": 1.1249, + "step": 6044 + }, + { + "epoch": 0.3023, + "grad_norm": 8.742549896240234, + "learning_rate": 1.7608591805274465e-05, + "loss": 1.1379, + "step": 6046 + }, + { + "epoch": 0.3024, + "grad_norm": 7.4266438484191895, + "learning_rate": 1.7606326194041274e-05, + "loss": 1.4596, + "step": 6048 + }, + { + "epoch": 0.3025, + "grad_norm": 5.887491226196289, + "learning_rate": 1.7604059656000313e-05, + "loss": 0.8644, + "step": 6050 + }, + { + "epoch": 0.3026, + "grad_norm": 4.761075019836426, + "learning_rate": 1.760179219142774e-05, + "loss": 1.686, + "step": 6052 + }, + { + "epoch": 0.3027, + "grad_norm": 2.4432215690612793, + "learning_rate": 1.7599523800599857e-05, + "loss": 1.2125, + "step": 6054 + }, + { + "epoch": 0.3028, + "grad_norm": 2.41945219039917, + "learning_rate": 1.759725448379305e-05, + "loss": 0.7447, + "step": 6056 + }, + { + "epoch": 0.3029, + "grad_norm": 4.421761512756348, + "learning_rate": 1.7594984241283826e-05, + "loss": 1.0271, + "step": 6058 + }, + { + "epoch": 0.303, + "grad_norm": 3.735668182373047, + "learning_rate": 1.759271307334881e-05, + "loss": 0.5019, + "step": 6060 + }, + { + "epoch": 0.3031, + "grad_norm": 4.728739261627197, + "learning_rate": 1.7590440980264738e-05, + "loss": 1.1734, + "step": 6062 + }, + { + "epoch": 0.3032, + "grad_norm": 3.6704447269439697, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.6965, + "step": 6064 + }, + { + "epoch": 0.3033, + "grad_norm": 7.697761058807373, + "learning_rate": 1.7585894019756926e-05, + "loss": 1.4231, + "step": 6066 + }, + { + "epoch": 0.3034, + "grad_norm": 2.4905624389648438, + "learning_rate": 1.7583619152887222e-05, + "loss": 1.3655, + "step": 6068 + }, + { + "epoch": 0.3035, + "grad_norm": 12.315241813659668, + "learning_rate": 1.7581343361976523e-05, + "loss": 1.5309, + "step": 6070 + }, + { + "epoch": 0.3036, + "grad_norm": 3.9015891551971436, + "learning_rate": 1.7579066647302134e-05, + "loss": 0.2103, + "step": 6072 + }, + { + "epoch": 0.3037, + "grad_norm": 3.0862598419189453, + "learning_rate": 1.7576789009141465e-05, + "loss": 1.0439, + "step": 6074 + }, + { + "epoch": 0.3038, + "grad_norm": 3.5599777698516846, + "learning_rate": 1.757451044777204e-05, + "loss": 1.2209, + "step": 6076 + }, + { + "epoch": 0.3039, + "grad_norm": 5.506688594818115, + "learning_rate": 1.757223096347149e-05, + "loss": 0.9714, + "step": 6078 + }, + { + "epoch": 0.304, + "grad_norm": 2.7011444568634033, + "learning_rate": 1.7569950556517566e-05, + "loss": 0.57, + "step": 6080 + }, + { + "epoch": 0.3041, + "grad_norm": 6.910192966461182, + "learning_rate": 1.7567669227188128e-05, + "loss": 1.521, + "step": 6082 + }, + { + "epoch": 0.3042, + "grad_norm": 0.26622629165649414, + "learning_rate": 1.756538697576115e-05, + "loss": 1.2379, + "step": 6084 + }, + { + "epoch": 0.3043, + "grad_norm": 5.2772626876831055, + "learning_rate": 1.756310380251472e-05, + "loss": 0.6021, + "step": 6086 + }, + { + "epoch": 0.3044, + "grad_norm": 5.260009288787842, + "learning_rate": 1.7560819707727034e-05, + "loss": 1.0434, + "step": 6088 + }, + { + "epoch": 0.3045, + "grad_norm": 17.613876342773438, + "learning_rate": 1.7558534691676396e-05, + "loss": 1.359, + "step": 6090 + }, + { + "epoch": 0.3046, + "grad_norm": 2.6050002574920654, + "learning_rate": 1.7556248754641237e-05, + "loss": 1.1422, + "step": 6092 + }, + { + "epoch": 0.3047, + "grad_norm": 3.7164127826690674, + "learning_rate": 1.7553961896900086e-05, + "loss": 0.9178, + "step": 6094 + }, + { + "epoch": 0.3048, + "grad_norm": 13.857264518737793, + "learning_rate": 1.7551674118731592e-05, + "loss": 1.2315, + "step": 6096 + }, + { + "epoch": 0.3049, + "grad_norm": 4.92382287979126, + "learning_rate": 1.7549385420414514e-05, + "loss": 1.5439, + "step": 6098 + }, + { + "epoch": 0.305, + "grad_norm": 2.451744794845581, + "learning_rate": 1.7547095802227723e-05, + "loss": 1.32, + "step": 6100 + }, + { + "epoch": 0.3051, + "grad_norm": 3.580324411392212, + "learning_rate": 1.7544805264450197e-05, + "loss": 0.8028, + "step": 6102 + }, + { + "epoch": 0.3052, + "grad_norm": 4.613621711730957, + "learning_rate": 1.754251380736104e-05, + "loss": 1.1597, + "step": 6104 + }, + { + "epoch": 0.3053, + "grad_norm": 12.93421745300293, + "learning_rate": 1.754022143123945e-05, + "loss": 1.0813, + "step": 6106 + }, + { + "epoch": 0.3054, + "grad_norm": 5.536386013031006, + "learning_rate": 1.7537928136364756e-05, + "loss": 1.1575, + "step": 6108 + }, + { + "epoch": 0.3055, + "grad_norm": 5.828302383422852, + "learning_rate": 1.7535633923016382e-05, + "loss": 1.0161, + "step": 6110 + }, + { + "epoch": 0.3056, + "grad_norm": 5.2251973152160645, + "learning_rate": 1.7533338791473872e-05, + "loss": 1.1164, + "step": 6112 + }, + { + "epoch": 0.3057, + "grad_norm": 1.717422366142273, + "learning_rate": 1.7531042742016878e-05, + "loss": 0.8156, + "step": 6114 + }, + { + "epoch": 0.3058, + "grad_norm": 4.54273796081543, + "learning_rate": 1.7528745774925175e-05, + "loss": 1.2839, + "step": 6116 + }, + { + "epoch": 0.3059, + "grad_norm": 2.3370089530944824, + "learning_rate": 1.7526447890478633e-05, + "loss": 0.6963, + "step": 6118 + }, + { + "epoch": 0.306, + "grad_norm": 8.166106224060059, + "learning_rate": 1.7524149088957244e-05, + "loss": 1.3076, + "step": 6120 + }, + { + "epoch": 0.3061, + "grad_norm": 3.8375277519226074, + "learning_rate": 1.7521849370641116e-05, + "loss": 0.8979, + "step": 6122 + }, + { + "epoch": 0.3062, + "grad_norm": 2.7957048416137695, + "learning_rate": 1.7519548735810456e-05, + "loss": 0.9741, + "step": 6124 + }, + { + "epoch": 0.3063, + "grad_norm": 10.052007675170898, + "learning_rate": 1.7517247184745595e-05, + "loss": 1.1686, + "step": 6126 + }, + { + "epoch": 0.3064, + "grad_norm": 3.0696287155151367, + "learning_rate": 1.7514944717726962e-05, + "loss": 1.8012, + "step": 6128 + }, + { + "epoch": 0.3065, + "grad_norm": 2.5585641860961914, + "learning_rate": 1.7512641335035115e-05, + "loss": 0.5956, + "step": 6130 + }, + { + "epoch": 0.3066, + "grad_norm": 5.251585483551025, + "learning_rate": 1.7510337036950703e-05, + "loss": 1.0672, + "step": 6132 + }, + { + "epoch": 0.3067, + "grad_norm": 4.049012660980225, + "learning_rate": 1.750803182375451e-05, + "loss": 0.9793, + "step": 6134 + }, + { + "epoch": 0.3068, + "grad_norm": 3.8036112785339355, + "learning_rate": 1.7505725695727414e-05, + "loss": 1.015, + "step": 6136 + }, + { + "epoch": 0.3069, + "grad_norm": 3.6785523891448975, + "learning_rate": 1.7503418653150407e-05, + "loss": 0.3919, + "step": 6138 + }, + { + "epoch": 0.307, + "grad_norm": 7.185948371887207, + "learning_rate": 1.7501110696304598e-05, + "loss": 1.1394, + "step": 6140 + }, + { + "epoch": 0.3071, + "grad_norm": 3.3449008464813232, + "learning_rate": 1.7498801825471204e-05, + "loss": 1.0345, + "step": 6142 + }, + { + "epoch": 0.3072, + "grad_norm": 3.159142017364502, + "learning_rate": 1.749649204093155e-05, + "loss": 0.7967, + "step": 6144 + }, + { + "epoch": 0.3073, + "grad_norm": 4.296520709991455, + "learning_rate": 1.7494181342967083e-05, + "loss": 0.499, + "step": 6146 + }, + { + "epoch": 0.3074, + "grad_norm": 6.41526985168457, + "learning_rate": 1.7491869731859353e-05, + "loss": 0.9921, + "step": 6148 + }, + { + "epoch": 0.3075, + "grad_norm": 3.237259864807129, + "learning_rate": 1.7489557207890025e-05, + "loss": 0.6294, + "step": 6150 + }, + { + "epoch": 0.3076, + "grad_norm": 2.340196132659912, + "learning_rate": 1.7487243771340862e-05, + "loss": 0.917, + "step": 6152 + }, + { + "epoch": 0.3077, + "grad_norm": 4.328436851501465, + "learning_rate": 1.748492942249376e-05, + "loss": 1.1855, + "step": 6154 + }, + { + "epoch": 0.3078, + "grad_norm": 5.42263650894165, + "learning_rate": 1.7482614161630714e-05, + "loss": 1.3076, + "step": 6156 + }, + { + "epoch": 0.3079, + "grad_norm": 7.009166717529297, + "learning_rate": 1.7480297989033824e-05, + "loss": 0.9851, + "step": 6158 + }, + { + "epoch": 0.308, + "grad_norm": 5.9452362060546875, + "learning_rate": 1.747798090498532e-05, + "loss": 1.2334, + "step": 6160 + }, + { + "epoch": 0.3081, + "grad_norm": 5.954953193664551, + "learning_rate": 1.7475662909767523e-05, + "loss": 0.6821, + "step": 6162 + }, + { + "epoch": 0.3082, + "grad_norm": 3.800504684448242, + "learning_rate": 1.7473344003662877e-05, + "loss": 1.1563, + "step": 6164 + }, + { + "epoch": 0.3083, + "grad_norm": 5.159421920776367, + "learning_rate": 1.7471024186953936e-05, + "loss": 1.1184, + "step": 6166 + }, + { + "epoch": 0.3084, + "grad_norm": 2.765589475631714, + "learning_rate": 1.746870345992336e-05, + "loss": 1.2698, + "step": 6168 + }, + { + "epoch": 0.3085, + "grad_norm": 7.389960289001465, + "learning_rate": 1.7466381822853915e-05, + "loss": 0.518, + "step": 6170 + }, + { + "epoch": 0.3086, + "grad_norm": 4.809850215911865, + "learning_rate": 1.7464059276028497e-05, + "loss": 1.3633, + "step": 6172 + }, + { + "epoch": 0.3087, + "grad_norm": 2.8322434425354004, + "learning_rate": 1.7461735819730095e-05, + "loss": 0.7871, + "step": 6174 + }, + { + "epoch": 0.3088, + "grad_norm": 1.6877737045288086, + "learning_rate": 1.7459411454241822e-05, + "loss": 1.1944, + "step": 6176 + }, + { + "epoch": 0.3089, + "grad_norm": 6.164298057556152, + "learning_rate": 1.7457086179846888e-05, + "loss": 1.1736, + "step": 6178 + }, + { + "epoch": 0.309, + "grad_norm": 6.525917053222656, + "learning_rate": 1.7454759996828622e-05, + "loss": 1.4091, + "step": 6180 + }, + { + "epoch": 0.3091, + "grad_norm": 11.128629684448242, + "learning_rate": 1.7452432905470465e-05, + "loss": 1.172, + "step": 6182 + }, + { + "epoch": 0.3092, + "grad_norm": 4.12971830368042, + "learning_rate": 1.7450104906055963e-05, + "loss": 0.9774, + "step": 6184 + }, + { + "epoch": 0.3093, + "grad_norm": 4.315813064575195, + "learning_rate": 1.7447775998868778e-05, + "loss": 0.7618, + "step": 6186 + }, + { + "epoch": 0.3094, + "grad_norm": 5.248650074005127, + "learning_rate": 1.7445446184192674e-05, + "loss": 0.8079, + "step": 6188 + }, + { + "epoch": 0.3095, + "grad_norm": 3.7764089107513428, + "learning_rate": 1.744311546231154e-05, + "loss": 0.4071, + "step": 6190 + }, + { + "epoch": 0.3096, + "grad_norm": 2.071070432662964, + "learning_rate": 1.7440783833509366e-05, + "loss": 1.1428, + "step": 6192 + }, + { + "epoch": 0.3097, + "grad_norm": 14.681522369384766, + "learning_rate": 1.7438451298070252e-05, + "loss": 1.4486, + "step": 6194 + }, + { + "epoch": 0.3098, + "grad_norm": 6.977023601531982, + "learning_rate": 1.743611785627841e-05, + "loss": 0.9116, + "step": 6196 + }, + { + "epoch": 0.3099, + "grad_norm": 1.6971757411956787, + "learning_rate": 1.7433783508418163e-05, + "loss": 1.2064, + "step": 6198 + }, + { + "epoch": 0.31, + "grad_norm": 6.704974174499512, + "learning_rate": 1.7431448254773943e-05, + "loss": 1.0499, + "step": 6200 + }, + { + "epoch": 0.3101, + "grad_norm": 2.9503612518310547, + "learning_rate": 1.7429112095630296e-05, + "loss": 0.7269, + "step": 6202 + }, + { + "epoch": 0.3102, + "grad_norm": 1.570225477218628, + "learning_rate": 1.7426775031271876e-05, + "loss": 0.2424, + "step": 6204 + }, + { + "epoch": 0.3103, + "grad_norm": 3.1116690635681152, + "learning_rate": 1.7424437061983445e-05, + "loss": 1.0935, + "step": 6206 + }, + { + "epoch": 0.3104, + "grad_norm": 2.7624807357788086, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.8897, + "step": 6208 + }, + { + "epoch": 0.3105, + "grad_norm": 3.900327444076538, + "learning_rate": 1.7419758409756163e-05, + "loss": 0.7467, + "step": 6210 + }, + { + "epoch": 0.3106, + "grad_norm": 5.0186567306518555, + "learning_rate": 1.7417417727387392e-05, + "loss": 1.1735, + "step": 6212 + }, + { + "epoch": 0.3107, + "grad_norm": 3.7180874347686768, + "learning_rate": 1.741507614122877e-05, + "loss": 0.6643, + "step": 6214 + }, + { + "epoch": 0.3108, + "grad_norm": 14.132955551147461, + "learning_rate": 1.741273365156561e-05, + "loss": 0.9146, + "step": 6216 + }, + { + "epoch": 0.3109, + "grad_norm": 4.023470878601074, + "learning_rate": 1.7410390258683345e-05, + "loss": 1.0488, + "step": 6218 + }, + { + "epoch": 0.311, + "grad_norm": 8.849810600280762, + "learning_rate": 1.74080459628675e-05, + "loss": 1.1974, + "step": 6220 + }, + { + "epoch": 0.3111, + "grad_norm": 8.461770057678223, + "learning_rate": 1.7405700764403726e-05, + "loss": 1.2585, + "step": 6222 + }, + { + "epoch": 0.3112, + "grad_norm": 7.008925437927246, + "learning_rate": 1.7403354663577782e-05, + "loss": 1.604, + "step": 6224 + }, + { + "epoch": 0.3113, + "grad_norm": 2.519697427749634, + "learning_rate": 1.7401007660675525e-05, + "loss": 0.691, + "step": 6226 + }, + { + "epoch": 0.3114, + "grad_norm": 5.040878772735596, + "learning_rate": 1.7398659755982937e-05, + "loss": 0.7222, + "step": 6228 + }, + { + "epoch": 0.3115, + "grad_norm": 7.504505157470703, + "learning_rate": 1.73963109497861e-05, + "loss": 0.6696, + "step": 6230 + }, + { + "epoch": 0.3116, + "grad_norm": 9.904169082641602, + "learning_rate": 1.7393961242371203e-05, + "loss": 1.0432, + "step": 6232 + }, + { + "epoch": 0.3117, + "grad_norm": 7.27670955657959, + "learning_rate": 1.7391610634024566e-05, + "loss": 1.4395, + "step": 6234 + }, + { + "epoch": 0.3118, + "grad_norm": 1.319972276687622, + "learning_rate": 1.738925912503259e-05, + "loss": 1.1015, + "step": 6236 + }, + { + "epoch": 0.3119, + "grad_norm": 5.022871017456055, + "learning_rate": 1.73869067156818e-05, + "loss": 1.3952, + "step": 6238 + }, + { + "epoch": 0.312, + "grad_norm": 11.482023239135742, + "learning_rate": 1.7384553406258842e-05, + "loss": 1.4968, + "step": 6240 + }, + { + "epoch": 0.3121, + "grad_norm": 2.9118878841400146, + "learning_rate": 1.7382199197050443e-05, + "loss": 0.8618, + "step": 6242 + }, + { + "epoch": 0.3122, + "grad_norm": 7.376513957977295, + "learning_rate": 1.737984408834347e-05, + "loss": 1.5634, + "step": 6244 + }, + { + "epoch": 0.3123, + "grad_norm": 2.945338487625122, + "learning_rate": 1.7377488080424875e-05, + "loss": 0.8387, + "step": 6246 + }, + { + "epoch": 0.3124, + "grad_norm": 4.036867141723633, + "learning_rate": 1.737513117358174e-05, + "loss": 1.024, + "step": 6248 + }, + { + "epoch": 0.3125, + "grad_norm": 2.138068675994873, + "learning_rate": 1.737277336810124e-05, + "loss": 0.9214, + "step": 6250 + }, + { + "epoch": 0.3126, + "grad_norm": 4.61332368850708, + "learning_rate": 1.7370414664270675e-05, + "loss": 1.8375, + "step": 6252 + }, + { + "epoch": 0.3127, + "grad_norm": 5.501501083374023, + "learning_rate": 1.7368055062377435e-05, + "loss": 1.3604, + "step": 6254 + }, + { + "epoch": 0.3128, + "grad_norm": 5.476563930511475, + "learning_rate": 1.7365694562709034e-05, + "loss": 0.8249, + "step": 6256 + }, + { + "epoch": 0.3129, + "grad_norm": 3.703364133834839, + "learning_rate": 1.7363333165553095e-05, + "loss": 2.396, + "step": 6258 + }, + { + "epoch": 0.313, + "grad_norm": 5.108853816986084, + "learning_rate": 1.7360970871197347e-05, + "loss": 1.1052, + "step": 6260 + }, + { + "epoch": 0.3131, + "grad_norm": 3.229113817214966, + "learning_rate": 1.7358607679929623e-05, + "loss": 1.3175, + "step": 6262 + }, + { + "epoch": 0.3132, + "grad_norm": 14.869935989379883, + "learning_rate": 1.7356243592037876e-05, + "loss": 1.0645, + "step": 6264 + }, + { + "epoch": 0.3133, + "grad_norm": 2.759594678878784, + "learning_rate": 1.735387860781016e-05, + "loss": 0.5268, + "step": 6266 + }, + { + "epoch": 0.3134, + "grad_norm": 4.925304889678955, + "learning_rate": 1.7351512727534645e-05, + "loss": 1.1533, + "step": 6268 + }, + { + "epoch": 0.3135, + "grad_norm": 1.317628026008606, + "learning_rate": 1.73491459514996e-05, + "loss": 0.231, + "step": 6270 + }, + { + "epoch": 0.3136, + "grad_norm": 2.4406416416168213, + "learning_rate": 1.7346778279993417e-05, + "loss": 0.5923, + "step": 6272 + }, + { + "epoch": 0.3137, + "grad_norm": 5.335829257965088, + "learning_rate": 1.7344409713304582e-05, + "loss": 0.8262, + "step": 6274 + }, + { + "epoch": 0.3138, + "grad_norm": 10.406717300415039, + "learning_rate": 1.7342040251721702e-05, + "loss": 1.4739, + "step": 6276 + }, + { + "epoch": 0.3139, + "grad_norm": 2.1169183254241943, + "learning_rate": 1.7339669895533493e-05, + "loss": 0.7402, + "step": 6278 + }, + { + "epoch": 0.314, + "grad_norm": 4.685873508453369, + "learning_rate": 1.7337298645028764e-05, + "loss": 1.2235, + "step": 6280 + }, + { + "epoch": 0.3141, + "grad_norm": 3.069225311279297, + "learning_rate": 1.7334926500496458e-05, + "loss": 1.3111, + "step": 6282 + }, + { + "epoch": 0.3142, + "grad_norm": 6.183600425720215, + "learning_rate": 1.7332553462225604e-05, + "loss": 0.5961, + "step": 6284 + }, + { + "epoch": 0.3143, + "grad_norm": 5.222561836242676, + "learning_rate": 1.733017953050535e-05, + "loss": 1.1131, + "step": 6286 + }, + { + "epoch": 0.3144, + "grad_norm": 19.906963348388672, + "learning_rate": 1.732780470562496e-05, + "loss": 2.0777, + "step": 6288 + }, + { + "epoch": 0.3145, + "grad_norm": 2.2102763652801514, + "learning_rate": 1.732542898787379e-05, + "loss": 1.1601, + "step": 6290 + }, + { + "epoch": 0.3146, + "grad_norm": 0.14022450149059296, + "learning_rate": 1.732305237754132e-05, + "loss": 0.6312, + "step": 6292 + }, + { + "epoch": 0.3147, + "grad_norm": 3.0285258293151855, + "learning_rate": 1.732067487491713e-05, + "loss": 2.0642, + "step": 6294 + }, + { + "epoch": 0.3148, + "grad_norm": 7.010583400726318, + "learning_rate": 1.7318296480290912e-05, + "loss": 1.1505, + "step": 6296 + }, + { + "epoch": 0.3149, + "grad_norm": 3.926870822906494, + "learning_rate": 1.731591719395247e-05, + "loss": 1.5907, + "step": 6298 + }, + { + "epoch": 0.315, + "grad_norm": 2.860609531402588, + "learning_rate": 1.7313537016191706e-05, + "loss": 0.8155, + "step": 6300 + }, + { + "epoch": 0.3151, + "grad_norm": 1.8487781286239624, + "learning_rate": 1.7311155947298644e-05, + "loss": 1.2959, + "step": 6302 + }, + { + "epoch": 0.3152, + "grad_norm": 5.594986438751221, + "learning_rate": 1.7308773987563406e-05, + "loss": 0.9849, + "step": 6304 + }, + { + "epoch": 0.3153, + "grad_norm": 4.69092321395874, + "learning_rate": 1.7306391137276227e-05, + "loss": 1.363, + "step": 6306 + }, + { + "epoch": 0.3154, + "grad_norm": 4.820458889007568, + "learning_rate": 1.730400739672745e-05, + "loss": 0.6904, + "step": 6308 + }, + { + "epoch": 0.3155, + "grad_norm": 2.4770424365997314, + "learning_rate": 1.7301622766207526e-05, + "loss": 1.5027, + "step": 6310 + }, + { + "epoch": 0.3156, + "grad_norm": 1.237027883529663, + "learning_rate": 1.7299237246007018e-05, + "loss": 0.8474, + "step": 6312 + }, + { + "epoch": 0.3157, + "grad_norm": 5.133941650390625, + "learning_rate": 1.729685083641659e-05, + "loss": 0.9481, + "step": 6314 + }, + { + "epoch": 0.3158, + "grad_norm": 3.072625160217285, + "learning_rate": 1.7294463537727026e-05, + "loss": 1.4681, + "step": 6316 + }, + { + "epoch": 0.3159, + "grad_norm": 1.858057975769043, + "learning_rate": 1.72920753502292e-05, + "loss": 0.6225, + "step": 6318 + }, + { + "epoch": 0.316, + "grad_norm": 11.724316596984863, + "learning_rate": 1.7289686274214116e-05, + "loss": 0.7155, + "step": 6320 + }, + { + "epoch": 0.3161, + "grad_norm": 7.000084400177002, + "learning_rate": 1.728729630997287e-05, + "loss": 0.4264, + "step": 6322 + }, + { + "epoch": 0.3162, + "grad_norm": 8.350211143493652, + "learning_rate": 1.7284905457796678e-05, + "loss": 0.9856, + "step": 6324 + }, + { + "epoch": 0.3163, + "grad_norm": 5.103370189666748, + "learning_rate": 1.7282513717976847e-05, + "loss": 0.5024, + "step": 6326 + }, + { + "epoch": 0.3164, + "grad_norm": 8.193317413330078, + "learning_rate": 1.7280121090804813e-05, + "loss": 1.4049, + "step": 6328 + }, + { + "epoch": 0.3165, + "grad_norm": 6.386935710906982, + "learning_rate": 1.7277727576572108e-05, + "loss": 0.6422, + "step": 6330 + }, + { + "epoch": 0.3166, + "grad_norm": 3.81465744972229, + "learning_rate": 1.727533317557037e-05, + "loss": 1.3911, + "step": 6332 + }, + { + "epoch": 0.3167, + "grad_norm": 3.9343323707580566, + "learning_rate": 1.7272937888091355e-05, + "loss": 0.5264, + "step": 6334 + }, + { + "epoch": 0.3168, + "grad_norm": 2.5354185104370117, + "learning_rate": 1.727054171442692e-05, + "loss": 0.4498, + "step": 6336 + }, + { + "epoch": 0.3169, + "grad_norm": 2.4566848278045654, + "learning_rate": 1.726814465486903e-05, + "loss": 0.9299, + "step": 6338 + }, + { + "epoch": 0.317, + "grad_norm": 4.4318742752075195, + "learning_rate": 1.7265746709709762e-05, + "loss": 0.8701, + "step": 6340 + }, + { + "epoch": 0.3171, + "grad_norm": 10.021151542663574, + "learning_rate": 1.7263347879241294e-05, + "loss": 1.3672, + "step": 6342 + }, + { + "epoch": 0.3172, + "grad_norm": 4.175997257232666, + "learning_rate": 1.7260948163755918e-05, + "loss": 0.8161, + "step": 6344 + }, + { + "epoch": 0.3173, + "grad_norm": 9.663496971130371, + "learning_rate": 1.725854756354604e-05, + "loss": 0.6034, + "step": 6346 + }, + { + "epoch": 0.3174, + "grad_norm": 12.625722885131836, + "learning_rate": 1.7256146078904153e-05, + "loss": 1.0003, + "step": 6348 + }, + { + "epoch": 0.3175, + "grad_norm": 13.36802864074707, + "learning_rate": 1.7253743710122877e-05, + "loss": 1.5072, + "step": 6350 + }, + { + "epoch": 0.3176, + "grad_norm": 5.012056350708008, + "learning_rate": 1.7251340457494934e-05, + "loss": 1.1554, + "step": 6352 + }, + { + "epoch": 0.3177, + "grad_norm": 4.643257141113281, + "learning_rate": 1.724893632131315e-05, + "loss": 0.7971, + "step": 6354 + }, + { + "epoch": 0.3178, + "grad_norm": 6.472977638244629, + "learning_rate": 1.7246531301870467e-05, + "loss": 1.2653, + "step": 6356 + }, + { + "epoch": 0.3179, + "grad_norm": 6.219174385070801, + "learning_rate": 1.7244125399459926e-05, + "loss": 1.0988, + "step": 6358 + }, + { + "epoch": 0.318, + "grad_norm": 7.8236541748046875, + "learning_rate": 1.7241718614374678e-05, + "loss": 1.2183, + "step": 6360 + }, + { + "epoch": 0.3181, + "grad_norm": 6.935734272003174, + "learning_rate": 1.723931094690798e-05, + "loss": 1.1605, + "step": 6362 + }, + { + "epoch": 0.3182, + "grad_norm": 6.227881908416748, + "learning_rate": 1.7236902397353204e-05, + "loss": 1.2893, + "step": 6364 + }, + { + "epoch": 0.3183, + "grad_norm": 3.6920053958892822, + "learning_rate": 1.7234492966003828e-05, + "loss": 1.6655, + "step": 6366 + }, + { + "epoch": 0.3184, + "grad_norm": 7.218326091766357, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.5811, + "step": 6368 + }, + { + "epoch": 0.3185, + "grad_norm": 9.81350326538086, + "learning_rate": 1.7229671459095682e-05, + "loss": 1.0022, + "step": 6370 + }, + { + "epoch": 0.3186, + "grad_norm": 4.785780906677246, + "learning_rate": 1.7227259384124408e-05, + "loss": 0.8546, + "step": 6372 + }, + { + "epoch": 0.3187, + "grad_norm": 6.3726277351379395, + "learning_rate": 1.7224846428533498e-05, + "loss": 1.3447, + "step": 6374 + }, + { + "epoch": 0.3188, + "grad_norm": 6.131468296051025, + "learning_rate": 1.722243259261697e-05, + "loss": 0.9232, + "step": 6376 + }, + { + "epoch": 0.3189, + "grad_norm": 6.782886981964111, + "learning_rate": 1.7220017876668934e-05, + "loss": 0.6951, + "step": 6378 + }, + { + "epoch": 0.319, + "grad_norm": 3.363818883895874, + "learning_rate": 1.7217602280983622e-05, + "loss": 0.4469, + "step": 6380 + }, + { + "epoch": 0.3191, + "grad_norm": 8.072650909423828, + "learning_rate": 1.721518580585537e-05, + "loss": 1.5552, + "step": 6382 + }, + { + "epoch": 0.3192, + "grad_norm": 4.046308994293213, + "learning_rate": 1.721276845157861e-05, + "loss": 0.2255, + "step": 6384 + }, + { + "epoch": 0.3193, + "grad_norm": 3.486799478530884, + "learning_rate": 1.721035021844789e-05, + "loss": 1.111, + "step": 6386 + }, + { + "epoch": 0.3194, + "grad_norm": 2.0014688968658447, + "learning_rate": 1.7207931106757867e-05, + "loss": 1.539, + "step": 6388 + }, + { + "epoch": 0.3195, + "grad_norm": 5.316035270690918, + "learning_rate": 1.7205511116803306e-05, + "loss": 1.0367, + "step": 6390 + }, + { + "epoch": 0.3196, + "grad_norm": 9.056877136230469, + "learning_rate": 1.720309024887907e-05, + "loss": 1.3149, + "step": 6392 + }, + { + "epoch": 0.3197, + "grad_norm": 3.133908271789551, + "learning_rate": 1.720066850328014e-05, + "loss": 0.9339, + "step": 6394 + }, + { + "epoch": 0.3198, + "grad_norm": 5.81328010559082, + "learning_rate": 1.719824588030159e-05, + "loss": 1.0342, + "step": 6396 + }, + { + "epoch": 0.3199, + "grad_norm": 5.020055770874023, + "learning_rate": 1.7195822380238615e-05, + "loss": 1.2405, + "step": 6398 + }, + { + "epoch": 0.32, + "grad_norm": 12.712276458740234, + "learning_rate": 1.7193398003386514e-05, + "loss": 1.8228, + "step": 6400 + }, + { + "epoch": 0.3201, + "grad_norm": 0.9288871884346008, + "learning_rate": 1.719097275004068e-05, + "loss": 0.7817, + "step": 6402 + }, + { + "epoch": 0.3202, + "grad_norm": 2.8871610164642334, + "learning_rate": 1.7188546620496634e-05, + "loss": 1.0641, + "step": 6404 + }, + { + "epoch": 0.3203, + "grad_norm": 9.114827156066895, + "learning_rate": 1.7186119615049986e-05, + "loss": 1.9494, + "step": 6406 + }, + { + "epoch": 0.3204, + "grad_norm": 2.6083288192749023, + "learning_rate": 1.7183691733996463e-05, + "loss": 0.7563, + "step": 6408 + }, + { + "epoch": 0.3205, + "grad_norm": 2.074746608734131, + "learning_rate": 1.718126297763189e-05, + "loss": 0.7256, + "step": 6410 + }, + { + "epoch": 0.3206, + "grad_norm": 9.11270523071289, + "learning_rate": 1.7178833346252208e-05, + "loss": 1.9448, + "step": 6412 + }, + { + "epoch": 0.3207, + "grad_norm": 3.7582201957702637, + "learning_rate": 1.7176402840153453e-05, + "loss": 1.5551, + "step": 6414 + }, + { + "epoch": 0.3208, + "grad_norm": 2.746889114379883, + "learning_rate": 1.717397145963179e-05, + "loss": 1.2591, + "step": 6416 + }, + { + "epoch": 0.3209, + "grad_norm": 1.691536545753479, + "learning_rate": 1.7171539204983457e-05, + "loss": 0.3423, + "step": 6418 + }, + { + "epoch": 0.321, + "grad_norm": 0.6127015948295593, + "learning_rate": 1.716910607650483e-05, + "loss": 0.285, + "step": 6420 + }, + { + "epoch": 0.3211, + "grad_norm": 4.642768859863281, + "learning_rate": 1.7166672074492373e-05, + "loss": 0.4813, + "step": 6422 + }, + { + "epoch": 0.3212, + "grad_norm": 2.7750000953674316, + "learning_rate": 1.716423719924266e-05, + "loss": 0.9174, + "step": 6424 + }, + { + "epoch": 0.3213, + "grad_norm": 8.451303482055664, + "learning_rate": 1.7161801451052378e-05, + "loss": 0.7813, + "step": 6426 + }, + { + "epoch": 0.3214, + "grad_norm": 4.137636184692383, + "learning_rate": 1.7159364830218312e-05, + "loss": 0.904, + "step": 6428 + }, + { + "epoch": 0.3215, + "grad_norm": 4.467907428741455, + "learning_rate": 1.715692733703736e-05, + "loss": 1.1483, + "step": 6430 + }, + { + "epoch": 0.3216, + "grad_norm": 3.9588534832000732, + "learning_rate": 1.715448897180652e-05, + "loss": 0.3482, + "step": 6432 + }, + { + "epoch": 0.3217, + "grad_norm": 20.042619705200195, + "learning_rate": 1.7152049734822903e-05, + "loss": 1.2426, + "step": 6434 + }, + { + "epoch": 0.3218, + "grad_norm": 4.056471824645996, + "learning_rate": 1.7149609626383718e-05, + "loss": 0.9789, + "step": 6436 + }, + { + "epoch": 0.3219, + "grad_norm": 7.966047286987305, + "learning_rate": 1.7147168646786284e-05, + "loss": 1.2408, + "step": 6438 + }, + { + "epoch": 0.322, + "grad_norm": 2.862856864929199, + "learning_rate": 1.7144726796328034e-05, + "loss": 0.6229, + "step": 6440 + }, + { + "epoch": 0.3221, + "grad_norm": 5.347121238708496, + "learning_rate": 1.7142284075306497e-05, + "loss": 1.0693, + "step": 6442 + }, + { + "epoch": 0.3222, + "grad_norm": 2.323544502258301, + "learning_rate": 1.713984048401931e-05, + "loss": 1.0212, + "step": 6444 + }, + { + "epoch": 0.3223, + "grad_norm": 3.6693148612976074, + "learning_rate": 1.7137396022764216e-05, + "loss": 0.5977, + "step": 6446 + }, + { + "epoch": 0.3224, + "grad_norm": 4.284255027770996, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.6685, + "step": 6448 + }, + { + "epoch": 0.3225, + "grad_norm": 3.6481821537017822, + "learning_rate": 1.713250449154182e-05, + "loss": 1.0812, + "step": 6450 + }, + { + "epoch": 0.3226, + "grad_norm": 4.391505718231201, + "learning_rate": 1.713005742217053e-05, + "loss": 1.163, + "step": 6452 + }, + { + "epoch": 0.3227, + "grad_norm": 2.212996244430542, + "learning_rate": 1.7127609484023377e-05, + "loss": 1.0481, + "step": 6454 + }, + { + "epoch": 0.3228, + "grad_norm": 3.5641157627105713, + "learning_rate": 1.7125160677398625e-05, + "loss": 0.6262, + "step": 6456 + }, + { + "epoch": 0.3229, + "grad_norm": 5.336263179779053, + "learning_rate": 1.712271100259466e-05, + "loss": 1.5009, + "step": 6458 + }, + { + "epoch": 0.323, + "grad_norm": 3.9754910469055176, + "learning_rate": 1.712026045990997e-05, + "loss": 0.696, + "step": 6460 + }, + { + "epoch": 0.3231, + "grad_norm": 3.168739080429077, + "learning_rate": 1.711780904964313e-05, + "loss": 0.5122, + "step": 6462 + }, + { + "epoch": 0.3232, + "grad_norm": 2.979757308959961, + "learning_rate": 1.7115356772092858e-05, + "loss": 0.7002, + "step": 6464 + }, + { + "epoch": 0.3233, + "grad_norm": 6.069407939910889, + "learning_rate": 1.711290362755794e-05, + "loss": 1.6938, + "step": 6466 + }, + { + "epoch": 0.3234, + "grad_norm": 30.044584274291992, + "learning_rate": 1.711044961633729e-05, + "loss": 1.5544, + "step": 6468 + }, + { + "epoch": 0.3235, + "grad_norm": 2.7767770290374756, + "learning_rate": 1.710799473872993e-05, + "loss": 0.8955, + "step": 6470 + }, + { + "epoch": 0.3236, + "grad_norm": 5.483248710632324, + "learning_rate": 1.710553899503496e-05, + "loss": 0.7546, + "step": 6472 + }, + { + "epoch": 0.3237, + "grad_norm": 2.9848787784576416, + "learning_rate": 1.7103082385551627e-05, + "loss": 1.0445, + "step": 6474 + }, + { + "epoch": 0.3238, + "grad_norm": 4.327979564666748, + "learning_rate": 1.710062491057925e-05, + "loss": 1.316, + "step": 6476 + }, + { + "epoch": 0.3239, + "grad_norm": 4.452634334564209, + "learning_rate": 1.7098166570417264e-05, + "loss": 1.129, + "step": 6478 + }, + { + "epoch": 0.324, + "grad_norm": 5.198127269744873, + "learning_rate": 1.709570736536521e-05, + "loss": 1.1768, + "step": 6480 + }, + { + "epoch": 0.3241, + "grad_norm": 2.7113683223724365, + "learning_rate": 1.709324729572274e-05, + "loss": 1.0158, + "step": 6482 + }, + { + "epoch": 0.3242, + "grad_norm": 5.819094181060791, + "learning_rate": 1.7090786361789602e-05, + "loss": 0.6728, + "step": 6484 + }, + { + "epoch": 0.3243, + "grad_norm": 3.263089895248413, + "learning_rate": 1.7088324563865658e-05, + "loss": 1.0615, + "step": 6486 + }, + { + "epoch": 0.3244, + "grad_norm": 2.7157273292541504, + "learning_rate": 1.7085861902250864e-05, + "loss": 0.759, + "step": 6488 + }, + { + "epoch": 0.3245, + "grad_norm": 7.007519721984863, + "learning_rate": 1.708339837724529e-05, + "loss": 1.2184, + "step": 6490 + }, + { + "epoch": 0.3246, + "grad_norm": 8.46757698059082, + "learning_rate": 1.7080933989149112e-05, + "loss": 0.7337, + "step": 6492 + }, + { + "epoch": 0.3247, + "grad_norm": 14.769843101501465, + "learning_rate": 1.7078468738262602e-05, + "loss": 1.754, + "step": 6494 + }, + { + "epoch": 0.3248, + "grad_norm": 4.805827617645264, + "learning_rate": 1.7076002624886156e-05, + "loss": 1.2587, + "step": 6496 + }, + { + "epoch": 0.3249, + "grad_norm": 5.8346428871154785, + "learning_rate": 1.707353564932025e-05, + "loss": 1.0846, + "step": 6498 + }, + { + "epoch": 0.325, + "grad_norm": 5.9035420417785645, + "learning_rate": 1.7071067811865477e-05, + "loss": 1.5148, + "step": 6500 + }, + { + "epoch": 0.3251, + "grad_norm": 3.5271096229553223, + "learning_rate": 1.7068599112822544e-05, + "loss": 0.6114, + "step": 6502 + }, + { + "epoch": 0.3252, + "grad_norm": 4.374392509460449, + "learning_rate": 1.706612955249225e-05, + "loss": 1.6622, + "step": 6504 + }, + { + "epoch": 0.3253, + "grad_norm": 2.1772942543029785, + "learning_rate": 1.7063659131175502e-05, + "loss": 0.5519, + "step": 6506 + }, + { + "epoch": 0.3254, + "grad_norm": 4.653169631958008, + "learning_rate": 1.7061187849173318e-05, + "loss": 0.5476, + "step": 6508 + }, + { + "epoch": 0.3255, + "grad_norm": 2.924428939819336, + "learning_rate": 1.7058715706786813e-05, + "loss": 1.1332, + "step": 6510 + }, + { + "epoch": 0.3256, + "grad_norm": 2.7395856380462646, + "learning_rate": 1.705624270431721e-05, + "loss": 1.1473, + "step": 6512 + }, + { + "epoch": 0.3257, + "grad_norm": 2.728713035583496, + "learning_rate": 1.7053768842065833e-05, + "loss": 1.132, + "step": 6514 + }, + { + "epoch": 0.3258, + "grad_norm": 5.054173946380615, + "learning_rate": 1.7051294120334126e-05, + "loss": 0.9094, + "step": 6516 + }, + { + "epoch": 0.3259, + "grad_norm": 2.2402241230010986, + "learning_rate": 1.7048818539423616e-05, + "loss": 1.2644, + "step": 6518 + }, + { + "epoch": 0.326, + "grad_norm": 2.422546863555908, + "learning_rate": 1.7046342099635948e-05, + "loss": 0.872, + "step": 6520 + }, + { + "epoch": 0.3261, + "grad_norm": 2.600785255432129, + "learning_rate": 1.704386480127287e-05, + "loss": 1.0975, + "step": 6522 + }, + { + "epoch": 0.3262, + "grad_norm": 10.368834495544434, + "learning_rate": 1.704138664463623e-05, + "loss": 0.7089, + "step": 6524 + }, + { + "epoch": 0.3263, + "grad_norm": 0.46608540415763855, + "learning_rate": 1.703890763002799e-05, + "loss": 0.3526, + "step": 6526 + }, + { + "epoch": 0.3264, + "grad_norm": 14.109301567077637, + "learning_rate": 1.7036427757750205e-05, + "loss": 1.9766, + "step": 6528 + }, + { + "epoch": 0.3265, + "grad_norm": 3.0828020572662354, + "learning_rate": 1.703394702810504e-05, + "loss": 0.5902, + "step": 6530 + }, + { + "epoch": 0.3266, + "grad_norm": 5.305752277374268, + "learning_rate": 1.7031465441394766e-05, + "loss": 1.1559, + "step": 6532 + }, + { + "epoch": 0.3267, + "grad_norm": 2.565917491912842, + "learning_rate": 1.702898299792176e-05, + "loss": 1.3445, + "step": 6534 + }, + { + "epoch": 0.3268, + "grad_norm": 2.613044023513794, + "learning_rate": 1.7026499697988496e-05, + "loss": 0.3906, + "step": 6536 + }, + { + "epoch": 0.3269, + "grad_norm": 5.644947528839111, + "learning_rate": 1.702401554189755e-05, + "loss": 1.06, + "step": 6538 + }, + { + "epoch": 0.327, + "grad_norm": 13.54793643951416, + "learning_rate": 1.7021530529951627e-05, + "loss": 2.4085, + "step": 6540 + }, + { + "epoch": 0.3271, + "grad_norm": 6.403335094451904, + "learning_rate": 1.7019044662453502e-05, + "loss": 1.7526, + "step": 6542 + }, + { + "epoch": 0.3272, + "grad_norm": 7.488145351409912, + "learning_rate": 1.7016557939706075e-05, + "loss": 1.762, + "step": 6544 + }, + { + "epoch": 0.3273, + "grad_norm": 2.152088165283203, + "learning_rate": 1.701407036201235e-05, + "loss": 0.655, + "step": 6546 + }, + { + "epoch": 0.3274, + "grad_norm": 11.381845474243164, + "learning_rate": 1.7011581929675424e-05, + "loss": 1.6102, + "step": 6548 + }, + { + "epoch": 0.3275, + "grad_norm": 4.917983055114746, + "learning_rate": 1.700909264299851e-05, + "loss": 1.2821, + "step": 6550 + }, + { + "epoch": 0.3276, + "grad_norm": 4.1111369132995605, + "learning_rate": 1.700660250228492e-05, + "loss": 0.809, + "step": 6552 + }, + { + "epoch": 0.3277, + "grad_norm": 5.6742844581604, + "learning_rate": 1.7004111507838067e-05, + "loss": 0.6693, + "step": 6554 + }, + { + "epoch": 0.3278, + "grad_norm": 3.6797635555267334, + "learning_rate": 1.7001619659961467e-05, + "loss": 1.4287, + "step": 6556 + }, + { + "epoch": 0.3279, + "grad_norm": 4.691934108734131, + "learning_rate": 1.6999126958958755e-05, + "loss": 1.2638, + "step": 6558 + }, + { + "epoch": 0.328, + "grad_norm": 9.87444019317627, + "learning_rate": 1.6996633405133656e-05, + "loss": 0.7803, + "step": 6560 + }, + { + "epoch": 0.3281, + "grad_norm": 3.195220947265625, + "learning_rate": 1.6994138998789997e-05, + "loss": 1.2785, + "step": 6562 + }, + { + "epoch": 0.3282, + "grad_norm": 2.245203733444214, + "learning_rate": 1.6991643740231714e-05, + "loss": 0.8664, + "step": 6564 + }, + { + "epoch": 0.3283, + "grad_norm": 4.292178630828857, + "learning_rate": 1.6989147629762852e-05, + "loss": 1.0183, + "step": 6566 + }, + { + "epoch": 0.3284, + "grad_norm": 2.6961123943328857, + "learning_rate": 1.6986650667687552e-05, + "loss": 0.7035, + "step": 6568 + }, + { + "epoch": 0.3285, + "grad_norm": 6.205986976623535, + "learning_rate": 1.6984152854310063e-05, + "loss": 1.3513, + "step": 6570 + }, + { + "epoch": 0.3286, + "grad_norm": 1.5919686555862427, + "learning_rate": 1.698165418993473e-05, + "loss": 0.9508, + "step": 6572 + }, + { + "epoch": 0.3287, + "grad_norm": 3.3274424076080322, + "learning_rate": 1.6979154674866012e-05, + "loss": 1.2532, + "step": 6574 + }, + { + "epoch": 0.3288, + "grad_norm": 5.911673545837402, + "learning_rate": 1.6976654309408464e-05, + "loss": 1.0586, + "step": 6576 + }, + { + "epoch": 0.3289, + "grad_norm": 2.228382110595703, + "learning_rate": 1.6974153093866757e-05, + "loss": 1.7311, + "step": 6578 + }, + { + "epoch": 0.329, + "grad_norm": 6.864614486694336, + "learning_rate": 1.697165102854565e-05, + "loss": 1.0148, + "step": 6580 + }, + { + "epoch": 0.3291, + "grad_norm": 6.253079891204834, + "learning_rate": 1.6969148113750007e-05, + "loss": 0.163, + "step": 6582 + }, + { + "epoch": 0.3292, + "grad_norm": 2.441910982131958, + "learning_rate": 1.696664434978481e-05, + "loss": 0.9342, + "step": 6584 + }, + { + "epoch": 0.3293, + "grad_norm": 3.081655979156494, + "learning_rate": 1.696413973695513e-05, + "loss": 0.8536, + "step": 6586 + }, + { + "epoch": 0.3294, + "grad_norm": 1.839276909828186, + "learning_rate": 1.6961634275566147e-05, + "loss": 0.6839, + "step": 6588 + }, + { + "epoch": 0.3295, + "grad_norm": 2.970372200012207, + "learning_rate": 1.6959127965923144e-05, + "loss": 1.2167, + "step": 6590 + }, + { + "epoch": 0.3296, + "grad_norm": 2.76967191696167, + "learning_rate": 1.695662080833151e-05, + "loss": 0.7532, + "step": 6592 + }, + { + "epoch": 0.3297, + "grad_norm": 10.833727836608887, + "learning_rate": 1.695411280309673e-05, + "loss": 0.8834, + "step": 6594 + }, + { + "epoch": 0.3298, + "grad_norm": 6.336899757385254, + "learning_rate": 1.69516039505244e-05, + "loss": 1.7104, + "step": 6596 + }, + { + "epoch": 0.3299, + "grad_norm": 5.508915424346924, + "learning_rate": 1.6949094250920216e-05, + "loss": 1.6319, + "step": 6598 + }, + { + "epoch": 0.33, + "grad_norm": 2.77347469329834, + "learning_rate": 1.6946583704589973e-05, + "loss": 0.6245, + "step": 6600 + }, + { + "epoch": 0.3301, + "grad_norm": 3.3018205165863037, + "learning_rate": 1.694407231183958e-05, + "loss": 1.5007, + "step": 6602 + }, + { + "epoch": 0.3302, + "grad_norm": 2.808913469314575, + "learning_rate": 1.694156007297504e-05, + "loss": 0.8218, + "step": 6604 + }, + { + "epoch": 0.3303, + "grad_norm": 13.403093338012695, + "learning_rate": 1.6939046988302458e-05, + "loss": 1.4227, + "step": 6606 + }, + { + "epoch": 0.3304, + "grad_norm": 3.703559160232544, + "learning_rate": 1.693653305812805e-05, + "loss": 0.9953, + "step": 6608 + }, + { + "epoch": 0.3305, + "grad_norm": 5.103291034698486, + "learning_rate": 1.693401828275813e-05, + "loss": 1.195, + "step": 6610 + }, + { + "epoch": 0.3306, + "grad_norm": 8.398486137390137, + "learning_rate": 1.6931502662499116e-05, + "loss": 1.9417, + "step": 6612 + }, + { + "epoch": 0.3307, + "grad_norm": 8.665949821472168, + "learning_rate": 1.6928986197657525e-05, + "loss": 1.8342, + "step": 6614 + }, + { + "epoch": 0.3308, + "grad_norm": 22.401748657226562, + "learning_rate": 1.6926468888539988e-05, + "loss": 1.1311, + "step": 6616 + }, + { + "epoch": 0.3309, + "grad_norm": 1.9432040452957153, + "learning_rate": 1.692395073545323e-05, + "loss": 1.1453, + "step": 6618 + }, + { + "epoch": 0.331, + "grad_norm": 3.5443267822265625, + "learning_rate": 1.692143173870407e-05, + "loss": 1.1138, + "step": 6620 + }, + { + "epoch": 0.3311, + "grad_norm": 6.40571928024292, + "learning_rate": 1.691891189859945e-05, + "loss": 1.2052, + "step": 6622 + }, + { + "epoch": 0.3312, + "grad_norm": 7.98395299911499, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.4549, + "step": 6624 + }, + { + "epoch": 0.3313, + "grad_norm": 5.314472198486328, + "learning_rate": 1.6913869689552066e-05, + "loss": 0.6893, + "step": 6626 + }, + { + "epoch": 0.3314, + "grad_norm": 2.1915667057037354, + "learning_rate": 1.691134732122368e-05, + "loss": 0.7021, + "step": 6628 + }, + { + "epoch": 0.3315, + "grad_norm": 3.523676872253418, + "learning_rate": 1.6908824110768584e-05, + "loss": 1.3464, + "step": 6630 + }, + { + "epoch": 0.3316, + "grad_norm": 5.475132942199707, + "learning_rate": 1.690630005849423e-05, + "loss": 1.1297, + "step": 6632 + }, + { + "epoch": 0.3317, + "grad_norm": 1.9534711837768555, + "learning_rate": 1.6903775164708163e-05, + "loss": 1.1358, + "step": 6634 + }, + { + "epoch": 0.3318, + "grad_norm": 3.016777515411377, + "learning_rate": 1.6901249429718033e-05, + "loss": 1.3157, + "step": 6636 + }, + { + "epoch": 0.3319, + "grad_norm": 3.8256642818450928, + "learning_rate": 1.6898722853831594e-05, + "loss": 0.775, + "step": 6638 + }, + { + "epoch": 0.332, + "grad_norm": 2.7814252376556396, + "learning_rate": 1.68961954373567e-05, + "loss": 0.9574, + "step": 6640 + }, + { + "epoch": 0.3321, + "grad_norm": 2.0945186614990234, + "learning_rate": 1.6893667180601313e-05, + "loss": 1.3105, + "step": 6642 + }, + { + "epoch": 0.3322, + "grad_norm": 4.214285850524902, + "learning_rate": 1.6891138083873486e-05, + "loss": 1.1844, + "step": 6644 + }, + { + "epoch": 0.3323, + "grad_norm": 0.9338229894638062, + "learning_rate": 1.688860814748139e-05, + "loss": 0.5588, + "step": 6646 + }, + { + "epoch": 0.3324, + "grad_norm": 4.25277042388916, + "learning_rate": 1.6886077371733285e-05, + "loss": 0.7909, + "step": 6648 + }, + { + "epoch": 0.3325, + "grad_norm": 1.7801108360290527, + "learning_rate": 1.688354575693754e-05, + "loss": 0.9505, + "step": 6650 + }, + { + "epoch": 0.3326, + "grad_norm": 2.688926935195923, + "learning_rate": 1.688101330340263e-05, + "loss": 0.8684, + "step": 6652 + }, + { + "epoch": 0.3327, + "grad_norm": 6.321337699890137, + "learning_rate": 1.6878480011437113e-05, + "loss": 1.0053, + "step": 6654 + }, + { + "epoch": 0.3328, + "grad_norm": 6.205510139465332, + "learning_rate": 1.6875945881349676e-05, + "loss": 0.8292, + "step": 6656 + }, + { + "epoch": 0.3329, + "grad_norm": 6.209340572357178, + "learning_rate": 1.687341091344909e-05, + "loss": 0.5376, + "step": 6658 + }, + { + "epoch": 0.333, + "grad_norm": 1.9008119106292725, + "learning_rate": 1.6870875108044233e-05, + "loss": 0.6385, + "step": 6660 + }, + { + "epoch": 0.3331, + "grad_norm": 3.893198013305664, + "learning_rate": 1.6868338465444086e-05, + "loss": 0.275, + "step": 6662 + }, + { + "epoch": 0.3332, + "grad_norm": 1.731045126914978, + "learning_rate": 1.686580098595773e-05, + "loss": 0.3359, + "step": 6664 + }, + { + "epoch": 0.3333, + "grad_norm": 3.227175235748291, + "learning_rate": 1.686326266989435e-05, + "loss": 1.0071, + "step": 6666 + }, + { + "epoch": 0.3334, + "grad_norm": 2.068748950958252, + "learning_rate": 1.6860723517563232e-05, + "loss": 0.3656, + "step": 6668 + }, + { + "epoch": 0.3335, + "grad_norm": 2.3926103115081787, + "learning_rate": 1.6858183529273766e-05, + "loss": 0.7832, + "step": 6670 + }, + { + "epoch": 0.3336, + "grad_norm": 5.418905735015869, + "learning_rate": 1.6855642705335438e-05, + "loss": 0.6764, + "step": 6672 + }, + { + "epoch": 0.3337, + "grad_norm": 5.164461612701416, + "learning_rate": 1.685310104605784e-05, + "loss": 0.6573, + "step": 6674 + }, + { + "epoch": 0.3338, + "grad_norm": 4.361355304718018, + "learning_rate": 1.685055855175067e-05, + "loss": 0.6696, + "step": 6676 + }, + { + "epoch": 0.3339, + "grad_norm": 5.315448760986328, + "learning_rate": 1.6848015222723722e-05, + "loss": 1.3037, + "step": 6678 + }, + { + "epoch": 0.334, + "grad_norm": 4.616663455963135, + "learning_rate": 1.684547105928689e-05, + "loss": 1.0622, + "step": 6680 + }, + { + "epoch": 0.3341, + "grad_norm": 4.431815147399902, + "learning_rate": 1.684292606175017e-05, + "loss": 0.9352, + "step": 6682 + }, + { + "epoch": 0.3342, + "grad_norm": 3.1820192337036133, + "learning_rate": 1.684038023042367e-05, + "loss": 1.3288, + "step": 6684 + }, + { + "epoch": 0.3343, + "grad_norm": 8.51241397857666, + "learning_rate": 1.683783356561759e-05, + "loss": 0.7634, + "step": 6686 + }, + { + "epoch": 0.3344, + "grad_norm": 4.003381252288818, + "learning_rate": 1.6835286067642228e-05, + "loss": 1.0377, + "step": 6688 + }, + { + "epoch": 0.3345, + "grad_norm": 0.16505104303359985, + "learning_rate": 1.6832737736807994e-05, + "loss": 0.1661, + "step": 6690 + }, + { + "epoch": 0.3346, + "grad_norm": 8.30132007598877, + "learning_rate": 1.683018857342539e-05, + "loss": 0.7891, + "step": 6692 + }, + { + "epoch": 0.3347, + "grad_norm": 7.620794296264648, + "learning_rate": 1.6827638577805028e-05, + "loss": 0.8899, + "step": 6694 + }, + { + "epoch": 0.3348, + "grad_norm": 6.848331451416016, + "learning_rate": 1.6825087750257617e-05, + "loss": 1.0466, + "step": 6696 + }, + { + "epoch": 0.3349, + "grad_norm": 3.9465928077697754, + "learning_rate": 1.6822536091093967e-05, + "loss": 0.8579, + "step": 6698 + }, + { + "epoch": 0.335, + "grad_norm": 6.183547496795654, + "learning_rate": 1.6819983600624986e-05, + "loss": 0.7901, + "step": 6700 + }, + { + "epoch": 0.3351, + "grad_norm": 3.7975873947143555, + "learning_rate": 1.681743027916169e-05, + "loss": 1.913, + "step": 6702 + }, + { + "epoch": 0.3352, + "grad_norm": 6.598117828369141, + "learning_rate": 1.68148761270152e-05, + "loss": 0.6841, + "step": 6704 + }, + { + "epoch": 0.3353, + "grad_norm": 4.226545810699463, + "learning_rate": 1.6812321144496722e-05, + "loss": 0.7902, + "step": 6706 + }, + { + "epoch": 0.3354, + "grad_norm": 7.517683029174805, + "learning_rate": 1.6809765331917576e-05, + "loss": 1.3041, + "step": 6708 + }, + { + "epoch": 0.3355, + "grad_norm": 2.856471538543701, + "learning_rate": 1.680720868958918e-05, + "loss": 0.9762, + "step": 6710 + }, + { + "epoch": 0.3356, + "grad_norm": 4.432267189025879, + "learning_rate": 1.6804651217823055e-05, + "loss": 1.1307, + "step": 6712 + }, + { + "epoch": 0.3357, + "grad_norm": 5.646059989929199, + "learning_rate": 1.680209291693082e-05, + "loss": 1.2965, + "step": 6714 + }, + { + "epoch": 0.3358, + "grad_norm": 2.4449708461761475, + "learning_rate": 1.6799533787224192e-05, + "loss": 0.5092, + "step": 6716 + }, + { + "epoch": 0.3359, + "grad_norm": 10.906176567077637, + "learning_rate": 1.6796973829015003e-05, + "loss": 0.8388, + "step": 6718 + }, + { + "epoch": 0.336, + "grad_norm": 7.022308826446533, + "learning_rate": 1.6794413042615168e-05, + "loss": 1.3464, + "step": 6720 + }, + { + "epoch": 0.3361, + "grad_norm": 2.500659704208374, + "learning_rate": 1.679185142833671e-05, + "loss": 0.4468, + "step": 6722 + }, + { + "epoch": 0.3362, + "grad_norm": 2.639951705932617, + "learning_rate": 1.6789288986491764e-05, + "loss": 0.7022, + "step": 6724 + }, + { + "epoch": 0.3363, + "grad_norm": 3.80880069732666, + "learning_rate": 1.6786725717392544e-05, + "loss": 0.9453, + "step": 6726 + }, + { + "epoch": 0.3364, + "grad_norm": 13.077839851379395, + "learning_rate": 1.6784161621351384e-05, + "loss": 1.9896, + "step": 6728 + }, + { + "epoch": 0.3365, + "grad_norm": 11.2069730758667, + "learning_rate": 1.6781596698680708e-05, + "loss": 0.6505, + "step": 6730 + }, + { + "epoch": 0.3366, + "grad_norm": 4.956046104431152, + "learning_rate": 1.6779030949693044e-05, + "loss": 1.5481, + "step": 6732 + }, + { + "epoch": 0.3367, + "grad_norm": 6.74932336807251, + "learning_rate": 1.6776464374701026e-05, + "loss": 0.4578, + "step": 6734 + }, + { + "epoch": 0.3368, + "grad_norm": 5.874112129211426, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.9601, + "step": 6736 + }, + { + "epoch": 0.3369, + "grad_norm": 5.519652843475342, + "learning_rate": 1.6771328747954924e-05, + "loss": 0.6553, + "step": 6738 + }, + { + "epoch": 0.337, + "grad_norm": 4.804926872253418, + "learning_rate": 1.6768759696826608e-05, + "loss": 0.9065, + "step": 6740 + }, + { + "epoch": 0.3371, + "grad_norm": 10.638840675354004, + "learning_rate": 1.6766189820945456e-05, + "loss": 1.3491, + "step": 6742 + }, + { + "epoch": 0.3372, + "grad_norm": 6.353051662445068, + "learning_rate": 1.6763619120624595e-05, + "loss": 0.9697, + "step": 6744 + }, + { + "epoch": 0.3373, + "grad_norm": 0.48637905716896057, + "learning_rate": 1.676104759617726e-05, + "loss": 0.6181, + "step": 6746 + }, + { + "epoch": 0.3374, + "grad_norm": 3.551626205444336, + "learning_rate": 1.6758475247916786e-05, + "loss": 0.9983, + "step": 6748 + }, + { + "epoch": 0.3375, + "grad_norm": 4.884723663330078, + "learning_rate": 1.6755902076156606e-05, + "loss": 0.7825, + "step": 6750 + }, + { + "epoch": 0.3376, + "grad_norm": 3.1796271800994873, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.3597, + "step": 6752 + }, + { + "epoch": 0.3377, + "grad_norm": 3.5238637924194336, + "learning_rate": 1.6750753263391346e-05, + "loss": 0.627, + "step": 6754 + }, + { + "epoch": 0.3378, + "grad_norm": 35.097537994384766, + "learning_rate": 1.6748177623013638e-05, + "loss": 1.4668, + "step": 6756 + }, + { + "epoch": 0.3379, + "grad_norm": 2.9921224117279053, + "learning_rate": 1.6745601160390958e-05, + "loss": 0.731, + "step": 6758 + }, + { + "epoch": 0.338, + "grad_norm": 8.81347370147705, + "learning_rate": 1.6743023875837233e-05, + "loss": 0.6617, + "step": 6760 + }, + { + "epoch": 0.3381, + "grad_norm": 4.870397567749023, + "learning_rate": 1.674044576966651e-05, + "loss": 1.2084, + "step": 6762 + }, + { + "epoch": 0.3382, + "grad_norm": 2.264230966567993, + "learning_rate": 1.6737866842192908e-05, + "loss": 1.0831, + "step": 6764 + }, + { + "epoch": 0.3383, + "grad_norm": 4.525218486785889, + "learning_rate": 1.6735287093730677e-05, + "loss": 1.3965, + "step": 6766 + }, + { + "epoch": 0.3384, + "grad_norm": 4.267285346984863, + "learning_rate": 1.6732706524594138e-05, + "loss": 1.2154, + "step": 6768 + }, + { + "epoch": 0.3385, + "grad_norm": 1.6111267805099487, + "learning_rate": 1.6730125135097736e-05, + "loss": 1.2395, + "step": 6770 + }, + { + "epoch": 0.3386, + "grad_norm": 1.8096888065338135, + "learning_rate": 1.6727542925556e-05, + "loss": 0.921, + "step": 6772 + }, + { + "epoch": 0.3387, + "grad_norm": 2.8085145950317383, + "learning_rate": 1.672495989628356e-05, + "loss": 0.8767, + "step": 6774 + }, + { + "epoch": 0.3388, + "grad_norm": 3.756871461868286, + "learning_rate": 1.6722376047595163e-05, + "loss": 0.9365, + "step": 6776 + }, + { + "epoch": 0.3389, + "grad_norm": 8.014228820800781, + "learning_rate": 1.671979137980563e-05, + "loss": 0.8052, + "step": 6778 + }, + { + "epoch": 0.339, + "grad_norm": 4.146726608276367, + "learning_rate": 1.6717205893229904e-05, + "loss": 1.7043, + "step": 6780 + }, + { + "epoch": 0.3391, + "grad_norm": 5.454761981964111, + "learning_rate": 1.6714619588183015e-05, + "loss": 1.3302, + "step": 6782 + }, + { + "epoch": 0.3392, + "grad_norm": 3.4273557662963867, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.8391, + "step": 6784 + }, + { + "epoch": 0.3393, + "grad_norm": 3.0662150382995605, + "learning_rate": 1.670944452393638e-05, + "loss": 0.6381, + "step": 6786 + }, + { + "epoch": 0.3394, + "grad_norm": 1.9548391103744507, + "learning_rate": 1.6706855765367202e-05, + "loss": 1.3516, + "step": 6788 + }, + { + "epoch": 0.3395, + "grad_norm": 4.182286739349365, + "learning_rate": 1.6704266189587992e-05, + "loss": 0.5955, + "step": 6790 + }, + { + "epoch": 0.3396, + "grad_norm": 0.3888656795024872, + "learning_rate": 1.6701675796914284e-05, + "loss": 0.1751, + "step": 6792 + }, + { + "epoch": 0.3397, + "grad_norm": 3.2993364334106445, + "learning_rate": 1.6699084587661708e-05, + "loss": 0.9253, + "step": 6794 + }, + { + "epoch": 0.3398, + "grad_norm": 5.093191623687744, + "learning_rate": 1.6696492562145996e-05, + "loss": 1.6558, + "step": 6796 + }, + { + "epoch": 0.3399, + "grad_norm": 2.711188793182373, + "learning_rate": 1.6693899720682977e-05, + "loss": 0.5459, + "step": 6798 + }, + { + "epoch": 0.34, + "grad_norm": 3.7448787689208984, + "learning_rate": 1.6691306063588583e-05, + "loss": 1.2782, + "step": 6800 + }, + { + "epoch": 0.3401, + "grad_norm": 6.8757171630859375, + "learning_rate": 1.668871159117884e-05, + "loss": 0.982, + "step": 6802 + }, + { + "epoch": 0.3402, + "grad_norm": 3.3351621627807617, + "learning_rate": 1.6686116303769884e-05, + "loss": 1.0857, + "step": 6804 + }, + { + "epoch": 0.3403, + "grad_norm": 1.8503789901733398, + "learning_rate": 1.6683520201677933e-05, + "loss": 1.1555, + "step": 6806 + }, + { + "epoch": 0.3404, + "grad_norm": 4.234323024749756, + "learning_rate": 1.668092328521932e-05, + "loss": 0.8268, + "step": 6808 + }, + { + "epoch": 0.3405, + "grad_norm": 4.6406941413879395, + "learning_rate": 1.6678325554710467e-05, + "loss": 0.7673, + "step": 6810 + }, + { + "epoch": 0.3406, + "grad_norm": 5.943925857543945, + "learning_rate": 1.667572701046791e-05, + "loss": 0.4637, + "step": 6812 + }, + { + "epoch": 0.3407, + "grad_norm": 2.2114670276641846, + "learning_rate": 1.6673127652808257e-05, + "loss": 0.796, + "step": 6814 + }, + { + "epoch": 0.3408, + "grad_norm": 4.905554294586182, + "learning_rate": 1.6670527482048246e-05, + "loss": 1.292, + "step": 6816 + }, + { + "epoch": 0.3409, + "grad_norm": 6.542201042175293, + "learning_rate": 1.6667926498504695e-05, + "loss": 1.1276, + "step": 6818 + }, + { + "epoch": 0.341, + "grad_norm": 4.4347100257873535, + "learning_rate": 1.6665324702494524e-05, + "loss": 1.5542, + "step": 6820 + }, + { + "epoch": 0.3411, + "grad_norm": 5.710458755493164, + "learning_rate": 1.666272209433476e-05, + "loss": 0.6651, + "step": 6822 + }, + { + "epoch": 0.3412, + "grad_norm": 3.82248854637146, + "learning_rate": 1.666011867434252e-05, + "loss": 0.1765, + "step": 6824 + }, + { + "epoch": 0.3413, + "grad_norm": 5.595436096191406, + "learning_rate": 1.6657514442835014e-05, + "loss": 0.9445, + "step": 6826 + }, + { + "epoch": 0.3414, + "grad_norm": 4.431736946105957, + "learning_rate": 1.6654909400129575e-05, + "loss": 1.0951, + "step": 6828 + }, + { + "epoch": 0.3415, + "grad_norm": 6.698805809020996, + "learning_rate": 1.665230354654361e-05, + "loss": 0.5969, + "step": 6830 + }, + { + "epoch": 0.3416, + "grad_norm": 18.391904830932617, + "learning_rate": 1.6649696882394635e-05, + "loss": 1.5785, + "step": 6832 + }, + { + "epoch": 0.3417, + "grad_norm": 2.957167863845825, + "learning_rate": 1.664708940800027e-05, + "loss": 0.7034, + "step": 6834 + }, + { + "epoch": 0.3418, + "grad_norm": 2.5664491653442383, + "learning_rate": 1.664448112367822e-05, + "loss": 0.6807, + "step": 6836 + }, + { + "epoch": 0.3419, + "grad_norm": 3.076568603515625, + "learning_rate": 1.6641872029746297e-05, + "loss": 0.8139, + "step": 6838 + }, + { + "epoch": 0.342, + "grad_norm": 4.633054733276367, + "learning_rate": 1.6639262126522417e-05, + "loss": 0.7748, + "step": 6840 + }, + { + "epoch": 0.3421, + "grad_norm": 1.6626815795898438, + "learning_rate": 1.6636651414324586e-05, + "loss": 0.537, + "step": 6842 + }, + { + "epoch": 0.3422, + "grad_norm": 5.630598545074463, + "learning_rate": 1.6634039893470912e-05, + "loss": 0.9178, + "step": 6844 + }, + { + "epoch": 0.3423, + "grad_norm": 3.0680220127105713, + "learning_rate": 1.6631427564279602e-05, + "loss": 0.8299, + "step": 6846 + }, + { + "epoch": 0.3424, + "grad_norm": 13.98422622680664, + "learning_rate": 1.6628814427068954e-05, + "loss": 1.4505, + "step": 6848 + }, + { + "epoch": 0.3425, + "grad_norm": 1.5644530057907104, + "learning_rate": 1.6626200482157378e-05, + "loss": 1.0002, + "step": 6850 + }, + { + "epoch": 0.3426, + "grad_norm": 4.397522449493408, + "learning_rate": 1.662358572986337e-05, + "loss": 1.3749, + "step": 6852 + }, + { + "epoch": 0.3427, + "grad_norm": 3.357121706008911, + "learning_rate": 1.6620970170505534e-05, + "loss": 0.9643, + "step": 6854 + }, + { + "epoch": 0.3428, + "grad_norm": 8.824518203735352, + "learning_rate": 1.6618353804402567e-05, + "loss": 1.1366, + "step": 6856 + }, + { + "epoch": 0.3429, + "grad_norm": 5.4767303466796875, + "learning_rate": 1.6615736631873263e-05, + "loss": 0.5017, + "step": 6858 + }, + { + "epoch": 0.343, + "grad_norm": 3.5640616416931152, + "learning_rate": 1.661311865323652e-05, + "loss": 0.2187, + "step": 6860 + }, + { + "epoch": 0.3431, + "grad_norm": 5.071417808532715, + "learning_rate": 1.6610499868811327e-05, + "loss": 0.7362, + "step": 6862 + }, + { + "epoch": 0.3432, + "grad_norm": 26.249488830566406, + "learning_rate": 1.6607880278916778e-05, + "loss": 1.5468, + "step": 6864 + }, + { + "epoch": 0.3433, + "grad_norm": 3.344099998474121, + "learning_rate": 1.6605259883872063e-05, + "loss": 1.0655, + "step": 6866 + }, + { + "epoch": 0.3434, + "grad_norm": 3.252223491668701, + "learning_rate": 1.6602638683996462e-05, + "loss": 1.0426, + "step": 6868 + }, + { + "epoch": 0.3435, + "grad_norm": 3.0424818992614746, + "learning_rate": 1.660001667960937e-05, + "loss": 1.1447, + "step": 6870 + }, + { + "epoch": 0.3436, + "grad_norm": 20.368797302246094, + "learning_rate": 1.6597393871030264e-05, + "loss": 0.8026, + "step": 6872 + }, + { + "epoch": 0.3437, + "grad_norm": 2.538060188293457, + "learning_rate": 1.6594770258578722e-05, + "loss": 0.1193, + "step": 6874 + }, + { + "epoch": 0.3438, + "grad_norm": 2.980478048324585, + "learning_rate": 1.6592145842574433e-05, + "loss": 1.2918, + "step": 6876 + }, + { + "epoch": 0.3439, + "grad_norm": 1.037065029144287, + "learning_rate": 1.6589520623337173e-05, + "loss": 0.0414, + "step": 6878 + }, + { + "epoch": 0.344, + "grad_norm": 3.6368160247802734, + "learning_rate": 1.6586894601186804e-05, + "loss": 1.4137, + "step": 6880 + }, + { + "epoch": 0.3441, + "grad_norm": 4.264307498931885, + "learning_rate": 1.6584267776443318e-05, + "loss": 1.2237, + "step": 6882 + }, + { + "epoch": 0.3442, + "grad_norm": 3.381162643432617, + "learning_rate": 1.6581640149426766e-05, + "loss": 1.3317, + "step": 6884 + }, + { + "epoch": 0.3443, + "grad_norm": 3.000434160232544, + "learning_rate": 1.6579011720457333e-05, + "loss": 1.2041, + "step": 6886 + }, + { + "epoch": 0.3444, + "grad_norm": 3.2736263275146484, + "learning_rate": 1.6576382489855274e-05, + "loss": 0.8624, + "step": 6888 + }, + { + "epoch": 0.3445, + "grad_norm": 2.303568124771118, + "learning_rate": 1.657375245794096e-05, + "loss": 1.2756, + "step": 6890 + }, + { + "epoch": 0.3446, + "grad_norm": 5.448188304901123, + "learning_rate": 1.6571121625034847e-05, + "loss": 0.9552, + "step": 6892 + }, + { + "epoch": 0.3447, + "grad_norm": 8.140000343322754, + "learning_rate": 1.6568489991457498e-05, + "loss": 0.6443, + "step": 6894 + }, + { + "epoch": 0.3448, + "grad_norm": 7.185689449310303, + "learning_rate": 1.6565857557529567e-05, + "loss": 0.503, + "step": 6896 + }, + { + "epoch": 0.3449, + "grad_norm": 2.756343126296997, + "learning_rate": 1.6563224323571807e-05, + "loss": 0.8243, + "step": 6898 + }, + { + "epoch": 0.345, + "grad_norm": 6.041160583496094, + "learning_rate": 1.6560590289905074e-05, + "loss": 1.0355, + "step": 6900 + }, + { + "epoch": 0.3451, + "grad_norm": 8.860857009887695, + "learning_rate": 1.6557955456850313e-05, + "loss": 0.834, + "step": 6902 + }, + { + "epoch": 0.3452, + "grad_norm": 2.597093105316162, + "learning_rate": 1.6555319824728577e-05, + "loss": 1.4956, + "step": 6904 + }, + { + "epoch": 0.3453, + "grad_norm": 4.386066913604736, + "learning_rate": 1.6552683393860998e-05, + "loss": 1.1288, + "step": 6906 + }, + { + "epoch": 0.3454, + "grad_norm": 4.9621686935424805, + "learning_rate": 1.6550046164568827e-05, + "loss": 0.7474, + "step": 6908 + }, + { + "epoch": 0.3455, + "grad_norm": 5.073673248291016, + "learning_rate": 1.6547408137173396e-05, + "loss": 1.0983, + "step": 6910 + }, + { + "epoch": 0.3456, + "grad_norm": 6.614372730255127, + "learning_rate": 1.654476931199615e-05, + "loss": 0.9108, + "step": 6912 + }, + { + "epoch": 0.3457, + "grad_norm": 6.297581672668457, + "learning_rate": 1.6542129689358613e-05, + "loss": 1.5874, + "step": 6914 + }, + { + "epoch": 0.3458, + "grad_norm": 10.82795524597168, + "learning_rate": 1.6539489269582414e-05, + "loss": 0.833, + "step": 6916 + }, + { + "epoch": 0.3459, + "grad_norm": 4.946888446807861, + "learning_rate": 1.6536848052989292e-05, + "loss": 0.9259, + "step": 6918 + }, + { + "epoch": 0.346, + "grad_norm": 2.3536829948425293, + "learning_rate": 1.6534206039901057e-05, + "loss": 0.5621, + "step": 6920 + }, + { + "epoch": 0.3461, + "grad_norm": 4.458491802215576, + "learning_rate": 1.653156323063964e-05, + "loss": 1.1463, + "step": 6922 + }, + { + "epoch": 0.3462, + "grad_norm": 5.734611511230469, + "learning_rate": 1.652891962552705e-05, + "loss": 0.6931, + "step": 6924 + }, + { + "epoch": 0.3463, + "grad_norm": 3.605623245239258, + "learning_rate": 1.652627522488541e-05, + "loss": 1.0121, + "step": 6926 + }, + { + "epoch": 0.3464, + "grad_norm": 8.229528427124023, + "learning_rate": 1.652363002903693e-05, + "loss": 1.0306, + "step": 6928 + }, + { + "epoch": 0.3465, + "grad_norm": 3.32973575592041, + "learning_rate": 1.6520984038303924e-05, + "loss": 0.872, + "step": 6930 + }, + { + "epoch": 0.3466, + "grad_norm": 7.0349040031433105, + "learning_rate": 1.651833725300879e-05, + "loss": 1.1486, + "step": 6932 + }, + { + "epoch": 0.3467, + "grad_norm": 6.717793941497803, + "learning_rate": 1.6515689673474035e-05, + "loss": 0.9329, + "step": 6934 + }, + { + "epoch": 0.3468, + "grad_norm": 7.459303379058838, + "learning_rate": 1.6513041300022253e-05, + "loss": 0.7192, + "step": 6936 + }, + { + "epoch": 0.3469, + "grad_norm": 5.7969512939453125, + "learning_rate": 1.6510392132976148e-05, + "loss": 0.7102, + "step": 6938 + }, + { + "epoch": 0.347, + "grad_norm": 10.000517845153809, + "learning_rate": 1.650774217265851e-05, + "loss": 1.6156, + "step": 6940 + }, + { + "epoch": 0.3471, + "grad_norm": 4.600161075592041, + "learning_rate": 1.650509141939223e-05, + "loss": 0.6776, + "step": 6942 + }, + { + "epoch": 0.3472, + "grad_norm": 4.718580722808838, + "learning_rate": 1.650243987350029e-05, + "loss": 0.9868, + "step": 6944 + }, + { + "epoch": 0.3473, + "grad_norm": 9.37221908569336, + "learning_rate": 1.6499787535305777e-05, + "loss": 0.9958, + "step": 6946 + }, + { + "epoch": 0.3474, + "grad_norm": 3.9572091102600098, + "learning_rate": 1.649713440513187e-05, + "loss": 1.0894, + "step": 6948 + }, + { + "epoch": 0.3475, + "grad_norm": 7.113871097564697, + "learning_rate": 1.6494480483301836e-05, + "loss": 1.1388, + "step": 6950 + }, + { + "epoch": 0.3476, + "grad_norm": 7.382544994354248, + "learning_rate": 1.649182577013906e-05, + "loss": 1.578, + "step": 6952 + }, + { + "epoch": 0.3477, + "grad_norm": 4.479026794433594, + "learning_rate": 1.648917026596701e-05, + "loss": 0.9734, + "step": 6954 + }, + { + "epoch": 0.3478, + "grad_norm": 3.9573729038238525, + "learning_rate": 1.6486513971109245e-05, + "loss": 0.8948, + "step": 6956 + }, + { + "epoch": 0.3479, + "grad_norm": 4.377284526824951, + "learning_rate": 1.648385688588942e-05, + "loss": 0.6173, + "step": 6958 + }, + { + "epoch": 0.348, + "grad_norm": 10.419088363647461, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.8345, + "step": 6960 + }, + { + "epoch": 0.3481, + "grad_norm": 16.229862213134766, + "learning_rate": 1.6478540345658758e-05, + "loss": 0.5437, + "step": 6962 + }, + { + "epoch": 0.3482, + "grad_norm": 4.687533378601074, + "learning_rate": 1.6475880891295716e-05, + "loss": 0.7347, + "step": 6964 + }, + { + "epoch": 0.3483, + "grad_norm": 2.428267478942871, + "learning_rate": 1.647322064786623e-05, + "loss": 0.6243, + "step": 6966 + }, + { + "epoch": 0.3484, + "grad_norm": 4.347933292388916, + "learning_rate": 1.6470559615694445e-05, + "loss": 1.9372, + "step": 6968 + }, + { + "epoch": 0.3485, + "grad_norm": 3.4577863216400146, + "learning_rate": 1.64678977951046e-05, + "loss": 0.585, + "step": 6970 + }, + { + "epoch": 0.3486, + "grad_norm": 5.427358150482178, + "learning_rate": 1.6465235186421024e-05, + "loss": 1.147, + "step": 6972 + }, + { + "epoch": 0.3487, + "grad_norm": 4.688440322875977, + "learning_rate": 1.6462571789968153e-05, + "loss": 0.9837, + "step": 6974 + }, + { + "epoch": 0.3488, + "grad_norm": 8.750187873840332, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.9719, + "step": 6976 + }, + { + "epoch": 0.3489, + "grad_norm": 3.2722978591918945, + "learning_rate": 1.6457242635052724e-05, + "loss": 0.7408, + "step": 6978 + }, + { + "epoch": 0.349, + "grad_norm": 12.000323295593262, + "learning_rate": 1.645457687723951e-05, + "loss": 1.2936, + "step": 6980 + }, + { + "epoch": 0.3491, + "grad_norm": 5.920811176300049, + "learning_rate": 1.645191033295568e-05, + "loss": 1.1249, + "step": 6982 + }, + { + "epoch": 0.3492, + "grad_norm": 7.330420970916748, + "learning_rate": 1.6449243002526146e-05, + "loss": 0.802, + "step": 6984 + }, + { + "epoch": 0.3493, + "grad_norm": 6.360883712768555, + "learning_rate": 1.6446574886275914e-05, + "loss": 0.4399, + "step": 6986 + }, + { + "epoch": 0.3494, + "grad_norm": 4.549631595611572, + "learning_rate": 1.6443905984530092e-05, + "loss": 1.0628, + "step": 6988 + }, + { + "epoch": 0.3495, + "grad_norm": 6.403287887573242, + "learning_rate": 1.644123629761387e-05, + "loss": 0.8137, + "step": 6990 + }, + { + "epoch": 0.3496, + "grad_norm": 3.482320785522461, + "learning_rate": 1.643856582585254e-05, + "loss": 1.4593, + "step": 6992 + }, + { + "epoch": 0.3497, + "grad_norm": 5.746005058288574, + "learning_rate": 1.6435894569571496e-05, + "loss": 1.7549, + "step": 6994 + }, + { + "epoch": 0.3498, + "grad_norm": 3.722102642059326, + "learning_rate": 1.643322252909622e-05, + "loss": 0.7253, + "step": 6996 + }, + { + "epoch": 0.3499, + "grad_norm": 6.917428016662598, + "learning_rate": 1.6430549704752295e-05, + "loss": 1.3758, + "step": 6998 + }, + { + "epoch": 0.35, + "grad_norm": 10.112482070922852, + "learning_rate": 1.6427876096865394e-05, + "loss": 1.2487, + "step": 7000 + }, + { + "epoch": 0.3501, + "grad_norm": 2.640888214111328, + "learning_rate": 1.6425201705761288e-05, + "loss": 0.9005, + "step": 7002 + }, + { + "epoch": 0.3502, + "grad_norm": 6.9115729331970215, + "learning_rate": 1.6422526531765846e-05, + "loss": 0.2454, + "step": 7004 + }, + { + "epoch": 0.3503, + "grad_norm": 17.123441696166992, + "learning_rate": 1.6419850575205026e-05, + "loss": 0.9693, + "step": 7006 + }, + { + "epoch": 0.3504, + "grad_norm": 3.8103244304656982, + "learning_rate": 1.6417173836404888e-05, + "loss": 1.1187, + "step": 7008 + }, + { + "epoch": 0.3505, + "grad_norm": 2.336451292037964, + "learning_rate": 1.641449631569158e-05, + "loss": 0.5476, + "step": 7010 + }, + { + "epoch": 0.3506, + "grad_norm": 6.575082302093506, + "learning_rate": 1.6411818013391357e-05, + "loss": 1.4408, + "step": 7012 + }, + { + "epoch": 0.3507, + "grad_norm": 3.2378077507019043, + "learning_rate": 1.6409138929830556e-05, + "loss": 1.2484, + "step": 7014 + }, + { + "epoch": 0.3508, + "grad_norm": 7.628367900848389, + "learning_rate": 1.6406459065335616e-05, + "loss": 0.8798, + "step": 7016 + }, + { + "epoch": 0.3509, + "grad_norm": 4.437039852142334, + "learning_rate": 1.6403778420233073e-05, + "loss": 0.7313, + "step": 7018 + }, + { + "epoch": 0.351, + "grad_norm": 1.7260408401489258, + "learning_rate": 1.6401096994849558e-05, + "loss": 0.4817, + "step": 7020 + }, + { + "epoch": 0.3511, + "grad_norm": 3.1031835079193115, + "learning_rate": 1.6398414789511784e-05, + "loss": 1.0961, + "step": 7022 + }, + { + "epoch": 0.3512, + "grad_norm": 12.148820877075195, + "learning_rate": 1.6395731804546582e-05, + "loss": 1.0897, + "step": 7024 + }, + { + "epoch": 0.3513, + "grad_norm": 9.668585777282715, + "learning_rate": 1.6393048040280857e-05, + "loss": 1.3597, + "step": 7026 + }, + { + "epoch": 0.3514, + "grad_norm": 4.566122531890869, + "learning_rate": 1.639036349704162e-05, + "loss": 0.6806, + "step": 7028 + }, + { + "epoch": 0.3515, + "grad_norm": 2.331366777420044, + "learning_rate": 1.638767817515598e-05, + "loss": 1.6797, + "step": 7030 + }, + { + "epoch": 0.3516, + "grad_norm": 0.5537089109420776, + "learning_rate": 1.6384992074951124e-05, + "loss": 0.0913, + "step": 7032 + }, + { + "epoch": 0.3517, + "grad_norm": 7.041565418243408, + "learning_rate": 1.6382305196754357e-05, + "loss": 1.5649, + "step": 7034 + }, + { + "epoch": 0.3518, + "grad_norm": 6.132613658905029, + "learning_rate": 1.6379617540893056e-05, + "loss": 0.403, + "step": 7036 + }, + { + "epoch": 0.3519, + "grad_norm": 5.235183238983154, + "learning_rate": 1.637692910769471e-05, + "loss": 1.2674, + "step": 7038 + }, + { + "epoch": 0.352, + "grad_norm": 15.608098030090332, + "learning_rate": 1.63742398974869e-05, + "loss": 2.2082, + "step": 7040 + }, + { + "epoch": 0.3521, + "grad_norm": 2.783031702041626, + "learning_rate": 1.637154991059729e-05, + "loss": 1.7958, + "step": 7042 + }, + { + "epoch": 0.3522, + "grad_norm": 7.075440406799316, + "learning_rate": 1.636885914735365e-05, + "loss": 0.7296, + "step": 7044 + }, + { + "epoch": 0.3523, + "grad_norm": 2.8500776290893555, + "learning_rate": 1.6366167608083844e-05, + "loss": 0.6696, + "step": 7046 + }, + { + "epoch": 0.3524, + "grad_norm": 3.0908122062683105, + "learning_rate": 1.6363475293115824e-05, + "loss": 1.6119, + "step": 7048 + }, + { + "epoch": 0.3525, + "grad_norm": 2.740650177001953, + "learning_rate": 1.636078220277764e-05, + "loss": 0.7351, + "step": 7050 + }, + { + "epoch": 0.3526, + "grad_norm": 34.31309509277344, + "learning_rate": 1.6358088337397444e-05, + "loss": 1.0078, + "step": 7052 + }, + { + "epoch": 0.3527, + "grad_norm": 10.328892707824707, + "learning_rate": 1.6355393697303465e-05, + "loss": 1.4589, + "step": 7054 + }, + { + "epoch": 0.3528, + "grad_norm": 2.3138535022735596, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.6188, + "step": 7056 + }, + { + "epoch": 0.3529, + "grad_norm": 3.498004674911499, + "learning_rate": 1.6350002094287608e-05, + "loss": 1.0602, + "step": 7058 + }, + { + "epoch": 0.353, + "grad_norm": 3.8142309188842773, + "learning_rate": 1.6347305132022677e-05, + "loss": 1.3127, + "step": 7060 + }, + { + "epoch": 0.3531, + "grad_norm": 4.822014331817627, + "learning_rate": 1.634460739635787e-05, + "loss": 0.7418, + "step": 7062 + }, + { + "epoch": 0.3532, + "grad_norm": 4.449622631072998, + "learning_rate": 1.6341908887621894e-05, + "loss": 0.5353, + "step": 7064 + }, + { + "epoch": 0.3533, + "grad_norm": 4.665622234344482, + "learning_rate": 1.6339209606143566e-05, + "loss": 0.5334, + "step": 7066 + }, + { + "epoch": 0.3534, + "grad_norm": 5.477107048034668, + "learning_rate": 1.6336509552251766e-05, + "loss": 1.6742, + "step": 7068 + }, + { + "epoch": 0.3535, + "grad_norm": 2.3376471996307373, + "learning_rate": 1.6333808726275503e-05, + "loss": 0.7812, + "step": 7070 + }, + { + "epoch": 0.3536, + "grad_norm": 4.350001335144043, + "learning_rate": 1.6331107128543856e-05, + "loss": 0.8144, + "step": 7072 + }, + { + "epoch": 0.3537, + "grad_norm": 4.837708473205566, + "learning_rate": 1.6328404759386015e-05, + "loss": 0.9351, + "step": 7074 + }, + { + "epoch": 0.3538, + "grad_norm": 3.094282627105713, + "learning_rate": 1.6325701619131246e-05, + "loss": 1.2276, + "step": 7076 + }, + { + "epoch": 0.3539, + "grad_norm": 9.935402870178223, + "learning_rate": 1.6322997708108923e-05, + "loss": 1.1523, + "step": 7078 + }, + { + "epoch": 0.354, + "grad_norm": 1.9671307802200317, + "learning_rate": 1.632029302664851e-05, + "loss": 0.7961, + "step": 7080 + }, + { + "epoch": 0.3541, + "grad_norm": 17.476642608642578, + "learning_rate": 1.6317587575079564e-05, + "loss": 1.2283, + "step": 7082 + }, + { + "epoch": 0.3542, + "grad_norm": 2.114968776702881, + "learning_rate": 1.6314881353731733e-05, + "loss": 0.7001, + "step": 7084 + }, + { + "epoch": 0.3543, + "grad_norm": 5.045036315917969, + "learning_rate": 1.6312174362934765e-05, + "loss": 0.6536, + "step": 7086 + }, + { + "epoch": 0.3544, + "grad_norm": 6.310004711151123, + "learning_rate": 1.6309466603018497e-05, + "loss": 1.6045, + "step": 7088 + }, + { + "epoch": 0.3545, + "grad_norm": 15.53985595703125, + "learning_rate": 1.6306758074312866e-05, + "loss": 0.9753, + "step": 7090 + }, + { + "epoch": 0.3546, + "grad_norm": 3.9945693016052246, + "learning_rate": 1.630404877714789e-05, + "loss": 0.8889, + "step": 7092 + }, + { + "epoch": 0.3547, + "grad_norm": 1.694757103919983, + "learning_rate": 1.6301338711853695e-05, + "loss": 0.9073, + "step": 7094 + }, + { + "epoch": 0.3548, + "grad_norm": 7.463717460632324, + "learning_rate": 1.6298627878760488e-05, + "loss": 0.8851, + "step": 7096 + }, + { + "epoch": 0.3549, + "grad_norm": 6.031839370727539, + "learning_rate": 1.6295916278198584e-05, + "loss": 0.8305, + "step": 7098 + }, + { + "epoch": 0.355, + "grad_norm": 8.96522045135498, + "learning_rate": 1.6293203910498375e-05, + "loss": 1.494, + "step": 7100 + }, + { + "epoch": 0.3551, + "grad_norm": 8.520224571228027, + "learning_rate": 1.629049077599036e-05, + "loss": 1.1378, + "step": 7102 + }, + { + "epoch": 0.3552, + "grad_norm": 7.788579940795898, + "learning_rate": 1.628777687500513e-05, + "loss": 1.0194, + "step": 7104 + }, + { + "epoch": 0.3553, + "grad_norm": 3.093883991241455, + "learning_rate": 1.6285062207873354e-05, + "loss": 0.965, + "step": 7106 + }, + { + "epoch": 0.3554, + "grad_norm": 3.697948455810547, + "learning_rate": 1.6282346774925816e-05, + "loss": 1.0425, + "step": 7108 + }, + { + "epoch": 0.3555, + "grad_norm": 2.8607711791992188, + "learning_rate": 1.6279630576493383e-05, + "loss": 1.376, + "step": 7110 + }, + { + "epoch": 0.3556, + "grad_norm": 7.253470420837402, + "learning_rate": 1.6276913612907005e-05, + "loss": 1.1373, + "step": 7112 + }, + { + "epoch": 0.3557, + "grad_norm": 2.2996599674224854, + "learning_rate": 1.627419588449775e-05, + "loss": 1.1988, + "step": 7114 + }, + { + "epoch": 0.3558, + "grad_norm": 5.220028877258301, + "learning_rate": 1.6271477391596754e-05, + "loss": 1.0955, + "step": 7116 + }, + { + "epoch": 0.3559, + "grad_norm": 3.407620429992676, + "learning_rate": 1.626875813453526e-05, + "loss": 1.1032, + "step": 7118 + }, + { + "epoch": 0.356, + "grad_norm": 0.1304921954870224, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.1874, + "step": 7120 + }, + { + "epoch": 0.3561, + "grad_norm": 4.362794876098633, + "learning_rate": 1.6263317329256215e-05, + "loss": 1.631, + "step": 7122 + }, + { + "epoch": 0.3562, + "grad_norm": 7.461014747619629, + "learning_rate": 1.6260595781701605e-05, + "loss": 1.2146, + "step": 7124 + }, + { + "epoch": 0.3563, + "grad_norm": 4.7737555503845215, + "learning_rate": 1.625787347131239e-05, + "loss": 0.4927, + "step": 7126 + }, + { + "epoch": 0.3564, + "grad_norm": 4.305506229400635, + "learning_rate": 1.6255150398420273e-05, + "loss": 0.3915, + "step": 7128 + }, + { + "epoch": 0.3565, + "grad_norm": 4.148859024047852, + "learning_rate": 1.6252426563357054e-05, + "loss": 1.0849, + "step": 7130 + }, + { + "epoch": 0.3566, + "grad_norm": 6.351440906524658, + "learning_rate": 1.6249701966454626e-05, + "loss": 1.2512, + "step": 7132 + }, + { + "epoch": 0.3567, + "grad_norm": 4.01349401473999, + "learning_rate": 1.624697660804497e-05, + "loss": 0.81, + "step": 7134 + }, + { + "epoch": 0.3568, + "grad_norm": 3.386058807373047, + "learning_rate": 1.624425048846016e-05, + "loss": 1.3252, + "step": 7136 + }, + { + "epoch": 0.3569, + "grad_norm": 5.992469310760498, + "learning_rate": 1.6241523608032372e-05, + "loss": 1.7365, + "step": 7138 + }, + { + "epoch": 0.357, + "grad_norm": 4.069461822509766, + "learning_rate": 1.6238795967093865e-05, + "loss": 0.7834, + "step": 7140 + }, + { + "epoch": 0.3571, + "grad_norm": 0.5112014412879944, + "learning_rate": 1.6236067565976992e-05, + "loss": 0.9564, + "step": 7142 + }, + { + "epoch": 0.3572, + "grad_norm": 0.1277620494365692, + "learning_rate": 1.6233338405014204e-05, + "loss": 0.806, + "step": 7144 + }, + { + "epoch": 0.3573, + "grad_norm": 5.8834967613220215, + "learning_rate": 1.6230608484538034e-05, + "loss": 1.0853, + "step": 7146 + }, + { + "epoch": 0.3574, + "grad_norm": 6.396674156188965, + "learning_rate": 1.6227877804881126e-05, + "loss": 0.7714, + "step": 7148 + }, + { + "epoch": 0.3575, + "grad_norm": 3.1135199069976807, + "learning_rate": 1.6225146366376198e-05, + "loss": 0.8729, + "step": 7150 + }, + { + "epoch": 0.3576, + "grad_norm": 2.561298370361328, + "learning_rate": 1.6222414169356066e-05, + "loss": 0.7922, + "step": 7152 + }, + { + "epoch": 0.3577, + "grad_norm": 3.122799873352051, + "learning_rate": 1.621968121415364e-05, + "loss": 0.8173, + "step": 7154 + }, + { + "epoch": 0.3578, + "grad_norm": 3.088099718093872, + "learning_rate": 1.621694750110193e-05, + "loss": 1.7037, + "step": 7156 + }, + { + "epoch": 0.3579, + "grad_norm": 3.9104230403900146, + "learning_rate": 1.621421303053402e-05, + "loss": 0.6041, + "step": 7158 + }, + { + "epoch": 0.358, + "grad_norm": 2.9602301120758057, + "learning_rate": 1.6211477802783105e-05, + "loss": 1.0897, + "step": 7160 + }, + { + "epoch": 0.3581, + "grad_norm": 6.63963508605957, + "learning_rate": 1.620874181818246e-05, + "loss": 1.0138, + "step": 7162 + }, + { + "epoch": 0.3582, + "grad_norm": 7.595367431640625, + "learning_rate": 1.6206005077065457e-05, + "loss": 1.4531, + "step": 7164 + }, + { + "epoch": 0.3583, + "grad_norm": 6.888699054718018, + "learning_rate": 1.6203267579765563e-05, + "loss": 1.2474, + "step": 7166 + }, + { + "epoch": 0.3584, + "grad_norm": 14.941417694091797, + "learning_rate": 1.620052932661633e-05, + "loss": 2.0905, + "step": 7168 + }, + { + "epoch": 0.3585, + "grad_norm": 2.3448615074157715, + "learning_rate": 1.6197790317951403e-05, + "loss": 0.5419, + "step": 7170 + }, + { + "epoch": 0.3586, + "grad_norm": 8.784735679626465, + "learning_rate": 1.619505055410453e-05, + "loss": 0.8383, + "step": 7172 + }, + { + "epoch": 0.3587, + "grad_norm": 3.8821752071380615, + "learning_rate": 1.6192310035409536e-05, + "loss": 1.7145, + "step": 7174 + }, + { + "epoch": 0.3588, + "grad_norm": 10.993499755859375, + "learning_rate": 1.618956876220035e-05, + "loss": 1.066, + "step": 7176 + }, + { + "epoch": 0.3589, + "grad_norm": 9.270833015441895, + "learning_rate": 1.618682673481098e-05, + "loss": 1.3275, + "step": 7178 + }, + { + "epoch": 0.359, + "grad_norm": 8.419032096862793, + "learning_rate": 1.6184083953575543e-05, + "loss": 0.8641, + "step": 7180 + }, + { + "epoch": 0.3591, + "grad_norm": 4.83672571182251, + "learning_rate": 1.6181340418828234e-05, + "loss": 0.7536, + "step": 7182 + }, + { + "epoch": 0.3592, + "grad_norm": 3.6731598377227783, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.5644, + "step": 7184 + }, + { + "epoch": 0.3593, + "grad_norm": 6.5588178634643555, + "learning_rate": 1.617585109013526e-05, + "loss": 1.0333, + "step": 7186 + }, + { + "epoch": 0.3594, + "grad_norm": 8.674559593200684, + "learning_rate": 1.617310529685845e-05, + "loss": 0.8338, + "step": 7188 + }, + { + "epoch": 0.3595, + "grad_norm": 2.3396921157836914, + "learning_rate": 1.617035875140749e-05, + "loss": 0.6307, + "step": 7190 + }, + { + "epoch": 0.3596, + "grad_norm": 4.458493232727051, + "learning_rate": 1.6167611454117027e-05, + "loss": 1.1194, + "step": 7192 + }, + { + "epoch": 0.3597, + "grad_norm": 9.831637382507324, + "learning_rate": 1.616486340532182e-05, + "loss": 1.0178, + "step": 7194 + }, + { + "epoch": 0.3598, + "grad_norm": 4.828587055206299, + "learning_rate": 1.6162114605356704e-05, + "loss": 1.306, + "step": 7196 + }, + { + "epoch": 0.3599, + "grad_norm": 4.884326934814453, + "learning_rate": 1.615936505455662e-05, + "loss": 1.1218, + "step": 7198 + }, + { + "epoch": 0.36, + "grad_norm": 5.85085391998291, + "learning_rate": 1.6156614753256583e-05, + "loss": 0.927, + "step": 7200 + }, + { + "epoch": 0.3601, + "grad_norm": 3.1760218143463135, + "learning_rate": 1.6153863701791717e-05, + "loss": 0.9528, + "step": 7202 + }, + { + "epoch": 0.3602, + "grad_norm": 4.869912147521973, + "learning_rate": 1.6151111900497225e-05, + "loss": 1.0439, + "step": 7204 + }, + { + "epoch": 0.3603, + "grad_norm": 4.26956033706665, + "learning_rate": 1.61483593497084e-05, + "loss": 1.0184, + "step": 7206 + }, + { + "epoch": 0.3604, + "grad_norm": 7.222487449645996, + "learning_rate": 1.6145606049760644e-05, + "loss": 0.8265, + "step": 7208 + }, + { + "epoch": 0.3605, + "grad_norm": 4.85507345199585, + "learning_rate": 1.6142852000989432e-05, + "loss": 0.6824, + "step": 7210 + }, + { + "epoch": 0.3606, + "grad_norm": 3.3153553009033203, + "learning_rate": 1.614009720373034e-05, + "loss": 0.8566, + "step": 7212 + }, + { + "epoch": 0.3607, + "grad_norm": 5.780735492706299, + "learning_rate": 1.6137341658319022e-05, + "loss": 1.21, + "step": 7214 + }, + { + "epoch": 0.3608, + "grad_norm": 5.62933874130249, + "learning_rate": 1.6134585365091243e-05, + "loss": 0.3655, + "step": 7216 + }, + { + "epoch": 0.3609, + "grad_norm": 2.6655490398406982, + "learning_rate": 1.6131828324382848e-05, + "loss": 0.6192, + "step": 7218 + }, + { + "epoch": 0.361, + "grad_norm": 4.722904205322266, + "learning_rate": 1.6129070536529767e-05, + "loss": 1.3938, + "step": 7220 + }, + { + "epoch": 0.3611, + "grad_norm": 5.953144073486328, + "learning_rate": 1.6126312001868034e-05, + "loss": 0.8452, + "step": 7222 + }, + { + "epoch": 0.3612, + "grad_norm": 3.5768635272979736, + "learning_rate": 1.6123552720733767e-05, + "loss": 1.1541, + "step": 7224 + }, + { + "epoch": 0.3613, + "grad_norm": 4.5091118812561035, + "learning_rate": 1.6120792693463175e-05, + "loss": 1.9622, + "step": 7226 + }, + { + "epoch": 0.3614, + "grad_norm": 3.8241472244262695, + "learning_rate": 1.611803192039256e-05, + "loss": 0.2933, + "step": 7228 + }, + { + "epoch": 0.3615, + "grad_norm": 2.437955379486084, + "learning_rate": 1.6115270401858315e-05, + "loss": 0.8179, + "step": 7230 + }, + { + "epoch": 0.3616, + "grad_norm": 3.8149547576904297, + "learning_rate": 1.611250813819692e-05, + "loss": 1.2749, + "step": 7232 + }, + { + "epoch": 0.3617, + "grad_norm": 4.215950012207031, + "learning_rate": 1.610974512974495e-05, + "loss": 0.8575, + "step": 7234 + }, + { + "epoch": 0.3618, + "grad_norm": 2.836298704147339, + "learning_rate": 1.6106981376839064e-05, + "loss": 0.4574, + "step": 7236 + }, + { + "epoch": 0.3619, + "grad_norm": 10.506014823913574, + "learning_rate": 1.6104216879816027e-05, + "loss": 0.6864, + "step": 7238 + }, + { + "epoch": 0.362, + "grad_norm": 0.4539174437522888, + "learning_rate": 1.610145163901268e-05, + "loss": 0.45, + "step": 7240 + }, + { + "epoch": 0.3621, + "grad_norm": 3.8662757873535156, + "learning_rate": 1.6098685654765956e-05, + "loss": 0.8454, + "step": 7242 + }, + { + "epoch": 0.3622, + "grad_norm": 4.573432445526123, + "learning_rate": 1.6095918927412883e-05, + "loss": 0.6789, + "step": 7244 + }, + { + "epoch": 0.3623, + "grad_norm": 4.200647830963135, + "learning_rate": 1.609315145729058e-05, + "loss": 1.08, + "step": 7246 + }, + { + "epoch": 0.3624, + "grad_norm": 7.158957004547119, + "learning_rate": 1.6090383244736256e-05, + "loss": 1.563, + "step": 7248 + }, + { + "epoch": 0.3625, + "grad_norm": 2.028681755065918, + "learning_rate": 1.608761429008721e-05, + "loss": 1.2022, + "step": 7250 + }, + { + "epoch": 0.3626, + "grad_norm": 3.223787307739258, + "learning_rate": 1.608484459368082e-05, + "loss": 1.1638, + "step": 7252 + }, + { + "epoch": 0.3627, + "grad_norm": 24.48465347290039, + "learning_rate": 1.6082074155854583e-05, + "loss": 0.5458, + "step": 7254 + }, + { + "epoch": 0.3628, + "grad_norm": 8.976872444152832, + "learning_rate": 1.6079302976946055e-05, + "loss": 0.5291, + "step": 7256 + }, + { + "epoch": 0.3629, + "grad_norm": 2.2672982215881348, + "learning_rate": 1.60765310572929e-05, + "loss": 1.1462, + "step": 7258 + }, + { + "epoch": 0.363, + "grad_norm": 13.690779685974121, + "learning_rate": 1.607375839723287e-05, + "loss": 1.2648, + "step": 7260 + }, + { + "epoch": 0.3631, + "grad_norm": 2.551907539367676, + "learning_rate": 1.60709849971038e-05, + "loss": 1.0077, + "step": 7262 + }, + { + "epoch": 0.3632, + "grad_norm": 5.283805847167969, + "learning_rate": 1.6068210857243625e-05, + "loss": 0.4211, + "step": 7264 + }, + { + "epoch": 0.3633, + "grad_norm": 4.943664073944092, + "learning_rate": 1.606543597799036e-05, + "loss": 0.8326, + "step": 7266 + }, + { + "epoch": 0.3634, + "grad_norm": 4.107395648956299, + "learning_rate": 1.6062660359682124e-05, + "loss": 1.2361, + "step": 7268 + }, + { + "epoch": 0.3635, + "grad_norm": 2.6184234619140625, + "learning_rate": 1.605988400265711e-05, + "loss": 0.6555, + "step": 7270 + }, + { + "epoch": 0.3636, + "grad_norm": 4.209074020385742, + "learning_rate": 1.6057106907253617e-05, + "loss": 0.8908, + "step": 7272 + }, + { + "epoch": 0.3637, + "grad_norm": 6.445681095123291, + "learning_rate": 1.6054329073810016e-05, + "loss": 0.4017, + "step": 7274 + }, + { + "epoch": 0.3638, + "grad_norm": 8.576156616210938, + "learning_rate": 1.605155050266478e-05, + "loss": 0.7515, + "step": 7276 + }, + { + "epoch": 0.3639, + "grad_norm": 4.354447841644287, + "learning_rate": 1.6048771194156477e-05, + "loss": 1.0242, + "step": 7278 + }, + { + "epoch": 0.364, + "grad_norm": 9.960633277893066, + "learning_rate": 1.6045991148623752e-05, + "loss": 0.9004, + "step": 7280 + }, + { + "epoch": 0.3641, + "grad_norm": 3.3502016067504883, + "learning_rate": 1.6043210366405338e-05, + "loss": 0.8009, + "step": 7282 + }, + { + "epoch": 0.3642, + "grad_norm": 4.555631637573242, + "learning_rate": 1.6040428847840078e-05, + "loss": 1.1293, + "step": 7284 + }, + { + "epoch": 0.3643, + "grad_norm": 3.8463878631591797, + "learning_rate": 1.6037646593266883e-05, + "loss": 1.6357, + "step": 7286 + }, + { + "epoch": 0.3644, + "grad_norm": 17.68130111694336, + "learning_rate": 1.6034863603024768e-05, + "loss": 1.2625, + "step": 7288 + }, + { + "epoch": 0.3645, + "grad_norm": 8.984333038330078, + "learning_rate": 1.6032079877452825e-05, + "loss": 0.8096, + "step": 7290 + }, + { + "epoch": 0.3646, + "grad_norm": 9.379621505737305, + "learning_rate": 1.602929541689025e-05, + "loss": 1.3063, + "step": 7292 + }, + { + "epoch": 0.3647, + "grad_norm": 2.901810646057129, + "learning_rate": 1.6026510221676312e-05, + "loss": 1.1363, + "step": 7294 + }, + { + "epoch": 0.3648, + "grad_norm": 2.654109239578247, + "learning_rate": 1.6023724292150387e-05, + "loss": 0.6619, + "step": 7296 + }, + { + "epoch": 0.3649, + "grad_norm": 13.225042343139648, + "learning_rate": 1.6020937628651928e-05, + "loss": 1.296, + "step": 7298 + }, + { + "epoch": 0.365, + "grad_norm": 1.7318298816680908, + "learning_rate": 1.6018150231520486e-05, + "loss": 0.1757, + "step": 7300 + }, + { + "epoch": 0.3651, + "grad_norm": 15.924278259277344, + "learning_rate": 1.6015362101095688e-05, + "loss": 1.1467, + "step": 7302 + }, + { + "epoch": 0.3652, + "grad_norm": 10.234320640563965, + "learning_rate": 1.601257323771727e-05, + "loss": 1.3427, + "step": 7304 + }, + { + "epoch": 0.3653, + "grad_norm": 5.603830337524414, + "learning_rate": 1.6009783641725033e-05, + "loss": 1.5962, + "step": 7306 + }, + { + "epoch": 0.3654, + "grad_norm": 5.2604570388793945, + "learning_rate": 1.6006993313458896e-05, + "loss": 0.5729, + "step": 7308 + }, + { + "epoch": 0.3655, + "grad_norm": 2.895200729370117, + "learning_rate": 1.6004202253258844e-05, + "loss": 1.2292, + "step": 7310 + }, + { + "epoch": 0.3656, + "grad_norm": 4.680354595184326, + "learning_rate": 1.6001410461464955e-05, + "loss": 1.5249, + "step": 7312 + }, + { + "epoch": 0.3657, + "grad_norm": 4.762091159820557, + "learning_rate": 1.599861793841741e-05, + "loss": 0.7789, + "step": 7314 + }, + { + "epoch": 0.3658, + "grad_norm": 6.578746795654297, + "learning_rate": 1.5995824684456465e-05, + "loss": 0.5189, + "step": 7316 + }, + { + "epoch": 0.3659, + "grad_norm": 4.4913649559021, + "learning_rate": 1.5993030699922467e-05, + "loss": 0.5783, + "step": 7318 + }, + { + "epoch": 0.366, + "grad_norm": 4.846243381500244, + "learning_rate": 1.599023598515586e-05, + "loss": 1.0508, + "step": 7320 + }, + { + "epoch": 0.3661, + "grad_norm": 6.895331859588623, + "learning_rate": 1.5987440540497167e-05, + "loss": 1.0693, + "step": 7322 + }, + { + "epoch": 0.3662, + "grad_norm": 3.2011871337890625, + "learning_rate": 1.5984644366287007e-05, + "loss": 0.9752, + "step": 7324 + }, + { + "epoch": 0.3663, + "grad_norm": 8.715478897094727, + "learning_rate": 1.5981847462866086e-05, + "loss": 0.5202, + "step": 7326 + }, + { + "epoch": 0.3664, + "grad_norm": 10.01157283782959, + "learning_rate": 1.597904983057519e-05, + "loss": 1.338, + "step": 7328 + }, + { + "epoch": 0.3665, + "grad_norm": 1.1487444639205933, + "learning_rate": 1.5976251469755214e-05, + "loss": 0.4198, + "step": 7330 + }, + { + "epoch": 0.3666, + "grad_norm": 2.595773935317993, + "learning_rate": 1.5973452380747125e-05, + "loss": 0.3577, + "step": 7332 + }, + { + "epoch": 0.3667, + "grad_norm": 10.417640686035156, + "learning_rate": 1.5970652563891976e-05, + "loss": 1.2076, + "step": 7334 + }, + { + "epoch": 0.3668, + "grad_norm": 2.4658682346343994, + "learning_rate": 1.596785201953093e-05, + "loss": 1.0438, + "step": 7336 + }, + { + "epoch": 0.3669, + "grad_norm": 4.961884021759033, + "learning_rate": 1.5965050748005215e-05, + "loss": 1.212, + "step": 7338 + }, + { + "epoch": 0.367, + "grad_norm": 1.8554396629333496, + "learning_rate": 1.5962248749656158e-05, + "loss": 0.8982, + "step": 7340 + }, + { + "epoch": 0.3671, + "grad_norm": 11.671246528625488, + "learning_rate": 1.595944602482518e-05, + "loss": 0.8263, + "step": 7342 + }, + { + "epoch": 0.3672, + "grad_norm": 6.0888519287109375, + "learning_rate": 1.5956642573853784e-05, + "loss": 1.2615, + "step": 7344 + }, + { + "epoch": 0.3673, + "grad_norm": 2.9551479816436768, + "learning_rate": 1.595383839708355e-05, + "loss": 1.0329, + "step": 7346 + }, + { + "epoch": 0.3674, + "grad_norm": 7.479531288146973, + "learning_rate": 1.5951033494856174e-05, + "loss": 0.974, + "step": 7348 + }, + { + "epoch": 0.3675, + "grad_norm": 7.700357913970947, + "learning_rate": 1.5948227867513416e-05, + "loss": 0.9332, + "step": 7350 + }, + { + "epoch": 0.3676, + "grad_norm": 5.085978031158447, + "learning_rate": 1.5945421515397135e-05, + "loss": 0.6512, + "step": 7352 + }, + { + "epoch": 0.3677, + "grad_norm": 15.039106369018555, + "learning_rate": 1.5942614438849275e-05, + "loss": 1.1586, + "step": 7354 + }, + { + "epoch": 0.3678, + "grad_norm": 8.32210922241211, + "learning_rate": 1.593980663821187e-05, + "loss": 1.3603, + "step": 7356 + }, + { + "epoch": 0.3679, + "grad_norm": 2.488356828689575, + "learning_rate": 1.593699811382705e-05, + "loss": 1.0141, + "step": 7358 + }, + { + "epoch": 0.368, + "grad_norm": 3.2576141357421875, + "learning_rate": 1.5934188866037017e-05, + "loss": 1.036, + "step": 7360 + }, + { + "epoch": 0.3681, + "grad_norm": 3.6186630725860596, + "learning_rate": 1.5931378895184068e-05, + "loss": 0.6723, + "step": 7362 + }, + { + "epoch": 0.3682, + "grad_norm": 4.385692596435547, + "learning_rate": 1.5928568201610593e-05, + "loss": 1.2574, + "step": 7364 + }, + { + "epoch": 0.3683, + "grad_norm": 1.6087558269500732, + "learning_rate": 1.5925756785659066e-05, + "loss": 0.2199, + "step": 7366 + }, + { + "epoch": 0.3684, + "grad_norm": 3.1137001514434814, + "learning_rate": 1.592294464767205e-05, + "loss": 1.0079, + "step": 7368 + }, + { + "epoch": 0.3685, + "grad_norm": 4.854279518127441, + "learning_rate": 1.5920131787992198e-05, + "loss": 1.355, + "step": 7370 + }, + { + "epoch": 0.3686, + "grad_norm": 2.1175577640533447, + "learning_rate": 1.591731820696224e-05, + "loss": 1.0574, + "step": 7372 + }, + { + "epoch": 0.3687, + "grad_norm": 8.227606773376465, + "learning_rate": 1.5914503904925013e-05, + "loss": 1.6102, + "step": 7374 + }, + { + "epoch": 0.3688, + "grad_norm": 10.8551664352417, + "learning_rate": 1.591168888222342e-05, + "loss": 1.241, + "step": 7376 + }, + { + "epoch": 0.3689, + "grad_norm": 4.045969486236572, + "learning_rate": 1.5908873139200475e-05, + "loss": 1.8017, + "step": 7378 + }, + { + "epoch": 0.369, + "grad_norm": 3.604954957962036, + "learning_rate": 1.5906056676199256e-05, + "loss": 1.2306, + "step": 7380 + }, + { + "epoch": 0.3691, + "grad_norm": 4.108372688293457, + "learning_rate": 1.5903239493562948e-05, + "loss": 0.9253, + "step": 7382 + }, + { + "epoch": 0.3692, + "grad_norm": 9.771201133728027, + "learning_rate": 1.5900421591634813e-05, + "loss": 1.7805, + "step": 7384 + }, + { + "epoch": 0.3693, + "grad_norm": 7.640194416046143, + "learning_rate": 1.5897602970758207e-05, + "loss": 1.2343, + "step": 7386 + }, + { + "epoch": 0.3694, + "grad_norm": 4.049373626708984, + "learning_rate": 1.589478363127657e-05, + "loss": 1.129, + "step": 7388 + }, + { + "epoch": 0.3695, + "grad_norm": 3.0123343467712402, + "learning_rate": 1.5891963573533424e-05, + "loss": 1.6745, + "step": 7390 + }, + { + "epoch": 0.3696, + "grad_norm": 4.127812385559082, + "learning_rate": 1.5889142797872387e-05, + "loss": 0.5659, + "step": 7392 + }, + { + "epoch": 0.3697, + "grad_norm": 2.5936155319213867, + "learning_rate": 1.588632130463717e-05, + "loss": 0.9635, + "step": 7394 + }, + { + "epoch": 0.3698, + "grad_norm": 7.284082412719727, + "learning_rate": 1.5883499094171556e-05, + "loss": 1.1088, + "step": 7396 + }, + { + "epoch": 0.3699, + "grad_norm": 2.547081232070923, + "learning_rate": 1.588067616681942e-05, + "loss": 1.0672, + "step": 7398 + }, + { + "epoch": 0.37, + "grad_norm": 5.418205261230469, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.8828, + "step": 7400 + }, + { + "epoch": 0.3701, + "grad_norm": 4.582211971282959, + "learning_rate": 1.5875028162831547e-05, + "loss": 0.8384, + "step": 7402 + }, + { + "epoch": 0.3702, + "grad_norm": 4.716906547546387, + "learning_rate": 1.5872203086883996e-05, + "loss": 0.3917, + "step": 7404 + }, + { + "epoch": 0.3703, + "grad_norm": 5.716814041137695, + "learning_rate": 1.5869377295426316e-05, + "loss": 0.82, + "step": 7406 + }, + { + "epoch": 0.3704, + "grad_norm": 3.375746965408325, + "learning_rate": 1.5866550788802815e-05, + "loss": 1.3711, + "step": 7408 + }, + { + "epoch": 0.3705, + "grad_norm": 2.9225425720214844, + "learning_rate": 1.5863723567357892e-05, + "loss": 1.1053, + "step": 7410 + }, + { + "epoch": 0.3706, + "grad_norm": 5.485970497131348, + "learning_rate": 1.5860895631436044e-05, + "loss": 0.7076, + "step": 7412 + }, + { + "epoch": 0.3707, + "grad_norm": 15.798709869384766, + "learning_rate": 1.5858066981381843e-05, + "loss": 1.4047, + "step": 7414 + }, + { + "epoch": 0.3708, + "grad_norm": 2.9794158935546875, + "learning_rate": 1.5855237617539943e-05, + "loss": 0.8879, + "step": 7416 + }, + { + "epoch": 0.3709, + "grad_norm": 6.269447326660156, + "learning_rate": 1.5852407540255103e-05, + "loss": 1.2922, + "step": 7418 + }, + { + "epoch": 0.371, + "grad_norm": 2.0160694122314453, + "learning_rate": 1.584957674987216e-05, + "loss": 0.7428, + "step": 7420 + }, + { + "epoch": 0.3711, + "grad_norm": 2.375972270965576, + "learning_rate": 1.5846745246736027e-05, + "loss": 0.8734, + "step": 7422 + }, + { + "epoch": 0.3712, + "grad_norm": 3.451451301574707, + "learning_rate": 1.5843913031191722e-05, + "loss": 0.2005, + "step": 7424 + }, + { + "epoch": 0.3713, + "grad_norm": 3.744215250015259, + "learning_rate": 1.5841080103584342e-05, + "loss": 1.4384, + "step": 7426 + }, + { + "epoch": 0.3714, + "grad_norm": 3.0242526531219482, + "learning_rate": 1.583824646425907e-05, + "loss": 0.8415, + "step": 7428 + }, + { + "epoch": 0.3715, + "grad_norm": 3.9509024620056152, + "learning_rate": 1.5835412113561176e-05, + "loss": 0.8975, + "step": 7430 + }, + { + "epoch": 0.3716, + "grad_norm": 6.827988624572754, + "learning_rate": 1.5832577051836016e-05, + "loss": 0.9696, + "step": 7432 + }, + { + "epoch": 0.3717, + "grad_norm": 12.72479248046875, + "learning_rate": 1.5829741279429035e-05, + "loss": 0.9396, + "step": 7434 + }, + { + "epoch": 0.3718, + "grad_norm": 3.6071979999542236, + "learning_rate": 1.5826904796685763e-05, + "loss": 0.9463, + "step": 7436 + }, + { + "epoch": 0.3719, + "grad_norm": 2.3721835613250732, + "learning_rate": 1.5824067603951814e-05, + "loss": 0.731, + "step": 7438 + }, + { + "epoch": 0.372, + "grad_norm": 5.940408229827881, + "learning_rate": 1.5821229701572897e-05, + "loss": 0.6597, + "step": 7440 + }, + { + "epoch": 0.3721, + "grad_norm": 4.540580749511719, + "learning_rate": 1.5818391089894797e-05, + "loss": 0.9598, + "step": 7442 + }, + { + "epoch": 0.3722, + "grad_norm": 5.967430591583252, + "learning_rate": 1.5815551769263387e-05, + "loss": 1.2342, + "step": 7444 + }, + { + "epoch": 0.3723, + "grad_norm": 4.271355152130127, + "learning_rate": 1.581271174002464e-05, + "loss": 0.4767, + "step": 7446 + }, + { + "epoch": 0.3724, + "grad_norm": 3.52215313911438, + "learning_rate": 1.5809871002524602e-05, + "loss": 0.9814, + "step": 7448 + }, + { + "epoch": 0.3725, + "grad_norm": 6.023251533508301, + "learning_rate": 1.5807029557109398e-05, + "loss": 1.2286, + "step": 7450 + }, + { + "epoch": 0.3726, + "grad_norm": 4.01070499420166, + "learning_rate": 1.580418740412526e-05, + "loss": 1.3646, + "step": 7452 + }, + { + "epoch": 0.3727, + "grad_norm": 24.0428409576416, + "learning_rate": 1.5801344543918495e-05, + "loss": 1.0718, + "step": 7454 + }, + { + "epoch": 0.3728, + "grad_norm": 4.931824684143066, + "learning_rate": 1.5798500976835493e-05, + "loss": 0.1917, + "step": 7456 + }, + { + "epoch": 0.3729, + "grad_norm": 2.978973865509033, + "learning_rate": 1.5795656703222736e-05, + "loss": 1.1432, + "step": 7458 + }, + { + "epoch": 0.373, + "grad_norm": 5.4848952293396, + "learning_rate": 1.5792811723426787e-05, + "loss": 1.0502, + "step": 7460 + }, + { + "epoch": 0.3731, + "grad_norm": 5.260626316070557, + "learning_rate": 1.5789966037794305e-05, + "loss": 0.927, + "step": 7462 + }, + { + "epoch": 0.3732, + "grad_norm": 7.665822982788086, + "learning_rate": 1.5787119646672025e-05, + "loss": 1.2492, + "step": 7464 + }, + { + "epoch": 0.3733, + "grad_norm": 3.745206832885742, + "learning_rate": 1.5784272550406767e-05, + "loss": 0.8945, + "step": 7466 + }, + { + "epoch": 0.3734, + "grad_norm": 3.6459007263183594, + "learning_rate": 1.5781424749345447e-05, + "loss": 0.9449, + "step": 7468 + }, + { + "epoch": 0.3735, + "grad_norm": 3.455899715423584, + "learning_rate": 1.5778576243835055e-05, + "loss": 1.1575, + "step": 7470 + }, + { + "epoch": 0.3736, + "grad_norm": 5.873315334320068, + "learning_rate": 1.5775727034222675e-05, + "loss": 1.1344, + "step": 7472 + }, + { + "epoch": 0.3737, + "grad_norm": 2.74369740486145, + "learning_rate": 1.577287712085548e-05, + "loss": 1.1413, + "step": 7474 + }, + { + "epoch": 0.3738, + "grad_norm": 3.6689321994781494, + "learning_rate": 1.577002650408072e-05, + "loss": 0.5707, + "step": 7476 + }, + { + "epoch": 0.3739, + "grad_norm": 1.0372940301895142, + "learning_rate": 1.5767175184245728e-05, + "loss": 0.4641, + "step": 7478 + }, + { + "epoch": 0.374, + "grad_norm": 4.923166275024414, + "learning_rate": 1.5764323161697933e-05, + "loss": 0.843, + "step": 7480 + }, + { + "epoch": 0.3741, + "grad_norm": 10.891165733337402, + "learning_rate": 1.5761470436784848e-05, + "loss": 1.7203, + "step": 7482 + }, + { + "epoch": 0.3742, + "grad_norm": 10.667366981506348, + "learning_rate": 1.5758617009854068e-05, + "loss": 1.1729, + "step": 7484 + }, + { + "epoch": 0.3743, + "grad_norm": 5.81751012802124, + "learning_rate": 1.575576288125327e-05, + "loss": 1.582, + "step": 7486 + }, + { + "epoch": 0.3744, + "grad_norm": 10.603717803955078, + "learning_rate": 1.575290805133023e-05, + "loss": 0.7256, + "step": 7488 + }, + { + "epoch": 0.3745, + "grad_norm": 5.766211986541748, + "learning_rate": 1.575005252043279e-05, + "loss": 1.1312, + "step": 7490 + }, + { + "epoch": 0.3746, + "grad_norm": 4.688629150390625, + "learning_rate": 1.5747196288908887e-05, + "loss": 1.5534, + "step": 7492 + }, + { + "epoch": 0.3747, + "grad_norm": 3.0609183311462402, + "learning_rate": 1.5744339357106557e-05, + "loss": 1.2336, + "step": 7494 + }, + { + "epoch": 0.3748, + "grad_norm": 2.953345775604248, + "learning_rate": 1.57414817253739e-05, + "loss": 0.8237, + "step": 7496 + }, + { + "epoch": 0.3749, + "grad_norm": 2.3105244636535645, + "learning_rate": 1.5738623394059105e-05, + "loss": 0.6251, + "step": 7498 + }, + { + "epoch": 0.375, + "grad_norm": 5.890372276306152, + "learning_rate": 1.573576436351046e-05, + "loss": 1.0343, + "step": 7500 + }, + { + "epoch": 0.3751, + "grad_norm": 2.289280891418457, + "learning_rate": 1.573290463407633e-05, + "loss": 1.2436, + "step": 7502 + }, + { + "epoch": 0.3752, + "grad_norm": 8.279025077819824, + "learning_rate": 1.5730044206105156e-05, + "loss": 0.9351, + "step": 7504 + }, + { + "epoch": 0.3753, + "grad_norm": 13.008124351501465, + "learning_rate": 1.5727183079945478e-05, + "loss": 1.097, + "step": 7506 + }, + { + "epoch": 0.3754, + "grad_norm": 1.4643731117248535, + "learning_rate": 1.572432125594591e-05, + "loss": 1.0392, + "step": 7508 + }, + { + "epoch": 0.3755, + "grad_norm": 5.774810314178467, + "learning_rate": 1.5721458734455164e-05, + "loss": 1.1863, + "step": 7510 + }, + { + "epoch": 0.3756, + "grad_norm": 14.972005844116211, + "learning_rate": 1.5718595515822027e-05, + "loss": 1.7151, + "step": 7512 + }, + { + "epoch": 0.3757, + "grad_norm": 4.595862865447998, + "learning_rate": 1.5715731600395368e-05, + "loss": 1.0404, + "step": 7514 + }, + { + "epoch": 0.3758, + "grad_norm": 5.765957832336426, + "learning_rate": 1.5712866988524157e-05, + "loss": 1.5682, + "step": 7516 + }, + { + "epoch": 0.3759, + "grad_norm": 26.122188568115234, + "learning_rate": 1.571000168055743e-05, + "loss": 1.9865, + "step": 7518 + }, + { + "epoch": 0.376, + "grad_norm": 4.410683631896973, + "learning_rate": 1.570713567684432e-05, + "loss": 1.1999, + "step": 7520 + }, + { + "epoch": 0.3761, + "grad_norm": 2.2032182216644287, + "learning_rate": 1.5704268977734035e-05, + "loss": 1.3085, + "step": 7522 + }, + { + "epoch": 0.3762, + "grad_norm": 9.452346801757812, + "learning_rate": 1.5701401583575883e-05, + "loss": 0.8574, + "step": 7524 + }, + { + "epoch": 0.3763, + "grad_norm": 2.4649155139923096, + "learning_rate": 1.5698533494719238e-05, + "loss": 0.7887, + "step": 7526 + }, + { + "epoch": 0.3764, + "grad_norm": 2.785460948944092, + "learning_rate": 1.5695664711513575e-05, + "loss": 0.4361, + "step": 7528 + }, + { + "epoch": 0.3765, + "grad_norm": 2.8784873485565186, + "learning_rate": 1.5692795234308446e-05, + "loss": 1.1717, + "step": 7530 + }, + { + "epoch": 0.3766, + "grad_norm": 7.337146759033203, + "learning_rate": 1.5689925063453483e-05, + "loss": 0.9666, + "step": 7532 + }, + { + "epoch": 0.3767, + "grad_norm": 3.196058511734009, + "learning_rate": 1.5687054199298408e-05, + "loss": 0.9649, + "step": 7534 + }, + { + "epoch": 0.3768, + "grad_norm": 1.769268274307251, + "learning_rate": 1.568418264219303e-05, + "loss": 0.6728, + "step": 7536 + }, + { + "epoch": 0.3769, + "grad_norm": 5.877250671386719, + "learning_rate": 1.568131039248724e-05, + "loss": 0.7791, + "step": 7538 + }, + { + "epoch": 0.377, + "grad_norm": 3.4014577865600586, + "learning_rate": 1.5678437450531014e-05, + "loss": 1.2706, + "step": 7540 + }, + { + "epoch": 0.3771, + "grad_norm": 4.433224678039551, + "learning_rate": 1.5675563816674407e-05, + "loss": 1.8532, + "step": 7542 + }, + { + "epoch": 0.3772, + "grad_norm": 5.489716053009033, + "learning_rate": 1.567268949126757e-05, + "loss": 0.3909, + "step": 7544 + }, + { + "epoch": 0.3773, + "grad_norm": 3.3330323696136475, + "learning_rate": 1.5669814474660718e-05, + "loss": 0.6372, + "step": 7546 + }, + { + "epoch": 0.3774, + "grad_norm": 3.4589388370513916, + "learning_rate": 1.5666938767204173e-05, + "loss": 1.2608, + "step": 7548 + }, + { + "epoch": 0.3775, + "grad_norm": 6.462843894958496, + "learning_rate": 1.566406236924833e-05, + "loss": 1.2453, + "step": 7550 + }, + { + "epoch": 0.3776, + "grad_norm": 5.2400689125061035, + "learning_rate": 1.5661185281143666e-05, + "loss": 1.0059, + "step": 7552 + }, + { + "epoch": 0.3777, + "grad_norm": 4.027349948883057, + "learning_rate": 1.565830750324075e-05, + "loss": 1.4916, + "step": 7554 + }, + { + "epoch": 0.3778, + "grad_norm": 3.252349615097046, + "learning_rate": 1.565542903589023e-05, + "loss": 1.2932, + "step": 7556 + }, + { + "epoch": 0.3779, + "grad_norm": 2.8836193084716797, + "learning_rate": 1.5652549879442834e-05, + "loss": 0.7413, + "step": 7558 + }, + { + "epoch": 0.378, + "grad_norm": 6.916121006011963, + "learning_rate": 1.564967003424938e-05, + "loss": 0.4735, + "step": 7560 + }, + { + "epoch": 0.3781, + "grad_norm": 3.009347677230835, + "learning_rate": 1.5646789500660772e-05, + "loss": 0.9115, + "step": 7562 + }, + { + "epoch": 0.3782, + "grad_norm": 2.236287832260132, + "learning_rate": 1.5643908279027994e-05, + "loss": 0.8901, + "step": 7564 + }, + { + "epoch": 0.3783, + "grad_norm": 5.177398204803467, + "learning_rate": 1.5641026369702105e-05, + "loss": 1.0174, + "step": 7566 + }, + { + "epoch": 0.3784, + "grad_norm": 3.271813154220581, + "learning_rate": 1.5638143773034268e-05, + "loss": 1.0904, + "step": 7568 + }, + { + "epoch": 0.3785, + "grad_norm": 4.89677095413208, + "learning_rate": 1.5635260489375714e-05, + "loss": 0.6312, + "step": 7570 + }, + { + "epoch": 0.3786, + "grad_norm": 3.4542183876037598, + "learning_rate": 1.563237651907777e-05, + "loss": 1.2485, + "step": 7572 + }, + { + "epoch": 0.3787, + "grad_norm": 3.543092727661133, + "learning_rate": 1.5629491862491822e-05, + "loss": 0.736, + "step": 7574 + }, + { + "epoch": 0.3788, + "grad_norm": 7.09877347946167, + "learning_rate": 1.562660651996937e-05, + "loss": 1.2159, + "step": 7576 + }, + { + "epoch": 0.3789, + "grad_norm": 3.9610185623168945, + "learning_rate": 1.562372049186198e-05, + "loss": 0.574, + "step": 7578 + }, + { + "epoch": 0.379, + "grad_norm": 3.1213488578796387, + "learning_rate": 1.5620833778521306e-05, + "loss": 1.2212, + "step": 7580 + }, + { + "epoch": 0.3791, + "grad_norm": 2.9939017295837402, + "learning_rate": 1.5617946380299088e-05, + "loss": 0.5052, + "step": 7582 + }, + { + "epoch": 0.3792, + "grad_norm": 5.418000221252441, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.8695, + "step": 7584 + }, + { + "epoch": 0.3793, + "grad_norm": 6.05552339553833, + "learning_rate": 1.561216953061738e-05, + "loss": 0.9779, + "step": 7586 + }, + { + "epoch": 0.3794, + "grad_norm": 2.1520497798919678, + "learning_rate": 1.560928007986178e-05, + "loss": 0.9562, + "step": 7588 + }, + { + "epoch": 0.3795, + "grad_norm": 2.9833061695098877, + "learning_rate": 1.560638994563242e-05, + "loss": 0.6462, + "step": 7590 + }, + { + "epoch": 0.3796, + "grad_norm": 4.889338970184326, + "learning_rate": 1.5603499128281447e-05, + "loss": 0.6019, + "step": 7592 + }, + { + "epoch": 0.3797, + "grad_norm": 3.6669023036956787, + "learning_rate": 1.5600607628161104e-05, + "loss": 3.3404, + "step": 7594 + }, + { + "epoch": 0.3798, + "grad_norm": 3.844857692718506, + "learning_rate": 1.5597715445623714e-05, + "loss": 0.7868, + "step": 7596 + }, + { + "epoch": 0.3799, + "grad_norm": 5.49236536026001, + "learning_rate": 1.5594822581021673e-05, + "loss": 1.0997, + "step": 7598 + }, + { + "epoch": 0.38, + "grad_norm": 4.735849857330322, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.2966, + "step": 7600 + }, + { + "epoch": 0.3801, + "grad_norm": 2.165688991546631, + "learning_rate": 1.558903480703368e-05, + "loss": 0.4354, + "step": 7602 + }, + { + "epoch": 0.3802, + "grad_norm": 2.8858773708343506, + "learning_rate": 1.558613989835295e-05, + "loss": 0.4257, + "step": 7604 + }, + { + "epoch": 0.3803, + "grad_norm": 3.686577558517456, + "learning_rate": 1.5583244309018016e-05, + "loss": 1.1075, + "step": 7606 + }, + { + "epoch": 0.3804, + "grad_norm": 5.637470722198486, + "learning_rate": 1.55803480393817e-05, + "loss": 0.8172, + "step": 7608 + }, + { + "epoch": 0.3805, + "grad_norm": 1.9153755903244019, + "learning_rate": 1.5577451089796904e-05, + "loss": 1.3714, + "step": 7610 + }, + { + "epoch": 0.3806, + "grad_norm": 7.933844089508057, + "learning_rate": 1.5574553460616608e-05, + "loss": 0.7083, + "step": 7612 + }, + { + "epoch": 0.3807, + "grad_norm": 2.694565773010254, + "learning_rate": 1.5571655152193886e-05, + "loss": 0.4996, + "step": 7614 + }, + { + "epoch": 0.3808, + "grad_norm": 5.0122880935668945, + "learning_rate": 1.556875616488188e-05, + "loss": 0.4794, + "step": 7616 + }, + { + "epoch": 0.3809, + "grad_norm": 2.8097448348999023, + "learning_rate": 1.556585649903383e-05, + "loss": 1.3872, + "step": 7618 + }, + { + "epoch": 0.381, + "grad_norm": 4.563841342926025, + "learning_rate": 1.556295615500305e-05, + "loss": 1.0237, + "step": 7620 + }, + { + "epoch": 0.3811, + "grad_norm": 7.180758953094482, + "learning_rate": 1.5560055133142934e-05, + "loss": 1.0732, + "step": 7622 + }, + { + "epoch": 0.3812, + "grad_norm": 25.031208038330078, + "learning_rate": 1.5557153433806967e-05, + "loss": 1.4426, + "step": 7624 + }, + { + "epoch": 0.3813, + "grad_norm": 3.6028339862823486, + "learning_rate": 1.5554251057348712e-05, + "loss": 1.1462, + "step": 7626 + }, + { + "epoch": 0.3814, + "grad_norm": 4.929617881774902, + "learning_rate": 1.555134800412181e-05, + "loss": 0.5708, + "step": 7628 + }, + { + "epoch": 0.3815, + "grad_norm": 5.890043258666992, + "learning_rate": 1.5548444274479995e-05, + "loss": 1.0003, + "step": 7630 + }, + { + "epoch": 0.3816, + "grad_norm": 4.31830358505249, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.6897, + "step": 7632 + }, + { + "epoch": 0.3817, + "grad_norm": 5.451604843139648, + "learning_rate": 1.5542634787366942e-05, + "loss": 1.226, + "step": 7634 + }, + { + "epoch": 0.3818, + "grad_norm": 2.07985782623291, + "learning_rate": 1.5539729030603574e-05, + "loss": 0.8414, + "step": 7636 + }, + { + "epoch": 0.3819, + "grad_norm": 7.32326602935791, + "learning_rate": 1.5536822598841025e-05, + "loss": 0.9441, + "step": 7638 + }, + { + "epoch": 0.382, + "grad_norm": 3.956160068511963, + "learning_rate": 1.553391549243344e-05, + "loss": 1.1566, + "step": 7640 + }, + { + "epoch": 0.3821, + "grad_norm": 6.283393383026123, + "learning_rate": 1.553100771173504e-05, + "loss": 1.15, + "step": 7642 + }, + { + "epoch": 0.3822, + "grad_norm": 2.9090709686279297, + "learning_rate": 1.5528099257100126e-05, + "loss": 1.0307, + "step": 7644 + }, + { + "epoch": 0.3823, + "grad_norm": 6.740516662597656, + "learning_rate": 1.5525190128883084e-05, + "loss": 0.8764, + "step": 7646 + }, + { + "epoch": 0.3824, + "grad_norm": 3.5774786472320557, + "learning_rate": 1.5522280327438388e-05, + "loss": 0.7305, + "step": 7648 + }, + { + "epoch": 0.3825, + "grad_norm": 1.5131431818008423, + "learning_rate": 1.5519369853120584e-05, + "loss": 0.5497, + "step": 7650 + }, + { + "epoch": 0.3826, + "grad_norm": 5.770425796508789, + "learning_rate": 1.5516458706284306e-05, + "loss": 1.2202, + "step": 7652 + }, + { + "epoch": 0.3827, + "grad_norm": 2.8185043334960938, + "learning_rate": 1.5513546887284264e-05, + "loss": 1.4688, + "step": 7654 + }, + { + "epoch": 0.3828, + "grad_norm": 4.22883415222168, + "learning_rate": 1.5510634396475262e-05, + "loss": 0.8109, + "step": 7656 + }, + { + "epoch": 0.3829, + "grad_norm": 3.1105458736419678, + "learning_rate": 1.550772123421217e-05, + "loss": 0.9679, + "step": 7658 + }, + { + "epoch": 0.383, + "grad_norm": 4.9007649421691895, + "learning_rate": 1.5504807400849957e-05, + "loss": 0.8274, + "step": 7660 + }, + { + "epoch": 0.3831, + "grad_norm": 3.0862133502960205, + "learning_rate": 1.550189289674366e-05, + "loss": 0.3596, + "step": 7662 + }, + { + "epoch": 0.3832, + "grad_norm": 6.727219581604004, + "learning_rate": 1.54989777222484e-05, + "loss": 0.5788, + "step": 7664 + }, + { + "epoch": 0.3833, + "grad_norm": 7.085707187652588, + "learning_rate": 1.5496061877719384e-05, + "loss": 0.8681, + "step": 7666 + }, + { + "epoch": 0.3834, + "grad_norm": 5.831673622131348, + "learning_rate": 1.54931453635119e-05, + "loss": 0.9498, + "step": 7668 + }, + { + "epoch": 0.3835, + "grad_norm": 2.5997872352600098, + "learning_rate": 1.549022817998132e-05, + "loss": 1.1247, + "step": 7670 + }, + { + "epoch": 0.3836, + "grad_norm": 2.9959867000579834, + "learning_rate": 1.5487310327483087e-05, + "loss": 1.5116, + "step": 7672 + }, + { + "epoch": 0.3837, + "grad_norm": 2.573026418685913, + "learning_rate": 1.5484391806372732e-05, + "loss": 0.872, + "step": 7674 + }, + { + "epoch": 0.3838, + "grad_norm": 4.970774173736572, + "learning_rate": 1.5481472617005878e-05, + "loss": 1.2592, + "step": 7676 + }, + { + "epoch": 0.3839, + "grad_norm": 2.5105793476104736, + "learning_rate": 1.547855275973821e-05, + "loss": 0.4781, + "step": 7678 + }, + { + "epoch": 0.384, + "grad_norm": 4.039194583892822, + "learning_rate": 1.5475632234925505e-05, + "loss": 0.9285, + "step": 7680 + }, + { + "epoch": 0.3841, + "grad_norm": 4.327383995056152, + "learning_rate": 1.5472711042923623e-05, + "loss": 0.78, + "step": 7682 + }, + { + "epoch": 0.3842, + "grad_norm": 5.1363396644592285, + "learning_rate": 1.5469789184088498e-05, + "loss": 1.191, + "step": 7684 + }, + { + "epoch": 0.3843, + "grad_norm": 3.064643383026123, + "learning_rate": 1.5466866658776158e-05, + "loss": 0.4989, + "step": 7686 + }, + { + "epoch": 0.3844, + "grad_norm": 4.680543422698975, + "learning_rate": 1.5463943467342694e-05, + "loss": 0.8442, + "step": 7688 + }, + { + "epoch": 0.3845, + "grad_norm": 3.1156744956970215, + "learning_rate": 1.5461019610144292e-05, + "loss": 0.9165, + "step": 7690 + }, + { + "epoch": 0.3846, + "grad_norm": 4.82655143737793, + "learning_rate": 1.5458095087537216e-05, + "loss": 0.8103, + "step": 7692 + }, + { + "epoch": 0.3847, + "grad_norm": 3.922802448272705, + "learning_rate": 1.5455169899877814e-05, + "loss": 1.1682, + "step": 7694 + }, + { + "epoch": 0.3848, + "grad_norm": 3.1745522022247314, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.9631, + "step": 7696 + }, + { + "epoch": 0.3849, + "grad_norm": 7.344381332397461, + "learning_rate": 1.5449317530827794e-05, + "loss": 0.6823, + "step": 7698 + }, + { + "epoch": 0.385, + "grad_norm": 3.1035778522491455, + "learning_rate": 1.5446390350150272e-05, + "loss": 0.86, + "step": 7700 + }, + { + "epoch": 0.3851, + "grad_norm": 3.7534735202789307, + "learning_rate": 1.544346250584661e-05, + "loss": 0.819, + "step": 7702 + }, + { + "epoch": 0.3852, + "grad_norm": 7.342137336730957, + "learning_rate": 1.544053399827355e-05, + "loss": 0.7206, + "step": 7704 + }, + { + "epoch": 0.3853, + "grad_norm": 2.760382890701294, + "learning_rate": 1.5437604827787925e-05, + "loss": 0.2801, + "step": 7706 + }, + { + "epoch": 0.3854, + "grad_norm": 3.1015729904174805, + "learning_rate": 1.543467499474665e-05, + "loss": 0.7807, + "step": 7708 + }, + { + "epoch": 0.3855, + "grad_norm": 2.580017328262329, + "learning_rate": 1.5431744499506707e-05, + "loss": 0.7123, + "step": 7710 + }, + { + "epoch": 0.3856, + "grad_norm": 3.6580283641815186, + "learning_rate": 1.5428813342425177e-05, + "loss": 0.7422, + "step": 7712 + }, + { + "epoch": 0.3857, + "grad_norm": 4.8602213859558105, + "learning_rate": 1.5425881523859207e-05, + "loss": 0.5956, + "step": 7714 + }, + { + "epoch": 0.3858, + "grad_norm": 1.3772222995758057, + "learning_rate": 1.542294904416603e-05, + "loss": 0.1163, + "step": 7716 + }, + { + "epoch": 0.3859, + "grad_norm": 9.33454418182373, + "learning_rate": 1.5420015903702964e-05, + "loss": 0.9254, + "step": 7718 + }, + { + "epoch": 0.386, + "grad_norm": 6.64533805847168, + "learning_rate": 1.54170821028274e-05, + "loss": 1.9218, + "step": 7720 + }, + { + "epoch": 0.3861, + "grad_norm": 9.86391544342041, + "learning_rate": 1.5414147641896815e-05, + "loss": 0.92, + "step": 7722 + }, + { + "epoch": 0.3862, + "grad_norm": 4.513948917388916, + "learning_rate": 1.541121252126876e-05, + "loss": 1.047, + "step": 7724 + }, + { + "epoch": 0.3863, + "grad_norm": 3.2093915939331055, + "learning_rate": 1.5408276741300874e-05, + "loss": 1.0241, + "step": 7726 + }, + { + "epoch": 0.3864, + "grad_norm": 4.130101680755615, + "learning_rate": 1.540534030235087e-05, + "loss": 1.3139, + "step": 7728 + }, + { + "epoch": 0.3865, + "grad_norm": 6.303407192230225, + "learning_rate": 1.5402403204776552e-05, + "loss": 1.2852, + "step": 7730 + }, + { + "epoch": 0.3866, + "grad_norm": 2.370204210281372, + "learning_rate": 1.5399465448935788e-05, + "loss": 0.9742, + "step": 7732 + }, + { + "epoch": 0.3867, + "grad_norm": 6.51655912399292, + "learning_rate": 1.5396527035186536e-05, + "loss": 1.4553, + "step": 7734 + }, + { + "epoch": 0.3868, + "grad_norm": 4.589208602905273, + "learning_rate": 1.5393587963886837e-05, + "loss": 1.2557, + "step": 7736 + }, + { + "epoch": 0.3869, + "grad_norm": 6.4013590812683105, + "learning_rate": 1.5390648235394802e-05, + "loss": 1.2095, + "step": 7738 + }, + { + "epoch": 0.387, + "grad_norm": 3.026979923248291, + "learning_rate": 1.5387707850068633e-05, + "loss": 0.3679, + "step": 7740 + }, + { + "epoch": 0.3871, + "grad_norm": 7.369647026062012, + "learning_rate": 1.5384766808266603e-05, + "loss": 1.0005, + "step": 7742 + }, + { + "epoch": 0.3872, + "grad_norm": 3.544870615005493, + "learning_rate": 1.5381825110347072e-05, + "loss": 1.6319, + "step": 7744 + }, + { + "epoch": 0.3873, + "grad_norm": 21.447967529296875, + "learning_rate": 1.5378882756668478e-05, + "loss": 1.1696, + "step": 7746 + }, + { + "epoch": 0.3874, + "grad_norm": 2.6223247051239014, + "learning_rate": 1.5375939747589334e-05, + "loss": 0.9162, + "step": 7748 + }, + { + "epoch": 0.3875, + "grad_norm": 6.141774654388428, + "learning_rate": 1.5372996083468242e-05, + "loss": 0.6758, + "step": 7750 + }, + { + "epoch": 0.3876, + "grad_norm": 23.930694580078125, + "learning_rate": 1.5370051764663872e-05, + "loss": 1.8431, + "step": 7752 + }, + { + "epoch": 0.3877, + "grad_norm": 3.996494770050049, + "learning_rate": 1.5367106791534983e-05, + "loss": 1.3795, + "step": 7754 + }, + { + "epoch": 0.3878, + "grad_norm": 7.146084308624268, + "learning_rate": 1.5364161164440413e-05, + "loss": 2.1767, + "step": 7756 + }, + { + "epoch": 0.3879, + "grad_norm": 4.063998222351074, + "learning_rate": 1.5361214883739076e-05, + "loss": 0.5559, + "step": 7758 + }, + { + "epoch": 0.388, + "grad_norm": 2.1194393634796143, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.8805, + "step": 7760 + }, + { + "epoch": 0.3881, + "grad_norm": 12.575077056884766, + "learning_rate": 1.535532036295216e-05, + "loss": 1.7494, + "step": 7762 + }, + { + "epoch": 0.3882, + "grad_norm": 3.360548496246338, + "learning_rate": 1.5352372123584816e-05, + "loss": 1.4015, + "step": 7764 + }, + { + "epoch": 0.3883, + "grad_norm": 3.8604531288146973, + "learning_rate": 1.534942323204716e-05, + "loss": 1.1557, + "step": 7766 + }, + { + "epoch": 0.3884, + "grad_norm": 2.124558925628662, + "learning_rate": 1.5346473688698514e-05, + "loss": 0.8452, + "step": 7768 + }, + { + "epoch": 0.3885, + "grad_norm": 4.622012138366699, + "learning_rate": 1.5343523493898267e-05, + "loss": 0.6952, + "step": 7770 + }, + { + "epoch": 0.3886, + "grad_norm": 17.91881561279297, + "learning_rate": 1.5340572648005887e-05, + "loss": 2.1719, + "step": 7772 + }, + { + "epoch": 0.3887, + "grad_norm": 3.4855093955993652, + "learning_rate": 1.533762115138093e-05, + "loss": 0.8086, + "step": 7774 + }, + { + "epoch": 0.3888, + "grad_norm": 8.809122085571289, + "learning_rate": 1.533466900438303e-05, + "loss": 1.2778, + "step": 7776 + }, + { + "epoch": 0.3889, + "grad_norm": 1.5032325983047485, + "learning_rate": 1.5331716207371888e-05, + "loss": 0.9569, + "step": 7778 + }, + { + "epoch": 0.389, + "grad_norm": 3.1517720222473145, + "learning_rate": 1.53287627607073e-05, + "loss": 0.6278, + "step": 7780 + }, + { + "epoch": 0.3891, + "grad_norm": 2.5811407566070557, + "learning_rate": 1.5325808664749136e-05, + "loss": 1.3302, + "step": 7782 + }, + { + "epoch": 0.3892, + "grad_norm": 2.9219610691070557, + "learning_rate": 1.532285391985734e-05, + "loss": 0.7587, + "step": 7784 + }, + { + "epoch": 0.3893, + "grad_norm": 3.94174861907959, + "learning_rate": 1.5319898526391942e-05, + "loss": 0.9991, + "step": 7786 + }, + { + "epoch": 0.3894, + "grad_norm": 2.366976261138916, + "learning_rate": 1.5316942484713043e-05, + "loss": 1.0508, + "step": 7788 + }, + { + "epoch": 0.3895, + "grad_norm": 3.842512607574463, + "learning_rate": 1.531398579518083e-05, + "loss": 0.5912, + "step": 7790 + }, + { + "epoch": 0.3896, + "grad_norm": 5.822543621063232, + "learning_rate": 1.5311028458155567e-05, + "loss": 0.5848, + "step": 7792 + }, + { + "epoch": 0.3897, + "grad_norm": 4.58565616607666, + "learning_rate": 1.53080704739976e-05, + "loss": 1.3939, + "step": 7794 + }, + { + "epoch": 0.3898, + "grad_norm": 2.341019630432129, + "learning_rate": 1.5305111843067343e-05, + "loss": 1.4275, + "step": 7796 + }, + { + "epoch": 0.3899, + "grad_norm": 3.0766775608062744, + "learning_rate": 1.53021525657253e-05, + "loss": 1.1542, + "step": 7798 + }, + { + "epoch": 0.39, + "grad_norm": 25.911148071289062, + "learning_rate": 1.529919264233205e-05, + "loss": 2.7035, + "step": 7800 + }, + { + "epoch": 0.3901, + "grad_norm": 6.411290645599365, + "learning_rate": 1.529623207324825e-05, + "loss": 1.5201, + "step": 7802 + }, + { + "epoch": 0.3902, + "grad_norm": 2.1072540283203125, + "learning_rate": 1.5293270858834643e-05, + "loss": 1.2012, + "step": 7804 + }, + { + "epoch": 0.3903, + "grad_norm": 1.9039950370788574, + "learning_rate": 1.5290308999452034e-05, + "loss": 1.6481, + "step": 7806 + }, + { + "epoch": 0.3904, + "grad_norm": 7.615440368652344, + "learning_rate": 1.528734649546132e-05, + "loss": 0.4228, + "step": 7808 + }, + { + "epoch": 0.3905, + "grad_norm": 4.43136739730835, + "learning_rate": 1.5284383347223473e-05, + "loss": 1.2279, + "step": 7810 + }, + { + "epoch": 0.3906, + "grad_norm": 0.6169986128807068, + "learning_rate": 1.5281419555099547e-05, + "loss": 0.1541, + "step": 7812 + }, + { + "epoch": 0.3907, + "grad_norm": 3.6713902950286865, + "learning_rate": 1.5278455119450666e-05, + "loss": 0.9869, + "step": 7814 + }, + { + "epoch": 0.3908, + "grad_norm": 4.805038928985596, + "learning_rate": 1.5275490040638038e-05, + "loss": 0.7436, + "step": 7816 + }, + { + "epoch": 0.3909, + "grad_norm": 4.12627649307251, + "learning_rate": 1.5272524319022958e-05, + "loss": 1.2823, + "step": 7818 + }, + { + "epoch": 0.391, + "grad_norm": 4.569587707519531, + "learning_rate": 1.5269557954966777e-05, + "loss": 1.0117, + "step": 7820 + }, + { + "epoch": 0.3911, + "grad_norm": 3.777825117111206, + "learning_rate": 1.5266590948830946e-05, + "loss": 0.726, + "step": 7822 + }, + { + "epoch": 0.3912, + "grad_norm": 4.769070625305176, + "learning_rate": 1.526362330097698e-05, + "loss": 1.7625, + "step": 7824 + }, + { + "epoch": 0.3913, + "grad_norm": 3.149954319000244, + "learning_rate": 1.5260655011766485e-05, + "loss": 0.8366, + "step": 7826 + }, + { + "epoch": 0.3914, + "grad_norm": 2.9961843490600586, + "learning_rate": 1.5257686081561134e-05, + "loss": 1.0543, + "step": 7828 + }, + { + "epoch": 0.3915, + "grad_norm": 5.409584045410156, + "learning_rate": 1.5254716510722678e-05, + "loss": 1.9028, + "step": 7830 + }, + { + "epoch": 0.3916, + "grad_norm": 4.809046745300293, + "learning_rate": 1.5251746299612959e-05, + "loss": 1.4111, + "step": 7832 + }, + { + "epoch": 0.3917, + "grad_norm": 3.364126443862915, + "learning_rate": 1.5248775448593882e-05, + "loss": 0.5765, + "step": 7834 + }, + { + "epoch": 0.3918, + "grad_norm": 4.674091339111328, + "learning_rate": 1.5245803958027434e-05, + "loss": 0.6604, + "step": 7836 + }, + { + "epoch": 0.3919, + "grad_norm": 4.681238174438477, + "learning_rate": 1.5242831828275693e-05, + "loss": 0.6568, + "step": 7838 + }, + { + "epoch": 0.392, + "grad_norm": 3.512570381164551, + "learning_rate": 1.5239859059700794e-05, + "loss": 1.5429, + "step": 7840 + }, + { + "epoch": 0.3921, + "grad_norm": 5.334166526794434, + "learning_rate": 1.5236885652664963e-05, + "loss": 1.2365, + "step": 7842 + }, + { + "epoch": 0.3922, + "grad_norm": 5.122159004211426, + "learning_rate": 1.5233911607530499e-05, + "loss": 1.2632, + "step": 7844 + }, + { + "epoch": 0.3923, + "grad_norm": 4.87579870223999, + "learning_rate": 1.523093692465978e-05, + "loss": 0.9474, + "step": 7846 + }, + { + "epoch": 0.3924, + "grad_norm": 7.525646209716797, + "learning_rate": 1.5227961604415266e-05, + "loss": 1.0384, + "step": 7848 + }, + { + "epoch": 0.3925, + "grad_norm": 7.6175336837768555, + "learning_rate": 1.5224985647159489e-05, + "loss": 0.6834, + "step": 7850 + }, + { + "epoch": 0.3926, + "grad_norm": 10.482634544372559, + "learning_rate": 1.5222009053255061e-05, + "loss": 1.4375, + "step": 7852 + }, + { + "epoch": 0.3927, + "grad_norm": 2.2642323970794678, + "learning_rate": 1.5219031823064667e-05, + "loss": 0.3069, + "step": 7854 + }, + { + "epoch": 0.3928, + "grad_norm": 2.679117441177368, + "learning_rate": 1.5216053956951081e-05, + "loss": 0.724, + "step": 7856 + }, + { + "epoch": 0.3929, + "grad_norm": 7.858513832092285, + "learning_rate": 1.521307545527714e-05, + "loss": 0.7176, + "step": 7858 + }, + { + "epoch": 0.393, + "grad_norm": 12.288296699523926, + "learning_rate": 1.5210096318405768e-05, + "loss": 1.0267, + "step": 7860 + }, + { + "epoch": 0.3931, + "grad_norm": 2.135343313217163, + "learning_rate": 1.5207116546699961e-05, + "loss": 0.9939, + "step": 7862 + }, + { + "epoch": 0.3932, + "grad_norm": 14.005972862243652, + "learning_rate": 1.5204136140522799e-05, + "loss": 2.1025, + "step": 7864 + }, + { + "epoch": 0.3933, + "grad_norm": 4.362051963806152, + "learning_rate": 1.5201155100237433e-05, + "loss": 1.1563, + "step": 7866 + }, + { + "epoch": 0.3934, + "grad_norm": 5.713824272155762, + "learning_rate": 1.5198173426207095e-05, + "loss": 1.0444, + "step": 7868 + }, + { + "epoch": 0.3935, + "grad_norm": 2.530320644378662, + "learning_rate": 1.5195191118795095e-05, + "loss": 0.8471, + "step": 7870 + }, + { + "epoch": 0.3936, + "grad_norm": 2.992114782333374, + "learning_rate": 1.5192208178364815e-05, + "loss": 0.9167, + "step": 7872 + }, + { + "epoch": 0.3937, + "grad_norm": 4.678229808807373, + "learning_rate": 1.5189224605279718e-05, + "loss": 1.2938, + "step": 7874 + }, + { + "epoch": 0.3938, + "grad_norm": 5.444486618041992, + "learning_rate": 1.5186240399903343e-05, + "loss": 0.7917, + "step": 7876 + }, + { + "epoch": 0.3939, + "grad_norm": 4.387054920196533, + "learning_rate": 1.5183255562599308e-05, + "loss": 0.9543, + "step": 7878 + }, + { + "epoch": 0.394, + "grad_norm": 3.712486743927002, + "learning_rate": 1.5180270093731305e-05, + "loss": 1.4627, + "step": 7880 + }, + { + "epoch": 0.3941, + "grad_norm": 2.675258159637451, + "learning_rate": 1.5177283993663102e-05, + "loss": 0.6839, + "step": 7882 + }, + { + "epoch": 0.3942, + "grad_norm": 8.122580528259277, + "learning_rate": 1.5174297262758551e-05, + "loss": 1.137, + "step": 7884 + }, + { + "epoch": 0.3943, + "grad_norm": 2.6914796829223633, + "learning_rate": 1.5171309901381572e-05, + "loss": 1.0891, + "step": 7886 + }, + { + "epoch": 0.3944, + "grad_norm": 5.273355960845947, + "learning_rate": 1.5168321909896171e-05, + "loss": 0.5784, + "step": 7888 + }, + { + "epoch": 0.3945, + "grad_norm": 2.3256702423095703, + "learning_rate": 1.516533328866642e-05, + "loss": 1.0112, + "step": 7890 + }, + { + "epoch": 0.3946, + "grad_norm": 6.863749027252197, + "learning_rate": 1.5162344038056476e-05, + "loss": 1.1783, + "step": 7892 + }, + { + "epoch": 0.3947, + "grad_norm": 3.401350736618042, + "learning_rate": 1.5159354158430572e-05, + "loss": 0.8246, + "step": 7894 + }, + { + "epoch": 0.3948, + "grad_norm": 6.715448379516602, + "learning_rate": 1.5156363650153012e-05, + "loss": 0.7736, + "step": 7896 + }, + { + "epoch": 0.3949, + "grad_norm": 4.204506874084473, + "learning_rate": 1.5153372513588183e-05, + "loss": 1.0613, + "step": 7898 + }, + { + "epoch": 0.395, + "grad_norm": 0.9699141383171082, + "learning_rate": 1.5150380749100545e-05, + "loss": 0.7629, + "step": 7900 + }, + { + "epoch": 0.3951, + "grad_norm": 4.104607105255127, + "learning_rate": 1.5147388357054634e-05, + "loss": 0.6181, + "step": 7902 + }, + { + "epoch": 0.3952, + "grad_norm": 8.370087623596191, + "learning_rate": 1.5144395337815066e-05, + "loss": 0.8527, + "step": 7904 + }, + { + "epoch": 0.3953, + "grad_norm": 2.872593402862549, + "learning_rate": 1.514140169174653e-05, + "loss": 0.7501, + "step": 7906 + }, + { + "epoch": 0.3954, + "grad_norm": 5.400876998901367, + "learning_rate": 1.5138407419213797e-05, + "loss": 1.0203, + "step": 7908 + }, + { + "epoch": 0.3955, + "grad_norm": 5.38286018371582, + "learning_rate": 1.5135412520581703e-05, + "loss": 1.7883, + "step": 7910 + }, + { + "epoch": 0.3956, + "grad_norm": 2.532224178314209, + "learning_rate": 1.5132416996215171e-05, + "loss": 1.108, + "step": 7912 + }, + { + "epoch": 0.3957, + "grad_norm": 7.230428218841553, + "learning_rate": 1.5129420846479197e-05, + "loss": 0.8818, + "step": 7914 + }, + { + "epoch": 0.3958, + "grad_norm": 3.3611373901367188, + "learning_rate": 1.5126424071738853e-05, + "loss": 0.5643, + "step": 7916 + }, + { + "epoch": 0.3959, + "grad_norm": 5.810450553894043, + "learning_rate": 1.5123426672359284e-05, + "loss": 1.0441, + "step": 7918 + }, + { + "epoch": 0.396, + "grad_norm": 3.6379454135894775, + "learning_rate": 1.5120428648705716e-05, + "loss": 0.9178, + "step": 7920 + }, + { + "epoch": 0.3961, + "grad_norm": 6.294597148895264, + "learning_rate": 1.5117430001143451e-05, + "loss": 0.4675, + "step": 7922 + }, + { + "epoch": 0.3962, + "grad_norm": 4.016376495361328, + "learning_rate": 1.511443073003786e-05, + "loss": 0.9324, + "step": 7924 + }, + { + "epoch": 0.3963, + "grad_norm": 2.2282519340515137, + "learning_rate": 1.5111430835754401e-05, + "loss": 1.0222, + "step": 7926 + }, + { + "epoch": 0.3964, + "grad_norm": 4.17169189453125, + "learning_rate": 1.51084303186586e-05, + "loss": 0.7892, + "step": 7928 + }, + { + "epoch": 0.3965, + "grad_norm": 3.9721102714538574, + "learning_rate": 1.510542917911606e-05, + "loss": 0.5948, + "step": 7930 + }, + { + "epoch": 0.3966, + "grad_norm": 1.8427523374557495, + "learning_rate": 1.510242741749246e-05, + "loss": 1.1957, + "step": 7932 + }, + { + "epoch": 0.3967, + "grad_norm": 5.9319963455200195, + "learning_rate": 1.5099425034153554e-05, + "loss": 1.0102, + "step": 7934 + }, + { + "epoch": 0.3968, + "grad_norm": 8.14345932006836, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.4583, + "step": 7936 + }, + { + "epoch": 0.3969, + "grad_norm": 9.87513542175293, + "learning_rate": 1.5093418403793238e-05, + "loss": 1.3204, + "step": 7938 + }, + { + "epoch": 0.397, + "grad_norm": 7.867947101593018, + "learning_rate": 1.5090414157503715e-05, + "loss": 0.8972, + "step": 7940 + }, + { + "epoch": 0.3971, + "grad_norm": 8.382620811462402, + "learning_rate": 1.5087409290962667e-05, + "loss": 1.9246, + "step": 7942 + }, + { + "epoch": 0.3972, + "grad_norm": 3.8400535583496094, + "learning_rate": 1.508440380453623e-05, + "loss": 0.8424, + "step": 7944 + }, + { + "epoch": 0.3973, + "grad_norm": 3.2666819095611572, + "learning_rate": 1.5081397698590609e-05, + "loss": 1.1381, + "step": 7946 + }, + { + "epoch": 0.3974, + "grad_norm": 4.064691066741943, + "learning_rate": 1.5078390973492094e-05, + "loss": 0.2176, + "step": 7948 + }, + { + "epoch": 0.3975, + "grad_norm": 5.687554836273193, + "learning_rate": 1.5075383629607043e-05, + "loss": 0.9299, + "step": 7950 + }, + { + "epoch": 0.3976, + "grad_norm": 4.748290061950684, + "learning_rate": 1.5072375667301893e-05, + "loss": 0.6351, + "step": 7952 + }, + { + "epoch": 0.3977, + "grad_norm": 3.514559745788574, + "learning_rate": 1.5069367086943155e-05, + "loss": 1.0763, + "step": 7954 + }, + { + "epoch": 0.3978, + "grad_norm": 4.175057888031006, + "learning_rate": 1.506635788889741e-05, + "loss": 1.0035, + "step": 7956 + }, + { + "epoch": 0.3979, + "grad_norm": 3.7758610248565674, + "learning_rate": 1.5063348073531325e-05, + "loss": 0.9617, + "step": 7958 + }, + { + "epoch": 0.398, + "grad_norm": 5.088552474975586, + "learning_rate": 1.5060337641211637e-05, + "loss": 1.4068, + "step": 7960 + }, + { + "epoch": 0.3981, + "grad_norm": 1.5268453359603882, + "learning_rate": 1.5057326592305161e-05, + "loss": 0.4556, + "step": 7962 + }, + { + "epoch": 0.3982, + "grad_norm": 3.4690072536468506, + "learning_rate": 1.5054314927178779e-05, + "loss": 0.1564, + "step": 7964 + }, + { + "epoch": 0.3983, + "grad_norm": 9.342171669006348, + "learning_rate": 1.505130264619945e-05, + "loss": 1.0526, + "step": 7966 + }, + { + "epoch": 0.3984, + "grad_norm": 5.135639190673828, + "learning_rate": 1.504828974973422e-05, + "loss": 1.1581, + "step": 7968 + }, + { + "epoch": 0.3985, + "grad_norm": 2.8082163333892822, + "learning_rate": 1.5045276238150194e-05, + "loss": 0.895, + "step": 7970 + }, + { + "epoch": 0.3986, + "grad_norm": 2.350785970687866, + "learning_rate": 1.5042262111814566e-05, + "loss": 0.9701, + "step": 7972 + }, + { + "epoch": 0.3987, + "grad_norm": 10.41858959197998, + "learning_rate": 1.5039247371094589e-05, + "loss": 1.4346, + "step": 7974 + }, + { + "epoch": 0.3988, + "grad_norm": 4.182562351226807, + "learning_rate": 1.503623201635761e-05, + "loss": 1.1264, + "step": 7976 + }, + { + "epoch": 0.3989, + "grad_norm": 9.761163711547852, + "learning_rate": 1.5033216047971032e-05, + "loss": 1.0181, + "step": 7978 + }, + { + "epoch": 0.399, + "grad_norm": 3.1673223972320557, + "learning_rate": 1.5030199466302354e-05, + "loss": 0.6567, + "step": 7980 + }, + { + "epoch": 0.3991, + "grad_norm": 1.6272261142730713, + "learning_rate": 1.5027182271719123e-05, + "loss": 1.639, + "step": 7982 + }, + { + "epoch": 0.3992, + "grad_norm": 4.5096635818481445, + "learning_rate": 1.5024164464588982e-05, + "loss": 1.1311, + "step": 7984 + }, + { + "epoch": 0.3993, + "grad_norm": 7.747420787811279, + "learning_rate": 1.502114604527964e-05, + "loss": 1.1049, + "step": 7986 + }, + { + "epoch": 0.3994, + "grad_norm": 0.9307705760002136, + "learning_rate": 1.5018127014158886e-05, + "loss": 0.5187, + "step": 7988 + }, + { + "epoch": 0.3995, + "grad_norm": 7.194799900054932, + "learning_rate": 1.5015107371594576e-05, + "loss": 1.2224, + "step": 7990 + }, + { + "epoch": 0.3996, + "grad_norm": 5.158553600311279, + "learning_rate": 1.5012087117954643e-05, + "loss": 0.976, + "step": 7992 + }, + { + "epoch": 0.3997, + "grad_norm": 10.166410446166992, + "learning_rate": 1.5009066253607101e-05, + "loss": 1.4526, + "step": 7994 + }, + { + "epoch": 0.3998, + "grad_norm": 2.6016299724578857, + "learning_rate": 1.5006044778920028e-05, + "loss": 0.5594, + "step": 7996 + }, + { + "epoch": 0.3999, + "grad_norm": 3.8949670791625977, + "learning_rate": 1.5003022694261584e-05, + "loss": 0.7941, + "step": 7998 + }, + { + "epoch": 0.4, + "grad_norm": 6.08013916015625, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.8726, + "step": 8000 + }, + { + "epoch": 0.4001, + "grad_norm": 12.028295516967773, + "learning_rate": 1.4996976696503586e-05, + "loss": 0.8545, + "step": 8002 + }, + { + "epoch": 0.4002, + "grad_norm": 5.011757850646973, + "learning_rate": 1.4993952784140716e-05, + "loss": 0.6864, + "step": 8004 + }, + { + "epoch": 0.4003, + "grad_norm": 5.823271751403809, + "learning_rate": 1.4990928263279847e-05, + "loss": 1.1918, + "step": 8006 + }, + { + "epoch": 0.4004, + "grad_norm": 4.366251468658447, + "learning_rate": 1.498790313428951e-05, + "loss": 0.8367, + "step": 8008 + }, + { + "epoch": 0.4005, + "grad_norm": 5.317407131195068, + "learning_rate": 1.4984877397538305e-05, + "loss": 0.9614, + "step": 8010 + }, + { + "epoch": 0.4006, + "grad_norm": 16.28762435913086, + "learning_rate": 1.498185105339491e-05, + "loss": 0.8943, + "step": 8012 + }, + { + "epoch": 0.4007, + "grad_norm": 3.591242790222168, + "learning_rate": 1.4978824102228076e-05, + "loss": 0.9687, + "step": 8014 + }, + { + "epoch": 0.4008, + "grad_norm": 0.24924838542938232, + "learning_rate": 1.4975796544406627e-05, + "loss": 0.4575, + "step": 8016 + }, + { + "epoch": 0.4009, + "grad_norm": 7.573010444641113, + "learning_rate": 1.4972768380299463e-05, + "loss": 1.6509, + "step": 8018 + }, + { + "epoch": 0.401, + "grad_norm": 1.4583487510681152, + "learning_rate": 1.4969739610275556e-05, + "loss": 0.6387, + "step": 8020 + }, + { + "epoch": 0.4011, + "grad_norm": 1.9006496667861938, + "learning_rate": 1.4966710234703952e-05, + "loss": 1.2135, + "step": 8022 + }, + { + "epoch": 0.4012, + "grad_norm": 4.192447185516357, + "learning_rate": 1.496368025395377e-05, + "loss": 0.619, + "step": 8024 + }, + { + "epoch": 0.4013, + "grad_norm": 5.574273109436035, + "learning_rate": 1.4960649668394206e-05, + "loss": 0.8472, + "step": 8026 + }, + { + "epoch": 0.4014, + "grad_norm": 11.1182861328125, + "learning_rate": 1.4957618478394529e-05, + "loss": 1.1185, + "step": 8028 + }, + { + "epoch": 0.4015, + "grad_norm": 3.2899627685546875, + "learning_rate": 1.4954586684324077e-05, + "loss": 0.8748, + "step": 8030 + }, + { + "epoch": 0.4016, + "grad_norm": 1.2295958995819092, + "learning_rate": 1.4951554286552266e-05, + "loss": 0.7065, + "step": 8032 + }, + { + "epoch": 0.4017, + "grad_norm": 3.3337886333465576, + "learning_rate": 1.4948521285448587e-05, + "loss": 1.1052, + "step": 8034 + }, + { + "epoch": 0.4018, + "grad_norm": 4.4009857177734375, + "learning_rate": 1.4945487681382597e-05, + "loss": 0.8845, + "step": 8036 + }, + { + "epoch": 0.4019, + "grad_norm": 3.856731653213501, + "learning_rate": 1.4942453474723936e-05, + "loss": 0.7266, + "step": 8038 + }, + { + "epoch": 0.402, + "grad_norm": 3.2477188110351562, + "learning_rate": 1.493941866584231e-05, + "loss": 0.5395, + "step": 8040 + }, + { + "epoch": 0.4021, + "grad_norm": 6.801644802093506, + "learning_rate": 1.4936383255107504e-05, + "loss": 1.425, + "step": 8042 + }, + { + "epoch": 0.4022, + "grad_norm": 8.840714454650879, + "learning_rate": 1.4933347242889371e-05, + "loss": 1.0441, + "step": 8044 + }, + { + "epoch": 0.4023, + "grad_norm": 12.970052719116211, + "learning_rate": 1.493031062955784e-05, + "loss": 0.8138, + "step": 8046 + }, + { + "epoch": 0.4024, + "grad_norm": 11.074006080627441, + "learning_rate": 1.4927273415482916e-05, + "loss": 2.0166, + "step": 8048 + }, + { + "epoch": 0.4025, + "grad_norm": 3.745694637298584, + "learning_rate": 1.4924235601034673e-05, + "loss": 0.9903, + "step": 8050 + }, + { + "epoch": 0.4026, + "grad_norm": 9.822965621948242, + "learning_rate": 1.4921197186583256e-05, + "loss": 1.0838, + "step": 8052 + }, + { + "epoch": 0.4027, + "grad_norm": 3.5768773555755615, + "learning_rate": 1.491815817249889e-05, + "loss": 1.3823, + "step": 8054 + }, + { + "epoch": 0.4028, + "grad_norm": 2.211178779602051, + "learning_rate": 1.4915118559151871e-05, + "loss": 0.835, + "step": 8056 + }, + { + "epoch": 0.4029, + "grad_norm": 3.4622230529785156, + "learning_rate": 1.4912078346912564e-05, + "loss": 0.7852, + "step": 8058 + }, + { + "epoch": 0.403, + "grad_norm": 6.9881110191345215, + "learning_rate": 1.490903753615141e-05, + "loss": 0.9442, + "step": 8060 + }, + { + "epoch": 0.4031, + "grad_norm": 2.7753233909606934, + "learning_rate": 1.4905996127238923e-05, + "loss": 1.1712, + "step": 8062 + }, + { + "epoch": 0.4032, + "grad_norm": 1.697413444519043, + "learning_rate": 1.4902954120545687e-05, + "loss": 0.5455, + "step": 8064 + }, + { + "epoch": 0.4033, + "grad_norm": 10.334945678710938, + "learning_rate": 1.4899911516442367e-05, + "loss": 0.8673, + "step": 8066 + }, + { + "epoch": 0.4034, + "grad_norm": 3.224506378173828, + "learning_rate": 1.4896868315299692e-05, + "loss": 1.1905, + "step": 8068 + }, + { + "epoch": 0.4035, + "grad_norm": 4.859696388244629, + "learning_rate": 1.4893824517488464e-05, + "loss": 0.9788, + "step": 8070 + }, + { + "epoch": 0.4036, + "grad_norm": 3.3027615547180176, + "learning_rate": 1.4890780123379565e-05, + "loss": 1.0177, + "step": 8072 + }, + { + "epoch": 0.4037, + "grad_norm": 10.301517486572266, + "learning_rate": 1.4887735133343942e-05, + "loss": 1.2386, + "step": 8074 + }, + { + "epoch": 0.4038, + "grad_norm": 3.533254384994507, + "learning_rate": 1.488468954775262e-05, + "loss": 0.8356, + "step": 8076 + }, + { + "epoch": 0.4039, + "grad_norm": 2.1829018592834473, + "learning_rate": 1.4881643366976692e-05, + "loss": 1.1036, + "step": 8078 + }, + { + "epoch": 0.404, + "grad_norm": 4.227790355682373, + "learning_rate": 1.4878596591387329e-05, + "loss": 1.2902, + "step": 8080 + }, + { + "epoch": 0.4041, + "grad_norm": 3.8906326293945312, + "learning_rate": 1.4875549221355768e-05, + "loss": 0.8087, + "step": 8082 + }, + { + "epoch": 0.4042, + "grad_norm": 7.078824996948242, + "learning_rate": 1.4872501257253325e-05, + "loss": 1.1043, + "step": 8084 + }, + { + "epoch": 0.4043, + "grad_norm": 5.368353843688965, + "learning_rate": 1.4869452699451384e-05, + "loss": 0.6543, + "step": 8086 + }, + { + "epoch": 0.4044, + "grad_norm": 3.7833898067474365, + "learning_rate": 1.4866403548321402e-05, + "loss": 1.0984, + "step": 8088 + }, + { + "epoch": 0.4045, + "grad_norm": 2.7864651679992676, + "learning_rate": 1.4863353804234906e-05, + "loss": 0.825, + "step": 8090 + }, + { + "epoch": 0.4046, + "grad_norm": 20.041000366210938, + "learning_rate": 1.4860303467563504e-05, + "loss": 1.75, + "step": 8092 + }, + { + "epoch": 0.4047, + "grad_norm": 5.518853187561035, + "learning_rate": 1.4857252538678866e-05, + "loss": 1.2911, + "step": 8094 + }, + { + "epoch": 0.4048, + "grad_norm": 1.9673649072647095, + "learning_rate": 1.485420101795274e-05, + "loss": 0.8394, + "step": 8096 + }, + { + "epoch": 0.4049, + "grad_norm": 3.2034595012664795, + "learning_rate": 1.4851148905756947e-05, + "loss": 0.2941, + "step": 8098 + }, + { + "epoch": 0.405, + "grad_norm": 5.9883880615234375, + "learning_rate": 1.4848096202463373e-05, + "loss": 1.0683, + "step": 8100 + }, + { + "epoch": 0.4051, + "grad_norm": 1.7082856893539429, + "learning_rate": 1.484504290844398e-05, + "loss": 0.8927, + "step": 8102 + }, + { + "epoch": 0.4052, + "grad_norm": 3.2329864501953125, + "learning_rate": 1.4841989024070809e-05, + "loss": 0.8758, + "step": 8104 + }, + { + "epoch": 0.4053, + "grad_norm": 5.332681179046631, + "learning_rate": 1.4838934549715962e-05, + "loss": 0.8296, + "step": 8106 + }, + { + "epoch": 0.4054, + "grad_norm": 8.297041893005371, + "learning_rate": 1.4835879485751617e-05, + "loss": 1.516, + "step": 8108 + }, + { + "epoch": 0.4055, + "grad_norm": 7.425530910491943, + "learning_rate": 1.4832823832550025e-05, + "loss": 0.8985, + "step": 8110 + }, + { + "epoch": 0.4056, + "grad_norm": 1.2230099439620972, + "learning_rate": 1.4829767590483508e-05, + "loss": 0.5864, + "step": 8112 + }, + { + "epoch": 0.4057, + "grad_norm": 2.7752888202667236, + "learning_rate": 1.4826710759924462e-05, + "loss": 1.0665, + "step": 8114 + }, + { + "epoch": 0.4058, + "grad_norm": 6.356277942657471, + "learning_rate": 1.4823653341245353e-05, + "loss": 0.8649, + "step": 8116 + }, + { + "epoch": 0.4059, + "grad_norm": 3.9691314697265625, + "learning_rate": 1.4820595334818712e-05, + "loss": 1.0449, + "step": 8118 + }, + { + "epoch": 0.406, + "grad_norm": 5.992074966430664, + "learning_rate": 1.4817536741017153e-05, + "loss": 1.2723, + "step": 8120 + }, + { + "epoch": 0.4061, + "grad_norm": 2.182751178741455, + "learning_rate": 1.4814477560213358e-05, + "loss": 0.2911, + "step": 8122 + }, + { + "epoch": 0.4062, + "grad_norm": 5.005777835845947, + "learning_rate": 1.4811417792780074e-05, + "loss": 0.4361, + "step": 8124 + }, + { + "epoch": 0.4063, + "grad_norm": 11.636612892150879, + "learning_rate": 1.4808357439090126e-05, + "loss": 0.853, + "step": 8126 + }, + { + "epoch": 0.4064, + "grad_norm": 2.897853136062622, + "learning_rate": 1.4805296499516408e-05, + "loss": 0.7701, + "step": 8128 + }, + { + "epoch": 0.4065, + "grad_norm": 2.2780940532684326, + "learning_rate": 1.480223497443189e-05, + "loss": 0.9781, + "step": 8130 + }, + { + "epoch": 0.4066, + "grad_norm": 2.1471922397613525, + "learning_rate": 1.4799172864209607e-05, + "loss": 0.4778, + "step": 8132 + }, + { + "epoch": 0.4067, + "grad_norm": 2.1236064434051514, + "learning_rate": 1.4796110169222666e-05, + "loss": 0.93, + "step": 8134 + }, + { + "epoch": 0.4068, + "grad_norm": 3.761945962905884, + "learning_rate": 1.4793046889844252e-05, + "loss": 1.1348, + "step": 8136 + }, + { + "epoch": 0.4069, + "grad_norm": 2.4778835773468018, + "learning_rate": 1.4789983026447612e-05, + "loss": 0.7402, + "step": 8138 + }, + { + "epoch": 0.407, + "grad_norm": 2.5204756259918213, + "learning_rate": 1.478691857940607e-05, + "loss": 1.6467, + "step": 8140 + }, + { + "epoch": 0.4071, + "grad_norm": 7.269756317138672, + "learning_rate": 1.4783853549093019e-05, + "loss": 0.6858, + "step": 8142 + }, + { + "epoch": 0.4072, + "grad_norm": 2.5975584983825684, + "learning_rate": 1.4780787935881925e-05, + "loss": 0.7338, + "step": 8144 + }, + { + "epoch": 0.4073, + "grad_norm": 6.1285271644592285, + "learning_rate": 1.477772174014632e-05, + "loss": 1.5586, + "step": 8146 + }, + { + "epoch": 0.4074, + "grad_norm": 6.618678569793701, + "learning_rate": 1.4774654962259813e-05, + "loss": 1.4819, + "step": 8148 + }, + { + "epoch": 0.4075, + "grad_norm": 2.0394861698150635, + "learning_rate": 1.4771587602596085e-05, + "loss": 0.3801, + "step": 8150 + }, + { + "epoch": 0.4076, + "grad_norm": 10.71844482421875, + "learning_rate": 1.4768519661528879e-05, + "loss": 0.8929, + "step": 8152 + }, + { + "epoch": 0.4077, + "grad_norm": 3.4767937660217285, + "learning_rate": 1.4765451139432018e-05, + "loss": 1.0581, + "step": 8154 + }, + { + "epoch": 0.4078, + "grad_norm": 3.578106641769409, + "learning_rate": 1.4762382036679393e-05, + "loss": 0.8761, + "step": 8156 + }, + { + "epoch": 0.4079, + "grad_norm": 6.48616361618042, + "learning_rate": 1.4759312353644962e-05, + "loss": 0.7469, + "step": 8158 + }, + { + "epoch": 0.408, + "grad_norm": 5.6378960609436035, + "learning_rate": 1.4756242090702756e-05, + "loss": 0.7009, + "step": 8160 + }, + { + "epoch": 0.4081, + "grad_norm": 8.877250671386719, + "learning_rate": 1.4753171248226876e-05, + "loss": 0.8523, + "step": 8162 + }, + { + "epoch": 0.4082, + "grad_norm": 1.2496097087860107, + "learning_rate": 1.47500998265915e-05, + "loss": 1.0429, + "step": 8164 + }, + { + "epoch": 0.4083, + "grad_norm": 2.4027621746063232, + "learning_rate": 1.4747027826170868e-05, + "loss": 1.1387, + "step": 8166 + }, + { + "epoch": 0.4084, + "grad_norm": 3.771939277648926, + "learning_rate": 1.4743955247339292e-05, + "loss": 1.1089, + "step": 8168 + }, + { + "epoch": 0.4085, + "grad_norm": 6.043416976928711, + "learning_rate": 1.4740882090471163e-05, + "loss": 0.9368, + "step": 8170 + }, + { + "epoch": 0.4086, + "grad_norm": 4.187002658843994, + "learning_rate": 1.4737808355940932e-05, + "loss": 0.7153, + "step": 8172 + }, + { + "epoch": 0.4087, + "grad_norm": 3.178269386291504, + "learning_rate": 1.4734734044123123e-05, + "loss": 0.9848, + "step": 8174 + }, + { + "epoch": 0.4088, + "grad_norm": 5.5658040046691895, + "learning_rate": 1.4731659155392332e-05, + "loss": 1.4135, + "step": 8176 + }, + { + "epoch": 0.4089, + "grad_norm": 5.173226356506348, + "learning_rate": 1.4728583690123226e-05, + "loss": 1.2732, + "step": 8178 + }, + { + "epoch": 0.409, + "grad_norm": 4.53049898147583, + "learning_rate": 1.4725507648690542e-05, + "loss": 1.4003, + "step": 8180 + }, + { + "epoch": 0.4091, + "grad_norm": 3.101749897003174, + "learning_rate": 1.4722431031469085e-05, + "loss": 1.1305, + "step": 8182 + }, + { + "epoch": 0.4092, + "grad_norm": 3.3952128887176514, + "learning_rate": 1.4719353838833729e-05, + "loss": 0.912, + "step": 8184 + }, + { + "epoch": 0.4093, + "grad_norm": 4.364431858062744, + "learning_rate": 1.4716276071159424e-05, + "loss": 1.2342, + "step": 8186 + }, + { + "epoch": 0.4094, + "grad_norm": 7.5405426025390625, + "learning_rate": 1.4713197728821185e-05, + "loss": 1.3806, + "step": 8188 + }, + { + "epoch": 0.4095, + "grad_norm": 1.9653781652450562, + "learning_rate": 1.47101188121941e-05, + "loss": 1.2028, + "step": 8190 + }, + { + "epoch": 0.4096, + "grad_norm": 4.725848197937012, + "learning_rate": 1.470703932165333e-05, + "loss": 0.916, + "step": 8192 + }, + { + "epoch": 0.4097, + "grad_norm": 8.245795249938965, + "learning_rate": 1.470395925757409e-05, + "loss": 1.3131, + "step": 8194 + }, + { + "epoch": 0.4098, + "grad_norm": 10.129526138305664, + "learning_rate": 1.4700878620331684e-05, + "loss": 0.9495, + "step": 8196 + }, + { + "epoch": 0.4099, + "grad_norm": 4.494096755981445, + "learning_rate": 1.469779741030148e-05, + "loss": 0.7228, + "step": 8198 + }, + { + "epoch": 0.41, + "grad_norm": 8.45512866973877, + "learning_rate": 1.469471562785891e-05, + "loss": 1.0568, + "step": 8200 + }, + { + "epoch": 0.4101, + "grad_norm": 2.622511625289917, + "learning_rate": 1.4691633273379483e-05, + "loss": 1.7582, + "step": 8202 + }, + { + "epoch": 0.4102, + "grad_norm": 2.54960036277771, + "learning_rate": 1.468855034723877e-05, + "loss": 0.8928, + "step": 8204 + }, + { + "epoch": 0.4103, + "grad_norm": 3.946146249771118, + "learning_rate": 1.4685466849812417e-05, + "loss": 1.2408, + "step": 8206 + }, + { + "epoch": 0.4104, + "grad_norm": 6.1583685874938965, + "learning_rate": 1.4682382781476146e-05, + "loss": 0.7967, + "step": 8208 + }, + { + "epoch": 0.4105, + "grad_norm": 3.3862979412078857, + "learning_rate": 1.4679298142605735e-05, + "loss": 0.8986, + "step": 8210 + }, + { + "epoch": 0.4106, + "grad_norm": 4.326569557189941, + "learning_rate": 1.467621293357704e-05, + "loss": 0.9626, + "step": 8212 + }, + { + "epoch": 0.4107, + "grad_norm": 5.3273844718933105, + "learning_rate": 1.467312715476598e-05, + "loss": 1.1821, + "step": 8214 + }, + { + "epoch": 0.4108, + "grad_norm": 7.027390480041504, + "learning_rate": 1.4670040806548555e-05, + "loss": 0.9783, + "step": 8216 + }, + { + "epoch": 0.4109, + "grad_norm": 3.7042994499206543, + "learning_rate": 1.4666953889300821e-05, + "loss": 0.737, + "step": 8218 + }, + { + "epoch": 0.411, + "grad_norm": 1.9243013858795166, + "learning_rate": 1.4663866403398915e-05, + "loss": 0.442, + "step": 8220 + }, + { + "epoch": 0.4111, + "grad_norm": 3.2835693359375, + "learning_rate": 1.4660778349219031e-05, + "loss": 1.1753, + "step": 8222 + }, + { + "epoch": 0.4112, + "grad_norm": 1.8979430198669434, + "learning_rate": 1.4657689727137443e-05, + "loss": 0.639, + "step": 8224 + }, + { + "epoch": 0.4113, + "grad_norm": 2.5827791690826416, + "learning_rate": 1.4654600537530492e-05, + "loss": 0.9468, + "step": 8226 + }, + { + "epoch": 0.4114, + "grad_norm": 0.48691070079803467, + "learning_rate": 1.4651510780774585e-05, + "loss": 0.5867, + "step": 8228 + }, + { + "epoch": 0.4115, + "grad_norm": 2.8151004314422607, + "learning_rate": 1.46484204572462e-05, + "loss": 1.3863, + "step": 8230 + }, + { + "epoch": 0.4116, + "grad_norm": 6.303434371948242, + "learning_rate": 1.464532956732188e-05, + "loss": 0.9184, + "step": 8232 + }, + { + "epoch": 0.4117, + "grad_norm": 4.041463375091553, + "learning_rate": 1.4642238111378242e-05, + "loss": 0.8578, + "step": 8234 + }, + { + "epoch": 0.4118, + "grad_norm": 7.467327117919922, + "learning_rate": 1.4639146089791972e-05, + "loss": 1.085, + "step": 8236 + }, + { + "epoch": 0.4119, + "grad_norm": 7.692877769470215, + "learning_rate": 1.4636053502939824e-05, + "loss": 1.3195, + "step": 8238 + }, + { + "epoch": 0.412, + "grad_norm": 4.341828346252441, + "learning_rate": 1.463296035119862e-05, + "loss": 1.2127, + "step": 8240 + }, + { + "epoch": 0.4121, + "grad_norm": 2.453871488571167, + "learning_rate": 1.4629866634945249e-05, + "loss": 0.592, + "step": 8242 + }, + { + "epoch": 0.4122, + "grad_norm": 9.388225555419922, + "learning_rate": 1.462677235455667e-05, + "loss": 2.0689, + "step": 8244 + }, + { + "epoch": 0.4123, + "grad_norm": 7.545323848724365, + "learning_rate": 1.462367751040992e-05, + "loss": 1.5695, + "step": 8246 + }, + { + "epoch": 0.4124, + "grad_norm": 7.047202110290527, + "learning_rate": 1.4620582102882088e-05, + "loss": 1.5247, + "step": 8248 + }, + { + "epoch": 0.4125, + "grad_norm": 6.402566432952881, + "learning_rate": 1.4617486132350343e-05, + "loss": 1.1692, + "step": 8250 + }, + { + "epoch": 0.4126, + "grad_norm": 9.08614730834961, + "learning_rate": 1.4614389599191917e-05, + "loss": 1.0831, + "step": 8252 + }, + { + "epoch": 0.4127, + "grad_norm": 1.888466715812683, + "learning_rate": 1.4611292503784116e-05, + "loss": 0.217, + "step": 8254 + }, + { + "epoch": 0.4128, + "grad_norm": 2.9797635078430176, + "learning_rate": 1.4608194846504311e-05, + "loss": 0.7631, + "step": 8256 + }, + { + "epoch": 0.4129, + "grad_norm": 1.4498778581619263, + "learning_rate": 1.4605096627729942e-05, + "loss": 0.4589, + "step": 8258 + }, + { + "epoch": 0.413, + "grad_norm": 6.777953624725342, + "learning_rate": 1.4601997847838518e-05, + "loss": 0.8858, + "step": 8260 + }, + { + "epoch": 0.4131, + "grad_norm": 8.966730117797852, + "learning_rate": 1.4598898507207614e-05, + "loss": 0.9892, + "step": 8262 + }, + { + "epoch": 0.4132, + "grad_norm": 8.737932205200195, + "learning_rate": 1.4595798606214882e-05, + "loss": 1.3773, + "step": 8264 + }, + { + "epoch": 0.4133, + "grad_norm": 5.791666507720947, + "learning_rate": 1.4592698145238027e-05, + "loss": 0.7219, + "step": 8266 + }, + { + "epoch": 0.4134, + "grad_norm": 3.210339069366455, + "learning_rate": 1.4589597124654834e-05, + "loss": 0.783, + "step": 8268 + }, + { + "epoch": 0.4135, + "grad_norm": 6.810711860656738, + "learning_rate": 1.4586495544843153e-05, + "loss": 1.3385, + "step": 8270 + }, + { + "epoch": 0.4136, + "grad_norm": 1.5046308040618896, + "learning_rate": 1.4583393406180898e-05, + "loss": 0.9081, + "step": 8272 + }, + { + "epoch": 0.4137, + "grad_norm": 8.68925666809082, + "learning_rate": 1.4580290709046065e-05, + "loss": 0.8357, + "step": 8274 + }, + { + "epoch": 0.4138, + "grad_norm": 2.8685567378997803, + "learning_rate": 1.4577187453816702e-05, + "loss": 0.9348, + "step": 8276 + }, + { + "epoch": 0.4139, + "grad_norm": 1.8945503234863281, + "learning_rate": 1.457408364087093e-05, + "loss": 1.3557, + "step": 8278 + }, + { + "epoch": 0.414, + "grad_norm": 7.655246734619141, + "learning_rate": 1.4570979270586944e-05, + "loss": 1.2781, + "step": 8280 + }, + { + "epoch": 0.4141, + "grad_norm": 5.2357282638549805, + "learning_rate": 1.4567874343342996e-05, + "loss": 1.287, + "step": 8282 + }, + { + "epoch": 0.4142, + "grad_norm": 2.722092628479004, + "learning_rate": 1.4564768859517417e-05, + "loss": 0.6975, + "step": 8284 + }, + { + "epoch": 0.4143, + "grad_norm": 5.462501049041748, + "learning_rate": 1.4561662819488597e-05, + "loss": 0.9414, + "step": 8286 + }, + { + "epoch": 0.4144, + "grad_norm": 2.1838033199310303, + "learning_rate": 1.4558556223635004e-05, + "loss": 0.3718, + "step": 8288 + }, + { + "epoch": 0.4145, + "grad_norm": 4.364599227905273, + "learning_rate": 1.4555449072335157e-05, + "loss": 0.6459, + "step": 8290 + }, + { + "epoch": 0.4146, + "grad_norm": 4.333549499511719, + "learning_rate": 1.455234136596766e-05, + "loss": 1.1745, + "step": 8292 + }, + { + "epoch": 0.4147, + "grad_norm": 8.280445098876953, + "learning_rate": 1.4549233104911178e-05, + "loss": 1.0838, + "step": 8294 + }, + { + "epoch": 0.4148, + "grad_norm": 5.75355863571167, + "learning_rate": 1.454612428954444e-05, + "loss": 1.1145, + "step": 8296 + }, + { + "epoch": 0.4149, + "grad_norm": 3.1063051223754883, + "learning_rate": 1.4543014920246248e-05, + "loss": 0.8333, + "step": 8298 + }, + { + "epoch": 0.415, + "grad_norm": 7.420324802398682, + "learning_rate": 1.4539904997395468e-05, + "loss": 0.7962, + "step": 8300 + }, + { + "epoch": 0.4151, + "grad_norm": 3.5512382984161377, + "learning_rate": 1.4536794521371035e-05, + "loss": 0.8999, + "step": 8302 + }, + { + "epoch": 0.4152, + "grad_norm": 4.846319198608398, + "learning_rate": 1.4533683492551954e-05, + "loss": 0.7741, + "step": 8304 + }, + { + "epoch": 0.4153, + "grad_norm": 8.722640991210938, + "learning_rate": 1.453057191131729e-05, + "loss": 1.6045, + "step": 8306 + }, + { + "epoch": 0.4154, + "grad_norm": 4.5917487144470215, + "learning_rate": 1.452745977804618e-05, + "loss": 0.5316, + "step": 8308 + }, + { + "epoch": 0.4155, + "grad_norm": 5.756523609161377, + "learning_rate": 1.4524347093117828e-05, + "loss": 1.4907, + "step": 8310 + }, + { + "epoch": 0.4156, + "grad_norm": 0.7060573101043701, + "learning_rate": 1.4521233856911507e-05, + "loss": 0.7738, + "step": 8312 + }, + { + "epoch": 0.4157, + "grad_norm": 11.252493858337402, + "learning_rate": 1.4518120069806556e-05, + "loss": 0.9152, + "step": 8314 + }, + { + "epoch": 0.4158, + "grad_norm": 2.21128249168396, + "learning_rate": 1.4515005732182384e-05, + "loss": 0.8443, + "step": 8316 + }, + { + "epoch": 0.4159, + "grad_norm": 3.4982972145080566, + "learning_rate": 1.4511890844418453e-05, + "loss": 1.0047, + "step": 8318 + }, + { + "epoch": 0.416, + "grad_norm": 3.796893358230591, + "learning_rate": 1.4508775406894308e-05, + "loss": 1.0824, + "step": 8320 + }, + { + "epoch": 0.4161, + "grad_norm": 4.4723944664001465, + "learning_rate": 1.4505659419989559e-05, + "loss": 0.3352, + "step": 8322 + }, + { + "epoch": 0.4162, + "grad_norm": 3.512035608291626, + "learning_rate": 1.4502542884083876e-05, + "loss": 0.929, + "step": 8324 + }, + { + "epoch": 0.4163, + "grad_norm": 2.8452351093292236, + "learning_rate": 1.4499425799557e-05, + "loss": 1.0247, + "step": 8326 + }, + { + "epoch": 0.4164, + "grad_norm": 3.3311054706573486, + "learning_rate": 1.449630816678874e-05, + "loss": 1.4409, + "step": 8328 + }, + { + "epoch": 0.4165, + "grad_norm": 7.710456371307373, + "learning_rate": 1.4493189986158966e-05, + "loss": 0.4279, + "step": 8330 + }, + { + "epoch": 0.4166, + "grad_norm": 13.193657875061035, + "learning_rate": 1.4490071258047625e-05, + "loss": 1.0713, + "step": 8332 + }, + { + "epoch": 0.4167, + "grad_norm": 3.509657621383667, + "learning_rate": 1.448695198283472e-05, + "loss": 1.2344, + "step": 8334 + }, + { + "epoch": 0.4168, + "grad_norm": 3.9043962955474854, + "learning_rate": 1.4483832160900326e-05, + "loss": 1.5047, + "step": 8336 + }, + { + "epoch": 0.4169, + "grad_norm": 3.9236109256744385, + "learning_rate": 1.448071179262458e-05, + "loss": 0.7544, + "step": 8338 + }, + { + "epoch": 0.417, + "grad_norm": 3.0967047214508057, + "learning_rate": 1.4477590878387697e-05, + "loss": 0.9053, + "step": 8340 + }, + { + "epoch": 0.4171, + "grad_norm": 2.104982614517212, + "learning_rate": 1.4474469418569949e-05, + "loss": 0.6641, + "step": 8342 + }, + { + "epoch": 0.4172, + "grad_norm": 14.461054801940918, + "learning_rate": 1.4471347413551673e-05, + "loss": 1.0456, + "step": 8344 + }, + { + "epoch": 0.4173, + "grad_norm": 4.168720722198486, + "learning_rate": 1.4468224863713278e-05, + "loss": 0.8312, + "step": 8346 + }, + { + "epoch": 0.4174, + "grad_norm": 1.1370717287063599, + "learning_rate": 1.4465101769435235e-05, + "loss": 0.5975, + "step": 8348 + }, + { + "epoch": 0.4175, + "grad_norm": 5.163273811340332, + "learning_rate": 1.4461978131098089e-05, + "loss": 0.3192, + "step": 8350 + }, + { + "epoch": 0.4176, + "grad_norm": 4.014307975769043, + "learning_rate": 1.4458853949082443e-05, + "loss": 1.2591, + "step": 8352 + }, + { + "epoch": 0.4177, + "grad_norm": 9.896376609802246, + "learning_rate": 1.4455729223768966e-05, + "loss": 1.3314, + "step": 8354 + }, + { + "epoch": 0.4178, + "grad_norm": 5.4113450050354, + "learning_rate": 1.4452603955538397e-05, + "loss": 0.7929, + "step": 8356 + }, + { + "epoch": 0.4179, + "grad_norm": 4.914163112640381, + "learning_rate": 1.4449478144771545e-05, + "loss": 1.0111, + "step": 8358 + }, + { + "epoch": 0.418, + "grad_norm": 4.193699359893799, + "learning_rate": 1.4446351791849276e-05, + "loss": 1.323, + "step": 8360 + }, + { + "epoch": 0.4181, + "grad_norm": 6.573829174041748, + "learning_rate": 1.444322489715253e-05, + "loss": 0.9171, + "step": 8362 + }, + { + "epoch": 0.4182, + "grad_norm": 2.79490327835083, + "learning_rate": 1.4440097461062308e-05, + "loss": 1.1983, + "step": 8364 + }, + { + "epoch": 0.4183, + "grad_norm": 4.869973182678223, + "learning_rate": 1.4436969483959677e-05, + "loss": 0.9813, + "step": 8366 + }, + { + "epoch": 0.4184, + "grad_norm": 3.916926145553589, + "learning_rate": 1.4433840966225772e-05, + "loss": 1.0389, + "step": 8368 + }, + { + "epoch": 0.4185, + "grad_norm": 1.5091807842254639, + "learning_rate": 1.4430711908241798e-05, + "loss": 1.0684, + "step": 8370 + }, + { + "epoch": 0.4186, + "grad_norm": 2.681718587875366, + "learning_rate": 1.442758231038902e-05, + "loss": 1.6872, + "step": 8372 + }, + { + "epoch": 0.4187, + "grad_norm": 2.7267751693725586, + "learning_rate": 1.4424452173048763e-05, + "loss": 1.0031, + "step": 8374 + }, + { + "epoch": 0.4188, + "grad_norm": 3.5119457244873047, + "learning_rate": 1.4421321496602428e-05, + "loss": 0.2846, + "step": 8376 + }, + { + "epoch": 0.4189, + "grad_norm": 6.778703212738037, + "learning_rate": 1.4418190281431484e-05, + "loss": 0.7767, + "step": 8378 + }, + { + "epoch": 0.419, + "grad_norm": 2.9281113147735596, + "learning_rate": 1.4415058527917454e-05, + "loss": 1.5116, + "step": 8380 + }, + { + "epoch": 0.4191, + "grad_norm": 4.812743663787842, + "learning_rate": 1.4411926236441935e-05, + "loss": 0.8362, + "step": 8382 + }, + { + "epoch": 0.4192, + "grad_norm": 3.0967414379119873, + "learning_rate": 1.4408793407386587e-05, + "loss": 0.8024, + "step": 8384 + }, + { + "epoch": 0.4193, + "grad_norm": 3.7937607765197754, + "learning_rate": 1.4405660041133133e-05, + "loss": 1.1176, + "step": 8386 + }, + { + "epoch": 0.4194, + "grad_norm": 4.172285079956055, + "learning_rate": 1.4402526138063373e-05, + "loss": 1.4473, + "step": 8388 + }, + { + "epoch": 0.4195, + "grad_norm": 2.897810697555542, + "learning_rate": 1.4399391698559153e-05, + "loss": 0.8642, + "step": 8390 + }, + { + "epoch": 0.4196, + "grad_norm": 4.7754716873168945, + "learning_rate": 1.43962567230024e-05, + "loss": 0.7836, + "step": 8392 + }, + { + "epoch": 0.4197, + "grad_norm": 7.422333717346191, + "learning_rate": 1.4393121211775101e-05, + "loss": 1.1807, + "step": 8394 + }, + { + "epoch": 0.4198, + "grad_norm": 4.130704402923584, + "learning_rate": 1.4389985165259308e-05, + "loss": 1.51, + "step": 8396 + }, + { + "epoch": 0.4199, + "grad_norm": 1.9977779388427734, + "learning_rate": 1.438684858383714e-05, + "loss": 0.7179, + "step": 8398 + }, + { + "epoch": 0.42, + "grad_norm": 6.0517988204956055, + "learning_rate": 1.4383711467890776e-05, + "loss": 0.8285, + "step": 8400 + }, + { + "epoch": 0.4201, + "grad_norm": 3.754455089569092, + "learning_rate": 1.4380573817802467e-05, + "loss": 0.5486, + "step": 8402 + }, + { + "epoch": 0.4202, + "grad_norm": 2.7239229679107666, + "learning_rate": 1.4377435633954528e-05, + "loss": 0.7314, + "step": 8404 + }, + { + "epoch": 0.4203, + "grad_norm": 3.1701152324676514, + "learning_rate": 1.4374296916729335e-05, + "loss": 0.8017, + "step": 8406 + }, + { + "epoch": 0.4204, + "grad_norm": 4.04392671585083, + "learning_rate": 1.437115766650933e-05, + "loss": 0.9643, + "step": 8408 + }, + { + "epoch": 0.4205, + "grad_norm": 4.257327556610107, + "learning_rate": 1.4368017883677024e-05, + "loss": 1.221, + "step": 8410 + }, + { + "epoch": 0.4206, + "grad_norm": 3.3957908153533936, + "learning_rate": 1.436487756861499e-05, + "loss": 0.851, + "step": 8412 + }, + { + "epoch": 0.4207, + "grad_norm": 1.2749783992767334, + "learning_rate": 1.436173672170586e-05, + "loss": 0.7057, + "step": 8414 + }, + { + "epoch": 0.4208, + "grad_norm": 4.415505886077881, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.9114, + "step": 8416 + }, + { + "epoch": 0.4209, + "grad_norm": 3.005147695541382, + "learning_rate": 1.4355453433877204e-05, + "loss": 1.2095, + "step": 8418 + }, + { + "epoch": 0.421, + "grad_norm": 4.0817551612854, + "learning_rate": 1.4352310993723277e-05, + "loss": 0.6496, + "step": 8420 + }, + { + "epoch": 0.4211, + "grad_norm": 3.4803929328918457, + "learning_rate": 1.4349168023253457e-05, + "loss": 1.6964, + "step": 8422 + }, + { + "epoch": 0.4212, + "grad_norm": 5.9342498779296875, + "learning_rate": 1.4346024522850704e-05, + "loss": 1.2349, + "step": 8424 + }, + { + "epoch": 0.4213, + "grad_norm": 3.425287961959839, + "learning_rate": 1.4342880492898048e-05, + "loss": 0.2632, + "step": 8426 + }, + { + "epoch": 0.4214, + "grad_norm": 4.274807929992676, + "learning_rate": 1.4339735933778576e-05, + "loss": 0.3956, + "step": 8428 + }, + { + "epoch": 0.4215, + "grad_norm": 1.5945316553115845, + "learning_rate": 1.4336590845875446e-05, + "loss": 0.6566, + "step": 8430 + }, + { + "epoch": 0.4216, + "grad_norm": 14.431543350219727, + "learning_rate": 1.4333445229571874e-05, + "loss": 1.9128, + "step": 8432 + }, + { + "epoch": 0.4217, + "grad_norm": 0.24811899662017822, + "learning_rate": 1.4330299085251145e-05, + "loss": 0.4207, + "step": 8434 + }, + { + "epoch": 0.4218, + "grad_norm": 3.788613796234131, + "learning_rate": 1.4327152413296607e-05, + "loss": 0.4512, + "step": 8436 + }, + { + "epoch": 0.4219, + "grad_norm": 3.6410014629364014, + "learning_rate": 1.4324005214091676e-05, + "loss": 1.4155, + "step": 8438 + }, + { + "epoch": 0.422, + "grad_norm": 9.09390640258789, + "learning_rate": 1.4320857488019826e-05, + "loss": 0.9978, + "step": 8440 + }, + { + "epoch": 0.4221, + "grad_norm": 6.594457626342773, + "learning_rate": 1.4317709235464595e-05, + "loss": 1.0074, + "step": 8442 + }, + { + "epoch": 0.4222, + "grad_norm": 3.563671588897705, + "learning_rate": 1.4314560456809592e-05, + "loss": 0.7447, + "step": 8444 + }, + { + "epoch": 0.4223, + "grad_norm": 3.4145150184631348, + "learning_rate": 1.4311411152438483e-05, + "loss": 0.8359, + "step": 8446 + }, + { + "epoch": 0.4224, + "grad_norm": 3.348036289215088, + "learning_rate": 1.4308261322735006e-05, + "loss": 0.6788, + "step": 8448 + }, + { + "epoch": 0.4225, + "grad_norm": 5.754849910736084, + "learning_rate": 1.4305110968082953e-05, + "loss": 1.2515, + "step": 8450 + }, + { + "epoch": 0.4226, + "grad_norm": 1.2874023914337158, + "learning_rate": 1.4301960088866187e-05, + "loss": 1.3526, + "step": 8452 + }, + { + "epoch": 0.4227, + "grad_norm": 3.744318723678589, + "learning_rate": 1.4298808685468634e-05, + "loss": 1.0185, + "step": 8454 + }, + { + "epoch": 0.4228, + "grad_norm": 2.083646059036255, + "learning_rate": 1.4295656758274283e-05, + "loss": 1.145, + "step": 8456 + }, + { + "epoch": 0.4229, + "grad_norm": 4.317836761474609, + "learning_rate": 1.4292504307667188e-05, + "loss": 0.6194, + "step": 8458 + }, + { + "epoch": 0.423, + "grad_norm": 6.765156269073486, + "learning_rate": 1.4289351334031461e-05, + "loss": 1.0342, + "step": 8460 + }, + { + "epoch": 0.4231, + "grad_norm": 5.181052207946777, + "learning_rate": 1.4286197837751286e-05, + "loss": 1.3072, + "step": 8462 + }, + { + "epoch": 0.4232, + "grad_norm": 4.770481109619141, + "learning_rate": 1.4283043819210905e-05, + "loss": 0.9279, + "step": 8464 + }, + { + "epoch": 0.4233, + "grad_norm": 1.94223153591156, + "learning_rate": 1.4279889278794627e-05, + "loss": 0.4421, + "step": 8466 + }, + { + "epoch": 0.4234, + "grad_norm": 7.448968410491943, + "learning_rate": 1.4276734216886823e-05, + "loss": 1.4742, + "step": 8468 + }, + { + "epoch": 0.4235, + "grad_norm": 6.577226161956787, + "learning_rate": 1.4273578633871927e-05, + "loss": 1.0326, + "step": 8470 + }, + { + "epoch": 0.4236, + "grad_norm": 2.865485429763794, + "learning_rate": 1.4270422530134433e-05, + "loss": 0.6667, + "step": 8472 + }, + { + "epoch": 0.4237, + "grad_norm": 6.588530540466309, + "learning_rate": 1.4267265906058913e-05, + "loss": 0.8107, + "step": 8474 + }, + { + "epoch": 0.4238, + "grad_norm": 3.8379268646240234, + "learning_rate": 1.4264108762029989e-05, + "loss": 0.8232, + "step": 8476 + }, + { + "epoch": 0.4239, + "grad_norm": 10.840635299682617, + "learning_rate": 1.4260951098432342e-05, + "loss": 1.1452, + "step": 8478 + }, + { + "epoch": 0.424, + "grad_norm": 2.271772623062134, + "learning_rate": 1.4257792915650728e-05, + "loss": 0.7875, + "step": 8480 + }, + { + "epoch": 0.4241, + "grad_norm": 12.735661506652832, + "learning_rate": 1.4254634214069965e-05, + "loss": 1.377, + "step": 8482 + }, + { + "epoch": 0.4242, + "grad_norm": 11.945878982543945, + "learning_rate": 1.4251474994074927e-05, + "loss": 1.1595, + "step": 8484 + }, + { + "epoch": 0.4243, + "grad_norm": 2.9012112617492676, + "learning_rate": 1.4248315256050558e-05, + "loss": 1.2245, + "step": 8486 + }, + { + "epoch": 0.4244, + "grad_norm": 4.203147888183594, + "learning_rate": 1.424515500038186e-05, + "loss": 1.1502, + "step": 8488 + }, + { + "epoch": 0.4245, + "grad_norm": 6.475977897644043, + "learning_rate": 1.4241994227453902e-05, + "loss": 1.1505, + "step": 8490 + }, + { + "epoch": 0.4246, + "grad_norm": 11.731956481933594, + "learning_rate": 1.4238832937651816e-05, + "loss": 1.3378, + "step": 8492 + }, + { + "epoch": 0.4247, + "grad_norm": 11.320242881774902, + "learning_rate": 1.4235671131360797e-05, + "loss": 1.5201, + "step": 8494 + }, + { + "epoch": 0.4248, + "grad_norm": 7.114014625549316, + "learning_rate": 1.4232508808966097e-05, + "loss": 1.3179, + "step": 8496 + }, + { + "epoch": 0.4249, + "grad_norm": 4.004026889801025, + "learning_rate": 1.4229345970853032e-05, + "loss": 1.2178, + "step": 8498 + }, + { + "epoch": 0.425, + "grad_norm": 1.2267347574234009, + "learning_rate": 1.4226182617406996e-05, + "loss": 0.2669, + "step": 8500 + }, + { + "epoch": 0.4251, + "grad_norm": 6.625265598297119, + "learning_rate": 1.4223018749013424e-05, + "loss": 0.5278, + "step": 8502 + }, + { + "epoch": 0.4252, + "grad_norm": 2.471973180770874, + "learning_rate": 1.4219854366057831e-05, + "loss": 0.7857, + "step": 8504 + }, + { + "epoch": 0.4253, + "grad_norm": 4.4495849609375, + "learning_rate": 1.421668946892578e-05, + "loss": 1.3216, + "step": 8506 + }, + { + "epoch": 0.4254, + "grad_norm": 7.9065022468566895, + "learning_rate": 1.421352405800291e-05, + "loss": 0.7322, + "step": 8508 + }, + { + "epoch": 0.4255, + "grad_norm": 4.7814178466796875, + "learning_rate": 1.4210358133674912e-05, + "loss": 1.6004, + "step": 8510 + }, + { + "epoch": 0.4256, + "grad_norm": 6.6815667152404785, + "learning_rate": 1.420719169632755e-05, + "loss": 0.7504, + "step": 8512 + }, + { + "epoch": 0.4257, + "grad_norm": 11.269108772277832, + "learning_rate": 1.4204024746346638e-05, + "loss": 1.2569, + "step": 8514 + }, + { + "epoch": 0.4258, + "grad_norm": 2.924325704574585, + "learning_rate": 1.4200857284118067e-05, + "loss": 0.9154, + "step": 8516 + }, + { + "epoch": 0.4259, + "grad_norm": 2.7585678100585938, + "learning_rate": 1.4197689310027772e-05, + "loss": 0.6384, + "step": 8518 + }, + { + "epoch": 0.426, + "grad_norm": 8.464324951171875, + "learning_rate": 1.4194520824461773e-05, + "loss": 1.6962, + "step": 8520 + }, + { + "epoch": 0.4261, + "grad_norm": 2.7986137866973877, + "learning_rate": 1.4191351827806133e-05, + "loss": 0.5298, + "step": 8522 + }, + { + "epoch": 0.4262, + "grad_norm": 4.068133354187012, + "learning_rate": 1.4188182320446985e-05, + "loss": 0.423, + "step": 8524 + }, + { + "epoch": 0.4263, + "grad_norm": 5.422779083251953, + "learning_rate": 1.418501230277053e-05, + "loss": 1.0662, + "step": 8526 + }, + { + "epoch": 0.4264, + "grad_norm": 1.980682373046875, + "learning_rate": 1.4181841775163014e-05, + "loss": 0.7648, + "step": 8528 + }, + { + "epoch": 0.4265, + "grad_norm": 4.411659240722656, + "learning_rate": 1.4178670738010769e-05, + "loss": 1.2579, + "step": 8530 + }, + { + "epoch": 0.4266, + "grad_norm": 2.627147912979126, + "learning_rate": 1.4175499191700169e-05, + "loss": 0.596, + "step": 8532 + }, + { + "epoch": 0.4267, + "grad_norm": 2.2788949012756348, + "learning_rate": 1.4172327136617656e-05, + "loss": 0.9702, + "step": 8534 + }, + { + "epoch": 0.4268, + "grad_norm": 4.07106876373291, + "learning_rate": 1.4169154573149737e-05, + "loss": 1.1306, + "step": 8536 + }, + { + "epoch": 0.4269, + "grad_norm": 5.160365581512451, + "learning_rate": 1.4165981501682979e-05, + "loss": 0.9762, + "step": 8538 + }, + { + "epoch": 0.427, + "grad_norm": 2.7274138927459717, + "learning_rate": 1.4162807922604014e-05, + "loss": 1.3455, + "step": 8540 + }, + { + "epoch": 0.4271, + "grad_norm": 11.447150230407715, + "learning_rate": 1.415963383629953e-05, + "loss": 1.0466, + "step": 8542 + }, + { + "epoch": 0.4272, + "grad_norm": 3.824586868286133, + "learning_rate": 1.415645924315628e-05, + "loss": 0.8049, + "step": 8544 + }, + { + "epoch": 0.4273, + "grad_norm": 2.3509509563446045, + "learning_rate": 1.4153284143561078e-05, + "loss": 0.8705, + "step": 8546 + }, + { + "epoch": 0.4274, + "grad_norm": 6.841029644012451, + "learning_rate": 1.4150108537900805e-05, + "loss": 1.197, + "step": 8548 + }, + { + "epoch": 0.4275, + "grad_norm": 7.731337547302246, + "learning_rate": 1.4146932426562391e-05, + "loss": 1.104, + "step": 8550 + }, + { + "epoch": 0.4276, + "grad_norm": 1.7644728422164917, + "learning_rate": 1.4143755809932843e-05, + "loss": 0.6631, + "step": 8552 + }, + { + "epoch": 0.4277, + "grad_norm": 4.239624500274658, + "learning_rate": 1.4140578688399217e-05, + "loss": 1.2007, + "step": 8554 + }, + { + "epoch": 0.4278, + "grad_norm": 2.8557376861572266, + "learning_rate": 1.4137401062348639e-05, + "loss": 1.0141, + "step": 8556 + }, + { + "epoch": 0.4279, + "grad_norm": 4.626419544219971, + "learning_rate": 1.4134222932168291e-05, + "loss": 0.5996, + "step": 8558 + }, + { + "epoch": 0.428, + "grad_norm": 5.216811656951904, + "learning_rate": 1.413104429824542e-05, + "loss": 2.2711, + "step": 8560 + }, + { + "epoch": 0.4281, + "grad_norm": 2.969954252243042, + "learning_rate": 1.412786516096733e-05, + "loss": 0.8815, + "step": 8562 + }, + { + "epoch": 0.4282, + "grad_norm": 7.7451043128967285, + "learning_rate": 1.4124685520721393e-05, + "loss": 1.1016, + "step": 8564 + }, + { + "epoch": 0.4283, + "grad_norm": 3.486236572265625, + "learning_rate": 1.4121505377895037e-05, + "loss": 1.1471, + "step": 8566 + }, + { + "epoch": 0.4284, + "grad_norm": 3.8570878505706787, + "learning_rate": 1.411832473287575e-05, + "loss": 1.2658, + "step": 8568 + }, + { + "epoch": 0.4285, + "grad_norm": 4.668900489807129, + "learning_rate": 1.411514358605109e-05, + "loss": 0.666, + "step": 8570 + }, + { + "epoch": 0.4286, + "grad_norm": 2.0830585956573486, + "learning_rate": 1.4111961937808665e-05, + "loss": 0.6884, + "step": 8572 + }, + { + "epoch": 0.4287, + "grad_norm": 6.874061584472656, + "learning_rate": 1.4108779788536151e-05, + "loss": 0.7072, + "step": 8574 + }, + { + "epoch": 0.4288, + "grad_norm": 5.113643646240234, + "learning_rate": 1.4105597138621281e-05, + "loss": 1.2423, + "step": 8576 + }, + { + "epoch": 0.4289, + "grad_norm": 8.804728507995605, + "learning_rate": 1.4102413988451855e-05, + "loss": 0.7458, + "step": 8578 + }, + { + "epoch": 0.429, + "grad_norm": 2.9846396446228027, + "learning_rate": 1.4099230338415728e-05, + "loss": 0.8291, + "step": 8580 + }, + { + "epoch": 0.4291, + "grad_norm": 3.4401566982269287, + "learning_rate": 1.4096046188900823e-05, + "loss": 1.1621, + "step": 8582 + }, + { + "epoch": 0.4292, + "grad_norm": 4.714600563049316, + "learning_rate": 1.4092861540295109e-05, + "loss": 0.7512, + "step": 8584 + }, + { + "epoch": 0.4293, + "grad_norm": 4.616362571716309, + "learning_rate": 1.408967639298663e-05, + "loss": 1.0086, + "step": 8586 + }, + { + "epoch": 0.4294, + "grad_norm": 4.222672462463379, + "learning_rate": 1.4086490747363492e-05, + "loss": 0.9729, + "step": 8588 + }, + { + "epoch": 0.4295, + "grad_norm": 4.053589820861816, + "learning_rate": 1.408330460381385e-05, + "loss": 0.7398, + "step": 8590 + }, + { + "epoch": 0.4296, + "grad_norm": 3.7039597034454346, + "learning_rate": 1.4080117962725929e-05, + "loss": 1.3398, + "step": 8592 + }, + { + "epoch": 0.4297, + "grad_norm": 5.6531453132629395, + "learning_rate": 1.407693082448801e-05, + "loss": 0.7289, + "step": 8594 + }, + { + "epoch": 0.4298, + "grad_norm": 10.281307220458984, + "learning_rate": 1.4073743189488436e-05, + "loss": 1.8324, + "step": 8596 + }, + { + "epoch": 0.4299, + "grad_norm": 4.521692752838135, + "learning_rate": 1.4070555058115614e-05, + "loss": 1.1894, + "step": 8598 + }, + { + "epoch": 0.43, + "grad_norm": 2.4490256309509277, + "learning_rate": 1.4067366430758004e-05, + "loss": 0.5159, + "step": 8600 + }, + { + "epoch": 0.4301, + "grad_norm": 5.724756717681885, + "learning_rate": 1.4064177307804135e-05, + "loss": 0.7042, + "step": 8602 + }, + { + "epoch": 0.4302, + "grad_norm": 2.195220947265625, + "learning_rate": 1.4060987689642581e-05, + "loss": 0.9553, + "step": 8604 + }, + { + "epoch": 0.4303, + "grad_norm": 7.151241779327393, + "learning_rate": 1.4057797576662e-05, + "loss": 0.5606, + "step": 8606 + }, + { + "epoch": 0.4304, + "grad_norm": 2.3086371421813965, + "learning_rate": 1.4054606969251095e-05, + "loss": 1.2799, + "step": 8608 + }, + { + "epoch": 0.4305, + "grad_norm": 8.359182357788086, + "learning_rate": 1.4051415867798627e-05, + "loss": 0.5138, + "step": 8610 + }, + { + "epoch": 0.4306, + "grad_norm": 6.232611656188965, + "learning_rate": 1.4048224272693426e-05, + "loss": 0.8202, + "step": 8612 + }, + { + "epoch": 0.4307, + "grad_norm": 3.3881163597106934, + "learning_rate": 1.4045032184324374e-05, + "loss": 1.2238, + "step": 8614 + }, + { + "epoch": 0.4308, + "grad_norm": 3.7149922847747803, + "learning_rate": 1.4041839603080423e-05, + "loss": 0.8884, + "step": 8616 + }, + { + "epoch": 0.4309, + "grad_norm": 3.8741374015808105, + "learning_rate": 1.403864652935058e-05, + "loss": 1.274, + "step": 8618 + }, + { + "epoch": 0.431, + "grad_norm": 4.08222770690918, + "learning_rate": 1.4035452963523903e-05, + "loss": 1.0608, + "step": 8620 + }, + { + "epoch": 0.4311, + "grad_norm": 2.7519962787628174, + "learning_rate": 1.4032258905989521e-05, + "loss": 2.5808, + "step": 8622 + }, + { + "epoch": 0.4312, + "grad_norm": 9.551321983337402, + "learning_rate": 1.4029064357136628e-05, + "loss": 0.8197, + "step": 8624 + }, + { + "epoch": 0.4313, + "grad_norm": 3.5530545711517334, + "learning_rate": 1.402586931735446e-05, + "loss": 0.9939, + "step": 8626 + }, + { + "epoch": 0.4314, + "grad_norm": 5.426881313323975, + "learning_rate": 1.4022673787032333e-05, + "loss": 1.2452, + "step": 8628 + }, + { + "epoch": 0.4315, + "grad_norm": 3.213778495788574, + "learning_rate": 1.4019477766559604e-05, + "loss": 1.2506, + "step": 8630 + }, + { + "epoch": 0.4316, + "grad_norm": 6.132962226867676, + "learning_rate": 1.4016281256325702e-05, + "loss": 1.4776, + "step": 8632 + }, + { + "epoch": 0.4317, + "grad_norm": 4.63603401184082, + "learning_rate": 1.4013084256720109e-05, + "loss": 1.1103, + "step": 8634 + }, + { + "epoch": 0.4318, + "grad_norm": 3.2628676891326904, + "learning_rate": 1.4009886768132375e-05, + "loss": 1.0815, + "step": 8636 + }, + { + "epoch": 0.4319, + "grad_norm": 2.856783151626587, + "learning_rate": 1.4006688790952102e-05, + "loss": 1.0886, + "step": 8638 + }, + { + "epoch": 0.432, + "grad_norm": 2.738518238067627, + "learning_rate": 1.4003490325568953e-05, + "loss": 0.3579, + "step": 8640 + }, + { + "epoch": 0.4321, + "grad_norm": 2.4926414489746094, + "learning_rate": 1.4000291372372647e-05, + "loss": 1.3729, + "step": 8642 + }, + { + "epoch": 0.4322, + "grad_norm": 7.230648994445801, + "learning_rate": 1.3997091931752978e-05, + "loss": 1.0081, + "step": 8644 + }, + { + "epoch": 0.4323, + "grad_norm": 2.168767213821411, + "learning_rate": 1.3993892004099778e-05, + "loss": 0.9805, + "step": 8646 + }, + { + "epoch": 0.4324, + "grad_norm": 3.286931276321411, + "learning_rate": 1.3990691589802955e-05, + "loss": 1.19, + "step": 8648 + }, + { + "epoch": 0.4325, + "grad_norm": 3.247859477996826, + "learning_rate": 1.3987490689252463e-05, + "loss": 0.7972, + "step": 8650 + }, + { + "epoch": 0.4326, + "grad_norm": 2.6479387283325195, + "learning_rate": 1.3984289302838327e-05, + "loss": 0.7955, + "step": 8652 + }, + { + "epoch": 0.4327, + "grad_norm": 4.704957485198975, + "learning_rate": 1.3981087430950628e-05, + "loss": 0.8829, + "step": 8654 + }, + { + "epoch": 0.4328, + "grad_norm": 13.064567565917969, + "learning_rate": 1.39778850739795e-05, + "loss": 0.999, + "step": 8656 + }, + { + "epoch": 0.4329, + "grad_norm": 3.284128427505493, + "learning_rate": 1.3974682232315141e-05, + "loss": 0.9784, + "step": 8658 + }, + { + "epoch": 0.433, + "grad_norm": 3.350231170654297, + "learning_rate": 1.3971478906347806e-05, + "loss": 0.4804, + "step": 8660 + }, + { + "epoch": 0.4331, + "grad_norm": 5.437411308288574, + "learning_rate": 1.3968275096467818e-05, + "loss": 1.2457, + "step": 8662 + }, + { + "epoch": 0.4332, + "grad_norm": 2.875149726867676, + "learning_rate": 1.3965070803065543e-05, + "loss": 1.3746, + "step": 8664 + }, + { + "epoch": 0.4333, + "grad_norm": 4.799482822418213, + "learning_rate": 1.396186602653142e-05, + "loss": 1.1015, + "step": 8666 + }, + { + "epoch": 0.4334, + "grad_norm": 3.79795241355896, + "learning_rate": 1.3958660767255938e-05, + "loss": 0.6225, + "step": 8668 + }, + { + "epoch": 0.4335, + "grad_norm": 6.2611165046691895, + "learning_rate": 1.3955455025629652e-05, + "loss": 1.1533, + "step": 8670 + }, + { + "epoch": 0.4336, + "grad_norm": 2.639647960662842, + "learning_rate": 1.3952248802043166e-05, + "loss": 0.7838, + "step": 8672 + }, + { + "epoch": 0.4337, + "grad_norm": 1.361344337463379, + "learning_rate": 1.3949042096887154e-05, + "loss": 1.0859, + "step": 8674 + }, + { + "epoch": 0.4338, + "grad_norm": 2.664177179336548, + "learning_rate": 1.394583491055234e-05, + "loss": 1.3652, + "step": 8676 + }, + { + "epoch": 0.4339, + "grad_norm": 2.7832298278808594, + "learning_rate": 1.3942627243429512e-05, + "loss": 1.4018, + "step": 8678 + }, + { + "epoch": 0.434, + "grad_norm": 4.1976399421691895, + "learning_rate": 1.3939419095909513e-05, + "loss": 0.4712, + "step": 8680 + }, + { + "epoch": 0.4341, + "grad_norm": 12.397087097167969, + "learning_rate": 1.3936210468383246e-05, + "loss": 0.5249, + "step": 8682 + }, + { + "epoch": 0.4342, + "grad_norm": 7.355434417724609, + "learning_rate": 1.3933001361241674e-05, + "loss": 0.9018, + "step": 8684 + }, + { + "epoch": 0.4343, + "grad_norm": 4.063648700714111, + "learning_rate": 1.3929791774875817e-05, + "loss": 1.1373, + "step": 8686 + }, + { + "epoch": 0.4344, + "grad_norm": 2.465554714202881, + "learning_rate": 1.3926581709676752e-05, + "loss": 0.7054, + "step": 8688 + }, + { + "epoch": 0.4345, + "grad_norm": 3.1204628944396973, + "learning_rate": 1.3923371166035615e-05, + "loss": 0.4956, + "step": 8690 + }, + { + "epoch": 0.4346, + "grad_norm": 3.6058409214019775, + "learning_rate": 1.3920160144343604e-05, + "loss": 0.4024, + "step": 8692 + }, + { + "epoch": 0.4347, + "grad_norm": 3.2766363620758057, + "learning_rate": 1.3916948644991969e-05, + "loss": 0.6335, + "step": 8694 + }, + { + "epoch": 0.4348, + "grad_norm": 4.997308254241943, + "learning_rate": 1.3913736668372027e-05, + "loss": 1.1985, + "step": 8696 + }, + { + "epoch": 0.4349, + "grad_norm": 3.7991085052490234, + "learning_rate": 1.391052421487514e-05, + "loss": 0.707, + "step": 8698 + }, + { + "epoch": 0.435, + "grad_norm": 13.959881782531738, + "learning_rate": 1.3907311284892737e-05, + "loss": 1.3635, + "step": 8700 + }, + { + "epoch": 0.4351, + "grad_norm": 4.451152324676514, + "learning_rate": 1.3904097878816312e-05, + "loss": 1.4837, + "step": 8702 + }, + { + "epoch": 0.4352, + "grad_norm": 2.246303081512451, + "learning_rate": 1.3900883997037398e-05, + "loss": 1.255, + "step": 8704 + }, + { + "epoch": 0.4353, + "grad_norm": 4.80082893371582, + "learning_rate": 1.3897669639947606e-05, + "loss": 0.8882, + "step": 8706 + }, + { + "epoch": 0.4354, + "grad_norm": 20.29956817626953, + "learning_rate": 1.3894454807938587e-05, + "loss": 1.1365, + "step": 8708 + }, + { + "epoch": 0.4355, + "grad_norm": 3.9036712646484375, + "learning_rate": 1.3891239501402063e-05, + "loss": 0.6507, + "step": 8710 + }, + { + "epoch": 0.4356, + "grad_norm": 5.176523685455322, + "learning_rate": 1.388802372072981e-05, + "loss": 0.8131, + "step": 8712 + }, + { + "epoch": 0.4357, + "grad_norm": 5.464172840118408, + "learning_rate": 1.3884807466313664e-05, + "loss": 1.056, + "step": 8714 + }, + { + "epoch": 0.4358, + "grad_norm": 2.3601670265197754, + "learning_rate": 1.3881590738545508e-05, + "loss": 1.2427, + "step": 8716 + }, + { + "epoch": 0.4359, + "grad_norm": 4.603447914123535, + "learning_rate": 1.3878373537817294e-05, + "loss": 0.7688, + "step": 8718 + }, + { + "epoch": 0.436, + "grad_norm": 2.797574996948242, + "learning_rate": 1.3875155864521031e-05, + "loss": 0.4274, + "step": 8720 + }, + { + "epoch": 0.4361, + "grad_norm": 6.1515092849731445, + "learning_rate": 1.3871937719048777e-05, + "loss": 2.8491, + "step": 8722 + }, + { + "epoch": 0.4362, + "grad_norm": 5.585160255432129, + "learning_rate": 1.3868719101792664e-05, + "loss": 0.8375, + "step": 8724 + }, + { + "epoch": 0.4363, + "grad_norm": 2.9331390857696533, + "learning_rate": 1.3865500013144857e-05, + "loss": 0.7634, + "step": 8726 + }, + { + "epoch": 0.4364, + "grad_norm": 12.491473197937012, + "learning_rate": 1.3862280453497601e-05, + "loss": 1.6746, + "step": 8728 + }, + { + "epoch": 0.4365, + "grad_norm": 0.8673524856567383, + "learning_rate": 1.3859060423243187e-05, + "loss": 0.539, + "step": 8730 + }, + { + "epoch": 0.4366, + "grad_norm": 4.379856109619141, + "learning_rate": 1.3855839922773968e-05, + "loss": 1.2445, + "step": 8732 + }, + { + "epoch": 0.4367, + "grad_norm": 0.7519943714141846, + "learning_rate": 1.3852618952482348e-05, + "loss": 0.434, + "step": 8734 + }, + { + "epoch": 0.4368, + "grad_norm": 0.5222268104553223, + "learning_rate": 1.3849397512760797e-05, + "loss": 0.5632, + "step": 8736 + }, + { + "epoch": 0.4369, + "grad_norm": 4.238522052764893, + "learning_rate": 1.3846175604001832e-05, + "loss": 0.5707, + "step": 8738 + }, + { + "epoch": 0.437, + "grad_norm": 6.885987281799316, + "learning_rate": 1.3842953226598036e-05, + "loss": 1.3053, + "step": 8740 + }, + { + "epoch": 0.4371, + "grad_norm": 2.17354416847229, + "learning_rate": 1.3839730380942054e-05, + "loss": 1.8137, + "step": 8742 + }, + { + "epoch": 0.4372, + "grad_norm": 3.6123414039611816, + "learning_rate": 1.3836507067426565e-05, + "loss": 0.6384, + "step": 8744 + }, + { + "epoch": 0.4373, + "grad_norm": 7.243729114532471, + "learning_rate": 1.3833283286444327e-05, + "loss": 1.0795, + "step": 8746 + }, + { + "epoch": 0.4374, + "grad_norm": 4.680215835571289, + "learning_rate": 1.3830059038388153e-05, + "loss": 0.5289, + "step": 8748 + }, + { + "epoch": 0.4375, + "grad_norm": 4.424304008483887, + "learning_rate": 1.3826834323650899e-05, + "loss": 0.8083, + "step": 8750 + }, + { + "epoch": 0.4376, + "grad_norm": 9.426819801330566, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.5037, + "step": 8752 + }, + { + "epoch": 0.4377, + "grad_norm": 3.8694703578948975, + "learning_rate": 1.382038349570491e-05, + "loss": 1.1337, + "step": 8754 + }, + { + "epoch": 0.4378, + "grad_norm": 9.139664649963379, + "learning_rate": 1.3817157383282184e-05, + "loss": 1.0079, + "step": 8756 + }, + { + "epoch": 0.4379, + "grad_norm": 6.114081859588623, + "learning_rate": 1.3813930805750413e-05, + "loss": 1.5096, + "step": 8758 + }, + { + "epoch": 0.438, + "grad_norm": 5.288512706756592, + "learning_rate": 1.3810703763502744e-05, + "loss": 1.3151, + "step": 8760 + }, + { + "epoch": 0.4381, + "grad_norm": 2.9206137657165527, + "learning_rate": 1.3807476256932378e-05, + "loss": 0.9841, + "step": 8762 + }, + { + "epoch": 0.4382, + "grad_norm": 1.001036286354065, + "learning_rate": 1.3804248286432577e-05, + "loss": 0.7048, + "step": 8764 + }, + { + "epoch": 0.4383, + "grad_norm": 8.24112319946289, + "learning_rate": 1.3801019852396664e-05, + "loss": 0.8052, + "step": 8766 + }, + { + "epoch": 0.4384, + "grad_norm": 8.120810508728027, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.825, + "step": 8768 + }, + { + "epoch": 0.4385, + "grad_norm": 6.50949764251709, + "learning_rate": 1.3794561595290053e-05, + "loss": 0.9177, + "step": 8770 + }, + { + "epoch": 0.4386, + "grad_norm": 4.758340835571289, + "learning_rate": 1.3791331773006272e-05, + "loss": 0.337, + "step": 8772 + }, + { + "epoch": 0.4387, + "grad_norm": 3.01383376121521, + "learning_rate": 1.3788101488760215e-05, + "loss": 1.1951, + "step": 8774 + }, + { + "epoch": 0.4388, + "grad_norm": 4.922779560089111, + "learning_rate": 1.3784870742945482e-05, + "loss": 1.3677, + "step": 8776 + }, + { + "epoch": 0.4389, + "grad_norm": 7.281414031982422, + "learning_rate": 1.378163953595573e-05, + "loss": 1.1303, + "step": 8778 + }, + { + "epoch": 0.439, + "grad_norm": 4.075009822845459, + "learning_rate": 1.3778407868184674e-05, + "loss": 1.5918, + "step": 8780 + }, + { + "epoch": 0.4391, + "grad_norm": 2.9662721157073975, + "learning_rate": 1.3775175740026079e-05, + "loss": 1.2557, + "step": 8782 + }, + { + "epoch": 0.4392, + "grad_norm": 4.2612457275390625, + "learning_rate": 1.3771943151873768e-05, + "loss": 0.5399, + "step": 8784 + }, + { + "epoch": 0.4393, + "grad_norm": 2.399397850036621, + "learning_rate": 1.3768710104121628e-05, + "loss": 0.8862, + "step": 8786 + }, + { + "epoch": 0.4394, + "grad_norm": 4.385271072387695, + "learning_rate": 1.3765476597163595e-05, + "loss": 0.7644, + "step": 8788 + }, + { + "epoch": 0.4395, + "grad_norm": 22.027324676513672, + "learning_rate": 1.3762242631393656e-05, + "loss": 1.8003, + "step": 8790 + }, + { + "epoch": 0.4396, + "grad_norm": 9.018767356872559, + "learning_rate": 1.3759008207205869e-05, + "loss": 1.2715, + "step": 8792 + }, + { + "epoch": 0.4397, + "grad_norm": 4.890587329864502, + "learning_rate": 1.375577332499433e-05, + "loss": 1.1171, + "step": 8794 + }, + { + "epoch": 0.4398, + "grad_norm": 3.321429967880249, + "learning_rate": 1.375253798515321e-05, + "loss": 0.8182, + "step": 8796 + }, + { + "epoch": 0.4399, + "grad_norm": 14.012134552001953, + "learning_rate": 1.3749302188076716e-05, + "loss": 1.7279, + "step": 8798 + }, + { + "epoch": 0.44, + "grad_norm": 6.232757568359375, + "learning_rate": 1.3746065934159123e-05, + "loss": 1.0855, + "step": 8800 + }, + { + "epoch": 0.4401, + "grad_norm": 5.734096527099609, + "learning_rate": 1.374282922379476e-05, + "loss": 0.5708, + "step": 8802 + }, + { + "epoch": 0.4402, + "grad_norm": 3.529435157775879, + "learning_rate": 1.3739592057378005e-05, + "loss": 1.0306, + "step": 8804 + }, + { + "epoch": 0.4403, + "grad_norm": 4.622191429138184, + "learning_rate": 1.3736354435303305e-05, + "loss": 0.9453, + "step": 8806 + }, + { + "epoch": 0.4404, + "grad_norm": 3.7577075958251953, + "learning_rate": 1.373311635796515e-05, + "loss": 0.8752, + "step": 8808 + }, + { + "epoch": 0.4405, + "grad_norm": 5.436574459075928, + "learning_rate": 1.3729877825758091e-05, + "loss": 0.5835, + "step": 8810 + }, + { + "epoch": 0.4406, + "grad_norm": 2.7657830715179443, + "learning_rate": 1.3726638839076732e-05, + "loss": 0.9688, + "step": 8812 + }, + { + "epoch": 0.4407, + "grad_norm": 4.198677062988281, + "learning_rate": 1.3723399398315736e-05, + "loss": 1.1918, + "step": 8814 + }, + { + "epoch": 0.4408, + "grad_norm": 13.176215171813965, + "learning_rate": 1.3720159503869816e-05, + "loss": 1.4636, + "step": 8816 + }, + { + "epoch": 0.4409, + "grad_norm": 6.985131740570068, + "learning_rate": 1.3716919156133745e-05, + "loss": 1.338, + "step": 8818 + }, + { + "epoch": 0.441, + "grad_norm": 6.273284435272217, + "learning_rate": 1.371367835550235e-05, + "loss": 2.0201, + "step": 8820 + }, + { + "epoch": 0.4411, + "grad_norm": 2.1440186500549316, + "learning_rate": 1.3710437102370511e-05, + "loss": 1.4122, + "step": 8822 + }, + { + "epoch": 0.4412, + "grad_norm": 2.637079954147339, + "learning_rate": 1.3707195397133165e-05, + "loss": 1.611, + "step": 8824 + }, + { + "epoch": 0.4413, + "grad_norm": 3.370297431945801, + "learning_rate": 1.370395324018531e-05, + "loss": 0.7524, + "step": 8826 + }, + { + "epoch": 0.4414, + "grad_norm": 8.126354217529297, + "learning_rate": 1.3700710631921984e-05, + "loss": 1.3307, + "step": 8828 + }, + { + "epoch": 0.4415, + "grad_norm": 1.8157912492752075, + "learning_rate": 1.3697467572738294e-05, + "loss": 0.6036, + "step": 8830 + }, + { + "epoch": 0.4416, + "grad_norm": 1.5106494426727295, + "learning_rate": 1.3694224063029396e-05, + "loss": 1.1658, + "step": 8832 + }, + { + "epoch": 0.4417, + "grad_norm": 1.9297130107879639, + "learning_rate": 1.3690980103190504e-05, + "loss": 0.4904, + "step": 8834 + }, + { + "epoch": 0.4418, + "grad_norm": 3.109227180480957, + "learning_rate": 1.3687735693616876e-05, + "loss": 1.1992, + "step": 8836 + }, + { + "epoch": 0.4419, + "grad_norm": 4.1193037033081055, + "learning_rate": 1.3684490834703846e-05, + "loss": 0.9273, + "step": 8838 + }, + { + "epoch": 0.442, + "grad_norm": 4.610208988189697, + "learning_rate": 1.3681245526846782e-05, + "loss": 1.1884, + "step": 8840 + }, + { + "epoch": 0.4421, + "grad_norm": 7.989590167999268, + "learning_rate": 1.3677999770441115e-05, + "loss": 1.6688, + "step": 8842 + }, + { + "epoch": 0.4422, + "grad_norm": 2.770864963531494, + "learning_rate": 1.3674753565882336e-05, + "loss": 0.8831, + "step": 8844 + }, + { + "epoch": 0.4423, + "grad_norm": 8.969026565551758, + "learning_rate": 1.367150691356598e-05, + "loss": 0.8346, + "step": 8846 + }, + { + "epoch": 0.4424, + "grad_norm": 4.980639457702637, + "learning_rate": 1.3668259813887644e-05, + "loss": 0.5744, + "step": 8848 + }, + { + "epoch": 0.4425, + "grad_norm": 3.9965028762817383, + "learning_rate": 1.3665012267242974e-05, + "loss": 0.7092, + "step": 8850 + }, + { + "epoch": 0.4426, + "grad_norm": 2.5314321517944336, + "learning_rate": 1.3661764274027678e-05, + "loss": 0.556, + "step": 8852 + }, + { + "epoch": 0.4427, + "grad_norm": 2.2644214630126953, + "learning_rate": 1.3658515834637514e-05, + "loss": 0.6489, + "step": 8854 + }, + { + "epoch": 0.4428, + "grad_norm": 3.6010520458221436, + "learning_rate": 1.365526694946829e-05, + "loss": 1.1185, + "step": 8856 + }, + { + "epoch": 0.4429, + "grad_norm": 3.2159087657928467, + "learning_rate": 1.365201761891588e-05, + "loss": 1.1743, + "step": 8858 + }, + { + "epoch": 0.443, + "grad_norm": 1.7984013557434082, + "learning_rate": 1.3648767843376196e-05, + "loss": 1.0201, + "step": 8860 + }, + { + "epoch": 0.4431, + "grad_norm": 2.892404317855835, + "learning_rate": 1.3645517623245221e-05, + "loss": 1.4615, + "step": 8862 + }, + { + "epoch": 0.4432, + "grad_norm": 1.216408133506775, + "learning_rate": 1.3642266958918985e-05, + "loss": 0.1662, + "step": 8864 + }, + { + "epoch": 0.4433, + "grad_norm": 11.915616035461426, + "learning_rate": 1.3639015850793564e-05, + "loss": 0.6292, + "step": 8866 + }, + { + "epoch": 0.4434, + "grad_norm": 4.896627426147461, + "learning_rate": 1.36357642992651e-05, + "loss": 1.3299, + "step": 8868 + }, + { + "epoch": 0.4435, + "grad_norm": 4.0071024894714355, + "learning_rate": 1.3632512304729786e-05, + "loss": 0.8925, + "step": 8870 + }, + { + "epoch": 0.4436, + "grad_norm": 1.6942874193191528, + "learning_rate": 1.3629259867583864e-05, + "loss": 0.4062, + "step": 8872 + }, + { + "epoch": 0.4437, + "grad_norm": 9.123302459716797, + "learning_rate": 1.3626006988223637e-05, + "loss": 1.1461, + "step": 8874 + }, + { + "epoch": 0.4438, + "grad_norm": 0.24884067475795746, + "learning_rate": 1.3622753667045459e-05, + "loss": 0.0106, + "step": 8876 + }, + { + "epoch": 0.4439, + "grad_norm": 2.250575065612793, + "learning_rate": 1.3619499904445734e-05, + "loss": 1.1171, + "step": 8878 + }, + { + "epoch": 0.444, + "grad_norm": 2.325070858001709, + "learning_rate": 1.3616245700820922e-05, + "loss": 1.2243, + "step": 8880 + }, + { + "epoch": 0.4441, + "grad_norm": 3.8015127182006836, + "learning_rate": 1.3612991056567544e-05, + "loss": 0.9341, + "step": 8882 + }, + { + "epoch": 0.4442, + "grad_norm": 5.4707417488098145, + "learning_rate": 1.3609735972082168e-05, + "loss": 0.8727, + "step": 8884 + }, + { + "epoch": 0.4443, + "grad_norm": 4.751550197601318, + "learning_rate": 1.360648044776141e-05, + "loss": 0.6122, + "step": 8886 + }, + { + "epoch": 0.4444, + "grad_norm": 4.4850897789001465, + "learning_rate": 1.3603224484001949e-05, + "loss": 0.9919, + "step": 8888 + }, + { + "epoch": 0.4445, + "grad_norm": 3.056692600250244, + "learning_rate": 1.3599968081200515e-05, + "loss": 0.8937, + "step": 8890 + }, + { + "epoch": 0.4446, + "grad_norm": 8.799762725830078, + "learning_rate": 1.3596711239753889e-05, + "loss": 0.9879, + "step": 8892 + }, + { + "epoch": 0.4447, + "grad_norm": 3.891378402709961, + "learning_rate": 1.3593453960058909e-05, + "loss": 0.3692, + "step": 8894 + }, + { + "epoch": 0.4448, + "grad_norm": 4.697097301483154, + "learning_rate": 1.3590196242512463e-05, + "loss": 0.8518, + "step": 8896 + }, + { + "epoch": 0.4449, + "grad_norm": 12.67669677734375, + "learning_rate": 1.3586938087511494e-05, + "loss": 1.5525, + "step": 8898 + }, + { + "epoch": 0.445, + "grad_norm": 2.5614192485809326, + "learning_rate": 1.3583679495453e-05, + "loss": 0.5186, + "step": 8900 + }, + { + "epoch": 0.4451, + "grad_norm": 7.120272636413574, + "learning_rate": 1.3580420466734037e-05, + "loss": 0.9505, + "step": 8902 + }, + { + "epoch": 0.4452, + "grad_norm": 4.600688934326172, + "learning_rate": 1.3577161001751696e-05, + "loss": 1.5377, + "step": 8904 + }, + { + "epoch": 0.4453, + "grad_norm": 3.758892297744751, + "learning_rate": 1.3573901100903135e-05, + "loss": 1.4156, + "step": 8906 + }, + { + "epoch": 0.4454, + "grad_norm": 5.204721927642822, + "learning_rate": 1.3570640764585567e-05, + "loss": 1.2205, + "step": 8908 + }, + { + "epoch": 0.4455, + "grad_norm": 5.070221900939941, + "learning_rate": 1.3567379993196252e-05, + "loss": 1.0764, + "step": 8910 + }, + { + "epoch": 0.4456, + "grad_norm": 2.9338934421539307, + "learning_rate": 1.3564118787132507e-05, + "loss": 1.2537, + "step": 8912 + }, + { + "epoch": 0.4457, + "grad_norm": 4.174190521240234, + "learning_rate": 1.3560857146791699e-05, + "loss": 1.2491, + "step": 8914 + }, + { + "epoch": 0.4458, + "grad_norm": 2.1147477626800537, + "learning_rate": 1.355759507257125e-05, + "loss": 0.7688, + "step": 8916 + }, + { + "epoch": 0.4459, + "grad_norm": 1.5343540906906128, + "learning_rate": 1.3554332564868631e-05, + "loss": 0.2107, + "step": 8918 + }, + { + "epoch": 0.446, + "grad_norm": 7.207485198974609, + "learning_rate": 1.3551069624081372e-05, + "loss": 0.9987, + "step": 8920 + }, + { + "epoch": 0.4461, + "grad_norm": 2.110679864883423, + "learning_rate": 1.354780625060705e-05, + "loss": 0.9323, + "step": 8922 + }, + { + "epoch": 0.4462, + "grad_norm": 4.696387767791748, + "learning_rate": 1.3544542444843298e-05, + "loss": 0.5785, + "step": 8924 + }, + { + "epoch": 0.4463, + "grad_norm": 5.142842769622803, + "learning_rate": 1.35412782071878e-05, + "loss": 1.5633, + "step": 8926 + }, + { + "epoch": 0.4464, + "grad_norm": 8.174338340759277, + "learning_rate": 1.3538013538038295e-05, + "loss": 1.854, + "step": 8928 + }, + { + "epoch": 0.4465, + "grad_norm": 4.184696197509766, + "learning_rate": 1.3534748437792573e-05, + "loss": 1.2091, + "step": 8930 + }, + { + "epoch": 0.4466, + "grad_norm": 2.6092138290405273, + "learning_rate": 1.3531482906848474e-05, + "loss": 0.4883, + "step": 8932 + }, + { + "epoch": 0.4467, + "grad_norm": 7.908763408660889, + "learning_rate": 1.3528216945603898e-05, + "loss": 0.7365, + "step": 8934 + }, + { + "epoch": 0.4468, + "grad_norm": 1.997949242591858, + "learning_rate": 1.3524950554456786e-05, + "loss": 0.9136, + "step": 8936 + }, + { + "epoch": 0.4469, + "grad_norm": 3.458526849746704, + "learning_rate": 1.3521683733805144e-05, + "loss": 1.1679, + "step": 8938 + }, + { + "epoch": 0.447, + "grad_norm": 2.6334316730499268, + "learning_rate": 1.3518416484047018e-05, + "loss": 0.8799, + "step": 8940 + }, + { + "epoch": 0.4471, + "grad_norm": 21.514562606811523, + "learning_rate": 1.351514880558052e-05, + "loss": 1.7247, + "step": 8942 + }, + { + "epoch": 0.4472, + "grad_norm": 12.708584785461426, + "learning_rate": 1.3511880698803801e-05, + "loss": 1.3104, + "step": 8944 + }, + { + "epoch": 0.4473, + "grad_norm": 3.9041569232940674, + "learning_rate": 1.3508612164115069e-05, + "loss": 1.4388, + "step": 8946 + }, + { + "epoch": 0.4474, + "grad_norm": 7.09930944442749, + "learning_rate": 1.350534320191259e-05, + "loss": 1.1956, + "step": 8948 + }, + { + "epoch": 0.4475, + "grad_norm": 7.577471733093262, + "learning_rate": 1.3502073812594677e-05, + "loss": 0.7755, + "step": 8950 + }, + { + "epoch": 0.4476, + "grad_norm": 4.843506813049316, + "learning_rate": 1.349880399655969e-05, + "loss": 1.1139, + "step": 8952 + }, + { + "epoch": 0.4477, + "grad_norm": 3.4103426933288574, + "learning_rate": 1.349553375420605e-05, + "loss": 0.776, + "step": 8954 + }, + { + "epoch": 0.4478, + "grad_norm": 2.5191822052001953, + "learning_rate": 1.3492263085932224e-05, + "loss": 1.0529, + "step": 8956 + }, + { + "epoch": 0.4479, + "grad_norm": 1.8588461875915527, + "learning_rate": 1.3488991992136735e-05, + "loss": 1.064, + "step": 8958 + }, + { + "epoch": 0.448, + "grad_norm": 4.748047351837158, + "learning_rate": 1.3485720473218153e-05, + "loss": 0.761, + "step": 8960 + }, + { + "epoch": 0.4481, + "grad_norm": 7.300265312194824, + "learning_rate": 1.3482448529575107e-05, + "loss": 0.7474, + "step": 8962 + }, + { + "epoch": 0.4482, + "grad_norm": 5.525083065032959, + "learning_rate": 1.3479176161606269e-05, + "loss": 0.6838, + "step": 8964 + }, + { + "epoch": 0.4483, + "grad_norm": 0.24614769220352173, + "learning_rate": 1.347590336971037e-05, + "loss": 0.0069, + "step": 8966 + }, + { + "epoch": 0.4484, + "grad_norm": 4.374545097351074, + "learning_rate": 1.347263015428619e-05, + "loss": 0.6669, + "step": 8968 + }, + { + "epoch": 0.4485, + "grad_norm": 4.217963695526123, + "learning_rate": 1.3469356515732559e-05, + "loss": 1.6276, + "step": 8970 + }, + { + "epoch": 0.4486, + "grad_norm": 5.033113956451416, + "learning_rate": 1.3466082454448364e-05, + "loss": 1.3515, + "step": 8972 + }, + { + "epoch": 0.4487, + "grad_norm": 4.283261775970459, + "learning_rate": 1.346280797083253e-05, + "loss": 0.7008, + "step": 8974 + }, + { + "epoch": 0.4488, + "grad_norm": 1.7641348838806152, + "learning_rate": 1.3459533065284049e-05, + "loss": 0.5549, + "step": 8976 + }, + { + "epoch": 0.4489, + "grad_norm": 4.546355724334717, + "learning_rate": 1.3456257738201959e-05, + "loss": 0.9564, + "step": 8978 + }, + { + "epoch": 0.449, + "grad_norm": 4.095027446746826, + "learning_rate": 1.3452981989985347e-05, + "loss": 0.6693, + "step": 8980 + }, + { + "epoch": 0.4491, + "grad_norm": 3.9737884998321533, + "learning_rate": 1.3449705821033357e-05, + "loss": 1.4296, + "step": 8982 + }, + { + "epoch": 0.4492, + "grad_norm": 1.6187480688095093, + "learning_rate": 1.344642923174517e-05, + "loss": 0.81, + "step": 8984 + }, + { + "epoch": 0.4493, + "grad_norm": 3.6910815238952637, + "learning_rate": 1.344315222252004e-05, + "loss": 1.1208, + "step": 8986 + }, + { + "epoch": 0.4494, + "grad_norm": 6.558996200561523, + "learning_rate": 1.3439874793757255e-05, + "loss": 1.7159, + "step": 8988 + }, + { + "epoch": 0.4495, + "grad_norm": 7.962631702423096, + "learning_rate": 1.3436596945856164e-05, + "loss": 0.5272, + "step": 8990 + }, + { + "epoch": 0.4496, + "grad_norm": 2.807370662689209, + "learning_rate": 1.3433318679216154e-05, + "loss": 0.7202, + "step": 8992 + }, + { + "epoch": 0.4497, + "grad_norm": 2.55435848236084, + "learning_rate": 1.3430039994236679e-05, + "loss": 0.4522, + "step": 8994 + }, + { + "epoch": 0.4498, + "grad_norm": 4.151333332061768, + "learning_rate": 1.3426760891317236e-05, + "loss": 0.5939, + "step": 8996 + }, + { + "epoch": 0.4499, + "grad_norm": 6.6149516105651855, + "learning_rate": 1.3423481370857375e-05, + "loss": 1.0022, + "step": 8998 + }, + { + "epoch": 0.45, + "grad_norm": 6.215419292449951, + "learning_rate": 1.342020143325669e-05, + "loss": 0.676, + "step": 9000 + }, + { + "epoch": 0.4501, + "grad_norm": 4.569922924041748, + "learning_rate": 1.3416921078914835e-05, + "loss": 0.7692, + "step": 9002 + }, + { + "epoch": 0.4502, + "grad_norm": 2.757880687713623, + "learning_rate": 1.3413640308231511e-05, + "loss": 0.874, + "step": 9004 + }, + { + "epoch": 0.4503, + "grad_norm": 5.000626564025879, + "learning_rate": 1.3410359121606471e-05, + "loss": 2.5354, + "step": 9006 + }, + { + "epoch": 0.4504, + "grad_norm": 5.702583312988281, + "learning_rate": 1.340707751943952e-05, + "loss": 1.0993, + "step": 9008 + }, + { + "epoch": 0.4505, + "grad_norm": 1.6660493612289429, + "learning_rate": 1.3403795502130503e-05, + "loss": 0.9254, + "step": 9010 + }, + { + "epoch": 0.4506, + "grad_norm": 4.123348712921143, + "learning_rate": 1.340051307007933e-05, + "loss": 1.4924, + "step": 9012 + }, + { + "epoch": 0.4507, + "grad_norm": 3.8556909561157227, + "learning_rate": 1.3397230223685955e-05, + "loss": 0.8162, + "step": 9014 + }, + { + "epoch": 0.4508, + "grad_norm": 4.054632663726807, + "learning_rate": 1.3393946963350381e-05, + "loss": 0.7666, + "step": 9016 + }, + { + "epoch": 0.4509, + "grad_norm": 1.989202857017517, + "learning_rate": 1.3390663289472667e-05, + "loss": 1.4993, + "step": 9018 + }, + { + "epoch": 0.451, + "grad_norm": 6.870042324066162, + "learning_rate": 1.3387379202452917e-05, + "loss": 0.7434, + "step": 9020 + }, + { + "epoch": 0.4511, + "grad_norm": 4.308064937591553, + "learning_rate": 1.3384094702691282e-05, + "loss": 1.4547, + "step": 9022 + }, + { + "epoch": 0.4512, + "grad_norm": 3.1370222568511963, + "learning_rate": 1.3380809790587975e-05, + "loss": 1.6964, + "step": 9024 + }, + { + "epoch": 0.4513, + "grad_norm": 3.707247734069824, + "learning_rate": 1.337752446654325e-05, + "loss": 0.9103, + "step": 9026 + }, + { + "epoch": 0.4514, + "grad_norm": 3.820655584335327, + "learning_rate": 1.3374238730957414e-05, + "loss": 1.137, + "step": 9028 + }, + { + "epoch": 0.4515, + "grad_norm": 2.5812747478485107, + "learning_rate": 1.3370952584230823e-05, + "loss": 0.9575, + "step": 9030 + }, + { + "epoch": 0.4516, + "grad_norm": 4.311949729919434, + "learning_rate": 1.3367666026763884e-05, + "loss": 1.2271, + "step": 9032 + }, + { + "epoch": 0.4517, + "grad_norm": 3.1733593940734863, + "learning_rate": 1.3364379058957055e-05, + "loss": 0.9098, + "step": 9034 + }, + { + "epoch": 0.4518, + "grad_norm": 1.0159058570861816, + "learning_rate": 1.3361091681210846e-05, + "loss": 0.4495, + "step": 9036 + }, + { + "epoch": 0.4519, + "grad_norm": 8.080636978149414, + "learning_rate": 1.3357803893925807e-05, + "loss": 1.1699, + "step": 9038 + }, + { + "epoch": 0.452, + "grad_norm": 2.4710042476654053, + "learning_rate": 1.3354515697502552e-05, + "loss": 0.7114, + "step": 9040 + }, + { + "epoch": 0.4521, + "grad_norm": 4.095670223236084, + "learning_rate": 1.3351227092341732e-05, + "loss": 0.6671, + "step": 9042 + }, + { + "epoch": 0.4522, + "grad_norm": 2.739368438720703, + "learning_rate": 1.3347938078844058e-05, + "loss": 1.4011, + "step": 9044 + }, + { + "epoch": 0.4523, + "grad_norm": 3.7691452503204346, + "learning_rate": 1.3344648657410284e-05, + "loss": 0.387, + "step": 9046 + }, + { + "epoch": 0.4524, + "grad_norm": 5.512652397155762, + "learning_rate": 1.3341358828441217e-05, + "loss": 0.6808, + "step": 9048 + }, + { + "epoch": 0.4525, + "grad_norm": 5.089535713195801, + "learning_rate": 1.333806859233771e-05, + "loss": 0.6981, + "step": 9050 + }, + { + "epoch": 0.4526, + "grad_norm": 4.456161022186279, + "learning_rate": 1.3334777949500673e-05, + "loss": 0.496, + "step": 9052 + }, + { + "epoch": 0.4527, + "grad_norm": 5.994139194488525, + "learning_rate": 1.3331486900331057e-05, + "loss": 0.8772, + "step": 9054 + }, + { + "epoch": 0.4528, + "grad_norm": 4.694468975067139, + "learning_rate": 1.3328195445229869e-05, + "loss": 1.2861, + "step": 9056 + }, + { + "epoch": 0.4529, + "grad_norm": 4.556105613708496, + "learning_rate": 1.3324903584598159e-05, + "loss": 0.8247, + "step": 9058 + }, + { + "epoch": 0.453, + "grad_norm": 3.1166303157806396, + "learning_rate": 1.3321611318837033e-05, + "loss": 0.9399, + "step": 9060 + }, + { + "epoch": 0.4531, + "grad_norm": 4.390866279602051, + "learning_rate": 1.3318318648347646e-05, + "loss": 0.7639, + "step": 9062 + }, + { + "epoch": 0.4532, + "grad_norm": 8.027542114257812, + "learning_rate": 1.3315025573531198e-05, + "loss": 0.872, + "step": 9064 + }, + { + "epoch": 0.4533, + "grad_norm": 6.921693325042725, + "learning_rate": 1.3311732094788936e-05, + "loss": 0.9734, + "step": 9066 + }, + { + "epoch": 0.4534, + "grad_norm": 5.878570556640625, + "learning_rate": 1.3308438212522164e-05, + "loss": 0.8452, + "step": 9068 + }, + { + "epoch": 0.4535, + "grad_norm": 7.243222236633301, + "learning_rate": 1.3305143927132232e-05, + "loss": 1.1585, + "step": 9070 + }, + { + "epoch": 0.4536, + "grad_norm": 5.833813667297363, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.8204, + "step": 9072 + }, + { + "epoch": 0.4537, + "grad_norm": 3.165919542312622, + "learning_rate": 1.329855414858853e-05, + "loss": 2.2589, + "step": 9074 + }, + { + "epoch": 0.4538, + "grad_norm": 9.074087142944336, + "learning_rate": 1.3295258656237703e-05, + "loss": 1.0161, + "step": 9076 + }, + { + "epoch": 0.4539, + "grad_norm": 3.3419275283813477, + "learning_rate": 1.3291962762369607e-05, + "loss": 0.911, + "step": 9078 + }, + { + "epoch": 0.454, + "grad_norm": 4.326785087585449, + "learning_rate": 1.3288666467385834e-05, + "loss": 0.8129, + "step": 9080 + }, + { + "epoch": 0.4541, + "grad_norm": 5.019748210906982, + "learning_rate": 1.3285369771688027e-05, + "loss": 1.3159, + "step": 9082 + }, + { + "epoch": 0.4542, + "grad_norm": 4.122664451599121, + "learning_rate": 1.328207267567788e-05, + "loss": 1.4296, + "step": 9084 + }, + { + "epoch": 0.4543, + "grad_norm": 8.566120147705078, + "learning_rate": 1.3278775179757131e-05, + "loss": 1.458, + "step": 9086 + }, + { + "epoch": 0.4544, + "grad_norm": 16.83333396911621, + "learning_rate": 1.327547728432757e-05, + "loss": 1.6694, + "step": 9088 + }, + { + "epoch": 0.4545, + "grad_norm": 2.195107936859131, + "learning_rate": 1.327217898979104e-05, + "loss": 0.3894, + "step": 9090 + }, + { + "epoch": 0.4546, + "grad_norm": 2.4301466941833496, + "learning_rate": 1.3268880296549424e-05, + "loss": 0.8363, + "step": 9092 + }, + { + "epoch": 0.4547, + "grad_norm": 3.567753314971924, + "learning_rate": 1.3265581205004662e-05, + "loss": 1.0665, + "step": 9094 + }, + { + "epoch": 0.4548, + "grad_norm": 1.815026044845581, + "learning_rate": 1.3262281715558736e-05, + "loss": 0.7605, + "step": 9096 + }, + { + "epoch": 0.4549, + "grad_norm": 3.0704307556152344, + "learning_rate": 1.3258981828613678e-05, + "loss": 0.8734, + "step": 9098 + }, + { + "epoch": 0.455, + "grad_norm": 2.27712345123291, + "learning_rate": 1.3255681544571568e-05, + "loss": 0.8114, + "step": 9100 + }, + { + "epoch": 0.4551, + "grad_norm": 2.636070966720581, + "learning_rate": 1.325238086383454e-05, + "loss": 1.0544, + "step": 9102 + }, + { + "epoch": 0.4552, + "grad_norm": 3.4374446868896484, + "learning_rate": 1.3249079786804765e-05, + "loss": 0.8333, + "step": 9104 + }, + { + "epoch": 0.4553, + "grad_norm": 7.605374813079834, + "learning_rate": 1.3245778313884478e-05, + "loss": 0.8802, + "step": 9106 + }, + { + "epoch": 0.4554, + "grad_norm": 5.51675271987915, + "learning_rate": 1.3242476445475945e-05, + "loss": 1.5344, + "step": 9108 + }, + { + "epoch": 0.4555, + "grad_norm": 9.270489692687988, + "learning_rate": 1.3239174181981496e-05, + "loss": 0.9541, + "step": 9110 + }, + { + "epoch": 0.4556, + "grad_norm": 3.471670389175415, + "learning_rate": 1.3235871523803496e-05, + "loss": 0.8636, + "step": 9112 + }, + { + "epoch": 0.4557, + "grad_norm": 2.7169783115386963, + "learning_rate": 1.3232568471344369e-05, + "loss": 0.7157, + "step": 9114 + }, + { + "epoch": 0.4558, + "grad_norm": 1.511616587638855, + "learning_rate": 1.3229265025006577e-05, + "loss": 0.717, + "step": 9116 + }, + { + "epoch": 0.4559, + "grad_norm": 2.324431896209717, + "learning_rate": 1.3225961185192638e-05, + "loss": 0.6117, + "step": 9118 + }, + { + "epoch": 0.456, + "grad_norm": 8.080517768859863, + "learning_rate": 1.3222656952305113e-05, + "loss": 1.0957, + "step": 9120 + }, + { + "epoch": 0.4561, + "grad_norm": 25.61760139465332, + "learning_rate": 1.3219352326746613e-05, + "loss": 1.0183, + "step": 9122 + }, + { + "epoch": 0.4562, + "grad_norm": 11.974292755126953, + "learning_rate": 1.32160473089198e-05, + "loss": 0.9111, + "step": 9124 + }, + { + "epoch": 0.4563, + "grad_norm": 17.20077896118164, + "learning_rate": 1.3212741899227375e-05, + "loss": 1.1697, + "step": 9126 + }, + { + "epoch": 0.4564, + "grad_norm": 5.253365516662598, + "learning_rate": 1.3209436098072095e-05, + "loss": 0.3646, + "step": 9128 + }, + { + "epoch": 0.4565, + "grad_norm": 2.775684118270874, + "learning_rate": 1.3206129905856765e-05, + "loss": 1.1262, + "step": 9130 + }, + { + "epoch": 0.4566, + "grad_norm": 6.888213157653809, + "learning_rate": 1.3202823322984228e-05, + "loss": 0.7451, + "step": 9132 + }, + { + "epoch": 0.4567, + "grad_norm": 3.557659864425659, + "learning_rate": 1.3199516349857384e-05, + "loss": 0.4874, + "step": 9134 + }, + { + "epoch": 0.4568, + "grad_norm": 5.269452095031738, + "learning_rate": 1.319620898687918e-05, + "loss": 1.0462, + "step": 9136 + }, + { + "epoch": 0.4569, + "grad_norm": 5.537091255187988, + "learning_rate": 1.3192901234452606e-05, + "loss": 1.1209, + "step": 9138 + }, + { + "epoch": 0.457, + "grad_norm": 7.4979424476623535, + "learning_rate": 1.3189593092980701e-05, + "loss": 0.9575, + "step": 9140 + }, + { + "epoch": 0.4571, + "grad_norm": 4.234434604644775, + "learning_rate": 1.3186284562866554e-05, + "loss": 1.1932, + "step": 9142 + }, + { + "epoch": 0.4572, + "grad_norm": 5.433716773986816, + "learning_rate": 1.3182975644513296e-05, + "loss": 0.527, + "step": 9144 + }, + { + "epoch": 0.4573, + "grad_norm": 1.3030126094818115, + "learning_rate": 1.317966633832411e-05, + "loss": 0.2334, + "step": 9146 + }, + { + "epoch": 0.4574, + "grad_norm": 3.86323618888855, + "learning_rate": 1.3176356644702225e-05, + "loss": 1.3638, + "step": 9148 + }, + { + "epoch": 0.4575, + "grad_norm": 4.536445617675781, + "learning_rate": 1.3173046564050923e-05, + "loss": 1.5478, + "step": 9150 + }, + { + "epoch": 0.4576, + "grad_norm": 2.4504520893096924, + "learning_rate": 1.316973609677352e-05, + "loss": 1.0931, + "step": 9152 + }, + { + "epoch": 0.4577, + "grad_norm": 4.302062511444092, + "learning_rate": 1.3166425243273387e-05, + "loss": 0.8439, + "step": 9154 + }, + { + "epoch": 0.4578, + "grad_norm": 2.407599687576294, + "learning_rate": 1.316311400395394e-05, + "loss": 0.5461, + "step": 9156 + }, + { + "epoch": 0.4579, + "grad_norm": 5.118942737579346, + "learning_rate": 1.315980237921865e-05, + "loss": 1.1737, + "step": 9158 + }, + { + "epoch": 0.458, + "grad_norm": 4.673958778381348, + "learning_rate": 1.3156490369471026e-05, + "loss": 1.1785, + "step": 9160 + }, + { + "epoch": 0.4581, + "grad_norm": 9.508761405944824, + "learning_rate": 1.3153177975114624e-05, + "loss": 1.1019, + "step": 9162 + }, + { + "epoch": 0.4582, + "grad_norm": 2.791632890701294, + "learning_rate": 1.3149865196553049e-05, + "loss": 0.8072, + "step": 9164 + }, + { + "epoch": 0.4583, + "grad_norm": 5.756307601928711, + "learning_rate": 1.3146552034189954e-05, + "loss": 0.8862, + "step": 9166 + }, + { + "epoch": 0.4584, + "grad_norm": 9.793404579162598, + "learning_rate": 1.3143238488429042e-05, + "loss": 0.6676, + "step": 9168 + }, + { + "epoch": 0.4585, + "grad_norm": 13.633255958557129, + "learning_rate": 1.313992455967405e-05, + "loss": 1.402, + "step": 9170 + }, + { + "epoch": 0.4586, + "grad_norm": 2.2253575325012207, + "learning_rate": 1.3136610248328779e-05, + "loss": 0.2733, + "step": 9172 + }, + { + "epoch": 0.4587, + "grad_norm": 2.441089391708374, + "learning_rate": 1.3133295554797058e-05, + "loss": 0.5237, + "step": 9174 + }, + { + "epoch": 0.4588, + "grad_norm": 7.566964626312256, + "learning_rate": 1.3129980479482783e-05, + "loss": 0.6497, + "step": 9176 + }, + { + "epoch": 0.4589, + "grad_norm": 5.614509582519531, + "learning_rate": 1.3126665022789879e-05, + "loss": 0.8824, + "step": 9178 + }, + { + "epoch": 0.459, + "grad_norm": 5.388372898101807, + "learning_rate": 1.3123349185122328e-05, + "loss": 0.8013, + "step": 9180 + }, + { + "epoch": 0.4591, + "grad_norm": 4.045365810394287, + "learning_rate": 1.3120032966884151e-05, + "loss": 1.2179, + "step": 9182 + }, + { + "epoch": 0.4592, + "grad_norm": 9.614696502685547, + "learning_rate": 1.3116716368479418e-05, + "loss": 1.4572, + "step": 9184 + }, + { + "epoch": 0.4593, + "grad_norm": 3.308502197265625, + "learning_rate": 1.3113399390312256e-05, + "loss": 1.1167, + "step": 9186 + }, + { + "epoch": 0.4594, + "grad_norm": 2.7250797748565674, + "learning_rate": 1.311008203278682e-05, + "loss": 0.9464, + "step": 9188 + }, + { + "epoch": 0.4595, + "grad_norm": 4.787267208099365, + "learning_rate": 1.310676429630732e-05, + "loss": 0.9317, + "step": 9190 + }, + { + "epoch": 0.4596, + "grad_norm": 3.722172737121582, + "learning_rate": 1.3103446181278015e-05, + "loss": 0.6874, + "step": 9192 + }, + { + "epoch": 0.4597, + "grad_norm": 4.231965065002441, + "learning_rate": 1.3100127688103206e-05, + "loss": 0.7737, + "step": 9194 + }, + { + "epoch": 0.4598, + "grad_norm": 5.732120990753174, + "learning_rate": 1.3096808817187243e-05, + "loss": 1.1129, + "step": 9196 + }, + { + "epoch": 0.4599, + "grad_norm": 2.145817756652832, + "learning_rate": 1.3093489568934522e-05, + "loss": 0.5618, + "step": 9198 + }, + { + "epoch": 0.46, + "grad_norm": 15.97574520111084, + "learning_rate": 1.3090169943749475e-05, + "loss": 1.121, + "step": 9200 + }, + { + "epoch": 0.4601, + "grad_norm": 9.022173881530762, + "learning_rate": 1.3086849942036597e-05, + "loss": 1.19, + "step": 9202 + }, + { + "epoch": 0.4602, + "grad_norm": 6.75143575668335, + "learning_rate": 1.3083529564200417e-05, + "loss": 1.2722, + "step": 9204 + }, + { + "epoch": 0.4603, + "grad_norm": 2.926807403564453, + "learning_rate": 1.3080208810645514e-05, + "loss": 0.9851, + "step": 9206 + }, + { + "epoch": 0.4604, + "grad_norm": 1.706776738166809, + "learning_rate": 1.3076887681776509e-05, + "loss": 0.7425, + "step": 9208 + }, + { + "epoch": 0.4605, + "grad_norm": 5.594606876373291, + "learning_rate": 1.3073566177998073e-05, + "loss": 0.7653, + "step": 9210 + }, + { + "epoch": 0.4606, + "grad_norm": 6.6243367195129395, + "learning_rate": 1.307024429971492e-05, + "loss": 0.6388, + "step": 9212 + }, + { + "epoch": 0.4607, + "grad_norm": 3.376725673675537, + "learning_rate": 1.3066922047331814e-05, + "loss": 1.3476, + "step": 9214 + }, + { + "epoch": 0.4608, + "grad_norm": 2.600083827972412, + "learning_rate": 1.306359942125356e-05, + "loss": 0.4459, + "step": 9216 + }, + { + "epoch": 0.4609, + "grad_norm": 2.8367021083831787, + "learning_rate": 1.306027642188501e-05, + "loss": 1.4658, + "step": 9218 + }, + { + "epoch": 0.461, + "grad_norm": 4.652093410491943, + "learning_rate": 1.3056953049631059e-05, + "loss": 1.0593, + "step": 9220 + }, + { + "epoch": 0.4611, + "grad_norm": 3.498109817504883, + "learning_rate": 1.305362930489665e-05, + "loss": 0.8689, + "step": 9222 + }, + { + "epoch": 0.4612, + "grad_norm": 2.767570734024048, + "learning_rate": 1.3050305188086778e-05, + "loss": 0.8867, + "step": 9224 + }, + { + "epoch": 0.4613, + "grad_norm": 0.6444360613822937, + "learning_rate": 1.3046980699606469e-05, + "loss": 0.2593, + "step": 9226 + }, + { + "epoch": 0.4614, + "grad_norm": 7.562550067901611, + "learning_rate": 1.3043655839860803e-05, + "loss": 1.3807, + "step": 9228 + }, + { + "epoch": 0.4615, + "grad_norm": 6.176542282104492, + "learning_rate": 1.3040330609254903e-05, + "loss": 1.2864, + "step": 9230 + }, + { + "epoch": 0.4616, + "grad_norm": 3.4990320205688477, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.5759, + "step": 9232 + }, + { + "epoch": 0.4617, + "grad_norm": 5.681490898132324, + "learning_rate": 1.3033679037083137e-05, + "loss": 0.9932, + "step": 9234 + }, + { + "epoch": 0.4618, + "grad_norm": 4.867551326751709, + "learning_rate": 1.3030352696327741e-05, + "loss": 1.1061, + "step": 9236 + }, + { + "epoch": 0.4619, + "grad_norm": 15.039446830749512, + "learning_rate": 1.3027025986333061e-05, + "loss": 1.5945, + "step": 9238 + }, + { + "epoch": 0.462, + "grad_norm": 4.949581146240234, + "learning_rate": 1.3023698907504447e-05, + "loss": 0.3646, + "step": 9240 + }, + { + "epoch": 0.4621, + "grad_norm": 3.3606033325195312, + "learning_rate": 1.3020371460247292e-05, + "loss": 0.8436, + "step": 9242 + }, + { + "epoch": 0.4622, + "grad_norm": 4.540415287017822, + "learning_rate": 1.3017043644967036e-05, + "loss": 1.0313, + "step": 9244 + }, + { + "epoch": 0.4623, + "grad_norm": 3.1411378383636475, + "learning_rate": 1.3013715462069164e-05, + "loss": 0.3839, + "step": 9246 + }, + { + "epoch": 0.4624, + "grad_norm": 8.722574234008789, + "learning_rate": 1.3010386911959207e-05, + "loss": 0.885, + "step": 9248 + }, + { + "epoch": 0.4625, + "grad_norm": 5.3380584716796875, + "learning_rate": 1.300705799504273e-05, + "loss": 0.4144, + "step": 9250 + }, + { + "epoch": 0.4626, + "grad_norm": 1.6072068214416504, + "learning_rate": 1.3003728711725364e-05, + "loss": 2.1734, + "step": 9252 + }, + { + "epoch": 0.4627, + "grad_norm": 3.253143310546875, + "learning_rate": 1.3000399062412763e-05, + "loss": 1.1864, + "step": 9254 + }, + { + "epoch": 0.4628, + "grad_norm": 1.823738694190979, + "learning_rate": 1.299706904751064e-05, + "loss": 1.0772, + "step": 9256 + }, + { + "epoch": 0.4629, + "grad_norm": 2.5666306018829346, + "learning_rate": 1.299373866742474e-05, + "loss": 1.0107, + "step": 9258 + }, + { + "epoch": 0.463, + "grad_norm": 4.7316083908081055, + "learning_rate": 1.2990407922560869e-05, + "loss": 1.3076, + "step": 9260 + }, + { + "epoch": 0.4631, + "grad_norm": 1.91348135471344, + "learning_rate": 1.2987076813324859e-05, + "loss": 0.6237, + "step": 9262 + }, + { + "epoch": 0.4632, + "grad_norm": 5.560062885284424, + "learning_rate": 1.2983745340122604e-05, + "loss": 0.9163, + "step": 9264 + }, + { + "epoch": 0.4633, + "grad_norm": 7.730875015258789, + "learning_rate": 1.298041350336003e-05, + "loss": 1.1118, + "step": 9266 + }, + { + "epoch": 0.4634, + "grad_norm": 4.648440837860107, + "learning_rate": 1.2977081303443107e-05, + "loss": 0.9907, + "step": 9268 + }, + { + "epoch": 0.4635, + "grad_norm": 4.4319891929626465, + "learning_rate": 1.297374874077786e-05, + "loss": 1.1528, + "step": 9270 + }, + { + "epoch": 0.4636, + "grad_norm": 1.4580329656600952, + "learning_rate": 1.297041581577035e-05, + "loss": 0.7961, + "step": 9272 + }, + { + "epoch": 0.4637, + "grad_norm": 2.2489047050476074, + "learning_rate": 1.2967082528826685e-05, + "loss": 0.9787, + "step": 9274 + }, + { + "epoch": 0.4638, + "grad_norm": 2.094900131225586, + "learning_rate": 1.2963748880353011e-05, + "loss": 0.9615, + "step": 9276 + }, + { + "epoch": 0.4639, + "grad_norm": 5.212981700897217, + "learning_rate": 1.2960414870755525e-05, + "loss": 0.7477, + "step": 9278 + }, + { + "epoch": 0.464, + "grad_norm": 1.956628441810608, + "learning_rate": 1.2957080500440469e-05, + "loss": 0.8574, + "step": 9280 + }, + { + "epoch": 0.4641, + "grad_norm": 3.854973316192627, + "learning_rate": 1.2953745769814123e-05, + "loss": 0.5967, + "step": 9282 + }, + { + "epoch": 0.4642, + "grad_norm": 18.13898468017578, + "learning_rate": 1.2950410679282815e-05, + "loss": 1.0311, + "step": 9284 + }, + { + "epoch": 0.4643, + "grad_norm": 3.6867284774780273, + "learning_rate": 1.2947075229252915e-05, + "loss": 0.6568, + "step": 9286 + }, + { + "epoch": 0.4644, + "grad_norm": 3.584841012954712, + "learning_rate": 1.2943739420130837e-05, + "loss": 0.6289, + "step": 9288 + }, + { + "epoch": 0.4645, + "grad_norm": 7.062147617340088, + "learning_rate": 1.294040325232304e-05, + "loss": 1.0409, + "step": 9290 + }, + { + "epoch": 0.4646, + "grad_norm": 2.082798719406128, + "learning_rate": 1.2937066726236029e-05, + "loss": 1.1579, + "step": 9292 + }, + { + "epoch": 0.4647, + "grad_norm": 2.8414254188537598, + "learning_rate": 1.2933729842276343e-05, + "loss": 1.2921, + "step": 9294 + }, + { + "epoch": 0.4648, + "grad_norm": 3.4364964962005615, + "learning_rate": 1.2930392600850574e-05, + "loss": 0.8075, + "step": 9296 + }, + { + "epoch": 0.4649, + "grad_norm": 3.5354855060577393, + "learning_rate": 1.2927055002365359e-05, + "loss": 1.1783, + "step": 9298 + }, + { + "epoch": 0.465, + "grad_norm": 2.4590117931365967, + "learning_rate": 1.2923717047227368e-05, + "loss": 0.8411, + "step": 9300 + }, + { + "epoch": 0.4651, + "grad_norm": 6.58985710144043, + "learning_rate": 1.2920378735843326e-05, + "loss": 0.9483, + "step": 9302 + }, + { + "epoch": 0.4652, + "grad_norm": 6.449295520782471, + "learning_rate": 1.291704006861999e-05, + "loss": 1.2303, + "step": 9304 + }, + { + "epoch": 0.4653, + "grad_norm": 5.783991813659668, + "learning_rate": 1.2913701045964173e-05, + "loss": 0.6332, + "step": 9306 + }, + { + "epoch": 0.4654, + "grad_norm": 8.964516639709473, + "learning_rate": 1.2910361668282718e-05, + "loss": 1.3418, + "step": 9308 + }, + { + "epoch": 0.4655, + "grad_norm": 10.37958812713623, + "learning_rate": 1.2907021935982526e-05, + "loss": 0.6705, + "step": 9310 + }, + { + "epoch": 0.4656, + "grad_norm": 2.4570651054382324, + "learning_rate": 1.2903681849470528e-05, + "loss": 0.7588, + "step": 9312 + }, + { + "epoch": 0.4657, + "grad_norm": 4.901151180267334, + "learning_rate": 1.2900341409153705e-05, + "loss": 0.8994, + "step": 9314 + }, + { + "epoch": 0.4658, + "grad_norm": 5.719445705413818, + "learning_rate": 1.2897000615439075e-05, + "loss": 0.936, + "step": 9316 + }, + { + "epoch": 0.4659, + "grad_norm": 3.9914252758026123, + "learning_rate": 1.289365946873371e-05, + "loss": 1.4876, + "step": 9318 + }, + { + "epoch": 0.466, + "grad_norm": 7.006251811981201, + "learning_rate": 1.2890317969444716e-05, + "loss": 0.9254, + "step": 9320 + }, + { + "epoch": 0.4661, + "grad_norm": 7.33596658706665, + "learning_rate": 1.2886976117979246e-05, + "loss": 0.3221, + "step": 9322 + }, + { + "epoch": 0.4662, + "grad_norm": 1.5109565258026123, + "learning_rate": 1.2883633914744493e-05, + "loss": 1.2259, + "step": 9324 + }, + { + "epoch": 0.4663, + "grad_norm": 4.0253496170043945, + "learning_rate": 1.2880291360147694e-05, + "loss": 1.317, + "step": 9326 + }, + { + "epoch": 0.4664, + "grad_norm": 6.692993640899658, + "learning_rate": 1.287694845459613e-05, + "loss": 1.1031, + "step": 9328 + }, + { + "epoch": 0.4665, + "grad_norm": 2.376115560531616, + "learning_rate": 1.2873605198497123e-05, + "loss": 1.1437, + "step": 9330 + }, + { + "epoch": 0.4666, + "grad_norm": 7.3201775550842285, + "learning_rate": 1.2870261592258038e-05, + "loss": 1.0486, + "step": 9332 + }, + { + "epoch": 0.4667, + "grad_norm": 15.372695922851562, + "learning_rate": 1.2866917636286285e-05, + "loss": 0.6868, + "step": 9334 + }, + { + "epoch": 0.4668, + "grad_norm": 2.69608736038208, + "learning_rate": 1.2863573330989315e-05, + "loss": 0.1642, + "step": 9336 + }, + { + "epoch": 0.4669, + "grad_norm": 7.097043037414551, + "learning_rate": 1.286022867677462e-05, + "loss": 1.0776, + "step": 9338 + }, + { + "epoch": 0.467, + "grad_norm": 2.328922748565674, + "learning_rate": 1.2856883674049736e-05, + "loss": 1.774, + "step": 9340 + }, + { + "epoch": 0.4671, + "grad_norm": 7.251669406890869, + "learning_rate": 1.2853538323222244e-05, + "loss": 0.9399, + "step": 9342 + }, + { + "epoch": 0.4672, + "grad_norm": 3.3179173469543457, + "learning_rate": 1.2850192624699762e-05, + "loss": 0.9197, + "step": 9344 + }, + { + "epoch": 0.4673, + "grad_norm": 0.5909302830696106, + "learning_rate": 1.2846846578889956e-05, + "loss": 0.3854, + "step": 9346 + }, + { + "epoch": 0.4674, + "grad_norm": 11.632678031921387, + "learning_rate": 1.2843500186200529e-05, + "loss": 0.6852, + "step": 9348 + }, + { + "epoch": 0.4675, + "grad_norm": 5.207489490509033, + "learning_rate": 1.284015344703923e-05, + "loss": 1.4257, + "step": 9350 + }, + { + "epoch": 0.4676, + "grad_norm": 0.7287635207176208, + "learning_rate": 1.2836806361813846e-05, + "loss": 0.7005, + "step": 9352 + }, + { + "epoch": 0.4677, + "grad_norm": 8.108989715576172, + "learning_rate": 1.2833458930932211e-05, + "loss": 0.7743, + "step": 9354 + }, + { + "epoch": 0.4678, + "grad_norm": 2.8806498050689697, + "learning_rate": 1.2830111154802203e-05, + "loss": 0.954, + "step": 9356 + }, + { + "epoch": 0.4679, + "grad_norm": 4.796778202056885, + "learning_rate": 1.2826763033831735e-05, + "loss": 1.2007, + "step": 9358 + }, + { + "epoch": 0.468, + "grad_norm": 2.385571002960205, + "learning_rate": 1.2823414568428767e-05, + "loss": 1.179, + "step": 9360 + }, + { + "epoch": 0.4681, + "grad_norm": 0.24983294308185577, + "learning_rate": 1.2820065759001295e-05, + "loss": 0.0504, + "step": 9362 + }, + { + "epoch": 0.4682, + "grad_norm": 5.004319190979004, + "learning_rate": 1.2816716605957366e-05, + "loss": 1.2563, + "step": 9364 + }, + { + "epoch": 0.4683, + "grad_norm": 4.237897872924805, + "learning_rate": 1.2813367109705064e-05, + "loss": 0.583, + "step": 9366 + }, + { + "epoch": 0.4684, + "grad_norm": 4.627162456512451, + "learning_rate": 1.2810017270652513e-05, + "loss": 1.241, + "step": 9368 + }, + { + "epoch": 0.4685, + "grad_norm": 10.139849662780762, + "learning_rate": 1.280666708920788e-05, + "loss": 1.2079, + "step": 9370 + }, + { + "epoch": 0.4686, + "grad_norm": 4.465052604675293, + "learning_rate": 1.2803316565779378e-05, + "loss": 0.8231, + "step": 9372 + }, + { + "epoch": 0.4687, + "grad_norm": 13.121413230895996, + "learning_rate": 1.279996570077525e-05, + "loss": 0.6871, + "step": 9374 + }, + { + "epoch": 0.4688, + "grad_norm": 2.6394705772399902, + "learning_rate": 1.27966144946038e-05, + "loss": 0.7531, + "step": 9376 + }, + { + "epoch": 0.4689, + "grad_norm": 4.167197227478027, + "learning_rate": 1.2793262947673354e-05, + "loss": 0.7349, + "step": 9378 + }, + { + "epoch": 0.469, + "grad_norm": 2.642749309539795, + "learning_rate": 1.2789911060392295e-05, + "loss": 0.1293, + "step": 9380 + }, + { + "epoch": 0.4691, + "grad_norm": 6.3339691162109375, + "learning_rate": 1.2786558833169031e-05, + "loss": 1.0363, + "step": 9382 + }, + { + "epoch": 0.4692, + "grad_norm": 15.73581314086914, + "learning_rate": 1.278320626641203e-05, + "loss": 0.8774, + "step": 9384 + }, + { + "epoch": 0.4693, + "grad_norm": 8.416704177856445, + "learning_rate": 1.2779853360529787e-05, + "loss": 1.1548, + "step": 9386 + }, + { + "epoch": 0.4694, + "grad_norm": 14.46317195892334, + "learning_rate": 1.2776500115930842e-05, + "loss": 1.1373, + "step": 9388 + }, + { + "epoch": 0.4695, + "grad_norm": 9.487627029418945, + "learning_rate": 1.2773146533023782e-05, + "loss": 2.113, + "step": 9390 + }, + { + "epoch": 0.4696, + "grad_norm": 1.8087489604949951, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.8955, + "step": 9392 + }, + { + "epoch": 0.4697, + "grad_norm": 4.179869651794434, + "learning_rate": 1.2766438353919841e-05, + "loss": 0.6078, + "step": 9394 + }, + { + "epoch": 0.4698, + "grad_norm": 2.5373334884643555, + "learning_rate": 1.2763083758540337e-05, + "loss": 0.7941, + "step": 9396 + }, + { + "epoch": 0.4699, + "grad_norm": 18.99103546142578, + "learning_rate": 1.2759728826487461e-05, + "loss": 1.2953, + "step": 9398 + }, + { + "epoch": 0.47, + "grad_norm": 16.1452579498291, + "learning_rate": 1.2756373558169992e-05, + "loss": 1.5352, + "step": 9400 + }, + { + "epoch": 0.4701, + "grad_norm": 2.6690328121185303, + "learning_rate": 1.275301795399677e-05, + "loss": 1.0947, + "step": 9402 + }, + { + "epoch": 0.4702, + "grad_norm": 1.0378944873809814, + "learning_rate": 1.2749662014376662e-05, + "loss": 0.0282, + "step": 9404 + }, + { + "epoch": 0.4703, + "grad_norm": 3.1996254920959473, + "learning_rate": 1.2746305739718576e-05, + "loss": 1.0977, + "step": 9406 + }, + { + "epoch": 0.4704, + "grad_norm": 13.610743522644043, + "learning_rate": 1.2742949130431468e-05, + "loss": 1.1539, + "step": 9408 + }, + { + "epoch": 0.4705, + "grad_norm": 3.9173943996429443, + "learning_rate": 1.2739592186924327e-05, + "loss": 1.6717, + "step": 9410 + }, + { + "epoch": 0.4706, + "grad_norm": 3.303675889968872, + "learning_rate": 1.2736234909606186e-05, + "loss": 0.725, + "step": 9412 + }, + { + "epoch": 0.4707, + "grad_norm": 4.099830150604248, + "learning_rate": 1.2732877298886124e-05, + "loss": 0.3678, + "step": 9414 + }, + { + "epoch": 0.4708, + "grad_norm": 2.189471483230591, + "learning_rate": 1.2729519355173254e-05, + "loss": 1.1082, + "step": 9416 + }, + { + "epoch": 0.4709, + "grad_norm": 0.11235307157039642, + "learning_rate": 1.2726161078876728e-05, + "loss": 0.5024, + "step": 9418 + }, + { + "epoch": 0.471, + "grad_norm": 4.873337268829346, + "learning_rate": 1.2722802470405744e-05, + "loss": 0.8518, + "step": 9420 + }, + { + "epoch": 0.4711, + "grad_norm": 2.4778120517730713, + "learning_rate": 1.271944353016954e-05, + "loss": 1.5272, + "step": 9422 + }, + { + "epoch": 0.4712, + "grad_norm": 5.405205249786377, + "learning_rate": 1.2716084258577388e-05, + "loss": 2.5606, + "step": 9424 + }, + { + "epoch": 0.4713, + "grad_norm": 2.4316089153289795, + "learning_rate": 1.271272465603861e-05, + "loss": 1.2413, + "step": 9426 + }, + { + "epoch": 0.4714, + "grad_norm": 4.908117294311523, + "learning_rate": 1.270936472296256e-05, + "loss": 0.6831, + "step": 9428 + }, + { + "epoch": 0.4715, + "grad_norm": 3.542947292327881, + "learning_rate": 1.2706004459758636e-05, + "loss": 1.0854, + "step": 9430 + }, + { + "epoch": 0.4716, + "grad_norm": 3.7672715187072754, + "learning_rate": 1.270264386683628e-05, + "loss": 1.1056, + "step": 9432 + }, + { + "epoch": 0.4717, + "grad_norm": 3.5947177410125732, + "learning_rate": 1.2699282944604968e-05, + "loss": 0.7076, + "step": 9434 + }, + { + "epoch": 0.4718, + "grad_norm": 3.4540114402770996, + "learning_rate": 1.2695921693474211e-05, + "loss": 0.3866, + "step": 9436 + }, + { + "epoch": 0.4719, + "grad_norm": 0.6506110429763794, + "learning_rate": 1.2692560113853576e-05, + "loss": 0.7021, + "step": 9438 + }, + { + "epoch": 0.472, + "grad_norm": 2.5712380409240723, + "learning_rate": 1.2689198206152657e-05, + "loss": 0.9277, + "step": 9440 + }, + { + "epoch": 0.4721, + "grad_norm": 9.605941772460938, + "learning_rate": 1.2685835970781097e-05, + "loss": 1.8577, + "step": 9442 + }, + { + "epoch": 0.4722, + "grad_norm": 5.7605390548706055, + "learning_rate": 1.268247340814857e-05, + "loss": 0.9751, + "step": 9444 + }, + { + "epoch": 0.4723, + "grad_norm": 3.4111647605895996, + "learning_rate": 1.2679110518664795e-05, + "loss": 1.1496, + "step": 9446 + }, + { + "epoch": 0.4724, + "grad_norm": 6.468646049499512, + "learning_rate": 1.2675747302739528e-05, + "loss": 0.6547, + "step": 9448 + }, + { + "epoch": 0.4725, + "grad_norm": 2.008244514465332, + "learning_rate": 1.267238376078257e-05, + "loss": 1.0946, + "step": 9450 + }, + { + "epoch": 0.4726, + "grad_norm": 1.589984655380249, + "learning_rate": 1.2669019893203758e-05, + "loss": 0.2911, + "step": 9452 + }, + { + "epoch": 0.4727, + "grad_norm": 4.990808963775635, + "learning_rate": 1.2665655700412967e-05, + "loss": 0.8178, + "step": 9454 + }, + { + "epoch": 0.4728, + "grad_norm": 20.19782257080078, + "learning_rate": 1.2662291182820115e-05, + "loss": 0.5118, + "step": 9456 + }, + { + "epoch": 0.4729, + "grad_norm": 4.536411285400391, + "learning_rate": 1.2658926340835156e-05, + "loss": 0.7537, + "step": 9458 + }, + { + "epoch": 0.473, + "grad_norm": 3.7724764347076416, + "learning_rate": 1.265556117486809e-05, + "loss": 1.6088, + "step": 9460 + }, + { + "epoch": 0.4731, + "grad_norm": 2.733006000518799, + "learning_rate": 1.2652195685328947e-05, + "loss": 0.8213, + "step": 9462 + }, + { + "epoch": 0.4732, + "grad_norm": 4.6641974449157715, + "learning_rate": 1.2648829872627809e-05, + "loss": 0.6234, + "step": 9464 + }, + { + "epoch": 0.4733, + "grad_norm": 7.550205707550049, + "learning_rate": 1.2645463737174783e-05, + "loss": 1.7743, + "step": 9466 + }, + { + "epoch": 0.4734, + "grad_norm": 2.7948968410491943, + "learning_rate": 1.2642097279380025e-05, + "loss": 1.0861, + "step": 9468 + }, + { + "epoch": 0.4735, + "grad_norm": 3.234891414642334, + "learning_rate": 1.2638730499653731e-05, + "loss": 0.7106, + "step": 9470 + }, + { + "epoch": 0.4736, + "grad_norm": 2.053260564804077, + "learning_rate": 1.263536339840613e-05, + "loss": 1.4665, + "step": 9472 + }, + { + "epoch": 0.4737, + "grad_norm": 4.176936626434326, + "learning_rate": 1.2631995976047488e-05, + "loss": 0.3533, + "step": 9474 + }, + { + "epoch": 0.4738, + "grad_norm": 4.677779197692871, + "learning_rate": 1.2628628232988123e-05, + "loss": 1.4039, + "step": 9476 + }, + { + "epoch": 0.4739, + "grad_norm": 5.667974472045898, + "learning_rate": 1.2625260169638378e-05, + "loss": 0.8864, + "step": 9478 + }, + { + "epoch": 0.474, + "grad_norm": 8.602962493896484, + "learning_rate": 1.2621891786408648e-05, + "loss": 1.4371, + "step": 9480 + }, + { + "epoch": 0.4741, + "grad_norm": 0.18271781504154205, + "learning_rate": 1.2618523083709358e-05, + "loss": 0.5835, + "step": 9482 + }, + { + "epoch": 0.4742, + "grad_norm": 17.44820785522461, + "learning_rate": 1.261515406195097e-05, + "loss": 1.644, + "step": 9484 + }, + { + "epoch": 0.4743, + "grad_norm": 2.8336105346679688, + "learning_rate": 1.2611784721543994e-05, + "loss": 0.67, + "step": 9486 + }, + { + "epoch": 0.4744, + "grad_norm": 5.203978061676025, + "learning_rate": 1.2608415062898971e-05, + "loss": 1.2504, + "step": 9488 + }, + { + "epoch": 0.4745, + "grad_norm": 2.059722423553467, + "learning_rate": 1.2605045086426487e-05, + "loss": 1.6804, + "step": 9490 + }, + { + "epoch": 0.4746, + "grad_norm": 2.1864094734191895, + "learning_rate": 1.2601674792537157e-05, + "loss": 0.4628, + "step": 9492 + }, + { + "epoch": 0.4747, + "grad_norm": 7.501515865325928, + "learning_rate": 1.2598304181641647e-05, + "loss": 1.4331, + "step": 9494 + }, + { + "epoch": 0.4748, + "grad_norm": 4.373516082763672, + "learning_rate": 1.2594933254150654e-05, + "loss": 1.3983, + "step": 9496 + }, + { + "epoch": 0.4749, + "grad_norm": 8.478450775146484, + "learning_rate": 1.2591562010474915e-05, + "loss": 1.0762, + "step": 9498 + }, + { + "epoch": 0.475, + "grad_norm": 2.73390531539917, + "learning_rate": 1.2588190451025209e-05, + "loss": 1.5606, + "step": 9500 + }, + { + "epoch": 0.4751, + "grad_norm": 4.498706817626953, + "learning_rate": 1.2584818576212346e-05, + "loss": 0.8365, + "step": 9502 + }, + { + "epoch": 0.4752, + "grad_norm": 3.058389902114868, + "learning_rate": 1.2581446386447178e-05, + "loss": 0.9842, + "step": 9504 + }, + { + "epoch": 0.4753, + "grad_norm": 15.693381309509277, + "learning_rate": 1.25780738821406e-05, + "loss": 0.9007, + "step": 9506 + }, + { + "epoch": 0.4754, + "grad_norm": 5.383185386657715, + "learning_rate": 1.257470106370354e-05, + "loss": 0.3933, + "step": 9508 + }, + { + "epoch": 0.4755, + "grad_norm": 10.027445793151855, + "learning_rate": 1.2571327931546964e-05, + "loss": 1.335, + "step": 9510 + }, + { + "epoch": 0.4756, + "grad_norm": 5.789966106414795, + "learning_rate": 1.256795448608188e-05, + "loss": 0.6244, + "step": 9512 + }, + { + "epoch": 0.4757, + "grad_norm": 2.286071300506592, + "learning_rate": 1.2564580727719331e-05, + "loss": 0.7819, + "step": 9514 + }, + { + "epoch": 0.4758, + "grad_norm": 2.9600207805633545, + "learning_rate": 1.2561206656870397e-05, + "loss": 1.1985, + "step": 9516 + }, + { + "epoch": 0.4759, + "grad_norm": 4.285052299499512, + "learning_rate": 1.2557832273946204e-05, + "loss": 0.92, + "step": 9518 + }, + { + "epoch": 0.476, + "grad_norm": 0.5420121550559998, + "learning_rate": 1.2554457579357906e-05, + "loss": 0.3433, + "step": 9520 + }, + { + "epoch": 0.4761, + "grad_norm": 11.89404010772705, + "learning_rate": 1.2551082573516705e-05, + "loss": 1.5747, + "step": 9522 + }, + { + "epoch": 0.4762, + "grad_norm": 4.459533214569092, + "learning_rate": 1.2547707256833823e-05, + "loss": 1.1901, + "step": 9524 + }, + { + "epoch": 0.4763, + "grad_norm": 1.203545093536377, + "learning_rate": 1.2544331629720544e-05, + "loss": 0.5966, + "step": 9526 + }, + { + "epoch": 0.4764, + "grad_norm": 2.4007411003112793, + "learning_rate": 1.2540955692588173e-05, + "loss": 0.8311, + "step": 9528 + }, + { + "epoch": 0.4765, + "grad_norm": 2.685943603515625, + "learning_rate": 1.2537579445848058e-05, + "loss": 0.972, + "step": 9530 + }, + { + "epoch": 0.4766, + "grad_norm": 3.313887357711792, + "learning_rate": 1.2534202889911584e-05, + "loss": 0.7678, + "step": 9532 + }, + { + "epoch": 0.4767, + "grad_norm": 2.3804779052734375, + "learning_rate": 1.2530826025190175e-05, + "loss": 0.5022, + "step": 9534 + }, + { + "epoch": 0.4768, + "grad_norm": 10.737191200256348, + "learning_rate": 1.2527448852095295e-05, + "loss": 2.6504, + "step": 9536 + }, + { + "epoch": 0.4769, + "grad_norm": 4.779470443725586, + "learning_rate": 1.2524071371038435e-05, + "loss": 0.7056, + "step": 9538 + }, + { + "epoch": 0.477, + "grad_norm": 4.7044358253479, + "learning_rate": 1.252069358243114e-05, + "loss": 1.2806, + "step": 9540 + }, + { + "epoch": 0.4771, + "grad_norm": 3.086465835571289, + "learning_rate": 1.2517315486684973e-05, + "loss": 0.6638, + "step": 9542 + }, + { + "epoch": 0.4772, + "grad_norm": 0.6983054280281067, + "learning_rate": 1.251393708421155e-05, + "loss": 0.9778, + "step": 9544 + }, + { + "epoch": 0.4773, + "grad_norm": 0.9981411099433899, + "learning_rate": 1.251055837542252e-05, + "loss": 0.4502, + "step": 9546 + }, + { + "epoch": 0.4774, + "grad_norm": 6.48091459274292, + "learning_rate": 1.2507179360729569e-05, + "loss": 0.8965, + "step": 9548 + }, + { + "epoch": 0.4775, + "grad_norm": 7.846324920654297, + "learning_rate": 1.2503800040544417e-05, + "loss": 1.075, + "step": 9550 + }, + { + "epoch": 0.4776, + "grad_norm": 3.6869635581970215, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.6141, + "step": 9552 + }, + { + "epoch": 0.4777, + "grad_norm": 7.090290546417236, + "learning_rate": 1.2497040485344585e-05, + "loss": 1.1428, + "step": 9554 + }, + { + "epoch": 0.4778, + "grad_norm": 6.467564105987549, + "learning_rate": 1.249366025115354e-05, + "loss": 1.1344, + "step": 9556 + }, + { + "epoch": 0.4779, + "grad_norm": 6.215028285980225, + "learning_rate": 1.249027971311756e-05, + "loss": 1.1511, + "step": 9558 + }, + { + "epoch": 0.478, + "grad_norm": 3.297825574874878, + "learning_rate": 1.2486898871648552e-05, + "loss": 0.8662, + "step": 9560 + }, + { + "epoch": 0.4781, + "grad_norm": 2.907838821411133, + "learning_rate": 1.2483517727158454e-05, + "loss": 0.7821, + "step": 9562 + }, + { + "epoch": 0.4782, + "grad_norm": 5.930795192718506, + "learning_rate": 1.2480136280059256e-05, + "loss": 1.1945, + "step": 9564 + }, + { + "epoch": 0.4783, + "grad_norm": 0.15661121904850006, + "learning_rate": 1.2476754530762977e-05, + "loss": 0.4372, + "step": 9566 + }, + { + "epoch": 0.4784, + "grad_norm": 3.5479910373687744, + "learning_rate": 1.2473372479681671e-05, + "loss": 0.7281, + "step": 9568 + }, + { + "epoch": 0.4785, + "grad_norm": 3.162285804748535, + "learning_rate": 1.2469990127227432e-05, + "loss": 0.7597, + "step": 9570 + }, + { + "epoch": 0.4786, + "grad_norm": 6.038173198699951, + "learning_rate": 1.2466607473812386e-05, + "loss": 0.5186, + "step": 9572 + }, + { + "epoch": 0.4787, + "grad_norm": 0.29802951216697693, + "learning_rate": 1.2463224519848703e-05, + "loss": 0.212, + "step": 9574 + }, + { + "epoch": 0.4788, + "grad_norm": 18.0944881439209, + "learning_rate": 1.2459841265748582e-05, + "loss": 0.6621, + "step": 9576 + }, + { + "epoch": 0.4789, + "grad_norm": 13.132084846496582, + "learning_rate": 1.2456457711924266e-05, + "loss": 1.8619, + "step": 9578 + }, + { + "epoch": 0.479, + "grad_norm": 3.02400279045105, + "learning_rate": 1.2453073858788027e-05, + "loss": 0.9078, + "step": 9580 + }, + { + "epoch": 0.4791, + "grad_norm": 6.12772274017334, + "learning_rate": 1.2449689706752179e-05, + "loss": 0.1527, + "step": 9582 + }, + { + "epoch": 0.4792, + "grad_norm": 4.586264610290527, + "learning_rate": 1.2446305256229074e-05, + "loss": 1.0982, + "step": 9584 + }, + { + "epoch": 0.4793, + "grad_norm": 8.567770957946777, + "learning_rate": 1.2442920507631093e-05, + "loss": 0.8678, + "step": 9586 + }, + { + "epoch": 0.4794, + "grad_norm": 0.7859114408493042, + "learning_rate": 1.2439535461370658e-05, + "loss": 0.6032, + "step": 9588 + }, + { + "epoch": 0.4795, + "grad_norm": 5.772732257843018, + "learning_rate": 1.2436150117860226e-05, + "loss": 0.5429, + "step": 9590 + }, + { + "epoch": 0.4796, + "grad_norm": 2.62892484664917, + "learning_rate": 1.2432764477512294e-05, + "loss": 0.7669, + "step": 9592 + }, + { + "epoch": 0.4797, + "grad_norm": 1.96256422996521, + "learning_rate": 1.2429378540739387e-05, + "loss": 1.2431, + "step": 9594 + }, + { + "epoch": 0.4798, + "grad_norm": 2.5552501678466797, + "learning_rate": 1.2425992307954075e-05, + "loss": 0.4772, + "step": 9596 + }, + { + "epoch": 0.4799, + "grad_norm": 7.006968021392822, + "learning_rate": 1.242260577956896e-05, + "loss": 0.9163, + "step": 9598 + }, + { + "epoch": 0.48, + "grad_norm": 8.617835998535156, + "learning_rate": 1.2419218955996677e-05, + "loss": 1.0305, + "step": 9600 + }, + { + "epoch": 0.4801, + "grad_norm": 6.345519065856934, + "learning_rate": 1.2415831837649905e-05, + "loss": 0.3748, + "step": 9602 + }, + { + "epoch": 0.4802, + "grad_norm": 3.0235424041748047, + "learning_rate": 1.241244442494135e-05, + "loss": 0.5456, + "step": 9604 + }, + { + "epoch": 0.4803, + "grad_norm": 4.857796669006348, + "learning_rate": 1.240905671828376e-05, + "loss": 1.4368, + "step": 9606 + }, + { + "epoch": 0.4804, + "grad_norm": 4.0113749504089355, + "learning_rate": 1.2405668718089918e-05, + "loss": 0.5927, + "step": 9608 + }, + { + "epoch": 0.4805, + "grad_norm": 3.367171287536621, + "learning_rate": 1.2402280424772639e-05, + "loss": 0.6275, + "step": 9610 + }, + { + "epoch": 0.4806, + "grad_norm": 4.250309944152832, + "learning_rate": 1.2398891838744777e-05, + "loss": 0.6693, + "step": 9612 + }, + { + "epoch": 0.4807, + "grad_norm": 2.8743934631347656, + "learning_rate": 1.2395502960419221e-05, + "loss": 0.8434, + "step": 9614 + }, + { + "epoch": 0.4808, + "grad_norm": 5.321511745452881, + "learning_rate": 1.2392113790208895e-05, + "loss": 0.8234, + "step": 9616 + }, + { + "epoch": 0.4809, + "grad_norm": 7.3600993156433105, + "learning_rate": 1.238872432852676e-05, + "loss": 0.5458, + "step": 9618 + }, + { + "epoch": 0.481, + "grad_norm": 4.214389801025391, + "learning_rate": 1.238533457578581e-05, + "loss": 0.5115, + "step": 9620 + }, + { + "epoch": 0.4811, + "grad_norm": 6.27351188659668, + "learning_rate": 1.2381944532399079e-05, + "loss": 1.236, + "step": 9622 + }, + { + "epoch": 0.4812, + "grad_norm": 2.3322038650512695, + "learning_rate": 1.2378554198779632e-05, + "loss": 0.8437, + "step": 9624 + }, + { + "epoch": 0.4813, + "grad_norm": 13.070283889770508, + "learning_rate": 1.237516357534057e-05, + "loss": 0.9836, + "step": 9626 + }, + { + "epoch": 0.4814, + "grad_norm": 2.1867902278900146, + "learning_rate": 1.2371772662495031e-05, + "loss": 0.9723, + "step": 9628 + }, + { + "epoch": 0.4815, + "grad_norm": 6.1709065437316895, + "learning_rate": 1.236838146065619e-05, + "loss": 0.6512, + "step": 9630 + }, + { + "epoch": 0.4816, + "grad_norm": 3.127445697784424, + "learning_rate": 1.236498997023725e-05, + "loss": 0.8907, + "step": 9632 + }, + { + "epoch": 0.4817, + "grad_norm": 6.536229133605957, + "learning_rate": 1.2361598191651453e-05, + "loss": 1.1174, + "step": 9634 + }, + { + "epoch": 0.4818, + "grad_norm": 3.6246166229248047, + "learning_rate": 1.2358206125312085e-05, + "loss": 0.7231, + "step": 9636 + }, + { + "epoch": 0.4819, + "grad_norm": 6.607795715332031, + "learning_rate": 1.2354813771632447e-05, + "loss": 1.563, + "step": 9638 + }, + { + "epoch": 0.482, + "grad_norm": 6.9421281814575195, + "learning_rate": 1.23514211310259e-05, + "loss": 0.4078, + "step": 9640 + }, + { + "epoch": 0.4821, + "grad_norm": 9.876409530639648, + "learning_rate": 1.234802820390582e-05, + "loss": 0.7237, + "step": 9642 + }, + { + "epoch": 0.4822, + "grad_norm": 7.416710376739502, + "learning_rate": 1.2344634990685624e-05, + "loss": 1.2001, + "step": 9644 + }, + { + "epoch": 0.4823, + "grad_norm": 3.638911008834839, + "learning_rate": 1.2341241491778771e-05, + "loss": 0.6345, + "step": 9646 + }, + { + "epoch": 0.4824, + "grad_norm": 2.534850597381592, + "learning_rate": 1.2337847707598738e-05, + "loss": 1.0608, + "step": 9648 + }, + { + "epoch": 0.4825, + "grad_norm": 6.175812244415283, + "learning_rate": 1.2334453638559057e-05, + "loss": 0.7147, + "step": 9650 + }, + { + "epoch": 0.4826, + "grad_norm": 6.373758792877197, + "learning_rate": 1.233105928507328e-05, + "loss": 0.7002, + "step": 9652 + }, + { + "epoch": 0.4827, + "grad_norm": 3.037646532058716, + "learning_rate": 1.2327664647554998e-05, + "loss": 0.88, + "step": 9654 + }, + { + "epoch": 0.4828, + "grad_norm": 2.210761785507202, + "learning_rate": 1.2324269726417841e-05, + "loss": 0.7019, + "step": 9656 + }, + { + "epoch": 0.4829, + "grad_norm": 3.7542600631713867, + "learning_rate": 1.2320874522075467e-05, + "loss": 1.136, + "step": 9658 + }, + { + "epoch": 0.483, + "grad_norm": 2.8683736324310303, + "learning_rate": 1.2317479034941572e-05, + "loss": 1.8192, + "step": 9660 + }, + { + "epoch": 0.4831, + "grad_norm": 2.4873623847961426, + "learning_rate": 1.231408326542989e-05, + "loss": 0.767, + "step": 9662 + }, + { + "epoch": 0.4832, + "grad_norm": 10.103681564331055, + "learning_rate": 1.2310687213954182e-05, + "loss": 0.6672, + "step": 9664 + }, + { + "epoch": 0.4833, + "grad_norm": 4.31895637512207, + "learning_rate": 1.230729088092824e-05, + "loss": 1.3143, + "step": 9666 + }, + { + "epoch": 0.4834, + "grad_norm": 7.7421064376831055, + "learning_rate": 1.2303894266765908e-05, + "loss": 1.1434, + "step": 9668 + }, + { + "epoch": 0.4835, + "grad_norm": 2.2857329845428467, + "learning_rate": 1.2300497371881046e-05, + "loss": 1.4546, + "step": 9670 + }, + { + "epoch": 0.4836, + "grad_norm": 9.288818359375, + "learning_rate": 1.2297100196687557e-05, + "loss": 1.2373, + "step": 9672 + }, + { + "epoch": 0.4837, + "grad_norm": 5.116462707519531, + "learning_rate": 1.2293702741599378e-05, + "loss": 1.2343, + "step": 9674 + }, + { + "epoch": 0.4838, + "grad_norm": 4.15347146987915, + "learning_rate": 1.2290305007030479e-05, + "loss": 1.0809, + "step": 9676 + }, + { + "epoch": 0.4839, + "grad_norm": 2.800114154815674, + "learning_rate": 1.2286906993394856e-05, + "loss": 0.6992, + "step": 9678 + }, + { + "epoch": 0.484, + "grad_norm": 3.613544225692749, + "learning_rate": 1.2283508701106559e-05, + "loss": 1.1151, + "step": 9680 + }, + { + "epoch": 0.4841, + "grad_norm": 5.452028751373291, + "learning_rate": 1.2280110130579651e-05, + "loss": 0.8367, + "step": 9682 + }, + { + "epoch": 0.4842, + "grad_norm": 4.628815174102783, + "learning_rate": 1.2276711282228241e-05, + "loss": 1.337, + "step": 9684 + }, + { + "epoch": 0.4843, + "grad_norm": 3.681208610534668, + "learning_rate": 1.2273312156466466e-05, + "loss": 1.056, + "step": 9686 + }, + { + "epoch": 0.4844, + "grad_norm": 7.271030426025391, + "learning_rate": 1.2269912753708502e-05, + "loss": 0.5199, + "step": 9688 + }, + { + "epoch": 0.4845, + "grad_norm": 4.5697455406188965, + "learning_rate": 1.2266513074368552e-05, + "loss": 0.6968, + "step": 9690 + }, + { + "epoch": 0.4846, + "grad_norm": 2.7156283855438232, + "learning_rate": 1.226311311886086e-05, + "loss": 0.9989, + "step": 9692 + }, + { + "epoch": 0.4847, + "grad_norm": 3.8859710693359375, + "learning_rate": 1.22597128875997e-05, + "loss": 0.3649, + "step": 9694 + }, + { + "epoch": 0.4848, + "grad_norm": 2.713040590286255, + "learning_rate": 1.2256312380999376e-05, + "loss": 0.6517, + "step": 9696 + }, + { + "epoch": 0.4849, + "grad_norm": 1.952035665512085, + "learning_rate": 1.2252911599474237e-05, + "loss": 0.6685, + "step": 9698 + }, + { + "epoch": 0.485, + "grad_norm": 3.5823569297790527, + "learning_rate": 1.2249510543438652e-05, + "loss": 0.7983, + "step": 9700 + }, + { + "epoch": 0.4851, + "grad_norm": 8.155536651611328, + "learning_rate": 1.224610921330703e-05, + "loss": 1.2665, + "step": 9702 + }, + { + "epoch": 0.4852, + "grad_norm": 7.813540935516357, + "learning_rate": 1.2242707609493814e-05, + "loss": 1.5657, + "step": 9704 + }, + { + "epoch": 0.4853, + "grad_norm": 3.569730520248413, + "learning_rate": 1.2239305732413477e-05, + "loss": 0.5345, + "step": 9706 + }, + { + "epoch": 0.4854, + "grad_norm": 2.930570125579834, + "learning_rate": 1.223590358248053e-05, + "loss": 0.2075, + "step": 9708 + }, + { + "epoch": 0.4855, + "grad_norm": 7.324640274047852, + "learning_rate": 1.2232501160109516e-05, + "loss": 1.9717, + "step": 9710 + }, + { + "epoch": 0.4856, + "grad_norm": 2.918759346008301, + "learning_rate": 1.2229098465715005e-05, + "loss": 0.8185, + "step": 9712 + }, + { + "epoch": 0.4857, + "grad_norm": 3.215386152267456, + "learning_rate": 1.2225695499711607e-05, + "loss": 0.8038, + "step": 9714 + }, + { + "epoch": 0.4858, + "grad_norm": 2.869642734527588, + "learning_rate": 1.2222292262513967e-05, + "loss": 1.4296, + "step": 9716 + }, + { + "epoch": 0.4859, + "grad_norm": 1.5412944555282593, + "learning_rate": 1.2218888754536753e-05, + "loss": 0.3207, + "step": 9718 + }, + { + "epoch": 0.486, + "grad_norm": 3.650346517562866, + "learning_rate": 1.2215484976194675e-05, + "loss": 0.4321, + "step": 9720 + }, + { + "epoch": 0.4861, + "grad_norm": 9.473246574401855, + "learning_rate": 1.2212080927902474e-05, + "loss": 1.5925, + "step": 9722 + }, + { + "epoch": 0.4862, + "grad_norm": 2.4412031173706055, + "learning_rate": 1.220867661007492e-05, + "loss": 0.4016, + "step": 9724 + }, + { + "epoch": 0.4863, + "grad_norm": 1.547369360923767, + "learning_rate": 1.2205272023126822e-05, + "loss": 0.7525, + "step": 9726 + }, + { + "epoch": 0.4864, + "grad_norm": 2.439836263656616, + "learning_rate": 1.2201867167473015e-05, + "loss": 1.1079, + "step": 9728 + }, + { + "epoch": 0.4865, + "grad_norm": 9.714815139770508, + "learning_rate": 1.2198462043528376e-05, + "loss": 1.4824, + "step": 9730 + }, + { + "epoch": 0.4866, + "grad_norm": 1.6891977787017822, + "learning_rate": 1.2195056651707806e-05, + "loss": 0.4305, + "step": 9732 + }, + { + "epoch": 0.4867, + "grad_norm": 4.899801731109619, + "learning_rate": 1.2191650992426238e-05, + "loss": 0.7688, + "step": 9734 + }, + { + "epoch": 0.4868, + "grad_norm": 3.8657925128936768, + "learning_rate": 1.2188245066098647e-05, + "loss": 0.9151, + "step": 9736 + }, + { + "epoch": 0.4869, + "grad_norm": 2.0803451538085938, + "learning_rate": 1.2184838873140032e-05, + "loss": 1.1814, + "step": 9738 + }, + { + "epoch": 0.487, + "grad_norm": 3.132219076156616, + "learning_rate": 1.2181432413965428e-05, + "loss": 0.7017, + "step": 9740 + }, + { + "epoch": 0.4871, + "grad_norm": 2.6895627975463867, + "learning_rate": 1.21780256889899e-05, + "loss": 0.6904, + "step": 9742 + }, + { + "epoch": 0.4872, + "grad_norm": 7.455280303955078, + "learning_rate": 1.217461869862855e-05, + "loss": 1.532, + "step": 9744 + }, + { + "epoch": 0.4873, + "grad_norm": 3.542447090148926, + "learning_rate": 1.2171211443296505e-05, + "loss": 0.787, + "step": 9746 + }, + { + "epoch": 0.4874, + "grad_norm": 2.5107204914093018, + "learning_rate": 1.2167803923408935e-05, + "loss": 2.5396, + "step": 9748 + }, + { + "epoch": 0.4875, + "grad_norm": 5.218584060668945, + "learning_rate": 1.2164396139381029e-05, + "loss": 1.2196, + "step": 9750 + }, + { + "epoch": 0.4876, + "grad_norm": 3.6395747661590576, + "learning_rate": 1.2160988091628023e-05, + "loss": 0.8472, + "step": 9752 + }, + { + "epoch": 0.4877, + "grad_norm": 7.766417980194092, + "learning_rate": 1.215757978056517e-05, + "loss": 0.5692, + "step": 9754 + }, + { + "epoch": 0.4878, + "grad_norm": 8.944314956665039, + "learning_rate": 1.2154171206607765e-05, + "loss": 0.5552, + "step": 9756 + }, + { + "epoch": 0.4879, + "grad_norm": 3.1755712032318115, + "learning_rate": 1.2150762370171137e-05, + "loss": 1.1371, + "step": 9758 + }, + { + "epoch": 0.488, + "grad_norm": 4.38138484954834, + "learning_rate": 1.2147353271670634e-05, + "loss": 0.4126, + "step": 9760 + }, + { + "epoch": 0.4881, + "grad_norm": 4.142208576202393, + "learning_rate": 1.2143943911521647e-05, + "loss": 0.7538, + "step": 9762 + }, + { + "epoch": 0.4882, + "grad_norm": 6.070943832397461, + "learning_rate": 1.2140534290139601e-05, + "loss": 1.104, + "step": 9764 + }, + { + "epoch": 0.4883, + "grad_norm": 12.094667434692383, + "learning_rate": 1.2137124407939944e-05, + "loss": 0.7106, + "step": 9766 + }, + { + "epoch": 0.4884, + "grad_norm": 2.8196446895599365, + "learning_rate": 1.2133714265338162e-05, + "loss": 0.8993, + "step": 9768 + }, + { + "epoch": 0.4885, + "grad_norm": 6.432093143463135, + "learning_rate": 1.2130303862749769e-05, + "loss": 0.8552, + "step": 9770 + }, + { + "epoch": 0.4886, + "grad_norm": 18.948104858398438, + "learning_rate": 1.2126893200590309e-05, + "loss": 2.0072, + "step": 9772 + }, + { + "epoch": 0.4887, + "grad_norm": 3.275167226791382, + "learning_rate": 1.2123482279275364e-05, + "loss": 0.4037, + "step": 9774 + }, + { + "epoch": 0.4888, + "grad_norm": 4.560431003570557, + "learning_rate": 1.212007109922055e-05, + "loss": 1.2996, + "step": 9776 + }, + { + "epoch": 0.4889, + "grad_norm": 3.4564242362976074, + "learning_rate": 1.2116659660841499e-05, + "loss": 1.1661, + "step": 9778 + }, + { + "epoch": 0.489, + "grad_norm": 4.065776824951172, + "learning_rate": 1.211324796455389e-05, + "loss": 0.8924, + "step": 9780 + }, + { + "epoch": 0.4891, + "grad_norm": 0.5435367226600647, + "learning_rate": 1.2109836010773423e-05, + "loss": 0.8238, + "step": 9782 + }, + { + "epoch": 0.4892, + "grad_norm": 10.136314392089844, + "learning_rate": 1.2106423799915841e-05, + "loss": 2.4761, + "step": 9784 + }, + { + "epoch": 0.4893, + "grad_norm": 6.491214752197266, + "learning_rate": 1.2103011332396909e-05, + "loss": 0.5015, + "step": 9786 + }, + { + "epoch": 0.4894, + "grad_norm": 2.1237399578094482, + "learning_rate": 1.2099598608632427e-05, + "loss": 1.4507, + "step": 9788 + }, + { + "epoch": 0.4895, + "grad_norm": 3.835263967514038, + "learning_rate": 1.2096185629038219e-05, + "loss": 0.8363, + "step": 9790 + }, + { + "epoch": 0.4896, + "grad_norm": 7.059875965118408, + "learning_rate": 1.2092772394030153e-05, + "loss": 0.7181, + "step": 9792 + }, + { + "epoch": 0.4897, + "grad_norm": 4.99864387512207, + "learning_rate": 1.2089358904024117e-05, + "loss": 0.7596, + "step": 9794 + }, + { + "epoch": 0.4898, + "grad_norm": 4.480491638183594, + "learning_rate": 1.208594515943604e-05, + "loss": 1.5303, + "step": 9796 + }, + { + "epoch": 0.4899, + "grad_norm": 3.4552876949310303, + "learning_rate": 1.208253116068187e-05, + "loss": 1.0527, + "step": 9798 + }, + { + "epoch": 0.49, + "grad_norm": 6.377309799194336, + "learning_rate": 1.2079116908177592e-05, + "loss": 1.3906, + "step": 9800 + }, + { + "epoch": 0.4901, + "grad_norm": 3.582801580429077, + "learning_rate": 1.2075702402339231e-05, + "loss": 0.4555, + "step": 9802 + }, + { + "epoch": 0.4902, + "grad_norm": 1.493849515914917, + "learning_rate": 1.2072287643582825e-05, + "loss": 1.4624, + "step": 9804 + }, + { + "epoch": 0.4903, + "grad_norm": 2.2941300868988037, + "learning_rate": 1.206887263232446e-05, + "loss": 0.8819, + "step": 9806 + }, + { + "epoch": 0.4904, + "grad_norm": 12.967742919921875, + "learning_rate": 1.2065457368980236e-05, + "loss": 1.8335, + "step": 9808 + }, + { + "epoch": 0.4905, + "grad_norm": 2.2355897426605225, + "learning_rate": 1.2062041853966298e-05, + "loss": 0.5677, + "step": 9810 + }, + { + "epoch": 0.4906, + "grad_norm": 3.621666431427002, + "learning_rate": 1.2058626087698814e-05, + "loss": 1.7427, + "step": 9812 + }, + { + "epoch": 0.4907, + "grad_norm": 25.278778076171875, + "learning_rate": 1.2055210070593987e-05, + "loss": 0.5777, + "step": 9814 + }, + { + "epoch": 0.4908, + "grad_norm": 7.426086902618408, + "learning_rate": 1.2051793803068046e-05, + "loss": 0.6978, + "step": 9816 + }, + { + "epoch": 0.4909, + "grad_norm": 10.279983520507812, + "learning_rate": 1.2048377285537256e-05, + "loss": 0.9184, + "step": 9818 + }, + { + "epoch": 0.491, + "grad_norm": 4.287988662719727, + "learning_rate": 1.2044960518417902e-05, + "loss": 0.7286, + "step": 9820 + }, + { + "epoch": 0.4911, + "grad_norm": 2.8327653408050537, + "learning_rate": 1.2041543502126317e-05, + "loss": 0.6118, + "step": 9822 + }, + { + "epoch": 0.4912, + "grad_norm": 10.655893325805664, + "learning_rate": 1.203812623707885e-05, + "loss": 0.8643, + "step": 9824 + }, + { + "epoch": 0.4913, + "grad_norm": 3.577101469039917, + "learning_rate": 1.203470872369188e-05, + "loss": 1.0519, + "step": 9826 + }, + { + "epoch": 0.4914, + "grad_norm": 10.816025733947754, + "learning_rate": 1.2031290962381823e-05, + "loss": 1.8936, + "step": 9828 + }, + { + "epoch": 0.4915, + "grad_norm": 7.64378023147583, + "learning_rate": 1.2027872953565125e-05, + "loss": 1.3792, + "step": 9830 + }, + { + "epoch": 0.4916, + "grad_norm": 5.7048773765563965, + "learning_rate": 1.202445469765826e-05, + "loss": 1.007, + "step": 9832 + }, + { + "epoch": 0.4917, + "grad_norm": 4.493236541748047, + "learning_rate": 1.2021036195077731e-05, + "loss": 0.8554, + "step": 9834 + }, + { + "epoch": 0.4918, + "grad_norm": 12.477187156677246, + "learning_rate": 1.201761744624007e-05, + "loss": 1.3647, + "step": 9836 + }, + { + "epoch": 0.4919, + "grad_norm": 6.492858409881592, + "learning_rate": 1.2014198451561843e-05, + "loss": 1.3582, + "step": 9838 + }, + { + "epoch": 0.492, + "grad_norm": 2.810126304626465, + "learning_rate": 1.2010779211459649e-05, + "loss": 2.0716, + "step": 9840 + }, + { + "epoch": 0.4921, + "grad_norm": 3.0176196098327637, + "learning_rate": 1.2007359726350104e-05, + "loss": 1.2969, + "step": 9842 + }, + { + "epoch": 0.4922, + "grad_norm": 7.05873441696167, + "learning_rate": 1.2003939996649864e-05, + "loss": 0.3895, + "step": 9844 + }, + { + "epoch": 0.4923, + "grad_norm": 3.477430582046509, + "learning_rate": 1.2000520022775618e-05, + "loss": 1.0356, + "step": 9846 + }, + { + "epoch": 0.4924, + "grad_norm": 7.658217906951904, + "learning_rate": 1.1997099805144071e-05, + "loss": 0.838, + "step": 9848 + }, + { + "epoch": 0.4925, + "grad_norm": 8.18468189239502, + "learning_rate": 1.1993679344171973e-05, + "loss": 1.1087, + "step": 9850 + }, + { + "epoch": 0.4926, + "grad_norm": 1.8966267108917236, + "learning_rate": 1.1990258640276094e-05, + "loss": 0.7866, + "step": 9852 + }, + { + "epoch": 0.4927, + "grad_norm": 2.3197503089904785, + "learning_rate": 1.1986837693873237e-05, + "loss": 0.9294, + "step": 9854 + }, + { + "epoch": 0.4928, + "grad_norm": 4.618279933929443, + "learning_rate": 1.1983416505380234e-05, + "loss": 0.9482, + "step": 9856 + }, + { + "epoch": 0.4929, + "grad_norm": 4.363330841064453, + "learning_rate": 1.1979995075213946e-05, + "loss": 1.2554, + "step": 9858 + }, + { + "epoch": 0.493, + "grad_norm": 4.789938926696777, + "learning_rate": 1.1976573403791263e-05, + "loss": 1.3653, + "step": 9860 + }, + { + "epoch": 0.4931, + "grad_norm": 4.978847503662109, + "learning_rate": 1.1973151491529106e-05, + "loss": 0.7208, + "step": 9862 + }, + { + "epoch": 0.4932, + "grad_norm": 2.3675427436828613, + "learning_rate": 1.1969729338844429e-05, + "loss": 0.5891, + "step": 9864 + }, + { + "epoch": 0.4933, + "grad_norm": 2.763092279434204, + "learning_rate": 1.19663069461542e-05, + "loss": 0.397, + "step": 9866 + }, + { + "epoch": 0.4934, + "grad_norm": 4.864582061767578, + "learning_rate": 1.196288431387544e-05, + "loss": 0.8519, + "step": 9868 + }, + { + "epoch": 0.4935, + "grad_norm": 4.014121055603027, + "learning_rate": 1.1959461442425178e-05, + "loss": 1.12, + "step": 9870 + }, + { + "epoch": 0.4936, + "grad_norm": 4.543071269989014, + "learning_rate": 1.1956038332220484e-05, + "loss": 1.1749, + "step": 9872 + }, + { + "epoch": 0.4937, + "grad_norm": 8.106965065002441, + "learning_rate": 1.1952614983678452e-05, + "loss": 0.4683, + "step": 9874 + }, + { + "epoch": 0.4938, + "grad_norm": 5.726263046264648, + "learning_rate": 1.1949191397216207e-05, + "loss": 1.1291, + "step": 9876 + }, + { + "epoch": 0.4939, + "grad_norm": 5.309007167816162, + "learning_rate": 1.1945767573250904e-05, + "loss": 0.5522, + "step": 9878 + }, + { + "epoch": 0.494, + "grad_norm": 8.65357780456543, + "learning_rate": 1.194234351219972e-05, + "loss": 0.5542, + "step": 9880 + }, + { + "epoch": 0.4941, + "grad_norm": 6.471100330352783, + "learning_rate": 1.1938919214479876e-05, + "loss": 1.1015, + "step": 9882 + }, + { + "epoch": 0.4942, + "grad_norm": 2.1868410110473633, + "learning_rate": 1.1935494680508606e-05, + "loss": 0.5148, + "step": 9884 + }, + { + "epoch": 0.4943, + "grad_norm": 16.383243560791016, + "learning_rate": 1.1932069910703176e-05, + "loss": 1.0937, + "step": 9886 + }, + { + "epoch": 0.4944, + "grad_norm": 3.069474220275879, + "learning_rate": 1.192864490548089e-05, + "loss": 1.719, + "step": 9888 + }, + { + "epoch": 0.4945, + "grad_norm": 2.9230668544769287, + "learning_rate": 1.1925219665259076e-05, + "loss": 1.4257, + "step": 9890 + }, + { + "epoch": 0.4946, + "grad_norm": 4.7350287437438965, + "learning_rate": 1.1921794190455082e-05, + "loss": 1.2059, + "step": 9892 + }, + { + "epoch": 0.4947, + "grad_norm": 3.425025463104248, + "learning_rate": 1.1918368481486297e-05, + "loss": 1.197, + "step": 9894 + }, + { + "epoch": 0.4948, + "grad_norm": 1.131122350692749, + "learning_rate": 1.191494253877013e-05, + "loss": 0.7803, + "step": 9896 + }, + { + "epoch": 0.4949, + "grad_norm": 3.7569565773010254, + "learning_rate": 1.1911516362724024e-05, + "loss": 0.4511, + "step": 9898 + }, + { + "epoch": 0.495, + "grad_norm": 4.706131458282471, + "learning_rate": 1.190808995376545e-05, + "loss": 1.6316, + "step": 9900 + }, + { + "epoch": 0.4951, + "grad_norm": 8.769915580749512, + "learning_rate": 1.1904663312311902e-05, + "loss": 0.563, + "step": 9902 + }, + { + "epoch": 0.4952, + "grad_norm": 1.4325964450836182, + "learning_rate": 1.1901236438780902e-05, + "loss": 0.281, + "step": 9904 + }, + { + "epoch": 0.4953, + "grad_norm": 3.83693265914917, + "learning_rate": 1.1897809333590014e-05, + "loss": 0.5301, + "step": 9906 + }, + { + "epoch": 0.4954, + "grad_norm": 19.650043487548828, + "learning_rate": 1.1894381997156814e-05, + "loss": 2.5976, + "step": 9908 + }, + { + "epoch": 0.4955, + "grad_norm": 3.0171706676483154, + "learning_rate": 1.1890954429898914e-05, + "loss": 0.6075, + "step": 9910 + }, + { + "epoch": 0.4956, + "grad_norm": 2.87839412689209, + "learning_rate": 1.1887526632233954e-05, + "loss": 0.3016, + "step": 9912 + }, + { + "epoch": 0.4957, + "grad_norm": 3.5264904499053955, + "learning_rate": 1.1884098604579597e-05, + "loss": 0.759, + "step": 9914 + }, + { + "epoch": 0.4958, + "grad_norm": 3.9795026779174805, + "learning_rate": 1.188067034735354e-05, + "loss": 0.8106, + "step": 9916 + }, + { + "epoch": 0.4959, + "grad_norm": 3.6785988807678223, + "learning_rate": 1.1877241860973508e-05, + "loss": 0.9199, + "step": 9918 + }, + { + "epoch": 0.496, + "grad_norm": 8.45374584197998, + "learning_rate": 1.187381314585725e-05, + "loss": 0.9269, + "step": 9920 + }, + { + "epoch": 0.4961, + "grad_norm": 1.9192171096801758, + "learning_rate": 1.187038420242254e-05, + "loss": 0.6514, + "step": 9922 + }, + { + "epoch": 0.4962, + "grad_norm": 1.4629027843475342, + "learning_rate": 1.186695503108719e-05, + "loss": 0.1281, + "step": 9924 + }, + { + "epoch": 0.4963, + "grad_norm": 7.848942279815674, + "learning_rate": 1.1863525632269034e-05, + "loss": 0.7448, + "step": 9926 + }, + { + "epoch": 0.4964, + "grad_norm": 4.1048903465271, + "learning_rate": 1.186009600638593e-05, + "loss": 0.6919, + "step": 9928 + }, + { + "epoch": 0.4965, + "grad_norm": 3.203733205795288, + "learning_rate": 1.1856666153855776e-05, + "loss": 1.5899, + "step": 9930 + }, + { + "epoch": 0.4966, + "grad_norm": 9.252974510192871, + "learning_rate": 1.1853236075096474e-05, + "loss": 0.6633, + "step": 9932 + }, + { + "epoch": 0.4967, + "grad_norm": 6.561268329620361, + "learning_rate": 1.1849805770525984e-05, + "loss": 0.2897, + "step": 9934 + }, + { + "epoch": 0.4968, + "grad_norm": 13.425606727600098, + "learning_rate": 1.184637524056227e-05, + "loss": 0.9975, + "step": 9936 + }, + { + "epoch": 0.4969, + "grad_norm": 4.0238871574401855, + "learning_rate": 1.1842944485623335e-05, + "loss": 1.3699, + "step": 9938 + }, + { + "epoch": 0.497, + "grad_norm": 0.762266993522644, + "learning_rate": 1.1839513506127202e-05, + "loss": 0.5178, + "step": 9940 + }, + { + "epoch": 0.4971, + "grad_norm": 4.230667591094971, + "learning_rate": 1.1836082302491931e-05, + "loss": 0.7375, + "step": 9942 + }, + { + "epoch": 0.4972, + "grad_norm": 4.069673538208008, + "learning_rate": 1.1832650875135599e-05, + "loss": 0.937, + "step": 9944 + }, + { + "epoch": 0.4973, + "grad_norm": 4.456091403961182, + "learning_rate": 1.1829219224476318e-05, + "loss": 1.341, + "step": 9946 + }, + { + "epoch": 0.4974, + "grad_norm": 5.344761371612549, + "learning_rate": 1.1825787350932224e-05, + "loss": 1.318, + "step": 9948 + }, + { + "epoch": 0.4975, + "grad_norm": 5.775989532470703, + "learning_rate": 1.1822355254921478e-05, + "loss": 0.6481, + "step": 9950 + }, + { + "epoch": 0.4976, + "grad_norm": 3.1450462341308594, + "learning_rate": 1.181892293686227e-05, + "loss": 0.4532, + "step": 9952 + }, + { + "epoch": 0.4977, + "grad_norm": 4.320701599121094, + "learning_rate": 1.1815490397172822e-05, + "loss": 0.6739, + "step": 9954 + }, + { + "epoch": 0.4978, + "grad_norm": 2.7704195976257324, + "learning_rate": 1.1812057636271374e-05, + "loss": 1.3923, + "step": 9956 + }, + { + "epoch": 0.4979, + "grad_norm": 0.6585774421691895, + "learning_rate": 1.1808624654576202e-05, + "loss": 0.4957, + "step": 9958 + }, + { + "epoch": 0.498, + "grad_norm": 4.590575218200684, + "learning_rate": 1.1805191452505602e-05, + "loss": 1.1567, + "step": 9960 + }, + { + "epoch": 0.4981, + "grad_norm": 2.79784893989563, + "learning_rate": 1.1801758030477897e-05, + "loss": 1.4091, + "step": 9962 + }, + { + "epoch": 0.4982, + "grad_norm": 25.843847274780273, + "learning_rate": 1.1798324388911445e-05, + "loss": 0.6828, + "step": 9964 + }, + { + "epoch": 0.4983, + "grad_norm": 13.665376663208008, + "learning_rate": 1.1794890528224619e-05, + "loss": 0.8556, + "step": 9966 + }, + { + "epoch": 0.4984, + "grad_norm": 2.668562173843384, + "learning_rate": 1.1791456448835825e-05, + "loss": 0.557, + "step": 9968 + }, + { + "epoch": 0.4985, + "grad_norm": 2.3997819423675537, + "learning_rate": 1.1788022151163497e-05, + "loss": 0.6619, + "step": 9970 + }, + { + "epoch": 0.4986, + "grad_norm": 6.823646545410156, + "learning_rate": 1.1784587635626095e-05, + "loss": 1.5185, + "step": 9972 + }, + { + "epoch": 0.4987, + "grad_norm": 1.906243085861206, + "learning_rate": 1.17811529026421e-05, + "loss": 1.3183, + "step": 9974 + }, + { + "epoch": 0.4988, + "grad_norm": 1.3361669778823853, + "learning_rate": 1.1777717952630033e-05, + "loss": 0.6329, + "step": 9976 + }, + { + "epoch": 0.4989, + "grad_norm": 6.756249904632568, + "learning_rate": 1.1774282786008422e-05, + "loss": 1.492, + "step": 9978 + }, + { + "epoch": 0.499, + "grad_norm": 3.23209547996521, + "learning_rate": 1.1770847403195836e-05, + "loss": 0.84, + "step": 9980 + }, + { + "epoch": 0.4991, + "grad_norm": 6.674045562744141, + "learning_rate": 1.1767411804610864e-05, + "loss": 0.9361, + "step": 9982 + }, + { + "epoch": 0.4992, + "grad_norm": 6.834822654724121, + "learning_rate": 1.1763975990672125e-05, + "loss": 1.2166, + "step": 9984 + }, + { + "epoch": 0.4993, + "grad_norm": 4.827341556549072, + "learning_rate": 1.1760539961798263e-05, + "loss": 0.513, + "step": 9986 + }, + { + "epoch": 0.4994, + "grad_norm": 3.847322940826416, + "learning_rate": 1.1757103718407948e-05, + "loss": 0.9851, + "step": 9988 + }, + { + "epoch": 0.4995, + "grad_norm": 3.8296239376068115, + "learning_rate": 1.1753667260919872e-05, + "loss": 0.9614, + "step": 9990 + }, + { + "epoch": 0.4996, + "grad_norm": 5.808821678161621, + "learning_rate": 1.1750230589752763e-05, + "loss": 1.2444, + "step": 9992 + }, + { + "epoch": 0.4997, + "grad_norm": 3.2424697875976562, + "learning_rate": 1.1746793705325363e-05, + "loss": 0.6076, + "step": 9994 + }, + { + "epoch": 0.4998, + "grad_norm": 8.83584213256836, + "learning_rate": 1.1743356608056448e-05, + "loss": 1.1069, + "step": 9996 + }, + { + "epoch": 0.4999, + "grad_norm": 5.02914571762085, + "learning_rate": 1.173991929836482e-05, + "loss": 0.9277, + "step": 9998 + }, + { + "epoch": 0.5, + "grad_norm": 3.593430995941162, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.965, + "step": 10000 + }, + { + "epoch": 0.5001, + "grad_norm": 2.906991720199585, + "learning_rate": 1.1733044043388752e-05, + "loss": 0.5218, + "step": 10002 + }, + { + "epoch": 0.5002, + "grad_norm": 8.287283897399902, + "learning_rate": 1.1729606098942039e-05, + "loss": 1.3239, + "step": 10004 + }, + { + "epoch": 0.5003, + "grad_norm": 3.8756725788116455, + "learning_rate": 1.1726167943748068e-05, + "loss": 1.2582, + "step": 10006 + }, + { + "epoch": 0.5004, + "grad_norm": 16.9570255279541, + "learning_rate": 1.1722729578225769e-05, + "loss": 0.9177, + "step": 10008 + }, + { + "epoch": 0.5005, + "grad_norm": 7.067464828491211, + "learning_rate": 1.1719291002794096e-05, + "loss": 1.4238, + "step": 10010 + }, + { + "epoch": 0.5006, + "grad_norm": 3.7788920402526855, + "learning_rate": 1.171585221787203e-05, + "loss": 1.337, + "step": 10012 + }, + { + "epoch": 0.5007, + "grad_norm": 3.2786476612091064, + "learning_rate": 1.1712413223878577e-05, + "loss": 1.862, + "step": 10014 + }, + { + "epoch": 0.5008, + "grad_norm": 6.26818323135376, + "learning_rate": 1.1708974021232768e-05, + "loss": 1.0146, + "step": 10016 + }, + { + "epoch": 0.5009, + "grad_norm": 3.1058807373046875, + "learning_rate": 1.1705534610353657e-05, + "loss": 0.5084, + "step": 10018 + }, + { + "epoch": 0.501, + "grad_norm": 7.0128173828125, + "learning_rate": 1.1702094991660326e-05, + "loss": 0.621, + "step": 10020 + }, + { + "epoch": 0.5011, + "grad_norm": 7.626229286193848, + "learning_rate": 1.1698655165571886e-05, + "loss": 0.7426, + "step": 10022 + }, + { + "epoch": 0.5012, + "grad_norm": 3.4865105152130127, + "learning_rate": 1.1695215132507465e-05, + "loss": 0.3421, + "step": 10024 + }, + { + "epoch": 0.5013, + "grad_norm": 4.186163902282715, + "learning_rate": 1.1691774892886223e-05, + "loss": 0.2619, + "step": 10026 + }, + { + "epoch": 0.5014, + "grad_norm": 4.802762031555176, + "learning_rate": 1.1688334447127338e-05, + "loss": 1.2677, + "step": 10028 + }, + { + "epoch": 0.5015, + "grad_norm": 3.432342529296875, + "learning_rate": 1.1684893795650028e-05, + "loss": 0.8849, + "step": 10030 + }, + { + "epoch": 0.5016, + "grad_norm": 4.424459457397461, + "learning_rate": 1.1681452938873516e-05, + "loss": 1.443, + "step": 10032 + }, + { + "epoch": 0.5017, + "grad_norm": 4.183582305908203, + "learning_rate": 1.1678011877217065e-05, + "loss": 1.0634, + "step": 10034 + }, + { + "epoch": 0.5018, + "grad_norm": 7.808913707733154, + "learning_rate": 1.1674570611099956e-05, + "loss": 0.5277, + "step": 10036 + }, + { + "epoch": 0.5019, + "grad_norm": 3.0171401500701904, + "learning_rate": 1.16711291409415e-05, + "loss": 0.5972, + "step": 10038 + }, + { + "epoch": 0.502, + "grad_norm": 2.2303411960601807, + "learning_rate": 1.1667687467161025e-05, + "loss": 0.2665, + "step": 10040 + }, + { + "epoch": 0.5021, + "grad_norm": 3.9571056365966797, + "learning_rate": 1.1664245590177891e-05, + "loss": 0.7706, + "step": 10042 + }, + { + "epoch": 0.5022, + "grad_norm": 5.607382297515869, + "learning_rate": 1.166080351041148e-05, + "loss": 0.7895, + "step": 10044 + }, + { + "epoch": 0.5023, + "grad_norm": 6.169378757476807, + "learning_rate": 1.1657361228281198e-05, + "loss": 1.1959, + "step": 10046 + }, + { + "epoch": 0.5024, + "grad_norm": 5.119609355926514, + "learning_rate": 1.1653918744206478e-05, + "loss": 1.4104, + "step": 10048 + }, + { + "epoch": 0.5025, + "grad_norm": 7.458414077758789, + "learning_rate": 1.1650476058606776e-05, + "loss": 1.7786, + "step": 10050 + }, + { + "epoch": 0.5026, + "grad_norm": 2.941744327545166, + "learning_rate": 1.1647033171901573e-05, + "loss": 1.4583, + "step": 10052 + }, + { + "epoch": 0.5027, + "grad_norm": 3.548300266265869, + "learning_rate": 1.1643590084510379e-05, + "loss": 1.1517, + "step": 10054 + }, + { + "epoch": 0.5028, + "grad_norm": 8.966436386108398, + "learning_rate": 1.1640146796852711e-05, + "loss": 0.6489, + "step": 10056 + }, + { + "epoch": 0.5029, + "grad_norm": 4.21544075012207, + "learning_rate": 1.1636703309348135e-05, + "loss": 1.0064, + "step": 10058 + }, + { + "epoch": 0.503, + "grad_norm": 3.536367654800415, + "learning_rate": 1.1633259622416224e-05, + "loss": 1.0254, + "step": 10060 + }, + { + "epoch": 0.5031, + "grad_norm": 7.176509857177734, + "learning_rate": 1.1629815736476582e-05, + "loss": 1.1684, + "step": 10062 + }, + { + "epoch": 0.5032, + "grad_norm": 2.6325557231903076, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.3315, + "step": 10064 + }, + { + "epoch": 0.5033, + "grad_norm": 6.426602363586426, + "learning_rate": 1.1622927369252638e-05, + "loss": 0.5221, + "step": 10066 + }, + { + "epoch": 0.5034, + "grad_norm": 3.6249001026153564, + "learning_rate": 1.1619482888807662e-05, + "loss": 0.7008, + "step": 10068 + }, + { + "epoch": 0.5035, + "grad_norm": 3.923880100250244, + "learning_rate": 1.1616038211033613e-05, + "loss": 0.6618, + "step": 10070 + }, + { + "epoch": 0.5036, + "grad_norm": 2.906726360321045, + "learning_rate": 1.1612593336350209e-05, + "loss": 1.0031, + "step": 10072 + }, + { + "epoch": 0.5037, + "grad_norm": 4.216426372528076, + "learning_rate": 1.1609148265177193e-05, + "loss": 1.0833, + "step": 10074 + }, + { + "epoch": 0.5038, + "grad_norm": 7.89506721496582, + "learning_rate": 1.1605702997934345e-05, + "loss": 1.6097, + "step": 10076 + }, + { + "epoch": 0.5039, + "grad_norm": 1.8780869245529175, + "learning_rate": 1.160225753504146e-05, + "loss": 1.1826, + "step": 10078 + }, + { + "epoch": 0.504, + "grad_norm": 3.8740124702453613, + "learning_rate": 1.159881187691835e-05, + "loss": 1.3126, + "step": 10080 + }, + { + "epoch": 0.5041, + "grad_norm": 2.457077741622925, + "learning_rate": 1.1595366023984864e-05, + "loss": 0.3392, + "step": 10082 + }, + { + "epoch": 0.5042, + "grad_norm": 1.6379317045211792, + "learning_rate": 1.1591919976660867e-05, + "loss": 0.8996, + "step": 10084 + }, + { + "epoch": 0.5043, + "grad_norm": 2.9804718494415283, + "learning_rate": 1.158847373536625e-05, + "loss": 1.2964, + "step": 10086 + }, + { + "epoch": 0.5044, + "grad_norm": 0.3269278109073639, + "learning_rate": 1.158502730052093e-05, + "loss": 0.7542, + "step": 10088 + }, + { + "epoch": 0.5045, + "grad_norm": 2.629024028778076, + "learning_rate": 1.1581580672544839e-05, + "loss": 0.6728, + "step": 10090 + }, + { + "epoch": 0.5046, + "grad_norm": 3.0803191661834717, + "learning_rate": 1.157813385185794e-05, + "loss": 0.9691, + "step": 10092 + }, + { + "epoch": 0.5047, + "grad_norm": 8.099601745605469, + "learning_rate": 1.1574686838880216e-05, + "loss": 1.2641, + "step": 10094 + }, + { + "epoch": 0.5048, + "grad_norm": 2.7260212898254395, + "learning_rate": 1.157123963403168e-05, + "loss": 1.395, + "step": 10096 + }, + { + "epoch": 0.5049, + "grad_norm": 2.5342729091644287, + "learning_rate": 1.1567792237732358e-05, + "loss": 1.2602, + "step": 10098 + }, + { + "epoch": 0.505, + "grad_norm": 5.707026958465576, + "learning_rate": 1.156434465040231e-05, + "loss": 0.5721, + "step": 10100 + }, + { + "epoch": 0.5051, + "grad_norm": 3.7483394145965576, + "learning_rate": 1.156089687246161e-05, + "loss": 0.6782, + "step": 10102 + }, + { + "epoch": 0.5052, + "grad_norm": 1.1084489822387695, + "learning_rate": 1.1557448904330362e-05, + "loss": 0.7307, + "step": 10104 + }, + { + "epoch": 0.5053, + "grad_norm": 14.680147171020508, + "learning_rate": 1.155400074642869e-05, + "loss": 0.8071, + "step": 10106 + }, + { + "epoch": 0.5054, + "grad_norm": 2.1958324909210205, + "learning_rate": 1.155055239917674e-05, + "loss": 0.8584, + "step": 10108 + }, + { + "epoch": 0.5055, + "grad_norm": 5.832326412200928, + "learning_rate": 1.1547103862994683e-05, + "loss": 0.7858, + "step": 10110 + }, + { + "epoch": 0.5056, + "grad_norm": 5.174610137939453, + "learning_rate": 1.1543655138302714e-05, + "loss": 1.0263, + "step": 10112 + }, + { + "epoch": 0.5057, + "grad_norm": 4.536744117736816, + "learning_rate": 1.1540206225521046e-05, + "loss": 1.0203, + "step": 10114 + }, + { + "epoch": 0.5058, + "grad_norm": 5.1902546882629395, + "learning_rate": 1.1536757125069924e-05, + "loss": 0.5929, + "step": 10116 + }, + { + "epoch": 0.5059, + "grad_norm": 4.446071624755859, + "learning_rate": 1.1533307837369607e-05, + "loss": 0.7076, + "step": 10118 + }, + { + "epoch": 0.506, + "grad_norm": 14.787867546081543, + "learning_rate": 1.1529858362840383e-05, + "loss": 1.0869, + "step": 10120 + }, + { + "epoch": 0.5061, + "grad_norm": 4.1146368980407715, + "learning_rate": 1.1526408701902556e-05, + "loss": 0.9424, + "step": 10122 + }, + { + "epoch": 0.5062, + "grad_norm": 5.758910655975342, + "learning_rate": 1.1522958854976458e-05, + "loss": 1.5219, + "step": 10124 + }, + { + "epoch": 0.5063, + "grad_norm": 3.1975393295288086, + "learning_rate": 1.1519508822482445e-05, + "loss": 0.5378, + "step": 10126 + }, + { + "epoch": 0.5064, + "grad_norm": 4.854013919830322, + "learning_rate": 1.1516058604840891e-05, + "loss": 1.1167, + "step": 10128 + }, + { + "epoch": 0.5065, + "grad_norm": 5.8267717361450195, + "learning_rate": 1.1512608202472195e-05, + "loss": 1.3301, + "step": 10130 + }, + { + "epoch": 0.5066, + "grad_norm": 3.1598029136657715, + "learning_rate": 1.1509157615796775e-05, + "loss": 0.7763, + "step": 10132 + }, + { + "epoch": 0.5067, + "grad_norm": 4.096884727478027, + "learning_rate": 1.1505706845235078e-05, + "loss": 1.7784, + "step": 10134 + }, + { + "epoch": 0.5068, + "grad_norm": 6.457558631896973, + "learning_rate": 1.1502255891207572e-05, + "loss": 1.0298, + "step": 10136 + }, + { + "epoch": 0.5069, + "grad_norm": 3.0807063579559326, + "learning_rate": 1.1498804754134741e-05, + "loss": 0.8805, + "step": 10138 + }, + { + "epoch": 0.507, + "grad_norm": 6.2287702560424805, + "learning_rate": 1.1495353434437098e-05, + "loss": 0.7645, + "step": 10140 + }, + { + "epoch": 0.5071, + "grad_norm": 6.689541816711426, + "learning_rate": 1.1491901932535172e-05, + "loss": 1.4966, + "step": 10142 + }, + { + "epoch": 0.5072, + "grad_norm": 3.9163384437561035, + "learning_rate": 1.1488450248849523e-05, + "loss": 0.3045, + "step": 10144 + }, + { + "epoch": 0.5073, + "grad_norm": 2.3523054122924805, + "learning_rate": 1.1484998383800727e-05, + "loss": 1.8806, + "step": 10146 + }, + { + "epoch": 0.5074, + "grad_norm": 7.0019001960754395, + "learning_rate": 1.1481546337809381e-05, + "loss": 1.4121, + "step": 10148 + }, + { + "epoch": 0.5075, + "grad_norm": 5.015313148498535, + "learning_rate": 1.1478094111296109e-05, + "loss": 1.5423, + "step": 10150 + }, + { + "epoch": 0.5076, + "grad_norm": 7.782539367675781, + "learning_rate": 1.1474641704681551e-05, + "loss": 1.0752, + "step": 10152 + }, + { + "epoch": 0.5077, + "grad_norm": 3.7336606979370117, + "learning_rate": 1.1471189118386374e-05, + "loss": 0.7064, + "step": 10154 + }, + { + "epoch": 0.5078, + "grad_norm": 4.986889839172363, + "learning_rate": 1.1467736352831266e-05, + "loss": 0.7478, + "step": 10156 + }, + { + "epoch": 0.5079, + "grad_norm": 2.3117740154266357, + "learning_rate": 1.1464283408436937e-05, + "loss": 0.4698, + "step": 10158 + }, + { + "epoch": 0.508, + "grad_norm": 1.8563610315322876, + "learning_rate": 1.1460830285624119e-05, + "loss": 1.2077, + "step": 10160 + }, + { + "epoch": 0.5081, + "grad_norm": 3.0024209022521973, + "learning_rate": 1.1457376984813557e-05, + "loss": 0.7465, + "step": 10162 + }, + { + "epoch": 0.5082, + "grad_norm": 14.165668487548828, + "learning_rate": 1.1453923506426032e-05, + "loss": 1.2209, + "step": 10164 + }, + { + "epoch": 0.5083, + "grad_norm": 1.5801922082901, + "learning_rate": 1.1450469850882338e-05, + "loss": 0.7077, + "step": 10166 + }, + { + "epoch": 0.5084, + "grad_norm": 3.505019426345825, + "learning_rate": 1.1447016018603293e-05, + "loss": 0.181, + "step": 10168 + }, + { + "epoch": 0.5085, + "grad_norm": 0.05875837057828903, + "learning_rate": 1.1443562010009732e-05, + "loss": 0.5293, + "step": 10170 + }, + { + "epoch": 0.5086, + "grad_norm": 2.736614227294922, + "learning_rate": 1.1440107825522522e-05, + "loss": 0.971, + "step": 10172 + }, + { + "epoch": 0.5087, + "grad_norm": 5.212649345397949, + "learning_rate": 1.1436653465562542e-05, + "loss": 1.3395, + "step": 10174 + }, + { + "epoch": 0.5088, + "grad_norm": 4.004589557647705, + "learning_rate": 1.1433198930550694e-05, + "loss": 1.4286, + "step": 10176 + }, + { + "epoch": 0.5089, + "grad_norm": 10.916641235351562, + "learning_rate": 1.1429744220907904e-05, + "loss": 1.2947, + "step": 10178 + }, + { + "epoch": 0.509, + "grad_norm": 3.6498031616210938, + "learning_rate": 1.1426289337055119e-05, + "loss": 0.4961, + "step": 10180 + }, + { + "epoch": 0.5091, + "grad_norm": 3.5613596439361572, + "learning_rate": 1.1422834279413303e-05, + "loss": 1.0739, + "step": 10182 + }, + { + "epoch": 0.5092, + "grad_norm": 4.397598743438721, + "learning_rate": 1.1419379048403446e-05, + "loss": 0.7381, + "step": 10184 + }, + { + "epoch": 0.5093, + "grad_norm": 8.632627487182617, + "learning_rate": 1.1415923644446558e-05, + "loss": 0.7139, + "step": 10186 + }, + { + "epoch": 0.5094, + "grad_norm": 4.079705715179443, + "learning_rate": 1.141246806796367e-05, + "loss": 0.5725, + "step": 10188 + }, + { + "epoch": 0.5095, + "grad_norm": 15.809266090393066, + "learning_rate": 1.1409012319375828e-05, + "loss": 1.6901, + "step": 10190 + }, + { + "epoch": 0.5096, + "grad_norm": 3.670186996459961, + "learning_rate": 1.140555639910411e-05, + "loss": 0.3897, + "step": 10192 + }, + { + "epoch": 0.5097, + "grad_norm": 2.310760498046875, + "learning_rate": 1.1402100307569612e-05, + "loss": 0.8958, + "step": 10194 + }, + { + "epoch": 0.5098, + "grad_norm": 2.615910530090332, + "learning_rate": 1.1398644045193443e-05, + "loss": 0.7284, + "step": 10196 + }, + { + "epoch": 0.5099, + "grad_norm": 9.234100341796875, + "learning_rate": 1.1395187612396739e-05, + "loss": 1.0246, + "step": 10198 + }, + { + "epoch": 0.51, + "grad_norm": 1.9331437349319458, + "learning_rate": 1.1391731009600655e-05, + "loss": 0.0891, + "step": 10200 + }, + { + "epoch": 0.5101, + "grad_norm": 4.564942836761475, + "learning_rate": 1.138827423722637e-05, + "loss": 1.2832, + "step": 10202 + }, + { + "epoch": 0.5102, + "grad_norm": 5.0158257484436035, + "learning_rate": 1.1384817295695083e-05, + "loss": 0.7029, + "step": 10204 + }, + { + "epoch": 0.5103, + "grad_norm": 6.886776447296143, + "learning_rate": 1.1381360185428007e-05, + "loss": 1.4737, + "step": 10206 + }, + { + "epoch": 0.5104, + "grad_norm": 7.3064961433410645, + "learning_rate": 1.137790290684638e-05, + "loss": 2.4216, + "step": 10208 + }, + { + "epoch": 0.5105, + "grad_norm": 6.9644455909729, + "learning_rate": 1.1374445460371466e-05, + "loss": 0.6737, + "step": 10210 + }, + { + "epoch": 0.5106, + "grad_norm": 3.627544403076172, + "learning_rate": 1.1370987846424547e-05, + "loss": 0.7809, + "step": 10212 + }, + { + "epoch": 0.5107, + "grad_norm": 5.547853946685791, + "learning_rate": 1.136753006542691e-05, + "loss": 1.1332, + "step": 10214 + }, + { + "epoch": 0.5108, + "grad_norm": 13.312674522399902, + "learning_rate": 1.1364072117799884e-05, + "loss": 1.7544, + "step": 10216 + }, + { + "epoch": 0.5109, + "grad_norm": 6.75591516494751, + "learning_rate": 1.136061400396481e-05, + "loss": 1.3755, + "step": 10218 + }, + { + "epoch": 0.511, + "grad_norm": 4.305898666381836, + "learning_rate": 1.1357155724343046e-05, + "loss": 1.0997, + "step": 10220 + }, + { + "epoch": 0.5111, + "grad_norm": 3.5737123489379883, + "learning_rate": 1.1353697279355973e-05, + "loss": 0.2351, + "step": 10222 + }, + { + "epoch": 0.5112, + "grad_norm": 4.046712398529053, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.4473, + "step": 10224 + }, + { + "epoch": 0.5113, + "grad_norm": 11.515847206115723, + "learning_rate": 1.1346779894971526e-05, + "loss": 1.1029, + "step": 10226 + }, + { + "epoch": 0.5114, + "grad_norm": 2.3166372776031494, + "learning_rate": 1.1343320956417015e-05, + "loss": 1.0913, + "step": 10228 + }, + { + "epoch": 0.5115, + "grad_norm": 3.6578309535980225, + "learning_rate": 1.1339861854182923e-05, + "loss": 1.0811, + "step": 10230 + }, + { + "epoch": 0.5116, + "grad_norm": 5.593793869018555, + "learning_rate": 1.1336402588690727e-05, + "loss": 1.2006, + "step": 10232 + }, + { + "epoch": 0.5117, + "grad_norm": 3.057377576828003, + "learning_rate": 1.1332943160361926e-05, + "loss": 1.6767, + "step": 10234 + }, + { + "epoch": 0.5118, + "grad_norm": 6.583431243896484, + "learning_rate": 1.1329483569618045e-05, + "loss": 0.9082, + "step": 10236 + }, + { + "epoch": 0.5119, + "grad_norm": 2.176053762435913, + "learning_rate": 1.1326023816880625e-05, + "loss": 1.2896, + "step": 10238 + }, + { + "epoch": 0.512, + "grad_norm": 1.6843582391738892, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.7901, + "step": 10240 + }, + { + "epoch": 0.5121, + "grad_norm": 17.150733947753906, + "learning_rate": 1.1319103827111427e-05, + "loss": 1.1631, + "step": 10242 + }, + { + "epoch": 0.5122, + "grad_norm": 3.1912739276885986, + "learning_rate": 1.1315643590922827e-05, + "loss": 0.8606, + "step": 10244 + }, + { + "epoch": 0.5123, + "grad_norm": 10.940323829650879, + "learning_rate": 1.1312183194427047e-05, + "loss": 1.5358, + "step": 10246 + }, + { + "epoch": 0.5124, + "grad_norm": 2.238068103790283, + "learning_rate": 1.1308722638045724e-05, + "loss": 0.9721, + "step": 10248 + }, + { + "epoch": 0.5125, + "grad_norm": 3.8612098693847656, + "learning_rate": 1.130526192220052e-05, + "loss": 1.359, + "step": 10250 + }, + { + "epoch": 0.5126, + "grad_norm": 3.046185255050659, + "learning_rate": 1.1301801047313106e-05, + "loss": 1.3538, + "step": 10252 + }, + { + "epoch": 0.5127, + "grad_norm": 5.50653600692749, + "learning_rate": 1.1298340013805185e-05, + "loss": 0.6338, + "step": 10254 + }, + { + "epoch": 0.5128, + "grad_norm": 9.340885162353516, + "learning_rate": 1.129487882209847e-05, + "loss": 1.0024, + "step": 10256 + }, + { + "epoch": 0.5129, + "grad_norm": 9.222871780395508, + "learning_rate": 1.1291417472614701e-05, + "loss": 0.8635, + "step": 10258 + }, + { + "epoch": 0.513, + "grad_norm": 2.335216522216797, + "learning_rate": 1.128795596577563e-05, + "loss": 1.4405, + "step": 10260 + }, + { + "epoch": 0.5131, + "grad_norm": 1.9886078834533691, + "learning_rate": 1.128449430200303e-05, + "loss": 0.5474, + "step": 10262 + }, + { + "epoch": 0.5132, + "grad_norm": 7.188560962677002, + "learning_rate": 1.1281032481718696e-05, + "loss": 0.9103, + "step": 10264 + }, + { + "epoch": 0.5133, + "grad_norm": 1.902160406112671, + "learning_rate": 1.127757050534444e-05, + "loss": 0.8763, + "step": 10266 + }, + { + "epoch": 0.5134, + "grad_norm": 2.8050317764282227, + "learning_rate": 1.1274108373302095e-05, + "loss": 0.971, + "step": 10268 + }, + { + "epoch": 0.5135, + "grad_norm": 2.9109466075897217, + "learning_rate": 1.1270646086013507e-05, + "loss": 1.5257, + "step": 10270 + }, + { + "epoch": 0.5136, + "grad_norm": 2.515998363494873, + "learning_rate": 1.1267183643900548e-05, + "loss": 1.0657, + "step": 10272 + }, + { + "epoch": 0.5137, + "grad_norm": 16.905532836914062, + "learning_rate": 1.1263721047385106e-05, + "loss": 1.1021, + "step": 10274 + }, + { + "epoch": 0.5138, + "grad_norm": 4.1200337409973145, + "learning_rate": 1.1260258296889086e-05, + "loss": 0.6905, + "step": 10276 + }, + { + "epoch": 0.5139, + "grad_norm": 2.9639523029327393, + "learning_rate": 1.125679539283442e-05, + "loss": 1.3179, + "step": 10278 + }, + { + "epoch": 0.514, + "grad_norm": 5.454526424407959, + "learning_rate": 1.1253332335643043e-05, + "loss": 0.7464, + "step": 10280 + }, + { + "epoch": 0.5141, + "grad_norm": 10.995149612426758, + "learning_rate": 1.1249869125736926e-05, + "loss": 0.8264, + "step": 10282 + }, + { + "epoch": 0.5142, + "grad_norm": 7.418797492980957, + "learning_rate": 1.1246405763538047e-05, + "loss": 0.6258, + "step": 10284 + }, + { + "epoch": 0.5143, + "grad_norm": 5.6152753829956055, + "learning_rate": 1.1242942249468403e-05, + "loss": 0.8262, + "step": 10286 + }, + { + "epoch": 0.5144, + "grad_norm": 15.698098182678223, + "learning_rate": 1.1239478583950019e-05, + "loss": 0.8629, + "step": 10288 + }, + { + "epoch": 0.5145, + "grad_norm": 1.7749513387680054, + "learning_rate": 1.1236014767404929e-05, + "loss": 0.704, + "step": 10290 + }, + { + "epoch": 0.5146, + "grad_norm": 4.578884124755859, + "learning_rate": 1.1232550800255188e-05, + "loss": 0.6047, + "step": 10292 + }, + { + "epoch": 0.5147, + "grad_norm": 2.0545740127563477, + "learning_rate": 1.1229086682922869e-05, + "loss": 0.3612, + "step": 10294 + }, + { + "epoch": 0.5148, + "grad_norm": 2.725980281829834, + "learning_rate": 1.1225622415830068e-05, + "loss": 0.9687, + "step": 10296 + }, + { + "epoch": 0.5149, + "grad_norm": 4.068347930908203, + "learning_rate": 1.1222157999398895e-05, + "loss": 0.8837, + "step": 10298 + }, + { + "epoch": 0.515, + "grad_norm": 3.804323434829712, + "learning_rate": 1.1218693434051475e-05, + "loss": 0.7632, + "step": 10300 + }, + { + "epoch": 0.5151, + "grad_norm": 3.230854034423828, + "learning_rate": 1.1215228720209959e-05, + "loss": 0.4543, + "step": 10302 + }, + { + "epoch": 0.5152, + "grad_norm": 10.94419002532959, + "learning_rate": 1.1211763858296507e-05, + "loss": 0.5873, + "step": 10304 + }, + { + "epoch": 0.5153, + "grad_norm": 1.7701956033706665, + "learning_rate": 1.1208298848733305e-05, + "loss": 0.8503, + "step": 10306 + }, + { + "epoch": 0.5154, + "grad_norm": 3.344677448272705, + "learning_rate": 1.1204833691942553e-05, + "loss": 0.7765, + "step": 10308 + }, + { + "epoch": 0.5155, + "grad_norm": 4.6334686279296875, + "learning_rate": 1.1201368388346471e-05, + "loss": 0.6913, + "step": 10310 + }, + { + "epoch": 0.5156, + "grad_norm": 7.70579195022583, + "learning_rate": 1.1197902938367297e-05, + "loss": 0.5617, + "step": 10312 + }, + { + "epoch": 0.5157, + "grad_norm": 2.7274539470672607, + "learning_rate": 1.119443734242728e-05, + "loss": 1.0137, + "step": 10314 + }, + { + "epoch": 0.5158, + "grad_norm": 11.094327926635742, + "learning_rate": 1.11909716009487e-05, + "loss": 1.6316, + "step": 10316 + }, + { + "epoch": 0.5159, + "grad_norm": 3.157426118850708, + "learning_rate": 1.1187505714353841e-05, + "loss": 0.1381, + "step": 10318 + }, + { + "epoch": 0.516, + "grad_norm": 2.1214795112609863, + "learning_rate": 1.1184039683065014e-05, + "loss": 1.3677, + "step": 10320 + }, + { + "epoch": 0.5161, + "grad_norm": 2.6729769706726074, + "learning_rate": 1.1180573507504538e-05, + "loss": 0.5572, + "step": 10322 + }, + { + "epoch": 0.5162, + "grad_norm": 6.8219146728515625, + "learning_rate": 1.1177107188094765e-05, + "loss": 1.2167, + "step": 10324 + }, + { + "epoch": 0.5163, + "grad_norm": 1.59290611743927, + "learning_rate": 1.1173640725258053e-05, + "loss": 0.5609, + "step": 10326 + }, + { + "epoch": 0.5164, + "grad_norm": 4.1474385261535645, + "learning_rate": 1.1170174119416778e-05, + "loss": 0.8285, + "step": 10328 + }, + { + "epoch": 0.5165, + "grad_norm": 5.271899223327637, + "learning_rate": 1.1166707370993333e-05, + "loss": 1.0679, + "step": 10330 + }, + { + "epoch": 0.5166, + "grad_norm": 8.471711158752441, + "learning_rate": 1.1163240480410136e-05, + "loss": 1.1809, + "step": 10332 + }, + { + "epoch": 0.5167, + "grad_norm": 2.4307749271392822, + "learning_rate": 1.1159773448089615e-05, + "loss": 1.3613, + "step": 10334 + }, + { + "epoch": 0.5168, + "grad_norm": 2.0861012935638428, + "learning_rate": 1.1156306274454218e-05, + "loss": 0.6247, + "step": 10336 + }, + { + "epoch": 0.5169, + "grad_norm": 5.366135120391846, + "learning_rate": 1.1152838959926408e-05, + "loss": 0.8972, + "step": 10338 + }, + { + "epoch": 0.517, + "grad_norm": 1.892337679862976, + "learning_rate": 1.1149371504928667e-05, + "loss": 0.9516, + "step": 10340 + }, + { + "epoch": 0.5171, + "grad_norm": 11.783740043640137, + "learning_rate": 1.1145903909883496e-05, + "loss": 1.7808, + "step": 10342 + }, + { + "epoch": 0.5172, + "grad_norm": 3.9532041549682617, + "learning_rate": 1.1142436175213409e-05, + "loss": 0.493, + "step": 10344 + }, + { + "epoch": 0.5173, + "grad_norm": 9.581588745117188, + "learning_rate": 1.113896830134094e-05, + "loss": 1.6286, + "step": 10346 + }, + { + "epoch": 0.5174, + "grad_norm": 2.411257266998291, + "learning_rate": 1.1135500288688636e-05, + "loss": 0.622, + "step": 10348 + }, + { + "epoch": 0.5175, + "grad_norm": 1.5502060651779175, + "learning_rate": 1.113203213767907e-05, + "loss": 0.7013, + "step": 10350 + }, + { + "epoch": 0.5176, + "grad_norm": 2.8089959621429443, + "learning_rate": 1.1128563848734817e-05, + "loss": 0.9514, + "step": 10352 + }, + { + "epoch": 0.5177, + "grad_norm": 3.635596513748169, + "learning_rate": 1.1125095422278487e-05, + "loss": 0.7832, + "step": 10354 + }, + { + "epoch": 0.5178, + "grad_norm": 3.4714643955230713, + "learning_rate": 1.112162685873269e-05, + "loss": 0.9672, + "step": 10356 + }, + { + "epoch": 0.5179, + "grad_norm": 2.240156888961792, + "learning_rate": 1.1118158158520064e-05, + "loss": 0.7682, + "step": 10358 + }, + { + "epoch": 0.518, + "grad_norm": 4.485928058624268, + "learning_rate": 1.1114689322063255e-05, + "loss": 1.4197, + "step": 10360 + }, + { + "epoch": 0.5181, + "grad_norm": 6.502712726593018, + "learning_rate": 1.1111220349784937e-05, + "loss": 1.1392, + "step": 10362 + }, + { + "epoch": 0.5182, + "grad_norm": 2.1012144088745117, + "learning_rate": 1.1107751242107786e-05, + "loss": 1.2328, + "step": 10364 + }, + { + "epoch": 0.5183, + "grad_norm": 2.6143765449523926, + "learning_rate": 1.1104281999454511e-05, + "loss": 0.7449, + "step": 10366 + }, + { + "epoch": 0.5184, + "grad_norm": 3.9406237602233887, + "learning_rate": 1.1100812622247823e-05, + "loss": 0.9652, + "step": 10368 + }, + { + "epoch": 0.5185, + "grad_norm": 3.0171761512756348, + "learning_rate": 1.1097343110910452e-05, + "loss": 0.5057, + "step": 10370 + }, + { + "epoch": 0.5186, + "grad_norm": 2.584325075149536, + "learning_rate": 1.1093873465865156e-05, + "loss": 0.4658, + "step": 10372 + }, + { + "epoch": 0.5187, + "grad_norm": 9.09599781036377, + "learning_rate": 1.1090403687534696e-05, + "loss": 1.1923, + "step": 10374 + }, + { + "epoch": 0.5188, + "grad_norm": 3.600496768951416, + "learning_rate": 1.1086933776341853e-05, + "loss": 0.554, + "step": 10376 + }, + { + "epoch": 0.5189, + "grad_norm": 2.2847328186035156, + "learning_rate": 1.1083463732709426e-05, + "loss": 0.8538, + "step": 10378 + }, + { + "epoch": 0.519, + "grad_norm": 1.7629121541976929, + "learning_rate": 1.1079993557060228e-05, + "loss": 0.2666, + "step": 10380 + }, + { + "epoch": 0.5191, + "grad_norm": 2.4008123874664307, + "learning_rate": 1.1076523249817095e-05, + "loss": 1.1041, + "step": 10382 + }, + { + "epoch": 0.5192, + "grad_norm": 0.06879356503486633, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.1955, + "step": 10384 + }, + { + "epoch": 0.5193, + "grad_norm": 5.006446838378906, + "learning_rate": 1.106958224224041e-05, + "loss": 0.9755, + "step": 10386 + }, + { + "epoch": 0.5194, + "grad_norm": 2.8918001651763916, + "learning_rate": 1.10661115427526e-05, + "loss": 1.3027, + "step": 10388 + }, + { + "epoch": 0.5195, + "grad_norm": 12.405911445617676, + "learning_rate": 1.1062640713362333e-05, + "loss": 0.5713, + "step": 10390 + }, + { + "epoch": 0.5196, + "grad_norm": 2.283526659011841, + "learning_rate": 1.105916975449252e-05, + "loss": 0.9695, + "step": 10392 + }, + { + "epoch": 0.5197, + "grad_norm": 3.4547650814056396, + "learning_rate": 1.1055698666566084e-05, + "loss": 0.8896, + "step": 10394 + }, + { + "epoch": 0.5198, + "grad_norm": 2.231252908706665, + "learning_rate": 1.1052227450005968e-05, + "loss": 0.806, + "step": 10396 + }, + { + "epoch": 0.5199, + "grad_norm": 2.720442533493042, + "learning_rate": 1.1048756105235126e-05, + "loss": 0.9711, + "step": 10398 + }, + { + "epoch": 0.52, + "grad_norm": 4.48943567276001, + "learning_rate": 1.1045284632676535e-05, + "loss": 1.1395, + "step": 10400 + }, + { + "epoch": 0.5201, + "grad_norm": 10.813779830932617, + "learning_rate": 1.1041813032753184e-05, + "loss": 1.3546, + "step": 10402 + }, + { + "epoch": 0.5202, + "grad_norm": 2.767439365386963, + "learning_rate": 1.1038341305888074e-05, + "loss": 1.2612, + "step": 10404 + }, + { + "epoch": 0.5203, + "grad_norm": 2.6146082878112793, + "learning_rate": 1.1034869452504227e-05, + "loss": 0.8294, + "step": 10406 + }, + { + "epoch": 0.5204, + "grad_norm": 1.6397088766098022, + "learning_rate": 1.1031397473024674e-05, + "loss": 0.5183, + "step": 10408 + }, + { + "epoch": 0.5205, + "grad_norm": 4.791729927062988, + "learning_rate": 1.102792536787247e-05, + "loss": 0.5813, + "step": 10410 + }, + { + "epoch": 0.5206, + "grad_norm": 8.333206176757812, + "learning_rate": 1.1024453137470677e-05, + "loss": 1.3276, + "step": 10412 + }, + { + "epoch": 0.5207, + "grad_norm": 4.152242660522461, + "learning_rate": 1.1020980782242376e-05, + "loss": 1.118, + "step": 10414 + }, + { + "epoch": 0.5208, + "grad_norm": 3.6267175674438477, + "learning_rate": 1.1017508302610665e-05, + "loss": 0.6734, + "step": 10416 + }, + { + "epoch": 0.5209, + "grad_norm": 0.27745160460472107, + "learning_rate": 1.1014035698998651e-05, + "loss": 0.7306, + "step": 10418 + }, + { + "epoch": 0.521, + "grad_norm": 1.7473349571228027, + "learning_rate": 1.1010562971829464e-05, + "loss": 0.8517, + "step": 10420 + }, + { + "epoch": 0.5211, + "grad_norm": 3.3855607509613037, + "learning_rate": 1.1007090121526246e-05, + "loss": 1.1151, + "step": 10422 + }, + { + "epoch": 0.5212, + "grad_norm": 3.276165246963501, + "learning_rate": 1.1003617148512149e-05, + "loss": 0.9358, + "step": 10424 + }, + { + "epoch": 0.5213, + "grad_norm": 2.4322171211242676, + "learning_rate": 1.100014405321035e-05, + "loss": 0.8685, + "step": 10426 + }, + { + "epoch": 0.5214, + "grad_norm": 3.5514938831329346, + "learning_rate": 1.099667083604403e-05, + "loss": 0.5442, + "step": 10428 + }, + { + "epoch": 0.5215, + "grad_norm": 8.339814186096191, + "learning_rate": 1.0993197497436392e-05, + "loss": 0.8811, + "step": 10430 + }, + { + "epoch": 0.5216, + "grad_norm": 2.0177063941955566, + "learning_rate": 1.0989724037810651e-05, + "loss": 0.769, + "step": 10432 + }, + { + "epoch": 0.5217, + "grad_norm": 8.960945129394531, + "learning_rate": 1.098625045759004e-05, + "loss": 0.4622, + "step": 10434 + }, + { + "epoch": 0.5218, + "grad_norm": 4.214173316955566, + "learning_rate": 1.0982776757197799e-05, + "loss": 0.777, + "step": 10436 + }, + { + "epoch": 0.5219, + "grad_norm": 4.191055774688721, + "learning_rate": 1.0979302937057192e-05, + "loss": 1.1652, + "step": 10438 + }, + { + "epoch": 0.522, + "grad_norm": 6.644555568695068, + "learning_rate": 1.0975828997591496e-05, + "loss": 1.3889, + "step": 10440 + }, + { + "epoch": 0.5221, + "grad_norm": 17.18328285217285, + "learning_rate": 1.0972354939223997e-05, + "loss": 1.0886, + "step": 10442 + }, + { + "epoch": 0.5222, + "grad_norm": 6.322267055511475, + "learning_rate": 1.0968880762377994e-05, + "loss": 1.3529, + "step": 10444 + }, + { + "epoch": 0.5223, + "grad_norm": 7.75439453125, + "learning_rate": 1.096540646747681e-05, + "loss": 0.7249, + "step": 10446 + }, + { + "epoch": 0.5224, + "grad_norm": 16.589357376098633, + "learning_rate": 1.0961932054943778e-05, + "loss": 0.8836, + "step": 10448 + }, + { + "epoch": 0.5225, + "grad_norm": 1.9154807329177856, + "learning_rate": 1.0958457525202241e-05, + "loss": 0.7477, + "step": 10450 + }, + { + "epoch": 0.5226, + "grad_norm": 3.728480815887451, + "learning_rate": 1.0954982878675564e-05, + "loss": 0.9784, + "step": 10452 + }, + { + "epoch": 0.5227, + "grad_norm": 4.4013447761535645, + "learning_rate": 1.0951508115787119e-05, + "loss": 0.503, + "step": 10454 + }, + { + "epoch": 0.5228, + "grad_norm": 6.36181640625, + "learning_rate": 1.0948033236960294e-05, + "loss": 0.6603, + "step": 10456 + }, + { + "epoch": 0.5229, + "grad_norm": 8.63705062866211, + "learning_rate": 1.0944558242618497e-05, + "loss": 1.4539, + "step": 10458 + }, + { + "epoch": 0.523, + "grad_norm": 18.11083984375, + "learning_rate": 1.0941083133185146e-05, + "loss": 0.7414, + "step": 10460 + }, + { + "epoch": 0.5231, + "grad_norm": 7.693545818328857, + "learning_rate": 1.0937607909083668e-05, + "loss": 0.6524, + "step": 10462 + }, + { + "epoch": 0.5232, + "grad_norm": 9.58978271484375, + "learning_rate": 1.0934132570737508e-05, + "loss": 1.5057, + "step": 10464 + }, + { + "epoch": 0.5233, + "grad_norm": 3.19104266166687, + "learning_rate": 1.0930657118570128e-05, + "loss": 1.1967, + "step": 10466 + }, + { + "epoch": 0.5234, + "grad_norm": 5.688962459564209, + "learning_rate": 1.0927181553005001e-05, + "loss": 1.0463, + "step": 10468 + }, + { + "epoch": 0.5235, + "grad_norm": 18.296859741210938, + "learning_rate": 1.0923705874465617e-05, + "loss": 1.5373, + "step": 10470 + }, + { + "epoch": 0.5236, + "grad_norm": 5.818504810333252, + "learning_rate": 1.0920230083375474e-05, + "loss": 0.7424, + "step": 10472 + }, + { + "epoch": 0.5237, + "grad_norm": 2.147671937942505, + "learning_rate": 1.0916754180158083e-05, + "loss": 1.0701, + "step": 10474 + }, + { + "epoch": 0.5238, + "grad_norm": 6.52361536026001, + "learning_rate": 1.0913278165236977e-05, + "loss": 1.1663, + "step": 10476 + }, + { + "epoch": 0.5239, + "grad_norm": 1.6899007558822632, + "learning_rate": 1.0909802039035702e-05, + "loss": 0.6655, + "step": 10478 + }, + { + "epoch": 0.524, + "grad_norm": 1.4077155590057373, + "learning_rate": 1.0906325801977804e-05, + "loss": 1.4939, + "step": 10480 + }, + { + "epoch": 0.5241, + "grad_norm": 2.6753976345062256, + "learning_rate": 1.0902849454486857e-05, + "loss": 1.2462, + "step": 10482 + }, + { + "epoch": 0.5242, + "grad_norm": 2.0910000801086426, + "learning_rate": 1.0899372996986439e-05, + "loss": 0.8054, + "step": 10484 + }, + { + "epoch": 0.5243, + "grad_norm": 3.310060977935791, + "learning_rate": 1.0895896429900155e-05, + "loss": 0.7273, + "step": 10486 + }, + { + "epoch": 0.5244, + "grad_norm": 3.231682062149048, + "learning_rate": 1.0892419753651606e-05, + "loss": 0.9186, + "step": 10488 + }, + { + "epoch": 0.5245, + "grad_norm": 4.127396583557129, + "learning_rate": 1.0888942968664417e-05, + "loss": 0.4385, + "step": 10490 + }, + { + "epoch": 0.5246, + "grad_norm": 3.181260585784912, + "learning_rate": 1.0885466075362224e-05, + "loss": 0.8203, + "step": 10492 + }, + { + "epoch": 0.5247, + "grad_norm": 8.65131950378418, + "learning_rate": 1.0881989074168673e-05, + "loss": 0.9217, + "step": 10494 + }, + { + "epoch": 0.5248, + "grad_norm": 8.470550537109375, + "learning_rate": 1.0878511965507435e-05, + "loss": 1.4636, + "step": 10496 + }, + { + "epoch": 0.5249, + "grad_norm": 24.993276596069336, + "learning_rate": 1.0875034749802174e-05, + "loss": 0.9299, + "step": 10498 + }, + { + "epoch": 0.525, + "grad_norm": 3.330655813217163, + "learning_rate": 1.0871557427476585e-05, + "loss": 0.8928, + "step": 10500 + }, + { + "epoch": 0.5251, + "grad_norm": 7.326821804046631, + "learning_rate": 1.0868079998954364e-05, + "loss": 0.7045, + "step": 10502 + }, + { + "epoch": 0.5252, + "grad_norm": 3.2534003257751465, + "learning_rate": 1.086460246465923e-05, + "loss": 1.1467, + "step": 10504 + }, + { + "epoch": 0.5253, + "grad_norm": 3.7707271575927734, + "learning_rate": 1.0861124825014908e-05, + "loss": 0.9182, + "step": 10506 + }, + { + "epoch": 0.5254, + "grad_norm": 5.096396446228027, + "learning_rate": 1.085764708044514e-05, + "loss": 0.591, + "step": 10508 + }, + { + "epoch": 0.5255, + "grad_norm": 6.4283881187438965, + "learning_rate": 1.0854169231373677e-05, + "loss": 0.4904, + "step": 10510 + }, + { + "epoch": 0.5256, + "grad_norm": 3.2960116863250732, + "learning_rate": 1.0850691278224282e-05, + "loss": 1.1662, + "step": 10512 + }, + { + "epoch": 0.5257, + "grad_norm": 4.305720329284668, + "learning_rate": 1.0847213221420735e-05, + "loss": 1.0128, + "step": 10514 + }, + { + "epoch": 0.5258, + "grad_norm": 2.9720065593719482, + "learning_rate": 1.0843735061386829e-05, + "loss": 0.5656, + "step": 10516 + }, + { + "epoch": 0.5259, + "grad_norm": 3.5052061080932617, + "learning_rate": 1.0840256798546365e-05, + "loss": 0.4961, + "step": 10518 + }, + { + "epoch": 0.526, + "grad_norm": 4.386289596557617, + "learning_rate": 1.083677843332316e-05, + "loss": 1.0311, + "step": 10520 + }, + { + "epoch": 0.5261, + "grad_norm": 4.258563041687012, + "learning_rate": 1.0833299966141035e-05, + "loss": 1.2295, + "step": 10522 + }, + { + "epoch": 0.5262, + "grad_norm": 5.957501411437988, + "learning_rate": 1.082982139742384e-05, + "loss": 0.3211, + "step": 10524 + }, + { + "epoch": 0.5263, + "grad_norm": 1.6617580652236938, + "learning_rate": 1.0826342727595427e-05, + "loss": 0.7529, + "step": 10526 + }, + { + "epoch": 0.5264, + "grad_norm": 3.77687668800354, + "learning_rate": 1.0822863957079657e-05, + "loss": 1.3509, + "step": 10528 + }, + { + "epoch": 0.5265, + "grad_norm": 1.9840303659439087, + "learning_rate": 1.0819385086300412e-05, + "loss": 1.3565, + "step": 10530 + }, + { + "epoch": 0.5266, + "grad_norm": 6.567816734313965, + "learning_rate": 1.0815906115681579e-05, + "loss": 1.3231, + "step": 10532 + }, + { + "epoch": 0.5267, + "grad_norm": 8.854735374450684, + "learning_rate": 1.0812427045647058e-05, + "loss": 0.9743, + "step": 10534 + }, + { + "epoch": 0.5268, + "grad_norm": 2.406243324279785, + "learning_rate": 1.0808947876620768e-05, + "loss": 0.7243, + "step": 10536 + }, + { + "epoch": 0.5269, + "grad_norm": 4.1268630027771, + "learning_rate": 1.0805468609026632e-05, + "loss": 0.6503, + "step": 10538 + }, + { + "epoch": 0.527, + "grad_norm": 5.255492210388184, + "learning_rate": 1.0801989243288588e-05, + "loss": 1.0416, + "step": 10540 + }, + { + "epoch": 0.5271, + "grad_norm": 9.964642524719238, + "learning_rate": 1.0798509779830591e-05, + "loss": 1.5322, + "step": 10542 + }, + { + "epoch": 0.5272, + "grad_norm": 9.424073219299316, + "learning_rate": 1.07950302190766e-05, + "loss": 1.6631, + "step": 10544 + }, + { + "epoch": 0.5273, + "grad_norm": 0.6429279446601868, + "learning_rate": 1.0791550561450585e-05, + "loss": 0.7316, + "step": 10546 + }, + { + "epoch": 0.5274, + "grad_norm": 2.2500765323638916, + "learning_rate": 1.0788070807376536e-05, + "loss": 0.9649, + "step": 10548 + }, + { + "epoch": 0.5275, + "grad_norm": 4.793017864227295, + "learning_rate": 1.0784590957278452e-05, + "loss": 0.7909, + "step": 10550 + }, + { + "epoch": 0.5276, + "grad_norm": 4.395087242126465, + "learning_rate": 1.0781111011580336e-05, + "loss": 0.5268, + "step": 10552 + }, + { + "epoch": 0.5277, + "grad_norm": 7.381994724273682, + "learning_rate": 1.0777630970706217e-05, + "loss": 2.4962, + "step": 10554 + }, + { + "epoch": 0.5278, + "grad_norm": 2.015286684036255, + "learning_rate": 1.0774150835080119e-05, + "loss": 1.0674, + "step": 10556 + }, + { + "epoch": 0.5279, + "grad_norm": 4.4855499267578125, + "learning_rate": 1.0770670605126092e-05, + "loss": 1.2264, + "step": 10558 + }, + { + "epoch": 0.528, + "grad_norm": 4.169837474822998, + "learning_rate": 1.0767190281268187e-05, + "loss": 0.4758, + "step": 10560 + }, + { + "epoch": 0.5281, + "grad_norm": 14.740524291992188, + "learning_rate": 1.0763709863930477e-05, + "loss": 1.6235, + "step": 10562 + }, + { + "epoch": 0.5282, + "grad_norm": 8.31937313079834, + "learning_rate": 1.0760229353537032e-05, + "loss": 1.0225, + "step": 10564 + }, + { + "epoch": 0.5283, + "grad_norm": 3.280348539352417, + "learning_rate": 1.0756748750511953e-05, + "loss": 0.6649, + "step": 10566 + }, + { + "epoch": 0.5284, + "grad_norm": 2.3194832801818848, + "learning_rate": 1.0753268055279328e-05, + "loss": 0.7992, + "step": 10568 + }, + { + "epoch": 0.5285, + "grad_norm": 17.794981002807617, + "learning_rate": 1.0749787268263279e-05, + "loss": 1.4921, + "step": 10570 + }, + { + "epoch": 0.5286, + "grad_norm": 5.256625175476074, + "learning_rate": 1.0746306389887924e-05, + "loss": 1.2751, + "step": 10572 + }, + { + "epoch": 0.5287, + "grad_norm": 1.50540030002594, + "learning_rate": 1.0742825420577401e-05, + "loss": 0.3722, + "step": 10574 + }, + { + "epoch": 0.5288, + "grad_norm": 3.3562843799591064, + "learning_rate": 1.0739344360755853e-05, + "loss": 1.1766, + "step": 10576 + }, + { + "epoch": 0.5289, + "grad_norm": 3.294435501098633, + "learning_rate": 1.0735863210847433e-05, + "loss": 0.9072, + "step": 10578 + }, + { + "epoch": 0.529, + "grad_norm": 2.023108720779419, + "learning_rate": 1.0732381971276318e-05, + "loss": 1.1137, + "step": 10580 + }, + { + "epoch": 0.5291, + "grad_norm": 5.044375896453857, + "learning_rate": 1.0728900642466679e-05, + "loss": 0.103, + "step": 10582 + }, + { + "epoch": 0.5292, + "grad_norm": 3.706956624984741, + "learning_rate": 1.072541922484271e-05, + "loss": 0.9451, + "step": 10584 + }, + { + "epoch": 0.5293, + "grad_norm": 3.100733757019043, + "learning_rate": 1.072193771882861e-05, + "loss": 1.4317, + "step": 10586 + }, + { + "epoch": 0.5294, + "grad_norm": 1.8073265552520752, + "learning_rate": 1.0718456124848584e-05, + "loss": 1.0769, + "step": 10588 + }, + { + "epoch": 0.5295, + "grad_norm": 8.037497520446777, + "learning_rate": 1.071497444332686e-05, + "loss": 1.0632, + "step": 10590 + }, + { + "epoch": 0.5296, + "grad_norm": 6.9563117027282715, + "learning_rate": 1.071149267468767e-05, + "loss": 0.5762, + "step": 10592 + }, + { + "epoch": 0.5297, + "grad_norm": 2.397181510925293, + "learning_rate": 1.0708010819355257e-05, + "loss": 0.7177, + "step": 10594 + }, + { + "epoch": 0.5298, + "grad_norm": 5.700536727905273, + "learning_rate": 1.070452887775387e-05, + "loss": 2.5619, + "step": 10596 + }, + { + "epoch": 0.5299, + "grad_norm": 2.7757883071899414, + "learning_rate": 1.0701046850307777e-05, + "loss": 0.465, + "step": 10598 + }, + { + "epoch": 0.53, + "grad_norm": 7.32316780090332, + "learning_rate": 1.0697564737441254e-05, + "loss": 1.4921, + "step": 10600 + }, + { + "epoch": 0.5301, + "grad_norm": 2.3326573371887207, + "learning_rate": 1.0694082539578585e-05, + "loss": 0.815, + "step": 10602 + }, + { + "epoch": 0.5302, + "grad_norm": 2.079329013824463, + "learning_rate": 1.0690600257144062e-05, + "loss": 0.4029, + "step": 10604 + }, + { + "epoch": 0.5303, + "grad_norm": 8.769913673400879, + "learning_rate": 1.0687117890561989e-05, + "loss": 0.329, + "step": 10606 + }, + { + "epoch": 0.5304, + "grad_norm": 3.599123477935791, + "learning_rate": 1.0683635440256689e-05, + "loss": 0.4589, + "step": 10608 + }, + { + "epoch": 0.5305, + "grad_norm": 3.809861183166504, + "learning_rate": 1.0680152906652483e-05, + "loss": 0.9533, + "step": 10610 + }, + { + "epoch": 0.5306, + "grad_norm": 6.674639701843262, + "learning_rate": 1.067667029017371e-05, + "loss": 1.037, + "step": 10612 + }, + { + "epoch": 0.5307, + "grad_norm": 5.13891077041626, + "learning_rate": 1.0673187591244714e-05, + "loss": 1.0714, + "step": 10614 + }, + { + "epoch": 0.5308, + "grad_norm": 6.99550724029541, + "learning_rate": 1.0669704810289852e-05, + "loss": 1.0707, + "step": 10616 + }, + { + "epoch": 0.5309, + "grad_norm": 3.353193759918213, + "learning_rate": 1.0666221947733486e-05, + "loss": 0.7505, + "step": 10618 + }, + { + "epoch": 0.531, + "grad_norm": 2.870889902114868, + "learning_rate": 1.0662739004000005e-05, + "loss": 0.8913, + "step": 10620 + }, + { + "epoch": 0.5311, + "grad_norm": 6.53781795501709, + "learning_rate": 1.065925597951378e-05, + "loss": 0.8507, + "step": 10622 + }, + { + "epoch": 0.5312, + "grad_norm": 2.8476028442382812, + "learning_rate": 1.0655772874699217e-05, + "loss": 0.7852, + "step": 10624 + }, + { + "epoch": 0.5313, + "grad_norm": 4.340022563934326, + "learning_rate": 1.0652289689980714e-05, + "loss": 1.1345, + "step": 10626 + }, + { + "epoch": 0.5314, + "grad_norm": 4.195581436157227, + "learning_rate": 1.0648806425782697e-05, + "loss": 1.3044, + "step": 10628 + }, + { + "epoch": 0.5315, + "grad_norm": 9.044445037841797, + "learning_rate": 1.0645323082529582e-05, + "loss": 0.8073, + "step": 10630 + }, + { + "epoch": 0.5316, + "grad_norm": 4.9420294761657715, + "learning_rate": 1.0641839660645806e-05, + "loss": 1.0179, + "step": 10632 + }, + { + "epoch": 0.5317, + "grad_norm": 3.199186086654663, + "learning_rate": 1.0638356160555816e-05, + "loss": 0.987, + "step": 10634 + }, + { + "epoch": 0.5318, + "grad_norm": 4.969277858734131, + "learning_rate": 1.0634872582684062e-05, + "loss": 1.4179, + "step": 10636 + }, + { + "epoch": 0.5319, + "grad_norm": 2.6299335956573486, + "learning_rate": 1.0631388927455012e-05, + "loss": 0.3261, + "step": 10638 + }, + { + "epoch": 0.532, + "grad_norm": 2.183562755584717, + "learning_rate": 1.0627905195293135e-05, + "loss": 0.1781, + "step": 10640 + }, + { + "epoch": 0.5321, + "grad_norm": 8.05198860168457, + "learning_rate": 1.0624421386622915e-05, + "loss": 1.0736, + "step": 10642 + }, + { + "epoch": 0.5322, + "grad_norm": 7.876754283905029, + "learning_rate": 1.0620937501868842e-05, + "loss": 1.2165, + "step": 10644 + }, + { + "epoch": 0.5323, + "grad_norm": 6.159606456756592, + "learning_rate": 1.061745354145542e-05, + "loss": 0.7372, + "step": 10646 + }, + { + "epoch": 0.5324, + "grad_norm": 2.581063985824585, + "learning_rate": 1.0613969505807157e-05, + "loss": 0.4507, + "step": 10648 + }, + { + "epoch": 0.5325, + "grad_norm": 3.4643869400024414, + "learning_rate": 1.0610485395348571e-05, + "loss": 0.859, + "step": 10650 + }, + { + "epoch": 0.5326, + "grad_norm": 2.1446735858917236, + "learning_rate": 1.060700121050419e-05, + "loss": 1.372, + "step": 10652 + }, + { + "epoch": 0.5327, + "grad_norm": 3.6940133571624756, + "learning_rate": 1.0603516951698555e-05, + "loss": 0.9834, + "step": 10654 + }, + { + "epoch": 0.5328, + "grad_norm": 7.40938138961792, + "learning_rate": 1.0600032619356208e-05, + "loss": 1.1299, + "step": 10656 + }, + { + "epoch": 0.5329, + "grad_norm": 0.05786965787410736, + "learning_rate": 1.059654821390171e-05, + "loss": 0.6956, + "step": 10658 + }, + { + "epoch": 0.533, + "grad_norm": 5.330459117889404, + "learning_rate": 1.0593063735759619e-05, + "loss": 0.7667, + "step": 10660 + }, + { + "epoch": 0.5331, + "grad_norm": 3.766947031021118, + "learning_rate": 1.058957918535451e-05, + "loss": 0.9776, + "step": 10662 + }, + { + "epoch": 0.5332, + "grad_norm": 3.5384249687194824, + "learning_rate": 1.0586094563110965e-05, + "loss": 0.6863, + "step": 10664 + }, + { + "epoch": 0.5333, + "grad_norm": 3.5713114738464355, + "learning_rate": 1.0582609869453578e-05, + "loss": 0.993, + "step": 10666 + }, + { + "epoch": 0.5334, + "grad_norm": 3.3867101669311523, + "learning_rate": 1.0579125104806944e-05, + "loss": 0.7113, + "step": 10668 + }, + { + "epoch": 0.5335, + "grad_norm": 4.050485610961914, + "learning_rate": 1.0575640269595675e-05, + "loss": 0.9271, + "step": 10670 + }, + { + "epoch": 0.5336, + "grad_norm": 12.985905647277832, + "learning_rate": 1.0572155364244383e-05, + "loss": 1.5226, + "step": 10672 + }, + { + "epoch": 0.5337, + "grad_norm": 5.9545674324035645, + "learning_rate": 1.0568670389177696e-05, + "loss": 0.8017, + "step": 10674 + }, + { + "epoch": 0.5338, + "grad_norm": 5.478658676147461, + "learning_rate": 1.0565185344820248e-05, + "loss": 1.2667, + "step": 10676 + }, + { + "epoch": 0.5339, + "grad_norm": 3.9246315956115723, + "learning_rate": 1.056170023159668e-05, + "loss": 1.1193, + "step": 10678 + }, + { + "epoch": 0.534, + "grad_norm": 3.961165189743042, + "learning_rate": 1.055821504993164e-05, + "loss": 0.8237, + "step": 10680 + }, + { + "epoch": 0.5341, + "grad_norm": 5.953139305114746, + "learning_rate": 1.0554729800249793e-05, + "loss": 1.1731, + "step": 10682 + }, + { + "epoch": 0.5342, + "grad_norm": 5.43373441696167, + "learning_rate": 1.0551244482975798e-05, + "loss": 1.3012, + "step": 10684 + }, + { + "epoch": 0.5343, + "grad_norm": 1.6397883892059326, + "learning_rate": 1.0547759098534335e-05, + "loss": 0.2635, + "step": 10686 + }, + { + "epoch": 0.5344, + "grad_norm": 10.203560829162598, + "learning_rate": 1.0544273647350091e-05, + "loss": 0.4118, + "step": 10688 + }, + { + "epoch": 0.5345, + "grad_norm": 1.7702304124832153, + "learning_rate": 1.0540788129847757e-05, + "loss": 0.8875, + "step": 10690 + }, + { + "epoch": 0.5346, + "grad_norm": 3.851076602935791, + "learning_rate": 1.0537302546452022e-05, + "loss": 0.9512, + "step": 10692 + }, + { + "epoch": 0.5347, + "grad_norm": 2.554675817489624, + "learning_rate": 1.0533816897587605e-05, + "loss": 0.9729, + "step": 10694 + }, + { + "epoch": 0.5348, + "grad_norm": 4.353896617889404, + "learning_rate": 1.053033118367922e-05, + "loss": 0.1024, + "step": 10696 + }, + { + "epoch": 0.5349, + "grad_norm": 3.0089669227600098, + "learning_rate": 1.0526845405151587e-05, + "loss": 1.0175, + "step": 10698 + }, + { + "epoch": 0.535, + "grad_norm": 6.844738006591797, + "learning_rate": 1.0523359562429441e-05, + "loss": 0.7671, + "step": 10700 + }, + { + "epoch": 0.5351, + "grad_norm": 4.052887916564941, + "learning_rate": 1.0519873655937515e-05, + "loss": 0.7622, + "step": 10702 + }, + { + "epoch": 0.5352, + "grad_norm": 7.800283432006836, + "learning_rate": 1.0516387686100566e-05, + "loss": 1.0151, + "step": 10704 + }, + { + "epoch": 0.5353, + "grad_norm": 3.629603147506714, + "learning_rate": 1.0512901653343343e-05, + "loss": 1.1672, + "step": 10706 + }, + { + "epoch": 0.5354, + "grad_norm": 5.361584186553955, + "learning_rate": 1.050941555809061e-05, + "loss": 0.8067, + "step": 10708 + }, + { + "epoch": 0.5355, + "grad_norm": 2.348989486694336, + "learning_rate": 1.0505929400767134e-05, + "loss": 2.0061, + "step": 10710 + }, + { + "epoch": 0.5356, + "grad_norm": 9.36653995513916, + "learning_rate": 1.0502443181797696e-05, + "loss": 1.2735, + "step": 10712 + }, + { + "epoch": 0.5357, + "grad_norm": 2.977250576019287, + "learning_rate": 1.0498956901607082e-05, + "loss": 1.0434, + "step": 10714 + }, + { + "epoch": 0.5358, + "grad_norm": 3.9870357513427734, + "learning_rate": 1.0495470560620082e-05, + "loss": 1.1602, + "step": 10716 + }, + { + "epoch": 0.5359, + "grad_norm": 10.870697975158691, + "learning_rate": 1.0491984159261496e-05, + "loss": 1.0642, + "step": 10718 + }, + { + "epoch": 0.536, + "grad_norm": 4.479517936706543, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.3119, + "step": 10720 + }, + { + "epoch": 0.5361, + "grad_norm": 5.155757427215576, + "learning_rate": 1.0485011177128808e-05, + "loss": 0.6542, + "step": 10722 + }, + { + "epoch": 0.5362, + "grad_norm": 1.9815778732299805, + "learning_rate": 1.0481524597204342e-05, + "loss": 0.8006, + "step": 10724 + }, + { + "epoch": 0.5363, + "grad_norm": 2.3657712936401367, + "learning_rate": 1.0478037958607568e-05, + "loss": 1.261, + "step": 10726 + }, + { + "epoch": 0.5364, + "grad_norm": 7.8118062019348145, + "learning_rate": 1.0474551261763315e-05, + "loss": 0.7997, + "step": 10728 + }, + { + "epoch": 0.5365, + "grad_norm": 4.466432571411133, + "learning_rate": 1.0471064507096427e-05, + "loss": 0.8037, + "step": 10730 + }, + { + "epoch": 0.5366, + "grad_norm": 18.000682830810547, + "learning_rate": 1.0467577695031763e-05, + "loss": 0.985, + "step": 10732 + }, + { + "epoch": 0.5367, + "grad_norm": 8.910954475402832, + "learning_rate": 1.0464090825994173e-05, + "loss": 1.5696, + "step": 10734 + }, + { + "epoch": 0.5368, + "grad_norm": 5.521282196044922, + "learning_rate": 1.0460603900408523e-05, + "loss": 1.2773, + "step": 10736 + }, + { + "epoch": 0.5369, + "grad_norm": 1.845330834388733, + "learning_rate": 1.0457116918699687e-05, + "loss": 0.458, + "step": 10738 + }, + { + "epoch": 0.537, + "grad_norm": 1.6379836797714233, + "learning_rate": 1.0453629881292537e-05, + "loss": 0.2146, + "step": 10740 + }, + { + "epoch": 0.5371, + "grad_norm": 2.5112059116363525, + "learning_rate": 1.0450142788611965e-05, + "loss": 0.7432, + "step": 10742 + }, + { + "epoch": 0.5372, + "grad_norm": 2.4650959968566895, + "learning_rate": 1.0446655641082864e-05, + "loss": 0.5472, + "step": 10744 + }, + { + "epoch": 0.5373, + "grad_norm": 3.898991584777832, + "learning_rate": 1.0443168439130123e-05, + "loss": 1.265, + "step": 10746 + }, + { + "epoch": 0.5374, + "grad_norm": 4.383487224578857, + "learning_rate": 1.043968118317865e-05, + "loss": 1.323, + "step": 10748 + }, + { + "epoch": 0.5375, + "grad_norm": 7.416353225708008, + "learning_rate": 1.0436193873653362e-05, + "loss": 0.9677, + "step": 10750 + }, + { + "epoch": 0.5376, + "grad_norm": 2.5394625663757324, + "learning_rate": 1.0432706510979172e-05, + "loss": 0.9568, + "step": 10752 + }, + { + "epoch": 0.5377, + "grad_norm": 3.6475162506103516, + "learning_rate": 1.0429219095581007e-05, + "loss": 0.9216, + "step": 10754 + }, + { + "epoch": 0.5378, + "grad_norm": 7.727352619171143, + "learning_rate": 1.0425731627883798e-05, + "loss": 1.5513, + "step": 10756 + }, + { + "epoch": 0.5379, + "grad_norm": 12.122932434082031, + "learning_rate": 1.042224410831248e-05, + "loss": 1.0508, + "step": 10758 + }, + { + "epoch": 0.538, + "grad_norm": 2.0808794498443604, + "learning_rate": 1.0418756537291996e-05, + "loss": 0.1272, + "step": 10760 + }, + { + "epoch": 0.5381, + "grad_norm": 3.390597343444824, + "learning_rate": 1.0415268915247303e-05, + "loss": 1.0783, + "step": 10762 + }, + { + "epoch": 0.5382, + "grad_norm": 3.1114649772644043, + "learning_rate": 1.0411781242603352e-05, + "loss": 0.9449, + "step": 10764 + }, + { + "epoch": 0.5383, + "grad_norm": 14.88010311126709, + "learning_rate": 1.0408293519785103e-05, + "loss": 0.9255, + "step": 10766 + }, + { + "epoch": 0.5384, + "grad_norm": 1.3849844932556152, + "learning_rate": 1.0404805747217525e-05, + "loss": 0.3103, + "step": 10768 + }, + { + "epoch": 0.5385, + "grad_norm": 3.9052865505218506, + "learning_rate": 1.0401317925325598e-05, + "loss": 0.8098, + "step": 10770 + }, + { + "epoch": 0.5386, + "grad_norm": 9.177569389343262, + "learning_rate": 1.03978300545343e-05, + "loss": 0.9835, + "step": 10772 + }, + { + "epoch": 0.5387, + "grad_norm": 3.44264554977417, + "learning_rate": 1.0394342135268613e-05, + "loss": 2.1587, + "step": 10774 + }, + { + "epoch": 0.5388, + "grad_norm": 2.767573356628418, + "learning_rate": 1.0390854167953537e-05, + "loss": 1.6677, + "step": 10776 + }, + { + "epoch": 0.5389, + "grad_norm": 3.3800923824310303, + "learning_rate": 1.0387366153014063e-05, + "loss": 1.4532, + "step": 10778 + }, + { + "epoch": 0.539, + "grad_norm": 2.220958709716797, + "learning_rate": 1.03838780908752e-05, + "loss": 0.5761, + "step": 10780 + }, + { + "epoch": 0.5391, + "grad_norm": 2.168627977371216, + "learning_rate": 1.0380389981961958e-05, + "loss": 0.8627, + "step": 10782 + }, + { + "epoch": 0.5392, + "grad_norm": 2.0268940925598145, + "learning_rate": 1.0376901826699349e-05, + "loss": 0.9038, + "step": 10784 + }, + { + "epoch": 0.5393, + "grad_norm": 2.4124977588653564, + "learning_rate": 1.0373413625512393e-05, + "loss": 1.2527, + "step": 10786 + }, + { + "epoch": 0.5394, + "grad_norm": 4.065725326538086, + "learning_rate": 1.036992537882612e-05, + "loss": 0.8676, + "step": 10788 + }, + { + "epoch": 0.5395, + "grad_norm": 2.5141334533691406, + "learning_rate": 1.0366437087065564e-05, + "loss": 1.0163, + "step": 10790 + }, + { + "epoch": 0.5396, + "grad_norm": 1.6842893362045288, + "learning_rate": 1.036294875065576e-05, + "loss": 0.9266, + "step": 10792 + }, + { + "epoch": 0.5397, + "grad_norm": 8.191102027893066, + "learning_rate": 1.035946037002175e-05, + "loss": 0.6647, + "step": 10794 + }, + { + "epoch": 0.5398, + "grad_norm": 8.25532054901123, + "learning_rate": 1.0355971945588586e-05, + "loss": 1.2239, + "step": 10796 + }, + { + "epoch": 0.5399, + "grad_norm": 2.648442029953003, + "learning_rate": 1.035248347778132e-05, + "loss": 0.7632, + "step": 10798 + }, + { + "epoch": 0.54, + "grad_norm": 6.957012176513672, + "learning_rate": 1.0348994967025012e-05, + "loss": 0.6479, + "step": 10800 + }, + { + "epoch": 0.5401, + "grad_norm": 4.212419509887695, + "learning_rate": 1.0345506413744726e-05, + "loss": 0.4687, + "step": 10802 + }, + { + "epoch": 0.5402, + "grad_norm": 2.89841628074646, + "learning_rate": 1.034201781836553e-05, + "loss": 1.2776, + "step": 10804 + }, + { + "epoch": 0.5403, + "grad_norm": 5.01346492767334, + "learning_rate": 1.0338529181312498e-05, + "loss": 0.818, + "step": 10806 + }, + { + "epoch": 0.5404, + "grad_norm": 4.27815055847168, + "learning_rate": 1.0335040503010715e-05, + "loss": 1.4844, + "step": 10808 + }, + { + "epoch": 0.5405, + "grad_norm": 4.613982200622559, + "learning_rate": 1.0331551783885263e-05, + "loss": 1.1072, + "step": 10810 + }, + { + "epoch": 0.5406, + "grad_norm": 4.406118869781494, + "learning_rate": 1.0328063024361232e-05, + "loss": 0.9937, + "step": 10812 + }, + { + "epoch": 0.5407, + "grad_norm": 3.6547439098358154, + "learning_rate": 1.0324574224863717e-05, + "loss": 1.1665, + "step": 10814 + }, + { + "epoch": 0.5408, + "grad_norm": 2.686237096786499, + "learning_rate": 1.0321085385817818e-05, + "loss": 0.918, + "step": 10816 + }, + { + "epoch": 0.5409, + "grad_norm": 2.322258949279785, + "learning_rate": 1.0317596507648638e-05, + "loss": 1.6465, + "step": 10818 + }, + { + "epoch": 0.541, + "grad_norm": 4.197561264038086, + "learning_rate": 1.0314107590781284e-05, + "loss": 0.9345, + "step": 10820 + }, + { + "epoch": 0.5411, + "grad_norm": 2.5226247310638428, + "learning_rate": 1.0310618635640876e-05, + "loss": 0.9831, + "step": 10822 + }, + { + "epoch": 0.5412, + "grad_norm": 6.996884822845459, + "learning_rate": 1.030712964265253e-05, + "loss": 0.6628, + "step": 10824 + }, + { + "epoch": 0.5413, + "grad_norm": 4.5918450355529785, + "learning_rate": 1.0303640612241364e-05, + "loss": 1.0838, + "step": 10826 + }, + { + "epoch": 0.5414, + "grad_norm": 4.295327186584473, + "learning_rate": 1.0300151544832513e-05, + "loss": 1.4363, + "step": 10828 + }, + { + "epoch": 0.5415, + "grad_norm": 4.825708866119385, + "learning_rate": 1.0296662440851108e-05, + "loss": 0.2755, + "step": 10830 + }, + { + "epoch": 0.5416, + "grad_norm": 5.579224109649658, + "learning_rate": 1.0293173300722286e-05, + "loss": 1.2495, + "step": 10832 + }, + { + "epoch": 0.5417, + "grad_norm": 11.263312339782715, + "learning_rate": 1.0289684124871181e-05, + "loss": 1.037, + "step": 10834 + }, + { + "epoch": 0.5418, + "grad_norm": 4.30113410949707, + "learning_rate": 1.0286194913722948e-05, + "loss": 0.2785, + "step": 10836 + }, + { + "epoch": 0.5419, + "grad_norm": 0.33840370178222656, + "learning_rate": 1.0282705667702734e-05, + "loss": 0.8488, + "step": 10838 + }, + { + "epoch": 0.542, + "grad_norm": 4.378471374511719, + "learning_rate": 1.0279216387235691e-05, + "loss": 1.0389, + "step": 10840 + }, + { + "epoch": 0.5421, + "grad_norm": 2.940242290496826, + "learning_rate": 1.0275727072746977e-05, + "loss": 1.0459, + "step": 10842 + }, + { + "epoch": 0.5422, + "grad_norm": 5.6172285079956055, + "learning_rate": 1.0272237724661753e-05, + "loss": 0.9211, + "step": 10844 + }, + { + "epoch": 0.5423, + "grad_norm": 5.694525241851807, + "learning_rate": 1.0268748343405192e-05, + "loss": 0.7434, + "step": 10846 + }, + { + "epoch": 0.5424, + "grad_norm": 5.986548900604248, + "learning_rate": 1.026525892940246e-05, + "loss": 0.7457, + "step": 10848 + }, + { + "epoch": 0.5425, + "grad_norm": 1.8946846723556519, + "learning_rate": 1.0261769483078734e-05, + "loss": 0.7864, + "step": 10850 + }, + { + "epoch": 0.5426, + "grad_norm": 0.40404531359672546, + "learning_rate": 1.0258280004859189e-05, + "loss": 0.3989, + "step": 10852 + }, + { + "epoch": 0.5427, + "grad_norm": 2.6801135540008545, + "learning_rate": 1.0254790495169006e-05, + "loss": 0.279, + "step": 10854 + }, + { + "epoch": 0.5428, + "grad_norm": 4.992667198181152, + "learning_rate": 1.0251300954433377e-05, + "loss": 0.8889, + "step": 10856 + }, + { + "epoch": 0.5429, + "grad_norm": 3.690079927444458, + "learning_rate": 1.0247811383077488e-05, + "loss": 0.5234, + "step": 10858 + }, + { + "epoch": 0.543, + "grad_norm": 4.179633140563965, + "learning_rate": 1.0244321781526533e-05, + "loss": 0.4336, + "step": 10860 + }, + { + "epoch": 0.5431, + "grad_norm": 5.53205680847168, + "learning_rate": 1.024083215020571e-05, + "loss": 1.3091, + "step": 10862 + }, + { + "epoch": 0.5432, + "grad_norm": 2.55328106880188, + "learning_rate": 1.0237342489540221e-05, + "loss": 1.0224, + "step": 10864 + }, + { + "epoch": 0.5433, + "grad_norm": 6.917214393615723, + "learning_rate": 1.0233852799955268e-05, + "loss": 0.8013, + "step": 10866 + }, + { + "epoch": 0.5434, + "grad_norm": 0.2927723526954651, + "learning_rate": 1.0230363081876065e-05, + "loss": 0.405, + "step": 10868 + }, + { + "epoch": 0.5435, + "grad_norm": 3.8913376331329346, + "learning_rate": 1.0226873335727815e-05, + "loss": 0.4029, + "step": 10870 + }, + { + "epoch": 0.5436, + "grad_norm": 3.3218536376953125, + "learning_rate": 1.0223383561935738e-05, + "loss": 0.6824, + "step": 10872 + }, + { + "epoch": 0.5437, + "grad_norm": 2.824510335922241, + "learning_rate": 1.0219893760925053e-05, + "loss": 1.7445, + "step": 10874 + }, + { + "epoch": 0.5438, + "grad_norm": 7.519796848297119, + "learning_rate": 1.0216403933120979e-05, + "loss": 0.8983, + "step": 10876 + }, + { + "epoch": 0.5439, + "grad_norm": 1.6692882776260376, + "learning_rate": 1.0212914078948741e-05, + "loss": 0.4806, + "step": 10878 + }, + { + "epoch": 0.544, + "grad_norm": 2.433941602706909, + "learning_rate": 1.0209424198833571e-05, + "loss": 1.1363, + "step": 10880 + }, + { + "epoch": 0.5441, + "grad_norm": 4.612814426422119, + "learning_rate": 1.0205934293200697e-05, + "loss": 0.4985, + "step": 10882 + }, + { + "epoch": 0.5442, + "grad_norm": 0.2956062853336334, + "learning_rate": 1.0202444362475352e-05, + "loss": 0.1501, + "step": 10884 + }, + { + "epoch": 0.5443, + "grad_norm": 2.130885124206543, + "learning_rate": 1.019895440708278e-05, + "loss": 0.4671, + "step": 10886 + }, + { + "epoch": 0.5444, + "grad_norm": 4.351564884185791, + "learning_rate": 1.0195464427448213e-05, + "loss": 1.231, + "step": 10888 + }, + { + "epoch": 0.5445, + "grad_norm": 7.825654029846191, + "learning_rate": 1.01919744239969e-05, + "loss": 1.359, + "step": 10890 + }, + { + "epoch": 0.5446, + "grad_norm": 2.4471089839935303, + "learning_rate": 1.0188484397154083e-05, + "loss": 1.1866, + "step": 10892 + }, + { + "epoch": 0.5447, + "grad_norm": 6.571068286895752, + "learning_rate": 1.0184994347345017e-05, + "loss": 0.5522, + "step": 10894 + }, + { + "epoch": 0.5448, + "grad_norm": 0.19640235602855682, + "learning_rate": 1.0181504274994949e-05, + "loss": 0.4452, + "step": 10896 + }, + { + "epoch": 0.5449, + "grad_norm": 2.9302594661712646, + "learning_rate": 1.0178014180529136e-05, + "loss": 0.9353, + "step": 10898 + }, + { + "epoch": 0.545, + "grad_norm": 3.578629970550537, + "learning_rate": 1.0174524064372837e-05, + "loss": 0.4942, + "step": 10900 + }, + { + "epoch": 0.5451, + "grad_norm": 2.403764009475708, + "learning_rate": 1.0171033926951305e-05, + "loss": 1.1169, + "step": 10902 + }, + { + "epoch": 0.5452, + "grad_norm": 2.763538360595703, + "learning_rate": 1.0167543768689816e-05, + "loss": 0.7781, + "step": 10904 + }, + { + "epoch": 0.5453, + "grad_norm": 3.6670854091644287, + "learning_rate": 1.0164053590013623e-05, + "loss": 0.7155, + "step": 10906 + }, + { + "epoch": 0.5454, + "grad_norm": 11.658125877380371, + "learning_rate": 1.0160563391347998e-05, + "loss": 0.8299, + "step": 10908 + }, + { + "epoch": 0.5455, + "grad_norm": 4.727957725524902, + "learning_rate": 1.0157073173118207e-05, + "loss": 1.1235, + "step": 10910 + }, + { + "epoch": 0.5456, + "grad_norm": 4.89735746383667, + "learning_rate": 1.0153582935749531e-05, + "loss": 1.7549, + "step": 10912 + }, + { + "epoch": 0.5457, + "grad_norm": 2.7570714950561523, + "learning_rate": 1.0150092679667239e-05, + "loss": 0.6636, + "step": 10914 + }, + { + "epoch": 0.5458, + "grad_norm": 5.784925937652588, + "learning_rate": 1.0146602405296608e-05, + "loss": 1.0694, + "step": 10916 + }, + { + "epoch": 0.5459, + "grad_norm": 2.223001718521118, + "learning_rate": 1.0143112113062919e-05, + "loss": 0.5772, + "step": 10918 + }, + { + "epoch": 0.546, + "grad_norm": 13.858793258666992, + "learning_rate": 1.0139621803391454e-05, + "loss": 2.1764, + "step": 10920 + }, + { + "epoch": 0.5461, + "grad_norm": 3.819236993789673, + "learning_rate": 1.0136131476707496e-05, + "loss": 0.4322, + "step": 10922 + }, + { + "epoch": 0.5462, + "grad_norm": 4.049009323120117, + "learning_rate": 1.013264113343633e-05, + "loss": 1.2831, + "step": 10924 + }, + { + "epoch": 0.5463, + "grad_norm": 0.5067835450172424, + "learning_rate": 1.0129150774003245e-05, + "loss": 0.2045, + "step": 10926 + }, + { + "epoch": 0.5464, + "grad_norm": 4.214424133300781, + "learning_rate": 1.0125660398833528e-05, + "loss": 0.4941, + "step": 10928 + }, + { + "epoch": 0.5465, + "grad_norm": 2.9976212978363037, + "learning_rate": 1.0122170008352472e-05, + "loss": 0.7002, + "step": 10930 + }, + { + "epoch": 0.5466, + "grad_norm": 2.475050210952759, + "learning_rate": 1.0118679602985373e-05, + "loss": 1.2513, + "step": 10932 + }, + { + "epoch": 0.5467, + "grad_norm": 5.261926174163818, + "learning_rate": 1.0115189183157523e-05, + "loss": 0.6653, + "step": 10934 + }, + { + "epoch": 0.5468, + "grad_norm": 1.2247978448867798, + "learning_rate": 1.0111698749294223e-05, + "loss": 0.5669, + "step": 10936 + }, + { + "epoch": 0.5469, + "grad_norm": 4.475715160369873, + "learning_rate": 1.0108208301820768e-05, + "loss": 0.3505, + "step": 10938 + }, + { + "epoch": 0.547, + "grad_norm": 6.182151794433594, + "learning_rate": 1.010471784116246e-05, + "loss": 0.841, + "step": 10940 + }, + { + "epoch": 0.5471, + "grad_norm": 5.953324317932129, + "learning_rate": 1.0101227367744599e-05, + "loss": 0.8785, + "step": 10942 + }, + { + "epoch": 0.5472, + "grad_norm": 9.253725051879883, + "learning_rate": 1.0097736881992492e-05, + "loss": 1.2029, + "step": 10944 + }, + { + "epoch": 0.5473, + "grad_norm": 5.015621662139893, + "learning_rate": 1.0094246384331444e-05, + "loss": 0.6738, + "step": 10946 + }, + { + "epoch": 0.5474, + "grad_norm": 6.143064022064209, + "learning_rate": 1.0090755875186752e-05, + "loss": 0.3485, + "step": 10948 + }, + { + "epoch": 0.5475, + "grad_norm": 8.406189918518066, + "learning_rate": 1.008726535498374e-05, + "loss": 1.1868, + "step": 10950 + }, + { + "epoch": 0.5476, + "grad_norm": 0.13144558668136597, + "learning_rate": 1.0083774824147707e-05, + "loss": 0.0104, + "step": 10952 + }, + { + "epoch": 0.5477, + "grad_norm": 2.9619758129119873, + "learning_rate": 1.0080284283103965e-05, + "loss": 0.8676, + "step": 10954 + }, + { + "epoch": 0.5478, + "grad_norm": 5.885258197784424, + "learning_rate": 1.007679373227783e-05, + "loss": 1.1124, + "step": 10956 + }, + { + "epoch": 0.5479, + "grad_norm": 4.258270740509033, + "learning_rate": 1.0073303172094607e-05, + "loss": 0.8479, + "step": 10958 + }, + { + "epoch": 0.548, + "grad_norm": 11.84040641784668, + "learning_rate": 1.0069812602979617e-05, + "loss": 1.7757, + "step": 10960 + }, + { + "epoch": 0.5481, + "grad_norm": 3.4450132846832275, + "learning_rate": 1.0066322025358173e-05, + "loss": 1.5733, + "step": 10962 + }, + { + "epoch": 0.5482, + "grad_norm": 3.017760992050171, + "learning_rate": 1.0062831439655591e-05, + "loss": 0.4745, + "step": 10964 + }, + { + "epoch": 0.5483, + "grad_norm": 9.127965927124023, + "learning_rate": 1.005934084629719e-05, + "loss": 1.3052, + "step": 10966 + }, + { + "epoch": 0.5484, + "grad_norm": 1.9823741912841797, + "learning_rate": 1.0055850245708283e-05, + "loss": 0.7084, + "step": 10968 + }, + { + "epoch": 0.5485, + "grad_norm": 3.9367177486419678, + "learning_rate": 1.0052359638314195e-05, + "loss": 1.3177, + "step": 10970 + }, + { + "epoch": 0.5486, + "grad_norm": 11.50985050201416, + "learning_rate": 1.0048869024540247e-05, + "loss": 1.2922, + "step": 10972 + }, + { + "epoch": 0.5487, + "grad_norm": 3.2102627754211426, + "learning_rate": 1.0045378404811757e-05, + "loss": 1.1281, + "step": 10974 + }, + { + "epoch": 0.5488, + "grad_norm": 4.434567451477051, + "learning_rate": 1.0041887779554041e-05, + "loss": 0.4971, + "step": 10976 + }, + { + "epoch": 0.5489, + "grad_norm": 1.5449985265731812, + "learning_rate": 1.0038397149192426e-05, + "loss": 0.2584, + "step": 10978 + }, + { + "epoch": 0.549, + "grad_norm": 3.732191324234009, + "learning_rate": 1.0034906514152239e-05, + "loss": 0.5795, + "step": 10980 + }, + { + "epoch": 0.5491, + "grad_norm": 3.5206851959228516, + "learning_rate": 1.0031415874858796e-05, + "loss": 1.1679, + "step": 10982 + }, + { + "epoch": 0.5492, + "grad_norm": 3.503910541534424, + "learning_rate": 1.0027925231737428e-05, + "loss": 1.3201, + "step": 10984 + }, + { + "epoch": 0.5493, + "grad_norm": 5.558510780334473, + "learning_rate": 1.0024434585213452e-05, + "loss": 0.8427, + "step": 10986 + }, + { + "epoch": 0.5494, + "grad_norm": 0.5053393244743347, + "learning_rate": 1.0020943935712193e-05, + "loss": 0.5797, + "step": 10988 + }, + { + "epoch": 0.5495, + "grad_norm": 7.561919689178467, + "learning_rate": 1.0017453283658984e-05, + "loss": 1.3764, + "step": 10990 + }, + { + "epoch": 0.5496, + "grad_norm": 4.281322002410889, + "learning_rate": 1.0013962629479145e-05, + "loss": 0.4615, + "step": 10992 + }, + { + "epoch": 0.5497, + "grad_norm": 2.7793283462524414, + "learning_rate": 1.0010471973598002e-05, + "loss": 0.2648, + "step": 10994 + }, + { + "epoch": 0.5498, + "grad_norm": 2.696667432785034, + "learning_rate": 1.0006981316440876e-05, + "loss": 0.5673, + "step": 10996 + }, + { + "epoch": 0.5499, + "grad_norm": 4.628880977630615, + "learning_rate": 1.0003490658433102e-05, + "loss": 1.0105, + "step": 10998 + }, + { + "epoch": 0.55, + "grad_norm": 6.182992458343506, + "learning_rate": 1e-05, + "loss": 0.7249, + "step": 11000 + }, + { + "epoch": 0.5501, + "grad_norm": 28.15620231628418, + "learning_rate": 9.996509341566903e-06, + "loss": 1.2558, + "step": 11002 + }, + { + "epoch": 0.5502, + "grad_norm": 7.156608581542969, + "learning_rate": 9.993018683559126e-06, + "loss": 0.921, + "step": 11004 + }, + { + "epoch": 0.5503, + "grad_norm": 11.14852523803711, + "learning_rate": 9.989528026402003e-06, + "loss": 0.8109, + "step": 11006 + }, + { + "epoch": 0.5504, + "grad_norm": 4.888880729675293, + "learning_rate": 9.986037370520856e-06, + "loss": 1.0777, + "step": 11008 + }, + { + "epoch": 0.5505, + "grad_norm": 2.420140027999878, + "learning_rate": 9.982546716341019e-06, + "loss": 0.7492, + "step": 11010 + }, + { + "epoch": 0.5506, + "grad_norm": 1.8046997785568237, + "learning_rate": 9.979056064287807e-06, + "loss": 1.5284, + "step": 11012 + }, + { + "epoch": 0.5507, + "grad_norm": 2.8434739112854004, + "learning_rate": 9.975565414786551e-06, + "loss": 0.6804, + "step": 11014 + }, + { + "epoch": 0.5508, + "grad_norm": 5.19085693359375, + "learning_rate": 9.972074768262576e-06, + "loss": 1.2828, + "step": 11016 + }, + { + "epoch": 0.5509, + "grad_norm": 3.6332693099975586, + "learning_rate": 9.968584125141206e-06, + "loss": 0.9011, + "step": 11018 + }, + { + "epoch": 0.551, + "grad_norm": 4.89417839050293, + "learning_rate": 9.965093485847766e-06, + "loss": 0.3319, + "step": 11020 + }, + { + "epoch": 0.5511, + "grad_norm": 4.940824508666992, + "learning_rate": 9.961602850807575e-06, + "loss": 1.221, + "step": 11022 + }, + { + "epoch": 0.5512, + "grad_norm": 3.974336624145508, + "learning_rate": 9.958112220445964e-06, + "loss": 0.9614, + "step": 11024 + }, + { + "epoch": 0.5513, + "grad_norm": 2.200948476791382, + "learning_rate": 9.954621595188248e-06, + "loss": 0.7873, + "step": 11026 + }, + { + "epoch": 0.5514, + "grad_norm": 2.47896671295166, + "learning_rate": 9.951130975459758e-06, + "loss": 0.5752, + "step": 11028 + }, + { + "epoch": 0.5515, + "grad_norm": 4.293118476867676, + "learning_rate": 9.947640361685805e-06, + "loss": 1.1077, + "step": 11030 + }, + { + "epoch": 0.5516, + "grad_norm": 4.113046646118164, + "learning_rate": 9.944149754291719e-06, + "loss": 0.7455, + "step": 11032 + }, + { + "epoch": 0.5517, + "grad_norm": 2.4372265338897705, + "learning_rate": 9.940659153702813e-06, + "loss": 0.5979, + "step": 11034 + }, + { + "epoch": 0.5518, + "grad_norm": 3.9362378120422363, + "learning_rate": 9.937168560344412e-06, + "loss": 0.4831, + "step": 11036 + }, + { + "epoch": 0.5519, + "grad_norm": 15.112611770629883, + "learning_rate": 9.933677974641832e-06, + "loss": 0.5364, + "step": 11038 + }, + { + "epoch": 0.552, + "grad_norm": 6.507194995880127, + "learning_rate": 9.930187397020385e-06, + "loss": 0.9427, + "step": 11040 + }, + { + "epoch": 0.5521, + "grad_norm": 5.324178695678711, + "learning_rate": 9.926696827905395e-06, + "loss": 0.4985, + "step": 11042 + }, + { + "epoch": 0.5522, + "grad_norm": 5.25638484954834, + "learning_rate": 9.923206267722173e-06, + "loss": 1.1759, + "step": 11044 + }, + { + "epoch": 0.5523, + "grad_norm": 7.67529821395874, + "learning_rate": 9.919715716896037e-06, + "loss": 1.0655, + "step": 11046 + }, + { + "epoch": 0.5524, + "grad_norm": 4.736556529998779, + "learning_rate": 9.916225175852295e-06, + "loss": 0.945, + "step": 11048 + }, + { + "epoch": 0.5525, + "grad_norm": 3.1536006927490234, + "learning_rate": 9.912734645016262e-06, + "loss": 0.6008, + "step": 11050 + }, + { + "epoch": 0.5526, + "grad_norm": 5.421347618103027, + "learning_rate": 9.909244124813246e-06, + "loss": 1.0329, + "step": 11052 + }, + { + "epoch": 0.5527, + "grad_norm": 8.362146377563477, + "learning_rate": 9.905753615668561e-06, + "loss": 1.9626, + "step": 11054 + }, + { + "epoch": 0.5528, + "grad_norm": 10.638011932373047, + "learning_rate": 9.902263118007513e-06, + "loss": 0.7208, + "step": 11056 + }, + { + "epoch": 0.5529, + "grad_norm": 8.489493370056152, + "learning_rate": 9.898772632255403e-06, + "loss": 0.3164, + "step": 11058 + }, + { + "epoch": 0.553, + "grad_norm": 6.614761829376221, + "learning_rate": 9.895282158837545e-06, + "loss": 0.832, + "step": 11060 + }, + { + "epoch": 0.5531, + "grad_norm": 6.214354038238525, + "learning_rate": 9.891791698179236e-06, + "loss": 1.1932, + "step": 11062 + }, + { + "epoch": 0.5532, + "grad_norm": 9.896926879882812, + "learning_rate": 9.88830125070578e-06, + "loss": 1.3501, + "step": 11064 + }, + { + "epoch": 0.5533, + "grad_norm": 7.052300453186035, + "learning_rate": 9.884810816842477e-06, + "loss": 1.8046, + "step": 11066 + }, + { + "epoch": 0.5534, + "grad_norm": 22.28428077697754, + "learning_rate": 9.88132039701463e-06, + "loss": 1.63, + "step": 11068 + }, + { + "epoch": 0.5535, + "grad_norm": 33.34797286987305, + "learning_rate": 9.877829991647528e-06, + "loss": 1.6034, + "step": 11070 + }, + { + "epoch": 0.5536, + "grad_norm": 1.910385251045227, + "learning_rate": 9.874339601166474e-06, + "loss": 1.1384, + "step": 11072 + }, + { + "epoch": 0.5537, + "grad_norm": 2.8577330112457275, + "learning_rate": 9.87084922599676e-06, + "loss": 1.0708, + "step": 11074 + }, + { + "epoch": 0.5538, + "grad_norm": 2.8819119930267334, + "learning_rate": 9.867358866563674e-06, + "loss": 1.0871, + "step": 11076 + }, + { + "epoch": 0.5539, + "grad_norm": 3.886157512664795, + "learning_rate": 9.86386852329251e-06, + "loss": 1.0639, + "step": 11078 + }, + { + "epoch": 0.554, + "grad_norm": 1.80628502368927, + "learning_rate": 9.860378196608549e-06, + "loss": 0.6226, + "step": 11080 + }, + { + "epoch": 0.5541, + "grad_norm": 3.9751687049865723, + "learning_rate": 9.856887886937083e-06, + "loss": 0.9408, + "step": 11082 + }, + { + "epoch": 0.5542, + "grad_norm": 1.9997317790985107, + "learning_rate": 9.853397594703394e-06, + "loss": 0.9354, + "step": 11084 + }, + { + "epoch": 0.5543, + "grad_norm": 5.582647800445557, + "learning_rate": 9.849907320332766e-06, + "loss": 0.4965, + "step": 11086 + }, + { + "epoch": 0.5544, + "grad_norm": 4.30893087387085, + "learning_rate": 9.84641706425047e-06, + "loss": 0.4934, + "step": 11088 + }, + { + "epoch": 0.5545, + "grad_norm": 5.389228820800781, + "learning_rate": 9.842926826881796e-06, + "loss": 1.2982, + "step": 11090 + }, + { + "epoch": 0.5546, + "grad_norm": 2.013047218322754, + "learning_rate": 9.839436608652007e-06, + "loss": 0.1046, + "step": 11092 + }, + { + "epoch": 0.5547, + "grad_norm": 7.565781593322754, + "learning_rate": 9.83594640998638e-06, + "loss": 0.5206, + "step": 11094 + }, + { + "epoch": 0.5548, + "grad_norm": 4.326339244842529, + "learning_rate": 9.832456231310189e-06, + "loss": 0.5106, + "step": 11096 + }, + { + "epoch": 0.5549, + "grad_norm": 4.958189487457275, + "learning_rate": 9.828966073048693e-06, + "loss": 1.2077, + "step": 11098 + }, + { + "epoch": 0.555, + "grad_norm": 2.0068256855010986, + "learning_rate": 9.825475935627165e-06, + "loss": 0.776, + "step": 11100 + }, + { + "epoch": 0.5551, + "grad_norm": 2.847844362258911, + "learning_rate": 9.821985819470864e-06, + "loss": 0.5408, + "step": 11102 + }, + { + "epoch": 0.5552, + "grad_norm": 4.683908462524414, + "learning_rate": 9.818495725005053e-06, + "loss": 1.4, + "step": 11104 + }, + { + "epoch": 0.5553, + "grad_norm": 21.210586547851562, + "learning_rate": 9.815005652654985e-06, + "loss": 1.1931, + "step": 11106 + }, + { + "epoch": 0.5554, + "grad_norm": 3.428570508956909, + "learning_rate": 9.81151560284592e-06, + "loss": 0.4241, + "step": 11108 + }, + { + "epoch": 0.5555, + "grad_norm": 3.1283223628997803, + "learning_rate": 9.808025576003106e-06, + "loss": 1.0469, + "step": 11110 + }, + { + "epoch": 0.5556, + "grad_norm": 13.795999526977539, + "learning_rate": 9.80453557255179e-06, + "loss": 0.437, + "step": 11112 + }, + { + "epoch": 0.5557, + "grad_norm": 4.996266841888428, + "learning_rate": 9.801045592917227e-06, + "loss": 0.8906, + "step": 11114 + }, + { + "epoch": 0.5558, + "grad_norm": 3.8782007694244385, + "learning_rate": 9.79755563752465e-06, + "loss": 1.5612, + "step": 11116 + }, + { + "epoch": 0.5559, + "grad_norm": 0.9430502653121948, + "learning_rate": 9.794065706799307e-06, + "loss": 0.2533, + "step": 11118 + }, + { + "epoch": 0.556, + "grad_norm": 4.330248832702637, + "learning_rate": 9.790575801166432e-06, + "loss": 1.2056, + "step": 11120 + }, + { + "epoch": 0.5561, + "grad_norm": 2.8922007083892822, + "learning_rate": 9.78708592105126e-06, + "loss": 0.9568, + "step": 11122 + }, + { + "epoch": 0.5562, + "grad_norm": 8.159089088439941, + "learning_rate": 9.783596066879023e-06, + "loss": 0.9582, + "step": 11124 + }, + { + "epoch": 0.5563, + "grad_norm": 4.845394134521484, + "learning_rate": 9.78010623907495e-06, + "loss": 0.7923, + "step": 11126 + }, + { + "epoch": 0.5564, + "grad_norm": 8.418675422668457, + "learning_rate": 9.776616438064265e-06, + "loss": 2.4965, + "step": 11128 + }, + { + "epoch": 0.5565, + "grad_norm": 2.700693368911743, + "learning_rate": 9.773126664272186e-06, + "loss": 1.1836, + "step": 11130 + }, + { + "epoch": 0.5566, + "grad_norm": 8.392342567443848, + "learning_rate": 9.76963691812394e-06, + "loss": 0.9863, + "step": 11132 + }, + { + "epoch": 0.5567, + "grad_norm": 25.35110092163086, + "learning_rate": 9.766147200044731e-06, + "loss": 1.1326, + "step": 11134 + }, + { + "epoch": 0.5568, + "grad_norm": 4.845486640930176, + "learning_rate": 9.762657510459784e-06, + "loss": 0.7857, + "step": 11136 + }, + { + "epoch": 0.5569, + "grad_norm": 2.797741413116455, + "learning_rate": 9.759167849794293e-06, + "loss": 1.1081, + "step": 11138 + }, + { + "epoch": 0.557, + "grad_norm": 10.850926399230957, + "learning_rate": 9.75567821847347e-06, + "loss": 0.9245, + "step": 11140 + }, + { + "epoch": 0.5571, + "grad_norm": 2.9017813205718994, + "learning_rate": 9.752188616922517e-06, + "loss": 0.6996, + "step": 11142 + }, + { + "epoch": 0.5572, + "grad_norm": 3.006638765335083, + "learning_rate": 9.748699045566626e-06, + "loss": 0.9037, + "step": 11144 + }, + { + "epoch": 0.5573, + "grad_norm": 1.3651866912841797, + "learning_rate": 9.745209504830997e-06, + "loss": 0.5282, + "step": 11146 + }, + { + "epoch": 0.5574, + "grad_norm": 3.9401512145996094, + "learning_rate": 9.741719995140814e-06, + "loss": 0.9198, + "step": 11148 + }, + { + "epoch": 0.5575, + "grad_norm": 2.145486831665039, + "learning_rate": 9.738230516921272e-06, + "loss": 0.8808, + "step": 11150 + }, + { + "epoch": 0.5576, + "grad_norm": 4.733067035675049, + "learning_rate": 9.73474107059754e-06, + "loss": 1.0724, + "step": 11152 + }, + { + "epoch": 0.5577, + "grad_norm": 2.8456954956054688, + "learning_rate": 9.73125165659481e-06, + "loss": 0.744, + "step": 11154 + }, + { + "epoch": 0.5578, + "grad_norm": 4.340911388397217, + "learning_rate": 9.727762275338246e-06, + "loss": 1.3197, + "step": 11156 + }, + { + "epoch": 0.5579, + "grad_norm": 11.601901054382324, + "learning_rate": 9.724272927253025e-06, + "loss": 1.2573, + "step": 11158 + }, + { + "epoch": 0.558, + "grad_norm": 16.5797176361084, + "learning_rate": 9.720783612764314e-06, + "loss": 1.376, + "step": 11160 + }, + { + "epoch": 0.5581, + "grad_norm": 2.2633471488952637, + "learning_rate": 9.717294332297269e-06, + "loss": 1.198, + "step": 11162 + }, + { + "epoch": 0.5582, + "grad_norm": 3.386467933654785, + "learning_rate": 9.713805086277055e-06, + "loss": 1.006, + "step": 11164 + }, + { + "epoch": 0.5583, + "grad_norm": 3.841198682785034, + "learning_rate": 9.71031587512882e-06, + "loss": 0.7535, + "step": 11166 + }, + { + "epoch": 0.5584, + "grad_norm": 2.7600302696228027, + "learning_rate": 9.706826699277719e-06, + "loss": 1.1512, + "step": 11168 + }, + { + "epoch": 0.5585, + "grad_norm": 8.596510887145996, + "learning_rate": 9.703337559148892e-06, + "loss": 1.1953, + "step": 11170 + }, + { + "epoch": 0.5586, + "grad_norm": 5.507132530212402, + "learning_rate": 9.699848455167489e-06, + "loss": 0.7746, + "step": 11172 + }, + { + "epoch": 0.5587, + "grad_norm": 6.811617851257324, + "learning_rate": 9.696359387758638e-06, + "loss": 1.4819, + "step": 11174 + }, + { + "epoch": 0.5588, + "grad_norm": 5.03993034362793, + "learning_rate": 9.692870357347474e-06, + "loss": 0.5579, + "step": 11176 + }, + { + "epoch": 0.5589, + "grad_norm": 0.06438049674034119, + "learning_rate": 9.689381364359129e-06, + "loss": 0.7334, + "step": 11178 + }, + { + "epoch": 0.559, + "grad_norm": 2.8498904705047607, + "learning_rate": 9.685892409218718e-06, + "loss": 0.9788, + "step": 11180 + }, + { + "epoch": 0.5591, + "grad_norm": 3.900707244873047, + "learning_rate": 9.682403492351369e-06, + "loss": 0.6523, + "step": 11182 + }, + { + "epoch": 0.5592, + "grad_norm": 3.6745986938476562, + "learning_rate": 9.678914614182185e-06, + "loss": 0.91, + "step": 11184 + }, + { + "epoch": 0.5593, + "grad_norm": 4.902849197387695, + "learning_rate": 9.675425775136286e-06, + "loss": 0.7088, + "step": 11186 + }, + { + "epoch": 0.5594, + "grad_norm": 9.364858627319336, + "learning_rate": 9.671936975638768e-06, + "loss": 0.731, + "step": 11188 + }, + { + "epoch": 0.5595, + "grad_norm": 5.160921096801758, + "learning_rate": 9.668448216114739e-06, + "loss": 0.8691, + "step": 11190 + }, + { + "epoch": 0.5596, + "grad_norm": 7.221482753753662, + "learning_rate": 9.664959496989286e-06, + "loss": 0.9517, + "step": 11192 + }, + { + "epoch": 0.5597, + "grad_norm": 5.415144920349121, + "learning_rate": 9.661470818687503e-06, + "loss": 1.0659, + "step": 11194 + }, + { + "epoch": 0.5598, + "grad_norm": 5.5260443687438965, + "learning_rate": 9.657982181634476e-06, + "loss": 0.7087, + "step": 11196 + }, + { + "epoch": 0.5599, + "grad_norm": 0.05489982292056084, + "learning_rate": 9.654493586255279e-06, + "loss": 0.2773, + "step": 11198 + }, + { + "epoch": 0.56, + "grad_norm": 12.514266967773438, + "learning_rate": 9.651005032974994e-06, + "loss": 1.0542, + "step": 11200 + }, + { + "epoch": 0.5601, + "grad_norm": 8.219743728637695, + "learning_rate": 9.647516522218683e-06, + "loss": 0.8697, + "step": 11202 + }, + { + "epoch": 0.5602, + "grad_norm": 3.2738049030303955, + "learning_rate": 9.644028054411416e-06, + "loss": 0.891, + "step": 11204 + }, + { + "epoch": 0.5603, + "grad_norm": 6.262545108795166, + "learning_rate": 9.64053962997825e-06, + "loss": 0.3765, + "step": 11206 + }, + { + "epoch": 0.5604, + "grad_norm": 4.408055305480957, + "learning_rate": 9.637051249344244e-06, + "loss": 1.1578, + "step": 11208 + }, + { + "epoch": 0.5605, + "grad_norm": 3.811365842819214, + "learning_rate": 9.633562912934436e-06, + "loss": 0.995, + "step": 11210 + }, + { + "epoch": 0.5606, + "grad_norm": 6.340567588806152, + "learning_rate": 9.630074621173882e-06, + "loss": 1.1293, + "step": 11212 + }, + { + "epoch": 0.5607, + "grad_norm": 4.7449259757995605, + "learning_rate": 9.626586374487612e-06, + "loss": 0.8034, + "step": 11214 + }, + { + "epoch": 0.5608, + "grad_norm": 3.924880266189575, + "learning_rate": 9.623098173300655e-06, + "loss": 0.9555, + "step": 11216 + }, + { + "epoch": 0.5609, + "grad_norm": 4.450799465179443, + "learning_rate": 9.619610018038049e-06, + "loss": 0.6274, + "step": 11218 + }, + { + "epoch": 0.561, + "grad_norm": 2.3373589515686035, + "learning_rate": 9.616121909124801e-06, + "loss": 0.9957, + "step": 11220 + }, + { + "epoch": 0.5611, + "grad_norm": 3.263828992843628, + "learning_rate": 9.61263384698594e-06, + "loss": 0.3695, + "step": 11222 + }, + { + "epoch": 0.5612, + "grad_norm": 3.4467055797576904, + "learning_rate": 9.609145832046465e-06, + "loss": 0.8183, + "step": 11224 + }, + { + "epoch": 0.5613, + "grad_norm": 3.180023431777954, + "learning_rate": 9.605657864731388e-06, + "loss": 0.7882, + "step": 11226 + }, + { + "epoch": 0.5614, + "grad_norm": 1.9174391031265259, + "learning_rate": 9.602169945465702e-06, + "loss": 0.1801, + "step": 11228 + }, + { + "epoch": 0.5615, + "grad_norm": 2.5744967460632324, + "learning_rate": 9.598682074674405e-06, + "loss": 0.9623, + "step": 11230 + }, + { + "epoch": 0.5616, + "grad_norm": 4.386689186096191, + "learning_rate": 9.595194252782476e-06, + "loss": 1.5748, + "step": 11232 + }, + { + "epoch": 0.5617, + "grad_norm": 3.275697946548462, + "learning_rate": 9.5917064802149e-06, + "loss": 0.467, + "step": 11234 + }, + { + "epoch": 0.5618, + "grad_norm": 8.462721824645996, + "learning_rate": 9.588218757396655e-06, + "loss": 1.5534, + "step": 11236 + }, + { + "epoch": 0.5619, + "grad_norm": 0.8706592917442322, + "learning_rate": 9.5847310847527e-06, + "loss": 0.4301, + "step": 11238 + }, + { + "epoch": 0.562, + "grad_norm": 4.041539192199707, + "learning_rate": 9.581243462708007e-06, + "loss": 0.7305, + "step": 11240 + }, + { + "epoch": 0.5621, + "grad_norm": 3.687525987625122, + "learning_rate": 9.577755891687523e-06, + "loss": 1.6108, + "step": 11242 + }, + { + "epoch": 0.5622, + "grad_norm": 5.77704381942749, + "learning_rate": 9.574268372116205e-06, + "loss": 0.2201, + "step": 11244 + }, + { + "epoch": 0.5623, + "grad_norm": 6.85079288482666, + "learning_rate": 9.570780904418994e-06, + "loss": 1.2334, + "step": 11246 + }, + { + "epoch": 0.5624, + "grad_norm": 14.091039657592773, + "learning_rate": 9.567293489020831e-06, + "loss": 0.5988, + "step": 11248 + }, + { + "epoch": 0.5625, + "grad_norm": 4.236629962921143, + "learning_rate": 9.563806126346643e-06, + "loss": 0.7193, + "step": 11250 + }, + { + "epoch": 0.5626, + "grad_norm": 5.726737022399902, + "learning_rate": 9.560318816821354e-06, + "loss": 0.8865, + "step": 11252 + }, + { + "epoch": 0.5627, + "grad_norm": 2.721803903579712, + "learning_rate": 9.556831560869882e-06, + "loss": 0.768, + "step": 11254 + }, + { + "epoch": 0.5628, + "grad_norm": 5.717682838439941, + "learning_rate": 9.553344358917141e-06, + "loss": 1.884, + "step": 11256 + }, + { + "epoch": 0.5629, + "grad_norm": 3.64707612991333, + "learning_rate": 9.549857211388037e-06, + "loss": 1.284, + "step": 11258 + }, + { + "epoch": 0.563, + "grad_norm": 6.9798784255981445, + "learning_rate": 9.546370118707463e-06, + "loss": 1.0105, + "step": 11260 + }, + { + "epoch": 0.5631, + "grad_norm": 8.171552658081055, + "learning_rate": 9.542883081300316e-06, + "loss": 1.154, + "step": 11262 + }, + { + "epoch": 0.5632, + "grad_norm": 8.249751091003418, + "learning_rate": 9.539396099591477e-06, + "loss": 0.799, + "step": 11264 + }, + { + "epoch": 0.5633, + "grad_norm": 3.051008701324463, + "learning_rate": 9.53590917400583e-06, + "loss": 0.7396, + "step": 11266 + }, + { + "epoch": 0.5634, + "grad_norm": 3.3676815032958984, + "learning_rate": 9.532422304968243e-06, + "loss": 0.3894, + "step": 11268 + }, + { + "epoch": 0.5635, + "grad_norm": 2.7232282161712646, + "learning_rate": 9.528935492903575e-06, + "loss": 0.6909, + "step": 11270 + }, + { + "epoch": 0.5636, + "grad_norm": 0.28888145089149475, + "learning_rate": 9.525448738236691e-06, + "loss": 0.044, + "step": 11272 + }, + { + "epoch": 0.5637, + "grad_norm": 5.664491653442383, + "learning_rate": 9.521962041392436e-06, + "loss": 1.4115, + "step": 11274 + }, + { + "epoch": 0.5638, + "grad_norm": 8.358799934387207, + "learning_rate": 9.518475402795661e-06, + "loss": 0.9542, + "step": 11276 + }, + { + "epoch": 0.5639, + "grad_norm": 7.204640865325928, + "learning_rate": 9.514988822871194e-06, + "loss": 0.7419, + "step": 11278 + }, + { + "epoch": 0.564, + "grad_norm": 1.3092947006225586, + "learning_rate": 9.511502302043867e-06, + "loss": 0.7403, + "step": 11280 + }, + { + "epoch": 0.5641, + "grad_norm": 4.2047295570373535, + "learning_rate": 9.508015840738504e-06, + "loss": 0.7474, + "step": 11282 + }, + { + "epoch": 0.5642, + "grad_norm": 12.605886459350586, + "learning_rate": 9.504529439379921e-06, + "loss": 1.5131, + "step": 11284 + }, + { + "epoch": 0.5643, + "grad_norm": 4.18076753616333, + "learning_rate": 9.501043098392923e-06, + "loss": 0.6649, + "step": 11286 + }, + { + "epoch": 0.5644, + "grad_norm": 2.928281784057617, + "learning_rate": 9.497556818202306e-06, + "loss": 0.6329, + "step": 11288 + }, + { + "epoch": 0.5645, + "grad_norm": 3.869673252105713, + "learning_rate": 9.494070599232868e-06, + "loss": 0.4368, + "step": 11290 + }, + { + "epoch": 0.5646, + "grad_norm": 15.152042388916016, + "learning_rate": 9.490584441909392e-06, + "loss": 0.7085, + "step": 11292 + }, + { + "epoch": 0.5647, + "grad_norm": 6.556199550628662, + "learning_rate": 9.48709834665666e-06, + "loss": 0.6045, + "step": 11294 + }, + { + "epoch": 0.5648, + "grad_norm": 4.206418991088867, + "learning_rate": 9.483612313899436e-06, + "loss": 1.3491, + "step": 11296 + }, + { + "epoch": 0.5649, + "grad_norm": 15.046170234680176, + "learning_rate": 9.480126344062486e-06, + "loss": 1.3731, + "step": 11298 + }, + { + "epoch": 0.565, + "grad_norm": 4.3392109870910645, + "learning_rate": 9.476640437570562e-06, + "loss": 0.8196, + "step": 11300 + }, + { + "epoch": 0.5651, + "grad_norm": 4.848312854766846, + "learning_rate": 9.473154594848415e-06, + "loss": 1.0349, + "step": 11302 + }, + { + "epoch": 0.5652, + "grad_norm": 5.496140003204346, + "learning_rate": 9.469668816320785e-06, + "loss": 1.5787, + "step": 11304 + }, + { + "epoch": 0.5653, + "grad_norm": 3.2069156169891357, + "learning_rate": 9.466183102412397e-06, + "loss": 0.4873, + "step": 11306 + }, + { + "epoch": 0.5654, + "grad_norm": 3.634531259536743, + "learning_rate": 9.46269745354798e-06, + "loss": 1.1918, + "step": 11308 + }, + { + "epoch": 0.5655, + "grad_norm": 5.699024677276611, + "learning_rate": 9.459211870152247e-06, + "loss": 1.682, + "step": 11310 + }, + { + "epoch": 0.5656, + "grad_norm": 6.888819694519043, + "learning_rate": 9.45572635264991e-06, + "loss": 0.2586, + "step": 11312 + }, + { + "epoch": 0.5657, + "grad_norm": 0.7101389765739441, + "learning_rate": 9.452240901465663e-06, + "loss": 1.2759, + "step": 11314 + }, + { + "epoch": 0.5658, + "grad_norm": 0.2781287729740143, + "learning_rate": 9.448755517024207e-06, + "loss": 0.4907, + "step": 11316 + }, + { + "epoch": 0.5659, + "grad_norm": 2.201090097427368, + "learning_rate": 9.445270199750212e-06, + "loss": 1.2717, + "step": 11318 + }, + { + "epoch": 0.566, + "grad_norm": 3.9526476860046387, + "learning_rate": 9.441784950068362e-06, + "loss": 1.0541, + "step": 11320 + }, + { + "epoch": 0.5661, + "grad_norm": 6.678097248077393, + "learning_rate": 9.438299768403327e-06, + "loss": 0.5343, + "step": 11322 + }, + { + "epoch": 0.5662, + "grad_norm": 6.076599597930908, + "learning_rate": 9.434814655179756e-06, + "loss": 1.814, + "step": 11324 + }, + { + "epoch": 0.5663, + "grad_norm": 5.044340133666992, + "learning_rate": 9.43132961082231e-06, + "loss": 1.5302, + "step": 11326 + }, + { + "epoch": 0.5664, + "grad_norm": 2.798027276992798, + "learning_rate": 9.42784463575562e-06, + "loss": 3.1706, + "step": 11328 + }, + { + "epoch": 0.5665, + "grad_norm": 2.8769588470458984, + "learning_rate": 9.424359730404329e-06, + "loss": 0.7312, + "step": 11330 + }, + { + "epoch": 0.5666, + "grad_norm": 4.478175163269043, + "learning_rate": 9.420874895193056e-06, + "loss": 0.9224, + "step": 11332 + }, + { + "epoch": 0.5667, + "grad_norm": 3.3850293159484863, + "learning_rate": 9.417390130546425e-06, + "loss": 1.5029, + "step": 11334 + }, + { + "epoch": 0.5668, + "grad_norm": 9.837632179260254, + "learning_rate": 9.413905436889035e-06, + "loss": 0.5137, + "step": 11336 + }, + { + "epoch": 0.5669, + "grad_norm": 4.558687686920166, + "learning_rate": 9.410420814645493e-06, + "loss": 1.2704, + "step": 11338 + }, + { + "epoch": 0.567, + "grad_norm": 8.969514846801758, + "learning_rate": 9.406936264240386e-06, + "loss": 0.6461, + "step": 11340 + }, + { + "epoch": 0.5671, + "grad_norm": 4.787768840789795, + "learning_rate": 9.403451786098295e-06, + "loss": 0.9901, + "step": 11342 + }, + { + "epoch": 0.5672, + "grad_norm": 2.6523096561431885, + "learning_rate": 9.399967380643795e-06, + "loss": 1.284, + "step": 11344 + }, + { + "epoch": 0.5673, + "grad_norm": 4.368015766143799, + "learning_rate": 9.396483048301448e-06, + "loss": 1.0358, + "step": 11346 + }, + { + "epoch": 0.5674, + "grad_norm": 6.00566291809082, + "learning_rate": 9.392998789495813e-06, + "loss": 1.271, + "step": 11348 + }, + { + "epoch": 0.5675, + "grad_norm": 3.7697649002075195, + "learning_rate": 9.38951460465143e-06, + "loss": 0.7339, + "step": 11350 + }, + { + "epoch": 0.5676, + "grad_norm": 3.436737060546875, + "learning_rate": 9.386030494192847e-06, + "loss": 0.4447, + "step": 11352 + }, + { + "epoch": 0.5677, + "grad_norm": 3.9443883895874023, + "learning_rate": 9.382546458544582e-06, + "loss": 1.5735, + "step": 11354 + }, + { + "epoch": 0.5678, + "grad_norm": 5.995265007019043, + "learning_rate": 9.379062498131161e-06, + "loss": 0.7212, + "step": 11356 + }, + { + "epoch": 0.5679, + "grad_norm": 4.405023097991943, + "learning_rate": 9.375578613377088e-06, + "loss": 1.4016, + "step": 11358 + }, + { + "epoch": 0.568, + "grad_norm": 20.79657745361328, + "learning_rate": 9.372094804706867e-06, + "loss": 1.2983, + "step": 11360 + }, + { + "epoch": 0.5681, + "grad_norm": 14.505037307739258, + "learning_rate": 9.368611072544993e-06, + "loss": 1.651, + "step": 11362 + }, + { + "epoch": 0.5682, + "grad_norm": 3.378523826599121, + "learning_rate": 9.36512741731594e-06, + "loss": 0.7379, + "step": 11364 + }, + { + "epoch": 0.5683, + "grad_norm": 4.25665283203125, + "learning_rate": 9.361643839444187e-06, + "loss": 0.8426, + "step": 11366 + }, + { + "epoch": 0.5684, + "grad_norm": 16.677446365356445, + "learning_rate": 9.358160339354194e-06, + "loss": 1.8487, + "step": 11368 + }, + { + "epoch": 0.5685, + "grad_norm": 5.622416973114014, + "learning_rate": 9.354676917470421e-06, + "loss": 1.1619, + "step": 11370 + }, + { + "epoch": 0.5686, + "grad_norm": 1.4949896335601807, + "learning_rate": 9.351193574217305e-06, + "loss": 0.2862, + "step": 11372 + }, + { + "epoch": 0.5687, + "grad_norm": 11.95829963684082, + "learning_rate": 9.347710310019288e-06, + "loss": 1.6149, + "step": 11374 + }, + { + "epoch": 0.5688, + "grad_norm": 3.0919594764709473, + "learning_rate": 9.344227125300788e-06, + "loss": 0.9515, + "step": 11376 + }, + { + "epoch": 0.5689, + "grad_norm": 10.261185646057129, + "learning_rate": 9.340744020486223e-06, + "loss": 0.9915, + "step": 11378 + }, + { + "epoch": 0.569, + "grad_norm": 2.698460817337036, + "learning_rate": 9.337260996000002e-06, + "loss": 0.6027, + "step": 11380 + }, + { + "epoch": 0.5691, + "grad_norm": 4.857678413391113, + "learning_rate": 9.333778052266514e-06, + "loss": 1.0353, + "step": 11382 + }, + { + "epoch": 0.5692, + "grad_norm": 7.366321086883545, + "learning_rate": 9.330295189710153e-06, + "loss": 0.7776, + "step": 11384 + }, + { + "epoch": 0.5693, + "grad_norm": 3.26983642578125, + "learning_rate": 9.32681240875529e-06, + "loss": 0.8081, + "step": 11386 + }, + { + "epoch": 0.5694, + "grad_norm": 2.419377088546753, + "learning_rate": 9.323329709826294e-06, + "loss": 0.2751, + "step": 11388 + }, + { + "epoch": 0.5695, + "grad_norm": 4.330422401428223, + "learning_rate": 9.319847093347522e-06, + "loss": 2.6656, + "step": 11390 + }, + { + "epoch": 0.5696, + "grad_norm": 6.486429214477539, + "learning_rate": 9.316364559743315e-06, + "loss": 1.2374, + "step": 11392 + }, + { + "epoch": 0.5697, + "grad_norm": 1.3693828582763672, + "learning_rate": 9.312882109438015e-06, + "loss": 0.8214, + "step": 11394 + }, + { + "epoch": 0.5698, + "grad_norm": 3.6879453659057617, + "learning_rate": 9.309399742855943e-06, + "loss": 0.7664, + "step": 11396 + }, + { + "epoch": 0.5699, + "grad_norm": 12.607263565063477, + "learning_rate": 9.305917460421421e-06, + "loss": 1.1525, + "step": 11398 + }, + { + "epoch": 0.57, + "grad_norm": 5.411014080047607, + "learning_rate": 9.302435262558748e-06, + "loss": 1.0791, + "step": 11400 + }, + { + "epoch": 0.5701, + "grad_norm": 5.00938606262207, + "learning_rate": 9.298953149692226e-06, + "loss": 0.5937, + "step": 11402 + }, + { + "epoch": 0.5702, + "grad_norm": 2.9765193462371826, + "learning_rate": 9.295471122246131e-06, + "loss": 0.8252, + "step": 11404 + }, + { + "epoch": 0.5703, + "grad_norm": 10.846722602844238, + "learning_rate": 9.291989180644747e-06, + "loss": 1.6957, + "step": 11406 + }, + { + "epoch": 0.5704, + "grad_norm": 6.878485679626465, + "learning_rate": 9.288507325312334e-06, + "loss": 1.1444, + "step": 11408 + }, + { + "epoch": 0.5705, + "grad_norm": 6.2209601402282715, + "learning_rate": 9.285025556673141e-06, + "loss": 1.0313, + "step": 11410 + }, + { + "epoch": 0.5706, + "grad_norm": 8.031928062438965, + "learning_rate": 9.281543875151419e-06, + "loss": 0.8731, + "step": 11412 + }, + { + "epoch": 0.5707, + "grad_norm": 4.566481113433838, + "learning_rate": 9.278062281171394e-06, + "loss": 1.0059, + "step": 11414 + }, + { + "epoch": 0.5708, + "grad_norm": 3.627838134765625, + "learning_rate": 9.274580775157294e-06, + "loss": 0.994, + "step": 11416 + }, + { + "epoch": 0.5709, + "grad_norm": 4.158268928527832, + "learning_rate": 9.271099357533323e-06, + "loss": 0.3935, + "step": 11418 + }, + { + "epoch": 0.571, + "grad_norm": 14.403902053833008, + "learning_rate": 9.267618028723687e-06, + "loss": 2.1235, + "step": 11420 + }, + { + "epoch": 0.5711, + "grad_norm": 4.2034687995910645, + "learning_rate": 9.264136789152567e-06, + "loss": 0.8262, + "step": 11422 + }, + { + "epoch": 0.5712, + "grad_norm": 6.672013282775879, + "learning_rate": 9.260655639244152e-06, + "loss": 1.1314, + "step": 11424 + }, + { + "epoch": 0.5713, + "grad_norm": 2.989569664001465, + "learning_rate": 9.257174579422605e-06, + "loss": 0.4107, + "step": 11426 + }, + { + "epoch": 0.5714, + "grad_norm": 2.239394187927246, + "learning_rate": 9.253693610112079e-06, + "loss": 1.1308, + "step": 11428 + }, + { + "epoch": 0.5715, + "grad_norm": 7.9396586418151855, + "learning_rate": 9.250212731736726e-06, + "loss": 0.8189, + "step": 11430 + }, + { + "epoch": 0.5716, + "grad_norm": 4.052202224731445, + "learning_rate": 9.246731944720675e-06, + "loss": 0.5677, + "step": 11432 + }, + { + "epoch": 0.5717, + "grad_norm": 1.300941824913025, + "learning_rate": 9.243251249488052e-06, + "loss": 0.2523, + "step": 11434 + }, + { + "epoch": 0.5718, + "grad_norm": 2.074373245239258, + "learning_rate": 9.239770646462968e-06, + "loss": 0.6533, + "step": 11436 + }, + { + "epoch": 0.5719, + "grad_norm": 0.20698483288288116, + "learning_rate": 9.236290136069528e-06, + "loss": 0.4066, + "step": 11438 + }, + { + "epoch": 0.572, + "grad_norm": 8.522555351257324, + "learning_rate": 9.232809718731815e-06, + "loss": 1.3398, + "step": 11440 + }, + { + "epoch": 0.5721, + "grad_norm": 2.627645254135132, + "learning_rate": 9.229329394873911e-06, + "loss": 1.8755, + "step": 11442 + }, + { + "epoch": 0.5722, + "grad_norm": 2.9210734367370605, + "learning_rate": 9.225849164919886e-06, + "loss": 0.9995, + "step": 11444 + }, + { + "epoch": 0.5723, + "grad_norm": 8.41727066040039, + "learning_rate": 9.222369029293788e-06, + "loss": 0.7177, + "step": 11446 + }, + { + "epoch": 0.5724, + "grad_norm": 10.753632545471191, + "learning_rate": 9.218888988419668e-06, + "loss": 1.6065, + "step": 11448 + }, + { + "epoch": 0.5725, + "grad_norm": 3.8227620124816895, + "learning_rate": 9.215409042721553e-06, + "loss": 0.7079, + "step": 11450 + }, + { + "epoch": 0.5726, + "grad_norm": 2.2617335319519043, + "learning_rate": 9.211929192623466e-06, + "loss": 0.4533, + "step": 11452 + }, + { + "epoch": 0.5727, + "grad_norm": 4.701323986053467, + "learning_rate": 9.208449438549417e-06, + "loss": 0.8583, + "step": 11454 + }, + { + "epoch": 0.5728, + "grad_norm": 5.6364898681640625, + "learning_rate": 9.204969780923404e-06, + "loss": 0.7891, + "step": 11456 + }, + { + "epoch": 0.5729, + "grad_norm": 3.7665746212005615, + "learning_rate": 9.201490220169409e-06, + "loss": 1.1157, + "step": 11458 + }, + { + "epoch": 0.573, + "grad_norm": 1.9933013916015625, + "learning_rate": 9.198010756711413e-06, + "loss": 0.8749, + "step": 11460 + }, + { + "epoch": 0.5731, + "grad_norm": 2.8332831859588623, + "learning_rate": 9.19453139097337e-06, + "loss": 0.7018, + "step": 11462 + }, + { + "epoch": 0.5732, + "grad_norm": 7.9686737060546875, + "learning_rate": 9.191052123379234e-06, + "loss": 0.9864, + "step": 11464 + }, + { + "epoch": 0.5733, + "grad_norm": 5.10330867767334, + "learning_rate": 9.187572954352947e-06, + "loss": 1.3447, + "step": 11466 + }, + { + "epoch": 0.5734, + "grad_norm": 6.696396827697754, + "learning_rate": 9.184093884318426e-06, + "loss": 0.8808, + "step": 11468 + }, + { + "epoch": 0.5735, + "grad_norm": 4.0705461502075195, + "learning_rate": 9.180614913699593e-06, + "loss": 1.1322, + "step": 11470 + }, + { + "epoch": 0.5736, + "grad_norm": 0.2628389000892639, + "learning_rate": 9.177136042920344e-06, + "loss": 0.5748, + "step": 11472 + }, + { + "epoch": 0.5737, + "grad_norm": 6.4634175300598145, + "learning_rate": 9.173657272404577e-06, + "loss": 0.6496, + "step": 11474 + }, + { + "epoch": 0.5738, + "grad_norm": 5.001357555389404, + "learning_rate": 9.170178602576161e-06, + "loss": 1.3845, + "step": 11476 + }, + { + "epoch": 0.5739, + "grad_norm": 4.571360111236572, + "learning_rate": 9.166700033858968e-06, + "loss": 0.7453, + "step": 11478 + }, + { + "epoch": 0.574, + "grad_norm": 3.932654619216919, + "learning_rate": 9.163221566676847e-06, + "loss": 0.8441, + "step": 11480 + }, + { + "epoch": 0.5741, + "grad_norm": 2.9968883991241455, + "learning_rate": 9.159743201453638e-06, + "loss": 0.7516, + "step": 11482 + }, + { + "epoch": 0.5742, + "grad_norm": 3.3962762355804443, + "learning_rate": 9.156264938613176e-06, + "loss": 0.619, + "step": 11484 + }, + { + "epoch": 0.5743, + "grad_norm": 4.813145160675049, + "learning_rate": 9.152786778579266e-06, + "loss": 0.6099, + "step": 11486 + }, + { + "epoch": 0.5744, + "grad_norm": 5.939298152923584, + "learning_rate": 9.14930872177572e-06, + "loss": 1.9044, + "step": 11488 + }, + { + "epoch": 0.5745, + "grad_norm": 6.673669338226318, + "learning_rate": 9.145830768626326e-06, + "loss": 0.3675, + "step": 11490 + }, + { + "epoch": 0.5746, + "grad_norm": 3.8419172763824463, + "learning_rate": 9.142352919554862e-06, + "loss": 0.7198, + "step": 11492 + }, + { + "epoch": 0.5747, + "grad_norm": 4.801333427429199, + "learning_rate": 9.13887517498509e-06, + "loss": 0.7706, + "step": 11494 + }, + { + "epoch": 0.5748, + "grad_norm": 4.357640266418457, + "learning_rate": 9.135397535340773e-06, + "loss": 1.3123, + "step": 11496 + }, + { + "epoch": 0.5749, + "grad_norm": 2.0789072513580322, + "learning_rate": 9.13192000104564e-06, + "loss": 0.7643, + "step": 11498 + }, + { + "epoch": 0.575, + "grad_norm": 4.977556228637695, + "learning_rate": 9.128442572523418e-06, + "loss": 1.0746, + "step": 11500 + }, + { + "epoch": 0.5751, + "grad_norm": 2.7062828540802, + "learning_rate": 9.124965250197831e-06, + "loss": 0.6268, + "step": 11502 + }, + { + "epoch": 0.5752, + "grad_norm": 3.6477832794189453, + "learning_rate": 9.121488034492569e-06, + "loss": 0.6139, + "step": 11504 + }, + { + "epoch": 0.5753, + "grad_norm": 8.113978385925293, + "learning_rate": 9.11801092583133e-06, + "loss": 0.6595, + "step": 11506 + }, + { + "epoch": 0.5754, + "grad_norm": 4.6164703369140625, + "learning_rate": 9.114533924637778e-06, + "loss": 0.7163, + "step": 11508 + }, + { + "epoch": 0.5755, + "grad_norm": 2.320918560028076, + "learning_rate": 9.111057031335586e-06, + "loss": 1.3211, + "step": 11510 + }, + { + "epoch": 0.5756, + "grad_norm": 2.6764883995056152, + "learning_rate": 9.107580246348395e-06, + "loss": 0.9597, + "step": 11512 + }, + { + "epoch": 0.5757, + "grad_norm": 7.684656143188477, + "learning_rate": 9.10410357009985e-06, + "loss": 0.6412, + "step": 11514 + }, + { + "epoch": 0.5758, + "grad_norm": 6.289089679718018, + "learning_rate": 9.100627003013563e-06, + "loss": 0.9395, + "step": 11516 + }, + { + "epoch": 0.5759, + "grad_norm": 8.113457679748535, + "learning_rate": 9.097150545513147e-06, + "loss": 0.8214, + "step": 11518 + }, + { + "epoch": 0.576, + "grad_norm": 2.8711862564086914, + "learning_rate": 9.093674198022201e-06, + "loss": 1.4861, + "step": 11520 + }, + { + "epoch": 0.5761, + "grad_norm": 2.4644315242767334, + "learning_rate": 9.090197960964301e-06, + "loss": 0.8813, + "step": 11522 + }, + { + "epoch": 0.5762, + "grad_norm": 8.175093650817871, + "learning_rate": 9.086721834763024e-06, + "loss": 1.4838, + "step": 11524 + }, + { + "epoch": 0.5763, + "grad_norm": 3.276071548461914, + "learning_rate": 9.083245819841919e-06, + "loss": 0.804, + "step": 11526 + }, + { + "epoch": 0.5764, + "grad_norm": 4.628453254699707, + "learning_rate": 9.07976991662453e-06, + "loss": 0.6022, + "step": 11528 + }, + { + "epoch": 0.5765, + "grad_norm": 7.269804000854492, + "learning_rate": 9.076294125534382e-06, + "loss": 0.8835, + "step": 11530 + }, + { + "epoch": 0.5766, + "grad_norm": 5.077812671661377, + "learning_rate": 9.072818446995e-06, + "loss": 0.4203, + "step": 11532 + }, + { + "epoch": 0.5767, + "grad_norm": 2.9371378421783447, + "learning_rate": 9.069342881429877e-06, + "loss": 1.0189, + "step": 11534 + }, + { + "epoch": 0.5768, + "grad_norm": 2.079460859298706, + "learning_rate": 9.065867429262497e-06, + "loss": 0.7882, + "step": 11536 + }, + { + "epoch": 0.5769, + "grad_norm": 4.860224723815918, + "learning_rate": 9.062392090916337e-06, + "loss": 1.0568, + "step": 11538 + }, + { + "epoch": 0.577, + "grad_norm": 2.1422343254089355, + "learning_rate": 9.058916866814857e-06, + "loss": 0.8998, + "step": 11540 + }, + { + "epoch": 0.5771, + "grad_norm": 3.6696112155914307, + "learning_rate": 9.055441757381505e-06, + "loss": 1.16, + "step": 11542 + }, + { + "epoch": 0.5772, + "grad_norm": 5.764104843139648, + "learning_rate": 9.051966763039706e-06, + "loss": 0.9831, + "step": 11544 + }, + { + "epoch": 0.5773, + "grad_norm": 5.388243675231934, + "learning_rate": 9.048491884212884e-06, + "loss": 0.9679, + "step": 11546 + }, + { + "epoch": 0.5774, + "grad_norm": 3.445575714111328, + "learning_rate": 9.045017121324438e-06, + "loss": 1.1481, + "step": 11548 + }, + { + "epoch": 0.5775, + "grad_norm": 3.237842321395874, + "learning_rate": 9.04154247479776e-06, + "loss": 1.2358, + "step": 11550 + }, + { + "epoch": 0.5776, + "grad_norm": 3.2256360054016113, + "learning_rate": 9.038067945056229e-06, + "loss": 0.9544, + "step": 11552 + }, + { + "epoch": 0.5777, + "grad_norm": 3.82309889793396, + "learning_rate": 9.034593532523192e-06, + "loss": 0.265, + "step": 11554 + }, + { + "epoch": 0.5778, + "grad_norm": 2.813231945037842, + "learning_rate": 9.031119237622011e-06, + "loss": 0.831, + "step": 11556 + }, + { + "epoch": 0.5779, + "grad_norm": 2.1588664054870605, + "learning_rate": 9.027645060776008e-06, + "loss": 0.6252, + "step": 11558 + }, + { + "epoch": 0.578, + "grad_norm": 3.0235841274261475, + "learning_rate": 9.024171002408507e-06, + "loss": 0.7363, + "step": 11560 + }, + { + "epoch": 0.5781, + "grad_norm": 7.6520676612854, + "learning_rate": 9.020697062942807e-06, + "loss": 1.4181, + "step": 11562 + }, + { + "epoch": 0.5782, + "grad_norm": 5.056787967681885, + "learning_rate": 9.017223242802205e-06, + "loss": 1.8965, + "step": 11564 + }, + { + "epoch": 0.5783, + "grad_norm": 1.2817721366882324, + "learning_rate": 9.013749542409962e-06, + "loss": 0.4767, + "step": 11566 + }, + { + "epoch": 0.5784, + "grad_norm": 3.8697702884674072, + "learning_rate": 9.01027596218935e-06, + "loss": 0.5488, + "step": 11568 + }, + { + "epoch": 0.5785, + "grad_norm": 4.351583480834961, + "learning_rate": 9.006802502563613e-06, + "loss": 1.3381, + "step": 11570 + }, + { + "epoch": 0.5786, + "grad_norm": 3.1439313888549805, + "learning_rate": 9.003329163955973e-06, + "loss": 1.5775, + "step": 11572 + }, + { + "epoch": 0.5787, + "grad_norm": 5.789834499359131, + "learning_rate": 8.999855946789652e-06, + "loss": 0.9927, + "step": 11574 + }, + { + "epoch": 0.5788, + "grad_norm": 4.168580055236816, + "learning_rate": 8.996382851487851e-06, + "loss": 1.0098, + "step": 11576 + }, + { + "epoch": 0.5789, + "grad_norm": 2.9130349159240723, + "learning_rate": 8.992909878473758e-06, + "loss": 1.0304, + "step": 11578 + }, + { + "epoch": 0.579, + "grad_norm": 2.079735040664673, + "learning_rate": 8.989437028170537e-06, + "loss": 0.5348, + "step": 11580 + }, + { + "epoch": 0.5791, + "grad_norm": 10.302947998046875, + "learning_rate": 8.985964301001354e-06, + "loss": 1.0301, + "step": 11582 + }, + { + "epoch": 0.5792, + "grad_norm": 6.909488677978516, + "learning_rate": 8.982491697389339e-06, + "loss": 1.1423, + "step": 11584 + }, + { + "epoch": 0.5793, + "grad_norm": 3.8500890731811523, + "learning_rate": 8.979019217757626e-06, + "loss": 0.3549, + "step": 11586 + }, + { + "epoch": 0.5794, + "grad_norm": 3.5564723014831543, + "learning_rate": 8.975546862529328e-06, + "loss": 0.9955, + "step": 11588 + }, + { + "epoch": 0.5795, + "grad_norm": 3.807506799697876, + "learning_rate": 8.972074632127533e-06, + "loss": 0.9199, + "step": 11590 + }, + { + "epoch": 0.5796, + "grad_norm": 15.365499496459961, + "learning_rate": 8.968602526975329e-06, + "loss": 0.9338, + "step": 11592 + }, + { + "epoch": 0.5797, + "grad_norm": 5.757442474365234, + "learning_rate": 8.965130547495777e-06, + "loss": 1.3174, + "step": 11594 + }, + { + "epoch": 0.5798, + "grad_norm": 7.1973443031311035, + "learning_rate": 8.961658694111929e-06, + "loss": 0.4729, + "step": 11596 + }, + { + "epoch": 0.5799, + "grad_norm": 5.232731342315674, + "learning_rate": 8.958186967246818e-06, + "loss": 0.5153, + "step": 11598 + }, + { + "epoch": 0.58, + "grad_norm": 2.4571545124053955, + "learning_rate": 8.954715367323468e-06, + "loss": 0.8774, + "step": 11600 + }, + { + "epoch": 0.5801, + "grad_norm": 4.394287109375, + "learning_rate": 8.951243894764876e-06, + "loss": 1.0578, + "step": 11602 + }, + { + "epoch": 0.5802, + "grad_norm": 3.4315638542175293, + "learning_rate": 8.947772549994037e-06, + "loss": 1.7071, + "step": 11604 + }, + { + "epoch": 0.5803, + "grad_norm": 7.503978252410889, + "learning_rate": 8.944301333433923e-06, + "loss": 2.1179, + "step": 11606 + }, + { + "epoch": 0.5804, + "grad_norm": 5.005895614624023, + "learning_rate": 8.940830245507483e-06, + "loss": 1.0369, + "step": 11608 + }, + { + "epoch": 0.5805, + "grad_norm": 7.20158576965332, + "learning_rate": 8.937359286637672e-06, + "loss": 0.4748, + "step": 11610 + }, + { + "epoch": 0.5806, + "grad_norm": 5.567436218261719, + "learning_rate": 8.933888457247402e-06, + "loss": 2.2024, + "step": 11612 + }, + { + "epoch": 0.5807, + "grad_norm": 1.0253318548202515, + "learning_rate": 8.930417757759593e-06, + "loss": 0.042, + "step": 11614 + }, + { + "epoch": 0.5808, + "grad_norm": 11.814465522766113, + "learning_rate": 8.926947188597133e-06, + "loss": 1.4344, + "step": 11616 + }, + { + "epoch": 0.5809, + "grad_norm": 3.5571792125701904, + "learning_rate": 8.923476750182908e-06, + "loss": 0.3598, + "step": 11618 + }, + { + "epoch": 0.581, + "grad_norm": 0.5962051749229431, + "learning_rate": 8.920006442939772e-06, + "loss": 0.6974, + "step": 11620 + }, + { + "epoch": 0.5811, + "grad_norm": 0.12930184602737427, + "learning_rate": 8.916536267290578e-06, + "loss": 0.3023, + "step": 11622 + }, + { + "epoch": 0.5812, + "grad_norm": 6.3639960289001465, + "learning_rate": 8.913066223658152e-06, + "loss": 1.546, + "step": 11624 + }, + { + "epoch": 0.5813, + "grad_norm": 1.599079966545105, + "learning_rate": 8.909596312465307e-06, + "loss": 1.0486, + "step": 11626 + }, + { + "epoch": 0.5814, + "grad_norm": 2.4319138526916504, + "learning_rate": 8.906126534134849e-06, + "loss": 1.1073, + "step": 11628 + }, + { + "epoch": 0.5815, + "grad_norm": 4.06691837310791, + "learning_rate": 8.902656889089548e-06, + "loss": 0.535, + "step": 11630 + }, + { + "epoch": 0.5816, + "grad_norm": 0.6922029256820679, + "learning_rate": 8.89918737775218e-06, + "loss": 0.3792, + "step": 11632 + }, + { + "epoch": 0.5817, + "grad_norm": 4.12180757522583, + "learning_rate": 8.895718000545489e-06, + "loss": 1.5821, + "step": 11634 + }, + { + "epoch": 0.5818, + "grad_norm": 5.119474411010742, + "learning_rate": 8.892248757892215e-06, + "loss": 0.4432, + "step": 11636 + }, + { + "epoch": 0.5819, + "grad_norm": 4.728977680206299, + "learning_rate": 8.888779650215068e-06, + "loss": 1.0798, + "step": 11638 + }, + { + "epoch": 0.582, + "grad_norm": 5.532359600067139, + "learning_rate": 8.885310677936746e-06, + "loss": 0.8106, + "step": 11640 + }, + { + "epoch": 0.5821, + "grad_norm": 2.2802605628967285, + "learning_rate": 8.88184184147994e-06, + "loss": 1.1374, + "step": 11642 + }, + { + "epoch": 0.5822, + "grad_norm": 5.3389573097229, + "learning_rate": 8.878373141267312e-06, + "loss": 0.6306, + "step": 11644 + }, + { + "epoch": 0.5823, + "grad_norm": 6.3855767250061035, + "learning_rate": 8.874904577721518e-06, + "loss": 0.9384, + "step": 11646 + }, + { + "epoch": 0.5824, + "grad_norm": 1.9214075803756714, + "learning_rate": 8.871436151265183e-06, + "loss": 0.3768, + "step": 11648 + }, + { + "epoch": 0.5825, + "grad_norm": 3.508728265762329, + "learning_rate": 8.867967862320935e-06, + "loss": 0.9435, + "step": 11650 + }, + { + "epoch": 0.5826, + "grad_norm": 2.7790815830230713, + "learning_rate": 8.864499711311362e-06, + "loss": 0.4192, + "step": 11652 + }, + { + "epoch": 0.5827, + "grad_norm": 5.02672815322876, + "learning_rate": 8.861031698659064e-06, + "loss": 1.2607, + "step": 11654 + }, + { + "epoch": 0.5828, + "grad_norm": 2.7053301334381104, + "learning_rate": 8.857563824786598e-06, + "loss": 0.5812, + "step": 11656 + }, + { + "epoch": 0.5829, + "grad_norm": 0.9037389755249023, + "learning_rate": 8.854096090116507e-06, + "loss": 0.7988, + "step": 11658 + }, + { + "epoch": 0.583, + "grad_norm": 3.5217530727386475, + "learning_rate": 8.850628495071336e-06, + "loss": 1.0887, + "step": 11660 + }, + { + "epoch": 0.5831, + "grad_norm": 5.014571666717529, + "learning_rate": 8.847161040073593e-06, + "loss": 0.9953, + "step": 11662 + }, + { + "epoch": 0.5832, + "grad_norm": 1.7187532186508179, + "learning_rate": 8.843693725545787e-06, + "loss": 0.6604, + "step": 11664 + }, + { + "epoch": 0.5833, + "grad_norm": 1.5337390899658203, + "learning_rate": 8.840226551910387e-06, + "loss": 1.4498, + "step": 11666 + }, + { + "epoch": 0.5834, + "grad_norm": 2.8842687606811523, + "learning_rate": 8.836759519589869e-06, + "loss": 0.9405, + "step": 11668 + }, + { + "epoch": 0.5835, + "grad_norm": 3.236513137817383, + "learning_rate": 8.833292629006669e-06, + "loss": 0.9175, + "step": 11670 + }, + { + "epoch": 0.5836, + "grad_norm": 8.989675521850586, + "learning_rate": 8.829825880583228e-06, + "loss": 0.7768, + "step": 11672 + }, + { + "epoch": 0.5837, + "grad_norm": 2.8371551036834717, + "learning_rate": 8.826359274741954e-06, + "loss": 0.7843, + "step": 11674 + }, + { + "epoch": 0.5838, + "grad_norm": 2.522266149520874, + "learning_rate": 8.822892811905237e-06, + "loss": 0.7835, + "step": 11676 + }, + { + "epoch": 0.5839, + "grad_norm": 4.7707624435424805, + "learning_rate": 8.819426492495464e-06, + "loss": 0.7276, + "step": 11678 + }, + { + "epoch": 0.584, + "grad_norm": 7.213516712188721, + "learning_rate": 8.815960316934991e-06, + "loss": 0.4202, + "step": 11680 + }, + { + "epoch": 0.5841, + "grad_norm": 13.5982084274292, + "learning_rate": 8.812494285646164e-06, + "loss": 1.3673, + "step": 11682 + }, + { + "epoch": 0.5842, + "grad_norm": 10.335946083068848, + "learning_rate": 8.809028399051302e-06, + "loss": 1.0005, + "step": 11684 + }, + { + "epoch": 0.5843, + "grad_norm": 4.348811149597168, + "learning_rate": 8.805562657572723e-06, + "loss": 0.3861, + "step": 11686 + }, + { + "epoch": 0.5844, + "grad_norm": 0.04533953592181206, + "learning_rate": 8.802097061632706e-06, + "loss": 0.2718, + "step": 11688 + }, + { + "epoch": 0.5845, + "grad_norm": 5.928267002105713, + "learning_rate": 8.79863161165353e-06, + "loss": 0.5692, + "step": 11690 + }, + { + "epoch": 0.5846, + "grad_norm": 2.794260263442993, + "learning_rate": 8.79516630805745e-06, + "loss": 0.5892, + "step": 11692 + }, + { + "epoch": 0.5847, + "grad_norm": 1.939380168914795, + "learning_rate": 8.791701151266696e-06, + "loss": 1.3158, + "step": 11694 + }, + { + "epoch": 0.5848, + "grad_norm": 1.423446774482727, + "learning_rate": 8.788236141703498e-06, + "loss": 0.048, + "step": 11696 + }, + { + "epoch": 0.5849, + "grad_norm": 4.620400428771973, + "learning_rate": 8.784771279790044e-06, + "loss": 1.3323, + "step": 11698 + }, + { + "epoch": 0.585, + "grad_norm": 4.0723700523376465, + "learning_rate": 8.781306565948528e-06, + "loss": 0.8914, + "step": 11700 + }, + { + "epoch": 0.5851, + "grad_norm": 4.0028815269470215, + "learning_rate": 8.777842000601106e-06, + "loss": 1.2098, + "step": 11702 + }, + { + "epoch": 0.5852, + "grad_norm": 5.455604553222656, + "learning_rate": 8.774377584169934e-06, + "loss": 0.9187, + "step": 11704 + }, + { + "epoch": 0.5853, + "grad_norm": 2.0121712684631348, + "learning_rate": 8.77091331707713e-06, + "loss": 0.7402, + "step": 11706 + }, + { + "epoch": 0.5854, + "grad_norm": 3.230285406112671, + "learning_rate": 8.767449199744813e-06, + "loss": 1.1298, + "step": 11708 + }, + { + "epoch": 0.5855, + "grad_norm": 5.414671897888184, + "learning_rate": 8.763985232595076e-06, + "loss": 1.3602, + "step": 11710 + }, + { + "epoch": 0.5856, + "grad_norm": 5.932653903961182, + "learning_rate": 8.760521416049983e-06, + "loss": 0.9199, + "step": 11712 + }, + { + "epoch": 0.5857, + "grad_norm": 2.8061699867248535, + "learning_rate": 8.757057750531602e-06, + "loss": 1.0119, + "step": 11714 + }, + { + "epoch": 0.5858, + "grad_norm": 2.3186538219451904, + "learning_rate": 8.753594236461957e-06, + "loss": 0.9332, + "step": 11716 + }, + { + "epoch": 0.5859, + "grad_norm": 7.8132195472717285, + "learning_rate": 8.750130874263078e-06, + "loss": 1.1571, + "step": 11718 + }, + { + "epoch": 0.586, + "grad_norm": 1.7299259901046753, + "learning_rate": 8.746667664356957e-06, + "loss": 0.4763, + "step": 11720 + }, + { + "epoch": 0.5861, + "grad_norm": 5.268085479736328, + "learning_rate": 8.743204607165584e-06, + "loss": 1.0142, + "step": 11722 + }, + { + "epoch": 0.5862, + "grad_norm": 6.28386926651001, + "learning_rate": 8.739741703110914e-06, + "loss": 1.2055, + "step": 11724 + }, + { + "epoch": 0.5863, + "grad_norm": 9.949395179748535, + "learning_rate": 8.736278952614899e-06, + "loss": 1.0465, + "step": 11726 + }, + { + "epoch": 0.5864, + "grad_norm": 7.8255414962768555, + "learning_rate": 8.732816356099455e-06, + "loss": 0.8642, + "step": 11728 + }, + { + "epoch": 0.5865, + "grad_norm": 1.9023224115371704, + "learning_rate": 8.729353913986495e-06, + "loss": 0.777, + "step": 11730 + }, + { + "epoch": 0.5866, + "grad_norm": 5.584822177886963, + "learning_rate": 8.725891626697912e-06, + "loss": 0.7452, + "step": 11732 + }, + { + "epoch": 0.5867, + "grad_norm": 8.322867393493652, + "learning_rate": 8.722429494655561e-06, + "loss": 0.8113, + "step": 11734 + }, + { + "epoch": 0.5868, + "grad_norm": 2.1434032917022705, + "learning_rate": 8.718967518281307e-06, + "loss": 0.3696, + "step": 11736 + }, + { + "epoch": 0.5869, + "grad_norm": 3.872633695602417, + "learning_rate": 8.715505697996972e-06, + "loss": 0.4266, + "step": 11738 + }, + { + "epoch": 0.587, + "grad_norm": 3.1607041358947754, + "learning_rate": 8.712044034224374e-06, + "loss": 1.6186, + "step": 11740 + }, + { + "epoch": 0.5871, + "grad_norm": 8.454407691955566, + "learning_rate": 8.7085825273853e-06, + "loss": 2.0675, + "step": 11742 + }, + { + "epoch": 0.5872, + "grad_norm": 2.1758475303649902, + "learning_rate": 8.705121177901532e-06, + "loss": 0.3701, + "step": 11744 + }, + { + "epoch": 0.5873, + "grad_norm": 3.338834047317505, + "learning_rate": 8.701659986194819e-06, + "loss": 1.1613, + "step": 11746 + }, + { + "epoch": 0.5874, + "grad_norm": 3.6825907230377197, + "learning_rate": 8.698198952686896e-06, + "loss": 0.8687, + "step": 11748 + }, + { + "epoch": 0.5875, + "grad_norm": 4.971210479736328, + "learning_rate": 8.694738077799487e-06, + "loss": 0.8793, + "step": 11750 + }, + { + "epoch": 0.5876, + "grad_norm": 4.423511981964111, + "learning_rate": 8.69127736195428e-06, + "loss": 1.2446, + "step": 11752 + }, + { + "epoch": 0.5877, + "grad_norm": 2.3552303314208984, + "learning_rate": 8.687816805572957e-06, + "loss": 0.9278, + "step": 11754 + }, + { + "epoch": 0.5878, + "grad_norm": 6.12208366394043, + "learning_rate": 8.684356409077177e-06, + "loss": 0.4274, + "step": 11756 + }, + { + "epoch": 0.5879, + "grad_norm": 3.4991657733917236, + "learning_rate": 8.680896172888577e-06, + "loss": 0.5982, + "step": 11758 + }, + { + "epoch": 0.588, + "grad_norm": 2.633103847503662, + "learning_rate": 8.677436097428775e-06, + "loss": 0.7996, + "step": 11760 + }, + { + "epoch": 0.5881, + "grad_norm": 3.0693352222442627, + "learning_rate": 8.673976183119377e-06, + "loss": 0.7549, + "step": 11762 + }, + { + "epoch": 0.5882, + "grad_norm": 6.514312744140625, + "learning_rate": 8.670516430381958e-06, + "loss": 1.0096, + "step": 11764 + }, + { + "epoch": 0.5883, + "grad_norm": 7.177332878112793, + "learning_rate": 8.667056839638077e-06, + "loss": 1.1925, + "step": 11766 + }, + { + "epoch": 0.5884, + "grad_norm": 4.447117805480957, + "learning_rate": 8.663597411309278e-06, + "loss": 1.0799, + "step": 11768 + }, + { + "epoch": 0.5885, + "grad_norm": 2.2120604515075684, + "learning_rate": 8.66013814581708e-06, + "loss": 1.4922, + "step": 11770 + }, + { + "epoch": 0.5886, + "grad_norm": 4.052025318145752, + "learning_rate": 8.656679043582986e-06, + "loss": 0.7718, + "step": 11772 + }, + { + "epoch": 0.5887, + "grad_norm": 2.6387932300567627, + "learning_rate": 8.653220105028476e-06, + "loss": 0.776, + "step": 11774 + }, + { + "epoch": 0.5888, + "grad_norm": 12.479037284851074, + "learning_rate": 8.649761330575009e-06, + "loss": 1.2439, + "step": 11776 + }, + { + "epoch": 0.5889, + "grad_norm": 2.5092222690582275, + "learning_rate": 8.646302720644028e-06, + "loss": 1.0932, + "step": 11778 + }, + { + "epoch": 0.589, + "grad_norm": 4.146707057952881, + "learning_rate": 8.642844275656957e-06, + "loss": 1.1725, + "step": 11780 + }, + { + "epoch": 0.5891, + "grad_norm": 2.315200090408325, + "learning_rate": 8.639385996035194e-06, + "loss": 0.4528, + "step": 11782 + }, + { + "epoch": 0.5892, + "grad_norm": 1.6456025838851929, + "learning_rate": 8.635927882200117e-06, + "loss": 0.475, + "step": 11784 + }, + { + "epoch": 0.5893, + "grad_norm": 5.5546135902404785, + "learning_rate": 8.632469934573095e-06, + "loss": 0.4279, + "step": 11786 + }, + { + "epoch": 0.5894, + "grad_norm": 10.149056434631348, + "learning_rate": 8.629012153575458e-06, + "loss": 0.8336, + "step": 11788 + }, + { + "epoch": 0.5895, + "grad_norm": 4.5069169998168945, + "learning_rate": 8.625554539628536e-06, + "loss": 0.7675, + "step": 11790 + }, + { + "epoch": 0.5896, + "grad_norm": 4.020544052124023, + "learning_rate": 8.62209709315362e-06, + "loss": 0.8338, + "step": 11792 + }, + { + "epoch": 0.5897, + "grad_norm": 3.7487146854400635, + "learning_rate": 8.618639814571996e-06, + "loss": 1.1235, + "step": 11794 + }, + { + "epoch": 0.5898, + "grad_norm": 11.103203773498535, + "learning_rate": 8.615182704304918e-06, + "loss": 1.6267, + "step": 11796 + }, + { + "epoch": 0.5899, + "grad_norm": 2.720534324645996, + "learning_rate": 8.611725762773631e-06, + "loss": 1.1304, + "step": 11798 + }, + { + "epoch": 0.59, + "grad_norm": 3.2859249114990234, + "learning_rate": 8.60826899039935e-06, + "loss": 0.6334, + "step": 11800 + }, + { + "epoch": 0.5901, + "grad_norm": 9.212902069091797, + "learning_rate": 8.604812387603265e-06, + "loss": 0.572, + "step": 11802 + }, + { + "epoch": 0.5902, + "grad_norm": 3.994434356689453, + "learning_rate": 8.601355954806562e-06, + "loss": 0.9232, + "step": 11804 + }, + { + "epoch": 0.5903, + "grad_norm": 1.807895541191101, + "learning_rate": 8.59789969243039e-06, + "loss": 1.035, + "step": 11806 + }, + { + "epoch": 0.5904, + "grad_norm": 3.6560261249542236, + "learning_rate": 8.594443600895892e-06, + "loss": 1.234, + "step": 11808 + }, + { + "epoch": 0.5905, + "grad_norm": 2.848947525024414, + "learning_rate": 8.590987680624174e-06, + "loss": 0.7804, + "step": 11810 + }, + { + "epoch": 0.5906, + "grad_norm": 4.706329345703125, + "learning_rate": 8.587531932036334e-06, + "loss": 1.6244, + "step": 11812 + }, + { + "epoch": 0.5907, + "grad_norm": 4.0511250495910645, + "learning_rate": 8.584076355553444e-06, + "loss": 0.8841, + "step": 11814 + }, + { + "epoch": 0.5908, + "grad_norm": 3.0225470066070557, + "learning_rate": 8.580620951596556e-06, + "loss": 0.974, + "step": 11816 + }, + { + "epoch": 0.5909, + "grad_norm": 5.929433822631836, + "learning_rate": 8.577165720586702e-06, + "loss": 1.5874, + "step": 11818 + }, + { + "epoch": 0.591, + "grad_norm": 9.393218994140625, + "learning_rate": 8.573710662944884e-06, + "loss": 0.7162, + "step": 11820 + }, + { + "epoch": 0.5911, + "grad_norm": 17.578317642211914, + "learning_rate": 8.570255779092098e-06, + "loss": 1.9719, + "step": 11822 + }, + { + "epoch": 0.5912, + "grad_norm": 1.9753743410110474, + "learning_rate": 8.566801069449307e-06, + "loss": 0.2136, + "step": 11824 + }, + { + "epoch": 0.5913, + "grad_norm": 3.2145302295684814, + "learning_rate": 8.563346534437461e-06, + "loss": 0.8449, + "step": 11826 + }, + { + "epoch": 0.5914, + "grad_norm": 6.984745502471924, + "learning_rate": 8.559892174477478e-06, + "loss": 0.9887, + "step": 11828 + }, + { + "epoch": 0.5915, + "grad_norm": 2.791529893875122, + "learning_rate": 8.55643798999027e-06, + "loss": 1.0509, + "step": 11830 + }, + { + "epoch": 0.5916, + "grad_norm": 0.6519387364387512, + "learning_rate": 8.552983981396709e-06, + "loss": 0.0215, + "step": 11832 + }, + { + "epoch": 0.5917, + "grad_norm": 0.26372092962265015, + "learning_rate": 8.549530149117664e-06, + "loss": 0.7268, + "step": 11834 + }, + { + "epoch": 0.5918, + "grad_norm": 2.5203170776367188, + "learning_rate": 8.546076493573973e-06, + "loss": 0.8925, + "step": 11836 + }, + { + "epoch": 0.5919, + "grad_norm": 3.1197524070739746, + "learning_rate": 8.542623015186445e-06, + "loss": 1.6686, + "step": 11838 + }, + { + "epoch": 0.592, + "grad_norm": 9.938714027404785, + "learning_rate": 8.539169714375885e-06, + "loss": 1.6229, + "step": 11840 + }, + { + "epoch": 0.5921, + "grad_norm": 2.5055758953094482, + "learning_rate": 8.535716591563063e-06, + "loss": 1.4496, + "step": 11842 + }, + { + "epoch": 0.5922, + "grad_norm": 8.54660415649414, + "learning_rate": 8.532263647168735e-06, + "loss": 1.036, + "step": 11844 + }, + { + "epoch": 0.5923, + "grad_norm": 3.2478294372558594, + "learning_rate": 8.528810881613626e-06, + "loss": 0.7487, + "step": 11846 + }, + { + "epoch": 0.5924, + "grad_norm": 4.464147567749023, + "learning_rate": 8.525358295318454e-06, + "loss": 0.7201, + "step": 11848 + }, + { + "epoch": 0.5925, + "grad_norm": 2.544710874557495, + "learning_rate": 8.521905888703894e-06, + "loss": 0.9814, + "step": 11850 + }, + { + "epoch": 0.5926, + "grad_norm": 5.67169713973999, + "learning_rate": 8.518453662190622e-06, + "loss": 0.4952, + "step": 11852 + }, + { + "epoch": 0.5927, + "grad_norm": 3.861884593963623, + "learning_rate": 8.51500161619928e-06, + "loss": 1.0193, + "step": 11854 + }, + { + "epoch": 0.5928, + "grad_norm": 6.30306339263916, + "learning_rate": 8.511549751150478e-06, + "loss": 1.1322, + "step": 11856 + }, + { + "epoch": 0.5929, + "grad_norm": 3.214514970779419, + "learning_rate": 8.508098067464831e-06, + "loss": 0.5977, + "step": 11858 + }, + { + "epoch": 0.593, + "grad_norm": 3.4271228313446045, + "learning_rate": 8.504646565562907e-06, + "loss": 0.6942, + "step": 11860 + }, + { + "epoch": 0.5931, + "grad_norm": 1.547804594039917, + "learning_rate": 8.501195245865262e-06, + "loss": 1.0995, + "step": 11862 + }, + { + "epoch": 0.5932, + "grad_norm": 0.9324693083763123, + "learning_rate": 8.49774410879243e-06, + "loss": 0.1122, + "step": 11864 + }, + { + "epoch": 0.5933, + "grad_norm": 6.930073261260986, + "learning_rate": 8.494293154764924e-06, + "loss": 0.0909, + "step": 11866 + }, + { + "epoch": 0.5934, + "grad_norm": 2.830883741378784, + "learning_rate": 8.490842384203227e-06, + "loss": 0.8388, + "step": 11868 + }, + { + "epoch": 0.5935, + "grad_norm": 4.7271928787231445, + "learning_rate": 8.487391797527808e-06, + "loss": 1.2908, + "step": 11870 + }, + { + "epoch": 0.5936, + "grad_norm": 3.6774609088897705, + "learning_rate": 8.483941395159114e-06, + "loss": 0.9757, + "step": 11872 + }, + { + "epoch": 0.5937, + "grad_norm": 5.680489540100098, + "learning_rate": 8.480491177517557e-06, + "loss": 0.936, + "step": 11874 + }, + { + "epoch": 0.5938, + "grad_norm": 3.9615743160247803, + "learning_rate": 8.477041145023546e-06, + "loss": 1.4133, + "step": 11876 + }, + { + "epoch": 0.5939, + "grad_norm": 2.152733564376831, + "learning_rate": 8.473591298097447e-06, + "loss": 0.1259, + "step": 11878 + }, + { + "epoch": 0.594, + "grad_norm": 3.228583335876465, + "learning_rate": 8.47014163715962e-06, + "loss": 0.7349, + "step": 11880 + }, + { + "epoch": 0.5941, + "grad_norm": 4.390139102935791, + "learning_rate": 8.466692162630393e-06, + "loss": 0.4157, + "step": 11882 + }, + { + "epoch": 0.5942, + "grad_norm": 5.015795707702637, + "learning_rate": 8.46324287493008e-06, + "loss": 0.8598, + "step": 11884 + }, + { + "epoch": 0.5943, + "grad_norm": 5.815815448760986, + "learning_rate": 8.459793774478957e-06, + "loss": 1.2935, + "step": 11886 + }, + { + "epoch": 0.5944, + "grad_norm": 4.066465377807617, + "learning_rate": 8.45634486169729e-06, + "loss": 1.4041, + "step": 11888 + }, + { + "epoch": 0.5945, + "grad_norm": 14.979811668395996, + "learning_rate": 8.452896137005322e-06, + "loss": 1.8645, + "step": 11890 + }, + { + "epoch": 0.5946, + "grad_norm": 2.9546799659729004, + "learning_rate": 8.449447600823262e-06, + "loss": 0.6741, + "step": 11892 + }, + { + "epoch": 0.5947, + "grad_norm": 25.72138023376465, + "learning_rate": 8.445999253571316e-06, + "loss": 0.5889, + "step": 11894 + }, + { + "epoch": 0.5948, + "grad_norm": 2.062110185623169, + "learning_rate": 8.44255109566964e-06, + "loss": 0.5913, + "step": 11896 + }, + { + "epoch": 0.5949, + "grad_norm": 5.646230697631836, + "learning_rate": 8.439103127538391e-06, + "loss": 0.8073, + "step": 11898 + }, + { + "epoch": 0.595, + "grad_norm": 14.271876335144043, + "learning_rate": 8.43565534959769e-06, + "loss": 2.3986, + "step": 11900 + }, + { + "epoch": 0.5951, + "grad_norm": 2.818028450012207, + "learning_rate": 8.432207762267644e-06, + "loss": 0.9067, + "step": 11902 + }, + { + "epoch": 0.5952, + "grad_norm": 6.684349536895752, + "learning_rate": 8.428760365968327e-06, + "loss": 0.5654, + "step": 11904 + }, + { + "epoch": 0.5953, + "grad_norm": 2.7580392360687256, + "learning_rate": 8.425313161119788e-06, + "loss": 1.0537, + "step": 11906 + }, + { + "epoch": 0.5954, + "grad_norm": 0.7784945964813232, + "learning_rate": 8.421866148142066e-06, + "loss": 0.0584, + "step": 11908 + }, + { + "epoch": 0.5955, + "grad_norm": 6.454097270965576, + "learning_rate": 8.418419327455166e-06, + "loss": 0.4296, + "step": 11910 + }, + { + "epoch": 0.5956, + "grad_norm": 3.1473751068115234, + "learning_rate": 8.414972699479076e-06, + "loss": 1.6948, + "step": 11912 + }, + { + "epoch": 0.5957, + "grad_norm": 2.429743528366089, + "learning_rate": 8.41152626463375e-06, + "loss": 0.9615, + "step": 11914 + }, + { + "epoch": 0.5958, + "grad_norm": 2.5916390419006348, + "learning_rate": 8.408080023339134e-06, + "loss": 1.5149, + "step": 11916 + }, + { + "epoch": 0.5959, + "grad_norm": 3.8099961280822754, + "learning_rate": 8.404633976015136e-06, + "loss": 0.4027, + "step": 11918 + }, + { + "epoch": 0.596, + "grad_norm": 3.765334129333496, + "learning_rate": 8.401188123081653e-06, + "loss": 0.6899, + "step": 11920 + }, + { + "epoch": 0.5961, + "grad_norm": 5.870685577392578, + "learning_rate": 8.397742464958547e-06, + "loss": 0.9431, + "step": 11922 + }, + { + "epoch": 0.5962, + "grad_norm": 1.8744440078735352, + "learning_rate": 8.394297002065658e-06, + "loss": 0.836, + "step": 11924 + }, + { + "epoch": 0.5963, + "grad_norm": 5.235348224639893, + "learning_rate": 8.390851734822809e-06, + "loss": 1.277, + "step": 11926 + }, + { + "epoch": 0.5964, + "grad_norm": 2.6660425662994385, + "learning_rate": 8.387406663649796e-06, + "loss": 0.6398, + "step": 11928 + }, + { + "epoch": 0.5965, + "grad_norm": 3.3236587047576904, + "learning_rate": 8.38396178896639e-06, + "loss": 0.8088, + "step": 11930 + }, + { + "epoch": 0.5966, + "grad_norm": 7.4067559242248535, + "learning_rate": 8.380517111192336e-06, + "loss": 0.7602, + "step": 11932 + }, + { + "epoch": 0.5967, + "grad_norm": 2.675708055496216, + "learning_rate": 8.377072630747365e-06, + "loss": 0.838, + "step": 11934 + }, + { + "epoch": 0.5968, + "grad_norm": 3.0680744647979736, + "learning_rate": 8.373628348051165e-06, + "loss": 0.4221, + "step": 11936 + }, + { + "epoch": 0.5969, + "grad_norm": 7.000401973724365, + "learning_rate": 8.37018426352342e-06, + "loss": 1.5102, + "step": 11938 + }, + { + "epoch": 0.597, + "grad_norm": 8.296384811401367, + "learning_rate": 8.366740377583781e-06, + "loss": 1.1565, + "step": 11940 + }, + { + "epoch": 0.5971, + "grad_norm": 3.790158748626709, + "learning_rate": 8.363296690651869e-06, + "loss": 0.7493, + "step": 11942 + }, + { + "epoch": 0.5972, + "grad_norm": 2.651021957397461, + "learning_rate": 8.35985320314729e-06, + "loss": 0.1709, + "step": 11944 + }, + { + "epoch": 0.5973, + "grad_norm": 2.315916061401367, + "learning_rate": 8.356409915489625e-06, + "loss": 1.4457, + "step": 11946 + }, + { + "epoch": 0.5974, + "grad_norm": 3.930269479751587, + "learning_rate": 8.352966828098428e-06, + "loss": 1.1278, + "step": 11948 + }, + { + "epoch": 0.5975, + "grad_norm": 3.1518492698669434, + "learning_rate": 8.349523941393224e-06, + "loss": 1.1221, + "step": 11950 + }, + { + "epoch": 0.5976, + "grad_norm": 3.8961637020111084, + "learning_rate": 8.346081255793524e-06, + "loss": 0.8021, + "step": 11952 + }, + { + "epoch": 0.5977, + "grad_norm": 2.7412257194519043, + "learning_rate": 8.342638771718804e-06, + "loss": 1.2896, + "step": 11954 + }, + { + "epoch": 0.5978, + "grad_norm": 2.3707079887390137, + "learning_rate": 8.339196489588522e-06, + "loss": 0.9231, + "step": 11956 + }, + { + "epoch": 0.5979, + "grad_norm": 7.434700012207031, + "learning_rate": 8.335754409822114e-06, + "loss": 0.8476, + "step": 11958 + }, + { + "epoch": 0.598, + "grad_norm": 3.0871388912200928, + "learning_rate": 8.332312532838978e-06, + "loss": 1.1046, + "step": 11960 + }, + { + "epoch": 0.5981, + "grad_norm": 3.0792222023010254, + "learning_rate": 8.328870859058507e-06, + "loss": 1.0446, + "step": 11962 + }, + { + "epoch": 0.5982, + "grad_norm": 4.149078369140625, + "learning_rate": 8.325429388900046e-06, + "loss": 0.391, + "step": 11964 + }, + { + "epoch": 0.5983, + "grad_norm": 7.021173000335693, + "learning_rate": 8.321988122782938e-06, + "loss": 1.3714, + "step": 11966 + }, + { + "epoch": 0.5984, + "grad_norm": 4.063455581665039, + "learning_rate": 8.318547061126485e-06, + "loss": 0.7947, + "step": 11968 + }, + { + "epoch": 0.5985, + "grad_norm": 7.200834274291992, + "learning_rate": 8.315106204349976e-06, + "loss": 1.1563, + "step": 11970 + }, + { + "epoch": 0.5986, + "grad_norm": 2.701416015625, + "learning_rate": 8.311665552872662e-06, + "loss": 1.4837, + "step": 11972 + }, + { + "epoch": 0.5987, + "grad_norm": 7.271677017211914, + "learning_rate": 8.30822510711378e-06, + "loss": 1.2367, + "step": 11974 + }, + { + "epoch": 0.5988, + "grad_norm": 5.183056831359863, + "learning_rate": 8.30478486749254e-06, + "loss": 0.8081, + "step": 11976 + }, + { + "epoch": 0.5989, + "grad_norm": 6.955112934112549, + "learning_rate": 8.301344834428116e-06, + "loss": 1.1252, + "step": 11978 + }, + { + "epoch": 0.599, + "grad_norm": 3.5191030502319336, + "learning_rate": 8.297905008339677e-06, + "loss": 1.4111, + "step": 11980 + }, + { + "epoch": 0.5991, + "grad_norm": 11.717436790466309, + "learning_rate": 8.294465389646346e-06, + "loss": 1.5598, + "step": 11982 + }, + { + "epoch": 0.5992, + "grad_norm": 3.4673027992248535, + "learning_rate": 8.291025978767236e-06, + "loss": 0.8623, + "step": 11984 + }, + { + "epoch": 0.5993, + "grad_norm": 3.781233310699463, + "learning_rate": 8.287586776121424e-06, + "loss": 1.3752, + "step": 11986 + }, + { + "epoch": 0.5994, + "grad_norm": 2.3948609828948975, + "learning_rate": 8.284147782127971e-06, + "loss": 0.3999, + "step": 11988 + }, + { + "epoch": 0.5995, + "grad_norm": 0.15974372625350952, + "learning_rate": 8.280708997205904e-06, + "loss": 0.3017, + "step": 11990 + }, + { + "epoch": 0.5996, + "grad_norm": 4.643906116485596, + "learning_rate": 8.277270421774234e-06, + "loss": 0.741, + "step": 11992 + }, + { + "epoch": 0.5997, + "grad_norm": 10.208840370178223, + "learning_rate": 8.273832056251937e-06, + "loss": 1.0307, + "step": 11994 + }, + { + "epoch": 0.5998, + "grad_norm": 12.46338176727295, + "learning_rate": 8.270393901057964e-06, + "loss": 1.5432, + "step": 11996 + }, + { + "epoch": 0.5999, + "grad_norm": 4.864193916320801, + "learning_rate": 8.266955956611253e-06, + "loss": 0.8907, + "step": 11998 + }, + { + "epoch": 0.6, + "grad_norm": 6.626752853393555, + "learning_rate": 8.263518223330698e-06, + "loss": 1.0849, + "step": 12000 + }, + { + "epoch": 0.6001, + "grad_norm": 1.338853120803833, + "learning_rate": 8.26008070163518e-06, + "loss": 0.1502, + "step": 12002 + }, + { + "epoch": 0.6002, + "grad_norm": 5.453449249267578, + "learning_rate": 8.25664339194355e-06, + "loss": 1.3467, + "step": 12004 + }, + { + "epoch": 0.6003, + "grad_norm": 3.655322551727295, + "learning_rate": 8.25320629467464e-06, + "loss": 1.421, + "step": 12006 + }, + { + "epoch": 0.6004, + "grad_norm": 2.4323999881744385, + "learning_rate": 8.249769410247239e-06, + "loss": 0.8738, + "step": 12008 + }, + { + "epoch": 0.6005, + "grad_norm": 3.490825653076172, + "learning_rate": 8.246332739080131e-06, + "loss": 1.4592, + "step": 12010 + }, + { + "epoch": 0.6006, + "grad_norm": 3.8179094791412354, + "learning_rate": 8.242896281592057e-06, + "loss": 0.9074, + "step": 12012 + }, + { + "epoch": 0.6007, + "grad_norm": 8.426958084106445, + "learning_rate": 8.239460038201738e-06, + "loss": 1.199, + "step": 12014 + }, + { + "epoch": 0.6008, + "grad_norm": 0.37587738037109375, + "learning_rate": 8.236024009327879e-06, + "loss": 0.5891, + "step": 12016 + }, + { + "epoch": 0.6009, + "grad_norm": 3.4504058361053467, + "learning_rate": 8.232588195389139e-06, + "loss": 1.1843, + "step": 12018 + }, + { + "epoch": 0.601, + "grad_norm": 3.524294853210449, + "learning_rate": 8.22915259680417e-06, + "loss": 1.0307, + "step": 12020 + }, + { + "epoch": 0.6011, + "grad_norm": 3.698512315750122, + "learning_rate": 8.22571721399158e-06, + "loss": 1.5847, + "step": 12022 + }, + { + "epoch": 0.6012, + "grad_norm": 4.6063551902771, + "learning_rate": 8.222282047369972e-06, + "loss": 0.8534, + "step": 12024 + }, + { + "epoch": 0.6013, + "grad_norm": 5.086102485656738, + "learning_rate": 8.218847097357898e-06, + "loss": 1.2518, + "step": 12026 + }, + { + "epoch": 0.6014, + "grad_norm": 5.574816703796387, + "learning_rate": 8.215412364373908e-06, + "loss": 1.4315, + "step": 12028 + }, + { + "epoch": 0.6015, + "grad_norm": 4.36696720123291, + "learning_rate": 8.211977848836505e-06, + "loss": 0.8291, + "step": 12030 + }, + { + "epoch": 0.6016, + "grad_norm": 2.752659559249878, + "learning_rate": 8.208543551164178e-06, + "loss": 0.4372, + "step": 12032 + }, + { + "epoch": 0.6017, + "grad_norm": 3.0047714710235596, + "learning_rate": 8.205109471775388e-06, + "loss": 0.9075, + "step": 12034 + }, + { + "epoch": 0.6018, + "grad_norm": 3.842164993286133, + "learning_rate": 8.201675611088558e-06, + "loss": 0.4757, + "step": 12036 + }, + { + "epoch": 0.6019, + "grad_norm": 3.2640817165374756, + "learning_rate": 8.198241969522107e-06, + "loss": 1.2761, + "step": 12038 + }, + { + "epoch": 0.602, + "grad_norm": 3.138847589492798, + "learning_rate": 8.194808547494401e-06, + "loss": 1.0235, + "step": 12040 + }, + { + "epoch": 0.6021, + "grad_norm": 6.051507949829102, + "learning_rate": 8.1913753454238e-06, + "loss": 0.8625, + "step": 12042 + }, + { + "epoch": 0.6022, + "grad_norm": 2.0400807857513428, + "learning_rate": 8.187942363728626e-06, + "loss": 0.8551, + "step": 12044 + }, + { + "epoch": 0.6023, + "grad_norm": 3.283695936203003, + "learning_rate": 8.184509602827183e-06, + "loss": 0.9138, + "step": 12046 + }, + { + "epoch": 0.6024, + "grad_norm": 3.014892101287842, + "learning_rate": 8.181077063137733e-06, + "loss": 0.5254, + "step": 12048 + }, + { + "epoch": 0.6025, + "grad_norm": 13.82591438293457, + "learning_rate": 8.177644745078525e-06, + "loss": 1.1384, + "step": 12050 + }, + { + "epoch": 0.6026, + "grad_norm": 4.152532577514648, + "learning_rate": 8.174212649067781e-06, + "loss": 0.456, + "step": 12052 + }, + { + "epoch": 0.6027, + "grad_norm": 4.567274570465088, + "learning_rate": 8.170780775523685e-06, + "loss": 1.0633, + "step": 12054 + }, + { + "epoch": 0.6028, + "grad_norm": 4.023662090301514, + "learning_rate": 8.167349124864406e-06, + "loss": 0.9641, + "step": 12056 + }, + { + "epoch": 0.6029, + "grad_norm": 2.4454727172851562, + "learning_rate": 8.16391769750807e-06, + "loss": 0.8319, + "step": 12058 + }, + { + "epoch": 0.603, + "grad_norm": 14.108736038208008, + "learning_rate": 8.1604864938728e-06, + "loss": 1.3245, + "step": 12060 + }, + { + "epoch": 0.6031, + "grad_norm": 2.558208703994751, + "learning_rate": 8.157055514376667e-06, + "loss": 1.0007, + "step": 12062 + }, + { + "epoch": 0.6032, + "grad_norm": 6.104794502258301, + "learning_rate": 8.153624759437733e-06, + "loss": 0.9129, + "step": 12064 + }, + { + "epoch": 0.6033, + "grad_norm": 2.6093337535858154, + "learning_rate": 8.150194229474021e-06, + "loss": 0.7369, + "step": 12066 + }, + { + "epoch": 0.6034, + "grad_norm": 4.135458946228027, + "learning_rate": 8.146763924903527e-06, + "loss": 0.9499, + "step": 12068 + }, + { + "epoch": 0.6035, + "grad_norm": 0.5332887172698975, + "learning_rate": 8.143333846144231e-06, + "loss": 0.4887, + "step": 12070 + }, + { + "epoch": 0.6036, + "grad_norm": 2.820730209350586, + "learning_rate": 8.139903993614069e-06, + "loss": 0.7586, + "step": 12072 + }, + { + "epoch": 0.6037, + "grad_norm": 3.346647262573242, + "learning_rate": 8.13647436773097e-06, + "loss": 0.5553, + "step": 12074 + }, + { + "epoch": 0.6038, + "grad_norm": 8.848322868347168, + "learning_rate": 8.133044968912811e-06, + "loss": 2.1613, + "step": 12076 + }, + { + "epoch": 0.6039, + "grad_norm": 6.294447898864746, + "learning_rate": 8.129615797577462e-06, + "loss": 1.0773, + "step": 12078 + }, + { + "epoch": 0.604, + "grad_norm": 5.727674961090088, + "learning_rate": 8.126186854142752e-06, + "loss": 1.0765, + "step": 12080 + }, + { + "epoch": 0.6041, + "grad_norm": 2.5713770389556885, + "learning_rate": 8.122758139026495e-06, + "loss": 1.1243, + "step": 12082 + }, + { + "epoch": 0.6042, + "grad_norm": 4.631034851074219, + "learning_rate": 8.119329652646463e-06, + "loss": 0.781, + "step": 12084 + }, + { + "epoch": 0.6043, + "grad_norm": 3.3225293159484863, + "learning_rate": 8.115901395420407e-06, + "loss": 1.4365, + "step": 12086 + }, + { + "epoch": 0.6044, + "grad_norm": 3.1598551273345947, + "learning_rate": 8.112473367766051e-06, + "loss": 1.066, + "step": 12088 + }, + { + "epoch": 0.6045, + "grad_norm": 3.8542416095733643, + "learning_rate": 8.109045570101086e-06, + "loss": 1.2483, + "step": 12090 + }, + { + "epoch": 0.6046, + "grad_norm": 3.296107053756714, + "learning_rate": 8.10561800284319e-06, + "loss": 0.8755, + "step": 12092 + }, + { + "epoch": 0.6047, + "grad_norm": 3.3279643058776855, + "learning_rate": 8.102190666409988e-06, + "loss": 0.5472, + "step": 12094 + }, + { + "epoch": 0.6048, + "grad_norm": 2.180988311767578, + "learning_rate": 8.098763561219101e-06, + "loss": 0.9688, + "step": 12096 + }, + { + "epoch": 0.6049, + "grad_norm": 1.8771995306015015, + "learning_rate": 8.095336687688102e-06, + "loss": 0.8805, + "step": 12098 + }, + { + "epoch": 0.605, + "grad_norm": 5.79168176651001, + "learning_rate": 8.091910046234552e-06, + "loss": 1.3883, + "step": 12100 + }, + { + "epoch": 0.6051, + "grad_norm": 3.449120283126831, + "learning_rate": 8.088483637275979e-06, + "loss": 0.8952, + "step": 12102 + }, + { + "epoch": 0.6052, + "grad_norm": 4.20664119720459, + "learning_rate": 8.08505746122987e-06, + "loss": 3.3283, + "step": 12104 + }, + { + "epoch": 0.6053, + "grad_norm": 3.8467893600463867, + "learning_rate": 8.081631518513704e-06, + "loss": 0.899, + "step": 12106 + }, + { + "epoch": 0.6054, + "grad_norm": 5.488527297973633, + "learning_rate": 8.078205809544918e-06, + "loss": 1.2831, + "step": 12108 + }, + { + "epoch": 0.6055, + "grad_norm": 1.190455675125122, + "learning_rate": 8.074780334740929e-06, + "loss": 0.4304, + "step": 12110 + }, + { + "epoch": 0.6056, + "grad_norm": 3.2455074787139893, + "learning_rate": 8.07135509451911e-06, + "loss": 1.2819, + "step": 12112 + }, + { + "epoch": 0.6057, + "grad_norm": 3.006563901901245, + "learning_rate": 8.067930089296827e-06, + "loss": 0.3729, + "step": 12114 + }, + { + "epoch": 0.6058, + "grad_norm": 5.655300140380859, + "learning_rate": 8.064505319491398e-06, + "loss": 0.6512, + "step": 12116 + }, + { + "epoch": 0.6059, + "grad_norm": 1.809891700744629, + "learning_rate": 8.061080785520127e-06, + "loss": 0.6146, + "step": 12118 + }, + { + "epoch": 0.606, + "grad_norm": 3.2130632400512695, + "learning_rate": 8.057656487800283e-06, + "loss": 0.574, + "step": 12120 + }, + { + "epoch": 0.6061, + "grad_norm": 4.681155681610107, + "learning_rate": 8.0542324267491e-06, + "loss": 0.6857, + "step": 12122 + }, + { + "epoch": 0.6062, + "grad_norm": 4.176507949829102, + "learning_rate": 8.050808602783797e-06, + "loss": 1.063, + "step": 12124 + }, + { + "epoch": 0.6063, + "grad_norm": 2.8223488330841064, + "learning_rate": 8.047385016321552e-06, + "loss": 0.6007, + "step": 12126 + }, + { + "epoch": 0.6064, + "grad_norm": 2.9688899517059326, + "learning_rate": 8.04396166777952e-06, + "loss": 0.5496, + "step": 12128 + }, + { + "epoch": 0.6065, + "grad_norm": 5.3644232749938965, + "learning_rate": 8.040538557574822e-06, + "loss": 1.0485, + "step": 12130 + }, + { + "epoch": 0.6066, + "grad_norm": 5.750142574310303, + "learning_rate": 8.037115686124564e-06, + "loss": 0.6692, + "step": 12132 + }, + { + "epoch": 0.6067, + "grad_norm": 2.77099347114563, + "learning_rate": 8.033693053845801e-06, + "loss": 0.7518, + "step": 12134 + }, + { + "epoch": 0.6068, + "grad_norm": 14.01702880859375, + "learning_rate": 8.030270661155575e-06, + "loss": 1.1579, + "step": 12136 + }, + { + "epoch": 0.6069, + "grad_norm": 7.196412563323975, + "learning_rate": 8.026848508470897e-06, + "loss": 0.6221, + "step": 12138 + }, + { + "epoch": 0.607, + "grad_norm": 5.3092570304870605, + "learning_rate": 8.023426596208739e-06, + "loss": 0.7649, + "step": 12140 + }, + { + "epoch": 0.6071, + "grad_norm": 6.171720504760742, + "learning_rate": 8.02000492478606e-06, + "loss": 0.7321, + "step": 12142 + }, + { + "epoch": 0.6072, + "grad_norm": 7.763965606689453, + "learning_rate": 8.016583494619769e-06, + "loss": 0.6109, + "step": 12144 + }, + { + "epoch": 0.6073, + "grad_norm": 2.100151300430298, + "learning_rate": 8.013162306126766e-06, + "loss": 0.7172, + "step": 12146 + }, + { + "epoch": 0.6074, + "grad_norm": 5.2324676513671875, + "learning_rate": 8.009741359723906e-06, + "loss": 0.7191, + "step": 12148 + }, + { + "epoch": 0.6075, + "grad_norm": 10.079107284545898, + "learning_rate": 8.00632065582803e-06, + "loss": 0.4009, + "step": 12150 + }, + { + "epoch": 0.6076, + "grad_norm": 2.5057268142700195, + "learning_rate": 8.00290019485593e-06, + "loss": 0.7799, + "step": 12152 + }, + { + "epoch": 0.6077, + "grad_norm": 7.577979564666748, + "learning_rate": 7.999479977224384e-06, + "loss": 0.577, + "step": 12154 + }, + { + "epoch": 0.6078, + "grad_norm": 4.551283359527588, + "learning_rate": 7.996060003350139e-06, + "loss": 1.2211, + "step": 12156 + }, + { + "epoch": 0.6079, + "grad_norm": 7.638350963592529, + "learning_rate": 7.992640273649899e-06, + "loss": 0.4791, + "step": 12158 + }, + { + "epoch": 0.608, + "grad_norm": 3.2762184143066406, + "learning_rate": 7.989220788540356e-06, + "loss": 0.4144, + "step": 12160 + }, + { + "epoch": 0.6081, + "grad_norm": 0.9908294081687927, + "learning_rate": 7.985801548438157e-06, + "loss": 0.5735, + "step": 12162 + }, + { + "epoch": 0.6082, + "grad_norm": 4.841744899749756, + "learning_rate": 7.982382553759931e-06, + "loss": 1.0371, + "step": 12164 + }, + { + "epoch": 0.6083, + "grad_norm": 3.5774013996124268, + "learning_rate": 7.97896380492227e-06, + "loss": 1.4154, + "step": 12166 + }, + { + "epoch": 0.6084, + "grad_norm": 5.057089805603027, + "learning_rate": 7.975545302341743e-06, + "loss": 1.0294, + "step": 12168 + }, + { + "epoch": 0.6085, + "grad_norm": 6.1115946769714355, + "learning_rate": 7.972127046434878e-06, + "loss": 0.9465, + "step": 12170 + }, + { + "epoch": 0.6086, + "grad_norm": 6.377162456512451, + "learning_rate": 7.96870903761818e-06, + "loss": 1.1175, + "step": 12172 + }, + { + "epoch": 0.6087, + "grad_norm": 3.1544415950775146, + "learning_rate": 7.965291276308124e-06, + "loss": 0.5338, + "step": 12174 + }, + { + "epoch": 0.6088, + "grad_norm": 1.5071172714233398, + "learning_rate": 7.961873762921153e-06, + "loss": 0.7007, + "step": 12176 + }, + { + "epoch": 0.6089, + "grad_norm": 2.595285654067993, + "learning_rate": 7.958456497873686e-06, + "loss": 1.0091, + "step": 12178 + }, + { + "epoch": 0.609, + "grad_norm": 8.436539649963379, + "learning_rate": 7.955039481582098e-06, + "loss": 0.9932, + "step": 12180 + }, + { + "epoch": 0.6091, + "grad_norm": 5.220383644104004, + "learning_rate": 7.951622714462747e-06, + "loss": 0.348, + "step": 12182 + }, + { + "epoch": 0.6092, + "grad_norm": 10.013254165649414, + "learning_rate": 7.948206196931953e-06, + "loss": 1.4523, + "step": 12184 + }, + { + "epoch": 0.6093, + "grad_norm": 4.177830696105957, + "learning_rate": 7.944789929406016e-06, + "loss": 0.5533, + "step": 12186 + }, + { + "epoch": 0.6094, + "grad_norm": 8.550326347351074, + "learning_rate": 7.94137391230119e-06, + "loss": 1.1455, + "step": 12188 + }, + { + "epoch": 0.6095, + "grad_norm": 5.433356285095215, + "learning_rate": 7.937958146033706e-06, + "loss": 1.0599, + "step": 12190 + }, + { + "epoch": 0.6096, + "grad_norm": 8.136361122131348, + "learning_rate": 7.934542631019767e-06, + "loss": 1.2551, + "step": 12192 + }, + { + "epoch": 0.6097, + "grad_norm": 9.650341033935547, + "learning_rate": 7.931127367675544e-06, + "loss": 1.0381, + "step": 12194 + }, + { + "epoch": 0.6098, + "grad_norm": 3.3836758136749268, + "learning_rate": 7.927712356417176e-06, + "loss": 0.3925, + "step": 12196 + }, + { + "epoch": 0.6099, + "grad_norm": 3.691415309906006, + "learning_rate": 7.92429759766077e-06, + "loss": 1.1173, + "step": 12198 + }, + { + "epoch": 0.61, + "grad_norm": 3.6528923511505127, + "learning_rate": 7.92088309182241e-06, + "loss": 1.5846, + "step": 12200 + }, + { + "epoch": 0.6101, + "grad_norm": 9.732353210449219, + "learning_rate": 7.917468839318133e-06, + "loss": 0.6458, + "step": 12202 + }, + { + "epoch": 0.6102, + "grad_norm": 1.8584487438201904, + "learning_rate": 7.914054840563962e-06, + "loss": 0.7574, + "step": 12204 + }, + { + "epoch": 0.6103, + "grad_norm": 2.8907854557037354, + "learning_rate": 7.910641095975886e-06, + "loss": 1.1161, + "step": 12206 + }, + { + "epoch": 0.6104, + "grad_norm": 3.6177561283111572, + "learning_rate": 7.907227605969849e-06, + "loss": 1.1509, + "step": 12208 + }, + { + "epoch": 0.6105, + "grad_norm": 3.165443181991577, + "learning_rate": 7.903814370961785e-06, + "loss": 0.1899, + "step": 12210 + }, + { + "epoch": 0.6106, + "grad_norm": 1.8335907459259033, + "learning_rate": 7.900401391367576e-06, + "loss": 1.1138, + "step": 12212 + }, + { + "epoch": 0.6107, + "grad_norm": 4.04714822769165, + "learning_rate": 7.896988667603093e-06, + "loss": 0.3789, + "step": 12214 + }, + { + "epoch": 0.6108, + "grad_norm": 7.147403717041016, + "learning_rate": 7.89357620008416e-06, + "loss": 0.596, + "step": 12216 + }, + { + "epoch": 0.6109, + "grad_norm": 2.585162878036499, + "learning_rate": 7.89016398922658e-06, + "loss": 0.9045, + "step": 12218 + }, + { + "epoch": 0.611, + "grad_norm": 9.274861335754395, + "learning_rate": 7.886752035446116e-06, + "loss": 1.0134, + "step": 12220 + }, + { + "epoch": 0.6111, + "grad_norm": 3.7130846977233887, + "learning_rate": 7.883340339158505e-06, + "loss": 0.5117, + "step": 12222 + }, + { + "epoch": 0.6112, + "grad_norm": 4.984927177429199, + "learning_rate": 7.879928900779457e-06, + "loss": 0.6656, + "step": 12224 + }, + { + "epoch": 0.6113, + "grad_norm": 2.2829477787017822, + "learning_rate": 7.876517720724637e-06, + "loss": 0.8052, + "step": 12226 + }, + { + "epoch": 0.6114, + "grad_norm": 2.576571226119995, + "learning_rate": 7.873106799409696e-06, + "loss": 0.248, + "step": 12228 + }, + { + "epoch": 0.6115, + "grad_norm": 0.154028981924057, + "learning_rate": 7.869696137250235e-06, + "loss": 0.9079, + "step": 12230 + }, + { + "epoch": 0.6116, + "grad_norm": 4.386102676391602, + "learning_rate": 7.866285734661842e-06, + "loss": 0.295, + "step": 12232 + }, + { + "epoch": 0.6117, + "grad_norm": 2.3578145503997803, + "learning_rate": 7.862875592060056e-06, + "loss": 0.8724, + "step": 12234 + }, + { + "epoch": 0.6118, + "grad_norm": 4.532215595245361, + "learning_rate": 7.8594657098604e-06, + "loss": 1.1166, + "step": 12236 + }, + { + "epoch": 0.6119, + "grad_norm": 3.4569079875946045, + "learning_rate": 7.856056088478352e-06, + "loss": 0.6399, + "step": 12238 + }, + { + "epoch": 0.612, + "grad_norm": 2.343160629272461, + "learning_rate": 7.852646728329368e-06, + "loss": 0.7737, + "step": 12240 + }, + { + "epoch": 0.6121, + "grad_norm": 2.821321964263916, + "learning_rate": 7.84923762982887e-06, + "loss": 1.1379, + "step": 12242 + }, + { + "epoch": 0.6122, + "grad_norm": 6.806931972503662, + "learning_rate": 7.845828793392236e-06, + "loss": 0.6823, + "step": 12244 + }, + { + "epoch": 0.6123, + "grad_norm": 2.0326919555664062, + "learning_rate": 7.842420219434835e-06, + "loss": 0.6544, + "step": 12246 + }, + { + "epoch": 0.6124, + "grad_norm": 3.144611358642578, + "learning_rate": 7.83901190837198e-06, + "loss": 0.414, + "step": 12248 + }, + { + "epoch": 0.6125, + "grad_norm": 1.9092211723327637, + "learning_rate": 7.835603860618973e-06, + "loss": 0.415, + "step": 12250 + }, + { + "epoch": 0.6126, + "grad_norm": 3.875638961791992, + "learning_rate": 7.832196076591067e-06, + "loss": 0.5537, + "step": 12252 + }, + { + "epoch": 0.6127, + "grad_norm": 2.76585054397583, + "learning_rate": 7.828788556703498e-06, + "loss": 2.4842, + "step": 12254 + }, + { + "epoch": 0.6128, + "grad_norm": 3.343291997909546, + "learning_rate": 7.825381301371452e-06, + "loss": 1.2558, + "step": 12256 + }, + { + "epoch": 0.6129, + "grad_norm": 3.734835147857666, + "learning_rate": 7.821974311010103e-06, + "loss": 0.8601, + "step": 12258 + }, + { + "epoch": 0.613, + "grad_norm": 11.662543296813965, + "learning_rate": 7.818567586034578e-06, + "loss": 0.7929, + "step": 12260 + }, + { + "epoch": 0.6131, + "grad_norm": 6.242192268371582, + "learning_rate": 7.81516112685997e-06, + "loss": 1.3172, + "step": 12262 + }, + { + "epoch": 0.6132, + "grad_norm": 2.6422715187072754, + "learning_rate": 7.811754933901358e-06, + "loss": 0.723, + "step": 12264 + }, + { + "epoch": 0.6133, + "grad_norm": 4.842607021331787, + "learning_rate": 7.808349007573764e-06, + "loss": 0.8375, + "step": 12266 + }, + { + "epoch": 0.6134, + "grad_norm": 6.858392715454102, + "learning_rate": 7.804943348292197e-06, + "loss": 0.9361, + "step": 12268 + }, + { + "epoch": 0.6135, + "grad_norm": 2.2162466049194336, + "learning_rate": 7.801537956471624e-06, + "loss": 0.3554, + "step": 12270 + }, + { + "epoch": 0.6136, + "grad_norm": 14.241080284118652, + "learning_rate": 7.798132832526986e-06, + "loss": 1.3544, + "step": 12272 + }, + { + "epoch": 0.6137, + "grad_norm": 6.408077716827393, + "learning_rate": 7.79472797687318e-06, + "loss": 1.0578, + "step": 12274 + }, + { + "epoch": 0.6138, + "grad_norm": 2.1457266807556152, + "learning_rate": 7.791323389925084e-06, + "loss": 1.263, + "step": 12276 + }, + { + "epoch": 0.6139, + "grad_norm": 3.4059274196624756, + "learning_rate": 7.787919072097531e-06, + "loss": 1.458, + "step": 12278 + }, + { + "epoch": 0.614, + "grad_norm": 2.532041549682617, + "learning_rate": 7.784515023805328e-06, + "loss": 1.4341, + "step": 12280 + }, + { + "epoch": 0.6141, + "grad_norm": 8.309602737426758, + "learning_rate": 7.781111245463252e-06, + "loss": 1.0126, + "step": 12282 + }, + { + "epoch": 0.6142, + "grad_norm": 6.154748916625977, + "learning_rate": 7.777707737486036e-06, + "loss": 0.8937, + "step": 12284 + }, + { + "epoch": 0.6143, + "grad_norm": 3.407684087753296, + "learning_rate": 7.774304500288394e-06, + "loss": 1.4319, + "step": 12286 + }, + { + "epoch": 0.6144, + "grad_norm": 4.466482162475586, + "learning_rate": 7.770901534284996e-06, + "loss": 1.4332, + "step": 12288 + }, + { + "epoch": 0.6145, + "grad_norm": 11.781411170959473, + "learning_rate": 7.767498839890489e-06, + "loss": 1.0547, + "step": 12290 + }, + { + "epoch": 0.6146, + "grad_norm": 7.331947326660156, + "learning_rate": 7.76409641751947e-06, + "loss": 0.9203, + "step": 12292 + }, + { + "epoch": 0.6147, + "grad_norm": 5.770819664001465, + "learning_rate": 7.760694267586526e-06, + "loss": 1.2925, + "step": 12294 + }, + { + "epoch": 0.6148, + "grad_norm": 2.702004909515381, + "learning_rate": 7.757292390506191e-06, + "loss": 1.0079, + "step": 12296 + }, + { + "epoch": 0.6149, + "grad_norm": 3.8523828983306885, + "learning_rate": 7.753890786692973e-06, + "loss": 1.3361, + "step": 12298 + }, + { + "epoch": 0.615, + "grad_norm": 5.1811323165893555, + "learning_rate": 7.750489456561351e-06, + "loss": 0.5861, + "step": 12300 + }, + { + "epoch": 0.6151, + "grad_norm": 4.949836730957031, + "learning_rate": 7.747088400525765e-06, + "loss": 1.1045, + "step": 12302 + }, + { + "epoch": 0.6152, + "grad_norm": 4.062037467956543, + "learning_rate": 7.743687619000625e-06, + "loss": 0.6868, + "step": 12304 + }, + { + "epoch": 0.6153, + "grad_norm": 3.6964499950408936, + "learning_rate": 7.740287112400304e-06, + "loss": 0.9684, + "step": 12306 + }, + { + "epoch": 0.6154, + "grad_norm": 4.534641265869141, + "learning_rate": 7.736886881139143e-06, + "loss": 0.687, + "step": 12308 + }, + { + "epoch": 0.6155, + "grad_norm": 8.681703567504883, + "learning_rate": 7.733486925631448e-06, + "loss": 1.0194, + "step": 12310 + }, + { + "epoch": 0.6156, + "grad_norm": 5.067459583282471, + "learning_rate": 7.730087246291503e-06, + "loss": 1.8688, + "step": 12312 + }, + { + "epoch": 0.6157, + "grad_norm": 3.1601781845092773, + "learning_rate": 7.726687843533539e-06, + "loss": 0.5281, + "step": 12314 + }, + { + "epoch": 0.6158, + "grad_norm": 5.4743757247924805, + "learning_rate": 7.72328871777176e-06, + "loss": 0.853, + "step": 12316 + }, + { + "epoch": 0.6159, + "grad_norm": 4.353026390075684, + "learning_rate": 7.719889869420354e-06, + "loss": 0.9536, + "step": 12318 + }, + { + "epoch": 0.616, + "grad_norm": 3.0974032878875732, + "learning_rate": 7.716491298893443e-06, + "loss": 0.6164, + "step": 12320 + }, + { + "epoch": 0.6161, + "grad_norm": 1.7172235250473022, + "learning_rate": 7.713093006605146e-06, + "loss": 0.8246, + "step": 12322 + }, + { + "epoch": 0.6162, + "grad_norm": 3.4507639408111572, + "learning_rate": 7.709694992969525e-06, + "loss": 0.927, + "step": 12324 + }, + { + "epoch": 0.6163, + "grad_norm": 4.047074794769287, + "learning_rate": 7.706297258400624e-06, + "loss": 1.2838, + "step": 12326 + }, + { + "epoch": 0.6164, + "grad_norm": 4.8578057289123535, + "learning_rate": 7.702899803312443e-06, + "loss": 0.5539, + "step": 12328 + }, + { + "epoch": 0.6165, + "grad_norm": 5.630261421203613, + "learning_rate": 7.699502628118958e-06, + "loss": 0.703, + "step": 12330 + }, + { + "epoch": 0.6166, + "grad_norm": 10.796696662902832, + "learning_rate": 7.696105733234099e-06, + "loss": 1.0953, + "step": 12332 + }, + { + "epoch": 0.6167, + "grad_norm": 3.013516426086426, + "learning_rate": 7.692709119071761e-06, + "loss": 1.2171, + "step": 12334 + }, + { + "epoch": 0.6168, + "grad_norm": 6.076953887939453, + "learning_rate": 7.689312786045823e-06, + "loss": 0.867, + "step": 12336 + }, + { + "epoch": 0.6169, + "grad_norm": 3.6541147232055664, + "learning_rate": 7.685916734570112e-06, + "loss": 0.2901, + "step": 12338 + }, + { + "epoch": 0.617, + "grad_norm": 1.4883407354354858, + "learning_rate": 7.68252096505843e-06, + "loss": 0.2891, + "step": 12340 + }, + { + "epoch": 0.6171, + "grad_norm": 6.390429496765137, + "learning_rate": 7.679125477924535e-06, + "loss": 0.7651, + "step": 12342 + }, + { + "epoch": 0.6172, + "grad_norm": 7.2038445472717285, + "learning_rate": 7.67573027358216e-06, + "loss": 0.6303, + "step": 12344 + }, + { + "epoch": 0.6173, + "grad_norm": 1.6024566888809204, + "learning_rate": 7.672335352445002e-06, + "loss": 0.7762, + "step": 12346 + }, + { + "epoch": 0.6174, + "grad_norm": 0.9769496321678162, + "learning_rate": 7.668940714926724e-06, + "loss": 0.8776, + "step": 12348 + }, + { + "epoch": 0.6175, + "grad_norm": 6.563798904418945, + "learning_rate": 7.66554636144095e-06, + "loss": 1.0591, + "step": 12350 + }, + { + "epoch": 0.6176, + "grad_norm": 5.523858070373535, + "learning_rate": 7.662152292401265e-06, + "loss": 1.1476, + "step": 12352 + }, + { + "epoch": 0.6177, + "grad_norm": 4.639860153198242, + "learning_rate": 7.658758508221234e-06, + "loss": 2.4519, + "step": 12354 + }, + { + "epoch": 0.6178, + "grad_norm": 6.371888160705566, + "learning_rate": 7.655365009314375e-06, + "loss": 1.59, + "step": 12356 + }, + { + "epoch": 0.6179, + "grad_norm": 4.401912212371826, + "learning_rate": 7.651971796094183e-06, + "loss": 0.6115, + "step": 12358 + }, + { + "epoch": 0.618, + "grad_norm": 20.820724487304688, + "learning_rate": 7.6485788689741e-06, + "loss": 1.0409, + "step": 12360 + }, + { + "epoch": 0.6181, + "grad_norm": 5.715073108673096, + "learning_rate": 7.645186228367554e-06, + "loss": 0.655, + "step": 12362 + }, + { + "epoch": 0.6182, + "grad_norm": 4.639069080352783, + "learning_rate": 7.641793874687918e-06, + "loss": 0.7668, + "step": 12364 + }, + { + "epoch": 0.6183, + "grad_norm": 8.44040298461914, + "learning_rate": 7.638401808348548e-06, + "loss": 1.8716, + "step": 12366 + }, + { + "epoch": 0.6184, + "grad_norm": 5.933027744293213, + "learning_rate": 7.635010029762755e-06, + "loss": 1.6355, + "step": 12368 + }, + { + "epoch": 0.6185, + "grad_norm": 5.601518630981445, + "learning_rate": 7.631618539343815e-06, + "loss": 1.4321, + "step": 12370 + }, + { + "epoch": 0.6186, + "grad_norm": 0.09498314559459686, + "learning_rate": 7.628227337504972e-06, + "loss": 0.7499, + "step": 12372 + }, + { + "epoch": 0.6187, + "grad_norm": 6.43130350112915, + "learning_rate": 7.624836424659431e-06, + "loss": 0.9096, + "step": 12374 + }, + { + "epoch": 0.6188, + "grad_norm": 2.23078989982605, + "learning_rate": 7.621445801220372e-06, + "loss": 0.3169, + "step": 12376 + }, + { + "epoch": 0.6189, + "grad_norm": 5.399369239807129, + "learning_rate": 7.618055467600922e-06, + "loss": 1.7288, + "step": 12378 + }, + { + "epoch": 0.619, + "grad_norm": 2.2676377296447754, + "learning_rate": 7.6146654242141935e-06, + "loss": 1.1956, + "step": 12380 + }, + { + "epoch": 0.6191, + "grad_norm": 4.801334381103516, + "learning_rate": 7.611275671473245e-06, + "loss": 1.2583, + "step": 12382 + }, + { + "epoch": 0.6192, + "grad_norm": 3.6214609146118164, + "learning_rate": 7.6078862097911075e-06, + "loss": 0.4522, + "step": 12384 + }, + { + "epoch": 0.6193, + "grad_norm": 8.764137268066406, + "learning_rate": 7.604497039580785e-06, + "loss": 1.0853, + "step": 12386 + }, + { + "epoch": 0.6194, + "grad_norm": 3.8812756538391113, + "learning_rate": 7.6011081612552265e-06, + "loss": 1.113, + "step": 12388 + }, + { + "epoch": 0.6195, + "grad_norm": 3.0269298553466797, + "learning_rate": 7.597719575227364e-06, + "loss": 0.6585, + "step": 12390 + }, + { + "epoch": 0.6196, + "grad_norm": 2.794381618499756, + "learning_rate": 7.594331281910082e-06, + "loss": 0.8016, + "step": 12392 + }, + { + "epoch": 0.6197, + "grad_norm": 12.128643035888672, + "learning_rate": 7.590943281716241e-06, + "loss": 0.4938, + "step": 12394 + }, + { + "epoch": 0.6198, + "grad_norm": 6.749531269073486, + "learning_rate": 7.58755557505865e-06, + "loss": 0.5912, + "step": 12396 + }, + { + "epoch": 0.6199, + "grad_norm": 10.733187675476074, + "learning_rate": 7.584168162350097e-06, + "loss": 0.6658, + "step": 12398 + }, + { + "epoch": 0.62, + "grad_norm": 7.096673488616943, + "learning_rate": 7.580781044003324e-06, + "loss": 1.3569, + "step": 12400 + }, + { + "epoch": 0.6201, + "grad_norm": 2.4775044918060303, + "learning_rate": 7.577394220431041e-06, + "loss": 0.6454, + "step": 12402 + }, + { + "epoch": 0.6202, + "grad_norm": 3.5417282581329346, + "learning_rate": 7.574007692045928e-06, + "loss": 0.8818, + "step": 12404 + }, + { + "epoch": 0.6203, + "grad_norm": 3.4617929458618164, + "learning_rate": 7.570621459260614e-06, + "loss": 0.6894, + "step": 12406 + }, + { + "epoch": 0.6204, + "grad_norm": 1.8437968492507935, + "learning_rate": 7.5672355224877115e-06, + "loss": 0.7378, + "step": 12408 + }, + { + "epoch": 0.6205, + "grad_norm": 4.087181091308594, + "learning_rate": 7.5638498821397755e-06, + "loss": 0.9462, + "step": 12410 + }, + { + "epoch": 0.6206, + "grad_norm": 3.378187417984009, + "learning_rate": 7.560464538629345e-06, + "loss": 0.5092, + "step": 12412 + }, + { + "epoch": 0.6207, + "grad_norm": 6.521583557128906, + "learning_rate": 7.557079492368908e-06, + "loss": 1.3873, + "step": 12414 + }, + { + "epoch": 0.6208, + "grad_norm": 3.2202625274658203, + "learning_rate": 7.553694743770928e-06, + "loss": 0.6848, + "step": 12416 + }, + { + "epoch": 0.6209, + "grad_norm": 2.948021173477173, + "learning_rate": 7.550310293247823e-06, + "loss": 0.6745, + "step": 12418 + }, + { + "epoch": 0.621, + "grad_norm": 1.6944754123687744, + "learning_rate": 7.546926141211975e-06, + "loss": 0.7646, + "step": 12420 + }, + { + "epoch": 0.6211, + "grad_norm": 2.6145637035369873, + "learning_rate": 7.543542288075739e-06, + "loss": 0.3051, + "step": 12422 + }, + { + "epoch": 0.6212, + "grad_norm": 2.431088447570801, + "learning_rate": 7.54015873425142e-06, + "loss": 0.1745, + "step": 12424 + }, + { + "epoch": 0.6213, + "grad_norm": 6.933919906616211, + "learning_rate": 7.5367754801513025e-06, + "loss": 0.684, + "step": 12426 + }, + { + "epoch": 0.6214, + "grad_norm": 4.8099493980407715, + "learning_rate": 7.533392526187617e-06, + "loss": 0.531, + "step": 12428 + }, + { + "epoch": 0.6215, + "grad_norm": 1.7270604372024536, + "learning_rate": 7.530009872772572e-06, + "loss": 1.1195, + "step": 12430 + }, + { + "epoch": 0.6216, + "grad_norm": 2.414217472076416, + "learning_rate": 7.526627520318329e-06, + "loss": 0.464, + "step": 12432 + }, + { + "epoch": 0.6217, + "grad_norm": 5.34510612487793, + "learning_rate": 7.523245469237026e-06, + "loss": 0.7062, + "step": 12434 + }, + { + "epoch": 0.6218, + "grad_norm": 6.264241695404053, + "learning_rate": 7.519863719940748e-06, + "loss": 1.8944, + "step": 12436 + }, + { + "epoch": 0.6219, + "grad_norm": 4.24784517288208, + "learning_rate": 7.51648227284155e-06, + "loss": 0.796, + "step": 12438 + }, + { + "epoch": 0.622, + "grad_norm": 2.055544376373291, + "learning_rate": 7.513101128351454e-06, + "loss": 0.4809, + "step": 12440 + }, + { + "epoch": 0.6221, + "grad_norm": 2.727168083190918, + "learning_rate": 7.50972028688244e-06, + "loss": 1.0469, + "step": 12442 + }, + { + "epoch": 0.6222, + "grad_norm": 5.034714698791504, + "learning_rate": 7.506339748846461e-06, + "loss": 0.8233, + "step": 12444 + }, + { + "epoch": 0.6223, + "grad_norm": 5.084179878234863, + "learning_rate": 7.502959514655415e-06, + "loss": 1.2909, + "step": 12446 + }, + { + "epoch": 0.6224, + "grad_norm": 3.465461015701294, + "learning_rate": 7.49957958472118e-06, + "loss": 0.6255, + "step": 12448 + }, + { + "epoch": 0.6225, + "grad_norm": 4.545620918273926, + "learning_rate": 7.496199959455584e-06, + "loss": 0.4915, + "step": 12450 + }, + { + "epoch": 0.6226, + "grad_norm": 5.832085132598877, + "learning_rate": 7.492820639270435e-06, + "loss": 1.1724, + "step": 12452 + }, + { + "epoch": 0.6227, + "grad_norm": 8.513162612915039, + "learning_rate": 7.489441624577485e-06, + "loss": 0.7145, + "step": 12454 + }, + { + "epoch": 0.6228, + "grad_norm": 11.235247611999512, + "learning_rate": 7.486062915788453e-06, + "loss": 1.3972, + "step": 12456 + }, + { + "epoch": 0.6229, + "grad_norm": 4.074518203735352, + "learning_rate": 7.482684513315031e-06, + "loss": 1.2279, + "step": 12458 + }, + { + "epoch": 0.623, + "grad_norm": 15.801878929138184, + "learning_rate": 7.4793064175688635e-06, + "loss": 0.912, + "step": 12460 + }, + { + "epoch": 0.6231, + "grad_norm": 5.532405376434326, + "learning_rate": 7.475928628961567e-06, + "loss": 0.5956, + "step": 12462 + }, + { + "epoch": 0.6232, + "grad_norm": 3.9720709323883057, + "learning_rate": 7.472551147904708e-06, + "loss": 1.3218, + "step": 12464 + }, + { + "epoch": 0.6233, + "grad_norm": 6.542662143707275, + "learning_rate": 7.469173974809827e-06, + "loss": 0.8868, + "step": 12466 + }, + { + "epoch": 0.6234, + "grad_norm": 2.0901939868927, + "learning_rate": 7.465797110088417e-06, + "loss": 1.0935, + "step": 12468 + }, + { + "epoch": 0.6235, + "grad_norm": 2.9957542419433594, + "learning_rate": 7.462420554151945e-06, + "loss": 1.0723, + "step": 12470 + }, + { + "epoch": 0.6236, + "grad_norm": 4.398687362670898, + "learning_rate": 7.4590443074118325e-06, + "loss": 0.814, + "step": 12472 + }, + { + "epoch": 0.6237, + "grad_norm": 10.912601470947266, + "learning_rate": 7.45566837027946e-06, + "loss": 1.2322, + "step": 12474 + }, + { + "epoch": 0.6238, + "grad_norm": 5.925649166107178, + "learning_rate": 7.4522927431661805e-06, + "loss": 1.0459, + "step": 12476 + }, + { + "epoch": 0.6239, + "grad_norm": 4.730182647705078, + "learning_rate": 7.4489174264832995e-06, + "loss": 0.7991, + "step": 12478 + }, + { + "epoch": 0.624, + "grad_norm": 4.521225452423096, + "learning_rate": 7.445542420642097e-06, + "loss": 1.2231, + "step": 12480 + }, + { + "epoch": 0.6241, + "grad_norm": 10.423347473144531, + "learning_rate": 7.442167726053797e-06, + "loss": 0.9046, + "step": 12482 + }, + { + "epoch": 0.6242, + "grad_norm": 3.5701792240142822, + "learning_rate": 7.438793343129605e-06, + "loss": 0.9409, + "step": 12484 + }, + { + "epoch": 0.6243, + "grad_norm": 4.8600850105285645, + "learning_rate": 7.4354192722806724e-06, + "loss": 0.9434, + "step": 12486 + }, + { + "epoch": 0.6244, + "grad_norm": 3.06622052192688, + "learning_rate": 7.432045513918122e-06, + "loss": 0.8027, + "step": 12488 + }, + { + "epoch": 0.6245, + "grad_norm": 5.774558067321777, + "learning_rate": 7.428672068453041e-06, + "loss": 1.6479, + "step": 12490 + }, + { + "epoch": 0.6246, + "grad_norm": 3.9960408210754395, + "learning_rate": 7.4252989362964635e-06, + "loss": 0.9969, + "step": 12492 + }, + { + "epoch": 0.6247, + "grad_norm": 3.1431024074554443, + "learning_rate": 7.421926117859403e-06, + "loss": 0.6793, + "step": 12494 + }, + { + "epoch": 0.6248, + "grad_norm": 2.942859172821045, + "learning_rate": 7.418553613552824e-06, + "loss": 0.4273, + "step": 12496 + }, + { + "epoch": 0.6249, + "grad_norm": 1.602480411529541, + "learning_rate": 7.415181423787658e-06, + "loss": 0.8401, + "step": 12498 + }, + { + "epoch": 0.625, + "grad_norm": 4.444993019104004, + "learning_rate": 7.411809548974792e-06, + "loss": 0.8429, + "step": 12500 + }, + { + "epoch": 0.6251, + "grad_norm": 2.952838659286499, + "learning_rate": 7.408437989525086e-06, + "loss": 1.0572, + "step": 12502 + }, + { + "epoch": 0.6252, + "grad_norm": 0.08073297888040543, + "learning_rate": 7.405066745849347e-06, + "loss": 0.3287, + "step": 12504 + }, + { + "epoch": 0.6253, + "grad_norm": 1.6306889057159424, + "learning_rate": 7.401695818358354e-06, + "loss": 0.8437, + "step": 12506 + }, + { + "epoch": 0.6254, + "grad_norm": 4.411122798919678, + "learning_rate": 7.398325207462846e-06, + "loss": 1.0514, + "step": 12508 + }, + { + "epoch": 0.6255, + "grad_norm": 13.481008529663086, + "learning_rate": 7.394954913573517e-06, + "loss": 1.0304, + "step": 12510 + }, + { + "epoch": 0.6256, + "grad_norm": 3.0502378940582275, + "learning_rate": 7.391584937101034e-06, + "loss": 0.6798, + "step": 12512 + }, + { + "epoch": 0.6257, + "grad_norm": 5.624565601348877, + "learning_rate": 7.38821527845601e-06, + "loss": 0.892, + "step": 12514 + }, + { + "epoch": 0.6258, + "grad_norm": 7.703486442565918, + "learning_rate": 7.384845938049033e-06, + "loss": 1.0707, + "step": 12516 + }, + { + "epoch": 0.6259, + "grad_norm": 3.4031598567962646, + "learning_rate": 7.381476916290644e-06, + "loss": 1.0763, + "step": 12518 + }, + { + "epoch": 0.626, + "grad_norm": 5.108211040496826, + "learning_rate": 7.378108213591355e-06, + "loss": 1.1352, + "step": 12520 + }, + { + "epoch": 0.6261, + "grad_norm": 3.5120720863342285, + "learning_rate": 7.374739830361621e-06, + "loss": 0.8719, + "step": 12522 + }, + { + "epoch": 0.6262, + "grad_norm": 5.366032123565674, + "learning_rate": 7.37137176701188e-06, + "loss": 0.8601, + "step": 12524 + }, + { + "epoch": 0.6263, + "grad_norm": 2.0757033824920654, + "learning_rate": 7.368004023952518e-06, + "loss": 1.1958, + "step": 12526 + }, + { + "epoch": 0.6264, + "grad_norm": 4.70623779296875, + "learning_rate": 7.364636601593875e-06, + "loss": 0.7516, + "step": 12528 + }, + { + "epoch": 0.6265, + "grad_norm": 2.148764133453369, + "learning_rate": 7.361269500346274e-06, + "loss": 0.8631, + "step": 12530 + }, + { + "epoch": 0.6266, + "grad_norm": 3.400972604751587, + "learning_rate": 7.357902720619976e-06, + "loss": 0.728, + "step": 12532 + }, + { + "epoch": 0.6267, + "grad_norm": 2.1399950981140137, + "learning_rate": 7.354536262825219e-06, + "loss": 1.0127, + "step": 12534 + }, + { + "epoch": 0.6268, + "grad_norm": 10.596495628356934, + "learning_rate": 7.351170127372191e-06, + "loss": 0.8708, + "step": 12536 + }, + { + "epoch": 0.6269, + "grad_norm": 3.608133316040039, + "learning_rate": 7.347804314671055e-06, + "loss": 0.867, + "step": 12538 + }, + { + "epoch": 0.627, + "grad_norm": 6.4146223068237305, + "learning_rate": 7.344438825131912e-06, + "loss": 0.7602, + "step": 12540 + }, + { + "epoch": 0.6271, + "grad_norm": 3.9965221881866455, + "learning_rate": 7.341073659164848e-06, + "loss": 0.8701, + "step": 12542 + }, + { + "epoch": 0.6272, + "grad_norm": 4.309141635894775, + "learning_rate": 7.33770881717989e-06, + "loss": 0.6167, + "step": 12544 + }, + { + "epoch": 0.6273, + "grad_norm": 29.3109130859375, + "learning_rate": 7.3343442995870354e-06, + "loss": 1.2279, + "step": 12546 + }, + { + "epoch": 0.6274, + "grad_norm": 4.5676960945129395, + "learning_rate": 7.330980106796247e-06, + "loss": 0.416, + "step": 12548 + }, + { + "epoch": 0.6275, + "grad_norm": 5.683435916900635, + "learning_rate": 7.327616239217432e-06, + "loss": 1.0663, + "step": 12550 + }, + { + "epoch": 0.6276, + "grad_norm": 3.288706064224243, + "learning_rate": 7.324252697260475e-06, + "loss": 1.1038, + "step": 12552 + }, + { + "epoch": 0.6277, + "grad_norm": 7.216520309448242, + "learning_rate": 7.320889481335207e-06, + "loss": 0.8822, + "step": 12554 + }, + { + "epoch": 0.6278, + "grad_norm": 4.077920436859131, + "learning_rate": 7.3175265918514335e-06, + "loss": 1.5766, + "step": 12556 + }, + { + "epoch": 0.6279, + "grad_norm": 9.496657371520996, + "learning_rate": 7.314164029218904e-06, + "loss": 0.8875, + "step": 12558 + }, + { + "epoch": 0.628, + "grad_norm": 0.47052618861198425, + "learning_rate": 7.310801793847344e-06, + "loss": 0.5264, + "step": 12560 + }, + { + "epoch": 0.6281, + "grad_norm": 6.359327793121338, + "learning_rate": 7.307439886146428e-06, + "loss": 1.0102, + "step": 12562 + }, + { + "epoch": 0.6282, + "grad_norm": 9.660501480102539, + "learning_rate": 7.3040783065257906e-06, + "loss": 1.4313, + "step": 12564 + }, + { + "epoch": 0.6283, + "grad_norm": 3.006523847579956, + "learning_rate": 7.300717055395039e-06, + "loss": 0.7583, + "step": 12566 + }, + { + "epoch": 0.6284, + "grad_norm": 17.354869842529297, + "learning_rate": 7.297356133163722e-06, + "loss": 1.7731, + "step": 12568 + }, + { + "epoch": 0.6285, + "grad_norm": 6.460660457611084, + "learning_rate": 7.2939955402413666e-06, + "loss": 1.1472, + "step": 12570 + }, + { + "epoch": 0.6286, + "grad_norm": 4.315488815307617, + "learning_rate": 7.290635277037442e-06, + "loss": 1.0995, + "step": 12572 + }, + { + "epoch": 0.6287, + "grad_norm": 6.371575355529785, + "learning_rate": 7.287275343961393e-06, + "loss": 1.0103, + "step": 12574 + }, + { + "epoch": 0.6288, + "grad_norm": 4.770397186279297, + "learning_rate": 7.283915741422611e-06, + "loss": 1.0172, + "step": 12576 + }, + { + "epoch": 0.6289, + "grad_norm": 3.478126287460327, + "learning_rate": 7.280556469830464e-06, + "loss": 0.6583, + "step": 12578 + }, + { + "epoch": 0.629, + "grad_norm": 4.26977014541626, + "learning_rate": 7.277197529594257e-06, + "loss": 0.9275, + "step": 12580 + }, + { + "epoch": 0.6291, + "grad_norm": 2.5244557857513428, + "learning_rate": 7.273838921123273e-06, + "loss": 1.5818, + "step": 12582 + }, + { + "epoch": 0.6292, + "grad_norm": 1.68244206905365, + "learning_rate": 7.27048064482675e-06, + "loss": 0.6035, + "step": 12584 + }, + { + "epoch": 0.6293, + "grad_norm": 3.1783642768859863, + "learning_rate": 7.267122701113877e-06, + "loss": 1.3939, + "step": 12586 + }, + { + "epoch": 0.6294, + "grad_norm": 11.939421653747559, + "learning_rate": 7.263765090393817e-06, + "loss": 0.5167, + "step": 12588 + }, + { + "epoch": 0.6295, + "grad_norm": 4.045405864715576, + "learning_rate": 7.260407813075676e-06, + "loss": 0.775, + "step": 12590 + }, + { + "epoch": 0.6296, + "grad_norm": 5.2366743087768555, + "learning_rate": 7.257050869568536e-06, + "loss": 0.3667, + "step": 12592 + }, + { + "epoch": 0.6297, + "grad_norm": 13.411344528198242, + "learning_rate": 7.2536942602814255e-06, + "loss": 0.431, + "step": 12594 + }, + { + "epoch": 0.6298, + "grad_norm": 5.198902130126953, + "learning_rate": 7.250337985623342e-06, + "loss": 1.1063, + "step": 12596 + }, + { + "epoch": 0.6299, + "grad_norm": 8.023470878601074, + "learning_rate": 7.2469820460032345e-06, + "loss": 1.0072, + "step": 12598 + }, + { + "epoch": 0.63, + "grad_norm": 3.9888646602630615, + "learning_rate": 7.243626441830009e-06, + "loss": 1.0875, + "step": 12600 + }, + { + "epoch": 0.6301, + "grad_norm": 6.191049098968506, + "learning_rate": 7.240271173512545e-06, + "loss": 0.494, + "step": 12602 + }, + { + "epoch": 0.6302, + "grad_norm": 7.856168746948242, + "learning_rate": 7.236916241459664e-06, + "loss": 1.5103, + "step": 12604 + }, + { + "epoch": 0.6303, + "grad_norm": 4.077810287475586, + "learning_rate": 7.233561646080162e-06, + "loss": 1.3238, + "step": 12606 + }, + { + "epoch": 0.6304, + "grad_norm": 5.535099029541016, + "learning_rate": 7.2302073877827775e-06, + "loss": 0.2817, + "step": 12608 + }, + { + "epoch": 0.6305, + "grad_norm": 5.854419708251953, + "learning_rate": 7.226853466976222e-06, + "loss": 1.0663, + "step": 12610 + }, + { + "epoch": 0.6306, + "grad_norm": 3.1660473346710205, + "learning_rate": 7.22349988406916e-06, + "loss": 0.9901, + "step": 12612 + }, + { + "epoch": 0.6307, + "grad_norm": 5.944328308105469, + "learning_rate": 7.220146639470218e-06, + "loss": 0.8111, + "step": 12614 + }, + { + "epoch": 0.6308, + "grad_norm": 3.994776487350464, + "learning_rate": 7.216793733587976e-06, + "loss": 0.815, + "step": 12616 + }, + { + "epoch": 0.6309, + "grad_norm": 2.946131706237793, + "learning_rate": 7.21344116683097e-06, + "loss": 0.877, + "step": 12618 + }, + { + "epoch": 0.631, + "grad_norm": 4.897109031677246, + "learning_rate": 7.210088939607709e-06, + "loss": 0.6016, + "step": 12620 + }, + { + "epoch": 0.6311, + "grad_norm": 11.853781700134277, + "learning_rate": 7.206737052326646e-06, + "loss": 1.3902, + "step": 12622 + }, + { + "epoch": 0.6312, + "grad_norm": 5.823709011077881, + "learning_rate": 7.203385505396203e-06, + "loss": 0.6356, + "step": 12624 + }, + { + "epoch": 0.6313, + "grad_norm": 8.574670791625977, + "learning_rate": 7.20003429922475e-06, + "loss": 0.7525, + "step": 12626 + }, + { + "epoch": 0.6314, + "grad_norm": 0.27158331871032715, + "learning_rate": 7.196683434220626e-06, + "loss": 0.0694, + "step": 12628 + }, + { + "epoch": 0.6315, + "grad_norm": 6.169994354248047, + "learning_rate": 7.1933329107921244e-06, + "loss": 1.2298, + "step": 12630 + }, + { + "epoch": 0.6316, + "grad_norm": 5.476691722869873, + "learning_rate": 7.189982729347491e-06, + "loss": 1.0397, + "step": 12632 + }, + { + "epoch": 0.6317, + "grad_norm": 5.09492826461792, + "learning_rate": 7.1866328902949416e-06, + "loss": 1.1818, + "step": 12634 + }, + { + "epoch": 0.6318, + "grad_norm": 9.94953727722168, + "learning_rate": 7.1832833940426346e-06, + "loss": 1.1748, + "step": 12636 + }, + { + "epoch": 0.6319, + "grad_norm": 4.034902572631836, + "learning_rate": 7.179934240998707e-06, + "loss": 0.7327, + "step": 12638 + }, + { + "epoch": 0.632, + "grad_norm": 3.113407611846924, + "learning_rate": 7.176585431571235e-06, + "loss": 1.4638, + "step": 12640 + }, + { + "epoch": 0.6321, + "grad_norm": 4.46582555770874, + "learning_rate": 7.173236966168268e-06, + "loss": 1.0811, + "step": 12642 + }, + { + "epoch": 0.6322, + "grad_norm": 9.137925148010254, + "learning_rate": 7.169888845197798e-06, + "loss": 1.5404, + "step": 12644 + }, + { + "epoch": 0.6323, + "grad_norm": 7.49452543258667, + "learning_rate": 7.166541069067792e-06, + "loss": 0.8525, + "step": 12646 + }, + { + "epoch": 0.6324, + "grad_norm": 7.4226579666137695, + "learning_rate": 7.163193638186159e-06, + "loss": 0.7325, + "step": 12648 + }, + { + "epoch": 0.6325, + "grad_norm": 6.031006336212158, + "learning_rate": 7.159846552960774e-06, + "loss": 0.9355, + "step": 12650 + }, + { + "epoch": 0.6326, + "grad_norm": 7.037247657775879, + "learning_rate": 7.156499813799477e-06, + "loss": 1.0929, + "step": 12652 + }, + { + "epoch": 0.6327, + "grad_norm": 13.52134895324707, + "learning_rate": 7.153153421110047e-06, + "loss": 1.5773, + "step": 12654 + }, + { + "epoch": 0.6328, + "grad_norm": 1.6936638355255127, + "learning_rate": 7.149807375300239e-06, + "loss": 0.2041, + "step": 12656 + }, + { + "epoch": 0.6329, + "grad_norm": 3.001924753189087, + "learning_rate": 7.146461676777756e-06, + "loss": 1.421, + "step": 12658 + }, + { + "epoch": 0.633, + "grad_norm": 3.860050916671753, + "learning_rate": 7.143116325950266e-06, + "loss": 0.9364, + "step": 12660 + }, + { + "epoch": 0.6331, + "grad_norm": 2.9835009574890137, + "learning_rate": 7.139771323225382e-06, + "loss": 0.7296, + "step": 12662 + }, + { + "epoch": 0.6332, + "grad_norm": 4.764228343963623, + "learning_rate": 7.13642666901069e-06, + "loss": 0.8769, + "step": 12664 + }, + { + "epoch": 0.6333, + "grad_norm": 13.793121337890625, + "learning_rate": 7.133082363713719e-06, + "loss": 1.3607, + "step": 12666 + }, + { + "epoch": 0.6334, + "grad_norm": 1.5075019598007202, + "learning_rate": 7.129738407741964e-06, + "loss": 0.5436, + "step": 12668 + }, + { + "epoch": 0.6335, + "grad_norm": 6.880751132965088, + "learning_rate": 7.126394801502883e-06, + "loss": 1.422, + "step": 12670 + }, + { + "epoch": 0.6336, + "grad_norm": 2.3717336654663086, + "learning_rate": 7.123051545403874e-06, + "loss": 0.9741, + "step": 12672 + }, + { + "epoch": 0.6337, + "grad_norm": 4.674112319946289, + "learning_rate": 7.119708639852312e-06, + "loss": 1.0972, + "step": 12674 + }, + { + "epoch": 0.6338, + "grad_norm": 6.981712341308594, + "learning_rate": 7.116366085255511e-06, + "loss": 0.4632, + "step": 12676 + }, + { + "epoch": 0.6339, + "grad_norm": 7.5349955558776855, + "learning_rate": 7.113023882020756e-06, + "loss": 0.7331, + "step": 12678 + }, + { + "epoch": 0.634, + "grad_norm": 6.478065490722656, + "learning_rate": 7.109682030555283e-06, + "loss": 0.918, + "step": 12680 + }, + { + "epoch": 0.6341, + "grad_norm": 2.9535889625549316, + "learning_rate": 7.106340531266292e-06, + "loss": 0.2467, + "step": 12682 + }, + { + "epoch": 0.6342, + "grad_norm": 5.700320720672607, + "learning_rate": 7.102999384560927e-06, + "loss": 1.5325, + "step": 12684 + }, + { + "epoch": 0.6343, + "grad_norm": 2.722475051879883, + "learning_rate": 7.099658590846299e-06, + "loss": 1.2127, + "step": 12686 + }, + { + "epoch": 0.6344, + "grad_norm": 15.59815788269043, + "learning_rate": 7.096318150529476e-06, + "loss": 1.2154, + "step": 12688 + }, + { + "epoch": 0.6345, + "grad_norm": 4.295889854431152, + "learning_rate": 7.092978064017475e-06, + "loss": 0.6841, + "step": 12690 + }, + { + "epoch": 0.6346, + "grad_norm": 2.076974391937256, + "learning_rate": 7.0896383317172845e-06, + "loss": 0.0961, + "step": 12692 + }, + { + "epoch": 0.6347, + "grad_norm": 64.05394744873047, + "learning_rate": 7.086298954035831e-06, + "loss": 1.4167, + "step": 12694 + }, + { + "epoch": 0.6348, + "grad_norm": 1.4484866857528687, + "learning_rate": 7.082959931380011e-06, + "loss": 0.7364, + "step": 12696 + }, + { + "epoch": 0.6349, + "grad_norm": 3.030961275100708, + "learning_rate": 7.079621264156676e-06, + "loss": 0.7912, + "step": 12698 + }, + { + "epoch": 0.635, + "grad_norm": 3.188392162322998, + "learning_rate": 7.076282952772634e-06, + "loss": 0.8109, + "step": 12700 + }, + { + "epoch": 0.6351, + "grad_norm": 4.321324348449707, + "learning_rate": 7.072944997634646e-06, + "loss": 0.4609, + "step": 12702 + }, + { + "epoch": 0.6352, + "grad_norm": 6.544966220855713, + "learning_rate": 7.069607399149427e-06, + "loss": 0.504, + "step": 12704 + }, + { + "epoch": 0.6353, + "grad_norm": 35.62334060668945, + "learning_rate": 7.0662701577236605e-06, + "loss": 1.9419, + "step": 12706 + }, + { + "epoch": 0.6354, + "grad_norm": 5.31743049621582, + "learning_rate": 7.062933273763974e-06, + "loss": 1.0702, + "step": 12708 + }, + { + "epoch": 0.6355, + "grad_norm": 5.450620651245117, + "learning_rate": 7.059596747676963e-06, + "loss": 1.0684, + "step": 12710 + }, + { + "epoch": 0.6356, + "grad_norm": 11.118593215942383, + "learning_rate": 7.056260579869165e-06, + "loss": 0.3077, + "step": 12712 + }, + { + "epoch": 0.6357, + "grad_norm": 0.17501306533813477, + "learning_rate": 7.052924770747087e-06, + "loss": 0.0966, + "step": 12714 + }, + { + "epoch": 0.6358, + "grad_norm": 2.263526201248169, + "learning_rate": 7.049589320717186e-06, + "loss": 1.0932, + "step": 12716 + }, + { + "epoch": 0.6359, + "grad_norm": 4.577201843261719, + "learning_rate": 7.0462542301858805e-06, + "loss": 1.3712, + "step": 12718 + }, + { + "epoch": 0.636, + "grad_norm": 11.797422409057617, + "learning_rate": 7.042919499559538e-06, + "loss": 0.7553, + "step": 12720 + }, + { + "epoch": 0.6361, + "grad_norm": 16.42951011657715, + "learning_rate": 7.0395851292444775e-06, + "loss": 0.8032, + "step": 12722 + }, + { + "epoch": 0.6362, + "grad_norm": 34.95752716064453, + "learning_rate": 7.036251119646993e-06, + "loss": 1.223, + "step": 12724 + }, + { + "epoch": 0.6363, + "grad_norm": 3.638289213180542, + "learning_rate": 7.032917471173319e-06, + "loss": 1.3298, + "step": 12726 + }, + { + "epoch": 0.6364, + "grad_norm": 5.446099281311035, + "learning_rate": 7.029584184229653e-06, + "loss": 1.0237, + "step": 12728 + }, + { + "epoch": 0.6365, + "grad_norm": 7.848588943481445, + "learning_rate": 7.026251259222141e-06, + "loss": 0.8651, + "step": 12730 + }, + { + "epoch": 0.6366, + "grad_norm": 6.843198299407959, + "learning_rate": 7.022918696556896e-06, + "loss": 1.4281, + "step": 12732 + }, + { + "epoch": 0.6367, + "grad_norm": 2.1318113803863525, + "learning_rate": 7.019586496639974e-06, + "loss": 0.5334, + "step": 12734 + }, + { + "epoch": 0.6368, + "grad_norm": 4.358609199523926, + "learning_rate": 7.016254659877398e-06, + "loss": 0.6539, + "step": 12736 + }, + { + "epoch": 0.6369, + "grad_norm": 3.7976279258728027, + "learning_rate": 7.012923186675145e-06, + "loss": 0.7824, + "step": 12738 + }, + { + "epoch": 0.637, + "grad_norm": 3.957369565963745, + "learning_rate": 7.009592077439135e-06, + "loss": 1.0787, + "step": 12740 + }, + { + "epoch": 0.6371, + "grad_norm": 4.059211254119873, + "learning_rate": 7.006261332575262e-06, + "loss": 0.5849, + "step": 12742 + }, + { + "epoch": 0.6372, + "grad_norm": 5.223538875579834, + "learning_rate": 7.002930952489362e-06, + "loss": 1.0408, + "step": 12744 + }, + { + "epoch": 0.6373, + "grad_norm": 2.9616470336914062, + "learning_rate": 6.99960093758724e-06, + "loss": 1.2441, + "step": 12746 + }, + { + "epoch": 0.6374, + "grad_norm": 10.143073081970215, + "learning_rate": 6.996271288274636e-06, + "loss": 1.367, + "step": 12748 + }, + { + "epoch": 0.6375, + "grad_norm": 5.251339435577393, + "learning_rate": 6.992942004957271e-06, + "loss": 1.2328, + "step": 12750 + }, + { + "epoch": 0.6376, + "grad_norm": 3.7129805088043213, + "learning_rate": 6.9896130880407965e-06, + "loss": 1.58, + "step": 12752 + }, + { + "epoch": 0.6377, + "grad_norm": 2.98396897315979, + "learning_rate": 6.986284537930837e-06, + "loss": 1.1423, + "step": 12754 + }, + { + "epoch": 0.6378, + "grad_norm": 5.2683281898498535, + "learning_rate": 6.982956355032968e-06, + "loss": 0.8337, + "step": 12756 + }, + { + "epoch": 0.6379, + "grad_norm": 1.9340113401412964, + "learning_rate": 6.979628539752711e-06, + "loss": 2.3443, + "step": 12758 + }, + { + "epoch": 0.638, + "grad_norm": 6.771208763122559, + "learning_rate": 6.976301092495556e-06, + "loss": 0.6336, + "step": 12760 + }, + { + "epoch": 0.6381, + "grad_norm": 2.0868985652923584, + "learning_rate": 6.972974013666942e-06, + "loss": 1.0666, + "step": 12762 + }, + { + "epoch": 0.6382, + "grad_norm": 4.309401035308838, + "learning_rate": 6.969647303672262e-06, + "loss": 0.812, + "step": 12764 + }, + { + "epoch": 0.6383, + "grad_norm": 6.542867183685303, + "learning_rate": 6.966320962916864e-06, + "loss": 0.87, + "step": 12766 + }, + { + "epoch": 0.6384, + "grad_norm": 7.51300048828125, + "learning_rate": 6.962994991806059e-06, + "loss": 1.2277, + "step": 12768 + }, + { + "epoch": 0.6385, + "grad_norm": 4.088868618011475, + "learning_rate": 6.959669390745097e-06, + "loss": 0.7769, + "step": 12770 + }, + { + "epoch": 0.6386, + "grad_norm": 3.2047455310821533, + "learning_rate": 6.956344160139201e-06, + "loss": 0.6199, + "step": 12772 + }, + { + "epoch": 0.6387, + "grad_norm": 3.773235321044922, + "learning_rate": 6.953019300393538e-06, + "loss": 0.2624, + "step": 12774 + }, + { + "epoch": 0.6388, + "grad_norm": 7.330448627471924, + "learning_rate": 6.949694811913226e-06, + "loss": 0.5636, + "step": 12776 + }, + { + "epoch": 0.6389, + "grad_norm": 11.202969551086426, + "learning_rate": 6.946370695103353e-06, + "loss": 1.3409, + "step": 12778 + }, + { + "epoch": 0.639, + "grad_norm": 3.80289363861084, + "learning_rate": 6.943046950368944e-06, + "loss": 0.8485, + "step": 12780 + }, + { + "epoch": 0.6391, + "grad_norm": 5.2880048751831055, + "learning_rate": 6.9397235781149945e-06, + "loss": 1.213, + "step": 12782 + }, + { + "epoch": 0.6392, + "grad_norm": 7.083311080932617, + "learning_rate": 6.9364005787464406e-06, + "loss": 1.5587, + "step": 12784 + }, + { + "epoch": 0.6393, + "grad_norm": 2.6848509311676025, + "learning_rate": 6.933077952668189e-06, + "loss": 0.8042, + "step": 12786 + }, + { + "epoch": 0.6394, + "grad_norm": 3.987548828125, + "learning_rate": 6.929755700285082e-06, + "loss": 1.2366, + "step": 12788 + }, + { + "epoch": 0.6395, + "grad_norm": 4.714727878570557, + "learning_rate": 6.92643382200193e-06, + "loss": 1.5176, + "step": 12790 + }, + { + "epoch": 0.6396, + "grad_norm": 6.5660319328308105, + "learning_rate": 6.923112318223497e-06, + "loss": 0.9499, + "step": 12792 + }, + { + "epoch": 0.6397, + "grad_norm": 4.266496658325195, + "learning_rate": 6.91979118935449e-06, + "loss": 1.1392, + "step": 12794 + }, + { + "epoch": 0.6398, + "grad_norm": 4.248990058898926, + "learning_rate": 6.9164704357995874e-06, + "loss": 1.2513, + "step": 12796 + }, + { + "epoch": 0.6399, + "grad_norm": 5.52609920501709, + "learning_rate": 6.913150057963405e-06, + "loss": 0.5509, + "step": 12798 + }, + { + "epoch": 0.64, + "grad_norm": 8.443953514099121, + "learning_rate": 6.909830056250527e-06, + "loss": 0.5254, + "step": 12800 + }, + { + "epoch": 0.6401, + "grad_norm": 2.163978338241577, + "learning_rate": 6.906510431065481e-06, + "loss": 0.463, + "step": 12802 + }, + { + "epoch": 0.6402, + "grad_norm": 2.3092904090881348, + "learning_rate": 6.903191182812759e-06, + "loss": 1.352, + "step": 12804 + }, + { + "epoch": 0.6403, + "grad_norm": 4.242636203765869, + "learning_rate": 6.899872311896795e-06, + "loss": 0.8339, + "step": 12806 + }, + { + "epoch": 0.6404, + "grad_norm": 7.800588130950928, + "learning_rate": 6.896553818721989e-06, + "loss": 1.4397, + "step": 12808 + }, + { + "epoch": 0.6405, + "grad_norm": 8.069350242614746, + "learning_rate": 6.893235703692685e-06, + "loss": 0.657, + "step": 12810 + }, + { + "epoch": 0.6406, + "grad_norm": 4.370704650878906, + "learning_rate": 6.889917967213184e-06, + "loss": 1.0348, + "step": 12812 + }, + { + "epoch": 0.6407, + "grad_norm": 3.936901807785034, + "learning_rate": 6.8866006096877495e-06, + "loss": 1.2348, + "step": 12814 + }, + { + "epoch": 0.6408, + "grad_norm": 10.372121810913086, + "learning_rate": 6.883283631520582e-06, + "loss": 1.7034, + "step": 12816 + }, + { + "epoch": 0.6409, + "grad_norm": 3.593907594680786, + "learning_rate": 6.879967033115853e-06, + "loss": 0.8935, + "step": 12818 + }, + { + "epoch": 0.641, + "grad_norm": 7.085336208343506, + "learning_rate": 6.876650814877675e-06, + "loss": 0.6619, + "step": 12820 + }, + { + "epoch": 0.6411, + "grad_norm": 2.585451364517212, + "learning_rate": 6.8733349772101235e-06, + "loss": 1.5827, + "step": 12822 + }, + { + "epoch": 0.6412, + "grad_norm": 2.912605047225952, + "learning_rate": 6.870019520517217e-06, + "loss": 0.9776, + "step": 12824 + }, + { + "epoch": 0.6413, + "grad_norm": 4.154775142669678, + "learning_rate": 6.866704445202943e-06, + "loss": 0.9806, + "step": 12826 + }, + { + "epoch": 0.6414, + "grad_norm": 4.957984924316406, + "learning_rate": 6.863389751671225e-06, + "loss": 0.9715, + "step": 12828 + }, + { + "epoch": 0.6415, + "grad_norm": 2.383138418197632, + "learning_rate": 6.860075440325951e-06, + "loss": 0.2342, + "step": 12830 + }, + { + "epoch": 0.6416, + "grad_norm": 4.767331123352051, + "learning_rate": 6.856761511570963e-06, + "loss": 1.3239, + "step": 12832 + }, + { + "epoch": 0.6417, + "grad_norm": 5.472435474395752, + "learning_rate": 6.853447965810046e-06, + "loss": 0.5561, + "step": 12834 + }, + { + "epoch": 0.6418, + "grad_norm": 2.5810153484344482, + "learning_rate": 6.850134803446955e-06, + "loss": 1.18, + "step": 12836 + }, + { + "epoch": 0.6419, + "grad_norm": 5.649041175842285, + "learning_rate": 6.846822024885379e-06, + "loss": 1.4928, + "step": 12838 + }, + { + "epoch": 0.642, + "grad_norm": 3.8948142528533936, + "learning_rate": 6.843509630528977e-06, + "loss": 0.8662, + "step": 12840 + }, + { + "epoch": 0.6421, + "grad_norm": 3.3222391605377197, + "learning_rate": 6.840197620781349e-06, + "loss": 0.878, + "step": 12842 + }, + { + "epoch": 0.6422, + "grad_norm": 11.99074649810791, + "learning_rate": 6.836885996046061e-06, + "loss": 0.9134, + "step": 12844 + }, + { + "epoch": 0.6423, + "grad_norm": 2.7355237007141113, + "learning_rate": 6.8335747567266175e-06, + "loss": 0.5723, + "step": 12846 + }, + { + "epoch": 0.6424, + "grad_norm": 2.690721273422241, + "learning_rate": 6.830263903226483e-06, + "loss": 0.8146, + "step": 12848 + }, + { + "epoch": 0.6425, + "grad_norm": 4.079555511474609, + "learning_rate": 6.826953435949081e-06, + "loss": 0.7994, + "step": 12850 + }, + { + "epoch": 0.6426, + "grad_norm": 2.9589955806732178, + "learning_rate": 6.823643355297774e-06, + "loss": 0.6881, + "step": 12852 + }, + { + "epoch": 0.6427, + "grad_norm": 3.998776435852051, + "learning_rate": 6.820333661675893e-06, + "loss": 1.4603, + "step": 12854 + }, + { + "epoch": 0.6428, + "grad_norm": 2.8957254886627197, + "learning_rate": 6.8170243554867065e-06, + "loss": 0.302, + "step": 12856 + }, + { + "epoch": 0.6429, + "grad_norm": 18.248233795166016, + "learning_rate": 6.8137154371334505e-06, + "loss": 0.9483, + "step": 12858 + }, + { + "epoch": 0.643, + "grad_norm": 6.988523006439209, + "learning_rate": 6.8104069070193e-06, + "loss": 1.5586, + "step": 12860 + }, + { + "epoch": 0.6431, + "grad_norm": 7.870941162109375, + "learning_rate": 6.807098765547398e-06, + "loss": 1.33, + "step": 12862 + }, + { + "epoch": 0.6432, + "grad_norm": 6.355685234069824, + "learning_rate": 6.803791013120822e-06, + "loss": 0.997, + "step": 12864 + }, + { + "epoch": 0.6433, + "grad_norm": 8.84795093536377, + "learning_rate": 6.800483650142618e-06, + "loss": 0.6659, + "step": 12866 + }, + { + "epoch": 0.6434, + "grad_norm": 3.4820873737335205, + "learning_rate": 6.797176677015775e-06, + "loss": 0.9713, + "step": 12868 + }, + { + "epoch": 0.6435, + "grad_norm": 7.010158061981201, + "learning_rate": 6.793870094143238e-06, + "loss": 0.6797, + "step": 12870 + }, + { + "epoch": 0.6436, + "grad_norm": 4.575342655181885, + "learning_rate": 6.790563901927907e-06, + "loss": 1.0597, + "step": 12872 + }, + { + "epoch": 0.6437, + "grad_norm": 9.241729736328125, + "learning_rate": 6.7872581007726265e-06, + "loss": 2.1879, + "step": 12874 + }, + { + "epoch": 0.6438, + "grad_norm": 13.169028282165527, + "learning_rate": 6.783952691080203e-06, + "loss": 0.6771, + "step": 12876 + }, + { + "epoch": 0.6439, + "grad_norm": 8.084972381591797, + "learning_rate": 6.780647673253391e-06, + "loss": 0.933, + "step": 12878 + }, + { + "epoch": 0.644, + "grad_norm": 6.076374053955078, + "learning_rate": 6.777343047694891e-06, + "loss": 1.1903, + "step": 12880 + }, + { + "epoch": 0.6441, + "grad_norm": 0.8696038126945496, + "learning_rate": 6.774038814807369e-06, + "loss": 0.4272, + "step": 12882 + }, + { + "epoch": 0.6442, + "grad_norm": 4.477651596069336, + "learning_rate": 6.770734974993427e-06, + "loss": 1.2104, + "step": 12884 + }, + { + "epoch": 0.6443, + "grad_norm": 3.6354196071624756, + "learning_rate": 6.767431528655635e-06, + "loss": 0.9963, + "step": 12886 + }, + { + "epoch": 0.6444, + "grad_norm": 2.2914223670959473, + "learning_rate": 6.764128476196505e-06, + "loss": 0.4211, + "step": 12888 + }, + { + "epoch": 0.6445, + "grad_norm": 5.523476600646973, + "learning_rate": 6.7608258180185085e-06, + "loss": 1.1985, + "step": 12890 + }, + { + "epoch": 0.6446, + "grad_norm": 2.494701385498047, + "learning_rate": 6.757523554524056e-06, + "loss": 0.3723, + "step": 12892 + }, + { + "epoch": 0.6447, + "grad_norm": 8.114013671875, + "learning_rate": 6.754221686115525e-06, + "loss": 1.2673, + "step": 12894 + }, + { + "epoch": 0.6448, + "grad_norm": 3.7962660789489746, + "learning_rate": 6.750920213195238e-06, + "loss": 0.3468, + "step": 12896 + }, + { + "epoch": 0.6449, + "grad_norm": 2.9250123500823975, + "learning_rate": 6.747619136165464e-06, + "loss": 1.3381, + "step": 12898 + }, + { + "epoch": 0.645, + "grad_norm": 3.4591829776763916, + "learning_rate": 6.744318455428436e-06, + "loss": 0.9076, + "step": 12900 + }, + { + "epoch": 0.6451, + "grad_norm": 8.632670402526855, + "learning_rate": 6.741018171386325e-06, + "loss": 0.3478, + "step": 12902 + }, + { + "epoch": 0.6452, + "grad_norm": 5.755775451660156, + "learning_rate": 6.737718284441267e-06, + "loss": 0.9608, + "step": 12904 + }, + { + "epoch": 0.6453, + "grad_norm": 6.147887229919434, + "learning_rate": 6.734418794995338e-06, + "loss": 0.8594, + "step": 12906 + }, + { + "epoch": 0.6454, + "grad_norm": 4.218728065490723, + "learning_rate": 6.731119703450577e-06, + "loss": 0.6941, + "step": 12908 + }, + { + "epoch": 0.6455, + "grad_norm": 4.6935038566589355, + "learning_rate": 6.727821010208961e-06, + "loss": 0.6354, + "step": 12910 + }, + { + "epoch": 0.6456, + "grad_norm": 3.442505359649658, + "learning_rate": 6.7245227156724324e-06, + "loss": 0.9082, + "step": 12912 + }, + { + "epoch": 0.6457, + "grad_norm": 3.845705032348633, + "learning_rate": 6.721224820242876e-06, + "loss": 1.39, + "step": 12914 + }, + { + "epoch": 0.6458, + "grad_norm": 9.011568069458008, + "learning_rate": 6.717927324322124e-06, + "loss": 1.2391, + "step": 12916 + }, + { + "epoch": 0.6459, + "grad_norm": 6.9016008377075195, + "learning_rate": 6.714630228311978e-06, + "loss": 1.5054, + "step": 12918 + }, + { + "epoch": 0.646, + "grad_norm": 5.030428886413574, + "learning_rate": 6.711333532614168e-06, + "loss": 1.1471, + "step": 12920 + }, + { + "epoch": 0.6461, + "grad_norm": 4.381188869476318, + "learning_rate": 6.708037237630395e-06, + "loss": 1.7379, + "step": 12922 + }, + { + "epoch": 0.6462, + "grad_norm": 5.062760353088379, + "learning_rate": 6.704741343762296e-06, + "loss": 0.3676, + "step": 12924 + }, + { + "epoch": 0.6463, + "grad_norm": 3.586167097091675, + "learning_rate": 6.701445851411472e-06, + "loss": 0.6795, + "step": 12926 + }, + { + "epoch": 0.6464, + "grad_norm": 3.8556039333343506, + "learning_rate": 6.698150760979463e-06, + "loss": 1.5118, + "step": 12928 + }, + { + "epoch": 0.6465, + "grad_norm": 9.808699607849121, + "learning_rate": 6.694856072867772e-06, + "loss": 0.5705, + "step": 12930 + }, + { + "epoch": 0.6466, + "grad_norm": 3.0225670337677, + "learning_rate": 6.69156178747784e-06, + "loss": 0.7583, + "step": 12932 + }, + { + "epoch": 0.6467, + "grad_norm": 6.301138877868652, + "learning_rate": 6.688267905211068e-06, + "loss": 0.6303, + "step": 12934 + }, + { + "epoch": 0.6468, + "grad_norm": 6.008458137512207, + "learning_rate": 6.684974426468809e-06, + "loss": 0.4976, + "step": 12936 + }, + { + "epoch": 0.6469, + "grad_norm": 9.997257232666016, + "learning_rate": 6.681681351652356e-06, + "loss": 1.8067, + "step": 12938 + }, + { + "epoch": 0.647, + "grad_norm": 5.962292671203613, + "learning_rate": 6.67838868116297e-06, + "loss": 0.7909, + "step": 12940 + }, + { + "epoch": 0.6471, + "grad_norm": 5.20066499710083, + "learning_rate": 6.675096415401843e-06, + "loss": 1.332, + "step": 12942 + }, + { + "epoch": 0.6472, + "grad_norm": 2.801358461380005, + "learning_rate": 6.671804554770135e-06, + "loss": 0.6708, + "step": 12944 + }, + { + "epoch": 0.6473, + "grad_norm": 3.5652008056640625, + "learning_rate": 6.668513099668944e-06, + "loss": 1.5682, + "step": 12946 + }, + { + "epoch": 0.6474, + "grad_norm": 2.4433062076568604, + "learning_rate": 6.6652220504993305e-06, + "loss": 0.5518, + "step": 12948 + }, + { + "epoch": 0.6475, + "grad_norm": 4.309279918670654, + "learning_rate": 6.661931407662292e-06, + "loss": 0.9758, + "step": 12950 + }, + { + "epoch": 0.6476, + "grad_norm": 2.985142707824707, + "learning_rate": 6.658641171558785e-06, + "loss": 0.8414, + "step": 12952 + }, + { + "epoch": 0.6477, + "grad_norm": 3.406404495239258, + "learning_rate": 6.65535134258972e-06, + "loss": 0.7219, + "step": 12954 + }, + { + "epoch": 0.6478, + "grad_norm": 3.6787636280059814, + "learning_rate": 6.6520619211559435e-06, + "loss": 0.6915, + "step": 12956 + }, + { + "epoch": 0.6479, + "grad_norm": 2.984846830368042, + "learning_rate": 6.6487729076582715e-06, + "loss": 1.2323, + "step": 12958 + }, + { + "epoch": 0.648, + "grad_norm": 6.731113433837891, + "learning_rate": 6.645484302497452e-06, + "loss": 1.2518, + "step": 12960 + }, + { + "epoch": 0.6481, + "grad_norm": 2.905376434326172, + "learning_rate": 6.642196106074195e-06, + "loss": 0.942, + "step": 12962 + }, + { + "epoch": 0.6482, + "grad_norm": 3.5548853874206543, + "learning_rate": 6.638908318789156e-06, + "loss": 0.3872, + "step": 12964 + }, + { + "epoch": 0.6483, + "grad_norm": 2.5750832557678223, + "learning_rate": 6.635620941042946e-06, + "loss": 0.5321, + "step": 12966 + }, + { + "epoch": 0.6484, + "grad_norm": 4.088967800140381, + "learning_rate": 6.63233397323612e-06, + "loss": 0.9432, + "step": 12968 + }, + { + "epoch": 0.6485, + "grad_norm": 1.7776156663894653, + "learning_rate": 6.629047415769181e-06, + "loss": 0.815, + "step": 12970 + }, + { + "epoch": 0.6486, + "grad_norm": 2.4210689067840576, + "learning_rate": 6.62576126904259e-06, + "loss": 0.8456, + "step": 12972 + }, + { + "epoch": 0.6487, + "grad_norm": 5.039328575134277, + "learning_rate": 6.622475533456751e-06, + "loss": 0.6209, + "step": 12974 + }, + { + "epoch": 0.6488, + "grad_norm": 5.846212387084961, + "learning_rate": 6.6191902094120295e-06, + "loss": 1.0579, + "step": 12976 + }, + { + "epoch": 0.6489, + "grad_norm": 2.387763500213623, + "learning_rate": 6.61590529730872e-06, + "loss": 0.9599, + "step": 12978 + }, + { + "epoch": 0.649, + "grad_norm": 5.235284805297852, + "learning_rate": 6.612620797547087e-06, + "loss": 0.7427, + "step": 12980 + }, + { + "epoch": 0.6491, + "grad_norm": 1.996646523475647, + "learning_rate": 6.609336710527333e-06, + "loss": 0.5191, + "step": 12982 + }, + { + "epoch": 0.6492, + "grad_norm": 80.3396987915039, + "learning_rate": 6.60605303664962e-06, + "loss": 0.9013, + "step": 12984 + }, + { + "epoch": 0.6493, + "grad_norm": 7.489296913146973, + "learning_rate": 6.602769776314049e-06, + "loss": 1.1178, + "step": 12986 + }, + { + "epoch": 0.6494, + "grad_norm": 4.895565032958984, + "learning_rate": 6.5994869299206736e-06, + "loss": 1.2245, + "step": 12988 + }, + { + "epoch": 0.6495, + "grad_norm": 5.538414001464844, + "learning_rate": 6.596204497869501e-06, + "loss": 1.1107, + "step": 12990 + }, + { + "epoch": 0.6496, + "grad_norm": 3.815797805786133, + "learning_rate": 6.5929224805604845e-06, + "loss": 1.2667, + "step": 12992 + }, + { + "epoch": 0.6497, + "grad_norm": 5.826108455657959, + "learning_rate": 6.589640878393531e-06, + "loss": 1.3878, + "step": 12994 + }, + { + "epoch": 0.6498, + "grad_norm": 3.9368882179260254, + "learning_rate": 6.58635969176849e-06, + "loss": 1.0215, + "step": 12996 + }, + { + "epoch": 0.6499, + "grad_norm": 2.5175671577453613, + "learning_rate": 6.583078921085167e-06, + "loss": 1.0938, + "step": 12998 + }, + { + "epoch": 0.65, + "grad_norm": 3.804403781890869, + "learning_rate": 6.579798566743314e-06, + "loss": 0.8079, + "step": 13000 + }, + { + "epoch": 0.6501, + "grad_norm": 23.95124626159668, + "learning_rate": 6.5765186291426295e-06, + "loss": 1.7201, + "step": 13002 + }, + { + "epoch": 0.6502, + "grad_norm": 6.106164932250977, + "learning_rate": 6.573239108682769e-06, + "loss": 1.104, + "step": 13004 + }, + { + "epoch": 0.6503, + "grad_norm": 2.4725348949432373, + "learning_rate": 6.569960005763323e-06, + "loss": 0.7165, + "step": 13006 + }, + { + "epoch": 0.6504, + "grad_norm": 25.624048233032227, + "learning_rate": 6.566681320783849e-06, + "loss": 1.511, + "step": 13008 + }, + { + "epoch": 0.6505, + "grad_norm": 6.331400394439697, + "learning_rate": 6.56340305414384e-06, + "loss": 0.9684, + "step": 13010 + }, + { + "epoch": 0.6506, + "grad_norm": 2.27785325050354, + "learning_rate": 6.560125206242746e-06, + "loss": 0.4591, + "step": 13012 + }, + { + "epoch": 0.6507, + "grad_norm": 5.047182559967041, + "learning_rate": 6.55684777747996e-06, + "loss": 1.7301, + "step": 13014 + }, + { + "epoch": 0.6508, + "grad_norm": 5.377655982971191, + "learning_rate": 6.553570768254831e-06, + "loss": 1.3492, + "step": 13016 + }, + { + "epoch": 0.6509, + "grad_norm": 5.217703342437744, + "learning_rate": 6.550294178966647e-06, + "loss": 1.4812, + "step": 13018 + }, + { + "epoch": 0.651, + "grad_norm": 4.6786651611328125, + "learning_rate": 6.547018010014654e-06, + "loss": 0.8875, + "step": 13020 + }, + { + "epoch": 0.6511, + "grad_norm": 3.763824224472046, + "learning_rate": 6.543742261798045e-06, + "loss": 1.0414, + "step": 13022 + }, + { + "epoch": 0.6512, + "grad_norm": 3.412278175354004, + "learning_rate": 6.540466934715953e-06, + "loss": 0.1114, + "step": 13024 + }, + { + "epoch": 0.6513, + "grad_norm": 1.99562668800354, + "learning_rate": 6.537192029167474e-06, + "loss": 0.4828, + "step": 13026 + }, + { + "epoch": 0.6514, + "grad_norm": 3.157348155975342, + "learning_rate": 6.53391754555164e-06, + "loss": 0.3808, + "step": 13028 + }, + { + "epoch": 0.6515, + "grad_norm": 5.300275802612305, + "learning_rate": 6.530643484267443e-06, + "loss": 0.6466, + "step": 13030 + }, + { + "epoch": 0.6516, + "grad_norm": 2.744471549987793, + "learning_rate": 6.52736984571381e-06, + "loss": 0.5217, + "step": 13032 + }, + { + "epoch": 0.6517, + "grad_norm": 2.697944402694702, + "learning_rate": 6.524096630289632e-06, + "loss": 0.7822, + "step": 13034 + }, + { + "epoch": 0.6518, + "grad_norm": 3.978098154067993, + "learning_rate": 6.520823838393732e-06, + "loss": 0.707, + "step": 13036 + }, + { + "epoch": 0.6519, + "grad_norm": 4.206686019897461, + "learning_rate": 6.517551470424894e-06, + "loss": 0.5825, + "step": 13038 + }, + { + "epoch": 0.652, + "grad_norm": 6.843038558959961, + "learning_rate": 6.5142795267818505e-06, + "loss": 0.9368, + "step": 13040 + }, + { + "epoch": 0.6521, + "grad_norm": 4.194786548614502, + "learning_rate": 6.511008007863269e-06, + "loss": 1.1422, + "step": 13042 + }, + { + "epoch": 0.6522, + "grad_norm": 5.599597930908203, + "learning_rate": 6.5077369140677815e-06, + "loss": 0.7926, + "step": 13044 + }, + { + "epoch": 0.6523, + "grad_norm": 8.475319862365723, + "learning_rate": 6.504466245793955e-06, + "loss": 1.1009, + "step": 13046 + }, + { + "epoch": 0.6524, + "grad_norm": 3.9091155529022217, + "learning_rate": 6.501196003440313e-06, + "loss": 0.6881, + "step": 13048 + }, + { + "epoch": 0.6525, + "grad_norm": 5.027975559234619, + "learning_rate": 6.497926187405326e-06, + "loss": 0.6361, + "step": 13050 + }, + { + "epoch": 0.6526, + "grad_norm": 5.367111682891846, + "learning_rate": 6.494656798087412e-06, + "loss": 0.9789, + "step": 13052 + }, + { + "epoch": 0.6527, + "grad_norm": 6.763897895812988, + "learning_rate": 6.49138783588493e-06, + "loss": 1.0798, + "step": 13054 + }, + { + "epoch": 0.6528, + "grad_norm": 5.019383430480957, + "learning_rate": 6.488119301196201e-06, + "loss": 1.0154, + "step": 13056 + }, + { + "epoch": 0.6529, + "grad_norm": 7.2908735275268555, + "learning_rate": 6.484851194419484e-06, + "loss": 0.3271, + "step": 13058 + }, + { + "epoch": 0.653, + "grad_norm": 6.341938495635986, + "learning_rate": 6.481583515952983e-06, + "loss": 0.9571, + "step": 13060 + }, + { + "epoch": 0.6531, + "grad_norm": 10.108006477355957, + "learning_rate": 6.478316266194861e-06, + "loss": 1.0011, + "step": 13062 + }, + { + "epoch": 0.6532, + "grad_norm": 4.8285603523254395, + "learning_rate": 6.475049445543215e-06, + "loss": 1.2672, + "step": 13064 + }, + { + "epoch": 0.6533, + "grad_norm": 6.186895847320557, + "learning_rate": 6.471783054396106e-06, + "loss": 1.1918, + "step": 13066 + }, + { + "epoch": 0.6534, + "grad_norm": 9.061444282531738, + "learning_rate": 6.468517093151525e-06, + "loss": 0.7685, + "step": 13068 + }, + { + "epoch": 0.6535, + "grad_norm": 12.064282417297363, + "learning_rate": 6.465251562207431e-06, + "loss": 1.274, + "step": 13070 + }, + { + "epoch": 0.6536, + "grad_norm": 3.99114727973938, + "learning_rate": 6.461986461961706e-06, + "loss": 0.9526, + "step": 13072 + }, + { + "epoch": 0.6537, + "grad_norm": 1.3663185834884644, + "learning_rate": 6.458721792812204e-06, + "loss": 0.7885, + "step": 13074 + }, + { + "epoch": 0.6538, + "grad_norm": 2.3046557903289795, + "learning_rate": 6.455457555156706e-06, + "loss": 0.7883, + "step": 13076 + }, + { + "epoch": 0.6539, + "grad_norm": 3.4481749534606934, + "learning_rate": 6.452193749392952e-06, + "loss": 1.0342, + "step": 13078 + }, + { + "epoch": 0.654, + "grad_norm": 5.938236236572266, + "learning_rate": 6.448930375918632e-06, + "loss": 1.0889, + "step": 13080 + }, + { + "epoch": 0.6541, + "grad_norm": 16.907447814941406, + "learning_rate": 6.445667435131371e-06, + "loss": 1.4075, + "step": 13082 + }, + { + "epoch": 0.6542, + "grad_norm": 4.998201847076416, + "learning_rate": 6.442404927428751e-06, + "loss": 1.4339, + "step": 13084 + }, + { + "epoch": 0.6543, + "grad_norm": 4.822167873382568, + "learning_rate": 6.4391428532083e-06, + "loss": 1.2378, + "step": 13086 + }, + { + "epoch": 0.6544, + "grad_norm": 3.2689597606658936, + "learning_rate": 6.435881212867494e-06, + "loss": 0.7718, + "step": 13088 + }, + { + "epoch": 0.6545, + "grad_norm": 6.402206897735596, + "learning_rate": 6.432620006803747e-06, + "loss": 0.8752, + "step": 13090 + }, + { + "epoch": 0.6546, + "grad_norm": 5.090356349945068, + "learning_rate": 6.4293592354144365e-06, + "loss": 0.5961, + "step": 13092 + }, + { + "epoch": 0.6547, + "grad_norm": 4.26499605178833, + "learning_rate": 6.426098899096869e-06, + "loss": 0.6859, + "step": 13094 + }, + { + "epoch": 0.6548, + "grad_norm": 4.0196309089660645, + "learning_rate": 6.422838998248308e-06, + "loss": 0.4562, + "step": 13096 + }, + { + "epoch": 0.6549, + "grad_norm": 5.680851459503174, + "learning_rate": 6.419579533265968e-06, + "loss": 0.648, + "step": 13098 + }, + { + "epoch": 0.655, + "grad_norm": 3.7656338214874268, + "learning_rate": 6.4163205045469975e-06, + "loss": 0.6316, + "step": 13100 + }, + { + "epoch": 0.6551, + "grad_norm": 3.125953197479248, + "learning_rate": 6.413061912488507e-06, + "loss": 0.9097, + "step": 13102 + }, + { + "epoch": 0.6552, + "grad_norm": 4.557722091674805, + "learning_rate": 6.409803757487539e-06, + "loss": 1.3899, + "step": 13104 + }, + { + "epoch": 0.6553, + "grad_norm": 1.592656135559082, + "learning_rate": 6.406546039941095e-06, + "loss": 0.6717, + "step": 13106 + }, + { + "epoch": 0.6554, + "grad_norm": 2.8633813858032227, + "learning_rate": 6.403288760246112e-06, + "loss": 1.2345, + "step": 13108 + }, + { + "epoch": 0.6555, + "grad_norm": 5.481114387512207, + "learning_rate": 6.4000319187994895e-06, + "loss": 0.3052, + "step": 13110 + }, + { + "epoch": 0.6556, + "grad_norm": 1.9868654012680054, + "learning_rate": 6.396775515998055e-06, + "loss": 1.2881, + "step": 13112 + }, + { + "epoch": 0.6557, + "grad_norm": 8.365169525146484, + "learning_rate": 6.393519552238592e-06, + "loss": 0.8429, + "step": 13114 + }, + { + "epoch": 0.6558, + "grad_norm": 0.18367677927017212, + "learning_rate": 6.390264027917836e-06, + "loss": 0.5522, + "step": 13116 + }, + { + "epoch": 0.6559, + "grad_norm": 3.011138916015625, + "learning_rate": 6.387008943432456e-06, + "loss": 0.596, + "step": 13118 + }, + { + "epoch": 0.656, + "grad_norm": 7.497799396514893, + "learning_rate": 6.383754299179079e-06, + "loss": 0.4461, + "step": 13120 + }, + { + "epoch": 0.6561, + "grad_norm": 6.9043378829956055, + "learning_rate": 6.380500095554268e-06, + "loss": 0.8364, + "step": 13122 + }, + { + "epoch": 0.6562, + "grad_norm": 4.5696001052856445, + "learning_rate": 6.377246332954544e-06, + "loss": 1.1051, + "step": 13124 + }, + { + "epoch": 0.6563, + "grad_norm": 9.11587905883789, + "learning_rate": 6.373993011776367e-06, + "loss": 0.5386, + "step": 13126 + }, + { + "epoch": 0.6564, + "grad_norm": 1.9696928262710571, + "learning_rate": 6.370740132416138e-06, + "loss": 0.7936, + "step": 13128 + }, + { + "epoch": 0.6565, + "grad_norm": 3.777451276779175, + "learning_rate": 6.367487695270218e-06, + "loss": 0.9224, + "step": 13130 + }, + { + "epoch": 0.6566, + "grad_norm": 4.0631818771362305, + "learning_rate": 6.364235700734903e-06, + "loss": 1.204, + "step": 13132 + }, + { + "epoch": 0.6567, + "grad_norm": 9.444546699523926, + "learning_rate": 6.3609841492064395e-06, + "loss": 0.8738, + "step": 13134 + }, + { + "epoch": 0.6568, + "grad_norm": 3.5679984092712402, + "learning_rate": 6.357733041081018e-06, + "loss": 1.1845, + "step": 13136 + }, + { + "epoch": 0.6569, + "grad_norm": 5.904933452606201, + "learning_rate": 6.35448237675478e-06, + "loss": 1.1052, + "step": 13138 + }, + { + "epoch": 0.657, + "grad_norm": 13.202469825744629, + "learning_rate": 6.351232156623803e-06, + "loss": 0.532, + "step": 13140 + }, + { + "epoch": 0.6571, + "grad_norm": 1.74391508102417, + "learning_rate": 6.3479823810841235e-06, + "loss": 1.1086, + "step": 13142 + }, + { + "epoch": 0.6572, + "grad_norm": 4.552857398986816, + "learning_rate": 6.344733050531713e-06, + "loss": 0.9347, + "step": 13144 + }, + { + "epoch": 0.6573, + "grad_norm": 2.4324185848236084, + "learning_rate": 6.341484165362488e-06, + "loss": 0.5596, + "step": 13146 + }, + { + "epoch": 0.6574, + "grad_norm": 0.5123173594474792, + "learning_rate": 6.338235725972326e-06, + "loss": 0.6255, + "step": 13148 + }, + { + "epoch": 0.6575, + "grad_norm": 3.758772134780884, + "learning_rate": 6.334987732757028e-06, + "loss": 0.2415, + "step": 13150 + }, + { + "epoch": 0.6576, + "grad_norm": 9.132883071899414, + "learning_rate": 6.33174018611236e-06, + "loss": 1.048, + "step": 13152 + }, + { + "epoch": 0.6577, + "grad_norm": 5.133204936981201, + "learning_rate": 6.328493086434022e-06, + "loss": 1.2341, + "step": 13154 + }, + { + "epoch": 0.6578, + "grad_norm": 5.646528244018555, + "learning_rate": 6.325246434117669e-06, + "loss": 0.4492, + "step": 13156 + }, + { + "epoch": 0.6579, + "grad_norm": 4.433338165283203, + "learning_rate": 6.322000229558886e-06, + "loss": 0.4426, + "step": 13158 + }, + { + "epoch": 0.658, + "grad_norm": 5.994366645812988, + "learning_rate": 6.318754473153221e-06, + "loss": 1.3251, + "step": 13160 + }, + { + "epoch": 0.6581, + "grad_norm": 2.9360361099243164, + "learning_rate": 6.315509165296158e-06, + "loss": 2.1646, + "step": 13162 + }, + { + "epoch": 0.6582, + "grad_norm": 5.313868522644043, + "learning_rate": 6.3122643063831245e-06, + "loss": 0.7509, + "step": 13164 + }, + { + "epoch": 0.6583, + "grad_norm": 4.969653129577637, + "learning_rate": 6.309019896809503e-06, + "loss": 0.6269, + "step": 13166 + }, + { + "epoch": 0.6584, + "grad_norm": 7.529294967651367, + "learning_rate": 6.305775936970606e-06, + "loss": 1.2379, + "step": 13168 + }, + { + "epoch": 0.6585, + "grad_norm": 3.8312361240386963, + "learning_rate": 6.302532427261708e-06, + "loss": 0.8751, + "step": 13170 + }, + { + "epoch": 0.6586, + "grad_norm": 11.48926830291748, + "learning_rate": 6.299289368078016e-06, + "loss": 1.3746, + "step": 13172 + }, + { + "epoch": 0.6587, + "grad_norm": 2.8655049800872803, + "learning_rate": 6.2960467598146935e-06, + "loss": 0.8942, + "step": 13174 + }, + { + "epoch": 0.6588, + "grad_norm": 3.1366446018218994, + "learning_rate": 6.292804602866833e-06, + "loss": 0.5076, + "step": 13176 + }, + { + "epoch": 0.6589, + "grad_norm": 8.775218963623047, + "learning_rate": 6.289562897629492e-06, + "loss": 1.5255, + "step": 13178 + }, + { + "epoch": 0.659, + "grad_norm": 6.636314392089844, + "learning_rate": 6.286321644497655e-06, + "loss": 2.3891, + "step": 13180 + }, + { + "epoch": 0.6591, + "grad_norm": 4.2717156410217285, + "learning_rate": 6.283080843866257e-06, + "loss": 0.5062, + "step": 13182 + }, + { + "epoch": 0.6592, + "grad_norm": 2.0683062076568604, + "learning_rate": 6.27984049613019e-06, + "loss": 0.5394, + "step": 13184 + }, + { + "epoch": 0.6593, + "grad_norm": 5.995512008666992, + "learning_rate": 6.276600601684267e-06, + "loss": 1.5996, + "step": 13186 + }, + { + "epoch": 0.6594, + "grad_norm": 2.5744776725769043, + "learning_rate": 6.273361160923271e-06, + "loss": 0.6167, + "step": 13188 + }, + { + "epoch": 0.6595, + "grad_norm": 7.2879719734191895, + "learning_rate": 6.2701221742419106e-06, + "loss": 1.097, + "step": 13190 + }, + { + "epoch": 0.6596, + "grad_norm": 2.580294609069824, + "learning_rate": 6.2668836420348535e-06, + "loss": 0.5625, + "step": 13192 + }, + { + "epoch": 0.6597, + "grad_norm": 5.4763946533203125, + "learning_rate": 6.263645564696697e-06, + "loss": 0.6756, + "step": 13194 + }, + { + "epoch": 0.6598, + "grad_norm": 3.419520616531372, + "learning_rate": 6.260407942621998e-06, + "loss": 0.7881, + "step": 13196 + }, + { + "epoch": 0.6599, + "grad_norm": 1.9530197381973267, + "learning_rate": 6.257170776205246e-06, + "loss": 0.5206, + "step": 13198 + }, + { + "epoch": 0.66, + "grad_norm": 2.280752182006836, + "learning_rate": 6.25393406584088e-06, + "loss": 1.139, + "step": 13200 + }, + { + "epoch": 0.6601, + "grad_norm": 3.8884477615356445, + "learning_rate": 6.2506978119232896e-06, + "loss": 1.346, + "step": 13202 + }, + { + "epoch": 0.6602, + "grad_norm": 4.190537452697754, + "learning_rate": 6.247462014846793e-06, + "loss": 1.3806, + "step": 13204 + }, + { + "epoch": 0.6603, + "grad_norm": 6.493095397949219, + "learning_rate": 6.2442266750056715e-06, + "loss": 1.4283, + "step": 13206 + }, + { + "epoch": 0.6604, + "grad_norm": 3.3601112365722656, + "learning_rate": 6.240991792794133e-06, + "loss": 1.0162, + "step": 13208 + }, + { + "epoch": 0.6605, + "grad_norm": 3.28238844871521, + "learning_rate": 6.237757368606345e-06, + "loss": 0.5142, + "step": 13210 + }, + { + "epoch": 0.6606, + "grad_norm": 3.6117212772369385, + "learning_rate": 6.234523402836408e-06, + "loss": 0.7798, + "step": 13212 + }, + { + "epoch": 0.6607, + "grad_norm": 3.387968063354492, + "learning_rate": 6.231289895878375e-06, + "loss": 0.4092, + "step": 13214 + }, + { + "epoch": 0.6608, + "grad_norm": 7.020798206329346, + "learning_rate": 6.228056848126236e-06, + "loss": 0.9432, + "step": 13216 + }, + { + "epoch": 0.6609, + "grad_norm": 3.074298858642578, + "learning_rate": 6.224824259973925e-06, + "loss": 0.7854, + "step": 13218 + }, + { + "epoch": 0.661, + "grad_norm": 12.655393600463867, + "learning_rate": 6.22159213181533e-06, + "loss": 0.5725, + "step": 13220 + }, + { + "epoch": 0.6611, + "grad_norm": 2.349475145339966, + "learning_rate": 6.21836046404427e-06, + "loss": 0.2338, + "step": 13222 + }, + { + "epoch": 0.6612, + "grad_norm": 3.5513904094696045, + "learning_rate": 6.2151292570545215e-06, + "loss": 1.795, + "step": 13224 + }, + { + "epoch": 0.6613, + "grad_norm": 5.459303379058838, + "learning_rate": 6.2118985112397865e-06, + "loss": 0.4792, + "step": 13226 + }, + { + "epoch": 0.6614, + "grad_norm": 2.3438880443573, + "learning_rate": 6.208668226993731e-06, + "loss": 0.5587, + "step": 13228 + }, + { + "epoch": 0.6615, + "grad_norm": 0.2280549257993698, + "learning_rate": 6.205438404709948e-06, + "loss": 0.1639, + "step": 13230 + }, + { + "epoch": 0.6616, + "grad_norm": 3.1995913982391357, + "learning_rate": 6.202209044781991e-06, + "loss": 0.8549, + "step": 13232 + }, + { + "epoch": 0.6617, + "grad_norm": 1.1505849361419678, + "learning_rate": 6.1989801476033385e-06, + "loss": 0.7074, + "step": 13234 + }, + { + "epoch": 0.6618, + "grad_norm": 7.3956685066223145, + "learning_rate": 6.195751713567426e-06, + "loss": 1.0817, + "step": 13236 + }, + { + "epoch": 0.6619, + "grad_norm": 1.8950586318969727, + "learning_rate": 6.192523743067627e-06, + "loss": 1.1095, + "step": 13238 + }, + { + "epoch": 0.662, + "grad_norm": 9.03689193725586, + "learning_rate": 6.18929623649726e-06, + "loss": 0.7421, + "step": 13240 + }, + { + "epoch": 0.6621, + "grad_norm": 3.6595466136932373, + "learning_rate": 6.18606919424959e-06, + "loss": 0.5928, + "step": 13242 + }, + { + "epoch": 0.6622, + "grad_norm": 3.8793842792510986, + "learning_rate": 6.182842616717817e-06, + "loss": 0.5706, + "step": 13244 + }, + { + "epoch": 0.6623, + "grad_norm": 8.445667266845703, + "learning_rate": 6.1796165042950925e-06, + "loss": 0.7075, + "step": 13246 + }, + { + "epoch": 0.6624, + "grad_norm": 4.261101245880127, + "learning_rate": 6.176390857374508e-06, + "loss": 0.9092, + "step": 13248 + }, + { + "epoch": 0.6625, + "grad_norm": 3.5370688438415527, + "learning_rate": 6.173165676349103e-06, + "loss": 0.3508, + "step": 13250 + }, + { + "epoch": 0.6626, + "grad_norm": 9.847397804260254, + "learning_rate": 6.169940961611853e-06, + "loss": 0.629, + "step": 13252 + }, + { + "epoch": 0.6627, + "grad_norm": 5.488986492156982, + "learning_rate": 6.166716713555674e-06, + "loss": 1.4368, + "step": 13254 + }, + { + "epoch": 0.6628, + "grad_norm": 8.14022159576416, + "learning_rate": 6.1634929325734385e-06, + "loss": 0.4013, + "step": 13256 + }, + { + "epoch": 0.6629, + "grad_norm": 4.64265775680542, + "learning_rate": 6.16026961905795e-06, + "loss": 0.5595, + "step": 13258 + }, + { + "epoch": 0.663, + "grad_norm": 2.263746500015259, + "learning_rate": 6.157046773401964e-06, + "loss": 0.7144, + "step": 13260 + }, + { + "epoch": 0.6631, + "grad_norm": 7.673091411590576, + "learning_rate": 6.153824395998169e-06, + "loss": 1.0911, + "step": 13262 + }, + { + "epoch": 0.6632, + "grad_norm": 4.620915412902832, + "learning_rate": 6.150602487239207e-06, + "loss": 0.5446, + "step": 13264 + }, + { + "epoch": 0.6633, + "grad_norm": 0.1675797402858734, + "learning_rate": 6.147381047517655e-06, + "loss": 0.9583, + "step": 13266 + }, + { + "epoch": 0.6634, + "grad_norm": 2.7461400032043457, + "learning_rate": 6.144160077226035e-06, + "loss": 0.8709, + "step": 13268 + }, + { + "epoch": 0.6635, + "grad_norm": 6.5105695724487305, + "learning_rate": 6.140939576756817e-06, + "loss": 0.4647, + "step": 13270 + }, + { + "epoch": 0.6636, + "grad_norm": 8.118753433227539, + "learning_rate": 6.137719546502401e-06, + "loss": 1.7478, + "step": 13272 + }, + { + "epoch": 0.6637, + "grad_norm": 4.809286117553711, + "learning_rate": 6.134499986855145e-06, + "loss": 1.1466, + "step": 13274 + }, + { + "epoch": 0.6638, + "grad_norm": 6.203695774078369, + "learning_rate": 6.131280898207339e-06, + "loss": 0.9101, + "step": 13276 + }, + { + "epoch": 0.6639, + "grad_norm": 3.5607800483703613, + "learning_rate": 6.128062280951223e-06, + "loss": 0.8118, + "step": 13278 + }, + { + "epoch": 0.664, + "grad_norm": 3.697009801864624, + "learning_rate": 6.124844135478971e-06, + "loss": 0.329, + "step": 13280 + }, + { + "epoch": 0.6641, + "grad_norm": 4.530758380889893, + "learning_rate": 6.121626462182708e-06, + "loss": 0.1272, + "step": 13282 + }, + { + "epoch": 0.6642, + "grad_norm": 2.960885524749756, + "learning_rate": 6.118409261454494e-06, + "loss": 1.0417, + "step": 13284 + }, + { + "epoch": 0.6643, + "grad_norm": 4.8666229248046875, + "learning_rate": 6.115192533686341e-06, + "loss": 1.4134, + "step": 13286 + }, + { + "epoch": 0.6644, + "grad_norm": 9.327043533325195, + "learning_rate": 6.1119762792701935e-06, + "loss": 1.0408, + "step": 13288 + }, + { + "epoch": 0.6645, + "grad_norm": 9.989114761352539, + "learning_rate": 6.108760498597939e-06, + "loss": 1.0599, + "step": 13290 + }, + { + "epoch": 0.6646, + "grad_norm": 3.799633264541626, + "learning_rate": 6.1055451920614165e-06, + "loss": 1.4857, + "step": 13292 + }, + { + "epoch": 0.6647, + "grad_norm": 5.778425693511963, + "learning_rate": 6.1023303600523975e-06, + "loss": 0.9707, + "step": 13294 + }, + { + "epoch": 0.6648, + "grad_norm": 2.7759246826171875, + "learning_rate": 6.099116002962604e-06, + "loss": 1.028, + "step": 13296 + }, + { + "epoch": 0.6649, + "grad_norm": 3.3035099506378174, + "learning_rate": 6.09590212118369e-06, + "loss": 0.6602, + "step": 13298 + }, + { + "epoch": 0.665, + "grad_norm": 0.7998601198196411, + "learning_rate": 6.092688715107265e-06, + "loss": 0.2169, + "step": 13300 + }, + { + "epoch": 0.6651, + "grad_norm": 1.0460126399993896, + "learning_rate": 6.089475785124863e-06, + "loss": 0.3259, + "step": 13302 + }, + { + "epoch": 0.6652, + "grad_norm": 4.453971862792969, + "learning_rate": 6.086263331627976e-06, + "loss": 1.1147, + "step": 13304 + }, + { + "epoch": 0.6653, + "grad_norm": 3.788688898086548, + "learning_rate": 6.083051355008034e-06, + "loss": 0.6564, + "step": 13306 + }, + { + "epoch": 0.6654, + "grad_norm": 5.084733009338379, + "learning_rate": 6.079839855656397e-06, + "loss": 0.9474, + "step": 13308 + }, + { + "epoch": 0.6655, + "grad_norm": 0.45406368374824524, + "learning_rate": 6.076628833964389e-06, + "loss": 0.218, + "step": 13310 + }, + { + "epoch": 0.6656, + "grad_norm": 4.795419692993164, + "learning_rate": 6.073418290323251e-06, + "loss": 1.3693, + "step": 13312 + }, + { + "epoch": 0.6657, + "grad_norm": 1.0050653219223022, + "learning_rate": 6.070208225124186e-06, + "loss": 0.4162, + "step": 13314 + }, + { + "epoch": 0.6658, + "grad_norm": 3.3554861545562744, + "learning_rate": 6.066998638758326e-06, + "loss": 1.1026, + "step": 13316 + }, + { + "epoch": 0.6659, + "grad_norm": 5.512768745422363, + "learning_rate": 6.063789531616757e-06, + "loss": 0.7347, + "step": 13318 + }, + { + "epoch": 0.666, + "grad_norm": 2.9597434997558594, + "learning_rate": 6.06058090409049e-06, + "loss": 0.9764, + "step": 13320 + }, + { + "epoch": 0.6661, + "grad_norm": 2.623745918273926, + "learning_rate": 6.05737275657049e-06, + "loss": 0.7368, + "step": 13322 + }, + { + "epoch": 0.6662, + "grad_norm": 5.861087799072266, + "learning_rate": 6.054165089447663e-06, + "loss": 1.1694, + "step": 13324 + }, + { + "epoch": 0.6663, + "grad_norm": 6.355810642242432, + "learning_rate": 6.050957903112848e-06, + "loss": 1.3026, + "step": 13326 + }, + { + "epoch": 0.6664, + "grad_norm": 8.999313354492188, + "learning_rate": 6.047751197956838e-06, + "loss": 1.4444, + "step": 13328 + }, + { + "epoch": 0.6665, + "grad_norm": 5.814700126647949, + "learning_rate": 6.044544974370352e-06, + "loss": 0.875, + "step": 13330 + }, + { + "epoch": 0.6666, + "grad_norm": 10.95617389678955, + "learning_rate": 6.0413392327440635e-06, + "loss": 1.5918, + "step": 13332 + }, + { + "epoch": 0.6667, + "grad_norm": 13.113624572753906, + "learning_rate": 6.0381339734685805e-06, + "loss": 1.7104, + "step": 13334 + }, + { + "epoch": 0.6668, + "grad_norm": 3.8951022624969482, + "learning_rate": 6.0349291969344595e-06, + "loss": 0.7161, + "step": 13336 + }, + { + "epoch": 0.6669, + "grad_norm": 3.977911949157715, + "learning_rate": 6.031724903532184e-06, + "loss": 1.1705, + "step": 13338 + }, + { + "epoch": 0.667, + "grad_norm": 0.6857123374938965, + "learning_rate": 6.028521093652195e-06, + "loss": 0.2021, + "step": 13340 + }, + { + "epoch": 0.6671, + "grad_norm": 7.16627311706543, + "learning_rate": 6.025317767684864e-06, + "loss": 1.1528, + "step": 13342 + }, + { + "epoch": 0.6672, + "grad_norm": 9.040963172912598, + "learning_rate": 6.022114926020504e-06, + "loss": 1.4825, + "step": 13344 + }, + { + "epoch": 0.6673, + "grad_norm": 5.055968761444092, + "learning_rate": 6.018912569049376e-06, + "loss": 1.0484, + "step": 13346 + }, + { + "epoch": 0.6674, + "grad_norm": 9.349472999572754, + "learning_rate": 6.015710697161674e-06, + "loss": 1.2785, + "step": 13348 + }, + { + "epoch": 0.6675, + "grad_norm": 3.72189998626709, + "learning_rate": 6.0125093107475385e-06, + "loss": 0.544, + "step": 13350 + }, + { + "epoch": 0.6676, + "grad_norm": 3.4257538318634033, + "learning_rate": 6.009308410197048e-06, + "loss": 0.7063, + "step": 13352 + }, + { + "epoch": 0.6677, + "grad_norm": 3.906038522720337, + "learning_rate": 6.006107995900224e-06, + "loss": 0.5111, + "step": 13354 + }, + { + "epoch": 0.6678, + "grad_norm": 7.964054107666016, + "learning_rate": 6.002908068247024e-06, + "loss": 1.3044, + "step": 13356 + }, + { + "epoch": 0.6679, + "grad_norm": 21.34299659729004, + "learning_rate": 5.9997086276273545e-06, + "loss": 1.3702, + "step": 13358 + }, + { + "epoch": 0.668, + "grad_norm": 5.556235313415527, + "learning_rate": 5.996509674431053e-06, + "loss": 1.2759, + "step": 13360 + }, + { + "epoch": 0.6681, + "grad_norm": 1.3862098455429077, + "learning_rate": 5.993311209047901e-06, + "loss": 0.0879, + "step": 13362 + }, + { + "epoch": 0.6682, + "grad_norm": 5.320908069610596, + "learning_rate": 5.990113231867629e-06, + "loss": 0.9523, + "step": 13364 + }, + { + "epoch": 0.6683, + "grad_norm": 2.980177164077759, + "learning_rate": 5.986915743279893e-06, + "loss": 1.4054, + "step": 13366 + }, + { + "epoch": 0.6684, + "grad_norm": 4.221327304840088, + "learning_rate": 5.983718743674302e-06, + "loss": 0.9614, + "step": 13368 + }, + { + "epoch": 0.6685, + "grad_norm": 3.6437931060791016, + "learning_rate": 5.9805222334404e-06, + "loss": 1.2876, + "step": 13370 + }, + { + "epoch": 0.6686, + "grad_norm": 6.131580829620361, + "learning_rate": 5.977326212967671e-06, + "loss": 0.2421, + "step": 13372 + }, + { + "epoch": 0.6687, + "grad_norm": 5.652294158935547, + "learning_rate": 5.9741306826455384e-06, + "loss": 1.6038, + "step": 13374 + }, + { + "epoch": 0.6688, + "grad_norm": 4.207012176513672, + "learning_rate": 5.970935642863375e-06, + "loss": 0.6705, + "step": 13376 + }, + { + "epoch": 0.6689, + "grad_norm": 5.079837799072266, + "learning_rate": 5.967741094010479e-06, + "loss": 1.3795, + "step": 13378 + }, + { + "epoch": 0.669, + "grad_norm": 3.392698049545288, + "learning_rate": 5.9645470364761e-06, + "loss": 0.8184, + "step": 13380 + }, + { + "epoch": 0.6691, + "grad_norm": 12.02542495727539, + "learning_rate": 5.9613534706494254e-06, + "loss": 0.9224, + "step": 13382 + }, + { + "epoch": 0.6692, + "grad_norm": 6.679722785949707, + "learning_rate": 5.958160396919577e-06, + "loss": 0.8214, + "step": 13384 + }, + { + "epoch": 0.6693, + "grad_norm": 0.2798656225204468, + "learning_rate": 5.954967815675628e-06, + "loss": 0.3664, + "step": 13386 + }, + { + "epoch": 0.6694, + "grad_norm": 3.106462240219116, + "learning_rate": 5.951775727306577e-06, + "loss": 0.7942, + "step": 13388 + }, + { + "epoch": 0.6695, + "grad_norm": 12.742809295654297, + "learning_rate": 5.948584132201376e-06, + "loss": 1.2752, + "step": 13390 + }, + { + "epoch": 0.6696, + "grad_norm": 4.9504194259643555, + "learning_rate": 5.94539303074891e-06, + "loss": 1.1907, + "step": 13392 + }, + { + "epoch": 0.6697, + "grad_norm": 6.7591729164123535, + "learning_rate": 5.942202423338001e-06, + "loss": 1.0222, + "step": 13394 + }, + { + "epoch": 0.6698, + "grad_norm": 8.999286651611328, + "learning_rate": 5.939012310357422e-06, + "loss": 1.8038, + "step": 13396 + }, + { + "epoch": 0.6699, + "grad_norm": 3.5427052974700928, + "learning_rate": 5.935822692195869e-06, + "loss": 1.8003, + "step": 13398 + }, + { + "epoch": 0.67, + "grad_norm": 4.770504474639893, + "learning_rate": 5.932633569242e-06, + "loss": 1.3312, + "step": 13400 + }, + { + "epoch": 0.6701, + "grad_norm": 3.2127933502197266, + "learning_rate": 5.929444941884388e-06, + "loss": 1.1794, + "step": 13402 + }, + { + "epoch": 0.6702, + "grad_norm": 1.9726755619049072, + "learning_rate": 5.926256810511566e-06, + "loss": 0.9081, + "step": 13404 + }, + { + "epoch": 0.6703, + "grad_norm": 11.550957679748535, + "learning_rate": 5.9230691755119905e-06, + "loss": 1.1078, + "step": 13406 + }, + { + "epoch": 0.6704, + "grad_norm": 2.0927250385284424, + "learning_rate": 5.9198820372740726e-06, + "loss": 0.6661, + "step": 13408 + }, + { + "epoch": 0.6705, + "grad_norm": 5.022547721862793, + "learning_rate": 5.9166953961861536e-06, + "loss": 0.7226, + "step": 13410 + }, + { + "epoch": 0.6706, + "grad_norm": 0.9856055378913879, + "learning_rate": 5.913509252636511e-06, + "loss": 0.3814, + "step": 13412 + }, + { + "epoch": 0.6707, + "grad_norm": 2.8295648097991943, + "learning_rate": 5.910323607013373e-06, + "loss": 0.7058, + "step": 13414 + }, + { + "epoch": 0.6708, + "grad_norm": 3.7977652549743652, + "learning_rate": 5.907138459704895e-06, + "loss": 1.0416, + "step": 13416 + }, + { + "epoch": 0.6709, + "grad_norm": 3.29152250289917, + "learning_rate": 5.903953811099183e-06, + "loss": 1.0206, + "step": 13418 + }, + { + "epoch": 0.671, + "grad_norm": 11.704008102416992, + "learning_rate": 5.900769661584273e-06, + "loss": 0.7159, + "step": 13420 + }, + { + "epoch": 0.6711, + "grad_norm": 6.67307710647583, + "learning_rate": 5.897586011548148e-06, + "loss": 0.7638, + "step": 13422 + }, + { + "epoch": 0.6712, + "grad_norm": 3.4797580242156982, + "learning_rate": 5.894402861378721e-06, + "loss": 0.2655, + "step": 13424 + }, + { + "epoch": 0.6713, + "grad_norm": 7.196858882904053, + "learning_rate": 5.891220211463853e-06, + "loss": 1.1098, + "step": 13426 + }, + { + "epoch": 0.6714, + "grad_norm": 11.329870223999023, + "learning_rate": 5.88803806219134e-06, + "loss": 1.0922, + "step": 13428 + }, + { + "epoch": 0.6715, + "grad_norm": 4.129062175750732, + "learning_rate": 5.884856413948913e-06, + "loss": 0.877, + "step": 13430 + }, + { + "epoch": 0.6716, + "grad_norm": 4.564468860626221, + "learning_rate": 5.881675267124254e-06, + "loss": 0.7119, + "step": 13432 + }, + { + "epoch": 0.6717, + "grad_norm": 4.678146839141846, + "learning_rate": 5.878494622104967e-06, + "loss": 0.5973, + "step": 13434 + }, + { + "epoch": 0.6718, + "grad_norm": 3.482332229614258, + "learning_rate": 5.8753144792786095e-06, + "loss": 1.0041, + "step": 13436 + }, + { + "epoch": 0.6719, + "grad_norm": 6.4192914962768555, + "learning_rate": 5.872134839032671e-06, + "loss": 1.6759, + "step": 13438 + }, + { + "epoch": 0.672, + "grad_norm": 6.679666042327881, + "learning_rate": 5.868955701754584e-06, + "loss": 1.0486, + "step": 13440 + }, + { + "epoch": 0.6721, + "grad_norm": 6.360802173614502, + "learning_rate": 5.865777067831711e-06, + "loss": 1.168, + "step": 13442 + }, + { + "epoch": 0.6722, + "grad_norm": 3.82387638092041, + "learning_rate": 5.862598937651364e-06, + "loss": 0.6756, + "step": 13444 + }, + { + "epoch": 0.6723, + "grad_norm": 6.152732849121094, + "learning_rate": 5.859421311600786e-06, + "loss": 0.6365, + "step": 13446 + }, + { + "epoch": 0.6724, + "grad_norm": 2.324152708053589, + "learning_rate": 5.85624419006716e-06, + "loss": 0.8087, + "step": 13448 + }, + { + "epoch": 0.6725, + "grad_norm": 3.102942705154419, + "learning_rate": 5.853067573437612e-06, + "loss": 0.2103, + "step": 13450 + }, + { + "epoch": 0.6726, + "grad_norm": 2.788695812225342, + "learning_rate": 5.849891462099199e-06, + "loss": 0.7091, + "step": 13452 + }, + { + "epoch": 0.6727, + "grad_norm": 4.523034572601318, + "learning_rate": 5.846715856438923e-06, + "loss": 0.5248, + "step": 13454 + }, + { + "epoch": 0.6728, + "grad_norm": 4.43597412109375, + "learning_rate": 5.843540756843722e-06, + "loss": 0.6396, + "step": 13456 + }, + { + "epoch": 0.6729, + "grad_norm": 3.3275668621063232, + "learning_rate": 5.840366163700474e-06, + "loss": 0.6657, + "step": 13458 + }, + { + "epoch": 0.673, + "grad_norm": 6.224521636962891, + "learning_rate": 5.83719207739599e-06, + "loss": 1.0081, + "step": 13460 + }, + { + "epoch": 0.6731, + "grad_norm": 6.883062362670898, + "learning_rate": 5.8340184983170245e-06, + "loss": 0.8456, + "step": 13462 + }, + { + "epoch": 0.6732, + "grad_norm": 3.242295265197754, + "learning_rate": 5.830845426850268e-06, + "loss": 0.4979, + "step": 13464 + }, + { + "epoch": 0.6733, + "grad_norm": 3.8544013500213623, + "learning_rate": 5.8276728633823494e-06, + "loss": 0.431, + "step": 13466 + }, + { + "epoch": 0.6734, + "grad_norm": 4.884932518005371, + "learning_rate": 5.824500808299836e-06, + "loss": 0.3756, + "step": 13468 + }, + { + "epoch": 0.6735, + "grad_norm": 0.7713149785995483, + "learning_rate": 5.82132926198923e-06, + "loss": 0.5938, + "step": 13470 + }, + { + "epoch": 0.6736, + "grad_norm": 6.501901626586914, + "learning_rate": 5.818158224836987e-06, + "loss": 1.3827, + "step": 13472 + }, + { + "epoch": 0.6737, + "grad_norm": 1.5352202653884888, + "learning_rate": 5.814987697229471e-06, + "loss": 0.4595, + "step": 13474 + }, + { + "epoch": 0.6738, + "grad_norm": 4.25168514251709, + "learning_rate": 5.811817679553018e-06, + "loss": 1.0668, + "step": 13476 + }, + { + "epoch": 0.6739, + "grad_norm": 3.648937225341797, + "learning_rate": 5.8086481721938685e-06, + "loss": 0.8195, + "step": 13478 + }, + { + "epoch": 0.674, + "grad_norm": 4.09222936630249, + "learning_rate": 5.8054791755382286e-06, + "loss": 0.9282, + "step": 13480 + }, + { + "epoch": 0.6741, + "grad_norm": 5.374998569488525, + "learning_rate": 5.8023106899722325e-06, + "loss": 1.0782, + "step": 13482 + }, + { + "epoch": 0.6742, + "grad_norm": 6.287595272064209, + "learning_rate": 5.799142715881938e-06, + "loss": 1.1529, + "step": 13484 + }, + { + "epoch": 0.6743, + "grad_norm": 6.141125202178955, + "learning_rate": 5.795975253653364e-06, + "loss": 0.462, + "step": 13486 + }, + { + "epoch": 0.6744, + "grad_norm": 6.749085903167725, + "learning_rate": 5.792808303672454e-06, + "loss": 1.0616, + "step": 13488 + }, + { + "epoch": 0.6745, + "grad_norm": 3.8208365440368652, + "learning_rate": 5.789641866325091e-06, + "loss": 1.0254, + "step": 13490 + }, + { + "epoch": 0.6746, + "grad_norm": 4.440393924713135, + "learning_rate": 5.786475941997094e-06, + "loss": 1.4455, + "step": 13492 + }, + { + "epoch": 0.6747, + "grad_norm": 0.13435932993888855, + "learning_rate": 5.783310531074223e-06, + "loss": 0.3873, + "step": 13494 + }, + { + "epoch": 0.6748, + "grad_norm": 7.083047389984131, + "learning_rate": 5.780145633942173e-06, + "loss": 1.544, + "step": 13496 + }, + { + "epoch": 0.6749, + "grad_norm": 1.7228832244873047, + "learning_rate": 5.776981250986578e-06, + "loss": 0.8982, + "step": 13498 + }, + { + "epoch": 0.675, + "grad_norm": 7.1800312995910645, + "learning_rate": 5.773817382593008e-06, + "loss": 1.3435, + "step": 13500 + }, + { + "epoch": 0.6751, + "grad_norm": 4.531774044036865, + "learning_rate": 5.770654029146969e-06, + "loss": 0.9952, + "step": 13502 + }, + { + "epoch": 0.6752, + "grad_norm": 3.858125925064087, + "learning_rate": 5.7674911910339094e-06, + "loss": 0.6382, + "step": 13504 + }, + { + "epoch": 0.6753, + "grad_norm": 3.868234395980835, + "learning_rate": 5.7643288686392085e-06, + "loss": 0.5731, + "step": 13506 + }, + { + "epoch": 0.6754, + "grad_norm": 4.322309494018555, + "learning_rate": 5.761167062348187e-06, + "loss": 0.7065, + "step": 13508 + }, + { + "epoch": 0.6755, + "grad_norm": 5.688404560089111, + "learning_rate": 5.758005772546097e-06, + "loss": 1.0042, + "step": 13510 + }, + { + "epoch": 0.6756, + "grad_norm": 21.85991668701172, + "learning_rate": 5.754844999618144e-06, + "loss": 0.6075, + "step": 13512 + }, + { + "epoch": 0.6757, + "grad_norm": 3.097184419631958, + "learning_rate": 5.751684743949444e-06, + "loss": 0.9236, + "step": 13514 + }, + { + "epoch": 0.6758, + "grad_norm": 2.351855993270874, + "learning_rate": 5.748525005925074e-06, + "loss": 0.7576, + "step": 13516 + }, + { + "epoch": 0.6759, + "grad_norm": 10.803197860717773, + "learning_rate": 5.745365785930041e-06, + "loss": 0.2283, + "step": 13518 + }, + { + "epoch": 0.676, + "grad_norm": 3.5034008026123047, + "learning_rate": 5.742207084349274e-06, + "loss": 0.7587, + "step": 13520 + }, + { + "epoch": 0.6761, + "grad_norm": 0.7738643288612366, + "learning_rate": 5.739048901567665e-06, + "loss": 0.3398, + "step": 13522 + }, + { + "epoch": 0.6762, + "grad_norm": 0.7832783460617065, + "learning_rate": 5.735891237970015e-06, + "loss": 0.4572, + "step": 13524 + }, + { + "epoch": 0.6763, + "grad_norm": 6.808251857757568, + "learning_rate": 5.732734093941087e-06, + "loss": 0.752, + "step": 13526 + }, + { + "epoch": 0.6764, + "grad_norm": 9.039656639099121, + "learning_rate": 5.729577469865566e-06, + "loss": 0.5405, + "step": 13528 + }, + { + "epoch": 0.6765, + "grad_norm": 5.397425174713135, + "learning_rate": 5.726421366128076e-06, + "loss": 1.6564, + "step": 13530 + }, + { + "epoch": 0.6766, + "grad_norm": 7.434508800506592, + "learning_rate": 5.723265783113181e-06, + "loss": 1.006, + "step": 13532 + }, + { + "epoch": 0.6767, + "grad_norm": 7.234251976013184, + "learning_rate": 5.720110721205377e-06, + "loss": 1.3749, + "step": 13534 + }, + { + "epoch": 0.6768, + "grad_norm": 3.2089178562164307, + "learning_rate": 5.716956180789098e-06, + "loss": 1.3286, + "step": 13536 + }, + { + "epoch": 0.6769, + "grad_norm": 3.928860664367676, + "learning_rate": 5.713802162248718e-06, + "loss": 1.1903, + "step": 13538 + }, + { + "epoch": 0.677, + "grad_norm": 4.016988277435303, + "learning_rate": 5.710648665968543e-06, + "loss": 0.5556, + "step": 13540 + }, + { + "epoch": 0.6771, + "grad_norm": 4.370832920074463, + "learning_rate": 5.707495692332816e-06, + "loss": 0.3307, + "step": 13542 + }, + { + "epoch": 0.6772, + "grad_norm": 8.386446952819824, + "learning_rate": 5.704343241725719e-06, + "loss": 0.9282, + "step": 13544 + }, + { + "epoch": 0.6773, + "grad_norm": 9.584386825561523, + "learning_rate": 5.701191314531364e-06, + "loss": 0.8667, + "step": 13546 + }, + { + "epoch": 0.6774, + "grad_norm": 8.382237434387207, + "learning_rate": 5.698039911133816e-06, + "loss": 1.8013, + "step": 13548 + }, + { + "epoch": 0.6775, + "grad_norm": 6.255984783172607, + "learning_rate": 5.694889031917047e-06, + "loss": 1.3623, + "step": 13550 + }, + { + "epoch": 0.6776, + "grad_norm": 4.601725101470947, + "learning_rate": 5.691738677265e-06, + "loss": 0.7365, + "step": 13552 + }, + { + "epoch": 0.6777, + "grad_norm": 2.1295924186706543, + "learning_rate": 5.68858884756152e-06, + "loss": 0.1025, + "step": 13554 + }, + { + "epoch": 0.6778, + "grad_norm": 2.473543167114258, + "learning_rate": 5.685439543190409e-06, + "loss": 0.3188, + "step": 13556 + }, + { + "epoch": 0.6779, + "grad_norm": 2.248430013656616, + "learning_rate": 5.68229076453541e-06, + "loss": 0.9468, + "step": 13558 + }, + { + "epoch": 0.678, + "grad_norm": 3.3589398860931396, + "learning_rate": 5.679142511980176e-06, + "loss": 0.6635, + "step": 13560 + }, + { + "epoch": 0.6781, + "grad_norm": 3.1146044731140137, + "learning_rate": 5.67599478590833e-06, + "loss": 0.5673, + "step": 13562 + }, + { + "epoch": 0.6782, + "grad_norm": 5.812463760375977, + "learning_rate": 5.672847586703393e-06, + "loss": 0.6723, + "step": 13564 + }, + { + "epoch": 0.6783, + "grad_norm": 19.041399002075195, + "learning_rate": 5.6697009147488566e-06, + "loss": 0.6378, + "step": 13566 + }, + { + "epoch": 0.6784, + "grad_norm": 5.839048862457275, + "learning_rate": 5.666554770428129e-06, + "loss": 0.6387, + "step": 13568 + }, + { + "epoch": 0.6785, + "grad_norm": 14.690051078796387, + "learning_rate": 5.663409154124557e-06, + "loss": 1.1485, + "step": 13570 + }, + { + "epoch": 0.6786, + "grad_norm": 6.034825801849365, + "learning_rate": 5.660264066221426e-06, + "loss": 0.9526, + "step": 13572 + }, + { + "epoch": 0.6787, + "grad_norm": 4.336225986480713, + "learning_rate": 5.657119507101955e-06, + "loss": 0.9826, + "step": 13574 + }, + { + "epoch": 0.6788, + "grad_norm": 2.9649789333343506, + "learning_rate": 5.653975477149298e-06, + "loss": 1.0551, + "step": 13576 + }, + { + "epoch": 0.6789, + "grad_norm": 12.6412935256958, + "learning_rate": 5.650831976746547e-06, + "loss": 0.8034, + "step": 13578 + }, + { + "epoch": 0.679, + "grad_norm": 3.870180368423462, + "learning_rate": 5.647689006276727e-06, + "loss": 1.197, + "step": 13580 + }, + { + "epoch": 0.6791, + "grad_norm": 2.3551266193389893, + "learning_rate": 5.644546566122799e-06, + "loss": 1.2933, + "step": 13582 + }, + { + "epoch": 0.6792, + "grad_norm": 3.0311732292175293, + "learning_rate": 5.641404656667661e-06, + "loss": 0.9408, + "step": 13584 + }, + { + "epoch": 0.6793, + "grad_norm": 2.9135684967041016, + "learning_rate": 5.6382632782941405e-06, + "loss": 0.668, + "step": 13586 + }, + { + "epoch": 0.6794, + "grad_norm": 9.344533920288086, + "learning_rate": 5.6351224313850165e-06, + "loss": 1.7407, + "step": 13588 + }, + { + "epoch": 0.6795, + "grad_norm": 0.10699179768562317, + "learning_rate": 5.631982116322981e-06, + "loss": 0.6172, + "step": 13590 + }, + { + "epoch": 0.6796, + "grad_norm": 3.0056183338165283, + "learning_rate": 5.628842333490674e-06, + "loss": 0.8342, + "step": 13592 + }, + { + "epoch": 0.6797, + "grad_norm": 5.015976905822754, + "learning_rate": 5.625703083270669e-06, + "loss": 0.243, + "step": 13594 + }, + { + "epoch": 0.6798, + "grad_norm": 5.6259074211120605, + "learning_rate": 5.622564366045472e-06, + "loss": 0.7616, + "step": 13596 + }, + { + "epoch": 0.6799, + "grad_norm": 2.656156301498413, + "learning_rate": 5.619426182197536e-06, + "loss": 1.2752, + "step": 13598 + }, + { + "epoch": 0.68, + "grad_norm": 3.0627052783966064, + "learning_rate": 5.616288532109225e-06, + "loss": 0.6636, + "step": 13600 + }, + { + "epoch": 0.6801, + "grad_norm": 4.973540782928467, + "learning_rate": 5.613151416162863e-06, + "loss": 0.0959, + "step": 13602 + }, + { + "epoch": 0.6802, + "grad_norm": 7.260588645935059, + "learning_rate": 5.610014834740694e-06, + "loss": 1.0585, + "step": 13604 + }, + { + "epoch": 0.6803, + "grad_norm": 6.019016742706299, + "learning_rate": 5.606878788224901e-06, + "loss": 1.044, + "step": 13606 + }, + { + "epoch": 0.6804, + "grad_norm": 21.18696403503418, + "learning_rate": 5.603743276997607e-06, + "loss": 1.4264, + "step": 13608 + }, + { + "epoch": 0.6805, + "grad_norm": 11.879512786865234, + "learning_rate": 5.600608301440848e-06, + "loss": 0.8408, + "step": 13610 + }, + { + "epoch": 0.6806, + "grad_norm": 1.7944616079330444, + "learning_rate": 5.59747386193663e-06, + "loss": 0.7944, + "step": 13612 + }, + { + "epoch": 0.6807, + "grad_norm": 4.487357139587402, + "learning_rate": 5.5943399588668665e-06, + "loss": 0.6002, + "step": 13614 + }, + { + "epoch": 0.6808, + "grad_norm": 7.6161112785339355, + "learning_rate": 5.591206592613416e-06, + "loss": 1.7993, + "step": 13616 + }, + { + "epoch": 0.6809, + "grad_norm": 3.359732151031494, + "learning_rate": 5.588073763558068e-06, + "loss": 1.2557, + "step": 13618 + }, + { + "epoch": 0.681, + "grad_norm": 4.7611870765686035, + "learning_rate": 5.584941472082549e-06, + "loss": 1.3086, + "step": 13620 + }, + { + "epoch": 0.6811, + "grad_norm": 5.212316989898682, + "learning_rate": 5.5818097185685204e-06, + "loss": 1.0846, + "step": 13622 + }, + { + "epoch": 0.6812, + "grad_norm": 4.11433744430542, + "learning_rate": 5.5786785033975745e-06, + "loss": 0.4301, + "step": 13624 + }, + { + "epoch": 0.6813, + "grad_norm": 9.146018028259277, + "learning_rate": 5.575547826951242e-06, + "loss": 0.9317, + "step": 13626 + }, + { + "epoch": 0.6814, + "grad_norm": 1.9888914823532104, + "learning_rate": 5.572417689610987e-06, + "loss": 1.3389, + "step": 13628 + }, + { + "epoch": 0.6815, + "grad_norm": 3.3057305812835693, + "learning_rate": 5.569288091758205e-06, + "loss": 1.319, + "step": 13630 + }, + { + "epoch": 0.6816, + "grad_norm": 3.105041265487671, + "learning_rate": 5.5661590337742255e-06, + "loss": 1.2621, + "step": 13632 + }, + { + "epoch": 0.6817, + "grad_norm": 14.312016487121582, + "learning_rate": 5.563030516040327e-06, + "loss": 1.434, + "step": 13634 + }, + { + "epoch": 0.6818, + "grad_norm": 5.278201103210449, + "learning_rate": 5.559902538937694e-06, + "loss": 0.7258, + "step": 13636 + }, + { + "epoch": 0.6819, + "grad_norm": 11.623627662658691, + "learning_rate": 5.556775102847475e-06, + "loss": 0.4361, + "step": 13638 + }, + { + "epoch": 0.682, + "grad_norm": 1.98795485496521, + "learning_rate": 5.553648208150728e-06, + "loss": 0.9267, + "step": 13640 + }, + { + "epoch": 0.6821, + "grad_norm": 3.8071601390838623, + "learning_rate": 5.5505218552284565e-06, + "loss": 0.3771, + "step": 13642 + }, + { + "epoch": 0.6822, + "grad_norm": 4.452145576477051, + "learning_rate": 5.5473960444616085e-06, + "loss": 0.923, + "step": 13644 + }, + { + "epoch": 0.6823, + "grad_norm": 4.791646480560303, + "learning_rate": 5.544270776231038e-06, + "loss": 0.8614, + "step": 13646 + }, + { + "epoch": 0.6824, + "grad_norm": 4.364182949066162, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.9929, + "step": 13648 + }, + { + "epoch": 0.6825, + "grad_norm": 5.32682466506958, + "learning_rate": 5.5380218689019125e-06, + "loss": 0.7036, + "step": 13650 + }, + { + "epoch": 0.6826, + "grad_norm": 41.631256103515625, + "learning_rate": 5.534898230564765e-06, + "loss": 2.2198, + "step": 13652 + }, + { + "epoch": 0.6827, + "grad_norm": 2.8122501373291016, + "learning_rate": 5.531775136286724e-06, + "loss": 1.0313, + "step": 13654 + }, + { + "epoch": 0.6828, + "grad_norm": 20.391557693481445, + "learning_rate": 5.5286525864483285e-06, + "loss": 1.5585, + "step": 13656 + }, + { + "epoch": 0.6829, + "grad_norm": 2.7624502182006836, + "learning_rate": 5.5255305814300545e-06, + "loss": 0.8689, + "step": 13658 + }, + { + "epoch": 0.683, + "grad_norm": 1.2101545333862305, + "learning_rate": 5.522409121612304e-06, + "loss": 0.752, + "step": 13660 + }, + { + "epoch": 0.6831, + "grad_norm": 5.166439056396484, + "learning_rate": 5.519288207375422e-06, + "loss": 0.6381, + "step": 13662 + }, + { + "epoch": 0.6832, + "grad_norm": 2.317445993423462, + "learning_rate": 5.516167839099679e-06, + "loss": 0.897, + "step": 13664 + }, + { + "epoch": 0.6833, + "grad_norm": 3.806579113006592, + "learning_rate": 5.5130480171652845e-06, + "loss": 0.6541, + "step": 13666 + }, + { + "epoch": 0.6834, + "grad_norm": 6.880817890167236, + "learning_rate": 5.50992874195238e-06, + "loss": 0.9897, + "step": 13668 + }, + { + "epoch": 0.6835, + "grad_norm": 7.346495628356934, + "learning_rate": 5.506810013841036e-06, + "loss": 0.7837, + "step": 13670 + }, + { + "epoch": 0.6836, + "grad_norm": 4.778952598571777, + "learning_rate": 5.50369183321126e-06, + "loss": 0.4631, + "step": 13672 + }, + { + "epoch": 0.6837, + "grad_norm": 3.6120965480804443, + "learning_rate": 5.500574200443003e-06, + "loss": 0.5925, + "step": 13674 + }, + { + "epoch": 0.6838, + "grad_norm": 6.647490501403809, + "learning_rate": 5.497457115916127e-06, + "loss": 1.0488, + "step": 13676 + }, + { + "epoch": 0.6839, + "grad_norm": 3.839970350265503, + "learning_rate": 5.494340580010441e-06, + "loss": 1.4379, + "step": 13678 + }, + { + "epoch": 0.684, + "grad_norm": 5.894059658050537, + "learning_rate": 5.491224593105695e-06, + "loss": 1.0054, + "step": 13680 + }, + { + "epoch": 0.6841, + "grad_norm": 2.5455851554870605, + "learning_rate": 5.488109155581549e-06, + "loss": 0.8366, + "step": 13682 + }, + { + "epoch": 0.6842, + "grad_norm": 7.139078140258789, + "learning_rate": 5.484994267817624e-06, + "loss": 0.8501, + "step": 13684 + }, + { + "epoch": 0.6843, + "grad_norm": 1.6801639795303345, + "learning_rate": 5.4818799301934435e-06, + "loss": 0.9123, + "step": 13686 + }, + { + "epoch": 0.6844, + "grad_norm": 5.29597282409668, + "learning_rate": 5.478766143088492e-06, + "loss": 0.5346, + "step": 13688 + }, + { + "epoch": 0.6845, + "grad_norm": 5.420430660247803, + "learning_rate": 5.475652906882173e-06, + "loss": 1.178, + "step": 13690 + }, + { + "epoch": 0.6846, + "grad_norm": 3.895549774169922, + "learning_rate": 5.472540221953824e-06, + "loss": 1.1757, + "step": 13692 + }, + { + "epoch": 0.6847, + "grad_norm": 2.2968058586120605, + "learning_rate": 5.469428088682718e-06, + "loss": 1.1508, + "step": 13694 + }, + { + "epoch": 0.6848, + "grad_norm": 2.9521501064300537, + "learning_rate": 5.466316507448049e-06, + "loss": 0.3329, + "step": 13696 + }, + { + "epoch": 0.6849, + "grad_norm": 3.5049118995666504, + "learning_rate": 5.463205478628965e-06, + "loss": 0.9698, + "step": 13698 + }, + { + "epoch": 0.685, + "grad_norm": 9.316485404968262, + "learning_rate": 5.460095002604533e-06, + "loss": 1.1357, + "step": 13700 + }, + { + "epoch": 0.6851, + "grad_norm": 3.8700649738311768, + "learning_rate": 5.456985079753754e-06, + "loss": 0.9356, + "step": 13702 + }, + { + "epoch": 0.6852, + "grad_norm": 1.4358758926391602, + "learning_rate": 5.453875710455562e-06, + "loss": 0.684, + "step": 13704 + }, + { + "epoch": 0.6853, + "grad_norm": 3.7468044757843018, + "learning_rate": 5.450766895088825e-06, + "loss": 1.5571, + "step": 13706 + }, + { + "epoch": 0.6854, + "grad_norm": 3.3428776264190674, + "learning_rate": 5.447658634032338e-06, + "loss": 0.7214, + "step": 13708 + }, + { + "epoch": 0.6855, + "grad_norm": 3.120806932449341, + "learning_rate": 5.444550927664847e-06, + "loss": 1.4491, + "step": 13710 + }, + { + "epoch": 0.6856, + "grad_norm": 12.809388160705566, + "learning_rate": 5.441443776365003e-06, + "loss": 0.93, + "step": 13712 + }, + { + "epoch": 0.6857, + "grad_norm": 6.0728325843811035, + "learning_rate": 5.438337180511406e-06, + "loss": 1.7382, + "step": 13714 + }, + { + "epoch": 0.6858, + "grad_norm": 5.081286430358887, + "learning_rate": 5.435231140482588e-06, + "loss": 1.0998, + "step": 13716 + }, + { + "epoch": 0.6859, + "grad_norm": 7.269882678985596, + "learning_rate": 5.432125656657004e-06, + "loss": 0.4875, + "step": 13718 + }, + { + "epoch": 0.686, + "grad_norm": 5.526793003082275, + "learning_rate": 5.429020729413062e-06, + "loss": 1.0343, + "step": 13720 + }, + { + "epoch": 0.6861, + "grad_norm": 12.287006378173828, + "learning_rate": 5.42591635912907e-06, + "loss": 1.0808, + "step": 13722 + }, + { + "epoch": 0.6862, + "grad_norm": 3.0442233085632324, + "learning_rate": 5.4228125461833026e-06, + "loss": 1.4148, + "step": 13724 + }, + { + "epoch": 0.6863, + "grad_norm": 0.7420535087585449, + "learning_rate": 5.4197092909539365e-06, + "loss": 0.8143, + "step": 13726 + }, + { + "epoch": 0.6864, + "grad_norm": 3.0737085342407227, + "learning_rate": 5.416606593819102e-06, + "loss": 1.051, + "step": 13728 + }, + { + "epoch": 0.6865, + "grad_norm": 3.385629653930664, + "learning_rate": 5.413504455156855e-06, + "loss": 0.6104, + "step": 13730 + }, + { + "epoch": 0.6866, + "grad_norm": 4.532958507537842, + "learning_rate": 5.41040287534517e-06, + "loss": 0.8762, + "step": 13732 + }, + { + "epoch": 0.6867, + "grad_norm": 3.5983121395111084, + "learning_rate": 5.407301854761977e-06, + "loss": 0.6675, + "step": 13734 + }, + { + "epoch": 0.6868, + "grad_norm": 2.885291576385498, + "learning_rate": 5.404201393785123e-06, + "loss": 1.1238, + "step": 13736 + }, + { + "epoch": 0.6869, + "grad_norm": 10.916486740112305, + "learning_rate": 5.401101492792386e-06, + "loss": 0.5519, + "step": 13738 + }, + { + "epoch": 0.687, + "grad_norm": 3.5777201652526855, + "learning_rate": 5.398002152161484e-06, + "loss": 0.9502, + "step": 13740 + }, + { + "epoch": 0.6871, + "grad_norm": 4.365091323852539, + "learning_rate": 5.394903372270062e-06, + "loss": 1.0324, + "step": 13742 + }, + { + "epoch": 0.6872, + "grad_norm": 5.102996826171875, + "learning_rate": 5.391805153495693e-06, + "loss": 1.4968, + "step": 13744 + }, + { + "epoch": 0.6873, + "grad_norm": 4.863210678100586, + "learning_rate": 5.388707496215888e-06, + "loss": 0.5552, + "step": 13746 + }, + { + "epoch": 0.6874, + "grad_norm": 5.452638149261475, + "learning_rate": 5.385610400808088e-06, + "loss": 0.5538, + "step": 13748 + }, + { + "epoch": 0.6875, + "grad_norm": 1.9486876726150513, + "learning_rate": 5.382513867649663e-06, + "loss": 0.9903, + "step": 13750 + }, + { + "epoch": 0.6876, + "grad_norm": 2.149754285812378, + "learning_rate": 5.379417897117917e-06, + "loss": 1.0408, + "step": 13752 + }, + { + "epoch": 0.6877, + "grad_norm": 5.530539512634277, + "learning_rate": 5.376322489590085e-06, + "loss": 0.9781, + "step": 13754 + }, + { + "epoch": 0.6878, + "grad_norm": 2.0520381927490234, + "learning_rate": 5.373227645443332e-06, + "loss": 0.8663, + "step": 13756 + }, + { + "epoch": 0.6879, + "grad_norm": 5.609691619873047, + "learning_rate": 5.3701333650547525e-06, + "loss": 0.7105, + "step": 13758 + }, + { + "epoch": 0.688, + "grad_norm": 2.30966854095459, + "learning_rate": 5.367039648801386e-06, + "loss": 0.5953, + "step": 13760 + }, + { + "epoch": 0.6881, + "grad_norm": 2.3130431175231934, + "learning_rate": 5.3639464970601775e-06, + "loss": 0.3991, + "step": 13762 + }, + { + "epoch": 0.6882, + "grad_norm": 6.1367316246032715, + "learning_rate": 5.360853910208028e-06, + "loss": 1.6389, + "step": 13764 + }, + { + "epoch": 0.6883, + "grad_norm": 2.5915191173553467, + "learning_rate": 5.357761888621764e-06, + "loss": 1.073, + "step": 13766 + }, + { + "epoch": 0.6884, + "grad_norm": 8.802299499511719, + "learning_rate": 5.354670432678124e-06, + "loss": 1.3266, + "step": 13768 + }, + { + "epoch": 0.6885, + "grad_norm": 1.2302074432373047, + "learning_rate": 5.351579542753808e-06, + "loss": 0.2211, + "step": 13770 + }, + { + "epoch": 0.6886, + "grad_norm": 4.3161540031433105, + "learning_rate": 5.348489219225417e-06, + "loss": 1.2319, + "step": 13772 + }, + { + "epoch": 0.6887, + "grad_norm": 0.11971203237771988, + "learning_rate": 5.345399462469509e-06, + "loss": 0.4616, + "step": 13774 + }, + { + "epoch": 0.6888, + "grad_norm": 11.48206615447998, + "learning_rate": 5.342310272862558e-06, + "loss": 1.1347, + "step": 13776 + }, + { + "epoch": 0.6889, + "grad_norm": 6.94742488861084, + "learning_rate": 5.339221650780971e-06, + "loss": 1.5102, + "step": 13778 + }, + { + "epoch": 0.689, + "grad_norm": 4.35225772857666, + "learning_rate": 5.336133596601089e-06, + "loss": 1.1828, + "step": 13780 + }, + { + "epoch": 0.6891, + "grad_norm": 9.11980152130127, + "learning_rate": 5.33304611069918e-06, + "loss": 1.2329, + "step": 13782 + }, + { + "epoch": 0.6892, + "grad_norm": 3.086998224258423, + "learning_rate": 5.3299591934514485e-06, + "loss": 1.4424, + "step": 13784 + }, + { + "epoch": 0.6893, + "grad_norm": 7.012237071990967, + "learning_rate": 5.326872845234021e-06, + "loss": 1.1382, + "step": 13786 + }, + { + "epoch": 0.6894, + "grad_norm": 1.628464937210083, + "learning_rate": 5.323787066422964e-06, + "loss": 0.9765, + "step": 13788 + }, + { + "epoch": 0.6895, + "grad_norm": 7.869226455688477, + "learning_rate": 5.3207018573942684e-06, + "loss": 0.6795, + "step": 13790 + }, + { + "epoch": 0.6896, + "grad_norm": 16.72992515563965, + "learning_rate": 5.317617218523856e-06, + "loss": 1.7227, + "step": 13792 + }, + { + "epoch": 0.6897, + "grad_norm": 5.953192710876465, + "learning_rate": 5.3145331501875796e-06, + "loss": 0.6015, + "step": 13794 + }, + { + "epoch": 0.6898, + "grad_norm": 0.3689664602279663, + "learning_rate": 5.311449652761235e-06, + "loss": 0.4982, + "step": 13796 + }, + { + "epoch": 0.6899, + "grad_norm": 26.371801376342773, + "learning_rate": 5.30836672662052e-06, + "loss": 1.9505, + "step": 13798 + }, + { + "epoch": 0.69, + "grad_norm": 3.3308017253875732, + "learning_rate": 5.305284372141095e-06, + "loss": 1.0305, + "step": 13800 + }, + { + "epoch": 0.6901, + "grad_norm": 2.8750176429748535, + "learning_rate": 5.302202589698525e-06, + "loss": 0.3966, + "step": 13802 + }, + { + "epoch": 0.6902, + "grad_norm": 10.604126930236816, + "learning_rate": 5.299121379668316e-06, + "loss": 0.8258, + "step": 13804 + }, + { + "epoch": 0.6903, + "grad_norm": 3.940150499343872, + "learning_rate": 5.2960407424259165e-06, + "loss": 0.8577, + "step": 13806 + }, + { + "epoch": 0.6904, + "grad_norm": 6.422769546508789, + "learning_rate": 5.292960678346674e-06, + "loss": 1.0218, + "step": 13808 + }, + { + "epoch": 0.6905, + "grad_norm": 4.573817253112793, + "learning_rate": 5.2898811878059e-06, + "loss": 0.9242, + "step": 13810 + }, + { + "epoch": 0.6906, + "grad_norm": 2.2993900775909424, + "learning_rate": 5.286802271178815e-06, + "loss": 0.5927, + "step": 13812 + }, + { + "epoch": 0.6907, + "grad_norm": 1.7524733543395996, + "learning_rate": 5.283723928840578e-06, + "loss": 0.5782, + "step": 13814 + }, + { + "epoch": 0.6908, + "grad_norm": 5.955625534057617, + "learning_rate": 5.280646161166274e-06, + "loss": 0.9158, + "step": 13816 + }, + { + "epoch": 0.6909, + "grad_norm": 1.6922942399978638, + "learning_rate": 5.27756896853092e-06, + "loss": 0.3239, + "step": 13818 + }, + { + "epoch": 0.691, + "grad_norm": 5.265444278717041, + "learning_rate": 5.274492351309462e-06, + "loss": 0.4576, + "step": 13820 + }, + { + "epoch": 0.6911, + "grad_norm": 5.016435623168945, + "learning_rate": 5.2714163098767755e-06, + "loss": 0.7704, + "step": 13822 + }, + { + "epoch": 0.6912, + "grad_norm": 2.802854061126709, + "learning_rate": 5.26834084460767e-06, + "loss": 0.5287, + "step": 13824 + }, + { + "epoch": 0.6913, + "grad_norm": 2.3842453956604004, + "learning_rate": 5.2652659558768795e-06, + "loss": 0.6947, + "step": 13826 + }, + { + "epoch": 0.6914, + "grad_norm": 2.198586940765381, + "learning_rate": 5.262191644059071e-06, + "loss": 1.0274, + "step": 13828 + }, + { + "epoch": 0.6915, + "grad_norm": 4.721749782562256, + "learning_rate": 5.259117909528839e-06, + "loss": 1.2881, + "step": 13830 + }, + { + "epoch": 0.6916, + "grad_norm": 4.137075901031494, + "learning_rate": 5.256044752660709e-06, + "loss": 1.3121, + "step": 13832 + }, + { + "epoch": 0.6917, + "grad_norm": 8.844496726989746, + "learning_rate": 5.252972173829132e-06, + "loss": 1.8639, + "step": 13834 + }, + { + "epoch": 0.6918, + "grad_norm": 2.830512762069702, + "learning_rate": 5.2499001734085045e-06, + "loss": 1.1084, + "step": 13836 + }, + { + "epoch": 0.6919, + "grad_norm": 1.8161470890045166, + "learning_rate": 5.246828751773128e-06, + "loss": 0.7848, + "step": 13838 + }, + { + "epoch": 0.692, + "grad_norm": 3.433748722076416, + "learning_rate": 5.243757909297247e-06, + "loss": 0.764, + "step": 13840 + }, + { + "epoch": 0.6921, + "grad_norm": 3.083862781524658, + "learning_rate": 5.240687646355045e-06, + "loss": 0.9819, + "step": 13842 + }, + { + "epoch": 0.6922, + "grad_norm": 9.997371673583984, + "learning_rate": 5.237617963320608e-06, + "loss": 1.7121, + "step": 13844 + }, + { + "epoch": 0.6923, + "grad_norm": 4.239420413970947, + "learning_rate": 5.234548860567985e-06, + "loss": 0.6071, + "step": 13846 + }, + { + "epoch": 0.6924, + "grad_norm": 9.365878105163574, + "learning_rate": 5.23148033847112e-06, + "loss": 1.038, + "step": 13848 + }, + { + "epoch": 0.6925, + "grad_norm": 0.2564602494239807, + "learning_rate": 5.228412397403916e-06, + "loss": 0.8875, + "step": 13850 + }, + { + "epoch": 0.6926, + "grad_norm": 6.822200298309326, + "learning_rate": 5.225345037740186e-06, + "loss": 0.4921, + "step": 13852 + }, + { + "epoch": 0.6927, + "grad_norm": 9.246847152709961, + "learning_rate": 5.222278259853681e-06, + "loss": 1.0566, + "step": 13854 + }, + { + "epoch": 0.6928, + "grad_norm": 3.5523548126220703, + "learning_rate": 5.219212064118079e-06, + "loss": 0.4194, + "step": 13856 + }, + { + "epoch": 0.6929, + "grad_norm": 6.395331382751465, + "learning_rate": 5.216146450906984e-06, + "loss": 0.3122, + "step": 13858 + }, + { + "epoch": 0.693, + "grad_norm": 8.167457580566406, + "learning_rate": 5.213081420593933e-06, + "loss": 1.5786, + "step": 13860 + }, + { + "epoch": 0.6931, + "grad_norm": 6.05748176574707, + "learning_rate": 5.2100169735523906e-06, + "loss": 1.0851, + "step": 13862 + }, + { + "epoch": 0.6932, + "grad_norm": 24.152080535888672, + "learning_rate": 5.2069531101557505e-06, + "loss": 1.5174, + "step": 13864 + }, + { + "epoch": 0.6933, + "grad_norm": 15.932144165039062, + "learning_rate": 5.203889830777336e-06, + "loss": 0.8042, + "step": 13866 + }, + { + "epoch": 0.6934, + "grad_norm": 2.1392667293548584, + "learning_rate": 5.200827135790396e-06, + "loss": 0.9807, + "step": 13868 + }, + { + "epoch": 0.6935, + "grad_norm": 8.566202163696289, + "learning_rate": 5.197765025568109e-06, + "loss": 1.2154, + "step": 13870 + }, + { + "epoch": 0.6936, + "grad_norm": 5.497293949127197, + "learning_rate": 5.194703500483593e-06, + "loss": 0.5615, + "step": 13872 + }, + { + "epoch": 0.6937, + "grad_norm": 4.795002460479736, + "learning_rate": 5.1916425609098775e-06, + "loss": 0.6543, + "step": 13874 + }, + { + "epoch": 0.6938, + "grad_norm": 4.455447673797607, + "learning_rate": 5.188582207219931e-06, + "loss": 1.0948, + "step": 13876 + }, + { + "epoch": 0.6939, + "grad_norm": 2.268562078475952, + "learning_rate": 5.1855224397866475e-06, + "loss": 0.4278, + "step": 13878 + }, + { + "epoch": 0.694, + "grad_norm": 2.2435848712921143, + "learning_rate": 5.1824632589828465e-06, + "loss": 1.0656, + "step": 13880 + }, + { + "epoch": 0.6941, + "grad_norm": 3.777982234954834, + "learning_rate": 5.1794046651812915e-06, + "loss": 1.1951, + "step": 13882 + }, + { + "epoch": 0.6942, + "grad_norm": 3.8525304794311523, + "learning_rate": 5.176346658754648e-06, + "loss": 0.4497, + "step": 13884 + }, + { + "epoch": 0.6943, + "grad_norm": 3.404672622680664, + "learning_rate": 5.1732892400755376e-06, + "loss": 1.1733, + "step": 13886 + }, + { + "epoch": 0.6944, + "grad_norm": 3.1965367794036865, + "learning_rate": 5.1702324095164955e-06, + "loss": 1.2181, + "step": 13888 + }, + { + "epoch": 0.6945, + "grad_norm": 5.601541042327881, + "learning_rate": 5.167176167449977e-06, + "loss": 1.0104, + "step": 13890 + }, + { + "epoch": 0.6946, + "grad_norm": 3.8085477352142334, + "learning_rate": 5.16412051424839e-06, + "loss": 1.0728, + "step": 13892 + }, + { + "epoch": 0.6947, + "grad_norm": 1.9403883218765259, + "learning_rate": 5.161065450284041e-06, + "loss": 1.9502, + "step": 13894 + }, + { + "epoch": 0.6948, + "grad_norm": 3.270271062850952, + "learning_rate": 5.158010975929193e-06, + "loss": 0.9284, + "step": 13896 + }, + { + "epoch": 0.6949, + "grad_norm": 4.3760833740234375, + "learning_rate": 5.154957091556021e-06, + "loss": 0.4761, + "step": 13898 + }, + { + "epoch": 0.695, + "grad_norm": 7.049617290496826, + "learning_rate": 5.151903797536631e-06, + "loss": 0.9593, + "step": 13900 + }, + { + "epoch": 0.6951, + "grad_norm": 5.575537204742432, + "learning_rate": 5.148851094243057e-06, + "loss": 1.3751, + "step": 13902 + }, + { + "epoch": 0.6952, + "grad_norm": 4.795339107513428, + "learning_rate": 5.145798982047261e-06, + "loss": 0.7044, + "step": 13904 + }, + { + "epoch": 0.6953, + "grad_norm": 8.359217643737793, + "learning_rate": 5.142747461321136e-06, + "loss": 1.347, + "step": 13906 + }, + { + "epoch": 0.6954, + "grad_norm": 2.619469404220581, + "learning_rate": 5.139696532436499e-06, + "loss": 0.6788, + "step": 13908 + }, + { + "epoch": 0.6955, + "grad_norm": 4.357308864593506, + "learning_rate": 5.136646195765096e-06, + "loss": 0.8071, + "step": 13910 + }, + { + "epoch": 0.6956, + "grad_norm": 3.711418628692627, + "learning_rate": 5.133596451678603e-06, + "loss": 0.7901, + "step": 13912 + }, + { + "epoch": 0.6957, + "grad_norm": 3.6490321159362793, + "learning_rate": 5.130547300548621e-06, + "loss": 0.8495, + "step": 13914 + }, + { + "epoch": 0.6958, + "grad_norm": 2.6848959922790527, + "learning_rate": 5.127498742746675e-06, + "loss": 0.9852, + "step": 13916 + }, + { + "epoch": 0.6959, + "grad_norm": 3.6156058311462402, + "learning_rate": 5.124450778644235e-06, + "loss": 0.6504, + "step": 13918 + }, + { + "epoch": 0.696, + "grad_norm": 5.999777793884277, + "learning_rate": 5.121403408612672e-06, + "loss": 0.8998, + "step": 13920 + }, + { + "epoch": 0.6961, + "grad_norm": 7.216436862945557, + "learning_rate": 5.118356633023313e-06, + "loss": 0.8276, + "step": 13922 + }, + { + "epoch": 0.6962, + "grad_norm": 2.397606134414673, + "learning_rate": 5.115310452247386e-06, + "loss": 1.1107, + "step": 13924 + }, + { + "epoch": 0.6963, + "grad_norm": 2.4346423149108887, + "learning_rate": 5.11226486665606e-06, + "loss": 0.3139, + "step": 13926 + }, + { + "epoch": 0.6964, + "grad_norm": 3.3114678859710693, + "learning_rate": 5.109219876620441e-06, + "loss": 0.6839, + "step": 13928 + }, + { + "epoch": 0.6965, + "grad_norm": 1.7818430662155151, + "learning_rate": 5.106175482511537e-06, + "loss": 0.5647, + "step": 13930 + }, + { + "epoch": 0.6966, + "grad_norm": 13.09050178527832, + "learning_rate": 5.103131684700315e-06, + "loss": 0.9955, + "step": 13932 + }, + { + "epoch": 0.6967, + "grad_norm": 8.958207130432129, + "learning_rate": 5.100088483557635e-06, + "loss": 0.9446, + "step": 13934 + }, + { + "epoch": 0.6968, + "grad_norm": 6.723180770874023, + "learning_rate": 5.0970458794543135e-06, + "loss": 1.6193, + "step": 13936 + }, + { + "epoch": 0.6969, + "grad_norm": 2.1581623554229736, + "learning_rate": 5.09400387276108e-06, + "loss": 0.4005, + "step": 13938 + }, + { + "epoch": 0.697, + "grad_norm": 4.209408760070801, + "learning_rate": 5.090962463848592e-06, + "loss": 0.767, + "step": 13940 + }, + { + "epoch": 0.6971, + "grad_norm": 3.317890167236328, + "learning_rate": 5.087921653087437e-06, + "loss": 1.0081, + "step": 13942 + }, + { + "epoch": 0.6972, + "grad_norm": 3.1393771171569824, + "learning_rate": 5.0848814408481305e-06, + "loss": 1.0106, + "step": 13944 + }, + { + "epoch": 0.6973, + "grad_norm": 4.9890666007995605, + "learning_rate": 5.08184182750111e-06, + "loss": 1.259, + "step": 13946 + }, + { + "epoch": 0.6974, + "grad_norm": 3.5549304485321045, + "learning_rate": 5.078802813416746e-06, + "loss": 0.6131, + "step": 13948 + }, + { + "epoch": 0.6975, + "grad_norm": 16.888748168945312, + "learning_rate": 5.075764398965331e-06, + "loss": 0.7305, + "step": 13950 + }, + { + "epoch": 0.6976, + "grad_norm": 3.040480136871338, + "learning_rate": 5.072726584517086e-06, + "loss": 0.4411, + "step": 13952 + }, + { + "epoch": 0.6977, + "grad_norm": 3.2106170654296875, + "learning_rate": 5.0696893704421615e-06, + "loss": 1.095, + "step": 13954 + }, + { + "epoch": 0.6978, + "grad_norm": 3.2128424644470215, + "learning_rate": 5.066652757110628e-06, + "loss": 0.7217, + "step": 13956 + }, + { + "epoch": 0.6979, + "grad_norm": 3.923772096633911, + "learning_rate": 5.063616744892499e-06, + "loss": 0.5627, + "step": 13958 + }, + { + "epoch": 0.698, + "grad_norm": 7.626582622528076, + "learning_rate": 5.060581334157693e-06, + "loss": 1.2125, + "step": 13960 + }, + { + "epoch": 0.6981, + "grad_norm": 3.7689332962036133, + "learning_rate": 5.057546525276068e-06, + "loss": 1.1272, + "step": 13962 + }, + { + "epoch": 0.6982, + "grad_norm": 18.356319427490234, + "learning_rate": 5.054512318617406e-06, + "loss": 0.9581, + "step": 13964 + }, + { + "epoch": 0.6983, + "grad_norm": 3.456378936767578, + "learning_rate": 5.0514787145514145e-06, + "loss": 1.0746, + "step": 13966 + }, + { + "epoch": 0.6984, + "grad_norm": 3.670635461807251, + "learning_rate": 5.048445713447738e-06, + "loss": 1.2022, + "step": 13968 + }, + { + "epoch": 0.6985, + "grad_norm": 10.532771110534668, + "learning_rate": 5.045413315675925e-06, + "loss": 0.7919, + "step": 13970 + }, + { + "epoch": 0.6986, + "grad_norm": 3.1419172286987305, + "learning_rate": 5.042381521605473e-06, + "loss": 1.6198, + "step": 13972 + }, + { + "epoch": 0.6987, + "grad_norm": 2.517711877822876, + "learning_rate": 5.0393503316057945e-06, + "loss": 1.159, + "step": 13974 + }, + { + "epoch": 0.6988, + "grad_norm": 2.609099864959717, + "learning_rate": 5.036319746046232e-06, + "loss": 0.9486, + "step": 13976 + }, + { + "epoch": 0.6989, + "grad_norm": 4.794731616973877, + "learning_rate": 5.033289765296055e-06, + "loss": 1.1284, + "step": 13978 + }, + { + "epoch": 0.699, + "grad_norm": 1.225008249282837, + "learning_rate": 5.030260389724447e-06, + "loss": 1.0017, + "step": 13980 + }, + { + "epoch": 0.6991, + "grad_norm": 5.726821422576904, + "learning_rate": 5.02723161970054e-06, + "loss": 0.8512, + "step": 13982 + }, + { + "epoch": 0.6992, + "grad_norm": 8.561227798461914, + "learning_rate": 5.024203455593375e-06, + "loss": 0.5474, + "step": 13984 + }, + { + "epoch": 0.6993, + "grad_norm": 4.2033586502075195, + "learning_rate": 5.021175897771927e-06, + "loss": 0.7838, + "step": 13986 + }, + { + "epoch": 0.6994, + "grad_norm": 5.069412708282471, + "learning_rate": 5.018148946605092e-06, + "loss": 1.2044, + "step": 13988 + }, + { + "epoch": 0.6995, + "grad_norm": 5.708666801452637, + "learning_rate": 5.015122602461698e-06, + "loss": 0.9609, + "step": 13990 + }, + { + "epoch": 0.6996, + "grad_norm": 4.546397686004639, + "learning_rate": 5.012096865710494e-06, + "loss": 0.7212, + "step": 13992 + }, + { + "epoch": 0.6997, + "grad_norm": 2.3339033126831055, + "learning_rate": 5.009071736720156e-06, + "loss": 1.1539, + "step": 13994 + }, + { + "epoch": 0.6998, + "grad_norm": 3.349881649017334, + "learning_rate": 5.0060472158592885e-06, + "loss": 0.7733, + "step": 13996 + }, + { + "epoch": 0.6999, + "grad_norm": 5.3801984786987305, + "learning_rate": 5.0030233034964195e-06, + "loss": 0.8627, + "step": 13998 + }, + { + "epoch": 0.7, + "grad_norm": 2.882290840148926, + "learning_rate": 5.000000000000003e-06, + "loss": 0.5443, + "step": 14000 + }, + { + "epoch": 0.7001, + "grad_norm": 3.4240877628326416, + "learning_rate": 4.996977305738415e-06, + "loss": 0.1149, + "step": 14002 + }, + { + "epoch": 0.7002, + "grad_norm": 15.572888374328613, + "learning_rate": 4.993955221079976e-06, + "loss": 1.3623, + "step": 14004 + }, + { + "epoch": 0.7003, + "grad_norm": 4.179569244384766, + "learning_rate": 4.9909337463929e-06, + "loss": 1.2337, + "step": 14006 + }, + { + "epoch": 0.7004, + "grad_norm": 0.7044614553451538, + "learning_rate": 4.98791288204536e-06, + "loss": 0.6021, + "step": 14008 + }, + { + "epoch": 0.7005, + "grad_norm": 5.789783477783203, + "learning_rate": 4.984892628405426e-06, + "loss": 1.0326, + "step": 14010 + }, + { + "epoch": 0.7006, + "grad_norm": 3.754133462905884, + "learning_rate": 4.981872985841115e-06, + "loss": 0.1588, + "step": 14012 + }, + { + "epoch": 0.7007, + "grad_norm": 3.7104008197784424, + "learning_rate": 4.978853954720364e-06, + "loss": 0.5936, + "step": 14014 + }, + { + "epoch": 0.7008, + "grad_norm": 5.041825294494629, + "learning_rate": 4.97583553541102e-06, + "loss": 0.4454, + "step": 14016 + }, + { + "epoch": 0.7009, + "grad_norm": 6.619148254394531, + "learning_rate": 4.9728177282808795e-06, + "loss": 1.0834, + "step": 14018 + }, + { + "epoch": 0.701, + "grad_norm": 8.422921180725098, + "learning_rate": 4.96980053369765e-06, + "loss": 1.1734, + "step": 14020 + }, + { + "epoch": 0.7011, + "grad_norm": 6.1498870849609375, + "learning_rate": 4.9667839520289675e-06, + "loss": 0.3498, + "step": 14022 + }, + { + "epoch": 0.7012, + "grad_norm": 3.239384651184082, + "learning_rate": 4.9637679836423926e-06, + "loss": 1.0052, + "step": 14024 + }, + { + "epoch": 0.7013, + "grad_norm": 3.5334765911102295, + "learning_rate": 4.960752628905412e-06, + "loss": 0.8573, + "step": 14026 + }, + { + "epoch": 0.7014, + "grad_norm": 10.532164573669434, + "learning_rate": 4.957737888185439e-06, + "loss": 0.8979, + "step": 14028 + }, + { + "epoch": 0.7015, + "grad_norm": 11.284088134765625, + "learning_rate": 4.954723761849809e-06, + "loss": 1.0885, + "step": 14030 + }, + { + "epoch": 0.7016, + "grad_norm": 2.9880504608154297, + "learning_rate": 4.951710250265785e-06, + "loss": 1.1989, + "step": 14032 + }, + { + "epoch": 0.7017, + "grad_norm": 3.1858928203582764, + "learning_rate": 4.9486973538005535e-06, + "loss": 0.9689, + "step": 14034 + }, + { + "epoch": 0.7018, + "grad_norm": 3.880657434463501, + "learning_rate": 4.945685072821227e-06, + "loss": 0.8325, + "step": 14036 + }, + { + "epoch": 0.7019, + "grad_norm": 3.6865172386169434, + "learning_rate": 4.942673407694844e-06, + "loss": 1.1479, + "step": 14038 + }, + { + "epoch": 0.702, + "grad_norm": 1.8025845289230347, + "learning_rate": 4.939662358788364e-06, + "loss": 0.5771, + "step": 14040 + }, + { + "epoch": 0.7021, + "grad_norm": 4.009914875030518, + "learning_rate": 4.936651926468673e-06, + "loss": 0.1505, + "step": 14042 + }, + { + "epoch": 0.7022, + "grad_norm": 5.531920433044434, + "learning_rate": 4.933642111102595e-06, + "loss": 0.7246, + "step": 14044 + }, + { + "epoch": 0.7023, + "grad_norm": 2.3428969383239746, + "learning_rate": 4.930632913056848e-06, + "loss": 0.3931, + "step": 14046 + }, + { + "epoch": 0.7024, + "grad_norm": 5.5072550773620605, + "learning_rate": 4.927624332698109e-06, + "loss": 0.9246, + "step": 14048 + }, + { + "epoch": 0.7025, + "grad_norm": 3.643946647644043, + "learning_rate": 4.924616370392962e-06, + "loss": 0.9198, + "step": 14050 + }, + { + "epoch": 0.7026, + "grad_norm": 19.985109329223633, + "learning_rate": 4.921609026507907e-06, + "loss": 0.9914, + "step": 14052 + }, + { + "epoch": 0.7027, + "grad_norm": 3.558375120162964, + "learning_rate": 4.918602301409395e-06, + "loss": 0.5217, + "step": 14054 + }, + { + "epoch": 0.7028, + "grad_norm": 12.070651054382324, + "learning_rate": 4.915596195463773e-06, + "loss": 2.1011, + "step": 14056 + }, + { + "epoch": 0.7029, + "grad_norm": 3.3143224716186523, + "learning_rate": 4.912590709037335e-06, + "loss": 1.6354, + "step": 14058 + }, + { + "epoch": 0.703, + "grad_norm": 3.3162267208099365, + "learning_rate": 4.909585842496287e-06, + "loss": 0.5851, + "step": 14060 + }, + { + "epoch": 0.7031, + "grad_norm": 4.756728172302246, + "learning_rate": 4.9065815962067645e-06, + "loss": 0.5873, + "step": 14062 + }, + { + "epoch": 0.7032, + "grad_norm": 5.736226558685303, + "learning_rate": 4.903577970534823e-06, + "loss": 1.1047, + "step": 14064 + }, + { + "epoch": 0.7033, + "grad_norm": 5.842128753662109, + "learning_rate": 4.9005749658464475e-06, + "loss": 2.3656, + "step": 14066 + }, + { + "epoch": 0.7034, + "grad_norm": 4.142919063568115, + "learning_rate": 4.897572582507544e-06, + "loss": 1.2473, + "step": 14068 + }, + { + "epoch": 0.7035, + "grad_norm": 5.135279655456543, + "learning_rate": 4.894570820883943e-06, + "loss": 0.7054, + "step": 14070 + }, + { + "epoch": 0.7036, + "grad_norm": 8.260522842407227, + "learning_rate": 4.891569681341403e-06, + "loss": 1.4699, + "step": 14072 + }, + { + "epoch": 0.7037, + "grad_norm": 12.789283752441406, + "learning_rate": 4.888569164245601e-06, + "loss": 1.3652, + "step": 14074 + }, + { + "epoch": 0.7038, + "grad_norm": 3.349705219268799, + "learning_rate": 4.885569269962142e-06, + "loss": 0.9654, + "step": 14076 + }, + { + "epoch": 0.7039, + "grad_norm": 6.006509780883789, + "learning_rate": 4.882569998856549e-06, + "loss": 0.6471, + "step": 14078 + }, + { + "epoch": 0.704, + "grad_norm": 4.933070182800293, + "learning_rate": 4.879571351294287e-06, + "loss": 1.0467, + "step": 14080 + }, + { + "epoch": 0.7041, + "grad_norm": 3.2129414081573486, + "learning_rate": 4.8765733276407156e-06, + "loss": 0.4283, + "step": 14082 + }, + { + "epoch": 0.7042, + "grad_norm": 2.2376132011413574, + "learning_rate": 4.873575928261151e-06, + "loss": 0.2856, + "step": 14084 + }, + { + "epoch": 0.7043, + "grad_norm": 3.4495022296905518, + "learning_rate": 4.870579153520807e-06, + "loss": 0.6283, + "step": 14086 + }, + { + "epoch": 0.7044, + "grad_norm": 7.446104049682617, + "learning_rate": 4.8675830037848295e-06, + "loss": 0.5158, + "step": 14088 + }, + { + "epoch": 0.7045, + "grad_norm": 2.8553826808929443, + "learning_rate": 4.864587479418302e-06, + "loss": 0.71, + "step": 14090 + }, + { + "epoch": 0.7046, + "grad_norm": 4.8535284996032715, + "learning_rate": 4.861592580786205e-06, + "loss": 0.6414, + "step": 14092 + }, + { + "epoch": 0.7047, + "grad_norm": 6.995304584503174, + "learning_rate": 4.8585983082534735e-06, + "loss": 0.2915, + "step": 14094 + }, + { + "epoch": 0.7048, + "grad_norm": 2.7490289211273193, + "learning_rate": 4.855604662184935e-06, + "loss": 0.1453, + "step": 14096 + }, + { + "epoch": 0.7049, + "grad_norm": 2.0485832691192627, + "learning_rate": 4.852611642945369e-06, + "loss": 0.6825, + "step": 14098 + }, + { + "epoch": 0.705, + "grad_norm": 9.173713684082031, + "learning_rate": 4.849619250899458e-06, + "loss": 1.0496, + "step": 14100 + }, + { + "epoch": 0.7051, + "grad_norm": 3.2020227909088135, + "learning_rate": 4.84662748641182e-06, + "loss": 0.5707, + "step": 14102 + }, + { + "epoch": 0.7052, + "grad_norm": 4.3683085441589355, + "learning_rate": 4.843636349846991e-06, + "loss": 0.6108, + "step": 14104 + }, + { + "epoch": 0.7053, + "grad_norm": 2.265611410140991, + "learning_rate": 4.840645841569431e-06, + "loss": 0.981, + "step": 14106 + }, + { + "epoch": 0.7054, + "grad_norm": 5.477222442626953, + "learning_rate": 4.837655961943526e-06, + "loss": 1.1883, + "step": 14108 + }, + { + "epoch": 0.7055, + "grad_norm": 14.175686836242676, + "learning_rate": 4.8346667113335824e-06, + "loss": 0.765, + "step": 14110 + }, + { + "epoch": 0.7056, + "grad_norm": 14.766940116882324, + "learning_rate": 4.831678090103832e-06, + "loss": 1.3123, + "step": 14112 + }, + { + "epoch": 0.7057, + "grad_norm": 15.301608085632324, + "learning_rate": 4.828690098618429e-06, + "loss": 1.2422, + "step": 14114 + }, + { + "epoch": 0.7058, + "grad_norm": 8.72625732421875, + "learning_rate": 4.825702737241452e-06, + "loss": 1.1183, + "step": 14116 + }, + { + "epoch": 0.7059, + "grad_norm": 5.573612689971924, + "learning_rate": 4.8227160063368974e-06, + "loss": 0.5649, + "step": 14118 + }, + { + "epoch": 0.706, + "grad_norm": 6.827432155609131, + "learning_rate": 4.8197299062687e-06, + "loss": 0.6594, + "step": 14120 + }, + { + "epoch": 0.7061, + "grad_norm": 4.468033790588379, + "learning_rate": 4.816744437400697e-06, + "loss": 1.1649, + "step": 14122 + }, + { + "epoch": 0.7062, + "grad_norm": 1.5370560884475708, + "learning_rate": 4.813759600096661e-06, + "loss": 0.4453, + "step": 14124 + }, + { + "epoch": 0.7063, + "grad_norm": 6.694179534912109, + "learning_rate": 4.810775394720286e-06, + "loss": 0.2421, + "step": 14126 + }, + { + "epoch": 0.7064, + "grad_norm": 4.300828456878662, + "learning_rate": 4.807791821635186e-06, + "loss": 0.5456, + "step": 14128 + }, + { + "epoch": 0.7065, + "grad_norm": 5.123035430908203, + "learning_rate": 4.80480888120491e-06, + "loss": 0.864, + "step": 14130 + }, + { + "epoch": 0.7066, + "grad_norm": 2.173388719558716, + "learning_rate": 4.801826573792905e-06, + "loss": 0.2921, + "step": 14132 + }, + { + "epoch": 0.7067, + "grad_norm": 3.5558135509490967, + "learning_rate": 4.798844899762568e-06, + "loss": 1.066, + "step": 14134 + }, + { + "epoch": 0.7068, + "grad_norm": 5.436522960662842, + "learning_rate": 4.795863859477207e-06, + "loss": 0.8776, + "step": 14136 + }, + { + "epoch": 0.7069, + "grad_norm": 8.704195022583008, + "learning_rate": 4.792883453300042e-06, + "loss": 0.6999, + "step": 14138 + }, + { + "epoch": 0.707, + "grad_norm": 7.384101390838623, + "learning_rate": 4.78990368159424e-06, + "loss": 1.2732, + "step": 14140 + }, + { + "epoch": 0.7071, + "grad_norm": 2.4568653106689453, + "learning_rate": 4.786924544722864e-06, + "loss": 1.0264, + "step": 14142 + }, + { + "epoch": 0.7072, + "grad_norm": 1.8860307931900024, + "learning_rate": 4.783946043048922e-06, + "loss": 0.2549, + "step": 14144 + }, + { + "epoch": 0.7073, + "grad_norm": 5.540805816650391, + "learning_rate": 4.780968176935333e-06, + "loss": 1.2689, + "step": 14146 + }, + { + "epoch": 0.7074, + "grad_norm": 2.173753261566162, + "learning_rate": 4.7779909467449416e-06, + "loss": 1.5403, + "step": 14148 + }, + { + "epoch": 0.7075, + "grad_norm": 6.544201850891113, + "learning_rate": 4.775014352840512e-06, + "loss": 0.612, + "step": 14150 + }, + { + "epoch": 0.7076, + "grad_norm": 13.051671981811523, + "learning_rate": 4.772038395584735e-06, + "loss": 0.6895, + "step": 14152 + }, + { + "epoch": 0.7077, + "grad_norm": 2.870914936065674, + "learning_rate": 4.7690630753402224e-06, + "loss": 0.5282, + "step": 14154 + }, + { + "epoch": 0.7078, + "grad_norm": 9.941814422607422, + "learning_rate": 4.7660883924695055e-06, + "loss": 0.8377, + "step": 14156 + }, + { + "epoch": 0.7079, + "grad_norm": 3.116992235183716, + "learning_rate": 4.763114347335043e-06, + "loss": 1.4837, + "step": 14158 + }, + { + "epoch": 0.708, + "grad_norm": 0.051921915262937546, + "learning_rate": 4.76014094029921e-06, + "loss": 0.6335, + "step": 14160 + }, + { + "epoch": 0.7081, + "grad_norm": 3.7727739810943604, + "learning_rate": 4.757168171724311e-06, + "loss": 0.7834, + "step": 14162 + }, + { + "epoch": 0.7082, + "grad_norm": 6.170034885406494, + "learning_rate": 4.754196041972563e-06, + "loss": 0.7766, + "step": 14164 + }, + { + "epoch": 0.7083, + "grad_norm": 0.9161992073059082, + "learning_rate": 4.751224551406123e-06, + "loss": 0.4156, + "step": 14166 + }, + { + "epoch": 0.7084, + "grad_norm": 4.107547283172607, + "learning_rate": 4.7482537003870425e-06, + "loss": 1.1696, + "step": 14168 + }, + { + "epoch": 0.7085, + "grad_norm": 1.145933747291565, + "learning_rate": 4.745283489277324e-06, + "loss": 0.5805, + "step": 14170 + }, + { + "epoch": 0.7086, + "grad_norm": 3.2327187061309814, + "learning_rate": 4.7423139184388725e-06, + "loss": 1.0407, + "step": 14172 + }, + { + "epoch": 0.7087, + "grad_norm": 10.571310043334961, + "learning_rate": 4.739344988233517e-06, + "loss": 0.7219, + "step": 14174 + }, + { + "epoch": 0.7088, + "grad_norm": 7.016014575958252, + "learning_rate": 4.736376699023023e-06, + "loss": 0.5174, + "step": 14176 + }, + { + "epoch": 0.7089, + "grad_norm": 3.7842302322387695, + "learning_rate": 4.733409051169055e-06, + "loss": 1.2764, + "step": 14178 + }, + { + "epoch": 0.709, + "grad_norm": 0.5675663352012634, + "learning_rate": 4.7304420450332244e-06, + "loss": 0.8894, + "step": 14180 + }, + { + "epoch": 0.7091, + "grad_norm": 2.828430652618408, + "learning_rate": 4.727475680977045e-06, + "loss": 0.2452, + "step": 14182 + }, + { + "epoch": 0.7092, + "grad_norm": 2.8454158306121826, + "learning_rate": 4.724509959361961e-06, + "loss": 0.5268, + "step": 14184 + }, + { + "epoch": 0.7093, + "grad_norm": 5.114732265472412, + "learning_rate": 4.721544880549337e-06, + "loss": 1.1643, + "step": 14186 + }, + { + "epoch": 0.7094, + "grad_norm": 2.045478105545044, + "learning_rate": 4.718580444900457e-06, + "loss": 0.3705, + "step": 14188 + }, + { + "epoch": 0.7095, + "grad_norm": 3.456357479095459, + "learning_rate": 4.71561665277653e-06, + "loss": 0.7867, + "step": 14190 + }, + { + "epoch": 0.7096, + "grad_norm": 11.224827766418457, + "learning_rate": 4.712653504538684e-06, + "loss": 0.9823, + "step": 14192 + }, + { + "epoch": 0.7097, + "grad_norm": 5.739790439605713, + "learning_rate": 4.70969100054797e-06, + "loss": 0.4966, + "step": 14194 + }, + { + "epoch": 0.7098, + "grad_norm": 2.1136491298675537, + "learning_rate": 4.706729141165362e-06, + "loss": 0.6185, + "step": 14196 + }, + { + "epoch": 0.7099, + "grad_norm": 2.558811902999878, + "learning_rate": 4.70376792675175e-06, + "loss": 0.3683, + "step": 14198 + }, + { + "epoch": 0.71, + "grad_norm": 3.9886281490325928, + "learning_rate": 4.700807357667953e-06, + "loss": 0.4294, + "step": 14200 + }, + { + "epoch": 0.7101, + "grad_norm": 6.247991561889648, + "learning_rate": 4.697847434274704e-06, + "loss": 0.8652, + "step": 14202 + }, + { + "epoch": 0.7102, + "grad_norm": 2.4059619903564453, + "learning_rate": 4.694888156932657e-06, + "loss": 1.3629, + "step": 14204 + }, + { + "epoch": 0.7103, + "grad_norm": 7.291842937469482, + "learning_rate": 4.691929526002405e-06, + "loss": 1.4775, + "step": 14206 + }, + { + "epoch": 0.7104, + "grad_norm": 6.164804935455322, + "learning_rate": 4.688971541844436e-06, + "loss": 0.9583, + "step": 14208 + }, + { + "epoch": 0.7105, + "grad_norm": 3.9974493980407715, + "learning_rate": 4.686014204819171e-06, + "loss": 1.0198, + "step": 14210 + }, + { + "epoch": 0.7106, + "grad_norm": 8.457474708557129, + "learning_rate": 4.6830575152869615e-06, + "loss": 1.389, + "step": 14212 + }, + { + "epoch": 0.7107, + "grad_norm": 1.8401305675506592, + "learning_rate": 4.68010147360806e-06, + "loss": 1.1633, + "step": 14214 + }, + { + "epoch": 0.7108, + "grad_norm": 3.9108786582946777, + "learning_rate": 4.677146080142664e-06, + "loss": 0.777, + "step": 14216 + }, + { + "epoch": 0.7109, + "grad_norm": 4.560543060302734, + "learning_rate": 4.674191335250865e-06, + "loss": 1.9924, + "step": 14218 + }, + { + "epoch": 0.711, + "grad_norm": 4.054705619812012, + "learning_rate": 4.671237239292699e-06, + "loss": 0.611, + "step": 14220 + }, + { + "epoch": 0.7111, + "grad_norm": 4.372707366943359, + "learning_rate": 4.668283792628114e-06, + "loss": 1.3162, + "step": 14222 + }, + { + "epoch": 0.7112, + "grad_norm": 7.385793209075928, + "learning_rate": 4.6653309956169745e-06, + "loss": 0.8482, + "step": 14224 + }, + { + "epoch": 0.7113, + "grad_norm": 2.8410959243774414, + "learning_rate": 4.662378848619073e-06, + "loss": 0.1378, + "step": 14226 + }, + { + "epoch": 0.7114, + "grad_norm": 3.167001724243164, + "learning_rate": 4.659427351994116e-06, + "loss": 0.537, + "step": 14228 + }, + { + "epoch": 0.7115, + "grad_norm": 20.370662689208984, + "learning_rate": 4.656476506101737e-06, + "loss": 1.4364, + "step": 14230 + }, + { + "epoch": 0.7116, + "grad_norm": 2.6698386669158936, + "learning_rate": 4.6535263113014885e-06, + "loss": 0.8463, + "step": 14232 + }, + { + "epoch": 0.7117, + "grad_norm": 3.9139153957366943, + "learning_rate": 4.65057676795284e-06, + "loss": 1.1157, + "step": 14234 + }, + { + "epoch": 0.7118, + "grad_norm": 2.15407395362854, + "learning_rate": 4.647627876415186e-06, + "loss": 0.5501, + "step": 14236 + }, + { + "epoch": 0.7119, + "grad_norm": 4.477741718292236, + "learning_rate": 4.64467963704784e-06, + "loss": 1.3268, + "step": 14238 + }, + { + "epoch": 0.712, + "grad_norm": 4.6167731285095215, + "learning_rate": 4.641732050210032e-06, + "loss": 1.0678, + "step": 14240 + }, + { + "epoch": 0.7121, + "grad_norm": 7.431152820587158, + "learning_rate": 4.638785116260928e-06, + "loss": 1.088, + "step": 14242 + }, + { + "epoch": 0.7122, + "grad_norm": 8.034310340881348, + "learning_rate": 4.635838835559591e-06, + "loss": 1.0013, + "step": 14244 + }, + { + "epoch": 0.7123, + "grad_norm": 23.3786563873291, + "learning_rate": 4.632893208465021e-06, + "loss": 2.2584, + "step": 14246 + }, + { + "epoch": 0.7124, + "grad_norm": 0.09378962218761444, + "learning_rate": 4.629948235336133e-06, + "loss": 0.5314, + "step": 14248 + }, + { + "epoch": 0.7125, + "grad_norm": 5.310001850128174, + "learning_rate": 4.627003916531761e-06, + "loss": 0.6249, + "step": 14250 + }, + { + "epoch": 0.7126, + "grad_norm": 1.8992887735366821, + "learning_rate": 4.62406025241067e-06, + "loss": 0.4492, + "step": 14252 + }, + { + "epoch": 0.7127, + "grad_norm": 5.022740840911865, + "learning_rate": 4.621117243331523e-06, + "loss": 1.5067, + "step": 14254 + }, + { + "epoch": 0.7128, + "grad_norm": 2.446215867996216, + "learning_rate": 4.618174889652928e-06, + "loss": 0.5382, + "step": 14256 + }, + { + "epoch": 0.7129, + "grad_norm": 5.628026485443115, + "learning_rate": 4.6152331917333985e-06, + "loss": 0.5054, + "step": 14258 + }, + { + "epoch": 0.713, + "grad_norm": 8.87894344329834, + "learning_rate": 4.612292149931369e-06, + "loss": 1.1197, + "step": 14260 + }, + { + "epoch": 0.7131, + "grad_norm": 4.274021148681641, + "learning_rate": 4.6093517646052036e-06, + "loss": 0.5973, + "step": 14262 + }, + { + "epoch": 0.7132, + "grad_norm": 4.2066755294799805, + "learning_rate": 4.606412036113166e-06, + "loss": 0.7993, + "step": 14264 + }, + { + "epoch": 0.7133, + "grad_norm": 10.555290222167969, + "learning_rate": 4.603472964813466e-06, + "loss": 1.0318, + "step": 14266 + }, + { + "epoch": 0.7134, + "grad_norm": 3.8312838077545166, + "learning_rate": 4.600534551064215e-06, + "loss": 0.3627, + "step": 14268 + }, + { + "epoch": 0.7135, + "grad_norm": 1.6572034358978271, + "learning_rate": 4.59759679522345e-06, + "loss": 0.5952, + "step": 14270 + }, + { + "epoch": 0.7136, + "grad_norm": 6.098135471343994, + "learning_rate": 4.59465969764913e-06, + "loss": 1.98, + "step": 14272 + }, + { + "epoch": 0.7137, + "grad_norm": 2.6896464824676514, + "learning_rate": 4.591723258699128e-06, + "loss": 0.9363, + "step": 14274 + }, + { + "epoch": 0.7138, + "grad_norm": 4.647006511688232, + "learning_rate": 4.588787478731242e-06, + "loss": 0.2997, + "step": 14276 + }, + { + "epoch": 0.7139, + "grad_norm": 3.0477638244628906, + "learning_rate": 4.585852358103189e-06, + "loss": 0.8343, + "step": 14278 + }, + { + "epoch": 0.714, + "grad_norm": 2.680546998977661, + "learning_rate": 4.582917897172603e-06, + "loss": 0.7993, + "step": 14280 + }, + { + "epoch": 0.7141, + "grad_norm": 4.804965019226074, + "learning_rate": 4.5799840962970385e-06, + "loss": 0.6904, + "step": 14282 + }, + { + "epoch": 0.7142, + "grad_norm": 14.69514274597168, + "learning_rate": 4.577050955833972e-06, + "loss": 1.2997, + "step": 14284 + }, + { + "epoch": 0.7143, + "grad_norm": 9.298330307006836, + "learning_rate": 4.574118476140794e-06, + "loss": 1.0015, + "step": 14286 + }, + { + "epoch": 0.7144, + "grad_norm": 4.1325483322143555, + "learning_rate": 4.571186657574828e-06, + "loss": 0.6351, + "step": 14288 + }, + { + "epoch": 0.7145, + "grad_norm": 3.6293118000030518, + "learning_rate": 4.568255500493292e-06, + "loss": 0.9543, + "step": 14290 + }, + { + "epoch": 0.7146, + "grad_norm": 4.367082118988037, + "learning_rate": 4.565325005253356e-06, + "loss": 0.8906, + "step": 14292 + }, + { + "epoch": 0.7147, + "grad_norm": 4.4892168045043945, + "learning_rate": 4.562395172212074e-06, + "loss": 0.8481, + "step": 14294 + }, + { + "epoch": 0.7148, + "grad_norm": 5.026358604431152, + "learning_rate": 4.559466001726451e-06, + "loss": 1.0726, + "step": 14296 + }, + { + "epoch": 0.7149, + "grad_norm": 3.6267197132110596, + "learning_rate": 4.556537494153397e-06, + "loss": 0.4685, + "step": 14298 + }, + { + "epoch": 0.715, + "grad_norm": 3.295344352722168, + "learning_rate": 4.5536096498497295e-06, + "loss": 0.6679, + "step": 14300 + }, + { + "epoch": 0.7151, + "grad_norm": 4.518075942993164, + "learning_rate": 4.550682469172213e-06, + "loss": 0.9725, + "step": 14302 + }, + { + "epoch": 0.7152, + "grad_norm": 4.288730621337891, + "learning_rate": 4.5477559524775e-06, + "loss": 1.0565, + "step": 14304 + }, + { + "epoch": 0.7153, + "grad_norm": 2.710545778274536, + "learning_rate": 4.544830100122189e-06, + "loss": 0.8884, + "step": 14306 + }, + { + "epoch": 0.7154, + "grad_norm": 6.79705286026001, + "learning_rate": 4.541904912462785e-06, + "loss": 0.635, + "step": 14308 + }, + { + "epoch": 0.7155, + "grad_norm": 5.333633899688721, + "learning_rate": 4.538980389855711e-06, + "loss": 0.8513, + "step": 14310 + }, + { + "epoch": 0.7156, + "grad_norm": 50.50318145751953, + "learning_rate": 4.53605653265731e-06, + "loss": 1.1997, + "step": 14312 + }, + { + "epoch": 0.7157, + "grad_norm": 15.968476295471191, + "learning_rate": 4.5331333412238475e-06, + "loss": 1.7118, + "step": 14314 + }, + { + "epoch": 0.7158, + "grad_norm": 4.81193208694458, + "learning_rate": 4.530210815911504e-06, + "loss": 0.8298, + "step": 14316 + }, + { + "epoch": 0.7159, + "grad_norm": 5.699814319610596, + "learning_rate": 4.527288957076382e-06, + "loss": 0.5726, + "step": 14318 + }, + { + "epoch": 0.716, + "grad_norm": 1.9891815185546875, + "learning_rate": 4.524367765074499e-06, + "loss": 0.8297, + "step": 14320 + }, + { + "epoch": 0.7161, + "grad_norm": 7.393092632293701, + "learning_rate": 4.521447240261795e-06, + "loss": 1.8244, + "step": 14322 + }, + { + "epoch": 0.7162, + "grad_norm": 4.953202247619629, + "learning_rate": 4.518527382994127e-06, + "loss": 0.3782, + "step": 14324 + }, + { + "epoch": 0.7163, + "grad_norm": 0.9053175449371338, + "learning_rate": 4.515608193627265e-06, + "loss": 0.6477, + "step": 14326 + }, + { + "epoch": 0.7164, + "grad_norm": 2.4765233993530273, + "learning_rate": 4.512689672516918e-06, + "loss": 1.1115, + "step": 14328 + }, + { + "epoch": 0.7165, + "grad_norm": 5.765244007110596, + "learning_rate": 4.509771820018682e-06, + "loss": 1.0102, + "step": 14330 + }, + { + "epoch": 0.7166, + "grad_norm": 0.14219403266906738, + "learning_rate": 4.506854636488103e-06, + "loss": 0.4086, + "step": 14332 + }, + { + "epoch": 0.7167, + "grad_norm": 6.126498699188232, + "learning_rate": 4.50393812228062e-06, + "loss": 1.207, + "step": 14334 + }, + { + "epoch": 0.7168, + "grad_norm": 10.060791015625, + "learning_rate": 4.501022277751602e-06, + "loss": 0.5865, + "step": 14336 + }, + { + "epoch": 0.7169, + "grad_norm": 3.975247621536255, + "learning_rate": 4.498107103256346e-06, + "loss": 0.7179, + "step": 14338 + }, + { + "epoch": 0.717, + "grad_norm": 7.054041385650635, + "learning_rate": 4.495192599150045e-06, + "loss": 1.3574, + "step": 14340 + }, + { + "epoch": 0.7171, + "grad_norm": 3.791097640991211, + "learning_rate": 4.49227876578783e-06, + "loss": 0.7569, + "step": 14342 + }, + { + "epoch": 0.7172, + "grad_norm": 7.293482780456543, + "learning_rate": 4.48936560352474e-06, + "loss": 1.221, + "step": 14344 + }, + { + "epoch": 0.7173, + "grad_norm": 8.936640739440918, + "learning_rate": 4.486453112715737e-06, + "loss": 1.1011, + "step": 14346 + }, + { + "epoch": 0.7174, + "grad_norm": 10.194849014282227, + "learning_rate": 4.483541293715699e-06, + "loss": 1.532, + "step": 14348 + }, + { + "epoch": 0.7175, + "grad_norm": 5.325584411621094, + "learning_rate": 4.480630146879419e-06, + "loss": 1.2077, + "step": 14350 + }, + { + "epoch": 0.7176, + "grad_norm": 3.254491090774536, + "learning_rate": 4.477719672561615e-06, + "loss": 0.7516, + "step": 14352 + }, + { + "epoch": 0.7177, + "grad_norm": 3.1440043449401855, + "learning_rate": 4.474809871116917e-06, + "loss": 0.7831, + "step": 14354 + }, + { + "epoch": 0.7178, + "grad_norm": 1.3532159328460693, + "learning_rate": 4.471900742899876e-06, + "loss": 0.8053, + "step": 14356 + }, + { + "epoch": 0.7179, + "grad_norm": 6.648905277252197, + "learning_rate": 4.468992288264963e-06, + "loss": 1.5868, + "step": 14358 + }, + { + "epoch": 0.718, + "grad_norm": 4.411965847015381, + "learning_rate": 4.46608450756656e-06, + "loss": 0.887, + "step": 14360 + }, + { + "epoch": 0.7181, + "grad_norm": 14.518061637878418, + "learning_rate": 4.463177401158976e-06, + "loss": 0.9061, + "step": 14362 + }, + { + "epoch": 0.7182, + "grad_norm": 5.420566082000732, + "learning_rate": 4.4602709693964296e-06, + "loss": 1.039, + "step": 14364 + }, + { + "epoch": 0.7183, + "grad_norm": 8.559656143188477, + "learning_rate": 4.457365212633058e-06, + "loss": 2.4247, + "step": 14366 + }, + { + "epoch": 0.7184, + "grad_norm": 6.621328353881836, + "learning_rate": 4.4544601312229295e-06, + "loss": 0.9431, + "step": 14368 + }, + { + "epoch": 0.7185, + "grad_norm": 3.7747819423675537, + "learning_rate": 4.451555725520009e-06, + "loss": 1.6357, + "step": 14370 + }, + { + "epoch": 0.7186, + "grad_norm": 14.417508125305176, + "learning_rate": 4.44865199587819e-06, + "loss": 0.6507, + "step": 14372 + }, + { + "epoch": 0.7187, + "grad_norm": 3.1814749240875244, + "learning_rate": 4.445748942651294e-06, + "loss": 1.3522, + "step": 14374 + }, + { + "epoch": 0.7188, + "grad_norm": 3.9814884662628174, + "learning_rate": 4.442846566193034e-06, + "loss": 0.7941, + "step": 14376 + }, + { + "epoch": 0.7189, + "grad_norm": 3.9228157997131348, + "learning_rate": 4.43994486685707e-06, + "loss": 1.5328, + "step": 14378 + }, + { + "epoch": 0.719, + "grad_norm": 12.55525016784668, + "learning_rate": 4.437043844996952e-06, + "loss": 0.966, + "step": 14380 + }, + { + "epoch": 0.7191, + "grad_norm": 2.377096652984619, + "learning_rate": 4.43414350096617e-06, + "loss": 0.8553, + "step": 14382 + }, + { + "epoch": 0.7192, + "grad_norm": 4.2961554527282715, + "learning_rate": 4.4312438351181246e-06, + "loss": 0.6992, + "step": 14384 + }, + { + "epoch": 0.7193, + "grad_norm": 0.37945735454559326, + "learning_rate": 4.428344847806116e-06, + "loss": 0.3647, + "step": 14386 + }, + { + "epoch": 0.7194, + "grad_norm": 6.785562038421631, + "learning_rate": 4.425446539383394e-06, + "loss": 0.9014, + "step": 14388 + }, + { + "epoch": 0.7195, + "grad_norm": 4.767810344696045, + "learning_rate": 4.422548910203099e-06, + "loss": 0.7034, + "step": 14390 + }, + { + "epoch": 0.7196, + "grad_norm": 3.8613076210021973, + "learning_rate": 4.419651960618302e-06, + "loss": 0.8938, + "step": 14392 + }, + { + "epoch": 0.7197, + "grad_norm": 2.2160279750823975, + "learning_rate": 4.416755690981988e-06, + "loss": 0.4891, + "step": 14394 + }, + { + "epoch": 0.7198, + "grad_norm": 2.9675798416137695, + "learning_rate": 4.413860101647055e-06, + "loss": 0.2824, + "step": 14396 + }, + { + "epoch": 0.7199, + "grad_norm": 2.554948329925537, + "learning_rate": 4.410965192966325e-06, + "loss": 1.1177, + "step": 14398 + }, + { + "epoch": 0.72, + "grad_norm": 2.893188714981079, + "learning_rate": 4.408070965292534e-06, + "loss": 0.952, + "step": 14400 + }, + { + "epoch": 0.7201, + "grad_norm": 0.4664068818092346, + "learning_rate": 4.405177418978331e-06, + "loss": 1.5096, + "step": 14402 + }, + { + "epoch": 0.7202, + "grad_norm": 3.026959180831909, + "learning_rate": 4.402284554376292e-06, + "loss": 0.9473, + "step": 14404 + }, + { + "epoch": 0.7203, + "grad_norm": 6.86403751373291, + "learning_rate": 4.399392371838897e-06, + "loss": 0.5973, + "step": 14406 + }, + { + "epoch": 0.7204, + "grad_norm": 4.891441822052002, + "learning_rate": 4.3965008717185555e-06, + "loss": 0.4647, + "step": 14408 + }, + { + "epoch": 0.7205, + "grad_norm": 3.4641284942626953, + "learning_rate": 4.393610054367585e-06, + "loss": 0.4787, + "step": 14410 + }, + { + "epoch": 0.7206, + "grad_norm": 3.010246515274048, + "learning_rate": 4.39071992013822e-06, + "loss": 0.7367, + "step": 14412 + }, + { + "epoch": 0.7207, + "grad_norm": 19.10235595703125, + "learning_rate": 4.387830469382624e-06, + "loss": 1.2763, + "step": 14414 + }, + { + "epoch": 0.7208, + "grad_norm": 1.4947657585144043, + "learning_rate": 4.384941702452856e-06, + "loss": 0.6265, + "step": 14416 + }, + { + "epoch": 0.7209, + "grad_norm": 1.788030743598938, + "learning_rate": 4.3820536197009125e-06, + "loss": 0.3676, + "step": 14418 + }, + { + "epoch": 0.721, + "grad_norm": 7.1901445388793945, + "learning_rate": 4.379166221478697e-06, + "loss": 0.7277, + "step": 14420 + }, + { + "epoch": 0.7211, + "grad_norm": 20.50046157836914, + "learning_rate": 4.376279508138021e-06, + "loss": 0.668, + "step": 14422 + }, + { + "epoch": 0.7212, + "grad_norm": 3.1855242252349854, + "learning_rate": 4.373393480030637e-06, + "loss": 0.5101, + "step": 14424 + }, + { + "epoch": 0.7213, + "grad_norm": 3.397754430770874, + "learning_rate": 4.37050813750818e-06, + "loss": 0.3319, + "step": 14426 + }, + { + "epoch": 0.7214, + "grad_norm": 3.214388132095337, + "learning_rate": 4.367623480922236e-06, + "loss": 1.5558, + "step": 14428 + }, + { + "epoch": 0.7215, + "grad_norm": 2.12058424949646, + "learning_rate": 4.3647395106242864e-06, + "loss": 0.6296, + "step": 14430 + }, + { + "epoch": 0.7216, + "grad_norm": 1.9910571575164795, + "learning_rate": 4.361856226965733e-06, + "loss": 0.9313, + "step": 14432 + }, + { + "epoch": 0.7217, + "grad_norm": 2.684523105621338, + "learning_rate": 4.358973630297896e-06, + "loss": 1.0653, + "step": 14434 + }, + { + "epoch": 0.7218, + "grad_norm": 11.664971351623535, + "learning_rate": 4.356091720972011e-06, + "loss": 0.3567, + "step": 14436 + }, + { + "epoch": 0.7219, + "grad_norm": 4.620944023132324, + "learning_rate": 4.353210499339231e-06, + "loss": 0.6279, + "step": 14438 + }, + { + "epoch": 0.722, + "grad_norm": 2.600080966949463, + "learning_rate": 4.350329965750622e-06, + "loss": 1.2498, + "step": 14440 + }, + { + "epoch": 0.7221, + "grad_norm": 2.244154214859009, + "learning_rate": 4.34745012055717e-06, + "loss": 1.5696, + "step": 14442 + }, + { + "epoch": 0.7222, + "grad_norm": 3.705406665802002, + "learning_rate": 4.344570964109775e-06, + "loss": 0.9671, + "step": 14444 + }, + { + "epoch": 0.7223, + "grad_norm": 5.064436435699463, + "learning_rate": 4.3416924967592526e-06, + "loss": 0.6091, + "step": 14446 + }, + { + "epoch": 0.7224, + "grad_norm": 4.6742658615112305, + "learning_rate": 4.338814718856333e-06, + "loss": 0.7067, + "step": 14448 + }, + { + "epoch": 0.7225, + "grad_norm": 5.467416286468506, + "learning_rate": 4.335937630751675e-06, + "loss": 1.278, + "step": 14450 + }, + { + "epoch": 0.7226, + "grad_norm": 6.767110824584961, + "learning_rate": 4.3330612327958265e-06, + "loss": 0.4705, + "step": 14452 + }, + { + "epoch": 0.7227, + "grad_norm": 3.543121337890625, + "learning_rate": 4.330185525339286e-06, + "loss": 0.7932, + "step": 14454 + }, + { + "epoch": 0.7228, + "grad_norm": 3.4080705642700195, + "learning_rate": 4.3273105087324375e-06, + "loss": 1.1251, + "step": 14456 + }, + { + "epoch": 0.7229, + "grad_norm": 4.39553165435791, + "learning_rate": 4.324436183325593e-06, + "loss": 0.2634, + "step": 14458 + }, + { + "epoch": 0.723, + "grad_norm": 8.824417114257812, + "learning_rate": 4.321562549468991e-06, + "loss": 0.8437, + "step": 14460 + }, + { + "epoch": 0.7231, + "grad_norm": 7.895962238311768, + "learning_rate": 4.318689607512759e-06, + "loss": 0.3653, + "step": 14462 + }, + { + "epoch": 0.7232, + "grad_norm": 2.1900501251220703, + "learning_rate": 4.315817357806974e-06, + "loss": 0.8023, + "step": 14464 + }, + { + "epoch": 0.7233, + "grad_norm": 4.2129926681518555, + "learning_rate": 4.312945800701595e-06, + "loss": 0.33, + "step": 14466 + }, + { + "epoch": 0.7234, + "grad_norm": 4.275369167327881, + "learning_rate": 4.310074936546521e-06, + "loss": 1.3963, + "step": 14468 + }, + { + "epoch": 0.7235, + "grad_norm": 3.2633097171783447, + "learning_rate": 4.307204765691559e-06, + "loss": 0.379, + "step": 14470 + }, + { + "epoch": 0.7236, + "grad_norm": 2.625490427017212, + "learning_rate": 4.304335288486426e-06, + "loss": 1.3723, + "step": 14472 + }, + { + "epoch": 0.7237, + "grad_norm": 4.903650760650635, + "learning_rate": 4.301466505280763e-06, + "loss": 1.4468, + "step": 14474 + }, + { + "epoch": 0.7238, + "grad_norm": 2.9947867393493652, + "learning_rate": 4.29859841642412e-06, + "loss": 1.0296, + "step": 14476 + }, + { + "epoch": 0.7239, + "grad_norm": 2.4990532398223877, + "learning_rate": 4.295731022265966e-06, + "loss": 0.785, + "step": 14478 + }, + { + "epoch": 0.724, + "grad_norm": 2.2360787391662598, + "learning_rate": 4.292864323155684e-06, + "loss": 0.0604, + "step": 14480 + }, + { + "epoch": 0.7241, + "grad_norm": 4.97609806060791, + "learning_rate": 4.289998319442573e-06, + "loss": 0.9385, + "step": 14482 + }, + { + "epoch": 0.7242, + "grad_norm": 5.895832538604736, + "learning_rate": 4.287133011475847e-06, + "loss": 0.5504, + "step": 14484 + }, + { + "epoch": 0.7243, + "grad_norm": 8.625070571899414, + "learning_rate": 4.284268399604633e-06, + "loss": 0.8865, + "step": 14486 + }, + { + "epoch": 0.7244, + "grad_norm": 3.624408483505249, + "learning_rate": 4.281404484177974e-06, + "loss": 0.5266, + "step": 14488 + }, + { + "epoch": 0.7245, + "grad_norm": 4.003900051116943, + "learning_rate": 4.27854126554484e-06, + "loss": 0.7632, + "step": 14490 + }, + { + "epoch": 0.7246, + "grad_norm": 4.072417736053467, + "learning_rate": 4.275678744054094e-06, + "loss": 0.8564, + "step": 14492 + }, + { + "epoch": 0.7247, + "grad_norm": 3.190377712249756, + "learning_rate": 4.272816920054529e-06, + "loss": 0.8357, + "step": 14494 + }, + { + "epoch": 0.7248, + "grad_norm": 0.8079109787940979, + "learning_rate": 4.26995579389485e-06, + "loss": 0.6912, + "step": 14496 + }, + { + "epoch": 0.7249, + "grad_norm": 3.8427932262420654, + "learning_rate": 4.267095365923673e-06, + "loss": 0.9294, + "step": 14498 + }, + { + "epoch": 0.725, + "grad_norm": 3.2338268756866455, + "learning_rate": 4.264235636489542e-06, + "loss": 0.9586, + "step": 14500 + }, + { + "epoch": 0.7251, + "grad_norm": 4.380280494689941, + "learning_rate": 4.2613766059408945e-06, + "loss": 0.5457, + "step": 14502 + }, + { + "epoch": 0.7252, + "grad_norm": 0.5489602088928223, + "learning_rate": 4.258518274626103e-06, + "loss": 0.2671, + "step": 14504 + }, + { + "epoch": 0.7253, + "grad_norm": 3.5112709999084473, + "learning_rate": 4.255660642893444e-06, + "loss": 0.24, + "step": 14506 + }, + { + "epoch": 0.7254, + "grad_norm": 3.00473690032959, + "learning_rate": 4.2528037110911126e-06, + "loss": 0.1854, + "step": 14508 + }, + { + "epoch": 0.7255, + "grad_norm": 2.813438653945923, + "learning_rate": 4.249947479567218e-06, + "loss": 0.3507, + "step": 14510 + }, + { + "epoch": 0.7256, + "grad_norm": 3.5219945907592773, + "learning_rate": 4.247091948669775e-06, + "loss": 1.0465, + "step": 14512 + }, + { + "epoch": 0.7257, + "grad_norm": 3.9945461750030518, + "learning_rate": 4.244237118746732e-06, + "loss": 1.1562, + "step": 14514 + }, + { + "epoch": 0.7258, + "grad_norm": 3.74810528755188, + "learning_rate": 4.2413829901459345e-06, + "loss": 0.6335, + "step": 14516 + }, + { + "epoch": 0.7259, + "grad_norm": 3.790018320083618, + "learning_rate": 4.238529563215154e-06, + "loss": 1.0379, + "step": 14518 + }, + { + "epoch": 0.726, + "grad_norm": 6.859541416168213, + "learning_rate": 4.235676838302069e-06, + "loss": 1.2916, + "step": 14520 + }, + { + "epoch": 0.7261, + "grad_norm": 5.683708190917969, + "learning_rate": 4.232824815754276e-06, + "loss": 0.9673, + "step": 14522 + }, + { + "epoch": 0.7262, + "grad_norm": 5.871768951416016, + "learning_rate": 4.229973495919286e-06, + "loss": 1.0362, + "step": 14524 + }, + { + "epoch": 0.7263, + "grad_norm": 3.127469539642334, + "learning_rate": 4.227122879144523e-06, + "loss": 0.367, + "step": 14526 + }, + { + "epoch": 0.7264, + "grad_norm": 0.41398876905441284, + "learning_rate": 4.224272965777326e-06, + "loss": 0.4531, + "step": 14528 + }, + { + "epoch": 0.7265, + "grad_norm": 4.399515151977539, + "learning_rate": 4.221423756164949e-06, + "loss": 1.5293, + "step": 14530 + }, + { + "epoch": 0.7266, + "grad_norm": 4.030212879180908, + "learning_rate": 4.218575250654559e-06, + "loss": 1.0073, + "step": 14532 + }, + { + "epoch": 0.7267, + "grad_norm": 12.1788330078125, + "learning_rate": 4.215727449593233e-06, + "loss": 1.02, + "step": 14534 + }, + { + "epoch": 0.7268, + "grad_norm": 3.311786413192749, + "learning_rate": 4.21288035332798e-06, + "loss": 0.6137, + "step": 14536 + }, + { + "epoch": 0.7269, + "grad_norm": 4.864205360412598, + "learning_rate": 4.210033962205694e-06, + "loss": 0.689, + "step": 14538 + }, + { + "epoch": 0.727, + "grad_norm": 2.985214948654175, + "learning_rate": 4.207188276573214e-06, + "loss": 1.4963, + "step": 14540 + }, + { + "epoch": 0.7271, + "grad_norm": 5.110130310058594, + "learning_rate": 4.204343296777265e-06, + "loss": 1.149, + "step": 14542 + }, + { + "epoch": 0.7272, + "grad_norm": 7.534146785736084, + "learning_rate": 4.201499023164508e-06, + "loss": 0.6316, + "step": 14544 + }, + { + "epoch": 0.7273, + "grad_norm": 3.9967050552368164, + "learning_rate": 4.1986554560815095e-06, + "loss": 0.508, + "step": 14546 + }, + { + "epoch": 0.7274, + "grad_norm": 6.807767868041992, + "learning_rate": 4.19581259587474e-06, + "loss": 0.236, + "step": 14548 + }, + { + "epoch": 0.7275, + "grad_norm": 3.1652626991271973, + "learning_rate": 4.192970442890602e-06, + "loss": 1.2596, + "step": 14550 + }, + { + "epoch": 0.7276, + "grad_norm": 0.6811718940734863, + "learning_rate": 4.190128997475402e-06, + "loss": 0.0667, + "step": 14552 + }, + { + "epoch": 0.7277, + "grad_norm": 3.414457321166992, + "learning_rate": 4.1872882599753605e-06, + "loss": 0.5595, + "step": 14554 + }, + { + "epoch": 0.7278, + "grad_norm": 8.77934455871582, + "learning_rate": 4.184448230736613e-06, + "loss": 0.9681, + "step": 14556 + }, + { + "epoch": 0.7279, + "grad_norm": 4.161296367645264, + "learning_rate": 4.181608910105207e-06, + "loss": 0.6993, + "step": 14558 + }, + { + "epoch": 0.728, + "grad_norm": 7.418280124664307, + "learning_rate": 4.178770298427107e-06, + "loss": 1.211, + "step": 14560 + }, + { + "epoch": 0.7281, + "grad_norm": 2.5628035068511963, + "learning_rate": 4.175932396048188e-06, + "loss": 0.5666, + "step": 14562 + }, + { + "epoch": 0.7282, + "grad_norm": 4.723456859588623, + "learning_rate": 4.173095203314241e-06, + "loss": 1.305, + "step": 14564 + }, + { + "epoch": 0.7283, + "grad_norm": 2.9052274227142334, + "learning_rate": 4.170258720570968e-06, + "loss": 0.897, + "step": 14566 + }, + { + "epoch": 0.7284, + "grad_norm": 7.994874477386475, + "learning_rate": 4.167422948163986e-06, + "loss": 0.803, + "step": 14568 + }, + { + "epoch": 0.7285, + "grad_norm": 3.1457056999206543, + "learning_rate": 4.164587886438827e-06, + "loss": 0.499, + "step": 14570 + }, + { + "epoch": 0.7286, + "grad_norm": 16.804349899291992, + "learning_rate": 4.161753535740932e-06, + "loss": 1.2384, + "step": 14572 + }, + { + "epoch": 0.7287, + "grad_norm": 3.396045684814453, + "learning_rate": 4.158919896415656e-06, + "loss": 0.3095, + "step": 14574 + }, + { + "epoch": 0.7288, + "grad_norm": 6.169280529022217, + "learning_rate": 4.15608696880828e-06, + "loss": 0.785, + "step": 14576 + }, + { + "epoch": 0.7289, + "grad_norm": 8.541281700134277, + "learning_rate": 4.153254753263975e-06, + "loss": 1.1564, + "step": 14578 + }, + { + "epoch": 0.729, + "grad_norm": 6.184943199157715, + "learning_rate": 4.150423250127846e-06, + "loss": 0.9596, + "step": 14580 + }, + { + "epoch": 0.7291, + "grad_norm": 5.423591136932373, + "learning_rate": 4.1475924597449025e-06, + "loss": 1.4317, + "step": 14582 + }, + { + "epoch": 0.7292, + "grad_norm": 6.559527397155762, + "learning_rate": 4.144762382460059e-06, + "loss": 0.9756, + "step": 14584 + }, + { + "epoch": 0.7293, + "grad_norm": 3.25453519821167, + "learning_rate": 4.141933018618165e-06, + "loss": 0.9114, + "step": 14586 + }, + { + "epoch": 0.7294, + "grad_norm": 1.6283332109451294, + "learning_rate": 4.1391043685639576e-06, + "loss": 0.7388, + "step": 14588 + }, + { + "epoch": 0.7295, + "grad_norm": 5.37919807434082, + "learning_rate": 4.136276432642107e-06, + "loss": 0.8461, + "step": 14590 + }, + { + "epoch": 0.7296, + "grad_norm": 5.473069190979004, + "learning_rate": 4.133449211197188e-06, + "loss": 1.2477, + "step": 14592 + }, + { + "epoch": 0.7297, + "grad_norm": 6.326376438140869, + "learning_rate": 4.130622704573685e-06, + "loss": 0.5764, + "step": 14594 + }, + { + "epoch": 0.7298, + "grad_norm": 9.596641540527344, + "learning_rate": 4.127796913116004e-06, + "loss": 0.7493, + "step": 14596 + }, + { + "epoch": 0.7299, + "grad_norm": 3.374394655227661, + "learning_rate": 4.124971837168457e-06, + "loss": 1.3431, + "step": 14598 + }, + { + "epoch": 0.73, + "grad_norm": 8.792736053466797, + "learning_rate": 4.12214747707527e-06, + "loss": 1.3908, + "step": 14600 + }, + { + "epoch": 0.7301, + "grad_norm": 4.221529006958008, + "learning_rate": 4.119323833180583e-06, + "loss": 1.2632, + "step": 14602 + }, + { + "epoch": 0.7302, + "grad_norm": 3.240227699279785, + "learning_rate": 4.1165009058284496e-06, + "loss": 0.8362, + "step": 14604 + }, + { + "epoch": 0.7303, + "grad_norm": 4.372395038604736, + "learning_rate": 4.113678695362834e-06, + "loss": 0.4775, + "step": 14606 + }, + { + "epoch": 0.7304, + "grad_norm": 3.286079168319702, + "learning_rate": 4.110857202127615e-06, + "loss": 1.272, + "step": 14608 + }, + { + "epoch": 0.7305, + "grad_norm": 2.353497266769409, + "learning_rate": 4.108036426466577e-06, + "loss": 0.4086, + "step": 14610 + }, + { + "epoch": 0.7306, + "grad_norm": 5.014159202575684, + "learning_rate": 4.105216368723437e-06, + "loss": 1.5735, + "step": 14612 + }, + { + "epoch": 0.7307, + "grad_norm": 2.3503262996673584, + "learning_rate": 4.102397029241793e-06, + "loss": 0.9377, + "step": 14614 + }, + { + "epoch": 0.7308, + "grad_norm": 11.827150344848633, + "learning_rate": 4.099578408365192e-06, + "loss": 1.5928, + "step": 14616 + }, + { + "epoch": 0.7309, + "grad_norm": 7.559430122375488, + "learning_rate": 4.096760506437057e-06, + "loss": 0.9827, + "step": 14618 + }, + { + "epoch": 0.731, + "grad_norm": 4.235477447509766, + "learning_rate": 4.093943323800746e-06, + "loss": 0.7437, + "step": 14620 + }, + { + "epoch": 0.7311, + "grad_norm": 4.7137861251831055, + "learning_rate": 4.091126860799532e-06, + "loss": 0.4852, + "step": 14622 + }, + { + "epoch": 0.7312, + "grad_norm": 0.7861275672912598, + "learning_rate": 4.08831111777658e-06, + "loss": 0.2296, + "step": 14624 + }, + { + "epoch": 0.7313, + "grad_norm": 5.033553123474121, + "learning_rate": 4.08549609507499e-06, + "loss": 1.1701, + "step": 14626 + }, + { + "epoch": 0.7314, + "grad_norm": 3.500722885131836, + "learning_rate": 4.08268179303776e-06, + "loss": 1.3114, + "step": 14628 + }, + { + "epoch": 0.7315, + "grad_norm": 2.746605634689331, + "learning_rate": 4.0798682120078046e-06, + "loss": 0.6885, + "step": 14630 + }, + { + "epoch": 0.7316, + "grad_norm": 5.989818096160889, + "learning_rate": 4.0770553523279535e-06, + "loss": 1.2661, + "step": 14632 + }, + { + "epoch": 0.7317, + "grad_norm": 5.41224479675293, + "learning_rate": 4.0742432143409335e-06, + "loss": 1.278, + "step": 14634 + }, + { + "epoch": 0.7318, + "grad_norm": 7.850254058837891, + "learning_rate": 4.071431798389408e-06, + "loss": 0.809, + "step": 14636 + }, + { + "epoch": 0.7319, + "grad_norm": 0.4626041650772095, + "learning_rate": 4.068621104815934e-06, + "loss": 0.6551, + "step": 14638 + }, + { + "epoch": 0.732, + "grad_norm": 6.705986022949219, + "learning_rate": 4.065811133962987e-06, + "loss": 0.7752, + "step": 14640 + }, + { + "epoch": 0.7321, + "grad_norm": 4.208437442779541, + "learning_rate": 4.0630018861729524e-06, + "loss": 1.6209, + "step": 14642 + }, + { + "epoch": 0.7322, + "grad_norm": 6.33603572845459, + "learning_rate": 4.06019336178813e-06, + "loss": 1.3208, + "step": 14644 + }, + { + "epoch": 0.7323, + "grad_norm": 5.373493194580078, + "learning_rate": 4.057385561150728e-06, + "loss": 0.9903, + "step": 14646 + }, + { + "epoch": 0.7324, + "grad_norm": 2.323075532913208, + "learning_rate": 4.05457848460287e-06, + "loss": 0.3637, + "step": 14648 + }, + { + "epoch": 0.7325, + "grad_norm": 4.3314714431762695, + "learning_rate": 4.051772132486589e-06, + "loss": 1.1975, + "step": 14650 + }, + { + "epoch": 0.7326, + "grad_norm": 1.1403954029083252, + "learning_rate": 4.048966505143831e-06, + "loss": 0.4019, + "step": 14652 + }, + { + "epoch": 0.7327, + "grad_norm": 8.569111824035645, + "learning_rate": 4.046161602916453e-06, + "loss": 0.6659, + "step": 14654 + }, + { + "epoch": 0.7328, + "grad_norm": 6.924413204193115, + "learning_rate": 4.04335742614622e-06, + "loss": 0.3634, + "step": 14656 + }, + { + "epoch": 0.7329, + "grad_norm": 3.8093690872192383, + "learning_rate": 4.040553975174824e-06, + "loss": 0.8921, + "step": 14658 + }, + { + "epoch": 0.733, + "grad_norm": 0.0318816676735878, + "learning_rate": 4.037751250343841e-06, + "loss": 0.3, + "step": 14660 + }, + { + "epoch": 0.7331, + "grad_norm": 2.397662401199341, + "learning_rate": 4.034949251994791e-06, + "loss": 0.9724, + "step": 14662 + }, + { + "epoch": 0.7332, + "grad_norm": 9.637931823730469, + "learning_rate": 4.032147980469072e-06, + "loss": 1.5403, + "step": 14664 + }, + { + "epoch": 0.7333, + "grad_norm": 6.9481201171875, + "learning_rate": 4.029347436108024e-06, + "loss": 0.7567, + "step": 14666 + }, + { + "epoch": 0.7334, + "grad_norm": 4.428175449371338, + "learning_rate": 4.026547619252883e-06, + "loss": 0.8091, + "step": 14668 + }, + { + "epoch": 0.7335, + "grad_norm": 1.6918411254882812, + "learning_rate": 4.023748530244789e-06, + "loss": 0.665, + "step": 14670 + }, + { + "epoch": 0.7336, + "grad_norm": 2.1431686878204346, + "learning_rate": 4.020950169424815e-06, + "loss": 0.265, + "step": 14672 + }, + { + "epoch": 0.7337, + "grad_norm": 3.2758307456970215, + "learning_rate": 4.018152537133919e-06, + "loss": 1.2062, + "step": 14674 + }, + { + "epoch": 0.7338, + "grad_norm": 5.627645969390869, + "learning_rate": 4.015355633712996e-06, + "loss": 1.2193, + "step": 14676 + }, + { + "epoch": 0.7339, + "grad_norm": 5.189601421356201, + "learning_rate": 4.012559459502835e-06, + "loss": 1.147, + "step": 14678 + }, + { + "epoch": 0.734, + "grad_norm": 5.260983467102051, + "learning_rate": 4.009764014844143e-06, + "loss": 0.8694, + "step": 14680 + }, + { + "epoch": 0.7341, + "grad_norm": 3.240495204925537, + "learning_rate": 4.006969300077535e-06, + "loss": 0.9094, + "step": 14682 + }, + { + "epoch": 0.7342, + "grad_norm": 3.8346598148345947, + "learning_rate": 4.004175315543538e-06, + "loss": 0.8253, + "step": 14684 + }, + { + "epoch": 0.7343, + "grad_norm": 4.7977423667907715, + "learning_rate": 4.001382061582593e-06, + "loss": 1.4524, + "step": 14686 + }, + { + "epoch": 0.7344, + "grad_norm": 4.23897647857666, + "learning_rate": 3.998589538535046e-06, + "loss": 0.6244, + "step": 14688 + }, + { + "epoch": 0.7345, + "grad_norm": 6.121820449829102, + "learning_rate": 3.9957977467411615e-06, + "loss": 1.0367, + "step": 14690 + }, + { + "epoch": 0.7346, + "grad_norm": 11.180745124816895, + "learning_rate": 3.993006686541108e-06, + "loss": 1.091, + "step": 14692 + }, + { + "epoch": 0.7347, + "grad_norm": 8.485357284545898, + "learning_rate": 3.990216358274969e-06, + "loss": 1.3443, + "step": 14694 + }, + { + "epoch": 0.7348, + "grad_norm": 9.32945728302002, + "learning_rate": 3.987426762282733e-06, + "loss": 0.9983, + "step": 14696 + }, + { + "epoch": 0.7349, + "grad_norm": 4.06540584564209, + "learning_rate": 3.984637898904315e-06, + "loss": 1.366, + "step": 14698 + }, + { + "epoch": 0.735, + "grad_norm": 2.382772207260132, + "learning_rate": 3.981849768479516e-06, + "loss": 0.2839, + "step": 14700 + }, + { + "epoch": 0.7351, + "grad_norm": 3.034963607788086, + "learning_rate": 3.979062371348074e-06, + "loss": 1.0643, + "step": 14702 + }, + { + "epoch": 0.7352, + "grad_norm": 3.1066982746124268, + "learning_rate": 3.976275707849616e-06, + "loss": 1.0823, + "step": 14704 + }, + { + "epoch": 0.7353, + "grad_norm": 6.32686185836792, + "learning_rate": 3.973489778323688e-06, + "loss": 0.7019, + "step": 14706 + }, + { + "epoch": 0.7354, + "grad_norm": 4.812687397003174, + "learning_rate": 3.970704583109755e-06, + "loss": 0.6125, + "step": 14708 + }, + { + "epoch": 0.7355, + "grad_norm": 12.541725158691406, + "learning_rate": 3.967920122547175e-06, + "loss": 0.8949, + "step": 14710 + }, + { + "epoch": 0.7356, + "grad_norm": 5.446384429931641, + "learning_rate": 3.965136396975235e-06, + "loss": 0.825, + "step": 14712 + }, + { + "epoch": 0.7357, + "grad_norm": 4.681289196014404, + "learning_rate": 3.962353406733117e-06, + "loss": 1.0584, + "step": 14714 + }, + { + "epoch": 0.7358, + "grad_norm": 3.907163381576538, + "learning_rate": 3.959571152159922e-06, + "loss": 0.9433, + "step": 14716 + }, + { + "epoch": 0.7359, + "grad_norm": 5.236362934112549, + "learning_rate": 3.956789633594661e-06, + "loss": 1.0022, + "step": 14718 + }, + { + "epoch": 0.736, + "grad_norm": 4.229848861694336, + "learning_rate": 3.954008851376252e-06, + "loss": 1.3056, + "step": 14720 + }, + { + "epoch": 0.7361, + "grad_norm": 3.1889727115631104, + "learning_rate": 3.951228805843525e-06, + "loss": 0.7732, + "step": 14722 + }, + { + "epoch": 0.7362, + "grad_norm": 4.0825114250183105, + "learning_rate": 3.94844949733522e-06, + "loss": 1.2076, + "step": 14724 + }, + { + "epoch": 0.7363, + "grad_norm": 0.2997003495693207, + "learning_rate": 3.945670926189987e-06, + "loss": 0.3819, + "step": 14726 + }, + { + "epoch": 0.7364, + "grad_norm": 3.198819398880005, + "learning_rate": 3.942893092746387e-06, + "loss": 0.1755, + "step": 14728 + }, + { + "epoch": 0.7365, + "grad_norm": 6.121098041534424, + "learning_rate": 3.940115997342892e-06, + "loss": 1.4097, + "step": 14730 + }, + { + "epoch": 0.7366, + "grad_norm": 3.8880765438079834, + "learning_rate": 3.937339640317879e-06, + "loss": 0.9245, + "step": 14732 + }, + { + "epoch": 0.7367, + "grad_norm": 3.2030532360076904, + "learning_rate": 3.9345640220096415e-06, + "loss": 0.9551, + "step": 14734 + }, + { + "epoch": 0.7368, + "grad_norm": 2.5664310455322266, + "learning_rate": 3.931789142756377e-06, + "loss": 0.2318, + "step": 14736 + }, + { + "epoch": 0.7369, + "grad_norm": 4.033813953399658, + "learning_rate": 3.929015002896205e-06, + "loss": 1.4227, + "step": 14738 + }, + { + "epoch": 0.737, + "grad_norm": 5.834567070007324, + "learning_rate": 3.9262416027671354e-06, + "loss": 1.6877, + "step": 14740 + }, + { + "epoch": 0.7371, + "grad_norm": 9.957584381103516, + "learning_rate": 3.9234689427071006e-06, + "loss": 1.2825, + "step": 14742 + }, + { + "epoch": 0.7372, + "grad_norm": 6.897633075714111, + "learning_rate": 3.920697023053949e-06, + "loss": 0.8475, + "step": 14744 + }, + { + "epoch": 0.7373, + "grad_norm": 7.908332347869873, + "learning_rate": 3.917925844145418e-06, + "loss": 0.9139, + "step": 14746 + }, + { + "epoch": 0.7374, + "grad_norm": 4.491857528686523, + "learning_rate": 3.915155406319181e-06, + "loss": 0.9097, + "step": 14748 + }, + { + "epoch": 0.7375, + "grad_norm": 4.541660785675049, + "learning_rate": 3.912385709912794e-06, + "loss": 1.0164, + "step": 14750 + }, + { + "epoch": 0.7376, + "grad_norm": 3.425809144973755, + "learning_rate": 3.9096167552637454e-06, + "loss": 0.9776, + "step": 14752 + }, + { + "epoch": 0.7377, + "grad_norm": 3.4742956161499023, + "learning_rate": 3.90684854270942e-06, + "loss": 0.7734, + "step": 14754 + }, + { + "epoch": 0.7378, + "grad_norm": 0.8053838610649109, + "learning_rate": 3.90408107258712e-06, + "loss": 0.4779, + "step": 14756 + }, + { + "epoch": 0.7379, + "grad_norm": 7.993101119995117, + "learning_rate": 3.901314345234047e-06, + "loss": 1.0579, + "step": 14758 + }, + { + "epoch": 0.738, + "grad_norm": 7.221048831939697, + "learning_rate": 3.898548360987325e-06, + "loss": 1.4634, + "step": 14760 + }, + { + "epoch": 0.7381, + "grad_norm": 3.7853286266326904, + "learning_rate": 3.895783120183975e-06, + "loss": 0.8623, + "step": 14762 + }, + { + "epoch": 0.7382, + "grad_norm": 3.083833694458008, + "learning_rate": 3.893018623160938e-06, + "loss": 0.2012, + "step": 14764 + }, + { + "epoch": 0.7383, + "grad_norm": 2.9775660037994385, + "learning_rate": 3.890254870255056e-06, + "loss": 0.7153, + "step": 14766 + }, + { + "epoch": 0.7384, + "grad_norm": 3.603158473968506, + "learning_rate": 3.887491861803085e-06, + "loss": 1.0336, + "step": 14768 + }, + { + "epoch": 0.7385, + "grad_norm": 4.9408159255981445, + "learning_rate": 3.88472959814169e-06, + "loss": 0.6889, + "step": 14770 + }, + { + "epoch": 0.7386, + "grad_norm": 4.934737205505371, + "learning_rate": 3.88196807960744e-06, + "loss": 0.4043, + "step": 14772 + }, + { + "epoch": 0.7387, + "grad_norm": 0.20249487459659576, + "learning_rate": 3.879207306536828e-06, + "loss": 1.0269, + "step": 14774 + }, + { + "epoch": 0.7388, + "grad_norm": 3.6515722274780273, + "learning_rate": 3.876447279266238e-06, + "loss": 0.7732, + "step": 14776 + }, + { + "epoch": 0.7389, + "grad_norm": 3.2191414833068848, + "learning_rate": 3.87368799813197e-06, + "loss": 1.0229, + "step": 14778 + }, + { + "epoch": 0.739, + "grad_norm": 4.591108322143555, + "learning_rate": 3.8709294634702374e-06, + "loss": 1.9201, + "step": 14780 + }, + { + "epoch": 0.7391, + "grad_norm": 9.218032836914062, + "learning_rate": 3.868171675617155e-06, + "loss": 1.3423, + "step": 14782 + }, + { + "epoch": 0.7392, + "grad_norm": 3.1075756549835205, + "learning_rate": 3.86541463490876e-06, + "loss": 1.5511, + "step": 14784 + }, + { + "epoch": 0.7393, + "grad_norm": 3.3982341289520264, + "learning_rate": 3.862658341680977e-06, + "loss": 1.5141, + "step": 14786 + }, + { + "epoch": 0.7394, + "grad_norm": 5.537563323974609, + "learning_rate": 3.859902796269664e-06, + "loss": 1.3743, + "step": 14788 + }, + { + "epoch": 0.7395, + "grad_norm": 7.23355770111084, + "learning_rate": 3.857147999010568e-06, + "loss": 0.6442, + "step": 14790 + }, + { + "epoch": 0.7396, + "grad_norm": 6.09028959274292, + "learning_rate": 3.854393950239356e-06, + "loss": 1.3055, + "step": 14792 + }, + { + "epoch": 0.7397, + "grad_norm": 3.982866048812866, + "learning_rate": 3.8516406502916025e-06, + "loss": 0.952, + "step": 14794 + }, + { + "epoch": 0.7398, + "grad_norm": 3.7030210494995117, + "learning_rate": 3.848888099502779e-06, + "loss": 0.624, + "step": 14796 + }, + { + "epoch": 0.7399, + "grad_norm": 3.207655906677246, + "learning_rate": 3.846136298208285e-06, + "loss": 1.0001, + "step": 14798 + }, + { + "epoch": 0.74, + "grad_norm": 5.9700422286987305, + "learning_rate": 3.8433852467434175e-06, + "loss": 1.047, + "step": 14800 + }, + { + "epoch": 0.7401, + "grad_norm": 4.6681742668151855, + "learning_rate": 3.840634945443382e-06, + "loss": 1.2964, + "step": 14802 + }, + { + "epoch": 0.7402, + "grad_norm": 4.476190567016602, + "learning_rate": 3.8378853946432956e-06, + "loss": 0.6566, + "step": 14804 + }, + { + "epoch": 0.7403, + "grad_norm": 5.230774879455566, + "learning_rate": 3.835136594678182e-06, + "loss": 0.7985, + "step": 14806 + }, + { + "epoch": 0.7404, + "grad_norm": 6.797305107116699, + "learning_rate": 3.832388545882975e-06, + "loss": 1.0051, + "step": 14808 + }, + { + "epoch": 0.7405, + "grad_norm": 2.2441835403442383, + "learning_rate": 3.829641248592515e-06, + "loss": 0.7644, + "step": 14810 + }, + { + "epoch": 0.7406, + "grad_norm": 2.247741460800171, + "learning_rate": 3.826894703141552e-06, + "loss": 0.291, + "step": 14812 + }, + { + "epoch": 0.7407, + "grad_norm": 4.331024169921875, + "learning_rate": 3.824148909864744e-06, + "loss": 0.5226, + "step": 14814 + }, + { + "epoch": 0.7408, + "grad_norm": 5.58912992477417, + "learning_rate": 3.821403869096658e-06, + "loss": 0.9864, + "step": 14816 + }, + { + "epoch": 0.7409, + "grad_norm": 7.561180591583252, + "learning_rate": 3.818659581171767e-06, + "loss": 0.5343, + "step": 14818 + }, + { + "epoch": 0.741, + "grad_norm": 3.2607290744781494, + "learning_rate": 3.81591604642446e-06, + "loss": 1.0188, + "step": 14820 + }, + { + "epoch": 0.7411, + "grad_norm": 5.711424350738525, + "learning_rate": 3.8131732651890197e-06, + "loss": 1.0593, + "step": 14822 + }, + { + "epoch": 0.7412, + "grad_norm": 2.8292462825775146, + "learning_rate": 3.810431237799657e-06, + "loss": 0.793, + "step": 14824 + }, + { + "epoch": 0.7413, + "grad_norm": 3.64056134223938, + "learning_rate": 3.8076899645904662e-06, + "loss": 1.1821, + "step": 14826 + }, + { + "epoch": 0.7414, + "grad_norm": 3.90848708152771, + "learning_rate": 3.804949445895473e-06, + "loss": 0.3784, + "step": 14828 + }, + { + "epoch": 0.7415, + "grad_norm": 3.7369322776794434, + "learning_rate": 3.8022096820486023e-06, + "loss": 0.5459, + "step": 14830 + }, + { + "epoch": 0.7416, + "grad_norm": 4.846248626708984, + "learning_rate": 3.7994706733836738e-06, + "loss": 0.4409, + "step": 14832 + }, + { + "epoch": 0.7417, + "grad_norm": 4.3919453620910645, + "learning_rate": 3.7967324202344433e-06, + "loss": 0.9807, + "step": 14834 + }, + { + "epoch": 0.7418, + "grad_norm": 9.054425239562988, + "learning_rate": 3.793994922934544e-06, + "loss": 1.5203, + "step": 14836 + }, + { + "epoch": 0.7419, + "grad_norm": 3.5159237384796143, + "learning_rate": 3.791258181817542e-06, + "loss": 1.0238, + "step": 14838 + }, + { + "epoch": 0.742, + "grad_norm": 12.05142593383789, + "learning_rate": 3.7885221972168974e-06, + "loss": 1.3076, + "step": 14840 + }, + { + "epoch": 0.7421, + "grad_norm": 10.191295623779297, + "learning_rate": 3.785786969465981e-06, + "loss": 1.1057, + "step": 14842 + }, + { + "epoch": 0.7422, + "grad_norm": 5.881371021270752, + "learning_rate": 3.783052498898073e-06, + "loss": 0.9463, + "step": 14844 + }, + { + "epoch": 0.7423, + "grad_norm": 3.149933099746704, + "learning_rate": 3.7803187858463607e-06, + "loss": 0.7014, + "step": 14846 + }, + { + "epoch": 0.7424, + "grad_norm": 4.375971794128418, + "learning_rate": 3.7775858306439374e-06, + "loss": 1.2944, + "step": 14848 + }, + { + "epoch": 0.7425, + "grad_norm": 13.022995948791504, + "learning_rate": 3.774853633623806e-06, + "loss": 0.9853, + "step": 14850 + }, + { + "epoch": 0.7426, + "grad_norm": 14.468046188354492, + "learning_rate": 3.772122195118877e-06, + "loss": 1.2889, + "step": 14852 + }, + { + "epoch": 0.7427, + "grad_norm": 0.617839515209198, + "learning_rate": 3.7693915154619664e-06, + "loss": 0.0095, + "step": 14854 + }, + { + "epoch": 0.7428, + "grad_norm": 7.675257205963135, + "learning_rate": 3.766661594985801e-06, + "loss": 1.2491, + "step": 14856 + }, + { + "epoch": 0.7429, + "grad_norm": 6.536868572235107, + "learning_rate": 3.7639324340230086e-06, + "loss": 0.5946, + "step": 14858 + }, + { + "epoch": 0.743, + "grad_norm": 4.37770414352417, + "learning_rate": 3.7612040329061405e-06, + "loss": 1.3839, + "step": 14860 + }, + { + "epoch": 0.7431, + "grad_norm": 13.265092849731445, + "learning_rate": 3.7584763919676294e-06, + "loss": 0.8057, + "step": 14862 + }, + { + "epoch": 0.7432, + "grad_norm": 6.1341118812561035, + "learning_rate": 3.7557495115398446e-06, + "loss": 0.6183, + "step": 14864 + }, + { + "epoch": 0.7433, + "grad_norm": 2.80611252784729, + "learning_rate": 3.7530233919550374e-06, + "loss": 1.082, + "step": 14866 + }, + { + "epoch": 0.7434, + "grad_norm": 9.35041332244873, + "learning_rate": 3.7502980335453777e-06, + "loss": 0.6073, + "step": 14868 + }, + { + "epoch": 0.7435, + "grad_norm": 1.797986388206482, + "learning_rate": 3.747573436642952e-06, + "loss": 0.4006, + "step": 14870 + }, + { + "epoch": 0.7436, + "grad_norm": 2.6596739292144775, + "learning_rate": 3.7448496015797296e-06, + "loss": 0.3345, + "step": 14872 + }, + { + "epoch": 0.7437, + "grad_norm": 3.202688455581665, + "learning_rate": 3.742126528687614e-06, + "loss": 1.0676, + "step": 14874 + }, + { + "epoch": 0.7438, + "grad_norm": 0.933582603931427, + "learning_rate": 3.7394042182983983e-06, + "loss": 0.1978, + "step": 14876 + }, + { + "epoch": 0.7439, + "grad_norm": 3.953282594680786, + "learning_rate": 3.7366826707437874e-06, + "loss": 1.1182, + "step": 14878 + }, + { + "epoch": 0.744, + "grad_norm": 4.29925012588501, + "learning_rate": 3.7339618863553983e-06, + "loss": 0.7817, + "step": 14880 + }, + { + "epoch": 0.7441, + "grad_norm": 10.442138671875, + "learning_rate": 3.731241865464741e-06, + "loss": 1.0355, + "step": 14882 + }, + { + "epoch": 0.7442, + "grad_norm": 8.599416732788086, + "learning_rate": 3.728522608403249e-06, + "loss": 1.313, + "step": 14884 + }, + { + "epoch": 0.7443, + "grad_norm": 4.960334777832031, + "learning_rate": 3.725804115502254e-06, + "loss": 0.3532, + "step": 14886 + }, + { + "epoch": 0.7444, + "grad_norm": 3.4707207679748535, + "learning_rate": 3.723086387092997e-06, + "loss": 0.7267, + "step": 14888 + }, + { + "epoch": 0.7445, + "grad_norm": 3.017688035964966, + "learning_rate": 3.7203694235066224e-06, + "loss": 1.1688, + "step": 14890 + }, + { + "epoch": 0.7446, + "grad_norm": 5.351512432098389, + "learning_rate": 3.7176532250741857e-06, + "loss": 0.8823, + "step": 14892 + }, + { + "epoch": 0.7447, + "grad_norm": 4.036864280700684, + "learning_rate": 3.714937792126647e-06, + "loss": 0.7502, + "step": 14894 + }, + { + "epoch": 0.7448, + "grad_norm": 10.438506126403809, + "learning_rate": 3.7122231249948747e-06, + "loss": 1.2173, + "step": 14896 + }, + { + "epoch": 0.7449, + "grad_norm": 3.3652262687683105, + "learning_rate": 3.7095092240096407e-06, + "loss": 1.0459, + "step": 14898 + }, + { + "epoch": 0.745, + "grad_norm": 7.7993364334106445, + "learning_rate": 3.7067960895016277e-06, + "loss": 0.7276, + "step": 14900 + }, + { + "epoch": 0.7451, + "grad_norm": 6.841578960418701, + "learning_rate": 3.7040837218014215e-06, + "loss": 0.2235, + "step": 14902 + }, + { + "epoch": 0.7452, + "grad_norm": 10.928037643432617, + "learning_rate": 3.7013721212395128e-06, + "loss": 1.4644, + "step": 14904 + }, + { + "epoch": 0.7453, + "grad_norm": 3.8102076053619385, + "learning_rate": 3.6986612881463114e-06, + "loss": 0.9468, + "step": 14906 + }, + { + "epoch": 0.7454, + "grad_norm": 0.7886770963668823, + "learning_rate": 3.6959512228521123e-06, + "loss": 0.4426, + "step": 14908 + }, + { + "epoch": 0.7455, + "grad_norm": 7.105436325073242, + "learning_rate": 3.693241925687141e-06, + "loss": 0.7444, + "step": 14910 + }, + { + "epoch": 0.7456, + "grad_norm": 4.047141075134277, + "learning_rate": 3.6905333969815038e-06, + "loss": 1.1776, + "step": 14912 + }, + { + "epoch": 0.7457, + "grad_norm": 0.5833863615989685, + "learning_rate": 3.6878256370652366e-06, + "loss": 0.2616, + "step": 14914 + }, + { + "epoch": 0.7458, + "grad_norm": 2.5481274127960205, + "learning_rate": 3.685118646268272e-06, + "loss": 0.9745, + "step": 14916 + }, + { + "epoch": 0.7459, + "grad_norm": 4.449697971343994, + "learning_rate": 3.6824124249204384e-06, + "loss": 0.8481, + "step": 14918 + }, + { + "epoch": 0.746, + "grad_norm": 5.37776517868042, + "learning_rate": 3.679706973351491e-06, + "loss": 0.703, + "step": 14920 + }, + { + "epoch": 0.7461, + "grad_norm": 3.68676495552063, + "learning_rate": 3.6770022918910787e-06, + "loss": 1.0816, + "step": 14922 + }, + { + "epoch": 0.7462, + "grad_norm": 2.8574271202087402, + "learning_rate": 3.674298380868756e-06, + "loss": 1.1891, + "step": 14924 + }, + { + "epoch": 0.7463, + "grad_norm": 3.2700729370117188, + "learning_rate": 3.6715952406139886e-06, + "loss": 0.9586, + "step": 14926 + }, + { + "epoch": 0.7464, + "grad_norm": 2.1280148029327393, + "learning_rate": 3.6688928714561444e-06, + "loss": 0.7252, + "step": 14928 + }, + { + "epoch": 0.7465, + "grad_norm": 8.167734146118164, + "learning_rate": 3.6661912737244996e-06, + "loss": 0.7139, + "step": 14930 + }, + { + "epoch": 0.7466, + "grad_norm": 9.096439361572266, + "learning_rate": 3.663490447748236e-06, + "loss": 1.1597, + "step": 14932 + }, + { + "epoch": 0.7467, + "grad_norm": 5.174562454223633, + "learning_rate": 3.6607903938564405e-06, + "loss": 1.2521, + "step": 14934 + }, + { + "epoch": 0.7468, + "grad_norm": 7.370358943939209, + "learning_rate": 3.658091112378106e-06, + "loss": 1.4407, + "step": 14936 + }, + { + "epoch": 0.7469, + "grad_norm": 7.787032604217529, + "learning_rate": 3.655392603642133e-06, + "loss": 0.2435, + "step": 14938 + }, + { + "epoch": 0.747, + "grad_norm": 5.640602111816406, + "learning_rate": 3.6526948679773256e-06, + "loss": 0.1549, + "step": 14940 + }, + { + "epoch": 0.7471, + "grad_norm": 4.582219123840332, + "learning_rate": 3.649997905712396e-06, + "loss": 0.8253, + "step": 14942 + }, + { + "epoch": 0.7472, + "grad_norm": 5.46951150894165, + "learning_rate": 3.6473017171759563e-06, + "loss": 0.5087, + "step": 14944 + }, + { + "epoch": 0.7473, + "grad_norm": 4.009860515594482, + "learning_rate": 3.6446063026965385e-06, + "loss": 0.5748, + "step": 14946 + }, + { + "epoch": 0.7474, + "grad_norm": 1.5773394107818604, + "learning_rate": 3.6419116626025585e-06, + "loss": 0.4676, + "step": 14948 + }, + { + "epoch": 0.7475, + "grad_norm": 5.953627586364746, + "learning_rate": 3.6392177972223596e-06, + "loss": 0.5395, + "step": 14950 + }, + { + "epoch": 0.7476, + "grad_norm": 4.252641677856445, + "learning_rate": 3.636524706884181e-06, + "loss": 1.4655, + "step": 14952 + }, + { + "epoch": 0.7477, + "grad_norm": 3.937683343887329, + "learning_rate": 3.633832391916159e-06, + "loss": 0.9983, + "step": 14954 + }, + { + "epoch": 0.7478, + "grad_norm": 5.781533241271973, + "learning_rate": 3.6311408526463554e-06, + "loss": 0.48, + "step": 14956 + }, + { + "epoch": 0.7479, + "grad_norm": 4.670941352844238, + "learning_rate": 3.628450089402713e-06, + "loss": 0.2613, + "step": 14958 + }, + { + "epoch": 0.748, + "grad_norm": 2.5804507732391357, + "learning_rate": 3.625760102513103e-06, + "loss": 0.6671, + "step": 14960 + }, + { + "epoch": 0.7481, + "grad_norm": 5.287639617919922, + "learning_rate": 3.6230708923052905e-06, + "loss": 0.9504, + "step": 14962 + }, + { + "epoch": 0.7482, + "grad_norm": 11.71030330657959, + "learning_rate": 3.620382459106946e-06, + "loss": 1.4518, + "step": 14964 + }, + { + "epoch": 0.7483, + "grad_norm": 6.245300769805908, + "learning_rate": 3.617694803245647e-06, + "loss": 1.1257, + "step": 14966 + }, + { + "epoch": 0.7484, + "grad_norm": 6.134958267211914, + "learning_rate": 3.615007925048878e-06, + "loss": 1.0327, + "step": 14968 + }, + { + "epoch": 0.7485, + "grad_norm": 4.0732502937316895, + "learning_rate": 3.612321824844024e-06, + "loss": 1.2811, + "step": 14970 + }, + { + "epoch": 0.7486, + "grad_norm": 2.4655795097351074, + "learning_rate": 3.6096365029583803e-06, + "loss": 0.1377, + "step": 14972 + }, + { + "epoch": 0.7487, + "grad_norm": 4.710424900054932, + "learning_rate": 3.606951959719145e-06, + "loss": 0.5921, + "step": 14974 + }, + { + "epoch": 0.7488, + "grad_norm": 3.5788471698760986, + "learning_rate": 3.604268195453421e-06, + "loss": 1.4917, + "step": 14976 + }, + { + "epoch": 0.7489, + "grad_norm": 0.7016591429710388, + "learning_rate": 3.601585210488218e-06, + "loss": 0.4237, + "step": 14978 + }, + { + "epoch": 0.749, + "grad_norm": 2.6436121463775635, + "learning_rate": 3.598903005150444e-06, + "loss": 1.0342, + "step": 14980 + }, + { + "epoch": 0.7491, + "grad_norm": 3.8722639083862305, + "learning_rate": 3.59622157976693e-06, + "loss": 0.7138, + "step": 14982 + }, + { + "epoch": 0.7492, + "grad_norm": 3.5085721015930176, + "learning_rate": 3.5935409346643835e-06, + "loss": 0.6325, + "step": 14984 + }, + { + "epoch": 0.7493, + "grad_norm": 2.7206192016601562, + "learning_rate": 3.590861070169449e-06, + "loss": 0.7948, + "step": 14986 + }, + { + "epoch": 0.7494, + "grad_norm": 3.828972578048706, + "learning_rate": 3.5881819866086485e-06, + "loss": 0.6783, + "step": 14988 + }, + { + "epoch": 0.7495, + "grad_norm": 4.704100608825684, + "learning_rate": 3.5855036843084213e-06, + "loss": 0.8793, + "step": 14990 + }, + { + "epoch": 0.7496, + "grad_norm": 2.8571038246154785, + "learning_rate": 3.582826163595119e-06, + "loss": 0.5002, + "step": 14992 + }, + { + "epoch": 0.7497, + "grad_norm": 4.599770545959473, + "learning_rate": 3.5801494247949764e-06, + "loss": 1.3183, + "step": 14994 + }, + { + "epoch": 0.7498, + "grad_norm": 2.6399853229522705, + "learning_rate": 3.5774734682341563e-06, + "loss": 0.7517, + "step": 14996 + }, + { + "epoch": 0.7499, + "grad_norm": 3.00002384185791, + "learning_rate": 3.5747982942387125e-06, + "loss": 0.9458, + "step": 14998 + }, + { + "epoch": 0.75, + "grad_norm": 5.928466796875, + "learning_rate": 3.5721239031346067e-06, + "loss": 1.0823, + "step": 15000 + }, + { + "epoch": 0.7501, + "grad_norm": 3.221207618713379, + "learning_rate": 3.569450295247706e-06, + "loss": 1.2209, + "step": 15002 + }, + { + "epoch": 0.7502, + "grad_norm": 2.9632787704467773, + "learning_rate": 3.5667774709037804e-06, + "loss": 0.8148, + "step": 15004 + }, + { + "epoch": 0.7503, + "grad_norm": 5.332133769989014, + "learning_rate": 3.5641054304285062e-06, + "loss": 0.7099, + "step": 15006 + }, + { + "epoch": 0.7504, + "grad_norm": 4.521986484527588, + "learning_rate": 3.5614341741474633e-06, + "loss": 0.6574, + "step": 15008 + }, + { + "epoch": 0.7505, + "grad_norm": 1.5115967988967896, + "learning_rate": 3.5587637023861356e-06, + "loss": 0.3342, + "step": 15010 + }, + { + "epoch": 0.7506, + "grad_norm": 5.544386386871338, + "learning_rate": 3.5560940154699133e-06, + "loss": 0.9403, + "step": 15012 + }, + { + "epoch": 0.7507, + "grad_norm": 5.7554497718811035, + "learning_rate": 3.5534251137240883e-06, + "loss": 1.1688, + "step": 15014 + }, + { + "epoch": 0.7508, + "grad_norm": 5.591619491577148, + "learning_rate": 3.5507569974738575e-06, + "loss": 1.1252, + "step": 15016 + }, + { + "epoch": 0.7509, + "grad_norm": 3.201906442642212, + "learning_rate": 3.5480896670443255e-06, + "loss": 0.9994, + "step": 15018 + }, + { + "epoch": 0.751, + "grad_norm": 1.0629771947860718, + "learning_rate": 3.545423122760493e-06, + "loss": 0.1656, + "step": 15020 + }, + { + "epoch": 0.7511, + "grad_norm": 5.411448001861572, + "learning_rate": 3.542757364947281e-06, + "loss": 0.9565, + "step": 15022 + }, + { + "epoch": 0.7512, + "grad_norm": 1.9406523704528809, + "learning_rate": 3.540092393929494e-06, + "loss": 0.6307, + "step": 15024 + }, + { + "epoch": 0.7513, + "grad_norm": 4.686605930328369, + "learning_rate": 3.537428210031849e-06, + "loss": 0.9349, + "step": 15026 + }, + { + "epoch": 0.7514, + "grad_norm": 12.226398468017578, + "learning_rate": 3.5347648135789823e-06, + "loss": 0.857, + "step": 15028 + }, + { + "epoch": 0.7515, + "grad_norm": 16.442224502563477, + "learning_rate": 3.5321022048954036e-06, + "loss": 1.5276, + "step": 15030 + }, + { + "epoch": 0.7516, + "grad_norm": 2.3236329555511475, + "learning_rate": 3.5294403843055604e-06, + "loss": 0.8317, + "step": 15032 + }, + { + "epoch": 0.7517, + "grad_norm": 6.1127729415893555, + "learning_rate": 3.52677935213377e-06, + "loss": 0.2792, + "step": 15034 + }, + { + "epoch": 0.7518, + "grad_norm": 3.7224326133728027, + "learning_rate": 3.524119108704286e-06, + "loss": 0.6248, + "step": 15036 + }, + { + "epoch": 0.7519, + "grad_norm": 3.4354238510131836, + "learning_rate": 3.521459654341244e-06, + "loss": 0.6474, + "step": 15038 + }, + { + "epoch": 0.752, + "grad_norm": 8.613937377929688, + "learning_rate": 3.5188009893686916e-06, + "loss": 1.0121, + "step": 15040 + }, + { + "epoch": 0.7521, + "grad_norm": 2.528186082839966, + "learning_rate": 3.516143114110582e-06, + "loss": 0.1949, + "step": 15042 + }, + { + "epoch": 0.7522, + "grad_norm": 1.6412630081176758, + "learning_rate": 3.5134860288907602e-06, + "loss": 1.1671, + "step": 15044 + }, + { + "epoch": 0.7523, + "grad_norm": 4.989923477172852, + "learning_rate": 3.510829734032993e-06, + "loss": 1.1947, + "step": 15046 + }, + { + "epoch": 0.7524, + "grad_norm": 6.450009822845459, + "learning_rate": 3.50817422986094e-06, + "loss": 1.1116, + "step": 15048 + }, + { + "epoch": 0.7525, + "grad_norm": 4.9640655517578125, + "learning_rate": 3.505519516698165e-06, + "loss": 1.0873, + "step": 15050 + }, + { + "epoch": 0.7526, + "grad_norm": 2.839949131011963, + "learning_rate": 3.502865594868136e-06, + "loss": 0.2878, + "step": 15052 + }, + { + "epoch": 0.7527, + "grad_norm": 8.308237075805664, + "learning_rate": 3.5002124646942272e-06, + "loss": 1.4925, + "step": 15054 + }, + { + "epoch": 0.7528, + "grad_norm": 3.905466079711914, + "learning_rate": 3.4975601264997094e-06, + "loss": 2.3554, + "step": 15056 + }, + { + "epoch": 0.7529, + "grad_norm": 1.2920807600021362, + "learning_rate": 3.494908580607774e-06, + "loss": 0.4087, + "step": 15058 + }, + { + "epoch": 0.753, + "grad_norm": 3.8191583156585693, + "learning_rate": 3.492257827341492e-06, + "loss": 0.5222, + "step": 15060 + }, + { + "epoch": 0.7531, + "grad_norm": 7.686995029449463, + "learning_rate": 3.4896078670238544e-06, + "loss": 0.884, + "step": 15062 + }, + { + "epoch": 0.7532, + "grad_norm": 8.683452606201172, + "learning_rate": 3.4869586999777492e-06, + "loss": 1.2686, + "step": 15064 + }, + { + "epoch": 0.7533, + "grad_norm": 2.812953472137451, + "learning_rate": 3.484310326525967e-06, + "loss": 0.247, + "step": 15066 + }, + { + "epoch": 0.7534, + "grad_norm": 5.115321159362793, + "learning_rate": 3.4816627469912147e-06, + "loss": 1.0009, + "step": 15068 + }, + { + "epoch": 0.7535, + "grad_norm": 6.415835380554199, + "learning_rate": 3.479015961696077e-06, + "loss": 0.42, + "step": 15070 + }, + { + "epoch": 0.7536, + "grad_norm": 3.165958881378174, + "learning_rate": 3.476369970963072e-06, + "loss": 1.1527, + "step": 15072 + }, + { + "epoch": 0.7537, + "grad_norm": 5.444614410400391, + "learning_rate": 3.4737247751145897e-06, + "loss": 1.3473, + "step": 15074 + }, + { + "epoch": 0.7538, + "grad_norm": 16.602188110351562, + "learning_rate": 3.4710803744729517e-06, + "loss": 1.0354, + "step": 15076 + }, + { + "epoch": 0.7539, + "grad_norm": 5.057973384857178, + "learning_rate": 3.468436769360368e-06, + "loss": 0.9722, + "step": 15078 + }, + { + "epoch": 0.754, + "grad_norm": 3.712124824523926, + "learning_rate": 3.4657939600989453e-06, + "loss": 0.9223, + "step": 15080 + }, + { + "epoch": 0.7541, + "grad_norm": 0.5733520984649658, + "learning_rate": 3.4631519470107124e-06, + "loss": 0.4491, + "step": 15082 + }, + { + "epoch": 0.7542, + "grad_norm": 3.1638336181640625, + "learning_rate": 3.4605107304175855e-06, + "loss": 1.0425, + "step": 15084 + }, + { + "epoch": 0.7543, + "grad_norm": 2.152747869491577, + "learning_rate": 3.4578703106413903e-06, + "loss": 0.9505, + "step": 15086 + }, + { + "epoch": 0.7544, + "grad_norm": 9.641358375549316, + "learning_rate": 3.455230688003852e-06, + "loss": 1.3202, + "step": 15088 + }, + { + "epoch": 0.7545, + "grad_norm": 7.905656814575195, + "learning_rate": 3.452591862826603e-06, + "loss": 0.1625, + "step": 15090 + }, + { + "epoch": 0.7546, + "grad_norm": 7.380062580108643, + "learning_rate": 3.4499538354311757e-06, + "loss": 0.7413, + "step": 15092 + }, + { + "epoch": 0.7547, + "grad_norm": 5.224301815032959, + "learning_rate": 3.447316606139004e-06, + "loss": 0.99, + "step": 15094 + }, + { + "epoch": 0.7548, + "grad_norm": 4.702919960021973, + "learning_rate": 3.4446801752714287e-06, + "loss": 0.5537, + "step": 15096 + }, + { + "epoch": 0.7549, + "grad_norm": 3.969726085662842, + "learning_rate": 3.4420445431496887e-06, + "loss": 0.7119, + "step": 15098 + }, + { + "epoch": 0.755, + "grad_norm": 4.102963924407959, + "learning_rate": 3.4394097100949286e-06, + "loss": 1.5827, + "step": 15100 + }, + { + "epoch": 0.7551, + "grad_norm": 4.817896842956543, + "learning_rate": 3.4367756764281956e-06, + "loss": 0.6449, + "step": 15102 + }, + { + "epoch": 0.7552, + "grad_norm": 3.62813401222229, + "learning_rate": 3.4341424424704373e-06, + "loss": 0.5737, + "step": 15104 + }, + { + "epoch": 0.7553, + "grad_norm": 13.49328899383545, + "learning_rate": 3.4315100085425034e-06, + "loss": 1.3406, + "step": 15106 + }, + { + "epoch": 0.7554, + "grad_norm": 3.778759002685547, + "learning_rate": 3.4288783749651568e-06, + "loss": 0.6336, + "step": 15108 + }, + { + "epoch": 0.7555, + "grad_norm": 3.3368587493896484, + "learning_rate": 3.4262475420590414e-06, + "loss": 0.3404, + "step": 15110 + }, + { + "epoch": 0.7556, + "grad_norm": 3.6249260902404785, + "learning_rate": 3.4236175101447265e-06, + "loss": 0.1153, + "step": 15112 + }, + { + "epoch": 0.7557, + "grad_norm": 2.1180307865142822, + "learning_rate": 3.420988279542672e-06, + "loss": 0.7932, + "step": 15114 + }, + { + "epoch": 0.7558, + "grad_norm": 3.3652873039245605, + "learning_rate": 3.418359850573234e-06, + "loss": 0.5153, + "step": 15116 + }, + { + "epoch": 0.7559, + "grad_norm": 2.507920503616333, + "learning_rate": 3.4157322235566894e-06, + "loss": 1.0157, + "step": 15118 + }, + { + "epoch": 0.756, + "grad_norm": 12.686071395874023, + "learning_rate": 3.4131053988131947e-06, + "loss": 1.8181, + "step": 15120 + }, + { + "epoch": 0.7561, + "grad_norm": 2.256786823272705, + "learning_rate": 3.4104793766628307e-06, + "loss": 0.6631, + "step": 15122 + }, + { + "epoch": 0.7562, + "grad_norm": 5.047788619995117, + "learning_rate": 3.4078541574255664e-06, + "loss": 0.671, + "step": 15124 + }, + { + "epoch": 0.7563, + "grad_norm": 2.6419436931610107, + "learning_rate": 3.4052297414212776e-06, + "loss": 0.3151, + "step": 15126 + }, + { + "epoch": 0.7564, + "grad_norm": 3.2866790294647217, + "learning_rate": 3.4026061289697397e-06, + "loss": 1.58, + "step": 15128 + }, + { + "epoch": 0.7565, + "grad_norm": 1.959973931312561, + "learning_rate": 3.399983320390633e-06, + "loss": 0.3365, + "step": 15130 + }, + { + "epoch": 0.7566, + "grad_norm": 2.3798375129699707, + "learning_rate": 3.397361316003539e-06, + "loss": 0.2529, + "step": 15132 + }, + { + "epoch": 0.7567, + "grad_norm": 6.0084967613220215, + "learning_rate": 3.3947401161279415e-06, + "loss": 0.8371, + "step": 15134 + }, + { + "epoch": 0.7568, + "grad_norm": 3.892230272293091, + "learning_rate": 3.3921197210832235e-06, + "loss": 0.4353, + "step": 15136 + }, + { + "epoch": 0.7569, + "grad_norm": 3.5661909580230713, + "learning_rate": 3.3895001311886745e-06, + "loss": 0.4038, + "step": 15138 + }, + { + "epoch": 0.757, + "grad_norm": 3.5036566257476807, + "learning_rate": 3.3868813467634833e-06, + "loss": 0.71, + "step": 15140 + }, + { + "epoch": 0.7571, + "grad_norm": 3.4290876388549805, + "learning_rate": 3.3842633681267356e-06, + "loss": 0.9799, + "step": 15142 + }, + { + "epoch": 0.7572, + "grad_norm": 3.082249164581299, + "learning_rate": 3.381646195597437e-06, + "loss": 0.9083, + "step": 15144 + }, + { + "epoch": 0.7573, + "grad_norm": 2.529841661453247, + "learning_rate": 3.379029829494469e-06, + "loss": 1.2261, + "step": 15146 + }, + { + "epoch": 0.7574, + "grad_norm": 4.650539875030518, + "learning_rate": 3.376414270136633e-06, + "loss": 1.0175, + "step": 15148 + }, + { + "epoch": 0.7575, + "grad_norm": 5.594895362854004, + "learning_rate": 3.3737995178426276e-06, + "loss": 0.6578, + "step": 15150 + }, + { + "epoch": 0.7576, + "grad_norm": 9.341703414916992, + "learning_rate": 3.3711855729310482e-06, + "loss": 0.8656, + "step": 15152 + }, + { + "epoch": 0.7577, + "grad_norm": 5.117753028869629, + "learning_rate": 3.3685724357204052e-06, + "loss": 1.103, + "step": 15154 + }, + { + "epoch": 0.7578, + "grad_norm": 11.698408126831055, + "learning_rate": 3.3659601065290893e-06, + "loss": 0.5851, + "step": 15156 + }, + { + "epoch": 0.7579, + "grad_norm": 2.0856688022613525, + "learning_rate": 3.3633485856754143e-06, + "loss": 0.9119, + "step": 15158 + }, + { + "epoch": 0.758, + "grad_norm": 3.974963665008545, + "learning_rate": 3.360737873477584e-06, + "loss": 0.6842, + "step": 15160 + }, + { + "epoch": 0.7581, + "grad_norm": 6.430545330047607, + "learning_rate": 3.358127970253704e-06, + "loss": 1.0076, + "step": 15162 + }, + { + "epoch": 0.7582, + "grad_norm": 7.942394256591797, + "learning_rate": 3.355518876321787e-06, + "loss": 0.6182, + "step": 15164 + }, + { + "epoch": 0.7583, + "grad_norm": 8.862954139709473, + "learning_rate": 3.352910591999734e-06, + "loss": 0.7122, + "step": 15166 + }, + { + "epoch": 0.7584, + "grad_norm": 2.7997868061065674, + "learning_rate": 3.3503031176053657e-06, + "loss": 0.3796, + "step": 15168 + }, + { + "epoch": 0.7585, + "grad_norm": 3.1568822860717773, + "learning_rate": 3.3476964534563927e-06, + "loss": 0.7744, + "step": 15170 + }, + { + "epoch": 0.7586, + "grad_norm": 3.1697001457214355, + "learning_rate": 3.3450905998704274e-06, + "loss": 0.972, + "step": 15172 + }, + { + "epoch": 0.7587, + "grad_norm": 3.5245442390441895, + "learning_rate": 3.342485557164986e-06, + "loss": 1.1837, + "step": 15174 + }, + { + "epoch": 0.7588, + "grad_norm": 4.078039169311523, + "learning_rate": 3.3398813256574847e-06, + "loss": 1.3526, + "step": 15176 + }, + { + "epoch": 0.7589, + "grad_norm": 8.370218276977539, + "learning_rate": 3.3372779056652427e-06, + "loss": 0.6536, + "step": 15178 + }, + { + "epoch": 0.759, + "grad_norm": 3.8522279262542725, + "learning_rate": 3.3346752975054763e-06, + "loss": 1.2373, + "step": 15180 + }, + { + "epoch": 0.7591, + "grad_norm": 7.207118988037109, + "learning_rate": 3.3320735014953078e-06, + "loss": 0.751, + "step": 15182 + }, + { + "epoch": 0.7592, + "grad_norm": 3.438723564147949, + "learning_rate": 3.3294725179517573e-06, + "loss": 1.1898, + "step": 15184 + }, + { + "epoch": 0.7593, + "grad_norm": 6.0493574142456055, + "learning_rate": 3.3268723471917463e-06, + "loss": 0.7446, + "step": 15186 + }, + { + "epoch": 0.7594, + "grad_norm": 4.5268964767456055, + "learning_rate": 3.3242729895320945e-06, + "loss": 0.6655, + "step": 15188 + }, + { + "epoch": 0.7595, + "grad_norm": 1.7901519536972046, + "learning_rate": 3.3216744452895356e-06, + "loss": 1.1149, + "step": 15190 + }, + { + "epoch": 0.7596, + "grad_norm": 5.392563343048096, + "learning_rate": 3.3190767147806825e-06, + "loss": 1.5827, + "step": 15192 + }, + { + "epoch": 0.7597, + "grad_norm": 11.424216270446777, + "learning_rate": 3.316479798322072e-06, + "loss": 0.9572, + "step": 15194 + }, + { + "epoch": 0.7598, + "grad_norm": 2.998189687728882, + "learning_rate": 3.3138836962301192e-06, + "loss": 0.158, + "step": 15196 + }, + { + "epoch": 0.7599, + "grad_norm": 10.055610656738281, + "learning_rate": 3.3112884088211593e-06, + "loss": 1.5224, + "step": 15198 + }, + { + "epoch": 0.76, + "grad_norm": 7.772680282592773, + "learning_rate": 3.308693936411421e-06, + "loss": 1.7185, + "step": 15200 + }, + { + "epoch": 0.7601, + "grad_norm": 4.786521911621094, + "learning_rate": 3.306100279317024e-06, + "loss": 0.9198, + "step": 15202 + }, + { + "epoch": 0.7602, + "grad_norm": 5.69362735748291, + "learning_rate": 3.3035074378540087e-06, + "loss": 0.6889, + "step": 15204 + }, + { + "epoch": 0.7603, + "grad_norm": 4.203535556793213, + "learning_rate": 3.3009154123382936e-06, + "loss": 0.7089, + "step": 15206 + }, + { + "epoch": 0.7604, + "grad_norm": 4.0263285636901855, + "learning_rate": 3.2983242030857177e-06, + "loss": 0.8008, + "step": 15208 + }, + { + "epoch": 0.7605, + "grad_norm": 2.0823709964752197, + "learning_rate": 3.2957338104120096e-06, + "loss": 0.6808, + "step": 15210 + }, + { + "epoch": 0.7606, + "grad_norm": 7.030992031097412, + "learning_rate": 3.2931442346328e-06, + "loss": 0.8252, + "step": 15212 + }, + { + "epoch": 0.7607, + "grad_norm": 3.008441209793091, + "learning_rate": 3.2905554760636225e-06, + "loss": 0.7238, + "step": 15214 + }, + { + "epoch": 0.7608, + "grad_norm": 7.667186260223389, + "learning_rate": 3.287967535019908e-06, + "loss": 1.3523, + "step": 15216 + }, + { + "epoch": 0.7609, + "grad_norm": 9.197781562805176, + "learning_rate": 3.2853804118169884e-06, + "loss": 0.3474, + "step": 15218 + }, + { + "epoch": 0.761, + "grad_norm": 2.4892141819000244, + "learning_rate": 3.2827941067700996e-06, + "loss": 0.4023, + "step": 15220 + }, + { + "epoch": 0.7611, + "grad_norm": 1.9058319330215454, + "learning_rate": 3.2802086201943728e-06, + "loss": 1.2634, + "step": 15222 + }, + { + "epoch": 0.7612, + "grad_norm": 10.904255867004395, + "learning_rate": 3.2776239524048426e-06, + "loss": 1.5571, + "step": 15224 + }, + { + "epoch": 0.7613, + "grad_norm": 2.852635383605957, + "learning_rate": 3.2750401037164415e-06, + "loss": 0.6026, + "step": 15226 + }, + { + "epoch": 0.7614, + "grad_norm": 7.375488758087158, + "learning_rate": 3.272457074444003e-06, + "loss": 1.2499, + "step": 15228 + }, + { + "epoch": 0.7615, + "grad_norm": 1.3395270109176636, + "learning_rate": 3.2698748649022693e-06, + "loss": 1.7934, + "step": 15230 + }, + { + "epoch": 0.7616, + "grad_norm": 5.64564847946167, + "learning_rate": 3.2672934754058615e-06, + "loss": 0.8436, + "step": 15232 + }, + { + "epoch": 0.7617, + "grad_norm": 9.079142570495605, + "learning_rate": 3.2647129062693284e-06, + "loss": 0.7045, + "step": 15234 + }, + { + "epoch": 0.7618, + "grad_norm": 4.958298206329346, + "learning_rate": 3.2621331578070936e-06, + "loss": 0.9819, + "step": 15236 + }, + { + "epoch": 0.7619, + "grad_norm": 0.7109386920928955, + "learning_rate": 3.2595542303334924e-06, + "loss": 0.1964, + "step": 15238 + }, + { + "epoch": 0.762, + "grad_norm": 2.933582305908203, + "learning_rate": 3.2569761241627694e-06, + "loss": 0.6769, + "step": 15240 + }, + { + "epoch": 0.7621, + "grad_norm": 10.877847671508789, + "learning_rate": 3.254398839609044e-06, + "loss": 0.9536, + "step": 15242 + }, + { + "epoch": 0.7622, + "grad_norm": 0.37152424454689026, + "learning_rate": 3.2518223769863633e-06, + "loss": 0.5575, + "step": 15244 + }, + { + "epoch": 0.7623, + "grad_norm": 5.059979438781738, + "learning_rate": 3.2492467366086557e-06, + "loss": 1.1077, + "step": 15246 + }, + { + "epoch": 0.7624, + "grad_norm": 0.5173677802085876, + "learning_rate": 3.2466719187897555e-06, + "loss": 0.5958, + "step": 15248 + }, + { + "epoch": 0.7625, + "grad_norm": 4.551392555236816, + "learning_rate": 3.2440979238433977e-06, + "loss": 0.5633, + "step": 15250 + }, + { + "epoch": 0.7626, + "grad_norm": 3.012763261795044, + "learning_rate": 3.241524752083215e-06, + "loss": 0.8423, + "step": 15252 + }, + { + "epoch": 0.7627, + "grad_norm": 5.071713447570801, + "learning_rate": 3.2389524038227405e-06, + "loss": 0.6486, + "step": 15254 + }, + { + "epoch": 0.7628, + "grad_norm": 5.841597557067871, + "learning_rate": 3.2363808793754082e-06, + "loss": 0.6801, + "step": 15256 + }, + { + "epoch": 0.7629, + "grad_norm": 8.654031753540039, + "learning_rate": 3.2338101790545485e-06, + "loss": 0.9116, + "step": 15258 + }, + { + "epoch": 0.763, + "grad_norm": 0.9927638173103333, + "learning_rate": 3.2312403031733943e-06, + "loss": 0.5481, + "step": 15260 + }, + { + "epoch": 0.7631, + "grad_norm": 3.2254137992858887, + "learning_rate": 3.228671252045077e-06, + "loss": 3.5214, + "step": 15262 + }, + { + "epoch": 0.7632, + "grad_norm": 2.714237689971924, + "learning_rate": 3.2261030259826287e-06, + "loss": 0.585, + "step": 15264 + }, + { + "epoch": 0.7633, + "grad_norm": 5.877479553222656, + "learning_rate": 3.223535625298979e-06, + "loss": 0.3457, + "step": 15266 + }, + { + "epoch": 0.7634, + "grad_norm": 27.12339973449707, + "learning_rate": 3.2209690503069545e-06, + "loss": 1.5378, + "step": 15268 + }, + { + "epoch": 0.7635, + "grad_norm": 3.3099751472473145, + "learning_rate": 3.2184033013192962e-06, + "loss": 0.4886, + "step": 15270 + }, + { + "epoch": 0.7636, + "grad_norm": 4.63886022567749, + "learning_rate": 3.2158383786486204e-06, + "loss": 1.2872, + "step": 15272 + }, + { + "epoch": 0.7637, + "grad_norm": 3.5186073780059814, + "learning_rate": 3.213274282607457e-06, + "loss": 1.1971, + "step": 15274 + }, + { + "epoch": 0.7638, + "grad_norm": 9.565726280212402, + "learning_rate": 3.210711013508242e-06, + "loss": 1.2294, + "step": 15276 + }, + { + "epoch": 0.7639, + "grad_norm": 3.531435012817383, + "learning_rate": 3.2081485716632886e-06, + "loss": 0.8857, + "step": 15278 + }, + { + "epoch": 0.764, + "grad_norm": 4.1477251052856445, + "learning_rate": 3.2055869573848374e-06, + "loss": 0.9139, + "step": 15280 + }, + { + "epoch": 0.7641, + "grad_norm": 3.3779666423797607, + "learning_rate": 3.2030261709849997e-06, + "loss": 1.0678, + "step": 15282 + }, + { + "epoch": 0.7642, + "grad_norm": 0.23025767505168915, + "learning_rate": 3.200466212775808e-06, + "loss": 0.7031, + "step": 15284 + }, + { + "epoch": 0.7643, + "grad_norm": 4.373912334442139, + "learning_rate": 3.197907083069184e-06, + "loss": 0.9135, + "step": 15286 + }, + { + "epoch": 0.7644, + "grad_norm": 4.199671745300293, + "learning_rate": 3.195348782176948e-06, + "loss": 0.5114, + "step": 15288 + }, + { + "epoch": 0.7645, + "grad_norm": 4.843865394592285, + "learning_rate": 3.192791310410822e-06, + "loss": 1.0783, + "step": 15290 + }, + { + "epoch": 0.7646, + "grad_norm": 8.776944160461426, + "learning_rate": 3.190234668082427e-06, + "loss": 0.9764, + "step": 15292 + }, + { + "epoch": 0.7647, + "grad_norm": 4.722289562225342, + "learning_rate": 3.1876788555032825e-06, + "loss": 0.5881, + "step": 15294 + }, + { + "epoch": 0.7648, + "grad_norm": 3.783865213394165, + "learning_rate": 3.1851238729848033e-06, + "loss": 0.7041, + "step": 15296 + }, + { + "epoch": 0.7649, + "grad_norm": 3.9567034244537354, + "learning_rate": 3.18256972083831e-06, + "loss": 0.7672, + "step": 15298 + }, + { + "epoch": 0.765, + "grad_norm": 3.06410551071167, + "learning_rate": 3.1800163993750166e-06, + "loss": 0.997, + "step": 15300 + }, + { + "epoch": 0.7651, + "grad_norm": 4.007926940917969, + "learning_rate": 3.1774639089060364e-06, + "loss": 0.2713, + "step": 15302 + }, + { + "epoch": 0.7652, + "grad_norm": 1.0723968744277954, + "learning_rate": 3.174912249742382e-06, + "loss": 0.5237, + "step": 15304 + }, + { + "epoch": 0.7653, + "grad_norm": 14.045269966125488, + "learning_rate": 3.1723614221949738e-06, + "loss": 1.7402, + "step": 15306 + }, + { + "epoch": 0.7654, + "grad_norm": 7.08651065826416, + "learning_rate": 3.1698114265746126e-06, + "loss": 0.7218, + "step": 15308 + }, + { + "epoch": 0.7655, + "grad_norm": 11.19555950164795, + "learning_rate": 3.1672622631920102e-06, + "loss": 0.9129, + "step": 15310 + }, + { + "epoch": 0.7656, + "grad_norm": 9.478501319885254, + "learning_rate": 3.164713932357776e-06, + "loss": 0.8752, + "step": 15312 + }, + { + "epoch": 0.7657, + "grad_norm": 2.046025514602661, + "learning_rate": 3.162166434382412e-06, + "loss": 0.1986, + "step": 15314 + }, + { + "epoch": 0.7658, + "grad_norm": 3.4918975830078125, + "learning_rate": 3.159619769576333e-06, + "loss": 1.2506, + "step": 15316 + }, + { + "epoch": 0.7659, + "grad_norm": 1.144835114479065, + "learning_rate": 3.1570739382498293e-06, + "loss": 0.1455, + "step": 15318 + }, + { + "epoch": 0.766, + "grad_norm": 8.498276710510254, + "learning_rate": 3.1545289407131128e-06, + "loss": 1.5041, + "step": 15320 + }, + { + "epoch": 0.7661, + "grad_norm": 4.405997276306152, + "learning_rate": 3.1519847772762803e-06, + "loss": 0.9918, + "step": 15322 + }, + { + "epoch": 0.7662, + "grad_norm": 3.732877254486084, + "learning_rate": 3.149441448249331e-06, + "loss": 1.6904, + "step": 15324 + }, + { + "epoch": 0.7663, + "grad_norm": 4.900388240814209, + "learning_rate": 3.1468989539421634e-06, + "loss": 0.8335, + "step": 15326 + }, + { + "epoch": 0.7664, + "grad_norm": 4.8048601150512695, + "learning_rate": 3.144357294664565e-06, + "loss": 1.2373, + "step": 15328 + }, + { + "epoch": 0.7665, + "grad_norm": 4.645586967468262, + "learning_rate": 3.1418164707262375e-06, + "loss": 0.8439, + "step": 15330 + }, + { + "epoch": 0.7666, + "grad_norm": 15.249272346496582, + "learning_rate": 3.1392764824367706e-06, + "loss": 0.8487, + "step": 15332 + }, + { + "epoch": 0.7667, + "grad_norm": 5.623498916625977, + "learning_rate": 3.1367373301056535e-06, + "loss": 0.9577, + "step": 15334 + }, + { + "epoch": 0.7668, + "grad_norm": 3.825962543487549, + "learning_rate": 3.134199014042274e-06, + "loss": 0.8342, + "step": 15336 + }, + { + "epoch": 0.7669, + "grad_norm": 12.398581504821777, + "learning_rate": 3.1316615345559188e-06, + "loss": 0.3957, + "step": 15338 + }, + { + "epoch": 0.767, + "grad_norm": 4.342898845672607, + "learning_rate": 3.1291248919557717e-06, + "loss": 0.3203, + "step": 15340 + }, + { + "epoch": 0.7671, + "grad_norm": 6.118057727813721, + "learning_rate": 3.126589086550914e-06, + "loss": 0.6114, + "step": 15342 + }, + { + "epoch": 0.7672, + "grad_norm": 4.7577104568481445, + "learning_rate": 3.124054118650327e-06, + "loss": 0.5874, + "step": 15344 + }, + { + "epoch": 0.7673, + "grad_norm": 1.1999338865280151, + "learning_rate": 3.12151998856289e-06, + "loss": 0.5432, + "step": 15346 + }, + { + "epoch": 0.7674, + "grad_norm": 1.6454336643218994, + "learning_rate": 3.118986696597377e-06, + "loss": 0.9657, + "step": 15348 + }, + { + "epoch": 0.7675, + "grad_norm": 7.386750221252441, + "learning_rate": 3.116454243062459e-06, + "loss": 0.8495, + "step": 15350 + }, + { + "epoch": 0.7676, + "grad_norm": 11.879753112792969, + "learning_rate": 3.113922628266718e-06, + "loss": 1.4374, + "step": 15352 + }, + { + "epoch": 0.7677, + "grad_norm": 3.8569998741149902, + "learning_rate": 3.1113918525186117e-06, + "loss": 0.8243, + "step": 15354 + }, + { + "epoch": 0.7678, + "grad_norm": 9.820329666137695, + "learning_rate": 3.108861916126518e-06, + "loss": 0.6228, + "step": 15356 + }, + { + "epoch": 0.7679, + "grad_norm": 6.095893859863281, + "learning_rate": 3.1063328193986907e-06, + "loss": 1.0873, + "step": 15358 + }, + { + "epoch": 0.768, + "grad_norm": 6.878300189971924, + "learning_rate": 3.103804562643302e-06, + "loss": 0.5053, + "step": 15360 + }, + { + "epoch": 0.7681, + "grad_norm": 2.9050519466400146, + "learning_rate": 3.1012771461684123e-06, + "loss": 0.9432, + "step": 15362 + }, + { + "epoch": 0.7682, + "grad_norm": 3.685292959213257, + "learning_rate": 3.0987505702819687e-06, + "loss": 0.8192, + "step": 15364 + }, + { + "epoch": 0.7683, + "grad_norm": 4.24931526184082, + "learning_rate": 3.096224835291839e-06, + "loss": 0.6475, + "step": 15366 + }, + { + "epoch": 0.7684, + "grad_norm": 4.89659309387207, + "learning_rate": 3.0936999415057712e-06, + "loss": 0.6809, + "step": 15368 + }, + { + "epoch": 0.7685, + "grad_norm": 6.227106094360352, + "learning_rate": 3.091175889231417e-06, + "loss": 0.8564, + "step": 15370 + }, + { + "epoch": 0.7686, + "grad_norm": 4.736580848693848, + "learning_rate": 3.0886526787763237e-06, + "loss": 0.9567, + "step": 15372 + }, + { + "epoch": 0.7687, + "grad_norm": 6.652310371398926, + "learning_rate": 3.086130310447937e-06, + "loss": 1.4127, + "step": 15374 + }, + { + "epoch": 0.7688, + "grad_norm": 1.5121206045150757, + "learning_rate": 3.0836087845536e-06, + "loss": 0.7906, + "step": 15376 + }, + { + "epoch": 0.7689, + "grad_norm": 4.310826301574707, + "learning_rate": 3.081088101400552e-06, + "loss": 0.2175, + "step": 15378 + }, + { + "epoch": 0.769, + "grad_norm": 13.648624420166016, + "learning_rate": 3.0785682612959334e-06, + "loss": 0.8444, + "step": 15380 + }, + { + "epoch": 0.7691, + "grad_norm": 13.085478782653809, + "learning_rate": 3.0760492645467765e-06, + "loss": 1.1537, + "step": 15382 + }, + { + "epoch": 0.7692, + "grad_norm": 2.3424153327941895, + "learning_rate": 3.073531111460013e-06, + "loss": 0.7121, + "step": 15384 + }, + { + "epoch": 0.7693, + "grad_norm": 1.3288617134094238, + "learning_rate": 3.071013802342475e-06, + "loss": 0.1557, + "step": 15386 + }, + { + "epoch": 0.7694, + "grad_norm": 3.7674560546875, + "learning_rate": 3.0684973375008865e-06, + "loss": 0.6884, + "step": 15388 + }, + { + "epoch": 0.7695, + "grad_norm": 7.538422584533691, + "learning_rate": 3.0659817172418694e-06, + "loss": 0.9974, + "step": 15390 + }, + { + "epoch": 0.7696, + "grad_norm": 5.402904033660889, + "learning_rate": 3.063466941871952e-06, + "loss": 1.0828, + "step": 15392 + }, + { + "epoch": 0.7697, + "grad_norm": 8.320656776428223, + "learning_rate": 3.060953011697545e-06, + "loss": 0.8543, + "step": 15394 + }, + { + "epoch": 0.7698, + "grad_norm": 5.197290897369385, + "learning_rate": 3.058439927024962e-06, + "loss": 0.5422, + "step": 15396 + }, + { + "epoch": 0.7699, + "grad_norm": 11.395401000976562, + "learning_rate": 3.0559276881604237e-06, + "loss": 2.3937, + "step": 15398 + }, + { + "epoch": 0.77, + "grad_norm": 0.32994544506073, + "learning_rate": 3.0534162954100264e-06, + "loss": 0.1251, + "step": 15400 + }, + { + "epoch": 0.7701, + "grad_norm": 4.763673305511475, + "learning_rate": 3.0509057490797887e-06, + "loss": 0.7964, + "step": 15402 + }, + { + "epoch": 0.7702, + "grad_norm": 1.5680094957351685, + "learning_rate": 3.0483960494756017e-06, + "loss": 0.4589, + "step": 15404 + }, + { + "epoch": 0.7703, + "grad_norm": 4.106363296508789, + "learning_rate": 3.045887196903271e-06, + "loss": 0.862, + "step": 15406 + }, + { + "epoch": 0.7704, + "grad_norm": 4.034846782684326, + "learning_rate": 3.043379191668492e-06, + "loss": 0.5708, + "step": 15408 + }, + { + "epoch": 0.7705, + "grad_norm": 4.648529052734375, + "learning_rate": 3.040872034076857e-06, + "loss": 0.5585, + "step": 15410 + }, + { + "epoch": 0.7706, + "grad_norm": 2.546984910964966, + "learning_rate": 3.038365724433858e-06, + "loss": 0.5208, + "step": 15412 + }, + { + "epoch": 0.7707, + "grad_norm": 9.469350814819336, + "learning_rate": 3.035860263044873e-06, + "loss": 0.8862, + "step": 15414 + }, + { + "epoch": 0.7708, + "grad_norm": 6.0140767097473145, + "learning_rate": 3.033355650215193e-06, + "loss": 1.2998, + "step": 15416 + }, + { + "epoch": 0.7709, + "grad_norm": 5.055733680725098, + "learning_rate": 3.0308518862499957e-06, + "loss": 0.9235, + "step": 15418 + }, + { + "epoch": 0.771, + "grad_norm": 3.944653034210205, + "learning_rate": 3.028348971454356e-06, + "loss": 0.4357, + "step": 15420 + }, + { + "epoch": 0.7711, + "grad_norm": 2.236046314239502, + "learning_rate": 3.0258469061332463e-06, + "loss": 1.1886, + "step": 15422 + }, + { + "epoch": 0.7712, + "grad_norm": 4.088985919952393, + "learning_rate": 3.023345690591537e-06, + "loss": 0.6447, + "step": 15424 + }, + { + "epoch": 0.7713, + "grad_norm": 2.9230246543884277, + "learning_rate": 3.0208453251339887e-06, + "loss": 0.2434, + "step": 15426 + }, + { + "epoch": 0.7714, + "grad_norm": 5.554233551025391, + "learning_rate": 3.0183458100652752e-06, + "loss": 0.5129, + "step": 15428 + }, + { + "epoch": 0.7715, + "grad_norm": 2.684673547744751, + "learning_rate": 3.015847145689943e-06, + "loss": 1.0927, + "step": 15430 + }, + { + "epoch": 0.7716, + "grad_norm": 4.0280914306640625, + "learning_rate": 3.013349332312451e-06, + "loss": 0.1881, + "step": 15432 + }, + { + "epoch": 0.7717, + "grad_norm": 10.085474967956543, + "learning_rate": 3.0108523702371507e-06, + "loss": 0.3687, + "step": 15434 + }, + { + "epoch": 0.7718, + "grad_norm": 20.9274959564209, + "learning_rate": 3.008356259768285e-06, + "loss": 1.0592, + "step": 15436 + }, + { + "epoch": 0.7719, + "grad_norm": 7.339298725128174, + "learning_rate": 3.0058610012100076e-06, + "loss": 0.5226, + "step": 15438 + }, + { + "epoch": 0.772, + "grad_norm": 2.8144748210906982, + "learning_rate": 3.003366594866345e-06, + "loss": 1.1638, + "step": 15440 + }, + { + "epoch": 0.7721, + "grad_norm": 4.946394443511963, + "learning_rate": 3.000873041041247e-06, + "loss": 0.9529, + "step": 15442 + }, + { + "epoch": 0.7722, + "grad_norm": 27.024219512939453, + "learning_rate": 2.9983803400385313e-06, + "loss": 2.0933, + "step": 15444 + }, + { + "epoch": 0.7723, + "grad_norm": 5.035782814025879, + "learning_rate": 2.9958884921619368e-06, + "loss": 0.7617, + "step": 15446 + }, + { + "epoch": 0.7724, + "grad_norm": 15.200722694396973, + "learning_rate": 2.993397497715086e-06, + "loss": 0.3124, + "step": 15448 + }, + { + "epoch": 0.7725, + "grad_norm": 3.562619686126709, + "learning_rate": 2.990907357001491e-06, + "loss": 1.0688, + "step": 15450 + }, + { + "epoch": 0.7726, + "grad_norm": 6.202802658081055, + "learning_rate": 2.988418070324577e-06, + "loss": 0.8147, + "step": 15452 + }, + { + "epoch": 0.7727, + "grad_norm": 9.683131217956543, + "learning_rate": 2.985929637987652e-06, + "loss": 1.0383, + "step": 15454 + }, + { + "epoch": 0.7728, + "grad_norm": 10.969441413879395, + "learning_rate": 2.983442060293926e-06, + "loss": 1.3066, + "step": 15456 + }, + { + "epoch": 0.7729, + "grad_norm": 5.852087497711182, + "learning_rate": 2.9809553375465006e-06, + "loss": 1.3862, + "step": 15458 + }, + { + "epoch": 0.773, + "grad_norm": 3.5722830295562744, + "learning_rate": 2.978469470048376e-06, + "loss": 0.5746, + "step": 15460 + }, + { + "epoch": 0.7731, + "grad_norm": 3.9603116512298584, + "learning_rate": 2.9759844581024488e-06, + "loss": 0.4312, + "step": 15462 + }, + { + "epoch": 0.7732, + "grad_norm": 5.099301815032959, + "learning_rate": 2.9735003020115095e-06, + "loss": 1.5363, + "step": 15464 + }, + { + "epoch": 0.7733, + "grad_norm": 3.0362517833709717, + "learning_rate": 2.9710170020782435e-06, + "loss": 0.6978, + "step": 15466 + }, + { + "epoch": 0.7734, + "grad_norm": 5.973239898681641, + "learning_rate": 2.968534558605236e-06, + "loss": 0.7028, + "step": 15468 + }, + { + "epoch": 0.7735, + "grad_norm": 3.554172992706299, + "learning_rate": 2.9660529718949628e-06, + "loss": 0.6406, + "step": 15470 + }, + { + "epoch": 0.7736, + "grad_norm": 9.30981731414795, + "learning_rate": 2.963572242249799e-06, + "loss": 1.3489, + "step": 15472 + }, + { + "epoch": 0.7737, + "grad_norm": 4.632805347442627, + "learning_rate": 2.961092369972014e-06, + "loss": 1.3153, + "step": 15474 + }, + { + "epoch": 0.7738, + "grad_norm": 2.698941469192505, + "learning_rate": 2.9586133553637687e-06, + "loss": 0.3968, + "step": 15476 + }, + { + "epoch": 0.7739, + "grad_norm": 4.429535388946533, + "learning_rate": 2.9561351987271337e-06, + "loss": 0.34, + "step": 15478 + }, + { + "epoch": 0.774, + "grad_norm": 7.769199371337891, + "learning_rate": 2.953657900364053e-06, + "loss": 0.8419, + "step": 15480 + }, + { + "epoch": 0.7741, + "grad_norm": 15.372440338134766, + "learning_rate": 2.9511814605763855e-06, + "loss": 1.5561, + "step": 15482 + }, + { + "epoch": 0.7742, + "grad_norm": 2.5001866817474365, + "learning_rate": 2.9487058796658785e-06, + "loss": 0.7489, + "step": 15484 + }, + { + "epoch": 0.7743, + "grad_norm": 6.1473164558410645, + "learning_rate": 2.946231157934166e-06, + "loss": 1.4495, + "step": 15486 + }, + { + "epoch": 0.7744, + "grad_norm": 3.0083703994750977, + "learning_rate": 2.9437572956827965e-06, + "loss": 0.8294, + "step": 15488 + }, + { + "epoch": 0.7745, + "grad_norm": 1.508623480796814, + "learning_rate": 2.9412842932131904e-06, + "loss": 0.4978, + "step": 15490 + }, + { + "epoch": 0.7746, + "grad_norm": 4.810530662536621, + "learning_rate": 2.938812150826684e-06, + "loss": 0.4869, + "step": 15492 + }, + { + "epoch": 0.7747, + "grad_norm": 6.713289737701416, + "learning_rate": 2.9363408688245e-06, + "loss": 0.7483, + "step": 15494 + }, + { + "epoch": 0.7748, + "grad_norm": 4.27460241317749, + "learning_rate": 2.9338704475077527e-06, + "loss": 0.8875, + "step": 15496 + }, + { + "epoch": 0.7749, + "grad_norm": 8.916017532348633, + "learning_rate": 2.9314008871774593e-06, + "loss": 0.5804, + "step": 15498 + }, + { + "epoch": 0.775, + "grad_norm": 5.289140224456787, + "learning_rate": 2.9289321881345257e-06, + "loss": 1.0455, + "step": 15500 + }, + { + "epoch": 0.7751, + "grad_norm": 12.118025779724121, + "learning_rate": 2.926464350679756e-06, + "loss": 0.6271, + "step": 15502 + }, + { + "epoch": 0.7752, + "grad_norm": 4.079357624053955, + "learning_rate": 2.9239973751138495e-06, + "loss": 1.3523, + "step": 15504 + }, + { + "epoch": 0.7753, + "grad_norm": 6.4991865158081055, + "learning_rate": 2.921531261737398e-06, + "loss": 0.9966, + "step": 15506 + }, + { + "epoch": 0.7754, + "grad_norm": 3.0381104946136475, + "learning_rate": 2.919066010850892e-06, + "loss": 0.6936, + "step": 15508 + }, + { + "epoch": 0.7755, + "grad_norm": 10.34044075012207, + "learning_rate": 2.9166016227547135e-06, + "loss": 0.6442, + "step": 15510 + }, + { + "epoch": 0.7756, + "grad_norm": 2.690249443054199, + "learning_rate": 2.9141380977491373e-06, + "loss": 0.8399, + "step": 15512 + }, + { + "epoch": 0.7757, + "grad_norm": 9.433344841003418, + "learning_rate": 2.911675436134347e-06, + "loss": 1.1239, + "step": 15514 + }, + { + "epoch": 0.7758, + "grad_norm": 2.892487049102783, + "learning_rate": 2.9092136382103976e-06, + "loss": 1.2212, + "step": 15516 + }, + { + "epoch": 0.7759, + "grad_norm": 3.8306310176849365, + "learning_rate": 2.9067527042772638e-06, + "loss": 0.1304, + "step": 15518 + }, + { + "epoch": 0.776, + "grad_norm": 5.141617774963379, + "learning_rate": 2.9042926346347932e-06, + "loss": 0.721, + "step": 15520 + }, + { + "epoch": 0.7761, + "grad_norm": 7.569578170776367, + "learning_rate": 2.9018334295827387e-06, + "loss": 0.6471, + "step": 15522 + }, + { + "epoch": 0.7762, + "grad_norm": 4.730265140533447, + "learning_rate": 2.8993750894207563e-06, + "loss": 0.5064, + "step": 15524 + }, + { + "epoch": 0.7763, + "grad_norm": 9.357083320617676, + "learning_rate": 2.8969176144483747e-06, + "loss": 0.6786, + "step": 15526 + }, + { + "epoch": 0.7764, + "grad_norm": 6.37130880355835, + "learning_rate": 2.8944610049650377e-06, + "loss": 1.3003, + "step": 15528 + }, + { + "epoch": 0.7765, + "grad_norm": 1.5845098495483398, + "learning_rate": 2.8920052612700755e-06, + "loss": 0.7558, + "step": 15530 + }, + { + "epoch": 0.7766, + "grad_norm": 7.380557060241699, + "learning_rate": 2.8895503836627105e-06, + "loss": 0.8359, + "step": 15532 + }, + { + "epoch": 0.7767, + "grad_norm": 2.6532814502716064, + "learning_rate": 2.887096372442063e-06, + "loss": 0.6069, + "step": 15534 + }, + { + "epoch": 0.7768, + "grad_norm": 2.536604881286621, + "learning_rate": 2.884643227907147e-06, + "loss": 0.5642, + "step": 15536 + }, + { + "epoch": 0.7769, + "grad_norm": 3.7185404300689697, + "learning_rate": 2.8821909503568703e-06, + "loss": 1.398, + "step": 15538 + }, + { + "epoch": 0.777, + "grad_norm": 4.0456414222717285, + "learning_rate": 2.8797395400900362e-06, + "loss": 1.1505, + "step": 15540 + }, + { + "epoch": 0.7771, + "grad_norm": 5.446933746337891, + "learning_rate": 2.877288997405341e-06, + "loss": 0.9086, + "step": 15542 + }, + { + "epoch": 0.7772, + "grad_norm": 3.045356273651123, + "learning_rate": 2.874839322601375e-06, + "loss": 0.8664, + "step": 15544 + }, + { + "epoch": 0.7773, + "grad_norm": 10.061610221862793, + "learning_rate": 2.8723905159766254e-06, + "loss": 0.9172, + "step": 15546 + }, + { + "epoch": 0.7774, + "grad_norm": 5.209420680999756, + "learning_rate": 2.869942577829471e-06, + "loss": 0.9144, + "step": 15548 + }, + { + "epoch": 0.7775, + "grad_norm": 3.8070411682128906, + "learning_rate": 2.867495508458186e-06, + "loss": 0.9187, + "step": 15550 + }, + { + "epoch": 0.7776, + "grad_norm": 9.346997261047363, + "learning_rate": 2.8650493081609344e-06, + "loss": 0.8821, + "step": 15552 + }, + { + "epoch": 0.7777, + "grad_norm": 7.864978790283203, + "learning_rate": 2.8626039772357884e-06, + "loss": 0.3793, + "step": 15554 + }, + { + "epoch": 0.7778, + "grad_norm": 10.783885955810547, + "learning_rate": 2.860159515980695e-06, + "loss": 0.5521, + "step": 15556 + }, + { + "epoch": 0.7779, + "grad_norm": 4.4483323097229, + "learning_rate": 2.857715924693504e-06, + "loss": 0.9099, + "step": 15558 + }, + { + "epoch": 0.778, + "grad_norm": 6.69064474105835, + "learning_rate": 2.855273203671969e-06, + "loss": 0.2492, + "step": 15560 + }, + { + "epoch": 0.7781, + "grad_norm": 4.328423976898193, + "learning_rate": 2.8528313532137155e-06, + "loss": 1.0754, + "step": 15562 + }, + { + "epoch": 0.7782, + "grad_norm": 4.035939693450928, + "learning_rate": 2.8503903736162876e-06, + "loss": 0.9138, + "step": 15564 + }, + { + "epoch": 0.7783, + "grad_norm": 4.854342937469482, + "learning_rate": 2.8479502651770998e-06, + "loss": 0.9031, + "step": 15566 + }, + { + "epoch": 0.7784, + "grad_norm": 3.963752031326294, + "learning_rate": 2.8455110281934804e-06, + "loss": 0.7034, + "step": 15568 + }, + { + "epoch": 0.7785, + "grad_norm": 3.685314655303955, + "learning_rate": 2.8430726629626416e-06, + "loss": 1.3817, + "step": 15570 + }, + { + "epoch": 0.7786, + "grad_norm": 3.8546013832092285, + "learning_rate": 2.840635169781688e-06, + "loss": 0.6119, + "step": 15572 + }, + { + "epoch": 0.7787, + "grad_norm": 2.246988534927368, + "learning_rate": 2.838198548947627e-06, + "loss": 0.5294, + "step": 15574 + }, + { + "epoch": 0.7788, + "grad_norm": 8.134276390075684, + "learning_rate": 2.8357628007573412e-06, + "loss": 1.1844, + "step": 15576 + }, + { + "epoch": 0.7789, + "grad_norm": 2.320126533508301, + "learning_rate": 2.8333279255076307e-06, + "loss": 0.6564, + "step": 15578 + }, + { + "epoch": 0.779, + "grad_norm": 3.2456438541412354, + "learning_rate": 2.830893923495173e-06, + "loss": 0.5812, + "step": 15580 + }, + { + "epoch": 0.7791, + "grad_norm": 8.47397518157959, + "learning_rate": 2.8284607950165445e-06, + "loss": 0.4959, + "step": 15582 + }, + { + "epoch": 0.7792, + "grad_norm": 4.091839790344238, + "learning_rate": 2.8260285403682153e-06, + "loss": 1.2096, + "step": 15584 + }, + { + "epoch": 0.7793, + "grad_norm": 9.340723037719727, + "learning_rate": 2.823597159846547e-06, + "loss": 0.6489, + "step": 15586 + }, + { + "epoch": 0.7794, + "grad_norm": 3.833657741546631, + "learning_rate": 2.821166653747793e-06, + "loss": 1.2015, + "step": 15588 + }, + { + "epoch": 0.7795, + "grad_norm": 3.502345085144043, + "learning_rate": 2.8187370223681134e-06, + "loss": 1.3537, + "step": 15590 + }, + { + "epoch": 0.7796, + "grad_norm": 2.5314836502075195, + "learning_rate": 2.816308266003541e-06, + "loss": 0.8185, + "step": 15592 + }, + { + "epoch": 0.7797, + "grad_norm": 10.318405151367188, + "learning_rate": 2.8138803849500163e-06, + "loss": 1.2423, + "step": 15594 + }, + { + "epoch": 0.7798, + "grad_norm": 6.722660064697266, + "learning_rate": 2.8114533795033685e-06, + "loss": 1.0701, + "step": 15596 + }, + { + "epoch": 0.7799, + "grad_norm": 4.1410980224609375, + "learning_rate": 2.8090272499593175e-06, + "loss": 0.9691, + "step": 15598 + }, + { + "epoch": 0.78, + "grad_norm": 7.81804895401001, + "learning_rate": 2.8066019966134907e-06, + "loss": 1.3566, + "step": 15600 + }, + { + "epoch": 0.7801, + "grad_norm": 5.906374931335449, + "learning_rate": 2.8041776197613848e-06, + "loss": 1.268, + "step": 15602 + }, + { + "epoch": 0.7802, + "grad_norm": 12.654419898986816, + "learning_rate": 2.8017541196984144e-06, + "loss": 1.0779, + "step": 15604 + }, + { + "epoch": 0.7803, + "grad_norm": 4.494592189788818, + "learning_rate": 2.7993314967198636e-06, + "loss": 0.9692, + "step": 15606 + }, + { + "epoch": 0.7804, + "grad_norm": 3.2717816829681396, + "learning_rate": 2.796909751120931e-06, + "loss": 0.9909, + "step": 15608 + }, + { + "epoch": 0.7805, + "grad_norm": 5.994987964630127, + "learning_rate": 2.794488883196699e-06, + "loss": 0.4995, + "step": 15610 + }, + { + "epoch": 0.7806, + "grad_norm": 5.5135111808776855, + "learning_rate": 2.7920688932421337e-06, + "loss": 0.6211, + "step": 15612 + }, + { + "epoch": 0.7807, + "grad_norm": 5.317834854125977, + "learning_rate": 2.789649781552113e-06, + "loss": 0.3852, + "step": 15614 + }, + { + "epoch": 0.7808, + "grad_norm": 10.430737495422363, + "learning_rate": 2.7872315484213954e-06, + "loss": 1.5548, + "step": 15616 + }, + { + "epoch": 0.7809, + "grad_norm": 7.532837867736816, + "learning_rate": 2.784814194144635e-06, + "loss": 0.8662, + "step": 15618 + }, + { + "epoch": 0.781, + "grad_norm": 7.313412666320801, + "learning_rate": 2.7823977190163788e-06, + "loss": 1.4792, + "step": 15620 + }, + { + "epoch": 0.7811, + "grad_norm": 6.2337565422058105, + "learning_rate": 2.7799821233310676e-06, + "loss": 0.1653, + "step": 15622 + }, + { + "epoch": 0.7812, + "grad_norm": 2.202460289001465, + "learning_rate": 2.7775674073830337e-06, + "loss": 0.702, + "step": 15624 + }, + { + "epoch": 0.7813, + "grad_norm": 9.87565803527832, + "learning_rate": 2.7751535714665025e-06, + "loss": 1.2049, + "step": 15626 + }, + { + "epoch": 0.7814, + "grad_norm": 3.9171998500823975, + "learning_rate": 2.7727406158755943e-06, + "loss": 0.9199, + "step": 15628 + }, + { + "epoch": 0.7815, + "grad_norm": 10.223541259765625, + "learning_rate": 2.7703285409043192e-06, + "loss": 1.2969, + "step": 15630 + }, + { + "epoch": 0.7816, + "grad_norm": 3.8202929496765137, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.5204, + "step": 15632 + }, + { + "epoch": 0.7817, + "grad_norm": 10.440673828125, + "learning_rate": 2.7655070339961777e-06, + "loss": 1.3125, + "step": 15634 + }, + { + "epoch": 0.7818, + "grad_norm": 3.2878928184509277, + "learning_rate": 2.763097602646797e-06, + "loss": 0.9014, + "step": 15636 + }, + { + "epoch": 0.7819, + "grad_norm": 8.750489234924316, + "learning_rate": 2.760689053092019e-06, + "loss": 0.6716, + "step": 15638 + }, + { + "epoch": 0.782, + "grad_norm": 6.276682376861572, + "learning_rate": 2.7582813856253276e-06, + "loss": 1.041, + "step": 15640 + }, + { + "epoch": 0.7821, + "grad_norm": 4.255042552947998, + "learning_rate": 2.7558746005400783e-06, + "loss": 1.5588, + "step": 15642 + }, + { + "epoch": 0.7822, + "grad_norm": 3.303494930267334, + "learning_rate": 2.7534686981295335e-06, + "loss": 0.6325, + "step": 15644 + }, + { + "epoch": 0.7823, + "grad_norm": 2.4988460540771484, + "learning_rate": 2.7510636786868518e-06, + "loss": 0.56, + "step": 15646 + }, + { + "epoch": 0.7824, + "grad_norm": 8.429854393005371, + "learning_rate": 2.7486595425050667e-06, + "loss": 0.8137, + "step": 15648 + }, + { + "epoch": 0.7825, + "grad_norm": 4.212010860443115, + "learning_rate": 2.746256289877126e-06, + "loss": 1.2986, + "step": 15650 + }, + { + "epoch": 0.7826, + "grad_norm": 2.760025978088379, + "learning_rate": 2.7438539210958483e-06, + "loss": 0.4327, + "step": 15652 + }, + { + "epoch": 0.7827, + "grad_norm": 3.5168075561523438, + "learning_rate": 2.741452436453963e-06, + "loss": 0.9683, + "step": 15654 + }, + { + "epoch": 0.7828, + "grad_norm": 6.734898090362549, + "learning_rate": 2.739051836244081e-06, + "loss": 0.9388, + "step": 15656 + }, + { + "epoch": 0.7829, + "grad_norm": 2.4575045108795166, + "learning_rate": 2.736652120758708e-06, + "loss": 0.282, + "step": 15658 + }, + { + "epoch": 0.783, + "grad_norm": 3.675842761993408, + "learning_rate": 2.7342532902902418e-06, + "loss": 1.5749, + "step": 15660 + }, + { + "epoch": 0.7831, + "grad_norm": 4.109372138977051, + "learning_rate": 2.7318553451309726e-06, + "loss": 0.8868, + "step": 15662 + }, + { + "epoch": 0.7832, + "grad_norm": 2.577045440673828, + "learning_rate": 2.7294582855730835e-06, + "loss": 0.4643, + "step": 15664 + }, + { + "epoch": 0.7833, + "grad_norm": 5.6769280433654785, + "learning_rate": 2.727062111908647e-06, + "loss": 0.7958, + "step": 15666 + }, + { + "epoch": 0.7834, + "grad_norm": 0.9046494364738464, + "learning_rate": 2.7246668244296328e-06, + "loss": 0.7981, + "step": 15668 + }, + { + "epoch": 0.7835, + "grad_norm": 7.766051769256592, + "learning_rate": 2.7222724234278963e-06, + "loss": 1.8144, + "step": 15670 + }, + { + "epoch": 0.7836, + "grad_norm": 3.377614736557007, + "learning_rate": 2.7198789091951903e-06, + "loss": 0.6308, + "step": 15672 + }, + { + "epoch": 0.7837, + "grad_norm": 7.586991786956787, + "learning_rate": 2.717486282023153e-06, + "loss": 1.1286, + "step": 15674 + }, + { + "epoch": 0.7838, + "grad_norm": 2.7936289310455322, + "learning_rate": 2.715094542203327e-06, + "loss": 0.3224, + "step": 15676 + }, + { + "epoch": 0.7839, + "grad_norm": 4.003614902496338, + "learning_rate": 2.712703690027132e-06, + "loss": 0.802, + "step": 15678 + }, + { + "epoch": 0.784, + "grad_norm": 3.756229877471924, + "learning_rate": 2.7103137257858867e-06, + "loss": 1.1347, + "step": 15680 + }, + { + "epoch": 0.7841, + "grad_norm": 22.92917823791504, + "learning_rate": 2.7079246497708024e-06, + "loss": 0.7916, + "step": 15682 + }, + { + "epoch": 0.7842, + "grad_norm": 3.684377908706665, + "learning_rate": 2.7055364622729772e-06, + "loss": 0.6185, + "step": 15684 + }, + { + "epoch": 0.7843, + "grad_norm": 5.930863857269287, + "learning_rate": 2.703149163583414e-06, + "loss": 0.5837, + "step": 15686 + }, + { + "epoch": 0.7844, + "grad_norm": 1.763745665550232, + "learning_rate": 2.7007627539929847e-06, + "loss": 0.6346, + "step": 15688 + }, + { + "epoch": 0.7845, + "grad_norm": 19.506412506103516, + "learning_rate": 2.698377233792476e-06, + "loss": 0.7082, + "step": 15690 + }, + { + "epoch": 0.7846, + "grad_norm": 2.3774573802948, + "learning_rate": 2.6959926032725537e-06, + "loss": 0.5752, + "step": 15692 + }, + { + "epoch": 0.7847, + "grad_norm": 4.221240997314453, + "learning_rate": 2.6936088627237766e-06, + "loss": 0.4027, + "step": 15694 + }, + { + "epoch": 0.7848, + "grad_norm": 8.716275215148926, + "learning_rate": 2.6912260124366007e-06, + "loss": 1.1377, + "step": 15696 + }, + { + "epoch": 0.7849, + "grad_norm": 4.22721529006958, + "learning_rate": 2.6888440527013595e-06, + "loss": 0.436, + "step": 15698 + }, + { + "epoch": 0.785, + "grad_norm": 4.625136375427246, + "learning_rate": 2.6864629838082957e-06, + "loss": 1.4972, + "step": 15700 + }, + { + "epoch": 0.7851, + "grad_norm": 2.1336119174957275, + "learning_rate": 2.6840828060475333e-06, + "loss": 1.0499, + "step": 15702 + }, + { + "epoch": 0.7852, + "grad_norm": 1.5943152904510498, + "learning_rate": 2.6817035197090892e-06, + "loss": 0.5795, + "step": 15704 + }, + { + "epoch": 0.7853, + "grad_norm": 2.929060220718384, + "learning_rate": 2.6793251250828723e-06, + "loss": 0.8375, + "step": 15706 + }, + { + "epoch": 0.7854, + "grad_norm": 3.554936408996582, + "learning_rate": 2.676947622458683e-06, + "loss": 1.0572, + "step": 15708 + }, + { + "epoch": 0.7855, + "grad_norm": 5.535151481628418, + "learning_rate": 2.6745710121262135e-06, + "loss": 0.3061, + "step": 15710 + }, + { + "epoch": 0.7856, + "grad_norm": 9.557644844055176, + "learning_rate": 2.672195294375045e-06, + "loss": 1.1, + "step": 15712 + }, + { + "epoch": 0.7857, + "grad_norm": 3.239802360534668, + "learning_rate": 2.6698204694946527e-06, + "loss": 0.5122, + "step": 15714 + }, + { + "epoch": 0.7858, + "grad_norm": 3.867567777633667, + "learning_rate": 2.667446537774402e-06, + "loss": 0.8328, + "step": 15716 + }, + { + "epoch": 0.7859, + "grad_norm": 3.0346901416778564, + "learning_rate": 2.6650734995035478e-06, + "loss": 0.8841, + "step": 15718 + }, + { + "epoch": 0.786, + "grad_norm": 2.772193431854248, + "learning_rate": 2.6627013549712355e-06, + "loss": 0.4802, + "step": 15720 + }, + { + "epoch": 0.7861, + "grad_norm": 3.1466617584228516, + "learning_rate": 2.6603301044665132e-06, + "loss": 1.0202, + "step": 15722 + }, + { + "epoch": 0.7862, + "grad_norm": 2.8818647861480713, + "learning_rate": 2.6579597482782972e-06, + "loss": 1.3159, + "step": 15724 + }, + { + "epoch": 0.7863, + "grad_norm": 6.9119648933410645, + "learning_rate": 2.655590286695422e-06, + "loss": 0.9797, + "step": 15726 + }, + { + "epoch": 0.7864, + "grad_norm": 9.920783042907715, + "learning_rate": 2.6532217200065856e-06, + "loss": 1.1442, + "step": 15728 + }, + { + "epoch": 0.7865, + "grad_norm": 2.766982078552246, + "learning_rate": 2.650854048500401e-06, + "loss": 0.7991, + "step": 15730 + }, + { + "epoch": 0.7866, + "grad_norm": 2.4080569744110107, + "learning_rate": 2.648487272465361e-06, + "loss": 0.6863, + "step": 15732 + }, + { + "epoch": 0.7867, + "grad_norm": 2.264409065246582, + "learning_rate": 2.646121392189841e-06, + "loss": 1.7256, + "step": 15734 + }, + { + "epoch": 0.7868, + "grad_norm": 10.125195503234863, + "learning_rate": 2.643756407962127e-06, + "loss": 0.8513, + "step": 15736 + }, + { + "epoch": 0.7869, + "grad_norm": 2.421107292175293, + "learning_rate": 2.6413923200703795e-06, + "loss": 1.3921, + "step": 15738 + }, + { + "epoch": 0.787, + "grad_norm": 6.634011745452881, + "learning_rate": 2.639029128802657e-06, + "loss": 0.3996, + "step": 15740 + }, + { + "epoch": 0.7871, + "grad_norm": 1.087597131729126, + "learning_rate": 2.636666834446907e-06, + "loss": 0.6188, + "step": 15742 + }, + { + "epoch": 0.7872, + "grad_norm": 3.677459955215454, + "learning_rate": 2.634305437290968e-06, + "loss": 0.6528, + "step": 15744 + }, + { + "epoch": 0.7873, + "grad_norm": 11.290108680725098, + "learning_rate": 2.6319449376225692e-06, + "loss": 1.1346, + "step": 15746 + }, + { + "epoch": 0.7874, + "grad_norm": 3.3944993019104004, + "learning_rate": 2.62958533572933e-06, + "loss": 0.6498, + "step": 15748 + }, + { + "epoch": 0.7875, + "grad_norm": 7.815998077392578, + "learning_rate": 2.6272266318987606e-06, + "loss": 0.2596, + "step": 15750 + }, + { + "epoch": 0.7876, + "grad_norm": 4.410046577453613, + "learning_rate": 2.624868826418262e-06, + "loss": 1.3354, + "step": 15752 + }, + { + "epoch": 0.7877, + "grad_norm": 7.580938816070557, + "learning_rate": 2.6225119195751257e-06, + "loss": 0.7843, + "step": 15754 + }, + { + "epoch": 0.7878, + "grad_norm": 2.4689419269561768, + "learning_rate": 2.6201559116565346e-06, + "loss": 1.0995, + "step": 15756 + }, + { + "epoch": 0.7879, + "grad_norm": 4.565606117248535, + "learning_rate": 2.6178008029495594e-06, + "loss": 0.968, + "step": 15758 + }, + { + "epoch": 0.788, + "grad_norm": 4.184640407562256, + "learning_rate": 2.615446593741161e-06, + "loss": 0.8555, + "step": 15760 + }, + { + "epoch": 0.7881, + "grad_norm": 3.2279109954833984, + "learning_rate": 2.6130932843182013e-06, + "loss": 0.573, + "step": 15762 + }, + { + "epoch": 0.7882, + "grad_norm": 3.8562419414520264, + "learning_rate": 2.6107408749674125e-06, + "loss": 1.3046, + "step": 15764 + }, + { + "epoch": 0.7883, + "grad_norm": 0.46125364303588867, + "learning_rate": 2.6083893659754357e-06, + "loss": 0.1814, + "step": 15766 + }, + { + "epoch": 0.7884, + "grad_norm": 5.024489879608154, + "learning_rate": 2.6060387576287983e-06, + "loss": 0.5923, + "step": 15768 + }, + { + "epoch": 0.7885, + "grad_norm": 3.0296072959899902, + "learning_rate": 2.603689050213902e-06, + "loss": 0.1695, + "step": 15770 + }, + { + "epoch": 0.7886, + "grad_norm": 5.479783535003662, + "learning_rate": 2.6013402440170676e-06, + "loss": 1.363, + "step": 15772 + }, + { + "epoch": 0.7887, + "grad_norm": 1.8856406211853027, + "learning_rate": 2.598992339324474e-06, + "loss": 0.4553, + "step": 15774 + }, + { + "epoch": 0.7888, + "grad_norm": 2.071657419204712, + "learning_rate": 2.596645336422219e-06, + "loss": 0.6361, + "step": 15776 + }, + { + "epoch": 0.7889, + "grad_norm": 5.781007289886475, + "learning_rate": 2.5942992355962724e-06, + "loss": 0.7385, + "step": 15778 + }, + { + "epoch": 0.789, + "grad_norm": 5.775184154510498, + "learning_rate": 2.5919540371325005e-06, + "loss": 0.7555, + "step": 15780 + }, + { + "epoch": 0.7891, + "grad_norm": 5.584766387939453, + "learning_rate": 2.5896097413166567e-06, + "loss": 0.9127, + "step": 15782 + }, + { + "epoch": 0.7892, + "grad_norm": 7.391965866088867, + "learning_rate": 2.5872663484343887e-06, + "loss": 0.7141, + "step": 15784 + }, + { + "epoch": 0.7893, + "grad_norm": 0.12352439761161804, + "learning_rate": 2.584923858771231e-06, + "loss": 0.3368, + "step": 15786 + }, + { + "epoch": 0.7894, + "grad_norm": 0.9142717123031616, + "learning_rate": 2.5825822726126095e-06, + "loss": 0.524, + "step": 15788 + }, + { + "epoch": 0.7895, + "grad_norm": 5.078593730926514, + "learning_rate": 2.5802415902438373e-06, + "loss": 0.8145, + "step": 15790 + }, + { + "epoch": 0.7896, + "grad_norm": 13.021615982055664, + "learning_rate": 2.577901811950121e-06, + "loss": 2.0119, + "step": 15792 + }, + { + "epoch": 0.7897, + "grad_norm": 4.53361701965332, + "learning_rate": 2.575562938016556e-06, + "loss": 0.5532, + "step": 15794 + }, + { + "epoch": 0.7898, + "grad_norm": 2.395071268081665, + "learning_rate": 2.5732249687281228e-06, + "loss": 0.6069, + "step": 15796 + }, + { + "epoch": 0.7899, + "grad_norm": 2.704491376876831, + "learning_rate": 2.5708879043697053e-06, + "loss": 0.7035, + "step": 15798 + }, + { + "epoch": 0.79, + "grad_norm": 5.380894660949707, + "learning_rate": 2.5685517452260566e-06, + "loss": 1.3686, + "step": 15800 + }, + { + "epoch": 0.7901, + "grad_norm": 2.5472612380981445, + "learning_rate": 2.5662164915818412e-06, + "loss": 0.8731, + "step": 15802 + }, + { + "epoch": 0.7902, + "grad_norm": 4.658206462860107, + "learning_rate": 2.5638821437215944e-06, + "loss": 0.2422, + "step": 15804 + }, + { + "epoch": 0.7903, + "grad_norm": 0.6479368805885315, + "learning_rate": 2.561548701929749e-06, + "loss": 0.5639, + "step": 15806 + }, + { + "epoch": 0.7904, + "grad_norm": 0.32808786630630493, + "learning_rate": 2.5592161664906366e-06, + "loss": 0.4518, + "step": 15808 + }, + { + "epoch": 0.7905, + "grad_norm": 6.283143997192383, + "learning_rate": 2.556884537688459e-06, + "loss": 0.709, + "step": 15810 + }, + { + "epoch": 0.7906, + "grad_norm": 3.0346574783325195, + "learning_rate": 2.5545538158073278e-06, + "loss": 0.5765, + "step": 15812 + }, + { + "epoch": 0.7907, + "grad_norm": 3.432060956954956, + "learning_rate": 2.552224001131225e-06, + "loss": 1.3278, + "step": 15814 + }, + { + "epoch": 0.7908, + "grad_norm": 7.379076957702637, + "learning_rate": 2.549895093944039e-06, + "loss": 1.7725, + "step": 15816 + }, + { + "epoch": 0.7909, + "grad_norm": 3.790424346923828, + "learning_rate": 2.5475670945295373e-06, + "loss": 0.6399, + "step": 15818 + }, + { + "epoch": 0.791, + "grad_norm": 5.486758232116699, + "learning_rate": 2.5452400031713786e-06, + "loss": 1.387, + "step": 15820 + }, + { + "epoch": 0.7911, + "grad_norm": 2.7293622493743896, + "learning_rate": 2.5429138201531135e-06, + "loss": 1.3181, + "step": 15822 + }, + { + "epoch": 0.7912, + "grad_norm": 6.1232008934021, + "learning_rate": 2.5405885457581793e-06, + "loss": 0.4371, + "step": 15824 + }, + { + "epoch": 0.7913, + "grad_norm": 4.5450568199157715, + "learning_rate": 2.5382641802699036e-06, + "loss": 1.0741, + "step": 15826 + }, + { + "epoch": 0.7914, + "grad_norm": 0.7064595222473145, + "learning_rate": 2.535940723971505e-06, + "loss": 0.3442, + "step": 15828 + }, + { + "epoch": 0.7915, + "grad_norm": 6.356743812561035, + "learning_rate": 2.5336181771460877e-06, + "loss": 0.7746, + "step": 15830 + }, + { + "epoch": 0.7916, + "grad_norm": 10.345131874084473, + "learning_rate": 2.5312965400766475e-06, + "loss": 1.8969, + "step": 15832 + }, + { + "epoch": 0.7917, + "grad_norm": 3.4980974197387695, + "learning_rate": 2.5289758130460685e-06, + "loss": 1.4628, + "step": 15834 + }, + { + "epoch": 0.7918, + "grad_norm": 1.638777256011963, + "learning_rate": 2.5266559963371216e-06, + "loss": 1.0583, + "step": 15836 + }, + { + "epoch": 0.7919, + "grad_norm": 7.2178778648376465, + "learning_rate": 2.5243370902324794e-06, + "loss": 1.3159, + "step": 15838 + }, + { + "epoch": 0.792, + "grad_norm": 0.8556069135665894, + "learning_rate": 2.522019095014683e-06, + "loss": 0.6021, + "step": 15840 + }, + { + "epoch": 0.7921, + "grad_norm": 7.7531609535217285, + "learning_rate": 2.5197020109661775e-06, + "loss": 0.9414, + "step": 15842 + }, + { + "epoch": 0.7922, + "grad_norm": 3.8797905445098877, + "learning_rate": 2.5173858383692906e-06, + "loss": 0.2931, + "step": 15844 + }, + { + "epoch": 0.7923, + "grad_norm": 6.315894603729248, + "learning_rate": 2.51507057750624e-06, + "loss": 1.3597, + "step": 15846 + }, + { + "epoch": 0.7924, + "grad_norm": 4.527257919311523, + "learning_rate": 2.512756228659141e-06, + "loss": 1.2766, + "step": 15848 + }, + { + "epoch": 0.7925, + "grad_norm": 8.258660316467285, + "learning_rate": 2.5104427921099783e-06, + "loss": 1.0968, + "step": 15850 + }, + { + "epoch": 0.7926, + "grad_norm": 4.47475004196167, + "learning_rate": 2.5081302681406463e-06, + "loss": 0.6633, + "step": 15852 + }, + { + "epoch": 0.7927, + "grad_norm": 10.320082664489746, + "learning_rate": 2.5058186570329157e-06, + "loss": 0.9917, + "step": 15854 + }, + { + "epoch": 0.7928, + "grad_norm": 4.674445152282715, + "learning_rate": 2.5035079590684496e-06, + "loss": 0.8187, + "step": 15856 + }, + { + "epoch": 0.7929, + "grad_norm": 2.3255858421325684, + "learning_rate": 2.5011981745288016e-06, + "loss": 0.5993, + "step": 15858 + }, + { + "epoch": 0.793, + "grad_norm": 3.0741782188415527, + "learning_rate": 2.4988893036954045e-06, + "loss": 0.8646, + "step": 15860 + }, + { + "epoch": 0.7931, + "grad_norm": 10.022472381591797, + "learning_rate": 2.496581346849596e-06, + "loss": 1.5062, + "step": 15862 + }, + { + "epoch": 0.7932, + "grad_norm": 5.762510776519775, + "learning_rate": 2.494274304272589e-06, + "loss": 1.4381, + "step": 15864 + }, + { + "epoch": 0.7933, + "grad_norm": 3.1448333263397217, + "learning_rate": 2.4919681762454915e-06, + "loss": 1.062, + "step": 15866 + }, + { + "epoch": 0.7934, + "grad_norm": 0.40642526745796204, + "learning_rate": 2.4896629630492974e-06, + "loss": 0.5142, + "step": 15868 + }, + { + "epoch": 0.7935, + "grad_norm": 2.271228075027466, + "learning_rate": 2.4873586649648896e-06, + "loss": 1.0378, + "step": 15870 + }, + { + "epoch": 0.7936, + "grad_norm": 6.742845058441162, + "learning_rate": 2.48505528227304e-06, + "loss": 0.2337, + "step": 15872 + }, + { + "epoch": 0.7937, + "grad_norm": 15.479546546936035, + "learning_rate": 2.48275281525441e-06, + "loss": 1.4053, + "step": 15874 + }, + { + "epoch": 0.7938, + "grad_norm": 1.827659010887146, + "learning_rate": 2.480451264189546e-06, + "loss": 0.2048, + "step": 15876 + }, + { + "epoch": 0.7939, + "grad_norm": 9.20085334777832, + "learning_rate": 2.4781506293588876e-06, + "loss": 1.3232, + "step": 15878 + }, + { + "epoch": 0.794, + "grad_norm": 11.726430892944336, + "learning_rate": 2.4758509110427576e-06, + "loss": 1.3253, + "step": 15880 + }, + { + "epoch": 0.7941, + "grad_norm": 2.6345913410186768, + "learning_rate": 2.473552109521369e-06, + "loss": 0.85, + "step": 15882 + }, + { + "epoch": 0.7942, + "grad_norm": 3.416372060775757, + "learning_rate": 2.4712542250748305e-06, + "loss": 0.8689, + "step": 15884 + }, + { + "epoch": 0.7943, + "grad_norm": 6.362034320831299, + "learning_rate": 2.468957257983122e-06, + "loss": 1.5309, + "step": 15886 + }, + { + "epoch": 0.7944, + "grad_norm": 1.5284498929977417, + "learning_rate": 2.4666612085261344e-06, + "loss": 0.4651, + "step": 15888 + }, + { + "epoch": 0.7945, + "grad_norm": 4.583583831787109, + "learning_rate": 2.464366076983623e-06, + "loss": 0.3747, + "step": 15890 + }, + { + "epoch": 0.7946, + "grad_norm": 7.470922470092773, + "learning_rate": 2.4620718636352457e-06, + "loss": 0.6969, + "step": 15892 + }, + { + "epoch": 0.7947, + "grad_norm": 4.274083137512207, + "learning_rate": 2.4597785687605512e-06, + "loss": 0.9043, + "step": 15894 + }, + { + "epoch": 0.7948, + "grad_norm": 10.307106018066406, + "learning_rate": 2.4574861926389615e-06, + "loss": 0.8526, + "step": 15896 + }, + { + "epoch": 0.7949, + "grad_norm": 7.907009124755859, + "learning_rate": 2.455194735549803e-06, + "loss": 1.1895, + "step": 15898 + }, + { + "epoch": 0.795, + "grad_norm": 5.185745716094971, + "learning_rate": 2.45290419777228e-06, + "loss": 0.6244, + "step": 15900 + }, + { + "epoch": 0.7951, + "grad_norm": 4.588487148284912, + "learning_rate": 2.4506145795854873e-06, + "loss": 1.3843, + "step": 15902 + }, + { + "epoch": 0.7952, + "grad_norm": 6.167654037475586, + "learning_rate": 2.4483258812684096e-06, + "loss": 0.9581, + "step": 15904 + }, + { + "epoch": 0.7953, + "grad_norm": 7.454841136932373, + "learning_rate": 2.446038103099916e-06, + "loss": 0.7865, + "step": 15906 + }, + { + "epoch": 0.7954, + "grad_norm": 5.877472877502441, + "learning_rate": 2.4437512453587653e-06, + "loss": 1.5357, + "step": 15908 + }, + { + "epoch": 0.7955, + "grad_norm": 4.134323596954346, + "learning_rate": 2.441465308323605e-06, + "loss": 0.4847, + "step": 15910 + }, + { + "epoch": 0.7956, + "grad_norm": 2.545431613922119, + "learning_rate": 2.4391802922729703e-06, + "loss": 0.6364, + "step": 15912 + }, + { + "epoch": 0.7957, + "grad_norm": 1.32488214969635, + "learning_rate": 2.436896197485282e-06, + "loss": 0.5725, + "step": 15914 + }, + { + "epoch": 0.7958, + "grad_norm": 5.289571762084961, + "learning_rate": 2.43461302423885e-06, + "loss": 1.0447, + "step": 15916 + }, + { + "epoch": 0.7959, + "grad_norm": 7.014378547668457, + "learning_rate": 2.432330772811874e-06, + "loss": 1.0791, + "step": 15918 + }, + { + "epoch": 0.796, + "grad_norm": 40.52552032470703, + "learning_rate": 2.4300494434824373e-06, + "loss": 0.7246, + "step": 15920 + }, + { + "epoch": 0.7961, + "grad_norm": 4.981069564819336, + "learning_rate": 2.427769036528511e-06, + "loss": 0.632, + "step": 15922 + }, + { + "epoch": 0.7962, + "grad_norm": 3.7183523178100586, + "learning_rate": 2.4254895522279642e-06, + "loss": 1.3539, + "step": 15924 + }, + { + "epoch": 0.7963, + "grad_norm": 3.3330602645874023, + "learning_rate": 2.4232109908585376e-06, + "loss": 0.4788, + "step": 15926 + }, + { + "epoch": 0.7964, + "grad_norm": 0.6575347185134888, + "learning_rate": 2.420933352697865e-06, + "loss": 0.5314, + "step": 15928 + }, + { + "epoch": 0.7965, + "grad_norm": 3.5078816413879395, + "learning_rate": 2.41865663802348e-06, + "loss": 0.9595, + "step": 15930 + }, + { + "epoch": 0.7966, + "grad_norm": 4.458200454711914, + "learning_rate": 2.4163808471127815e-06, + "loss": 0.4429, + "step": 15932 + }, + { + "epoch": 0.7967, + "grad_norm": 3.4458229541778564, + "learning_rate": 2.4141059802430777e-06, + "loss": 0.4601, + "step": 15934 + }, + { + "epoch": 0.7968, + "grad_norm": 1.5726892948150635, + "learning_rate": 2.411832037691545e-06, + "loss": 0.4276, + "step": 15936 + }, + { + "epoch": 0.7969, + "grad_norm": 3.054262161254883, + "learning_rate": 2.4095590197352634e-06, + "loss": 0.7329, + "step": 15938 + }, + { + "epoch": 0.797, + "grad_norm": 2.3089635372161865, + "learning_rate": 2.407286926651192e-06, + "loss": 1.5919, + "step": 15940 + }, + { + "epoch": 0.7971, + "grad_norm": 8.458686828613281, + "learning_rate": 2.405015758716177e-06, + "loss": 0.5584, + "step": 15942 + }, + { + "epoch": 0.7972, + "grad_norm": 21.793916702270508, + "learning_rate": 2.4027455162069567e-06, + "loss": 1.3576, + "step": 15944 + }, + { + "epoch": 0.7973, + "grad_norm": 5.140868663787842, + "learning_rate": 2.4004761994001433e-06, + "loss": 1.1178, + "step": 15946 + }, + { + "epoch": 0.7974, + "grad_norm": 7.7867937088012695, + "learning_rate": 2.398207808572258e-06, + "loss": 0.7403, + "step": 15948 + }, + { + "epoch": 0.7975, + "grad_norm": 5.035459041595459, + "learning_rate": 2.395940343999691e-06, + "loss": 1.1057, + "step": 15950 + }, + { + "epoch": 0.7976, + "grad_norm": 4.405247211456299, + "learning_rate": 2.3936738059587284e-06, + "loss": 0.976, + "step": 15952 + }, + { + "epoch": 0.7977, + "grad_norm": 5.142799377441406, + "learning_rate": 2.3914081947255396e-06, + "loss": 0.568, + "step": 15954 + }, + { + "epoch": 0.7978, + "grad_norm": 3.6861681938171387, + "learning_rate": 2.3891435105761838e-06, + "loss": 0.7986, + "step": 15956 + }, + { + "epoch": 0.7979, + "grad_norm": 4.138240337371826, + "learning_rate": 2.386879753786602e-06, + "loss": 0.7878, + "step": 15958 + }, + { + "epoch": 0.798, + "grad_norm": 2.038662910461426, + "learning_rate": 2.3846169246326345e-06, + "loss": 0.7337, + "step": 15960 + }, + { + "epoch": 0.7981, + "grad_norm": 4.2867021560668945, + "learning_rate": 2.3823550233899916e-06, + "loss": 1.0113, + "step": 15962 + }, + { + "epoch": 0.7982, + "grad_norm": 3.507681369781494, + "learning_rate": 2.380094050334283e-06, + "loss": 0.5493, + "step": 15964 + }, + { + "epoch": 0.7983, + "grad_norm": 3.0858030319213867, + "learning_rate": 2.377834005741e-06, + "loss": 1.2944, + "step": 15966 + }, + { + "epoch": 0.7984, + "grad_norm": 3.1884632110595703, + "learning_rate": 2.37557488988552e-06, + "loss": 1.105, + "step": 15968 + }, + { + "epoch": 0.7985, + "grad_norm": 2.5154218673706055, + "learning_rate": 2.3733167030431194e-06, + "loss": 0.4303, + "step": 15970 + }, + { + "epoch": 0.7986, + "grad_norm": 8.42250919342041, + "learning_rate": 2.371059445488938e-06, + "loss": 1.2891, + "step": 15972 + }, + { + "epoch": 0.7987, + "grad_norm": 17.168285369873047, + "learning_rate": 2.3688031174980277e-06, + "loss": 0.8937, + "step": 15974 + }, + { + "epoch": 0.7988, + "grad_norm": 3.1825077533721924, + "learning_rate": 2.3665477193453037e-06, + "loss": 0.36, + "step": 15976 + }, + { + "epoch": 0.7989, + "grad_norm": 2.2596521377563477, + "learning_rate": 2.3642932513055885e-06, + "loss": 1.2124, + "step": 15978 + }, + { + "epoch": 0.799, + "grad_norm": 5.97321081161499, + "learning_rate": 2.362039713653581e-06, + "loss": 1.1655, + "step": 15980 + }, + { + "epoch": 0.7991, + "grad_norm": 3.840578556060791, + "learning_rate": 2.359787106663861e-06, + "loss": 1.0964, + "step": 15982 + }, + { + "epoch": 0.7992, + "grad_norm": 3.9506068229675293, + "learning_rate": 2.35753543061091e-06, + "loss": 0.5048, + "step": 15984 + }, + { + "epoch": 0.7993, + "grad_norm": 3.1689648628234863, + "learning_rate": 2.3552846857690847e-06, + "loss": 0.18, + "step": 15986 + }, + { + "epoch": 0.7994, + "grad_norm": 2.4597983360290527, + "learning_rate": 2.3530348724126304e-06, + "loss": 1.1189, + "step": 15988 + }, + { + "epoch": 0.7995, + "grad_norm": 9.843687057495117, + "learning_rate": 2.3507859908156828e-06, + "loss": 1.0157, + "step": 15990 + }, + { + "epoch": 0.7996, + "grad_norm": 3.514437675476074, + "learning_rate": 2.3485380412522586e-06, + "loss": 1.228, + "step": 15992 + }, + { + "epoch": 0.7997, + "grad_norm": 0.7997338771820068, + "learning_rate": 2.3462910239962654e-06, + "loss": 0.681, + "step": 15994 + }, + { + "epoch": 0.7998, + "grad_norm": 6.068864345550537, + "learning_rate": 2.3440449393214947e-06, + "loss": 1.1919, + "step": 15996 + }, + { + "epoch": 0.7999, + "grad_norm": 6.073366641998291, + "learning_rate": 2.341799787501625e-06, + "loss": 0.4397, + "step": 15998 + }, + { + "epoch": 0.8, + "grad_norm": 2.624521017074585, + "learning_rate": 2.339555568810221e-06, + "loss": 1.1794, + "step": 16000 + }, + { + "epoch": 0.8001, + "grad_norm": 1.5330501794815063, + "learning_rate": 2.3373122835207353e-06, + "loss": 0.3373, + "step": 16002 + }, + { + "epoch": 0.8002, + "grad_norm": 3.5070974826812744, + "learning_rate": 2.335069931906503e-06, + "loss": 1.2364, + "step": 16004 + }, + { + "epoch": 0.8003, + "grad_norm": 2.0494697093963623, + "learning_rate": 2.3328285142407503e-06, + "loss": 0.8597, + "step": 16006 + }, + { + "epoch": 0.8004, + "grad_norm": 4.536273956298828, + "learning_rate": 2.3305880307965834e-06, + "loss": 0.2612, + "step": 16008 + }, + { + "epoch": 0.8005, + "grad_norm": 9.012617111206055, + "learning_rate": 2.328348481847006e-06, + "loss": 0.4774, + "step": 16010 + }, + { + "epoch": 0.8006, + "grad_norm": 0.19720789790153503, + "learning_rate": 2.3261098676648908e-06, + "loss": 0.7158, + "step": 16012 + }, + { + "epoch": 0.8007, + "grad_norm": 2.783567190170288, + "learning_rate": 2.323872188523013e-06, + "loss": 0.477, + "step": 16014 + }, + { + "epoch": 0.8008, + "grad_norm": 2.520193099975586, + "learning_rate": 2.321635444694028e-06, + "loss": 1.1979, + "step": 16016 + }, + { + "epoch": 0.8009, + "grad_norm": 8.622408866882324, + "learning_rate": 2.319399636450468e-06, + "loss": 1.1939, + "step": 16018 + }, + { + "epoch": 0.801, + "grad_norm": 4.768767833709717, + "learning_rate": 2.317164764064769e-06, + "loss": 0.8918, + "step": 16020 + }, + { + "epoch": 0.8011, + "grad_norm": 6.6190361976623535, + "learning_rate": 2.3149308278092343e-06, + "loss": 0.671, + "step": 16022 + }, + { + "epoch": 0.8012, + "grad_norm": 10.911921501159668, + "learning_rate": 2.3126978279560687e-06, + "loss": 0.8638, + "step": 16024 + }, + { + "epoch": 0.8013, + "grad_norm": 4.917567729949951, + "learning_rate": 2.3104657647773552e-06, + "loss": 0.1582, + "step": 16026 + }, + { + "epoch": 0.8014, + "grad_norm": 9.570755004882812, + "learning_rate": 2.308234638545064e-06, + "loss": 1.0005, + "step": 16028 + }, + { + "epoch": 0.8015, + "grad_norm": 3.4108548164367676, + "learning_rate": 2.3060044495310507e-06, + "loss": 0.9264, + "step": 16030 + }, + { + "epoch": 0.8016, + "grad_norm": 4.506930828094482, + "learning_rate": 2.3037751980070557e-06, + "loss": 2.4075, + "step": 16032 + }, + { + "epoch": 0.8017, + "grad_norm": 5.093729496002197, + "learning_rate": 2.301546884244709e-06, + "loss": 0.707, + "step": 16034 + }, + { + "epoch": 0.8018, + "grad_norm": 3.8171043395996094, + "learning_rate": 2.2993195085155205e-06, + "loss": 1.1843, + "step": 16036 + }, + { + "epoch": 0.8019, + "grad_norm": 8.299418449401855, + "learning_rate": 2.2970930710908934e-06, + "loss": 0.7192, + "step": 16038 + }, + { + "epoch": 0.802, + "grad_norm": 4.138442516326904, + "learning_rate": 2.2948675722421086e-06, + "loss": 0.6986, + "step": 16040 + }, + { + "epoch": 0.8021, + "grad_norm": 5.0138421058654785, + "learning_rate": 2.292643012240339e-06, + "loss": 0.8248, + "step": 16042 + }, + { + "epoch": 0.8022, + "grad_norm": 3.7268640995025635, + "learning_rate": 2.2904193913566363e-06, + "loss": 0.5791, + "step": 16044 + }, + { + "epoch": 0.8023, + "grad_norm": 7.149802207946777, + "learning_rate": 2.2881967098619507e-06, + "loss": 0.8191, + "step": 16046 + }, + { + "epoch": 0.8024, + "grad_norm": 4.4771504402160645, + "learning_rate": 2.2859749680270983e-06, + "loss": 0.8517, + "step": 16048 + }, + { + "epoch": 0.8025, + "grad_norm": 2.4737701416015625, + "learning_rate": 2.2837541661228024e-06, + "loss": 1.1963, + "step": 16050 + }, + { + "epoch": 0.8026, + "grad_norm": 2.4475300312042236, + "learning_rate": 2.2815343044196523e-06, + "loss": 0.597, + "step": 16052 + }, + { + "epoch": 0.8027, + "grad_norm": 3.5712738037109375, + "learning_rate": 2.279315383188132e-06, + "loss": 1.075, + "step": 16054 + }, + { + "epoch": 0.8028, + "grad_norm": 2.178985595703125, + "learning_rate": 2.277097402698619e-06, + "loss": 0.8043, + "step": 16056 + }, + { + "epoch": 0.8029, + "grad_norm": 3.282435655593872, + "learning_rate": 2.2748803632213556e-06, + "loss": 0.4668, + "step": 16058 + }, + { + "epoch": 0.803, + "grad_norm": 1.3803445100784302, + "learning_rate": 2.27266426502649e-06, + "loss": 0.506, + "step": 16060 + }, + { + "epoch": 0.8031, + "grad_norm": 6.191848278045654, + "learning_rate": 2.270449108384044e-06, + "loss": 1.1995, + "step": 16062 + }, + { + "epoch": 0.8032, + "grad_norm": 3.3565547466278076, + "learning_rate": 2.2682348935639274e-06, + "loss": 0.8742, + "step": 16064 + }, + { + "epoch": 0.8033, + "grad_norm": 3.2913477420806885, + "learning_rate": 2.2660216208359365e-06, + "loss": 1.1827, + "step": 16066 + }, + { + "epoch": 0.8034, + "grad_norm": 10.493579864501953, + "learning_rate": 2.2638092904697516e-06, + "loss": 1.0973, + "step": 16068 + }, + { + "epoch": 0.8035, + "grad_norm": 9.291919708251953, + "learning_rate": 2.261597902734939e-06, + "loss": 1.401, + "step": 16070 + }, + { + "epoch": 0.8036, + "grad_norm": 4.43838357925415, + "learning_rate": 2.259387457900948e-06, + "loss": 0.7833, + "step": 16072 + }, + { + "epoch": 0.8037, + "grad_norm": 10.290034294128418, + "learning_rate": 2.2571779562371153e-06, + "loss": 1.4953, + "step": 16074 + }, + { + "epoch": 0.8038, + "grad_norm": 0.10001963376998901, + "learning_rate": 2.254969398012663e-06, + "loss": 0.4323, + "step": 16076 + }, + { + "epoch": 0.8039, + "grad_norm": 5.291372776031494, + "learning_rate": 2.2527617834966953e-06, + "loss": 0.8275, + "step": 16078 + }, + { + "epoch": 0.804, + "grad_norm": 5.4931721687316895, + "learning_rate": 2.2505551129582047e-06, + "loss": 0.3902, + "step": 16080 + }, + { + "epoch": 0.8041, + "grad_norm": 5.74638032913208, + "learning_rate": 2.2483493866660677e-06, + "loss": 1.0004, + "step": 16082 + }, + { + "epoch": 0.8042, + "grad_norm": 1.437219262123108, + "learning_rate": 2.2461446048890424e-06, + "loss": 0.4065, + "step": 16084 + }, + { + "epoch": 0.8043, + "grad_norm": 7.481938362121582, + "learning_rate": 2.2439407678957814e-06, + "loss": 1.7479, + "step": 16086 + }, + { + "epoch": 0.8044, + "grad_norm": 6.454188823699951, + "learning_rate": 2.241737875954808e-06, + "loss": 0.6424, + "step": 16088 + }, + { + "epoch": 0.8045, + "grad_norm": 21.455202102661133, + "learning_rate": 2.2395359293345396e-06, + "loss": 1.6079, + "step": 16090 + }, + { + "epoch": 0.8046, + "grad_norm": 8.617880821228027, + "learning_rate": 2.237334928303283e-06, + "loss": 1.2996, + "step": 16092 + }, + { + "epoch": 0.8047, + "grad_norm": 11.392097473144531, + "learning_rate": 2.2351348731292134e-06, + "loss": 0.914, + "step": 16094 + }, + { + "epoch": 0.8048, + "grad_norm": 4.1512651443481445, + "learning_rate": 2.2329357640804118e-06, + "loss": 0.9495, + "step": 16096 + }, + { + "epoch": 0.8049, + "grad_norm": 0.6105539202690125, + "learning_rate": 2.2307376014248218e-06, + "loss": 0.4177, + "step": 16098 + }, + { + "epoch": 0.805, + "grad_norm": 8.450813293457031, + "learning_rate": 2.2285403854302912e-06, + "loss": 0.9964, + "step": 16100 + }, + { + "epoch": 0.8051, + "grad_norm": 4.257735252380371, + "learning_rate": 2.2263441163645407e-06, + "loss": 0.3869, + "step": 16102 + }, + { + "epoch": 0.8052, + "grad_norm": 3.1840338706970215, + "learning_rate": 2.22414879449518e-06, + "loss": 0.5597, + "step": 16104 + }, + { + "epoch": 0.8053, + "grad_norm": 3.1235742568969727, + "learning_rate": 2.2219544200897024e-06, + "loss": 1.0455, + "step": 16106 + }, + { + "epoch": 0.8054, + "grad_norm": 5.500372886657715, + "learning_rate": 2.219760993415485e-06, + "loss": 0.8488, + "step": 16108 + }, + { + "epoch": 0.8055, + "grad_norm": 6.171422004699707, + "learning_rate": 2.2175685147397906e-06, + "loss": 1.2204, + "step": 16110 + }, + { + "epoch": 0.8056, + "grad_norm": 3.0159597396850586, + "learning_rate": 2.215376984329767e-06, + "loss": 0.5713, + "step": 16112 + }, + { + "epoch": 0.8057, + "grad_norm": 4.5765252113342285, + "learning_rate": 2.213186402452443e-06, + "loss": 0.755, + "step": 16114 + }, + { + "epoch": 0.8058, + "grad_norm": 6.229780673980713, + "learning_rate": 2.210996769374737e-06, + "loss": 1.103, + "step": 16116 + }, + { + "epoch": 0.8059, + "grad_norm": 3.8990750312805176, + "learning_rate": 2.2088080853634474e-06, + "loss": 0.5228, + "step": 16118 + }, + { + "epoch": 0.806, + "grad_norm": 3.247185468673706, + "learning_rate": 2.206620350685257e-06, + "loss": 0.7999, + "step": 16120 + }, + { + "epoch": 0.8061, + "grad_norm": 7.3778252601623535, + "learning_rate": 2.204433565606743e-06, + "loss": 0.9286, + "step": 16122 + }, + { + "epoch": 0.8062, + "grad_norm": 5.951402187347412, + "learning_rate": 2.202247730394349e-06, + "loss": 1.1592, + "step": 16124 + }, + { + "epoch": 0.8063, + "grad_norm": 3.0786221027374268, + "learning_rate": 2.2000628453144166e-06, + "loss": 0.9394, + "step": 16126 + }, + { + "epoch": 0.8064, + "grad_norm": 2.7323148250579834, + "learning_rate": 2.1978789106331666e-06, + "loss": 0.588, + "step": 16128 + }, + { + "epoch": 0.8065, + "grad_norm": 2.6466832160949707, + "learning_rate": 2.195695926616702e-06, + "loss": 0.8636, + "step": 16130 + }, + { + "epoch": 0.8066, + "grad_norm": 7.275325775146484, + "learning_rate": 2.1935138935310208e-06, + "loss": 0.8102, + "step": 16132 + }, + { + "epoch": 0.8067, + "grad_norm": 3.6605377197265625, + "learning_rate": 2.1913328116419872e-06, + "loss": 1.1083, + "step": 16134 + }, + { + "epoch": 0.8068, + "grad_norm": 1.1777112483978271, + "learning_rate": 2.1891526812153674e-06, + "loss": 0.2373, + "step": 16136 + }, + { + "epoch": 0.8069, + "grad_norm": 5.231235504150391, + "learning_rate": 2.1869735025168025e-06, + "loss": 0.7954, + "step": 16138 + }, + { + "epoch": 0.807, + "grad_norm": 13.53899097442627, + "learning_rate": 2.1847952758118118e-06, + "loss": 1.6272, + "step": 16140 + }, + { + "epoch": 0.8071, + "grad_norm": 2.3691930770874023, + "learning_rate": 2.182618001365817e-06, + "loss": 0.534, + "step": 16142 + }, + { + "epoch": 0.8072, + "grad_norm": 2.7537992000579834, + "learning_rate": 2.1804416794441e-06, + "loss": 0.567, + "step": 16144 + }, + { + "epoch": 0.8073, + "grad_norm": 4.634843826293945, + "learning_rate": 2.1782663103118474e-06, + "loss": 0.9447, + "step": 16146 + }, + { + "epoch": 0.8074, + "grad_norm": 3.3577983379364014, + "learning_rate": 2.1760918942341193e-06, + "loss": 0.7234, + "step": 16148 + }, + { + "epoch": 0.8075, + "grad_norm": 7.680382251739502, + "learning_rate": 2.173918431475861e-06, + "loss": 1.0901, + "step": 16150 + }, + { + "epoch": 0.8076, + "grad_norm": 3.192375421524048, + "learning_rate": 2.171745922301903e-06, + "loss": 0.9511, + "step": 16152 + }, + { + "epoch": 0.8077, + "grad_norm": 3.359891891479492, + "learning_rate": 2.1695743669769597e-06, + "loss": 0.7278, + "step": 16154 + }, + { + "epoch": 0.8078, + "grad_norm": 7.425804138183594, + "learning_rate": 2.1674037657656265e-06, + "loss": 1.2868, + "step": 16156 + }, + { + "epoch": 0.8079, + "grad_norm": 2.9212164878845215, + "learning_rate": 2.1652341189323867e-06, + "loss": 0.7245, + "step": 16158 + }, + { + "epoch": 0.808, + "grad_norm": 5.750950813293457, + "learning_rate": 2.163065426741603e-06, + "loss": 0.5598, + "step": 16160 + }, + { + "epoch": 0.8081, + "grad_norm": 6.083713054656982, + "learning_rate": 2.160897689457526e-06, + "loss": 1.0433, + "step": 16162 + }, + { + "epoch": 0.8082, + "grad_norm": 2.840860605239868, + "learning_rate": 2.1587309073442865e-06, + "loss": 0.0923, + "step": 16164 + }, + { + "epoch": 0.8083, + "grad_norm": 9.544060707092285, + "learning_rate": 2.1565650806658977e-06, + "loss": 0.7397, + "step": 16166 + }, + { + "epoch": 0.8084, + "grad_norm": 3.355175256729126, + "learning_rate": 2.154400209686268e-06, + "loss": 0.6592, + "step": 16168 + }, + { + "epoch": 0.8085, + "grad_norm": 5.329705238342285, + "learning_rate": 2.15223629466917e-06, + "loss": 1.0326, + "step": 16170 + }, + { + "epoch": 0.8086, + "grad_norm": 6.994843006134033, + "learning_rate": 2.1500733358782786e-06, + "loss": 0.592, + "step": 16172 + }, + { + "epoch": 0.8087, + "grad_norm": 8.484472274780273, + "learning_rate": 2.1479113335771383e-06, + "loss": 0.2593, + "step": 16174 + }, + { + "epoch": 0.8088, + "grad_norm": 14.628308296203613, + "learning_rate": 2.1457502880291815e-06, + "loss": 1.2354, + "step": 16176 + }, + { + "epoch": 0.8089, + "grad_norm": 6.5475239753723145, + "learning_rate": 2.1435901994977325e-06, + "loss": 0.9057, + "step": 16178 + }, + { + "epoch": 0.809, + "grad_norm": 5.432136058807373, + "learning_rate": 2.1414310682459805e-06, + "loss": 1.0066, + "step": 16180 + }, + { + "epoch": 0.8091, + "grad_norm": 7.570497512817383, + "learning_rate": 2.1392728945370224e-06, + "loss": 1.2507, + "step": 16182 + }, + { + "epoch": 0.8092, + "grad_norm": 4.917091369628906, + "learning_rate": 2.1371156786338108e-06, + "loss": 1.1258, + "step": 16184 + }, + { + "epoch": 0.8093, + "grad_norm": 4.172047138214111, + "learning_rate": 2.1349594207992066e-06, + "loss": 0.4863, + "step": 16186 + }, + { + "epoch": 0.8094, + "grad_norm": 10.555187225341797, + "learning_rate": 2.1328041212959403e-06, + "loss": 0.7361, + "step": 16188 + }, + { + "epoch": 0.8095, + "grad_norm": 2.539048194885254, + "learning_rate": 2.130649780386628e-06, + "loss": 0.8832, + "step": 16190 + }, + { + "epoch": 0.8096, + "grad_norm": 1.4578710794448853, + "learning_rate": 2.128496398333768e-06, + "loss": 0.0375, + "step": 16192 + }, + { + "epoch": 0.8097, + "grad_norm": 4.990647792816162, + "learning_rate": 2.1263439753997473e-06, + "loss": 0.7154, + "step": 16194 + }, + { + "epoch": 0.8098, + "grad_norm": 1.70303213596344, + "learning_rate": 2.1241925118468288e-06, + "loss": 0.5519, + "step": 16196 + }, + { + "epoch": 0.8099, + "grad_norm": 3.559763193130493, + "learning_rate": 2.122042007937163e-06, + "loss": 1.2824, + "step": 16198 + }, + { + "epoch": 0.81, + "grad_norm": 4.759049892425537, + "learning_rate": 2.119892463932781e-06, + "loss": 0.883, + "step": 16200 + }, + { + "epoch": 0.8101, + "grad_norm": 6.545719623565674, + "learning_rate": 2.117743880095601e-06, + "loss": 0.9707, + "step": 16202 + }, + { + "epoch": 0.8102, + "grad_norm": 7.813844680786133, + "learning_rate": 2.115596256687419e-06, + "loss": 0.9126, + "step": 16204 + }, + { + "epoch": 0.8103, + "grad_norm": 5.941276550292969, + "learning_rate": 2.113449593969915e-06, + "loss": 0.5625, + "step": 16206 + }, + { + "epoch": 0.8104, + "grad_norm": 3.7115862369537354, + "learning_rate": 2.1113038922046603e-06, + "loss": 1.0188, + "step": 16208 + }, + { + "epoch": 0.8105, + "grad_norm": 2.464517593383789, + "learning_rate": 2.1091591516530952e-06, + "loss": 0.654, + "step": 16210 + }, + { + "epoch": 0.8106, + "grad_norm": 3.9982240200042725, + "learning_rate": 2.107015372576552e-06, + "loss": 0.3659, + "step": 16212 + }, + { + "epoch": 0.8107, + "grad_norm": 3.635026454925537, + "learning_rate": 2.104872555236244e-06, + "loss": 1.1603, + "step": 16214 + }, + { + "epoch": 0.8108, + "grad_norm": 0.7494789361953735, + "learning_rate": 2.102730699893263e-06, + "loss": 0.5215, + "step": 16216 + }, + { + "epoch": 0.8109, + "grad_norm": 3.3939504623413086, + "learning_rate": 2.100589806808597e-06, + "loss": 0.6102, + "step": 16218 + }, + { + "epoch": 0.811, + "grad_norm": 7.423990726470947, + "learning_rate": 2.098449876243096e-06, + "loss": 1.2205, + "step": 16220 + }, + { + "epoch": 0.8111, + "grad_norm": 2.1792964935302734, + "learning_rate": 2.096310908457513e-06, + "loss": 0.9716, + "step": 16222 + }, + { + "epoch": 0.8112, + "grad_norm": 4.0067315101623535, + "learning_rate": 2.09417290371247e-06, + "loss": 0.6652, + "step": 16224 + }, + { + "epoch": 0.8113, + "grad_norm": 9.540702819824219, + "learning_rate": 2.092035862268479e-06, + "loss": 1.5658, + "step": 16226 + }, + { + "epoch": 0.8114, + "grad_norm": 13.87845230102539, + "learning_rate": 2.0898997843859338e-06, + "loss": 1.6921, + "step": 16228 + }, + { + "epoch": 0.8115, + "grad_norm": 4.7062506675720215, + "learning_rate": 2.0877646703251e-06, + "loss": 0.7053, + "step": 16230 + }, + { + "epoch": 0.8116, + "grad_norm": 3.307257890701294, + "learning_rate": 2.0856305203461436e-06, + "loss": 0.5078, + "step": 16232 + }, + { + "epoch": 0.8117, + "grad_norm": 3.1191742420196533, + "learning_rate": 2.0834973347091016e-06, + "loss": 0.5771, + "step": 16234 + }, + { + "epoch": 0.8118, + "grad_norm": 5.52719783782959, + "learning_rate": 2.0813651136738957e-06, + "loss": 1.4806, + "step": 16236 + }, + { + "epoch": 0.8119, + "grad_norm": 4.215018272399902, + "learning_rate": 2.0792338575003303e-06, + "loss": 0.4267, + "step": 16238 + }, + { + "epoch": 0.812, + "grad_norm": 4.410096645355225, + "learning_rate": 2.0771035664480944e-06, + "loss": 1.1007, + "step": 16240 + }, + { + "epoch": 0.8121, + "grad_norm": 20.121366500854492, + "learning_rate": 2.074974240776755e-06, + "loss": 0.8493, + "step": 16242 + }, + { + "epoch": 0.8122, + "grad_norm": 4.805339813232422, + "learning_rate": 2.072845880745766e-06, + "loss": 0.8342, + "step": 16244 + }, + { + "epoch": 0.8123, + "grad_norm": 13.761454582214355, + "learning_rate": 2.0707184866144604e-06, + "loss": 0.9022, + "step": 16246 + }, + { + "epoch": 0.8124, + "grad_norm": 1.9093180894851685, + "learning_rate": 2.0685920586420562e-06, + "loss": 0.2583, + "step": 16248 + }, + { + "epoch": 0.8125, + "grad_norm": 14.07712459564209, + "learning_rate": 2.0664665970876496e-06, + "loss": 2.8293, + "step": 16250 + }, + { + "epoch": 0.8126, + "grad_norm": 4.005525588989258, + "learning_rate": 2.0643421022102216e-06, + "loss": 2.0644, + "step": 16252 + }, + { + "epoch": 0.8127, + "grad_norm": 2.903372287750244, + "learning_rate": 2.0622185742686418e-06, + "loss": 0.3341, + "step": 16254 + }, + { + "epoch": 0.8128, + "grad_norm": 0.9821724891662598, + "learning_rate": 2.0600960135216463e-06, + "loss": 0.2343, + "step": 16256 + }, + { + "epoch": 0.8129, + "grad_norm": 12.978788375854492, + "learning_rate": 2.0579744202278718e-06, + "loss": 0.5931, + "step": 16258 + }, + { + "epoch": 0.813, + "grad_norm": 3.301706314086914, + "learning_rate": 2.0558537946458177e-06, + "loss": 0.5308, + "step": 16260 + }, + { + "epoch": 0.8131, + "grad_norm": 8.206040382385254, + "learning_rate": 2.053734137033886e-06, + "loss": 1.0133, + "step": 16262 + }, + { + "epoch": 0.8132, + "grad_norm": 3.1616177558898926, + "learning_rate": 2.051615447650347e-06, + "loss": 1.3777, + "step": 16264 + }, + { + "epoch": 0.8133, + "grad_norm": 4.896703720092773, + "learning_rate": 2.049497726753351e-06, + "loss": 1.3042, + "step": 16266 + }, + { + "epoch": 0.8134, + "grad_norm": 10.726639747619629, + "learning_rate": 2.0473809746009444e-06, + "loss": 0.9989, + "step": 16268 + }, + { + "epoch": 0.8135, + "grad_norm": 5.4362969398498535, + "learning_rate": 2.0452651914510414e-06, + "loss": 0.8412, + "step": 16270 + }, + { + "epoch": 0.8136, + "grad_norm": 13.120713233947754, + "learning_rate": 2.0431503775614457e-06, + "loss": 1.2569, + "step": 16272 + }, + { + "epoch": 0.8137, + "grad_norm": 10.283708572387695, + "learning_rate": 2.041036533189842e-06, + "loss": 0.7974, + "step": 16274 + }, + { + "epoch": 0.8138, + "grad_norm": 9.18591022491455, + "learning_rate": 2.0389236585937944e-06, + "loss": 0.9052, + "step": 16276 + }, + { + "epoch": 0.8139, + "grad_norm": 7.84080696105957, + "learning_rate": 2.0368117540307496e-06, + "loss": 1.6034, + "step": 16278 + }, + { + "epoch": 0.814, + "grad_norm": 7.539074897766113, + "learning_rate": 2.0347008197580376e-06, + "loss": 1.053, + "step": 16280 + }, + { + "epoch": 0.8141, + "grad_norm": 4.68807315826416, + "learning_rate": 2.03259085603287e-06, + "loss": 1.6305, + "step": 16282 + }, + { + "epoch": 0.8142, + "grad_norm": 3.556530714035034, + "learning_rate": 2.0304818631123393e-06, + "loss": 0.2733, + "step": 16284 + }, + { + "epoch": 0.8143, + "grad_norm": 5.089840888977051, + "learning_rate": 2.0283738412534193e-06, + "loss": 1.0321, + "step": 16286 + }, + { + "epoch": 0.8144, + "grad_norm": 3.032911539077759, + "learning_rate": 2.026266790712965e-06, + "loss": 0.8324, + "step": 16288 + }, + { + "epoch": 0.8145, + "grad_norm": 5.177615165710449, + "learning_rate": 2.024160711747717e-06, + "loss": 0.702, + "step": 16290 + }, + { + "epoch": 0.8146, + "grad_norm": 5.165910720825195, + "learning_rate": 2.022055604614289e-06, + "loss": 0.9974, + "step": 16292 + }, + { + "epoch": 0.8147, + "grad_norm": 8.063267707824707, + "learning_rate": 2.0199514695691915e-06, + "loss": 1.2623, + "step": 16294 + }, + { + "epoch": 0.8148, + "grad_norm": 4.294336318969727, + "learning_rate": 2.017848306868797e-06, + "loss": 0.0641, + "step": 16296 + }, + { + "epoch": 0.8149, + "grad_norm": 1.4018495082855225, + "learning_rate": 2.0157461167693757e-06, + "loss": 0.821, + "step": 16298 + }, + { + "epoch": 0.815, + "grad_norm": 9.004457473754883, + "learning_rate": 2.013644899527074e-06, + "loss": 0.6453, + "step": 16300 + }, + { + "epoch": 0.8151, + "grad_norm": 3.771188974380493, + "learning_rate": 2.0115446553979103e-06, + "loss": 0.5449, + "step": 16302 + }, + { + "epoch": 0.8152, + "grad_norm": 3.2866408824920654, + "learning_rate": 2.009445384637805e-06, + "loss": 0.6565, + "step": 16304 + }, + { + "epoch": 0.8153, + "grad_norm": 2.028172254562378, + "learning_rate": 2.007347087502536e-06, + "loss": 0.5156, + "step": 16306 + }, + { + "epoch": 0.8154, + "grad_norm": 4.212270259857178, + "learning_rate": 2.005249764247783e-06, + "loss": 0.2947, + "step": 16308 + }, + { + "epoch": 0.8155, + "grad_norm": 3.4092719554901123, + "learning_rate": 2.0031534151290944e-06, + "loss": 0.6675, + "step": 16310 + }, + { + "epoch": 0.8156, + "grad_norm": 3.2565855979919434, + "learning_rate": 2.0010580404019066e-06, + "loss": 0.3483, + "step": 16312 + }, + { + "epoch": 0.8157, + "grad_norm": 1.9368925094604492, + "learning_rate": 1.998963640321533e-06, + "loss": 0.6428, + "step": 16314 + }, + { + "epoch": 0.8158, + "grad_norm": 5.095726013183594, + "learning_rate": 1.9968702151431697e-06, + "loss": 0.9057, + "step": 16316 + }, + { + "epoch": 0.8159, + "grad_norm": 3.0925962924957275, + "learning_rate": 1.994777765121895e-06, + "loss": 0.1962, + "step": 16318 + }, + { + "epoch": 0.816, + "grad_norm": 1.1760514974594116, + "learning_rate": 1.9926862905126663e-06, + "loss": 0.6387, + "step": 16320 + }, + { + "epoch": 0.8161, + "grad_norm": 1.6798255443572998, + "learning_rate": 1.9905957915703244e-06, + "loss": 0.3993, + "step": 16322 + }, + { + "epoch": 0.8162, + "grad_norm": 2.7325327396392822, + "learning_rate": 1.9885062685495905e-06, + "loss": 1.1391, + "step": 16324 + }, + { + "epoch": 0.8163, + "grad_norm": 2.2126264572143555, + "learning_rate": 1.9864177217050672e-06, + "loss": 0.6484, + "step": 16326 + }, + { + "epoch": 0.8164, + "grad_norm": 3.758502244949341, + "learning_rate": 1.984330151291233e-06, + "loss": 0.8134, + "step": 16328 + }, + { + "epoch": 0.8165, + "grad_norm": 3.8033506870269775, + "learning_rate": 1.982243557562461e-06, + "loss": 0.6697, + "step": 16330 + }, + { + "epoch": 0.8166, + "grad_norm": 7.108344078063965, + "learning_rate": 1.9801579407729866e-06, + "loss": 0.9819, + "step": 16332 + }, + { + "epoch": 0.8167, + "grad_norm": 4.374663829803467, + "learning_rate": 1.978073301176945e-06, + "loss": 1.4147, + "step": 16334 + }, + { + "epoch": 0.8168, + "grad_norm": 5.453912258148193, + "learning_rate": 1.9759896390283362e-06, + "loss": 0.931, + "step": 16336 + }, + { + "epoch": 0.8169, + "grad_norm": 5.845370769500732, + "learning_rate": 1.9739069545810484e-06, + "loss": 0.9124, + "step": 16338 + }, + { + "epoch": 0.817, + "grad_norm": 6.12211799621582, + "learning_rate": 1.9718252480888567e-06, + "loss": 1.0102, + "step": 16340 + }, + { + "epoch": 0.8171, + "grad_norm": 4.113263130187988, + "learning_rate": 1.9697445198054023e-06, + "loss": 1.2877, + "step": 16342 + }, + { + "epoch": 0.8172, + "grad_norm": 6.351430416107178, + "learning_rate": 1.9676647699842246e-06, + "loss": 0.7395, + "step": 16344 + }, + { + "epoch": 0.8173, + "grad_norm": 3.940298557281494, + "learning_rate": 1.965585998878724e-06, + "loss": 1.4829, + "step": 16346 + }, + { + "epoch": 0.8174, + "grad_norm": 1.2591191530227661, + "learning_rate": 1.963508206742202e-06, + "loss": 0.5008, + "step": 16348 + }, + { + "epoch": 0.8175, + "grad_norm": 13.250885963439941, + "learning_rate": 1.961431393827827e-06, + "loss": 0.8539, + "step": 16350 + }, + { + "epoch": 0.8176, + "grad_norm": 15.143732070922852, + "learning_rate": 1.959355560388654e-06, + "loss": 1.9473, + "step": 16352 + }, + { + "epoch": 0.8177, + "grad_norm": 3.300276279449463, + "learning_rate": 1.9572807066776145e-06, + "loss": 0.9849, + "step": 16354 + }, + { + "epoch": 0.8178, + "grad_norm": 4.795674800872803, + "learning_rate": 1.955206832947526e-06, + "loss": 1.0531, + "step": 16356 + }, + { + "epoch": 0.8179, + "grad_norm": 5.342112064361572, + "learning_rate": 1.9531339394510827e-06, + "loss": 0.5968, + "step": 16358 + }, + { + "epoch": 0.818, + "grad_norm": 3.311164140701294, + "learning_rate": 1.95106202644086e-06, + "loss": 0.6898, + "step": 16360 + }, + { + "epoch": 0.8181, + "grad_norm": 2.0724985599517822, + "learning_rate": 1.9489910941693134e-06, + "loss": 0.7214, + "step": 16362 + }, + { + "epoch": 0.8182, + "grad_norm": 6.415163040161133, + "learning_rate": 1.9469211428887813e-06, + "loss": 1.1284, + "step": 16364 + }, + { + "epoch": 0.8183, + "grad_norm": 4.672908306121826, + "learning_rate": 1.9448521728514802e-06, + "loss": 1.3986, + "step": 16366 + }, + { + "epoch": 0.8184, + "grad_norm": 2.9184815883636475, + "learning_rate": 1.9427841843095063e-06, + "loss": 0.3282, + "step": 16368 + }, + { + "epoch": 0.8185, + "grad_norm": 4.882077693939209, + "learning_rate": 1.940717177514844e-06, + "loss": 1.3889, + "step": 16370 + }, + { + "epoch": 0.8186, + "grad_norm": 5.7706804275512695, + "learning_rate": 1.938651152719344e-06, + "loss": 0.5397, + "step": 16372 + }, + { + "epoch": 0.8187, + "grad_norm": 5.3687028884887695, + "learning_rate": 1.9365861101747484e-06, + "loss": 0.3825, + "step": 16374 + }, + { + "epoch": 0.8188, + "grad_norm": 6.923152446746826, + "learning_rate": 1.934522050132678e-06, + "loss": 1.1635, + "step": 16376 + }, + { + "epoch": 0.8189, + "grad_norm": 3.924553871154785, + "learning_rate": 1.9324589728446265e-06, + "loss": 0.4262, + "step": 16378 + }, + { + "epoch": 0.819, + "grad_norm": 4.719447135925293, + "learning_rate": 1.930396878561983e-06, + "loss": 0.2711, + "step": 16380 + }, + { + "epoch": 0.8191, + "grad_norm": 4.704864025115967, + "learning_rate": 1.928335767535997e-06, + "loss": 1.2858, + "step": 16382 + }, + { + "epoch": 0.8192, + "grad_norm": 7.633813858032227, + "learning_rate": 1.9262756400178163e-06, + "loss": 0.5306, + "step": 16384 + }, + { + "epoch": 0.8193, + "grad_norm": 3.243206739425659, + "learning_rate": 1.9242164962584618e-06, + "loss": 1.3295, + "step": 16386 + }, + { + "epoch": 0.8194, + "grad_norm": 4.039931297302246, + "learning_rate": 1.9221583365088246e-06, + "loss": 0.9264, + "step": 16388 + }, + { + "epoch": 0.8195, + "grad_norm": 4.626771926879883, + "learning_rate": 1.9201011610196972e-06, + "loss": 0.9729, + "step": 16390 + }, + { + "epoch": 0.8196, + "grad_norm": 8.799589157104492, + "learning_rate": 1.918044970041729e-06, + "loss": 0.7665, + "step": 16392 + }, + { + "epoch": 0.8197, + "grad_norm": 9.575860023498535, + "learning_rate": 1.91598976382547e-06, + "loss": 0.8716, + "step": 16394 + }, + { + "epoch": 0.8198, + "grad_norm": 4.917582035064697, + "learning_rate": 1.9139355426213346e-06, + "loss": 0.3127, + "step": 16396 + }, + { + "epoch": 0.8199, + "grad_norm": 3.571366310119629, + "learning_rate": 1.9118823066796277e-06, + "loss": 0.9471, + "step": 16398 + }, + { + "epoch": 0.82, + "grad_norm": 3.0827600955963135, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.9515, + "step": 16400 + }, + { + "epoch": 0.8201, + "grad_norm": 5.55448055267334, + "learning_rate": 1.9077787915840928e-06, + "loss": 1.0304, + "step": 16402 + }, + { + "epoch": 0.8202, + "grad_norm": 8.471136093139648, + "learning_rate": 1.9057285129302682e-06, + "loss": 0.9657, + "step": 16404 + }, + { + "epoch": 0.8203, + "grad_norm": 6.7289814949035645, + "learning_rate": 1.903679220538871e-06, + "loss": 0.9387, + "step": 16406 + }, + { + "epoch": 0.8204, + "grad_norm": 5.5166850090026855, + "learning_rate": 1.9016309146596024e-06, + "loss": 0.6639, + "step": 16408 + }, + { + "epoch": 0.8205, + "grad_norm": 2.191983461380005, + "learning_rate": 1.8995835955420417e-06, + "loss": 0.8471, + "step": 16410 + }, + { + "epoch": 0.8206, + "grad_norm": 4.27327823638916, + "learning_rate": 1.8975372634356481e-06, + "loss": 0.964, + "step": 16412 + }, + { + "epoch": 0.8207, + "grad_norm": 3.556396722793579, + "learning_rate": 1.8954919185897592e-06, + "loss": 0.2455, + "step": 16414 + }, + { + "epoch": 0.8208, + "grad_norm": 5.678452014923096, + "learning_rate": 1.8934475612536019e-06, + "loss": 0.6999, + "step": 16416 + }, + { + "epoch": 0.8209, + "grad_norm": 8.072776794433594, + "learning_rate": 1.8914041916762648e-06, + "loss": 1.0046, + "step": 16418 + }, + { + "epoch": 0.821, + "grad_norm": 4.082847595214844, + "learning_rate": 1.8893618101067357e-06, + "loss": 1.158, + "step": 16420 + }, + { + "epoch": 0.8211, + "grad_norm": 4.131528377532959, + "learning_rate": 1.8873204167938652e-06, + "loss": 0.7034, + "step": 16422 + }, + { + "epoch": 0.8212, + "grad_norm": 3.490907669067383, + "learning_rate": 1.8852800119863912e-06, + "loss": 1.1811, + "step": 16424 + }, + { + "epoch": 0.8213, + "grad_norm": 4.826265811920166, + "learning_rate": 1.883240595932938e-06, + "loss": 1.1636, + "step": 16426 + }, + { + "epoch": 0.8214, + "grad_norm": 2.1811845302581787, + "learning_rate": 1.8812021688819914e-06, + "loss": 1.3901, + "step": 16428 + }, + { + "epoch": 0.8215, + "grad_norm": 1.5492846965789795, + "learning_rate": 1.8791647310819371e-06, + "loss": 0.7269, + "step": 16430 + }, + { + "epoch": 0.8216, + "grad_norm": 3.143782615661621, + "learning_rate": 1.8771282827810278e-06, + "loss": 0.4716, + "step": 16432 + }, + { + "epoch": 0.8217, + "grad_norm": 2.5803914070129395, + "learning_rate": 1.8750928242273969e-06, + "loss": 0.2783, + "step": 16434 + }, + { + "epoch": 0.8218, + "grad_norm": 8.776880264282227, + "learning_rate": 1.8730583556690607e-06, + "loss": 0.6556, + "step": 16436 + }, + { + "epoch": 0.8219, + "grad_norm": 7.059377670288086, + "learning_rate": 1.8710248773539118e-06, + "loss": 1.0354, + "step": 16438 + }, + { + "epoch": 0.822, + "grad_norm": 4.131236553192139, + "learning_rate": 1.8689923895297247e-06, + "loss": 1.0933, + "step": 16440 + }, + { + "epoch": 0.8221, + "grad_norm": 2.5014281272888184, + "learning_rate": 1.8669608924441497e-06, + "loss": 0.4535, + "step": 16442 + }, + { + "epoch": 0.8222, + "grad_norm": 2.6216490268707275, + "learning_rate": 1.86493038634472e-06, + "loss": 1.4445, + "step": 16444 + }, + { + "epoch": 0.8223, + "grad_norm": 11.62355899810791, + "learning_rate": 1.8629008714788466e-06, + "loss": 1.034, + "step": 16446 + }, + { + "epoch": 0.8224, + "grad_norm": 13.141670227050781, + "learning_rate": 1.8608723480938207e-06, + "loss": 1.5595, + "step": 16448 + }, + { + "epoch": 0.8225, + "grad_norm": 4.82578706741333, + "learning_rate": 1.858844816436809e-06, + "loss": 0.8952, + "step": 16450 + }, + { + "epoch": 0.8226, + "grad_norm": 4.242182731628418, + "learning_rate": 1.8568182767548626e-06, + "loss": 0.9567, + "step": 16452 + }, + { + "epoch": 0.8227, + "grad_norm": 4.856588363647461, + "learning_rate": 1.8547927292949053e-06, + "loss": 0.716, + "step": 16454 + }, + { + "epoch": 0.8228, + "grad_norm": 6.993559837341309, + "learning_rate": 1.8527681743037518e-06, + "loss": 1.212, + "step": 16456 + }, + { + "epoch": 0.8229, + "grad_norm": 6.479448318481445, + "learning_rate": 1.8507446120280814e-06, + "loss": 0.7112, + "step": 16458 + }, + { + "epoch": 0.823, + "grad_norm": 5.26952600479126, + "learning_rate": 1.848722042714457e-06, + "loss": 1.3553, + "step": 16460 + }, + { + "epoch": 0.8231, + "grad_norm": 10.081646919250488, + "learning_rate": 1.8467004666093325e-06, + "loss": 1.3072, + "step": 16462 + }, + { + "epoch": 0.8232, + "grad_norm": 3.069153308868408, + "learning_rate": 1.8446798839590186e-06, + "loss": 0.4752, + "step": 16464 + }, + { + "epoch": 0.8233, + "grad_norm": 3.4797585010528564, + "learning_rate": 1.8426602950097284e-06, + "loss": 0.6524, + "step": 16466 + }, + { + "epoch": 0.8234, + "grad_norm": 3.277897596359253, + "learning_rate": 1.8406417000075327e-06, + "loss": 1.4387, + "step": 16468 + }, + { + "epoch": 0.8235, + "grad_norm": 2.2908999919891357, + "learning_rate": 1.8386240991983973e-06, + "loss": 1.0682, + "step": 16470 + }, + { + "epoch": 0.8236, + "grad_norm": 12.960140228271484, + "learning_rate": 1.8366074928281608e-06, + "loss": 1.1891, + "step": 16472 + }, + { + "epoch": 0.8237, + "grad_norm": 10.759151458740234, + "learning_rate": 1.834591881142538e-06, + "loss": 0.4993, + "step": 16474 + }, + { + "epoch": 0.8238, + "grad_norm": 5.632076263427734, + "learning_rate": 1.8325772643871264e-06, + "loss": 1.1523, + "step": 16476 + }, + { + "epoch": 0.8239, + "grad_norm": 5.380593299865723, + "learning_rate": 1.8305636428074015e-06, + "loss": 0.5963, + "step": 16478 + }, + { + "epoch": 0.824, + "grad_norm": 2.2784152030944824, + "learning_rate": 1.8285510166487154e-06, + "loss": 1.2799, + "step": 16480 + }, + { + "epoch": 0.8241, + "grad_norm": 3.4330544471740723, + "learning_rate": 1.8265393861563019e-06, + "loss": 0.8384, + "step": 16482 + }, + { + "epoch": 0.8242, + "grad_norm": 3.6101760864257812, + "learning_rate": 1.8245287515752708e-06, + "loss": 1.1773, + "step": 16484 + }, + { + "epoch": 0.8243, + "grad_norm": 5.475780963897705, + "learning_rate": 1.8225191131506125e-06, + "loss": 0.8944, + "step": 16486 + }, + { + "epoch": 0.8244, + "grad_norm": 13.158224105834961, + "learning_rate": 1.820510471127196e-06, + "loss": 1.3742, + "step": 16488 + }, + { + "epoch": 0.8245, + "grad_norm": 0.8907073736190796, + "learning_rate": 1.818502825749764e-06, + "loss": 0.6513, + "step": 16490 + }, + { + "epoch": 0.8246, + "grad_norm": 6.936700820922852, + "learning_rate": 1.816496177262952e-06, + "loss": 1.3746, + "step": 16492 + }, + { + "epoch": 0.8247, + "grad_norm": 5.932499408721924, + "learning_rate": 1.8144905259112543e-06, + "loss": 0.527, + "step": 16494 + }, + { + "epoch": 0.8248, + "grad_norm": 3.231830358505249, + "learning_rate": 1.812485871939056e-06, + "loss": 0.7438, + "step": 16496 + }, + { + "epoch": 0.8249, + "grad_norm": 7.731083393096924, + "learning_rate": 1.8104822155906198e-06, + "loss": 0.7823, + "step": 16498 + }, + { + "epoch": 0.825, + "grad_norm": 7.732916355133057, + "learning_rate": 1.808479557110081e-06, + "loss": 1.1332, + "step": 16500 + }, + { + "epoch": 0.8251, + "grad_norm": 7.248221397399902, + "learning_rate": 1.8064778967414664e-06, + "loss": 1.1559, + "step": 16502 + }, + { + "epoch": 0.8252, + "grad_norm": 2.4405906200408936, + "learning_rate": 1.804477234728661e-06, + "loss": 0.4712, + "step": 16504 + }, + { + "epoch": 0.8253, + "grad_norm": 6.058358669281006, + "learning_rate": 1.8024775713154475e-06, + "loss": 1.1517, + "step": 16506 + }, + { + "epoch": 0.8254, + "grad_norm": 5.885146141052246, + "learning_rate": 1.8004789067454763e-06, + "loss": 1.2945, + "step": 16508 + }, + { + "epoch": 0.8255, + "grad_norm": 9.69728946685791, + "learning_rate": 1.7984812412622787e-06, + "loss": 0.3741, + "step": 16510 + }, + { + "epoch": 0.8256, + "grad_norm": 9.074531555175781, + "learning_rate": 1.7964845751092663e-06, + "loss": 0.5078, + "step": 16512 + }, + { + "epoch": 0.8257, + "grad_norm": 11.089240074157715, + "learning_rate": 1.794488908529719e-06, + "loss": 1.0091, + "step": 16514 + }, + { + "epoch": 0.8258, + "grad_norm": 3.5850889682769775, + "learning_rate": 1.7924942417668113e-06, + "loss": 0.7042, + "step": 16516 + }, + { + "epoch": 0.8259, + "grad_norm": 4.518642902374268, + "learning_rate": 1.790500575063584e-06, + "loss": 1.0348, + "step": 16518 + }, + { + "epoch": 0.826, + "grad_norm": 3.3525328636169434, + "learning_rate": 1.7885079086629598e-06, + "loss": 1.176, + "step": 16520 + }, + { + "epoch": 0.8261, + "grad_norm": 1.88568115234375, + "learning_rate": 1.7865162428077387e-06, + "loss": 0.8416, + "step": 16522 + }, + { + "epoch": 0.8262, + "grad_norm": 2.700667381286621, + "learning_rate": 1.7845255777406e-06, + "loss": 0.7267, + "step": 16524 + }, + { + "epoch": 0.8263, + "grad_norm": 2.443450689315796, + "learning_rate": 1.7825359137040987e-06, + "loss": 0.7241, + "step": 16526 + }, + { + "epoch": 0.8264, + "grad_norm": 3.457181930541992, + "learning_rate": 1.7805472509406695e-06, + "loss": 0.7607, + "step": 16528 + }, + { + "epoch": 0.8265, + "grad_norm": 4.181433200836182, + "learning_rate": 1.7785595896926267e-06, + "loss": 1.1315, + "step": 16530 + }, + { + "epoch": 0.8266, + "grad_norm": 7.25329065322876, + "learning_rate": 1.7765729302021596e-06, + "loss": 1.6469, + "step": 16532 + }, + { + "epoch": 0.8267, + "grad_norm": 5.232656955718994, + "learning_rate": 1.7745872727113356e-06, + "loss": 0.6192, + "step": 16534 + }, + { + "epoch": 0.8268, + "grad_norm": 4.0580153465271, + "learning_rate": 1.7726026174621004e-06, + "loss": 1.0427, + "step": 16536 + }, + { + "epoch": 0.8269, + "grad_norm": 3.778684616088867, + "learning_rate": 1.7706189646962846e-06, + "loss": 1.0421, + "step": 16538 + }, + { + "epoch": 0.827, + "grad_norm": 7.779934406280518, + "learning_rate": 1.7686363146555807e-06, + "loss": 1.1685, + "step": 16540 + }, + { + "epoch": 0.8271, + "grad_norm": 1.120281457901001, + "learning_rate": 1.7666546675815776e-06, + "loss": 1.3623, + "step": 16542 + }, + { + "epoch": 0.8272, + "grad_norm": 4.137245178222656, + "learning_rate": 1.7646740237157256e-06, + "loss": 0.7309, + "step": 16544 + }, + { + "epoch": 0.8273, + "grad_norm": 2.9216363430023193, + "learning_rate": 1.7626943832993649e-06, + "loss": 1.0081, + "step": 16546 + }, + { + "epoch": 0.8274, + "grad_norm": 3.9784014225006104, + "learning_rate": 1.760715746573709e-06, + "loss": 0.6791, + "step": 16548 + }, + { + "epoch": 0.8275, + "grad_norm": 4.6989641189575195, + "learning_rate": 1.7587381137798432e-06, + "loss": 1.3657, + "step": 16550 + }, + { + "epoch": 0.8276, + "grad_norm": 5.8845038414001465, + "learning_rate": 1.7567614851587444e-06, + "loss": 1.1321, + "step": 16552 + }, + { + "epoch": 0.8277, + "grad_norm": 4.025447368621826, + "learning_rate": 1.7547858609512492e-06, + "loss": 0.9908, + "step": 16554 + }, + { + "epoch": 0.8278, + "grad_norm": 7.675049304962158, + "learning_rate": 1.7528112413980892e-06, + "loss": 1.2135, + "step": 16556 + }, + { + "epoch": 0.8279, + "grad_norm": 3.2389843463897705, + "learning_rate": 1.750837626739863e-06, + "loss": 1.4275, + "step": 16558 + }, + { + "epoch": 0.828, + "grad_norm": 3.4664359092712402, + "learning_rate": 1.7488650172170496e-06, + "loss": 1.2444, + "step": 16560 + }, + { + "epoch": 0.8281, + "grad_norm": 1.208779215812683, + "learning_rate": 1.7468934130700044e-06, + "loss": 0.4302, + "step": 16562 + }, + { + "epoch": 0.8282, + "grad_norm": 4.127584457397461, + "learning_rate": 1.744922814538964e-06, + "loss": 0.3307, + "step": 16564 + }, + { + "epoch": 0.8283, + "grad_norm": 8.147751808166504, + "learning_rate": 1.7429532218640377e-06, + "loss": 0.9134, + "step": 16566 + }, + { + "epoch": 0.8284, + "grad_norm": 4.49747371673584, + "learning_rate": 1.7409846352852144e-06, + "loss": 0.9511, + "step": 16568 + }, + { + "epoch": 0.8285, + "grad_norm": 6.278622627258301, + "learning_rate": 1.7390170550423624e-06, + "loss": 1.3292, + "step": 16570 + }, + { + "epoch": 0.8286, + "grad_norm": 4.573216438293457, + "learning_rate": 1.7370504813752232e-06, + "loss": 1.8047, + "step": 16572 + }, + { + "epoch": 0.8287, + "grad_norm": 1.5184818506240845, + "learning_rate": 1.7350849145234183e-06, + "loss": 0.6811, + "step": 16574 + }, + { + "epoch": 0.8288, + "grad_norm": 1.4583438634872437, + "learning_rate": 1.7331203547264452e-06, + "loss": 1.0336, + "step": 16576 + }, + { + "epoch": 0.8289, + "grad_norm": 6.007341384887695, + "learning_rate": 1.7311568022236847e-06, + "loss": 0.7581, + "step": 16578 + }, + { + "epoch": 0.829, + "grad_norm": 0.3316022455692291, + "learning_rate": 1.7291942572543806e-06, + "loss": 0.6044, + "step": 16580 + }, + { + "epoch": 0.8291, + "grad_norm": 2.8469765186309814, + "learning_rate": 1.7272327200576743e-06, + "loss": 0.4553, + "step": 16582 + }, + { + "epoch": 0.8292, + "grad_norm": 3.3018290996551514, + "learning_rate": 1.7252721908725633e-06, + "loss": 0.6293, + "step": 16584 + }, + { + "epoch": 0.8293, + "grad_norm": 0.37268367409706116, + "learning_rate": 1.7233126699379344e-06, + "loss": 0.8449, + "step": 16586 + }, + { + "epoch": 0.8294, + "grad_norm": 8.417835235595703, + "learning_rate": 1.7213541574925551e-06, + "loss": 1.1836, + "step": 16588 + }, + { + "epoch": 0.8295, + "grad_norm": 5.237951278686523, + "learning_rate": 1.7193966537750561e-06, + "loss": 1.5659, + "step": 16590 + }, + { + "epoch": 0.8296, + "grad_norm": 3.1155800819396973, + "learning_rate": 1.7174401590239587e-06, + "loss": 0.5201, + "step": 16592 + }, + { + "epoch": 0.8297, + "grad_norm": 4.477454662322998, + "learning_rate": 1.7154846734776543e-06, + "loss": 0.4964, + "step": 16594 + }, + { + "epoch": 0.8298, + "grad_norm": 3.6844732761383057, + "learning_rate": 1.7135301973744122e-06, + "loss": 0.5671, + "step": 16596 + }, + { + "epoch": 0.8299, + "grad_norm": 4.312709808349609, + "learning_rate": 1.7115767309523811e-06, + "loss": 0.2432, + "step": 16598 + }, + { + "epoch": 0.83, + "grad_norm": 3.2352888584136963, + "learning_rate": 1.709624274449584e-06, + "loss": 1.1268, + "step": 16600 + }, + { + "epoch": 0.8301, + "grad_norm": 10.250940322875977, + "learning_rate": 1.70767282810392e-06, + "loss": 1.3177, + "step": 16602 + }, + { + "epoch": 0.8302, + "grad_norm": 4.319877624511719, + "learning_rate": 1.7057223921531706e-06, + "loss": 0.9306, + "step": 16604 + }, + { + "epoch": 0.8303, + "grad_norm": 3.6728286743164062, + "learning_rate": 1.7037729668349878e-06, + "loss": 0.3533, + "step": 16606 + }, + { + "epoch": 0.8304, + "grad_norm": 3.780168056488037, + "learning_rate": 1.7018245523869038e-06, + "loss": 0.4846, + "step": 16608 + }, + { + "epoch": 0.8305, + "grad_norm": 4.806090354919434, + "learning_rate": 1.6998771490463262e-06, + "loss": 1.0065, + "step": 16610 + }, + { + "epoch": 0.8306, + "grad_norm": 6.740401744842529, + "learning_rate": 1.6979307570505422e-06, + "loss": 0.7835, + "step": 16612 + }, + { + "epoch": 0.8307, + "grad_norm": 13.733887672424316, + "learning_rate": 1.6959853766367117e-06, + "loss": 1.0825, + "step": 16614 + }, + { + "epoch": 0.8308, + "grad_norm": 4.164363384246826, + "learning_rate": 1.6940410080418723e-06, + "loss": 1.135, + "step": 16616 + }, + { + "epoch": 0.8309, + "grad_norm": 6.7940354347229, + "learning_rate": 1.6920976515029463e-06, + "loss": 0.7043, + "step": 16618 + }, + { + "epoch": 0.831, + "grad_norm": 12.459733963012695, + "learning_rate": 1.6901553072567189e-06, + "loss": 0.4472, + "step": 16620 + }, + { + "epoch": 0.8311, + "grad_norm": 2.996462106704712, + "learning_rate": 1.6882139755398575e-06, + "loss": 1.1565, + "step": 16622 + }, + { + "epoch": 0.8312, + "grad_norm": 5.637897491455078, + "learning_rate": 1.686273656588917e-06, + "loss": 0.677, + "step": 16624 + }, + { + "epoch": 0.8313, + "grad_norm": 3.0218682289123535, + "learning_rate": 1.6843343506403076e-06, + "loss": 1.1266, + "step": 16626 + }, + { + "epoch": 0.8314, + "grad_norm": 3.151244878768921, + "learning_rate": 1.6823960579303378e-06, + "loss": 0.7513, + "step": 16628 + }, + { + "epoch": 0.8315, + "grad_norm": 13.72484016418457, + "learning_rate": 1.6804587786951744e-06, + "loss": 1.1849, + "step": 16630 + }, + { + "epoch": 0.8316, + "grad_norm": 4.372872829437256, + "learning_rate": 1.6785225131708749e-06, + "loss": 0.8889, + "step": 16632 + }, + { + "epoch": 0.8317, + "grad_norm": 4.45857572555542, + "learning_rate": 1.6765872615933676e-06, + "loss": 0.3727, + "step": 16634 + }, + { + "epoch": 0.8318, + "grad_norm": 3.2495694160461426, + "learning_rate": 1.6746530241984504e-06, + "loss": 0.7105, + "step": 16636 + }, + { + "epoch": 0.8319, + "grad_norm": 2.9872961044311523, + "learning_rate": 1.6727198012218115e-06, + "loss": 0.8533, + "step": 16638 + }, + { + "epoch": 0.832, + "grad_norm": 2.723998546600342, + "learning_rate": 1.6707875928990059e-06, + "loss": 1.1879, + "step": 16640 + }, + { + "epoch": 0.8321, + "grad_norm": 3.4205703735351562, + "learning_rate": 1.6688563994654661e-06, + "loss": 1.065, + "step": 16642 + }, + { + "epoch": 0.8322, + "grad_norm": 2.1904046535491943, + "learning_rate": 1.666926221156503e-06, + "loss": 0.1906, + "step": 16644 + }, + { + "epoch": 0.8323, + "grad_norm": 1.5400164127349854, + "learning_rate": 1.6649970582073027e-06, + "loss": 0.6483, + "step": 16646 + }, + { + "epoch": 0.8324, + "grad_norm": 20.34628677368164, + "learning_rate": 1.6630689108529286e-06, + "loss": 2.4993, + "step": 16648 + }, + { + "epoch": 0.8325, + "grad_norm": 3.8831217288970947, + "learning_rate": 1.6611417793283192e-06, + "loss": 0.2977, + "step": 16650 + }, + { + "epoch": 0.8326, + "grad_norm": 3.4156646728515625, + "learning_rate": 1.6592156638682887e-06, + "loss": 1.0892, + "step": 16652 + }, + { + "epoch": 0.8327, + "grad_norm": 3.040421962738037, + "learning_rate": 1.6572905647075299e-06, + "loss": 1.2488, + "step": 16654 + }, + { + "epoch": 0.8328, + "grad_norm": 1.763563632965088, + "learning_rate": 1.6553664820806102e-06, + "loss": 0.0777, + "step": 16656 + }, + { + "epoch": 0.8329, + "grad_norm": 4.971432685852051, + "learning_rate": 1.6534434162219727e-06, + "loss": 1.1323, + "step": 16658 + }, + { + "epoch": 0.833, + "grad_norm": 2.3296735286712646, + "learning_rate": 1.651521367365936e-06, + "loss": 0.814, + "step": 16660 + }, + { + "epoch": 0.8331, + "grad_norm": 7.619426250457764, + "learning_rate": 1.649600335746695e-06, + "loss": 0.9256, + "step": 16662 + }, + { + "epoch": 0.8332, + "grad_norm": 5.110507488250732, + "learning_rate": 1.6476803215983295e-06, + "loss": 0.9043, + "step": 16664 + }, + { + "epoch": 0.8333, + "grad_norm": 2.041769504547119, + "learning_rate": 1.6457613251547756e-06, + "loss": 0.2995, + "step": 16666 + }, + { + "epoch": 0.8334, + "grad_norm": 4.022482395172119, + "learning_rate": 1.643843346649866e-06, + "loss": 0.4618, + "step": 16668 + }, + { + "epoch": 0.8335, + "grad_norm": 7.577566623687744, + "learning_rate": 1.6419263863172997e-06, + "loss": 1.0313, + "step": 16670 + }, + { + "epoch": 0.8336, + "grad_norm": 4.3915324211120605, + "learning_rate": 1.6400104443906463e-06, + "loss": 1.2104, + "step": 16672 + }, + { + "epoch": 0.8337, + "grad_norm": 3.677987813949585, + "learning_rate": 1.6380955211033655e-06, + "loss": 0.5474, + "step": 16674 + }, + { + "epoch": 0.8338, + "grad_norm": 2.2258880138397217, + "learning_rate": 1.6361816166887768e-06, + "loss": 0.8571, + "step": 16676 + }, + { + "epoch": 0.8339, + "grad_norm": 0.4186985194683075, + "learning_rate": 1.634268731380091e-06, + "loss": 0.3011, + "step": 16678 + }, + { + "epoch": 0.834, + "grad_norm": 1.2658826112747192, + "learning_rate": 1.6323568654103838e-06, + "loss": 0.3497, + "step": 16680 + }, + { + "epoch": 0.8341, + "grad_norm": 8.47219467163086, + "learning_rate": 1.6304460190126103e-06, + "loss": 0.6358, + "step": 16682 + }, + { + "epoch": 0.8342, + "grad_norm": 3.452345848083496, + "learning_rate": 1.6285361924196031e-06, + "loss": 0.8465, + "step": 16684 + }, + { + "epoch": 0.8343, + "grad_norm": 5.264406681060791, + "learning_rate": 1.6266273858640659e-06, + "loss": 0.5363, + "step": 16686 + }, + { + "epoch": 0.8344, + "grad_norm": 3.1544594764709473, + "learning_rate": 1.6247195995785836e-06, + "loss": 0.5026, + "step": 16688 + }, + { + "epoch": 0.8345, + "grad_norm": 4.781060695648193, + "learning_rate": 1.6228128337956128e-06, + "loss": 0.9056, + "step": 16690 + }, + { + "epoch": 0.8346, + "grad_norm": 3.4816229343414307, + "learning_rate": 1.6209070887474876e-06, + "loss": 0.3557, + "step": 16692 + }, + { + "epoch": 0.8347, + "grad_norm": 3.123758554458618, + "learning_rate": 1.6190023646664178e-06, + "loss": 1.1884, + "step": 16694 + }, + { + "epoch": 0.8348, + "grad_norm": 5.669595718383789, + "learning_rate": 1.6170986617844864e-06, + "loss": 1.3152, + "step": 16696 + }, + { + "epoch": 0.8349, + "grad_norm": 6.916489601135254, + "learning_rate": 1.6151959803336537e-06, + "loss": 0.8079, + "step": 16698 + }, + { + "epoch": 0.835, + "grad_norm": 5.220053672790527, + "learning_rate": 1.6132943205457607e-06, + "loss": 1.0108, + "step": 16700 + }, + { + "epoch": 0.8351, + "grad_norm": 5.469123363494873, + "learning_rate": 1.611393682652511e-06, + "loss": 0.7954, + "step": 16702 + }, + { + "epoch": 0.8352, + "grad_norm": 2.064707040786743, + "learning_rate": 1.6094940668855008e-06, + "loss": 1.4594, + "step": 16704 + }, + { + "epoch": 0.8353, + "grad_norm": 4.282876491546631, + "learning_rate": 1.6075954734761844e-06, + "loss": 0.8307, + "step": 16706 + }, + { + "epoch": 0.8354, + "grad_norm": 5.299331188201904, + "learning_rate": 1.6056979026559005e-06, + "loss": 0.3247, + "step": 16708 + }, + { + "epoch": 0.8355, + "grad_norm": 2.6274757385253906, + "learning_rate": 1.6038013546558695e-06, + "loss": 0.9017, + "step": 16710 + }, + { + "epoch": 0.8356, + "grad_norm": 15.224793434143066, + "learning_rate": 1.601905829707171e-06, + "loss": 2.037, + "step": 16712 + }, + { + "epoch": 0.8357, + "grad_norm": 3.884166717529297, + "learning_rate": 1.600011328040777e-06, + "loss": 1.2492, + "step": 16714 + }, + { + "epoch": 0.8358, + "grad_norm": 5.444003105163574, + "learning_rate": 1.5981178498875182e-06, + "loss": 0.5712, + "step": 16716 + }, + { + "epoch": 0.8359, + "grad_norm": 7.629682540893555, + "learning_rate": 1.596225395478116e-06, + "loss": 0.8902, + "step": 16718 + }, + { + "epoch": 0.836, + "grad_norm": 4.961212635040283, + "learning_rate": 1.5943339650431578e-06, + "loss": 0.7279, + "step": 16720 + }, + { + "epoch": 0.8361, + "grad_norm": 4.542156219482422, + "learning_rate": 1.5924435588131093e-06, + "loss": 0.3584, + "step": 16722 + }, + { + "epoch": 0.8362, + "grad_norm": 8.65418815612793, + "learning_rate": 1.5905541770183096e-06, + "loss": 0.3526, + "step": 16724 + }, + { + "epoch": 0.8363, + "grad_norm": 6.726553916931152, + "learning_rate": 1.588665819888976e-06, + "loss": 0.962, + "step": 16726 + }, + { + "epoch": 0.8364, + "grad_norm": 3.906297445297241, + "learning_rate": 1.5867784876551973e-06, + "loss": 0.2997, + "step": 16728 + }, + { + "epoch": 0.8365, + "grad_norm": 4.032556056976318, + "learning_rate": 1.5848921805469396e-06, + "loss": 0.6175, + "step": 16730 + }, + { + "epoch": 0.8366, + "grad_norm": 2.2298667430877686, + "learning_rate": 1.583006898794044e-06, + "loss": 0.7355, + "step": 16732 + }, + { + "epoch": 0.8367, + "grad_norm": 3.5049688816070557, + "learning_rate": 1.581122642626226e-06, + "loss": 0.6825, + "step": 16734 + }, + { + "epoch": 0.8368, + "grad_norm": 4.325751304626465, + "learning_rate": 1.579239412273078e-06, + "loss": 0.6552, + "step": 16736 + }, + { + "epoch": 0.8369, + "grad_norm": 6.773179054260254, + "learning_rate": 1.577357207964062e-06, + "loss": 1.2075, + "step": 16738 + }, + { + "epoch": 0.837, + "grad_norm": 6.161369323730469, + "learning_rate": 1.5754760299285255e-06, + "loss": 1.2194, + "step": 16740 + }, + { + "epoch": 0.8371, + "grad_norm": 34.6231803894043, + "learning_rate": 1.5735958783956795e-06, + "loss": 1.8895, + "step": 16742 + }, + { + "epoch": 0.8372, + "grad_norm": 38.82673645019531, + "learning_rate": 1.5717167535946142e-06, + "loss": 1.5587, + "step": 16744 + }, + { + "epoch": 0.8373, + "grad_norm": 7.646658420562744, + "learning_rate": 1.5698386557542978e-06, + "loss": 1.5053, + "step": 16746 + }, + { + "epoch": 0.8374, + "grad_norm": 3.870633840560913, + "learning_rate": 1.5679615851035669e-06, + "loss": 1.0564, + "step": 16748 + }, + { + "epoch": 0.8375, + "grad_norm": 1.5268299579620361, + "learning_rate": 1.566085541871145e-06, + "loss": 0.8077, + "step": 16750 + }, + { + "epoch": 0.8376, + "grad_norm": 2.795027732849121, + "learning_rate": 1.5642105262856122e-06, + "loss": 0.986, + "step": 16752 + }, + { + "epoch": 0.8377, + "grad_norm": 12.738187789916992, + "learning_rate": 1.5623365385754408e-06, + "loss": 0.9742, + "step": 16754 + }, + { + "epoch": 0.8378, + "grad_norm": 4.728071212768555, + "learning_rate": 1.560463578968967e-06, + "loss": 0.9332, + "step": 16756 + }, + { + "epoch": 0.8379, + "grad_norm": 4.1622114181518555, + "learning_rate": 1.5585916476944074e-06, + "loss": 0.285, + "step": 16758 + }, + { + "epoch": 0.838, + "grad_norm": 10.658799171447754, + "learning_rate": 1.5567207449798517e-06, + "loss": 0.71, + "step": 16760 + }, + { + "epoch": 0.8381, + "grad_norm": 6.0711517333984375, + "learning_rate": 1.5548508710532573e-06, + "loss": 1.744, + "step": 16762 + }, + { + "epoch": 0.8382, + "grad_norm": 1.430808663368225, + "learning_rate": 1.55298202614247e-06, + "loss": 1.3393, + "step": 16764 + }, + { + "epoch": 0.8383, + "grad_norm": 7.665195941925049, + "learning_rate": 1.5511142104752009e-06, + "loss": 1.198, + "step": 16766 + }, + { + "epoch": 0.8384, + "grad_norm": 8.882551193237305, + "learning_rate": 1.5492474242790368e-06, + "loss": 0.7949, + "step": 16768 + }, + { + "epoch": 0.8385, + "grad_norm": 7.431225776672363, + "learning_rate": 1.547381667781439e-06, + "loss": 0.9862, + "step": 16770 + }, + { + "epoch": 0.8386, + "grad_norm": 1.6576968431472778, + "learning_rate": 1.545516941209747e-06, + "loss": 0.3897, + "step": 16772 + }, + { + "epoch": 0.8387, + "grad_norm": 5.575819969177246, + "learning_rate": 1.54365324479117e-06, + "loss": 1.3015, + "step": 16774 + }, + { + "epoch": 0.8388, + "grad_norm": 3.4037725925445557, + "learning_rate": 1.5417905787527943e-06, + "loss": 0.8311, + "step": 16776 + }, + { + "epoch": 0.8389, + "grad_norm": 4.008891582489014, + "learning_rate": 1.5399289433215792e-06, + "loss": 0.4441, + "step": 16778 + }, + { + "epoch": 0.839, + "grad_norm": 3.8073904514312744, + "learning_rate": 1.538068338724361e-06, + "loss": 0.7198, + "step": 16780 + }, + { + "epoch": 0.8391, + "grad_norm": 6.0912580490112305, + "learning_rate": 1.5362087651878477e-06, + "loss": 1.117, + "step": 16782 + }, + { + "epoch": 0.8392, + "grad_norm": 3.006481885910034, + "learning_rate": 1.5343502229386209e-06, + "loss": 0.6261, + "step": 16784 + }, + { + "epoch": 0.8393, + "grad_norm": 2.6815435886383057, + "learning_rate": 1.5324927122031452e-06, + "loss": 0.7812, + "step": 16786 + }, + { + "epoch": 0.8394, + "grad_norm": 0.7896661758422852, + "learning_rate": 1.530636233207743e-06, + "loss": 0.2874, + "step": 16788 + }, + { + "epoch": 0.8395, + "grad_norm": 4.228903293609619, + "learning_rate": 1.5287807861786308e-06, + "loss": 1.1075, + "step": 16790 + }, + { + "epoch": 0.8396, + "grad_norm": 2.109663486480713, + "learning_rate": 1.526926371341878e-06, + "loss": 0.0854, + "step": 16792 + }, + { + "epoch": 0.8397, + "grad_norm": 8.265311241149902, + "learning_rate": 1.5250729889234483e-06, + "loss": 1.9885, + "step": 16794 + }, + { + "epoch": 0.8398, + "grad_norm": 4.576780796051025, + "learning_rate": 1.52322063914917e-06, + "loss": 0.5524, + "step": 16796 + }, + { + "epoch": 0.8399, + "grad_norm": 9.415674209594727, + "learning_rate": 1.521369322244739e-06, + "loss": 1.397, + "step": 16798 + }, + { + "epoch": 0.84, + "grad_norm": 8.053428649902344, + "learning_rate": 1.5195190384357405e-06, + "loss": 0.3905, + "step": 16800 + }, + { + "epoch": 0.8401, + "grad_norm": 3.0312016010284424, + "learning_rate": 1.5176697879476233e-06, + "loss": 0.8796, + "step": 16802 + }, + { + "epoch": 0.8402, + "grad_norm": 8.614913940429688, + "learning_rate": 1.5158215710057123e-06, + "loss": 0.3851, + "step": 16804 + }, + { + "epoch": 0.8403, + "grad_norm": 4.608804225921631, + "learning_rate": 1.5139743878352075e-06, + "loss": 0.994, + "step": 16806 + }, + { + "epoch": 0.8404, + "grad_norm": 3.0936460494995117, + "learning_rate": 1.5121282386611823e-06, + "loss": 0.3349, + "step": 16808 + }, + { + "epoch": 0.8405, + "grad_norm": 4.876464366912842, + "learning_rate": 1.5102831237085857e-06, + "loss": 0.6765, + "step": 16810 + }, + { + "epoch": 0.8406, + "grad_norm": 3.769578695297241, + "learning_rate": 1.5084390432022377e-06, + "loss": 0.9406, + "step": 16812 + }, + { + "epoch": 0.8407, + "grad_norm": 5.3068976402282715, + "learning_rate": 1.5065959973668355e-06, + "loss": 0.6003, + "step": 16814 + }, + { + "epoch": 0.8408, + "grad_norm": 3.9119696617126465, + "learning_rate": 1.5047539864269477e-06, + "loss": 0.9528, + "step": 16816 + }, + { + "epoch": 0.8409, + "grad_norm": 3.1731350421905518, + "learning_rate": 1.5029130106070166e-06, + "loss": 0.7183, + "step": 16818 + }, + { + "epoch": 0.841, + "grad_norm": 6.656569957733154, + "learning_rate": 1.5010730701313626e-06, + "loss": 1.075, + "step": 16820 + }, + { + "epoch": 0.8411, + "grad_norm": 1.8444610834121704, + "learning_rate": 1.4992341652241738e-06, + "loss": 0.8269, + "step": 16822 + }, + { + "epoch": 0.8412, + "grad_norm": 6.758680820465088, + "learning_rate": 1.4973962961095135e-06, + "loss": 0.9504, + "step": 16824 + }, + { + "epoch": 0.8413, + "grad_norm": 5.005814552307129, + "learning_rate": 1.495559463011329e-06, + "loss": 0.9588, + "step": 16826 + }, + { + "epoch": 0.8414, + "grad_norm": 6.459672451019287, + "learning_rate": 1.4937236661534227e-06, + "loss": 1.062, + "step": 16828 + }, + { + "epoch": 0.8415, + "grad_norm": 5.685894012451172, + "learning_rate": 1.4918889057594876e-06, + "loss": 0.9777, + "step": 16830 + }, + { + "epoch": 0.8416, + "grad_norm": 3.594468116760254, + "learning_rate": 1.490055182053083e-06, + "loss": 0.8644, + "step": 16832 + }, + { + "epoch": 0.8417, + "grad_norm": 11.989259719848633, + "learning_rate": 1.4882224952576373e-06, + "loss": 1.5166, + "step": 16834 + }, + { + "epoch": 0.8418, + "grad_norm": 5.248622894287109, + "learning_rate": 1.486390845596466e-06, + "loss": 1.4181, + "step": 16836 + }, + { + "epoch": 0.8419, + "grad_norm": 1.9847044944763184, + "learning_rate": 1.4845602332927411e-06, + "loss": 1.254, + "step": 16838 + }, + { + "epoch": 0.842, + "grad_norm": 5.0837225914001465, + "learning_rate": 1.4827306585695234e-06, + "loss": 0.9812, + "step": 16840 + }, + { + "epoch": 0.8421, + "grad_norm": 5.795083045959473, + "learning_rate": 1.48090212164974e-06, + "loss": 0.6492, + "step": 16842 + }, + { + "epoch": 0.8422, + "grad_norm": 12.134989738464355, + "learning_rate": 1.4790746227561925e-06, + "loss": 0.7289, + "step": 16844 + }, + { + "epoch": 0.8423, + "grad_norm": 3.494856119155884, + "learning_rate": 1.4772481621115543e-06, + "loss": 0.9476, + "step": 16846 + }, + { + "epoch": 0.8424, + "grad_norm": 3.1662158966064453, + "learning_rate": 1.4754227399383758e-06, + "loss": 1.0193, + "step": 16848 + }, + { + "epoch": 0.8425, + "grad_norm": 6.591827392578125, + "learning_rate": 1.4735983564590784e-06, + "loss": 0.6645, + "step": 16850 + }, + { + "epoch": 0.8426, + "grad_norm": 9.248714447021484, + "learning_rate": 1.4717750118959583e-06, + "loss": 1.3266, + "step": 16852 + }, + { + "epoch": 0.8427, + "grad_norm": 4.980313301086426, + "learning_rate": 1.4699527064711838e-06, + "loss": 1.1726, + "step": 16854 + }, + { + "epoch": 0.8428, + "grad_norm": 8.41786003112793, + "learning_rate": 1.468131440406798e-06, + "loss": 1.5527, + "step": 16856 + }, + { + "epoch": 0.8429, + "grad_norm": 4.6740641593933105, + "learning_rate": 1.466311213924716e-06, + "loss": 0.5165, + "step": 16858 + }, + { + "epoch": 0.843, + "grad_norm": 4.810483932495117, + "learning_rate": 1.4644920272467245e-06, + "loss": 0.9105, + "step": 16860 + }, + { + "epoch": 0.8431, + "grad_norm": 4.32034969329834, + "learning_rate": 1.462673880594494e-06, + "loss": 0.9001, + "step": 16862 + }, + { + "epoch": 0.8432, + "grad_norm": 7.293081283569336, + "learning_rate": 1.4608567741895496e-06, + "loss": 0.4642, + "step": 16864 + }, + { + "epoch": 0.8433, + "grad_norm": 3.3814618587493896, + "learning_rate": 1.4590407082533099e-06, + "loss": 1.1359, + "step": 16866 + }, + { + "epoch": 0.8434, + "grad_norm": 7.3514227867126465, + "learning_rate": 1.4572256830070497e-06, + "loss": 0.5944, + "step": 16868 + }, + { + "epoch": 0.8435, + "grad_norm": 3.424968719482422, + "learning_rate": 1.4554116986719258e-06, + "loss": 0.3662, + "step": 16870 + }, + { + "epoch": 0.8436, + "grad_norm": 5.501387119293213, + "learning_rate": 1.4535987554689712e-06, + "loss": 0.6297, + "step": 16872 + }, + { + "epoch": 0.8437, + "grad_norm": 4.592956066131592, + "learning_rate": 1.4517868536190804e-06, + "loss": 0.5831, + "step": 16874 + }, + { + "epoch": 0.8438, + "grad_norm": 8.863179206848145, + "learning_rate": 1.4499759933430347e-06, + "loss": 0.871, + "step": 16876 + }, + { + "epoch": 0.8439, + "grad_norm": 7.996165752410889, + "learning_rate": 1.4481661748614783e-06, + "loss": 0.9061, + "step": 16878 + }, + { + "epoch": 0.844, + "grad_norm": 8.322800636291504, + "learning_rate": 1.446357398394934e-06, + "loss": 0.6586, + "step": 16880 + }, + { + "epoch": 0.8441, + "grad_norm": 4.084438800811768, + "learning_rate": 1.4445496641637967e-06, + "loss": 0.9868, + "step": 16882 + }, + { + "epoch": 0.8442, + "grad_norm": 8.046995162963867, + "learning_rate": 1.4427429723883256e-06, + "loss": 0.8695, + "step": 16884 + }, + { + "epoch": 0.8443, + "grad_norm": 8.872368812561035, + "learning_rate": 1.4409373232886703e-06, + "loss": 0.8687, + "step": 16886 + }, + { + "epoch": 0.8444, + "grad_norm": 2.431061029434204, + "learning_rate": 1.439132717084839e-06, + "loss": 0.8044, + "step": 16888 + }, + { + "epoch": 0.8445, + "grad_norm": 3.2689011096954346, + "learning_rate": 1.4373291539967182e-06, + "loss": 0.6447, + "step": 16890 + }, + { + "epoch": 0.8446, + "grad_norm": 3.485478162765503, + "learning_rate": 1.4355266342440678e-06, + "loss": 1.0027, + "step": 16892 + }, + { + "epoch": 0.8447, + "grad_norm": 15.743244171142578, + "learning_rate": 1.4337251580465173e-06, + "loss": 0.9972, + "step": 16894 + }, + { + "epoch": 0.8448, + "grad_norm": 4.821179389953613, + "learning_rate": 1.4319247256235713e-06, + "loss": 1.3845, + "step": 16896 + }, + { + "epoch": 0.8449, + "grad_norm": 7.800971984863281, + "learning_rate": 1.430125337194609e-06, + "loss": 1.1325, + "step": 16898 + }, + { + "epoch": 0.845, + "grad_norm": 13.556204795837402, + "learning_rate": 1.4283269929788779e-06, + "loss": 1.4721, + "step": 16900 + }, + { + "epoch": 0.8451, + "grad_norm": 3.9172141551971436, + "learning_rate": 1.426529693195503e-06, + "loss": 0.9278, + "step": 16902 + }, + { + "epoch": 0.8452, + "grad_norm": 4.100444316864014, + "learning_rate": 1.4247334380634792e-06, + "loss": 1.3856, + "step": 16904 + }, + { + "epoch": 0.8453, + "grad_norm": 3.0142552852630615, + "learning_rate": 1.4229382278016712e-06, + "loss": 1.1644, + "step": 16906 + }, + { + "epoch": 0.8454, + "grad_norm": 7.694007396697998, + "learning_rate": 1.4211440626288286e-06, + "loss": 1.7269, + "step": 16908 + }, + { + "epoch": 0.8455, + "grad_norm": 4.280798435211182, + "learning_rate": 1.4193509427635543e-06, + "loss": 1.0279, + "step": 16910 + }, + { + "epoch": 0.8456, + "grad_norm": 5.7010908126831055, + "learning_rate": 1.4175588684243447e-06, + "loss": 1.0229, + "step": 16912 + }, + { + "epoch": 0.8457, + "grad_norm": 6.234348297119141, + "learning_rate": 1.4157678398295483e-06, + "loss": 0.8566, + "step": 16914 + }, + { + "epoch": 0.8458, + "grad_norm": 4.561542987823486, + "learning_rate": 1.413977857197405e-06, + "loss": 0.4247, + "step": 16916 + }, + { + "epoch": 0.8459, + "grad_norm": 5.086822509765625, + "learning_rate": 1.4121889207460171e-06, + "loss": 0.7146, + "step": 16918 + }, + { + "epoch": 0.846, + "grad_norm": 26.53534507751465, + "learning_rate": 1.4104010306933558e-06, + "loss": 2.7626, + "step": 16920 + }, + { + "epoch": 0.8461, + "grad_norm": 5.727695465087891, + "learning_rate": 1.408614187257279e-06, + "loss": 0.9082, + "step": 16922 + }, + { + "epoch": 0.8462, + "grad_norm": 1.1136960983276367, + "learning_rate": 1.4068283906554969e-06, + "loss": 0.0456, + "step": 16924 + }, + { + "epoch": 0.8463, + "grad_norm": 6.879436492919922, + "learning_rate": 1.4050436411056122e-06, + "loss": 1.0368, + "step": 16926 + }, + { + "epoch": 0.8464, + "grad_norm": 2.5337510108947754, + "learning_rate": 1.40325993882509e-06, + "loss": 1.8071, + "step": 16928 + }, + { + "epoch": 0.8465, + "grad_norm": 5.922140598297119, + "learning_rate": 1.4014772840312663e-06, + "loss": 0.4796, + "step": 16930 + }, + { + "epoch": 0.8466, + "grad_norm": 8.85401439666748, + "learning_rate": 1.399695676941354e-06, + "loss": 1.6539, + "step": 16932 + }, + { + "epoch": 0.8467, + "grad_norm": 12.023006439208984, + "learning_rate": 1.3979151177724348e-06, + "loss": 0.8771, + "step": 16934 + }, + { + "epoch": 0.8468, + "grad_norm": 3.257256031036377, + "learning_rate": 1.3961356067414667e-06, + "loss": 0.6253, + "step": 16936 + }, + { + "epoch": 0.8469, + "grad_norm": 9.046489715576172, + "learning_rate": 1.394357144065277e-06, + "loss": 0.5021, + "step": 16938 + }, + { + "epoch": 0.847, + "grad_norm": 3.8683247566223145, + "learning_rate": 1.3925797299605649e-06, + "loss": 1.3342, + "step": 16940 + }, + { + "epoch": 0.8471, + "grad_norm": 4.280135154724121, + "learning_rate": 1.3908033646439034e-06, + "loss": 2.3549, + "step": 16942 + }, + { + "epoch": 0.8472, + "grad_norm": 1.377642035484314, + "learning_rate": 1.3890280483317375e-06, + "loss": 0.7427, + "step": 16944 + }, + { + "epoch": 0.8473, + "grad_norm": 2.243122100830078, + "learning_rate": 1.387253781240383e-06, + "loss": 1.4644, + "step": 16946 + }, + { + "epoch": 0.8474, + "grad_norm": 7.176146030426025, + "learning_rate": 1.3854805635860335e-06, + "loss": 0.8897, + "step": 16948 + }, + { + "epoch": 0.8475, + "grad_norm": 6.99845027923584, + "learning_rate": 1.3837083955847418e-06, + "loss": 1.131, + "step": 16950 + }, + { + "epoch": 0.8476, + "grad_norm": 3.5270426273345947, + "learning_rate": 1.381937277452451e-06, + "loss": 0.643, + "step": 16952 + }, + { + "epoch": 0.8477, + "grad_norm": 6.118293285369873, + "learning_rate": 1.38016720940496e-06, + "loss": 0.2355, + "step": 16954 + }, + { + "epoch": 0.8478, + "grad_norm": 8.48019790649414, + "learning_rate": 1.3783981916579448e-06, + "loss": 2.4258, + "step": 16956 + }, + { + "epoch": 0.8479, + "grad_norm": 1.6404101848602295, + "learning_rate": 1.3766302244269624e-06, + "loss": 0.7702, + "step": 16958 + }, + { + "epoch": 0.848, + "grad_norm": 4.227622032165527, + "learning_rate": 1.3748633079274254e-06, + "loss": 1.2279, + "step": 16960 + }, + { + "epoch": 0.8481, + "grad_norm": 3.431091785430908, + "learning_rate": 1.3730974423746334e-06, + "loss": 0.5546, + "step": 16962 + }, + { + "epoch": 0.8482, + "grad_norm": 4.16981840133667, + "learning_rate": 1.3713326279837502e-06, + "loss": 1.1659, + "step": 16964 + }, + { + "epoch": 0.8483, + "grad_norm": 6.079126358032227, + "learning_rate": 1.3695688649698124e-06, + "loss": 0.8195, + "step": 16966 + }, + { + "epoch": 0.8484, + "grad_norm": 4.843242168426514, + "learning_rate": 1.3678061535477305e-06, + "loss": 0.4653, + "step": 16968 + }, + { + "epoch": 0.8485, + "grad_norm": 3.257268190383911, + "learning_rate": 1.3660444939322837e-06, + "loss": 1.6373, + "step": 16970 + }, + { + "epoch": 0.8486, + "grad_norm": 5.696289539337158, + "learning_rate": 1.3642838863381258e-06, + "loss": 1.157, + "step": 16972 + }, + { + "epoch": 0.8487, + "grad_norm": 13.532317161560059, + "learning_rate": 1.362524330979782e-06, + "loss": 0.7996, + "step": 16974 + }, + { + "epoch": 0.8488, + "grad_norm": 2.042363405227661, + "learning_rate": 1.3607658280716474e-06, + "loss": 0.5683, + "step": 16976 + }, + { + "epoch": 0.8489, + "grad_norm": 8.775580406188965, + "learning_rate": 1.3590083778279917e-06, + "loss": 1.1347, + "step": 16978 + }, + { + "epoch": 0.849, + "grad_norm": 5.169849395751953, + "learning_rate": 1.3572519804629537e-06, + "loss": 0.7496, + "step": 16980 + }, + { + "epoch": 0.8491, + "grad_norm": 13.528088569641113, + "learning_rate": 1.3554966361905465e-06, + "loss": 0.8602, + "step": 16982 + }, + { + "epoch": 0.8492, + "grad_norm": 16.628747940063477, + "learning_rate": 1.3537423452246522e-06, + "loss": 0.9999, + "step": 16984 + }, + { + "epoch": 0.8493, + "grad_norm": 11.656760215759277, + "learning_rate": 1.3519891077790237e-06, + "loss": 0.6407, + "step": 16986 + }, + { + "epoch": 0.8494, + "grad_norm": 3.2251315116882324, + "learning_rate": 1.3502369240672941e-06, + "loss": 0.7267, + "step": 16988 + }, + { + "epoch": 0.8495, + "grad_norm": 5.47212553024292, + "learning_rate": 1.3484857943029572e-06, + "loss": 0.7435, + "step": 16990 + }, + { + "epoch": 0.8496, + "grad_norm": 9.698142051696777, + "learning_rate": 1.3467357186993802e-06, + "loss": 0.6345, + "step": 16992 + }, + { + "epoch": 0.8497, + "grad_norm": 0.8942798376083374, + "learning_rate": 1.3449866974698123e-06, + "loss": 0.4016, + "step": 16994 + }, + { + "epoch": 0.8498, + "grad_norm": 12.881036758422852, + "learning_rate": 1.3432387308273576e-06, + "loss": 1.0084, + "step": 16996 + }, + { + "epoch": 0.8499, + "grad_norm": 3.0976014137268066, + "learning_rate": 1.341491818985009e-06, + "loss": 0.0459, + "step": 16998 + }, + { + "epoch": 0.85, + "grad_norm": 4.770663738250732, + "learning_rate": 1.339745962155613e-06, + "loss": 0.5169, + "step": 17000 + }, + { + "epoch": 0.8501, + "grad_norm": 2.5696568489074707, + "learning_rate": 1.338001160551906e-06, + "loss": 0.6317, + "step": 17002 + }, + { + "epoch": 0.8502, + "grad_norm": 2.420480489730835, + "learning_rate": 1.3362574143864816e-06, + "loss": 0.6389, + "step": 17004 + }, + { + "epoch": 0.8503, + "grad_norm": 16.604177474975586, + "learning_rate": 1.3345147238718125e-06, + "loss": 0.4794, + "step": 17006 + }, + { + "epoch": 0.8504, + "grad_norm": 2.8219637870788574, + "learning_rate": 1.3327730892202384e-06, + "loss": 1.105, + "step": 17008 + }, + { + "epoch": 0.8505, + "grad_norm": 4.920712947845459, + "learning_rate": 1.3310325106439725e-06, + "loss": 1.1698, + "step": 17010 + }, + { + "epoch": 0.8506, + "grad_norm": 1.3171876668930054, + "learning_rate": 1.3292929883550998e-06, + "loss": 0.4645, + "step": 17012 + }, + { + "epoch": 0.8507, + "grad_norm": 7.18240213394165, + "learning_rate": 1.3275545225655762e-06, + "loss": 1.0558, + "step": 17014 + }, + { + "epoch": 0.8508, + "grad_norm": 4.242491245269775, + "learning_rate": 1.3258171134872267e-06, + "loss": 1.0473, + "step": 17016 + }, + { + "epoch": 0.8509, + "grad_norm": 5.970418930053711, + "learning_rate": 1.3240807613317508e-06, + "loss": 1.1019, + "step": 17018 + }, + { + "epoch": 0.851, + "grad_norm": 3.52459979057312, + "learning_rate": 1.322345466310717e-06, + "loss": 1.5667, + "step": 17020 + }, + { + "epoch": 0.8511, + "grad_norm": 2.6932640075683594, + "learning_rate": 1.3206112286355633e-06, + "loss": 1.1689, + "step": 17022 + }, + { + "epoch": 0.8512, + "grad_norm": 2.0136570930480957, + "learning_rate": 1.3188780485176089e-06, + "loss": 0.5525, + "step": 17024 + }, + { + "epoch": 0.8513, + "grad_norm": 4.553659439086914, + "learning_rate": 1.3171459261680297e-06, + "loss": 0.8558, + "step": 17026 + }, + { + "epoch": 0.8514, + "grad_norm": 2.5703108310699463, + "learning_rate": 1.3154148617978813e-06, + "loss": 0.6262, + "step": 17028 + }, + { + "epoch": 0.8515, + "grad_norm": 6.610842227935791, + "learning_rate": 1.3136848556180893e-06, + "loss": 1.1311, + "step": 17030 + }, + { + "epoch": 0.8516, + "grad_norm": 5.783816337585449, + "learning_rate": 1.3119559078394462e-06, + "loss": 0.9616, + "step": 17032 + }, + { + "epoch": 0.8517, + "grad_norm": 2.296308755874634, + "learning_rate": 1.310228018672627e-06, + "loss": 0.5425, + "step": 17034 + }, + { + "epoch": 0.8518, + "grad_norm": 7.469788551330566, + "learning_rate": 1.3085011883281606e-06, + "loss": 0.5624, + "step": 17036 + }, + { + "epoch": 0.8519, + "grad_norm": 2.414443016052246, + "learning_rate": 1.3067754170164615e-06, + "loss": 0.2802, + "step": 17038 + }, + { + "epoch": 0.852, + "grad_norm": 12.366231918334961, + "learning_rate": 1.30505070494781e-06, + "loss": 1.0297, + "step": 17040 + }, + { + "epoch": 0.8521, + "grad_norm": 2.886026620864868, + "learning_rate": 1.303327052332355e-06, + "loss": 0.2834, + "step": 17042 + }, + { + "epoch": 0.8522, + "grad_norm": 6.412732124328613, + "learning_rate": 1.3016044593801202e-06, + "loss": 1.4945, + "step": 17044 + }, + { + "epoch": 0.8523, + "grad_norm": 14.894566535949707, + "learning_rate": 1.2998829263009937e-06, + "loss": 1.9633, + "step": 17046 + }, + { + "epoch": 0.8524, + "grad_norm": 3.3542656898498535, + "learning_rate": 1.2981624533047432e-06, + "loss": 0.504, + "step": 17048 + }, + { + "epoch": 0.8525, + "grad_norm": 3.7724006175994873, + "learning_rate": 1.2964430406010032e-06, + "loss": 0.2762, + "step": 17050 + }, + { + "epoch": 0.8526, + "grad_norm": 6.330784797668457, + "learning_rate": 1.294724688399278e-06, + "loss": 0.7279, + "step": 17052 + }, + { + "epoch": 0.8527, + "grad_norm": 2.8858845233917236, + "learning_rate": 1.2930073969089442e-06, + "loss": 1.0442, + "step": 17054 + }, + { + "epoch": 0.8528, + "grad_norm": 2.6938729286193848, + "learning_rate": 1.2912911663392468e-06, + "loss": 0.9882, + "step": 17056 + }, + { + "epoch": 0.8529, + "grad_norm": 2.9629006385803223, + "learning_rate": 1.289575996899305e-06, + "loss": 0.7361, + "step": 17058 + }, + { + "epoch": 0.853, + "grad_norm": 3.9707398414611816, + "learning_rate": 1.2878618887981064e-06, + "loss": 1.3744, + "step": 17060 + }, + { + "epoch": 0.8531, + "grad_norm": 3.7655563354492188, + "learning_rate": 1.28614884224451e-06, + "loss": 1.9194, + "step": 17062 + }, + { + "epoch": 0.8532, + "grad_norm": 7.7704620361328125, + "learning_rate": 1.2844368574472454e-06, + "loss": 0.9001, + "step": 17064 + }, + { + "epoch": 0.8533, + "grad_norm": 5.0851969718933105, + "learning_rate": 1.2827259346149123e-06, + "loss": 0.5631, + "step": 17066 + }, + { + "epoch": 0.8534, + "grad_norm": 3.6661272048950195, + "learning_rate": 1.2810160739559797e-06, + "loss": 0.7517, + "step": 17068 + }, + { + "epoch": 0.8535, + "grad_norm": 5.625874042510986, + "learning_rate": 1.279307275678795e-06, + "loss": 0.893, + "step": 17070 + }, + { + "epoch": 0.8536, + "grad_norm": 4.3471808433532715, + "learning_rate": 1.277599539991563e-06, + "loss": 0.989, + "step": 17072 + }, + { + "epoch": 0.8537, + "grad_norm": 7.256825923919678, + "learning_rate": 1.2758928671023718e-06, + "loss": 1.6342, + "step": 17074 + }, + { + "epoch": 0.8538, + "grad_norm": 14.533737182617188, + "learning_rate": 1.2741872572191684e-06, + "loss": 2.3348, + "step": 17076 + }, + { + "epoch": 0.8539, + "grad_norm": 4.8000993728637695, + "learning_rate": 1.2724827105497816e-06, + "loss": 0.8303, + "step": 17078 + }, + { + "epoch": 0.854, + "grad_norm": 4.933955192565918, + "learning_rate": 1.2707792273019049e-06, + "loss": 1.0484, + "step": 17080 + }, + { + "epoch": 0.8541, + "grad_norm": 1.2069295644760132, + "learning_rate": 1.2690768076830972e-06, + "loss": 0.7645, + "step": 17082 + }, + { + "epoch": 0.8542, + "grad_norm": 4.474284648895264, + "learning_rate": 1.2673754519008008e-06, + "loss": 0.8876, + "step": 17084 + }, + { + "epoch": 0.8543, + "grad_norm": 3.338794231414795, + "learning_rate": 1.2656751601623117e-06, + "loss": 0.796, + "step": 17086 + }, + { + "epoch": 0.8544, + "grad_norm": 3.433633804321289, + "learning_rate": 1.2639759326748136e-06, + "loss": 1.2044, + "step": 17088 + }, + { + "epoch": 0.8545, + "grad_norm": 6.920332908630371, + "learning_rate": 1.2622777696453482e-06, + "loss": 0.8188, + "step": 17090 + }, + { + "epoch": 0.8546, + "grad_norm": 2.7760396003723145, + "learning_rate": 1.2605806712808322e-06, + "loss": 0.8461, + "step": 17092 + }, + { + "epoch": 0.8547, + "grad_norm": 4.704334259033203, + "learning_rate": 1.2588846377880526e-06, + "loss": 1.3336, + "step": 17094 + }, + { + "epoch": 0.8548, + "grad_norm": 11.53809928894043, + "learning_rate": 1.257189669373664e-06, + "loss": 1.0972, + "step": 17096 + }, + { + "epoch": 0.8549, + "grad_norm": 2.239990234375, + "learning_rate": 1.2554957662441958e-06, + "loss": 1.1524, + "step": 17098 + }, + { + "epoch": 0.855, + "grad_norm": 10.242255210876465, + "learning_rate": 1.2538029286060428e-06, + "loss": 0.6241, + "step": 17100 + }, + { + "epoch": 0.8551, + "grad_norm": 3.7012038230895996, + "learning_rate": 1.2521111566654732e-06, + "loss": 0.5856, + "step": 17102 + }, + { + "epoch": 0.8552, + "grad_norm": 12.438253402709961, + "learning_rate": 1.2504204506286244e-06, + "loss": 0.6833, + "step": 17104 + }, + { + "epoch": 0.8553, + "grad_norm": 3.1666347980499268, + "learning_rate": 1.248730810701503e-06, + "loss": 0.8037, + "step": 17106 + }, + { + "epoch": 0.8554, + "grad_norm": 6.541536331176758, + "learning_rate": 1.2470422370899838e-06, + "loss": 1.4676, + "step": 17108 + }, + { + "epoch": 0.8555, + "grad_norm": 3.0957515239715576, + "learning_rate": 1.2453547299998226e-06, + "loss": 1.244, + "step": 17110 + }, + { + "epoch": 0.8556, + "grad_norm": 10.395167350769043, + "learning_rate": 1.2436682896366282e-06, + "loss": 1.354, + "step": 17112 + }, + { + "epoch": 0.8557, + "grad_norm": 4.167850971221924, + "learning_rate": 1.2419829162058949e-06, + "loss": 0.9409, + "step": 17114 + }, + { + "epoch": 0.8558, + "grad_norm": 4.4138875007629395, + "learning_rate": 1.2402986099129765e-06, + "loss": 0.8449, + "step": 17116 + }, + { + "epoch": 0.8559, + "grad_norm": 3.570457696914673, + "learning_rate": 1.2386153709630989e-06, + "loss": 0.9011, + "step": 17118 + }, + { + "epoch": 0.856, + "grad_norm": 9.99819278717041, + "learning_rate": 1.2369331995613664e-06, + "loss": 1.6443, + "step": 17120 + }, + { + "epoch": 0.8561, + "grad_norm": 3.49953293800354, + "learning_rate": 1.235252095912738e-06, + "loss": 1.0596, + "step": 17122 + }, + { + "epoch": 0.8562, + "grad_norm": 5.76384162902832, + "learning_rate": 1.233572060222057e-06, + "loss": 1.3704, + "step": 17124 + }, + { + "epoch": 0.8563, + "grad_norm": 6.217260360717773, + "learning_rate": 1.2318930926940297e-06, + "loss": 1.1714, + "step": 17126 + }, + { + "epoch": 0.8564, + "grad_norm": 0.3676046133041382, + "learning_rate": 1.230215193533233e-06, + "loss": 0.6364, + "step": 17128 + }, + { + "epoch": 0.8565, + "grad_norm": 3.913658618927002, + "learning_rate": 1.228538362944115e-06, + "loss": 0.5825, + "step": 17130 + }, + { + "epoch": 0.8566, + "grad_norm": 3.8460161685943604, + "learning_rate": 1.2268626011309858e-06, + "loss": 1.0528, + "step": 17132 + }, + { + "epoch": 0.8567, + "grad_norm": 10.91623306274414, + "learning_rate": 1.22518790829804e-06, + "loss": 1.716, + "step": 17134 + }, + { + "epoch": 0.8568, + "grad_norm": 12.562458992004395, + "learning_rate": 1.223514284649331e-06, + "loss": 0.4013, + "step": 17136 + }, + { + "epoch": 0.8569, + "grad_norm": 0.13579151034355164, + "learning_rate": 1.2218417303887842e-06, + "loss": 0.4125, + "step": 17138 + }, + { + "epoch": 0.857, + "grad_norm": 4.5138444900512695, + "learning_rate": 1.2201702457201948e-06, + "loss": 1.0345, + "step": 17140 + }, + { + "epoch": 0.8571, + "grad_norm": 5.43743896484375, + "learning_rate": 1.2184998308472295e-06, + "loss": 1.1435, + "step": 17142 + }, + { + "epoch": 0.8572, + "grad_norm": 2.200242280960083, + "learning_rate": 1.2168304859734226e-06, + "loss": 0.5787, + "step": 17144 + }, + { + "epoch": 0.8573, + "grad_norm": 6.072834014892578, + "learning_rate": 1.2151622113021789e-06, + "loss": 1.4019, + "step": 17146 + }, + { + "epoch": 0.8574, + "grad_norm": 3.681199312210083, + "learning_rate": 1.2134950070367723e-06, + "loss": 0.8571, + "step": 17148 + }, + { + "epoch": 0.8575, + "grad_norm": 4.164674282073975, + "learning_rate": 1.2118288733803474e-06, + "loss": 1.1473, + "step": 17150 + }, + { + "epoch": 0.8576, + "grad_norm": 12.865419387817383, + "learning_rate": 1.210163810535917e-06, + "loss": 0.7086, + "step": 17152 + }, + { + "epoch": 0.8577, + "grad_norm": 8.334492683410645, + "learning_rate": 1.2084998187063612e-06, + "loss": 1.9008, + "step": 17154 + }, + { + "epoch": 0.8578, + "grad_norm": 3.6395652294158936, + "learning_rate": 1.206836898094439e-06, + "loss": 0.3108, + "step": 17156 + }, + { + "epoch": 0.8579, + "grad_norm": 3.489351511001587, + "learning_rate": 1.2051750489027648e-06, + "loss": 0.4442, + "step": 17158 + }, + { + "epoch": 0.858, + "grad_norm": 2.460897445678711, + "learning_rate": 1.2035142713338366e-06, + "loss": 0.797, + "step": 17160 + }, + { + "epoch": 0.8581, + "grad_norm": 2.6807475090026855, + "learning_rate": 1.2018545655900083e-06, + "loss": 1.2778, + "step": 17162 + }, + { + "epoch": 0.8582, + "grad_norm": 3.6999363899230957, + "learning_rate": 1.2001959318735158e-06, + "loss": 0.8189, + "step": 17164 + }, + { + "epoch": 0.8583, + "grad_norm": 2.631685972213745, + "learning_rate": 1.1985383703864585e-06, + "loss": 0.5303, + "step": 17166 + }, + { + "epoch": 0.8584, + "grad_norm": 3.7858266830444336, + "learning_rate": 1.196881881330798e-06, + "loss": 0.4568, + "step": 17168 + }, + { + "epoch": 0.8585, + "grad_norm": 3.121417999267578, + "learning_rate": 1.19522646490838e-06, + "loss": 0.9398, + "step": 17170 + }, + { + "epoch": 0.8586, + "grad_norm": 3.4182755947113037, + "learning_rate": 1.1935721213209106e-06, + "loss": 0.9531, + "step": 17172 + }, + { + "epoch": 0.8587, + "grad_norm": 5.534213066101074, + "learning_rate": 1.1919188507699641e-06, + "loss": 0.3218, + "step": 17174 + }, + { + "epoch": 0.8588, + "grad_norm": 3.452334403991699, + "learning_rate": 1.1902666534569884e-06, + "loss": 0.3481, + "step": 17176 + }, + { + "epoch": 0.8589, + "grad_norm": 4.577888488769531, + "learning_rate": 1.1886155295832991e-06, + "loss": 0.8369, + "step": 17178 + }, + { + "epoch": 0.859, + "grad_norm": 0.9923873543739319, + "learning_rate": 1.1869654793500784e-06, + "loss": 0.6655, + "step": 17180 + }, + { + "epoch": 0.8591, + "grad_norm": 5.10595703125, + "learning_rate": 1.1853165029583825e-06, + "loss": 1.5102, + "step": 17182 + }, + { + "epoch": 0.8592, + "grad_norm": 6.239665985107422, + "learning_rate": 1.1836686006091313e-06, + "loss": 0.4718, + "step": 17184 + }, + { + "epoch": 0.8593, + "grad_norm": 0.7650820016860962, + "learning_rate": 1.1820217725031192e-06, + "loss": 0.8327, + "step": 17186 + }, + { + "epoch": 0.8594, + "grad_norm": 4.264188289642334, + "learning_rate": 1.1803760188410074e-06, + "loss": 0.2057, + "step": 17188 + }, + { + "epoch": 0.8595, + "grad_norm": 4.400570392608643, + "learning_rate": 1.1787313398233235e-06, + "loss": 0.4985, + "step": 17190 + }, + { + "epoch": 0.8596, + "grad_norm": 4.141981601715088, + "learning_rate": 1.1770877356504684e-06, + "loss": 0.828, + "step": 17192 + }, + { + "epoch": 0.8597, + "grad_norm": 3.1006722450256348, + "learning_rate": 1.1754452065227084e-06, + "loss": 0.2715, + "step": 17194 + }, + { + "epoch": 0.8598, + "grad_norm": 1.3938429355621338, + "learning_rate": 1.1738037526401857e-06, + "loss": 0.3105, + "step": 17196 + }, + { + "epoch": 0.8599, + "grad_norm": 2.5996623039245605, + "learning_rate": 1.1721633742028992e-06, + "loss": 1.7667, + "step": 17198 + }, + { + "epoch": 0.86, + "grad_norm": 2.0018041133880615, + "learning_rate": 1.1705240714107301e-06, + "loss": 0.4952, + "step": 17200 + }, + { + "epoch": 0.8601, + "grad_norm": 3.7295570373535156, + "learning_rate": 1.1688858444634221e-06, + "loss": 1.3089, + "step": 17202 + }, + { + "epoch": 0.8602, + "grad_norm": 3.502525568008423, + "learning_rate": 1.167248693560583e-06, + "loss": 0.6936, + "step": 17204 + }, + { + "epoch": 0.8603, + "grad_norm": 5.417031288146973, + "learning_rate": 1.1656126189017014e-06, + "loss": 0.5757, + "step": 17206 + }, + { + "epoch": 0.8604, + "grad_norm": 4.974056243896484, + "learning_rate": 1.1639776206861197e-06, + "loss": 1.3442, + "step": 17208 + }, + { + "epoch": 0.8605, + "grad_norm": 2.2628984451293945, + "learning_rate": 1.1623436991130654e-06, + "loss": 0.4232, + "step": 17210 + }, + { + "epoch": 0.8606, + "grad_norm": 4.658947467803955, + "learning_rate": 1.1607108543816247e-06, + "loss": 0.2727, + "step": 17212 + }, + { + "epoch": 0.8607, + "grad_norm": 2.481915235519409, + "learning_rate": 1.159079086690753e-06, + "loss": 1.6452, + "step": 17214 + }, + { + "epoch": 0.8608, + "grad_norm": 0.5358085632324219, + "learning_rate": 1.1574483962392768e-06, + "loss": 0.6568, + "step": 17216 + }, + { + "epoch": 0.8609, + "grad_norm": 3.806042432785034, + "learning_rate": 1.1558187832258927e-06, + "loss": 0.5151, + "step": 17218 + }, + { + "epoch": 0.861, + "grad_norm": 7.0235090255737305, + "learning_rate": 1.1541902478491607e-06, + "loss": 1.4172, + "step": 17220 + }, + { + "epoch": 0.8611, + "grad_norm": 6.352231025695801, + "learning_rate": 1.1525627903075165e-06, + "loss": 0.9998, + "step": 17222 + }, + { + "epoch": 0.8612, + "grad_norm": 3.723759651184082, + "learning_rate": 1.1509364107992582e-06, + "loss": 0.7189, + "step": 17224 + }, + { + "epoch": 0.8613, + "grad_norm": 1.8313610553741455, + "learning_rate": 1.1493111095225561e-06, + "loss": 0.826, + "step": 17226 + }, + { + "epoch": 0.8614, + "grad_norm": 4.007940769195557, + "learning_rate": 1.1476868866754488e-06, + "loss": 1.2397, + "step": 17228 + }, + { + "epoch": 0.8615, + "grad_norm": 4.79613733291626, + "learning_rate": 1.1460637424558406e-06, + "loss": 0.9635, + "step": 17230 + }, + { + "epoch": 0.8616, + "grad_norm": 2.0928642749786377, + "learning_rate": 1.1444416770615118e-06, + "loss": 0.1298, + "step": 17232 + }, + { + "epoch": 0.8617, + "grad_norm": 2.9013078212738037, + "learning_rate": 1.1428206906900995e-06, + "loss": 0.7515, + "step": 17234 + }, + { + "epoch": 0.8618, + "grad_norm": 1.8094884157180786, + "learning_rate": 1.1412007835391237e-06, + "loss": 1.2447, + "step": 17236 + }, + { + "epoch": 0.8619, + "grad_norm": 15.746347427368164, + "learning_rate": 1.1395819558059573e-06, + "loss": 1.2789, + "step": 17238 + }, + { + "epoch": 0.862, + "grad_norm": 3.2688281536102295, + "learning_rate": 1.1379642076878528e-06, + "loss": 1.0014, + "step": 17240 + }, + { + "epoch": 0.8621, + "grad_norm": 3.301795244216919, + "learning_rate": 1.1363475393819312e-06, + "loss": 1.0639, + "step": 17242 + }, + { + "epoch": 0.8622, + "grad_norm": 0.31691083312034607, + "learning_rate": 1.1347319510851718e-06, + "loss": 0.7617, + "step": 17244 + }, + { + "epoch": 0.8623, + "grad_norm": 5.54612398147583, + "learning_rate": 1.1331174429944346e-06, + "loss": 0.6586, + "step": 17246 + }, + { + "epoch": 0.8624, + "grad_norm": 5.182070255279541, + "learning_rate": 1.1315040153064416e-06, + "loss": 0.6744, + "step": 17248 + }, + { + "epoch": 0.8625, + "grad_norm": 4.384965419769287, + "learning_rate": 1.129891668217783e-06, + "loss": 1.3019, + "step": 17250 + }, + { + "epoch": 0.8626, + "grad_norm": 2.5013539791107178, + "learning_rate": 1.1282804019249183e-06, + "loss": 0.5998, + "step": 17252 + }, + { + "epoch": 0.8627, + "grad_norm": 7.98594856262207, + "learning_rate": 1.1266702166241772e-06, + "loss": 0.8914, + "step": 17254 + }, + { + "epoch": 0.8628, + "grad_norm": 15.711946487426758, + "learning_rate": 1.1250611125117527e-06, + "loss": 1.3689, + "step": 17256 + }, + { + "epoch": 0.8629, + "grad_norm": 17.687759399414062, + "learning_rate": 1.1234530897837127e-06, + "loss": 1.0206, + "step": 17258 + }, + { + "epoch": 0.863, + "grad_norm": 3.1549158096313477, + "learning_rate": 1.1218461486359878e-06, + "loss": 0.7599, + "step": 17260 + }, + { + "epoch": 0.8631, + "grad_norm": 6.4403557777404785, + "learning_rate": 1.1202402892643783e-06, + "loss": 0.9426, + "step": 17262 + }, + { + "epoch": 0.8632, + "grad_norm": 3.9883086681365967, + "learning_rate": 1.1186355118645552e-06, + "loss": 1.0136, + "step": 17264 + }, + { + "epoch": 0.8633, + "grad_norm": 6.821285724639893, + "learning_rate": 1.1170318166320548e-06, + "loss": 0.7518, + "step": 17266 + }, + { + "epoch": 0.8634, + "grad_norm": 6.434916973114014, + "learning_rate": 1.1154292037622838e-06, + "loss": 0.8881, + "step": 17268 + }, + { + "epoch": 0.8635, + "grad_norm": 4.518795013427734, + "learning_rate": 1.1138276734505105e-06, + "loss": 1.1276, + "step": 17270 + }, + { + "epoch": 0.8636, + "grad_norm": 13.835293769836426, + "learning_rate": 1.1122272258918864e-06, + "loss": 0.9801, + "step": 17272 + }, + { + "epoch": 0.8637, + "grad_norm": 3.5135059356689453, + "learning_rate": 1.1106278612814125e-06, + "loss": 0.9704, + "step": 17274 + }, + { + "epoch": 0.8638, + "grad_norm": 9.417642593383789, + "learning_rate": 1.1090295798139672e-06, + "loss": 0.9034, + "step": 17276 + }, + { + "epoch": 0.8639, + "grad_norm": 5.584457874298096, + "learning_rate": 1.1074323816843025e-06, + "loss": 0.5026, + "step": 17278 + }, + { + "epoch": 0.864, + "grad_norm": 7.001768112182617, + "learning_rate": 1.1058362670870248e-06, + "loss": 2.3557, + "step": 17280 + }, + { + "epoch": 0.8641, + "grad_norm": 12.120532989501953, + "learning_rate": 1.1042412362166221e-06, + "loss": 1.182, + "step": 17282 + }, + { + "epoch": 0.8642, + "grad_norm": 7.693655490875244, + "learning_rate": 1.102647289267438e-06, + "loss": 1.2952, + "step": 17284 + }, + { + "epoch": 0.8643, + "grad_norm": 5.657966613769531, + "learning_rate": 1.1010544264336942e-06, + "loss": 1.4994, + "step": 17286 + }, + { + "epoch": 0.8644, + "grad_norm": 3.8032398223876953, + "learning_rate": 1.0994626479094749e-06, + "loss": 1.2714, + "step": 17288 + }, + { + "epoch": 0.8645, + "grad_norm": 1.758111596107483, + "learning_rate": 1.097871953888735e-06, + "loss": 0.4616, + "step": 17290 + }, + { + "epoch": 0.8646, + "grad_norm": 7.658591270446777, + "learning_rate": 1.096282344565296e-06, + "loss": 1.467, + "step": 17292 + }, + { + "epoch": 0.8647, + "grad_norm": 3.1928462982177734, + "learning_rate": 1.0946938201328416e-06, + "loss": 0.783, + "step": 17294 + }, + { + "epoch": 0.8648, + "grad_norm": 2.718702554702759, + "learning_rate": 1.093106380784934e-06, + "loss": 1.6522, + "step": 17296 + }, + { + "epoch": 0.8649, + "grad_norm": 15.108038902282715, + "learning_rate": 1.0915200267149973e-06, + "loss": 0.6422, + "step": 17298 + }, + { + "epoch": 0.865, + "grad_norm": 8.277823448181152, + "learning_rate": 1.0899347581163222e-06, + "loss": 1.5083, + "step": 17300 + }, + { + "epoch": 0.8651, + "grad_norm": 2.7851040363311768, + "learning_rate": 1.08835057518207e-06, + "loss": 0.5033, + "step": 17302 + }, + { + "epoch": 0.8652, + "grad_norm": 2.644010305404663, + "learning_rate": 1.0867674781052683e-06, + "loss": 1.053, + "step": 17304 + }, + { + "epoch": 0.8653, + "grad_norm": 0.5024089813232422, + "learning_rate": 1.0851854670788108e-06, + "loss": 0.2755, + "step": 17306 + }, + { + "epoch": 0.8654, + "grad_norm": 5.394587993621826, + "learning_rate": 1.0836045422954665e-06, + "loss": 0.6949, + "step": 17308 + }, + { + "epoch": 0.8655, + "grad_norm": 4.761642932891846, + "learning_rate": 1.0820247039478605e-06, + "loss": 1.2359, + "step": 17310 + }, + { + "epoch": 0.8656, + "grad_norm": 2.9443037509918213, + "learning_rate": 1.0804459522284927e-06, + "loss": 1.4493, + "step": 17312 + }, + { + "epoch": 0.8657, + "grad_norm": 3.694610595703125, + "learning_rate": 1.0788682873297307e-06, + "loss": 1.1764, + "step": 17314 + }, + { + "epoch": 0.8658, + "grad_norm": 4.1306610107421875, + "learning_rate": 1.0772917094438052e-06, + "loss": 0.667, + "step": 17316 + }, + { + "epoch": 0.8659, + "grad_norm": 3.7969398498535156, + "learning_rate": 1.0757162187628223e-06, + "loss": 0.702, + "step": 17318 + }, + { + "epoch": 0.866, + "grad_norm": 5.41865348815918, + "learning_rate": 1.0741418154787443e-06, + "loss": 0.8704, + "step": 17320 + }, + { + "epoch": 0.8661, + "grad_norm": 4.091107368469238, + "learning_rate": 1.0725684997834162e-06, + "loss": 0.5687, + "step": 17322 + }, + { + "epoch": 0.8662, + "grad_norm": 8.249683380126953, + "learning_rate": 1.0709962718685318e-06, + "loss": 0.8272, + "step": 17324 + }, + { + "epoch": 0.8663, + "grad_norm": 12.450394630432129, + "learning_rate": 1.0694251319256688e-06, + "loss": 0.8881, + "step": 17326 + }, + { + "epoch": 0.8664, + "grad_norm": 3.5684003829956055, + "learning_rate": 1.0678550801462662e-06, + "loss": 0.6365, + "step": 17328 + }, + { + "epoch": 0.8665, + "grad_norm": 7.734438896179199, + "learning_rate": 1.0662861167216243e-06, + "loss": 0.6941, + "step": 17330 + }, + { + "epoch": 0.8666, + "grad_norm": 4.095773696899414, + "learning_rate": 1.0647182418429224e-06, + "loss": 0.4897, + "step": 17332 + }, + { + "epoch": 0.8667, + "grad_norm": 0.6646037697792053, + "learning_rate": 1.063151455701199e-06, + "loss": 0.59, + "step": 17334 + }, + { + "epoch": 0.8668, + "grad_norm": 3.0860190391540527, + "learning_rate": 1.0615857584873624e-06, + "loss": 0.7061, + "step": 17336 + }, + { + "epoch": 0.8669, + "grad_norm": 2.9269793033599854, + "learning_rate": 1.0600211503921886e-06, + "loss": 1.02, + "step": 17338 + }, + { + "epoch": 0.867, + "grad_norm": 2.66336727142334, + "learning_rate": 1.058457631606319e-06, + "loss": 0.4198, + "step": 17340 + }, + { + "epoch": 0.8671, + "grad_norm": 2.5272037982940674, + "learning_rate": 1.056895202320264e-06, + "loss": 0.6023, + "step": 17342 + }, + { + "epoch": 0.8672, + "grad_norm": 4.3669257164001465, + "learning_rate": 1.0553338627244026e-06, + "loss": 0.4306, + "step": 17344 + }, + { + "epoch": 0.8673, + "grad_norm": 4.097462177276611, + "learning_rate": 1.0537736130089771e-06, + "loss": 0.6334, + "step": 17346 + }, + { + "epoch": 0.8674, + "grad_norm": 4.821677207946777, + "learning_rate": 1.0522144533641e-06, + "loss": 1.4863, + "step": 17348 + }, + { + "epoch": 0.8675, + "grad_norm": 6.516008377075195, + "learning_rate": 1.0506563839797501e-06, + "loss": 0.9626, + "step": 17350 + }, + { + "epoch": 0.8676, + "grad_norm": 13.318467140197754, + "learning_rate": 1.0490994050457748e-06, + "loss": 0.6508, + "step": 17352 + }, + { + "epoch": 0.8677, + "grad_norm": 2.0614748001098633, + "learning_rate": 1.0475435167518843e-06, + "loss": 1.2885, + "step": 17354 + }, + { + "epoch": 0.8678, + "grad_norm": 10.649493217468262, + "learning_rate": 1.0459887192876595e-06, + "loss": 1.016, + "step": 17356 + }, + { + "epoch": 0.8679, + "grad_norm": 3.873880386352539, + "learning_rate": 1.0444350128425528e-06, + "loss": 1.1096, + "step": 17358 + }, + { + "epoch": 0.868, + "grad_norm": 3.752166271209717, + "learning_rate": 1.042882397605871e-06, + "loss": 1.1801, + "step": 17360 + }, + { + "epoch": 0.8681, + "grad_norm": 7.5571980476379395, + "learning_rate": 1.0413308737668005e-06, + "loss": 1.301, + "step": 17362 + }, + { + "epoch": 0.8682, + "grad_norm": 3.1491737365722656, + "learning_rate": 1.039780441514391e-06, + "loss": 1.1657, + "step": 17364 + }, + { + "epoch": 0.8683, + "grad_norm": 1.0629568099975586, + "learning_rate": 1.0382311010375512e-06, + "loss": 0.9163, + "step": 17366 + }, + { + "epoch": 0.8684, + "grad_norm": 7.391267776489258, + "learning_rate": 1.0366828525250728e-06, + "loss": 0.4386, + "step": 17368 + }, + { + "epoch": 0.8685, + "grad_norm": 3.06268048286438, + "learning_rate": 1.0351356961655945e-06, + "loss": 1.1627, + "step": 17370 + }, + { + "epoch": 0.8686, + "grad_norm": 6.906704425811768, + "learning_rate": 1.0335896321476413e-06, + "loss": 1.0401, + "step": 17372 + }, + { + "epoch": 0.8687, + "grad_norm": 2.0370447635650635, + "learning_rate": 1.0320446606595935e-06, + "loss": 0.3345, + "step": 17374 + }, + { + "epoch": 0.8688, + "grad_norm": 3.9529759883880615, + "learning_rate": 1.0305007818897006e-06, + "loss": 0.8029, + "step": 17376 + }, + { + "epoch": 0.8689, + "grad_norm": 2.316473960876465, + "learning_rate": 1.0289579960260809e-06, + "loss": 1.3258, + "step": 17378 + }, + { + "epoch": 0.869, + "grad_norm": 22.280548095703125, + "learning_rate": 1.0274163032567165e-06, + "loss": 0.8029, + "step": 17380 + }, + { + "epoch": 0.8691, + "grad_norm": 5.119341850280762, + "learning_rate": 1.025875703769459e-06, + "loss": 0.9233, + "step": 17382 + }, + { + "epoch": 0.8692, + "grad_norm": 6.675411224365234, + "learning_rate": 1.024336197752025e-06, + "loss": 0.7879, + "step": 17384 + }, + { + "epoch": 0.8693, + "grad_norm": 3.7914886474609375, + "learning_rate": 1.0227977853920002e-06, + "loss": 1.1807, + "step": 17386 + }, + { + "epoch": 0.8694, + "grad_norm": 9.850646018981934, + "learning_rate": 1.0212604668768343e-06, + "loss": 1.1321, + "step": 17388 + }, + { + "epoch": 0.8695, + "grad_norm": 0.07395721226930618, + "learning_rate": 1.0197242423938447e-06, + "loss": 0.264, + "step": 17390 + }, + { + "epoch": 0.8696, + "grad_norm": 7.440797328948975, + "learning_rate": 1.0181891121302145e-06, + "loss": 0.8905, + "step": 17392 + }, + { + "epoch": 0.8697, + "grad_norm": 4.639575004577637, + "learning_rate": 1.0166550762729998e-06, + "loss": 1.4512, + "step": 17394 + }, + { + "epoch": 0.8698, + "grad_norm": 2.3841235637664795, + "learning_rate": 1.0151221350091134e-06, + "loss": 0.5957, + "step": 17396 + }, + { + "epoch": 0.8699, + "grad_norm": 4.484880447387695, + "learning_rate": 1.01359028852534e-06, + "loss": 0.7579, + "step": 17398 + }, + { + "epoch": 0.87, + "grad_norm": 2.4931793212890625, + "learning_rate": 1.012059537008332e-06, + "loss": 1.0685, + "step": 17400 + }, + { + "epoch": 0.8701, + "grad_norm": 3.874258518218994, + "learning_rate": 1.010529880644603e-06, + "loss": 0.6456, + "step": 17402 + }, + { + "epoch": 0.8702, + "grad_norm": 4.222980499267578, + "learning_rate": 1.009001319620545e-06, + "loss": 0.6152, + "step": 17404 + }, + { + "epoch": 0.8703, + "grad_norm": 3.8094422817230225, + "learning_rate": 1.0074738541223993e-06, + "loss": 1.4144, + "step": 17406 + }, + { + "epoch": 0.8704, + "grad_norm": 6.920052528381348, + "learning_rate": 1.0059474843362893e-06, + "loss": 1.2369, + "step": 17408 + }, + { + "epoch": 0.8705, + "grad_norm": 3.7903835773468018, + "learning_rate": 1.004422210448197e-06, + "loss": 1.0214, + "step": 17410 + }, + { + "epoch": 0.8706, + "grad_norm": 11.35103988647461, + "learning_rate": 1.0028980326439708e-06, + "loss": 0.5295, + "step": 17412 + }, + { + "epoch": 0.8707, + "grad_norm": 5.578859806060791, + "learning_rate": 1.0013749511093307e-06, + "loss": 1.0797, + "step": 17414 + }, + { + "epoch": 0.8708, + "grad_norm": 3.295665740966797, + "learning_rate": 9.99852966029854e-07, + "loss": 1.2819, + "step": 17416 + }, + { + "epoch": 0.8709, + "grad_norm": 4.423251628875732, + "learning_rate": 9.983320775909933e-07, + "loss": 1.346, + "step": 17418 + }, + { + "epoch": 0.871, + "grad_norm": 2.2827911376953125, + "learning_rate": 9.968122859780648e-07, + "loss": 0.6359, + "step": 17420 + }, + { + "epoch": 0.8711, + "grad_norm": 6.76934289932251, + "learning_rate": 9.952935913762507e-07, + "loss": 0.7804, + "step": 17422 + }, + { + "epoch": 0.8712, + "grad_norm": 2.799734115600586, + "learning_rate": 9.93775993970597e-07, + "loss": 0.3381, + "step": 17424 + }, + { + "epoch": 0.8713, + "grad_norm": 2.904043197631836, + "learning_rate": 9.922594939460195e-07, + "loss": 0.7524, + "step": 17426 + }, + { + "epoch": 0.8714, + "grad_norm": 2.540499687194824, + "learning_rate": 9.907440914873e-07, + "loss": 0.2265, + "step": 17428 + }, + { + "epoch": 0.8715, + "grad_norm": 4.733099937438965, + "learning_rate": 9.892297867790846e-07, + "loss": 0.615, + "step": 17430 + }, + { + "epoch": 0.8716, + "grad_norm": 8.095733642578125, + "learning_rate": 9.877165800058874e-07, + "loss": 0.494, + "step": 17432 + }, + { + "epoch": 0.8717, + "grad_norm": 14.934964179992676, + "learning_rate": 9.862044713520879e-07, + "loss": 0.5596, + "step": 17434 + }, + { + "epoch": 0.8718, + "grad_norm": 2.042257070541382, + "learning_rate": 9.84693461001932e-07, + "loss": 0.5482, + "step": 17436 + }, + { + "epoch": 0.8719, + "grad_norm": 1.9539480209350586, + "learning_rate": 9.83183549139529e-07, + "loss": 1.114, + "step": 17438 + }, + { + "epoch": 0.872, + "grad_norm": 2.8123252391815186, + "learning_rate": 9.816747359488632e-07, + "loss": 0.3861, + "step": 17440 + }, + { + "epoch": 0.8721, + "grad_norm": 8.796839714050293, + "learning_rate": 9.801670216137726e-07, + "loss": 1.0963, + "step": 17442 + }, + { + "epoch": 0.8722, + "grad_norm": 5.378193378448486, + "learning_rate": 9.786604063179728e-07, + "loss": 0.494, + "step": 17444 + }, + { + "epoch": 0.8723, + "grad_norm": 12.722362518310547, + "learning_rate": 9.771548902450356e-07, + "loss": 1.4425, + "step": 17446 + }, + { + "epoch": 0.8724, + "grad_norm": 5.78135871887207, + "learning_rate": 9.756504735784067e-07, + "loss": 1.1561, + "step": 17448 + }, + { + "epoch": 0.8725, + "grad_norm": 6.957970142364502, + "learning_rate": 9.74147156501396e-07, + "loss": 0.8827, + "step": 17450 + }, + { + "epoch": 0.8726, + "grad_norm": 3.89913010597229, + "learning_rate": 9.726449391971716e-07, + "loss": 1.2702, + "step": 17452 + }, + { + "epoch": 0.8727, + "grad_norm": 6.376608848571777, + "learning_rate": 9.711438218487835e-07, + "loss": 0.2356, + "step": 17454 + }, + { + "epoch": 0.8728, + "grad_norm": 0.7228912115097046, + "learning_rate": 9.696438046391288e-07, + "loss": 0.5932, + "step": 17456 + }, + { + "epoch": 0.8729, + "grad_norm": 8.629925727844238, + "learning_rate": 9.681448877509857e-07, + "loss": 1.0473, + "step": 17458 + }, + { + "epoch": 0.873, + "grad_norm": 4.911823749542236, + "learning_rate": 9.666470713669918e-07, + "loss": 0.1276, + "step": 17460 + }, + { + "epoch": 0.8731, + "grad_norm": 3.0240395069122314, + "learning_rate": 9.651503556696519e-07, + "loss": 0.6213, + "step": 17462 + }, + { + "epoch": 0.8732, + "grad_norm": 5.8750505447387695, + "learning_rate": 9.636547408413355e-07, + "loss": 1.6717, + "step": 17464 + }, + { + "epoch": 0.8733, + "grad_norm": 8.533980369567871, + "learning_rate": 9.621602270642783e-07, + "loss": 0.7985, + "step": 17466 + }, + { + "epoch": 0.8734, + "grad_norm": 5.092528343200684, + "learning_rate": 9.606668145205833e-07, + "loss": 1.635, + "step": 17468 + }, + { + "epoch": 0.8735, + "grad_norm": 2.399549961090088, + "learning_rate": 9.591745033922173e-07, + "loss": 0.3168, + "step": 17470 + }, + { + "epoch": 0.8736, + "grad_norm": 4.363719463348389, + "learning_rate": 9.576832938610137e-07, + "loss": 0.623, + "step": 17472 + }, + { + "epoch": 0.8737, + "grad_norm": 4.223438739776611, + "learning_rate": 9.561931861086738e-07, + "loss": 0.6227, + "step": 17474 + }, + { + "epoch": 0.8738, + "grad_norm": 22.957252502441406, + "learning_rate": 9.547041803167601e-07, + "loss": 0.9856, + "step": 17476 + }, + { + "epoch": 0.8739, + "grad_norm": 5.5035858154296875, + "learning_rate": 9.532162766667042e-07, + "loss": 1.3946, + "step": 17478 + }, + { + "epoch": 0.874, + "grad_norm": 9.044662475585938, + "learning_rate": 9.517294753398066e-07, + "loss": 1.0591, + "step": 17480 + }, + { + "epoch": 0.8741, + "grad_norm": 4.810704231262207, + "learning_rate": 9.502437765172212e-07, + "loss": 1.5699, + "step": 17482 + }, + { + "epoch": 0.8742, + "grad_norm": 8.636917114257812, + "learning_rate": 9.487591803799856e-07, + "loss": 0.7978, + "step": 17484 + }, + { + "epoch": 0.8743, + "grad_norm": 2.0501768589019775, + "learning_rate": 9.472756871089861e-07, + "loss": 0.744, + "step": 17486 + }, + { + "epoch": 0.8744, + "grad_norm": 2.887024164199829, + "learning_rate": 9.457932968849826e-07, + "loss": 0.755, + "step": 17488 + }, + { + "epoch": 0.8745, + "grad_norm": 4.3405232429504395, + "learning_rate": 9.44312009888606e-07, + "loss": 0.9006, + "step": 17490 + }, + { + "epoch": 0.8746, + "grad_norm": 3.870392084121704, + "learning_rate": 9.428318263003378e-07, + "loss": 0.9046, + "step": 17492 + }, + { + "epoch": 0.8747, + "grad_norm": 4.870553016662598, + "learning_rate": 9.413527463005401e-07, + "loss": 0.7079, + "step": 17494 + }, + { + "epoch": 0.8748, + "grad_norm": 5.356075286865234, + "learning_rate": 9.398747700694322e-07, + "loss": 1.4411, + "step": 17496 + }, + { + "epoch": 0.8749, + "grad_norm": 2.8242342472076416, + "learning_rate": 9.383978977871022e-07, + "loss": 1.3298, + "step": 17498 + }, + { + "epoch": 0.875, + "grad_norm": 12.23960018157959, + "learning_rate": 9.369221296335007e-07, + "loss": 0.8971, + "step": 17500 + }, + { + "epoch": 0.8751, + "grad_norm": 6.192659854888916, + "learning_rate": 9.354474657884472e-07, + "loss": 0.5511, + "step": 17502 + }, + { + "epoch": 0.8752, + "grad_norm": 4.174504280090332, + "learning_rate": 9.339739064316233e-07, + "loss": 0.4358, + "step": 17504 + }, + { + "epoch": 0.8753, + "grad_norm": 5.247692108154297, + "learning_rate": 9.32501451742579e-07, + "loss": 1.0143, + "step": 17506 + }, + { + "epoch": 0.8754, + "grad_norm": 3.3510079383850098, + "learning_rate": 9.310301019007284e-07, + "loss": 1.147, + "step": 17508 + }, + { + "epoch": 0.8755, + "grad_norm": 3.515583038330078, + "learning_rate": 9.295598570853514e-07, + "loss": 0.435, + "step": 17510 + }, + { + "epoch": 0.8756, + "grad_norm": 4.580440044403076, + "learning_rate": 9.280907174755916e-07, + "loss": 0.7212, + "step": 17512 + }, + { + "epoch": 0.8757, + "grad_norm": 4.140383720397949, + "learning_rate": 9.266226832504599e-07, + "loss": 0.7044, + "step": 17514 + }, + { + "epoch": 0.8758, + "grad_norm": 4.840308666229248, + "learning_rate": 9.251557545888312e-07, + "loss": 0.7406, + "step": 17516 + }, + { + "epoch": 0.8759, + "grad_norm": 6.468361854553223, + "learning_rate": 9.236899316694459e-07, + "loss": 0.7416, + "step": 17518 + }, + { + "epoch": 0.876, + "grad_norm": 3.6995065212249756, + "learning_rate": 9.222252146709143e-07, + "loss": 1.4887, + "step": 17520 + }, + { + "epoch": 0.8761, + "grad_norm": 12.75653076171875, + "learning_rate": 9.207616037717027e-07, + "loss": 0.7486, + "step": 17522 + }, + { + "epoch": 0.8762, + "grad_norm": 8.073079109191895, + "learning_rate": 9.192990991501483e-07, + "loss": 1.2133, + "step": 17524 + }, + { + "epoch": 0.8763, + "grad_norm": 1.2892605066299438, + "learning_rate": 9.178377009844563e-07, + "loss": 0.3464, + "step": 17526 + }, + { + "epoch": 0.8764, + "grad_norm": 8.31390380859375, + "learning_rate": 9.16377409452689e-07, + "loss": 0.6901, + "step": 17528 + }, + { + "epoch": 0.8765, + "grad_norm": 22.594636917114258, + "learning_rate": 9.149182247327837e-07, + "loss": 0.8301, + "step": 17530 + }, + { + "epoch": 0.8766, + "grad_norm": 3.7135329246520996, + "learning_rate": 9.134601470025306e-07, + "loss": 0.9991, + "step": 17532 + }, + { + "epoch": 0.8767, + "grad_norm": 2.1513848304748535, + "learning_rate": 9.120031764395987e-07, + "loss": 0.4139, + "step": 17534 + }, + { + "epoch": 0.8768, + "grad_norm": 2.1831541061401367, + "learning_rate": 9.105473132215126e-07, + "loss": 0.5952, + "step": 17536 + }, + { + "epoch": 0.8769, + "grad_norm": 3.421430826187134, + "learning_rate": 9.090925575256659e-07, + "loss": 0.5456, + "step": 17538 + }, + { + "epoch": 0.877, + "grad_norm": 6.456420421600342, + "learning_rate": 9.076389095293148e-07, + "loss": 0.69, + "step": 17540 + }, + { + "epoch": 0.8771, + "grad_norm": 2.4729442596435547, + "learning_rate": 9.061863694095829e-07, + "loss": 0.9544, + "step": 17542 + }, + { + "epoch": 0.8772, + "grad_norm": 4.1095499992370605, + "learning_rate": 9.047349373434566e-07, + "loss": 0.745, + "step": 17544 + }, + { + "epoch": 0.8773, + "grad_norm": 10.505170822143555, + "learning_rate": 9.0328461350779e-07, + "loss": 1.0296, + "step": 17546 + }, + { + "epoch": 0.8774, + "grad_norm": 1.979911208152771, + "learning_rate": 9.018353980792993e-07, + "loss": 0.703, + "step": 17548 + }, + { + "epoch": 0.8775, + "grad_norm": 2.696608066558838, + "learning_rate": 9.00387291234569e-07, + "loss": 0.4783, + "step": 17550 + }, + { + "epoch": 0.8776, + "grad_norm": 6.084646224975586, + "learning_rate": 8.989402931500434e-07, + "loss": 1.1225, + "step": 17552 + }, + { + "epoch": 0.8777, + "grad_norm": 4.481259346008301, + "learning_rate": 8.974944040020362e-07, + "loss": 0.9224, + "step": 17554 + }, + { + "epoch": 0.8778, + "grad_norm": 4.776264667510986, + "learning_rate": 8.960496239667282e-07, + "loss": 1.1851, + "step": 17556 + }, + { + "epoch": 0.8779, + "grad_norm": 2.471768856048584, + "learning_rate": 8.946059532201568e-07, + "loss": 0.747, + "step": 17558 + }, + { + "epoch": 0.878, + "grad_norm": 4.540163040161133, + "learning_rate": 8.931633919382299e-07, + "loss": 0.7299, + "step": 17560 + }, + { + "epoch": 0.8781, + "grad_norm": 4.992328643798828, + "learning_rate": 8.917219402967203e-07, + "loss": 1.3119, + "step": 17562 + }, + { + "epoch": 0.8782, + "grad_norm": 15.40233039855957, + "learning_rate": 8.902815984712621e-07, + "loss": 2.2949, + "step": 17564 + }, + { + "epoch": 0.8783, + "grad_norm": 3.449470043182373, + "learning_rate": 8.888423666373614e-07, + "loss": 0.7514, + "step": 17566 + }, + { + "epoch": 0.8784, + "grad_norm": 3.655613660812378, + "learning_rate": 8.874042449703779e-07, + "loss": 1.3237, + "step": 17568 + }, + { + "epoch": 0.8785, + "grad_norm": 2.660217046737671, + "learning_rate": 8.859672336455471e-07, + "loss": 0.339, + "step": 17570 + }, + { + "epoch": 0.8786, + "grad_norm": 3.1179752349853516, + "learning_rate": 8.845313328379635e-07, + "loss": 0.879, + "step": 17572 + }, + { + "epoch": 0.8787, + "grad_norm": 6.312758445739746, + "learning_rate": 8.830965427225868e-07, + "loss": 0.4906, + "step": 17574 + }, + { + "epoch": 0.8788, + "grad_norm": 8.183249473571777, + "learning_rate": 8.816628634742441e-07, + "loss": 0.5995, + "step": 17576 + }, + { + "epoch": 0.8789, + "grad_norm": 4.548605442047119, + "learning_rate": 8.80230295267619e-07, + "loss": 0.0873, + "step": 17578 + }, + { + "epoch": 0.879, + "grad_norm": 2.695384979248047, + "learning_rate": 8.787988382772705e-07, + "loss": 1.4057, + "step": 17580 + }, + { + "epoch": 0.8791, + "grad_norm": 10.511476516723633, + "learning_rate": 8.77368492677616e-07, + "loss": 1.529, + "step": 17582 + }, + { + "epoch": 0.8792, + "grad_norm": 7.355175018310547, + "learning_rate": 8.759392586429394e-07, + "loss": 0.7748, + "step": 17584 + }, + { + "epoch": 0.8793, + "grad_norm": 5.911981582641602, + "learning_rate": 8.745111363473869e-07, + "loss": 1.2, + "step": 17586 + }, + { + "epoch": 0.8794, + "grad_norm": 8.273785591125488, + "learning_rate": 8.730841259649725e-07, + "loss": 1.2135, + "step": 17588 + }, + { + "epoch": 0.8795, + "grad_norm": 6.148337364196777, + "learning_rate": 8.716582276695729e-07, + "loss": 0.8373, + "step": 17590 + }, + { + "epoch": 0.8796, + "grad_norm": 2.688567638397217, + "learning_rate": 8.702334416349279e-07, + "loss": 0.7537, + "step": 17592 + }, + { + "epoch": 0.8797, + "grad_norm": 6.034850597381592, + "learning_rate": 8.688097680346453e-07, + "loss": 1.1461, + "step": 17594 + }, + { + "epoch": 0.8798, + "grad_norm": 10.497350692749023, + "learning_rate": 8.67387207042194e-07, + "loss": 1.0528, + "step": 17596 + }, + { + "epoch": 0.8799, + "grad_norm": 3.7667014598846436, + "learning_rate": 8.6596575883091e-07, + "loss": 1.1976, + "step": 17598 + }, + { + "epoch": 0.88, + "grad_norm": 11.326070785522461, + "learning_rate": 8.645454235739903e-07, + "loss": 1.0536, + "step": 17600 + }, + { + "epoch": 0.8801, + "grad_norm": 5.7868332862854, + "learning_rate": 8.63126201444503e-07, + "loss": 1.1993, + "step": 17602 + }, + { + "epoch": 0.8802, + "grad_norm": 1.768622636795044, + "learning_rate": 8.617080926153698e-07, + "loss": 0.4717, + "step": 17604 + }, + { + "epoch": 0.8803, + "grad_norm": 2.571065664291382, + "learning_rate": 8.602910972593892e-07, + "loss": 0.2745, + "step": 17606 + }, + { + "epoch": 0.8804, + "grad_norm": 10.400043487548828, + "learning_rate": 8.58875215549212e-07, + "loss": 1.5753, + "step": 17608 + }, + { + "epoch": 0.8805, + "grad_norm": 10.813019752502441, + "learning_rate": 8.574604476573623e-07, + "loss": 0.7961, + "step": 17610 + }, + { + "epoch": 0.8806, + "grad_norm": 5.082242965698242, + "learning_rate": 8.560467937562278e-07, + "loss": 2.0563, + "step": 17612 + }, + { + "epoch": 0.8807, + "grad_norm": 5.074003219604492, + "learning_rate": 8.546342540180508e-07, + "loss": 0.5807, + "step": 17614 + }, + { + "epoch": 0.8808, + "grad_norm": 4.5447163581848145, + "learning_rate": 8.532228286149502e-07, + "loss": 1.1574, + "step": 17616 + }, + { + "epoch": 0.8809, + "grad_norm": 8.345322608947754, + "learning_rate": 8.518125177189041e-07, + "loss": 1.8699, + "step": 17618 + }, + { + "epoch": 0.881, + "grad_norm": 3.1258432865142822, + "learning_rate": 8.504033215017527e-07, + "loss": 0.3865, + "step": 17620 + }, + { + "epoch": 0.8811, + "grad_norm": 2.6427664756774902, + "learning_rate": 8.48995240135202e-07, + "loss": 0.5344, + "step": 17622 + }, + { + "epoch": 0.8812, + "grad_norm": 2.4300196170806885, + "learning_rate": 8.475882737908248e-07, + "loss": 0.7886, + "step": 17624 + }, + { + "epoch": 0.8813, + "grad_norm": 7.58906888961792, + "learning_rate": 8.461824226400539e-07, + "loss": 0.3842, + "step": 17626 + }, + { + "epoch": 0.8814, + "grad_norm": 4.542149066925049, + "learning_rate": 8.447776868541879e-07, + "loss": 0.6576, + "step": 17628 + }, + { + "epoch": 0.8815, + "grad_norm": 9.450010299682617, + "learning_rate": 8.433740666043899e-07, + "loss": 1.7658, + "step": 17630 + }, + { + "epoch": 0.8816, + "grad_norm": 4.265382766723633, + "learning_rate": 8.419715620616875e-07, + "loss": 1.0541, + "step": 17632 + }, + { + "epoch": 0.8817, + "grad_norm": 4.40294075012207, + "learning_rate": 8.405701733969706e-07, + "loss": 0.3826, + "step": 17634 + }, + { + "epoch": 0.8818, + "grad_norm": 2.281006336212158, + "learning_rate": 8.39169900780995e-07, + "loss": 0.2481, + "step": 17636 + }, + { + "epoch": 0.8819, + "grad_norm": 6.83705997467041, + "learning_rate": 8.377707443843786e-07, + "loss": 0.3397, + "step": 17638 + }, + { + "epoch": 0.882, + "grad_norm": 3.1348023414611816, + "learning_rate": 8.363727043776037e-07, + "loss": 0.7741, + "step": 17640 + }, + { + "epoch": 0.8821, + "grad_norm": 3.4301021099090576, + "learning_rate": 8.349757809310211e-07, + "loss": 0.7965, + "step": 17642 + }, + { + "epoch": 0.8822, + "grad_norm": 3.9100329875946045, + "learning_rate": 8.335799742148387e-07, + "loss": 0.7487, + "step": 17644 + }, + { + "epoch": 0.8823, + "grad_norm": 11.79671859741211, + "learning_rate": 8.321852843991296e-07, + "loss": 0.7062, + "step": 17646 + }, + { + "epoch": 0.8824, + "grad_norm": 3.258425712585449, + "learning_rate": 8.307917116538378e-07, + "loss": 0.5842, + "step": 17648 + }, + { + "epoch": 0.8825, + "grad_norm": 5.0266194343566895, + "learning_rate": 8.293992561487596e-07, + "loss": 1.3213, + "step": 17650 + }, + { + "epoch": 0.8826, + "grad_norm": 2.342965841293335, + "learning_rate": 8.280079180535672e-07, + "loss": 0.8982, + "step": 17652 + }, + { + "epoch": 0.8827, + "grad_norm": 2.753441333770752, + "learning_rate": 8.26617697537786e-07, + "loss": 1.1803, + "step": 17654 + }, + { + "epoch": 0.8828, + "grad_norm": 0.4017708897590637, + "learning_rate": 8.252285947708139e-07, + "loss": 0.6375, + "step": 17656 + }, + { + "epoch": 0.8829, + "grad_norm": 1.2296942472457886, + "learning_rate": 8.238406099219076e-07, + "loss": 0.8077, + "step": 17658 + }, + { + "epoch": 0.883, + "grad_norm": 0.6590880155563354, + "learning_rate": 8.224537431601886e-07, + "loss": 0.4527, + "step": 17660 + }, + { + "epoch": 0.8831, + "grad_norm": 17.032752990722656, + "learning_rate": 8.21067994654644e-07, + "loss": 1.2028, + "step": 17662 + }, + { + "epoch": 0.8832, + "grad_norm": 6.386171817779541, + "learning_rate": 8.196833645741187e-07, + "loss": 0.4575, + "step": 17664 + }, + { + "epoch": 0.8833, + "grad_norm": 2.873152494430542, + "learning_rate": 8.182998530873298e-07, + "loss": 0.8887, + "step": 17666 + }, + { + "epoch": 0.8834, + "grad_norm": 9.037581443786621, + "learning_rate": 8.169174603628538e-07, + "loss": 1.473, + "step": 17668 + }, + { + "epoch": 0.8835, + "grad_norm": 2.197956085205078, + "learning_rate": 8.155361865691291e-07, + "loss": 0.5211, + "step": 17670 + }, + { + "epoch": 0.8836, + "grad_norm": 2.9057412147521973, + "learning_rate": 8.141560318744601e-07, + "loss": 1.7334, + "step": 17672 + }, + { + "epoch": 0.8837, + "grad_norm": 2.757005214691162, + "learning_rate": 8.127769964470156e-07, + "loss": 0.8087, + "step": 17674 + }, + { + "epoch": 0.8838, + "grad_norm": 4.56389856338501, + "learning_rate": 8.113990804548244e-07, + "loss": 0.6605, + "step": 17676 + }, + { + "epoch": 0.8839, + "grad_norm": 6.934174060821533, + "learning_rate": 8.100222840657879e-07, + "loss": 0.7424, + "step": 17678 + }, + { + "epoch": 0.884, + "grad_norm": 2.1374449729919434, + "learning_rate": 8.086466074476562e-07, + "loss": 0.2943, + "step": 17680 + }, + { + "epoch": 0.8841, + "grad_norm": 7.8167948722839355, + "learning_rate": 8.072720507680565e-07, + "loss": 1.0372, + "step": 17682 + }, + { + "epoch": 0.8842, + "grad_norm": 0.9865421056747437, + "learning_rate": 8.058986141944724e-07, + "loss": 0.5497, + "step": 17684 + }, + { + "epoch": 0.8843, + "grad_norm": 5.171399116516113, + "learning_rate": 8.045262978942514e-07, + "loss": 1.3044, + "step": 17686 + }, + { + "epoch": 0.8844, + "grad_norm": 2.1134538650512695, + "learning_rate": 8.031551020346129e-07, + "loss": 0.5229, + "step": 17688 + }, + { + "epoch": 0.8845, + "grad_norm": 6.388742446899414, + "learning_rate": 8.017850267826233e-07, + "loss": 0.8888, + "step": 17690 + }, + { + "epoch": 0.8846, + "grad_norm": 4.7239227294921875, + "learning_rate": 8.004160723052312e-07, + "loss": 0.971, + "step": 17692 + }, + { + "epoch": 0.8847, + "grad_norm": 4.39414119720459, + "learning_rate": 7.990482387692311e-07, + "loss": 1.0867, + "step": 17694 + }, + { + "epoch": 0.8848, + "grad_norm": 7.781190872192383, + "learning_rate": 7.976815263412963e-07, + "loss": 0.5084, + "step": 17696 + }, + { + "epoch": 0.8849, + "grad_norm": 2.168412446975708, + "learning_rate": 7.963159351879557e-07, + "loss": 1.1494, + "step": 17698 + }, + { + "epoch": 0.885, + "grad_norm": 3.8697142601013184, + "learning_rate": 7.949514654755963e-07, + "loss": 0.8732, + "step": 17700 + }, + { + "epoch": 0.8851, + "grad_norm": 11.277085304260254, + "learning_rate": 7.935881173704818e-07, + "loss": 1.28, + "step": 17702 + }, + { + "epoch": 0.8852, + "grad_norm": 0.25437411665916443, + "learning_rate": 7.922258910387282e-07, + "loss": 0.5359, + "step": 17704 + }, + { + "epoch": 0.8853, + "grad_norm": 5.73805046081543, + "learning_rate": 7.908647866463204e-07, + "loss": 0.9706, + "step": 17706 + }, + { + "epoch": 0.8854, + "grad_norm": 8.049269676208496, + "learning_rate": 7.895048043591036e-07, + "loss": 1.922, + "step": 17708 + }, + { + "epoch": 0.8855, + "grad_norm": 3.8584749698638916, + "learning_rate": 7.881459443427885e-07, + "loss": 1.1682, + "step": 17710 + }, + { + "epoch": 0.8856, + "grad_norm": 3.7673542499542236, + "learning_rate": 7.867882067629473e-07, + "loss": 0.4063, + "step": 17712 + }, + { + "epoch": 0.8857, + "grad_norm": 4.076463222503662, + "learning_rate": 7.854315917850163e-07, + "loss": 0.9165, + "step": 17714 + }, + { + "epoch": 0.8858, + "grad_norm": 4.738184452056885, + "learning_rate": 7.840760995742946e-07, + "loss": 1.351, + "step": 17716 + }, + { + "epoch": 0.8859, + "grad_norm": 6.718052864074707, + "learning_rate": 7.827217302959466e-07, + "loss": 0.4123, + "step": 17718 + }, + { + "epoch": 0.886, + "grad_norm": 8.491876602172852, + "learning_rate": 7.81368484114996e-07, + "loss": 1.5824, + "step": 17720 + }, + { + "epoch": 0.8861, + "grad_norm": 5.534744739532471, + "learning_rate": 7.800163611963319e-07, + "loss": 1.3033, + "step": 17722 + }, + { + "epoch": 0.8862, + "grad_norm": 9.486635208129883, + "learning_rate": 7.78665361704708e-07, + "loss": 0.9342, + "step": 17724 + }, + { + "epoch": 0.8863, + "grad_norm": 6.658869743347168, + "learning_rate": 7.77315485804736e-07, + "loss": 1.0265, + "step": 17726 + }, + { + "epoch": 0.8864, + "grad_norm": 15.070242881774902, + "learning_rate": 7.759667336609011e-07, + "loss": 1.5129, + "step": 17728 + }, + { + "epoch": 0.8865, + "grad_norm": 4.485331058502197, + "learning_rate": 7.746191054375363e-07, + "loss": 0.9327, + "step": 17730 + }, + { + "epoch": 0.8866, + "grad_norm": 4.083524703979492, + "learning_rate": 7.732726012988512e-07, + "loss": 1.5474, + "step": 17732 + }, + { + "epoch": 0.8867, + "grad_norm": 5.046948432922363, + "learning_rate": 7.719272214089146e-07, + "loss": 0.6486, + "step": 17734 + }, + { + "epoch": 0.8868, + "grad_norm": 3.0419721603393555, + "learning_rate": 7.7058296593165e-07, + "loss": 1.2802, + "step": 17736 + }, + { + "epoch": 0.8869, + "grad_norm": 10.914406776428223, + "learning_rate": 7.692398350308594e-07, + "loss": 1.2062, + "step": 17738 + }, + { + "epoch": 0.887, + "grad_norm": 1.1439661979675293, + "learning_rate": 7.678978288701911e-07, + "loss": 0.4127, + "step": 17740 + }, + { + "epoch": 0.8871, + "grad_norm": 7.4184041023254395, + "learning_rate": 7.665569476131706e-07, + "loss": 1.4839, + "step": 17742 + }, + { + "epoch": 0.8872, + "grad_norm": 4.885092258453369, + "learning_rate": 7.652171914231777e-07, + "loss": 1.1203, + "step": 17744 + }, + { + "epoch": 0.8873, + "grad_norm": 5.351678371429443, + "learning_rate": 7.638785604634579e-07, + "loss": 0.7223, + "step": 17746 + }, + { + "epoch": 0.8874, + "grad_norm": 6.347989559173584, + "learning_rate": 7.62541054897119e-07, + "loss": 1.8646, + "step": 17748 + }, + { + "epoch": 0.8875, + "grad_norm": 4.417210578918457, + "learning_rate": 7.612046748871327e-07, + "loss": 0.8681, + "step": 17750 + }, + { + "epoch": 0.8876, + "grad_norm": 8.394693374633789, + "learning_rate": 7.598694205963331e-07, + "loss": 0.6803, + "step": 17752 + }, + { + "epoch": 0.8877, + "grad_norm": 5.740983486175537, + "learning_rate": 7.585352921874156e-07, + "loss": 0.8836, + "step": 17754 + }, + { + "epoch": 0.8878, + "grad_norm": 4.117424964904785, + "learning_rate": 7.572022898229403e-07, + "loss": 0.6998, + "step": 17756 + }, + { + "epoch": 0.8879, + "grad_norm": 1.6871747970581055, + "learning_rate": 7.558704136653306e-07, + "loss": 0.2216, + "step": 17758 + }, + { + "epoch": 0.888, + "grad_norm": 6.155460357666016, + "learning_rate": 7.545396638768698e-07, + "loss": 1.2152, + "step": 17760 + }, + { + "epoch": 0.8881, + "grad_norm": 0.4200490117073059, + "learning_rate": 7.532100406197041e-07, + "loss": 0.8093, + "step": 17762 + }, + { + "epoch": 0.8882, + "grad_norm": 2.621245861053467, + "learning_rate": 7.518815440558514e-07, + "loss": 2.3142, + "step": 17764 + }, + { + "epoch": 0.8883, + "grad_norm": 5.441250324249268, + "learning_rate": 7.505541743471756e-07, + "loss": 1.2042, + "step": 17766 + }, + { + "epoch": 0.8884, + "grad_norm": 3.4299426078796387, + "learning_rate": 7.492279316554207e-07, + "loss": 0.7218, + "step": 17768 + }, + { + "epoch": 0.8885, + "grad_norm": 9.004402160644531, + "learning_rate": 7.479028161421798e-07, + "loss": 1.0264, + "step": 17770 + }, + { + "epoch": 0.8886, + "grad_norm": 1.8667861223220825, + "learning_rate": 7.465788279689156e-07, + "loss": 0.9059, + "step": 17772 + }, + { + "epoch": 0.8887, + "grad_norm": 5.085934638977051, + "learning_rate": 7.452559672969551e-07, + "loss": 0.4265, + "step": 17774 + }, + { + "epoch": 0.8888, + "grad_norm": 3.3656301498413086, + "learning_rate": 7.439342342874789e-07, + "loss": 0.6704, + "step": 17776 + }, + { + "epoch": 0.8889, + "grad_norm": 3.3381524085998535, + "learning_rate": 7.426136291015418e-07, + "loss": 0.4349, + "step": 17778 + }, + { + "epoch": 0.889, + "grad_norm": 5.188583850860596, + "learning_rate": 7.412941519000527e-07, + "loss": 0.4067, + "step": 17780 + }, + { + "epoch": 0.8891, + "grad_norm": 5.846499443054199, + "learning_rate": 7.399758028437865e-07, + "loss": 0.7249, + "step": 17782 + }, + { + "epoch": 0.8892, + "grad_norm": 15.960113525390625, + "learning_rate": 7.386585820933812e-07, + "loss": 2.0659, + "step": 17784 + }, + { + "epoch": 0.8893, + "grad_norm": 2.5314130783081055, + "learning_rate": 7.373424898093339e-07, + "loss": 0.803, + "step": 17786 + }, + { + "epoch": 0.8894, + "grad_norm": 6.600428104400635, + "learning_rate": 7.360275261520078e-07, + "loss": 1.0095, + "step": 17788 + }, + { + "epoch": 0.8895, + "grad_norm": 6.205752372741699, + "learning_rate": 7.347136912816277e-07, + "loss": 0.9566, + "step": 17790 + }, + { + "epoch": 0.8896, + "grad_norm": 6.328387260437012, + "learning_rate": 7.334009853582791e-07, + "loss": 0.8436, + "step": 17792 + }, + { + "epoch": 0.8897, + "grad_norm": 5.163962364196777, + "learning_rate": 7.320894085419117e-07, + "loss": 1.1827, + "step": 17794 + }, + { + "epoch": 0.8898, + "grad_norm": 4.755800247192383, + "learning_rate": 7.307789609923377e-07, + "loss": 0.9608, + "step": 17796 + }, + { + "epoch": 0.8899, + "grad_norm": 2.809468984603882, + "learning_rate": 7.294696428692305e-07, + "loss": 1.2091, + "step": 17798 + }, + { + "epoch": 0.89, + "grad_norm": 7.623346328735352, + "learning_rate": 7.281614543321269e-07, + "loss": 0.7733, + "step": 17800 + }, + { + "epoch": 0.8901, + "grad_norm": 1.407217025756836, + "learning_rate": 7.268543955404239e-07, + "loss": 0.3089, + "step": 17802 + }, + { + "epoch": 0.8902, + "grad_norm": 11.449344635009766, + "learning_rate": 7.255484666533874e-07, + "loss": 1.0507, + "step": 17804 + }, + { + "epoch": 0.8903, + "grad_norm": 6.27724552154541, + "learning_rate": 7.242436678301368e-07, + "loss": 0.7632, + "step": 17806 + }, + { + "epoch": 0.8904, + "grad_norm": 1.6479870080947876, + "learning_rate": 7.22939999229657e-07, + "loss": 1.0915, + "step": 17808 + }, + { + "epoch": 0.8905, + "grad_norm": 4.350630760192871, + "learning_rate": 7.216374610108012e-07, + "loss": 1.6032, + "step": 17810 + }, + { + "epoch": 0.8906, + "grad_norm": 5.142702579498291, + "learning_rate": 7.203360533322734e-07, + "loss": 0.6373, + "step": 17812 + }, + { + "epoch": 0.8907, + "grad_norm": 7.055582523345947, + "learning_rate": 7.190357763526523e-07, + "loss": 0.6992, + "step": 17814 + }, + { + "epoch": 0.8908, + "grad_norm": 5.691769123077393, + "learning_rate": 7.177366302303667e-07, + "loss": 1.4342, + "step": 17816 + }, + { + "epoch": 0.8909, + "grad_norm": 2.031665086746216, + "learning_rate": 7.164386151237179e-07, + "loss": 0.7782, + "step": 17818 + }, + { + "epoch": 0.891, + "grad_norm": 8.459723472595215, + "learning_rate": 7.151417311908648e-07, + "loss": 1.3448, + "step": 17820 + }, + { + "epoch": 0.8911, + "grad_norm": 0.21870876848697662, + "learning_rate": 7.138459785898266e-07, + "loss": 0.3146, + "step": 17822 + }, + { + "epoch": 0.8912, + "grad_norm": 3.8490357398986816, + "learning_rate": 7.125513574784904e-07, + "loss": 0.7742, + "step": 17824 + }, + { + "epoch": 0.8913, + "grad_norm": 2.7632908821105957, + "learning_rate": 7.112578680145954e-07, + "loss": 1.4421, + "step": 17826 + }, + { + "epoch": 0.8914, + "grad_norm": 0.5668547749519348, + "learning_rate": 7.099655103557557e-07, + "loss": 0.4505, + "step": 17828 + }, + { + "epoch": 0.8915, + "grad_norm": 4.342286109924316, + "learning_rate": 7.086742846594385e-07, + "loss": 0.3132, + "step": 17830 + }, + { + "epoch": 0.8916, + "grad_norm": 2.822638988494873, + "learning_rate": 7.073841910829771e-07, + "loss": 0.4653, + "step": 17832 + }, + { + "epoch": 0.8917, + "grad_norm": 6.42014217376709, + "learning_rate": 7.060952297835632e-07, + "loss": 1.8923, + "step": 17834 + }, + { + "epoch": 0.8918, + "grad_norm": 8.08585262298584, + "learning_rate": 7.048074009182548e-07, + "loss": 1.1924, + "step": 17836 + }, + { + "epoch": 0.8919, + "grad_norm": 3.0186374187469482, + "learning_rate": 7.035207046439673e-07, + "loss": 0.2998, + "step": 17838 + }, + { + "epoch": 0.892, + "grad_norm": 5.399554252624512, + "learning_rate": 7.022351411174866e-07, + "loss": 0.6455, + "step": 17840 + }, + { + "epoch": 0.8921, + "grad_norm": 4.809275150299072, + "learning_rate": 7.009507104954493e-07, + "loss": 1.0988, + "step": 17842 + }, + { + "epoch": 0.8922, + "grad_norm": 2.3460981845855713, + "learning_rate": 6.996674129343606e-07, + "loss": 0.3271, + "step": 17844 + }, + { + "epoch": 0.8923, + "grad_norm": 4.028432369232178, + "learning_rate": 6.983852485905862e-07, + "loss": 0.5804, + "step": 17846 + }, + { + "epoch": 0.8924, + "grad_norm": 5.535247802734375, + "learning_rate": 6.971042176203535e-07, + "loss": 0.6229, + "step": 17848 + }, + { + "epoch": 0.8925, + "grad_norm": 2.4154858589172363, + "learning_rate": 6.958243201797554e-07, + "loss": 0.047, + "step": 17850 + }, + { + "epoch": 0.8926, + "grad_norm": 3.3455097675323486, + "learning_rate": 6.945455564247394e-07, + "loss": 1.0929, + "step": 17852 + }, + { + "epoch": 0.8927, + "grad_norm": 2.767319440841675, + "learning_rate": 6.932679265111231e-07, + "loss": 0.9447, + "step": 17854 + }, + { + "epoch": 0.8928, + "grad_norm": 3.9431607723236084, + "learning_rate": 6.919914305945774e-07, + "loss": 0.9667, + "step": 17856 + }, + { + "epoch": 0.8929, + "grad_norm": 5.82735538482666, + "learning_rate": 6.907160688306425e-07, + "loss": 0.525, + "step": 17858 + }, + { + "epoch": 0.893, + "grad_norm": 5.766722679138184, + "learning_rate": 6.894418413747183e-07, + "loss": 0.6808, + "step": 17860 + }, + { + "epoch": 0.8931, + "grad_norm": 7.396283149719238, + "learning_rate": 6.881687483820609e-07, + "loss": 1.5226, + "step": 17862 + }, + { + "epoch": 0.8932, + "grad_norm": 9.415319442749023, + "learning_rate": 6.868967900077972e-07, + "loss": 0.9942, + "step": 17864 + }, + { + "epoch": 0.8933, + "grad_norm": 4.356390953063965, + "learning_rate": 6.856259664069098e-07, + "loss": 0.8742, + "step": 17866 + }, + { + "epoch": 0.8934, + "grad_norm": 3.536287307739258, + "learning_rate": 6.84356277734245e-07, + "loss": 0.6276, + "step": 17868 + }, + { + "epoch": 0.8935, + "grad_norm": 6.259418487548828, + "learning_rate": 6.83087724144511e-07, + "loss": 1.1733, + "step": 17870 + }, + { + "epoch": 0.8936, + "grad_norm": 3.212543249130249, + "learning_rate": 6.818203057922756e-07, + "loss": 0.8945, + "step": 17872 + }, + { + "epoch": 0.8937, + "grad_norm": 10.467618942260742, + "learning_rate": 6.805540228319718e-07, + "loss": 1.419, + "step": 17874 + }, + { + "epoch": 0.8938, + "grad_norm": 5.652041912078857, + "learning_rate": 6.792888754178906e-07, + "loss": 0.9852, + "step": 17876 + }, + { + "epoch": 0.8939, + "grad_norm": 2.98114275932312, + "learning_rate": 6.780248637041875e-07, + "loss": 0.5453, + "step": 17878 + }, + { + "epoch": 0.894, + "grad_norm": 1.2255544662475586, + "learning_rate": 6.767619878448783e-07, + "loss": 0.0345, + "step": 17880 + }, + { + "epoch": 0.8941, + "grad_norm": 7.2010722160339355, + "learning_rate": 6.755002479938411e-07, + "loss": 0.7359, + "step": 17882 + }, + { + "epoch": 0.8942, + "grad_norm": 9.990978240966797, + "learning_rate": 6.742396443048138e-07, + "loss": 0.6429, + "step": 17884 + }, + { + "epoch": 0.8943, + "grad_norm": 4.837777137756348, + "learning_rate": 6.729801769313982e-07, + "loss": 0.9538, + "step": 17886 + }, + { + "epoch": 0.8944, + "grad_norm": 6.855040073394775, + "learning_rate": 6.717218460270536e-07, + "loss": 0.4919, + "step": 17888 + }, + { + "epoch": 0.8945, + "grad_norm": 4.6695237159729, + "learning_rate": 6.704646517451108e-07, + "loss": 0.4839, + "step": 17890 + }, + { + "epoch": 0.8946, + "grad_norm": 4.3926591873168945, + "learning_rate": 6.692085942387483e-07, + "loss": 1.1484, + "step": 17892 + }, + { + "epoch": 0.8947, + "grad_norm": 6.387213706970215, + "learning_rate": 6.679536736610137e-07, + "loss": 1.0146, + "step": 17894 + }, + { + "epoch": 0.8948, + "grad_norm": 0.21043048799037933, + "learning_rate": 6.666998901648203e-07, + "loss": 0.111, + "step": 17896 + }, + { + "epoch": 0.8949, + "grad_norm": 6.850005149841309, + "learning_rate": 6.654472439029314e-07, + "loss": 1.2851, + "step": 17898 + }, + { + "epoch": 0.895, + "grad_norm": 3.4288582801818848, + "learning_rate": 6.641957350279838e-07, + "loss": 1.6903, + "step": 17900 + }, + { + "epoch": 0.8951, + "grad_norm": 4.193212985992432, + "learning_rate": 6.629453636924643e-07, + "loss": 1.071, + "step": 17902 + }, + { + "epoch": 0.8952, + "grad_norm": 3.2603793144226074, + "learning_rate": 6.616961300487323e-07, + "loss": 0.6963, + "step": 17904 + }, + { + "epoch": 0.8953, + "grad_norm": 3.173377275466919, + "learning_rate": 6.604480342490005e-07, + "loss": 0.3745, + "step": 17906 + }, + { + "epoch": 0.8954, + "grad_norm": 9.541975975036621, + "learning_rate": 6.592010764453449e-07, + "loss": 0.8256, + "step": 17908 + }, + { + "epoch": 0.8955, + "grad_norm": 2.4990415573120117, + "learning_rate": 6.579552567897052e-07, + "loss": 0.7149, + "step": 17910 + }, + { + "epoch": 0.8956, + "grad_norm": 3.583609104156494, + "learning_rate": 6.567105754338798e-07, + "loss": 0.8, + "step": 17912 + }, + { + "epoch": 0.8957, + "grad_norm": 3.289816379547119, + "learning_rate": 6.554670325295298e-07, + "loss": 0.7203, + "step": 17914 + }, + { + "epoch": 0.8958, + "grad_norm": 5.956127166748047, + "learning_rate": 6.542246282281772e-07, + "loss": 1.6254, + "step": 17916 + }, + { + "epoch": 0.8959, + "grad_norm": 1.8643559217453003, + "learning_rate": 6.529833626812044e-07, + "loss": 1.2796, + "step": 17918 + }, + { + "epoch": 0.896, + "grad_norm": 2.304119110107422, + "learning_rate": 6.517432360398556e-07, + "loss": 1.1907, + "step": 17920 + }, + { + "epoch": 0.8961, + "grad_norm": 12.558161735534668, + "learning_rate": 6.505042484552382e-07, + "loss": 1.5995, + "step": 17922 + }, + { + "epoch": 0.8962, + "grad_norm": 18.182086944580078, + "learning_rate": 6.492664000783166e-07, + "loss": 1.1377, + "step": 17924 + }, + { + "epoch": 0.8963, + "grad_norm": 2.658275842666626, + "learning_rate": 6.480296910599238e-07, + "loss": 0.4709, + "step": 17926 + }, + { + "epoch": 0.8964, + "grad_norm": 6.014880657196045, + "learning_rate": 6.467941215507434e-07, + "loss": 1.2088, + "step": 17928 + }, + { + "epoch": 0.8965, + "grad_norm": 1.522965431213379, + "learning_rate": 6.455596917013274e-07, + "loss": 0.0986, + "step": 17930 + }, + { + "epoch": 0.8966, + "grad_norm": 5.917978286743164, + "learning_rate": 6.443264016620887e-07, + "loss": 1.3759, + "step": 17932 + }, + { + "epoch": 0.8967, + "grad_norm": 4.013158798217773, + "learning_rate": 6.430942515832983e-07, + "loss": 0.9964, + "step": 17934 + }, + { + "epoch": 0.8968, + "grad_norm": 5.259104251861572, + "learning_rate": 6.418632416150927e-07, + "loss": 0.741, + "step": 17936 + }, + { + "epoch": 0.8969, + "grad_norm": 2.827568292617798, + "learning_rate": 6.40633371907462e-07, + "loss": 0.7402, + "step": 17938 + }, + { + "epoch": 0.897, + "grad_norm": 5.30014705657959, + "learning_rate": 6.394046426102673e-07, + "loss": 1.5506, + "step": 17940 + }, + { + "epoch": 0.8971, + "grad_norm": 8.717474937438965, + "learning_rate": 6.381770538732223e-07, + "loss": 0.2152, + "step": 17942 + }, + { + "epoch": 0.8972, + "grad_norm": 2.3987882137298584, + "learning_rate": 6.369506058459063e-07, + "loss": 0.8505, + "step": 17944 + }, + { + "epoch": 0.8973, + "grad_norm": 3.0356390476226807, + "learning_rate": 6.357252986777595e-07, + "loss": 0.9015, + "step": 17946 + }, + { + "epoch": 0.8974, + "grad_norm": 3.925863742828369, + "learning_rate": 6.345011325180772e-07, + "loss": 1.4105, + "step": 17948 + }, + { + "epoch": 0.8975, + "grad_norm": 0.19399471580982208, + "learning_rate": 6.332781075160244e-07, + "loss": 0.1007, + "step": 17950 + }, + { + "epoch": 0.8976, + "grad_norm": 3.297863006591797, + "learning_rate": 6.320562238206218e-07, + "loss": 0.7638, + "step": 17952 + }, + { + "epoch": 0.8977, + "grad_norm": 2.945404529571533, + "learning_rate": 6.308354815807527e-07, + "loss": 0.3673, + "step": 17954 + }, + { + "epoch": 0.8978, + "grad_norm": 6.812829971313477, + "learning_rate": 6.296158809451602e-07, + "loss": 0.5265, + "step": 17956 + }, + { + "epoch": 0.8979, + "grad_norm": 2.8211193084716797, + "learning_rate": 6.283974220624489e-07, + "loss": 1.3267, + "step": 17958 + }, + { + "epoch": 0.898, + "grad_norm": 6.898566722869873, + "learning_rate": 6.271801050810856e-07, + "loss": 1.4165, + "step": 17960 + }, + { + "epoch": 0.8981, + "grad_norm": 6.109023094177246, + "learning_rate": 6.259639301493947e-07, + "loss": 1.3723, + "step": 17962 + }, + { + "epoch": 0.8982, + "grad_norm": 3.537736654281616, + "learning_rate": 6.247488974155657e-07, + "loss": 0.8346, + "step": 17964 + }, + { + "epoch": 0.8983, + "grad_norm": 3.254364490509033, + "learning_rate": 6.235350070276447e-07, + "loss": 0.9665, + "step": 17966 + }, + { + "epoch": 0.8984, + "grad_norm": 1.246232032775879, + "learning_rate": 6.223222591335409e-07, + "loss": 0.5457, + "step": 17968 + }, + { + "epoch": 0.8985, + "grad_norm": 8.096909523010254, + "learning_rate": 6.21110653881023e-07, + "loss": 0.5054, + "step": 17970 + }, + { + "epoch": 0.8986, + "grad_norm": 7.715300559997559, + "learning_rate": 6.199001914177261e-07, + "loss": 0.6394, + "step": 17972 + }, + { + "epoch": 0.8987, + "grad_norm": 3.7669975757598877, + "learning_rate": 6.186908718911344e-07, + "loss": 1.4803, + "step": 17974 + }, + { + "epoch": 0.8988, + "grad_norm": 4.6995158195495605, + "learning_rate": 6.174826954486069e-07, + "loss": 0.3842, + "step": 17976 + }, + { + "epoch": 0.8989, + "grad_norm": 4.911076545715332, + "learning_rate": 6.1627566223735e-07, + "loss": 0.9452, + "step": 17978 + }, + { + "epoch": 0.899, + "grad_norm": 3.6229770183563232, + "learning_rate": 6.150697724044407e-07, + "loss": 0.727, + "step": 17980 + }, + { + "epoch": 0.8991, + "grad_norm": 2.520920753479004, + "learning_rate": 6.138650260968138e-07, + "loss": 0.1955, + "step": 17982 + }, + { + "epoch": 0.8992, + "grad_norm": 6.189536094665527, + "learning_rate": 6.126614234612593e-07, + "loss": 1.391, + "step": 17984 + }, + { + "epoch": 0.8993, + "grad_norm": 9.3114595413208, + "learning_rate": 6.114589646444369e-07, + "loss": 1.099, + "step": 17986 + }, + { + "epoch": 0.8994, + "grad_norm": 5.3345513343811035, + "learning_rate": 6.102576497928614e-07, + "loss": 0.2374, + "step": 17988 + }, + { + "epoch": 0.8995, + "grad_norm": 5.9268479347229, + "learning_rate": 6.090574790529091e-07, + "loss": 0.869, + "step": 17990 + }, + { + "epoch": 0.8996, + "grad_norm": 2.0288562774658203, + "learning_rate": 6.078584525708175e-07, + "loss": 1.1624, + "step": 17992 + }, + { + "epoch": 0.8997, + "grad_norm": 3.157125473022461, + "learning_rate": 6.066605704926831e-07, + "loss": 0.6666, + "step": 17994 + }, + { + "epoch": 0.8998, + "grad_norm": 3.265658378601074, + "learning_rate": 6.054638329644658e-07, + "loss": 1.4396, + "step": 17996 + }, + { + "epoch": 0.8999, + "grad_norm": 2.1507728099823, + "learning_rate": 6.042682401319843e-07, + "loss": 0.2847, + "step": 17998 + }, + { + "epoch": 0.9, + "grad_norm": 2.986142158508301, + "learning_rate": 6.030737921409169e-07, + "loss": 0.7282, + "step": 18000 + }, + { + "epoch": 0.9001, + "grad_norm": 8.870338439941406, + "learning_rate": 6.018804891368035e-07, + "loss": 0.7494, + "step": 18002 + }, + { + "epoch": 0.9002, + "grad_norm": 6.825883388519287, + "learning_rate": 6.006883312650458e-07, + "loss": 0.6013, + "step": 18004 + }, + { + "epoch": 0.9003, + "grad_norm": 2.763442277908325, + "learning_rate": 5.994973186709041e-07, + "loss": 0.5769, + "step": 18006 + }, + { + "epoch": 0.9004, + "grad_norm": 4.384718418121338, + "learning_rate": 5.98307451499498e-07, + "loss": 0.898, + "step": 18008 + }, + { + "epoch": 0.9005, + "grad_norm": 8.014935493469238, + "learning_rate": 5.971187298958103e-07, + "loss": 0.8677, + "step": 18010 + }, + { + "epoch": 0.9006, + "grad_norm": 1.3396550416946411, + "learning_rate": 5.959311540046863e-07, + "loss": 1.344, + "step": 18012 + }, + { + "epoch": 0.9007, + "grad_norm": 6.939223766326904, + "learning_rate": 5.947447239708215e-07, + "loss": 1.1715, + "step": 18014 + }, + { + "epoch": 0.9008, + "grad_norm": 2.8039886951446533, + "learning_rate": 5.935594399387856e-07, + "loss": 0.6973, + "step": 18016 + }, + { + "epoch": 0.9009, + "grad_norm": 2.6574597358703613, + "learning_rate": 5.923753020529998e-07, + "loss": 0.4598, + "step": 18018 + }, + { + "epoch": 0.901, + "grad_norm": 4.151673793792725, + "learning_rate": 5.911923104577455e-07, + "loss": 0.9458, + "step": 18020 + }, + { + "epoch": 0.9011, + "grad_norm": 7.654257297515869, + "learning_rate": 5.900104652971695e-07, + "loss": 0.828, + "step": 18022 + }, + { + "epoch": 0.9012, + "grad_norm": 4.9852728843688965, + "learning_rate": 5.888297667152731e-07, + "loss": 0.601, + "step": 18024 + }, + { + "epoch": 0.9013, + "grad_norm": 7.251172065734863, + "learning_rate": 5.876502148559238e-07, + "loss": 0.7776, + "step": 18026 + }, + { + "epoch": 0.9014, + "grad_norm": 10.695982933044434, + "learning_rate": 5.864718098628441e-07, + "loss": 2.0384, + "step": 18028 + }, + { + "epoch": 0.9015, + "grad_norm": 2.492905855178833, + "learning_rate": 5.852945518796205e-07, + "loss": 0.7609, + "step": 18030 + }, + { + "epoch": 0.9016, + "grad_norm": 3.294748306274414, + "learning_rate": 5.841184410496992e-07, + "loss": 0.7523, + "step": 18032 + }, + { + "epoch": 0.9017, + "grad_norm": 5.021809101104736, + "learning_rate": 5.829434775163834e-07, + "loss": 1.1914, + "step": 18034 + }, + { + "epoch": 0.9018, + "grad_norm": 3.795342206954956, + "learning_rate": 5.817696614228396e-07, + "loss": 0.4168, + "step": 18036 + }, + { + "epoch": 0.9019, + "grad_norm": 4.998139381408691, + "learning_rate": 5.805969929120947e-07, + "loss": 0.795, + "step": 18038 + }, + { + "epoch": 0.902, + "grad_norm": 7.876280784606934, + "learning_rate": 5.794254721270331e-07, + "loss": 0.8782, + "step": 18040 + }, + { + "epoch": 0.9021, + "grad_norm": 4.7589497566223145, + "learning_rate": 5.78255099210403e-07, + "loss": 0.6953, + "step": 18042 + }, + { + "epoch": 0.9022, + "grad_norm": 4.816479682922363, + "learning_rate": 5.770858743048091e-07, + "loss": 0.5575, + "step": 18044 + }, + { + "epoch": 0.9023, + "grad_norm": 21.931507110595703, + "learning_rate": 5.759177975527186e-07, + "loss": 1.2902, + "step": 18046 + }, + { + "epoch": 0.9024, + "grad_norm": 7.14919900894165, + "learning_rate": 5.747508690964599e-07, + "loss": 0.5825, + "step": 18048 + }, + { + "epoch": 0.9025, + "grad_norm": 9.46651554107666, + "learning_rate": 5.735850890782158e-07, + "loss": 1.0847, + "step": 18050 + }, + { + "epoch": 0.9026, + "grad_norm": 10.528321266174316, + "learning_rate": 5.724204576400372e-07, + "loss": 1.9538, + "step": 18052 + }, + { + "epoch": 0.9027, + "grad_norm": 13.150111198425293, + "learning_rate": 5.712569749238284e-07, + "loss": 2.178, + "step": 18054 + }, + { + "epoch": 0.9028, + "grad_norm": 4.4237165451049805, + "learning_rate": 5.700946410713548e-07, + "loss": 1.1674, + "step": 18056 + }, + { + "epoch": 0.9029, + "grad_norm": 0.8457707762718201, + "learning_rate": 5.689334562242488e-07, + "loss": 0.3263, + "step": 18058 + }, + { + "epoch": 0.903, + "grad_norm": 8.334202766418457, + "learning_rate": 5.677734205239904e-07, + "loss": 1.2747, + "step": 18060 + }, + { + "epoch": 0.9031, + "grad_norm": 0.7592671513557434, + "learning_rate": 5.666145341119322e-07, + "loss": 0.1043, + "step": 18062 + }, + { + "epoch": 0.9032, + "grad_norm": 8.168760299682617, + "learning_rate": 5.654567971292757e-07, + "loss": 0.9007, + "step": 18064 + }, + { + "epoch": 0.9033, + "grad_norm": 14.105587005615234, + "learning_rate": 5.643002097170924e-07, + "loss": 1.7732, + "step": 18066 + }, + { + "epoch": 0.9034, + "grad_norm": 11.710031509399414, + "learning_rate": 5.631447720163074e-07, + "loss": 0.9347, + "step": 18068 + }, + { + "epoch": 0.9035, + "grad_norm": 13.492654800415039, + "learning_rate": 5.619904841677059e-07, + "loss": 1.141, + "step": 18070 + }, + { + "epoch": 0.9036, + "grad_norm": 4.390909671783447, + "learning_rate": 5.608373463119354e-07, + "loss": 0.4909, + "step": 18072 + }, + { + "epoch": 0.9037, + "grad_norm": 7.590572834014893, + "learning_rate": 5.596853585895034e-07, + "loss": 1.2893, + "step": 18074 + }, + { + "epoch": 0.9038, + "grad_norm": 4.7296142578125, + "learning_rate": 5.585345211407734e-07, + "loss": 1.0083, + "step": 18076 + }, + { + "epoch": 0.9039, + "grad_norm": 7.373366832733154, + "learning_rate": 5.57384834105974e-07, + "loss": 0.4401, + "step": 18078 + }, + { + "epoch": 0.904, + "grad_norm": 5.477568626403809, + "learning_rate": 5.562362976251901e-07, + "loss": 1.7706, + "step": 18080 + }, + { + "epoch": 0.9041, + "grad_norm": 7.007457256317139, + "learning_rate": 5.550889118383674e-07, + "loss": 0.6198, + "step": 18082 + }, + { + "epoch": 0.9042, + "grad_norm": 15.67258071899414, + "learning_rate": 5.539426768853107e-07, + "loss": 1.5318, + "step": 18084 + }, + { + "epoch": 0.9043, + "grad_norm": 4.6145405769348145, + "learning_rate": 5.52797592905685e-07, + "loss": 0.667, + "step": 18086 + }, + { + "epoch": 0.9044, + "grad_norm": 5.503971099853516, + "learning_rate": 5.516536600390188e-07, + "loss": 0.5132, + "step": 18088 + }, + { + "epoch": 0.9045, + "grad_norm": 3.7714009284973145, + "learning_rate": 5.505108784246926e-07, + "loss": 0.8886, + "step": 18090 + }, + { + "epoch": 0.9046, + "grad_norm": 7.516842365264893, + "learning_rate": 5.49369248201953e-07, + "loss": 0.4725, + "step": 18092 + }, + { + "epoch": 0.9047, + "grad_norm": 4.637226581573486, + "learning_rate": 5.482287695099031e-07, + "loss": 0.3619, + "step": 18094 + }, + { + "epoch": 0.9048, + "grad_norm": 2.441650629043579, + "learning_rate": 5.470894424875062e-07, + "loss": 0.0854, + "step": 18096 + }, + { + "epoch": 0.9049, + "grad_norm": 10.281482696533203, + "learning_rate": 5.4595126727359e-07, + "loss": 1.3352, + "step": 18098 + }, + { + "epoch": 0.905, + "grad_norm": 3.9175026416778564, + "learning_rate": 5.448142440068316e-07, + "loss": 0.2925, + "step": 18100 + }, + { + "epoch": 0.9051, + "grad_norm": 5.145664215087891, + "learning_rate": 5.436783728257789e-07, + "loss": 0.8488, + "step": 18102 + }, + { + "epoch": 0.9052, + "grad_norm": 19.652097702026367, + "learning_rate": 5.425436538688322e-07, + "loss": 1.7077, + "step": 18104 + }, + { + "epoch": 0.9053, + "grad_norm": 1.759239912033081, + "learning_rate": 5.414100872742534e-07, + "loss": 1.3155, + "step": 18106 + }, + { + "epoch": 0.9054, + "grad_norm": 4.275135040283203, + "learning_rate": 5.402776731801662e-07, + "loss": 0.9884, + "step": 18108 + }, + { + "epoch": 0.9055, + "grad_norm": 5.714369773864746, + "learning_rate": 5.391464117245471e-07, + "loss": 0.3282, + "step": 18110 + }, + { + "epoch": 0.9056, + "grad_norm": 5.176314353942871, + "learning_rate": 5.380163030452412e-07, + "loss": 1.4012, + "step": 18112 + }, + { + "epoch": 0.9057, + "grad_norm": 4.954285144805908, + "learning_rate": 5.368873472799474e-07, + "loss": 1.5552, + "step": 18114 + }, + { + "epoch": 0.9058, + "grad_norm": 3.3832709789276123, + "learning_rate": 5.357595445662267e-07, + "loss": 0.5327, + "step": 18116 + }, + { + "epoch": 0.9059, + "grad_norm": 5.4670820236206055, + "learning_rate": 5.346328950414969e-07, + "loss": 0.7835, + "step": 18118 + }, + { + "epoch": 0.906, + "grad_norm": 4.209352970123291, + "learning_rate": 5.335073988430373e-07, + "loss": 1.3349, + "step": 18120 + }, + { + "epoch": 0.9061, + "grad_norm": 8.478549003601074, + "learning_rate": 5.323830561079857e-07, + "loss": 0.9842, + "step": 18122 + }, + { + "epoch": 0.9062, + "grad_norm": 3.0841329097747803, + "learning_rate": 5.312598669733404e-07, + "loss": 1.0408, + "step": 18124 + }, + { + "epoch": 0.9063, + "grad_norm": 3.02205753326416, + "learning_rate": 5.301378315759598e-07, + "loss": 1.1762, + "step": 18126 + }, + { + "epoch": 0.9064, + "grad_norm": 5.515789985656738, + "learning_rate": 5.290169500525577e-07, + "loss": 0.9072, + "step": 18128 + }, + { + "epoch": 0.9065, + "grad_norm": 3.8055381774902344, + "learning_rate": 5.278972225397128e-07, + "loss": 1.4318, + "step": 18130 + }, + { + "epoch": 0.9066, + "grad_norm": 4.49526834487915, + "learning_rate": 5.267786491738569e-07, + "loss": 1.1065, + "step": 18132 + }, + { + "epoch": 0.9067, + "grad_norm": 8.987667083740234, + "learning_rate": 5.256612300912911e-07, + "loss": 0.8787, + "step": 18134 + }, + { + "epoch": 0.9068, + "grad_norm": 6.011823654174805, + "learning_rate": 5.245449654281632e-07, + "loss": 0.2728, + "step": 18136 + }, + { + "epoch": 0.9069, + "grad_norm": 3.276275873184204, + "learning_rate": 5.234298553204908e-07, + "loss": 0.7787, + "step": 18138 + }, + { + "epoch": 0.907, + "grad_norm": 4.28641414642334, + "learning_rate": 5.223158999041444e-07, + "loss": 0.6113, + "step": 18140 + }, + { + "epoch": 0.9071, + "grad_norm": 4.522246360778809, + "learning_rate": 5.212030993148554e-07, + "loss": 0.597, + "step": 18142 + }, + { + "epoch": 0.9072, + "grad_norm": 3.5702741146087646, + "learning_rate": 5.200914536882184e-07, + "loss": 1.0308, + "step": 18144 + }, + { + "epoch": 0.9073, + "grad_norm": 12.331389427185059, + "learning_rate": 5.189809631596798e-07, + "loss": 0.9708, + "step": 18146 + }, + { + "epoch": 0.9074, + "grad_norm": 1.7302509546279907, + "learning_rate": 5.178716278645534e-07, + "loss": 0.7893, + "step": 18148 + }, + { + "epoch": 0.9075, + "grad_norm": 5.769188404083252, + "learning_rate": 5.167634479380068e-07, + "loss": 0.9207, + "step": 18150 + }, + { + "epoch": 0.9076, + "grad_norm": 2.9393539428710938, + "learning_rate": 5.156564235150686e-07, + "loss": 1.2499, + "step": 18152 + }, + { + "epoch": 0.9077, + "grad_norm": 2.4810588359832764, + "learning_rate": 5.145505547306251e-07, + "loss": 0.7269, + "step": 18154 + }, + { + "epoch": 0.9078, + "grad_norm": 6.002814292907715, + "learning_rate": 5.134458417194255e-07, + "loss": 1.0647, + "step": 18156 + }, + { + "epoch": 0.9079, + "grad_norm": 4.474119663238525, + "learning_rate": 5.12342284616073e-07, + "loss": 0.76, + "step": 18158 + }, + { + "epoch": 0.908, + "grad_norm": 4.042145729064941, + "learning_rate": 5.112398835550348e-07, + "loss": 1.0356, + "step": 18160 + }, + { + "epoch": 0.9081, + "grad_norm": 4.6525421142578125, + "learning_rate": 5.101386386706342e-07, + "loss": 0.7445, + "step": 18162 + }, + { + "epoch": 0.9082, + "grad_norm": 2.4161524772644043, + "learning_rate": 5.090385500970551e-07, + "loss": 0.3754, + "step": 18164 + }, + { + "epoch": 0.9083, + "grad_norm": 4.213879585266113, + "learning_rate": 5.079396179683382e-07, + "loss": 0.5844, + "step": 18166 + }, + { + "epoch": 0.9084, + "grad_norm": 11.066215515136719, + "learning_rate": 5.068418424183874e-07, + "loss": 0.7076, + "step": 18168 + }, + { + "epoch": 0.9085, + "grad_norm": 6.42080545425415, + "learning_rate": 5.057452235809623e-07, + "loss": 0.145, + "step": 18170 + }, + { + "epoch": 0.9086, + "grad_norm": 9.460894584655762, + "learning_rate": 5.046497615896806e-07, + "loss": 1.3176, + "step": 18172 + }, + { + "epoch": 0.9087, + "grad_norm": 15.209765434265137, + "learning_rate": 5.035554565780265e-07, + "loss": 1.0911, + "step": 18174 + }, + { + "epoch": 0.9088, + "grad_norm": 5.308688163757324, + "learning_rate": 5.024623086793323e-07, + "loss": 0.8692, + "step": 18176 + }, + { + "epoch": 0.9089, + "grad_norm": 1.353451132774353, + "learning_rate": 5.013703180267959e-07, + "loss": 0.6808, + "step": 18178 + }, + { + "epoch": 0.909, + "grad_norm": 15.346653938293457, + "learning_rate": 5.002794847534765e-07, + "loss": 0.9591, + "step": 18180 + }, + { + "epoch": 0.9091, + "grad_norm": 4.185728549957275, + "learning_rate": 4.99189808992282e-07, + "loss": 0.7924, + "step": 18182 + }, + { + "epoch": 0.9092, + "grad_norm": 4.687820911407471, + "learning_rate": 4.981012908759941e-07, + "loss": 1.0098, + "step": 18184 + }, + { + "epoch": 0.9093, + "grad_norm": 5.290226459503174, + "learning_rate": 4.97013930537239e-07, + "loss": 0.8769, + "step": 18186 + }, + { + "epoch": 0.9094, + "grad_norm": 1.8044511079788208, + "learning_rate": 4.959277281085128e-07, + "loss": 0.9949, + "step": 18188 + }, + { + "epoch": 0.9095, + "grad_norm": 3.0960397720336914, + "learning_rate": 4.948426837221632e-07, + "loss": 0.9714, + "step": 18190 + }, + { + "epoch": 0.9096, + "grad_norm": 5.450993061065674, + "learning_rate": 4.937587975103997e-07, + "loss": 1.0703, + "step": 18192 + }, + { + "epoch": 0.9097, + "grad_norm": 2.775888681411743, + "learning_rate": 4.926760696052934e-07, + "loss": 0.1701, + "step": 18194 + }, + { + "epoch": 0.9098, + "grad_norm": 3.839564561843872, + "learning_rate": 4.915945001387668e-07, + "loss": 0.8017, + "step": 18196 + }, + { + "epoch": 0.9099, + "grad_norm": 3.6069910526275635, + "learning_rate": 4.905140892426097e-07, + "loss": 1.0228, + "step": 18198 + }, + { + "epoch": 0.91, + "grad_norm": 8.292706489562988, + "learning_rate": 4.894348370484648e-07, + "loss": 1.5403, + "step": 18200 + }, + { + "epoch": 0.9101, + "grad_norm": 1.020399570465088, + "learning_rate": 4.883567436878367e-07, + "loss": 0.2762, + "step": 18202 + }, + { + "epoch": 0.9102, + "grad_norm": 5.209334373474121, + "learning_rate": 4.872798092920871e-07, + "loss": 1.0445, + "step": 18204 + }, + { + "epoch": 0.9103, + "grad_norm": 4.2906341552734375, + "learning_rate": 4.862040339924379e-07, + "loss": 1.0551, + "step": 18206 + }, + { + "epoch": 0.9104, + "grad_norm": 1.6051734685897827, + "learning_rate": 4.851294179199673e-07, + "loss": 0.0779, + "step": 18208 + }, + { + "epoch": 0.9105, + "grad_norm": 4.8708367347717285, + "learning_rate": 4.840559612056184e-07, + "loss": 0.4047, + "step": 18210 + }, + { + "epoch": 0.9106, + "grad_norm": 2.836137294769287, + "learning_rate": 4.829836639801844e-07, + "loss": 0.9619, + "step": 18212 + }, + { + "epoch": 0.9107, + "grad_norm": 3.7281553745269775, + "learning_rate": 4.819125263743229e-07, + "loss": 1.1299, + "step": 18214 + }, + { + "epoch": 0.9108, + "grad_norm": 9.884147644042969, + "learning_rate": 4.808425485185486e-07, + "loss": 1.1318, + "step": 18216 + }, + { + "epoch": 0.9109, + "grad_norm": 3.984766721725464, + "learning_rate": 4.797737305432337e-07, + "loss": 0.497, + "step": 18218 + }, + { + "epoch": 0.911, + "grad_norm": 9.741004943847656, + "learning_rate": 4.787060725786141e-07, + "loss": 0.866, + "step": 18220 + }, + { + "epoch": 0.9111, + "grad_norm": 5.4994797706604, + "learning_rate": 4.776395747547758e-07, + "loss": 1.1112, + "step": 18222 + }, + { + "epoch": 0.9112, + "grad_norm": 3.71598744392395, + "learning_rate": 4.765742372016735e-07, + "loss": 0.5212, + "step": 18224 + }, + { + "epoch": 0.9113, + "grad_norm": 4.970616817474365, + "learning_rate": 4.755100600491103e-07, + "loss": 0.3715, + "step": 18226 + }, + { + "epoch": 0.9114, + "grad_norm": 1.47749924659729, + "learning_rate": 4.7444704342675673e-07, + "loss": 0.7955, + "step": 18228 + }, + { + "epoch": 0.9115, + "grad_norm": 3.886702299118042, + "learning_rate": 4.733851874641382e-07, + "loss": 1.5412, + "step": 18230 + }, + { + "epoch": 0.9116, + "grad_norm": 2.0424489974975586, + "learning_rate": 4.723244922906356e-07, + "loss": 1.0161, + "step": 18232 + }, + { + "epoch": 0.9117, + "grad_norm": 2.1519367694854736, + "learning_rate": 4.712649580354933e-07, + "loss": 0.7288, + "step": 18234 + }, + { + "epoch": 0.9118, + "grad_norm": 3.633253812789917, + "learning_rate": 4.702065848278126e-07, + "loss": 0.6712, + "step": 18236 + }, + { + "epoch": 0.9119, + "grad_norm": 9.48561954498291, + "learning_rate": 4.6914937279655125e-07, + "loss": 0.9477, + "step": 18238 + }, + { + "epoch": 0.912, + "grad_norm": 6.093535900115967, + "learning_rate": 4.6809332207053083e-07, + "loss": 1.2913, + "step": 18240 + }, + { + "epoch": 0.9121, + "grad_norm": 1.9626562595367432, + "learning_rate": 4.6703843277842387e-07, + "loss": 0.5315, + "step": 18242 + }, + { + "epoch": 0.9122, + "grad_norm": 5.779587268829346, + "learning_rate": 4.659847050487687e-07, + "loss": 1.2934, + "step": 18244 + }, + { + "epoch": 0.9123, + "grad_norm": 3.8345727920532227, + "learning_rate": 4.6493213900995703e-07, + "loss": 1.1331, + "step": 18246 + }, + { + "epoch": 0.9124, + "grad_norm": 2.2277655601501465, + "learning_rate": 4.638807347902408e-07, + "loss": 0.7955, + "step": 18248 + }, + { + "epoch": 0.9125, + "grad_norm": 5.57105827331543, + "learning_rate": 4.628304925177318e-07, + "loss": 0.883, + "step": 18250 + }, + { + "epoch": 0.9126, + "grad_norm": 13.659500122070312, + "learning_rate": 4.6178141232039676e-07, + "loss": 1.6294, + "step": 18252 + }, + { + "epoch": 0.9127, + "grad_norm": 9.880464553833008, + "learning_rate": 4.6073349432606554e-07, + "loss": 1.0926, + "step": 18254 + }, + { + "epoch": 0.9128, + "grad_norm": 7.095430374145508, + "learning_rate": 4.596867386624215e-07, + "loss": 1.022, + "step": 18256 + }, + { + "epoch": 0.9129, + "grad_norm": 3.767233371734619, + "learning_rate": 4.586411454570083e-07, + "loss": 0.5827, + "step": 18258 + }, + { + "epoch": 0.913, + "grad_norm": 2.7540974617004395, + "learning_rate": 4.575967148372318e-07, + "loss": 0.8421, + "step": 18260 + }, + { + "epoch": 0.9131, + "grad_norm": 2.202950954437256, + "learning_rate": 4.5655344693034896e-07, + "loss": 1.065, + "step": 18262 + }, + { + "epoch": 0.9132, + "grad_norm": 3.7064759731292725, + "learning_rate": 4.5551134186348045e-07, + "loss": 0.8445, + "step": 18264 + }, + { + "epoch": 0.9133, + "grad_norm": 5.502486705780029, + "learning_rate": 4.5447039976360463e-07, + "loss": 1.2174, + "step": 18266 + }, + { + "epoch": 0.9134, + "grad_norm": 6.153170108795166, + "learning_rate": 4.534306207575545e-07, + "loss": 0.394, + "step": 18268 + }, + { + "epoch": 0.9135, + "grad_norm": 4.97020149230957, + "learning_rate": 4.5239200497202654e-07, + "loss": 1.0496, + "step": 18270 + }, + { + "epoch": 0.9136, + "grad_norm": 3.633798837661743, + "learning_rate": 4.5135455253357053e-07, + "loss": 0.6977, + "step": 18272 + }, + { + "epoch": 0.9137, + "grad_norm": 0.8490571975708008, + "learning_rate": 4.5031826356859874e-07, + "loss": 0.188, + "step": 18274 + }, + { + "epoch": 0.9138, + "grad_norm": 4.768453598022461, + "learning_rate": 4.492831382033791e-07, + "loss": 1.1272, + "step": 18276 + }, + { + "epoch": 0.9139, + "grad_norm": 4.862828254699707, + "learning_rate": 4.4824917656403954e-07, + "loss": 0.7036, + "step": 18278 + }, + { + "epoch": 0.914, + "grad_norm": 5.637712001800537, + "learning_rate": 4.4721637877656377e-07, + "loss": 0.3617, + "step": 18280 + }, + { + "epoch": 0.9141, + "grad_norm": 10.29205322265625, + "learning_rate": 4.461847449667955e-07, + "loss": 1.0464, + "step": 18282 + }, + { + "epoch": 0.9142, + "grad_norm": 3.3958663940429688, + "learning_rate": 4.451542752604365e-07, + "loss": 0.7375, + "step": 18284 + }, + { + "epoch": 0.9143, + "grad_norm": 3.497755765914917, + "learning_rate": 4.441249697830452e-07, + "loss": 0.7719, + "step": 18286 + }, + { + "epoch": 0.9144, + "grad_norm": 10.530708312988281, + "learning_rate": 4.4309682866004124e-07, + "loss": 0.5982, + "step": 18288 + }, + { + "epoch": 0.9145, + "grad_norm": 6.404597759246826, + "learning_rate": 4.420698520166988e-07, + "loss": 1.0393, + "step": 18290 + }, + { + "epoch": 0.9146, + "grad_norm": 5.306743621826172, + "learning_rate": 4.4104403997815346e-07, + "loss": 1.0361, + "step": 18292 + }, + { + "epoch": 0.9147, + "grad_norm": 2.750814437866211, + "learning_rate": 4.400193926693952e-07, + "loss": 0.2551, + "step": 18294 + }, + { + "epoch": 0.9148, + "grad_norm": 5.502594947814941, + "learning_rate": 4.3899591021527743e-07, + "loss": 0.4348, + "step": 18296 + }, + { + "epoch": 0.9149, + "grad_norm": 5.102119445800781, + "learning_rate": 4.379735927405038e-07, + "loss": 0.7433, + "step": 18298 + }, + { + "epoch": 0.915, + "grad_norm": 5.705443382263184, + "learning_rate": 4.3695244036964567e-07, + "loss": 1.0823, + "step": 18300 + }, + { + "epoch": 0.9151, + "grad_norm": 2.6369800567626953, + "learning_rate": 4.3593245322712476e-07, + "loss": 0.2652, + "step": 18302 + }, + { + "epoch": 0.9152, + "grad_norm": 5.96918249130249, + "learning_rate": 4.349136314372204e-07, + "loss": 1.2898, + "step": 18304 + }, + { + "epoch": 0.9153, + "grad_norm": 11.26224422454834, + "learning_rate": 4.338959751240801e-07, + "loss": 2.0269, + "step": 18306 + }, + { + "epoch": 0.9154, + "grad_norm": 2.4266135692596436, + "learning_rate": 4.3287948441169457e-07, + "loss": 0.2491, + "step": 18308 + }, + { + "epoch": 0.9155, + "grad_norm": 6.592166423797607, + "learning_rate": 4.318641594239259e-07, + "loss": 0.4114, + "step": 18310 + }, + { + "epoch": 0.9156, + "grad_norm": 2.9023077487945557, + "learning_rate": 4.308500002844862e-07, + "loss": 1.2081, + "step": 18312 + }, + { + "epoch": 0.9157, + "grad_norm": 16.79288101196289, + "learning_rate": 4.2983700711694665e-07, + "loss": 1.3463, + "step": 18314 + }, + { + "epoch": 0.9158, + "grad_norm": 5.609817981719971, + "learning_rate": 4.288251800447385e-07, + "loss": 1.1867, + "step": 18316 + }, + { + "epoch": 0.9159, + "grad_norm": 6.110819339752197, + "learning_rate": 4.2781451919115093e-07, + "loss": 0.604, + "step": 18318 + }, + { + "epoch": 0.916, + "grad_norm": 3.5126683712005615, + "learning_rate": 4.268050246793276e-07, + "loss": 0.8719, + "step": 18320 + }, + { + "epoch": 0.9161, + "grad_norm": 6.606064319610596, + "learning_rate": 4.257966966322735e-07, + "loss": 1.0786, + "step": 18322 + }, + { + "epoch": 0.9162, + "grad_norm": 3.6961865425109863, + "learning_rate": 4.247895351728504e-07, + "loss": 1.0666, + "step": 18324 + }, + { + "epoch": 0.9163, + "grad_norm": 3.20981502532959, + "learning_rate": 4.2378354042377776e-07, + "loss": 0.5917, + "step": 18326 + }, + { + "epoch": 0.9164, + "grad_norm": 3.810879707336426, + "learning_rate": 4.2277871250763327e-07, + "loss": 0.644, + "step": 18328 + }, + { + "epoch": 0.9165, + "grad_norm": 23.970232009887695, + "learning_rate": 4.2177505154685215e-07, + "loss": 0.7754, + "step": 18330 + }, + { + "epoch": 0.9166, + "grad_norm": 3.2760655879974365, + "learning_rate": 4.207725576637256e-07, + "loss": 0.4947, + "step": 18332 + }, + { + "epoch": 0.9167, + "grad_norm": 4.174849033355713, + "learning_rate": 4.197712309804058e-07, + "loss": 1.252, + "step": 18334 + }, + { + "epoch": 0.9168, + "grad_norm": 6.649113178253174, + "learning_rate": 4.1877107161890416e-07, + "loss": 0.4228, + "step": 18336 + }, + { + "epoch": 0.9169, + "grad_norm": 0.5739880204200745, + "learning_rate": 4.177720797010831e-07, + "loss": 0.3304, + "step": 18338 + }, + { + "epoch": 0.917, + "grad_norm": 2.868772029876709, + "learning_rate": 4.167742553486676e-07, + "loss": 0.6079, + "step": 18340 + }, + { + "epoch": 0.9171, + "grad_norm": 8.445385932922363, + "learning_rate": 4.157775986832413e-07, + "loss": 0.6799, + "step": 18342 + }, + { + "epoch": 0.9172, + "grad_norm": 3.301766872406006, + "learning_rate": 4.1478210982624055e-07, + "loss": 1.2659, + "step": 18344 + }, + { + "epoch": 0.9173, + "grad_norm": 10.283225059509277, + "learning_rate": 4.137877888989672e-07, + "loss": 0.9858, + "step": 18346 + }, + { + "epoch": 0.9174, + "grad_norm": 3.6677753925323486, + "learning_rate": 4.1279463602257207e-07, + "loss": 0.8736, + "step": 18348 + }, + { + "epoch": 0.9175, + "grad_norm": 1.9969815015792847, + "learning_rate": 4.118026513180695e-07, + "loss": 0.4595, + "step": 18350 + }, + { + "epoch": 0.9176, + "grad_norm": 8.162701606750488, + "learning_rate": 4.108118349063306e-07, + "loss": 1.143, + "step": 18352 + }, + { + "epoch": 0.9177, + "grad_norm": 1.5917996168136597, + "learning_rate": 4.0982218690808204e-07, + "loss": 0.6618, + "step": 18354 + }, + { + "epoch": 0.9178, + "grad_norm": 2.955303430557251, + "learning_rate": 4.0883370744390973e-07, + "loss": 0.565, + "step": 18356 + }, + { + "epoch": 0.9179, + "grad_norm": 9.630663871765137, + "learning_rate": 4.078463966342572e-07, + "loss": 1.5425, + "step": 18358 + }, + { + "epoch": 0.918, + "grad_norm": 13.502791404724121, + "learning_rate": 4.068602545994249e-07, + "loss": 0.7188, + "step": 18360 + }, + { + "epoch": 0.9181, + "grad_norm": 5.097113609313965, + "learning_rate": 4.0587528145957235e-07, + "loss": 0.9114, + "step": 18362 + }, + { + "epoch": 0.9182, + "grad_norm": 3.7985148429870605, + "learning_rate": 4.0489147733471347e-07, + "loss": 0.909, + "step": 18364 + }, + { + "epoch": 0.9183, + "grad_norm": 3.092498540878296, + "learning_rate": 4.039088423447235e-07, + "loss": 0.5221, + "step": 18366 + }, + { + "epoch": 0.9184, + "grad_norm": 3.693268299102783, + "learning_rate": 4.0292737660933335e-07, + "loss": 0.9183, + "step": 18368 + }, + { + "epoch": 0.9185, + "grad_norm": 4.269296169281006, + "learning_rate": 4.019470802481307e-07, + "loss": 1.2281, + "step": 18370 + }, + { + "epoch": 0.9186, + "grad_norm": 10.804274559020996, + "learning_rate": 4.009679533805633e-07, + "loss": 1.3833, + "step": 18372 + }, + { + "epoch": 0.9187, + "grad_norm": 6.88474702835083, + "learning_rate": 3.999899961259335e-07, + "loss": 0.8414, + "step": 18374 + }, + { + "epoch": 0.9188, + "grad_norm": 0.41805294156074524, + "learning_rate": 3.990132086034026e-07, + "loss": 0.5454, + "step": 18376 + }, + { + "epoch": 0.9189, + "grad_norm": 7.523619651794434, + "learning_rate": 3.9803759093199e-07, + "loss": 1.0565, + "step": 18378 + }, + { + "epoch": 0.919, + "grad_norm": 0.3705720603466034, + "learning_rate": 3.9706314323056936e-07, + "loss": 0.0538, + "step": 18380 + }, + { + "epoch": 0.9191, + "grad_norm": 3.132246732711792, + "learning_rate": 3.96089865617878e-07, + "loss": 0.7327, + "step": 18382 + }, + { + "epoch": 0.9192, + "grad_norm": 6.789026260375977, + "learning_rate": 3.9511775821250206e-07, + "loss": 1.3714, + "step": 18384 + }, + { + "epoch": 0.9193, + "grad_norm": 3.0762903690338135, + "learning_rate": 3.9414682113289473e-07, + "loss": 0.8702, + "step": 18386 + }, + { + "epoch": 0.9194, + "grad_norm": 3.4404282569885254, + "learning_rate": 3.931770544973601e-07, + "loss": 1.7153, + "step": 18388 + }, + { + "epoch": 0.9195, + "grad_norm": 3.6157679557800293, + "learning_rate": 3.922084584240582e-07, + "loss": 0.492, + "step": 18390 + }, + { + "epoch": 0.9196, + "grad_norm": 4.653417110443115, + "learning_rate": 3.912410330310157e-07, + "loss": 0.509, + "step": 18392 + }, + { + "epoch": 0.9197, + "grad_norm": 4.527660846710205, + "learning_rate": 3.9027477843610384e-07, + "loss": 0.988, + "step": 18394 + }, + { + "epoch": 0.9198, + "grad_norm": 2.903146982192993, + "learning_rate": 3.8930969475706183e-07, + "loss": 0.7107, + "step": 18396 + }, + { + "epoch": 0.9199, + "grad_norm": 4.020486831665039, + "learning_rate": 3.883457821114811e-07, + "loss": 0.9447, + "step": 18398 + }, + { + "epoch": 0.92, + "grad_norm": 2.5428786277770996, + "learning_rate": 3.8738304061681107e-07, + "loss": 1.0826, + "step": 18400 + }, + { + "epoch": 0.9201, + "grad_norm": 5.467055320739746, + "learning_rate": 3.8642147039036014e-07, + "loss": 1.5172, + "step": 18402 + }, + { + "epoch": 0.9202, + "grad_norm": 5.667673587799072, + "learning_rate": 3.854610715492924e-07, + "loss": 0.3736, + "step": 18404 + }, + { + "epoch": 0.9203, + "grad_norm": 3.090341329574585, + "learning_rate": 3.845018442106285e-07, + "loss": 0.9435, + "step": 18406 + }, + { + "epoch": 0.9204, + "grad_norm": 4.513547897338867, + "learning_rate": 3.835437884912474e-07, + "loss": 1.0355, + "step": 18408 + }, + { + "epoch": 0.9205, + "grad_norm": 10.377581596374512, + "learning_rate": 3.825869045078867e-07, + "loss": 0.7252, + "step": 18410 + }, + { + "epoch": 0.9206, + "grad_norm": 2.1177358627319336, + "learning_rate": 3.8163119237713877e-07, + "loss": 1.2971, + "step": 18412 + }, + { + "epoch": 0.9207, + "grad_norm": 14.16405963897705, + "learning_rate": 3.806766522154548e-07, + "loss": 0.8673, + "step": 18414 + }, + { + "epoch": 0.9208, + "grad_norm": 7.332795143127441, + "learning_rate": 3.7972328413914074e-07, + "loss": 0.9221, + "step": 18416 + }, + { + "epoch": 0.9209, + "grad_norm": 2.321343183517456, + "learning_rate": 3.7877108826436584e-07, + "loss": 0.546, + "step": 18418 + }, + { + "epoch": 0.921, + "grad_norm": 3.524444580078125, + "learning_rate": 3.7782006470714614e-07, + "loss": 1.1053, + "step": 18420 + }, + { + "epoch": 0.9211, + "grad_norm": 5.657606601715088, + "learning_rate": 3.7687021358336683e-07, + "loss": 0.2706, + "step": 18422 + }, + { + "epoch": 0.9212, + "grad_norm": 2.1194405555725098, + "learning_rate": 3.759215350087619e-07, + "loss": 1.1821, + "step": 18424 + }, + { + "epoch": 0.9213, + "grad_norm": 3.606400966644287, + "learning_rate": 3.749740290989234e-07, + "loss": 0.6384, + "step": 18426 + }, + { + "epoch": 0.9214, + "grad_norm": 2.6533353328704834, + "learning_rate": 3.7402769596930567e-07, + "loss": 0.5017, + "step": 18428 + }, + { + "epoch": 0.9215, + "grad_norm": 6.61241340637207, + "learning_rate": 3.7308253573521193e-07, + "loss": 0.9225, + "step": 18430 + }, + { + "epoch": 0.9216, + "grad_norm": 4.353476524353027, + "learning_rate": 3.721385485118123e-07, + "loss": 1.0641, + "step": 18432 + }, + { + "epoch": 0.9217, + "grad_norm": 6.208396911621094, + "learning_rate": 3.711957344141237e-07, + "loss": 0.5778, + "step": 18434 + }, + { + "epoch": 0.9218, + "grad_norm": 5.56964635848999, + "learning_rate": 3.7025409355702977e-07, + "loss": 0.645, + "step": 18436 + }, + { + "epoch": 0.9219, + "grad_norm": 3.1057634353637695, + "learning_rate": 3.693136260552632e-07, + "loss": 1.4625, + "step": 18438 + }, + { + "epoch": 0.922, + "grad_norm": 4.947170734405518, + "learning_rate": 3.68374332023419e-07, + "loss": 0.6942, + "step": 18440 + }, + { + "epoch": 0.9221, + "grad_norm": 4.963726997375488, + "learning_rate": 3.6743621157594554e-07, + "loss": 0.7625, + "step": 18442 + }, + { + "epoch": 0.9222, + "grad_norm": 1.8840501308441162, + "learning_rate": 3.664992648271526e-07, + "loss": 0.8198, + "step": 18444 + }, + { + "epoch": 0.9223, + "grad_norm": 3.2406351566314697, + "learning_rate": 3.65563491891201e-07, + "loss": 1.0612, + "step": 18446 + }, + { + "epoch": 0.9224, + "grad_norm": 2.520141124725342, + "learning_rate": 3.646288928821151e-07, + "loss": 0.9262, + "step": 18448 + }, + { + "epoch": 0.9225, + "grad_norm": 5.923039436340332, + "learning_rate": 3.6369546791377054e-07, + "loss": 0.6594, + "step": 18450 + }, + { + "epoch": 0.9226, + "grad_norm": 4.719236850738525, + "learning_rate": 3.627632170999029e-07, + "loss": 0.7097, + "step": 18452 + }, + { + "epoch": 0.9227, + "grad_norm": 9.862130165100098, + "learning_rate": 3.6183214055410586e-07, + "loss": 1.3661, + "step": 18454 + }, + { + "epoch": 0.9228, + "grad_norm": 4.849722862243652, + "learning_rate": 3.609022383898242e-07, + "loss": 0.7152, + "step": 18456 + }, + { + "epoch": 0.9229, + "grad_norm": 2.6391258239746094, + "learning_rate": 3.599735107203695e-07, + "loss": 1.0415, + "step": 18458 + }, + { + "epoch": 0.923, + "grad_norm": 4.818056106567383, + "learning_rate": 3.590459576589e-07, + "loss": 2.1956, + "step": 18460 + }, + { + "epoch": 0.9231, + "grad_norm": 3.4445090293884277, + "learning_rate": 3.5811957931843557e-07, + "loss": 1.1736, + "step": 18462 + }, + { + "epoch": 0.9232, + "grad_norm": 5.622391700744629, + "learning_rate": 3.571943758118546e-07, + "loss": 1.0978, + "step": 18464 + }, + { + "epoch": 0.9233, + "grad_norm": 11.86209487915039, + "learning_rate": 3.5627034725188694e-07, + "loss": 0.421, + "step": 18466 + }, + { + "epoch": 0.9234, + "grad_norm": 2.3248696327209473, + "learning_rate": 3.553474937511281e-07, + "loss": 0.5232, + "step": 18468 + }, + { + "epoch": 0.9235, + "grad_norm": 6.036898612976074, + "learning_rate": 3.544258154220193e-07, + "loss": 1.2813, + "step": 18470 + }, + { + "epoch": 0.9236, + "grad_norm": 10.097639083862305, + "learning_rate": 3.5350531237686723e-07, + "loss": 1.0398, + "step": 18472 + }, + { + "epoch": 0.9237, + "grad_norm": 11.038125991821289, + "learning_rate": 3.5258598472783233e-07, + "loss": 0.8339, + "step": 18474 + }, + { + "epoch": 0.9238, + "grad_norm": 1.2721627950668335, + "learning_rate": 3.516678325869316e-07, + "loss": 0.2425, + "step": 18476 + }, + { + "epoch": 0.9239, + "grad_norm": 5.861643314361572, + "learning_rate": 3.5075085606604e-07, + "loss": 1.1905, + "step": 18478 + }, + { + "epoch": 0.924, + "grad_norm": 3.4238946437835693, + "learning_rate": 3.498350552768859e-07, + "loss": 0.3378, + "step": 18480 + }, + { + "epoch": 0.9241, + "grad_norm": 11.838418960571289, + "learning_rate": 3.489204303310578e-07, + "loss": 1.7257, + "step": 18482 + }, + { + "epoch": 0.9242, + "grad_norm": 11.956696510314941, + "learning_rate": 3.480069813400022e-07, + "loss": 1.2063, + "step": 18484 + }, + { + "epoch": 0.9243, + "grad_norm": 2.302340507507324, + "learning_rate": 3.470947084150167e-07, + "loss": 0.7866, + "step": 18486 + }, + { + "epoch": 0.9244, + "grad_norm": 4.965366363525391, + "learning_rate": 3.4618361166726123e-07, + "loss": 0.6872, + "step": 18488 + }, + { + "epoch": 0.9245, + "grad_norm": 2.7752156257629395, + "learning_rate": 3.4527369120775036e-07, + "loss": 0.1416, + "step": 18490 + }, + { + "epoch": 0.9246, + "grad_norm": 6.9506754875183105, + "learning_rate": 3.4436494714735313e-07, + "loss": 1.0695, + "step": 18492 + }, + { + "epoch": 0.9247, + "grad_norm": 2.570432424545288, + "learning_rate": 3.434573795967988e-07, + "loss": 1.138, + "step": 18494 + }, + { + "epoch": 0.9248, + "grad_norm": 3.8552346229553223, + "learning_rate": 3.4255098866667114e-07, + "loss": 0.7094, + "step": 18496 + }, + { + "epoch": 0.9249, + "grad_norm": 1.636447787284851, + "learning_rate": 3.4164577446741175e-07, + "loss": 1.048, + "step": 18498 + }, + { + "epoch": 0.925, + "grad_norm": 6.361367225646973, + "learning_rate": 3.4074173710931804e-07, + "loss": 0.5586, + "step": 18500 + }, + { + "epoch": 0.9251, + "grad_norm": 4.045589923858643, + "learning_rate": 3.398388767025418e-07, + "loss": 1.2098, + "step": 18502 + }, + { + "epoch": 0.9252, + "grad_norm": 6.387289524078369, + "learning_rate": 3.3893719335709953e-07, + "loss": 1.0923, + "step": 18504 + }, + { + "epoch": 0.9253, + "grad_norm": 2.526348352432251, + "learning_rate": 3.380366871828522e-07, + "loss": 1.4871, + "step": 18506 + }, + { + "epoch": 0.9254, + "grad_norm": 1.6937172412872314, + "learning_rate": 3.3713735828952985e-07, + "loss": 0.56, + "step": 18508 + }, + { + "epoch": 0.9255, + "grad_norm": 3.6237196922302246, + "learning_rate": 3.3623920678670597e-07, + "loss": 1.6045, + "step": 18510 + }, + { + "epoch": 0.9256, + "grad_norm": 6.198193550109863, + "learning_rate": 3.3534223278382405e-07, + "loss": 0.5371, + "step": 18512 + }, + { + "epoch": 0.9257, + "grad_norm": 5.6771368980407715, + "learning_rate": 3.344464363901756e-07, + "loss": 0.5628, + "step": 18514 + }, + { + "epoch": 0.9258, + "grad_norm": 4.972047805786133, + "learning_rate": 3.3355181771490776e-07, + "loss": 0.856, + "step": 18516 + }, + { + "epoch": 0.9259, + "grad_norm": 4.092444896697998, + "learning_rate": 3.326583768670311e-07, + "loss": 1.4986, + "step": 18518 + }, + { + "epoch": 0.926, + "grad_norm": 3.0201830863952637, + "learning_rate": 3.3176611395540625e-07, + "loss": 0.6539, + "step": 18520 + }, + { + "epoch": 0.9261, + "grad_norm": 3.1829416751861572, + "learning_rate": 3.3087502908875415e-07, + "loss": 0.6877, + "step": 18522 + }, + { + "epoch": 0.9262, + "grad_norm": 4.986604690551758, + "learning_rate": 3.2998512237565005e-07, + "loss": 0.8658, + "step": 18524 + }, + { + "epoch": 0.9263, + "grad_norm": 5.468716144561768, + "learning_rate": 3.290963939245262e-07, + "loss": 0.518, + "step": 18526 + }, + { + "epoch": 0.9264, + "grad_norm": 4.756303310394287, + "learning_rate": 3.282088438436715e-07, + "loss": 0.5807, + "step": 18528 + }, + { + "epoch": 0.9265, + "grad_norm": 3.357570171356201, + "learning_rate": 3.273224722412327e-07, + "loss": 1.2754, + "step": 18530 + }, + { + "epoch": 0.9266, + "grad_norm": 4.20378303527832, + "learning_rate": 3.2643727922520905e-07, + "loss": 0.511, + "step": 18532 + }, + { + "epoch": 0.9267, + "grad_norm": 16.65717887878418, + "learning_rate": 3.2555326490346094e-07, + "loss": 0.7675, + "step": 18534 + }, + { + "epoch": 0.9268, + "grad_norm": 7.506216049194336, + "learning_rate": 3.246704293837011e-07, + "loss": 1.4823, + "step": 18536 + }, + { + "epoch": 0.9269, + "grad_norm": 2.2124555110931396, + "learning_rate": 3.237887727735012e-07, + "loss": 0.5098, + "step": 18538 + }, + { + "epoch": 0.927, + "grad_norm": 7.074944496154785, + "learning_rate": 3.2290829518028867e-07, + "loss": 1.0048, + "step": 18540 + }, + { + "epoch": 0.9271, + "grad_norm": 4.249271869659424, + "learning_rate": 3.2202899671134546e-07, + "loss": 0.4841, + "step": 18542 + }, + { + "epoch": 0.9272, + "grad_norm": 3.6544454097747803, + "learning_rate": 3.211508774738137e-07, + "loss": 0.5793, + "step": 18544 + }, + { + "epoch": 0.9273, + "grad_norm": 3.517885446548462, + "learning_rate": 3.202739375746877e-07, + "loss": 0.6045, + "step": 18546 + }, + { + "epoch": 0.9274, + "grad_norm": 5.197198867797852, + "learning_rate": 3.19398177120821e-07, + "loss": 2.5423, + "step": 18548 + }, + { + "epoch": 0.9275, + "grad_norm": 5.549899101257324, + "learning_rate": 3.185235962189237e-07, + "loss": 0.9538, + "step": 18550 + }, + { + "epoch": 0.9276, + "grad_norm": 6.208946704864502, + "learning_rate": 3.1765019497555617e-07, + "loss": 1.2411, + "step": 18552 + }, + { + "epoch": 0.9277, + "grad_norm": 6.617724895477295, + "learning_rate": 3.1677797349714546e-07, + "loss": 0.6094, + "step": 18554 + }, + { + "epoch": 0.9278, + "grad_norm": 2.1844091415405273, + "learning_rate": 3.1590693188996324e-07, + "loss": 0.392, + "step": 18556 + }, + { + "epoch": 0.9279, + "grad_norm": 3.1057116985321045, + "learning_rate": 3.150370702601491e-07, + "loss": 0.9911, + "step": 18558 + }, + { + "epoch": 0.928, + "grad_norm": 2.4590904712677, + "learning_rate": 3.1416838871368925e-07, + "loss": 0.705, + "step": 18560 + }, + { + "epoch": 0.9281, + "grad_norm": 12.47998332977295, + "learning_rate": 3.1330088735643027e-07, + "loss": 0.8321, + "step": 18562 + }, + { + "epoch": 0.9282, + "grad_norm": 4.212388038635254, + "learning_rate": 3.1243456629407644e-07, + "loss": 0.6633, + "step": 18564 + }, + { + "epoch": 0.9283, + "grad_norm": 3.216930627822876, + "learning_rate": 3.115694256321855e-07, + "loss": 1.1129, + "step": 18566 + }, + { + "epoch": 0.9284, + "grad_norm": 5.7394633293151855, + "learning_rate": 3.10705465476171e-07, + "loss": 1.0622, + "step": 18568 + }, + { + "epoch": 0.9285, + "grad_norm": 1.2297226190567017, + "learning_rate": 3.098426859313053e-07, + "loss": 0.4458, + "step": 18570 + }, + { + "epoch": 0.9286, + "grad_norm": 6.794150352478027, + "learning_rate": 3.0898108710271437e-07, + "loss": 0.5526, + "step": 18572 + }, + { + "epoch": 0.9287, + "grad_norm": 11.20079231262207, + "learning_rate": 3.081206690953831e-07, + "loss": 0.9355, + "step": 18574 + }, + { + "epoch": 0.9288, + "grad_norm": 5.608401298522949, + "learning_rate": 3.072614320141487e-07, + "loss": 1.649, + "step": 18576 + }, + { + "epoch": 0.9289, + "grad_norm": 4.832703113555908, + "learning_rate": 3.064033759637064e-07, + "loss": 1.203, + "step": 18578 + }, + { + "epoch": 0.929, + "grad_norm": 2.279432535171509, + "learning_rate": 3.0554650104861137e-07, + "loss": 1.1233, + "step": 18580 + }, + { + "epoch": 0.9291, + "grad_norm": 6.0388641357421875, + "learning_rate": 3.0469080737326685e-07, + "loss": 1.4334, + "step": 18582 + }, + { + "epoch": 0.9292, + "grad_norm": 2.2855961322784424, + "learning_rate": 3.0383629504194047e-07, + "loss": 0.7169, + "step": 18584 + }, + { + "epoch": 0.9293, + "grad_norm": 4.006405830383301, + "learning_rate": 3.0298296415874894e-07, + "loss": 1.072, + "step": 18586 + }, + { + "epoch": 0.9294, + "grad_norm": 4.763950347900391, + "learning_rate": 3.0213081482766803e-07, + "loss": 0.9169, + "step": 18588 + }, + { + "epoch": 0.9295, + "grad_norm": 6.589143753051758, + "learning_rate": 3.0127984715253246e-07, + "loss": 0.8394, + "step": 18590 + }, + { + "epoch": 0.9296, + "grad_norm": 5.154077053070068, + "learning_rate": 3.00430061237027e-07, + "loss": 1.4015, + "step": 18592 + }, + { + "epoch": 0.9297, + "grad_norm": 5.814541339874268, + "learning_rate": 2.995814571846978e-07, + "loss": 0.7079, + "step": 18594 + }, + { + "epoch": 0.9298, + "grad_norm": 9.925080299377441, + "learning_rate": 2.987340350989421e-07, + "loss": 1.0438, + "step": 18596 + }, + { + "epoch": 0.9299, + "grad_norm": 10.124931335449219, + "learning_rate": 2.9788779508301725e-07, + "loss": 0.6186, + "step": 18598 + }, + { + "epoch": 0.93, + "grad_norm": 3.9439198970794678, + "learning_rate": 2.970427372400353e-07, + "loss": 0.8573, + "step": 18600 + }, + { + "epoch": 0.9301, + "grad_norm": 5.25805139541626, + "learning_rate": 2.961988616729639e-07, + "loss": 0.9167, + "step": 18602 + }, + { + "epoch": 0.9302, + "grad_norm": 4.206060886383057, + "learning_rate": 2.9535616848462624e-07, + "loss": 1.1119, + "step": 18604 + }, + { + "epoch": 0.9303, + "grad_norm": 4.847503185272217, + "learning_rate": 2.945146577777025e-07, + "loss": 0.3463, + "step": 18606 + }, + { + "epoch": 0.9304, + "grad_norm": 11.237757682800293, + "learning_rate": 2.936743296547273e-07, + "loss": 2.1378, + "step": 18608 + }, + { + "epoch": 0.9305, + "grad_norm": 4.381591320037842, + "learning_rate": 2.928351842180921e-07, + "loss": 0.4129, + "step": 18610 + }, + { + "epoch": 0.9306, + "grad_norm": 3.755932569503784, + "learning_rate": 2.919972215700462e-07, + "loss": 0.7129, + "step": 18612 + }, + { + "epoch": 0.9307, + "grad_norm": 5.686026573181152, + "learning_rate": 2.911604418126901e-07, + "loss": 0.8632, + "step": 18614 + }, + { + "epoch": 0.9308, + "grad_norm": 5.21612548828125, + "learning_rate": 2.9032484504798454e-07, + "loss": 0.3535, + "step": 18616 + }, + { + "epoch": 0.9309, + "grad_norm": 1.3057990074157715, + "learning_rate": 2.8949043137774356e-07, + "loss": 0.5634, + "step": 18618 + }, + { + "epoch": 0.931, + "grad_norm": 3.8211123943328857, + "learning_rate": 2.8865720090364037e-07, + "loss": 1.3358, + "step": 18620 + }, + { + "epoch": 0.9311, + "grad_norm": 4.8888468742370605, + "learning_rate": 2.878251537271981e-07, + "loss": 0.7342, + "step": 18622 + }, + { + "epoch": 0.9312, + "grad_norm": 4.349949359893799, + "learning_rate": 2.8699428994980017e-07, + "loss": 1.0581, + "step": 18624 + }, + { + "epoch": 0.9313, + "grad_norm": 4.035645008087158, + "learning_rate": 2.861646096726867e-07, + "loss": 1.2121, + "step": 18626 + }, + { + "epoch": 0.9314, + "grad_norm": 6.190158367156982, + "learning_rate": 2.8533611299694784e-07, + "loss": 0.816, + "step": 18628 + }, + { + "epoch": 0.9315, + "grad_norm": 3.259061574935913, + "learning_rate": 2.8450880002353967e-07, + "loss": 0.8813, + "step": 18630 + }, + { + "epoch": 0.9316, + "grad_norm": 7.836317539215088, + "learning_rate": 2.836826708532603e-07, + "loss": 0.8435, + "step": 18632 + }, + { + "epoch": 0.9317, + "grad_norm": 6.734375, + "learning_rate": 2.8285772558677703e-07, + "loss": 0.8304, + "step": 18634 + }, + { + "epoch": 0.9318, + "grad_norm": 3.5291833877563477, + "learning_rate": 2.8203396432460507e-07, + "loss": 1.2659, + "step": 18636 + }, + { + "epoch": 0.9319, + "grad_norm": 1.9793241024017334, + "learning_rate": 2.8121138716711406e-07, + "loss": 0.5936, + "step": 18638 + }, + { + "epoch": 0.932, + "grad_norm": 3.3283116817474365, + "learning_rate": 2.8038999421453827e-07, + "loss": 0.7255, + "step": 18640 + }, + { + "epoch": 0.9321, + "grad_norm": 3.7687172889709473, + "learning_rate": 2.7956978556695766e-07, + "loss": 1.4399, + "step": 18642 + }, + { + "epoch": 0.9322, + "grad_norm": 2.7127373218536377, + "learning_rate": 2.7875076132431344e-07, + "loss": 0.4294, + "step": 18644 + }, + { + "epoch": 0.9323, + "grad_norm": 3.823136568069458, + "learning_rate": 2.779329215864013e-07, + "loss": 0.2443, + "step": 18646 + }, + { + "epoch": 0.9324, + "grad_norm": 5.957787036895752, + "learning_rate": 2.771162664528726e-07, + "loss": 0.9747, + "step": 18648 + }, + { + "epoch": 0.9325, + "grad_norm": 3.9446959495544434, + "learning_rate": 2.7630079602323447e-07, + "loss": 1.4663, + "step": 18650 + }, + { + "epoch": 0.9326, + "grad_norm": 5.994869709014893, + "learning_rate": 2.7548651039684847e-07, + "loss": 2.013, + "step": 18652 + }, + { + "epoch": 0.9327, + "grad_norm": 8.558859825134277, + "learning_rate": 2.746734096729342e-07, + "loss": 0.5152, + "step": 18654 + }, + { + "epoch": 0.9328, + "grad_norm": 3.229797124862671, + "learning_rate": 2.7386149395056463e-07, + "loss": 0.7038, + "step": 18656 + }, + { + "epoch": 0.9329, + "grad_norm": 3.5258288383483887, + "learning_rate": 2.7305076332867053e-07, + "loss": 0.6395, + "step": 18658 + }, + { + "epoch": 0.933, + "grad_norm": 8.818422317504883, + "learning_rate": 2.7224121790603517e-07, + "loss": 0.7338, + "step": 18660 + }, + { + "epoch": 0.9331, + "grad_norm": 6.069280624389648, + "learning_rate": 2.7143285778129967e-07, + "loss": 0.8223, + "step": 18662 + }, + { + "epoch": 0.9332, + "grad_norm": 4.9171342849731445, + "learning_rate": 2.7062568305295967e-07, + "loss": 0.7157, + "step": 18664 + }, + { + "epoch": 0.9333, + "grad_norm": 7.538431167602539, + "learning_rate": 2.6981969381936977e-07, + "loss": 1.2739, + "step": 18666 + }, + { + "epoch": 0.9334, + "grad_norm": 5.901175022125244, + "learning_rate": 2.6901489017873375e-07, + "loss": 1.116, + "step": 18668 + }, + { + "epoch": 0.9335, + "grad_norm": 2.6338276863098145, + "learning_rate": 2.682112722291186e-07, + "loss": 0.6982, + "step": 18670 + }, + { + "epoch": 0.9336, + "grad_norm": 2.576700210571289, + "learning_rate": 2.6740884006843826e-07, + "loss": 1.0418, + "step": 18672 + }, + { + "epoch": 0.9337, + "grad_norm": 6.430859565734863, + "learning_rate": 2.66607593794469e-07, + "loss": 0.7717, + "step": 18674 + }, + { + "epoch": 0.9338, + "grad_norm": 3.0484907627105713, + "learning_rate": 2.6580753350484044e-07, + "loss": 0.8793, + "step": 18676 + }, + { + "epoch": 0.9339, + "grad_norm": 3.5573887825012207, + "learning_rate": 2.650086592970358e-07, + "loss": 0.7407, + "step": 18678 + }, + { + "epoch": 0.934, + "grad_norm": 3.552050828933716, + "learning_rate": 2.6421097126839714e-07, + "loss": 0.7607, + "step": 18680 + }, + { + "epoch": 0.9341, + "grad_norm": 2.372159719467163, + "learning_rate": 2.6341446951612006e-07, + "loss": 0.8031, + "step": 18682 + }, + { + "epoch": 0.9342, + "grad_norm": 6.926505088806152, + "learning_rate": 2.626191541372558e-07, + "loss": 1.0528, + "step": 18684 + }, + { + "epoch": 0.9343, + "grad_norm": 4.267000198364258, + "learning_rate": 2.6182502522871135e-07, + "loss": 0.3081, + "step": 18686 + }, + { + "epoch": 0.9344, + "grad_norm": 5.847003936767578, + "learning_rate": 2.6103208288724815e-07, + "loss": 1.4074, + "step": 18688 + }, + { + "epoch": 0.9345, + "grad_norm": 8.851766586303711, + "learning_rate": 2.6024032720948446e-07, + "loss": 0.5757, + "step": 18690 + }, + { + "epoch": 0.9346, + "grad_norm": 4.227054595947266, + "learning_rate": 2.59449758291892e-07, + "loss": 0.5644, + "step": 18692 + }, + { + "epoch": 0.9347, + "grad_norm": 6.15515661239624, + "learning_rate": 2.5866037623080155e-07, + "loss": 0.1417, + "step": 18694 + }, + { + "epoch": 0.9348, + "grad_norm": 2.9262800216674805, + "learning_rate": 2.57872181122395e-07, + "loss": 1.1656, + "step": 18696 + }, + { + "epoch": 0.9349, + "grad_norm": 4.740843296051025, + "learning_rate": 2.570851730627122e-07, + "loss": 1.0005, + "step": 18698 + }, + { + "epoch": 0.935, + "grad_norm": 4.277476787567139, + "learning_rate": 2.5629935214764866e-07, + "loss": 0.8129, + "step": 18700 + }, + { + "epoch": 0.9351, + "grad_norm": 5.283162593841553, + "learning_rate": 2.555147184729523e-07, + "loss": 0.9123, + "step": 18702 + }, + { + "epoch": 0.9352, + "grad_norm": 5.456458568572998, + "learning_rate": 2.547312721342277e-07, + "loss": 0.9828, + "step": 18704 + }, + { + "epoch": 0.9353, + "grad_norm": 2.6806302070617676, + "learning_rate": 2.5394901322694065e-07, + "loss": 0.5092, + "step": 18706 + }, + { + "epoch": 0.9354, + "grad_norm": 4.843123912811279, + "learning_rate": 2.5316794184640056e-07, + "loss": 0.5195, + "step": 18708 + }, + { + "epoch": 0.9355, + "grad_norm": 9.666827201843262, + "learning_rate": 2.523880580877824e-07, + "loss": 1.4295, + "step": 18710 + }, + { + "epoch": 0.9356, + "grad_norm": 4.123926162719727, + "learning_rate": 2.516093620461124e-07, + "loss": 0.6727, + "step": 18712 + }, + { + "epoch": 0.9357, + "grad_norm": 3.2467281818389893, + "learning_rate": 2.5083185381627017e-07, + "loss": 1.1547, + "step": 18714 + }, + { + "epoch": 0.9358, + "grad_norm": 5.072636604309082, + "learning_rate": 2.500555334929955e-07, + "loss": 0.8787, + "step": 18716 + }, + { + "epoch": 0.9359, + "grad_norm": 4.400053977966309, + "learning_rate": 2.4928040117087825e-07, + "loss": 0.7057, + "step": 18718 + }, + { + "epoch": 0.936, + "grad_norm": 2.330524444580078, + "learning_rate": 2.4850645694436736e-07, + "loss": 0.3146, + "step": 18720 + }, + { + "epoch": 0.9361, + "grad_norm": 4.1444220542907715, + "learning_rate": 2.4773370090776625e-07, + "loss": 1.026, + "step": 18722 + }, + { + "epoch": 0.9362, + "grad_norm": 3.7824342250823975, + "learning_rate": 2.4696213315523074e-07, + "loss": 0.9239, + "step": 18724 + }, + { + "epoch": 0.9363, + "grad_norm": 6.155531883239746, + "learning_rate": 2.4619175378077566e-07, + "loss": 0.7255, + "step": 18726 + }, + { + "epoch": 0.9364, + "grad_norm": 5.461802959442139, + "learning_rate": 2.4542256287826915e-07, + "loss": 0.4442, + "step": 18728 + }, + { + "epoch": 0.9365, + "grad_norm": 5.06893253326416, + "learning_rate": 2.446545605414341e-07, + "loss": 1.2276, + "step": 18730 + }, + { + "epoch": 0.9366, + "grad_norm": 2.4536008834838867, + "learning_rate": 2.4388774686385007e-07, + "loss": 0.813, + "step": 18732 + }, + { + "epoch": 0.9367, + "grad_norm": 5.514265060424805, + "learning_rate": 2.4312212193895126e-07, + "loss": 1.2414, + "step": 18734 + }, + { + "epoch": 0.9368, + "grad_norm": 3.1791534423828125, + "learning_rate": 2.423576858600252e-07, + "loss": 0.7118, + "step": 18736 + }, + { + "epoch": 0.9369, + "grad_norm": 3.3344218730926514, + "learning_rate": 2.415944387202174e-07, + "loss": 0.6363, + "step": 18738 + }, + { + "epoch": 0.937, + "grad_norm": 4.971277713775635, + "learning_rate": 2.4083238061252565e-07, + "loss": 1.5157, + "step": 18740 + }, + { + "epoch": 0.9371, + "grad_norm": 6.942105293273926, + "learning_rate": 2.40071511629808e-07, + "loss": 0.7853, + "step": 18742 + }, + { + "epoch": 0.9372, + "grad_norm": 5.796408653259277, + "learning_rate": 2.3931183186477026e-07, + "loss": 1.103, + "step": 18744 + }, + { + "epoch": 0.9373, + "grad_norm": 9.620516777038574, + "learning_rate": 2.385533414099783e-07, + "loss": 1.2575, + "step": 18746 + }, + { + "epoch": 0.9374, + "grad_norm": 8.276147842407227, + "learning_rate": 2.3779604035785277e-07, + "loss": 0.5042, + "step": 18748 + }, + { + "epoch": 0.9375, + "grad_norm": 2.679478168487549, + "learning_rate": 2.370399288006664e-07, + "loss": 0.6143, + "step": 18750 + }, + { + "epoch": 0.9376, + "grad_norm": 0.11912139505147934, + "learning_rate": 2.3628500683055222e-07, + "loss": 0.3465, + "step": 18752 + }, + { + "epoch": 0.9377, + "grad_norm": 4.005870819091797, + "learning_rate": 2.355312745394922e-07, + "loss": 0.4714, + "step": 18754 + }, + { + "epoch": 0.9378, + "grad_norm": 5.6689934730529785, + "learning_rate": 2.3477873201932733e-07, + "loss": 0.9068, + "step": 18756 + }, + { + "epoch": 0.9379, + "grad_norm": 17.990577697753906, + "learning_rate": 2.3402737936175423e-07, + "loss": 0.6418, + "step": 18758 + }, + { + "epoch": 0.938, + "grad_norm": 5.189640998840332, + "learning_rate": 2.332772166583208e-07, + "loss": 1.2385, + "step": 18760 + }, + { + "epoch": 0.9381, + "grad_norm": 3.89462947845459, + "learning_rate": 2.3252824400043393e-07, + "loss": 0.9347, + "step": 18762 + }, + { + "epoch": 0.9382, + "grad_norm": 4.402156352996826, + "learning_rate": 2.3178046147935173e-07, + "loss": 0.7147, + "step": 18764 + }, + { + "epoch": 0.9383, + "grad_norm": 6.034617900848389, + "learning_rate": 2.310338691861902e-07, + "loss": 0.8775, + "step": 18766 + }, + { + "epoch": 0.9384, + "grad_norm": 6.65950345993042, + "learning_rate": 2.3028846721191878e-07, + "loss": 1.2112, + "step": 18768 + }, + { + "epoch": 0.9385, + "grad_norm": 5.374935626983643, + "learning_rate": 2.295442556473637e-07, + "loss": 0.7629, + "step": 18770 + }, + { + "epoch": 0.9386, + "grad_norm": 32.5634880065918, + "learning_rate": 2.288012345832047e-07, + "loss": 1.6042, + "step": 18772 + }, + { + "epoch": 0.9387, + "grad_norm": 5.0046210289001465, + "learning_rate": 2.2805940410997484e-07, + "loss": 1.2718, + "step": 18774 + }, + { + "epoch": 0.9388, + "grad_norm": 3.00292706489563, + "learning_rate": 2.273187643180652e-07, + "loss": 0.7926, + "step": 18776 + }, + { + "epoch": 0.9389, + "grad_norm": 7.567384719848633, + "learning_rate": 2.2657931529772137e-07, + "loss": 1.2032, + "step": 18778 + }, + { + "epoch": 0.939, + "grad_norm": 11.285650253295898, + "learning_rate": 2.2584105713904126e-07, + "loss": 1.4953, + "step": 18780 + }, + { + "epoch": 0.9391, + "grad_norm": 3.9828200340270996, + "learning_rate": 2.2510398993198068e-07, + "loss": 0.8133, + "step": 18782 + }, + { + "epoch": 0.9392, + "grad_norm": 4.49417781829834, + "learning_rate": 2.2436811376634893e-07, + "loss": 1.1926, + "step": 18784 + }, + { + "epoch": 0.9393, + "grad_norm": 1.9988893270492554, + "learning_rate": 2.236334287318076e-07, + "loss": 0.6978, + "step": 18786 + }, + { + "epoch": 0.9394, + "grad_norm": 1.9170717000961304, + "learning_rate": 2.2289993491788065e-07, + "loss": 0.2871, + "step": 18788 + }, + { + "epoch": 0.9395, + "grad_norm": 3.6598286628723145, + "learning_rate": 2.221676324139377e-07, + "loss": 0.8085, + "step": 18790 + }, + { + "epoch": 0.9396, + "grad_norm": 5.494101524353027, + "learning_rate": 2.214365213092118e-07, + "loss": 1.0695, + "step": 18792 + }, + { + "epoch": 0.9397, + "grad_norm": 5.192503452301025, + "learning_rate": 2.2070660169278168e-07, + "loss": 1.158, + "step": 18794 + }, + { + "epoch": 0.9398, + "grad_norm": 4.7373270988464355, + "learning_rate": 2.1997787365358958e-07, + "loss": 0.4642, + "step": 18796 + }, + { + "epoch": 0.9399, + "grad_norm": 10.752547264099121, + "learning_rate": 2.1925033728042777e-07, + "loss": 1.5917, + "step": 18798 + }, + { + "epoch": 0.94, + "grad_norm": 0.7373356223106384, + "learning_rate": 2.1852399266194312e-07, + "loss": 0.8141, + "step": 18800 + }, + { + "epoch": 0.9401, + "grad_norm": 4.32249116897583, + "learning_rate": 2.1779883988664153e-07, + "loss": 0.6322, + "step": 18802 + }, + { + "epoch": 0.9402, + "grad_norm": 12.80670166015625, + "learning_rate": 2.1707487904287672e-07, + "loss": 1.1912, + "step": 18804 + }, + { + "epoch": 0.9403, + "grad_norm": 4.823444843292236, + "learning_rate": 2.1635211021886482e-07, + "loss": 1.0508, + "step": 18806 + }, + { + "epoch": 0.9404, + "grad_norm": 7.342342376708984, + "learning_rate": 2.1563053350266983e-07, + "loss": 1.725, + "step": 18808 + }, + { + "epoch": 0.9405, + "grad_norm": 3.976630449295044, + "learning_rate": 2.1491014898221585e-07, + "loss": 1.2926, + "step": 18810 + }, + { + "epoch": 0.9406, + "grad_norm": 2.6552422046661377, + "learning_rate": 2.1419095674527934e-07, + "loss": 0.2551, + "step": 18812 + }, + { + "epoch": 0.9407, + "grad_norm": 2.0583455562591553, + "learning_rate": 2.134729568794902e-07, + "loss": 0.2157, + "step": 18814 + }, + { + "epoch": 0.9408, + "grad_norm": 5.665534019470215, + "learning_rate": 2.1275614947233624e-07, + "loss": 0.9719, + "step": 18816 + }, + { + "epoch": 0.9409, + "grad_norm": 13.04146671295166, + "learning_rate": 2.1204053461115758e-07, + "loss": 1.2285, + "step": 18818 + }, + { + "epoch": 0.941, + "grad_norm": 2.7097280025482178, + "learning_rate": 2.1132611238315004e-07, + "loss": 0.3813, + "step": 18820 + }, + { + "epoch": 0.9411, + "grad_norm": 3.4772675037384033, + "learning_rate": 2.1061288287536287e-07, + "loss": 0.8337, + "step": 18822 + }, + { + "epoch": 0.9412, + "grad_norm": 4.66595458984375, + "learning_rate": 2.0990084617470207e-07, + "loss": 1.0801, + "step": 18824 + }, + { + "epoch": 0.9413, + "grad_norm": 7.384253025054932, + "learning_rate": 2.091900023679261e-07, + "loss": 0.8061, + "step": 18826 + }, + { + "epoch": 0.9414, + "grad_norm": 0.9204240441322327, + "learning_rate": 2.0848035154165113e-07, + "loss": 0.4958, + "step": 18828 + }, + { + "epoch": 0.9415, + "grad_norm": 6.848769664764404, + "learning_rate": 2.077718937823414e-07, + "loss": 0.3291, + "step": 18830 + }, + { + "epoch": 0.9416, + "grad_norm": 10.904918670654297, + "learning_rate": 2.0706462917632676e-07, + "loss": 0.7347, + "step": 18832 + }, + { + "epoch": 0.9417, + "grad_norm": 5.018107891082764, + "learning_rate": 2.0635855780978042e-07, + "loss": 0.7486, + "step": 18834 + }, + { + "epoch": 0.9418, + "grad_norm": 1.984152913093567, + "learning_rate": 2.0565367976873584e-07, + "loss": 0.2685, + "step": 18836 + }, + { + "epoch": 0.9419, + "grad_norm": 5.019595623016357, + "learning_rate": 2.049499951390832e-07, + "loss": 1.1295, + "step": 18838 + }, + { + "epoch": 0.942, + "grad_norm": 5.0759148597717285, + "learning_rate": 2.0424750400655947e-07, + "loss": 0.7632, + "step": 18840 + }, + { + "epoch": 0.9421, + "grad_norm": 3.3687729835510254, + "learning_rate": 2.0354620645676503e-07, + "loss": 0.4117, + "step": 18842 + }, + { + "epoch": 0.9422, + "grad_norm": 6.29938268661499, + "learning_rate": 2.0284610257514936e-07, + "loss": 1.2248, + "step": 18844 + }, + { + "epoch": 0.9423, + "grad_norm": 4.15864896774292, + "learning_rate": 2.0214719244701753e-07, + "loss": 1.0847, + "step": 18846 + }, + { + "epoch": 0.9424, + "grad_norm": 21.094280242919922, + "learning_rate": 2.014494761575314e-07, + "loss": 0.6417, + "step": 18848 + }, + { + "epoch": 0.9425, + "grad_norm": 11.517862319946289, + "learning_rate": 2.0075295379170413e-07, + "loss": 1.0242, + "step": 18850 + }, + { + "epoch": 0.9426, + "grad_norm": 4.826227188110352, + "learning_rate": 2.0005762543440444e-07, + "loss": 1.093, + "step": 18852 + }, + { + "epoch": 0.9427, + "grad_norm": 7.064993858337402, + "learning_rate": 1.993634911703579e-07, + "loss": 1.5974, + "step": 18854 + }, + { + "epoch": 0.9428, + "grad_norm": 7.505946159362793, + "learning_rate": 1.9867055108414023e-07, + "loss": 0.6439, + "step": 18856 + }, + { + "epoch": 0.9429, + "grad_norm": 3.6843628883361816, + "learning_rate": 1.9797880526018608e-07, + "loss": 1.294, + "step": 18858 + }, + { + "epoch": 0.943, + "grad_norm": 5.495808124542236, + "learning_rate": 1.9728825378278248e-07, + "loss": 0.852, + "step": 18860 + }, + { + "epoch": 0.9431, + "grad_norm": 10.850616455078125, + "learning_rate": 1.965988967360688e-07, + "loss": 1.1722, + "step": 18862 + }, + { + "epoch": 0.9432, + "grad_norm": 6.5370283126831055, + "learning_rate": 1.9591073420404338e-07, + "loss": 0.73, + "step": 18864 + }, + { + "epoch": 0.9433, + "grad_norm": 6.1363139152526855, + "learning_rate": 1.9522376627055585e-07, + "loss": 1.3231, + "step": 18866 + }, + { + "epoch": 0.9434, + "grad_norm": 2.7178611755371094, + "learning_rate": 1.9453799301931253e-07, + "loss": 0.8408, + "step": 18868 + }, + { + "epoch": 0.9435, + "grad_norm": 14.417662620544434, + "learning_rate": 1.9385341453386997e-07, + "loss": 0.9094, + "step": 18870 + }, + { + "epoch": 0.9436, + "grad_norm": 4.542585849761963, + "learning_rate": 1.9317003089764365e-07, + "loss": 0.3828, + "step": 18872 + }, + { + "epoch": 0.9437, + "grad_norm": 3.7909820079803467, + "learning_rate": 1.924878421939036e-07, + "loss": 1.1935, + "step": 18874 + }, + { + "epoch": 0.9438, + "grad_norm": 10.124725341796875, + "learning_rate": 1.9180684850576893e-07, + "loss": 1.1978, + "step": 18876 + }, + { + "epoch": 0.9439, + "grad_norm": 1.7901476621627808, + "learning_rate": 1.9112704991621988e-07, + "loss": 0.3979, + "step": 18878 + }, + { + "epoch": 0.944, + "grad_norm": 2.3266334533691406, + "learning_rate": 1.9044844650808468e-07, + "loss": 1.0328, + "step": 18880 + }, + { + "epoch": 0.9441, + "grad_norm": 6.401607513427734, + "learning_rate": 1.8977103836405054e-07, + "loss": 0.9071, + "step": 18882 + }, + { + "epoch": 0.9442, + "grad_norm": 4.244029998779297, + "learning_rate": 1.8909482556666026e-07, + "loss": 0.629, + "step": 18884 + }, + { + "epoch": 0.9443, + "grad_norm": 6.9934773445129395, + "learning_rate": 1.8841980819830352e-07, + "loss": 0.4382, + "step": 18886 + }, + { + "epoch": 0.9444, + "grad_norm": 3.7532896995544434, + "learning_rate": 1.877459863412323e-07, + "loss": 1.2059, + "step": 18888 + }, + { + "epoch": 0.9445, + "grad_norm": 3.964357852935791, + "learning_rate": 1.8707336007754873e-07, + "loss": 1.4847, + "step": 18890 + }, + { + "epoch": 0.9446, + "grad_norm": 4.863725185394287, + "learning_rate": 1.8640192948921053e-07, + "loss": 0.9236, + "step": 18892 + }, + { + "epoch": 0.9447, + "grad_norm": 3.3867270946502686, + "learning_rate": 1.85731694658029e-07, + "loss": 1.0895, + "step": 18894 + }, + { + "epoch": 0.9448, + "grad_norm": 6.62061882019043, + "learning_rate": 1.8506265566567095e-07, + "loss": 0.9414, + "step": 18896 + }, + { + "epoch": 0.9449, + "grad_norm": 4.7950897216796875, + "learning_rate": 1.8439481259365676e-07, + "loss": 1.3758, + "step": 18898 + }, + { + "epoch": 0.945, + "grad_norm": 3.6582279205322266, + "learning_rate": 1.8372816552336025e-07, + "loss": 0.5137, + "step": 18900 + }, + { + "epoch": 0.9451, + "grad_norm": 11.627959251403809, + "learning_rate": 1.8306271453601198e-07, + "loss": 1.1729, + "step": 18902 + }, + { + "epoch": 0.9452, + "grad_norm": 1.5422598123550415, + "learning_rate": 1.8239845971269266e-07, + "loss": 0.071, + "step": 18904 + }, + { + "epoch": 0.9453, + "grad_norm": 5.626513481140137, + "learning_rate": 1.8173540113434197e-07, + "loss": 0.5704, + "step": 18906 + }, + { + "epoch": 0.9454, + "grad_norm": 3.4060113430023193, + "learning_rate": 1.8107353888175083e-07, + "loss": 0.6858, + "step": 18908 + }, + { + "epoch": 0.9455, + "grad_norm": 1.3813093900680542, + "learning_rate": 1.8041287303556366e-07, + "loss": 0.5744, + "step": 18910 + }, + { + "epoch": 0.9456, + "grad_norm": 6.361273288726807, + "learning_rate": 1.7975340367628269e-07, + "loss": 1.1174, + "step": 18912 + }, + { + "epoch": 0.9457, + "grad_norm": 8.929173469543457, + "learning_rate": 1.7909513088426256e-07, + "loss": 0.9993, + "step": 18914 + }, + { + "epoch": 0.9458, + "grad_norm": 13.665705680847168, + "learning_rate": 1.7843805473970798e-07, + "loss": 0.989, + "step": 18916 + }, + { + "epoch": 0.9459, + "grad_norm": 4.562392711639404, + "learning_rate": 1.7778217532268715e-07, + "loss": 0.924, + "step": 18918 + }, + { + "epoch": 0.946, + "grad_norm": 4.413688659667969, + "learning_rate": 1.7712749271311392e-07, + "loss": 0.3862, + "step": 18920 + }, + { + "epoch": 0.9461, + "grad_norm": 3.3315625190734863, + "learning_rate": 1.764740069907589e-07, + "loss": 0.5996, + "step": 18922 + }, + { + "epoch": 0.9462, + "grad_norm": 6.616060256958008, + "learning_rate": 1.758217182352495e-07, + "loss": 0.7944, + "step": 18924 + }, + { + "epoch": 0.9463, + "grad_norm": 2.2942862510681152, + "learning_rate": 1.7517062652606108e-07, + "loss": 0.6212, + "step": 18926 + }, + { + "epoch": 0.9464, + "grad_norm": 4.756677627563477, + "learning_rate": 1.7452073194253237e-07, + "loss": 1.3779, + "step": 18928 + }, + { + "epoch": 0.9465, + "grad_norm": 1.830067753791809, + "learning_rate": 1.7387203456384784e-07, + "loss": 0.5393, + "step": 18930 + }, + { + "epoch": 0.9466, + "grad_norm": 1.6570687294006348, + "learning_rate": 1.7322453446905084e-07, + "loss": 0.1989, + "step": 18932 + }, + { + "epoch": 0.9467, + "grad_norm": 9.019028663635254, + "learning_rate": 1.7257823173703504e-07, + "loss": 1.0945, + "step": 18934 + }, + { + "epoch": 0.9468, + "grad_norm": 3.151015281677246, + "learning_rate": 1.719331264465529e-07, + "loss": 1.1685, + "step": 18936 + }, + { + "epoch": 0.9469, + "grad_norm": 3.4585773944854736, + "learning_rate": 1.7128921867620828e-07, + "loss": 0.9733, + "step": 18938 + }, + { + "epoch": 0.947, + "grad_norm": 2.824444532394409, + "learning_rate": 1.706465085044584e-07, + "loss": 0.5968, + "step": 18940 + }, + { + "epoch": 0.9471, + "grad_norm": 3.769341230392456, + "learning_rate": 1.7000499600961507e-07, + "loss": 1.4042, + "step": 18942 + }, + { + "epoch": 0.9472, + "grad_norm": 6.6701860427856445, + "learning_rate": 1.6936468126984573e-07, + "loss": 1.1911, + "step": 18944 + }, + { + "epoch": 0.9473, + "grad_norm": 5.326620101928711, + "learning_rate": 1.6872556436317022e-07, + "loss": 0.426, + "step": 18946 + }, + { + "epoch": 0.9474, + "grad_norm": 7.473501205444336, + "learning_rate": 1.680876453674629e-07, + "loss": 0.7339, + "step": 18948 + }, + { + "epoch": 0.9475, + "grad_norm": 6.945289611816406, + "learning_rate": 1.6745092436045495e-07, + "loss": 1.0395, + "step": 18950 + }, + { + "epoch": 0.9476, + "grad_norm": 7.051239967346191, + "learning_rate": 1.668154014197243e-07, + "loss": 0.8309, + "step": 18952 + }, + { + "epoch": 0.9477, + "grad_norm": 4.955126762390137, + "learning_rate": 1.661810766227112e-07, + "loss": 0.3083, + "step": 18954 + }, + { + "epoch": 0.9478, + "grad_norm": 5.6733198165893555, + "learning_rate": 1.6554795004670389e-07, + "loss": 0.718, + "step": 18956 + }, + { + "epoch": 0.9479, + "grad_norm": 3.451685667037964, + "learning_rate": 1.6491602176884724e-07, + "loss": 0.8962, + "step": 18958 + }, + { + "epoch": 0.948, + "grad_norm": 3.056011199951172, + "learning_rate": 1.6428529186614195e-07, + "loss": 0.7416, + "step": 18960 + }, + { + "epoch": 0.9481, + "grad_norm": 4.619480609893799, + "learning_rate": 1.6365576041543652e-07, + "loss": 0.8798, + "step": 18962 + }, + { + "epoch": 0.9482, + "grad_norm": 4.315484046936035, + "learning_rate": 1.6302742749344292e-07, + "loss": 0.8244, + "step": 18964 + }, + { + "epoch": 0.9483, + "grad_norm": 11.13147258758545, + "learning_rate": 1.6240029317671657e-07, + "loss": 1.4063, + "step": 18966 + }, + { + "epoch": 0.9484, + "grad_norm": 2.517223358154297, + "learning_rate": 1.6177435754167413e-07, + "loss": 0.3573, + "step": 18968 + }, + { + "epoch": 0.9485, + "grad_norm": 6.365816116333008, + "learning_rate": 1.6114962066458351e-07, + "loss": 0.4725, + "step": 18970 + }, + { + "epoch": 0.9486, + "grad_norm": 5.429219722747803, + "learning_rate": 1.605260826215682e-07, + "loss": 0.4527, + "step": 18972 + }, + { + "epoch": 0.9487, + "grad_norm": 5.146205425262451, + "learning_rate": 1.5990374348860304e-07, + "loss": 0.8031, + "step": 18974 + }, + { + "epoch": 0.9488, + "grad_norm": 3.965400457382202, + "learning_rate": 1.5928260334151847e-07, + "loss": 1.0448, + "step": 18976 + }, + { + "epoch": 0.9489, + "grad_norm": 4.838458061218262, + "learning_rate": 1.5866266225599836e-07, + "loss": 0.7895, + "step": 18978 + }, + { + "epoch": 0.949, + "grad_norm": 1.1868561506271362, + "learning_rate": 1.580439203075812e-07, + "loss": 0.1339, + "step": 18980 + }, + { + "epoch": 0.9491, + "grad_norm": 5.3250603675842285, + "learning_rate": 1.5742637757165779e-07, + "loss": 1.5462, + "step": 18982 + }, + { + "epoch": 0.9492, + "grad_norm": 12.597103118896484, + "learning_rate": 1.5681003412347573e-07, + "loss": 1.318, + "step": 18984 + }, + { + "epoch": 0.9493, + "grad_norm": 3.83142352104187, + "learning_rate": 1.561948900381327e-07, + "loss": 0.3447, + "step": 18986 + }, + { + "epoch": 0.9494, + "grad_norm": 3.7760133743286133, + "learning_rate": 1.555809453905821e-07, + "loss": 1.1632, + "step": 18988 + }, + { + "epoch": 0.9495, + "grad_norm": 5.2871503829956055, + "learning_rate": 1.549682002556341e-07, + "loss": 0.8341, + "step": 18990 + }, + { + "epoch": 0.9496, + "grad_norm": 3.9084465503692627, + "learning_rate": 1.543566547079467e-07, + "loss": 0.7201, + "step": 18992 + }, + { + "epoch": 0.9497, + "grad_norm": 7.552841663360596, + "learning_rate": 1.537463088220359e-07, + "loss": 1.2724, + "step": 18994 + }, + { + "epoch": 0.9498, + "grad_norm": 3.2195487022399902, + "learning_rate": 1.5313716267226997e-07, + "loss": 0.7932, + "step": 18996 + }, + { + "epoch": 0.9499, + "grad_norm": 12.160991668701172, + "learning_rate": 1.5252921633287176e-07, + "loss": 1.0112, + "step": 18998 + }, + { + "epoch": 0.95, + "grad_norm": 1.2820103168487549, + "learning_rate": 1.519224698779198e-07, + "loss": 0.5882, + "step": 19000 + }, + { + "epoch": 0.9501, + "grad_norm": 5.444787979125977, + "learning_rate": 1.5131692338134052e-07, + "loss": 0.5552, + "step": 19002 + }, + { + "epoch": 0.9502, + "grad_norm": 3.445831775665283, + "learning_rate": 1.5071257691692153e-07, + "loss": 0.6802, + "step": 19004 + }, + { + "epoch": 0.9503, + "grad_norm": 8.503541946411133, + "learning_rate": 1.501094305582984e-07, + "loss": 0.8541, + "step": 19006 + }, + { + "epoch": 0.9504, + "grad_norm": 0.8713387846946716, + "learning_rate": 1.4950748437896235e-07, + "loss": 0.2498, + "step": 19008 + }, + { + "epoch": 0.9505, + "grad_norm": 7.428864002227783, + "learning_rate": 1.4890673845226133e-07, + "loss": 0.4585, + "step": 19010 + }, + { + "epoch": 0.9506, + "grad_norm": 2.4161324501037598, + "learning_rate": 1.483071928513913e-07, + "loss": 0.4414, + "step": 19012 + }, + { + "epoch": 0.9507, + "grad_norm": 4.0237603187561035, + "learning_rate": 1.4770884764940707e-07, + "loss": 0.7149, + "step": 19014 + }, + { + "epoch": 0.9508, + "grad_norm": 4.434983730316162, + "learning_rate": 1.4711170291921485e-07, + "loss": 0.6357, + "step": 19016 + }, + { + "epoch": 0.9509, + "grad_norm": 3.8583967685699463, + "learning_rate": 1.4651575873357416e-07, + "loss": 1.3141, + "step": 19018 + }, + { + "epoch": 0.951, + "grad_norm": 4.381001949310303, + "learning_rate": 1.4592101516509916e-07, + "loss": 1.2488, + "step": 19020 + }, + { + "epoch": 0.9511, + "grad_norm": 3.492776393890381, + "learning_rate": 1.4532747228625855e-07, + "loss": 1.5973, + "step": 19022 + }, + { + "epoch": 0.9512, + "grad_norm": 3.569445848464966, + "learning_rate": 1.4473513016937223e-07, + "loss": 1.0587, + "step": 19024 + }, + { + "epoch": 0.9513, + "grad_norm": 3.0800929069519043, + "learning_rate": 1.4414398888661695e-07, + "loss": 0.4763, + "step": 19026 + }, + { + "epoch": 0.9514, + "grad_norm": 13.182977676391602, + "learning_rate": 1.4355404851001953e-07, + "loss": 0.6703, + "step": 19028 + }, + { + "epoch": 0.9515, + "grad_norm": 7.991048812866211, + "learning_rate": 1.4296530911146466e-07, + "loss": 0.8849, + "step": 19030 + }, + { + "epoch": 0.9516, + "grad_norm": 5.028714179992676, + "learning_rate": 1.4237777076268723e-07, + "loss": 0.3557, + "step": 19032 + }, + { + "epoch": 0.9517, + "grad_norm": 4.5039496421813965, + "learning_rate": 1.4179143353527547e-07, + "loss": 0.397, + "step": 19034 + }, + { + "epoch": 0.9518, + "grad_norm": 2.5334270000457764, + "learning_rate": 1.4120629750067672e-07, + "loss": 0.6849, + "step": 19036 + }, + { + "epoch": 0.9519, + "grad_norm": 2.8704752922058105, + "learning_rate": 1.4062236273018392e-07, + "loss": 0.3188, + "step": 19038 + }, + { + "epoch": 0.952, + "grad_norm": 1.654152274131775, + "learning_rate": 1.400396292949513e-07, + "loss": 0.7565, + "step": 19040 + }, + { + "epoch": 0.9521, + "grad_norm": 3.5232675075531006, + "learning_rate": 1.3945809726597982e-07, + "loss": 0.9416, + "step": 19042 + }, + { + "epoch": 0.9522, + "grad_norm": 6.423243045806885, + "learning_rate": 1.3887776671412943e-07, + "loss": 0.9861, + "step": 19044 + }, + { + "epoch": 0.9523, + "grad_norm": 4.49097204208374, + "learning_rate": 1.3829863771011253e-07, + "loss": 2.0208, + "step": 19046 + }, + { + "epoch": 0.9524, + "grad_norm": 4.0098557472229, + "learning_rate": 1.377207103244904e-07, + "loss": 0.6313, + "step": 19048 + }, + { + "epoch": 0.9525, + "grad_norm": 19.357666015625, + "learning_rate": 1.3714398462768563e-07, + "loss": 0.5778, + "step": 19050 + }, + { + "epoch": 0.9526, + "grad_norm": 5.370688438415527, + "learning_rate": 1.3656846068996976e-07, + "loss": 0.9141, + "step": 19052 + }, + { + "epoch": 0.9527, + "grad_norm": 3.0345299243927, + "learning_rate": 1.359941385814667e-07, + "loss": 0.2928, + "step": 19054 + }, + { + "epoch": 0.9528, + "grad_norm": 4.1958327293396, + "learning_rate": 1.3542101837215826e-07, + "loss": 0.3915, + "step": 19056 + }, + { + "epoch": 0.9529, + "grad_norm": 1.2648106813430786, + "learning_rate": 1.3484910013187523e-07, + "loss": 0.5028, + "step": 19058 + }, + { + "epoch": 0.953, + "grad_norm": 3.760725259780884, + "learning_rate": 1.3427838393030634e-07, + "loss": 0.7062, + "step": 19060 + }, + { + "epoch": 0.9531, + "grad_norm": 10.046363830566406, + "learning_rate": 1.3370886983698928e-07, + "loss": 1.2297, + "step": 19062 + }, + { + "epoch": 0.9532, + "grad_norm": 3.1593892574310303, + "learning_rate": 1.3314055792131964e-07, + "loss": 0.3506, + "step": 19064 + }, + { + "epoch": 0.9533, + "grad_norm": 2.107922315597534, + "learning_rate": 1.3257344825254315e-07, + "loss": 0.969, + "step": 19066 + }, + { + "epoch": 0.9534, + "grad_norm": 2.812852144241333, + "learning_rate": 1.320075408997612e-07, + "loss": 0.8163, + "step": 19068 + }, + { + "epoch": 0.9535, + "grad_norm": 3.655433177947998, + "learning_rate": 1.3144283593192752e-07, + "loss": 0.7678, + "step": 19070 + }, + { + "epoch": 0.9536, + "grad_norm": 2.62358021736145, + "learning_rate": 1.308793334178493e-07, + "loss": 0.7861, + "step": 19072 + }, + { + "epoch": 0.9537, + "grad_norm": 7.177584171295166, + "learning_rate": 1.3031703342618828e-07, + "loss": 1.3593, + "step": 19074 + }, + { + "epoch": 0.9538, + "grad_norm": 2.863194704055786, + "learning_rate": 1.2975593602545966e-07, + "loss": 1.2428, + "step": 19076 + }, + { + "epoch": 0.9539, + "grad_norm": 14.369985580444336, + "learning_rate": 1.2919604128402873e-07, + "loss": 1.4702, + "step": 19078 + }, + { + "epoch": 0.954, + "grad_norm": 4.733057022094727, + "learning_rate": 1.2863734927012094e-07, + "loss": 0.6871, + "step": 19080 + }, + { + "epoch": 0.9541, + "grad_norm": 3.9067952632904053, + "learning_rate": 1.280798600518085e-07, + "loss": 0.7717, + "step": 19082 + }, + { + "epoch": 0.9542, + "grad_norm": 5.158524036407471, + "learning_rate": 1.275235736970193e-07, + "loss": 1.4129, + "step": 19084 + }, + { + "epoch": 0.9543, + "grad_norm": 4.722469806671143, + "learning_rate": 1.2696849027353797e-07, + "loss": 0.7743, + "step": 19086 + }, + { + "epoch": 0.9544, + "grad_norm": 7.30267333984375, + "learning_rate": 1.26414609848996e-07, + "loss": 0.8832, + "step": 19088 + }, + { + "epoch": 0.9545, + "grad_norm": 4.5601725578308105, + "learning_rate": 1.2586193249088607e-07, + "loss": 0.8118, + "step": 19090 + }, + { + "epoch": 0.9546, + "grad_norm": 8.594212532043457, + "learning_rate": 1.2531045826654652e-07, + "loss": 0.8896, + "step": 19092 + }, + { + "epoch": 0.9547, + "grad_norm": 3.703368902206421, + "learning_rate": 1.2476018724317586e-07, + "loss": 0.867, + "step": 19094 + }, + { + "epoch": 0.9548, + "grad_norm": 27.508705139160156, + "learning_rate": 1.242111194878215e-07, + "loss": 1.0712, + "step": 19096 + }, + { + "epoch": 0.9549, + "grad_norm": 3.7265944480895996, + "learning_rate": 1.2366325506738442e-07, + "loss": 0.9149, + "step": 19098 + }, + { + "epoch": 0.955, + "grad_norm": 6.803202152252197, + "learning_rate": 1.231165940486234e-07, + "loss": 1.3081, + "step": 19100 + }, + { + "epoch": 0.9551, + "grad_norm": 6.200563907623291, + "learning_rate": 1.225711364981441e-07, + "loss": 0.6869, + "step": 19102 + }, + { + "epoch": 0.9552, + "grad_norm": 5.454736232757568, + "learning_rate": 1.2202688248241113e-07, + "loss": 1.8888, + "step": 19104 + }, + { + "epoch": 0.9553, + "grad_norm": 9.472323417663574, + "learning_rate": 1.2148383206773916e-07, + "loss": 0.8835, + "step": 19106 + }, + { + "epoch": 0.9554, + "grad_norm": 0.379545122385025, + "learning_rate": 1.2094198532029754e-07, + "loss": 0.0739, + "step": 19108 + }, + { + "epoch": 0.9555, + "grad_norm": 4.739381790161133, + "learning_rate": 1.2040134230610902e-07, + "loss": 0.5554, + "step": 19110 + }, + { + "epoch": 0.9556, + "grad_norm": 9.297167778015137, + "learning_rate": 1.1986190309104861e-07, + "loss": 0.9399, + "step": 19112 + }, + { + "epoch": 0.9557, + "grad_norm": 3.5415821075439453, + "learning_rate": 1.1932366774084493e-07, + "loss": 0.4043, + "step": 19114 + }, + { + "epoch": 0.9558, + "grad_norm": 7.341370105743408, + "learning_rate": 1.1878663632108322e-07, + "loss": 0.6772, + "step": 19116 + }, + { + "epoch": 0.9559, + "grad_norm": 3.802814245223999, + "learning_rate": 1.1825080889719565e-07, + "loss": 1.2158, + "step": 19118 + }, + { + "epoch": 0.956, + "grad_norm": 2.635395050048828, + "learning_rate": 1.1771618553447217e-07, + "loss": 0.556, + "step": 19120 + }, + { + "epoch": 0.9561, + "grad_norm": 19.204511642456055, + "learning_rate": 1.1718276629805625e-07, + "loss": 1.5154, + "step": 19122 + }, + { + "epoch": 0.9562, + "grad_norm": 5.652496814727783, + "learning_rate": 1.1665055125294033e-07, + "loss": 1.5197, + "step": 19124 + }, + { + "epoch": 0.9563, + "grad_norm": 6.591673374176025, + "learning_rate": 1.16119540463977e-07, + "loss": 0.8964, + "step": 19126 + }, + { + "epoch": 0.9564, + "grad_norm": 5.4970173835754395, + "learning_rate": 1.1558973399586671e-07, + "loss": 0.331, + "step": 19128 + }, + { + "epoch": 0.9565, + "grad_norm": 2.7586324214935303, + "learning_rate": 1.1506113191316447e-07, + "loss": 0.5991, + "step": 19130 + }, + { + "epoch": 0.9566, + "grad_norm": 3.0060017108917236, + "learning_rate": 1.1453373428027992e-07, + "loss": 0.8703, + "step": 19132 + }, + { + "epoch": 0.9567, + "grad_norm": 7.400001049041748, + "learning_rate": 1.1400754116147272e-07, + "loss": 0.5195, + "step": 19134 + }, + { + "epoch": 0.9568, + "grad_norm": 3.951340675354004, + "learning_rate": 1.134825526208605e-07, + "loss": 0.248, + "step": 19136 + }, + { + "epoch": 0.9569, + "grad_norm": 5.290920257568359, + "learning_rate": 1.1295876872240874e-07, + "loss": 1.0738, + "step": 19138 + }, + { + "epoch": 0.957, + "grad_norm": 6.856482028961182, + "learning_rate": 1.1243618952994195e-07, + "loss": 1.1592, + "step": 19140 + }, + { + "epoch": 0.9571, + "grad_norm": 0.3487666845321655, + "learning_rate": 1.1191481510713254e-07, + "loss": 0.5244, + "step": 19142 + }, + { + "epoch": 0.9572, + "grad_norm": 2.6695780754089355, + "learning_rate": 1.1139464551750857e-07, + "loss": 1.308, + "step": 19144 + }, + { + "epoch": 0.9573, + "grad_norm": 4.101734638214111, + "learning_rate": 1.1087568082445266e-07, + "loss": 0.7096, + "step": 19146 + }, + { + "epoch": 0.9574, + "grad_norm": 3.4760921001434326, + "learning_rate": 1.1035792109119758e-07, + "loss": 0.8961, + "step": 19148 + }, + { + "epoch": 0.9575, + "grad_norm": 10.48100757598877, + "learning_rate": 1.0984136638083176e-07, + "loss": 0.8897, + "step": 19150 + }, + { + "epoch": 0.9576, + "grad_norm": 3.550185441970825, + "learning_rate": 1.0932601675629595e-07, + "loss": 0.5461, + "step": 19152 + }, + { + "epoch": 0.9577, + "grad_norm": 2.4956629276275635, + "learning_rate": 1.0881187228038214e-07, + "loss": 0.9853, + "step": 19154 + }, + { + "epoch": 0.9578, + "grad_norm": 6.146535396575928, + "learning_rate": 1.0829893301573913e-07, + "loss": 1.2883, + "step": 19156 + }, + { + "epoch": 0.9579, + "grad_norm": 6.166335582733154, + "learning_rate": 1.077871990248669e-07, + "loss": 1.0499, + "step": 19158 + }, + { + "epoch": 0.958, + "grad_norm": 6.572437763214111, + "learning_rate": 1.0727667037011668e-07, + "loss": 0.2437, + "step": 19160 + }, + { + "epoch": 0.9581, + "grad_norm": 4.5318756103515625, + "learning_rate": 1.0676734711369762e-07, + "loss": 0.2779, + "step": 19162 + }, + { + "epoch": 0.9582, + "grad_norm": 1.4907749891281128, + "learning_rate": 1.0625922931766786e-07, + "loss": 0.9254, + "step": 19164 + }, + { + "epoch": 0.9583, + "grad_norm": 6.205759048461914, + "learning_rate": 1.0575231704393895e-07, + "loss": 1.0462, + "step": 19166 + }, + { + "epoch": 0.9584, + "grad_norm": 3.679921865463257, + "learning_rate": 1.052466103542793e-07, + "loss": 0.746, + "step": 19168 + }, + { + "epoch": 0.9585, + "grad_norm": 4.448284149169922, + "learning_rate": 1.0474210931030516e-07, + "loss": 1.0496, + "step": 19170 + }, + { + "epoch": 0.9586, + "grad_norm": 2.2598018646240234, + "learning_rate": 1.0423881397349067e-07, + "loss": 0.773, + "step": 19172 + }, + { + "epoch": 0.9587, + "grad_norm": 2.8647408485412598, + "learning_rate": 1.0373672440515902e-07, + "loss": 1.1916, + "step": 19174 + }, + { + "epoch": 0.9588, + "grad_norm": 3.6451661586761475, + "learning_rate": 1.0323584066648795e-07, + "loss": 0.2087, + "step": 19176 + }, + { + "epoch": 0.9589, + "grad_norm": 5.04844331741333, + "learning_rate": 1.0273616281851084e-07, + "loss": 0.6494, + "step": 19178 + }, + { + "epoch": 0.959, + "grad_norm": 1.7412806749343872, + "learning_rate": 1.0223769092211012e-07, + "loss": 0.8719, + "step": 19180 + }, + { + "epoch": 0.9591, + "grad_norm": 9.143796920776367, + "learning_rate": 1.0174042503802495e-07, + "loss": 1.7916, + "step": 19182 + }, + { + "epoch": 0.9592, + "grad_norm": 3.2979493141174316, + "learning_rate": 1.0124436522684244e-07, + "loss": 1.4215, + "step": 19184 + }, + { + "epoch": 0.9593, + "grad_norm": 1.9190149307250977, + "learning_rate": 1.0074951154900869e-07, + "loss": 0.5173, + "step": 19186 + }, + { + "epoch": 0.9594, + "grad_norm": 7.1898274421691895, + "learning_rate": 1.002558640648199e-07, + "loss": 1.562, + "step": 19188 + }, + { + "epoch": 0.9595, + "grad_norm": 3.441046714782715, + "learning_rate": 9.976342283442464e-08, + "loss": 0.7305, + "step": 19190 + }, + { + "epoch": 0.9596, + "grad_norm": 6.728073596954346, + "learning_rate": 9.9272187917826e-08, + "loss": 0.5228, + "step": 19192 + }, + { + "epoch": 0.9597, + "grad_norm": 1.7113866806030273, + "learning_rate": 9.878215937487834e-08, + "loss": 0.5763, + "step": 19194 + }, + { + "epoch": 0.9598, + "grad_norm": 4.344182014465332, + "learning_rate": 9.829333726529056e-08, + "loss": 0.1106, + "step": 19196 + }, + { + "epoch": 0.9599, + "grad_norm": 16.1227970123291, + "learning_rate": 9.780572164862612e-08, + "loss": 0.4932, + "step": 19198 + }, + { + "epoch": 0.96, + "grad_norm": 2.4629995822906494, + "learning_rate": 9.731931258429638e-08, + "loss": 0.1607, + "step": 19200 + }, + { + "epoch": 0.9601, + "grad_norm": 5.4276347160339355, + "learning_rate": 9.683411013157174e-08, + "loss": 0.658, + "step": 19202 + }, + { + "epoch": 0.9602, + "grad_norm": 5.9654154777526855, + "learning_rate": 9.635011434957153e-08, + "loss": 0.8184, + "step": 19204 + }, + { + "epoch": 0.9603, + "grad_norm": 4.199613094329834, + "learning_rate": 9.58673252972675e-08, + "loss": 0.4102, + "step": 19206 + }, + { + "epoch": 0.9604, + "grad_norm": 4.399903297424316, + "learning_rate": 9.538574303348813e-08, + "loss": 2.1645, + "step": 19208 + }, + { + "epoch": 0.9605, + "grad_norm": 2.9650514125823975, + "learning_rate": 9.490536761691205e-08, + "loss": 0.7116, + "step": 19210 + }, + { + "epoch": 0.9606, + "grad_norm": 2.709960699081421, + "learning_rate": 9.442619910607131e-08, + "loss": 0.7777, + "step": 19212 + }, + { + "epoch": 0.9607, + "grad_norm": 2.718595504760742, + "learning_rate": 9.394823755935146e-08, + "loss": 0.2601, + "step": 19214 + }, + { + "epoch": 0.9608, + "grad_norm": 5.604752063751221, + "learning_rate": 9.347148303499143e-08, + "loss": 1.1964, + "step": 19216 + }, + { + "epoch": 0.9609, + "grad_norm": 2.6008293628692627, + "learning_rate": 9.299593559108033e-08, + "loss": 0.7214, + "step": 19218 + }, + { + "epoch": 0.961, + "grad_norm": 4.57860803604126, + "learning_rate": 9.252159528556404e-08, + "loss": 1.0895, + "step": 19220 + }, + { + "epoch": 0.9611, + "grad_norm": 1.27571439743042, + "learning_rate": 9.204846217623853e-08, + "loss": 0.2989, + "step": 19222 + }, + { + "epoch": 0.9612, + "grad_norm": 5.960310935974121, + "learning_rate": 9.157653632075435e-08, + "loss": 0.6947, + "step": 19224 + }, + { + "epoch": 0.9613, + "grad_norm": 5.4994587898254395, + "learning_rate": 9.110581777661331e-08, + "loss": 1.1812, + "step": 19226 + }, + { + "epoch": 0.9614, + "grad_norm": 6.765439987182617, + "learning_rate": 9.063630660117172e-08, + "loss": 0.8656, + "step": 19228 + }, + { + "epoch": 0.9615, + "grad_norm": 8.8762788772583, + "learning_rate": 9.016800285163718e-08, + "loss": 0.9397, + "step": 19230 + }, + { + "epoch": 0.9616, + "grad_norm": 1.902753472328186, + "learning_rate": 8.970090658507291e-08, + "loss": 0.4795, + "step": 19232 + }, + { + "epoch": 0.9617, + "grad_norm": 4.41359806060791, + "learning_rate": 8.923501785839117e-08, + "loss": 0.881, + "step": 19234 + }, + { + "epoch": 0.9618, + "grad_norm": 3.755784511566162, + "learning_rate": 8.877033672835988e-08, + "loss": 1.2844, + "step": 19236 + }, + { + "epoch": 0.9619, + "grad_norm": 4.904941082000732, + "learning_rate": 8.830686325160043e-08, + "loss": 1.0982, + "step": 19238 + }, + { + "epoch": 0.962, + "grad_norm": 16.482534408569336, + "learning_rate": 8.784459748458318e-08, + "loss": 1.2238, + "step": 19240 + }, + { + "epoch": 0.9621, + "grad_norm": 11.196466445922852, + "learning_rate": 8.73835394836342e-08, + "loss": 0.9663, + "step": 19242 + }, + { + "epoch": 0.9622, + "grad_norm": 8.071429252624512, + "learning_rate": 8.692368930493522e-08, + "loss": 0.3551, + "step": 19244 + }, + { + "epoch": 0.9623, + "grad_norm": 4.380390167236328, + "learning_rate": 8.646504700451253e-08, + "loss": 0.5943, + "step": 19246 + }, + { + "epoch": 0.9624, + "grad_norm": 3.407837152481079, + "learning_rate": 8.600761263825475e-08, + "loss": 0.7066, + "step": 19248 + }, + { + "epoch": 0.9625, + "grad_norm": 3.428234100341797, + "learning_rate": 8.555138626189619e-08, + "loss": 0.7909, + "step": 19250 + }, + { + "epoch": 0.9626, + "grad_norm": 5.108121395111084, + "learning_rate": 8.509636793102683e-08, + "loss": 0.5903, + "step": 19252 + }, + { + "epoch": 0.9627, + "grad_norm": 14.08668327331543, + "learning_rate": 8.46425577010912e-08, + "loss": 1.0508, + "step": 19254 + }, + { + "epoch": 0.9628, + "grad_norm": 4.480656147003174, + "learning_rate": 8.418995562738286e-08, + "loss": 0.2697, + "step": 19256 + }, + { + "epoch": 0.9629, + "grad_norm": 6.624965667724609, + "learning_rate": 8.373856176505101e-08, + "loss": 1.3001, + "step": 19258 + }, + { + "epoch": 0.963, + "grad_norm": 3.804664134979248, + "learning_rate": 8.328837616909612e-08, + "loss": 0.8428, + "step": 19260 + }, + { + "epoch": 0.9631, + "grad_norm": 9.578962326049805, + "learning_rate": 8.283939889437209e-08, + "loss": 0.4824, + "step": 19262 + }, + { + "epoch": 0.9632, + "grad_norm": 17.506820678710938, + "learning_rate": 8.239162999558403e-08, + "loss": 0.8144, + "step": 19264 + }, + { + "epoch": 0.9633, + "grad_norm": 7.634217262268066, + "learning_rate": 8.194506952729386e-08, + "loss": 0.1632, + "step": 19266 + }, + { + "epoch": 0.9634, + "grad_norm": 2.9949872493743896, + "learning_rate": 8.149971754391251e-08, + "loss": 1.1514, + "step": 19268 + }, + { + "epoch": 0.9635, + "grad_norm": 4.473271369934082, + "learning_rate": 8.105557409970433e-08, + "loss": 0.902, + "step": 19270 + }, + { + "epoch": 0.9636, + "grad_norm": 4.77285623550415, + "learning_rate": 8.061263924878604e-08, + "loss": 0.7321, + "step": 19272 + }, + { + "epoch": 0.9637, + "grad_norm": 15.369714736938477, + "learning_rate": 8.017091304513003e-08, + "loss": 1.1291, + "step": 19274 + }, + { + "epoch": 0.9638, + "grad_norm": 5.724766731262207, + "learning_rate": 7.973039554255768e-08, + "loss": 1.2134, + "step": 19276 + }, + { + "epoch": 0.9639, + "grad_norm": 0.3864465057849884, + "learning_rate": 7.929108679474607e-08, + "loss": 0.2305, + "step": 19278 + }, + { + "epoch": 0.964, + "grad_norm": 9.183984756469727, + "learning_rate": 7.885298685522235e-08, + "loss": 0.6986, + "step": 19280 + }, + { + "epoch": 0.9641, + "grad_norm": 3.751142740249634, + "learning_rate": 7.841609577736719e-08, + "loss": 1.0493, + "step": 19282 + }, + { + "epoch": 0.9642, + "grad_norm": 4.2353739738464355, + "learning_rate": 7.798041361441688e-08, + "loss": 0.2685, + "step": 19284 + }, + { + "epoch": 0.9643, + "grad_norm": 3.506911516189575, + "learning_rate": 7.754594041945562e-08, + "loss": 1.0822, + "step": 19286 + }, + { + "epoch": 0.9644, + "grad_norm": 4.836272239685059, + "learning_rate": 7.71126762454233e-08, + "loss": 1.1537, + "step": 19288 + }, + { + "epoch": 0.9645, + "grad_norm": 19.833837509155273, + "learning_rate": 7.66806211451132e-08, + "loss": 0.8644, + "step": 19290 + }, + { + "epoch": 0.9646, + "grad_norm": 1.3978720903396606, + "learning_rate": 7.624977517116772e-08, + "loss": 0.9, + "step": 19292 + }, + { + "epoch": 0.9647, + "grad_norm": 7.7393269538879395, + "learning_rate": 7.582013837608592e-08, + "loss": 1.0676, + "step": 19294 + }, + { + "epoch": 0.9648, + "grad_norm": 2.5723180770874023, + "learning_rate": 7.539171081221597e-08, + "loss": 1.301, + "step": 19296 + }, + { + "epoch": 0.9649, + "grad_norm": 4.1392974853515625, + "learning_rate": 7.496449253176274e-08, + "loss": 0.6986, + "step": 19298 + }, + { + "epoch": 0.965, + "grad_norm": 7.164106845855713, + "learning_rate": 7.453848358678018e-08, + "loss": 1.4089, + "step": 19300 + }, + { + "epoch": 0.9651, + "grad_norm": 4.430750370025635, + "learning_rate": 7.411368402917563e-08, + "loss": 0.9427, + "step": 19302 + }, + { + "epoch": 0.9652, + "grad_norm": 4.975013732910156, + "learning_rate": 7.369009391070992e-08, + "loss": 0.501, + "step": 19304 + }, + { + "epoch": 0.9653, + "grad_norm": 4.489003658294678, + "learning_rate": 7.326771328299732e-08, + "loss": 1.5555, + "step": 19306 + }, + { + "epoch": 0.9654, + "grad_norm": 10.123655319213867, + "learning_rate": 7.284654219750332e-08, + "loss": 0.6954, + "step": 19308 + }, + { + "epoch": 0.9655, + "grad_norm": 13.821763038635254, + "learning_rate": 7.242658070554465e-08, + "loss": 0.9581, + "step": 19310 + }, + { + "epoch": 0.9656, + "grad_norm": 8.929611206054688, + "learning_rate": 7.200782885829482e-08, + "loss": 0.6343, + "step": 19312 + }, + { + "epoch": 0.9657, + "grad_norm": 11.78345012664795, + "learning_rate": 7.159028670677526e-08, + "loss": 0.5791, + "step": 19314 + }, + { + "epoch": 0.9658, + "grad_norm": 1.240849256515503, + "learning_rate": 7.117395430186414e-08, + "loss": 0.4069, + "step": 19316 + }, + { + "epoch": 0.9659, + "grad_norm": 2.2971580028533936, + "learning_rate": 7.075883169428755e-08, + "loss": 0.8032, + "step": 19318 + }, + { + "epoch": 0.966, + "grad_norm": 5.281668186187744, + "learning_rate": 7.034491893463059e-08, + "loss": 0.8676, + "step": 19320 + }, + { + "epoch": 0.9661, + "grad_norm": 4.382662773132324, + "learning_rate": 6.993221607332401e-08, + "loss": 0.406, + "step": 19322 + }, + { + "epoch": 0.9662, + "grad_norm": 6.363682746887207, + "learning_rate": 6.95207231606576e-08, + "loss": 0.9748, + "step": 19324 + }, + { + "epoch": 0.9663, + "grad_norm": 0.06852955371141434, + "learning_rate": 6.911044024676683e-08, + "loss": 0.5453, + "step": 19326 + }, + { + "epoch": 0.9664, + "grad_norm": 3.7637453079223633, + "learning_rate": 6.870136738164612e-08, + "loss": 0.4383, + "step": 19328 + }, + { + "epoch": 0.9665, + "grad_norm": 5.6677374839782715, + "learning_rate": 6.829350461514007e-08, + "loss": 0.8663, + "step": 19330 + }, + { + "epoch": 0.9666, + "grad_norm": 3.2688496112823486, + "learning_rate": 6.788685199694222e-08, + "loss": 0.7075, + "step": 19332 + }, + { + "epoch": 0.9667, + "grad_norm": 3.535930633544922, + "learning_rate": 6.748140957660632e-08, + "loss": 1.3913, + "step": 19334 + }, + { + "epoch": 0.9668, + "grad_norm": 2.3497464656829834, + "learning_rate": 6.707717740353059e-08, + "loss": 0.2956, + "step": 19336 + }, + { + "epoch": 0.9669, + "grad_norm": 2.9321072101593018, + "learning_rate": 6.667415552697121e-08, + "loss": 1.3203, + "step": 19338 + }, + { + "epoch": 0.967, + "grad_norm": 8.505166053771973, + "learning_rate": 6.627234399603554e-08, + "loss": 1.0606, + "step": 19340 + }, + { + "epoch": 0.9671, + "grad_norm": 2.9601426124572754, + "learning_rate": 6.587174285968223e-08, + "loss": 0.8419, + "step": 19342 + }, + { + "epoch": 0.9672, + "grad_norm": 5.634332180023193, + "learning_rate": 6.547235216672443e-08, + "loss": 1.0108, + "step": 19344 + }, + { + "epoch": 0.9673, + "grad_norm": 2.933441400527954, + "learning_rate": 6.507417196582544e-08, + "loss": 0.5043, + "step": 19346 + }, + { + "epoch": 0.9674, + "grad_norm": 5.588244915008545, + "learning_rate": 6.4677202305502e-08, + "loss": 1.1976, + "step": 19348 + }, + { + "epoch": 0.9675, + "grad_norm": 2.6879348754882812, + "learning_rate": 6.428144323412544e-08, + "loss": 1.5404, + "step": 19350 + }, + { + "epoch": 0.9676, + "grad_norm": 7.830679893493652, + "learning_rate": 6.388689479991606e-08, + "loss": 1.2525, + "step": 19352 + }, + { + "epoch": 0.9677, + "grad_norm": 4.566296577453613, + "learning_rate": 6.349355705094983e-08, + "loss": 0.5123, + "step": 19354 + }, + { + "epoch": 0.9678, + "grad_norm": 2.6216914653778076, + "learning_rate": 6.310143003515179e-08, + "loss": 0.6234, + "step": 19356 + }, + { + "epoch": 0.9679, + "grad_norm": 5.009812831878662, + "learning_rate": 6.271051380030368e-08, + "loss": 0.3867, + "step": 19358 + }, + { + "epoch": 0.968, + "grad_norm": 2.3545916080474854, + "learning_rate": 6.232080839403631e-08, + "loss": 1.0814, + "step": 19360 + }, + { + "epoch": 0.9681, + "grad_norm": 10.952310562133789, + "learning_rate": 6.193231386383391e-08, + "loss": 0.5878, + "step": 19362 + }, + { + "epoch": 0.9682, + "grad_norm": 3.0508081912994385, + "learning_rate": 6.154503025703418e-08, + "loss": 0.6794, + "step": 19364 + }, + { + "epoch": 0.9683, + "grad_norm": 6.07178258895874, + "learning_rate": 6.115895762082602e-08, + "loss": 0.4074, + "step": 19366 + }, + { + "epoch": 0.9684, + "grad_norm": 6.268693447113037, + "learning_rate": 6.07740960022507e-08, + "loss": 0.8137, + "step": 19368 + }, + { + "epoch": 0.9685, + "grad_norm": 5.893105506896973, + "learning_rate": 6.039044544820404e-08, + "loss": 0.9031, + "step": 19370 + }, + { + "epoch": 0.9686, + "grad_norm": 4.936570167541504, + "learning_rate": 6.000800600542977e-08, + "loss": 1.0029, + "step": 19372 + }, + { + "epoch": 0.9687, + "grad_norm": 3.77276349067688, + "learning_rate": 5.96267777205295e-08, + "loss": 0.4153, + "step": 19374 + }, + { + "epoch": 0.9688, + "grad_norm": 2.5780084133148193, + "learning_rate": 5.9246760639953824e-08, + "loss": 0.7394, + "step": 19376 + }, + { + "epoch": 0.9689, + "grad_norm": 5.3824944496154785, + "learning_rate": 5.886795481000795e-08, + "loss": 1.2547, + "step": 19378 + }, + { + "epoch": 0.969, + "grad_norm": 11.250140190124512, + "learning_rate": 5.849036027684607e-08, + "loss": 0.9293, + "step": 19380 + }, + { + "epoch": 0.9691, + "grad_norm": 2.819120407104492, + "learning_rate": 5.8113977086478037e-08, + "loss": 0.1313, + "step": 19382 + }, + { + "epoch": 0.9692, + "grad_norm": 2.998793840408325, + "learning_rate": 5.7738805284764945e-08, + "loss": 0.5355, + "step": 19384 + }, + { + "epoch": 0.9693, + "grad_norm": 2.2248375415802, + "learning_rate": 5.736484491742133e-08, + "loss": 0.3939, + "step": 19386 + }, + { + "epoch": 0.9694, + "grad_norm": 5.991446018218994, + "learning_rate": 5.699209603001077e-08, + "loss": 1.0528, + "step": 19388 + }, + { + "epoch": 0.9695, + "grad_norm": 5.3890380859375, + "learning_rate": 5.662055866795357e-08, + "loss": 1.2345, + "step": 19390 + }, + { + "epoch": 0.9696, + "grad_norm": 3.392103672027588, + "learning_rate": 5.625023287652021e-08, + "loss": 0.966, + "step": 19392 + }, + { + "epoch": 0.9697, + "grad_norm": 7.790553092956543, + "learning_rate": 5.588111870083346e-08, + "loss": 0.6911, + "step": 19394 + }, + { + "epoch": 0.9698, + "grad_norm": 3.623988628387451, + "learning_rate": 5.5513216185867356e-08, + "loss": 0.6659, + "step": 19396 + }, + { + "epoch": 0.9699, + "grad_norm": 2.2942450046539307, + "learning_rate": 5.514652537645271e-08, + "loss": 0.5715, + "step": 19398 + }, + { + "epoch": 0.97, + "grad_norm": 5.707869529724121, + "learning_rate": 5.4781046317267103e-08, + "loss": 1.1785, + "step": 19400 + }, + { + "epoch": 0.9701, + "grad_norm": 2.6863956451416016, + "learning_rate": 5.4416779052843814e-08, + "loss": 0.6199, + "step": 19402 + }, + { + "epoch": 0.9702, + "grad_norm": 6.302961826324463, + "learning_rate": 5.4053723627567336e-08, + "loss": 0.8454, + "step": 19404 + }, + { + "epoch": 0.9703, + "grad_norm": 3.0985453128814697, + "learning_rate": 5.369188008567672e-08, + "loss": 0.3462, + "step": 19406 + }, + { + "epoch": 0.9704, + "grad_norm": 0.1436418890953064, + "learning_rate": 5.3331248471258926e-08, + "loss": 0.2996, + "step": 19408 + }, + { + "epoch": 0.9705, + "grad_norm": 4.2518391609191895, + "learning_rate": 5.29718288282588e-08, + "loss": 1.1169, + "step": 19410 + }, + { + "epoch": 0.9706, + "grad_norm": 7.721410751342773, + "learning_rate": 5.261362120046687e-08, + "loss": 1.309, + "step": 19412 + }, + { + "epoch": 0.9707, + "grad_norm": 4.031399726867676, + "learning_rate": 5.2256625631532663e-08, + "loss": 0.8593, + "step": 19414 + }, + { + "epoch": 0.9708, + "grad_norm": 4.121155261993408, + "learning_rate": 5.190084216495361e-08, + "loss": 0.8897, + "step": 19416 + }, + { + "epoch": 0.9709, + "grad_norm": 5.714001178741455, + "learning_rate": 5.154627084408059e-08, + "loss": 0.5427, + "step": 19418 + }, + { + "epoch": 0.971, + "grad_norm": 4.989934921264648, + "learning_rate": 5.119291171211793e-08, + "loss": 0.5224, + "step": 19420 + }, + { + "epoch": 0.9711, + "grad_norm": 3.2029073238372803, + "learning_rate": 5.084076481212119e-08, + "loss": 1.9338, + "step": 19422 + }, + { + "epoch": 0.9712, + "grad_norm": 3.5540292263031006, + "learning_rate": 5.048983018699827e-08, + "loss": 0.7795, + "step": 19424 + }, + { + "epoch": 0.9713, + "grad_norm": 3.8922805786132812, + "learning_rate": 5.0140107879509403e-08, + "loss": 0.2621, + "step": 19426 + }, + { + "epoch": 0.9714, + "grad_norm": 2.434065103530884, + "learning_rate": 4.979159793226718e-08, + "loss": 0.761, + "step": 19428 + }, + { + "epoch": 0.9715, + "grad_norm": 3.5385098457336426, + "learning_rate": 4.944430038773762e-08, + "loss": 1.4298, + "step": 19430 + }, + { + "epoch": 0.9716, + "grad_norm": 2.1356208324432373, + "learning_rate": 4.9098215288235776e-08, + "loss": 0.6777, + "step": 19432 + }, + { + "epoch": 0.9717, + "grad_norm": 7.965282440185547, + "learning_rate": 4.875334267593235e-08, + "loss": 0.9757, + "step": 19434 + }, + { + "epoch": 0.9718, + "grad_norm": 8.96241283416748, + "learning_rate": 4.840968259284817e-08, + "loss": 1.0184, + "step": 19436 + }, + { + "epoch": 0.9719, + "grad_norm": 8.737812042236328, + "learning_rate": 4.806723508085864e-08, + "loss": 0.9576, + "step": 19438 + }, + { + "epoch": 0.972, + "grad_norm": 14.038747787475586, + "learning_rate": 4.772600018168816e-08, + "loss": 1.3106, + "step": 19440 + }, + { + "epoch": 0.9721, + "grad_norm": 3.1047730445861816, + "learning_rate": 4.7385977936916796e-08, + "loss": 1.2406, + "step": 19442 + }, + { + "epoch": 0.9722, + "grad_norm": 12.365687370300293, + "learning_rate": 4.704716838797363e-08, + "loss": 0.8205, + "step": 19444 + }, + { + "epoch": 0.9723, + "grad_norm": 5.278129577636719, + "learning_rate": 4.670957157614453e-08, + "loss": 0.7435, + "step": 19446 + }, + { + "epoch": 0.9724, + "grad_norm": 3.4291064739227295, + "learning_rate": 4.6373187542561036e-08, + "loss": 1.096, + "step": 19448 + }, + { + "epoch": 0.9725, + "grad_norm": 5.415245056152344, + "learning_rate": 4.603801632821148e-08, + "loss": 0.9882, + "step": 19450 + }, + { + "epoch": 0.9726, + "grad_norm": 4.597330570220947, + "learning_rate": 4.570405797393762e-08, + "loss": 0.6174, + "step": 19452 + }, + { + "epoch": 0.9727, + "grad_norm": 3.4645864963531494, + "learning_rate": 4.537131252042914e-08, + "loss": 0.8835, + "step": 19454 + }, + { + "epoch": 0.9728, + "grad_norm": 2.987510919570923, + "learning_rate": 4.503978000823028e-08, + "loss": 0.5588, + "step": 19456 + }, + { + "epoch": 0.9729, + "grad_norm": 8.62527084350586, + "learning_rate": 4.470946047773761e-08, + "loss": 0.8641, + "step": 19458 + }, + { + "epoch": 0.973, + "grad_norm": 3.363814115524292, + "learning_rate": 4.438035396920004e-08, + "loss": 0.9171, + "step": 19460 + }, + { + "epoch": 0.9731, + "grad_norm": 3.8478639125823975, + "learning_rate": 4.405246052271772e-08, + "loss": 0.7029, + "step": 19462 + }, + { + "epoch": 0.9732, + "grad_norm": 3.6886396408081055, + "learning_rate": 4.3725780178243135e-08, + "loss": 1.3158, + "step": 19464 + }, + { + "epoch": 0.9733, + "grad_norm": 3.5695760250091553, + "learning_rate": 4.3400312975581114e-08, + "loss": 1.3344, + "step": 19466 + }, + { + "epoch": 0.9734, + "grad_norm": 1.6420527696609497, + "learning_rate": 4.3076058954391045e-08, + "loss": 0.3865, + "step": 19468 + }, + { + "epoch": 0.9735, + "grad_norm": 8.595229148864746, + "learning_rate": 4.275301815417909e-08, + "loss": 0.6503, + "step": 19470 + }, + { + "epoch": 0.9736, + "grad_norm": 4.797027111053467, + "learning_rate": 4.2431190614309334e-08, + "loss": 0.4548, + "step": 19472 + }, + { + "epoch": 0.9737, + "grad_norm": 7.2887468338012695, + "learning_rate": 4.211057637399374e-08, + "loss": 1.5726, + "step": 19474 + }, + { + "epoch": 0.9738, + "grad_norm": 3.8407177925109863, + "learning_rate": 4.179117547229883e-08, + "loss": 1.1098, + "step": 19476 + }, + { + "epoch": 0.9739, + "grad_norm": 7.63332462310791, + "learning_rate": 4.147298794814347e-08, + "loss": 0.6583, + "step": 19478 + }, + { + "epoch": 0.974, + "grad_norm": 3.1468284130096436, + "learning_rate": 4.115601384029666e-08, + "loss": 1.3834, + "step": 19480 + }, + { + "epoch": 0.9741, + "grad_norm": 4.6850080490112305, + "learning_rate": 4.084025318738083e-08, + "loss": 0.4083, + "step": 19482 + }, + { + "epoch": 0.9742, + "grad_norm": 7.223211765289307, + "learning_rate": 4.052570602787076e-08, + "loss": 0.8623, + "step": 19484 + }, + { + "epoch": 0.9743, + "grad_norm": 7.000033378601074, + "learning_rate": 4.021237240009468e-08, + "loss": 1.0199, + "step": 19486 + }, + { + "epoch": 0.9744, + "grad_norm": 6.174544811248779, + "learning_rate": 3.990025234222872e-08, + "loss": 0.2158, + "step": 19488 + }, + { + "epoch": 0.9745, + "grad_norm": 8.071942329406738, + "learning_rate": 3.9589345892304673e-08, + "loss": 1.0821, + "step": 19490 + }, + { + "epoch": 0.9746, + "grad_norm": 3.6260921955108643, + "learning_rate": 3.927965308820558e-08, + "loss": 0.5852, + "step": 19492 + }, + { + "epoch": 0.9747, + "grad_norm": 5.05721378326416, + "learning_rate": 3.897117396766681e-08, + "loss": 0.7119, + "step": 19494 + }, + { + "epoch": 0.9748, + "grad_norm": 1.8357152938842773, + "learning_rate": 3.866390856827495e-08, + "loss": 1.0837, + "step": 19496 + }, + { + "epoch": 0.9749, + "grad_norm": 2.4693660736083984, + "learning_rate": 3.8357856927471185e-08, + "loss": 0.746, + "step": 19498 + }, + { + "epoch": 0.975, + "grad_norm": 1.4629931449890137, + "learning_rate": 3.805301908254455e-08, + "loss": 0.8093, + "step": 19500 + }, + { + "epoch": 0.9751, + "grad_norm": 3.8855438232421875, + "learning_rate": 3.7749395070639795e-08, + "loss": 0.8222, + "step": 19502 + }, + { + "epoch": 0.9752, + "grad_norm": 2.5188517570495605, + "learning_rate": 3.7446984928753984e-08, + "loss": 0.5691, + "step": 19504 + }, + { + "epoch": 0.9753, + "grad_norm": 2.170273780822754, + "learning_rate": 3.7145788693732086e-08, + "loss": 1.0015, + "step": 19506 + }, + { + "epoch": 0.9754, + "grad_norm": 8.701078414916992, + "learning_rate": 3.684580640227586e-08, + "loss": 0.9742, + "step": 19508 + }, + { + "epoch": 0.9755, + "grad_norm": 6.431429862976074, + "learning_rate": 3.654703809093607e-08, + "loss": 1.0276, + "step": 19510 + }, + { + "epoch": 0.9756, + "grad_norm": 14.821151733398438, + "learning_rate": 3.6249483796116924e-08, + "loss": 0.6781, + "step": 19512 + }, + { + "epoch": 0.9757, + "grad_norm": 3.7955739498138428, + "learning_rate": 3.595314355407609e-08, + "loss": 1.0804, + "step": 19514 + }, + { + "epoch": 0.9758, + "grad_norm": 2.8753902912139893, + "learning_rate": 3.565801740092023e-08, + "loss": 1.1388, + "step": 19516 + }, + { + "epoch": 0.9759, + "grad_norm": 5.332594871520996, + "learning_rate": 3.536410537260948e-08, + "loss": 0.9739, + "step": 19518 + }, + { + "epoch": 0.976, + "grad_norm": 4.68341064453125, + "learning_rate": 3.50714075049563e-08, + "loss": 0.8787, + "step": 19520 + }, + { + "epoch": 0.9761, + "grad_norm": 4.105252265930176, + "learning_rate": 3.47799238336266e-08, + "loss": 0.5365, + "step": 19522 + }, + { + "epoch": 0.9762, + "grad_norm": 4.020605564117432, + "learning_rate": 3.4489654394134206e-08, + "loss": 1.0306, + "step": 19524 + }, + { + "epoch": 0.9763, + "grad_norm": 8.666617393493652, + "learning_rate": 3.4200599221848594e-08, + "loss": 1.6121, + "step": 19526 + }, + { + "epoch": 0.9764, + "grad_norm": 4.624138355255127, + "learning_rate": 3.391275835199159e-08, + "loss": 0.7712, + "step": 19528 + }, + { + "epoch": 0.9765, + "grad_norm": 3.4346747398376465, + "learning_rate": 3.362613181963404e-08, + "loss": 1.0574, + "step": 19530 + }, + { + "epoch": 0.9766, + "grad_norm": 2.6128411293029785, + "learning_rate": 3.3340719659701315e-08, + "loss": 0.5618, + "step": 19532 + }, + { + "epoch": 0.9767, + "grad_norm": 3.267258882522583, + "learning_rate": 3.305652190696895e-08, + "loss": 0.8675, + "step": 19534 + }, + { + "epoch": 0.9768, + "grad_norm": 8.082592964172363, + "learning_rate": 3.2773538596068134e-08, + "loss": 1.9833, + "step": 19536 + }, + { + "epoch": 0.9769, + "grad_norm": 6.17704963684082, + "learning_rate": 3.249176976147683e-08, + "loss": 0.9576, + "step": 19538 + }, + { + "epoch": 0.977, + "grad_norm": 3.5265390872955322, + "learning_rate": 3.22112154375287e-08, + "loss": 1.4275, + "step": 19540 + }, + { + "epoch": 0.9771, + "grad_norm": 6.875016689300537, + "learning_rate": 3.1931875658408604e-08, + "loss": 0.7842, + "step": 19542 + }, + { + "epoch": 0.9772, + "grad_norm": 2.271233081817627, + "learning_rate": 3.165375045815266e-08, + "loss": 1.4323, + "step": 19544 + }, + { + "epoch": 0.9773, + "grad_norm": 3.964524984359741, + "learning_rate": 3.137683987065043e-08, + "loss": 0.8465, + "step": 19546 + }, + { + "epoch": 0.9774, + "grad_norm": 3.5163216590881348, + "learning_rate": 3.110114392964159e-08, + "loss": 1.368, + "step": 19548 + }, + { + "epoch": 0.9775, + "grad_norm": 4.012831211090088, + "learning_rate": 3.082666266872036e-08, + "loss": 0.3467, + "step": 19550 + }, + { + "epoch": 0.9776, + "grad_norm": 0.8475341796875, + "learning_rate": 3.0553396121330015e-08, + "loss": 0.292, + "step": 19552 + }, + { + "epoch": 0.9777, + "grad_norm": 14.773455619812012, + "learning_rate": 3.028134432076835e-08, + "loss": 0.8506, + "step": 19554 + }, + { + "epoch": 0.9778, + "grad_norm": 4.062407970428467, + "learning_rate": 3.001050730018218e-08, + "loss": 0.566, + "step": 19556 + }, + { + "epoch": 0.9779, + "grad_norm": 12.498695373535156, + "learning_rate": 2.974088509257511e-08, + "loss": 1.3195, + "step": 19558 + }, + { + "epoch": 0.978, + "grad_norm": 2.353377342224121, + "learning_rate": 2.947247773079753e-08, + "loss": 0.9756, + "step": 19560 + }, + { + "epoch": 0.9781, + "grad_norm": 2.78255295753479, + "learning_rate": 2.9205285247555504e-08, + "loss": 0.3766, + "step": 19562 + }, + { + "epoch": 0.9782, + "grad_norm": 1.105752944946289, + "learning_rate": 2.8939307675402983e-08, + "loss": 0.7967, + "step": 19564 + }, + { + "epoch": 0.9783, + "grad_norm": 4.072922229766846, + "learning_rate": 2.8674545046751822e-08, + "loss": 1.0255, + "step": 19566 + }, + { + "epoch": 0.9784, + "grad_norm": 3.6715667247772217, + "learning_rate": 2.8410997393860663e-08, + "loss": 0.9759, + "step": 19568 + }, + { + "epoch": 0.9785, + "grad_norm": 4.765913009643555, + "learning_rate": 2.8148664748842702e-08, + "loss": 0.7204, + "step": 19570 + }, + { + "epoch": 0.9786, + "grad_norm": 6.58888053894043, + "learning_rate": 2.7887547143662375e-08, + "loss": 0.7544, + "step": 19572 + }, + { + "epoch": 0.9787, + "grad_norm": 5.254945278167725, + "learning_rate": 2.762764461013423e-08, + "loss": 0.9159, + "step": 19574 + }, + { + "epoch": 0.9788, + "grad_norm": 3.217808246612549, + "learning_rate": 2.7368957179929602e-08, + "loss": 1.0228, + "step": 19576 + }, + { + "epoch": 0.9789, + "grad_norm": 3.690962314605713, + "learning_rate": 2.711148488456772e-08, + "loss": 0.7827, + "step": 19578 + }, + { + "epoch": 0.979, + "grad_norm": 18.95390510559082, + "learning_rate": 2.6855227755419046e-08, + "loss": 1.9247, + "step": 19580 + }, + { + "epoch": 0.9791, + "grad_norm": 9.894643783569336, + "learning_rate": 2.6600185823709712e-08, + "loss": 0.811, + "step": 19582 + }, + { + "epoch": 0.9792, + "grad_norm": 5.137407302856445, + "learning_rate": 2.6346359120514863e-08, + "loss": 1.0977, + "step": 19584 + }, + { + "epoch": 0.9793, + "grad_norm": 5.325346946716309, + "learning_rate": 2.6093747676763093e-08, + "loss": 0.6373, + "step": 19586 + }, + { + "epoch": 0.9794, + "grad_norm": 10.630491256713867, + "learning_rate": 2.584235152323422e-08, + "loss": 1.7929, + "step": 19588 + }, + { + "epoch": 0.9795, + "grad_norm": 10.950763702392578, + "learning_rate": 2.5592170690560415e-08, + "loss": 1.14, + "step": 19590 + }, + { + "epoch": 0.9796, + "grad_norm": 5.245797157287598, + "learning_rate": 2.5343205209225062e-08, + "loss": 0.8311, + "step": 19592 + }, + { + "epoch": 0.9797, + "grad_norm": 4.331976413726807, + "learning_rate": 2.5095455109562795e-08, + "loss": 0.6177, + "step": 19594 + }, + { + "epoch": 0.9798, + "grad_norm": 3.8350329399108887, + "learning_rate": 2.484892042176279e-08, + "loss": 0.862, + "step": 19596 + }, + { + "epoch": 0.9799, + "grad_norm": 8.101247787475586, + "learning_rate": 2.4603601175864357e-08, + "loss": 1.1207, + "step": 19598 + }, + { + "epoch": 0.98, + "grad_norm": 7.897077560424805, + "learning_rate": 2.4359497401758026e-08, + "loss": 1.1366, + "step": 19600 + }, + { + "epoch": 0.9801, + "grad_norm": 2.2400903701782227, + "learning_rate": 2.4116609129187786e-08, + "loss": 0.6457, + "step": 19602 + }, + { + "epoch": 0.9802, + "grad_norm": 1.837314486503601, + "learning_rate": 2.3874936387747738e-08, + "loss": 0.497, + "step": 19604 + }, + { + "epoch": 0.9803, + "grad_norm": 3.4610133171081543, + "learning_rate": 2.3634479206886552e-08, + "loss": 1.5036, + "step": 19606 + }, + { + "epoch": 0.9804, + "grad_norm": 4.511185169219971, + "learning_rate": 2.339523761590301e-08, + "loss": 0.7508, + "step": 19608 + }, + { + "epoch": 0.9805, + "grad_norm": 13.765979766845703, + "learning_rate": 2.315721164394713e-08, + "loss": 2.7692, + "step": 19610 + }, + { + "epoch": 0.9806, + "grad_norm": 6.461192607879639, + "learning_rate": 2.292040132002238e-08, + "loss": 1.0683, + "step": 19612 + }, + { + "epoch": 0.9807, + "grad_norm": 4.475201606750488, + "learning_rate": 2.268480667298234e-08, + "loss": 0.5187, + "step": 19614 + }, + { + "epoch": 0.9808, + "grad_norm": 4.0994744300842285, + "learning_rate": 2.2450427731534052e-08, + "loss": 0.5045, + "step": 19616 + }, + { + "epoch": 0.9809, + "grad_norm": 8.567634582519531, + "learning_rate": 2.221726452423689e-08, + "loss": 0.97, + "step": 19618 + }, + { + "epoch": 0.981, + "grad_norm": 5.679925918579102, + "learning_rate": 2.1985317079500358e-08, + "loss": 0.4893, + "step": 19620 + }, + { + "epoch": 0.9811, + "grad_norm": 7.0233001708984375, + "learning_rate": 2.175458542558517e-08, + "loss": 0.8183, + "step": 19622 + }, + { + "epoch": 0.9812, + "grad_norm": 6.003610610961914, + "learning_rate": 2.152506959060774e-08, + "loss": 0.7884, + "step": 19624 + }, + { + "epoch": 0.9813, + "grad_norm": 3.1577606201171875, + "learning_rate": 2.1296769602532352e-08, + "loss": 0.4073, + "step": 19626 + }, + { + "epoch": 0.9814, + "grad_norm": 5.533236026763916, + "learning_rate": 2.1069685489176762e-08, + "loss": 1.1796, + "step": 19628 + }, + { + "epoch": 0.9815, + "grad_norm": 1.0437498092651367, + "learning_rate": 2.0843817278209943e-08, + "loss": 0.7569, + "step": 19630 + }, + { + "epoch": 0.9816, + "grad_norm": 3.802222728729248, + "learning_rate": 2.061916499715544e-08, + "loss": 1.1152, + "step": 19632 + }, + { + "epoch": 0.9817, + "grad_norm": 10.065625190734863, + "learning_rate": 2.0395728673383575e-08, + "loss": 0.7912, + "step": 19634 + }, + { + "epoch": 0.9818, + "grad_norm": 1.019012212753296, + "learning_rate": 2.017350833412146e-08, + "loss": 0.214, + "step": 19636 + }, + { + "epoch": 0.9819, + "grad_norm": 3.7741618156433105, + "learning_rate": 1.995250400644633e-08, + "loss": 1.5215, + "step": 19638 + }, + { + "epoch": 0.982, + "grad_norm": 16.157127380371094, + "learning_rate": 1.973271571728441e-08, + "loss": 0.8108, + "step": 19640 + }, + { + "epoch": 0.9821, + "grad_norm": 5.979916572570801, + "learning_rate": 1.9514143493417624e-08, + "loss": 0.9182, + "step": 19642 + }, + { + "epoch": 0.9822, + "grad_norm": 4.297214508056641, + "learning_rate": 1.929678736148022e-08, + "loss": 1.1513, + "step": 19644 + }, + { + "epoch": 0.9823, + "grad_norm": 4.465670585632324, + "learning_rate": 1.908064734795323e-08, + "loss": 0.8627, + "step": 19646 + }, + { + "epoch": 0.9824, + "grad_norm": 4.940378665924072, + "learning_rate": 1.886572347917337e-08, + "loss": 0.6442, + "step": 19648 + }, + { + "epoch": 0.9825, + "grad_norm": 4.986958026885986, + "learning_rate": 1.86520157813308e-08, + "loss": 0.7384, + "step": 19650 + }, + { + "epoch": 0.9826, + "grad_norm": 3.54321026802063, + "learning_rate": 1.8439524280462474e-08, + "loss": 1.7867, + "step": 19652 + }, + { + "epoch": 0.9827, + "grad_norm": 6.470366954803467, + "learning_rate": 1.8228249002461007e-08, + "loss": 1.0144, + "step": 19654 + }, + { + "epoch": 0.9828, + "grad_norm": 2.2216765880584717, + "learning_rate": 1.8018189973069144e-08, + "loss": 0.2732, + "step": 19656 + }, + { + "epoch": 0.9829, + "grad_norm": 4.353637218475342, + "learning_rate": 1.7809347217881966e-08, + "loss": 1.3071, + "step": 19658 + }, + { + "epoch": 0.983, + "grad_norm": 5.980717658996582, + "learning_rate": 1.7601720762346895e-08, + "loss": 0.6303, + "step": 19660 + }, + { + "epoch": 0.9831, + "grad_norm": 2.96423602104187, + "learning_rate": 1.7395310631762585e-08, + "loss": 1.2738, + "step": 19662 + }, + { + "epoch": 0.9832, + "grad_norm": 7.005360126495361, + "learning_rate": 1.7190116851280024e-08, + "loss": 1.1141, + "step": 19664 + }, + { + "epoch": 0.9833, + "grad_norm": 3.981509208679199, + "learning_rate": 1.698613944589922e-08, + "loss": 1.1422, + "step": 19666 + }, + { + "epoch": 0.9834, + "grad_norm": 5.396299839019775, + "learning_rate": 1.678337844047695e-08, + "loss": 0.2914, + "step": 19668 + }, + { + "epoch": 0.9835, + "grad_norm": 4.5340895652771, + "learning_rate": 1.6581833859716788e-08, + "loss": 0.7455, + "step": 19670 + }, + { + "epoch": 0.9836, + "grad_norm": 4.504086494445801, + "learning_rate": 1.6381505728176872e-08, + "loss": 0.7485, + "step": 19672 + }, + { + "epoch": 0.9837, + "grad_norm": 2.815438747406006, + "learning_rate": 1.618239407026767e-08, + "loss": 0.7305, + "step": 19674 + }, + { + "epoch": 0.9838, + "grad_norm": 4.438483238220215, + "learning_rate": 1.5984498910249778e-08, + "loss": 0.5819, + "step": 19676 + }, + { + "epoch": 0.9839, + "grad_norm": 6.729186058044434, + "learning_rate": 1.578782027223502e-08, + "loss": 0.4559, + "step": 19678 + }, + { + "epoch": 0.984, + "grad_norm": 4.880221366882324, + "learning_rate": 1.5592358180189782e-08, + "loss": 0.6348, + "step": 19680 + }, + { + "epoch": 0.9841, + "grad_norm": 3.465579032897949, + "learning_rate": 1.5398112657929453e-08, + "loss": 0.516, + "step": 19682 + }, + { + "epoch": 0.9842, + "grad_norm": 8.032076835632324, + "learning_rate": 1.5205083729122883e-08, + "loss": 0.9144, + "step": 19684 + }, + { + "epoch": 0.9843, + "grad_norm": 6.991009712219238, + "learning_rate": 1.5013271417290143e-08, + "loss": 1.0209, + "step": 19686 + }, + { + "epoch": 0.9844, + "grad_norm": 0.5979531407356262, + "learning_rate": 1.482267574580143e-08, + "loss": 0.4061, + "step": 19688 + }, + { + "epoch": 0.9845, + "grad_norm": 1.786592960357666, + "learning_rate": 1.4633296737882607e-08, + "loss": 1.5181, + "step": 19690 + }, + { + "epoch": 0.9846, + "grad_norm": 3.108546257019043, + "learning_rate": 1.4445134416607442e-08, + "loss": 1.2066, + "step": 19692 + }, + { + "epoch": 0.9847, + "grad_norm": 13.04734992980957, + "learning_rate": 1.425818880490315e-08, + "loss": 0.6063, + "step": 19694 + }, + { + "epoch": 0.9848, + "grad_norm": 0.26500368118286133, + "learning_rate": 1.4072459925548176e-08, + "loss": 0.6147, + "step": 19696 + }, + { + "epoch": 0.9849, + "grad_norm": 12.028108596801758, + "learning_rate": 1.3887947801173308e-08, + "loss": 0.6309, + "step": 19698 + }, + { + "epoch": 0.985, + "grad_norm": 5.667196273803711, + "learning_rate": 1.370465245426167e-08, + "loss": 0.9208, + "step": 19700 + }, + { + "epoch": 0.9851, + "grad_norm": 2.8561513423919678, + "learning_rate": 1.3522573907145397e-08, + "loss": 0.9948, + "step": 19702 + }, + { + "epoch": 0.9852, + "grad_norm": 18.803144454956055, + "learning_rate": 1.3341712182012301e-08, + "loss": 1.1774, + "step": 19704 + }, + { + "epoch": 0.9853, + "grad_norm": 3.5431418418884277, + "learning_rate": 1.3162067300898085e-08, + "loss": 1.0664, + "step": 19706 + }, + { + "epoch": 0.9854, + "grad_norm": 5.754012584686279, + "learning_rate": 1.2983639285693018e-08, + "loss": 0.9206, + "step": 19708 + }, + { + "epoch": 0.9855, + "grad_norm": 1.2489960193634033, + "learning_rate": 1.2806428158138596e-08, + "loss": 0.6267, + "step": 19710 + }, + { + "epoch": 0.9856, + "grad_norm": 2.50722336769104, + "learning_rate": 1.2630433939825326e-08, + "loss": 0.7853, + "step": 19712 + }, + { + "epoch": 0.9857, + "grad_norm": 15.15906047821045, + "learning_rate": 1.2455656652198279e-08, + "loss": 1.064, + "step": 19714 + }, + { + "epoch": 0.9858, + "grad_norm": 1.9374390840530396, + "learning_rate": 1.2282096316554858e-08, + "loss": 0.7714, + "step": 19716 + }, + { + "epoch": 0.9859, + "grad_norm": 4.370643615722656, + "learning_rate": 1.2109752954042597e-08, + "loss": 0.6353, + "step": 19718 + }, + { + "epoch": 0.986, + "grad_norm": 7.913616180419922, + "learning_rate": 1.1938626585660252e-08, + "loss": 0.7376, + "step": 19720 + }, + { + "epoch": 0.9861, + "grad_norm": 2.454669713973999, + "learning_rate": 1.1768717232257809e-08, + "loss": 0.5059, + "step": 19722 + }, + { + "epoch": 0.9862, + "grad_norm": 4.1177215576171875, + "learning_rate": 1.1600024914540931e-08, + "loss": 0.7819, + "step": 19724 + }, + { + "epoch": 0.9863, + "grad_norm": 12.636998176574707, + "learning_rate": 1.1432549653063174e-08, + "loss": 1.8482, + "step": 19726 + }, + { + "epoch": 0.9864, + "grad_norm": 16.746562957763672, + "learning_rate": 1.126629146822933e-08, + "loss": 1.0139, + "step": 19728 + }, + { + "epoch": 0.9865, + "grad_norm": 4.6682586669921875, + "learning_rate": 1.1101250380300965e-08, + "loss": 0.3749, + "step": 19730 + }, + { + "epoch": 0.9866, + "grad_norm": 6.931367874145508, + "learning_rate": 1.0937426409384223e-08, + "loss": 0.6701, + "step": 19732 + }, + { + "epoch": 0.9867, + "grad_norm": 6.285030364990234, + "learning_rate": 1.077481957544202e-08, + "loss": 0.4774, + "step": 19734 + }, + { + "epoch": 0.9868, + "grad_norm": 3.1172008514404297, + "learning_rate": 1.0613429898287397e-08, + "loss": 1.2899, + "step": 19736 + }, + { + "epoch": 0.9869, + "grad_norm": 7.8422160148620605, + "learning_rate": 1.0453257397585736e-08, + "loss": 0.4332, + "step": 19738 + }, + { + "epoch": 0.987, + "grad_norm": 4.3643717765808105, + "learning_rate": 1.0294302092853647e-08, + "loss": 1.4243, + "step": 19740 + }, + { + "epoch": 0.9871, + "grad_norm": 13.329190254211426, + "learning_rate": 1.013656400345786e-08, + "loss": 1.6153, + "step": 19742 + }, + { + "epoch": 0.9872, + "grad_norm": 5.3483452796936035, + "learning_rate": 9.980043148619668e-09, + "loss": 1.1011, + "step": 19744 + }, + { + "epoch": 0.9873, + "grad_norm": 5.4529290199279785, + "learning_rate": 9.824739547410477e-09, + "loss": 1.0177, + "step": 19746 + }, + { + "epoch": 0.9874, + "grad_norm": 2.6931607723236084, + "learning_rate": 9.670653218752935e-09, + "loss": 0.2323, + "step": 19748 + }, + { + "epoch": 0.9875, + "grad_norm": 2.834500789642334, + "learning_rate": 9.517784181422018e-09, + "loss": 0.8228, + "step": 19750 + }, + { + "epoch": 0.9876, + "grad_norm": 4.694814205169678, + "learning_rate": 9.366132454046162e-09, + "loss": 0.6871, + "step": 19752 + }, + { + "epoch": 0.9877, + "grad_norm": 2.2176730632781982, + "learning_rate": 9.215698055100586e-09, + "loss": 0.901, + "step": 19754 + }, + { + "epoch": 0.9878, + "grad_norm": 17.28916358947754, + "learning_rate": 9.066481002918403e-09, + "loss": 0.9404, + "step": 19756 + }, + { + "epoch": 0.9879, + "grad_norm": 4.84968900680542, + "learning_rate": 8.918481315678407e-09, + "loss": 0.5921, + "step": 19758 + }, + { + "epoch": 0.988, + "grad_norm": 8.9998197555542, + "learning_rate": 8.771699011416169e-09, + "loss": 0.626, + "step": 19760 + }, + { + "epoch": 0.9881, + "grad_norm": 9.792930603027344, + "learning_rate": 8.62613410801627e-09, + "loss": 1.0824, + "step": 19762 + }, + { + "epoch": 0.9882, + "grad_norm": 0.5538368821144104, + "learning_rate": 8.481786623214527e-09, + "loss": 0.5109, + "step": 19764 + }, + { + "epoch": 0.9883, + "grad_norm": 3.0209848880767822, + "learning_rate": 8.33865657459909e-09, + "loss": 0.506, + "step": 19766 + }, + { + "epoch": 0.9884, + "grad_norm": 2.4914562702178955, + "learning_rate": 8.196743979610455e-09, + "loss": 1.1697, + "step": 19768 + }, + { + "epoch": 0.9885, + "grad_norm": 5.168426036834717, + "learning_rate": 8.056048855540344e-09, + "loss": 0.6267, + "step": 19770 + }, + { + "epoch": 0.9886, + "grad_norm": 3.7865114212036133, + "learning_rate": 7.916571219531711e-09, + "loss": 0.7211, + "step": 19772 + }, + { + "epoch": 0.9887, + "grad_norm": 12.051025390625, + "learning_rate": 7.778311088579849e-09, + "loss": 1.0736, + "step": 19774 + }, + { + "epoch": 0.9888, + "grad_norm": 5.365987777709961, + "learning_rate": 7.641268479531283e-09, + "loss": 1.1338, + "step": 19776 + }, + { + "epoch": 0.9889, + "grad_norm": 3.2710647583007812, + "learning_rate": 7.505443409083767e-09, + "loss": 1.6756, + "step": 19778 + }, + { + "epoch": 0.989, + "grad_norm": 1.7524441480636597, + "learning_rate": 7.370835893788508e-09, + "loss": 0.8376, + "step": 19780 + }, + { + "epoch": 0.9891, + "grad_norm": 1.6529126167297363, + "learning_rate": 7.237445950044608e-09, + "loss": 0.7719, + "step": 19782 + }, + { + "epoch": 0.9892, + "grad_norm": 3.322226047515869, + "learning_rate": 7.105273594107953e-09, + "loss": 1.2565, + "step": 19784 + }, + { + "epoch": 0.9893, + "grad_norm": 4.081569671630859, + "learning_rate": 6.974318842081218e-09, + "loss": 1.7668, + "step": 19786 + }, + { + "epoch": 0.9894, + "grad_norm": 3.389479160308838, + "learning_rate": 6.844581709921639e-09, + "loss": 0.9814, + "step": 19788 + }, + { + "epoch": 0.9895, + "grad_norm": 10.693714141845703, + "learning_rate": 6.716062213437679e-09, + "loss": 0.8618, + "step": 19790 + }, + { + "epoch": 0.9896, + "grad_norm": 3.1960039138793945, + "learning_rate": 6.588760368287928e-09, + "loss": 1.0603, + "step": 19792 + }, + { + "epoch": 0.9897, + "grad_norm": 4.747997760772705, + "learning_rate": 6.4626761899855285e-09, + "loss": 1.0877, + "step": 19794 + }, + { + "epoch": 0.9898, + "grad_norm": 5.983870506286621, + "learning_rate": 6.3378096938915276e-09, + "loss": 0.3816, + "step": 19796 + }, + { + "epoch": 0.9899, + "grad_norm": 2.4413063526153564, + "learning_rate": 6.214160895222643e-09, + "loss": 0.9567, + "step": 19798 + }, + { + "epoch": 0.99, + "grad_norm": 4.050498008728027, + "learning_rate": 6.091729809042379e-09, + "loss": 0.9517, + "step": 19800 + }, + { + "epoch": 0.9901, + "grad_norm": 7.6636152267456055, + "learning_rate": 5.970516450271025e-09, + "loss": 1.1945, + "step": 19802 + }, + { + "epoch": 0.9902, + "grad_norm": 4.106207847595215, + "learning_rate": 5.850520833676765e-09, + "loss": 0.3711, + "step": 19804 + }, + { + "epoch": 0.9903, + "grad_norm": 6.333311080932617, + "learning_rate": 5.731742973881238e-09, + "loss": 0.5183, + "step": 19806 + }, + { + "epoch": 0.9904, + "grad_norm": 10.632869720458984, + "learning_rate": 5.614182885357311e-09, + "loss": 1.8742, + "step": 19808 + }, + { + "epoch": 0.9905, + "grad_norm": 5.131982803344727, + "learning_rate": 5.497840582429082e-09, + "loss": 0.9188, + "step": 19810 + }, + { + "epoch": 0.9906, + "grad_norm": 4.007627010345459, + "learning_rate": 5.382716079271877e-09, + "loss": 0.5785, + "step": 19812 + }, + { + "epoch": 0.9907, + "grad_norm": 4.351119041442871, + "learning_rate": 5.268809389913365e-09, + "loss": 0.9951, + "step": 19814 + }, + { + "epoch": 0.9908, + "grad_norm": 2.5257222652435303, + "learning_rate": 5.156120528233555e-09, + "loss": 0.6821, + "step": 19816 + }, + { + "epoch": 0.9909, + "grad_norm": 8.66934871673584, + "learning_rate": 5.044649507963684e-09, + "loss": 1.1891, + "step": 19818 + }, + { + "epoch": 0.991, + "grad_norm": 1.2927913665771484, + "learning_rate": 4.9343963426840006e-09, + "loss": 0.0279, + "step": 19820 + }, + { + "epoch": 0.9911, + "grad_norm": 14.25020694732666, + "learning_rate": 4.825361045831534e-09, + "loss": 1.4385, + "step": 19822 + }, + { + "epoch": 0.9912, + "grad_norm": 5.932011127471924, + "learning_rate": 4.717543630688992e-09, + "loss": 0.6798, + "step": 19824 + }, + { + "epoch": 0.9913, + "grad_norm": 8.3759126663208, + "learning_rate": 4.610944110394755e-09, + "loss": 0.8845, + "step": 19826 + }, + { + "epoch": 0.9914, + "grad_norm": 3.156914710998535, + "learning_rate": 4.505562497938431e-09, + "loss": 0.349, + "step": 19828 + }, + { + "epoch": 0.9915, + "grad_norm": 5.410774230957031, + "learning_rate": 4.4013988061597515e-09, + "loss": 1.2271, + "step": 19830 + }, + { + "epoch": 0.9916, + "grad_norm": 3.2184977531433105, + "learning_rate": 4.298453047749674e-09, + "loss": 0.5312, + "step": 19832 + }, + { + "epoch": 0.9917, + "grad_norm": 9.416964530944824, + "learning_rate": 4.196725235253718e-09, + "loss": 1.2237, + "step": 19834 + }, + { + "epoch": 0.9918, + "grad_norm": 3.888516426086426, + "learning_rate": 4.096215381066415e-09, + "loss": 0.6967, + "step": 19836 + }, + { + "epoch": 0.9919, + "grad_norm": 4.726244926452637, + "learning_rate": 3.996923497434635e-09, + "loss": 0.7537, + "step": 19838 + }, + { + "epoch": 0.992, + "grad_norm": 3.0076777935028076, + "learning_rate": 3.898849596456477e-09, + "loss": 1.0179, + "step": 19840 + }, + { + "epoch": 0.9921, + "grad_norm": 5.096513271331787, + "learning_rate": 3.8019936900812735e-09, + "loss": 0.7979, + "step": 19842 + }, + { + "epoch": 0.9922, + "grad_norm": 6.845061779022217, + "learning_rate": 3.7063557901129144e-09, + "loss": 1.1569, + "step": 19844 + }, + { + "epoch": 0.9923, + "grad_norm": 5.943205833435059, + "learning_rate": 3.61193590820208e-09, + "loss": 1.2868, + "step": 19846 + }, + { + "epoch": 0.9924, + "grad_norm": 4.264725685119629, + "learning_rate": 3.518734055855122e-09, + "loss": 1.1757, + "step": 19848 + }, + { + "epoch": 0.9925, + "grad_norm": 5.649845123291016, + "learning_rate": 3.4267502444274013e-09, + "loss": 1.1304, + "step": 19850 + }, + { + "epoch": 0.9926, + "grad_norm": 7.8548784255981445, + "learning_rate": 3.3359844851277302e-09, + "loss": 0.5356, + "step": 19852 + }, + { + "epoch": 0.9927, + "grad_norm": 4.591884613037109, + "learning_rate": 3.2464367890150394e-09, + "loss": 0.7936, + "step": 19854 + }, + { + "epoch": 0.9928, + "grad_norm": 4.297606468200684, + "learning_rate": 3.1581071670006013e-09, + "loss": 0.8756, + "step": 19856 + }, + { + "epoch": 0.9929, + "grad_norm": 9.842260360717773, + "learning_rate": 3.070995629846918e-09, + "loss": 0.4151, + "step": 19858 + }, + { + "epoch": 0.993, + "grad_norm": 9.294713020324707, + "learning_rate": 2.9851021881688314e-09, + "loss": 1.6994, + "step": 19860 + }, + { + "epoch": 0.9931, + "grad_norm": 7.926208019256592, + "learning_rate": 2.9004268524313038e-09, + "loss": 1.1192, + "step": 19862 + }, + { + "epoch": 0.9932, + "grad_norm": 16.13078498840332, + "learning_rate": 2.8169696329527484e-09, + "loss": 1.5684, + "step": 19864 + }, + { + "epoch": 0.9933, + "grad_norm": 6.2420148849487305, + "learning_rate": 2.7347305399016975e-09, + "loss": 0.976, + "step": 19866 + }, + { + "epoch": 0.9934, + "grad_norm": 3.357750654220581, + "learning_rate": 2.6537095832990247e-09, + "loss": 0.4672, + "step": 19868 + }, + { + "epoch": 0.9935, + "grad_norm": 6.548162937164307, + "learning_rate": 2.573906773016832e-09, + "loss": 0.2371, + "step": 19870 + }, + { + "epoch": 0.9936, + "grad_norm": 2.7948215007781982, + "learning_rate": 2.495322118778454e-09, + "loss": 1.4456, + "step": 19872 + }, + { + "epoch": 0.9937, + "grad_norm": 4.390927791595459, + "learning_rate": 2.417955630159563e-09, + "loss": 0.7727, + "step": 19874 + }, + { + "epoch": 0.9938, + "grad_norm": 1.288460373878479, + "learning_rate": 2.341807316587064e-09, + "loss": 0.3162, + "step": 19876 + }, + { + "epoch": 0.9939, + "grad_norm": 7.473386764526367, + "learning_rate": 2.2668771873390895e-09, + "loss": 1.3113, + "step": 19878 + }, + { + "epoch": 0.994, + "grad_norm": 6.755417346954346, + "learning_rate": 2.193165251545004e-09, + "loss": 0.8021, + "step": 19880 + }, + { + "epoch": 0.9941, + "grad_norm": 6.435380458831787, + "learning_rate": 2.1206715181876225e-09, + "loss": 0.6844, + "step": 19882 + }, + { + "epoch": 0.9942, + "grad_norm": 7.9883832931518555, + "learning_rate": 2.049395996099879e-09, + "loss": 0.9974, + "step": 19884 + }, + { + "epoch": 0.9943, + "grad_norm": 0.574100911617279, + "learning_rate": 1.9793386939659378e-09, + "loss": 0.8388, + "step": 19886 + }, + { + "epoch": 0.9944, + "grad_norm": 3.9229235649108887, + "learning_rate": 1.910499620322304e-09, + "loss": 0.4564, + "step": 19888 + }, + { + "epoch": 0.9945, + "grad_norm": 7.612203121185303, + "learning_rate": 1.8428787835578222e-09, + "loss": 0.8028, + "step": 19890 + }, + { + "epoch": 0.9946, + "grad_norm": 4.160488605499268, + "learning_rate": 1.776476191910348e-09, + "loss": 0.5998, + "step": 19892 + }, + { + "epoch": 0.9947, + "grad_norm": 2.624274969100952, + "learning_rate": 1.7112918534711865e-09, + "loss": 0.8856, + "step": 19894 + }, + { + "epoch": 0.9948, + "grad_norm": 5.444456577301025, + "learning_rate": 1.647325776182873e-09, + "loss": 0.7615, + "step": 19896 + }, + { + "epoch": 0.9949, + "grad_norm": 4.421595573425293, + "learning_rate": 1.584577967840284e-09, + "loss": 0.3483, + "step": 19898 + }, + { + "epoch": 0.995, + "grad_norm": 15.11137866973877, + "learning_rate": 1.5230484360873043e-09, + "loss": 1.3301, + "step": 19900 + }, + { + "epoch": 0.9951, + "grad_norm": 13.50172233581543, + "learning_rate": 1.4627371884234909e-09, + "loss": 1.4243, + "step": 19902 + }, + { + "epoch": 0.9952, + "grad_norm": 3.2734932899475098, + "learning_rate": 1.4036442321962995e-09, + "loss": 0.7694, + "step": 19904 + }, + { + "epoch": 0.9953, + "grad_norm": 2.6042420864105225, + "learning_rate": 1.3457695746055265e-09, + "loss": 0.8348, + "step": 19906 + }, + { + "epoch": 0.9954, + "grad_norm": 10.09923267364502, + "learning_rate": 1.2891132227033087e-09, + "loss": 0.4744, + "step": 19908 + }, + { + "epoch": 0.9955, + "grad_norm": 4.337605953216553, + "learning_rate": 1.233675183394123e-09, + "loss": 0.4956, + "step": 19910 + }, + { + "epoch": 0.9956, + "grad_norm": 5.215989112854004, + "learning_rate": 1.1794554634314558e-09, + "loss": 0.8414, + "step": 19912 + }, + { + "epoch": 0.9957, + "grad_norm": 8.513381004333496, + "learning_rate": 1.126454069423355e-09, + "loss": 1.0594, + "step": 19914 + }, + { + "epoch": 0.9958, + "grad_norm": 2.480914831161499, + "learning_rate": 1.0746710078257673e-09, + "loss": 0.7157, + "step": 19916 + }, + { + "epoch": 0.9959, + "grad_norm": 5.537534236907959, + "learning_rate": 1.0241062849503102e-09, + "loss": 1.2395, + "step": 19918 + }, + { + "epoch": 0.996, + "grad_norm": 2.9113004207611084, + "learning_rate": 9.74759906957612e-10, + "loss": 0.6964, + "step": 19920 + }, + { + "epoch": 0.9961, + "grad_norm": 1.8967117071151733, + "learning_rate": 9.2663187986064e-10, + "loss": 0.5394, + "step": 19922 + }, + { + "epoch": 0.9962, + "grad_norm": 5.352100372314453, + "learning_rate": 8.797222095224822e-10, + "loss": 1.1373, + "step": 19924 + }, + { + "epoch": 0.9963, + "grad_norm": 3.9546101093292236, + "learning_rate": 8.340309016585669e-10, + "loss": 0.9097, + "step": 19926 + }, + { + "epoch": 0.9964, + "grad_norm": 35.29146957397461, + "learning_rate": 7.895579618388827e-10, + "loss": 1.7144, + "step": 19928 + }, + { + "epoch": 0.9965, + "grad_norm": 4.282063961029053, + "learning_rate": 7.463033954802079e-10, + "loss": 1.0364, + "step": 19930 + }, + { + "epoch": 0.9966, + "grad_norm": 6.440047264099121, + "learning_rate": 7.042672078527712e-10, + "loss": 1.1359, + "step": 19932 + }, + { + "epoch": 0.9967, + "grad_norm": 7.15844202041626, + "learning_rate": 6.634494040802519e-10, + "loss": 1.1112, + "step": 19934 + }, + { + "epoch": 0.9968, + "grad_norm": 6.629772186279297, + "learning_rate": 6.238499891353389e-10, + "loss": 1.2311, + "step": 19936 + }, + { + "epoch": 0.9969, + "grad_norm": 5.485152244567871, + "learning_rate": 5.854689678419511e-10, + "loss": 1.0663, + "step": 19938 + }, + { + "epoch": 0.997, + "grad_norm": 2.8898303508758545, + "learning_rate": 5.483063448785686e-10, + "loss": 0.6836, + "step": 19940 + }, + { + "epoch": 0.9971, + "grad_norm": 1.9052832126617432, + "learning_rate": 5.123621247726807e-10, + "loss": 1.494, + "step": 19942 + }, + { + "epoch": 0.9972, + "grad_norm": 5.645745277404785, + "learning_rate": 4.77636311903007e-10, + "loss": 1.3743, + "step": 19944 + }, + { + "epoch": 0.9973, + "grad_norm": 4.9496073722839355, + "learning_rate": 4.441289105017177e-10, + "loss": 1.2905, + "step": 19946 + }, + { + "epoch": 0.9974, + "grad_norm": 7.286529541015625, + "learning_rate": 4.118399246522131e-10, + "loss": 1.7318, + "step": 19948 + }, + { + "epoch": 0.9975, + "grad_norm": 2.940696954727173, + "learning_rate": 3.807693582869032e-10, + "loss": 1.1015, + "step": 19950 + }, + { + "epoch": 0.9976, + "grad_norm": 3.3419559001922607, + "learning_rate": 3.509172151938689e-10, + "loss": 0.8909, + "step": 19952 + }, + { + "epoch": 0.9977, + "grad_norm": 2.0686116218566895, + "learning_rate": 3.222834990090906e-10, + "loss": 0.393, + "step": 19954 + }, + { + "epoch": 0.9978, + "grad_norm": 9.867607116699219, + "learning_rate": 2.948682132208891e-10, + "loss": 1.1247, + "step": 19956 + }, + { + "epoch": 0.9979, + "grad_norm": 2.9818501472473145, + "learning_rate": 2.6867136117214587e-10, + "loss": 0.7078, + "step": 19958 + }, + { + "epoch": 0.998, + "grad_norm": 2.4496192932128906, + "learning_rate": 2.436929460525317e-10, + "loss": 0.4968, + "step": 19960 + }, + { + "epoch": 0.9981, + "grad_norm": 2.8336730003356934, + "learning_rate": 2.1993297090627808e-10, + "loss": 0.9038, + "step": 19962 + }, + { + "epoch": 0.9982, + "grad_norm": 1.4381980895996094, + "learning_rate": 1.9739143862884668e-10, + "loss": 1.0257, + "step": 19964 + }, + { + "epoch": 0.9983, + "grad_norm": 3.4894540309906006, + "learning_rate": 1.760683519669293e-10, + "loss": 0.8638, + "step": 19966 + }, + { + "epoch": 0.9984, + "grad_norm": 5.293965816497803, + "learning_rate": 1.559637135173375e-10, + "loss": 1.5056, + "step": 19968 + }, + { + "epoch": 0.9985, + "grad_norm": 2.7894349098205566, + "learning_rate": 1.3707752573255406e-10, + "loss": 1.0963, + "step": 19970 + }, + { + "epoch": 0.9986, + "grad_norm": 2.7835450172424316, + "learning_rate": 1.1940979091074056e-10, + "loss": 0.731, + "step": 19972 + }, + { + "epoch": 0.9987, + "grad_norm": 4.518327713012695, + "learning_rate": 1.0296051120683991e-10, + "loss": 0.6047, + "step": 19974 + }, + { + "epoch": 0.9988, + "grad_norm": 3.73995041847229, + "learning_rate": 8.772968862369447e-11, + "loss": 0.3832, + "step": 19976 + }, + { + "epoch": 0.9989, + "grad_norm": 8.350715637207031, + "learning_rate": 7.37173250175971e-11, + "loss": 0.8507, + "step": 19978 + }, + { + "epoch": 0.999, + "grad_norm": 16.121816635131836, + "learning_rate": 6.092342209607083e-11, + "loss": 0.7728, + "step": 19980 + }, + { + "epoch": 0.9991, + "grad_norm": 3.393390655517578, + "learning_rate": 4.934798141786878e-11, + "loss": 0.9042, + "step": 19982 + }, + { + "epoch": 0.9992, + "grad_norm": 2.618628978729248, + "learning_rate": 3.899100439408443e-11, + "loss": 0.8415, + "step": 19984 + }, + { + "epoch": 0.9993, + "grad_norm": 10.466394424438477, + "learning_rate": 2.9852492285931125e-11, + "loss": 1.3578, + "step": 19986 + }, + { + "epoch": 0.9994, + "grad_norm": 2.9366493225097656, + "learning_rate": 2.1932446206962556e-11, + "loss": 1.2033, + "step": 19988 + }, + { + "epoch": 0.9995, + "grad_norm": 19.91321563720703, + "learning_rate": 1.5230867123072757e-11, + "loss": 1.6739, + "step": 19990 + }, + { + "epoch": 0.9996, + "grad_norm": 23.951034545898438, + "learning_rate": 9.74775584916543e-12, + "loss": 0.8488, + "step": 19992 + }, + { + "epoch": 0.9997, + "grad_norm": 4.581809997558594, + "learning_rate": 5.483113054705058e-12, + "loss": 0.9295, + "step": 19994 + }, + { + "epoch": 0.9998, + "grad_norm": 6.4173264503479, + "learning_rate": 2.4369392592760166e-12, + "loss": 0.9709, + "step": 19996 + }, + { + "epoch": 0.9999, + "grad_norm": 7.08517599105835, + "learning_rate": 6.092348336927956e-13, + "loss": 1.1056, + "step": 19998 + }, + { + "epoch": 1.0, + "grad_norm": 4.703798294067383, + "learning_rate": 0.0, + "loss": 0.7668, + "step": 20000 + }, + { + "epoch": 1.0, + "step": 20000, + "total_flos": 8.233403997067674e+16, + "train_loss": 0.9858472045598552, + "train_runtime": 7741.4793, + "train_samples_per_second": 2.583, + "train_steps_per_second": 2.583 + } + ], + "logging_steps": 2, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 8.233403997067674e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..56eb8595314b034b94815c79e4b8ce1fdad5e46e --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c7d6793c457c97a2a472852c668428e257217a472f617f6c9bfcb035428274f +size 3837841200 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/global_step20000/mp_rank_00_model_states.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/global_step20000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aab0ebb533e6186e935f41da314c605ff8bf851a --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/global_step20000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa1de6e6171863c8a98ec3c6c3d8bcc7d8cfbf82ccc2a8fc97a41e94e422fdac +size 639989420 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/latest b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/latest new file mode 100644 index 0000000000000000000000000000000000000000..50908603509898f37e005b455ca2e7cad40a4bb0 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/latest @@ -0,0 +1 @@ +global_step20000 \ No newline at end of file diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/scheduler.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..14adb2d77102c1156e0d951a1599df60b26efc39 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5734877ae2897df6e2a90f2862806048a9fe56423f6401033740a3f5d5d3d11 +size 1064 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/zero_to_fp32.py b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..e93cb1c95f15c1474642edb1978714075361bc04 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/client_0/zero_to_fp32.py @@ -0,0 +1,758 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: + shared_tensor = state_dict[converted_tensors[tensor_id]] + state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + state_dict[name] = tensor.contiguous() + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in shard_state_dict: + del state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..38a07418ab46c32bab44b3186b60726e970d158e --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sequential_scenario20_new_10000_nosampling_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a0a3b28ff849358e4f07a7cdc5cd5ccb6e6e85d423f84c331261b8fe3729bba +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c4f70db3a2507a5e886f1ba7eec269b841c9b7df --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/0_trainer_state.json @@ -0,0 +1,4400 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1249, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016012810248198558, + "grad_norm": 1.5004663467407227, + "learning_rate": 2.4341906163790364e-06, + "loss": 1.0669, + "step": 2 + }, + { + "epoch": 0.0032025620496397116, + "grad_norm": 0.11599339544773102, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.1818, + "step": 4 + }, + { + "epoch": 0.004803843074459567, + "grad_norm": 1.7548251152038574, + "learning_rate": 2.507768247396697e-06, + "loss": 1.3291, + "step": 6 + }, + { + "epoch": 0.006405124099279423, + "grad_norm": 2.502962350845337, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.8205, + "step": 8 + }, + { + "epoch": 0.008006405124099279, + "grad_norm": 2.117067337036133, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.8567, + "step": 10 + }, + { + "epoch": 0.009607686148919135, + "grad_norm": 0.15529029071331024, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.1508, + "step": 12 + }, + { + "epoch": 0.01120896717373899, + "grad_norm": 1.556419849395752, + "learning_rate": 2.6577236052101764e-06, + "loss": 1.4124, + "step": 14 + }, + { + "epoch": 0.012810248198558846, + "grad_norm": 0.39232563972473145, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.1069, + "step": 16 + }, + { + "epoch": 0.014411529223378704, + "grad_norm": 2.1366662979125977, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.6589, + "step": 18 + }, + { + "epoch": 0.016012810248198558, + "grad_norm": 1.4933586120605469, + "learning_rate": 2.772603249882202e-06, + "loss": 1.0826, + "step": 20 + }, + { + "epoch": 0.017614091273018415, + "grad_norm": 0.9681024551391602, + "learning_rate": 2.81134975464178e-06, + "loss": 0.6565, + "step": 22 + }, + { + "epoch": 0.01921537229783827, + "grad_norm": 6.762082576751709, + "learning_rate": 2.850320892287688e-06, + "loss": 1.3359, + "step": 24 + }, + { + "epoch": 0.020816653322658127, + "grad_norm": 0.4144692122936249, + "learning_rate": 2.889515445039256e-06, + "loss": 0.4399, + "step": 26 + }, + { + "epoch": 0.02241793434747798, + "grad_norm": 1.0163583755493164, + "learning_rate": 2.928932188134529e-06, + "loss": 0.5102, + "step": 28 + }, + { + "epoch": 0.02401921537229784, + "grad_norm": 0.48482492566108704, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.3604, + "step": 30 + }, + { + "epoch": 0.025620496397117692, + "grad_norm": 1.783494472503662, + "learning_rate": 3.00842731163137e-06, + "loss": 0.4979, + "step": 32 + }, + { + "epoch": 0.02722177742193755, + "grad_norm": 0.8115494251251221, + "learning_rate": 3.048503207947854e-06, + "loss": 0.8831, + "step": 34 + }, + { + "epoch": 0.028823058446757407, + "grad_norm": 0.4673188626766205, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.2012, + "step": 36 + }, + { + "epoch": 0.03042433947157726, + "grad_norm": 0.2615116238594055, + "learning_rate": 3.129305408243829e-06, + "loss": 0.6928, + "step": 38 + }, + { + "epoch": 0.032025620496397116, + "grad_norm": 1.3333370685577393, + "learning_rate": 3.17002918729432e-06, + "loss": 1.6109, + "step": 40 + }, + { + "epoch": 0.03362690152121697, + "grad_norm": 1.0444493293762207, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.892, + "step": 42 + }, + { + "epoch": 0.03522818254603683, + "grad_norm": 1.0018532276153564, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.9139, + "step": 44 + }, + { + "epoch": 0.03682946357085669, + "grad_norm": 1.5275040864944458, + "learning_rate": 3.293475949595998e-06, + "loss": 1.3119, + "step": 46 + }, + { + "epoch": 0.03843074459567654, + "grad_norm": 1.3862746953964233, + "learning_rate": 3.335045725966829e-06, + "loss": 1.8048, + "step": 48 + }, + { + "epoch": 0.040032025620496396, + "grad_norm": 1.692179560661316, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.7305, + "step": 50 + }, + { + "epoch": 0.041633306645316254, + "grad_norm": 0.7032297849655151, + "learning_rate": 3.418808778095917e-06, + "loss": 0.6075, + "step": 52 + }, + { + "epoch": 0.04323458767013611, + "grad_norm": 0.11847246438264847, + "learning_rate": 3.460999436403676e-06, + "loss": 0.1744, + "step": 54 + }, + { + "epoch": 0.04483586869495596, + "grad_norm": 2.055413007736206, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.6932, + "step": 56 + }, + { + "epoch": 0.04643714971977582, + "grad_norm": 0.13576938211917877, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.2902, + "step": 58 + }, + { + "epoch": 0.04803843074459568, + "grad_norm": 2.0845072269439697, + "learning_rate": 3.588792100647368e-06, + "loss": 1.7346, + "step": 60 + }, + { + "epoch": 0.049639711769415534, + "grad_norm": 0.3660995066165924, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.3926, + "step": 62 + }, + { + "epoch": 0.051240992794235385, + "grad_norm": 1.964263677597046, + "learning_rate": 3.674991124496452e-06, + "loss": 1.2661, + "step": 64 + }, + { + "epoch": 0.05284227381905524, + "grad_norm": 0.17617757618427277, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.057, + "step": 66 + }, + { + "epoch": 0.0544435548438751, + "grad_norm": 0.32967373728752136, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.1531, + "step": 68 + }, + { + "epoch": 0.05604483586869496, + "grad_norm": 2.62912917137146, + "learning_rate": 3.8057685966010025e-06, + "loss": 2.5594, + "step": 70 + }, + { + "epoch": 0.057646116893514815, + "grad_norm": 0.8250241875648499, + "learning_rate": 3.849750027700842e-06, + "loss": 0.4691, + "step": 72 + }, + { + "epoch": 0.059247397918334666, + "grad_norm": 1.6612212657928467, + "learning_rate": 3.8939236434745184e-06, + "loss": 1.6559, + "step": 74 + }, + { + "epoch": 0.06084867894315452, + "grad_norm": 1.1231873035430908, + "learning_rate": 3.938288063572962e-06, + "loss": 0.3989, + "step": 76 + }, + { + "epoch": 0.06244995996797438, + "grad_norm": 0.5326964855194092, + "learning_rate": 3.982841901684792e-06, + "loss": 0.5177, + "step": 78 + }, + { + "epoch": 0.06405124099279423, + "grad_norm": 2.6332480907440186, + "learning_rate": 4.027583765579601e-06, + "loss": 0.909, + "step": 80 + }, + { + "epoch": 0.0656525220176141, + "grad_norm": 0.19248095154762268, + "learning_rate": 4.072512257151546e-06, + "loss": 0.4564, + "step": 82 + }, + { + "epoch": 0.06725380304243395, + "grad_norm": 1.5720062255859375, + "learning_rate": 4.117625972462988e-06, + "loss": 0.7833, + "step": 84 + }, + { + "epoch": 0.0688550840672538, + "grad_norm": 0.5304996371269226, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.1872, + "step": 86 + }, + { + "epoch": 0.07045636509207366, + "grad_norm": 0.8136661648750305, + "learning_rate": 4.208403429658151e-06, + "loss": 0.5882, + "step": 88 + }, + { + "epoch": 0.07205764611689351, + "grad_norm": 0.49865245819091797, + "learning_rate": 4.254064334903347e-06, + "loss": 0.2808, + "step": 90 + }, + { + "epoch": 0.07365892714171338, + "grad_norm": 2.9532620906829834, + "learning_rate": 4.299904790699619e-06, + "loss": 0.9904, + "step": 92 + }, + { + "epoch": 0.07526020816653323, + "grad_norm": 1.6501909494400024, + "learning_rate": 4.345923364612024e-06, + "loss": 1.3817, + "step": 94 + }, + { + "epoch": 0.07686148919135308, + "grad_norm": 1.1663086414337158, + "learning_rate": 4.392118618639698e-06, + "loss": 0.8088, + "step": 96 + }, + { + "epoch": 0.07846277021617294, + "grad_norm": 0.3673509955406189, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.2089, + "step": 98 + }, + { + "epoch": 0.08006405124099279, + "grad_norm": 0.29378700256347656, + "learning_rate": 4.485033387477915e-06, + "loss": 0.9931, + "step": 100 + }, + { + "epoch": 0.08166533226581266, + "grad_norm": 1.626305103302002, + "learning_rate": 4.531749998862628e-06, + "loss": 1.6936, + "step": 102 + }, + { + "epoch": 0.08326661329063251, + "grad_norm": 1.367993712425232, + "learning_rate": 4.578637483601732e-06, + "loss": 0.9095, + "step": 104 + }, + { + "epoch": 0.08486789431545236, + "grad_norm": 0.2807723581790924, + "learning_rate": 4.625694376542399e-06, + "loss": 0.5995, + "step": 106 + }, + { + "epoch": 0.08646917534027222, + "grad_norm": 0.7695561051368713, + "learning_rate": 4.672919207238145e-06, + "loss": 1.1104, + "step": 108 + }, + { + "epoch": 0.08807045636509207, + "grad_norm": 1.238323450088501, + "learning_rate": 4.720310499994664e-06, + "loss": 1.1323, + "step": 110 + }, + { + "epoch": 0.08967173738991192, + "grad_norm": 0.39419683814048767, + "learning_rate": 4.767866773916041e-06, + "loss": 0.5738, + "step": 112 + }, + { + "epoch": 0.09127301841473179, + "grad_norm": 2.332308292388916, + "learning_rate": 4.81558654295099e-06, + "loss": 0.9931, + "step": 114 + }, + { + "epoch": 0.09287429943955164, + "grad_norm": 0.4865284562110901, + "learning_rate": 4.863468315939234e-06, + "loss": 0.4002, + "step": 116 + }, + { + "epoch": 0.0944755804643715, + "grad_norm": 2.0963590145111084, + "learning_rate": 4.911510596658202e-06, + "loss": 0.8449, + "step": 118 + }, + { + "epoch": 0.09607686148919135, + "grad_norm": 0.9347001910209656, + "learning_rate": 4.959711883869734e-06, + "loss": 0.5004, + "step": 120 + }, + { + "epoch": 0.0976781425140112, + "grad_norm": 1.359710693359375, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.6106, + "step": 122 + }, + { + "epoch": 0.09927942353883107, + "grad_norm": 0.434155136346817, + "learning_rate": 5.056585448021398e-06, + "loss": 0.3104, + "step": 124 + }, + { + "epoch": 0.10088070456365092, + "grad_norm": 1.3239470720291138, + "learning_rate": 5.105254697830208e-06, + "loss": 0.8625, + "step": 126 + }, + { + "epoch": 0.10248198558847077, + "grad_norm": 0.5976133346557617, + "learning_rate": 5.154076899963514e-06, + "loss": 0.7191, + "step": 128 + }, + { + "epoch": 0.10408326661329063, + "grad_norm": 0.43495479226112366, + "learning_rate": 5.203050528811959e-06, + "loss": 0.2425, + "step": 130 + }, + { + "epoch": 0.10568454763811048, + "grad_norm": 0.7092037200927734, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.2325, + "step": 132 + }, + { + "epoch": 0.10728582866293035, + "grad_norm": 1.9537054300308228, + "learning_rate": 5.3014459406054295e-06, + "loss": 1.1439, + "step": 134 + }, + { + "epoch": 0.1088871096877502, + "grad_norm": 1.9745272397994995, + "learning_rate": 5.350864648864026e-06, + "loss": 1.774, + "step": 136 + }, + { + "epoch": 0.11048839071257005, + "grad_norm": 7.290694713592529, + "learning_rate": 5.4004286345609665e-06, + "loss": 1.991, + "step": 138 + }, + { + "epoch": 0.11208967173738991, + "grad_norm": 3.122187852859497, + "learning_rate": 5.450136348907444e-06, + "loss": 0.4878, + "step": 140 + }, + { + "epoch": 0.11369095276220977, + "grad_norm": 1.9484702348709106, + "learning_rate": 5.499986238623329e-06, + "loss": 0.6344, + "step": 142 + }, + { + "epoch": 0.11529223378702963, + "grad_norm": 0.5919404625892639, + "learning_rate": 5.549976745985809e-06, + "loss": 0.6714, + "step": 144 + }, + { + "epoch": 0.11689351481184948, + "grad_norm": 2.202042579650879, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.5936, + "step": 146 + }, + { + "epoch": 0.11849479583666933, + "grad_norm": 0.7151498198509216, + "learning_rate": 5.650373360837763e-06, + "loss": 0.9592, + "step": 148 + }, + { + "epoch": 0.1200960768614892, + "grad_norm": 1.6252286434173584, + "learning_rate": 5.700776331106674e-06, + "loss": 0.721, + "step": 150 + }, + { + "epoch": 0.12169735788630905, + "grad_norm": 0.11620815843343735, + "learning_rate": 5.751313644679071e-06, + "loss": 0.5327, + "step": 152 + }, + { + "epoch": 0.1232986389111289, + "grad_norm": 1.002037763595581, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.7654, + "step": 154 + }, + { + "epoch": 0.12489991993594876, + "grad_norm": 1.3588364124298096, + "learning_rate": 5.852784980771182e-06, + "loss": 1.1198, + "step": 156 + }, + { + "epoch": 0.1265012009607686, + "grad_norm": 0.6529707312583923, + "learning_rate": 5.903715832487138e-06, + "loss": 0.4162, + "step": 158 + }, + { + "epoch": 0.12810248198558846, + "grad_norm": 2.6190567016601562, + "learning_rate": 5.954774685998206e-06, + "loss": 1.6867, + "step": 160 + }, + { + "epoch": 0.1297037630104083, + "grad_norm": 3.5172793865203857, + "learning_rate": 6.005959945803494e-06, + "loss": 1.1061, + "step": 162 + }, + { + "epoch": 0.1313050440352282, + "grad_norm": 14.847761154174805, + "learning_rate": 6.057270012452186e-06, + "loss": 2.1512, + "step": 164 + }, + { + "epoch": 0.13290632506004804, + "grad_norm": 3.6225180625915527, + "learning_rate": 6.108703282593461e-06, + "loss": 1.6377, + "step": 166 + }, + { + "epoch": 0.1345076060848679, + "grad_norm": 1.5380995273590088, + "learning_rate": 6.160258149026557e-06, + "loss": 0.9269, + "step": 168 + }, + { + "epoch": 0.13610888710968774, + "grad_norm": 1.4536486864089966, + "learning_rate": 6.2119330007511014e-06, + "loss": 1.3047, + "step": 170 + }, + { + "epoch": 0.1377101681345076, + "grad_norm": 0.39711764454841614, + "learning_rate": 6.263726223017326e-06, + "loss": 0.7606, + "step": 172 + }, + { + "epoch": 0.13931144915932747, + "grad_norm": 2.168816328048706, + "learning_rate": 6.315636197376634e-06, + "loss": 1.7013, + "step": 174 + }, + { + "epoch": 0.14091273018414732, + "grad_norm": 0.688697338104248, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.9326, + "step": 176 + }, + { + "epoch": 0.14251401120896717, + "grad_norm": 0.8165487051010132, + "learning_rate": 6.419799910389257e-06, + "loss": 0.3736, + "step": 178 + }, + { + "epoch": 0.14411529223378702, + "grad_norm": 0.4791257083415985, + "learning_rate": 6.472050394106689e-06, + "loss": 0.5631, + "step": 180 + }, + { + "epoch": 0.14571657325860687, + "grad_norm": 2.015381336212158, + "learning_rate": 6.524411120147204e-06, + "loss": 1.3075, + "step": 182 + }, + { + "epoch": 0.14731785428342675, + "grad_norm": 1.556729793548584, + "learning_rate": 6.576880452328645e-06, + "loss": 1.1742, + "step": 184 + }, + { + "epoch": 0.1489191353082466, + "grad_norm": 0.5485149025917053, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.664, + "step": 186 + }, + { + "epoch": 0.15052041633306645, + "grad_norm": 3.6662025451660156, + "learning_rate": 6.682138373468341e-06, + "loss": 2.5653, + "step": 188 + }, + { + "epoch": 0.1521216973578863, + "grad_norm": 0.625605046749115, + "learning_rate": 6.734923673298605e-06, + "loss": 0.2515, + "step": 190 + }, + { + "epoch": 0.15372297838270615, + "grad_norm": 1.1685861349105835, + "learning_rate": 6.787811001116654e-06, + "loss": 0.6395, + "step": 192 + }, + { + "epoch": 0.15532425940752603, + "grad_norm": 0.40881064534187317, + "learning_rate": 6.840798704284939e-06, + "loss": 0.1986, + "step": 194 + }, + { + "epoch": 0.15692554043234588, + "grad_norm": 0.8681973814964294, + "learning_rate": 6.893885127029419e-06, + "loss": 0.7313, + "step": 196 + }, + { + "epoch": 0.15852682145716573, + "grad_norm": 0.6155533194541931, + "learning_rate": 6.94706861049117e-06, + "loss": 0.6637, + "step": 198 + }, + { + "epoch": 0.16012810248198558, + "grad_norm": 5.094818592071533, + "learning_rate": 7.000347492778341e-06, + "loss": 1.3671, + "step": 200 + }, + { + "epoch": 0.16172938350680544, + "grad_norm": 1.6668344736099243, + "learning_rate": 7.05372010901803e-06, + "loss": 0.2928, + "step": 202 + }, + { + "epoch": 0.1633306645316253, + "grad_norm": 0.4587869644165039, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.5234, + "step": 204 + }, + { + "epoch": 0.16493194555644516, + "grad_norm": 3.3128199577331543, + "learning_rate": 7.160739869270219e-06, + "loss": 1.6496, + "step": 206 + }, + { + "epoch": 0.16653322658126501, + "grad_norm": 6.082993030548096, + "learning_rate": 7.214383669100317e-06, + "loss": 1.9769, + "step": 208 + }, + { + "epoch": 0.16813450760608487, + "grad_norm": 3.504732131958008, + "learning_rate": 7.268114514622635e-06, + "loss": 2.8359, + "step": 210 + }, + { + "epoch": 0.16973578863090472, + "grad_norm": 2.125244140625, + "learning_rate": 7.321930726841144e-06, + "loss": 1.1787, + "step": 212 + }, + { + "epoch": 0.17133706965572457, + "grad_norm": 1.6377960443496704, + "learning_rate": 7.375830624092336e-06, + "loss": 1.0679, + "step": 214 + }, + { + "epoch": 0.17293835068054444, + "grad_norm": 1.3317642211914062, + "learning_rate": 7.429812522097613e-06, + "loss": 0.6136, + "step": 216 + }, + { + "epoch": 0.1745396317053643, + "grad_norm": 1.6820716857910156, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.5, + "step": 218 + }, + { + "epoch": 0.17614091273018415, + "grad_norm": 0.8493342995643616, + "learning_rate": 7.538015570497046e-06, + "loss": 0.3547, + "step": 220 + }, + { + "epoch": 0.177742193755004, + "grad_norm": 1.2354962825775146, + "learning_rate": 7.592233339733077e-06, + "loss": 0.6598, + "step": 222 + }, + { + "epoch": 0.17934347477982385, + "grad_norm": 2.5751326084136963, + "learning_rate": 7.646526347512665e-06, + "loss": 1.3372, + "step": 224 + }, + { + "epoch": 0.18094475580464373, + "grad_norm": 1.073601245880127, + "learning_rate": 7.70089289727319e-06, + "loss": 0.5803, + "step": 226 + }, + { + "epoch": 0.18254603682946358, + "grad_norm": 2.0171408653259277, + "learning_rate": 7.755331290154041e-06, + "loss": 0.3575, + "step": 228 + }, + { + "epoch": 0.18414731785428343, + "grad_norm": 0.7550749778747559, + "learning_rate": 7.809839825049565e-06, + "loss": 0.4745, + "step": 230 + }, + { + "epoch": 0.18574859887910328, + "grad_norm": 0.28771713376045227, + "learning_rate": 7.864416798662347e-06, + "loss": 0.1067, + "step": 232 + }, + { + "epoch": 0.18734987990392313, + "grad_norm": 0.5271168947219849, + "learning_rate": 7.919060505556376e-06, + "loss": 0.3599, + "step": 234 + }, + { + "epoch": 0.188951160928743, + "grad_norm": 0.9536488652229309, + "learning_rate": 7.973769238210291e-06, + "loss": 0.4183, + "step": 236 + }, + { + "epoch": 0.19055244195356286, + "grad_norm": 0.8162179589271545, + "learning_rate": 8.028541287070858e-06, + "loss": 0.9809, + "step": 238 + }, + { + "epoch": 0.1921537229783827, + "grad_norm": 1.302890419960022, + "learning_rate": 8.083374940606256e-06, + "loss": 0.9777, + "step": 240 + }, + { + "epoch": 0.19375500400320256, + "grad_norm": 0.6141533851623535, + "learning_rate": 8.138268485359684e-06, + "loss": 1.0386, + "step": 242 + }, + { + "epoch": 0.1953562850280224, + "grad_norm": 3.538051128387451, + "learning_rate": 8.193220206002785e-06, + "loss": 0.6166, + "step": 244 + }, + { + "epoch": 0.1969575660528423, + "grad_norm": 0.97962886095047, + "learning_rate": 8.248228385389349e-06, + "loss": 0.8379, + "step": 246 + }, + { + "epoch": 0.19855884707766214, + "grad_norm": 0.862324059009552, + "learning_rate": 8.303291304608936e-06, + "loss": 1.1463, + "step": 248 + }, + { + "epoch": 0.200160128102482, + "grad_norm": 1.2553560733795166, + "learning_rate": 8.358407243040524e-06, + "loss": 2.4234, + "step": 250 + }, + { + "epoch": 0.20176140912730184, + "grad_norm": 0.5202347040176392, + "learning_rate": 8.413574478406386e-06, + "loss": 0.6799, + "step": 252 + }, + { + "epoch": 0.2033626901521217, + "grad_norm": 2.3971855640411377, + "learning_rate": 8.468791286825856e-06, + "loss": 1.3848, + "step": 254 + }, + { + "epoch": 0.20496397117694154, + "grad_norm": 1.3474164009094238, + "learning_rate": 8.524055942869135e-06, + "loss": 0.4722, + "step": 256 + }, + { + "epoch": 0.20656525220176142, + "grad_norm": 0.9461628198623657, + "learning_rate": 8.579366719611353e-06, + "loss": 0.4509, + "step": 258 + }, + { + "epoch": 0.20816653322658127, + "grad_norm": 0.4040084481239319, + "learning_rate": 8.634721888686368e-06, + "loss": 0.2426, + "step": 260 + }, + { + "epoch": 0.20976781425140112, + "grad_norm": 1.3955566883087158, + "learning_rate": 8.690119720340907e-06, + "loss": 0.5864, + "step": 262 + }, + { + "epoch": 0.21136909527622097, + "grad_norm": 0.7348403334617615, + "learning_rate": 8.74555848348857e-06, + "loss": 0.7033, + "step": 264 + }, + { + "epoch": 0.21297037630104082, + "grad_norm": 0.4948270618915558, + "learning_rate": 8.801036445763858e-06, + "loss": 0.2638, + "step": 266 + }, + { + "epoch": 0.2145716573258607, + "grad_norm": 0.712347149848938, + "learning_rate": 8.856551873576448e-06, + "loss": 0.3535, + "step": 268 + }, + { + "epoch": 0.21617293835068055, + "grad_norm": 3.7811851501464844, + "learning_rate": 8.912103032165206e-06, + "loss": 2.1874, + "step": 270 + }, + { + "epoch": 0.2177742193755004, + "grad_norm": 1.2924195528030396, + "learning_rate": 8.967688185652527e-06, + "loss": 0.6866, + "step": 272 + }, + { + "epoch": 0.21937550040032025, + "grad_norm": 0.44834116101264954, + "learning_rate": 9.023305597098526e-06, + "loss": 0.2292, + "step": 274 + }, + { + "epoch": 0.2209767814251401, + "grad_norm": 1.077794075012207, + "learning_rate": 9.078953528555258e-06, + "loss": 0.2237, + "step": 276 + }, + { + "epoch": 0.22257806244995998, + "grad_norm": 1.3801982402801514, + "learning_rate": 9.134630241121135e-06, + "loss": 0.5257, + "step": 278 + }, + { + "epoch": 0.22417934347477983, + "grad_norm": 2.1891112327575684, + "learning_rate": 9.190333994995208e-06, + "loss": 1.2032, + "step": 280 + }, + { + "epoch": 0.22578062449959968, + "grad_norm": 0.9804417490959167, + "learning_rate": 9.24606304953148e-06, + "loss": 0.5972, + "step": 282 + }, + { + "epoch": 0.22738190552441953, + "grad_norm": 3.828866720199585, + "learning_rate": 9.301815663293426e-06, + "loss": 1.5628, + "step": 284 + }, + { + "epoch": 0.22898318654923938, + "grad_norm": 0.3064195513725281, + "learning_rate": 9.35759009410826e-06, + "loss": 1.0633, + "step": 286 + }, + { + "epoch": 0.23058446757405926, + "grad_norm": 0.7491010427474976, + "learning_rate": 9.41338459912151e-06, + "loss": 0.8965, + "step": 288 + }, + { + "epoch": 0.2321857485988791, + "grad_norm": 1.93744695186615, + "learning_rate": 9.469197434851414e-06, + "loss": 1.5361, + "step": 290 + }, + { + "epoch": 0.23378702962369896, + "grad_norm": 0.6048277616500854, + "learning_rate": 9.52502685724336e-06, + "loss": 0.4982, + "step": 292 + }, + { + "epoch": 0.2353883106485188, + "grad_norm": 1.0110554695129395, + "learning_rate": 9.580871121724498e-06, + "loss": 0.4649, + "step": 294 + }, + { + "epoch": 0.23698959167333866, + "grad_norm": 3.868407726287842, + "learning_rate": 9.636728483258116e-06, + "loss": 1.47, + "step": 296 + }, + { + "epoch": 0.23859087269815854, + "grad_norm": 0.5160191655158997, + "learning_rate": 9.692597196398302e-06, + "loss": 0.4794, + "step": 298 + }, + { + "epoch": 0.2401921537229784, + "grad_norm": 3.991729259490967, + "learning_rate": 9.748475515344416e-06, + "loss": 1.0373, + "step": 300 + }, + { + "epoch": 0.24179343474779824, + "grad_norm": 2.166703701019287, + "learning_rate": 9.80436169399561e-06, + "loss": 1.9783, + "step": 302 + }, + { + "epoch": 0.2433947157726181, + "grad_norm": 0.3666301667690277, + "learning_rate": 9.8602539860055e-06, + "loss": 0.8768, + "step": 304 + }, + { + "epoch": 0.24499599679743794, + "grad_norm": 1.5217561721801758, + "learning_rate": 9.916150644836596e-06, + "loss": 0.5495, + "step": 306 + }, + { + "epoch": 0.2465972778222578, + "grad_norm": 1.4261540174484253, + "learning_rate": 9.972049923815011e-06, + "loss": 0.6494, + "step": 308 + }, + { + "epoch": 0.24819855884707767, + "grad_norm": 2.0301692485809326, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.8801, + "step": 310 + }, + { + "epoch": 0.24979983987189752, + "grad_norm": 0.5044181942939758, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.3538, + "step": 312 + }, + { + "epoch": 0.2514011208967174, + "grad_norm": 2.66142201423645, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.9227, + "step": 314 + }, + { + "epoch": 0.2530024019215372, + "grad_norm": 0.6894093751907349, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.3348, + "step": 316 + }, + { + "epoch": 0.2546036829463571, + "grad_norm": 0.8799777626991272, + "learning_rate": 1.0251524484655577e-05, + "loss": 1.779, + "step": 318 + }, + { + "epoch": 0.2562049639711769, + "grad_norm": 0.46906888484954834, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.2186, + "step": 320 + }, + { + "epoch": 0.2578062449959968, + "grad_norm": 1.0319573879241943, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.4688, + "step": 322 + }, + { + "epoch": 0.2594075260208166, + "grad_norm": 2.1138594150543213, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.7253, + "step": 324 + }, + { + "epoch": 0.2610088070456365, + "grad_norm": 1.357761025428772, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.4771, + "step": 326 + }, + { + "epoch": 0.2626100880704564, + "grad_norm": 1.3925987482070923, + "learning_rate": 1.053080256514858e-05, + "loss": 1.14, + "step": 328 + }, + { + "epoch": 0.2642113690952762, + "grad_norm": 0.8517356514930725, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.9027, + "step": 330 + }, + { + "epoch": 0.2658126501200961, + "grad_norm": 2.058413505554199, + "learning_rate": 1.0642409905891733e-05, + "loss": 1.0014, + "step": 332 + }, + { + "epoch": 0.2674139311449159, + "grad_norm": 0.7842358350753784, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.2543, + "step": 334 + }, + { + "epoch": 0.2690152121697358, + "grad_norm": 0.601685106754303, + "learning_rate": 1.0753936950468513e-05, + "loss": 1.0987, + "step": 336 + }, + { + "epoch": 0.27061649319455566, + "grad_norm": 1.842978596687317, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.9233, + "step": 338 + }, + { + "epoch": 0.2722177742193755, + "grad_norm": 2.2074649333953857, + "learning_rate": 1.0865369758878858e-05, + "loss": 1.4273, + "step": 340 + }, + { + "epoch": 0.27381905524419536, + "grad_norm": 2.630526065826416, + "learning_rate": 1.0921046471444737e-05, + "loss": 1.1762, + "step": 342 + }, + { + "epoch": 0.2754203362690152, + "grad_norm": 1.3465025424957275, + "learning_rate": 1.0976694402901467e-05, + "loss": 1.4946, + "step": 344 + }, + { + "epoch": 0.27702161729383507, + "grad_norm": 1.4610470533370972, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.7276, + "step": 346 + }, + { + "epoch": 0.27862289831865494, + "grad_norm": 0.8547130227088928, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.9582, + "step": 348 + }, + { + "epoch": 0.28022417934347477, + "grad_norm": 0.34722548723220825, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.2064, + "step": 350 + }, + { + "epoch": 0.28182546036829464, + "grad_norm": 1.2971452474594116, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.5554, + "step": 352 + }, + { + "epoch": 0.28342674139311447, + "grad_norm": 4.7040510177612305, + "learning_rate": 1.1254441516511425e-05, + "loss": 2.9586, + "step": 354 + }, + { + "epoch": 0.28502802241793435, + "grad_norm": 5.01977014541626, + "learning_rate": 1.1309880279659087e-05, + "loss": 4.438, + "step": 356 + }, + { + "epoch": 0.2866293034427542, + "grad_norm": 1.483677864074707, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.7832, + "step": 358 + }, + { + "epoch": 0.28823058446757405, + "grad_norm": 5.96160364151001, + "learning_rate": 1.142063328038864e-05, + "loss": 2.7248, + "step": 360 + }, + { + "epoch": 0.2898318654923939, + "grad_norm": 0.5783430933952332, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.799, + "step": 362 + }, + { + "epoch": 0.29143314651721375, + "grad_norm": 3.301081895828247, + "learning_rate": 1.1531208713174138e-05, + "loss": 2.1008, + "step": 364 + }, + { + "epoch": 0.2930344275420336, + "grad_norm": 0.9644891023635864, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.4383, + "step": 366 + }, + { + "epoch": 0.2946357085668535, + "grad_norm": 2.094698190689087, + "learning_rate": 1.1641592756959467e-05, + "loss": 1.2348, + "step": 368 + }, + { + "epoch": 0.2962369895916733, + "grad_norm": 0.884109616279602, + "learning_rate": 1.1696708695391057e-05, + "loss": 1.9496, + "step": 370 + }, + { + "epoch": 0.2978382706164932, + "grad_norm": 3.4012014865875244, + "learning_rate": 1.1751771614610643e-05, + "loss": 1.1301, + "step": 372 + }, + { + "epoch": 0.29943955164131303, + "grad_norm": 2.2041361331939697, + "learning_rate": 1.180677979399721e-05, + "loss": 3.9026, + "step": 374 + }, + { + "epoch": 0.3010408326661329, + "grad_norm": 1.5450705289840698, + "learning_rate": 1.1861731514640309e-05, + "loss": 1.074, + "step": 376 + }, + { + "epoch": 0.3026421136909528, + "grad_norm": 1.425844669342041, + "learning_rate": 1.1916625059393739e-05, + "loss": 1.6024, + "step": 378 + }, + { + "epoch": 0.3042433947157726, + "grad_norm": 2.757223606109619, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.8697, + "step": 380 + }, + { + "epoch": 0.3058446757405925, + "grad_norm": 2.287362813949585, + "learning_rate": 1.2026230761789702e-05, + "loss": 2.4518, + "step": 382 + }, + { + "epoch": 0.3074459567654123, + "grad_norm": 3.5028767585754395, + "learning_rate": 1.2080939494443618e-05, + "loss": 1.0252, + "step": 384 + }, + { + "epoch": 0.3090472377902322, + "grad_norm": 2.35418701171875, + "learning_rate": 1.2135583201337646e-05, + "loss": 2.4385, + "step": 386 + }, + { + "epoch": 0.31064851881505207, + "grad_norm": 1.1545484066009521, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.9246, + "step": 388 + }, + { + "epoch": 0.3122497998398719, + "grad_norm": 1.506903052330017, + "learning_rate": 1.2244668709845952e-05, + "loss": 2.4405, + "step": 390 + }, + { + "epoch": 0.31385108086469177, + "grad_norm": 1.8492883443832397, + "learning_rate": 1.2299107102726804e-05, + "loss": 2.0722, + "step": 392 + }, + { + "epoch": 0.3154523618895116, + "grad_norm": 1.6731404066085815, + "learning_rate": 1.2353473652487329e-05, + "loss": 1.55, + "step": 394 + }, + { + "epoch": 0.31705364291433147, + "grad_norm": 1.6981631517410278, + "learning_rate": 1.2407766660266916e-05, + "loss": 1.6783, + "step": 396 + }, + { + "epoch": 0.31865492393915135, + "grad_norm": 0.8948637843132019, + "learning_rate": 1.2461984429502947e-05, + "loss": 1.3128, + "step": 398 + }, + { + "epoch": 0.32025620496397117, + "grad_norm": 2.423490524291992, + "learning_rate": 1.2516125265983945e-05, + "loss": 1.1197, + "step": 400 + }, + { + "epoch": 0.32185748598879105, + "grad_norm": 1.7709052562713623, + "learning_rate": 1.257018747790238e-05, + "loss": 1.5812, + "step": 402 + }, + { + "epoch": 0.32345876701361087, + "grad_norm": 0.9947282671928406, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.5626, + "step": 404 + }, + { + "epoch": 0.32506004803843075, + "grad_norm": 2.3331263065338135, + "learning_rate": 1.2678069273158849e-05, + "loss": 1.9289, + "step": 406 + }, + { + "epoch": 0.3266613290632506, + "grad_norm": 0.7927196025848389, + "learning_rate": 1.273188548537736e-05, + "loss": 1.1356, + "step": 408 + }, + { + "epoch": 0.32826261008807045, + "grad_norm": 2.9997189044952393, + "learning_rate": 1.2785616330899676e-05, + "loss": 1.5024, + "step": 410 + }, + { + "epoch": 0.32986389111289033, + "grad_norm": 2.8885343074798584, + "learning_rate": 1.2839260130729776e-05, + "loss": 1.2104, + "step": 412 + }, + { + "epoch": 0.33146517213771015, + "grad_norm": 2.62939190864563, + "learning_rate": 1.2892815208591734e-05, + "loss": 1.5596, + "step": 414 + }, + { + "epoch": 0.33306645316253003, + "grad_norm": 1.4304401874542236, + "learning_rate": 1.2946279890981966e-05, + "loss": 1.5979, + "step": 416 + }, + { + "epoch": 0.33466773418734985, + "grad_norm": 5.8834381103515625, + "learning_rate": 1.2999652507221652e-05, + "loss": 1.2553, + "step": 418 + }, + { + "epoch": 0.33626901521216973, + "grad_norm": 0.5731202960014343, + "learning_rate": 1.3052931389508822e-05, + "loss": 2.6099, + "step": 420 + }, + { + "epoch": 0.3378702962369896, + "grad_norm": 1.268283724784851, + "learning_rate": 1.3106114872970575e-05, + "loss": 1.143, + "step": 422 + }, + { + "epoch": 0.33947157726180943, + "grad_norm": 5.157853126525879, + "learning_rate": 1.3159201295715054e-05, + "loss": 1.4265, + "step": 424 + }, + { + "epoch": 0.3410728582866293, + "grad_norm": 1.565788745880127, + "learning_rate": 1.321218899888334e-05, + "loss": 0.7864, + "step": 426 + }, + { + "epoch": 0.34267413931144913, + "grad_norm": 2.4202182292938232, + "learning_rate": 1.326507632670139e-05, + "loss": 2.0195, + "step": 428 + }, + { + "epoch": 0.344275420336269, + "grad_norm": 3.146437168121338, + "learning_rate": 1.3317861626531652e-05, + "loss": 2.1378, + "step": 430 + }, + { + "epoch": 0.3458767013610889, + "grad_norm": 2.037660598754883, + "learning_rate": 1.3370543248924826e-05, + "loss": 1.1159, + "step": 432 + }, + { + "epoch": 0.3474779823859087, + "grad_norm": 4.221460819244385, + "learning_rate": 1.3423119547671348e-05, + "loss": 1.5294, + "step": 434 + }, + { + "epoch": 0.3490792634107286, + "grad_norm": 1.992856740951538, + "learning_rate": 1.347558887985279e-05, + "loss": 0.9948, + "step": 436 + }, + { + "epoch": 0.3506805444355484, + "grad_norm": 1.0508719682693481, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.8378, + "step": 438 + }, + { + "epoch": 0.3522818254603683, + "grad_norm": 3.4117918014526367, + "learning_rate": 1.3580200089610739e-05, + "loss": 1.3983, + "step": 440 + }, + { + "epoch": 0.35388310648518817, + "grad_norm": 1.3608174324035645, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.8751, + "step": 442 + }, + { + "epoch": 0.355484387510008, + "grad_norm": 2.5038158893585205, + "learning_rate": 1.368436380262336e-05, + "loss": 1.8313, + "step": 444 + }, + { + "epoch": 0.35708566853482787, + "grad_norm": 2.5927035808563232, + "learning_rate": 1.3736273776982667e-05, + "loss": 1.1294, + "step": 446 + }, + { + "epoch": 0.3586869495596477, + "grad_norm": 2.293827533721924, + "learning_rate": 1.3788066999248893e-05, + "loss": 3.0626, + "step": 448 + }, + { + "epoch": 0.3602882305844676, + "grad_norm": 1.3796327114105225, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.471, + "step": 450 + }, + { + "epoch": 0.36188951160928745, + "grad_norm": 2.2750208377838135, + "learning_rate": 1.3891296717406533e-05, + "loss": 1.633, + "step": 452 + }, + { + "epoch": 0.3634907926341073, + "grad_norm": 3.9926955699920654, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.8867, + "step": 454 + }, + { + "epoch": 0.36509207365892715, + "grad_norm": 1.188187599182129, + "learning_rate": 1.3994040054196498e-05, + "loss": 1.8361, + "step": 456 + }, + { + "epoch": 0.366693354683747, + "grad_norm": 1.6645219326019287, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.8313, + "step": 458 + }, + { + "epoch": 0.36829463570856685, + "grad_norm": 5.094385623931885, + "learning_rate": 1.4096284167512856e-05, + "loss": 2.4849, + "step": 460 + }, + { + "epoch": 0.36989591673338673, + "grad_norm": 1.9351874589920044, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.824, + "step": 462 + }, + { + "epoch": 0.37149719775820655, + "grad_norm": 1.5214515924453735, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.8016, + "step": 464 + }, + { + "epoch": 0.37309847878302643, + "grad_norm": 3.019312620162964, + "learning_rate": 1.4248686355320922e-05, + "loss": 1.8661, + "step": 466 + }, + { + "epoch": 0.37469975980784626, + "grad_norm": 0.36004555225372314, + "learning_rate": 1.429922366889332e-05, + "loss": 0.5334, + "step": 468 + }, + { + "epoch": 0.37630104083266613, + "grad_norm": 5.063556671142578, + "learning_rate": 1.4349626639162231e-05, + "loss": 1.7588, + "step": 470 + }, + { + "epoch": 0.377902321857486, + "grad_norm": 0.3317326605319977, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.7093, + "step": 472 + }, + { + "epoch": 0.37950360288230583, + "grad_norm": 1.7341829538345337, + "learning_rate": 1.4450023254014185e-05, + "loss": 1.9285, + "step": 474 + }, + { + "epoch": 0.3811048839071257, + "grad_norm": 2.280532121658325, + "learning_rate": 1.4500013761376663e-05, + "loss": 2.2643, + "step": 476 + }, + { + "epoch": 0.38270616493194554, + "grad_norm": 1.4746766090393066, + "learning_rate": 1.454986365109255e-05, + "loss": 1.7954, + "step": 478 + }, + { + "epoch": 0.3843074459567654, + "grad_norm": 3.4241297245025635, + "learning_rate": 1.4599571365439027e-05, + "loss": 4.052, + "step": 480 + }, + { + "epoch": 0.3859087269815853, + "grad_norm": 1.9219751358032227, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.9748, + "step": 482 + }, + { + "epoch": 0.3875100080064051, + "grad_norm": 1.1262885332107544, + "learning_rate": 1.4698554059394563e-05, + "loss": 1.5131, + "step": 484 + }, + { + "epoch": 0.389111289031225, + "grad_norm": 3.08154034614563, + "learning_rate": 1.4747825945965675e-05, + "loss": 3.2738, + "step": 486 + }, + { + "epoch": 0.3907125700560448, + "grad_norm": 0.2941211462020874, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.3959, + "step": 488 + }, + { + "epoch": 0.3923138510808647, + "grad_norm": 2.031050205230713, + "learning_rate": 1.4845923100036479e-05, + "loss": 2.6678, + "step": 490 + }, + { + "epoch": 0.3939151321056846, + "grad_norm": 3.0186398029327393, + "learning_rate": 1.4894745302169786e-05, + "loss": 2.4335, + "step": 492 + }, + { + "epoch": 0.3955164131305044, + "grad_norm": 3.338667869567871, + "learning_rate": 1.4943414551978597e-05, + "loss": 1.6847, + "step": 494 + }, + { + "epoch": 0.3971176941553243, + "grad_norm": 1.5763025283813477, + "learning_rate": 1.499192932863305e-05, + "loss": 2.8577, + "step": 496 + }, + { + "epoch": 0.3987189751801441, + "grad_norm": 2.004448652267456, + "learning_rate": 1.5040288116130261e-05, + "loss": 1.8034, + "step": 498 + }, + { + "epoch": 0.400320256204964, + "grad_norm": 0.4262777268886566, + "learning_rate": 1.5088489403341793e-05, + "loss": 1.4654, + "step": 500 + }, + { + "epoch": 0.40192153722978385, + "grad_norm": 2.5623342990875244, + "learning_rate": 1.513653168406076e-05, + "loss": 2.0699, + "step": 502 + }, + { + "epoch": 0.4035228182546037, + "grad_norm": 3.4944663047790527, + "learning_rate": 1.5184413457049006e-05, + "loss": 1.1839, + "step": 504 + }, + { + "epoch": 0.40512409927942356, + "grad_norm": 0.8031488060951233, + "learning_rate": 1.5232133226083954e-05, + "loss": 1.6042, + "step": 506 + }, + { + "epoch": 0.4067253803042434, + "grad_norm": 2.332994222640991, + "learning_rate": 1.527968950000533e-05, + "loss": 1.1222, + "step": 508 + }, + { + "epoch": 0.40832666132906326, + "grad_norm": 0.6046788692474365, + "learning_rate": 1.532708079276185e-05, + "loss": 0.6048, + "step": 510 + }, + { + "epoch": 0.4099279423538831, + "grad_norm": 1.8540889024734497, + "learning_rate": 1.5374305623457594e-05, + "loss": 1.9467, + "step": 512 + }, + { + "epoch": 0.41152922337870296, + "grad_norm": 2.513918161392212, + "learning_rate": 1.542136251639826e-05, + "loss": 1.1665, + "step": 514 + }, + { + "epoch": 0.41313050440352284, + "grad_norm": 0.48221322894096375, + "learning_rate": 1.5468250001137368e-05, + "loss": 1.5951, + "step": 516 + }, + { + "epoch": 0.41473178542834266, + "grad_norm": 1.8153339624404907, + "learning_rate": 1.551496661252208e-05, + "loss": 0.8532, + "step": 518 + }, + { + "epoch": 0.41633306645316254, + "grad_norm": 0.8141906261444092, + "learning_rate": 1.5561510890739113e-05, + "loss": 1.9527, + "step": 520 + }, + { + "epoch": 0.41793434747798236, + "grad_norm": 0.6479558348655701, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.9495, + "step": 522 + }, + { + "epoch": 0.41953562850280224, + "grad_norm": 5.3584160804748535, + "learning_rate": 1.565407663538797e-05, + "loss": 2.3574, + "step": 524 + }, + { + "epoch": 0.4211369095276221, + "grad_norm": 3.2333738803863525, + "learning_rate": 1.5700095209300376e-05, + "loss": 1.7711, + "step": 526 + }, + { + "epoch": 0.42273819055244194, + "grad_norm": 2.7287609577178955, + "learning_rate": 1.5745935665096647e-05, + "loss": 1.7363, + "step": 528 + }, + { + "epoch": 0.4243394715772618, + "grad_norm": 2.4549694061279297, + "learning_rate": 1.5791596570341844e-05, + "loss": 2.3289, + "step": 530 + }, + { + "epoch": 0.42594075260208164, + "grad_norm": 4.72097635269165, + "learning_rate": 1.5837076498211666e-05, + "loss": 2.4264, + "step": 532 + }, + { + "epoch": 0.4275420336269015, + "grad_norm": 0.43421804904937744, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.3367, + "step": 534 + }, + { + "epoch": 0.4291433146517214, + "grad_norm": 0.8977329730987549, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.7395, + "step": 536 + }, + { + "epoch": 0.4307445956765412, + "grad_norm": 2.131248712539673, + "learning_rate": 1.5972416234420393e-05, + "loss": 2.2606, + "step": 538 + }, + { + "epoch": 0.4323458767013611, + "grad_norm": 0.8926573395729065, + "learning_rate": 1.60171580983152e-05, + "loss": 1.0258, + "step": 540 + }, + { + "epoch": 0.4339471577261809, + "grad_norm": 4.0784077644348145, + "learning_rate": 1.606171193642703e-05, + "loss": 1.2897, + "step": 542 + }, + { + "epoch": 0.4355484387510008, + "grad_norm": 4.184883117675781, + "learning_rate": 1.6106076356525474e-05, + "loss": 2.7588, + "step": 544 + }, + { + "epoch": 0.4371497197758207, + "grad_norm": 1.6611478328704834, + "learning_rate": 1.6150249972299153e-05, + "loss": 1.3995, + "step": 546 + }, + { + "epoch": 0.4387510008006405, + "grad_norm": 0.7476415634155273, + "learning_rate": 1.6194231403398994e-05, + "loss": 1.1153, + "step": 548 + }, + { + "epoch": 0.4403522818254604, + "grad_norm": 2.894113540649414, + "learning_rate": 1.6238019275481313e-05, + "loss": 1.164, + "step": 550 + }, + { + "epoch": 0.4419535628502802, + "grad_norm": 1.5848498344421387, + "learning_rate": 1.6281612220250883e-05, + "loss": 1.044, + "step": 552 + }, + { + "epoch": 0.4435548438751001, + "grad_norm": 1.015914797782898, + "learning_rate": 1.6325008875503543e-05, + "loss": 1.6644, + "step": 554 + }, + { + "epoch": 0.44515612489991996, + "grad_norm": 0.49447667598724365, + "learning_rate": 1.6368207885168897e-05, + "loss": 1.192, + "step": 556 + }, + { + "epoch": 0.4467574059247398, + "grad_norm": 2.5006227493286133, + "learning_rate": 1.641120789935263e-05, + "loss": 1.7414, + "step": 558 + }, + { + "epoch": 0.44835868694955966, + "grad_norm": 0.8820304274559021, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.3903, + "step": 560 + }, + { + "epoch": 0.4499599679743795, + "grad_norm": 2.4274849891662598, + "learning_rate": 1.6496605572831134e-05, + "loss": 1.4846, + "step": 562 + }, + { + "epoch": 0.45156124899919936, + "grad_norm": 1.9465970993041992, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.4608, + "step": 564 + }, + { + "epoch": 0.45316253002401924, + "grad_norm": 1.6413123607635498, + "learning_rate": 1.6581191221904077e-05, + "loss": 1.0824, + "step": 566 + }, + { + "epoch": 0.45476381104883906, + "grad_norm": 1.6077611446380615, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.9239, + "step": 568 + }, + { + "epoch": 0.45636509207365894, + "grad_norm": 1.9611300230026245, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.5095, + "step": 570 + }, + { + "epoch": 0.45796637309847876, + "grad_norm": 4.057272434234619, + "learning_rate": 1.6706524050403996e-05, + "loss": 1.3203, + "step": 572 + }, + { + "epoch": 0.45956765412329864, + "grad_norm": 1.3887745141983032, + "learning_rate": 1.674788425949818e-05, + "loss": 2.03, + "step": 574 + }, + { + "epoch": 0.4611689351481185, + "grad_norm": 0.3176191747188568, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.5056, + "step": 576 + }, + { + "epoch": 0.46277021617293834, + "grad_norm": 2.312699794769287, + "learning_rate": 1.6829970812705674e-05, + "loss": 2.0908, + "step": 578 + }, + { + "epoch": 0.4643714971977582, + "grad_norm": 1.5363367795944214, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.4155, + "step": 580 + }, + { + "epoch": 0.46597277822257804, + "grad_norm": 1.4750593900680542, + "learning_rate": 1.6911203673484577e-05, + "loss": 2.1354, + "step": 582 + }, + { + "epoch": 0.4675740592473979, + "grad_norm": 0.870273232460022, + "learning_rate": 1.695149679205214e-05, + "loss": 0.726, + "step": 584 + }, + { + "epoch": 0.4691753402722178, + "grad_norm": 1.4252557754516602, + "learning_rate": 1.6991572688368628e-05, + "loss": 1.1978, + "step": 586 + }, + { + "epoch": 0.4707766212970376, + "grad_norm": 2.378805160522461, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.8437, + "step": 588 + }, + { + "epoch": 0.4723779023218575, + "grad_norm": 4.3041534423828125, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.8981, + "step": 590 + }, + { + "epoch": 0.4739791833466773, + "grad_norm": 1.9830443859100342, + "learning_rate": 1.7110484554960738e-05, + "loss": 1.6152, + "step": 592 + }, + { + "epoch": 0.4755804643714972, + "grad_norm": 1.2421841621398926, + "learning_rate": 1.7149679107712306e-05, + "loss": 1.2704, + "step": 594 + }, + { + "epoch": 0.4771817453963171, + "grad_norm": 2.351569890975952, + "learning_rate": 1.7188650245358215e-05, + "loss": 2.1242, + "step": 596 + }, + { + "epoch": 0.4787830264211369, + "grad_norm": 2.8555033206939697, + "learning_rate": 1.722739675011779e-05, + "loss": 0.752, + "step": 598 + }, + { + "epoch": 0.4803843074459568, + "grad_norm": 2.7918388843536377, + "learning_rate": 1.726591741122981e-05, + "loss": 1.5606, + "step": 600 + }, + { + "epoch": 0.4819855884707766, + "grad_norm": 6.7934465408325195, + "learning_rate": 1.730421102499021e-05, + "loss": 1.7621, + "step": 602 + }, + { + "epoch": 0.4835868694955965, + "grad_norm": 1.8609873056411743, + "learning_rate": 1.734227639478982e-05, + "loss": 0.671, + "step": 604 + }, + { + "epoch": 0.4851881505204163, + "grad_norm": 3.692981719970703, + "learning_rate": 1.738011233115165e-05, + "loss": 1.4227, + "step": 606 + }, + { + "epoch": 0.4867894315452362, + "grad_norm": 3.535794734954834, + "learning_rate": 1.7417717651768144e-05, + "loss": 3.1441, + "step": 608 + }, + { + "epoch": 0.48839071257005606, + "grad_norm": 1.408652663230896, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.3714, + "step": 610 + }, + { + "epoch": 0.4899919935948759, + "grad_norm": 1.229390025138855, + "learning_rate": 1.74922317526033e-05, + "loss": 0.6312, + "step": 612 + }, + { + "epoch": 0.49159327461969576, + "grad_norm": 2.0757551193237305, + "learning_rate": 1.752913820438519e-05, + "loss": 1.0217, + "step": 614 + }, + { + "epoch": 0.4931945556445156, + "grad_norm": 2.3851053714752197, + "learning_rate": 1.756580938362096e-05, + "loss": 3.5756, + "step": 616 + }, + { + "epoch": 0.49479583666933546, + "grad_norm": 2.0853214263916016, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.8365, + "step": 618 + }, + { + "epoch": 0.49639711769415534, + "grad_norm": 0.9884747862815857, + "learning_rate": 1.7638441348198147e-05, + "loss": 1.4498, + "step": 620 + }, + { + "epoch": 0.49799839871897517, + "grad_norm": 4.900949478149414, + "learning_rate": 1.7674399863916295e-05, + "loss": 2.097, + "step": 622 + }, + { + "epoch": 0.49959967974379504, + "grad_norm": 4.958817481994629, + "learning_rate": 1.771011856791273e-05, + "loss": 1.9168, + "step": 624 + }, + { + "epoch": 0.5012009607686149, + "grad_norm": 2.2582879066467285, + "learning_rate": 1.7745596344039712e-05, + "loss": 1.0935, + "step": 626 + }, + { + "epoch": 0.5028022417934348, + "grad_norm": 1.6418002843856812, + "learning_rate": 1.7780832083678116e-05, + "loss": 2.6788, + "step": 628 + }, + { + "epoch": 0.5044035228182546, + "grad_norm": 3.227318048477173, + "learning_rate": 1.7815824685772035e-05, + "loss": 1.8182, + "step": 630 + }, + { + "epoch": 0.5060048038430744, + "grad_norm": 3.732487678527832, + "learning_rate": 1.7850573056863156e-05, + "loss": 2.4856, + "step": 632 + }, + { + "epoch": 0.5076060848678943, + "grad_norm": 5.898594379425049, + "learning_rate": 1.7885076111125004e-05, + "loss": 2.0315, + "step": 634 + }, + { + "epoch": 0.5092073658927142, + "grad_norm": 2.434490442276001, + "learning_rate": 1.791933277039679e-05, + "loss": 1.7327, + "step": 636 + }, + { + "epoch": 0.510808646917534, + "grad_norm": 3.772852659225464, + "learning_rate": 1.7953341964217183e-05, + "loss": 1.9029, + "step": 638 + }, + { + "epoch": 0.5124099279423538, + "grad_norm": 1.3970327377319336, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.8241, + "step": 640 + }, + { + "epoch": 0.5140112089671738, + "grad_norm": 0.4309978485107422, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.3671, + "step": 642 + }, + { + "epoch": 0.5156124899919936, + "grad_norm": 3.695401191711426, + "learning_rate": 1.805387416454847e-05, + "loss": 1.0777, + "step": 644 + }, + { + "epoch": 0.5172137710168134, + "grad_norm": 2.718665361404419, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.871, + "step": 646 + }, + { + "epoch": 0.5188150520416333, + "grad_norm": 1.1885088682174683, + "learning_rate": 1.811963902855447e-05, + "loss": 2.1, + "step": 648 + }, + { + "epoch": 0.5204163330664532, + "grad_norm": 2.7451171875, + "learning_rate": 1.8152141385329658e-05, + "loss": 1.4787, + "step": 650 + }, + { + "epoch": 0.522017614091273, + "grad_norm": 2.0423049926757812, + "learning_rate": 1.8184389001786895e-05, + "loss": 1.4898, + "step": 652 + }, + { + "epoch": 0.5236188951160928, + "grad_norm": 2.085296154022217, + "learning_rate": 1.821638087024396e-05, + "loss": 1.2659, + "step": 654 + }, + { + "epoch": 0.5252201761409128, + "grad_norm": 2.4013125896453857, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.8327, + "step": 656 + }, + { + "epoch": 0.5268214571657326, + "grad_norm": 0.8781247735023499, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.5121, + "step": 658 + }, + { + "epoch": 0.5284227381905524, + "grad_norm": 3.234022855758667, + "learning_rate": 1.8310812030854155e-05, + "loss": 2.1092, + "step": 660 + }, + { + "epoch": 0.5300240192153723, + "grad_norm": 0.7276309728622437, + "learning_rate": 1.834177099078887e-05, + "loss": 0.8579, + "step": 662 + }, + { + "epoch": 0.5316253002401922, + "grad_norm": 4.537621974945068, + "learning_rate": 1.8372469284808465e-05, + "loss": 1.9428, + "step": 664 + }, + { + "epoch": 0.533226581265012, + "grad_norm": 3.000288724899292, + "learning_rate": 1.840290595364436e-05, + "loss": 1.7011, + "step": 666 + }, + { + "epoch": 0.5348278622898318, + "grad_norm": 1.7386661767959595, + "learning_rate": 1.8433080046203286e-05, + "loss": 1.4665, + "step": 668 + }, + { + "epoch": 0.5364291433146517, + "grad_norm": 1.4105316400527954, + "learning_rate": 1.8462990619597054e-05, + "loss": 1.1828, + "step": 670 + }, + { + "epoch": 0.5380304243394716, + "grad_norm": 2.322204113006592, + "learning_rate": 1.8492636739171966e-05, + "loss": 1.3538, + "step": 672 + }, + { + "epoch": 0.5396317053642914, + "grad_norm": 5.358737945556641, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.7161, + "step": 674 + }, + { + "epoch": 0.5412329863891113, + "grad_norm": 4.409482955932617, + "learning_rate": 1.855113191959808e-05, + "loss": 3.5676, + "step": 676 + }, + { + "epoch": 0.5428342674139311, + "grad_norm": 0.7212360501289368, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.5143, + "step": 678 + }, + { + "epoch": 0.544435548438751, + "grad_norm": 2.1198341846466064, + "learning_rate": 1.8608558276045895e-05, + "loss": 1.0673, + "step": 680 + }, + { + "epoch": 0.5460368294635709, + "grad_norm": 2.353126049041748, + "learning_rate": 1.86368683969594e-05, + "loss": 1.9982, + "step": 682 + }, + { + "epoch": 0.5476381104883907, + "grad_norm": 4.079829216003418, + "learning_rate": 1.866490863067425e-05, + "loss": 1.0351, + "step": 684 + }, + { + "epoch": 0.5492393915132106, + "grad_norm": 1.9789396524429321, + "learning_rate": 1.8692678100981663e-05, + "loss": 1.2937, + "step": 686 + }, + { + "epoch": 0.5508406725380304, + "grad_norm": 5.303819179534912, + "learning_rate": 1.8720175940133705e-05, + "loss": 3.1657, + "step": 688 + }, + { + "epoch": 0.5524419535628503, + "grad_norm": 3.2883307933807373, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.9897, + "step": 690 + }, + { + "epoch": 0.5540432345876701, + "grad_norm": 0.7752354741096497, + "learning_rate": 1.877435329644691e-05, + "loss": 1.4486, + "step": 692 + }, + { + "epoch": 0.55564451561249, + "grad_norm": 2.7542550563812256, + "learning_rate": 1.8801031120659393e-05, + "loss": 2.3771, + "step": 694 + }, + { + "epoch": 0.5572457966373099, + "grad_norm": 0.49501386284828186, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.5669, + "step": 696 + }, + { + "epoch": 0.5588470776621297, + "grad_norm": 1.891086220741272, + "learning_rate": 1.8853560893042854e-05, + "loss": 1.1003, + "step": 698 + }, + { + "epoch": 0.5604483586869495, + "grad_norm": 1.205254077911377, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.8474, + "step": 700 + }, + { + "epoch": 0.5620496397117695, + "grad_norm": 1.1239306926727295, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.9103, + "step": 702 + }, + { + "epoch": 0.5636509207365893, + "grad_norm": 1.2536603212356567, + "learning_rate": 1.893027861533002e-05, + "loss": 1.1801, + "step": 704 + }, + { + "epoch": 0.5652522017614091, + "grad_norm": 2.2487826347351074, + "learning_rate": 1.8955294134685528e-05, + "loss": 1.5262, + "step": 706 + }, + { + "epoch": 0.5668534827862289, + "grad_norm": 1.1155579090118408, + "learning_rate": 1.898002981658886e-05, + "loss": 1.2277, + "step": 708 + }, + { + "epoch": 0.5684547638110489, + "grad_norm": 2.357337474822998, + "learning_rate": 1.9004484888092724e-05, + "loss": 1.5741, + "step": 710 + }, + { + "epoch": 0.5700560448358687, + "grad_norm": 2.2856945991516113, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.9087, + "step": 712 + }, + { + "epoch": 0.5716573258606885, + "grad_norm": 1.24591863155365, + "learning_rate": 1.9052550151979816e-05, + "loss": 2.0101, + "step": 714 + }, + { + "epoch": 0.5732586068855084, + "grad_norm": 2.9909167289733887, + "learning_rate": 1.9076158842406674e-05, + "loss": 1.4221, + "step": 716 + }, + { + "epoch": 0.5748598879103283, + "grad_norm": 2.6867928504943848, + "learning_rate": 1.909948391856829e-05, + "loss": 2.3942, + "step": 718 + }, + { + "epoch": 0.5764611689351481, + "grad_norm": 5.429558277130127, + "learning_rate": 1.912252465159637e-05, + "loss": 1.691, + "step": 720 + }, + { + "epoch": 0.578062449959968, + "grad_norm": 2.436601161956787, + "learning_rate": 1.9145280321507872e-05, + "loss": 2.1648, + "step": 722 + }, + { + "epoch": 0.5796637309847879, + "grad_norm": 1.4253017902374268, + "learning_rate": 1.9167750217227454e-05, + "loss": 1.8293, + "step": 724 + }, + { + "epoch": 0.5812650120096077, + "grad_norm": 1.38555109500885, + "learning_rate": 1.9189933636609747e-05, + "loss": 1.095, + "step": 726 + }, + { + "epoch": 0.5828662930344275, + "grad_norm": 3.646054983139038, + "learning_rate": 1.9211829886461274e-05, + "loss": 2.0621, + "step": 728 + }, + { + "epoch": 0.5844675740592474, + "grad_norm": 1.435380458831787, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.8903, + "step": 730 + }, + { + "epoch": 0.5860688550840673, + "grad_norm": 2.963207960128784, + "learning_rate": 1.925475814968719e-05, + "loss": 1.7339, + "step": 732 + }, + { + "epoch": 0.5876701361088871, + "grad_norm": 0.9102033376693726, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.3736, + "step": 734 + }, + { + "epoch": 0.589271417133707, + "grad_norm": 1.168823003768921, + "learning_rate": 1.9296529641211215e-05, + "loss": 1.1446, + "step": 736 + }, + { + "epoch": 0.5908726981585268, + "grad_norm": 5.4084577560424805, + "learning_rate": 1.9316979960323286e-05, + "loss": 2.5455, + "step": 738 + }, + { + "epoch": 0.5924739791833467, + "grad_norm": 1.742395281791687, + "learning_rate": 1.9337139139926707e-05, + "loss": 1.1524, + "step": 740 + }, + { + "epoch": 0.5940752602081665, + "grad_norm": 1.3539904356002808, + "learning_rate": 1.935700655008199e-05, + "loss": 0.5331, + "step": 742 + }, + { + "epoch": 0.5956765412329864, + "grad_norm": 2.786160707473755, + "learning_rate": 1.9376581569966933e-05, + "loss": 1.0714, + "step": 744 + }, + { + "epoch": 0.5972778222578062, + "grad_norm": 1.3455504179000854, + "learning_rate": 1.939586358789602e-05, + "loss": 1.0691, + "step": 746 + }, + { + "epoch": 0.5988791032826261, + "grad_norm": 2.0267562866210938, + "learning_rate": 1.9414852001339547e-05, + "loss": 1.6821, + "step": 748 + }, + { + "epoch": 0.600480384307446, + "grad_norm": 0.731253445148468, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.5054, + "step": 750 + }, + { + "epoch": 0.6020816653322658, + "grad_norm": 0.5526037812232971, + "learning_rate": 1.945194565054276e-05, + "loss": 0.5222, + "step": 752 + }, + { + "epoch": 0.6036829463570856, + "grad_norm": 2.0381815433502197, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.6461, + "step": 754 + }, + { + "epoch": 0.6052842273819056, + "grad_norm": 4.229750156402588, + "learning_rate": 1.948785788116329e-05, + "loss": 2.8607, + "step": 756 + }, + { + "epoch": 0.6068855084067254, + "grad_norm": 0.8832327723503113, + "learning_rate": 1.9505369555988395e-05, + "loss": 1.2586, + "step": 758 + }, + { + "epoch": 0.6084867894315452, + "grad_norm": 2.866272211074829, + "learning_rate": 1.952258420445583e-05, + "loss": 3.72, + "step": 760 + }, + { + "epoch": 0.610088070456365, + "grad_norm": 2.023226499557495, + "learning_rate": 1.953950128863762e-05, + "loss": 1.0199, + "step": 762 + }, + { + "epoch": 0.611689351481185, + "grad_norm": 2.645901918411255, + "learning_rate": 1.9556120279904144e-05, + "loss": 1.4684, + "step": 764 + }, + { + "epoch": 0.6132906325060048, + "grad_norm": 0.9625386595726013, + "learning_rate": 1.957244065894066e-05, + "loss": 1.4331, + "step": 766 + }, + { + "epoch": 0.6148919135308246, + "grad_norm": 1.47154700756073, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.7508, + "step": 768 + }, + { + "epoch": 0.6164931945556446, + "grad_norm": 2.7907140254974365, + "learning_rate": 1.9604183549736283e-05, + "loss": 2.2508, + "step": 770 + }, + { + "epoch": 0.6180944755804644, + "grad_norm": 4.152882099151611, + "learning_rate": 1.9619605069584954e-05, + "loss": 1.9617, + "step": 772 + }, + { + "epoch": 0.6196957566052842, + "grad_norm": 1.8030649423599243, + "learning_rate": 1.9634725993413744e-05, + "loss": 1.4689, + "step": 774 + }, + { + "epoch": 0.6212970376301041, + "grad_norm": 4.284875392913818, + "learning_rate": 1.964954584871995e-05, + "loss": 2.0722, + "step": 776 + }, + { + "epoch": 0.622898318654924, + "grad_norm": 1.956337571144104, + "learning_rate": 1.966406417240872e-05, + "loss": 0.8696, + "step": 778 + }, + { + "epoch": 0.6244995996797438, + "grad_norm": 0.6642113327980042, + "learning_rate": 1.967828051080755e-05, + "loss": 0.7165, + "step": 780 + }, + { + "epoch": 0.6261008807045636, + "grad_norm": 2.331721544265747, + "learning_rate": 1.969219441968046e-05, + "loss": 1.665, + "step": 782 + }, + { + "epoch": 0.6277021617293835, + "grad_norm": 3.7535288333892822, + "learning_rate": 1.9705805464241856e-05, + "loss": 2.3348, + "step": 784 + }, + { + "epoch": 0.6293034427542034, + "grad_norm": 0.9361191391944885, + "learning_rate": 1.971911321917015e-05, + "loss": 1.8891, + "step": 786 + }, + { + "epoch": 0.6309047237790232, + "grad_norm": 2.614335775375366, + "learning_rate": 1.9732117268621005e-05, + "loss": 1.822, + "step": 788 + }, + { + "epoch": 0.6325060048038431, + "grad_norm": 3.572840929031372, + "learning_rate": 1.9744817206240377e-05, + "loss": 2.3939, + "step": 790 + }, + { + "epoch": 0.6341072858286629, + "grad_norm": 0.3212428092956543, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.6811, + "step": 792 + }, + { + "epoch": 0.6357085668534828, + "grad_norm": 0.9476223587989807, + "learning_rate": 1.976930316809569e-05, + "loss": 2.5844, + "step": 794 + }, + { + "epoch": 0.6373098478783027, + "grad_norm": 2.15559983253479, + "learning_rate": 1.978108842718768e-05, + "loss": 1.2286, + "step": 796 + }, + { + "epoch": 0.6389111289031225, + "grad_norm": 2.636197090148926, + "learning_rate": 1.9792568044184176e-05, + "loss": 1.6526, + "step": 798 + }, + { + "epoch": 0.6405124099279423, + "grad_norm": 1.336948275566101, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.9959, + "step": 800 + }, + { + "epoch": 0.6421136909527622, + "grad_norm": 0.39179208874702454, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.8636, + "step": 802 + }, + { + "epoch": 0.6437149719775821, + "grad_norm": 2.1984870433807373, + "learning_rate": 1.9825169503239885e-05, + "loss": 1.9714, + "step": 804 + }, + { + "epoch": 0.6453162530024019, + "grad_norm": 1.1849243640899658, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.8658, + "step": 806 + }, + { + "epoch": 0.6469175340272217, + "grad_norm": 2.3895506858825684, + "learning_rate": 1.9845369277495102e-05, + "loss": 1.2911, + "step": 808 + }, + { + "epoch": 0.6485188150520417, + "grad_norm": 2.325873613357544, + "learning_rate": 1.985500784388244e-05, + "loss": 1.4898, + "step": 810 + }, + { + "epoch": 0.6501200960768615, + "grad_norm": 5.504713535308838, + "learning_rate": 1.9864338458320366e-05, + "loss": 1.9817, + "step": 812 + }, + { + "epoch": 0.6517213771016813, + "grad_norm": 2.4665093421936035, + "learning_rate": 1.9873360829243323e-05, + "loss": 1.9984, + "step": 814 + }, + { + "epoch": 0.6533226581265013, + "grad_norm": 1.3474812507629395, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.7775, + "step": 816 + }, + { + "epoch": 0.6549239391513211, + "grad_norm": 3.981353521347046, + "learning_rate": 1.989047972245129e-05, + "loss": 1.6013, + "step": 818 + }, + { + "epoch": 0.6565252201761409, + "grad_norm": 2.1101977825164795, + "learning_rate": 1.989857570980049e-05, + "loss": 1.3449, + "step": 820 + }, + { + "epoch": 0.6581265012009607, + "grad_norm": 3.8438498973846436, + "learning_rate": 1.9906362383779826e-05, + "loss": 1.2506, + "step": 822 + }, + { + "epoch": 0.6597277822257807, + "grad_norm": 2.2831003665924072, + "learning_rate": 1.9913839501069213e-05, + "loss": 3.32, + "step": 824 + }, + { + "epoch": 0.6613290632506005, + "grad_norm": 4.53866720199585, + "learning_rate": 1.9921006828021666e-05, + "loss": 2.6648, + "step": 826 + }, + { + "epoch": 0.6629303442754203, + "grad_norm": 5.285488605499268, + "learning_rate": 1.9927864140670615e-05, + "loss": 2.6368, + "step": 828 + }, + { + "epoch": 0.6645316253002402, + "grad_norm": 0.35420843958854675, + "learning_rate": 1.99344112247369e-05, + "loss": 0.6031, + "step": 830 + }, + { + "epoch": 0.6661329063250601, + "grad_norm": 1.3100470304489136, + "learning_rate": 1.9940647875635463e-05, + "loss": 1.2263, + "step": 832 + }, + { + "epoch": 0.6677341873498799, + "grad_norm": 1.6531809568405151, + "learning_rate": 1.994657389848176e-05, + "loss": 0.8481, + "step": 834 + }, + { + "epoch": 0.6693354683746997, + "grad_norm": 3.2566752433776855, + "learning_rate": 1.9952189108097825e-05, + "loss": 1.4428, + "step": 836 + }, + { + "epoch": 0.6709367493995196, + "grad_norm": 1.5921595096588135, + "learning_rate": 1.9957493329018064e-05, + "loss": 2.4036, + "step": 838 + }, + { + "epoch": 0.6725380304243395, + "grad_norm": 1.6074070930480957, + "learning_rate": 1.996248639549475e-05, + "loss": 0.7711, + "step": 840 + }, + { + "epoch": 0.6741393114491593, + "grad_norm": 2.341845750808716, + "learning_rate": 1.9967168151503196e-05, + "loss": 1.6511, + "step": 842 + }, + { + "epoch": 0.6757405924739792, + "grad_norm": 0.9461358189582825, + "learning_rate": 1.997153845074662e-05, + "loss": 0.6522, + "step": 844 + }, + { + "epoch": 0.677341873498799, + "grad_norm": 0.3312332332134247, + "learning_rate": 1.997559715666073e-05, + "loss": 1.2837, + "step": 846 + }, + { + "epoch": 0.6789431545236189, + "grad_norm": 1.8032020330429077, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.8973, + "step": 848 + }, + { + "epoch": 0.6805444355484388, + "grad_norm": 1.2481379508972168, + "learning_rate": 1.998277929093157e-05, + "loss": 2.0106, + "step": 850 + }, + { + "epoch": 0.6821457165732586, + "grad_norm": 0.4399324953556061, + "learning_rate": 1.9985902494859023e-05, + "loss": 1.0878, + "step": 852 + }, + { + "epoch": 0.6837469975980784, + "grad_norm": 3.643920660018921, + "learning_rate": 1.9988713656605635e-05, + "loss": 2.1515, + "step": 854 + }, + { + "epoch": 0.6853482786228983, + "grad_norm": 5.450771331787109, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.8124, + "step": 856 + }, + { + "epoch": 0.6869495596477182, + "grad_norm": 0.8681036829948425, + "learning_rate": 1.999339951193407e-05, + "loss": 1.354, + "step": 858 + }, + { + "epoch": 0.688550840672538, + "grad_norm": 0.7660692930221558, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.486, + "step": 860 + }, + { + "epoch": 0.6901521216973578, + "grad_norm": 2.1245641708374023, + "learning_rate": 1.999683627122195e-05, + "loss": 2.2678, + "step": 862 + }, + { + "epoch": 0.6917534027221778, + "grad_norm": 1.1392130851745605, + "learning_rate": 1.9998086099510433e-05, + "loss": 1.1203, + "step": 864 + }, + { + "epoch": 0.6933546837469976, + "grad_norm": 0.7254119515419006, + "learning_rate": 1.99990235049015e-05, + "loss": 0.8125, + "step": 866 + }, + { + "epoch": 0.6949559647718174, + "grad_norm": 1.287118673324585, + "learning_rate": 1.999964845810285e-05, + "loss": 0.9746, + "step": 868 + }, + { + "epoch": 0.6965572457966374, + "grad_norm": 1.4788893461227417, + "learning_rate": 1.999996093958578e-05, + "loss": 1.2155, + "step": 870 + }, + { + "epoch": 0.6981585268214572, + "grad_norm": 2.152543306350708, + "learning_rate": 1.999996093958578e-05, + "loss": 1.0889, + "step": 872 + }, + { + "epoch": 0.699759807846277, + "grad_norm": 2.767014265060425, + "learning_rate": 1.999964845810285e-05, + "loss": 2.3735, + "step": 874 + }, + { + "epoch": 0.7013610888710968, + "grad_norm": 2.791797399520874, + "learning_rate": 1.99990235049015e-05, + "loss": 2.2684, + "step": 876 + }, + { + "epoch": 0.7029623698959168, + "grad_norm": 2.071749687194824, + "learning_rate": 1.9998086099510433e-05, + "loss": 1.3278, + "step": 878 + }, + { + "epoch": 0.7045636509207366, + "grad_norm": 1.5336110591888428, + "learning_rate": 1.999683627122195e-05, + "loss": 0.8617, + "step": 880 + }, + { + "epoch": 0.7061649319455564, + "grad_norm": 3.690253973007202, + "learning_rate": 1.999527405909102e-05, + "loss": 2.1353, + "step": 882 + }, + { + "epoch": 0.7077662129703763, + "grad_norm": 2.26058030128479, + "learning_rate": 1.999339951193407e-05, + "loss": 1.9523, + "step": 884 + }, + { + "epoch": 0.7093674939951962, + "grad_norm": 1.0966075658798218, + "learning_rate": 1.9991212688327456e-05, + "loss": 1.4842, + "step": 886 + }, + { + "epoch": 0.710968775020016, + "grad_norm": 1.3473819494247437, + "learning_rate": 1.9988713656605635e-05, + "loss": 1.5067, + "step": 888 + }, + { + "epoch": 0.7125700560448359, + "grad_norm": 0.6934193968772888, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.7411, + "step": 890 + }, + { + "epoch": 0.7141713370696557, + "grad_norm": 1.1892766952514648, + "learning_rate": 1.9982779290931572e-05, + "loss": 1.1759, + "step": 892 + }, + { + "epoch": 0.7157726180944756, + "grad_norm": 2.9387874603271484, + "learning_rate": 1.997934414241799e-05, + "loss": 1.6974, + "step": 894 + }, + { + "epoch": 0.7173738991192954, + "grad_norm": 2.657370090484619, + "learning_rate": 1.997559715666073e-05, + "loss": 1.6777, + "step": 896 + }, + { + "epoch": 0.7189751801441153, + "grad_norm": 1.060289978981018, + "learning_rate": 1.997153845074662e-05, + "loss": 1.9854, + "step": 898 + }, + { + "epoch": 0.7205764611689351, + "grad_norm": 1.2988049983978271, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.9668, + "step": 900 + }, + { + "epoch": 0.722177742193755, + "grad_norm": 1.7357770204544067, + "learning_rate": 1.9962486395494753e-05, + "loss": 1.7262, + "step": 902 + }, + { + "epoch": 0.7237790232185749, + "grad_norm": 1.118236780166626, + "learning_rate": 1.9957493329018064e-05, + "loss": 1.6294, + "step": 904 + }, + { + "epoch": 0.7253803042433947, + "grad_norm": 2.844278335571289, + "learning_rate": 1.9952189108097825e-05, + "loss": 1.2228, + "step": 906 + }, + { + "epoch": 0.7269815852682145, + "grad_norm": 0.5361149907112122, + "learning_rate": 1.994657389848176e-05, + "loss": 0.7294, + "step": 908 + }, + { + "epoch": 0.7285828662930345, + "grad_norm": 1.333326816558838, + "learning_rate": 1.9940647875635466e-05, + "loss": 2.5786, + "step": 910 + }, + { + "epoch": 0.7301841473178543, + "grad_norm": 1.7993069887161255, + "learning_rate": 1.99344112247369e-05, + "loss": 2.5186, + "step": 912 + }, + { + "epoch": 0.7317854283426741, + "grad_norm": 0.4585472345352173, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.5684, + "step": 914 + }, + { + "epoch": 0.733386709367494, + "grad_norm": 1.6229790449142456, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.6657, + "step": 916 + }, + { + "epoch": 0.7349879903923139, + "grad_norm": 0.9887677431106567, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.7117, + "step": 918 + }, + { + "epoch": 0.7365892714171337, + "grad_norm": 1.2062684297561646, + "learning_rate": 1.9906362383779826e-05, + "loss": 1.1431, + "step": 920 + }, + { + "epoch": 0.7381905524419535, + "grad_norm": 1.1369714736938477, + "learning_rate": 1.989857570980049e-05, + "loss": 0.5017, + "step": 922 + }, + { + "epoch": 0.7397918334667735, + "grad_norm": 1.2134912014007568, + "learning_rate": 1.9890479722451292e-05, + "loss": 1.0635, + "step": 924 + }, + { + "epoch": 0.7413931144915933, + "grad_norm": 3.4467785358428955, + "learning_rate": 1.9882074674717832e-05, + "loss": 2.5341, + "step": 926 + }, + { + "epoch": 0.7429943955164131, + "grad_norm": 0.6437131762504578, + "learning_rate": 1.987336082924333e-05, + "loss": 0.2482, + "step": 928 + }, + { + "epoch": 0.7445956765412329, + "grad_norm": 1.5937007665634155, + "learning_rate": 1.986433845832037e-05, + "loss": 0.4926, + "step": 930 + }, + { + "epoch": 0.7461969575660529, + "grad_norm": 3.0766074657440186, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.6416, + "step": 932 + }, + { + "epoch": 0.7477982385908727, + "grad_norm": 3.5493216514587402, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.9301, + "step": 934 + }, + { + "epoch": 0.7493995196156925, + "grad_norm": 0.4958154559135437, + "learning_rate": 1.9835423060346892e-05, + "loss": 2.626, + "step": 936 + }, + { + "epoch": 0.7510008006405124, + "grad_norm": 1.6603699922561646, + "learning_rate": 1.9825169503239885e-05, + "loss": 2.5931, + "step": 938 + }, + { + "epoch": 0.7526020816653323, + "grad_norm": 0.1832837015390396, + "learning_rate": 1.9814608926580007e-05, + "loss": 1.5933, + "step": 940 + }, + { + "epoch": 0.7542033626901521, + "grad_norm": 1.436362624168396, + "learning_rate": 1.9803741660367018e-05, + "loss": 1.1324, + "step": 942 + }, + { + "epoch": 0.755804643714972, + "grad_norm": 3.406855344772339, + "learning_rate": 1.979256804418418e-05, + "loss": 1.0107, + "step": 944 + }, + { + "epoch": 0.7574059247397918, + "grad_norm": 1.956744909286499, + "learning_rate": 1.9781088427187677e-05, + "loss": 1.8213, + "step": 946 + }, + { + "epoch": 0.7590072057646117, + "grad_norm": 0.38227635622024536, + "learning_rate": 1.976930316809569e-05, + "loss": 0.6661, + "step": 948 + }, + { + "epoch": 0.7606084867894315, + "grad_norm": 2.606114149093628, + "learning_rate": 1.9757212635177177e-05, + "loss": 2.446, + "step": 950 + }, + { + "epoch": 0.7622097678142514, + "grad_norm": 1.2398097515106201, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.7108, + "step": 952 + }, + { + "epoch": 0.7638110488390712, + "grad_norm": 3.048835515975952, + "learning_rate": 1.9732117268621005e-05, + "loss": 3.1212, + "step": 954 + }, + { + "epoch": 0.7654123298638911, + "grad_norm": 1.2856189012527466, + "learning_rate": 1.9719113219170152e-05, + "loss": 1.1183, + "step": 956 + }, + { + "epoch": 0.767013610888711, + "grad_norm": 1.9659767150878906, + "learning_rate": 1.970580546424186e-05, + "loss": 1.2716, + "step": 958 + }, + { + "epoch": 0.7686148919135308, + "grad_norm": 2.8084945678710938, + "learning_rate": 1.9692194419680463e-05, + "loss": 1.4138, + "step": 960 + }, + { + "epoch": 0.7702161729383507, + "grad_norm": 1.6146398782730103, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.6401, + "step": 962 + }, + { + "epoch": 0.7718174539631706, + "grad_norm": 4.153204441070557, + "learning_rate": 1.966406417240872e-05, + "loss": 1.7414, + "step": 964 + }, + { + "epoch": 0.7734187349879904, + "grad_norm": 3.3813881874084473, + "learning_rate": 1.964954584871995e-05, + "loss": 0.8718, + "step": 966 + }, + { + "epoch": 0.7750200160128102, + "grad_norm": 0.6088300943374634, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.3886, + "step": 968 + }, + { + "epoch": 0.77662129703763, + "grad_norm": 1.7553712129592896, + "learning_rate": 1.9619605069584954e-05, + "loss": 1.3022, + "step": 970 + }, + { + "epoch": 0.77822257806245, + "grad_norm": 1.6024192571640015, + "learning_rate": 1.9604183549736287e-05, + "loss": 1.9707, + "step": 972 + }, + { + "epoch": 0.7798238590872698, + "grad_norm": 3.1089975833892822, + "learning_rate": 1.958846191576357e-05, + "loss": 1.7008, + "step": 974 + }, + { + "epoch": 0.7814251401120896, + "grad_norm": 0.6323603987693787, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.6215, + "step": 976 + }, + { + "epoch": 0.7830264211369096, + "grad_norm": 4.609311580657959, + "learning_rate": 1.955612027990415e-05, + "loss": 3.1875, + "step": 978 + }, + { + "epoch": 0.7846277021617294, + "grad_norm": 3.6928672790527344, + "learning_rate": 1.953950128863763e-05, + "loss": 2.0424, + "step": 980 + }, + { + "epoch": 0.7862289831865492, + "grad_norm": 0.40719327330589294, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.4345, + "step": 982 + }, + { + "epoch": 0.7878302642113691, + "grad_norm": 2.6689980030059814, + "learning_rate": 1.9505369555988395e-05, + "loss": 2.4913, + "step": 984 + }, + { + "epoch": 0.789431545236189, + "grad_norm": 2.223402261734009, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.947, + "step": 986 + }, + { + "epoch": 0.7910328262610088, + "grad_norm": 1.066835641860962, + "learning_rate": 1.947004972719008e-05, + "loss": 0.6195, + "step": 988 + }, + { + "epoch": 0.7926341072858286, + "grad_norm": 1.5345313549041748, + "learning_rate": 1.945194565054276e-05, + "loss": 0.3377, + "step": 990 + }, + { + "epoch": 0.7942353883106485, + "grad_norm": 0.6744644045829773, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.3679, + "step": 992 + }, + { + "epoch": 0.7958366693354684, + "grad_norm": 1.9812852144241333, + "learning_rate": 1.941485200133955e-05, + "loss": 3.6173, + "step": 994 + }, + { + "epoch": 0.7974379503602882, + "grad_norm": 1.1819018125534058, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.5277, + "step": 996 + }, + { + "epoch": 0.7990392313851081, + "grad_norm": 2.227170467376709, + "learning_rate": 1.937658156996694e-05, + "loss": 0.6576, + "step": 998 + }, + { + "epoch": 0.800640512409928, + "grad_norm": 1.2199251651763916, + "learning_rate": 1.9357006550082e-05, + "loss": 0.4044, + "step": 1000 + }, + { + "epoch": 0.8022417934347478, + "grad_norm": 1.6109410524368286, + "learning_rate": 1.933713913992671e-05, + "loss": 1.4925, + "step": 1002 + }, + { + "epoch": 0.8038430744595677, + "grad_norm": 0.9988458156585693, + "learning_rate": 1.9316979960323283e-05, + "loss": 1.4707, + "step": 1004 + }, + { + "epoch": 0.8054443554843875, + "grad_norm": 1.4009785652160645, + "learning_rate": 1.9296529641211226e-05, + "loss": 3.4313, + "step": 1006 + }, + { + "epoch": 0.8070456365092074, + "grad_norm": 1.2766797542572021, + "learning_rate": 1.9275788821627607e-05, + "loss": 1.4285, + "step": 1008 + }, + { + "epoch": 0.8086469175340272, + "grad_norm": 1.4580941200256348, + "learning_rate": 1.9254758149687187e-05, + "loss": 1.6398, + "step": 1010 + }, + { + "epoch": 0.8102481985588471, + "grad_norm": 1.8317885398864746, + "learning_rate": 1.9233438282562095e-05, + "loss": 2.2435, + "step": 1012 + }, + { + "epoch": 0.8118494795836669, + "grad_norm": 0.8780755400657654, + "learning_rate": 1.9211829886461278e-05, + "loss": 1.4584, + "step": 1014 + }, + { + "epoch": 0.8134507606084868, + "grad_norm": 0.5499489307403564, + "learning_rate": 1.918993363660975e-05, + "loss": 1.8556, + "step": 1016 + }, + { + "epoch": 0.8150520416333067, + "grad_norm": 1.1738747358322144, + "learning_rate": 1.916775021722745e-05, + "loss": 1.1079, + "step": 1018 + }, + { + "epoch": 0.8166533226581265, + "grad_norm": 1.5663732290267944, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.8542, + "step": 1020 + }, + { + "epoch": 0.8182546036829463, + "grad_norm": 0.9821288585662842, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.6408, + "step": 1022 + }, + { + "epoch": 0.8198558847077662, + "grad_norm": 1.2327334880828857, + "learning_rate": 1.9099483918568287e-05, + "loss": 1.6681, + "step": 1024 + }, + { + "epoch": 0.8214571657325861, + "grad_norm": 0.3509677052497864, + "learning_rate": 1.907615884240668e-05, + "loss": 0.7164, + "step": 1026 + }, + { + "epoch": 0.8230584467574059, + "grad_norm": 3.0025110244750977, + "learning_rate": 1.905255015197982e-05, + "loss": 2.0905, + "step": 1028 + }, + { + "epoch": 0.8246597277822257, + "grad_norm": 0.5793729424476624, + "learning_rate": 1.902865858501845e-05, + "loss": 0.6869, + "step": 1030 + }, + { + "epoch": 0.8262610088070457, + "grad_norm": 1.3046590089797974, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.7577, + "step": 1032 + }, + { + "epoch": 0.8278622898318655, + "grad_norm": 0.17382295429706573, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.278, + "step": 1034 + }, + { + "epoch": 0.8294635708566853, + "grad_norm": 3.0656626224517822, + "learning_rate": 1.8955294134685528e-05, + "loss": 1.2479, + "step": 1036 + }, + { + "epoch": 0.8310648518815053, + "grad_norm": 0.8955541849136353, + "learning_rate": 1.893027861533003e-05, + "loss": 0.3533, + "step": 1038 + }, + { + "epoch": 0.8326661329063251, + "grad_norm": 1.5759841203689575, + "learning_rate": 1.8904984040214043e-05, + "loss": 1.19, + "step": 1040 + }, + { + "epoch": 0.8342674139311449, + "grad_norm": 0.9483829736709595, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.4845, + "step": 1042 + }, + { + "epoch": 0.8358686949559647, + "grad_norm": 1.961287021636963, + "learning_rate": 1.885356089304285e-05, + "loss": 1.9476, + "step": 1044 + }, + { + "epoch": 0.8374699759807847, + "grad_norm": 1.0029722452163696, + "learning_rate": 1.882743392787207e-05, + "loss": 1.6232, + "step": 1046 + }, + { + "epoch": 0.8390712570056045, + "grad_norm": 1.791336178779602, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.7585, + "step": 1048 + }, + { + "epoch": 0.8406725380304243, + "grad_norm": 1.724076747894287, + "learning_rate": 1.877435329644691e-05, + "loss": 1.0636, + "step": 1050 + }, + { + "epoch": 0.8422738190552442, + "grad_norm": 3.0383310317993164, + "learning_rate": 1.8747401288870482e-05, + "loss": 1.3502, + "step": 1052 + }, + { + "epoch": 0.8438751000800641, + "grad_norm": 4.471200466156006, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.5245, + "step": 1054 + }, + { + "epoch": 0.8454763811048839, + "grad_norm": 1.7018201351165771, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.9002, + "step": 1056 + }, + { + "epoch": 0.8470776621297038, + "grad_norm": 2.207733154296875, + "learning_rate": 1.8664908630674264e-05, + "loss": 1.2915, + "step": 1058 + }, + { + "epoch": 0.8486789431545236, + "grad_norm": 1.3572278022766113, + "learning_rate": 1.8636868396959406e-05, + "loss": 1.5454, + "step": 1060 + }, + { + "epoch": 0.8502802241793435, + "grad_norm": 0.8189003467559814, + "learning_rate": 1.8608558276045898e-05, + "loss": 2.0655, + "step": 1062 + }, + { + "epoch": 0.8518815052041633, + "grad_norm": 3.0063459873199463, + "learning_rate": 1.8579979152576076e-05, + "loss": 1.853, + "step": 1064 + }, + { + "epoch": 0.8534827862289832, + "grad_norm": 1.480734944343567, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.8587, + "step": 1066 + }, + { + "epoch": 0.855084067253803, + "grad_norm": 1.7147406339645386, + "learning_rate": 1.852201747853807e-05, + "loss": 0.8151, + "step": 1068 + }, + { + "epoch": 0.8566853482786229, + "grad_norm": 3.3855082988739014, + "learning_rate": 1.849263673917196e-05, + "loss": 1.2009, + "step": 1070 + }, + { + "epoch": 0.8582866293034428, + "grad_norm": 3.6384215354919434, + "learning_rate": 1.846299061959706e-05, + "loss": 1.4375, + "step": 1072 + }, + { + "epoch": 0.8598879103282626, + "grad_norm": 1.070793628692627, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.9616, + "step": 1074 + }, + { + "epoch": 0.8614891913530824, + "grad_norm": 0.20211243629455566, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.957, + "step": 1076 + }, + { + "epoch": 0.8630904723779024, + "grad_norm": 0.2896943986415863, + "learning_rate": 1.837246928480848e-05, + "loss": 0.9789, + "step": 1078 + }, + { + "epoch": 0.8646917534027222, + "grad_norm": 3.5619606971740723, + "learning_rate": 1.8341770990788874e-05, + "loss": 3.093, + "step": 1080 + }, + { + "epoch": 0.866293034427542, + "grad_norm": 1.1735012531280518, + "learning_rate": 1.831081203085415e-05, + "loss": 0.7008, + "step": 1082 + }, + { + "epoch": 0.8678943154523618, + "grad_norm": 2.6949212551116943, + "learning_rate": 1.8279593372418284e-05, + "loss": 1.2928, + "step": 1084 + }, + { + "epoch": 0.8694955964771818, + "grad_norm": 4.361700057983398, + "learning_rate": 1.8248115991010303e-05, + "loss": 2.0229, + "step": 1086 + }, + { + "epoch": 0.8710968775020016, + "grad_norm": 2.845186233520508, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.9723, + "step": 1088 + }, + { + "epoch": 0.8726981585268214, + "grad_norm": 2.3665802478790283, + "learning_rate": 1.8184389001786912e-05, + "loss": 3.8003, + "step": 1090 + }, + { + "epoch": 0.8742994395516414, + "grad_norm": 3.5755226612091064, + "learning_rate": 1.815214138532966e-05, + "loss": 2.2496, + "step": 1092 + }, + { + "epoch": 0.8759007205764612, + "grad_norm": 1.8449146747589111, + "learning_rate": 1.8119639028554475e-05, + "loss": 1.5114, + "step": 1094 + }, + { + "epoch": 0.877502001601281, + "grad_norm": 1.6307106018066406, + "learning_rate": 1.808688294710378e-05, + "loss": 0.8907, + "step": 1096 + }, + { + "epoch": 0.8791032826261009, + "grad_norm": 1.6523268222808838, + "learning_rate": 1.805387416454849e-05, + "loss": 2.3874, + "step": 1098 + }, + { + "epoch": 0.8807045636509208, + "grad_norm": 1.6842433214187622, + "learning_rate": 1.802061371235592e-05, + "loss": 1.7253, + "step": 1100 + }, + { + "epoch": 0.8823058446757406, + "grad_norm": 1.1021060943603516, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.7522, + "step": 1102 + }, + { + "epoch": 0.8839071257005604, + "grad_norm": 1.309314489364624, + "learning_rate": 1.7953341964217196e-05, + "loss": 1.4418, + "step": 1104 + }, + { + "epoch": 0.8855084067253803, + "grad_norm": 2.3494319915771484, + "learning_rate": 1.7919332770396798e-05, + "loss": 1.7823, + "step": 1106 + }, + { + "epoch": 0.8871096877502002, + "grad_norm": 0.13610607385635376, + "learning_rate": 1.7885076111125e-05, + "loss": 0.385, + "step": 1108 + }, + { + "epoch": 0.88871096877502, + "grad_norm": 0.7458411455154419, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.7959, + "step": 1110 + }, + { + "epoch": 0.8903122497998399, + "grad_norm": 4.37985372543335, + "learning_rate": 1.7815824685772042e-05, + "loss": 1.8678, + "step": 1112 + }, + { + "epoch": 0.8919135308246597, + "grad_norm": 4.481252670288086, + "learning_rate": 1.7780832083678122e-05, + "loss": 2.0782, + "step": 1114 + }, + { + "epoch": 0.8935148118494796, + "grad_norm": 1.2357103824615479, + "learning_rate": 1.774559634403971e-05, + "loss": 0.7552, + "step": 1116 + }, + { + "epoch": 0.8951160928742994, + "grad_norm": 2.341662645339966, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.963, + "step": 1118 + }, + { + "epoch": 0.8967173738991193, + "grad_norm": 1.8337247371673584, + "learning_rate": 1.7674399863916298e-05, + "loss": 2.3347, + "step": 1120 + }, + { + "epoch": 0.8983186549239391, + "grad_norm": 1.715006947517395, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.7229, + "step": 1122 + }, + { + "epoch": 0.899919935948759, + "grad_norm": 0.9324997067451477, + "learning_rate": 1.7602244144399713e-05, + "loss": 1.2044, + "step": 1124 + }, + { + "epoch": 0.9015212169735789, + "grad_norm": 2.5102286338806152, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.6813, + "step": 1126 + }, + { + "epoch": 0.9031224979983987, + "grad_norm": 1.3197435140609741, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.3799, + "step": 1128 + }, + { + "epoch": 0.9047237790232185, + "grad_norm": 1.1094192266464233, + "learning_rate": 1.7492231752603305e-05, + "loss": 2.2801, + "step": 1130 + }, + { + "epoch": 0.9063250600480385, + "grad_norm": 2.6152217388153076, + "learning_rate": 1.7455091181538094e-05, + "loss": 1.6864, + "step": 1132 + }, + { + "epoch": 0.9079263410728583, + "grad_norm": 1.592564582824707, + "learning_rate": 1.741771765176815e-05, + "loss": 0.6739, + "step": 1134 + }, + { + "epoch": 0.9095276220976781, + "grad_norm": 1.1827375888824463, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.4611, + "step": 1136 + }, + { + "epoch": 0.911128903122498, + "grad_norm": 1.323564052581787, + "learning_rate": 1.7342276394789825e-05, + "loss": 1.0661, + "step": 1138 + }, + { + "epoch": 0.9127301841473179, + "grad_norm": 1.8881070613861084, + "learning_rate": 1.7304211024990216e-05, + "loss": 1.8429, + "step": 1140 + }, + { + "epoch": 0.9143314651721377, + "grad_norm": 0.7184663414955139, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.4275, + "step": 1142 + }, + { + "epoch": 0.9159327461969575, + "grad_norm": 0.9552716612815857, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.9257, + "step": 1144 + }, + { + "epoch": 0.9175340272217775, + "grad_norm": 1.8908990621566772, + "learning_rate": 1.718865024535822e-05, + "loss": 2.5411, + "step": 1146 + }, + { + "epoch": 0.9191353082465973, + "grad_norm": 3.1544816493988037, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.5273, + "step": 1148 + }, + { + "epoch": 0.9207365892714171, + "grad_norm": 3.0787792205810547, + "learning_rate": 1.711048455496075e-05, + "loss": 1.7463, + "step": 1150 + }, + { + "epoch": 0.922337870296237, + "grad_norm": 2.047178030014038, + "learning_rate": 1.7071067811865474e-05, + "loss": 3.7262, + "step": 1152 + }, + { + "epoch": 0.9239391513210569, + "grad_norm": 5.097822666168213, + "learning_rate": 1.7031430110131566e-05, + "loss": 2.6919, + "step": 1154 + }, + { + "epoch": 0.9255404323458767, + "grad_norm": 1.8718602657318115, + "learning_rate": 1.699157268836863e-05, + "loss": 0.6918, + "step": 1156 + }, + { + "epoch": 0.9271417133706965, + "grad_norm": 2.106215715408325, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.8796, + "step": 1158 + }, + { + "epoch": 0.9287429943955164, + "grad_norm": 2.3310933113098145, + "learning_rate": 1.6911203673484583e-05, + "loss": 1.2808, + "step": 1160 + }, + { + "epoch": 0.9303442754203363, + "grad_norm": 0.39920124411582947, + "learning_rate": 1.687069459175619e-05, + "loss": 0.6156, + "step": 1162 + }, + { + "epoch": 0.9319455564451561, + "grad_norm": 1.9218827486038208, + "learning_rate": 1.682997081270568e-05, + "loss": 1.635, + "step": 1164 + }, + { + "epoch": 0.933546837469976, + "grad_norm": 3.0548582077026367, + "learning_rate": 1.6789033608880742e-05, + "loss": 2.2977, + "step": 1166 + }, + { + "epoch": 0.9351481184947958, + "grad_norm": 1.488222599029541, + "learning_rate": 1.6747884259498185e-05, + "loss": 1.146, + "step": 1168 + }, + { + "epoch": 0.9367493995196157, + "grad_norm": 1.0275205373764038, + "learning_rate": 1.6706524050404006e-05, + "loss": 1.7765, + "step": 1170 + }, + { + "epoch": 0.9383506805444356, + "grad_norm": 4.316395282745361, + "learning_rate": 1.6664954274033175e-05, + "loss": 1.9326, + "step": 1172 + }, + { + "epoch": 0.9399519615692554, + "grad_norm": 1.3081533908843994, + "learning_rate": 1.662317622936933e-05, + "loss": 1.0873, + "step": 1174 + }, + { + "epoch": 0.9415532425940752, + "grad_norm": 1.107600450515747, + "learning_rate": 1.6581191221904098e-05, + "loss": 1.5214, + "step": 1176 + }, + { + "epoch": 0.9431545236188951, + "grad_norm": 2.542297601699829, + "learning_rate": 1.6539000563596328e-05, + "loss": 1.8884, + "step": 1178 + }, + { + "epoch": 0.944755804643715, + "grad_norm": 0.9735599160194397, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.6504, + "step": 1180 + }, + { + "epoch": 0.9463570856685348, + "grad_norm": 2.821681261062622, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.9374, + "step": 1182 + }, + { + "epoch": 0.9479583666933546, + "grad_norm": 0.8494756817817688, + "learning_rate": 1.6411207899352633e-05, + "loss": 1.7512, + "step": 1184 + }, + { + "epoch": 0.9495596477181746, + "grad_norm": 1.416666030883789, + "learning_rate": 1.6368207885168904e-05, + "loss": 1.2768, + "step": 1186 + }, + { + "epoch": 0.9511609287429944, + "grad_norm": 0.4646993577480316, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.4575, + "step": 1188 + }, + { + "epoch": 0.9527622097678142, + "grad_norm": 2.510261058807373, + "learning_rate": 1.628161222025089e-05, + "loss": 1.5016, + "step": 1190 + }, + { + "epoch": 0.9543634907926342, + "grad_norm": 0.8265005946159363, + "learning_rate": 1.623801927548132e-05, + "loss": 1.108, + "step": 1192 + }, + { + "epoch": 0.955964771817454, + "grad_norm": 2.0374512672424316, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.2825, + "step": 1194 + }, + { + "epoch": 0.9575660528422738, + "grad_norm": 1.8480101823806763, + "learning_rate": 1.6150249972299173e-05, + "loss": 1.0921, + "step": 1196 + }, + { + "epoch": 0.9591673338670936, + "grad_norm": 3.3500051498413086, + "learning_rate": 1.6106076356525484e-05, + "loss": 2.3045, + "step": 1198 + }, + { + "epoch": 0.9607686148919136, + "grad_norm": 3.349045515060425, + "learning_rate": 1.6061711936427028e-05, + "loss": 1.3409, + "step": 1200 + }, + { + "epoch": 0.9623698959167334, + "grad_norm": 0.42878904938697815, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.213, + "step": 1202 + }, + { + "epoch": 0.9639711769415532, + "grad_norm": 2.919360876083374, + "learning_rate": 1.5972416234420404e-05, + "loss": 2.8156, + "step": 1204 + }, + { + "epoch": 0.9655724579663731, + "grad_norm": 1.9368078708648682, + "learning_rate": 1.592748774284844e-05, + "loss": 1.1759, + "step": 1206 + }, + { + "epoch": 0.967173738991193, + "grad_norm": 1.7241026163101196, + "learning_rate": 1.588237402753703e-05, + "loss": 0.7364, + "step": 1208 + }, + { + "epoch": 0.9687750200160128, + "grad_norm": 2.972261905670166, + "learning_rate": 1.5837076498211673e-05, + "loss": 3.3012, + "step": 1210 + }, + { + "epoch": 0.9703763010408326, + "grad_norm": 0.6597610116004944, + "learning_rate": 1.579159657034185e-05, + "loss": 0.5629, + "step": 1212 + }, + { + "epoch": 0.9719775820656525, + "grad_norm": 0.17026177048683167, + "learning_rate": 1.574593566509664e-05, + "loss": 0.4184, + "step": 1214 + }, + { + "epoch": 0.9735788630904724, + "grad_norm": 0.3944058418273926, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.8405, + "step": 1216 + }, + { + "epoch": 0.9751801441152922, + "grad_norm": 0.5041757822036743, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.3268, + "step": 1218 + }, + { + "epoch": 0.9767814251401121, + "grad_norm": 1.95888352394104, + "learning_rate": 1.560788138136029e-05, + "loss": 3.4891, + "step": 1220 + }, + { + "epoch": 0.978382706164932, + "grad_norm": 1.3303838968276978, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.6043, + "step": 1222 + }, + { + "epoch": 0.9799839871897518, + "grad_norm": 0.6112059354782104, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.2274, + "step": 1224 + }, + { + "epoch": 0.9815852682145717, + "grad_norm": 0.6413119435310364, + "learning_rate": 1.546825000113736e-05, + "loss": 1.0371, + "step": 1226 + }, + { + "epoch": 0.9831865492393915, + "grad_norm": 0.9354526996612549, + "learning_rate": 1.5421362516398285e-05, + "loss": 1.1677, + "step": 1228 + }, + { + "epoch": 0.9847878302642114, + "grad_norm": 0.488077312707901, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.6415, + "step": 1230 + }, + { + "epoch": 0.9863891112890312, + "grad_norm": 2.2682902812957764, + "learning_rate": 1.532708079276186e-05, + "loss": 2.4064, + "step": 1232 + }, + { + "epoch": 0.9879903923138511, + "grad_norm": 0.8012260794639587, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.6979, + "step": 1234 + }, + { + "epoch": 0.9895916733386709, + "grad_norm": 1.2622216939926147, + "learning_rate": 1.5232133226083962e-05, + "loss": 1.4162, + "step": 1236 + }, + { + "epoch": 0.9911929543634908, + "grad_norm": 0.12676040828227997, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.5408, + "step": 1238 + }, + { + "epoch": 0.9927942353883107, + "grad_norm": 3.4280080795288086, + "learning_rate": 1.5136531684060753e-05, + "loss": 1.3191, + "step": 1240 + }, + { + "epoch": 0.9943955164131305, + "grad_norm": 0.633600652217865, + "learning_rate": 1.50884894033418e-05, + "loss": 0.6289, + "step": 1242 + }, + { + "epoch": 0.9959967974379503, + "grad_norm": 0.7680506706237793, + "learning_rate": 1.504028811613027e-05, + "loss": 0.9093, + "step": 1244 + }, + { + "epoch": 0.9975980784627703, + "grad_norm": 0.24087411165237427, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.3544, + "step": 1246 + }, + { + "epoch": 0.9991993594875901, + "grad_norm": 1.9583964347839355, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.5386, + "step": 1248 + }, + { + "epoch": 1.0, + "step": 1249, + "total_flos": 1.2470350948335616e+17, + "train_loss": 1.270330325043421, + "train_runtime": 15264.486, + "train_samples_per_second": 1.309, + "train_steps_per_second": 0.082 + } + ], + "logging_steps": 2, + "max_steps": 1249, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1.2470350948335616e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..00d9e8a9f3377fc13bbb0a29b675b875a4ecf0c6 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:340821c29229557d377226cf30956c6e1654b039893778ab23accc8aca709dfc +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f65de876fd671d56adc198aa45e2417fc1e4853b --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:217ea74af8ef6be261c68f979bb0e4ef2d6e2871c8d4fe004c9f468c373acdd6 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4cb8379ae656c98715e95b10d154fc1d50f634ca --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:317a79515913c1f37c211243c876350e9c156ef015ee4fa07cf3c6c9eda16c63 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..4de38be3821ee24c4c309c8e64acc61f8a80a344 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_nosampling_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f28a8a6db8cffff62a57f6316bb039be8efb77f728a6e84fe7cf27561ddc815 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..11ecb15c54dfd0c347567effa1d12158832737b4 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,3776 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1249, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.1514, + "step": 2 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.1172, + "step": 4 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.139, + "step": 6 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.0545, + "step": 8 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.6831, + "step": 10 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.1763, + "step": 12 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.1585, + "step": 14 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.0462, + "step": 16 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.2059, + "step": 18 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.0941, + "step": 20 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.1517, + "step": 22 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.1519, + "step": 24 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.1036, + "step": 26 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.1438, + "step": 28 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.4228, + "step": 30 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.1722, + "step": 32 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.0769, + "step": 34 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.4029, + "step": 36 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.1135, + "step": 38 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.185, + "step": 40 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.153, + "step": 42 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.1267, + "step": 44 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.1423, + "step": 46 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.2389, + "step": 48 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.2257, + "step": 50 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.1678, + "step": 52 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.1747, + "step": 54 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.1452, + "step": 56 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.0932, + "step": 58 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.1592, + "step": 60 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.1281, + "step": 62 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.2977, + "step": 64 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.5645, + "step": 66 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.1164, + "step": 68 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.3493, + "step": 70 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.1356, + "step": 72 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.0712, + "step": 74 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.4313, + "step": 76 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.2522, + "step": 78 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.2175, + "step": 80 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.0657, + "step": 82 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.2479, + "step": 84 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.1743, + "step": 86 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.6238, + "step": 88 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.0513, + "step": 90 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.0817, + "step": 92 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.0745, + "step": 94 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.2478, + "step": 96 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.3022, + "step": 98 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.283, + "step": 100 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.0965, + "step": 102 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.2106, + "step": 104 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.0995, + "step": 106 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.1367, + "step": 108 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.0185, + "step": 110 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.0643, + "step": 112 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.4434, + "step": 114 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.19, + "step": 116 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.668, + "step": 118 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.111, + "step": 120 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.0766, + "step": 122 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.0974, + "step": 124 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.1732, + "step": 126 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.1209, + "step": 128 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.6952, + "step": 130 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.1867, + "step": 132 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.1691, + "step": 134 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.1333, + "step": 136 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.1076, + "step": 138 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.1348, + "step": 140 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.1674, + "step": 142 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.0837, + "step": 144 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.052, + "step": 146 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.0444, + "step": 148 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.015, + "step": 150 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.0531, + "step": 152 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.1197, + "step": 154 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.2765, + "step": 156 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.2867, + "step": 158 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.1788, + "step": 160 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.2799, + "step": 162 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.1319, + "step": 164 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.3217, + "step": 166 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.1849, + "step": 168 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.4212, + "step": 170 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.1075, + "step": 172 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.1672, + "step": 174 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.4658, + "step": 176 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.1964, + "step": 178 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.042, + "step": 180 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.0147, + "step": 182 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.1999, + "step": 184 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.8296, + "step": 186 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.1899, + "step": 188 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.27, + "step": 190 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.0206, + "step": 192 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.305, + "step": 194 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.1658, + "step": 196 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.1486, + "step": 198 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.1772, + "step": 200 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.4549, + "step": 202 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.2341, + "step": 204 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.0896, + "step": 206 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.1866, + "step": 208 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.7071, + "step": 210 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.1683, + "step": 212 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.0287, + "step": 214 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.1243, + "step": 216 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.204, + "step": 218 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.2267, + "step": 220 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.3241, + "step": 222 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.492, + "step": 224 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.0819, + "step": 226 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.1381, + "step": 228 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.0647, + "step": 230 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.1587, + "step": 232 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.0265, + "step": 234 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.4234, + "step": 236 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.2822, + "step": 238 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.2843, + "step": 240 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.5846, + "step": 242 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.227, + "step": 244 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.225, + "step": 246 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.1251, + "step": 248 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.2108, + "step": 250 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.0656, + "step": 252 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.5503, + "step": 254 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.1978, + "step": 256 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.0869, + "step": 258 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.1127, + "step": 260 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.1418, + "step": 262 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.2559, + "step": 264 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.092, + "step": 266 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.109, + "step": 268 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.2023, + "step": 270 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.1366, + "step": 272 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.124, + "step": 274 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.6121, + "step": 276 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.1334, + "step": 278 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.2663, + "step": 280 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.0988, + "step": 282 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.409, + "step": 284 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.2903, + "step": 286 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 1.129, + "step": 288 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.0543, + "step": 290 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.37, + "step": 292 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.6854, + "step": 294 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.5659, + "step": 296 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.2975, + "step": 298 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.2951, + "step": 300 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.5071, + "step": 302 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.0566, + "step": 304 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.1737, + "step": 306 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.2336, + "step": 308 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.3613, + "step": 310 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.521, + "step": 312 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.1591, + "step": 314 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.2951, + "step": 316 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.0635, + "step": 318 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.1459, + "step": 320 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.156, + "step": 322 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.3462, + "step": 324 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.1901, + "step": 326 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.2888, + "step": 328 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.2525, + "step": 330 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.2064, + "step": 332 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.318, + "step": 334 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.3186, + "step": 336 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.3997, + "step": 338 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.1775, + "step": 340 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.2227, + "step": 342 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.1278, + "step": 344 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.2362, + "step": 346 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.2938, + "step": 348 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.3197, + "step": 350 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.111, + "step": 352 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.1264, + "step": 354 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.2787, + "step": 356 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.2473, + "step": 358 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.3255, + "step": 360 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.239, + "step": 362 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.5546, + "step": 364 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.0541, + "step": 366 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.1592, + "step": 368 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.2305, + "step": 370 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.1104, + "step": 372 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.1474, + "step": 374 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.1575, + "step": 376 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.21, + "step": 378 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.0702, + "step": 380 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.1996, + "step": 382 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.2965, + "step": 384 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.1831, + "step": 386 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.1602, + "step": 388 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.0055, + "step": 390 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.7346, + "step": 392 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.043, + "step": 394 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.0878, + "step": 396 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.769, + "step": 398 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.1737, + "step": 400 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.2405, + "step": 402 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.4884, + "step": 404 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.194, + "step": 406 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.3149, + "step": 408 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 1.8402, + "step": 410 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.1369, + "step": 412 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.5522, + "step": 414 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.2254, + "step": 416 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.243, + "step": 418 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.2589, + "step": 420 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.189, + "step": 422 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.5865, + "step": 424 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.1693, + "step": 426 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.4265, + "step": 428 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.3939, + "step": 430 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.4309, + "step": 432 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.2518, + "step": 434 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.3826, + "step": 436 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.1508, + "step": 438 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.4003, + "step": 440 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.2644, + "step": 442 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.3196, + "step": 444 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.1834, + "step": 446 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.2493, + "step": 448 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.1866, + "step": 450 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.1911, + "step": 452 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.3398, + "step": 454 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.4, + "step": 456 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.1742, + "step": 458 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.161, + "step": 460 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.1911, + "step": 462 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.1084, + "step": 464 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.3494, + "step": 466 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.4, + "step": 468 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.3706, + "step": 470 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.4302, + "step": 472 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.2218, + "step": 474 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.2239, + "step": 476 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.1917, + "step": 478 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.514, + "step": 480 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.0929, + "step": 482 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.2467, + "step": 484 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.2526, + "step": 486 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.3333, + "step": 488 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.2936, + "step": 490 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.168, + "step": 492 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.2293, + "step": 494 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.2809, + "step": 496 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.4036, + "step": 498 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.3494, + "step": 500 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.1981, + "step": 502 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.4441, + "step": 504 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.1469, + "step": 506 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.3933, + "step": 508 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.1119, + "step": 510 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.4375, + "step": 512 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.1107, + "step": 514 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.1726, + "step": 516 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.3434, + "step": 518 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.3449, + "step": 520 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.1956, + "step": 522 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.2114, + "step": 524 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.2248, + "step": 526 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.1809, + "step": 528 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.7246, + "step": 530 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.4036, + "step": 532 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.1076, + "step": 534 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.676, + "step": 536 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.5394, + "step": 538 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.2514, + "step": 540 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.2542, + "step": 542 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.4332, + "step": 544 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.1449, + "step": 546 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.417, + "step": 548 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.2424, + "step": 550 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.242, + "step": 552 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.2277, + "step": 554 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.3083, + "step": 556 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.3959, + "step": 558 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.3832, + "step": 560 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.365, + "step": 562 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.2909, + "step": 564 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.1024, + "step": 566 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.2723, + "step": 568 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.3812, + "step": 570 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.0368, + "step": 572 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.3394, + "step": 574 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.0958, + "step": 576 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.1656, + "step": 578 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.4163, + "step": 580 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.0417, + "step": 582 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.7376, + "step": 584 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.3798, + "step": 586 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.4544, + "step": 588 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.3427, + "step": 590 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.0882, + "step": 592 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.5429, + "step": 594 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.0116, + "step": 596 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.35, + "step": 598 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.2514, + "step": 600 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.4055, + "step": 602 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.4334, + "step": 604 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.2647, + "step": 606 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.2909, + "step": 608 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.0694, + "step": 610 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.0212, + "step": 612 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.1282, + "step": 614 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.2675, + "step": 616 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.5806, + "step": 618 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.0669, + "step": 620 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.5326, + "step": 622 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.058, + "step": 624 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.1398, + "step": 626 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.1384, + "step": 628 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.3157, + "step": 630 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.1699, + "step": 632 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.5987, + "step": 634 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.3425, + "step": 636 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.0768, + "step": 638 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.0989, + "step": 640 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.129, + "step": 642 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.252, + "step": 644 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.2271, + "step": 646 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.253, + "step": 648 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.3181, + "step": 650 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.1894, + "step": 652 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.1544, + "step": 654 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.2476, + "step": 656 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.4412, + "step": 658 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.0098, + "step": 660 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.1315, + "step": 662 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.1228, + "step": 664 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.9322, + "step": 666 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.3805, + "step": 668 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.1881, + "step": 670 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.3647, + "step": 672 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.3042, + "step": 674 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.4389, + "step": 676 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.1327, + "step": 678 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.2536, + "step": 680 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.1594, + "step": 682 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.1688, + "step": 684 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.0977, + "step": 686 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.4264, + "step": 688 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.0024, + "step": 690 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.2463, + "step": 692 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.1728, + "step": 694 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.3265, + "step": 696 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.3823, + "step": 698 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.1635, + "step": 700 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.2391, + "step": 702 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.0641, + "step": 704 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.0634, + "step": 706 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.3219, + "step": 708 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.2285, + "step": 710 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.2359, + "step": 712 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.0369, + "step": 714 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.024, + "step": 716 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.0789, + "step": 718 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.4336, + "step": 720 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.3042, + "step": 722 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.245, + "step": 724 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.0515, + "step": 726 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.2889, + "step": 728 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.0882, + "step": 730 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.0002, + "step": 732 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.0721, + "step": 734 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.01, + "step": 736 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.021, + "step": 738 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.5328, + "step": 740 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.4535, + "step": 742 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.2055, + "step": 744 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.0065, + "step": 746 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.7148, + "step": 748 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.0204, + "step": 750 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.8355, + "step": 752 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.5139, + "step": 754 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.0002, + "step": 756 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.2042, + "step": 758 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.0885, + "step": 760 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.0436, + "step": 762 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.5328, + "step": 764 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.2262, + "step": 766 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.3054, + "step": 768 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.3508, + "step": 770 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.5364, + "step": 772 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.1168, + "step": 774 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.4526, + "step": 776 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.2059, + "step": 778 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.2176, + "step": 780 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.0018, + "step": 782 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.1679, + "step": 784 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.1927, + "step": 786 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.4173, + "step": 788 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.3615, + "step": 790 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.1433, + "step": 792 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.2274, + "step": 794 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.4554, + "step": 796 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.309, + "step": 798 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.125, + "step": 800 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.2541, + "step": 802 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.4126, + "step": 804 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.2168, + "step": 806 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.2062, + "step": 808 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.3649, + "step": 810 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.3101, + "step": 812 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.0315, + "step": 814 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.2392, + "step": 816 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.0814, + "step": 818 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.1961, + "step": 820 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.0243, + "step": 822 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.3149, + "step": 824 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.0862, + "step": 826 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.2755, + "step": 828 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.2877, + "step": 830 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.2791, + "step": 832 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.1761, + "step": 834 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.2275, + "step": 836 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.0233, + "step": 838 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.2653, + "step": 840 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.3435, + "step": 842 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.0327, + "step": 844 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.2651, + "step": 846 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.3056, + "step": 848 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.1792, + "step": 850 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.1139, + "step": 852 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.548, + "step": 854 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.1276, + "step": 856 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.0769, + "step": 858 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.2463, + "step": 860 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.5896, + "step": 862 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.0092, + "step": 864 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.3945, + "step": 866 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.3901, + "step": 868 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.0916, + "step": 870 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.2292, + "step": 872 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.1592, + "step": 874 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.4008, + "step": 876 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.1462, + "step": 878 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.0401, + "step": 880 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.1604, + "step": 882 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.1657, + "step": 884 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.2581, + "step": 886 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0265, + "step": 888 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.0464, + "step": 890 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.0836, + "step": 892 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.0439, + "step": 894 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0822, + "step": 896 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.2426, + "step": 898 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.0023, + "step": 900 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.114, + "step": 902 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.532, + "step": 904 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.4572, + "step": 906 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.3566, + "step": 908 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.205, + "step": 910 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.4722, + "step": 912 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.021, + "step": 914 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.1926, + "step": 916 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.2962, + "step": 918 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.0869, + "step": 920 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.0743, + "step": 922 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.3135, + "step": 924 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 1.2724, + "step": 926 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.0381, + "step": 928 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.6571, + "step": 930 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.1995, + "step": 932 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.1977, + "step": 934 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.3571, + "step": 936 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.1503, + "step": 938 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.2454, + "step": 940 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.2311, + "step": 942 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.0518, + "step": 944 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.3416, + "step": 946 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.0772, + "step": 948 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.4751, + "step": 950 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.4526, + "step": 952 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.3096, + "step": 954 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.2254, + "step": 956 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.1723, + "step": 958 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.6533, + "step": 960 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.281, + "step": 962 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.399, + "step": 964 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.228, + "step": 966 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.5152, + "step": 968 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.1713, + "step": 970 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.2849, + "step": 972 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.2035, + "step": 974 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.0795, + "step": 976 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.177, + "step": 978 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.2624, + "step": 980 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.2466, + "step": 982 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.1225, + "step": 984 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.1407, + "step": 986 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.5051, + "step": 988 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.1115, + "step": 990 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.0047, + "step": 992 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.4239, + "step": 994 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.4847, + "step": 996 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.3656, + "step": 998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.1793, + "step": 1000 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.1453, + "step": 1002 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.7052, + "step": 1004 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.1244, + "step": 1006 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.374, + "step": 1008 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.3294, + "step": 1010 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.0984, + "step": 1012 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.3529, + "step": 1014 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.1602, + "step": 1016 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.1997, + "step": 1018 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.5403, + "step": 1020 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.0639, + "step": 1022 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.1201, + "step": 1024 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.0731, + "step": 1026 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.1547, + "step": 1028 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.1453, + "step": 1030 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.3729, + "step": 1032 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.2768, + "step": 1034 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.205, + "step": 1036 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.9087, + "step": 1038 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.1169, + "step": 1040 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.629, + "step": 1042 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.323, + "step": 1044 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.1265, + "step": 1046 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.0609, + "step": 1048 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.2982, + "step": 1050 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.2404, + "step": 1052 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.2915, + "step": 1054 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.0284, + "step": 1056 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.4157, + "step": 1058 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.2116, + "step": 1060 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.1604, + "step": 1062 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.196, + "step": 1064 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.1915, + "step": 1066 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.2589, + "step": 1068 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.3002, + "step": 1070 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.1926, + "step": 1072 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.4974, + "step": 1074 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.3675, + "step": 1076 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.1309, + "step": 1078 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.1405, + "step": 1080 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.088, + "step": 1082 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.3894, + "step": 1084 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.1894, + "step": 1086 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.1194, + "step": 1088 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.4904, + "step": 1090 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.3006, + "step": 1092 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.0661, + "step": 1094 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.4522, + "step": 1096 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 1.0015, + "step": 1098 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.0829, + "step": 1100 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.0227, + "step": 1102 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.0373, + "step": 1104 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.3724, + "step": 1106 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.0994, + "step": 1108 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.7036, + "step": 1110 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.1373, + "step": 1112 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.1676, + "step": 1114 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.6738, + "step": 1116 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.1152, + "step": 1118 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.265, + "step": 1120 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.3219, + "step": 1122 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.7533, + "step": 1124 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.3681, + "step": 1126 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.438, + "step": 1128 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.1269, + "step": 1130 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.246, + "step": 1132 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.3602, + "step": 1134 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.3644, + "step": 1136 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.1733, + "step": 1138 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.3662, + "step": 1140 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.2163, + "step": 1142 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.0666, + "step": 1144 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.5219, + "step": 1146 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.459, + "step": 1148 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.4002, + "step": 1150 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.2244, + "step": 1152 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.3854, + "step": 1154 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.1912, + "step": 1156 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.3037, + "step": 1158 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.3624, + "step": 1160 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.2524, + "step": 1162 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.2356, + "step": 1164 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.4152, + "step": 1166 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.3348, + "step": 1168 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.2385, + "step": 1170 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.2656, + "step": 1172 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.1864, + "step": 1174 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.0389, + "step": 1176 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.3571, + "step": 1178 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.2997, + "step": 1180 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.0873, + "step": 1182 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.3684, + "step": 1184 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.8584, + "step": 1186 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.0825, + "step": 1188 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.3448, + "step": 1190 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.0754, + "step": 1192 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.2999, + "step": 1194 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.0235, + "step": 1196 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.3246, + "step": 1198 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.3452, + "step": 1200 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.1659, + "step": 1202 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.1711, + "step": 1204 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.2315, + "step": 1206 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.3712, + "step": 1208 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.2155, + "step": 1210 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.1708, + "step": 1212 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.218, + "step": 1214 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.3061, + "step": 1216 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.3219, + "step": 1218 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.2066, + "step": 1220 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.0483, + "step": 1222 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.338, + "step": 1224 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.1178, + "step": 1226 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.0207, + "step": 1228 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.0918, + "step": 1230 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.4725, + "step": 1232 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.1675, + "step": 1234 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.0769, + "step": 1236 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.1771, + "step": 1238 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.1841, + "step": 1240 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.1286, + "step": 1242 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.1521, + "step": 1244 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.0993, + "step": 1246 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 1.0781, + "step": 1248 + }, + { + "epoch": 1.0, + "step": 1249, + "total_flos": 7716239844573184.0, + "train_loss": 0.25484217935285286, + "train_runtime": 2013.0633, + "train_samples_per_second": 9.927, + "train_steps_per_second": 0.62 + } + ], + "logging_steps": 2, + "max_steps": 1249, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 7716239844573184.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e648d5e4ddd541fdd21978cd66a1c42997dc726 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:869e2e62bc69ef9c3492bbcedbd2ab2239d0c34a87e1be44dea1be9a5549a443 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4233af0aa4a7debbf2a745f1028b2073eb2ea512 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dbd491d2a6d2cfdd487bbd2499573b31c5a4366c7ea2b890d67aab5efdcca87 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6dbbed756984ae50b2fac195c81efe2a8c143719 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8369f289e19c0d10b1b825b4ded821e564f892cacc4eef187078f79bc56b740d +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..43b5cea004c1722445bdecc643b6a660bbb0d36c --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0ef5c39ebe1bb987f228f5bccd58021a45d7193f451c8f3bf7de1fc398d6e5c +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6025478b3bec60311c264b70d834225d3af06f74 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/0_trainer_state.json @@ -0,0 +1,7526 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2498, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.415943612351265e-06, + "loss": 0.2467, + "step": 2 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.0573, + "step": 4 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.1694, + "step": 6 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.2059, + "step": 8 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4892856843445236e-06, + "loss": 0.2581, + "step": 10 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.0719, + "step": 12 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.3515, + "step": 14 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.0693, + "step": 16 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5635665363297356e-06, + "loss": 0.1825, + "step": 18 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.1901, + "step": 20 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.6459, + "step": 22 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.1767, + "step": 24 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.6387768837868565e-06, + "loss": 0.4669, + "step": 26 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.1539, + "step": 28 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.1946, + "step": 30 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.2724, + "step": 32 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.0462, + "step": 34 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.0152, + "step": 36 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.1016, + "step": 38 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.048, + "step": 40 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.7919483473136555e-06, + "loss": 0.1376, + "step": 42 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.034, + "step": 44 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.1538, + "step": 46 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.1127, + "step": 48 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.8698903181597026e-06, + "loss": 0.175, + "step": 50 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.0457, + "step": 52 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.909196119613218e-06, + "loss": 0.4392, + "step": 54 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.0658, + "step": 56 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9487234964233724e-06, + "loss": 0.1111, + "step": 58 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.0388, + "step": 60 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.988471213428035e-06, + "loss": 0.1192, + "step": 62 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.3528, + "step": 64 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0284380285797733e-06, + "loss": 0.1812, + "step": 66 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.0245, + "step": 68 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.068622692984767e-06, + "loss": 0.1994, + "step": 70 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 0.0103, + "step": 72 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.0254, + "step": 74 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.0651, + "step": 76 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.0946, + "step": 78 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.0301, + "step": 80 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1904711909051967e-06, + "loss": 0.4394, + "step": 82 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.1513, + "step": 84 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.231514627826302e-06, + "loss": 0.112, + "step": 86 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.0642, + "step": 88 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2727695682081897e-06, + "loss": 0.1087, + "step": 90 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.3814, + "step": 92 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.314234722905302e-06, + "loss": 0.0535, + "step": 94 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.3592, + "step": 96 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.355908796203301e-06, + "loss": 0.0818, + "step": 98 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.1316, + "step": 100 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.1642, + "step": 102 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.1075, + "step": 104 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4398784831434097e-06, + "loss": 0.3196, + "step": 106 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.1469, + "step": 108 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.1458, + "step": 110 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.0183, + "step": 112 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.1096, + "step": 114 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.2373, + "step": 116 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.567367137003953e-06, + "loss": 0.9613, + "step": 118 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.1215, + "step": 120 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.6102671491780393e-06, + "loss": 0.2998, + "step": 122 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.0343, + "step": 124 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.653366829451711e-06, + "loss": 0.256, + "step": 126 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.0248, + "step": 128 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.696664831034521e-06, + "loss": 0.1913, + "step": 130 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.1421, + "step": 132 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.740159800938784e-06, + "loss": 0.0973, + "step": 134 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.008, + "step": 136 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.783850380021933e-06, + "loss": 0.1147, + "step": 138 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.6627, + "step": 140 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.827735203028956e-06, + "loss": 0.2326, + "step": 142 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 0.0308, + "step": 144 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.871812898635011e-06, + "loss": 0.0637, + "step": 146 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.1526, + "step": 148 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.916082089488379e-06, + "loss": 0.0016, + "step": 150 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.0926, + "step": 152 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.960541392253387e-06, + "loss": 0.1669, + "step": 154 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.2158, + "step": 156 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 4.005189417653737e-06, + "loss": 0.148, + "step": 158 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.0577, + "step": 160 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.050024770515873e-06, + "loss": 0.1165, + "step": 162 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.1286, + "step": 164 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.095046049812541e-06, + "loss": 0.1151, + "step": 166 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.0402, + "step": 168 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.1402518487066624e-06, + "loss": 0.2551, + "step": 170 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.4012, + "step": 172 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.2387, + "step": 174 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.024, + "step": 176 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.2312113491533145e-06, + "loss": 0.1358, + "step": 178 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.0094, + "step": 180 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.276962208378814e-06, + "loss": 0.1583, + "step": 182 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.1238, + "step": 184 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3228919026364345e-06, + "loss": 0.4382, + "step": 186 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.3528, + "step": 188 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.368998996702686e-06, + "loss": 0.3852, + "step": 190 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.0947, + "step": 192 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.415282049810643e-06, + "loss": 0.0537, + "step": 194 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.0766, + "step": 196 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.461739615694921e-06, + "loss": 0.0362, + "step": 198 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.4005, + "step": 200 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.5083702426369715e-06, + "loss": 0.0108, + "step": 202 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.3322, + "step": 204 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.555172473510324e-06, + "loss": 0.3022, + "step": 206 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.1124, + "step": 208 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.602144845826234e-06, + "loss": 0.5849, + "step": 210 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.0004, + "step": 212 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.649285891779326e-06, + "loss": 0.3145, + "step": 214 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.2548, + "step": 216 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.696594138293421e-06, + "loss": 0.079, + "step": 218 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.4485, + "step": 220 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.744068107067673e-06, + "loss": 0.1557, + "step": 222 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.1359, + "step": 224 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.79170631462264e-06, + "loss": 0.3935, + "step": 226 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.1582, + "step": 228 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.839507272346751e-06, + "loss": 0.1028, + "step": 230 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.0717, + "step": 232 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.3559, + "step": 234 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.1527, + "step": 236 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.935591458474425e-06, + "loss": 0.1192, + "step": 238 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.2082, + "step": 240 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9838716844133665e-06, + "loss": 0.0934, + "step": 242 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.7068, + "step": 244 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.032308655686007e-06, + "loss": 0.1441, + "step": 246 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.2424, + "step": 248 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.080900858720789e-06, + "loss": 0.562, + "step": 250 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.2064, + "step": 252 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.129646775095432e-06, + "loss": 0.1076, + "step": 254 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.3241, + "step": 256 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.178544881584328e-06, + "loss": 0.1282, + "step": 258 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.2354, + "step": 260 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.2725, + "step": 262 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.096, + "step": 264 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.2767915482720164e-06, + "loss": 0.2634, + "step": 266 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.2161, + "step": 268 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.0682, + "step": 270 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.0461, + "step": 272 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.375628578726181e-06, + "loss": 0.0681, + "step": 274 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.1726, + "step": 276 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.425264622628326e-06, + "loss": 0.0297, + "step": 278 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.3649, + "step": 280 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.475043619098321e-06, + "loss": 0.1461, + "step": 282 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.1071, + "step": 284 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.524964012628644e-06, + "loss": 0.0333, + "step": 286 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.0659, + "step": 288 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.57502424329331e-06, + "loss": 0.1361, + "step": 290 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.6853, + "step": 292 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.62522274679673e-06, + "loss": 0.9823, + "step": 294 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.2493, + "step": 296 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.675557954522462e-06, + "loss": 0.016, + "step": 298 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.0424, + "step": 300 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.726028293582342e-06, + "loss": 0.0034, + "step": 302 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.0171, + "step": 304 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.776632186865589e-06, + "loss": 0.0232, + "step": 306 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.1032, + "step": 308 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.827368053088032e-06, + "loss": 0.0758, + "step": 310 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.0282, + "step": 312 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.878234306841637e-06, + "loss": 0.5472, + "step": 314 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.0415, + "step": 316 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.929229358643925e-06, + "loss": 0.2705, + "step": 318 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.11, + "step": 320 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9803516149877475e-06, + "loss": 0.2075, + "step": 322 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.0062, + "step": 324 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.03159947839103e-06, + "loss": 0.1936, + "step": 326 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.2279, + "step": 328 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.082971347446654e-06, + "loss": 0.565, + "step": 330 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.0678, + "step": 332 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.13446561687258e-06, + "loss": 0.0736, + "step": 334 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.0189, + "step": 336 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.186080677561974e-06, + "loss": 0.3389, + "step": 338 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.037, + "step": 340 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.237814916633431e-06, + "loss": 0.1, + "step": 342 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.0294, + "step": 344 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.289666717481496e-06, + "loss": 0.2595, + "step": 346 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.04, + "step": 348 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.341634459827044e-06, + "loss": 0.0864, + "step": 350 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.101, + "step": 352 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.393716519768032e-06, + "loss": 0.0951, + "step": 354 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.0108, + "step": 356 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.445911269830183e-06, + "loss": 0.1673, + "step": 358 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.1196, + "step": 360 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.498217079017806e-06, + "loss": 0.1911, + "step": 362 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.1379, + "step": 364 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.1161, + "step": 366 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.399, + "step": 368 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.603155333485934e-06, + "loss": 0.1525, + "step": 370 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.3517, + "step": 372 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.655784499627476e-06, + "loss": 0.3868, + "step": 374 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.0292, + "step": 376 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.7085181667191e-06, + "loss": 0.0172, + "step": 378 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.1275, + "step": 380 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.761354686924883e-06, + "loss": 0.1193, + "step": 382 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.2931, + "step": 384 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.8142924091949955e-06, + "loss": 0.6562, + "step": 386 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.13, + "step": 388 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.867329679317144e-06, + "loss": 0.6252, + "step": 390 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.2444, + "step": 392 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.920464839968391e-06, + "loss": 0.0536, + "step": 394 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.2237, + "step": 396 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.973696230766884e-06, + "loss": 0.397, + "step": 398 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.4228, + "step": 400 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.027022188323704e-06, + "loss": 0.2557, + "step": 402 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.0858, + "step": 404 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.080441046294945e-06, + "loss": 0.1755, + "step": 406 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.0102, + "step": 408 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.133951135433656e-06, + "loss": 0.06, + "step": 410 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.2886, + "step": 412 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.18755078364214e-06, + "loss": 0.3412, + "step": 414 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.2079, + "step": 416 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.241238316024064e-06, + "loss": 0.5073, + "step": 418 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.0016, + "step": 420 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.0156, + "step": 422 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.0325, + "step": 424 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.348870320044395e-06, + "loss": 0.1904, + "step": 426 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.1332, + "step": 428 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.402811428368824e-06, + "loss": 0.2039, + "step": 430 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.1391, + "step": 432 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.0247, + "step": 434 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.5593, + "step": 436 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.510935429867233e-06, + "loss": 0.5083, + "step": 438 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.0494, + "step": 440 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.5651149443531846e-06, + "loss": 0.0339, + "step": 442 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.103, + "step": 444 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.619370544785608e-06, + "loss": 0.0907, + "step": 446 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.1347, + "step": 448 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.67370053577085e-06, + "loss": 0.5048, + "step": 450 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.0254, + "step": 452 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.728103219590684e-06, + "loss": 0.2381, + "step": 454 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.1525, + "step": 456 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.7825768962553e-06, + "loss": 0.0364, + "step": 458 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.4224, + "step": 460 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.83711986355656e-06, + "loss": 0.1089, + "step": 462 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.4363, + "step": 464 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.891730417121043e-06, + "loss": 0.0321, + "step": 466 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.008, + "step": 468 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.946406850463435e-06, + "loss": 0.1374, + "step": 470 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.2525, + "step": 472 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 8.001147455039737e-06, + "loss": 0.2124, + "step": 474 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.0513, + "step": 476 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.055950520300756e-06, + "loss": 0.2265, + "step": 478 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.0198, + "step": 480 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.110814333745503e-06, + "loss": 0.5947, + "step": 482 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.1962, + "step": 484 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.165737180974676e-06, + "loss": 0.1806, + "step": 486 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.4964, + "step": 488 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.220717345744326e-06, + "loss": 0.1587, + "step": 490 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.1606, + "step": 492 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.275753110019367e-06, + "loss": 0.1103, + "step": 494 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.021, + "step": 496 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.330842754027378e-06, + "loss": 0.0049, + "step": 498 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.2578, + "step": 500 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.385984556312285e-06, + "loss": 0.1362, + "step": 502 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.4179, + "step": 504 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.441176793788106e-06, + "loss": 0.1503, + "step": 506 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.1755, + "step": 508 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.496417741792922e-06, + "loss": 0.1431, + "step": 510 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.011, + "step": 512 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.551705674142616e-06, + "loss": 0.3312, + "step": 514 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.019, + "step": 516 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.607038863184952e-06, + "loss": 0.0045, + "step": 518 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.1427, + "step": 520 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.662415579853495e-06, + "loss": 0.0707, + "step": 522 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.1926, + "step": 524 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.717834093721598e-06, + "loss": 0.1663, + "step": 526 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.1228, + "step": 528 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.773292673056572e-06, + "loss": 0.235, + "step": 530 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.0872, + "step": 532 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.828789584873757e-06, + "loss": 0.2412, + "step": 534 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.1103, + "step": 536 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.884323094990613e-06, + "loss": 0.0017, + "step": 538 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.0287, + "step": 540 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.939891468081036e-06, + "loss": 0.1293, + "step": 542 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.0015, + "step": 544 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.995492967729449e-06, + "loss": 0.2504, + "step": 546 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.1533, + "step": 548 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.051125856485175e-06, + "loss": 0.0213, + "step": 550 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.9907, + "step": 552 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.106788395916682e-06, + "loss": 1.0512, + "step": 554 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 0.0298, + "step": 556 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.162478846665854e-06, + "loss": 0.4309, + "step": 558 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.1522, + "step": 560 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.218195468502469e-06, + "loss": 0.0953, + "step": 562 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.5407, + "step": 564 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.273936520378426e-06, + "loss": 0.0858, + "step": 566 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.2854, + "step": 568 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.329700260482286e-06, + "loss": 1.9709, + "step": 570 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.0383, + "step": 572 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.38548494629364e-06, + "loss": 0.0332, + "step": 574 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.1591, + "step": 576 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.441288834637507e-06, + "loss": 0.4677, + "step": 578 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.108, + "step": 580 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.497110181738935e-06, + "loss": 0.254, + "step": 582 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.2554, + "step": 584 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.552947243277342e-06, + "loss": 0.1302, + "step": 586 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.2466, + "step": 588 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.608798274441153e-06, + "loss": 0.0709, + "step": 590 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.2866, + "step": 592 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.664661529982263e-06, + "loss": 0.1067, + "step": 594 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.2506, + "step": 596 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.720535264270526e-06, + "loss": 0.1105, + "step": 598 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.2434, + "step": 600 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.776417731348403e-06, + "loss": 0.1571, + "step": 602 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.5832, + "step": 604 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.832307184985473e-06, + "loss": 0.0777, + "step": 606 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.2253, + "step": 608 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.888201878732946e-06, + "loss": 0.3303, + "step": 610 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.233, + "step": 612 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.944100065978354e-06, + "loss": 0.1816, + "step": 614 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.0678, + "step": 616 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.999999999999996e-06, + "loss": 0.2618, + "step": 618 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.0673, + "step": 620 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.1078, + "step": 622 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.0413, + "step": 624 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0111798121267047e-05, + "loss": 0.0515, + "step": 626 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.2504, + "step": 628 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.016769281501452e-05, + "loss": 0.1164, + "step": 630 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.2182, + "step": 632 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.022358226865159e-05, + "loss": 0.1589, + "step": 634 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.1725, + "step": 636 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.0579, + "step": 638 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.0485, + "step": 640 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.033533847001773e-05, + "loss": 0.4735, + "step": 642 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.0575, + "step": 644 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.039120172555884e-05, + "loss": 0.0828, + "step": 646 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.0701, + "step": 648 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0447052756722651e-05, + "loss": 0.2886, + "step": 650 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.1189, + "step": 652 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.3615, + "step": 654 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.177, + "step": 656 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.0558711165362488e-05, + "loss": 0.0059, + "step": 658 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.394, + "step": 660 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.177, + "step": 662 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.6303, + "step": 664 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.0670299739517706e-05, + "loss": 0.3272, + "step": 666 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.2295, + "step": 668 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.1106, + "step": 670 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.0988, + "step": 672 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0781804531497525e-05, + "loss": 0.5348, + "step": 674 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.1596, + "step": 676 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.083752115333414e-05, + "loss": 0.0666, + "step": 678 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.3488, + "step": 680 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.0893211604083311e-05, + "loss": 0.2719, + "step": 682 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.1288, + "step": 684 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.427, + "step": 686 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.1688, + "step": 688 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.1004507032270544e-05, + "loss": 0.2892, + "step": 690 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.1668, + "step": 692 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.0432, + "step": 694 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.5289, + "step": 696 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.111567690500938e-05, + "loss": 0.1471, + "step": 698 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.0306, + "step": 700 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.2731, + "step": 702 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.5657, + "step": 704 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.122670732694342e-05, + "loss": 0.0508, + "step": 706 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.0586, + "step": 708 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.0144, + "step": 710 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.3224, + "step": 712 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1337584420146496e-05, + "loss": 0.278, + "step": 714 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.5086, + "step": 716 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.2386, + "step": 718 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.5037, + "step": 720 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1448294325857377e-05, + "loss": 0.1244, + "step": 722 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.1801, + "step": 724 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.150358225820707e-05, + "loss": 0.1845, + "step": 726 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.3542, + "step": 728 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1558823206211887e-05, + "loss": 0.0561, + "step": 730 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.3418, + "step": 732 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.2323, + "step": 734 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.152, + "step": 736 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1669157245972616e-05, + "loss": 0.1304, + "step": 738 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.4447, + "step": 740 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.0264, + "step": 742 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.0659, + "step": 744 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.1779282654255668e-05, + "loss": 0.1796, + "step": 746 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.1283, + "step": 748 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.0365, + "step": 750 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.2157, + "step": 752 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.188918566625449e-05, + "loss": 0.5453, + "step": 754 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.3992, + "step": 756 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.0596, + "step": 758 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.0521, + "step": 760 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1998852544960256e-05, + "loss": 0.2155, + "step": 762 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.1666, + "step": 764 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.0528, + "step": 766 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.2306, + "step": 768 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.210826958287895e-05, + "loss": 0.0265, + "step": 770 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.04, + "step": 772 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.2041, + "step": 774 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.3985, + "step": 776 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.2217423103744692e-05, + "loss": 0.4338, + "step": 778 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.0033, + "step": 780 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.1383, + "step": 782 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.1044, + "step": 784 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2326299464229143e-05, + "loss": 0.6256, + "step": 786 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.1302, + "step": 788 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.1811, + "step": 790 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.7897, + "step": 792 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2434885055646808e-05, + "loss": 0.0882, + "step": 794 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.2129, + "step": 796 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.2419, + "step": 798 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.0982, + "step": 800 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2543166305656089e-05, + "loss": 0.0043, + "step": 802 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.0432, + "step": 804 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.259718857163117e-05, + "loss": 0.3037, + "step": 806 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.1702, + "step": 808 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2651129679955598e-05, + "loss": 0.3856, + "step": 810 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.133, + "step": 812 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.2291, + "step": 814 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.6004, + "step": 816 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2758761683975929e-05, + "loss": 0.2873, + "step": 818 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.0601, + "step": 820 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.023, + "step": 822 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.3317, + "step": 824 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2866048864566336e-05, + "loss": 0.0976, + "step": 826 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.2702, + "step": 828 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.1872, + "step": 830 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.0484, + "step": 832 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2972977811676289e-05, + "loss": 0.2853, + "step": 834 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.0551, + "step": 836 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.3515, + "step": 838 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.3938, + "step": 840 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3079535160031601e-05, + "loss": 0.0773, + "step": 842 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.25, + "step": 844 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.313267032068285e-05, + "loss": 0.2299, + "step": 846 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.8993, + "step": 848 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3185707590804997e-05, + "loss": 0.0636, + "step": 850 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.2269, + "step": 852 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.3642, + "step": 854 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.5907, + "step": 856 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3291481833280894e-05, + "loss": 0.6678, + "step": 858 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.0843, + "step": 860 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.1857, + "step": 862 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.2602, + "step": 864 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3396844666514062e-05, + "loss": 0.2764, + "step": 866 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.2288, + "step": 868 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.344936768713513e-05, + "loss": 0.143, + "step": 870 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.2076, + "step": 872 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.3501782920982189e-05, + "loss": 0.0369, + "step": 874 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 0.0689, + "step": 876 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.0862, + "step": 878 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.1249, + "step": 880 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3606283480231962e-05, + "loss": 0.0659, + "step": 882 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.0604, + "step": 884 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.6569, + "step": 886 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.1914, + "step": 888 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3710333282518497e-05, + "loss": 0.1759, + "step": 890 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.0608, + "step": 892 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.4837, + "step": 894 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.0775, + "step": 896 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3813919322438018e-05, + "loss": 0.2043, + "step": 898 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.3837, + "step": 900 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.2239, + "step": 902 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.083, + "step": 904 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.391702865255334e-05, + "loss": 0.3167, + "step": 906 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.0993, + "step": 908 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.2865, + "step": 910 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.2562, + "step": 912 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4019648385012245e-05, + "loss": 0.1762, + "step": 914 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.1536, + "step": 916 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.4599, + "step": 918 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.2153, + "step": 920 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4121765693158355e-05, + "loss": 0.0877, + "step": 922 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.5322, + "step": 924 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.3526, + "step": 926 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.323, + "step": 928 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4223367813134406e-05, + "loss": 0.0472, + "step": 930 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.1587, + "step": 932 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.4038, + "step": 934 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.2765, + "step": 936 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.4324442045477534e-05, + "loss": 0.0974, + "step": 938 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.2454, + "step": 940 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.1364, + "step": 942 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.2448, + "step": 944 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4424975756706684e-05, + "loss": 0.2071, + "step": 946 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.1957, + "step": 948 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.2507, + "step": 950 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.8129, + "step": 952 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4524956380901674e-05, + "loss": 0.0641, + "step": 954 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.1356, + "step": 956 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.3025, + "step": 958 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.2261, + "step": 960 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4624371421273812e-05, + "loss": 0.0825, + "step": 962 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.1754, + "step": 964 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.0762, + "step": 966 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.2071, + "step": 968 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4723208451727977e-05, + "loss": 0.0323, + "step": 970 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.0896, + "step": 972 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.3451, + "step": 974 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.0705, + "step": 976 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4821455118415666e-05, + "loss": 0.2977, + "step": 978 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.0907, + "step": 980 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.0208, + "step": 982 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.0376, + "step": 984 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4919099141279205e-05, + "loss": 0.8712, + "step": 986 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.5081, + "step": 988 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.2639, + "step": 990 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.4575, + "step": 992 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5016128315586626e-05, + "loss": 0.0815, + "step": 994 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.015, + "step": 996 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.3493, + "step": 998 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.5574, + "step": 1000 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.5112530513457229e-05, + "loss": 0.0433, + "step": 1002 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.4451, + "step": 1004 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.1424, + "step": 1006 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.0179, + "step": 1008 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5208293685377354e-05, + "loss": 0.1899, + "step": 1010 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.1627, + "step": 1012 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.128, + "step": 1014 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.0551, + "step": 1016 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.5303405861706574e-05, + "loss": 0.3644, + "step": 1018 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.0188, + "step": 1020 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.0302, + "step": 1022 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.1886, + "step": 1024 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.539785515417376e-05, + "loss": 0.0798, + "step": 1026 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.8809, + "step": 1028 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.1639, + "step": 1030 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.0516, + "step": 1032 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5491629757363026e-05, + "loss": 0.2273, + "step": 1034 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.3641, + "step": 1036 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.0152, + "step": 1038 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.1172, + "step": 1040 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.5584717950189353e-05, + "loss": 0.0946, + "step": 1042 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.0921, + "step": 1044 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.563100100329731e-05, + "loss": 0.1605, + "step": 1046 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.0204, + "step": 1048 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.567710809736356e-05, + "loss": 0.0842, + "step": 1050 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.0796, + "step": 1052 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.572303779162118e-05, + "loss": 0.0992, + "step": 1054 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.0317, + "step": 1056 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5768788650846677e-05, + "loss": 0.0394, + "step": 1058 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.1129, + "step": 1060 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.581435924540481e-05, + "loss": 0.0158, + "step": 1062 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.0748, + "step": 1064 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5859748151293333e-05, + "loss": 0.4398, + "step": 1066 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.1629, + "step": 1068 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.4817, + "step": 1070 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.8606, + "step": 1072 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.594997522948412e-05, + "loss": 0.2745, + "step": 1074 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.1034, + "step": 1076 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.599481058234626e-05, + "loss": 0.6513, + "step": 1078 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.0517, + "step": 1080 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.6039458607746607e-05, + "loss": 0.011, + "step": 1082 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.0504, + "step": 1084 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.1601, + "step": 1086 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.0823, + "step": 1088 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.6128187101364982e-05, + "loss": 0.1996, + "step": 1090 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.0065, + "step": 1092 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.617226479697104e-05, + "loss": 0.0838, + "step": 1094 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.0665, + "step": 1096 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.621614961997806e-05, + "loss": 0.2692, + "step": 1098 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.1905, + "step": 1100 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.0288, + "step": 1102 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.0064, + "step": 1104 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6303335168965474e-05, + "loss": 0.2056, + "step": 1106 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.1396, + "step": 1108 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.0021, + "step": 1110 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.2049, + "step": 1112 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6389732850821957e-05, + "loss": 0.4087, + "step": 1114 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.0183, + "step": 1116 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.194, + "step": 1118 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.195, + "step": 1120 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.6475331866519377e-05, + "loss": 0.0973, + "step": 1122 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.2849, + "step": 1124 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.0539, + "step": 1126 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.0263, + "step": 1128 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6560121516856586e-05, + "loss": 0.0043, + "step": 1130 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.093, + "step": 1132 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.0711, + "step": 1134 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.2443, + "step": 1136 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6644091203796694e-05, + "loss": 0.4891, + "step": 1138 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.1636, + "step": 1140 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.2698, + "step": 1142 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.7748, + "step": 1144 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6727230431791806e-05, + "loss": 0.1163, + "step": 1146 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.0006, + "step": 1148 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.39, + "step": 1150 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.064, + "step": 1152 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6809528809094798e-05, + "loss": 0.7769, + "step": 1154 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.151, + "step": 1156 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.5947, + "step": 1158 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.0909, + "step": 1160 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.689097604905826e-05, + "loss": 0.1042, + "step": 1162 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.4073, + "step": 1164 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.0709, + "step": 1166 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.1424, + "step": 1168 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6971561971420222e-05, + "loss": 0.2204, + "step": 1170 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.5136, + "step": 1172 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.701152878657196e-05, + "loss": 0.1587, + "step": 1174 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.2293, + "step": 1176 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.705127650357662e-05, + "loss": 0.2638, + "step": 1178 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.4434, + "step": 1180 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.0871, + "step": 1182 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.2391, + "step": 1184 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.713010968184029e-05, + "loss": 0.7516, + "step": 1186 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.0321, + "step": 1188 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.716919267969883e-05, + "loss": 0.2413, + "step": 1190 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.3946, + "step": 1192 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7208051652686338e-05, + "loss": 0.2789, + "step": 1194 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.0135, + "step": 1196 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.1909, + "step": 1198 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.2864, + "step": 1200 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.7285092673983753e-05, + "loss": 0.3336, + "step": 1202 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.1287, + "step": 1204 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.3104, + "step": 1206 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.2498, + "step": 1208 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.736122311621314e-05, + "loss": 0.2466, + "step": 1210 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.7192, + "step": 1212 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.1451, + "step": 1214 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.5076, + "step": 1216 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.743643346367026e-05, + "loss": 0.1385, + "step": 1218 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.192, + "step": 1220 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.328, + "step": 1222 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.1104, + "step": 1224 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7510714315655474e-05, + "loss": 0.1754, + "step": 1226 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.2398, + "step": 1228 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.136, + "step": 1230 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.132, + "step": 1232 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.758405638764873e-05, + "loss": 0.2823, + "step": 1234 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.069, + "step": 1236 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.4614, + "step": 1238 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.2106, + "step": 1240 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7656450512470077e-05, + "loss": 0.0966, + "step": 1242 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.1319, + "step": 1244 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.7692289262315e-05, + "loss": 0.2828, + "step": 1246 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.5292, + "step": 1248 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.7727887641425448e-05, + "loss": 0.1641, + "step": 1250 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.2002, + "step": 1252 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.2159, + "step": 1254 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 0.2908, + "step": 1256 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7798358845437754e-05, + "loss": 0.1535, + "step": 1258 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.3213, + "step": 1260 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.0867, + "step": 1262 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.1262, + "step": 1264 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.786785531616285e-05, + "loss": 0.1487, + "step": 1266 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.2642, + "step": 1268 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.790223530721933e-05, + "loss": 0.0003, + "step": 1270 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.145, + "step": 1272 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7936368367090577e-05, + "loss": 0.174, + "step": 1274 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.0573, + "step": 1276 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.6106, + "step": 1278 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.1791, + "step": 1280 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.800388943463047e-05, + "loss": 0.2116, + "step": 1282 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.1299, + "step": 1284 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.803727533238257e-05, + "loss": 0.1081, + "step": 1286 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.3832, + "step": 1288 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8070410079182195e-05, + "loss": 0.1825, + "step": 1290 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.142, + "step": 1292 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.0229, + "step": 1294 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.1252, + "step": 1296 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.813592198619035e-05, + "loss": 0.0974, + "step": 1298 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.1453, + "step": 1300 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.816829709926509e-05, + "loss": 0.2697, + "step": 1302 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.1717, + "step": 1304 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.8200416967183785e-05, + "loss": 0.0476, + "step": 1306 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.0677, + "step": 1308 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.3593, + "step": 1310 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.0539, + "step": 1312 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.8263886960799055e-05, + "loss": 0.0399, + "step": 1314 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 1.1203, + "step": 1316 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.829523510316813e-05, + "loss": 0.018, + "step": 1318 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.4293, + "step": 1320 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.832632403378808e-05, + "loss": 0.2101, + "step": 1322 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.3111, + "step": 1324 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.1155, + "step": 1326 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.2028, + "step": 1328 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8387720382009665e-05, + "loss": 0.2867, + "step": 1330 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.2181, + "step": 1332 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.2585, + "step": 1334 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.662, + "step": 1336 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.844806833140501e-05, + "loss": 0.1319, + "step": 1338 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.0692, + "step": 1340 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.0794, + "step": 1342 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.0022, + "step": 1344 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.85073603389569e-05, + "loss": 0.0033, + "step": 1346 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.0789, + "step": 1348 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.0815, + "step": 1350 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.1315, + "step": 1352 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.856558899363248e-05, + "loss": 0.3248, + "step": 1354 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.0728, + "step": 1356 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.85943022840117e-05, + "loss": 1.013, + "step": 1358 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.4096, + "step": 1360 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.862274701730967e-05, + "loss": 0.2427, + "step": 1362 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.3155, + "step": 1364 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.865092230467769e-05, + "loss": 0.4296, + "step": 1366 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.2626, + "step": 1368 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.1588, + "step": 1370 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.1276, + "step": 1372 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.87064610283551e-05, + "loss": 0.102, + "step": 1374 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.1099, + "step": 1376 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.873382272917545e-05, + "loss": 0.3393, + "step": 1378 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.08, + "step": 1380 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.876091151314196e-05, + "loss": 0.2856, + "step": 1382 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.2085, + "step": 1384 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.8787726533776996e-05, + "loss": 0.4122, + "step": 1386 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.1118, + "step": 1388 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.0669, + "step": 1390 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.411, + "step": 1392 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.8840531941941415e-05, + "loss": 0.0661, + "step": 1394 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.4688, + "step": 1396 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.0438, + "step": 1398 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.0017, + "step": 1400 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.889223235340958e-05, + "loss": 0.0591, + "step": 1402 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.4956, + "step": 1404 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.029, + "step": 1406 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.1913, + "step": 1408 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.894282130603823e-05, + "loss": 0.2488, + "step": 1410 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.0893, + "step": 1412 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.896769700383315e-05, + "loss": 0.3816, + "step": 1414 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.3141, + "step": 1416 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.899229247660769e-05, + "loss": 0.0574, + "step": 1418 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.5658, + "step": 1420 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.901660695579585e-05, + "loss": 0.4091, + "step": 1422 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.2643, + "step": 1424 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9040639681612212e-05, + "loss": 0.0455, + "step": 1426 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.2142, + "step": 1428 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.2532, + "step": 1430 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 1.2189, + "step": 1432 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9087856878032886e-05, + "loss": 0.1078, + "step": 1434 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.3039, + "step": 1436 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.911103987318148e-05, + "loss": 0.2527, + "step": 1438 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.1365, + "step": 1440 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.913393816409294e-05, + "loss": 0.2057, + "step": 1442 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.5137, + "step": 1444 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.0381, + "step": 1446 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.3175, + "step": 1448 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9178877779995423e-05, + "loss": 0.3451, + "step": 1450 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.4345, + "step": 1452 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.2699, + "step": 1454 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.3656, + "step": 1456 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.9222670108643146e-05, + "loss": 0.0408, + "step": 1458 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.0889, + "step": 1460 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.924413432409622e-05, + "loss": 0.4903, + "step": 1462 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.1423, + "step": 1464 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.926530967634078e-05, + "loss": 0.4906, + "step": 1466 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.2471, + "step": 1468 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.3457, + "step": 1470 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.2819, + "step": 1472 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9306791153479004e-05, + "loss": 0.2074, + "step": 1474 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.1451, + "step": 1476 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.932709598214825e-05, + "loss": 0.2294, + "step": 1478 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.4399, + "step": 1480 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.9347109355200672e-05, + "loss": 0.2047, + "step": 1482 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.2363, + "step": 1484 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.1318, + "step": 1486 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.1546, + "step": 1488 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9386259242048883e-05, + "loss": 0.1202, + "step": 1490 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.0085, + "step": 1492 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.940539453247842e-05, + "loss": 0.0263, + "step": 1494 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.2422, + "step": 1496 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9424235920596863e-05, + "loss": 0.0332, + "step": 1498 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 1.3784, + "step": 1500 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.944278281764342e-05, + "loss": 0.0198, + "step": 1502 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.0887, + "step": 1504 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.3157, + "step": 1506 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.3134, + "step": 1508 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.1176, + "step": 1510 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.0093, + "step": 1512 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9496650812887286e-05, + "loss": 0.0659, + "step": 1514 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.1703, + "step": 1516 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.951401404235505e-05, + "loss": 0.0626, + "step": 1518 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.1296, + "step": 1520 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.9531079975339912e-05, + "loss": 0.1258, + "step": 1522 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.0125, + "step": 1524 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.555, + "step": 1526 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.0572, + "step": 1528 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.956431782804402e-05, + "loss": 0.2353, + "step": 1530 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.4456, + "step": 1532 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.2388, + "step": 1534 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.1931, + "step": 1536 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9596360216530436e-05, + "loss": 0.0611, + "step": 1538 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.067, + "step": 1540 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.961193185426459e-05, + "loss": 0.2984, + "step": 1542 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.0881, + "step": 1544 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.9627203135753573e-05, + "loss": 0.2874, + "step": 1546 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.4476, + "step": 1548 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.031, + "step": 1550 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.0747, + "step": 1552 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.4271, + "step": 1554 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.2952, + "step": 1556 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.967121011775546e-05, + "loss": 0.2226, + "step": 1558 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.3785, + "step": 1560 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9685275296330497e-05, + "loss": 0.0303, + "step": 1562 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.3485, + "step": 1564 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969903782680467e-05, + "loss": 0.2553, + "step": 1566 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.1104, + "step": 1568 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.0624, + "step": 1570 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.0821, + "step": 1572 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.1675, + "step": 1574 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.0596, + "step": 1576 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9738505276435692e-05, + "loss": 0.5358, + "step": 1578 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.0355, + "step": 1580 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.1109, + "step": 1582 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.0341, + "step": 1584 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9763296037475174e-05, + "loss": 0.025, + "step": 1586 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.0736, + "step": 1588 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.0179, + "step": 1590 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.0019, + "step": 1592 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9786866463591732e-05, + "loss": 0.1085, + "step": 1594 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.0477, + "step": 1596 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.0333, + "step": 1598 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.2606, + "step": 1600 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.9809213608668185e-05, + "loss": 0.0667, + "step": 1602 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.7038, + "step": 1604 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.0585, + "step": 1606 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.167, + "step": 1608 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.983033467948784e-05, + "loss": 0.0716, + "step": 1610 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.2851, + "step": 1612 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.0759, + "step": 1614 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.6611, + "step": 1616 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.2019, + "step": 1618 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.4445, + "step": 1620 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.985971166354357e-05, + "loss": 0.0054, + "step": 1622 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.4025, + "step": 1624 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986888819206792e-05, + "loss": 0.238, + "step": 1626 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.249, + "step": 1628 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.2378, + "step": 1630 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.265, + "step": 1632 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.988631581494365e-05, + "loss": 0.2361, + "step": 1634 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.0343, + "step": 1636 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.1859, + "step": 1638 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.1679, + "step": 1640 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.990250772639552e-05, + "loss": 0.1719, + "step": 1642 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.1317, + "step": 1644 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.99101396518405e-05, + "loss": 0.313, + "step": 1646 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.1333, + "step": 1648 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.0822, + "step": 1650 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.1722, + "step": 1652 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.1906, + "step": 1654 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.1694, + "step": 1656 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.0376, + "step": 1658 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.0567, + "step": 1660 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.2487, + "step": 1662 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.016, + "step": 1664 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9943649727366335e-05, + "loss": 0.9333, + "step": 1666 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.4249, + "step": 1668 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.994942036613787e-05, + "loss": 0.6759, + "step": 1670 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.0906, + "step": 1672 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.995488010273198e-05, + "loss": 0.0132, + "step": 1674 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.0983, + "step": 1676 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.4848, + "step": 1678 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.1295, + "step": 1680 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9964866196679105e-05, + "loss": 0.8247, + "step": 1682 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.3808, + "step": 1684 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.0001, + "step": 1686 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.4078, + "step": 1688 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.094, + "step": 1690 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.1079, + "step": 1692 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.057, + "step": 1694 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.1165, + "step": 1696 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.2249, + "step": 1698 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.3387, + "step": 1700 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.998437989229673e-05, + "loss": 0.0786, + "step": 1702 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.1919, + "step": 1704 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.998734708672375e-05, + "loss": 0.1849, + "step": 1706 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0975, + "step": 1708 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.1772, + "step": 1710 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.218, + "step": 1712 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.9992345130644747e-05, + "loss": 0.2987, + "step": 1714 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.067, + "step": 1716 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.1281, + "step": 1718 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.2421, + "step": 1720 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999609421031453e-05, + "loss": 0.4262, + "step": 1722 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.2677, + "step": 1724 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.0343, + "step": 1726 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.1022, + "step": 1728 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.1873, + "step": 1730 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.2784, + "step": 1732 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.0593, + "step": 1734 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.0838, + "step": 1736 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.0984, + "step": 1738 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.0006, + "step": 1740 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 2e-05, + "loss": 0.033, + "step": 1742 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.264, + "step": 1744 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.2612, + "step": 1746 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.1759, + "step": 1748 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.2072, + "step": 1750 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.4149, + "step": 1752 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.0984, + "step": 1754 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.9108, + "step": 1756 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.0378, + "step": 1758 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.3632, + "step": 1760 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.999609421031453e-05, + "loss": 0.014, + "step": 1762 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.3102, + "step": 1764 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.0602, + "step": 1766 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.256, + "step": 1768 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999234513064475e-05, + "loss": 0.2158, + "step": 1770 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.1384, + "step": 1772 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.1103, + "step": 1774 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.1634, + "step": 1776 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998734708672375e-05, + "loss": 0.0888, + "step": 1778 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.204, + "step": 1780 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.0226, + "step": 1782 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.3126, + "step": 1784 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.0467, + "step": 1786 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.0295, + "step": 1788 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.5553, + "step": 1790 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.6304, + "step": 1792 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.0665, + "step": 1794 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.0303, + "step": 1796 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.0254, + "step": 1798 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.3363, + "step": 1800 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.996486619667911e-05, + "loss": 0.3805, + "step": 1802 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.0144, + "step": 1804 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.2415, + "step": 1806 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.0346, + "step": 1808 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.995488010273198e-05, + "loss": 0.1156, + "step": 1810 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.2213, + "step": 1812 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.1772, + "step": 1814 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.0259, + "step": 1816 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.994364972736634e-05, + "loss": 0.1447, + "step": 1818 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.0001, + "step": 1820 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.993756836673986e-05, + "loss": 0.1412, + "step": 1822 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.3803, + "step": 1824 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.1956, + "step": 1826 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.1739, + "step": 1828 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.3067, + "step": 1830 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.1189, + "step": 1832 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.2977, + "step": 1834 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.1836, + "step": 1836 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.3066, + "step": 1838 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.2902, + "step": 1840 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9902507726395524e-05, + "loss": 0.1327, + "step": 1842 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.034, + "step": 1844 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.2908, + "step": 1846 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.3809, + "step": 1848 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.988631581494365e-05, + "loss": 0.0179, + "step": 1850 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.4259, + "step": 1852 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.987775633490599e-05, + "loss": 0.5049, + "step": 1854 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.0307, + "step": 1856 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.986888819206792e-05, + "loss": 0.1995, + "step": 1858 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.2614, + "step": 1860 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.3215, + "step": 1862 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.0051, + "step": 1864 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.174, + "step": 1866 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.2507, + "step": 1868 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.2157, + "step": 1870 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.2949, + "step": 1872 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.983033467948784e-05, + "loss": 0.2882, + "step": 1874 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.1412, + "step": 1876 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.2432, + "step": 1878 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.1141, + "step": 1880 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.980921360866819e-05, + "loss": 0.0728, + "step": 1882 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.0084, + "step": 1884 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.0052, + "step": 1886 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 1.0712, + "step": 1888 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.978686646359173e-05, + "loss": 0.0176, + "step": 1890 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.8224, + "step": 1892 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.0612, + "step": 1894 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 1.2486, + "step": 1896 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.9763296037475177e-05, + "loss": 0.5712, + "step": 1898 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.253, + "step": 1900 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.3648, + "step": 1902 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.3473, + "step": 1904 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9738505276435695e-05, + "loss": 0.2674, + "step": 1906 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.0053, + "step": 1908 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.0881, + "step": 1910 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.3642, + "step": 1912 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.041, + "step": 1914 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.3424, + "step": 1916 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.969903782680467e-05, + "loss": 0.2806, + "step": 1918 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.2913, + "step": 1920 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.96852752963305e-05, + "loss": 0.277, + "step": 1922 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.1203, + "step": 1924 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.03, + "step": 1926 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.1387, + "step": 1928 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.196, + "step": 1930 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.1097, + "step": 1932 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.0572, + "step": 1934 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.5186, + "step": 1936 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9627203135753576e-05, + "loss": 0.0547, + "step": 1938 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.4263, + "step": 1940 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.961193185426459e-05, + "loss": 0.0049, + "step": 1942 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.3619, + "step": 1944 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.959636021653044e-05, + "loss": 0.3762, + "step": 1946 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 1.2455, + "step": 1948 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958048870913786e-05, + "loss": 0.6391, + "step": 1950 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.301, + "step": 1952 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9564317828044022e-05, + "loss": 0.2228, + "step": 1954 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.047, + "step": 1956 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.0756, + "step": 1958 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.1306, + "step": 1960 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9531079975339915e-05, + "loss": 0.1555, + "step": 1962 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.4522, + "step": 1964 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.4533, + "step": 1966 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.3241, + "step": 1968 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.0001, + "step": 1970 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.1759, + "step": 1972 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.947899082950751e-05, + "loss": 0.1912, + "step": 1974 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.0505, + "step": 1976 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.0515, + "step": 1978 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.3746, + "step": 1980 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.0978, + "step": 1982 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.174, + "step": 1984 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.942423592059687e-05, + "loss": 0.115, + "step": 1986 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.0653, + "step": 1988 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.1865, + "step": 1990 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.1241, + "step": 1992 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.938625924204888e-05, + "loss": 0.405, + "step": 1994 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.2969, + "step": 1996 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.3522, + "step": 1998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.1591, + "step": 2000 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9347109355200676e-05, + "loss": 0.1505, + "step": 2002 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.0585, + "step": 2004 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.1003, + "step": 2006 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.4811, + "step": 2008 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9306791153479017e-05, + "loss": 0.5298, + "step": 2010 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.0979, + "step": 2012 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.928619550368371e-05, + "loss": 0.0981, + "step": 2014 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.2901, + "step": 2016 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9265309676340783e-05, + "loss": 0.0537, + "step": 2018 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.8714, + "step": 2020 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.821, + "step": 2022 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.0681, + "step": 2024 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9222670108643156e-05, + "loss": 0.2082, + "step": 2026 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.5836, + "step": 2028 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.1669, + "step": 2030 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.1299, + "step": 2032 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9178877779995416e-05, + "loss": 0.0688, + "step": 2034 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.4094, + "step": 2036 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.0134, + "step": 2038 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.2536, + "step": 2040 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9133938164092942e-05, + "loss": 0.2122, + "step": 2042 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.0984, + "step": 2044 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.2875, + "step": 2046 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.5369, + "step": 2048 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.90878568780329e-05, + "loss": 0.3338, + "step": 2050 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.2758, + "step": 2052 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.2697, + "step": 2054 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.1285, + "step": 2056 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9040639681612216e-05, + "loss": 0.2542, + "step": 2058 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.1639, + "step": 2060 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.0512, + "step": 2062 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.0898, + "step": 2064 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8992292476607695e-05, + "loss": 0.3781, + "step": 2066 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.2964, + "step": 2068 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.1906, + "step": 2070 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.2854, + "step": 2072 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8942821306038227e-05, + "loss": 0.1701, + "step": 2074 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.236, + "step": 2076 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.891766616054545e-05, + "loss": 0.2393, + "step": 2078 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.0004, + "step": 2080 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8892232353409582e-05, + "loss": 0.0981, + "step": 2082 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.1249, + "step": 2084 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.6806, + "step": 2086 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.0441, + "step": 2088 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.884053194194143e-05, + "loss": 0.3056, + "step": 2090 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.1716, + "step": 2092 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.881426695315756e-05, + "loss": 0.3304, + "step": 2094 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.1705, + "step": 2096 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8787726533777003e-05, + "loss": 0.5106, + "step": 2098 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.3935, + "step": 2100 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.0292, + "step": 2102 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.0359, + "step": 2104 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8733822729175455e-05, + "loss": 0.0288, + "step": 2106 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.0732, + "step": 2108 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.1904, + "step": 2110 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.1565, + "step": 2112 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.576, + "step": 2114 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.6103, + "step": 2116 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.86509223046777e-05, + "loss": 0.0349, + "step": 2118 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.2403, + "step": 2120 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8622747017309676e-05, + "loss": 0.0048, + "step": 2122 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.0155, + "step": 2124 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.1246, + "step": 2126 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.1598, + "step": 2128 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8565588993632498e-05, + "loss": 0.0322, + "step": 2130 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.3569, + "step": 2132 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.1195, + "step": 2134 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.0451, + "step": 2136 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8507360338956896e-05, + "loss": 0.0938, + "step": 2138 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.3296, + "step": 2140 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.847784679420527e-05, + "loss": 0.0642, + "step": 2142 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.1166, + "step": 2144 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.8448068331405018e-05, + "loss": 0.0455, + "step": 2146 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.0201, + "step": 2148 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.841802588108161e-05, + "loss": 0.0726, + "step": 2150 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.0901, + "step": 2152 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.838772038200968e-05, + "loss": 0.0186, + "step": 2154 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.2786, + "step": 2156 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.0711, + "step": 2158 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.021, + "step": 2160 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8326324033788087e-05, + "loss": 0.5279, + "step": 2162 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.1609, + "step": 2164 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.1316, + "step": 2166 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.2905, + "step": 2168 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.8263886960799072e-05, + "loss": 0.0491, + "step": 2170 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.3533, + "step": 2172 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.1822, + "step": 2174 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.079, + "step": 2176 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820041696718378e-05, + "loss": 0.1392, + "step": 2178 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.2052, + "step": 2180 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.3134, + "step": 2182 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.0791, + "step": 2184 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8135921986190358e-05, + "loss": 0.1473, + "step": 2186 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.0087, + "step": 2188 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.1448, + "step": 2190 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.0578, + "step": 2192 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807041007918221e-05, + "loss": 0.0475, + "step": 2194 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.5049, + "step": 2196 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.4241, + "step": 2198 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.0, + "step": 2200 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.8003889434630476e-05, + "loss": 0.427, + "step": 2202 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.0413, + "step": 2204 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.3957, + "step": 2206 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.0366, + "step": 2208 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7936368367090583e-05, + "loss": 0.0093, + "step": 2210 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.0087, + "step": 2212 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.1281, + "step": 2214 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.0003, + "step": 2216 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7867855316162846e-05, + "loss": 0.0592, + "step": 2218 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.0222, + "step": 2220 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.783322946823638e-05, + "loss": 0.1386, + "step": 2222 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.2224, + "step": 2224 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.779835884543776e-05, + "loss": 0.0014, + "step": 2226 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.0365, + "step": 2228 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.776324453741365e-05, + "loss": 0.315, + "step": 2230 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.6204, + "step": 2232 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7727887641425465e-05, + "loss": 0.0644, + "step": 2234 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.4255, + "step": 2236 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.1004, + "step": 2238 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.1559, + "step": 2240 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.765645051247007e-05, + "loss": 0.0007, + "step": 2242 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.0329, + "step": 2244 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.762037251178961e-05, + "loss": 0.0871, + "step": 2246 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.0455, + "step": 2248 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7584056387648738e-05, + "loss": 0.5892, + "step": 2250 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.079, + "step": 2252 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.1274, + "step": 2254 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.4296, + "step": 2256 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7510714315655467e-05, + "loss": 0.7326, + "step": 2258 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.2269, + "step": 2260 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7473690659616e-05, + "loss": 0.5769, + "step": 2262 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.9878, + "step": 2264 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.743643346367027e-05, + "loss": 0.0126, + "step": 2266 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.0573, + "step": 2268 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.739894389204122e-05, + "loss": 0.0123, + "step": 2270 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.2645, + "step": 2272 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7361223116213146e-05, + "loss": 0.1383, + "step": 2274 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.1959, + "step": 2276 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.732327231489503e-05, + "loss": 0.2246, + "step": 2278 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.1226, + "step": 2280 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.728509267398376e-05, + "loss": 0.117, + "step": 2282 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.1093, + "step": 2284 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.1817, + "step": 2286 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.2221, + "step": 2288 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7208051652686348e-05, + "loss": 0.0549, + "step": 2290 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.2853, + "step": 2292 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.716919267969884e-05, + "loss": 0.308, + "step": 2294 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.1333, + "step": 2296 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.7130109681840298e-05, + "loss": 0.2974, + "step": 2298 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.022, + "step": 2300 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.0724, + "step": 2302 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.073, + "step": 2304 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.705127650357663e-05, + "loss": 0.1961, + "step": 2306 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.9507, + "step": 2308 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.701152878657197e-05, + "loss": 0.2302, + "step": 2310 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.353, + "step": 2312 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.697156197142023e-05, + "loss": 0.0348, + "step": 2314 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.3032, + "step": 2316 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.1601, + "step": 2318 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.2476, + "step": 2320 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.6890976049058267e-05, + "loss": 0.288, + "step": 2322 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.1142, + "step": 2324 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.0748, + "step": 2326 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.1398, + "step": 2328 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.6809528809094805e-05, + "loss": 0.075, + "step": 2330 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.3647, + "step": 2332 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.67684853721737e-05, + "loss": 0.0252, + "step": 2334 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.4783, + "step": 2336 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.6727230431791826e-05, + "loss": 0.2695, + "step": 2338 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.2075, + "step": 2340 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.0415, + "step": 2342 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.0542, + "step": 2344 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.66440912037967e-05, + "loss": 0.0237, + "step": 2346 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.3304, + "step": 2348 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.5609, + "step": 2350 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.6125, + "step": 2352 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6560121516856592e-05, + "loss": 0.1366, + "step": 2354 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.003, + "step": 2356 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.651782852712194e-05, + "loss": 0.0442, + "step": 2358 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.052, + "step": 2360 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.6475331866519387e-05, + "loss": 0.0048, + "step": 2362 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.1963, + "step": 2364 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.2465, + "step": 2366 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.0489, + "step": 2368 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6389732850821964e-05, + "loss": 0.0135, + "step": 2370 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.1357, + "step": 2372 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.2982, + "step": 2374 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.0393, + "step": 2376 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6303335168965495e-05, + "loss": 0.2672, + "step": 2378 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.0149, + "step": 2380 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.625984019906122e-05, + "loss": 0.2072, + "step": 2382 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.3846, + "step": 2384 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6216149619978057e-05, + "loss": 0.0548, + "step": 2386 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.0021, + "step": 2388 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.5501, + "step": 2390 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.381, + "step": 2392 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.612818710136499e-05, + "loss": 0.2256, + "step": 2394 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.1745, + "step": 2396 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.4644, + "step": 2398 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.001, + "step": 2400 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.60394586077466e-05, + "loss": 0.2483, + "step": 2402 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.0512, + "step": 2404 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.0671, + "step": 2406 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 0.1952, + "step": 2408 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.594997522948413e-05, + "loss": 0.2874, + "step": 2410 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.102, + "step": 2412 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.1618, + "step": 2414 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.3621, + "step": 2416 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5859748151293354e-05, + "loss": 0.1427, + "step": 2418 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.2411, + "step": 2420 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.581435924540482e-05, + "loss": 0.094, + "step": 2422 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.0049, + "step": 2424 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5768788650846674e-05, + "loss": 0.1452, + "step": 2426 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.2861, + "step": 2428 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.0181, + "step": 2430 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.0083, + "step": 2432 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.5677108097363565e-05, + "loss": 0.4448, + "step": 2434 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.1516, + "step": 2436 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.1595, + "step": 2438 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.1581, + "step": 2440 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5584717950189373e-05, + "loss": 0.0335, + "step": 2442 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.1403, + "step": 2444 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.2236, + "step": 2446 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.1106, + "step": 2448 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.2088, + "step": 2450 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.608, + "step": 2452 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.544482752648966e-05, + "loss": 0.06, + "step": 2454 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.5576, + "step": 2456 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.539785515417377e-05, + "loss": 0.1275, + "step": 2458 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 1.3793, + "step": 2460 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.0663, + "step": 2462 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.1428, + "step": 2464 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.5303405861706567e-05, + "loss": 0.9145, + "step": 2466 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.0227, + "step": 2468 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.0544, + "step": 2470 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.4662, + "step": 2472 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.5208293685377362e-05, + "loss": 0.064, + "step": 2474 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.0645, + "step": 2476 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.3333, + "step": 2478 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.3651, + "step": 2480 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.5112530513457251e-05, + "loss": 0.1784, + "step": 2482 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.0366, + "step": 2484 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.0181, + "step": 2486 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.1543, + "step": 2488 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5016128315586636e-05, + "loss": 0.2302, + "step": 2490 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.1723, + "step": 2492 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.1521, + "step": 2494 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.4803, + "step": 2496 + }, + { + "epoch": 1.0, + "learning_rate": 1.4919099141279214e-05, + "loss": 0.0601, + "step": 2498 + }, + { + "epoch": 1.0, + "step": 2498, + "total_flos": 1.5606287483011072e+16, + "train_loss": 0.21205857841617706, + "train_runtime": 3220.1664, + "train_samples_per_second": 6.206, + "train_steps_per_second": 0.776 + } + ], + "logging_steps": 2, + "max_steps": 2498, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1.5606287483011072e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b04a2edf8c97b507d03b9cf03ef8a34da46e1a8 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2d947126c628438ba9d02f915ddee955ca5a48797cf023adac0b3268e98b96e +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e0c83c7f314ea55c2475252648c63d8b67653d54 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ffa64505aa4b1a82142e7c39ca1971c1f08f5d4b4e846e61cb8b56823401e22 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0113a1d9af7522a739aa74aacd456565774ba0f7 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:753ef7369cb87ba16731ae8518682a71ee44ae2fbc264b3d02ed42cc158fb671 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8830b839aacf624ea057f0038a5e76eea13dccbb --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_125_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:804e679959facc911c5746a9efdbcd738e2d0d047f73f20205748f9664d9ac65 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3896f3b748e356a8bcefe3cad98d20be79c1d601 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/0_trainer_state.json @@ -0,0 +1,15020 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4996, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.406842319175051e-06, + "loss": 0.2245, + "step": 2 + }, + { + "epoch": 0.0008006405124099279, + "learning_rate": 2.415943612351265e-06, + "loss": 0.0118, + "step": 4 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4250597173539104e-06, + "loss": 0.0292, + "step": 6 + }, + { + "epoch": 0.0016012810248198558, + "learning_rate": 2.4341906163790364e-06, + "loss": 0.2735, + "step": 8 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.443336291593801e-06, + "loss": 0.2063, + "step": 10 + }, + { + "epoch": 0.0024019215372297837, + "learning_rate": 2.4524967251364995e-06, + "loss": 0.0706, + "step": 12 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.461671899116598e-06, + "loss": 0.0441, + "step": 14 + }, + { + "epoch": 0.0032025620496397116, + "learning_rate": 2.4708617956148052e-06, + "loss": 0.0202, + "step": 16 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4800663966830417e-06, + "loss": 0.0157, + "step": 18 + }, + { + "epoch": 0.0040032025620496394, + "learning_rate": 2.4892856843445236e-06, + "loss": 0.0341, + "step": 20 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.4985196405937807e-06, + "loss": 0.3035, + "step": 22 + }, + { + "epoch": 0.004803843074459567, + "learning_rate": 2.507768247396697e-06, + "loss": 0.0827, + "step": 24 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5170314866905443e-06, + "loss": 0.2022, + "step": 26 + }, + { + "epoch": 0.005604483586869495, + "learning_rate": 2.5263093403840022e-06, + "loss": 0.1232, + "step": 28 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.535601790357246e-06, + "loss": 0.0604, + "step": 30 + }, + { + "epoch": 0.006405124099279423, + "learning_rate": 2.5449088184619065e-06, + "loss": 0.0003, + "step": 32 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5542304065211578e-06, + "loss": 0.0145, + "step": 34 + }, + { + "epoch": 0.007205764611689352, + "learning_rate": 2.5635665363297356e-06, + "loss": 0.4092, + "step": 36 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5729171896539763e-06, + "loss": 0.0639, + "step": 38 + }, + { + "epoch": 0.008006405124099279, + "learning_rate": 2.5822823482318517e-06, + "loss": 0.1412, + "step": 40 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.5916619937729915e-06, + "loss": 0.0089, + "step": 42 + }, + { + "epoch": 0.008807045636509208, + "learning_rate": 2.6010561079587694e-06, + "loss": 0.0002, + "step": 44 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6104646724422643e-06, + "loss": 0.0789, + "step": 46 + }, + { + "epoch": 0.009607686148919135, + "learning_rate": 2.6198876688483453e-06, + "loss": 0.0153, + "step": 48 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.629325078773699e-06, + "loss": 0.0316, + "step": 50 + }, + { + "epoch": 0.010408326661329063, + "learning_rate": 2.6387768837868565e-06, + "loss": 0.1, + "step": 52 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.648243065428239e-06, + "loss": 0.905, + "step": 54 + }, + { + "epoch": 0.01120896717373899, + "learning_rate": 2.6577236052101764e-06, + "loss": 0.8829, + "step": 56 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6672184846169934e-06, + "loss": 0.0738, + "step": 58 + }, + { + "epoch": 0.01200960768614892, + "learning_rate": 2.6767276851049716e-06, + "loss": 0.0161, + "step": 60 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.686251188102439e-06, + "loss": 0.0466, + "step": 62 + }, + { + "epoch": 0.012810248198558846, + "learning_rate": 2.6957889750097866e-06, + "loss": 0.238, + "step": 64 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7053410271995085e-06, + "loss": 0.0087, + "step": 66 + }, + { + "epoch": 0.013610888710968775, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.0255, + "step": 68 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.724487852776785e-06, + "loss": 0.0097, + "step": 70 + }, + { + "epoch": 0.014411529223378704, + "learning_rate": 2.7340825887701848e-06, + "loss": 0.0003, + "step": 72 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7436915152577038e-06, + "loss": 0.1093, + "step": 74 + }, + { + "epoch": 0.01521216973578863, + "learning_rate": 2.7533146134728993e-06, + "loss": 0.0242, + "step": 76 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.7629518646216522e-06, + "loss": 0.0683, + "step": 78 + }, + { + "epoch": 0.016012810248198558, + "learning_rate": 2.772603249882202e-06, + "loss": 0.0011, + "step": 80 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.782268750405185e-06, + "loss": 0.0392, + "step": 82 + }, + { + "epoch": 0.016813450760608487, + "learning_rate": 2.7919483473136555e-06, + "loss": 0.0253, + "step": 84 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.801642021703177e-06, + "loss": 0.0457, + "step": 86 + }, + { + "epoch": 0.017614091273018415, + "learning_rate": 2.81134975464178e-06, + "loss": 0.0589, + "step": 88 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.821071527170053e-06, + "loss": 0.0115, + "step": 90 + }, + { + "epoch": 0.018414731785428344, + "learning_rate": 2.8308073203011634e-06, + "loss": 0.014, + "step": 92 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.8405571150208945e-06, + "loss": 0.5004, + "step": 94 + }, + { + "epoch": 0.01921537229783827, + "learning_rate": 2.850320892287688e-06, + "loss": 0.0032, + "step": 96 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.860098633032663e-06, + "loss": 0.0276, + "step": 98 + }, + { + "epoch": 0.020016012810248198, + "learning_rate": 2.8698903181597026e-06, + "loss": 0.3032, + "step": 100 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.879695928545424e-06, + "loss": 0.0603, + "step": 102 + }, + { + "epoch": 0.020816653322658127, + "learning_rate": 2.889515445039256e-06, + "loss": 0.0499, + "step": 104 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.899348848463471e-06, + "loss": 0.0774, + "step": 106 + }, + { + "epoch": 0.021617293835068056, + "learning_rate": 2.909196119613218e-06, + "loss": 0.1493, + "step": 108 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.9190572392565643e-06, + "loss": 0.0502, + "step": 110 + }, + { + "epoch": 0.02241793434747798, + "learning_rate": 2.928932188134529e-06, + "loss": 0.235, + "step": 112 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9388209469611093e-06, + "loss": 0.1898, + "step": 114 + }, + { + "epoch": 0.02321857485988791, + "learning_rate": 2.9487234964233724e-06, + "loss": 0.054, + "step": 116 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9586398171814114e-06, + "loss": 0.0687, + "step": 118 + }, + { + "epoch": 0.02401921537229784, + "learning_rate": 2.9685698898684355e-06, + "loss": 0.0171, + "step": 120 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.9785136950907987e-06, + "loss": 0.096, + "step": 122 + }, + { + "epoch": 0.024819855884707767, + "learning_rate": 2.988471213428035e-06, + "loss": 0.0267, + "step": 124 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 2.9984424254328936e-06, + "loss": 0.0394, + "step": 126 + }, + { + "epoch": 0.025620496397117692, + "learning_rate": 3.00842731163137e-06, + "loss": 0.0225, + "step": 128 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0184258525227895e-06, + "loss": 0.1671, + "step": 130 + }, + { + "epoch": 0.02642113690952762, + "learning_rate": 3.0284380285797733e-06, + "loss": 0.0021, + "step": 132 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.038463820248324e-06, + "loss": 0.2073, + "step": 134 + }, + { + "epoch": 0.02722177742193755, + "learning_rate": 3.048503207947854e-06, + "loss": 0.4269, + "step": 136 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.0585561720712207e-06, + "loss": 0.2502, + "step": 138 + }, + { + "epoch": 0.02802241793434748, + "learning_rate": 3.068622692984767e-06, + "loss": 0.0066, + "step": 140 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0787027510283495e-06, + "loss": 0.3818, + "step": 142 + }, + { + "epoch": 0.028823058446757407, + "learning_rate": 3.0887963265154187e-06, + "loss": 1.3114, + "step": 144 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.098903399732992e-06, + "loss": 0.095, + "step": 146 + }, + { + "epoch": 0.029623698959167333, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.0453, + "step": 148 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.1191579603759946e-06, + "loss": 0.0139, + "step": 150 + }, + { + "epoch": 0.03042433947157726, + "learning_rate": 3.129305408243829e-06, + "loss": 0.0274, + "step": 152 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.139466274727052e-06, + "loss": 0.0971, + "step": 154 + }, + { + "epoch": 0.03122497998398719, + "learning_rate": 3.1496405399812602e-06, + "loss": 0.113, + "step": 156 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.159828184135917e-06, + "loss": 0.0473, + "step": 158 + }, + { + "epoch": 0.032025620496397116, + "learning_rate": 3.17002918729432e-06, + "loss": 0.3605, + "step": 160 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1802435295336908e-06, + "loss": 0.2868, + "step": 162 + }, + { + "epoch": 0.03282626100880705, + "learning_rate": 3.1904711909051967e-06, + "loss": 0.0193, + "step": 164 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2007121514339924e-06, + "loss": 0.1995, + "step": 166 + }, + { + "epoch": 0.03362690152121697, + "learning_rate": 3.2109663911192622e-06, + "loss": 0.1387, + "step": 168 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.221233889934239e-06, + "loss": 0.03, + "step": 170 + }, + { + "epoch": 0.0344275420336269, + "learning_rate": 3.231514627826302e-06, + "loss": 0.0652, + "step": 172 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2418085847169344e-06, + "loss": 0.3763, + "step": 174 + }, + { + "epoch": 0.03522818254603683, + "learning_rate": 3.2521157405018146e-06, + "loss": 0.032, + "step": 176 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2624360750508457e-06, + "loss": 0.0932, + "step": 178 + }, + { + "epoch": 0.036028823058446756, + "learning_rate": 3.2727695682081897e-06, + "loss": 0.0047, + "step": 180 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.28311619979231e-06, + "loss": 0.0176, + "step": 182 + }, + { + "epoch": 0.03682946357085669, + "learning_rate": 3.293475949595998e-06, + "loss": 0.1452, + "step": 184 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.303848797386465e-06, + "loss": 0.0661, + "step": 186 + }, + { + "epoch": 0.03763010408326661, + "learning_rate": 3.314234722905302e-06, + "loss": 0.1349, + "step": 188 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.3246337058685697e-06, + "loss": 0.7079, + "step": 190 + }, + { + "epoch": 0.03843074459567654, + "learning_rate": 3.335045725966829e-06, + "loss": 0.0953, + "step": 192 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.3454707628651806e-06, + "loss": 0.2677, + "step": 194 + }, + { + "epoch": 0.03923138510808647, + "learning_rate": 3.355908796203301e-06, + "loss": 0.0954, + "step": 196 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3663598055954716e-06, + "loss": 0.012, + "step": 198 + }, + { + "epoch": 0.040032025620496396, + "learning_rate": 3.3768237706306716e-06, + "loss": 0.209, + "step": 200 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3873006708725365e-06, + "loss": 0.3563, + "step": 202 + }, + { + "epoch": 0.04083266613290633, + "learning_rate": 3.3977904858594534e-06, + "loss": 0.0811, + "step": 204 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.408293195104586e-06, + "loss": 0.3081, + "step": 206 + }, + { + "epoch": 0.041633306645316254, + "learning_rate": 3.418808778095917e-06, + "loss": 0.0914, + "step": 208 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4293372142962845e-06, + "loss": 0.0096, + "step": 210 + }, + { + "epoch": 0.04243394715772618, + "learning_rate": 3.4398784831434097e-06, + "loss": 0.0516, + "step": 212 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.4504325640499936e-06, + "loss": 0.0363, + "step": 214 + }, + { + "epoch": 0.04323458767013611, + "learning_rate": 3.460999436403676e-06, + "loss": 0.0339, + "step": 216 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4715790795671232e-06, + "loss": 0.6857, + "step": 218 + }, + { + "epoch": 0.044035228182546036, + "learning_rate": 3.4821714728780654e-06, + "loss": 0.021, + "step": 220 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.4927765956493276e-06, + "loss": 0.0023, + "step": 222 + }, + { + "epoch": 0.04483586869495596, + "learning_rate": 3.5033944271688624e-06, + "loss": 0.1252, + "step": 224 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.514024946699842e-06, + "loss": 0.0489, + "step": 226 + }, + { + "epoch": 0.045636509207365894, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.0338, + "step": 228 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.535323966724814e-06, + "loss": 0.0024, + "step": 230 + }, + { + "epoch": 0.04643714971977582, + "learning_rate": 3.5459924256213596e-06, + "loss": 0.3643, + "step": 232 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.556673489334522e-06, + "loss": 0.0939, + "step": 234 + }, + { + "epoch": 0.04723779023218575, + "learning_rate": 3.567367137003953e-06, + "loss": 0.2711, + "step": 236 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.5780733477447127e-06, + "loss": 0.0104, + "step": 238 + }, + { + "epoch": 0.04803843074459568, + "learning_rate": 3.588792100647368e-06, + "loss": 0.1105, + "step": 240 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.5995233747779467e-06, + "loss": 0.4746, + "step": 242 + }, + { + "epoch": 0.0488390712570056, + "learning_rate": 3.6102671491780393e-06, + "loss": 0.2292, + "step": 244 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6210234028648216e-06, + "loss": 0.0113, + "step": 246 + }, + { + "epoch": 0.049639711769415534, + "learning_rate": 3.6317921148310965e-06, + "loss": 0.008, + "step": 248 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.6425732640453235e-06, + "loss": 0.005, + "step": 250 + }, + { + "epoch": 0.05044035228182546, + "learning_rate": 3.653366829451711e-06, + "loss": 0.1048, + "step": 252 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.6641727899701795e-06, + "loss": 0.0156, + "step": 254 + }, + { + "epoch": 0.051240992794235385, + "learning_rate": 3.674991124496452e-06, + "loss": 0.2425, + "step": 256 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.6858218119020884e-06, + "loss": 0.3036, + "step": 258 + }, + { + "epoch": 0.05204163330664532, + "learning_rate": 3.696664831034521e-06, + "loss": 0.0145, + "step": 260 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7075201607170997e-06, + "loss": 0.03, + "step": 262 + }, + { + "epoch": 0.05284227381905524, + "learning_rate": 3.7183877797491143e-06, + "loss": 0.0707, + "step": 264 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.729267666905899e-06, + "loss": 0.0401, + "step": 266 + }, + { + "epoch": 0.053642914331465175, + "learning_rate": 3.740159800938784e-06, + "loss": 0.8348, + "step": 268 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.751064160575195e-06, + "loss": 0.0358, + "step": 270 + }, + { + "epoch": 0.0544435548438751, + "learning_rate": 3.7619807245186824e-06, + "loss": 0.029, + "step": 272 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.772909471448959e-06, + "loss": 0.0364, + "step": 274 + }, + { + "epoch": 0.055244195356285025, + "learning_rate": 3.783850380021933e-06, + "loss": 0.1504, + "step": 276 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.794803428869799e-06, + "loss": 0.0876, + "step": 278 + }, + { + "epoch": 0.05604483586869496, + "learning_rate": 3.8057685966010025e-06, + "loss": 0.3091, + "step": 280 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.816745861800334e-06, + "loss": 0.0216, + "step": 282 + }, + { + "epoch": 0.05684547638110488, + "learning_rate": 3.827735203028956e-06, + "loss": 0.0179, + "step": 284 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.838736598824446e-06, + "loss": 0.3936, + "step": 286 + }, + { + "epoch": 0.057646116893514815, + "learning_rate": 3.849750027700842e-06, + "loss": 1.0686, + "step": 288 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.860775468148662e-06, + "loss": 0.0364, + "step": 290 + }, + { + "epoch": 0.05844675740592474, + "learning_rate": 3.871812898635011e-06, + "loss": 0.0011, + "step": 292 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.882862297603536e-06, + "loss": 0.3827, + "step": 294 + }, + { + "epoch": 0.059247397918334666, + "learning_rate": 3.8939236434745184e-06, + "loss": 0.0509, + "step": 296 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.904996914644913e-06, + "loss": 0.0551, + "step": 298 + }, + { + "epoch": 0.0600480384307446, + "learning_rate": 3.916082089488379e-06, + "loss": 0.1809, + "step": 300 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.927179146355317e-06, + "loss": 0.1952, + "step": 302 + }, + { + "epoch": 0.06084867894315452, + "learning_rate": 3.938288063572962e-06, + "loss": 0.0243, + "step": 304 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.949408819445345e-06, + "loss": 0.0479, + "step": 306 + }, + { + "epoch": 0.06164931945556445, + "learning_rate": 3.960541392253387e-06, + "loss": 0.002, + "step": 308 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.971685760254933e-06, + "loss": 0.0113, + "step": 310 + }, + { + "epoch": 0.06244995996797438, + "learning_rate": 3.982841901684792e-06, + "loss": 0.1619, + "step": 312 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 3.994009794754777e-06, + "loss": 0.1256, + "step": 314 + }, + { + "epoch": 0.0632506004803843, + "learning_rate": 4.005189417653737e-06, + "loss": 0.0047, + "step": 316 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.016380748547654e-06, + "loss": 0.0296, + "step": 318 + }, + { + "epoch": 0.06405124099279423, + "learning_rate": 4.027583765579601e-06, + "loss": 0.1846, + "step": 320 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.038798446869847e-06, + "loss": 1.0708, + "step": 322 + }, + { + "epoch": 0.06485188150520416, + "learning_rate": 4.050024770515873e-06, + "loss": 0.7748, + "step": 324 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.061262714592426e-06, + "loss": 0.0098, + "step": 326 + }, + { + "epoch": 0.0656525220176141, + "learning_rate": 4.072512257151546e-06, + "loss": 0.2922, + "step": 328 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.0837733762226584e-06, + "loss": 0.0793, + "step": 330 + }, + { + "epoch": 0.06645316253002402, + "learning_rate": 4.095046049812541e-06, + "loss": 0.2065, + "step": 332 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.106330255905417e-06, + "loss": 0.0293, + "step": 334 + }, + { + "epoch": 0.06725380304243395, + "learning_rate": 4.117625972462988e-06, + "loss": 0.0397, + "step": 336 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.128933177424475e-06, + "loss": 0.2227, + "step": 338 + }, + { + "epoch": 0.06805444355484387, + "learning_rate": 4.1402518487066624e-06, + "loss": 0.4014, + "step": 340 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.151581964203924e-06, + "loss": 0.5289, + "step": 342 + }, + { + "epoch": 0.0688550840672538, + "learning_rate": 4.1629235017883285e-06, + "loss": 0.0356, + "step": 344 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.174276439309593e-06, + "loss": 0.0765, + "step": 346 + }, + { + "epoch": 0.06965572457966374, + "learning_rate": 4.1856407545951825e-06, + "loss": 0.0174, + "step": 348 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.197016425450347e-06, + "loss": 0.0532, + "step": 350 + }, + { + "epoch": 0.07045636509207366, + "learning_rate": 4.208403429658151e-06, + "loss": 0.3923, + "step": 352 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.219801744979517e-06, + "loss": 0.1223, + "step": 354 + }, + { + "epoch": 0.07125700560448359, + "learning_rate": 4.2312113491533145e-06, + "loss": 0.1502, + "step": 356 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.242632219896328e-06, + "loss": 0.0199, + "step": 358 + }, + { + "epoch": 0.07205764611689351, + "learning_rate": 4.254064334903347e-06, + "loss": 0.2017, + "step": 360 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.2655076718472045e-06, + "loss": 0.013, + "step": 362 + }, + { + "epoch": 0.07285828662930344, + "learning_rate": 4.276962208378814e-06, + "loss": 0.0144, + "step": 364 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.28842792212722e-06, + "loss": 0.5861, + "step": 366 + }, + { + "epoch": 0.07365892714171338, + "learning_rate": 4.299904790699619e-06, + "loss": 0.1492, + "step": 368 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3113927916814665e-06, + "loss": 0.1093, + "step": 370 + }, + { + "epoch": 0.0744595676541233, + "learning_rate": 4.3228919026364345e-06, + "loss": 0.0634, + "step": 372 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.33440210110651e-06, + "loss": 0.2735, + "step": 374 + }, + { + "epoch": 0.07526020816653323, + "learning_rate": 4.345923364612024e-06, + "loss": 0.0113, + "step": 376 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.3574556706517035e-06, + "loss": 0.0036, + "step": 378 + }, + { + "epoch": 0.07606084867894315, + "learning_rate": 4.368998996702686e-06, + "loss": 0.6036, + "step": 380 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.380553320220638e-06, + "loss": 0.0076, + "step": 382 + }, + { + "epoch": 0.07686148919135308, + "learning_rate": 4.392118618639698e-06, + "loss": 0.2191, + "step": 384 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.403694869372589e-06, + "loss": 0.0467, + "step": 386 + }, + { + "epoch": 0.07766212970376302, + "learning_rate": 4.415282049810643e-06, + "loss": 0.253, + "step": 388 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4268801373238454e-06, + "loss": 0.1936, + "step": 390 + }, + { + "epoch": 0.07846277021617294, + "learning_rate": 4.4384891092608795e-06, + "loss": 0.1942, + "step": 392 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.450108942949158e-06, + "loss": 0.0363, + "step": 394 + }, + { + "epoch": 0.07926341072858287, + "learning_rate": 4.461739615694921e-06, + "loss": 0.132, + "step": 396 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.473381104783201e-06, + "loss": 0.0027, + "step": 398 + }, + { + "epoch": 0.08006405124099279, + "learning_rate": 4.485033387477915e-06, + "loss": 0.026, + "step": 400 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.496696441021904e-06, + "loss": 0.0318, + "step": 402 + }, + { + "epoch": 0.08086469175340272, + "learning_rate": 4.5083702426369715e-06, + "loss": 0.0816, + "step": 404 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.520054769523929e-06, + "loss": 0.3522, + "step": 406 + }, + { + "epoch": 0.08166533226581266, + "learning_rate": 4.531749998862628e-06, + "loss": 0.1673, + "step": 408 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.543455907812063e-06, + "loss": 0.0567, + "step": 410 + }, + { + "epoch": 0.08246597277822258, + "learning_rate": 4.555172473510324e-06, + "loss": 0.2197, + "step": 412 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.566899673074706e-06, + "loss": 0.0265, + "step": 414 + }, + { + "epoch": 0.08326661329063251, + "learning_rate": 4.578637483601732e-06, + "loss": 0.0584, + "step": 416 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.590385882167206e-06, + "loss": 0.1447, + "step": 418 + }, + { + "epoch": 0.08406725380304243, + "learning_rate": 4.602144845826234e-06, + "loss": 0.0639, + "step": 420 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.613914351613337e-06, + "loss": 0.0346, + "step": 422 + }, + { + "epoch": 0.08486789431545236, + "learning_rate": 4.625694376542399e-06, + "loss": 0.1671, + "step": 424 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.637484897606777e-06, + "loss": 0.0419, + "step": 426 + }, + { + "epoch": 0.08566853482786228, + "learning_rate": 4.649285891779326e-06, + "loss": 0.0115, + "step": 428 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.661097336012451e-06, + "loss": 0.3203, + "step": 430 + }, + { + "epoch": 0.08646917534027222, + "learning_rate": 4.672919207238145e-06, + "loss": 0.0073, + "step": 432 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.684751482368022e-06, + "loss": 0.4947, + "step": 434 + }, + { + "epoch": 0.08726981585268215, + "learning_rate": 4.696594138293421e-06, + "loss": 0.0135, + "step": 436 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.7084471518853656e-06, + "loss": 0.685, + "step": 438 + }, + { + "epoch": 0.08807045636509207, + "learning_rate": 4.720310499994664e-06, + "loss": 0.0406, + "step": 440 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.732184159451937e-06, + "loss": 0.1383, + "step": 442 + }, + { + "epoch": 0.088871096877502, + "learning_rate": 4.744068107067673e-06, + "loss": 0.4203, + "step": 444 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.755962319632249e-06, + "loss": 0.0013, + "step": 446 + }, + { + "epoch": 0.08967173738991192, + "learning_rate": 4.767866773916041e-06, + "loss": 0.3312, + "step": 448 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.779781446669376e-06, + "loss": 0.0158, + "step": 450 + }, + { + "epoch": 0.09047237790232186, + "learning_rate": 4.79170631462264e-06, + "loss": 0.0588, + "step": 452 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.8036413544863095e-06, + "loss": 0.1779, + "step": 454 + }, + { + "epoch": 0.09127301841473179, + "learning_rate": 4.81558654295099e-06, + "loss": 0.3094, + "step": 456 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.827541856687471e-06, + "loss": 0.0115, + "step": 458 + }, + { + "epoch": 0.09207365892714171, + "learning_rate": 4.839507272346751e-06, + "loss": 0.1442, + "step": 460 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.8514827665601425e-06, + "loss": 0.1322, + "step": 462 + }, + { + "epoch": 0.09287429943955164, + "learning_rate": 4.863468315939234e-06, + "loss": 0.2644, + "step": 464 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.875463897075985e-06, + "loss": 0.0725, + "step": 466 + }, + { + "epoch": 0.09367493995196156, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.1148, + "step": 468 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.899485060892404e-06, + "loss": 0.0078, + "step": 470 + }, + { + "epoch": 0.0944755804643715, + "learning_rate": 4.911510596658202e-06, + "loss": 0.2896, + "step": 472 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.9235460703540615e-06, + "loss": 0.0285, + "step": 474 + }, + { + "epoch": 0.09527622097678143, + "learning_rate": 4.935591458474425e-06, + "loss": 0.0535, + "step": 476 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.947646737494389e-06, + "loss": 0.0429, + "step": 478 + }, + { + "epoch": 0.09607686148919135, + "learning_rate": 4.959711883869734e-06, + "loss": 0.0088, + "step": 480 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9717868740369645e-06, + "loss": 0.119, + "step": 482 + }, + { + "epoch": 0.09687750200160128, + "learning_rate": 4.9838716844133665e-06, + "loss": 0.0254, + "step": 484 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 4.9959662913970254e-06, + "loss": 0.0075, + "step": 486 + }, + { + "epoch": 0.0976781425140112, + "learning_rate": 5.0080706713669435e-06, + "loss": 0.0931, + "step": 488 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.02018480068299e-06, + "loss": 0.0364, + "step": 490 + }, + { + "epoch": 0.09847878302642114, + "learning_rate": 5.032308655686007e-06, + "loss": 0.1071, + "step": 492 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.044442212697842e-06, + "loss": 0.2257, + "step": 494 + }, + { + "epoch": 0.09927942353883107, + "learning_rate": 5.056585448021398e-06, + "loss": 0.0042, + "step": 496 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.068738337940655e-06, + "loss": 0.0156, + "step": 498 + }, + { + "epoch": 0.100080064051241, + "learning_rate": 5.080900858720789e-06, + "loss": 0.3075, + "step": 500 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.093072986608116e-06, + "loss": 0.1653, + "step": 502 + }, + { + "epoch": 0.10088070456365092, + "learning_rate": 5.105254697830208e-06, + "loss": 0.1079, + "step": 504 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.1174459685959175e-06, + "loss": 0.0202, + "step": 506 + }, + { + "epoch": 0.10168134507606084, + "learning_rate": 5.129646775095432e-06, + "loss": 0.2227, + "step": 508 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.141857093500307e-06, + "loss": 0.4367, + "step": 510 + }, + { + "epoch": 0.10248198558847077, + "learning_rate": 5.154076899963514e-06, + "loss": 0.1199, + "step": 512 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.166306170619537e-06, + "loss": 0.2441, + "step": 514 + }, + { + "epoch": 0.10328262610088071, + "learning_rate": 5.178544881584328e-06, + "loss": 0.0141, + "step": 516 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.190793008955421e-06, + "loss": 0.0299, + "step": 518 + }, + { + "epoch": 0.10408326661329063, + "learning_rate": 5.203050528811959e-06, + "loss": 0.2882, + "step": 520 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.215317417214739e-06, + "loss": 0.1079, + "step": 522 + }, + { + "epoch": 0.10488390712570056, + "learning_rate": 5.227593650206246e-06, + "loss": 0.1298, + "step": 524 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.239879203810763e-06, + "loss": 0.0454, + "step": 526 + }, + { + "epoch": 0.10568454763811048, + "learning_rate": 5.2521740540343205e-06, + "loss": 0.4498, + "step": 528 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.264478176864811e-06, + "loss": 0.0684, + "step": 530 + }, + { + "epoch": 0.10648518815052041, + "learning_rate": 5.2767915482720164e-06, + "loss": 0.229, + "step": 532 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.289114144207656e-06, + "loss": 0.0021, + "step": 534 + }, + { + "epoch": 0.10728582866293035, + "learning_rate": 5.3014459406054295e-06, + "loss": 0.0355, + "step": 536 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.313786913381061e-06, + "loss": 0.1513, + "step": 538 + }, + { + "epoch": 0.10808646917534027, + "learning_rate": 5.3261370384323904e-06, + "loss": 0.2105, + "step": 540 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.338496291639341e-06, + "loss": 0.0444, + "step": 542 + }, + { + "epoch": 0.1088871096877502, + "learning_rate": 5.350864648864026e-06, + "loss": 0.0692, + "step": 544 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.363242085950773e-06, + "loss": 0.0169, + "step": 546 + }, + { + "epoch": 0.10968775020016013, + "learning_rate": 5.375628578726181e-06, + "loss": 0.0879, + "step": 548 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.3880241029991434e-06, + "loss": 0.0717, + "step": 550 + }, + { + "epoch": 0.11048839071257005, + "learning_rate": 5.4004286345609665e-06, + "loss": 0.111, + "step": 552 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.412842149185316e-06, + "loss": 0.0848, + "step": 554 + }, + { + "epoch": 0.11128903122497999, + "learning_rate": 5.425264622628326e-06, + "loss": 0.0137, + "step": 556 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.437696030628639e-06, + "loss": 0.2116, + "step": 558 + }, + { + "epoch": 0.11208967173738991, + "learning_rate": 5.450136348907444e-06, + "loss": 0.0087, + "step": 560 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.462585553168532e-06, + "loss": 0.0262, + "step": 562 + }, + { + "epoch": 0.11289031224979984, + "learning_rate": 5.475043619098321e-06, + "loss": 0.1652, + "step": 564 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.487510522365969e-06, + "loss": 0.1214, + "step": 566 + }, + { + "epoch": 0.11369095276220977, + "learning_rate": 5.499986238623329e-06, + "loss": 0.1847, + "step": 568 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.512470743505057e-06, + "loss": 0.0048, + "step": 570 + }, + { + "epoch": 0.11449159327461969, + "learning_rate": 5.524964012628644e-06, + "loss": 0.011, + "step": 572 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.537466021594464e-06, + "loss": 0.0106, + "step": 574 + }, + { + "epoch": 0.11529223378702963, + "learning_rate": 5.549976745985809e-06, + "loss": 0.3321, + "step": 576 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.5624961613689934e-06, + "loss": 0.1939, + "step": 578 + }, + { + "epoch": 0.11609287429943956, + "learning_rate": 5.57502424329331e-06, + "loss": 0.0094, + "step": 580 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.5875609672911465e-06, + "loss": 0.0067, + "step": 582 + }, + { + "epoch": 0.11689351481184948, + "learning_rate": 5.6001063088780085e-06, + "loss": 0.0152, + "step": 584 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.6126602435525725e-06, + "loss": 0.0199, + "step": 586 + }, + { + "epoch": 0.1176941553242594, + "learning_rate": 5.62522274679673e-06, + "loss": 0.0116, + "step": 588 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.637793794075625e-06, + "loss": 0.1037, + "step": 590 + }, + { + "epoch": 0.11849479583666933, + "learning_rate": 5.650373360837763e-06, + "loss": 0.0014, + "step": 592 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.662961422514961e-06, + "loss": 0.1492, + "step": 594 + }, + { + "epoch": 0.11929543634907927, + "learning_rate": 5.675557954522462e-06, + "loss": 0.0122, + "step": 596 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.688162932258965e-06, + "loss": 0.0058, + "step": 598 + }, + { + "epoch": 0.1200960768614892, + "learning_rate": 5.700776331106674e-06, + "loss": 0.0401, + "step": 600 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.713398126431353e-06, + "loss": 0.0344, + "step": 602 + }, + { + "epoch": 0.12089671737389912, + "learning_rate": 5.726028293582342e-06, + "loss": 0.1736, + "step": 604 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.738666807892684e-06, + "loss": 0.3692, + "step": 606 + }, + { + "epoch": 0.12169735788630905, + "learning_rate": 5.751313644679071e-06, + "loss": 0.0052, + "step": 608 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.763968779241957e-06, + "loss": 0.0811, + "step": 610 + }, + { + "epoch": 0.12249799839871897, + "learning_rate": 5.776632186865589e-06, + "loss": 0.039, + "step": 612 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.7893038428180584e-06, + "loss": 0.1314, + "step": 614 + }, + { + "epoch": 0.1232986389111289, + "learning_rate": 5.8019837223513295e-06, + "loss": 0.2696, + "step": 616 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.814671800701357e-06, + "loss": 0.9439, + "step": 618 + }, + { + "epoch": 0.12409927942353884, + "learning_rate": 5.827368053088032e-06, + "loss": 0.193, + "step": 620 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.840072454715297e-06, + "loss": 0.0481, + "step": 622 + }, + { + "epoch": 0.12489991993594876, + "learning_rate": 5.852784980771182e-06, + "loss": 0.0055, + "step": 624 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.865505606427848e-06, + "loss": 0.0108, + "step": 626 + }, + { + "epoch": 0.1257005604483587, + "learning_rate": 5.878234306841637e-06, + "loss": 0.5902, + "step": 628 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.890971057153105e-06, + "loss": 0.0096, + "step": 630 + }, + { + "epoch": 0.1265012009607686, + "learning_rate": 5.903715832487138e-06, + "loss": 0.0293, + "step": 632 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.916468607952892e-06, + "loss": 0.0053, + "step": 634 + }, + { + "epoch": 0.12730184147317855, + "learning_rate": 5.929229358643925e-06, + "loss": 0.0121, + "step": 636 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.941998059638212e-06, + "loss": 0.5321, + "step": 638 + }, + { + "epoch": 0.12810248198558846, + "learning_rate": 5.954774685998206e-06, + "loss": 0.7033, + "step": 640 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9675592127708585e-06, + "loss": 0.7663, + "step": 642 + }, + { + "epoch": 0.1289031224979984, + "learning_rate": 5.9803516149877475e-06, + "loss": 0.0427, + "step": 644 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 5.993151867665015e-06, + "loss": 0.0254, + "step": 646 + }, + { + "epoch": 0.1297037630104083, + "learning_rate": 6.005959945803494e-06, + "loss": 0.0021, + "step": 648 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.01877582438873e-06, + "loss": 0.0121, + "step": 650 + }, + { + "epoch": 0.13050440352281825, + "learning_rate": 6.03159947839103e-06, + "loss": 0.0021, + "step": 652 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.0444308827655265e-06, + "loss": 0.0148, + "step": 654 + }, + { + "epoch": 0.1313050440352282, + "learning_rate": 6.057270012452186e-06, + "loss": 0.0039, + "step": 656 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.070116842375947e-06, + "loss": 0.096, + "step": 658 + }, + { + "epoch": 0.1321056845476381, + "learning_rate": 6.082971347446654e-06, + "loss": 0.053, + "step": 660 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.095833502559182e-06, + "loss": 0.0664, + "step": 662 + }, + { + "epoch": 0.13290632506004804, + "learning_rate": 6.108703282593461e-06, + "loss": 0.1844, + "step": 664 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.121580662414533e-06, + "loss": 0.0128, + "step": 666 + }, + { + "epoch": 0.13370696557245795, + "learning_rate": 6.13446561687258e-06, + "loss": 0.035, + "step": 668 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.147358120803041e-06, + "loss": 0.0232, + "step": 670 + }, + { + "epoch": 0.1345076060848679, + "learning_rate": 6.160258149026557e-06, + "loss": 0.0639, + "step": 672 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.173165676349095e-06, + "loss": 0.0117, + "step": 674 + }, + { + "epoch": 0.13530824659727783, + "learning_rate": 6.186080677561974e-06, + "loss": 0.0451, + "step": 676 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.1990031274419186e-06, + "loss": 0.2309, + "step": 678 + }, + { + "epoch": 0.13610888710968774, + "learning_rate": 6.2119330007511014e-06, + "loss": 0.0424, + "step": 680 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.224870272237185e-06, + "loss": 0.0111, + "step": 682 + }, + { + "epoch": 0.13690952762209768, + "learning_rate": 6.237814916633431e-06, + "loss": 0.7465, + "step": 684 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.250766908658652e-06, + "loss": 0.0742, + "step": 686 + }, + { + "epoch": 0.1377101681345076, + "learning_rate": 6.263726223017326e-06, + "loss": 0.2684, + "step": 688 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.2766928343996314e-06, + "loss": 0.2153, + "step": 690 + }, + { + "epoch": 0.13851080864691753, + "learning_rate": 6.289666717481496e-06, + "loss": 1.194, + "step": 692 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.3026478469246285e-06, + "loss": 0.0025, + "step": 694 + }, + { + "epoch": 0.13931144915932747, + "learning_rate": 6.315636197376634e-06, + "loss": 0.0239, + "step": 696 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.328631743470968e-06, + "loss": 0.0228, + "step": 698 + }, + { + "epoch": 0.14011208967173738, + "learning_rate": 6.341634459827044e-06, + "loss": 0.0141, + "step": 700 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.354644321050279e-06, + "loss": 0.0201, + "step": 702 + }, + { + "epoch": 0.14091273018414732, + "learning_rate": 6.3676613017321305e-06, + "loss": 0.0072, + "step": 704 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.380685376450153e-06, + "loss": 0.0333, + "step": 706 + }, + { + "epoch": 0.14171337069655723, + "learning_rate": 6.393716519768032e-06, + "loss": 0.0259, + "step": 708 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.406754706235692e-06, + "loss": 0.041, + "step": 710 + }, + { + "epoch": 0.14251401120896717, + "learning_rate": 6.419799910389257e-06, + "loss": 0.0144, + "step": 712 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.432852106751162e-06, + "loss": 0.0424, + "step": 714 + }, + { + "epoch": 0.1433146517213771, + "learning_rate": 6.445911269830183e-06, + "loss": 0.0047, + "step": 716 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.458977374121492e-06, + "loss": 0.0576, + "step": 718 + }, + { + "epoch": 0.14411529223378702, + "learning_rate": 6.472050394106689e-06, + "loss": 0.2847, + "step": 720 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.485130304253915e-06, + "loss": 0.2511, + "step": 722 + }, + { + "epoch": 0.14491593274619696, + "learning_rate": 6.498217079017806e-06, + "loss": 0.1441, + "step": 724 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.511310692839605e-06, + "loss": 0.0048, + "step": 726 + }, + { + "epoch": 0.14571657325860687, + "learning_rate": 6.524411120147204e-06, + "loss": 0.0084, + "step": 728 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.537518335355182e-06, + "loss": 0.1234, + "step": 730 + }, + { + "epoch": 0.1465172137710168, + "learning_rate": 6.5506323128648654e-06, + "loss": 0.6444, + "step": 732 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.563753027064355e-06, + "loss": 0.2685, + "step": 734 + }, + { + "epoch": 0.14731785428342675, + "learning_rate": 6.576880452328645e-06, + "loss": 0.0141, + "step": 736 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.590014563019571e-06, + "loss": 0.0336, + "step": 738 + }, + { + "epoch": 0.14811849479583666, + "learning_rate": 6.603155333485934e-06, + "loss": 0.1474, + "step": 740 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.61630273806352e-06, + "loss": 0.0416, + "step": 742 + }, + { + "epoch": 0.1489191353082466, + "learning_rate": 6.6294567510751675e-06, + "loss": 0.1148, + "step": 744 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.642617346830784e-06, + "loss": 0.0863, + "step": 746 + }, + { + "epoch": 0.14971977582065651, + "learning_rate": 6.655784499627476e-06, + "loss": 0.016, + "step": 748 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.6689581837494925e-06, + "loss": 0.088, + "step": 750 + }, + { + "epoch": 0.15052041633306645, + "learning_rate": 6.682138373468341e-06, + "loss": 0.0487, + "step": 752 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.695325043042827e-06, + "loss": 0.1758, + "step": 754 + }, + { + "epoch": 0.1513210568454764, + "learning_rate": 6.7085181667191e-06, + "loss": 0.1013, + "step": 756 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.7217177187307e-06, + "loss": 0.0393, + "step": 758 + }, + { + "epoch": 0.1521216973578863, + "learning_rate": 6.734923673298605e-06, + "loss": 0.1555, + "step": 760 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.748136004631327e-06, + "loss": 0.1211, + "step": 762 + }, + { + "epoch": 0.15292233787029624, + "learning_rate": 6.761354686924883e-06, + "loss": 0.0077, + "step": 764 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.774579694362902e-06, + "loss": 0.0592, + "step": 766 + }, + { + "epoch": 0.15372297838270615, + "learning_rate": 6.787811001116654e-06, + "loss": 0.4267, + "step": 768 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.801048581345113e-06, + "loss": 0.0706, + "step": 770 + }, + { + "epoch": 0.1545236188951161, + "learning_rate": 6.8142924091949955e-06, + "loss": 0.0115, + "step": 772 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.827542458800804e-06, + "loss": 0.0537, + "step": 774 + }, + { + "epoch": 0.15532425940752603, + "learning_rate": 6.840798704284939e-06, + "loss": 0.0022, + "step": 776 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.854061119757647e-06, + "loss": 0.0178, + "step": 778 + }, + { + "epoch": 0.15612489991993594, + "learning_rate": 6.867329679317144e-06, + "loss": 0.0285, + "step": 780 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.880604357049646e-06, + "loss": 0.0053, + "step": 782 + }, + { + "epoch": 0.15692554043234588, + "learning_rate": 6.893885127029419e-06, + "loss": 0.5322, + "step": 784 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.907171963318815e-06, + "loss": 0.6383, + "step": 786 + }, + { + "epoch": 0.1577261809447558, + "learning_rate": 6.920464839968391e-06, + "loss": 0.5201, + "step": 788 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.9337637310168494e-06, + "loss": 0.3529, + "step": 790 + }, + { + "epoch": 0.15852682145716573, + "learning_rate": 6.94706861049117e-06, + "loss": 0.0111, + "step": 792 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.960379452406636e-06, + "loss": 0.5342, + "step": 794 + }, + { + "epoch": 0.15932746196957567, + "learning_rate": 6.973696230766884e-06, + "loss": 0.036, + "step": 796 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 6.9870189195639595e-06, + "loss": 0.014, + "step": 798 + }, + { + "epoch": 0.16012810248198558, + "learning_rate": 7.000347492778341e-06, + "loss": 0.1846, + "step": 800 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.013681924379073e-06, + "loss": 0.0368, + "step": 802 + }, + { + "epoch": 0.16092874299439552, + "learning_rate": 7.027022188323704e-06, + "loss": 0.1067, + "step": 804 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.040368258558412e-06, + "loss": 0.0144, + "step": 806 + }, + { + "epoch": 0.16172938350680544, + "learning_rate": 7.05372010901803e-06, + "loss": 0.0633, + "step": 808 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.0670777136261035e-06, + "loss": 0.063, + "step": 810 + }, + { + "epoch": 0.16253002401921537, + "learning_rate": 7.080441046294945e-06, + "loss": 0.2441, + "step": 812 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.093810080925657e-06, + "loss": 0.0668, + "step": 814 + }, + { + "epoch": 0.1633306645316253, + "learning_rate": 7.1071847914082605e-06, + "loss": 0.0031, + "step": 816 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.120565151621638e-06, + "loss": 0.0057, + "step": 818 + }, + { + "epoch": 0.16413130504403523, + "learning_rate": 7.133951135433656e-06, + "loss": 0.0164, + "step": 820 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.1473427167012e-06, + "loss": 0.7306, + "step": 822 + }, + { + "epoch": 0.16493194555644516, + "learning_rate": 7.160739869270219e-06, + "loss": 0.1253, + "step": 824 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.1741425669757854e-06, + "loss": 0.0289, + "step": 826 + }, + { + "epoch": 0.16573258606885508, + "learning_rate": 7.18755078364214e-06, + "loss": 0.0144, + "step": 828 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.200964493082727e-06, + "loss": 0.0761, + "step": 830 + }, + { + "epoch": 0.16653322658126501, + "learning_rate": 7.214383669100317e-06, + "loss": 0.3868, + "step": 832 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.227808285486952e-06, + "loss": 0.014, + "step": 834 + }, + { + "epoch": 0.16733386709367493, + "learning_rate": 7.241238316024064e-06, + "loss": 0.0099, + "step": 836 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.254673734482513e-06, + "loss": 0.3261, + "step": 838 + }, + { + "epoch": 0.16813450760608487, + "learning_rate": 7.268114514622635e-06, + "loss": 0.2844, + "step": 840 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2815606301942945e-06, + "loss": 0.0691, + "step": 842 + }, + { + "epoch": 0.1689351481184948, + "learning_rate": 7.2950120549369204e-06, + "loss": 0.0652, + "step": 844 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.308468762579623e-06, + "loss": 0.0158, + "step": 846 + }, + { + "epoch": 0.16973578863090472, + "learning_rate": 7.321930726841144e-06, + "loss": 0.0365, + "step": 848 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.3353979214299765e-06, + "loss": 0.005, + "step": 850 + }, + { + "epoch": 0.17053642914331466, + "learning_rate": 7.348870320044395e-06, + "loss": 0.1944, + "step": 852 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.362347896372515e-06, + "loss": 0.0794, + "step": 854 + }, + { + "epoch": 0.17133706965572457, + "learning_rate": 7.375830624092336e-06, + "loss": 0.3105, + "step": 856 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.389318476871784e-06, + "loss": 0.0019, + "step": 858 + }, + { + "epoch": 0.1721377101681345, + "learning_rate": 7.402811428368824e-06, + "loss": 0.4458, + "step": 860 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.416309452231411e-06, + "loss": 0.0178, + "step": 862 + }, + { + "epoch": 0.17293835068054444, + "learning_rate": 7.429812522097613e-06, + "loss": 0.0054, + "step": 864 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.443320611595641e-06, + "loss": 0.1459, + "step": 866 + }, + { + "epoch": 0.17373899119295436, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.1011, + "step": 868 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.470351743951061e-06, + "loss": 0.0137, + "step": 870 + }, + { + "epoch": 0.1745396317053643, + "learning_rate": 7.4838747340160475e-06, + "loss": 0.0054, + "step": 872 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.497402638128209e-06, + "loss": 0.2333, + "step": 874 + }, + { + "epoch": 0.1753402722177742, + "learning_rate": 7.510935429867233e-06, + "loss": 0.0795, + "step": 876 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.52447308280329e-06, + "loss": 0.0042, + "step": 878 + }, + { + "epoch": 0.17614091273018415, + "learning_rate": 7.538015570497046e-06, + "loss": 0.0743, + "step": 880 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.551562866499732e-06, + "loss": 0.0172, + "step": 882 + }, + { + "epoch": 0.17694155324259409, + "learning_rate": 7.5651149443531846e-06, + "loss": 0.5775, + "step": 884 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.578671777589884e-06, + "loss": 0.0605, + "step": 886 + }, + { + "epoch": 0.177742193755004, + "learning_rate": 7.592233339733077e-06, + "loss": 0.0771, + "step": 888 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.605799604296721e-06, + "loss": 0.4937, + "step": 890 + }, + { + "epoch": 0.17854283426741394, + "learning_rate": 7.619370544785608e-06, + "loss": 0.06, + "step": 892 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.632946134695396e-06, + "loss": 0.0559, + "step": 894 + }, + { + "epoch": 0.17934347477982385, + "learning_rate": 7.646526347512665e-06, + "loss": 0.0646, + "step": 896 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.660111156714964e-06, + "loss": 0.2121, + "step": 898 + }, + { + "epoch": 0.1801441152922338, + "learning_rate": 7.67370053577085e-06, + "loss": 0.0074, + "step": 900 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.687294458140006e-06, + "loss": 0.585, + "step": 902 + }, + { + "epoch": 0.18094475580464373, + "learning_rate": 7.70089289727319e-06, + "loss": 0.9582, + "step": 904 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.714495826612353e-06, + "loss": 0.0692, + "step": 906 + }, + { + "epoch": 0.18174539631705364, + "learning_rate": 7.728103219590684e-06, + "loss": 0.0846, + "step": 908 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.741715049632646e-06, + "loss": 0.3627, + "step": 910 + }, + { + "epoch": 0.18254603682946358, + "learning_rate": 7.755331290154041e-06, + "loss": 0.392, + "step": 912 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.76895191456204e-06, + "loss": 0.2245, + "step": 914 + }, + { + "epoch": 0.1833466773418735, + "learning_rate": 7.7825768962553e-06, + "loss": 0.2652, + "step": 916 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.796206208623925e-06, + "loss": 0.0906, + "step": 918 + }, + { + "epoch": 0.18414731785428343, + "learning_rate": 7.809839825049565e-06, + "loss": 0.4078, + "step": 920 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.82347771890548e-06, + "loss": 0.0276, + "step": 922 + }, + { + "epoch": 0.18494795836669337, + "learning_rate": 7.83711986355656e-06, + "loss": 0.0087, + "step": 924 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.850766232359408e-06, + "loss": 0.5772, + "step": 926 + }, + { + "epoch": 0.18574859887910328, + "learning_rate": 7.864416798662347e-06, + "loss": 0.0604, + "step": 928 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.878071535805564e-06, + "loss": 0.078, + "step": 930 + }, + { + "epoch": 0.18654923939151322, + "learning_rate": 7.891730417121043e-06, + "loss": 0.2474, + "step": 932 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.90539341593269e-06, + "loss": 0.0278, + "step": 934 + }, + { + "epoch": 0.18734987990392313, + "learning_rate": 7.919060505556376e-06, + "loss": 0.0452, + "step": 936 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.932731659299978e-06, + "loss": 0.0051, + "step": 938 + }, + { + "epoch": 0.18815052041633307, + "learning_rate": 7.946406850463435e-06, + "loss": 0.2532, + "step": 940 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.960086052338788e-06, + "loss": 0.0196, + "step": 942 + }, + { + "epoch": 0.188951160928743, + "learning_rate": 7.973769238210291e-06, + "loss": 0.0199, + "step": 944 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 7.987456381354371e-06, + "loss": 0.0717, + "step": 946 + }, + { + "epoch": 0.18975180144115292, + "learning_rate": 8.001147455039737e-06, + "loss": 0.018, + "step": 948 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.01484243252743e-06, + "loss": 0.0143, + "step": 950 + }, + { + "epoch": 0.19055244195356286, + "learning_rate": 8.028541287070858e-06, + "loss": 0.1911, + "step": 952 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.042243991915866e-06, + "loss": 0.009, + "step": 954 + }, + { + "epoch": 0.19135308246597277, + "learning_rate": 8.055950520300756e-06, + "loss": 0.0281, + "step": 956 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.069660845456411e-06, + "loss": 0.009, + "step": 958 + }, + { + "epoch": 0.1921537229783827, + "learning_rate": 8.083374940606256e-06, + "loss": 0.0677, + "step": 960 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.097092778966364e-06, + "loss": 0.0813, + "step": 962 + }, + { + "epoch": 0.19295436349079265, + "learning_rate": 8.110814333745503e-06, + "loss": 0.019, + "step": 964 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.124539578145176e-06, + "loss": 0.6384, + "step": 966 + }, + { + "epoch": 0.19375500400320256, + "learning_rate": 8.138268485359684e-06, + "loss": 0.0413, + "step": 968 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.152001028576158e-06, + "loss": 0.1822, + "step": 970 + }, + { + "epoch": 0.1945556445156125, + "learning_rate": 8.165737180974676e-06, + "loss": 0.0006, + "step": 972 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.179476915728217e-06, + "loss": 0.0172, + "step": 974 + }, + { + "epoch": 0.1953562850280224, + "learning_rate": 8.193220206002785e-06, + "loss": 0.0072, + "step": 976 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.206967024957432e-06, + "loss": 0.0649, + "step": 978 + }, + { + "epoch": 0.19615692554043235, + "learning_rate": 8.220717345744326e-06, + "loss": 0.0767, + "step": 980 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.234471141508773e-06, + "loss": 0.2771, + "step": 982 + }, + { + "epoch": 0.1969575660528423, + "learning_rate": 8.248228385389349e-06, + "loss": 0.1763, + "step": 984 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.261989050517841e-06, + "loss": 0.3352, + "step": 986 + }, + { + "epoch": 0.1977582065652522, + "learning_rate": 8.275753110019367e-06, + "loss": 0.0877, + "step": 988 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.289520537012428e-06, + "loss": 0.0524, + "step": 990 + }, + { + "epoch": 0.19855884707766214, + "learning_rate": 8.303291304608936e-06, + "loss": 0.0023, + "step": 992 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.317065385914285e-06, + "loss": 0.0668, + "step": 994 + }, + { + "epoch": 0.19935948759007205, + "learning_rate": 8.330842754027378e-06, + "loss": 0.1407, + "step": 996 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.344623382040752e-06, + "loss": 0.372, + "step": 998 + }, + { + "epoch": 0.200160128102482, + "learning_rate": 8.358407243040524e-06, + "loss": 0.0704, + "step": 1000 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.372194310106515e-06, + "loss": 0.0038, + "step": 1002 + }, + { + "epoch": 0.20096076861489193, + "learning_rate": 8.385984556312285e-06, + "loss": 0.1285, + "step": 1004 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.399777954725183e-06, + "loss": 0.064, + "step": 1006 + }, + { + "epoch": 0.20176140912730184, + "learning_rate": 8.413574478406386e-06, + "loss": 0.0237, + "step": 1008 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.427374100411022e-06, + "loss": 0.1235, + "step": 1010 + }, + { + "epoch": 0.20256204963971178, + "learning_rate": 8.441176793788106e-06, + "loss": 0.0007, + "step": 1012 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.454982531580687e-06, + "loss": 0.002, + "step": 1014 + }, + { + "epoch": 0.2033626901521217, + "learning_rate": 8.468791286825856e-06, + "loss": 0.006, + "step": 1016 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.482603032554812e-06, + "loss": 0.096, + "step": 1018 + }, + { + "epoch": 0.20416333066453163, + "learning_rate": 8.496417741792922e-06, + "loss": 0.2062, + "step": 1020 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.510235387559738e-06, + "loss": 0.0064, + "step": 1022 + }, + { + "epoch": 0.20496397117694154, + "learning_rate": 8.524055942869135e-06, + "loss": 0.0568, + "step": 1024 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.537879380729254e-06, + "loss": 0.0613, + "step": 1026 + }, + { + "epoch": 0.20576461168935148, + "learning_rate": 8.551705674142616e-06, + "loss": 0.5321, + "step": 1028 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.565534796106175e-06, + "loss": 0.1715, + "step": 1030 + }, + { + "epoch": 0.20656525220176142, + "learning_rate": 8.579366719611353e-06, + "loss": 0.0224, + "step": 1032 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.593201417644091e-06, + "loss": 0.1239, + "step": 1034 + }, + { + "epoch": 0.20736589271417133, + "learning_rate": 8.607038863184952e-06, + "loss": 0.0088, + "step": 1036 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.620879029209093e-06, + "loss": 0.0345, + "step": 1038 + }, + { + "epoch": 0.20816653322658127, + "learning_rate": 8.634721888686368e-06, + "loss": 0.0324, + "step": 1040 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.648567414581372e-06, + "loss": 0.0635, + "step": 1042 + }, + { + "epoch": 0.20896717373899118, + "learning_rate": 8.662415579853495e-06, + "loss": 0.097, + "step": 1044 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.676266357456968e-06, + "loss": 0.0091, + "step": 1046 + }, + { + "epoch": 0.20976781425140112, + "learning_rate": 8.690119720340907e-06, + "loss": 0.332, + "step": 1048 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.703975641449426e-06, + "loss": 0.4779, + "step": 1050 + }, + { + "epoch": 0.21056845476381106, + "learning_rate": 8.717834093721598e-06, + "loss": 0.0128, + "step": 1052 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.731695050091561e-06, + "loss": 0.1246, + "step": 1054 + }, + { + "epoch": 0.21136909527622097, + "learning_rate": 8.74555848348857e-06, + "loss": 0.012, + "step": 1056 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.759424366837035e-06, + "loss": 0.0676, + "step": 1058 + }, + { + "epoch": 0.2121697357886309, + "learning_rate": 8.773292673056572e-06, + "loss": 0.0404, + "step": 1060 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.787163375062113e-06, + "loss": 0.079, + "step": 1062 + }, + { + "epoch": 0.21297037630104082, + "learning_rate": 8.801036445763858e-06, + "loss": 0.0062, + "step": 1064 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.8149118580674e-06, + "loss": 0.0569, + "step": 1066 + }, + { + "epoch": 0.21377101681345076, + "learning_rate": 8.828789584873757e-06, + "loss": 0.0085, + "step": 1068 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.84266959907943e-06, + "loss": 0.0878, + "step": 1070 + }, + { + "epoch": 0.2145716573258607, + "learning_rate": 8.856551873576448e-06, + "loss": 0.0079, + "step": 1072 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.870436381252412e-06, + "loss": 0.3884, + "step": 1074 + }, + { + "epoch": 0.2153722978382706, + "learning_rate": 8.884323094990613e-06, + "loss": 0.1615, + "step": 1076 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.89821198766998e-06, + "loss": 0.0169, + "step": 1078 + }, + { + "epoch": 0.21617293835068055, + "learning_rate": 8.912103032165206e-06, + "loss": 0.0958, + "step": 1080 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.925996201346779e-06, + "loss": 0.021, + "step": 1082 + }, + { + "epoch": 0.21697357886309046, + "learning_rate": 8.939891468081036e-06, + "loss": 0.0057, + "step": 1084 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.953788805230209e-06, + "loss": 0.0411, + "step": 1086 + }, + { + "epoch": 0.2177742193755004, + "learning_rate": 8.967688185652527e-06, + "loss": 0.0296, + "step": 1088 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.981589582202184e-06, + "loss": 0.0325, + "step": 1090 + }, + { + "epoch": 0.21857485988791034, + "learning_rate": 8.995492967729449e-06, + "loss": 0.0238, + "step": 1092 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.009398315080712e-06, + "loss": 0.0024, + "step": 1094 + }, + { + "epoch": 0.21937550040032025, + "learning_rate": 9.023305597098526e-06, + "loss": 0.0069, + "step": 1096 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.037214786621669e-06, + "loss": 0.1268, + "step": 1098 + }, + { + "epoch": 0.2201761409127302, + "learning_rate": 9.051125856485175e-06, + "loss": 0.0068, + "step": 1100 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.065038779520457e-06, + "loss": 0.0378, + "step": 1102 + }, + { + "epoch": 0.2209767814251401, + "learning_rate": 9.078953528555258e-06, + "loss": 0.004, + "step": 1104 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.092870076413771e-06, + "loss": 0.0429, + "step": 1106 + }, + { + "epoch": 0.22177742193755004, + "learning_rate": 9.106788395916682e-06, + "loss": 0.0082, + "step": 1108 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.120708459881203e-06, + "loss": 0.3261, + "step": 1110 + }, + { + "epoch": 0.22257806244995998, + "learning_rate": 9.134630241121135e-06, + "loss": 1.215, + "step": 1112 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.148553712446971e-06, + "loss": 0.8289, + "step": 1114 + }, + { + "epoch": 0.2233787029623699, + "learning_rate": 9.162478846665854e-06, + "loss": 0.0973, + "step": 1116 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.176405616581694e-06, + "loss": 0.0015, + "step": 1118 + }, + { + "epoch": 0.22417934347477983, + "learning_rate": 9.190333994995208e-06, + "loss": 0.0001, + "step": 1120 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.20426395470397e-06, + "loss": 0.0073, + "step": 1122 + }, + { + "epoch": 0.22497998398718974, + "learning_rate": 9.218195468502469e-06, + "loss": 0.0003, + "step": 1124 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.232128509182136e-06, + "loss": 0.0121, + "step": 1126 + }, + { + "epoch": 0.22578062449959968, + "learning_rate": 9.24606304953148e-06, + "loss": 0.0651, + "step": 1128 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.259999062336021e-06, + "loss": 0.1146, + "step": 1130 + }, + { + "epoch": 0.22658126501200962, + "learning_rate": 9.273936520378426e-06, + "loss": 0.0537, + "step": 1132 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.287875396438536e-06, + "loss": 0.0254, + "step": 1134 + }, + { + "epoch": 0.22738190552441953, + "learning_rate": 9.301815663293426e-06, + "loss": 0.2098, + "step": 1136 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.315757293717432e-06, + "loss": 0.0286, + "step": 1138 + }, + { + "epoch": 0.22818254603682947, + "learning_rate": 9.329700260482286e-06, + "loss": 0.0162, + "step": 1140 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.343644536357053e-06, + "loss": 0.3065, + "step": 1142 + }, + { + "epoch": 0.22898318654923938, + "learning_rate": 9.35759009410826e-06, + "loss": 0.1275, + "step": 1144 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.37153690649993e-06, + "loss": 0.1533, + "step": 1146 + }, + { + "epoch": 0.22978382706164932, + "learning_rate": 9.38548494629364e-06, + "loss": 0.0023, + "step": 1148 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.39943418624856e-06, + "loss": 0.4893, + "step": 1150 + }, + { + "epoch": 0.23058446757405926, + "learning_rate": 9.41338459912151e-06, + "loss": 0.0454, + "step": 1152 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.427336157667062e-06, + "loss": 0.0368, + "step": 1154 + }, + { + "epoch": 0.23138510808646917, + "learning_rate": 9.441288834637507e-06, + "loss": 0.1444, + "step": 1156 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.45524260278296e-06, + "loss": 0.0117, + "step": 1158 + }, + { + "epoch": 0.2321857485988791, + "learning_rate": 9.469197434851414e-06, + "loss": 0.0969, + "step": 1160 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.483153303588777e-06, + "loss": 0.0023, + "step": 1162 + }, + { + "epoch": 0.23298638911128902, + "learning_rate": 9.497110181738935e-06, + "loss": 0.0004, + "step": 1164 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.511068042043785e-06, + "loss": 0.0009, + "step": 1166 + }, + { + "epoch": 0.23378702962369896, + "learning_rate": 9.52502685724336e-06, + "loss": 0.0478, + "step": 1168 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.538986600075773e-06, + "loss": 0.0548, + "step": 1170 + }, + { + "epoch": 0.2345876701361089, + "learning_rate": 9.552947243277342e-06, + "loss": 0.0019, + "step": 1172 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.566908759582633e-06, + "loss": 0.0241, + "step": 1174 + }, + { + "epoch": 0.2353883106485188, + "learning_rate": 9.580871121724498e-06, + "loss": 0.1469, + "step": 1176 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.594834302434123e-06, + "loss": 0.2849, + "step": 1178 + }, + { + "epoch": 0.23618895116092875, + "learning_rate": 9.608798274441153e-06, + "loss": 0.5833, + "step": 1180 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.622763010473628e-06, + "loss": 0.0157, + "step": 1182 + }, + { + "epoch": 0.23698959167333866, + "learning_rate": 9.636728483258116e-06, + "loss": 0.006, + "step": 1184 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.650694665519747e-06, + "loss": 0.5603, + "step": 1186 + }, + { + "epoch": 0.2377902321857486, + "learning_rate": 9.664661529982263e-06, + "loss": 0.002, + "step": 1188 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.678629049368077e-06, + "loss": 0.1111, + "step": 1190 + }, + { + "epoch": 0.23859087269815854, + "learning_rate": 9.692597196398302e-06, + "loss": 0.0066, + "step": 1192 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.706565943792879e-06, + "loss": 0.3765, + "step": 1194 + }, + { + "epoch": 0.23939151321056845, + "learning_rate": 9.720535264270526e-06, + "loss": 0.0876, + "step": 1196 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.734505130548855e-06, + "loss": 0.005, + "step": 1198 + }, + { + "epoch": 0.2401921537229784, + "learning_rate": 9.748475515344416e-06, + "loss": 0.0053, + "step": 1200 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.762446391372746e-06, + "loss": 0.0688, + "step": 1202 + }, + { + "epoch": 0.2409927942353883, + "learning_rate": 9.776417731348403e-06, + "loss": 0.0944, + "step": 1204 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.790389507985091e-06, + "loss": 0.7192, + "step": 1206 + }, + { + "epoch": 0.24179343474779824, + "learning_rate": 9.80436169399561e-06, + "loss": 0.5004, + "step": 1208 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.81833426209198e-06, + "loss": 0.9441, + "step": 1210 + }, + { + "epoch": 0.24259407526020815, + "learning_rate": 9.832307184985473e-06, + "loss": 0.0308, + "step": 1212 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.846280435386668e-06, + "loss": 0.0472, + "step": 1214 + }, + { + "epoch": 0.2433947157726181, + "learning_rate": 9.8602539860055e-06, + "loss": 0.5599, + "step": 1216 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.874227809551307e-06, + "loss": 0.0007, + "step": 1218 + }, + { + "epoch": 0.24419535628502803, + "learning_rate": 9.888201878732946e-06, + "loss": 0.5289, + "step": 1220 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.902176166258738e-06, + "loss": 0.0053, + "step": 1222 + }, + { + "epoch": 0.24499599679743794, + "learning_rate": 9.916150644836596e-06, + "loss": 0.0271, + "step": 1224 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.930125287174061e-06, + "loss": 0.027, + "step": 1226 + }, + { + "epoch": 0.24579663730984788, + "learning_rate": 9.944100065978354e-06, + "loss": 0.1086, + "step": 1228 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.958074953956413e-06, + "loss": 0.0707, + "step": 1230 + }, + { + "epoch": 0.2465972778222578, + "learning_rate": 9.972049923815011e-06, + "loss": 0.2591, + "step": 1232 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.986024948260714e-06, + "loss": 0.2851, + "step": 1234 + }, + { + "epoch": 0.24739791833466773, + "learning_rate": 9.999999999999996e-06, + "loss": 0.2452, + "step": 1236 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0013975051739277e-05, + "loss": 0.1713, + "step": 1238 + }, + { + "epoch": 0.24819855884707767, + "learning_rate": 1.0027950076184982e-05, + "loss": 0.016, + "step": 1240 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.004192504604358e-05, + "loss": 0.0583, + "step": 1242 + }, + { + "epoch": 0.24899919935948758, + "learning_rate": 1.0055899934021637e-05, + "loss": 0.6078, + "step": 1244 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.006987471282593e-05, + "loss": 0.1036, + "step": 1246 + }, + { + "epoch": 0.24979983987189752, + "learning_rate": 1.0083849355163397e-05, + "loss": 0.2092, + "step": 1248 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0097823833741255e-05, + "loss": 0.188, + "step": 1250 + }, + { + "epoch": 0.25060048038430743, + "learning_rate": 1.0111798121267047e-05, + "loss": 0.0553, + "step": 1252 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0125772190448686e-05, + "loss": 0.0974, + "step": 1254 + }, + { + "epoch": 0.2514011208967174, + "learning_rate": 1.0139746013994493e-05, + "loss": 0.0237, + "step": 1256 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.0153719564613327e-05, + "loss": 0.0993, + "step": 1258 + }, + { + "epoch": 0.2522017614091273, + "learning_rate": 1.016769281501452e-05, + "loss": 0.0158, + "step": 1260 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.018166573790801e-05, + "loss": 0.0661, + "step": 1262 + }, + { + "epoch": 0.2530024019215372, + "learning_rate": 1.0195638306004383e-05, + "loss": 0.0999, + "step": 1264 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.0209610492014904e-05, + "loss": 0.0287, + "step": 1266 + }, + { + "epoch": 0.25380304243394713, + "learning_rate": 1.022358226865159e-05, + "loss": 0.0201, + "step": 1268 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0237553608627247e-05, + "loss": 0.0774, + "step": 1270 + }, + { + "epoch": 0.2546036829463571, + "learning_rate": 1.0251524484655577e-05, + "loss": 0.1335, + "step": 1272 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0265494869451138e-05, + "loss": 0.0769, + "step": 1274 + }, + { + "epoch": 0.255404323458767, + "learning_rate": 1.0279464735729467e-05, + "loss": 0.0734, + "step": 1276 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0293434056207114e-05, + "loss": 0.0226, + "step": 1278 + }, + { + "epoch": 0.2562049639711769, + "learning_rate": 1.0307402803601691e-05, + "loss": 0.0714, + "step": 1280 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.0321370950631918e-05, + "loss": 0.0788, + "step": 1282 + }, + { + "epoch": 0.2570056044835869, + "learning_rate": 1.033533847001773e-05, + "loss": 0.1717, + "step": 1284 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0349305334480246e-05, + "loss": 0.0019, + "step": 1286 + }, + { + "epoch": 0.2578062449959968, + "learning_rate": 1.0363271516741877e-05, + "loss": 0.0361, + "step": 1288 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.0377236989526366e-05, + "loss": 0.3517, + "step": 1290 + }, + { + "epoch": 0.2586068855084067, + "learning_rate": 1.039120172555884e-05, + "loss": 0.1751, + "step": 1292 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0405165697565868e-05, + "loss": 0.0172, + "step": 1294 + }, + { + "epoch": 0.2594075260208166, + "learning_rate": 1.0419128878275495e-05, + "loss": 0.0122, + "step": 1296 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0433091240417362e-05, + "loss": 0.532, + "step": 1298 + }, + { + "epoch": 0.2602081665332266, + "learning_rate": 1.0447052756722651e-05, + "loss": 0.4325, + "step": 1300 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.046101339992422e-05, + "loss": 0.0304, + "step": 1302 + }, + { + "epoch": 0.2610088070456365, + "learning_rate": 1.0474973142756632e-05, + "loss": 0.141, + "step": 1304 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0488931957956208e-05, + "loss": 0.0219, + "step": 1306 + }, + { + "epoch": 0.2618094475580464, + "learning_rate": 1.0502889818261058e-05, + "loss": 0.0018, + "step": 1308 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.0516846696411216e-05, + "loss": 0.2653, + "step": 1310 + }, + { + "epoch": 0.2626100880704564, + "learning_rate": 1.053080256514858e-05, + "loss": 0.1198, + "step": 1312 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.054475739721703e-05, + "loss": 0.0021, + "step": 1314 + }, + { + "epoch": 0.2634107285828663, + "learning_rate": 1.0558711165362488e-05, + "loss": 0.2559, + "step": 1316 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0572663842332931e-05, + "loss": 0.0145, + "step": 1318 + }, + { + "epoch": 0.2642113690952762, + "learning_rate": 1.0586615400878484e-05, + "loss": 0.1832, + "step": 1320 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0600565813751433e-05, + "loss": 0.1346, + "step": 1322 + }, + { + "epoch": 0.26501200960768617, + "learning_rate": 1.0614515053706354e-05, + "loss": 0.0096, + "step": 1324 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0628463093500063e-05, + "loss": 0.0197, + "step": 1326 + }, + { + "epoch": 0.2658126501200961, + "learning_rate": 1.0642409905891733e-05, + "loss": 0.2077, + "step": 1328 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.065635546364294e-05, + "loss": 0.0335, + "step": 1330 + }, + { + "epoch": 0.266613290632506, + "learning_rate": 1.0670299739517706e-05, + "loss": 0.0054, + "step": 1332 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0684242706282562e-05, + "loss": 0.0127, + "step": 1334 + }, + { + "epoch": 0.2674139311449159, + "learning_rate": 1.0698184336706567e-05, + "loss": 0.0136, + "step": 1336 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0712124603561457e-05, + "loss": 0.0107, + "step": 1338 + }, + { + "epoch": 0.2682145716573259, + "learning_rate": 1.0726063479621567e-05, + "loss": 0.1213, + "step": 1340 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0740000937663972e-05, + "loss": 0.0017, + "step": 1342 + }, + { + "epoch": 0.2690152121697358, + "learning_rate": 1.0753936950468513e-05, + "loss": 0.0068, + "step": 1344 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0767871490817856e-05, + "loss": 0.5639, + "step": 1346 + }, + { + "epoch": 0.2698158526821457, + "learning_rate": 1.0781804531497525e-05, + "loss": 0.0, + "step": 1348 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0795736045296023e-05, + "loss": 0.0127, + "step": 1350 + }, + { + "epoch": 0.27061649319455566, + "learning_rate": 1.0809666005004787e-05, + "loss": 0.0614, + "step": 1352 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.08235943834183e-05, + "loss": 0.0102, + "step": 1354 + }, + { + "epoch": 0.2714171337069656, + "learning_rate": 1.083752115333414e-05, + "loss": 0.0292, + "step": 1356 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0851446287553022e-05, + "loss": 0.007, + "step": 1358 + }, + { + "epoch": 0.2722177742193755, + "learning_rate": 1.0865369758878858e-05, + "loss": 0.0376, + "step": 1360 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.087929154011879e-05, + "loss": 0.0724, + "step": 1362 + }, + { + "epoch": 0.27301841473178545, + "learning_rate": 1.0893211604083311e-05, + "loss": 0.0271, + "step": 1364 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.090712992358622e-05, + "loss": 0.0107, + "step": 1366 + }, + { + "epoch": 0.27381905524419536, + "learning_rate": 1.0921046471444737e-05, + "loss": 0.2174, + "step": 1368 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0934961220479537e-05, + "loss": 0.024, + "step": 1370 + }, + { + "epoch": 0.2746196957566053, + "learning_rate": 1.0948874143514818e-05, + "loss": 0.2072, + "step": 1372 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0962785213378325e-05, + "loss": 0.1743, + "step": 1374 + }, + { + "epoch": 0.2754203362690152, + "learning_rate": 1.0976694402901467e-05, + "loss": 0.2507, + "step": 1376 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.0990601684919282e-05, + "loss": 0.0042, + "step": 1378 + }, + { + "epoch": 0.27622097678142515, + "learning_rate": 1.1004507032270544e-05, + "loss": 0.0906, + "step": 1380 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1018410417797809e-05, + "loss": 0.5405, + "step": 1382 + }, + { + "epoch": 0.27702161729383507, + "learning_rate": 1.1032311814347467e-05, + "loss": 0.0029, + "step": 1384 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1046211194769784e-05, + "loss": 0.1255, + "step": 1386 + }, + { + "epoch": 0.277822257806245, + "learning_rate": 1.1060108531918955e-05, + "loss": 0.0362, + "step": 1388 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1074003798653215e-05, + "loss": 0.0245, + "step": 1390 + }, + { + "epoch": 0.27862289831865494, + "learning_rate": 1.1087896967834787e-05, + "loss": 0.355, + "step": 1392 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.1101788012330013e-05, + "loss": 0.8935, + "step": 1394 + }, + { + "epoch": 0.27942353883106485, + "learning_rate": 1.111567690500938e-05, + "loss": 0.0288, + "step": 1396 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1129563618747581e-05, + "loss": 0.0604, + "step": 1398 + }, + { + "epoch": 0.28022417934347477, + "learning_rate": 1.1143448126423545e-05, + "loss": 0.0051, + "step": 1400 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1157330400920563e-05, + "loss": 0.1323, + "step": 1402 + }, + { + "epoch": 0.28102481985588473, + "learning_rate": 1.1171210415126238e-05, + "loss": 0.3759, + "step": 1404 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1185088141932594e-05, + "loss": 0.0645, + "step": 1406 + }, + { + "epoch": 0.28182546036829464, + "learning_rate": 1.1198963554236135e-05, + "loss": 0.1871, + "step": 1408 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.121283662493788e-05, + "loss": 0.0799, + "step": 1410 + }, + { + "epoch": 0.28262610088070456, + "learning_rate": 1.122670732694342e-05, + "loss": 0.1952, + "step": 1412 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1240575633162958e-05, + "loss": 0.0062, + "step": 1414 + }, + { + "epoch": 0.28342674139311447, + "learning_rate": 1.1254441516511425e-05, + "loss": 0.0969, + "step": 1416 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1268304949908434e-05, + "loss": 0.2347, + "step": 1418 + }, + { + "epoch": 0.28422738190552443, + "learning_rate": 1.1282165906278395e-05, + "loss": 0.0566, + "step": 1420 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1296024358550565e-05, + "loss": 0.6587, + "step": 1422 + }, + { + "epoch": 0.28502802241793435, + "learning_rate": 1.1309880279659087e-05, + "loss": 0.1107, + "step": 1424 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1323733642543024e-05, + "loss": 0.0233, + "step": 1426 + }, + { + "epoch": 0.28582866293034426, + "learning_rate": 1.1337584420146496e-05, + "loss": 0.4258, + "step": 1428 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.135143258541862e-05, + "loss": 0.225, + "step": 1430 + }, + { + "epoch": 0.2866293034427542, + "learning_rate": 1.1365278111313625e-05, + "loss": 0.1452, + "step": 1432 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.13791209707909e-05, + "loss": 0.0199, + "step": 1434 + }, + { + "epoch": 0.28742994395516414, + "learning_rate": 1.1392961136815041e-05, + "loss": 0.0388, + "step": 1436 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.1406798582355902e-05, + "loss": 0.016, + "step": 1438 + }, + { + "epoch": 0.28823058446757405, + "learning_rate": 1.142063328038864e-05, + "loss": 0.325, + "step": 1440 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1434465203893818e-05, + "loss": 0.0097, + "step": 1442 + }, + { + "epoch": 0.289031224979984, + "learning_rate": 1.1448294325857377e-05, + "loss": 0.3079, + "step": 1444 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.146212061927074e-05, + "loss": 0.1159, + "step": 1446 + }, + { + "epoch": 0.2898318654923939, + "learning_rate": 1.1475944057130856e-05, + "loss": 0.0019, + "step": 1448 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.1489764612440255e-05, + "loss": 0.0135, + "step": 1450 + }, + { + "epoch": 0.29063250600480384, + "learning_rate": 1.150358225820707e-05, + "loss": 0.5323, + "step": 1452 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.151739696744518e-05, + "loss": 0.1129, + "step": 1454 + }, + { + "epoch": 0.29143314651721375, + "learning_rate": 1.1531208713174138e-05, + "loss": 0.4592, + "step": 1456 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1545017468419307e-05, + "loss": 0.2969, + "step": 1458 + }, + { + "epoch": 0.2922337870296237, + "learning_rate": 1.1558823206211887e-05, + "loss": 0.0312, + "step": 1460 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1572625899588972e-05, + "loss": 0.004, + "step": 1462 + }, + { + "epoch": 0.2930344275420336, + "learning_rate": 1.1586425521593607e-05, + "loss": 0.0182, + "step": 1464 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1600222045274809e-05, + "loss": 0.0497, + "step": 1466 + }, + { + "epoch": 0.29383506805444354, + "learning_rate": 1.1614015443687708e-05, + "loss": 0.4583, + "step": 1468 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1627805689893478e-05, + "loss": 0.0885, + "step": 1470 + }, + { + "epoch": 0.2946357085668535, + "learning_rate": 1.1641592756959467e-05, + "loss": 0.0658, + "step": 1472 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1655376617959239e-05, + "loss": 0.2095, + "step": 1474 + }, + { + "epoch": 0.2954363490792634, + "learning_rate": 1.1669157245972616e-05, + "loss": 0.0406, + "step": 1476 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1682934614085708e-05, + "loss": 0.034, + "step": 1478 + }, + { + "epoch": 0.2962369895916733, + "learning_rate": 1.1696708695391057e-05, + "loss": 0.0048, + "step": 1480 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1710479462987565e-05, + "loss": 0.113, + "step": 1482 + }, + { + "epoch": 0.29703763010408324, + "learning_rate": 1.1724246889980626e-05, + "loss": 0.1315, + "step": 1484 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1738010949482152e-05, + "loss": 0.014, + "step": 1486 + }, + { + "epoch": 0.2978382706164932, + "learning_rate": 1.1751771614610643e-05, + "loss": 0.0784, + "step": 1488 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.176552885849122e-05, + "loss": 0.0867, + "step": 1490 + }, + { + "epoch": 0.2986389111289031, + "learning_rate": 1.1779282654255668e-05, + "loss": 0.1079, + "step": 1492 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.1793032975042563e-05, + "loss": 0.0799, + "step": 1494 + }, + { + "epoch": 0.29943955164131303, + "learning_rate": 1.180677979399721e-05, + "loss": 0.5594, + "step": 1496 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1820523084271775e-05, + "loss": 0.0183, + "step": 1498 + }, + { + "epoch": 0.300240192153723, + "learning_rate": 1.1834262819025317e-05, + "loss": 0.0241, + "step": 1500 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1847998971423835e-05, + "loss": 0.0174, + "step": 1502 + }, + { + "epoch": 0.3010408326661329, + "learning_rate": 1.1861731514640309e-05, + "loss": 0.0178, + "step": 1504 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.1875460421854816e-05, + "loss": 0.0147, + "step": 1506 + }, + { + "epoch": 0.3018414731785428, + "learning_rate": 1.188918566625449e-05, + "loss": 0.2881, + "step": 1508 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1902907221033629e-05, + "loss": 0.0856, + "step": 1510 + }, + { + "epoch": 0.3026421136909528, + "learning_rate": 1.1916625059393739e-05, + "loss": 0.0628, + "step": 1512 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1930339154543582e-05, + "loss": 0.0199, + "step": 1514 + }, + { + "epoch": 0.3034427542033627, + "learning_rate": 1.1944049479699241e-05, + "loss": 0.7157, + "step": 1516 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1957756008084127e-05, + "loss": 0.0027, + "step": 1518 + }, + { + "epoch": 0.3042433947157726, + "learning_rate": 1.1971458712929133e-05, + "loss": 0.2221, + "step": 1520 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1985157567472563e-05, + "loss": 0.119, + "step": 1522 + }, + { + "epoch": 0.3050440352281825, + "learning_rate": 1.1998852544960256e-05, + "loss": 0.0015, + "step": 1524 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2012543618645622e-05, + "loss": 0.0073, + "step": 1526 + }, + { + "epoch": 0.3058446757405925, + "learning_rate": 1.2026230761789702e-05, + "loss": 0.0324, + "step": 1528 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2039913947661205e-05, + "loss": 0.0035, + "step": 1530 + }, + { + "epoch": 0.3066453162530024, + "learning_rate": 1.2053593149536557e-05, + "loss": 0.4392, + "step": 1532 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2067268340700016e-05, + "loss": 0.6442, + "step": 1534 + }, + { + "epoch": 0.3074459567654123, + "learning_rate": 1.2080939494443618e-05, + "loss": 0.021, + "step": 1536 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.2094606584067304e-05, + "loss": 0.1435, + "step": 1538 + }, + { + "epoch": 0.3082465972778223, + "learning_rate": 1.210826958287895e-05, + "loss": 0.3032, + "step": 1540 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.212192846419443e-05, + "loss": 0.3523, + "step": 1542 + }, + { + "epoch": 0.3090472377902322, + "learning_rate": 1.2135583201337646e-05, + "loss": 0.0063, + "step": 1544 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2149233767640587e-05, + "loss": 0.0648, + "step": 1546 + }, + { + "epoch": 0.3098478783026421, + "learning_rate": 1.2162880136443434e-05, + "loss": 0.1901, + "step": 1548 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2176522281094514e-05, + "loss": 0.0184, + "step": 1550 + }, + { + "epoch": 0.31064851881505207, + "learning_rate": 1.2190160174950428e-05, + "loss": 0.0188, + "step": 1552 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.220379379137607e-05, + "loss": 0.009, + "step": 1554 + }, + { + "epoch": 0.311449159327462, + "learning_rate": 1.2217423103744692e-05, + "loss": 0.1953, + "step": 1556 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2231048085437953e-05, + "loss": 0.1356, + "step": 1558 + }, + { + "epoch": 0.3122497998398719, + "learning_rate": 1.2244668709845952e-05, + "loss": 0.0103, + "step": 1560 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2258284950367347e-05, + "loss": 0.0657, + "step": 1562 + }, + { + "epoch": 0.3130504403522818, + "learning_rate": 1.2271896780409309e-05, + "loss": 0.2355, + "step": 1564 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.228550417338764e-05, + "loss": 0.3524, + "step": 1566 + }, + { + "epoch": 0.31385108086469177, + "learning_rate": 1.2299107102726804e-05, + "loss": 0.008, + "step": 1568 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2312705541859985e-05, + "loss": 0.3384, + "step": 1570 + }, + { + "epoch": 0.3146517213771017, + "learning_rate": 1.2326299464229143e-05, + "loss": 0.0107, + "step": 1572 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2339888843285029e-05, + "loss": 0.064, + "step": 1574 + }, + { + "epoch": 0.3154523618895116, + "learning_rate": 1.2353473652487329e-05, + "loss": 0.426, + "step": 1576 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2367053865304597e-05, + "loss": 0.004, + "step": 1578 + }, + { + "epoch": 0.31625300240192156, + "learning_rate": 1.2380629455214385e-05, + "loss": 0.3006, + "step": 1580 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2394200395703273e-05, + "loss": 0.0162, + "step": 1582 + }, + { + "epoch": 0.31705364291433147, + "learning_rate": 1.2407766660266916e-05, + "loss": 0.1779, + "step": 1584 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2421328222410109e-05, + "loss": 0.2049, + "step": 1586 + }, + { + "epoch": 0.3178542834267414, + "learning_rate": 1.2434885055646808e-05, + "loss": 0.1294, + "step": 1588 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2448437133500262e-05, + "loss": 0.8116, + "step": 1590 + }, + { + "epoch": 0.31865492393915135, + "learning_rate": 1.2461984429502947e-05, + "loss": 0.0334, + "step": 1592 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2475526917196703e-05, + "loss": 0.174, + "step": 1594 + }, + { + "epoch": 0.31945556445156126, + "learning_rate": 1.2489064570132761e-05, + "loss": 0.0497, + "step": 1596 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2502597361871787e-05, + "loss": 0.5346, + "step": 1598 + }, + { + "epoch": 0.32025620496397117, + "learning_rate": 1.2516125265983945e-05, + "loss": 0.0015, + "step": 1600 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2529648256048931e-05, + "loss": 0.0047, + "step": 1602 + }, + { + "epoch": 0.3210568454763811, + "learning_rate": 1.2543166305656089e-05, + "loss": 0.0655, + "step": 1604 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.2556679388404351e-05, + "loss": 0.3828, + "step": 1606 + }, + { + "epoch": 0.32185748598879105, + "learning_rate": 1.257018747790238e-05, + "loss": 0.3042, + "step": 1608 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.2583690547768584e-05, + "loss": 0.1267, + "step": 1610 + }, + { + "epoch": 0.32265812650120096, + "learning_rate": 1.259718857163117e-05, + "loss": 0.1215, + "step": 1612 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.261068152312821e-05, + "loss": 0.179, + "step": 1614 + }, + { + "epoch": 0.32345876701361087, + "learning_rate": 1.2624169375907657e-05, + "loss": 0.1746, + "step": 1616 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2637652103627481e-05, + "loss": 0.0294, + "step": 1618 + }, + { + "epoch": 0.32425940752602084, + "learning_rate": 1.2651129679955598e-05, + "loss": 0.1699, + "step": 1620 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2664602078570017e-05, + "loss": 0.0205, + "step": 1622 + }, + { + "epoch": 0.32506004803843075, + "learning_rate": 1.2678069273158849e-05, + "loss": 0.0269, + "step": 1624 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2691531237420369e-05, + "loss": 0.0163, + "step": 1626 + }, + { + "epoch": 0.32586068855084066, + "learning_rate": 1.2704987945063073e-05, + "loss": 0.1361, + "step": 1628 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.27184393698057e-05, + "loss": 0.0484, + "step": 1630 + }, + { + "epoch": 0.3266613290632506, + "learning_rate": 1.273188548537736e-05, + "loss": 0.0635, + "step": 1632 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2745326265517481e-05, + "loss": 0.116, + "step": 1634 + }, + { + "epoch": 0.32746196957566054, + "learning_rate": 1.2758761683975929e-05, + "loss": 0.2892, + "step": 1636 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.277219171451304e-05, + "loss": 0.4079, + "step": 1638 + }, + { + "epoch": 0.32826261008807045, + "learning_rate": 1.2785616330899676e-05, + "loss": 0.2038, + "step": 1640 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2799035506917265e-05, + "loss": 0.1826, + "step": 1642 + }, + { + "epoch": 0.32906325060048036, + "learning_rate": 1.2812449216357855e-05, + "loss": 0.4261, + "step": 1644 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2825857433024208e-05, + "loss": 0.0569, + "step": 1646 + }, + { + "epoch": 0.32986389111289033, + "learning_rate": 1.2839260130729776e-05, + "loss": 0.0064, + "step": 1648 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2852657283298794e-05, + "loss": 0.1738, + "step": 1650 + }, + { + "epoch": 0.33066453162530024, + "learning_rate": 1.2866048864566336e-05, + "loss": 0.5639, + "step": 1652 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2879434848378356e-05, + "loss": 0.0677, + "step": 1654 + }, + { + "epoch": 0.33146517213771015, + "learning_rate": 1.2892815208591734e-05, + "loss": 0.207, + "step": 1656 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2906189919074336e-05, + "loss": 0.0617, + "step": 1658 + }, + { + "epoch": 0.3322658126501201, + "learning_rate": 1.2919558953705047e-05, + "loss": 0.0003, + "step": 1660 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.293292228637389e-05, + "loss": 0.1745, + "step": 1662 + }, + { + "epoch": 0.33306645316253003, + "learning_rate": 1.2946279890981966e-05, + "loss": 0.4262, + "step": 1664 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2959631741441583e-05, + "loss": 0.1257, + "step": 1666 + }, + { + "epoch": 0.33386709367493994, + "learning_rate": 1.2972977811676289e-05, + "loss": 0.041, + "step": 1668 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.298631807562092e-05, + "loss": 0.0631, + "step": 1670 + }, + { + "epoch": 0.33466773418734985, + "learning_rate": 1.2999652507221652e-05, + "loss": 0.0238, + "step": 1672 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3012981080436036e-05, + "loss": 0.0107, + "step": 1674 + }, + { + "epoch": 0.3354683746997598, + "learning_rate": 1.3026303769233109e-05, + "loss": 0.0088, + "step": 1676 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3039620547593357e-05, + "loss": 0.0021, + "step": 1678 + }, + { + "epoch": 0.33626901521216973, + "learning_rate": 1.3052931389508822e-05, + "loss": 0.0481, + "step": 1680 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3066236268983143e-05, + "loss": 0.0512, + "step": 1682 + }, + { + "epoch": 0.33706965572457964, + "learning_rate": 1.3079535160031601e-05, + "loss": 0.0025, + "step": 1684 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3092828036681178e-05, + "loss": 0.1257, + "step": 1686 + }, + { + "epoch": 0.3378702962369896, + "learning_rate": 1.3106114872970575e-05, + "loss": 0.2769, + "step": 1688 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.3119395642950348e-05, + "loss": 0.0616, + "step": 1690 + }, + { + "epoch": 0.3386709367493995, + "learning_rate": 1.313267032068285e-05, + "loss": 0.2574, + "step": 1692 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3145938880242346e-05, + "loss": 0.0183, + "step": 1694 + }, + { + "epoch": 0.33947157726180943, + "learning_rate": 1.3159201295715054e-05, + "loss": 0.0205, + "step": 1696 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3172457541199188e-05, + "loss": 0.001, + "step": 1698 + }, + { + "epoch": 0.3402722177742194, + "learning_rate": 1.3185707590804997e-05, + "loss": 0.4886, + "step": 1700 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.3198951418654882e-05, + "loss": 0.0142, + "step": 1702 + }, + { + "epoch": 0.3410728582866293, + "learning_rate": 1.321218899888334e-05, + "loss": 0.2076, + "step": 1704 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.322542030563709e-05, + "loss": 0.127, + "step": 1706 + }, + { + "epoch": 0.3418734987990392, + "learning_rate": 1.3238645313075109e-05, + "loss": 0.0181, + "step": 1708 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.3251863995368665e-05, + "loss": 0.1601, + "step": 1710 + }, + { + "epoch": 0.34267413931144913, + "learning_rate": 1.326507632670139e-05, + "loss": 0.1594, + "step": 1712 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3278282281269293e-05, + "loss": 0.0096, + "step": 1714 + }, + { + "epoch": 0.3434747798238591, + "learning_rate": 1.3291481833280894e-05, + "loss": 0.1763, + "step": 1716 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3304674956957167e-05, + "loss": 0.0147, + "step": 1718 + }, + { + "epoch": 0.344275420336269, + "learning_rate": 1.3317861626531652e-05, + "loss": 0.1088, + "step": 1720 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3331041816250503e-05, + "loss": 0.004, + "step": 1722 + }, + { + "epoch": 0.3450760608486789, + "learning_rate": 1.3344215500372517e-05, + "loss": 0.0042, + "step": 1724 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.335738265316921e-05, + "loss": 0.675, + "step": 1726 + }, + { + "epoch": 0.3458767013610889, + "learning_rate": 1.3370543248924826e-05, + "loss": 0.1237, + "step": 1728 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3383697261936472e-05, + "loss": 0.0011, + "step": 1730 + }, + { + "epoch": 0.3466773418734988, + "learning_rate": 1.3396844666514062e-05, + "loss": 0.0164, + "step": 1732 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3409985436980422e-05, + "loss": 0.0075, + "step": 1734 + }, + { + "epoch": 0.3474779823859087, + "learning_rate": 1.3423119547671348e-05, + "loss": 0.0756, + "step": 1736 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.3436246972935638e-05, + "loss": 0.0125, + "step": 1738 + }, + { + "epoch": 0.3482786228983187, + "learning_rate": 1.344936768713513e-05, + "loss": 0.28, + "step": 1740 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.346248166464481e-05, + "loss": 0.1424, + "step": 1742 + }, + { + "epoch": 0.3490792634107286, + "learning_rate": 1.347558887985279e-05, + "loss": 0.0664, + "step": 1744 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.348868930716039e-05, + "loss": 0.0147, + "step": 1746 + }, + { + "epoch": 0.3498799039231385, + "learning_rate": 1.3501782920982189e-05, + "loss": 0.1745, + "step": 1748 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3514869695746078e-05, + "loss": 0.0179, + "step": 1750 + }, + { + "epoch": 0.3506805444355484, + "learning_rate": 1.3527949605893305e-05, + "loss": 1.0639, + "step": 1752 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3541022625878501e-05, + "loss": 0.0137, + "step": 1754 + }, + { + "epoch": 0.3514811849479584, + "learning_rate": 1.3554088730169812e-05, + "loss": 0.8238, + "step": 1756 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3567147893248833e-05, + "loss": 0.2865, + "step": 1758 + }, + { + "epoch": 0.3522818254603683, + "learning_rate": 1.3580200089610739e-05, + "loss": 0.0201, + "step": 1760 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3593245293764303e-05, + "loss": 0.0084, + "step": 1762 + }, + { + "epoch": 0.3530824659727782, + "learning_rate": 1.3606283480231962e-05, + "loss": 0.014, + "step": 1764 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.361931462354984e-05, + "loss": 0.1741, + "step": 1766 + }, + { + "epoch": 0.35388310648518817, + "learning_rate": 1.3632338698267863e-05, + "loss": 0.2101, + "step": 1768 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3645355678949715e-05, + "loss": 0.0569, + "step": 1770 + }, + { + "epoch": 0.3546837469975981, + "learning_rate": 1.3658365540172948e-05, + "loss": 0.364, + "step": 1772 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.3671368256529026e-05, + "loss": 0.1783, + "step": 1774 + }, + { + "epoch": 0.355484387510008, + "learning_rate": 1.368436380262336e-05, + "loss": 0.1956, + "step": 1776 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3697352153075365e-05, + "loss": 0.1071, + "step": 1778 + }, + { + "epoch": 0.35628502802241796, + "learning_rate": 1.3710333282518497e-05, + "loss": 0.3065, + "step": 1780 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3723307165600361e-05, + "loss": 0.6657, + "step": 1782 + }, + { + "epoch": 0.35708566853482787, + "learning_rate": 1.3736273776982667e-05, + "loss": 0.1771, + "step": 1784 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3749233091341344e-05, + "loss": 0.0499, + "step": 1786 + }, + { + "epoch": 0.3578863090472378, + "learning_rate": 1.3762185083366562e-05, + "loss": 0.4165, + "step": 1788 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3775129727762808e-05, + "loss": 0.2392, + "step": 1790 + }, + { + "epoch": 0.3586869495596477, + "learning_rate": 1.3788066999248893e-05, + "loss": 0.0401, + "step": 1792 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3800996872558075e-05, + "loss": 0.0414, + "step": 1794 + }, + { + "epoch": 0.35948759007205766, + "learning_rate": 1.3813919322438018e-05, + "loss": 0.0621, + "step": 1796 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3826834323650899e-05, + "loss": 0.3924, + "step": 1798 + }, + { + "epoch": 0.3602882305844676, + "learning_rate": 1.3839741850973435e-05, + "loss": 0.0247, + "step": 1800 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3852641879196952e-05, + "loss": 0.0349, + "step": 1802 + }, + { + "epoch": 0.3610888710968775, + "learning_rate": 1.3865534383127413e-05, + "loss": 0.0793, + "step": 1804 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.387841933758546e-05, + "loss": 0.0203, + "step": 1806 + }, + { + "epoch": 0.36188951160928745, + "learning_rate": 1.3891296717406533e-05, + "loss": 0.0062, + "step": 1808 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.3904166497440812e-05, + "loss": 0.0833, + "step": 1810 + }, + { + "epoch": 0.36269015212169736, + "learning_rate": 1.391702865255334e-05, + "loss": 0.0341, + "step": 1812 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3929883157624046e-05, + "loss": 0.0423, + "step": 1814 + }, + { + "epoch": 0.3634907926341073, + "learning_rate": 1.3942729987547808e-05, + "loss": 0.1737, + "step": 1816 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3955569117234468e-05, + "loss": 0.0002, + "step": 1818 + }, + { + "epoch": 0.36429143314651724, + "learning_rate": 1.3968400521608962e-05, + "loss": 0.0027, + "step": 1820 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3981224175611265e-05, + "loss": 0.1103, + "step": 1822 + }, + { + "epoch": 0.36509207365892715, + "learning_rate": 1.3994040054196498e-05, + "loss": 0.0484, + "step": 1824 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4006848132334979e-05, + "loss": 0.4436, + "step": 1826 + }, + { + "epoch": 0.36589271417133706, + "learning_rate": 1.4019648385012245e-05, + "loss": 0.1193, + "step": 1828 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4032440787229135e-05, + "loss": 0.0045, + "step": 1830 + }, + { + "epoch": 0.366693354683747, + "learning_rate": 1.4045225314001789e-05, + "loss": 0.4379, + "step": 1832 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4058001940361781e-05, + "loss": 0.561, + "step": 1834 + }, + { + "epoch": 0.36749399519615694, + "learning_rate": 1.4070770641356069e-05, + "loss": 0.3164, + "step": 1836 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.40835313920471e-05, + "loss": 0.0443, + "step": 1838 + }, + { + "epoch": 0.36829463570856685, + "learning_rate": 1.4096284167512856e-05, + "loss": 0.0709, + "step": 1840 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4109028942846888e-05, + "loss": 0.0116, + "step": 1842 + }, + { + "epoch": 0.36909527622097676, + "learning_rate": 1.4121765693158355e-05, + "loss": 0.2858, + "step": 1844 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4134494393572146e-05, + "loss": 0.0018, + "step": 1846 + }, + { + "epoch": 0.36989591673338673, + "learning_rate": 1.4147215019228813e-05, + "loss": 0.0405, + "step": 1848 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4159927545284697e-05, + "loss": 0.1321, + "step": 1850 + }, + { + "epoch": 0.37069655724579664, + "learning_rate": 1.4172631946911964e-05, + "loss": 0.1739, + "step": 1852 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4185328199298636e-05, + "loss": 0.0016, + "step": 1854 + }, + { + "epoch": 0.37149719775820655, + "learning_rate": 1.4198016277648665e-05, + "loss": 0.1445, + "step": 1856 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4210696157181936e-05, + "loss": 0.3723, + "step": 1858 + }, + { + "epoch": 0.37229783827061647, + "learning_rate": 1.4223367813134406e-05, + "loss": 0.01, + "step": 1860 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4236031220758037e-05, + "loss": 0.0756, + "step": 1862 + }, + { + "epoch": 0.37309847878302643, + "learning_rate": 1.4248686355320922e-05, + "loss": 0.0973, + "step": 1864 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.426133319210731e-05, + "loss": 0.0791, + "step": 1866 + }, + { + "epoch": 0.37389911929543634, + "learning_rate": 1.4273971706417653e-05, + "loss": 0.0001, + "step": 1868 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.4286601873568642e-05, + "loss": 0.3288, + "step": 1870 + }, + { + "epoch": 0.37469975980784626, + "learning_rate": 1.429922366889332e-05, + "loss": 0.0125, + "step": 1872 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.431183706774103e-05, + "loss": 0.4434, + "step": 1874 + }, + { + "epoch": 0.3755004003202562, + "learning_rate": 1.4324442045477534e-05, + "loss": 0.1589, + "step": 1876 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4337038577485035e-05, + "loss": 0.21, + "step": 1878 + }, + { + "epoch": 0.37630104083266613, + "learning_rate": 1.4349626639162231e-05, + "loss": 0.0721, + "step": 1880 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.436220620592437e-05, + "loss": 0.0338, + "step": 1882 + }, + { + "epoch": 0.37710168134507605, + "learning_rate": 1.4374777253203265e-05, + "loss": 0.0966, + "step": 1884 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4387339756447422e-05, + "loss": 0.0253, + "step": 1886 + }, + { + "epoch": 0.377902321857486, + "learning_rate": 1.4399893691121985e-05, + "loss": 0.0037, + "step": 1888 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4412439032708848e-05, + "loss": 0.0104, + "step": 1890 + }, + { + "epoch": 0.3787029623698959, + "learning_rate": 1.4424975756706684e-05, + "loss": 0.0127, + "step": 1892 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4437503838631002e-05, + "loss": 0.1112, + "step": 1894 + }, + { + "epoch": 0.37950360288230583, + "learning_rate": 1.4450023254014185e-05, + "loss": 0.5726, + "step": 1896 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4462533978405529e-05, + "loss": 0.5321, + "step": 1898 + }, + { + "epoch": 0.38030424339471575, + "learning_rate": 1.4475035987371348e-05, + "loss": 0.0733, + "step": 1900 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4487529256494937e-05, + "loss": 0.0811, + "step": 1902 + }, + { + "epoch": 0.3811048839071257, + "learning_rate": 1.4500013761376663e-05, + "loss": 0.2072, + "step": 1904 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4512489477634024e-05, + "loss": 0.0745, + "step": 1906 + }, + { + "epoch": 0.3819055244195356, + "learning_rate": 1.4524956380901674e-05, + "loss": 0.3317, + "step": 1908 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.4537414446831461e-05, + "loss": 0.25, + "step": 1910 + }, + { + "epoch": 0.38270616493194554, + "learning_rate": 1.454986365109255e-05, + "loss": 0.0255, + "step": 1912 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4562303969371357e-05, + "loss": 0.1315, + "step": 1914 + }, + { + "epoch": 0.3835068054443555, + "learning_rate": 1.4574735377371669e-05, + "loss": 0.056, + "step": 1916 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4587157850814679e-05, + "loss": 0.6164, + "step": 1918 + }, + { + "epoch": 0.3843074459567654, + "learning_rate": 1.4599571365439027e-05, + "loss": 0.3517, + "step": 1920 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4611975897000849e-05, + "loss": 0.1739, + "step": 1922 + }, + { + "epoch": 0.3851080864691753, + "learning_rate": 1.4624371421273812e-05, + "loss": 0.0098, + "step": 1924 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.463675791404922e-05, + "loss": 0.108, + "step": 1926 + }, + { + "epoch": 0.3859087269815853, + "learning_rate": 1.4649135351135968e-05, + "loss": 0.0031, + "step": 1928 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4661503708360652e-05, + "loss": 0.057, + "step": 1930 + }, + { + "epoch": 0.3867093674939952, + "learning_rate": 1.4673862961567604e-05, + "loss": 0.4891, + "step": 1932 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4686213086618932e-05, + "loss": 0.1323, + "step": 1934 + }, + { + "epoch": 0.3875100080064051, + "learning_rate": 1.4698554059394563e-05, + "loss": 0.1741, + "step": 1936 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4710885855792338e-05, + "loss": 0.0287, + "step": 1938 + }, + { + "epoch": 0.388310648518815, + "learning_rate": 1.4723208451727977e-05, + "loss": 0.0255, + "step": 1940 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4735521823135184e-05, + "loss": 0.3533, + "step": 1942 + }, + { + "epoch": 0.389111289031225, + "learning_rate": 1.4747825945965675e-05, + "loss": 0.0039, + "step": 1944 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4760120796189233e-05, + "loss": 0.0124, + "step": 1946 + }, + { + "epoch": 0.3899119295436349, + "learning_rate": 1.4772406349793749e-05, + "loss": 0.1452, + "step": 1948 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4784682582785254e-05, + "loss": 0.0324, + "step": 1950 + }, + { + "epoch": 0.3907125700560448, + "learning_rate": 1.4796949471188033e-05, + "loss": 0.1643, + "step": 1952 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4809206991044571e-05, + "loss": 0.0099, + "step": 1954 + }, + { + "epoch": 0.3915132105684548, + "learning_rate": 1.4821455118415666e-05, + "loss": 0.0509, + "step": 1956 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4833693829380458e-05, + "loss": 0.0298, + "step": 1958 + }, + { + "epoch": 0.3923138510808647, + "learning_rate": 1.4845923100036479e-05, + "loss": 0.1315, + "step": 1960 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4858142906499686e-05, + "loss": 0.0135, + "step": 1962 + }, + { + "epoch": 0.3931144915932746, + "learning_rate": 1.4870353224904563e-05, + "loss": 0.0179, + "step": 1964 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4882554031404075e-05, + "loss": 0.3744, + "step": 1966 + }, + { + "epoch": 0.3939151321056846, + "learning_rate": 1.4894745302169786e-05, + "loss": 0.0014, + "step": 1968 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4906927013391879e-05, + "loss": 0.253, + "step": 1970 + }, + { + "epoch": 0.3947157726180945, + "learning_rate": 1.4919099141279205e-05, + "loss": 0.0225, + "step": 1972 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4931261662059338e-05, + "loss": 0.0338, + "step": 1974 + }, + { + "epoch": 0.3955164131305044, + "learning_rate": 1.4943414551978597e-05, + "loss": 0.2072, + "step": 1976 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4955557787302151e-05, + "loss": 0.0784, + "step": 1978 + }, + { + "epoch": 0.3963170536429143, + "learning_rate": 1.4967691344313988e-05, + "loss": 0.5323, + "step": 1980 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.4979815199317005e-05, + "loss": 0.175, + "step": 1982 + }, + { + "epoch": 0.3971176941553243, + "learning_rate": 1.499192932863305e-05, + "loss": 0.4076, + "step": 1984 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5004033708602967e-05, + "loss": 0.3064, + "step": 1986 + }, + { + "epoch": 0.3979183346677342, + "learning_rate": 1.5016128315586626e-05, + "loss": 0.0584, + "step": 1988 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5028213125963029e-05, + "loss": 0.051, + "step": 1990 + }, + { + "epoch": 0.3987189751801441, + "learning_rate": 1.5040288116130261e-05, + "loss": 0.1366, + "step": 1992 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5052353262505603e-05, + "loss": 0.2081, + "step": 1994 + }, + { + "epoch": 0.39951961569255406, + "learning_rate": 1.5064408541525568e-05, + "loss": 0.0229, + "step": 1996 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5076453929645933e-05, + "loss": 0.0083, + "step": 1998 + }, + { + "epoch": 0.400320256204964, + "learning_rate": 1.5088489403341793e-05, + "loss": 0.3206, + "step": 2000 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.510051493910759e-05, + "loss": 0.3443, + "step": 2002 + }, + { + "epoch": 0.4011208967173739, + "learning_rate": 1.5112530513457229e-05, + "loss": 0.6163, + "step": 2004 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.512453610292401e-05, + "loss": 0.0454, + "step": 2006 + }, + { + "epoch": 0.40192153722978385, + "learning_rate": 1.513653168406076e-05, + "loss": 0.0265, + "step": 2008 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.514851723343985e-05, + "loss": 0.1301, + "step": 2010 + }, + { + "epoch": 0.40272217774219377, + "learning_rate": 1.5160492727653245e-05, + "loss": 0.0287, + "step": 2012 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5172458143312522e-05, + "loss": 0.0469, + "step": 2014 + }, + { + "epoch": 0.4035228182546037, + "learning_rate": 1.5184413457049006e-05, + "loss": 0.2463, + "step": 2016 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5196358645513685e-05, + "loss": 0.343, + "step": 2018 + }, + { + "epoch": 0.4043234587670136, + "learning_rate": 1.5208293685377354e-05, + "loss": 0.0077, + "step": 2020 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5220218553330618e-05, + "loss": 0.5335, + "step": 2022 + }, + { + "epoch": 0.40512409927942356, + "learning_rate": 1.5232133226083954e-05, + "loss": 0.0509, + "step": 2024 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5244037680367744e-05, + "loss": 0.5498, + "step": 2026 + }, + { + "epoch": 0.40592473979183347, + "learning_rate": 1.5255931892932322e-05, + "loss": 0.0034, + "step": 2028 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.5267815840548057e-05, + "loss": 0.2442, + "step": 2030 + }, + { + "epoch": 0.4067253803042434, + "learning_rate": 1.527968950000533e-05, + "loss": 0.0582, + "step": 2032 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.529155284811463e-05, + "loss": 0.0548, + "step": 2034 + }, + { + "epoch": 0.40752602081665334, + "learning_rate": 1.5303405861706574e-05, + "loss": 0.1956, + "step": 2036 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.5315248517631975e-05, + "loss": 0.0362, + "step": 2038 + }, + { + "epoch": 0.40832666132906326, + "learning_rate": 1.532708079276185e-05, + "loss": 0.0933, + "step": 2040 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5338902663987544e-05, + "loss": 0.0571, + "step": 2042 + }, + { + "epoch": 0.40912730184147317, + "learning_rate": 1.5350714108220667e-05, + "loss": 0.6302, + "step": 2044 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5362515102393217e-05, + "loss": 0.2525, + "step": 2046 + }, + { + "epoch": 0.4099279423538831, + "learning_rate": 1.5374305623457594e-05, + "loss": 0.1428, + "step": 2048 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.5386085648386656e-05, + "loss": 0.004, + "step": 2050 + }, + { + "epoch": 0.41072858286629305, + "learning_rate": 1.539785515417376e-05, + "loss": 0.4937, + "step": 2052 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.540961411783279e-05, + "loss": 0.0051, + "step": 2054 + }, + { + "epoch": 0.41152922337870296, + "learning_rate": 1.542136251639826e-05, + "loss": 0.0261, + "step": 2056 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5433100326925288e-05, + "loss": 0.148, + "step": 2058 + }, + { + "epoch": 0.41232986389111287, + "learning_rate": 1.5444827526489668e-05, + "loss": 0.2505, + "step": 2060 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.545654409218793e-05, + "loss": 0.8927, + "step": 2062 + }, + { + "epoch": 0.41313050440352284, + "learning_rate": 1.5468250001137368e-05, + "loss": 0.3761, + "step": 2064 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5479945230476066e-05, + "loss": 0.0109, + "step": 2066 + }, + { + "epoch": 0.41393114491593275, + "learning_rate": 1.5491629757363026e-05, + "loss": 0.2885, + "step": 2068 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.550330355897809e-05, + "loss": 0.0041, + "step": 2070 + }, + { + "epoch": 0.41473178542834266, + "learning_rate": 1.551496661252208e-05, + "loss": 0.0, + "step": 2072 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5526618895216793e-05, + "loss": 0.1729, + "step": 2074 + }, + { + "epoch": 0.4155324259407526, + "learning_rate": 1.5538260384305073e-05, + "loss": 0.2008, + "step": 2076 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5549891057050837e-05, + "loss": 0.0256, + "step": 2078 + }, + { + "epoch": 0.41633306645316254, + "learning_rate": 1.5561510890739113e-05, + "loss": 0.7624, + "step": 2080 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.557311986267615e-05, + "loss": 0.182, + "step": 2082 + }, + { + "epoch": 0.41713370696557245, + "learning_rate": 1.5584717950189353e-05, + "loss": 0.0333, + "step": 2084 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5596305130627404e-05, + "loss": 0.0283, + "step": 2086 + }, + { + "epoch": 0.41793434747798236, + "learning_rate": 1.5607881381360296e-05, + "loss": 0.2847, + "step": 2088 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.5619446679779357e-05, + "loss": 0.0297, + "step": 2090 + }, + { + "epoch": 0.4187349879903923, + "learning_rate": 1.563100100329731e-05, + "loss": 0.0977, + "step": 2092 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.564254432934829e-05, + "loss": 0.0708, + "step": 2094 + }, + { + "epoch": 0.41953562850280224, + "learning_rate": 1.565407663538797e-05, + "loss": 0.0259, + "step": 2096 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.5665597898893484e-05, + "loss": 0.2051, + "step": 2098 + }, + { + "epoch": 0.42033626901521215, + "learning_rate": 1.567710809736356e-05, + "loss": 0.1189, + "step": 2100 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.568860720831853e-05, + "loss": 0.0076, + "step": 2102 + }, + { + "epoch": 0.4211369095276221, + "learning_rate": 1.5700095209300376e-05, + "loss": 0.1071, + "step": 2104 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.5711572077872774e-05, + "loss": 0.3572, + "step": 2106 + }, + { + "epoch": 0.42193755004003203, + "learning_rate": 1.572303779162118e-05, + "loss": 0.0258, + "step": 2108 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.573449232815279e-05, + "loss": 0.1191, + "step": 2110 + }, + { + "epoch": 0.42273819055244194, + "learning_rate": 1.5745935665096647e-05, + "loss": 0.034, + "step": 2112 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5757367780103666e-05, + "loss": 0.0507, + "step": 2114 + }, + { + "epoch": 0.4235388310648519, + "learning_rate": 1.5768788650846677e-05, + "loss": 0.2534, + "step": 2116 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5780198255020478e-05, + "loss": 0.0286, + "step": 2118 + }, + { + "epoch": 0.4243394715772618, + "learning_rate": 1.5791596570341844e-05, + "loss": 0.0087, + "step": 2120 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.580298357454965e-05, + "loss": 0.5248, + "step": 2122 + }, + { + "epoch": 0.42514011208967173, + "learning_rate": 1.581435924540481e-05, + "loss": 0.5258, + "step": 2124 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5825723560690403e-05, + "loss": 0.0979, + "step": 2126 + }, + { + "epoch": 0.42594075260208164, + "learning_rate": 1.5837076498211666e-05, + "loss": 0.3518, + "step": 2128 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5848418035796068e-05, + "loss": 0.7287, + "step": 2130 + }, + { + "epoch": 0.4267413931144916, + "learning_rate": 1.5859748151293333e-05, + "loss": 0.3432, + "step": 2132 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.587106682257552e-05, + "loss": 0.1664, + "step": 2134 + }, + { + "epoch": 0.4275420336269015, + "learning_rate": 1.5882374027537005e-05, + "loss": 0.2253, + "step": 2136 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5893669744094577e-05, + "loss": 0.1761, + "step": 2138 + }, + { + "epoch": 0.42834267413931143, + "learning_rate": 1.5904953950187455e-05, + "loss": 0.132, + "step": 2140 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.591622662377734e-05, + "loss": 0.1193, + "step": 2142 + }, + { + "epoch": 0.4291433146517214, + "learning_rate": 1.5927487742848448e-05, + "loss": 0.0049, + "step": 2144 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.5938737285407567e-05, + "loss": 0.176, + "step": 2146 + }, + { + "epoch": 0.4299439551641313, + "learning_rate": 1.594997522948412e-05, + "loss": 0.4796, + "step": 2148 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5961201553130148e-05, + "loss": 0.052, + "step": 2150 + }, + { + "epoch": 0.4307445956765412, + "learning_rate": 1.5972416234420393e-05, + "loss": 0.1079, + "step": 2152 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.598361925145234e-05, + "loss": 0.0756, + "step": 2154 + }, + { + "epoch": 0.4315452361889512, + "learning_rate": 1.599481058234626e-05, + "loss": 0.4084, + "step": 2156 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.6005990205245216e-05, + "loss": 0.1834, + "step": 2158 + }, + { + "epoch": 0.4323458767013611, + "learning_rate": 1.60171580983152e-05, + "loss": 0.5729, + "step": 2160 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.602831423974506e-05, + "loss": 0.1357, + "step": 2162 + }, + { + "epoch": 0.433146517213771, + "learning_rate": 1.6039458607746607e-05, + "loss": 0.2525, + "step": 2164 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.6050591180554648e-05, + "loss": 0.6019, + "step": 2166 + }, + { + "epoch": 0.4339471577261809, + "learning_rate": 1.606171193642703e-05, + "loss": 0.0098, + "step": 2168 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6072820853644677e-05, + "loss": 0.1903, + "step": 2170 + }, + { + "epoch": 0.4347477982385909, + "learning_rate": 1.6083917910511616e-05, + "loss": 0.235, + "step": 2172 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6095003085355082e-05, + "loss": 0.2394, + "step": 2174 + }, + { + "epoch": 0.4355484387510008, + "learning_rate": 1.6106076356525474e-05, + "loss": 0.2572, + "step": 2176 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.611713770239646e-05, + "loss": 0.1863, + "step": 2178 + }, + { + "epoch": 0.4363490792634107, + "learning_rate": 1.6128187101364982e-05, + "loss": 0.0461, + "step": 2180 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6139224531851332e-05, + "loss": 0.2021, + "step": 2182 + }, + { + "epoch": 0.4371497197758207, + "learning_rate": 1.6150249972299153e-05, + "loss": 0.1118, + "step": 2184 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.616126340117555e-05, + "loss": 0.049, + "step": 2186 + }, + { + "epoch": 0.4379503602882306, + "learning_rate": 1.617226479697104e-05, + "loss": 0.7397, + "step": 2188 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.618325413819966e-05, + "loss": 0.0111, + "step": 2190 + }, + { + "epoch": 0.4387510008006405, + "learning_rate": 1.6194231403398994e-05, + "loss": 0.0157, + "step": 2192 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.6205196571130194e-05, + "loss": 0.0509, + "step": 2194 + }, + { + "epoch": 0.43955164131305047, + "learning_rate": 1.621614961997806e-05, + "loss": 0.0178, + "step": 2196 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6227090528551034e-05, + "loss": 0.0605, + "step": 2198 + }, + { + "epoch": 0.4403522818254604, + "learning_rate": 1.6238019275481313e-05, + "loss": 0.5598, + "step": 2200 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.62489358394248e-05, + "loss": 0.0002, + "step": 2202 + }, + { + "epoch": 0.4411529223378703, + "learning_rate": 1.6259840199061212e-05, + "loss": 0.003, + "step": 2204 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6270732333094095e-05, + "loss": 0.0286, + "step": 2206 + }, + { + "epoch": 0.4419535628502802, + "learning_rate": 1.6281612220250883e-05, + "loss": 0.004, + "step": 2208 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6292479839282897e-05, + "loss": 0.0098, + "step": 2210 + }, + { + "epoch": 0.44275420336269017, + "learning_rate": 1.6303335168965474e-05, + "loss": 0.0464, + "step": 2212 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6314178188097907e-05, + "loss": 0.448, + "step": 2214 + }, + { + "epoch": 0.4435548438751001, + "learning_rate": 1.6325008875503543e-05, + "loss": 0.011, + "step": 2216 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6335827210029816e-05, + "loss": 0.0886, + "step": 2218 + }, + { + "epoch": 0.44435548438751, + "learning_rate": 1.6346633170548285e-05, + "loss": 0.1916, + "step": 2220 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.635742673595467e-05, + "loss": 0.0836, + "step": 2222 + }, + { + "epoch": 0.44515612489991996, + "learning_rate": 1.6368207885168897e-05, + "loss": 0.089, + "step": 2224 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6378976597135173e-05, + "loss": 0.0478, + "step": 2226 + }, + { + "epoch": 0.44595676541232987, + "learning_rate": 1.6389732850821957e-05, + "loss": 0.0232, + "step": 2228 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.640047662522205e-05, + "loss": 0.1084, + "step": 2230 + }, + { + "epoch": 0.4467574059247398, + "learning_rate": 1.641120789935263e-05, + "loss": 0.2504, + "step": 2232 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6421926652255282e-05, + "loss": 0.0304, + "step": 2234 + }, + { + "epoch": 0.4475580464371497, + "learning_rate": 1.6432632862996042e-05, + "loss": 0.0962, + "step": 2236 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6443326510665474e-05, + "loss": 0.253, + "step": 2238 + }, + { + "epoch": 0.44835868694955966, + "learning_rate": 1.6454007574378637e-05, + "loss": 0.1048, + "step": 2240 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.646467603327518e-05, + "loss": 0.0676, + "step": 2242 + }, + { + "epoch": 0.44915932746196957, + "learning_rate": 1.6475331866519377e-05, + "loss": 0.0227, + "step": 2244 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6485975053300154e-05, + "loss": 0.1299, + "step": 2246 + }, + { + "epoch": 0.4499599679743795, + "learning_rate": 1.6496605572831134e-05, + "loss": 0.3429, + "step": 2248 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.650722340435067e-05, + "loss": 0.0139, + "step": 2250 + }, + { + "epoch": 0.45076060848678945, + "learning_rate": 1.6517828527121928e-05, + "loss": 0.0378, + "step": 2252 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.652842092043287e-05, + "loss": 0.8186, + "step": 2254 + }, + { + "epoch": 0.45156124899919936, + "learning_rate": 1.6539000563596318e-05, + "loss": 0.0128, + "step": 2256 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6549567435950004e-05, + "loss": 0.0022, + "step": 2258 + }, + { + "epoch": 0.45236188951160927, + "learning_rate": 1.6560121516856586e-05, + "loss": 0.2295, + "step": 2260 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6570662785703713e-05, + "loss": 0.2286, + "step": 2262 + }, + { + "epoch": 0.45316253002401924, + "learning_rate": 1.6581191221904077e-05, + "loss": 0.1271, + "step": 2264 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6591706804895408e-05, + "loss": 0.1823, + "step": 2266 + }, + { + "epoch": 0.45396317053642915, + "learning_rate": 1.6602209514140542e-05, + "loss": 0.0361, + "step": 2268 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6612699329127457e-05, + "loss": 0.2894, + "step": 2270 + }, + { + "epoch": 0.45476381104883906, + "learning_rate": 1.6623176229369324e-05, + "loss": 0.1947, + "step": 2272 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6633640194404523e-05, + "loss": 0.0506, + "step": 2274 + }, + { + "epoch": 0.455564451561249, + "learning_rate": 1.6644091203796694e-05, + "loss": 0.0739, + "step": 2276 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6654529237134816e-05, + "loss": 0.0, + "step": 2278 + }, + { + "epoch": 0.45636509207365894, + "learning_rate": 1.6664954274033168e-05, + "loss": 0.049, + "step": 2280 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.667536629413143e-05, + "loss": 0.246, + "step": 2282 + }, + { + "epoch": 0.45716573258606885, + "learning_rate": 1.6685765277094695e-05, + "loss": 0.1455, + "step": 2284 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6696151202613527e-05, + "loss": 1.0972, + "step": 2286 + }, + { + "epoch": 0.45796637309847876, + "learning_rate": 1.6706524050403996e-05, + "loss": 0.0321, + "step": 2288 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6716883800207685e-05, + "loss": 0.0066, + "step": 2290 + }, + { + "epoch": 0.45876701361088873, + "learning_rate": 1.6727230431791806e-05, + "loss": 0.0724, + "step": 2292 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.673756392494915e-05, + "loss": 0.0425, + "step": 2294 + }, + { + "epoch": 0.45956765412329864, + "learning_rate": 1.674788425949818e-05, + "loss": 0.006, + "step": 2296 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6758191415283063e-05, + "loss": 0.0304, + "step": 2298 + }, + { + "epoch": 0.46036829463570855, + "learning_rate": 1.6768485372173696e-05, + "loss": 0.1192, + "step": 2300 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6778766110065755e-05, + "loss": 0.0234, + "step": 2302 + }, + { + "epoch": 0.4611689351481185, + "learning_rate": 1.6789033608880735e-05, + "loss": 0.0323, + "step": 2304 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6799287848566e-05, + "loss": 0.0215, + "step": 2306 + }, + { + "epoch": 0.46196957566052843, + "learning_rate": 1.6809528809094798e-05, + "loss": 0.0229, + "step": 2308 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6819756470466305e-05, + "loss": 0.1947, + "step": 2310 + }, + { + "epoch": 0.46277021617293834, + "learning_rate": 1.6829970812705674e-05, + "loss": 0.464, + "step": 2312 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.684017181586408e-05, + "loss": 1.1038, + "step": 2314 + }, + { + "epoch": 0.46357085668534825, + "learning_rate": 1.6850359460018733e-05, + "loss": 0.0335, + "step": 2316 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6860533725272943e-05, + "loss": 0.4481, + "step": 2318 + }, + { + "epoch": 0.4643714971977582, + "learning_rate": 1.6870694591756165e-05, + "loss": 0.428, + "step": 2320 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.6880842039624e-05, + "loss": 0.1203, + "step": 2322 + }, + { + "epoch": 0.46517213771016813, + "learning_rate": 1.689097604905826e-05, + "loss": 0.3194, + "step": 2324 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6901096600267e-05, + "loss": 0.0117, + "step": 2326 + }, + { + "epoch": 0.46597277822257804, + "learning_rate": 1.6911203673484577e-05, + "loss": 0.3157, + "step": 2328 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6921297248971645e-05, + "loss": 0.0886, + "step": 2330 + }, + { + "epoch": 0.466773418734988, + "learning_rate": 1.6931377307015226e-05, + "loss": 0.1314, + "step": 2332 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.6941443827928778e-05, + "loss": 0.0766, + "step": 2334 + }, + { + "epoch": 0.4675740592473979, + "learning_rate": 1.695149679205214e-05, + "loss": 0.129, + "step": 2336 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6961536179751672e-05, + "loss": 0.1868, + "step": 2338 + }, + { + "epoch": 0.46837469975980783, + "learning_rate": 1.6971561971420222e-05, + "loss": 0.1483, + "step": 2340 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6981574147477204e-05, + "loss": 0.0004, + "step": 2342 + }, + { + "epoch": 0.4691753402722178, + "learning_rate": 1.6991572688368628e-05, + "loss": 0.2286, + "step": 2344 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.70015575745671e-05, + "loss": 0.3779, + "step": 2346 + }, + { + "epoch": 0.4699759807846277, + "learning_rate": 1.701152878657196e-05, + "loss": 0.0078, + "step": 2348 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7021486304909196e-05, + "loss": 0.1481, + "step": 2350 + }, + { + "epoch": 0.4707766212970376, + "learning_rate": 1.7031430110131562e-05, + "loss": 0.0097, + "step": 2352 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.7041360182818583e-05, + "loss": 0.0569, + "step": 2354 + }, + { + "epoch": 0.47157726180944753, + "learning_rate": 1.705127650357662e-05, + "loss": 0.0176, + "step": 2356 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7061179053038887e-05, + "loss": 0.0303, + "step": 2358 + }, + { + "epoch": 0.4723779023218575, + "learning_rate": 1.7071067811865467e-05, + "loss": 0.1502, + "step": 2360 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.708094276074343e-05, + "loss": 0.0042, + "step": 2362 + }, + { + "epoch": 0.4731785428342674, + "learning_rate": 1.7090803880386778e-05, + "loss": 0.88, + "step": 2364 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7100651151536525e-05, + "loss": 1.2154, + "step": 2366 + }, + { + "epoch": 0.4739791833466773, + "learning_rate": 1.7110484554960738e-05, + "loss": 0.014, + "step": 2368 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.712030407145457e-05, + "loss": 0.2946, + "step": 2370 + }, + { + "epoch": 0.4747798238590873, + "learning_rate": 1.713010968184029e-05, + "loss": 0.0077, + "step": 2372 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7139901366967332e-05, + "loss": 0.1712, + "step": 2374 + }, + { + "epoch": 0.4755804643714972, + "learning_rate": 1.7149679107712306e-05, + "loss": 0.2641, + "step": 2376 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.71594428849791e-05, + "loss": 0.1521, + "step": 2378 + }, + { + "epoch": 0.4763811048839071, + "learning_rate": 1.716919267969883e-05, + "loss": 0.029, + "step": 2380 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.717892847282994e-05, + "loss": 0.0722, + "step": 2382 + }, + { + "epoch": 0.4771817453963171, + "learning_rate": 1.7188650245358215e-05, + "loss": 0.0362, + "step": 2384 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7198357978296817e-05, + "loss": 0.0789, + "step": 2386 + }, + { + "epoch": 0.477982385908727, + "learning_rate": 1.7208051652686338e-05, + "loss": 0.0413, + "step": 2388 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.721773124959481e-05, + "loss": 0.0323, + "step": 2390 + }, + { + "epoch": 0.4787830264211369, + "learning_rate": 1.722739675011779e-05, + "loss": 0.0175, + "step": 2392 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.723704813537834e-05, + "loss": 0.2796, + "step": 2394 + }, + { + "epoch": 0.4795836669335468, + "learning_rate": 1.7246685386527095e-05, + "loss": 0.0049, + "step": 2396 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.725630848474229e-05, + "loss": 0.0026, + "step": 2398 + }, + { + "epoch": 0.4803843074459568, + "learning_rate": 1.726591741122981e-05, + "loss": 0.0217, + "step": 2400 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.727551214722321e-05, + "loss": 0.258, + "step": 2402 + }, + { + "epoch": 0.4811849479583667, + "learning_rate": 1.7285092673983753e-05, + "loss": 0.0142, + "step": 2404 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.7294658972800488e-05, + "loss": 0.7132, + "step": 2406 + }, + { + "epoch": 0.4819855884707766, + "learning_rate": 1.730421102499021e-05, + "loss": 0.7087, + "step": 2408 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7313748811897558e-05, + "loss": 0.4889, + "step": 2410 + }, + { + "epoch": 0.48278622898318657, + "learning_rate": 1.7323272314895022e-05, + "loss": 0.2012, + "step": 2412 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.7332781515383003e-05, + "loss": 0.0005, + "step": 2414 + }, + { + "epoch": 0.4835868694955965, + "learning_rate": 1.734227639478982e-05, + "loss": 0.0288, + "step": 2416 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.7351756934571758e-05, + "loss": 0.0976, + "step": 2418 + }, + { + "epoch": 0.4843875100080064, + "learning_rate": 1.736122311621314e-05, + "loss": 0.0084, + "step": 2420 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.7370674921226296e-05, + "loss": 0.0019, + "step": 2422 + }, + { + "epoch": 0.4851881505204163, + "learning_rate": 1.738011233115165e-05, + "loss": 0.0002, + "step": 2424 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7389535327557733e-05, + "loss": 0.4562, + "step": 2426 + }, + { + "epoch": 0.4859887910328263, + "learning_rate": 1.7398943892041227e-05, + "loss": 0.0517, + "step": 2428 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7408338006227005e-05, + "loss": 0.0796, + "step": 2430 + }, + { + "epoch": 0.4867894315452362, + "learning_rate": 1.7417717651768144e-05, + "loss": 0.204, + "step": 2432 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.7427082810346018e-05, + "loss": 0.0479, + "step": 2434 + }, + { + "epoch": 0.4875900720576461, + "learning_rate": 1.743643346367026e-05, + "loss": 0.0638, + "step": 2436 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.744576959347884e-05, + "loss": 0.1085, + "step": 2438 + }, + { + "epoch": 0.48839071257005606, + "learning_rate": 1.7455091181538087e-05, + "loss": 0.3072, + "step": 2440 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.746439820964275e-05, + "loss": 0.0014, + "step": 2442 + }, + { + "epoch": 0.489191353082466, + "learning_rate": 1.7473690659615992e-05, + "loss": 0.2766, + "step": 2444 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.748296851330945e-05, + "loss": 0.0509, + "step": 2446 + }, + { + "epoch": 0.4899919935948759, + "learning_rate": 1.74922317526033e-05, + "loss": 0.0184, + "step": 2448 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7501480359406217e-05, + "loss": 0.3353, + "step": 2450 + }, + { + "epoch": 0.49079263410728585, + "learning_rate": 1.7510714315655474e-05, + "loss": 0.0003, + "step": 2452 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.7519933603316955e-05, + "loss": 0.278, + "step": 2454 + }, + { + "epoch": 0.49159327461969576, + "learning_rate": 1.752913820438519e-05, + "loss": 0.0484, + "step": 2456 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7538328100883397e-05, + "loss": 0.0643, + "step": 2458 + }, + { + "epoch": 0.4923939151321057, + "learning_rate": 1.7547503274863495e-05, + "loss": 0.0165, + "step": 2460 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.7556663708406193e-05, + "loss": 0.0877, + "step": 2462 + }, + { + "epoch": 0.4931945556445156, + "learning_rate": 1.756580938362096e-05, + "loss": 0.0073, + "step": 2464 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.7574940282646085e-05, + "loss": 0.0001, + "step": 2466 + }, + { + "epoch": 0.49399519615692555, + "learning_rate": 1.758405638764873e-05, + "loss": 0.1528, + "step": 2468 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7593157680824946e-05, + "loss": 0.0, + "step": 2470 + }, + { + "epoch": 0.49479583666933546, + "learning_rate": 1.7602244144399693e-05, + "loss": 0.0068, + "step": 2472 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.761131576062694e-05, + "loss": 0.005, + "step": 2474 + }, + { + "epoch": 0.4955964771817454, + "learning_rate": 1.7620372511789604e-05, + "loss": 0.0729, + "step": 2476 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7629414380199662e-05, + "loss": 0.0883, + "step": 2478 + }, + { + "epoch": 0.49639711769415534, + "learning_rate": 1.7638441348198147e-05, + "loss": 0.0541, + "step": 2480 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7647453398155194e-05, + "loss": 0.242, + "step": 2482 + }, + { + "epoch": 0.49719775820656525, + "learning_rate": 1.7656450512470077e-05, + "loss": 0.2072, + "step": 2484 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7665432673571218e-05, + "loss": 0.1443, + "step": 2486 + }, + { + "epoch": 0.49799839871897517, + "learning_rate": 1.7674399863916295e-05, + "loss": 0.0056, + "step": 2488 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.768335206599217e-05, + "loss": 0.0725, + "step": 2490 + }, + { + "epoch": 0.49879903923138513, + "learning_rate": 1.7692289262315e-05, + "loss": 0.0004, + "step": 2492 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.7701211435430256e-05, + "loss": 0.0345, + "step": 2494 + }, + { + "epoch": 0.49959967974379504, + "learning_rate": 1.771011856791273e-05, + "loss": 0.5482, + "step": 2496 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.771901064236659e-05, + "loss": 0.0322, + "step": 2498 + }, + { + "epoch": 0.500400320256205, + "learning_rate": 1.7727887641425448e-05, + "loss": 0.0125, + "step": 2500 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.773674954775232e-05, + "loss": 3.5382, + "step": 2502 + }, + { + "epoch": 0.5012009607686149, + "learning_rate": 1.7745596344039712e-05, + "loss": 0.1737, + "step": 2504 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7754428013009637e-05, + "loss": 0.4712, + "step": 2506 + }, + { + "epoch": 0.5020016012810248, + "learning_rate": 1.7763244537413657e-05, + "loss": 0.0195, + "step": 2508 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.77720459000329e-05, + "loss": 0.0298, + "step": 2510 + }, + { + "epoch": 0.5028022417934348, + "learning_rate": 1.7780832083678116e-05, + "loss": 1.1242, + "step": 2512 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7789603071189712e-05, + "loss": 0.2881, + "step": 2514 + }, + { + "epoch": 0.5036028823058447, + "learning_rate": 1.7798358845437754e-05, + "loss": 0.1423, + "step": 2516 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.780709938932202e-05, + "loss": 0.059, + "step": 2518 + }, + { + "epoch": 0.5044035228182546, + "learning_rate": 1.7815824685772035e-05, + "loss": 0.1672, + "step": 2520 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7824534717747115e-05, + "loss": 0.1975, + "step": 2522 + }, + { + "epoch": 0.5052041633306645, + "learning_rate": 1.7833229468236364e-05, + "loss": 0.207, + "step": 2524 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7841908920258767e-05, + "loss": 0.0132, + "step": 2526 + }, + { + "epoch": 0.5060048038430744, + "learning_rate": 1.7850573056863156e-05, + "loss": 0.0263, + "step": 2528 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.7859221861128284e-05, + "loss": 0.3832, + "step": 2530 + }, + { + "epoch": 0.5068054443554844, + "learning_rate": 1.786785531616285e-05, + "loss": 0.2274, + "step": 2532 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7876473405105528e-05, + "loss": 0.0681, + "step": 2534 + }, + { + "epoch": 0.5076060848678943, + "learning_rate": 1.7885076111125004e-05, + "loss": 0.1454, + "step": 2536 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.7893663417419995e-05, + "loss": 0.1324, + "step": 2538 + }, + { + "epoch": 0.5084067253803043, + "learning_rate": 1.790223530721933e-05, + "loss": 0.0617, + "step": 2540 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791079176378191e-05, + "loss": 0.1033, + "step": 2542 + }, + { + "epoch": 0.5092073658927142, + "learning_rate": 1.791933277039679e-05, + "loss": 0.6168, + "step": 2544 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7927858310383202e-05, + "loss": 0.1173, + "step": 2546 + }, + { + "epoch": 0.5100080064051241, + "learning_rate": 1.7936368367090577e-05, + "loss": 0.2637, + "step": 2548 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.794486292389858e-05, + "loss": 0.1787, + "step": 2550 + }, + { + "epoch": 0.510808646917534, + "learning_rate": 1.7953341964217183e-05, + "loss": 0.198, + "step": 2552 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7961805471486618e-05, + "loss": 0.2027, + "step": 2554 + }, + { + "epoch": 0.5116092874299439, + "learning_rate": 1.7970253429177477e-05, + "loss": 0.3226, + "step": 2556 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.797868582079072e-05, + "loss": 0.0487, + "step": 2558 + }, + { + "epoch": 0.5124099279423538, + "learning_rate": 1.7987102629857696e-05, + "loss": 0.624, + "step": 2560 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.7995503839940197e-05, + "loss": 0.0091, + "step": 2562 + }, + { + "epoch": 0.5132105684547638, + "learning_rate": 1.800388943463047e-05, + "loss": 0.0489, + "step": 2564 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8012259397551283e-05, + "loss": 0.2649, + "step": 2566 + }, + { + "epoch": 0.5140112089671738, + "learning_rate": 1.8020613712355912e-05, + "loss": 0.0891, + "step": 2568 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.8028952362728197e-05, + "loss": 0.1569, + "step": 2570 + }, + { + "epoch": 0.5148118494795837, + "learning_rate": 1.803727533238257e-05, + "loss": 0.3262, + "step": 2572 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.804558260506409e-05, + "loss": 0.123, + "step": 2574 + }, + { + "epoch": 0.5156124899919936, + "learning_rate": 1.805387416454847e-05, + "loss": 0.1534, + "step": 2576 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8062149994642135e-05, + "loss": 0.2302, + "step": 2578 + }, + { + "epoch": 0.5164131305044035, + "learning_rate": 1.8070410079182195e-05, + "loss": 0.2713, + "step": 2580 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8078654402036526e-05, + "loss": 0.1904, + "step": 2582 + }, + { + "epoch": 0.5172137710168134, + "learning_rate": 1.8086882947103787e-05, + "loss": 0.1712, + "step": 2584 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8095095698313452e-05, + "loss": 0.2231, + "step": 2586 + }, + { + "epoch": 0.5180144115292233, + "learning_rate": 1.8103292639625842e-05, + "loss": 0.3059, + "step": 2588 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811147375503214e-05, + "loss": 0.1131, + "step": 2590 + }, + { + "epoch": 0.5188150520416333, + "learning_rate": 1.811963902855447e-05, + "loss": 0.1092, + "step": 2592 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.812778844424587e-05, + "loss": 0.7426, + "step": 2594 + }, + { + "epoch": 0.5196156925540433, + "learning_rate": 1.813592198619035e-05, + "loss": 0.0229, + "step": 2596 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.814403963850293e-05, + "loss": 0.1786, + "step": 2598 + }, + { + "epoch": 0.5204163330664532, + "learning_rate": 1.8152141385329658e-05, + "loss": 0.2612, + "step": 2600 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.8160227210847636e-05, + "loss": 0.0106, + "step": 2602 + }, + { + "epoch": 0.5212169735788631, + "learning_rate": 1.816829709926509e-05, + "loss": 0.1229, + "step": 2604 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8176351034821345e-05, + "loss": 0.0587, + "step": 2606 + }, + { + "epoch": 0.522017614091273, + "learning_rate": 1.8184389001786895e-05, + "loss": 0.007, + "step": 2608 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.819241098446341e-05, + "loss": 0.0817, + "step": 2610 + }, + { + "epoch": 0.5228182546036829, + "learning_rate": 1.8200416967183785e-05, + "loss": 0.0064, + "step": 2612 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.8208406934312167e-05, + "loss": 0.4501, + "step": 2614 + }, + { + "epoch": 0.5236188951160928, + "learning_rate": 1.821638087024396e-05, + "loss": 0.1896, + "step": 2616 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8224338759405917e-05, + "loss": 0.0846, + "step": 2618 + }, + { + "epoch": 0.5244195356285029, + "learning_rate": 1.8232280586256097e-05, + "loss": 0.0084, + "step": 2620 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8240206335283947e-05, + "loss": 0.1603, + "step": 2622 + }, + { + "epoch": 0.5252201761409128, + "learning_rate": 1.8248115991010296e-05, + "loss": 0.2705, + "step": 2624 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.825600953798743e-05, + "loss": 0.0045, + "step": 2626 + }, + { + "epoch": 0.5260208166533227, + "learning_rate": 1.8263886960799055e-05, + "loss": 0.3326, + "step": 2628 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8271748244060426e-05, + "loss": 0.03, + "step": 2630 + }, + { + "epoch": 0.5268214571657326, + "learning_rate": 1.8279593372418264e-05, + "loss": 0.0135, + "step": 2632 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.8287422330550878e-05, + "loss": 0.0131, + "step": 2634 + }, + { + "epoch": 0.5276220976781425, + "learning_rate": 1.829523510316813e-05, + "loss": 0.0177, + "step": 2636 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8303031675011515e-05, + "loss": 0.6744, + "step": 2638 + }, + { + "epoch": 0.5284227381905524, + "learning_rate": 1.8310812030854155e-05, + "loss": 0.0063, + "step": 2640 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.8318576155500838e-05, + "loss": 0.552, + "step": 2642 + }, + { + "epoch": 0.5292233787029623, + "learning_rate": 1.832632403378808e-05, + "loss": 0.0404, + "step": 2644 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.8334055650584094e-05, + "loss": 0.0079, + "step": 2646 + }, + { + "epoch": 0.5300240192153723, + "learning_rate": 1.834177099078887e-05, + "loss": 0.2652, + "step": 2648 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8349470039334173e-05, + "loss": 0.0688, + "step": 2650 + }, + { + "epoch": 0.5308246597277823, + "learning_rate": 1.8357152781183606e-05, + "loss": 0.1784, + "step": 2652 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.83648192013326e-05, + "loss": 0.093, + "step": 2654 + }, + { + "epoch": 0.5316253002401922, + "learning_rate": 1.8372469284808465e-05, + "loss": 0.3144, + "step": 2656 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8380103016670437e-05, + "loss": 0.1216, + "step": 2658 + }, + { + "epoch": 0.5324259407526021, + "learning_rate": 1.8387720382009665e-05, + "loss": 0.1079, + "step": 2660 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.839532136594927e-05, + "loss": 0.5767, + "step": 2662 + }, + { + "epoch": 0.533226581265012, + "learning_rate": 1.840290595364436e-05, + "loss": 0.1849, + "step": 2664 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8410474130282085e-05, + "loss": 0.3459, + "step": 2666 + }, + { + "epoch": 0.5340272217774219, + "learning_rate": 1.8418025881081606e-05, + "loss": 0.0479, + "step": 2668 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8425561191294217e-05, + "loss": 0.1349, + "step": 2670 + }, + { + "epoch": 0.5348278622898318, + "learning_rate": 1.8433080046203286e-05, + "loss": 0.1899, + "step": 2672 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.8440582431124325e-05, + "loss": 0.014, + "step": 2674 + }, + { + "epoch": 0.5356285028022418, + "learning_rate": 1.844806833140501e-05, + "loss": 0.0026, + "step": 2676 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8455537732425223e-05, + "loss": 0.3479, + "step": 2678 + }, + { + "epoch": 0.5364291433146517, + "learning_rate": 1.8462990619597054e-05, + "loss": 0.7356, + "step": 2680 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.847042697836485e-05, + "loss": 0.7116, + "step": 2682 + }, + { + "epoch": 0.5372297838270617, + "learning_rate": 1.8477846794205258e-05, + "loss": 0.315, + "step": 2684 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.84852500526272e-05, + "loss": 0.0073, + "step": 2686 + }, + { + "epoch": 0.5380304243394716, + "learning_rate": 1.8492636739171966e-05, + "loss": 0.2609, + "step": 2688 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.8500006839413183e-05, + "loss": 0.0574, + "step": 2690 + }, + { + "epoch": 0.5388310648518815, + "learning_rate": 1.85073603389569e-05, + "loss": 0.0091, + "step": 2692 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.851469722344155e-05, + "loss": 0.4291, + "step": 2694 + }, + { + "epoch": 0.5396317053642914, + "learning_rate": 1.8522017478538067e-05, + "loss": 0.3311, + "step": 2696 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8529321089949817e-05, + "loss": 0.2065, + "step": 2698 + }, + { + "epoch": 0.5404323458767014, + "learning_rate": 1.8536608043412695e-05, + "loss": 0.084, + "step": 2700 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.8543878324695122e-05, + "loss": 0.4038, + "step": 2702 + }, + { + "epoch": 0.5412329863891113, + "learning_rate": 1.855113191959808e-05, + "loss": 0.4007, + "step": 2704 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.8558368813955143e-05, + "loss": 0.2224, + "step": 2706 + }, + { + "epoch": 0.5420336269015212, + "learning_rate": 1.856558899363248e-05, + "loss": 0.0127, + "step": 2708 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.857279244452896e-05, + "loss": 0.5832, + "step": 2710 + }, + { + "epoch": 0.5428342674139311, + "learning_rate": 1.8579979152576063e-05, + "loss": 0.0357, + "step": 2712 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.8587149103738e-05, + "loss": 0.0999, + "step": 2714 + }, + { + "epoch": 0.5436349079263411, + "learning_rate": 1.85943022840117e-05, + "loss": 0.3761, + "step": 2716 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8601438679426847e-05, + "loss": 0.5107, + "step": 2718 + }, + { + "epoch": 0.544435548438751, + "learning_rate": 1.8608558276045895e-05, + "loss": 0.4593, + "step": 2720 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.8615661059964134e-05, + "loss": 0.0161, + "step": 2722 + }, + { + "epoch": 0.5452361889511609, + "learning_rate": 1.862274701730967e-05, + "loss": 0.0511, + "step": 2724 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.862981613424347e-05, + "loss": 0.1043, + "step": 2726 + }, + { + "epoch": 0.5460368294635709, + "learning_rate": 1.86368683969594e-05, + "loss": 0.0609, + "step": 2728 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.864390379168423e-05, + "loss": 0.0814, + "step": 2730 + }, + { + "epoch": 0.5468374699759808, + "learning_rate": 1.865092230467769e-05, + "loss": 0.1448, + "step": 2732 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.8657923922232464e-05, + "loss": 0.0172, + "step": 2734 + }, + { + "epoch": 0.5476381104883907, + "learning_rate": 1.866490863067425e-05, + "loss": 0.7035, + "step": 2736 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8671876416361763e-05, + "loss": 0.0186, + "step": 2738 + }, + { + "epoch": 0.5484387510008006, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.3092, + "step": 2740 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8685761165074073e-05, + "loss": 0.4573, + "step": 2742 + }, + { + "epoch": 0.5492393915132106, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.0467, + "step": 2744 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.869957805990059e-05, + "loss": 0.0467, + "step": 2746 + }, + { + "epoch": 0.5500400320256205, + "learning_rate": 1.87064610283551e-05, + "loss": 0.0171, + "step": 2748 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.87133269929026e-05, + "loss": 0.51, + "step": 2750 + }, + { + "epoch": 0.5508406725380304, + "learning_rate": 1.8720175940133705e-05, + "loss": 0.2379, + "step": 2752 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.8727007856672285e-05, + "loss": 0.2766, + "step": 2754 + }, + { + "epoch": 0.5516413130504404, + "learning_rate": 1.873382272917545e-05, + "loss": 0.1793, + "step": 2756 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8740620544333607e-05, + "loss": 0.4878, + "step": 2758 + }, + { + "epoch": 0.5524419535628503, + "learning_rate": 1.8747401288870472e-05, + "loss": 0.2689, + "step": 2760 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.875416494954312e-05, + "loss": 0.0306, + "step": 2762 + }, + { + "epoch": 0.5532425940752602, + "learning_rate": 1.876091151314196e-05, + "loss": 0.0694, + "step": 2764 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.8767640966490813e-05, + "loss": 0.004, + "step": 2766 + }, + { + "epoch": 0.5540432345876701, + "learning_rate": 1.877435329644691e-05, + "loss": 0.0842, + "step": 2768 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.878104848990093e-05, + "loss": 0.1603, + "step": 2770 + }, + { + "epoch": 0.55484387510008, + "learning_rate": 1.8787726533776996e-05, + "loss": 0.1332, + "step": 2772 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.879438741503277e-05, + "loss": 0.1473, + "step": 2774 + }, + { + "epoch": 0.55564451561249, + "learning_rate": 1.8801031120659393e-05, + "loss": 0.0389, + "step": 2776 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8807657637681563e-05, + "loss": 0.3386, + "step": 2778 + }, + { + "epoch": 0.5564451561248999, + "learning_rate": 1.8814266953157557e-05, + "loss": 0.0541, + "step": 2780 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8820859054179225e-05, + "loss": 0.0404, + "step": 2782 + }, + { + "epoch": 0.5572457966373099, + "learning_rate": 1.8827433927872066e-05, + "loss": 0.1567, + "step": 2784 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.883399156139519e-05, + "loss": 0.0301, + "step": 2786 + }, + { + "epoch": 0.5580464371497198, + "learning_rate": 1.8840531941941415e-05, + "loss": 0.0221, + "step": 2788 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8847055056737233e-05, + "loss": 0.4619, + "step": 2790 + }, + { + "epoch": 0.5588470776621297, + "learning_rate": 1.8853560893042854e-05, + "loss": 0.0482, + "step": 2792 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8860049438152244e-05, + "loss": 0.2075, + "step": 2794 + }, + { + "epoch": 0.5596477181745396, + "learning_rate": 1.8866520679393127e-05, + "loss": 0.0255, + "step": 2796 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8872974604127025e-05, + "loss": 0.035, + "step": 2798 + }, + { + "epoch": 0.5604483586869495, + "learning_rate": 1.8879411199749303e-05, + "loss": 0.0239, + "step": 2800 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.8885830453689132e-05, + "loss": 0.7233, + "step": 2802 + }, + { + "epoch": 0.5612489991993594, + "learning_rate": 1.889223235340958e-05, + "loss": 0.1568, + "step": 2804 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.889861688640759e-05, + "loss": 0.0575, + "step": 2806 + }, + { + "epoch": 0.5620496397117695, + "learning_rate": 1.8904984040214037e-05, + "loss": 0.4793, + "step": 2808 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.891133380239373e-05, + "loss": 0.1917, + "step": 2810 + }, + { + "epoch": 0.5628502802241794, + "learning_rate": 1.8917666160545436e-05, + "loss": 0.0919, + "step": 2812 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.892398110230194e-05, + "loss": 0.1424, + "step": 2814 + }, + { + "epoch": 0.5636509207365893, + "learning_rate": 1.893027861533002e-05, + "loss": 0.241, + "step": 2816 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.8936558687330485e-05, + "loss": 0.0732, + "step": 2818 + }, + { + "epoch": 0.5644515612489992, + "learning_rate": 1.894282130603823e-05, + "loss": 0.4048, + "step": 2820 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8949066459222217e-05, + "loss": 0.3528, + "step": 2822 + }, + { + "epoch": 0.5652522017614091, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.0315, + "step": 2824 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.8961504320265382e-05, + "loss": 0.1131, + "step": 2826 + }, + { + "epoch": 0.566052842273819, + "learning_rate": 1.896769700383315e-05, + "loss": 0.0508, + "step": 2828 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.897387217329439e-05, + "loss": 0.2211, + "step": 2830 + }, + { + "epoch": 0.5668534827862289, + "learning_rate": 1.898002981658886e-05, + "loss": 0.0379, + "step": 2832 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.8986169921690543e-05, + "loss": 0.6515, + "step": 2834 + }, + { + "epoch": 0.567654123298639, + "learning_rate": 1.899229247660769e-05, + "loss": 0.0429, + "step": 2836 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.899839746938281e-05, + "loss": 0.0412, + "step": 2838 + }, + { + "epoch": 0.5684547638110489, + "learning_rate": 1.9004484888092724e-05, + "loss": 0.015, + "step": 2840 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.9010554720848577e-05, + "loss": 0.1533, + "step": 2842 + }, + { + "epoch": 0.5692554043234588, + "learning_rate": 1.901660695579585e-05, + "loss": 0.0107, + "step": 2844 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9022641581114392e-05, + "loss": 0.2489, + "step": 2846 + }, + { + "epoch": 0.5700560448358687, + "learning_rate": 1.9028658585018455e-05, + "loss": 0.099, + "step": 2848 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9034657955756695e-05, + "loss": 0.1624, + "step": 2850 + }, + { + "epoch": 0.5708566853482786, + "learning_rate": 1.9040639681612212e-05, + "loss": 0.0057, + "step": 2852 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.904660375090257e-05, + "loss": 0.0101, + "step": 2854 + }, + { + "epoch": 0.5716573258606885, + "learning_rate": 1.9052550151979816e-05, + "loss": 0.2317, + "step": 2856 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.905847887323049e-05, + "loss": 0.0892, + "step": 2858 + }, + { + "epoch": 0.5724579663730984, + "learning_rate": 1.9064389903075676e-05, + "loss": 0.01, + "step": 2860 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9070283229971007e-05, + "loss": 0.0221, + "step": 2862 + }, + { + "epoch": 0.5732586068855084, + "learning_rate": 1.9076158842406674e-05, + "loss": 0.0797, + "step": 2864 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9082016728907496e-05, + "loss": 0.0338, + "step": 2866 + }, + { + "epoch": 0.5740592473979184, + "learning_rate": 1.9087856878032886e-05, + "loss": 0.0026, + "step": 2868 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909367927837691e-05, + "loss": 0.0558, + "step": 2870 + }, + { + "epoch": 0.5748598879103283, + "learning_rate": 1.909948391856829e-05, + "loss": 0.2079, + "step": 2872 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.910527078727044e-05, + "loss": 0.044, + "step": 2874 + }, + { + "epoch": 0.5756605284227382, + "learning_rate": 1.911103987318148e-05, + "loss": 1.2491, + "step": 2876 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.911679116503425e-05, + "loss": 0.0289, + "step": 2878 + }, + { + "epoch": 0.5764611689351481, + "learning_rate": 1.912252465159637e-05, + "loss": 0.0293, + "step": 2880 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.9128240321670208e-05, + "loss": 0.0465, + "step": 2882 + }, + { + "epoch": 0.577261809447558, + "learning_rate": 1.913393816409294e-05, + "loss": 0.1438, + "step": 2884 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.913961816773655e-05, + "loss": 0.0522, + "step": 2886 + }, + { + "epoch": 0.578062449959968, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.2323, + "step": 2888 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9150924614348588e-05, + "loss": 0.0051, + "step": 2890 + }, + { + "epoch": 0.5788630904723779, + "learning_rate": 1.9156551035235288e-05, + "loss": 0.0455, + "step": 2892 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.916215957317944e-05, + "loss": 0.2222, + "step": 2894 + }, + { + "epoch": 0.5796637309847879, + "learning_rate": 1.9167750217227454e-05, + "loss": 0.0577, + "step": 2896 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9173322956460675e-05, + "loss": 0.1016, + "step": 2898 + }, + { + "epoch": 0.5804643714971978, + "learning_rate": 1.9178877779995423e-05, + "loss": 0.4562, + "step": 2900 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9184414676983006e-05, + "loss": 0.1855, + "step": 2902 + }, + { + "epoch": 0.5812650120096077, + "learning_rate": 1.9189933636609747e-05, + "loss": 0.0035, + "step": 2904 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9195434648097003e-05, + "loss": 0.0558, + "step": 2906 + }, + { + "epoch": 0.5820656525220176, + "learning_rate": 1.9200917700701173e-05, + "loss": 0.0563, + "step": 2908 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9206382783713738e-05, + "loss": 0.0265, + "step": 2910 + }, + { + "epoch": 0.5828662930344275, + "learning_rate": 1.9211829886461274e-05, + "loss": 0.0418, + "step": 2912 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.921725899830547e-05, + "loss": 0.0713, + "step": 2914 + }, + { + "epoch": 0.5836669335468375, + "learning_rate": 1.9222670108643146e-05, + "loss": 0.37, + "step": 2916 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.92280632069063e-05, + "loss": 0.1193, + "step": 2918 + }, + { + "epoch": 0.5844675740592474, + "learning_rate": 1.9233438282562085e-05, + "loss": 0.0021, + "step": 2920 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.3495, + "step": 2922 + }, + { + "epoch": 0.5852682145716573, + "learning_rate": 1.924413432409622e-05, + "loss": 0.1569, + "step": 2924 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.924945526908497e-05, + "loss": 0.0052, + "step": 2926 + }, + { + "epoch": 0.5860688550840673, + "learning_rate": 1.925475814968719e-05, + "loss": 0.0156, + "step": 2928 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.9260042955546237e-05, + "loss": 0.2698, + "step": 2930 + }, + { + "epoch": 0.5868694955964772, + "learning_rate": 1.926530967634078e-05, + "loss": 0.5332, + "step": 2932 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9270558301784795e-05, + "loss": 0.2495, + "step": 2934 + }, + { + "epoch": 0.5876701361088871, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.1481, + "step": 2936 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9281001225653887e-05, + "loss": 0.0716, + "step": 2938 + }, + { + "epoch": 0.588470776621297, + "learning_rate": 1.9286195503683705e-05, + "loss": 0.1333, + "step": 2940 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9291371645572517e-05, + "loss": 0.1215, + "step": 2942 + }, + { + "epoch": 0.589271417133707, + "learning_rate": 1.9296529641211215e-05, + "loss": 0.0312, + "step": 2944 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9301669480526115e-05, + "loss": 0.0639, + "step": 2946 + }, + { + "epoch": 0.5900720576461169, + "learning_rate": 1.9306791153479004e-05, + "loss": 0.0107, + "step": 2948 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.931189465006714e-05, + "loss": 0.2692, + "step": 2950 + }, + { + "epoch": 0.5908726981585268, + "learning_rate": 1.9316979960323286e-05, + "loss": 0.0211, + "step": 2952 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.9322047074315717e-05, + "loss": 0.4411, + "step": 2954 + }, + { + "epoch": 0.5916733386709367, + "learning_rate": 1.932709598214825e-05, + "loss": 0.0166, + "step": 2956 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9332126673960262e-05, + "loss": 0.0134, + "step": 2958 + }, + { + "epoch": 0.5924739791833467, + "learning_rate": 1.9337139139926707e-05, + "loss": 0.1086, + "step": 2960 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.934213337025812e-05, + "loss": 0.0163, + "step": 2962 + }, + { + "epoch": 0.5932746196957566, + "learning_rate": 1.9347109355200672e-05, + "loss": 1.1038, + "step": 2964 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.9352067085036145e-05, + "loss": 0.1048, + "step": 2966 + }, + { + "epoch": 0.5940752602081665, + "learning_rate": 1.935700655008199e-05, + "loss": 0.4094, + "step": 2968 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9361927740691327e-05, + "loss": 0.7449, + "step": 2970 + }, + { + "epoch": 0.5948759007205765, + "learning_rate": 1.9366830647252967e-05, + "loss": 0.0522, + "step": 2972 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.937171526019142e-05, + "loss": 0.2284, + "step": 2974 + }, + { + "epoch": 0.5956765412329864, + "learning_rate": 1.9376581569966933e-05, + "loss": 0.4643, + "step": 2976 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9381429567075504e-05, + "loss": 0.1672, + "step": 2978 + }, + { + "epoch": 0.5964771817453963, + "learning_rate": 1.9386259242048883e-05, + "loss": 0.1211, + "step": 2980 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.93910705854546e-05, + "loss": 0.0034, + "step": 2982 + }, + { + "epoch": 0.5972778222578062, + "learning_rate": 1.939586358789602e-05, + "loss": 0.0977, + "step": 2984 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.9400638240012294e-05, + "loss": 0.1788, + "step": 2986 + }, + { + "epoch": 0.5980784627702161, + "learning_rate": 1.940539453247842e-05, + "loss": 0.0539, + "step": 2988 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9410132456005262e-05, + "loss": 0.0382, + "step": 2990 + }, + { + "epoch": 0.5988791032826261, + "learning_rate": 1.9414852001339547e-05, + "loss": 0.0435, + "step": 2992 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9419553159263896e-05, + "loss": 0.0362, + "step": 2994 + }, + { + "epoch": 0.5996797437950361, + "learning_rate": 1.9424235920596863e-05, + "loss": 0.3303, + "step": 2996 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.94289002761929e-05, + "loss": 0.1236, + "step": 2998 + }, + { + "epoch": 0.600480384307446, + "learning_rate": 1.9433546216942423e-05, + "loss": 0.1697, + "step": 3000 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.943817373377181e-05, + "loss": 0.0093, + "step": 3002 + }, + { + "epoch": 0.6012810248198559, + "learning_rate": 1.944278281764342e-05, + "loss": 0.0188, + "step": 3004 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.944737345955561e-05, + "loss": 0.1566, + "step": 3006 + }, + { + "epoch": 0.6020816653322658, + "learning_rate": 1.945194565054276e-05, + "loss": 0.122, + "step": 3008 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.945649938167528e-05, + "loss": 0.2948, + "step": 3010 + }, + { + "epoch": 0.6028823058446757, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.0077, + "step": 3012 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.946555142883836e-05, + "loss": 0.0394, + "step": 3014 + }, + { + "epoch": 0.6036829463570856, + "learning_rate": 1.9470049727190073e-05, + "loss": 0.2427, + "step": 3016 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9474529530329507e-05, + "loss": 0.0907, + "step": 3018 + }, + { + "epoch": 0.6044835868694955, + "learning_rate": 1.9478990829507504e-05, + "loss": 0.1782, + "step": 3020 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.9483433616011047e-05, + "loss": 0.3069, + "step": 3022 + }, + { + "epoch": 0.6052842273819056, + "learning_rate": 1.948785788116329e-05, + "loss": 0.3808, + "step": 3024 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9492263616323533e-05, + "loss": 0.8603, + "step": 3026 + }, + { + "epoch": 0.6060848678943155, + "learning_rate": 1.9496650812887286e-05, + "loss": 0.2218, + "step": 3028 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9501019462286263e-05, + "loss": 0.0265, + "step": 3030 + }, + { + "epoch": 0.6068855084067254, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.7192, + "step": 3032 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.9509701085497842e-05, + "loss": 0.3694, + "step": 3034 + }, + { + "epoch": 0.6076861489191353, + "learning_rate": 1.951401404235505e-05, + "loss": 0.0709, + "step": 3036 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.9518308418136718e-05, + "loss": 0.0409, + "step": 3038 + }, + { + "epoch": 0.6084867894315452, + "learning_rate": 1.952258420445583e-05, + "loss": 0.0429, + "step": 3040 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.952684139296169e-05, + "loss": 0.1959, + "step": 3042 + }, + { + "epoch": 0.6092874299439551, + "learning_rate": 1.9531079975339912e-05, + "loss": 0.1083, + "step": 3044 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.9535299943312455e-05, + "loss": 0.0085, + "step": 3046 + }, + { + "epoch": 0.610088070456365, + "learning_rate": 1.953950128863762e-05, + "loss": 0.0452, + "step": 3048 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9543684003110105e-05, + "loss": 0.33, + "step": 3050 + }, + { + "epoch": 0.6108887109687751, + "learning_rate": 1.9547848078560975e-05, + "loss": 0.536, + "step": 3052 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9551993506857688e-05, + "loss": 0.0138, + "step": 3054 + }, + { + "epoch": 0.611689351481185, + "learning_rate": 1.9556120279904144e-05, + "loss": 0.0022, + "step": 3056 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.9560228389640664e-05, + "loss": 0.016, + "step": 3058 + }, + { + "epoch": 0.6124899919935949, + "learning_rate": 1.956431782804402e-05, + "loss": 0.063, + "step": 3060 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.956838858712744e-05, + "loss": 0.089, + "step": 3062 + }, + { + "epoch": 0.6132906325060048, + "learning_rate": 1.957244065894066e-05, + "loss": 0.0471, + "step": 3064 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9576474035569892e-05, + "loss": 0.0712, + "step": 3066 + }, + { + "epoch": 0.6140912730184147, + "learning_rate": 1.9580488709137858e-05, + "loss": 0.4605, + "step": 3068 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9584484671803818e-05, + "loss": 0.0531, + "step": 3070 + }, + { + "epoch": 0.6148919135308246, + "learning_rate": 1.9588461915763566e-05, + "loss": 0.0359, + "step": 3072 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9592420433249462e-05, + "loss": 0.1318, + "step": 3074 + }, + { + "epoch": 0.6156925540432346, + "learning_rate": 1.9596360216530436e-05, + "loss": 0.0491, + "step": 3076 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9600281257912e-05, + "loss": 0.2526, + "step": 3078 + }, + { + "epoch": 0.6164931945556446, + "learning_rate": 1.9604183549736283e-05, + "loss": 0.0135, + "step": 3080 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.960806708438202e-05, + "loss": 0.0336, + "step": 3082 + }, + { + "epoch": 0.6172938350680545, + "learning_rate": 1.961193185426459e-05, + "loss": 0.0057, + "step": 3084 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9615777851836003e-05, + "loss": 0.1756, + "step": 3086 + }, + { + "epoch": 0.6180944755804644, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.1225, + "step": 3088 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.962341350003679e-05, + "loss": 0.242, + "step": 3090 + }, + { + "epoch": 0.6188951160928743, + "learning_rate": 1.9627203135753573e-05, + "loss": 0.0529, + "step": 3092 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9630973969334068e-05, + "loss": 0.0896, + "step": 3094 + }, + { + "epoch": 0.6196957566052842, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.007, + "step": 3096 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9638459200664822e-05, + "loss": 0.0435, + "step": 3098 + }, + { + "epoch": 0.6204963971176941, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.0655, + "step": 3100 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.9645869135553806e-05, + "loss": 0.4796, + "step": 3102 + }, + { + "epoch": 0.6212970376301041, + "learning_rate": 1.964954584871995e-05, + "loss": 0.0232, + "step": 3104 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.965320371611399e-05, + "loss": 0.0017, + "step": 3106 + }, + { + "epoch": 0.622097678142514, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.4126, + "step": 3108 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.9660462885047032e-05, + "loss": 0.3787, + "step": 3110 + }, + { + "epoch": 0.622898318654924, + "learning_rate": 1.966406417240872e-05, + "loss": 0.492, + "step": 3112 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.9667646585643703e-05, + "loss": 0.0054, + "step": 3114 + }, + { + "epoch": 0.6236989591673339, + "learning_rate": 1.967121011775546e-05, + "loss": 0.0531, + "step": 3116 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967475476178433e-05, + "loss": 0.6575, + "step": 3118 + }, + { + "epoch": 0.6244995996797438, + "learning_rate": 1.967828051080755e-05, + "loss": 0.0256, + "step": 3120 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9681787357939254e-05, + "loss": 0.1465, + "step": 3122 + }, + { + "epoch": 0.6253002401921537, + "learning_rate": 1.9685275296330497e-05, + "loss": 0.3773, + "step": 3124 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.968874431916926e-05, + "loss": 0.8332, + "step": 3126 + }, + { + "epoch": 0.6261008807045636, + "learning_rate": 1.969219441968046e-05, + "loss": 0.0331, + "step": 3128 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969562559112598e-05, + "loss": 0.1319, + "step": 3130 + }, + { + "epoch": 0.6269015212169736, + "learning_rate": 1.969903782680467e-05, + "loss": 0.0338, + "step": 3132 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.970243112005235e-05, + "loss": 0.0114, + "step": 3134 + }, + { + "epoch": 0.6277021617293835, + "learning_rate": 1.9705805464241856e-05, + "loss": 0.3044, + "step": 3136 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.970916085278302e-05, + "loss": 0.4368, + "step": 3138 + }, + { + "epoch": 0.6285028022417934, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.6032, + "step": 3140 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.9715814736744755e-05, + "loss": 0.0583, + "step": 3142 + }, + { + "epoch": 0.6293034427542034, + "learning_rate": 1.971911321917015e-05, + "loss": 0.3803, + "step": 3144 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9722392719956864e-05, + "loss": 0.1795, + "step": 3146 + }, + { + "epoch": 0.6301040832666133, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.0456, + "step": 3148 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9728894751031595e-05, + "loss": 0.0214, + "step": 3150 + }, + { + "epoch": 0.6309047237790232, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.5837, + "step": 3152 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9735320779174545e-05, + "loss": 0.4585, + "step": 3154 + }, + { + "epoch": 0.6317053642914331, + "learning_rate": 1.9738505276435692e-05, + "loss": 0.2161, + "step": 3156 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.974167075418505e-05, + "loss": 0.0386, + "step": 3158 + }, + { + "epoch": 0.6325060048038431, + "learning_rate": 1.9744817206240377e-05, + "loss": 0.3257, + "step": 3160 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9747944626456577e-05, + "loss": 0.0475, + "step": 3162 + }, + { + "epoch": 0.633306645316253, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.1106, + "step": 3164 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.975414234697712e-05, + "loss": 0.0204, + "step": 3166 + }, + { + "epoch": 0.6341072858286629, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.0376, + "step": 3168 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9760263867329568e-05, + "loss": 0.1958, + "step": 3170 + }, + { + "epoch": 0.6349079263410728, + "learning_rate": 1.9763296037475174e-05, + "loss": 0.0487, + "step": 3172 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.97663091396921e-05, + "loss": 0.0892, + "step": 3174 + }, + { + "epoch": 0.6357085668534828, + "learning_rate": 1.976930316809569e-05, + "loss": 0.2803, + "step": 3176 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9772278116838543e-05, + "loss": 0.0016, + "step": 3178 + }, + { + "epoch": 0.6365092073658927, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.0241, + "step": 3180 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.977817075213876e-05, + "loss": 0.0477, + "step": 3182 + }, + { + "epoch": 0.6373098478783027, + "learning_rate": 1.978108842718768e-05, + "loss": 0.5619, + "step": 3184 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9783986999558994e-05, + "loss": 0.0977, + "step": 3186 + }, + { + "epoch": 0.6381104883907126, + "learning_rate": 1.9786866463591732e-05, + "loss": 0.0134, + "step": 3188 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9789726813662233e-05, + "loss": 0.9198, + "step": 3190 + }, + { + "epoch": 0.6389111289031225, + "learning_rate": 1.9792568044184176e-05, + "loss": 0.0774, + "step": 3192 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.979539014960858e-05, + "loss": 0.0104, + "step": 3194 + }, + { + "epoch": 0.6397117694155324, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.5227, + "step": 3196 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9800976963155584e-05, + "loss": 0.1797, + "step": 3198 + }, + { + "epoch": 0.6405124099279423, + "learning_rate": 1.9803741660367015e-05, + "loss": 0.1958, + "step": 3200 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.980648721065859e-05, + "loss": 0.0083, + "step": 3202 + }, + { + "epoch": 0.6413130504403523, + "learning_rate": 1.9809213608668185e-05, + "loss": 0.1216, + "step": 3204 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9811920849071092e-05, + "loss": 0.8303, + "step": 3206 + }, + { + "epoch": 0.6421136909527622, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.1918, + "step": 3208 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9817277835945057e-05, + "loss": 0.0317, + "step": 3210 + }, + { + "epoch": 0.6429143314651722, + "learning_rate": 1.9819927571953807e-05, + "loss": 0.3447, + "step": 3212 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9822558129431263e-05, + "loss": 0.3363, + "step": 3214 + }, + { + "epoch": 0.6437149719775821, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.1515, + "step": 3216 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.9827761688279606e-05, + "loss": 0.1523, + "step": 3218 + }, + { + "epoch": 0.644515612489992, + "learning_rate": 1.983033467948784e-05, + "loss": 0.038, + "step": 3220 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.983288847183947e-05, + "loss": 0.0379, + "step": 3222 + }, + { + "epoch": 0.6453162530024019, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.0585, + "step": 3224 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9837938440059992e-05, + "loss": 0.0026, + "step": 3226 + }, + { + "epoch": 0.6461168935148118, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.0802, + "step": 3228 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9842911553490392e-05, + "loss": 0.1589, + "step": 3230 + }, + { + "epoch": 0.6469175340272217, + "learning_rate": 1.9845369277495102e-05, + "loss": 0.0837, + "step": 3232 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.984780777328031e-05, + "loss": 0.3762, + "step": 3234 + }, + { + "epoch": 0.6477181745396317, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.0644, + "step": 3236 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985262706118007e-05, + "loss": 0.1502, + "step": 3238 + }, + { + "epoch": 0.6485188150520417, + "learning_rate": 1.985500784388244e-05, + "loss": 0.1317, + "step": 3240 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.9857369379540982e-05, + "loss": 0.2345, + "step": 3242 + }, + { + "epoch": 0.6493194555644516, + "learning_rate": 1.985971166354357e-05, + "loss": 0.038, + "step": 3244 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.986203469131567e-05, + "loss": 0.0104, + "step": 3246 + }, + { + "epoch": 0.6501200960768615, + "learning_rate": 1.9864338458320366e-05, + "loss": 0.6179, + "step": 3248 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986662296005834e-05, + "loss": 0.0433, + "step": 3250 + }, + { + "epoch": 0.6509207365892714, + "learning_rate": 1.986888819206792e-05, + "loss": 0.6175, + "step": 3252 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.987113414992505e-05, + "loss": 0.5607, + "step": 3254 + }, + { + "epoch": 0.6517213771016813, + "learning_rate": 1.9873360829243323e-05, + "loss": 0.0692, + "step": 3256 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9875568225674e-05, + "loss": 0.2194, + "step": 3258 + }, + { + "epoch": 0.6525220176140912, + "learning_rate": 1.9877756334905983e-05, + "loss": 0.0692, + "step": 3260 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9879925152665845e-05, + "loss": 0.1111, + "step": 3262 + }, + { + "epoch": 0.6533226581265013, + "learning_rate": 1.9882074674717836e-05, + "loss": 0.0709, + "step": 3264 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.9884204896863895e-05, + "loss": 0.0607, + "step": 3266 + }, + { + "epoch": 0.6541232986389112, + "learning_rate": 1.988631581494365e-05, + "loss": 0.0027, + "step": 3268 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.9888407424834433e-05, + "loss": 0.4185, + "step": 3270 + }, + { + "epoch": 0.6549239391513211, + "learning_rate": 1.989047972245129e-05, + "loss": 0.0256, + "step": 3272 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.989253270374697e-05, + "loss": 0.0115, + "step": 3274 + }, + { + "epoch": 0.655724579663731, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.256, + "step": 3276 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.9896580701374482e-05, + "loss": 0.1215, + "step": 3278 + }, + { + "epoch": 0.6565252201761409, + "learning_rate": 1.989857570980049e-05, + "loss": 0.0074, + "step": 3280 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.9900551386093677e-05, + "loss": 0.003, + "step": 3282 + }, + { + "epoch": 0.6573258606885508, + "learning_rate": 1.990250772639552e-05, + "loss": 0.0367, + "step": 3284 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9904444726885236e-05, + "loss": 0.1352, + "step": 3286 + }, + { + "epoch": 0.6581265012009607, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.0313, + "step": 3288 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.990826069333406e-05, + "loss": 0.3117, + "step": 3290 + }, + { + "epoch": 0.6589271417133707, + "learning_rate": 1.99101396518405e-05, + "loss": 0.1912, + "step": 3292 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.99119992556295e-05, + "loss": 0.4836, + "step": 3294 + }, + { + "epoch": 0.6597277822257807, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.1759, + "step": 3296 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.99156603845656e-05, + "loss": 0.1428, + "step": 3298 + }, + { + "epoch": 0.6605284227381906, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.0186, + "step": 3300 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9919244051541315e-05, + "loss": 0.0267, + "step": 3302 + }, + { + "epoch": 0.6613290632506005, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.0172, + "step": 3304 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9922750228560746e-05, + "loss": 0.1901, + "step": 3306 + }, + { + "epoch": 0.6621297037630104, + "learning_rate": 1.9924474249753652e-05, + "loss": 0.4087, + "step": 3308 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9926178888233344e-05, + "loss": 0.4782, + "step": 3310 + }, + { + "epoch": 0.6629303442754203, + "learning_rate": 1.9927864140670615e-05, + "loss": 0.319, + "step": 3312 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9929530003774133e-05, + "loss": 0.3591, + "step": 3314 + }, + { + "epoch": 0.6637309847878302, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.0583, + "step": 3316 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.993280354900393e-05, + "loss": 0.018, + "step": 3318 + }, + { + "epoch": 0.6645316253002402, + "learning_rate": 1.99344112247369e-05, + "loss": 0.0594, + "step": 3320 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9935999498349518e-05, + "loss": 0.0067, + "step": 3322 + }, + { + "epoch": 0.6653322658126501, + "learning_rate": 1.9937568366739858e-05, + "loss": 0.134, + "step": 3324 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9939117826843883e-05, + "loss": 0.1076, + "step": 3326 + }, + { + "epoch": 0.6661329063250601, + "learning_rate": 1.9940647875635463e-05, + "loss": 0.0231, + "step": 3328 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9942158510126384e-05, + "loss": 0.0171, + "step": 3330 + }, + { + "epoch": 0.66693354683747, + "learning_rate": 1.9943649727366335e-05, + "loss": 0.282, + "step": 3332 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.9945121524442944e-05, + "loss": 0.0492, + "step": 3334 + }, + { + "epoch": 0.6677341873498799, + "learning_rate": 1.994657389848176e-05, + "loss": 0.0461, + "step": 3336 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.9948006846646262e-05, + "loss": 0.3787, + "step": 3338 + }, + { + "epoch": 0.6685348278622898, + "learning_rate": 1.994942036613787e-05, + "loss": 0.6746, + "step": 3340 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9950814454195953e-05, + "loss": 0.1152, + "step": 3342 + }, + { + "epoch": 0.6693354683746997, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.1777, + "step": 3344 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.9953544325158755e-05, + "loss": 0.0535, + "step": 3346 + }, + { + "epoch": 0.6701361088871097, + "learning_rate": 1.995488010273198e-05, + "loss": 0.0726, + "step": 3348 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9956196438208693e-05, + "loss": 0.3071, + "step": 3350 + }, + { + "epoch": 0.6709367493995196, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.2484, + "step": 3352 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9958770772627236e-05, + "loss": 0.1095, + "step": 3354 + }, + { + "epoch": 0.6717373899119295, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.0126, + "step": 3356 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.9961267308303473e-05, + "loss": 0.0714, + "step": 3358 + }, + { + "epoch": 0.6725380304243395, + "learning_rate": 1.996248639549475e-05, + "loss": 0.1554, + "step": 3360 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9963686025734262e-05, + "loss": 0.0717, + "step": 3362 + }, + { + "epoch": 0.6733386709367494, + "learning_rate": 1.9964866196679105e-05, + "loss": 0.0981, + "step": 3364 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9966026906024377e-05, + "loss": 0.1466, + "step": 3366 + }, + { + "epoch": 0.6741393114491593, + "learning_rate": 1.9967168151503196e-05, + "loss": 0.0029, + "step": 3368 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9968289930886675e-05, + "loss": 0.0291, + "step": 3370 + }, + { + "epoch": 0.6749399519615693, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.6371, + "step": 3372 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997047508264221e-05, + "loss": 0.2442, + "step": 3374 + }, + { + "epoch": 0.6757405924739792, + "learning_rate": 1.997153845074662e-05, + "loss": 0.2376, + "step": 3376 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.99725823442204e-05, + "loss": 0.1529, + "step": 3378 + }, + { + "epoch": 0.6765412329863891, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.2074, + "step": 3380 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.9974611699159142e-05, + "loss": 0.4039, + "step": 3382 + }, + { + "epoch": 0.677341873498799, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0923, + "step": 3384 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9976563131604945e-05, + "loss": 0.0588, + "step": 3386 + }, + { + "epoch": 0.678142514011209, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.018, + "step": 3388 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9978436626313065e-05, + "loss": 0.0462, + "step": 3390 + }, + { + "epoch": 0.6789431545236189, + "learning_rate": 1.9979344142417986e-05, + "loss": 0.0442, + "step": 3392 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.99802321686476e-05, + "loss": 0.001, + "step": 3394 + }, + { + "epoch": 0.6797437950360288, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.0513, + "step": 3396 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.9981949744581622e-05, + "loss": 0.0288, + "step": 3398 + }, + { + "epoch": 0.6805444355484388, + "learning_rate": 1.998277929093157e-05, + "loss": 0.001, + "step": 3400 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.9983589340697288e-05, + "loss": 0.7546, + "step": 3402 + }, + { + "epoch": 0.6813450760608487, + "learning_rate": 1.998437989229673e-05, + "loss": 0.2252, + "step": 3404 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.998515094418594e-05, + "loss": 0.0005, + "step": 3406 + }, + { + "epoch": 0.6821457165732586, + "learning_rate": 1.9985902494859023e-05, + "loss": 0.265, + "step": 3408 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.99866345428482e-05, + "loss": 0.1207, + "step": 3410 + }, + { + "epoch": 0.6829463570856685, + "learning_rate": 1.998734708672375e-05, + "loss": 0.2851, + "step": 3412 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.998804012509407e-05, + "loss": 0.0085, + "step": 3414 + }, + { + "epoch": 0.6837469975980784, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0469, + "step": 3416 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9989367679943025e-05, + "loss": 0.0002, + "step": 3418 + }, + { + "epoch": 0.6845476381104884, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.5922, + "step": 3420 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9990617197024103e-05, + "loss": 0.0153, + "step": 3422 + }, + { + "epoch": 0.6853482786228983, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.2045, + "step": 3424 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.999178866657597e-05, + "loss": 0.2433, + "step": 3426 + }, + { + "epoch": 0.6861489191353083, + "learning_rate": 1.9992345130644747e-05, + "loss": 0.2768, + "step": 3428 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999288207944701e-05, + "loss": 0.4234, + "step": 3430 + }, + { + "epoch": 0.6869495596477182, + "learning_rate": 1.999339951193407e-05, + "loss": 0.1436, + "step": 3432 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.999389742709538e-05, + "loss": 0.3293, + "step": 3434 + }, + { + "epoch": 0.6877502001601281, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.245, + "step": 3436 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9994834701589113e-05, + "loss": 0.1319, + "step": 3438 + }, + { + "epoch": 0.688550840672538, + "learning_rate": 1.9995274059091018e-05, + "loss": 0.0164, + "step": 3440 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999569389560614e-05, + "loss": 0.008, + "step": 3442 + }, + { + "epoch": 0.6893514811849479, + "learning_rate": 1.999609421031453e-05, + "loss": 0.0038, + "step": 3444 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.9996475002434365e-05, + "loss": 0.5699, + "step": 3446 + }, + { + "epoch": 0.6901521216973578, + "learning_rate": 1.999683627122195e-05, + "loss": 0.1118, + "step": 3448 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.999717801597172e-05, + "loss": 0.0066, + "step": 3450 + }, + { + "epoch": 0.6909527622097679, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.0207, + "step": 3452 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9997802930726195e-05, + "loss": 0.1963, + "step": 3454 + }, + { + "epoch": 0.6917534027221778, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.1867, + "step": 3456 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998349741815916e-05, + "loss": 0.3297, + "step": 3458 + }, + { + "epoch": 0.6925540432345877, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.7552, + "step": 3460 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.999881844496914e-05, + "loss": 0.0213, + "step": 3462 + }, + { + "epoch": 0.6933546837469976, + "learning_rate": 1.99990235049015e-05, + "loss": 0.3177, + "step": 3464 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999209036524326e-05, + "loss": 0.1053, + "step": 3466 + }, + { + "epoch": 0.6941553242594075, + "learning_rate": 1.9999375039475275e-05, + "loss": 0.0966, + "step": 3468 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999952151343014e-05, + "loss": 0.1196, + "step": 3470 + }, + { + "epoch": 0.6949559647718174, + "learning_rate": 1.999964845810285e-05, + "loss": 0.174, + "step": 3472 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999755873245484e-05, + "loss": 0.089, + "step": 3474 + }, + { + "epoch": 0.6957566052842273, + "learning_rate": 1.9999843758648253e-05, + "loss": 0.0145, + "step": 3476 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999991211413952e-05, + "loss": 0.0029, + "step": 3478 + }, + { + "epoch": 0.6965572457966374, + "learning_rate": 1.999996093958578e-05, + "loss": 0.0226, + "step": 3480 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 1.9999990234891677e-05, + "loss": 0.0185, + "step": 3482 + }, + { + "epoch": 0.6973578863090473, + "learning_rate": 2e-05, + "loss": 0.044, + "step": 3484 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999999023489168e-05, + "loss": 0.0128, + "step": 3486 + }, + { + "epoch": 0.6981585268214572, + "learning_rate": 1.999996093958578e-05, + "loss": 0.1749, + "step": 3488 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.999991211413952e-05, + "loss": 0.0107, + "step": 3490 + }, + { + "epoch": 0.6989591673338671, + "learning_rate": 1.9999843758648253e-05, + "loss": 1.6636, + "step": 3492 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.9999755873245484e-05, + "loss": 0.1738, + "step": 3494 + }, + { + "epoch": 0.699759807846277, + "learning_rate": 1.999964845810285e-05, + "loss": 0.2116, + "step": 3496 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.999952151343014e-05, + "loss": 0.0352, + "step": 3498 + }, + { + "epoch": 0.7005604483586869, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.1588, + "step": 3500 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.9999209036524326e-05, + "loss": 0.7076, + "step": 3502 + }, + { + "epoch": 0.7013610888710968, + "learning_rate": 1.99990235049015e-05, + "loss": 0.2209, + "step": 3504 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.999881844496914e-05, + "loss": 0.2268, + "step": 3506 + }, + { + "epoch": 0.7021617293835068, + "learning_rate": 1.9998593857127736e-05, + "loss": 0.0018, + "step": 3508 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998349741815916e-05, + "loss": 0.1081, + "step": 3510 + }, + { + "epoch": 0.7029623698959168, + "learning_rate": 1.9998086099510433e-05, + "loss": 0.3079, + "step": 3512 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997802930726195e-05, + "loss": 0.0927, + "step": 3514 + }, + { + "epoch": 0.7037630104083267, + "learning_rate": 1.9997500236016233e-05, + "loss": 0.0094, + "step": 3516 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999717801597172e-05, + "loss": 0.0003, + "step": 3518 + }, + { + "epoch": 0.7045636509207366, + "learning_rate": 1.999683627122195e-05, + "loss": 0.2951, + "step": 3520 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.9996475002434365e-05, + "loss": 0.0323, + "step": 3522 + }, + { + "epoch": 0.7053642914331465, + "learning_rate": 1.999609421031453e-05, + "loss": 0.2409, + "step": 3524 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999569389560614e-05, + "loss": 0.4153, + "step": 3526 + }, + { + "epoch": 0.7061649319455564, + "learning_rate": 1.999527405909102e-05, + "loss": 0.0635, + "step": 3528 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994834701589113e-05, + "loss": 0.0314, + "step": 3530 + }, + { + "epoch": 0.7069655724579663, + "learning_rate": 1.9994375823958504e-05, + "loss": 0.5603, + "step": 3532 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.9993897427095378e-05, + "loss": 0.814, + "step": 3534 + }, + { + "epoch": 0.7077662129703763, + "learning_rate": 1.999339951193407e-05, + "loss": 0.1303, + "step": 3536 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999288207944701e-05, + "loss": 0.0126, + "step": 3538 + }, + { + "epoch": 0.7085668534827863, + "learning_rate": 1.999234513064475e-05, + "loss": 0.1419, + "step": 3540 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.999178866657597e-05, + "loss": 0.0618, + "step": 3542 + }, + { + "epoch": 0.7093674939951962, + "learning_rate": 1.9991212688327456e-05, + "loss": 0.0791, + "step": 3544 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990617197024103e-05, + "loss": 0.0975, + "step": 3546 + }, + { + "epoch": 0.7101681345076061, + "learning_rate": 1.9990002193828923e-05, + "loss": 0.0028, + "step": 3548 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.998936767994303e-05, + "loss": 0.0128, + "step": 3550 + }, + { + "epoch": 0.710968775020016, + "learning_rate": 1.9988713656605635e-05, + "loss": 0.0659, + "step": 3552 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998804012509407e-05, + "loss": 0.278, + "step": 3554 + }, + { + "epoch": 0.7117694155324259, + "learning_rate": 1.998734708672375e-05, + "loss": 0.5662, + "step": 3556 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.99866345428482e-05, + "loss": 0.2798, + "step": 3558 + }, + { + "epoch": 0.7125700560448359, + "learning_rate": 1.9985902494859026e-05, + "loss": 0.1963, + "step": 3560 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.998515094418594e-05, + "loss": 0.0975, + "step": 3562 + }, + { + "epoch": 0.7133706965572458, + "learning_rate": 1.9984379892296735e-05, + "loss": 0.1228, + "step": 3564 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9983589340697288e-05, + "loss": 0.3723, + "step": 3566 + }, + { + "epoch": 0.7141713370696557, + "learning_rate": 1.9982779290931572e-05, + "loss": 0.7659, + "step": 3568 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981949744581622e-05, + "loss": 0.3801, + "step": 3570 + }, + { + "epoch": 0.7149719775820657, + "learning_rate": 1.9981100703267567e-05, + "loss": 0.0481, + "step": 3572 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.99802321686476e-05, + "loss": 0.0162, + "step": 3574 + }, + { + "epoch": 0.7157726180944756, + "learning_rate": 1.997934414241799e-05, + "loss": 0.0094, + "step": 3576 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9978436626313068e-05, + "loss": 0.0351, + "step": 3578 + }, + { + "epoch": 0.7165732586068855, + "learning_rate": 1.9977509622105236e-05, + "loss": 0.218, + "step": 3580 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997656313160495e-05, + "loss": 0.0083, + "step": 3582 + }, + { + "epoch": 0.7173738991192954, + "learning_rate": 1.997559715666073e-05, + "loss": 0.0063, + "step": 3584 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9974611699159142e-05, + "loss": 0.6514, + "step": 3586 + }, + { + "epoch": 0.7181745396317054, + "learning_rate": 1.9973606761024813e-05, + "loss": 0.2108, + "step": 3588 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.99725823442204e-05, + "loss": 0.0058, + "step": 3590 + }, + { + "epoch": 0.7189751801441153, + "learning_rate": 1.997153845074662e-05, + "loss": 0.0032, + "step": 3592 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9970475082642212e-05, + "loss": 0.0113, + "step": 3594 + }, + { + "epoch": 0.7197758206565252, + "learning_rate": 1.9969392241983957e-05, + "loss": 0.1827, + "step": 3596 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9968289930886675e-05, + "loss": 0.4046, + "step": 3598 + }, + { + "epoch": 0.7205764611689351, + "learning_rate": 1.9967168151503193e-05, + "loss": 0.0273, + "step": 3600 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.9966026906024377e-05, + "loss": 0.3491, + "step": 3602 + }, + { + "epoch": 0.7213771016813451, + "learning_rate": 1.996486619667911e-05, + "loss": 0.0475, + "step": 3604 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9963686025734262e-05, + "loss": 0.0682, + "step": 3606 + }, + { + "epoch": 0.722177742193755, + "learning_rate": 1.9962486395494753e-05, + "loss": 0.0248, + "step": 3608 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9961267308303473e-05, + "loss": 0.0034, + "step": 3610 + }, + { + "epoch": 0.7229783827061649, + "learning_rate": 1.9960028766541336e-05, + "loss": 0.7332, + "step": 3612 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9958770772627236e-05, + "loss": 0.3551, + "step": 3614 + }, + { + "epoch": 0.7237790232185749, + "learning_rate": 1.9957493329018064e-05, + "loss": 0.0266, + "step": 3616 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.9956196438208693e-05, + "loss": 0.3972, + "step": 3618 + }, + { + "epoch": 0.7245796637309848, + "learning_rate": 1.995488010273198e-05, + "loss": 0.0001, + "step": 3620 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9953544325158755e-05, + "loss": 0.6098, + "step": 3622 + }, + { + "epoch": 0.7253803042433947, + "learning_rate": 1.9952189108097825e-05, + "loss": 0.405, + "step": 3624 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9950814454195953e-05, + "loss": 0.0087, + "step": 3626 + }, + { + "epoch": 0.7261809447558046, + "learning_rate": 1.9949420366137873e-05, + "loss": 0.2155, + "step": 3628 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.9948006846646262e-05, + "loss": 0.1198, + "step": 3630 + }, + { + "epoch": 0.7269815852682145, + "learning_rate": 1.994657389848176e-05, + "loss": 0.0003, + "step": 3632 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.9945121524442947e-05, + "loss": 0.3432, + "step": 3634 + }, + { + "epoch": 0.7277822257806245, + "learning_rate": 1.994364972736634e-05, + "loss": 0.2053, + "step": 3636 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9942158510126384e-05, + "loss": 0.0651, + "step": 3638 + }, + { + "epoch": 0.7285828662930345, + "learning_rate": 1.9940647875635466e-05, + "loss": 0.0545, + "step": 3640 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.9939117826843887e-05, + "loss": 0.5509, + "step": 3642 + }, + { + "epoch": 0.7293835068054444, + "learning_rate": 1.993756836673986e-05, + "loss": 0.1451, + "step": 3644 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.9935999498349525e-05, + "loss": 0.0147, + "step": 3646 + }, + { + "epoch": 0.7301841473178543, + "learning_rate": 1.99344112247369e-05, + "loss": 0.272, + "step": 3648 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9932803549003932e-05, + "loss": 0.1058, + "step": 3650 + }, + { + "epoch": 0.7309847878302642, + "learning_rate": 1.9931176474290438e-05, + "loss": 0.1083, + "step": 3652 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9929530003774136e-05, + "loss": 0.0049, + "step": 3654 + }, + { + "epoch": 0.7317854283426741, + "learning_rate": 1.9927864140670618e-05, + "loss": 0.1205, + "step": 3656 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9926178888233344e-05, + "loss": 0.1771, + "step": 3658 + }, + { + "epoch": 0.732586068855084, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.0483, + "step": 3660 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9922750228560746e-05, + "loss": 0.1089, + "step": 3662 + }, + { + "epoch": 0.733386709367494, + "learning_rate": 1.9921006828021666e-05, + "loss": 0.2073, + "step": 3664 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9919244051541315e-05, + "loss": 0.9387, + "step": 3666 + }, + { + "epoch": 0.734187349879904, + "learning_rate": 1.9917461902562435e-05, + "loss": 0.0016, + "step": 3668 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9915660384565603e-05, + "loss": 0.3725, + "step": 3670 + }, + { + "epoch": 0.7349879903923139, + "learning_rate": 1.9913839501069213e-05, + "loss": 0.4029, + "step": 3672 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9911999255629504e-05, + "loss": 0.0271, + "step": 3674 + }, + { + "epoch": 0.7357886309047238, + "learning_rate": 1.9910139651840497e-05, + "loss": 0.1075, + "step": 3676 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.990826069333406e-05, + "loss": 0.0018, + "step": 3678 + }, + { + "epoch": 0.7365892714171337, + "learning_rate": 1.9906362383779826e-05, + "loss": 0.0915, + "step": 3680 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9904444726885236e-05, + "loss": 0.2878, + "step": 3682 + }, + { + "epoch": 0.7373899119295436, + "learning_rate": 1.9902507726395524e-05, + "loss": 0.2905, + "step": 3684 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.9900551386093677e-05, + "loss": 0.3305, + "step": 3686 + }, + { + "epoch": 0.7381905524419535, + "learning_rate": 1.989857570980049e-05, + "loss": 0.399, + "step": 3688 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9896580701374482e-05, + "loss": 0.0898, + "step": 3690 + }, + { + "epoch": 0.7389911929543634, + "learning_rate": 1.9894566364711965e-05, + "loss": 0.1192, + "step": 3692 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9892532703746977e-05, + "loss": 0.0933, + "step": 3694 + }, + { + "epoch": 0.7397918334667735, + "learning_rate": 1.9890479722451292e-05, + "loss": 0.0096, + "step": 3696 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.9888407424834437e-05, + "loss": 0.2818, + "step": 3698 + }, + { + "epoch": 0.7405924739791834, + "learning_rate": 1.988631581494365e-05, + "loss": 0.0471, + "step": 3700 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9884204896863895e-05, + "loss": 0.0939, + "step": 3702 + }, + { + "epoch": 0.7413931144915933, + "learning_rate": 1.9882074674717832e-05, + "loss": 0.2267, + "step": 3704 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.9879925152665845e-05, + "loss": 0.0505, + "step": 3706 + }, + { + "epoch": 0.7421937550040032, + "learning_rate": 1.987775633490599e-05, + "loss": 0.0318, + "step": 3708 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.9875568225674005e-05, + "loss": 0.7036, + "step": 3710 + }, + { + "epoch": 0.7429943955164131, + "learning_rate": 1.987336082924333e-05, + "loss": 0.003, + "step": 3712 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.987113414992505e-05, + "loss": 0.0081, + "step": 3714 + }, + { + "epoch": 0.743795036028823, + "learning_rate": 1.986888819206792e-05, + "loss": 0.3831, + "step": 3716 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986662296005834e-05, + "loss": 0.0177, + "step": 3718 + }, + { + "epoch": 0.7445956765412329, + "learning_rate": 1.986433845832037e-05, + "loss": 0.0479, + "step": 3720 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9862034691315678e-05, + "loss": 0.6472, + "step": 3722 + }, + { + "epoch": 0.745396317053643, + "learning_rate": 1.9859711663543573e-05, + "loss": 0.1698, + "step": 3724 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9857369379540985e-05, + "loss": 0.1236, + "step": 3726 + }, + { + "epoch": 0.7461969575660529, + "learning_rate": 1.9855007843882437e-05, + "loss": 0.0104, + "step": 3728 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.985262706118007e-05, + "loss": 0.0674, + "step": 3730 + }, + { + "epoch": 0.7469975980784628, + "learning_rate": 1.9850227036083592e-05, + "loss": 0.3206, + "step": 3732 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9847807773280314e-05, + "loss": 0.0057, + "step": 3734 + }, + { + "epoch": 0.7477982385908727, + "learning_rate": 1.9845369277495105e-05, + "loss": 0.0321, + "step": 3736 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9842911553490396e-05, + "loss": 0.0029, + "step": 3738 + }, + { + "epoch": 0.7485988791032826, + "learning_rate": 1.9840434606066186e-05, + "loss": 0.3315, + "step": 3740 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.983793844005999e-05, + "loss": 0.0654, + "step": 3742 + }, + { + "epoch": 0.7493995196156925, + "learning_rate": 1.9835423060346892e-05, + "loss": 0.3219, + "step": 3744 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.9832888471839475e-05, + "loss": 0.004, + "step": 3746 + }, + { + "epoch": 0.7502001601281025, + "learning_rate": 1.983033467948784e-05, + "loss": 0.1593, + "step": 3748 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9827761688279613e-05, + "loss": 0.0735, + "step": 3750 + }, + { + "epoch": 0.7510008006405124, + "learning_rate": 1.9825169503239885e-05, + "loss": 0.3823, + "step": 3752 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9822558129431263e-05, + "loss": 0.0194, + "step": 3754 + }, + { + "epoch": 0.7518014411529224, + "learning_rate": 1.9819927571953804e-05, + "loss": 0.0378, + "step": 3756 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.981727783594506e-05, + "loss": 0.2887, + "step": 3758 + }, + { + "epoch": 0.7526020816653323, + "learning_rate": 1.9814608926580007e-05, + "loss": 0.0104, + "step": 3760 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.9811920849071092e-05, + "loss": 0.0684, + "step": 3762 + }, + { + "epoch": 0.7534027221777422, + "learning_rate": 1.980921360866819e-05, + "loss": 0.0434, + "step": 3764 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.980648721065859e-05, + "loss": 0.1821, + "step": 3766 + }, + { + "epoch": 0.7542033626901521, + "learning_rate": 1.9803741660367018e-05, + "loss": 0.1031, + "step": 3768 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.980097696315558e-05, + "loss": 0.0655, + "step": 3770 + }, + { + "epoch": 0.755004003202562, + "learning_rate": 1.9798193124423804e-05, + "loss": 0.7076, + "step": 3772 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979539014960858e-05, + "loss": 0.0256, + "step": 3774 + }, + { + "epoch": 0.755804643714972, + "learning_rate": 1.979256804418418e-05, + "loss": 0.0295, + "step": 3776 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.9789726813662233e-05, + "loss": 0.0541, + "step": 3778 + }, + { + "epoch": 0.7566052842273819, + "learning_rate": 1.978686646359173e-05, + "loss": 0.4889, + "step": 3780 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9783986999558994e-05, + "loss": 0.1909, + "step": 3782 + }, + { + "epoch": 0.7574059247397918, + "learning_rate": 1.9781088427187677e-05, + "loss": 0.011, + "step": 3784 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9778170752138763e-05, + "loss": 0.1344, + "step": 3786 + }, + { + "epoch": 0.7582065652522018, + "learning_rate": 1.9775233980110524e-05, + "loss": 0.0013, + "step": 3788 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.9772278116838546e-05, + "loss": 0.0929, + "step": 3790 + }, + { + "epoch": 0.7590072057646117, + "learning_rate": 1.976930316809569e-05, + "loss": 0.0457, + "step": 3792 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.97663091396921e-05, + "loss": 0.3307, + "step": 3794 + }, + { + "epoch": 0.7598078462770216, + "learning_rate": 1.9763296037475177e-05, + "loss": 0.2448, + "step": 3796 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.976026386732957e-05, + "loss": 0.0571, + "step": 3798 + }, + { + "epoch": 0.7606084867894315, + "learning_rate": 1.9757212635177177e-05, + "loss": 0.0015, + "step": 3800 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9754142346977122e-05, + "loss": 0.1106, + "step": 3802 + }, + { + "epoch": 0.7614091273018415, + "learning_rate": 1.9751053008725736e-05, + "loss": 0.0508, + "step": 3804 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9747944626456577e-05, + "loss": 0.1983, + "step": 3806 + }, + { + "epoch": 0.7622097678142514, + "learning_rate": 1.9744817206240374e-05, + "loss": 0.0374, + "step": 3808 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9741670754185054e-05, + "loss": 0.0878, + "step": 3810 + }, + { + "epoch": 0.7630104083266613, + "learning_rate": 1.9738505276435695e-05, + "loss": 0.0979, + "step": 3812 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9735320779174548e-05, + "loss": 0.0794, + "step": 3814 + }, + { + "epoch": 0.7638110488390712, + "learning_rate": 1.9732117268621005e-05, + "loss": 0.4015, + "step": 3816 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9728894751031595e-05, + "loss": 0.1314, + "step": 3818 + }, + { + "epoch": 0.7646116893514812, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.1137, + "step": 3820 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.972239271995686e-05, + "loss": 0.3243, + "step": 3822 + }, + { + "epoch": 0.7654123298638911, + "learning_rate": 1.9719113219170152e-05, + "loss": 0.4039, + "step": 3824 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9715814736744758e-05, + "loss": 0.0909, + "step": 3826 + }, + { + "epoch": 0.7662129703763011, + "learning_rate": 1.9712497279122692e-05, + "loss": 0.0378, + "step": 3828 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.9709160852783022e-05, + "loss": 0.0689, + "step": 3830 + }, + { + "epoch": 0.767013610888711, + "learning_rate": 1.970580546424186e-05, + "loss": 0.0083, + "step": 3832 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.9702431120052352e-05, + "loss": 0.6668, + "step": 3834 + }, + { + "epoch": 0.7678142514011209, + "learning_rate": 1.969903782680467e-05, + "loss": 1.5785, + "step": 3836 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9695625591125984e-05, + "loss": 0.0383, + "step": 3838 + }, + { + "epoch": 0.7686148919135308, + "learning_rate": 1.9692194419680463e-05, + "loss": 0.0222, + "step": 3840 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.968874431916926e-05, + "loss": 0.5358, + "step": 3842 + }, + { + "epoch": 0.7694155324259407, + "learning_rate": 1.96852752963305e-05, + "loss": 0.0044, + "step": 3844 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9681787357939257e-05, + "loss": 0.0118, + "step": 3846 + }, + { + "epoch": 0.7702161729383507, + "learning_rate": 1.9678280510807552e-05, + "loss": 0.0756, + "step": 3848 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9674754761784334e-05, + "loss": 0.0302, + "step": 3850 + }, + { + "epoch": 0.7710168134507606, + "learning_rate": 1.9671210117755462e-05, + "loss": 0.0481, + "step": 3852 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.9667646585643706e-05, + "loss": 0.0694, + "step": 3854 + }, + { + "epoch": 0.7718174539631706, + "learning_rate": 1.966406417240872e-05, + "loss": 0.1949, + "step": 3856 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.966046288504704e-05, + "loss": 0.1386, + "step": 3858 + }, + { + "epoch": 0.7726180944755805, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.0484, + "step": 3860 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.965320371611399e-05, + "loss": 0.0311, + "step": 3862 + }, + { + "epoch": 0.7734187349879904, + "learning_rate": 1.964954584871995e-05, + "loss": 0.0324, + "step": 3864 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.964586913555381e-05, + "loss": 0.1953, + "step": 3866 + }, + { + "epoch": 0.7742193755004003, + "learning_rate": 1.9642173583796265e-05, + "loss": 0.0237, + "step": 3868 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9638459200664822e-05, + "loss": 0.1214, + "step": 3870 + }, + { + "epoch": 0.7750200160128102, + "learning_rate": 1.9634725993413744e-05, + "loss": 0.0079, + "step": 3872 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9630973969334068e-05, + "loss": 0.1113, + "step": 3874 + }, + { + "epoch": 0.7758206565252201, + "learning_rate": 1.9627203135753576e-05, + "loss": 0.0793, + "step": 3876 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9623413500036795e-05, + "loss": 0.0161, + "step": 3878 + }, + { + "epoch": 0.77662129703763, + "learning_rate": 1.9619605069584954e-05, + "loss": 0.0377, + "step": 3880 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.9615777851836007e-05, + "loss": 0.0209, + "step": 3882 + }, + { + "epoch": 0.7774219375500401, + "learning_rate": 1.961193185426459e-05, + "loss": 0.0148, + "step": 3884 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9608067084382025e-05, + "loss": 0.2408, + "step": 3886 + }, + { + "epoch": 0.77822257806245, + "learning_rate": 1.9604183549736287e-05, + "loss": 0.0301, + "step": 3888 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.9600281257912002e-05, + "loss": 0.0431, + "step": 3890 + }, + { + "epoch": 0.7790232185748599, + "learning_rate": 1.959636021653044e-05, + "loss": 0.2251, + "step": 3892 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.9592420433249465e-05, + "loss": 0.2131, + "step": 3894 + }, + { + "epoch": 0.7798238590872698, + "learning_rate": 1.958846191576357e-05, + "loss": 0.0066, + "step": 3896 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958448467180382e-05, + "loss": 0.0145, + "step": 3898 + }, + { + "epoch": 0.7806244995996797, + "learning_rate": 1.958048870913786e-05, + "loss": 0.0484, + "step": 3900 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9576474035569895e-05, + "loss": 0.9636, + "step": 3902 + }, + { + "epoch": 0.7814251401120896, + "learning_rate": 1.9572440658940667e-05, + "loss": 0.0401, + "step": 3904 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9568388587127448e-05, + "loss": 0.0742, + "step": 3906 + }, + { + "epoch": 0.7822257806244995, + "learning_rate": 1.9564317828044022e-05, + "loss": 0.0499, + "step": 3908 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.9560228389640668e-05, + "loss": 0.2441, + "step": 3910 + }, + { + "epoch": 0.7830264211369096, + "learning_rate": 1.955612027990415e-05, + "loss": 0.0022, + "step": 3912 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.955199350685769e-05, + "loss": 0.5884, + "step": 3914 + }, + { + "epoch": 0.7838270616493195, + "learning_rate": 1.9547848078560982e-05, + "loss": 0.0121, + "step": 3916 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.954368400311011e-05, + "loss": 0.6463, + "step": 3918 + }, + { + "epoch": 0.7846277021617294, + "learning_rate": 1.953950128863763e-05, + "loss": 0.0124, + "step": 3920 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9535299943312455e-05, + "loss": 0.0083, + "step": 3922 + }, + { + "epoch": 0.7854283426741393, + "learning_rate": 1.9531079975339915e-05, + "loss": 0.0046, + "step": 3924 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9526841392961694e-05, + "loss": 0.0051, + "step": 3926 + }, + { + "epoch": 0.7862289831865492, + "learning_rate": 1.9522584204455835e-05, + "loss": 0.0006, + "step": 3928 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9518308418136728e-05, + "loss": 0.2095, + "step": 3930 + }, + { + "epoch": 0.7870296236989591, + "learning_rate": 1.9514014042355054e-05, + "loss": 0.0324, + "step": 3932 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9509701085497852e-05, + "loss": 0.0042, + "step": 3934 + }, + { + "epoch": 0.7878302642113691, + "learning_rate": 1.9505369555988395e-05, + "loss": 0.3444, + "step": 3936 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9501019462286266e-05, + "loss": 0.1593, + "step": 3938 + }, + { + "epoch": 0.7886309047237791, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.0034, + "step": 3940 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9492263616323536e-05, + "loss": 0.0164, + "step": 3942 + }, + { + "epoch": 0.789431545236189, + "learning_rate": 1.9487857881163295e-05, + "loss": 0.0049, + "step": 3944 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.948343361601105e-05, + "loss": 1.2805, + "step": 3946 + }, + { + "epoch": 0.7902321857485989, + "learning_rate": 1.947899082950751e-05, + "loss": 0.0121, + "step": 3948 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947452953032951e-05, + "loss": 0.4869, + "step": 3950 + }, + { + "epoch": 0.7910328262610088, + "learning_rate": 1.947004972719008e-05, + "loss": 0.0195, + "step": 3952 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9465551428838363e-05, + "loss": 0.2966, + "step": 3954 + }, + { + "epoch": 0.7918334667734187, + "learning_rate": 1.9461034644059637e-05, + "loss": 0.0714, + "step": 3956 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.9456499381675285e-05, + "loss": 0.2187, + "step": 3958 + }, + { + "epoch": 0.7926341072858286, + "learning_rate": 1.945194565054276e-05, + "loss": 0.0937, + "step": 3960 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9447373459555617e-05, + "loss": 0.0452, + "step": 3962 + }, + { + "epoch": 0.7934347477982386, + "learning_rate": 1.9442782817643425e-05, + "loss": 0.385, + "step": 3964 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9438173733771814e-05, + "loss": 0.2444, + "step": 3966 + }, + { + "epoch": 0.7942353883106485, + "learning_rate": 1.9433546216942433e-05, + "loss": 0.472, + "step": 3968 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.9428900276192903e-05, + "loss": 0.016, + "step": 3970 + }, + { + "epoch": 0.7950360288230585, + "learning_rate": 1.942423592059687e-05, + "loss": 0.0207, + "step": 3972 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.94195531592639e-05, + "loss": 0.0672, + "step": 3974 + }, + { + "epoch": 0.7958366693354684, + "learning_rate": 1.941485200133955e-05, + "loss": 0.006, + "step": 3976 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9410132456005262e-05, + "loss": 0.2485, + "step": 3978 + }, + { + "epoch": 0.7966373098478783, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.0698, + "step": 3980 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.94006382400123e-05, + "loss": 0.0648, + "step": 3982 + }, + { + "epoch": 0.7974379503602882, + "learning_rate": 1.9395863587896025e-05, + "loss": 0.122, + "step": 3984 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.939107058545461e-05, + "loss": 0.1117, + "step": 3986 + }, + { + "epoch": 0.7982385908726981, + "learning_rate": 1.938625924204888e-05, + "loss": 0.0079, + "step": 3988 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.9381429567075507e-05, + "loss": 0.2143, + "step": 3990 + }, + { + "epoch": 0.7990392313851081, + "learning_rate": 1.937658156996694e-05, + "loss": 0.0055, + "step": 3992 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9371715260191425e-05, + "loss": 0.031, + "step": 3994 + }, + { + "epoch": 0.799839871897518, + "learning_rate": 1.9366830647252977e-05, + "loss": 0.3998, + "step": 3996 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.936192774069133e-05, + "loss": 0.024, + "step": 3998 + }, + { + "epoch": 0.800640512409928, + "learning_rate": 1.9357006550082e-05, + "loss": 0.0325, + "step": 4000 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9352067085036145e-05, + "loss": 0.2118, + "step": 4002 + }, + { + "epoch": 0.8014411529223379, + "learning_rate": 1.9347109355200676e-05, + "loss": 0.0099, + "step": 4004 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.9342133370258124e-05, + "loss": 0.0529, + "step": 4006 + }, + { + "epoch": 0.8022417934347478, + "learning_rate": 1.933713913992671e-05, + "loss": 0.0302, + "step": 4008 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9332126673960276e-05, + "loss": 0.0142, + "step": 4010 + }, + { + "epoch": 0.8030424339471577, + "learning_rate": 1.9327095982148255e-05, + "loss": 0.0221, + "step": 4012 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.932204707431572e-05, + "loss": 0.2001, + "step": 4014 + }, + { + "epoch": 0.8038430744595677, + "learning_rate": 1.9316979960323283e-05, + "loss": 0.0417, + "step": 4016 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9311894650067146e-05, + "loss": 0.2539, + "step": 4018 + }, + { + "epoch": 0.8046437149719776, + "learning_rate": 1.9306791153479017e-05, + "loss": 0.0056, + "step": 4020 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9301669480526118e-05, + "loss": 0.0042, + "step": 4022 + }, + { + "epoch": 0.8054443554843875, + "learning_rate": 1.9296529641211226e-05, + "loss": 0.0124, + "step": 4024 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.929137164557252e-05, + "loss": 0.0917, + "step": 4026 + }, + { + "epoch": 0.8062449959967974, + "learning_rate": 1.928619550368371e-05, + "loss": 0.014, + "step": 4028 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9281001225653883e-05, + "loss": 0.1997, + "step": 4030 + }, + { + "epoch": 0.8070456365092074, + "learning_rate": 1.9275788821627607e-05, + "loss": 0.7662, + "step": 4032 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9270558301784808e-05, + "loss": 0.1789, + "step": 4034 + }, + { + "epoch": 0.8078462770216173, + "learning_rate": 1.9265309676340783e-05, + "loss": 0.2444, + "step": 4036 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9260042955546247e-05, + "loss": 0.0058, + "step": 4038 + }, + { + "epoch": 0.8086469175340272, + "learning_rate": 1.9254758149687187e-05, + "loss": 0.6853, + "step": 4040 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9249455269084972e-05, + "loss": 0.0857, + "step": 4042 + }, + { + "epoch": 0.8094475580464372, + "learning_rate": 1.9244134324096216e-05, + "loss": 0.0511, + "step": 4044 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.0875, + "step": 4046 + }, + { + "epoch": 0.8102481985588471, + "learning_rate": 1.9233438282562095e-05, + "loss": 0.0463, + "step": 4048 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9228063206906302e-05, + "loss": 0.208, + "step": 4050 + }, + { + "epoch": 0.811048839071257, + "learning_rate": 1.9222670108643156e-05, + "loss": 0.0203, + "step": 4052 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9217258998305464e-05, + "loss": 0.0176, + "step": 4054 + }, + { + "epoch": 0.8118494795836669, + "learning_rate": 1.9211829886461278e-05, + "loss": 0.7316, + "step": 4056 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9206382783713735e-05, + "loss": 0.1024, + "step": 4058 + }, + { + "epoch": 0.8126501200960768, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.0936, + "step": 4060 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.9195434648097013e-05, + "loss": 0.0763, + "step": 4062 + }, + { + "epoch": 0.8134507606084868, + "learning_rate": 1.918993363660975e-05, + "loss": 0.1285, + "step": 4064 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9184414676983013e-05, + "loss": 0.0198, + "step": 4066 + }, + { + "epoch": 0.8142514011208967, + "learning_rate": 1.9178877779995416e-05, + "loss": 0.0617, + "step": 4068 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.9173322956460678e-05, + "loss": 0.3842, + "step": 4070 + }, + { + "epoch": 0.8150520416333067, + "learning_rate": 1.916775021722745e-05, + "loss": 0.0228, + "step": 4072 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9162159573179446e-05, + "loss": 0.0832, + "step": 4074 + }, + { + "epoch": 0.8158526821457166, + "learning_rate": 1.9156551035235298e-05, + "loss": 0.0332, + "step": 4076 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.915092461434859e-05, + "loss": 0.0511, + "step": 4078 + }, + { + "epoch": 0.8166533226581265, + "learning_rate": 1.9145280321507872e-05, + "loss": 0.1116, + "step": 4080 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9139618167736547e-05, + "loss": 0.0728, + "step": 4082 + }, + { + "epoch": 0.8174539631705364, + "learning_rate": 1.9133938164092942e-05, + "loss": 0.0144, + "step": 4084 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.912824032167022e-05, + "loss": 0.0237, + "step": 4086 + }, + { + "epoch": 0.8182546036829463, + "learning_rate": 1.9122524651596372e-05, + "loss": 0.4968, + "step": 4088 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.911679116503426e-05, + "loss": 0.005, + "step": 4090 + }, + { + "epoch": 0.8190552441953562, + "learning_rate": 1.9111039873181475e-05, + "loss": 0.8599, + "step": 4092 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9105270787270446e-05, + "loss": 0.7507, + "step": 4094 + }, + { + "epoch": 0.8198558847077662, + "learning_rate": 1.9099483918568287e-05, + "loss": 0.0487, + "step": 4096 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.9093679278376913e-05, + "loss": 0.8846, + "step": 4098 + }, + { + "epoch": 0.8206565252201762, + "learning_rate": 1.90878568780329e-05, + "loss": 0.0103, + "step": 4100 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.90820167289075e-05, + "loss": 0.0057, + "step": 4102 + }, + { + "epoch": 0.8214571657325861, + "learning_rate": 1.907615884240668e-05, + "loss": 0.0676, + "step": 4104 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9070283229971003e-05, + "loss": 0.0561, + "step": 4106 + }, + { + "epoch": 0.822257806244996, + "learning_rate": 1.9064389903075683e-05, + "loss": 0.4073, + "step": 4108 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.9058478873230487e-05, + "loss": 0.3225, + "step": 4110 + }, + { + "epoch": 0.8230584467574059, + "learning_rate": 1.905255015197982e-05, + "loss": 0.1903, + "step": 4112 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9046603750902585e-05, + "loss": 0.083, + "step": 4114 + }, + { + "epoch": 0.8238590872698158, + "learning_rate": 1.9040639681612216e-05, + "loss": 0.1232, + "step": 4116 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.9034657955756702e-05, + "loss": 0.0662, + "step": 4118 + }, + { + "epoch": 0.8246597277822257, + "learning_rate": 1.902865858501845e-05, + "loss": 0.0622, + "step": 4120 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9022641581114396e-05, + "loss": 0.1022, + "step": 4122 + }, + { + "epoch": 0.8254603682946358, + "learning_rate": 1.9016606955795843e-05, + "loss": 0.2059, + "step": 4124 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.901055472084858e-05, + "loss": 0.1967, + "step": 4126 + }, + { + "epoch": 0.8262610088070457, + "learning_rate": 1.9004484888092734e-05, + "loss": 0.0219, + "step": 4128 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8998397469382812e-05, + "loss": 0.1896, + "step": 4130 + }, + { + "epoch": 0.8270616493194556, + "learning_rate": 1.8992292476607695e-05, + "loss": 0.0409, + "step": 4132 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.898616992169054e-05, + "loss": 0.086, + "step": 4134 + }, + { + "epoch": 0.8278622898318655, + "learning_rate": 1.8980029816588863e-05, + "loss": 0.109, + "step": 4136 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.89738721732944e-05, + "loss": 0.3486, + "step": 4138 + }, + { + "epoch": 0.8286629303442754, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.0245, + "step": 4140 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8961504320265392e-05, + "loss": 0.1447, + "step": 4142 + }, + { + "epoch": 0.8294635708566853, + "learning_rate": 1.8955294134685528e-05, + "loss": 0.0645, + "step": 4144 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8949066459222224e-05, + "loss": 0.106, + "step": 4146 + }, + { + "epoch": 0.8302642113690952, + "learning_rate": 1.8942821306038227e-05, + "loss": 0.4531, + "step": 4148 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.8936558687330492e-05, + "loss": 0.0161, + "step": 4150 + }, + { + "epoch": 0.8310648518815053, + "learning_rate": 1.893027861533003e-05, + "loss": 0.0828, + "step": 4152 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.8923981102301944e-05, + "loss": 0.3116, + "step": 4154 + }, + { + "epoch": 0.8318654923939152, + "learning_rate": 1.891766616054545e-05, + "loss": 0.0632, + "step": 4156 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8911333802393725e-05, + "loss": 0.0102, + "step": 4158 + }, + { + "epoch": 0.8326661329063251, + "learning_rate": 1.8904984040214043e-05, + "loss": 0.264, + "step": 4160 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8898616886407588e-05, + "loss": 0.2095, + "step": 4162 + }, + { + "epoch": 0.833466773418735, + "learning_rate": 1.8892232353409582e-05, + "loss": 0.0468, + "step": 4164 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8885830453689146e-05, + "loss": 0.0037, + "step": 4166 + }, + { + "epoch": 0.8342674139311449, + "learning_rate": 1.8879411199749306e-05, + "loss": 0.0073, + "step": 4168 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8872974604127038e-05, + "loss": 0.0394, + "step": 4170 + }, + { + "epoch": 0.8350680544435548, + "learning_rate": 1.8866520679393124e-05, + "loss": 0.0097, + "step": 4172 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.8860049438152247e-05, + "loss": 0.3799, + "step": 4174 + }, + { + "epoch": 0.8358686949559647, + "learning_rate": 1.885356089304285e-05, + "loss": 0.001, + "step": 4176 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.8847055056737236e-05, + "loss": 0.1668, + "step": 4178 + }, + { + "epoch": 0.8366693354683747, + "learning_rate": 1.884053194194143e-05, + "loss": 0.0354, + "step": 4180 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.8833991561395194e-05, + "loss": 0.2445, + "step": 4182 + }, + { + "epoch": 0.8374699759807847, + "learning_rate": 1.882743392787207e-05, + "loss": 0.0223, + "step": 4184 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.8820859054179225e-05, + "loss": 0.6012, + "step": 4186 + }, + { + "epoch": 0.8382706164931946, + "learning_rate": 1.881426695315756e-05, + "loss": 0.0379, + "step": 4188 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8807657637681577e-05, + "loss": 0.1409, + "step": 4190 + }, + { + "epoch": 0.8390712570056045, + "learning_rate": 1.8801031120659396e-05, + "loss": 0.5324, + "step": 4192 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8794387415032783e-05, + "loss": 0.0058, + "step": 4194 + }, + { + "epoch": 0.8398718975180144, + "learning_rate": 1.8787726533777003e-05, + "loss": 0.0682, + "step": 4196 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.8781048489900936e-05, + "loss": 0.5677, + "step": 4198 + }, + { + "epoch": 0.8406725380304243, + "learning_rate": 1.877435329644691e-05, + "loss": 0.1233, + "step": 4200 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.876764096649082e-05, + "loss": 0.0016, + "step": 4202 + }, + { + "epoch": 0.8414731785428343, + "learning_rate": 1.8760911513141974e-05, + "loss": 0.0372, + "step": 4204 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8754164949543123e-05, + "loss": 0.2646, + "step": 4206 + }, + { + "epoch": 0.8422738190552442, + "learning_rate": 1.8747401288870482e-05, + "loss": 0.0181, + "step": 4208 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8740620544333604e-05, + "loss": 0.0043, + "step": 4210 + }, + { + "epoch": 0.8430744595676541, + "learning_rate": 1.8733822729175455e-05, + "loss": 0.0905, + "step": 4212 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.872700785667228e-05, + "loss": 0.2858, + "step": 4214 + }, + { + "epoch": 0.8438751000800641, + "learning_rate": 1.8720175940133712e-05, + "loss": 0.0066, + "step": 4216 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8713326992902612e-05, + "loss": 0.0123, + "step": 4218 + }, + { + "epoch": 0.844675740592474, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.832, + "step": 4220 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8699578059900604e-05, + "loss": 0.1862, + "step": 4222 + }, + { + "epoch": 0.8454763811048839, + "learning_rate": 1.8692678100981663e-05, + "loss": 0.0138, + "step": 4224 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.868576116507408e-05, + "loss": 0.383, + "step": 4226 + }, + { + "epoch": 0.8462770216172938, + "learning_rate": 1.8678827265686753e-05, + "loss": 0.0087, + "step": 4228 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8671876416361767e-05, + "loss": 0.0265, + "step": 4230 + }, + { + "epoch": 0.8470776621297038, + "learning_rate": 1.8664908630674264e-05, + "loss": 0.0219, + "step": 4232 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.8657923922232467e-05, + "loss": 0.0293, + "step": 4234 + }, + { + "epoch": 0.8478783026421137, + "learning_rate": 1.86509223046777e-05, + "loss": 0.6814, + "step": 4236 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8643903791684228e-05, + "loss": 0.0607, + "step": 4238 + }, + { + "epoch": 0.8486789431545236, + "learning_rate": 1.8636868396959406e-05, + "loss": 0.0289, + "step": 4240 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8629816134243466e-05, + "loss": 0.0681, + "step": 4242 + }, + { + "epoch": 0.8494795836669335, + "learning_rate": 1.8622747017309676e-05, + "loss": 0.138, + "step": 4244 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8615661059964148e-05, + "loss": 0.0914, + "step": 4246 + }, + { + "epoch": 0.8502802241793435, + "learning_rate": 1.8608558276045898e-05, + "loss": 0.1304, + "step": 4248 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.860143867942685e-05, + "loss": 0.1283, + "step": 4250 + }, + { + "epoch": 0.8510808646917534, + "learning_rate": 1.8594302284011697e-05, + "loss": 0.4595, + "step": 4252 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8587149103738006e-05, + "loss": 0.0023, + "step": 4254 + }, + { + "epoch": 0.8518815052041633, + "learning_rate": 1.8579979152576076e-05, + "loss": 0.0099, + "step": 4256 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8572792444528963e-05, + "loss": 0.0087, + "step": 4258 + }, + { + "epoch": 0.8526821457165733, + "learning_rate": 1.8565588993632498e-05, + "loss": 0.1457, + "step": 4260 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8558368813955136e-05, + "loss": 0.309, + "step": 4262 + }, + { + "epoch": 0.8534827862289832, + "learning_rate": 1.8551131919598084e-05, + "loss": 0.44, + "step": 4264 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.854387832469512e-05, + "loss": 0.1083, + "step": 4266 + }, + { + "epoch": 0.8542834267413931, + "learning_rate": 1.8536608043412702e-05, + "loss": 0.0323, + "step": 4268 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.8529321089949833e-05, + "loss": 0.7393, + "step": 4270 + }, + { + "epoch": 0.855084067253803, + "learning_rate": 1.852201747853807e-05, + "loss": 0.2861, + "step": 4272 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8514697223441565e-05, + "loss": 0.5597, + "step": 4274 + }, + { + "epoch": 0.855884707766213, + "learning_rate": 1.8507360338956896e-05, + "loss": 0.121, + "step": 4276 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.850000683941319e-05, + "loss": 0.0291, + "step": 4278 + }, + { + "epoch": 0.8566853482786229, + "learning_rate": 1.849263673917196e-05, + "loss": 0.0008, + "step": 4280 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.8485250052627205e-05, + "loss": 0.0126, + "step": 4282 + }, + { + "epoch": 0.8574859887910328, + "learning_rate": 1.847784679420527e-05, + "loss": 0.0071, + "step": 4284 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.8470426978364857e-05, + "loss": 1.0178, + "step": 4286 + }, + { + "epoch": 0.8582866293034428, + "learning_rate": 1.846299061959706e-05, + "loss": 0.4273, + "step": 4288 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.845553773242522e-05, + "loss": 0.0026, + "step": 4290 + }, + { + "epoch": 0.8590872698158527, + "learning_rate": 1.8448068331405018e-05, + "loss": 0.014, + "step": 4292 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8440582431124322e-05, + "loss": 0.5494, + "step": 4294 + }, + { + "epoch": 0.8598879103282626, + "learning_rate": 1.8433080046203293e-05, + "loss": 0.0264, + "step": 4296 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.842556119129423e-05, + "loss": 0.0015, + "step": 4298 + }, + { + "epoch": 0.8606885508406725, + "learning_rate": 1.841802588108161e-05, + "loss": 0.1826, + "step": 4300 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.841047413028209e-05, + "loss": 0.2028, + "step": 4302 + }, + { + "epoch": 0.8614891913530824, + "learning_rate": 1.8402905953644356e-05, + "loss": 0.0099, + "step": 4304 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.8395321365949273e-05, + "loss": 0.0238, + "step": 4306 + }, + { + "epoch": 0.8622898318654924, + "learning_rate": 1.838772038200968e-05, + "loss": 0.1592, + "step": 4308 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.838010301667044e-05, + "loss": 0.2855, + "step": 4310 + }, + { + "epoch": 0.8630904723779024, + "learning_rate": 1.837246928480848e-05, + "loss": 0.0182, + "step": 4312 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8364819201332596e-05, + "loss": 0.046, + "step": 4314 + }, + { + "epoch": 0.8638911128903123, + "learning_rate": 1.8357152781183613e-05, + "loss": 0.0267, + "step": 4316 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.834947003933417e-05, + "loss": 0.0335, + "step": 4318 + }, + { + "epoch": 0.8646917534027222, + "learning_rate": 1.8341770990788874e-05, + "loss": 0.0042, + "step": 4320 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8334055650584107e-05, + "loss": 0.2258, + "step": 4322 + }, + { + "epoch": 0.8654923939151321, + "learning_rate": 1.8326324033788087e-05, + "loss": 0.0474, + "step": 4324 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.8318576155500855e-05, + "loss": 0.3548, + "step": 4326 + }, + { + "epoch": 0.866293034427542, + "learning_rate": 1.831081203085415e-05, + "loss": 0.0714, + "step": 4328 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.830303167501152e-05, + "loss": 0.1452, + "step": 4330 + }, + { + "epoch": 0.8670936749399519, + "learning_rate": 1.8295235103168128e-05, + "loss": 0.0457, + "step": 4332 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8287422330550885e-05, + "loss": 0.0612, + "step": 4334 + }, + { + "epoch": 0.8678943154523618, + "learning_rate": 1.8279593372418284e-05, + "loss": 0.0141, + "step": 4336 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.827174824406043e-05, + "loss": 0.285, + "step": 4338 + }, + { + "epoch": 0.8686949559647719, + "learning_rate": 1.8263886960799072e-05, + "loss": 0.2448, + "step": 4340 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8256009537987424e-05, + "loss": 0.2898, + "step": 4342 + }, + { + "epoch": 0.8694955964771818, + "learning_rate": 1.8248115991010303e-05, + "loss": 0.0005, + "step": 4344 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8240206335283943e-05, + "loss": 0.0853, + "step": 4346 + }, + { + "epoch": 0.8702962369895917, + "learning_rate": 1.8232280586256104e-05, + "loss": 0.0463, + "step": 4348 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8224338759405934e-05, + "loss": 0.037, + "step": 4350 + }, + { + "epoch": 0.8710968775020016, + "learning_rate": 1.8216380870243963e-05, + "loss": 0.8524, + "step": 4352 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820840693431217e-05, + "loss": 0.0434, + "step": 4354 + }, + { + "epoch": 0.8718975180144115, + "learning_rate": 1.820041696718378e-05, + "loss": 0.0698, + "step": 4356 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8192410984463416e-05, + "loss": 0.0181, + "step": 4358 + }, + { + "epoch": 0.8726981585268214, + "learning_rate": 1.8184389001786912e-05, + "loss": 0.0167, + "step": 4360 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8176351034821352e-05, + "loss": 0.645, + "step": 4362 + }, + { + "epoch": 0.8734987990392313, + "learning_rate": 1.8168297099265108e-05, + "loss": 0.0178, + "step": 4364 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.8160227210847642e-05, + "loss": 0.1395, + "step": 4366 + }, + { + "epoch": 0.8742994395516414, + "learning_rate": 1.815214138532966e-05, + "loss": 0.0002, + "step": 4368 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8144039638502927e-05, + "loss": 0.0791, + "step": 4370 + }, + { + "epoch": 0.8751000800640513, + "learning_rate": 1.8135921986190358e-05, + "loss": 0.1451, + "step": 4372 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8127788444245884e-05, + "loss": 0.0422, + "step": 4374 + }, + { + "epoch": 0.8759007205764612, + "learning_rate": 1.8119639028554475e-05, + "loss": 0.0607, + "step": 4376 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8111473755032152e-05, + "loss": 0.174, + "step": 4378 + }, + { + "epoch": 0.8767013610888711, + "learning_rate": 1.8103292639625835e-05, + "loss": 0.1196, + "step": 4380 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.8095095698313456e-05, + "loss": 0.7135, + "step": 4382 + }, + { + "epoch": 0.877502001601281, + "learning_rate": 1.808688294710378e-05, + "loss": 0.0365, + "step": 4384 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807865440203653e-05, + "loss": 0.1132, + "step": 4386 + }, + { + "epoch": 0.8783026421136909, + "learning_rate": 1.807041007918221e-05, + "loss": 0.2144, + "step": 4388 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.806214999464214e-05, + "loss": 0.1838, + "step": 4390 + }, + { + "epoch": 0.8791032826261009, + "learning_rate": 1.805387416454849e-05, + "loss": 0.3317, + "step": 4392 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8045582605064087e-05, + "loss": 0.3998, + "step": 4394 + }, + { + "epoch": 0.8799039231385108, + "learning_rate": 1.8037275332382575e-05, + "loss": 0.1097, + "step": 4396 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802895236272819e-05, + "loss": 0.0047, + "step": 4398 + }, + { + "epoch": 0.8807045636509208, + "learning_rate": 1.802061371235592e-05, + "loss": 0.0888, + "step": 4400 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.80122593975513e-05, + "loss": 0.0025, + "step": 4402 + }, + { + "epoch": 0.8815052041633307, + "learning_rate": 1.8003889434630476e-05, + "loss": 0.1081, + "step": 4404 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7995503839940204e-05, + "loss": 0.077, + "step": 4406 + }, + { + "epoch": 0.8823058446757406, + "learning_rate": 1.7987102629857692e-05, + "loss": 0.1649, + "step": 4408 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7978685820790725e-05, + "loss": 0.0458, + "step": 4410 + }, + { + "epoch": 0.8831064851881505, + "learning_rate": 1.7970253429177494e-05, + "loss": 0.6462, + "step": 4412 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.796180547148662e-05, + "loss": 0.0009, + "step": 4414 + }, + { + "epoch": 0.8839071257005604, + "learning_rate": 1.7953341964217196e-05, + "loss": 0.0356, + "step": 4416 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7944862923898586e-05, + "loss": 0.0437, + "step": 4418 + }, + { + "epoch": 0.8847077662129704, + "learning_rate": 1.7936368367090583e-05, + "loss": 0.2153, + "step": 4420 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7927858310383196e-05, + "loss": 0.4518, + "step": 4422 + }, + { + "epoch": 0.8855084067253803, + "learning_rate": 1.7919332770396798e-05, + "loss": 0.0071, + "step": 4424 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7910791763781928e-05, + "loss": 0.0115, + "step": 4426 + }, + { + "epoch": 0.8863090472377902, + "learning_rate": 1.7902235307219336e-05, + "loss": 0.0043, + "step": 4428 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.789366341742001e-05, + "loss": 0.0052, + "step": 4430 + }, + { + "epoch": 0.8871096877502002, + "learning_rate": 1.7885076111125e-05, + "loss": 0.0266, + "step": 4432 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7876473405105535e-05, + "loss": 0.2389, + "step": 4434 + }, + { + "epoch": 0.8879103282626101, + "learning_rate": 1.7867855316162846e-05, + "loss": 0.6174, + "step": 4436 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.785922186112829e-05, + "loss": 0.1741, + "step": 4438 + }, + { + "epoch": 0.88871096877502, + "learning_rate": 1.7850573056863173e-05, + "loss": 0.6703, + "step": 4440 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.7841908920258774e-05, + "loss": 0.8508, + "step": 4442 + }, + { + "epoch": 0.8895116092874299, + "learning_rate": 1.783322946823638e-05, + "loss": 0.5366, + "step": 4444 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.782453471774711e-05, + "loss": 0.0156, + "step": 4446 + }, + { + "epoch": 0.8903122497998399, + "learning_rate": 1.7815824685772042e-05, + "loss": 0.4487, + "step": 4448 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.7807099389322013e-05, + "loss": 0.0424, + "step": 4450 + }, + { + "epoch": 0.8911128903122498, + "learning_rate": 1.779835884543776e-05, + "loss": 0.1621, + "step": 4452 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7789603071189733e-05, + "loss": 0.0488, + "step": 4454 + }, + { + "epoch": 0.8919135308246597, + "learning_rate": 1.7780832083678122e-05, + "loss": 0.0026, + "step": 4456 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.7772045900032912e-05, + "loss": 0.0407, + "step": 4458 + }, + { + "epoch": 0.8927141713370697, + "learning_rate": 1.776324453741365e-05, + "loss": 0.0363, + "step": 4460 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.7754428013009644e-05, + "loss": 0.0125, + "step": 4462 + }, + { + "epoch": 0.8935148118494796, + "learning_rate": 1.774559634403971e-05, + "loss": 0.3954, + "step": 4464 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7736749547752327e-05, + "loss": 0.3634, + "step": 4466 + }, + { + "epoch": 0.8943154523618895, + "learning_rate": 1.7727887641425465e-05, + "loss": 0.6503, + "step": 4468 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7719010642366597e-05, + "loss": 0.2508, + "step": 4470 + }, + { + "epoch": 0.8951160928742994, + "learning_rate": 1.7710118567912732e-05, + "loss": 0.0064, + "step": 4472 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.770121143543025e-05, + "loss": 0.0049, + "step": 4474 + }, + { + "epoch": 0.8959167333867094, + "learning_rate": 1.7692289262315008e-05, + "loss": 0.4575, + "step": 4476 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7683352065992174e-05, + "loss": 0.3151, + "step": 4478 + }, + { + "epoch": 0.8967173738991193, + "learning_rate": 1.7674399863916298e-05, + "loss": 0.8447, + "step": 4480 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.7665432673571238e-05, + "loss": 0.4893, + "step": 4482 + }, + { + "epoch": 0.8975180144115292, + "learning_rate": 1.765645051247007e-05, + "loss": 0.1106, + "step": 4484 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7647453398155204e-05, + "loss": 0.0603, + "step": 4486 + }, + { + "epoch": 0.8983186549239391, + "learning_rate": 1.7638441348198144e-05, + "loss": 0.1166, + "step": 4488 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.7629414380199672e-05, + "loss": 0.011, + "step": 4490 + }, + { + "epoch": 0.899119295436349, + "learning_rate": 1.762037251178961e-05, + "loss": 0.0745, + "step": 4492 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7611315760626943e-05, + "loss": 0.0202, + "step": 4494 + }, + { + "epoch": 0.899919935948759, + "learning_rate": 1.7602244144399713e-05, + "loss": 0.1178, + "step": 4496 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7593157680824943e-05, + "loss": 0.1448, + "step": 4498 + }, + { + "epoch": 0.900720576461169, + "learning_rate": 1.7584056387648738e-05, + "loss": 0.0915, + "step": 4500 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.757494028264608e-05, + "loss": 0.2023, + "step": 4502 + }, + { + "epoch": 0.9015212169735789, + "learning_rate": 1.7565809383620966e-05, + "loss": 0.3148, + "step": 4504 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7556663708406203e-05, + "loss": 0.2471, + "step": 4506 + }, + { + "epoch": 0.9023218574859888, + "learning_rate": 1.7547503274863502e-05, + "loss": 0.0228, + "step": 4508 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7538328100883404e-05, + "loss": 0.0069, + "step": 4510 + }, + { + "epoch": 0.9031224979983987, + "learning_rate": 1.7529138204385186e-05, + "loss": 0.1112, + "step": 4512 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7519933603316962e-05, + "loss": 0.0859, + "step": 4514 + }, + { + "epoch": 0.9039231385108086, + "learning_rate": 1.7510714315655467e-05, + "loss": 0.0017, + "step": 4516 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.750148035940622e-05, + "loss": 0.0127, + "step": 4518 + }, + { + "epoch": 0.9047237790232185, + "learning_rate": 1.7492231752603305e-05, + "loss": 0.2075, + "step": 4520 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7482968513309458e-05, + "loss": 0.0021, + "step": 4522 + }, + { + "epoch": 0.9055244195356285, + "learning_rate": 1.7473690659616e-05, + "loss": 0.0979, + "step": 4524 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7464398209642744e-05, + "loss": 0.0251, + "step": 4526 + }, + { + "epoch": 0.9063250600480385, + "learning_rate": 1.7455091181538094e-05, + "loss": 0.0189, + "step": 4528 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.7445769593478842e-05, + "loss": 0.0713, + "step": 4530 + }, + { + "epoch": 0.9071257005604484, + "learning_rate": 1.743643346367027e-05, + "loss": 0.0574, + "step": 4532 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.7427082810346024e-05, + "loss": 0.0129, + "step": 4534 + }, + { + "epoch": 0.9079263410728583, + "learning_rate": 1.741771765176815e-05, + "loss": 0.0363, + "step": 4536 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.740833800622701e-05, + "loss": 0.108, + "step": 4538 + }, + { + "epoch": 0.9087269815852682, + "learning_rate": 1.739894389204122e-05, + "loss": 0.1981, + "step": 4540 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.738953532755774e-05, + "loss": 0.1389, + "step": 4542 + }, + { + "epoch": 0.9095276220976781, + "learning_rate": 1.7380112331151657e-05, + "loss": 0.0033, + "step": 4544 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7370674921226306e-05, + "loss": 0.1383, + "step": 4546 + }, + { + "epoch": 0.910328262610088, + "learning_rate": 1.7361223116213146e-05, + "loss": 0.064, + "step": 4548 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7351756934571764e-05, + "loss": 0.0288, + "step": 4550 + }, + { + "epoch": 0.911128903122498, + "learning_rate": 1.7342276394789825e-05, + "loss": 0.1466, + "step": 4552 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.7332781515382996e-05, + "loss": 0.0741, + "step": 4554 + }, + { + "epoch": 0.911929543634908, + "learning_rate": 1.732327231489503e-05, + "loss": 0.1906, + "step": 4556 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7313748811897564e-05, + "loss": 0.0859, + "step": 4558 + }, + { + "epoch": 0.9127301841473179, + "learning_rate": 1.7304211024990216e-05, + "loss": 0.0742, + "step": 4560 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.7294658972800495e-05, + "loss": 0.0808, + "step": 4562 + }, + { + "epoch": 0.9135308246597278, + "learning_rate": 1.728509267398376e-05, + "loss": 0.1261, + "step": 4564 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.727551214722322e-05, + "loss": 0.4533, + "step": 4566 + }, + { + "epoch": 0.9143314651721377, + "learning_rate": 1.7265917411229803e-05, + "loss": 0.3084, + "step": 4568 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.72563084847423e-05, + "loss": 0.4168, + "step": 4570 + }, + { + "epoch": 0.9151321056845476, + "learning_rate": 1.7246685386527105e-05, + "loss": 0.0292, + "step": 4572 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.723704813537835e-05, + "loss": 0.0722, + "step": 4574 + }, + { + "epoch": 0.9159327461969575, + "learning_rate": 1.7227396750117802e-05, + "loss": 0.0364, + "step": 4576 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7217731249594817e-05, + "loss": 0.0966, + "step": 4578 + }, + { + "epoch": 0.9167333867093675, + "learning_rate": 1.7208051652686348e-05, + "loss": 0.0772, + "step": 4580 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.7198357978296827e-05, + "loss": 0.1232, + "step": 4582 + }, + { + "epoch": 0.9175340272217775, + "learning_rate": 1.718865024535822e-05, + "loss": 0.0018, + "step": 4584 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.717892847282995e-05, + "loss": 0.0311, + "step": 4586 + }, + { + "epoch": 0.9183346677341874, + "learning_rate": 1.716919267969884e-05, + "loss": 0.0232, + "step": 4588 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.715944288497911e-05, + "loss": 0.0593, + "step": 4590 + }, + { + "epoch": 0.9191353082465973, + "learning_rate": 1.7149679107712317e-05, + "loss": 0.0691, + "step": 4592 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.713990136696734e-05, + "loss": 0.319, + "step": 4594 + }, + { + "epoch": 0.9199359487590072, + "learning_rate": 1.7130109681840298e-05, + "loss": 0.0095, + "step": 4596 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.7120304071454578e-05, + "loss": 0.0061, + "step": 4598 + }, + { + "epoch": 0.9207365892714171, + "learning_rate": 1.711048455496075e-05, + "loss": 0.1566, + "step": 4600 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7100651151536532e-05, + "loss": 0.051, + "step": 4602 + }, + { + "epoch": 0.921537229783827, + "learning_rate": 1.7090803880386784e-05, + "loss": 0.0418, + "step": 4604 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.708094276074344e-05, + "loss": 0.0004, + "step": 4606 + }, + { + "epoch": 0.922337870296237, + "learning_rate": 1.7071067811865474e-05, + "loss": 0.5623, + "step": 4608 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.7061179053038894e-05, + "loss": 0.4271, + "step": 4610 + }, + { + "epoch": 0.923138510808647, + "learning_rate": 1.705127650357663e-05, + "loss": 0.0304, + "step": 4612 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.704136018281859e-05, + "loss": 0.3872, + "step": 4614 + }, + { + "epoch": 0.9239391513210569, + "learning_rate": 1.7031430110131566e-05, + "loss": 0.0389, + "step": 4616 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.7021486304909202e-05, + "loss": 0.1278, + "step": 4618 + }, + { + "epoch": 0.9247397918334668, + "learning_rate": 1.701152878657197e-05, + "loss": 0.0363, + "step": 4620 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.700155757456711e-05, + "loss": 0.4973, + "step": 4622 + }, + { + "epoch": 0.9255404323458767, + "learning_rate": 1.699157268836863e-05, + "loss": 0.0269, + "step": 4624 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.6981574147477214e-05, + "loss": 0.0031, + "step": 4626 + }, + { + "epoch": 0.9263410728582866, + "learning_rate": 1.697156197142023e-05, + "loss": 0.1844, + "step": 4628 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.696153617975168e-05, + "loss": 0.0097, + "step": 4630 + }, + { + "epoch": 0.9271417133706965, + "learning_rate": 1.6951496792052148e-05, + "loss": 0.3659, + "step": 4632 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.694144382792878e-05, + "loss": 0.226, + "step": 4634 + }, + { + "epoch": 0.9279423538831065, + "learning_rate": 1.6931377307015236e-05, + "loss": 0.0247, + "step": 4636 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6921297248971652e-05, + "loss": 0.1635, + "step": 4638 + }, + { + "epoch": 0.9287429943955164, + "learning_rate": 1.6911203673484583e-05, + "loss": 0.1907, + "step": 4640 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.690109660026701e-05, + "loss": 0.161, + "step": 4642 + }, + { + "epoch": 0.9295436349079264, + "learning_rate": 1.6890976049058267e-05, + "loss": 0.0338, + "step": 4644 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.688084203962401e-05, + "loss": 0.0256, + "step": 4646 + }, + { + "epoch": 0.9303442754203363, + "learning_rate": 1.687069459175619e-05, + "loss": 0.0607, + "step": 4648 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6860533725272953e-05, + "loss": 0.0146, + "step": 4650 + }, + { + "epoch": 0.9311449159327462, + "learning_rate": 1.6850359460018744e-05, + "loss": 0.0035, + "step": 4652 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.6840171815864085e-05, + "loss": 0.0369, + "step": 4654 + }, + { + "epoch": 0.9319455564451561, + "learning_rate": 1.682997081270568e-05, + "loss": 0.2452, + "step": 4656 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.681975647046631e-05, + "loss": 0.0881, + "step": 4658 + }, + { + "epoch": 0.932746196957566, + "learning_rate": 1.6809528809094805e-05, + "loss": 0.0985, + "step": 4660 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6799287848566024e-05, + "loss": 0.7647, + "step": 4662 + }, + { + "epoch": 0.933546837469976, + "learning_rate": 1.6789033608880742e-05, + "loss": 0.0535, + "step": 4664 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.6778766110065765e-05, + "loss": 0.026, + "step": 4666 + }, + { + "epoch": 0.9343474779823859, + "learning_rate": 1.67684853721737e-05, + "loss": 0.5852, + "step": 4668 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6758191415283066e-05, + "loss": 0.0257, + "step": 4670 + }, + { + "epoch": 0.9351481184947958, + "learning_rate": 1.6747884259498185e-05, + "loss": 0.0145, + "step": 4672 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.673756392494916e-05, + "loss": 0.1984, + "step": 4674 + }, + { + "epoch": 0.9359487590072058, + "learning_rate": 1.6727230431791826e-05, + "loss": 0.5905, + "step": 4676 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.671688380020769e-05, + "loss": 0.0481, + "step": 4678 + }, + { + "epoch": 0.9367493995196157, + "learning_rate": 1.6706524050404006e-05, + "loss": 0.0443, + "step": 4680 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6696151202613537e-05, + "loss": 0.2201, + "step": 4682 + }, + { + "epoch": 0.9375500400320256, + "learning_rate": 1.6685765277094702e-05, + "loss": 0.5024, + "step": 4684 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6675366294131432e-05, + "loss": 0.0266, + "step": 4686 + }, + { + "epoch": 0.9383506805444356, + "learning_rate": 1.6664954274033175e-05, + "loss": 0.0042, + "step": 4688 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.6654529237134833e-05, + "loss": 0.6745, + "step": 4690 + }, + { + "epoch": 0.9391513210568455, + "learning_rate": 1.66440912037967e-05, + "loss": 0.0101, + "step": 4692 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.663364019440453e-05, + "loss": 0.0376, + "step": 4694 + }, + { + "epoch": 0.9399519615692554, + "learning_rate": 1.662317622936933e-05, + "loss": 0.0003, + "step": 4696 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6612699329127467e-05, + "loss": 0.4053, + "step": 4698 + }, + { + "epoch": 0.9407526020816653, + "learning_rate": 1.6602209514140562e-05, + "loss": 0.2188, + "step": 4700 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6591706804895415e-05, + "loss": 0.0749, + "step": 4702 + }, + { + "epoch": 0.9415532425940752, + "learning_rate": 1.6581191221904098e-05, + "loss": 0.0826, + "step": 4704 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6570662785703716e-05, + "loss": 0.1388, + "step": 4706 + }, + { + "epoch": 0.9423538831064852, + "learning_rate": 1.6560121516856592e-05, + "loss": 0.2987, + "step": 4708 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.654956743595001e-05, + "loss": 0.0902, + "step": 4710 + }, + { + "epoch": 0.9431545236188951, + "learning_rate": 1.6539000563596328e-05, + "loss": 0.1025, + "step": 4712 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.6528420920432893e-05, + "loss": 0.2205, + "step": 4714 + }, + { + "epoch": 0.9439551641313051, + "learning_rate": 1.651782852712194e-05, + "loss": 0.0081, + "step": 4716 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6507223404350686e-05, + "loss": 0.161, + "step": 4718 + }, + { + "epoch": 0.944755804643715, + "learning_rate": 1.6496605572831127e-05, + "loss": 0.0141, + "step": 4720 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.648597505330016e-05, + "loss": 0.02, + "step": 4722 + }, + { + "epoch": 0.9455564451561249, + "learning_rate": 1.6475331866519387e-05, + "loss": 0.0408, + "step": 4724 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6464676033275187e-05, + "loss": 0.002, + "step": 4726 + }, + { + "epoch": 0.9463570856685348, + "learning_rate": 1.6454007574378657e-05, + "loss": 0.0318, + "step": 4728 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.644332651066548e-05, + "loss": 0.641, + "step": 4730 + }, + { + "epoch": 0.9471577261809447, + "learning_rate": 1.6432632862996062e-05, + "loss": 0.0143, + "step": 4732 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6421926652255275e-05, + "loss": 0.0665, + "step": 4734 + }, + { + "epoch": 0.9479583666933546, + "learning_rate": 1.6411207899352633e-05, + "loss": 0.1474, + "step": 4736 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6400476625222057e-05, + "loss": 0.0244, + "step": 4738 + }, + { + "epoch": 0.9487590072057646, + "learning_rate": 1.6389732850821964e-05, + "loss": 0.0217, + "step": 4740 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6378976597135193e-05, + "loss": 0.3912, + "step": 4742 + }, + { + "epoch": 0.9495596477181746, + "learning_rate": 1.6368207885168904e-05, + "loss": 0.5459, + "step": 4744 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.635742673595468e-05, + "loss": 0.0611, + "step": 4746 + }, + { + "epoch": 0.9503602882305845, + "learning_rate": 1.6346633170548275e-05, + "loss": 0.0711, + "step": 4748 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6335827210029823e-05, + "loss": 0.0078, + "step": 4750 + }, + { + "epoch": 0.9511609287429944, + "learning_rate": 1.6325008875503563e-05, + "loss": 0.3129, + "step": 4752 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6314178188097917e-05, + "loss": 0.6196, + "step": 4754 + }, + { + "epoch": 0.9519615692554043, + "learning_rate": 1.6303335168965495e-05, + "loss": 0.3044, + "step": 4756 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.6292479839282904e-05, + "loss": 0.0344, + "step": 4758 + }, + { + "epoch": 0.9527622097678142, + "learning_rate": 1.628161222025089e-05, + "loss": 0.1647, + "step": 4760 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.627073233309409e-05, + "loss": 0.2549, + "step": 4762 + }, + { + "epoch": 0.9535628502802241, + "learning_rate": 1.625984019906122e-05, + "loss": 0.0008, + "step": 4764 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.624893583942482e-05, + "loss": 0.0005, + "step": 4766 + }, + { + "epoch": 0.9543634907926342, + "learning_rate": 1.623801927548132e-05, + "loss": 0.1165, + "step": 4768 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6227090528551058e-05, + "loss": 0.2526, + "step": 4770 + }, + { + "epoch": 0.9551641313050441, + "learning_rate": 1.6216149619978057e-05, + "loss": 0.0205, + "step": 4772 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6205196571130204e-05, + "loss": 0.0128, + "step": 4774 + }, + { + "epoch": 0.955964771817454, + "learning_rate": 1.6194231403398987e-05, + "loss": 0.5051, + "step": 4776 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.618325413819967e-05, + "loss": 0.007, + "step": 4778 + }, + { + "epoch": 0.9567654123298639, + "learning_rate": 1.6172264796971063e-05, + "loss": 0.0147, + "step": 4780 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6161263401175555e-05, + "loss": 0.3514, + "step": 4782 + }, + { + "epoch": 0.9575660528422738, + "learning_rate": 1.6150249972299173e-05, + "loss": 0.1475, + "step": 4784 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.613922453185133e-05, + "loss": 0.0236, + "step": 4786 + }, + { + "epoch": 0.9583666933546837, + "learning_rate": 1.612818710136499e-05, + "loss": 0.2684, + "step": 4788 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6117137702396454e-05, + "loss": 0.4085, + "step": 4790 + }, + { + "epoch": 0.9591673338670936, + "learning_rate": 1.6106076356525484e-05, + "loss": 0.0641, + "step": 4792 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6095003085355103e-05, + "loss": 0.0229, + "step": 4794 + }, + { + "epoch": 0.9599679743795037, + "learning_rate": 1.6083917910511623e-05, + "loss": 0.0324, + "step": 4796 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6072820853644688e-05, + "loss": 0.0663, + "step": 4798 + }, + { + "epoch": 0.9607686148919136, + "learning_rate": 1.6061711936427028e-05, + "loss": 0.1194, + "step": 4800 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.6050591180554658e-05, + "loss": 0.0705, + "step": 4802 + }, + { + "epoch": 0.9615692554043235, + "learning_rate": 1.60394586077466e-05, + "loss": 0.0039, + "step": 4804 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6028314239745068e-05, + "loss": 0.0016, + "step": 4806 + }, + { + "epoch": 0.9623698959167334, + "learning_rate": 1.6017158098315224e-05, + "loss": 0.3333, + "step": 4808 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.6005990205245226e-05, + "loss": 0.0645, + "step": 4810 + }, + { + "epoch": 0.9631705364291433, + "learning_rate": 1.5994810582346266e-05, + "loss": 0.0618, + "step": 4812 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5983619251452334e-05, + "loss": 0.0912, + "step": 4814 + }, + { + "epoch": 0.9639711769415532, + "learning_rate": 1.5972416234420404e-05, + "loss": 1.032, + "step": 4816 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.596120155313017e-05, + "loss": 0.0508, + "step": 4818 + }, + { + "epoch": 0.9647718174539631, + "learning_rate": 1.594997522948413e-05, + "loss": 0.0337, + "step": 4820 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.593873728540759e-05, + "loss": 0.2075, + "step": 4822 + }, + { + "epoch": 0.9655724579663731, + "learning_rate": 1.592748774284844e-05, + "loss": 0.1392, + "step": 4824 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5916226623777346e-05, + "loss": 0.0479, + "step": 4826 + }, + { + "epoch": 0.966373098478783, + "learning_rate": 1.5904953950187448e-05, + "loss": 0.2172, + "step": 4828 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.5893669744094587e-05, + "loss": 0.0888, + "step": 4830 + }, + { + "epoch": 0.967173738991193, + "learning_rate": 1.588237402753703e-05, + "loss": 0.1391, + "step": 4832 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5871066822575526e-05, + "loss": 0.0049, + "step": 4834 + }, + { + "epoch": 0.9679743795036029, + "learning_rate": 1.5859748151293354e-05, + "loss": 0.0797, + "step": 4836 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5848418035796064e-05, + "loss": 0.1279, + "step": 4838 + }, + { + "epoch": 0.9687750200160128, + "learning_rate": 1.5837076498211673e-05, + "loss": 0.063, + "step": 4840 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.5825723560690396e-05, + "loss": 0.0048, + "step": 4842 + }, + { + "epoch": 0.9695756605284227, + "learning_rate": 1.581435924540482e-05, + "loss": 0.0363, + "step": 4844 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.580298357454967e-05, + "loss": 0.1297, + "step": 4846 + }, + { + "epoch": 0.9703763010408326, + "learning_rate": 1.579159657034185e-05, + "loss": 0.0172, + "step": 4848 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5780198255020485e-05, + "loss": 0.0203, + "step": 4850 + }, + { + "epoch": 0.9711769415532426, + "learning_rate": 1.5768788650846674e-05, + "loss": 0.2499, + "step": 4852 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.5757367780103672e-05, + "loss": 0.0279, + "step": 4854 + }, + { + "epoch": 0.9719775820656525, + "learning_rate": 1.574593566509664e-05, + "loss": 0.0165, + "step": 4856 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5734492328152796e-05, + "loss": 1.0499, + "step": 4858 + }, + { + "epoch": 0.9727782225780625, + "learning_rate": 1.5723037791621203e-05, + "loss": 0.0406, + "step": 4860 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5711572077872784e-05, + "loss": 0.5883, + "step": 4862 + }, + { + "epoch": 0.9735788630904724, + "learning_rate": 1.5700095209300386e-05, + "loss": 0.5494, + "step": 4864 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.568860720831852e-05, + "loss": 0.0975, + "step": 4866 + }, + { + "epoch": 0.9743795036028823, + "learning_rate": 1.5677108097363565e-05, + "loss": 0.3404, + "step": 4868 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5665597898893508e-05, + "loss": 0.0578, + "step": 4870 + }, + { + "epoch": 0.9751801441152922, + "learning_rate": 1.5654076635387976e-05, + "loss": 0.0768, + "step": 4872 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5642544329348316e-05, + "loss": 0.0874, + "step": 4874 + }, + { + "epoch": 0.9759807846277022, + "learning_rate": 1.5631001003297302e-05, + "loss": 0.0643, + "step": 4876 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.5619446679779367e-05, + "loss": 0.167, + "step": 4878 + }, + { + "epoch": 0.9767814251401121, + "learning_rate": 1.560788138136029e-05, + "loss": 0.6188, + "step": 4880 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5596305130627414e-05, + "loss": 0.3616, + "step": 4882 + }, + { + "epoch": 0.977582065652522, + "learning_rate": 1.5584717950189373e-05, + "loss": 0.0884, + "step": 4884 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5573119862676155e-05, + "loss": 0.0462, + "step": 4886 + }, + { + "epoch": 0.978382706164932, + "learning_rate": 1.5561510890739137e-05, + "loss": 0.2085, + "step": 4888 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.554989105705083e-05, + "loss": 0.6744, + "step": 4890 + }, + { + "epoch": 0.9791833466773419, + "learning_rate": 1.5538260384305083e-05, + "loss": 0.0331, + "step": 4892 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5526618895216786e-05, + "loss": 0.0593, + "step": 4894 + }, + { + "epoch": 0.9799839871897518, + "learning_rate": 1.5514966612522088e-05, + "loss": 0.1083, + "step": 4896 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5503303558978112e-05, + "loss": 0.0498, + "step": 4898 + }, + { + "epoch": 0.9807846277021617, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.3636, + "step": 4900 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.547994523047609e-05, + "loss": 0.048, + "step": 4902 + }, + { + "epoch": 0.9815852682145717, + "learning_rate": 1.546825000113736e-05, + "loss": 0.1679, + "step": 4904 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.545654409218794e-05, + "loss": 0.0022, + "step": 4906 + }, + { + "epoch": 0.9823859087269816, + "learning_rate": 1.544482752648966e-05, + "loss": 0.0209, + "step": 4908 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5433100326925298e-05, + "loss": 0.005, + "step": 4910 + }, + { + "epoch": 0.9831865492393915, + "learning_rate": 1.5421362516398285e-05, + "loss": 0.4835, + "step": 4912 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.5409614117832797e-05, + "loss": 0.0608, + "step": 4914 + }, + { + "epoch": 0.9839871897518014, + "learning_rate": 1.539785515417377e-05, + "loss": 0.073, + "step": 4916 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.538608564838665e-05, + "loss": 0.022, + "step": 4918 + }, + { + "epoch": 0.9847878302642114, + "learning_rate": 1.5374305623457605e-05, + "loss": 0.0577, + "step": 4920 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5362515102393244e-05, + "loss": 0.1315, + "step": 4922 + }, + { + "epoch": 0.9855884707766213, + "learning_rate": 1.5350714108220677e-05, + "loss": 0.043, + "step": 4924 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.5338902663987564e-05, + "loss": 0.1197, + "step": 4926 + }, + { + "epoch": 0.9863891112890312, + "learning_rate": 1.532708079276186e-05, + "loss": 0.0243, + "step": 4928 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.531524851763198e-05, + "loss": 0.3316, + "step": 4930 + }, + { + "epoch": 0.9871897518014412, + "learning_rate": 1.5303405861706567e-05, + "loss": 0.0711, + "step": 4932 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.529155284811464e-05, + "loss": 0.0271, + "step": 4934 + }, + { + "epoch": 0.9879903923138511, + "learning_rate": 1.5279689500005353e-05, + "loss": 0.2095, + "step": 4936 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5267815840548067e-05, + "loss": 0.1111, + "step": 4938 + }, + { + "epoch": 0.988791032826261, + "learning_rate": 1.5255931892932344e-05, + "loss": 0.431, + "step": 4940 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5244037680367739e-05, + "loss": 0.017, + "step": 4942 + }, + { + "epoch": 0.9895916733386709, + "learning_rate": 1.5232133226083962e-05, + "loss": 0.0678, + "step": 4944 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.522021855333061e-05, + "loss": 0.5357, + "step": 4946 + }, + { + "epoch": 0.9903923138510808, + "learning_rate": 1.5208293685377362e-05, + "loss": 0.1738, + "step": 4948 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.519635864551371e-05, + "loss": 0.0039, + "step": 4950 + }, + { + "epoch": 0.9911929543634908, + "learning_rate": 1.5184413457049014e-05, + "loss": 0.2504, + "step": 4952 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5172458143312548e-05, + "loss": 0.0179, + "step": 4954 + }, + { + "epoch": 0.9919935948759008, + "learning_rate": 1.5160492727653238e-05, + "loss": 0.2076, + "step": 4956 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5148517233439858e-05, + "loss": 0.0743, + "step": 4958 + }, + { + "epoch": 0.9927942353883107, + "learning_rate": 1.5136531684060753e-05, + "loss": 0.1117, + "step": 4960 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.512453610292402e-05, + "loss": 0.079, + "step": 4962 + }, + { + "epoch": 0.9935948759007206, + "learning_rate": 1.5112530513457251e-05, + "loss": 0.0026, + "step": 4964 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.5100514939107598e-05, + "loss": 0.0578, + "step": 4966 + }, + { + "epoch": 0.9943955164131305, + "learning_rate": 1.50884894033418e-05, + "loss": 0.0305, + "step": 4968 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5076453929645927e-05, + "loss": 0.0813, + "step": 4970 + }, + { + "epoch": 0.9951961569255404, + "learning_rate": 1.5064408541525578e-05, + "loss": 0.6164, + "step": 4972 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.505235326250563e-05, + "loss": 0.0735, + "step": 4974 + }, + { + "epoch": 0.9959967974379503, + "learning_rate": 1.504028811613027e-05, + "loss": 0.0054, + "step": 4976 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5028213125963054e-05, + "loss": 0.0529, + "step": 4978 + }, + { + "epoch": 0.9967974379503602, + "learning_rate": 1.5016128315586636e-05, + "loss": 0.0124, + "step": 4980 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.5004033708602977e-05, + "loss": 0.0988, + "step": 4982 + }, + { + "epoch": 0.9975980784627703, + "learning_rate": 1.4991929328633043e-05, + "loss": 0.0814, + "step": 4984 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4979815199317011e-05, + "loss": 0.0498, + "step": 4986 + }, + { + "epoch": 0.9983987189751802, + "learning_rate": 1.4967691344314012e-05, + "loss": 0.0177, + "step": 4988 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.495555778730216e-05, + "loss": 0.1159, + "step": 4990 + }, + { + "epoch": 0.9991993594875901, + "learning_rate": 1.4943414551978622e-05, + "loss": 0.2319, + "step": 4992 + }, + { + "epoch": 1.0, + "learning_rate": 1.4931261662059333e-05, + "loss": 0.0877, + "step": 4994 + }, + { + "epoch": 1.0, + "learning_rate": 1.4919099141279214e-05, + "loss": 0.0255, + "step": 4996 + }, + { + "epoch": 1.0, + "step": 4996, + "total_flos": 3.09618044567552e+16, + "train_loss": 0.15984798657085356, + "train_runtime": 5072.2105, + "train_samples_per_second": 3.94, + "train_steps_per_second": 0.985 + } + ], + "logging_steps": 2, + "max_steps": 4996, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 3.09618044567552e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..20adcba212f7646846c0e24463cacd0efcbbc993 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a5cc3bfe978b71c6f6c43c278641bed025d7fc5c46d359d54155b0e37859bb3 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d2ccf52224106aa522cb53f72f67154727e4976 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fa3929918ecf1249e47ca3834bb223acdf9a844c9bcd45e067a416afe2bf8df +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..91728fd1364bb7865fd2a8bf276e3526c2791d3c --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:965c165a85094c79db9302efaa9709605d1282ba3bc1db1d959861d62fea843d +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f43cd79f1836173c1505cb8f2b43912f3c3c9ee --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_125_sft_scenario12_new_10000_random0_25_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d4e19e8550de9ef06c71e877cb99de9508e25318249bd436639c6947b8a8d3 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd9f8e29d231b213a61eecc8d31cd1c4f51f857 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,7532 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "learning_rate": 2.357535430610912e-06, + "loss": 0.1137, + "step": 2 + }, + { + "epoch": 0.0016, + "learning_rate": 2.3755748898855234e-06, + "loss": 0.4873, + "step": 4 + }, + { + "epoch": 0.0024, + "learning_rate": 2.3936738059587174e-06, + "loss": 0.0008, + "step": 6 + }, + { + "epoch": 0.0032, + "learning_rate": 2.411832037691545e-06, + "loss": 0.0092, + "step": 8 + }, + { + "epoch": 0.004, + "learning_rate": 2.430049443482434e-06, + "loss": 0.2262, + "step": 10 + }, + { + "epoch": 0.0048, + "learning_rate": 2.448325881268406e-06, + "loss": 0.4922, + "step": 12 + }, + { + "epoch": 0.0056, + "learning_rate": 2.4666612085261277e-06, + "loss": 0.0003, + "step": 14 + }, + { + "epoch": 0.0064, + "learning_rate": 2.4850552822730346e-06, + "loss": 0.2309, + "step": 16 + }, + { + "epoch": 0.0072, + "learning_rate": 2.503507959068455e-06, + "loss": 0.0885, + "step": 18 + }, + { + "epoch": 0.008, + "learning_rate": 2.522019095014686e-06, + "loss": 0.2286, + "step": 20 + }, + { + "epoch": 0.0088, + "learning_rate": 2.5405885457581814e-06, + "loss": 0.2004, + "step": 22 + }, + { + "epoch": 0.0096, + "learning_rate": 2.5592161664906243e-06, + "loss": 0.0887, + "step": 24 + }, + { + "epoch": 0.0104, + "learning_rate": 2.5779018119501086e-06, + "loss": 0.0042, + "step": 26 + }, + { + "epoch": 0.0112, + "learning_rate": 2.596645336422219e-06, + "loss": 0.0865, + "step": 28 + }, + { + "epoch": 0.012, + "learning_rate": 2.615446593741161e-06, + "loss": 0.4674, + "step": 30 + }, + { + "epoch": 0.0128, + "learning_rate": 2.6343054372909648e-06, + "loss": 0.0055, + "step": 32 + }, + { + "epoch": 0.0136, + "learning_rate": 2.6532217200065826e-06, + "loss": 0.4743, + "step": 34 + }, + { + "epoch": 0.0144, + "learning_rate": 2.6721952943750396e-06, + "loss": 0.2151, + "step": 36 + }, + { + "epoch": 0.0152, + "learning_rate": 2.691226012436604e-06, + "loss": 0.1093, + "step": 38 + }, + { + "epoch": 0.016, + "learning_rate": 2.7103137257858893e-06, + "loss": 0.225, + "step": 40 + }, + { + "epoch": 0.0168, + "learning_rate": 2.7294582855730733e-06, + "loss": 0.0088, + "step": 42 + }, + { + "epoch": 0.0176, + "learning_rate": 2.7486595425050566e-06, + "loss": 0.2947, + "step": 44 + }, + { + "epoch": 0.0184, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.1448, + "step": 46 + }, + { + "epoch": 0.0192, + "learning_rate": 2.7872315484213954e-06, + "loss": 0.3437, + "step": 48 + }, + { + "epoch": 0.02, + "learning_rate": 2.8066019966134873e-06, + "loss": 0.1013, + "step": 50 + }, + { + "epoch": 0.0208, + "learning_rate": 2.826028540368212e-06, + "loss": 0.0157, + "step": 52 + }, + { + "epoch": 0.0216, + "learning_rate": 2.845511028193477e-06, + "loss": 0.0021, + "step": 54 + }, + { + "epoch": 0.0224, + "learning_rate": 2.865049308160931e-06, + "loss": 0.0579, + "step": 56 + }, + { + "epoch": 0.0232, + "learning_rate": 2.8846432279071533e-06, + "loss": 0.0005, + "step": 58 + }, + { + "epoch": 0.024, + "learning_rate": 2.9042926346347835e-06, + "loss": 0.1051, + "step": 60 + }, + { + "epoch": 0.0248, + "learning_rate": 2.9239973751138397e-06, + "loss": 0.0405, + "step": 62 + }, + { + "epoch": 0.0256, + "learning_rate": 2.943757295682783e-06, + "loss": 0.3857, + "step": 64 + }, + { + "epoch": 0.0264, + "learning_rate": 2.9635722422497983e-06, + "loss": 0.1994, + "step": 66 + }, + { + "epoch": 0.0272, + "learning_rate": 2.983442060293926e-06, + "loss": 0.2379, + "step": 68 + }, + { + "epoch": 0.028, + "learning_rate": 3.003366594866345e-06, + "loss": 0.4975, + "step": 70 + }, + { + "epoch": 0.0288, + "learning_rate": 3.0233456905915338e-06, + "loss": 0.3085, + "step": 72 + }, + { + "epoch": 0.0296, + "learning_rate": 3.0433791916684885e-06, + "loss": 0.3479, + "step": 74 + }, + { + "epoch": 0.0304, + "learning_rate": 3.0634669418719453e-06, + "loss": 0.0421, + "step": 76 + }, + { + "epoch": 0.0312, + "learning_rate": 3.0836087845535933e-06, + "loss": 0.2549, + "step": 78 + }, + { + "epoch": 0.032, + "learning_rate": 3.1038045626432945e-06, + "loss": 0.412, + "step": 80 + }, + { + "epoch": 0.0328, + "learning_rate": 3.1240541186503173e-06, + "loss": 0.1209, + "step": 82 + }, + { + "epoch": 0.0336, + "learning_rate": 3.1443572946645683e-06, + "loss": 0.1098, + "step": 84 + }, + { + "epoch": 0.0344, + "learning_rate": 3.164713932357776e-06, + "loss": 0.1732, + "step": 86 + }, + { + "epoch": 0.0352, + "learning_rate": 3.1851238729848033e-06, + "loss": 1.2045, + "step": 88 + }, + { + "epoch": 0.036, + "learning_rate": 3.205586957384834e-06, + "loss": 0.0005, + "step": 90 + }, + { + "epoch": 0.0368, + "learning_rate": 3.2261030259826253e-06, + "loss": 0.4249, + "step": 92 + }, + { + "epoch": 0.0376, + "learning_rate": 3.246671918789752e-06, + "loss": 0.2557, + "step": 94 + }, + { + "epoch": 0.0384, + "learning_rate": 3.267293475405858e-06, + "loss": 0.0049, + "step": 96 + }, + { + "epoch": 0.0392, + "learning_rate": 3.2879675350199004e-06, + "loss": 0.4825, + "step": 98 + }, + { + "epoch": 0.04, + "learning_rate": 3.3086939364114113e-06, + "loss": 0.3985, + "step": 100 + }, + { + "epoch": 0.0408, + "learning_rate": 3.329472517951747e-06, + "loss": 0.2239, + "step": 102 + }, + { + "epoch": 0.0416, + "learning_rate": 3.350303117605369e-06, + "loss": 0.0841, + "step": 104 + }, + { + "epoch": 0.0424, + "learning_rate": 3.3711855729310503e-06, + "loss": 0.1688, + "step": 106 + }, + { + "epoch": 0.0432, + "learning_rate": 3.3921197210832235e-06, + "loss": 0.2858, + "step": 108 + }, + { + "epoch": 0.044, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.3616, + "step": 110 + }, + { + "epoch": 0.0448, + "learning_rate": 3.434142442470434e-06, + "loss": 0.3541, + "step": 112 + }, + { + "epoch": 0.0456, + "learning_rate": 3.455230688003849e-06, + "loss": 0.1876, + "step": 114 + }, + { + "epoch": 0.0464, + "learning_rate": 3.476369970963065e-06, + "loss": 0.0561, + "step": 116 + }, + { + "epoch": 0.0472, + "learning_rate": 3.497560126499706e-06, + "loss": 0.1496, + "step": 118 + }, + { + "epoch": 0.048, + "learning_rate": 3.5188009893686836e-06, + "loss": 0.0064, + "step": 120 + }, + { + "epoch": 0.0488, + "learning_rate": 3.5400923939294827e-06, + "loss": 0.167, + "step": 122 + }, + { + "epoch": 0.0496, + "learning_rate": 3.5614341741474667e-06, + "loss": 0.0688, + "step": 124 + }, + { + "epoch": 0.0504, + "learning_rate": 3.5828261635951177e-06, + "loss": 0.0029, + "step": 126 + }, + { + "epoch": 0.0512, + "learning_rate": 3.604268195453421e-06, + "loss": 0.1979, + "step": 128 + }, + { + "epoch": 0.052, + "learning_rate": 3.6257601025130893e-06, + "loss": 0.0048, + "step": 130 + }, + { + "epoch": 0.0528, + "learning_rate": 3.647301717175955e-06, + "loss": 0.1234, + "step": 132 + }, + { + "epoch": 0.0536, + "learning_rate": 3.66889287145614e-06, + "loss": 0.1726, + "step": 134 + }, + { + "epoch": 0.0544, + "learning_rate": 3.6905333969814995e-06, + "loss": 0.5575, + "step": 136 + }, + { + "epoch": 0.0552, + "learning_rate": 3.712223124994867e-06, + "loss": 0.1828, + "step": 138 + }, + { + "epoch": 0.056, + "learning_rate": 3.7339618863553885e-06, + "loss": 0.2203, + "step": 140 + }, + { + "epoch": 0.0568, + "learning_rate": 3.755749511539848e-06, + "loss": 0.0043, + "step": 142 + }, + { + "epoch": 0.0576, + "learning_rate": 3.7775858306439404e-06, + "loss": 0.0016, + "step": 144 + }, + { + "epoch": 0.0584, + "learning_rate": 3.799470673383677e-06, + "loss": 0.0987, + "step": 146 + }, + { + "epoch": 0.0592, + "learning_rate": 3.821403869096644e-06, + "loss": 0.3883, + "step": 148 + }, + { + "epoch": 0.06, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.3289, + "step": 150 + }, + { + "epoch": 0.0608, + "learning_rate": 3.865414634908756e-06, + "loss": 0.005, + "step": 152 + }, + { + "epoch": 0.0616, + "learning_rate": 3.887491861803081e-06, + "loss": 0.1816, + "step": 154 + }, + { + "epoch": 0.0624, + "learning_rate": 3.909616755263741e-06, + "loss": 0.4841, + "step": 156 + }, + { + "epoch": 0.0632, + "learning_rate": 3.9317891427563725e-06, + "loss": 0.0914, + "step": 158 + }, + { + "epoch": 0.064, + "learning_rate": 3.954008851376244e-06, + "loss": 0.074, + "step": 160 + }, + { + "epoch": 0.0648, + "learning_rate": 3.976275707849619e-06, + "loss": 0.6025, + "step": 162 + }, + { + "epoch": 0.0656, + "learning_rate": 3.99858953853505e-06, + "loss": 0.1252, + "step": 164 + }, + { + "epoch": 0.0664, + "learning_rate": 4.0209501694248e-06, + "loss": 0.001, + "step": 166 + }, + { + "epoch": 0.0672, + "learning_rate": 4.043357426146209e-06, + "loss": 0.0942, + "step": 168 + }, + { + "epoch": 0.068, + "learning_rate": 4.065811133962987e-06, + "loss": 0.0693, + "step": 170 + }, + { + "epoch": 0.0688, + "learning_rate": 4.08831111777658e-06, + "loss": 0.1663, + "step": 172 + }, + { + "epoch": 0.0696, + "learning_rate": 4.110857202127611e-06, + "loss": 0.3222, + "step": 174 + }, + { + "epoch": 0.0704, + "learning_rate": 4.133449211197183e-06, + "loss": 0.0883, + "step": 176 + }, + { + "epoch": 0.0712, + "learning_rate": 4.156086968808274e-06, + "loss": 0.1923, + "step": 178 + }, + { + "epoch": 0.072, + "learning_rate": 4.178770298427114e-06, + "loss": 0.0004, + "step": 180 + }, + { + "epoch": 0.0728, + "learning_rate": 4.201499023164515e-06, + "loss": 0.1559, + "step": 182 + }, + { + "epoch": 0.0736, + "learning_rate": 4.224272965777315e-06, + "loss": 0.0205, + "step": 184 + }, + { + "epoch": 0.0744, + "learning_rate": 4.247091948669764e-06, + "loss": 0.3531, + "step": 186 + }, + { + "epoch": 0.0752, + "learning_rate": 4.269955793894849e-06, + "loss": 0.1389, + "step": 188 + }, + { + "epoch": 0.076, + "learning_rate": 4.292864323155684e-06, + "loss": 0.0018, + "step": 190 + }, + { + "epoch": 0.0768, + "learning_rate": 4.3158173578069696e-06, + "loss": 0.1842, + "step": 192 + }, + { + "epoch": 0.0776, + "learning_rate": 4.338814718856333e-06, + "loss": 0.0007, + "step": 194 + }, + { + "epoch": 0.0784, + "learning_rate": 4.3618562269657285e-06, + "loss": 0.0003, + "step": 196 + }, + { + "epoch": 0.0792, + "learning_rate": 4.384941702452852e-06, + "loss": 0.0171, + "step": 198 + }, + { + "epoch": 0.08, + "learning_rate": 4.408070965292526e-06, + "loss": 0.0366, + "step": 200 + }, + { + "epoch": 0.0808, + "learning_rate": 4.431243835118112e-06, + "loss": 0.159, + "step": 202 + }, + { + "epoch": 0.0816, + "learning_rate": 4.4544601312229185e-06, + "loss": 0.1002, + "step": 204 + }, + { + "epoch": 0.0824, + "learning_rate": 4.477719672561602e-06, + "loss": 0.356, + "step": 206 + }, + { + "epoch": 0.0832, + "learning_rate": 4.501022277751605e-06, + "loss": 0.1537, + "step": 208 + }, + { + "epoch": 0.084, + "learning_rate": 4.524367765074499e-06, + "loss": 0.265, + "step": 210 + }, + { + "epoch": 0.0848, + "learning_rate": 4.5477559524775e-06, + "loss": 0.3804, + "step": 212 + }, + { + "epoch": 0.0856, + "learning_rate": 4.571186657574823e-06, + "loss": 0.0488, + "step": 214 + }, + { + "epoch": 0.0864, + "learning_rate": 4.5946596976491254e-06, + "loss": 0.1144, + "step": 216 + }, + { + "epoch": 0.0872, + "learning_rate": 4.618174889652924e-06, + "loss": 0.2648, + "step": 218 + }, + { + "epoch": 0.088, + "learning_rate": 4.6417320502100286e-06, + "loss": 0.1179, + "step": 220 + }, + { + "epoch": 0.0888, + "learning_rate": 4.665330995616967e-06, + "loss": 0.1725, + "step": 222 + }, + { + "epoch": 0.0896, + "learning_rate": 4.688971541844424e-06, + "loss": 0.0995, + "step": 224 + }, + { + "epoch": 0.0904, + "learning_rate": 4.712653504538672e-06, + "loss": 0.1296, + "step": 226 + }, + { + "epoch": 0.0912, + "learning_rate": 4.736376699023023e-06, + "loss": 0.0005, + "step": 228 + }, + { + "epoch": 0.092, + "learning_rate": 4.76014094029921e-06, + "loss": 0.0564, + "step": 230 + }, + { + "epoch": 0.0928, + "learning_rate": 4.7839460430489216e-06, + "loss": 0.0012, + "step": 232 + }, + { + "epoch": 0.0936, + "learning_rate": 4.807791821635185e-06, + "loss": 0.1221, + "step": 234 + }, + { + "epoch": 0.0944, + "learning_rate": 4.831678090103828e-06, + "loss": 0.9578, + "step": 236 + }, + { + "epoch": 0.0952, + "learning_rate": 4.855604662184931e-06, + "loss": 0.0681, + "step": 238 + }, + { + "epoch": 0.096, + "learning_rate": 4.8795713512942785e-06, + "loss": 0.5132, + "step": 240 + }, + { + "epoch": 0.0968, + "learning_rate": 4.903577970534815e-06, + "loss": 0.0024, + "step": 242 + }, + { + "epoch": 0.0976, + "learning_rate": 4.9276243326981e-06, + "loss": 0.1998, + "step": 244 + }, + { + "epoch": 0.0984, + "learning_rate": 4.951710250265788e-06, + "loss": 0.0149, + "step": 246 + }, + { + "epoch": 0.0992, + "learning_rate": 4.975835535411023e-06, + "loss": 0.2725, + "step": 248 + }, + { + "epoch": 0.1, + "learning_rate": 5.000000000000003e-06, + "loss": 0.0901, + "step": 250 + }, + { + "epoch": 0.1008, + "learning_rate": 5.024203455593375e-06, + "loss": 0.0944, + "step": 252 + }, + { + "epoch": 0.1016, + "learning_rate": 5.048445713447734e-06, + "loss": 0.0807, + "step": 254 + }, + { + "epoch": 0.1024, + "learning_rate": 5.072726584517083e-06, + "loss": 0.0048, + "step": 256 + }, + { + "epoch": 0.1032, + "learning_rate": 5.097045879454308e-06, + "loss": 0.0032, + "step": 258 + }, + { + "epoch": 0.104, + "learning_rate": 5.1214034086126685e-06, + "loss": 0.0062, + "step": 260 + }, + { + "epoch": 0.1048, + "learning_rate": 5.145798982047253e-06, + "loss": 0.108, + "step": 262 + }, + { + "epoch": 0.1056, + "learning_rate": 5.170232409516483e-06, + "loss": 0.0006, + "step": 264 + }, + { + "epoch": 0.1064, + "learning_rate": 5.194703500483597e-06, + "loss": 0.3128, + "step": 266 + }, + { + "epoch": 0.1072, + "learning_rate": 5.219212064118082e-06, + "loss": 0.1993, + "step": 268 + }, + { + "epoch": 0.108, + "learning_rate": 5.24375790929725e-06, + "loss": 0.0397, + "step": 270 + }, + { + "epoch": 0.1088, + "learning_rate": 5.268340844607653e-06, + "loss": 0.6222, + "step": 272 + }, + { + "epoch": 0.1096, + "learning_rate": 5.2929606783466735e-06, + "loss": 0.0008, + "step": 274 + }, + { + "epoch": 0.1104, + "learning_rate": 5.317617218523853e-06, + "loss": 0.0543, + "step": 276 + }, + { + "epoch": 0.1112, + "learning_rate": 5.342310272862553e-06, + "loss": 0.151, + "step": 278 + }, + { + "epoch": 0.112, + "learning_rate": 5.367039648801377e-06, + "loss": 0.2643, + "step": 280 + }, + { + "epoch": 0.1128, + "learning_rate": 5.391805153495684e-06, + "loss": 0.3185, + "step": 282 + }, + { + "epoch": 0.1136, + "learning_rate": 5.416606593819109e-06, + "loss": 0.0014, + "step": 284 + }, + { + "epoch": 0.1144, + "learning_rate": 5.441443776365005e-06, + "loss": 0.2843, + "step": 286 + }, + { + "epoch": 0.1152, + "learning_rate": 5.466316507448053e-06, + "loss": 0.1495, + "step": 288 + }, + { + "epoch": 0.116, + "learning_rate": 5.49122459310568e-06, + "loss": 0.1291, + "step": 290 + }, + { + "epoch": 0.1168, + "learning_rate": 5.516167839099662e-06, + "loss": 0.0004, + "step": 292 + }, + { + "epoch": 0.1176, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.1704, + "step": 294 + }, + { + "epoch": 0.1184, + "learning_rate": 5.5661590337742255e-06, + "loss": 0.6981, + "step": 296 + }, + { + "epoch": 0.1192, + "learning_rate": 5.591206592613412e-06, + "loss": 0.1447, + "step": 298 + }, + { + "epoch": 0.12, + "learning_rate": 5.616288532109221e-06, + "loss": 0.3539, + "step": 300 + }, + { + "epoch": 0.1208, + "learning_rate": 5.641404656667652e-06, + "loss": 0.2079, + "step": 302 + }, + { + "epoch": 0.1216, + "learning_rate": 5.666554770428136e-06, + "loss": 0.542, + "step": 304 + }, + { + "epoch": 0.1224, + "learning_rate": 5.6917386772650015e-06, + "loss": 0.0363, + "step": 306 + }, + { + "epoch": 0.1232, + "learning_rate": 5.716956180789086e-06, + "loss": 0.0213, + "step": 308 + }, + { + "epoch": 0.124, + "learning_rate": 5.74220708434926e-06, + "loss": 0.2849, + "step": 310 + }, + { + "epoch": 0.1248, + "learning_rate": 5.767491191033909e-06, + "loss": 0.0026, + "step": 312 + }, + { + "epoch": 0.1256, + "learning_rate": 5.7928083036724535e-06, + "loss": 0.3049, + "step": 314 + }, + { + "epoch": 0.1264, + "learning_rate": 5.818158224836983e-06, + "loss": 0.024, + "step": 316 + }, + { + "epoch": 0.1272, + "learning_rate": 5.8435407568437194e-06, + "loss": 0.2014, + "step": 318 + }, + { + "epoch": 0.128, + "learning_rate": 5.868955701754577e-06, + "loss": 0.2417, + "step": 320 + }, + { + "epoch": 0.1288, + "learning_rate": 5.894402861378714e-06, + "loss": 0.0008, + "step": 322 + }, + { + "epoch": 0.1296, + "learning_rate": 5.919882037274065e-06, + "loss": 0.1421, + "step": 324 + }, + { + "epoch": 0.1304, + "learning_rate": 5.9453930307488985e-06, + "loss": 0.4448, + "step": 326 + }, + { + "epoch": 0.1312, + "learning_rate": 5.970935642863362e-06, + "loss": 0.0065, + "step": 328 + }, + { + "epoch": 0.132, + "learning_rate": 5.996509674431038e-06, + "loss": 0.0365, + "step": 330 + }, + { + "epoch": 0.1328, + "learning_rate": 6.022114926020505e-06, + "loss": 0.2919, + "step": 332 + }, + { + "epoch": 0.1336, + "learning_rate": 6.047751197956836e-06, + "loss": 0.0083, + "step": 334 + }, + { + "epoch": 0.1344, + "learning_rate": 6.0734182903232475e-06, + "loss": 0.005, + "step": 336 + }, + { + "epoch": 0.1352, + "learning_rate": 6.0991160029626e-06, + "loss": 0.1897, + "step": 338 + }, + { + "epoch": 0.136, + "learning_rate": 6.124844135478966e-06, + "loss": 0.2324, + "step": 340 + }, + { + "epoch": 0.1368, + "learning_rate": 6.1506024872392e-06, + "loss": 0.012, + "step": 342 + }, + { + "epoch": 0.1376, + "learning_rate": 6.176390857374501e-06, + "loss": 0.3967, + "step": 344 + }, + { + "epoch": 0.1384, + "learning_rate": 6.202209044781979e-06, + "loss": 0.8111, + "step": 346 + }, + { + "epoch": 0.1392, + "learning_rate": 6.228056848126223e-06, + "loss": 0.0911, + "step": 348 + }, + { + "epoch": 0.14, + "learning_rate": 6.253934065840883e-06, + "loss": 0.0306, + "step": 350 + }, + { + "epoch": 0.1408, + "learning_rate": 6.279840496130188e-06, + "loss": 0.4152, + "step": 352 + }, + { + "epoch": 0.1416, + "learning_rate": 6.305775936970606e-06, + "loss": 0.0234, + "step": 354 + }, + { + "epoch": 0.1424, + "learning_rate": 6.331740186112359e-06, + "loss": 0.04, + "step": 356 + }, + { + "epoch": 0.1432, + "learning_rate": 6.357733041081015e-06, + "loss": 0.1787, + "step": 358 + }, + { + "epoch": 0.144, + "learning_rate": 6.383754299179072e-06, + "loss": 0.1806, + "step": 360 + }, + { + "epoch": 0.1448, + "learning_rate": 6.409803757487532e-06, + "loss": 0.3752, + "step": 362 + }, + { + "epoch": 0.1456, + "learning_rate": 6.435881212867485e-06, + "loss": 0.8593, + "step": 364 + }, + { + "epoch": 0.1464, + "learning_rate": 6.4619864619616975e-06, + "loss": 0.1704, + "step": 366 + }, + { + "epoch": 0.1472, + "learning_rate": 6.48811930119619e-06, + "loss": 0.0128, + "step": 368 + }, + { + "epoch": 0.148, + "learning_rate": 6.514279526781853e-06, + "loss": 0.0021, + "step": 370 + }, + { + "epoch": 0.1488, + "learning_rate": 6.540466934715955e-06, + "loss": 0.359, + "step": 372 + }, + { + "epoch": 0.1496, + "learning_rate": 6.566681320783848e-06, + "loss": 0.0042, + "step": 374 + }, + { + "epoch": 0.1504, + "learning_rate": 6.592922480560483e-06, + "loss": 0.3635, + "step": 376 + }, + { + "epoch": 0.1512, + "learning_rate": 6.619190209412025e-06, + "loss": 0.0252, + "step": 378 + }, + { + "epoch": 0.152, + "learning_rate": 6.6454843024974465e-06, + "loss": 0.0161, + "step": 380 + }, + { + "epoch": 0.1528, + "learning_rate": 6.671804554770128e-06, + "loss": 0.001, + "step": 382 + }, + { + "epoch": 0.1536, + "learning_rate": 6.698150760979456e-06, + "loss": 0.2211, + "step": 384 + }, + { + "epoch": 0.1544, + "learning_rate": 6.724522715672421e-06, + "loss": 0.2219, + "step": 386 + }, + { + "epoch": 0.1552, + "learning_rate": 6.750920213195242e-06, + "loss": 0.2113, + "step": 388 + }, + { + "epoch": 0.156, + "learning_rate": 6.777343047694894e-06, + "loss": 0.2464, + "step": 390 + }, + { + "epoch": 0.1568, + "learning_rate": 6.803791013120824e-06, + "loss": 0.0127, + "step": 392 + }, + { + "epoch": 0.1576, + "learning_rate": 6.8302639032264836e-06, + "loss": 0.3725, + "step": 394 + }, + { + "epoch": 0.1584, + "learning_rate": 6.856761511570944e-06, + "loss": 0.1287, + "step": 396 + }, + { + "epoch": 0.1592, + "learning_rate": 6.883283631520579e-06, + "loss": 0.1, + "step": 398 + }, + { + "epoch": 0.16, + "learning_rate": 6.909830056250522e-06, + "loss": 0.0489, + "step": 400 + }, + { + "epoch": 0.1608, + "learning_rate": 6.936400578746436e-06, + "loss": 0.1775, + "step": 402 + }, + { + "epoch": 0.1616, + "learning_rate": 6.96299499180605e-06, + "loss": 0.1897, + "step": 404 + }, + { + "epoch": 0.1624, + "learning_rate": 6.989613088040787e-06, + "loss": 0.0939, + "step": 406 + }, + { + "epoch": 0.1632, + "learning_rate": 7.016254659877404e-06, + "loss": 0.0002, + "step": 408 + }, + { + "epoch": 0.164, + "learning_rate": 7.042919499559539e-06, + "loss": 0.1157, + "step": 410 + }, + { + "epoch": 0.1648, + "learning_rate": 7.06960739914943e-06, + "loss": 0.0985, + "step": 412 + }, + { + "epoch": 0.1656, + "learning_rate": 7.09631815052946e-06, + "loss": 0.1024, + "step": 414 + }, + { + "epoch": 0.1664, + "learning_rate": 7.123051545403873e-06, + "loss": 0.0153, + "step": 416 + }, + { + "epoch": 0.1672, + "learning_rate": 7.1498073753002375e-06, + "loss": 0.0011, + "step": 418 + }, + { + "epoch": 0.168, + "learning_rate": 7.1765854315712325e-06, + "loss": 0.4632, + "step": 420 + }, + { + "epoch": 0.1688, + "learning_rate": 7.203385505396197e-06, + "loss": 0.1017, + "step": 422 + }, + { + "epoch": 0.1696, + "learning_rate": 7.230207387782771e-06, + "loss": 0.0005, + "step": 424 + }, + { + "epoch": 0.1704, + "learning_rate": 7.257050869568527e-06, + "loss": 0.0766, + "step": 426 + }, + { + "epoch": 0.1712, + "learning_rate": 7.28391574142262e-06, + "loss": 0.622, + "step": 428 + }, + { + "epoch": 0.172, + "learning_rate": 7.3108017938473485e-06, + "loss": 0.3862, + "step": 430 + }, + { + "epoch": 0.1728, + "learning_rate": 7.337708817179875e-06, + "loss": 0.6825, + "step": 432 + }, + { + "epoch": 0.1736, + "learning_rate": 7.36463660159386e-06, + "loss": 0.025, + "step": 434 + }, + { + "epoch": 0.1744, + "learning_rate": 7.39158493710103e-06, + "loss": 0.0013, + "step": 436 + }, + { + "epoch": 0.1752, + "learning_rate": 7.418553613552822e-06, + "loss": 0.0291, + "step": 438 + }, + { + "epoch": 0.176, + "learning_rate": 7.445542420642091e-06, + "loss": 0.1271, + "step": 440 + }, + { + "epoch": 0.1768, + "learning_rate": 7.472551147904703e-06, + "loss": 0.1498, + "step": 442 + }, + { + "epoch": 0.1776, + "learning_rate": 7.499579584721173e-06, + "loss": 0.4885, + "step": 444 + }, + { + "epoch": 0.1784, + "learning_rate": 7.5266275203183395e-06, + "loss": 0.6193, + "step": 446 + }, + { + "epoch": 0.1792, + "learning_rate": 7.553694743770917e-06, + "loss": 0.0616, + "step": 448 + }, + { + "epoch": 0.18, + "learning_rate": 7.580781044003312e-06, + "loss": 0.0081, + "step": 450 + }, + { + "epoch": 0.1808, + "learning_rate": 7.607886209791095e-06, + "loss": 0.2091, + "step": 452 + }, + { + "epoch": 0.1816, + "learning_rate": 7.635010029762755e-06, + "loss": 0.1268, + "step": 454 + }, + { + "epoch": 0.1824, + "learning_rate": 7.662152292401265e-06, + "loss": 0.0051, + "step": 456 + }, + { + "epoch": 0.1832, + "learning_rate": 7.689312786045822e-06, + "loss": 0.2367, + "step": 458 + }, + { + "epoch": 0.184, + "learning_rate": 7.716491298893441e-06, + "loss": 0.2598, + "step": 460 + }, + { + "epoch": 0.1848, + "learning_rate": 7.74368761900062e-06, + "loss": 0.318, + "step": 462 + }, + { + "epoch": 0.1856, + "learning_rate": 7.770901534284991e-06, + "loss": 0.0186, + "step": 464 + }, + { + "epoch": 0.1864, + "learning_rate": 7.798132832526976e-06, + "loss": 0.0812, + "step": 466 + }, + { + "epoch": 0.1872, + "learning_rate": 7.825381301371444e-06, + "loss": 0.2045, + "step": 468 + }, + { + "epoch": 0.188, + "learning_rate": 7.852646728329358e-06, + "loss": 0.0028, + "step": 470 + }, + { + "epoch": 0.1888, + "learning_rate": 7.879928900779441e-06, + "loss": 0.016, + "step": 472 + }, + { + "epoch": 0.1896, + "learning_rate": 7.907227605969852e-06, + "loss": 0.3792, + "step": 474 + }, + { + "epoch": 0.1904, + "learning_rate": 7.934542631019767e-06, + "loss": 0.4227, + "step": 476 + }, + { + "epoch": 0.1912, + "learning_rate": 7.961873762921153e-06, + "loss": 0.5182, + "step": 478 + }, + { + "epoch": 0.192, + "learning_rate": 7.989220788540351e-06, + "loss": 0.8737, + "step": 480 + }, + { + "epoch": 0.1928, + "learning_rate": 8.016583494619764e-06, + "loss": 0.0034, + "step": 482 + }, + { + "epoch": 0.1936, + "learning_rate": 8.043961667779511e-06, + "loss": 0.0554, + "step": 484 + }, + { + "epoch": 0.1944, + "learning_rate": 8.071355094519103e-06, + "loss": 0.2098, + "step": 486 + }, + { + "epoch": 0.1952, + "learning_rate": 8.098763561219089e-06, + "loss": 0.0016, + "step": 488 + }, + { + "epoch": 0.196, + "learning_rate": 8.126186854142744e-06, + "loss": 0.0318, + "step": 490 + }, + { + "epoch": 0.1968, + "learning_rate": 8.153624759437718e-06, + "loss": 0.1766, + "step": 492 + }, + { + "epoch": 0.1976, + "learning_rate": 8.181077063137735e-06, + "loss": 0.4132, + "step": 494 + }, + { + "epoch": 0.1984, + "learning_rate": 8.208543551164178e-06, + "loss": 0.0086, + "step": 496 + }, + { + "epoch": 0.1992, + "learning_rate": 8.236024009327877e-06, + "loss": 0.2495, + "step": 498 + }, + { + "epoch": 0.2, + "learning_rate": 8.263518223330695e-06, + "loss": 0.0065, + "step": 500 + }, + { + "epoch": 0.2008, + "learning_rate": 8.29102597876723e-06, + "loss": 0.8377, + "step": 502 + }, + { + "epoch": 0.2016, + "learning_rate": 8.31854706112648e-06, + "loss": 0.0443, + "step": 504 + }, + { + "epoch": 0.2024, + "learning_rate": 8.346081255793516e-06, + "loss": 0.0514, + "step": 506 + }, + { + "epoch": 0.2032, + "learning_rate": 8.373628348051156e-06, + "loss": 0.0027, + "step": 508 + }, + { + "epoch": 0.204, + "learning_rate": 8.401188123081642e-06, + "loss": 0.2151, + "step": 510 + }, + { + "epoch": 0.2048, + "learning_rate": 8.428760365968329e-06, + "loss": 0.2168, + "step": 512 + }, + { + "epoch": 0.2056, + "learning_rate": 8.456344861697293e-06, + "loss": 0.0007, + "step": 514 + }, + { + "epoch": 0.2064, + "learning_rate": 8.483941395159114e-06, + "loss": 0.4202, + "step": 516 + }, + { + "epoch": 0.2072, + "learning_rate": 8.511549751150478e-06, + "loss": 0.4946, + "step": 518 + }, + { + "epoch": 0.208, + "learning_rate": 8.539169714375883e-06, + "loss": 0.0002, + "step": 520 + }, + { + "epoch": 0.2088, + "learning_rate": 8.566801069449304e-06, + "loss": 0.0015, + "step": 522 + }, + { + "epoch": 0.2096, + "learning_rate": 8.594443600895886e-06, + "loss": 0.0041, + "step": 524 + }, + { + "epoch": 0.2104, + "learning_rate": 8.622097093153612e-06, + "loss": 0.0005, + "step": 526 + }, + { + "epoch": 0.2112, + "learning_rate": 8.649761330575e-06, + "loss": 0.6665, + "step": 528 + }, + { + "epoch": 0.212, + "learning_rate": 8.677436097428766e-06, + "loss": 0.1242, + "step": 530 + }, + { + "epoch": 0.2128, + "learning_rate": 8.705121177901537e-06, + "loss": 0.0524, + "step": 532 + }, + { + "epoch": 0.2136, + "learning_rate": 8.732816356099459e-06, + "loss": 0.512, + "step": 534 + }, + { + "epoch": 0.2144, + "learning_rate": 8.760521416049986e-06, + "loss": 0.351, + "step": 536 + }, + { + "epoch": 0.2152, + "learning_rate": 8.788236141703477e-06, + "loss": 0.1024, + "step": 538 + }, + { + "epoch": 0.216, + "learning_rate": 8.81596031693499e-06, + "loss": 0.4811, + "step": 540 + }, + { + "epoch": 0.2168, + "learning_rate": 8.84369372554578e-06, + "loss": 0.0002, + "step": 542 + }, + { + "epoch": 0.2176, + "learning_rate": 8.87143615126518e-06, + "loss": 0.1156, + "step": 544 + }, + { + "epoch": 0.2184, + "learning_rate": 8.899187377752173e-06, + "loss": 0.0658, + "step": 546 + }, + { + "epoch": 0.2192, + "learning_rate": 8.926947188597127e-06, + "loss": 2.1607, + "step": 548 + }, + { + "epoch": 0.22, + "learning_rate": 8.954715367323473e-06, + "loss": 0.3399, + "step": 550 + }, + { + "epoch": 0.2208, + "learning_rate": 8.982491697389344e-06, + "loss": 1.0154, + "step": 552 + }, + { + "epoch": 0.2216, + "learning_rate": 9.010275962189356e-06, + "loss": 0.3964, + "step": 554 + }, + { + "epoch": 0.2224, + "learning_rate": 9.03806794505621e-06, + "loss": 1.2308, + "step": 556 + }, + { + "epoch": 0.2232, + "learning_rate": 9.065867429262497e-06, + "loss": 0.1172, + "step": 558 + }, + { + "epoch": 0.224, + "learning_rate": 9.093674198022198e-06, + "loss": 0.0439, + "step": 560 + }, + { + "epoch": 0.2248, + "learning_rate": 9.121488034492567e-06, + "loss": 0.2233, + "step": 562 + }, + { + "epoch": 0.2256, + "learning_rate": 9.149308721775717e-06, + "loss": 0.115, + "step": 564 + }, + { + "epoch": 0.2264, + "learning_rate": 9.177136042920338e-06, + "loss": 0.0226, + "step": 566 + }, + { + "epoch": 0.2272, + "learning_rate": 9.204969780923396e-06, + "loss": 0.1486, + "step": 568 + }, + { + "epoch": 0.228, + "learning_rate": 9.232809718731822e-06, + "loss": 0.6194, + "step": 570 + }, + { + "epoch": 0.2288, + "learning_rate": 9.26065563924414e-06, + "loss": 0.3316, + "step": 572 + }, + { + "epoch": 0.2296, + "learning_rate": 9.288507325312319e-06, + "loss": 0.0233, + "step": 574 + }, + { + "epoch": 0.2304, + "learning_rate": 9.316364559743298e-06, + "loss": 0.0851, + "step": 576 + }, + { + "epoch": 0.2312, + "learning_rate": 9.344227125300788e-06, + "loss": 0.0049, + "step": 578 + }, + { + "epoch": 0.232, + "learning_rate": 9.372094804706867e-06, + "loss": 0.0312, + "step": 580 + }, + { + "epoch": 0.2328, + "learning_rate": 9.39996738064379e-06, + "loss": 0.2059, + "step": 582 + }, + { + "epoch": 0.2336, + "learning_rate": 9.427844635755615e-06, + "loss": 0.137, + "step": 584 + }, + { + "epoch": 0.2344, + "learning_rate": 9.455726352649904e-06, + "loss": 0.0426, + "step": 586 + }, + { + "epoch": 0.2352, + "learning_rate": 9.483612313899446e-06, + "loss": 0.267, + "step": 588 + }, + { + "epoch": 0.236, + "learning_rate": 9.511502302043859e-06, + "loss": 0.764, + "step": 590 + }, + { + "epoch": 0.2368, + "learning_rate": 9.539396099591469e-06, + "loss": 0.0816, + "step": 592 + }, + { + "epoch": 0.2376, + "learning_rate": 9.567293489020816e-06, + "loss": 0.2356, + "step": 594 + }, + { + "epoch": 0.2384, + "learning_rate": 9.595194252782461e-06, + "loss": 0.2405, + "step": 596 + }, + { + "epoch": 0.2392, + "learning_rate": 9.623098173300656e-06, + "loss": 0.467, + "step": 598 + }, + { + "epoch": 0.24, + "learning_rate": 9.651005032974991e-06, + "loss": 0.3692, + "step": 600 + }, + { + "epoch": 0.2408, + "learning_rate": 9.678914614182184e-06, + "loss": 0.0091, + "step": 602 + }, + { + "epoch": 0.2416, + "learning_rate": 9.706826699277714e-06, + "loss": 0.4954, + "step": 604 + }, + { + "epoch": 0.2424, + "learning_rate": 9.734741070597535e-06, + "loss": 0.0112, + "step": 606 + }, + { + "epoch": 0.2432, + "learning_rate": 9.762657510459774e-06, + "loss": 0.2867, + "step": 608 + }, + { + "epoch": 0.244, + "learning_rate": 9.790575801166422e-06, + "loss": 0.3744, + "step": 610 + }, + { + "epoch": 0.2448, + "learning_rate": 9.818495725005043e-06, + "loss": 0.0073, + "step": 612 + }, + { + "epoch": 0.2456, + "learning_rate": 9.846417064250459e-06, + "loss": 0.1622, + "step": 614 + }, + { + "epoch": 0.2464, + "learning_rate": 9.874339601166479e-06, + "loss": 0.2542, + "step": 616 + }, + { + "epoch": 0.2472, + "learning_rate": 9.902263118007513e-06, + "loss": 0.3822, + "step": 618 + }, + { + "epoch": 0.248, + "learning_rate": 9.930187397020385e-06, + "loss": 0.0029, + "step": 620 + }, + { + "epoch": 0.2488, + "learning_rate": 9.95811222044596e-06, + "loss": 0.1782, + "step": 622 + }, + { + "epoch": 0.2496, + "learning_rate": 9.986037370520855e-06, + "loss": 0.153, + "step": 624 + }, + { + "epoch": 0.2504, + "learning_rate": 1.0013962629479139e-05, + "loss": 0.3894, + "step": 626 + }, + { + "epoch": 0.2512, + "learning_rate": 1.0041887779554034e-05, + "loss": 0.228, + "step": 628 + }, + { + "epoch": 0.252, + "learning_rate": 1.0069812602979607e-05, + "loss": 0.0584, + "step": 630 + }, + { + "epoch": 0.2528, + "learning_rate": 1.0097736881992482e-05, + "loss": 0.2294, + "step": 632 + }, + { + "epoch": 0.2536, + "learning_rate": 1.0125660398833514e-05, + "loss": 0.0351, + "step": 634 + }, + { + "epoch": 0.2544, + "learning_rate": 1.0153582935749533e-05, + "loss": 0.102, + "step": 636 + }, + { + "epoch": 0.2552, + "learning_rate": 1.0181504274994952e-05, + "loss": 0.0009, + "step": 638 + }, + { + "epoch": 0.256, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.0246, + "step": 640 + }, + { + "epoch": 0.2568, + "learning_rate": 1.0237342489540218e-05, + "loss": 0.3556, + "step": 642 + }, + { + "epoch": 0.2576, + "learning_rate": 1.0265258929402458e-05, + "loss": 0.6624, + "step": 644 + }, + { + "epoch": 0.2584, + "learning_rate": 1.029317330072228e-05, + "loss": 0.0711, + "step": 646 + }, + { + "epoch": 0.2592, + "learning_rate": 1.0321085385817811e-05, + "loss": 0.2012, + "step": 648 + }, + { + "epoch": 0.26, + "learning_rate": 1.0348994967025004e-05, + "loss": 0.1786, + "step": 650 + }, + { + "epoch": 0.2608, + "learning_rate": 1.0376901826699337e-05, + "loss": 0.0046, + "step": 652 + }, + { + "epoch": 0.2616, + "learning_rate": 1.0404805747217532e-05, + "loss": 0.4032, + "step": 654 + }, + { + "epoch": 0.2624, + "learning_rate": 1.0432706510979175e-05, + "loss": 0.0061, + "step": 656 + }, + { + "epoch": 0.2632, + "learning_rate": 1.0460603900408526e-05, + "loss": 0.1547, + "step": 658 + }, + { + "epoch": 0.264, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.2432, + "step": 660 + }, + { + "epoch": 0.2648, + "learning_rate": 1.0516387686100549e-05, + "loss": 0.3547, + "step": 662 + }, + { + "epoch": 0.2656, + "learning_rate": 1.054427364735009e-05, + "loss": 0.196, + "step": 664 + }, + { + "epoch": 0.2664, + "learning_rate": 1.0572155364244378e-05, + "loss": 0.1653, + "step": 666 + }, + { + "epoch": 0.2672, + "learning_rate": 1.0600032619356203e-05, + "loss": 0.5405, + "step": 668 + }, + { + "epoch": 0.268, + "learning_rate": 1.0627905195293127e-05, + "loss": 0.036, + "step": 670 + }, + { + "epoch": 0.2688, + "learning_rate": 1.0655772874699206e-05, + "loss": 0.0773, + "step": 672 + }, + { + "epoch": 0.2696, + "learning_rate": 1.0683635440256694e-05, + "loss": 0.1741, + "step": 674 + }, + { + "epoch": 0.2704, + "learning_rate": 1.0711492674687674e-05, + "loss": 0.1537, + "step": 676 + }, + { + "epoch": 0.2712, + "learning_rate": 1.0739344360755855e-05, + "loss": 0.0122, + "step": 678 + }, + { + "epoch": 0.272, + "learning_rate": 1.0767190281268171e-05, + "loss": 0.2893, + "step": 680 + }, + { + "epoch": 0.2728, + "learning_rate": 1.07950302190766e-05, + "loss": 0.0003, + "step": 682 + }, + { + "epoch": 0.2736, + "learning_rate": 1.0822863957079654e-05, + "loss": 0.0461, + "step": 684 + }, + { + "epoch": 0.2744, + "learning_rate": 1.0850691278224277e-05, + "loss": 0.0002, + "step": 686 + }, + { + "epoch": 0.2752, + "learning_rate": 1.0878511965507428e-05, + "loss": 0.2741, + "step": 688 + }, + { + "epoch": 0.276, + "learning_rate": 1.0906325801977795e-05, + "loss": 0.2003, + "step": 690 + }, + { + "epoch": 0.2768, + "learning_rate": 1.0934132570737497e-05, + "loss": 0.4165, + "step": 692 + }, + { + "epoch": 0.2776, + "learning_rate": 1.0961932054943785e-05, + "loss": 0.0005, + "step": 694 + }, + { + "epoch": 0.2784, + "learning_rate": 1.098972403781064e-05, + "loss": 0.5584, + "step": 696 + }, + { + "epoch": 0.2792, + "learning_rate": 1.101750830261065e-05, + "loss": 0.0605, + "step": 698 + }, + { + "epoch": 0.28, + "learning_rate": 1.104528463267652e-05, + "loss": 0.6934, + "step": 700 + }, + { + "epoch": 0.2808, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.006, + "step": 702 + }, + { + "epoch": 0.2816, + "learning_rate": 1.1100812622247821e-05, + "loss": 0.6559, + "step": 704 + }, + { + "epoch": 0.2824, + "learning_rate": 1.1128563848734815e-05, + "loss": 0.1212, + "step": 706 + }, + { + "epoch": 0.2832, + "learning_rate": 1.1156306274454211e-05, + "loss": 0.5195, + "step": 708 + }, + { + "epoch": 0.284, + "learning_rate": 1.1184039683065002e-05, + "loss": 0.0423, + "step": 710 + }, + { + "epoch": 0.2848, + "learning_rate": 1.1211763858296516e-05, + "loss": 0.2236, + "step": 712 + }, + { + "epoch": 0.2856, + "learning_rate": 1.1239478583950007e-05, + "loss": 0.3916, + "step": 714 + }, + { + "epoch": 0.2864, + "learning_rate": 1.1267183643900534e-05, + "loss": 0.263, + "step": 716 + }, + { + "epoch": 0.2872, + "learning_rate": 1.1294878822098456e-05, + "loss": 0.0079, + "step": 718 + }, + { + "epoch": 0.288, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.6642, + "step": 720 + }, + { + "epoch": 0.2888, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.0869, + "step": 722 + }, + { + "epoch": 0.2896, + "learning_rate": 1.137790290684638e-05, + "loss": 0.3121, + "step": 724 + }, + { + "epoch": 0.2904, + "learning_rate": 1.1405556399104108e-05, + "loss": 0.192, + "step": 726 + }, + { + "epoch": 0.2912, + "learning_rate": 1.143319893055069e-05, + "loss": 0.1489, + "step": 728 + }, + { + "epoch": 0.292, + "learning_rate": 1.1460830285624112e-05, + "loss": 0.0321, + "step": 730 + }, + { + "epoch": 0.2928, + "learning_rate": 1.1488450248849515e-05, + "loss": 0.147, + "step": 732 + }, + { + "epoch": 0.2936, + "learning_rate": 1.1516058604840881e-05, + "loss": 0.0091, + "step": 734 + }, + { + "epoch": 0.2944, + "learning_rate": 1.15436551383027e-05, + "loss": 0.0036, + "step": 736 + }, + { + "epoch": 0.2952, + "learning_rate": 1.1571239634031666e-05, + "loss": 0.002, + "step": 738 + }, + { + "epoch": 0.296, + "learning_rate": 1.1598811876918352e-05, + "loss": 0.3608, + "step": 740 + }, + { + "epoch": 0.2968, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.0191, + "step": 742 + }, + { + "epoch": 0.2976, + "learning_rate": 1.1653918744206478e-05, + "loss": 0.9681, + "step": 744 + }, + { + "epoch": 0.2984, + "learning_rate": 1.1681452938873515e-05, + "loss": 0.0019, + "step": 746 + }, + { + "epoch": 0.2992, + "learning_rate": 1.1708974021232763e-05, + "loss": 0.307, + "step": 748 + }, + { + "epoch": 0.3, + "learning_rate": 1.1736481776669297e-05, + "loss": 0.2665, + "step": 750 + }, + { + "epoch": 0.3008, + "learning_rate": 1.1763975990672116e-05, + "loss": 0.1004, + "step": 752 + }, + { + "epoch": 0.3016, + "learning_rate": 1.1791456448835815e-05, + "loss": 0.0007, + "step": 754 + }, + { + "epoch": 0.3024, + "learning_rate": 1.1818922936862258e-05, + "loss": 0.2417, + "step": 756 + }, + { + "epoch": 0.3032, + "learning_rate": 1.1846375240562274e-05, + "loss": 0.5443, + "step": 758 + }, + { + "epoch": 0.304, + "learning_rate": 1.187381314585725e-05, + "loss": 0.2289, + "step": 760 + }, + { + "epoch": 0.3048, + "learning_rate": 1.1901236438780906e-05, + "loss": 0.3316, + "step": 762 + }, + { + "epoch": 0.3056, + "learning_rate": 1.192864490548089e-05, + "loss": 0.2802, + "step": 764 + }, + { + "epoch": 0.3064, + "learning_rate": 1.195603833222048e-05, + "loss": 0.021, + "step": 766 + }, + { + "epoch": 0.3072, + "learning_rate": 1.198341650538023e-05, + "loss": 0.0315, + "step": 768 + }, + { + "epoch": 0.308, + "learning_rate": 1.2010779211459642e-05, + "loss": 0.0733, + "step": 770 + }, + { + "epoch": 0.3088, + "learning_rate": 1.203812623707884e-05, + "loss": 0.2727, + "step": 772 + }, + { + "epoch": 0.3096, + "learning_rate": 1.2065457368980227e-05, + "loss": 0.0733, + "step": 774 + }, + { + "epoch": 0.3104, + "learning_rate": 1.2092772394030141e-05, + "loss": 0.2024, + "step": 776 + }, + { + "epoch": 0.3112, + "learning_rate": 1.2120071099220552e-05, + "loss": 0.1064, + "step": 778 + }, + { + "epoch": 0.312, + "learning_rate": 1.2147353271670637e-05, + "loss": 0.0782, + "step": 780 + }, + { + "epoch": 0.3128, + "learning_rate": 1.217461869862855e-05, + "loss": 0.2906, + "step": 782 + }, + { + "epoch": 0.3136, + "learning_rate": 1.2201867167473015e-05, + "loss": 0.438, + "step": 784 + }, + { + "epoch": 0.3144, + "learning_rate": 1.2229098465715002e-05, + "loss": 0.2826, + "step": 786 + }, + { + "epoch": 0.3152, + "learning_rate": 1.2256312380999373e-05, + "loss": 0.2015, + "step": 788 + }, + { + "epoch": 0.316, + "learning_rate": 1.2283508701106552e-05, + "loss": 0.0006, + "step": 790 + }, + { + "epoch": 0.3168, + "learning_rate": 1.2310687213954173e-05, + "loss": 0.2468, + "step": 792 + }, + { + "epoch": 0.3176, + "learning_rate": 1.233784770759873e-05, + "loss": 0.0069, + "step": 794 + }, + { + "epoch": 0.3184, + "learning_rate": 1.2364989970237238e-05, + "loss": 0.0057, + "step": 796 + }, + { + "epoch": 0.3192, + "learning_rate": 1.23921137902089e-05, + "loss": 0.2721, + "step": 798 + }, + { + "epoch": 0.32, + "learning_rate": 1.241921895599668e-05, + "loss": 0.199, + "step": 800 + }, + { + "epoch": 0.3208, + "learning_rate": 1.2446305256229076e-05, + "loss": 1.1869, + "step": 802 + }, + { + "epoch": 0.3216, + "learning_rate": 1.2473372479681653e-05, + "loss": 0.0719, + "step": 804 + }, + { + "epoch": 0.3224, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.0006, + "step": 806 + }, + { + "epoch": 0.3232, + "learning_rate": 1.2527448852095292e-05, + "loss": 0.0026, + "step": 808 + }, + { + "epoch": 0.324, + "learning_rate": 1.2554457579357902e-05, + "loss": 0.0567, + "step": 810 + }, + { + "epoch": 0.3248, + "learning_rate": 1.2581446386447171e-05, + "loss": 0.671, + "step": 812 + }, + { + "epoch": 0.3256, + "learning_rate": 1.2608415062898963e-05, + "loss": 0.0212, + "step": 814 + }, + { + "epoch": 0.3264, + "learning_rate": 1.2635363398406133e-05, + "loss": 0.3114, + "step": 816 + }, + { + "epoch": 0.3272, + "learning_rate": 1.266229118282012e-05, + "loss": 0.469, + "step": 818 + }, + { + "epoch": 0.328, + "learning_rate": 1.2689198206152644e-05, + "loss": 0.237, + "step": 820 + }, + { + "epoch": 0.3288, + "learning_rate": 1.2716084258577373e-05, + "loss": 0.6853, + "step": 822 + }, + { + "epoch": 0.3296, + "learning_rate": 1.2742949130431468e-05, + "loss": 0.3348, + "step": 824 + }, + { + "epoch": 0.3304, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.135, + "step": 826 + }, + { + "epoch": 0.3312, + "learning_rate": 1.2796614494603795e-05, + "loss": 0.1613, + "step": 828 + }, + { + "epoch": 0.332, + "learning_rate": 1.282341456842876e-05, + "loss": 0.4998, + "step": 830 + }, + { + "epoch": 0.3328, + "learning_rate": 1.2850192624699756e-05, + "loss": 1.1045, + "step": 832 + }, + { + "epoch": 0.3336, + "learning_rate": 1.2876948454596122e-05, + "loss": 0.1457, + "step": 834 + }, + { + "epoch": 0.3344, + "learning_rate": 1.2903681849470535e-05, + "loss": 0.1782, + "step": 836 + }, + { + "epoch": 0.3352, + "learning_rate": 1.2930392600850565e-05, + "loss": 0.5431, + "step": 838 + }, + { + "epoch": 0.336, + "learning_rate": 1.2957080500440455e-05, + "loss": 0.0569, + "step": 840 + }, + { + "epoch": 0.3368, + "learning_rate": 1.2983745340122589e-05, + "loss": 0.7322, + "step": 842 + }, + { + "epoch": 0.3376, + "learning_rate": 1.3010386911959205e-05, + "loss": 0.291, + "step": 844 + }, + { + "epoch": 0.3384, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.1791, + "step": 846 + }, + { + "epoch": 0.3392, + "learning_rate": 1.3063599421253556e-05, + "loss": 0.1248, + "step": 848 + }, + { + "epoch": 0.34, + "learning_rate": 1.309016994374947e-05, + "loss": 0.0388, + "step": 850 + }, + { + "epoch": 0.3408, + "learning_rate": 1.3116716368479415e-05, + "loss": 0.6019, + "step": 852 + }, + { + "epoch": 0.3416, + "learning_rate": 1.3143238488429049e-05, + "loss": 0.0919, + "step": 854 + }, + { + "epoch": 0.3424, + "learning_rate": 1.316973609677351e-05, + "loss": 0.3312, + "step": 856 + }, + { + "epoch": 0.3432, + "learning_rate": 1.319620898687917e-05, + "loss": 0.1327, + "step": 858 + }, + { + "epoch": 0.344, + "learning_rate": 1.32226569523051e-05, + "loss": 0.1389, + "step": 860 + }, + { + "epoch": 0.3448, + "learning_rate": 1.324907978680475e-05, + "loss": 0.0002, + "step": 862 + }, + { + "epoch": 0.3456, + "learning_rate": 1.3275477284327572e-05, + "loss": 0.0707, + "step": 864 + }, + { + "epoch": 0.3464, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.125, + "step": 866 + }, + { + "epoch": 0.3472, + "learning_rate": 1.3328195445229865e-05, + "loss": 0.0005, + "step": 868 + }, + { + "epoch": 0.348, + "learning_rate": 1.3354515697502548e-05, + "loss": 0.2939, + "step": 870 + }, + { + "epoch": 0.3488, + "learning_rate": 1.338080979058797e-05, + "loss": 0.0782, + "step": 872 + }, + { + "epoch": 0.3496, + "learning_rate": 1.340707751943951e-05, + "loss": 0.0007, + "step": 874 + }, + { + "epoch": 0.3504, + "learning_rate": 1.3433318679216145e-05, + "loss": 0.0293, + "step": 876 + }, + { + "epoch": 0.3512, + "learning_rate": 1.3459533065284039e-05, + "loss": 0.0584, + "step": 878 + }, + { + "epoch": 0.352, + "learning_rate": 1.348572047321814e-05, + "loss": 0.0979, + "step": 880 + }, + { + "epoch": 0.3528, + "learning_rate": 1.3511880698803803e-05, + "loss": 0.3399, + "step": 882 + }, + { + "epoch": 0.3536, + "learning_rate": 1.3538013538038296e-05, + "loss": 0.1621, + "step": 884 + }, + { + "epoch": 0.3544, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.003, + "step": 886 + }, + { + "epoch": 0.3552, + "learning_rate": 1.3590196242512461e-05, + "loss": 0.0011, + "step": 888 + }, + { + "epoch": 0.356, + "learning_rate": 1.361624570082092e-05, + "loss": 0.0428, + "step": 890 + }, + { + "epoch": 0.3568, + "learning_rate": 1.364226695891898e-05, + "loss": 0.0089, + "step": 892 + }, + { + "epoch": 0.3576, + "learning_rate": 1.3668259813887637e-05, + "loss": 0.1391, + "step": 894 + }, + { + "epoch": 0.3584, + "learning_rate": 1.3694224063029386e-05, + "loss": 0.2308, + "step": 896 + }, + { + "epoch": 0.3592, + "learning_rate": 1.3720159503869806e-05, + "loss": 0.5499, + "step": 898 + }, + { + "epoch": 0.36, + "learning_rate": 1.374606593415911e-05, + "loss": 0.1694, + "step": 900 + }, + { + "epoch": 0.3608, + "learning_rate": 1.377194315187377e-05, + "loss": 0.5252, + "step": 902 + }, + { + "epoch": 0.3616, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.0061, + "step": 904 + }, + { + "epoch": 0.3624, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.2116, + "step": 906 + }, + { + "epoch": 0.3632, + "learning_rate": 1.3849397512760793e-05, + "loss": 0.8643, + "step": 908 + }, + { + "epoch": 0.364, + "learning_rate": 1.3875155864521027e-05, + "loss": 0.3853, + "step": 910 + }, + { + "epoch": 0.3648, + "learning_rate": 1.3900883997037393e-05, + "loss": 0.1275, + "step": 912 + }, + { + "epoch": 0.3656, + "learning_rate": 1.3926581709676746e-05, + "loss": 0.1728, + "step": 914 + }, + { + "epoch": 0.3664, + "learning_rate": 1.3952248802043158e-05, + "loss": 0.308, + "step": 916 + }, + { + "epoch": 0.3672, + "learning_rate": 1.397788507397949e-05, + "loss": 0.1, + "step": 918 + }, + { + "epoch": 0.368, + "learning_rate": 1.4003490325568956e-05, + "loss": 0.1487, + "step": 920 + }, + { + "epoch": 0.3688, + "learning_rate": 1.4029064357136632e-05, + "loss": 0.5198, + "step": 922 + }, + { + "epoch": 0.3696, + "learning_rate": 1.4054606969251096e-05, + "loss": 0.0913, + "step": 924 + }, + { + "epoch": 0.3704, + "learning_rate": 1.4080117962725929e-05, + "loss": 0.2014, + "step": 926 + }, + { + "epoch": 0.3712, + "learning_rate": 1.4105597138621281e-05, + "loss": 0.005, + "step": 928 + }, + { + "epoch": 0.372, + "learning_rate": 1.4131044298245416e-05, + "loss": 0.0427, + "step": 930 + }, + { + "epoch": 0.3728, + "learning_rate": 1.4156459243156275e-05, + "loss": 0.0029, + "step": 932 + }, + { + "epoch": 0.3736, + "learning_rate": 1.418184177516301e-05, + "loss": 0.391, + "step": 934 + }, + { + "epoch": 0.3744, + "learning_rate": 1.420719169632754e-05, + "loss": 0.5573, + "step": 936 + }, + { + "epoch": 0.3752, + "learning_rate": 1.4232508808966085e-05, + "loss": 0.0019, + "step": 938 + }, + { + "epoch": 0.376, + "learning_rate": 1.4257792915650735e-05, + "loss": 0.005, + "step": 940 + }, + { + "epoch": 0.3768, + "learning_rate": 1.4283043819210906e-05, + "loss": 0.3926, + "step": 942 + }, + { + "epoch": 0.3776, + "learning_rate": 1.430826132273499e-05, + "loss": 0.1287, + "step": 944 + }, + { + "epoch": 0.3784, + "learning_rate": 1.4333445229571857e-05, + "loss": 0.0007, + "step": 946 + }, + { + "epoch": 0.3792, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.0114, + "step": 948 + }, + { + "epoch": 0.38, + "learning_rate": 1.4383711467890772e-05, + "loss": 0.1224, + "step": 950 + }, + { + "epoch": 0.3808, + "learning_rate": 1.4408793407386584e-05, + "loss": 0.0004, + "step": 952 + }, + { + "epoch": 0.3816, + "learning_rate": 1.4433840966225767e-05, + "loss": 0.2015, + "step": 954 + }, + { + "epoch": 0.3824, + "learning_rate": 1.4458853949082434e-05, + "loss": 0.0291, + "step": 956 + }, + { + "epoch": 0.3832, + "learning_rate": 1.4483832160900332e-05, + "loss": 0.0001, + "step": 958 + }, + { + "epoch": 0.384, + "learning_rate": 1.4508775406894315e-05, + "loss": 0.1113, + "step": 960 + }, + { + "epoch": 0.3848, + "learning_rate": 1.4533683492551942e-05, + "loss": 0.0015, + "step": 962 + }, + { + "epoch": 0.3856, + "learning_rate": 1.4558556223634988e-05, + "loss": 0.6482, + "step": 964 + }, + { + "epoch": 0.3864, + "learning_rate": 1.4583393406180886e-05, + "loss": 0.2356, + "step": 966 + }, + { + "epoch": 0.3872, + "learning_rate": 1.460819484650431e-05, + "loss": 0.103, + "step": 968 + }, + { + "epoch": 0.388, + "learning_rate": 1.4632960351198618e-05, + "loss": 0.6899, + "step": 970 + }, + { + "epoch": 0.3888, + "learning_rate": 1.4657689727137441e-05, + "loss": 0.2278, + "step": 972 + }, + { + "epoch": 0.3896, + "learning_rate": 1.468238278147614e-05, + "loss": 0.2956, + "step": 974 + }, + { + "epoch": 0.3904, + "learning_rate": 1.470703932165332e-05, + "loss": 1.3894, + "step": 976 + }, + { + "epoch": 0.3912, + "learning_rate": 1.4731659155392339e-05, + "loss": 0.0808, + "step": 978 + }, + { + "epoch": 0.392, + "learning_rate": 1.4756242090702744e-05, + "loss": 0.1975, + "step": 980 + }, + { + "epoch": 0.3928, + "learning_rate": 1.4780787935881913e-05, + "loss": 0.2071, + "step": 982 + }, + { + "epoch": 0.3936, + "learning_rate": 1.4805296499516397e-05, + "loss": 0.2383, + "step": 984 + }, + { + "epoch": 0.3944, + "learning_rate": 1.482976759048351e-05, + "loss": 0.029, + "step": 986 + }, + { + "epoch": 0.3952, + "learning_rate": 1.485420101795274e-05, + "loss": 0.4407, + "step": 988 + }, + { + "epoch": 0.396, + "learning_rate": 1.4878596591387327e-05, + "loss": 0.2196, + "step": 990 + }, + { + "epoch": 0.3968, + "learning_rate": 1.4902954120545686e-05, + "loss": 0.1377, + "step": 992 + }, + { + "epoch": 0.3976, + "learning_rate": 1.4927273415482913e-05, + "loss": 0.0063, + "step": 994 + }, + { + "epoch": 0.3984, + "learning_rate": 1.4951554286552261e-05, + "loss": 0.1238, + "step": 996 + }, + { + "epoch": 0.3992, + "learning_rate": 1.4975796544406617e-05, + "loss": 0.5682, + "step": 998 + }, + { + "epoch": 0.4, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.2291, + "step": 1000 + }, + { + "epoch": 0.4008, + "learning_rate": 1.502416446458897e-05, + "loss": 0.2126, + "step": 1002 + }, + { + "epoch": 0.4016, + "learning_rate": 1.5048289749734206e-05, + "loss": 0.0218, + "step": 1004 + }, + { + "epoch": 0.4024, + "learning_rate": 1.5072375667301895e-05, + "loss": 0.2061, + "step": 1006 + }, + { + "epoch": 0.4032, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.2672, + "step": 1008 + }, + { + "epoch": 0.404, + "learning_rate": 1.5120428648705714e-05, + "loss": 0.5395, + "step": 1010 + }, + { + "epoch": 0.4048, + "learning_rate": 1.5144395337815064e-05, + "loss": 0.0367, + "step": 1012 + }, + { + "epoch": 0.4056, + "learning_rate": 1.5168321909896166e-05, + "loss": 0.2608, + "step": 1014 + }, + { + "epoch": 0.4064, + "learning_rate": 1.5192208178364808e-05, + "loss": 0.3243, + "step": 1016 + }, + { + "epoch": 0.4072, + "learning_rate": 1.521605395695107e-05, + "loss": 0.2637, + "step": 1018 + }, + { + "epoch": 0.408, + "learning_rate": 1.5239859059700784e-05, + "loss": 0.068, + "step": 1020 + }, + { + "epoch": 0.4088, + "learning_rate": 1.526362330097697e-05, + "loss": 0.2439, + "step": 1022 + }, + { + "epoch": 0.4096, + "learning_rate": 1.5287346495461322e-05, + "loss": 0.2942, + "step": 1024 + }, + { + "epoch": 0.4104, + "learning_rate": 1.531102845815557e-05, + "loss": 0.0409, + "step": 1026 + }, + { + "epoch": 0.4112, + "learning_rate": 1.5334669004383025e-05, + "loss": 0.2306, + "step": 1028 + }, + { + "epoch": 0.412, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.0219, + "step": 1030 + }, + { + "epoch": 0.4128, + "learning_rate": 1.5381825110347072e-05, + "loss": 0.0225, + "step": 1032 + }, + { + "epoch": 0.4136, + "learning_rate": 1.540534030235087e-05, + "loss": 0.2414, + "step": 1034 + }, + { + "epoch": 0.4144, + "learning_rate": 1.542881334242517e-05, + "loss": 0.0773, + "step": 1036 + }, + { + "epoch": 0.4152, + "learning_rate": 1.5452244047522493e-05, + "loss": 0.0168, + "step": 1038 + }, + { + "epoch": 0.416, + "learning_rate": 1.5475632234925495e-05, + "loss": 0.9456, + "step": 1040 + }, + { + "epoch": 0.4168, + "learning_rate": 1.5498977722248388e-05, + "loss": 0.0024, + "step": 1042 + }, + { + "epoch": 0.4176, + "learning_rate": 1.552228032743839e-05, + "loss": 0.8664, + "step": 1044 + }, + { + "epoch": 0.4184, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.1539, + "step": 1046 + }, + { + "epoch": 0.4192, + "learning_rate": 1.556875616488188e-05, + "loss": 0.2183, + "step": 1048 + }, + { + "epoch": 0.42, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.5243, + "step": 1050 + }, + { + "epoch": 0.4208, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.0663, + "step": 1052 + }, + { + "epoch": 0.4216, + "learning_rate": 1.5638143773034268e-05, + "loss": 0.3099, + "step": 1054 + }, + { + "epoch": 0.4224, + "learning_rate": 1.5661185281143663e-05, + "loss": 0.2124, + "step": 1056 + }, + { + "epoch": 0.4232, + "learning_rate": 1.5684182642193024e-05, + "loss": 0.1332, + "step": 1058 + }, + { + "epoch": 0.424, + "learning_rate": 1.5707135676844312e-05, + "loss": 0.5211, + "step": 1060 + }, + { + "epoch": 0.4248, + "learning_rate": 1.5730044206105146e-05, + "loss": 0.0595, + "step": 1062 + }, + { + "epoch": 0.4256, + "learning_rate": 1.5752908051330232e-05, + "loss": 0.2272, + "step": 1064 + }, + { + "epoch": 0.4264, + "learning_rate": 1.577572703422268e-05, + "loss": 0.1463, + "step": 1066 + }, + { + "epoch": 0.4272, + "learning_rate": 1.579850097683548e-05, + "loss": 0.1024, + "step": 1068 + }, + { + "epoch": 0.428, + "learning_rate": 1.582122970157288e-05, + "loss": 0.0008, + "step": 1070 + }, + { + "epoch": 0.4288, + "learning_rate": 1.5843913031191722e-05, + "loss": 0.5403, + "step": 1072 + }, + { + "epoch": 0.4296, + "learning_rate": 1.586655078880281e-05, + "loss": 0.0548, + "step": 1074 + }, + { + "epoch": 0.4304, + "learning_rate": 1.5889142797872383e-05, + "loss": 0.302, + "step": 1076 + }, + { + "epoch": 0.4312, + "learning_rate": 1.5911688882223415e-05, + "loss": 0.1395, + "step": 1078 + }, + { + "epoch": 0.432, + "learning_rate": 1.5934188866037007e-05, + "loss": 0.2069, + "step": 1080 + }, + { + "epoch": 0.4328, + "learning_rate": 1.5956642573853787e-05, + "loss": 0.2919, + "step": 1082 + }, + { + "epoch": 0.4336, + "learning_rate": 1.5979049830575193e-05, + "loss": 0.0096, + "step": 1084 + }, + { + "epoch": 0.4344, + "learning_rate": 1.6001410461464945e-05, + "loss": 0.0035, + "step": 1086 + }, + { + "epoch": 0.4352, + "learning_rate": 1.6023724292150377e-05, + "loss": 0.3364, + "step": 1088 + }, + { + "epoch": 0.436, + "learning_rate": 1.604599114862375e-05, + "loss": 0.7331, + "step": 1090 + }, + { + "epoch": 0.4368, + "learning_rate": 1.606821085724362e-05, + "loss": 0.1854, + "step": 1092 + }, + { + "epoch": 0.4376, + "learning_rate": 1.6090383244736253e-05, + "loss": 0.1, + "step": 1094 + }, + { + "epoch": 0.4384, + "learning_rate": 1.6112508138196912e-05, + "loss": 0.6034, + "step": 1096 + }, + { + "epoch": 0.4392, + "learning_rate": 1.613458536509124e-05, + "loss": 0.1593, + "step": 1098 + }, + { + "epoch": 0.44, + "learning_rate": 1.615661475325658e-05, + "loss": 0.114, + "step": 1100 + }, + { + "epoch": 0.4408, + "learning_rate": 1.6178596130903352e-05, + "loss": 0.0007, + "step": 1102 + }, + { + "epoch": 0.4416, + "learning_rate": 1.620052932661632e-05, + "loss": 0.1292, + "step": 1104 + }, + { + "epoch": 0.4424, + "learning_rate": 1.6222414169356056e-05, + "loss": 0.0197, + "step": 1106 + }, + { + "epoch": 0.4432, + "learning_rate": 1.6244250488460146e-05, + "loss": 0.0112, + "step": 1108 + }, + { + "epoch": 0.444, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.0828, + "step": 1110 + }, + { + "epoch": 0.4448, + "learning_rate": 1.6287776875005127e-05, + "loss": 0.0151, + "step": 1112 + }, + { + "epoch": 0.4456, + "learning_rate": 1.6309466603018497e-05, + "loss": 0.5253, + "step": 1114 + }, + { + "epoch": 0.4464, + "learning_rate": 1.6331107128543856e-05, + "loss": 0.0815, + "step": 1116 + }, + { + "epoch": 0.4472, + "learning_rate": 1.635269828282404e-05, + "loss": 0.3211, + "step": 1118 + }, + { + "epoch": 0.448, + "learning_rate": 1.6374239897486905e-05, + "loss": 0.0036, + "step": 1120 + }, + { + "epoch": 0.4488, + "learning_rate": 1.6395731804546575e-05, + "loss": 0.4881, + "step": 1122 + }, + { + "epoch": 0.4496, + "learning_rate": 1.6417173836404878e-05, + "loss": 0.0635, + "step": 1124 + }, + { + "epoch": 0.4504, + "learning_rate": 1.643856582585253e-05, + "loss": 0.2656, + "step": 1126 + }, + { + "epoch": 0.4512, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.2487, + "step": 1128 + }, + { + "epoch": 0.452, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.1444, + "step": 1130 + }, + { + "epoch": 0.4528, + "learning_rate": 1.650243987350029e-05, + "loss": 0.2299, + "step": 1132 + }, + { + "epoch": 0.4536, + "learning_rate": 1.652363002903693e-05, + "loss": 0.0221, + "step": 1134 + }, + { + "epoch": 0.4544, + "learning_rate": 1.6544769311996146e-05, + "loss": 0.4045, + "step": 1136 + }, + { + "epoch": 0.4552, + "learning_rate": 1.656585755752956e-05, + "loss": 0.6586, + "step": 1138 + }, + { + "epoch": 0.456, + "learning_rate": 1.65868946011868e-05, + "loss": 0.2365, + "step": 1140 + }, + { + "epoch": 0.4568, + "learning_rate": 1.660788027891677e-05, + "loss": 0.2895, + "step": 1142 + }, + { + "epoch": 0.4576, + "learning_rate": 1.6628814427068944e-05, + "loss": 0.1412, + "step": 1144 + }, + { + "epoch": 0.4584, + "learning_rate": 1.6649696882394625e-05, + "loss": 0.0907, + "step": 1146 + }, + { + "epoch": 0.4592, + "learning_rate": 1.667052748204825e-05, + "loss": 0.3012, + "step": 1148 + }, + { + "epoch": 0.46, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.1167, + "step": 1150 + }, + { + "epoch": 0.4608, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.1889, + "step": 1152 + }, + { + "epoch": 0.4616, + "learning_rate": 1.6732706524594138e-05, + "loss": 0.3086, + "step": 1154 + }, + { + "epoch": 0.4624, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.2174, + "step": 1156 + }, + { + "epoch": 0.4632, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.26, + "step": 1158 + }, + { + "epoch": 0.464, + "learning_rate": 1.679441304261516e-05, + "loss": 0.2006, + "step": 1160 + }, + { + "epoch": 0.4648, + "learning_rate": 1.681487612701519e-05, + "loss": 0.3664, + "step": 1162 + }, + { + "epoch": 0.4656, + "learning_rate": 1.683528606764222e-05, + "loss": 0.0003, + "step": 1164 + }, + { + "epoch": 0.4664, + "learning_rate": 1.6855642705335428e-05, + "loss": 0.0026, + "step": 1166 + }, + { + "epoch": 0.4672, + "learning_rate": 1.687594588134968e-05, + "loss": 0.2878, + "step": 1168 + }, + { + "epoch": 0.468, + "learning_rate": 1.68961954373567e-05, + "loss": 0.0098, + "step": 1170 + }, + { + "epoch": 0.4688, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.0586, + "step": 1172 + }, + { + "epoch": 0.4696, + "learning_rate": 1.693653305812805e-05, + "loss": 0.0778, + "step": 1174 + }, + { + "epoch": 0.4704, + "learning_rate": 1.6956620808331505e-05, + "loss": 0.538, + "step": 1176 + }, + { + "epoch": 0.4712, + "learning_rate": 1.697665430940846e-05, + "loss": 0.4076, + "step": 1178 + }, + { + "epoch": 0.472, + "learning_rate": 1.699663340513365e-05, + "loss": 0.0086, + "step": 1180 + }, + { + "epoch": 0.4728, + "learning_rate": 1.7016557939706068e-05, + "loss": 0.5348, + "step": 1182 + }, + { + "epoch": 0.4736, + "learning_rate": 1.7036427757750198e-05, + "loss": 0.6041, + "step": 1184 + }, + { + "epoch": 0.4744, + "learning_rate": 1.7056242704317212e-05, + "loss": 0.0343, + "step": 1186 + }, + { + "epoch": 0.4752, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.2332, + "step": 1188 + }, + { + "epoch": 0.476, + "learning_rate": 1.709570736536521e-05, + "loss": 0.1139, + "step": 1190 + }, + { + "epoch": 0.4768, + "learning_rate": 1.7115356772092844e-05, + "loss": 0.8086, + "step": 1192 + }, + { + "epoch": 0.4776, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.3751, + "step": 1194 + }, + { + "epoch": 0.4784, + "learning_rate": 1.7154488971806518e-05, + "loss": 0.0038, + "step": 1196 + }, + { + "epoch": 0.4792, + "learning_rate": 1.7173971459631783e-05, + "loss": 0.5315, + "step": 1198 + }, + { + "epoch": 0.48, + "learning_rate": 1.7193398003386507e-05, + "loss": 0.4179, + "step": 1200 + }, + { + "epoch": 0.4808, + "learning_rate": 1.7212768451578602e-05, + "loss": 0.0076, + "step": 1202 + }, + { + "epoch": 0.4816, + "learning_rate": 1.7232082653153416e-05, + "loss": 0.3609, + "step": 1204 + }, + { + "epoch": 0.4824, + "learning_rate": 1.7251340457494937e-05, + "loss": 0.1177, + "step": 1206 + }, + { + "epoch": 0.4832, + "learning_rate": 1.7270541714426923e-05, + "loss": 0.3364, + "step": 1208 + }, + { + "epoch": 0.484, + "learning_rate": 1.7289686274214106e-05, + "loss": 0.2662, + "step": 1210 + }, + { + "epoch": 0.4848, + "learning_rate": 1.7308773987563393e-05, + "loss": 0.5623, + "step": 1212 + }, + { + "epoch": 0.4856, + "learning_rate": 1.732780470562496e-05, + "loss": 0.277, + "step": 1214 + }, + { + "epoch": 0.4864, + "learning_rate": 1.7346778279993413e-05, + "loss": 0.0962, + "step": 1216 + }, + { + "epoch": 0.4872, + "learning_rate": 1.736569456270903e-05, + "loss": 0.043, + "step": 1218 + }, + { + "epoch": 0.488, + "learning_rate": 1.7384553406258836e-05, + "loss": 0.6313, + "step": 1220 + }, + { + "epoch": 0.4888, + "learning_rate": 1.740335466357778e-05, + "loss": 0.1333, + "step": 1222 + }, + { + "epoch": 0.4896, + "learning_rate": 1.7422098188049888e-05, + "loss": 0.0028, + "step": 1224 + }, + { + "epoch": 0.4904, + "learning_rate": 1.7440783833509373e-05, + "loss": 0.3986, + "step": 1226 + }, + { + "epoch": 0.4912, + "learning_rate": 1.7459411454241816e-05, + "loss": 0.2498, + "step": 1228 + }, + { + "epoch": 0.492, + "learning_rate": 1.747798090498531e-05, + "loss": 0.1344, + "step": 1230 + }, + { + "epoch": 0.4928, + "learning_rate": 1.749649204093154e-05, + "loss": 0.5114, + "step": 1232 + }, + { + "epoch": 0.4936, + "learning_rate": 1.7514944717726962e-05, + "loss": 0.2453, + "step": 1234 + }, + { + "epoch": 0.4944, + "learning_rate": 1.753333879147387e-05, + "loss": 0.5509, + "step": 1236 + }, + { + "epoch": 0.4952, + "learning_rate": 1.755167411873159e-05, + "loss": 0.0109, + "step": 1238 + }, + { + "epoch": 0.496, + "learning_rate": 1.7569950556517563e-05, + "loss": 0.0461, + "step": 1240 + }, + { + "epoch": 0.4968, + "learning_rate": 1.758816796230845e-05, + "loss": 0.47, + "step": 1242 + }, + { + "epoch": 0.4976, + "learning_rate": 1.7606326194041278e-05, + "loss": 0.523, + "step": 1244 + }, + { + "epoch": 0.4984, + "learning_rate": 1.762442511011447e-05, + "loss": 0.2827, + "step": 1246 + }, + { + "epoch": 0.4992, + "learning_rate": 1.7642464569389083e-05, + "loss": 0.0859, + "step": 1248 + }, + { + "epoch": 0.5, + "learning_rate": 1.766044443118977e-05, + "loss": 0.6452, + "step": 1250 + }, + { + "epoch": 0.5008, + "learning_rate": 1.767836455530598e-05, + "loss": 0.0036, + "step": 1252 + }, + { + "epoch": 0.5016, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.2799, + "step": 1254 + }, + { + "epoch": 0.5024, + "learning_rate": 1.77140250319729e-05, + "loss": 0.1759, + "step": 1256 + }, + { + "epoch": 0.5032, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.2439, + "step": 1258 + }, + { + "epoch": 0.504, + "learning_rate": 1.7749444887041793e-05, + "loss": 0.0171, + "step": 1260 + }, + { + "epoch": 0.5048, + "learning_rate": 1.776706423591959e-05, + "loss": 0.0232, + "step": 1262 + }, + { + "epoch": 0.5056, + "learning_rate": 1.778462301567023e-05, + "loss": 0.0059, + "step": 1264 + }, + { + "epoch": 0.5064, + "learning_rate": 1.7802121089366832e-05, + "loss": 0.163, + "step": 1266 + }, + { + "epoch": 0.5072, + "learning_rate": 1.7819558320555895e-05, + "loss": 0.4731, + "step": 1268 + }, + { + "epoch": 0.508, + "learning_rate": 1.7836934573258392e-05, + "loss": 0.2923, + "step": 1270 + }, + { + "epoch": 0.5088, + "learning_rate": 1.785424971197082e-05, + "loss": 0.3556, + "step": 1272 + }, + { + "epoch": 0.5096, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.1742, + "step": 1274 + }, + { + "epoch": 0.5104, + "learning_rate": 1.7888696107795343e-05, + "loss": 0.0148, + "step": 1276 + }, + { + "epoch": 0.5112, + "learning_rate": 1.790582709628753e-05, + "loss": 0.5125, + "step": 1278 + }, + { + "epoch": 0.512, + "learning_rate": 1.7922896433551903e-05, + "loss": 0.1765, + "step": 1280 + }, + { + "epoch": 0.5128, + "learning_rate": 1.793990398647835e-05, + "loss": 0.1429, + "step": 1282 + }, + { + "epoch": 0.5136, + "learning_rate": 1.795684962243855e-05, + "loss": 0.1078, + "step": 1284 + }, + { + "epoch": 0.5144, + "learning_rate": 1.7973733209287032e-05, + "loss": 0.0276, + "step": 1286 + }, + { + "epoch": 0.5152, + "learning_rate": 1.7990554615362193e-05, + "loss": 0.4077, + "step": 1288 + }, + { + "epoch": 0.516, + "learning_rate": 1.800731370948734e-05, + "loss": 0.0009, + "step": 1290 + }, + { + "epoch": 0.5168, + "learning_rate": 1.802401036097167e-05, + "loss": 0.005, + "step": 1292 + }, + { + "epoch": 0.5176, + "learning_rate": 1.804064443961135e-05, + "loss": 0.093, + "step": 1294 + }, + { + "epoch": 0.5184, + "learning_rate": 1.8057215815690494e-05, + "loss": 0.2657, + "step": 1296 + }, + { + "epoch": 0.5192, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.3594, + "step": 1298 + }, + { + "epoch": 0.52, + "learning_rate": 1.809016994374947e-05, + "loss": 0.0735, + "step": 1300 + }, + { + "epoch": 0.5208, + "learning_rate": 1.81065524387464e-05, + "loss": 0.3146, + "step": 1302 + }, + { + "epoch": 0.5216, + "learning_rate": 1.8122871717218968e-05, + "loss": 0.3487, + "step": 1304 + }, + { + "epoch": 0.5224, + "learning_rate": 1.8139127651906176e-05, + "loss": 0.1389, + "step": 1306 + }, + { + "epoch": 0.5232, + "learning_rate": 1.8155320116040976e-05, + "loss": 0.034, + "step": 1308 + }, + { + "epoch": 0.524, + "learning_rate": 1.817144898335129e-05, + "loss": 0.0303, + "step": 1310 + }, + { + "epoch": 0.5248, + "learning_rate": 1.818751412806095e-05, + "loss": 0.4922, + "step": 1312 + }, + { + "epoch": 0.5256, + "learning_rate": 1.8203515424890738e-05, + "loss": 0.1506, + "step": 1314 + }, + { + "epoch": 0.5264, + "learning_rate": 1.8219452749059322e-05, + "loss": 0.2849, + "step": 1316 + }, + { + "epoch": 0.5272, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.0498, + "step": 1318 + }, + { + "epoch": 0.528, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.0269, + "step": 1320 + }, + { + "epoch": 0.5288, + "learning_rate": 1.826687964527355e-05, + "loss": 0.1825, + "step": 1322 + }, + { + "epoch": 0.5296, + "learning_rate": 1.828255984097604e-05, + "loss": 0.2941, + "step": 1324 + }, + { + "epoch": 0.5304, + "learning_rate": 1.8298175447613093e-05, + "loss": 0.1251, + "step": 1326 + }, + { + "epoch": 0.5312, + "learning_rate": 1.8313726343411092e-05, + "loss": 0.3087, + "step": 1328 + }, + { + "epoch": 0.532, + "learning_rate": 1.8329212407101e-05, + "loss": 0.1309, + "step": 1330 + }, + { + "epoch": 0.5328, + "learning_rate": 1.8344633517919394e-05, + "loss": 0.2697, + "step": 1332 + }, + { + "epoch": 0.5336, + "learning_rate": 1.8359989555609344e-05, + "loss": 0.0675, + "step": 1334 + }, + { + "epoch": 0.5344, + "learning_rate": 1.8375280400421407e-05, + "loss": 0.0053, + "step": 1336 + }, + { + "epoch": 0.5352, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.0951, + "step": 1338 + }, + { + "epoch": 0.536, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.1988, + "step": 1340 + }, + { + "epoch": 0.5368, + "learning_rate": 1.842076058772692e-05, + "loss": 0.4745, + "step": 1342 + }, + { + "epoch": 0.5376, + "learning_rate": 1.8435789473714384e-05, + "loss": 0.1565, + "step": 1344 + }, + { + "epoch": 0.5384, + "learning_rate": 1.8450752575720964e-05, + "loss": 0.0017, + "step": 1346 + }, + { + "epoch": 0.5392, + "learning_rate": 1.8465649777061384e-05, + "loss": 0.2025, + "step": 1348 + }, + { + "epoch": 0.54, + "learning_rate": 1.8480480961564266e-05, + "loss": 0.1196, + "step": 1350 + }, + { + "epoch": 0.5408, + "learning_rate": 1.8495246013573047e-05, + "loss": 0.1705, + "step": 1352 + }, + { + "epoch": 0.5416, + "learning_rate": 1.850994481794691e-05, + "loss": 0.0036, + "step": 1354 + }, + { + "epoch": 0.5424, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.0589, + "step": 1356 + }, + { + "epoch": 0.5432, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.1228, + "step": 1358 + }, + { + "epoch": 0.544, + "learning_rate": 1.8553642601605066e-05, + "loss": 0.1596, + "step": 1360 + }, + { + "epoch": 0.5448, + "learning_rate": 1.856807527437643e-05, + "loss": 0.9909, + "step": 1362 + }, + { + "epoch": 0.5456, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.013, + "step": 1364 + }, + { + "epoch": 0.5464, + "learning_rate": 1.859674006117491e-05, + "loss": 0.7559, + "step": 1366 + }, + { + "epoch": 0.5472, + "learning_rate": 1.8610971951668268e-05, + "loss": 0.0727, + "step": 1368 + }, + { + "epoch": 0.548, + "learning_rate": 1.862513669207257e-05, + "loss": 0.0031, + "step": 1370 + }, + { + "epoch": 0.5488, + "learning_rate": 1.8639234171928348e-05, + "loss": 0.0005, + "step": 1372 + }, + { + "epoch": 0.5496, + "learning_rate": 1.8653264281300612e-05, + "loss": 0.0578, + "step": 1374 + }, + { + "epoch": 0.5504, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.039, + "step": 1376 + }, + { + "epoch": 0.5512, + "learning_rate": 1.8681121951482393e-05, + "loss": 0.2802, + "step": 1378 + }, + { + "epoch": 0.552, + "learning_rate": 1.869494929505219e-05, + "loss": 0.0005, + "step": 1380 + }, + { + "epoch": 0.5528, + "learning_rate": 1.870870883366075e-05, + "loss": 0.1665, + "step": 1382 + }, + { + "epoch": 0.5536, + "learning_rate": 1.8722400460008434e-05, + "loss": 0.0063, + "step": 1384 + }, + { + "epoch": 0.5544, + "learning_rate": 1.8736024067325195e-05, + "loss": 0.0894, + "step": 1386 + }, + { + "epoch": 0.5552, + "learning_rate": 1.8749579549371373e-05, + "loss": 0.1925, + "step": 1388 + }, + { + "epoch": 0.556, + "learning_rate": 1.876306680043863e-05, + "loss": 0.5828, + "step": 1390 + }, + { + "epoch": 0.5568, + "learning_rate": 1.8776485715350665e-05, + "loss": 0.2086, + "step": 1392 + }, + { + "epoch": 0.5576, + "learning_rate": 1.878983618946409e-05, + "loss": 0.106, + "step": 1394 + }, + { + "epoch": 0.5584, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.3634, + "step": 1396 + }, + { + "epoch": 0.5592, + "learning_rate": 1.881633139939087e-05, + "loss": 0.0748, + "step": 1398 + }, + { + "epoch": 0.56, + "learning_rate": 1.882947592858927e-05, + "loss": 0.2242, + "step": 1400 + }, + { + "epoch": 0.5608, + "learning_rate": 1.884255160376072e-05, + "loss": 0.5622, + "step": 1402 + }, + { + "epoch": 0.5616, + "learning_rate": 1.885555832293849e-05, + "loss": 0.0221, + "step": 1404 + }, + { + "epoch": 0.5624, + "learning_rate": 1.886849598469356e-05, + "loss": 0.0007, + "step": 1406 + }, + { + "epoch": 0.5632, + "learning_rate": 1.888136448813544e-05, + "loss": 0.1448, + "step": 1408 + }, + { + "epoch": 0.564, + "learning_rate": 1.8894163732912972e-05, + "loss": 0.6092, + "step": 1410 + }, + { + "epoch": 0.5648, + "learning_rate": 1.890689361921506e-05, + "loss": 0.3621, + "step": 1412 + }, + { + "epoch": 0.5656, + "learning_rate": 1.891955404777151e-05, + "loss": 0.0166, + "step": 1414 + }, + { + "epoch": 0.5664, + "learning_rate": 1.893214491985374e-05, + "loss": 0.1286, + "step": 1416 + }, + { + "epoch": 0.5672, + "learning_rate": 1.89446661372756e-05, + "loss": 0.5966, + "step": 1418 + }, + { + "epoch": 0.568, + "learning_rate": 1.895711760239413e-05, + "loss": 0.2598, + "step": 1420 + }, + { + "epoch": 0.5688, + "learning_rate": 1.89694992181103e-05, + "loss": 0.667, + "step": 1422 + }, + { + "epoch": 0.5696, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.5182, + "step": 1424 + }, + { + "epoch": 0.5704, + "learning_rate": 1.8994052515663708e-05, + "loss": 0.3818, + "step": 1426 + }, + { + "epoch": 0.5712, + "learning_rate": 1.90062240060294e-05, + "loss": 0.1342, + "step": 1428 + }, + { + "epoch": 0.572, + "learning_rate": 1.9018325264051136e-05, + "loss": 0.3015, + "step": 1430 + }, + { + "epoch": 0.5728, + "learning_rate": 1.9030356195360868e-05, + "loss": 0.0007, + "step": 1432 + }, + { + "epoch": 0.5736, + "learning_rate": 1.904231670613899e-05, + "loss": 0.0006, + "step": 1434 + }, + { + "epoch": 0.5744, + "learning_rate": 1.905420670311502e-05, + "loss": 0.0005, + "step": 1436 + }, + { + "epoch": 0.5752, + "learning_rate": 1.906602609356838e-05, + "loss": 0.5103, + "step": 1438 + }, + { + "epoch": 0.576, + "learning_rate": 1.9077774785329078e-05, + "loss": 0.0044, + "step": 1440 + }, + { + "epoch": 0.5768, + "learning_rate": 1.9089452686778487e-05, + "loss": 0.2297, + "step": 1442 + }, + { + "epoch": 0.5776, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.0073, + "step": 1444 + }, + { + "epoch": 0.5784, + "learning_rate": 1.911259575502962e-05, + "loss": 0.1903, + "step": 1446 + }, + { + "epoch": 0.5792, + "learning_rate": 1.912406074135706e-05, + "loss": 0.1661, + "step": 1448 + }, + { + "epoch": 0.58, + "learning_rate": 1.9135454576426006e-05, + "loss": 0.0213, + "step": 1450 + }, + { + "epoch": 0.5808, + "learning_rate": 1.9146777171385053e-05, + "loss": 0.222, + "step": 1452 + }, + { + "epoch": 0.5816, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.268, + "step": 1454 + }, + { + "epoch": 0.5824, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.3684, + "step": 1456 + }, + { + "epoch": 0.5832, + "learning_rate": 1.9180316635425876e-05, + "loss": 0.1735, + "step": 1458 + }, + { + "epoch": 0.584, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.5014, + "step": 1460 + }, + { + "epoch": 0.5848, + "learning_rate": 1.9202318473658703e-05, + "loss": 0.3514, + "step": 1462 + }, + { + "epoch": 0.5856, + "learning_rate": 1.9213211793237052e-05, + "loss": 0.2858, + "step": 1464 + }, + { + "epoch": 0.5864, + "learning_rate": 1.92240332663391e-05, + "loss": 0.2022, + "step": 1466 + }, + { + "epoch": 0.5872, + "learning_rate": 1.923478280857682e-05, + "loss": 0.2076, + "step": 1468 + }, + { + "epoch": 0.588, + "learning_rate": 1.924546033612313e-05, + "loss": 0.1121, + "step": 1470 + }, + { + "epoch": 0.5888, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.0422, + "step": 1472 + }, + { + "epoch": 0.5896, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.1487, + "step": 1474 + }, + { + "epoch": 0.5904, + "learning_rate": 1.927706000077034e-05, + "loss": 0.3377, + "step": 1476 + }, + { + "epoch": 0.5912, + "learning_rate": 1.9287448642521507e-05, + "loss": 0.3396, + "step": 1478 + }, + { + "epoch": 0.592, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.2554, + "step": 1480 + }, + { + "epoch": 0.5928, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.0008, + "step": 1482 + }, + { + "epoch": 0.5936, + "learning_rate": 1.9318179694207722e-05, + "loss": 0.1203, + "step": 1484 + }, + { + "epoch": 0.5944, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.0004, + "step": 1486 + }, + { + "epoch": 0.5952, + "learning_rate": 1.9338303869951266e-05, + "loss": 0.0002, + "step": 1488 + }, + { + "epoch": 0.596, + "learning_rate": 1.934825676396015e-05, + "loss": 0.0346, + "step": 1490 + }, + { + "epoch": 0.5968, + "learning_rate": 1.935813675838491e-05, + "loss": 0.2663, + "step": 1492 + }, + { + "epoch": 0.5976, + "learning_rate": 1.9367943776179375e-05, + "loss": 0.0301, + "step": 1494 + }, + { + "epoch": 0.5984, + "learning_rate": 1.9377677740866457e-05, + "loss": 0.4738, + "step": 1496 + }, + { + "epoch": 0.5992, + "learning_rate": 1.9387338576538743e-05, + "loss": 1.2062, + "step": 1498 + }, + { + "epoch": 0.6, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.3236, + "step": 1500 + }, + { + "epoch": 0.6008, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.3619, + "step": 1502 + }, + { + "epoch": 0.6016, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.2093, + "step": 1504 + }, + { + "epoch": 0.6024, + "learning_rate": 1.942524913090354e-05, + "loss": 0.0387, + "step": 1506 + }, + { + "epoch": 0.6032, + "learning_rate": 1.9434543202870723e-05, + "loss": 1.9969, + "step": 1508 + }, + { + "epoch": 0.604, + "learning_rate": 1.9443763702374815e-05, + "loss": 0.0078, + "step": 1510 + }, + { + "epoch": 0.6048, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.738, + "step": 1512 + }, + { + "epoch": 0.6056, + "learning_rate": 1.9461983696954756e-05, + "loss": 0.1017, + "step": 1514 + }, + { + "epoch": 0.6064, + "learning_rate": 1.947098304994744e-05, + "loss": 0.013, + "step": 1516 + }, + { + "epoch": 0.6072, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.2554, + "step": 1518 + }, + { + "epoch": 0.608, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.325, + "step": 1520 + }, + { + "epoch": 0.6088, + "learning_rate": 1.949753769132067e-05, + "loss": 0.0313, + "step": 1522 + }, + { + "epoch": 0.6096, + "learning_rate": 1.95062412024896e-05, + "loss": 0.2035, + "step": 1524 + }, + { + "epoch": 0.6104, + "learning_rate": 1.951487058208003e-05, + "loss": 0.2413, + "step": 1526 + }, + { + "epoch": 0.6112, + "learning_rate": 1.952342576279833e-05, + "loss": 0.3224, + "step": 1528 + }, + { + "epoch": 0.612, + "learning_rate": 1.953190667792947e-05, + "loss": 0.0523, + "step": 1530 + }, + { + "epoch": 0.6128, + "learning_rate": 1.9540313261337578e-05, + "loss": 0.6345, + "step": 1532 + }, + { + "epoch": 0.6136, + "learning_rate": 1.954864544746643e-05, + "loss": 0.5308, + "step": 1534 + }, + { + "epoch": 0.6144, + "learning_rate": 1.955690317133996e-05, + "loss": 0.0057, + "step": 1536 + }, + { + "epoch": 0.6152, + "learning_rate": 1.956508636856278e-05, + "loss": 0.6466, + "step": 1538 + }, + { + "epoch": 0.616, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.1571, + "step": 1540 + }, + { + "epoch": 0.6168, + "learning_rate": 1.95812289283811e-05, + "loss": 0.2567, + "step": 1542 + }, + { + "epoch": 0.6176, + "learning_rate": 1.958918816509367e-05, + "loss": 0.3165, + "step": 1544 + }, + { + "epoch": 0.6184, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.2134, + "step": 1546 + }, + { + "epoch": 0.6192, + "learning_rate": 1.9604882241787496e-05, + "loss": 0.0162, + "step": 1548 + }, + { + "epoch": 0.62, + "learning_rate": 1.9612616959383187e-05, + "loss": 0.1221, + "step": 1550 + }, + { + "epoch": 0.6208, + "learning_rate": 1.9620276715860856e-05, + "loss": 0.0007, + "step": 1552 + }, + { + "epoch": 0.6216, + "learning_rate": 1.9627861451488187e-05, + "loss": 0.1751, + "step": 1554 + }, + { + "epoch": 0.6224, + "learning_rate": 1.963537110711789e-05, + "loss": 0.0002, + "step": 1556 + }, + { + "epoch": 0.6232, + "learning_rate": 1.964280562418815e-05, + "loss": 0.0537, + "step": 1558 + }, + { + "epoch": 0.624, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.1969, + "step": 1560 + }, + { + "epoch": 0.6248, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.0648, + "step": 1562 + }, + { + "epoch": 0.6256, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.001, + "step": 1564 + }, + { + "epoch": 0.6264, + "learning_rate": 1.967179115615633e-05, + "loss": 0.1878, + "step": 1566 + }, + { + "epoch": 0.6272, + "learning_rate": 1.967884912252619e-05, + "loss": 0.0762, + "step": 1568 + }, + { + "epoch": 0.628, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.0006, + "step": 1570 + }, + { + "epoch": 0.6288, + "learning_rate": 1.969273856798585e-05, + "loss": 0.1703, + "step": 1572 + }, + { + "epoch": 0.6296, + "learning_rate": 1.9699569938762972e-05, + "loss": 0.7997, + "step": 1574 + }, + { + "epoch": 0.6304, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.014, + "step": 1576 + }, + { + "epoch": 0.6312, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.0642, + "step": 1578 + }, + { + "epoch": 0.632, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.1404, + "step": 1580 + }, + { + "epoch": 0.6328, + "learning_rate": 1.9726138506049434e-05, + "loss": 0.5661, + "step": 1582 + }, + { + "epoch": 0.6336, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.2276, + "step": 1584 + }, + { + "epoch": 0.6344, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.0013, + "step": 1586 + }, + { + "epoch": 0.6352, + "learning_rate": 1.974526872786577e-05, + "loss": 0.3886, + "step": 1588 + }, + { + "epoch": 0.636, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.001, + "step": 1590 + }, + { + "epoch": 0.6368, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.4289, + "step": 1592 + }, + { + "epoch": 0.6376, + "learning_rate": 1.976371499316945e-05, + "loss": 2.414, + "step": 1594 + }, + { + "epoch": 0.6384, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.0298, + "step": 1596 + }, + { + "epoch": 0.6392, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.6218, + "step": 1598 + }, + { + "epoch": 0.64, + "learning_rate": 1.9781476007338054e-05, + "loss": 0.0489, + "step": 1600 + }, + { + "epoch": 0.6408, + "learning_rate": 1.978724385052766e-05, + "loss": 0.2171, + "step": 1602 + }, + { + "epoch": 0.6416, + "learning_rate": 1.9792935370823673e-05, + "loss": 0.0488, + "step": 1604 + }, + { + "epoch": 0.6424, + "learning_rate": 1.979855052384247e-05, + "loss": 0.0578, + "step": 1606 + }, + { + "epoch": 0.6432, + "learning_rate": 1.9804089265795956e-05, + "loss": 0.366, + "step": 1608 + }, + { + "epoch": 0.644, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.0702, + "step": 1610 + }, + { + "epoch": 0.6448, + "learning_rate": 1.981493734433433e-05, + "loss": 0.012, + "step": 1612 + }, + { + "epoch": 0.6456, + "learning_rate": 1.982024659632372e-05, + "loss": 1.024, + "step": 1614 + }, + { + "epoch": 0.6464, + "learning_rate": 1.9825479268057472e-05, + "loss": 0.2509, + "step": 1616 + }, + { + "epoch": 0.6472, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.0481, + "step": 1618 + }, + { + "epoch": 0.648, + "learning_rate": 1.9835714708133858e-05, + "loss": 0.0422, + "step": 1620 + }, + { + "epoch": 0.6488, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.002, + "step": 1622 + }, + { + "epoch": 0.6496, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.0036, + "step": 1624 + }, + { + "epoch": 0.6504, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.465, + "step": 1626 + }, + { + "epoch": 0.6512, + "learning_rate": 1.985526486983063e-05, + "loss": 0.0123, + "step": 1628 + }, + { + "epoch": 0.652, + "learning_rate": 1.985996037070505e-05, + "loss": 0.8625, + "step": 1630 + }, + { + "epoch": 0.6528, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.2339, + "step": 1632 + }, + { + "epoch": 0.6536, + "learning_rate": 1.9869120666582153e-05, + "loss": 1.0709, + "step": 1634 + }, + { + "epoch": 0.6544, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.0092, + "step": 1636 + }, + { + "epoch": 0.6552, + "learning_rate": 1.987797311751759e-05, + "loss": 0.1981, + "step": 1638 + }, + { + "epoch": 0.656, + "learning_rate": 1.9882283814465528e-05, + "loss": 0.9766, + "step": 1640 + }, + { + "epoch": 0.6568, + "learning_rate": 1.988651744737914e-05, + "loss": 0.0342, + "step": 1642 + }, + { + "epoch": 0.6576, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.281, + "step": 1644 + }, + { + "epoch": 0.6584, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.0015, + "step": 1646 + }, + { + "epoch": 0.6592, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.0323, + "step": 1648 + }, + { + "epoch": 0.66, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.1159, + "step": 1650 + }, + { + "epoch": 0.6608, + "learning_rate": 1.9906528516965014e-05, + "loss": 0.1071, + "step": 1652 + }, + { + "epoch": 0.6616, + "learning_rate": 1.9910299093414926e-05, + "loss": 0.075, + "step": 1654 + }, + { + "epoch": 0.6624, + "learning_rate": 1.9913992387361744e-05, + "loss": 0.0621, + "step": 1656 + }, + { + "epoch": 0.6632, + "learning_rate": 1.9917608370004414e-05, + "loss": 0.1992, + "step": 1658 + }, + { + "epoch": 0.664, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.2858, + "step": 1660 + }, + { + "epoch": 0.6648, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.1557, + "step": 1662 + }, + { + "epoch": 0.6656, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.0335, + "step": 1664 + }, + { + "epoch": 0.6664, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.0092, + "step": 1666 + }, + { + "epoch": 0.6672, + "learning_rate": 1.9934527647833276e-05, + "loss": 0.156, + "step": 1668 + }, + { + "epoch": 0.668, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.0476, + "step": 1670 + }, + { + "epoch": 0.6688, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.3844, + "step": 1672 + }, + { + "epoch": 0.6696, + "learning_rate": 1.994374976712348e-05, + "loss": 0.2621, + "step": 1674 + }, + { + "epoch": 0.6704, + "learning_rate": 1.994666875152874e-05, + "loss": 0.0392, + "step": 1676 + }, + { + "epoch": 0.6712, + "learning_rate": 1.9949510169813003e-05, + "loss": 0.0927, + "step": 1678 + }, + { + "epoch": 0.672, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.0885, + "step": 1680 + }, + { + "epoch": 0.6728, + "learning_rate": 1.995496021999177e-05, + "loss": 0.1506, + "step": 1682 + }, + { + "epoch": 0.6736, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.1026, + "step": 1684 + }, + { + "epoch": 0.6744, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.0018, + "step": 1686 + }, + { + "epoch": 0.6752, + "learning_rate": 1.996255301507125e-05, + "loss": 0.1765, + "step": 1688 + }, + { + "epoch": 0.676, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.2224, + "step": 1690 + }, + { + "epoch": 0.6768, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.1665, + "step": 1692 + }, + { + "epoch": 0.6776, + "learning_rate": 1.996944660387867e-05, + "loss": 0.3126, + "step": 1694 + }, + { + "epoch": 0.6784, + "learning_rate": 1.997158900260614e-05, + "loss": 0.1639, + "step": 1696 + }, + { + "epoch": 0.6792, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.0112, + "step": 1698 + }, + { + "epoch": 0.68, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.4507, + "step": 1700 + }, + { + "epoch": 0.6808, + "learning_rate": 1.997754957226847e-05, + "loss": 0.2233, + "step": 1702 + }, + { + "epoch": 0.6816, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.0827, + "step": 1704 + }, + { + "epoch": 0.6824, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.6995, + "step": 1706 + }, + { + "epoch": 0.6832, + "learning_rate": 1.998280988314872e-05, + "loss": 0.5066, + "step": 1708 + }, + { + "epoch": 0.684, + "learning_rate": 1.998440764181981e-05, + "loss": 0.2124, + "step": 1710 + }, + { + "epoch": 0.6848, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.2997, + "step": 1712 + }, + { + "epoch": 0.6856, + "learning_rate": 1.998736956606018e-05, + "loss": 0.0166, + "step": 1714 + }, + { + "epoch": 0.6864, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.046, + "step": 1716 + }, + { + "epoch": 0.6872, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.0562, + "step": 1718 + }, + { + "epoch": 0.688, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.027, + "step": 1720 + }, + { + "epoch": 0.6888, + "learning_rate": 1.999235873152047e-05, + "loss": 0.2357, + "step": 1722 + }, + { + "epoch": 0.6896, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.014, + "step": 1724 + }, + { + "epoch": 0.6904, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.0197, + "step": 1726 + }, + { + "epoch": 0.6912, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.152, + "step": 1728 + }, + { + "epoch": 0.692, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.0098, + "step": 1730 + }, + { + "epoch": 0.6928, + "learning_rate": 1.9996841892833e-05, + "loss": 1.3518, + "step": 1732 + }, + { + "epoch": 0.6936, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.0193, + "step": 1734 + }, + { + "epoch": 0.6944, + "learning_rate": 1.999808950037968e-05, + "loss": 0.0004, + "step": 1736 + }, + { + "epoch": 0.6952, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.0048, + "step": 1738 + }, + { + "epoch": 0.696, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.0518, + "step": 1740 + }, + { + "epoch": 0.6968, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.2947, + "step": 1742 + }, + { + "epoch": 0.6976, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.0035, + "step": 1744 + }, + { + "epoch": 0.6984, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.0099, + "step": 1746 + }, + { + "epoch": 0.6992, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.0088, + "step": 1748 + }, + { + "epoch": 0.7, + "learning_rate": 2e-05, + "loss": 0.1937, + "step": 1750 + }, + { + "epoch": 0.7008, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.0005, + "step": 1752 + }, + { + "epoch": 0.7016, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.4619, + "step": 1754 + }, + { + "epoch": 0.7024, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.4092, + "step": 1756 + }, + { + "epoch": 0.7032, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.0009, + "step": 1758 + }, + { + "epoch": 0.704, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.0035, + "step": 1760 + }, + { + "epoch": 0.7048, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.0336, + "step": 1762 + }, + { + "epoch": 0.7056, + "learning_rate": 1.999808950037968e-05, + "loss": 0.0058, + "step": 1764 + }, + { + "epoch": 0.7064, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.0279, + "step": 1766 + }, + { + "epoch": 0.7072, + "learning_rate": 1.9996841892833e-05, + "loss": 0.0206, + "step": 1768 + }, + { + "epoch": 0.708, + "learning_rate": 1.9996101150403547e-05, + "loss": 0.0101, + "step": 1770 + }, + { + "epoch": 0.7088, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.0295, + "step": 1772 + }, + { + "epoch": 0.7096, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.1145, + "step": 1774 + }, + { + "epoch": 0.7104, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.2297, + "step": 1776 + }, + { + "epoch": 0.7112, + "learning_rate": 1.999235873152047e-05, + "loss": 0.0011, + "step": 1778 + }, + { + "epoch": 0.712, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.1393, + "step": 1780 + }, + { + "epoch": 0.7128, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.0496, + "step": 1782 + }, + { + "epoch": 0.7136, + "learning_rate": 1.9988733708531772e-05, + "loss": 1.0144, + "step": 1784 + }, + { + "epoch": 0.7144, + "learning_rate": 1.998736956606018e-05, + "loss": 0.2366, + "step": 1786 + }, + { + "epoch": 0.7152, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.006, + "step": 1788 + }, + { + "epoch": 0.716, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.0066, + "step": 1790 + }, + { + "epoch": 0.7168, + "learning_rate": 1.998280988314872e-05, + "loss": 0.0095, + "step": 1792 + }, + { + "epoch": 0.7176, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.1085, + "step": 1794 + }, + { + "epoch": 0.7184, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.8131, + "step": 1796 + }, + { + "epoch": 0.7192, + "learning_rate": 1.9977549572268467e-05, + "loss": 0.3071, + "step": 1798 + }, + { + "epoch": 0.72, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.3299, + "step": 1800 + }, + { + "epoch": 0.7208, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.0056, + "step": 1802 + }, + { + "epoch": 0.7216, + "learning_rate": 1.997158900260614e-05, + "loss": 0.0002, + "step": 1804 + }, + { + "epoch": 0.7224, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.0045, + "step": 1806 + }, + { + "epoch": 0.7232, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.3511, + "step": 1808 + }, + { + "epoch": 0.724, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.0012, + "step": 1810 + }, + { + "epoch": 0.7248, + "learning_rate": 1.996255301507125e-05, + "loss": 0.9138, + "step": 1812 + }, + { + "epoch": 0.7256, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.2838, + "step": 1814 + }, + { + "epoch": 0.7264, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.2258, + "step": 1816 + }, + { + "epoch": 0.7272, + "learning_rate": 1.995496021999177e-05, + "loss": 0.1015, + "step": 1818 + }, + { + "epoch": 0.728, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.0055, + "step": 1820 + }, + { + "epoch": 0.7288, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.0226, + "step": 1822 + }, + { + "epoch": 0.7296, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.0011, + "step": 1824 + }, + { + "epoch": 0.7304, + "learning_rate": 1.994374976712348e-05, + "loss": 0.5546, + "step": 1826 + }, + { + "epoch": 0.7312, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.0011, + "step": 1828 + }, + { + "epoch": 0.732, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.1032, + "step": 1830 + }, + { + "epoch": 0.7328, + "learning_rate": 1.993452764783328e-05, + "loss": 0.6099, + "step": 1832 + }, + { + "epoch": 0.7336, + "learning_rate": 1.9931298632618352e-05, + "loss": 0.0191, + "step": 1834 + }, + { + "epoch": 0.7344, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.0023, + "step": 1836 + }, + { + "epoch": 0.7352, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.0343, + "step": 1838 + }, + { + "epoch": 0.736, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.042, + "step": 1840 + }, + { + "epoch": 0.7368, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.0005, + "step": 1842 + }, + { + "epoch": 0.7376, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.4032, + "step": 1844 + }, + { + "epoch": 0.7384, + "learning_rate": 1.9910299093414932e-05, + "loss": 0.0069, + "step": 1846 + }, + { + "epoch": 0.7392, + "learning_rate": 1.990652851696501e-05, + "loss": 0.0313, + "step": 1848 + }, + { + "epoch": 0.74, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.2773, + "step": 1850 + }, + { + "epoch": 0.7408, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.2324, + "step": 1852 + }, + { + "epoch": 0.7416, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.3867, + "step": 1854 + }, + { + "epoch": 0.7424, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.7596, + "step": 1856 + }, + { + "epoch": 0.7432, + "learning_rate": 1.9886517447379143e-05, + "loss": 0.2482, + "step": 1858 + }, + { + "epoch": 0.744, + "learning_rate": 1.988228381446553e-05, + "loss": 0.5124, + "step": 1860 + }, + { + "epoch": 0.7448, + "learning_rate": 1.987797311751759e-05, + "loss": 0.0666, + "step": 1862 + }, + { + "epoch": 0.7456, + "learning_rate": 1.9873585390151007e-05, + "loss": 0.0427, + "step": 1864 + }, + { + "epoch": 0.7464, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.047, + "step": 1866 + }, + { + "epoch": 0.7472, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.4023, + "step": 1868 + }, + { + "epoch": 0.748, + "learning_rate": 1.985996037070505e-05, + "loss": 0.3911, + "step": 1870 + }, + { + "epoch": 0.7488, + "learning_rate": 1.985526486983063e-05, + "loss": 0.0549, + "step": 1872 + }, + { + "epoch": 0.7496, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.0598, + "step": 1874 + }, + { + "epoch": 0.7504, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.2436, + "step": 1876 + }, + { + "epoch": 0.7512, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.1072, + "step": 1878 + }, + { + "epoch": 0.752, + "learning_rate": 1.983571470813386e-05, + "loss": 0.3193, + "step": 1880 + }, + { + "epoch": 0.7528, + "learning_rate": 1.983063531873016e-05, + "loss": 0.0208, + "step": 1882 + }, + { + "epoch": 0.7536, + "learning_rate": 1.982547926805747e-05, + "loss": 0.0187, + "step": 1884 + }, + { + "epoch": 0.7544, + "learning_rate": 1.9820246596323724e-05, + "loss": 0.0996, + "step": 1886 + }, + { + "epoch": 0.7552, + "learning_rate": 1.981493734433433e-05, + "loss": 0.0165, + "step": 1888 + }, + { + "epoch": 0.756, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.0119, + "step": 1890 + }, + { + "epoch": 0.7568, + "learning_rate": 1.9804089265795963e-05, + "loss": 0.0359, + "step": 1892 + }, + { + "epoch": 0.7576, + "learning_rate": 1.979855052384247e-05, + "loss": 0.136, + "step": 1894 + }, + { + "epoch": 0.7584, + "learning_rate": 1.979293537082368e-05, + "loss": 0.4746, + "step": 1896 + }, + { + "epoch": 0.7592, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.2797, + "step": 1898 + }, + { + "epoch": 0.76, + "learning_rate": 1.978147600733806e-05, + "loss": 0.5301, + "step": 1900 + }, + { + "epoch": 0.7608, + "learning_rate": 1.977563188623365e-05, + "loss": 0.2177, + "step": 1902 + }, + { + "epoch": 0.7616, + "learning_rate": 1.9769711532788086e-05, + "loss": 0.4217, + "step": 1904 + }, + { + "epoch": 0.7624, + "learning_rate": 1.9763714993169448e-05, + "loss": 0.1825, + "step": 1906 + }, + { + "epoch": 0.7632, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.0007, + "step": 1908 + }, + { + "epoch": 0.764, + "learning_rate": 1.9751493543055638e-05, + "loss": 0.4687, + "step": 1910 + }, + { + "epoch": 0.7648, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.2132, + "step": 1912 + }, + { + "epoch": 0.7656, + "learning_rate": 1.973896791711276e-05, + "loss": 0.2216, + "step": 1914 + }, + { + "epoch": 0.7664, + "learning_rate": 1.9732591159931567e-05, + "loss": 0.0286, + "step": 1916 + }, + { + "epoch": 0.7672, + "learning_rate": 1.972613850604944e-05, + "loss": 0.0596, + "step": 1918 + }, + { + "epoch": 0.768, + "learning_rate": 1.9719610005785463e-05, + "loss": 0.5919, + "step": 1920 + }, + { + "epoch": 0.7688, + "learning_rate": 1.9713005710050206e-05, + "loss": 0.1655, + "step": 1922 + }, + { + "epoch": 0.7696, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.0007, + "step": 1924 + }, + { + "epoch": 0.7704, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.3033, + "step": 1926 + }, + { + "epoch": 0.7712, + "learning_rate": 1.969273856798586e-05, + "loss": 0.0566, + "step": 1928 + }, + { + "epoch": 0.772, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.239, + "step": 1930 + }, + { + "epoch": 0.7728, + "learning_rate": 1.9678849122526195e-05, + "loss": 0.0011, + "step": 1932 + }, + { + "epoch": 0.7736, + "learning_rate": 1.967179115615633e-05, + "loss": 0.0195, + "step": 1934 + }, + { + "epoch": 0.7744, + "learning_rate": 1.966465776721618e-05, + "loss": 0.015, + "step": 1936 + }, + { + "epoch": 0.7752, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.0023, + "step": 1938 + }, + { + "epoch": 0.776, + "learning_rate": 1.965016494472312e-05, + "loss": 0.0654, + "step": 1940 + }, + { + "epoch": 0.7768, + "learning_rate": 1.964280562418815e-05, + "loss": 0.1434, + "step": 1942 + }, + { + "epoch": 0.7776, + "learning_rate": 1.963537110711789e-05, + "loss": 1.0365, + "step": 1944 + }, + { + "epoch": 0.7784, + "learning_rate": 1.9627861451488194e-05, + "loss": 0.8124, + "step": 1946 + }, + { + "epoch": 0.7792, + "learning_rate": 1.962027671586086e-05, + "loss": 0.0422, + "step": 1948 + }, + { + "epoch": 0.78, + "learning_rate": 1.9612616959383194e-05, + "loss": 0.0008, + "step": 1950 + }, + { + "epoch": 0.7808, + "learning_rate": 1.96048822417875e-05, + "loss": 0.173, + "step": 1952 + }, + { + "epoch": 0.7816, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.1296, + "step": 1954 + }, + { + "epoch": 0.7824, + "learning_rate": 1.9589188165093666e-05, + "loss": 0.1823, + "step": 1956 + }, + { + "epoch": 0.7832, + "learning_rate": 1.95812289283811e-05, + "loss": 0.0003, + "step": 1958 + }, + { + "epoch": 0.784, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.0018, + "step": 1960 + }, + { + "epoch": 0.7848, + "learning_rate": 1.9565086368562784e-05, + "loss": 0.0867, + "step": 1962 + }, + { + "epoch": 0.7856, + "learning_rate": 1.9556903171339966e-05, + "loss": 0.3306, + "step": 1964 + }, + { + "epoch": 0.7864, + "learning_rate": 1.954864544746643e-05, + "loss": 0.0112, + "step": 1966 + }, + { + "epoch": 0.7872, + "learning_rate": 1.9540313261337585e-05, + "loss": 0.0345, + "step": 1968 + }, + { + "epoch": 0.788, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.0055, + "step": 1970 + }, + { + "epoch": 0.7888, + "learning_rate": 1.9523425762798335e-05, + "loss": 0.0036, + "step": 1972 + }, + { + "epoch": 0.7896, + "learning_rate": 1.9514870582080035e-05, + "loss": 0.3699, + "step": 1974 + }, + { + "epoch": 0.7904, + "learning_rate": 1.95062412024896e-05, + "loss": 0.1556, + "step": 1976 + }, + { + "epoch": 0.7912, + "learning_rate": 1.9497537691320667e-05, + "loss": 0.0106, + "step": 1978 + }, + { + "epoch": 0.792, + "learning_rate": 1.948876011644497e-05, + "loss": 0.0048, + "step": 1980 + }, + { + "epoch": 0.7928, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.0592, + "step": 1982 + }, + { + "epoch": 0.7936, + "learning_rate": 1.9470983049947443e-05, + "loss": 0.1533, + "step": 1984 + }, + { + "epoch": 0.7944, + "learning_rate": 1.9461983696954767e-05, + "loss": 0.2193, + "step": 1986 + }, + { + "epoch": 0.7952, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.2109, + "step": 1988 + }, + { + "epoch": 0.796, + "learning_rate": 1.9443763702374818e-05, + "loss": 0.0004, + "step": 1990 + }, + { + "epoch": 0.7968, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.0382, + "step": 1992 + }, + { + "epoch": 0.7976, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.025, + "step": 1994 + }, + { + "epoch": 0.7984, + "learning_rate": 1.94158815589503e-05, + "loss": 0.003, + "step": 1996 + }, + { + "epoch": 0.7992, + "learning_rate": 1.940644056006122e-05, + "loss": 0.1854, + "step": 1998 + }, + { + "epoch": 0.8, + "learning_rate": 1.939692620785909e-05, + "loss": 0.0389, + "step": 2000 + }, + { + "epoch": 0.8008, + "learning_rate": 1.9387338576538746e-05, + "loss": 0.097, + "step": 2002 + }, + { + "epoch": 0.8016, + "learning_rate": 1.9377677740866464e-05, + "loss": 0.1928, + "step": 2004 + }, + { + "epoch": 0.8024, + "learning_rate": 1.936794377617938e-05, + "loss": 0.0015, + "step": 2006 + }, + { + "epoch": 0.8032, + "learning_rate": 1.9358136758384917e-05, + "loss": 0.0348, + "step": 2008 + }, + { + "epoch": 0.804, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.0114, + "step": 2010 + }, + { + "epoch": 0.8048, + "learning_rate": 1.9338303869951273e-05, + "loss": 0.2367, + "step": 2012 + }, + { + "epoch": 0.8056, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.3682, + "step": 2014 + }, + { + "epoch": 0.8064, + "learning_rate": 1.931817969420773e-05, + "loss": 0.4326, + "step": 2016 + }, + { + "epoch": 0.8072, + "learning_rate": 1.930800856940543e-05, + "loss": 0.1862, + "step": 2018 + }, + { + "epoch": 0.808, + "learning_rate": 1.929776485888252e-05, + "loss": 0.0002, + "step": 2020 + }, + { + "epoch": 0.8088, + "learning_rate": 1.9287448642521517e-05, + "loss": 0.0032, + "step": 2022 + }, + { + "epoch": 0.8096, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.2913, + "step": 2024 + }, + { + "epoch": 0.8104, + "learning_rate": 1.9266599014641727e-05, + "loss": 0.5676, + "step": 2026 + }, + { + "epoch": 0.8112, + "learning_rate": 1.925606576571252e-05, + "loss": 0.001, + "step": 2028 + }, + { + "epoch": 0.812, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.1546, + "step": 2030 + }, + { + "epoch": 0.8128, + "learning_rate": 1.923478280857682e-05, + "loss": 0.0787, + "step": 2032 + }, + { + "epoch": 0.8136, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.6456, + "step": 2034 + }, + { + "epoch": 0.8144, + "learning_rate": 1.9213211793237066e-05, + "loss": 0.0648, + "step": 2036 + }, + { + "epoch": 0.8152, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.0004, + "step": 2038 + }, + { + "epoch": 0.816, + "learning_rate": 1.919135339255235e-05, + "loss": 0.1354, + "step": 2040 + }, + { + "epoch": 0.8168, + "learning_rate": 1.918031663542588e-05, + "loss": 0.0026, + "step": 2042 + }, + { + "epoch": 0.8176, + "learning_rate": 1.916920828834617e-05, + "loss": 0.011, + "step": 2044 + }, + { + "epoch": 0.8184, + "learning_rate": 1.9158028437938313e-05, + "loss": 0.4277, + "step": 2046 + }, + { + "epoch": 0.8192, + "learning_rate": 1.9146777171385057e-05, + "loss": 0.6154, + "step": 2048 + }, + { + "epoch": 0.82, + "learning_rate": 1.913545457642601e-05, + "loss": 0.1601, + "step": 2050 + }, + { + "epoch": 0.8208, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.0237, + "step": 2052 + }, + { + "epoch": 0.8216, + "learning_rate": 1.911259575502963e-05, + "loss": 0.002, + "step": 2054 + }, + { + "epoch": 0.8224, + "learning_rate": 1.910105970684996e-05, + "loss": 0.246, + "step": 2056 + }, + { + "epoch": 0.8232, + "learning_rate": 1.908945268677849e-05, + "loss": 0.2791, + "step": 2058 + }, + { + "epoch": 0.824, + "learning_rate": 1.9077774785329085e-05, + "loss": 1.3502, + "step": 2060 + }, + { + "epoch": 0.8248, + "learning_rate": 1.9066026093568383e-05, + "loss": 0.0341, + "step": 2062 + }, + { + "epoch": 0.8256, + "learning_rate": 1.9054206703115013e-05, + "loss": 0.0861, + "step": 2064 + }, + { + "epoch": 0.8264, + "learning_rate": 1.9042316706138994e-05, + "loss": 0.1096, + "step": 2066 + }, + { + "epoch": 0.8272, + "learning_rate": 1.903035619536087e-05, + "loss": 0.1337, + "step": 2068 + }, + { + "epoch": 0.828, + "learning_rate": 1.901832526405114e-05, + "loss": 0.0166, + "step": 2070 + }, + { + "epoch": 0.8288, + "learning_rate": 1.9006224006029414e-05, + "loss": 0.3144, + "step": 2072 + }, + { + "epoch": 0.8296, + "learning_rate": 1.899405251566371e-05, + "loss": 0.0028, + "step": 2074 + }, + { + "epoch": 0.8304, + "learning_rate": 1.8981810887869797e-05, + "loss": 0.0524, + "step": 2076 + }, + { + "epoch": 0.8312, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.4555, + "step": 2078 + }, + { + "epoch": 0.832, + "learning_rate": 1.8957117602394133e-05, + "loss": 0.1062, + "step": 2080 + }, + { + "epoch": 0.8328, + "learning_rate": 1.8944666137275596e-05, + "loss": 0.0138, + "step": 2082 + }, + { + "epoch": 0.8336, + "learning_rate": 1.8932144919853744e-05, + "loss": 0.063, + "step": 2084 + }, + { + "epoch": 0.8344, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.2633, + "step": 2086 + }, + { + "epoch": 0.8352, + "learning_rate": 1.890689361921507e-05, + "loss": 0.4054, + "step": 2088 + }, + { + "epoch": 0.836, + "learning_rate": 1.8894163732912986e-05, + "loss": 0.1788, + "step": 2090 + }, + { + "epoch": 0.8368, + "learning_rate": 1.8881364488135445e-05, + "loss": 0.5077, + "step": 2092 + }, + { + "epoch": 0.8376, + "learning_rate": 1.886849598469357e-05, + "loss": 0.0706, + "step": 2094 + }, + { + "epoch": 0.8384, + "learning_rate": 1.8855558322938492e-05, + "loss": 0.0191, + "step": 2096 + }, + { + "epoch": 0.8392, + "learning_rate": 1.8842551603760725e-05, + "loss": 1.2281, + "step": 2098 + }, + { + "epoch": 0.84, + "learning_rate": 1.8829475928589265e-05, + "loss": 0.0038, + "step": 2100 + }, + { + "epoch": 0.8408, + "learning_rate": 1.8816331399390874e-05, + "loss": 0.2732, + "step": 2102 + }, + { + "epoch": 0.8416, + "learning_rate": 1.88031181186692e-05, + "loss": 0.6364, + "step": 2104 + }, + { + "epoch": 0.8424, + "learning_rate": 1.8789836189464092e-05, + "loss": 0.1305, + "step": 2106 + }, + { + "epoch": 0.8432, + "learning_rate": 1.877648571535068e-05, + "loss": 0.5577, + "step": 2108 + }, + { + "epoch": 0.844, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.6116, + "step": 2110 + }, + { + "epoch": 0.8448, + "learning_rate": 1.8749579549371387e-05, + "loss": 0.457, + "step": 2112 + }, + { + "epoch": 0.8456, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.1788, + "step": 2114 + }, + { + "epoch": 0.8464, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.2554, + "step": 2116 + }, + { + "epoch": 0.8472, + "learning_rate": 1.8708708833660748e-05, + "loss": 0.1934, + "step": 2118 + }, + { + "epoch": 0.848, + "learning_rate": 1.8694949295052198e-05, + "loss": 0.219, + "step": 2120 + }, + { + "epoch": 0.8488, + "learning_rate": 1.868112195148239e-05, + "loss": 0.0089, + "step": 2122 + }, + { + "epoch": 0.8496, + "learning_rate": 1.866722691077977e-05, + "loss": 0.0236, + "step": 2124 + }, + { + "epoch": 0.8504, + "learning_rate": 1.8653264281300626e-05, + "loss": 0.0693, + "step": 2126 + }, + { + "epoch": 0.8512, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.8022, + "step": 2128 + }, + { + "epoch": 0.852, + "learning_rate": 1.8625136692072587e-05, + "loss": 0.0728, + "step": 2130 + }, + { + "epoch": 0.8528, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.2554, + "step": 2132 + }, + { + "epoch": 0.8536, + "learning_rate": 1.8596740061174912e-05, + "loss": 0.3669, + "step": 2134 + }, + { + "epoch": 0.8544, + "learning_rate": 1.858244113157566e-05, + "loss": 1.1068, + "step": 2136 + }, + { + "epoch": 0.8552, + "learning_rate": 1.8568075274376432e-05, + "loss": 0.1459, + "step": 2138 + }, + { + "epoch": 0.856, + "learning_rate": 1.8553642601605083e-05, + "loss": 0.0615, + "step": 2140 + }, + { + "epoch": 0.8568, + "learning_rate": 1.8539143225810457e-05, + "loss": 0.8236, + "step": 2142 + }, + { + "epoch": 0.8576, + "learning_rate": 1.852457726006163e-05, + "loss": 0.0404, + "step": 2144 + }, + { + "epoch": 0.8584, + "learning_rate": 1.8509944817946917e-05, + "loss": 0.5578, + "step": 2146 + }, + { + "epoch": 0.8592, + "learning_rate": 1.8495246013573064e-05, + "loss": 0.0088, + "step": 2148 + }, + { + "epoch": 0.86, + "learning_rate": 1.848048096156426e-05, + "loss": 0.1723, + "step": 2150 + }, + { + "epoch": 0.8608, + "learning_rate": 1.8465649777061387e-05, + "loss": 0.6638, + "step": 2152 + }, + { + "epoch": 0.8616, + "learning_rate": 1.8450752575720967e-05, + "loss": 0.2702, + "step": 2154 + }, + { + "epoch": 0.8624, + "learning_rate": 1.843578947371439e-05, + "loss": 0.4124, + "step": 2156 + }, + { + "epoch": 0.8632, + "learning_rate": 1.8420760587726935e-05, + "loss": 0.4226, + "step": 2158 + }, + { + "epoch": 0.864, + "learning_rate": 1.8405666034956846e-05, + "loss": 0.3271, + "step": 2160 + }, + { + "epoch": 0.8648, + "learning_rate": 1.8390505933114507e-05, + "loss": 0.1098, + "step": 2162 + }, + { + "epoch": 0.8656, + "learning_rate": 1.8375280400421414e-05, + "loss": 0.2337, + "step": 2164 + }, + { + "epoch": 0.8664, + "learning_rate": 1.8359989555609365e-05, + "loss": 0.2498, + "step": 2166 + }, + { + "epoch": 0.8672, + "learning_rate": 1.834463351791939e-05, + "loss": 0.8041, + "step": 2168 + }, + { + "epoch": 0.868, + "learning_rate": 1.8329212407101006e-05, + "loss": 0.4356, + "step": 2170 + }, + { + "epoch": 0.8688, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.4451, + "step": 2172 + }, + { + "epoch": 0.8696, + "learning_rate": 1.82981754476131e-05, + "loss": 0.2204, + "step": 2174 + }, + { + "epoch": 0.8704, + "learning_rate": 1.8282559840976053e-05, + "loss": 0.1402, + "step": 2176 + }, + { + "epoch": 0.8712, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.5473, + "step": 2178 + }, + { + "epoch": 0.872, + "learning_rate": 1.8251134982782966e-05, + "loss": 0.4845, + "step": 2180 + }, + { + "epoch": 0.8728, + "learning_rate": 1.823532597628428e-05, + "loss": 0.1257, + "step": 2182 + }, + { + "epoch": 0.8736, + "learning_rate": 1.8219452749059336e-05, + "loss": 0.0113, + "step": 2184 + }, + { + "epoch": 0.8744, + "learning_rate": 1.8203515424890734e-05, + "loss": 0.1242, + "step": 2186 + }, + { + "epoch": 0.8752, + "learning_rate": 1.8187514128060956e-05, + "loss": 0.2635, + "step": 2188 + }, + { + "epoch": 0.876, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.5261, + "step": 2190 + }, + { + "epoch": 0.8768, + "learning_rate": 1.8155320116040983e-05, + "loss": 0.1165, + "step": 2192 + }, + { + "epoch": 0.8776, + "learning_rate": 1.8139127651906193e-05, + "loss": 0.4289, + "step": 2194 + }, + { + "epoch": 0.8784, + "learning_rate": 1.8122871717218974e-05, + "loss": 0.0207, + "step": 2196 + }, + { + "epoch": 0.8792, + "learning_rate": 1.8106552438746413e-05, + "loss": 0.3237, + "step": 2198 + }, + { + "epoch": 0.88, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.1726, + "step": 2200 + }, + { + "epoch": 0.8808, + "learning_rate": 1.807372435998219e-05, + "loss": 0.0123, + "step": 2202 + }, + { + "epoch": 0.8816, + "learning_rate": 1.8057215815690487e-05, + "loss": 0.3219, + "step": 2204 + }, + { + "epoch": 0.8824, + "learning_rate": 1.8040644439611355e-05, + "loss": 0.3569, + "step": 2206 + }, + { + "epoch": 0.8832, + "learning_rate": 1.8024010360971665e-05, + "loss": 0.2932, + "step": 2208 + }, + { + "epoch": 0.884, + "learning_rate": 1.8007313709487345e-05, + "loss": 0.203, + "step": 2210 + }, + { + "epoch": 0.8848, + "learning_rate": 1.7990554615362207e-05, + "loss": 0.1107, + "step": 2212 + }, + { + "epoch": 0.8856, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.1403, + "step": 2214 + }, + { + "epoch": 0.8864, + "learning_rate": 1.7956849622438568e-05, + "loss": 0.3395, + "step": 2216 + }, + { + "epoch": 0.8872, + "learning_rate": 1.7939903986478357e-05, + "loss": 0.2361, + "step": 2218 + }, + { + "epoch": 0.888, + "learning_rate": 1.7922896433551913e-05, + "loss": 0.603, + "step": 2220 + }, + { + "epoch": 0.8888, + "learning_rate": 1.7905827096287525e-05, + "loss": 0.2352, + "step": 2222 + }, + { + "epoch": 0.8896, + "learning_rate": 1.7888696107795347e-05, + "loss": 0.0004, + "step": 2224 + }, + { + "epoch": 0.8904, + "learning_rate": 1.787150360166623e-05, + "loss": 0.351, + "step": 2226 + }, + { + "epoch": 0.8912, + "learning_rate": 1.7854249711970826e-05, + "loss": 0.0325, + "step": 2228 + }, + { + "epoch": 0.892, + "learning_rate": 1.783693457325841e-05, + "loss": 0.041, + "step": 2230 + }, + { + "epoch": 0.8928, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.2836, + "step": 2232 + }, + { + "epoch": 0.8936, + "learning_rate": 1.780212108936685e-05, + "loss": 0.3255, + "step": 2234 + }, + { + "epoch": 0.8944, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.5664, + "step": 2236 + }, + { + "epoch": 0.8952, + "learning_rate": 1.7767064235919594e-05, + "loss": 0.2831, + "step": 2238 + }, + { + "epoch": 0.896, + "learning_rate": 1.77494448870418e-05, + "loss": 0.1316, + "step": 2240 + }, + { + "epoch": 0.8968, + "learning_rate": 1.773176510643608e-05, + "loss": 0.0998, + "step": 2242 + }, + { + "epoch": 0.8976, + "learning_rate": 1.7714025031972894e-05, + "loss": 0.5071, + "step": 2244 + }, + { + "epoch": 0.8984, + "learning_rate": 1.769622480199295e-05, + "loss": 0.217, + "step": 2246 + }, + { + "epoch": 0.8992, + "learning_rate": 1.7678364555305982e-05, + "loss": 0.7657, + "step": 2248 + }, + { + "epoch": 0.9, + "learning_rate": 1.7660444431189777e-05, + "loss": 0.4618, + "step": 2250 + }, + { + "epoch": 0.9008, + "learning_rate": 1.76424645693891e-05, + "loss": 0.3323, + "step": 2252 + }, + { + "epoch": 0.9016, + "learning_rate": 1.762442511011448e-05, + "loss": 0.4413, + "step": 2254 + }, + { + "epoch": 0.9024, + "learning_rate": 1.7606326194041285e-05, + "loss": 0.5413, + "step": 2256 + }, + { + "epoch": 0.9032, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.0173, + "step": 2258 + }, + { + "epoch": 0.904, + "learning_rate": 1.756995055651757e-05, + "loss": 0.1142, + "step": 2260 + }, + { + "epoch": 0.9048, + "learning_rate": 1.7551674118731585e-05, + "loss": 0.4094, + "step": 2262 + }, + { + "epoch": 0.9056, + "learning_rate": 1.7533338791473875e-05, + "loss": 0.1809, + "step": 2264 + }, + { + "epoch": 0.9064, + "learning_rate": 1.751494471772697e-05, + "loss": 0.3388, + "step": 2266 + }, + { + "epoch": 0.9072, + "learning_rate": 1.7496492040931548e-05, + "loss": 0.0996, + "step": 2268 + }, + { + "epoch": 0.908, + "learning_rate": 1.747798090498533e-05, + "loss": 0.5855, + "step": 2270 + }, + { + "epoch": 0.9088, + "learning_rate": 1.745941145424182e-05, + "loss": 0.1496, + "step": 2272 + }, + { + "epoch": 0.9096, + "learning_rate": 1.744078383350938e-05, + "loss": 0.1552, + "step": 2274 + }, + { + "epoch": 0.9104, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.513, + "step": 2276 + }, + { + "epoch": 0.9112, + "learning_rate": 1.7403354663577782e-05, + "loss": 0.2505, + "step": 2278 + }, + { + "epoch": 0.912, + "learning_rate": 1.738455340625883e-05, + "loss": 0.2804, + "step": 2280 + }, + { + "epoch": 0.9128, + "learning_rate": 1.7365694562709038e-05, + "loss": 0.1311, + "step": 2282 + }, + { + "epoch": 0.9136, + "learning_rate": 1.7346778279993433e-05, + "loss": 0.074, + "step": 2284 + }, + { + "epoch": 0.9144, + "learning_rate": 1.7327804705624962e-05, + "loss": 0.4993, + "step": 2286 + }, + { + "epoch": 0.9152, + "learning_rate": 1.730877398756341e-05, + "loss": 1.266, + "step": 2288 + }, + { + "epoch": 0.916, + "learning_rate": 1.7289686274214113e-05, + "loss": 0.0439, + "step": 2290 + }, + { + "epoch": 0.9168, + "learning_rate": 1.727054171442693e-05, + "loss": 0.067, + "step": 2292 + }, + { + "epoch": 0.9176, + "learning_rate": 1.7251340457494934e-05, + "loss": 0.1818, + "step": 2294 + }, + { + "epoch": 0.9184, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.3245, + "step": 2296 + }, + { + "epoch": 0.9192, + "learning_rate": 1.7212768451578595e-05, + "loss": 0.4171, + "step": 2298 + }, + { + "epoch": 0.92, + "learning_rate": 1.7193398003386517e-05, + "loss": 0.4119, + "step": 2300 + }, + { + "epoch": 0.9208, + "learning_rate": 1.7173971459631803e-05, + "loss": 0.0094, + "step": 2302 + }, + { + "epoch": 0.9216, + "learning_rate": 1.7154488971806525e-05, + "loss": 0.3696, + "step": 2304 + }, + { + "epoch": 0.9224, + "learning_rate": 1.713495069183907e-05, + "loss": 0.3082, + "step": 2306 + }, + { + "epoch": 0.9232, + "learning_rate": 1.7115356772092847e-05, + "loss": 0.2117, + "step": 2308 + }, + { + "epoch": 0.924, + "learning_rate": 1.709570736536522e-05, + "loss": 0.1614, + "step": 2310 + }, + { + "epoch": 0.9248, + "learning_rate": 1.7076002624886152e-05, + "loss": 0.1564, + "step": 2312 + }, + { + "epoch": 0.9256, + "learning_rate": 1.705624270431722e-05, + "loss": 0.0077, + "step": 2314 + }, + { + "epoch": 0.9264, + "learning_rate": 1.70364277577502e-05, + "loss": 0.0193, + "step": 2316 + }, + { + "epoch": 0.9272, + "learning_rate": 1.7016557939706078e-05, + "loss": 0.0178, + "step": 2318 + }, + { + "epoch": 0.928, + "learning_rate": 1.6996633405133673e-05, + "loss": 0.0863, + "step": 2320 + }, + { + "epoch": 0.9288, + "learning_rate": 1.6976654309408468e-05, + "loss": 0.3703, + "step": 2322 + }, + { + "epoch": 0.9296, + "learning_rate": 1.6956620808331515e-05, + "loss": 0.0648, + "step": 2324 + }, + { + "epoch": 0.9304, + "learning_rate": 1.6936533058128042e-05, + "loss": 0.3048, + "step": 2326 + }, + { + "epoch": 0.9312, + "learning_rate": 1.691639121544641e-05, + "loss": 0.0018, + "step": 2328 + }, + { + "epoch": 0.932, + "learning_rate": 1.6896195437356696e-05, + "loss": 0.0041, + "step": 2330 + }, + { + "epoch": 0.9328, + "learning_rate": 1.6875945881349686e-05, + "loss": 0.164, + "step": 2332 + }, + { + "epoch": 0.9336, + "learning_rate": 1.6855642705335435e-05, + "loss": 0.9997, + "step": 2334 + }, + { + "epoch": 0.9344, + "learning_rate": 1.6835286067642228e-05, + "loss": 0.0189, + "step": 2336 + }, + { + "epoch": 0.9352, + "learning_rate": 1.681487612701521e-05, + "loss": 0.1631, + "step": 2338 + }, + { + "epoch": 0.936, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.1093, + "step": 2340 + }, + { + "epoch": 0.9368, + "learning_rate": 1.677389697401739e-05, + "loss": 0.0635, + "step": 2342 + }, + { + "epoch": 0.9376, + "learning_rate": 1.675332808121025e-05, + "loss": 0.0029, + "step": 2344 + }, + { + "epoch": 0.9384, + "learning_rate": 1.6732706524594145e-05, + "loss": 0.4952, + "step": 2346 + }, + { + "epoch": 0.9392, + "learning_rate": 1.671203246498009e-05, + "loss": 0.0013, + "step": 2348 + }, + { + "epoch": 0.94, + "learning_rate": 1.6691306063588593e-05, + "loss": 0.2344, + "step": 2350 + }, + { + "epoch": 0.9408, + "learning_rate": 1.6670527482048242e-05, + "loss": 0.0072, + "step": 2352 + }, + { + "epoch": 0.9416, + "learning_rate": 1.6649696882394635e-05, + "loss": 0.0013, + "step": 2354 + }, + { + "epoch": 0.9424, + "learning_rate": 1.6628814427068968e-05, + "loss": 0.0754, + "step": 2356 + }, + { + "epoch": 0.9432, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.0185, + "step": 2358 + }, + { + "epoch": 0.944, + "learning_rate": 1.6586894601186824e-05, + "loss": 0.202, + "step": 2360 + }, + { + "epoch": 0.9448, + "learning_rate": 1.656585755752957e-05, + "loss": 0.0053, + "step": 2362 + }, + { + "epoch": 0.9456, + "learning_rate": 1.6544769311996153e-05, + "loss": 0.0386, + "step": 2364 + }, + { + "epoch": 0.9464, + "learning_rate": 1.6523630029036924e-05, + "loss": 0.5941, + "step": 2366 + }, + { + "epoch": 0.9472, + "learning_rate": 1.6502439873500294e-05, + "loss": 0.2747, + "step": 2368 + }, + { + "epoch": 0.948, + "learning_rate": 1.6481199010631305e-05, + "loss": 0.0184, + "step": 2370 + }, + { + "epoch": 0.9488, + "learning_rate": 1.645990760607052e-05, + "loss": 0.039, + "step": 2372 + }, + { + "epoch": 0.9496, + "learning_rate": 1.643856582585255e-05, + "loss": 0.007, + "step": 2374 + }, + { + "epoch": 0.9504, + "learning_rate": 1.641717383640488e-05, + "loss": 0.0739, + "step": 2376 + }, + { + "epoch": 0.9512, + "learning_rate": 1.6395731804546596e-05, + "loss": 0.1634, + "step": 2378 + }, + { + "epoch": 0.952, + "learning_rate": 1.63742398974869e-05, + "loss": 0.5161, + "step": 2380 + }, + { + "epoch": 0.9528, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.0368, + "step": 2382 + }, + { + "epoch": 0.9536, + "learning_rate": 1.633110712854385e-05, + "loss": 0.0528, + "step": 2384 + }, + { + "epoch": 0.9544, + "learning_rate": 1.6309466603018504e-05, + "loss": 0.141, + "step": 2386 + }, + { + "epoch": 0.9552, + "learning_rate": 1.6287776875005148e-05, + "loss": 0.3671, + "step": 2388 + }, + { + "epoch": 0.956, + "learning_rate": 1.6266038113644612e-05, + "loss": 0.2521, + "step": 2390 + }, + { + "epoch": 0.9568, + "learning_rate": 1.624425048846017e-05, + "loss": 0.2974, + "step": 2392 + }, + { + "epoch": 0.9576, + "learning_rate": 1.6222414169356063e-05, + "loss": 0.0195, + "step": 2394 + }, + { + "epoch": 0.9584, + "learning_rate": 1.6200529326616343e-05, + "loss": 0.0699, + "step": 2396 + }, + { + "epoch": 0.9592, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.0295, + "step": 2398 + }, + { + "epoch": 0.96, + "learning_rate": 1.6156614753256587e-05, + "loss": 0.0698, + "step": 2400 + }, + { + "epoch": 0.9608, + "learning_rate": 1.613458536509123e-05, + "loss": 0.0058, + "step": 2402 + }, + { + "epoch": 0.9616, + "learning_rate": 1.6112508138196922e-05, + "loss": 0.1831, + "step": 2404 + }, + { + "epoch": 0.9624, + "learning_rate": 1.6090383244736277e-05, + "loss": 0.0117, + "step": 2406 + }, + { + "epoch": 0.9632, + "learning_rate": 1.606821085724363e-05, + "loss": 0.5339, + "step": 2408 + }, + { + "epoch": 0.964, + "learning_rate": 1.6045991148623756e-05, + "loss": 0.0057, + "step": 2410 + }, + { + "epoch": 0.9648, + "learning_rate": 1.602372429215038e-05, + "loss": 0.0048, + "step": 2412 + }, + { + "epoch": 0.9656, + "learning_rate": 1.600141046146497e-05, + "loss": 0.1537, + "step": 2414 + }, + { + "epoch": 0.9664, + "learning_rate": 1.597904983057519e-05, + "loss": 0.0572, + "step": 2416 + }, + { + "epoch": 0.9672, + "learning_rate": 1.5956642573853794e-05, + "loss": 0.0139, + "step": 2418 + }, + { + "epoch": 0.968, + "learning_rate": 1.5934188866037014e-05, + "loss": 0.2481, + "step": 2420 + }, + { + "epoch": 0.9688, + "learning_rate": 1.591168888222342e-05, + "loss": 0.0839, + "step": 2422 + }, + { + "epoch": 0.9696, + "learning_rate": 1.5889142797872407e-05, + "loss": 0.264, + "step": 2424 + }, + { + "epoch": 0.9704, + "learning_rate": 1.5866550788802818e-05, + "loss": 0.2364, + "step": 2426 + }, + { + "epoch": 0.9712, + "learning_rate": 1.584391303119173e-05, + "loss": 0.0917, + "step": 2428 + }, + { + "epoch": 0.972, + "learning_rate": 1.582122970157289e-05, + "loss": 0.1502, + "step": 2430 + }, + { + "epoch": 0.9728, + "learning_rate": 1.5798500976835503e-05, + "loss": 0.1476, + "step": 2432 + }, + { + "epoch": 0.9736, + "learning_rate": 1.577572703422267e-05, + "loss": 0.0048, + "step": 2434 + }, + { + "epoch": 0.9744, + "learning_rate": 1.575290805133024e-05, + "loss": 0.0017, + "step": 2436 + }, + { + "epoch": 0.9752, + "learning_rate": 1.5730044206105156e-05, + "loss": 0.219, + "step": 2438 + }, + { + "epoch": 0.976, + "learning_rate": 1.570713567684432e-05, + "loss": 0.0825, + "step": 2440 + }, + { + "epoch": 0.9768, + "learning_rate": 1.5684182642193047e-05, + "loss": 0.0898, + "step": 2442 + }, + { + "epoch": 0.9776, + "learning_rate": 1.566118528114367e-05, + "loss": 0.1797, + "step": 2444 + }, + { + "epoch": 0.9784, + "learning_rate": 1.563814377303429e-05, + "loss": 0.78, + "step": 2446 + }, + { + "epoch": 0.9792, + "learning_rate": 1.561505829754715e-05, + "loss": 0.0051, + "step": 2448 + }, + { + "epoch": 0.98, + "learning_rate": 1.5591929034707475e-05, + "loss": 0.0218, + "step": 2450 + }, + { + "epoch": 0.9808, + "learning_rate": 1.5568756164881874e-05, + "loss": 0.416, + "step": 2452 + }, + { + "epoch": 0.9816, + "learning_rate": 1.5545539868777085e-05, + "loss": 0.016, + "step": 2454 + }, + { + "epoch": 0.9824, + "learning_rate": 1.5522280327438384e-05, + "loss": 0.0064, + "step": 2456 + }, + { + "epoch": 0.9832, + "learning_rate": 1.5498977722248398e-05, + "loss": 0.3618, + "step": 2458 + }, + { + "epoch": 0.984, + "learning_rate": 1.547563223492552e-05, + "loss": 0.2367, + "step": 2460 + }, + { + "epoch": 0.9848, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.0024, + "step": 2462 + }, + { + "epoch": 0.9856, + "learning_rate": 1.5428813342425194e-05, + "loss": 0.0359, + "step": 2464 + }, + { + "epoch": 0.9864, + "learning_rate": 1.5405340302350876e-05, + "loss": 0.0476, + "step": 2466 + }, + { + "epoch": 0.9872, + "learning_rate": 1.538182511034708e-05, + "loss": 0.1551, + "step": 2468 + }, + { + "epoch": 0.988, + "learning_rate": 1.535826794978996e-05, + "loss": 0.1158, + "step": 2470 + }, + { + "epoch": 0.9888, + "learning_rate": 1.5334669004383036e-05, + "loss": 0.0024, + "step": 2472 + }, + { + "epoch": 0.9896, + "learning_rate": 1.5311028458155564e-05, + "loss": 0.3838, + "step": 2474 + }, + { + "epoch": 0.9904, + "learning_rate": 1.528734649546133e-05, + "loss": 0.1591, + "step": 2476 + }, + { + "epoch": 0.9912, + "learning_rate": 1.5263623300976997e-05, + "loss": 0.2347, + "step": 2478 + }, + { + "epoch": 0.992, + "learning_rate": 1.5239859059700792e-05, + "loss": 0.3711, + "step": 2480 + }, + { + "epoch": 0.9928, + "learning_rate": 1.5216053956951096e-05, + "loss": 0.11, + "step": 2482 + }, + { + "epoch": 0.9936, + "learning_rate": 1.5192208178364819e-05, + "loss": 0.2081, + "step": 2484 + }, + { + "epoch": 0.9944, + "learning_rate": 1.5168321909896176e-05, + "loss": 0.565, + "step": 2486 + }, + { + "epoch": 0.9952, + "learning_rate": 1.5144395337815057e-05, + "loss": 0.0098, + "step": 2488 + }, + { + "epoch": 0.996, + "learning_rate": 1.5120428648705722e-05, + "loss": 0.3171, + "step": 2490 + }, + { + "epoch": 0.9968, + "learning_rate": 1.5096422029465171e-05, + "loss": 0.1204, + "step": 2492 + }, + { + "epoch": 0.9976, + "learning_rate": 1.5072375667301904e-05, + "loss": 0.0847, + "step": 2494 + }, + { + "epoch": 0.9984, + "learning_rate": 1.5048289749734231e-05, + "loss": 0.1095, + "step": 2496 + }, + { + "epoch": 0.9992, + "learning_rate": 1.502416446458898e-05, + "loss": 0.0296, + "step": 2498 + }, + { + "epoch": 1.0, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.0231, + "step": 2500 + }, + { + "epoch": 1.0, + "step": 2500, + "total_flos": 1.4373314295758848e+16, + "train_loss": 0.2142522058448987, + "train_runtime": 13228.3808, + "train_samples_per_second": 3.024, + "train_steps_per_second": 0.189 + } + ], + "logging_steps": 2, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1.4373314295758848e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..e452ef7e6371a8adbfacc36f40351260cbaf2880 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d328c9e89c27e4cf7dfec491aa96a2350e36792d8f75942d1b13024ac7bbb8a0 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..07f48da18bedaf5acfb3061bd08a0e1c914a582b --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3837dd5a522246858491a93f23198c0722d79d638e89c530ad0ae9b655b672d +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..96de861d23d60d042e6b9c63f72f0e582018c605 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d9077276dba465a2475dc97639fc2b878407cf9e0a9fd4855cdb12adb85ada4 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7616bb45a0af551269d379a3bb38605384692b9 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fd8e05d1f3722a6a8f020b67229fb7e0133968f8599cb9160406f152b7c6298 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..79ec92ee9418bf5c1b6e005e0c99dd353af9b0ae --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,7532 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "learning_rate": 2.357535430610912e-06, + "loss": 0.5066, + "step": 2 + }, + { + "epoch": 0.0016, + "learning_rate": 2.3755748898855234e-06, + "loss": 0.4284, + "step": 4 + }, + { + "epoch": 0.0024, + "learning_rate": 2.3936738059587174e-06, + "loss": 0.54, + "step": 6 + }, + { + "epoch": 0.0032, + "learning_rate": 2.411832037691545e-06, + "loss": 0.223, + "step": 8 + }, + { + "epoch": 0.004, + "learning_rate": 2.430049443482434e-06, + "loss": 0.7785, + "step": 10 + }, + { + "epoch": 0.0048, + "learning_rate": 2.448325881268406e-06, + "loss": 0.2473, + "step": 12 + }, + { + "epoch": 0.0056, + "learning_rate": 2.4666612085261277e-06, + "loss": 0.6844, + "step": 14 + }, + { + "epoch": 0.0064, + "learning_rate": 2.4850552822730346e-06, + "loss": 0.2891, + "step": 16 + }, + { + "epoch": 0.0072, + "learning_rate": 2.503507959068455e-06, + "loss": 0.3879, + "step": 18 + }, + { + "epoch": 0.008, + "learning_rate": 2.522019095014686e-06, + "loss": 0.3056, + "step": 20 + }, + { + "epoch": 0.0088, + "learning_rate": 2.5405885457581814e-06, + "loss": 0.5168, + "step": 22 + }, + { + "epoch": 0.0096, + "learning_rate": 2.5592161664906243e-06, + "loss": 0.3891, + "step": 24 + }, + { + "epoch": 0.0104, + "learning_rate": 2.5779018119501086e-06, + "loss": 0.203, + "step": 26 + }, + { + "epoch": 0.0112, + "learning_rate": 2.596645336422219e-06, + "loss": 0.6583, + "step": 28 + }, + { + "epoch": 0.012, + "learning_rate": 2.615446593741161e-06, + "loss": 0.619, + "step": 30 + }, + { + "epoch": 0.0128, + "learning_rate": 2.6343054372909648e-06, + "loss": 0.4012, + "step": 32 + }, + { + "epoch": 0.0136, + "learning_rate": 2.6532217200065826e-06, + "loss": 0.6023, + "step": 34 + }, + { + "epoch": 0.0144, + "learning_rate": 2.6721952943750396e-06, + "loss": 0.2551, + "step": 36 + }, + { + "epoch": 0.0152, + "learning_rate": 2.691226012436604e-06, + "loss": 0.5627, + "step": 38 + }, + { + "epoch": 0.016, + "learning_rate": 2.7103137257858893e-06, + "loss": 0.4168, + "step": 40 + }, + { + "epoch": 0.0168, + "learning_rate": 2.7294582855730733e-06, + "loss": 0.286, + "step": 42 + }, + { + "epoch": 0.0176, + "learning_rate": 2.7486595425050566e-06, + "loss": 0.5306, + "step": 44 + }, + { + "epoch": 0.0184, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.4829, + "step": 46 + }, + { + "epoch": 0.0192, + "learning_rate": 2.7872315484213954e-06, + "loss": 0.2738, + "step": 48 + }, + { + "epoch": 0.02, + "learning_rate": 2.8066019966134873e-06, + "loss": 0.6816, + "step": 50 + }, + { + "epoch": 0.0208, + "learning_rate": 2.826028540368212e-06, + "loss": 0.4022, + "step": 52 + }, + { + "epoch": 0.0216, + "learning_rate": 2.845511028193477e-06, + "loss": 0.2627, + "step": 54 + }, + { + "epoch": 0.0224, + "learning_rate": 2.865049308160931e-06, + "loss": 0.4647, + "step": 56 + }, + { + "epoch": 0.0232, + "learning_rate": 2.8846432279071533e-06, + "loss": 0.0953, + "step": 58 + }, + { + "epoch": 0.024, + "learning_rate": 2.9042926346347835e-06, + "loss": 0.4157, + "step": 60 + }, + { + "epoch": 0.0248, + "learning_rate": 2.9239973751138397e-06, + "loss": 0.1798, + "step": 62 + }, + { + "epoch": 0.0256, + "learning_rate": 2.943757295682783e-06, + "loss": 0.8878, + "step": 64 + }, + { + "epoch": 0.0264, + "learning_rate": 2.9635722422497983e-06, + "loss": 0.3731, + "step": 66 + }, + { + "epoch": 0.0272, + "learning_rate": 2.983442060293926e-06, + "loss": 0.3373, + "step": 68 + }, + { + "epoch": 0.028, + "learning_rate": 3.003366594866345e-06, + "loss": 0.6968, + "step": 70 + }, + { + "epoch": 0.0288, + "learning_rate": 3.0233456905915338e-06, + "loss": 0.772, + "step": 72 + }, + { + "epoch": 0.0296, + "learning_rate": 3.0433791916684885e-06, + "loss": 0.5366, + "step": 74 + }, + { + "epoch": 0.0304, + "learning_rate": 3.0634669418719453e-06, + "loss": 0.473, + "step": 76 + }, + { + "epoch": 0.0312, + "learning_rate": 3.0836087845535933e-06, + "loss": 0.285, + "step": 78 + }, + { + "epoch": 0.032, + "learning_rate": 3.1038045626432945e-06, + "loss": 0.4266, + "step": 80 + }, + { + "epoch": 0.0328, + "learning_rate": 3.1240541186503173e-06, + "loss": 0.3658, + "step": 82 + }, + { + "epoch": 0.0336, + "learning_rate": 3.1443572946645683e-06, + "loss": 0.3782, + "step": 84 + }, + { + "epoch": 0.0344, + "learning_rate": 3.164713932357776e-06, + "loss": 0.5022, + "step": 86 + }, + { + "epoch": 0.0352, + "learning_rate": 3.1851238729848033e-06, + "loss": 0.2422, + "step": 88 + }, + { + "epoch": 0.036, + "learning_rate": 3.205586957384834e-06, + "loss": 0.5569, + "step": 90 + }, + { + "epoch": 0.0368, + "learning_rate": 3.2261030259826253e-06, + "loss": 0.5797, + "step": 92 + }, + { + "epoch": 0.0376, + "learning_rate": 3.246671918789752e-06, + "loss": 0.2737, + "step": 94 + }, + { + "epoch": 0.0384, + "learning_rate": 3.267293475405858e-06, + "loss": 0.4947, + "step": 96 + }, + { + "epoch": 0.0392, + "learning_rate": 3.2879675350199004e-06, + "loss": 0.374, + "step": 98 + }, + { + "epoch": 0.04, + "learning_rate": 3.3086939364114113e-06, + "loss": 0.5866, + "step": 100 + }, + { + "epoch": 0.0408, + "learning_rate": 3.329472517951747e-06, + "loss": 0.4331, + "step": 102 + }, + { + "epoch": 0.0416, + "learning_rate": 3.350303117605369e-06, + "loss": 0.3842, + "step": 104 + }, + { + "epoch": 0.0424, + "learning_rate": 3.3711855729310503e-06, + "loss": 0.2203, + "step": 106 + }, + { + "epoch": 0.0432, + "learning_rate": 3.3921197210832235e-06, + "loss": 0.7557, + "step": 108 + }, + { + "epoch": 0.044, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.6596, + "step": 110 + }, + { + "epoch": 0.0448, + "learning_rate": 3.434142442470434e-06, + "loss": 0.3937, + "step": 112 + }, + { + "epoch": 0.0456, + "learning_rate": 3.455230688003849e-06, + "loss": 0.8088, + "step": 114 + }, + { + "epoch": 0.0464, + "learning_rate": 3.476369970963065e-06, + "loss": 0.62, + "step": 116 + }, + { + "epoch": 0.0472, + "learning_rate": 3.497560126499706e-06, + "loss": 0.3125, + "step": 118 + }, + { + "epoch": 0.048, + "learning_rate": 3.5188009893686836e-06, + "loss": 0.1061, + "step": 120 + }, + { + "epoch": 0.0488, + "learning_rate": 3.5400923939294827e-06, + "loss": 0.7779, + "step": 122 + }, + { + "epoch": 0.0496, + "learning_rate": 3.5614341741474667e-06, + "loss": 0.4566, + "step": 124 + }, + { + "epoch": 0.0504, + "learning_rate": 3.5828261635951177e-06, + "loss": 0.2501, + "step": 126 + }, + { + "epoch": 0.0512, + "learning_rate": 3.604268195453421e-06, + "loss": 0.549, + "step": 128 + }, + { + "epoch": 0.052, + "learning_rate": 3.6257601025130893e-06, + "loss": 0.3728, + "step": 130 + }, + { + "epoch": 0.0528, + "learning_rate": 3.647301717175955e-06, + "loss": 0.1764, + "step": 132 + }, + { + "epoch": 0.0536, + "learning_rate": 3.66889287145614e-06, + "loss": 0.6742, + "step": 134 + }, + { + "epoch": 0.0544, + "learning_rate": 3.6905333969814995e-06, + "loss": 0.6854, + "step": 136 + }, + { + "epoch": 0.0552, + "learning_rate": 3.712223124994867e-06, + "loss": 0.1988, + "step": 138 + }, + { + "epoch": 0.056, + "learning_rate": 3.7339618863553885e-06, + "loss": 0.5939, + "step": 140 + }, + { + "epoch": 0.0568, + "learning_rate": 3.755749511539848e-06, + "loss": 0.3901, + "step": 142 + }, + { + "epoch": 0.0576, + "learning_rate": 3.7775858306439404e-06, + "loss": 0.3543, + "step": 144 + }, + { + "epoch": 0.0584, + "learning_rate": 3.799470673383677e-06, + "loss": 0.217, + "step": 146 + }, + { + "epoch": 0.0592, + "learning_rate": 3.821403869096644e-06, + "loss": 0.2542, + "step": 148 + }, + { + "epoch": 0.06, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.3439, + "step": 150 + }, + { + "epoch": 0.0608, + "learning_rate": 3.865414634908756e-06, + "loss": 0.4473, + "step": 152 + }, + { + "epoch": 0.0616, + "learning_rate": 3.887491861803081e-06, + "loss": 0.6661, + "step": 154 + }, + { + "epoch": 0.0624, + "learning_rate": 3.909616755263741e-06, + "loss": 0.6413, + "step": 156 + }, + { + "epoch": 0.0632, + "learning_rate": 3.9317891427563725e-06, + "loss": 0.4425, + "step": 158 + }, + { + "epoch": 0.064, + "learning_rate": 3.954008851376244e-06, + "loss": 0.3943, + "step": 160 + }, + { + "epoch": 0.0648, + "learning_rate": 3.976275707849619e-06, + "loss": 0.7404, + "step": 162 + }, + { + "epoch": 0.0656, + "learning_rate": 3.99858953853505e-06, + "loss": 0.466, + "step": 164 + }, + { + "epoch": 0.0664, + "learning_rate": 4.0209501694248e-06, + "loss": 0.585, + "step": 166 + }, + { + "epoch": 0.0672, + "learning_rate": 4.043357426146209e-06, + "loss": 0.9076, + "step": 168 + }, + { + "epoch": 0.068, + "learning_rate": 4.065811133962987e-06, + "loss": 0.4185, + "step": 170 + }, + { + "epoch": 0.0688, + "learning_rate": 4.08831111777658e-06, + "loss": 0.7192, + "step": 172 + }, + { + "epoch": 0.0696, + "learning_rate": 4.110857202127611e-06, + "loss": 0.9235, + "step": 174 + }, + { + "epoch": 0.0704, + "learning_rate": 4.133449211197183e-06, + "loss": 0.3489, + "step": 176 + }, + { + "epoch": 0.0712, + "learning_rate": 4.156086968808274e-06, + "loss": 0.4155, + "step": 178 + }, + { + "epoch": 0.072, + "learning_rate": 4.178770298427114e-06, + "loss": 0.1693, + "step": 180 + }, + { + "epoch": 0.0728, + "learning_rate": 4.201499023164515e-06, + "loss": 0.125, + "step": 182 + }, + { + "epoch": 0.0736, + "learning_rate": 4.224272965777315e-06, + "loss": 0.4994, + "step": 184 + }, + { + "epoch": 0.0744, + "learning_rate": 4.247091948669764e-06, + "loss": 0.4609, + "step": 186 + }, + { + "epoch": 0.0752, + "learning_rate": 4.269955793894849e-06, + "loss": 0.5455, + "step": 188 + }, + { + "epoch": 0.076, + "learning_rate": 4.292864323155684e-06, + "loss": 0.4301, + "step": 190 + }, + { + "epoch": 0.0768, + "learning_rate": 4.3158173578069696e-06, + "loss": 0.2036, + "step": 192 + }, + { + "epoch": 0.0776, + "learning_rate": 4.338814718856333e-06, + "loss": 0.2572, + "step": 194 + }, + { + "epoch": 0.0784, + "learning_rate": 4.3618562269657285e-06, + "loss": 0.2054, + "step": 196 + }, + { + "epoch": 0.0792, + "learning_rate": 4.384941702452852e-06, + "loss": 1.329, + "step": 198 + }, + { + "epoch": 0.08, + "learning_rate": 4.408070965292526e-06, + "loss": 0.4782, + "step": 200 + }, + { + "epoch": 0.0808, + "learning_rate": 4.431243835118112e-06, + "loss": 0.419, + "step": 202 + }, + { + "epoch": 0.0816, + "learning_rate": 4.4544601312229185e-06, + "loss": 0.8437, + "step": 204 + }, + { + "epoch": 0.0824, + "learning_rate": 4.477719672561602e-06, + "loss": 0.5603, + "step": 206 + }, + { + "epoch": 0.0832, + "learning_rate": 4.501022277751605e-06, + "loss": 0.5275, + "step": 208 + }, + { + "epoch": 0.084, + "learning_rate": 4.524367765074499e-06, + "loss": 0.6613, + "step": 210 + }, + { + "epoch": 0.0848, + "learning_rate": 4.5477559524775e-06, + "loss": 0.8573, + "step": 212 + }, + { + "epoch": 0.0856, + "learning_rate": 4.571186657574823e-06, + "loss": 0.462, + "step": 214 + }, + { + "epoch": 0.0864, + "learning_rate": 4.5946596976491254e-06, + "loss": 0.3821, + "step": 216 + }, + { + "epoch": 0.0872, + "learning_rate": 4.618174889652924e-06, + "loss": 0.523, + "step": 218 + }, + { + "epoch": 0.088, + "learning_rate": 4.6417320502100286e-06, + "loss": 0.7721, + "step": 220 + }, + { + "epoch": 0.0888, + "learning_rate": 4.665330995616967e-06, + "loss": 0.4027, + "step": 222 + }, + { + "epoch": 0.0896, + "learning_rate": 4.688971541844424e-06, + "loss": 0.4174, + "step": 224 + }, + { + "epoch": 0.0904, + "learning_rate": 4.712653504538672e-06, + "loss": 0.5057, + "step": 226 + }, + { + "epoch": 0.0912, + "learning_rate": 4.736376699023023e-06, + "loss": 0.3628, + "step": 228 + }, + { + "epoch": 0.092, + "learning_rate": 4.76014094029921e-06, + "loss": 0.2011, + "step": 230 + }, + { + "epoch": 0.0928, + "learning_rate": 4.7839460430489216e-06, + "loss": 0.5171, + "step": 232 + }, + { + "epoch": 0.0936, + "learning_rate": 4.807791821635185e-06, + "loss": 0.4866, + "step": 234 + }, + { + "epoch": 0.0944, + "learning_rate": 4.831678090103828e-06, + "loss": 0.6559, + "step": 236 + }, + { + "epoch": 0.0952, + "learning_rate": 4.855604662184931e-06, + "loss": 0.5001, + "step": 238 + }, + { + "epoch": 0.096, + "learning_rate": 4.8795713512942785e-06, + "loss": 0.5321, + "step": 240 + }, + { + "epoch": 0.0968, + "learning_rate": 4.903577970534815e-06, + "loss": 0.4153, + "step": 242 + }, + { + "epoch": 0.0976, + "learning_rate": 4.9276243326981e-06, + "loss": 0.241, + "step": 244 + }, + { + "epoch": 0.0984, + "learning_rate": 4.951710250265788e-06, + "loss": 0.3167, + "step": 246 + }, + { + "epoch": 0.0992, + "learning_rate": 4.975835535411023e-06, + "loss": 0.7548, + "step": 248 + }, + { + "epoch": 0.1, + "learning_rate": 5.000000000000003e-06, + "loss": 0.2297, + "step": 250 + }, + { + "epoch": 0.1008, + "learning_rate": 5.024203455593375e-06, + "loss": 0.4872, + "step": 252 + }, + { + "epoch": 0.1016, + "learning_rate": 5.048445713447734e-06, + "loss": 0.361, + "step": 254 + }, + { + "epoch": 0.1024, + "learning_rate": 5.072726584517083e-06, + "loss": 0.5079, + "step": 256 + }, + { + "epoch": 0.1032, + "learning_rate": 5.097045879454308e-06, + "loss": 0.6466, + "step": 258 + }, + { + "epoch": 0.104, + "learning_rate": 5.1214034086126685e-06, + "loss": 0.3846, + "step": 260 + }, + { + "epoch": 0.1048, + "learning_rate": 5.145798982047253e-06, + "loss": 0.4534, + "step": 262 + }, + { + "epoch": 0.1056, + "learning_rate": 5.170232409516483e-06, + "loss": 0.4074, + "step": 264 + }, + { + "epoch": 0.1064, + "learning_rate": 5.194703500483597e-06, + "loss": 0.1507, + "step": 266 + }, + { + "epoch": 0.1072, + "learning_rate": 5.219212064118082e-06, + "loss": 0.178, + "step": 268 + }, + { + "epoch": 0.108, + "learning_rate": 5.24375790929725e-06, + "loss": 0.3908, + "step": 270 + }, + { + "epoch": 0.1088, + "learning_rate": 5.268340844607653e-06, + "loss": 4.5376, + "step": 272 + }, + { + "epoch": 0.1096, + "learning_rate": 5.2929606783466735e-06, + "loss": 0.371, + "step": 274 + }, + { + "epoch": 0.1104, + "learning_rate": 5.317617218523853e-06, + "loss": 0.3113, + "step": 276 + }, + { + "epoch": 0.1112, + "learning_rate": 5.342310272862553e-06, + "loss": 0.3864, + "step": 278 + }, + { + "epoch": 0.112, + "learning_rate": 5.367039648801377e-06, + "loss": 0.4193, + "step": 280 + }, + { + "epoch": 0.1128, + "learning_rate": 5.391805153495684e-06, + "loss": 0.0807, + "step": 282 + }, + { + "epoch": 0.1136, + "learning_rate": 5.416606593819109e-06, + "loss": 0.7182, + "step": 284 + }, + { + "epoch": 0.1144, + "learning_rate": 5.441443776365005e-06, + "loss": 0.2478, + "step": 286 + }, + { + "epoch": 0.1152, + "learning_rate": 5.466316507448053e-06, + "loss": 1.1238, + "step": 288 + }, + { + "epoch": 0.116, + "learning_rate": 5.49122459310568e-06, + "loss": 0.4272, + "step": 290 + }, + { + "epoch": 0.1168, + "learning_rate": 5.516167839099662e-06, + "loss": 0.4764, + "step": 292 + }, + { + "epoch": 0.1176, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.3629, + "step": 294 + }, + { + "epoch": 0.1184, + "learning_rate": 5.5661590337742255e-06, + "loss": 0.5906, + "step": 296 + }, + { + "epoch": 0.1192, + "learning_rate": 5.591206592613412e-06, + "loss": 0.2451, + "step": 298 + }, + { + "epoch": 0.12, + "learning_rate": 5.616288532109221e-06, + "loss": 0.4, + "step": 300 + }, + { + "epoch": 0.1208, + "learning_rate": 5.641404656667652e-06, + "loss": 0.4016, + "step": 302 + }, + { + "epoch": 0.1216, + "learning_rate": 5.666554770428136e-06, + "loss": 0.3074, + "step": 304 + }, + { + "epoch": 0.1224, + "learning_rate": 5.6917386772650015e-06, + "loss": 0.3233, + "step": 306 + }, + { + "epoch": 0.1232, + "learning_rate": 5.716956180789086e-06, + "loss": 0.3573, + "step": 308 + }, + { + "epoch": 0.124, + "learning_rate": 5.74220708434926e-06, + "loss": 0.5949, + "step": 310 + }, + { + "epoch": 0.1248, + "learning_rate": 5.767491191033909e-06, + "loss": 0.4005, + "step": 312 + }, + { + "epoch": 0.1256, + "learning_rate": 5.7928083036724535e-06, + "loss": 0.311, + "step": 314 + }, + { + "epoch": 0.1264, + "learning_rate": 5.818158224836983e-06, + "loss": 0.4105, + "step": 316 + }, + { + "epoch": 0.1272, + "learning_rate": 5.8435407568437194e-06, + "loss": 1.6052, + "step": 318 + }, + { + "epoch": 0.128, + "learning_rate": 5.868955701754577e-06, + "loss": 0.2449, + "step": 320 + }, + { + "epoch": 0.1288, + "learning_rate": 5.894402861378714e-06, + "loss": 0.4014, + "step": 322 + }, + { + "epoch": 0.1296, + "learning_rate": 5.919882037274065e-06, + "loss": 0.6437, + "step": 324 + }, + { + "epoch": 0.1304, + "learning_rate": 5.9453930307488985e-06, + "loss": 0.3434, + "step": 326 + }, + { + "epoch": 0.1312, + "learning_rate": 5.970935642863362e-06, + "loss": 0.5164, + "step": 328 + }, + { + "epoch": 0.132, + "learning_rate": 5.996509674431038e-06, + "loss": 0.4935, + "step": 330 + }, + { + "epoch": 0.1328, + "learning_rate": 6.022114926020505e-06, + "loss": 0.5716, + "step": 332 + }, + { + "epoch": 0.1336, + "learning_rate": 6.047751197956836e-06, + "loss": 0.4474, + "step": 334 + }, + { + "epoch": 0.1344, + "learning_rate": 6.0734182903232475e-06, + "loss": 0.5938, + "step": 336 + }, + { + "epoch": 0.1352, + "learning_rate": 6.0991160029626e-06, + "loss": 0.2797, + "step": 338 + }, + { + "epoch": 0.136, + "learning_rate": 6.124844135478966e-06, + "loss": 0.4728, + "step": 340 + }, + { + "epoch": 0.1368, + "learning_rate": 6.1506024872392e-06, + "loss": 0.2753, + "step": 342 + }, + { + "epoch": 0.1376, + "learning_rate": 6.176390857374501e-06, + "loss": 0.8049, + "step": 344 + }, + { + "epoch": 0.1384, + "learning_rate": 6.202209044781979e-06, + "loss": 0.2738, + "step": 346 + }, + { + "epoch": 0.1392, + "learning_rate": 6.228056848126223e-06, + "loss": 0.3147, + "step": 348 + }, + { + "epoch": 0.14, + "learning_rate": 6.253934065840883e-06, + "loss": 0.3714, + "step": 350 + }, + { + "epoch": 0.1408, + "learning_rate": 6.279840496130188e-06, + "loss": 0.2173, + "step": 352 + }, + { + "epoch": 0.1416, + "learning_rate": 6.305775936970606e-06, + "loss": 0.5228, + "step": 354 + }, + { + "epoch": 0.1424, + "learning_rate": 6.331740186112359e-06, + "loss": 0.4728, + "step": 356 + }, + { + "epoch": 0.1432, + "learning_rate": 6.357733041081015e-06, + "loss": 0.5339, + "step": 358 + }, + { + "epoch": 0.144, + "learning_rate": 6.383754299179072e-06, + "loss": 0.5707, + "step": 360 + }, + { + "epoch": 0.1448, + "learning_rate": 6.409803757487532e-06, + "loss": 0.6757, + "step": 362 + }, + { + "epoch": 0.1456, + "learning_rate": 6.435881212867485e-06, + "loss": 0.4086, + "step": 364 + }, + { + "epoch": 0.1464, + "learning_rate": 6.4619864619616975e-06, + "loss": 0.6854, + "step": 366 + }, + { + "epoch": 0.1472, + "learning_rate": 6.48811930119619e-06, + "loss": 0.3832, + "step": 368 + }, + { + "epoch": 0.148, + "learning_rate": 6.514279526781853e-06, + "loss": 0.3497, + "step": 370 + }, + { + "epoch": 0.1488, + "learning_rate": 6.540466934715955e-06, + "loss": 0.4083, + "step": 372 + }, + { + "epoch": 0.1496, + "learning_rate": 6.566681320783848e-06, + "loss": 0.2446, + "step": 374 + }, + { + "epoch": 0.1504, + "learning_rate": 6.592922480560483e-06, + "loss": 0.1998, + "step": 376 + }, + { + "epoch": 0.1512, + "learning_rate": 6.619190209412025e-06, + "loss": 0.8796, + "step": 378 + }, + { + "epoch": 0.152, + "learning_rate": 6.6454843024974465e-06, + "loss": 0.4574, + "step": 380 + }, + { + "epoch": 0.1528, + "learning_rate": 6.671804554770128e-06, + "loss": 0.6307, + "step": 382 + }, + { + "epoch": 0.1536, + "learning_rate": 6.698150760979456e-06, + "loss": 0.3223, + "step": 384 + }, + { + "epoch": 0.1544, + "learning_rate": 6.724522715672421e-06, + "loss": 0.1816, + "step": 386 + }, + { + "epoch": 0.1552, + "learning_rate": 6.750920213195242e-06, + "loss": 0.2928, + "step": 388 + }, + { + "epoch": 0.156, + "learning_rate": 6.777343047694894e-06, + "loss": 0.3772, + "step": 390 + }, + { + "epoch": 0.1568, + "learning_rate": 6.803791013120824e-06, + "loss": 0.469, + "step": 392 + }, + { + "epoch": 0.1576, + "learning_rate": 6.8302639032264836e-06, + "loss": 0.3188, + "step": 394 + }, + { + "epoch": 0.1584, + "learning_rate": 6.856761511570944e-06, + "loss": 0.4061, + "step": 396 + }, + { + "epoch": 0.1592, + "learning_rate": 6.883283631520579e-06, + "loss": 0.2467, + "step": 398 + }, + { + "epoch": 0.16, + "learning_rate": 6.909830056250522e-06, + "loss": 0.4803, + "step": 400 + }, + { + "epoch": 0.1608, + "learning_rate": 6.936400578746436e-06, + "loss": 0.6734, + "step": 402 + }, + { + "epoch": 0.1616, + "learning_rate": 6.96299499180605e-06, + "loss": 0.4348, + "step": 404 + }, + { + "epoch": 0.1624, + "learning_rate": 6.989613088040787e-06, + "loss": 0.2338, + "step": 406 + }, + { + "epoch": 0.1632, + "learning_rate": 7.016254659877404e-06, + "loss": 0.6963, + "step": 408 + }, + { + "epoch": 0.164, + "learning_rate": 7.042919499559539e-06, + "loss": 0.3678, + "step": 410 + }, + { + "epoch": 0.1648, + "learning_rate": 7.06960739914943e-06, + "loss": 0.6086, + "step": 412 + }, + { + "epoch": 0.1656, + "learning_rate": 7.09631815052946e-06, + "loss": 0.4045, + "step": 414 + }, + { + "epoch": 0.1664, + "learning_rate": 7.123051545403873e-06, + "loss": 0.8015, + "step": 416 + }, + { + "epoch": 0.1672, + "learning_rate": 7.1498073753002375e-06, + "loss": 1.1483, + "step": 418 + }, + { + "epoch": 0.168, + "learning_rate": 7.1765854315712325e-06, + "loss": 0.7095, + "step": 420 + }, + { + "epoch": 0.1688, + "learning_rate": 7.203385505396197e-06, + "loss": 0.325, + "step": 422 + }, + { + "epoch": 0.1696, + "learning_rate": 7.230207387782771e-06, + "loss": 0.2765, + "step": 424 + }, + { + "epoch": 0.1704, + "learning_rate": 7.257050869568527e-06, + "loss": 0.3212, + "step": 426 + }, + { + "epoch": 0.1712, + "learning_rate": 7.28391574142262e-06, + "loss": 0.7222, + "step": 428 + }, + { + "epoch": 0.172, + "learning_rate": 7.3108017938473485e-06, + "loss": 0.1863, + "step": 430 + }, + { + "epoch": 0.1728, + "learning_rate": 7.337708817179875e-06, + "loss": 0.3413, + "step": 432 + }, + { + "epoch": 0.1736, + "learning_rate": 7.36463660159386e-06, + "loss": 0.4063, + "step": 434 + }, + { + "epoch": 0.1744, + "learning_rate": 7.39158493710103e-06, + "loss": 0.7361, + "step": 436 + }, + { + "epoch": 0.1752, + "learning_rate": 7.418553613552822e-06, + "loss": 0.3111, + "step": 438 + }, + { + "epoch": 0.176, + "learning_rate": 7.445542420642091e-06, + "loss": 0.4697, + "step": 440 + }, + { + "epoch": 0.1768, + "learning_rate": 7.472551147904703e-06, + "loss": 0.5171, + "step": 442 + }, + { + "epoch": 0.1776, + "learning_rate": 7.499579584721173e-06, + "loss": 0.334, + "step": 444 + }, + { + "epoch": 0.1784, + "learning_rate": 7.5266275203183395e-06, + "loss": 0.4657, + "step": 446 + }, + { + "epoch": 0.1792, + "learning_rate": 7.553694743770917e-06, + "loss": 0.6019, + "step": 448 + }, + { + "epoch": 0.18, + "learning_rate": 7.580781044003312e-06, + "loss": 0.7023, + "step": 450 + }, + { + "epoch": 0.1808, + "learning_rate": 7.607886209791095e-06, + "loss": 0.4337, + "step": 452 + }, + { + "epoch": 0.1816, + "learning_rate": 7.635010029762755e-06, + "loss": 0.1469, + "step": 454 + }, + { + "epoch": 0.1824, + "learning_rate": 7.662152292401265e-06, + "loss": 0.2539, + "step": 456 + }, + { + "epoch": 0.1832, + "learning_rate": 7.689312786045822e-06, + "loss": 0.392, + "step": 458 + }, + { + "epoch": 0.184, + "learning_rate": 7.716491298893441e-06, + "loss": 0.3791, + "step": 460 + }, + { + "epoch": 0.1848, + "learning_rate": 7.74368761900062e-06, + "loss": 0.7843, + "step": 462 + }, + { + "epoch": 0.1856, + "learning_rate": 7.770901534284991e-06, + "loss": 0.3139, + "step": 464 + }, + { + "epoch": 0.1864, + "learning_rate": 7.798132832526976e-06, + "loss": 0.2702, + "step": 466 + }, + { + "epoch": 0.1872, + "learning_rate": 7.825381301371444e-06, + "loss": 0.3664, + "step": 468 + }, + { + "epoch": 0.188, + "learning_rate": 7.852646728329358e-06, + "loss": 0.3896, + "step": 470 + }, + { + "epoch": 0.1888, + "learning_rate": 7.879928900779441e-06, + "loss": 0.3861, + "step": 472 + }, + { + "epoch": 0.1896, + "learning_rate": 7.907227605969852e-06, + "loss": 0.5974, + "step": 474 + }, + { + "epoch": 0.1904, + "learning_rate": 7.934542631019767e-06, + "loss": 0.3873, + "step": 476 + }, + { + "epoch": 0.1912, + "learning_rate": 7.961873762921153e-06, + "loss": 0.2682, + "step": 478 + }, + { + "epoch": 0.192, + "learning_rate": 7.989220788540351e-06, + "loss": 0.5066, + "step": 480 + }, + { + "epoch": 0.1928, + "learning_rate": 8.016583494619764e-06, + "loss": 0.6342, + "step": 482 + }, + { + "epoch": 0.1936, + "learning_rate": 8.043961667779511e-06, + "loss": 0.2558, + "step": 484 + }, + { + "epoch": 0.1944, + "learning_rate": 8.071355094519103e-06, + "loss": 0.3674, + "step": 486 + }, + { + "epoch": 0.1952, + "learning_rate": 8.098763561219089e-06, + "loss": 0.2631, + "step": 488 + }, + { + "epoch": 0.196, + "learning_rate": 8.126186854142744e-06, + "loss": 0.4233, + "step": 490 + }, + { + "epoch": 0.1968, + "learning_rate": 8.153624759437718e-06, + "loss": 0.4573, + "step": 492 + }, + { + "epoch": 0.1976, + "learning_rate": 8.181077063137735e-06, + "loss": 0.6095, + "step": 494 + }, + { + "epoch": 0.1984, + "learning_rate": 8.208543551164178e-06, + "loss": 0.6742, + "step": 496 + }, + { + "epoch": 0.1992, + "learning_rate": 8.236024009327877e-06, + "loss": 0.3291, + "step": 498 + }, + { + "epoch": 0.2, + "learning_rate": 8.263518223330695e-06, + "loss": 0.0596, + "step": 500 + }, + { + "epoch": 0.2008, + "learning_rate": 8.29102597876723e-06, + "loss": 0.4351, + "step": 502 + }, + { + "epoch": 0.2016, + "learning_rate": 8.31854706112648e-06, + "loss": 0.4415, + "step": 504 + }, + { + "epoch": 0.2024, + "learning_rate": 8.346081255793516e-06, + "loss": 0.211, + "step": 506 + }, + { + "epoch": 0.2032, + "learning_rate": 8.373628348051156e-06, + "loss": 0.6572, + "step": 508 + }, + { + "epoch": 0.204, + "learning_rate": 8.401188123081642e-06, + "loss": 0.1364, + "step": 510 + }, + { + "epoch": 0.2048, + "learning_rate": 8.428760365968329e-06, + "loss": 0.4927, + "step": 512 + }, + { + "epoch": 0.2056, + "learning_rate": 8.456344861697293e-06, + "loss": 0.8258, + "step": 514 + }, + { + "epoch": 0.2064, + "learning_rate": 8.483941395159114e-06, + "loss": 0.2837, + "step": 516 + }, + { + "epoch": 0.2072, + "learning_rate": 8.511549751150478e-06, + "loss": 0.6409, + "step": 518 + }, + { + "epoch": 0.208, + "learning_rate": 8.539169714375883e-06, + "loss": 0.2775, + "step": 520 + }, + { + "epoch": 0.2088, + "learning_rate": 8.566801069449304e-06, + "loss": 0.4633, + "step": 522 + }, + { + "epoch": 0.2096, + "learning_rate": 8.594443600895886e-06, + "loss": 0.6476, + "step": 524 + }, + { + "epoch": 0.2104, + "learning_rate": 8.622097093153612e-06, + "loss": 0.3053, + "step": 526 + }, + { + "epoch": 0.2112, + "learning_rate": 8.649761330575e-06, + "loss": 0.7951, + "step": 528 + }, + { + "epoch": 0.212, + "learning_rate": 8.677436097428766e-06, + "loss": 0.4397, + "step": 530 + }, + { + "epoch": 0.2128, + "learning_rate": 8.705121177901537e-06, + "loss": 0.3374, + "step": 532 + }, + { + "epoch": 0.2136, + "learning_rate": 8.732816356099459e-06, + "loss": 0.3922, + "step": 534 + }, + { + "epoch": 0.2144, + "learning_rate": 8.760521416049986e-06, + "loss": 0.4012, + "step": 536 + }, + { + "epoch": 0.2152, + "learning_rate": 8.788236141703477e-06, + "loss": 0.2116, + "step": 538 + }, + { + "epoch": 0.216, + "learning_rate": 8.81596031693499e-06, + "loss": 0.4522, + "step": 540 + }, + { + "epoch": 0.2168, + "learning_rate": 8.84369372554578e-06, + "loss": 0.4614, + "step": 542 + }, + { + "epoch": 0.2176, + "learning_rate": 8.87143615126518e-06, + "loss": 0.2794, + "step": 544 + }, + { + "epoch": 0.2184, + "learning_rate": 8.899187377752173e-06, + "loss": 0.4445, + "step": 546 + }, + { + "epoch": 0.2192, + "learning_rate": 8.926947188597127e-06, + "loss": 0.3272, + "step": 548 + }, + { + "epoch": 0.22, + "learning_rate": 8.954715367323473e-06, + "loss": 0.5668, + "step": 550 + }, + { + "epoch": 0.2208, + "learning_rate": 8.982491697389344e-06, + "loss": 0.2618, + "step": 552 + }, + { + "epoch": 0.2216, + "learning_rate": 9.010275962189356e-06, + "loss": 0.4631, + "step": 554 + }, + { + "epoch": 0.2224, + "learning_rate": 9.03806794505621e-06, + "loss": 0.2781, + "step": 556 + }, + { + "epoch": 0.2232, + "learning_rate": 9.065867429262497e-06, + "loss": 0.6424, + "step": 558 + }, + { + "epoch": 0.224, + "learning_rate": 9.093674198022198e-06, + "loss": 0.4665, + "step": 560 + }, + { + "epoch": 0.2248, + "learning_rate": 9.121488034492567e-06, + "loss": 0.1697, + "step": 562 + }, + { + "epoch": 0.2256, + "learning_rate": 9.149308721775717e-06, + "loss": 0.8246, + "step": 564 + }, + { + "epoch": 0.2264, + "learning_rate": 9.177136042920338e-06, + "loss": 0.3101, + "step": 566 + }, + { + "epoch": 0.2272, + "learning_rate": 9.204969780923396e-06, + "loss": 0.4792, + "step": 568 + }, + { + "epoch": 0.228, + "learning_rate": 9.232809718731822e-06, + "loss": 0.5229, + "step": 570 + }, + { + "epoch": 0.2288, + "learning_rate": 9.26065563924414e-06, + "loss": 0.5164, + "step": 572 + }, + { + "epoch": 0.2296, + "learning_rate": 9.288507325312319e-06, + "loss": 0.5728, + "step": 574 + }, + { + "epoch": 0.2304, + "learning_rate": 9.316364559743298e-06, + "loss": 0.556, + "step": 576 + }, + { + "epoch": 0.2312, + "learning_rate": 9.344227125300788e-06, + "loss": 0.621, + "step": 578 + }, + { + "epoch": 0.232, + "learning_rate": 9.372094804706867e-06, + "loss": 0.5573, + "step": 580 + }, + { + "epoch": 0.2328, + "learning_rate": 9.39996738064379e-06, + "loss": 0.8175, + "step": 582 + }, + { + "epoch": 0.2336, + "learning_rate": 9.427844635755615e-06, + "loss": 0.4101, + "step": 584 + }, + { + "epoch": 0.2344, + "learning_rate": 9.455726352649904e-06, + "loss": 0.528, + "step": 586 + }, + { + "epoch": 0.2352, + "learning_rate": 9.483612313899446e-06, + "loss": 0.5111, + "step": 588 + }, + { + "epoch": 0.236, + "learning_rate": 9.511502302043859e-06, + "loss": 0.3548, + "step": 590 + }, + { + "epoch": 0.2368, + "learning_rate": 9.539396099591469e-06, + "loss": 1.0182, + "step": 592 + }, + { + "epoch": 0.2376, + "learning_rate": 9.567293489020816e-06, + "loss": 0.7234, + "step": 594 + }, + { + "epoch": 0.2384, + "learning_rate": 9.595194252782461e-06, + "loss": 0.3013, + "step": 596 + }, + { + "epoch": 0.2392, + "learning_rate": 9.623098173300656e-06, + "loss": 0.4776, + "step": 598 + }, + { + "epoch": 0.24, + "learning_rate": 9.651005032974991e-06, + "loss": 0.3319, + "step": 600 + }, + { + "epoch": 0.2408, + "learning_rate": 9.678914614182184e-06, + "loss": 0.3854, + "step": 602 + }, + { + "epoch": 0.2416, + "learning_rate": 9.706826699277714e-06, + "loss": 0.4377, + "step": 604 + }, + { + "epoch": 0.2424, + "learning_rate": 9.734741070597535e-06, + "loss": 0.2744, + "step": 606 + }, + { + "epoch": 0.2432, + "learning_rate": 9.762657510459774e-06, + "loss": 0.3382, + "step": 608 + }, + { + "epoch": 0.244, + "learning_rate": 9.790575801166422e-06, + "loss": 0.2654, + "step": 610 + }, + { + "epoch": 0.2448, + "learning_rate": 9.818495725005043e-06, + "loss": 0.5038, + "step": 612 + }, + { + "epoch": 0.2456, + "learning_rate": 9.846417064250459e-06, + "loss": 0.2826, + "step": 614 + }, + { + "epoch": 0.2464, + "learning_rate": 9.874339601166479e-06, + "loss": 0.4985, + "step": 616 + }, + { + "epoch": 0.2472, + "learning_rate": 9.902263118007513e-06, + "loss": 0.7933, + "step": 618 + }, + { + "epoch": 0.248, + "learning_rate": 9.930187397020385e-06, + "loss": 0.4856, + "step": 620 + }, + { + "epoch": 0.2488, + "learning_rate": 9.95811222044596e-06, + "loss": 0.2805, + "step": 622 + }, + { + "epoch": 0.2496, + "learning_rate": 9.986037370520855e-06, + "loss": 0.2714, + "step": 624 + }, + { + "epoch": 0.2504, + "learning_rate": 1.0013962629479139e-05, + "loss": 0.779, + "step": 626 + }, + { + "epoch": 0.2512, + "learning_rate": 1.0041887779554034e-05, + "loss": 0.4763, + "step": 628 + }, + { + "epoch": 0.252, + "learning_rate": 1.0069812602979607e-05, + "loss": 0.2753, + "step": 630 + }, + { + "epoch": 0.2528, + "learning_rate": 1.0097736881992482e-05, + "loss": 0.4313, + "step": 632 + }, + { + "epoch": 0.2536, + "learning_rate": 1.0125660398833514e-05, + "loss": 0.3869, + "step": 634 + }, + { + "epoch": 0.2544, + "learning_rate": 1.0153582935749533e-05, + "loss": 0.2987, + "step": 636 + }, + { + "epoch": 0.2552, + "learning_rate": 1.0181504274994952e-05, + "loss": 0.236, + "step": 638 + }, + { + "epoch": 0.256, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.514, + "step": 640 + }, + { + "epoch": 0.2568, + "learning_rate": 1.0237342489540218e-05, + "loss": 0.3227, + "step": 642 + }, + { + "epoch": 0.2576, + "learning_rate": 1.0265258929402458e-05, + "loss": 0.4836, + "step": 644 + }, + { + "epoch": 0.2584, + "learning_rate": 1.029317330072228e-05, + "loss": 0.2998, + "step": 646 + }, + { + "epoch": 0.2592, + "learning_rate": 1.0321085385817811e-05, + "loss": 0.2966, + "step": 648 + }, + { + "epoch": 0.26, + "learning_rate": 1.0348994967025004e-05, + "loss": 0.4875, + "step": 650 + }, + { + "epoch": 0.2608, + "learning_rate": 1.0376901826699337e-05, + "loss": 0.4732, + "step": 652 + }, + { + "epoch": 0.2616, + "learning_rate": 1.0404805747217532e-05, + "loss": 0.189, + "step": 654 + }, + { + "epoch": 0.2624, + "learning_rate": 1.0432706510979175e-05, + "loss": 0.158, + "step": 656 + }, + { + "epoch": 0.2632, + "learning_rate": 1.0460603900408526e-05, + "loss": 0.1743, + "step": 658 + }, + { + "epoch": 0.264, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.2943, + "step": 660 + }, + { + "epoch": 0.2648, + "learning_rate": 1.0516387686100549e-05, + "loss": 0.2674, + "step": 662 + }, + { + "epoch": 0.2656, + "learning_rate": 1.054427364735009e-05, + "loss": 0.2781, + "step": 664 + }, + { + "epoch": 0.2664, + "learning_rate": 1.0572155364244378e-05, + "loss": 0.4538, + "step": 666 + }, + { + "epoch": 0.2672, + "learning_rate": 1.0600032619356203e-05, + "loss": 1.0262, + "step": 668 + }, + { + "epoch": 0.268, + "learning_rate": 1.0627905195293127e-05, + "loss": 0.7517, + "step": 670 + }, + { + "epoch": 0.2688, + "learning_rate": 1.0655772874699206e-05, + "loss": 0.4172, + "step": 672 + }, + { + "epoch": 0.2696, + "learning_rate": 1.0683635440256694e-05, + "loss": 0.7487, + "step": 674 + }, + { + "epoch": 0.2704, + "learning_rate": 1.0711492674687674e-05, + "loss": 0.4969, + "step": 676 + }, + { + "epoch": 0.2712, + "learning_rate": 1.0739344360755855e-05, + "loss": 0.7213, + "step": 678 + }, + { + "epoch": 0.272, + "learning_rate": 1.0767190281268171e-05, + "loss": 0.6534, + "step": 680 + }, + { + "epoch": 0.2728, + "learning_rate": 1.07950302190766e-05, + "loss": 0.4617, + "step": 682 + }, + { + "epoch": 0.2736, + "learning_rate": 1.0822863957079654e-05, + "loss": 0.2726, + "step": 684 + }, + { + "epoch": 0.2744, + "learning_rate": 1.0850691278224277e-05, + "loss": 0.4616, + "step": 686 + }, + { + "epoch": 0.2752, + "learning_rate": 1.0878511965507428e-05, + "loss": 0.7135, + "step": 688 + }, + { + "epoch": 0.276, + "learning_rate": 1.0906325801977795e-05, + "loss": 0.2675, + "step": 690 + }, + { + "epoch": 0.2768, + "learning_rate": 1.0934132570737497e-05, + "loss": 0.4599, + "step": 692 + }, + { + "epoch": 0.2776, + "learning_rate": 1.0961932054943785e-05, + "loss": 0.6568, + "step": 694 + }, + { + "epoch": 0.2784, + "learning_rate": 1.098972403781064e-05, + "loss": 1.1232, + "step": 696 + }, + { + "epoch": 0.2792, + "learning_rate": 1.101750830261065e-05, + "loss": 0.3807, + "step": 698 + }, + { + "epoch": 0.28, + "learning_rate": 1.104528463267652e-05, + "loss": 0.5539, + "step": 700 + }, + { + "epoch": 0.2808, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.7892, + "step": 702 + }, + { + "epoch": 0.2816, + "learning_rate": 1.1100812622247821e-05, + "loss": 0.6809, + "step": 704 + }, + { + "epoch": 0.2824, + "learning_rate": 1.1128563848734815e-05, + "loss": 0.4572, + "step": 706 + }, + { + "epoch": 0.2832, + "learning_rate": 1.1156306274454211e-05, + "loss": 0.5122, + "step": 708 + }, + { + "epoch": 0.284, + "learning_rate": 1.1184039683065002e-05, + "loss": 0.3382, + "step": 710 + }, + { + "epoch": 0.2848, + "learning_rate": 1.1211763858296516e-05, + "loss": 0.1458, + "step": 712 + }, + { + "epoch": 0.2856, + "learning_rate": 1.1239478583950007e-05, + "loss": 0.7479, + "step": 714 + }, + { + "epoch": 0.2864, + "learning_rate": 1.1267183643900534e-05, + "loss": 0.2173, + "step": 716 + }, + { + "epoch": 0.2872, + "learning_rate": 1.1294878822098456e-05, + "loss": 0.6472, + "step": 718 + }, + { + "epoch": 0.288, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.0933, + "step": 720 + }, + { + "epoch": 0.2888, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.5315, + "step": 722 + }, + { + "epoch": 0.2896, + "learning_rate": 1.137790290684638e-05, + "loss": 0.2551, + "step": 724 + }, + { + "epoch": 0.2904, + "learning_rate": 1.1405556399104108e-05, + "loss": 0.3734, + "step": 726 + }, + { + "epoch": 0.2912, + "learning_rate": 1.143319893055069e-05, + "loss": 0.2851, + "step": 728 + }, + { + "epoch": 0.292, + "learning_rate": 1.1460830285624112e-05, + "loss": 0.3993, + "step": 730 + }, + { + "epoch": 0.2928, + "learning_rate": 1.1488450248849515e-05, + "loss": 0.2586, + "step": 732 + }, + { + "epoch": 0.2936, + "learning_rate": 1.1516058604840881e-05, + "loss": 0.342, + "step": 734 + }, + { + "epoch": 0.2944, + "learning_rate": 1.15436551383027e-05, + "loss": 0.3236, + "step": 736 + }, + { + "epoch": 0.2952, + "learning_rate": 1.1571239634031666e-05, + "loss": 0.3265, + "step": 738 + }, + { + "epoch": 0.296, + "learning_rate": 1.1598811876918352e-05, + "loss": 0.8297, + "step": 740 + }, + { + "epoch": 0.2968, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.3934, + "step": 742 + }, + { + "epoch": 0.2976, + "learning_rate": 1.1653918744206478e-05, + "loss": 0.3055, + "step": 744 + }, + { + "epoch": 0.2984, + "learning_rate": 1.1681452938873515e-05, + "loss": 0.5374, + "step": 746 + }, + { + "epoch": 0.2992, + "learning_rate": 1.1708974021232763e-05, + "loss": 0.3936, + "step": 748 + }, + { + "epoch": 0.3, + "learning_rate": 1.1736481776669297e-05, + "loss": 0.2892, + "step": 750 + }, + { + "epoch": 0.3008, + "learning_rate": 1.1763975990672116e-05, + "loss": 0.4954, + "step": 752 + }, + { + "epoch": 0.3016, + "learning_rate": 1.1791456448835815e-05, + "loss": 0.3661, + "step": 754 + }, + { + "epoch": 0.3024, + "learning_rate": 1.1818922936862258e-05, + "loss": 0.4951, + "step": 756 + }, + { + "epoch": 0.3032, + "learning_rate": 1.1846375240562274e-05, + "loss": 0.8052, + "step": 758 + }, + { + "epoch": 0.304, + "learning_rate": 1.187381314585725e-05, + "loss": 0.3973, + "step": 760 + }, + { + "epoch": 0.3048, + "learning_rate": 1.1901236438780906e-05, + "loss": 0.3082, + "step": 762 + }, + { + "epoch": 0.3056, + "learning_rate": 1.192864490548089e-05, + "loss": 0.1613, + "step": 764 + }, + { + "epoch": 0.3064, + "learning_rate": 1.195603833222048e-05, + "loss": 0.143, + "step": 766 + }, + { + "epoch": 0.3072, + "learning_rate": 1.198341650538023e-05, + "loss": 0.4215, + "step": 768 + }, + { + "epoch": 0.308, + "learning_rate": 1.2010779211459642e-05, + "loss": 0.3123, + "step": 770 + }, + { + "epoch": 0.3088, + "learning_rate": 1.203812623707884e-05, + "loss": 0.2438, + "step": 772 + }, + { + "epoch": 0.3096, + "learning_rate": 1.2065457368980227e-05, + "loss": 0.1545, + "step": 774 + }, + { + "epoch": 0.3104, + "learning_rate": 1.2092772394030141e-05, + "loss": 0.435, + "step": 776 + }, + { + "epoch": 0.3112, + "learning_rate": 1.2120071099220552e-05, + "loss": 0.2771, + "step": 778 + }, + { + "epoch": 0.312, + "learning_rate": 1.2147353271670637e-05, + "loss": 0.7295, + "step": 780 + }, + { + "epoch": 0.3128, + "learning_rate": 1.217461869862855e-05, + "loss": 0.2875, + "step": 782 + }, + { + "epoch": 0.3136, + "learning_rate": 1.2201867167473015e-05, + "loss": 0.1907, + "step": 784 + }, + { + "epoch": 0.3144, + "learning_rate": 1.2229098465715002e-05, + "loss": 0.3547, + "step": 786 + }, + { + "epoch": 0.3152, + "learning_rate": 1.2256312380999373e-05, + "loss": 0.5093, + "step": 788 + }, + { + "epoch": 0.316, + "learning_rate": 1.2283508701106552e-05, + "loss": 0.3415, + "step": 790 + }, + { + "epoch": 0.3168, + "learning_rate": 1.2310687213954173e-05, + "loss": 0.2829, + "step": 792 + }, + { + "epoch": 0.3176, + "learning_rate": 1.233784770759873e-05, + "loss": 0.3377, + "step": 794 + }, + { + "epoch": 0.3184, + "learning_rate": 1.2364989970237238e-05, + "loss": 0.3137, + "step": 796 + }, + { + "epoch": 0.3192, + "learning_rate": 1.23921137902089e-05, + "loss": 0.2657, + "step": 798 + }, + { + "epoch": 0.32, + "learning_rate": 1.241921895599668e-05, + "loss": 0.1925, + "step": 800 + }, + { + "epoch": 0.3208, + "learning_rate": 1.2446305256229076e-05, + "loss": 0.4903, + "step": 802 + }, + { + "epoch": 0.3216, + "learning_rate": 1.2473372479681653e-05, + "loss": 0.4193, + "step": 804 + }, + { + "epoch": 0.3224, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.1719, + "step": 806 + }, + { + "epoch": 0.3232, + "learning_rate": 1.2527448852095292e-05, + "loss": 0.5688, + "step": 808 + }, + { + "epoch": 0.324, + "learning_rate": 1.2554457579357902e-05, + "loss": 0.8451, + "step": 810 + }, + { + "epoch": 0.3248, + "learning_rate": 1.2581446386447171e-05, + "loss": 0.4608, + "step": 812 + }, + { + "epoch": 0.3256, + "learning_rate": 1.2608415062898963e-05, + "loss": 0.426, + "step": 814 + }, + { + "epoch": 0.3264, + "learning_rate": 1.2635363398406133e-05, + "loss": 0.4211, + "step": 816 + }, + { + "epoch": 0.3272, + "learning_rate": 1.266229118282012e-05, + "loss": 0.9495, + "step": 818 + }, + { + "epoch": 0.328, + "learning_rate": 1.2689198206152644e-05, + "loss": 0.4161, + "step": 820 + }, + { + "epoch": 0.3288, + "learning_rate": 1.2716084258577373e-05, + "loss": 0.4325, + "step": 822 + }, + { + "epoch": 0.3296, + "learning_rate": 1.2742949130431468e-05, + "loss": 0.5616, + "step": 824 + }, + { + "epoch": 0.3304, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.5868, + "step": 826 + }, + { + "epoch": 0.3312, + "learning_rate": 1.2796614494603795e-05, + "loss": 0.5262, + "step": 828 + }, + { + "epoch": 0.332, + "learning_rate": 1.282341456842876e-05, + "loss": 0.401, + "step": 830 + }, + { + "epoch": 0.3328, + "learning_rate": 1.2850192624699756e-05, + "loss": 2.9712, + "step": 832 + }, + { + "epoch": 0.3336, + "learning_rate": 1.2876948454596122e-05, + "loss": 0.6492, + "step": 834 + }, + { + "epoch": 0.3344, + "learning_rate": 1.2903681849470535e-05, + "loss": 0.5945, + "step": 836 + }, + { + "epoch": 0.3352, + "learning_rate": 1.2930392600850565e-05, + "loss": 0.7923, + "step": 838 + }, + { + "epoch": 0.336, + "learning_rate": 1.2957080500440455e-05, + "loss": 0.4155, + "step": 840 + }, + { + "epoch": 0.3368, + "learning_rate": 1.2983745340122589e-05, + "loss": 0.2658, + "step": 842 + }, + { + "epoch": 0.3376, + "learning_rate": 1.3010386911959205e-05, + "loss": 0.2902, + "step": 844 + }, + { + "epoch": 0.3384, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.2692, + "step": 846 + }, + { + "epoch": 0.3392, + "learning_rate": 1.3063599421253556e-05, + "loss": 0.6142, + "step": 848 + }, + { + "epoch": 0.34, + "learning_rate": 1.309016994374947e-05, + "loss": 0.3521, + "step": 850 + }, + { + "epoch": 0.3408, + "learning_rate": 1.3116716368479415e-05, + "loss": 0.5977, + "step": 852 + }, + { + "epoch": 0.3416, + "learning_rate": 1.3143238488429049e-05, + "loss": 0.6295, + "step": 854 + }, + { + "epoch": 0.3424, + "learning_rate": 1.316973609677351e-05, + "loss": 0.4176, + "step": 856 + }, + { + "epoch": 0.3432, + "learning_rate": 1.319620898687917e-05, + "loss": 0.5002, + "step": 858 + }, + { + "epoch": 0.344, + "learning_rate": 1.32226569523051e-05, + "loss": 0.4966, + "step": 860 + }, + { + "epoch": 0.3448, + "learning_rate": 1.324907978680475e-05, + "loss": 0.2596, + "step": 862 + }, + { + "epoch": 0.3456, + "learning_rate": 1.3275477284327572e-05, + "loss": 0.3927, + "step": 864 + }, + { + "epoch": 0.3464, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.0975, + "step": 866 + }, + { + "epoch": 0.3472, + "learning_rate": 1.3328195445229865e-05, + "loss": 0.1328, + "step": 868 + }, + { + "epoch": 0.348, + "learning_rate": 1.3354515697502548e-05, + "loss": 0.3064, + "step": 870 + }, + { + "epoch": 0.3488, + "learning_rate": 1.338080979058797e-05, + "loss": 0.3346, + "step": 872 + }, + { + "epoch": 0.3496, + "learning_rate": 1.340707751943951e-05, + "loss": 0.4206, + "step": 874 + }, + { + "epoch": 0.3504, + "learning_rate": 1.3433318679216145e-05, + "loss": 0.1936, + "step": 876 + }, + { + "epoch": 0.3512, + "learning_rate": 1.3459533065284039e-05, + "loss": 0.8657, + "step": 878 + }, + { + "epoch": 0.352, + "learning_rate": 1.348572047321814e-05, + "loss": 0.5618, + "step": 880 + }, + { + "epoch": 0.3528, + "learning_rate": 1.3511880698803803e-05, + "loss": 0.6061, + "step": 882 + }, + { + "epoch": 0.3536, + "learning_rate": 1.3538013538038296e-05, + "loss": 0.5408, + "step": 884 + }, + { + "epoch": 0.3544, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.555, + "step": 886 + }, + { + "epoch": 0.3552, + "learning_rate": 1.3590196242512461e-05, + "loss": 0.6004, + "step": 888 + }, + { + "epoch": 0.356, + "learning_rate": 1.361624570082092e-05, + "loss": 0.3199, + "step": 890 + }, + { + "epoch": 0.3568, + "learning_rate": 1.364226695891898e-05, + "loss": 0.5311, + "step": 892 + }, + { + "epoch": 0.3576, + "learning_rate": 1.3668259813887637e-05, + "loss": 1.0644, + "step": 894 + }, + { + "epoch": 0.3584, + "learning_rate": 1.3694224063029386e-05, + "loss": 0.2981, + "step": 896 + }, + { + "epoch": 0.3592, + "learning_rate": 1.3720159503869806e-05, + "loss": 0.2743, + "step": 898 + }, + { + "epoch": 0.36, + "learning_rate": 1.374606593415911e-05, + "loss": 0.2481, + "step": 900 + }, + { + "epoch": 0.3608, + "learning_rate": 1.377194315187377e-05, + "loss": 0.5719, + "step": 902 + }, + { + "epoch": 0.3616, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.3525, + "step": 904 + }, + { + "epoch": 0.3624, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.2797, + "step": 906 + }, + { + "epoch": 0.3632, + "learning_rate": 1.3849397512760793e-05, + "loss": 0.2219, + "step": 908 + }, + { + "epoch": 0.364, + "learning_rate": 1.3875155864521027e-05, + "loss": 0.4071, + "step": 910 + }, + { + "epoch": 0.3648, + "learning_rate": 1.3900883997037393e-05, + "loss": 0.1693, + "step": 912 + }, + { + "epoch": 0.3656, + "learning_rate": 1.3926581709676746e-05, + "loss": 0.2184, + "step": 914 + }, + { + "epoch": 0.3664, + "learning_rate": 1.3952248802043158e-05, + "loss": 0.4943, + "step": 916 + }, + { + "epoch": 0.3672, + "learning_rate": 1.397788507397949e-05, + "loss": 0.4375, + "step": 918 + }, + { + "epoch": 0.368, + "learning_rate": 1.4003490325568956e-05, + "loss": 0.2513, + "step": 920 + }, + { + "epoch": 0.3688, + "learning_rate": 1.4029064357136632e-05, + "loss": 0.582, + "step": 922 + }, + { + "epoch": 0.3696, + "learning_rate": 1.4054606969251096e-05, + "loss": 0.5867, + "step": 924 + }, + { + "epoch": 0.3704, + "learning_rate": 1.4080117962725929e-05, + "loss": 0.6241, + "step": 926 + }, + { + "epoch": 0.3712, + "learning_rate": 1.4105597138621281e-05, + "loss": 0.1821, + "step": 928 + }, + { + "epoch": 0.372, + "learning_rate": 1.4131044298245416e-05, + "loss": 0.5264, + "step": 930 + }, + { + "epoch": 0.3728, + "learning_rate": 1.4156459243156275e-05, + "loss": 0.3223, + "step": 932 + }, + { + "epoch": 0.3736, + "learning_rate": 1.418184177516301e-05, + "loss": 0.5581, + "step": 934 + }, + { + "epoch": 0.3744, + "learning_rate": 1.420719169632754e-05, + "loss": 0.51, + "step": 936 + }, + { + "epoch": 0.3752, + "learning_rate": 1.4232508808966085e-05, + "loss": 0.2135, + "step": 938 + }, + { + "epoch": 0.376, + "learning_rate": 1.4257792915650735e-05, + "loss": 0.4519, + "step": 940 + }, + { + "epoch": 0.3768, + "learning_rate": 1.4283043819210906e-05, + "loss": 0.0925, + "step": 942 + }, + { + "epoch": 0.3776, + "learning_rate": 1.430826132273499e-05, + "loss": 0.5629, + "step": 944 + }, + { + "epoch": 0.3784, + "learning_rate": 1.4333445229571857e-05, + "loss": 0.4294, + "step": 946 + }, + { + "epoch": 0.3792, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.2303, + "step": 948 + }, + { + "epoch": 0.38, + "learning_rate": 1.4383711467890772e-05, + "loss": 0.7224, + "step": 950 + }, + { + "epoch": 0.3808, + "learning_rate": 1.4408793407386584e-05, + "loss": 0.7681, + "step": 952 + }, + { + "epoch": 0.3816, + "learning_rate": 1.4433840966225767e-05, + "loss": 0.1611, + "step": 954 + }, + { + "epoch": 0.3824, + "learning_rate": 1.4458853949082434e-05, + "loss": 0.3258, + "step": 956 + }, + { + "epoch": 0.3832, + "learning_rate": 1.4483832160900332e-05, + "loss": 0.2089, + "step": 958 + }, + { + "epoch": 0.384, + "learning_rate": 1.4508775406894315e-05, + "loss": 0.3231, + "step": 960 + }, + { + "epoch": 0.3848, + "learning_rate": 1.4533683492551942e-05, + "loss": 0.1896, + "step": 962 + }, + { + "epoch": 0.3856, + "learning_rate": 1.4558556223634988e-05, + "loss": 0.1308, + "step": 964 + }, + { + "epoch": 0.3864, + "learning_rate": 1.4583393406180886e-05, + "loss": 0.645, + "step": 966 + }, + { + "epoch": 0.3872, + "learning_rate": 1.460819484650431e-05, + "loss": 0.7102, + "step": 968 + }, + { + "epoch": 0.388, + "learning_rate": 1.4632960351198618e-05, + "loss": 0.4555, + "step": 970 + }, + { + "epoch": 0.3888, + "learning_rate": 1.4657689727137441e-05, + "loss": 0.573, + "step": 972 + }, + { + "epoch": 0.3896, + "learning_rate": 1.468238278147614e-05, + "loss": 0.459, + "step": 974 + }, + { + "epoch": 0.3904, + "learning_rate": 1.470703932165332e-05, + "loss": 0.4586, + "step": 976 + }, + { + "epoch": 0.3912, + "learning_rate": 1.4731659155392339e-05, + "loss": 0.2938, + "step": 978 + }, + { + "epoch": 0.392, + "learning_rate": 1.4756242090702744e-05, + "loss": 0.4641, + "step": 980 + }, + { + "epoch": 0.3928, + "learning_rate": 1.4780787935881913e-05, + "loss": 0.2242, + "step": 982 + }, + { + "epoch": 0.3936, + "learning_rate": 1.4805296499516397e-05, + "loss": 0.5384, + "step": 984 + }, + { + "epoch": 0.3944, + "learning_rate": 1.482976759048351e-05, + "loss": 0.368, + "step": 986 + }, + { + "epoch": 0.3952, + "learning_rate": 1.485420101795274e-05, + "loss": 0.4566, + "step": 988 + }, + { + "epoch": 0.396, + "learning_rate": 1.4878596591387327e-05, + "loss": 0.267, + "step": 990 + }, + { + "epoch": 0.3968, + "learning_rate": 1.4902954120545686e-05, + "loss": 0.3369, + "step": 992 + }, + { + "epoch": 0.3976, + "learning_rate": 1.4927273415482913e-05, + "loss": 0.3975, + "step": 994 + }, + { + "epoch": 0.3984, + "learning_rate": 1.4951554286552261e-05, + "loss": 0.2294, + "step": 996 + }, + { + "epoch": 0.3992, + "learning_rate": 1.4975796544406617e-05, + "loss": 0.7224, + "step": 998 + }, + { + "epoch": 0.4, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.4933, + "step": 1000 + }, + { + "epoch": 0.4008, + "learning_rate": 1.502416446458897e-05, + "loss": 0.5847, + "step": 1002 + }, + { + "epoch": 0.4016, + "learning_rate": 1.5048289749734206e-05, + "loss": 0.5089, + "step": 1004 + }, + { + "epoch": 0.4024, + "learning_rate": 1.5072375667301895e-05, + "loss": 0.3371, + "step": 1006 + }, + { + "epoch": 0.4032, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.2545, + "step": 1008 + }, + { + "epoch": 0.404, + "learning_rate": 1.5120428648705714e-05, + "loss": 0.3811, + "step": 1010 + }, + { + "epoch": 0.4048, + "learning_rate": 1.5144395337815064e-05, + "loss": 0.4458, + "step": 1012 + }, + { + "epoch": 0.4056, + "learning_rate": 1.5168321909896166e-05, + "loss": 0.3858, + "step": 1014 + }, + { + "epoch": 0.4064, + "learning_rate": 1.5192208178364808e-05, + "loss": 0.8033, + "step": 1016 + }, + { + "epoch": 0.4072, + "learning_rate": 1.521605395695107e-05, + "loss": 0.4742, + "step": 1018 + }, + { + "epoch": 0.408, + "learning_rate": 1.5239859059700784e-05, + "loss": 0.5258, + "step": 1020 + }, + { + "epoch": 0.4088, + "learning_rate": 1.526362330097697e-05, + "loss": 0.2302, + "step": 1022 + }, + { + "epoch": 0.4096, + "learning_rate": 1.5287346495461322e-05, + "loss": 0.5003, + "step": 1024 + }, + { + "epoch": 0.4104, + "learning_rate": 1.531102845815557e-05, + "loss": 0.1691, + "step": 1026 + }, + { + "epoch": 0.4112, + "learning_rate": 1.5334669004383025e-05, + "loss": 0.3695, + "step": 1028 + }, + { + "epoch": 0.412, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.4745, + "step": 1030 + }, + { + "epoch": 0.4128, + "learning_rate": 1.5381825110347072e-05, + "loss": 0.3486, + "step": 1032 + }, + { + "epoch": 0.4136, + "learning_rate": 1.540534030235087e-05, + "loss": 0.2189, + "step": 1034 + }, + { + "epoch": 0.4144, + "learning_rate": 1.542881334242517e-05, + "loss": 0.5796, + "step": 1036 + }, + { + "epoch": 0.4152, + "learning_rate": 1.5452244047522493e-05, + "loss": 0.2266, + "step": 1038 + }, + { + "epoch": 0.416, + "learning_rate": 1.5475632234925495e-05, + "loss": 0.3786, + "step": 1040 + }, + { + "epoch": 0.4168, + "learning_rate": 1.5498977722248388e-05, + "loss": 0.3394, + "step": 1042 + }, + { + "epoch": 0.4176, + "learning_rate": 1.552228032743839e-05, + "loss": 0.4539, + "step": 1044 + }, + { + "epoch": 0.4184, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.3738, + "step": 1046 + }, + { + "epoch": 0.4192, + "learning_rate": 1.556875616488188e-05, + "loss": 0.2871, + "step": 1048 + }, + { + "epoch": 0.42, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.3148, + "step": 1050 + }, + { + "epoch": 0.4208, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.3577, + "step": 1052 + }, + { + "epoch": 0.4216, + "learning_rate": 1.5638143773034268e-05, + "loss": 0.4161, + "step": 1054 + }, + { + "epoch": 0.4224, + "learning_rate": 1.5661185281143663e-05, + "loss": 0.4687, + "step": 1056 + }, + { + "epoch": 0.4232, + "learning_rate": 1.5684182642193024e-05, + "loss": 0.3008, + "step": 1058 + }, + { + "epoch": 0.424, + "learning_rate": 1.5707135676844312e-05, + "loss": 0.1481, + "step": 1060 + }, + { + "epoch": 0.4248, + "learning_rate": 1.5730044206105146e-05, + "loss": 0.6407, + "step": 1062 + }, + { + "epoch": 0.4256, + "learning_rate": 1.5752908051330232e-05, + "loss": 0.5111, + "step": 1064 + }, + { + "epoch": 0.4264, + "learning_rate": 1.577572703422268e-05, + "loss": 0.2776, + "step": 1066 + }, + { + "epoch": 0.4272, + "learning_rate": 1.579850097683548e-05, + "loss": 0.3443, + "step": 1068 + }, + { + "epoch": 0.428, + "learning_rate": 1.582122970157288e-05, + "loss": 0.3981, + "step": 1070 + }, + { + "epoch": 0.4288, + "learning_rate": 1.5843913031191722e-05, + "loss": 0.2054, + "step": 1072 + }, + { + "epoch": 0.4296, + "learning_rate": 1.586655078880281e-05, + "loss": 0.4513, + "step": 1074 + }, + { + "epoch": 0.4304, + "learning_rate": 1.5889142797872383e-05, + "loss": 0.4396, + "step": 1076 + }, + { + "epoch": 0.4312, + "learning_rate": 1.5911688882223415e-05, + "loss": 0.182, + "step": 1078 + }, + { + "epoch": 0.432, + "learning_rate": 1.5934188866037007e-05, + "loss": 0.5474, + "step": 1080 + }, + { + "epoch": 0.4328, + "learning_rate": 1.5956642573853787e-05, + "loss": 0.2979, + "step": 1082 + }, + { + "epoch": 0.4336, + "learning_rate": 1.5979049830575193e-05, + "loss": 0.2033, + "step": 1084 + }, + { + "epoch": 0.4344, + "learning_rate": 1.6001410461464945e-05, + "loss": 0.5218, + "step": 1086 + }, + { + "epoch": 0.4352, + "learning_rate": 1.6023724292150377e-05, + "loss": 0.7608, + "step": 1088 + }, + { + "epoch": 0.436, + "learning_rate": 1.604599114862375e-05, + "loss": 0.1863, + "step": 1090 + }, + { + "epoch": 0.4368, + "learning_rate": 1.606821085724362e-05, + "loss": 0.2661, + "step": 1092 + }, + { + "epoch": 0.4376, + "learning_rate": 1.6090383244736253e-05, + "loss": 0.2176, + "step": 1094 + }, + { + "epoch": 0.4384, + "learning_rate": 1.6112508138196912e-05, + "loss": 0.2362, + "step": 1096 + }, + { + "epoch": 0.4392, + "learning_rate": 1.613458536509124e-05, + "loss": 0.317, + "step": 1098 + }, + { + "epoch": 0.44, + "learning_rate": 1.615661475325658e-05, + "loss": 0.803, + "step": 1100 + }, + { + "epoch": 0.4408, + "learning_rate": 1.6178596130903352e-05, + "loss": 0.5381, + "step": 1102 + }, + { + "epoch": 0.4416, + "learning_rate": 1.620052932661632e-05, + "loss": 0.607, + "step": 1104 + }, + { + "epoch": 0.4424, + "learning_rate": 1.6222414169356056e-05, + "loss": 1.8422, + "step": 1106 + }, + { + "epoch": 0.4432, + "learning_rate": 1.6244250488460146e-05, + "loss": 0.4935, + "step": 1108 + }, + { + "epoch": 0.444, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.4026, + "step": 1110 + }, + { + "epoch": 0.4448, + "learning_rate": 1.6287776875005127e-05, + "loss": 0.5309, + "step": 1112 + }, + { + "epoch": 0.4456, + "learning_rate": 1.6309466603018497e-05, + "loss": 0.672, + "step": 1114 + }, + { + "epoch": 0.4464, + "learning_rate": 1.6331107128543856e-05, + "loss": 0.7134, + "step": 1116 + }, + { + "epoch": 0.4472, + "learning_rate": 1.635269828282404e-05, + "loss": 0.4279, + "step": 1118 + }, + { + "epoch": 0.448, + "learning_rate": 1.6374239897486905e-05, + "loss": 0.5922, + "step": 1120 + }, + { + "epoch": 0.4488, + "learning_rate": 1.6395731804546575e-05, + "loss": 0.2563, + "step": 1122 + }, + { + "epoch": 0.4496, + "learning_rate": 1.6417173836404878e-05, + "loss": 0.3628, + "step": 1124 + }, + { + "epoch": 0.4504, + "learning_rate": 1.643856582585253e-05, + "loss": 0.5537, + "step": 1126 + }, + { + "epoch": 0.4512, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.499, + "step": 1128 + }, + { + "epoch": 0.452, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.4621, + "step": 1130 + }, + { + "epoch": 0.4528, + "learning_rate": 1.650243987350029e-05, + "loss": 0.3665, + "step": 1132 + }, + { + "epoch": 0.4536, + "learning_rate": 1.652363002903693e-05, + "loss": 0.5138, + "step": 1134 + }, + { + "epoch": 0.4544, + "learning_rate": 1.6544769311996146e-05, + "loss": 0.36, + "step": 1136 + }, + { + "epoch": 0.4552, + "learning_rate": 1.656585755752956e-05, + "loss": 0.433, + "step": 1138 + }, + { + "epoch": 0.456, + "learning_rate": 1.65868946011868e-05, + "loss": 0.2838, + "step": 1140 + }, + { + "epoch": 0.4568, + "learning_rate": 1.660788027891677e-05, + "loss": 0.574, + "step": 1142 + }, + { + "epoch": 0.4576, + "learning_rate": 1.6628814427068944e-05, + "loss": 0.2731, + "step": 1144 + }, + { + "epoch": 0.4584, + "learning_rate": 1.6649696882394625e-05, + "loss": 0.2417, + "step": 1146 + }, + { + "epoch": 0.4592, + "learning_rate": 1.667052748204825e-05, + "loss": 0.9602, + "step": 1148 + }, + { + "epoch": 0.46, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.1413, + "step": 1150 + }, + { + "epoch": 0.4608, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.3732, + "step": 1152 + }, + { + "epoch": 0.4616, + "learning_rate": 1.6732706524594138e-05, + "loss": 0.2055, + "step": 1154 + }, + { + "epoch": 0.4624, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.4441, + "step": 1156 + }, + { + "epoch": 0.4632, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.9756, + "step": 1158 + }, + { + "epoch": 0.464, + "learning_rate": 1.679441304261516e-05, + "loss": 0.2989, + "step": 1160 + }, + { + "epoch": 0.4648, + "learning_rate": 1.681487612701519e-05, + "loss": 0.4563, + "step": 1162 + }, + { + "epoch": 0.4656, + "learning_rate": 1.683528606764222e-05, + "loss": 0.4705, + "step": 1164 + }, + { + "epoch": 0.4664, + "learning_rate": 1.6855642705335428e-05, + "loss": 0.8693, + "step": 1166 + }, + { + "epoch": 0.4672, + "learning_rate": 1.687594588134968e-05, + "loss": 0.1037, + "step": 1168 + }, + { + "epoch": 0.468, + "learning_rate": 1.68961954373567e-05, + "loss": 0.8448, + "step": 1170 + }, + { + "epoch": 0.4688, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.4903, + "step": 1172 + }, + { + "epoch": 0.4696, + "learning_rate": 1.693653305812805e-05, + "loss": 0.262, + "step": 1174 + }, + { + "epoch": 0.4704, + "learning_rate": 1.6956620808331505e-05, + "loss": 2.3711, + "step": 1176 + }, + { + "epoch": 0.4712, + "learning_rate": 1.697665430940846e-05, + "loss": 0.1705, + "step": 1178 + }, + { + "epoch": 0.472, + "learning_rate": 1.699663340513365e-05, + "loss": 0.3903, + "step": 1180 + }, + { + "epoch": 0.4728, + "learning_rate": 1.7016557939706068e-05, + "loss": 0.2667, + "step": 1182 + }, + { + "epoch": 0.4736, + "learning_rate": 1.7036427757750198e-05, + "loss": 0.221, + "step": 1184 + }, + { + "epoch": 0.4744, + "learning_rate": 1.7056242704317212e-05, + "loss": 0.3477, + "step": 1186 + }, + { + "epoch": 0.4752, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.7181, + "step": 1188 + }, + { + "epoch": 0.476, + "learning_rate": 1.709570736536521e-05, + "loss": 1.1246, + "step": 1190 + }, + { + "epoch": 0.4768, + "learning_rate": 1.7115356772092844e-05, + "loss": 0.3987, + "step": 1192 + }, + { + "epoch": 0.4776, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.3578, + "step": 1194 + }, + { + "epoch": 0.4784, + "learning_rate": 1.7154488971806518e-05, + "loss": 0.5649, + "step": 1196 + }, + { + "epoch": 0.4792, + "learning_rate": 1.7173971459631783e-05, + "loss": 0.3612, + "step": 1198 + }, + { + "epoch": 0.48, + "learning_rate": 1.7193398003386507e-05, + "loss": 0.6612, + "step": 1200 + }, + { + "epoch": 0.4808, + "learning_rate": 1.7212768451578602e-05, + "loss": 0.3363, + "step": 1202 + }, + { + "epoch": 0.4816, + "learning_rate": 1.7232082653153416e-05, + "loss": 0.6457, + "step": 1204 + }, + { + "epoch": 0.4824, + "learning_rate": 1.7251340457494937e-05, + "loss": 0.5465, + "step": 1206 + }, + { + "epoch": 0.4832, + "learning_rate": 1.7270541714426923e-05, + "loss": 0.314, + "step": 1208 + }, + { + "epoch": 0.484, + "learning_rate": 1.7289686274214106e-05, + "loss": 0.2807, + "step": 1210 + }, + { + "epoch": 0.4848, + "learning_rate": 1.7308773987563393e-05, + "loss": 0.5126, + "step": 1212 + }, + { + "epoch": 0.4856, + "learning_rate": 1.732780470562496e-05, + "loss": 0.4914, + "step": 1214 + }, + { + "epoch": 0.4864, + "learning_rate": 1.7346778279993413e-05, + "loss": 0.2799, + "step": 1216 + }, + { + "epoch": 0.4872, + "learning_rate": 1.736569456270903e-05, + "loss": 0.3901, + "step": 1218 + }, + { + "epoch": 0.488, + "learning_rate": 1.7384553406258836e-05, + "loss": 0.9799, + "step": 1220 + }, + { + "epoch": 0.4888, + "learning_rate": 1.740335466357778e-05, + "loss": 0.5404, + "step": 1222 + }, + { + "epoch": 0.4896, + "learning_rate": 1.7422098188049888e-05, + "loss": 0.459, + "step": 1224 + }, + { + "epoch": 0.4904, + "learning_rate": 1.7440783833509373e-05, + "loss": 0.2855, + "step": 1226 + }, + { + "epoch": 0.4912, + "learning_rate": 1.7459411454241816e-05, + "loss": 0.4176, + "step": 1228 + }, + { + "epoch": 0.492, + "learning_rate": 1.747798090498531e-05, + "loss": 0.2373, + "step": 1230 + }, + { + "epoch": 0.4928, + "learning_rate": 1.749649204093154e-05, + "loss": 0.1595, + "step": 1232 + }, + { + "epoch": 0.4936, + "learning_rate": 1.7514944717726962e-05, + "loss": 0.5574, + "step": 1234 + }, + { + "epoch": 0.4944, + "learning_rate": 1.753333879147387e-05, + "loss": 0.5671, + "step": 1236 + }, + { + "epoch": 0.4952, + "learning_rate": 1.755167411873159e-05, + "loss": 0.4137, + "step": 1238 + }, + { + "epoch": 0.496, + "learning_rate": 1.7569950556517563e-05, + "loss": 0.2345, + "step": 1240 + }, + { + "epoch": 0.4968, + "learning_rate": 1.758816796230845e-05, + "loss": 0.3545, + "step": 1242 + }, + { + "epoch": 0.4976, + "learning_rate": 1.7606326194041278e-05, + "loss": 0.4018, + "step": 1244 + }, + { + "epoch": 0.4984, + "learning_rate": 1.762442511011447e-05, + "loss": 0.3273, + "step": 1246 + }, + { + "epoch": 0.4992, + "learning_rate": 1.7642464569389083e-05, + "loss": 0.4224, + "step": 1248 + }, + { + "epoch": 0.5, + "learning_rate": 1.766044443118977e-05, + "loss": 0.3708, + "step": 1250 + }, + { + "epoch": 0.5008, + "learning_rate": 1.767836455530598e-05, + "loss": 0.2254, + "step": 1252 + }, + { + "epoch": 0.5016, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.6273, + "step": 1254 + }, + { + "epoch": 0.5024, + "learning_rate": 1.77140250319729e-05, + "loss": 0.3763, + "step": 1256 + }, + { + "epoch": 0.5032, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.8509, + "step": 1258 + }, + { + "epoch": 0.504, + "learning_rate": 1.7749444887041793e-05, + "loss": 0.3393, + "step": 1260 + }, + { + "epoch": 0.5048, + "learning_rate": 1.776706423591959e-05, + "loss": 0.3212, + "step": 1262 + }, + { + "epoch": 0.5056, + "learning_rate": 1.778462301567023e-05, + "loss": 0.4857, + "step": 1264 + }, + { + "epoch": 0.5064, + "learning_rate": 1.7802121089366832e-05, + "loss": 0.4235, + "step": 1266 + }, + { + "epoch": 0.5072, + "learning_rate": 1.7819558320555895e-05, + "loss": 0.4269, + "step": 1268 + }, + { + "epoch": 0.508, + "learning_rate": 1.7836934573258392e-05, + "loss": 0.212, + "step": 1270 + }, + { + "epoch": 0.5088, + "learning_rate": 1.785424971197082e-05, + "loss": 0.974, + "step": 1272 + }, + { + "epoch": 0.5096, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.7842, + "step": 1274 + }, + { + "epoch": 0.5104, + "learning_rate": 1.7888696107795343e-05, + "loss": 0.7938, + "step": 1276 + }, + { + "epoch": 0.5112, + "learning_rate": 1.790582709628753e-05, + "loss": 0.3575, + "step": 1278 + }, + { + "epoch": 0.512, + "learning_rate": 1.7922896433551903e-05, + "loss": 0.2405, + "step": 1280 + }, + { + "epoch": 0.5128, + "learning_rate": 1.793990398647835e-05, + "loss": 0.3137, + "step": 1282 + }, + { + "epoch": 0.5136, + "learning_rate": 1.795684962243855e-05, + "loss": 0.3059, + "step": 1284 + }, + { + "epoch": 0.5144, + "learning_rate": 1.7973733209287032e-05, + "loss": 0.2006, + "step": 1286 + }, + { + "epoch": 0.5152, + "learning_rate": 1.7990554615362193e-05, + "loss": 0.2252, + "step": 1288 + }, + { + "epoch": 0.516, + "learning_rate": 1.800731370948734e-05, + "loss": 0.4147, + "step": 1290 + }, + { + "epoch": 0.5168, + "learning_rate": 1.802401036097167e-05, + "loss": 0.5745, + "step": 1292 + }, + { + "epoch": 0.5176, + "learning_rate": 1.804064443961135e-05, + "loss": 0.1498, + "step": 1294 + }, + { + "epoch": 0.5184, + "learning_rate": 1.8057215815690494e-05, + "loss": 0.4047, + "step": 1296 + }, + { + "epoch": 0.5192, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.7676, + "step": 1298 + }, + { + "epoch": 0.52, + "learning_rate": 1.809016994374947e-05, + "loss": 0.4026, + "step": 1300 + }, + { + "epoch": 0.5208, + "learning_rate": 1.81065524387464e-05, + "loss": 0.3772, + "step": 1302 + }, + { + "epoch": 0.5216, + "learning_rate": 1.8122871717218968e-05, + "loss": 0.0441, + "step": 1304 + }, + { + "epoch": 0.5224, + "learning_rate": 1.8139127651906176e-05, + "loss": 0.1758, + "step": 1306 + }, + { + "epoch": 0.5232, + "learning_rate": 1.8155320116040976e-05, + "loss": 1.3785, + "step": 1308 + }, + { + "epoch": 0.524, + "learning_rate": 1.817144898335129e-05, + "loss": 0.3662, + "step": 1310 + }, + { + "epoch": 0.5248, + "learning_rate": 1.818751412806095e-05, + "loss": 0.4931, + "step": 1312 + }, + { + "epoch": 0.5256, + "learning_rate": 1.8203515424890738e-05, + "loss": 0.4942, + "step": 1314 + }, + { + "epoch": 0.5264, + "learning_rate": 1.8219452749059322e-05, + "loss": 0.5083, + "step": 1316 + }, + { + "epoch": 0.5272, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.5566, + "step": 1318 + }, + { + "epoch": 0.528, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.1276, + "step": 1320 + }, + { + "epoch": 0.5288, + "learning_rate": 1.826687964527355e-05, + "loss": 0.3966, + "step": 1322 + }, + { + "epoch": 0.5296, + "learning_rate": 1.828255984097604e-05, + "loss": 1.0448, + "step": 1324 + }, + { + "epoch": 0.5304, + "learning_rate": 1.8298175447613093e-05, + "loss": 0.6915, + "step": 1326 + }, + { + "epoch": 0.5312, + "learning_rate": 1.8313726343411092e-05, + "loss": 0.3926, + "step": 1328 + }, + { + "epoch": 0.532, + "learning_rate": 1.8329212407101e-05, + "loss": 0.197, + "step": 1330 + }, + { + "epoch": 0.5328, + "learning_rate": 1.8344633517919394e-05, + "loss": 0.167, + "step": 1332 + }, + { + "epoch": 0.5336, + "learning_rate": 1.8359989555609344e-05, + "loss": 0.0678, + "step": 1334 + }, + { + "epoch": 0.5344, + "learning_rate": 1.8375280400421407e-05, + "loss": 0.4975, + "step": 1336 + }, + { + "epoch": 0.5352, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.1489, + "step": 1338 + }, + { + "epoch": 0.536, + "learning_rate": 1.8405666034956842e-05, + "loss": 1.777, + "step": 1340 + }, + { + "epoch": 0.5368, + "learning_rate": 1.842076058772692e-05, + "loss": 0.0837, + "step": 1342 + }, + { + "epoch": 0.5376, + "learning_rate": 1.8435789473714384e-05, + "loss": 0.8077, + "step": 1344 + }, + { + "epoch": 0.5384, + "learning_rate": 1.8450752575720964e-05, + "loss": 0.2715, + "step": 1346 + }, + { + "epoch": 0.5392, + "learning_rate": 1.8465649777061384e-05, + "loss": 0.398, + "step": 1348 + }, + { + "epoch": 0.54, + "learning_rate": 1.8480480961564266e-05, + "loss": 0.3737, + "step": 1350 + }, + { + "epoch": 0.5408, + "learning_rate": 1.8495246013573047e-05, + "loss": 0.2856, + "step": 1352 + }, + { + "epoch": 0.5416, + "learning_rate": 1.850994481794691e-05, + "loss": 0.2185, + "step": 1354 + }, + { + "epoch": 0.5424, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.4094, + "step": 1356 + }, + { + "epoch": 0.5432, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.3481, + "step": 1358 + }, + { + "epoch": 0.544, + "learning_rate": 1.8553642601605066e-05, + "loss": 0.2152, + "step": 1360 + }, + { + "epoch": 0.5448, + "learning_rate": 1.856807527437643e-05, + "loss": 0.7451, + "step": 1362 + }, + { + "epoch": 0.5456, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.2519, + "step": 1364 + }, + { + "epoch": 0.5464, + "learning_rate": 1.859674006117491e-05, + "loss": 0.2673, + "step": 1366 + }, + { + "epoch": 0.5472, + "learning_rate": 1.8610971951668268e-05, + "loss": 0.315, + "step": 1368 + }, + { + "epoch": 0.548, + "learning_rate": 1.862513669207257e-05, + "loss": 0.2114, + "step": 1370 + }, + { + "epoch": 0.5488, + "learning_rate": 1.8639234171928348e-05, + "loss": 0.2307, + "step": 1372 + }, + { + "epoch": 0.5496, + "learning_rate": 1.8653264281300612e-05, + "loss": 0.175, + "step": 1374 + }, + { + "epoch": 0.5504, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.5248, + "step": 1376 + }, + { + "epoch": 0.5512, + "learning_rate": 1.8681121951482393e-05, + "loss": 0.5769, + "step": 1378 + }, + { + "epoch": 0.552, + "learning_rate": 1.869494929505219e-05, + "loss": 1.1836, + "step": 1380 + }, + { + "epoch": 0.5528, + "learning_rate": 1.870870883366075e-05, + "loss": 0.1642, + "step": 1382 + }, + { + "epoch": 0.5536, + "learning_rate": 1.8722400460008434e-05, + "loss": 0.8945, + "step": 1384 + }, + { + "epoch": 0.5544, + "learning_rate": 1.8736024067325195e-05, + "loss": 0.5492, + "step": 1386 + }, + { + "epoch": 0.5552, + "learning_rate": 1.8749579549371373e-05, + "loss": 0.2745, + "step": 1388 + }, + { + "epoch": 0.556, + "learning_rate": 1.876306680043863e-05, + "loss": 0.361, + "step": 1390 + }, + { + "epoch": 0.5568, + "learning_rate": 1.8776485715350665e-05, + "loss": 0.5676, + "step": 1392 + }, + { + "epoch": 0.5576, + "learning_rate": 1.878983618946409e-05, + "loss": 0.5793, + "step": 1394 + }, + { + "epoch": 0.5584, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.3842, + "step": 1396 + }, + { + "epoch": 0.5592, + "learning_rate": 1.881633139939087e-05, + "loss": 0.3279, + "step": 1398 + }, + { + "epoch": 0.56, + "learning_rate": 1.882947592858927e-05, + "loss": 0.4662, + "step": 1400 + }, + { + "epoch": 0.5608, + "learning_rate": 1.884255160376072e-05, + "loss": 0.2877, + "step": 1402 + }, + { + "epoch": 0.5616, + "learning_rate": 1.885555832293849e-05, + "loss": 0.5547, + "step": 1404 + }, + { + "epoch": 0.5624, + "learning_rate": 1.886849598469356e-05, + "loss": 0.8496, + "step": 1406 + }, + { + "epoch": 0.5632, + "learning_rate": 1.888136448813544e-05, + "loss": 0.7592, + "step": 1408 + }, + { + "epoch": 0.564, + "learning_rate": 1.8894163732912972e-05, + "loss": 0.227, + "step": 1410 + }, + { + "epoch": 0.5648, + "learning_rate": 1.890689361921506e-05, + "loss": 0.4809, + "step": 1412 + }, + { + "epoch": 0.5656, + "learning_rate": 1.891955404777151e-05, + "loss": 0.4551, + "step": 1414 + }, + { + "epoch": 0.5664, + "learning_rate": 1.893214491985374e-05, + "loss": 0.7215, + "step": 1416 + }, + { + "epoch": 0.5672, + "learning_rate": 1.89446661372756e-05, + "loss": 0.1362, + "step": 1418 + }, + { + "epoch": 0.568, + "learning_rate": 1.895711760239413e-05, + "loss": 0.5627, + "step": 1420 + }, + { + "epoch": 0.5688, + "learning_rate": 1.89694992181103e-05, + "loss": 0.3542, + "step": 1422 + }, + { + "epoch": 0.5696, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.9833, + "step": 1424 + }, + { + "epoch": 0.5704, + "learning_rate": 1.8994052515663708e-05, + "loss": 0.3746, + "step": 1426 + }, + { + "epoch": 0.5712, + "learning_rate": 1.90062240060294e-05, + "loss": 0.5797, + "step": 1428 + }, + { + "epoch": 0.572, + "learning_rate": 1.9018325264051136e-05, + "loss": 1.0416, + "step": 1430 + }, + { + "epoch": 0.5728, + "learning_rate": 1.9030356195360868e-05, + "loss": 0.682, + "step": 1432 + }, + { + "epoch": 0.5736, + "learning_rate": 1.904231670613899e-05, + "loss": 0.1686, + "step": 1434 + }, + { + "epoch": 0.5744, + "learning_rate": 1.905420670311502e-05, + "loss": 0.5803, + "step": 1436 + }, + { + "epoch": 0.5752, + "learning_rate": 1.906602609356838e-05, + "loss": 0.3824, + "step": 1438 + }, + { + "epoch": 0.576, + "learning_rate": 1.9077774785329078e-05, + "loss": 0.4151, + "step": 1440 + }, + { + "epoch": 0.5768, + "learning_rate": 1.9089452686778487e-05, + "loss": 0.5139, + "step": 1442 + }, + { + "epoch": 0.5776, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.3103, + "step": 1444 + }, + { + "epoch": 0.5784, + "learning_rate": 1.911259575502962e-05, + "loss": 0.4288, + "step": 1446 + }, + { + "epoch": 0.5792, + "learning_rate": 1.912406074135706e-05, + "loss": 0.2531, + "step": 1448 + }, + { + "epoch": 0.58, + "learning_rate": 1.9135454576426006e-05, + "loss": 0.6024, + "step": 1450 + }, + { + "epoch": 0.5808, + "learning_rate": 1.9146777171385053e-05, + "loss": 0.201, + "step": 1452 + }, + { + "epoch": 0.5816, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.3086, + "step": 1454 + }, + { + "epoch": 0.5824, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.5072, + "step": 1456 + }, + { + "epoch": 0.5832, + "learning_rate": 1.9180316635425876e-05, + "loss": 0.2633, + "step": 1458 + }, + { + "epoch": 0.584, + "learning_rate": 1.9191353392552346e-05, + "loss": 1.0178, + "step": 1460 + }, + { + "epoch": 0.5848, + "learning_rate": 1.9202318473658703e-05, + "loss": 0.2655, + "step": 1462 + }, + { + "epoch": 0.5856, + "learning_rate": 1.9213211793237052e-05, + "loss": 0.5568, + "step": 1464 + }, + { + "epoch": 0.5864, + "learning_rate": 1.92240332663391e-05, + "loss": 0.5739, + "step": 1466 + }, + { + "epoch": 0.5872, + "learning_rate": 1.923478280857682e-05, + "loss": 0.2075, + "step": 1468 + }, + { + "epoch": 0.588, + "learning_rate": 1.924546033612313e-05, + "loss": 0.4666, + "step": 1470 + }, + { + "epoch": 0.5888, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.9465, + "step": 1472 + }, + { + "epoch": 0.5896, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.4405, + "step": 1474 + }, + { + "epoch": 0.5904, + "learning_rate": 1.927706000077034e-05, + "loss": 0.2985, + "step": 1476 + }, + { + "epoch": 0.5912, + "learning_rate": 1.9287448642521507e-05, + "loss": 0.2449, + "step": 1478 + }, + { + "epoch": 0.592, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.7689, + "step": 1480 + }, + { + "epoch": 0.5928, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.8106, + "step": 1482 + }, + { + "epoch": 0.5936, + "learning_rate": 1.9318179694207722e-05, + "loss": 0.7092, + "step": 1484 + }, + { + "epoch": 0.5944, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.1927, + "step": 1486 + }, + { + "epoch": 0.5952, + "learning_rate": 1.9338303869951266e-05, + "loss": 0.4652, + "step": 1488 + }, + { + "epoch": 0.596, + "learning_rate": 1.934825676396015e-05, + "loss": 0.3873, + "step": 1490 + }, + { + "epoch": 0.5968, + "learning_rate": 1.935813675838491e-05, + "loss": 0.3876, + "step": 1492 + }, + { + "epoch": 0.5976, + "learning_rate": 1.9367943776179375e-05, + "loss": 0.3785, + "step": 1494 + }, + { + "epoch": 0.5984, + "learning_rate": 1.9377677740866457e-05, + "loss": 0.3013, + "step": 1496 + }, + { + "epoch": 0.5992, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.6693, + "step": 1498 + }, + { + "epoch": 0.6, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.3456, + "step": 1500 + }, + { + "epoch": 0.6008, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.3013, + "step": 1502 + }, + { + "epoch": 0.6016, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.5545, + "step": 1504 + }, + { + "epoch": 0.6024, + "learning_rate": 1.942524913090354e-05, + "loss": 0.1985, + "step": 1506 + }, + { + "epoch": 0.6032, + "learning_rate": 1.9434543202870723e-05, + "loss": 0.3867, + "step": 1508 + }, + { + "epoch": 0.604, + "learning_rate": 1.9443763702374815e-05, + "loss": 0.4621, + "step": 1510 + }, + { + "epoch": 0.6048, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.5818, + "step": 1512 + }, + { + "epoch": 0.6056, + "learning_rate": 1.9461983696954756e-05, + "loss": 0.1312, + "step": 1514 + }, + { + "epoch": 0.6064, + "learning_rate": 1.947098304994744e-05, + "loss": 0.4279, + "step": 1516 + }, + { + "epoch": 0.6072, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.4722, + "step": 1518 + }, + { + "epoch": 0.608, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.5362, + "step": 1520 + }, + { + "epoch": 0.6088, + "learning_rate": 1.949753769132067e-05, + "loss": 0.518, + "step": 1522 + }, + { + "epoch": 0.6096, + "learning_rate": 1.95062412024896e-05, + "loss": 0.9091, + "step": 1524 + }, + { + "epoch": 0.6104, + "learning_rate": 1.951487058208003e-05, + "loss": 0.5898, + "step": 1526 + }, + { + "epoch": 0.6112, + "learning_rate": 1.952342576279833e-05, + "loss": 0.3218, + "step": 1528 + }, + { + "epoch": 0.612, + "learning_rate": 1.953190667792947e-05, + "loss": 0.2644, + "step": 1530 + }, + { + "epoch": 0.6128, + "learning_rate": 1.9540313261337578e-05, + "loss": 0.4739, + "step": 1532 + }, + { + "epoch": 0.6136, + "learning_rate": 1.954864544746643e-05, + "loss": 0.7892, + "step": 1534 + }, + { + "epoch": 0.6144, + "learning_rate": 1.955690317133996e-05, + "loss": 0.3955, + "step": 1536 + }, + { + "epoch": 0.6152, + "learning_rate": 1.956508636856278e-05, + "loss": 0.3319, + "step": 1538 + }, + { + "epoch": 0.616, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.4159, + "step": 1540 + }, + { + "epoch": 0.6168, + "learning_rate": 1.95812289283811e-05, + "loss": 0.2276, + "step": 1542 + }, + { + "epoch": 0.6176, + "learning_rate": 1.958918816509367e-05, + "loss": 0.4349, + "step": 1544 + }, + { + "epoch": 0.6184, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.5578, + "step": 1546 + }, + { + "epoch": 0.6192, + "learning_rate": 1.9604882241787496e-05, + "loss": 0.3256, + "step": 1548 + }, + { + "epoch": 0.62, + "learning_rate": 1.9612616959383187e-05, + "loss": 0.333, + "step": 1550 + }, + { + "epoch": 0.6208, + "learning_rate": 1.9620276715860856e-05, + "loss": 0.4473, + "step": 1552 + }, + { + "epoch": 0.6216, + "learning_rate": 1.9627861451488187e-05, + "loss": 0.3564, + "step": 1554 + }, + { + "epoch": 0.6224, + "learning_rate": 1.963537110711789e-05, + "loss": 0.2724, + "step": 1556 + }, + { + "epoch": 0.6232, + "learning_rate": 1.964280562418815e-05, + "loss": 0.2938, + "step": 1558 + }, + { + "epoch": 0.624, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.9231, + "step": 1560 + }, + { + "epoch": 0.6248, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.6229, + "step": 1562 + }, + { + "epoch": 0.6256, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.7667, + "step": 1564 + }, + { + "epoch": 0.6264, + "learning_rate": 1.967179115615633e-05, + "loss": 0.4632, + "step": 1566 + }, + { + "epoch": 0.6272, + "learning_rate": 1.967884912252619e-05, + "loss": 0.2185, + "step": 1568 + }, + { + "epoch": 0.628, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.2786, + "step": 1570 + }, + { + "epoch": 0.6288, + "learning_rate": 1.969273856798585e-05, + "loss": 0.2322, + "step": 1572 + }, + { + "epoch": 0.6296, + "learning_rate": 1.9699569938762972e-05, + "loss": 0.289, + "step": 1574 + }, + { + "epoch": 0.6304, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.3928, + "step": 1576 + }, + { + "epoch": 0.6312, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.2933, + "step": 1578 + }, + { + "epoch": 0.632, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.3232, + "step": 1580 + }, + { + "epoch": 0.6328, + "learning_rate": 1.9726138506049434e-05, + "loss": 0.2699, + "step": 1582 + }, + { + "epoch": 0.6336, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.6497, + "step": 1584 + }, + { + "epoch": 0.6344, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.3046, + "step": 1586 + }, + { + "epoch": 0.6352, + "learning_rate": 1.974526872786577e-05, + "loss": 0.3025, + "step": 1588 + }, + { + "epoch": 0.636, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.5236, + "step": 1590 + }, + { + "epoch": 0.6368, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.5004, + "step": 1592 + }, + { + "epoch": 0.6376, + "learning_rate": 1.976371499316945e-05, + "loss": 0.357, + "step": 1594 + }, + { + "epoch": 0.6384, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.9836, + "step": 1596 + }, + { + "epoch": 0.6392, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.226, + "step": 1598 + }, + { + "epoch": 0.64, + "learning_rate": 1.9781476007338054e-05, + "loss": 0.0414, + "step": 1600 + }, + { + "epoch": 0.6408, + "learning_rate": 1.978724385052766e-05, + "loss": 0.5458, + "step": 1602 + }, + { + "epoch": 0.6416, + "learning_rate": 1.9792935370823673e-05, + "loss": 0.2798, + "step": 1604 + }, + { + "epoch": 0.6424, + "learning_rate": 1.979855052384247e-05, + "loss": 0.6075, + "step": 1606 + }, + { + "epoch": 0.6432, + "learning_rate": 1.9804089265795956e-05, + "loss": 0.4444, + "step": 1608 + }, + { + "epoch": 0.644, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.4185, + "step": 1610 + }, + { + "epoch": 0.6448, + "learning_rate": 1.981493734433433e-05, + "loss": 0.3061, + "step": 1612 + }, + { + "epoch": 0.6456, + "learning_rate": 1.982024659632372e-05, + "loss": 0.3893, + "step": 1614 + }, + { + "epoch": 0.6464, + "learning_rate": 1.9825479268057472e-05, + "loss": 0.1499, + "step": 1616 + }, + { + "epoch": 0.6472, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.5627, + "step": 1618 + }, + { + "epoch": 0.648, + "learning_rate": 1.9835714708133858e-05, + "loss": 0.2208, + "step": 1620 + }, + { + "epoch": 0.6488, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.1886, + "step": 1622 + }, + { + "epoch": 0.6496, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.4447, + "step": 1624 + }, + { + "epoch": 0.6504, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.1425, + "step": 1626 + }, + { + "epoch": 0.6512, + "learning_rate": 1.985526486983063e-05, + "loss": 0.5016, + "step": 1628 + }, + { + "epoch": 0.652, + "learning_rate": 1.985996037070505e-05, + "loss": 0.5538, + "step": 1630 + }, + { + "epoch": 0.6528, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.438, + "step": 1632 + }, + { + "epoch": 0.6536, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.0923, + "step": 1634 + }, + { + "epoch": 0.6544, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.3436, + "step": 1636 + }, + { + "epoch": 0.6552, + "learning_rate": 1.987797311751759e-05, + "loss": 0.4172, + "step": 1638 + }, + { + "epoch": 0.656, + "learning_rate": 1.9882283814465528e-05, + "loss": 0.2756, + "step": 1640 + }, + { + "epoch": 0.6568, + "learning_rate": 1.988651744737914e-05, + "loss": 0.367, + "step": 1642 + }, + { + "epoch": 0.6576, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.4439, + "step": 1644 + }, + { + "epoch": 0.6584, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.3351, + "step": 1646 + }, + { + "epoch": 0.6592, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.2895, + "step": 1648 + }, + { + "epoch": 0.66, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.7238, + "step": 1650 + }, + { + "epoch": 0.6608, + "learning_rate": 1.9906528516965014e-05, + "loss": 0.6581, + "step": 1652 + }, + { + "epoch": 0.6616, + "learning_rate": 1.9910299093414926e-05, + "loss": 0.3748, + "step": 1654 + }, + { + "epoch": 0.6624, + "learning_rate": 1.9913992387361744e-05, + "loss": 0.4014, + "step": 1656 + }, + { + "epoch": 0.6632, + "learning_rate": 1.9917608370004414e-05, + "loss": 1.1257, + "step": 1658 + }, + { + "epoch": 0.664, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.4292, + "step": 1660 + }, + { + "epoch": 0.6648, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.9405, + "step": 1662 + }, + { + "epoch": 0.6656, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.4015, + "step": 1664 + }, + { + "epoch": 0.6664, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.8005, + "step": 1666 + }, + { + "epoch": 0.6672, + "learning_rate": 1.9934527647833276e-05, + "loss": 0.2537, + "step": 1668 + }, + { + "epoch": 0.668, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.2554, + "step": 1670 + }, + { + "epoch": 0.6688, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.3409, + "step": 1672 + }, + { + "epoch": 0.6696, + "learning_rate": 1.994374976712348e-05, + "loss": 0.6588, + "step": 1674 + }, + { + "epoch": 0.6704, + "learning_rate": 1.994666875152874e-05, + "loss": 0.5613, + "step": 1676 + }, + { + "epoch": 0.6712, + "learning_rate": 1.9949510169813003e-05, + "loss": 0.2924, + "step": 1678 + }, + { + "epoch": 0.672, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.2271, + "step": 1680 + }, + { + "epoch": 0.6728, + "learning_rate": 1.995496021999177e-05, + "loss": 0.3869, + "step": 1682 + }, + { + "epoch": 0.6736, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.1274, + "step": 1684 + }, + { + "epoch": 0.6744, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.1654, + "step": 1686 + }, + { + "epoch": 0.6752, + "learning_rate": 1.996255301507125e-05, + "loss": 0.2258, + "step": 1688 + }, + { + "epoch": 0.676, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.4476, + "step": 1690 + }, + { + "epoch": 0.6768, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.315, + "step": 1692 + }, + { + "epoch": 0.6776, + "learning_rate": 1.996944660387867e-05, + "loss": 0.9127, + "step": 1694 + }, + { + "epoch": 0.6784, + "learning_rate": 1.997158900260614e-05, + "loss": 0.5285, + "step": 1696 + }, + { + "epoch": 0.6792, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.3819, + "step": 1698 + }, + { + "epoch": 0.68, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.6786, + "step": 1700 + }, + { + "epoch": 0.6808, + "learning_rate": 1.997754957226847e-05, + "loss": 0.3319, + "step": 1702 + }, + { + "epoch": 0.6816, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.4224, + "step": 1704 + }, + { + "epoch": 0.6824, + "learning_rate": 1.9981134276520828e-05, + "loss": 1.0829, + "step": 1706 + }, + { + "epoch": 0.6832, + "learning_rate": 1.998280988314872e-05, + "loss": 0.3782, + "step": 1708 + }, + { + "epoch": 0.684, + "learning_rate": 1.998440764181981e-05, + "loss": 0.5528, + "step": 1710 + }, + { + "epoch": 0.6848, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.282, + "step": 1712 + }, + { + "epoch": 0.6856, + "learning_rate": 1.998736956606018e-05, + "loss": 0.3542, + "step": 1714 + }, + { + "epoch": 0.6864, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.4115, + "step": 1716 + }, + { + "epoch": 0.6872, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.5023, + "step": 1718 + }, + { + "epoch": 0.688, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.2131, + "step": 1720 + }, + { + "epoch": 0.6888, + "learning_rate": 1.999235873152047e-05, + "loss": 0.4659, + "step": 1722 + }, + { + "epoch": 0.6896, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.1468, + "step": 1724 + }, + { + "epoch": 0.6904, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.7366, + "step": 1726 + }, + { + "epoch": 0.6912, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.337, + "step": 1728 + }, + { + "epoch": 0.692, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.4347, + "step": 1730 + }, + { + "epoch": 0.6928, + "learning_rate": 1.9996841892833e-05, + "loss": 0.2232, + "step": 1732 + }, + { + "epoch": 0.6936, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.1627, + "step": 1734 + }, + { + "epoch": 0.6944, + "learning_rate": 1.999808950037968e-05, + "loss": 0.2049, + "step": 1736 + }, + { + "epoch": 0.6952, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.5515, + "step": 1738 + }, + { + "epoch": 0.696, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.2638, + "step": 1740 + }, + { + "epoch": 0.6968, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.6832, + "step": 1742 + }, + { + "epoch": 0.6976, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.1664, + "step": 1744 + }, + { + "epoch": 0.6984, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.9, + "step": 1746 + }, + { + "epoch": 0.6992, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.4565, + "step": 1748 + }, + { + "epoch": 0.7, + "learning_rate": 2e-05, + "loss": 1.0892, + "step": 1750 + }, + { + "epoch": 0.7008, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.7489, + "step": 1752 + }, + { + "epoch": 0.7016, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.3249, + "step": 1754 + }, + { + "epoch": 0.7024, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.3105, + "step": 1756 + }, + { + "epoch": 0.7032, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.2714, + "step": 1758 + }, + { + "epoch": 0.704, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.6899, + "step": 1760 + }, + { + "epoch": 0.7048, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.7268, + "step": 1762 + }, + { + "epoch": 0.7056, + "learning_rate": 1.999808950037968e-05, + "loss": 0.4576, + "step": 1764 + }, + { + "epoch": 0.7064, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.7163, + "step": 1766 + }, + { + "epoch": 0.7072, + "learning_rate": 1.9996841892833e-05, + "loss": 0.7157, + "step": 1768 + }, + { + "epoch": 0.708, + "learning_rate": 1.9996101150403547e-05, + "loss": 0.291, + "step": 1770 + }, + { + "epoch": 0.7088, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.1539, + "step": 1772 + }, + { + "epoch": 0.7096, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.3853, + "step": 1774 + }, + { + "epoch": 0.7104, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.1681, + "step": 1776 + }, + { + "epoch": 0.7112, + "learning_rate": 1.999235873152047e-05, + "loss": 0.2608, + "step": 1778 + }, + { + "epoch": 0.712, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.1743, + "step": 1780 + }, + { + "epoch": 0.7128, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.3511, + "step": 1782 + }, + { + "epoch": 0.7136, + "learning_rate": 1.9988733708531772e-05, + "loss": 1.0645, + "step": 1784 + }, + { + "epoch": 0.7144, + "learning_rate": 1.998736956606018e-05, + "loss": 0.5579, + "step": 1786 + }, + { + "epoch": 0.7152, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.6372, + "step": 1788 + }, + { + "epoch": 0.716, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.4025, + "step": 1790 + }, + { + "epoch": 0.7168, + "learning_rate": 1.998280988314872e-05, + "loss": 1.4624, + "step": 1792 + }, + { + "epoch": 0.7176, + "learning_rate": 1.9981134276520828e-05, + "loss": 1.1388, + "step": 1794 + }, + { + "epoch": 0.7184, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.2093, + "step": 1796 + }, + { + "epoch": 0.7192, + "learning_rate": 1.9977549572268467e-05, + "loss": 0.5385, + "step": 1798 + }, + { + "epoch": 0.72, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.4109, + "step": 1800 + }, + { + "epoch": 0.7208, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.8722, + "step": 1802 + }, + { + "epoch": 0.7216, + "learning_rate": 1.997158900260614e-05, + "loss": 0.6299, + "step": 1804 + }, + { + "epoch": 0.7224, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.1349, + "step": 1806 + }, + { + "epoch": 0.7232, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.066, + "step": 1808 + }, + { + "epoch": 0.724, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.575, + "step": 1810 + }, + { + "epoch": 0.7248, + "learning_rate": 1.996255301507125e-05, + "loss": 0.5848, + "step": 1812 + }, + { + "epoch": 0.7256, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.1976, + "step": 1814 + }, + { + "epoch": 0.7264, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.2437, + "step": 1816 + }, + { + "epoch": 0.7272, + "learning_rate": 1.995496021999177e-05, + "loss": 0.4663, + "step": 1818 + }, + { + "epoch": 0.728, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.3642, + "step": 1820 + }, + { + "epoch": 0.7288, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.8091, + "step": 1822 + }, + { + "epoch": 0.7296, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.3783, + "step": 1824 + }, + { + "epoch": 0.7304, + "learning_rate": 1.994374976712348e-05, + "loss": 0.4386, + "step": 1826 + }, + { + "epoch": 0.7312, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.2616, + "step": 1828 + }, + { + "epoch": 0.732, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.5381, + "step": 1830 + }, + { + "epoch": 0.7328, + "learning_rate": 1.993452764783328e-05, + "loss": 0.1681, + "step": 1832 + }, + { + "epoch": 0.7336, + "learning_rate": 1.9931298632618352e-05, + "loss": 0.49, + "step": 1834 + }, + { + "epoch": 0.7344, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.3654, + "step": 1836 + }, + { + "epoch": 0.7352, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.3099, + "step": 1838 + }, + { + "epoch": 0.736, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.1701, + "step": 1840 + }, + { + "epoch": 0.7368, + "learning_rate": 1.9917608370004417e-05, + "loss": 1.1036, + "step": 1842 + }, + { + "epoch": 0.7376, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.2086, + "step": 1844 + }, + { + "epoch": 0.7384, + "learning_rate": 1.9910299093414932e-05, + "loss": 0.2771, + "step": 1846 + }, + { + "epoch": 0.7392, + "learning_rate": 1.990652851696501e-05, + "loss": 0.2752, + "step": 1848 + }, + { + "epoch": 0.74, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.2388, + "step": 1850 + }, + { + "epoch": 0.7408, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.482, + "step": 1852 + }, + { + "epoch": 0.7416, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.0664, + "step": 1854 + }, + { + "epoch": 0.7424, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.7385, + "step": 1856 + }, + { + "epoch": 0.7432, + "learning_rate": 1.9886517447379143e-05, + "loss": 0.4241, + "step": 1858 + }, + { + "epoch": 0.744, + "learning_rate": 1.988228381446553e-05, + "loss": 0.7578, + "step": 1860 + }, + { + "epoch": 0.7448, + "learning_rate": 1.987797311751759e-05, + "loss": 0.1984, + "step": 1862 + }, + { + "epoch": 0.7456, + "learning_rate": 1.9873585390151007e-05, + "loss": 0.7002, + "step": 1864 + }, + { + "epoch": 0.7464, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.1967, + "step": 1866 + }, + { + "epoch": 0.7472, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.7714, + "step": 1868 + }, + { + "epoch": 0.748, + "learning_rate": 1.985996037070505e-05, + "loss": 0.3367, + "step": 1870 + }, + { + "epoch": 0.7488, + "learning_rate": 1.985526486983063e-05, + "loss": 0.3797, + "step": 1872 + }, + { + "epoch": 0.7496, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.5128, + "step": 1874 + }, + { + "epoch": 0.7504, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.6135, + "step": 1876 + }, + { + "epoch": 0.7512, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.4172, + "step": 1878 + }, + { + "epoch": 0.752, + "learning_rate": 1.983571470813386e-05, + "loss": 0.5313, + "step": 1880 + }, + { + "epoch": 0.7528, + "learning_rate": 1.983063531873016e-05, + "loss": 0.3186, + "step": 1882 + }, + { + "epoch": 0.7536, + "learning_rate": 1.982547926805747e-05, + "loss": 0.8126, + "step": 1884 + }, + { + "epoch": 0.7544, + "learning_rate": 1.9820246596323724e-05, + "loss": 0.7088, + "step": 1886 + }, + { + "epoch": 0.7552, + "learning_rate": 1.981493734433433e-05, + "loss": 1.1173, + "step": 1888 + }, + { + "epoch": 0.756, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.2701, + "step": 1890 + }, + { + "epoch": 0.7568, + "learning_rate": 1.9804089265795963e-05, + "loss": 0.585, + "step": 1892 + }, + { + "epoch": 0.7576, + "learning_rate": 1.979855052384247e-05, + "loss": 0.2232, + "step": 1894 + }, + { + "epoch": 0.7584, + "learning_rate": 1.979293537082368e-05, + "loss": 2.1961, + "step": 1896 + }, + { + "epoch": 0.7592, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.4117, + "step": 1898 + }, + { + "epoch": 0.76, + "learning_rate": 1.978147600733806e-05, + "loss": 0.2534, + "step": 1900 + }, + { + "epoch": 0.7608, + "learning_rate": 1.977563188623365e-05, + "loss": 0.5024, + "step": 1902 + }, + { + "epoch": 0.7616, + "learning_rate": 1.9769711532788086e-05, + "loss": 0.4275, + "step": 1904 + }, + { + "epoch": 0.7624, + "learning_rate": 1.9763714993169448e-05, + "loss": 0.7152, + "step": 1906 + }, + { + "epoch": 0.7632, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.5625, + "step": 1908 + }, + { + "epoch": 0.764, + "learning_rate": 1.9751493543055638e-05, + "loss": 0.3726, + "step": 1910 + }, + { + "epoch": 0.7648, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.1908, + "step": 1912 + }, + { + "epoch": 0.7656, + "learning_rate": 1.973896791711276e-05, + "loss": 0.3683, + "step": 1914 + }, + { + "epoch": 0.7664, + "learning_rate": 1.9732591159931567e-05, + "loss": 0.2123, + "step": 1916 + }, + { + "epoch": 0.7672, + "learning_rate": 1.972613850604944e-05, + "loss": 0.7411, + "step": 1918 + }, + { + "epoch": 0.768, + "learning_rate": 1.9719610005785463e-05, + "loss": 0.0932, + "step": 1920 + }, + { + "epoch": 0.7688, + "learning_rate": 1.9713005710050206e-05, + "loss": 0.1633, + "step": 1922 + }, + { + "epoch": 0.7696, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.4882, + "step": 1924 + }, + { + "epoch": 0.7704, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.3703, + "step": 1926 + }, + { + "epoch": 0.7712, + "learning_rate": 1.969273856798586e-05, + "loss": 0.2894, + "step": 1928 + }, + { + "epoch": 0.772, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.4614, + "step": 1930 + }, + { + "epoch": 0.7728, + "learning_rate": 1.9678849122526195e-05, + "loss": 0.3397, + "step": 1932 + }, + { + "epoch": 0.7736, + "learning_rate": 1.967179115615633e-05, + "loss": 0.1496, + "step": 1934 + }, + { + "epoch": 0.7744, + "learning_rate": 1.966465776721618e-05, + "loss": 0.5711, + "step": 1936 + }, + { + "epoch": 0.7752, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.477, + "step": 1938 + }, + { + "epoch": 0.776, + "learning_rate": 1.965016494472312e-05, + "loss": 0.6088, + "step": 1940 + }, + { + "epoch": 0.7768, + "learning_rate": 1.964280562418815e-05, + "loss": 0.3664, + "step": 1942 + }, + { + "epoch": 0.7776, + "learning_rate": 1.963537110711789e-05, + "loss": 0.1077, + "step": 1944 + }, + { + "epoch": 0.7784, + "learning_rate": 1.9627861451488194e-05, + "loss": 0.7762, + "step": 1946 + }, + { + "epoch": 0.7792, + "learning_rate": 1.962027671586086e-05, + "loss": 0.6725, + "step": 1948 + }, + { + "epoch": 0.78, + "learning_rate": 1.9612616959383194e-05, + "loss": 0.4897, + "step": 1950 + }, + { + "epoch": 0.7808, + "learning_rate": 1.96048822417875e-05, + "loss": 0.6242, + "step": 1952 + }, + { + "epoch": 0.7816, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.5445, + "step": 1954 + }, + { + "epoch": 0.7824, + "learning_rate": 1.9589188165093666e-05, + "loss": 0.6345, + "step": 1956 + }, + { + "epoch": 0.7832, + "learning_rate": 1.95812289283811e-05, + "loss": 0.1542, + "step": 1958 + }, + { + "epoch": 0.784, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.2684, + "step": 1960 + }, + { + "epoch": 0.7848, + "learning_rate": 1.9565086368562784e-05, + "loss": 0.0992, + "step": 1962 + }, + { + "epoch": 0.7856, + "learning_rate": 1.9556903171339966e-05, + "loss": 0.2203, + "step": 1964 + }, + { + "epoch": 0.7864, + "learning_rate": 1.954864544746643e-05, + "loss": 0.3223, + "step": 1966 + }, + { + "epoch": 0.7872, + "learning_rate": 1.9540313261337585e-05, + "loss": 0.3121, + "step": 1968 + }, + { + "epoch": 0.788, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.4199, + "step": 1970 + }, + { + "epoch": 0.7888, + "learning_rate": 1.9523425762798335e-05, + "loss": 0.2109, + "step": 1972 + }, + { + "epoch": 0.7896, + "learning_rate": 1.9514870582080035e-05, + "loss": 0.1776, + "step": 1974 + }, + { + "epoch": 0.7904, + "learning_rate": 1.95062412024896e-05, + "loss": 0.5987, + "step": 1976 + }, + { + "epoch": 0.7912, + "learning_rate": 1.9497537691320667e-05, + "loss": 0.2741, + "step": 1978 + }, + { + "epoch": 0.792, + "learning_rate": 1.948876011644497e-05, + "loss": 0.4932, + "step": 1980 + }, + { + "epoch": 0.7928, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.2572, + "step": 1982 + }, + { + "epoch": 0.7936, + "learning_rate": 1.9470983049947443e-05, + "loss": 0.4874, + "step": 1984 + }, + { + "epoch": 0.7944, + "learning_rate": 1.9461983696954767e-05, + "loss": 0.2646, + "step": 1986 + }, + { + "epoch": 0.7952, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.2742, + "step": 1988 + }, + { + "epoch": 0.796, + "learning_rate": 1.9443763702374818e-05, + "loss": 0.5782, + "step": 1990 + }, + { + "epoch": 0.7968, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.5215, + "step": 1992 + }, + { + "epoch": 0.7976, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.3634, + "step": 1994 + }, + { + "epoch": 0.7984, + "learning_rate": 1.94158815589503e-05, + "loss": 0.3587, + "step": 1996 + }, + { + "epoch": 0.7992, + "learning_rate": 1.940644056006122e-05, + "loss": 0.2521, + "step": 1998 + }, + { + "epoch": 0.8, + "learning_rate": 1.939692620785909e-05, + "loss": 0.1523, + "step": 2000 + }, + { + "epoch": 0.8008, + "learning_rate": 1.9387338576538746e-05, + "loss": 1.2392, + "step": 2002 + }, + { + "epoch": 0.8016, + "learning_rate": 1.9377677740866464e-05, + "loss": 0.4053, + "step": 2004 + }, + { + "epoch": 0.8024, + "learning_rate": 1.936794377617938e-05, + "loss": 0.8884, + "step": 2006 + }, + { + "epoch": 0.8032, + "learning_rate": 1.9358136758384917e-05, + "loss": 0.294, + "step": 2008 + }, + { + "epoch": 0.804, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.2858, + "step": 2010 + }, + { + "epoch": 0.8048, + "learning_rate": 1.9338303869951273e-05, + "loss": 0.523, + "step": 2012 + }, + { + "epoch": 0.8056, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.2265, + "step": 2014 + }, + { + "epoch": 0.8064, + "learning_rate": 1.931817969420773e-05, + "loss": 0.4204, + "step": 2016 + }, + { + "epoch": 0.8072, + "learning_rate": 1.930800856940543e-05, + "loss": 0.4531, + "step": 2018 + }, + { + "epoch": 0.808, + "learning_rate": 1.929776485888252e-05, + "loss": 0.2061, + "step": 2020 + }, + { + "epoch": 0.8088, + "learning_rate": 1.9287448642521517e-05, + "loss": 0.803, + "step": 2022 + }, + { + "epoch": 0.8096, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.3962, + "step": 2024 + }, + { + "epoch": 0.8104, + "learning_rate": 1.9266599014641727e-05, + "loss": 0.2493, + "step": 2026 + }, + { + "epoch": 0.8112, + "learning_rate": 1.925606576571252e-05, + "loss": 0.7416, + "step": 2028 + }, + { + "epoch": 0.812, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.3659, + "step": 2030 + }, + { + "epoch": 0.8128, + "learning_rate": 1.923478280857682e-05, + "loss": 0.3262, + "step": 2032 + }, + { + "epoch": 0.8136, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.3067, + "step": 2034 + }, + { + "epoch": 0.8144, + "learning_rate": 1.9213211793237066e-05, + "loss": 0.2731, + "step": 2036 + }, + { + "epoch": 0.8152, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.3654, + "step": 2038 + }, + { + "epoch": 0.816, + "learning_rate": 1.919135339255235e-05, + "loss": 0.3996, + "step": 2040 + }, + { + "epoch": 0.8168, + "learning_rate": 1.918031663542588e-05, + "loss": 0.2801, + "step": 2042 + }, + { + "epoch": 0.8176, + "learning_rate": 1.916920828834617e-05, + "loss": 0.166, + "step": 2044 + }, + { + "epoch": 0.8184, + "learning_rate": 1.9158028437938313e-05, + "loss": 0.2529, + "step": 2046 + }, + { + "epoch": 0.8192, + "learning_rate": 1.9146777171385057e-05, + "loss": 0.2323, + "step": 2048 + }, + { + "epoch": 0.82, + "learning_rate": 1.913545457642601e-05, + "loss": 0.3422, + "step": 2050 + }, + { + "epoch": 0.8208, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.2812, + "step": 2052 + }, + { + "epoch": 0.8216, + "learning_rate": 1.911259575502963e-05, + "loss": 0.4032, + "step": 2054 + }, + { + "epoch": 0.8224, + "learning_rate": 1.910105970684996e-05, + "loss": 0.2518, + "step": 2056 + }, + { + "epoch": 0.8232, + "learning_rate": 1.908945268677849e-05, + "loss": 0.2914, + "step": 2058 + }, + { + "epoch": 0.824, + "learning_rate": 1.9077774785329085e-05, + "loss": 0.9539, + "step": 2060 + }, + { + "epoch": 0.8248, + "learning_rate": 1.9066026093568383e-05, + "loss": 0.5869, + "step": 2062 + }, + { + "epoch": 0.8256, + "learning_rate": 1.9054206703115013e-05, + "loss": 0.8948, + "step": 2064 + }, + { + "epoch": 0.8264, + "learning_rate": 1.9042316706138994e-05, + "loss": 0.1718, + "step": 2066 + }, + { + "epoch": 0.8272, + "learning_rate": 1.903035619536087e-05, + "loss": 0.365, + "step": 2068 + }, + { + "epoch": 0.828, + "learning_rate": 1.901832526405114e-05, + "loss": 0.2584, + "step": 2070 + }, + { + "epoch": 0.8288, + "learning_rate": 1.9006224006029414e-05, + "loss": 0.4179, + "step": 2072 + }, + { + "epoch": 0.8296, + "learning_rate": 1.899405251566371e-05, + "loss": 0.231, + "step": 2074 + }, + { + "epoch": 0.8304, + "learning_rate": 1.8981810887869797e-05, + "loss": 0.2505, + "step": 2076 + }, + { + "epoch": 0.8312, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.5921, + "step": 2078 + }, + { + "epoch": 0.832, + "learning_rate": 1.8957117602394133e-05, + "loss": 0.2893, + "step": 2080 + }, + { + "epoch": 0.8328, + "learning_rate": 1.8944666137275596e-05, + "loss": 0.2167, + "step": 2082 + }, + { + "epoch": 0.8336, + "learning_rate": 1.8932144919853744e-05, + "loss": 0.5062, + "step": 2084 + }, + { + "epoch": 0.8344, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.1003, + "step": 2086 + }, + { + "epoch": 0.8352, + "learning_rate": 1.890689361921507e-05, + "loss": 0.3901, + "step": 2088 + }, + { + "epoch": 0.836, + "learning_rate": 1.8894163732912986e-05, + "loss": 0.1797, + "step": 2090 + }, + { + "epoch": 0.8368, + "learning_rate": 1.8881364488135445e-05, + "loss": 0.3855, + "step": 2092 + }, + { + "epoch": 0.8376, + "learning_rate": 1.886849598469357e-05, + "loss": 0.4061, + "step": 2094 + }, + { + "epoch": 0.8384, + "learning_rate": 1.8855558322938492e-05, + "loss": 0.1171, + "step": 2096 + }, + { + "epoch": 0.8392, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.6174, + "step": 2098 + }, + { + "epoch": 0.84, + "learning_rate": 1.8829475928589265e-05, + "loss": 0.1718, + "step": 2100 + }, + { + "epoch": 0.8408, + "learning_rate": 1.8816331399390874e-05, + "loss": 0.4348, + "step": 2102 + }, + { + "epoch": 0.8416, + "learning_rate": 1.88031181186692e-05, + "loss": 0.8923, + "step": 2104 + }, + { + "epoch": 0.8424, + "learning_rate": 1.8789836189464092e-05, + "loss": 0.2211, + "step": 2106 + }, + { + "epoch": 0.8432, + "learning_rate": 1.877648571535068e-05, + "loss": 0.6141, + "step": 2108 + }, + { + "epoch": 0.844, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.5523, + "step": 2110 + }, + { + "epoch": 0.8448, + "learning_rate": 1.8749579549371387e-05, + "loss": 2.182, + "step": 2112 + }, + { + "epoch": 0.8456, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.1116, + "step": 2114 + }, + { + "epoch": 0.8464, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.9293, + "step": 2116 + }, + { + "epoch": 0.8472, + "learning_rate": 1.8708708833660748e-05, + "loss": 0.6067, + "step": 2118 + }, + { + "epoch": 0.848, + "learning_rate": 1.8694949295052198e-05, + "loss": 0.4569, + "step": 2120 + }, + { + "epoch": 0.8488, + "learning_rate": 1.868112195148239e-05, + "loss": 0.6397, + "step": 2122 + }, + { + "epoch": 0.8496, + "learning_rate": 1.866722691077977e-05, + "loss": 0.6109, + "step": 2124 + }, + { + "epoch": 0.8504, + "learning_rate": 1.8653264281300626e-05, + "loss": 0.1689, + "step": 2126 + }, + { + "epoch": 0.8512, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.5759, + "step": 2128 + }, + { + "epoch": 0.852, + "learning_rate": 1.8625136692072587e-05, + "loss": 0.188, + "step": 2130 + }, + { + "epoch": 0.8528, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.5637, + "step": 2132 + }, + { + "epoch": 0.8536, + "learning_rate": 1.8596740061174912e-05, + "loss": 0.1275, + "step": 2134 + }, + { + "epoch": 0.8544, + "learning_rate": 1.858244113157566e-05, + "loss": 0.585, + "step": 2136 + }, + { + "epoch": 0.8552, + "learning_rate": 1.8568075274376432e-05, + "loss": 0.4383, + "step": 2138 + }, + { + "epoch": 0.856, + "learning_rate": 1.8553642601605083e-05, + "loss": 0.2521, + "step": 2140 + }, + { + "epoch": 0.8568, + "learning_rate": 1.8539143225810457e-05, + "loss": 0.6251, + "step": 2142 + }, + { + "epoch": 0.8576, + "learning_rate": 1.852457726006163e-05, + "loss": 0.3309, + "step": 2144 + }, + { + "epoch": 0.8584, + "learning_rate": 1.8509944817946917e-05, + "loss": 0.5098, + "step": 2146 + }, + { + "epoch": 0.8592, + "learning_rate": 1.8495246013573064e-05, + "loss": 0.4194, + "step": 2148 + }, + { + "epoch": 0.86, + "learning_rate": 1.848048096156426e-05, + "loss": 0.411, + "step": 2150 + }, + { + "epoch": 0.8608, + "learning_rate": 1.8465649777061387e-05, + "loss": 0.602, + "step": 2152 + }, + { + "epoch": 0.8616, + "learning_rate": 1.8450752575720967e-05, + "loss": 0.2657, + "step": 2154 + }, + { + "epoch": 0.8624, + "learning_rate": 1.843578947371439e-05, + "loss": 0.5243, + "step": 2156 + }, + { + "epoch": 0.8632, + "learning_rate": 1.8420760587726935e-05, + "loss": 0.522, + "step": 2158 + }, + { + "epoch": 0.864, + "learning_rate": 1.8405666034956846e-05, + "loss": 0.2699, + "step": 2160 + }, + { + "epoch": 0.8648, + "learning_rate": 1.8390505933114507e-05, + "loss": 0.3282, + "step": 2162 + }, + { + "epoch": 0.8656, + "learning_rate": 1.8375280400421414e-05, + "loss": 0.1787, + "step": 2164 + }, + { + "epoch": 0.8664, + "learning_rate": 1.8359989555609365e-05, + "loss": 0.1887, + "step": 2166 + }, + { + "epoch": 0.8672, + "learning_rate": 1.834463351791939e-05, + "loss": 0.2981, + "step": 2168 + }, + { + "epoch": 0.868, + "learning_rate": 1.8329212407101006e-05, + "loss": 0.3937, + "step": 2170 + }, + { + "epoch": 0.8688, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.2183, + "step": 2172 + }, + { + "epoch": 0.8696, + "learning_rate": 1.82981754476131e-05, + "loss": 0.4952, + "step": 2174 + }, + { + "epoch": 0.8704, + "learning_rate": 1.8282559840976053e-05, + "loss": 0.315, + "step": 2176 + }, + { + "epoch": 0.8712, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.2321, + "step": 2178 + }, + { + "epoch": 0.872, + "learning_rate": 1.8251134982782966e-05, + "loss": 0.4357, + "step": 2180 + }, + { + "epoch": 0.8728, + "learning_rate": 1.823532597628428e-05, + "loss": 0.2251, + "step": 2182 + }, + { + "epoch": 0.8736, + "learning_rate": 1.8219452749059336e-05, + "loss": 0.3756, + "step": 2184 + }, + { + "epoch": 0.8744, + "learning_rate": 1.8203515424890734e-05, + "loss": 0.4731, + "step": 2186 + }, + { + "epoch": 0.8752, + "learning_rate": 1.8187514128060956e-05, + "loss": 0.4636, + "step": 2188 + }, + { + "epoch": 0.876, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.4501, + "step": 2190 + }, + { + "epoch": 0.8768, + "learning_rate": 1.8155320116040983e-05, + "loss": 0.2686, + "step": 2192 + }, + { + "epoch": 0.8776, + "learning_rate": 1.8139127651906193e-05, + "loss": 0.378, + "step": 2194 + }, + { + "epoch": 0.8784, + "learning_rate": 1.8122871717218974e-05, + "loss": 0.3205, + "step": 2196 + }, + { + "epoch": 0.8792, + "learning_rate": 1.8106552438746413e-05, + "loss": 0.4097, + "step": 2198 + }, + { + "epoch": 0.88, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.574, + "step": 2200 + }, + { + "epoch": 0.8808, + "learning_rate": 1.807372435998219e-05, + "loss": 0.3735, + "step": 2202 + }, + { + "epoch": 0.8816, + "learning_rate": 1.8057215815690487e-05, + "loss": 1.1131, + "step": 2204 + }, + { + "epoch": 0.8824, + "learning_rate": 1.8040644439611355e-05, + "loss": 0.2605, + "step": 2206 + }, + { + "epoch": 0.8832, + "learning_rate": 1.8024010360971665e-05, + "loss": 0.318, + "step": 2208 + }, + { + "epoch": 0.884, + "learning_rate": 1.8007313709487345e-05, + "loss": 0.3736, + "step": 2210 + }, + { + "epoch": 0.8848, + "learning_rate": 1.7990554615362207e-05, + "loss": 0.0743, + "step": 2212 + }, + { + "epoch": 0.8856, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.3272, + "step": 2214 + }, + { + "epoch": 0.8864, + "learning_rate": 1.7956849622438568e-05, + "loss": 0.8301, + "step": 2216 + }, + { + "epoch": 0.8872, + "learning_rate": 1.7939903986478357e-05, + "loss": 0.7631, + "step": 2218 + }, + { + "epoch": 0.888, + "learning_rate": 1.7922896433551913e-05, + "loss": 0.2698, + "step": 2220 + }, + { + "epoch": 0.8888, + "learning_rate": 1.7905827096287525e-05, + "loss": 0.1229, + "step": 2222 + }, + { + "epoch": 0.8896, + "learning_rate": 1.7888696107795347e-05, + "loss": 0.4985, + "step": 2224 + }, + { + "epoch": 0.8904, + "learning_rate": 1.787150360166623e-05, + "loss": 0.6542, + "step": 2226 + }, + { + "epoch": 0.8912, + "learning_rate": 1.7854249711970826e-05, + "loss": 0.509, + "step": 2228 + }, + { + "epoch": 0.892, + "learning_rate": 1.783693457325841e-05, + "loss": 0.2181, + "step": 2230 + }, + { + "epoch": 0.8928, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.7396, + "step": 2232 + }, + { + "epoch": 0.8936, + "learning_rate": 1.780212108936685e-05, + "loss": 1.0151, + "step": 2234 + }, + { + "epoch": 0.8944, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.6005, + "step": 2236 + }, + { + "epoch": 0.8952, + "learning_rate": 1.7767064235919594e-05, + "loss": 0.5643, + "step": 2238 + }, + { + "epoch": 0.896, + "learning_rate": 1.77494448870418e-05, + "loss": 0.458, + "step": 2240 + }, + { + "epoch": 0.8968, + "learning_rate": 1.773176510643608e-05, + "loss": 0.2175, + "step": 2242 + }, + { + "epoch": 0.8976, + "learning_rate": 1.7714025031972894e-05, + "loss": 0.3, + "step": 2244 + }, + { + "epoch": 0.8984, + "learning_rate": 1.769622480199295e-05, + "loss": 0.2323, + "step": 2246 + }, + { + "epoch": 0.8992, + "learning_rate": 1.7678364555305982e-05, + "loss": 0.4655, + "step": 2248 + }, + { + "epoch": 0.9, + "learning_rate": 1.7660444431189777e-05, + "loss": 1.6231, + "step": 2250 + }, + { + "epoch": 0.9008, + "learning_rate": 1.76424645693891e-05, + "loss": 0.5873, + "step": 2252 + }, + { + "epoch": 0.9016, + "learning_rate": 1.762442511011448e-05, + "loss": 0.5142, + "step": 2254 + }, + { + "epoch": 0.9024, + "learning_rate": 1.7606326194041285e-05, + "loss": 0.1879, + "step": 2256 + }, + { + "epoch": 0.9032, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.3926, + "step": 2258 + }, + { + "epoch": 0.904, + "learning_rate": 1.756995055651757e-05, + "loss": 0.0868, + "step": 2260 + }, + { + "epoch": 0.9048, + "learning_rate": 1.7551674118731585e-05, + "loss": 0.4694, + "step": 2262 + }, + { + "epoch": 0.9056, + "learning_rate": 1.7533338791473875e-05, + "loss": 0.4617, + "step": 2264 + }, + { + "epoch": 0.9064, + "learning_rate": 1.751494471772697e-05, + "loss": 0.7479, + "step": 2266 + }, + { + "epoch": 0.9072, + "learning_rate": 1.7496492040931548e-05, + "loss": 0.4305, + "step": 2268 + }, + { + "epoch": 0.908, + "learning_rate": 1.747798090498533e-05, + "loss": 0.2598, + "step": 2270 + }, + { + "epoch": 0.9088, + "learning_rate": 1.745941145424182e-05, + "loss": 0.2899, + "step": 2272 + }, + { + "epoch": 0.9096, + "learning_rate": 1.744078383350938e-05, + "loss": 0.5119, + "step": 2274 + }, + { + "epoch": 0.9104, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.3437, + "step": 2276 + }, + { + "epoch": 0.9112, + "learning_rate": 1.7403354663577782e-05, + "loss": 0.5659, + "step": 2278 + }, + { + "epoch": 0.912, + "learning_rate": 1.738455340625883e-05, + "loss": 0.3212, + "step": 2280 + }, + { + "epoch": 0.9128, + "learning_rate": 1.7365694562709038e-05, + "loss": 0.7569, + "step": 2282 + }, + { + "epoch": 0.9136, + "learning_rate": 1.7346778279993433e-05, + "loss": 0.1909, + "step": 2284 + }, + { + "epoch": 0.9144, + "learning_rate": 1.7327804705624962e-05, + "loss": 0.4225, + "step": 2286 + }, + { + "epoch": 0.9152, + "learning_rate": 1.730877398756341e-05, + "loss": 1.0185, + "step": 2288 + }, + { + "epoch": 0.916, + "learning_rate": 1.7289686274214113e-05, + "loss": 0.1741, + "step": 2290 + }, + { + "epoch": 0.9168, + "learning_rate": 1.727054171442693e-05, + "loss": 0.5183, + "step": 2292 + }, + { + "epoch": 0.9176, + "learning_rate": 1.7251340457494934e-05, + "loss": 0.2259, + "step": 2294 + }, + { + "epoch": 0.9184, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.1786, + "step": 2296 + }, + { + "epoch": 0.9192, + "learning_rate": 1.7212768451578595e-05, + "loss": 0.5538, + "step": 2298 + }, + { + "epoch": 0.92, + "learning_rate": 1.7193398003386517e-05, + "loss": 0.6274, + "step": 2300 + }, + { + "epoch": 0.9208, + "learning_rate": 1.7173971459631803e-05, + "loss": 0.512, + "step": 2302 + }, + { + "epoch": 0.9216, + "learning_rate": 1.7154488971806525e-05, + "loss": 0.1632, + "step": 2304 + }, + { + "epoch": 0.9224, + "learning_rate": 1.713495069183907e-05, + "loss": 0.3863, + "step": 2306 + }, + { + "epoch": 0.9232, + "learning_rate": 1.7115356772092847e-05, + "loss": 0.3775, + "step": 2308 + }, + { + "epoch": 0.924, + "learning_rate": 1.709570736536522e-05, + "loss": 0.1814, + "step": 2310 + }, + { + "epoch": 0.9248, + "learning_rate": 1.7076002624886152e-05, + "loss": 0.3849, + "step": 2312 + }, + { + "epoch": 0.9256, + "learning_rate": 1.705624270431722e-05, + "loss": 0.6614, + "step": 2314 + }, + { + "epoch": 0.9264, + "learning_rate": 1.70364277577502e-05, + "loss": 0.1903, + "step": 2316 + }, + { + "epoch": 0.9272, + "learning_rate": 1.7016557939706078e-05, + "loss": 0.5449, + "step": 2318 + }, + { + "epoch": 0.928, + "learning_rate": 1.6996633405133673e-05, + "loss": 0.2385, + "step": 2320 + }, + { + "epoch": 0.9288, + "learning_rate": 1.6976654309408468e-05, + "loss": 0.5231, + "step": 2322 + }, + { + "epoch": 0.9296, + "learning_rate": 1.6956620808331515e-05, + "loss": 0.2504, + "step": 2324 + }, + { + "epoch": 0.9304, + "learning_rate": 1.6936533058128042e-05, + "loss": 0.2089, + "step": 2326 + }, + { + "epoch": 0.9312, + "learning_rate": 1.691639121544641e-05, + "loss": 0.5606, + "step": 2328 + }, + { + "epoch": 0.932, + "learning_rate": 1.6896195437356696e-05, + "loss": 0.5377, + "step": 2330 + }, + { + "epoch": 0.9328, + "learning_rate": 1.6875945881349686e-05, + "loss": 0.2831, + "step": 2332 + }, + { + "epoch": 0.9336, + "learning_rate": 1.6855642705335435e-05, + "loss": 0.1908, + "step": 2334 + }, + { + "epoch": 0.9344, + "learning_rate": 1.6835286067642228e-05, + "loss": 0.5317, + "step": 2336 + }, + { + "epoch": 0.9352, + "learning_rate": 1.681487612701521e-05, + "loss": 0.1991, + "step": 2338 + }, + { + "epoch": 0.936, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.2978, + "step": 2340 + }, + { + "epoch": 0.9368, + "learning_rate": 1.677389697401739e-05, + "loss": 0.4045, + "step": 2342 + }, + { + "epoch": 0.9376, + "learning_rate": 1.675332808121025e-05, + "loss": 0.3003, + "step": 2344 + }, + { + "epoch": 0.9384, + "learning_rate": 1.6732706524594145e-05, + "loss": 0.3849, + "step": 2346 + }, + { + "epoch": 0.9392, + "learning_rate": 1.671203246498009e-05, + "loss": 0.1792, + "step": 2348 + }, + { + "epoch": 0.94, + "learning_rate": 1.6691306063588593e-05, + "loss": 0.3505, + "step": 2350 + }, + { + "epoch": 0.9408, + "learning_rate": 1.6670527482048242e-05, + "loss": 0.1022, + "step": 2352 + }, + { + "epoch": 0.9416, + "learning_rate": 1.6649696882394635e-05, + "loss": 0.194, + "step": 2354 + }, + { + "epoch": 0.9424, + "learning_rate": 1.6628814427068968e-05, + "loss": 0.7474, + "step": 2356 + }, + { + "epoch": 0.9432, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.1155, + "step": 2358 + }, + { + "epoch": 0.944, + "learning_rate": 1.6586894601186824e-05, + "loss": 0.3948, + "step": 2360 + }, + { + "epoch": 0.9448, + "learning_rate": 1.656585755752957e-05, + "loss": 0.3781, + "step": 2362 + }, + { + "epoch": 0.9456, + "learning_rate": 1.6544769311996153e-05, + "loss": 0.5032, + "step": 2364 + }, + { + "epoch": 0.9464, + "learning_rate": 1.6523630029036924e-05, + "loss": 0.2272, + "step": 2366 + }, + { + "epoch": 0.9472, + "learning_rate": 1.6502439873500294e-05, + "loss": 0.556, + "step": 2368 + }, + { + "epoch": 0.948, + "learning_rate": 1.6481199010631305e-05, + "loss": 0.5924, + "step": 2370 + }, + { + "epoch": 0.9488, + "learning_rate": 1.645990760607052e-05, + "loss": 0.1125, + "step": 2372 + }, + { + "epoch": 0.9496, + "learning_rate": 1.643856582585255e-05, + "loss": 0.23, + "step": 2374 + }, + { + "epoch": 0.9504, + "learning_rate": 1.641717383640488e-05, + "loss": 0.1262, + "step": 2376 + }, + { + "epoch": 0.9512, + "learning_rate": 1.6395731804546596e-05, + "loss": 0.0727, + "step": 2378 + }, + { + "epoch": 0.952, + "learning_rate": 1.63742398974869e-05, + "loss": 0.8852, + "step": 2380 + }, + { + "epoch": 0.9528, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.3532, + "step": 2382 + }, + { + "epoch": 0.9536, + "learning_rate": 1.633110712854385e-05, + "loss": 0.3234, + "step": 2384 + }, + { + "epoch": 0.9544, + "learning_rate": 1.6309466603018504e-05, + "loss": 0.1877, + "step": 2386 + }, + { + "epoch": 0.9552, + "learning_rate": 1.6287776875005148e-05, + "loss": 0.1109, + "step": 2388 + }, + { + "epoch": 0.956, + "learning_rate": 1.6266038113644612e-05, + "loss": 0.5913, + "step": 2390 + }, + { + "epoch": 0.9568, + "learning_rate": 1.624425048846017e-05, + "loss": 0.8445, + "step": 2392 + }, + { + "epoch": 0.9576, + "learning_rate": 1.6222414169356063e-05, + "loss": 0.1008, + "step": 2394 + }, + { + "epoch": 0.9584, + "learning_rate": 1.6200529326616343e-05, + "loss": 0.2016, + "step": 2396 + }, + { + "epoch": 0.9592, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.992, + "step": 2398 + }, + { + "epoch": 0.96, + "learning_rate": 1.6156614753256587e-05, + "loss": 0.3245, + "step": 2400 + }, + { + "epoch": 0.9608, + "learning_rate": 1.613458536509123e-05, + "loss": 0.3623, + "step": 2402 + }, + { + "epoch": 0.9616, + "learning_rate": 1.6112508138196922e-05, + "loss": 0.5023, + "step": 2404 + }, + { + "epoch": 0.9624, + "learning_rate": 1.6090383244736277e-05, + "loss": 0.5802, + "step": 2406 + }, + { + "epoch": 0.9632, + "learning_rate": 1.606821085724363e-05, + "loss": 0.2166, + "step": 2408 + }, + { + "epoch": 0.964, + "learning_rate": 1.6045991148623756e-05, + "loss": 0.2652, + "step": 2410 + }, + { + "epoch": 0.9648, + "learning_rate": 1.602372429215038e-05, + "loss": 0.4155, + "step": 2412 + }, + { + "epoch": 0.9656, + "learning_rate": 1.600141046146497e-05, + "loss": 0.2546, + "step": 2414 + }, + { + "epoch": 0.9664, + "learning_rate": 1.597904983057519e-05, + "loss": 0.7841, + "step": 2416 + }, + { + "epoch": 0.9672, + "learning_rate": 1.5956642573853794e-05, + "loss": 0.3981, + "step": 2418 + }, + { + "epoch": 0.968, + "learning_rate": 1.5934188866037014e-05, + "loss": 0.2034, + "step": 2420 + }, + { + "epoch": 0.9688, + "learning_rate": 1.591168888222342e-05, + "loss": 0.4389, + "step": 2422 + }, + { + "epoch": 0.9696, + "learning_rate": 1.5889142797872407e-05, + "loss": 0.5375, + "step": 2424 + }, + { + "epoch": 0.9704, + "learning_rate": 1.5866550788802818e-05, + "loss": 0.2904, + "step": 2426 + }, + { + "epoch": 0.9712, + "learning_rate": 1.584391303119173e-05, + "loss": 0.306, + "step": 2428 + }, + { + "epoch": 0.972, + "learning_rate": 1.582122970157289e-05, + "loss": 0.4744, + "step": 2430 + }, + { + "epoch": 0.9728, + "learning_rate": 1.5798500976835503e-05, + "loss": 0.3711, + "step": 2432 + }, + { + "epoch": 0.9736, + "learning_rate": 1.577572703422267e-05, + "loss": 0.1888, + "step": 2434 + }, + { + "epoch": 0.9744, + "learning_rate": 1.575290805133024e-05, + "loss": 0.1716, + "step": 2436 + }, + { + "epoch": 0.9752, + "learning_rate": 1.5730044206105156e-05, + "loss": 0.3307, + "step": 2438 + }, + { + "epoch": 0.976, + "learning_rate": 1.570713567684432e-05, + "loss": 0.1467, + "step": 2440 + }, + { + "epoch": 0.9768, + "learning_rate": 1.5684182642193047e-05, + "loss": 0.7858, + "step": 2442 + }, + { + "epoch": 0.9776, + "learning_rate": 1.566118528114367e-05, + "loss": 0.1853, + "step": 2444 + }, + { + "epoch": 0.9784, + "learning_rate": 1.563814377303429e-05, + "loss": 0.6537, + "step": 2446 + }, + { + "epoch": 0.9792, + "learning_rate": 1.561505829754715e-05, + "loss": 0.3108, + "step": 2448 + }, + { + "epoch": 0.98, + "learning_rate": 1.5591929034707475e-05, + "loss": 0.3119, + "step": 2450 + }, + { + "epoch": 0.9808, + "learning_rate": 1.5568756164881874e-05, + "loss": 0.5385, + "step": 2452 + }, + { + "epoch": 0.9816, + "learning_rate": 1.5545539868777085e-05, + "loss": 0.3152, + "step": 2454 + }, + { + "epoch": 0.9824, + "learning_rate": 1.5522280327438384e-05, + "loss": 0.4374, + "step": 2456 + }, + { + "epoch": 0.9832, + "learning_rate": 1.5498977722248398e-05, + "loss": 0.3415, + "step": 2458 + }, + { + "epoch": 0.984, + "learning_rate": 1.547563223492552e-05, + "loss": 0.1873, + "step": 2460 + }, + { + "epoch": 0.9848, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.5168, + "step": 2462 + }, + { + "epoch": 0.9856, + "learning_rate": 1.5428813342425194e-05, + "loss": 0.4948, + "step": 2464 + }, + { + "epoch": 0.9864, + "learning_rate": 1.5405340302350876e-05, + "loss": 1.5774, + "step": 2466 + }, + { + "epoch": 0.9872, + "learning_rate": 1.538182511034708e-05, + "loss": 0.2163, + "step": 2468 + }, + { + "epoch": 0.988, + "learning_rate": 1.535826794978996e-05, + "loss": 0.0688, + "step": 2470 + }, + { + "epoch": 0.9888, + "learning_rate": 1.5334669004383036e-05, + "loss": 0.6284, + "step": 2472 + }, + { + "epoch": 0.9896, + "learning_rate": 1.5311028458155564e-05, + "loss": 0.2339, + "step": 2474 + }, + { + "epoch": 0.9904, + "learning_rate": 1.528734649546133e-05, + "loss": 0.187, + "step": 2476 + }, + { + "epoch": 0.9912, + "learning_rate": 1.5263623300976997e-05, + "loss": 0.7126, + "step": 2478 + }, + { + "epoch": 0.992, + "learning_rate": 1.5239859059700792e-05, + "loss": 0.3825, + "step": 2480 + }, + { + "epoch": 0.9928, + "learning_rate": 1.5216053956951096e-05, + "loss": 0.2037, + "step": 2482 + }, + { + "epoch": 0.9936, + "learning_rate": 1.5192208178364819e-05, + "loss": 0.4225, + "step": 2484 + }, + { + "epoch": 0.9944, + "learning_rate": 1.5168321909896176e-05, + "loss": 0.1019, + "step": 2486 + }, + { + "epoch": 0.9952, + "learning_rate": 1.5144395337815057e-05, + "loss": 0.1317, + "step": 2488 + }, + { + "epoch": 0.996, + "learning_rate": 1.5120428648705722e-05, + "loss": 0.2469, + "step": 2490 + }, + { + "epoch": 0.9968, + "learning_rate": 1.5096422029465171e-05, + "loss": 0.7894, + "step": 2492 + }, + { + "epoch": 0.9976, + "learning_rate": 1.5072375667301904e-05, + "loss": 0.2355, + "step": 2494 + }, + { + "epoch": 0.9984, + "learning_rate": 1.5048289749734231e-05, + "loss": 0.3076, + "step": 2496 + }, + { + "epoch": 0.9992, + "learning_rate": 1.502416446458898e-05, + "loss": 0.1786, + "step": 2498 + }, + { + "epoch": 1.0, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.723, + "step": 2500 + }, + { + "epoch": 1.0, + "step": 2500, + "total_flos": 0, + "train_loss": 0.44787728500664237, + "train_runtime": 11238.0661, + "train_samples_per_second": 3.559, + "train_steps_per_second": 0.222 + } + ], + "logging_steps": 2, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7c94698df4a11b82413109c3e1f448ae71ef87f --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72ebecec349a27f5d9ac29c4ac6cd9f8b1b5be093f47c11b4d315640255aa8e5 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..52aeefe337bc0000811695472b931d8df123aa4f --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22520cb17d6c925dab8e964834d23f293caa5a35178e6662478746654ddbfb79 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..e711493161ba03f9e48565d3d743b3447ac25488 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51a26c1ce62fb21b0282562126aeb04c9b38034f1edb8c986495586ad7cdbc02 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..32c550d622dd1ddc0989c9e2059cd61612115f55 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d97a098bfab53d95e50a7cdfb2da5cbec3a95da1fa3833ff2ab3fb9bdf910e0 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6c827283b44ca682462de986d8091e9d6e681cfa --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,8782 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 7.428676605224609, + "learning_rate": 2.357535430610912e-06, + "loss": 0.8135, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 4.688574314117432, + "learning_rate": 2.3755748898855234e-06, + "loss": 0.4197, + "step": 4 + }, + { + "epoch": 0.0024, + "grad_norm": 6.85806941986084, + "learning_rate": 2.3936738059587174e-06, + "loss": 0.5889, + "step": 6 + }, + { + "epoch": 0.0032, + "grad_norm": 4.939311981201172, + "learning_rate": 2.411832037691545e-06, + "loss": 1.7777, + "step": 8 + }, + { + "epoch": 0.004, + "grad_norm": 6.581257343292236, + "learning_rate": 2.430049443482434e-06, + "loss": 0.7227, + "step": 10 + }, + { + "epoch": 0.0048, + "grad_norm": 5.2789812088012695, + "learning_rate": 2.448325881268406e-06, + "loss": 0.6827, + "step": 12 + }, + { + "epoch": 0.0056, + "grad_norm": 8.367552757263184, + "learning_rate": 2.4666612085261277e-06, + "loss": 0.7583, + "step": 14 + }, + { + "epoch": 0.0064, + "grad_norm": 6.309530735015869, + "learning_rate": 2.4850552822730346e-06, + "loss": 0.5772, + "step": 16 + }, + { + "epoch": 0.0072, + "grad_norm": 11.15611457824707, + "learning_rate": 2.503507959068455e-06, + "loss": 0.5898, + "step": 18 + }, + { + "epoch": 0.008, + "grad_norm": 2.9229376316070557, + "learning_rate": 2.522019095014686e-06, + "loss": 0.1423, + "step": 20 + }, + { + "epoch": 0.0088, + "grad_norm": 13.48912525177002, + "learning_rate": 2.5405885457581814e-06, + "loss": 0.9523, + "step": 22 + }, + { + "epoch": 0.0096, + "grad_norm": 7.22756290435791, + "learning_rate": 2.5592161664906243e-06, + "loss": 0.329, + "step": 24 + }, + { + "epoch": 0.0104, + "grad_norm": 3.864868402481079, + "learning_rate": 2.5779018119501086e-06, + "loss": 0.3043, + "step": 26 + }, + { + "epoch": 0.0112, + "grad_norm": 7.621895790100098, + "learning_rate": 2.596645336422219e-06, + "loss": 0.3123, + "step": 28 + }, + { + "epoch": 0.012, + "grad_norm": 6.358210563659668, + "learning_rate": 2.615446593741161e-06, + "loss": 0.5717, + "step": 30 + }, + { + "epoch": 0.0128, + "grad_norm": 2.5341553688049316, + "learning_rate": 2.6343054372909648e-06, + "loss": 0.3356, + "step": 32 + }, + { + "epoch": 0.0136, + "grad_norm": 7.333450794219971, + "learning_rate": 2.6532217200065826e-06, + "loss": 0.4261, + "step": 34 + }, + { + "epoch": 0.0144, + "grad_norm": 9.422982215881348, + "learning_rate": 2.6721952943750396e-06, + "loss": 0.7553, + "step": 36 + }, + { + "epoch": 0.0152, + "grad_norm": 7.869431972503662, + "learning_rate": 2.691226012436604e-06, + "loss": 0.4286, + "step": 38 + }, + { + "epoch": 0.016, + "grad_norm": 5.751649379730225, + "learning_rate": 2.7103137257858893e-06, + "loss": 0.5196, + "step": 40 + }, + { + "epoch": 0.0168, + "grad_norm": 9.487853050231934, + "learning_rate": 2.7294582855730733e-06, + "loss": 0.3418, + "step": 42 + }, + { + "epoch": 0.0176, + "grad_norm": 0.8265520334243774, + "learning_rate": 2.7486595425050566e-06, + "loss": 0.36, + "step": 44 + }, + { + "epoch": 0.0184, + "grad_norm": 6.352355003356934, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.6908, + "step": 46 + }, + { + "epoch": 0.0192, + "grad_norm": 6.220467567443848, + "learning_rate": 2.7872315484213954e-06, + "loss": 0.5355, + "step": 48 + }, + { + "epoch": 0.02, + "grad_norm": 8.542949676513672, + "learning_rate": 2.8066019966134873e-06, + "loss": 0.3584, + "step": 50 + }, + { + "epoch": 0.0208, + "grad_norm": 12.532072067260742, + "learning_rate": 2.826028540368212e-06, + "loss": 0.7105, + "step": 52 + }, + { + "epoch": 0.0216, + "grad_norm": 22.506433486938477, + "learning_rate": 2.845511028193477e-06, + "loss": 1.2368, + "step": 54 + }, + { + "epoch": 0.0224, + "grad_norm": 5.032310485839844, + "learning_rate": 2.865049308160931e-06, + "loss": 0.8072, + "step": 56 + }, + { + "epoch": 0.0232, + "grad_norm": 5.905128479003906, + "learning_rate": 2.8846432279071533e-06, + "loss": 0.4286, + "step": 58 + }, + { + "epoch": 0.024, + "grad_norm": 4.69681453704834, + "learning_rate": 2.9042926346347835e-06, + "loss": 0.3218, + "step": 60 + }, + { + "epoch": 0.0248, + "grad_norm": 7.986056804656982, + "learning_rate": 2.9239973751138397e-06, + "loss": 0.2838, + "step": 62 + }, + { + "epoch": 0.0256, + "grad_norm": 7.595277309417725, + "learning_rate": 2.943757295682783e-06, + "loss": 0.8026, + "step": 64 + }, + { + "epoch": 0.0264, + "grad_norm": 8.339045524597168, + "learning_rate": 2.9635722422497983e-06, + "loss": 0.3001, + "step": 66 + }, + { + "epoch": 0.0272, + "grad_norm": 2.1323695182800293, + "learning_rate": 2.983442060293926e-06, + "loss": 0.2492, + "step": 68 + }, + { + "epoch": 0.028, + "grad_norm": 7.9373908042907715, + "learning_rate": 3.003366594866345e-06, + "loss": 0.7103, + "step": 70 + }, + { + "epoch": 0.0288, + "grad_norm": 4.134149551391602, + "learning_rate": 3.0233456905915338e-06, + "loss": 0.7687, + "step": 72 + }, + { + "epoch": 0.0296, + "grad_norm": 7.5926666259765625, + "learning_rate": 3.0433791916684885e-06, + "loss": 0.5618, + "step": 74 + }, + { + "epoch": 0.0304, + "grad_norm": 7.338794231414795, + "learning_rate": 3.0634669418719453e-06, + "loss": 0.4229, + "step": 76 + }, + { + "epoch": 0.0312, + "grad_norm": 10.961363792419434, + "learning_rate": 3.0836087845535933e-06, + "loss": 0.7771, + "step": 78 + }, + { + "epoch": 0.032, + "grad_norm": 7.643119812011719, + "learning_rate": 3.1038045626432945e-06, + "loss": 0.5454, + "step": 80 + }, + { + "epoch": 0.0328, + "grad_norm": 6.308757305145264, + "learning_rate": 3.1240541186503173e-06, + "loss": 0.337, + "step": 82 + }, + { + "epoch": 0.0336, + "grad_norm": 5.920470714569092, + "learning_rate": 3.1443572946645683e-06, + "loss": 0.6162, + "step": 84 + }, + { + "epoch": 0.0344, + "grad_norm": 4.9982733726501465, + "learning_rate": 3.164713932357776e-06, + "loss": 0.3885, + "step": 86 + }, + { + "epoch": 0.0352, + "grad_norm": 4.772004127502441, + "learning_rate": 3.1851238729848033e-06, + "loss": 0.4505, + "step": 88 + }, + { + "epoch": 0.036, + "grad_norm": 6.033514022827148, + "learning_rate": 3.205586957384834e-06, + "loss": 0.5492, + "step": 90 + }, + { + "epoch": 0.0368, + "grad_norm": 7.672890663146973, + "learning_rate": 3.2261030259826253e-06, + "loss": 0.5542, + "step": 92 + }, + { + "epoch": 0.0376, + "grad_norm": 4.560334205627441, + "learning_rate": 3.246671918789752e-06, + "loss": 0.4246, + "step": 94 + }, + { + "epoch": 0.0384, + "grad_norm": 8.157868385314941, + "learning_rate": 3.267293475405858e-06, + "loss": 0.5161, + "step": 96 + }, + { + "epoch": 0.0392, + "grad_norm": 9.780196189880371, + "learning_rate": 3.2879675350199004e-06, + "loss": 0.4372, + "step": 98 + }, + { + "epoch": 0.04, + "grad_norm": 20.107337951660156, + "learning_rate": 3.3086939364114113e-06, + "loss": 0.7497, + "step": 100 + }, + { + "epoch": 0.0408, + "grad_norm": 5.913912296295166, + "learning_rate": 3.329472517951747e-06, + "loss": 0.4863, + "step": 102 + }, + { + "epoch": 0.0416, + "grad_norm": 5.233573913574219, + "learning_rate": 3.350303117605369e-06, + "loss": 0.3886, + "step": 104 + }, + { + "epoch": 0.0424, + "grad_norm": 7.944777488708496, + "learning_rate": 3.3711855729310503e-06, + "loss": 0.7644, + "step": 106 + }, + { + "epoch": 0.0432, + "grad_norm": 7.0277485847473145, + "learning_rate": 3.3921197210832235e-06, + "loss": 0.7131, + "step": 108 + }, + { + "epoch": 0.044, + "grad_norm": 25.346858978271484, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.8946, + "step": 110 + }, + { + "epoch": 0.0448, + "grad_norm": 4.840317249298096, + "learning_rate": 3.434142442470434e-06, + "loss": 0.3861, + "step": 112 + }, + { + "epoch": 0.0456, + "grad_norm": 5.518627643585205, + "learning_rate": 3.455230688003849e-06, + "loss": 0.6604, + "step": 114 + }, + { + "epoch": 0.0464, + "grad_norm": 6.281485557556152, + "learning_rate": 3.476369970963065e-06, + "loss": 0.648, + "step": 116 + }, + { + "epoch": 0.0472, + "grad_norm": 13.487313270568848, + "learning_rate": 3.497560126499706e-06, + "loss": 0.7007, + "step": 118 + }, + { + "epoch": 0.048, + "grad_norm": 2.665107250213623, + "learning_rate": 3.5188009893686836e-06, + "loss": 0.1257, + "step": 120 + }, + { + "epoch": 0.0488, + "grad_norm": 8.823912620544434, + "learning_rate": 3.5400923939294827e-06, + "loss": 0.6357, + "step": 122 + }, + { + "epoch": 0.0496, + "grad_norm": 3.015528678894043, + "learning_rate": 3.5614341741474667e-06, + "loss": 0.1395, + "step": 124 + }, + { + "epoch": 0.0504, + "grad_norm": 3.595741033554077, + "learning_rate": 3.5828261635951177e-06, + "loss": 0.3638, + "step": 126 + }, + { + "epoch": 0.0512, + "grad_norm": 4.580526828765869, + "learning_rate": 3.604268195453421e-06, + "loss": 0.5191, + "step": 128 + }, + { + "epoch": 0.052, + "grad_norm": 7.3755292892456055, + "learning_rate": 3.6257601025130893e-06, + "loss": 0.4993, + "step": 130 + }, + { + "epoch": 0.0528, + "grad_norm": 9.659078598022461, + "learning_rate": 3.647301717175955e-06, + "loss": 0.4007, + "step": 132 + }, + { + "epoch": 0.0536, + "grad_norm": 6.970526695251465, + "learning_rate": 3.66889287145614e-06, + "loss": 0.9278, + "step": 134 + }, + { + "epoch": 0.0544, + "grad_norm": 9.35775375366211, + "learning_rate": 3.6905333969814995e-06, + "loss": 0.7499, + "step": 136 + }, + { + "epoch": 0.0552, + "grad_norm": 8.061389923095703, + "learning_rate": 3.712223124994867e-06, + "loss": 0.3931, + "step": 138 + }, + { + "epoch": 0.056, + "grad_norm": 7.523435592651367, + "learning_rate": 3.7339618863553885e-06, + "loss": 0.3639, + "step": 140 + }, + { + "epoch": 0.0568, + "grad_norm": 5.77680778503418, + "learning_rate": 3.755749511539848e-06, + "loss": 0.5739, + "step": 142 + }, + { + "epoch": 0.0576, + "grad_norm": 8.214181900024414, + "learning_rate": 3.7775858306439404e-06, + "loss": 0.4753, + "step": 144 + }, + { + "epoch": 0.0584, + "grad_norm": 6.254648685455322, + "learning_rate": 3.799470673383677e-06, + "loss": 0.3452, + "step": 146 + }, + { + "epoch": 0.0592, + "grad_norm": 8.179805755615234, + "learning_rate": 3.821403869096644e-06, + "loss": 0.4398, + "step": 148 + }, + { + "epoch": 0.06, + "grad_norm": 5.415241718292236, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.6625, + "step": 150 + }, + { + "epoch": 0.0608, + "grad_norm": 4.284675121307373, + "learning_rate": 3.865414634908756e-06, + "loss": 0.4488, + "step": 152 + }, + { + "epoch": 0.0616, + "grad_norm": 7.8885345458984375, + "learning_rate": 3.887491861803081e-06, + "loss": 0.6684, + "step": 154 + }, + { + "epoch": 0.0624, + "grad_norm": 5.000823497772217, + "learning_rate": 3.909616755263741e-06, + "loss": 0.5359, + "step": 156 + }, + { + "epoch": 0.0632, + "grad_norm": 3.3473854064941406, + "learning_rate": 3.9317891427563725e-06, + "loss": 0.403, + "step": 158 + }, + { + "epoch": 0.064, + "grad_norm": 5.74239444732666, + "learning_rate": 3.954008851376244e-06, + "loss": 0.4741, + "step": 160 + }, + { + "epoch": 0.0648, + "grad_norm": 7.925304889678955, + "learning_rate": 3.976275707849619e-06, + "loss": 0.4604, + "step": 162 + }, + { + "epoch": 0.0656, + "grad_norm": 7.1892571449279785, + "learning_rate": 3.99858953853505e-06, + "loss": 0.547, + "step": 164 + }, + { + "epoch": 0.0664, + "grad_norm": 6.141909599304199, + "learning_rate": 4.0209501694248e-06, + "loss": 0.4658, + "step": 166 + }, + { + "epoch": 0.0672, + "grad_norm": 11.796825408935547, + "learning_rate": 4.043357426146209e-06, + "loss": 0.6537, + "step": 168 + }, + { + "epoch": 0.068, + "grad_norm": 3.107785940170288, + "learning_rate": 4.065811133962987e-06, + "loss": 0.3216, + "step": 170 + }, + { + "epoch": 0.0688, + "grad_norm": 6.113830089569092, + "learning_rate": 4.08831111777658e-06, + "loss": 0.6906, + "step": 172 + }, + { + "epoch": 0.0696, + "grad_norm": 8.406272888183594, + "learning_rate": 4.110857202127611e-06, + "loss": 0.6615, + "step": 174 + }, + { + "epoch": 0.0704, + "grad_norm": 6.46627140045166, + "learning_rate": 4.133449211197183e-06, + "loss": 0.5224, + "step": 176 + }, + { + "epoch": 0.0712, + "grad_norm": 4.773953914642334, + "learning_rate": 4.156086968808274e-06, + "loss": 0.3986, + "step": 178 + }, + { + "epoch": 0.072, + "grad_norm": 2.648827314376831, + "learning_rate": 4.178770298427114e-06, + "loss": 0.3031, + "step": 180 + }, + { + "epoch": 0.0728, + "grad_norm": 5.903095722198486, + "learning_rate": 4.201499023164515e-06, + "loss": 0.4427, + "step": 182 + }, + { + "epoch": 0.0736, + "grad_norm": 11.300853729248047, + "learning_rate": 4.224272965777315e-06, + "loss": 0.8331, + "step": 184 + }, + { + "epoch": 0.0744, + "grad_norm": 4.325449466705322, + "learning_rate": 4.247091948669764e-06, + "loss": 0.3271, + "step": 186 + }, + { + "epoch": 0.0752, + "grad_norm": 8.037109375, + "learning_rate": 4.269955793894849e-06, + "loss": 0.6188, + "step": 188 + }, + { + "epoch": 0.076, + "grad_norm": 5.573611259460449, + "learning_rate": 4.292864323155684e-06, + "loss": 0.5308, + "step": 190 + }, + { + "epoch": 0.0768, + "grad_norm": 9.673019409179688, + "learning_rate": 4.3158173578069696e-06, + "loss": 0.5004, + "step": 192 + }, + { + "epoch": 0.0776, + "grad_norm": 2.4727535247802734, + "learning_rate": 4.338814718856333e-06, + "loss": 0.3358, + "step": 194 + }, + { + "epoch": 0.0784, + "grad_norm": 5.131119728088379, + "learning_rate": 4.3618562269657285e-06, + "loss": 0.2717, + "step": 196 + }, + { + "epoch": 0.0792, + "grad_norm": 18.657052993774414, + "learning_rate": 4.384941702452852e-06, + "loss": 0.8001, + "step": 198 + }, + { + "epoch": 0.08, + "grad_norm": 2.7666983604431152, + "learning_rate": 4.408070965292526e-06, + "loss": 0.4866, + "step": 200 + }, + { + "epoch": 0.0808, + "grad_norm": 3.064669609069824, + "learning_rate": 4.431243835118112e-06, + "loss": 0.3906, + "step": 202 + }, + { + "epoch": 0.0816, + "grad_norm": 12.827176094055176, + "learning_rate": 4.4544601312229185e-06, + "loss": 0.7817, + "step": 204 + }, + { + "epoch": 0.0824, + "grad_norm": 12.40218734741211, + "learning_rate": 4.477719672561602e-06, + "loss": 0.511, + "step": 206 + }, + { + "epoch": 0.0832, + "grad_norm": 7.720707416534424, + "learning_rate": 4.501022277751605e-06, + "loss": 0.396, + "step": 208 + }, + { + "epoch": 0.084, + "grad_norm": 5.669599533081055, + "learning_rate": 4.524367765074499e-06, + "loss": 0.3701, + "step": 210 + }, + { + "epoch": 0.0848, + "grad_norm": 14.275040626525879, + "learning_rate": 4.5477559524775e-06, + "loss": 0.9219, + "step": 212 + }, + { + "epoch": 0.0856, + "grad_norm": 5.309834957122803, + "learning_rate": 4.571186657574823e-06, + "loss": 0.5128, + "step": 214 + }, + { + "epoch": 0.0864, + "grad_norm": 8.355277061462402, + "learning_rate": 4.5946596976491254e-06, + "loss": 0.7039, + "step": 216 + }, + { + "epoch": 0.0872, + "grad_norm": 8.066947937011719, + "learning_rate": 4.618174889652924e-06, + "loss": 0.5883, + "step": 218 + }, + { + "epoch": 0.088, + "grad_norm": 8.851380348205566, + "learning_rate": 4.6417320502100286e-06, + "loss": 0.6141, + "step": 220 + }, + { + "epoch": 0.0888, + "grad_norm": 6.146321773529053, + "learning_rate": 4.665330995616967e-06, + "loss": 0.4213, + "step": 222 + }, + { + "epoch": 0.0896, + "grad_norm": 2.3837244510650635, + "learning_rate": 4.688971541844424e-06, + "loss": 0.372, + "step": 224 + }, + { + "epoch": 0.0904, + "grad_norm": 15.606354713439941, + "learning_rate": 4.712653504538672e-06, + "loss": 0.6691, + "step": 226 + }, + { + "epoch": 0.0912, + "grad_norm": 5.409621238708496, + "learning_rate": 4.736376699023023e-06, + "loss": 0.3749, + "step": 228 + }, + { + "epoch": 0.092, + "grad_norm": 3.3378796577453613, + "learning_rate": 4.76014094029921e-06, + "loss": 0.2424, + "step": 230 + }, + { + "epoch": 0.0928, + "grad_norm": 6.939662933349609, + "learning_rate": 4.7839460430489216e-06, + "loss": 0.2881, + "step": 232 + }, + { + "epoch": 0.0936, + "grad_norm": 11.248266220092773, + "learning_rate": 4.807791821635185e-06, + "loss": 0.6406, + "step": 234 + }, + { + "epoch": 0.0944, + "grad_norm": 14.713454246520996, + "learning_rate": 4.831678090103828e-06, + "loss": 1.1709, + "step": 236 + }, + { + "epoch": 0.0952, + "grad_norm": 3.425539016723633, + "learning_rate": 4.855604662184931e-06, + "loss": 0.4371, + "step": 238 + }, + { + "epoch": 0.096, + "grad_norm": 4.274609088897705, + "learning_rate": 4.8795713512942785e-06, + "loss": 0.2196, + "step": 240 + }, + { + "epoch": 0.0968, + "grad_norm": 5.718649387359619, + "learning_rate": 4.903577970534815e-06, + "loss": 0.3956, + "step": 242 + }, + { + "epoch": 0.0976, + "grad_norm": 5.639620780944824, + "learning_rate": 4.9276243326981e-06, + "loss": 0.4738, + "step": 244 + }, + { + "epoch": 0.0984, + "grad_norm": 6.417396545410156, + "learning_rate": 4.951710250265788e-06, + "loss": 0.7023, + "step": 246 + }, + { + "epoch": 0.0992, + "grad_norm": 5.298447608947754, + "learning_rate": 4.975835535411023e-06, + "loss": 0.5218, + "step": 248 + }, + { + "epoch": 0.1, + "grad_norm": 7.021275520324707, + "learning_rate": 5.000000000000003e-06, + "loss": 0.4748, + "step": 250 + }, + { + "epoch": 0.1008, + "grad_norm": 4.228523254394531, + "learning_rate": 5.024203455593375e-06, + "loss": 0.1665, + "step": 252 + }, + { + "epoch": 0.1016, + "grad_norm": 4.622741222381592, + "learning_rate": 5.048445713447734e-06, + "loss": 0.4185, + "step": 254 + }, + { + "epoch": 0.1024, + "grad_norm": 5.1597208976745605, + "learning_rate": 5.072726584517083e-06, + "loss": 0.336, + "step": 256 + }, + { + "epoch": 0.1032, + "grad_norm": 7.466769695281982, + "learning_rate": 5.097045879454308e-06, + "loss": 0.3572, + "step": 258 + }, + { + "epoch": 0.104, + "grad_norm": 4.210982799530029, + "learning_rate": 5.1214034086126685e-06, + "loss": 0.4357, + "step": 260 + }, + { + "epoch": 0.1048, + "grad_norm": 3.1034891605377197, + "learning_rate": 5.145798982047253e-06, + "loss": 0.4363, + "step": 262 + }, + { + "epoch": 0.1056, + "grad_norm": 5.984959602355957, + "learning_rate": 5.170232409516483e-06, + "loss": 0.4626, + "step": 264 + }, + { + "epoch": 0.1064, + "grad_norm": 17.632976531982422, + "learning_rate": 5.194703500483597e-06, + "loss": 0.4285, + "step": 266 + }, + { + "epoch": 0.1072, + "grad_norm": 7.416657447814941, + "learning_rate": 5.219212064118082e-06, + "loss": 0.3839, + "step": 268 + }, + { + "epoch": 0.108, + "grad_norm": 9.656270980834961, + "learning_rate": 5.24375790929725e-06, + "loss": 0.4548, + "step": 270 + }, + { + "epoch": 0.1088, + "grad_norm": 50.1550178527832, + "learning_rate": 5.268340844607653e-06, + "loss": 4.0707, + "step": 272 + }, + { + "epoch": 0.1096, + "grad_norm": 9.233906745910645, + "learning_rate": 5.2929606783466735e-06, + "loss": 0.662, + "step": 274 + }, + { + "epoch": 0.1104, + "grad_norm": 7.082289218902588, + "learning_rate": 5.317617218523853e-06, + "loss": 0.6863, + "step": 276 + }, + { + "epoch": 0.1112, + "grad_norm": 7.848796367645264, + "learning_rate": 5.342310272862553e-06, + "loss": 0.6412, + "step": 278 + }, + { + "epoch": 0.112, + "grad_norm": 3.84726881980896, + "learning_rate": 5.367039648801377e-06, + "loss": 0.3674, + "step": 280 + }, + { + "epoch": 0.1128, + "grad_norm": 8.89835262298584, + "learning_rate": 5.391805153495684e-06, + "loss": 0.5559, + "step": 282 + }, + { + "epoch": 0.1136, + "grad_norm": 8.333953857421875, + "learning_rate": 5.416606593819109e-06, + "loss": 0.7547, + "step": 284 + }, + { + "epoch": 0.1144, + "grad_norm": 5.093605995178223, + "learning_rate": 5.441443776365005e-06, + "loss": 0.4324, + "step": 286 + }, + { + "epoch": 0.1152, + "grad_norm": 5.2162017822265625, + "learning_rate": 5.466316507448053e-06, + "loss": 0.8642, + "step": 288 + }, + { + "epoch": 0.116, + "grad_norm": 10.420114517211914, + "learning_rate": 5.49122459310568e-06, + "loss": 0.6503, + "step": 290 + }, + { + "epoch": 0.1168, + "grad_norm": 6.556491851806641, + "learning_rate": 5.516167839099662e-06, + "loss": 0.6044, + "step": 292 + }, + { + "epoch": 0.1176, + "grad_norm": 12.74498176574707, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.7149, + "step": 294 + }, + { + "epoch": 0.1184, + "grad_norm": 7.758618354797363, + "learning_rate": 5.5661590337742255e-06, + "loss": 0.4331, + "step": 296 + }, + { + "epoch": 0.1192, + "grad_norm": 6.872960567474365, + "learning_rate": 5.591206592613412e-06, + "loss": 0.5118, + "step": 298 + }, + { + "epoch": 0.12, + "grad_norm": 4.9478302001953125, + "learning_rate": 5.616288532109221e-06, + "loss": 0.3771, + "step": 300 + }, + { + "epoch": 0.1208, + "grad_norm": 4.419280529022217, + "learning_rate": 5.641404656667652e-06, + "loss": 0.3816, + "step": 302 + }, + { + "epoch": 0.1216, + "grad_norm": 4.0377936363220215, + "learning_rate": 5.666554770428136e-06, + "loss": 0.3645, + "step": 304 + }, + { + "epoch": 0.1224, + "grad_norm": 7.112875938415527, + "learning_rate": 5.6917386772650015e-06, + "loss": 0.4655, + "step": 306 + }, + { + "epoch": 0.1232, + "grad_norm": 4.4771904945373535, + "learning_rate": 5.716956180789086e-06, + "loss": 0.4779, + "step": 308 + }, + { + "epoch": 0.124, + "grad_norm": 5.968297958374023, + "learning_rate": 5.74220708434926e-06, + "loss": 0.8389, + "step": 310 + }, + { + "epoch": 0.1248, + "grad_norm": 7.576871395111084, + "learning_rate": 5.767491191033909e-06, + "loss": 0.4136, + "step": 312 + }, + { + "epoch": 0.1256, + "grad_norm": 6.465977668762207, + "learning_rate": 5.7928083036724535e-06, + "loss": 0.3366, + "step": 314 + }, + { + "epoch": 0.1264, + "grad_norm": 8.43101692199707, + "learning_rate": 5.818158224836983e-06, + "loss": 0.4671, + "step": 316 + }, + { + "epoch": 0.1272, + "grad_norm": 6.674490928649902, + "learning_rate": 5.8435407568437194e-06, + "loss": 1.8329, + "step": 318 + }, + { + "epoch": 0.128, + "grad_norm": 4.587431907653809, + "learning_rate": 5.868955701754577e-06, + "loss": 0.339, + "step": 320 + }, + { + "epoch": 0.1288, + "grad_norm": 12.540658950805664, + "learning_rate": 5.894402861378714e-06, + "loss": 0.5991, + "step": 322 + }, + { + "epoch": 0.1296, + "grad_norm": 6.9753875732421875, + "learning_rate": 5.919882037274065e-06, + "loss": 0.4035, + "step": 324 + }, + { + "epoch": 0.1304, + "grad_norm": 4.0211663246154785, + "learning_rate": 5.9453930307488985e-06, + "loss": 0.5661, + "step": 326 + }, + { + "epoch": 0.1312, + "grad_norm": 11.671126365661621, + "learning_rate": 5.970935642863362e-06, + "loss": 0.776, + "step": 328 + }, + { + "epoch": 0.132, + "grad_norm": 4.407910346984863, + "learning_rate": 5.996509674431038e-06, + "loss": 0.4211, + "step": 330 + }, + { + "epoch": 0.1328, + "grad_norm": 3.740440607070923, + "learning_rate": 6.022114926020505e-06, + "loss": 0.3472, + "step": 332 + }, + { + "epoch": 0.1336, + "grad_norm": 4.749978542327881, + "learning_rate": 6.047751197956836e-06, + "loss": 0.2956, + "step": 334 + }, + { + "epoch": 0.1344, + "grad_norm": 3.446544647216797, + "learning_rate": 6.0734182903232475e-06, + "loss": 0.6477, + "step": 336 + }, + { + "epoch": 0.1352, + "grad_norm": 8.398823738098145, + "learning_rate": 6.0991160029626e-06, + "loss": 0.6604, + "step": 338 + }, + { + "epoch": 0.136, + "grad_norm": 3.5237298011779785, + "learning_rate": 6.124844135478966e-06, + "loss": 0.4482, + "step": 340 + }, + { + "epoch": 0.1368, + "grad_norm": 5.967245578765869, + "learning_rate": 6.1506024872392e-06, + "loss": 0.4079, + "step": 342 + }, + { + "epoch": 0.1376, + "grad_norm": 9.490840911865234, + "learning_rate": 6.176390857374501e-06, + "loss": 0.8531, + "step": 344 + }, + { + "epoch": 0.1384, + "grad_norm": 3.8971786499023438, + "learning_rate": 6.202209044781979e-06, + "loss": 0.4008, + "step": 346 + }, + { + "epoch": 0.1392, + "grad_norm": 4.254534721374512, + "learning_rate": 6.228056848126223e-06, + "loss": 0.6325, + "step": 348 + }, + { + "epoch": 0.14, + "grad_norm": 5.16749906539917, + "learning_rate": 6.253934065840883e-06, + "loss": 0.3609, + "step": 350 + }, + { + "epoch": 0.1408, + "grad_norm": 7.321227073669434, + "learning_rate": 6.279840496130188e-06, + "loss": 0.3653, + "step": 352 + }, + { + "epoch": 0.1416, + "grad_norm": 7.62925386428833, + "learning_rate": 6.305775936970606e-06, + "loss": 0.8053, + "step": 354 + }, + { + "epoch": 0.1424, + "grad_norm": 4.928267002105713, + "learning_rate": 6.331740186112359e-06, + "loss": 0.5276, + "step": 356 + }, + { + "epoch": 0.1432, + "grad_norm": 2.6523029804229736, + "learning_rate": 6.357733041081015e-06, + "loss": 0.2917, + "step": 358 + }, + { + "epoch": 0.144, + "grad_norm": 12.000823020935059, + "learning_rate": 6.383754299179072e-06, + "loss": 0.5779, + "step": 360 + }, + { + "epoch": 0.1448, + "grad_norm": 4.4840407371521, + "learning_rate": 6.409803757487532e-06, + "loss": 0.5981, + "step": 362 + }, + { + "epoch": 0.1456, + "grad_norm": 5.198264122009277, + "learning_rate": 6.435881212867485e-06, + "loss": 0.6076, + "step": 364 + }, + { + "epoch": 0.1464, + "grad_norm": 3.4151840209960938, + "learning_rate": 6.4619864619616975e-06, + "loss": 0.4114, + "step": 366 + }, + { + "epoch": 0.1472, + "grad_norm": 6.271514415740967, + "learning_rate": 6.48811930119619e-06, + "loss": 0.4758, + "step": 368 + }, + { + "epoch": 0.148, + "grad_norm": 17.935209274291992, + "learning_rate": 6.514279526781853e-06, + "loss": 0.7199, + "step": 370 + }, + { + "epoch": 0.1488, + "grad_norm": 14.56944465637207, + "learning_rate": 6.540466934715955e-06, + "loss": 0.6454, + "step": 372 + }, + { + "epoch": 0.1496, + "grad_norm": 4.990140438079834, + "learning_rate": 6.566681320783848e-06, + "loss": 0.2431, + "step": 374 + }, + { + "epoch": 0.1504, + "grad_norm": 6.392396450042725, + "learning_rate": 6.592922480560483e-06, + "loss": 0.3967, + "step": 376 + }, + { + "epoch": 0.1512, + "grad_norm": 6.542480945587158, + "learning_rate": 6.619190209412025e-06, + "loss": 0.577, + "step": 378 + }, + { + "epoch": 0.152, + "grad_norm": 6.3435893058776855, + "learning_rate": 6.6454843024974465e-06, + "loss": 0.3547, + "step": 380 + }, + { + "epoch": 0.1528, + "grad_norm": 7.85693883895874, + "learning_rate": 6.671804554770128e-06, + "loss": 0.9327, + "step": 382 + }, + { + "epoch": 0.1536, + "grad_norm": 4.3096137046813965, + "learning_rate": 6.698150760979456e-06, + "loss": 0.6549, + "step": 384 + }, + { + "epoch": 0.1544, + "grad_norm": 5.939504146575928, + "learning_rate": 6.724522715672421e-06, + "loss": 0.518, + "step": 386 + }, + { + "epoch": 0.1552, + "grad_norm": 5.392618656158447, + "learning_rate": 6.750920213195242e-06, + "loss": 0.7825, + "step": 388 + }, + { + "epoch": 0.156, + "grad_norm": 9.22570514678955, + "learning_rate": 6.777343047694894e-06, + "loss": 0.681, + "step": 390 + }, + { + "epoch": 0.1568, + "grad_norm": 5.902850151062012, + "learning_rate": 6.803791013120824e-06, + "loss": 0.3366, + "step": 392 + }, + { + "epoch": 0.1576, + "grad_norm": 6.317704200744629, + "learning_rate": 6.8302639032264836e-06, + "loss": 0.6636, + "step": 394 + }, + { + "epoch": 0.1584, + "grad_norm": 2.739799737930298, + "learning_rate": 6.856761511570944e-06, + "loss": 0.3568, + "step": 396 + }, + { + "epoch": 0.1592, + "grad_norm": 10.551313400268555, + "learning_rate": 6.883283631520579e-06, + "loss": 0.5487, + "step": 398 + }, + { + "epoch": 0.16, + "grad_norm": 8.61369800567627, + "learning_rate": 6.909830056250522e-06, + "loss": 0.5602, + "step": 400 + }, + { + "epoch": 0.1608, + "grad_norm": 8.547295570373535, + "learning_rate": 6.936400578746436e-06, + "loss": 0.6884, + "step": 402 + }, + { + "epoch": 0.1616, + "grad_norm": 16.45806312561035, + "learning_rate": 6.96299499180605e-06, + "loss": 0.7216, + "step": 404 + }, + { + "epoch": 0.1624, + "grad_norm": 3.695611000061035, + "learning_rate": 6.989613088040787e-06, + "loss": 0.3861, + "step": 406 + }, + { + "epoch": 0.1632, + "grad_norm": 6.287961959838867, + "learning_rate": 7.016254659877404e-06, + "loss": 0.4634, + "step": 408 + }, + { + "epoch": 0.164, + "grad_norm": 7.198334217071533, + "learning_rate": 7.042919499559539e-06, + "loss": 0.473, + "step": 410 + }, + { + "epoch": 0.1648, + "grad_norm": 8.600248336791992, + "learning_rate": 7.06960739914943e-06, + "loss": 0.6679, + "step": 412 + }, + { + "epoch": 0.1656, + "grad_norm": 3.197615146636963, + "learning_rate": 7.09631815052946e-06, + "loss": 0.3843, + "step": 414 + }, + { + "epoch": 0.1664, + "grad_norm": 3.0980639457702637, + "learning_rate": 7.123051545403873e-06, + "loss": 0.6683, + "step": 416 + }, + { + "epoch": 0.1672, + "grad_norm": 6.608954906463623, + "learning_rate": 7.1498073753002375e-06, + "loss": 0.7753, + "step": 418 + }, + { + "epoch": 0.168, + "grad_norm": 7.584798812866211, + "learning_rate": 7.1765854315712325e-06, + "loss": 0.5333, + "step": 420 + }, + { + "epoch": 0.1688, + "grad_norm": 6.7867960929870605, + "learning_rate": 7.203385505396197e-06, + "loss": 0.4801, + "step": 422 + }, + { + "epoch": 0.1696, + "grad_norm": 2.2217888832092285, + "learning_rate": 7.230207387782771e-06, + "loss": 0.2836, + "step": 424 + }, + { + "epoch": 0.1704, + "grad_norm": 3.6246209144592285, + "learning_rate": 7.257050869568527e-06, + "loss": 0.4577, + "step": 426 + }, + { + "epoch": 0.1712, + "grad_norm": 3.1784732341766357, + "learning_rate": 7.28391574142262e-06, + "loss": 0.5712, + "step": 428 + }, + { + "epoch": 0.172, + "grad_norm": 2.8604462146759033, + "learning_rate": 7.3108017938473485e-06, + "loss": 0.3046, + "step": 430 + }, + { + "epoch": 0.1728, + "grad_norm": 3.685370922088623, + "learning_rate": 7.337708817179875e-06, + "loss": 0.6305, + "step": 432 + }, + { + "epoch": 0.1736, + "grad_norm": 7.055061340332031, + "learning_rate": 7.36463660159386e-06, + "loss": 0.5632, + "step": 434 + }, + { + "epoch": 0.1744, + "grad_norm": 8.530369758605957, + "learning_rate": 7.39158493710103e-06, + "loss": 0.6991, + "step": 436 + }, + { + "epoch": 0.1752, + "grad_norm": 3.476318597793579, + "learning_rate": 7.418553613552822e-06, + "loss": 0.4436, + "step": 438 + }, + { + "epoch": 0.176, + "grad_norm": 2.979520559310913, + "learning_rate": 7.445542420642091e-06, + "loss": 0.3924, + "step": 440 + }, + { + "epoch": 0.1768, + "grad_norm": 6.099870681762695, + "learning_rate": 7.472551147904703e-06, + "loss": 0.4088, + "step": 442 + }, + { + "epoch": 0.1776, + "grad_norm": 8.147076606750488, + "learning_rate": 7.499579584721173e-06, + "loss": 0.4561, + "step": 444 + }, + { + "epoch": 0.1784, + "grad_norm": 6.481442451477051, + "learning_rate": 7.5266275203183395e-06, + "loss": 0.5383, + "step": 446 + }, + { + "epoch": 0.1792, + "grad_norm": 2.7729148864746094, + "learning_rate": 7.553694743770917e-06, + "loss": 0.3598, + "step": 448 + }, + { + "epoch": 0.18, + "grad_norm": 8.662606239318848, + "learning_rate": 7.580781044003312e-06, + "loss": 0.6492, + "step": 450 + }, + { + "epoch": 0.1808, + "grad_norm": 7.1027374267578125, + "learning_rate": 7.607886209791095e-06, + "loss": 0.8726, + "step": 452 + }, + { + "epoch": 0.1816, + "grad_norm": 3.737102508544922, + "learning_rate": 7.635010029762755e-06, + "loss": 0.4625, + "step": 454 + }, + { + "epoch": 0.1824, + "grad_norm": 7.406997203826904, + "learning_rate": 7.662152292401265e-06, + "loss": 0.6214, + "step": 456 + }, + { + "epoch": 0.1832, + "grad_norm": 6.555530548095703, + "learning_rate": 7.689312786045822e-06, + "loss": 0.7548, + "step": 458 + }, + { + "epoch": 0.184, + "grad_norm": 3.5676968097686768, + "learning_rate": 7.716491298893441e-06, + "loss": 0.3623, + "step": 460 + }, + { + "epoch": 0.1848, + "grad_norm": 5.894664287567139, + "learning_rate": 7.74368761900062e-06, + "loss": 0.5739, + "step": 462 + }, + { + "epoch": 0.1856, + "grad_norm": 4.684706687927246, + "learning_rate": 7.770901534284991e-06, + "loss": 0.6722, + "step": 464 + }, + { + "epoch": 0.1864, + "grad_norm": 8.258028984069824, + "learning_rate": 7.798132832526976e-06, + "loss": 0.4903, + "step": 466 + }, + { + "epoch": 0.1872, + "grad_norm": 11.913901329040527, + "learning_rate": 7.825381301371444e-06, + "loss": 0.3768, + "step": 468 + }, + { + "epoch": 0.188, + "grad_norm": 2.9942965507507324, + "learning_rate": 7.852646728329358e-06, + "loss": 0.4586, + "step": 470 + }, + { + "epoch": 0.1888, + "grad_norm": 9.535568237304688, + "learning_rate": 7.879928900779441e-06, + "loss": 0.6082, + "step": 472 + }, + { + "epoch": 0.1896, + "grad_norm": 11.259536743164062, + "learning_rate": 7.907227605969852e-06, + "loss": 0.859, + "step": 474 + }, + { + "epoch": 0.1904, + "grad_norm": 10.543761253356934, + "learning_rate": 7.934542631019767e-06, + "loss": 0.4658, + "step": 476 + }, + { + "epoch": 0.1912, + "grad_norm": 9.306256294250488, + "learning_rate": 7.961873762921153e-06, + "loss": 0.9107, + "step": 478 + }, + { + "epoch": 0.192, + "grad_norm": 3.1275129318237305, + "learning_rate": 7.989220788540351e-06, + "loss": 0.4828, + "step": 480 + }, + { + "epoch": 0.1928, + "grad_norm": 10.207925796508789, + "learning_rate": 8.016583494619764e-06, + "loss": 0.3932, + "step": 482 + }, + { + "epoch": 0.1936, + "grad_norm": 3.0926074981689453, + "learning_rate": 8.043961667779511e-06, + "loss": 0.514, + "step": 484 + }, + { + "epoch": 0.1944, + "grad_norm": 6.525674343109131, + "learning_rate": 8.071355094519103e-06, + "loss": 0.3918, + "step": 486 + }, + { + "epoch": 0.1952, + "grad_norm": 2.066398859024048, + "learning_rate": 8.098763561219089e-06, + "loss": 0.3501, + "step": 488 + }, + { + "epoch": 0.196, + "grad_norm": 4.382544040679932, + "learning_rate": 8.126186854142744e-06, + "loss": 0.4171, + "step": 490 + }, + { + "epoch": 0.1968, + "grad_norm": 2.5938968658447266, + "learning_rate": 8.153624759437718e-06, + "loss": 0.3604, + "step": 492 + }, + { + "epoch": 0.1976, + "grad_norm": 5.127084255218506, + "learning_rate": 8.181077063137735e-06, + "loss": 0.4352, + "step": 494 + }, + { + "epoch": 0.1984, + "grad_norm": 2.662642478942871, + "learning_rate": 8.208543551164178e-06, + "loss": 0.4609, + "step": 496 + }, + { + "epoch": 0.1992, + "grad_norm": 6.415817737579346, + "learning_rate": 8.236024009327877e-06, + "loss": 0.6271, + "step": 498 + }, + { + "epoch": 0.2, + "grad_norm": 4.131330490112305, + "learning_rate": 8.263518223330695e-06, + "loss": 0.5425, + "step": 500 + }, + { + "epoch": 0.2008, + "grad_norm": 4.420889377593994, + "learning_rate": 8.29102597876723e-06, + "loss": 0.5649, + "step": 502 + }, + { + "epoch": 0.2016, + "grad_norm": 5.390644073486328, + "learning_rate": 8.31854706112648e-06, + "loss": 0.6567, + "step": 504 + }, + { + "epoch": 0.2024, + "grad_norm": 7.901724338531494, + "learning_rate": 8.346081255793516e-06, + "loss": 0.4477, + "step": 506 + }, + { + "epoch": 0.2032, + "grad_norm": 3.4163174629211426, + "learning_rate": 8.373628348051156e-06, + "loss": 0.4377, + "step": 508 + }, + { + "epoch": 0.204, + "grad_norm": 5.113077163696289, + "learning_rate": 8.401188123081642e-06, + "loss": 0.3727, + "step": 510 + }, + { + "epoch": 0.2048, + "grad_norm": 2.9778358936309814, + "learning_rate": 8.428760365968329e-06, + "loss": 0.5046, + "step": 512 + }, + { + "epoch": 0.2056, + "grad_norm": 9.621271133422852, + "learning_rate": 8.456344861697293e-06, + "loss": 0.5682, + "step": 514 + }, + { + "epoch": 0.2064, + "grad_norm": 3.1639251708984375, + "learning_rate": 8.483941395159114e-06, + "loss": 0.4007, + "step": 516 + }, + { + "epoch": 0.2072, + "grad_norm": 14.570175170898438, + "learning_rate": 8.511549751150478e-06, + "loss": 0.6807, + "step": 518 + }, + { + "epoch": 0.208, + "grad_norm": 4.703779220581055, + "learning_rate": 8.539169714375883e-06, + "loss": 0.362, + "step": 520 + }, + { + "epoch": 0.2088, + "grad_norm": 8.251890182495117, + "learning_rate": 8.566801069449304e-06, + "loss": 0.669, + "step": 522 + }, + { + "epoch": 0.2096, + "grad_norm": 7.932413101196289, + "learning_rate": 8.594443600895886e-06, + "loss": 0.5107, + "step": 524 + }, + { + "epoch": 0.2104, + "grad_norm": 3.570723533630371, + "learning_rate": 8.622097093153612e-06, + "loss": 0.3977, + "step": 526 + }, + { + "epoch": 0.2112, + "grad_norm": 7.7874979972839355, + "learning_rate": 8.649761330575e-06, + "loss": 0.5925, + "step": 528 + }, + { + "epoch": 0.212, + "grad_norm": 6.07938814163208, + "learning_rate": 8.677436097428766e-06, + "loss": 0.5776, + "step": 530 + }, + { + "epoch": 0.2128, + "grad_norm": 5.536543369293213, + "learning_rate": 8.705121177901537e-06, + "loss": 0.4131, + "step": 532 + }, + { + "epoch": 0.2136, + "grad_norm": 3.3589935302734375, + "learning_rate": 8.732816356099459e-06, + "loss": 0.4839, + "step": 534 + }, + { + "epoch": 0.2144, + "grad_norm": 2.7732083797454834, + "learning_rate": 8.760521416049986e-06, + "loss": 0.2898, + "step": 536 + }, + { + "epoch": 0.2152, + "grad_norm": 3.485111713409424, + "learning_rate": 8.788236141703477e-06, + "loss": 0.2139, + "step": 538 + }, + { + "epoch": 0.216, + "grad_norm": 5.76864767074585, + "learning_rate": 8.81596031693499e-06, + "loss": 0.445, + "step": 540 + }, + { + "epoch": 0.2168, + "grad_norm": 3.4889819622039795, + "learning_rate": 8.84369372554578e-06, + "loss": 0.3836, + "step": 542 + }, + { + "epoch": 0.2176, + "grad_norm": 3.0347912311553955, + "learning_rate": 8.87143615126518e-06, + "loss": 0.2902, + "step": 544 + }, + { + "epoch": 0.2184, + "grad_norm": 8.731011390686035, + "learning_rate": 8.899187377752173e-06, + "loss": 0.4252, + "step": 546 + }, + { + "epoch": 0.2192, + "grad_norm": 2.755082130432129, + "learning_rate": 8.926947188597127e-06, + "loss": 0.3409, + "step": 548 + }, + { + "epoch": 0.22, + "grad_norm": 7.3828043937683105, + "learning_rate": 8.954715367323473e-06, + "loss": 0.416, + "step": 550 + }, + { + "epoch": 0.2208, + "grad_norm": 2.2503442764282227, + "learning_rate": 8.982491697389344e-06, + "loss": 0.655, + "step": 552 + }, + { + "epoch": 0.2216, + "grad_norm": 6.455631732940674, + "learning_rate": 9.010275962189356e-06, + "loss": 0.5366, + "step": 554 + }, + { + "epoch": 0.2224, + "grad_norm": 4.30733060836792, + "learning_rate": 9.03806794505621e-06, + "loss": 0.5306, + "step": 556 + }, + { + "epoch": 0.2232, + "grad_norm": 10.166731834411621, + "learning_rate": 9.065867429262497e-06, + "loss": 1.083, + "step": 558 + }, + { + "epoch": 0.224, + "grad_norm": 3.9657294750213623, + "learning_rate": 9.093674198022198e-06, + "loss": 0.675, + "step": 560 + }, + { + "epoch": 0.2248, + "grad_norm": 11.929780006408691, + "learning_rate": 9.121488034492567e-06, + "loss": 0.6316, + "step": 562 + }, + { + "epoch": 0.2256, + "grad_norm": 10.191093444824219, + "learning_rate": 9.149308721775717e-06, + "loss": 0.6091, + "step": 564 + }, + { + "epoch": 0.2264, + "grad_norm": 7.18539571762085, + "learning_rate": 9.177136042920338e-06, + "loss": 0.403, + "step": 566 + }, + { + "epoch": 0.2272, + "grad_norm": 3.656543016433716, + "learning_rate": 9.204969780923396e-06, + "loss": 0.6639, + "step": 568 + }, + { + "epoch": 0.228, + "grad_norm": 7.359060764312744, + "learning_rate": 9.232809718731822e-06, + "loss": 0.3566, + "step": 570 + }, + { + "epoch": 0.2288, + "grad_norm": 9.004820823669434, + "learning_rate": 9.26065563924414e-06, + "loss": 0.6622, + "step": 572 + }, + { + "epoch": 0.2296, + "grad_norm": 10.838543891906738, + "learning_rate": 9.288507325312319e-06, + "loss": 0.6195, + "step": 574 + }, + { + "epoch": 0.2304, + "grad_norm": 2.975687265396118, + "learning_rate": 9.316364559743298e-06, + "loss": 0.3825, + "step": 576 + }, + { + "epoch": 0.2312, + "grad_norm": 8.698026657104492, + "learning_rate": 9.344227125300788e-06, + "loss": 1.0027, + "step": 578 + }, + { + "epoch": 0.232, + "grad_norm": 5.095306396484375, + "learning_rate": 9.372094804706867e-06, + "loss": 0.4516, + "step": 580 + }, + { + "epoch": 0.2328, + "grad_norm": 11.336697578430176, + "learning_rate": 9.39996738064379e-06, + "loss": 1.0363, + "step": 582 + }, + { + "epoch": 0.2336, + "grad_norm": 9.10135269165039, + "learning_rate": 9.427844635755615e-06, + "loss": 0.437, + "step": 584 + }, + { + "epoch": 0.2344, + "grad_norm": 5.346004962921143, + "learning_rate": 9.455726352649904e-06, + "loss": 0.476, + "step": 586 + }, + { + "epoch": 0.2352, + "grad_norm": 6.9864983558654785, + "learning_rate": 9.483612313899446e-06, + "loss": 0.6293, + "step": 588 + }, + { + "epoch": 0.236, + "grad_norm": 3.372296094894409, + "learning_rate": 9.511502302043859e-06, + "loss": 0.4772, + "step": 590 + }, + { + "epoch": 0.2368, + "grad_norm": 6.616199493408203, + "learning_rate": 9.539396099591469e-06, + "loss": 0.6176, + "step": 592 + }, + { + "epoch": 0.2376, + "grad_norm": 10.571331977844238, + "learning_rate": 9.567293489020816e-06, + "loss": 0.6476, + "step": 594 + }, + { + "epoch": 0.2384, + "grad_norm": 6.520980358123779, + "learning_rate": 9.595194252782461e-06, + "loss": 0.4728, + "step": 596 + }, + { + "epoch": 0.2392, + "grad_norm": 5.69945764541626, + "learning_rate": 9.623098173300656e-06, + "loss": 0.3048, + "step": 598 + }, + { + "epoch": 0.24, + "grad_norm": 2.8320627212524414, + "learning_rate": 9.651005032974991e-06, + "loss": 0.3585, + "step": 600 + }, + { + "epoch": 0.2408, + "grad_norm": 5.430441379547119, + "learning_rate": 9.678914614182184e-06, + "loss": 0.362, + "step": 602 + }, + { + "epoch": 0.2416, + "grad_norm": 6.453746795654297, + "learning_rate": 9.706826699277714e-06, + "loss": 0.5644, + "step": 604 + }, + { + "epoch": 0.2424, + "grad_norm": 5.222368240356445, + "learning_rate": 9.734741070597535e-06, + "loss": 0.4847, + "step": 606 + }, + { + "epoch": 0.2432, + "grad_norm": 8.284361839294434, + "learning_rate": 9.762657510459774e-06, + "loss": 0.3067, + "step": 608 + }, + { + "epoch": 0.244, + "grad_norm": 2.8420135974884033, + "learning_rate": 9.790575801166422e-06, + "loss": 0.4242, + "step": 610 + }, + { + "epoch": 0.2448, + "grad_norm": 3.083292245864868, + "learning_rate": 9.818495725005043e-06, + "loss": 1.6922, + "step": 612 + }, + { + "epoch": 0.2456, + "grad_norm": 7.784399509429932, + "learning_rate": 9.846417064250459e-06, + "loss": 0.4656, + "step": 614 + }, + { + "epoch": 0.2464, + "grad_norm": 2.823225736618042, + "learning_rate": 9.874339601166479e-06, + "loss": 0.6133, + "step": 616 + }, + { + "epoch": 0.2472, + "grad_norm": 2.745265483856201, + "learning_rate": 9.902263118007513e-06, + "loss": 0.4453, + "step": 618 + }, + { + "epoch": 0.248, + "grad_norm": 6.4653191566467285, + "learning_rate": 9.930187397020385e-06, + "loss": 0.4293, + "step": 620 + }, + { + "epoch": 0.2488, + "grad_norm": 7.129982948303223, + "learning_rate": 9.95811222044596e-06, + "loss": 0.662, + "step": 622 + }, + { + "epoch": 0.2496, + "grad_norm": 3.3546640872955322, + "learning_rate": 9.986037370520855e-06, + "loss": 0.4132, + "step": 624 + }, + { + "epoch": 0.2504, + "grad_norm": 6.033866882324219, + "learning_rate": 1.0013962629479139e-05, + "loss": 0.4106, + "step": 626 + }, + { + "epoch": 0.2512, + "grad_norm": 2.7721498012542725, + "learning_rate": 1.0041887779554034e-05, + "loss": 0.435, + "step": 628 + }, + { + "epoch": 0.252, + "grad_norm": 4.638468265533447, + "learning_rate": 1.0069812602979607e-05, + "loss": 0.3248, + "step": 630 + }, + { + "epoch": 0.2528, + "grad_norm": 4.487112522125244, + "learning_rate": 1.0097736881992482e-05, + "loss": 0.2748, + "step": 632 + }, + { + "epoch": 0.2536, + "grad_norm": 3.1372745037078857, + "learning_rate": 1.0125660398833514e-05, + "loss": 0.5276, + "step": 634 + }, + { + "epoch": 0.2544, + "grad_norm": 3.534034013748169, + "learning_rate": 1.0153582935749533e-05, + "loss": 0.3561, + "step": 636 + }, + { + "epoch": 0.2552, + "grad_norm": 3.8285937309265137, + "learning_rate": 1.0181504274994952e-05, + "loss": 0.2581, + "step": 638 + }, + { + "epoch": 0.256, + "grad_norm": 8.048152923583984, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.6331, + "step": 640 + }, + { + "epoch": 0.2568, + "grad_norm": 14.114180564880371, + "learning_rate": 1.0237342489540218e-05, + "loss": 0.5025, + "step": 642 + }, + { + "epoch": 0.2576, + "grad_norm": 2.971632480621338, + "learning_rate": 1.0265258929402458e-05, + "loss": 0.3162, + "step": 644 + }, + { + "epoch": 0.2584, + "grad_norm": 3.681770086288452, + "learning_rate": 1.029317330072228e-05, + "loss": 0.555, + "step": 646 + }, + { + "epoch": 0.2592, + "grad_norm": 6.693310260772705, + "learning_rate": 1.0321085385817811e-05, + "loss": 0.4238, + "step": 648 + }, + { + "epoch": 0.26, + "grad_norm": 7.489689350128174, + "learning_rate": 1.0348994967025004e-05, + "loss": 0.5693, + "step": 650 + }, + { + "epoch": 0.2608, + "grad_norm": 10.314934730529785, + "learning_rate": 1.0376901826699337e-05, + "loss": 0.444, + "step": 652 + }, + { + "epoch": 0.2616, + "grad_norm": 8.000083923339844, + "learning_rate": 1.0404805747217532e-05, + "loss": 1.1624, + "step": 654 + }, + { + "epoch": 0.2624, + "grad_norm": 3.0513830184936523, + "learning_rate": 1.0432706510979175e-05, + "loss": 0.3671, + "step": 656 + }, + { + "epoch": 0.2632, + "grad_norm": 5.537996768951416, + "learning_rate": 1.0460603900408526e-05, + "loss": 0.3633, + "step": 658 + }, + { + "epoch": 0.264, + "grad_norm": 3.6096031665802, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.3869, + "step": 660 + }, + { + "epoch": 0.2648, + "grad_norm": 3.872437000274658, + "learning_rate": 1.0516387686100549e-05, + "loss": 0.2569, + "step": 662 + }, + { + "epoch": 0.2656, + "grad_norm": 19.229639053344727, + "learning_rate": 1.054427364735009e-05, + "loss": 1.5103, + "step": 664 + }, + { + "epoch": 0.2664, + "grad_norm": 4.366555213928223, + "learning_rate": 1.0572155364244378e-05, + "loss": 0.4739, + "step": 666 + }, + { + "epoch": 0.2672, + "grad_norm": 12.398948669433594, + "learning_rate": 1.0600032619356203e-05, + "loss": 0.7889, + "step": 668 + }, + { + "epoch": 0.268, + "grad_norm": 14.852575302124023, + "learning_rate": 1.0627905195293127e-05, + "loss": 0.9629, + "step": 670 + }, + { + "epoch": 0.2688, + "grad_norm": 10.531519889831543, + "learning_rate": 1.0655772874699206e-05, + "loss": 0.8866, + "step": 672 + }, + { + "epoch": 0.2696, + "grad_norm": 9.607449531555176, + "learning_rate": 1.0683635440256694e-05, + "loss": 0.8125, + "step": 674 + }, + { + "epoch": 0.2704, + "grad_norm": 4.8872785568237305, + "learning_rate": 1.0711492674687674e-05, + "loss": 0.3938, + "step": 676 + }, + { + "epoch": 0.2712, + "grad_norm": 11.146080017089844, + "learning_rate": 1.0739344360755855e-05, + "loss": 0.8325, + "step": 678 + }, + { + "epoch": 0.272, + "grad_norm": 4.164681434631348, + "learning_rate": 1.0767190281268171e-05, + "loss": 0.4736, + "step": 680 + }, + { + "epoch": 0.2728, + "grad_norm": 7.81128454208374, + "learning_rate": 1.07950302190766e-05, + "loss": 0.3557, + "step": 682 + }, + { + "epoch": 0.2736, + "grad_norm": 4.940060138702393, + "learning_rate": 1.0822863957079654e-05, + "loss": 0.2446, + "step": 684 + }, + { + "epoch": 0.2744, + "grad_norm": 3.599069118499756, + "learning_rate": 1.0850691278224277e-05, + "loss": 0.3247, + "step": 686 + }, + { + "epoch": 0.2752, + "grad_norm": 2.723473072052002, + "learning_rate": 1.0878511965507428e-05, + "loss": 0.4377, + "step": 688 + }, + { + "epoch": 0.276, + "grad_norm": 9.285993576049805, + "learning_rate": 1.0906325801977795e-05, + "loss": 0.5683, + "step": 690 + }, + { + "epoch": 0.2768, + "grad_norm": 6.284435272216797, + "learning_rate": 1.0934132570737497e-05, + "loss": 0.4374, + "step": 692 + }, + { + "epoch": 0.2776, + "grad_norm": 3.0146565437316895, + "learning_rate": 1.0961932054943785e-05, + "loss": 0.36, + "step": 694 + }, + { + "epoch": 0.2784, + "grad_norm": 8.176236152648926, + "learning_rate": 1.098972403781064e-05, + "loss": 0.7033, + "step": 696 + }, + { + "epoch": 0.2792, + "grad_norm": 7.648057460784912, + "learning_rate": 1.101750830261065e-05, + "loss": 0.4907, + "step": 698 + }, + { + "epoch": 0.28, + "grad_norm": 6.528674602508545, + "learning_rate": 1.104528463267652e-05, + "loss": 0.489, + "step": 700 + }, + { + "epoch": 0.2808, + "grad_norm": 5.457291603088379, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.8648, + "step": 702 + }, + { + "epoch": 0.2816, + "grad_norm": 7.5247344970703125, + "learning_rate": 1.1100812622247821e-05, + "loss": 0.3886, + "step": 704 + }, + { + "epoch": 0.2824, + "grad_norm": 9.832022666931152, + "learning_rate": 1.1128563848734815e-05, + "loss": 0.7117, + "step": 706 + }, + { + "epoch": 0.2832, + "grad_norm": 12.296405792236328, + "learning_rate": 1.1156306274454211e-05, + "loss": 0.6653, + "step": 708 + }, + { + "epoch": 0.284, + "grad_norm": 3.188476324081421, + "learning_rate": 1.1184039683065002e-05, + "loss": 0.4074, + "step": 710 + }, + { + "epoch": 0.2848, + "grad_norm": 8.455747604370117, + "learning_rate": 1.1211763858296516e-05, + "loss": 0.803, + "step": 712 + }, + { + "epoch": 0.2856, + "grad_norm": 2.5128846168518066, + "learning_rate": 1.1239478583950007e-05, + "loss": 0.6004, + "step": 714 + }, + { + "epoch": 0.2864, + "grad_norm": 6.20869255065918, + "learning_rate": 1.1267183643900534e-05, + "loss": 0.2269, + "step": 716 + }, + { + "epoch": 0.2872, + "grad_norm": 3.149703025817871, + "learning_rate": 1.1294878822098456e-05, + "loss": 0.3468, + "step": 718 + }, + { + "epoch": 0.288, + "grad_norm": 3.046736717224121, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.435, + "step": 720 + }, + { + "epoch": 0.2888, + "grad_norm": 3.9733293056488037, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.336, + "step": 722 + }, + { + "epoch": 0.2896, + "grad_norm": 5.300455093383789, + "learning_rate": 1.137790290684638e-05, + "loss": 0.4059, + "step": 724 + }, + { + "epoch": 0.2904, + "grad_norm": 6.307999134063721, + "learning_rate": 1.1405556399104108e-05, + "loss": 0.5065, + "step": 726 + }, + { + "epoch": 0.2912, + "grad_norm": 7.641139030456543, + "learning_rate": 1.143319893055069e-05, + "loss": 0.8458, + "step": 728 + }, + { + "epoch": 0.292, + "grad_norm": 6.329631328582764, + "learning_rate": 1.1460830285624112e-05, + "loss": 0.3936, + "step": 730 + }, + { + "epoch": 0.2928, + "grad_norm": 3.9163966178894043, + "learning_rate": 1.1488450248849515e-05, + "loss": 0.2481, + "step": 732 + }, + { + "epoch": 0.2936, + "grad_norm": 4.952850341796875, + "learning_rate": 1.1516058604840881e-05, + "loss": 0.5636, + "step": 734 + }, + { + "epoch": 0.2944, + "grad_norm": 4.797139644622803, + "learning_rate": 1.15436551383027e-05, + "loss": 0.3305, + "step": 736 + }, + { + "epoch": 0.2952, + "grad_norm": 5.792111873626709, + "learning_rate": 1.1571239634031666e-05, + "loss": 0.3829, + "step": 738 + }, + { + "epoch": 0.296, + "grad_norm": 2.61521053314209, + "learning_rate": 1.1598811876918352e-05, + "loss": 0.4863, + "step": 740 + }, + { + "epoch": 0.2968, + "grad_norm": 4.18222188949585, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.6332, + "step": 742 + }, + { + "epoch": 0.2976, + "grad_norm": 4.946098327636719, + "learning_rate": 1.1653918744206478e-05, + "loss": 0.4458, + "step": 744 + }, + { + "epoch": 0.2984, + "grad_norm": 5.32734489440918, + "learning_rate": 1.1681452938873515e-05, + "loss": 0.3399, + "step": 746 + }, + { + "epoch": 0.2992, + "grad_norm": 4.0147600173950195, + "learning_rate": 1.1708974021232763e-05, + "loss": 0.4765, + "step": 748 + }, + { + "epoch": 0.3, + "grad_norm": 5.177061080932617, + "learning_rate": 1.1736481776669297e-05, + "loss": 0.3631, + "step": 750 + }, + { + "epoch": 0.3008, + "grad_norm": 5.71055793762207, + "learning_rate": 1.1763975990672116e-05, + "loss": 0.3434, + "step": 752 + }, + { + "epoch": 0.3016, + "grad_norm": 2.9367713928222656, + "learning_rate": 1.1791456448835815e-05, + "loss": 0.338, + "step": 754 + }, + { + "epoch": 0.3024, + "grad_norm": 5.17765474319458, + "learning_rate": 1.1818922936862258e-05, + "loss": 0.2791, + "step": 756 + }, + { + "epoch": 0.3032, + "grad_norm": 11.932939529418945, + "learning_rate": 1.1846375240562274e-05, + "loss": 1.1732, + "step": 758 + }, + { + "epoch": 0.304, + "grad_norm": 3.952331304550171, + "learning_rate": 1.187381314585725e-05, + "loss": 0.3083, + "step": 760 + }, + { + "epoch": 0.3048, + "grad_norm": 2.7587730884552, + "learning_rate": 1.1901236438780906e-05, + "loss": 0.3518, + "step": 762 + }, + { + "epoch": 0.3056, + "grad_norm": 1.007660984992981, + "learning_rate": 1.192864490548089e-05, + "loss": 0.3737, + "step": 764 + }, + { + "epoch": 0.3064, + "grad_norm": 2.9483938217163086, + "learning_rate": 1.195603833222048e-05, + "loss": 0.2933, + "step": 766 + }, + { + "epoch": 0.3072, + "grad_norm": 2.8194210529327393, + "learning_rate": 1.198341650538023e-05, + "loss": 0.5113, + "step": 768 + }, + { + "epoch": 0.308, + "grad_norm": 6.031118869781494, + "learning_rate": 1.2010779211459642e-05, + "loss": 0.8163, + "step": 770 + }, + { + "epoch": 0.3088, + "grad_norm": 3.5361063480377197, + "learning_rate": 1.203812623707884e-05, + "loss": 0.7252, + "step": 772 + }, + { + "epoch": 0.3096, + "grad_norm": 3.8909547328948975, + "learning_rate": 1.2065457368980227e-05, + "loss": 0.4168, + "step": 774 + }, + { + "epoch": 0.3104, + "grad_norm": 3.7777271270751953, + "learning_rate": 1.2092772394030141e-05, + "loss": 0.9927, + "step": 776 + }, + { + "epoch": 0.3112, + "grad_norm": 4.374517440795898, + "learning_rate": 1.2120071099220552e-05, + "loss": 0.5356, + "step": 778 + }, + { + "epoch": 0.312, + "grad_norm": 7.112154483795166, + "learning_rate": 1.2147353271670637e-05, + "loss": 0.4746, + "step": 780 + }, + { + "epoch": 0.3128, + "grad_norm": 4.3697967529296875, + "learning_rate": 1.217461869862855e-05, + "loss": 0.5526, + "step": 782 + }, + { + "epoch": 0.3136, + "grad_norm": 8.930327415466309, + "learning_rate": 1.2201867167473015e-05, + "loss": 0.377, + "step": 784 + }, + { + "epoch": 0.3144, + "grad_norm": 3.697787046432495, + "learning_rate": 1.2229098465715002e-05, + "loss": 0.5277, + "step": 786 + }, + { + "epoch": 0.3152, + "grad_norm": 6.874600887298584, + "learning_rate": 1.2256312380999373e-05, + "loss": 0.7206, + "step": 788 + }, + { + "epoch": 0.316, + "grad_norm": 3.165311813354492, + "learning_rate": 1.2283508701106552e-05, + "loss": 0.4289, + "step": 790 + }, + { + "epoch": 0.3168, + "grad_norm": 8.956424713134766, + "learning_rate": 1.2310687213954173e-05, + "loss": 0.5692, + "step": 792 + }, + { + "epoch": 0.3176, + "grad_norm": 3.313727855682373, + "learning_rate": 1.233784770759873e-05, + "loss": 0.4057, + "step": 794 + }, + { + "epoch": 0.3184, + "grad_norm": 7.44467830657959, + "learning_rate": 1.2364989970237238e-05, + "loss": 0.425, + "step": 796 + }, + { + "epoch": 0.3192, + "grad_norm": 2.7032885551452637, + "learning_rate": 1.23921137902089e-05, + "loss": 1.1685, + "step": 798 + }, + { + "epoch": 0.32, + "grad_norm": 21.789257049560547, + "learning_rate": 1.241921895599668e-05, + "loss": 2.9202, + "step": 800 + }, + { + "epoch": 0.3208, + "grad_norm": 8.086333274841309, + "learning_rate": 1.2446305256229076e-05, + "loss": 0.4955, + "step": 802 + }, + { + "epoch": 0.3216, + "grad_norm": 3.5499794483184814, + "learning_rate": 1.2473372479681653e-05, + "loss": 0.5145, + "step": 804 + }, + { + "epoch": 0.3224, + "grad_norm": 7.596743583679199, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.5452, + "step": 806 + }, + { + "epoch": 0.3232, + "grad_norm": 6.792609214782715, + "learning_rate": 1.2527448852095292e-05, + "loss": 0.4943, + "step": 808 + }, + { + "epoch": 0.324, + "grad_norm": 6.553757667541504, + "learning_rate": 1.2554457579357902e-05, + "loss": 0.6194, + "step": 810 + }, + { + "epoch": 0.3248, + "grad_norm": 7.149604797363281, + "learning_rate": 1.2581446386447171e-05, + "loss": 0.5983, + "step": 812 + }, + { + "epoch": 0.3256, + "grad_norm": 3.606539726257324, + "learning_rate": 1.2608415062898963e-05, + "loss": 0.5224, + "step": 814 + }, + { + "epoch": 0.3264, + "grad_norm": 2.280977725982666, + "learning_rate": 1.2635363398406133e-05, + "loss": 0.623, + "step": 816 + }, + { + "epoch": 0.3272, + "grad_norm": 10.369894981384277, + "learning_rate": 1.266229118282012e-05, + "loss": 0.8874, + "step": 818 + }, + { + "epoch": 0.328, + "grad_norm": 4.448359489440918, + "learning_rate": 1.2689198206152644e-05, + "loss": 0.352, + "step": 820 + }, + { + "epoch": 0.3288, + "grad_norm": 2.4175426959991455, + "learning_rate": 1.2716084258577373e-05, + "loss": 0.4758, + "step": 822 + }, + { + "epoch": 0.3296, + "grad_norm": 5.202447891235352, + "learning_rate": 1.2742949130431468e-05, + "loss": 0.3762, + "step": 824 + }, + { + "epoch": 0.3304, + "grad_norm": 10.678263664245605, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.7578, + "step": 826 + }, + { + "epoch": 0.3312, + "grad_norm": 10.530842781066895, + "learning_rate": 1.2796614494603795e-05, + "loss": 0.5892, + "step": 828 + }, + { + "epoch": 0.332, + "grad_norm": 2.2692458629608154, + "learning_rate": 1.282341456842876e-05, + "loss": 0.4299, + "step": 830 + }, + { + "epoch": 0.3328, + "grad_norm": 29.20014762878418, + "learning_rate": 1.2850192624699756e-05, + "loss": 2.1729, + "step": 832 + }, + { + "epoch": 0.3336, + "grad_norm": 4.95890998840332, + "learning_rate": 1.2876948454596122e-05, + "loss": 0.7695, + "step": 834 + }, + { + "epoch": 0.3344, + "grad_norm": 4.845802307128906, + "learning_rate": 1.2903681849470535e-05, + "loss": 0.5292, + "step": 836 + }, + { + "epoch": 0.3352, + "grad_norm": 8.932195663452148, + "learning_rate": 1.2930392600850565e-05, + "loss": 0.7145, + "step": 838 + }, + { + "epoch": 0.336, + "grad_norm": 5.170701503753662, + "learning_rate": 1.2957080500440455e-05, + "loss": 0.6364, + "step": 840 + }, + { + "epoch": 0.3368, + "grad_norm": 10.944951057434082, + "learning_rate": 1.2983745340122589e-05, + "loss": 0.6809, + "step": 842 + }, + { + "epoch": 0.3376, + "grad_norm": 15.961918830871582, + "learning_rate": 1.3010386911959205e-05, + "loss": 0.6377, + "step": 844 + }, + { + "epoch": 0.3384, + "grad_norm": 7.693042755126953, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.5823, + "step": 846 + }, + { + "epoch": 0.3392, + "grad_norm": 10.104445457458496, + "learning_rate": 1.3063599421253556e-05, + "loss": 0.4535, + "step": 848 + }, + { + "epoch": 0.34, + "grad_norm": 9.348386764526367, + "learning_rate": 1.309016994374947e-05, + "loss": 0.5694, + "step": 850 + }, + { + "epoch": 0.3408, + "grad_norm": 1.5034079551696777, + "learning_rate": 1.3116716368479415e-05, + "loss": 0.3579, + "step": 852 + }, + { + "epoch": 0.3416, + "grad_norm": 6.123833179473877, + "learning_rate": 1.3143238488429049e-05, + "loss": 0.5469, + "step": 854 + }, + { + "epoch": 0.3424, + "grad_norm": 2.4656760692596436, + "learning_rate": 1.316973609677351e-05, + "loss": 0.3558, + "step": 856 + }, + { + "epoch": 0.3432, + "grad_norm": 5.257678985595703, + "learning_rate": 1.319620898687917e-05, + "loss": 0.5961, + "step": 858 + }, + { + "epoch": 0.344, + "grad_norm": 2.2889106273651123, + "learning_rate": 1.32226569523051e-05, + "loss": 0.4341, + "step": 860 + }, + { + "epoch": 0.3448, + "grad_norm": 6.256611347198486, + "learning_rate": 1.324907978680475e-05, + "loss": 0.4521, + "step": 862 + }, + { + "epoch": 0.3456, + "grad_norm": 4.55164909362793, + "learning_rate": 1.3275477284327572e-05, + "loss": 0.4435, + "step": 864 + }, + { + "epoch": 0.3464, + "grad_norm": 3.1022279262542725, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.6874, + "step": 866 + }, + { + "epoch": 0.3472, + "grad_norm": 2.245877981185913, + "learning_rate": 1.3328195445229865e-05, + "loss": 0.5371, + "step": 868 + }, + { + "epoch": 0.348, + "grad_norm": 7.837813377380371, + "learning_rate": 1.3354515697502548e-05, + "loss": 0.5843, + "step": 870 + }, + { + "epoch": 0.3488, + "grad_norm": 6.17982816696167, + "learning_rate": 1.338080979058797e-05, + "loss": 0.5286, + "step": 872 + }, + { + "epoch": 0.3496, + "grad_norm": 11.103405952453613, + "learning_rate": 1.340707751943951e-05, + "loss": 0.6753, + "step": 874 + }, + { + "epoch": 0.3504, + "grad_norm": 14.157888412475586, + "learning_rate": 1.3433318679216145e-05, + "loss": 0.5989, + "step": 876 + }, + { + "epoch": 0.3512, + "grad_norm": 4.318014621734619, + "learning_rate": 1.3459533065284039e-05, + "loss": 0.3676, + "step": 878 + }, + { + "epoch": 0.352, + "grad_norm": 6.474916934967041, + "learning_rate": 1.348572047321814e-05, + "loss": 0.5159, + "step": 880 + }, + { + "epoch": 0.3528, + "grad_norm": 5.441918849945068, + "learning_rate": 1.3511880698803803e-05, + "loss": 0.8862, + "step": 882 + }, + { + "epoch": 0.3536, + "grad_norm": 6.940057277679443, + "learning_rate": 1.3538013538038296e-05, + "loss": 0.6308, + "step": 884 + }, + { + "epoch": 0.3544, + "grad_norm": 3.7186665534973145, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.3685, + "step": 886 + }, + { + "epoch": 0.3552, + "grad_norm": 19.710111618041992, + "learning_rate": 1.3590196242512461e-05, + "loss": 1.2655, + "step": 888 + }, + { + "epoch": 0.356, + "grad_norm": 2.533984899520874, + "learning_rate": 1.361624570082092e-05, + "loss": 0.3519, + "step": 890 + }, + { + "epoch": 0.3568, + "grad_norm": 11.8716402053833, + "learning_rate": 1.364226695891898e-05, + "loss": 0.7045, + "step": 892 + }, + { + "epoch": 0.3576, + "grad_norm": 2.3505406379699707, + "learning_rate": 1.3668259813887637e-05, + "loss": 0.492, + "step": 894 + }, + { + "epoch": 0.3584, + "grad_norm": 4.525683403015137, + "learning_rate": 1.3694224063029386e-05, + "loss": 0.475, + "step": 896 + }, + { + "epoch": 0.3592, + "grad_norm": 7.988845348358154, + "learning_rate": 1.3720159503869806e-05, + "loss": 0.3732, + "step": 898 + }, + { + "epoch": 0.36, + "grad_norm": 4.825455665588379, + "learning_rate": 1.374606593415911e-05, + "loss": 0.4227, + "step": 900 + }, + { + "epoch": 0.3608, + "grad_norm": 2.447044610977173, + "learning_rate": 1.377194315187377e-05, + "loss": 0.5101, + "step": 902 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4026242792606354, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.3221, + "step": 904 + }, + { + "epoch": 0.3624, + "grad_norm": 7.215357780456543, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.8086, + "step": 906 + }, + { + "epoch": 0.3632, + "grad_norm": 3.070342779159546, + "learning_rate": 1.3849397512760793e-05, + "loss": 0.5274, + "step": 908 + }, + { + "epoch": 0.364, + "grad_norm": 4.354795455932617, + "learning_rate": 1.3875155864521027e-05, + "loss": 0.4501, + "step": 910 + }, + { + "epoch": 0.3648, + "grad_norm": 4.162818908691406, + "learning_rate": 1.3900883997037393e-05, + "loss": 0.2185, + "step": 912 + }, + { + "epoch": 0.3656, + "grad_norm": 1.5700749158859253, + "learning_rate": 1.3926581709676746e-05, + "loss": 0.288, + "step": 914 + }, + { + "epoch": 0.3664, + "grad_norm": 4.362253189086914, + "learning_rate": 1.3952248802043158e-05, + "loss": 0.3545, + "step": 916 + }, + { + "epoch": 0.3672, + "grad_norm": 5.418710231781006, + "learning_rate": 1.397788507397949e-05, + "loss": 0.4328, + "step": 918 + }, + { + "epoch": 0.368, + "grad_norm": 6.52181339263916, + "learning_rate": 1.4003490325568956e-05, + "loss": 0.3501, + "step": 920 + }, + { + "epoch": 0.3688, + "grad_norm": 4.86818265914917, + "learning_rate": 1.4029064357136632e-05, + "loss": 0.4529, + "step": 922 + }, + { + "epoch": 0.3696, + "grad_norm": 6.881949424743652, + "learning_rate": 1.4054606969251096e-05, + "loss": 0.5725, + "step": 924 + }, + { + "epoch": 0.3704, + "grad_norm": 3.368731737136841, + "learning_rate": 1.4080117962725929e-05, + "loss": 0.4468, + "step": 926 + }, + { + "epoch": 0.3712, + "grad_norm": 3.010711908340454, + "learning_rate": 1.4105597138621281e-05, + "loss": 0.4431, + "step": 928 + }, + { + "epoch": 0.372, + "grad_norm": 8.769145011901855, + "learning_rate": 1.4131044298245416e-05, + "loss": 0.5087, + "step": 930 + }, + { + "epoch": 0.3728, + "grad_norm": 7.3292975425720215, + "learning_rate": 1.4156459243156275e-05, + "loss": 0.4583, + "step": 932 + }, + { + "epoch": 0.3736, + "grad_norm": 2.95186448097229, + "learning_rate": 1.418184177516301e-05, + "loss": 0.4141, + "step": 934 + }, + { + "epoch": 0.3744, + "grad_norm": 3.9545629024505615, + "learning_rate": 1.420719169632754e-05, + "loss": 0.3136, + "step": 936 + }, + { + "epoch": 0.3752, + "grad_norm": 8.807525634765625, + "learning_rate": 1.4232508808966085e-05, + "loss": 0.792, + "step": 938 + }, + { + "epoch": 0.376, + "grad_norm": 3.8343663215637207, + "learning_rate": 1.4257792915650735e-05, + "loss": 0.4076, + "step": 940 + }, + { + "epoch": 0.3768, + "grad_norm": 3.995673179626465, + "learning_rate": 1.4283043819210906e-05, + "loss": 0.5437, + "step": 942 + }, + { + "epoch": 0.3776, + "grad_norm": 5.492018222808838, + "learning_rate": 1.430826132273499e-05, + "loss": 0.4751, + "step": 944 + }, + { + "epoch": 0.3784, + "grad_norm": 6.859824180603027, + "learning_rate": 1.4333445229571857e-05, + "loss": 0.6557, + "step": 946 + }, + { + "epoch": 0.3792, + "grad_norm": 8.421869277954102, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.7003, + "step": 948 + }, + { + "epoch": 0.38, + "grad_norm": 3.7867302894592285, + "learning_rate": 1.4383711467890772e-05, + "loss": 0.2215, + "step": 950 + }, + { + "epoch": 0.3808, + "grad_norm": 8.494775772094727, + "learning_rate": 1.4408793407386584e-05, + "loss": 0.5345, + "step": 952 + }, + { + "epoch": 0.3816, + "grad_norm": 3.8272619247436523, + "learning_rate": 1.4433840966225767e-05, + "loss": 0.5162, + "step": 954 + }, + { + "epoch": 0.3824, + "grad_norm": 6.869154930114746, + "learning_rate": 1.4458853949082434e-05, + "loss": 0.7501, + "step": 956 + }, + { + "epoch": 0.3832, + "grad_norm": 3.7528223991394043, + "learning_rate": 1.4483832160900332e-05, + "loss": 0.2814, + "step": 958 + }, + { + "epoch": 0.384, + "grad_norm": 12.88850212097168, + "learning_rate": 1.4508775406894315e-05, + "loss": 0.6589, + "step": 960 + }, + { + "epoch": 0.3848, + "grad_norm": 5.3597588539123535, + "learning_rate": 1.4533683492551942e-05, + "loss": 0.1747, + "step": 962 + }, + { + "epoch": 0.3856, + "grad_norm": 4.822731971740723, + "learning_rate": 1.4558556223634988e-05, + "loss": 0.3057, + "step": 964 + }, + { + "epoch": 0.3864, + "grad_norm": 6.9705891609191895, + "learning_rate": 1.4583393406180886e-05, + "loss": 0.4439, + "step": 966 + }, + { + "epoch": 0.3872, + "grad_norm": 7.96857213973999, + "learning_rate": 1.460819484650431e-05, + "loss": 0.586, + "step": 968 + }, + { + "epoch": 0.388, + "grad_norm": 5.491257667541504, + "learning_rate": 1.4632960351198618e-05, + "loss": 0.5213, + "step": 970 + }, + { + "epoch": 0.3888, + "grad_norm": 7.354158878326416, + "learning_rate": 1.4657689727137441e-05, + "loss": 0.4731, + "step": 972 + }, + { + "epoch": 0.3896, + "grad_norm": 8.442399978637695, + "learning_rate": 1.468238278147614e-05, + "loss": 0.5359, + "step": 974 + }, + { + "epoch": 0.3904, + "grad_norm": 1.9848921298980713, + "learning_rate": 1.470703932165332e-05, + "loss": 0.341, + "step": 976 + }, + { + "epoch": 0.3912, + "grad_norm": 2.058201551437378, + "learning_rate": 1.4731659155392339e-05, + "loss": 0.3913, + "step": 978 + }, + { + "epoch": 0.392, + "grad_norm": 8.525607109069824, + "learning_rate": 1.4756242090702744e-05, + "loss": 0.5909, + "step": 980 + }, + { + "epoch": 0.3928, + "grad_norm": 2.9162418842315674, + "learning_rate": 1.4780787935881913e-05, + "loss": 0.3393, + "step": 982 + }, + { + "epoch": 0.3936, + "grad_norm": 2.995479106903076, + "learning_rate": 1.4805296499516397e-05, + "loss": 0.5719, + "step": 984 + }, + { + "epoch": 0.3944, + "grad_norm": 8.575796127319336, + "learning_rate": 1.482976759048351e-05, + "loss": 0.619, + "step": 986 + }, + { + "epoch": 0.3952, + "grad_norm": 2.8806512355804443, + "learning_rate": 1.485420101795274e-05, + "loss": 0.3831, + "step": 988 + }, + { + "epoch": 0.396, + "grad_norm": 6.711476802825928, + "learning_rate": 1.4878596591387327e-05, + "loss": 0.4079, + "step": 990 + }, + { + "epoch": 0.3968, + "grad_norm": 3.724522590637207, + "learning_rate": 1.4902954120545686e-05, + "loss": 0.7505, + "step": 992 + }, + { + "epoch": 0.3976, + "grad_norm": 3.5688815116882324, + "learning_rate": 1.4927273415482913e-05, + "loss": 0.2601, + "step": 994 + }, + { + "epoch": 0.3984, + "grad_norm": 5.003245830535889, + "learning_rate": 1.4951554286552261e-05, + "loss": 0.3731, + "step": 996 + }, + { + "epoch": 0.3992, + "grad_norm": 8.481842041015625, + "learning_rate": 1.4975796544406617e-05, + "loss": 0.7391, + "step": 998 + }, + { + "epoch": 0.4, + "grad_norm": 4.910562038421631, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.4004, + "step": 1000 + }, + { + "epoch": 0.4008, + "grad_norm": 12.834502220153809, + "learning_rate": 1.502416446458897e-05, + "loss": 1.0613, + "step": 1002 + }, + { + "epoch": 0.4016, + "grad_norm": 6.666214942932129, + "learning_rate": 1.5048289749734206e-05, + "loss": 0.594, + "step": 1004 + }, + { + "epoch": 0.4024, + "grad_norm": 2.4326231479644775, + "learning_rate": 1.5072375667301895e-05, + "loss": 0.3435, + "step": 1006 + }, + { + "epoch": 0.4032, + "grad_norm": 2.314138650894165, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.35, + "step": 1008 + }, + { + "epoch": 0.404, + "grad_norm": 2.2951853275299072, + "learning_rate": 1.5120428648705714e-05, + "loss": 0.2415, + "step": 1010 + }, + { + "epoch": 0.4048, + "grad_norm": 1.675195574760437, + "learning_rate": 1.5144395337815064e-05, + "loss": 0.2919, + "step": 1012 + }, + { + "epoch": 0.4056, + "grad_norm": 9.622923851013184, + "learning_rate": 1.5168321909896166e-05, + "loss": 0.558, + "step": 1014 + }, + { + "epoch": 0.4064, + "grad_norm": 9.771827697753906, + "learning_rate": 1.5192208178364808e-05, + "loss": 0.667, + "step": 1016 + }, + { + "epoch": 0.4072, + "grad_norm": 3.1868371963500977, + "learning_rate": 1.521605395695107e-05, + "loss": 0.3962, + "step": 1018 + }, + { + "epoch": 0.408, + "grad_norm": 3.322626829147339, + "learning_rate": 1.5239859059700784e-05, + "loss": 0.3846, + "step": 1020 + }, + { + "epoch": 0.4088, + "grad_norm": 3.9479050636291504, + "learning_rate": 1.526362330097697e-05, + "loss": 0.3751, + "step": 1022 + }, + { + "epoch": 0.4096, + "grad_norm": 2.961951971054077, + "learning_rate": 1.5287346495461322e-05, + "loss": 0.452, + "step": 1024 + }, + { + "epoch": 0.4104, + "grad_norm": 2.779170274734497, + "learning_rate": 1.531102845815557e-05, + "loss": 0.3655, + "step": 1026 + }, + { + "epoch": 0.4112, + "grad_norm": 9.825654983520508, + "learning_rate": 1.5334669004383025e-05, + "loss": 0.3564, + "step": 1028 + }, + { + "epoch": 0.412, + "grad_norm": 6.186550617218018, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.3361, + "step": 1030 + }, + { + "epoch": 0.4128, + "grad_norm": 8.893967628479004, + "learning_rate": 1.5381825110347072e-05, + "loss": 0.5267, + "step": 1032 + }, + { + "epoch": 0.4136, + "grad_norm": 4.747354507446289, + "learning_rate": 1.540534030235087e-05, + "loss": 0.1701, + "step": 1034 + }, + { + "epoch": 0.4144, + "grad_norm": 7.46083927154541, + "learning_rate": 1.542881334242517e-05, + "loss": 0.488, + "step": 1036 + }, + { + "epoch": 0.4152, + "grad_norm": 5.576011657714844, + "learning_rate": 1.5452244047522493e-05, + "loss": 0.5621, + "step": 1038 + }, + { + "epoch": 0.416, + "grad_norm": 3.370360851287842, + "learning_rate": 1.5475632234925495e-05, + "loss": 0.6699, + "step": 1040 + }, + { + "epoch": 0.4168, + "grad_norm": 5.393393039703369, + "learning_rate": 1.5498977722248388e-05, + "loss": 0.487, + "step": 1042 + }, + { + "epoch": 0.4176, + "grad_norm": 3.2120022773742676, + "learning_rate": 1.552228032743839e-05, + "loss": 0.6391, + "step": 1044 + }, + { + "epoch": 0.4184, + "grad_norm": 3.1691336631774902, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.4345, + "step": 1046 + }, + { + "epoch": 0.4192, + "grad_norm": 2.4558982849121094, + "learning_rate": 1.556875616488188e-05, + "loss": 0.5859, + "step": 1048 + }, + { + "epoch": 0.42, + "grad_norm": 4.1330342292785645, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.3224, + "step": 1050 + }, + { + "epoch": 0.4208, + "grad_norm": 2.89819073677063, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.4887, + "step": 1052 + }, + { + "epoch": 0.4216, + "grad_norm": 4.832616806030273, + "learning_rate": 1.5638143773034268e-05, + "loss": 0.4998, + "step": 1054 + }, + { + "epoch": 0.4224, + "grad_norm": 2.52421236038208, + "learning_rate": 1.5661185281143663e-05, + "loss": 0.367, + "step": 1056 + }, + { + "epoch": 0.4232, + "grad_norm": 2.5860087871551514, + "learning_rate": 1.5684182642193024e-05, + "loss": 0.2247, + "step": 1058 + }, + { + "epoch": 0.424, + "grad_norm": 20.680755615234375, + "learning_rate": 1.5707135676844312e-05, + "loss": 1.0634, + "step": 1060 + }, + { + "epoch": 0.4248, + "grad_norm": 2.4126100540161133, + "learning_rate": 1.5730044206105146e-05, + "loss": 0.2989, + "step": 1062 + }, + { + "epoch": 0.4256, + "grad_norm": 6.855231761932373, + "learning_rate": 1.5752908051330232e-05, + "loss": 0.5926, + "step": 1064 + }, + { + "epoch": 0.4264, + "grad_norm": 11.962879180908203, + "learning_rate": 1.577572703422268e-05, + "loss": 0.5489, + "step": 1066 + }, + { + "epoch": 0.4272, + "grad_norm": 5.586971759796143, + "learning_rate": 1.579850097683548e-05, + "loss": 0.5642, + "step": 1068 + }, + { + "epoch": 0.428, + "grad_norm": 9.240392684936523, + "learning_rate": 1.582122970157288e-05, + "loss": 0.5487, + "step": 1070 + }, + { + "epoch": 0.4288, + "grad_norm": 2.5346271991729736, + "learning_rate": 1.5843913031191722e-05, + "loss": 0.2458, + "step": 1072 + }, + { + "epoch": 0.4296, + "grad_norm": 3.1922338008880615, + "learning_rate": 1.586655078880281e-05, + "loss": 0.3974, + "step": 1074 + }, + { + "epoch": 0.4304, + "grad_norm": 7.212031364440918, + "learning_rate": 1.5889142797872383e-05, + "loss": 0.4685, + "step": 1076 + }, + { + "epoch": 0.4312, + "grad_norm": 2.082321882247925, + "learning_rate": 1.5911688882223415e-05, + "loss": 0.2191, + "step": 1078 + }, + { + "epoch": 0.432, + "grad_norm": 5.929754257202148, + "learning_rate": 1.5934188866037007e-05, + "loss": 0.5255, + "step": 1080 + }, + { + "epoch": 0.4328, + "grad_norm": 4.870511531829834, + "learning_rate": 1.5956642573853787e-05, + "loss": 0.5902, + "step": 1082 + }, + { + "epoch": 0.4336, + "grad_norm": 7.386419296264648, + "learning_rate": 1.5979049830575193e-05, + "loss": 0.346, + "step": 1084 + }, + { + "epoch": 0.4344, + "grad_norm": 3.978145122528076, + "learning_rate": 1.6001410461464945e-05, + "loss": 0.6408, + "step": 1086 + }, + { + "epoch": 0.4352, + "grad_norm": 2.4360148906707764, + "learning_rate": 1.6023724292150377e-05, + "loss": 0.643, + "step": 1088 + }, + { + "epoch": 0.436, + "grad_norm": 2.7286767959594727, + "learning_rate": 1.604599114862375e-05, + "loss": 0.3048, + "step": 1090 + }, + { + "epoch": 0.4368, + "grad_norm": 2.107506036758423, + "learning_rate": 1.606821085724362e-05, + "loss": 0.2779, + "step": 1092 + }, + { + "epoch": 0.4376, + "grad_norm": 8.017492294311523, + "learning_rate": 1.6090383244736253e-05, + "loss": 0.453, + "step": 1094 + }, + { + "epoch": 0.4384, + "grad_norm": 8.654541015625, + "learning_rate": 1.6112508138196912e-05, + "loss": 0.4822, + "step": 1096 + }, + { + "epoch": 0.4392, + "grad_norm": 4.79012393951416, + "learning_rate": 1.613458536509124e-05, + "loss": 0.4431, + "step": 1098 + }, + { + "epoch": 0.44, + "grad_norm": 4.138698101043701, + "learning_rate": 1.615661475325658e-05, + "loss": 0.2924, + "step": 1100 + }, + { + "epoch": 0.4408, + "grad_norm": 7.6140522956848145, + "learning_rate": 1.6178596130903352e-05, + "loss": 0.5564, + "step": 1102 + }, + { + "epoch": 0.4416, + "grad_norm": 5.842023849487305, + "learning_rate": 1.620052932661632e-05, + "loss": 0.2565, + "step": 1104 + }, + { + "epoch": 0.4424, + "grad_norm": 3.1433494091033936, + "learning_rate": 1.6222414169356056e-05, + "loss": 0.5082, + "step": 1106 + }, + { + "epoch": 0.4432, + "grad_norm": 2.6837446689605713, + "learning_rate": 1.6244250488460146e-05, + "loss": 0.4007, + "step": 1108 + }, + { + "epoch": 0.444, + "grad_norm": 3.3919453620910645, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.4995, + "step": 1110 + }, + { + "epoch": 0.4448, + "grad_norm": 2.611570358276367, + "learning_rate": 1.6287776875005127e-05, + "loss": 0.3224, + "step": 1112 + }, + { + "epoch": 0.4456, + "grad_norm": 4.856591701507568, + "learning_rate": 1.6309466603018497e-05, + "loss": 0.4728, + "step": 1114 + }, + { + "epoch": 0.4464, + "grad_norm": 13.449359893798828, + "learning_rate": 1.6331107128543856e-05, + "loss": 0.5741, + "step": 1116 + }, + { + "epoch": 0.4472, + "grad_norm": 2.368253707885742, + "learning_rate": 1.635269828282404e-05, + "loss": 0.561, + "step": 1118 + }, + { + "epoch": 0.448, + "grad_norm": 2.5609967708587646, + "learning_rate": 1.6374239897486905e-05, + "loss": 0.4176, + "step": 1120 + }, + { + "epoch": 0.4488, + "grad_norm": 3.7215967178344727, + "learning_rate": 1.6395731804546575e-05, + "loss": 0.3077, + "step": 1122 + }, + { + "epoch": 0.4496, + "grad_norm": 7.401405334472656, + "learning_rate": 1.6417173836404878e-05, + "loss": 0.3221, + "step": 1124 + }, + { + "epoch": 0.4504, + "grad_norm": 2.0310912132263184, + "learning_rate": 1.643856582585253e-05, + "loss": 0.3205, + "step": 1126 + }, + { + "epoch": 0.4512, + "grad_norm": 9.05074405670166, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.3382, + "step": 1128 + }, + { + "epoch": 0.452, + "grad_norm": 11.440814018249512, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.4869, + "step": 1130 + }, + { + "epoch": 0.4528, + "grad_norm": 2.9795913696289062, + "learning_rate": 1.650243987350029e-05, + "loss": 0.6705, + "step": 1132 + }, + { + "epoch": 0.4536, + "grad_norm": 4.947592258453369, + "learning_rate": 1.652363002903693e-05, + "loss": 0.2737, + "step": 1134 + }, + { + "epoch": 0.4544, + "grad_norm": 6.063323974609375, + "learning_rate": 1.6544769311996146e-05, + "loss": 0.3493, + "step": 1136 + }, + { + "epoch": 0.4552, + "grad_norm": 8.340121269226074, + "learning_rate": 1.656585755752956e-05, + "loss": 0.6582, + "step": 1138 + }, + { + "epoch": 0.456, + "grad_norm": 11.626544952392578, + "learning_rate": 1.65868946011868e-05, + "loss": 0.6541, + "step": 1140 + }, + { + "epoch": 0.4568, + "grad_norm": 4.603865623474121, + "learning_rate": 1.660788027891677e-05, + "loss": 0.6873, + "step": 1142 + }, + { + "epoch": 0.4576, + "grad_norm": 7.3429155349731445, + "learning_rate": 1.6628814427068944e-05, + "loss": 0.6526, + "step": 1144 + }, + { + "epoch": 0.4584, + "grad_norm": 3.518334150314331, + "learning_rate": 1.6649696882394625e-05, + "loss": 0.3181, + "step": 1146 + }, + { + "epoch": 0.4592, + "grad_norm": 5.010440349578857, + "learning_rate": 1.667052748204825e-05, + "loss": 0.4269, + "step": 1148 + }, + { + "epoch": 0.46, + "grad_norm": 4.749119758605957, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.3674, + "step": 1150 + }, + { + "epoch": 0.4608, + "grad_norm": 7.220366954803467, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.2894, + "step": 1152 + }, + { + "epoch": 0.4616, + "grad_norm": 2.52457857131958, + "learning_rate": 1.6732706524594138e-05, + "loss": 0.472, + "step": 1154 + }, + { + "epoch": 0.4624, + "grad_norm": 1.485608458518982, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.1831, + "step": 1156 + }, + { + "epoch": 0.4632, + "grad_norm": 4.405038356781006, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.5844, + "step": 1158 + }, + { + "epoch": 0.464, + "grad_norm": 8.147605895996094, + "learning_rate": 1.679441304261516e-05, + "loss": 0.48, + "step": 1160 + }, + { + "epoch": 0.4648, + "grad_norm": 4.937817573547363, + "learning_rate": 1.681487612701519e-05, + "loss": 0.5161, + "step": 1162 + }, + { + "epoch": 0.4656, + "grad_norm": 5.370089530944824, + "learning_rate": 1.683528606764222e-05, + "loss": 0.5149, + "step": 1164 + }, + { + "epoch": 0.4664, + "grad_norm": 8.323260307312012, + "learning_rate": 1.6855642705335428e-05, + "loss": 0.8392, + "step": 1166 + }, + { + "epoch": 0.4672, + "grad_norm": 2.0059239864349365, + "learning_rate": 1.687594588134968e-05, + "loss": 0.3356, + "step": 1168 + }, + { + "epoch": 0.468, + "grad_norm": 4.255488872528076, + "learning_rate": 1.68961954373567e-05, + "loss": 0.4183, + "step": 1170 + }, + { + "epoch": 0.4688, + "grad_norm": 7.48425817489624, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.5642, + "step": 1172 + }, + { + "epoch": 0.4696, + "grad_norm": 6.062568664550781, + "learning_rate": 1.693653305812805e-05, + "loss": 0.4021, + "step": 1174 + }, + { + "epoch": 0.4704, + "grad_norm": 6.629954814910889, + "learning_rate": 1.6956620808331505e-05, + "loss": 2.9714, + "step": 1176 + }, + { + "epoch": 0.4712, + "grad_norm": 4.56139612197876, + "learning_rate": 1.697665430940846e-05, + "loss": 0.4547, + "step": 1178 + }, + { + "epoch": 0.472, + "grad_norm": 6.164916038513184, + "learning_rate": 1.699663340513365e-05, + "loss": 0.7382, + "step": 1180 + }, + { + "epoch": 0.4728, + "grad_norm": 4.28443717956543, + "learning_rate": 1.7016557939706068e-05, + "loss": 0.5405, + "step": 1182 + }, + { + "epoch": 0.4736, + "grad_norm": 9.564210891723633, + "learning_rate": 1.7036427757750198e-05, + "loss": 0.641, + "step": 1184 + }, + { + "epoch": 0.4744, + "grad_norm": 4.702108860015869, + "learning_rate": 1.7056242704317212e-05, + "loss": 0.4587, + "step": 1186 + }, + { + "epoch": 0.4752, + "grad_norm": 6.514930248260498, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.4304, + "step": 1188 + }, + { + "epoch": 0.476, + "grad_norm": 7.816126823425293, + "learning_rate": 1.709570736536521e-05, + "loss": 0.5538, + "step": 1190 + }, + { + "epoch": 0.4768, + "grad_norm": 4.538508415222168, + "learning_rate": 1.7115356772092844e-05, + "loss": 0.3398, + "step": 1192 + }, + { + "epoch": 0.4776, + "grad_norm": 7.519460678100586, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.5039, + "step": 1194 + }, + { + "epoch": 0.4784, + "grad_norm": 3.8620688915252686, + "learning_rate": 1.7154488971806518e-05, + "loss": 0.4347, + "step": 1196 + }, + { + "epoch": 0.4792, + "grad_norm": 2.2250876426696777, + "learning_rate": 1.7173971459631783e-05, + "loss": 0.4299, + "step": 1198 + }, + { + "epoch": 0.48, + "grad_norm": 1.6023005247116089, + "learning_rate": 1.7193398003386507e-05, + "loss": 0.3742, + "step": 1200 + }, + { + "epoch": 0.4808, + "grad_norm": 8.703466415405273, + "learning_rate": 1.7212768451578602e-05, + "loss": 0.5665, + "step": 1202 + }, + { + "epoch": 0.4816, + "grad_norm": 2.355034351348877, + "learning_rate": 1.7232082653153416e-05, + "loss": 0.3028, + "step": 1204 + }, + { + "epoch": 0.4824, + "grad_norm": 2.3913300037384033, + "learning_rate": 1.7251340457494937e-05, + "loss": 0.2936, + "step": 1206 + }, + { + "epoch": 0.4832, + "grad_norm": 2.8259389400482178, + "learning_rate": 1.7270541714426923e-05, + "loss": 0.2899, + "step": 1208 + }, + { + "epoch": 0.484, + "grad_norm": 11.542742729187012, + "learning_rate": 1.7289686274214106e-05, + "loss": 1.6536, + "step": 1210 + }, + { + "epoch": 0.4848, + "grad_norm": 8.168238639831543, + "learning_rate": 1.7308773987563393e-05, + "loss": 0.7052, + "step": 1212 + }, + { + "epoch": 0.4856, + "grad_norm": 6.536013603210449, + "learning_rate": 1.732780470562496e-05, + "loss": 0.7356, + "step": 1214 + }, + { + "epoch": 0.4864, + "grad_norm": 6.662909984588623, + "learning_rate": 1.7346778279993413e-05, + "loss": 0.7118, + "step": 1216 + }, + { + "epoch": 0.4872, + "grad_norm": 5.744428634643555, + "learning_rate": 1.736569456270903e-05, + "loss": 0.4607, + "step": 1218 + }, + { + "epoch": 0.488, + "grad_norm": 3.2977943420410156, + "learning_rate": 1.7384553406258836e-05, + "loss": 0.6452, + "step": 1220 + }, + { + "epoch": 0.4888, + "grad_norm": 7.264194011688232, + "learning_rate": 1.740335466357778e-05, + "loss": 0.4696, + "step": 1222 + }, + { + "epoch": 0.4896, + "grad_norm": 9.938541412353516, + "learning_rate": 1.7422098188049888e-05, + "loss": 0.6597, + "step": 1224 + }, + { + "epoch": 0.4904, + "grad_norm": 4.108887195587158, + "learning_rate": 1.7440783833509373e-05, + "loss": 0.2282, + "step": 1226 + }, + { + "epoch": 0.4912, + "grad_norm": 3.81720232963562, + "learning_rate": 1.7459411454241816e-05, + "loss": 0.3304, + "step": 1228 + }, + { + "epoch": 0.492, + "grad_norm": 8.632997512817383, + "learning_rate": 1.747798090498531e-05, + "loss": 0.4577, + "step": 1230 + }, + { + "epoch": 0.4928, + "grad_norm": 4.479432106018066, + "learning_rate": 1.749649204093154e-05, + "loss": 0.5919, + "step": 1232 + }, + { + "epoch": 0.4936, + "grad_norm": 7.987699031829834, + "learning_rate": 1.7514944717726962e-05, + "loss": 1.0532, + "step": 1234 + }, + { + "epoch": 0.4944, + "grad_norm": 3.5861763954162598, + "learning_rate": 1.753333879147387e-05, + "loss": 0.3997, + "step": 1236 + }, + { + "epoch": 0.4952, + "grad_norm": 8.19918155670166, + "learning_rate": 1.755167411873159e-05, + "loss": 0.5248, + "step": 1238 + }, + { + "epoch": 0.496, + "grad_norm": 0.6573111414909363, + "learning_rate": 1.7569950556517563e-05, + "loss": 0.1645, + "step": 1240 + }, + { + "epoch": 0.4968, + "grad_norm": 3.7899794578552246, + "learning_rate": 1.758816796230845e-05, + "loss": 0.6108, + "step": 1242 + }, + { + "epoch": 0.4976, + "grad_norm": 6.53424596786499, + "learning_rate": 1.7606326194041278e-05, + "loss": 0.5721, + "step": 1244 + }, + { + "epoch": 0.4984, + "grad_norm": 6.82229471206665, + "learning_rate": 1.762442511011447e-05, + "loss": 0.6002, + "step": 1246 + }, + { + "epoch": 0.4992, + "grad_norm": 10.31006908416748, + "learning_rate": 1.7642464569389083e-05, + "loss": 0.7883, + "step": 1248 + }, + { + "epoch": 0.5, + "grad_norm": 4.102840900421143, + "learning_rate": 1.766044443118977e-05, + "loss": 0.3058, + "step": 1250 + }, + { + "epoch": 0.5008, + "grad_norm": 13.063787460327148, + "learning_rate": 1.767836455530598e-05, + "loss": 0.7589, + "step": 1252 + }, + { + "epoch": 0.5016, + "grad_norm": 12.737040519714355, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.879, + "step": 1254 + }, + { + "epoch": 0.5024, + "grad_norm": 5.0701751708984375, + "learning_rate": 1.77140250319729e-05, + "loss": 0.6462, + "step": 1256 + }, + { + "epoch": 0.5032, + "grad_norm": 6.491806507110596, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.4422, + "step": 1258 + }, + { + "epoch": 0.504, + "grad_norm": 7.690762042999268, + "learning_rate": 1.7749444887041793e-05, + "loss": 0.6886, + "step": 1260 + }, + { + "epoch": 0.5048, + "grad_norm": 2.4480361938476562, + "learning_rate": 1.776706423591959e-05, + "loss": 0.3819, + "step": 1262 + }, + { + "epoch": 0.5056, + "grad_norm": 6.522246837615967, + "learning_rate": 1.778462301567023e-05, + "loss": 1.1178, + "step": 1264 + }, + { + "epoch": 0.5064, + "grad_norm": 7.040022373199463, + "learning_rate": 1.7802121089366832e-05, + "loss": 0.6745, + "step": 1266 + }, + { + "epoch": 0.5072, + "grad_norm": 5.24992036819458, + "learning_rate": 1.7819558320555895e-05, + "loss": 0.4028, + "step": 1268 + }, + { + "epoch": 0.508, + "grad_norm": 2.2948031425476074, + "learning_rate": 1.7836934573258392e-05, + "loss": 0.3027, + "step": 1270 + }, + { + "epoch": 0.5088, + "grad_norm": 2.4775025844573975, + "learning_rate": 1.785424971197082e-05, + "loss": 0.3507, + "step": 1272 + }, + { + "epoch": 0.5096, + "grad_norm": 3.7970142364501953, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.591, + "step": 1274 + }, + { + "epoch": 0.5104, + "grad_norm": 8.241109848022461, + "learning_rate": 1.7888696107795343e-05, + "loss": 0.6565, + "step": 1276 + }, + { + "epoch": 0.5112, + "grad_norm": 8.135817527770996, + "learning_rate": 1.790582709628753e-05, + "loss": 0.6159, + "step": 1278 + }, + { + "epoch": 0.512, + "grad_norm": 4.600522518157959, + "learning_rate": 1.7922896433551903e-05, + "loss": 0.3696, + "step": 1280 + }, + { + "epoch": 0.5128, + "grad_norm": 4.542240142822266, + "learning_rate": 1.793990398647835e-05, + "loss": 0.4028, + "step": 1282 + }, + { + "epoch": 0.5136, + "grad_norm": 2.846844434738159, + "learning_rate": 1.795684962243855e-05, + "loss": 0.3901, + "step": 1284 + }, + { + "epoch": 0.5144, + "grad_norm": 2.5168137550354004, + "learning_rate": 1.7973733209287032e-05, + "loss": 0.3155, + "step": 1286 + }, + { + "epoch": 0.5152, + "grad_norm": 7.11170768737793, + "learning_rate": 1.7990554615362193e-05, + "loss": 0.4178, + "step": 1288 + }, + { + "epoch": 0.516, + "grad_norm": 2.3307881355285645, + "learning_rate": 1.800731370948734e-05, + "loss": 0.4979, + "step": 1290 + }, + { + "epoch": 0.5168, + "grad_norm": 14.980961799621582, + "learning_rate": 1.802401036097167e-05, + "loss": 0.6443, + "step": 1292 + }, + { + "epoch": 0.5176, + "grad_norm": 2.6857290267944336, + "learning_rate": 1.804064443961135e-05, + "loss": 0.4083, + "step": 1294 + }, + { + "epoch": 0.5184, + "grad_norm": 5.365575790405273, + "learning_rate": 1.8057215815690494e-05, + "loss": 0.3637, + "step": 1296 + }, + { + "epoch": 0.5192, + "grad_norm": 2.6065447330474854, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.1999, + "step": 1298 + }, + { + "epoch": 0.52, + "grad_norm": 4.7862114906311035, + "learning_rate": 1.809016994374947e-05, + "loss": 0.2711, + "step": 1300 + }, + { + "epoch": 0.5208, + "grad_norm": 3.6769278049468994, + "learning_rate": 1.81065524387464e-05, + "loss": 0.3093, + "step": 1302 + }, + { + "epoch": 0.5216, + "grad_norm": 20.27410888671875, + "learning_rate": 1.8122871717218968e-05, + "loss": 0.5956, + "step": 1304 + }, + { + "epoch": 0.5224, + "grad_norm": 15.000691413879395, + "learning_rate": 1.8139127651906176e-05, + "loss": 0.7266, + "step": 1306 + }, + { + "epoch": 0.5232, + "grad_norm": 8.50636100769043, + "learning_rate": 1.8155320116040976e-05, + "loss": 1.346, + "step": 1308 + }, + { + "epoch": 0.524, + "grad_norm": 2.9468233585357666, + "learning_rate": 1.817144898335129e-05, + "loss": 0.2572, + "step": 1310 + }, + { + "epoch": 0.5248, + "grad_norm": 12.458033561706543, + "learning_rate": 1.818751412806095e-05, + "loss": 0.5091, + "step": 1312 + }, + { + "epoch": 0.5256, + "grad_norm": 4.08944034576416, + "learning_rate": 1.8203515424890738e-05, + "loss": 0.3258, + "step": 1314 + }, + { + "epoch": 0.5264, + "grad_norm": 6.3507819175720215, + "learning_rate": 1.8219452749059322e-05, + "loss": 0.6816, + "step": 1316 + }, + { + "epoch": 0.5272, + "grad_norm": 3.709740161895752, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.3486, + "step": 1318 + }, + { + "epoch": 0.528, + "grad_norm": 2.184805154800415, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.2956, + "step": 1320 + }, + { + "epoch": 0.5288, + "grad_norm": 3.21848726272583, + "learning_rate": 1.826687964527355e-05, + "loss": 0.4955, + "step": 1322 + }, + { + "epoch": 0.5296, + "grad_norm": 11.279875755310059, + "learning_rate": 1.828255984097604e-05, + "loss": 0.5082, + "step": 1324 + }, + { + "epoch": 0.5304, + "grad_norm": 4.378537178039551, + "learning_rate": 1.8298175447613093e-05, + "loss": 0.3311, + "step": 1326 + }, + { + "epoch": 0.5312, + "grad_norm": 5.776856422424316, + "learning_rate": 1.8313726343411092e-05, + "loss": 0.3442, + "step": 1328 + }, + { + "epoch": 0.532, + "grad_norm": 2.2048215866088867, + "learning_rate": 1.8329212407101e-05, + "loss": 0.3682, + "step": 1330 + }, + { + "epoch": 0.5328, + "grad_norm": 2.4243111610412598, + "learning_rate": 1.8344633517919394e-05, + "loss": 0.4139, + "step": 1332 + }, + { + "epoch": 0.5336, + "grad_norm": 8.74450397491455, + "learning_rate": 1.8359989555609344e-05, + "loss": 0.4095, + "step": 1334 + }, + { + "epoch": 0.5344, + "grad_norm": 8.038501739501953, + "learning_rate": 1.8375280400421407e-05, + "loss": 0.7819, + "step": 1336 + }, + { + "epoch": 0.5352, + "grad_norm": 7.184566974639893, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.3759, + "step": 1338 + }, + { + "epoch": 0.536, + "grad_norm": 7.550333499908447, + "learning_rate": 1.8405666034956842e-05, + "loss": 1.3611, + "step": 1340 + }, + { + "epoch": 0.5368, + "grad_norm": 9.062027931213379, + "learning_rate": 1.842076058772692e-05, + "loss": 0.6528, + "step": 1342 + }, + { + "epoch": 0.5376, + "grad_norm": 4.2818708419799805, + "learning_rate": 1.8435789473714384e-05, + "loss": 0.9378, + "step": 1344 + }, + { + "epoch": 0.5384, + "grad_norm": 7.2454938888549805, + "learning_rate": 1.8450752575720964e-05, + "loss": 0.2129, + "step": 1346 + }, + { + "epoch": 0.5392, + "grad_norm": 2.8972926139831543, + "learning_rate": 1.8465649777061384e-05, + "loss": 0.4785, + "step": 1348 + }, + { + "epoch": 0.54, + "grad_norm": 7.637453556060791, + "learning_rate": 1.8480480961564266e-05, + "loss": 0.7323, + "step": 1350 + }, + { + "epoch": 0.5408, + "grad_norm": 3.591602325439453, + "learning_rate": 1.8495246013573047e-05, + "loss": 0.4019, + "step": 1352 + }, + { + "epoch": 0.5416, + "grad_norm": 8.564355850219727, + "learning_rate": 1.850994481794691e-05, + "loss": 0.5674, + "step": 1354 + }, + { + "epoch": 0.5424, + "grad_norm": 11.854432106018066, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.5302, + "step": 1356 + }, + { + "epoch": 0.5432, + "grad_norm": 4.832673072814941, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.397, + "step": 1358 + }, + { + "epoch": 0.544, + "grad_norm": 6.154983043670654, + "learning_rate": 1.8553642601605066e-05, + "loss": 0.8572, + "step": 1360 + }, + { + "epoch": 0.5448, + "grad_norm": 4.938594341278076, + "learning_rate": 1.856807527437643e-05, + "loss": 0.6868, + "step": 1362 + }, + { + "epoch": 0.5456, + "grad_norm": 5.232631206512451, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.5783, + "step": 1364 + }, + { + "epoch": 0.5464, + "grad_norm": 5.804172039031982, + "learning_rate": 1.859674006117491e-05, + "loss": 0.3358, + "step": 1366 + }, + { + "epoch": 0.5472, + "grad_norm": 2.756375789642334, + "learning_rate": 1.8610971951668268e-05, + "loss": 0.396, + "step": 1368 + }, + { + "epoch": 0.548, + "grad_norm": 1.8918120861053467, + "learning_rate": 1.862513669207257e-05, + "loss": 0.2675, + "step": 1370 + }, + { + "epoch": 0.5488, + "grad_norm": 4.279423236846924, + "learning_rate": 1.8639234171928348e-05, + "loss": 0.2105, + "step": 1372 + }, + { + "epoch": 0.5496, + "grad_norm": 14.454187393188477, + "learning_rate": 1.8653264281300612e-05, + "loss": 0.7139, + "step": 1374 + }, + { + "epoch": 0.5504, + "grad_norm": 8.212747573852539, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.4805, + "step": 1376 + }, + { + "epoch": 0.5512, + "grad_norm": 1.839599847793579, + "learning_rate": 1.8681121951482393e-05, + "loss": 0.3243, + "step": 1378 + }, + { + "epoch": 0.552, + "grad_norm": 8.92973518371582, + "learning_rate": 1.869494929505219e-05, + "loss": 1.4886, + "step": 1380 + }, + { + "epoch": 0.5528, + "grad_norm": 9.111851692199707, + "learning_rate": 1.870870883366075e-05, + "loss": 1.0078, + "step": 1382 + }, + { + "epoch": 0.5536, + "grad_norm": 2.5603065490722656, + "learning_rate": 1.8722400460008434e-05, + "loss": 0.2356, + "step": 1384 + }, + { + "epoch": 0.5544, + "grad_norm": 2.080918312072754, + "learning_rate": 1.8736024067325195e-05, + "loss": 0.3798, + "step": 1386 + }, + { + "epoch": 0.5552, + "grad_norm": 2.699968099594116, + "learning_rate": 1.8749579549371373e-05, + "loss": 0.4191, + "step": 1388 + }, + { + "epoch": 0.556, + "grad_norm": 5.318063259124756, + "learning_rate": 1.876306680043863e-05, + "loss": 0.3578, + "step": 1390 + }, + { + "epoch": 0.5568, + "grad_norm": 3.0889317989349365, + "learning_rate": 1.8776485715350665e-05, + "loss": 0.3083, + "step": 1392 + }, + { + "epoch": 0.5576, + "grad_norm": 9.75284194946289, + "learning_rate": 1.878983618946409e-05, + "loss": 0.5724, + "step": 1394 + }, + { + "epoch": 0.5584, + "grad_norm": 2.678609848022461, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.287, + "step": 1396 + }, + { + "epoch": 0.5592, + "grad_norm": 1.7660149335861206, + "learning_rate": 1.881633139939087e-05, + "loss": 0.2172, + "step": 1398 + }, + { + "epoch": 0.56, + "grad_norm": 2.460742950439453, + "learning_rate": 1.882947592858927e-05, + "loss": 0.3789, + "step": 1400 + }, + { + "epoch": 0.5608, + "grad_norm": 2.597334146499634, + "learning_rate": 1.884255160376072e-05, + "loss": 0.5009, + "step": 1402 + }, + { + "epoch": 0.5616, + "grad_norm": 2.7347497940063477, + "learning_rate": 1.885555832293849e-05, + "loss": 0.3827, + "step": 1404 + }, + { + "epoch": 0.5624, + "grad_norm": 11.216575622558594, + "learning_rate": 1.886849598469356e-05, + "loss": 1.3072, + "step": 1406 + }, + { + "epoch": 0.5632, + "grad_norm": 1.3659820556640625, + "learning_rate": 1.888136448813544e-05, + "loss": 0.2031, + "step": 1408 + }, + { + "epoch": 0.564, + "grad_norm": 6.435552597045898, + "learning_rate": 1.8894163732912972e-05, + "loss": 0.3993, + "step": 1410 + }, + { + "epoch": 0.5648, + "grad_norm": 6.237542629241943, + "learning_rate": 1.890689361921506e-05, + "loss": 0.3356, + "step": 1412 + }, + { + "epoch": 0.5656, + "grad_norm": 7.920870780944824, + "learning_rate": 1.891955404777151e-05, + "loss": 0.4875, + "step": 1414 + }, + { + "epoch": 0.5664, + "grad_norm": 8.32780933380127, + "learning_rate": 1.893214491985374e-05, + "loss": 0.4395, + "step": 1416 + }, + { + "epoch": 0.5672, + "grad_norm": 5.75148868560791, + "learning_rate": 1.89446661372756e-05, + "loss": 0.384, + "step": 1418 + }, + { + "epoch": 0.568, + "grad_norm": 6.593601226806641, + "learning_rate": 1.895711760239413e-05, + "loss": 0.4153, + "step": 1420 + }, + { + "epoch": 0.5688, + "grad_norm": 5.3204474449157715, + "learning_rate": 1.89694992181103e-05, + "loss": 0.5154, + "step": 1422 + }, + { + "epoch": 0.5696, + "grad_norm": 7.025198459625244, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.3926, + "step": 1424 + }, + { + "epoch": 0.5704, + "grad_norm": 3.4113192558288574, + "learning_rate": 1.8994052515663708e-05, + "loss": 0.4192, + "step": 1426 + }, + { + "epoch": 0.5712, + "grad_norm": 2.7692418098449707, + "learning_rate": 1.90062240060294e-05, + "loss": 0.4204, + "step": 1428 + }, + { + "epoch": 0.572, + "grad_norm": 7.360057830810547, + "learning_rate": 1.9018325264051136e-05, + "loss": 0.3961, + "step": 1430 + }, + { + "epoch": 0.5728, + "grad_norm": 13.831832885742188, + "learning_rate": 1.9030356195360868e-05, + "loss": 0.7024, + "step": 1432 + }, + { + "epoch": 0.5736, + "grad_norm": 3.6427500247955322, + "learning_rate": 1.904231670613899e-05, + "loss": 0.2966, + "step": 1434 + }, + { + "epoch": 0.5744, + "grad_norm": 6.612185478210449, + "learning_rate": 1.905420670311502e-05, + "loss": 0.6514, + "step": 1436 + }, + { + "epoch": 0.5752, + "grad_norm": 7.35181188583374, + "learning_rate": 1.906602609356838e-05, + "loss": 0.6473, + "step": 1438 + }, + { + "epoch": 0.576, + "grad_norm": 7.588043689727783, + "learning_rate": 1.9077774785329078e-05, + "loss": 0.5851, + "step": 1440 + }, + { + "epoch": 0.5768, + "grad_norm": 5.37501859664917, + "learning_rate": 1.9089452686778487e-05, + "loss": 0.4444, + "step": 1442 + }, + { + "epoch": 0.5776, + "grad_norm": 2.9378836154937744, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.5089, + "step": 1444 + }, + { + "epoch": 0.5784, + "grad_norm": 7.634711742401123, + "learning_rate": 1.911259575502962e-05, + "loss": 0.5481, + "step": 1446 + }, + { + "epoch": 0.5792, + "grad_norm": 3.4533517360687256, + "learning_rate": 1.912406074135706e-05, + "loss": 0.3859, + "step": 1448 + }, + { + "epoch": 0.58, + "grad_norm": 7.150989055633545, + "learning_rate": 1.9135454576426006e-05, + "loss": 0.4401, + "step": 1450 + }, + { + "epoch": 0.5808, + "grad_norm": 15.217089653015137, + "learning_rate": 1.9146777171385053e-05, + "loss": 0.7959, + "step": 1452 + }, + { + "epoch": 0.5816, + "grad_norm": 7.8969197273254395, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.3802, + "step": 1454 + }, + { + "epoch": 0.5824, + "grad_norm": 5.025742530822754, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.3686, + "step": 1456 + }, + { + "epoch": 0.5832, + "grad_norm": 12.718505859375, + "learning_rate": 1.9180316635425876e-05, + "loss": 0.3675, + "step": 1458 + }, + { + "epoch": 0.584, + "grad_norm": 5.548830032348633, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.8477, + "step": 1460 + }, + { + "epoch": 0.5848, + "grad_norm": 5.419297695159912, + "learning_rate": 1.9202318473658703e-05, + "loss": 0.521, + "step": 1462 + }, + { + "epoch": 0.5856, + "grad_norm": 2.6505277156829834, + "learning_rate": 1.9213211793237052e-05, + "loss": 0.4772, + "step": 1464 + }, + { + "epoch": 0.5864, + "grad_norm": 5.732573509216309, + "learning_rate": 1.92240332663391e-05, + "loss": 0.619, + "step": 1466 + }, + { + "epoch": 0.5872, + "grad_norm": 7.656153678894043, + "learning_rate": 1.923478280857682e-05, + "loss": 0.6467, + "step": 1468 + }, + { + "epoch": 0.588, + "grad_norm": 12.494869232177734, + "learning_rate": 1.924546033612313e-05, + "loss": 1.3374, + "step": 1470 + }, + { + "epoch": 0.5888, + "grad_norm": 2.8723483085632324, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.4653, + "step": 1472 + }, + { + "epoch": 0.5896, + "grad_norm": 2.027968168258667, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.4778, + "step": 1474 + }, + { + "epoch": 0.5904, + "grad_norm": 3.951679229736328, + "learning_rate": 1.927706000077034e-05, + "loss": 0.2609, + "step": 1476 + }, + { + "epoch": 0.5912, + "grad_norm": 9.217889785766602, + "learning_rate": 1.9287448642521507e-05, + "loss": 0.893, + "step": 1478 + }, + { + "epoch": 0.592, + "grad_norm": 11.40930461883545, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.7237, + "step": 1480 + }, + { + "epoch": 0.5928, + "grad_norm": 2.650052070617676, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.4074, + "step": 1482 + }, + { + "epoch": 0.5936, + "grad_norm": 4.919514179229736, + "learning_rate": 1.9318179694207722e-05, + "loss": 1.0755, + "step": 1484 + }, + { + "epoch": 0.5944, + "grad_norm": 5.501877784729004, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.36, + "step": 1486 + }, + { + "epoch": 0.5952, + "grad_norm": 3.0425186157226562, + "learning_rate": 1.9338303869951266e-05, + "loss": 0.3789, + "step": 1488 + }, + { + "epoch": 0.596, + "grad_norm": 0.8871821165084839, + "learning_rate": 1.934825676396015e-05, + "loss": 0.131, + "step": 1490 + }, + { + "epoch": 0.5968, + "grad_norm": 6.4956464767456055, + "learning_rate": 1.935813675838491e-05, + "loss": 0.5635, + "step": 1492 + }, + { + "epoch": 0.5976, + "grad_norm": 8.6665620803833, + "learning_rate": 1.9367943776179375e-05, + "loss": 0.5478, + "step": 1494 + }, + { + "epoch": 0.5984, + "grad_norm": 13.301843643188477, + "learning_rate": 1.9377677740866457e-05, + "loss": 1.3023, + "step": 1496 + }, + { + "epoch": 0.5992, + "grad_norm": 8.54133129119873, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.8299, + "step": 1498 + }, + { + "epoch": 0.6, + "grad_norm": 9.637697219848633, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.592, + "step": 1500 + }, + { + "epoch": 0.6008, + "grad_norm": 6.597886562347412, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.3459, + "step": 1502 + }, + { + "epoch": 0.6016, + "grad_norm": 9.389493942260742, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.8848, + "step": 1504 + }, + { + "epoch": 0.6024, + "grad_norm": 4.483582973480225, + "learning_rate": 1.942524913090354e-05, + "loss": 0.2175, + "step": 1506 + }, + { + "epoch": 0.6032, + "grad_norm": 2.89554762840271, + "learning_rate": 1.9434543202870723e-05, + "loss": 2.5025, + "step": 1508 + }, + { + "epoch": 0.604, + "grad_norm": 3.055518388748169, + "learning_rate": 1.9443763702374815e-05, + "loss": 0.5473, + "step": 1510 + }, + { + "epoch": 0.6048, + "grad_norm": 1.5298120975494385, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.3529, + "step": 1512 + }, + { + "epoch": 0.6056, + "grad_norm": 2.2401487827301025, + "learning_rate": 1.9461983696954756e-05, + "loss": 0.3263, + "step": 1514 + }, + { + "epoch": 0.6064, + "grad_norm": 6.360352993011475, + "learning_rate": 1.947098304994744e-05, + "loss": 0.5558, + "step": 1516 + }, + { + "epoch": 0.6072, + "grad_norm": 2.6135237216949463, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.4786, + "step": 1518 + }, + { + "epoch": 0.608, + "grad_norm": 3.436858654022217, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.4016, + "step": 1520 + }, + { + "epoch": 0.6088, + "grad_norm": 5.723163604736328, + "learning_rate": 1.949753769132067e-05, + "loss": 0.5532, + "step": 1522 + }, + { + "epoch": 0.6096, + "grad_norm": 5.951367378234863, + "learning_rate": 1.95062412024896e-05, + "loss": 0.6564, + "step": 1524 + }, + { + "epoch": 0.6104, + "grad_norm": 6.127574920654297, + "learning_rate": 1.951487058208003e-05, + "loss": 0.5847, + "step": 1526 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4084920287132263, + "learning_rate": 1.952342576279833e-05, + "loss": 0.1559, + "step": 1528 + }, + { + "epoch": 0.612, + "grad_norm": 4.772994518280029, + "learning_rate": 1.953190667792947e-05, + "loss": 0.3836, + "step": 1530 + }, + { + "epoch": 0.6128, + "grad_norm": 3.275306463241577, + "learning_rate": 1.9540313261337578e-05, + "loss": 0.4954, + "step": 1532 + }, + { + "epoch": 0.6136, + "grad_norm": 17.405899047851562, + "learning_rate": 1.954864544746643e-05, + "loss": 0.9868, + "step": 1534 + }, + { + "epoch": 0.6144, + "grad_norm": 3.4976556301116943, + "learning_rate": 1.955690317133996e-05, + "loss": 0.5427, + "step": 1536 + }, + { + "epoch": 0.6152, + "grad_norm": 4.741286277770996, + "learning_rate": 1.956508636856278e-05, + "loss": 0.5814, + "step": 1538 + }, + { + "epoch": 0.616, + "grad_norm": 9.094281196594238, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.7027, + "step": 1540 + }, + { + "epoch": 0.6168, + "grad_norm": 10.391242027282715, + "learning_rate": 1.95812289283811e-05, + "loss": 0.6991, + "step": 1542 + }, + { + "epoch": 0.6176, + "grad_norm": 4.268300533294678, + "learning_rate": 1.958918816509367e-05, + "loss": 0.5425, + "step": 1544 + }, + { + "epoch": 0.6184, + "grad_norm": 2.675271511077881, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.427, + "step": 1546 + }, + { + "epoch": 0.6192, + "grad_norm": 4.287508010864258, + "learning_rate": 1.9604882241787496e-05, + "loss": 0.4181, + "step": 1548 + }, + { + "epoch": 0.62, + "grad_norm": 5.404676914215088, + "learning_rate": 1.9612616959383187e-05, + "loss": 0.4235, + "step": 1550 + }, + { + "epoch": 0.6208, + "grad_norm": 13.730271339416504, + "learning_rate": 1.9620276715860856e-05, + "loss": 0.8335, + "step": 1552 + }, + { + "epoch": 0.6216, + "grad_norm": 4.315786361694336, + "learning_rate": 1.9627861451488187e-05, + "loss": 0.5621, + "step": 1554 + }, + { + "epoch": 0.6224, + "grad_norm": 7.495677947998047, + "learning_rate": 1.963537110711789e-05, + "loss": 0.4497, + "step": 1556 + }, + { + "epoch": 0.6232, + "grad_norm": 3.7830357551574707, + "learning_rate": 1.964280562418815e-05, + "loss": 0.2931, + "step": 1558 + }, + { + "epoch": 0.624, + "grad_norm": 11.285057067871094, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.7102, + "step": 1560 + }, + { + "epoch": 0.6248, + "grad_norm": 4.690367221832275, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.3579, + "step": 1562 + }, + { + "epoch": 0.6256, + "grad_norm": 5.907130718231201, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.5712, + "step": 1564 + }, + { + "epoch": 0.6264, + "grad_norm": 4.572456359863281, + "learning_rate": 1.967179115615633e-05, + "loss": 0.4964, + "step": 1566 + }, + { + "epoch": 0.6272, + "grad_norm": 6.378812313079834, + "learning_rate": 1.967884912252619e-05, + "loss": 0.4332, + "step": 1568 + }, + { + "epoch": 0.628, + "grad_norm": 4.807539463043213, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.3473, + "step": 1570 + }, + { + "epoch": 0.6288, + "grad_norm": 7.2378339767456055, + "learning_rate": 1.969273856798585e-05, + "loss": 0.4192, + "step": 1572 + }, + { + "epoch": 0.6296, + "grad_norm": 8.278740882873535, + "learning_rate": 1.9699569938762972e-05, + "loss": 0.4816, + "step": 1574 + }, + { + "epoch": 0.6304, + "grad_norm": 8.5382719039917, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.6862, + "step": 1576 + }, + { + "epoch": 0.6312, + "grad_norm": 3.933340549468994, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.4656, + "step": 1578 + }, + { + "epoch": 0.632, + "grad_norm": 7.442902565002441, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.4644, + "step": 1580 + }, + { + "epoch": 0.6328, + "grad_norm": 10.707149505615234, + "learning_rate": 1.9726138506049434e-05, + "loss": 0.6171, + "step": 1582 + }, + { + "epoch": 0.6336, + "grad_norm": 2.4153261184692383, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.4276, + "step": 1584 + }, + { + "epoch": 0.6344, + "grad_norm": 2.093482732772827, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.3721, + "step": 1586 + }, + { + "epoch": 0.6352, + "grad_norm": 5.4622344970703125, + "learning_rate": 1.974526872786577e-05, + "loss": 0.4802, + "step": 1588 + }, + { + "epoch": 0.636, + "grad_norm": 5.805969715118408, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.9636, + "step": 1590 + }, + { + "epoch": 0.6368, + "grad_norm": 9.242039680480957, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.8385, + "step": 1592 + }, + { + "epoch": 0.6376, + "grad_norm": 4.233711242675781, + "learning_rate": 1.976371499316945e-05, + "loss": 0.2892, + "step": 1594 + }, + { + "epoch": 0.6384, + "grad_norm": 8.57029914855957, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.5883, + "step": 1596 + }, + { + "epoch": 0.6392, + "grad_norm": 3.897994041442871, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.4078, + "step": 1598 + }, + { + "epoch": 0.64, + "grad_norm": 3.17345929145813, + "learning_rate": 1.9781476007338054e-05, + "loss": 0.3577, + "step": 1600 + }, + { + "epoch": 0.6408, + "grad_norm": 11.469186782836914, + "learning_rate": 1.978724385052766e-05, + "loss": 0.5497, + "step": 1602 + }, + { + "epoch": 0.6416, + "grad_norm": 1.5659518241882324, + "learning_rate": 1.9792935370823673e-05, + "loss": 0.2753, + "step": 1604 + }, + { + "epoch": 0.6424, + "grad_norm": 2.373509168624878, + "learning_rate": 1.979855052384247e-05, + "loss": 0.3695, + "step": 1606 + }, + { + "epoch": 0.6432, + "grad_norm": 2.043501138687134, + "learning_rate": 1.9804089265795956e-05, + "loss": 0.3488, + "step": 1608 + }, + { + "epoch": 0.644, + "grad_norm": 4.598385334014893, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.6336, + "step": 1610 + }, + { + "epoch": 0.6448, + "grad_norm": 2.1152210235595703, + "learning_rate": 1.981493734433433e-05, + "loss": 0.302, + "step": 1612 + }, + { + "epoch": 0.6456, + "grad_norm": 2.2232983112335205, + "learning_rate": 1.982024659632372e-05, + "loss": 0.3294, + "step": 1614 + }, + { + "epoch": 0.6464, + "grad_norm": 5.589328289031982, + "learning_rate": 1.9825479268057472e-05, + "loss": 0.2848, + "step": 1616 + }, + { + "epoch": 0.6472, + "grad_norm": 7.114500522613525, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.7318, + "step": 1618 + }, + { + "epoch": 0.648, + "grad_norm": 5.905538082122803, + "learning_rate": 1.9835714708133858e-05, + "loss": 0.3707, + "step": 1620 + }, + { + "epoch": 0.6488, + "grad_norm": 2.066833734512329, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.5473, + "step": 1622 + }, + { + "epoch": 0.6496, + "grad_norm": 3.5688998699188232, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.3819, + "step": 1624 + }, + { + "epoch": 0.6504, + "grad_norm": 1.7835557460784912, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.309, + "step": 1626 + }, + { + "epoch": 0.6512, + "grad_norm": 2.2958924770355225, + "learning_rate": 1.985526486983063e-05, + "loss": 0.2285, + "step": 1628 + }, + { + "epoch": 0.652, + "grad_norm": 3.196260452270508, + "learning_rate": 1.985996037070505e-05, + "loss": 0.5863, + "step": 1630 + }, + { + "epoch": 0.6528, + "grad_norm": 8.522262573242188, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.8679, + "step": 1632 + }, + { + "epoch": 0.6536, + "grad_norm": 3.0343809127807617, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.5473, + "step": 1634 + }, + { + "epoch": 0.6544, + "grad_norm": 2.9875917434692383, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.402, + "step": 1636 + }, + { + "epoch": 0.6552, + "grad_norm": 10.342791557312012, + "learning_rate": 1.987797311751759e-05, + "loss": 0.4833, + "step": 1638 + }, + { + "epoch": 0.656, + "grad_norm": 6.520310401916504, + "learning_rate": 1.9882283814465528e-05, + "loss": 0.3338, + "step": 1640 + }, + { + "epoch": 0.6568, + "grad_norm": 2.931133508682251, + "learning_rate": 1.988651744737914e-05, + "loss": 0.2669, + "step": 1642 + }, + { + "epoch": 0.6576, + "grad_norm": 11.377016067504883, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.6862, + "step": 1644 + }, + { + "epoch": 0.6584, + "grad_norm": 6.504305362701416, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.329, + "step": 1646 + }, + { + "epoch": 0.6592, + "grad_norm": 2.494025230407715, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.3779, + "step": 1648 + }, + { + "epoch": 0.66, + "grad_norm": 9.32198429107666, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.5162, + "step": 1650 + }, + { + "epoch": 0.6608, + "grad_norm": 4.862305641174316, + "learning_rate": 1.9906528516965014e-05, + "loss": 0.6947, + "step": 1652 + }, + { + "epoch": 0.6616, + "grad_norm": 12.738851547241211, + "learning_rate": 1.9910299093414926e-05, + "loss": 0.4528, + "step": 1654 + }, + { + "epoch": 0.6624, + "grad_norm": 2.390115737915039, + "learning_rate": 1.9913992387361744e-05, + "loss": 0.4201, + "step": 1656 + }, + { + "epoch": 0.6632, + "grad_norm": 7.348924160003662, + "learning_rate": 1.9917608370004414e-05, + "loss": 0.4181, + "step": 1658 + }, + { + "epoch": 0.664, + "grad_norm": 3.2150723934173584, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.3665, + "step": 1660 + }, + { + "epoch": 0.6648, + "grad_norm": 4.1383819580078125, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.4743, + "step": 1662 + }, + { + "epoch": 0.6656, + "grad_norm": 6.19020938873291, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.6406, + "step": 1664 + }, + { + "epoch": 0.6664, + "grad_norm": 2.8534698486328125, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.2139, + "step": 1666 + }, + { + "epoch": 0.6672, + "grad_norm": 6.954126834869385, + "learning_rate": 1.9934527647833276e-05, + "loss": 0.4103, + "step": 1668 + }, + { + "epoch": 0.668, + "grad_norm": 1.7877683639526367, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.4074, + "step": 1670 + }, + { + "epoch": 0.6688, + "grad_norm": 3.242607593536377, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.3618, + "step": 1672 + }, + { + "epoch": 0.6696, + "grad_norm": 3.0980658531188965, + "learning_rate": 1.994374976712348e-05, + "loss": 0.4763, + "step": 1674 + }, + { + "epoch": 0.6704, + "grad_norm": 4.143883228302002, + "learning_rate": 1.994666875152874e-05, + "loss": 0.5616, + "step": 1676 + }, + { + "epoch": 0.6712, + "grad_norm": 2.5318260192871094, + "learning_rate": 1.9949510169813003e-05, + "loss": 0.2323, + "step": 1678 + }, + { + "epoch": 0.672, + "grad_norm": 12.222033500671387, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.4916, + "step": 1680 + }, + { + "epoch": 0.6728, + "grad_norm": 3.9567325115203857, + "learning_rate": 1.995496021999177e-05, + "loss": 0.4506, + "step": 1682 + }, + { + "epoch": 0.6736, + "grad_norm": 2.8990304470062256, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.4624, + "step": 1684 + }, + { + "epoch": 0.6744, + "grad_norm": 3.471292018890381, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.4444, + "step": 1686 + }, + { + "epoch": 0.6752, + "grad_norm": 3.2368650436401367, + "learning_rate": 1.996255301507125e-05, + "loss": 0.3354, + "step": 1688 + }, + { + "epoch": 0.676, + "grad_norm": 4.808667182922363, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.8105, + "step": 1690 + }, + { + "epoch": 0.6768, + "grad_norm": 2.451111316680908, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.337, + "step": 1692 + }, + { + "epoch": 0.6776, + "grad_norm": 11.671585083007812, + "learning_rate": 1.996944660387867e-05, + "loss": 0.4399, + "step": 1694 + }, + { + "epoch": 0.6784, + "grad_norm": 7.658277988433838, + "learning_rate": 1.997158900260614e-05, + "loss": 0.4076, + "step": 1696 + }, + { + "epoch": 0.6792, + "grad_norm": 5.0221686363220215, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.4134, + "step": 1698 + }, + { + "epoch": 0.68, + "grad_norm": 7.082271575927734, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.5284, + "step": 1700 + }, + { + "epoch": 0.6808, + "grad_norm": 3.120457649230957, + "learning_rate": 1.997754957226847e-05, + "loss": 0.5992, + "step": 1702 + }, + { + "epoch": 0.6816, + "grad_norm": 1.900876760482788, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.3063, + "step": 1704 + }, + { + "epoch": 0.6824, + "grad_norm": 9.405780792236328, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.3785, + "step": 1706 + }, + { + "epoch": 0.6832, + "grad_norm": 2.7329249382019043, + "learning_rate": 1.998280988314872e-05, + "loss": 0.2797, + "step": 1708 + }, + { + "epoch": 0.684, + "grad_norm": 2.579680919647217, + "learning_rate": 1.998440764181981e-05, + "loss": 0.471, + "step": 1710 + }, + { + "epoch": 0.6848, + "grad_norm": 5.701201915740967, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.3976, + "step": 1712 + }, + { + "epoch": 0.6856, + "grad_norm": 7.745370864868164, + "learning_rate": 1.998736956606018e-05, + "loss": 0.3947, + "step": 1714 + }, + { + "epoch": 0.6864, + "grad_norm": 8.640480995178223, + "learning_rate": 1.9988733708531772e-05, + "loss": 1.0942, + "step": 1716 + }, + { + "epoch": 0.6872, + "grad_norm": 8.300025939941406, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.7977, + "step": 1718 + }, + { + "epoch": 0.688, + "grad_norm": 5.9511895179748535, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.4706, + "step": 1720 + }, + { + "epoch": 0.6888, + "grad_norm": 5.14940881729126, + "learning_rate": 1.999235873152047e-05, + "loss": 0.2438, + "step": 1722 + }, + { + "epoch": 0.6896, + "grad_norm": 6.6042985916137695, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.3952, + "step": 1724 + }, + { + "epoch": 0.6904, + "grad_norm": 6.875572681427002, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.4717, + "step": 1726 + }, + { + "epoch": 0.6912, + "grad_norm": 2.505627155303955, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.3459, + "step": 1728 + }, + { + "epoch": 0.692, + "grad_norm": 6.113884925842285, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.5111, + "step": 1730 + }, + { + "epoch": 0.6928, + "grad_norm": 2.1594738960266113, + "learning_rate": 1.9996841892833e-05, + "loss": 0.2743, + "step": 1732 + }, + { + "epoch": 0.6936, + "grad_norm": 20.72479820251465, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.9828, + "step": 1734 + }, + { + "epoch": 0.6944, + "grad_norm": 7.1836700439453125, + "learning_rate": 1.999808950037968e-05, + "loss": 0.493, + "step": 1736 + }, + { + "epoch": 0.6952, + "grad_norm": 5.67575740814209, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.3416, + "step": 1738 + }, + { + "epoch": 0.696, + "grad_norm": 9.29202651977539, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.5197, + "step": 1740 + }, + { + "epoch": 0.6968, + "grad_norm": 5.713569164276123, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.4243, + "step": 1742 + }, + { + "epoch": 0.6976, + "grad_norm": 3.9301981925964355, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.3626, + "step": 1744 + }, + { + "epoch": 0.6984, + "grad_norm": 3.975416898727417, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.8586, + "step": 1746 + }, + { + "epoch": 0.6992, + "grad_norm": 6.84246826171875, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.7227, + "step": 1748 + }, + { + "epoch": 0.7, + "grad_norm": 20.004087448120117, + "learning_rate": 2e-05, + "loss": 0.9191, + "step": 1750 + }, + { + "epoch": 0.7008, + "grad_norm": 5.791635990142822, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.4023, + "step": 1752 + }, + { + "epoch": 0.7016, + "grad_norm": 5.369904041290283, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.3795, + "step": 1754 + }, + { + "epoch": 0.7024, + "grad_norm": 3.682668924331665, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.5381, + "step": 1756 + }, + { + "epoch": 0.7032, + "grad_norm": 2.7093493938446045, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.6459, + "step": 1758 + }, + { + "epoch": 0.704, + "grad_norm": 2.522735357284546, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.5787, + "step": 1760 + }, + { + "epoch": 0.7048, + "grad_norm": 8.20150375366211, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.5729, + "step": 1762 + }, + { + "epoch": 0.7056, + "grad_norm": 8.837404251098633, + "learning_rate": 1.999808950037968e-05, + "loss": 0.5134, + "step": 1764 + }, + { + "epoch": 0.7064, + "grad_norm": 5.532891273498535, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.7412, + "step": 1766 + }, + { + "epoch": 0.7072, + "grad_norm": 5.736209869384766, + "learning_rate": 1.9996841892833e-05, + "loss": 0.5258, + "step": 1768 + }, + { + "epoch": 0.708, + "grad_norm": 4.132336616516113, + "learning_rate": 1.9996101150403547e-05, + "loss": 0.3544, + "step": 1770 + }, + { + "epoch": 0.7088, + "grad_norm": 4.415733337402344, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.4621, + "step": 1772 + }, + { + "epoch": 0.7096, + "grad_norm": 4.9113264083862305, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.5411, + "step": 1774 + }, + { + "epoch": 0.7104, + "grad_norm": 3.3670785427093506, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.3433, + "step": 1776 + }, + { + "epoch": 0.7112, + "grad_norm": 3.183927059173584, + "learning_rate": 1.999235873152047e-05, + "loss": 0.4211, + "step": 1778 + }, + { + "epoch": 0.712, + "grad_norm": 8.622447967529297, + "learning_rate": 1.9991228300988586e-05, + "loss": 1.7236, + "step": 1780 + }, + { + "epoch": 0.7128, + "grad_norm": 3.417908191680908, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.2192, + "step": 1782 + }, + { + "epoch": 0.7136, + "grad_norm": 15.327376365661621, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.949, + "step": 1784 + }, + { + "epoch": 0.7144, + "grad_norm": 6.369883060455322, + "learning_rate": 1.998736956606018e-05, + "loss": 0.5447, + "step": 1786 + }, + { + "epoch": 0.7152, + "grad_norm": 5.282643795013428, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.2633, + "step": 1788 + }, + { + "epoch": 0.716, + "grad_norm": 1.8992022275924683, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.3532, + "step": 1790 + }, + { + "epoch": 0.7168, + "grad_norm": 9.116549491882324, + "learning_rate": 1.998280988314872e-05, + "loss": 0.5309, + "step": 1792 + }, + { + "epoch": 0.7176, + "grad_norm": 4.018207550048828, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.6177, + "step": 1794 + }, + { + "epoch": 0.7184, + "grad_norm": 3.2702534198760986, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.5697, + "step": 1796 + }, + { + "epoch": 0.7192, + "grad_norm": 13.79327392578125, + "learning_rate": 1.9977549572268467e-05, + "loss": 0.86, + "step": 1798 + }, + { + "epoch": 0.72, + "grad_norm": 2.421355724334717, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.3692, + "step": 1800 + }, + { + "epoch": 0.7208, + "grad_norm": 6.39658260345459, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.5341, + "step": 1802 + }, + { + "epoch": 0.7216, + "grad_norm": 12.072526931762695, + "learning_rate": 1.997158900260614e-05, + "loss": 0.843, + "step": 1804 + }, + { + "epoch": 0.7224, + "grad_norm": 6.202824115753174, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.4508, + "step": 1806 + }, + { + "epoch": 0.7232, + "grad_norm": 12.091536521911621, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.6631, + "step": 1808 + }, + { + "epoch": 0.724, + "grad_norm": 3.4513208866119385, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.3141, + "step": 1810 + }, + { + "epoch": 0.7248, + "grad_norm": 11.12137508392334, + "learning_rate": 1.996255301507125e-05, + "loss": 0.539, + "step": 1812 + }, + { + "epoch": 0.7256, + "grad_norm": 2.0206141471862793, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.658, + "step": 1814 + }, + { + "epoch": 0.7264, + "grad_norm": 7.8277106285095215, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.5622, + "step": 1816 + }, + { + "epoch": 0.7272, + "grad_norm": 4.81088924407959, + "learning_rate": 1.995496021999177e-05, + "loss": 0.3566, + "step": 1818 + }, + { + "epoch": 0.728, + "grad_norm": 4.36074161529541, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.4191, + "step": 1820 + }, + { + "epoch": 0.7288, + "grad_norm": 4.0613226890563965, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.4074, + "step": 1822 + }, + { + "epoch": 0.7296, + "grad_norm": 5.445612907409668, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.3167, + "step": 1824 + }, + { + "epoch": 0.7304, + "grad_norm": 2.8114616870880127, + "learning_rate": 1.994374976712348e-05, + "loss": 0.6147, + "step": 1826 + }, + { + "epoch": 0.7312, + "grad_norm": 2.5986952781677246, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.5259, + "step": 1828 + }, + { + "epoch": 0.732, + "grad_norm": 3.6003918647766113, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.3197, + "step": 1830 + }, + { + "epoch": 0.7328, + "grad_norm": 3.8639371395111084, + "learning_rate": 1.993452764783328e-05, + "loss": 0.21, + "step": 1832 + }, + { + "epoch": 0.7336, + "grad_norm": 2.496325969696045, + "learning_rate": 1.9931298632618352e-05, + "loss": 0.3632, + "step": 1834 + }, + { + "epoch": 0.7344, + "grad_norm": 2.928487777709961, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.3398, + "step": 1836 + }, + { + "epoch": 0.7352, + "grad_norm": 5.671520709991455, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.5193, + "step": 1838 + }, + { + "epoch": 0.736, + "grad_norm": 3.602909803390503, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.1562, + "step": 1840 + }, + { + "epoch": 0.7368, + "grad_norm": 2.749011993408203, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.315, + "step": 1842 + }, + { + "epoch": 0.7376, + "grad_norm": 4.273372173309326, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.3395, + "step": 1844 + }, + { + "epoch": 0.7384, + "grad_norm": 5.967968463897705, + "learning_rate": 1.9910299093414932e-05, + "loss": 0.4194, + "step": 1846 + }, + { + "epoch": 0.7392, + "grad_norm": 2.5091896057128906, + "learning_rate": 1.990652851696501e-05, + "loss": 0.5031, + "step": 1848 + }, + { + "epoch": 0.74, + "grad_norm": 7.949019432067871, + "learning_rate": 1.9902680687415704e-05, + "loss": 1.0475, + "step": 1850 + }, + { + "epoch": 0.7408, + "grad_norm": 2.438105821609497, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.3475, + "step": 1852 + }, + { + "epoch": 0.7416, + "grad_norm": 8.848106384277344, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.4821, + "step": 1854 + }, + { + "epoch": 0.7424, + "grad_norm": 5.018491744995117, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.3251, + "step": 1856 + }, + { + "epoch": 0.7432, + "grad_norm": 5.742210388183594, + "learning_rate": 1.9886517447379143e-05, + "loss": 0.2919, + "step": 1858 + }, + { + "epoch": 0.744, + "grad_norm": 10.081647872924805, + "learning_rate": 1.988228381446553e-05, + "loss": 0.5546, + "step": 1860 + }, + { + "epoch": 0.7448, + "grad_norm": 4.09962272644043, + "learning_rate": 1.987797311751759e-05, + "loss": 0.3861, + "step": 1862 + }, + { + "epoch": 0.7456, + "grad_norm": 3.1583993434906006, + "learning_rate": 1.9873585390151007e-05, + "loss": 0.3128, + "step": 1864 + }, + { + "epoch": 0.7464, + "grad_norm": 1.916212797164917, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.3608, + "step": 1866 + }, + { + "epoch": 0.7472, + "grad_norm": 8.458840370178223, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.7873, + "step": 1868 + }, + { + "epoch": 0.748, + "grad_norm": 6.865233898162842, + "learning_rate": 1.985996037070505e-05, + "loss": 0.3612, + "step": 1870 + }, + { + "epoch": 0.7488, + "grad_norm": 10.543816566467285, + "learning_rate": 1.985526486983063e-05, + "loss": 0.698, + "step": 1872 + }, + { + "epoch": 0.7496, + "grad_norm": 6.476944446563721, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.5256, + "step": 1874 + }, + { + "epoch": 0.7504, + "grad_norm": 8.187792778015137, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.4788, + "step": 1876 + }, + { + "epoch": 0.7512, + "grad_norm": 1.9630470275878906, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.2404, + "step": 1878 + }, + { + "epoch": 0.752, + "grad_norm": 8.175183296203613, + "learning_rate": 1.983571470813386e-05, + "loss": 0.699, + "step": 1880 + }, + { + "epoch": 0.7528, + "grad_norm": 12.990069389343262, + "learning_rate": 1.983063531873016e-05, + "loss": 1.1128, + "step": 1882 + }, + { + "epoch": 0.7536, + "grad_norm": 5.050752639770508, + "learning_rate": 1.982547926805747e-05, + "loss": 0.7573, + "step": 1884 + }, + { + "epoch": 0.7544, + "grad_norm": 6.0260515213012695, + "learning_rate": 1.9820246596323724e-05, + "loss": 0.3665, + "step": 1886 + }, + { + "epoch": 0.7552, + "grad_norm": 14.117484092712402, + "learning_rate": 1.981493734433433e-05, + "loss": 0.8667, + "step": 1888 + }, + { + "epoch": 0.756, + "grad_norm": 11.667659759521484, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.6158, + "step": 1890 + }, + { + "epoch": 0.7568, + "grad_norm": 5.7208380699157715, + "learning_rate": 1.9804089265795963e-05, + "loss": 0.5358, + "step": 1892 + }, + { + "epoch": 0.7576, + "grad_norm": 5.21250057220459, + "learning_rate": 1.979855052384247e-05, + "loss": 0.4207, + "step": 1894 + }, + { + "epoch": 0.7584, + "grad_norm": 3.6927437782287598, + "learning_rate": 1.979293537082368e-05, + "loss": 0.6051, + "step": 1896 + }, + { + "epoch": 0.7592, + "grad_norm": 4.236647605895996, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.3979, + "step": 1898 + }, + { + "epoch": 0.76, + "grad_norm": 5.474388122558594, + "learning_rate": 1.978147600733806e-05, + "loss": 0.6603, + "step": 1900 + }, + { + "epoch": 0.7608, + "grad_norm": 3.9843273162841797, + "learning_rate": 1.977563188623365e-05, + "loss": 0.4227, + "step": 1902 + }, + { + "epoch": 0.7616, + "grad_norm": 2.3406167030334473, + "learning_rate": 1.9769711532788086e-05, + "loss": 0.2071, + "step": 1904 + }, + { + "epoch": 0.7624, + "grad_norm": 8.789372444152832, + "learning_rate": 1.9763714993169448e-05, + "loss": 0.5955, + "step": 1906 + }, + { + "epoch": 0.7632, + "grad_norm": 4.878142833709717, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.3273, + "step": 1908 + }, + { + "epoch": 0.764, + "grad_norm": 3.703111410140991, + "learning_rate": 1.9751493543055638e-05, + "loss": 0.2455, + "step": 1910 + }, + { + "epoch": 0.7648, + "grad_norm": 1.3227205276489258, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.1464, + "step": 1912 + }, + { + "epoch": 0.7656, + "grad_norm": 2.471626043319702, + "learning_rate": 1.973896791711276e-05, + "loss": 0.4114, + "step": 1914 + }, + { + "epoch": 0.7664, + "grad_norm": 6.342803955078125, + "learning_rate": 1.9732591159931567e-05, + "loss": 0.3901, + "step": 1916 + }, + { + "epoch": 0.7672, + "grad_norm": 3.0534446239471436, + "learning_rate": 1.972613850604944e-05, + "loss": 0.6209, + "step": 1918 + }, + { + "epoch": 0.768, + "grad_norm": 6.819042682647705, + "learning_rate": 1.9719610005785463e-05, + "loss": 0.4309, + "step": 1920 + }, + { + "epoch": 0.7688, + "grad_norm": 7.792922496795654, + "learning_rate": 1.9713005710050206e-05, + "loss": 0.4873, + "step": 1922 + }, + { + "epoch": 0.7696, + "grad_norm": 5.196712493896484, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.3977, + "step": 1924 + }, + { + "epoch": 0.7704, + "grad_norm": 5.214966297149658, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.4326, + "step": 1926 + }, + { + "epoch": 0.7712, + "grad_norm": 8.276177406311035, + "learning_rate": 1.969273856798586e-05, + "loss": 0.446, + "step": 1928 + }, + { + "epoch": 0.772, + "grad_norm": 3.4883368015289307, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.3672, + "step": 1930 + }, + { + "epoch": 0.7728, + "grad_norm": 1.9015693664550781, + "learning_rate": 1.9678849122526195e-05, + "loss": 0.3345, + "step": 1932 + }, + { + "epoch": 0.7736, + "grad_norm": 8.110586166381836, + "learning_rate": 1.967179115615633e-05, + "loss": 0.4249, + "step": 1934 + }, + { + "epoch": 0.7744, + "grad_norm": 4.227611064910889, + "learning_rate": 1.966465776721618e-05, + "loss": 0.6143, + "step": 1936 + }, + { + "epoch": 0.7752, + "grad_norm": 2.341872215270996, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.5497, + "step": 1938 + }, + { + "epoch": 0.776, + "grad_norm": 2.357506513595581, + "learning_rate": 1.965016494472312e-05, + "loss": 0.2292, + "step": 1940 + }, + { + "epoch": 0.7768, + "grad_norm": 3.373274087905884, + "learning_rate": 1.964280562418815e-05, + "loss": 0.2772, + "step": 1942 + }, + { + "epoch": 0.7776, + "grad_norm": 27.803871154785156, + "learning_rate": 1.963537110711789e-05, + "loss": 0.8697, + "step": 1944 + }, + { + "epoch": 0.7784, + "grad_norm": 4.842161655426025, + "learning_rate": 1.9627861451488194e-05, + "loss": 0.4879, + "step": 1946 + }, + { + "epoch": 0.7792, + "grad_norm": 8.67030143737793, + "learning_rate": 1.962027671586086e-05, + "loss": 0.5492, + "step": 1948 + }, + { + "epoch": 0.78, + "grad_norm": 3.379067897796631, + "learning_rate": 1.9612616959383194e-05, + "loss": 0.3028, + "step": 1950 + }, + { + "epoch": 0.7808, + "grad_norm": 3.8166677951812744, + "learning_rate": 1.96048822417875e-05, + "loss": 0.2783, + "step": 1952 + }, + { + "epoch": 0.7816, + "grad_norm": 3.9429714679718018, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.4552, + "step": 1954 + }, + { + "epoch": 0.7824, + "grad_norm": 2.631047487258911, + "learning_rate": 1.9589188165093666e-05, + "loss": 0.3029, + "step": 1956 + }, + { + "epoch": 0.7832, + "grad_norm": 2.698028087615967, + "learning_rate": 1.95812289283811e-05, + "loss": 0.1427, + "step": 1958 + }, + { + "epoch": 0.784, + "grad_norm": 6.577787399291992, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.565, + "step": 1960 + }, + { + "epoch": 0.7848, + "grad_norm": 2.450500249862671, + "learning_rate": 1.9565086368562784e-05, + "loss": 0.4542, + "step": 1962 + }, + { + "epoch": 0.7856, + "grad_norm": 10.015090942382812, + "learning_rate": 1.9556903171339966e-05, + "loss": 0.3882, + "step": 1964 + }, + { + "epoch": 0.7864, + "grad_norm": 4.287123680114746, + "learning_rate": 1.954864544746643e-05, + "loss": 0.2544, + "step": 1966 + }, + { + "epoch": 0.7872, + "grad_norm": 3.965257406234741, + "learning_rate": 1.9540313261337585e-05, + "loss": 0.4182, + "step": 1968 + }, + { + "epoch": 0.788, + "grad_norm": 5.50913143157959, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.3617, + "step": 1970 + }, + { + "epoch": 0.7888, + "grad_norm": 3.2805440425872803, + "learning_rate": 1.9523425762798335e-05, + "loss": 0.204, + "step": 1972 + }, + { + "epoch": 0.7896, + "grad_norm": 10.288555145263672, + "learning_rate": 1.9514870582080035e-05, + "loss": 0.6548, + "step": 1974 + }, + { + "epoch": 0.7904, + "grad_norm": 5.323610782623291, + "learning_rate": 1.95062412024896e-05, + "loss": 1.676, + "step": 1976 + }, + { + "epoch": 0.7912, + "grad_norm": 12.464410781860352, + "learning_rate": 1.9497537691320667e-05, + "loss": 0.7239, + "step": 1978 + }, + { + "epoch": 0.792, + "grad_norm": 3.5932133197784424, + "learning_rate": 1.948876011644497e-05, + "loss": 0.3674, + "step": 1980 + }, + { + "epoch": 0.7928, + "grad_norm": 3.760422945022583, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.3465, + "step": 1982 + }, + { + "epoch": 0.7936, + "grad_norm": 5.273140907287598, + "learning_rate": 1.9470983049947443e-05, + "loss": 0.5971, + "step": 1984 + }, + { + "epoch": 0.7944, + "grad_norm": 2.991767168045044, + "learning_rate": 1.9461983696954767e-05, + "loss": 0.27, + "step": 1986 + }, + { + "epoch": 0.7952, + "grad_norm": 2.138972759246826, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.2681, + "step": 1988 + }, + { + "epoch": 0.796, + "grad_norm": 8.971880912780762, + "learning_rate": 1.9443763702374818e-05, + "loss": 0.6808, + "step": 1990 + }, + { + "epoch": 0.7968, + "grad_norm": 2.9185991287231445, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.6934, + "step": 1992 + }, + { + "epoch": 0.7976, + "grad_norm": 4.297379493713379, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.3042, + "step": 1994 + }, + { + "epoch": 0.7984, + "grad_norm": 3.2408552169799805, + "learning_rate": 1.94158815589503e-05, + "loss": 0.5953, + "step": 1996 + }, + { + "epoch": 0.7992, + "grad_norm": 8.040743827819824, + "learning_rate": 1.940644056006122e-05, + "loss": 0.8196, + "step": 1998 + }, + { + "epoch": 0.8, + "grad_norm": 7.990960597991943, + "learning_rate": 1.939692620785909e-05, + "loss": 0.4612, + "step": 2000 + }, + { + "epoch": 0.8008, + "grad_norm": 6.383898735046387, + "learning_rate": 1.9387338576538746e-05, + "loss": 0.4105, + "step": 2002 + }, + { + "epoch": 0.8016, + "grad_norm": 6.999117851257324, + "learning_rate": 1.9377677740866464e-05, + "loss": 0.2251, + "step": 2004 + }, + { + "epoch": 0.8024, + "grad_norm": 7.917981147766113, + "learning_rate": 1.936794377617938e-05, + "loss": 0.7872, + "step": 2006 + }, + { + "epoch": 0.8032, + "grad_norm": 5.2557501792907715, + "learning_rate": 1.9358136758384917e-05, + "loss": 0.5837, + "step": 2008 + }, + { + "epoch": 0.804, + "grad_norm": 4.809940338134766, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.6812, + "step": 2010 + }, + { + "epoch": 0.8048, + "grad_norm": 2.842546224594116, + "learning_rate": 1.9338303869951273e-05, + "loss": 0.3981, + "step": 2012 + }, + { + "epoch": 0.8056, + "grad_norm": 3.4043211936950684, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.2863, + "step": 2014 + }, + { + "epoch": 0.8064, + "grad_norm": 2.5935723781585693, + "learning_rate": 1.931817969420773e-05, + "loss": 0.353, + "step": 2016 + }, + { + "epoch": 0.8072, + "grad_norm": 4.572518825531006, + "learning_rate": 1.930800856940543e-05, + "loss": 0.3879, + "step": 2018 + }, + { + "epoch": 0.808, + "grad_norm": 4.014364242553711, + "learning_rate": 1.929776485888252e-05, + "loss": 0.5561, + "step": 2020 + }, + { + "epoch": 0.8088, + "grad_norm": 4.8947906494140625, + "learning_rate": 1.9287448642521517e-05, + "loss": 0.734, + "step": 2022 + }, + { + "epoch": 0.8096, + "grad_norm": 1.952275037765503, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.5817, + "step": 2024 + }, + { + "epoch": 0.8104, + "grad_norm": 4.268755912780762, + "learning_rate": 1.9266599014641727e-05, + "loss": 0.5069, + "step": 2026 + }, + { + "epoch": 0.8112, + "grad_norm": 3.921480655670166, + "learning_rate": 1.925606576571252e-05, + "loss": 0.5792, + "step": 2028 + }, + { + "epoch": 0.812, + "grad_norm": 12.108016967773438, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.6826, + "step": 2030 + }, + { + "epoch": 0.8128, + "grad_norm": 3.33408522605896, + "learning_rate": 1.923478280857682e-05, + "loss": 0.464, + "step": 2032 + }, + { + "epoch": 0.8136, + "grad_norm": 1.6179202795028687, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.2836, + "step": 2034 + }, + { + "epoch": 0.8144, + "grad_norm": 12.353996276855469, + "learning_rate": 1.9213211793237066e-05, + "loss": 0.7539, + "step": 2036 + }, + { + "epoch": 0.8152, + "grad_norm": 6.888246536254883, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.5177, + "step": 2038 + }, + { + "epoch": 0.816, + "grad_norm": 4.788189888000488, + "learning_rate": 1.919135339255235e-05, + "loss": 0.6794, + "step": 2040 + }, + { + "epoch": 0.8168, + "grad_norm": 3.879225492477417, + "learning_rate": 1.918031663542588e-05, + "loss": 0.2968, + "step": 2042 + }, + { + "epoch": 0.8176, + "grad_norm": 2.4591026306152344, + "learning_rate": 1.916920828834617e-05, + "loss": 0.3379, + "step": 2044 + }, + { + "epoch": 0.8184, + "grad_norm": 3.039889335632324, + "learning_rate": 1.9158028437938313e-05, + "loss": 0.4206, + "step": 2046 + }, + { + "epoch": 0.8192, + "grad_norm": 9.00166130065918, + "learning_rate": 1.9146777171385057e-05, + "loss": 0.7119, + "step": 2048 + }, + { + "epoch": 0.82, + "grad_norm": 16.83409309387207, + "learning_rate": 1.913545457642601e-05, + "loss": 0.9266, + "step": 2050 + }, + { + "epoch": 0.8208, + "grad_norm": 8.59817123413086, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.4125, + "step": 2052 + }, + { + "epoch": 0.8216, + "grad_norm": 4.9218525886535645, + "learning_rate": 1.911259575502963e-05, + "loss": 0.8163, + "step": 2054 + }, + { + "epoch": 0.8224, + "grad_norm": 7.36513090133667, + "learning_rate": 1.910105970684996e-05, + "loss": 0.4727, + "step": 2056 + }, + { + "epoch": 0.8232, + "grad_norm": 2.194617986679077, + "learning_rate": 1.908945268677849e-05, + "loss": 0.8017, + "step": 2058 + }, + { + "epoch": 0.824, + "grad_norm": 4.550289154052734, + "learning_rate": 1.9077774785329085e-05, + "loss": 0.3739, + "step": 2060 + }, + { + "epoch": 0.8248, + "grad_norm": 3.4990861415863037, + "learning_rate": 1.9066026093568383e-05, + "loss": 0.3745, + "step": 2062 + }, + { + "epoch": 0.8256, + "grad_norm": 8.7343111038208, + "learning_rate": 1.9054206703115013e-05, + "loss": 0.8979, + "step": 2064 + }, + { + "epoch": 0.8264, + "grad_norm": 2.711123466491699, + "learning_rate": 1.9042316706138994e-05, + "loss": 0.3762, + "step": 2066 + }, + { + "epoch": 0.8272, + "grad_norm": 1.9720872640609741, + "learning_rate": 1.903035619536087e-05, + "loss": 0.4065, + "step": 2068 + }, + { + "epoch": 0.828, + "grad_norm": 4.787201881408691, + "learning_rate": 1.901832526405114e-05, + "loss": 0.4545, + "step": 2070 + }, + { + "epoch": 0.8288, + "grad_norm": 4.116015434265137, + "learning_rate": 1.9006224006029414e-05, + "loss": 0.3861, + "step": 2072 + }, + { + "epoch": 0.8296, + "grad_norm": 4.5069451332092285, + "learning_rate": 1.899405251566371e-05, + "loss": 0.5742, + "step": 2074 + }, + { + "epoch": 0.8304, + "grad_norm": 5.005687713623047, + "learning_rate": 1.8981810887869797e-05, + "loss": 0.5577, + "step": 2076 + }, + { + "epoch": 0.8312, + "grad_norm": 5.526610374450684, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.4934, + "step": 2078 + }, + { + "epoch": 0.832, + "grad_norm": 2.5126290321350098, + "learning_rate": 1.8957117602394133e-05, + "loss": 0.3066, + "step": 2080 + }, + { + "epoch": 0.8328, + "grad_norm": 9.842791557312012, + "learning_rate": 1.8944666137275596e-05, + "loss": 0.6265, + "step": 2082 + }, + { + "epoch": 0.8336, + "grad_norm": 13.449177742004395, + "learning_rate": 1.8932144919853744e-05, + "loss": 0.7168, + "step": 2084 + }, + { + "epoch": 0.8344, + "grad_norm": 4.880086421966553, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.3834, + "step": 2086 + }, + { + "epoch": 0.8352, + "grad_norm": 5.4077019691467285, + "learning_rate": 1.890689361921507e-05, + "loss": 0.3614, + "step": 2088 + }, + { + "epoch": 0.836, + "grad_norm": 2.8501031398773193, + "learning_rate": 1.8894163732912986e-05, + "loss": 0.4492, + "step": 2090 + }, + { + "epoch": 0.8368, + "grad_norm": 5.37775993347168, + "learning_rate": 1.8881364488135445e-05, + "loss": 0.4278, + "step": 2092 + }, + { + "epoch": 0.8376, + "grad_norm": 2.3796370029449463, + "learning_rate": 1.886849598469357e-05, + "loss": 0.3203, + "step": 2094 + }, + { + "epoch": 0.8384, + "grad_norm": 5.196391582489014, + "learning_rate": 1.8855558322938492e-05, + "loss": 0.3425, + "step": 2096 + }, + { + "epoch": 0.8392, + "grad_norm": 5.295304775238037, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.5215, + "step": 2098 + }, + { + "epoch": 0.84, + "grad_norm": 4.4182562828063965, + "learning_rate": 1.8829475928589265e-05, + "loss": 0.4456, + "step": 2100 + }, + { + "epoch": 0.8408, + "grad_norm": 3.1044559478759766, + "learning_rate": 1.8816331399390874e-05, + "loss": 0.3528, + "step": 2102 + }, + { + "epoch": 0.8416, + "grad_norm": 2.9102487564086914, + "learning_rate": 1.88031181186692e-05, + "loss": 0.34, + "step": 2104 + }, + { + "epoch": 0.8424, + "grad_norm": 6.911310195922852, + "learning_rate": 1.8789836189464092e-05, + "loss": 0.5755, + "step": 2106 + }, + { + "epoch": 0.8432, + "grad_norm": 5.777722358703613, + "learning_rate": 1.877648571535068e-05, + "loss": 0.8788, + "step": 2108 + }, + { + "epoch": 0.844, + "grad_norm": 5.194482326507568, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.6766, + "step": 2110 + }, + { + "epoch": 0.8448, + "grad_norm": 10.82712459564209, + "learning_rate": 1.8749579549371387e-05, + "loss": 0.9228, + "step": 2112 + }, + { + "epoch": 0.8456, + "grad_norm": 4.030005931854248, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.4271, + "step": 2114 + }, + { + "epoch": 0.8464, + "grad_norm": 5.070348262786865, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.5131, + "step": 2116 + }, + { + "epoch": 0.8472, + "grad_norm": 3.887665033340454, + "learning_rate": 1.8708708833660748e-05, + "loss": 2.6075, + "step": 2118 + }, + { + "epoch": 0.848, + "grad_norm": 4.675127983093262, + "learning_rate": 1.8694949295052198e-05, + "loss": 0.7133, + "step": 2120 + }, + { + "epoch": 0.8488, + "grad_norm": 4.268080711364746, + "learning_rate": 1.868112195148239e-05, + "loss": 0.5845, + "step": 2122 + }, + { + "epoch": 0.8496, + "grad_norm": 3.9360458850860596, + "learning_rate": 1.866722691077977e-05, + "loss": 0.4167, + "step": 2124 + }, + { + "epoch": 0.8504, + "grad_norm": 20.160083770751953, + "learning_rate": 1.8653264281300626e-05, + "loss": 0.9318, + "step": 2126 + }, + { + "epoch": 0.8512, + "grad_norm": 4.777723789215088, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.4258, + "step": 2128 + }, + { + "epoch": 0.852, + "grad_norm": 3.6808910369873047, + "learning_rate": 1.8625136692072587e-05, + "loss": 0.4202, + "step": 2130 + }, + { + "epoch": 0.8528, + "grad_norm": 8.434818267822266, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.7677, + "step": 2132 + }, + { + "epoch": 0.8536, + "grad_norm": 8.012714385986328, + "learning_rate": 1.8596740061174912e-05, + "loss": 0.5678, + "step": 2134 + }, + { + "epoch": 0.8544, + "grad_norm": 9.045832633972168, + "learning_rate": 1.858244113157566e-05, + "loss": 0.3242, + "step": 2136 + }, + { + "epoch": 0.8552, + "grad_norm": 6.075767993927002, + "learning_rate": 1.8568075274376432e-05, + "loss": 0.5078, + "step": 2138 + }, + { + "epoch": 0.856, + "grad_norm": 3.5789854526519775, + "learning_rate": 1.8553642601605083e-05, + "loss": 0.3583, + "step": 2140 + }, + { + "epoch": 0.8568, + "grad_norm": 3.273469924926758, + "learning_rate": 1.8539143225810457e-05, + "loss": 0.162, + "step": 2142 + }, + { + "epoch": 0.8576, + "grad_norm": 6.589645862579346, + "learning_rate": 1.852457726006163e-05, + "loss": 0.7334, + "step": 2144 + }, + { + "epoch": 0.8584, + "grad_norm": 6.523824691772461, + "learning_rate": 1.8509944817946917e-05, + "loss": 0.4509, + "step": 2146 + }, + { + "epoch": 0.8592, + "grad_norm": 4.779366970062256, + "learning_rate": 1.8495246013573064e-05, + "loss": 0.2784, + "step": 2148 + }, + { + "epoch": 0.86, + "grad_norm": 2.8005824089050293, + "learning_rate": 1.848048096156426e-05, + "loss": 0.3085, + "step": 2150 + }, + { + "epoch": 0.8608, + "grad_norm": 4.007396221160889, + "learning_rate": 1.8465649777061387e-05, + "loss": 0.6944, + "step": 2152 + }, + { + "epoch": 0.8616, + "grad_norm": 7.373164653778076, + "learning_rate": 1.8450752575720967e-05, + "loss": 0.4137, + "step": 2154 + }, + { + "epoch": 0.8624, + "grad_norm": 2.44792103767395, + "learning_rate": 1.843578947371439e-05, + "loss": 0.494, + "step": 2156 + }, + { + "epoch": 0.8632, + "grad_norm": 5.441621780395508, + "learning_rate": 1.8420760587726935e-05, + "loss": 1.3811, + "step": 2158 + }, + { + "epoch": 0.864, + "grad_norm": 14.928495407104492, + "learning_rate": 1.8405666034956846e-05, + "loss": 0.6873, + "step": 2160 + }, + { + "epoch": 0.8648, + "grad_norm": 19.143177032470703, + "learning_rate": 1.8390505933114507e-05, + "loss": 0.9384, + "step": 2162 + }, + { + "epoch": 0.8656, + "grad_norm": 2.426013708114624, + "learning_rate": 1.8375280400421414e-05, + "loss": 0.4808, + "step": 2164 + }, + { + "epoch": 0.8664, + "grad_norm": 4.136850833892822, + "learning_rate": 1.8359989555609365e-05, + "loss": 0.32, + "step": 2166 + }, + { + "epoch": 0.8672, + "grad_norm": 7.991090774536133, + "learning_rate": 1.834463351791939e-05, + "loss": 0.7678, + "step": 2168 + }, + { + "epoch": 0.868, + "grad_norm": 4.490537643432617, + "learning_rate": 1.8329212407101006e-05, + "loss": 0.5311, + "step": 2170 + }, + { + "epoch": 0.8688, + "grad_norm": 7.613470554351807, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.4121, + "step": 2172 + }, + { + "epoch": 0.8696, + "grad_norm": 5.197305679321289, + "learning_rate": 1.82981754476131e-05, + "loss": 0.1904, + "step": 2174 + }, + { + "epoch": 0.8704, + "grad_norm": 9.690563201904297, + "learning_rate": 1.8282559840976053e-05, + "loss": 0.4502, + "step": 2176 + }, + { + "epoch": 0.8712, + "grad_norm": 2.421522855758667, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.2495, + "step": 2178 + }, + { + "epoch": 0.872, + "grad_norm": 7.8181328773498535, + "learning_rate": 1.8251134982782966e-05, + "loss": 0.2981, + "step": 2180 + }, + { + "epoch": 0.8728, + "grad_norm": 1.5006805658340454, + "learning_rate": 1.823532597628428e-05, + "loss": 0.3555, + "step": 2182 + }, + { + "epoch": 0.8736, + "grad_norm": 3.146165609359741, + "learning_rate": 1.8219452749059336e-05, + "loss": 0.3498, + "step": 2184 + }, + { + "epoch": 0.8744, + "grad_norm": 2.798628807067871, + "learning_rate": 1.8203515424890734e-05, + "loss": 0.5941, + "step": 2186 + }, + { + "epoch": 0.8752, + "grad_norm": 7.344174861907959, + "learning_rate": 1.8187514128060956e-05, + "loss": 0.8017, + "step": 2188 + }, + { + "epoch": 0.876, + "grad_norm": 1.4742686748504639, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.2445, + "step": 2190 + }, + { + "epoch": 0.8768, + "grad_norm": 3.6886837482452393, + "learning_rate": 1.8155320116040983e-05, + "loss": 0.1941, + "step": 2192 + }, + { + "epoch": 0.8776, + "grad_norm": 10.147557258605957, + "learning_rate": 1.8139127651906193e-05, + "loss": 0.7069, + "step": 2194 + }, + { + "epoch": 0.8784, + "grad_norm": 2.233205556869507, + "learning_rate": 1.8122871717218974e-05, + "loss": 0.438, + "step": 2196 + }, + { + "epoch": 0.8792, + "grad_norm": 5.252257823944092, + "learning_rate": 1.8106552438746413e-05, + "loss": 0.7512, + "step": 2198 + }, + { + "epoch": 0.88, + "grad_norm": 8.280484199523926, + "learning_rate": 1.8090169943749477e-05, + "loss": 1.0446, + "step": 2200 + }, + { + "epoch": 0.8808, + "grad_norm": 8.377373695373535, + "learning_rate": 1.807372435998219e-05, + "loss": 0.4684, + "step": 2202 + }, + { + "epoch": 0.8816, + "grad_norm": 4.66433048248291, + "learning_rate": 1.8057215815690487e-05, + "loss": 0.3269, + "step": 2204 + }, + { + "epoch": 0.8824, + "grad_norm": 2.217611312866211, + "learning_rate": 1.8040644439611355e-05, + "loss": 0.5373, + "step": 2206 + }, + { + "epoch": 0.8832, + "grad_norm": 5.229759216308594, + "learning_rate": 1.8024010360971665e-05, + "loss": 0.3007, + "step": 2208 + }, + { + "epoch": 0.884, + "grad_norm": 5.907240390777588, + "learning_rate": 1.8007313709487345e-05, + "loss": 0.4164, + "step": 2210 + }, + { + "epoch": 0.8848, + "grad_norm": 6.11362886428833, + "learning_rate": 1.7990554615362207e-05, + "loss": 0.2909, + "step": 2212 + }, + { + "epoch": 0.8856, + "grad_norm": 2.218219757080078, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.3034, + "step": 2214 + }, + { + "epoch": 0.8864, + "grad_norm": 2.290370464324951, + "learning_rate": 1.7956849622438568e-05, + "loss": 0.2474, + "step": 2216 + }, + { + "epoch": 0.8872, + "grad_norm": 3.0256340503692627, + "learning_rate": 1.7939903986478357e-05, + "loss": 0.5, + "step": 2218 + }, + { + "epoch": 0.888, + "grad_norm": 7.33586311340332, + "learning_rate": 1.7922896433551913e-05, + "loss": 0.7149, + "step": 2220 + }, + { + "epoch": 0.8888, + "grad_norm": 2.595695734024048, + "learning_rate": 1.7905827096287525e-05, + "loss": 0.24, + "step": 2222 + }, + { + "epoch": 0.8896, + "grad_norm": 4.993026256561279, + "learning_rate": 1.7888696107795347e-05, + "loss": 0.4084, + "step": 2224 + }, + { + "epoch": 0.8904, + "grad_norm": 13.632013320922852, + "learning_rate": 1.787150360166623e-05, + "loss": 0.5648, + "step": 2226 + }, + { + "epoch": 0.8912, + "grad_norm": 4.771726608276367, + "learning_rate": 1.7854249711970826e-05, + "loss": 0.3186, + "step": 2228 + }, + { + "epoch": 0.892, + "grad_norm": 1.7299787998199463, + "learning_rate": 1.783693457325841e-05, + "loss": 0.1849, + "step": 2230 + }, + { + "epoch": 0.8928, + "grad_norm": 5.569767951965332, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.5359, + "step": 2232 + }, + { + "epoch": 0.8936, + "grad_norm": 2.1846365928649902, + "learning_rate": 1.780212108936685e-05, + "loss": 0.2436, + "step": 2234 + }, + { + "epoch": 0.8944, + "grad_norm": 5.118314266204834, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.4241, + "step": 2236 + }, + { + "epoch": 0.8952, + "grad_norm": 8.755192756652832, + "learning_rate": 1.7767064235919594e-05, + "loss": 0.5717, + "step": 2238 + }, + { + "epoch": 0.896, + "grad_norm": 8.251045227050781, + "learning_rate": 1.77494448870418e-05, + "loss": 0.6775, + "step": 2240 + }, + { + "epoch": 0.8968, + "grad_norm": 2.6442363262176514, + "learning_rate": 1.773176510643608e-05, + "loss": 0.3537, + "step": 2242 + }, + { + "epoch": 0.8976, + "grad_norm": 2.8393073081970215, + "learning_rate": 1.7714025031972894e-05, + "loss": 0.3711, + "step": 2244 + }, + { + "epoch": 0.8984, + "grad_norm": 4.766880035400391, + "learning_rate": 1.769622480199295e-05, + "loss": 0.3779, + "step": 2246 + }, + { + "epoch": 0.8992, + "grad_norm": 10.22143840789795, + "learning_rate": 1.7678364555305982e-05, + "loss": 0.5814, + "step": 2248 + }, + { + "epoch": 0.9, + "grad_norm": 9.3465576171875, + "learning_rate": 1.7660444431189777e-05, + "loss": 0.4841, + "step": 2250 + }, + { + "epoch": 0.9008, + "grad_norm": 2.9356963634490967, + "learning_rate": 1.76424645693891e-05, + "loss": 0.4369, + "step": 2252 + }, + { + "epoch": 0.9016, + "grad_norm": 6.478868007659912, + "learning_rate": 1.762442511011448e-05, + "loss": 0.6658, + "step": 2254 + }, + { + "epoch": 0.9024, + "grad_norm": 4.515495777130127, + "learning_rate": 1.7606326194041285e-05, + "loss": 0.2635, + "step": 2256 + }, + { + "epoch": 0.9032, + "grad_norm": 3.4764370918273926, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.3208, + "step": 2258 + }, + { + "epoch": 0.904, + "grad_norm": 4.561454772949219, + "learning_rate": 1.756995055651757e-05, + "loss": 0.5108, + "step": 2260 + }, + { + "epoch": 0.9048, + "grad_norm": 4.654769420623779, + "learning_rate": 1.7551674118731585e-05, + "loss": 0.3701, + "step": 2262 + }, + { + "epoch": 0.9056, + "grad_norm": 10.341426849365234, + "learning_rate": 1.7533338791473875e-05, + "loss": 0.6947, + "step": 2264 + }, + { + "epoch": 0.9064, + "grad_norm": 7.0824809074401855, + "learning_rate": 1.751494471772697e-05, + "loss": 0.4752, + "step": 2266 + }, + { + "epoch": 0.9072, + "grad_norm": 6.04065465927124, + "learning_rate": 1.7496492040931548e-05, + "loss": 0.3313, + "step": 2268 + }, + { + "epoch": 0.908, + "grad_norm": 3.093017578125, + "learning_rate": 1.747798090498533e-05, + "loss": 0.2161, + "step": 2270 + }, + { + "epoch": 0.9088, + "grad_norm": 3.0049962997436523, + "learning_rate": 1.745941145424182e-05, + "loss": 0.6934, + "step": 2272 + }, + { + "epoch": 0.9096, + "grad_norm": 2.6591107845306396, + "learning_rate": 1.744078383350938e-05, + "loss": 0.3383, + "step": 2274 + }, + { + "epoch": 0.9104, + "grad_norm": 9.022825241088867, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.5028, + "step": 2276 + }, + { + "epoch": 0.9112, + "grad_norm": 7.651523590087891, + "learning_rate": 1.7403354663577782e-05, + "loss": 0.8057, + "step": 2278 + }, + { + "epoch": 0.912, + "grad_norm": 5.598511219024658, + "learning_rate": 1.738455340625883e-05, + "loss": 0.7146, + "step": 2280 + }, + { + "epoch": 0.9128, + "grad_norm": 1.7681468725204468, + "learning_rate": 1.7365694562709038e-05, + "loss": 0.5604, + "step": 2282 + }, + { + "epoch": 0.9136, + "grad_norm": 4.347034454345703, + "learning_rate": 1.7346778279993433e-05, + "loss": 0.354, + "step": 2284 + }, + { + "epoch": 0.9144, + "grad_norm": 4.9559149742126465, + "learning_rate": 1.7327804705624962e-05, + "loss": 0.404, + "step": 2286 + }, + { + "epoch": 0.9152, + "grad_norm": 2.3942646980285645, + "learning_rate": 1.730877398756341e-05, + "loss": 0.6259, + "step": 2288 + }, + { + "epoch": 0.916, + "grad_norm": 6.6393351554870605, + "learning_rate": 1.7289686274214113e-05, + "loss": 0.3538, + "step": 2290 + }, + { + "epoch": 0.9168, + "grad_norm": 3.790372610092163, + "learning_rate": 1.727054171442693e-05, + "loss": 0.3411, + "step": 2292 + }, + { + "epoch": 0.9176, + "grad_norm": 4.417578220367432, + "learning_rate": 1.7251340457494934e-05, + "loss": 0.3735, + "step": 2294 + }, + { + "epoch": 0.9184, + "grad_norm": 2.050532579421997, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.2166, + "step": 2296 + }, + { + "epoch": 0.9192, + "grad_norm": 4.665719032287598, + "learning_rate": 1.7212768451578595e-05, + "loss": 0.2797, + "step": 2298 + }, + { + "epoch": 0.92, + "grad_norm": 3.98673415184021, + "learning_rate": 1.7193398003386517e-05, + "loss": 0.3617, + "step": 2300 + }, + { + "epoch": 0.9208, + "grad_norm": 3.666666030883789, + "learning_rate": 1.7173971459631803e-05, + "loss": 0.5871, + "step": 2302 + }, + { + "epoch": 0.9216, + "grad_norm": 4.225686550140381, + "learning_rate": 1.7154488971806525e-05, + "loss": 0.3627, + "step": 2304 + }, + { + "epoch": 0.9224, + "grad_norm": 2.3536264896392822, + "learning_rate": 1.713495069183907e-05, + "loss": 0.3145, + "step": 2306 + }, + { + "epoch": 0.9232, + "grad_norm": 2.9840025901794434, + "learning_rate": 1.7115356772092847e-05, + "loss": 0.5521, + "step": 2308 + }, + { + "epoch": 0.924, + "grad_norm": 3.0804269313812256, + "learning_rate": 1.709570736536522e-05, + "loss": 0.4238, + "step": 2310 + }, + { + "epoch": 0.9248, + "grad_norm": 2.455007791519165, + "learning_rate": 1.7076002624886152e-05, + "loss": 0.2472, + "step": 2312 + }, + { + "epoch": 0.9256, + "grad_norm": 7.137457847595215, + "learning_rate": 1.705624270431722e-05, + "loss": 0.6622, + "step": 2314 + }, + { + "epoch": 0.9264, + "grad_norm": 1.5967859029769897, + "learning_rate": 1.70364277577502e-05, + "loss": 0.1157, + "step": 2316 + }, + { + "epoch": 0.9272, + "grad_norm": 4.234429836273193, + "learning_rate": 1.7016557939706078e-05, + "loss": 0.3578, + "step": 2318 + }, + { + "epoch": 0.928, + "grad_norm": 4.573144435882568, + "learning_rate": 1.6996633405133673e-05, + "loss": 0.288, + "step": 2320 + }, + { + "epoch": 0.9288, + "grad_norm": 10.306396484375, + "learning_rate": 1.6976654309408468e-05, + "loss": 0.6993, + "step": 2322 + }, + { + "epoch": 0.9296, + "grad_norm": 10.781827926635742, + "learning_rate": 1.6956620808331515e-05, + "loss": 0.735, + "step": 2324 + }, + { + "epoch": 0.9304, + "grad_norm": 3.1664340496063232, + "learning_rate": 1.6936533058128042e-05, + "loss": 0.4528, + "step": 2326 + }, + { + "epoch": 0.9312, + "grad_norm": 7.0673298835754395, + "learning_rate": 1.691639121544641e-05, + "loss": 0.3242, + "step": 2328 + }, + { + "epoch": 0.932, + "grad_norm": 8.26816463470459, + "learning_rate": 1.6896195437356696e-05, + "loss": 0.6315, + "step": 2330 + }, + { + "epoch": 0.9328, + "grad_norm": 4.574904441833496, + "learning_rate": 1.6875945881349686e-05, + "loss": 0.4652, + "step": 2332 + }, + { + "epoch": 0.9336, + "grad_norm": 4.3775506019592285, + "learning_rate": 1.6855642705335435e-05, + "loss": 0.4898, + "step": 2334 + }, + { + "epoch": 0.9344, + "grad_norm": 4.384169578552246, + "learning_rate": 1.6835286067642228e-05, + "loss": 0.7099, + "step": 2336 + }, + { + "epoch": 0.9352, + "grad_norm": 4.223312854766846, + "learning_rate": 1.681487612701521e-05, + "loss": 0.5002, + "step": 2338 + }, + { + "epoch": 0.936, + "grad_norm": 5.141642093658447, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.4201, + "step": 2340 + }, + { + "epoch": 0.9368, + "grad_norm": 4.047835350036621, + "learning_rate": 1.677389697401739e-05, + "loss": 0.578, + "step": 2342 + }, + { + "epoch": 0.9376, + "grad_norm": 2.148810863494873, + "learning_rate": 1.675332808121025e-05, + "loss": 0.2867, + "step": 2344 + }, + { + "epoch": 0.9384, + "grad_norm": 6.967923164367676, + "learning_rate": 1.6732706524594145e-05, + "loss": 0.8902, + "step": 2346 + }, + { + "epoch": 0.9392, + "grad_norm": 9.913086891174316, + "learning_rate": 1.671203246498009e-05, + "loss": 0.4933, + "step": 2348 + }, + { + "epoch": 0.94, + "grad_norm": 3.1417829990386963, + "learning_rate": 1.6691306063588593e-05, + "loss": 0.3506, + "step": 2350 + }, + { + "epoch": 0.9408, + "grad_norm": 14.28138256072998, + "learning_rate": 1.6670527482048242e-05, + "loss": 1.1406, + "step": 2352 + }, + { + "epoch": 0.9416, + "grad_norm": 2.6398777961730957, + "learning_rate": 1.6649696882394635e-05, + "loss": 0.2026, + "step": 2354 + }, + { + "epoch": 0.9424, + "grad_norm": 3.2635467052459717, + "learning_rate": 1.6628814427068968e-05, + "loss": 0.7803, + "step": 2356 + }, + { + "epoch": 0.9432, + "grad_norm": 2.1623849868774414, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.2551, + "step": 2358 + }, + { + "epoch": 0.944, + "grad_norm": 2.4618899822235107, + "learning_rate": 1.6586894601186824e-05, + "loss": 0.4137, + "step": 2360 + }, + { + "epoch": 0.9448, + "grad_norm": 2.293994903564453, + "learning_rate": 1.656585755752957e-05, + "loss": 0.2418, + "step": 2362 + }, + { + "epoch": 0.9456, + "grad_norm": 2.1502673625946045, + "learning_rate": 1.6544769311996153e-05, + "loss": 0.3008, + "step": 2364 + }, + { + "epoch": 0.9464, + "grad_norm": 2.998347520828247, + "learning_rate": 1.6523630029036924e-05, + "loss": 0.4707, + "step": 2366 + }, + { + "epoch": 0.9472, + "grad_norm": 5.802120208740234, + "learning_rate": 1.6502439873500294e-05, + "loss": 0.4726, + "step": 2368 + }, + { + "epoch": 0.948, + "grad_norm": 11.09598159790039, + "learning_rate": 1.6481199010631305e-05, + "loss": 0.5139, + "step": 2370 + }, + { + "epoch": 0.9488, + "grad_norm": 5.688537120819092, + "learning_rate": 1.645990760607052e-05, + "loss": 0.3321, + "step": 2372 + }, + { + "epoch": 0.9496, + "grad_norm": 1.9983233213424683, + "learning_rate": 1.643856582585255e-05, + "loss": 0.394, + "step": 2374 + }, + { + "epoch": 0.9504, + "grad_norm": 6.963872909545898, + "learning_rate": 1.641717383640488e-05, + "loss": 0.3209, + "step": 2376 + }, + { + "epoch": 0.9512, + "grad_norm": 2.2653205394744873, + "learning_rate": 1.6395731804546596e-05, + "loss": 0.1529, + "step": 2378 + }, + { + "epoch": 0.952, + "grad_norm": 3.29524564743042, + "learning_rate": 1.63742398974869e-05, + "loss": 0.3794, + "step": 2380 + }, + { + "epoch": 0.9528, + "grad_norm": 2.852288007736206, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.2458, + "step": 2382 + }, + { + "epoch": 0.9536, + "grad_norm": 2.9293675422668457, + "learning_rate": 1.633110712854385e-05, + "loss": 0.3333, + "step": 2384 + }, + { + "epoch": 0.9544, + "grad_norm": 10.415287017822266, + "learning_rate": 1.6309466603018504e-05, + "loss": 0.5087, + "step": 2386 + }, + { + "epoch": 0.9552, + "grad_norm": 1.27485191822052, + "learning_rate": 1.6287776875005148e-05, + "loss": 0.2743, + "step": 2388 + }, + { + "epoch": 0.956, + "grad_norm": 2.6245243549346924, + "learning_rate": 1.6266038113644612e-05, + "loss": 0.516, + "step": 2390 + }, + { + "epoch": 0.9568, + "grad_norm": 4.452570915222168, + "learning_rate": 1.624425048846017e-05, + "loss": 0.4682, + "step": 2392 + }, + { + "epoch": 0.9576, + "grad_norm": 2.626039743423462, + "learning_rate": 1.6222414169356063e-05, + "loss": 0.2863, + "step": 2394 + }, + { + "epoch": 0.9584, + "grad_norm": 2.5585098266601562, + "learning_rate": 1.6200529326616343e-05, + "loss": 0.4538, + "step": 2396 + }, + { + "epoch": 0.9592, + "grad_norm": 5.5422539710998535, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.4687, + "step": 2398 + }, + { + "epoch": 0.96, + "grad_norm": 8.677401542663574, + "learning_rate": 1.6156614753256587e-05, + "loss": 0.2982, + "step": 2400 + }, + { + "epoch": 0.9608, + "grad_norm": 3.8012046813964844, + "learning_rate": 1.613458536509123e-05, + "loss": 0.2614, + "step": 2402 + }, + { + "epoch": 0.9616, + "grad_norm": 2.2905969619750977, + "learning_rate": 1.6112508138196922e-05, + "loss": 0.7023, + "step": 2404 + }, + { + "epoch": 0.9624, + "grad_norm": 2.1353418827056885, + "learning_rate": 1.6090383244736277e-05, + "loss": 0.423, + "step": 2406 + }, + { + "epoch": 0.9632, + "grad_norm": 1.8343054056167603, + "learning_rate": 1.606821085724363e-05, + "loss": 0.4421, + "step": 2408 + }, + { + "epoch": 0.964, + "grad_norm": 2.907493829727173, + "learning_rate": 1.6045991148623756e-05, + "loss": 0.3788, + "step": 2410 + }, + { + "epoch": 0.9648, + "grad_norm": 7.708912372589111, + "learning_rate": 1.602372429215038e-05, + "loss": 0.556, + "step": 2412 + }, + { + "epoch": 0.9656, + "grad_norm": 2.3259804248809814, + "learning_rate": 1.600141046146497e-05, + "loss": 0.2895, + "step": 2414 + }, + { + "epoch": 0.9664, + "grad_norm": 4.717739105224609, + "learning_rate": 1.597904983057519e-05, + "loss": 0.8317, + "step": 2416 + }, + { + "epoch": 0.9672, + "grad_norm": 3.4694042205810547, + "learning_rate": 1.5956642573853794e-05, + "loss": 0.2237, + "step": 2418 + }, + { + "epoch": 0.968, + "grad_norm": 6.2816033363342285, + "learning_rate": 1.5934188866037014e-05, + "loss": 0.2499, + "step": 2420 + }, + { + "epoch": 0.9688, + "grad_norm": 3.9102213382720947, + "learning_rate": 1.591168888222342e-05, + "loss": 0.4237, + "step": 2422 + }, + { + "epoch": 0.9696, + "grad_norm": 9.37143611907959, + "learning_rate": 1.5889142797872407e-05, + "loss": 0.5436, + "step": 2424 + }, + { + "epoch": 0.9704, + "grad_norm": 2.8910293579101562, + "learning_rate": 1.5866550788802818e-05, + "loss": 0.3519, + "step": 2426 + }, + { + "epoch": 0.9712, + "grad_norm": 3.6150565147399902, + "learning_rate": 1.584391303119173e-05, + "loss": 0.1368, + "step": 2428 + }, + { + "epoch": 0.972, + "grad_norm": 7.551123142242432, + "learning_rate": 1.582122970157289e-05, + "loss": 0.4774, + "step": 2430 + }, + { + "epoch": 0.9728, + "grad_norm": 8.42343807220459, + "learning_rate": 1.5798500976835503e-05, + "loss": 0.7228, + "step": 2432 + }, + { + "epoch": 0.9736, + "grad_norm": 5.844367027282715, + "learning_rate": 1.577572703422267e-05, + "loss": 0.3905, + "step": 2434 + }, + { + "epoch": 0.9744, + "grad_norm": 4.236822605133057, + "learning_rate": 1.575290805133024e-05, + "loss": 0.5367, + "step": 2436 + }, + { + "epoch": 0.9752, + "grad_norm": 5.707516670227051, + "learning_rate": 1.5730044206105156e-05, + "loss": 1.0149, + "step": 2438 + }, + { + "epoch": 0.976, + "grad_norm": 11.354463577270508, + "learning_rate": 1.570713567684432e-05, + "loss": 0.8809, + "step": 2440 + }, + { + "epoch": 0.9768, + "grad_norm": 4.4314446449279785, + "learning_rate": 1.5684182642193047e-05, + "loss": 0.7045, + "step": 2442 + }, + { + "epoch": 0.9776, + "grad_norm": 2.725847005844116, + "learning_rate": 1.566118528114367e-05, + "loss": 0.2321, + "step": 2444 + }, + { + "epoch": 0.9784, + "grad_norm": 3.7199370861053467, + "learning_rate": 1.563814377303429e-05, + "loss": 0.4067, + "step": 2446 + }, + { + "epoch": 0.9792, + "grad_norm": 3.164414167404175, + "learning_rate": 1.561505829754715e-05, + "loss": 0.3257, + "step": 2448 + }, + { + "epoch": 0.98, + "grad_norm": 2.3418188095092773, + "learning_rate": 1.5591929034707475e-05, + "loss": 0.4852, + "step": 2450 + }, + { + "epoch": 0.9808, + "grad_norm": 2.900113582611084, + "learning_rate": 1.5568756164881874e-05, + "loss": 0.3542, + "step": 2452 + }, + { + "epoch": 0.9816, + "grad_norm": 1.9147764444351196, + "learning_rate": 1.5545539868777085e-05, + "loss": 0.2348, + "step": 2454 + }, + { + "epoch": 0.9824, + "grad_norm": 4.320040702819824, + "learning_rate": 1.5522280327438384e-05, + "loss": 0.5722, + "step": 2456 + }, + { + "epoch": 0.9832, + "grad_norm": 2.6387248039245605, + "learning_rate": 1.5498977722248398e-05, + "loss": 0.3449, + "step": 2458 + }, + { + "epoch": 0.984, + "grad_norm": 1.3307088613510132, + "learning_rate": 1.547563223492552e-05, + "loss": 0.3057, + "step": 2460 + }, + { + "epoch": 0.9848, + "grad_norm": 3.4244158267974854, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.4097, + "step": 2462 + }, + { + "epoch": 0.9856, + "grad_norm": 3.6758108139038086, + "learning_rate": 1.5428813342425194e-05, + "loss": 0.4778, + "step": 2464 + }, + { + "epoch": 0.9864, + "grad_norm": 26.934717178344727, + "learning_rate": 1.5405340302350876e-05, + "loss": 1.4515, + "step": 2466 + }, + { + "epoch": 0.9872, + "grad_norm": 5.977929592132568, + "learning_rate": 1.538182511034708e-05, + "loss": 0.4473, + "step": 2468 + }, + { + "epoch": 0.988, + "grad_norm": 1.2147676944732666, + "learning_rate": 1.535826794978996e-05, + "loss": 0.3552, + "step": 2470 + }, + { + "epoch": 0.9888, + "grad_norm": 5.006214618682861, + "learning_rate": 1.5334669004383036e-05, + "loss": 0.9009, + "step": 2472 + }, + { + "epoch": 0.9896, + "grad_norm": 8.950480461120605, + "learning_rate": 1.5311028458155564e-05, + "loss": 0.9034, + "step": 2474 + }, + { + "epoch": 0.9904, + "grad_norm": 3.063087224960327, + "learning_rate": 1.528734649546133e-05, + "loss": 0.4858, + "step": 2476 + }, + { + "epoch": 0.9912, + "grad_norm": 9.52176284790039, + "learning_rate": 1.5263623300976997e-05, + "loss": 0.4892, + "step": 2478 + }, + { + "epoch": 0.992, + "grad_norm": 1.4161584377288818, + "learning_rate": 1.5239859059700792e-05, + "loss": 0.2002, + "step": 2480 + }, + { + "epoch": 0.9928, + "grad_norm": 5.208809852600098, + "learning_rate": 1.5216053956951096e-05, + "loss": 0.5075, + "step": 2482 + }, + { + "epoch": 0.9936, + "grad_norm": 5.600852966308594, + "learning_rate": 1.5192208178364819e-05, + "loss": 0.5732, + "step": 2484 + }, + { + "epoch": 0.9944, + "grad_norm": 3.0066847801208496, + "learning_rate": 1.5168321909896176e-05, + "loss": 0.331, + "step": 2486 + }, + { + "epoch": 0.9952, + "grad_norm": 3.0223422050476074, + "learning_rate": 1.5144395337815057e-05, + "loss": 0.2142, + "step": 2488 + }, + { + "epoch": 0.996, + "grad_norm": 2.803382635116577, + "learning_rate": 1.5120428648705722e-05, + "loss": 0.2565, + "step": 2490 + }, + { + "epoch": 0.9968, + "grad_norm": 7.8674445152282715, + "learning_rate": 1.5096422029465171e-05, + "loss": 0.6374, + "step": 2492 + }, + { + "epoch": 0.9976, + "grad_norm": 5.142702579498291, + "learning_rate": 1.5072375667301904e-05, + "loss": 0.414, + "step": 2494 + }, + { + "epoch": 0.9984, + "grad_norm": 5.461601734161377, + "learning_rate": 1.5048289749734231e-05, + "loss": 0.4925, + "step": 2496 + }, + { + "epoch": 0.9992, + "grad_norm": 7.165497303009033, + "learning_rate": 1.502416446458898e-05, + "loss": 0.496, + "step": 2498 + }, + { + "epoch": 1.0, + "grad_norm": 3.7376537322998047, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.3629, + "step": 2500 + }, + { + "epoch": 1.0, + "step": 2500, + "total_flos": 1.0020465781768192e+16, + "train_loss": 0.5193727528691292, + "train_runtime": 17799.0301, + "train_samples_per_second": 2.247, + "train_steps_per_second": 0.14 + } + ], + "logging_steps": 2, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1.0020465781768192e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f5bc49911711b3fc42970112cdf1d9bb1d4315a --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0b639778d4e1f7aae966d0b330009ecbda922dd8869a6613e0eda2954c0915b +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..b511825b8b9e3b1dac247f297e0874c999b89776 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e08f705eab7c3e575cee2a2e93a8f7e06e7514bce97e6eb4353b7448c6472262 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a820d6dc33485218ef15ad29b849dcc17e3924c --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e535c21196e01af3ba8bd5b2d6cad8060cb4f93528c909e6610c444f3678e52b +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f4ac6d9874913fa17651737eacc2204b1e2ee761 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bde55ce76a5fd1470d6214ecfa14613869e50b34f417538b17eae2e496c9f7c6 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b5106ca7b3fb8d79680edb26222a32d81ce9febc --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,7532 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "learning_rate": 2.357535430610912e-06, + "loss": 0.4714, + "step": 2 + }, + { + "epoch": 0.0016, + "learning_rate": 2.3755748898855234e-06, + "loss": 0.2971, + "step": 4 + }, + { + "epoch": 0.0024, + "learning_rate": 2.3936738059587174e-06, + "loss": 0.322, + "step": 6 + }, + { + "epoch": 0.0032, + "learning_rate": 2.411832037691545e-06, + "loss": 1.7913, + "step": 8 + }, + { + "epoch": 0.004, + "learning_rate": 2.430049443482434e-06, + "loss": 0.3682, + "step": 10 + }, + { + "epoch": 0.0048, + "learning_rate": 2.448325881268406e-06, + "loss": 0.4075, + "step": 12 + }, + { + "epoch": 0.0056, + "learning_rate": 2.4666612085261277e-06, + "loss": 0.2954, + "step": 14 + }, + { + "epoch": 0.0064, + "learning_rate": 2.4850552822730346e-06, + "loss": 0.2551, + "step": 16 + }, + { + "epoch": 0.0072, + "learning_rate": 2.503507959068455e-06, + "loss": 0.67, + "step": 18 + }, + { + "epoch": 0.008, + "learning_rate": 2.522019095014686e-06, + "loss": 0.4827, + "step": 20 + }, + { + "epoch": 0.0088, + "learning_rate": 2.5405885457581814e-06, + "loss": 0.8245, + "step": 22 + }, + { + "epoch": 0.0096, + "learning_rate": 2.5592161664906243e-06, + "loss": 0.3826, + "step": 24 + }, + { + "epoch": 0.0104, + "learning_rate": 2.5779018119501086e-06, + "loss": 0.2266, + "step": 26 + }, + { + "epoch": 0.0112, + "learning_rate": 2.596645336422219e-06, + "loss": 0.808, + "step": 28 + }, + { + "epoch": 0.012, + "learning_rate": 2.615446593741161e-06, + "loss": 0.4032, + "step": 30 + }, + { + "epoch": 0.0128, + "learning_rate": 2.6343054372909648e-06, + "loss": 0.5288, + "step": 32 + }, + { + "epoch": 0.0136, + "learning_rate": 2.6532217200065826e-06, + "loss": 0.279, + "step": 34 + }, + { + "epoch": 0.0144, + "learning_rate": 2.6721952943750396e-06, + "loss": 0.6505, + "step": 36 + }, + { + "epoch": 0.0152, + "learning_rate": 2.691226012436604e-06, + "loss": 0.2859, + "step": 38 + }, + { + "epoch": 0.016, + "learning_rate": 2.7103137257858893e-06, + "loss": 0.4549, + "step": 40 + }, + { + "epoch": 0.0168, + "learning_rate": 2.7294582855730733e-06, + "loss": 1.0067, + "step": 42 + }, + { + "epoch": 0.0176, + "learning_rate": 2.7486595425050566e-06, + "loss": 0.6952, + "step": 44 + }, + { + "epoch": 0.0184, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.3269, + "step": 46 + }, + { + "epoch": 0.0192, + "learning_rate": 2.7872315484213954e-06, + "loss": 0.4172, + "step": 48 + }, + { + "epoch": 0.02, + "learning_rate": 2.8066019966134873e-06, + "loss": 0.3755, + "step": 50 + }, + { + "epoch": 0.0208, + "learning_rate": 2.826028540368212e-06, + "loss": 0.5115, + "step": 52 + }, + { + "epoch": 0.0216, + "learning_rate": 2.845511028193477e-06, + "loss": 0.281, + "step": 54 + }, + { + "epoch": 0.0224, + "learning_rate": 2.865049308160931e-06, + "loss": 0.4912, + "step": 56 + }, + { + "epoch": 0.0232, + "learning_rate": 2.8846432279071533e-06, + "loss": 0.3886, + "step": 58 + }, + { + "epoch": 0.024, + "learning_rate": 2.9042926346347835e-06, + "loss": 0.3006, + "step": 60 + }, + { + "epoch": 0.0248, + "learning_rate": 2.9239973751138397e-06, + "loss": 0.5139, + "step": 62 + }, + { + "epoch": 0.0256, + "learning_rate": 2.943757295682783e-06, + "loss": 0.334, + "step": 64 + }, + { + "epoch": 0.0264, + "learning_rate": 2.9635722422497983e-06, + "loss": 0.4278, + "step": 66 + }, + { + "epoch": 0.0272, + "learning_rate": 2.983442060293926e-06, + "loss": 0.4833, + "step": 68 + }, + { + "epoch": 0.028, + "learning_rate": 3.003366594866345e-06, + "loss": 0.4375, + "step": 70 + }, + { + "epoch": 0.0288, + "learning_rate": 3.0233456905915338e-06, + "loss": 0.3418, + "step": 72 + }, + { + "epoch": 0.0296, + "learning_rate": 3.0433791916684885e-06, + "loss": 0.3205, + "step": 74 + }, + { + "epoch": 0.0304, + "learning_rate": 3.0634669418719453e-06, + "loss": 0.6138, + "step": 76 + }, + { + "epoch": 0.0312, + "learning_rate": 3.0836087845535933e-06, + "loss": 0.303, + "step": 78 + }, + { + "epoch": 0.032, + "learning_rate": 3.1038045626432945e-06, + "loss": 0.5614, + "step": 80 + }, + { + "epoch": 0.0328, + "learning_rate": 3.1240541186503173e-06, + "loss": 0.2131, + "step": 82 + }, + { + "epoch": 0.0336, + "learning_rate": 3.1443572946645683e-06, + "loss": 0.3757, + "step": 84 + }, + { + "epoch": 0.0344, + "learning_rate": 3.164713932357776e-06, + "loss": 0.4551, + "step": 86 + }, + { + "epoch": 0.0352, + "learning_rate": 3.1851238729848033e-06, + "loss": 0.6274, + "step": 88 + }, + { + "epoch": 0.036, + "learning_rate": 3.205586957384834e-06, + "loss": 0.3222, + "step": 90 + }, + { + "epoch": 0.0368, + "learning_rate": 3.2261030259826253e-06, + "loss": 0.6092, + "step": 92 + }, + { + "epoch": 0.0376, + "learning_rate": 3.246671918789752e-06, + "loss": 0.8778, + "step": 94 + }, + { + "epoch": 0.0384, + "learning_rate": 3.267293475405858e-06, + "loss": 0.5037, + "step": 96 + }, + { + "epoch": 0.0392, + "learning_rate": 3.2879675350199004e-06, + "loss": 0.5835, + "step": 98 + }, + { + "epoch": 0.04, + "learning_rate": 3.3086939364114113e-06, + "loss": 0.5834, + "step": 100 + }, + { + "epoch": 0.0408, + "learning_rate": 3.329472517951747e-06, + "loss": 0.5101, + "step": 102 + }, + { + "epoch": 0.0416, + "learning_rate": 3.350303117605369e-06, + "loss": 0.7538, + "step": 104 + }, + { + "epoch": 0.0424, + "learning_rate": 3.3711855729310503e-06, + "loss": 0.251, + "step": 106 + }, + { + "epoch": 0.0432, + "learning_rate": 3.3921197210832235e-06, + "loss": 1.0623, + "step": 108 + }, + { + "epoch": 0.044, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.2531, + "step": 110 + }, + { + "epoch": 0.0448, + "learning_rate": 3.434142442470434e-06, + "loss": 0.376, + "step": 112 + }, + { + "epoch": 0.0456, + "learning_rate": 3.455230688003849e-06, + "loss": 0.245, + "step": 114 + }, + { + "epoch": 0.0464, + "learning_rate": 3.476369970963065e-06, + "loss": 0.4318, + "step": 116 + }, + { + "epoch": 0.0472, + "learning_rate": 3.497560126499706e-06, + "loss": 0.2827, + "step": 118 + }, + { + "epoch": 0.048, + "learning_rate": 3.5188009893686836e-06, + "loss": 0.3698, + "step": 120 + }, + { + "epoch": 0.0488, + "learning_rate": 3.5400923939294827e-06, + "loss": 0.948, + "step": 122 + }, + { + "epoch": 0.0496, + "learning_rate": 3.5614341741474667e-06, + "loss": 0.6545, + "step": 124 + }, + { + "epoch": 0.0504, + "learning_rate": 3.5828261635951177e-06, + "loss": 0.2865, + "step": 126 + }, + { + "epoch": 0.0512, + "learning_rate": 3.604268195453421e-06, + "loss": 0.4351, + "step": 128 + }, + { + "epoch": 0.052, + "learning_rate": 3.6257601025130893e-06, + "loss": 0.6385, + "step": 130 + }, + { + "epoch": 0.0528, + "learning_rate": 3.647301717175955e-06, + "loss": 0.2678, + "step": 132 + }, + { + "epoch": 0.0536, + "learning_rate": 3.66889287145614e-06, + "loss": 0.3828, + "step": 134 + }, + { + "epoch": 0.0544, + "learning_rate": 3.6905333969814995e-06, + "loss": 0.2692, + "step": 136 + }, + { + "epoch": 0.0552, + "learning_rate": 3.712223124994867e-06, + "loss": 0.2416, + "step": 138 + }, + { + "epoch": 0.056, + "learning_rate": 3.7339618863553885e-06, + "loss": 0.4302, + "step": 140 + }, + { + "epoch": 0.0568, + "learning_rate": 3.755749511539848e-06, + "loss": 0.5065, + "step": 142 + }, + { + "epoch": 0.0576, + "learning_rate": 3.7775858306439404e-06, + "loss": 0.6851, + "step": 144 + }, + { + "epoch": 0.0584, + "learning_rate": 3.799470673383677e-06, + "loss": 0.3453, + "step": 146 + }, + { + "epoch": 0.0592, + "learning_rate": 3.821403869096644e-06, + "loss": 0.4392, + "step": 148 + }, + { + "epoch": 0.06, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.3607, + "step": 150 + }, + { + "epoch": 0.0608, + "learning_rate": 3.865414634908756e-06, + "loss": 0.5525, + "step": 152 + }, + { + "epoch": 0.0616, + "learning_rate": 3.887491861803081e-06, + "loss": 0.5969, + "step": 154 + }, + { + "epoch": 0.0624, + "learning_rate": 3.909616755263741e-06, + "loss": 0.5114, + "step": 156 + }, + { + "epoch": 0.0632, + "learning_rate": 3.9317891427563725e-06, + "loss": 0.3365, + "step": 158 + }, + { + "epoch": 0.064, + "learning_rate": 3.954008851376244e-06, + "loss": 0.3368, + "step": 160 + }, + { + "epoch": 0.0648, + "learning_rate": 3.976275707849619e-06, + "loss": 0.4222, + "step": 162 + }, + { + "epoch": 0.0656, + "learning_rate": 3.99858953853505e-06, + "loss": 0.6011, + "step": 164 + }, + { + "epoch": 0.0664, + "learning_rate": 4.0209501694248e-06, + "loss": 0.619, + "step": 166 + }, + { + "epoch": 0.0672, + "learning_rate": 4.043357426146209e-06, + "loss": 0.9463, + "step": 168 + }, + { + "epoch": 0.068, + "learning_rate": 4.065811133962987e-06, + "loss": 0.3358, + "step": 170 + }, + { + "epoch": 0.0688, + "learning_rate": 4.08831111777658e-06, + "loss": 0.5158, + "step": 172 + }, + { + "epoch": 0.0696, + "learning_rate": 4.110857202127611e-06, + "loss": 0.4278, + "step": 174 + }, + { + "epoch": 0.0704, + "learning_rate": 4.133449211197183e-06, + "loss": 0.3824, + "step": 176 + }, + { + "epoch": 0.0712, + "learning_rate": 4.156086968808274e-06, + "loss": 0.3289, + "step": 178 + }, + { + "epoch": 0.072, + "learning_rate": 4.178770298427114e-06, + "loss": 0.2942, + "step": 180 + }, + { + "epoch": 0.0728, + "learning_rate": 4.201499023164515e-06, + "loss": 0.2927, + "step": 182 + }, + { + "epoch": 0.0736, + "learning_rate": 4.224272965777315e-06, + "loss": 0.3445, + "step": 184 + }, + { + "epoch": 0.0744, + "learning_rate": 4.247091948669764e-06, + "loss": 0.2638, + "step": 186 + }, + { + "epoch": 0.0752, + "learning_rate": 4.269955793894849e-06, + "loss": 0.3631, + "step": 188 + }, + { + "epoch": 0.076, + "learning_rate": 4.292864323155684e-06, + "loss": 0.4863, + "step": 190 + }, + { + "epoch": 0.0768, + "learning_rate": 4.3158173578069696e-06, + "loss": 0.9163, + "step": 192 + }, + { + "epoch": 0.0776, + "learning_rate": 4.338814718856333e-06, + "loss": 0.4195, + "step": 194 + }, + { + "epoch": 0.0784, + "learning_rate": 4.3618562269657285e-06, + "loss": 0.291, + "step": 196 + }, + { + "epoch": 0.0792, + "learning_rate": 4.384941702452852e-06, + "loss": 1.2305, + "step": 198 + }, + { + "epoch": 0.08, + "learning_rate": 4.408070965292526e-06, + "loss": 0.2542, + "step": 200 + }, + { + "epoch": 0.0808, + "learning_rate": 4.431243835118112e-06, + "loss": 0.4672, + "step": 202 + }, + { + "epoch": 0.0816, + "learning_rate": 4.4544601312229185e-06, + "loss": 0.7299, + "step": 204 + }, + { + "epoch": 0.0824, + "learning_rate": 4.477719672561602e-06, + "loss": 0.3302, + "step": 206 + }, + { + "epoch": 0.0832, + "learning_rate": 4.501022277751605e-06, + "loss": 0.5331, + "step": 208 + }, + { + "epoch": 0.084, + "learning_rate": 4.524367765074499e-06, + "loss": 0.2821, + "step": 210 + }, + { + "epoch": 0.0848, + "learning_rate": 4.5477559524775e-06, + "loss": 0.3516, + "step": 212 + }, + { + "epoch": 0.0856, + "learning_rate": 4.571186657574823e-06, + "loss": 0.477, + "step": 214 + }, + { + "epoch": 0.0864, + "learning_rate": 4.5946596976491254e-06, + "loss": 0.7272, + "step": 216 + }, + { + "epoch": 0.0872, + "learning_rate": 4.618174889652924e-06, + "loss": 0.29, + "step": 218 + }, + { + "epoch": 0.088, + "learning_rate": 4.6417320502100286e-06, + "loss": 0.5942, + "step": 220 + }, + { + "epoch": 0.0888, + "learning_rate": 4.665330995616967e-06, + "loss": 0.2344, + "step": 222 + }, + { + "epoch": 0.0896, + "learning_rate": 4.688971541844424e-06, + "loss": 0.6269, + "step": 224 + }, + { + "epoch": 0.0904, + "learning_rate": 4.712653504538672e-06, + "loss": 0.3763, + "step": 226 + }, + { + "epoch": 0.0912, + "learning_rate": 4.736376699023023e-06, + "loss": 0.47, + "step": 228 + }, + { + "epoch": 0.092, + "learning_rate": 4.76014094029921e-06, + "loss": 0.3322, + "step": 230 + }, + { + "epoch": 0.0928, + "learning_rate": 4.7839460430489216e-06, + "loss": 0.364, + "step": 232 + }, + { + "epoch": 0.0936, + "learning_rate": 4.807791821635185e-06, + "loss": 0.282, + "step": 234 + }, + { + "epoch": 0.0944, + "learning_rate": 4.831678090103828e-06, + "loss": 0.6493, + "step": 236 + }, + { + "epoch": 0.0952, + "learning_rate": 4.855604662184931e-06, + "loss": 0.452, + "step": 238 + }, + { + "epoch": 0.096, + "learning_rate": 4.8795713512942785e-06, + "loss": 0.3573, + "step": 240 + }, + { + "epoch": 0.0968, + "learning_rate": 4.903577970534815e-06, + "loss": 0.345, + "step": 242 + }, + { + "epoch": 0.0976, + "learning_rate": 4.9276243326981e-06, + "loss": 0.3725, + "step": 244 + }, + { + "epoch": 0.0984, + "learning_rate": 4.951710250265788e-06, + "loss": 0.5144, + "step": 246 + }, + { + "epoch": 0.0992, + "learning_rate": 4.975835535411023e-06, + "loss": 0.3763, + "step": 248 + }, + { + "epoch": 0.1, + "learning_rate": 5.000000000000003e-06, + "loss": 0.3102, + "step": 250 + }, + { + "epoch": 0.1008, + "learning_rate": 5.024203455593375e-06, + "loss": 0.6165, + "step": 252 + }, + { + "epoch": 0.1016, + "learning_rate": 5.048445713447734e-06, + "loss": 0.2472, + "step": 254 + }, + { + "epoch": 0.1024, + "learning_rate": 5.072726584517083e-06, + "loss": 0.3165, + "step": 256 + }, + { + "epoch": 0.1032, + "learning_rate": 5.097045879454308e-06, + "loss": 0.684, + "step": 258 + }, + { + "epoch": 0.104, + "learning_rate": 5.1214034086126685e-06, + "loss": 0.2145, + "step": 260 + }, + { + "epoch": 0.1048, + "learning_rate": 5.145798982047253e-06, + "loss": 0.711, + "step": 262 + }, + { + "epoch": 0.1056, + "learning_rate": 5.170232409516483e-06, + "loss": 0.2654, + "step": 264 + }, + { + "epoch": 0.1064, + "learning_rate": 5.194703500483597e-06, + "loss": 0.3271, + "step": 266 + }, + { + "epoch": 0.1072, + "learning_rate": 5.219212064118082e-06, + "loss": 0.2825, + "step": 268 + }, + { + "epoch": 0.108, + "learning_rate": 5.24375790929725e-06, + "loss": 0.3206, + "step": 270 + }, + { + "epoch": 0.1088, + "learning_rate": 5.268340844607653e-06, + "loss": 0.2309, + "step": 272 + }, + { + "epoch": 0.1096, + "learning_rate": 5.2929606783466735e-06, + "loss": 0.5375, + "step": 274 + }, + { + "epoch": 0.1104, + "learning_rate": 5.317617218523853e-06, + "loss": 0.4645, + "step": 276 + }, + { + "epoch": 0.1112, + "learning_rate": 5.342310272862553e-06, + "loss": 0.5901, + "step": 278 + }, + { + "epoch": 0.112, + "learning_rate": 5.367039648801377e-06, + "loss": 0.5244, + "step": 280 + }, + { + "epoch": 0.1128, + "learning_rate": 5.391805153495684e-06, + "loss": 0.4362, + "step": 282 + }, + { + "epoch": 0.1136, + "learning_rate": 5.416606593819109e-06, + "loss": 0.284, + "step": 284 + }, + { + "epoch": 0.1144, + "learning_rate": 5.441443776365005e-06, + "loss": 0.3989, + "step": 286 + }, + { + "epoch": 0.1152, + "learning_rate": 5.466316507448053e-06, + "loss": 0.9494, + "step": 288 + }, + { + "epoch": 0.116, + "learning_rate": 5.49122459310568e-06, + "loss": 0.3183, + "step": 290 + }, + { + "epoch": 0.1168, + "learning_rate": 5.516167839099662e-06, + "loss": 0.4315, + "step": 292 + }, + { + "epoch": 0.1176, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.5278, + "step": 294 + }, + { + "epoch": 0.1184, + "learning_rate": 5.5661590337742255e-06, + "loss": 0.4146, + "step": 296 + }, + { + "epoch": 0.1192, + "learning_rate": 5.591206592613412e-06, + "loss": 0.4265, + "step": 298 + }, + { + "epoch": 0.12, + "learning_rate": 5.616288532109221e-06, + "loss": 0.4033, + "step": 300 + }, + { + "epoch": 0.1208, + "learning_rate": 5.641404656667652e-06, + "loss": 0.3881, + "step": 302 + }, + { + "epoch": 0.1216, + "learning_rate": 5.666554770428136e-06, + "loss": 0.5939, + "step": 304 + }, + { + "epoch": 0.1224, + "learning_rate": 5.6917386772650015e-06, + "loss": 0.4928, + "step": 306 + }, + { + "epoch": 0.1232, + "learning_rate": 5.716956180789086e-06, + "loss": 0.2829, + "step": 308 + }, + { + "epoch": 0.124, + "learning_rate": 5.74220708434926e-06, + "loss": 0.7071, + "step": 310 + }, + { + "epoch": 0.1248, + "learning_rate": 5.767491191033909e-06, + "loss": 0.4297, + "step": 312 + }, + { + "epoch": 0.1256, + "learning_rate": 5.7928083036724535e-06, + "loss": 0.433, + "step": 314 + }, + { + "epoch": 0.1264, + "learning_rate": 5.818158224836983e-06, + "loss": 0.3571, + "step": 316 + }, + { + "epoch": 0.1272, + "learning_rate": 5.8435407568437194e-06, + "loss": 0.441, + "step": 318 + }, + { + "epoch": 0.128, + "learning_rate": 5.868955701754577e-06, + "loss": 0.3291, + "step": 320 + }, + { + "epoch": 0.1288, + "learning_rate": 5.894402861378714e-06, + "loss": 0.4948, + "step": 322 + }, + { + "epoch": 0.1296, + "learning_rate": 5.919882037274065e-06, + "loss": 0.431, + "step": 324 + }, + { + "epoch": 0.1304, + "learning_rate": 5.9453930307488985e-06, + "loss": 0.3465, + "step": 326 + }, + { + "epoch": 0.1312, + "learning_rate": 5.970935642863362e-06, + "loss": 0.6457, + "step": 328 + }, + { + "epoch": 0.132, + "learning_rate": 5.996509674431038e-06, + "loss": 0.243, + "step": 330 + }, + { + "epoch": 0.1328, + "learning_rate": 6.022114926020505e-06, + "loss": 0.3487, + "step": 332 + }, + { + "epoch": 0.1336, + "learning_rate": 6.047751197956836e-06, + "loss": 0.4426, + "step": 334 + }, + { + "epoch": 0.1344, + "learning_rate": 6.0734182903232475e-06, + "loss": 0.5499, + "step": 336 + }, + { + "epoch": 0.1352, + "learning_rate": 6.0991160029626e-06, + "loss": 0.3444, + "step": 338 + }, + { + "epoch": 0.136, + "learning_rate": 6.124844135478966e-06, + "loss": 0.2943, + "step": 340 + }, + { + "epoch": 0.1368, + "learning_rate": 6.1506024872392e-06, + "loss": 0.4057, + "step": 342 + }, + { + "epoch": 0.1376, + "learning_rate": 6.176390857374501e-06, + "loss": 0.5759, + "step": 344 + }, + { + "epoch": 0.1384, + "learning_rate": 6.202209044781979e-06, + "loss": 0.263, + "step": 346 + }, + { + "epoch": 0.1392, + "learning_rate": 6.228056848126223e-06, + "loss": 0.3439, + "step": 348 + }, + { + "epoch": 0.14, + "learning_rate": 6.253934065840883e-06, + "loss": 0.4268, + "step": 350 + }, + { + "epoch": 0.1408, + "learning_rate": 6.279840496130188e-06, + "loss": 0.3465, + "step": 352 + }, + { + "epoch": 0.1416, + "learning_rate": 6.305775936970606e-06, + "loss": 0.276, + "step": 354 + }, + { + "epoch": 0.1424, + "learning_rate": 6.331740186112359e-06, + "loss": 0.4401, + "step": 356 + }, + { + "epoch": 0.1432, + "learning_rate": 6.357733041081015e-06, + "loss": 0.5032, + "step": 358 + }, + { + "epoch": 0.144, + "learning_rate": 6.383754299179072e-06, + "loss": 0.653, + "step": 360 + }, + { + "epoch": 0.1448, + "learning_rate": 6.409803757487532e-06, + "loss": 0.4657, + "step": 362 + }, + { + "epoch": 0.1456, + "learning_rate": 6.435881212867485e-06, + "loss": 0.632, + "step": 364 + }, + { + "epoch": 0.1464, + "learning_rate": 6.4619864619616975e-06, + "loss": 0.3643, + "step": 366 + }, + { + "epoch": 0.1472, + "learning_rate": 6.48811930119619e-06, + "loss": 0.3886, + "step": 368 + }, + { + "epoch": 0.148, + "learning_rate": 6.514279526781853e-06, + "loss": 0.2475, + "step": 370 + }, + { + "epoch": 0.1488, + "learning_rate": 6.540466934715955e-06, + "loss": 0.1889, + "step": 372 + }, + { + "epoch": 0.1496, + "learning_rate": 6.566681320783848e-06, + "loss": 0.2385, + "step": 374 + }, + { + "epoch": 0.1504, + "learning_rate": 6.592922480560483e-06, + "loss": 0.2743, + "step": 376 + }, + { + "epoch": 0.1512, + "learning_rate": 6.619190209412025e-06, + "loss": 0.2938, + "step": 378 + }, + { + "epoch": 0.152, + "learning_rate": 6.6454843024974465e-06, + "loss": 0.4063, + "step": 380 + }, + { + "epoch": 0.1528, + "learning_rate": 6.671804554770128e-06, + "loss": 0.5354, + "step": 382 + }, + { + "epoch": 0.1536, + "learning_rate": 6.698150760979456e-06, + "loss": 0.2152, + "step": 384 + }, + { + "epoch": 0.1544, + "learning_rate": 6.724522715672421e-06, + "loss": 0.2733, + "step": 386 + }, + { + "epoch": 0.1552, + "learning_rate": 6.750920213195242e-06, + "loss": 0.7761, + "step": 388 + }, + { + "epoch": 0.156, + "learning_rate": 6.777343047694894e-06, + "loss": 0.5654, + "step": 390 + }, + { + "epoch": 0.1568, + "learning_rate": 6.803791013120824e-06, + "loss": 0.3181, + "step": 392 + }, + { + "epoch": 0.1576, + "learning_rate": 6.8302639032264836e-06, + "loss": 0.348, + "step": 394 + }, + { + "epoch": 0.1584, + "learning_rate": 6.856761511570944e-06, + "loss": 0.2651, + "step": 396 + }, + { + "epoch": 0.1592, + "learning_rate": 6.883283631520579e-06, + "loss": 0.4322, + "step": 398 + }, + { + "epoch": 0.16, + "learning_rate": 6.909830056250522e-06, + "loss": 0.2099, + "step": 400 + }, + { + "epoch": 0.1608, + "learning_rate": 6.936400578746436e-06, + "loss": 0.3784, + "step": 402 + }, + { + "epoch": 0.1616, + "learning_rate": 6.96299499180605e-06, + "loss": 0.3442, + "step": 404 + }, + { + "epoch": 0.1624, + "learning_rate": 6.989613088040787e-06, + "loss": 0.3624, + "step": 406 + }, + { + "epoch": 0.1632, + "learning_rate": 7.016254659877404e-06, + "loss": 0.3468, + "step": 408 + }, + { + "epoch": 0.164, + "learning_rate": 7.042919499559539e-06, + "loss": 0.4575, + "step": 410 + }, + { + "epoch": 0.1648, + "learning_rate": 7.06960739914943e-06, + "loss": 0.797, + "step": 412 + }, + { + "epoch": 0.1656, + "learning_rate": 7.09631815052946e-06, + "loss": 0.2691, + "step": 414 + }, + { + "epoch": 0.1664, + "learning_rate": 7.123051545403873e-06, + "loss": 0.3538, + "step": 416 + }, + { + "epoch": 0.1672, + "learning_rate": 7.1498073753002375e-06, + "loss": 0.4981, + "step": 418 + }, + { + "epoch": 0.168, + "learning_rate": 7.1765854315712325e-06, + "loss": 0.9161, + "step": 420 + }, + { + "epoch": 0.1688, + "learning_rate": 7.203385505396197e-06, + "loss": 0.961, + "step": 422 + }, + { + "epoch": 0.1696, + "learning_rate": 7.230207387782771e-06, + "loss": 0.3676, + "step": 424 + }, + { + "epoch": 0.1704, + "learning_rate": 7.257050869568527e-06, + "loss": 0.4763, + "step": 426 + }, + { + "epoch": 0.1712, + "learning_rate": 7.28391574142262e-06, + "loss": 0.3739, + "step": 428 + }, + { + "epoch": 0.172, + "learning_rate": 7.3108017938473485e-06, + "loss": 0.2436, + "step": 430 + }, + { + "epoch": 0.1728, + "learning_rate": 7.337708817179875e-06, + "loss": 0.4273, + "step": 432 + }, + { + "epoch": 0.1736, + "learning_rate": 7.36463660159386e-06, + "loss": 0.3465, + "step": 434 + }, + { + "epoch": 0.1744, + "learning_rate": 7.39158493710103e-06, + "loss": 0.8465, + "step": 436 + }, + { + "epoch": 0.1752, + "learning_rate": 7.418553613552822e-06, + "loss": 0.5765, + "step": 438 + }, + { + "epoch": 0.176, + "learning_rate": 7.445542420642091e-06, + "loss": 0.3633, + "step": 440 + }, + { + "epoch": 0.1768, + "learning_rate": 7.472551147904703e-06, + "loss": 0.3359, + "step": 442 + }, + { + "epoch": 0.1776, + "learning_rate": 7.499579584721173e-06, + "loss": 0.8696, + "step": 444 + }, + { + "epoch": 0.1784, + "learning_rate": 7.5266275203183395e-06, + "loss": 0.2987, + "step": 446 + }, + { + "epoch": 0.1792, + "learning_rate": 7.553694743770917e-06, + "loss": 0.6482, + "step": 448 + }, + { + "epoch": 0.18, + "learning_rate": 7.580781044003312e-06, + "loss": 0.476, + "step": 450 + }, + { + "epoch": 0.1808, + "learning_rate": 7.607886209791095e-06, + "loss": 0.7378, + "step": 452 + }, + { + "epoch": 0.1816, + "learning_rate": 7.635010029762755e-06, + "loss": 0.3455, + "step": 454 + }, + { + "epoch": 0.1824, + "learning_rate": 7.662152292401265e-06, + "loss": 0.4921, + "step": 456 + }, + { + "epoch": 0.1832, + "learning_rate": 7.689312786045822e-06, + "loss": 0.6308, + "step": 458 + }, + { + "epoch": 0.184, + "learning_rate": 7.716491298893441e-06, + "loss": 0.3688, + "step": 460 + }, + { + "epoch": 0.1848, + "learning_rate": 7.74368761900062e-06, + "loss": 0.3464, + "step": 462 + }, + { + "epoch": 0.1856, + "learning_rate": 7.770901534284991e-06, + "loss": 0.7383, + "step": 464 + }, + { + "epoch": 0.1864, + "learning_rate": 7.798132832526976e-06, + "loss": 0.6074, + "step": 466 + }, + { + "epoch": 0.1872, + "learning_rate": 7.825381301371444e-06, + "loss": 0.4597, + "step": 468 + }, + { + "epoch": 0.188, + "learning_rate": 7.852646728329358e-06, + "loss": 0.441, + "step": 470 + }, + { + "epoch": 0.1888, + "learning_rate": 7.879928900779441e-06, + "loss": 0.3051, + "step": 472 + }, + { + "epoch": 0.1896, + "learning_rate": 7.907227605969852e-06, + "loss": 0.35, + "step": 474 + }, + { + "epoch": 0.1904, + "learning_rate": 7.934542631019767e-06, + "loss": 0.3059, + "step": 476 + }, + { + "epoch": 0.1912, + "learning_rate": 7.961873762921153e-06, + "loss": 0.3124, + "step": 478 + }, + { + "epoch": 0.192, + "learning_rate": 7.989220788540351e-06, + "loss": 0.4266, + "step": 480 + }, + { + "epoch": 0.1928, + "learning_rate": 8.016583494619764e-06, + "loss": 0.2483, + "step": 482 + }, + { + "epoch": 0.1936, + "learning_rate": 8.043961667779511e-06, + "loss": 0.2404, + "step": 484 + }, + { + "epoch": 0.1944, + "learning_rate": 8.071355094519103e-06, + "loss": 0.4939, + "step": 486 + }, + { + "epoch": 0.1952, + "learning_rate": 8.098763561219089e-06, + "loss": 0.4651, + "step": 488 + }, + { + "epoch": 0.196, + "learning_rate": 8.126186854142744e-06, + "loss": 0.2406, + "step": 490 + }, + { + "epoch": 0.1968, + "learning_rate": 8.153624759437718e-06, + "loss": 0.402, + "step": 492 + }, + { + "epoch": 0.1976, + "learning_rate": 8.181077063137735e-06, + "loss": 0.3155, + "step": 494 + }, + { + "epoch": 0.1984, + "learning_rate": 8.208543551164178e-06, + "loss": 0.2355, + "step": 496 + }, + { + "epoch": 0.1992, + "learning_rate": 8.236024009327877e-06, + "loss": 0.295, + "step": 498 + }, + { + "epoch": 0.2, + "learning_rate": 8.263518223330695e-06, + "loss": 0.5917, + "step": 500 + }, + { + "epoch": 0.2008, + "learning_rate": 8.29102597876723e-06, + "loss": 0.5848, + "step": 502 + }, + { + "epoch": 0.2016, + "learning_rate": 8.31854706112648e-06, + "loss": 0.4908, + "step": 504 + }, + { + "epoch": 0.2024, + "learning_rate": 8.346081255793516e-06, + "loss": 0.5443, + "step": 506 + }, + { + "epoch": 0.2032, + "learning_rate": 8.373628348051156e-06, + "loss": 0.9282, + "step": 508 + }, + { + "epoch": 0.204, + "learning_rate": 8.401188123081642e-06, + "loss": 0.3142, + "step": 510 + }, + { + "epoch": 0.2048, + "learning_rate": 8.428760365968329e-06, + "loss": 0.5007, + "step": 512 + }, + { + "epoch": 0.2056, + "learning_rate": 8.456344861697293e-06, + "loss": 0.3037, + "step": 514 + }, + { + "epoch": 0.2064, + "learning_rate": 8.483941395159114e-06, + "loss": 0.2874, + "step": 516 + }, + { + "epoch": 0.2072, + "learning_rate": 8.511549751150478e-06, + "loss": 0.6319, + "step": 518 + }, + { + "epoch": 0.208, + "learning_rate": 8.539169714375883e-06, + "loss": 0.3452, + "step": 520 + }, + { + "epoch": 0.2088, + "learning_rate": 8.566801069449304e-06, + "loss": 0.5788, + "step": 522 + }, + { + "epoch": 0.2096, + "learning_rate": 8.594443600895886e-06, + "loss": 0.3476, + "step": 524 + }, + { + "epoch": 0.2104, + "learning_rate": 8.622097093153612e-06, + "loss": 0.3355, + "step": 526 + }, + { + "epoch": 0.2112, + "learning_rate": 8.649761330575e-06, + "loss": 0.5809, + "step": 528 + }, + { + "epoch": 0.212, + "learning_rate": 8.677436097428766e-06, + "loss": 0.4445, + "step": 530 + }, + { + "epoch": 0.2128, + "learning_rate": 8.705121177901537e-06, + "loss": 0.4035, + "step": 532 + }, + { + "epoch": 0.2136, + "learning_rate": 8.732816356099459e-06, + "loss": 0.3521, + "step": 534 + }, + { + "epoch": 0.2144, + "learning_rate": 8.760521416049986e-06, + "loss": 0.3813, + "step": 536 + }, + { + "epoch": 0.2152, + "learning_rate": 8.788236141703477e-06, + "loss": 0.4115, + "step": 538 + }, + { + "epoch": 0.216, + "learning_rate": 8.81596031693499e-06, + "loss": 0.4172, + "step": 540 + }, + { + "epoch": 0.2168, + "learning_rate": 8.84369372554578e-06, + "loss": 0.4372, + "step": 542 + }, + { + "epoch": 0.2176, + "learning_rate": 8.87143615126518e-06, + "loss": 0.2334, + "step": 544 + }, + { + "epoch": 0.2184, + "learning_rate": 8.899187377752173e-06, + "loss": 0.3658, + "step": 546 + }, + { + "epoch": 0.2192, + "learning_rate": 8.926947188597127e-06, + "loss": 0.262, + "step": 548 + }, + { + "epoch": 0.22, + "learning_rate": 8.954715367323473e-06, + "loss": 0.3717, + "step": 550 + }, + { + "epoch": 0.2208, + "learning_rate": 8.982491697389344e-06, + "loss": 0.3241, + "step": 552 + }, + { + "epoch": 0.2216, + "learning_rate": 9.010275962189356e-06, + "loss": 0.7118, + "step": 554 + }, + { + "epoch": 0.2224, + "learning_rate": 9.03806794505621e-06, + "loss": 0.2839, + "step": 556 + }, + { + "epoch": 0.2232, + "learning_rate": 9.065867429262497e-06, + "loss": 0.6255, + "step": 558 + }, + { + "epoch": 0.224, + "learning_rate": 9.093674198022198e-06, + "loss": 0.9816, + "step": 560 + }, + { + "epoch": 0.2248, + "learning_rate": 9.121488034492567e-06, + "loss": 0.4018, + "step": 562 + }, + { + "epoch": 0.2256, + "learning_rate": 9.149308721775717e-06, + "loss": 0.4459, + "step": 564 + }, + { + "epoch": 0.2264, + "learning_rate": 9.177136042920338e-06, + "loss": 0.4469, + "step": 566 + }, + { + "epoch": 0.2272, + "learning_rate": 9.204969780923396e-06, + "loss": 0.3279, + "step": 568 + }, + { + "epoch": 0.228, + "learning_rate": 9.232809718731822e-06, + "loss": 0.469, + "step": 570 + }, + { + "epoch": 0.2288, + "learning_rate": 9.26065563924414e-06, + "loss": 0.3866, + "step": 572 + }, + { + "epoch": 0.2296, + "learning_rate": 9.288507325312319e-06, + "loss": 0.2673, + "step": 574 + }, + { + "epoch": 0.2304, + "learning_rate": 9.316364559743298e-06, + "loss": 0.2224, + "step": 576 + }, + { + "epoch": 0.2312, + "learning_rate": 9.344227125300788e-06, + "loss": 0.3321, + "step": 578 + }, + { + "epoch": 0.232, + "learning_rate": 9.372094804706867e-06, + "loss": 0.3654, + "step": 580 + }, + { + "epoch": 0.2328, + "learning_rate": 9.39996738064379e-06, + "loss": 0.5099, + "step": 582 + }, + { + "epoch": 0.2336, + "learning_rate": 9.427844635755615e-06, + "loss": 0.2288, + "step": 584 + }, + { + "epoch": 0.2344, + "learning_rate": 9.455726352649904e-06, + "loss": 0.2673, + "step": 586 + }, + { + "epoch": 0.2352, + "learning_rate": 9.483612313899446e-06, + "loss": 0.4036, + "step": 588 + }, + { + "epoch": 0.236, + "learning_rate": 9.511502302043859e-06, + "loss": 0.4949, + "step": 590 + }, + { + "epoch": 0.2368, + "learning_rate": 9.539396099591469e-06, + "loss": 0.2731, + "step": 592 + }, + { + "epoch": 0.2376, + "learning_rate": 9.567293489020816e-06, + "loss": 0.2777, + "step": 594 + }, + { + "epoch": 0.2384, + "learning_rate": 9.595194252782461e-06, + "loss": 0.2858, + "step": 596 + }, + { + "epoch": 0.2392, + "learning_rate": 9.623098173300656e-06, + "loss": 0.444, + "step": 598 + }, + { + "epoch": 0.24, + "learning_rate": 9.651005032974991e-06, + "loss": 0.3982, + "step": 600 + }, + { + "epoch": 0.2408, + "learning_rate": 9.678914614182184e-06, + "loss": 0.4207, + "step": 602 + }, + { + "epoch": 0.2416, + "learning_rate": 9.706826699277714e-06, + "loss": 0.6006, + "step": 604 + }, + { + "epoch": 0.2424, + "learning_rate": 9.734741070597535e-06, + "loss": 0.3266, + "step": 606 + }, + { + "epoch": 0.2432, + "learning_rate": 9.762657510459774e-06, + "loss": 0.3242, + "step": 608 + }, + { + "epoch": 0.244, + "learning_rate": 9.790575801166422e-06, + "loss": 0.2771, + "step": 610 + }, + { + "epoch": 0.2448, + "learning_rate": 9.818495725005043e-06, + "loss": 0.4186, + "step": 612 + }, + { + "epoch": 0.2456, + "learning_rate": 9.846417064250459e-06, + "loss": 0.3253, + "step": 614 + }, + { + "epoch": 0.2464, + "learning_rate": 9.874339601166479e-06, + "loss": 0.4127, + "step": 616 + }, + { + "epoch": 0.2472, + "learning_rate": 9.902263118007513e-06, + "loss": 0.2735, + "step": 618 + }, + { + "epoch": 0.248, + "learning_rate": 9.930187397020385e-06, + "loss": 0.4919, + "step": 620 + }, + { + "epoch": 0.2488, + "learning_rate": 9.95811222044596e-06, + "loss": 0.4266, + "step": 622 + }, + { + "epoch": 0.2496, + "learning_rate": 9.986037370520855e-06, + "loss": 0.5172, + "step": 624 + }, + { + "epoch": 0.2504, + "learning_rate": 1.0013962629479139e-05, + "loss": 0.5194, + "step": 626 + }, + { + "epoch": 0.2512, + "learning_rate": 1.0041887779554034e-05, + "loss": 0.342, + "step": 628 + }, + { + "epoch": 0.252, + "learning_rate": 1.0069812602979607e-05, + "loss": 0.5551, + "step": 630 + }, + { + "epoch": 0.2528, + "learning_rate": 1.0097736881992482e-05, + "loss": 0.4521, + "step": 632 + }, + { + "epoch": 0.2536, + "learning_rate": 1.0125660398833514e-05, + "loss": 0.2162, + "step": 634 + }, + { + "epoch": 0.2544, + "learning_rate": 1.0153582935749533e-05, + "loss": 0.27, + "step": 636 + }, + { + "epoch": 0.2552, + "learning_rate": 1.0181504274994952e-05, + "loss": 0.9982, + "step": 638 + }, + { + "epoch": 0.256, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.2717, + "step": 640 + }, + { + "epoch": 0.2568, + "learning_rate": 1.0237342489540218e-05, + "loss": 0.272, + "step": 642 + }, + { + "epoch": 0.2576, + "learning_rate": 1.0265258929402458e-05, + "loss": 0.3665, + "step": 644 + }, + { + "epoch": 0.2584, + "learning_rate": 1.029317330072228e-05, + "loss": 0.4587, + "step": 646 + }, + { + "epoch": 0.2592, + "learning_rate": 1.0321085385817811e-05, + "loss": 0.4466, + "step": 648 + }, + { + "epoch": 0.26, + "learning_rate": 1.0348994967025004e-05, + "loss": 0.3845, + "step": 650 + }, + { + "epoch": 0.2608, + "learning_rate": 1.0376901826699337e-05, + "loss": 0.2762, + "step": 652 + }, + { + "epoch": 0.2616, + "learning_rate": 1.0404805747217532e-05, + "loss": 0.2913, + "step": 654 + }, + { + "epoch": 0.2624, + "learning_rate": 1.0432706510979175e-05, + "loss": 0.4641, + "step": 656 + }, + { + "epoch": 0.2632, + "learning_rate": 1.0460603900408526e-05, + "loss": 0.3176, + "step": 658 + }, + { + "epoch": 0.264, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.2859, + "step": 660 + }, + { + "epoch": 0.2648, + "learning_rate": 1.0516387686100549e-05, + "loss": 0.3802, + "step": 662 + }, + { + "epoch": 0.2656, + "learning_rate": 1.054427364735009e-05, + "loss": 1.0116, + "step": 664 + }, + { + "epoch": 0.2664, + "learning_rate": 1.0572155364244378e-05, + "loss": 0.4018, + "step": 666 + }, + { + "epoch": 0.2672, + "learning_rate": 1.0600032619356203e-05, + "loss": 0.4008, + "step": 668 + }, + { + "epoch": 0.268, + "learning_rate": 1.0627905195293127e-05, + "loss": 0.3625, + "step": 670 + }, + { + "epoch": 0.2688, + "learning_rate": 1.0655772874699206e-05, + "loss": 0.3154, + "step": 672 + }, + { + "epoch": 0.2696, + "learning_rate": 1.0683635440256694e-05, + "loss": 0.5846, + "step": 674 + }, + { + "epoch": 0.2704, + "learning_rate": 1.0711492674687674e-05, + "loss": 0.2596, + "step": 676 + }, + { + "epoch": 0.2712, + "learning_rate": 1.0739344360755855e-05, + "loss": 0.7721, + "step": 678 + }, + { + "epoch": 0.272, + "learning_rate": 1.0767190281268171e-05, + "loss": 0.6305, + "step": 680 + }, + { + "epoch": 0.2728, + "learning_rate": 1.07950302190766e-05, + "loss": 0.3027, + "step": 682 + }, + { + "epoch": 0.2736, + "learning_rate": 1.0822863957079654e-05, + "loss": 0.3475, + "step": 684 + }, + { + "epoch": 0.2744, + "learning_rate": 1.0850691278224277e-05, + "loss": 0.5406, + "step": 686 + }, + { + "epoch": 0.2752, + "learning_rate": 1.0878511965507428e-05, + "loss": 0.4604, + "step": 688 + }, + { + "epoch": 0.276, + "learning_rate": 1.0906325801977795e-05, + "loss": 0.514, + "step": 690 + }, + { + "epoch": 0.2768, + "learning_rate": 1.0934132570737497e-05, + "loss": 0.3485, + "step": 692 + }, + { + "epoch": 0.2776, + "learning_rate": 1.0961932054943785e-05, + "loss": 0.7798, + "step": 694 + }, + { + "epoch": 0.2784, + "learning_rate": 1.098972403781064e-05, + "loss": 0.2672, + "step": 696 + }, + { + "epoch": 0.2792, + "learning_rate": 1.101750830261065e-05, + "loss": 0.2432, + "step": 698 + }, + { + "epoch": 0.28, + "learning_rate": 1.104528463267652e-05, + "loss": 0.4459, + "step": 700 + }, + { + "epoch": 0.2808, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.4207, + "step": 702 + }, + { + "epoch": 0.2816, + "learning_rate": 1.1100812622247821e-05, + "loss": 0.3667, + "step": 704 + }, + { + "epoch": 0.2824, + "learning_rate": 1.1128563848734815e-05, + "loss": 0.5361, + "step": 706 + }, + { + "epoch": 0.2832, + "learning_rate": 1.1156306274454211e-05, + "loss": 0.4286, + "step": 708 + }, + { + "epoch": 0.284, + "learning_rate": 1.1184039683065002e-05, + "loss": 0.3239, + "step": 710 + }, + { + "epoch": 0.2848, + "learning_rate": 1.1211763858296516e-05, + "loss": 0.3586, + "step": 712 + }, + { + "epoch": 0.2856, + "learning_rate": 1.1239478583950007e-05, + "loss": 0.2812, + "step": 714 + }, + { + "epoch": 0.2864, + "learning_rate": 1.1267183643900534e-05, + "loss": 0.3338, + "step": 716 + }, + { + "epoch": 0.2872, + "learning_rate": 1.1294878822098456e-05, + "loss": 0.2453, + "step": 718 + }, + { + "epoch": 0.288, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.5514, + "step": 720 + }, + { + "epoch": 0.2888, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.6422, + "step": 722 + }, + { + "epoch": 0.2896, + "learning_rate": 1.137790290684638e-05, + "loss": 0.6737, + "step": 724 + }, + { + "epoch": 0.2904, + "learning_rate": 1.1405556399104108e-05, + "loss": 0.2843, + "step": 726 + }, + { + "epoch": 0.2912, + "learning_rate": 1.143319893055069e-05, + "loss": 0.4204, + "step": 728 + }, + { + "epoch": 0.292, + "learning_rate": 1.1460830285624112e-05, + "loss": 0.4902, + "step": 730 + }, + { + "epoch": 0.2928, + "learning_rate": 1.1488450248849515e-05, + "loss": 0.7357, + "step": 732 + }, + { + "epoch": 0.2936, + "learning_rate": 1.1516058604840881e-05, + "loss": 0.3957, + "step": 734 + }, + { + "epoch": 0.2944, + "learning_rate": 1.15436551383027e-05, + "loss": 0.2672, + "step": 736 + }, + { + "epoch": 0.2952, + "learning_rate": 1.1571239634031666e-05, + "loss": 0.3371, + "step": 738 + }, + { + "epoch": 0.296, + "learning_rate": 1.1598811876918352e-05, + "loss": 0.2517, + "step": 740 + }, + { + "epoch": 0.2968, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.4991, + "step": 742 + }, + { + "epoch": 0.2976, + "learning_rate": 1.1653918744206478e-05, + "loss": 0.3371, + "step": 744 + }, + { + "epoch": 0.2984, + "learning_rate": 1.1681452938873515e-05, + "loss": 1.1836, + "step": 746 + }, + { + "epoch": 0.2992, + "learning_rate": 1.1708974021232763e-05, + "loss": 0.3795, + "step": 748 + }, + { + "epoch": 0.3, + "learning_rate": 1.1736481776669297e-05, + "loss": 0.1885, + "step": 750 + }, + { + "epoch": 0.3008, + "learning_rate": 1.1763975990672116e-05, + "loss": 0.2099, + "step": 752 + }, + { + "epoch": 0.3016, + "learning_rate": 1.1791456448835815e-05, + "loss": 0.3667, + "step": 754 + }, + { + "epoch": 0.3024, + "learning_rate": 1.1818922936862258e-05, + "loss": 0.3202, + "step": 756 + }, + { + "epoch": 0.3032, + "learning_rate": 1.1846375240562274e-05, + "loss": 0.3827, + "step": 758 + }, + { + "epoch": 0.304, + "learning_rate": 1.187381314585725e-05, + "loss": 0.5721, + "step": 760 + }, + { + "epoch": 0.3048, + "learning_rate": 1.1901236438780906e-05, + "loss": 0.5338, + "step": 762 + }, + { + "epoch": 0.3056, + "learning_rate": 1.192864490548089e-05, + "loss": 0.4324, + "step": 764 + }, + { + "epoch": 0.3064, + "learning_rate": 1.195603833222048e-05, + "loss": 0.2322, + "step": 766 + }, + { + "epoch": 0.3072, + "learning_rate": 1.198341650538023e-05, + "loss": 0.4336, + "step": 768 + }, + { + "epoch": 0.308, + "learning_rate": 1.2010779211459642e-05, + "loss": 0.3213, + "step": 770 + }, + { + "epoch": 0.3088, + "learning_rate": 1.203812623707884e-05, + "loss": 0.2136, + "step": 772 + }, + { + "epoch": 0.3096, + "learning_rate": 1.2065457368980227e-05, + "loss": 0.3464, + "step": 774 + }, + { + "epoch": 0.3104, + "learning_rate": 1.2092772394030141e-05, + "loss": 0.2926, + "step": 776 + }, + { + "epoch": 0.3112, + "learning_rate": 1.2120071099220552e-05, + "loss": 0.6897, + "step": 778 + }, + { + "epoch": 0.312, + "learning_rate": 1.2147353271670637e-05, + "loss": 0.4736, + "step": 780 + }, + { + "epoch": 0.3128, + "learning_rate": 1.217461869862855e-05, + "loss": 0.5841, + "step": 782 + }, + { + "epoch": 0.3136, + "learning_rate": 1.2201867167473015e-05, + "loss": 0.286, + "step": 784 + }, + { + "epoch": 0.3144, + "learning_rate": 1.2229098465715002e-05, + "loss": 0.4417, + "step": 786 + }, + { + "epoch": 0.3152, + "learning_rate": 1.2256312380999373e-05, + "loss": 0.4855, + "step": 788 + }, + { + "epoch": 0.316, + "learning_rate": 1.2283508701106552e-05, + "loss": 0.4042, + "step": 790 + }, + { + "epoch": 0.3168, + "learning_rate": 1.2310687213954173e-05, + "loss": 0.3355, + "step": 792 + }, + { + "epoch": 0.3176, + "learning_rate": 1.233784770759873e-05, + "loss": 0.3118, + "step": 794 + }, + { + "epoch": 0.3184, + "learning_rate": 1.2364989970237238e-05, + "loss": 0.3242, + "step": 796 + }, + { + "epoch": 0.3192, + "learning_rate": 1.23921137902089e-05, + "loss": 0.2826, + "step": 798 + }, + { + "epoch": 0.32, + "learning_rate": 1.241921895599668e-05, + "loss": 0.3227, + "step": 800 + }, + { + "epoch": 0.3208, + "learning_rate": 1.2446305256229076e-05, + "loss": 0.4918, + "step": 802 + }, + { + "epoch": 0.3216, + "learning_rate": 1.2473372479681653e-05, + "loss": 0.6615, + "step": 804 + }, + { + "epoch": 0.3224, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.5442, + "step": 806 + }, + { + "epoch": 0.3232, + "learning_rate": 1.2527448852095292e-05, + "loss": 0.4694, + "step": 808 + }, + { + "epoch": 0.324, + "learning_rate": 1.2554457579357902e-05, + "loss": 0.3882, + "step": 810 + }, + { + "epoch": 0.3248, + "learning_rate": 1.2581446386447171e-05, + "loss": 0.3797, + "step": 812 + }, + { + "epoch": 0.3256, + "learning_rate": 1.2608415062898963e-05, + "loss": 0.2297, + "step": 814 + }, + { + "epoch": 0.3264, + "learning_rate": 1.2635363398406133e-05, + "loss": 0.4983, + "step": 816 + }, + { + "epoch": 0.3272, + "learning_rate": 1.266229118282012e-05, + "loss": 0.667, + "step": 818 + }, + { + "epoch": 0.328, + "learning_rate": 1.2689198206152644e-05, + "loss": 0.2424, + "step": 820 + }, + { + "epoch": 0.3288, + "learning_rate": 1.2716084258577373e-05, + "loss": 0.2759, + "step": 822 + }, + { + "epoch": 0.3296, + "learning_rate": 1.2742949130431468e-05, + "loss": 0.2339, + "step": 824 + }, + { + "epoch": 0.3304, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.3662, + "step": 826 + }, + { + "epoch": 0.3312, + "learning_rate": 1.2796614494603795e-05, + "loss": 0.3535, + "step": 828 + }, + { + "epoch": 0.332, + "learning_rate": 1.282341456842876e-05, + "loss": 0.3086, + "step": 830 + }, + { + "epoch": 0.3328, + "learning_rate": 1.2850192624699756e-05, + "loss": 0.241, + "step": 832 + }, + { + "epoch": 0.3336, + "learning_rate": 1.2876948454596122e-05, + "loss": 0.5256, + "step": 834 + }, + { + "epoch": 0.3344, + "learning_rate": 1.2903681849470535e-05, + "loss": 0.375, + "step": 836 + }, + { + "epoch": 0.3352, + "learning_rate": 1.2930392600850565e-05, + "loss": 0.2919, + "step": 838 + }, + { + "epoch": 0.336, + "learning_rate": 1.2957080500440455e-05, + "loss": 0.248, + "step": 840 + }, + { + "epoch": 0.3368, + "learning_rate": 1.2983745340122589e-05, + "loss": 0.4049, + "step": 842 + }, + { + "epoch": 0.3376, + "learning_rate": 1.3010386911959205e-05, + "loss": 1.0449, + "step": 844 + }, + { + "epoch": 0.3384, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.3531, + "step": 846 + }, + { + "epoch": 0.3392, + "learning_rate": 1.3063599421253556e-05, + "loss": 0.4393, + "step": 848 + }, + { + "epoch": 0.34, + "learning_rate": 1.309016994374947e-05, + "loss": 0.946, + "step": 850 + }, + { + "epoch": 0.3408, + "learning_rate": 1.3116716368479415e-05, + "loss": 0.7443, + "step": 852 + }, + { + "epoch": 0.3416, + "learning_rate": 1.3143238488429049e-05, + "loss": 0.4829, + "step": 854 + }, + { + "epoch": 0.3424, + "learning_rate": 1.316973609677351e-05, + "loss": 0.4061, + "step": 856 + }, + { + "epoch": 0.3432, + "learning_rate": 1.319620898687917e-05, + "loss": 0.6136, + "step": 858 + }, + { + "epoch": 0.344, + "learning_rate": 1.32226569523051e-05, + "loss": 0.2964, + "step": 860 + }, + { + "epoch": 0.3448, + "learning_rate": 1.324907978680475e-05, + "loss": 0.3351, + "step": 862 + }, + { + "epoch": 0.3456, + "learning_rate": 1.3275477284327572e-05, + "loss": 0.3508, + "step": 864 + }, + { + "epoch": 0.3464, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.4017, + "step": 866 + }, + { + "epoch": 0.3472, + "learning_rate": 1.3328195445229865e-05, + "loss": 0.7801, + "step": 868 + }, + { + "epoch": 0.348, + "learning_rate": 1.3354515697502548e-05, + "loss": 0.3268, + "step": 870 + }, + { + "epoch": 0.3488, + "learning_rate": 1.338080979058797e-05, + "loss": 0.5479, + "step": 872 + }, + { + "epoch": 0.3496, + "learning_rate": 1.340707751943951e-05, + "loss": 0.5759, + "step": 874 + }, + { + "epoch": 0.3504, + "learning_rate": 1.3433318679216145e-05, + "loss": 0.3954, + "step": 876 + }, + { + "epoch": 0.3512, + "learning_rate": 1.3459533065284039e-05, + "loss": 0.4792, + "step": 878 + }, + { + "epoch": 0.352, + "learning_rate": 1.348572047321814e-05, + "loss": 0.2747, + "step": 880 + }, + { + "epoch": 0.3528, + "learning_rate": 1.3511880698803803e-05, + "loss": 0.4234, + "step": 882 + }, + { + "epoch": 0.3536, + "learning_rate": 1.3538013538038296e-05, + "loss": 0.2369, + "step": 884 + }, + { + "epoch": 0.3544, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.2081, + "step": 886 + }, + { + "epoch": 0.3552, + "learning_rate": 1.3590196242512461e-05, + "loss": 0.6625, + "step": 888 + }, + { + "epoch": 0.356, + "learning_rate": 1.361624570082092e-05, + "loss": 0.3053, + "step": 890 + }, + { + "epoch": 0.3568, + "learning_rate": 1.364226695891898e-05, + "loss": 0.538, + "step": 892 + }, + { + "epoch": 0.3576, + "learning_rate": 1.3668259813887637e-05, + "loss": 0.4137, + "step": 894 + }, + { + "epoch": 0.3584, + "learning_rate": 1.3694224063029386e-05, + "loss": 0.8427, + "step": 896 + }, + { + "epoch": 0.3592, + "learning_rate": 1.3720159503869806e-05, + "loss": 0.2233, + "step": 898 + }, + { + "epoch": 0.36, + "learning_rate": 1.374606593415911e-05, + "loss": 0.8037, + "step": 900 + }, + { + "epoch": 0.3608, + "learning_rate": 1.377194315187377e-05, + "loss": 0.9872, + "step": 902 + }, + { + "epoch": 0.3616, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.473, + "step": 904 + }, + { + "epoch": 0.3624, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.2346, + "step": 906 + }, + { + "epoch": 0.3632, + "learning_rate": 1.3849397512760793e-05, + "loss": 0.5327, + "step": 908 + }, + { + "epoch": 0.364, + "learning_rate": 1.3875155864521027e-05, + "loss": 0.3929, + "step": 910 + }, + { + "epoch": 0.3648, + "learning_rate": 1.3900883997037393e-05, + "loss": 0.541, + "step": 912 + }, + { + "epoch": 0.3656, + "learning_rate": 1.3926581709676746e-05, + "loss": 0.3351, + "step": 914 + }, + { + "epoch": 0.3664, + "learning_rate": 1.3952248802043158e-05, + "loss": 0.4472, + "step": 916 + }, + { + "epoch": 0.3672, + "learning_rate": 1.397788507397949e-05, + "loss": 0.3121, + "step": 918 + }, + { + "epoch": 0.368, + "learning_rate": 1.4003490325568956e-05, + "loss": 0.7905, + "step": 920 + }, + { + "epoch": 0.3688, + "learning_rate": 1.4029064357136632e-05, + "loss": 0.4028, + "step": 922 + }, + { + "epoch": 0.3696, + "learning_rate": 1.4054606969251096e-05, + "loss": 0.5571, + "step": 924 + }, + { + "epoch": 0.3704, + "learning_rate": 1.4080117962725929e-05, + "loss": 0.7056, + "step": 926 + }, + { + "epoch": 0.3712, + "learning_rate": 1.4105597138621281e-05, + "loss": 0.5847, + "step": 928 + }, + { + "epoch": 0.372, + "learning_rate": 1.4131044298245416e-05, + "loss": 0.6136, + "step": 930 + }, + { + "epoch": 0.3728, + "learning_rate": 1.4156459243156275e-05, + "loss": 0.2658, + "step": 932 + }, + { + "epoch": 0.3736, + "learning_rate": 1.418184177516301e-05, + "loss": 0.3698, + "step": 934 + }, + { + "epoch": 0.3744, + "learning_rate": 1.420719169632754e-05, + "loss": 0.2825, + "step": 936 + }, + { + "epoch": 0.3752, + "learning_rate": 1.4232508808966085e-05, + "loss": 0.5185, + "step": 938 + }, + { + "epoch": 0.376, + "learning_rate": 1.4257792915650735e-05, + "loss": 0.7429, + "step": 940 + }, + { + "epoch": 0.3768, + "learning_rate": 1.4283043819210906e-05, + "loss": 0.4087, + "step": 942 + }, + { + "epoch": 0.3776, + "learning_rate": 1.430826132273499e-05, + "loss": 0.5076, + "step": 944 + }, + { + "epoch": 0.3784, + "learning_rate": 1.4333445229571857e-05, + "loss": 0.4058, + "step": 946 + }, + { + "epoch": 0.3792, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.9642, + "step": 948 + }, + { + "epoch": 0.38, + "learning_rate": 1.4383711467890772e-05, + "loss": 0.5352, + "step": 950 + }, + { + "epoch": 0.3808, + "learning_rate": 1.4408793407386584e-05, + "loss": 0.5089, + "step": 952 + }, + { + "epoch": 0.3816, + "learning_rate": 1.4433840966225767e-05, + "loss": 0.2588, + "step": 954 + }, + { + "epoch": 0.3824, + "learning_rate": 1.4458853949082434e-05, + "loss": 0.9228, + "step": 956 + }, + { + "epoch": 0.3832, + "learning_rate": 1.4483832160900332e-05, + "loss": 0.3173, + "step": 958 + }, + { + "epoch": 0.384, + "learning_rate": 1.4508775406894315e-05, + "loss": 0.2883, + "step": 960 + }, + { + "epoch": 0.3848, + "learning_rate": 1.4533683492551942e-05, + "loss": 0.3414, + "step": 962 + }, + { + "epoch": 0.3856, + "learning_rate": 1.4558556223634988e-05, + "loss": 0.3254, + "step": 964 + }, + { + "epoch": 0.3864, + "learning_rate": 1.4583393406180886e-05, + "loss": 0.3411, + "step": 966 + }, + { + "epoch": 0.3872, + "learning_rate": 1.460819484650431e-05, + "loss": 0.4813, + "step": 968 + }, + { + "epoch": 0.388, + "learning_rate": 1.4632960351198618e-05, + "loss": 0.23, + "step": 970 + }, + { + "epoch": 0.3888, + "learning_rate": 1.4657689727137441e-05, + "loss": 0.5028, + "step": 972 + }, + { + "epoch": 0.3896, + "learning_rate": 1.468238278147614e-05, + "loss": 0.643, + "step": 974 + }, + { + "epoch": 0.3904, + "learning_rate": 1.470703932165332e-05, + "loss": 0.269, + "step": 976 + }, + { + "epoch": 0.3912, + "learning_rate": 1.4731659155392339e-05, + "loss": 0.5183, + "step": 978 + }, + { + "epoch": 0.392, + "learning_rate": 1.4756242090702744e-05, + "loss": 0.5015, + "step": 980 + }, + { + "epoch": 0.3928, + "learning_rate": 1.4780787935881913e-05, + "loss": 0.321, + "step": 982 + }, + { + "epoch": 0.3936, + "learning_rate": 1.4805296499516397e-05, + "loss": 0.789, + "step": 984 + }, + { + "epoch": 0.3944, + "learning_rate": 1.482976759048351e-05, + "loss": 0.3935, + "step": 986 + }, + { + "epoch": 0.3952, + "learning_rate": 1.485420101795274e-05, + "loss": 0.3647, + "step": 988 + }, + { + "epoch": 0.396, + "learning_rate": 1.4878596591387327e-05, + "loss": 0.2917, + "step": 990 + }, + { + "epoch": 0.3968, + "learning_rate": 1.4902954120545686e-05, + "loss": 0.8508, + "step": 992 + }, + { + "epoch": 0.3976, + "learning_rate": 1.4927273415482913e-05, + "loss": 0.274, + "step": 994 + }, + { + "epoch": 0.3984, + "learning_rate": 1.4951554286552261e-05, + "loss": 0.2655, + "step": 996 + }, + { + "epoch": 0.3992, + "learning_rate": 1.4975796544406617e-05, + "loss": 0.7718, + "step": 998 + }, + { + "epoch": 0.4, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.48, + "step": 1000 + }, + { + "epoch": 0.4008, + "learning_rate": 1.502416446458897e-05, + "loss": 0.591, + "step": 1002 + }, + { + "epoch": 0.4016, + "learning_rate": 1.5048289749734206e-05, + "loss": 0.457, + "step": 1004 + }, + { + "epoch": 0.4024, + "learning_rate": 1.5072375667301895e-05, + "loss": 0.436, + "step": 1006 + }, + { + "epoch": 0.4032, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.4374, + "step": 1008 + }, + { + "epoch": 0.404, + "learning_rate": 1.5120428648705714e-05, + "loss": 0.2004, + "step": 1010 + }, + { + "epoch": 0.4048, + "learning_rate": 1.5144395337815064e-05, + "loss": 0.3874, + "step": 1012 + }, + { + "epoch": 0.4056, + "learning_rate": 1.5168321909896166e-05, + "loss": 0.3859, + "step": 1014 + }, + { + "epoch": 0.4064, + "learning_rate": 1.5192208178364808e-05, + "loss": 0.2814, + "step": 1016 + }, + { + "epoch": 0.4072, + "learning_rate": 1.521605395695107e-05, + "loss": 0.5106, + "step": 1018 + }, + { + "epoch": 0.408, + "learning_rate": 1.5239859059700784e-05, + "loss": 0.4411, + "step": 1020 + }, + { + "epoch": 0.4088, + "learning_rate": 1.526362330097697e-05, + "loss": 0.3376, + "step": 1022 + }, + { + "epoch": 0.4096, + "learning_rate": 1.5287346495461322e-05, + "loss": 0.4434, + "step": 1024 + }, + { + "epoch": 0.4104, + "learning_rate": 1.531102845815557e-05, + "loss": 0.3665, + "step": 1026 + }, + { + "epoch": 0.4112, + "learning_rate": 1.5334669004383025e-05, + "loss": 0.2121, + "step": 1028 + }, + { + "epoch": 0.412, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.4192, + "step": 1030 + }, + { + "epoch": 0.4128, + "learning_rate": 1.5381825110347072e-05, + "loss": 0.3138, + "step": 1032 + }, + { + "epoch": 0.4136, + "learning_rate": 1.540534030235087e-05, + "loss": 0.2972, + "step": 1034 + }, + { + "epoch": 0.4144, + "learning_rate": 1.542881334242517e-05, + "loss": 0.4245, + "step": 1036 + }, + { + "epoch": 0.4152, + "learning_rate": 1.5452244047522493e-05, + "loss": 0.5862, + "step": 1038 + }, + { + "epoch": 0.416, + "learning_rate": 1.5475632234925495e-05, + "loss": 0.2114, + "step": 1040 + }, + { + "epoch": 0.4168, + "learning_rate": 1.5498977722248388e-05, + "loss": 0.5851, + "step": 1042 + }, + { + "epoch": 0.4176, + "learning_rate": 1.552228032743839e-05, + "loss": 0.4185, + "step": 1044 + }, + { + "epoch": 0.4184, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.4525, + "step": 1046 + }, + { + "epoch": 0.4192, + "learning_rate": 1.556875616488188e-05, + "loss": 0.7098, + "step": 1048 + }, + { + "epoch": 0.42, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.3528, + "step": 1050 + }, + { + "epoch": 0.4208, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.5871, + "step": 1052 + }, + { + "epoch": 0.4216, + "learning_rate": 1.5638143773034268e-05, + "loss": 0.5177, + "step": 1054 + }, + { + "epoch": 0.4224, + "learning_rate": 1.5661185281143663e-05, + "loss": 0.2322, + "step": 1056 + }, + { + "epoch": 0.4232, + "learning_rate": 1.5684182642193024e-05, + "loss": 0.3836, + "step": 1058 + }, + { + "epoch": 0.424, + "learning_rate": 1.5707135676844312e-05, + "loss": 0.3008, + "step": 1060 + }, + { + "epoch": 0.4248, + "learning_rate": 1.5730044206105146e-05, + "loss": 0.5311, + "step": 1062 + }, + { + "epoch": 0.4256, + "learning_rate": 1.5752908051330232e-05, + "loss": 0.2635, + "step": 1064 + }, + { + "epoch": 0.4264, + "learning_rate": 1.577572703422268e-05, + "loss": 0.4447, + "step": 1066 + }, + { + "epoch": 0.4272, + "learning_rate": 1.579850097683548e-05, + "loss": 0.3446, + "step": 1068 + }, + { + "epoch": 0.428, + "learning_rate": 1.582122970157288e-05, + "loss": 0.6397, + "step": 1070 + }, + { + "epoch": 0.4288, + "learning_rate": 1.5843913031191722e-05, + "loss": 0.505, + "step": 1072 + }, + { + "epoch": 0.4296, + "learning_rate": 1.586655078880281e-05, + "loss": 0.4105, + "step": 1074 + }, + { + "epoch": 0.4304, + "learning_rate": 1.5889142797872383e-05, + "loss": 0.6582, + "step": 1076 + }, + { + "epoch": 0.4312, + "learning_rate": 1.5911688882223415e-05, + "loss": 0.2914, + "step": 1078 + }, + { + "epoch": 0.432, + "learning_rate": 1.5934188866037007e-05, + "loss": 0.2608, + "step": 1080 + }, + { + "epoch": 0.4328, + "learning_rate": 1.5956642573853787e-05, + "loss": 0.683, + "step": 1082 + }, + { + "epoch": 0.4336, + "learning_rate": 1.5979049830575193e-05, + "loss": 0.2178, + "step": 1084 + }, + { + "epoch": 0.4344, + "learning_rate": 1.6001410461464945e-05, + "loss": 0.5141, + "step": 1086 + }, + { + "epoch": 0.4352, + "learning_rate": 1.6023724292150377e-05, + "loss": 0.7005, + "step": 1088 + }, + { + "epoch": 0.436, + "learning_rate": 1.604599114862375e-05, + "loss": 0.3868, + "step": 1090 + }, + { + "epoch": 0.4368, + "learning_rate": 1.606821085724362e-05, + "loss": 0.2315, + "step": 1092 + }, + { + "epoch": 0.4376, + "learning_rate": 1.6090383244736253e-05, + "loss": 0.5514, + "step": 1094 + }, + { + "epoch": 0.4384, + "learning_rate": 1.6112508138196912e-05, + "loss": 0.3346, + "step": 1096 + }, + { + "epoch": 0.4392, + "learning_rate": 1.613458536509124e-05, + "loss": 0.7358, + "step": 1098 + }, + { + "epoch": 0.44, + "learning_rate": 1.615661475325658e-05, + "loss": 0.3302, + "step": 1100 + }, + { + "epoch": 0.4408, + "learning_rate": 1.6178596130903352e-05, + "loss": 0.2831, + "step": 1102 + }, + { + "epoch": 0.4416, + "learning_rate": 1.620052932661632e-05, + "loss": 0.3822, + "step": 1104 + }, + { + "epoch": 0.4424, + "learning_rate": 1.6222414169356056e-05, + "loss": 2.4778, + "step": 1106 + }, + { + "epoch": 0.4432, + "learning_rate": 1.6244250488460146e-05, + "loss": 0.4332, + "step": 1108 + }, + { + "epoch": 0.444, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.357, + "step": 1110 + }, + { + "epoch": 0.4448, + "learning_rate": 1.6287776875005127e-05, + "loss": 0.3973, + "step": 1112 + }, + { + "epoch": 0.4456, + "learning_rate": 1.6309466603018497e-05, + "loss": 1.267, + "step": 1114 + }, + { + "epoch": 0.4464, + "learning_rate": 1.6331107128543856e-05, + "loss": 0.3139, + "step": 1116 + }, + { + "epoch": 0.4472, + "learning_rate": 1.635269828282404e-05, + "loss": 0.5295, + "step": 1118 + }, + { + "epoch": 0.448, + "learning_rate": 1.6374239897486905e-05, + "loss": 0.343, + "step": 1120 + }, + { + "epoch": 0.4488, + "learning_rate": 1.6395731804546575e-05, + "loss": 0.2376, + "step": 1122 + }, + { + "epoch": 0.4496, + "learning_rate": 1.6417173836404878e-05, + "loss": 0.2352, + "step": 1124 + }, + { + "epoch": 0.4504, + "learning_rate": 1.643856582585253e-05, + "loss": 0.2731, + "step": 1126 + }, + { + "epoch": 0.4512, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.6209, + "step": 1128 + }, + { + "epoch": 0.452, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.2766, + "step": 1130 + }, + { + "epoch": 0.4528, + "learning_rate": 1.650243987350029e-05, + "loss": 0.451, + "step": 1132 + }, + { + "epoch": 0.4536, + "learning_rate": 1.652363002903693e-05, + "loss": 0.2056, + "step": 1134 + }, + { + "epoch": 0.4544, + "learning_rate": 1.6544769311996146e-05, + "loss": 0.4519, + "step": 1136 + }, + { + "epoch": 0.4552, + "learning_rate": 1.656585755752956e-05, + "loss": 0.3613, + "step": 1138 + }, + { + "epoch": 0.456, + "learning_rate": 1.65868946011868e-05, + "loss": 0.5727, + "step": 1140 + }, + { + "epoch": 0.4568, + "learning_rate": 1.660788027891677e-05, + "loss": 0.8557, + "step": 1142 + }, + { + "epoch": 0.4576, + "learning_rate": 1.6628814427068944e-05, + "loss": 0.3926, + "step": 1144 + }, + { + "epoch": 0.4584, + "learning_rate": 1.6649696882394625e-05, + "loss": 0.2356, + "step": 1146 + }, + { + "epoch": 0.4592, + "learning_rate": 1.667052748204825e-05, + "loss": 0.378, + "step": 1148 + }, + { + "epoch": 0.46, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.7716, + "step": 1150 + }, + { + "epoch": 0.4608, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.3282, + "step": 1152 + }, + { + "epoch": 0.4616, + "learning_rate": 1.6732706524594138e-05, + "loss": 0.2891, + "step": 1154 + }, + { + "epoch": 0.4624, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.2865, + "step": 1156 + }, + { + "epoch": 0.4632, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.4733, + "step": 1158 + }, + { + "epoch": 0.464, + "learning_rate": 1.679441304261516e-05, + "loss": 0.2597, + "step": 1160 + }, + { + "epoch": 0.4648, + "learning_rate": 1.681487612701519e-05, + "loss": 0.4279, + "step": 1162 + }, + { + "epoch": 0.4656, + "learning_rate": 1.683528606764222e-05, + "loss": 0.3304, + "step": 1164 + }, + { + "epoch": 0.4664, + "learning_rate": 1.6855642705335428e-05, + "loss": 0.2887, + "step": 1166 + }, + { + "epoch": 0.4672, + "learning_rate": 1.687594588134968e-05, + "loss": 0.484, + "step": 1168 + }, + { + "epoch": 0.468, + "learning_rate": 1.68961954373567e-05, + "loss": 0.4702, + "step": 1170 + }, + { + "epoch": 0.4688, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.4202, + "step": 1172 + }, + { + "epoch": 0.4696, + "learning_rate": 1.693653305812805e-05, + "loss": 0.2916, + "step": 1174 + }, + { + "epoch": 0.4704, + "learning_rate": 1.6956620808331505e-05, + "loss": 0.2701, + "step": 1176 + }, + { + "epoch": 0.4712, + "learning_rate": 1.697665430940846e-05, + "loss": 0.5951, + "step": 1178 + }, + { + "epoch": 0.472, + "learning_rate": 1.699663340513365e-05, + "loss": 0.6434, + "step": 1180 + }, + { + "epoch": 0.4728, + "learning_rate": 1.7016557939706068e-05, + "loss": 0.3452, + "step": 1182 + }, + { + "epoch": 0.4736, + "learning_rate": 1.7036427757750198e-05, + "loss": 0.3056, + "step": 1184 + }, + { + "epoch": 0.4744, + "learning_rate": 1.7056242704317212e-05, + "loss": 0.4876, + "step": 1186 + }, + { + "epoch": 0.4752, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.3781, + "step": 1188 + }, + { + "epoch": 0.476, + "learning_rate": 1.709570736536521e-05, + "loss": 0.2957, + "step": 1190 + }, + { + "epoch": 0.4768, + "learning_rate": 1.7115356772092844e-05, + "loss": 0.3816, + "step": 1192 + }, + { + "epoch": 0.4776, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.7305, + "step": 1194 + }, + { + "epoch": 0.4784, + "learning_rate": 1.7154488971806518e-05, + "loss": 0.3029, + "step": 1196 + }, + { + "epoch": 0.4792, + "learning_rate": 1.7173971459631783e-05, + "loss": 0.2838, + "step": 1198 + }, + { + "epoch": 0.48, + "learning_rate": 1.7193398003386507e-05, + "loss": 0.4767, + "step": 1200 + }, + { + "epoch": 0.4808, + "learning_rate": 1.7212768451578602e-05, + "loss": 0.76, + "step": 1202 + }, + { + "epoch": 0.4816, + "learning_rate": 1.7232082653153416e-05, + "loss": 0.597, + "step": 1204 + }, + { + "epoch": 0.4824, + "learning_rate": 1.7251340457494937e-05, + "loss": 1.1168, + "step": 1206 + }, + { + "epoch": 0.4832, + "learning_rate": 1.7270541714426923e-05, + "loss": 0.2649, + "step": 1208 + }, + { + "epoch": 0.484, + "learning_rate": 1.7289686274214106e-05, + "loss": 1.7221, + "step": 1210 + }, + { + "epoch": 0.4848, + "learning_rate": 1.7308773987563393e-05, + "loss": 0.3807, + "step": 1212 + }, + { + "epoch": 0.4856, + "learning_rate": 1.732780470562496e-05, + "loss": 0.4856, + "step": 1214 + }, + { + "epoch": 0.4864, + "learning_rate": 1.7346778279993413e-05, + "loss": 0.3511, + "step": 1216 + }, + { + "epoch": 0.4872, + "learning_rate": 1.736569456270903e-05, + "loss": 0.5886, + "step": 1218 + }, + { + "epoch": 0.488, + "learning_rate": 1.7384553406258836e-05, + "loss": 0.9044, + "step": 1220 + }, + { + "epoch": 0.4888, + "learning_rate": 1.740335466357778e-05, + "loss": 0.4384, + "step": 1222 + }, + { + "epoch": 0.4896, + "learning_rate": 1.7422098188049888e-05, + "loss": 0.3725, + "step": 1224 + }, + { + "epoch": 0.4904, + "learning_rate": 1.7440783833509373e-05, + "loss": 0.2402, + "step": 1226 + }, + { + "epoch": 0.4912, + "learning_rate": 1.7459411454241816e-05, + "loss": 0.3152, + "step": 1228 + }, + { + "epoch": 0.492, + "learning_rate": 1.747798090498531e-05, + "loss": 0.7297, + "step": 1230 + }, + { + "epoch": 0.4928, + "learning_rate": 1.749649204093154e-05, + "loss": 0.2419, + "step": 1232 + }, + { + "epoch": 0.4936, + "learning_rate": 1.7514944717726962e-05, + "loss": 0.2237, + "step": 1234 + }, + { + "epoch": 0.4944, + "learning_rate": 1.753333879147387e-05, + "loss": 0.5788, + "step": 1236 + }, + { + "epoch": 0.4952, + "learning_rate": 1.755167411873159e-05, + "loss": 0.9642, + "step": 1238 + }, + { + "epoch": 0.496, + "learning_rate": 1.7569950556517563e-05, + "loss": 0.4745, + "step": 1240 + }, + { + "epoch": 0.4968, + "learning_rate": 1.758816796230845e-05, + "loss": 0.3132, + "step": 1242 + }, + { + "epoch": 0.4976, + "learning_rate": 1.7606326194041278e-05, + "loss": 0.1961, + "step": 1244 + }, + { + "epoch": 0.4984, + "learning_rate": 1.762442511011447e-05, + "loss": 0.4547, + "step": 1246 + }, + { + "epoch": 0.4992, + "learning_rate": 1.7642464569389083e-05, + "loss": 0.5548, + "step": 1248 + }, + { + "epoch": 0.5, + "learning_rate": 1.766044443118977e-05, + "loss": 0.3138, + "step": 1250 + }, + { + "epoch": 0.5008, + "learning_rate": 1.767836455530598e-05, + "loss": 0.3175, + "step": 1252 + }, + { + "epoch": 0.5016, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.4397, + "step": 1254 + }, + { + "epoch": 0.5024, + "learning_rate": 1.77140250319729e-05, + "loss": 0.3803, + "step": 1256 + }, + { + "epoch": 0.5032, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.3477, + "step": 1258 + }, + { + "epoch": 0.504, + "learning_rate": 1.7749444887041793e-05, + "loss": 0.6586, + "step": 1260 + }, + { + "epoch": 0.5048, + "learning_rate": 1.776706423591959e-05, + "loss": 0.2833, + "step": 1262 + }, + { + "epoch": 0.5056, + "learning_rate": 1.778462301567023e-05, + "loss": 0.3282, + "step": 1264 + }, + { + "epoch": 0.5064, + "learning_rate": 1.7802121089366832e-05, + "loss": 0.5765, + "step": 1266 + }, + { + "epoch": 0.5072, + "learning_rate": 1.7819558320555895e-05, + "loss": 0.3454, + "step": 1268 + }, + { + "epoch": 0.508, + "learning_rate": 1.7836934573258392e-05, + "loss": 0.3051, + "step": 1270 + }, + { + "epoch": 0.5088, + "learning_rate": 1.785424971197082e-05, + "loss": 0.878, + "step": 1272 + }, + { + "epoch": 0.5096, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.5818, + "step": 1274 + }, + { + "epoch": 0.5104, + "learning_rate": 1.7888696107795343e-05, + "loss": 0.5624, + "step": 1276 + }, + { + "epoch": 0.5112, + "learning_rate": 1.790582709628753e-05, + "loss": 0.4118, + "step": 1278 + }, + { + "epoch": 0.512, + "learning_rate": 1.7922896433551903e-05, + "loss": 0.2159, + "step": 1280 + }, + { + "epoch": 0.5128, + "learning_rate": 1.793990398647835e-05, + "loss": 0.3974, + "step": 1282 + }, + { + "epoch": 0.5136, + "learning_rate": 1.795684962243855e-05, + "loss": 0.4225, + "step": 1284 + }, + { + "epoch": 0.5144, + "learning_rate": 1.7973733209287032e-05, + "loss": 0.455, + "step": 1286 + }, + { + "epoch": 0.5152, + "learning_rate": 1.7990554615362193e-05, + "loss": 1.0838, + "step": 1288 + }, + { + "epoch": 0.516, + "learning_rate": 1.800731370948734e-05, + "loss": 0.5829, + "step": 1290 + }, + { + "epoch": 0.5168, + "learning_rate": 1.802401036097167e-05, + "loss": 0.3087, + "step": 1292 + }, + { + "epoch": 0.5176, + "learning_rate": 1.804064443961135e-05, + "loss": 0.2661, + "step": 1294 + }, + { + "epoch": 0.5184, + "learning_rate": 1.8057215815690494e-05, + "loss": 0.6078, + "step": 1296 + }, + { + "epoch": 0.5192, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.4031, + "step": 1298 + }, + { + "epoch": 0.52, + "learning_rate": 1.809016994374947e-05, + "loss": 0.5256, + "step": 1300 + }, + { + "epoch": 0.5208, + "learning_rate": 1.81065524387464e-05, + "loss": 0.2434, + "step": 1302 + }, + { + "epoch": 0.5216, + "learning_rate": 1.8122871717218968e-05, + "loss": 0.3107, + "step": 1304 + }, + { + "epoch": 0.5224, + "learning_rate": 1.8139127651906176e-05, + "loss": 0.4962, + "step": 1306 + }, + { + "epoch": 0.5232, + "learning_rate": 1.8155320116040976e-05, + "loss": 0.4455, + "step": 1308 + }, + { + "epoch": 0.524, + "learning_rate": 1.817144898335129e-05, + "loss": 0.2813, + "step": 1310 + }, + { + "epoch": 0.5248, + "learning_rate": 1.818751412806095e-05, + "loss": 0.8767, + "step": 1312 + }, + { + "epoch": 0.5256, + "learning_rate": 1.8203515424890738e-05, + "loss": 0.3128, + "step": 1314 + }, + { + "epoch": 0.5264, + "learning_rate": 1.8219452749059322e-05, + "loss": 0.3927, + "step": 1316 + }, + { + "epoch": 0.5272, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.3637, + "step": 1318 + }, + { + "epoch": 0.528, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.3883, + "step": 1320 + }, + { + "epoch": 0.5288, + "learning_rate": 1.826687964527355e-05, + "loss": 0.2562, + "step": 1322 + }, + { + "epoch": 0.5296, + "learning_rate": 1.828255984097604e-05, + "loss": 0.7825, + "step": 1324 + }, + { + "epoch": 0.5304, + "learning_rate": 1.8298175447613093e-05, + "loss": 0.2397, + "step": 1326 + }, + { + "epoch": 0.5312, + "learning_rate": 1.8313726343411092e-05, + "loss": 0.3641, + "step": 1328 + }, + { + "epoch": 0.532, + "learning_rate": 1.8329212407101e-05, + "loss": 0.2499, + "step": 1330 + }, + { + "epoch": 0.5328, + "learning_rate": 1.8344633517919394e-05, + "loss": 0.3126, + "step": 1332 + }, + { + "epoch": 0.5336, + "learning_rate": 1.8359989555609344e-05, + "loss": 0.2866, + "step": 1334 + }, + { + "epoch": 0.5344, + "learning_rate": 1.8375280400421407e-05, + "loss": 0.2497, + "step": 1336 + }, + { + "epoch": 0.5352, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.6196, + "step": 1338 + }, + { + "epoch": 0.536, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.5532, + "step": 1340 + }, + { + "epoch": 0.5368, + "learning_rate": 1.842076058772692e-05, + "loss": 0.4005, + "step": 1342 + }, + { + "epoch": 0.5376, + "learning_rate": 1.8435789473714384e-05, + "loss": 0.4494, + "step": 1344 + }, + { + "epoch": 0.5384, + "learning_rate": 1.8450752575720964e-05, + "loss": 0.6334, + "step": 1346 + }, + { + "epoch": 0.5392, + "learning_rate": 1.8465649777061384e-05, + "loss": 0.4742, + "step": 1348 + }, + { + "epoch": 0.54, + "learning_rate": 1.8480480961564266e-05, + "loss": 0.6602, + "step": 1350 + }, + { + "epoch": 0.5408, + "learning_rate": 1.8495246013573047e-05, + "loss": 0.5222, + "step": 1352 + }, + { + "epoch": 0.5416, + "learning_rate": 1.850994481794691e-05, + "loss": 0.2957, + "step": 1354 + }, + { + "epoch": 0.5424, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.2854, + "step": 1356 + }, + { + "epoch": 0.5432, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.884, + "step": 1358 + }, + { + "epoch": 0.544, + "learning_rate": 1.8553642601605066e-05, + "loss": 0.433, + "step": 1360 + }, + { + "epoch": 0.5448, + "learning_rate": 1.856807527437643e-05, + "loss": 0.2934, + "step": 1362 + }, + { + "epoch": 0.5456, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.8728, + "step": 1364 + }, + { + "epoch": 0.5464, + "learning_rate": 1.859674006117491e-05, + "loss": 1.0129, + "step": 1366 + }, + { + "epoch": 0.5472, + "learning_rate": 1.8610971951668268e-05, + "loss": 0.3221, + "step": 1368 + }, + { + "epoch": 0.548, + "learning_rate": 1.862513669207257e-05, + "loss": 0.3409, + "step": 1370 + }, + { + "epoch": 0.5488, + "learning_rate": 1.8639234171928348e-05, + "loss": 0.4106, + "step": 1372 + }, + { + "epoch": 0.5496, + "learning_rate": 1.8653264281300612e-05, + "loss": 0.1713, + "step": 1374 + }, + { + "epoch": 0.5504, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.9538, + "step": 1376 + }, + { + "epoch": 0.5512, + "learning_rate": 1.8681121951482393e-05, + "loss": 0.2391, + "step": 1378 + }, + { + "epoch": 0.552, + "learning_rate": 1.869494929505219e-05, + "loss": 0.514, + "step": 1380 + }, + { + "epoch": 0.5528, + "learning_rate": 1.870870883366075e-05, + "loss": 0.3008, + "step": 1382 + }, + { + "epoch": 0.5536, + "learning_rate": 1.8722400460008434e-05, + "loss": 1.106, + "step": 1384 + }, + { + "epoch": 0.5544, + "learning_rate": 1.8736024067325195e-05, + "loss": 0.2155, + "step": 1386 + }, + { + "epoch": 0.5552, + "learning_rate": 1.8749579549371373e-05, + "loss": 0.3582, + "step": 1388 + }, + { + "epoch": 0.556, + "learning_rate": 1.876306680043863e-05, + "loss": 0.3647, + "step": 1390 + }, + { + "epoch": 0.5568, + "learning_rate": 1.8776485715350665e-05, + "loss": 0.6262, + "step": 1392 + }, + { + "epoch": 0.5576, + "learning_rate": 1.878983618946409e-05, + "loss": 0.4101, + "step": 1394 + }, + { + "epoch": 0.5584, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.7839, + "step": 1396 + }, + { + "epoch": 0.5592, + "learning_rate": 1.881633139939087e-05, + "loss": 0.3563, + "step": 1398 + }, + { + "epoch": 0.56, + "learning_rate": 1.882947592858927e-05, + "loss": 0.3198, + "step": 1400 + }, + { + "epoch": 0.5608, + "learning_rate": 1.884255160376072e-05, + "loss": 0.3839, + "step": 1402 + }, + { + "epoch": 0.5616, + "learning_rate": 1.885555832293849e-05, + "loss": 0.5167, + "step": 1404 + }, + { + "epoch": 0.5624, + "learning_rate": 1.886849598469356e-05, + "loss": 0.2983, + "step": 1406 + }, + { + "epoch": 0.5632, + "learning_rate": 1.888136448813544e-05, + "loss": 0.458, + "step": 1408 + }, + { + "epoch": 0.564, + "learning_rate": 1.8894163732912972e-05, + "loss": 0.3048, + "step": 1410 + }, + { + "epoch": 0.5648, + "learning_rate": 1.890689361921506e-05, + "loss": 0.5421, + "step": 1412 + }, + { + "epoch": 0.5656, + "learning_rate": 1.891955404777151e-05, + "loss": 0.3461, + "step": 1414 + }, + { + "epoch": 0.5664, + "learning_rate": 1.893214491985374e-05, + "loss": 0.3811, + "step": 1416 + }, + { + "epoch": 0.5672, + "learning_rate": 1.89446661372756e-05, + "loss": 0.1966, + "step": 1418 + }, + { + "epoch": 0.568, + "learning_rate": 1.895711760239413e-05, + "loss": 0.2135, + "step": 1420 + }, + { + "epoch": 0.5688, + "learning_rate": 1.89694992181103e-05, + "loss": 0.2891, + "step": 1422 + }, + { + "epoch": 0.5696, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.2853, + "step": 1424 + }, + { + "epoch": 0.5704, + "learning_rate": 1.8994052515663708e-05, + "loss": 0.3409, + "step": 1426 + }, + { + "epoch": 0.5712, + "learning_rate": 1.90062240060294e-05, + "loss": 0.6495, + "step": 1428 + }, + { + "epoch": 0.572, + "learning_rate": 1.9018325264051136e-05, + "loss": 0.3951, + "step": 1430 + }, + { + "epoch": 0.5728, + "learning_rate": 1.9030356195360868e-05, + "loss": 0.3849, + "step": 1432 + }, + { + "epoch": 0.5736, + "learning_rate": 1.904231670613899e-05, + "loss": 0.3366, + "step": 1434 + }, + { + "epoch": 0.5744, + "learning_rate": 1.905420670311502e-05, + "loss": 0.4332, + "step": 1436 + }, + { + "epoch": 0.5752, + "learning_rate": 1.906602609356838e-05, + "loss": 0.4592, + "step": 1438 + }, + { + "epoch": 0.576, + "learning_rate": 1.9077774785329078e-05, + "loss": 0.3389, + "step": 1440 + }, + { + "epoch": 0.5768, + "learning_rate": 1.9089452686778487e-05, + "loss": 0.7297, + "step": 1442 + }, + { + "epoch": 0.5776, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.7544, + "step": 1444 + }, + { + "epoch": 0.5784, + "learning_rate": 1.911259575502962e-05, + "loss": 0.2922, + "step": 1446 + }, + { + "epoch": 0.5792, + "learning_rate": 1.912406074135706e-05, + "loss": 0.3067, + "step": 1448 + }, + { + "epoch": 0.58, + "learning_rate": 1.9135454576426006e-05, + "loss": 0.6259, + "step": 1450 + }, + { + "epoch": 0.5808, + "learning_rate": 1.9146777171385053e-05, + "loss": 0.3281, + "step": 1452 + }, + { + "epoch": 0.5816, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.5702, + "step": 1454 + }, + { + "epoch": 0.5824, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.2712, + "step": 1456 + }, + { + "epoch": 0.5832, + "learning_rate": 1.9180316635425876e-05, + "loss": 0.4279, + "step": 1458 + }, + { + "epoch": 0.584, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.6408, + "step": 1460 + }, + { + "epoch": 0.5848, + "learning_rate": 1.9202318473658703e-05, + "loss": 0.2467, + "step": 1462 + }, + { + "epoch": 0.5856, + "learning_rate": 1.9213211793237052e-05, + "loss": 0.5283, + "step": 1464 + }, + { + "epoch": 0.5864, + "learning_rate": 1.92240332663391e-05, + "loss": 0.2985, + "step": 1466 + }, + { + "epoch": 0.5872, + "learning_rate": 1.923478280857682e-05, + "loss": 0.2361, + "step": 1468 + }, + { + "epoch": 0.588, + "learning_rate": 1.924546033612313e-05, + "loss": 1.1026, + "step": 1470 + }, + { + "epoch": 0.5888, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.2561, + "step": 1472 + }, + { + "epoch": 0.5896, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.3058, + "step": 1474 + }, + { + "epoch": 0.5904, + "learning_rate": 1.927706000077034e-05, + "loss": 0.2774, + "step": 1476 + }, + { + "epoch": 0.5912, + "learning_rate": 1.9287448642521507e-05, + "loss": 0.3456, + "step": 1478 + }, + { + "epoch": 0.592, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.2891, + "step": 1480 + }, + { + "epoch": 0.5928, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.2914, + "step": 1482 + }, + { + "epoch": 0.5936, + "learning_rate": 1.9318179694207722e-05, + "loss": 0.6569, + "step": 1484 + }, + { + "epoch": 0.5944, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.7627, + "step": 1486 + }, + { + "epoch": 0.5952, + "learning_rate": 1.9338303869951266e-05, + "loss": 0.4721, + "step": 1488 + }, + { + "epoch": 0.596, + "learning_rate": 1.934825676396015e-05, + "loss": 0.6444, + "step": 1490 + }, + { + "epoch": 0.5968, + "learning_rate": 1.935813675838491e-05, + "loss": 0.2065, + "step": 1492 + }, + { + "epoch": 0.5976, + "learning_rate": 1.9367943776179375e-05, + "loss": 0.4541, + "step": 1494 + }, + { + "epoch": 0.5984, + "learning_rate": 1.9377677740866457e-05, + "loss": 1.4342, + "step": 1496 + }, + { + "epoch": 0.5992, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.2676, + "step": 1498 + }, + { + "epoch": 0.6, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.2039, + "step": 1500 + }, + { + "epoch": 0.6008, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.462, + "step": 1502 + }, + { + "epoch": 0.6016, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.2546, + "step": 1504 + }, + { + "epoch": 0.6024, + "learning_rate": 1.942524913090354e-05, + "loss": 0.3266, + "step": 1506 + }, + { + "epoch": 0.6032, + "learning_rate": 1.9434543202870723e-05, + "loss": 0.3837, + "step": 1508 + }, + { + "epoch": 0.604, + "learning_rate": 1.9443763702374815e-05, + "loss": 0.5365, + "step": 1510 + }, + { + "epoch": 0.6048, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.5157, + "step": 1512 + }, + { + "epoch": 0.6056, + "learning_rate": 1.9461983696954756e-05, + "loss": 0.6955, + "step": 1514 + }, + { + "epoch": 0.6064, + "learning_rate": 1.947098304994744e-05, + "loss": 0.2092, + "step": 1516 + }, + { + "epoch": 0.6072, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.2819, + "step": 1518 + }, + { + "epoch": 0.608, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.3803, + "step": 1520 + }, + { + "epoch": 0.6088, + "learning_rate": 1.949753769132067e-05, + "loss": 0.2723, + "step": 1522 + }, + { + "epoch": 0.6096, + "learning_rate": 1.95062412024896e-05, + "loss": 0.2962, + "step": 1524 + }, + { + "epoch": 0.6104, + "learning_rate": 1.951487058208003e-05, + "loss": 0.4052, + "step": 1526 + }, + { + "epoch": 0.6112, + "learning_rate": 1.952342576279833e-05, + "loss": 0.5166, + "step": 1528 + }, + { + "epoch": 0.612, + "learning_rate": 1.953190667792947e-05, + "loss": 0.1857, + "step": 1530 + }, + { + "epoch": 0.6128, + "learning_rate": 1.9540313261337578e-05, + "loss": 0.6652, + "step": 1532 + }, + { + "epoch": 0.6136, + "learning_rate": 1.954864544746643e-05, + "loss": 0.2782, + "step": 1534 + }, + { + "epoch": 0.6144, + "learning_rate": 1.955690317133996e-05, + "loss": 0.4112, + "step": 1536 + }, + { + "epoch": 0.6152, + "learning_rate": 1.956508636856278e-05, + "loss": 0.3615, + "step": 1538 + }, + { + "epoch": 0.616, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.3244, + "step": 1540 + }, + { + "epoch": 0.6168, + "learning_rate": 1.95812289283811e-05, + "loss": 0.2492, + "step": 1542 + }, + { + "epoch": 0.6176, + "learning_rate": 1.958918816509367e-05, + "loss": 0.4758, + "step": 1544 + }, + { + "epoch": 0.6184, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.3923, + "step": 1546 + }, + { + "epoch": 0.6192, + "learning_rate": 1.9604882241787496e-05, + "loss": 0.1968, + "step": 1548 + }, + { + "epoch": 0.62, + "learning_rate": 1.9612616959383187e-05, + "loss": 0.2705, + "step": 1550 + }, + { + "epoch": 0.6208, + "learning_rate": 1.9620276715860856e-05, + "loss": 0.6171, + "step": 1552 + }, + { + "epoch": 0.6216, + "learning_rate": 1.9627861451488187e-05, + "loss": 0.5356, + "step": 1554 + }, + { + "epoch": 0.6224, + "learning_rate": 1.963537110711789e-05, + "loss": 0.3816, + "step": 1556 + }, + { + "epoch": 0.6232, + "learning_rate": 1.964280562418815e-05, + "loss": 0.3689, + "step": 1558 + }, + { + "epoch": 0.624, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.3315, + "step": 1560 + }, + { + "epoch": 0.6248, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.408, + "step": 1562 + }, + { + "epoch": 0.6256, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.251, + "step": 1564 + }, + { + "epoch": 0.6264, + "learning_rate": 1.967179115615633e-05, + "loss": 0.3692, + "step": 1566 + }, + { + "epoch": 0.6272, + "learning_rate": 1.967884912252619e-05, + "loss": 0.2522, + "step": 1568 + }, + { + "epoch": 0.628, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.2777, + "step": 1570 + }, + { + "epoch": 0.6288, + "learning_rate": 1.969273856798585e-05, + "loss": 0.447, + "step": 1572 + }, + { + "epoch": 0.6296, + "learning_rate": 1.9699569938762972e-05, + "loss": 0.3315, + "step": 1574 + }, + { + "epoch": 0.6304, + "learning_rate": 1.9706325670345276e-05, + "loss": 1.1316, + "step": 1576 + }, + { + "epoch": 0.6312, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.6628, + "step": 1578 + }, + { + "epoch": 0.632, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.2016, + "step": 1580 + }, + { + "epoch": 0.6328, + "learning_rate": 1.9726138506049434e-05, + "loss": 0.4666, + "step": 1582 + }, + { + "epoch": 0.6336, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.6091, + "step": 1584 + }, + { + "epoch": 0.6344, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.315, + "step": 1586 + }, + { + "epoch": 0.6352, + "learning_rate": 1.974526872786577e-05, + "loss": 0.4679, + "step": 1588 + }, + { + "epoch": 0.636, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.2392, + "step": 1590 + }, + { + "epoch": 0.6368, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.6821, + "step": 1592 + }, + { + "epoch": 0.6376, + "learning_rate": 1.976371499316945e-05, + "loss": 0.2116, + "step": 1594 + }, + { + "epoch": 0.6384, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.3523, + "step": 1596 + }, + { + "epoch": 0.6392, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.2062, + "step": 1598 + }, + { + "epoch": 0.64, + "learning_rate": 1.9781476007338054e-05, + "loss": 0.7485, + "step": 1600 + }, + { + "epoch": 0.6408, + "learning_rate": 1.978724385052766e-05, + "loss": 0.6192, + "step": 1602 + }, + { + "epoch": 0.6416, + "learning_rate": 1.9792935370823673e-05, + "loss": 0.2354, + "step": 1604 + }, + { + "epoch": 0.6424, + "learning_rate": 1.979855052384247e-05, + "loss": 0.3029, + "step": 1606 + }, + { + "epoch": 0.6432, + "learning_rate": 1.9804089265795956e-05, + "loss": 0.6285, + "step": 1608 + }, + { + "epoch": 0.644, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.5859, + "step": 1610 + }, + { + "epoch": 0.6448, + "learning_rate": 1.981493734433433e-05, + "loss": 0.404, + "step": 1612 + }, + { + "epoch": 0.6456, + "learning_rate": 1.982024659632372e-05, + "loss": 0.3944, + "step": 1614 + }, + { + "epoch": 0.6464, + "learning_rate": 1.9825479268057472e-05, + "loss": 0.4164, + "step": 1616 + }, + { + "epoch": 0.6472, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.6069, + "step": 1618 + }, + { + "epoch": 0.648, + "learning_rate": 1.9835714708133858e-05, + "loss": 0.7534, + "step": 1620 + }, + { + "epoch": 0.6488, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.79, + "step": 1622 + }, + { + "epoch": 0.6496, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.3299, + "step": 1624 + }, + { + "epoch": 0.6504, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.2726, + "step": 1626 + }, + { + "epoch": 0.6512, + "learning_rate": 1.985526486983063e-05, + "loss": 0.2595, + "step": 1628 + }, + { + "epoch": 0.652, + "learning_rate": 1.985996037070505e-05, + "loss": 0.1985, + "step": 1630 + }, + { + "epoch": 0.6528, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.4013, + "step": 1632 + }, + { + "epoch": 0.6536, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.226, + "step": 1634 + }, + { + "epoch": 0.6544, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.8202, + "step": 1636 + }, + { + "epoch": 0.6552, + "learning_rate": 1.987797311751759e-05, + "loss": 0.4602, + "step": 1638 + }, + { + "epoch": 0.656, + "learning_rate": 1.9882283814465528e-05, + "loss": 0.214, + "step": 1640 + }, + { + "epoch": 0.6568, + "learning_rate": 1.988651744737914e-05, + "loss": 0.851, + "step": 1642 + }, + { + "epoch": 0.6576, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.5044, + "step": 1644 + }, + { + "epoch": 0.6584, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.1587, + "step": 1646 + }, + { + "epoch": 0.6592, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.355, + "step": 1648 + }, + { + "epoch": 0.66, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.2981, + "step": 1650 + }, + { + "epoch": 0.6608, + "learning_rate": 1.9906528516965014e-05, + "loss": 0.3566, + "step": 1652 + }, + { + "epoch": 0.6616, + "learning_rate": 1.9910299093414926e-05, + "loss": 0.2655, + "step": 1654 + }, + { + "epoch": 0.6624, + "learning_rate": 1.9913992387361744e-05, + "loss": 0.6327, + "step": 1656 + }, + { + "epoch": 0.6632, + "learning_rate": 1.9917608370004414e-05, + "loss": 0.3246, + "step": 1658 + }, + { + "epoch": 0.664, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.2596, + "step": 1660 + }, + { + "epoch": 0.6648, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.4516, + "step": 1662 + }, + { + "epoch": 0.6656, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.1858, + "step": 1664 + }, + { + "epoch": 0.6664, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.2705, + "step": 1666 + }, + { + "epoch": 0.6672, + "learning_rate": 1.9934527647833276e-05, + "loss": 0.3139, + "step": 1668 + }, + { + "epoch": 0.668, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.2585, + "step": 1670 + }, + { + "epoch": 0.6688, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.5798, + "step": 1672 + }, + { + "epoch": 0.6696, + "learning_rate": 1.994374976712348e-05, + "loss": 0.3228, + "step": 1674 + }, + { + "epoch": 0.6704, + "learning_rate": 1.994666875152874e-05, + "loss": 0.5148, + "step": 1676 + }, + { + "epoch": 0.6712, + "learning_rate": 1.9949510169813003e-05, + "loss": 0.2952, + "step": 1678 + }, + { + "epoch": 0.672, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.5234, + "step": 1680 + }, + { + "epoch": 0.6728, + "learning_rate": 1.995496021999177e-05, + "loss": 0.2761, + "step": 1682 + }, + { + "epoch": 0.6736, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.607, + "step": 1684 + }, + { + "epoch": 0.6744, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.376, + "step": 1686 + }, + { + "epoch": 0.6752, + "learning_rate": 1.996255301507125e-05, + "loss": 0.3114, + "step": 1688 + }, + { + "epoch": 0.676, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.2945, + "step": 1690 + }, + { + "epoch": 0.6768, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.7938, + "step": 1692 + }, + { + "epoch": 0.6776, + "learning_rate": 1.996944660387867e-05, + "loss": 0.5561, + "step": 1694 + }, + { + "epoch": 0.6784, + "learning_rate": 1.997158900260614e-05, + "loss": 0.4054, + "step": 1696 + }, + { + "epoch": 0.6792, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.3551, + "step": 1698 + }, + { + "epoch": 0.68, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.5112, + "step": 1700 + }, + { + "epoch": 0.6808, + "learning_rate": 1.997754957226847e-05, + "loss": 0.3792, + "step": 1702 + }, + { + "epoch": 0.6816, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.2304, + "step": 1704 + }, + { + "epoch": 0.6824, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.3494, + "step": 1706 + }, + { + "epoch": 0.6832, + "learning_rate": 1.998280988314872e-05, + "loss": 0.2988, + "step": 1708 + }, + { + "epoch": 0.684, + "learning_rate": 1.998440764181981e-05, + "loss": 0.303, + "step": 1710 + }, + { + "epoch": 0.6848, + "learning_rate": 1.9985927540074453e-05, + "loss": 1.4673, + "step": 1712 + }, + { + "epoch": 0.6856, + "learning_rate": 1.998736956606018e-05, + "loss": 0.3398, + "step": 1714 + }, + { + "epoch": 0.6864, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.3864, + "step": 1716 + }, + { + "epoch": 0.6872, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.2219, + "step": 1718 + }, + { + "epoch": 0.688, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.3164, + "step": 1720 + }, + { + "epoch": 0.6888, + "learning_rate": 1.999235873152047e-05, + "loss": 0.2771, + "step": 1722 + }, + { + "epoch": 0.6896, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.3405, + "step": 1724 + }, + { + "epoch": 0.6904, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.2174, + "step": 1726 + }, + { + "epoch": 0.6912, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.2272, + "step": 1728 + }, + { + "epoch": 0.692, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.5491, + "step": 1730 + }, + { + "epoch": 0.6928, + "learning_rate": 1.9996841892833e-05, + "loss": 0.3751, + "step": 1732 + }, + { + "epoch": 0.6936, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.3051, + "step": 1734 + }, + { + "epoch": 0.6944, + "learning_rate": 1.999808950037968e-05, + "loss": 0.4029, + "step": 1736 + }, + { + "epoch": 0.6952, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.3979, + "step": 1738 + }, + { + "epoch": 0.696, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.564, + "step": 1740 + }, + { + "epoch": 0.6968, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.4381, + "step": 1742 + }, + { + "epoch": 0.6976, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.4625, + "step": 1744 + }, + { + "epoch": 0.6984, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.3532, + "step": 1746 + }, + { + "epoch": 0.6992, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.2577, + "step": 1748 + }, + { + "epoch": 0.7, + "learning_rate": 2e-05, + "loss": 0.1996, + "step": 1750 + }, + { + "epoch": 0.7008, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.4102, + "step": 1752 + }, + { + "epoch": 0.7016, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.4381, + "step": 1754 + }, + { + "epoch": 0.7024, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.5969, + "step": 1756 + }, + { + "epoch": 0.7032, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.2813, + "step": 1758 + }, + { + "epoch": 0.704, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.6018, + "step": 1760 + }, + { + "epoch": 0.7048, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.45, + "step": 1762 + }, + { + "epoch": 0.7056, + "learning_rate": 1.999808950037968e-05, + "loss": 0.6249, + "step": 1764 + }, + { + "epoch": 0.7064, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.3622, + "step": 1766 + }, + { + "epoch": 0.7072, + "learning_rate": 1.9996841892833e-05, + "loss": 0.6121, + "step": 1768 + }, + { + "epoch": 0.708, + "learning_rate": 1.9996101150403547e-05, + "loss": 0.4156, + "step": 1770 + }, + { + "epoch": 0.7088, + "learning_rate": 1.9995282456369313e-05, + "loss": 1.2839, + "step": 1772 + }, + { + "epoch": 0.7096, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.4216, + "step": 1774 + }, + { + "epoch": 0.7104, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.3174, + "step": 1776 + }, + { + "epoch": 0.7112, + "learning_rate": 1.999235873152047e-05, + "loss": 0.4151, + "step": 1778 + }, + { + "epoch": 0.712, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.415, + "step": 1780 + }, + { + "epoch": 0.7128, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.2363, + "step": 1782 + }, + { + "epoch": 0.7136, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.2731, + "step": 1784 + }, + { + "epoch": 0.7144, + "learning_rate": 1.998736956606018e-05, + "loss": 0.7535, + "step": 1786 + }, + { + "epoch": 0.7152, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.6897, + "step": 1788 + }, + { + "epoch": 0.716, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.3776, + "step": 1790 + }, + { + "epoch": 0.7168, + "learning_rate": 1.998280988314872e-05, + "loss": 0.3082, + "step": 1792 + }, + { + "epoch": 0.7176, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.9716, + "step": 1794 + }, + { + "epoch": 0.7184, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.2966, + "step": 1796 + }, + { + "epoch": 0.7192, + "learning_rate": 1.9977549572268467e-05, + "loss": 0.5585, + "step": 1798 + }, + { + "epoch": 0.72, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.3014, + "step": 1800 + }, + { + "epoch": 0.7208, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.8217, + "step": 1802 + }, + { + "epoch": 0.7216, + "learning_rate": 1.997158900260614e-05, + "loss": 0.5544, + "step": 1804 + }, + { + "epoch": 0.7224, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.3791, + "step": 1806 + }, + { + "epoch": 0.7232, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.3205, + "step": 1808 + }, + { + "epoch": 0.724, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.3588, + "step": 1810 + }, + { + "epoch": 0.7248, + "learning_rate": 1.996255301507125e-05, + "loss": 0.6419, + "step": 1812 + }, + { + "epoch": 0.7256, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.2393, + "step": 1814 + }, + { + "epoch": 0.7264, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.4101, + "step": 1816 + }, + { + "epoch": 0.7272, + "learning_rate": 1.995496021999177e-05, + "loss": 0.1867, + "step": 1818 + }, + { + "epoch": 0.728, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.2578, + "step": 1820 + }, + { + "epoch": 0.7288, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.5568, + "step": 1822 + }, + { + "epoch": 0.7296, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.2108, + "step": 1824 + }, + { + "epoch": 0.7304, + "learning_rate": 1.994374976712348e-05, + "loss": 0.3135, + "step": 1826 + }, + { + "epoch": 0.7312, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.7631, + "step": 1828 + }, + { + "epoch": 0.732, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.2881, + "step": 1830 + }, + { + "epoch": 0.7328, + "learning_rate": 1.993452764783328e-05, + "loss": 0.3897, + "step": 1832 + }, + { + "epoch": 0.7336, + "learning_rate": 1.9931298632618352e-05, + "loss": 0.2551, + "step": 1834 + }, + { + "epoch": 0.7344, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.7452, + "step": 1836 + }, + { + "epoch": 0.7352, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.2037, + "step": 1838 + }, + { + "epoch": 0.736, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.2083, + "step": 1840 + }, + { + "epoch": 0.7368, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.2776, + "step": 1842 + }, + { + "epoch": 0.7376, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.4288, + "step": 1844 + }, + { + "epoch": 0.7384, + "learning_rate": 1.9910299093414932e-05, + "loss": 0.2098, + "step": 1846 + }, + { + "epoch": 0.7392, + "learning_rate": 1.990652851696501e-05, + "loss": 0.7243, + "step": 1848 + }, + { + "epoch": 0.74, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.2803, + "step": 1850 + }, + { + "epoch": 0.7408, + "learning_rate": 1.9898755634773155e-05, + "loss": 1.3556, + "step": 1852 + }, + { + "epoch": 0.7416, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.6151, + "step": 1854 + }, + { + "epoch": 0.7424, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.4928, + "step": 1856 + }, + { + "epoch": 0.7432, + "learning_rate": 1.9886517447379143e-05, + "loss": 0.4827, + "step": 1858 + }, + { + "epoch": 0.744, + "learning_rate": 1.988228381446553e-05, + "loss": 0.2061, + "step": 1860 + }, + { + "epoch": 0.7448, + "learning_rate": 1.987797311751759e-05, + "loss": 0.2591, + "step": 1862 + }, + { + "epoch": 0.7456, + "learning_rate": 1.9873585390151007e-05, + "loss": 0.8382, + "step": 1864 + }, + { + "epoch": 0.7464, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.2804, + "step": 1866 + }, + { + "epoch": 0.7472, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.2743, + "step": 1868 + }, + { + "epoch": 0.748, + "learning_rate": 1.985996037070505e-05, + "loss": 0.4064, + "step": 1870 + }, + { + "epoch": 0.7488, + "learning_rate": 1.985526486983063e-05, + "loss": 0.265, + "step": 1872 + }, + { + "epoch": 0.7496, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.2343, + "step": 1874 + }, + { + "epoch": 0.7504, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.2311, + "step": 1876 + }, + { + "epoch": 0.7512, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.2859, + "step": 1878 + }, + { + "epoch": 0.752, + "learning_rate": 1.983571470813386e-05, + "loss": 0.3071, + "step": 1880 + }, + { + "epoch": 0.7528, + "learning_rate": 1.983063531873016e-05, + "loss": 0.9323, + "step": 1882 + }, + { + "epoch": 0.7536, + "learning_rate": 1.982547926805747e-05, + "loss": 1.0031, + "step": 1884 + }, + { + "epoch": 0.7544, + "learning_rate": 1.9820246596323724e-05, + "loss": 0.2707, + "step": 1886 + }, + { + "epoch": 0.7552, + "learning_rate": 1.981493734433433e-05, + "loss": 0.4908, + "step": 1888 + }, + { + "epoch": 0.756, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.3736, + "step": 1890 + }, + { + "epoch": 0.7568, + "learning_rate": 1.9804089265795963e-05, + "loss": 0.3827, + "step": 1892 + }, + { + "epoch": 0.7576, + "learning_rate": 1.979855052384247e-05, + "loss": 0.3131, + "step": 1894 + }, + { + "epoch": 0.7584, + "learning_rate": 1.979293537082368e-05, + "loss": 0.3875, + "step": 1896 + }, + { + "epoch": 0.7592, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.3477, + "step": 1898 + }, + { + "epoch": 0.76, + "learning_rate": 1.978147600733806e-05, + "loss": 0.6706, + "step": 1900 + }, + { + "epoch": 0.7608, + "learning_rate": 1.977563188623365e-05, + "loss": 0.2089, + "step": 1902 + }, + { + "epoch": 0.7616, + "learning_rate": 1.9769711532788086e-05, + "loss": 0.4683, + "step": 1904 + }, + { + "epoch": 0.7624, + "learning_rate": 1.9763714993169448e-05, + "loss": 1.0816, + "step": 1906 + }, + { + "epoch": 0.7632, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.431, + "step": 1908 + }, + { + "epoch": 0.764, + "learning_rate": 1.9751493543055638e-05, + "loss": 0.355, + "step": 1910 + }, + { + "epoch": 0.7648, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.178, + "step": 1912 + }, + { + "epoch": 0.7656, + "learning_rate": 1.973896791711276e-05, + "loss": 0.2835, + "step": 1914 + }, + { + "epoch": 0.7664, + "learning_rate": 1.9732591159931567e-05, + "loss": 0.1825, + "step": 1916 + }, + { + "epoch": 0.7672, + "learning_rate": 1.972613850604944e-05, + "loss": 0.7821, + "step": 1918 + }, + { + "epoch": 0.768, + "learning_rate": 1.9719610005785463e-05, + "loss": 0.4036, + "step": 1920 + }, + { + "epoch": 0.7688, + "learning_rate": 1.9713005710050206e-05, + "loss": 0.3472, + "step": 1922 + }, + { + "epoch": 0.7696, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.4867, + "step": 1924 + }, + { + "epoch": 0.7704, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.3304, + "step": 1926 + }, + { + "epoch": 0.7712, + "learning_rate": 1.969273856798586e-05, + "loss": 0.2193, + "step": 1928 + }, + { + "epoch": 0.772, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.4641, + "step": 1930 + }, + { + "epoch": 0.7728, + "learning_rate": 1.9678849122526195e-05, + "loss": 0.2356, + "step": 1932 + }, + { + "epoch": 0.7736, + "learning_rate": 1.967179115615633e-05, + "loss": 0.2357, + "step": 1934 + }, + { + "epoch": 0.7744, + "learning_rate": 1.966465776721618e-05, + "loss": 0.5697, + "step": 1936 + }, + { + "epoch": 0.7752, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.2893, + "step": 1938 + }, + { + "epoch": 0.776, + "learning_rate": 1.965016494472312e-05, + "loss": 0.305, + "step": 1940 + }, + { + "epoch": 0.7768, + "learning_rate": 1.964280562418815e-05, + "loss": 0.5985, + "step": 1942 + }, + { + "epoch": 0.7776, + "learning_rate": 1.963537110711789e-05, + "loss": 0.2363, + "step": 1944 + }, + { + "epoch": 0.7784, + "learning_rate": 1.9627861451488194e-05, + "loss": 0.4926, + "step": 1946 + }, + { + "epoch": 0.7792, + "learning_rate": 1.962027671586086e-05, + "loss": 0.8343, + "step": 1948 + }, + { + "epoch": 0.78, + "learning_rate": 1.9612616959383194e-05, + "loss": 0.2186, + "step": 1950 + }, + { + "epoch": 0.7808, + "learning_rate": 1.96048822417875e-05, + "loss": 0.3249, + "step": 1952 + }, + { + "epoch": 0.7816, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.3311, + "step": 1954 + }, + { + "epoch": 0.7824, + "learning_rate": 1.9589188165093666e-05, + "loss": 0.3768, + "step": 1956 + }, + { + "epoch": 0.7832, + "learning_rate": 1.95812289283811e-05, + "loss": 0.2547, + "step": 1958 + }, + { + "epoch": 0.784, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.3626, + "step": 1960 + }, + { + "epoch": 0.7848, + "learning_rate": 1.9565086368562784e-05, + "loss": 0.5001, + "step": 1962 + }, + { + "epoch": 0.7856, + "learning_rate": 1.9556903171339966e-05, + "loss": 0.2204, + "step": 1964 + }, + { + "epoch": 0.7864, + "learning_rate": 1.954864544746643e-05, + "loss": 0.4254, + "step": 1966 + }, + { + "epoch": 0.7872, + "learning_rate": 1.9540313261337585e-05, + "loss": 0.2324, + "step": 1968 + }, + { + "epoch": 0.788, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.2429, + "step": 1970 + }, + { + "epoch": 0.7888, + "learning_rate": 1.9523425762798335e-05, + "loss": 0.3856, + "step": 1972 + }, + { + "epoch": 0.7896, + "learning_rate": 1.9514870582080035e-05, + "loss": 0.3546, + "step": 1974 + }, + { + "epoch": 0.7904, + "learning_rate": 1.95062412024896e-05, + "loss": 0.6147, + "step": 1976 + }, + { + "epoch": 0.7912, + "learning_rate": 1.9497537691320667e-05, + "loss": 0.2789, + "step": 1978 + }, + { + "epoch": 0.792, + "learning_rate": 1.948876011644497e-05, + "loss": 0.5248, + "step": 1980 + }, + { + "epoch": 0.7928, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.2059, + "step": 1982 + }, + { + "epoch": 0.7936, + "learning_rate": 1.9470983049947443e-05, + "loss": 0.502, + "step": 1984 + }, + { + "epoch": 0.7944, + "learning_rate": 1.9461983696954767e-05, + "loss": 0.4995, + "step": 1986 + }, + { + "epoch": 0.7952, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.2398, + "step": 1988 + }, + { + "epoch": 0.796, + "learning_rate": 1.9443763702374818e-05, + "loss": 0.52, + "step": 1990 + }, + { + "epoch": 0.7968, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.3496, + "step": 1992 + }, + { + "epoch": 0.7976, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.4955, + "step": 1994 + }, + { + "epoch": 0.7984, + "learning_rate": 1.94158815589503e-05, + "loss": 0.4192, + "step": 1996 + }, + { + "epoch": 0.7992, + "learning_rate": 1.940644056006122e-05, + "loss": 0.3103, + "step": 1998 + }, + { + "epoch": 0.8, + "learning_rate": 1.939692620785909e-05, + "loss": 0.3261, + "step": 2000 + }, + { + "epoch": 0.8008, + "learning_rate": 1.9387338576538746e-05, + "loss": 1.2938, + "step": 2002 + }, + { + "epoch": 0.8016, + "learning_rate": 1.9377677740866464e-05, + "loss": 0.3635, + "step": 2004 + }, + { + "epoch": 0.8024, + "learning_rate": 1.936794377617938e-05, + "loss": 0.44, + "step": 2006 + }, + { + "epoch": 0.8032, + "learning_rate": 1.9358136758384917e-05, + "loss": 0.3666, + "step": 2008 + }, + { + "epoch": 0.804, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.4458, + "step": 2010 + }, + { + "epoch": 0.8048, + "learning_rate": 1.9338303869951273e-05, + "loss": 0.3224, + "step": 2012 + }, + { + "epoch": 0.8056, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.2577, + "step": 2014 + }, + { + "epoch": 0.8064, + "learning_rate": 1.931817969420773e-05, + "loss": 0.5228, + "step": 2016 + }, + { + "epoch": 0.8072, + "learning_rate": 1.930800856940543e-05, + "loss": 0.3259, + "step": 2018 + }, + { + "epoch": 0.808, + "learning_rate": 1.929776485888252e-05, + "loss": 0.3406, + "step": 2020 + }, + { + "epoch": 0.8088, + "learning_rate": 1.9287448642521517e-05, + "loss": 0.3109, + "step": 2022 + }, + { + "epoch": 0.8096, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.428, + "step": 2024 + }, + { + "epoch": 0.8104, + "learning_rate": 1.9266599014641727e-05, + "loss": 0.4023, + "step": 2026 + }, + { + "epoch": 0.8112, + "learning_rate": 1.925606576571252e-05, + "loss": 0.2821, + "step": 2028 + }, + { + "epoch": 0.812, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.2332, + "step": 2030 + }, + { + "epoch": 0.8128, + "learning_rate": 1.923478280857682e-05, + "loss": 0.2869, + "step": 2032 + }, + { + "epoch": 0.8136, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.2279, + "step": 2034 + }, + { + "epoch": 0.8144, + "learning_rate": 1.9213211793237066e-05, + "loss": 0.44, + "step": 2036 + }, + { + "epoch": 0.8152, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.3438, + "step": 2038 + }, + { + "epoch": 0.816, + "learning_rate": 1.919135339255235e-05, + "loss": 0.6177, + "step": 2040 + }, + { + "epoch": 0.8168, + "learning_rate": 1.918031663542588e-05, + "loss": 0.3366, + "step": 2042 + }, + { + "epoch": 0.8176, + "learning_rate": 1.916920828834617e-05, + "loss": 0.2609, + "step": 2044 + }, + { + "epoch": 0.8184, + "learning_rate": 1.9158028437938313e-05, + "loss": 0.3542, + "step": 2046 + }, + { + "epoch": 0.8192, + "learning_rate": 1.9146777171385057e-05, + "loss": 0.8144, + "step": 2048 + }, + { + "epoch": 0.82, + "learning_rate": 1.913545457642601e-05, + "loss": 0.8687, + "step": 2050 + }, + { + "epoch": 0.8208, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.4303, + "step": 2052 + }, + { + "epoch": 0.8216, + "learning_rate": 1.911259575502963e-05, + "loss": 0.4533, + "step": 2054 + }, + { + "epoch": 0.8224, + "learning_rate": 1.910105970684996e-05, + "loss": 0.4074, + "step": 2056 + }, + { + "epoch": 0.8232, + "learning_rate": 1.908945268677849e-05, + "loss": 0.6682, + "step": 2058 + }, + { + "epoch": 0.824, + "learning_rate": 1.9077774785329085e-05, + "loss": 0.4162, + "step": 2060 + }, + { + "epoch": 0.8248, + "learning_rate": 1.9066026093568383e-05, + "loss": 0.6529, + "step": 2062 + }, + { + "epoch": 0.8256, + "learning_rate": 1.9054206703115013e-05, + "loss": 0.3014, + "step": 2064 + }, + { + "epoch": 0.8264, + "learning_rate": 1.9042316706138994e-05, + "loss": 0.8301, + "step": 2066 + }, + { + "epoch": 0.8272, + "learning_rate": 1.903035619536087e-05, + "loss": 0.2041, + "step": 2068 + }, + { + "epoch": 0.828, + "learning_rate": 1.901832526405114e-05, + "loss": 0.5046, + "step": 2070 + }, + { + "epoch": 0.8288, + "learning_rate": 1.9006224006029414e-05, + "loss": 0.5502, + "step": 2072 + }, + { + "epoch": 0.8296, + "learning_rate": 1.899405251566371e-05, + "loss": 0.2661, + "step": 2074 + }, + { + "epoch": 0.8304, + "learning_rate": 1.8981810887869797e-05, + "loss": 0.2117, + "step": 2076 + }, + { + "epoch": 0.8312, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.3694, + "step": 2078 + }, + { + "epoch": 0.832, + "learning_rate": 1.8957117602394133e-05, + "loss": 0.4057, + "step": 2080 + }, + { + "epoch": 0.8328, + "learning_rate": 1.8944666137275596e-05, + "loss": 0.4345, + "step": 2082 + }, + { + "epoch": 0.8336, + "learning_rate": 1.8932144919853744e-05, + "loss": 0.4886, + "step": 2084 + }, + { + "epoch": 0.8344, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.3912, + "step": 2086 + }, + { + "epoch": 0.8352, + "learning_rate": 1.890689361921507e-05, + "loss": 0.6219, + "step": 2088 + }, + { + "epoch": 0.836, + "learning_rate": 1.8894163732912986e-05, + "loss": 0.6012, + "step": 2090 + }, + { + "epoch": 0.8368, + "learning_rate": 1.8881364488135445e-05, + "loss": 0.3559, + "step": 2092 + }, + { + "epoch": 0.8376, + "learning_rate": 1.886849598469357e-05, + "loss": 0.433, + "step": 2094 + }, + { + "epoch": 0.8384, + "learning_rate": 1.8855558322938492e-05, + "loss": 0.3452, + "step": 2096 + }, + { + "epoch": 0.8392, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.5711, + "step": 2098 + }, + { + "epoch": 0.84, + "learning_rate": 1.8829475928589265e-05, + "loss": 0.2983, + "step": 2100 + }, + { + "epoch": 0.8408, + "learning_rate": 1.8816331399390874e-05, + "loss": 0.3865, + "step": 2102 + }, + { + "epoch": 0.8416, + "learning_rate": 1.88031181186692e-05, + "loss": 0.303, + "step": 2104 + }, + { + "epoch": 0.8424, + "learning_rate": 1.8789836189464092e-05, + "loss": 0.7201, + "step": 2106 + }, + { + "epoch": 0.8432, + "learning_rate": 1.877648571535068e-05, + "loss": 0.3385, + "step": 2108 + }, + { + "epoch": 0.844, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.5775, + "step": 2110 + }, + { + "epoch": 0.8448, + "learning_rate": 1.8749579549371387e-05, + "loss": 0.9404, + "step": 2112 + }, + { + "epoch": 0.8456, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.4841, + "step": 2114 + }, + { + "epoch": 0.8464, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.2027, + "step": 2116 + }, + { + "epoch": 0.8472, + "learning_rate": 1.8708708833660748e-05, + "loss": 2.7746, + "step": 2118 + }, + { + "epoch": 0.848, + "learning_rate": 1.8694949295052198e-05, + "loss": 0.8873, + "step": 2120 + }, + { + "epoch": 0.8488, + "learning_rate": 1.868112195148239e-05, + "loss": 0.903, + "step": 2122 + }, + { + "epoch": 0.8496, + "learning_rate": 1.866722691077977e-05, + "loss": 0.7591, + "step": 2124 + }, + { + "epoch": 0.8504, + "learning_rate": 1.8653264281300626e-05, + "loss": 0.8773, + "step": 2126 + }, + { + "epoch": 0.8512, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.2006, + "step": 2128 + }, + { + "epoch": 0.852, + "learning_rate": 1.8625136692072587e-05, + "loss": 0.6132, + "step": 2130 + }, + { + "epoch": 0.8528, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.5983, + "step": 2132 + }, + { + "epoch": 0.8536, + "learning_rate": 1.8596740061174912e-05, + "loss": 1.3371, + "step": 2134 + }, + { + "epoch": 0.8544, + "learning_rate": 1.858244113157566e-05, + "loss": 0.4698, + "step": 2136 + }, + { + "epoch": 0.8552, + "learning_rate": 1.8568075274376432e-05, + "loss": 0.388, + "step": 2138 + }, + { + "epoch": 0.856, + "learning_rate": 1.8553642601605083e-05, + "loss": 0.6704, + "step": 2140 + }, + { + "epoch": 0.8568, + "learning_rate": 1.8539143225810457e-05, + "loss": 0.3132, + "step": 2142 + }, + { + "epoch": 0.8576, + "learning_rate": 1.852457726006163e-05, + "loss": 0.2646, + "step": 2144 + }, + { + "epoch": 0.8584, + "learning_rate": 1.8509944817946917e-05, + "loss": 0.4395, + "step": 2146 + }, + { + "epoch": 0.8592, + "learning_rate": 1.8495246013573064e-05, + "loss": 0.4838, + "step": 2148 + }, + { + "epoch": 0.86, + "learning_rate": 1.848048096156426e-05, + "loss": 0.2043, + "step": 2150 + }, + { + "epoch": 0.8608, + "learning_rate": 1.8465649777061387e-05, + "loss": 0.7569, + "step": 2152 + }, + { + "epoch": 0.8616, + "learning_rate": 1.8450752575720967e-05, + "loss": 0.2815, + "step": 2154 + }, + { + "epoch": 0.8624, + "learning_rate": 1.843578947371439e-05, + "loss": 0.2869, + "step": 2156 + }, + { + "epoch": 0.8632, + "learning_rate": 1.8420760587726935e-05, + "loss": 1.5304, + "step": 2158 + }, + { + "epoch": 0.864, + "learning_rate": 1.8405666034956846e-05, + "loss": 0.3794, + "step": 2160 + }, + { + "epoch": 0.8648, + "learning_rate": 1.8390505933114507e-05, + "loss": 0.3344, + "step": 2162 + }, + { + "epoch": 0.8656, + "learning_rate": 1.8375280400421414e-05, + "loss": 0.307, + "step": 2164 + }, + { + "epoch": 0.8664, + "learning_rate": 1.8359989555609365e-05, + "loss": 0.3618, + "step": 2166 + }, + { + "epoch": 0.8672, + "learning_rate": 1.834463351791939e-05, + "loss": 0.4506, + "step": 2168 + }, + { + "epoch": 0.868, + "learning_rate": 1.8329212407101006e-05, + "loss": 0.3274, + "step": 2170 + }, + { + "epoch": 0.8688, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.2614, + "step": 2172 + }, + { + "epoch": 0.8696, + "learning_rate": 1.82981754476131e-05, + "loss": 0.2533, + "step": 2174 + }, + { + "epoch": 0.8704, + "learning_rate": 1.8282559840976053e-05, + "loss": 1.0762, + "step": 2176 + }, + { + "epoch": 0.8712, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.3468, + "step": 2178 + }, + { + "epoch": 0.872, + "learning_rate": 1.8251134982782966e-05, + "loss": 0.3445, + "step": 2180 + }, + { + "epoch": 0.8728, + "learning_rate": 1.823532597628428e-05, + "loss": 0.301, + "step": 2182 + }, + { + "epoch": 0.8736, + "learning_rate": 1.8219452749059336e-05, + "loss": 0.1815, + "step": 2184 + }, + { + "epoch": 0.8744, + "learning_rate": 1.8203515424890734e-05, + "loss": 0.3455, + "step": 2186 + }, + { + "epoch": 0.8752, + "learning_rate": 1.8187514128060956e-05, + "loss": 0.3815, + "step": 2188 + }, + { + "epoch": 0.876, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.4904, + "step": 2190 + }, + { + "epoch": 0.8768, + "learning_rate": 1.8155320116040983e-05, + "loss": 0.4646, + "step": 2192 + }, + { + "epoch": 0.8776, + "learning_rate": 1.8139127651906193e-05, + "loss": 0.6676, + "step": 2194 + }, + { + "epoch": 0.8784, + "learning_rate": 1.8122871717218974e-05, + "loss": 0.6452, + "step": 2196 + }, + { + "epoch": 0.8792, + "learning_rate": 1.8106552438746413e-05, + "loss": 0.5564, + "step": 2198 + }, + { + "epoch": 0.88, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.7557, + "step": 2200 + }, + { + "epoch": 0.8808, + "learning_rate": 1.807372435998219e-05, + "loss": 0.6628, + "step": 2202 + }, + { + "epoch": 0.8816, + "learning_rate": 1.8057215815690487e-05, + "loss": 0.2415, + "step": 2204 + }, + { + "epoch": 0.8824, + "learning_rate": 1.8040644439611355e-05, + "loss": 0.4855, + "step": 2206 + }, + { + "epoch": 0.8832, + "learning_rate": 1.8024010360971665e-05, + "loss": 0.6082, + "step": 2208 + }, + { + "epoch": 0.884, + "learning_rate": 1.8007313709487345e-05, + "loss": 0.4848, + "step": 2210 + }, + { + "epoch": 0.8848, + "learning_rate": 1.7990554615362207e-05, + "loss": 0.2959, + "step": 2212 + }, + { + "epoch": 0.8856, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.2662, + "step": 2214 + }, + { + "epoch": 0.8864, + "learning_rate": 1.7956849622438568e-05, + "loss": 0.2331, + "step": 2216 + }, + { + "epoch": 0.8872, + "learning_rate": 1.7939903986478357e-05, + "loss": 0.2931, + "step": 2218 + }, + { + "epoch": 0.888, + "learning_rate": 1.7922896433551913e-05, + "loss": 0.2052, + "step": 2220 + }, + { + "epoch": 0.8888, + "learning_rate": 1.7905827096287525e-05, + "loss": 0.4904, + "step": 2222 + }, + { + "epoch": 0.8896, + "learning_rate": 1.7888696107795347e-05, + "loss": 0.27, + "step": 2224 + }, + { + "epoch": 0.8904, + "learning_rate": 1.787150360166623e-05, + "loss": 0.4649, + "step": 2226 + }, + { + "epoch": 0.8912, + "learning_rate": 1.7854249711970826e-05, + "loss": 0.4108, + "step": 2228 + }, + { + "epoch": 0.892, + "learning_rate": 1.783693457325841e-05, + "loss": 0.1366, + "step": 2230 + }, + { + "epoch": 0.8928, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.5248, + "step": 2232 + }, + { + "epoch": 0.8936, + "learning_rate": 1.780212108936685e-05, + "loss": 0.2671, + "step": 2234 + }, + { + "epoch": 0.8944, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.4074, + "step": 2236 + }, + { + "epoch": 0.8952, + "learning_rate": 1.7767064235919594e-05, + "loss": 1.228, + "step": 2238 + }, + { + "epoch": 0.896, + "learning_rate": 1.77494448870418e-05, + "loss": 0.4915, + "step": 2240 + }, + { + "epoch": 0.8968, + "learning_rate": 1.773176510643608e-05, + "loss": 0.5822, + "step": 2242 + }, + { + "epoch": 0.8976, + "learning_rate": 1.7714025031972894e-05, + "loss": 0.535, + "step": 2244 + }, + { + "epoch": 0.8984, + "learning_rate": 1.769622480199295e-05, + "loss": 0.3114, + "step": 2246 + }, + { + "epoch": 0.8992, + "learning_rate": 1.7678364555305982e-05, + "loss": 0.2489, + "step": 2248 + }, + { + "epoch": 0.9, + "learning_rate": 1.7660444431189777e-05, + "loss": 0.8887, + "step": 2250 + }, + { + "epoch": 0.9008, + "learning_rate": 1.76424645693891e-05, + "loss": 0.3893, + "step": 2252 + }, + { + "epoch": 0.9016, + "learning_rate": 1.762442511011448e-05, + "loss": 0.4646, + "step": 2254 + }, + { + "epoch": 0.9024, + "learning_rate": 1.7606326194041285e-05, + "loss": 0.859, + "step": 2256 + }, + { + "epoch": 0.9032, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.6035, + "step": 2258 + }, + { + "epoch": 0.904, + "learning_rate": 1.756995055651757e-05, + "loss": 0.4018, + "step": 2260 + }, + { + "epoch": 0.9048, + "learning_rate": 1.7551674118731585e-05, + "loss": 0.5824, + "step": 2262 + }, + { + "epoch": 0.9056, + "learning_rate": 1.7533338791473875e-05, + "loss": 0.2973, + "step": 2264 + }, + { + "epoch": 0.9064, + "learning_rate": 1.751494471772697e-05, + "loss": 0.3813, + "step": 2266 + }, + { + "epoch": 0.9072, + "learning_rate": 1.7496492040931548e-05, + "loss": 0.2876, + "step": 2268 + }, + { + "epoch": 0.908, + "learning_rate": 1.747798090498533e-05, + "loss": 0.5741, + "step": 2270 + }, + { + "epoch": 0.9088, + "learning_rate": 1.745941145424182e-05, + "loss": 0.6742, + "step": 2272 + }, + { + "epoch": 0.9096, + "learning_rate": 1.744078383350938e-05, + "loss": 0.4175, + "step": 2274 + }, + { + "epoch": 0.9104, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.2929, + "step": 2276 + }, + { + "epoch": 0.9112, + "learning_rate": 1.7403354663577782e-05, + "loss": 0.6589, + "step": 2278 + }, + { + "epoch": 0.912, + "learning_rate": 1.738455340625883e-05, + "loss": 0.5795, + "step": 2280 + }, + { + "epoch": 0.9128, + "learning_rate": 1.7365694562709038e-05, + "loss": 0.6459, + "step": 2282 + }, + { + "epoch": 0.9136, + "learning_rate": 1.7346778279993433e-05, + "loss": 0.3482, + "step": 2284 + }, + { + "epoch": 0.9144, + "learning_rate": 1.7327804705624962e-05, + "loss": 0.3131, + "step": 2286 + }, + { + "epoch": 0.9152, + "learning_rate": 1.730877398756341e-05, + "loss": 0.4361, + "step": 2288 + }, + { + "epoch": 0.916, + "learning_rate": 1.7289686274214113e-05, + "loss": 0.2655, + "step": 2290 + }, + { + "epoch": 0.9168, + "learning_rate": 1.727054171442693e-05, + "loss": 0.265, + "step": 2292 + }, + { + "epoch": 0.9176, + "learning_rate": 1.7251340457494934e-05, + "loss": 0.2461, + "step": 2294 + }, + { + "epoch": 0.9184, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.2583, + "step": 2296 + }, + { + "epoch": 0.9192, + "learning_rate": 1.7212768451578595e-05, + "loss": 0.4616, + "step": 2298 + }, + { + "epoch": 0.92, + "learning_rate": 1.7193398003386517e-05, + "loss": 0.4566, + "step": 2300 + }, + { + "epoch": 0.9208, + "learning_rate": 1.7173971459631803e-05, + "loss": 0.1646, + "step": 2302 + }, + { + "epoch": 0.9216, + "learning_rate": 1.7154488971806525e-05, + "loss": 0.3506, + "step": 2304 + }, + { + "epoch": 0.9224, + "learning_rate": 1.713495069183907e-05, + "loss": 0.6979, + "step": 2306 + }, + { + "epoch": 0.9232, + "learning_rate": 1.7115356772092847e-05, + "loss": 0.4254, + "step": 2308 + }, + { + "epoch": 0.924, + "learning_rate": 1.709570736536522e-05, + "loss": 0.3409, + "step": 2310 + }, + { + "epoch": 0.9248, + "learning_rate": 1.7076002624886152e-05, + "loss": 0.2511, + "step": 2312 + }, + { + "epoch": 0.9256, + "learning_rate": 1.705624270431722e-05, + "loss": 0.5803, + "step": 2314 + }, + { + "epoch": 0.9264, + "learning_rate": 1.70364277577502e-05, + "loss": 0.6993, + "step": 2316 + }, + { + "epoch": 0.9272, + "learning_rate": 1.7016557939706078e-05, + "loss": 0.419, + "step": 2318 + }, + { + "epoch": 0.928, + "learning_rate": 1.6996633405133673e-05, + "loss": 0.2792, + "step": 2320 + }, + { + "epoch": 0.9288, + "learning_rate": 1.6976654309408468e-05, + "loss": 0.2809, + "step": 2322 + }, + { + "epoch": 0.9296, + "learning_rate": 1.6956620808331515e-05, + "loss": 0.9577, + "step": 2324 + }, + { + "epoch": 0.9304, + "learning_rate": 1.6936533058128042e-05, + "loss": 0.3098, + "step": 2326 + }, + { + "epoch": 0.9312, + "learning_rate": 1.691639121544641e-05, + "loss": 0.2982, + "step": 2328 + }, + { + "epoch": 0.932, + "learning_rate": 1.6896195437356696e-05, + "loss": 0.1997, + "step": 2330 + }, + { + "epoch": 0.9328, + "learning_rate": 1.6875945881349686e-05, + "loss": 0.2766, + "step": 2332 + }, + { + "epoch": 0.9336, + "learning_rate": 1.6855642705335435e-05, + "loss": 0.732, + "step": 2334 + }, + { + "epoch": 0.9344, + "learning_rate": 1.6835286067642228e-05, + "loss": 0.9749, + "step": 2336 + }, + { + "epoch": 0.9352, + "learning_rate": 1.681487612701521e-05, + "loss": 0.643, + "step": 2338 + }, + { + "epoch": 0.936, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.3268, + "step": 2340 + }, + { + "epoch": 0.9368, + "learning_rate": 1.677389697401739e-05, + "loss": 0.2914, + "step": 2342 + }, + { + "epoch": 0.9376, + "learning_rate": 1.675332808121025e-05, + "loss": 0.2454, + "step": 2344 + }, + { + "epoch": 0.9384, + "learning_rate": 1.6732706524594145e-05, + "loss": 0.2803, + "step": 2346 + }, + { + "epoch": 0.9392, + "learning_rate": 1.671203246498009e-05, + "loss": 0.3538, + "step": 2348 + }, + { + "epoch": 0.94, + "learning_rate": 1.6691306063588593e-05, + "loss": 0.8185, + "step": 2350 + }, + { + "epoch": 0.9408, + "learning_rate": 1.6670527482048242e-05, + "loss": 0.325, + "step": 2352 + }, + { + "epoch": 0.9416, + "learning_rate": 1.6649696882394635e-05, + "loss": 0.4259, + "step": 2354 + }, + { + "epoch": 0.9424, + "learning_rate": 1.6628814427068968e-05, + "loss": 0.3757, + "step": 2356 + }, + { + "epoch": 0.9432, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.2445, + "step": 2358 + }, + { + "epoch": 0.944, + "learning_rate": 1.6586894601186824e-05, + "loss": 0.514, + "step": 2360 + }, + { + "epoch": 0.9448, + "learning_rate": 1.656585755752957e-05, + "loss": 0.1825, + "step": 2362 + }, + { + "epoch": 0.9456, + "learning_rate": 1.6544769311996153e-05, + "loss": 0.2975, + "step": 2364 + }, + { + "epoch": 0.9464, + "learning_rate": 1.6523630029036924e-05, + "loss": 0.4538, + "step": 2366 + }, + { + "epoch": 0.9472, + "learning_rate": 1.6502439873500294e-05, + "loss": 0.5259, + "step": 2368 + }, + { + "epoch": 0.948, + "learning_rate": 1.6481199010631305e-05, + "loss": 0.9322, + "step": 2370 + }, + { + "epoch": 0.9488, + "learning_rate": 1.645990760607052e-05, + "loss": 0.3057, + "step": 2372 + }, + { + "epoch": 0.9496, + "learning_rate": 1.643856582585255e-05, + "loss": 0.4628, + "step": 2374 + }, + { + "epoch": 0.9504, + "learning_rate": 1.641717383640488e-05, + "loss": 0.6671, + "step": 2376 + }, + { + "epoch": 0.9512, + "learning_rate": 1.6395731804546596e-05, + "loss": 0.3283, + "step": 2378 + }, + { + "epoch": 0.952, + "learning_rate": 1.63742398974869e-05, + "loss": 0.2361, + "step": 2380 + }, + { + "epoch": 0.9528, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.529, + "step": 2382 + }, + { + "epoch": 0.9536, + "learning_rate": 1.633110712854385e-05, + "loss": 0.3062, + "step": 2384 + }, + { + "epoch": 0.9544, + "learning_rate": 1.6309466603018504e-05, + "loss": 0.3648, + "step": 2386 + }, + { + "epoch": 0.9552, + "learning_rate": 1.6287776875005148e-05, + "loss": 0.6852, + "step": 2388 + }, + { + "epoch": 0.956, + "learning_rate": 1.6266038113644612e-05, + "loss": 0.7395, + "step": 2390 + }, + { + "epoch": 0.9568, + "learning_rate": 1.624425048846017e-05, + "loss": 0.2785, + "step": 2392 + }, + { + "epoch": 0.9576, + "learning_rate": 1.6222414169356063e-05, + "loss": 0.5972, + "step": 2394 + }, + { + "epoch": 0.9584, + "learning_rate": 1.6200529326616343e-05, + "loss": 0.9087, + "step": 2396 + }, + { + "epoch": 0.9592, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.6266, + "step": 2398 + }, + { + "epoch": 0.96, + "learning_rate": 1.6156614753256587e-05, + "loss": 0.5249, + "step": 2400 + }, + { + "epoch": 0.9608, + "learning_rate": 1.613458536509123e-05, + "loss": 0.2893, + "step": 2402 + }, + { + "epoch": 0.9616, + "learning_rate": 1.6112508138196922e-05, + "loss": 0.3863, + "step": 2404 + }, + { + "epoch": 0.9624, + "learning_rate": 1.6090383244736277e-05, + "loss": 0.2399, + "step": 2406 + }, + { + "epoch": 0.9632, + "learning_rate": 1.606821085724363e-05, + "loss": 0.3374, + "step": 2408 + }, + { + "epoch": 0.964, + "learning_rate": 1.6045991148623756e-05, + "loss": 0.3893, + "step": 2410 + }, + { + "epoch": 0.9648, + "learning_rate": 1.602372429215038e-05, + "loss": 0.2202, + "step": 2412 + }, + { + "epoch": 0.9656, + "learning_rate": 1.600141046146497e-05, + "loss": 0.3849, + "step": 2414 + }, + { + "epoch": 0.9664, + "learning_rate": 1.597904983057519e-05, + "loss": 0.4723, + "step": 2416 + }, + { + "epoch": 0.9672, + "learning_rate": 1.5956642573853794e-05, + "loss": 0.5313, + "step": 2418 + }, + { + "epoch": 0.968, + "learning_rate": 1.5934188866037014e-05, + "loss": 0.1997, + "step": 2420 + }, + { + "epoch": 0.9688, + "learning_rate": 1.591168888222342e-05, + "loss": 0.48, + "step": 2422 + }, + { + "epoch": 0.9696, + "learning_rate": 1.5889142797872407e-05, + "loss": 0.6027, + "step": 2424 + }, + { + "epoch": 0.9704, + "learning_rate": 1.5866550788802818e-05, + "loss": 0.2288, + "step": 2426 + }, + { + "epoch": 0.9712, + "learning_rate": 1.584391303119173e-05, + "loss": 0.251, + "step": 2428 + }, + { + "epoch": 0.972, + "learning_rate": 1.582122970157289e-05, + "loss": 0.7532, + "step": 2430 + }, + { + "epoch": 0.9728, + "learning_rate": 1.5798500976835503e-05, + "loss": 0.821, + "step": 2432 + }, + { + "epoch": 0.9736, + "learning_rate": 1.577572703422267e-05, + "loss": 0.2697, + "step": 2434 + }, + { + "epoch": 0.9744, + "learning_rate": 1.575290805133024e-05, + "loss": 0.2455, + "step": 2436 + }, + { + "epoch": 0.9752, + "learning_rate": 1.5730044206105156e-05, + "loss": 0.2995, + "step": 2438 + }, + { + "epoch": 0.976, + "learning_rate": 1.570713567684432e-05, + "loss": 0.5387, + "step": 2440 + }, + { + "epoch": 0.9768, + "learning_rate": 1.5684182642193047e-05, + "loss": 0.4096, + "step": 2442 + }, + { + "epoch": 0.9776, + "learning_rate": 1.566118528114367e-05, + "loss": 0.3883, + "step": 2444 + }, + { + "epoch": 0.9784, + "learning_rate": 1.563814377303429e-05, + "loss": 1.0732, + "step": 2446 + }, + { + "epoch": 0.9792, + "learning_rate": 1.561505829754715e-05, + "loss": 0.3094, + "step": 2448 + }, + { + "epoch": 0.98, + "learning_rate": 1.5591929034707475e-05, + "loss": 0.5814, + "step": 2450 + }, + { + "epoch": 0.9808, + "learning_rate": 1.5568756164881874e-05, + "loss": 0.7261, + "step": 2452 + }, + { + "epoch": 0.9816, + "learning_rate": 1.5545539868777085e-05, + "loss": 0.3577, + "step": 2454 + }, + { + "epoch": 0.9824, + "learning_rate": 1.5522280327438384e-05, + "loss": 0.575, + "step": 2456 + }, + { + "epoch": 0.9832, + "learning_rate": 1.5498977722248398e-05, + "loss": 0.389, + "step": 2458 + }, + { + "epoch": 0.984, + "learning_rate": 1.547563223492552e-05, + "loss": 0.3828, + "step": 2460 + }, + { + "epoch": 0.9848, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.416, + "step": 2462 + }, + { + "epoch": 0.9856, + "learning_rate": 1.5428813342425194e-05, + "loss": 0.2143, + "step": 2464 + }, + { + "epoch": 0.9864, + "learning_rate": 1.5405340302350876e-05, + "loss": 2.1062, + "step": 2466 + }, + { + "epoch": 0.9872, + "learning_rate": 1.538182511034708e-05, + "loss": 0.2473, + "step": 2468 + }, + { + "epoch": 0.988, + "learning_rate": 1.535826794978996e-05, + "loss": 0.4548, + "step": 2470 + }, + { + "epoch": 0.9888, + "learning_rate": 1.5334669004383036e-05, + "loss": 0.2232, + "step": 2472 + }, + { + "epoch": 0.9896, + "learning_rate": 1.5311028458155564e-05, + "loss": 0.9489, + "step": 2474 + }, + { + "epoch": 0.9904, + "learning_rate": 1.528734649546133e-05, + "loss": 0.3381, + "step": 2476 + }, + { + "epoch": 0.9912, + "learning_rate": 1.5263623300976997e-05, + "loss": 0.242, + "step": 2478 + }, + { + "epoch": 0.992, + "learning_rate": 1.5239859059700792e-05, + "loss": 0.2199, + "step": 2480 + }, + { + "epoch": 0.9928, + "learning_rate": 1.5216053956951096e-05, + "loss": 0.4118, + "step": 2482 + }, + { + "epoch": 0.9936, + "learning_rate": 1.5192208178364819e-05, + "loss": 0.5076, + "step": 2484 + }, + { + "epoch": 0.9944, + "learning_rate": 1.5168321909896176e-05, + "loss": 0.674, + "step": 2486 + }, + { + "epoch": 0.9952, + "learning_rate": 1.5144395337815057e-05, + "loss": 0.3767, + "step": 2488 + }, + { + "epoch": 0.996, + "learning_rate": 1.5120428648705722e-05, + "loss": 0.3037, + "step": 2490 + }, + { + "epoch": 0.9968, + "learning_rate": 1.5096422029465171e-05, + "loss": 0.5018, + "step": 2492 + }, + { + "epoch": 0.9976, + "learning_rate": 1.5072375667301904e-05, + "loss": 0.2759, + "step": 2494 + }, + { + "epoch": 0.9984, + "learning_rate": 1.5048289749734231e-05, + "loss": 0.6072, + "step": 2496 + }, + { + "epoch": 0.9992, + "learning_rate": 1.502416446458898e-05, + "loss": 0.4806, + "step": 2498 + }, + { + "epoch": 1.0, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.4471, + "step": 2500 + }, + { + "epoch": 1.0, + "step": 2500, + "total_flos": 1.0187878155419648e+16, + "train_loss": 0.4481667988538742, + "train_runtime": 9740.388, + "train_samples_per_second": 4.107, + "train_steps_per_second": 0.257 + } + ], + "logging_steps": 2, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1.0187878155419648e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc8c606efa3dc83be73fc1bc842e85e0d50fc688 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cefb388e62db0f102bc8b3b07faa4bd03757806ea8c4a946b7766a58ec3f275 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..b5ec447be6d06f922fb59f46b80be2973eb731cb --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38a9edfb4ae1d60252c71d4b2dad35daceda72c74a6392ecd60ef1191bb0ed75 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..e11f922f0957ff149427913d186c8f18d6d2be13 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7829d53ed9665c44a533575b99de834c30a485dbf3884358ef08d16105fb3dc3 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3afa20107a99f382b0d194aeb3d1d0a82bf46b1d --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f2b6519be27579b04323d53659c4ed33db74d193821699e1696251455080e82 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..26015c2a40abc231ed285598eaef55e0e79147ae --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,7532 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "learning_rate": 2.357535430610912e-06, + "loss": 0.0114, + "step": 2 + }, + { + "epoch": 0.0016, + "learning_rate": 2.3755748898855234e-06, + "loss": 0.0538, + "step": 4 + }, + { + "epoch": 0.0024, + "learning_rate": 2.3936738059587174e-06, + "loss": 0.1818, + "step": 6 + }, + { + "epoch": 0.0032, + "learning_rate": 2.411832037691545e-06, + "loss": 0.5497, + "step": 8 + }, + { + "epoch": 0.004, + "learning_rate": 2.430049443482434e-06, + "loss": 0.0445, + "step": 10 + }, + { + "epoch": 0.0048, + "learning_rate": 2.448325881268406e-06, + "loss": 0.2739, + "step": 12 + }, + { + "epoch": 0.0056, + "learning_rate": 2.4666612085261277e-06, + "loss": 0.0153, + "step": 14 + }, + { + "epoch": 0.0064, + "learning_rate": 2.4850552822730346e-06, + "loss": 0.025, + "step": 16 + }, + { + "epoch": 0.0072, + "learning_rate": 2.503507959068455e-06, + "loss": 0.5164, + "step": 18 + }, + { + "epoch": 0.008, + "learning_rate": 2.522019095014686e-06, + "loss": 0.0066, + "step": 20 + }, + { + "epoch": 0.0088, + "learning_rate": 2.5405885457581814e-06, + "loss": 0.0245, + "step": 22 + }, + { + "epoch": 0.0096, + "learning_rate": 2.5592161664906243e-06, + "loss": 0.5499, + "step": 24 + }, + { + "epoch": 0.0104, + "learning_rate": 2.5779018119501086e-06, + "loss": 0.0008, + "step": 26 + }, + { + "epoch": 0.0112, + "learning_rate": 2.596645336422219e-06, + "loss": 1.0019, + "step": 28 + }, + { + "epoch": 0.012, + "learning_rate": 2.615446593741161e-06, + "loss": 0.0006, + "step": 30 + }, + { + "epoch": 0.0128, + "learning_rate": 2.6343054372909648e-06, + "loss": 0.0102, + "step": 32 + }, + { + "epoch": 0.0136, + "learning_rate": 2.6532217200065826e-06, + "loss": 0.0069, + "step": 34 + }, + { + "epoch": 0.0144, + "learning_rate": 2.6721952943750396e-06, + "loss": 0.0537, + "step": 36 + }, + { + "epoch": 0.0152, + "learning_rate": 2.691226012436604e-06, + "loss": 0.0036, + "step": 38 + }, + { + "epoch": 0.016, + "learning_rate": 2.7103137257858893e-06, + "loss": 1.0076, + "step": 40 + }, + { + "epoch": 0.0168, + "learning_rate": 2.7294582855730733e-06, + "loss": 0.0316, + "step": 42 + }, + { + "epoch": 0.0176, + "learning_rate": 2.7486595425050566e-06, + "loss": 0.0013, + "step": 44 + }, + { + "epoch": 0.0184, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.4162, + "step": 46 + }, + { + "epoch": 0.0192, + "learning_rate": 2.7872315484213954e-06, + "loss": 0.3843, + "step": 48 + }, + { + "epoch": 0.02, + "learning_rate": 2.8066019966134873e-06, + "loss": 0.0093, + "step": 50 + }, + { + "epoch": 0.0208, + "learning_rate": 2.826028540368212e-06, + "loss": 0.4165, + "step": 52 + }, + { + "epoch": 0.0216, + "learning_rate": 2.845511028193477e-06, + "loss": 0.204, + "step": 54 + }, + { + "epoch": 0.0224, + "learning_rate": 2.865049308160931e-06, + "loss": 0.0037, + "step": 56 + }, + { + "epoch": 0.0232, + "learning_rate": 2.8846432279071533e-06, + "loss": 0.0029, + "step": 58 + }, + { + "epoch": 0.024, + "learning_rate": 2.9042926346347835e-06, + "loss": 0.334, + "step": 60 + }, + { + "epoch": 0.0248, + "learning_rate": 2.9239973751138397e-06, + "loss": 0.1269, + "step": 62 + }, + { + "epoch": 0.0256, + "learning_rate": 2.943757295682783e-06, + "loss": 0.011, + "step": 64 + }, + { + "epoch": 0.0264, + "learning_rate": 2.9635722422497983e-06, + "loss": 0.0114, + "step": 66 + }, + { + "epoch": 0.0272, + "learning_rate": 2.983442060293926e-06, + "loss": 0.0149, + "step": 68 + }, + { + "epoch": 0.028, + "learning_rate": 3.003366594866345e-06, + "loss": 0.0478, + "step": 70 + }, + { + "epoch": 0.0288, + "learning_rate": 3.0233456905915338e-06, + "loss": 0.3687, + "step": 72 + }, + { + "epoch": 0.0296, + "learning_rate": 3.0433791916684885e-06, + "loss": 0.0143, + "step": 74 + }, + { + "epoch": 0.0304, + "learning_rate": 3.0634669418719453e-06, + "loss": 0.0017, + "step": 76 + }, + { + "epoch": 0.0312, + "learning_rate": 3.0836087845535933e-06, + "loss": 0.1096, + "step": 78 + }, + { + "epoch": 0.032, + "learning_rate": 3.1038045626432945e-06, + "loss": 0.3132, + "step": 80 + }, + { + "epoch": 0.0328, + "learning_rate": 3.1240541186503173e-06, + "loss": 0.0037, + "step": 82 + }, + { + "epoch": 0.0336, + "learning_rate": 3.1443572946645683e-06, + "loss": 0.1466, + "step": 84 + }, + { + "epoch": 0.0344, + "learning_rate": 3.164713932357776e-06, + "loss": 0.0059, + "step": 86 + }, + { + "epoch": 0.0352, + "learning_rate": 3.1851238729848033e-06, + "loss": 0.0342, + "step": 88 + }, + { + "epoch": 0.036, + "learning_rate": 3.205586957384834e-06, + "loss": 0.0024, + "step": 90 + }, + { + "epoch": 0.0368, + "learning_rate": 3.2261030259826253e-06, + "loss": 0.0054, + "step": 92 + }, + { + "epoch": 0.0376, + "learning_rate": 3.246671918789752e-06, + "loss": 0.0084, + "step": 94 + }, + { + "epoch": 0.0384, + "learning_rate": 3.267293475405858e-06, + "loss": 0.0054, + "step": 96 + }, + { + "epoch": 0.0392, + "learning_rate": 3.2879675350199004e-06, + "loss": 0.0032, + "step": 98 + }, + { + "epoch": 0.04, + "learning_rate": 3.3086939364114113e-06, + "loss": 0.0102, + "step": 100 + }, + { + "epoch": 0.0408, + "learning_rate": 3.329472517951747e-06, + "loss": 0.0022, + "step": 102 + }, + { + "epoch": 0.0416, + "learning_rate": 3.350303117605369e-06, + "loss": 0.0495, + "step": 104 + }, + { + "epoch": 0.0424, + "learning_rate": 3.3711855729310503e-06, + "loss": 0.036, + "step": 106 + }, + { + "epoch": 0.0432, + "learning_rate": 3.3921197210832235e-06, + "loss": 0.3466, + "step": 108 + }, + { + "epoch": 0.044, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.1809, + "step": 110 + }, + { + "epoch": 0.0448, + "learning_rate": 3.434142442470434e-06, + "loss": 0.0235, + "step": 112 + }, + { + "epoch": 0.0456, + "learning_rate": 3.455230688003849e-06, + "loss": 0.0472, + "step": 114 + }, + { + "epoch": 0.0464, + "learning_rate": 3.476369970963065e-06, + "loss": 0.004, + "step": 116 + }, + { + "epoch": 0.0472, + "learning_rate": 3.497560126499706e-06, + "loss": 0.0425, + "step": 118 + }, + { + "epoch": 0.048, + "learning_rate": 3.5188009893686836e-06, + "loss": 0.0014, + "step": 120 + }, + { + "epoch": 0.0488, + "learning_rate": 3.5400923939294827e-06, + "loss": 0.0007, + "step": 122 + }, + { + "epoch": 0.0496, + "learning_rate": 3.5614341741474667e-06, + "loss": 0.1245, + "step": 124 + }, + { + "epoch": 0.0504, + "learning_rate": 3.5828261635951177e-06, + "loss": 0.0381, + "step": 126 + }, + { + "epoch": 0.0512, + "learning_rate": 3.604268195453421e-06, + "loss": 0.119, + "step": 128 + }, + { + "epoch": 0.052, + "learning_rate": 3.6257601025130893e-06, + "loss": 0.0039, + "step": 130 + }, + { + "epoch": 0.0528, + "learning_rate": 3.647301717175955e-06, + "loss": 0.0322, + "step": 132 + }, + { + "epoch": 0.0536, + "learning_rate": 3.66889287145614e-06, + "loss": 0.0574, + "step": 134 + }, + { + "epoch": 0.0544, + "learning_rate": 3.6905333969814995e-06, + "loss": 0.0746, + "step": 136 + }, + { + "epoch": 0.0552, + "learning_rate": 3.712223124994867e-06, + "loss": 0.0444, + "step": 138 + }, + { + "epoch": 0.056, + "learning_rate": 3.7339618863553885e-06, + "loss": 0.0497, + "step": 140 + }, + { + "epoch": 0.0568, + "learning_rate": 3.755749511539848e-06, + "loss": 0.001, + "step": 142 + }, + { + "epoch": 0.0576, + "learning_rate": 3.7775858306439404e-06, + "loss": 0.1217, + "step": 144 + }, + { + "epoch": 0.0584, + "learning_rate": 3.799470673383677e-06, + "loss": 0.1533, + "step": 146 + }, + { + "epoch": 0.0592, + "learning_rate": 3.821403869096644e-06, + "loss": 0.0017, + "step": 148 + }, + { + "epoch": 0.06, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.0614, + "step": 150 + }, + { + "epoch": 0.0608, + "learning_rate": 3.865414634908756e-06, + "loss": 0.0003, + "step": 152 + }, + { + "epoch": 0.0616, + "learning_rate": 3.887491861803081e-06, + "loss": 0.0377, + "step": 154 + }, + { + "epoch": 0.0624, + "learning_rate": 3.909616755263741e-06, + "loss": 0.027, + "step": 156 + }, + { + "epoch": 0.0632, + "learning_rate": 3.9317891427563725e-06, + "loss": 0.6366, + "step": 158 + }, + { + "epoch": 0.064, + "learning_rate": 3.954008851376244e-06, + "loss": 0.0048, + "step": 160 + }, + { + "epoch": 0.0648, + "learning_rate": 3.976275707849619e-06, + "loss": 0.0695, + "step": 162 + }, + { + "epoch": 0.0656, + "learning_rate": 3.99858953853505e-06, + "loss": 0.0022, + "step": 164 + }, + { + "epoch": 0.0664, + "learning_rate": 4.0209501694248e-06, + "loss": 0.0024, + "step": 166 + }, + { + "epoch": 0.0672, + "learning_rate": 4.043357426146209e-06, + "loss": 0.0038, + "step": 168 + }, + { + "epoch": 0.068, + "learning_rate": 4.065811133962987e-06, + "loss": 0.2259, + "step": 170 + }, + { + "epoch": 0.0688, + "learning_rate": 4.08831111777658e-06, + "loss": 0.0097, + "step": 172 + }, + { + "epoch": 0.0696, + "learning_rate": 4.110857202127611e-06, + "loss": 0.0053, + "step": 174 + }, + { + "epoch": 0.0704, + "learning_rate": 4.133449211197183e-06, + "loss": 0.0052, + "step": 176 + }, + { + "epoch": 0.0712, + "learning_rate": 4.156086968808274e-06, + "loss": 0.0007, + "step": 178 + }, + { + "epoch": 0.072, + "learning_rate": 4.178770298427114e-06, + "loss": 0.0012, + "step": 180 + }, + { + "epoch": 0.0728, + "learning_rate": 4.201499023164515e-06, + "loss": 0.0042, + "step": 182 + }, + { + "epoch": 0.0736, + "learning_rate": 4.224272965777315e-06, + "loss": 0.0049, + "step": 184 + }, + { + "epoch": 0.0744, + "learning_rate": 4.247091948669764e-06, + "loss": 0.143, + "step": 186 + }, + { + "epoch": 0.0752, + "learning_rate": 4.269955793894849e-06, + "loss": 0.0142, + "step": 188 + }, + { + "epoch": 0.076, + "learning_rate": 4.292864323155684e-06, + "loss": 0.014, + "step": 190 + }, + { + "epoch": 0.0768, + "learning_rate": 4.3158173578069696e-06, + "loss": 0.0904, + "step": 192 + }, + { + "epoch": 0.0776, + "learning_rate": 4.338814718856333e-06, + "loss": 0.0013, + "step": 194 + }, + { + "epoch": 0.0784, + "learning_rate": 4.3618562269657285e-06, + "loss": 0.0035, + "step": 196 + }, + { + "epoch": 0.0792, + "learning_rate": 4.384941702452852e-06, + "loss": 0.0274, + "step": 198 + }, + { + "epoch": 0.08, + "learning_rate": 4.408070965292526e-06, + "loss": 0.0032, + "step": 200 + }, + { + "epoch": 0.0808, + "learning_rate": 4.431243835118112e-06, + "loss": 0.0075, + "step": 202 + }, + { + "epoch": 0.0816, + "learning_rate": 4.4544601312229185e-06, + "loss": 0.8027, + "step": 204 + }, + { + "epoch": 0.0824, + "learning_rate": 4.477719672561602e-06, + "loss": 0.0012, + "step": 206 + }, + { + "epoch": 0.0832, + "learning_rate": 4.501022277751605e-06, + "loss": 0.0023, + "step": 208 + }, + { + "epoch": 0.084, + "learning_rate": 4.524367765074499e-06, + "loss": 0.0265, + "step": 210 + }, + { + "epoch": 0.0848, + "learning_rate": 4.5477559524775e-06, + "loss": 0.2539, + "step": 212 + }, + { + "epoch": 0.0856, + "learning_rate": 4.571186657574823e-06, + "loss": 0.0004, + "step": 214 + }, + { + "epoch": 0.0864, + "learning_rate": 4.5946596976491254e-06, + "loss": 0.0025, + "step": 216 + }, + { + "epoch": 0.0872, + "learning_rate": 4.618174889652924e-06, + "loss": 0.0023, + "step": 218 + }, + { + "epoch": 0.088, + "learning_rate": 4.6417320502100286e-06, + "loss": 0.0594, + "step": 220 + }, + { + "epoch": 0.0888, + "learning_rate": 4.665330995616967e-06, + "loss": 0.0184, + "step": 222 + }, + { + "epoch": 0.0896, + "learning_rate": 4.688971541844424e-06, + "loss": 0.001, + "step": 224 + }, + { + "epoch": 0.0904, + "learning_rate": 4.712653504538672e-06, + "loss": 0.0034, + "step": 226 + }, + { + "epoch": 0.0912, + "learning_rate": 4.736376699023023e-06, + "loss": 0.034, + "step": 228 + }, + { + "epoch": 0.092, + "learning_rate": 4.76014094029921e-06, + "loss": 0.008, + "step": 230 + }, + { + "epoch": 0.0928, + "learning_rate": 4.7839460430489216e-06, + "loss": 0.012, + "step": 232 + }, + { + "epoch": 0.0936, + "learning_rate": 4.807791821635185e-06, + "loss": 0.0023, + "step": 234 + }, + { + "epoch": 0.0944, + "learning_rate": 4.831678090103828e-06, + "loss": 0.3672, + "step": 236 + }, + { + "epoch": 0.0952, + "learning_rate": 4.855604662184931e-06, + "loss": 0.0014, + "step": 238 + }, + { + "epoch": 0.096, + "learning_rate": 4.8795713512942785e-06, + "loss": 0.0039, + "step": 240 + }, + { + "epoch": 0.0968, + "learning_rate": 4.903577970534815e-06, + "loss": 0.0008, + "step": 242 + }, + { + "epoch": 0.0976, + "learning_rate": 4.9276243326981e-06, + "loss": 0.0046, + "step": 244 + }, + { + "epoch": 0.0984, + "learning_rate": 4.951710250265788e-06, + "loss": 0.0463, + "step": 246 + }, + { + "epoch": 0.0992, + "learning_rate": 4.975835535411023e-06, + "loss": 0.004, + "step": 248 + }, + { + "epoch": 0.1, + "learning_rate": 5.000000000000003e-06, + "loss": 0.0037, + "step": 250 + }, + { + "epoch": 0.1008, + "learning_rate": 5.024203455593375e-06, + "loss": 0.0231, + "step": 252 + }, + { + "epoch": 0.1016, + "learning_rate": 5.048445713447734e-06, + "loss": 0.0795, + "step": 254 + }, + { + "epoch": 0.1024, + "learning_rate": 5.072726584517083e-06, + "loss": 0.7435, + "step": 256 + }, + { + "epoch": 0.1032, + "learning_rate": 5.097045879454308e-06, + "loss": 0.0049, + "step": 258 + }, + { + "epoch": 0.104, + "learning_rate": 5.1214034086126685e-06, + "loss": 0.0009, + "step": 260 + }, + { + "epoch": 0.1048, + "learning_rate": 5.145798982047253e-06, + "loss": 0.0047, + "step": 262 + }, + { + "epoch": 0.1056, + "learning_rate": 5.170232409516483e-06, + "loss": 0.0286, + "step": 264 + }, + { + "epoch": 0.1064, + "learning_rate": 5.194703500483597e-06, + "loss": 0.0018, + "step": 266 + }, + { + "epoch": 0.1072, + "learning_rate": 5.219212064118082e-06, + "loss": 0.0011, + "step": 268 + }, + { + "epoch": 0.108, + "learning_rate": 5.24375790929725e-06, + "loss": 0.0004, + "step": 270 + }, + { + "epoch": 0.1088, + "learning_rate": 5.268340844607653e-06, + "loss": 0.0462, + "step": 272 + }, + { + "epoch": 0.1096, + "learning_rate": 5.2929606783466735e-06, + "loss": 0.0012, + "step": 274 + }, + { + "epoch": 0.1104, + "learning_rate": 5.317617218523853e-06, + "loss": 0.0096, + "step": 276 + }, + { + "epoch": 0.1112, + "learning_rate": 5.342310272862553e-06, + "loss": 0.0264, + "step": 278 + }, + { + "epoch": 0.112, + "learning_rate": 5.367039648801377e-06, + "loss": 0.0003, + "step": 280 + }, + { + "epoch": 0.1128, + "learning_rate": 5.391805153495684e-06, + "loss": 0.0022, + "step": 282 + }, + { + "epoch": 0.1136, + "learning_rate": 5.416606593819109e-06, + "loss": 0.0209, + "step": 284 + }, + { + "epoch": 0.1144, + "learning_rate": 5.441443776365005e-06, + "loss": 0.4095, + "step": 286 + }, + { + "epoch": 0.1152, + "learning_rate": 5.466316507448053e-06, + "loss": 0.0015, + "step": 288 + }, + { + "epoch": 0.116, + "learning_rate": 5.49122459310568e-06, + "loss": 0.0059, + "step": 290 + }, + { + "epoch": 0.1168, + "learning_rate": 5.516167839099662e-06, + "loss": 0.0031, + "step": 292 + }, + { + "epoch": 0.1176, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.0229, + "step": 294 + }, + { + "epoch": 0.1184, + "learning_rate": 5.5661590337742255e-06, + "loss": 0.0019, + "step": 296 + }, + { + "epoch": 0.1192, + "learning_rate": 5.591206592613412e-06, + "loss": 0.0674, + "step": 298 + }, + { + "epoch": 0.12, + "learning_rate": 5.616288532109221e-06, + "loss": 0.0055, + "step": 300 + }, + { + "epoch": 0.1208, + "learning_rate": 5.641404656667652e-06, + "loss": 0.0002, + "step": 302 + }, + { + "epoch": 0.1216, + "learning_rate": 5.666554770428136e-06, + "loss": 0.0066, + "step": 304 + }, + { + "epoch": 0.1224, + "learning_rate": 5.6917386772650015e-06, + "loss": 0.0032, + "step": 306 + }, + { + "epoch": 0.1232, + "learning_rate": 5.716956180789086e-06, + "loss": 0.0002, + "step": 308 + }, + { + "epoch": 0.124, + "learning_rate": 5.74220708434926e-06, + "loss": 0.0293, + "step": 310 + }, + { + "epoch": 0.1248, + "learning_rate": 5.767491191033909e-06, + "loss": 0.0051, + "step": 312 + }, + { + "epoch": 0.1256, + "learning_rate": 5.7928083036724535e-06, + "loss": 0.278, + "step": 314 + }, + { + "epoch": 0.1264, + "learning_rate": 5.818158224836983e-06, + "loss": 0.0008, + "step": 316 + }, + { + "epoch": 0.1272, + "learning_rate": 5.8435407568437194e-06, + "loss": 0.0324, + "step": 318 + }, + { + "epoch": 0.128, + "learning_rate": 5.868955701754577e-06, + "loss": 0.0127, + "step": 320 + }, + { + "epoch": 0.1288, + "learning_rate": 5.894402861378714e-06, + "loss": 0.0164, + "step": 322 + }, + { + "epoch": 0.1296, + "learning_rate": 5.919882037274065e-06, + "loss": 0.8316, + "step": 324 + }, + { + "epoch": 0.1304, + "learning_rate": 5.9453930307488985e-06, + "loss": 0.0853, + "step": 326 + }, + { + "epoch": 0.1312, + "learning_rate": 5.970935642863362e-06, + "loss": 0.0004, + "step": 328 + }, + { + "epoch": 0.132, + "learning_rate": 5.996509674431038e-06, + "loss": 0.0261, + "step": 330 + }, + { + "epoch": 0.1328, + "learning_rate": 6.022114926020505e-06, + "loss": 0.7622, + "step": 332 + }, + { + "epoch": 0.1336, + "learning_rate": 6.047751197956836e-06, + "loss": 0.0873, + "step": 334 + }, + { + "epoch": 0.1344, + "learning_rate": 6.0734182903232475e-06, + "loss": 0.0004, + "step": 336 + }, + { + "epoch": 0.1352, + "learning_rate": 6.0991160029626e-06, + "loss": 0.0301, + "step": 338 + }, + { + "epoch": 0.136, + "learning_rate": 6.124844135478966e-06, + "loss": 0.0036, + "step": 340 + }, + { + "epoch": 0.1368, + "learning_rate": 6.1506024872392e-06, + "loss": 0.0004, + "step": 342 + }, + { + "epoch": 0.1376, + "learning_rate": 6.176390857374501e-06, + "loss": 0.0043, + "step": 344 + }, + { + "epoch": 0.1384, + "learning_rate": 6.202209044781979e-06, + "loss": 0.0001, + "step": 346 + }, + { + "epoch": 0.1392, + "learning_rate": 6.228056848126223e-06, + "loss": 0.0667, + "step": 348 + }, + { + "epoch": 0.14, + "learning_rate": 6.253934065840883e-06, + "loss": 0.0061, + "step": 350 + }, + { + "epoch": 0.1408, + "learning_rate": 6.279840496130188e-06, + "loss": 0.0089, + "step": 352 + }, + { + "epoch": 0.1416, + "learning_rate": 6.305775936970606e-06, + "loss": 0.0907, + "step": 354 + }, + { + "epoch": 0.1424, + "learning_rate": 6.331740186112359e-06, + "loss": 0.0035, + "step": 356 + }, + { + "epoch": 0.1432, + "learning_rate": 6.357733041081015e-06, + "loss": 0.5141, + "step": 358 + }, + { + "epoch": 0.144, + "learning_rate": 6.383754299179072e-06, + "loss": 0.1489, + "step": 360 + }, + { + "epoch": 0.1448, + "learning_rate": 6.409803757487532e-06, + "loss": 0.0288, + "step": 362 + }, + { + "epoch": 0.1456, + "learning_rate": 6.435881212867485e-06, + "loss": 0.1001, + "step": 364 + }, + { + "epoch": 0.1464, + "learning_rate": 6.4619864619616975e-06, + "loss": 0.0039, + "step": 366 + }, + { + "epoch": 0.1472, + "learning_rate": 6.48811930119619e-06, + "loss": 0.2385, + "step": 368 + }, + { + "epoch": 0.148, + "learning_rate": 6.514279526781853e-06, + "loss": 0.0015, + "step": 370 + }, + { + "epoch": 0.1488, + "learning_rate": 6.540466934715955e-06, + "loss": 0.0478, + "step": 372 + }, + { + "epoch": 0.1496, + "learning_rate": 6.566681320783848e-06, + "loss": 0.0036, + "step": 374 + }, + { + "epoch": 0.1504, + "learning_rate": 6.592922480560483e-06, + "loss": 0.0085, + "step": 376 + }, + { + "epoch": 0.1512, + "learning_rate": 6.619190209412025e-06, + "loss": 0.0015, + "step": 378 + }, + { + "epoch": 0.152, + "learning_rate": 6.6454843024974465e-06, + "loss": 0.0013, + "step": 380 + }, + { + "epoch": 0.1528, + "learning_rate": 6.671804554770128e-06, + "loss": 0.0002, + "step": 382 + }, + { + "epoch": 0.1536, + "learning_rate": 6.698150760979456e-06, + "loss": 0.0011, + "step": 384 + }, + { + "epoch": 0.1544, + "learning_rate": 6.724522715672421e-06, + "loss": 0.0012, + "step": 386 + }, + { + "epoch": 0.1552, + "learning_rate": 6.750920213195242e-06, + "loss": 0.3655, + "step": 388 + }, + { + "epoch": 0.156, + "learning_rate": 6.777343047694894e-06, + "loss": 0.0018, + "step": 390 + }, + { + "epoch": 0.1568, + "learning_rate": 6.803791013120824e-06, + "loss": 0.2722, + "step": 392 + }, + { + "epoch": 0.1576, + "learning_rate": 6.8302639032264836e-06, + "loss": 0.0345, + "step": 394 + }, + { + "epoch": 0.1584, + "learning_rate": 6.856761511570944e-06, + "loss": 0.0025, + "step": 396 + }, + { + "epoch": 0.1592, + "learning_rate": 6.883283631520579e-06, + "loss": 0.2781, + "step": 398 + }, + { + "epoch": 0.16, + "learning_rate": 6.909830056250522e-06, + "loss": 0.006, + "step": 400 + }, + { + "epoch": 0.1608, + "learning_rate": 6.936400578746436e-06, + "loss": 0.0012, + "step": 402 + }, + { + "epoch": 0.1616, + "learning_rate": 6.96299499180605e-06, + "loss": 0.0173, + "step": 404 + }, + { + "epoch": 0.1624, + "learning_rate": 6.989613088040787e-06, + "loss": 0.0631, + "step": 406 + }, + { + "epoch": 0.1632, + "learning_rate": 7.016254659877404e-06, + "loss": 0.0364, + "step": 408 + }, + { + "epoch": 0.164, + "learning_rate": 7.042919499559539e-06, + "loss": 0.0771, + "step": 410 + }, + { + "epoch": 0.1648, + "learning_rate": 7.06960739914943e-06, + "loss": 0.0028, + "step": 412 + }, + { + "epoch": 0.1656, + "learning_rate": 7.09631815052946e-06, + "loss": 0.0001, + "step": 414 + }, + { + "epoch": 0.1664, + "learning_rate": 7.123051545403873e-06, + "loss": 0.0004, + "step": 416 + }, + { + "epoch": 0.1672, + "learning_rate": 7.1498073753002375e-06, + "loss": 0.3526, + "step": 418 + }, + { + "epoch": 0.168, + "learning_rate": 7.1765854315712325e-06, + "loss": 0.0049, + "step": 420 + }, + { + "epoch": 0.1688, + "learning_rate": 7.203385505396197e-06, + "loss": 0.001, + "step": 422 + }, + { + "epoch": 0.1696, + "learning_rate": 7.230207387782771e-06, + "loss": 0.3729, + "step": 424 + }, + { + "epoch": 0.1704, + "learning_rate": 7.257050869568527e-06, + "loss": 0.0225, + "step": 426 + }, + { + "epoch": 0.1712, + "learning_rate": 7.28391574142262e-06, + "loss": 0.0008, + "step": 428 + }, + { + "epoch": 0.172, + "learning_rate": 7.3108017938473485e-06, + "loss": 0.0008, + "step": 430 + }, + { + "epoch": 0.1728, + "learning_rate": 7.337708817179875e-06, + "loss": 0.0014, + "step": 432 + }, + { + "epoch": 0.1736, + "learning_rate": 7.36463660159386e-06, + "loss": 0.0002, + "step": 434 + }, + { + "epoch": 0.1744, + "learning_rate": 7.39158493710103e-06, + "loss": 0.0004, + "step": 436 + }, + { + "epoch": 0.1752, + "learning_rate": 7.418553613552822e-06, + "loss": 0.0077, + "step": 438 + }, + { + "epoch": 0.176, + "learning_rate": 7.445542420642091e-06, + "loss": 0.4597, + "step": 440 + }, + { + "epoch": 0.1768, + "learning_rate": 7.472551147904703e-06, + "loss": 0.1823, + "step": 442 + }, + { + "epoch": 0.1776, + "learning_rate": 7.499579584721173e-06, + "loss": 0.0015, + "step": 444 + }, + { + "epoch": 0.1784, + "learning_rate": 7.5266275203183395e-06, + "loss": 0.0297, + "step": 446 + }, + { + "epoch": 0.1792, + "learning_rate": 7.553694743770917e-06, + "loss": 0.0674, + "step": 448 + }, + { + "epoch": 0.18, + "learning_rate": 7.580781044003312e-06, + "loss": 0.0713, + "step": 450 + }, + { + "epoch": 0.1808, + "learning_rate": 7.607886209791095e-06, + "loss": 0.001, + "step": 452 + }, + { + "epoch": 0.1816, + "learning_rate": 7.635010029762755e-06, + "loss": 0.0012, + "step": 454 + }, + { + "epoch": 0.1824, + "learning_rate": 7.662152292401265e-06, + "loss": 0.0021, + "step": 456 + }, + { + "epoch": 0.1832, + "learning_rate": 7.689312786045822e-06, + "loss": 0.0148, + "step": 458 + }, + { + "epoch": 0.184, + "learning_rate": 7.716491298893441e-06, + "loss": 0.0057, + "step": 460 + }, + { + "epoch": 0.1848, + "learning_rate": 7.74368761900062e-06, + "loss": 0.0339, + "step": 462 + }, + { + "epoch": 0.1856, + "learning_rate": 7.770901534284991e-06, + "loss": 0.003, + "step": 464 + }, + { + "epoch": 0.1864, + "learning_rate": 7.798132832526976e-06, + "loss": 0.0326, + "step": 466 + }, + { + "epoch": 0.1872, + "learning_rate": 7.825381301371444e-06, + "loss": 0.0095, + "step": 468 + }, + { + "epoch": 0.188, + "learning_rate": 7.852646728329358e-06, + "loss": 0.0037, + "step": 470 + }, + { + "epoch": 0.1888, + "learning_rate": 7.879928900779441e-06, + "loss": 0.0264, + "step": 472 + }, + { + "epoch": 0.1896, + "learning_rate": 7.907227605969852e-06, + "loss": 0.0059, + "step": 474 + }, + { + "epoch": 0.1904, + "learning_rate": 7.934542631019767e-06, + "loss": 0.014, + "step": 476 + }, + { + "epoch": 0.1912, + "learning_rate": 7.961873762921153e-06, + "loss": 0.0004, + "step": 478 + }, + { + "epoch": 0.192, + "learning_rate": 7.989220788540351e-06, + "loss": 0.0831, + "step": 480 + }, + { + "epoch": 0.1928, + "learning_rate": 8.016583494619764e-06, + "loss": 0.0018, + "step": 482 + }, + { + "epoch": 0.1936, + "learning_rate": 8.043961667779511e-06, + "loss": 0.1637, + "step": 484 + }, + { + "epoch": 0.1944, + "learning_rate": 8.071355094519103e-06, + "loss": 0.6775, + "step": 486 + }, + { + "epoch": 0.1952, + "learning_rate": 8.098763561219089e-06, + "loss": 0.0003, + "step": 488 + }, + { + "epoch": 0.196, + "learning_rate": 8.126186854142744e-06, + "loss": 0.0012, + "step": 490 + }, + { + "epoch": 0.1968, + "learning_rate": 8.153624759437718e-06, + "loss": 0.0007, + "step": 492 + }, + { + "epoch": 0.1976, + "learning_rate": 8.181077063137735e-06, + "loss": 0.9133, + "step": 494 + }, + { + "epoch": 0.1984, + "learning_rate": 8.208543551164178e-06, + "loss": 0.0029, + "step": 496 + }, + { + "epoch": 0.1992, + "learning_rate": 8.236024009327877e-06, + "loss": 0.0982, + "step": 498 + }, + { + "epoch": 0.2, + "learning_rate": 8.263518223330695e-06, + "loss": 0.0166, + "step": 500 + }, + { + "epoch": 0.2008, + "learning_rate": 8.29102597876723e-06, + "loss": 0.0606, + "step": 502 + }, + { + "epoch": 0.2016, + "learning_rate": 8.31854706112648e-06, + "loss": 0.0004, + "step": 504 + }, + { + "epoch": 0.2024, + "learning_rate": 8.346081255793516e-06, + "loss": 0.001, + "step": 506 + }, + { + "epoch": 0.2032, + "learning_rate": 8.373628348051156e-06, + "loss": 0.0188, + "step": 508 + }, + { + "epoch": 0.204, + "learning_rate": 8.401188123081642e-06, + "loss": 0.0079, + "step": 510 + }, + { + "epoch": 0.2048, + "learning_rate": 8.428760365968329e-06, + "loss": 0.003, + "step": 512 + }, + { + "epoch": 0.2056, + "learning_rate": 8.456344861697293e-06, + "loss": 0.0823, + "step": 514 + }, + { + "epoch": 0.2064, + "learning_rate": 8.483941395159114e-06, + "loss": 0.043, + "step": 516 + }, + { + "epoch": 0.2072, + "learning_rate": 8.511549751150478e-06, + "loss": 0.0042, + "step": 518 + }, + { + "epoch": 0.208, + "learning_rate": 8.539169714375883e-06, + "loss": 0.0027, + "step": 520 + }, + { + "epoch": 0.2088, + "learning_rate": 8.566801069449304e-06, + "loss": 0.9017, + "step": 522 + }, + { + "epoch": 0.2096, + "learning_rate": 8.594443600895886e-06, + "loss": 0.0148, + "step": 524 + }, + { + "epoch": 0.2104, + "learning_rate": 8.622097093153612e-06, + "loss": 0.0513, + "step": 526 + }, + { + "epoch": 0.2112, + "learning_rate": 8.649761330575e-06, + "loss": 0.6009, + "step": 528 + }, + { + "epoch": 0.212, + "learning_rate": 8.677436097428766e-06, + "loss": 0.0026, + "step": 530 + }, + { + "epoch": 0.2128, + "learning_rate": 8.705121177901537e-06, + "loss": 0.0009, + "step": 532 + }, + { + "epoch": 0.2136, + "learning_rate": 8.732816356099459e-06, + "loss": 0.0011, + "step": 534 + }, + { + "epoch": 0.2144, + "learning_rate": 8.760521416049986e-06, + "loss": 0.1258, + "step": 536 + }, + { + "epoch": 0.2152, + "learning_rate": 8.788236141703477e-06, + "loss": 0.0101, + "step": 538 + }, + { + "epoch": 0.216, + "learning_rate": 8.81596031693499e-06, + "loss": 0.0406, + "step": 540 + }, + { + "epoch": 0.2168, + "learning_rate": 8.84369372554578e-06, + "loss": 0.0734, + "step": 542 + }, + { + "epoch": 0.2176, + "learning_rate": 8.87143615126518e-06, + "loss": 0.0003, + "step": 544 + }, + { + "epoch": 0.2184, + "learning_rate": 8.899187377752173e-06, + "loss": 0.0004, + "step": 546 + }, + { + "epoch": 0.2192, + "learning_rate": 8.926947188597127e-06, + "loss": 0.0299, + "step": 548 + }, + { + "epoch": 0.22, + "learning_rate": 8.954715367323473e-06, + "loss": 0.122, + "step": 550 + }, + { + "epoch": 0.2208, + "learning_rate": 8.982491697389344e-06, + "loss": 0.0006, + "step": 552 + }, + { + "epoch": 0.2216, + "learning_rate": 9.010275962189356e-06, + "loss": 0.0203, + "step": 554 + }, + { + "epoch": 0.2224, + "learning_rate": 9.03806794505621e-06, + "loss": 0.0011, + "step": 556 + }, + { + "epoch": 0.2232, + "learning_rate": 9.065867429262497e-06, + "loss": 0.4826, + "step": 558 + }, + { + "epoch": 0.224, + "learning_rate": 9.093674198022198e-06, + "loss": 0.0016, + "step": 560 + }, + { + "epoch": 0.2248, + "learning_rate": 9.121488034492567e-06, + "loss": 0.4759, + "step": 562 + }, + { + "epoch": 0.2256, + "learning_rate": 9.149308721775717e-06, + "loss": 0.0429, + "step": 564 + }, + { + "epoch": 0.2264, + "learning_rate": 9.177136042920338e-06, + "loss": 0.0005, + "step": 566 + }, + { + "epoch": 0.2272, + "learning_rate": 9.204969780923396e-06, + "loss": 0.0174, + "step": 568 + }, + { + "epoch": 0.228, + "learning_rate": 9.232809718731822e-06, + "loss": 1.223, + "step": 570 + }, + { + "epoch": 0.2288, + "learning_rate": 9.26065563924414e-06, + "loss": 0.0002, + "step": 572 + }, + { + "epoch": 0.2296, + "learning_rate": 9.288507325312319e-06, + "loss": 0.0051, + "step": 574 + }, + { + "epoch": 0.2304, + "learning_rate": 9.316364559743298e-06, + "loss": 0.0003, + "step": 576 + }, + { + "epoch": 0.2312, + "learning_rate": 9.344227125300788e-06, + "loss": 0.0137, + "step": 578 + }, + { + "epoch": 0.232, + "learning_rate": 9.372094804706867e-06, + "loss": 0.0003, + "step": 580 + }, + { + "epoch": 0.2328, + "learning_rate": 9.39996738064379e-06, + "loss": 0.7711, + "step": 582 + }, + { + "epoch": 0.2336, + "learning_rate": 9.427844635755615e-06, + "loss": 0.0449, + "step": 584 + }, + { + "epoch": 0.2344, + "learning_rate": 9.455726352649904e-06, + "loss": 0.825, + "step": 586 + }, + { + "epoch": 0.2352, + "learning_rate": 9.483612313899446e-06, + "loss": 0.0013, + "step": 588 + }, + { + "epoch": 0.236, + "learning_rate": 9.511502302043859e-06, + "loss": 0.156, + "step": 590 + }, + { + "epoch": 0.2368, + "learning_rate": 9.539396099591469e-06, + "loss": 0.512, + "step": 592 + }, + { + "epoch": 0.2376, + "learning_rate": 9.567293489020816e-06, + "loss": 0.0858, + "step": 594 + }, + { + "epoch": 0.2384, + "learning_rate": 9.595194252782461e-06, + "loss": 0.1478, + "step": 596 + }, + { + "epoch": 0.2392, + "learning_rate": 9.623098173300656e-06, + "loss": 0.022, + "step": 598 + }, + { + "epoch": 0.24, + "learning_rate": 9.651005032974991e-06, + "loss": 0.0778, + "step": 600 + }, + { + "epoch": 0.2408, + "learning_rate": 9.678914614182184e-06, + "loss": 0.0113, + "step": 602 + }, + { + "epoch": 0.2416, + "learning_rate": 9.706826699277714e-06, + "loss": 0.2379, + "step": 604 + }, + { + "epoch": 0.2424, + "learning_rate": 9.734741070597535e-06, + "loss": 0.0122, + "step": 606 + }, + { + "epoch": 0.2432, + "learning_rate": 9.762657510459774e-06, + "loss": 0.0637, + "step": 608 + }, + { + "epoch": 0.244, + "learning_rate": 9.790575801166422e-06, + "loss": 0.0143, + "step": 610 + }, + { + "epoch": 0.2448, + "learning_rate": 9.818495725005043e-06, + "loss": 0.2477, + "step": 612 + }, + { + "epoch": 0.2456, + "learning_rate": 9.846417064250459e-06, + "loss": 1.1105, + "step": 614 + }, + { + "epoch": 0.2464, + "learning_rate": 9.874339601166479e-06, + "loss": 0.0052, + "step": 616 + }, + { + "epoch": 0.2472, + "learning_rate": 9.902263118007513e-06, + "loss": 0.0181, + "step": 618 + }, + { + "epoch": 0.248, + "learning_rate": 9.930187397020385e-06, + "loss": 0.0374, + "step": 620 + }, + { + "epoch": 0.2488, + "learning_rate": 9.95811222044596e-06, + "loss": 0.089, + "step": 622 + }, + { + "epoch": 0.2496, + "learning_rate": 9.986037370520855e-06, + "loss": 0.0948, + "step": 624 + }, + { + "epoch": 0.2504, + "learning_rate": 1.0013962629479139e-05, + "loss": 0.0015, + "step": 626 + }, + { + "epoch": 0.2512, + "learning_rate": 1.0041887779554034e-05, + "loss": 0.1123, + "step": 628 + }, + { + "epoch": 0.252, + "learning_rate": 1.0069812602979607e-05, + "loss": 0.0043, + "step": 630 + }, + { + "epoch": 0.2528, + "learning_rate": 1.0097736881992482e-05, + "loss": 0.0043, + "step": 632 + }, + { + "epoch": 0.2536, + "learning_rate": 1.0125660398833514e-05, + "loss": 0.0142, + "step": 634 + }, + { + "epoch": 0.2544, + "learning_rate": 1.0153582935749533e-05, + "loss": 0.0108, + "step": 636 + }, + { + "epoch": 0.2552, + "learning_rate": 1.0181504274994952e-05, + "loss": 0.1254, + "step": 638 + }, + { + "epoch": 0.256, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.083, + "step": 640 + }, + { + "epoch": 0.2568, + "learning_rate": 1.0237342489540218e-05, + "loss": 0.179, + "step": 642 + }, + { + "epoch": 0.2576, + "learning_rate": 1.0265258929402458e-05, + "loss": 0.0012, + "step": 644 + }, + { + "epoch": 0.2584, + "learning_rate": 1.029317330072228e-05, + "loss": 0.0353, + "step": 646 + }, + { + "epoch": 0.2592, + "learning_rate": 1.0321085385817811e-05, + "loss": 0.1669, + "step": 648 + }, + { + "epoch": 0.26, + "learning_rate": 1.0348994967025004e-05, + "loss": 0.0032, + "step": 650 + }, + { + "epoch": 0.2608, + "learning_rate": 1.0376901826699337e-05, + "loss": 0.01, + "step": 652 + }, + { + "epoch": 0.2616, + "learning_rate": 1.0404805747217532e-05, + "loss": 0.0292, + "step": 654 + }, + { + "epoch": 0.2624, + "learning_rate": 1.0432706510979175e-05, + "loss": 0.0006, + "step": 656 + }, + { + "epoch": 0.2632, + "learning_rate": 1.0460603900408526e-05, + "loss": 0.0018, + "step": 658 + }, + { + "epoch": 0.264, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.0007, + "step": 660 + }, + { + "epoch": 0.2648, + "learning_rate": 1.0516387686100549e-05, + "loss": 0.0087, + "step": 662 + }, + { + "epoch": 0.2656, + "learning_rate": 1.054427364735009e-05, + "loss": 0.058, + "step": 664 + }, + { + "epoch": 0.2664, + "learning_rate": 1.0572155364244378e-05, + "loss": 0.0566, + "step": 666 + }, + { + "epoch": 0.2672, + "learning_rate": 1.0600032619356203e-05, + "loss": 0.0087, + "step": 668 + }, + { + "epoch": 0.268, + "learning_rate": 1.0627905195293127e-05, + "loss": 0.0131, + "step": 670 + }, + { + "epoch": 0.2688, + "learning_rate": 1.0655772874699206e-05, + "loss": 0.0989, + "step": 672 + }, + { + "epoch": 0.2696, + "learning_rate": 1.0683635440256694e-05, + "loss": 0.4608, + "step": 674 + }, + { + "epoch": 0.2704, + "learning_rate": 1.0711492674687674e-05, + "loss": 0.0039, + "step": 676 + }, + { + "epoch": 0.2712, + "learning_rate": 1.0739344360755855e-05, + "loss": 0.0584, + "step": 678 + }, + { + "epoch": 0.272, + "learning_rate": 1.0767190281268171e-05, + "loss": 0.0007, + "step": 680 + }, + { + "epoch": 0.2728, + "learning_rate": 1.07950302190766e-05, + "loss": 0.0037, + "step": 682 + }, + { + "epoch": 0.2736, + "learning_rate": 1.0822863957079654e-05, + "loss": 0.0758, + "step": 684 + }, + { + "epoch": 0.2744, + "learning_rate": 1.0850691278224277e-05, + "loss": 0.0008, + "step": 686 + }, + { + "epoch": 0.2752, + "learning_rate": 1.0878511965507428e-05, + "loss": 0.2452, + "step": 688 + }, + { + "epoch": 0.276, + "learning_rate": 1.0906325801977795e-05, + "loss": 0.0012, + "step": 690 + }, + { + "epoch": 0.2768, + "learning_rate": 1.0934132570737497e-05, + "loss": 0.066, + "step": 692 + }, + { + "epoch": 0.2776, + "learning_rate": 1.0961932054943785e-05, + "loss": 0.002, + "step": 694 + }, + { + "epoch": 0.2784, + "learning_rate": 1.098972403781064e-05, + "loss": 0.022, + "step": 696 + }, + { + "epoch": 0.2792, + "learning_rate": 1.101750830261065e-05, + "loss": 0.0027, + "step": 698 + }, + { + "epoch": 0.28, + "learning_rate": 1.104528463267652e-05, + "loss": 0.0079, + "step": 700 + }, + { + "epoch": 0.2808, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.0002, + "step": 702 + }, + { + "epoch": 0.2816, + "learning_rate": 1.1100812622247821e-05, + "loss": 1.9113, + "step": 704 + }, + { + "epoch": 0.2824, + "learning_rate": 1.1128563848734815e-05, + "loss": 0.0007, + "step": 706 + }, + { + "epoch": 0.2832, + "learning_rate": 1.1156306274454211e-05, + "loss": 0.0143, + "step": 708 + }, + { + "epoch": 0.284, + "learning_rate": 1.1184039683065002e-05, + "loss": 0.0064, + "step": 710 + }, + { + "epoch": 0.2848, + "learning_rate": 1.1211763858296516e-05, + "loss": 0.7138, + "step": 712 + }, + { + "epoch": 0.2856, + "learning_rate": 1.1239478583950007e-05, + "loss": 0.2771, + "step": 714 + }, + { + "epoch": 0.2864, + "learning_rate": 1.1267183643900534e-05, + "loss": 0.0005, + "step": 716 + }, + { + "epoch": 0.2872, + "learning_rate": 1.1294878822098456e-05, + "loss": 0.0612, + "step": 718 + }, + { + "epoch": 0.288, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.0012, + "step": 720 + }, + { + "epoch": 0.2888, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.0094, + "step": 722 + }, + { + "epoch": 0.2896, + "learning_rate": 1.137790290684638e-05, + "loss": 0.3117, + "step": 724 + }, + { + "epoch": 0.2904, + "learning_rate": 1.1405556399104108e-05, + "loss": 0.0388, + "step": 726 + }, + { + "epoch": 0.2912, + "learning_rate": 1.143319893055069e-05, + "loss": 0.0009, + "step": 728 + }, + { + "epoch": 0.292, + "learning_rate": 1.1460830285624112e-05, + "loss": 0.0194, + "step": 730 + }, + { + "epoch": 0.2928, + "learning_rate": 1.1488450248849515e-05, + "loss": 0.0013, + "step": 732 + }, + { + "epoch": 0.2936, + "learning_rate": 1.1516058604840881e-05, + "loss": 0.0026, + "step": 734 + }, + { + "epoch": 0.2944, + "learning_rate": 1.15436551383027e-05, + "loss": 0.1476, + "step": 736 + }, + { + "epoch": 0.2952, + "learning_rate": 1.1571239634031666e-05, + "loss": 0.0232, + "step": 738 + }, + { + "epoch": 0.296, + "learning_rate": 1.1598811876918352e-05, + "loss": 0.0117, + "step": 740 + }, + { + "epoch": 0.2968, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.0124, + "step": 742 + }, + { + "epoch": 0.2976, + "learning_rate": 1.1653918744206478e-05, + "loss": 0.0187, + "step": 744 + }, + { + "epoch": 0.2984, + "learning_rate": 1.1681452938873515e-05, + "loss": 0.0075, + "step": 746 + }, + { + "epoch": 0.2992, + "learning_rate": 1.1708974021232763e-05, + "loss": 0.0358, + "step": 748 + }, + { + "epoch": 0.3, + "learning_rate": 1.1736481776669297e-05, + "loss": 0.0565, + "step": 750 + }, + { + "epoch": 0.3008, + "learning_rate": 1.1763975990672116e-05, + "loss": 0.4557, + "step": 752 + }, + { + "epoch": 0.3016, + "learning_rate": 1.1791456448835815e-05, + "loss": 0.0485, + "step": 754 + }, + { + "epoch": 0.3024, + "learning_rate": 1.1818922936862258e-05, + "loss": 0.3115, + "step": 756 + }, + { + "epoch": 0.3032, + "learning_rate": 1.1846375240562274e-05, + "loss": 0.0016, + "step": 758 + }, + { + "epoch": 0.304, + "learning_rate": 1.187381314585725e-05, + "loss": 0.0022, + "step": 760 + }, + { + "epoch": 0.3048, + "learning_rate": 1.1901236438780906e-05, + "loss": 0.0065, + "step": 762 + }, + { + "epoch": 0.3056, + "learning_rate": 1.192864490548089e-05, + "loss": 0.0273, + "step": 764 + }, + { + "epoch": 0.3064, + "learning_rate": 1.195603833222048e-05, + "loss": 0.0972, + "step": 766 + }, + { + "epoch": 0.3072, + "learning_rate": 1.198341650538023e-05, + "loss": 0.0071, + "step": 768 + }, + { + "epoch": 0.308, + "learning_rate": 1.2010779211459642e-05, + "loss": 0.0115, + "step": 770 + }, + { + "epoch": 0.3088, + "learning_rate": 1.203812623707884e-05, + "loss": 0.2521, + "step": 772 + }, + { + "epoch": 0.3096, + "learning_rate": 1.2065457368980227e-05, + "loss": 0.0017, + "step": 774 + }, + { + "epoch": 0.3104, + "learning_rate": 1.2092772394030141e-05, + "loss": 0.0025, + "step": 776 + }, + { + "epoch": 0.3112, + "learning_rate": 1.2120071099220552e-05, + "loss": 0.0036, + "step": 778 + }, + { + "epoch": 0.312, + "learning_rate": 1.2147353271670637e-05, + "loss": 1.2653, + "step": 780 + }, + { + "epoch": 0.3128, + "learning_rate": 1.217461869862855e-05, + "loss": 0.0179, + "step": 782 + }, + { + "epoch": 0.3136, + "learning_rate": 1.2201867167473015e-05, + "loss": 0.2921, + "step": 784 + }, + { + "epoch": 0.3144, + "learning_rate": 1.2229098465715002e-05, + "loss": 0.0273, + "step": 786 + }, + { + "epoch": 0.3152, + "learning_rate": 1.2256312380999373e-05, + "loss": 0.0678, + "step": 788 + }, + { + "epoch": 0.316, + "learning_rate": 1.2283508701106552e-05, + "loss": 0.227, + "step": 790 + }, + { + "epoch": 0.3168, + "learning_rate": 1.2310687213954173e-05, + "loss": 0.0005, + "step": 792 + }, + { + "epoch": 0.3176, + "learning_rate": 1.233784770759873e-05, + "loss": 0.1092, + "step": 794 + }, + { + "epoch": 0.3184, + "learning_rate": 1.2364989970237238e-05, + "loss": 0.0441, + "step": 796 + }, + { + "epoch": 0.3192, + "learning_rate": 1.23921137902089e-05, + "loss": 0.2983, + "step": 798 + }, + { + "epoch": 0.32, + "learning_rate": 1.241921895599668e-05, + "loss": 0.1579, + "step": 800 + }, + { + "epoch": 0.3208, + "learning_rate": 1.2446305256229076e-05, + "loss": 0.0502, + "step": 802 + }, + { + "epoch": 0.3216, + "learning_rate": 1.2473372479681653e-05, + "loss": 0.0675, + "step": 804 + }, + { + "epoch": 0.3224, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.6894, + "step": 806 + }, + { + "epoch": 0.3232, + "learning_rate": 1.2527448852095292e-05, + "loss": 0.1998, + "step": 808 + }, + { + "epoch": 0.324, + "learning_rate": 1.2554457579357902e-05, + "loss": 0.0055, + "step": 810 + }, + { + "epoch": 0.3248, + "learning_rate": 1.2581446386447171e-05, + "loss": 0.9854, + "step": 812 + }, + { + "epoch": 0.3256, + "learning_rate": 1.2608415062898963e-05, + "loss": 0.0063, + "step": 814 + }, + { + "epoch": 0.3264, + "learning_rate": 1.2635363398406133e-05, + "loss": 0.2309, + "step": 816 + }, + { + "epoch": 0.3272, + "learning_rate": 1.266229118282012e-05, + "loss": 0.1528, + "step": 818 + }, + { + "epoch": 0.328, + "learning_rate": 1.2689198206152644e-05, + "loss": 0.0011, + "step": 820 + }, + { + "epoch": 0.3288, + "learning_rate": 1.2716084258577373e-05, + "loss": 0.0162, + "step": 822 + }, + { + "epoch": 0.3296, + "learning_rate": 1.2742949130431468e-05, + "loss": 0.0097, + "step": 824 + }, + { + "epoch": 0.3304, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.2707, + "step": 826 + }, + { + "epoch": 0.3312, + "learning_rate": 1.2796614494603795e-05, + "loss": 0.2288, + "step": 828 + }, + { + "epoch": 0.332, + "learning_rate": 1.282341456842876e-05, + "loss": 0.0007, + "step": 830 + }, + { + "epoch": 0.3328, + "learning_rate": 1.2850192624699756e-05, + "loss": 0.0066, + "step": 832 + }, + { + "epoch": 0.3336, + "learning_rate": 1.2876948454596122e-05, + "loss": 0.0135, + "step": 834 + }, + { + "epoch": 0.3344, + "learning_rate": 1.2903681849470535e-05, + "loss": 0.2187, + "step": 836 + }, + { + "epoch": 0.3352, + "learning_rate": 1.2930392600850565e-05, + "loss": 0.0007, + "step": 838 + }, + { + "epoch": 0.336, + "learning_rate": 1.2957080500440455e-05, + "loss": 0.0367, + "step": 840 + }, + { + "epoch": 0.3368, + "learning_rate": 1.2983745340122589e-05, + "loss": 0.604, + "step": 842 + }, + { + "epoch": 0.3376, + "learning_rate": 1.3010386911959205e-05, + "loss": 0.1659, + "step": 844 + }, + { + "epoch": 0.3384, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.4783, + "step": 846 + }, + { + "epoch": 0.3392, + "learning_rate": 1.3063599421253556e-05, + "loss": 0.0374, + "step": 848 + }, + { + "epoch": 0.34, + "learning_rate": 1.309016994374947e-05, + "loss": 0.0644, + "step": 850 + }, + { + "epoch": 0.3408, + "learning_rate": 1.3116716368479415e-05, + "loss": 0.0113, + "step": 852 + }, + { + "epoch": 0.3416, + "learning_rate": 1.3143238488429049e-05, + "loss": 0.1769, + "step": 854 + }, + { + "epoch": 0.3424, + "learning_rate": 1.316973609677351e-05, + "loss": 0.006, + "step": 856 + }, + { + "epoch": 0.3432, + "learning_rate": 1.319620898687917e-05, + "loss": 0.0015, + "step": 858 + }, + { + "epoch": 0.344, + "learning_rate": 1.32226569523051e-05, + "loss": 0.0007, + "step": 860 + }, + { + "epoch": 0.3448, + "learning_rate": 1.324907978680475e-05, + "loss": 0.0015, + "step": 862 + }, + { + "epoch": 0.3456, + "learning_rate": 1.3275477284327572e-05, + "loss": 0.0753, + "step": 864 + }, + { + "epoch": 0.3464, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.029, + "step": 866 + }, + { + "epoch": 0.3472, + "learning_rate": 1.3328195445229865e-05, + "loss": 0.0157, + "step": 868 + }, + { + "epoch": 0.348, + "learning_rate": 1.3354515697502548e-05, + "loss": 0.0014, + "step": 870 + }, + { + "epoch": 0.3488, + "learning_rate": 1.338080979058797e-05, + "loss": 0.1665, + "step": 872 + }, + { + "epoch": 0.3496, + "learning_rate": 1.340707751943951e-05, + "loss": 0.0365, + "step": 874 + }, + { + "epoch": 0.3504, + "learning_rate": 1.3433318679216145e-05, + "loss": 0.3169, + "step": 876 + }, + { + "epoch": 0.3512, + "learning_rate": 1.3459533065284039e-05, + "loss": 0.0014, + "step": 878 + }, + { + "epoch": 0.352, + "learning_rate": 1.348572047321814e-05, + "loss": 0.0664, + "step": 880 + }, + { + "epoch": 0.3528, + "learning_rate": 1.3511880698803803e-05, + "loss": 0.1472, + "step": 882 + }, + { + "epoch": 0.3536, + "learning_rate": 1.3538013538038296e-05, + "loss": 0.0087, + "step": 884 + }, + { + "epoch": 0.3544, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.0016, + "step": 886 + }, + { + "epoch": 0.3552, + "learning_rate": 1.3590196242512461e-05, + "loss": 0.0363, + "step": 888 + }, + { + "epoch": 0.356, + "learning_rate": 1.361624570082092e-05, + "loss": 0.0002, + "step": 890 + }, + { + "epoch": 0.3568, + "learning_rate": 1.364226695891898e-05, + "loss": 0.0014, + "step": 892 + }, + { + "epoch": 0.3576, + "learning_rate": 1.3668259813887637e-05, + "loss": 0.0156, + "step": 894 + }, + { + "epoch": 0.3584, + "learning_rate": 1.3694224063029386e-05, + "loss": 0.003, + "step": 896 + }, + { + "epoch": 0.3592, + "learning_rate": 1.3720159503869806e-05, + "loss": 0.003, + "step": 898 + }, + { + "epoch": 0.36, + "learning_rate": 1.374606593415911e-05, + "loss": 0.0041, + "step": 900 + }, + { + "epoch": 0.3608, + "learning_rate": 1.377194315187377e-05, + "loss": 0.0173, + "step": 902 + }, + { + "epoch": 0.3616, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.0012, + "step": 904 + }, + { + "epoch": 0.3624, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.004, + "step": 906 + }, + { + "epoch": 0.3632, + "learning_rate": 1.3849397512760793e-05, + "loss": 0.6563, + "step": 908 + }, + { + "epoch": 0.364, + "learning_rate": 1.3875155864521027e-05, + "loss": 0.0011, + "step": 910 + }, + { + "epoch": 0.3648, + "learning_rate": 1.3900883997037393e-05, + "loss": 0.1612, + "step": 912 + }, + { + "epoch": 0.3656, + "learning_rate": 1.3926581709676746e-05, + "loss": 0.0024, + "step": 914 + }, + { + "epoch": 0.3664, + "learning_rate": 1.3952248802043158e-05, + "loss": 0.4061, + "step": 916 + }, + { + "epoch": 0.3672, + "learning_rate": 1.397788507397949e-05, + "loss": 0.0048, + "step": 918 + }, + { + "epoch": 0.368, + "learning_rate": 1.4003490325568956e-05, + "loss": 0.0213, + "step": 920 + }, + { + "epoch": 0.3688, + "learning_rate": 1.4029064357136632e-05, + "loss": 0.0237, + "step": 922 + }, + { + "epoch": 0.3696, + "learning_rate": 1.4054606969251096e-05, + "loss": 0.4146, + "step": 924 + }, + { + "epoch": 0.3704, + "learning_rate": 1.4080117962725929e-05, + "loss": 0.0252, + "step": 926 + }, + { + "epoch": 0.3712, + "learning_rate": 1.4105597138621281e-05, + "loss": 0.0316, + "step": 928 + }, + { + "epoch": 0.372, + "learning_rate": 1.4131044298245416e-05, + "loss": 0.0291, + "step": 930 + }, + { + "epoch": 0.3728, + "learning_rate": 1.4156459243156275e-05, + "loss": 0.7741, + "step": 932 + }, + { + "epoch": 0.3736, + "learning_rate": 1.418184177516301e-05, + "loss": 0.0015, + "step": 934 + }, + { + "epoch": 0.3744, + "learning_rate": 1.420719169632754e-05, + "loss": 0.0003, + "step": 936 + }, + { + "epoch": 0.3752, + "learning_rate": 1.4232508808966085e-05, + "loss": 0.0707, + "step": 938 + }, + { + "epoch": 0.376, + "learning_rate": 1.4257792915650735e-05, + "loss": 0.0455, + "step": 940 + }, + { + "epoch": 0.3768, + "learning_rate": 1.4283043819210906e-05, + "loss": 0.0389, + "step": 942 + }, + { + "epoch": 0.3776, + "learning_rate": 1.430826132273499e-05, + "loss": 0.1369, + "step": 944 + }, + { + "epoch": 0.3784, + "learning_rate": 1.4333445229571857e-05, + "loss": 0.1176, + "step": 946 + }, + { + "epoch": 0.3792, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.2449, + "step": 948 + }, + { + "epoch": 0.38, + "learning_rate": 1.4383711467890772e-05, + "loss": 0.8199, + "step": 950 + }, + { + "epoch": 0.3808, + "learning_rate": 1.4408793407386584e-05, + "loss": 0.0006, + "step": 952 + }, + { + "epoch": 0.3816, + "learning_rate": 1.4433840966225767e-05, + "loss": 0.0957, + "step": 954 + }, + { + "epoch": 0.3824, + "learning_rate": 1.4458853949082434e-05, + "loss": 0.0109, + "step": 956 + }, + { + "epoch": 0.3832, + "learning_rate": 1.4483832160900332e-05, + "loss": 0.864, + "step": 958 + }, + { + "epoch": 0.384, + "learning_rate": 1.4508775406894315e-05, + "loss": 0.1186, + "step": 960 + }, + { + "epoch": 0.3848, + "learning_rate": 1.4533683492551942e-05, + "loss": 0.0543, + "step": 962 + }, + { + "epoch": 0.3856, + "learning_rate": 1.4558556223634988e-05, + "loss": 0.0087, + "step": 964 + }, + { + "epoch": 0.3864, + "learning_rate": 1.4583393406180886e-05, + "loss": 0.8009, + "step": 966 + }, + { + "epoch": 0.3872, + "learning_rate": 1.460819484650431e-05, + "loss": 0.0273, + "step": 968 + }, + { + "epoch": 0.388, + "learning_rate": 1.4632960351198618e-05, + "loss": 0.3166, + "step": 970 + }, + { + "epoch": 0.3888, + "learning_rate": 1.4657689727137441e-05, + "loss": 0.0277, + "step": 972 + }, + { + "epoch": 0.3896, + "learning_rate": 1.468238278147614e-05, + "loss": 1.1738, + "step": 974 + }, + { + "epoch": 0.3904, + "learning_rate": 1.470703932165332e-05, + "loss": 0.0129, + "step": 976 + }, + { + "epoch": 0.3912, + "learning_rate": 1.4731659155392339e-05, + "loss": 0.0066, + "step": 978 + }, + { + "epoch": 0.392, + "learning_rate": 1.4756242090702744e-05, + "loss": 0.0411, + "step": 980 + }, + { + "epoch": 0.3928, + "learning_rate": 1.4780787935881913e-05, + "loss": 0.0114, + "step": 982 + }, + { + "epoch": 0.3936, + "learning_rate": 1.4805296499516397e-05, + "loss": 0.8405, + "step": 984 + }, + { + "epoch": 0.3944, + "learning_rate": 1.482976759048351e-05, + "loss": 0.0832, + "step": 986 + }, + { + "epoch": 0.3952, + "learning_rate": 1.485420101795274e-05, + "loss": 0.189, + "step": 988 + }, + { + "epoch": 0.396, + "learning_rate": 1.4878596591387327e-05, + "loss": 0.1741, + "step": 990 + }, + { + "epoch": 0.3968, + "learning_rate": 1.4902954120545686e-05, + "loss": 0.0904, + "step": 992 + }, + { + "epoch": 0.3976, + "learning_rate": 1.4927273415482913e-05, + "loss": 0.2758, + "step": 994 + }, + { + "epoch": 0.3984, + "learning_rate": 1.4951554286552261e-05, + "loss": 0.1394, + "step": 996 + }, + { + "epoch": 0.3992, + "learning_rate": 1.4975796544406617e-05, + "loss": 0.0227, + "step": 998 + }, + { + "epoch": 0.4, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.0007, + "step": 1000 + }, + { + "epoch": 0.4008, + "learning_rate": 1.502416446458897e-05, + "loss": 0.6995, + "step": 1002 + }, + { + "epoch": 0.4016, + "learning_rate": 1.5048289749734206e-05, + "loss": 0.1991, + "step": 1004 + }, + { + "epoch": 0.4024, + "learning_rate": 1.5072375667301895e-05, + "loss": 0.8636, + "step": 1006 + }, + { + "epoch": 0.4032, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.0666, + "step": 1008 + }, + { + "epoch": 0.404, + "learning_rate": 1.5120428648705714e-05, + "loss": 0.2682, + "step": 1010 + }, + { + "epoch": 0.4048, + "learning_rate": 1.5144395337815064e-05, + "loss": 0.2553, + "step": 1012 + }, + { + "epoch": 0.4056, + "learning_rate": 1.5168321909896166e-05, + "loss": 0.1212, + "step": 1014 + }, + { + "epoch": 0.4064, + "learning_rate": 1.5192208178364808e-05, + "loss": 0.1705, + "step": 1016 + }, + { + "epoch": 0.4072, + "learning_rate": 1.521605395695107e-05, + "loss": 0.0247, + "step": 1018 + }, + { + "epoch": 0.408, + "learning_rate": 1.5239859059700784e-05, + "loss": 0.3914, + "step": 1020 + }, + { + "epoch": 0.4088, + "learning_rate": 1.526362330097697e-05, + "loss": 0.0678, + "step": 1022 + }, + { + "epoch": 0.4096, + "learning_rate": 1.5287346495461322e-05, + "loss": 0.0016, + "step": 1024 + }, + { + "epoch": 0.4104, + "learning_rate": 1.531102845815557e-05, + "loss": 0.0028, + "step": 1026 + }, + { + "epoch": 0.4112, + "learning_rate": 1.5334669004383025e-05, + "loss": 0.0065, + "step": 1028 + }, + { + "epoch": 0.412, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.0158, + "step": 1030 + }, + { + "epoch": 0.4128, + "learning_rate": 1.5381825110347072e-05, + "loss": 0.0158, + "step": 1032 + }, + { + "epoch": 0.4136, + "learning_rate": 1.540534030235087e-05, + "loss": 0.4769, + "step": 1034 + }, + { + "epoch": 0.4144, + "learning_rate": 1.542881334242517e-05, + "loss": 0.024, + "step": 1036 + }, + { + "epoch": 0.4152, + "learning_rate": 1.5452244047522493e-05, + "loss": 0.211, + "step": 1038 + }, + { + "epoch": 0.416, + "learning_rate": 1.5475632234925495e-05, + "loss": 0.5014, + "step": 1040 + }, + { + "epoch": 0.4168, + "learning_rate": 1.5498977722248388e-05, + "loss": 0.7562, + "step": 1042 + }, + { + "epoch": 0.4176, + "learning_rate": 1.552228032743839e-05, + "loss": 0.0957, + "step": 1044 + }, + { + "epoch": 0.4184, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.9688, + "step": 1046 + }, + { + "epoch": 0.4192, + "learning_rate": 1.556875616488188e-05, + "loss": 0.1644, + "step": 1048 + }, + { + "epoch": 0.42, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.0653, + "step": 1050 + }, + { + "epoch": 0.4208, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.7905, + "step": 1052 + }, + { + "epoch": 0.4216, + "learning_rate": 1.5638143773034268e-05, + "loss": 0.1003, + "step": 1054 + }, + { + "epoch": 0.4224, + "learning_rate": 1.5661185281143663e-05, + "loss": 0.0503, + "step": 1056 + }, + { + "epoch": 0.4232, + "learning_rate": 1.5684182642193024e-05, + "loss": 0.0621, + "step": 1058 + }, + { + "epoch": 0.424, + "learning_rate": 1.5707135676844312e-05, + "loss": 0.0158, + "step": 1060 + }, + { + "epoch": 0.4248, + "learning_rate": 1.5730044206105146e-05, + "loss": 0.0271, + "step": 1062 + }, + { + "epoch": 0.4256, + "learning_rate": 1.5752908051330232e-05, + "loss": 0.0148, + "step": 1064 + }, + { + "epoch": 0.4264, + "learning_rate": 1.577572703422268e-05, + "loss": 0.6018, + "step": 1066 + }, + { + "epoch": 0.4272, + "learning_rate": 1.579850097683548e-05, + "loss": 0.0818, + "step": 1068 + }, + { + "epoch": 0.428, + "learning_rate": 1.582122970157288e-05, + "loss": 0.1039, + "step": 1070 + }, + { + "epoch": 0.4288, + "learning_rate": 1.5843913031191722e-05, + "loss": 0.0046, + "step": 1072 + }, + { + "epoch": 0.4296, + "learning_rate": 1.586655078880281e-05, + "loss": 0.0247, + "step": 1074 + }, + { + "epoch": 0.4304, + "learning_rate": 1.5889142797872383e-05, + "loss": 0.0405, + "step": 1076 + }, + { + "epoch": 0.4312, + "learning_rate": 1.5911688882223415e-05, + "loss": 0.1715, + "step": 1078 + }, + { + "epoch": 0.432, + "learning_rate": 1.5934188866037007e-05, + "loss": 0.0853, + "step": 1080 + }, + { + "epoch": 0.4328, + "learning_rate": 1.5956642573853787e-05, + "loss": 0.1593, + "step": 1082 + }, + { + "epoch": 0.4336, + "learning_rate": 1.5979049830575193e-05, + "loss": 0.0041, + "step": 1084 + }, + { + "epoch": 0.4344, + "learning_rate": 1.6001410461464945e-05, + "loss": 0.0077, + "step": 1086 + }, + { + "epoch": 0.4352, + "learning_rate": 1.6023724292150377e-05, + "loss": 0.0052, + "step": 1088 + }, + { + "epoch": 0.436, + "learning_rate": 1.604599114862375e-05, + "loss": 0.0109, + "step": 1090 + }, + { + "epoch": 0.4368, + "learning_rate": 1.606821085724362e-05, + "loss": 0.1346, + "step": 1092 + }, + { + "epoch": 0.4376, + "learning_rate": 1.6090383244736253e-05, + "loss": 0.0207, + "step": 1094 + }, + { + "epoch": 0.4384, + "learning_rate": 1.6112508138196912e-05, + "loss": 0.0765, + "step": 1096 + }, + { + "epoch": 0.4392, + "learning_rate": 1.613458536509124e-05, + "loss": 0.0292, + "step": 1098 + }, + { + "epoch": 0.44, + "learning_rate": 1.615661475325658e-05, + "loss": 0.0049, + "step": 1100 + }, + { + "epoch": 0.4408, + "learning_rate": 1.6178596130903352e-05, + "loss": 0.0016, + "step": 1102 + }, + { + "epoch": 0.4416, + "learning_rate": 1.620052932661632e-05, + "loss": 0.0067, + "step": 1104 + }, + { + "epoch": 0.4424, + "learning_rate": 1.6222414169356056e-05, + "loss": 0.0091, + "step": 1106 + }, + { + "epoch": 0.4432, + "learning_rate": 1.6244250488460146e-05, + "loss": 0.0003, + "step": 1108 + }, + { + "epoch": 0.444, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.0207, + "step": 1110 + }, + { + "epoch": 0.4448, + "learning_rate": 1.6287776875005127e-05, + "loss": 0.0036, + "step": 1112 + }, + { + "epoch": 0.4456, + "learning_rate": 1.6309466603018497e-05, + "loss": 1.3406, + "step": 1114 + }, + { + "epoch": 0.4464, + "learning_rate": 1.6331107128543856e-05, + "loss": 0.107, + "step": 1116 + }, + { + "epoch": 0.4472, + "learning_rate": 1.635269828282404e-05, + "loss": 0.6438, + "step": 1118 + }, + { + "epoch": 0.448, + "learning_rate": 1.6374239897486905e-05, + "loss": 0.2638, + "step": 1120 + }, + { + "epoch": 0.4488, + "learning_rate": 1.6395731804546575e-05, + "loss": 0.4416, + "step": 1122 + }, + { + "epoch": 0.4496, + "learning_rate": 1.6417173836404878e-05, + "loss": 0.0075, + "step": 1124 + }, + { + "epoch": 0.4504, + "learning_rate": 1.643856582585253e-05, + "loss": 0.0039, + "step": 1126 + }, + { + "epoch": 0.4512, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.1285, + "step": 1128 + }, + { + "epoch": 0.452, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.0005, + "step": 1130 + }, + { + "epoch": 0.4528, + "learning_rate": 1.650243987350029e-05, + "loss": 0.0033, + "step": 1132 + }, + { + "epoch": 0.4536, + "learning_rate": 1.652363002903693e-05, + "loss": 0.0251, + "step": 1134 + }, + { + "epoch": 0.4544, + "learning_rate": 1.6544769311996146e-05, + "loss": 0.0014, + "step": 1136 + }, + { + "epoch": 0.4552, + "learning_rate": 1.656585755752956e-05, + "loss": 0.0683, + "step": 1138 + }, + { + "epoch": 0.456, + "learning_rate": 1.65868946011868e-05, + "loss": 0.0005, + "step": 1140 + }, + { + "epoch": 0.4568, + "learning_rate": 1.660788027891677e-05, + "loss": 0.0002, + "step": 1142 + }, + { + "epoch": 0.4576, + "learning_rate": 1.6628814427068944e-05, + "loss": 0.0422, + "step": 1144 + }, + { + "epoch": 0.4584, + "learning_rate": 1.6649696882394625e-05, + "loss": 0.0005, + "step": 1146 + }, + { + "epoch": 0.4592, + "learning_rate": 1.667052748204825e-05, + "loss": 1.2971, + "step": 1148 + }, + { + "epoch": 0.46, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.6126, + "step": 1150 + }, + { + "epoch": 0.4608, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.2187, + "step": 1152 + }, + { + "epoch": 0.4616, + "learning_rate": 1.6732706524594138e-05, + "loss": 0.1372, + "step": 1154 + }, + { + "epoch": 0.4624, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.1888, + "step": 1156 + }, + { + "epoch": 0.4632, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.0294, + "step": 1158 + }, + { + "epoch": 0.464, + "learning_rate": 1.679441304261516e-05, + "loss": 0.0481, + "step": 1160 + }, + { + "epoch": 0.4648, + "learning_rate": 1.681487612701519e-05, + "loss": 0.011, + "step": 1162 + }, + { + "epoch": 0.4656, + "learning_rate": 1.683528606764222e-05, + "loss": 0.0404, + "step": 1164 + }, + { + "epoch": 0.4664, + "learning_rate": 1.6855642705335428e-05, + "loss": 0.0818, + "step": 1166 + }, + { + "epoch": 0.4672, + "learning_rate": 1.687594588134968e-05, + "loss": 0.0033, + "step": 1168 + }, + { + "epoch": 0.468, + "learning_rate": 1.68961954373567e-05, + "loss": 0.3934, + "step": 1170 + }, + { + "epoch": 0.4688, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.0475, + "step": 1172 + }, + { + "epoch": 0.4696, + "learning_rate": 1.693653305812805e-05, + "loss": 0.028, + "step": 1174 + }, + { + "epoch": 0.4704, + "learning_rate": 1.6956620808331505e-05, + "loss": 1.1289, + "step": 1176 + }, + { + "epoch": 0.4712, + "learning_rate": 1.697665430940846e-05, + "loss": 0.002, + "step": 1178 + }, + { + "epoch": 0.472, + "learning_rate": 1.699663340513365e-05, + "loss": 0.5752, + "step": 1180 + }, + { + "epoch": 0.4728, + "learning_rate": 1.7016557939706068e-05, + "loss": 0.8214, + "step": 1182 + }, + { + "epoch": 0.4736, + "learning_rate": 1.7036427757750198e-05, + "loss": 0.0217, + "step": 1184 + }, + { + "epoch": 0.4744, + "learning_rate": 1.7056242704317212e-05, + "loss": 0.5819, + "step": 1186 + }, + { + "epoch": 0.4752, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.0412, + "step": 1188 + }, + { + "epoch": 0.476, + "learning_rate": 1.709570736536521e-05, + "loss": 0.0078, + "step": 1190 + }, + { + "epoch": 0.4768, + "learning_rate": 1.7115356772092844e-05, + "loss": 0.1368, + "step": 1192 + }, + { + "epoch": 0.4776, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.3109, + "step": 1194 + }, + { + "epoch": 0.4784, + "learning_rate": 1.7154488971806518e-05, + "loss": 0.3393, + "step": 1196 + }, + { + "epoch": 0.4792, + "learning_rate": 1.7173971459631783e-05, + "loss": 0.1565, + "step": 1198 + }, + { + "epoch": 0.48, + "learning_rate": 1.7193398003386507e-05, + "loss": 0.0589, + "step": 1200 + }, + { + "epoch": 0.4808, + "learning_rate": 1.7212768451578602e-05, + "loss": 0.5378, + "step": 1202 + }, + { + "epoch": 0.4816, + "learning_rate": 1.7232082653153416e-05, + "loss": 0.4679, + "step": 1204 + }, + { + "epoch": 0.4824, + "learning_rate": 1.7251340457494937e-05, + "loss": 0.023, + "step": 1206 + }, + { + "epoch": 0.4832, + "learning_rate": 1.7270541714426923e-05, + "loss": 0.0611, + "step": 1208 + }, + { + "epoch": 0.484, + "learning_rate": 1.7289686274214106e-05, + "loss": 0.3156, + "step": 1210 + }, + { + "epoch": 0.4848, + "learning_rate": 1.7308773987563393e-05, + "loss": 0.4025, + "step": 1212 + }, + { + "epoch": 0.4856, + "learning_rate": 1.732780470562496e-05, + "loss": 0.2821, + "step": 1214 + }, + { + "epoch": 0.4864, + "learning_rate": 1.7346778279993413e-05, + "loss": 0.1697, + "step": 1216 + }, + { + "epoch": 0.4872, + "learning_rate": 1.736569456270903e-05, + "loss": 0.0026, + "step": 1218 + }, + { + "epoch": 0.488, + "learning_rate": 1.7384553406258836e-05, + "loss": 0.004, + "step": 1220 + }, + { + "epoch": 0.4888, + "learning_rate": 1.740335466357778e-05, + "loss": 0.2519, + "step": 1222 + }, + { + "epoch": 0.4896, + "learning_rate": 1.7422098188049888e-05, + "loss": 0.0024, + "step": 1224 + }, + { + "epoch": 0.4904, + "learning_rate": 1.7440783833509373e-05, + "loss": 0.0013, + "step": 1226 + }, + { + "epoch": 0.4912, + "learning_rate": 1.7459411454241816e-05, + "loss": 0.0287, + "step": 1228 + }, + { + "epoch": 0.492, + "learning_rate": 1.747798090498531e-05, + "loss": 0.0411, + "step": 1230 + }, + { + "epoch": 0.4928, + "learning_rate": 1.749649204093154e-05, + "loss": 0.1829, + "step": 1232 + }, + { + "epoch": 0.4936, + "learning_rate": 1.7514944717726962e-05, + "loss": 0.006, + "step": 1234 + }, + { + "epoch": 0.4944, + "learning_rate": 1.753333879147387e-05, + "loss": 0.0149, + "step": 1236 + }, + { + "epoch": 0.4952, + "learning_rate": 1.755167411873159e-05, + "loss": 0.0105, + "step": 1238 + }, + { + "epoch": 0.496, + "learning_rate": 1.7569950556517563e-05, + "loss": 0.07, + "step": 1240 + }, + { + "epoch": 0.4968, + "learning_rate": 1.758816796230845e-05, + "loss": 0.0456, + "step": 1242 + }, + { + "epoch": 0.4976, + "learning_rate": 1.7606326194041278e-05, + "loss": 0.0071, + "step": 1244 + }, + { + "epoch": 0.4984, + "learning_rate": 1.762442511011447e-05, + "loss": 0.0006, + "step": 1246 + }, + { + "epoch": 0.4992, + "learning_rate": 1.7642464569389083e-05, + "loss": 0.0042, + "step": 1248 + }, + { + "epoch": 0.5, + "learning_rate": 1.766044443118977e-05, + "loss": 0.2683, + "step": 1250 + }, + { + "epoch": 0.5008, + "learning_rate": 1.767836455530598e-05, + "loss": 0.0009, + "step": 1252 + }, + { + "epoch": 0.5016, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.0747, + "step": 1254 + }, + { + "epoch": 0.5024, + "learning_rate": 1.77140250319729e-05, + "loss": 0.0087, + "step": 1256 + }, + { + "epoch": 0.5032, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.0002, + "step": 1258 + }, + { + "epoch": 0.504, + "learning_rate": 1.7749444887041793e-05, + "loss": 0.0036, + "step": 1260 + }, + { + "epoch": 0.5048, + "learning_rate": 1.776706423591959e-05, + "loss": 0.0039, + "step": 1262 + }, + { + "epoch": 0.5056, + "learning_rate": 1.778462301567023e-05, + "loss": 0.0016, + "step": 1264 + }, + { + "epoch": 0.5064, + "learning_rate": 1.7802121089366832e-05, + "loss": 0.0084, + "step": 1266 + }, + { + "epoch": 0.5072, + "learning_rate": 1.7819558320555895e-05, + "loss": 0.0001, + "step": 1268 + }, + { + "epoch": 0.508, + "learning_rate": 1.7836934573258392e-05, + "loss": 0.0192, + "step": 1270 + }, + { + "epoch": 0.5088, + "learning_rate": 1.785424971197082e-05, + "loss": 0.0047, + "step": 1272 + }, + { + "epoch": 0.5096, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.0763, + "step": 1274 + }, + { + "epoch": 0.5104, + "learning_rate": 1.7888696107795343e-05, + "loss": 0.0017, + "step": 1276 + }, + { + "epoch": 0.5112, + "learning_rate": 1.790582709628753e-05, + "loss": 0.0333, + "step": 1278 + }, + { + "epoch": 0.512, + "learning_rate": 1.7922896433551903e-05, + "loss": 0.0246, + "step": 1280 + }, + { + "epoch": 0.5128, + "learning_rate": 1.793990398647835e-05, + "loss": 0.0006, + "step": 1282 + }, + { + "epoch": 0.5136, + "learning_rate": 1.795684962243855e-05, + "loss": 0.0006, + "step": 1284 + }, + { + "epoch": 0.5144, + "learning_rate": 1.7973733209287032e-05, + "loss": 0.0152, + "step": 1286 + }, + { + "epoch": 0.5152, + "learning_rate": 1.7990554615362193e-05, + "loss": 0.0005, + "step": 1288 + }, + { + "epoch": 0.516, + "learning_rate": 1.800731370948734e-05, + "loss": 0.0001, + "step": 1290 + }, + { + "epoch": 0.5168, + "learning_rate": 1.802401036097167e-05, + "loss": 0.0484, + "step": 1292 + }, + { + "epoch": 0.5176, + "learning_rate": 1.804064443961135e-05, + "loss": 0.0445, + "step": 1294 + }, + { + "epoch": 0.5184, + "learning_rate": 1.8057215815690494e-05, + "loss": 0.0345, + "step": 1296 + }, + { + "epoch": 0.5192, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.0209, + "step": 1298 + }, + { + "epoch": 0.52, + "learning_rate": 1.809016994374947e-05, + "loss": 0.0064, + "step": 1300 + }, + { + "epoch": 0.5208, + "learning_rate": 1.81065524387464e-05, + "loss": 0.5277, + "step": 1302 + }, + { + "epoch": 0.5216, + "learning_rate": 1.8122871717218968e-05, + "loss": 0.0001, + "step": 1304 + }, + { + "epoch": 0.5224, + "learning_rate": 1.8139127651906176e-05, + "loss": 0.0005, + "step": 1306 + }, + { + "epoch": 0.5232, + "learning_rate": 1.8155320116040976e-05, + "loss": 0.0008, + "step": 1308 + }, + { + "epoch": 0.524, + "learning_rate": 1.817144898335129e-05, + "loss": 0.0044, + "step": 1310 + }, + { + "epoch": 0.5248, + "learning_rate": 1.818751412806095e-05, + "loss": 0.0013, + "step": 1312 + }, + { + "epoch": 0.5256, + "learning_rate": 1.8203515424890738e-05, + "loss": 0.4533, + "step": 1314 + }, + { + "epoch": 0.5264, + "learning_rate": 1.8219452749059322e-05, + "loss": 1.586, + "step": 1316 + }, + { + "epoch": 0.5272, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.2815, + "step": 1318 + }, + { + "epoch": 0.528, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.0044, + "step": 1320 + }, + { + "epoch": 0.5288, + "learning_rate": 1.826687964527355e-05, + "loss": 0.0002, + "step": 1322 + }, + { + "epoch": 0.5296, + "learning_rate": 1.828255984097604e-05, + "loss": 0.0003, + "step": 1324 + }, + { + "epoch": 0.5304, + "learning_rate": 1.8298175447613093e-05, + "loss": 0.6127, + "step": 1326 + }, + { + "epoch": 0.5312, + "learning_rate": 1.8313726343411092e-05, + "loss": 0.3639, + "step": 1328 + }, + { + "epoch": 0.532, + "learning_rate": 1.8329212407101e-05, + "loss": 0.4435, + "step": 1330 + }, + { + "epoch": 0.5328, + "learning_rate": 1.8344633517919394e-05, + "loss": 0.0088, + "step": 1332 + }, + { + "epoch": 0.5336, + "learning_rate": 1.8359989555609344e-05, + "loss": 0.0047, + "step": 1334 + }, + { + "epoch": 0.5344, + "learning_rate": 1.8375280400421407e-05, + "loss": 0.3612, + "step": 1336 + }, + { + "epoch": 0.5352, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.0592, + "step": 1338 + }, + { + "epoch": 0.536, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.1241, + "step": 1340 + }, + { + "epoch": 0.5368, + "learning_rate": 1.842076058772692e-05, + "loss": 0.6893, + "step": 1342 + }, + { + "epoch": 0.5376, + "learning_rate": 1.8435789473714384e-05, + "loss": 0.1428, + "step": 1344 + }, + { + "epoch": 0.5384, + "learning_rate": 1.8450752575720964e-05, + "loss": 0.0038, + "step": 1346 + }, + { + "epoch": 0.5392, + "learning_rate": 1.8465649777061384e-05, + "loss": 0.2682, + "step": 1348 + }, + { + "epoch": 0.54, + "learning_rate": 1.8480480961564266e-05, + "loss": 0.0566, + "step": 1350 + }, + { + "epoch": 0.5408, + "learning_rate": 1.8495246013573047e-05, + "loss": 0.0086, + "step": 1352 + }, + { + "epoch": 0.5416, + "learning_rate": 1.850994481794691e-05, + "loss": 0.0004, + "step": 1354 + }, + { + "epoch": 0.5424, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.0132, + "step": 1356 + }, + { + "epoch": 0.5432, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.5804, + "step": 1358 + }, + { + "epoch": 0.544, + "learning_rate": 1.8553642601605066e-05, + "loss": 0.0579, + "step": 1360 + }, + { + "epoch": 0.5448, + "learning_rate": 1.856807527437643e-05, + "loss": 0.4187, + "step": 1362 + }, + { + "epoch": 0.5456, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.0008, + "step": 1364 + }, + { + "epoch": 0.5464, + "learning_rate": 1.859674006117491e-05, + "loss": 0.0948, + "step": 1366 + }, + { + "epoch": 0.5472, + "learning_rate": 1.8610971951668268e-05, + "loss": 0.0008, + "step": 1368 + }, + { + "epoch": 0.548, + "learning_rate": 1.862513669207257e-05, + "loss": 0.4324, + "step": 1370 + }, + { + "epoch": 0.5488, + "learning_rate": 1.8639234171928348e-05, + "loss": 0.0195, + "step": 1372 + }, + { + "epoch": 0.5496, + "learning_rate": 1.8653264281300612e-05, + "loss": 0.0012, + "step": 1374 + }, + { + "epoch": 0.5504, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.1332, + "step": 1376 + }, + { + "epoch": 0.5512, + "learning_rate": 1.8681121951482393e-05, + "loss": 0.0032, + "step": 1378 + }, + { + "epoch": 0.552, + "learning_rate": 1.869494929505219e-05, + "loss": 0.6811, + "step": 1380 + }, + { + "epoch": 0.5528, + "learning_rate": 1.870870883366075e-05, + "loss": 0.6594, + "step": 1382 + }, + { + "epoch": 0.5536, + "learning_rate": 1.8722400460008434e-05, + "loss": 0.5689, + "step": 1384 + }, + { + "epoch": 0.5544, + "learning_rate": 1.8736024067325195e-05, + "loss": 0.0255, + "step": 1386 + }, + { + "epoch": 0.5552, + "learning_rate": 1.8749579549371373e-05, + "loss": 0.0003, + "step": 1388 + }, + { + "epoch": 0.556, + "learning_rate": 1.876306680043863e-05, + "loss": 0.1605, + "step": 1390 + }, + { + "epoch": 0.5568, + "learning_rate": 1.8776485715350665e-05, + "loss": 0.054, + "step": 1392 + }, + { + "epoch": 0.5576, + "learning_rate": 1.878983618946409e-05, + "loss": 0.009, + "step": 1394 + }, + { + "epoch": 0.5584, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.3037, + "step": 1396 + }, + { + "epoch": 0.5592, + "learning_rate": 1.881633139939087e-05, + "loss": 0.0373, + "step": 1398 + }, + { + "epoch": 0.56, + "learning_rate": 1.882947592858927e-05, + "loss": 0.1047, + "step": 1400 + }, + { + "epoch": 0.5608, + "learning_rate": 1.884255160376072e-05, + "loss": 0.0505, + "step": 1402 + }, + { + "epoch": 0.5616, + "learning_rate": 1.885555832293849e-05, + "loss": 0.517, + "step": 1404 + }, + { + "epoch": 0.5624, + "learning_rate": 1.886849598469356e-05, + "loss": 0.4008, + "step": 1406 + }, + { + "epoch": 0.5632, + "learning_rate": 1.888136448813544e-05, + "loss": 0.1585, + "step": 1408 + }, + { + "epoch": 0.564, + "learning_rate": 1.8894163732912972e-05, + "loss": 0.002, + "step": 1410 + }, + { + "epoch": 0.5648, + "learning_rate": 1.890689361921506e-05, + "loss": 0.3848, + "step": 1412 + }, + { + "epoch": 0.5656, + "learning_rate": 1.891955404777151e-05, + "loss": 0.5114, + "step": 1414 + }, + { + "epoch": 0.5664, + "learning_rate": 1.893214491985374e-05, + "loss": 0.1321, + "step": 1416 + }, + { + "epoch": 0.5672, + "learning_rate": 1.89446661372756e-05, + "loss": 0.5882, + "step": 1418 + }, + { + "epoch": 0.568, + "learning_rate": 1.895711760239413e-05, + "loss": 0.0188, + "step": 1420 + }, + { + "epoch": 0.5688, + "learning_rate": 1.89694992181103e-05, + "loss": 0.0756, + "step": 1422 + }, + { + "epoch": 0.5696, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.5708, + "step": 1424 + }, + { + "epoch": 0.5704, + "learning_rate": 1.8994052515663708e-05, + "loss": 0.07, + "step": 1426 + }, + { + "epoch": 0.5712, + "learning_rate": 1.90062240060294e-05, + "loss": 0.005, + "step": 1428 + }, + { + "epoch": 0.572, + "learning_rate": 1.9018325264051136e-05, + "loss": 0.3297, + "step": 1430 + }, + { + "epoch": 0.5728, + "learning_rate": 1.9030356195360868e-05, + "loss": 0.0366, + "step": 1432 + }, + { + "epoch": 0.5736, + "learning_rate": 1.904231670613899e-05, + "loss": 0.0119, + "step": 1434 + }, + { + "epoch": 0.5744, + "learning_rate": 1.905420670311502e-05, + "loss": 0.0041, + "step": 1436 + }, + { + "epoch": 0.5752, + "learning_rate": 1.906602609356838e-05, + "loss": 0.067, + "step": 1438 + }, + { + "epoch": 0.576, + "learning_rate": 1.9077774785329078e-05, + "loss": 0.2302, + "step": 1440 + }, + { + "epoch": 0.5768, + "learning_rate": 1.9089452686778487e-05, + "loss": 0.0009, + "step": 1442 + }, + { + "epoch": 0.5776, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.0141, + "step": 1444 + }, + { + "epoch": 0.5784, + "learning_rate": 1.911259575502962e-05, + "loss": 0.2043, + "step": 1446 + }, + { + "epoch": 0.5792, + "learning_rate": 1.912406074135706e-05, + "loss": 0.0001, + "step": 1448 + }, + { + "epoch": 0.58, + "learning_rate": 1.9135454576426006e-05, + "loss": 0.059, + "step": 1450 + }, + { + "epoch": 0.5808, + "learning_rate": 1.9146777171385053e-05, + "loss": 0.2296, + "step": 1452 + }, + { + "epoch": 0.5816, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.2977, + "step": 1454 + }, + { + "epoch": 0.5824, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.0002, + "step": 1456 + }, + { + "epoch": 0.5832, + "learning_rate": 1.9180316635425876e-05, + "loss": 0.133, + "step": 1458 + }, + { + "epoch": 0.584, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.0047, + "step": 1460 + }, + { + "epoch": 0.5848, + "learning_rate": 1.9202318473658703e-05, + "loss": 0.0105, + "step": 1462 + }, + { + "epoch": 0.5856, + "learning_rate": 1.9213211793237052e-05, + "loss": 0.5803, + "step": 1464 + }, + { + "epoch": 0.5864, + "learning_rate": 1.92240332663391e-05, + "loss": 0.8232, + "step": 1466 + }, + { + "epoch": 0.5872, + "learning_rate": 1.923478280857682e-05, + "loss": 0.2902, + "step": 1468 + }, + { + "epoch": 0.588, + "learning_rate": 1.924546033612313e-05, + "loss": 0.0084, + "step": 1470 + }, + { + "epoch": 0.5888, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.0606, + "step": 1472 + }, + { + "epoch": 0.5896, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.3786, + "step": 1474 + }, + { + "epoch": 0.5904, + "learning_rate": 1.927706000077034e-05, + "loss": 0.0146, + "step": 1476 + }, + { + "epoch": 0.5912, + "learning_rate": 1.9287448642521507e-05, + "loss": 0.0747, + "step": 1478 + }, + { + "epoch": 0.592, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.0042, + "step": 1480 + }, + { + "epoch": 0.5928, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.057, + "step": 1482 + }, + { + "epoch": 0.5936, + "learning_rate": 1.9318179694207722e-05, + "loss": 0.6622, + "step": 1484 + }, + { + "epoch": 0.5944, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.3217, + "step": 1486 + }, + { + "epoch": 0.5952, + "learning_rate": 1.9338303869951266e-05, + "loss": 0.1116, + "step": 1488 + }, + { + "epoch": 0.596, + "learning_rate": 1.934825676396015e-05, + "loss": 0.1194, + "step": 1490 + }, + { + "epoch": 0.5968, + "learning_rate": 1.935813675838491e-05, + "loss": 0.0122, + "step": 1492 + }, + { + "epoch": 0.5976, + "learning_rate": 1.9367943776179375e-05, + "loss": 0.0017, + "step": 1494 + }, + { + "epoch": 0.5984, + "learning_rate": 1.9377677740866457e-05, + "loss": 0.0015, + "step": 1496 + }, + { + "epoch": 0.5992, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.1951, + "step": 1498 + }, + { + "epoch": 0.6, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.0009, + "step": 1500 + }, + { + "epoch": 0.6008, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.0017, + "step": 1502 + }, + { + "epoch": 0.6016, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.4027, + "step": 1504 + }, + { + "epoch": 0.6024, + "learning_rate": 1.942524913090354e-05, + "loss": 0.4808, + "step": 1506 + }, + { + "epoch": 0.6032, + "learning_rate": 1.9434543202870723e-05, + "loss": 0.0004, + "step": 1508 + }, + { + "epoch": 0.604, + "learning_rate": 1.9443763702374815e-05, + "loss": 0.0057, + "step": 1510 + }, + { + "epoch": 0.6048, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.0264, + "step": 1512 + }, + { + "epoch": 0.6056, + "learning_rate": 1.9461983696954756e-05, + "loss": 0.0214, + "step": 1514 + }, + { + "epoch": 0.6064, + "learning_rate": 1.947098304994744e-05, + "loss": 0.0286, + "step": 1516 + }, + { + "epoch": 0.6072, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.0015, + "step": 1518 + }, + { + "epoch": 0.608, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.5635, + "step": 1520 + }, + { + "epoch": 0.6088, + "learning_rate": 1.949753769132067e-05, + "loss": 0.0213, + "step": 1522 + }, + { + "epoch": 0.6096, + "learning_rate": 1.95062412024896e-05, + "loss": 0.1775, + "step": 1524 + }, + { + "epoch": 0.6104, + "learning_rate": 1.951487058208003e-05, + "loss": 0.0096, + "step": 1526 + }, + { + "epoch": 0.6112, + "learning_rate": 1.952342576279833e-05, + "loss": 0.0098, + "step": 1528 + }, + { + "epoch": 0.612, + "learning_rate": 1.953190667792947e-05, + "loss": 0.001, + "step": 1530 + }, + { + "epoch": 0.6128, + "learning_rate": 1.9540313261337578e-05, + "loss": 0.006, + "step": 1532 + }, + { + "epoch": 0.6136, + "learning_rate": 1.954864544746643e-05, + "loss": 0.003, + "step": 1534 + }, + { + "epoch": 0.6144, + "learning_rate": 1.955690317133996e-05, + "loss": 0.0077, + "step": 1536 + }, + { + "epoch": 0.6152, + "learning_rate": 1.956508636856278e-05, + "loss": 0.0034, + "step": 1538 + }, + { + "epoch": 0.616, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.125, + "step": 1540 + }, + { + "epoch": 0.6168, + "learning_rate": 1.95812289283811e-05, + "loss": 0.0174, + "step": 1542 + }, + { + "epoch": 0.6176, + "learning_rate": 1.958918816509367e-05, + "loss": 0.7194, + "step": 1544 + }, + { + "epoch": 0.6184, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.2466, + "step": 1546 + }, + { + "epoch": 0.6192, + "learning_rate": 1.9604882241787496e-05, + "loss": 0.0222, + "step": 1548 + }, + { + "epoch": 0.62, + "learning_rate": 1.9612616959383187e-05, + "loss": 0.3027, + "step": 1550 + }, + { + "epoch": 0.6208, + "learning_rate": 1.9620276715860856e-05, + "loss": 0.0285, + "step": 1552 + }, + { + "epoch": 0.6216, + "learning_rate": 1.9627861451488187e-05, + "loss": 0.0427, + "step": 1554 + }, + { + "epoch": 0.6224, + "learning_rate": 1.963537110711789e-05, + "loss": 0.182, + "step": 1556 + }, + { + "epoch": 0.6232, + "learning_rate": 1.964280562418815e-05, + "loss": 0.0003, + "step": 1558 + }, + { + "epoch": 0.624, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.0442, + "step": 1560 + }, + { + "epoch": 0.6248, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.2273, + "step": 1562 + }, + { + "epoch": 0.6256, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.0003, + "step": 1564 + }, + { + "epoch": 0.6264, + "learning_rate": 1.967179115615633e-05, + "loss": 0.0438, + "step": 1566 + }, + { + "epoch": 0.6272, + "learning_rate": 1.967884912252619e-05, + "loss": 0.0003, + "step": 1568 + }, + { + "epoch": 0.628, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.2332, + "step": 1570 + }, + { + "epoch": 0.6288, + "learning_rate": 1.969273856798585e-05, + "loss": 0.3103, + "step": 1572 + }, + { + "epoch": 0.6296, + "learning_rate": 1.9699569938762972e-05, + "loss": 0.2202, + "step": 1574 + }, + { + "epoch": 0.6304, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.0534, + "step": 1576 + }, + { + "epoch": 0.6312, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.3785, + "step": 1578 + }, + { + "epoch": 0.632, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.506, + "step": 1580 + }, + { + "epoch": 0.6328, + "learning_rate": 1.9726138506049434e-05, + "loss": 0.15, + "step": 1582 + }, + { + "epoch": 0.6336, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.2677, + "step": 1584 + }, + { + "epoch": 0.6344, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.2498, + "step": 1586 + }, + { + "epoch": 0.6352, + "learning_rate": 1.974526872786577e-05, + "loss": 0.0013, + "step": 1588 + }, + { + "epoch": 0.636, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.0054, + "step": 1590 + }, + { + "epoch": 0.6368, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.3639, + "step": 1592 + }, + { + "epoch": 0.6376, + "learning_rate": 1.976371499316945e-05, + "loss": 0.0004, + "step": 1594 + }, + { + "epoch": 0.6384, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.3692, + "step": 1596 + }, + { + "epoch": 0.6392, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.191, + "step": 1598 + }, + { + "epoch": 0.64, + "learning_rate": 1.9781476007338054e-05, + "loss": 0.0002, + "step": 1600 + }, + { + "epoch": 0.6408, + "learning_rate": 1.978724385052766e-05, + "loss": 0.0002, + "step": 1602 + }, + { + "epoch": 0.6416, + "learning_rate": 1.9792935370823673e-05, + "loss": 0.0026, + "step": 1604 + }, + { + "epoch": 0.6424, + "learning_rate": 1.979855052384247e-05, + "loss": 0.0347, + "step": 1606 + }, + { + "epoch": 0.6432, + "learning_rate": 1.9804089265795956e-05, + "loss": 0.0011, + "step": 1608 + }, + { + "epoch": 0.644, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.2278, + "step": 1610 + }, + { + "epoch": 0.6448, + "learning_rate": 1.981493734433433e-05, + "loss": 0.0023, + "step": 1612 + }, + { + "epoch": 0.6456, + "learning_rate": 1.982024659632372e-05, + "loss": 0.0313, + "step": 1614 + }, + { + "epoch": 0.6464, + "learning_rate": 1.9825479268057472e-05, + "loss": 0.0001, + "step": 1616 + }, + { + "epoch": 0.6472, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.301, + "step": 1618 + }, + { + "epoch": 0.648, + "learning_rate": 1.9835714708133858e-05, + "loss": 0.0276, + "step": 1620 + }, + { + "epoch": 0.6488, + "learning_rate": 1.9840717396658483e-05, + "loss": 1.7505, + "step": 1622 + }, + { + "epoch": 0.6496, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.0024, + "step": 1624 + }, + { + "epoch": 0.6504, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.0959, + "step": 1626 + }, + { + "epoch": 0.6512, + "learning_rate": 1.985526486983063e-05, + "loss": 0.0063, + "step": 1628 + }, + { + "epoch": 0.652, + "learning_rate": 1.985996037070505e-05, + "loss": 0.0406, + "step": 1630 + }, + { + "epoch": 0.6528, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.1483, + "step": 1632 + }, + { + "epoch": 0.6536, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.4189, + "step": 1634 + }, + { + "epoch": 0.6544, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.0329, + "step": 1636 + }, + { + "epoch": 0.6552, + "learning_rate": 1.987797311751759e-05, + "loss": 1.1702, + "step": 1638 + }, + { + "epoch": 0.656, + "learning_rate": 1.9882283814465528e-05, + "loss": 0.0006, + "step": 1640 + }, + { + "epoch": 0.6568, + "learning_rate": 1.988651744737914e-05, + "loss": 0.1924, + "step": 1642 + }, + { + "epoch": 0.6576, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.2227, + "step": 1644 + }, + { + "epoch": 0.6584, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.0678, + "step": 1646 + }, + { + "epoch": 0.6592, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.1466, + "step": 1648 + }, + { + "epoch": 0.66, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.4467, + "step": 1650 + }, + { + "epoch": 0.6608, + "learning_rate": 1.9906528516965014e-05, + "loss": 1.5136, + "step": 1652 + }, + { + "epoch": 0.6616, + "learning_rate": 1.9910299093414926e-05, + "loss": 0.0969, + "step": 1654 + }, + { + "epoch": 0.6624, + "learning_rate": 1.9913992387361744e-05, + "loss": 0.16, + "step": 1656 + }, + { + "epoch": 0.6632, + "learning_rate": 1.9917608370004414e-05, + "loss": 0.0317, + "step": 1658 + }, + { + "epoch": 0.664, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.0569, + "step": 1660 + }, + { + "epoch": 0.6648, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.0147, + "step": 1662 + }, + { + "epoch": 0.6656, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.2525, + "step": 1664 + }, + { + "epoch": 0.6664, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.0582, + "step": 1666 + }, + { + "epoch": 0.6672, + "learning_rate": 1.9934527647833276e-05, + "loss": 0.1197, + "step": 1668 + }, + { + "epoch": 0.668, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.0994, + "step": 1670 + }, + { + "epoch": 0.6688, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.1577, + "step": 1672 + }, + { + "epoch": 0.6696, + "learning_rate": 1.994374976712348e-05, + "loss": 1.0883, + "step": 1674 + }, + { + "epoch": 0.6704, + "learning_rate": 1.994666875152874e-05, + "loss": 0.0405, + "step": 1676 + }, + { + "epoch": 0.6712, + "learning_rate": 1.9949510169813003e-05, + "loss": 0.0261, + "step": 1678 + }, + { + "epoch": 0.672, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.012, + "step": 1680 + }, + { + "epoch": 0.6728, + "learning_rate": 1.995496021999177e-05, + "loss": 0.3207, + "step": 1682 + }, + { + "epoch": 0.6736, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.049, + "step": 1684 + }, + { + "epoch": 0.6744, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.0295, + "step": 1686 + }, + { + "epoch": 0.6752, + "learning_rate": 1.996255301507125e-05, + "loss": 0.1288, + "step": 1688 + }, + { + "epoch": 0.676, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.0928, + "step": 1690 + }, + { + "epoch": 0.6768, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.281, + "step": 1692 + }, + { + "epoch": 0.6776, + "learning_rate": 1.996944660387867e-05, + "loss": 0.0964, + "step": 1694 + }, + { + "epoch": 0.6784, + "learning_rate": 1.997158900260614e-05, + "loss": 0.735, + "step": 1696 + }, + { + "epoch": 0.6792, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.1437, + "step": 1698 + }, + { + "epoch": 0.68, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.0645, + "step": 1700 + }, + { + "epoch": 0.6808, + "learning_rate": 1.997754957226847e-05, + "loss": 0.0176, + "step": 1702 + }, + { + "epoch": 0.6816, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.0703, + "step": 1704 + }, + { + "epoch": 0.6824, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.0106, + "step": 1706 + }, + { + "epoch": 0.6832, + "learning_rate": 1.998280988314872e-05, + "loss": 0.7959, + "step": 1708 + }, + { + "epoch": 0.684, + "learning_rate": 1.998440764181981e-05, + "loss": 0.1008, + "step": 1710 + }, + { + "epoch": 0.6848, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.3584, + "step": 1712 + }, + { + "epoch": 0.6856, + "learning_rate": 1.998736956606018e-05, + "loss": 0.0058, + "step": 1714 + }, + { + "epoch": 0.6864, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.0196, + "step": 1716 + }, + { + "epoch": 0.6872, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.0115, + "step": 1718 + }, + { + "epoch": 0.688, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.0118, + "step": 1720 + }, + { + "epoch": 0.6888, + "learning_rate": 1.999235873152047e-05, + "loss": 0.2053, + "step": 1722 + }, + { + "epoch": 0.6896, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.0303, + "step": 1724 + }, + { + "epoch": 0.6904, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.023, + "step": 1726 + }, + { + "epoch": 0.6912, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.0311, + "step": 1728 + }, + { + "epoch": 0.692, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.1754, + "step": 1730 + }, + { + "epoch": 0.6928, + "learning_rate": 1.9996841892833e-05, + "loss": 0.4787, + "step": 1732 + }, + { + "epoch": 0.6936, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.055, + "step": 1734 + }, + { + "epoch": 0.6944, + "learning_rate": 1.999808950037968e-05, + "loss": 0.9505, + "step": 1736 + }, + { + "epoch": 0.6952, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.0058, + "step": 1738 + }, + { + "epoch": 0.696, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.0023, + "step": 1740 + }, + { + "epoch": 0.6968, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.0462, + "step": 1742 + }, + { + "epoch": 0.6976, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.0009, + "step": 1744 + }, + { + "epoch": 0.6984, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.4746, + "step": 1746 + }, + { + "epoch": 0.6992, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.314, + "step": 1748 + }, + { + "epoch": 0.7, + "learning_rate": 2e-05, + "loss": 0.0017, + "step": 1750 + }, + { + "epoch": 0.7008, + "learning_rate": 1.9999961008995607e-05, + "loss": 2.2469, + "step": 1752 + }, + { + "epoch": 0.7016, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.0295, + "step": 1754 + }, + { + "epoch": 0.7024, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.2414, + "step": 1756 + }, + { + "epoch": 0.7032, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.1735, + "step": 1758 + }, + { + "epoch": 0.704, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.009, + "step": 1760 + }, + { + "epoch": 0.7048, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.1329, + "step": 1762 + }, + { + "epoch": 0.7056, + "learning_rate": 1.999808950037968e-05, + "loss": 0.2628, + "step": 1764 + }, + { + "epoch": 0.7064, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.3569, + "step": 1766 + }, + { + "epoch": 0.7072, + "learning_rate": 1.9996841892833e-05, + "loss": 0.0721, + "step": 1768 + }, + { + "epoch": 0.708, + "learning_rate": 1.9996101150403547e-05, + "loss": 0.0184, + "step": 1770 + }, + { + "epoch": 0.7088, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.6307, + "step": 1772 + }, + { + "epoch": 0.7096, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.0453, + "step": 1774 + }, + { + "epoch": 0.7104, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.0117, + "step": 1776 + }, + { + "epoch": 0.7112, + "learning_rate": 1.999235873152047e-05, + "loss": 0.0416, + "step": 1778 + }, + { + "epoch": 0.712, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.2019, + "step": 1780 + }, + { + "epoch": 0.7128, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.0016, + "step": 1782 + }, + { + "epoch": 0.7136, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.0211, + "step": 1784 + }, + { + "epoch": 0.7144, + "learning_rate": 1.998736956606018e-05, + "loss": 0.0076, + "step": 1786 + }, + { + "epoch": 0.7152, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.2906, + "step": 1788 + }, + { + "epoch": 0.716, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.5363, + "step": 1790 + }, + { + "epoch": 0.7168, + "learning_rate": 1.998280988314872e-05, + "loss": 0.0181, + "step": 1792 + }, + { + "epoch": 0.7176, + "learning_rate": 1.9981134276520828e-05, + "loss": 1.7796, + "step": 1794 + }, + { + "epoch": 0.7184, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.2532, + "step": 1796 + }, + { + "epoch": 0.7192, + "learning_rate": 1.9977549572268467e-05, + "loss": 0.0862, + "step": 1798 + }, + { + "epoch": 0.72, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.216, + "step": 1800 + }, + { + "epoch": 0.7208, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.013, + "step": 1802 + }, + { + "epoch": 0.7216, + "learning_rate": 1.997158900260614e-05, + "loss": 0.0234, + "step": 1804 + }, + { + "epoch": 0.7224, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.026, + "step": 1806 + }, + { + "epoch": 0.7232, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.0468, + "step": 1808 + }, + { + "epoch": 0.724, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.0468, + "step": 1810 + }, + { + "epoch": 0.7248, + "learning_rate": 1.996255301507125e-05, + "loss": 0.0512, + "step": 1812 + }, + { + "epoch": 0.7256, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.0127, + "step": 1814 + }, + { + "epoch": 0.7264, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.2308, + "step": 1816 + }, + { + "epoch": 0.7272, + "learning_rate": 1.995496021999177e-05, + "loss": 0.1829, + "step": 1818 + }, + { + "epoch": 0.728, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.9208, + "step": 1820 + }, + { + "epoch": 0.7288, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.0892, + "step": 1822 + }, + { + "epoch": 0.7296, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.1839, + "step": 1824 + }, + { + "epoch": 0.7304, + "learning_rate": 1.994374976712348e-05, + "loss": 0.7965, + "step": 1826 + }, + { + "epoch": 0.7312, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.0167, + "step": 1828 + }, + { + "epoch": 0.732, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.0144, + "step": 1830 + }, + { + "epoch": 0.7328, + "learning_rate": 1.993452764783328e-05, + "loss": 0.1671, + "step": 1832 + }, + { + "epoch": 0.7336, + "learning_rate": 1.9931298632618352e-05, + "loss": 0.1504, + "step": 1834 + }, + { + "epoch": 0.7344, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.8673, + "step": 1836 + }, + { + "epoch": 0.7352, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.0088, + "step": 1838 + }, + { + "epoch": 0.736, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.2308, + "step": 1840 + }, + { + "epoch": 0.7368, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.1666, + "step": 1842 + }, + { + "epoch": 0.7376, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.3304, + "step": 1844 + }, + { + "epoch": 0.7384, + "learning_rate": 1.9910299093414932e-05, + "loss": 0.1237, + "step": 1846 + }, + { + "epoch": 0.7392, + "learning_rate": 1.990652851696501e-05, + "loss": 0.2194, + "step": 1848 + }, + { + "epoch": 0.74, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.0359, + "step": 1850 + }, + { + "epoch": 0.7408, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.1059, + "step": 1852 + }, + { + "epoch": 0.7416, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.0054, + "step": 1854 + }, + { + "epoch": 0.7424, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.0806, + "step": 1856 + }, + { + "epoch": 0.7432, + "learning_rate": 1.9886517447379143e-05, + "loss": 0.3198, + "step": 1858 + }, + { + "epoch": 0.744, + "learning_rate": 1.988228381446553e-05, + "loss": 0.5058, + "step": 1860 + }, + { + "epoch": 0.7448, + "learning_rate": 1.987797311751759e-05, + "loss": 0.3095, + "step": 1862 + }, + { + "epoch": 0.7456, + "learning_rate": 1.9873585390151007e-05, + "loss": 0.2754, + "step": 1864 + }, + { + "epoch": 0.7464, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.1577, + "step": 1866 + }, + { + "epoch": 0.7472, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.026, + "step": 1868 + }, + { + "epoch": 0.748, + "learning_rate": 1.985996037070505e-05, + "loss": 0.1339, + "step": 1870 + }, + { + "epoch": 0.7488, + "learning_rate": 1.985526486983063e-05, + "loss": 0.0163, + "step": 1872 + }, + { + "epoch": 0.7496, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.019, + "step": 1874 + }, + { + "epoch": 0.7504, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.3126, + "step": 1876 + }, + { + "epoch": 0.7512, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.2677, + "step": 1878 + }, + { + "epoch": 0.752, + "learning_rate": 1.983571470813386e-05, + "loss": 0.3568, + "step": 1880 + }, + { + "epoch": 0.7528, + "learning_rate": 1.983063531873016e-05, + "loss": 0.2941, + "step": 1882 + }, + { + "epoch": 0.7536, + "learning_rate": 1.982547926805747e-05, + "loss": 0.0069, + "step": 1884 + }, + { + "epoch": 0.7544, + "learning_rate": 1.9820246596323724e-05, + "loss": 0.0106, + "step": 1886 + }, + { + "epoch": 0.7552, + "learning_rate": 1.981493734433433e-05, + "loss": 0.0587, + "step": 1888 + }, + { + "epoch": 0.756, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.0023, + "step": 1890 + }, + { + "epoch": 0.7568, + "learning_rate": 1.9804089265795963e-05, + "loss": 0.1224, + "step": 1892 + }, + { + "epoch": 0.7576, + "learning_rate": 1.979855052384247e-05, + "loss": 0.003, + "step": 1894 + }, + { + "epoch": 0.7584, + "learning_rate": 1.979293537082368e-05, + "loss": 0.3377, + "step": 1896 + }, + { + "epoch": 0.7592, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.0331, + "step": 1898 + }, + { + "epoch": 0.76, + "learning_rate": 1.978147600733806e-05, + "loss": 0.0491, + "step": 1900 + }, + { + "epoch": 0.7608, + "learning_rate": 1.977563188623365e-05, + "loss": 0.0005, + "step": 1902 + }, + { + "epoch": 0.7616, + "learning_rate": 1.9769711532788086e-05, + "loss": 0.0682, + "step": 1904 + }, + { + "epoch": 0.7624, + "learning_rate": 1.9763714993169448e-05, + "loss": 0.0827, + "step": 1906 + }, + { + "epoch": 0.7632, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.9237, + "step": 1908 + }, + { + "epoch": 0.764, + "learning_rate": 1.9751493543055638e-05, + "loss": 0.0071, + "step": 1910 + }, + { + "epoch": 0.7648, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.0193, + "step": 1912 + }, + { + "epoch": 0.7656, + "learning_rate": 1.973896791711276e-05, + "loss": 0.0005, + "step": 1914 + }, + { + "epoch": 0.7664, + "learning_rate": 1.9732591159931567e-05, + "loss": 0.1188, + "step": 1916 + }, + { + "epoch": 0.7672, + "learning_rate": 1.972613850604944e-05, + "loss": 0.0178, + "step": 1918 + }, + { + "epoch": 0.768, + "learning_rate": 1.9719610005785463e-05, + "loss": 0.3305, + "step": 1920 + }, + { + "epoch": 0.7688, + "learning_rate": 1.9713005710050206e-05, + "loss": 0.0029, + "step": 1922 + }, + { + "epoch": 0.7696, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.0447, + "step": 1924 + }, + { + "epoch": 0.7704, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.0011, + "step": 1926 + }, + { + "epoch": 0.7712, + "learning_rate": 1.969273856798586e-05, + "loss": 0.0123, + "step": 1928 + }, + { + "epoch": 0.772, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.0023, + "step": 1930 + }, + { + "epoch": 0.7728, + "learning_rate": 1.9678849122526195e-05, + "loss": 0.0008, + "step": 1932 + }, + { + "epoch": 0.7736, + "learning_rate": 1.967179115615633e-05, + "loss": 0.579, + "step": 1934 + }, + { + "epoch": 0.7744, + "learning_rate": 1.966465776721618e-05, + "loss": 0.0196, + "step": 1936 + }, + { + "epoch": 0.7752, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.0382, + "step": 1938 + }, + { + "epoch": 0.776, + "learning_rate": 1.965016494472312e-05, + "loss": 0.0026, + "step": 1940 + }, + { + "epoch": 0.7768, + "learning_rate": 1.964280562418815e-05, + "loss": 0.0063, + "step": 1942 + }, + { + "epoch": 0.7776, + "learning_rate": 1.963537110711789e-05, + "loss": 0.0025, + "step": 1944 + }, + { + "epoch": 0.7784, + "learning_rate": 1.9627861451488194e-05, + "loss": 0.0163, + "step": 1946 + }, + { + "epoch": 0.7792, + "learning_rate": 1.962027671586086e-05, + "loss": 0.0006, + "step": 1948 + }, + { + "epoch": 0.78, + "learning_rate": 1.9612616959383194e-05, + "loss": 0.0035, + "step": 1950 + }, + { + "epoch": 0.7808, + "learning_rate": 1.96048822417875e-05, + "loss": 0.0256, + "step": 1952 + }, + { + "epoch": 0.7816, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.4352, + "step": 1954 + }, + { + "epoch": 0.7824, + "learning_rate": 1.9589188165093666e-05, + "loss": 0.0044, + "step": 1956 + }, + { + "epoch": 0.7832, + "learning_rate": 1.95812289283811e-05, + "loss": 0.0061, + "step": 1958 + }, + { + "epoch": 0.784, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.0048, + "step": 1960 + }, + { + "epoch": 0.7848, + "learning_rate": 1.9565086368562784e-05, + "loss": 0.1914, + "step": 1962 + }, + { + "epoch": 0.7856, + "learning_rate": 1.9556903171339966e-05, + "loss": 0.0944, + "step": 1964 + }, + { + "epoch": 0.7864, + "learning_rate": 1.954864544746643e-05, + "loss": 0.0012, + "step": 1966 + }, + { + "epoch": 0.7872, + "learning_rate": 1.9540313261337585e-05, + "loss": 0.0027, + "step": 1968 + }, + { + "epoch": 0.788, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.0148, + "step": 1970 + }, + { + "epoch": 0.7888, + "learning_rate": 1.9523425762798335e-05, + "loss": 0.0012, + "step": 1972 + }, + { + "epoch": 0.7896, + "learning_rate": 1.9514870582080035e-05, + "loss": 0.0018, + "step": 1974 + }, + { + "epoch": 0.7904, + "learning_rate": 1.95062412024896e-05, + "loss": 0.0305, + "step": 1976 + }, + { + "epoch": 0.7912, + "learning_rate": 1.9497537691320667e-05, + "loss": 0.0019, + "step": 1978 + }, + { + "epoch": 0.792, + "learning_rate": 1.948876011644497e-05, + "loss": 0.1104, + "step": 1980 + }, + { + "epoch": 0.7928, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.0005, + "step": 1982 + }, + { + "epoch": 0.7936, + "learning_rate": 1.9470983049947443e-05, + "loss": 0.0132, + "step": 1984 + }, + { + "epoch": 0.7944, + "learning_rate": 1.9461983696954767e-05, + "loss": 0.0727, + "step": 1986 + }, + { + "epoch": 0.7952, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.3567, + "step": 1988 + }, + { + "epoch": 0.796, + "learning_rate": 1.9443763702374818e-05, + "loss": 0.0384, + "step": 1990 + }, + { + "epoch": 0.7968, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.2872, + "step": 1992 + }, + { + "epoch": 0.7976, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.0145, + "step": 1994 + }, + { + "epoch": 0.7984, + "learning_rate": 1.94158815589503e-05, + "loss": 1.0319, + "step": 1996 + }, + { + "epoch": 0.7992, + "learning_rate": 1.940644056006122e-05, + "loss": 0.0323, + "step": 1998 + }, + { + "epoch": 0.8, + "learning_rate": 1.939692620785909e-05, + "loss": 0.9458, + "step": 2000 + }, + { + "epoch": 0.8008, + "learning_rate": 1.9387338576538746e-05, + "loss": 0.0499, + "step": 2002 + }, + { + "epoch": 0.8016, + "learning_rate": 1.9377677740866464e-05, + "loss": 0.0003, + "step": 2004 + }, + { + "epoch": 0.8024, + "learning_rate": 1.936794377617938e-05, + "loss": 0.0219, + "step": 2006 + }, + { + "epoch": 0.8032, + "learning_rate": 1.9358136758384917e-05, + "loss": 0.001, + "step": 2008 + }, + { + "epoch": 0.804, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.0657, + "step": 2010 + }, + { + "epoch": 0.8048, + "learning_rate": 1.9338303869951273e-05, + "loss": 0.0478, + "step": 2012 + }, + { + "epoch": 0.8056, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.1628, + "step": 2014 + }, + { + "epoch": 0.8064, + "learning_rate": 1.931817969420773e-05, + "loss": 0.0626, + "step": 2016 + }, + { + "epoch": 0.8072, + "learning_rate": 1.930800856940543e-05, + "loss": 0.001, + "step": 2018 + }, + { + "epoch": 0.808, + "learning_rate": 1.929776485888252e-05, + "loss": 0.0201, + "step": 2020 + }, + { + "epoch": 0.8088, + "learning_rate": 1.9287448642521517e-05, + "loss": 0.3468, + "step": 2022 + }, + { + "epoch": 0.8096, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.5697, + "step": 2024 + }, + { + "epoch": 0.8104, + "learning_rate": 1.9266599014641727e-05, + "loss": 0.5123, + "step": 2026 + }, + { + "epoch": 0.8112, + "learning_rate": 1.925606576571252e-05, + "loss": 0.2776, + "step": 2028 + }, + { + "epoch": 0.812, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.218, + "step": 2030 + }, + { + "epoch": 0.8128, + "learning_rate": 1.923478280857682e-05, + "loss": 0.5923, + "step": 2032 + }, + { + "epoch": 0.8136, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.2937, + "step": 2034 + }, + { + "epoch": 0.8144, + "learning_rate": 1.9213211793237066e-05, + "loss": 0.1209, + "step": 2036 + }, + { + "epoch": 0.8152, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.0074, + "step": 2038 + }, + { + "epoch": 0.816, + "learning_rate": 1.919135339255235e-05, + "loss": 0.3764, + "step": 2040 + }, + { + "epoch": 0.8168, + "learning_rate": 1.918031663542588e-05, + "loss": 0.4123, + "step": 2042 + }, + { + "epoch": 0.8176, + "learning_rate": 1.916920828834617e-05, + "loss": 0.5164, + "step": 2044 + }, + { + "epoch": 0.8184, + "learning_rate": 1.9158028437938313e-05, + "loss": 1.1912, + "step": 2046 + }, + { + "epoch": 0.8192, + "learning_rate": 1.9146777171385057e-05, + "loss": 1.1282, + "step": 2048 + }, + { + "epoch": 0.82, + "learning_rate": 1.913545457642601e-05, + "loss": 0.2692, + "step": 2050 + }, + { + "epoch": 0.8208, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.1678, + "step": 2052 + }, + { + "epoch": 0.8216, + "learning_rate": 1.911259575502963e-05, + "loss": 0.2135, + "step": 2054 + }, + { + "epoch": 0.8224, + "learning_rate": 1.910105970684996e-05, + "loss": 0.0102, + "step": 2056 + }, + { + "epoch": 0.8232, + "learning_rate": 1.908945268677849e-05, + "loss": 0.2763, + "step": 2058 + }, + { + "epoch": 0.824, + "learning_rate": 1.9077774785329085e-05, + "loss": 0.1061, + "step": 2060 + }, + { + "epoch": 0.8248, + "learning_rate": 1.9066026093568383e-05, + "loss": 0.1281, + "step": 2062 + }, + { + "epoch": 0.8256, + "learning_rate": 1.9054206703115013e-05, + "loss": 0.0895, + "step": 2064 + }, + { + "epoch": 0.8264, + "learning_rate": 1.9042316706138994e-05, + "loss": 0.2634, + "step": 2066 + }, + { + "epoch": 0.8272, + "learning_rate": 1.903035619536087e-05, + "loss": 0.0122, + "step": 2068 + }, + { + "epoch": 0.828, + "learning_rate": 1.901832526405114e-05, + "loss": 0.0331, + "step": 2070 + }, + { + "epoch": 0.8288, + "learning_rate": 1.9006224006029414e-05, + "loss": 0.0773, + "step": 2072 + }, + { + "epoch": 0.8296, + "learning_rate": 1.899405251566371e-05, + "loss": 0.023, + "step": 2074 + }, + { + "epoch": 0.8304, + "learning_rate": 1.8981810887869797e-05, + "loss": 0.245, + "step": 2076 + }, + { + "epoch": 0.8312, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.0226, + "step": 2078 + }, + { + "epoch": 0.832, + "learning_rate": 1.8957117602394133e-05, + "loss": 0.0033, + "step": 2080 + }, + { + "epoch": 0.8328, + "learning_rate": 1.8944666137275596e-05, + "loss": 0.0674, + "step": 2082 + }, + { + "epoch": 0.8336, + "learning_rate": 1.8932144919853744e-05, + "loss": 0.008, + "step": 2084 + }, + { + "epoch": 0.8344, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.0182, + "step": 2086 + }, + { + "epoch": 0.8352, + "learning_rate": 1.890689361921507e-05, + "loss": 0.0252, + "step": 2088 + }, + { + "epoch": 0.836, + "learning_rate": 1.8894163732912986e-05, + "loss": 0.0042, + "step": 2090 + }, + { + "epoch": 0.8368, + "learning_rate": 1.8881364488135445e-05, + "loss": 0.0898, + "step": 2092 + }, + { + "epoch": 0.8376, + "learning_rate": 1.886849598469357e-05, + "loss": 0.0248, + "step": 2094 + }, + { + "epoch": 0.8384, + "learning_rate": 1.8855558322938492e-05, + "loss": 0.0324, + "step": 2096 + }, + { + "epoch": 0.8392, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.0027, + "step": 2098 + }, + { + "epoch": 0.84, + "learning_rate": 1.8829475928589265e-05, + "loss": 0.0028, + "step": 2100 + }, + { + "epoch": 0.8408, + "learning_rate": 1.8816331399390874e-05, + "loss": 0.0142, + "step": 2102 + }, + { + "epoch": 0.8416, + "learning_rate": 1.88031181186692e-05, + "loss": 0.8458, + "step": 2104 + }, + { + "epoch": 0.8424, + "learning_rate": 1.8789836189464092e-05, + "loss": 0.0154, + "step": 2106 + }, + { + "epoch": 0.8432, + "learning_rate": 1.877648571535068e-05, + "loss": 0.0126, + "step": 2108 + }, + { + "epoch": 0.844, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.003, + "step": 2110 + }, + { + "epoch": 0.8448, + "learning_rate": 1.8749579549371387e-05, + "loss": 0.0466, + "step": 2112 + }, + { + "epoch": 0.8456, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.0049, + "step": 2114 + }, + { + "epoch": 0.8464, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.005, + "step": 2116 + }, + { + "epoch": 0.8472, + "learning_rate": 1.8708708833660748e-05, + "loss": 0.0026, + "step": 2118 + }, + { + "epoch": 0.848, + "learning_rate": 1.8694949295052198e-05, + "loss": 0.7495, + "step": 2120 + }, + { + "epoch": 0.8488, + "learning_rate": 1.868112195148239e-05, + "loss": 0.0057, + "step": 2122 + }, + { + "epoch": 0.8496, + "learning_rate": 1.866722691077977e-05, + "loss": 0.6767, + "step": 2124 + }, + { + "epoch": 0.8504, + "learning_rate": 1.8653264281300626e-05, + "loss": 0.6432, + "step": 2126 + }, + { + "epoch": 0.8512, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.4428, + "step": 2128 + }, + { + "epoch": 0.852, + "learning_rate": 1.8625136692072587e-05, + "loss": 0.0113, + "step": 2130 + }, + { + "epoch": 0.8528, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.0022, + "step": 2132 + }, + { + "epoch": 0.8536, + "learning_rate": 1.8596740061174912e-05, + "loss": 0.1897, + "step": 2134 + }, + { + "epoch": 0.8544, + "learning_rate": 1.858244113157566e-05, + "loss": 0.4108, + "step": 2136 + }, + { + "epoch": 0.8552, + "learning_rate": 1.8568075274376432e-05, + "loss": 0.0664, + "step": 2138 + }, + { + "epoch": 0.856, + "learning_rate": 1.8553642601605083e-05, + "loss": 0.3855, + "step": 2140 + }, + { + "epoch": 0.8568, + "learning_rate": 1.8539143225810457e-05, + "loss": 0.1407, + "step": 2142 + }, + { + "epoch": 0.8576, + "learning_rate": 1.852457726006163e-05, + "loss": 0.0698, + "step": 2144 + }, + { + "epoch": 0.8584, + "learning_rate": 1.8509944817946917e-05, + "loss": 1.2363, + "step": 2146 + }, + { + "epoch": 0.8592, + "learning_rate": 1.8495246013573064e-05, + "loss": 0.1597, + "step": 2148 + }, + { + "epoch": 0.86, + "learning_rate": 1.848048096156426e-05, + "loss": 0.4943, + "step": 2150 + }, + { + "epoch": 0.8608, + "learning_rate": 1.8465649777061387e-05, + "loss": 0.0143, + "step": 2152 + }, + { + "epoch": 0.8616, + "learning_rate": 1.8450752575720967e-05, + "loss": 0.047, + "step": 2154 + }, + { + "epoch": 0.8624, + "learning_rate": 1.843578947371439e-05, + "loss": 0.5615, + "step": 2156 + }, + { + "epoch": 0.8632, + "learning_rate": 1.8420760587726935e-05, + "loss": 0.5071, + "step": 2158 + }, + { + "epoch": 0.864, + "learning_rate": 1.8405666034956846e-05, + "loss": 0.0564, + "step": 2160 + }, + { + "epoch": 0.8648, + "learning_rate": 1.8390505933114507e-05, + "loss": 0.0319, + "step": 2162 + }, + { + "epoch": 0.8656, + "learning_rate": 1.8375280400421414e-05, + "loss": 0.0658, + "step": 2164 + }, + { + "epoch": 0.8664, + "learning_rate": 1.8359989555609365e-05, + "loss": 0.0327, + "step": 2166 + }, + { + "epoch": 0.8672, + "learning_rate": 1.834463351791939e-05, + "loss": 0.1069, + "step": 2168 + }, + { + "epoch": 0.868, + "learning_rate": 1.8329212407101006e-05, + "loss": 0.0228, + "step": 2170 + }, + { + "epoch": 0.8688, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.6702, + "step": 2172 + }, + { + "epoch": 0.8696, + "learning_rate": 1.82981754476131e-05, + "loss": 0.0255, + "step": 2174 + }, + { + "epoch": 0.8704, + "learning_rate": 1.8282559840976053e-05, + "loss": 0.7038, + "step": 2176 + }, + { + "epoch": 0.8712, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.2695, + "step": 2178 + }, + { + "epoch": 0.872, + "learning_rate": 1.8251134982782966e-05, + "loss": 0.0189, + "step": 2180 + }, + { + "epoch": 0.8728, + "learning_rate": 1.823532597628428e-05, + "loss": 0.0351, + "step": 2182 + }, + { + "epoch": 0.8736, + "learning_rate": 1.8219452749059336e-05, + "loss": 0.3323, + "step": 2184 + }, + { + "epoch": 0.8744, + "learning_rate": 1.8203515424890734e-05, + "loss": 0.2399, + "step": 2186 + }, + { + "epoch": 0.8752, + "learning_rate": 1.8187514128060956e-05, + "loss": 0.0123, + "step": 2188 + }, + { + "epoch": 0.876, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.3425, + "step": 2190 + }, + { + "epoch": 0.8768, + "learning_rate": 1.8155320116040983e-05, + "loss": 0.0112, + "step": 2192 + }, + { + "epoch": 0.8776, + "learning_rate": 1.8139127651906193e-05, + "loss": 0.0814, + "step": 2194 + }, + { + "epoch": 0.8784, + "learning_rate": 1.8122871717218974e-05, + "loss": 0.0233, + "step": 2196 + }, + { + "epoch": 0.8792, + "learning_rate": 1.8106552438746413e-05, + "loss": 0.0063, + "step": 2198 + }, + { + "epoch": 0.88, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.0577, + "step": 2200 + }, + { + "epoch": 0.8808, + "learning_rate": 1.807372435998219e-05, + "loss": 0.0751, + "step": 2202 + }, + { + "epoch": 0.8816, + "learning_rate": 1.8057215815690487e-05, + "loss": 0.0273, + "step": 2204 + }, + { + "epoch": 0.8824, + "learning_rate": 1.8040644439611355e-05, + "loss": 0.008, + "step": 2206 + }, + { + "epoch": 0.8832, + "learning_rate": 1.8024010360971665e-05, + "loss": 0.0224, + "step": 2208 + }, + { + "epoch": 0.884, + "learning_rate": 1.8007313709487345e-05, + "loss": 0.2811, + "step": 2210 + }, + { + "epoch": 0.8848, + "learning_rate": 1.7990554615362207e-05, + "loss": 0.4007, + "step": 2212 + }, + { + "epoch": 0.8856, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.0078, + "step": 2214 + }, + { + "epoch": 0.8864, + "learning_rate": 1.7956849622438568e-05, + "loss": 0.161, + "step": 2216 + }, + { + "epoch": 0.8872, + "learning_rate": 1.7939903986478357e-05, + "loss": 0.5141, + "step": 2218 + }, + { + "epoch": 0.888, + "learning_rate": 1.7922896433551913e-05, + "loss": 0.0496, + "step": 2220 + }, + { + "epoch": 0.8888, + "learning_rate": 1.7905827096287525e-05, + "loss": 0.0023, + "step": 2222 + }, + { + "epoch": 0.8896, + "learning_rate": 1.7888696107795347e-05, + "loss": 0.0266, + "step": 2224 + }, + { + "epoch": 0.8904, + "learning_rate": 1.787150360166623e-05, + "loss": 0.2279, + "step": 2226 + }, + { + "epoch": 0.8912, + "learning_rate": 1.7854249711970826e-05, + "loss": 0.0023, + "step": 2228 + }, + { + "epoch": 0.892, + "learning_rate": 1.783693457325841e-05, + "loss": 0.0088, + "step": 2230 + }, + { + "epoch": 0.8928, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.1709, + "step": 2232 + }, + { + "epoch": 0.8936, + "learning_rate": 1.780212108936685e-05, + "loss": 0.0026, + "step": 2234 + }, + { + "epoch": 0.8944, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.0001, + "step": 2236 + }, + { + "epoch": 0.8952, + "learning_rate": 1.7767064235919594e-05, + "loss": 0.0021, + "step": 2238 + }, + { + "epoch": 0.896, + "learning_rate": 1.77494448870418e-05, + "loss": 0.5575, + "step": 2240 + }, + { + "epoch": 0.8968, + "learning_rate": 1.773176510643608e-05, + "loss": 0.8552, + "step": 2242 + }, + { + "epoch": 0.8976, + "learning_rate": 1.7714025031972894e-05, + "loss": 0.006, + "step": 2244 + }, + { + "epoch": 0.8984, + "learning_rate": 1.769622480199295e-05, + "loss": 0.4348, + "step": 2246 + }, + { + "epoch": 0.8992, + "learning_rate": 1.7678364555305982e-05, + "loss": 0.0395, + "step": 2248 + }, + { + "epoch": 0.9, + "learning_rate": 1.7660444431189777e-05, + "loss": 0.0625, + "step": 2250 + }, + { + "epoch": 0.9008, + "learning_rate": 1.76424645693891e-05, + "loss": 0.4662, + "step": 2252 + }, + { + "epoch": 0.9016, + "learning_rate": 1.762442511011448e-05, + "loss": 0.0017, + "step": 2254 + }, + { + "epoch": 0.9024, + "learning_rate": 1.7606326194041285e-05, + "loss": 0.1559, + "step": 2256 + }, + { + "epoch": 0.9032, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.6804, + "step": 2258 + }, + { + "epoch": 0.904, + "learning_rate": 1.756995055651757e-05, + "loss": 0.0232, + "step": 2260 + }, + { + "epoch": 0.9048, + "learning_rate": 1.7551674118731585e-05, + "loss": 0.0438, + "step": 2262 + }, + { + "epoch": 0.9056, + "learning_rate": 1.7533338791473875e-05, + "loss": 0.0052, + "step": 2264 + }, + { + "epoch": 0.9064, + "learning_rate": 1.751494471772697e-05, + "loss": 0.1633, + "step": 2266 + }, + { + "epoch": 0.9072, + "learning_rate": 1.7496492040931548e-05, + "loss": 0.1294, + "step": 2268 + }, + { + "epoch": 0.908, + "learning_rate": 1.747798090498533e-05, + "loss": 0.0381, + "step": 2270 + }, + { + "epoch": 0.9088, + "learning_rate": 1.745941145424182e-05, + "loss": 0.473, + "step": 2272 + }, + { + "epoch": 0.9096, + "learning_rate": 1.744078383350938e-05, + "loss": 0.247, + "step": 2274 + }, + { + "epoch": 0.9104, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.0817, + "step": 2276 + }, + { + "epoch": 0.9112, + "learning_rate": 1.7403354663577782e-05, + "loss": 0.004, + "step": 2278 + }, + { + "epoch": 0.912, + "learning_rate": 1.738455340625883e-05, + "loss": 0.4078, + "step": 2280 + }, + { + "epoch": 0.9128, + "learning_rate": 1.7365694562709038e-05, + "loss": 0.0037, + "step": 2282 + }, + { + "epoch": 0.9136, + "learning_rate": 1.7346778279993433e-05, + "loss": 0.0082, + "step": 2284 + }, + { + "epoch": 0.9144, + "learning_rate": 1.7327804705624962e-05, + "loss": 0.0443, + "step": 2286 + }, + { + "epoch": 0.9152, + "learning_rate": 1.730877398756341e-05, + "loss": 0.0072, + "step": 2288 + }, + { + "epoch": 0.916, + "learning_rate": 1.7289686274214113e-05, + "loss": 0.0628, + "step": 2290 + }, + { + "epoch": 0.9168, + "learning_rate": 1.727054171442693e-05, + "loss": 0.2357, + "step": 2292 + }, + { + "epoch": 0.9176, + "learning_rate": 1.7251340457494934e-05, + "loss": 0.3079, + "step": 2294 + }, + { + "epoch": 0.9184, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.2869, + "step": 2296 + }, + { + "epoch": 0.9192, + "learning_rate": 1.7212768451578595e-05, + "loss": 0.0453, + "step": 2298 + }, + { + "epoch": 0.92, + "learning_rate": 1.7193398003386517e-05, + "loss": 0.0131, + "step": 2300 + }, + { + "epoch": 0.9208, + "learning_rate": 1.7173971459631803e-05, + "loss": 0.0003, + "step": 2302 + }, + { + "epoch": 0.9216, + "learning_rate": 1.7154488971806525e-05, + "loss": 0.0726, + "step": 2304 + }, + { + "epoch": 0.9224, + "learning_rate": 1.713495069183907e-05, + "loss": 0.0008, + "step": 2306 + }, + { + "epoch": 0.9232, + "learning_rate": 1.7115356772092847e-05, + "loss": 0.0026, + "step": 2308 + }, + { + "epoch": 0.924, + "learning_rate": 1.709570736536522e-05, + "loss": 0.266, + "step": 2310 + }, + { + "epoch": 0.9248, + "learning_rate": 1.7076002624886152e-05, + "loss": 0.005, + "step": 2312 + }, + { + "epoch": 0.9256, + "learning_rate": 1.705624270431722e-05, + "loss": 0.0005, + "step": 2314 + }, + { + "epoch": 0.9264, + "learning_rate": 1.70364277577502e-05, + "loss": 0.0004, + "step": 2316 + }, + { + "epoch": 0.9272, + "learning_rate": 1.7016557939706078e-05, + "loss": 0.0146, + "step": 2318 + }, + { + "epoch": 0.928, + "learning_rate": 1.6996633405133673e-05, + "loss": 0.0005, + "step": 2320 + }, + { + "epoch": 0.9288, + "learning_rate": 1.6976654309408468e-05, + "loss": 0.0005, + "step": 2322 + }, + { + "epoch": 0.9296, + "learning_rate": 1.6956620808331515e-05, + "loss": 0.0188, + "step": 2324 + }, + { + "epoch": 0.9304, + "learning_rate": 1.6936533058128042e-05, + "loss": 0.0016, + "step": 2326 + }, + { + "epoch": 0.9312, + "learning_rate": 1.691639121544641e-05, + "loss": 0.0101, + "step": 2328 + }, + { + "epoch": 0.932, + "learning_rate": 1.6896195437356696e-05, + "loss": 0.0724, + "step": 2330 + }, + { + "epoch": 0.9328, + "learning_rate": 1.6875945881349686e-05, + "loss": 0.5693, + "step": 2332 + }, + { + "epoch": 0.9336, + "learning_rate": 1.6855642705335435e-05, + "loss": 0.0031, + "step": 2334 + }, + { + "epoch": 0.9344, + "learning_rate": 1.6835286067642228e-05, + "loss": 0.0002, + "step": 2336 + }, + { + "epoch": 0.9352, + "learning_rate": 1.681487612701521e-05, + "loss": 0.2462, + "step": 2338 + }, + { + "epoch": 0.936, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.004, + "step": 2340 + }, + { + "epoch": 0.9368, + "learning_rate": 1.677389697401739e-05, + "loss": 0.0028, + "step": 2342 + }, + { + "epoch": 0.9376, + "learning_rate": 1.675332808121025e-05, + "loss": 0.0787, + "step": 2344 + }, + { + "epoch": 0.9384, + "learning_rate": 1.6732706524594145e-05, + "loss": 0.4253, + "step": 2346 + }, + { + "epoch": 0.9392, + "learning_rate": 1.671203246498009e-05, + "loss": 0.0914, + "step": 2348 + }, + { + "epoch": 0.94, + "learning_rate": 1.6691306063588593e-05, + "loss": 0.0402, + "step": 2350 + }, + { + "epoch": 0.9408, + "learning_rate": 1.6670527482048242e-05, + "loss": 0.003, + "step": 2352 + }, + { + "epoch": 0.9416, + "learning_rate": 1.6649696882394635e-05, + "loss": 0.0003, + "step": 2354 + }, + { + "epoch": 0.9424, + "learning_rate": 1.6628814427068968e-05, + "loss": 0.0497, + "step": 2356 + }, + { + "epoch": 0.9432, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.031, + "step": 2358 + }, + { + "epoch": 0.944, + "learning_rate": 1.6586894601186824e-05, + "loss": 0.0002, + "step": 2360 + }, + { + "epoch": 0.9448, + "learning_rate": 1.656585755752957e-05, + "loss": 0.0034, + "step": 2362 + }, + { + "epoch": 0.9456, + "learning_rate": 1.6544769311996153e-05, + "loss": 0.0004, + "step": 2364 + }, + { + "epoch": 0.9464, + "learning_rate": 1.6523630029036924e-05, + "loss": 0.0001, + "step": 2366 + }, + { + "epoch": 0.9472, + "learning_rate": 1.6502439873500294e-05, + "loss": 1.1633, + "step": 2368 + }, + { + "epoch": 0.948, + "learning_rate": 1.6481199010631305e-05, + "loss": 0.0428, + "step": 2370 + }, + { + "epoch": 0.9488, + "learning_rate": 1.645990760607052e-05, + "loss": 0.0014, + "step": 2372 + }, + { + "epoch": 0.9496, + "learning_rate": 1.643856582585255e-05, + "loss": 0.0009, + "step": 2374 + }, + { + "epoch": 0.9504, + "learning_rate": 1.641717383640488e-05, + "loss": 0.1227, + "step": 2376 + }, + { + "epoch": 0.9512, + "learning_rate": 1.6395731804546596e-05, + "loss": 0.0003, + "step": 2378 + }, + { + "epoch": 0.952, + "learning_rate": 1.63742398974869e-05, + "loss": 0.2144, + "step": 2380 + }, + { + "epoch": 0.9528, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.0011, + "step": 2382 + }, + { + "epoch": 0.9536, + "learning_rate": 1.633110712854385e-05, + "loss": 0.1121, + "step": 2384 + }, + { + "epoch": 0.9544, + "learning_rate": 1.6309466603018504e-05, + "loss": 0.1026, + "step": 2386 + }, + { + "epoch": 0.9552, + "learning_rate": 1.6287776875005148e-05, + "loss": 0.2669, + "step": 2388 + }, + { + "epoch": 0.956, + "learning_rate": 1.6266038113644612e-05, + "loss": 0.3882, + "step": 2390 + }, + { + "epoch": 0.9568, + "learning_rate": 1.624425048846017e-05, + "loss": 0.01, + "step": 2392 + }, + { + "epoch": 0.9576, + "learning_rate": 1.6222414169356063e-05, + "loss": 0.0017, + "step": 2394 + }, + { + "epoch": 0.9584, + "learning_rate": 1.6200529326616343e-05, + "loss": 0.0099, + "step": 2396 + }, + { + "epoch": 0.9592, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.0004, + "step": 2398 + }, + { + "epoch": 0.96, + "learning_rate": 1.6156614753256587e-05, + "loss": 0.13, + "step": 2400 + }, + { + "epoch": 0.9608, + "learning_rate": 1.613458536509123e-05, + "loss": 0.0671, + "step": 2402 + }, + { + "epoch": 0.9616, + "learning_rate": 1.6112508138196922e-05, + "loss": 2.566, + "step": 2404 + }, + { + "epoch": 0.9624, + "learning_rate": 1.6090383244736277e-05, + "loss": 0.001, + "step": 2406 + }, + { + "epoch": 0.9632, + "learning_rate": 1.606821085724363e-05, + "loss": 0.4241, + "step": 2408 + }, + { + "epoch": 0.964, + "learning_rate": 1.6045991148623756e-05, + "loss": 1.1747, + "step": 2410 + }, + { + "epoch": 0.9648, + "learning_rate": 1.602372429215038e-05, + "loss": 0.3063, + "step": 2412 + }, + { + "epoch": 0.9656, + "learning_rate": 1.600141046146497e-05, + "loss": 0.6455, + "step": 2414 + }, + { + "epoch": 0.9664, + "learning_rate": 1.597904983057519e-05, + "loss": 0.5169, + "step": 2416 + }, + { + "epoch": 0.9672, + "learning_rate": 1.5956642573853794e-05, + "loss": 0.1549, + "step": 2418 + }, + { + "epoch": 0.968, + "learning_rate": 1.5934188866037014e-05, + "loss": 0.0871, + "step": 2420 + }, + { + "epoch": 0.9688, + "learning_rate": 1.591168888222342e-05, + "loss": 0.0502, + "step": 2422 + }, + { + "epoch": 0.9696, + "learning_rate": 1.5889142797872407e-05, + "loss": 0.4109, + "step": 2424 + }, + { + "epoch": 0.9704, + "learning_rate": 1.5866550788802818e-05, + "loss": 0.2261, + "step": 2426 + }, + { + "epoch": 0.9712, + "learning_rate": 1.584391303119173e-05, + "loss": 0.0975, + "step": 2428 + }, + { + "epoch": 0.972, + "learning_rate": 1.582122970157289e-05, + "loss": 0.2487, + "step": 2430 + }, + { + "epoch": 0.9728, + "learning_rate": 1.5798500976835503e-05, + "loss": 0.0141, + "step": 2432 + }, + { + "epoch": 0.9736, + "learning_rate": 1.577572703422267e-05, + "loss": 0.6673, + "step": 2434 + }, + { + "epoch": 0.9744, + "learning_rate": 1.575290805133024e-05, + "loss": 0.018, + "step": 2436 + }, + { + "epoch": 0.9752, + "learning_rate": 1.5730044206105156e-05, + "loss": 0.1152, + "step": 2438 + }, + { + "epoch": 0.976, + "learning_rate": 1.570713567684432e-05, + "loss": 0.4949, + "step": 2440 + }, + { + "epoch": 0.9768, + "learning_rate": 1.5684182642193047e-05, + "loss": 0.1438, + "step": 2442 + }, + { + "epoch": 0.9776, + "learning_rate": 1.566118528114367e-05, + "loss": 0.248, + "step": 2444 + }, + { + "epoch": 0.9784, + "learning_rate": 1.563814377303429e-05, + "loss": 0.0297, + "step": 2446 + }, + { + "epoch": 0.9792, + "learning_rate": 1.561505829754715e-05, + "loss": 0.1958, + "step": 2448 + }, + { + "epoch": 0.98, + "learning_rate": 1.5591929034707475e-05, + "loss": 0.0327, + "step": 2450 + }, + { + "epoch": 0.9808, + "learning_rate": 1.5568756164881874e-05, + "loss": 0.084, + "step": 2452 + }, + { + "epoch": 0.9816, + "learning_rate": 1.5545539868777085e-05, + "loss": 0.0017, + "step": 2454 + }, + { + "epoch": 0.9824, + "learning_rate": 1.5522280327438384e-05, + "loss": 0.0033, + "step": 2456 + }, + { + "epoch": 0.9832, + "learning_rate": 1.5498977722248398e-05, + "loss": 0.0753, + "step": 2458 + }, + { + "epoch": 0.984, + "learning_rate": 1.547563223492552e-05, + "loss": 0.1011, + "step": 2460 + }, + { + "epoch": 0.9848, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.1606, + "step": 2462 + }, + { + "epoch": 0.9856, + "learning_rate": 1.5428813342425194e-05, + "loss": 0.0043, + "step": 2464 + }, + { + "epoch": 0.9864, + "learning_rate": 1.5405340302350876e-05, + "loss": 0.0161, + "step": 2466 + }, + { + "epoch": 0.9872, + "learning_rate": 1.538182511034708e-05, + "loss": 0.0018, + "step": 2468 + }, + { + "epoch": 0.988, + "learning_rate": 1.535826794978996e-05, + "loss": 0.0519, + "step": 2470 + }, + { + "epoch": 0.9888, + "learning_rate": 1.5334669004383036e-05, + "loss": 0.0014, + "step": 2472 + }, + { + "epoch": 0.9896, + "learning_rate": 1.5311028458155564e-05, + "loss": 0.0853, + "step": 2474 + }, + { + "epoch": 0.9904, + "learning_rate": 1.528734649546133e-05, + "loss": 0.0207, + "step": 2476 + }, + { + "epoch": 0.9912, + "learning_rate": 1.5263623300976997e-05, + "loss": 0.013, + "step": 2478 + }, + { + "epoch": 0.992, + "learning_rate": 1.5239859059700792e-05, + "loss": 0.002, + "step": 2480 + }, + { + "epoch": 0.9928, + "learning_rate": 1.5216053956951096e-05, + "loss": 0.0041, + "step": 2482 + }, + { + "epoch": 0.9936, + "learning_rate": 1.5192208178364819e-05, + "loss": 0.0017, + "step": 2484 + }, + { + "epoch": 0.9944, + "learning_rate": 1.5168321909896176e-05, + "loss": 0.0465, + "step": 2486 + }, + { + "epoch": 0.9952, + "learning_rate": 1.5144395337815057e-05, + "loss": 0.0026, + "step": 2488 + }, + { + "epoch": 0.996, + "learning_rate": 1.5120428648705722e-05, + "loss": 0.4636, + "step": 2490 + }, + { + "epoch": 0.9968, + "learning_rate": 1.5096422029465171e-05, + "loss": 0.2338, + "step": 2492 + }, + { + "epoch": 0.9976, + "learning_rate": 1.5072375667301904e-05, + "loss": 0.0026, + "step": 2494 + }, + { + "epoch": 0.9984, + "learning_rate": 1.5048289749734231e-05, + "loss": 0.0015, + "step": 2496 + }, + { + "epoch": 0.9992, + "learning_rate": 1.502416446458898e-05, + "loss": 0.2254, + "step": 2498 + }, + { + "epoch": 1.0, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.0009, + "step": 2500 + }, + { + "epoch": 1.0, + "step": 2500, + "total_flos": 1.1095339016650752e+16, + "train_loss": 0.14856628477514022, + "train_runtime": 12512.2958, + "train_samples_per_second": 3.197, + "train_steps_per_second": 0.2 + } + ], + "logging_steps": 2, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 1.1095339016650752e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..58477712887115aafec1c46fbcd552d1b478eab9 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23e3aa6ba728f20117c11a282ee63f55f9ff4dbbcb82c987e3ed9d378d92f896 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..09f20625ce98ebea41fa283bf098c8ef010edb54 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbc06e1fc98f41656fccb78d839ef3e94621728f5e097a6745738e186f8109b1 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..47e557ee0062c8d4e608e316223c4f02ffc25dc9 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89034dd8c7b24f83108cd4522fe63ff5f056122818c218027b30ee3593938d75 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..67220ffbea50b09920177395d0a2996e714fce47 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_25_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffa0fcfe130463ff30bc3ce9cf1d5dce42cf9255c165c67b2a1e2f98ac62c1ab +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fce954bb5ea8e5198215b277e56e69722e9b8c27 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,15032 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004, + "learning_rate": 2.3485380412522497e-06, + "loss": 0.4138, + "step": 2 + }, + { + "epoch": 0.0008, + "learning_rate": 2.357535430610912e-06, + "loss": 0.0009, + "step": 4 + }, + { + "epoch": 0.0012, + "learning_rate": 2.366547719345306e-06, + "loss": 0.7065, + "step": 6 + }, + { + "epoch": 0.0016, + "learning_rate": 2.3755748898855234e-06, + "loss": 0.4059, + "step": 8 + }, + { + "epoch": 0.002, + "learning_rate": 2.3846169246326332e-06, + "loss": 0.2793, + "step": 10 + }, + { + "epoch": 0.0024, + "learning_rate": 2.3936738059587174e-06, + "loss": 0.1857, + "step": 12 + }, + { + "epoch": 0.0028, + "learning_rate": 2.4027455162069537e-06, + "loss": 0.6085, + "step": 14 + }, + { + "epoch": 0.0032, + "learning_rate": 2.411832037691545e-06, + "loss": 0.2253, + "step": 16 + }, + { + "epoch": 0.0036, + "learning_rate": 2.420933352697865e-06, + "loss": 0.0819, + "step": 18 + }, + { + "epoch": 0.004, + "learning_rate": 2.430049443482434e-06, + "loss": 0.0082, + "step": 20 + }, + { + "epoch": 0.0044, + "learning_rate": 2.439180292272967e-06, + "loss": 0.1312, + "step": 22 + }, + { + "epoch": 0.0048, + "learning_rate": 2.448325881268406e-06, + "loss": 0.1015, + "step": 24 + }, + { + "epoch": 0.0052, + "learning_rate": 2.457486192638958e-06, + "loss": 0.0017, + "step": 26 + }, + { + "epoch": 0.0056, + "learning_rate": 2.4666612085261277e-06, + "loss": 0.3433, + "step": 28 + }, + { + "epoch": 0.006, + "learning_rate": 2.475850911042752e-06, + "loss": 0.1802, + "step": 30 + }, + { + "epoch": 0.0064, + "learning_rate": 2.4850552822730346e-06, + "loss": 0.054, + "step": 32 + }, + { + "epoch": 0.0068, + "learning_rate": 2.4942743042725836e-06, + "loss": 0.0344, + "step": 34 + }, + { + "epoch": 0.0072, + "learning_rate": 2.503507959068455e-06, + "loss": 0.0761, + "step": 36 + }, + { + "epoch": 0.0076, + "learning_rate": 2.5127562286591313e-06, + "loss": 0.2786, + "step": 38 + }, + { + "epoch": 0.008, + "learning_rate": 2.522019095014686e-06, + "loss": 0.0028, + "step": 40 + }, + { + "epoch": 0.0084, + "learning_rate": 2.531296540076638e-06, + "loss": 0.0066, + "step": 42 + }, + { + "epoch": 0.0088, + "learning_rate": 2.5405885457581814e-06, + "loss": 0.0421, + "step": 44 + }, + { + "epoch": 0.0092, + "learning_rate": 2.5498950939440413e-06, + "loss": 0.0042, + "step": 46 + }, + { + "epoch": 0.0096, + "learning_rate": 2.5592161664906243e-06, + "loss": 0.0041, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 2.5685517452260587e-06, + "loss": 0.0745, + "step": 50 + }, + { + "epoch": 0.0104, + "learning_rate": 2.5779018119501086e-06, + "loss": 0.0271, + "step": 52 + }, + { + "epoch": 0.0108, + "learning_rate": 2.5872663484343887e-06, + "loss": 0.0788, + "step": 54 + }, + { + "epoch": 0.0112, + "learning_rate": 2.596645336422219e-06, + "loss": 0.2829, + "step": 56 + }, + { + "epoch": 0.0116, + "learning_rate": 2.606038757628795e-06, + "loss": 0.0171, + "step": 58 + }, + { + "epoch": 0.012, + "learning_rate": 2.615446593741161e-06, + "loss": 0.1002, + "step": 60 + }, + { + "epoch": 0.0124, + "learning_rate": 2.6248688264182588e-06, + "loss": 0.0086, + "step": 62 + }, + { + "epoch": 0.0128, + "learning_rate": 2.6343054372909648e-06, + "loss": 0.3297, + "step": 64 + }, + { + "epoch": 0.0132, + "learning_rate": 2.6437564079621235e-06, + "loss": 0.2952, + "step": 66 + }, + { + "epoch": 0.0136, + "learning_rate": 2.6532217200065826e-06, + "loss": 0.7451, + "step": 68 + }, + { + "epoch": 0.014, + "learning_rate": 2.662701354971232e-06, + "loss": 0.9484, + "step": 70 + }, + { + "epoch": 0.0144, + "learning_rate": 2.6721952943750396e-06, + "loss": 0.9914, + "step": 72 + }, + { + "epoch": 0.0148, + "learning_rate": 2.6817035197090825e-06, + "loss": 0.1821, + "step": 74 + }, + { + "epoch": 0.0152, + "learning_rate": 2.691226012436604e-06, + "loss": 0.024, + "step": 76 + }, + { + "epoch": 0.0156, + "learning_rate": 2.7007627539929783e-06, + "loss": 0.0226, + "step": 78 + }, + { + "epoch": 0.016, + "learning_rate": 2.7103137257858893e-06, + "loss": 1.0241, + "step": 80 + }, + { + "epoch": 0.0164, + "learning_rate": 2.7198789091951806e-06, + "loss": 0.0184, + "step": 82 + }, + { + "epoch": 0.0168, + "learning_rate": 2.7294582855730733e-06, + "loss": 0.3889, + "step": 84 + }, + { + "epoch": 0.0172, + "learning_rate": 2.7390518362440843e-06, + "loss": 0.4014, + "step": 86 + }, + { + "epoch": 0.0176, + "learning_rate": 2.7486595425050566e-06, + "loss": 0.7048, + "step": 88 + }, + { + "epoch": 0.018, + "learning_rate": 2.7582813856253264e-06, + "loss": 0.0375, + "step": 90 + }, + { + "epoch": 0.0184, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.0076, + "step": 92 + }, + { + "epoch": 0.0188, + "learning_rate": 2.777567407383033e-06, + "loss": 0.2864, + "step": 94 + }, + { + "epoch": 0.0192, + "learning_rate": 2.7872315484213954e-06, + "loss": 0.3307, + "step": 96 + }, + { + "epoch": 0.0196, + "learning_rate": 2.796909751120931e-06, + "loss": 0.4264, + "step": 98 + }, + { + "epoch": 0.02, + "learning_rate": 2.8066019966134873e-06, + "loss": 0.1978, + "step": 100 + }, + { + "epoch": 0.0204, + "learning_rate": 2.816308266003538e-06, + "loss": 0.034, + "step": 102 + }, + { + "epoch": 0.0208, + "learning_rate": 2.826028540368212e-06, + "loss": 0.0004, + "step": 104 + }, + { + "epoch": 0.0212, + "learning_rate": 2.835762800757338e-06, + "loss": 0.0475, + "step": 106 + }, + { + "epoch": 0.0216, + "learning_rate": 2.845511028193477e-06, + "loss": 0.005, + "step": 108 + }, + { + "epoch": 0.022, + "learning_rate": 2.855273203671962e-06, + "loss": 0.0544, + "step": 110 + }, + { + "epoch": 0.0224, + "learning_rate": 2.865049308160931e-06, + "loss": 0.2841, + "step": 112 + }, + { + "epoch": 0.0228, + "learning_rate": 2.874839322601368e-06, + "loss": 0.1683, + "step": 114 + }, + { + "epoch": 0.0232, + "learning_rate": 2.8846432279071533e-06, + "loss": 0.001, + "step": 116 + }, + { + "epoch": 0.0236, + "learning_rate": 2.8944610049650314e-06, + "loss": 0.245, + "step": 118 + }, + { + "epoch": 0.024, + "learning_rate": 2.9042926346347835e-06, + "loss": 0.2443, + "step": 120 + }, + { + "epoch": 0.0244, + "learning_rate": 2.914138097749143e-06, + "loss": 0.0453, + "step": 122 + }, + { + "epoch": 0.0248, + "learning_rate": 2.9239973751138397e-06, + "loss": 0.1086, + "step": 124 + }, + { + "epoch": 0.0252, + "learning_rate": 2.933870447507756e-06, + "loss": 0.0546, + "step": 126 + }, + { + "epoch": 0.0256, + "learning_rate": 2.943757295682783e-06, + "loss": 0.1546, + "step": 128 + }, + { + "epoch": 0.026, + "learning_rate": 2.953657900364055e-06, + "loss": 0.5038, + "step": 130 + }, + { + "epoch": 0.0264, + "learning_rate": 2.9635722422497983e-06, + "loss": 0.0407, + "step": 132 + }, + { + "epoch": 0.0268, + "learning_rate": 2.973500302011496e-06, + "loss": 0.0056, + "step": 134 + }, + { + "epoch": 0.0272, + "learning_rate": 2.983442060293926e-06, + "loss": 0.0295, + "step": 136 + }, + { + "epoch": 0.0276, + "learning_rate": 2.9933974977150827e-06, + "loss": 0.2821, + "step": 138 + }, + { + "epoch": 0.028, + "learning_rate": 3.003366594866345e-06, + "loss": 0.3919, + "step": 140 + }, + { + "epoch": 0.0284, + "learning_rate": 3.0133493323124474e-06, + "loss": 0.0999, + "step": 142 + }, + { + "epoch": 0.0288, + "learning_rate": 3.0233456905915338e-06, + "loss": 0.153, + "step": 144 + }, + { + "epoch": 0.0292, + "learning_rate": 3.0333556502151895e-06, + "loss": 0.1221, + "step": 146 + }, + { + "epoch": 0.0296, + "learning_rate": 3.0433791916684885e-06, + "loss": 0.0218, + "step": 148 + }, + { + "epoch": 0.03, + "learning_rate": 3.0534162954100234e-06, + "loss": 0.183, + "step": 150 + }, + { + "epoch": 0.0304, + "learning_rate": 3.0634669418719453e-06, + "loss": 0.6585, + "step": 152 + }, + { + "epoch": 0.0308, + "learning_rate": 3.0735311114600064e-06, + "loss": 0.2476, + "step": 154 + }, + { + "epoch": 0.0312, + "learning_rate": 3.0836087845535933e-06, + "loss": 0.0048, + "step": 156 + }, + { + "epoch": 0.0316, + "learning_rate": 3.0936999415057645e-06, + "loss": 0.0113, + "step": 158 + }, + { + "epoch": 0.032, + "learning_rate": 3.1038045626432945e-06, + "loss": 0.0005, + "step": 160 + }, + { + "epoch": 0.0324, + "learning_rate": 3.1139226282667212e-06, + "loss": 0.0008, + "step": 162 + }, + { + "epoch": 0.0328, + "learning_rate": 3.1240541186503173e-06, + "loss": 0.2029, + "step": 164 + }, + { + "epoch": 0.0332, + "learning_rate": 3.134199014042277e-06, + "loss": 0.2696, + "step": 166 + }, + { + "epoch": 0.0336, + "learning_rate": 3.1443572946645683e-06, + "loss": 0.0033, + "step": 168 + }, + { + "epoch": 0.034, + "learning_rate": 3.154528940713103e-06, + "loss": 0.0913, + "step": 170 + }, + { + "epoch": 0.0344, + "learning_rate": 3.164713932357776e-06, + "loss": 0.044, + "step": 172 + }, + { + "epoch": 0.0348, + "learning_rate": 3.1749122497423724e-06, + "loss": 0.7367, + "step": 174 + }, + { + "epoch": 0.0352, + "learning_rate": 3.1851238729848033e-06, + "loss": 0.0246, + "step": 176 + }, + { + "epoch": 0.0356, + "learning_rate": 3.195348782176948e-06, + "loss": 0.2839, + "step": 178 + }, + { + "epoch": 0.036, + "learning_rate": 3.205586957384834e-06, + "loss": 0.1741, + "step": 180 + }, + { + "epoch": 0.0364, + "learning_rate": 3.215838378648617e-06, + "loss": 0.0604, + "step": 182 + }, + { + "epoch": 0.0368, + "learning_rate": 3.2261030259826253e-06, + "loss": 0.1237, + "step": 184 + }, + { + "epoch": 0.0372, + "learning_rate": 3.2363808793754036e-06, + "loss": 0.2438, + "step": 186 + }, + { + "epoch": 0.0376, + "learning_rate": 3.246671918789752e-06, + "loss": 0.1712, + "step": 188 + }, + { + "epoch": 0.038, + "learning_rate": 3.2569761241627617e-06, + "loss": 0.4667, + "step": 190 + }, + { + "epoch": 0.0384, + "learning_rate": 3.267293475405858e-06, + "loss": 0.0022, + "step": 192 + }, + { + "epoch": 0.0388, + "learning_rate": 3.277623952404835e-06, + "loss": 0.0042, + "step": 194 + }, + { + "epoch": 0.0392, + "learning_rate": 3.2879675350199004e-06, + "loss": 0.2523, + "step": 196 + }, + { + "epoch": 0.0396, + "learning_rate": 3.298324203085723e-06, + "loss": 0.073, + "step": 198 + }, + { + "epoch": 0.04, + "learning_rate": 3.3086939364114113e-06, + "loss": 0.0064, + "step": 200 + }, + { + "epoch": 0.0404, + "learning_rate": 3.3190767147806892e-06, + "loss": 0.0024, + "step": 202 + }, + { + "epoch": 0.0408, + "learning_rate": 3.329472517951747e-06, + "loss": 0.0373, + "step": 204 + }, + { + "epoch": 0.0412, + "learning_rate": 3.3398813256574745e-06, + "loss": 0.0005, + "step": 206 + }, + { + "epoch": 0.0416, + "learning_rate": 3.350303117605369e-06, + "loss": 0.0007, + "step": 208 + }, + { + "epoch": 0.042, + "learning_rate": 3.360737873477574e-06, + "loss": 0.0458, + "step": 210 + }, + { + "epoch": 0.0424, + "learning_rate": 3.3711855729310503e-06, + "loss": 0.0575, + "step": 212 + }, + { + "epoch": 0.0428, + "learning_rate": 3.3816461955974224e-06, + "loss": 0.3038, + "step": 214 + }, + { + "epoch": 0.0432, + "learning_rate": 3.3921197210832235e-06, + "loss": 0.1678, + "step": 216 + }, + { + "epoch": 0.0436, + "learning_rate": 3.4026061289697397e-06, + "loss": 0.8024, + "step": 218 + }, + { + "epoch": 0.044, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.0355, + "step": 220 + }, + { + "epoch": 0.0444, + "learning_rate": 3.4236175101447257e-06, + "loss": 0.2177, + "step": 222 + }, + { + "epoch": 0.0448, + "learning_rate": 3.434142442470434e-06, + "loss": 0.285, + "step": 224 + }, + { + "epoch": 0.0452, + "learning_rate": 3.444680175271424e-06, + "loss": 0.01, + "step": 226 + }, + { + "epoch": 0.0456, + "learning_rate": 3.455230688003849e-06, + "loss": 0.176, + "step": 228 + }, + { + "epoch": 0.046, + "learning_rate": 3.465793960098942e-06, + "loss": 0.2003, + "step": 230 + }, + { + "epoch": 0.0464, + "learning_rate": 3.476369970963065e-06, + "loss": 0.5468, + "step": 232 + }, + { + "epoch": 0.0468, + "learning_rate": 3.486958699977743e-06, + "loss": 0.0087, + "step": 234 + }, + { + "epoch": 0.0472, + "learning_rate": 3.497560126499706e-06, + "loss": 0.0095, + "step": 236 + }, + { + "epoch": 0.0476, + "learning_rate": 3.508174229860947e-06, + "loss": 0.0013, + "step": 238 + }, + { + "epoch": 0.048, + "learning_rate": 3.5188009893686836e-06, + "loss": 0.3184, + "step": 240 + }, + { + "epoch": 0.0484, + "learning_rate": 3.5294403843055493e-06, + "loss": 0.1307, + "step": 242 + }, + { + "epoch": 0.0488, + "learning_rate": 3.5400923939294827e-06, + "loss": 0.545, + "step": 244 + }, + { + "epoch": 0.0492, + "learning_rate": 3.5507569974738477e-06, + "loss": 0.7428, + "step": 246 + }, + { + "epoch": 0.0496, + "learning_rate": 3.5614341741474667e-06, + "loss": 0.0012, + "step": 248 + }, + { + "epoch": 0.05, + "learning_rate": 3.5721239031345966e-06, + "loss": 0.0095, + "step": 250 + }, + { + "epoch": 0.0504, + "learning_rate": 3.5828261635951177e-06, + "loss": 0.0046, + "step": 252 + }, + { + "epoch": 0.0508, + "learning_rate": 3.593540934664387e-06, + "loss": 0.0372, + "step": 254 + }, + { + "epoch": 0.0512, + "learning_rate": 3.604268195453421e-06, + "loss": 0.0034, + "step": 256 + }, + { + "epoch": 0.0516, + "learning_rate": 3.6150079250488767e-06, + "loss": 0.1263, + "step": 258 + }, + { + "epoch": 0.052, + "learning_rate": 3.6257601025130893e-06, + "loss": 0.309, + "step": 260 + }, + { + "epoch": 0.0524, + "learning_rate": 3.636524706884178e-06, + "loss": 0.0124, + "step": 262 + }, + { + "epoch": 0.0528, + "learning_rate": 3.647301717175955e-06, + "loss": 0.0008, + "step": 264 + }, + { + "epoch": 0.0532, + "learning_rate": 3.6580911123781025e-06, + "loss": 0.0045, + "step": 266 + }, + { + "epoch": 0.0536, + "learning_rate": 3.66889287145614e-06, + "loss": 0.0005, + "step": 268 + }, + { + "epoch": 0.054, + "learning_rate": 3.679706973351488e-06, + "loss": 0.1996, + "step": 270 + }, + { + "epoch": 0.0544, + "learning_rate": 3.6905333969814995e-06, + "loss": 0.0024, + "step": 272 + }, + { + "epoch": 0.0548, + "learning_rate": 3.701372121239508e-06, + "loss": 0.1811, + "step": 274 + }, + { + "epoch": 0.0552, + "learning_rate": 3.712223124994867e-06, + "loss": 0.2194, + "step": 276 + }, + { + "epoch": 0.0556, + "learning_rate": 3.723086387092989e-06, + "loss": 0.3104, + "step": 278 + }, + { + "epoch": 0.056, + "learning_rate": 3.7339618863553885e-06, + "loss": 0.0028, + "step": 280 + }, + { + "epoch": 0.0564, + "learning_rate": 3.744849601579722e-06, + "loss": 0.0292, + "step": 282 + }, + { + "epoch": 0.0568, + "learning_rate": 3.755749511539848e-06, + "loss": 0.4843, + "step": 284 + }, + { + "epoch": 0.0572, + "learning_rate": 3.7666615949857897e-06, + "loss": 0.0005, + "step": 286 + }, + { + "epoch": 0.0576, + "learning_rate": 3.7775858306439404e-06, + "loss": 0.1602, + "step": 288 + }, + { + "epoch": 0.058, + "learning_rate": 3.7885221972168864e-06, + "loss": 0.4085, + "step": 290 + }, + { + "epoch": 0.0584, + "learning_rate": 3.799470673383677e-06, + "loss": 0.0457, + "step": 292 + }, + { + "epoch": 0.0588, + "learning_rate": 3.810431237799657e-06, + "loss": 0.1898, + "step": 294 + }, + { + "epoch": 0.0592, + "learning_rate": 3.821403869096644e-06, + "loss": 0.0856, + "step": 296 + }, + { + "epoch": 0.0596, + "learning_rate": 3.8323885458829745e-06, + "loss": 0.2074, + "step": 298 + }, + { + "epoch": 0.06, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.0109, + "step": 300 + }, + { + "epoch": 0.0604, + "learning_rate": 3.854393950239355e-06, + "loss": 0.0007, + "step": 302 + }, + { + "epoch": 0.0608, + "learning_rate": 3.865414634908756e-06, + "loss": 0.1477, + "step": 304 + }, + { + "epoch": 0.0612, + "learning_rate": 3.876447279266233e-06, + "loss": 0.3872, + "step": 306 + }, + { + "epoch": 0.0616, + "learning_rate": 3.887491861803081e-06, + "loss": 0.1357, + "step": 308 + }, + { + "epoch": 0.062, + "learning_rate": 3.898548360987321e-06, + "loss": 0.6838, + "step": 310 + }, + { + "epoch": 0.0624, + "learning_rate": 3.909616755263741e-06, + "loss": 0.1717, + "step": 312 + }, + { + "epoch": 0.0628, + "learning_rate": 3.920697023053941e-06, + "loss": 0.0036, + "step": 314 + }, + { + "epoch": 0.0632, + "learning_rate": 3.9317891427563725e-06, + "loss": 0.075, + "step": 316 + }, + { + "epoch": 0.0636, + "learning_rate": 3.942893092746381e-06, + "loss": 0.1567, + "step": 318 + }, + { + "epoch": 0.064, + "learning_rate": 3.954008851376244e-06, + "loss": 0.1251, + "step": 320 + }, + { + "epoch": 0.0644, + "learning_rate": 3.965136396975227e-06, + "loss": 0.3452, + "step": 322 + }, + { + "epoch": 0.0648, + "learning_rate": 3.976275707849619e-06, + "loss": 0.4217, + "step": 324 + }, + { + "epoch": 0.0652, + "learning_rate": 3.987426762282726e-06, + "loss": 0.0614, + "step": 326 + }, + { + "epoch": 0.0656, + "learning_rate": 3.99858953853505e-06, + "loss": 0.777, + "step": 328 + }, + { + "epoch": 0.066, + "learning_rate": 4.009764014844146e-06, + "loss": 0.0018, + "step": 330 + }, + { + "epoch": 0.0664, + "learning_rate": 4.0209501694248e-06, + "loss": 0.0002, + "step": 332 + }, + { + "epoch": 0.0668, + "learning_rate": 4.032147980469076e-06, + "loss": 0.229, + "step": 334 + }, + { + "epoch": 0.0672, + "learning_rate": 4.043357426146209e-06, + "loss": 0.0272, + "step": 336 + }, + { + "epoch": 0.0676, + "learning_rate": 4.054578484602869e-06, + "loss": 0.0091, + "step": 338 + }, + { + "epoch": 0.068, + "learning_rate": 4.065811133962987e-06, + "loss": 0.049, + "step": 340 + }, + { + "epoch": 0.0684, + "learning_rate": 4.07705535232795e-06, + "loss": 0.3538, + "step": 342 + }, + { + "epoch": 0.0688, + "learning_rate": 4.08831111777658e-06, + "loss": 0.0004, + "step": 344 + }, + { + "epoch": 0.0692, + "learning_rate": 4.0995784083651865e-06, + "loss": 0.0027, + "step": 346 + }, + { + "epoch": 0.0696, + "learning_rate": 4.110857202127611e-06, + "loss": 0.0103, + "step": 348 + }, + { + "epoch": 0.07, + "learning_rate": 4.122147477075266e-06, + "loss": 0.0011, + "step": 350 + }, + { + "epoch": 0.0704, + "learning_rate": 4.133449211197183e-06, + "loss": 0.1724, + "step": 352 + }, + { + "epoch": 0.0708, + "learning_rate": 4.144762382460055e-06, + "loss": 0.2228, + "step": 354 + }, + { + "epoch": 0.0712, + "learning_rate": 4.156086968808274e-06, + "loss": 0.042, + "step": 356 + }, + { + "epoch": 0.0716, + "learning_rate": 4.1674229481639796e-06, + "loss": 0.2434, + "step": 358 + }, + { + "epoch": 0.072, + "learning_rate": 4.178770298427114e-06, + "loss": 0.0957, + "step": 360 + }, + { + "epoch": 0.0724, + "learning_rate": 4.190128997475395e-06, + "loss": 0.4496, + "step": 362 + }, + { + "epoch": 0.0728, + "learning_rate": 4.201499023164515e-06, + "loss": 0.3847, + "step": 364 + }, + { + "epoch": 0.0732, + "learning_rate": 4.212880353327968e-06, + "loss": 0.5516, + "step": 366 + }, + { + "epoch": 0.0736, + "learning_rate": 4.224272965777315e-06, + "loss": 0.1318, + "step": 368 + }, + { + "epoch": 0.074, + "learning_rate": 4.235676838302072e-06, + "loss": 0.003, + "step": 370 + }, + { + "epoch": 0.0744, + "learning_rate": 4.247091948669764e-06, + "loss": 0.403, + "step": 372 + }, + { + "epoch": 0.0748, + "learning_rate": 4.258518274626106e-06, + "loss": 0.0408, + "step": 374 + }, + { + "epoch": 0.0752, + "learning_rate": 4.269955793894849e-06, + "loss": 0.0282, + "step": 376 + }, + { + "epoch": 0.0756, + "learning_rate": 4.281404484177978e-06, + "loss": 0.0029, + "step": 378 + }, + { + "epoch": 0.076, + "learning_rate": 4.292864323155684e-06, + "loss": 0.3571, + "step": 380 + }, + { + "epoch": 0.0764, + "learning_rate": 4.304335288486412e-06, + "loss": 0.2027, + "step": 382 + }, + { + "epoch": 0.0768, + "learning_rate": 4.3158173578069696e-06, + "loss": 0.016, + "step": 384 + }, + { + "epoch": 0.0772, + "learning_rate": 4.327310508732434e-06, + "loss": 0.3407, + "step": 386 + }, + { + "epoch": 0.0776, + "learning_rate": 4.338814718856333e-06, + "loss": 0.0691, + "step": 388 + }, + { + "epoch": 0.078, + "learning_rate": 4.350329965750618e-06, + "loss": 0.0014, + "step": 390 + }, + { + "epoch": 0.0784, + "learning_rate": 4.3618562269657285e-06, + "loss": 0.2911, + "step": 392 + }, + { + "epoch": 0.0788, + "learning_rate": 4.373393480030629e-06, + "loss": 0.0091, + "step": 394 + }, + { + "epoch": 0.0792, + "learning_rate": 4.384941702452852e-06, + "loss": 0.3078, + "step": 396 + }, + { + "epoch": 0.0796, + "learning_rate": 4.396500871718548e-06, + "loss": 0.5986, + "step": 398 + }, + { + "epoch": 0.08, + "learning_rate": 4.408070965292526e-06, + "loss": 0.9499, + "step": 400 + }, + { + "epoch": 0.0804, + "learning_rate": 4.419651960618294e-06, + "loss": 0.0654, + "step": 402 + }, + { + "epoch": 0.0808, + "learning_rate": 4.431243835118112e-06, + "loss": 0.0021, + "step": 404 + }, + { + "epoch": 0.0812, + "learning_rate": 4.442846566193041e-06, + "loss": 0.0004, + "step": 406 + }, + { + "epoch": 0.0816, + "learning_rate": 4.4544601312229185e-06, + "loss": 0.2285, + "step": 408 + }, + { + "epoch": 0.082, + "learning_rate": 4.4660845075665635e-06, + "loss": 0.1322, + "step": 410 + }, + { + "epoch": 0.0824, + "learning_rate": 4.477719672561602e-06, + "loss": 0.0039, + "step": 412 + }, + { + "epoch": 0.0828, + "learning_rate": 4.489365603524743e-06, + "loss": 0.001, + "step": 414 + }, + { + "epoch": 0.0832, + "learning_rate": 4.501022277751605e-06, + "loss": 0.0457, + "step": 416 + }, + { + "epoch": 0.0836, + "learning_rate": 4.5126896725169025e-06, + "loss": 0.3251, + "step": 418 + }, + { + "epoch": 0.084, + "learning_rate": 4.524367765074499e-06, + "loss": 0.0131, + "step": 420 + }, + { + "epoch": 0.0844, + "learning_rate": 4.536056532657295e-06, + "loss": 0.0311, + "step": 422 + }, + { + "epoch": 0.0848, + "learning_rate": 4.5477559524775e-06, + "loss": 0.0297, + "step": 424 + }, + { + "epoch": 0.0852, + "learning_rate": 4.559466001726451e-06, + "loss": 0.0112, + "step": 426 + }, + { + "epoch": 0.0856, + "learning_rate": 4.571186657574823e-06, + "loss": 0.0023, + "step": 428 + }, + { + "epoch": 0.086, + "learning_rate": 4.582917897172599e-06, + "loss": 0.4661, + "step": 430 + }, + { + "epoch": 0.0864, + "learning_rate": 4.5946596976491254e-06, + "loss": 0.2911, + "step": 432 + }, + { + "epoch": 0.0868, + "learning_rate": 4.6064120361131624e-06, + "loss": 0.1303, + "step": 434 + }, + { + "epoch": 0.0872, + "learning_rate": 4.618174889652924e-06, + "loss": 0.201, + "step": 436 + }, + { + "epoch": 0.0876, + "learning_rate": 4.629948235336126e-06, + "loss": 0.8038, + "step": 438 + }, + { + "epoch": 0.088, + "learning_rate": 4.6417320502100286e-06, + "loss": 0.3002, + "step": 440 + }, + { + "epoch": 0.0884, + "learning_rate": 4.653526311301479e-06, + "loss": 0.0017, + "step": 442 + }, + { + "epoch": 0.0888, + "learning_rate": 4.665330995616967e-06, + "loss": 0.0393, + "step": 444 + }, + { + "epoch": 0.0892, + "learning_rate": 4.677146080142667e-06, + "loss": 0.1937, + "step": 446 + }, + { + "epoch": 0.0896, + "learning_rate": 4.688971541844424e-06, + "loss": 0.0474, + "step": 448 + }, + { + "epoch": 0.09, + "learning_rate": 4.700807357667956e-06, + "loss": 0.0132, + "step": 450 + }, + { + "epoch": 0.0904, + "learning_rate": 4.712653504538672e-06, + "loss": 0.0783, + "step": 452 + }, + { + "epoch": 0.0908, + "learning_rate": 4.7245099593619495e-06, + "loss": 0.0359, + "step": 454 + }, + { + "epoch": 0.0912, + "learning_rate": 4.736376699023023e-06, + "loss": 0.3504, + "step": 456 + }, + { + "epoch": 0.0916, + "learning_rate": 4.74825370038703e-06, + "loss": 0.0008, + "step": 458 + }, + { + "epoch": 0.092, + "learning_rate": 4.76014094029921e-06, + "loss": 0.068, + "step": 460 + }, + { + "epoch": 0.0924, + "learning_rate": 4.772038395584735e-06, + "loss": 0.1723, + "step": 462 + }, + { + "epoch": 0.0928, + "learning_rate": 4.7839460430489216e-06, + "loss": 0.1753, + "step": 464 + }, + { + "epoch": 0.0932, + "learning_rate": 4.7958638594772035e-06, + "loss": 0.0032, + "step": 466 + }, + { + "epoch": 0.0936, + "learning_rate": 4.807791821635185e-06, + "loss": 0.1708, + "step": 468 + }, + { + "epoch": 0.094, + "learning_rate": 4.8197299062686954e-06, + "loss": 0.0218, + "step": 470 + }, + { + "epoch": 0.0944, + "learning_rate": 4.831678090103828e-06, + "loss": 0.0025, + "step": 472 + }, + { + "epoch": 0.0948, + "learning_rate": 4.8436363498469865e-06, + "loss": 0.126, + "step": 474 + }, + { + "epoch": 0.0952, + "learning_rate": 4.855604662184931e-06, + "loss": 0.0003, + "step": 476 + }, + { + "epoch": 0.0956, + "learning_rate": 4.867583003784825e-06, + "loss": 0.1255, + "step": 478 + }, + { + "epoch": 0.096, + "learning_rate": 4.8795713512942785e-06, + "loss": 0.2226, + "step": 480 + }, + { + "epoch": 0.0964, + "learning_rate": 4.891569681341395e-06, + "loss": 0.2277, + "step": 482 + }, + { + "epoch": 0.0968, + "learning_rate": 4.903577970534815e-06, + "loss": 0.658, + "step": 484 + }, + { + "epoch": 0.0972, + "learning_rate": 4.91559619546378e-06, + "loss": 0.3865, + "step": 486 + }, + { + "epoch": 0.0976, + "learning_rate": 4.9276243326981e-06, + "loss": 0.0016, + "step": 488 + }, + { + "epoch": 0.098, + "learning_rate": 4.939662358788352e-06, + "loss": 0.0951, + "step": 490 + }, + { + "epoch": 0.0984, + "learning_rate": 4.951710250265788e-06, + "loss": 0.0086, + "step": 492 + }, + { + "epoch": 0.0988, + "learning_rate": 4.96376798364238e-06, + "loss": 0.0148, + "step": 494 + }, + { + "epoch": 0.0992, + "learning_rate": 4.975835535411023e-06, + "loss": 0.3558, + "step": 496 + }, + { + "epoch": 0.0996, + "learning_rate": 4.987912882045345e-06, + "loss": 0.0636, + "step": 498 + }, + { + "epoch": 0.1, + "learning_rate": 5.000000000000003e-06, + "loss": 0.059, + "step": 500 + }, + { + "epoch": 0.1004, + "learning_rate": 5.012096865710493e-06, + "loss": 0.3538, + "step": 502 + }, + { + "epoch": 0.1008, + "learning_rate": 5.024203455593375e-06, + "loss": 0.0046, + "step": 504 + }, + { + "epoch": 0.1012, + "learning_rate": 5.036319746046232e-06, + "loss": 0.2433, + "step": 506 + }, + { + "epoch": 0.1016, + "learning_rate": 5.048445713447734e-06, + "loss": 0.6191, + "step": 508 + }, + { + "epoch": 0.102, + "learning_rate": 5.0605813341576885e-06, + "loss": 0.2229, + "step": 510 + }, + { + "epoch": 0.1024, + "learning_rate": 5.072726584517083e-06, + "loss": 0.0449, + "step": 512 + }, + { + "epoch": 0.1028, + "learning_rate": 5.084881440848126e-06, + "loss": 0.2145, + "step": 514 + }, + { + "epoch": 0.1032, + "learning_rate": 5.097045879454308e-06, + "loss": 0.0466, + "step": 516 + }, + { + "epoch": 0.1036, + "learning_rate": 5.109219876620433e-06, + "loss": 0.1997, + "step": 518 + }, + { + "epoch": 0.104, + "learning_rate": 5.1214034086126685e-06, + "loss": 0.2181, + "step": 520 + }, + { + "epoch": 0.1044, + "learning_rate": 5.133596451678611e-06, + "loss": 0.017, + "step": 522 + }, + { + "epoch": 0.1048, + "learning_rate": 5.145798982047253e-06, + "loss": 0.0003, + "step": 524 + }, + { + "epoch": 0.1052, + "learning_rate": 5.158010975929185e-06, + "loss": 0.3547, + "step": 526 + }, + { + "epoch": 0.1056, + "learning_rate": 5.170232409516483e-06, + "loss": 0.1426, + "step": 528 + }, + { + "epoch": 0.106, + "learning_rate": 5.182463258982837e-06, + "loss": 0.092, + "step": 530 + }, + { + "epoch": 0.1064, + "learning_rate": 5.194703500483597e-06, + "loss": 0.0163, + "step": 532 + }, + { + "epoch": 0.1068, + "learning_rate": 5.2069531101557395e-06, + "loss": 0.3296, + "step": 534 + }, + { + "epoch": 0.1072, + "learning_rate": 5.219212064118082e-06, + "loss": 0.0008, + "step": 536 + }, + { + "epoch": 0.1076, + "learning_rate": 5.231480338471124e-06, + "loss": 0.85, + "step": 538 + }, + { + "epoch": 0.108, + "learning_rate": 5.24375790929725e-06, + "loss": 0.0034, + "step": 540 + }, + { + "epoch": 0.1084, + "learning_rate": 5.256044752660709e-06, + "loss": 0.0328, + "step": 542 + }, + { + "epoch": 0.1088, + "learning_rate": 5.268340844607653e-06, + "loss": 0.0251, + "step": 544 + }, + { + "epoch": 0.1092, + "learning_rate": 5.2806461611662725e-06, + "loss": 0.0034, + "step": 546 + }, + { + "epoch": 0.1096, + "learning_rate": 5.2929606783466735e-06, + "loss": 0.2658, + "step": 548 + }, + { + "epoch": 0.11, + "learning_rate": 5.305284372141091e-06, + "loss": 0.0057, + "step": 550 + }, + { + "epoch": 0.1104, + "learning_rate": 5.317617218523853e-06, + "loss": 0.0671, + "step": 552 + }, + { + "epoch": 0.1108, + "learning_rate": 5.3299591934514435e-06, + "loss": 0.0037, + "step": 554 + }, + { + "epoch": 0.1112, + "learning_rate": 5.342310272862553e-06, + "loss": 0.2567, + "step": 556 + }, + { + "epoch": 0.1116, + "learning_rate": 5.354670432678119e-06, + "loss": 0.0672, + "step": 558 + }, + { + "epoch": 0.112, + "learning_rate": 5.367039648801377e-06, + "loss": 0.0011, + "step": 560 + }, + { + "epoch": 0.1124, + "learning_rate": 5.379417897117909e-06, + "loss": 0.0993, + "step": 562 + }, + { + "epoch": 0.1128, + "learning_rate": 5.391805153495684e-06, + "loss": 0.0027, + "step": 564 + }, + { + "epoch": 0.1132, + "learning_rate": 5.404201393785113e-06, + "loss": 0.0549, + "step": 566 + }, + { + "epoch": 0.1136, + "learning_rate": 5.416606593819109e-06, + "loss": 0.0233, + "step": 568 + }, + { + "epoch": 0.114, + "learning_rate": 5.429020729413049e-06, + "loss": 0.2336, + "step": 570 + }, + { + "epoch": 0.1144, + "learning_rate": 5.441443776365005e-06, + "loss": 0.1941, + "step": 572 + }, + { + "epoch": 0.1148, + "learning_rate": 5.453875710455549e-06, + "loss": 0.0985, + "step": 574 + }, + { + "epoch": 0.1152, + "learning_rate": 5.466316507448053e-06, + "loss": 0.301, + "step": 576 + }, + { + "epoch": 0.1156, + "learning_rate": 5.478766143088497e-06, + "loss": 0.0009, + "step": 578 + }, + { + "epoch": 0.116, + "learning_rate": 5.49122459310568e-06, + "loss": 0.3074, + "step": 580 + }, + { + "epoch": 0.1164, + "learning_rate": 5.503691833211264e-06, + "loss": 0.5729, + "step": 582 + }, + { + "epoch": 0.1168, + "learning_rate": 5.516167839099662e-06, + "loss": 0.0785, + "step": 584 + }, + { + "epoch": 0.1172, + "learning_rate": 5.5286525864483285e-06, + "loss": 0.1387, + "step": 586 + }, + { + "epoch": 0.1176, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.5108, + "step": 588 + }, + { + "epoch": 0.118, + "learning_rate": 5.553648208150724e-06, + "loss": 0.0016, + "step": 590 + }, + { + "epoch": 0.1184, + "learning_rate": 5.5661590337742255e-06, + "loss": 0.3277, + "step": 592 + }, + { + "epoch": 0.1188, + "learning_rate": 5.57867850339757e-06, + "loss": 0.028, + "step": 594 + }, + { + "epoch": 0.1192, + "learning_rate": 5.591206592613412e-06, + "loss": 0.0431, + "step": 596 + }, + { + "epoch": 0.1196, + "learning_rate": 5.603743276997597e-06, + "loss": 0.18, + "step": 598 + }, + { + "epoch": 0.12, + "learning_rate": 5.616288532109221e-06, + "loss": 0.0535, + "step": 600 + }, + { + "epoch": 0.1204, + "learning_rate": 5.628842333490665e-06, + "loss": 0.4899, + "step": 602 + }, + { + "epoch": 0.1208, + "learning_rate": 5.641404656667652e-06, + "loss": 0.4787, + "step": 604 + }, + { + "epoch": 0.1212, + "learning_rate": 5.653975477149289e-06, + "loss": 0.3691, + "step": 606 + }, + { + "epoch": 0.1216, + "learning_rate": 5.666554770428136e-06, + "loss": 0.1477, + "step": 608 + }, + { + "epoch": 0.122, + "learning_rate": 5.679142511980168e-06, + "loss": 0.0638, + "step": 610 + }, + { + "epoch": 0.1224, + "learning_rate": 5.6917386772650015e-06, + "loss": 0.4013, + "step": 612 + }, + { + "epoch": 0.1228, + "learning_rate": 5.7043432417257076e-06, + "loss": 0.0168, + "step": 614 + }, + { + "epoch": 0.1232, + "learning_rate": 5.716956180789086e-06, + "loss": 0.1177, + "step": 616 + }, + { + "epoch": 0.1236, + "learning_rate": 5.729577469865569e-06, + "loss": 0.0494, + "step": 618 + }, + { + "epoch": 0.124, + "learning_rate": 5.74220708434926e-06, + "loss": 0.4121, + "step": 620 + }, + { + "epoch": 0.1244, + "learning_rate": 5.754844999618143e-06, + "loss": 0.0622, + "step": 622 + }, + { + "epoch": 0.1248, + "learning_rate": 5.767491191033909e-06, + "loss": 0.4892, + "step": 624 + }, + { + "epoch": 0.1252, + "learning_rate": 5.780145633942173e-06, + "loss": 0.0004, + "step": 626 + }, + { + "epoch": 0.1256, + "learning_rate": 5.7928083036724535e-06, + "loss": 0.3675, + "step": 628 + }, + { + "epoch": 0.126, + "learning_rate": 5.8054791755382125e-06, + "loss": 0.0068, + "step": 630 + }, + { + "epoch": 0.1264, + "learning_rate": 5.818158224836983e-06, + "loss": 0.0004, + "step": 632 + }, + { + "epoch": 0.1268, + "learning_rate": 5.830845426850263e-06, + "loss": 0.8001, + "step": 634 + }, + { + "epoch": 0.1272, + "learning_rate": 5.8435407568437194e-06, + "loss": 0.0048, + "step": 636 + }, + { + "epoch": 0.1276, + "learning_rate": 5.856244190067155e-06, + "loss": 0.1921, + "step": 638 + }, + { + "epoch": 0.128, + "learning_rate": 5.868955701754577e-06, + "loss": 0.8996, + "step": 640 + }, + { + "epoch": 0.1284, + "learning_rate": 5.881675267124245e-06, + "loss": 0.2708, + "step": 642 + }, + { + "epoch": 0.1288, + "learning_rate": 5.894402861378714e-06, + "loss": 0.1667, + "step": 644 + }, + { + "epoch": 0.1292, + "learning_rate": 5.907138459704886e-06, + "loss": 0.376, + "step": 646 + }, + { + "epoch": 0.1296, + "learning_rate": 5.919882037274065e-06, + "loss": 0.1152, + "step": 648 + }, + { + "epoch": 0.13, + "learning_rate": 5.932633569241989e-06, + "loss": 0.0524, + "step": 650 + }, + { + "epoch": 0.1304, + "learning_rate": 5.9453930307488985e-06, + "loss": 0.0148, + "step": 652 + }, + { + "epoch": 0.1308, + "learning_rate": 5.958160396919584e-06, + "loss": 0.0661, + "step": 654 + }, + { + "epoch": 0.1312, + "learning_rate": 5.970935642863362e-06, + "loss": 0.238, + "step": 656 + }, + { + "epoch": 0.1316, + "learning_rate": 5.983718743674305e-06, + "loss": 0.3458, + "step": 658 + }, + { + "epoch": 0.132, + "learning_rate": 5.996509674431038e-06, + "loss": 0.0016, + "step": 660 + }, + { + "epoch": 0.1324, + "learning_rate": 6.00930841019705e-06, + "loss": 0.1368, + "step": 662 + }, + { + "epoch": 0.1328, + "learning_rate": 6.022114926020505e-06, + "loss": 0.0208, + "step": 664 + }, + { + "epoch": 0.1332, + "learning_rate": 6.0349291969344426e-06, + "loss": 0.2478, + "step": 666 + }, + { + "epoch": 0.1336, + "learning_rate": 6.047751197956836e-06, + "loss": 0.0028, + "step": 668 + }, + { + "epoch": 0.134, + "learning_rate": 6.060580904090489e-06, + "loss": 0.3334, + "step": 670 + }, + { + "epoch": 0.1344, + "learning_rate": 6.0734182903232475e-06, + "loss": 0.0319, + "step": 672 + }, + { + "epoch": 0.1348, + "learning_rate": 6.086263331627974e-06, + "loss": 0.0386, + "step": 674 + }, + { + "epoch": 0.1352, + "learning_rate": 6.0991160029626e-06, + "loss": 0.2109, + "step": 676 + }, + { + "epoch": 0.1356, + "learning_rate": 6.111976279270187e-06, + "loss": 0.0005, + "step": 678 + }, + { + "epoch": 0.136, + "learning_rate": 6.124844135478966e-06, + "loss": 0.001, + "step": 680 + }, + { + "epoch": 0.1364, + "learning_rate": 6.137719546502394e-06, + "loss": 0.1349, + "step": 682 + }, + { + "epoch": 0.1368, + "learning_rate": 6.1506024872392e-06, + "loss": 0.2494, + "step": 684 + }, + { + "epoch": 0.1372, + "learning_rate": 6.163492932573429e-06, + "loss": 0.116, + "step": 686 + }, + { + "epoch": 0.1376, + "learning_rate": 6.176390857374501e-06, + "loss": 0.0008, + "step": 688 + }, + { + "epoch": 0.138, + "learning_rate": 6.189296236497251e-06, + "loss": 0.618, + "step": 690 + }, + { + "epoch": 0.1384, + "learning_rate": 6.202209044781979e-06, + "loss": 0.279, + "step": 692 + }, + { + "epoch": 0.1388, + "learning_rate": 6.215129257054525e-06, + "loss": 0.1192, + "step": 694 + }, + { + "epoch": 0.1392, + "learning_rate": 6.228056848126223e-06, + "loss": 0.0462, + "step": 696 + }, + { + "epoch": 0.1396, + "learning_rate": 6.240991792794137e-06, + "loss": 0.0808, + "step": 698 + }, + { + "epoch": 0.14, + "learning_rate": 6.253934065840883e-06, + "loss": 0.0152, + "step": 700 + }, + { + "epoch": 0.1404, + "learning_rate": 6.2668836420348374e-06, + "loss": 0.1309, + "step": 702 + }, + { + "epoch": 0.1408, + "learning_rate": 6.279840496130188e-06, + "loss": 0.6543, + "step": 704 + }, + { + "epoch": 0.1412, + "learning_rate": 6.2928046028668185e-06, + "loss": 0.1882, + "step": 706 + }, + { + "epoch": 0.1416, + "learning_rate": 6.305775936970606e-06, + "loss": 0.3531, + "step": 708 + }, + { + "epoch": 0.142, + "learning_rate": 6.3187544731532205e-06, + "loss": 0.0021, + "step": 710 + }, + { + "epoch": 0.1424, + "learning_rate": 6.331740186112359e-06, + "loss": 0.0061, + "step": 712 + }, + { + "epoch": 0.1428, + "learning_rate": 6.344733050531709e-06, + "loss": 0.1265, + "step": 714 + }, + { + "epoch": 0.1432, + "learning_rate": 6.357733041081015e-06, + "loss": 0.0145, + "step": 716 + }, + { + "epoch": 0.1436, + "learning_rate": 6.370740132416133e-06, + "loss": 0.0802, + "step": 718 + }, + { + "epoch": 0.144, + "learning_rate": 6.383754299179072e-06, + "loss": 0.5938, + "step": 720 + }, + { + "epoch": 0.1444, + "learning_rate": 6.3967755159980485e-06, + "loss": 0.0001, + "step": 722 + }, + { + "epoch": 0.1448, + "learning_rate": 6.409803757487532e-06, + "loss": 0.0113, + "step": 724 + }, + { + "epoch": 0.1452, + "learning_rate": 6.422838998248301e-06, + "loss": 0.0898, + "step": 726 + }, + { + "epoch": 0.1456, + "learning_rate": 6.435881212867485e-06, + "loss": 0.215, + "step": 728 + }, + { + "epoch": 0.146, + "learning_rate": 6.4489303759186385e-06, + "loss": 0.8299, + "step": 730 + }, + { + "epoch": 0.1464, + "learning_rate": 6.4619864619616975e-06, + "loss": 0.3504, + "step": 732 + }, + { + "epoch": 0.1468, + "learning_rate": 6.475049445543222e-06, + "loss": 0.152, + "step": 734 + }, + { + "epoch": 0.1472, + "learning_rate": 6.48811930119619e-06, + "loss": 0.0789, + "step": 736 + }, + { + "epoch": 0.1476, + "learning_rate": 6.5011960034403e-06, + "loss": 0.0003, + "step": 738 + }, + { + "epoch": 0.148, + "learning_rate": 6.514279526781853e-06, + "loss": 0.0143, + "step": 740 + }, + { + "epoch": 0.1484, + "learning_rate": 6.5273698457137965e-06, + "loss": 0.2685, + "step": 742 + }, + { + "epoch": 0.1488, + "learning_rate": 6.540466934715955e-06, + "loss": 0.0436, + "step": 744 + }, + { + "epoch": 0.1492, + "learning_rate": 6.553570768254831e-06, + "loss": 0.0718, + "step": 746 + }, + { + "epoch": 0.1496, + "learning_rate": 6.566681320783848e-06, + "loss": 0.0003, + "step": 748 + }, + { + "epoch": 0.15, + "learning_rate": 6.579798566743313e-06, + "loss": 0.1809, + "step": 750 + }, + { + "epoch": 0.1504, + "learning_rate": 6.592922480560483e-06, + "loss": 0.0853, + "step": 752 + }, + { + "epoch": 0.1508, + "learning_rate": 6.606053036649618e-06, + "loss": 0.0335, + "step": 754 + }, + { + "epoch": 0.1512, + "learning_rate": 6.619190209412025e-06, + "loss": 0.066, + "step": 756 + }, + { + "epoch": 0.1516, + "learning_rate": 6.632333973236113e-06, + "loss": 0.0231, + "step": 758 + }, + { + "epoch": 0.152, + "learning_rate": 6.6454843024974465e-06, + "loss": 0.1918, + "step": 760 + }, + { + "epoch": 0.1524, + "learning_rate": 6.6586411715587805e-06, + "loss": 0.0488, + "step": 762 + }, + { + "epoch": 0.1528, + "learning_rate": 6.671804554770128e-06, + "loss": 0.1504, + "step": 764 + }, + { + "epoch": 0.1532, + "learning_rate": 6.6849744264688e-06, + "loss": 0.086, + "step": 766 + }, + { + "epoch": 0.1536, + "learning_rate": 6.698150760979456e-06, + "loss": 2.2168, + "step": 768 + }, + { + "epoch": 0.154, + "learning_rate": 6.711333532614177e-06, + "loss": 0.0623, + "step": 770 + }, + { + "epoch": 0.1544, + "learning_rate": 6.724522715672421e-06, + "loss": 0.0044, + "step": 772 + }, + { + "epoch": 0.1548, + "learning_rate": 6.737718284441256e-06, + "loss": 0.3246, + "step": 774 + }, + { + "epoch": 0.1552, + "learning_rate": 6.750920213195242e-06, + "loss": 0.2824, + "step": 776 + }, + { + "epoch": 0.1556, + "learning_rate": 6.764128476196494e-06, + "loss": 0.0002, + "step": 778 + }, + { + "epoch": 0.156, + "learning_rate": 6.777343047694894e-06, + "loss": 0.4034, + "step": 780 + }, + { + "epoch": 0.1564, + "learning_rate": 6.7905639019278925e-06, + "loss": 0.0901, + "step": 782 + }, + { + "epoch": 0.1568, + "learning_rate": 6.803791013120824e-06, + "loss": 0.0003, + "step": 784 + }, + { + "epoch": 0.1572, + "learning_rate": 6.817024355486707e-06, + "loss": 0.1239, + "step": 786 + }, + { + "epoch": 0.1576, + "learning_rate": 6.8302639032264836e-06, + "loss": 0.4308, + "step": 788 + }, + { + "epoch": 0.158, + "learning_rate": 6.8435096305289765e-06, + "loss": 0.1798, + "step": 790 + }, + { + "epoch": 0.1584, + "learning_rate": 6.856761511570944e-06, + "loss": 0.0015, + "step": 792 + }, + { + "epoch": 0.1588, + "learning_rate": 6.870019520517217e-06, + "loss": 0.0894, + "step": 794 + }, + { + "epoch": 0.1592, + "learning_rate": 6.883283631520579e-06, + "loss": 0.0227, + "step": 796 + }, + { + "epoch": 0.1596, + "learning_rate": 6.896553818721985e-06, + "loss": 0.025, + "step": 798 + }, + { + "epoch": 0.16, + "learning_rate": 6.909830056250522e-06, + "loss": 0.1428, + "step": 800 + }, + { + "epoch": 0.1604, + "learning_rate": 6.9231123182234895e-06, + "loss": 0.1165, + "step": 802 + }, + { + "epoch": 0.1608, + "learning_rate": 6.936400578746436e-06, + "loss": 0.037, + "step": 804 + }, + { + "epoch": 0.1612, + "learning_rate": 6.949694811913237e-06, + "loss": 0.6542, + "step": 806 + }, + { + "epoch": 0.1616, + "learning_rate": 6.96299499180605e-06, + "loss": 0.0586, + "step": 808 + }, + { + "epoch": 0.162, + "learning_rate": 6.976301092495548e-06, + "loss": 0.2483, + "step": 810 + }, + { + "epoch": 0.1624, + "learning_rate": 6.989613088040787e-06, + "loss": 0.0067, + "step": 812 + }, + { + "epoch": 0.1628, + "learning_rate": 7.002930952489353e-06, + "loss": 0.0022, + "step": 814 + }, + { + "epoch": 0.1632, + "learning_rate": 7.016254659877404e-06, + "loss": 0.1116, + "step": 816 + }, + { + "epoch": 0.1636, + "learning_rate": 7.029584184229641e-06, + "loss": 0.0603, + "step": 818 + }, + { + "epoch": 0.164, + "learning_rate": 7.042919499559539e-06, + "loss": 0.0002, + "step": 820 + }, + { + "epoch": 0.1644, + "learning_rate": 7.056260579869152e-06, + "loss": 0.0198, + "step": 822 + }, + { + "epoch": 0.1648, + "learning_rate": 7.06960739914943e-06, + "loss": 0.5727, + "step": 824 + }, + { + "epoch": 0.1652, + "learning_rate": 7.082959931380013e-06, + "loss": 0.7902, + "step": 826 + }, + { + "epoch": 0.1656, + "learning_rate": 7.09631815052946e-06, + "loss": 0.5187, + "step": 828 + }, + { + "epoch": 0.166, + "learning_rate": 7.109682030555285e-06, + "loss": 0.2954, + "step": 830 + }, + { + "epoch": 0.1664, + "learning_rate": 7.123051545403873e-06, + "loss": 0.3203, + "step": 832 + }, + { + "epoch": 0.1668, + "learning_rate": 7.136426669010686e-06, + "loss": 0.2173, + "step": 834 + }, + { + "epoch": 0.1672, + "learning_rate": 7.1498073753002375e-06, + "loss": 0.3844, + "step": 836 + }, + { + "epoch": 0.1676, + "learning_rate": 7.1631936381861544e-06, + "loss": 0.2651, + "step": 838 + }, + { + "epoch": 0.168, + "learning_rate": 7.1765854315712325e-06, + "loss": 0.0528, + "step": 840 + }, + { + "epoch": 0.1684, + "learning_rate": 7.189982729347485e-06, + "loss": 0.1154, + "step": 842 + }, + { + "epoch": 0.1688, + "learning_rate": 7.203385505396197e-06, + "loss": 0.0001, + "step": 844 + }, + { + "epoch": 0.1692, + "learning_rate": 7.216793733587966e-06, + "loss": 0.1948, + "step": 846 + }, + { + "epoch": 0.1696, + "learning_rate": 7.230207387782771e-06, + "loss": 0.7324, + "step": 848 + }, + { + "epoch": 0.17, + "learning_rate": 7.243626441830001e-06, + "loss": 0.0004, + "step": 850 + }, + { + "epoch": 0.1704, + "learning_rate": 7.257050869568527e-06, + "loss": 0.1101, + "step": 852 + }, + { + "epoch": 0.1708, + "learning_rate": 7.270480644826739e-06, + "loss": 0.0089, + "step": 854 + }, + { + "epoch": 0.1712, + "learning_rate": 7.28391574142262e-06, + "loss": 0.3825, + "step": 856 + }, + { + "epoch": 0.1716, + "learning_rate": 7.297356133163711e-06, + "loss": 0.2874, + "step": 858 + }, + { + "epoch": 0.172, + "learning_rate": 7.3108017938473485e-06, + "loss": 0.0031, + "step": 860 + }, + { + "epoch": 0.1724, + "learning_rate": 7.324252697260479e-06, + "loss": 0.0004, + "step": 862 + }, + { + "epoch": 0.1728, + "learning_rate": 7.337708817179875e-06, + "loss": 0.2308, + "step": 864 + }, + { + "epoch": 0.1732, + "learning_rate": 7.351170127372196e-06, + "loss": 0.344, + "step": 866 + }, + { + "epoch": 0.1736, + "learning_rate": 7.36463660159386e-06, + "loss": 0.0009, + "step": 868 + }, + { + "epoch": 0.174, + "learning_rate": 7.378108213591355e-06, + "loss": 0.0868, + "step": 870 + }, + { + "epoch": 0.1744, + "learning_rate": 7.39158493710103e-06, + "loss": 0.1465, + "step": 872 + }, + { + "epoch": 0.1748, + "learning_rate": 7.405066745849345e-06, + "loss": 0.0439, + "step": 874 + }, + { + "epoch": 0.1752, + "learning_rate": 7.418553613552822e-06, + "loss": 0.3584, + "step": 876 + }, + { + "epoch": 0.1756, + "learning_rate": 7.432045513918121e-06, + "loss": 0.0008, + "step": 878 + }, + { + "epoch": 0.176, + "learning_rate": 7.445542420642091e-06, + "loss": 0.0238, + "step": 880 + }, + { + "epoch": 0.1764, + "learning_rate": 7.459044307411826e-06, + "loss": 0.0003, + "step": 882 + }, + { + "epoch": 0.1768, + "learning_rate": 7.472551147904703e-06, + "loss": 0.1885, + "step": 884 + }, + { + "epoch": 0.1772, + "learning_rate": 7.486062915788446e-06, + "loss": 0.0047, + "step": 886 + }, + { + "epoch": 0.1776, + "learning_rate": 7.499579584721173e-06, + "loss": 0.0192, + "step": 888 + }, + { + "epoch": 0.178, + "learning_rate": 7.513101128351446e-06, + "loss": 0.0013, + "step": 890 + }, + { + "epoch": 0.1784, + "learning_rate": 7.5266275203183395e-06, + "loss": 0.0085, + "step": 892 + }, + { + "epoch": 0.1788, + "learning_rate": 7.540158734251412e-06, + "loss": 0.0038, + "step": 894 + }, + { + "epoch": 0.1792, + "learning_rate": 7.553694743770917e-06, + "loss": 0.0935, + "step": 896 + }, + { + "epoch": 0.1796, + "learning_rate": 7.567235522487698e-06, + "loss": 0.0001, + "step": 898 + }, + { + "epoch": 0.18, + "learning_rate": 7.580781044003312e-06, + "loss": 0.0005, + "step": 900 + }, + { + "epoch": 0.1804, + "learning_rate": 7.5943312819100875e-06, + "loss": 0.3303, + "step": 902 + }, + { + "epoch": 0.1808, + "learning_rate": 7.607886209791095e-06, + "loss": 0.4083, + "step": 904 + }, + { + "epoch": 0.1812, + "learning_rate": 7.6214458012203726e-06, + "loss": 0.0042, + "step": 906 + }, + { + "epoch": 0.1816, + "learning_rate": 7.635010029762755e-06, + "loss": 0.0083, + "step": 908 + }, + { + "epoch": 0.182, + "learning_rate": 7.648578868974102e-06, + "loss": 0.0007, + "step": 910 + }, + { + "epoch": 0.1824, + "learning_rate": 7.662152292401265e-06, + "loss": 0.0576, + "step": 912 + }, + { + "epoch": 0.1828, + "learning_rate": 7.675730273582142e-06, + "loss": 0.3061, + "step": 914 + }, + { + "epoch": 0.1832, + "learning_rate": 7.689312786045822e-06, + "loss": 0.6859, + "step": 916 + }, + { + "epoch": 0.1836, + "learning_rate": 7.702899803312443e-06, + "loss": 0.1564, + "step": 918 + }, + { + "epoch": 0.184, + "learning_rate": 7.716491298893441e-06, + "loss": 0.0295, + "step": 920 + }, + { + "epoch": 0.1844, + "learning_rate": 7.730087246291498e-06, + "loss": 0.1669, + "step": 922 + }, + { + "epoch": 0.1848, + "learning_rate": 7.74368761900062e-06, + "loss": 0.0016, + "step": 924 + }, + { + "epoch": 0.1852, + "learning_rate": 7.757292390506184e-06, + "loss": 0.0033, + "step": 926 + }, + { + "epoch": 0.1856, + "learning_rate": 7.770901534284991e-06, + "loss": 0.0004, + "step": 928 + }, + { + "epoch": 0.186, + "learning_rate": 7.78451502380532e-06, + "loss": 0.2283, + "step": 930 + }, + { + "epoch": 0.1864, + "learning_rate": 7.798132832526976e-06, + "loss": 0.1294, + "step": 932 + }, + { + "epoch": 0.1868, + "learning_rate": 7.811754933901346e-06, + "loss": 0.2882, + "step": 934 + }, + { + "epoch": 0.1872, + "learning_rate": 7.825381301371444e-06, + "loss": 0.0429, + "step": 936 + }, + { + "epoch": 0.1876, + "learning_rate": 7.839011908371987e-06, + "loss": 0.176, + "step": 938 + }, + { + "epoch": 0.188, + "learning_rate": 7.852646728329358e-06, + "loss": 0.1084, + "step": 940 + }, + { + "epoch": 0.1884, + "learning_rate": 7.866285734661845e-06, + "loss": 0.0016, + "step": 942 + }, + { + "epoch": 0.1888, + "learning_rate": 7.879928900779441e-06, + "loss": 0.1166, + "step": 944 + }, + { + "epoch": 0.1892, + "learning_rate": 7.893576200084164e-06, + "loss": 0.0625, + "step": 946 + }, + { + "epoch": 0.1896, + "learning_rate": 7.907227605969852e-06, + "loss": 0.2293, + "step": 948 + }, + { + "epoch": 0.19, + "learning_rate": 7.92088309182239e-06, + "loss": 0.3314, + "step": 950 + }, + { + "epoch": 0.1904, + "learning_rate": 7.934542631019767e-06, + "loss": 0.6069, + "step": 952 + }, + { + "epoch": 0.1908, + "learning_rate": 7.948206196931937e-06, + "loss": 0.0084, + "step": 954 + }, + { + "epoch": 0.1912, + "learning_rate": 7.961873762921153e-06, + "loss": 0.1136, + "step": 956 + }, + { + "epoch": 0.1916, + "learning_rate": 7.97554530234174e-06, + "loss": 0.0006, + "step": 958 + }, + { + "epoch": 0.192, + "learning_rate": 7.989220788540351e-06, + "loss": 0.0005, + "step": 960 + }, + { + "epoch": 0.1924, + "learning_rate": 8.002900194855927e-06, + "loss": 0.482, + "step": 962 + }, + { + "epoch": 0.1928, + "learning_rate": 8.016583494619764e-06, + "loss": 0.0667, + "step": 964 + }, + { + "epoch": 0.1932, + "learning_rate": 8.03027066115557e-06, + "loss": 0.0009, + "step": 966 + }, + { + "epoch": 0.1936, + "learning_rate": 8.043961667779511e-06, + "loss": 0.3818, + "step": 968 + }, + { + "epoch": 0.194, + "learning_rate": 8.057656487800274e-06, + "loss": 0.0007, + "step": 970 + }, + { + "epoch": 0.1944, + "learning_rate": 8.071355094519103e-06, + "loss": 0.0983, + "step": 972 + }, + { + "epoch": 0.1948, + "learning_rate": 8.085057461229862e-06, + "loss": 0.4911, + "step": 974 + }, + { + "epoch": 0.1952, + "learning_rate": 8.098763561219089e-06, + "loss": 0.5, + "step": 976 + }, + { + "epoch": 0.1956, + "learning_rate": 8.112473367766056e-06, + "loss": 0.9264, + "step": 978 + }, + { + "epoch": 0.196, + "learning_rate": 8.126186854142744e-06, + "loss": 0.1938, + "step": 980 + }, + { + "epoch": 0.1964, + "learning_rate": 8.139903993614075e-06, + "loss": 0.0902, + "step": 982 + }, + { + "epoch": 0.1968, + "learning_rate": 8.153624759437718e-06, + "loss": 0.0382, + "step": 984 + }, + { + "epoch": 0.1972, + "learning_rate": 8.167349124864389e-06, + "loss": 0.0013, + "step": 986 + }, + { + "epoch": 0.1976, + "learning_rate": 8.181077063137735e-06, + "loss": 0.31, + "step": 988 + }, + { + "epoch": 0.198, + "learning_rate": 8.194808547494386e-06, + "loss": 0.6689, + "step": 990 + }, + { + "epoch": 0.1984, + "learning_rate": 8.208543551164178e-06, + "loss": 0.437, + "step": 992 + }, + { + "epoch": 0.1988, + "learning_rate": 8.22228204736997e-06, + "loss": 0.2534, + "step": 994 + }, + { + "epoch": 0.1992, + "learning_rate": 8.236024009327877e-06, + "loss": 0.265, + "step": 996 + }, + { + "epoch": 0.1996, + "learning_rate": 8.249769410247239e-06, + "loss": 0.2533, + "step": 998 + }, + { + "epoch": 0.2, + "learning_rate": 8.263518223330695e-06, + "loss": 0.1332, + "step": 1000 + }, + { + "epoch": 0.2004, + "learning_rate": 8.277270421774231e-06, + "loss": 0.4716, + "step": 1002 + }, + { + "epoch": 0.2008, + "learning_rate": 8.29102597876723e-06, + "loss": 0.5343, + "step": 1004 + }, + { + "epoch": 0.2012, + "learning_rate": 8.304784867492532e-06, + "loss": 0.0514, + "step": 1006 + }, + { + "epoch": 0.2016, + "learning_rate": 8.31854706112648e-06, + "loss": 0.0047, + "step": 1008 + }, + { + "epoch": 0.202, + "learning_rate": 8.332312532838972e-06, + "loss": 0.0126, + "step": 1010 + }, + { + "epoch": 0.2024, + "learning_rate": 8.346081255793516e-06, + "loss": 0.0287, + "step": 1012 + }, + { + "epoch": 0.2028, + "learning_rate": 8.359853203147282e-06, + "loss": 0.1077, + "step": 1014 + }, + { + "epoch": 0.2032, + "learning_rate": 8.373628348051156e-06, + "loss": 0.0829, + "step": 1016 + }, + { + "epoch": 0.2036, + "learning_rate": 8.387406663649803e-06, + "loss": 0.8568, + "step": 1018 + }, + { + "epoch": 0.204, + "learning_rate": 8.401188123081642e-06, + "loss": 0.0747, + "step": 1020 + }, + { + "epoch": 0.2044, + "learning_rate": 8.414972699479062e-06, + "loss": 0.3738, + "step": 1022 + }, + { + "epoch": 0.2048, + "learning_rate": 8.428760365968329e-06, + "loss": 0.2221, + "step": 1024 + }, + { + "epoch": 0.2052, + "learning_rate": 8.442551095669627e-06, + "loss": 0.0854, + "step": 1026 + }, + { + "epoch": 0.2056, + "learning_rate": 8.456344861697293e-06, + "loss": 0.3071, + "step": 1028 + }, + { + "epoch": 0.206, + "learning_rate": 8.470141637159605e-06, + "loss": 0.3895, + "step": 1030 + }, + { + "epoch": 0.2064, + "learning_rate": 8.483941395159114e-06, + "loss": 0.0008, + "step": 1032 + }, + { + "epoch": 0.2068, + "learning_rate": 8.497744108792431e-06, + "loss": 0.4843, + "step": 1034 + }, + { + "epoch": 0.2072, + "learning_rate": 8.511549751150478e-06, + "loss": 0.0173, + "step": 1036 + }, + { + "epoch": 0.2076, + "learning_rate": 8.52535829531845e-06, + "loss": 0.2254, + "step": 1038 + }, + { + "epoch": 0.208, + "learning_rate": 8.539169714375883e-06, + "loss": 0.0344, + "step": 1040 + }, + { + "epoch": 0.2084, + "learning_rate": 8.552983981396707e-06, + "loss": 0.2724, + "step": 1042 + }, + { + "epoch": 0.2088, + "learning_rate": 8.566801069449304e-06, + "loss": 0.0066, + "step": 1044 + }, + { + "epoch": 0.2092, + "learning_rate": 8.580620951596553e-06, + "loss": 0.0213, + "step": 1046 + }, + { + "epoch": 0.2096, + "learning_rate": 8.594443600895886e-06, + "loss": 0.4866, + "step": 1048 + }, + { + "epoch": 0.21, + "learning_rate": 8.60826899039934e-06, + "loss": 0.1836, + "step": 1050 + }, + { + "epoch": 0.2104, + "learning_rate": 8.622097093153612e-06, + "loss": 0.4458, + "step": 1052 + }, + { + "epoch": 0.2108, + "learning_rate": 8.635927882200128e-06, + "loss": 0.1821, + "step": 1054 + }, + { + "epoch": 0.2112, + "learning_rate": 8.649761330575e-06, + "loss": 0.0151, + "step": 1056 + }, + { + "epoch": 0.2116, + "learning_rate": 8.663597411309268e-06, + "loss": 0.0404, + "step": 1058 + }, + { + "epoch": 0.212, + "learning_rate": 8.677436097428766e-06, + "loss": 0.0734, + "step": 1060 + }, + { + "epoch": 0.2124, + "learning_rate": 8.691277361954266e-06, + "loss": 0.0149, + "step": 1062 + }, + { + "epoch": 0.2128, + "learning_rate": 8.705121177901537e-06, + "loss": 0.2277, + "step": 1064 + }, + { + "epoch": 0.2132, + "learning_rate": 8.718967518281292e-06, + "loss": 0.0601, + "step": 1066 + }, + { + "epoch": 0.2136, + "learning_rate": 8.732816356099459e-06, + "loss": 0.0363, + "step": 1068 + }, + { + "epoch": 0.214, + "learning_rate": 8.746667664356962e-06, + "loss": 0.0231, + "step": 1070 + }, + { + "epoch": 0.2144, + "learning_rate": 8.760521416049986e-06, + "loss": 0.3542, + "step": 1072 + }, + { + "epoch": 0.2148, + "learning_rate": 8.774377584169934e-06, + "loss": 0.0019, + "step": 1074 + }, + { + "epoch": 0.2152, + "learning_rate": 8.788236141703477e-06, + "loss": 0.2338, + "step": 1076 + }, + { + "epoch": 0.2156, + "learning_rate": 8.802097061632706e-06, + "loss": 0.7073, + "step": 1078 + }, + { + "epoch": 0.216, + "learning_rate": 8.81596031693499e-06, + "loss": 0.2802, + "step": 1080 + }, + { + "epoch": 0.2164, + "learning_rate": 8.829825880583224e-06, + "loss": 0.0293, + "step": 1082 + }, + { + "epoch": 0.2168, + "learning_rate": 8.84369372554578e-06, + "loss": 0.0767, + "step": 1084 + }, + { + "epoch": 0.2172, + "learning_rate": 8.85756382478659e-06, + "loss": 0.0303, + "step": 1086 + }, + { + "epoch": 0.2176, + "learning_rate": 8.87143615126518e-06, + "loss": 0.0021, + "step": 1088 + }, + { + "epoch": 0.218, + "learning_rate": 8.88531067793674e-06, + "loss": 0.2695, + "step": 1090 + }, + { + "epoch": 0.2184, + "learning_rate": 8.899187377752173e-06, + "loss": 0.0541, + "step": 1092 + }, + { + "epoch": 0.2188, + "learning_rate": 8.913066223658141e-06, + "loss": 0.4281, + "step": 1094 + }, + { + "epoch": 0.2192, + "learning_rate": 8.926947188597127e-06, + "loss": 0.0511, + "step": 1096 + }, + { + "epoch": 0.2196, + "learning_rate": 8.940830245507473e-06, + "loss": 0.3572, + "step": 1098 + }, + { + "epoch": 0.22, + "learning_rate": 8.954715367323473e-06, + "loss": 0.0331, + "step": 1100 + }, + { + "epoch": 0.2204, + "learning_rate": 8.968602526975317e-06, + "loss": 0.1732, + "step": 1102 + }, + { + "epoch": 0.2208, + "learning_rate": 8.982491697389344e-06, + "loss": 0.0079, + "step": 1104 + }, + { + "epoch": 0.2212, + "learning_rate": 8.996382851487839e-06, + "loss": 0.1911, + "step": 1106 + }, + { + "epoch": 0.2216, + "learning_rate": 9.010275962189356e-06, + "loss": 0.1909, + "step": 1108 + }, + { + "epoch": 0.222, + "learning_rate": 9.024171002408509e-06, + "loss": 0.2, + "step": 1110 + }, + { + "epoch": 0.2224, + "learning_rate": 9.03806794505621e-06, + "loss": 0.0357, + "step": 1112 + }, + { + "epoch": 0.2228, + "learning_rate": 9.051966763039708e-06, + "loss": 0.002, + "step": 1114 + }, + { + "epoch": 0.2232, + "learning_rate": 9.065867429262497e-06, + "loss": 0.0003, + "step": 1116 + }, + { + "epoch": 0.2236, + "learning_rate": 9.07976991662453e-06, + "loss": 0.1539, + "step": 1118 + }, + { + "epoch": 0.224, + "learning_rate": 9.093674198022198e-06, + "loss": 0.2674, + "step": 1120 + }, + { + "epoch": 0.2244, + "learning_rate": 9.107580246348395e-06, + "loss": 0.3991, + "step": 1122 + }, + { + "epoch": 0.2248, + "learning_rate": 9.121488034492567e-06, + "loss": 0.2345, + "step": 1124 + }, + { + "epoch": 0.2252, + "learning_rate": 9.135397535340768e-06, + "loss": 0.0427, + "step": 1126 + }, + { + "epoch": 0.2256, + "learning_rate": 9.149308721775717e-06, + "loss": 0.0002, + "step": 1128 + }, + { + "epoch": 0.226, + "learning_rate": 9.16322156667684e-06, + "loss": 0.1389, + "step": 1130 + }, + { + "epoch": 0.2264, + "learning_rate": 9.177136042920338e-06, + "loss": 0.009, + "step": 1132 + }, + { + "epoch": 0.2268, + "learning_rate": 9.191052123379227e-06, + "loss": 0.2181, + "step": 1134 + }, + { + "epoch": 0.2272, + "learning_rate": 9.204969780923396e-06, + "loss": 0.1348, + "step": 1136 + }, + { + "epoch": 0.2276, + "learning_rate": 9.218888988419656e-06, + "loss": 0.041, + "step": 1138 + }, + { + "epoch": 0.228, + "learning_rate": 9.232809718731822e-06, + "loss": 0.4121, + "step": 1140 + }, + { + "epoch": 0.2284, + "learning_rate": 9.246731944720663e-06, + "loss": 1.6579, + "step": 1142 + }, + { + "epoch": 0.2288, + "learning_rate": 9.26065563924414e-06, + "loss": 0.2298, + "step": 1144 + }, + { + "epoch": 0.2292, + "learning_rate": 9.274580775157299e-06, + "loss": 0.0004, + "step": 1146 + }, + { + "epoch": 0.2296, + "learning_rate": 9.288507325312319e-06, + "loss": 0.0361, + "step": 1148 + }, + { + "epoch": 0.23, + "learning_rate": 9.302435262558752e-06, + "loss": 0.0448, + "step": 1150 + }, + { + "epoch": 0.2304, + "learning_rate": 9.316364559743298e-06, + "loss": 0.2103, + "step": 1152 + }, + { + "epoch": 0.2308, + "learning_rate": 9.330295189710153e-06, + "loss": 0.0198, + "step": 1154 + }, + { + "epoch": 0.2312, + "learning_rate": 9.344227125300788e-06, + "loss": 0.0276, + "step": 1156 + }, + { + "epoch": 0.2316, + "learning_rate": 9.358160339354196e-06, + "loss": 0.4598, + "step": 1158 + }, + { + "epoch": 0.232, + "learning_rate": 9.372094804706867e-06, + "loss": 0.3808, + "step": 1160 + }, + { + "epoch": 0.2324, + "learning_rate": 9.386030494192826e-06, + "loss": 0.3449, + "step": 1162 + }, + { + "epoch": 0.2328, + "learning_rate": 9.39996738064379e-06, + "loss": 0.2663, + "step": 1164 + }, + { + "epoch": 0.2332, + "learning_rate": 9.413905436889032e-06, + "loss": 0.088, + "step": 1166 + }, + { + "epoch": 0.2336, + "learning_rate": 9.427844635755615e-06, + "loss": 0.0002, + "step": 1168 + }, + { + "epoch": 0.234, + "learning_rate": 9.441784950068357e-06, + "loss": 0.2874, + "step": 1170 + }, + { + "epoch": 0.2344, + "learning_rate": 9.455726352649904e-06, + "loss": 0.0005, + "step": 1172 + }, + { + "epoch": 0.2348, + "learning_rate": 9.469668816320777e-06, + "loss": 0.0388, + "step": 1174 + }, + { + "epoch": 0.2352, + "learning_rate": 9.483612313899446e-06, + "loss": 0.0865, + "step": 1176 + }, + { + "epoch": 0.2356, + "learning_rate": 9.497556818202297e-06, + "loss": 0.3238, + "step": 1178 + }, + { + "epoch": 0.236, + "learning_rate": 9.511502302043859e-06, + "loss": 0.0191, + "step": 1180 + }, + { + "epoch": 0.2364, + "learning_rate": 9.52544873823668e-06, + "loss": 0.0827, + "step": 1182 + }, + { + "epoch": 0.2368, + "learning_rate": 9.539396099591469e-06, + "loss": 0.3573, + "step": 1184 + }, + { + "epoch": 0.2372, + "learning_rate": 9.553344358917146e-06, + "loss": 0.3041, + "step": 1186 + }, + { + "epoch": 0.2376, + "learning_rate": 9.567293489020816e-06, + "loss": 0.0007, + "step": 1188 + }, + { + "epoch": 0.238, + "learning_rate": 9.581243462708009e-06, + "loss": 0.004, + "step": 1190 + }, + { + "epoch": 0.2384, + "learning_rate": 9.595194252782461e-06, + "loss": 0.0017, + "step": 1192 + }, + { + "epoch": 0.2388, + "learning_rate": 9.609145832046469e-06, + "loss": 0.0008, + "step": 1194 + }, + { + "epoch": 0.2392, + "learning_rate": 9.623098173300656e-06, + "loss": 0.2191, + "step": 1196 + }, + { + "epoch": 0.2396, + "learning_rate": 9.637051249344225e-06, + "loss": 0.001, + "step": 1198 + }, + { + "epoch": 0.24, + "learning_rate": 9.651005032974991e-06, + "loss": 0.1756, + "step": 1200 + }, + { + "epoch": 0.2404, + "learning_rate": 9.664959496989285e-06, + "loss": 0.2627, + "step": 1202 + }, + { + "epoch": 0.2408, + "learning_rate": 9.678914614182184e-06, + "loss": 0.4802, + "step": 1204 + }, + { + "epoch": 0.2412, + "learning_rate": 9.69287035734747e-06, + "loss": 0.1498, + "step": 1206 + }, + { + "epoch": 0.2416, + "learning_rate": 9.706826699277714e-06, + "loss": 0.3906, + "step": 1208 + }, + { + "epoch": 0.242, + "learning_rate": 9.720783612764307e-06, + "loss": 0.0493, + "step": 1210 + }, + { + "epoch": 0.2424, + "learning_rate": 9.734741070597535e-06, + "loss": 0.215, + "step": 1212 + }, + { + "epoch": 0.2428, + "learning_rate": 9.74869904556662e-06, + "loss": 0.481, + "step": 1214 + }, + { + "epoch": 0.2432, + "learning_rate": 9.762657510459774e-06, + "loss": 0.5898, + "step": 1216 + }, + { + "epoch": 0.2436, + "learning_rate": 9.776616438064255e-06, + "loss": 0.0334, + "step": 1218 + }, + { + "epoch": 0.244, + "learning_rate": 9.790575801166422e-06, + "loss": 0.2133, + "step": 1220 + }, + { + "epoch": 0.2444, + "learning_rate": 9.804535572551782e-06, + "loss": 0.0552, + "step": 1222 + }, + { + "epoch": 0.2448, + "learning_rate": 9.818495725005043e-06, + "loss": 0.0144, + "step": 1224 + }, + { + "epoch": 0.2452, + "learning_rate": 9.832456231310194e-06, + "loss": 0.1574, + "step": 1226 + }, + { + "epoch": 0.2456, + "learning_rate": 9.846417064250459e-06, + "loss": 0.2435, + "step": 1228 + }, + { + "epoch": 0.246, + "learning_rate": 9.860378196608552e-06, + "loss": 0.1206, + "step": 1230 + }, + { + "epoch": 0.2464, + "learning_rate": 9.874339601166479e-06, + "loss": 0.0157, + "step": 1232 + }, + { + "epoch": 0.2468, + "learning_rate": 9.888301250705765e-06, + "loss": 0.2375, + "step": 1234 + }, + { + "epoch": 0.2472, + "learning_rate": 9.902263118007513e-06, + "loss": 0.0051, + "step": 1236 + }, + { + "epoch": 0.2476, + "learning_rate": 9.916225175852278e-06, + "loss": 0.077, + "step": 1238 + }, + { + "epoch": 0.248, + "learning_rate": 9.930187397020385e-06, + "loss": 0.3445, + "step": 1240 + }, + { + "epoch": 0.2484, + "learning_rate": 9.944149754291716e-06, + "loss": 0.0869, + "step": 1242 + }, + { + "epoch": 0.2488, + "learning_rate": 9.95811222044596e-06, + "loss": 0.0088, + "step": 1244 + }, + { + "epoch": 0.2492, + "learning_rate": 9.972074768262572e-06, + "loss": 0.0296, + "step": 1246 + }, + { + "epoch": 0.2496, + "learning_rate": 9.986037370520855e-06, + "loss": 0.1261, + "step": 1248 + }, + { + "epoch": 0.25, + "learning_rate": 9.999999999999996e-06, + "loss": 0.6931, + "step": 1250 + }, + { + "epoch": 0.2504, + "learning_rate": 1.0013962629479139e-05, + "loss": 0.001, + "step": 1252 + }, + { + "epoch": 0.2508, + "learning_rate": 1.0027925231737419e-05, + "loss": 0.1075, + "step": 1254 + }, + { + "epoch": 0.2512, + "learning_rate": 1.0041887779554034e-05, + "loss": 0.2184, + "step": 1256 + }, + { + "epoch": 0.2516, + "learning_rate": 1.0055850245708276e-05, + "loss": 0.0207, + "step": 1258 + }, + { + "epoch": 0.252, + "learning_rate": 1.0069812602979607e-05, + "loss": 0.0002, + "step": 1260 + }, + { + "epoch": 0.2524, + "learning_rate": 1.0083774824147717e-05, + "loss": 0.0084, + "step": 1262 + }, + { + "epoch": 0.2528, + "learning_rate": 1.0097736881992482e-05, + "loss": 0.0324, + "step": 1264 + }, + { + "epoch": 0.2532, + "learning_rate": 1.011169874929423e-05, + "loss": 0.0142, + "step": 1266 + }, + { + "epoch": 0.2536, + "learning_rate": 1.0125660398833514e-05, + "loss": 0.0154, + "step": 1268 + }, + { + "epoch": 0.254, + "learning_rate": 1.013962180339144e-05, + "loss": 0.199, + "step": 1270 + }, + { + "epoch": 0.2544, + "learning_rate": 1.0153582935749533e-05, + "loss": 0.0229, + "step": 1272 + }, + { + "epoch": 0.2548, + "learning_rate": 1.01675437686898e-05, + "loss": 0.1759, + "step": 1274 + }, + { + "epoch": 0.2552, + "learning_rate": 1.0181504274994952e-05, + "loss": 0.0053, + "step": 1276 + }, + { + "epoch": 0.2556, + "learning_rate": 1.0195464427448212e-05, + "loss": 0.0007, + "step": 1278 + }, + { + "epoch": 0.256, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.2104, + "step": 1280 + }, + { + "epoch": 0.2564, + "learning_rate": 1.0223383561935738e-05, + "loss": 0.3836, + "step": 1282 + }, + { + "epoch": 0.2568, + "learning_rate": 1.0237342489540218e-05, + "loss": 0.3264, + "step": 1284 + }, + { + "epoch": 0.2572, + "learning_rate": 1.0251300954433374e-05, + "loss": 0.0162, + "step": 1286 + }, + { + "epoch": 0.2576, + "learning_rate": 1.0265258929402458e-05, + "loss": 0.2711, + "step": 1288 + }, + { + "epoch": 0.258, + "learning_rate": 1.0279216387235686e-05, + "loss": 0.0029, + "step": 1290 + }, + { + "epoch": 0.2584, + "learning_rate": 1.029317330072228e-05, + "loss": 0.0014, + "step": 1292 + }, + { + "epoch": 0.2588, + "learning_rate": 1.0307129642652523e-05, + "loss": 0.2264, + "step": 1294 + }, + { + "epoch": 0.2592, + "learning_rate": 1.0321085385817811e-05, + "loss": 0.0126, + "step": 1296 + }, + { + "epoch": 0.2596, + "learning_rate": 1.033504050301071e-05, + "loss": 0.0866, + "step": 1298 + }, + { + "epoch": 0.26, + "learning_rate": 1.0348994967025004e-05, + "loss": 0.0061, + "step": 1300 + }, + { + "epoch": 0.2604, + "learning_rate": 1.0362948750655768e-05, + "loss": 0.5759, + "step": 1302 + }, + { + "epoch": 0.2608, + "learning_rate": 1.0376901826699337e-05, + "loss": 0.078, + "step": 1304 + }, + { + "epoch": 0.2612, + "learning_rate": 1.0390854167953526e-05, + "loss": 0.1557, + "step": 1306 + }, + { + "epoch": 0.2616, + "learning_rate": 1.0404805747217532e-05, + "loss": 0.1759, + "step": 1308 + }, + { + "epoch": 0.262, + "learning_rate": 1.0418756537291984e-05, + "loss": 0.0212, + "step": 1310 + }, + { + "epoch": 0.2624, + "learning_rate": 1.0432706510979175e-05, + "loss": 0.4093, + "step": 1312 + }, + { + "epoch": 0.2628, + "learning_rate": 1.0446655641082846e-05, + "loss": 0.0605, + "step": 1314 + }, + { + "epoch": 0.2632, + "learning_rate": 1.0460603900408526e-05, + "loss": 0.0736, + "step": 1316 + }, + { + "epoch": 0.2636, + "learning_rate": 1.0474551261763312e-05, + "loss": 0.0174, + "step": 1318 + }, + { + "epoch": 0.264, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.0025, + "step": 1320 + }, + { + "epoch": 0.2644, + "learning_rate": 1.0502443181797696e-05, + "loss": 0.2449, + "step": 1322 + }, + { + "epoch": 0.2648, + "learning_rate": 1.0516387686100549e-05, + "loss": 0.0493, + "step": 1324 + }, + { + "epoch": 0.2652, + "learning_rate": 1.0530331183679216e-05, + "loss": 0.0001, + "step": 1326 + }, + { + "epoch": 0.2656, + "learning_rate": 1.054427364735009e-05, + "loss": 0.0239, + "step": 1328 + }, + { + "epoch": 0.266, + "learning_rate": 1.0558215049931634e-05, + "loss": 0.1035, + "step": 1330 + }, + { + "epoch": 0.2664, + "learning_rate": 1.0572155364244378e-05, + "loss": 0.1566, + "step": 1332 + }, + { + "epoch": 0.2668, + "learning_rate": 1.058609456311096e-05, + "loss": 0.1121, + "step": 1334 + }, + { + "epoch": 0.2672, + "learning_rate": 1.0600032619356203e-05, + "loss": 0.3491, + "step": 1336 + }, + { + "epoch": 0.2676, + "learning_rate": 1.0613969505807167e-05, + "loss": 0.0056, + "step": 1338 + }, + { + "epoch": 0.268, + "learning_rate": 1.0627905195293127e-05, + "loss": 0.0385, + "step": 1340 + }, + { + "epoch": 0.2684, + "learning_rate": 1.0641839660645795e-05, + "loss": 0.0033, + "step": 1342 + }, + { + "epoch": 0.2688, + "learning_rate": 1.0655772874699206e-05, + "loss": 0.2073, + "step": 1344 + }, + { + "epoch": 0.2692, + "learning_rate": 1.066970481028984e-05, + "loss": 0.0011, + "step": 1346 + }, + { + "epoch": 0.2696, + "learning_rate": 1.0683635440256694e-05, + "loss": 0.0891, + "step": 1348 + }, + { + "epoch": 0.27, + "learning_rate": 1.0697564737441242e-05, + "loss": 0.0011, + "step": 1350 + }, + { + "epoch": 0.2704, + "learning_rate": 1.0711492674687674e-05, + "loss": 0.0005, + "step": 1352 + }, + { + "epoch": 0.2708, + "learning_rate": 1.0725419224842695e-05, + "loss": 0.0893, + "step": 1354 + }, + { + "epoch": 0.2712, + "learning_rate": 1.0739344360755855e-05, + "loss": 0.2083, + "step": 1356 + }, + { + "epoch": 0.2716, + "learning_rate": 1.0753268055279332e-05, + "loss": 0.219, + "step": 1358 + }, + { + "epoch": 0.272, + "learning_rate": 1.0767190281268171e-05, + "loss": 0.761, + "step": 1360 + }, + { + "epoch": 0.2724, + "learning_rate": 1.0781111011580336e-05, + "loss": 0.608, + "step": 1362 + }, + { + "epoch": 0.2728, + "learning_rate": 1.07950302190766e-05, + "loss": 0.0036, + "step": 1364 + }, + { + "epoch": 0.2732, + "learning_rate": 1.0808947876620766e-05, + "loss": 0.0003, + "step": 1366 + }, + { + "epoch": 0.2736, + "learning_rate": 1.0822863957079654e-05, + "loss": 0.0057, + "step": 1368 + }, + { + "epoch": 0.274, + "learning_rate": 1.0836778433323153e-05, + "loss": 0.2388, + "step": 1370 + }, + { + "epoch": 0.2744, + "learning_rate": 1.0850691278224277e-05, + "loss": 0.4797, + "step": 1372 + }, + { + "epoch": 0.2748, + "learning_rate": 1.0864602464659227e-05, + "loss": 0.5615, + "step": 1374 + }, + { + "epoch": 0.2752, + "learning_rate": 1.0878511965507428e-05, + "loss": 0.0747, + "step": 1376 + }, + { + "epoch": 0.2756, + "learning_rate": 1.0892419753651598e-05, + "loss": 0.0183, + "step": 1378 + }, + { + "epoch": 0.276, + "learning_rate": 1.0906325801977795e-05, + "loss": 0.0747, + "step": 1380 + }, + { + "epoch": 0.2764, + "learning_rate": 1.0920230083375465e-05, + "loss": 0.0995, + "step": 1382 + }, + { + "epoch": 0.2768, + "learning_rate": 1.0934132570737497e-05, + "loss": 0.3298, + "step": 1384 + }, + { + "epoch": 0.2772, + "learning_rate": 1.0948033236960285e-05, + "loss": 0.0998, + "step": 1386 + }, + { + "epoch": 0.2776, + "learning_rate": 1.0961932054943785e-05, + "loss": 0.0255, + "step": 1388 + }, + { + "epoch": 0.278, + "learning_rate": 1.0975828997591484e-05, + "loss": 0.2275, + "step": 1390 + }, + { + "epoch": 0.2784, + "learning_rate": 1.098972403781064e-05, + "loss": 0.0426, + "step": 1392 + }, + { + "epoch": 0.2788, + "learning_rate": 1.1003617148512154e-05, + "loss": 0.0002, + "step": 1394 + }, + { + "epoch": 0.2792, + "learning_rate": 1.101750830261065e-05, + "loss": 0.0003, + "step": 1396 + }, + { + "epoch": 0.2796, + "learning_rate": 1.1031397473024676e-05, + "loss": 0.1498, + "step": 1398 + }, + { + "epoch": 0.28, + "learning_rate": 1.104528463267652e-05, + "loss": 0.6654, + "step": 1400 + }, + { + "epoch": 0.2804, + "learning_rate": 1.1059169754492518e-05, + "loss": 0.2531, + "step": 1402 + }, + { + "epoch": 0.2808, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.2929, + "step": 1404 + }, + { + "epoch": 0.2812, + "learning_rate": 1.108693377634185e-05, + "loss": 0.0892, + "step": 1406 + }, + { + "epoch": 0.2816, + "learning_rate": 1.1100812622247821e-05, + "loss": 0.2872, + "step": 1408 + }, + { + "epoch": 0.282, + "learning_rate": 1.1114689322063252e-05, + "loss": 0.3473, + "step": 1410 + }, + { + "epoch": 0.2824, + "learning_rate": 1.1128563848734815e-05, + "loss": 0.0041, + "step": 1412 + }, + { + "epoch": 0.2828, + "learning_rate": 1.1142436175213404e-05, + "loss": 0.2112, + "step": 1414 + }, + { + "epoch": 0.2832, + "learning_rate": 1.1156306274454211e-05, + "loss": 0.1323, + "step": 1416 + }, + { + "epoch": 0.2836, + "learning_rate": 1.117017411941677e-05, + "loss": 0.1484, + "step": 1418 + }, + { + "epoch": 0.284, + "learning_rate": 1.1184039683065002e-05, + "loss": 0.0051, + "step": 1420 + }, + { + "epoch": 0.2844, + "learning_rate": 1.1197902938367289e-05, + "loss": 0.0183, + "step": 1422 + }, + { + "epoch": 0.2848, + "learning_rate": 1.1211763858296516e-05, + "loss": 0.1667, + "step": 1424 + }, + { + "epoch": 0.2852, + "learning_rate": 1.122562241583006e-05, + "loss": 0.8326, + "step": 1426 + }, + { + "epoch": 0.2856, + "learning_rate": 1.1239478583950007e-05, + "loss": 0.6633, + "step": 1428 + }, + { + "epoch": 0.286, + "learning_rate": 1.1253332335643033e-05, + "loss": 0.1143, + "step": 1430 + }, + { + "epoch": 0.2864, + "learning_rate": 1.1267183643900534e-05, + "loss": 0.8248, + "step": 1432 + }, + { + "epoch": 0.2868, + "learning_rate": 1.1281032481718701e-05, + "loss": 0.3334, + "step": 1434 + }, + { + "epoch": 0.2872, + "learning_rate": 1.1294878822098456e-05, + "loss": 0.688, + "step": 1436 + }, + { + "epoch": 0.2876, + "learning_rate": 1.1308722638045725e-05, + "loss": 0.0002, + "step": 1438 + }, + { + "epoch": 0.288, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.1999, + "step": 1440 + }, + { + "epoch": 0.2884, + "learning_rate": 1.1336402588690725e-05, + "loss": 0.4211, + "step": 1442 + }, + { + "epoch": 0.2888, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.2229, + "step": 1444 + }, + { + "epoch": 0.2892, + "learning_rate": 1.1364072117799864e-05, + "loss": 0.3294, + "step": 1446 + }, + { + "epoch": 0.2896, + "learning_rate": 1.137790290684638e-05, + "loss": 0.0222, + "step": 1448 + }, + { + "epoch": 0.29, + "learning_rate": 1.1391731009600652e-05, + "loss": 0.2613, + "step": 1450 + }, + { + "epoch": 0.2904, + "learning_rate": 1.1405556399104108e-05, + "loss": 0.0328, + "step": 1452 + }, + { + "epoch": 0.2908, + "learning_rate": 1.141937904840344e-05, + "loss": 0.0032, + "step": 1454 + }, + { + "epoch": 0.2912, + "learning_rate": 1.143319893055069e-05, + "loss": 0.9533, + "step": 1456 + }, + { + "epoch": 0.2916, + "learning_rate": 1.1447016018603286e-05, + "loss": 1.0513, + "step": 1458 + }, + { + "epoch": 0.292, + "learning_rate": 1.1460830285624112e-05, + "loss": 0.0966, + "step": 1460 + }, + { + "epoch": 0.2924, + "learning_rate": 1.1474641704681541e-05, + "loss": 0.0022, + "step": 1462 + }, + { + "epoch": 0.2928, + "learning_rate": 1.1488450248849515e-05, + "loss": 0.0265, + "step": 1464 + }, + { + "epoch": 0.2932, + "learning_rate": 1.150225589120756e-05, + "loss": 0.3655, + "step": 1466 + }, + { + "epoch": 0.2936, + "learning_rate": 1.1516058604840881e-05, + "loss": 0.2195, + "step": 1468 + }, + { + "epoch": 0.294, + "learning_rate": 1.1529858362840388e-05, + "loss": 0.1955, + "step": 1470 + }, + { + "epoch": 0.2944, + "learning_rate": 1.15436551383027e-05, + "loss": 0.1557, + "step": 1472 + }, + { + "epoch": 0.2948, + "learning_rate": 1.1557448904330366e-05, + "loss": 0.0087, + "step": 1474 + }, + { + "epoch": 0.2952, + "learning_rate": 1.1571239634031666e-05, + "loss": 0.5158, + "step": 1476 + }, + { + "epoch": 0.2956, + "learning_rate": 1.158502730052093e-05, + "loss": 0.1127, + "step": 1478 + }, + { + "epoch": 0.296, + "learning_rate": 1.1598811876918352e-05, + "loss": 0.1585, + "step": 1480 + }, + { + "epoch": 0.2964, + "learning_rate": 1.161259333635019e-05, + "loss": 0.1027, + "step": 1482 + }, + { + "epoch": 0.2968, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.0037, + "step": 1484 + }, + { + "epoch": 0.2972, + "learning_rate": 1.1640146796852711e-05, + "loss": 0.1872, + "step": 1486 + }, + { + "epoch": 0.2976, + "learning_rate": 1.1653918744206478e-05, + "loss": 0.1601, + "step": 1488 + }, + { + "epoch": 0.298, + "learning_rate": 1.1667687467161021e-05, + "loss": 0.0876, + "step": 1490 + }, + { + "epoch": 0.2984, + "learning_rate": 1.1681452938873515e-05, + "loss": 0.0991, + "step": 1492 + }, + { + "epoch": 0.2988, + "learning_rate": 1.169521513250746e-05, + "loss": 0.3621, + "step": 1494 + }, + { + "epoch": 0.2992, + "learning_rate": 1.1708974021232763e-05, + "loss": 0.0559, + "step": 1496 + }, + { + "epoch": 0.2996, + "learning_rate": 1.1722729578225762e-05, + "loss": 0.1934, + "step": 1498 + }, + { + "epoch": 0.3, + "learning_rate": 1.1736481776669297e-05, + "loss": 0.1686, + "step": 1500 + }, + { + "epoch": 0.3004, + "learning_rate": 1.1750230589752753e-05, + "loss": 0.0005, + "step": 1502 + }, + { + "epoch": 0.3008, + "learning_rate": 1.1763975990672116e-05, + "loss": 0.0004, + "step": 1504 + }, + { + "epoch": 0.3012, + "learning_rate": 1.1777717952630023e-05, + "loss": 0.1552, + "step": 1506 + }, + { + "epoch": 0.3016, + "learning_rate": 1.1791456448835815e-05, + "loss": 0.2896, + "step": 1508 + }, + { + "epoch": 0.302, + "learning_rate": 1.180519145250561e-05, + "loss": 0.0517, + "step": 1510 + }, + { + "epoch": 0.3024, + "learning_rate": 1.1818922936862258e-05, + "loss": 0.2505, + "step": 1512 + }, + { + "epoch": 0.3028, + "learning_rate": 1.1832650875135606e-05, + "loss": 0.04, + "step": 1514 + }, + { + "epoch": 0.3032, + "learning_rate": 1.1846375240562274e-05, + "loss": 0.0003, + "step": 1516 + }, + { + "epoch": 0.3036, + "learning_rate": 1.1860096006385918e-05, + "loss": 0.0018, + "step": 1518 + }, + { + "epoch": 0.304, + "learning_rate": 1.187381314585725e-05, + "loss": 0.2525, + "step": 1520 + }, + { + "epoch": 0.3044, + "learning_rate": 1.1887526632233937e-05, + "loss": 0.1929, + "step": 1522 + }, + { + "epoch": 0.3048, + "learning_rate": 1.1901236438780906e-05, + "loss": 0.2593, + "step": 1524 + }, + { + "epoch": 0.3052, + "learning_rate": 1.191494253877013e-05, + "loss": 0.2432, + "step": 1526 + }, + { + "epoch": 0.3056, + "learning_rate": 1.192864490548089e-05, + "loss": 0.0054, + "step": 1528 + }, + { + "epoch": 0.306, + "learning_rate": 1.1942343512199719e-05, + "loss": 0.1082, + "step": 1530 + }, + { + "epoch": 0.3064, + "learning_rate": 1.195603833222048e-05, + "loss": 0.1605, + "step": 1532 + }, + { + "epoch": 0.3068, + "learning_rate": 1.1969729338844422e-05, + "loss": 0.0905, + "step": 1534 + }, + { + "epoch": 0.3072, + "learning_rate": 1.198341650538023e-05, + "loss": 0.0124, + "step": 1536 + }, + { + "epoch": 0.3076, + "learning_rate": 1.1997099805144066e-05, + "loss": 0.1955, + "step": 1538 + }, + { + "epoch": 0.308, + "learning_rate": 1.2010779211459642e-05, + "loss": 0.0863, + "step": 1540 + }, + { + "epoch": 0.3084, + "learning_rate": 1.2024454697658254e-05, + "loss": 0.1922, + "step": 1542 + }, + { + "epoch": 0.3088, + "learning_rate": 1.203812623707884e-05, + "loss": 0.0511, + "step": 1544 + }, + { + "epoch": 0.3092, + "learning_rate": 1.2051793803068054e-05, + "loss": 0.4359, + "step": 1546 + }, + { + "epoch": 0.3096, + "learning_rate": 1.2065457368980227e-05, + "loss": 0.0536, + "step": 1548 + }, + { + "epoch": 0.31, + "learning_rate": 1.20791169081776e-05, + "loss": 0.26, + "step": 1550 + }, + { + "epoch": 0.3104, + "learning_rate": 1.2092772394030141e-05, + "loss": 0.0021, + "step": 1552 + }, + { + "epoch": 0.3108, + "learning_rate": 1.210642379991583e-05, + "loss": 0.0985, + "step": 1554 + }, + { + "epoch": 0.3112, + "learning_rate": 1.2120071099220552e-05, + "loss": 0.016, + "step": 1556 + }, + { + "epoch": 0.3116, + "learning_rate": 1.2133714265338148e-05, + "loss": 0.0651, + "step": 1558 + }, + { + "epoch": 0.312, + "learning_rate": 1.2147353271670637e-05, + "loss": 0.3521, + "step": 1560 + }, + { + "epoch": 0.3124, + "learning_rate": 1.2160988091628006e-05, + "loss": 0.3376, + "step": 1562 + }, + { + "epoch": 0.3128, + "learning_rate": 1.217461869862855e-05, + "loss": 0.0014, + "step": 1564 + }, + { + "epoch": 0.3132, + "learning_rate": 1.2188245066098647e-05, + "loss": 0.097, + "step": 1566 + }, + { + "epoch": 0.3136, + "learning_rate": 1.2201867167473015e-05, + "loss": 0.2869, + "step": 1568 + }, + { + "epoch": 0.314, + "learning_rate": 1.2215484976194673e-05, + "loss": 0.2296, + "step": 1570 + }, + { + "epoch": 0.3144, + "learning_rate": 1.2229098465715002e-05, + "loss": 0.2613, + "step": 1572 + }, + { + "epoch": 0.3148, + "learning_rate": 1.2242707609493809e-05, + "loss": 0.3339, + "step": 1574 + }, + { + "epoch": 0.3152, + "learning_rate": 1.2256312380999373e-05, + "loss": 0.0151, + "step": 1576 + }, + { + "epoch": 0.3156, + "learning_rate": 1.2269912753708496e-05, + "loss": 0.2674, + "step": 1578 + }, + { + "epoch": 0.316, + "learning_rate": 1.2283508701106552e-05, + "loss": 0.186, + "step": 1580 + }, + { + "epoch": 0.3164, + "learning_rate": 1.229710019668755e-05, + "loss": 0.0892, + "step": 1582 + }, + { + "epoch": 0.3168, + "learning_rate": 1.2310687213954173e-05, + "loss": 0.1004, + "step": 1584 + }, + { + "epoch": 0.3172, + "learning_rate": 1.232426972641785e-05, + "loss": 0.3077, + "step": 1586 + }, + { + "epoch": 0.3176, + "learning_rate": 1.233784770759873e-05, + "loss": 0.2451, + "step": 1588 + }, + { + "epoch": 0.318, + "learning_rate": 1.2351421131025891e-05, + "loss": 0.6273, + "step": 1590 + }, + { + "epoch": 0.3184, + "learning_rate": 1.2364989970237238e-05, + "loss": 0.1991, + "step": 1592 + }, + { + "epoch": 0.3188, + "learning_rate": 1.237855419877962e-05, + "loss": 0.06, + "step": 1594 + }, + { + "epoch": 0.3192, + "learning_rate": 1.23921137902089e-05, + "loss": 0.5316, + "step": 1596 + }, + { + "epoch": 0.3196, + "learning_rate": 1.2405668718089906e-05, + "loss": 0.7097, + "step": 1598 + }, + { + "epoch": 0.32, + "learning_rate": 1.241921895599668e-05, + "loss": 0.0813, + "step": 1600 + }, + { + "epoch": 0.3204, + "learning_rate": 1.2432764477512295e-05, + "loss": 0.2381, + "step": 1602 + }, + { + "epoch": 0.3208, + "learning_rate": 1.2446305256229076e-05, + "loss": 0.1915, + "step": 1604 + }, + { + "epoch": 0.3212, + "learning_rate": 1.2459841265748582e-05, + "loss": 0.0188, + "step": 1606 + }, + { + "epoch": 0.3216, + "learning_rate": 1.2473372479681653e-05, + "loss": 0.0004, + "step": 1608 + }, + { + "epoch": 0.322, + "learning_rate": 1.2486898871648547e-05, + "loss": 0.4589, + "step": 1610 + }, + { + "epoch": 0.3224, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.4307, + "step": 1612 + }, + { + "epoch": 0.3228, + "learning_rate": 1.2513937084211546e-05, + "loss": 0.4207, + "step": 1614 + }, + { + "epoch": 0.3232, + "learning_rate": 1.2527448852095292e-05, + "loss": 0.1338, + "step": 1616 + }, + { + "epoch": 0.3236, + "learning_rate": 1.2540955692588167e-05, + "loss": 0.0065, + "step": 1618 + }, + { + "epoch": 0.324, + "learning_rate": 1.2554457579357902e-05, + "loss": 0.0025, + "step": 1620 + }, + { + "epoch": 0.3244, + "learning_rate": 1.2567954486081873e-05, + "loss": 0.0762, + "step": 1622 + }, + { + "epoch": 0.3248, + "learning_rate": 1.2581446386447171e-05, + "loss": 0.0045, + "step": 1624 + }, + { + "epoch": 0.3252, + "learning_rate": 1.2594933254150647e-05, + "loss": 0.1116, + "step": 1626 + }, + { + "epoch": 0.3256, + "learning_rate": 1.2608415062898963e-05, + "loss": 0.0047, + "step": 1628 + }, + { + "epoch": 0.326, + "learning_rate": 1.262189178640864e-05, + "loss": 0.0062, + "step": 1630 + }, + { + "epoch": 0.3264, + "learning_rate": 1.2635363398406133e-05, + "loss": 0.2501, + "step": 1632 + }, + { + "epoch": 0.3268, + "learning_rate": 1.2648829872627797e-05, + "loss": 0.0004, + "step": 1634 + }, + { + "epoch": 0.3272, + "learning_rate": 1.266229118282012e-05, + "loss": 0.0059, + "step": 1636 + }, + { + "epoch": 0.3276, + "learning_rate": 1.2675747302739516e-05, + "loss": 0.3509, + "step": 1638 + }, + { + "epoch": 0.328, + "learning_rate": 1.2689198206152644e-05, + "loss": 0.7458, + "step": 1640 + }, + { + "epoch": 0.3284, + "learning_rate": 1.2702643866836281e-05, + "loss": 0.2254, + "step": 1642 + }, + { + "epoch": 0.3288, + "learning_rate": 1.2716084258577373e-05, + "loss": 0.1471, + "step": 1644 + }, + { + "epoch": 0.3292, + "learning_rate": 1.2729519355173254e-05, + "loss": 0.0083, + "step": 1646 + }, + { + "epoch": 0.3296, + "learning_rate": 1.2742949130431468e-05, + "loss": 0.108, + "step": 1648 + }, + { + "epoch": 0.33, + "learning_rate": 1.2756373558169992e-05, + "loss": 0.1987, + "step": 1650 + }, + { + "epoch": 0.3304, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.0968, + "step": 1652 + }, + { + "epoch": 0.3308, + "learning_rate": 1.2783206266412028e-05, + "loss": 0.8134, + "step": 1654 + }, + { + "epoch": 0.3312, + "learning_rate": 1.2796614494603795e-05, + "loss": 0.4248, + "step": 1656 + }, + { + "epoch": 0.3316, + "learning_rate": 1.2810017270652508e-05, + "loss": 0.3208, + "step": 1658 + }, + { + "epoch": 0.332, + "learning_rate": 1.282341456842876e-05, + "loss": 0.2019, + "step": 1660 + }, + { + "epoch": 0.3324, + "learning_rate": 1.283680636181384e-05, + "loss": 0.1063, + "step": 1662 + }, + { + "epoch": 0.3328, + "learning_rate": 1.2850192624699756e-05, + "loss": 0.1846, + "step": 1664 + }, + { + "epoch": 0.3332, + "learning_rate": 1.2863573330989308e-05, + "loss": 0.4586, + "step": 1666 + }, + { + "epoch": 0.3336, + "learning_rate": 1.2876948454596122e-05, + "loss": 0.3844, + "step": 1668 + }, + { + "epoch": 0.334, + "learning_rate": 1.2890317969444708e-05, + "loss": 0.0084, + "step": 1670 + }, + { + "epoch": 0.3344, + "learning_rate": 1.2903681849470535e-05, + "loss": 0.0711, + "step": 1672 + }, + { + "epoch": 0.3348, + "learning_rate": 1.291704006861998e-05, + "loss": 0.1587, + "step": 1674 + }, + { + "epoch": 0.3352, + "learning_rate": 1.2930392600850565e-05, + "loss": 0.0019, + "step": 1676 + }, + { + "epoch": 0.3356, + "learning_rate": 1.2943739420130843e-05, + "loss": 0.0452, + "step": 1678 + }, + { + "epoch": 0.336, + "learning_rate": 1.2957080500440455e-05, + "loss": 0.1443, + "step": 1680 + }, + { + "epoch": 0.3364, + "learning_rate": 1.2970415815770353e-05, + "loss": 0.1464, + "step": 1682 + }, + { + "epoch": 0.3368, + "learning_rate": 1.2983745340122589e-05, + "loss": 1.4535, + "step": 1684 + }, + { + "epoch": 0.3372, + "learning_rate": 1.299706904751064e-05, + "loss": 0.1093, + "step": 1686 + }, + { + "epoch": 0.3376, + "learning_rate": 1.3010386911959205e-05, + "loss": 0.0698, + "step": 1688 + }, + { + "epoch": 0.338, + "learning_rate": 1.3023698907504447e-05, + "loss": 0.0001, + "step": 1690 + }, + { + "epoch": 0.3384, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.0539, + "step": 1692 + }, + { + "epoch": 0.3388, + "learning_rate": 1.3050305188086757e-05, + "loss": 0.2448, + "step": 1694 + }, + { + "epoch": 0.3392, + "learning_rate": 1.3063599421253556e-05, + "loss": 0.0108, + "step": 1696 + }, + { + "epoch": 0.3396, + "learning_rate": 1.3076887681776504e-05, + "loss": 0.8424, + "step": 1698 + }, + { + "epoch": 0.34, + "learning_rate": 1.309016994374947e-05, + "loss": 0.357, + "step": 1700 + }, + { + "epoch": 0.3404, + "learning_rate": 1.310344618127801e-05, + "loss": 0.0002, + "step": 1702 + }, + { + "epoch": 0.3408, + "learning_rate": 1.3116716368479415e-05, + "loss": 0.4574, + "step": 1704 + }, + { + "epoch": 0.3412, + "learning_rate": 1.3129980479482776e-05, + "loss": 0.1724, + "step": 1706 + }, + { + "epoch": 0.3416, + "learning_rate": 1.3143238488429049e-05, + "loss": 0.0539, + "step": 1708 + }, + { + "epoch": 0.342, + "learning_rate": 1.3156490369471018e-05, + "loss": 0.5267, + "step": 1710 + }, + { + "epoch": 0.3424, + "learning_rate": 1.316973609677351e-05, + "loss": 0.001, + "step": 1712 + }, + { + "epoch": 0.3428, + "learning_rate": 1.3182975644513286e-05, + "loss": 0.001, + "step": 1714 + }, + { + "epoch": 0.3432, + "learning_rate": 1.319620898687917e-05, + "loss": 0.2299, + "step": 1716 + }, + { + "epoch": 0.3436, + "learning_rate": 1.3209436098072102e-05, + "loss": 1.8413, + "step": 1718 + }, + { + "epoch": 0.344, + "learning_rate": 1.32226569523051e-05, + "loss": 0.4659, + "step": 1720 + }, + { + "epoch": 0.3444, + "learning_rate": 1.3235871523803501e-05, + "loss": 0.1506, + "step": 1722 + }, + { + "epoch": 0.3448, + "learning_rate": 1.324907978680475e-05, + "loss": 0.2483, + "step": 1724 + }, + { + "epoch": 0.3452, + "learning_rate": 1.3262281715558738e-05, + "loss": 0.1055, + "step": 1726 + }, + { + "epoch": 0.3456, + "learning_rate": 1.3275477284327572e-05, + "loss": 0.2658, + "step": 1728 + }, + { + "epoch": 0.346, + "learning_rate": 1.3288666467385815e-05, + "loss": 0.0602, + "step": 1730 + }, + { + "epoch": 0.3464, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.3439, + "step": 1732 + }, + { + "epoch": 0.3468, + "learning_rate": 1.3315025573531193e-05, + "loss": 0.1362, + "step": 1734 + }, + { + "epoch": 0.3472, + "learning_rate": 1.3328195445229865e-05, + "loss": 0.3703, + "step": 1736 + }, + { + "epoch": 0.3476, + "learning_rate": 1.3341358828441214e-05, + "loss": 0.5492, + "step": 1738 + }, + { + "epoch": 0.348, + "learning_rate": 1.3354515697502548e-05, + "loss": 0.0006, + "step": 1740 + }, + { + "epoch": 0.3484, + "learning_rate": 1.3367666026763879e-05, + "loss": 0.5678, + "step": 1742 + }, + { + "epoch": 0.3488, + "learning_rate": 1.338080979058797e-05, + "loss": 0.3923, + "step": 1744 + }, + { + "epoch": 0.3492, + "learning_rate": 1.3393946963350378e-05, + "loss": 0.0038, + "step": 1746 + }, + { + "epoch": 0.3496, + "learning_rate": 1.340707751943951e-05, + "loss": 0.1914, + "step": 1748 + }, + { + "epoch": 0.35, + "learning_rate": 1.3420201433256682e-05, + "loss": 0.1758, + "step": 1750 + }, + { + "epoch": 0.3504, + "learning_rate": 1.3433318679216145e-05, + "loss": 0.2335, + "step": 1752 + }, + { + "epoch": 0.3508, + "learning_rate": 1.3446429231745162e-05, + "loss": 0.2089, + "step": 1754 + }, + { + "epoch": 0.3512, + "learning_rate": 1.3459533065284039e-05, + "loss": 0.0194, + "step": 1756 + }, + { + "epoch": 0.3516, + "learning_rate": 1.3472630154286197e-05, + "loss": 0.0382, + "step": 1758 + }, + { + "epoch": 0.352, + "learning_rate": 1.348572047321814e-05, + "loss": 0.0366, + "step": 1760 + }, + { + "epoch": 0.3524, + "learning_rate": 1.3498803996559692e-05, + "loss": 0.1099, + "step": 1762 + }, + { + "epoch": 0.3528, + "learning_rate": 1.3511880698803803e-05, + "loss": 0.0029, + "step": 1764 + }, + { + "epoch": 0.3532, + "learning_rate": 1.3524950554456773e-05, + "loss": 0.001, + "step": 1766 + }, + { + "epoch": 0.3536, + "learning_rate": 1.3538013538038296e-05, + "loss": 0.2079, + "step": 1768 + }, + { + "epoch": 0.354, + "learning_rate": 1.3551069624081356e-05, + "loss": 0.0029, + "step": 1770 + }, + { + "epoch": 0.3544, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.0543, + "step": 1772 + }, + { + "epoch": 0.3548, + "learning_rate": 1.3577161001751692e-05, + "loss": 0.1046, + "step": 1774 + }, + { + "epoch": 0.3552, + "learning_rate": 1.3590196242512461e-05, + "loss": 0.1267, + "step": 1776 + }, + { + "epoch": 0.3556, + "learning_rate": 1.3603224484001944e-05, + "loss": 0.1112, + "step": 1778 + }, + { + "epoch": 0.356, + "learning_rate": 1.361624570082092e-05, + "loss": 0.4164, + "step": 1780 + }, + { + "epoch": 0.3564, + "learning_rate": 1.362925986758386e-05, + "loss": 0.0144, + "step": 1782 + }, + { + "epoch": 0.3568, + "learning_rate": 1.364226695891898e-05, + "loss": 0.003, + "step": 1784 + }, + { + "epoch": 0.3572, + "learning_rate": 1.3655266949468287e-05, + "loss": 0.1152, + "step": 1786 + }, + { + "epoch": 0.3576, + "learning_rate": 1.3668259813887637e-05, + "loss": 0.3649, + "step": 1788 + }, + { + "epoch": 0.358, + "learning_rate": 1.3681245526846773e-05, + "loss": 0.0387, + "step": 1790 + }, + { + "epoch": 0.3584, + "learning_rate": 1.3694224063029386e-05, + "loss": 0.0087, + "step": 1792 + }, + { + "epoch": 0.3588, + "learning_rate": 1.3707195397133176e-05, + "loss": 0.0004, + "step": 1794 + }, + { + "epoch": 0.3592, + "learning_rate": 1.3720159503869806e-05, + "loss": 0.148, + "step": 1796 + }, + { + "epoch": 0.3596, + "learning_rate": 1.3733116357965156e-05, + "loss": 0.2126, + "step": 1798 + }, + { + "epoch": 0.36, + "learning_rate": 1.374606593415911e-05, + "loss": 0.0053, + "step": 1800 + }, + { + "epoch": 0.3604, + "learning_rate": 1.3759008207205855e-05, + "loss": 0.0004, + "step": 1802 + }, + { + "epoch": 0.3608, + "learning_rate": 1.377194315187377e-05, + "loss": 0.0659, + "step": 1804 + }, + { + "epoch": 0.3612, + "learning_rate": 1.3784870742945468e-05, + "loss": 0.2213, + "step": 1806 + }, + { + "epoch": 0.3616, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.3311, + "step": 1808 + }, + { + "epoch": 0.362, + "learning_rate": 1.3810703763502744e-05, + "loss": 0.301, + "step": 1810 + }, + { + "epoch": 0.3624, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.3071, + "step": 1812 + }, + { + "epoch": 0.3628, + "learning_rate": 1.3836507067426563e-05, + "loss": 0.175, + "step": 1814 + }, + { + "epoch": 0.3632, + "learning_rate": 1.3849397512760793e-05, + "loss": 0.0003, + "step": 1816 + }, + { + "epoch": 0.3636, + "learning_rate": 1.38622804534976e-05, + "loss": 0.0073, + "step": 1818 + }, + { + "epoch": 0.364, + "learning_rate": 1.3875155864521027e-05, + "loss": 0.3194, + "step": 1820 + }, + { + "epoch": 0.3644, + "learning_rate": 1.3888023720729806e-05, + "loss": 0.1528, + "step": 1822 + }, + { + "epoch": 0.3648, + "learning_rate": 1.3900883997037393e-05, + "loss": 0.1415, + "step": 1824 + }, + { + "epoch": 0.3652, + "learning_rate": 1.391373666837202e-05, + "loss": 0.4512, + "step": 1826 + }, + { + "epoch": 0.3656, + "learning_rate": 1.3926581709676746e-05, + "loss": 0.0982, + "step": 1828 + }, + { + "epoch": 0.366, + "learning_rate": 1.3939419095909506e-05, + "loss": 0.0043, + "step": 1830 + }, + { + "epoch": 0.3664, + "learning_rate": 1.3952248802043158e-05, + "loss": 0.2476, + "step": 1832 + }, + { + "epoch": 0.3668, + "learning_rate": 1.396507080306555e-05, + "loss": 0.0001, + "step": 1834 + }, + { + "epoch": 0.3672, + "learning_rate": 1.397788507397949e-05, + "loss": 0.3758, + "step": 1836 + }, + { + "epoch": 0.3676, + "learning_rate": 1.3990691589802943e-05, + "loss": 0.0146, + "step": 1838 + }, + { + "epoch": 0.368, + "learning_rate": 1.4003490325568956e-05, + "loss": 0.2143, + "step": 1840 + }, + { + "epoch": 0.3684, + "learning_rate": 1.4016281256325688e-05, + "loss": 0.0009, + "step": 1842 + }, + { + "epoch": 0.3688, + "learning_rate": 1.4029064357136632e-05, + "loss": 0.0227, + "step": 1844 + }, + { + "epoch": 0.3692, + "learning_rate": 1.4041839603080411e-05, + "loss": 0.0026, + "step": 1846 + }, + { + "epoch": 0.3696, + "learning_rate": 1.4054606969251096e-05, + "loss": 0.1697, + "step": 1848 + }, + { + "epoch": 0.37, + "learning_rate": 1.4067366430758004e-05, + "loss": 0.0002, + "step": 1850 + }, + { + "epoch": 0.3704, + "learning_rate": 1.4080117962725929e-05, + "loss": 0.2149, + "step": 1852 + }, + { + "epoch": 0.3708, + "learning_rate": 1.4092861540295107e-05, + "loss": 0.0941, + "step": 1854 + }, + { + "epoch": 0.3712, + "learning_rate": 1.4105597138621281e-05, + "loss": 0.0052, + "step": 1856 + }, + { + "epoch": 0.3716, + "learning_rate": 1.411832473287575e-05, + "loss": 0.5806, + "step": 1858 + }, + { + "epoch": 0.372, + "learning_rate": 1.4131044298245416e-05, + "loss": 0.1222, + "step": 1860 + }, + { + "epoch": 0.3724, + "learning_rate": 1.414375580993284e-05, + "loss": 0.0033, + "step": 1862 + }, + { + "epoch": 0.3728, + "learning_rate": 1.4156459243156275e-05, + "loss": 0.0307, + "step": 1864 + }, + { + "epoch": 0.3732, + "learning_rate": 1.416915457314973e-05, + "loss": 0.0002, + "step": 1866 + }, + { + "epoch": 0.3736, + "learning_rate": 1.418184177516301e-05, + "loss": 0.0746, + "step": 1868 + }, + { + "epoch": 0.374, + "learning_rate": 1.4194520824461782e-05, + "loss": 0.0004, + "step": 1870 + }, + { + "epoch": 0.3744, + "learning_rate": 1.420719169632754e-05, + "loss": 0.032, + "step": 1872 + }, + { + "epoch": 0.3748, + "learning_rate": 1.4219854366057821e-05, + "loss": 0.1087, + "step": 1874 + }, + { + "epoch": 0.3752, + "learning_rate": 1.4232508808966085e-05, + "loss": 0.044, + "step": 1876 + }, + { + "epoch": 0.3756, + "learning_rate": 1.424515500038185e-05, + "loss": 0.0019, + "step": 1878 + }, + { + "epoch": 0.376, + "learning_rate": 1.4257792915650735e-05, + "loss": 0.1415, + "step": 1880 + }, + { + "epoch": 0.3764, + "learning_rate": 1.4270422530134425e-05, + "loss": 0.0001, + "step": 1882 + }, + { + "epoch": 0.3768, + "learning_rate": 1.4283043819210906e-05, + "loss": 0.0369, + "step": 1884 + }, + { + "epoch": 0.3772, + "learning_rate": 1.4295656758274288e-05, + "loss": 0.0466, + "step": 1886 + }, + { + "epoch": 0.3776, + "learning_rate": 1.430826132273499e-05, + "loss": 0.021, + "step": 1888 + }, + { + "epoch": 0.378, + "learning_rate": 1.4320857488019826e-05, + "loss": 0.5224, + "step": 1890 + }, + { + "epoch": 0.3784, + "learning_rate": 1.4333445229571857e-05, + "loss": 0.5052, + "step": 1892 + }, + { + "epoch": 0.3788, + "learning_rate": 1.4346024522850704e-05, + "loss": 0.0175, + "step": 1894 + }, + { + "epoch": 0.3792, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.1539, + "step": 1896 + }, + { + "epoch": 0.3796, + "learning_rate": 1.437115766650933e-05, + "loss": 0.2584, + "step": 1898 + }, + { + "epoch": 0.38, + "learning_rate": 1.4383711467890772e-05, + "loss": 0.2302, + "step": 1900 + }, + { + "epoch": 0.3804, + "learning_rate": 1.4396256723002398e-05, + "loss": 0.4257, + "step": 1902 + }, + { + "epoch": 0.3808, + "learning_rate": 1.4408793407386584e-05, + "loss": 0.0681, + "step": 1904 + }, + { + "epoch": 0.3812, + "learning_rate": 1.4421321496602423e-05, + "loss": 0.5899, + "step": 1906 + }, + { + "epoch": 0.3816, + "learning_rate": 1.4433840966225767e-05, + "loss": 0.0093, + "step": 1908 + }, + { + "epoch": 0.382, + "learning_rate": 1.444635179184927e-05, + "loss": 0.0019, + "step": 1910 + }, + { + "epoch": 0.3824, + "learning_rate": 1.4458853949082434e-05, + "loss": 0.1605, + "step": 1912 + }, + { + "epoch": 0.3828, + "learning_rate": 1.4471347413551665e-05, + "loss": 0.4612, + "step": 1914 + }, + { + "epoch": 0.3832, + "learning_rate": 1.4483832160900332e-05, + "loss": 0.0002, + "step": 1916 + }, + { + "epoch": 0.3836, + "learning_rate": 1.4496308166788731e-05, + "loss": 0.0724, + "step": 1918 + }, + { + "epoch": 0.384, + "learning_rate": 1.4508775406894315e-05, + "loss": 0.0002, + "step": 1920 + }, + { + "epoch": 0.3844, + "learning_rate": 1.4521233856911499e-05, + "loss": 0.2276, + "step": 1922 + }, + { + "epoch": 0.3848, + "learning_rate": 1.4533683492551942e-05, + "loss": 0.0005, + "step": 1924 + }, + { + "epoch": 0.3852, + "learning_rate": 1.4546124289544446e-05, + "loss": 0.3479, + "step": 1926 + }, + { + "epoch": 0.3856, + "learning_rate": 1.4558556223634988e-05, + "loss": 0.0141, + "step": 1928 + }, + { + "epoch": 0.386, + "learning_rate": 1.4570979270586944e-05, + "loss": 0.0001, + "step": 1930 + }, + { + "epoch": 0.3864, + "learning_rate": 1.4583393406180886e-05, + "loss": 0.0102, + "step": 1932 + }, + { + "epoch": 0.3868, + "learning_rate": 1.4595798606214882e-05, + "loss": 0.5521, + "step": 1934 + }, + { + "epoch": 0.3872, + "learning_rate": 1.460819484650431e-05, + "loss": 0.001, + "step": 1936 + }, + { + "epoch": 0.3876, + "learning_rate": 1.4620582102882086e-05, + "loss": 0.1324, + "step": 1938 + }, + { + "epoch": 0.388, + "learning_rate": 1.4632960351198618e-05, + "loss": 0.5086, + "step": 1940 + }, + { + "epoch": 0.3884, + "learning_rate": 1.4645329567321875e-05, + "loss": 0.1898, + "step": 1942 + }, + { + "epoch": 0.3888, + "learning_rate": 1.4657689727137441e-05, + "loss": 0.7632, + "step": 1944 + }, + { + "epoch": 0.3892, + "learning_rate": 1.4670040806548551e-05, + "loss": 2.0318, + "step": 1946 + }, + { + "epoch": 0.3896, + "learning_rate": 1.468238278147614e-05, + "loss": 0.0005, + "step": 1948 + }, + { + "epoch": 0.39, + "learning_rate": 1.4694715627858904e-05, + "loss": 0.0731, + "step": 1950 + }, + { + "epoch": 0.3904, + "learning_rate": 1.470703932165332e-05, + "loss": 0.0018, + "step": 1952 + }, + { + "epoch": 0.3908, + "learning_rate": 1.471935383883372e-05, + "loss": 0.0485, + "step": 1954 + }, + { + "epoch": 0.3912, + "learning_rate": 1.4731659155392339e-05, + "loss": 0.5016, + "step": 1956 + }, + { + "epoch": 0.3916, + "learning_rate": 1.4743955247339286e-05, + "loss": 0.1003, + "step": 1958 + }, + { + "epoch": 0.392, + "learning_rate": 1.4756242090702744e-05, + "loss": 0.4768, + "step": 1960 + }, + { + "epoch": 0.3924, + "learning_rate": 1.476851966152887e-05, + "loss": 0.0125, + "step": 1962 + }, + { + "epoch": 0.3928, + "learning_rate": 1.4780787935881913e-05, + "loss": 0.1701, + "step": 1964 + }, + { + "epoch": 0.3932, + "learning_rate": 1.4793046889844255e-05, + "loss": 0.2448, + "step": 1966 + }, + { + "epoch": 0.3936, + "learning_rate": 1.4805296499516397e-05, + "loss": 0.3777, + "step": 1968 + }, + { + "epoch": 0.394, + "learning_rate": 1.4817536741017155e-05, + "loss": 0.0525, + "step": 1970 + }, + { + "epoch": 0.3944, + "learning_rate": 1.482976759048351e-05, + "loss": 0.2804, + "step": 1972 + }, + { + "epoch": 0.3948, + "learning_rate": 1.4841989024070809e-05, + "loss": 0.1892, + "step": 1974 + }, + { + "epoch": 0.3952, + "learning_rate": 1.485420101795274e-05, + "loss": 0.3512, + "step": 1976 + }, + { + "epoch": 0.3956, + "learning_rate": 1.4866403548321385e-05, + "loss": 0.001, + "step": 1978 + }, + { + "epoch": 0.396, + "learning_rate": 1.4878596591387327e-05, + "loss": 0.3884, + "step": 1980 + }, + { + "epoch": 0.3964, + "learning_rate": 1.4890780123379563e-05, + "loss": 0.0917, + "step": 1982 + }, + { + "epoch": 0.3968, + "learning_rate": 1.4902954120545686e-05, + "loss": 0.2166, + "step": 1984 + }, + { + "epoch": 0.3972, + "learning_rate": 1.491511855915187e-05, + "loss": 0.1882, + "step": 1986 + }, + { + "epoch": 0.3976, + "learning_rate": 1.4927273415482913e-05, + "loss": 0.0059, + "step": 1988 + }, + { + "epoch": 0.398, + "learning_rate": 1.4939418665842307e-05, + "loss": 0.0083, + "step": 1990 + }, + { + "epoch": 0.3984, + "learning_rate": 1.4951554286552261e-05, + "loss": 0.3124, + "step": 1992 + }, + { + "epoch": 0.3988, + "learning_rate": 1.4963680253953763e-05, + "loss": 0.0396, + "step": 1994 + }, + { + "epoch": 0.3992, + "learning_rate": 1.4975796544406617e-05, + "loss": 0.0073, + "step": 1996 + }, + { + "epoch": 0.3996, + "learning_rate": 1.49879031342895e-05, + "loss": 1.1271, + "step": 1998 + }, + { + "epoch": 0.4, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.0516, + "step": 2000 + }, + { + "epoch": 0.4004, + "learning_rate": 1.501208711795465e-05, + "loss": 0.1018, + "step": 2002 + }, + { + "epoch": 0.4008, + "learning_rate": 1.502416446458897e-05, + "loss": 0.0006, + "step": 2004 + }, + { + "epoch": 0.4012, + "learning_rate": 1.5036232016357613e-05, + "loss": 0.2438, + "step": 2006 + }, + { + "epoch": 0.4016, + "learning_rate": 1.5048289749734206e-05, + "loss": 0.6706, + "step": 2008 + }, + { + "epoch": 0.402, + "learning_rate": 1.5060337641211642e-05, + "loss": 0.1884, + "step": 2010 + }, + { + "epoch": 0.4024, + "learning_rate": 1.5072375667301895e-05, + "loss": 0.0202, + "step": 2012 + }, + { + "epoch": 0.4028, + "learning_rate": 1.5084403804536214e-05, + "loss": 0.0994, + "step": 2014 + }, + { + "epoch": 0.4032, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.023, + "step": 2016 + }, + { + "epoch": 0.4036, + "learning_rate": 1.5108430318658597e-05, + "loss": 0.2269, + "step": 2018 + }, + { + "epoch": 0.404, + "learning_rate": 1.5120428648705714e-05, + "loss": 0.1527, + "step": 2020 + }, + { + "epoch": 0.4044, + "learning_rate": 1.513241699621517e-05, + "loss": 0.035, + "step": 2022 + }, + { + "epoch": 0.4048, + "learning_rate": 1.5144395337815064e-05, + "loss": 0.046, + "step": 2024 + }, + { + "epoch": 0.4052, + "learning_rate": 1.5156363650153008e-05, + "loss": 0.0264, + "step": 2026 + }, + { + "epoch": 0.4056, + "learning_rate": 1.5168321909896166e-05, + "loss": 0.0188, + "step": 2028 + }, + { + "epoch": 0.406, + "learning_rate": 1.51802700937313e-05, + "loss": 0.4989, + "step": 2030 + }, + { + "epoch": 0.4064, + "learning_rate": 1.5192208178364808e-05, + "loss": 0.4955, + "step": 2032 + }, + { + "epoch": 0.4068, + "learning_rate": 1.5204136140522792e-05, + "loss": 0.0179, + "step": 2034 + }, + { + "epoch": 0.4072, + "learning_rate": 1.521605395695107e-05, + "loss": 0.1132, + "step": 2036 + }, + { + "epoch": 0.4076, + "learning_rate": 1.522796160441526e-05, + "loss": 0.3492, + "step": 2038 + }, + { + "epoch": 0.408, + "learning_rate": 1.5239859059700784e-05, + "loss": 0.2873, + "step": 2040 + }, + { + "epoch": 0.4084, + "learning_rate": 1.5251746299612964e-05, + "loss": 0.1159, + "step": 2042 + }, + { + "epoch": 0.4088, + "learning_rate": 1.526362330097697e-05, + "loss": 0.3475, + "step": 2044 + }, + { + "epoch": 0.4092, + "learning_rate": 1.5275490040638044e-05, + "loss": 0.1162, + "step": 2046 + }, + { + "epoch": 0.4096, + "learning_rate": 1.5287346495461322e-05, + "loss": 0.09, + "step": 2048 + }, + { + "epoch": 0.41, + "learning_rate": 1.529919264233204e-05, + "loss": 0.4911, + "step": 2050 + }, + { + "epoch": 0.4104, + "learning_rate": 1.531102845815557e-05, + "loss": 0.0196, + "step": 2052 + }, + { + "epoch": 0.4108, + "learning_rate": 1.5322853919857327e-05, + "loss": 0.2975, + "step": 2054 + }, + { + "epoch": 0.4112, + "learning_rate": 1.5334669004383025e-05, + "loss": 0.2079, + "step": 2056 + }, + { + "epoch": 0.4116, + "learning_rate": 1.5346473688698514e-05, + "loss": 0.2454, + "step": 2058 + }, + { + "epoch": 0.412, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.0019, + "step": 2060 + }, + { + "epoch": 0.4124, + "learning_rate": 1.537005176466387e-05, + "loss": 0.2564, + "step": 2062 + }, + { + "epoch": 0.4128, + "learning_rate": 1.5381825110347072e-05, + "loss": 0.0166, + "step": 2064 + }, + { + "epoch": 0.4132, + "learning_rate": 1.539358796388683e-05, + "loss": 0.1701, + "step": 2066 + }, + { + "epoch": 0.4136, + "learning_rate": 1.540534030235087e-05, + "loss": 0.1491, + "step": 2068 + }, + { + "epoch": 0.414, + "learning_rate": 1.5417082102827397e-05, + "loss": 0.1132, + "step": 2070 + }, + { + "epoch": 0.4144, + "learning_rate": 1.542881334242517e-05, + "loss": 0.0988, + "step": 2072 + }, + { + "epoch": 0.4148, + "learning_rate": 1.5440533998273542e-05, + "loss": 0.0004, + "step": 2074 + }, + { + "epoch": 0.4152, + "learning_rate": 1.5452244047522493e-05, + "loss": 0.5309, + "step": 2076 + }, + { + "epoch": 0.4156, + "learning_rate": 1.54639434673427e-05, + "loss": 0.2318, + "step": 2078 + }, + { + "epoch": 0.416, + "learning_rate": 1.5475632234925495e-05, + "loss": 0.2985, + "step": 2080 + }, + { + "epoch": 0.4164, + "learning_rate": 1.548731032748309e-05, + "loss": 0.1891, + "step": 2082 + }, + { + "epoch": 0.4168, + "learning_rate": 1.5498977722248388e-05, + "loss": 0.0085, + "step": 2084 + }, + { + "epoch": 0.4172, + "learning_rate": 1.551063439647525e-05, + "loss": 0.6682, + "step": 2086 + }, + { + "epoch": 0.4176, + "learning_rate": 1.552228032743839e-05, + "loss": 0.1792, + "step": 2088 + }, + { + "epoch": 0.418, + "learning_rate": 1.553391549243343e-05, + "loss": 0.2566, + "step": 2090 + }, + { + "epoch": 0.4184, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.3413, + "step": 2092 + }, + { + "epoch": 0.4188, + "learning_rate": 1.5557153433806954e-05, + "loss": 0.5119, + "step": 2094 + }, + { + "epoch": 0.4192, + "learning_rate": 1.556875616488188e-05, + "loss": 0.3296, + "step": 2096 + }, + { + "epoch": 0.4196, + "learning_rate": 1.55803480393817e-05, + "loss": 0.0326, + "step": 2098 + }, + { + "epoch": 0.42, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.0004, + "step": 2100 + }, + { + "epoch": 0.4204, + "learning_rate": 1.5603499128281447e-05, + "loss": 0.183, + "step": 2102 + }, + { + "epoch": 0.4208, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.0215, + "step": 2104 + }, + { + "epoch": 0.4212, + "learning_rate": 1.5626606519969366e-05, + "loss": 0.5745, + "step": 2106 + }, + { + "epoch": 0.4216, + "learning_rate": 1.5638143773034268e-05, + "loss": 0.0317, + "step": 2108 + }, + { + "epoch": 0.422, + "learning_rate": 1.5649670034249376e-05, + "loss": 0.0008, + "step": 2110 + }, + { + "epoch": 0.4224, + "learning_rate": 1.5661185281143663e-05, + "loss": 0.2247, + "step": 2112 + }, + { + "epoch": 0.4228, + "learning_rate": 1.5672689491267562e-05, + "loss": 0.2023, + "step": 2114 + }, + { + "epoch": 0.4232, + "learning_rate": 1.5684182642193024e-05, + "loss": 0.0004, + "step": 2116 + }, + { + "epoch": 0.4236, + "learning_rate": 1.5695664711513582e-05, + "loss": 0.3457, + "step": 2118 + }, + { + "epoch": 0.424, + "learning_rate": 1.5707135676844312e-05, + "loss": 0.0014, + "step": 2120 + }, + { + "epoch": 0.4244, + "learning_rate": 1.5718595515822016e-05, + "loss": 0.3771, + "step": 2122 + }, + { + "epoch": 0.4248, + "learning_rate": 1.5730044206105146e-05, + "loss": 0.0113, + "step": 2124 + }, + { + "epoch": 0.4252, + "learning_rate": 1.574148172537389e-05, + "loss": 0.4792, + "step": 2126 + }, + { + "epoch": 0.4256, + "learning_rate": 1.5752908051330232e-05, + "loss": 0.0274, + "step": 2128 + }, + { + "epoch": 0.426, + "learning_rate": 1.5764323161697923e-05, + "loss": 0.2222, + "step": 2130 + }, + { + "epoch": 0.4264, + "learning_rate": 1.577572703422268e-05, + "loss": 0.0004, + "step": 2132 + }, + { + "epoch": 0.4268, + "learning_rate": 1.5787119646672025e-05, + "loss": 0.5952, + "step": 2134 + }, + { + "epoch": 0.4272, + "learning_rate": 1.579850097683548e-05, + "loss": 0.128, + "step": 2136 + }, + { + "epoch": 0.4276, + "learning_rate": 1.58098710025246e-05, + "loss": 0.458, + "step": 2138 + }, + { + "epoch": 0.428, + "learning_rate": 1.582122970157288e-05, + "loss": 0.6171, + "step": 2140 + }, + { + "epoch": 0.4284, + "learning_rate": 1.5832577051836016e-05, + "loss": 0.0779, + "step": 2142 + }, + { + "epoch": 0.4288, + "learning_rate": 1.5843913031191722e-05, + "loss": 0.0004, + "step": 2144 + }, + { + "epoch": 0.4292, + "learning_rate": 1.585523761753994e-05, + "loss": 0.013, + "step": 2146 + }, + { + "epoch": 0.4296, + "learning_rate": 1.586655078880281e-05, + "loss": 0.0005, + "step": 2148 + }, + { + "epoch": 0.43, + "learning_rate": 1.587785252292473e-05, + "loss": 0.0922, + "step": 2150 + }, + { + "epoch": 0.4304, + "learning_rate": 1.5889142797872383e-05, + "loss": 0.4133, + "step": 2152 + }, + { + "epoch": 0.4308, + "learning_rate": 1.5900421591634806e-05, + "loss": 0.5903, + "step": 2154 + }, + { + "epoch": 0.4312, + "learning_rate": 1.5911688882223415e-05, + "loss": 0.095, + "step": 2156 + }, + { + "epoch": 0.4316, + "learning_rate": 1.5922944647672044e-05, + "loss": 0.0785, + "step": 2158 + }, + { + "epoch": 0.432, + "learning_rate": 1.5934188866037007e-05, + "loss": 0.0002, + "step": 2160 + }, + { + "epoch": 0.4324, + "learning_rate": 1.5945421515397125e-05, + "loss": 0.0026, + "step": 2162 + }, + { + "epoch": 0.4328, + "learning_rate": 1.5956642573853787e-05, + "loss": 0.0377, + "step": 2164 + }, + { + "epoch": 0.4332, + "learning_rate": 1.5967852019530918e-05, + "loss": 0.0522, + "step": 2166 + }, + { + "epoch": 0.4336, + "learning_rate": 1.5979049830575193e-05, + "loss": 0.5234, + "step": 2168 + }, + { + "epoch": 0.434, + "learning_rate": 1.599023598515585e-05, + "loss": 0.212, + "step": 2170 + }, + { + "epoch": 0.4344, + "learning_rate": 1.6001410461464945e-05, + "loss": 0.2698, + "step": 2172 + }, + { + "epoch": 0.4348, + "learning_rate": 1.601257323771727e-05, + "loss": 0.4232, + "step": 2174 + }, + { + "epoch": 0.4352, + "learning_rate": 1.6023724292150377e-05, + "loss": 0.6806, + "step": 2176 + }, + { + "epoch": 0.4356, + "learning_rate": 1.6034863603024768e-05, + "loss": 0.5137, + "step": 2178 + }, + { + "epoch": 0.436, + "learning_rate": 1.604599114862375e-05, + "loss": 0.077, + "step": 2180 + }, + { + "epoch": 0.4364, + "learning_rate": 1.6057106907253614e-05, + "loss": 0.0251, + "step": 2182 + }, + { + "epoch": 0.4368, + "learning_rate": 1.606821085724362e-05, + "loss": 0.0544, + "step": 2184 + }, + { + "epoch": 0.4372, + "learning_rate": 1.6079302976946052e-05, + "loss": 0.2996, + "step": 2186 + }, + { + "epoch": 0.4376, + "learning_rate": 1.6090383244736253e-05, + "loss": 0.1931, + "step": 2188 + }, + { + "epoch": 0.438, + "learning_rate": 1.6101451639012675e-05, + "loss": 0.1655, + "step": 2190 + }, + { + "epoch": 0.4384, + "learning_rate": 1.6112508138196912e-05, + "loss": 0.0002, + "step": 2192 + }, + { + "epoch": 0.4388, + "learning_rate": 1.6123552720733763e-05, + "loss": 0.0648, + "step": 2194 + }, + { + "epoch": 0.4392, + "learning_rate": 1.613458536509124e-05, + "loss": 0.0005, + "step": 2196 + }, + { + "epoch": 0.4396, + "learning_rate": 1.614560604976064e-05, + "loss": 0.5891, + "step": 2198 + }, + { + "epoch": 0.44, + "learning_rate": 1.615661475325658e-05, + "loss": 0.0833, + "step": 2200 + }, + { + "epoch": 0.4404, + "learning_rate": 1.616761145411702e-05, + "loss": 0.0717, + "step": 2202 + }, + { + "epoch": 0.4408, + "learning_rate": 1.6178596130903352e-05, + "loss": 0.0026, + "step": 2204 + }, + { + "epoch": 0.4412, + "learning_rate": 1.618956876220034e-05, + "loss": 0.8255, + "step": 2206 + }, + { + "epoch": 0.4416, + "learning_rate": 1.620052932661632e-05, + "loss": 0.2652, + "step": 2208 + }, + { + "epoch": 0.442, + "learning_rate": 1.621147780278311e-05, + "loss": 0.0033, + "step": 2210 + }, + { + "epoch": 0.4424, + "learning_rate": 1.6222414169356056e-05, + "loss": 0.0038, + "step": 2212 + }, + { + "epoch": 0.4428, + "learning_rate": 1.6233338405014204e-05, + "loss": 0.1306, + "step": 2214 + }, + { + "epoch": 0.4432, + "learning_rate": 1.6244250488460146e-05, + "loss": 0.1564, + "step": 2216 + }, + { + "epoch": 0.4436, + "learning_rate": 1.6255150398420273e-05, + "loss": 0.2426, + "step": 2218 + }, + { + "epoch": 0.444, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.0229, + "step": 2220 + }, + { + "epoch": 0.4444, + "learning_rate": 1.6276913612907005e-05, + "loss": 0.1331, + "step": 2222 + }, + { + "epoch": 0.4448, + "learning_rate": 1.6287776875005127e-05, + "loss": 0.34, + "step": 2224 + }, + { + "epoch": 0.4452, + "learning_rate": 1.6298627878760488e-05, + "loss": 0.6813, + "step": 2226 + }, + { + "epoch": 0.4456, + "learning_rate": 1.6309466603018497e-05, + "loss": 0.2239, + "step": 2228 + }, + { + "epoch": 0.446, + "learning_rate": 1.6320293026648508e-05, + "loss": 0.0058, + "step": 2230 + }, + { + "epoch": 0.4464, + "learning_rate": 1.6331107128543856e-05, + "loss": 1.1635, + "step": 2232 + }, + { + "epoch": 0.4468, + "learning_rate": 1.634190888762189e-05, + "loss": 0.5558, + "step": 2234 + }, + { + "epoch": 0.4472, + "learning_rate": 1.635269828282404e-05, + "loss": 0.1091, + "step": 2236 + }, + { + "epoch": 0.4476, + "learning_rate": 1.6363475293115818e-05, + "loss": 0.3789, + "step": 2238 + }, + { + "epoch": 0.448, + "learning_rate": 1.6374239897486905e-05, + "loss": 0.1148, + "step": 2240 + }, + { + "epoch": 0.4484, + "learning_rate": 1.6384992074951118e-05, + "loss": 0.0036, + "step": 2242 + }, + { + "epoch": 0.4488, + "learning_rate": 1.6395731804546575e-05, + "loss": 0.0256, + "step": 2244 + }, + { + "epoch": 0.4492, + "learning_rate": 1.640645906533561e-05, + "loss": 0.437, + "step": 2246 + }, + { + "epoch": 0.4496, + "learning_rate": 1.6417173836404878e-05, + "loss": 1.4846, + "step": 2248 + }, + { + "epoch": 0.45, + "learning_rate": 1.6427876096865397e-05, + "loss": 0.0002, + "step": 2250 + }, + { + "epoch": 0.4504, + "learning_rate": 1.643856582585253e-05, + "loss": 0.0932, + "step": 2252 + }, + { + "epoch": 0.4508, + "learning_rate": 1.6449243002526146e-05, + "loss": 0.505, + "step": 2254 + }, + { + "epoch": 0.4512, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.0121, + "step": 2256 + }, + { + "epoch": 0.4516, + "learning_rate": 1.6470559615694445e-05, + "loss": 0.0024, + "step": 2258 + }, + { + "epoch": 0.452, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.2553, + "step": 2260 + }, + { + "epoch": 0.4524, + "learning_rate": 1.649182577013905e-05, + "loss": 0.2888, + "step": 2262 + }, + { + "epoch": 0.4528, + "learning_rate": 1.650243987350029e-05, + "loss": 0.4118, + "step": 2264 + }, + { + "epoch": 0.4532, + "learning_rate": 1.6513041300022253e-05, + "loss": 0.2957, + "step": 2266 + }, + { + "epoch": 0.4536, + "learning_rate": 1.652363002903693e-05, + "loss": 0.1664, + "step": 2268 + }, + { + "epoch": 0.454, + "learning_rate": 1.6534206039901054e-05, + "loss": 0.0082, + "step": 2270 + }, + { + "epoch": 0.4544, + "learning_rate": 1.6544769311996146e-05, + "loss": 0.0679, + "step": 2272 + }, + { + "epoch": 0.4548, + "learning_rate": 1.655531982472857e-05, + "loss": 0.1983, + "step": 2274 + }, + { + "epoch": 0.4552, + "learning_rate": 1.656585755752956e-05, + "loss": 0.1549, + "step": 2276 + }, + { + "epoch": 0.4556, + "learning_rate": 1.657638248985527e-05, + "loss": 0.0032, + "step": 2278 + }, + { + "epoch": 0.456, + "learning_rate": 1.65868946011868e-05, + "loss": 0.7442, + "step": 2280 + }, + { + "epoch": 0.4564, + "learning_rate": 1.6597393871030257e-05, + "loss": 0.329, + "step": 2282 + }, + { + "epoch": 0.4568, + "learning_rate": 1.660788027891677e-05, + "loss": 0.0992, + "step": 2284 + }, + { + "epoch": 0.4572, + "learning_rate": 1.6618353804402573e-05, + "loss": 0.4003, + "step": 2286 + }, + { + "epoch": 0.4576, + "learning_rate": 1.6628814427068944e-05, + "loss": 0.0482, + "step": 2288 + }, + { + "epoch": 0.458, + "learning_rate": 1.663926212652242e-05, + "loss": 0.3892, + "step": 2290 + }, + { + "epoch": 0.4584, + "learning_rate": 1.6649696882394625e-05, + "loss": 0.146, + "step": 2292 + }, + { + "epoch": 0.4588, + "learning_rate": 1.666011867434252e-05, + "loss": 0.2917, + "step": 2294 + }, + { + "epoch": 0.4592, + "learning_rate": 1.667052748204825e-05, + "loss": 0.0993, + "step": 2296 + }, + { + "epoch": 0.4596, + "learning_rate": 1.6680923285219308e-05, + "loss": 0.0202, + "step": 2298 + }, + { + "epoch": 0.46, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.1274, + "step": 2300 + }, + { + "epoch": 0.4604, + "learning_rate": 1.6701675796914273e-05, + "loss": 0.0253, + "step": 2302 + }, + { + "epoch": 0.4608, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.2633, + "step": 2304 + }, + { + "epoch": 0.4612, + "learning_rate": 1.672237604759516e-05, + "loss": 0.185, + "step": 2306 + }, + { + "epoch": 0.4616, + "learning_rate": 1.6732706524594138e-05, + "loss": 0.0422, + "step": 2308 + }, + { + "epoch": 0.462, + "learning_rate": 1.6743023875837233e-05, + "loss": 0.871, + "step": 2310 + }, + { + "epoch": 0.4624, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.4481, + "step": 2312 + }, + { + "epoch": 0.4628, + "learning_rate": 1.6763619120624592e-05, + "loss": 0.0668, + "step": 2314 + }, + { + "epoch": 0.4632, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.0007, + "step": 2316 + }, + { + "epoch": 0.4636, + "learning_rate": 1.6784161621351377e-05, + "loss": 0.1647, + "step": 2318 + }, + { + "epoch": 0.464, + "learning_rate": 1.679441304261516e-05, + "loss": 0.2018, + "step": 2320 + }, + { + "epoch": 0.4644, + "learning_rate": 1.6804651217823048e-05, + "loss": 0.1094, + "step": 2322 + }, + { + "epoch": 0.4648, + "learning_rate": 1.681487612701519e-05, + "loss": 0.0652, + "step": 2324 + }, + { + "epoch": 0.4652, + "learning_rate": 1.6825087750257624e-05, + "loss": 0.4689, + "step": 2326 + }, + { + "epoch": 0.4656, + "learning_rate": 1.683528606764222e-05, + "loss": 0.1818, + "step": 2328 + }, + { + "epoch": 0.466, + "learning_rate": 1.6845471059286893e-05, + "loss": 0.1519, + "step": 2330 + }, + { + "epoch": 0.4664, + "learning_rate": 1.6855642705335428e-05, + "loss": 0.2466, + "step": 2332 + }, + { + "epoch": 0.4668, + "learning_rate": 1.6865800985957718e-05, + "loss": 0.0988, + "step": 2334 + }, + { + "epoch": 0.4672, + "learning_rate": 1.687594588134968e-05, + "loss": 0.5546, + "step": 2336 + }, + { + "epoch": 0.4676, + "learning_rate": 1.6886077371733275e-05, + "loss": 0.0568, + "step": 2338 + }, + { + "epoch": 0.468, + "learning_rate": 1.68961954373567e-05, + "loss": 0.5638, + "step": 2340 + }, + { + "epoch": 0.4684, + "learning_rate": 1.690630005849423e-05, + "loss": 0.1435, + "step": 2342 + }, + { + "epoch": 0.4688, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.4353, + "step": 2344 + }, + { + "epoch": 0.4692, + "learning_rate": 1.6926468888539988e-05, + "loss": 0.0505, + "step": 2346 + }, + { + "epoch": 0.4696, + "learning_rate": 1.693653305812805e-05, + "loss": 0.0625, + "step": 2348 + }, + { + "epoch": 0.47, + "learning_rate": 1.6946583704589973e-05, + "loss": 0.1946, + "step": 2350 + }, + { + "epoch": 0.4704, + "learning_rate": 1.6956620808331505e-05, + "loss": 0.2281, + "step": 2352 + }, + { + "epoch": 0.4708, + "learning_rate": 1.6966644349784805e-05, + "loss": 0.2154, + "step": 2354 + }, + { + "epoch": 0.4712, + "learning_rate": 1.697665430940846e-05, + "loss": 0.3547, + "step": 2356 + }, + { + "epoch": 0.4716, + "learning_rate": 1.698665066768755e-05, + "loss": 0.0011, + "step": 2358 + }, + { + "epoch": 0.472, + "learning_rate": 1.699663340513365e-05, + "loss": 0.0045, + "step": 2360 + }, + { + "epoch": 0.4724, + "learning_rate": 1.7006602502284913e-05, + "loss": 0.001, + "step": 2362 + }, + { + "epoch": 0.4728, + "learning_rate": 1.7016557939706068e-05, + "loss": 0.2073, + "step": 2364 + }, + { + "epoch": 0.4732, + "learning_rate": 1.70264996979885e-05, + "loss": 0.0088, + "step": 2366 + }, + { + "epoch": 0.4736, + "learning_rate": 1.7036427757750198e-05, + "loss": 0.1794, + "step": 2368 + }, + { + "epoch": 0.474, + "learning_rate": 1.7046342099635938e-05, + "loss": 0.0626, + "step": 2370 + }, + { + "epoch": 0.4744, + "learning_rate": 1.7056242704317212e-05, + "loss": 0.0018, + "step": 2372 + }, + { + "epoch": 0.4748, + "learning_rate": 1.706612955249224e-05, + "loss": 0.0476, + "step": 2374 + }, + { + "epoch": 0.4752, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.1673, + "step": 2376 + }, + { + "epoch": 0.4756, + "learning_rate": 1.708586190225085e-05, + "loss": 0.0381, + "step": 2378 + }, + { + "epoch": 0.476, + "learning_rate": 1.709570736536521e-05, + "loss": 0.2094, + "step": 2380 + }, + { + "epoch": 0.4764, + "learning_rate": 1.710553899503496e-05, + "loss": 0.1627, + "step": 2382 + }, + { + "epoch": 0.4768, + "learning_rate": 1.7115356772092844e-05, + "loss": 0.009, + "step": 2384 + }, + { + "epoch": 0.4772, + "learning_rate": 1.7125160677398625e-05, + "loss": 0.0008, + "step": 2386 + }, + { + "epoch": 0.4776, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.0734, + "step": 2388 + }, + { + "epoch": 0.478, + "learning_rate": 1.7144726796328034e-05, + "loss": 0.0033, + "step": 2390 + }, + { + "epoch": 0.4784, + "learning_rate": 1.7154488971806518e-05, + "loss": 0.0206, + "step": 2392 + }, + { + "epoch": 0.4788, + "learning_rate": 1.716423719924266e-05, + "loss": 0.9297, + "step": 2394 + }, + { + "epoch": 0.4792, + "learning_rate": 1.7173971459631783e-05, + "loss": 0.1895, + "step": 2396 + }, + { + "epoch": 0.4796, + "learning_rate": 1.718369173399646e-05, + "loss": 0.0092, + "step": 2398 + }, + { + "epoch": 0.48, + "learning_rate": 1.7193398003386507e-05, + "loss": 0.2112, + "step": 2400 + }, + { + "epoch": 0.4804, + "learning_rate": 1.7203090248879063e-05, + "loss": 0.008, + "step": 2402 + }, + { + "epoch": 0.4808, + "learning_rate": 1.7212768451578602e-05, + "loss": 0.0323, + "step": 2404 + }, + { + "epoch": 0.4812, + "learning_rate": 1.7222432592616963e-05, + "loss": 0.0016, + "step": 2406 + }, + { + "epoch": 0.4816, + "learning_rate": 1.7232082653153416e-05, + "loss": 1.0644, + "step": 2408 + }, + { + "epoch": 0.482, + "learning_rate": 1.724171861437467e-05, + "loss": 0.4128, + "step": 2410 + }, + { + "epoch": 0.4824, + "learning_rate": 1.7251340457494937e-05, + "loss": 0.0022, + "step": 2412 + }, + { + "epoch": 0.4828, + "learning_rate": 1.726094816375591e-05, + "loss": 0.0002, + "step": 2414 + }, + { + "epoch": 0.4832, + "learning_rate": 1.7270541714426923e-05, + "loss": 0.2985, + "step": 2416 + }, + { + "epoch": 0.4836, + "learning_rate": 1.7280121090804817e-05, + "loss": 0.1265, + "step": 2418 + }, + { + "epoch": 0.484, + "learning_rate": 1.7289686274214106e-05, + "loss": 0.059, + "step": 2420 + }, + { + "epoch": 0.4844, + "learning_rate": 1.7299237246007018e-05, + "loss": 0.0626, + "step": 2422 + }, + { + "epoch": 0.4848, + "learning_rate": 1.7308773987563393e-05, + "loss": 0.0059, + "step": 2424 + }, + { + "epoch": 0.4852, + "learning_rate": 1.7318296480290912e-05, + "loss": 0.0169, + "step": 2426 + }, + { + "epoch": 0.4856, + "learning_rate": 1.732780470562496e-05, + "loss": 0.0027, + "step": 2428 + }, + { + "epoch": 0.486, + "learning_rate": 1.7337298645028764e-05, + "loss": 0.0164, + "step": 2430 + }, + { + "epoch": 0.4864, + "learning_rate": 1.7346778279993413e-05, + "loss": 0.1118, + "step": 2432 + }, + { + "epoch": 0.4868, + "learning_rate": 1.7356243592037872e-05, + "loss": 0.5839, + "step": 2434 + }, + { + "epoch": 0.4872, + "learning_rate": 1.736569456270903e-05, + "loss": 0.1655, + "step": 2436 + }, + { + "epoch": 0.4876, + "learning_rate": 1.7375131173581737e-05, + "loss": 0.1394, + "step": 2438 + }, + { + "epoch": 0.488, + "learning_rate": 1.7384553406258836e-05, + "loss": 0.3164, + "step": 2440 + }, + { + "epoch": 0.4884, + "learning_rate": 1.73939612423712e-05, + "loss": 0.0317, + "step": 2442 + }, + { + "epoch": 0.4888, + "learning_rate": 1.740335466357778e-05, + "loss": 0.0894, + "step": 2444 + }, + { + "epoch": 0.4892, + "learning_rate": 1.7412733651565607e-05, + "loss": 0.0005, + "step": 2446 + }, + { + "epoch": 0.4896, + "learning_rate": 1.7422098188049888e-05, + "loss": 0.0064, + "step": 2448 + }, + { + "epoch": 0.49, + "learning_rate": 1.7431448254773936e-05, + "loss": 0.2433, + "step": 2450 + }, + { + "epoch": 0.4904, + "learning_rate": 1.7440783833509373e-05, + "loss": 0.1225, + "step": 2452 + }, + { + "epoch": 0.4908, + "learning_rate": 1.7450104906055956e-05, + "loss": 0.1878, + "step": 2454 + }, + { + "epoch": 0.4912, + "learning_rate": 1.7459411454241816e-05, + "loss": 0.0181, + "step": 2456 + }, + { + "epoch": 0.4916, + "learning_rate": 1.746870345992336e-05, + "loss": 0.0031, + "step": 2458 + }, + { + "epoch": 0.492, + "learning_rate": 1.747798090498531e-05, + "loss": 0.2124, + "step": 2460 + }, + { + "epoch": 0.4924, + "learning_rate": 1.7487243771340865e-05, + "loss": 0.1661, + "step": 2462 + }, + { + "epoch": 0.4928, + "learning_rate": 1.749649204093154e-05, + "loss": 0.0143, + "step": 2464 + }, + { + "epoch": 0.4932, + "learning_rate": 1.750572569572741e-05, + "loss": 0.0201, + "step": 2466 + }, + { + "epoch": 0.4936, + "learning_rate": 1.7514944717726962e-05, + "loss": 0.7058, + "step": 2468 + }, + { + "epoch": 0.494, + "learning_rate": 1.7524149088957244e-05, + "loss": 0.072, + "step": 2470 + }, + { + "epoch": 0.4944, + "learning_rate": 1.753333879147387e-05, + "loss": 0.2046, + "step": 2472 + }, + { + "epoch": 0.4948, + "learning_rate": 1.7542513807361037e-05, + "loss": 0.2849, + "step": 2474 + }, + { + "epoch": 0.4952, + "learning_rate": 1.755167411873159e-05, + "loss": 0.159, + "step": 2476 + }, + { + "epoch": 0.4956, + "learning_rate": 1.7560819707727027e-05, + "loss": 0.0001, + "step": 2478 + }, + { + "epoch": 0.496, + "learning_rate": 1.7569950556517563e-05, + "loss": 0.0003, + "step": 2480 + }, + { + "epoch": 0.4964, + "learning_rate": 1.757906664730213e-05, + "loss": 0.0018, + "step": 2482 + }, + { + "epoch": 0.4968, + "learning_rate": 1.758816796230845e-05, + "loss": 0.1557, + "step": 2484 + }, + { + "epoch": 0.4972, + "learning_rate": 1.759725448379304e-05, + "loss": 0.137, + "step": 2486 + }, + { + "epoch": 0.4976, + "learning_rate": 1.7606326194041278e-05, + "loss": 0.0164, + "step": 2488 + }, + { + "epoch": 0.498, + "learning_rate": 1.7615383075367363e-05, + "loss": 0.0087, + "step": 2490 + }, + { + "epoch": 0.4984, + "learning_rate": 1.762442511011447e-05, + "loss": 0.0722, + "step": 2492 + }, + { + "epoch": 0.4988, + "learning_rate": 1.763345228065469e-05, + "loss": 0.0131, + "step": 2494 + }, + { + "epoch": 0.4992, + "learning_rate": 1.7642464569389083e-05, + "loss": 0.0003, + "step": 2496 + }, + { + "epoch": 0.4996, + "learning_rate": 1.7651461958747745e-05, + "loss": 0.0166, + "step": 2498 + }, + { + "epoch": 0.5, + "learning_rate": 1.766044443118977e-05, + "loss": 0.855, + "step": 2500 + }, + { + "epoch": 0.5004, + "learning_rate": 1.766941196920342e-05, + "loss": 0.1425, + "step": 2502 + }, + { + "epoch": 0.5008, + "learning_rate": 1.767836455530598e-05, + "loss": 0.0277, + "step": 2504 + }, + { + "epoch": 0.5012, + "learning_rate": 1.7687302172043933e-05, + "loss": 0.0028, + "step": 2506 + }, + { + "epoch": 0.5016, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.0977, + "step": 2508 + }, + { + "epoch": 0.502, + "learning_rate": 1.7705132427757885e-05, + "loss": 0.1215, + "step": 2510 + }, + { + "epoch": 0.5024, + "learning_rate": 1.77140250319729e-05, + "loss": 0.1106, + "step": 2512 + }, + { + "epoch": 0.5028, + "learning_rate": 1.7722902597301385e-05, + "loss": 0.0002, + "step": 2514 + }, + { + "epoch": 0.5032, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.0094, + "step": 2516 + }, + { + "epoch": 0.5036, + "learning_rate": 1.774061254209905e-05, + "loss": 0.0001, + "step": 2518 + }, + { + "epoch": 0.504, + "learning_rate": 1.7749444887041793e-05, + "loss": 0.1497, + "step": 2520 + }, + { + "epoch": 0.5044, + "learning_rate": 1.7758262124045192e-05, + "loss": 0.8678, + "step": 2522 + }, + { + "epoch": 0.5048, + "learning_rate": 1.776706423591959e-05, + "loss": 0.2667, + "step": 2524 + }, + { + "epoch": 0.5052, + "learning_rate": 1.7775851205504816e-05, + "loss": 0.0065, + "step": 2526 + }, + { + "epoch": 0.5056, + "learning_rate": 1.778462301567023e-05, + "loss": 0.2495, + "step": 2528 + }, + { + "epoch": 0.506, + "learning_rate": 1.7793379649314736e-05, + "loss": 0.1486, + "step": 2530 + }, + { + "epoch": 0.5064, + "learning_rate": 1.7802121089366832e-05, + "loss": 0.1303, + "step": 2532 + }, + { + "epoch": 0.5068, + "learning_rate": 1.7810847318784635e-05, + "loss": 0.034, + "step": 2534 + }, + { + "epoch": 0.5072, + "learning_rate": 1.7819558320555895e-05, + "loss": 0.3787, + "step": 2536 + }, + { + "epoch": 0.5076, + "learning_rate": 1.7828254077698103e-05, + "loss": 0.2744, + "step": 2538 + }, + { + "epoch": 0.508, + "learning_rate": 1.7836934573258392e-05, + "loss": 0.107, + "step": 2540 + }, + { + "epoch": 0.5084, + "learning_rate": 1.7845599790313735e-05, + "loss": 0.1697, + "step": 2542 + }, + { + "epoch": 0.5088, + "learning_rate": 1.785424971197082e-05, + "loss": 0.1501, + "step": 2544 + }, + { + "epoch": 0.5092, + "learning_rate": 1.786288432136618e-05, + "loss": 0.3218, + "step": 2546 + }, + { + "epoch": 0.5096, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.0027, + "step": 2548 + }, + { + "epoch": 0.51, + "learning_rate": 1.788010753606722e-05, + "loss": 0.3062, + "step": 2550 + }, + { + "epoch": 0.5104, + "learning_rate": 1.7888696107795343e-05, + "loss": 0.1125, + "step": 2552 + }, + { + "epoch": 0.5108, + "learning_rate": 1.7897269300106735e-05, + "loss": 0.012, + "step": 2554 + }, + { + "epoch": 0.5112, + "learning_rate": 1.790582709628753e-05, + "loss": 0.1479, + "step": 2556 + }, + { + "epoch": 0.5116, + "learning_rate": 1.7914369479653854e-05, + "loss": 0.8671, + "step": 2558 + }, + { + "epoch": 0.512, + "learning_rate": 1.7922896433551903e-05, + "loss": 0.0038, + "step": 2560 + }, + { + "epoch": 0.5124, + "learning_rate": 1.7931407941357945e-05, + "loss": 0.1144, + "step": 2562 + }, + { + "epoch": 0.5128, + "learning_rate": 1.793990398647835e-05, + "loss": 0.0043, + "step": 2564 + }, + { + "epoch": 0.5132, + "learning_rate": 1.7948384552349655e-05, + "loss": 0.8231, + "step": 2566 + }, + { + "epoch": 0.5136, + "learning_rate": 1.795684962243855e-05, + "loss": 0.6871, + "step": 2568 + }, + { + "epoch": 0.514, + "learning_rate": 1.796529918024196e-05, + "loss": 0.4413, + "step": 2570 + }, + { + "epoch": 0.5144, + "learning_rate": 1.7973733209287032e-05, + "loss": 0.7429, + "step": 2572 + }, + { + "epoch": 0.5148, + "learning_rate": 1.798215169313121e-05, + "loss": 0.3078, + "step": 2574 + }, + { + "epoch": 0.5152, + "learning_rate": 1.7990554615362193e-05, + "loss": 1.193, + "step": 2576 + }, + { + "epoch": 0.5156, + "learning_rate": 1.79989419595981e-05, + "loss": 0.4016, + "step": 2578 + }, + { + "epoch": 0.516, + "learning_rate": 1.800731370948734e-05, + "loss": 0.0004, + "step": 2580 + }, + { + "epoch": 0.5164, + "learning_rate": 1.8015669848708757e-05, + "loss": 0.0968, + "step": 2582 + }, + { + "epoch": 0.5168, + "learning_rate": 1.802401036097167e-05, + "loss": 0.6723, + "step": 2584 + }, + { + "epoch": 0.5172, + "learning_rate": 1.803233523001577e-05, + "loss": 0.7768, + "step": 2586 + }, + { + "epoch": 0.5176, + "learning_rate": 1.804064443961135e-05, + "loss": 0.2183, + "step": 2588 + }, + { + "epoch": 0.518, + "learning_rate": 1.804893797355914e-05, + "loss": 0.1795, + "step": 2590 + }, + { + "epoch": 0.5184, + "learning_rate": 1.8057215815690494e-05, + "loss": 0.1923, + "step": 2592 + }, + { + "epoch": 0.5188, + "learning_rate": 1.8065477949867327e-05, + "loss": 0.7449, + "step": 2594 + }, + { + "epoch": 0.5192, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.2646, + "step": 2596 + }, + { + "epoch": 0.5196, + "learning_rate": 1.808195502995827e-05, + "loss": 0.2389, + "step": 2598 + }, + { + "epoch": 0.52, + "learning_rate": 1.809016994374947e-05, + "loss": 0.0047, + "step": 2600 + }, + { + "epoch": 0.5204, + "learning_rate": 1.8098369085340397e-05, + "loss": 0.1798, + "step": 2602 + }, + { + "epoch": 0.5208, + "learning_rate": 1.81065524387464e-05, + "loss": 0.1871, + "step": 2604 + }, + { + "epoch": 0.5212, + "learning_rate": 1.8114719988013606e-05, + "loss": 0.1991, + "step": 2606 + }, + { + "epoch": 0.5216, + "learning_rate": 1.8122871717218968e-05, + "loss": 0.0566, + "step": 2608 + }, + { + "epoch": 0.522, + "learning_rate": 1.813100761047028e-05, + "loss": 0.1937, + "step": 2610 + }, + { + "epoch": 0.5224, + "learning_rate": 1.8139127651906176e-05, + "loss": 0.2673, + "step": 2612 + }, + { + "epoch": 0.5228, + "learning_rate": 1.8147231825696258e-05, + "loss": 0.0815, + "step": 2614 + }, + { + "epoch": 0.5232, + "learning_rate": 1.8155320116040976e-05, + "loss": 0.1445, + "step": 2616 + }, + { + "epoch": 0.5236, + "learning_rate": 1.8163392507171834e-05, + "loss": 0.5201, + "step": 2618 + }, + { + "epoch": 0.524, + "learning_rate": 1.817144898335129e-05, + "loss": 0.2982, + "step": 2620 + }, + { + "epoch": 0.5244, + "learning_rate": 1.8179489528872797e-05, + "loss": 0.3313, + "step": 2622 + }, + { + "epoch": 0.5248, + "learning_rate": 1.818751412806095e-05, + "loss": 0.0218, + "step": 2624 + }, + { + "epoch": 0.5252, + "learning_rate": 1.819552276527134e-05, + "loss": 0.0619, + "step": 2626 + }, + { + "epoch": 0.5256, + "learning_rate": 1.8203515424890738e-05, + "loss": 0.3162, + "step": 2628 + }, + { + "epoch": 0.526, + "learning_rate": 1.821149209133704e-05, + "loss": 0.1284, + "step": 2630 + }, + { + "epoch": 0.5264, + "learning_rate": 1.8219452749059322e-05, + "loss": 0.1085, + "step": 2632 + }, + { + "epoch": 0.5268, + "learning_rate": 1.82273973825379e-05, + "loss": 0.3785, + "step": 2634 + }, + { + "epoch": 0.5272, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.0673, + "step": 2636 + }, + { + "epoch": 0.5276, + "learning_rate": 1.8243238514841258e-05, + "loss": 0.2382, + "step": 2638 + }, + { + "epoch": 0.528, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.3246, + "step": 2640 + }, + { + "epoch": 0.5284, + "learning_rate": 1.8259015364714786e-05, + "loss": 0.138, + "step": 2642 + }, + { + "epoch": 0.5288, + "learning_rate": 1.826687964527355e-05, + "loss": 0.0277, + "step": 2644 + }, + { + "epoch": 0.5292, + "learning_rate": 1.8274727809127437e-05, + "loss": 0.0395, + "step": 2646 + }, + { + "epoch": 0.5296, + "learning_rate": 1.828255984097604e-05, + "loss": 0.0078, + "step": 2648 + }, + { + "epoch": 0.53, + "learning_rate": 1.8290375725550413e-05, + "loss": 0.1472, + "step": 2650 + }, + { + "epoch": 0.5304, + "learning_rate": 1.8298175447613093e-05, + "loss": 0.2816, + "step": 2652 + }, + { + "epoch": 0.5308, + "learning_rate": 1.8305958991958125e-05, + "loss": 0.0372, + "step": 2654 + }, + { + "epoch": 0.5312, + "learning_rate": 1.8313726343411092e-05, + "loss": 0.0025, + "step": 2656 + }, + { + "epoch": 0.5316, + "learning_rate": 1.832147748682912e-05, + "loss": 0.0215, + "step": 2658 + }, + { + "epoch": 0.532, + "learning_rate": 1.8329212407101e-05, + "loss": 0.0122, + "step": 2660 + }, + { + "epoch": 0.5324, + "learning_rate": 1.8336931089147065e-05, + "loss": 0.0007, + "step": 2662 + }, + { + "epoch": 0.5328, + "learning_rate": 1.8344633517919394e-05, + "loss": 0.2679, + "step": 2664 + }, + { + "epoch": 0.5332, + "learning_rate": 1.8352319678401677e-05, + "loss": 0.1119, + "step": 2666 + }, + { + "epoch": 0.5336, + "learning_rate": 1.8359989555609344e-05, + "loss": 0.2351, + "step": 2668 + }, + { + "epoch": 0.534, + "learning_rate": 1.836764313458962e-05, + "loss": 0.0712, + "step": 2670 + }, + { + "epoch": 0.5344, + "learning_rate": 1.8375280400421407e-05, + "loss": 0.0257, + "step": 2672 + }, + { + "epoch": 0.5348, + "learning_rate": 1.8382901338215515e-05, + "loss": 0.0932, + "step": 2674 + }, + { + "epoch": 0.5352, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.0871, + "step": 2676 + }, + { + "epoch": 0.5356, + "learning_rate": 1.839809417029283e-05, + "loss": 0.0518, + "step": 2678 + }, + { + "epoch": 0.536, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.0116, + "step": 2680 + }, + { + "epoch": 0.5364, + "learning_rate": 1.8413221512344805e-05, + "loss": 0.0855, + "step": 2682 + }, + { + "epoch": 0.5368, + "learning_rate": 1.842076058772692e-05, + "loss": 0.0012, + "step": 2684 + }, + { + "epoch": 0.5372, + "learning_rate": 1.8428283246405386e-05, + "loss": 0.0004, + "step": 2686 + }, + { + "epoch": 0.5376, + "learning_rate": 1.8435789473714384e-05, + "loss": 0.2391, + "step": 2688 + }, + { + "epoch": 0.538, + "learning_rate": 1.844327925502015e-05, + "loss": 0.1576, + "step": 2690 + }, + { + "epoch": 0.5384, + "learning_rate": 1.8450752575720964e-05, + "loss": 0.0112, + "step": 2692 + }, + { + "epoch": 0.5388, + "learning_rate": 1.8458209421247205e-05, + "loss": 0.0005, + "step": 2694 + }, + { + "epoch": 0.5392, + "learning_rate": 1.8465649777061384e-05, + "loss": 0.3066, + "step": 2696 + }, + { + "epoch": 0.5396, + "learning_rate": 1.8473073628658116e-05, + "loss": 0.2776, + "step": 2698 + }, + { + "epoch": 0.54, + "learning_rate": 1.8480480961564266e-05, + "loss": 0.0003, + "step": 2700 + }, + { + "epoch": 0.5404, + "learning_rate": 1.848787176133881e-05, + "loss": 0.0005, + "step": 2702 + }, + { + "epoch": 0.5408, + "learning_rate": 1.8495246013573047e-05, + "loss": 0.0012, + "step": 2704 + }, + { + "epoch": 0.5412, + "learning_rate": 1.850260370389049e-05, + "loss": 0.6171, + "step": 2706 + }, + { + "epoch": 0.5416, + "learning_rate": 1.850994481794691e-05, + "loss": 0.3418, + "step": 2708 + }, + { + "epoch": 0.542, + "learning_rate": 1.851726934143048e-05, + "loss": 0.0223, + "step": 2710 + }, + { + "epoch": 0.5424, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.1747, + "step": 2712 + }, + { + "epoch": 0.5428, + "learning_rate": 1.8531868559593205e-05, + "loss": 0.3402, + "step": 2714 + }, + { + "epoch": 0.5432, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.0227, + "step": 2716 + }, + { + "epoch": 0.5436, + "learning_rate": 1.8546401244531028e-05, + "loss": 1.5111, + "step": 2718 + }, + { + "epoch": 0.544, + "learning_rate": 1.8553642601605066e-05, + "loss": 0.0317, + "step": 2720 + }, + { + "epoch": 0.5444, + "learning_rate": 1.856086728291516e-05, + "loss": 0.0009, + "step": 2722 + }, + { + "epoch": 0.5448, + "learning_rate": 1.856807527437643e-05, + "loss": 0.2557, + "step": 2724 + }, + { + "epoch": 0.5452, + "learning_rate": 1.857526656193652e-05, + "loss": 0.2661, + "step": 2726 + }, + { + "epoch": 0.5456, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.0043, + "step": 2728 + }, + { + "epoch": 0.546, + "learning_rate": 1.8589598969306643e-05, + "loss": 0.2358, + "step": 2730 + }, + { + "epoch": 0.5464, + "learning_rate": 1.859674006117491e-05, + "loss": 0.1864, + "step": 2732 + }, + { + "epoch": 0.5468, + "learning_rate": 1.860386439325853e-05, + "loss": 0.0002, + "step": 2734 + }, + { + "epoch": 0.5472, + "learning_rate": 1.8610971951668268e-05, + "loss": 0.0027, + "step": 2736 + }, + { + "epoch": 0.5476, + "learning_rate": 1.8618062722547544e-05, + "loss": 0.0085, + "step": 2738 + }, + { + "epoch": 0.548, + "learning_rate": 1.862513669207257e-05, + "loss": 0.0006, + "step": 2740 + }, + { + "epoch": 0.5484, + "learning_rate": 1.8632193846452274e-05, + "loss": 0.0002, + "step": 2742 + }, + { + "epoch": 0.5488, + "learning_rate": 1.8639234171928348e-05, + "loss": 0.0129, + "step": 2744 + }, + { + "epoch": 0.5492, + "learning_rate": 1.8646257654775354e-05, + "loss": 0.0595, + "step": 2746 + }, + { + "epoch": 0.5496, + "learning_rate": 1.8653264281300612e-05, + "loss": 0.0258, + "step": 2748 + }, + { + "epoch": 0.55, + "learning_rate": 1.866025403784439e-05, + "loss": 1.47, + "step": 2750 + }, + { + "epoch": 0.5504, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.2217, + "step": 2752 + }, + { + "epoch": 0.5508, + "learning_rate": 1.8674182886512776e-05, + "loss": 0.0348, + "step": 2754 + }, + { + "epoch": 0.5512, + "learning_rate": 1.8681121951482393e-05, + "loss": 0.1996, + "step": 2756 + }, + { + "epoch": 0.5516, + "learning_rate": 1.8688044092160554e-05, + "loss": 0.0781, + "step": 2758 + }, + { + "epoch": 0.552, + "learning_rate": 1.869494929505219e-05, + "loss": 0.0405, + "step": 2760 + }, + { + "epoch": 0.5524, + "learning_rate": 1.8701837546695256e-05, + "loss": 0.3227, + "step": 2762 + }, + { + "epoch": 0.5528, + "learning_rate": 1.870870883366075e-05, + "loss": 0.2985, + "step": 2764 + }, + { + "epoch": 0.5532, + "learning_rate": 1.871556314255275e-05, + "loss": 0.6026, + "step": 2766 + }, + { + "epoch": 0.5536, + "learning_rate": 1.8722400460008434e-05, + "loss": 0.257, + "step": 2768 + }, + { + "epoch": 0.554, + "learning_rate": 1.8729220772698093e-05, + "loss": 0.2678, + "step": 2770 + }, + { + "epoch": 0.5544, + "learning_rate": 1.8736024067325195e-05, + "loss": 0.4537, + "step": 2772 + }, + { + "epoch": 0.5548, + "learning_rate": 1.8742810330626335e-05, + "loss": 0.0909, + "step": 2774 + }, + { + "epoch": 0.5552, + "learning_rate": 1.8749579549371373e-05, + "loss": 0.1821, + "step": 2776 + }, + { + "epoch": 0.5556, + "learning_rate": 1.8756331710363368e-05, + "loss": 0.0059, + "step": 2778 + }, + { + "epoch": 0.556, + "learning_rate": 1.876306680043863e-05, + "loss": 0.3245, + "step": 2780 + }, + { + "epoch": 0.5564, + "learning_rate": 1.876978480646677e-05, + "loss": 0.0553, + "step": 2782 + }, + { + "epoch": 0.5568, + "learning_rate": 1.8776485715350665e-05, + "loss": 0.0396, + "step": 2784 + }, + { + "epoch": 0.5572, + "learning_rate": 1.878316951402658e-05, + "loss": 0.008, + "step": 2786 + }, + { + "epoch": 0.5576, + "learning_rate": 1.878983618946409e-05, + "loss": 1.0838, + "step": 2788 + }, + { + "epoch": 0.558, + "learning_rate": 1.879648572866617e-05, + "loss": 0.1001, + "step": 2790 + }, + { + "epoch": 0.5584, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.0205, + "step": 2792 + }, + { + "epoch": 0.5588, + "learning_rate": 1.8809733346543006e-05, + "loss": 0.1251, + "step": 2794 + }, + { + "epoch": 0.5592, + "learning_rate": 1.881633139939087e-05, + "loss": 0.1023, + "step": 2796 + }, + { + "epoch": 0.5596, + "learning_rate": 1.8822912264349532e-05, + "loss": 0.034, + "step": 2798 + }, + { + "epoch": 0.56, + "learning_rate": 1.882947592858927e-05, + "loss": 0.0384, + "step": 2800 + }, + { + "epoch": 0.5604, + "learning_rate": 1.8836022379313877e-05, + "loss": 0.0127, + "step": 2802 + }, + { + "epoch": 0.5608, + "learning_rate": 1.884255160376072e-05, + "loss": 0.2175, + "step": 2804 + }, + { + "epoch": 0.5612, + "learning_rate": 1.8849063589200744e-05, + "loss": 0.3371, + "step": 2806 + }, + { + "epoch": 0.5616, + "learning_rate": 1.885555832293849e-05, + "loss": 0.1742, + "step": 2808 + }, + { + "epoch": 0.562, + "learning_rate": 1.8862035792312145e-05, + "loss": 0.2394, + "step": 2810 + }, + { + "epoch": 0.5624, + "learning_rate": 1.886849598469356e-05, + "loss": 0.0942, + "step": 2812 + }, + { + "epoch": 0.5628, + "learning_rate": 1.8874938887488246e-05, + "loss": 0.0905, + "step": 2814 + }, + { + "epoch": 0.5632, + "learning_rate": 1.888136448813544e-05, + "loss": 0.4946, + "step": 2816 + }, + { + "epoch": 0.5636, + "learning_rate": 1.888777277410812e-05, + "loss": 0.0261, + "step": 2818 + }, + { + "epoch": 0.564, + "learning_rate": 1.8894163732912972e-05, + "loss": 0.0121, + "step": 2820 + }, + { + "epoch": 0.5644, + "learning_rate": 1.890053735209053e-05, + "loss": 0.6277, + "step": 2822 + }, + { + "epoch": 0.5648, + "learning_rate": 1.890689361921506e-05, + "loss": 0.0203, + "step": 2824 + }, + { + "epoch": 0.5652, + "learning_rate": 1.8913232521894737e-05, + "loss": 0.9768, + "step": 2826 + }, + { + "epoch": 0.5656, + "learning_rate": 1.891955404777151e-05, + "loss": 0.2142, + "step": 2828 + }, + { + "epoch": 0.566, + "learning_rate": 1.8925858184521248e-05, + "loss": 0.0841, + "step": 2830 + }, + { + "epoch": 0.5664, + "learning_rate": 1.893214491985374e-05, + "loss": 0.0083, + "step": 2832 + }, + { + "epoch": 0.5668, + "learning_rate": 1.8938414241512634e-05, + "loss": 0.2404, + "step": 2834 + }, + { + "epoch": 0.5672, + "learning_rate": 1.89446661372756e-05, + "loss": 0.0046, + "step": 2836 + }, + { + "epoch": 0.5676, + "learning_rate": 1.8950900594954226e-05, + "loss": 0.0259, + "step": 2838 + }, + { + "epoch": 0.568, + "learning_rate": 1.895711760239413e-05, + "loss": 0.1775, + "step": 2840 + }, + { + "epoch": 0.5684, + "learning_rate": 1.896331714747493e-05, + "loss": 1.0387, + "step": 2842 + }, + { + "epoch": 0.5688, + "learning_rate": 1.89694992181103e-05, + "loss": 0.3554, + "step": 2844 + }, + { + "epoch": 0.5692, + "learning_rate": 1.8975663802247975e-05, + "loss": 0.4114, + "step": 2846 + }, + { + "epoch": 0.5696, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.0597, + "step": 2848 + }, + { + "epoch": 0.57, + "learning_rate": 1.898794046299167e-05, + "loss": 0.3576, + "step": 2850 + }, + { + "epoch": 0.5704, + "learning_rate": 1.8994052515663708e-05, + "loss": 0.5584, + "step": 2852 + }, + { + "epoch": 0.5708, + "learning_rate": 1.9000147033970144e-05, + "loss": 0.1024, + "step": 2854 + }, + { + "epoch": 0.5712, + "learning_rate": 1.90062240060294e-05, + "loss": 0.184, + "step": 2856 + }, + { + "epoch": 0.5716, + "learning_rate": 1.901228341999412e-05, + "loss": 0.1088, + "step": 2858 + }, + { + "epoch": 0.572, + "learning_rate": 1.9018325264051136e-05, + "loss": 0.0004, + "step": 2860 + }, + { + "epoch": 0.5724, + "learning_rate": 1.9024349526421596e-05, + "loss": 0.7114, + "step": 2862 + }, + { + "epoch": 0.5728, + "learning_rate": 1.9030356195360868e-05, + "loss": 0.1977, + "step": 2864 + }, + { + "epoch": 0.5732, + "learning_rate": 1.903634525915866e-05, + "loss": 0.6156, + "step": 2866 + }, + { + "epoch": 0.5736, + "learning_rate": 1.904231670613899e-05, + "loss": 0.2533, + "step": 2868 + }, + { + "epoch": 0.574, + "learning_rate": 1.904827052466019e-05, + "loss": 0.3385, + "step": 2870 + }, + { + "epoch": 0.5744, + "learning_rate": 1.905420670311502e-05, + "loss": 0.0038, + "step": 2872 + }, + { + "epoch": 0.5748, + "learning_rate": 1.9060125229930572e-05, + "loss": 0.0581, + "step": 2874 + }, + { + "epoch": 0.5752, + "learning_rate": 1.906602609356838e-05, + "loss": 0.3611, + "step": 2876 + }, + { + "epoch": 0.5756, + "learning_rate": 1.907190928252441e-05, + "loss": 0.293, + "step": 2878 + }, + { + "epoch": 0.576, + "learning_rate": 1.9077774785329078e-05, + "loss": 0.0066, + "step": 2880 + }, + { + "epoch": 0.5764, + "learning_rate": 1.908362259054731e-05, + "loss": 0.3691, + "step": 2882 + }, + { + "epoch": 0.5768, + "learning_rate": 1.9089452686778487e-05, + "loss": 0.1143, + "step": 2884 + }, + { + "epoch": 0.5772, + "learning_rate": 1.9095265062656542e-05, + "loss": 0.1244, + "step": 2886 + }, + { + "epoch": 0.5776, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.7383, + "step": 2888 + }, + { + "epoch": 0.578, + "learning_rate": 1.910683660806177e-05, + "loss": 0.0006, + "step": 2890 + }, + { + "epoch": 0.5784, + "learning_rate": 1.911259575502962e-05, + "loss": 0.18, + "step": 2892 + }, + { + "epoch": 0.5788, + "learning_rate": 1.9118337136525754e-05, + "loss": 0.273, + "step": 2894 + }, + { + "epoch": 0.5792, + "learning_rate": 1.912406074135706e-05, + "loss": 0.0073, + "step": 2896 + }, + { + "epoch": 0.5796, + "learning_rate": 1.912976655836507e-05, + "loss": 0.0808, + "step": 2898 + }, + { + "epoch": 0.58, + "learning_rate": 1.9135454576426006e-05, + "loss": 0.1819, + "step": 2900 + }, + { + "epoch": 0.5804, + "learning_rate": 1.9141124784450786e-05, + "loss": 0.0759, + "step": 2902 + }, + { + "epoch": 0.5808, + "learning_rate": 1.9146777171385053e-05, + "loss": 0.1631, + "step": 2904 + }, + { + "epoch": 0.5812, + "learning_rate": 1.9152411726209172e-05, + "loss": 0.0115, + "step": 2906 + }, + { + "epoch": 0.5816, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.0002, + "step": 2908 + }, + { + "epoch": 0.582, + "learning_rate": 1.916362729562239e-05, + "loss": 0.0991, + "step": 2910 + }, + { + "epoch": 0.5824, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.1495, + "step": 2912 + }, + { + "epoch": 0.5828, + "learning_rate": 1.9174771405229187e-05, + "loss": 0.0127, + "step": 2914 + }, + { + "epoch": 0.5832, + "learning_rate": 1.9180316635425876e-05, + "loss": 0.0883, + "step": 2916 + }, + { + "epoch": 0.5836, + "learning_rate": 1.9185843968125543e-05, + "loss": 0.0834, + "step": 2918 + }, + { + "epoch": 0.584, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.0153, + "step": 2920 + }, + { + "epoch": 0.5844, + "learning_rate": 1.919684489796539e-05, + "loss": 0.0001, + "step": 2922 + }, + { + "epoch": 0.5848, + "learning_rate": 1.9202318473658703e-05, + "loss": 0.0046, + "step": 2924 + }, + { + "epoch": 0.5852, + "learning_rate": 1.9207774108961273e-05, + "loss": 0.1924, + "step": 2926 + }, + { + "epoch": 0.5856, + "learning_rate": 1.9213211793237052e-05, + "loss": 0.0004, + "step": 2928 + }, + { + "epoch": 0.586, + "learning_rate": 1.9218631515885004e-05, + "loss": 0.1583, + "step": 2930 + }, + { + "epoch": 0.5864, + "learning_rate": 1.92240332663391e-05, + "loss": 0.2422, + "step": 2932 + }, + { + "epoch": 0.5868, + "learning_rate": 1.922941703406835e-05, + "loss": 0.0237, + "step": 2934 + }, + { + "epoch": 0.5872, + "learning_rate": 1.923478280857682e-05, + "loss": 0.0412, + "step": 2936 + }, + { + "epoch": 0.5876, + "learning_rate": 1.9240130579403663e-05, + "loss": 0.2325, + "step": 2938 + }, + { + "epoch": 0.588, + "learning_rate": 1.924546033612313e-05, + "loss": 0.0006, + "step": 2940 + }, + { + "epoch": 0.5884, + "learning_rate": 1.9250772068344577e-05, + "loss": 0.2658, + "step": 2942 + }, + { + "epoch": 0.5888, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.0109, + "step": 2944 + }, + { + "epoch": 0.5892, + "learning_rate": 1.9261341417906615e-05, + "loss": 0.0104, + "step": 2946 + }, + { + "epoch": 0.5896, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.2341, + "step": 2948 + }, + { + "epoch": 0.59, + "learning_rate": 1.9271838545667876e-05, + "loss": 0.0361, + "step": 2950 + }, + { + "epoch": 0.5904, + "learning_rate": 1.927706000077034e-05, + "loss": 0.3786, + "step": 2952 + }, + { + "epoch": 0.5908, + "learning_rate": 1.9282263369769633e-05, + "loss": 0.2334, + "step": 2954 + }, + { + "epoch": 0.5912, + "learning_rate": 1.9287448642521507e-05, + "loss": 0.5078, + "step": 2956 + }, + { + "epoch": 0.5916, + "learning_rate": 1.9292615808917024e-05, + "loss": 0.0806, + "step": 2958 + }, + { + "epoch": 0.592, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.0707, + "step": 2960 + }, + { + "epoch": 0.5924, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.1638, + "step": 2962 + }, + { + "epoch": 0.5928, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.1937, + "step": 2964 + }, + { + "epoch": 0.5932, + "learning_rate": 1.9313103209992205e-05, + "loss": 0.0142, + "step": 2966 + }, + { + "epoch": 0.5936, + "learning_rate": 1.9318179694207722e-05, + "loss": 0.3155, + "step": 2968 + }, + { + "epoch": 0.594, + "learning_rate": 1.932323801215512e-05, + "loss": 0.1087, + "step": 2970 + }, + { + "epoch": 0.5944, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.2305, + "step": 2972 + }, + { + "epoch": 0.5948, + "learning_rate": 1.933330010983518e-05, + "loss": 0.396, + "step": 2974 + }, + { + "epoch": 0.5952, + "learning_rate": 1.9338303869951266e-05, + "loss": 0.0812, + "step": 2976 + }, + { + "epoch": 0.5956, + "learning_rate": 1.934328942456612e-05, + "loss": 0.0054, + "step": 2978 + }, + { + "epoch": 0.596, + "learning_rate": 1.934825676396015e-05, + "loss": 0.0032, + "step": 2980 + }, + { + "epoch": 0.5964, + "learning_rate": 1.9353205878449257e-05, + "loss": 0.0081, + "step": 2982 + }, + { + "epoch": 0.5968, + "learning_rate": 1.935813675838491e-05, + "loss": 1.1341, + "step": 2984 + }, + { + "epoch": 0.5972, + "learning_rate": 1.9363049394154088e-05, + "loss": 0.3646, + "step": 2986 + }, + { + "epoch": 0.5976, + "learning_rate": 1.9367943776179375e-05, + "loss": 0.0057, + "step": 2988 + }, + { + "epoch": 0.598, + "learning_rate": 1.937281989491892e-05, + "loss": 0.108, + "step": 2990 + }, + { + "epoch": 0.5984, + "learning_rate": 1.9377677740866457e-05, + "loss": 0.3295, + "step": 2992 + }, + { + "epoch": 0.5988, + "learning_rate": 1.9382517304551397e-05, + "loss": 0.0204, + "step": 2994 + }, + { + "epoch": 0.5992, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.0008, + "step": 2996 + }, + { + "epoch": 0.5996, + "learning_rate": 1.9392141547429183e-05, + "loss": 0.3042, + "step": 2998 + }, + { + "epoch": 0.6, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.5459, + "step": 3000 + }, + { + "epoch": 0.6004, + "learning_rate": 1.9401692548500504e-05, + "loss": 0.1994, + "step": 3002 + }, + { + "epoch": 0.6008, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.61, + "step": 3004 + }, + { + "epoch": 0.6012, + "learning_rate": 1.9411170233284728e-05, + "loss": 0.0005, + "step": 3006 + }, + { + "epoch": 0.6016, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.0558, + "step": 3008 + }, + { + "epoch": 0.602, + "learning_rate": 1.942057452787297e-05, + "loss": 0.1954, + "step": 3010 + }, + { + "epoch": 0.6024, + "learning_rate": 1.942524913090354e-05, + "loss": 0.0154, + "step": 3012 + }, + { + "epoch": 0.6028, + "learning_rate": 1.9429905358928645e-05, + "loss": 0.024, + "step": 3014 + }, + { + "epoch": 0.6032, + "learning_rate": 1.9434543202870723e-05, + "loss": 0.0627, + "step": 3016 + }, + { + "epoch": 0.6036, + "learning_rate": 1.9439162653688063e-05, + "loss": 0.0593, + "step": 3018 + }, + { + "epoch": 0.604, + "learning_rate": 1.9443763702374815e-05, + "loss": 0.071, + "step": 3020 + }, + { + "epoch": 0.6044, + "learning_rate": 1.944834633996098e-05, + "loss": 0.2295, + "step": 3022 + }, + { + "epoch": 0.6048, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.0237, + "step": 3024 + }, + { + "epoch": 0.6052, + "learning_rate": 1.9457456346131172e-05, + "loss": 0.1814, + "step": 3026 + }, + { + "epoch": 0.6056, + "learning_rate": 1.9461983696954756e-05, + "loss": 0.0217, + "step": 3028 + }, + { + "epoch": 0.606, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.2394, + "step": 3030 + }, + { + "epoch": 0.6064, + "learning_rate": 1.947098304994744e-05, + "loss": 0.0018, + "step": 3032 + }, + { + "epoch": 0.6068, + "learning_rate": 1.947545503457184e-05, + "loss": 0.1218, + "step": 3034 + }, + { + "epoch": 0.6072, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.2102, + "step": 3036 + }, + { + "epoch": 0.6076, + "learning_rate": 1.9484343576484935e-05, + "loss": 0.0159, + "step": 3038 + }, + { + "epoch": 0.608, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.2523, + "step": 3040 + }, + { + "epoch": 0.6084, + "learning_rate": 1.949315815758161e-05, + "loss": 0.0658, + "step": 3042 + }, + { + "epoch": 0.6088, + "learning_rate": 1.949753769132067e-05, + "loss": 0.1618, + "step": 3044 + }, + { + "epoch": 0.6092, + "learning_rate": 1.9501898709124008e-05, + "loss": 0.0881, + "step": 3046 + }, + { + "epoch": 0.6096, + "learning_rate": 1.95062412024896e-05, + "loss": 0.0059, + "step": 3048 + }, + { + "epoch": 0.61, + "learning_rate": 1.9510565162951534e-05, + "loss": 0.1316, + "step": 3050 + }, + { + "epoch": 0.6104, + "learning_rate": 1.951487058208003e-05, + "loss": 0.0003, + "step": 3052 + }, + { + "epoch": 0.6108, + "learning_rate": 1.9519157451481453e-05, + "loss": 0.0026, + "step": 3054 + }, + { + "epoch": 0.6112, + "learning_rate": 1.952342576279833e-05, + "loss": 0.3861, + "step": 3056 + }, + { + "epoch": 0.6116, + "learning_rate": 1.9527675507709364e-05, + "loss": 0.2468, + "step": 3058 + }, + { + "epoch": 0.612, + "learning_rate": 1.953190667792947e-05, + "loss": 1.3121, + "step": 3060 + }, + { + "epoch": 0.6124, + "learning_rate": 1.953611926520976e-05, + "loss": 0.0042, + "step": 3062 + }, + { + "epoch": 0.6128, + "learning_rate": 1.9540313261337578e-05, + "loss": 0.0584, + "step": 3064 + }, + { + "epoch": 0.6132, + "learning_rate": 1.9544488658136522e-05, + "loss": 0.0109, + "step": 3066 + }, + { + "epoch": 0.6136, + "learning_rate": 1.954864544746643e-05, + "loss": 0.0116, + "step": 3068 + }, + { + "epoch": 0.614, + "learning_rate": 1.955278362122344e-05, + "loss": 0.0323, + "step": 3070 + }, + { + "epoch": 0.6144, + "learning_rate": 1.955690317133996e-05, + "loss": 0.1456, + "step": 3072 + }, + { + "epoch": 0.6148, + "learning_rate": 1.9561004089784726e-05, + "loss": 0.0341, + "step": 3074 + }, + { + "epoch": 0.6152, + "learning_rate": 1.956508636856278e-05, + "loss": 0.0001, + "step": 3076 + }, + { + "epoch": 0.6156, + "learning_rate": 1.956914999971551e-05, + "loss": 0.0102, + "step": 3078 + }, + { + "epoch": 0.616, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.0081, + "step": 3080 + }, + { + "epoch": 0.6164, + "learning_rate": 1.9577221287492368e-05, + "loss": 0.9165, + "step": 3082 + }, + { + "epoch": 0.6168, + "learning_rate": 1.95812289283811e-05, + "loss": 0.1951, + "step": 3084 + }, + { + "epoch": 0.6172, + "learning_rate": 1.958521789017376e-05, + "loss": 0.3642, + "step": 3086 + }, + { + "epoch": 0.6176, + "learning_rate": 1.958918816509367e-05, + "loss": 0.0106, + "step": 3088 + }, + { + "epoch": 0.618, + "learning_rate": 1.9593139745400575e-05, + "loss": 0.4914, + "step": 3090 + }, + { + "epoch": 0.6184, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.1374, + "step": 3092 + }, + { + "epoch": 0.6188, + "learning_rate": 1.9600986791396597e-05, + "loss": 0.2646, + "step": 3094 + }, + { + "epoch": 0.6192, + "learning_rate": 1.9604882241787496e-05, + "loss": 0.0148, + "step": 3096 + }, + { + "epoch": 0.6196, + "learning_rate": 1.9608758966968983e-05, + "loss": 0.0468, + "step": 3098 + }, + { + "epoch": 0.62, + "learning_rate": 1.9612616959383187e-05, + "loss": 0.3743, + "step": 3100 + }, + { + "epoch": 0.6204, + "learning_rate": 1.9616456211508752e-05, + "loss": 0.161, + "step": 3102 + }, + { + "epoch": 0.6208, + "learning_rate": 1.9620276715860856e-05, + "loss": 0.0643, + "step": 3104 + }, + { + "epoch": 0.6212, + "learning_rate": 1.962407846499124e-05, + "loss": 0.0431, + "step": 3106 + }, + { + "epoch": 0.6216, + "learning_rate": 1.9627861451488187e-05, + "loss": 0.0666, + "step": 3108 + }, + { + "epoch": 0.622, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.0003, + "step": 3110 + }, + { + "epoch": 0.6224, + "learning_rate": 1.963537110711789e-05, + "loss": 0.0307, + "step": 3112 + }, + { + "epoch": 0.6228, + "learning_rate": 1.9639097761610174e-05, + "loss": 0.0162, + "step": 3114 + }, + { + "epoch": 0.6232, + "learning_rate": 1.964280562418815e-05, + "loss": 0.0534, + "step": 3116 + }, + { + "epoch": 0.6236, + "learning_rate": 1.964649468762313e-05, + "loss": 0.114, + "step": 3118 + }, + { + "epoch": 0.624, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.2582, + "step": 3120 + }, + { + "epoch": 0.6244, + "learning_rate": 1.965381638833274e-05, + "loss": 0.1439, + "step": 3122 + }, + { + "epoch": 0.6248, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.0006, + "step": 3124 + }, + { + "epoch": 0.6252, + "learning_rate": 1.96610628066429e-05, + "loss": 0.6985, + "step": 3126 + }, + { + "epoch": 0.6256, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.0159, + "step": 3128 + }, + { + "epoch": 0.626, + "learning_rate": 1.9668233886044594e-05, + "loss": 0.0001, + "step": 3130 + }, + { + "epoch": 0.6264, + "learning_rate": 1.967179115615633e-05, + "loss": 0.1646, + "step": 3132 + }, + { + "epoch": 0.6268, + "learning_rate": 1.96753295706163e-05, + "loss": 0.1099, + "step": 3134 + }, + { + "epoch": 0.6272, + "learning_rate": 1.967884912252619e-05, + "loss": 0.0986, + "step": 3136 + }, + { + "epoch": 0.6276, + "learning_rate": 1.9682349805024443e-05, + "loss": 0.0054, + "step": 3138 + }, + { + "epoch": 0.628, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.3156, + "step": 3140 + }, + { + "epoch": 0.6284, + "learning_rate": 1.9689294534523833e-05, + "loss": 0.0006, + "step": 3142 + }, + { + "epoch": 0.6288, + "learning_rate": 1.969273856798585e-05, + "loss": 0.0051, + "step": 3144 + }, + { + "epoch": 0.6292, + "learning_rate": 1.969616370495806e-05, + "loss": 0.4695, + "step": 3146 + }, + { + "epoch": 0.6296, + "learning_rate": 1.9699569938762972e-05, + "loss": 0.003, + "step": 3148 + }, + { + "epoch": 0.63, + "learning_rate": 1.9702957262759964e-05, + "loss": 0.0003, + "step": 3150 + }, + { + "epoch": 0.6304, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.1729, + "step": 3152 + }, + { + "epoch": 0.6308, + "learning_rate": 1.9709675154952013e-05, + "loss": 0.7965, + "step": 3154 + }, + { + "epoch": 0.6312, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.1752, + "step": 3156 + }, + { + "epoch": 0.6316, + "learning_rate": 1.971631732914674e-05, + "loss": 0.0003, + "step": 3158 + }, + { + "epoch": 0.632, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.0009, + "step": 3160 + }, + { + "epoch": 0.6324, + "learning_rate": 1.9722883733547128e-05, + "loss": 0.001, + "step": 3162 + }, + { + "epoch": 0.6328, + "learning_rate": 1.9726138506049434e-05, + "loss": 0.0104, + "step": 3164 + }, + { + "epoch": 0.6332, + "learning_rate": 1.972937431694704e-05, + "loss": 0.14, + "step": 3166 + }, + { + "epoch": 0.6336, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.264, + "step": 3168 + }, + { + "epoch": 0.634, + "learning_rate": 1.9735789028731603e-05, + "loss": 0.0051, + "step": 3170 + }, + { + "epoch": 0.6344, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.0031, + "step": 3172 + }, + { + "epoch": 0.6348, + "learning_rate": 1.9742127818877605e-05, + "loss": 0.0016, + "step": 3174 + }, + { + "epoch": 0.6352, + "learning_rate": 1.974526872786577e-05, + "loss": 0.069, + "step": 3176 + }, + { + "epoch": 0.6356, + "learning_rate": 1.974839063795389e-05, + "loss": 0.0686, + "step": 3178 + }, + { + "epoch": 0.636, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.0003, + "step": 3180 + }, + { + "epoch": 0.6364, + "learning_rate": 1.975457743712173e-05, + "loss": 0.0005, + "step": 3182 + }, + { + "epoch": 0.6368, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.1233, + "step": 3184 + }, + { + "epoch": 0.6372, + "learning_rate": 1.976068816813523e-05, + "loss": 0.015, + "step": 3186 + }, + { + "epoch": 0.6376, + "learning_rate": 1.976371499316945e-05, + "loss": 0.0148, + "step": 3188 + }, + { + "epoch": 0.638, + "learning_rate": 1.9766722783341675e-05, + "loss": 0.0336, + "step": 3190 + }, + { + "epoch": 0.6384, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.0279, + "step": 3192 + }, + { + "epoch": 0.6388, + "learning_rate": 1.9772681235681933e-05, + "loss": 0.6088, + "step": 3194 + }, + { + "epoch": 0.6392, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.472, + "step": 3196 + }, + { + "epoch": 0.6396, + "learning_rate": 1.977856347869079e-05, + "loss": 0.0003, + "step": 3198 + }, + { + "epoch": 0.64, + "learning_rate": 1.9781476007338054e-05, + "loss": 0.3159, + "step": 3200 + }, + { + "epoch": 0.6404, + "learning_rate": 1.9784369466497333e-05, + "loss": 0.0007, + "step": 3202 + }, + { + "epoch": 0.6408, + "learning_rate": 1.978724385052766e-05, + "loss": 1.6308, + "step": 3204 + }, + { + "epoch": 0.6412, + "learning_rate": 1.97900991538253e-05, + "loss": 0.0232, + "step": 3206 + }, + { + "epoch": 0.6416, + "learning_rate": 1.9792935370823673e-05, + "loss": 0.0209, + "step": 3208 + }, + { + "epoch": 0.642, + "learning_rate": 1.979575249599344e-05, + "loss": 0.014, + "step": 3210 + }, + { + "epoch": 0.6424, + "learning_rate": 1.979855052384247e-05, + "loss": 0.0001, + "step": 3212 + }, + { + "epoch": 0.6428, + "learning_rate": 1.980132944891586e-05, + "loss": 0.0271, + "step": 3214 + }, + { + "epoch": 0.6432, + "learning_rate": 1.9804089265795956e-05, + "loss": 0.1216, + "step": 3216 + }, + { + "epoch": 0.6436, + "learning_rate": 1.9806829969102356e-05, + "loss": 0.3649, + "step": 3218 + }, + { + "epoch": 0.644, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.1669, + "step": 3220 + }, + { + "epoch": 0.6444, + "learning_rate": 1.981225401365877e-05, + "loss": 0.133, + "step": 3222 + }, + { + "epoch": 0.6448, + "learning_rate": 1.981493734433433e-05, + "loss": 1.5928, + "step": 3224 + }, + { + "epoch": 0.6452, + "learning_rate": 1.981760154028731e-05, + "loss": 0.208, + "step": 3226 + }, + { + "epoch": 0.6456, + "learning_rate": 1.982024659632372e-05, + "loss": 0.7565, + "step": 3228 + }, + { + "epoch": 0.646, + "learning_rate": 1.9822872507286887e-05, + "loss": 0.4732, + "step": 3230 + }, + { + "epoch": 0.6464, + "learning_rate": 1.9825479268057472e-05, + "loss": 0.1144, + "step": 3232 + }, + { + "epoch": 0.6468, + "learning_rate": 1.9828066873553445e-05, + "loss": 0.266, + "step": 3234 + }, + { + "epoch": 0.6472, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.0663, + "step": 3236 + }, + { + "epoch": 0.6476, + "learning_rate": 1.983318459858028e-05, + "loss": 0.3273, + "step": 3238 + }, + { + "epoch": 0.648, + "learning_rate": 1.9835714708133858e-05, + "loss": 0.1508, + "step": 3240 + }, + { + "epoch": 0.6484, + "learning_rate": 1.983822564245833e-05, + "loss": 0.2841, + "step": 3242 + }, + { + "epoch": 0.6488, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.0103, + "step": 3244 + }, + { + "epoch": 0.6492, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.0524, + "step": 3246 + }, + { + "epoch": 0.6496, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.1345, + "step": 3248 + }, + { + "epoch": 0.65, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.1762, + "step": 3250 + }, + { + "epoch": 0.6504, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.06, + "step": 3252 + }, + { + "epoch": 0.6508, + "learning_rate": 1.9852888297080785e-05, + "loss": 0.4044, + "step": 3254 + }, + { + "epoch": 0.6512, + "learning_rate": 1.985526486983063e-05, + "loss": 0.0005, + "step": 3256 + }, + { + "epoch": 0.6516, + "learning_rate": 1.9857622229237315e-05, + "loss": 0.0351, + "step": 3258 + }, + { + "epoch": 0.652, + "learning_rate": 1.985996037070505e-05, + "loss": 0.0087, + "step": 3260 + }, + { + "epoch": 0.6524, + "learning_rate": 1.986227928967551e-05, + "loss": 0.0135, + "step": 3262 + }, + { + "epoch": 0.6528, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.0725, + "step": 3264 + }, + { + "epoch": 0.6532, + "learning_rate": 1.986685944207868e-05, + "loss": 0.0345, + "step": 3266 + }, + { + "epoch": 0.6536, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.1713, + "step": 3268 + }, + { + "epoch": 0.654, + "learning_rate": 1.9871362650729877e-05, + "loss": 0.0958, + "step": 3270 + }, + { + "epoch": 0.6544, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.3652, + "step": 3272 + }, + { + "epoch": 0.6548, + "learning_rate": 1.9875788880512183e-05, + "loss": 0.1685, + "step": 3274 + }, + { + "epoch": 0.6552, + "learning_rate": 1.987797311751759e-05, + "loss": 0.4958, + "step": 3276 + }, + { + "epoch": 0.6556, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.0712, + "step": 3278 + }, + { + "epoch": 0.656, + "learning_rate": 1.9882283814465528e-05, + "loss": 0.301, + "step": 3280 + }, + { + "epoch": 0.6564, + "learning_rate": 1.9884410266004134e-05, + "loss": 0.549, + "step": 3282 + }, + { + "epoch": 0.6568, + "learning_rate": 1.988651744737914e-05, + "loss": 0.5946, + "step": 3284 + }, + { + "epoch": 0.6572, + "learning_rate": 1.9888605354482494e-05, + "loss": 0.7187, + "step": 3286 + }, + { + "epoch": 0.6576, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.0188, + "step": 3288 + }, + { + "epoch": 0.658, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.2394, + "step": 3290 + }, + { + "epoch": 0.6584, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.0129, + "step": 3292 + }, + { + "epoch": 0.6588, + "learning_rate": 1.989676415933351e-05, + "loss": 0.003, + "step": 3294 + }, + { + "epoch": 0.6592, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.0994, + "step": 3296 + }, + { + "epoch": 0.6596, + "learning_rate": 1.9900727812082174e-05, + "loss": 0.0194, + "step": 3298 + }, + { + "epoch": 0.66, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.4363, + "step": 3300 + }, + { + "epoch": 0.6604, + "learning_rate": 1.9904614256966514e-05, + "loss": 0.0726, + "step": 3302 + }, + { + "epoch": 0.6608, + "learning_rate": 1.9906528516965014e-05, + "loss": 0.2747, + "step": 3304 + }, + { + "epoch": 0.6612, + "learning_rate": 1.9908423463679246e-05, + "loss": 0.0175, + "step": 3306 + }, + { + "epoch": 0.6616, + "learning_rate": 1.9910299093414926e-05, + "loss": 0.0265, + "step": 3308 + }, + { + "epoch": 0.662, + "learning_rate": 1.991215540251542e-05, + "loss": 0.0332, + "step": 3310 + }, + { + "epoch": 0.6624, + "learning_rate": 1.9913992387361744e-05, + "loss": 0.0011, + "step": 3312 + }, + { + "epoch": 0.6628, + "learning_rate": 1.9915810044372618e-05, + "loss": 0.0005, + "step": 3314 + }, + { + "epoch": 0.6632, + "learning_rate": 1.9917608370004414e-05, + "loss": 0.0001, + "step": 3316 + }, + { + "epoch": 0.6636, + "learning_rate": 1.9919387360751216e-05, + "loss": 0.1326, + "step": 3318 + }, + { + "epoch": 0.664, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.0325, + "step": 3320 + }, + { + "epoch": 0.6644, + "learning_rate": 1.992288732375458e-05, + "loss": 1.0544, + "step": 3322 + }, + { + "epoch": 0.6648, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.0011, + "step": 3324 + }, + { + "epoch": 0.6652, + "learning_rate": 1.992630990608929e-05, + "loss": 0.245, + "step": 3326 + }, + { + "epoch": 0.6656, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.0142, + "step": 3328 + }, + { + "epoch": 0.666, + "learning_rate": 1.992965508106537e-05, + "loss": 0.0046, + "step": 3330 + }, + { + "epoch": 0.6664, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.7281, + "step": 3332 + }, + { + "epoch": 0.6668, + "learning_rate": 1.993292282259647e-05, + "loss": 0.0668, + "step": 3334 + }, + { + "epoch": 0.6672, + "learning_rate": 1.9934527647833276e-05, + "loss": 0.9586, + "step": 3336 + }, + { + "epoch": 0.6676, + "learning_rate": 1.9936113105200085e-05, + "loss": 0.7117, + "step": 3338 + }, + { + "epoch": 0.668, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.1318, + "step": 3340 + }, + { + "epoch": 0.6684, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.1626, + "step": 3342 + }, + { + "epoch": 0.6688, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.1677, + "step": 3344 + }, + { + "epoch": 0.6692, + "learning_rate": 1.9942261194715236e-05, + "loss": 0.0814, + "step": 3346 + }, + { + "epoch": 0.6696, + "learning_rate": 1.994374976712348e-05, + "loss": 0.0885, + "step": 3348 + }, + { + "epoch": 0.67, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.1775, + "step": 3350 + }, + { + "epoch": 0.6704, + "learning_rate": 1.994666875152874e-05, + "loss": 0.5984, + "step": 3352 + }, + { + "epoch": 0.6708, + "learning_rate": 1.994809915783505e-05, + "loss": 0.0219, + "step": 3354 + }, + { + "epoch": 0.6712, + "learning_rate": 1.9949510169813003e-05, + "loss": 0.273, + "step": 3356 + }, + { + "epoch": 0.6716, + "learning_rate": 1.9950901784711768e-05, + "loss": 0.2023, + "step": 3358 + }, + { + "epoch": 0.672, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.0067, + "step": 3360 + }, + { + "epoch": 0.6724, + "learning_rate": 1.995362681245744e-05, + "loss": 0.0005, + "step": 3362 + }, + { + "epoch": 0.6728, + "learning_rate": 1.995496021999177e-05, + "loss": 0.1205, + "step": 3364 + }, + { + "epoch": 0.6732, + "learning_rate": 1.995627421982176e-05, + "loss": 0.2867, + "step": 3366 + }, + { + "epoch": 0.6736, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.0426, + "step": 3368 + }, + { + "epoch": 0.674, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.0116, + "step": 3370 + }, + { + "epoch": 0.6744, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.646, + "step": 3372 + }, + { + "epoch": 0.6748, + "learning_rate": 1.9961336091431725e-05, + "loss": 0.1936, + "step": 3374 + }, + { + "epoch": 0.6752, + "learning_rate": 1.996255301507125e-05, + "loss": 1.0606, + "step": 3376 + }, + { + "epoch": 0.6756, + "learning_rate": 1.9963750516203884e-05, + "loss": 0.1486, + "step": 3378 + }, + { + "epoch": 0.676, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.0018, + "step": 3380 + }, + { + "epoch": 0.6764, + "learning_rate": 1.996608724164801e-05, + "loss": 0.556, + "step": 3382 + }, + { + "epoch": 0.6768, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.5288, + "step": 3384 + }, + { + "epoch": 0.6772, + "learning_rate": 1.9968346249541848e-05, + "loss": 0.3205, + "step": 3386 + }, + { + "epoch": 0.6776, + "learning_rate": 1.996944660387867e-05, + "loss": 0.1923, + "step": 3388 + }, + { + "epoch": 0.678, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.6128, + "step": 3390 + }, + { + "epoch": 0.6784, + "learning_rate": 1.997158900260614e-05, + "loss": 0.5157, + "step": 3392 + }, + { + "epoch": 0.6788, + "learning_rate": 1.997263104282007e-05, + "loss": 0.1757, + "step": 3394 + }, + { + "epoch": 0.6792, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.5485, + "step": 3396 + }, + { + "epoch": 0.6796, + "learning_rate": 1.9974656794790777e-05, + "loss": 0.003, + "step": 3398 + }, + { + "epoch": 0.68, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.0124, + "step": 3400 + }, + { + "epoch": 0.6804, + "learning_rate": 1.99766047623841e-05, + "loss": 0.2882, + "step": 3402 + }, + { + "epoch": 0.6808, + "learning_rate": 1.997754957226847e-05, + "loss": 0.7195, + "step": 3404 + }, + { + "epoch": 0.6812, + "learning_rate": 1.9978474930409396e-05, + "loss": 0.3042, + "step": 3406 + }, + { + "epoch": 0.6816, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.3522, + "step": 3408 + }, + { + "epoch": 0.682, + "learning_rate": 1.9980267284282718e-05, + "loss": 0.3105, + "step": 3410 + }, + { + "epoch": 0.6824, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.5383, + "step": 3412 + }, + { + "epoch": 0.6828, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.2634, + "step": 3414 + }, + { + "epoch": 0.6832, + "learning_rate": 1.998280988314872e-05, + "loss": 0.2483, + "step": 3416 + }, + { + "epoch": 0.6836, + "learning_rate": 1.9983618494271825e-05, + "loss": 0.0467, + "step": 3418 + }, + { + "epoch": 0.684, + "learning_rate": 1.998440764181981e-05, + "loss": 0.3059, + "step": 3420 + }, + { + "epoch": 0.6844, + "learning_rate": 1.99851773242542e-05, + "loss": 0.1919, + "step": 3422 + }, + { + "epoch": 0.6848, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.0001, + "step": 3424 + }, + { + "epoch": 0.6852, + "learning_rate": 1.9986658287817992e-05, + "loss": 0.0003, + "step": 3426 + }, + { + "epoch": 0.6856, + "learning_rate": 1.998736956606018e-05, + "loss": 0.0006, + "step": 3428 + }, + { + "epoch": 0.686, + "learning_rate": 1.9988061373414342e-05, + "loss": 0.5121, + "step": 3430 + }, + { + "epoch": 0.6864, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.1634, + "step": 3432 + }, + { + "epoch": 0.6868, + "learning_rate": 1.9989386570101712e-05, + "loss": 0.0503, + "step": 3434 + }, + { + "epoch": 0.6872, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.0797, + "step": 3436 + }, + { + "epoch": 0.6876, + "learning_rate": 1.9990633867545956e-05, + "loss": 0.211, + "step": 3438 + }, + { + "epoch": 0.688, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.1942, + "step": 3440 + }, + { + "epoch": 0.6884, + "learning_rate": 1.9991803256020393e-05, + "loss": 0.3825, + "step": 3442 + }, + { + "epoch": 0.6888, + "learning_rate": 1.999235873152047e-05, + "loss": 0.2183, + "step": 3444 + }, + { + "epoch": 0.6892, + "learning_rate": 1.9992894726405894e-05, + "loss": 0.0134, + "step": 3446 + }, + { + "epoch": 0.6896, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.4511, + "step": 3448 + }, + { + "epoch": 0.69, + "learning_rate": 1.999390827019096e-05, + "loss": 0.3143, + "step": 3450 + }, + { + "epoch": 0.6904, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.0173, + "step": 3452 + }, + { + "epoch": 0.6908, + "learning_rate": 1.999484387947177e-05, + "loss": 0.029, + "step": 3454 + }, + { + "epoch": 0.6912, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.3901, + "step": 3456 + }, + { + "epoch": 0.6916, + "learning_rate": 1.9995701546952252e-05, + "loss": 0.1518, + "step": 3458 + }, + { + "epoch": 0.692, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.0464, + "step": 3460 + }, + { + "epoch": 0.6924, + "learning_rate": 1.9996481265944146e-05, + "loss": 0.0646, + "step": 3462 + }, + { + "epoch": 0.6928, + "learning_rate": 1.9996841892833e-05, + "loss": 0.2702, + "step": 3464 + }, + { + "epoch": 0.6932, + "learning_rate": 1.999718303036705e-05, + "loss": 0.0909, + "step": 3466 + }, + { + "epoch": 0.6936, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.1361, + "step": 3468 + }, + { + "epoch": 0.694, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.0127, + "step": 3470 + }, + { + "epoch": 0.6944, + "learning_rate": 1.999808950037968e-05, + "loss": 0.2244, + "step": 3472 + }, + { + "epoch": 0.6948, + "learning_rate": 1.9998352674223816e-05, + "loss": 0.0798, + "step": 3474 + }, + { + "epoch": 0.6952, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.1803, + "step": 3476 + }, + { + "epoch": 0.6956, + "learning_rate": 1.999882054453657e-05, + "loss": 0.0001, + "step": 3478 + }, + { + "epoch": 0.696, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.048, + "step": 3480 + }, + { + "epoch": 0.6964, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.0036, + "step": 3482 + }, + { + "epoch": 0.6968, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.0069, + "step": 3484 + }, + { + "epoch": 0.6972, + "learning_rate": 1.99995223636881e-05, + "loss": 0.1365, + "step": 3486 + }, + { + "epoch": 0.6976, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.0207, + "step": 3488 + }, + { + "epoch": 0.698, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.0904, + "step": 3490 + }, + { + "epoch": 0.6984, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.1599, + "step": 3492 + }, + { + "epoch": 0.6988, + "learning_rate": 1.9999912270311376e-05, + "loss": 0.0218, + "step": 3494 + }, + { + "epoch": 0.6992, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.0331, + "step": 3496 + }, + { + "epoch": 0.6996, + "learning_rate": 1.9999990252244153e-05, + "loss": 0.0947, + "step": 3498 + }, + { + "epoch": 0.7, + "learning_rate": 2e-05, + "loss": 0.8469, + "step": 3500 + }, + { + "epoch": 0.7004, + "learning_rate": 1.9999990252244153e-05, + "loss": 0.3097, + "step": 3502 + }, + { + "epoch": 0.7008, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.2055, + "step": 3504 + }, + { + "epoch": 0.7012, + "learning_rate": 1.9999912270311376e-05, + "loss": 0.2727, + "step": 3506 + }, + { + "epoch": 0.7016, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.3166, + "step": 3508 + }, + { + "epoch": 0.702, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.9651, + "step": 3510 + }, + { + "epoch": 0.7024, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.4809, + "step": 3512 + }, + { + "epoch": 0.7028, + "learning_rate": 1.99995223636881e-05, + "loss": 0.1349, + "step": 3514 + }, + { + "epoch": 0.7032, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.7051, + "step": 3516 + }, + { + "epoch": 0.7036, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.027, + "step": 3518 + }, + { + "epoch": 0.704, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.1648, + "step": 3520 + }, + { + "epoch": 0.7044, + "learning_rate": 1.999882054453657e-05, + "loss": 0.0001, + "step": 3522 + }, + { + "epoch": 0.7048, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.013, + "step": 3524 + }, + { + "epoch": 0.7052, + "learning_rate": 1.9998352674223816e-05, + "loss": 0.1439, + "step": 3526 + }, + { + "epoch": 0.7056, + "learning_rate": 1.999808950037968e-05, + "loss": 0.6233, + "step": 3528 + }, + { + "epoch": 0.706, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.0723, + "step": 3530 + }, + { + "epoch": 0.7064, + "learning_rate": 1.9997504677881224e-05, + "loss": 1.0701, + "step": 3532 + }, + { + "epoch": 0.7068, + "learning_rate": 1.999718303036705e-05, + "loss": 0.0816, + "step": 3534 + }, + { + "epoch": 0.7072, + "learning_rate": 1.9996841892833e-05, + "loss": 0.0033, + "step": 3536 + }, + { + "epoch": 0.7076, + "learning_rate": 1.9996481265944146e-05, + "loss": 0.0956, + "step": 3538 + }, + { + "epoch": 0.708, + "learning_rate": 1.9996101150403547e-05, + "loss": 0.1286, + "step": 3540 + }, + { + "epoch": 0.7084, + "learning_rate": 1.9995701546952252e-05, + "loss": 0.5853, + "step": 3542 + }, + { + "epoch": 0.7088, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.0103, + "step": 3544 + }, + { + "epoch": 0.7092, + "learning_rate": 1.9994843879471766e-05, + "loss": 0.0376, + "step": 3546 + }, + { + "epoch": 0.7096, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.1232, + "step": 3548 + }, + { + "epoch": 0.71, + "learning_rate": 1.999390827019096e-05, + "loss": 0.0875, + "step": 3550 + }, + { + "epoch": 0.7104, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.3198, + "step": 3552 + }, + { + "epoch": 0.7108, + "learning_rate": 1.9992894726405898e-05, + "loss": 0.0575, + "step": 3554 + }, + { + "epoch": 0.7112, + "learning_rate": 1.999235873152047e-05, + "loss": 0.0217, + "step": 3556 + }, + { + "epoch": 0.7116, + "learning_rate": 1.9991803256020393e-05, + "loss": 0.563, + "step": 3558 + }, + { + "epoch": 0.712, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.6102, + "step": 3560 + }, + { + "epoch": 0.7124, + "learning_rate": 1.9990633867545956e-05, + "loss": 0.0003, + "step": 3562 + }, + { + "epoch": 0.7128, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.7152, + "step": 3564 + }, + { + "epoch": 0.7132, + "learning_rate": 1.9989386570101716e-05, + "loss": 0.0108, + "step": 3566 + }, + { + "epoch": 0.7136, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.3166, + "step": 3568 + }, + { + "epoch": 0.714, + "learning_rate": 1.9988061373414342e-05, + "loss": 0.3644, + "step": 3570 + }, + { + "epoch": 0.7144, + "learning_rate": 1.998736956606018e-05, + "loss": 0.3846, + "step": 3572 + }, + { + "epoch": 0.7148, + "learning_rate": 1.998665828781799e-05, + "loss": 0.2043, + "step": 3574 + }, + { + "epoch": 0.7152, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.0162, + "step": 3576 + }, + { + "epoch": 0.7156, + "learning_rate": 1.99851773242542e-05, + "loss": 0.0988, + "step": 3578 + }, + { + "epoch": 0.716, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.0493, + "step": 3580 + }, + { + "epoch": 0.7164, + "learning_rate": 1.9983618494271825e-05, + "loss": 0.9358, + "step": 3582 + }, + { + "epoch": 0.7168, + "learning_rate": 1.998280988314872e-05, + "loss": 0.0735, + "step": 3584 + }, + { + "epoch": 0.7172, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.3973, + "step": 3586 + }, + { + "epoch": 0.7176, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.2159, + "step": 3588 + }, + { + "epoch": 0.718, + "learning_rate": 1.9980267284282718e-05, + "loss": 0.0986, + "step": 3590 + }, + { + "epoch": 0.7184, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.0054, + "step": 3592 + }, + { + "epoch": 0.7188, + "learning_rate": 1.9978474930409396e-05, + "loss": 0.2373, + "step": 3594 + }, + { + "epoch": 0.7192, + "learning_rate": 1.9977549572268467e-05, + "loss": 0.4883, + "step": 3596 + }, + { + "epoch": 0.7196, + "learning_rate": 1.99766047623841e-05, + "loss": 0.0235, + "step": 3598 + }, + { + "epoch": 0.72, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.1398, + "step": 3600 + }, + { + "epoch": 0.7204, + "learning_rate": 1.9974656794790777e-05, + "loss": 0.4607, + "step": 3602 + }, + { + "epoch": 0.7208, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.003, + "step": 3604 + }, + { + "epoch": 0.7212, + "learning_rate": 1.9972631042820074e-05, + "loss": 0.0008, + "step": 3606 + }, + { + "epoch": 0.7216, + "learning_rate": 1.997158900260614e-05, + "loss": 0.1906, + "step": 3608 + }, + { + "epoch": 0.722, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.1756, + "step": 3610 + }, + { + "epoch": 0.7224, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.1937, + "step": 3612 + }, + { + "epoch": 0.7228, + "learning_rate": 1.9968346249541848e-05, + "loss": 0.0356, + "step": 3614 + }, + { + "epoch": 0.7232, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.1373, + "step": 3616 + }, + { + "epoch": 0.7236, + "learning_rate": 1.996608724164801e-05, + "loss": 0.3918, + "step": 3618 + }, + { + "epoch": 0.724, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.0592, + "step": 3620 + }, + { + "epoch": 0.7244, + "learning_rate": 1.9963750516203887e-05, + "loss": 0.2264, + "step": 3622 + }, + { + "epoch": 0.7248, + "learning_rate": 1.996255301507125e-05, + "loss": 0.0074, + "step": 3624 + }, + { + "epoch": 0.7252, + "learning_rate": 1.9961336091431728e-05, + "loss": 0.1139, + "step": 3626 + }, + { + "epoch": 0.7256, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.0627, + "step": 3628 + }, + { + "epoch": 0.726, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.498, + "step": 3630 + }, + { + "epoch": 0.7264, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.0011, + "step": 3632 + }, + { + "epoch": 0.7268, + "learning_rate": 1.995627421982176e-05, + "loss": 0.0129, + "step": 3634 + }, + { + "epoch": 0.7272, + "learning_rate": 1.995496021999177e-05, + "loss": 0.0101, + "step": 3636 + }, + { + "epoch": 0.7276, + "learning_rate": 1.995362681245744e-05, + "loss": 0.007, + "step": 3638 + }, + { + "epoch": 0.728, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.0051, + "step": 3640 + }, + { + "epoch": 0.7284, + "learning_rate": 1.9950901784711768e-05, + "loss": 0.011, + "step": 3642 + }, + { + "epoch": 0.7288, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.5666, + "step": 3644 + }, + { + "epoch": 0.7292, + "learning_rate": 1.994809915783505e-05, + "loss": 0.0396, + "step": 3646 + }, + { + "epoch": 0.7296, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.0264, + "step": 3648 + }, + { + "epoch": 0.73, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.0001, + "step": 3650 + }, + { + "epoch": 0.7304, + "learning_rate": 1.994374976712348e-05, + "loss": 0.0095, + "step": 3652 + }, + { + "epoch": 0.7308, + "learning_rate": 1.9942261194715236e-05, + "loss": 0.0013, + "step": 3654 + }, + { + "epoch": 0.7312, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.1055, + "step": 3656 + }, + { + "epoch": 0.7316, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.0913, + "step": 3658 + }, + { + "epoch": 0.732, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.3088, + "step": 3660 + }, + { + "epoch": 0.7324, + "learning_rate": 1.993611310520009e-05, + "loss": 0.0015, + "step": 3662 + }, + { + "epoch": 0.7328, + "learning_rate": 1.993452764783328e-05, + "loss": 0.1332, + "step": 3664 + }, + { + "epoch": 0.7332, + "learning_rate": 1.993292282259647e-05, + "loss": 0.0084, + "step": 3666 + }, + { + "epoch": 0.7336, + "learning_rate": 1.9931298632618352e-05, + "loss": 0.2949, + "step": 3668 + }, + { + "epoch": 0.734, + "learning_rate": 1.9929655081065373e-05, + "loss": 0.0754, + "step": 3670 + }, + { + "epoch": 0.7344, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.3942, + "step": 3672 + }, + { + "epoch": 0.7348, + "learning_rate": 1.992630990608929e-05, + "loss": 0.0562, + "step": 3674 + }, + { + "epoch": 0.7352, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.2272, + "step": 3676 + }, + { + "epoch": 0.7356, + "learning_rate": 1.992288732375458e-05, + "loss": 0.0077, + "step": 3678 + }, + { + "epoch": 0.736, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.0583, + "step": 3680 + }, + { + "epoch": 0.7364, + "learning_rate": 1.9919387360751216e-05, + "loss": 0.0034, + "step": 3682 + }, + { + "epoch": 0.7368, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.2223, + "step": 3684 + }, + { + "epoch": 0.7372, + "learning_rate": 1.991581004437262e-05, + "loss": 0.3983, + "step": 3686 + }, + { + "epoch": 0.7376, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.8432, + "step": 3688 + }, + { + "epoch": 0.738, + "learning_rate": 1.991215540251542e-05, + "loss": 0.0141, + "step": 3690 + }, + { + "epoch": 0.7384, + "learning_rate": 1.9910299093414932e-05, + "loss": 0.0017, + "step": 3692 + }, + { + "epoch": 0.7388, + "learning_rate": 1.9908423463679246e-05, + "loss": 0.3313, + "step": 3694 + }, + { + "epoch": 0.7392, + "learning_rate": 1.990652851696501e-05, + "loss": 0.071, + "step": 3696 + }, + { + "epoch": 0.7396, + "learning_rate": 1.9904614256966517e-05, + "loss": 0.5388, + "step": 3698 + }, + { + "epoch": 0.74, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.2677, + "step": 3700 + }, + { + "epoch": 0.7404, + "learning_rate": 1.9900727812082177e-05, + "loss": 0.3018, + "step": 3702 + }, + { + "epoch": 0.7408, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.3159, + "step": 3704 + }, + { + "epoch": 0.7412, + "learning_rate": 1.9896764159333518e-05, + "loss": 0.0005, + "step": 3706 + }, + { + "epoch": 0.7416, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.2973, + "step": 3708 + }, + { + "epoch": 0.742, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.009, + "step": 3710 + }, + { + "epoch": 0.7424, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.0015, + "step": 3712 + }, + { + "epoch": 0.7428, + "learning_rate": 1.9888605354482494e-05, + "loss": 0.0955, + "step": 3714 + }, + { + "epoch": 0.7432, + "learning_rate": 1.9886517447379143e-05, + "loss": 0.0438, + "step": 3716 + }, + { + "epoch": 0.7436, + "learning_rate": 1.9884410266004134e-05, + "loss": 0.2379, + "step": 3718 + }, + { + "epoch": 0.744, + "learning_rate": 1.988228381446553e-05, + "loss": 0.0052, + "step": 3720 + }, + { + "epoch": 0.7444, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.0001, + "step": 3722 + }, + { + "epoch": 0.7448, + "learning_rate": 1.987797311751759e-05, + "loss": 0.0117, + "step": 3724 + }, + { + "epoch": 0.7452, + "learning_rate": 1.9875788880512183e-05, + "loss": 0.1617, + "step": 3726 + }, + { + "epoch": 0.7456, + "learning_rate": 1.9873585390151007e-05, + "loss": 0.353, + "step": 3728 + }, + { + "epoch": 0.746, + "learning_rate": 1.987136265072988e-05, + "loss": 0.1115, + "step": 3730 + }, + { + "epoch": 0.7464, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.14, + "step": 3732 + }, + { + "epoch": 0.7468, + "learning_rate": 1.9866859442078685e-05, + "loss": 0.0589, + "step": 3734 + }, + { + "epoch": 0.7472, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.1804, + "step": 3736 + }, + { + "epoch": 0.7476, + "learning_rate": 1.986227928967551e-05, + "loss": 0.0026, + "step": 3738 + }, + { + "epoch": 0.748, + "learning_rate": 1.985996037070505e-05, + "loss": 0.4037, + "step": 3740 + }, + { + "epoch": 0.7484, + "learning_rate": 1.985762222923732e-05, + "loss": 0.5356, + "step": 3742 + }, + { + "epoch": 0.7488, + "learning_rate": 1.985526486983063e-05, + "loss": 0.1451, + "step": 3744 + }, + { + "epoch": 0.7492, + "learning_rate": 1.985288829708079e-05, + "loss": 0.0003, + "step": 3746 + }, + { + "epoch": 0.7496, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.0007, + "step": 3748 + }, + { + "epoch": 0.75, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.0238, + "step": 3750 + }, + { + "epoch": 0.7504, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.0006, + "step": 3752 + }, + { + "epoch": 0.7508, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.4434, + "step": 3754 + }, + { + "epoch": 0.7512, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.3612, + "step": 3756 + }, + { + "epoch": 0.7516, + "learning_rate": 1.983822564245833e-05, + "loss": 0.016, + "step": 3758 + }, + { + "epoch": 0.752, + "learning_rate": 1.983571470813386e-05, + "loss": 0.0017, + "step": 3760 + }, + { + "epoch": 0.7524, + "learning_rate": 1.9833184598580276e-05, + "loss": 0.1392, + "step": 3762 + }, + { + "epoch": 0.7528, + "learning_rate": 1.983063531873016e-05, + "loss": 0.0079, + "step": 3764 + }, + { + "epoch": 0.7532, + "learning_rate": 1.982806687355345e-05, + "loss": 0.2077, + "step": 3766 + }, + { + "epoch": 0.7536, + "learning_rate": 1.982547926805747e-05, + "loss": 0.0244, + "step": 3768 + }, + { + "epoch": 0.754, + "learning_rate": 1.982287250728689e-05, + "loss": 0.0052, + "step": 3770 + }, + { + "epoch": 0.7544, + "learning_rate": 1.9820246596323724e-05, + "loss": 0.0025, + "step": 3772 + }, + { + "epoch": 0.7548, + "learning_rate": 1.981760154028731e-05, + "loss": 0.4241, + "step": 3774 + }, + { + "epoch": 0.7552, + "learning_rate": 1.981493734433433e-05, + "loss": 0.1751, + "step": 3776 + }, + { + "epoch": 0.7556, + "learning_rate": 1.9812254013658773e-05, + "loss": 0.0022, + "step": 3778 + }, + { + "epoch": 0.756, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.0663, + "step": 3780 + }, + { + "epoch": 0.7564, + "learning_rate": 1.9806829969102356e-05, + "loss": 0.1747, + "step": 3782 + }, + { + "epoch": 0.7568, + "learning_rate": 1.9804089265795963e-05, + "loss": 0.0016, + "step": 3784 + }, + { + "epoch": 0.7572, + "learning_rate": 1.9801329448915863e-05, + "loss": 0.005, + "step": 3786 + }, + { + "epoch": 0.7576, + "learning_rate": 1.979855052384247e-05, + "loss": 0.0297, + "step": 3788 + }, + { + "epoch": 0.758, + "learning_rate": 1.979575249599344e-05, + "loss": 0.0012, + "step": 3790 + }, + { + "epoch": 0.7584, + "learning_rate": 1.979293537082368e-05, + "loss": 0.0247, + "step": 3792 + }, + { + "epoch": 0.7588, + "learning_rate": 1.9790099153825303e-05, + "loss": 0.1938, + "step": 3794 + }, + { + "epoch": 0.7592, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.0022, + "step": 3796 + }, + { + "epoch": 0.7596, + "learning_rate": 1.978436946649733e-05, + "loss": 0.3068, + "step": 3798 + }, + { + "epoch": 0.76, + "learning_rate": 1.978147600733806e-05, + "loss": 0.1748, + "step": 3800 + }, + { + "epoch": 0.7604, + "learning_rate": 1.9778563478690793e-05, + "loss": 0.0237, + "step": 3802 + }, + { + "epoch": 0.7608, + "learning_rate": 1.977563188623365e-05, + "loss": 0.0049, + "step": 3804 + }, + { + "epoch": 0.7612, + "learning_rate": 1.977268123568194e-05, + "loss": 0.0493, + "step": 3806 + }, + { + "epoch": 0.7616, + "learning_rate": 1.9769711532788086e-05, + "loss": 0.005, + "step": 3808 + }, + { + "epoch": 0.762, + "learning_rate": 1.9766722783341682e-05, + "loss": 0.0013, + "step": 3810 + }, + { + "epoch": 0.7624, + "learning_rate": 1.9763714993169448e-05, + "loss": 0.0059, + "step": 3812 + }, + { + "epoch": 0.7628, + "learning_rate": 1.9760688168135236e-05, + "loss": 0.0929, + "step": 3814 + }, + { + "epoch": 0.7632, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.1973, + "step": 3816 + }, + { + "epoch": 0.7636, + "learning_rate": 1.9754577437121733e-05, + "loss": 0.0013, + "step": 3818 + }, + { + "epoch": 0.764, + "learning_rate": 1.9751493543055638e-05, + "loss": 0.0324, + "step": 3820 + }, + { + "epoch": 0.7644, + "learning_rate": 1.974839063795389e-05, + "loss": 0.0232, + "step": 3822 + }, + { + "epoch": 0.7648, + "learning_rate": 1.9745268727865774e-05, + "loss": 2.1863, + "step": 3824 + }, + { + "epoch": 0.7652, + "learning_rate": 1.97421278188776e-05, + "loss": 0.0051, + "step": 3826 + }, + { + "epoch": 0.7656, + "learning_rate": 1.973896791711276e-05, + "loss": 0.0011, + "step": 3828 + }, + { + "epoch": 0.766, + "learning_rate": 1.9735789028731607e-05, + "loss": 0.0211, + "step": 3830 + }, + { + "epoch": 0.7664, + "learning_rate": 1.9732591159931567e-05, + "loss": 0.1417, + "step": 3832 + }, + { + "epoch": 0.7668, + "learning_rate": 1.9729374316947037e-05, + "loss": 0.1472, + "step": 3834 + }, + { + "epoch": 0.7672, + "learning_rate": 1.972613850604944e-05, + "loss": 0.0886, + "step": 3836 + }, + { + "epoch": 0.7676, + "learning_rate": 1.972288373354713e-05, + "loss": 0.4068, + "step": 3838 + }, + { + "epoch": 0.768, + "learning_rate": 1.9719610005785463e-05, + "loss": 0.4341, + "step": 3840 + }, + { + "epoch": 0.7684, + "learning_rate": 1.9716317329146743e-05, + "loss": 0.0399, + "step": 3842 + }, + { + "epoch": 0.7688, + "learning_rate": 1.9713005710050206e-05, + "loss": 0.6258, + "step": 3844 + }, + { + "epoch": 0.7692, + "learning_rate": 1.9709675154952017e-05, + "loss": 0.1102, + "step": 3846 + }, + { + "epoch": 0.7696, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.7711, + "step": 3848 + }, + { + "epoch": 0.77, + "learning_rate": 1.970295726275997e-05, + "loss": 0.0074, + "step": 3850 + }, + { + "epoch": 0.7704, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.0243, + "step": 3852 + }, + { + "epoch": 0.7708, + "learning_rate": 1.969616370495806e-05, + "loss": 0.0001, + "step": 3854 + }, + { + "epoch": 0.7712, + "learning_rate": 1.969273856798586e-05, + "loss": 0.0462, + "step": 3856 + }, + { + "epoch": 0.7716, + "learning_rate": 1.9689294534523836e-05, + "loss": 0.1111, + "step": 3858 + }, + { + "epoch": 0.772, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.0686, + "step": 3860 + }, + { + "epoch": 0.7724, + "learning_rate": 1.9682349805024447e-05, + "loss": 0.2509, + "step": 3862 + }, + { + "epoch": 0.7728, + "learning_rate": 1.9678849122526195e-05, + "loss": 0.0111, + "step": 3864 + }, + { + "epoch": 0.7732, + "learning_rate": 1.9675329570616302e-05, + "loss": 0.2763, + "step": 3866 + }, + { + "epoch": 0.7736, + "learning_rate": 1.967179115615633e-05, + "loss": 0.0292, + "step": 3868 + }, + { + "epoch": 0.774, + "learning_rate": 1.966823388604459e-05, + "loss": 0.2623, + "step": 3870 + }, + { + "epoch": 0.7744, + "learning_rate": 1.966465776721618e-05, + "loss": 0.0002, + "step": 3872 + }, + { + "epoch": 0.7748, + "learning_rate": 1.9661062806642906e-05, + "loss": 0.1848, + "step": 3874 + }, + { + "epoch": 0.7752, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.0642, + "step": 3876 + }, + { + "epoch": 0.7756, + "learning_rate": 1.9653816388332743e-05, + "loss": 0.2445, + "step": 3878 + }, + { + "epoch": 0.776, + "learning_rate": 1.965016494472312e-05, + "loss": 0.0149, + "step": 3880 + }, + { + "epoch": 0.7764, + "learning_rate": 1.964649468762313e-05, + "loss": 0.0313, + "step": 3882 + }, + { + "epoch": 0.7768, + "learning_rate": 1.964280562418815e-05, + "loss": 0.195, + "step": 3884 + }, + { + "epoch": 0.7772, + "learning_rate": 1.963909776161018e-05, + "loss": 0.0091, + "step": 3886 + }, + { + "epoch": 0.7776, + "learning_rate": 1.963537110711789e-05, + "loss": 0.0005, + "step": 3888 + }, + { + "epoch": 0.778, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.0105, + "step": 3890 + }, + { + "epoch": 0.7784, + "learning_rate": 1.9627861451488194e-05, + "loss": 0.1129, + "step": 3892 + }, + { + "epoch": 0.7788, + "learning_rate": 1.9624078464991246e-05, + "loss": 0.7939, + "step": 3894 + }, + { + "epoch": 0.7792, + "learning_rate": 1.962027671586086e-05, + "loss": 0.4734, + "step": 3896 + }, + { + "epoch": 0.7796, + "learning_rate": 1.9616456211508756e-05, + "loss": 0.0151, + "step": 3898 + }, + { + "epoch": 0.78, + "learning_rate": 1.9612616959383194e-05, + "loss": 0.0017, + "step": 3900 + }, + { + "epoch": 0.7804, + "learning_rate": 1.9608758966968987e-05, + "loss": 0.0436, + "step": 3902 + }, + { + "epoch": 0.7808, + "learning_rate": 1.96048822417875e-05, + "loss": 0.0285, + "step": 3904 + }, + { + "epoch": 0.7812, + "learning_rate": 1.9600986791396597e-05, + "loss": 0.1526, + "step": 3906 + }, + { + "epoch": 0.7816, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.0001, + "step": 3908 + }, + { + "epoch": 0.782, + "learning_rate": 1.9593139745400578e-05, + "loss": 0.0018, + "step": 3910 + }, + { + "epoch": 0.7824, + "learning_rate": 1.9589188165093666e-05, + "loss": 0.0163, + "step": 3912 + }, + { + "epoch": 0.7828, + "learning_rate": 1.9585217890173765e-05, + "loss": 0.0064, + "step": 3914 + }, + { + "epoch": 0.7832, + "learning_rate": 1.95812289283811e-05, + "loss": 0.1456, + "step": 3916 + }, + { + "epoch": 0.7836, + "learning_rate": 1.957722128749237e-05, + "loss": 0.1865, + "step": 3918 + }, + { + "epoch": 0.784, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.0005, + "step": 3920 + }, + { + "epoch": 0.7844, + "learning_rate": 1.9569149999715518e-05, + "loss": 0.3004, + "step": 3922 + }, + { + "epoch": 0.7848, + "learning_rate": 1.9565086368562784e-05, + "loss": 0.0057, + "step": 3924 + }, + { + "epoch": 0.7852, + "learning_rate": 1.9561004089784722e-05, + "loss": 0.0027, + "step": 3926 + }, + { + "epoch": 0.7856, + "learning_rate": 1.9556903171339966e-05, + "loss": 0.0064, + "step": 3928 + }, + { + "epoch": 0.786, + "learning_rate": 1.955278362122344e-05, + "loss": 0.0002, + "step": 3930 + }, + { + "epoch": 0.7864, + "learning_rate": 1.954864544746643e-05, + "loss": 0.2249, + "step": 3932 + }, + { + "epoch": 0.7868, + "learning_rate": 1.954448865813652e-05, + "loss": 0.0009, + "step": 3934 + }, + { + "epoch": 0.7872, + "learning_rate": 1.9540313261337585e-05, + "loss": 0.5068, + "step": 3936 + }, + { + "epoch": 0.7876, + "learning_rate": 1.9536119265209763e-05, + "loss": 0.2153, + "step": 3938 + }, + { + "epoch": 0.788, + "learning_rate": 1.9531906677929472e-05, + "loss": 1.0544, + "step": 3940 + }, + { + "epoch": 0.7884, + "learning_rate": 1.9527675507709364e-05, + "loss": 0.6563, + "step": 3942 + }, + { + "epoch": 0.7888, + "learning_rate": 1.9523425762798335e-05, + "loss": 0.085, + "step": 3944 + }, + { + "epoch": 0.7892, + "learning_rate": 1.9519157451481456e-05, + "loss": 0.0003, + "step": 3946 + }, + { + "epoch": 0.7896, + "learning_rate": 1.9514870582080035e-05, + "loss": 0.0001, + "step": 3948 + }, + { + "epoch": 0.79, + "learning_rate": 1.9510565162951545e-05, + "loss": 0.0348, + "step": 3950 + }, + { + "epoch": 0.7904, + "learning_rate": 1.95062412024896e-05, + "loss": 0.1687, + "step": 3952 + }, + { + "epoch": 0.7908, + "learning_rate": 1.950189870912401e-05, + "loss": 0.0925, + "step": 3954 + }, + { + "epoch": 0.7912, + "learning_rate": 1.9497537691320667e-05, + "loss": 0.2666, + "step": 3956 + }, + { + "epoch": 0.7916, + "learning_rate": 1.9493158157581617e-05, + "loss": 0.9611, + "step": 3958 + }, + { + "epoch": 0.792, + "learning_rate": 1.948876011644497e-05, + "loss": 0.3584, + "step": 3960 + }, + { + "epoch": 0.7924, + "learning_rate": 1.948434357648493e-05, + "loss": 0.409, + "step": 3962 + }, + { + "epoch": 0.7928, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.3075, + "step": 3964 + }, + { + "epoch": 0.7932, + "learning_rate": 1.9475455034571843e-05, + "loss": 0.0005, + "step": 3966 + }, + { + "epoch": 0.7936, + "learning_rate": 1.9470983049947443e-05, + "loss": 0.0358, + "step": 3968 + }, + { + "epoch": 0.794, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.1655, + "step": 3970 + }, + { + "epoch": 0.7944, + "learning_rate": 1.9461983696954767e-05, + "loss": 0.0526, + "step": 3972 + }, + { + "epoch": 0.7948, + "learning_rate": 1.9457456346131175e-05, + "loss": 0.0207, + "step": 3974 + }, + { + "epoch": 0.7952, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.658, + "step": 3976 + }, + { + "epoch": 0.7956, + "learning_rate": 1.9448346339960984e-05, + "loss": 0.1448, + "step": 3978 + }, + { + "epoch": 0.796, + "learning_rate": 1.9443763702374818e-05, + "loss": 0.2804, + "step": 3980 + }, + { + "epoch": 0.7964, + "learning_rate": 1.9439162653688066e-05, + "loss": 0.31, + "step": 3982 + }, + { + "epoch": 0.7968, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.1336, + "step": 3984 + }, + { + "epoch": 0.7972, + "learning_rate": 1.9429905358928655e-05, + "loss": 1.1505, + "step": 3986 + }, + { + "epoch": 0.7976, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.4268, + "step": 3988 + }, + { + "epoch": 0.798, + "learning_rate": 1.942057452787297e-05, + "loss": 0.0447, + "step": 3990 + }, + { + "epoch": 0.7984, + "learning_rate": 1.94158815589503e-05, + "loss": 0.1648, + "step": 3992 + }, + { + "epoch": 0.7988, + "learning_rate": 1.941117023328473e-05, + "loss": 0.0051, + "step": 3994 + }, + { + "epoch": 0.7992, + "learning_rate": 1.940644056006122e-05, + "loss": 0.5072, + "step": 3996 + }, + { + "epoch": 0.7996, + "learning_rate": 1.94016925485005e-05, + "loss": 0.0903, + "step": 3998 + }, + { + "epoch": 0.8, + "learning_rate": 1.939692620785909e-05, + "loss": 0.1421, + "step": 4000 + }, + { + "epoch": 0.8004, + "learning_rate": 1.939214154742919e-05, + "loss": 0.0949, + "step": 4002 + }, + { + "epoch": 0.8008, + "learning_rate": 1.9387338576538746e-05, + "loss": 0.0534, + "step": 4004 + }, + { + "epoch": 0.8012, + "learning_rate": 1.9382517304551393e-05, + "loss": 0.0018, + "step": 4006 + }, + { + "epoch": 0.8016, + "learning_rate": 1.9377677740866464e-05, + "loss": 0.0991, + "step": 4008 + }, + { + "epoch": 0.802, + "learning_rate": 1.9372819894918922e-05, + "loss": 1.0465, + "step": 4010 + }, + { + "epoch": 0.8024, + "learning_rate": 1.936794377617938e-05, + "loss": 0.1485, + "step": 4012 + }, + { + "epoch": 0.8028, + "learning_rate": 1.9363049394154102e-05, + "loss": 0.0152, + "step": 4014 + }, + { + "epoch": 0.8032, + "learning_rate": 1.9358136758384917e-05, + "loss": 0.3122, + "step": 4016 + }, + { + "epoch": 0.8036, + "learning_rate": 1.935320587844926e-05, + "loss": 0.1237, + "step": 4018 + }, + { + "epoch": 0.804, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.1335, + "step": 4020 + }, + { + "epoch": 0.8044, + "learning_rate": 1.934328942456613e-05, + "loss": 0.0389, + "step": 4022 + }, + { + "epoch": 0.8048, + "learning_rate": 1.9338303869951273e-05, + "loss": 0.0799, + "step": 4024 + }, + { + "epoch": 0.8052, + "learning_rate": 1.9333300109835186e-05, + "loss": 0.0212, + "step": 4026 + }, + { + "epoch": 0.8056, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.1685, + "step": 4028 + }, + { + "epoch": 0.806, + "learning_rate": 1.9323238012155125e-05, + "loss": 0.2622, + "step": 4030 + }, + { + "epoch": 0.8064, + "learning_rate": 1.931817969420773e-05, + "loss": 0.001, + "step": 4032 + }, + { + "epoch": 0.8068, + "learning_rate": 1.93131032099922e-05, + "loss": 0.6579, + "step": 4034 + }, + { + "epoch": 0.8072, + "learning_rate": 1.930800856940543e-05, + "loss": 0.0248, + "step": 4036 + }, + { + "epoch": 0.8076, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.0977, + "step": 4038 + }, + { + "epoch": 0.808, + "learning_rate": 1.929776485888252e-05, + "loss": 0.0853, + "step": 4040 + }, + { + "epoch": 0.8084, + "learning_rate": 1.9292615808917024e-05, + "loss": 0.2765, + "step": 4042 + }, + { + "epoch": 0.8088, + "learning_rate": 1.9287448642521517e-05, + "loss": 0.1602, + "step": 4044 + }, + { + "epoch": 0.8092, + "learning_rate": 1.9282263369769637e-05, + "loss": 0.1241, + "step": 4046 + }, + { + "epoch": 0.8096, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.1687, + "step": 4048 + }, + { + "epoch": 0.81, + "learning_rate": 1.927183854566788e-05, + "loss": 0.0422, + "step": 4050 + }, + { + "epoch": 0.8104, + "learning_rate": 1.9266599014641727e-05, + "loss": 0.1756, + "step": 4052 + }, + { + "epoch": 0.8108, + "learning_rate": 1.9261341417906622e-05, + "loss": 0.0176, + "step": 4054 + }, + { + "epoch": 0.8112, + "learning_rate": 1.925606576571252e-05, + "loss": 0.2383, + "step": 4056 + }, + { + "epoch": 0.8116, + "learning_rate": 1.925077206834459e-05, + "loss": 0.0214, + "step": 4058 + }, + { + "epoch": 0.812, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.4176, + "step": 4060 + }, + { + "epoch": 0.8124, + "learning_rate": 1.924013057940367e-05, + "loss": 0.1834, + "step": 4062 + }, + { + "epoch": 0.8128, + "learning_rate": 1.923478280857682e-05, + "loss": 0.7571, + "step": 4064 + }, + { + "epoch": 0.8132, + "learning_rate": 1.922941703406836e-05, + "loss": 0.011, + "step": 4066 + }, + { + "epoch": 0.8136, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.1035, + "step": 4068 + }, + { + "epoch": 0.814, + "learning_rate": 1.9218631515885007e-05, + "loss": 0.3601, + "step": 4070 + }, + { + "epoch": 0.8144, + "learning_rate": 1.9213211793237066e-05, + "loss": 0.1155, + "step": 4072 + }, + { + "epoch": 0.8148, + "learning_rate": 1.9207774108961276e-05, + "loss": 0.1405, + "step": 4074 + }, + { + "epoch": 0.8152, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.0029, + "step": 4076 + }, + { + "epoch": 0.8156, + "learning_rate": 1.9196844897965387e-05, + "loss": 0.0461, + "step": 4078 + }, + { + "epoch": 0.816, + "learning_rate": 1.919135339255235e-05, + "loss": 0.05, + "step": 4080 + }, + { + "epoch": 0.8164, + "learning_rate": 1.9185843968125546e-05, + "loss": 0.0124, + "step": 4082 + }, + { + "epoch": 0.8168, + "learning_rate": 1.918031663542588e-05, + "loss": 0.0954, + "step": 4084 + }, + { + "epoch": 0.8172, + "learning_rate": 1.917477140522919e-05, + "loss": 0.1922, + "step": 4086 + }, + { + "epoch": 0.8176, + "learning_rate": 1.916920828834617e-05, + "loss": 0.1427, + "step": 4088 + }, + { + "epoch": 0.818, + "learning_rate": 1.9163627295622394e-05, + "loss": 0.0028, + "step": 4090 + }, + { + "epoch": 0.8184, + "learning_rate": 1.9158028437938313e-05, + "loss": 0.0047, + "step": 4092 + }, + { + "epoch": 0.8188, + "learning_rate": 1.9152411726209183e-05, + "loss": 0.0113, + "step": 4094 + }, + { + "epoch": 0.8192, + "learning_rate": 1.9146777171385057e-05, + "loss": 0.0249, + "step": 4096 + }, + { + "epoch": 0.8196, + "learning_rate": 1.914112478445079e-05, + "loss": 0.0319, + "step": 4098 + }, + { + "epoch": 0.82, + "learning_rate": 1.913545457642601e-05, + "loss": 0.1962, + "step": 4100 + }, + { + "epoch": 0.8204, + "learning_rate": 1.9129766558365082e-05, + "loss": 0.007, + "step": 4102 + }, + { + "epoch": 0.8208, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.0014, + "step": 4104 + }, + { + "epoch": 0.8212, + "learning_rate": 1.911833713652576e-05, + "loss": 0.0023, + "step": 4106 + }, + { + "epoch": 0.8216, + "learning_rate": 1.911259575502963e-05, + "loss": 0.8054, + "step": 4108 + }, + { + "epoch": 0.822, + "learning_rate": 1.9106836608061775e-05, + "loss": 0.1066, + "step": 4110 + }, + { + "epoch": 0.8224, + "learning_rate": 1.910105970684996e-05, + "loss": 0.0781, + "step": 4112 + }, + { + "epoch": 0.8228, + "learning_rate": 1.909526506265654e-05, + "loss": 0.2527, + "step": 4114 + }, + { + "epoch": 0.8232, + "learning_rate": 1.908945268677849e-05, + "loss": 0.3597, + "step": 4116 + }, + { + "epoch": 0.8236, + "learning_rate": 1.9083622590547313e-05, + "loss": 0.0173, + "step": 4118 + }, + { + "epoch": 0.824, + "learning_rate": 1.9077774785329085e-05, + "loss": 0.0041, + "step": 4120 + }, + { + "epoch": 0.8244, + "learning_rate": 1.9071909282524422e-05, + "loss": 0.0802, + "step": 4122 + }, + { + "epoch": 0.8248, + "learning_rate": 1.9066026093568383e-05, + "loss": 0.0069, + "step": 4124 + }, + { + "epoch": 0.8252, + "learning_rate": 1.9060125229930576e-05, + "loss": 0.1252, + "step": 4126 + }, + { + "epoch": 0.8256, + "learning_rate": 1.9054206703115013e-05, + "loss": 0.0014, + "step": 4128 + }, + { + "epoch": 0.826, + "learning_rate": 1.9048270524660203e-05, + "loss": 0.142, + "step": 4130 + }, + { + "epoch": 0.8264, + "learning_rate": 1.9042316706138994e-05, + "loss": 0.0647, + "step": 4132 + }, + { + "epoch": 0.8268, + "learning_rate": 1.9036345259158664e-05, + "loss": 0.0009, + "step": 4134 + }, + { + "epoch": 0.8272, + "learning_rate": 1.903035619536087e-05, + "loss": 0.0674, + "step": 4136 + }, + { + "epoch": 0.8276, + "learning_rate": 1.9024349526421603e-05, + "loss": 0.0579, + "step": 4138 + }, + { + "epoch": 0.828, + "learning_rate": 1.901832526405114e-05, + "loss": 0.1096, + "step": 4140 + }, + { + "epoch": 0.8284, + "learning_rate": 1.9012283419994112e-05, + "loss": 0.2075, + "step": 4142 + }, + { + "epoch": 0.8288, + "learning_rate": 1.9006224006029414e-05, + "loss": 0.0039, + "step": 4144 + }, + { + "epoch": 0.8292, + "learning_rate": 1.9000147033970148e-05, + "loss": 0.0402, + "step": 4146 + }, + { + "epoch": 0.8296, + "learning_rate": 1.899405251566371e-05, + "loss": 0.2482, + "step": 4148 + }, + { + "epoch": 0.83, + "learning_rate": 1.8987940462991666e-05, + "loss": 0.0255, + "step": 4150 + }, + { + "epoch": 0.8304, + "learning_rate": 1.8981810887869797e-05, + "loss": 0.018, + "step": 4152 + }, + { + "epoch": 0.8308, + "learning_rate": 1.8975663802247978e-05, + "loss": 0.528, + "step": 4154 + }, + { + "epoch": 0.8312, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.0382, + "step": 4156 + }, + { + "epoch": 0.8316, + "learning_rate": 1.8963317147474943e-05, + "loss": 0.054, + "step": 4158 + }, + { + "epoch": 0.832, + "learning_rate": 1.8957117602394133e-05, + "loss": 0.0028, + "step": 4160 + }, + { + "epoch": 0.8324, + "learning_rate": 1.8950900594954233e-05, + "loss": 0.6649, + "step": 4162 + }, + { + "epoch": 0.8328, + "learning_rate": 1.8944666137275596e-05, + "loss": 0.1854, + "step": 4164 + }, + { + "epoch": 0.8332, + "learning_rate": 1.8938414241512644e-05, + "loss": 0.0119, + "step": 4166 + }, + { + "epoch": 0.8336, + "learning_rate": 1.8932144919853744e-05, + "loss": 0.141, + "step": 4168 + }, + { + "epoch": 0.834, + "learning_rate": 1.892585818452125e-05, + "loss": 0.0233, + "step": 4170 + }, + { + "epoch": 0.8344, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.3071, + "step": 4172 + }, + { + "epoch": 0.8348, + "learning_rate": 1.891323252189474e-05, + "loss": 0.0499, + "step": 4174 + }, + { + "epoch": 0.8352, + "learning_rate": 1.890689361921507e-05, + "loss": 0.0051, + "step": 4176 + }, + { + "epoch": 0.8356, + "learning_rate": 1.8900537352090523e-05, + "loss": 0.0005, + "step": 4178 + }, + { + "epoch": 0.836, + "learning_rate": 1.8894163732912986e-05, + "loss": 0.0008, + "step": 4180 + }, + { + "epoch": 0.8364, + "learning_rate": 1.8887772774108122e-05, + "loss": 0.0823, + "step": 4182 + }, + { + "epoch": 0.8368, + "learning_rate": 1.8881364488135445e-05, + "loss": 0.0006, + "step": 4184 + }, + { + "epoch": 0.8372, + "learning_rate": 1.887493888748825e-05, + "loss": 0.0002, + "step": 4186 + }, + { + "epoch": 0.8376, + "learning_rate": 1.886849598469357e-05, + "loss": 0.098, + "step": 4188 + }, + { + "epoch": 0.838, + "learning_rate": 1.886203579231215e-05, + "loss": 0.001, + "step": 4190 + }, + { + "epoch": 0.8384, + "learning_rate": 1.8855558322938492e-05, + "loss": 0.313, + "step": 4192 + }, + { + "epoch": 0.8388, + "learning_rate": 1.8849063589200754e-05, + "loss": 0.2036, + "step": 4194 + }, + { + "epoch": 0.8392, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.0957, + "step": 4196 + }, + { + "epoch": 0.8396, + "learning_rate": 1.8836022379313884e-05, + "loss": 0.0003, + "step": 4198 + }, + { + "epoch": 0.84, + "learning_rate": 1.8829475928589265e-05, + "loss": 0.2348, + "step": 4200 + }, + { + "epoch": 0.8404, + "learning_rate": 1.882291226434954e-05, + "loss": 0.0924, + "step": 4202 + }, + { + "epoch": 0.8408, + "learning_rate": 1.8816331399390874e-05, + "loss": 0.1197, + "step": 4204 + }, + { + "epoch": 0.8412, + "learning_rate": 1.880973334654301e-05, + "loss": 0.458, + "step": 4206 + }, + { + "epoch": 0.8416, + "learning_rate": 1.88031181186692e-05, + "loss": 0.1972, + "step": 4208 + }, + { + "epoch": 0.842, + "learning_rate": 1.8796485728666172e-05, + "loss": 0.3486, + "step": 4210 + }, + { + "epoch": 0.8424, + "learning_rate": 1.8789836189464092e-05, + "loss": 0.0002, + "step": 4212 + }, + { + "epoch": 0.8428, + "learning_rate": 1.8783169514026574e-05, + "loss": 0.5437, + "step": 4214 + }, + { + "epoch": 0.8432, + "learning_rate": 1.877648571535068e-05, + "loss": 0.0079, + "step": 4216 + }, + { + "epoch": 0.8436, + "learning_rate": 1.8769784806466775e-05, + "loss": 0.2148, + "step": 4218 + }, + { + "epoch": 0.844, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.6789, + "step": 4220 + }, + { + "epoch": 0.8444, + "learning_rate": 1.8756331710363375e-05, + "loss": 0.0007, + "step": 4222 + }, + { + "epoch": 0.8448, + "learning_rate": 1.8749579549371387e-05, + "loss": 0.1483, + "step": 4224 + }, + { + "epoch": 0.8452, + "learning_rate": 1.8742810330626338e-05, + "loss": 1.0774, + "step": 4226 + }, + { + "epoch": 0.8456, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.0069, + "step": 4228 + }, + { + "epoch": 0.846, + "learning_rate": 1.8729220772698106e-05, + "loss": 0.0186, + "step": 4230 + }, + { + "epoch": 0.8464, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.7625, + "step": 4232 + }, + { + "epoch": 0.8468, + "learning_rate": 1.8715563142552758e-05, + "loss": 0.3905, + "step": 4234 + }, + { + "epoch": 0.8472, + "learning_rate": 1.8708708833660748e-05, + "loss": 0.2127, + "step": 4236 + }, + { + "epoch": 0.8476, + "learning_rate": 1.870183754669526e-05, + "loss": 0.0026, + "step": 4238 + }, + { + "epoch": 0.848, + "learning_rate": 1.8694949295052198e-05, + "loss": 0.0059, + "step": 4240 + }, + { + "epoch": 0.8484, + "learning_rate": 1.8688044092160558e-05, + "loss": 0.2723, + "step": 4242 + }, + { + "epoch": 0.8488, + "learning_rate": 1.868112195148239e-05, + "loss": 0.0524, + "step": 4244 + }, + { + "epoch": 0.8492, + "learning_rate": 1.867418288651278e-05, + "loss": 0.0003, + "step": 4246 + }, + { + "epoch": 0.8496, + "learning_rate": 1.866722691077977e-05, + "loss": 0.0288, + "step": 4248 + }, + { + "epoch": 0.85, + "learning_rate": 1.8660254037844384e-05, + "loss": 0.2198, + "step": 4250 + }, + { + "epoch": 0.8504, + "learning_rate": 1.8653264281300626e-05, + "loss": 0.0045, + "step": 4252 + }, + { + "epoch": 0.8508, + "learning_rate": 1.8646257654775357e-05, + "loss": 0.2092, + "step": 4254 + }, + { + "epoch": 0.8512, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.0094, + "step": 4256 + }, + { + "epoch": 0.8516, + "learning_rate": 1.8632193846452267e-05, + "loss": 0.2925, + "step": 4258 + }, + { + "epoch": 0.852, + "learning_rate": 1.8625136692072587e-05, + "loss": 0.0004, + "step": 4260 + }, + { + "epoch": 0.8524, + "learning_rate": 1.861806272254755e-05, + "loss": 0.1676, + "step": 4262 + }, + { + "epoch": 0.8528, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.0015, + "step": 4264 + }, + { + "epoch": 0.8532, + "learning_rate": 1.8603864393258547e-05, + "loss": 0.0109, + "step": 4266 + }, + { + "epoch": 0.8536, + "learning_rate": 1.8596740061174912e-05, + "loss": 0.8508, + "step": 4268 + }, + { + "epoch": 0.854, + "learning_rate": 1.8589598969306646e-05, + "loss": 0.4392, + "step": 4270 + }, + { + "epoch": 0.8544, + "learning_rate": 1.858244113157566e-05, + "loss": 0.0649, + "step": 4272 + }, + { + "epoch": 0.8548, + "learning_rate": 1.8575266561936533e-05, + "loss": 0.1276, + "step": 4274 + }, + { + "epoch": 0.8552, + "learning_rate": 1.8568075274376432e-05, + "loss": 0.0828, + "step": 4276 + }, + { + "epoch": 0.8556, + "learning_rate": 1.8560867282915164e-05, + "loss": 0.0174, + "step": 4278 + }, + { + "epoch": 0.856, + "learning_rate": 1.8553642601605083e-05, + "loss": 0.0044, + "step": 4280 + }, + { + "epoch": 0.8564, + "learning_rate": 1.8546401244531034e-05, + "loss": 0.0334, + "step": 4282 + }, + { + "epoch": 0.8568, + "learning_rate": 1.8539143225810457e-05, + "loss": 0.0024, + "step": 4284 + }, + { + "epoch": 0.8572, + "learning_rate": 1.85318685595932e-05, + "loss": 0.0731, + "step": 4286 + }, + { + "epoch": 0.8576, + "learning_rate": 1.852457726006163e-05, + "loss": 0.0004, + "step": 4288 + }, + { + "epoch": 0.858, + "learning_rate": 1.8517269341430485e-05, + "loss": 0.0869, + "step": 4290 + }, + { + "epoch": 0.8584, + "learning_rate": 1.8509944817946917e-05, + "loss": 0.161, + "step": 4292 + }, + { + "epoch": 0.8588, + "learning_rate": 1.8502603703890484e-05, + "loss": 0.2454, + "step": 4294 + }, + { + "epoch": 0.8592, + "learning_rate": 1.8495246013573064e-05, + "loss": 0.0268, + "step": 4296 + }, + { + "epoch": 0.8596, + "learning_rate": 1.8487871761338817e-05, + "loss": 0.001, + "step": 4298 + }, + { + "epoch": 0.86, + "learning_rate": 1.848048096156426e-05, + "loss": 0.0005, + "step": 4300 + }, + { + "epoch": 0.8604, + "learning_rate": 1.847307362865813e-05, + "loss": 0.002, + "step": 4302 + }, + { + "epoch": 0.8608, + "learning_rate": 1.8465649777061387e-05, + "loss": 0.0004, + "step": 4304 + }, + { + "epoch": 0.8612, + "learning_rate": 1.8458209421247208e-05, + "loss": 0.0242, + "step": 4306 + }, + { + "epoch": 0.8616, + "learning_rate": 1.8450752575720967e-05, + "loss": 0.0927, + "step": 4308 + }, + { + "epoch": 0.862, + "learning_rate": 1.8443279255020163e-05, + "loss": 0.1406, + "step": 4310 + }, + { + "epoch": 0.8624, + "learning_rate": 1.843578947371439e-05, + "loss": 0.0003, + "step": 4312 + }, + { + "epoch": 0.8628, + "learning_rate": 1.842828324640539e-05, + "loss": 0.6394, + "step": 4314 + }, + { + "epoch": 0.8632, + "learning_rate": 1.8420760587726935e-05, + "loss": 0.4797, + "step": 4316 + }, + { + "epoch": 0.8636, + "learning_rate": 1.8413221512344808e-05, + "loss": 0.0625, + "step": 4318 + }, + { + "epoch": 0.864, + "learning_rate": 1.8405666034956846e-05, + "loss": 0.1479, + "step": 4320 + }, + { + "epoch": 0.8644, + "learning_rate": 1.8398094170292826e-05, + "loss": 0.0071, + "step": 4322 + }, + { + "epoch": 0.8648, + "learning_rate": 1.8390505933114507e-05, + "loss": 0.0043, + "step": 4324 + }, + { + "epoch": 0.8652, + "learning_rate": 1.838290133821552e-05, + "loss": 0.0077, + "step": 4326 + }, + { + "epoch": 0.8656, + "learning_rate": 1.8375280400421414e-05, + "loss": 0.101, + "step": 4328 + }, + { + "epoch": 0.866, + "learning_rate": 1.8367643134589613e-05, + "loss": 0.2648, + "step": 4330 + }, + { + "epoch": 0.8664, + "learning_rate": 1.8359989555609365e-05, + "loss": 0.0126, + "step": 4332 + }, + { + "epoch": 0.8668, + "learning_rate": 1.835231967840168e-05, + "loss": 0.3035, + "step": 4334 + }, + { + "epoch": 0.8672, + "learning_rate": 1.834463351791939e-05, + "loss": 0.2705, + "step": 4336 + }, + { + "epoch": 0.8676, + "learning_rate": 1.8336931089147082e-05, + "loss": 0.1211, + "step": 4338 + }, + { + "epoch": 0.868, + "learning_rate": 1.8329212407101006e-05, + "loss": 0.0011, + "step": 4340 + }, + { + "epoch": 0.8684, + "learning_rate": 1.8321477486829128e-05, + "loss": 0.0138, + "step": 4342 + }, + { + "epoch": 0.8688, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.0174, + "step": 4344 + }, + { + "epoch": 0.8692, + "learning_rate": 1.8305958991958135e-05, + "loss": 0.0509, + "step": 4346 + }, + { + "epoch": 0.8696, + "learning_rate": 1.82981754476131e-05, + "loss": 0.2119, + "step": 4348 + }, + { + "epoch": 0.87, + "learning_rate": 1.8290375725550417e-05, + "loss": 0.0006, + "step": 4350 + }, + { + "epoch": 0.8704, + "learning_rate": 1.8282559840976053e-05, + "loss": 0.001, + "step": 4352 + }, + { + "epoch": 0.8708, + "learning_rate": 1.827472780912744e-05, + "loss": 1.1903, + "step": 4354 + }, + { + "epoch": 0.8712, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.167, + "step": 4356 + }, + { + "epoch": 0.8716, + "learning_rate": 1.825901536471478e-05, + "loss": 0.0134, + "step": 4358 + }, + { + "epoch": 0.872, + "learning_rate": 1.8251134982782966e-05, + "loss": 0.0011, + "step": 4360 + }, + { + "epoch": 0.8724, + "learning_rate": 1.824323851484126e-05, + "loss": 0.0027, + "step": 4362 + }, + { + "epoch": 0.8728, + "learning_rate": 1.823532597628428e-05, + "loss": 1.8731, + "step": 4364 + }, + { + "epoch": 0.8732, + "learning_rate": 1.8227397382537893e-05, + "loss": 0.0103, + "step": 4366 + }, + { + "epoch": 0.8736, + "learning_rate": 1.8219452749059336e-05, + "loss": 0.0153, + "step": 4368 + }, + { + "epoch": 0.874, + "learning_rate": 1.8211492091337048e-05, + "loss": 0.0142, + "step": 4370 + }, + { + "epoch": 0.8744, + "learning_rate": 1.8203515424890734e-05, + "loss": 0.0243, + "step": 4372 + }, + { + "epoch": 0.8748, + "learning_rate": 1.8195522765271346e-05, + "loss": 0.3709, + "step": 4374 + }, + { + "epoch": 0.8752, + "learning_rate": 1.8187514128060956e-05, + "loss": 1.107, + "step": 4376 + }, + { + "epoch": 0.8756, + "learning_rate": 1.8179489528872804e-05, + "loss": 0.1935, + "step": 4378 + }, + { + "epoch": 0.876, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.3094, + "step": 4380 + }, + { + "epoch": 0.8764, + "learning_rate": 1.816339250717185e-05, + "loss": 0.3114, + "step": 4382 + }, + { + "epoch": 0.8768, + "learning_rate": 1.8155320116040983e-05, + "loss": 0.0202, + "step": 4384 + }, + { + "epoch": 0.8772, + "learning_rate": 1.814723182569625e-05, + "loss": 0.1087, + "step": 4386 + }, + { + "epoch": 0.8776, + "learning_rate": 1.8139127651906193e-05, + "loss": 0.455, + "step": 4388 + }, + { + "epoch": 0.878, + "learning_rate": 1.813100761047029e-05, + "loss": 0.0282, + "step": 4390 + }, + { + "epoch": 0.8784, + "learning_rate": 1.8122871717218974e-05, + "loss": 0.1348, + "step": 4392 + }, + { + "epoch": 0.8788, + "learning_rate": 1.8114719988013612e-05, + "loss": 0.1972, + "step": 4394 + }, + { + "epoch": 0.8792, + "learning_rate": 1.8106552438746413e-05, + "loss": 0.0004, + "step": 4396 + }, + { + "epoch": 0.8796, + "learning_rate": 1.8098369085340404e-05, + "loss": 0.6462, + "step": 4398 + }, + { + "epoch": 0.88, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.0171, + "step": 4400 + }, + { + "epoch": 0.8804, + "learning_rate": 1.8081955029958265e-05, + "loss": 0.0037, + "step": 4402 + }, + { + "epoch": 0.8808, + "learning_rate": 1.807372435998219e-05, + "loss": 0.0311, + "step": 4404 + }, + { + "epoch": 0.8812, + "learning_rate": 1.806547794986733e-05, + "loss": 0.0981, + "step": 4406 + }, + { + "epoch": 0.8816, + "learning_rate": 1.8057215815690487e-05, + "loss": 0.016, + "step": 4408 + }, + { + "epoch": 0.882, + "learning_rate": 1.8048937973559148e-05, + "loss": 0.183, + "step": 4410 + }, + { + "epoch": 0.8824, + "learning_rate": 1.8040644439611355e-05, + "loss": 0.7314, + "step": 4412 + }, + { + "epoch": 0.8828, + "learning_rate": 1.8032335230015777e-05, + "loss": 0.3102, + "step": 4414 + }, + { + "epoch": 0.8832, + "learning_rate": 1.8024010360971665e-05, + "loss": 0.4388, + "step": 4416 + }, + { + "epoch": 0.8836, + "learning_rate": 1.8015669848708774e-05, + "loss": 0.0881, + "step": 4418 + }, + { + "epoch": 0.884, + "learning_rate": 1.8007313709487345e-05, + "loss": 0.0366, + "step": 4420 + }, + { + "epoch": 0.8844, + "learning_rate": 1.7998941959598093e-05, + "loss": 0.0609, + "step": 4422 + }, + { + "epoch": 0.8848, + "learning_rate": 1.7990554615362207e-05, + "loss": 0.0412, + "step": 4424 + }, + { + "epoch": 0.8852, + "learning_rate": 1.7982151693131213e-05, + "loss": 0.0622, + "step": 4426 + }, + { + "epoch": 0.8856, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.1778, + "step": 4428 + }, + { + "epoch": 0.886, + "learning_rate": 1.7965299180241963e-05, + "loss": 0.0058, + "step": 4430 + }, + { + "epoch": 0.8864, + "learning_rate": 1.7956849622438568e-05, + "loss": 0.1898, + "step": 4432 + }, + { + "epoch": 0.8868, + "learning_rate": 1.794838455234966e-05, + "loss": 0.1076, + "step": 4434 + }, + { + "epoch": 0.8872, + "learning_rate": 1.7939903986478357e-05, + "loss": 0.0028, + "step": 4436 + }, + { + "epoch": 0.8876, + "learning_rate": 1.7931407941357938e-05, + "loss": 0.033, + "step": 4438 + }, + { + "epoch": 0.888, + "learning_rate": 1.7922896433551913e-05, + "loss": 0.0166, + "step": 4440 + }, + { + "epoch": 0.8884, + "learning_rate": 1.7914369479653864e-05, + "loss": 0.3376, + "step": 4442 + }, + { + "epoch": 0.8888, + "learning_rate": 1.7905827096287525e-05, + "loss": 0.0162, + "step": 4444 + }, + { + "epoch": 0.8892, + "learning_rate": 1.7897269300106752e-05, + "loss": 0.3797, + "step": 4446 + }, + { + "epoch": 0.8896, + "learning_rate": 1.7888696107795347e-05, + "loss": 0.0297, + "step": 4448 + }, + { + "epoch": 0.89, + "learning_rate": 1.7880107536067228e-05, + "loss": 0.3437, + "step": 4450 + }, + { + "epoch": 0.8904, + "learning_rate": 1.787150360166623e-05, + "loss": 0.0087, + "step": 4452 + }, + { + "epoch": 0.8908, + "learning_rate": 1.78628843213662e-05, + "loss": 0.5396, + "step": 4454 + }, + { + "epoch": 0.8912, + "learning_rate": 1.7854249711970826e-05, + "loss": 0.001, + "step": 4456 + }, + { + "epoch": 0.8916, + "learning_rate": 1.7845599790313732e-05, + "loss": 0.1215, + "step": 4458 + }, + { + "epoch": 0.892, + "learning_rate": 1.783693457325841e-05, + "loss": 0.0062, + "step": 4460 + }, + { + "epoch": 0.8924, + "learning_rate": 1.782825407769811e-05, + "loss": 0.0298, + "step": 4462 + }, + { + "epoch": 0.8928, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.2679, + "step": 4464 + }, + { + "epoch": 0.8932, + "learning_rate": 1.7810847318784632e-05, + "loss": 0.172, + "step": 4466 + }, + { + "epoch": 0.8936, + "learning_rate": 1.780212108936685e-05, + "loss": 0.3208, + "step": 4468 + }, + { + "epoch": 0.894, + "learning_rate": 1.7793379649314743e-05, + "loss": 0.1105, + "step": 4470 + }, + { + "epoch": 0.8944, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.0049, + "step": 4472 + }, + { + "epoch": 0.8948, + "learning_rate": 1.777585120550481e-05, + "loss": 0.1211, + "step": 4474 + }, + { + "epoch": 0.8952, + "learning_rate": 1.7767064235919594e-05, + "loss": 0.2464, + "step": 4476 + }, + { + "epoch": 0.8956, + "learning_rate": 1.77582621240452e-05, + "loss": 0.0054, + "step": 4478 + }, + { + "epoch": 0.896, + "learning_rate": 1.77494448870418e-05, + "loss": 0.1512, + "step": 4480 + }, + { + "epoch": 0.8964, + "learning_rate": 1.774061254209907e-05, + "loss": 0.035, + "step": 4482 + }, + { + "epoch": 0.8968, + "learning_rate": 1.773176510643608e-05, + "loss": 0.0465, + "step": 4484 + }, + { + "epoch": 0.8972, + "learning_rate": 1.7722902597301388e-05, + "loss": 0.0991, + "step": 4486 + }, + { + "epoch": 0.8976, + "learning_rate": 1.7714025031972894e-05, + "loss": 0.0014, + "step": 4488 + }, + { + "epoch": 0.898, + "learning_rate": 1.77051324277579e-05, + "loss": 0.0166, + "step": 4490 + }, + { + "epoch": 0.8984, + "learning_rate": 1.769622480199295e-05, + "loss": 0.0096, + "step": 4492 + }, + { + "epoch": 0.8988, + "learning_rate": 1.7687302172043926e-05, + "loss": 0.1988, + "step": 4494 + }, + { + "epoch": 0.8992, + "learning_rate": 1.7678364555305982e-05, + "loss": 0.0461, + "step": 4496 + }, + { + "epoch": 0.8996, + "learning_rate": 1.7669411969203424e-05, + "loss": 0.2068, + "step": 4498 + }, + { + "epoch": 0.9, + "learning_rate": 1.7660444431189777e-05, + "loss": 0.0314, + "step": 4500 + }, + { + "epoch": 0.9004, + "learning_rate": 1.765146195874774e-05, + "loss": 0.529, + "step": 4502 + }, + { + "epoch": 0.9008, + "learning_rate": 1.76424645693891e-05, + "loss": 0.0037, + "step": 4504 + }, + { + "epoch": 0.9012, + "learning_rate": 1.7633452280654696e-05, + "loss": 0.1756, + "step": 4506 + }, + { + "epoch": 0.9016, + "learning_rate": 1.762442511011448e-05, + "loss": 0.0182, + "step": 4508 + }, + { + "epoch": 0.902, + "learning_rate": 1.761538307536738e-05, + "loss": 0.014, + "step": 4510 + }, + { + "epoch": 0.9024, + "learning_rate": 1.7606326194041285e-05, + "loss": 0.9144, + "step": 4512 + }, + { + "epoch": 0.9028, + "learning_rate": 1.759725448379305e-05, + "loss": 0.5133, + "step": 4514 + }, + { + "epoch": 0.9032, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.3089, + "step": 4516 + }, + { + "epoch": 0.9036, + "learning_rate": 1.7579066647302147e-05, + "loss": 0.3084, + "step": 4518 + }, + { + "epoch": 0.904, + "learning_rate": 1.756995055651757e-05, + "loss": 0.8056, + "step": 4520 + }, + { + "epoch": 0.9044, + "learning_rate": 1.7560819707727037e-05, + "loss": 0.0467, + "step": 4522 + }, + { + "epoch": 0.9048, + "learning_rate": 1.7551674118731585e-05, + "loss": 0.0018, + "step": 4524 + }, + { + "epoch": 0.9052, + "learning_rate": 1.7542513807361044e-05, + "loss": 0.0686, + "step": 4526 + }, + { + "epoch": 0.9056, + "learning_rate": 1.7533338791473875e-05, + "loss": 0.0594, + "step": 4528 + }, + { + "epoch": 0.906, + "learning_rate": 1.7524149088957238e-05, + "loss": 0.0726, + "step": 4530 + }, + { + "epoch": 0.9064, + "learning_rate": 1.751494471772697e-05, + "loss": 0.0676, + "step": 4532 + }, + { + "epoch": 0.9068, + "learning_rate": 1.750572569572742e-05, + "loss": 0.0408, + "step": 4534 + }, + { + "epoch": 0.9072, + "learning_rate": 1.7496492040931548e-05, + "loss": 0.2313, + "step": 4536 + }, + { + "epoch": 0.9076, + "learning_rate": 1.7487243771340862e-05, + "loss": 0.0546, + "step": 4538 + }, + { + "epoch": 0.908, + "learning_rate": 1.747798090498533e-05, + "loss": 0.0105, + "step": 4540 + }, + { + "epoch": 0.9084, + "learning_rate": 1.7468703459923365e-05, + "loss": 0.0054, + "step": 4542 + }, + { + "epoch": 0.9088, + "learning_rate": 1.745941145424182e-05, + "loss": 0.1618, + "step": 4544 + }, + { + "epoch": 0.9092, + "learning_rate": 1.7450104906055973e-05, + "loss": 0.0543, + "step": 4546 + }, + { + "epoch": 0.9096, + "learning_rate": 1.744078383350938e-05, + "loss": 0.0007, + "step": 4548 + }, + { + "epoch": 0.91, + "learning_rate": 1.7431448254773943e-05, + "loss": 0.2302, + "step": 4550 + }, + { + "epoch": 0.9104, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.0027, + "step": 4552 + }, + { + "epoch": 0.9108, + "learning_rate": 1.7412733651565624e-05, + "loss": 0.4076, + "step": 4554 + }, + { + "epoch": 0.9112, + "learning_rate": 1.7403354663577782e-05, + "loss": 0.003, + "step": 4556 + }, + { + "epoch": 0.9116, + "learning_rate": 1.739396124237121e-05, + "loss": 0.043, + "step": 4558 + }, + { + "epoch": 0.912, + "learning_rate": 1.738455340625883e-05, + "loss": 0.0347, + "step": 4560 + }, + { + "epoch": 0.9124, + "learning_rate": 1.7375131173581744e-05, + "loss": 0.3349, + "step": 4562 + }, + { + "epoch": 0.9128, + "learning_rate": 1.7365694562709038e-05, + "loss": 0.0067, + "step": 4564 + }, + { + "epoch": 0.9132, + "learning_rate": 1.7356243592037865e-05, + "loss": 0.1744, + "step": 4566 + }, + { + "epoch": 0.9136, + "learning_rate": 1.7346778279993433e-05, + "loss": 0.6471, + "step": 4568 + }, + { + "epoch": 0.914, + "learning_rate": 1.733729864502877e-05, + "loss": 0.0097, + "step": 4570 + }, + { + "epoch": 0.9144, + "learning_rate": 1.7327804705624962e-05, + "loss": 0.4842, + "step": 4572 + }, + { + "epoch": 0.9148, + "learning_rate": 1.731829648029091e-05, + "loss": 0.0524, + "step": 4574 + }, + { + "epoch": 0.9152, + "learning_rate": 1.730877398756341e-05, + "loss": 0.2921, + "step": 4576 + }, + { + "epoch": 0.9156, + "learning_rate": 1.7299237246007025e-05, + "loss": 0.005, + "step": 4578 + }, + { + "epoch": 0.916, + "learning_rate": 1.7289686274214113e-05, + "loss": 0.0588, + "step": 4580 + }, + { + "epoch": 0.9164, + "learning_rate": 1.7280121090804824e-05, + "loss": 0.197, + "step": 4582 + }, + { + "epoch": 0.9168, + "learning_rate": 1.727054171442693e-05, + "loss": 0.0319, + "step": 4584 + }, + { + "epoch": 0.9172, + "learning_rate": 1.7260948163755918e-05, + "loss": 0.0069, + "step": 4586 + }, + { + "epoch": 0.9176, + "learning_rate": 1.7251340457494934e-05, + "loss": 0.1261, + "step": 4588 + }, + { + "epoch": 0.918, + "learning_rate": 1.7241718614374688e-05, + "loss": 0.0254, + "step": 4590 + }, + { + "epoch": 0.9184, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.2821, + "step": 4592 + }, + { + "epoch": 0.9188, + "learning_rate": 1.722243259261697e-05, + "loss": 0.0026, + "step": 4594 + }, + { + "epoch": 0.9192, + "learning_rate": 1.7212768451578595e-05, + "loss": 0.2891, + "step": 4596 + }, + { + "epoch": 0.9196, + "learning_rate": 1.7203090248879084e-05, + "loss": 0.9164, + "step": 4598 + }, + { + "epoch": 0.92, + "learning_rate": 1.7193398003386517e-05, + "loss": 0.345, + "step": 4600 + }, + { + "epoch": 0.9204, + "learning_rate": 1.7183691733996463e-05, + "loss": 0.1334, + "step": 4602 + }, + { + "epoch": 0.9208, + "learning_rate": 1.7173971459631803e-05, + "loss": 0.2545, + "step": 4604 + }, + { + "epoch": 0.9212, + "learning_rate": 1.7164237199242663e-05, + "loss": 0.3035, + "step": 4606 + }, + { + "epoch": 0.9216, + "learning_rate": 1.7154488971806525e-05, + "loss": 0.2088, + "step": 4608 + }, + { + "epoch": 0.922, + "learning_rate": 1.7144726796328027e-05, + "loss": 0.7339, + "step": 4610 + }, + { + "epoch": 0.9224, + "learning_rate": 1.713495069183907e-05, + "loss": 0.2851, + "step": 4612 + }, + { + "epoch": 0.9228, + "learning_rate": 1.7125160677398632e-05, + "loss": 0.6097, + "step": 4614 + }, + { + "epoch": 0.9232, + "learning_rate": 1.7115356772092847e-05, + "loss": 0.128, + "step": 4616 + }, + { + "epoch": 0.9236, + "learning_rate": 1.710553899503497e-05, + "loss": 0.0077, + "step": 4618 + }, + { + "epoch": 0.924, + "learning_rate": 1.709570736536522e-05, + "loss": 0.0317, + "step": 4620 + }, + { + "epoch": 0.9244, + "learning_rate": 1.708586190225086e-05, + "loss": 0.4034, + "step": 4622 + }, + { + "epoch": 0.9248, + "learning_rate": 1.7076002624886152e-05, + "loss": 0.1261, + "step": 4624 + }, + { + "epoch": 0.9252, + "learning_rate": 1.7066129552492258e-05, + "loss": 0.1837, + "step": 4626 + }, + { + "epoch": 0.9256, + "learning_rate": 1.705624270431722e-05, + "loss": 0.1979, + "step": 4628 + }, + { + "epoch": 0.926, + "learning_rate": 1.7046342099635945e-05, + "loss": 0.238, + "step": 4630 + }, + { + "epoch": 0.9264, + "learning_rate": 1.70364277577502e-05, + "loss": 0.297, + "step": 4632 + }, + { + "epoch": 0.9268, + "learning_rate": 1.702649969798851e-05, + "loss": 0.0006, + "step": 4634 + }, + { + "epoch": 0.9272, + "learning_rate": 1.7016557939706078e-05, + "loss": 0.0238, + "step": 4636 + }, + { + "epoch": 0.9276, + "learning_rate": 1.700660250228492e-05, + "loss": 1.4445, + "step": 4638 + }, + { + "epoch": 0.928, + "learning_rate": 1.6996633405133673e-05, + "loss": 0.1208, + "step": 4640 + }, + { + "epoch": 0.9284, + "learning_rate": 1.6986650667687556e-05, + "loss": 0.2391, + "step": 4642 + }, + { + "epoch": 0.9288, + "learning_rate": 1.6976654309408468e-05, + "loss": 1.3993, + "step": 4644 + }, + { + "epoch": 0.9292, + "learning_rate": 1.69666443497848e-05, + "loss": 0.001, + "step": 4646 + }, + { + "epoch": 0.9296, + "learning_rate": 1.6956620808331515e-05, + "loss": 0.0452, + "step": 4648 + }, + { + "epoch": 0.93, + "learning_rate": 1.694658370458998e-05, + "loss": 0.3563, + "step": 4650 + }, + { + "epoch": 0.9304, + "learning_rate": 1.6936533058128042e-05, + "loss": 0.0714, + "step": 4652 + }, + { + "epoch": 0.9308, + "learning_rate": 1.692646888854001e-05, + "loss": 0.1849, + "step": 4654 + }, + { + "epoch": 0.9312, + "learning_rate": 1.691639121544641e-05, + "loss": 0.0689, + "step": 4656 + }, + { + "epoch": 0.9316, + "learning_rate": 1.690630005849424e-05, + "loss": 0.2121, + "step": 4658 + }, + { + "epoch": 0.932, + "learning_rate": 1.6896195437356696e-05, + "loss": 0.0898, + "step": 4660 + }, + { + "epoch": 0.9324, + "learning_rate": 1.6886077371733295e-05, + "loss": 0.0357, + "step": 4662 + }, + { + "epoch": 0.9328, + "learning_rate": 1.6875945881349686e-05, + "loss": 0.1849, + "step": 4664 + }, + { + "epoch": 0.9332, + "learning_rate": 1.6865800985957725e-05, + "loss": 0.1054, + "step": 4666 + }, + { + "epoch": 0.9336, + "learning_rate": 1.6855642705335435e-05, + "loss": 0.1437, + "step": 4668 + }, + { + "epoch": 0.934, + "learning_rate": 1.68454710592869e-05, + "loss": 0.3691, + "step": 4670 + }, + { + "epoch": 0.9344, + "learning_rate": 1.6835286067642228e-05, + "loss": 0.1666, + "step": 4672 + }, + { + "epoch": 0.9348, + "learning_rate": 1.6825087750257617e-05, + "loss": 0.137, + "step": 4674 + }, + { + "epoch": 0.9352, + "learning_rate": 1.681487612701521e-05, + "loss": 0.0398, + "step": 4676 + }, + { + "epoch": 0.9356, + "learning_rate": 1.6804651217823055e-05, + "loss": 0.0671, + "step": 4678 + }, + { + "epoch": 0.936, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.351, + "step": 4680 + }, + { + "epoch": 0.9364, + "learning_rate": 1.6784161621351374e-05, + "loss": 0.0223, + "step": 4682 + }, + { + "epoch": 0.9368, + "learning_rate": 1.677389697401739e-05, + "loss": 0.8971, + "step": 4684 + }, + { + "epoch": 0.9372, + "learning_rate": 1.67636191206246e-05, + "loss": 0.0928, + "step": 4686 + }, + { + "epoch": 0.9376, + "learning_rate": 1.675332808121025e-05, + "loss": 0.7161, + "step": 4688 + }, + { + "epoch": 0.938, + "learning_rate": 1.6743023875837253e-05, + "loss": 0.0128, + "step": 4690 + }, + { + "epoch": 0.9384, + "learning_rate": 1.6732706524594145e-05, + "loss": 0.0666, + "step": 4692 + }, + { + "epoch": 0.9388, + "learning_rate": 1.672237604759517e-05, + "loss": 0.0778, + "step": 4694 + }, + { + "epoch": 0.9392, + "learning_rate": 1.671203246498009e-05, + "loss": 0.0402, + "step": 4696 + }, + { + "epoch": 0.9396, + "learning_rate": 1.670167579691429e-05, + "loss": 0.0613, + "step": 4698 + }, + { + "epoch": 0.94, + "learning_rate": 1.6691306063588593e-05, + "loss": 0.6061, + "step": 4700 + }, + { + "epoch": 0.9404, + "learning_rate": 1.668092328521931e-05, + "loss": 0.0004, + "step": 4702 + }, + { + "epoch": 0.9408, + "learning_rate": 1.6670527482048242e-05, + "loss": 0.1005, + "step": 4704 + }, + { + "epoch": 0.9412, + "learning_rate": 1.6660118674342525e-05, + "loss": 0.0015, + "step": 4706 + }, + { + "epoch": 0.9416, + "learning_rate": 1.6649696882394635e-05, + "loss": 0.0326, + "step": 4708 + }, + { + "epoch": 0.942, + "learning_rate": 1.6639262126522414e-05, + "loss": 0.1844, + "step": 4710 + }, + { + "epoch": 0.9424, + "learning_rate": 1.6628814427068968e-05, + "loss": 0.0143, + "step": 4712 + }, + { + "epoch": 0.9428, + "learning_rate": 1.661835380440258e-05, + "loss": 0.0228, + "step": 4714 + }, + { + "epoch": 0.9432, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.0027, + "step": 4716 + }, + { + "epoch": 0.9436, + "learning_rate": 1.6597393871030264e-05, + "loss": 0.019, + "step": 4718 + }, + { + "epoch": 0.944, + "learning_rate": 1.6586894601186824e-05, + "loss": 0.3318, + "step": 4720 + }, + { + "epoch": 0.9444, + "learning_rate": 1.6576382489855278e-05, + "loss": 0.5455, + "step": 4722 + }, + { + "epoch": 0.9448, + "learning_rate": 1.656585755752957e-05, + "loss": 0.2454, + "step": 4724 + }, + { + "epoch": 0.9452, + "learning_rate": 1.655531982472859e-05, + "loss": 0.0332, + "step": 4726 + }, + { + "epoch": 0.9456, + "learning_rate": 1.6544769311996153e-05, + "loss": 0.0475, + "step": 4728 + }, + { + "epoch": 0.946, + "learning_rate": 1.653420603990106e-05, + "loss": 0.454, + "step": 4730 + }, + { + "epoch": 0.9464, + "learning_rate": 1.6523630029036924e-05, + "loss": 0.0737, + "step": 4732 + }, + { + "epoch": 0.9468, + "learning_rate": 1.651304130002226e-05, + "loss": 0.0099, + "step": 4734 + }, + { + "epoch": 0.9472, + "learning_rate": 1.6502439873500294e-05, + "loss": 0.0419, + "step": 4736 + }, + { + "epoch": 0.9476, + "learning_rate": 1.6491825770139058e-05, + "loss": 0.1729, + "step": 4738 + }, + { + "epoch": 0.948, + "learning_rate": 1.6481199010631305e-05, + "loss": 0.008, + "step": 4740 + }, + { + "epoch": 0.9484, + "learning_rate": 1.6470559615694455e-05, + "loss": 0.0283, + "step": 4742 + }, + { + "epoch": 0.9488, + "learning_rate": 1.645990760607052e-05, + "loss": 0.2909, + "step": 4744 + }, + { + "epoch": 0.9492, + "learning_rate": 1.644924300252614e-05, + "loss": 0.0855, + "step": 4746 + }, + { + "epoch": 0.9496, + "learning_rate": 1.643856582585255e-05, + "loss": 0.2414, + "step": 4748 + }, + { + "epoch": 0.95, + "learning_rate": 1.6427876096865407e-05, + "loss": 0.2724, + "step": 4750 + }, + { + "epoch": 0.9504, + "learning_rate": 1.641717383640488e-05, + "loss": 0.016, + "step": 4752 + }, + { + "epoch": 0.9508, + "learning_rate": 1.6406459065335616e-05, + "loss": 0.0197, + "step": 4754 + }, + { + "epoch": 0.9512, + "learning_rate": 1.6395731804546596e-05, + "loss": 0.0151, + "step": 4756 + }, + { + "epoch": 0.9516, + "learning_rate": 1.6384992074951128e-05, + "loss": 0.1989, + "step": 4758 + }, + { + "epoch": 0.952, + "learning_rate": 1.63742398974869e-05, + "loss": 0.1974, + "step": 4760 + }, + { + "epoch": 0.9524, + "learning_rate": 1.6363475293115838e-05, + "loss": 0.0055, + "step": 4762 + }, + { + "epoch": 0.9528, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.4027, + "step": 4764 + }, + { + "epoch": 0.9532, + "learning_rate": 1.63419088876219e-05, + "loss": 0.0015, + "step": 4766 + }, + { + "epoch": 0.9536, + "learning_rate": 1.633110712854385e-05, + "loss": 0.1041, + "step": 4768 + }, + { + "epoch": 0.954, + "learning_rate": 1.6320293026648515e-05, + "loss": 0.0121, + "step": 4770 + }, + { + "epoch": 0.9544, + "learning_rate": 1.6309466603018504e-05, + "loss": 0.0096, + "step": 4772 + }, + { + "epoch": 0.9548, + "learning_rate": 1.6298627878760495e-05, + "loss": 0.0522, + "step": 4774 + }, + { + "epoch": 0.9552, + "learning_rate": 1.6287776875005148e-05, + "loss": 0.0552, + "step": 4776 + }, + { + "epoch": 0.9556, + "learning_rate": 1.6276913612907015e-05, + "loss": 0.0133, + "step": 4778 + }, + { + "epoch": 0.956, + "learning_rate": 1.6266038113644612e-05, + "loss": 0.021, + "step": 4780 + }, + { + "epoch": 0.9564, + "learning_rate": 1.6255150398420266e-05, + "loss": 0.0726, + "step": 4782 + }, + { + "epoch": 0.9568, + "learning_rate": 1.624425048846017e-05, + "loss": 0.7132, + "step": 4784 + }, + { + "epoch": 0.9572, + "learning_rate": 1.623333840501421e-05, + "loss": 0.0179, + "step": 4786 + }, + { + "epoch": 0.9576, + "learning_rate": 1.6222414169356063e-05, + "loss": 0.0478, + "step": 4788 + }, + { + "epoch": 0.958, + "learning_rate": 1.6211477802783102e-05, + "loss": 0.0174, + "step": 4790 + }, + { + "epoch": 0.9584, + "learning_rate": 1.6200529326616343e-05, + "loss": 0.0662, + "step": 4792 + }, + { + "epoch": 0.9588, + "learning_rate": 1.618956876220035e-05, + "loss": 0.2261, + "step": 4794 + }, + { + "epoch": 0.9592, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.722, + "step": 4796 + }, + { + "epoch": 0.9596, + "learning_rate": 1.616761145411704e-05, + "loss": 0.1391, + "step": 4798 + }, + { + "epoch": 0.96, + "learning_rate": 1.6156614753256587e-05, + "loss": 0.2128, + "step": 4800 + }, + { + "epoch": 0.9604, + "learning_rate": 1.6145606049760648e-05, + "loss": 0.3757, + "step": 4802 + }, + { + "epoch": 0.9608, + "learning_rate": 1.613458536509123e-05, + "loss": 0.1694, + "step": 4804 + }, + { + "epoch": 0.9612, + "learning_rate": 1.612355272073378e-05, + "loss": 0.0498, + "step": 4806 + }, + { + "epoch": 0.9616, + "learning_rate": 1.6112508138196922e-05, + "loss": 0.0032, + "step": 4808 + }, + { + "epoch": 0.962, + "learning_rate": 1.610145163901268e-05, + "loss": 0.0261, + "step": 4810 + }, + { + "epoch": 0.9624, + "learning_rate": 1.6090383244736277e-05, + "loss": 0.003, + "step": 4812 + }, + { + "epoch": 0.9628, + "learning_rate": 1.6079302976946062e-05, + "loss": 0.0001, + "step": 4814 + }, + { + "epoch": 0.9632, + "learning_rate": 1.606821085724363e-05, + "loss": 0.013, + "step": 4816 + }, + { + "epoch": 0.9636, + "learning_rate": 1.6057106907253607e-05, + "loss": 0.0456, + "step": 4818 + }, + { + "epoch": 0.964, + "learning_rate": 1.6045991148623756e-05, + "loss": 0.2381, + "step": 4820 + }, + { + "epoch": 0.9644, + "learning_rate": 1.6034863603024775e-05, + "loss": 0.017, + "step": 4822 + }, + { + "epoch": 0.9648, + "learning_rate": 1.602372429215038e-05, + "loss": 0.0217, + "step": 4824 + }, + { + "epoch": 0.9652, + "learning_rate": 1.6012573237717265e-05, + "loss": 0.0068, + "step": 4826 + }, + { + "epoch": 0.9656, + "learning_rate": 1.600141046146497e-05, + "loss": 0.048, + "step": 4828 + }, + { + "epoch": 0.966, + "learning_rate": 1.5990235985155856e-05, + "loss": 0.0203, + "step": 4830 + }, + { + "epoch": 0.9664, + "learning_rate": 1.597904983057519e-05, + "loss": 0.0177, + "step": 4832 + }, + { + "epoch": 0.9668, + "learning_rate": 1.5967852019530942e-05, + "loss": 0.2203, + "step": 4834 + }, + { + "epoch": 0.9672, + "learning_rate": 1.5956642573853794e-05, + "loss": 0.6763, + "step": 4836 + }, + { + "epoch": 0.9676, + "learning_rate": 1.5945421515397135e-05, + "loss": 0.2394, + "step": 4838 + }, + { + "epoch": 0.968, + "learning_rate": 1.5934188866037014e-05, + "loss": 0.0207, + "step": 4840 + }, + { + "epoch": 0.9684, + "learning_rate": 1.5922944647672068e-05, + "loss": 0.0178, + "step": 4842 + }, + { + "epoch": 0.9688, + "learning_rate": 1.591168888222342e-05, + "loss": 0.6112, + "step": 4844 + }, + { + "epoch": 0.9692, + "learning_rate": 1.5900421591634816e-05, + "loss": 0.1571, + "step": 4846 + }, + { + "epoch": 0.9696, + "learning_rate": 1.5889142797872407e-05, + "loss": 0.0028, + "step": 4848 + }, + { + "epoch": 0.97, + "learning_rate": 1.5877852522924736e-05, + "loss": 0.0012, + "step": 4850 + }, + { + "epoch": 0.9704, + "learning_rate": 1.5866550788802818e-05, + "loss": 0.0872, + "step": 4852 + }, + { + "epoch": 0.9708, + "learning_rate": 1.5855237617539932e-05, + "loss": 0.0414, + "step": 4854 + }, + { + "epoch": 0.9712, + "learning_rate": 1.584391303119173e-05, + "loss": 0.8382, + "step": 4856 + }, + { + "epoch": 0.9716, + "learning_rate": 1.5832577051836023e-05, + "loss": 0.0012, + "step": 4858 + }, + { + "epoch": 0.972, + "learning_rate": 1.582122970157289e-05, + "loss": 0.0012, + "step": 4860 + }, + { + "epoch": 0.9724, + "learning_rate": 1.5809871002524592e-05, + "loss": 0.0721, + "step": 4862 + }, + { + "epoch": 0.9728, + "learning_rate": 1.5798500976835503e-05, + "loss": 0.0053, + "step": 4864 + }, + { + "epoch": 0.9732, + "learning_rate": 1.5787119646672032e-05, + "loss": 0.1349, + "step": 4866 + }, + { + "epoch": 0.9736, + "learning_rate": 1.577572703422267e-05, + "loss": 0.2444, + "step": 4868 + }, + { + "epoch": 0.974, + "learning_rate": 1.5764323161697946e-05, + "loss": 0.056, + "step": 4870 + }, + { + "epoch": 0.9744, + "learning_rate": 1.575290805133024e-05, + "loss": 0.0104, + "step": 4872 + }, + { + "epoch": 0.9748, + "learning_rate": 1.5741481725373896e-05, + "loss": 0.2938, + "step": 4874 + }, + { + "epoch": 0.9752, + "learning_rate": 1.5730044206105156e-05, + "loss": 0.0006, + "step": 4876 + }, + { + "epoch": 0.9756, + "learning_rate": 1.571859551582204e-05, + "loss": 0.4928, + "step": 4878 + }, + { + "epoch": 0.976, + "learning_rate": 1.570713567684432e-05, + "loss": 0.5749, + "step": 4880 + }, + { + "epoch": 0.9764, + "learning_rate": 1.5695664711513575e-05, + "loss": 0.0165, + "step": 4882 + }, + { + "epoch": 0.9768, + "learning_rate": 1.5684182642193047e-05, + "loss": 0.0003, + "step": 4884 + }, + { + "epoch": 0.9772, + "learning_rate": 1.567268949126757e-05, + "loss": 0.4531, + "step": 4886 + }, + { + "epoch": 0.9776, + "learning_rate": 1.566118528114367e-05, + "loss": 0.0812, + "step": 4888 + }, + { + "epoch": 0.978, + "learning_rate": 1.5649670034249372e-05, + "loss": 0.0326, + "step": 4890 + }, + { + "epoch": 0.9784, + "learning_rate": 1.563814377303429e-05, + "loss": 0.0896, + "step": 4892 + }, + { + "epoch": 0.9788, + "learning_rate": 1.5626606519969373e-05, + "loss": 0.6808, + "step": 4894 + }, + { + "epoch": 0.9792, + "learning_rate": 1.561505829754715e-05, + "loss": 0.2263, + "step": 4896 + }, + { + "epoch": 0.9796, + "learning_rate": 1.5603499128281437e-05, + "loss": 0.2026, + "step": 4898 + }, + { + "epoch": 0.98, + "learning_rate": 1.5591929034707475e-05, + "loss": 0.1154, + "step": 4900 + }, + { + "epoch": 0.9804, + "learning_rate": 1.558034803938171e-05, + "loss": 0.0142, + "step": 4902 + }, + { + "epoch": 0.9808, + "learning_rate": 1.5568756164881874e-05, + "loss": 0.4716, + "step": 4904 + }, + { + "epoch": 0.9812, + "learning_rate": 1.5557153433806974e-05, + "loss": 0.0643, + "step": 4906 + }, + { + "epoch": 0.9816, + "learning_rate": 1.5545539868777085e-05, + "loss": 0.0081, + "step": 4908 + }, + { + "epoch": 0.982, + "learning_rate": 1.5533915492433437e-05, + "loss": 0.1191, + "step": 4910 + }, + { + "epoch": 0.9824, + "learning_rate": 1.5522280327438384e-05, + "loss": 0.1951, + "step": 4912 + }, + { + "epoch": 0.9828, + "learning_rate": 1.5510634396475275e-05, + "loss": 0.3572, + "step": 4914 + }, + { + "epoch": 0.9832, + "learning_rate": 1.5498977722248398e-05, + "loss": 0.0011, + "step": 4916 + }, + { + "epoch": 0.9836, + "learning_rate": 1.5487310327483084e-05, + "loss": 0.335, + "step": 4918 + }, + { + "epoch": 0.984, + "learning_rate": 1.547563223492552e-05, + "loss": 0.604, + "step": 4920 + }, + { + "epoch": 0.9844, + "learning_rate": 1.5463943467342708e-05, + "loss": 0.1114, + "step": 4922 + }, + { + "epoch": 0.9848, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.649, + "step": 4924 + }, + { + "epoch": 0.9852, + "learning_rate": 1.5440533998273552e-05, + "loss": 0.238, + "step": 4926 + }, + { + "epoch": 0.9856, + "learning_rate": 1.5428813342425194e-05, + "loss": 0.5611, + "step": 4928 + }, + { + "epoch": 0.986, + "learning_rate": 1.5417082102827407e-05, + "loss": 0.1094, + "step": 4930 + }, + { + "epoch": 0.9864, + "learning_rate": 1.5405340302350876e-05, + "loss": 0.6704, + "step": 4932 + }, + { + "epoch": 0.9868, + "learning_rate": 1.5393587963886827e-05, + "loss": 0.0767, + "step": 4934 + }, + { + "epoch": 0.9872, + "learning_rate": 1.538182511034708e-05, + "loss": 0.0183, + "step": 4936 + }, + { + "epoch": 0.9876, + "learning_rate": 1.5370051764663875e-05, + "loss": 0.1054, + "step": 4938 + }, + { + "epoch": 0.988, + "learning_rate": 1.535826794978996e-05, + "loss": 0.064, + "step": 4940 + }, + { + "epoch": 0.9884, + "learning_rate": 1.534647368869852e-05, + "loss": 0.6582, + "step": 4942 + }, + { + "epoch": 0.9888, + "learning_rate": 1.5334669004383036e-05, + "loss": 0.5025, + "step": 4944 + }, + { + "epoch": 0.9892, + "learning_rate": 1.5322853919857337e-05, + "loss": 0.0021, + "step": 4946 + }, + { + "epoch": 0.9896, + "learning_rate": 1.5311028458155564e-05, + "loss": 0.0096, + "step": 4948 + }, + { + "epoch": 0.99, + "learning_rate": 1.5299192642332063e-05, + "loss": 0.2269, + "step": 4950 + }, + { + "epoch": 0.9904, + "learning_rate": 1.528734649546133e-05, + "loss": 0.0765, + "step": 4952 + }, + { + "epoch": 0.9908, + "learning_rate": 1.5275490040638038e-05, + "loss": 0.0002, + "step": 4954 + }, + { + "epoch": 0.9912, + "learning_rate": 1.5263623300976997e-05, + "loss": 0.0141, + "step": 4956 + }, + { + "epoch": 0.9916, + "learning_rate": 1.5251746299612973e-05, + "loss": 0.1432, + "step": 4958 + }, + { + "epoch": 0.992, + "learning_rate": 1.5239859059700792e-05, + "loss": 0.0583, + "step": 4960 + }, + { + "epoch": 0.9924, + "learning_rate": 1.522796160441527e-05, + "loss": 0.0007, + "step": 4962 + }, + { + "epoch": 0.9928, + "learning_rate": 1.5216053956951096e-05, + "loss": 0.0146, + "step": 4964 + }, + { + "epoch": 0.9932, + "learning_rate": 1.5204136140522799e-05, + "loss": 0.0086, + "step": 4966 + }, + { + "epoch": 0.9936, + "learning_rate": 1.5192208178364819e-05, + "loss": 0.0137, + "step": 4968 + }, + { + "epoch": 0.994, + "learning_rate": 1.5180270093731291e-05, + "loss": 0.0873, + "step": 4970 + }, + { + "epoch": 0.9944, + "learning_rate": 1.5168321909896176e-05, + "loss": 0.0598, + "step": 4972 + }, + { + "epoch": 0.9948, + "learning_rate": 1.5156363650153017e-05, + "loss": 0.1364, + "step": 4974 + }, + { + "epoch": 0.9952, + "learning_rate": 1.5144395337815057e-05, + "loss": 0.1142, + "step": 4976 + }, + { + "epoch": 0.9956, + "learning_rate": 1.5132416996215178e-05, + "loss": 0.0133, + "step": 4978 + }, + { + "epoch": 0.996, + "learning_rate": 1.5120428648705722e-05, + "loss": 0.1836, + "step": 4980 + }, + { + "epoch": 0.9964, + "learning_rate": 1.5108430318658607e-05, + "loss": 0.1286, + "step": 4982 + }, + { + "epoch": 0.9968, + "learning_rate": 1.5096422029465171e-05, + "loss": 0.0038, + "step": 4984 + }, + { + "epoch": 0.9972, + "learning_rate": 1.5084403804536236e-05, + "loss": 0.0464, + "step": 4986 + }, + { + "epoch": 0.9976, + "learning_rate": 1.5072375667301904e-05, + "loss": 0.8134, + "step": 4988 + }, + { + "epoch": 0.998, + "learning_rate": 1.5060337641211636e-05, + "loss": 0.0012, + "step": 4990 + }, + { + "epoch": 0.9984, + "learning_rate": 1.5048289749734231e-05, + "loss": 0.0005, + "step": 4992 + }, + { + "epoch": 0.9988, + "learning_rate": 1.5036232016357622e-05, + "loss": 0.0282, + "step": 4994 + }, + { + "epoch": 0.9992, + "learning_rate": 1.502416446458898e-05, + "loss": 0.0027, + "step": 4996 + }, + { + "epoch": 0.9996, + "learning_rate": 1.5012087117954641e-05, + "loss": 0.0278, + "step": 4998 + }, + { + "epoch": 1.0, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.6382, + "step": 5000 + }, + { + "epoch": 1.0, + "step": 5000, + "total_flos": 2.8573694755864576e+16, + "train_loss": 0.1845600268243885, + "train_runtime": 28935.9445, + "train_samples_per_second": 2.765, + "train_steps_per_second": 0.173 + } + ], + "logging_steps": 2, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 2.8573694755864576e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..21315a94e95a0a2a7a9fa4418f1b4d83ab5c19dd --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02823dd5a374e0f1253b91d8f0814e3ac90c3dfa8e2961aba0f0b2aa86488bd8 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6f57f4664d4f441460a606cd61d2257ec7915ed8 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66a081d3ab476d3f9f6ac604261046f543447afdc54ea4317c73daa47b254734 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..d96f6598ebeee22d6c592877a663b7c5ec731a49 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:184b0b599c97272db0ee770b928b4def6b25f0709138e9491ed42fd13d3f3f11 +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..614d838d982fa25b065dcbc8a7094a2f9d45d756 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_coincide_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:261a1d5bb802d89820d726dd17c16556110a2cd184eee39de1350f8e13c64b9d +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..db73ae0742006e86d8f066c868149ea192314bb7 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,15032 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004, + "learning_rate": 2.3485380412522497e-06, + "loss": 0.4978, + "step": 2 + }, + { + "epoch": 0.0008, + "learning_rate": 2.357535430610912e-06, + "loss": 0.2848, + "step": 4 + }, + { + "epoch": 0.0012, + "learning_rate": 2.366547719345306e-06, + "loss": 0.5063, + "step": 6 + }, + { + "epoch": 0.0016, + "learning_rate": 2.3755748898855234e-06, + "loss": 0.2269, + "step": 8 + }, + { + "epoch": 0.002, + "learning_rate": 2.3846169246326332e-06, + "loss": 0.789, + "step": 10 + }, + { + "epoch": 0.0024, + "learning_rate": 2.3936738059587174e-06, + "loss": 0.357, + "step": 12 + }, + { + "epoch": 0.0028, + "learning_rate": 2.4027455162069537e-06, + "loss": 0.7059, + "step": 14 + }, + { + "epoch": 0.0032, + "learning_rate": 2.411832037691545e-06, + "loss": 0.1656, + "step": 16 + }, + { + "epoch": 0.0036, + "learning_rate": 2.420933352697865e-06, + "loss": 0.6619, + "step": 18 + }, + { + "epoch": 0.004, + "learning_rate": 2.430049443482434e-06, + "loss": 0.2676, + "step": 20 + }, + { + "epoch": 0.0044, + "learning_rate": 2.439180292272967e-06, + "loss": 1.0847, + "step": 22 + }, + { + "epoch": 0.0048, + "learning_rate": 2.448325881268406e-06, + "loss": 0.3018, + "step": 24 + }, + { + "epoch": 0.0052, + "learning_rate": 2.457486192638958e-06, + "loss": 0.1407, + "step": 26 + }, + { + "epoch": 0.0056, + "learning_rate": 2.4666612085261277e-06, + "loss": 0.4019, + "step": 28 + }, + { + "epoch": 0.006, + "learning_rate": 2.475850911042752e-06, + "loss": 0.3209, + "step": 30 + }, + { + "epoch": 0.0064, + "learning_rate": 2.4850552822730346e-06, + "loss": 0.2196, + "step": 32 + }, + { + "epoch": 0.0068, + "learning_rate": 2.4942743042725836e-06, + "loss": 0.4247, + "step": 34 + }, + { + "epoch": 0.0072, + "learning_rate": 2.503507959068455e-06, + "loss": 0.4194, + "step": 36 + }, + { + "epoch": 0.0076, + "learning_rate": 2.5127562286591313e-06, + "loss": 0.659, + "step": 38 + }, + { + "epoch": 0.008, + "learning_rate": 2.522019095014686e-06, + "loss": 0.4145, + "step": 40 + }, + { + "epoch": 0.0084, + "learning_rate": 2.531296540076638e-06, + "loss": 0.571, + "step": 42 + }, + { + "epoch": 0.0088, + "learning_rate": 2.5405885457581814e-06, + "loss": 0.3624, + "step": 44 + }, + { + "epoch": 0.0092, + "learning_rate": 2.5498950939440413e-06, + "loss": 0.526, + "step": 46 + }, + { + "epoch": 0.0096, + "learning_rate": 2.5592161664906243e-06, + "loss": 0.6059, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 2.5685517452260587e-06, + "loss": 0.4984, + "step": 50 + }, + { + "epoch": 0.0104, + "learning_rate": 2.5779018119501086e-06, + "loss": 0.8004, + "step": 52 + }, + { + "epoch": 0.0108, + "learning_rate": 2.5872663484343887e-06, + "loss": 1.0366, + "step": 54 + }, + { + "epoch": 0.0112, + "learning_rate": 2.596645336422219e-06, + "loss": 0.4256, + "step": 56 + }, + { + "epoch": 0.0116, + "learning_rate": 2.606038757628795e-06, + "loss": 0.081, + "step": 58 + }, + { + "epoch": 0.012, + "learning_rate": 2.615446593741161e-06, + "loss": 0.1795, + "step": 60 + }, + { + "epoch": 0.0124, + "learning_rate": 2.6248688264182588e-06, + "loss": 0.2073, + "step": 62 + }, + { + "epoch": 0.0128, + "learning_rate": 2.6343054372909648e-06, + "loss": 0.6551, + "step": 64 + }, + { + "epoch": 0.0132, + "learning_rate": 2.6437564079621235e-06, + "loss": 0.2136, + "step": 66 + }, + { + "epoch": 0.0136, + "learning_rate": 2.6532217200065826e-06, + "loss": 0.4476, + "step": 68 + }, + { + "epoch": 0.014, + "learning_rate": 2.662701354971232e-06, + "loss": 0.8091, + "step": 70 + }, + { + "epoch": 0.0144, + "learning_rate": 2.6721952943750396e-06, + "loss": 0.3775, + "step": 72 + }, + { + "epoch": 0.0148, + "learning_rate": 2.6817035197090825e-06, + "loss": 0.5697, + "step": 74 + }, + { + "epoch": 0.0152, + "learning_rate": 2.691226012436604e-06, + "loss": 0.4306, + "step": 76 + }, + { + "epoch": 0.0156, + "learning_rate": 2.7007627539929783e-06, + "loss": 0.2091, + "step": 78 + }, + { + "epoch": 0.016, + "learning_rate": 2.7103137257858893e-06, + "loss": 0.5239, + "step": 80 + }, + { + "epoch": 0.0164, + "learning_rate": 2.7198789091951806e-06, + "loss": 0.3424, + "step": 82 + }, + { + "epoch": 0.0168, + "learning_rate": 2.7294582855730733e-06, + "loss": 0.4204, + "step": 84 + }, + { + "epoch": 0.0172, + "learning_rate": 2.7390518362440843e-06, + "loss": 0.5054, + "step": 86 + }, + { + "epoch": 0.0176, + "learning_rate": 2.7486595425050566e-06, + "loss": 0.1196, + "step": 88 + }, + { + "epoch": 0.018, + "learning_rate": 2.7582813856253264e-06, + "loss": 0.4632, + "step": 90 + }, + { + "epoch": 0.0184, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.3133, + "step": 92 + }, + { + "epoch": 0.0188, + "learning_rate": 2.777567407383033e-06, + "loss": 0.2941, + "step": 94 + }, + { + "epoch": 0.0192, + "learning_rate": 2.7872315484213954e-06, + "loss": 0.4177, + "step": 96 + }, + { + "epoch": 0.0196, + "learning_rate": 2.796909751120931e-06, + "loss": 0.4319, + "step": 98 + }, + { + "epoch": 0.02, + "learning_rate": 2.8066019966134873e-06, + "loss": 0.5043, + "step": 100 + }, + { + "epoch": 0.0204, + "learning_rate": 2.816308266003538e-06, + "loss": 0.4153, + "step": 102 + }, + { + "epoch": 0.0208, + "learning_rate": 2.826028540368212e-06, + "loss": 0.4126, + "step": 104 + }, + { + "epoch": 0.0212, + "learning_rate": 2.835762800757338e-06, + "loss": 0.7709, + "step": 106 + }, + { + "epoch": 0.0216, + "learning_rate": 2.845511028193477e-06, + "loss": 0.5181, + "step": 108 + }, + { + "epoch": 0.022, + "learning_rate": 2.855273203671962e-06, + "loss": 0.433, + "step": 110 + }, + { + "epoch": 0.0224, + "learning_rate": 2.865049308160931e-06, + "loss": 0.5872, + "step": 112 + }, + { + "epoch": 0.0228, + "learning_rate": 2.874839322601368e-06, + "loss": 0.5732, + "step": 114 + }, + { + "epoch": 0.0232, + "learning_rate": 2.8846432279071533e-06, + "loss": 0.2212, + "step": 116 + }, + { + "epoch": 0.0236, + "learning_rate": 2.8944610049650314e-06, + "loss": 0.6495, + "step": 118 + }, + { + "epoch": 0.024, + "learning_rate": 2.9042926346347835e-06, + "loss": 0.6217, + "step": 120 + }, + { + "epoch": 0.0244, + "learning_rate": 2.914138097749143e-06, + "loss": 0.3093, + "step": 122 + }, + { + "epoch": 0.0248, + "learning_rate": 2.9239973751138397e-06, + "loss": 0.3164, + "step": 124 + }, + { + "epoch": 0.0252, + "learning_rate": 2.933870447507756e-06, + "loss": 0.7215, + "step": 126 + }, + { + "epoch": 0.0256, + "learning_rate": 2.943757295682783e-06, + "loss": 0.2408, + "step": 128 + }, + { + "epoch": 0.026, + "learning_rate": 2.953657900364055e-06, + "loss": 0.3275, + "step": 130 + }, + { + "epoch": 0.0264, + "learning_rate": 2.9635722422497983e-06, + "loss": 0.6744, + "step": 132 + }, + { + "epoch": 0.0268, + "learning_rate": 2.973500302011496e-06, + "loss": 0.2212, + "step": 134 + }, + { + "epoch": 0.0272, + "learning_rate": 2.983442060293926e-06, + "loss": 0.6929, + "step": 136 + }, + { + "epoch": 0.0276, + "learning_rate": 2.9933974977150827e-06, + "loss": 0.1681, + "step": 138 + }, + { + "epoch": 0.028, + "learning_rate": 3.003366594866345e-06, + "loss": 0.4171, + "step": 140 + }, + { + "epoch": 0.0284, + "learning_rate": 3.0133493323124474e-06, + "loss": 0.3914, + "step": 142 + }, + { + "epoch": 0.0288, + "learning_rate": 3.0233456905915338e-06, + "loss": 0.1667, + "step": 144 + }, + { + "epoch": 0.0292, + "learning_rate": 3.0333556502151895e-06, + "loss": 0.3647, + "step": 146 + }, + { + "epoch": 0.0296, + "learning_rate": 3.0433791916684885e-06, + "loss": 0.843, + "step": 148 + }, + { + "epoch": 0.03, + "learning_rate": 3.0534162954100234e-06, + "loss": 0.7318, + "step": 150 + }, + { + "epoch": 0.0304, + "learning_rate": 3.0634669418719453e-06, + "loss": 0.6392, + "step": 152 + }, + { + "epoch": 0.0308, + "learning_rate": 3.0735311114600064e-06, + "loss": 0.3443, + "step": 154 + }, + { + "epoch": 0.0312, + "learning_rate": 3.0836087845535933e-06, + "loss": 0.3958, + "step": 156 + }, + { + "epoch": 0.0316, + "learning_rate": 3.0936999415057645e-06, + "loss": 0.2644, + "step": 158 + }, + { + "epoch": 0.032, + "learning_rate": 3.1038045626432945e-06, + "loss": 0.5595, + "step": 160 + }, + { + "epoch": 0.0324, + "learning_rate": 3.1139226282667212e-06, + "loss": 0.5395, + "step": 162 + }, + { + "epoch": 0.0328, + "learning_rate": 3.1240541186503173e-06, + "loss": 0.3528, + "step": 164 + }, + { + "epoch": 0.0332, + "learning_rate": 3.134199014042277e-06, + "loss": 0.289, + "step": 166 + }, + { + "epoch": 0.0336, + "learning_rate": 3.1443572946645683e-06, + "loss": 1.0455, + "step": 168 + }, + { + "epoch": 0.034, + "learning_rate": 3.154528940713103e-06, + "loss": 0.5556, + "step": 170 + }, + { + "epoch": 0.0344, + "learning_rate": 3.164713932357776e-06, + "loss": 0.6263, + "step": 172 + }, + { + "epoch": 0.0348, + "learning_rate": 3.1749122497423724e-06, + "loss": 0.4976, + "step": 174 + }, + { + "epoch": 0.0352, + "learning_rate": 3.1851238729848033e-06, + "loss": 0.4342, + "step": 176 + }, + { + "epoch": 0.0356, + "learning_rate": 3.195348782176948e-06, + "loss": 0.2161, + "step": 178 + }, + { + "epoch": 0.036, + "learning_rate": 3.205586957384834e-06, + "loss": 0.189, + "step": 180 + }, + { + "epoch": 0.0364, + "learning_rate": 3.215838378648617e-06, + "loss": 0.2455, + "step": 182 + }, + { + "epoch": 0.0368, + "learning_rate": 3.2261030259826253e-06, + "loss": 0.4143, + "step": 184 + }, + { + "epoch": 0.0372, + "learning_rate": 3.2363808793754036e-06, + "loss": 0.2427, + "step": 186 + }, + { + "epoch": 0.0376, + "learning_rate": 3.246671918789752e-06, + "loss": 0.5408, + "step": 188 + }, + { + "epoch": 0.038, + "learning_rate": 3.2569761241627617e-06, + "loss": 1.2436, + "step": 190 + }, + { + "epoch": 0.0384, + "learning_rate": 3.267293475405858e-06, + "loss": 0.362, + "step": 192 + }, + { + "epoch": 0.0388, + "learning_rate": 3.277623952404835e-06, + "loss": 0.3556, + "step": 194 + }, + { + "epoch": 0.0392, + "learning_rate": 3.2879675350199004e-06, + "loss": 1.0699, + "step": 196 + }, + { + "epoch": 0.0396, + "learning_rate": 3.298324203085723e-06, + "loss": 0.5274, + "step": 198 + }, + { + "epoch": 0.04, + "learning_rate": 3.3086939364114113e-06, + "loss": 0.3893, + "step": 200 + }, + { + "epoch": 0.0404, + "learning_rate": 3.3190767147806892e-06, + "loss": 0.2916, + "step": 202 + }, + { + "epoch": 0.0408, + "learning_rate": 3.329472517951747e-06, + "loss": 0.4463, + "step": 204 + }, + { + "epoch": 0.0412, + "learning_rate": 3.3398813256574745e-06, + "loss": 0.6399, + "step": 206 + }, + { + "epoch": 0.0416, + "learning_rate": 3.350303117605369e-06, + "loss": 0.2346, + "step": 208 + }, + { + "epoch": 0.042, + "learning_rate": 3.360737873477574e-06, + "loss": 0.5752, + "step": 210 + }, + { + "epoch": 0.0424, + "learning_rate": 3.3711855729310503e-06, + "loss": 0.4147, + "step": 212 + }, + { + "epoch": 0.0428, + "learning_rate": 3.3816461955974224e-06, + "loss": 0.4813, + "step": 214 + }, + { + "epoch": 0.0432, + "learning_rate": 3.3921197210832235e-06, + "loss": 0.5642, + "step": 216 + }, + { + "epoch": 0.0436, + "learning_rate": 3.4026061289697397e-06, + "loss": 0.4102, + "step": 218 + }, + { + "epoch": 0.044, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.3265, + "step": 220 + }, + { + "epoch": 0.0444, + "learning_rate": 3.4236175101447257e-06, + "loss": 0.422, + "step": 222 + }, + { + "epoch": 0.0448, + "learning_rate": 3.434142442470434e-06, + "loss": 0.4798, + "step": 224 + }, + { + "epoch": 0.0452, + "learning_rate": 3.444680175271424e-06, + "loss": 0.7324, + "step": 226 + }, + { + "epoch": 0.0456, + "learning_rate": 3.455230688003849e-06, + "loss": 0.2095, + "step": 228 + }, + { + "epoch": 0.046, + "learning_rate": 3.465793960098942e-06, + "loss": 0.3462, + "step": 230 + }, + { + "epoch": 0.0464, + "learning_rate": 3.476369970963065e-06, + "loss": 0.4735, + "step": 232 + }, + { + "epoch": 0.0468, + "learning_rate": 3.486958699977743e-06, + "loss": 0.7601, + "step": 234 + }, + { + "epoch": 0.0472, + "learning_rate": 3.497560126499706e-06, + "loss": 0.5506, + "step": 236 + }, + { + "epoch": 0.0476, + "learning_rate": 3.508174229860947e-06, + "loss": 0.2765, + "step": 238 + }, + { + "epoch": 0.048, + "learning_rate": 3.5188009893686836e-06, + "loss": 0.479, + "step": 240 + }, + { + "epoch": 0.0484, + "learning_rate": 3.5294403843055493e-06, + "loss": 0.468, + "step": 242 + }, + { + "epoch": 0.0488, + "learning_rate": 3.5400923939294827e-06, + "loss": 0.6509, + "step": 244 + }, + { + "epoch": 0.0492, + "learning_rate": 3.5507569974738477e-06, + "loss": 0.2147, + "step": 246 + }, + { + "epoch": 0.0496, + "learning_rate": 3.5614341741474667e-06, + "loss": 0.3087, + "step": 248 + }, + { + "epoch": 0.05, + "learning_rate": 3.5721239031345966e-06, + "loss": 0.2604, + "step": 250 + }, + { + "epoch": 0.0504, + "learning_rate": 3.5828261635951177e-06, + "loss": 0.5584, + "step": 252 + }, + { + "epoch": 0.0508, + "learning_rate": 3.593540934664387e-06, + "loss": 0.2994, + "step": 254 + }, + { + "epoch": 0.0512, + "learning_rate": 3.604268195453421e-06, + "loss": 0.5102, + "step": 256 + }, + { + "epoch": 0.0516, + "learning_rate": 3.6150079250488767e-06, + "loss": 0.44, + "step": 258 + }, + { + "epoch": 0.052, + "learning_rate": 3.6257601025130893e-06, + "loss": 0.2371, + "step": 260 + }, + { + "epoch": 0.0524, + "learning_rate": 3.636524706884178e-06, + "loss": 0.5082, + "step": 262 + }, + { + "epoch": 0.0528, + "learning_rate": 3.647301717175955e-06, + "loss": 0.3223, + "step": 264 + }, + { + "epoch": 0.0532, + "learning_rate": 3.6580911123781025e-06, + "loss": 0.1914, + "step": 266 + }, + { + "epoch": 0.0536, + "learning_rate": 3.66889287145614e-06, + "loss": 0.6891, + "step": 268 + }, + { + "epoch": 0.054, + "learning_rate": 3.679706973351488e-06, + "loss": 4.2481, + "step": 270 + }, + { + "epoch": 0.0544, + "learning_rate": 3.6905333969814995e-06, + "loss": 0.2092, + "step": 272 + }, + { + "epoch": 0.0548, + "learning_rate": 3.701372121239508e-06, + "loss": 0.6768, + "step": 274 + }, + { + "epoch": 0.0552, + "learning_rate": 3.712223124994867e-06, + "loss": 0.591, + "step": 276 + }, + { + "epoch": 0.0556, + "learning_rate": 3.723086387092989e-06, + "loss": 0.2471, + "step": 278 + }, + { + "epoch": 0.056, + "learning_rate": 3.7339618863553885e-06, + "loss": 0.1902, + "step": 280 + }, + { + "epoch": 0.0564, + "learning_rate": 3.744849601579722e-06, + "loss": 0.6435, + "step": 282 + }, + { + "epoch": 0.0568, + "learning_rate": 3.755749511539848e-06, + "loss": 0.3655, + "step": 284 + }, + { + "epoch": 0.0572, + "learning_rate": 3.7666615949857897e-06, + "loss": 0.3219, + "step": 286 + }, + { + "epoch": 0.0576, + "learning_rate": 3.7775858306439404e-06, + "loss": 0.4249, + "step": 288 + }, + { + "epoch": 0.058, + "learning_rate": 3.7885221972168864e-06, + "loss": 0.4868, + "step": 290 + }, + { + "epoch": 0.0584, + "learning_rate": 3.799470673383677e-06, + "loss": 0.8884, + "step": 292 + }, + { + "epoch": 0.0588, + "learning_rate": 3.810431237799657e-06, + "loss": 0.3619, + "step": 294 + }, + { + "epoch": 0.0592, + "learning_rate": 3.821403869096644e-06, + "loss": 0.2374, + "step": 296 + }, + { + "epoch": 0.0596, + "learning_rate": 3.8323885458829745e-06, + "loss": 0.3296, + "step": 298 + }, + { + "epoch": 0.06, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.2686, + "step": 300 + }, + { + "epoch": 0.0604, + "learning_rate": 3.854393950239355e-06, + "loss": 0.2137, + "step": 302 + }, + { + "epoch": 0.0608, + "learning_rate": 3.865414634908756e-06, + "loss": 0.4099, + "step": 304 + }, + { + "epoch": 0.0612, + "learning_rate": 3.876447279266233e-06, + "loss": 0.513, + "step": 306 + }, + { + "epoch": 0.0616, + "learning_rate": 3.887491861803081e-06, + "loss": 0.6536, + "step": 308 + }, + { + "epoch": 0.062, + "learning_rate": 3.898548360987321e-06, + "loss": 0.3845, + "step": 310 + }, + { + "epoch": 0.0624, + "learning_rate": 3.909616755263741e-06, + "loss": 0.4884, + "step": 312 + }, + { + "epoch": 0.0628, + "learning_rate": 3.920697023053941e-06, + "loss": 1.63, + "step": 314 + }, + { + "epoch": 0.0632, + "learning_rate": 3.9317891427563725e-06, + "loss": 0.4922, + "step": 316 + }, + { + "epoch": 0.0636, + "learning_rate": 3.942893092746381e-06, + "loss": 0.6452, + "step": 318 + }, + { + "epoch": 0.064, + "learning_rate": 3.954008851376244e-06, + "loss": 0.7281, + "step": 320 + }, + { + "epoch": 0.0644, + "learning_rate": 3.965136396975227e-06, + "loss": 0.7939, + "step": 322 + }, + { + "epoch": 0.0648, + "learning_rate": 3.976275707849619e-06, + "loss": 0.604, + "step": 324 + }, + { + "epoch": 0.0652, + "learning_rate": 3.987426762282726e-06, + "loss": 0.2484, + "step": 326 + }, + { + "epoch": 0.0656, + "learning_rate": 3.99858953853505e-06, + "loss": 0.2199, + "step": 328 + }, + { + "epoch": 0.066, + "learning_rate": 4.009764014844146e-06, + "loss": 0.1041, + "step": 330 + }, + { + "epoch": 0.0664, + "learning_rate": 4.0209501694248e-06, + "loss": 0.5506, + "step": 332 + }, + { + "epoch": 0.0668, + "learning_rate": 4.032147980469076e-06, + "loss": 0.5264, + "step": 334 + }, + { + "epoch": 0.0672, + "learning_rate": 4.043357426146209e-06, + "loss": 0.2475, + "step": 336 + }, + { + "epoch": 0.0676, + "learning_rate": 4.054578484602869e-06, + "loss": 0.7816, + "step": 338 + }, + { + "epoch": 0.068, + "learning_rate": 4.065811133962987e-06, + "loss": 0.4805, + "step": 340 + }, + { + "epoch": 0.0684, + "learning_rate": 4.07705535232795e-06, + "loss": 0.2808, + "step": 342 + }, + { + "epoch": 0.0688, + "learning_rate": 4.08831111777658e-06, + "loss": 0.3566, + "step": 344 + }, + { + "epoch": 0.0692, + "learning_rate": 4.0995784083651865e-06, + "loss": 0.3536, + "step": 346 + }, + { + "epoch": 0.0696, + "learning_rate": 4.110857202127611e-06, + "loss": 0.5355, + "step": 348 + }, + { + "epoch": 0.07, + "learning_rate": 4.122147477075266e-06, + "loss": 0.3157, + "step": 350 + }, + { + "epoch": 0.0704, + "learning_rate": 4.133449211197183e-06, + "loss": 0.3068, + "step": 352 + }, + { + "epoch": 0.0708, + "learning_rate": 4.144762382460055e-06, + "loss": 0.3316, + "step": 354 + }, + { + "epoch": 0.0712, + "learning_rate": 4.156086968808274e-06, + "loss": 0.8446, + "step": 356 + }, + { + "epoch": 0.0716, + "learning_rate": 4.1674229481639796e-06, + "loss": 0.7279, + "step": 358 + }, + { + "epoch": 0.072, + "learning_rate": 4.178770298427114e-06, + "loss": 0.594, + "step": 360 + }, + { + "epoch": 0.0724, + "learning_rate": 4.190128997475395e-06, + "loss": 0.6365, + "step": 362 + }, + { + "epoch": 0.0728, + "learning_rate": 4.201499023164515e-06, + "loss": 0.135, + "step": 364 + }, + { + "epoch": 0.0732, + "learning_rate": 4.212880353327968e-06, + "loss": 0.2319, + "step": 366 + }, + { + "epoch": 0.0736, + "learning_rate": 4.224272965777315e-06, + "loss": 0.5744, + "step": 368 + }, + { + "epoch": 0.074, + "learning_rate": 4.235676838302072e-06, + "loss": 0.2797, + "step": 370 + }, + { + "epoch": 0.0744, + "learning_rate": 4.247091948669764e-06, + "loss": 0.5988, + "step": 372 + }, + { + "epoch": 0.0748, + "learning_rate": 4.258518274626106e-06, + "loss": 0.216, + "step": 374 + }, + { + "epoch": 0.0752, + "learning_rate": 4.269955793894849e-06, + "loss": 0.3689, + "step": 376 + }, + { + "epoch": 0.0756, + "learning_rate": 4.281404484177978e-06, + "loss": 0.8246, + "step": 378 + }, + { + "epoch": 0.076, + "learning_rate": 4.292864323155684e-06, + "loss": 0.346, + "step": 380 + }, + { + "epoch": 0.0764, + "learning_rate": 4.304335288486412e-06, + "loss": 0.723, + "step": 382 + }, + { + "epoch": 0.0768, + "learning_rate": 4.3158173578069696e-06, + "loss": 0.5287, + "step": 384 + }, + { + "epoch": 0.0772, + "learning_rate": 4.327310508732434e-06, + "loss": 0.6834, + "step": 386 + }, + { + "epoch": 0.0776, + "learning_rate": 4.338814718856333e-06, + "loss": 0.3162, + "step": 388 + }, + { + "epoch": 0.078, + "learning_rate": 4.350329965750618e-06, + "loss": 0.3834, + "step": 390 + }, + { + "epoch": 0.0784, + "learning_rate": 4.3618562269657285e-06, + "loss": 0.1518, + "step": 392 + }, + { + "epoch": 0.0788, + "learning_rate": 4.373393480030629e-06, + "loss": 0.617, + "step": 394 + }, + { + "epoch": 0.0792, + "learning_rate": 4.384941702452852e-06, + "loss": 0.1557, + "step": 396 + }, + { + "epoch": 0.0796, + "learning_rate": 4.396500871718548e-06, + "loss": 0.254, + "step": 398 + }, + { + "epoch": 0.08, + "learning_rate": 4.408070965292526e-06, + "loss": 0.5976, + "step": 400 + }, + { + "epoch": 0.0804, + "learning_rate": 4.419651960618294e-06, + "loss": 0.52, + "step": 402 + }, + { + "epoch": 0.0808, + "learning_rate": 4.431243835118112e-06, + "loss": 0.3455, + "step": 404 + }, + { + "epoch": 0.0812, + "learning_rate": 4.442846566193041e-06, + "loss": 0.6479, + "step": 406 + }, + { + "epoch": 0.0816, + "learning_rate": 4.4544601312229185e-06, + "loss": 0.6547, + "step": 408 + }, + { + "epoch": 0.082, + "learning_rate": 4.4660845075665635e-06, + "loss": 0.3382, + "step": 410 + }, + { + "epoch": 0.0824, + "learning_rate": 4.477719672561602e-06, + "loss": 0.8742, + "step": 412 + }, + { + "epoch": 0.0828, + "learning_rate": 4.489365603524743e-06, + "loss": 0.3645, + "step": 414 + }, + { + "epoch": 0.0832, + "learning_rate": 4.501022277751605e-06, + "loss": 0.4448, + "step": 416 + }, + { + "epoch": 0.0836, + "learning_rate": 4.5126896725169025e-06, + "loss": 0.5673, + "step": 418 + }, + { + "epoch": 0.084, + "learning_rate": 4.524367765074499e-06, + "loss": 0.5706, + "step": 420 + }, + { + "epoch": 0.0844, + "learning_rate": 4.536056532657295e-06, + "loss": 0.423, + "step": 422 + }, + { + "epoch": 0.0848, + "learning_rate": 4.5477559524775e-06, + "loss": 0.3611, + "step": 424 + }, + { + "epoch": 0.0852, + "learning_rate": 4.559466001726451e-06, + "loss": 0.4763, + "step": 426 + }, + { + "epoch": 0.0856, + "learning_rate": 4.571186657574823e-06, + "loss": 1.0101, + "step": 428 + }, + { + "epoch": 0.086, + "learning_rate": 4.582917897172599e-06, + "loss": 0.8055, + "step": 430 + }, + { + "epoch": 0.0864, + "learning_rate": 4.5946596976491254e-06, + "loss": 0.1887, + "step": 432 + }, + { + "epoch": 0.0868, + "learning_rate": 4.6064120361131624e-06, + "loss": 0.5731, + "step": 434 + }, + { + "epoch": 0.0872, + "learning_rate": 4.618174889652924e-06, + "loss": 0.4192, + "step": 436 + }, + { + "epoch": 0.0876, + "learning_rate": 4.629948235336126e-06, + "loss": 0.575, + "step": 438 + }, + { + "epoch": 0.088, + "learning_rate": 4.6417320502100286e-06, + "loss": 0.6181, + "step": 440 + }, + { + "epoch": 0.0884, + "learning_rate": 4.653526311301479e-06, + "loss": 0.7818, + "step": 442 + }, + { + "epoch": 0.0888, + "learning_rate": 4.665330995616967e-06, + "loss": 0.4514, + "step": 444 + }, + { + "epoch": 0.0892, + "learning_rate": 4.677146080142667e-06, + "loss": 0.3001, + "step": 446 + }, + { + "epoch": 0.0896, + "learning_rate": 4.688971541844424e-06, + "loss": 0.6596, + "step": 448 + }, + { + "epoch": 0.09, + "learning_rate": 4.700807357667956e-06, + "loss": 0.3166, + "step": 450 + }, + { + "epoch": 0.0904, + "learning_rate": 4.712653504538672e-06, + "loss": 0.3794, + "step": 452 + }, + { + "epoch": 0.0908, + "learning_rate": 4.7245099593619495e-06, + "loss": 0.6721, + "step": 454 + }, + { + "epoch": 0.0912, + "learning_rate": 4.736376699023023e-06, + "loss": 0.8228, + "step": 456 + }, + { + "epoch": 0.0916, + "learning_rate": 4.74825370038703e-06, + "loss": 0.5651, + "step": 458 + }, + { + "epoch": 0.092, + "learning_rate": 4.76014094029921e-06, + "loss": 0.2414, + "step": 460 + }, + { + "epoch": 0.0924, + "learning_rate": 4.772038395584735e-06, + "loss": 0.3285, + "step": 462 + }, + { + "epoch": 0.0928, + "learning_rate": 4.7839460430489216e-06, + "loss": 0.5655, + "step": 464 + }, + { + "epoch": 0.0932, + "learning_rate": 4.7958638594772035e-06, + "loss": 0.2886, + "step": 466 + }, + { + "epoch": 0.0936, + "learning_rate": 4.807791821635185e-06, + "loss": 0.2587, + "step": 468 + }, + { + "epoch": 0.094, + "learning_rate": 4.8197299062686954e-06, + "loss": 0.3, + "step": 470 + }, + { + "epoch": 0.0944, + "learning_rate": 4.831678090103828e-06, + "loss": 0.5498, + "step": 472 + }, + { + "epoch": 0.0948, + "learning_rate": 4.8436363498469865e-06, + "loss": 0.3583, + "step": 474 + }, + { + "epoch": 0.0952, + "learning_rate": 4.855604662184931e-06, + "loss": 0.2844, + "step": 476 + }, + { + "epoch": 0.0956, + "learning_rate": 4.867583003784825e-06, + "loss": 0.428, + "step": 478 + }, + { + "epoch": 0.096, + "learning_rate": 4.8795713512942785e-06, + "loss": 0.3722, + "step": 480 + }, + { + "epoch": 0.0964, + "learning_rate": 4.891569681341395e-06, + "loss": 0.4665, + "step": 482 + }, + { + "epoch": 0.0968, + "learning_rate": 4.903577970534815e-06, + "loss": 0.3869, + "step": 484 + }, + { + "epoch": 0.0972, + "learning_rate": 4.91559619546378e-06, + "loss": 0.5019, + "step": 486 + }, + { + "epoch": 0.0976, + "learning_rate": 4.9276243326981e-06, + "loss": 0.2032, + "step": 488 + }, + { + "epoch": 0.098, + "learning_rate": 4.939662358788352e-06, + "loss": 0.6518, + "step": 490 + }, + { + "epoch": 0.0984, + "learning_rate": 4.951710250265788e-06, + "loss": 0.3309, + "step": 492 + }, + { + "epoch": 0.0988, + "learning_rate": 4.96376798364238e-06, + "loss": 0.6847, + "step": 494 + }, + { + "epoch": 0.0992, + "learning_rate": 4.975835535411023e-06, + "loss": 0.4409, + "step": 496 + }, + { + "epoch": 0.0996, + "learning_rate": 4.987912882045345e-06, + "loss": 0.8926, + "step": 498 + }, + { + "epoch": 0.1, + "learning_rate": 5.000000000000003e-06, + "loss": 0.3192, + "step": 500 + }, + { + "epoch": 0.1004, + "learning_rate": 5.012096865710493e-06, + "loss": 0.6568, + "step": 502 + }, + { + "epoch": 0.1008, + "learning_rate": 5.024203455593375e-06, + "loss": 0.7142, + "step": 504 + }, + { + "epoch": 0.1012, + "learning_rate": 5.036319746046232e-06, + "loss": 0.3062, + "step": 506 + }, + { + "epoch": 0.1016, + "learning_rate": 5.048445713447734e-06, + "loss": 0.1935, + "step": 508 + }, + { + "epoch": 0.102, + "learning_rate": 5.0605813341576885e-06, + "loss": 0.3819, + "step": 510 + }, + { + "epoch": 0.1024, + "learning_rate": 5.072726584517083e-06, + "loss": 0.5336, + "step": 512 + }, + { + "epoch": 0.1028, + "learning_rate": 5.084881440848126e-06, + "loss": 0.3949, + "step": 514 + }, + { + "epoch": 0.1032, + "learning_rate": 5.097045879454308e-06, + "loss": 0.4412, + "step": 516 + }, + { + "epoch": 0.1036, + "learning_rate": 5.109219876620433e-06, + "loss": 0.1469, + "step": 518 + }, + { + "epoch": 0.104, + "learning_rate": 5.1214034086126685e-06, + "loss": 0.2903, + "step": 520 + }, + { + "epoch": 0.1044, + "learning_rate": 5.133596451678611e-06, + "loss": 0.6229, + "step": 522 + }, + { + "epoch": 0.1048, + "learning_rate": 5.145798982047253e-06, + "loss": 0.637, + "step": 524 + }, + { + "epoch": 0.1052, + "learning_rate": 5.158010975929185e-06, + "loss": 0.3293, + "step": 526 + }, + { + "epoch": 0.1056, + "learning_rate": 5.170232409516483e-06, + "loss": 0.213, + "step": 528 + }, + { + "epoch": 0.106, + "learning_rate": 5.182463258982837e-06, + "loss": 0.4401, + "step": 530 + }, + { + "epoch": 0.1064, + "learning_rate": 5.194703500483597e-06, + "loss": 0.2414, + "step": 532 + }, + { + "epoch": 0.1068, + "learning_rate": 5.2069531101557395e-06, + "loss": 0.2141, + "step": 534 + }, + { + "epoch": 0.1072, + "learning_rate": 5.219212064118082e-06, + "loss": 0.3803, + "step": 536 + }, + { + "epoch": 0.1076, + "learning_rate": 5.231480338471124e-06, + "loss": 0.1411, + "step": 538 + }, + { + "epoch": 0.108, + "learning_rate": 5.24375790929725e-06, + "loss": 0.323, + "step": 540 + }, + { + "epoch": 0.1084, + "learning_rate": 5.256044752660709e-06, + "loss": 0.2705, + "step": 542 + }, + { + "epoch": 0.1088, + "learning_rate": 5.268340844607653e-06, + "loss": 0.4348, + "step": 544 + }, + { + "epoch": 0.1092, + "learning_rate": 5.2806461611662725e-06, + "loss": 0.517, + "step": 546 + }, + { + "epoch": 0.1096, + "learning_rate": 5.2929606783466735e-06, + "loss": 0.3316, + "step": 548 + }, + { + "epoch": 0.11, + "learning_rate": 5.305284372141091e-06, + "loss": 0.474, + "step": 550 + }, + { + "epoch": 0.1104, + "learning_rate": 5.317617218523853e-06, + "loss": 0.5303, + "step": 552 + }, + { + "epoch": 0.1108, + "learning_rate": 5.3299591934514435e-06, + "loss": 0.6748, + "step": 554 + }, + { + "epoch": 0.1112, + "learning_rate": 5.342310272862553e-06, + "loss": 0.636, + "step": 556 + }, + { + "epoch": 0.1116, + "learning_rate": 5.354670432678119e-06, + "loss": 0.2121, + "step": 558 + }, + { + "epoch": 0.112, + "learning_rate": 5.367039648801377e-06, + "loss": 0.4483, + "step": 560 + }, + { + "epoch": 0.1124, + "learning_rate": 5.379417897117909e-06, + "loss": 0.2498, + "step": 562 + }, + { + "epoch": 0.1128, + "learning_rate": 5.391805153495684e-06, + "loss": 0.3031, + "step": 564 + }, + { + "epoch": 0.1132, + "learning_rate": 5.404201393785113e-06, + "loss": 0.4915, + "step": 566 + }, + { + "epoch": 0.1136, + "learning_rate": 5.416606593819109e-06, + "loss": 0.2193, + "step": 568 + }, + { + "epoch": 0.114, + "learning_rate": 5.429020729413049e-06, + "loss": 0.2939, + "step": 570 + }, + { + "epoch": 0.1144, + "learning_rate": 5.441443776365005e-06, + "loss": 0.52, + "step": 572 + }, + { + "epoch": 0.1148, + "learning_rate": 5.453875710455549e-06, + "loss": 0.2586, + "step": 574 + }, + { + "epoch": 0.1152, + "learning_rate": 5.466316507448053e-06, + "loss": 0.4845, + "step": 576 + }, + { + "epoch": 0.1156, + "learning_rate": 5.478766143088497e-06, + "loss": 0.4885, + "step": 578 + }, + { + "epoch": 0.116, + "learning_rate": 5.49122459310568e-06, + "loss": 0.5363, + "step": 580 + }, + { + "epoch": 0.1164, + "learning_rate": 5.503691833211264e-06, + "loss": 0.4225, + "step": 582 + }, + { + "epoch": 0.1168, + "learning_rate": 5.516167839099662e-06, + "loss": 0.3044, + "step": 584 + }, + { + "epoch": 0.1172, + "learning_rate": 5.5286525864483285e-06, + "loss": 0.5066, + "step": 586 + }, + { + "epoch": 0.1176, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.3234, + "step": 588 + }, + { + "epoch": 0.118, + "learning_rate": 5.553648208150724e-06, + "loss": 0.3873, + "step": 590 + }, + { + "epoch": 0.1184, + "learning_rate": 5.5661590337742255e-06, + "loss": 0.2609, + "step": 592 + }, + { + "epoch": 0.1188, + "learning_rate": 5.57867850339757e-06, + "loss": 0.424, + "step": 594 + }, + { + "epoch": 0.1192, + "learning_rate": 5.591206592613412e-06, + "loss": 0.36, + "step": 596 + }, + { + "epoch": 0.1196, + "learning_rate": 5.603743276997597e-06, + "loss": 0.5183, + "step": 598 + }, + { + "epoch": 0.12, + "learning_rate": 5.616288532109221e-06, + "loss": 0.4514, + "step": 600 + }, + { + "epoch": 0.1204, + "learning_rate": 5.628842333490665e-06, + "loss": 0.0299, + "step": 602 + }, + { + "epoch": 0.1208, + "learning_rate": 5.641404656667652e-06, + "loss": 0.5307, + "step": 604 + }, + { + "epoch": 0.1212, + "learning_rate": 5.653975477149289e-06, + "loss": 0.2468, + "step": 606 + }, + { + "epoch": 0.1216, + "learning_rate": 5.666554770428136e-06, + "loss": 0.3562, + "step": 608 + }, + { + "epoch": 0.122, + "learning_rate": 5.679142511980168e-06, + "loss": 0.419, + "step": 610 + }, + { + "epoch": 0.1224, + "learning_rate": 5.6917386772650015e-06, + "loss": 0.4426, + "step": 612 + }, + { + "epoch": 0.1228, + "learning_rate": 5.7043432417257076e-06, + "loss": 0.1631, + "step": 614 + }, + { + "epoch": 0.1232, + "learning_rate": 5.716956180789086e-06, + "loss": 0.1677, + "step": 616 + }, + { + "epoch": 0.1236, + "learning_rate": 5.729577469865569e-06, + "loss": 0.2451, + "step": 618 + }, + { + "epoch": 0.124, + "learning_rate": 5.74220708434926e-06, + "loss": 0.4916, + "step": 620 + }, + { + "epoch": 0.1244, + "learning_rate": 5.754844999618143e-06, + "loss": 0.4607, + "step": 622 + }, + { + "epoch": 0.1248, + "learning_rate": 5.767491191033909e-06, + "loss": 0.1815, + "step": 624 + }, + { + "epoch": 0.1252, + "learning_rate": 5.780145633942173e-06, + "loss": 0.6035, + "step": 626 + }, + { + "epoch": 0.1256, + "learning_rate": 5.7928083036724535e-06, + "loss": 0.489, + "step": 628 + }, + { + "epoch": 0.126, + "learning_rate": 5.8054791755382125e-06, + "loss": 0.2393, + "step": 630 + }, + { + "epoch": 0.1264, + "learning_rate": 5.818158224836983e-06, + "loss": 0.4386, + "step": 632 + }, + { + "epoch": 0.1268, + "learning_rate": 5.830845426850263e-06, + "loss": 0.2538, + "step": 634 + }, + { + "epoch": 0.1272, + "learning_rate": 5.8435407568437194e-06, + "loss": 0.1257, + "step": 636 + }, + { + "epoch": 0.1276, + "learning_rate": 5.856244190067155e-06, + "loss": 1.1604, + "step": 638 + }, + { + "epoch": 0.128, + "learning_rate": 5.868955701754577e-06, + "loss": 0.7365, + "step": 640 + }, + { + "epoch": 0.1284, + "learning_rate": 5.881675267124245e-06, + "loss": 0.2199, + "step": 642 + }, + { + "epoch": 0.1288, + "learning_rate": 5.894402861378714e-06, + "loss": 0.3825, + "step": 644 + }, + { + "epoch": 0.1292, + "learning_rate": 5.907138459704886e-06, + "loss": 0.1964, + "step": 646 + }, + { + "epoch": 0.1296, + "learning_rate": 5.919882037274065e-06, + "loss": 0.2918, + "step": 648 + }, + { + "epoch": 0.13, + "learning_rate": 5.932633569241989e-06, + "loss": 0.355, + "step": 650 + }, + { + "epoch": 0.1304, + "learning_rate": 5.9453930307488985e-06, + "loss": 0.2742, + "step": 652 + }, + { + "epoch": 0.1308, + "learning_rate": 5.958160396919584e-06, + "loss": 0.5656, + "step": 654 + }, + { + "epoch": 0.1312, + "learning_rate": 5.970935642863362e-06, + "loss": 1.0181, + "step": 656 + }, + { + "epoch": 0.1316, + "learning_rate": 5.983718743674305e-06, + "loss": 0.4758, + "step": 658 + }, + { + "epoch": 0.132, + "learning_rate": 5.996509674431038e-06, + "loss": 0.7185, + "step": 660 + }, + { + "epoch": 0.1324, + "learning_rate": 6.00930841019705e-06, + "loss": 0.186, + "step": 662 + }, + { + "epoch": 0.1328, + "learning_rate": 6.022114926020505e-06, + "loss": 0.4552, + "step": 664 + }, + { + "epoch": 0.1332, + "learning_rate": 6.0349291969344426e-06, + "loss": 0.7673, + "step": 666 + }, + { + "epoch": 0.1336, + "learning_rate": 6.047751197956836e-06, + "loss": 0.0948, + "step": 668 + }, + { + "epoch": 0.134, + "learning_rate": 6.060580904090489e-06, + "loss": 0.4452, + "step": 670 + }, + { + "epoch": 0.1344, + "learning_rate": 6.0734182903232475e-06, + "loss": 0.2749, + "step": 672 + }, + { + "epoch": 0.1348, + "learning_rate": 6.086263331627974e-06, + "loss": 0.4888, + "step": 674 + }, + { + "epoch": 0.1352, + "learning_rate": 6.0991160029626e-06, + "loss": 0.5217, + "step": 676 + }, + { + "epoch": 0.1356, + "learning_rate": 6.111976279270187e-06, + "loss": 0.2678, + "step": 678 + }, + { + "epoch": 0.136, + "learning_rate": 6.124844135478966e-06, + "loss": 0.7836, + "step": 680 + }, + { + "epoch": 0.1364, + "learning_rate": 6.137719546502394e-06, + "loss": 0.3438, + "step": 682 + }, + { + "epoch": 0.1368, + "learning_rate": 6.1506024872392e-06, + "loss": 0.4353, + "step": 684 + }, + { + "epoch": 0.1372, + "learning_rate": 6.163492932573429e-06, + "loss": 0.4931, + "step": 686 + }, + { + "epoch": 0.1376, + "learning_rate": 6.176390857374501e-06, + "loss": 0.544, + "step": 688 + }, + { + "epoch": 0.138, + "learning_rate": 6.189296236497251e-06, + "loss": 0.3892, + "step": 690 + }, + { + "epoch": 0.1384, + "learning_rate": 6.202209044781979e-06, + "loss": 0.4959, + "step": 692 + }, + { + "epoch": 0.1388, + "learning_rate": 6.215129257054525e-06, + "loss": 0.2829, + "step": 694 + }, + { + "epoch": 0.1392, + "learning_rate": 6.228056848126223e-06, + "loss": 0.2992, + "step": 696 + }, + { + "epoch": 0.1396, + "learning_rate": 6.240991792794137e-06, + "loss": 0.2395, + "step": 698 + }, + { + "epoch": 0.14, + "learning_rate": 6.253934065840883e-06, + "loss": 0.1361, + "step": 700 + }, + { + "epoch": 0.1404, + "learning_rate": 6.2668836420348374e-06, + "loss": 0.532, + "step": 702 + }, + { + "epoch": 0.1408, + "learning_rate": 6.279840496130188e-06, + "loss": 0.2938, + "step": 704 + }, + { + "epoch": 0.1412, + "learning_rate": 6.2928046028668185e-06, + "loss": 0.3802, + "step": 706 + }, + { + "epoch": 0.1416, + "learning_rate": 6.305775936970606e-06, + "loss": 0.6094, + "step": 708 + }, + { + "epoch": 0.142, + "learning_rate": 6.3187544731532205e-06, + "loss": 0.4248, + "step": 710 + }, + { + "epoch": 0.1424, + "learning_rate": 6.331740186112359e-06, + "loss": 0.0424, + "step": 712 + }, + { + "epoch": 0.1428, + "learning_rate": 6.344733050531709e-06, + "loss": 0.5293, + "step": 714 + }, + { + "epoch": 0.1432, + "learning_rate": 6.357733041081015e-06, + "loss": 0.2113, + "step": 716 + }, + { + "epoch": 0.1436, + "learning_rate": 6.370740132416133e-06, + "loss": 0.4702, + "step": 718 + }, + { + "epoch": 0.144, + "learning_rate": 6.383754299179072e-06, + "loss": 0.5836, + "step": 720 + }, + { + "epoch": 0.1444, + "learning_rate": 6.3967755159980485e-06, + "loss": 0.3205, + "step": 722 + }, + { + "epoch": 0.1448, + "learning_rate": 6.409803757487532e-06, + "loss": 0.2761, + "step": 724 + }, + { + "epoch": 0.1452, + "learning_rate": 6.422838998248301e-06, + "loss": 0.5086, + "step": 726 + }, + { + "epoch": 0.1456, + "learning_rate": 6.435881212867485e-06, + "loss": 0.2804, + "step": 728 + }, + { + "epoch": 0.146, + "learning_rate": 6.4489303759186385e-06, + "loss": 0.1072, + "step": 730 + }, + { + "epoch": 0.1464, + "learning_rate": 6.4619864619616975e-06, + "loss": 0.3754, + "step": 732 + }, + { + "epoch": 0.1468, + "learning_rate": 6.475049445543222e-06, + "loss": 0.4856, + "step": 734 + }, + { + "epoch": 0.1472, + "learning_rate": 6.48811930119619e-06, + "loss": 0.2702, + "step": 736 + }, + { + "epoch": 0.1476, + "learning_rate": 6.5011960034403e-06, + "loss": 0.2769, + "step": 738 + }, + { + "epoch": 0.148, + "learning_rate": 6.514279526781853e-06, + "loss": 0.3243, + "step": 740 + }, + { + "epoch": 0.1484, + "learning_rate": 6.5273698457137965e-06, + "loss": 0.2323, + "step": 742 + }, + { + "epoch": 0.1488, + "learning_rate": 6.540466934715955e-06, + "loss": 0.5303, + "step": 744 + }, + { + "epoch": 0.1492, + "learning_rate": 6.553570768254831e-06, + "loss": 0.4626, + "step": 746 + }, + { + "epoch": 0.1496, + "learning_rate": 6.566681320783848e-06, + "loss": 0.2484, + "step": 748 + }, + { + "epoch": 0.15, + "learning_rate": 6.579798566743313e-06, + "loss": 0.5957, + "step": 750 + }, + { + "epoch": 0.1504, + "learning_rate": 6.592922480560483e-06, + "loss": 0.3162, + "step": 752 + }, + { + "epoch": 0.1508, + "learning_rate": 6.606053036649618e-06, + "loss": 0.2676, + "step": 754 + }, + { + "epoch": 0.1512, + "learning_rate": 6.619190209412025e-06, + "loss": 0.3487, + "step": 756 + }, + { + "epoch": 0.1516, + "learning_rate": 6.632333973236113e-06, + "loss": 0.3227, + "step": 758 + }, + { + "epoch": 0.152, + "learning_rate": 6.6454843024974465e-06, + "loss": 0.4049, + "step": 760 + }, + { + "epoch": 0.1524, + "learning_rate": 6.6586411715587805e-06, + "loss": 0.7003, + "step": 762 + }, + { + "epoch": 0.1528, + "learning_rate": 6.671804554770128e-06, + "loss": 0.4246, + "step": 764 + }, + { + "epoch": 0.1532, + "learning_rate": 6.6849744264688e-06, + "loss": 0.3153, + "step": 766 + }, + { + "epoch": 0.1536, + "learning_rate": 6.698150760979456e-06, + "loss": 0.2446, + "step": 768 + }, + { + "epoch": 0.154, + "learning_rate": 6.711333532614177e-06, + "loss": 0.4019, + "step": 770 + }, + { + "epoch": 0.1544, + "learning_rate": 6.724522715672421e-06, + "loss": 0.45, + "step": 772 + }, + { + "epoch": 0.1548, + "learning_rate": 6.737718284441256e-06, + "loss": 0.17, + "step": 774 + }, + { + "epoch": 0.1552, + "learning_rate": 6.750920213195242e-06, + "loss": 0.6598, + "step": 776 + }, + { + "epoch": 0.1556, + "learning_rate": 6.764128476196494e-06, + "loss": 0.1592, + "step": 778 + }, + { + "epoch": 0.156, + "learning_rate": 6.777343047694894e-06, + "loss": 0.7457, + "step": 780 + }, + { + "epoch": 0.1564, + "learning_rate": 6.7905639019278925e-06, + "loss": 0.4014, + "step": 782 + }, + { + "epoch": 0.1568, + "learning_rate": 6.803791013120824e-06, + "loss": 0.5695, + "step": 784 + }, + { + "epoch": 0.1572, + "learning_rate": 6.817024355486707e-06, + "loss": 0.3665, + "step": 786 + }, + { + "epoch": 0.1576, + "learning_rate": 6.8302639032264836e-06, + "loss": 0.6023, + "step": 788 + }, + { + "epoch": 0.158, + "learning_rate": 6.8435096305289765e-06, + "loss": 0.4421, + "step": 790 + }, + { + "epoch": 0.1584, + "learning_rate": 6.856761511570944e-06, + "loss": 0.1436, + "step": 792 + }, + { + "epoch": 0.1588, + "learning_rate": 6.870019520517217e-06, + "loss": 0.695, + "step": 794 + }, + { + "epoch": 0.1592, + "learning_rate": 6.883283631520579e-06, + "loss": 0.2095, + "step": 796 + }, + { + "epoch": 0.1596, + "learning_rate": 6.896553818721985e-06, + "loss": 0.3553, + "step": 798 + }, + { + "epoch": 0.16, + "learning_rate": 6.909830056250522e-06, + "loss": 0.3617, + "step": 800 + }, + { + "epoch": 0.1604, + "learning_rate": 6.9231123182234895e-06, + "loss": 0.4206, + "step": 802 + }, + { + "epoch": 0.1608, + "learning_rate": 6.936400578746436e-06, + "loss": 0.7615, + "step": 804 + }, + { + "epoch": 0.1612, + "learning_rate": 6.949694811913237e-06, + "loss": 0.3431, + "step": 806 + }, + { + "epoch": 0.1616, + "learning_rate": 6.96299499180605e-06, + "loss": 0.7773, + "step": 808 + }, + { + "epoch": 0.162, + "learning_rate": 6.976301092495548e-06, + "loss": 0.4262, + "step": 810 + }, + { + "epoch": 0.1624, + "learning_rate": 6.989613088040787e-06, + "loss": 0.5774, + "step": 812 + }, + { + "epoch": 0.1628, + "learning_rate": 7.002930952489353e-06, + "loss": 0.3014, + "step": 814 + }, + { + "epoch": 0.1632, + "learning_rate": 7.016254659877404e-06, + "loss": 0.5273, + "step": 816 + }, + { + "epoch": 0.1636, + "learning_rate": 7.029584184229641e-06, + "loss": 0.4042, + "step": 818 + }, + { + "epoch": 0.164, + "learning_rate": 7.042919499559539e-06, + "loss": 1.0242, + "step": 820 + }, + { + "epoch": 0.1644, + "learning_rate": 7.056260579869152e-06, + "loss": 0.3114, + "step": 822 + }, + { + "epoch": 0.1648, + "learning_rate": 7.06960739914943e-06, + "loss": 0.4621, + "step": 824 + }, + { + "epoch": 0.1652, + "learning_rate": 7.082959931380013e-06, + "loss": 0.2639, + "step": 826 + }, + { + "epoch": 0.1656, + "learning_rate": 7.09631815052946e-06, + "loss": 0.3438, + "step": 828 + }, + { + "epoch": 0.166, + "learning_rate": 7.109682030555285e-06, + "loss": 0.3132, + "step": 830 + }, + { + "epoch": 0.1664, + "learning_rate": 7.123051545403873e-06, + "loss": 0.4491, + "step": 832 + }, + { + "epoch": 0.1668, + "learning_rate": 7.136426669010686e-06, + "loss": 0.3661, + "step": 834 + }, + { + "epoch": 0.1672, + "learning_rate": 7.1498073753002375e-06, + "loss": 0.4484, + "step": 836 + }, + { + "epoch": 0.1676, + "learning_rate": 7.1631936381861544e-06, + "loss": 0.3452, + "step": 838 + }, + { + "epoch": 0.168, + "learning_rate": 7.1765854315712325e-06, + "loss": 0.5085, + "step": 840 + }, + { + "epoch": 0.1684, + "learning_rate": 7.189982729347485e-06, + "loss": 0.3839, + "step": 842 + }, + { + "epoch": 0.1688, + "learning_rate": 7.203385505396197e-06, + "loss": 0.2722, + "step": 844 + }, + { + "epoch": 0.1692, + "learning_rate": 7.216793733587966e-06, + "loss": 0.1636, + "step": 846 + }, + { + "epoch": 0.1696, + "learning_rate": 7.230207387782771e-06, + "loss": 0.3665, + "step": 848 + }, + { + "epoch": 0.17, + "learning_rate": 7.243626441830001e-06, + "loss": 0.3125, + "step": 850 + }, + { + "epoch": 0.1704, + "learning_rate": 7.257050869568527e-06, + "loss": 0.3114, + "step": 852 + }, + { + "epoch": 0.1708, + "learning_rate": 7.270480644826739e-06, + "loss": 0.4676, + "step": 854 + }, + { + "epoch": 0.1712, + "learning_rate": 7.28391574142262e-06, + "loss": 0.6019, + "step": 856 + }, + { + "epoch": 0.1716, + "learning_rate": 7.297356133163711e-06, + "loss": 0.5212, + "step": 858 + }, + { + "epoch": 0.172, + "learning_rate": 7.3108017938473485e-06, + "loss": 0.0636, + "step": 860 + }, + { + "epoch": 0.1724, + "learning_rate": 7.324252697260479e-06, + "loss": 0.122, + "step": 862 + }, + { + "epoch": 0.1728, + "learning_rate": 7.337708817179875e-06, + "loss": 0.2703, + "step": 864 + }, + { + "epoch": 0.1732, + "learning_rate": 7.351170127372196e-06, + "loss": 0.6272, + "step": 866 + }, + { + "epoch": 0.1736, + "learning_rate": 7.36463660159386e-06, + "loss": 0.8853, + "step": 868 + }, + { + "epoch": 0.174, + "learning_rate": 7.378108213591355e-06, + "loss": 0.3086, + "step": 870 + }, + { + "epoch": 0.1744, + "learning_rate": 7.39158493710103e-06, + "loss": 0.4001, + "step": 872 + }, + { + "epoch": 0.1748, + "learning_rate": 7.405066745849345e-06, + "loss": 0.3119, + "step": 874 + }, + { + "epoch": 0.1752, + "learning_rate": 7.418553613552822e-06, + "loss": 0.3373, + "step": 876 + }, + { + "epoch": 0.1756, + "learning_rate": 7.432045513918121e-06, + "loss": 0.3995, + "step": 878 + }, + { + "epoch": 0.176, + "learning_rate": 7.445542420642091e-06, + "loss": 0.3247, + "step": 880 + }, + { + "epoch": 0.1764, + "learning_rate": 7.459044307411826e-06, + "loss": 0.504, + "step": 882 + }, + { + "epoch": 0.1768, + "learning_rate": 7.472551147904703e-06, + "loss": 0.9268, + "step": 884 + }, + { + "epoch": 0.1772, + "learning_rate": 7.486062915788446e-06, + "loss": 0.1192, + "step": 886 + }, + { + "epoch": 0.1776, + "learning_rate": 7.499579584721173e-06, + "loss": 0.2752, + "step": 888 + }, + { + "epoch": 0.178, + "learning_rate": 7.513101128351446e-06, + "loss": 0.407, + "step": 890 + }, + { + "epoch": 0.1784, + "learning_rate": 7.5266275203183395e-06, + "loss": 0.4311, + "step": 892 + }, + { + "epoch": 0.1788, + "learning_rate": 7.540158734251412e-06, + "loss": 0.4275, + "step": 894 + }, + { + "epoch": 0.1792, + "learning_rate": 7.553694743770917e-06, + "loss": 1.0431, + "step": 896 + }, + { + "epoch": 0.1796, + "learning_rate": 7.567235522487698e-06, + "loss": 0.2753, + "step": 898 + }, + { + "epoch": 0.18, + "learning_rate": 7.580781044003312e-06, + "loss": 0.4615, + "step": 900 + }, + { + "epoch": 0.1804, + "learning_rate": 7.5943312819100875e-06, + "loss": 0.3807, + "step": 902 + }, + { + "epoch": 0.1808, + "learning_rate": 7.607886209791095e-06, + "loss": 1.419, + "step": 904 + }, + { + "epoch": 0.1812, + "learning_rate": 7.6214458012203726e-06, + "loss": 0.6087, + "step": 906 + }, + { + "epoch": 0.1816, + "learning_rate": 7.635010029762755e-06, + "loss": 0.3881, + "step": 908 + }, + { + "epoch": 0.182, + "learning_rate": 7.648578868974102e-06, + "loss": 0.3662, + "step": 910 + }, + { + "epoch": 0.1824, + "learning_rate": 7.662152292401265e-06, + "loss": 0.5038, + "step": 912 + }, + { + "epoch": 0.1828, + "learning_rate": 7.675730273582142e-06, + "loss": 1.2598, + "step": 914 + }, + { + "epoch": 0.1832, + "learning_rate": 7.689312786045822e-06, + "loss": 0.3086, + "step": 916 + }, + { + "epoch": 0.1836, + "learning_rate": 7.702899803312443e-06, + "loss": 0.3597, + "step": 918 + }, + { + "epoch": 0.184, + "learning_rate": 7.716491298893441e-06, + "loss": 0.4188, + "step": 920 + }, + { + "epoch": 0.1844, + "learning_rate": 7.730087246291498e-06, + "loss": 0.4932, + "step": 922 + }, + { + "epoch": 0.1848, + "learning_rate": 7.74368761900062e-06, + "loss": 0.5425, + "step": 924 + }, + { + "epoch": 0.1852, + "learning_rate": 7.757292390506184e-06, + "loss": 0.6096, + "step": 926 + }, + { + "epoch": 0.1856, + "learning_rate": 7.770901534284991e-06, + "loss": 0.6611, + "step": 928 + }, + { + "epoch": 0.186, + "learning_rate": 7.78451502380532e-06, + "loss": 0.4042, + "step": 930 + }, + { + "epoch": 0.1864, + "learning_rate": 7.798132832526976e-06, + "loss": 0.3537, + "step": 932 + }, + { + "epoch": 0.1868, + "learning_rate": 7.811754933901346e-06, + "loss": 0.6119, + "step": 934 + }, + { + "epoch": 0.1872, + "learning_rate": 7.825381301371444e-06, + "loss": 0.6406, + "step": 936 + }, + { + "epoch": 0.1876, + "learning_rate": 7.839011908371987e-06, + "loss": 0.7218, + "step": 938 + }, + { + "epoch": 0.188, + "learning_rate": 7.852646728329358e-06, + "loss": 0.348, + "step": 940 + }, + { + "epoch": 0.1884, + "learning_rate": 7.866285734661845e-06, + "loss": 0.1599, + "step": 942 + }, + { + "epoch": 0.1888, + "learning_rate": 7.879928900779441e-06, + "loss": 0.4448, + "step": 944 + }, + { + "epoch": 0.1892, + "learning_rate": 7.893576200084164e-06, + "loss": 0.1563, + "step": 946 + }, + { + "epoch": 0.1896, + "learning_rate": 7.907227605969852e-06, + "loss": 0.4625, + "step": 948 + }, + { + "epoch": 0.19, + "learning_rate": 7.92088309182239e-06, + "loss": 0.6913, + "step": 950 + }, + { + "epoch": 0.1904, + "learning_rate": 7.934542631019767e-06, + "loss": 0.4542, + "step": 952 + }, + { + "epoch": 0.1908, + "learning_rate": 7.948206196931937e-06, + "loss": 0.2976, + "step": 954 + }, + { + "epoch": 0.1912, + "learning_rate": 7.961873762921153e-06, + "loss": 0.3839, + "step": 956 + }, + { + "epoch": 0.1916, + "learning_rate": 7.97554530234174e-06, + "loss": 0.6925, + "step": 958 + }, + { + "epoch": 0.192, + "learning_rate": 7.989220788540351e-06, + "loss": 0.6349, + "step": 960 + }, + { + "epoch": 0.1924, + "learning_rate": 8.002900194855927e-06, + "loss": 0.4547, + "step": 962 + }, + { + "epoch": 0.1928, + "learning_rate": 8.016583494619764e-06, + "loss": 0.2978, + "step": 964 + }, + { + "epoch": 0.1932, + "learning_rate": 8.03027066115557e-06, + "loss": 0.2427, + "step": 966 + }, + { + "epoch": 0.1936, + "learning_rate": 8.043961667779511e-06, + "loss": 0.4806, + "step": 968 + }, + { + "epoch": 0.194, + "learning_rate": 8.057656487800274e-06, + "loss": 0.6885, + "step": 970 + }, + { + "epoch": 0.1944, + "learning_rate": 8.071355094519103e-06, + "loss": 0.5551, + "step": 972 + }, + { + "epoch": 0.1948, + "learning_rate": 8.085057461229862e-06, + "loss": 0.4917, + "step": 974 + }, + { + "epoch": 0.1952, + "learning_rate": 8.098763561219089e-06, + "loss": 0.1577, + "step": 976 + }, + { + "epoch": 0.1956, + "learning_rate": 8.112473367766056e-06, + "loss": 0.3893, + "step": 978 + }, + { + "epoch": 0.196, + "learning_rate": 8.126186854142744e-06, + "loss": 0.566, + "step": 980 + }, + { + "epoch": 0.1964, + "learning_rate": 8.139903993614075e-06, + "loss": 0.3859, + "step": 982 + }, + { + "epoch": 0.1968, + "learning_rate": 8.153624759437718e-06, + "loss": 0.2921, + "step": 984 + }, + { + "epoch": 0.1972, + "learning_rate": 8.167349124864389e-06, + "loss": 0.1714, + "step": 986 + }, + { + "epoch": 0.1976, + "learning_rate": 8.181077063137735e-06, + "loss": 0.6299, + "step": 988 + }, + { + "epoch": 0.198, + "learning_rate": 8.194808547494386e-06, + "loss": 0.478, + "step": 990 + }, + { + "epoch": 0.1984, + "learning_rate": 8.208543551164178e-06, + "loss": 0.6152, + "step": 992 + }, + { + "epoch": 0.1988, + "learning_rate": 8.22228204736997e-06, + "loss": 0.3155, + "step": 994 + }, + { + "epoch": 0.1992, + "learning_rate": 8.236024009327877e-06, + "loss": 0.3155, + "step": 996 + }, + { + "epoch": 0.1996, + "learning_rate": 8.249769410247239e-06, + "loss": 0.4525, + "step": 998 + }, + { + "epoch": 0.2, + "learning_rate": 8.263518223330695e-06, + "loss": 0.3657, + "step": 1000 + }, + { + "epoch": 0.2004, + "learning_rate": 8.277270421774231e-06, + "loss": 0.5314, + "step": 1002 + }, + { + "epoch": 0.2008, + "learning_rate": 8.29102597876723e-06, + "loss": 0.3303, + "step": 1004 + }, + { + "epoch": 0.2012, + "learning_rate": 8.304784867492532e-06, + "loss": 0.2775, + "step": 1006 + }, + { + "epoch": 0.2016, + "learning_rate": 8.31854706112648e-06, + "loss": 0.297, + "step": 1008 + }, + { + "epoch": 0.202, + "learning_rate": 8.332312532838972e-06, + "loss": 0.6559, + "step": 1010 + }, + { + "epoch": 0.2024, + "learning_rate": 8.346081255793516e-06, + "loss": 0.4066, + "step": 1012 + }, + { + "epoch": 0.2028, + "learning_rate": 8.359853203147282e-06, + "loss": 0.6603, + "step": 1014 + }, + { + "epoch": 0.2032, + "learning_rate": 8.373628348051156e-06, + "loss": 2.3754, + "step": 1016 + }, + { + "epoch": 0.2036, + "learning_rate": 8.387406663649803e-06, + "loss": 0.5686, + "step": 1018 + }, + { + "epoch": 0.204, + "learning_rate": 8.401188123081642e-06, + "loss": 0.384, + "step": 1020 + }, + { + "epoch": 0.2044, + "learning_rate": 8.414972699479062e-06, + "loss": 0.2191, + "step": 1022 + }, + { + "epoch": 0.2048, + "learning_rate": 8.428760365968329e-06, + "loss": 1.2355, + "step": 1024 + }, + { + "epoch": 0.2052, + "learning_rate": 8.442551095669627e-06, + "loss": 0.5, + "step": 1026 + }, + { + "epoch": 0.2056, + "learning_rate": 8.456344861697293e-06, + "loss": 0.6364, + "step": 1028 + }, + { + "epoch": 0.206, + "learning_rate": 8.470141637159605e-06, + "loss": 0.6276, + "step": 1030 + }, + { + "epoch": 0.2064, + "learning_rate": 8.483941395159114e-06, + "loss": 0.2992, + "step": 1032 + }, + { + "epoch": 0.2068, + "learning_rate": 8.497744108792431e-06, + "loss": 0.7278, + "step": 1034 + }, + { + "epoch": 0.2072, + "learning_rate": 8.511549751150478e-06, + "loss": 0.2421, + "step": 1036 + }, + { + "epoch": 0.2076, + "learning_rate": 8.52535829531845e-06, + "loss": 0.4927, + "step": 1038 + }, + { + "epoch": 0.208, + "learning_rate": 8.539169714375883e-06, + "loss": 0.2617, + "step": 1040 + }, + { + "epoch": 0.2084, + "learning_rate": 8.552983981396707e-06, + "loss": 0.335, + "step": 1042 + }, + { + "epoch": 0.2088, + "learning_rate": 8.566801069449304e-06, + "loss": 0.2865, + "step": 1044 + }, + { + "epoch": 0.2092, + "learning_rate": 8.580620951596553e-06, + "loss": 0.4233, + "step": 1046 + }, + { + "epoch": 0.2096, + "learning_rate": 8.594443600895886e-06, + "loss": 0.2044, + "step": 1048 + }, + { + "epoch": 0.21, + "learning_rate": 8.60826899039934e-06, + "loss": 0.3534, + "step": 1050 + }, + { + "epoch": 0.2104, + "learning_rate": 8.622097093153612e-06, + "loss": 0.3784, + "step": 1052 + }, + { + "epoch": 0.2108, + "learning_rate": 8.635927882200128e-06, + "loss": 0.3752, + "step": 1054 + }, + { + "epoch": 0.2112, + "learning_rate": 8.649761330575e-06, + "loss": 0.4934, + "step": 1056 + }, + { + "epoch": 0.2116, + "learning_rate": 8.663597411309268e-06, + "loss": 0.4367, + "step": 1058 + }, + { + "epoch": 0.212, + "learning_rate": 8.677436097428766e-06, + "loss": 0.2832, + "step": 1060 + }, + { + "epoch": 0.2124, + "learning_rate": 8.691277361954266e-06, + "loss": 0.2921, + "step": 1062 + }, + { + "epoch": 0.2128, + "learning_rate": 8.705121177901537e-06, + "loss": 0.4773, + "step": 1064 + }, + { + "epoch": 0.2132, + "learning_rate": 8.718967518281292e-06, + "loss": 0.2715, + "step": 1066 + }, + { + "epoch": 0.2136, + "learning_rate": 8.732816356099459e-06, + "loss": 0.5513, + "step": 1068 + }, + { + "epoch": 0.214, + "learning_rate": 8.746667664356962e-06, + "loss": 0.3792, + "step": 1070 + }, + { + "epoch": 0.2144, + "learning_rate": 8.760521416049986e-06, + "loss": 0.2928, + "step": 1072 + }, + { + "epoch": 0.2148, + "learning_rate": 8.774377584169934e-06, + "loss": 0.3655, + "step": 1074 + }, + { + "epoch": 0.2152, + "learning_rate": 8.788236141703477e-06, + "loss": 0.4638, + "step": 1076 + }, + { + "epoch": 0.2156, + "learning_rate": 8.802097061632706e-06, + "loss": 0.4798, + "step": 1078 + }, + { + "epoch": 0.216, + "learning_rate": 8.81596031693499e-06, + "loss": 0.3537, + "step": 1080 + }, + { + "epoch": 0.2164, + "learning_rate": 8.829825880583224e-06, + "loss": 0.2643, + "step": 1082 + }, + { + "epoch": 0.2168, + "learning_rate": 8.84369372554578e-06, + "loss": 0.527, + "step": 1084 + }, + { + "epoch": 0.2172, + "learning_rate": 8.85756382478659e-06, + "loss": 0.5204, + "step": 1086 + }, + { + "epoch": 0.2176, + "learning_rate": 8.87143615126518e-06, + "loss": 0.3133, + "step": 1088 + }, + { + "epoch": 0.218, + "learning_rate": 8.88531067793674e-06, + "loss": 0.4401, + "step": 1090 + }, + { + "epoch": 0.2184, + "learning_rate": 8.899187377752173e-06, + "loss": 0.6687, + "step": 1092 + }, + { + "epoch": 0.2188, + "learning_rate": 8.913066223658141e-06, + "loss": 0.5779, + "step": 1094 + }, + { + "epoch": 0.2192, + "learning_rate": 8.926947188597127e-06, + "loss": 0.494, + "step": 1096 + }, + { + "epoch": 0.2196, + "learning_rate": 8.940830245507473e-06, + "loss": 0.4146, + "step": 1098 + }, + { + "epoch": 0.22, + "learning_rate": 8.954715367323473e-06, + "loss": 0.3231, + "step": 1100 + }, + { + "epoch": 0.2204, + "learning_rate": 8.968602526975317e-06, + "loss": 0.2961, + "step": 1102 + }, + { + "epoch": 0.2208, + "learning_rate": 8.982491697389344e-06, + "loss": 0.9067, + "step": 1104 + }, + { + "epoch": 0.2212, + "learning_rate": 8.996382851487839e-06, + "loss": 0.2304, + "step": 1106 + }, + { + "epoch": 0.2216, + "learning_rate": 9.010275962189356e-06, + "loss": 0.4528, + "step": 1108 + }, + { + "epoch": 0.222, + "learning_rate": 9.024171002408509e-06, + "loss": 0.5356, + "step": 1110 + }, + { + "epoch": 0.2224, + "learning_rate": 9.03806794505621e-06, + "loss": 0.28, + "step": 1112 + }, + { + "epoch": 0.2228, + "learning_rate": 9.051966763039708e-06, + "loss": 0.3268, + "step": 1114 + }, + { + "epoch": 0.2232, + "learning_rate": 9.065867429262497e-06, + "loss": 0.3966, + "step": 1116 + }, + { + "epoch": 0.2236, + "learning_rate": 9.07976991662453e-06, + "loss": 0.8257, + "step": 1118 + }, + { + "epoch": 0.224, + "learning_rate": 9.093674198022198e-06, + "loss": 0.3123, + "step": 1120 + }, + { + "epoch": 0.2244, + "learning_rate": 9.107580246348395e-06, + "loss": 0.4364, + "step": 1122 + }, + { + "epoch": 0.2248, + "learning_rate": 9.121488034492567e-06, + "loss": 0.7562, + "step": 1124 + }, + { + "epoch": 0.2252, + "learning_rate": 9.135397535340768e-06, + "loss": 0.3469, + "step": 1126 + }, + { + "epoch": 0.2256, + "learning_rate": 9.149308721775717e-06, + "loss": 1.2449, + "step": 1128 + }, + { + "epoch": 0.226, + "learning_rate": 9.16322156667684e-06, + "loss": 0.4778, + "step": 1130 + }, + { + "epoch": 0.2264, + "learning_rate": 9.177136042920338e-06, + "loss": 0.1348, + "step": 1132 + }, + { + "epoch": 0.2268, + "learning_rate": 9.191052123379227e-06, + "loss": 0.2866, + "step": 1134 + }, + { + "epoch": 0.2272, + "learning_rate": 9.204969780923396e-06, + "loss": 0.3545, + "step": 1136 + }, + { + "epoch": 0.2276, + "learning_rate": 9.218888988419656e-06, + "loss": 0.3384, + "step": 1138 + }, + { + "epoch": 0.228, + "learning_rate": 9.232809718731822e-06, + "loss": 0.5363, + "step": 1140 + }, + { + "epoch": 0.2284, + "learning_rate": 9.246731944720663e-06, + "loss": 0.5801, + "step": 1142 + }, + { + "epoch": 0.2288, + "learning_rate": 9.26065563924414e-06, + "loss": 0.3464, + "step": 1144 + }, + { + "epoch": 0.2292, + "learning_rate": 9.274580775157299e-06, + "loss": 0.3997, + "step": 1146 + }, + { + "epoch": 0.2296, + "learning_rate": 9.288507325312319e-06, + "loss": 0.7596, + "step": 1148 + }, + { + "epoch": 0.23, + "learning_rate": 9.302435262558752e-06, + "loss": 0.452, + "step": 1150 + }, + { + "epoch": 0.2304, + "learning_rate": 9.316364559743298e-06, + "loss": 0.2643, + "step": 1152 + }, + { + "epoch": 0.2308, + "learning_rate": 9.330295189710153e-06, + "loss": 0.4243, + "step": 1154 + }, + { + "epoch": 0.2312, + "learning_rate": 9.344227125300788e-06, + "loss": 0.2009, + "step": 1156 + }, + { + "epoch": 0.2316, + "learning_rate": 9.358160339354196e-06, + "loss": 0.3203, + "step": 1158 + }, + { + "epoch": 0.232, + "learning_rate": 9.372094804706867e-06, + "loss": 0.9398, + "step": 1160 + }, + { + "epoch": 0.2324, + "learning_rate": 9.386030494192826e-06, + "loss": 0.2386, + "step": 1162 + }, + { + "epoch": 0.2328, + "learning_rate": 9.39996738064379e-06, + "loss": 0.3563, + "step": 1164 + }, + { + "epoch": 0.2332, + "learning_rate": 9.413905436889032e-06, + "loss": 0.5222, + "step": 1166 + }, + { + "epoch": 0.2336, + "learning_rate": 9.427844635755615e-06, + "loss": 0.7201, + "step": 1168 + }, + { + "epoch": 0.234, + "learning_rate": 9.441784950068357e-06, + "loss": 0.3855, + "step": 1170 + }, + { + "epoch": 0.2344, + "learning_rate": 9.455726352649904e-06, + "loss": 0.4182, + "step": 1172 + }, + { + "epoch": 0.2348, + "learning_rate": 9.469668816320777e-06, + "loss": 0.5191, + "step": 1174 + }, + { + "epoch": 0.2352, + "learning_rate": 9.483612313899446e-06, + "loss": 0.3424, + "step": 1176 + }, + { + "epoch": 0.2356, + "learning_rate": 9.497556818202297e-06, + "loss": 0.2497, + "step": 1178 + }, + { + "epoch": 0.236, + "learning_rate": 9.511502302043859e-06, + "loss": 0.4109, + "step": 1180 + }, + { + "epoch": 0.2364, + "learning_rate": 9.52544873823668e-06, + "loss": 0.5524, + "step": 1182 + }, + { + "epoch": 0.2368, + "learning_rate": 9.539396099591469e-06, + "loss": 0.0892, + "step": 1184 + }, + { + "epoch": 0.2372, + "learning_rate": 9.553344358917146e-06, + "loss": 0.654, + "step": 1186 + }, + { + "epoch": 0.2376, + "learning_rate": 9.567293489020816e-06, + "loss": 0.2993, + "step": 1188 + }, + { + "epoch": 0.238, + "learning_rate": 9.581243462708009e-06, + "loss": 0.3409, + "step": 1190 + }, + { + "epoch": 0.2384, + "learning_rate": 9.595194252782461e-06, + "loss": 0.6568, + "step": 1192 + }, + { + "epoch": 0.2388, + "learning_rate": 9.609145832046469e-06, + "loss": 0.1691, + "step": 1194 + }, + { + "epoch": 0.2392, + "learning_rate": 9.623098173300656e-06, + "loss": 0.6078, + "step": 1196 + }, + { + "epoch": 0.2396, + "learning_rate": 9.637051249344225e-06, + "loss": 0.3729, + "step": 1198 + }, + { + "epoch": 0.24, + "learning_rate": 9.651005032974991e-06, + "loss": 0.33, + "step": 1200 + }, + { + "epoch": 0.2404, + "learning_rate": 9.664959496989285e-06, + "loss": 0.5501, + "step": 1202 + }, + { + "epoch": 0.2408, + "learning_rate": 9.678914614182184e-06, + "loss": 0.341, + "step": 1204 + }, + { + "epoch": 0.2412, + "learning_rate": 9.69287035734747e-06, + "loss": 0.6727, + "step": 1206 + }, + { + "epoch": 0.2416, + "learning_rate": 9.706826699277714e-06, + "loss": 1.3072, + "step": 1208 + }, + { + "epoch": 0.242, + "learning_rate": 9.720783612764307e-06, + "loss": 0.6377, + "step": 1210 + }, + { + "epoch": 0.2424, + "learning_rate": 9.734741070597535e-06, + "loss": 0.8198, + "step": 1212 + }, + { + "epoch": 0.2428, + "learning_rate": 9.74869904556662e-06, + "loss": 0.316, + "step": 1214 + }, + { + "epoch": 0.2432, + "learning_rate": 9.762657510459774e-06, + "loss": 0.3971, + "step": 1216 + }, + { + "epoch": 0.2436, + "learning_rate": 9.776616438064255e-06, + "loss": 0.24, + "step": 1218 + }, + { + "epoch": 0.244, + "learning_rate": 9.790575801166422e-06, + "loss": 0.575, + "step": 1220 + }, + { + "epoch": 0.2444, + "learning_rate": 9.804535572551782e-06, + "loss": 0.2665, + "step": 1222 + }, + { + "epoch": 0.2448, + "learning_rate": 9.818495725005043e-06, + "loss": 0.4151, + "step": 1224 + }, + { + "epoch": 0.2452, + "learning_rate": 9.832456231310194e-06, + "loss": 0.1338, + "step": 1226 + }, + { + "epoch": 0.2456, + "learning_rate": 9.846417064250459e-06, + "loss": 0.2552, + "step": 1228 + }, + { + "epoch": 0.246, + "learning_rate": 9.860378196608552e-06, + "loss": 0.733, + "step": 1230 + }, + { + "epoch": 0.2464, + "learning_rate": 9.874339601166479e-06, + "loss": 0.7393, + "step": 1232 + }, + { + "epoch": 0.2468, + "learning_rate": 9.888301250705765e-06, + "loss": 0.5403, + "step": 1234 + }, + { + "epoch": 0.2472, + "learning_rate": 9.902263118007513e-06, + "loss": 0.1973, + "step": 1236 + }, + { + "epoch": 0.2476, + "learning_rate": 9.916225175852278e-06, + "loss": 0.2178, + "step": 1238 + }, + { + "epoch": 0.248, + "learning_rate": 9.930187397020385e-06, + "loss": 0.3829, + "step": 1240 + }, + { + "epoch": 0.2484, + "learning_rate": 9.944149754291716e-06, + "loss": 0.3331, + "step": 1242 + }, + { + "epoch": 0.2488, + "learning_rate": 9.95811222044596e-06, + "loss": 0.2738, + "step": 1244 + }, + { + "epoch": 0.2492, + "learning_rate": 9.972074768262572e-06, + "loss": 0.4324, + "step": 1246 + }, + { + "epoch": 0.2496, + "learning_rate": 9.986037370520855e-06, + "loss": 0.4488, + "step": 1248 + }, + { + "epoch": 0.25, + "learning_rate": 9.999999999999996e-06, + "loss": 0.4487, + "step": 1250 + }, + { + "epoch": 0.2504, + "learning_rate": 1.0013962629479139e-05, + "loss": 0.3213, + "step": 1252 + }, + { + "epoch": 0.2508, + "learning_rate": 1.0027925231737419e-05, + "loss": 0.2961, + "step": 1254 + }, + { + "epoch": 0.2512, + "learning_rate": 1.0041887779554034e-05, + "loss": 0.2068, + "step": 1256 + }, + { + "epoch": 0.2516, + "learning_rate": 1.0055850245708276e-05, + "loss": 0.3443, + "step": 1258 + }, + { + "epoch": 0.252, + "learning_rate": 1.0069812602979607e-05, + "loss": 0.5035, + "step": 1260 + }, + { + "epoch": 0.2524, + "learning_rate": 1.0083774824147717e-05, + "loss": 0.1745, + "step": 1262 + }, + { + "epoch": 0.2528, + "learning_rate": 1.0097736881992482e-05, + "loss": 0.596, + "step": 1264 + }, + { + "epoch": 0.2532, + "learning_rate": 1.011169874929423e-05, + "loss": 0.2749, + "step": 1266 + }, + { + "epoch": 0.2536, + "learning_rate": 1.0125660398833514e-05, + "loss": 0.5535, + "step": 1268 + }, + { + "epoch": 0.254, + "learning_rate": 1.013962180339144e-05, + "loss": 0.3698, + "step": 1270 + }, + { + "epoch": 0.2544, + "learning_rate": 1.0153582935749533e-05, + "loss": 0.47, + "step": 1272 + }, + { + "epoch": 0.2548, + "learning_rate": 1.01675437686898e-05, + "loss": 0.4238, + "step": 1274 + }, + { + "epoch": 0.2552, + "learning_rate": 1.0181504274994952e-05, + "loss": 1.0414, + "step": 1276 + }, + { + "epoch": 0.2556, + "learning_rate": 1.0195464427448212e-05, + "loss": 0.2627, + "step": 1278 + }, + { + "epoch": 0.256, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.5201, + "step": 1280 + }, + { + "epoch": 0.2564, + "learning_rate": 1.0223383561935738e-05, + "loss": 0.2721, + "step": 1282 + }, + { + "epoch": 0.2568, + "learning_rate": 1.0237342489540218e-05, + "loss": 0.1794, + "step": 1284 + }, + { + "epoch": 0.2572, + "learning_rate": 1.0251300954433374e-05, + "loss": 0.4827, + "step": 1286 + }, + { + "epoch": 0.2576, + "learning_rate": 1.0265258929402458e-05, + "loss": 0.2802, + "step": 1288 + }, + { + "epoch": 0.258, + "learning_rate": 1.0279216387235686e-05, + "loss": 0.169, + "step": 1290 + }, + { + "epoch": 0.2584, + "learning_rate": 1.029317330072228e-05, + "loss": 0.3722, + "step": 1292 + }, + { + "epoch": 0.2588, + "learning_rate": 1.0307129642652523e-05, + "loss": 0.4635, + "step": 1294 + }, + { + "epoch": 0.2592, + "learning_rate": 1.0321085385817811e-05, + "loss": 0.6744, + "step": 1296 + }, + { + "epoch": 0.2596, + "learning_rate": 1.033504050301071e-05, + "loss": 0.3408, + "step": 1298 + }, + { + "epoch": 0.26, + "learning_rate": 1.0348994967025004e-05, + "loss": 0.6595, + "step": 1300 + }, + { + "epoch": 0.2604, + "learning_rate": 1.0362948750655768e-05, + "loss": 0.4341, + "step": 1302 + }, + { + "epoch": 0.2608, + "learning_rate": 1.0376901826699337e-05, + "loss": 0.4674, + "step": 1304 + }, + { + "epoch": 0.2612, + "learning_rate": 1.0390854167953526e-05, + "loss": 0.7926, + "step": 1306 + }, + { + "epoch": 0.2616, + "learning_rate": 1.0404805747217532e-05, + "loss": 0.8566, + "step": 1308 + }, + { + "epoch": 0.262, + "learning_rate": 1.0418756537291984e-05, + "loss": 0.6801, + "step": 1310 + }, + { + "epoch": 0.2624, + "learning_rate": 1.0432706510979175e-05, + "loss": 0.2952, + "step": 1312 + }, + { + "epoch": 0.2628, + "learning_rate": 1.0446655641082846e-05, + "loss": 0.4488, + "step": 1314 + }, + { + "epoch": 0.2632, + "learning_rate": 1.0460603900408526e-05, + "loss": 0.2719, + "step": 1316 + }, + { + "epoch": 0.2636, + "learning_rate": 1.0474551261763312e-05, + "loss": 0.186, + "step": 1318 + }, + { + "epoch": 0.264, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.4073, + "step": 1320 + }, + { + "epoch": 0.2644, + "learning_rate": 1.0502443181797696e-05, + "loss": 0.3498, + "step": 1322 + }, + { + "epoch": 0.2648, + "learning_rate": 1.0516387686100549e-05, + "loss": 0.7824, + "step": 1324 + }, + { + "epoch": 0.2652, + "learning_rate": 1.0530331183679216e-05, + "loss": 0.3453, + "step": 1326 + }, + { + "epoch": 0.2656, + "learning_rate": 1.054427364735009e-05, + "loss": 0.5599, + "step": 1328 + }, + { + "epoch": 0.266, + "learning_rate": 1.0558215049931634e-05, + "loss": 0.3058, + "step": 1330 + }, + { + "epoch": 0.2664, + "learning_rate": 1.0572155364244378e-05, + "loss": 1.0564, + "step": 1332 + }, + { + "epoch": 0.2668, + "learning_rate": 1.058609456311096e-05, + "loss": 0.2901, + "step": 1334 + }, + { + "epoch": 0.2672, + "learning_rate": 1.0600032619356203e-05, + "loss": 2.8422, + "step": 1336 + }, + { + "epoch": 0.2676, + "learning_rate": 1.0613969505807167e-05, + "loss": 0.272, + "step": 1338 + }, + { + "epoch": 0.268, + "learning_rate": 1.0627905195293127e-05, + "loss": 0.192, + "step": 1340 + }, + { + "epoch": 0.2684, + "learning_rate": 1.0641839660645795e-05, + "loss": 0.4333, + "step": 1342 + }, + { + "epoch": 0.2688, + "learning_rate": 1.0655772874699206e-05, + "loss": 0.2916, + "step": 1344 + }, + { + "epoch": 0.2692, + "learning_rate": 1.066970481028984e-05, + "loss": 0.3684, + "step": 1346 + }, + { + "epoch": 0.2696, + "learning_rate": 1.0683635440256694e-05, + "loss": 0.2411, + "step": 1348 + }, + { + "epoch": 0.27, + "learning_rate": 1.0697564737441242e-05, + "loss": 0.255, + "step": 1350 + }, + { + "epoch": 0.2704, + "learning_rate": 1.0711492674687674e-05, + "loss": 2.549, + "step": 1352 + }, + { + "epoch": 0.2708, + "learning_rate": 1.0725419224842695e-05, + "loss": 0.2908, + "step": 1354 + }, + { + "epoch": 0.2712, + "learning_rate": 1.0739344360755855e-05, + "loss": 0.5812, + "step": 1356 + }, + { + "epoch": 0.2716, + "learning_rate": 1.0753268055279332e-05, + "loss": 0.1264, + "step": 1358 + }, + { + "epoch": 0.272, + "learning_rate": 1.0767190281268171e-05, + "loss": 0.4746, + "step": 1360 + }, + { + "epoch": 0.2724, + "learning_rate": 1.0781111011580336e-05, + "loss": 0.2499, + "step": 1362 + }, + { + "epoch": 0.2728, + "learning_rate": 1.07950302190766e-05, + "loss": 0.5753, + "step": 1364 + }, + { + "epoch": 0.2732, + "learning_rate": 1.0808947876620766e-05, + "loss": 0.2242, + "step": 1366 + }, + { + "epoch": 0.2736, + "learning_rate": 1.0822863957079654e-05, + "loss": 0.5205, + "step": 1368 + }, + { + "epoch": 0.274, + "learning_rate": 1.0836778433323153e-05, + "loss": 0.5005, + "step": 1370 + }, + { + "epoch": 0.2744, + "learning_rate": 1.0850691278224277e-05, + "loss": 0.2949, + "step": 1372 + }, + { + "epoch": 0.2748, + "learning_rate": 1.0864602464659227e-05, + "loss": 0.1662, + "step": 1374 + }, + { + "epoch": 0.2752, + "learning_rate": 1.0878511965507428e-05, + "loss": 0.3944, + "step": 1376 + }, + { + "epoch": 0.2756, + "learning_rate": 1.0892419753651598e-05, + "loss": 0.4994, + "step": 1378 + }, + { + "epoch": 0.276, + "learning_rate": 1.0906325801977795e-05, + "loss": 0.6333, + "step": 1380 + }, + { + "epoch": 0.2764, + "learning_rate": 1.0920230083375465e-05, + "loss": 0.5979, + "step": 1382 + }, + { + "epoch": 0.2768, + "learning_rate": 1.0934132570737497e-05, + "loss": 0.6516, + "step": 1384 + }, + { + "epoch": 0.2772, + "learning_rate": 1.0948033236960285e-05, + "loss": 0.5031, + "step": 1386 + }, + { + "epoch": 0.2776, + "learning_rate": 1.0961932054943785e-05, + "loss": 0.5321, + "step": 1388 + }, + { + "epoch": 0.278, + "learning_rate": 1.0975828997591484e-05, + "loss": 0.5573, + "step": 1390 + }, + { + "epoch": 0.2784, + "learning_rate": 1.098972403781064e-05, + "loss": 0.5062, + "step": 1392 + }, + { + "epoch": 0.2788, + "learning_rate": 1.1003617148512154e-05, + "loss": 0.1223, + "step": 1394 + }, + { + "epoch": 0.2792, + "learning_rate": 1.101750830261065e-05, + "loss": 0.158, + "step": 1396 + }, + { + "epoch": 0.2796, + "learning_rate": 1.1031397473024676e-05, + "loss": 0.2081, + "step": 1398 + }, + { + "epoch": 0.28, + "learning_rate": 1.104528463267652e-05, + "loss": 0.3427, + "step": 1400 + }, + { + "epoch": 0.2804, + "learning_rate": 1.1059169754492518e-05, + "loss": 0.3411, + "step": 1402 + }, + { + "epoch": 0.2808, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.787, + "step": 1404 + }, + { + "epoch": 0.2812, + "learning_rate": 1.108693377634185e-05, + "loss": 0.8588, + "step": 1406 + }, + { + "epoch": 0.2816, + "learning_rate": 1.1100812622247821e-05, + "loss": 0.4741, + "step": 1408 + }, + { + "epoch": 0.282, + "learning_rate": 1.1114689322063252e-05, + "loss": 0.3054, + "step": 1410 + }, + { + "epoch": 0.2824, + "learning_rate": 1.1128563848734815e-05, + "loss": 0.2285, + "step": 1412 + }, + { + "epoch": 0.2828, + "learning_rate": 1.1142436175213404e-05, + "loss": 0.4861, + "step": 1414 + }, + { + "epoch": 0.2832, + "learning_rate": 1.1156306274454211e-05, + "loss": 0.4535, + "step": 1416 + }, + { + "epoch": 0.2836, + "learning_rate": 1.117017411941677e-05, + "loss": 0.3787, + "step": 1418 + }, + { + "epoch": 0.284, + "learning_rate": 1.1184039683065002e-05, + "loss": 0.3924, + "step": 1420 + }, + { + "epoch": 0.2844, + "learning_rate": 1.1197902938367289e-05, + "loss": 0.4588, + "step": 1422 + }, + { + "epoch": 0.2848, + "learning_rate": 1.1211763858296516e-05, + "loss": 0.4571, + "step": 1424 + }, + { + "epoch": 0.2852, + "learning_rate": 1.122562241583006e-05, + "loss": 0.4542, + "step": 1426 + }, + { + "epoch": 0.2856, + "learning_rate": 1.1239478583950007e-05, + "loss": 0.1352, + "step": 1428 + }, + { + "epoch": 0.286, + "learning_rate": 1.1253332335643033e-05, + "loss": 1.1907, + "step": 1430 + }, + { + "epoch": 0.2864, + "learning_rate": 1.1267183643900534e-05, + "loss": 0.617, + "step": 1432 + }, + { + "epoch": 0.2868, + "learning_rate": 1.1281032481718701e-05, + "loss": 0.7864, + "step": 1434 + }, + { + "epoch": 0.2872, + "learning_rate": 1.1294878822098456e-05, + "loss": 0.5383, + "step": 1436 + }, + { + "epoch": 0.2876, + "learning_rate": 1.1308722638045725e-05, + "loss": 0.469, + "step": 1438 + }, + { + "epoch": 0.288, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.8571, + "step": 1440 + }, + { + "epoch": 0.2884, + "learning_rate": 1.1336402588690725e-05, + "loss": 0.1857, + "step": 1442 + }, + { + "epoch": 0.2888, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.5524, + "step": 1444 + }, + { + "epoch": 0.2892, + "learning_rate": 1.1364072117799864e-05, + "loss": 0.5513, + "step": 1446 + }, + { + "epoch": 0.2896, + "learning_rate": 1.137790290684638e-05, + "loss": 0.2028, + "step": 1448 + }, + { + "epoch": 0.29, + "learning_rate": 1.1391731009600652e-05, + "loss": 0.6303, + "step": 1450 + }, + { + "epoch": 0.2904, + "learning_rate": 1.1405556399104108e-05, + "loss": 0.6124, + "step": 1452 + }, + { + "epoch": 0.2908, + "learning_rate": 1.141937904840344e-05, + "loss": 0.6582, + "step": 1454 + }, + { + "epoch": 0.2912, + "learning_rate": 1.143319893055069e-05, + "loss": 0.4317, + "step": 1456 + }, + { + "epoch": 0.2916, + "learning_rate": 1.1447016018603286e-05, + "loss": 0.3871, + "step": 1458 + }, + { + "epoch": 0.292, + "learning_rate": 1.1460830285624112e-05, + "loss": 0.5083, + "step": 1460 + }, + { + "epoch": 0.2924, + "learning_rate": 1.1474641704681541e-05, + "loss": 0.1262, + "step": 1462 + }, + { + "epoch": 0.2928, + "learning_rate": 1.1488450248849515e-05, + "loss": 0.1934, + "step": 1464 + }, + { + "epoch": 0.2932, + "learning_rate": 1.150225589120756e-05, + "loss": 0.7433, + "step": 1466 + }, + { + "epoch": 0.2936, + "learning_rate": 1.1516058604840881e-05, + "loss": 0.5863, + "step": 1468 + }, + { + "epoch": 0.294, + "learning_rate": 1.1529858362840388e-05, + "loss": 0.143, + "step": 1470 + }, + { + "epoch": 0.2944, + "learning_rate": 1.15436551383027e-05, + "loss": 0.2944, + "step": 1472 + }, + { + "epoch": 0.2948, + "learning_rate": 1.1557448904330366e-05, + "loss": 0.2327, + "step": 1474 + }, + { + "epoch": 0.2952, + "learning_rate": 1.1571239634031666e-05, + "loss": 0.3026, + "step": 1476 + }, + { + "epoch": 0.2956, + "learning_rate": 1.158502730052093e-05, + "loss": 0.1145, + "step": 1478 + }, + { + "epoch": 0.296, + "learning_rate": 1.1598811876918352e-05, + "loss": 0.4268, + "step": 1480 + }, + { + "epoch": 0.2964, + "learning_rate": 1.161259333635019e-05, + "loss": 0.2187, + "step": 1482 + }, + { + "epoch": 0.2968, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.2586, + "step": 1484 + }, + { + "epoch": 0.2972, + "learning_rate": 1.1640146796852711e-05, + "loss": 0.659, + "step": 1486 + }, + { + "epoch": 0.2976, + "learning_rate": 1.1653918744206478e-05, + "loss": 0.2556, + "step": 1488 + }, + { + "epoch": 0.298, + "learning_rate": 1.1667687467161021e-05, + "loss": 1.9373, + "step": 1490 + }, + { + "epoch": 0.2984, + "learning_rate": 1.1681452938873515e-05, + "loss": 0.7823, + "step": 1492 + }, + { + "epoch": 0.2988, + "learning_rate": 1.169521513250746e-05, + "loss": 0.3833, + "step": 1494 + }, + { + "epoch": 0.2992, + "learning_rate": 1.1708974021232763e-05, + "loss": 0.7123, + "step": 1496 + }, + { + "epoch": 0.2996, + "learning_rate": 1.1722729578225762e-05, + "loss": 0.1582, + "step": 1498 + }, + { + "epoch": 0.3, + "learning_rate": 1.1736481776669297e-05, + "loss": 0.2081, + "step": 1500 + }, + { + "epoch": 0.3004, + "learning_rate": 1.1750230589752753e-05, + "loss": 0.2514, + "step": 1502 + }, + { + "epoch": 0.3008, + "learning_rate": 1.1763975990672116e-05, + "loss": 0.459, + "step": 1504 + }, + { + "epoch": 0.3012, + "learning_rate": 1.1777717952630023e-05, + "loss": 0.3441, + "step": 1506 + }, + { + "epoch": 0.3016, + "learning_rate": 1.1791456448835815e-05, + "loss": 0.1616, + "step": 1508 + }, + { + "epoch": 0.302, + "learning_rate": 1.180519145250561e-05, + "loss": 0.2462, + "step": 1510 + }, + { + "epoch": 0.3024, + "learning_rate": 1.1818922936862258e-05, + "loss": 0.4865, + "step": 1512 + }, + { + "epoch": 0.3028, + "learning_rate": 1.1832650875135606e-05, + "loss": 0.3133, + "step": 1514 + }, + { + "epoch": 0.3032, + "learning_rate": 1.1846375240562274e-05, + "loss": 0.2671, + "step": 1516 + }, + { + "epoch": 0.3036, + "learning_rate": 1.1860096006385918e-05, + "loss": 0.3608, + "step": 1518 + }, + { + "epoch": 0.304, + "learning_rate": 1.187381314585725e-05, + "loss": 0.3268, + "step": 1520 + }, + { + "epoch": 0.3044, + "learning_rate": 1.1887526632233937e-05, + "loss": 0.2715, + "step": 1522 + }, + { + "epoch": 0.3048, + "learning_rate": 1.1901236438780906e-05, + "loss": 0.2828, + "step": 1524 + }, + { + "epoch": 0.3052, + "learning_rate": 1.191494253877013e-05, + "loss": 0.2979, + "step": 1526 + }, + { + "epoch": 0.3056, + "learning_rate": 1.192864490548089e-05, + "loss": 0.8562, + "step": 1528 + }, + { + "epoch": 0.306, + "learning_rate": 1.1942343512199719e-05, + "loss": 0.3044, + "step": 1530 + }, + { + "epoch": 0.3064, + "learning_rate": 1.195603833222048e-05, + "loss": 0.9839, + "step": 1532 + }, + { + "epoch": 0.3068, + "learning_rate": 1.1969729338844422e-05, + "loss": 0.4034, + "step": 1534 + }, + { + "epoch": 0.3072, + "learning_rate": 1.198341650538023e-05, + "loss": 0.8108, + "step": 1536 + }, + { + "epoch": 0.3076, + "learning_rate": 1.1997099805144066e-05, + "loss": 0.1834, + "step": 1538 + }, + { + "epoch": 0.308, + "learning_rate": 1.2010779211459642e-05, + "loss": 0.3162, + "step": 1540 + }, + { + "epoch": 0.3084, + "learning_rate": 1.2024454697658254e-05, + "loss": 0.4109, + "step": 1542 + }, + { + "epoch": 0.3088, + "learning_rate": 1.203812623707884e-05, + "loss": 0.4498, + "step": 1544 + }, + { + "epoch": 0.3092, + "learning_rate": 1.2051793803068054e-05, + "loss": 0.1773, + "step": 1546 + }, + { + "epoch": 0.3096, + "learning_rate": 1.2065457368980227e-05, + "loss": 0.3029, + "step": 1548 + }, + { + "epoch": 0.31, + "learning_rate": 1.20791169081776e-05, + "loss": 0.5723, + "step": 1550 + }, + { + "epoch": 0.3104, + "learning_rate": 1.2092772394030141e-05, + "loss": 0.3939, + "step": 1552 + }, + { + "epoch": 0.3108, + "learning_rate": 1.210642379991583e-05, + "loss": 0.6722, + "step": 1554 + }, + { + "epoch": 0.3112, + "learning_rate": 1.2120071099220552e-05, + "loss": 0.6192, + "step": 1556 + }, + { + "epoch": 0.3116, + "learning_rate": 1.2133714265338148e-05, + "loss": 0.6173, + "step": 1558 + }, + { + "epoch": 0.312, + "learning_rate": 1.2147353271670637e-05, + "loss": 0.4244, + "step": 1560 + }, + { + "epoch": 0.3124, + "learning_rate": 1.2160988091628006e-05, + "loss": 0.2055, + "step": 1562 + }, + { + "epoch": 0.3128, + "learning_rate": 1.217461869862855e-05, + "loss": 0.4073, + "step": 1564 + }, + { + "epoch": 0.3132, + "learning_rate": 1.2188245066098647e-05, + "loss": 0.4476, + "step": 1566 + }, + { + "epoch": 0.3136, + "learning_rate": 1.2201867167473015e-05, + "loss": 0.6565, + "step": 1568 + }, + { + "epoch": 0.314, + "learning_rate": 1.2215484976194673e-05, + "loss": 0.628, + "step": 1570 + }, + { + "epoch": 0.3144, + "learning_rate": 1.2229098465715002e-05, + "loss": 0.3125, + "step": 1572 + }, + { + "epoch": 0.3148, + "learning_rate": 1.2242707609493809e-05, + "loss": 1.4024, + "step": 1574 + }, + { + "epoch": 0.3152, + "learning_rate": 1.2256312380999373e-05, + "loss": 0.6397, + "step": 1576 + }, + { + "epoch": 0.3156, + "learning_rate": 1.2269912753708496e-05, + "loss": 0.4433, + "step": 1578 + }, + { + "epoch": 0.316, + "learning_rate": 1.2283508701106552e-05, + "loss": 0.3786, + "step": 1580 + }, + { + "epoch": 0.3164, + "learning_rate": 1.229710019668755e-05, + "loss": 0.7145, + "step": 1582 + }, + { + "epoch": 0.3168, + "learning_rate": 1.2310687213954173e-05, + "loss": 0.3936, + "step": 1584 + }, + { + "epoch": 0.3172, + "learning_rate": 1.232426972641785e-05, + "loss": 0.324, + "step": 1586 + }, + { + "epoch": 0.3176, + "learning_rate": 1.233784770759873e-05, + "loss": 0.4066, + "step": 1588 + }, + { + "epoch": 0.318, + "learning_rate": 1.2351421131025891e-05, + "loss": 0.095, + "step": 1590 + }, + { + "epoch": 0.3184, + "learning_rate": 1.2364989970237238e-05, + "loss": 0.3485, + "step": 1592 + }, + { + "epoch": 0.3188, + "learning_rate": 1.237855419877962e-05, + "loss": 0.4817, + "step": 1594 + }, + { + "epoch": 0.3192, + "learning_rate": 1.23921137902089e-05, + "loss": 0.2275, + "step": 1596 + }, + { + "epoch": 0.3196, + "learning_rate": 1.2405668718089906e-05, + "loss": 0.2994, + "step": 1598 + }, + { + "epoch": 0.32, + "learning_rate": 1.241921895599668e-05, + "loss": 0.3392, + "step": 1600 + }, + { + "epoch": 0.3204, + "learning_rate": 1.2432764477512295e-05, + "loss": 0.2856, + "step": 1602 + }, + { + "epoch": 0.3208, + "learning_rate": 1.2446305256229076e-05, + "loss": 0.1886, + "step": 1604 + }, + { + "epoch": 0.3212, + "learning_rate": 1.2459841265748582e-05, + "loss": 0.326, + "step": 1606 + }, + { + "epoch": 0.3216, + "learning_rate": 1.2473372479681653e-05, + "loss": 0.3892, + "step": 1608 + }, + { + "epoch": 0.322, + "learning_rate": 1.2486898871648547e-05, + "loss": 0.2951, + "step": 1610 + }, + { + "epoch": 0.3224, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.2612, + "step": 1612 + }, + { + "epoch": 0.3228, + "learning_rate": 1.2513937084211546e-05, + "loss": 0.6412, + "step": 1614 + }, + { + "epoch": 0.3232, + "learning_rate": 1.2527448852095292e-05, + "loss": 0.6221, + "step": 1616 + }, + { + "epoch": 0.3236, + "learning_rate": 1.2540955692588167e-05, + "loss": 0.4591, + "step": 1618 + }, + { + "epoch": 0.324, + "learning_rate": 1.2554457579357902e-05, + "loss": 0.3751, + "step": 1620 + }, + { + "epoch": 0.3244, + "learning_rate": 1.2567954486081873e-05, + "loss": 0.3578, + "step": 1622 + }, + { + "epoch": 0.3248, + "learning_rate": 1.2581446386447171e-05, + "loss": 0.4868, + "step": 1624 + }, + { + "epoch": 0.3252, + "learning_rate": 1.2594933254150647e-05, + "loss": 0.1881, + "step": 1626 + }, + { + "epoch": 0.3256, + "learning_rate": 1.2608415062898963e-05, + "loss": 0.1632, + "step": 1628 + }, + { + "epoch": 0.326, + "learning_rate": 1.262189178640864e-05, + "loss": 0.1633, + "step": 1630 + }, + { + "epoch": 0.3264, + "learning_rate": 1.2635363398406133e-05, + "loss": 0.3786, + "step": 1632 + }, + { + "epoch": 0.3268, + "learning_rate": 1.2648829872627797e-05, + "loss": 0.2799, + "step": 1634 + }, + { + "epoch": 0.3272, + "learning_rate": 1.266229118282012e-05, + "loss": 0.2151, + "step": 1636 + }, + { + "epoch": 0.3276, + "learning_rate": 1.2675747302739516e-05, + "loss": 0.2396, + "step": 1638 + }, + { + "epoch": 0.328, + "learning_rate": 1.2689198206152644e-05, + "loss": 0.3094, + "step": 1640 + }, + { + "epoch": 0.3284, + "learning_rate": 1.2702643866836281e-05, + "loss": 0.2299, + "step": 1642 + }, + { + "epoch": 0.3288, + "learning_rate": 1.2716084258577373e-05, + "loss": 1.506, + "step": 1644 + }, + { + "epoch": 0.3292, + "learning_rate": 1.2729519355173254e-05, + "loss": 0.5759, + "step": 1646 + }, + { + "epoch": 0.3296, + "learning_rate": 1.2742949130431468e-05, + "loss": 0.3715, + "step": 1648 + }, + { + "epoch": 0.33, + "learning_rate": 1.2756373558169992e-05, + "loss": 0.2455, + "step": 1650 + }, + { + "epoch": 0.3304, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.3149, + "step": 1652 + }, + { + "epoch": 0.3308, + "learning_rate": 1.2783206266412028e-05, + "loss": 0.7631, + "step": 1654 + }, + { + "epoch": 0.3312, + "learning_rate": 1.2796614494603795e-05, + "loss": 0.4574, + "step": 1656 + }, + { + "epoch": 0.3316, + "learning_rate": 1.2810017270652508e-05, + "loss": 0.0546, + "step": 1658 + }, + { + "epoch": 0.332, + "learning_rate": 1.282341456842876e-05, + "loss": 0.8595, + "step": 1660 + }, + { + "epoch": 0.3324, + "learning_rate": 1.283680636181384e-05, + "loss": 0.3372, + "step": 1662 + }, + { + "epoch": 0.3328, + "learning_rate": 1.2850192624699756e-05, + "loss": 0.3071, + "step": 1664 + }, + { + "epoch": 0.3332, + "learning_rate": 1.2863573330989308e-05, + "loss": 0.3267, + "step": 1666 + }, + { + "epoch": 0.3336, + "learning_rate": 1.2876948454596122e-05, + "loss": 0.4947, + "step": 1668 + }, + { + "epoch": 0.334, + "learning_rate": 1.2890317969444708e-05, + "loss": 0.6155, + "step": 1670 + }, + { + "epoch": 0.3344, + "learning_rate": 1.2903681849470535e-05, + "loss": 0.3785, + "step": 1672 + }, + { + "epoch": 0.3348, + "learning_rate": 1.291704006861998e-05, + "loss": 0.646, + "step": 1674 + }, + { + "epoch": 0.3352, + "learning_rate": 1.2930392600850565e-05, + "loss": 0.6758, + "step": 1676 + }, + { + "epoch": 0.3356, + "learning_rate": 1.2943739420130843e-05, + "loss": 0.5161, + "step": 1678 + }, + { + "epoch": 0.336, + "learning_rate": 1.2957080500440455e-05, + "loss": 0.211, + "step": 1680 + }, + { + "epoch": 0.3364, + "learning_rate": 1.2970415815770353e-05, + "loss": 1.1864, + "step": 1682 + }, + { + "epoch": 0.3368, + "learning_rate": 1.2983745340122589e-05, + "loss": 0.6206, + "step": 1684 + }, + { + "epoch": 0.3372, + "learning_rate": 1.299706904751064e-05, + "loss": 0.8598, + "step": 1686 + }, + { + "epoch": 0.3376, + "learning_rate": 1.3010386911959205e-05, + "loss": 0.5634, + "step": 1688 + }, + { + "epoch": 0.338, + "learning_rate": 1.3023698907504447e-05, + "loss": 0.451, + "step": 1690 + }, + { + "epoch": 0.3384, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.2313, + "step": 1692 + }, + { + "epoch": 0.3388, + "learning_rate": 1.3050305188086757e-05, + "loss": 0.6209, + "step": 1694 + }, + { + "epoch": 0.3392, + "learning_rate": 1.3063599421253556e-05, + "loss": 0.3399, + "step": 1696 + }, + { + "epoch": 0.3396, + "learning_rate": 1.3076887681776504e-05, + "loss": 0.3016, + "step": 1698 + }, + { + "epoch": 0.34, + "learning_rate": 1.309016994374947e-05, + "loss": 0.2639, + "step": 1700 + }, + { + "epoch": 0.3404, + "learning_rate": 1.310344618127801e-05, + "loss": 1.0176, + "step": 1702 + }, + { + "epoch": 0.3408, + "learning_rate": 1.3116716368479415e-05, + "loss": 0.7978, + "step": 1704 + }, + { + "epoch": 0.3412, + "learning_rate": 1.3129980479482776e-05, + "loss": 0.374, + "step": 1706 + }, + { + "epoch": 0.3416, + "learning_rate": 1.3143238488429049e-05, + "loss": 0.4477, + "step": 1708 + }, + { + "epoch": 0.342, + "learning_rate": 1.3156490369471018e-05, + "loss": 0.1268, + "step": 1710 + }, + { + "epoch": 0.3424, + "learning_rate": 1.316973609677351e-05, + "loss": 0.1999, + "step": 1712 + }, + { + "epoch": 0.3428, + "learning_rate": 1.3182975644513286e-05, + "loss": 0.9926, + "step": 1714 + }, + { + "epoch": 0.3432, + "learning_rate": 1.319620898687917e-05, + "loss": 0.1662, + "step": 1716 + }, + { + "epoch": 0.3436, + "learning_rate": 1.3209436098072102e-05, + "loss": 0.5184, + "step": 1718 + }, + { + "epoch": 0.344, + "learning_rate": 1.32226569523051e-05, + "loss": 0.3454, + "step": 1720 + }, + { + "epoch": 0.3444, + "learning_rate": 1.3235871523803501e-05, + "loss": 0.2766, + "step": 1722 + }, + { + "epoch": 0.3448, + "learning_rate": 1.324907978680475e-05, + "loss": 1.7348, + "step": 1724 + }, + { + "epoch": 0.3452, + "learning_rate": 1.3262281715558738e-05, + "loss": 0.1671, + "step": 1726 + }, + { + "epoch": 0.3456, + "learning_rate": 1.3275477284327572e-05, + "loss": 0.3433, + "step": 1728 + }, + { + "epoch": 0.346, + "learning_rate": 1.3288666467385815e-05, + "loss": 0.3995, + "step": 1730 + }, + { + "epoch": 0.3464, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.408, + "step": 1732 + }, + { + "epoch": 0.3468, + "learning_rate": 1.3315025573531193e-05, + "loss": 0.4914, + "step": 1734 + }, + { + "epoch": 0.3472, + "learning_rate": 1.3328195445229865e-05, + "loss": 0.4791, + "step": 1736 + }, + { + "epoch": 0.3476, + "learning_rate": 1.3341358828441214e-05, + "loss": 0.4036, + "step": 1738 + }, + { + "epoch": 0.348, + "learning_rate": 1.3354515697502548e-05, + "loss": 0.4437, + "step": 1740 + }, + { + "epoch": 0.3484, + "learning_rate": 1.3367666026763879e-05, + "loss": 0.1742, + "step": 1742 + }, + { + "epoch": 0.3488, + "learning_rate": 1.338080979058797e-05, + "loss": 0.6327, + "step": 1744 + }, + { + "epoch": 0.3492, + "learning_rate": 1.3393946963350378e-05, + "loss": 0.2109, + "step": 1746 + }, + { + "epoch": 0.3496, + "learning_rate": 1.340707751943951e-05, + "loss": 0.2445, + "step": 1748 + }, + { + "epoch": 0.35, + "learning_rate": 1.3420201433256682e-05, + "loss": 0.3987, + "step": 1750 + }, + { + "epoch": 0.3504, + "learning_rate": 1.3433318679216145e-05, + "loss": 0.3949, + "step": 1752 + }, + { + "epoch": 0.3508, + "learning_rate": 1.3446429231745162e-05, + "loss": 0.2094, + "step": 1754 + }, + { + "epoch": 0.3512, + "learning_rate": 1.3459533065284039e-05, + "loss": 0.1445, + "step": 1756 + }, + { + "epoch": 0.3516, + "learning_rate": 1.3472630154286197e-05, + "loss": 0.2929, + "step": 1758 + }, + { + "epoch": 0.352, + "learning_rate": 1.348572047321814e-05, + "loss": 0.3592, + "step": 1760 + }, + { + "epoch": 0.3524, + "learning_rate": 1.3498803996559692e-05, + "loss": 0.2513, + "step": 1762 + }, + { + "epoch": 0.3528, + "learning_rate": 1.3511880698803803e-05, + "loss": 0.525, + "step": 1764 + }, + { + "epoch": 0.3532, + "learning_rate": 1.3524950554456773e-05, + "loss": 0.8528, + "step": 1766 + }, + { + "epoch": 0.3536, + "learning_rate": 1.3538013538038296e-05, + "loss": 1.1819, + "step": 1768 + }, + { + "epoch": 0.354, + "learning_rate": 1.3551069624081356e-05, + "loss": 0.3931, + "step": 1770 + }, + { + "epoch": 0.3544, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.4501, + "step": 1772 + }, + { + "epoch": 0.3548, + "learning_rate": 1.3577161001751692e-05, + "loss": 0.2557, + "step": 1774 + }, + { + "epoch": 0.3552, + "learning_rate": 1.3590196242512461e-05, + "loss": 0.6927, + "step": 1776 + }, + { + "epoch": 0.3556, + "learning_rate": 1.3603224484001944e-05, + "loss": 0.7696, + "step": 1778 + }, + { + "epoch": 0.356, + "learning_rate": 1.361624570082092e-05, + "loss": 0.5271, + "step": 1780 + }, + { + "epoch": 0.3564, + "learning_rate": 1.362925986758386e-05, + "loss": 0.7454, + "step": 1782 + }, + { + "epoch": 0.3568, + "learning_rate": 1.364226695891898e-05, + "loss": 0.2002, + "step": 1784 + }, + { + "epoch": 0.3572, + "learning_rate": 1.3655266949468287e-05, + "loss": 0.3121, + "step": 1786 + }, + { + "epoch": 0.3576, + "learning_rate": 1.3668259813887637e-05, + "loss": 0.1468, + "step": 1788 + }, + { + "epoch": 0.358, + "learning_rate": 1.3681245526846773e-05, + "loss": 1.1164, + "step": 1790 + }, + { + "epoch": 0.3584, + "learning_rate": 1.3694224063029386e-05, + "loss": 0.4382, + "step": 1792 + }, + { + "epoch": 0.3588, + "learning_rate": 1.3707195397133176e-05, + "loss": 0.5762, + "step": 1794 + }, + { + "epoch": 0.3592, + "learning_rate": 1.3720159503869806e-05, + "loss": 0.3584, + "step": 1796 + }, + { + "epoch": 0.3596, + "learning_rate": 1.3733116357965156e-05, + "loss": 0.5824, + "step": 1798 + }, + { + "epoch": 0.36, + "learning_rate": 1.374606593415911e-05, + "loss": 0.7021, + "step": 1800 + }, + { + "epoch": 0.3604, + "learning_rate": 1.3759008207205855e-05, + "loss": 2.1981, + "step": 1802 + }, + { + "epoch": 0.3608, + "learning_rate": 1.377194315187377e-05, + "loss": 0.3726, + "step": 1804 + }, + { + "epoch": 0.3612, + "learning_rate": 1.3784870742945468e-05, + "loss": 0.4626, + "step": 1806 + }, + { + "epoch": 0.3616, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.3737, + "step": 1808 + }, + { + "epoch": 0.362, + "learning_rate": 1.3810703763502744e-05, + "loss": 0.314, + "step": 1810 + }, + { + "epoch": 0.3624, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.2949, + "step": 1812 + }, + { + "epoch": 0.3628, + "learning_rate": 1.3836507067426563e-05, + "loss": 0.2606, + "step": 1814 + }, + { + "epoch": 0.3632, + "learning_rate": 1.3849397512760793e-05, + "loss": 0.2114, + "step": 1816 + }, + { + "epoch": 0.3636, + "learning_rate": 1.38622804534976e-05, + "loss": 0.4446, + "step": 1818 + }, + { + "epoch": 0.364, + "learning_rate": 1.3875155864521027e-05, + "loss": 0.2529, + "step": 1820 + }, + { + "epoch": 0.3644, + "learning_rate": 1.3888023720729806e-05, + "loss": 0.7536, + "step": 1822 + }, + { + "epoch": 0.3648, + "learning_rate": 1.3900883997037393e-05, + "loss": 0.2468, + "step": 1824 + }, + { + "epoch": 0.3652, + "learning_rate": 1.391373666837202e-05, + "loss": 0.3663, + "step": 1826 + }, + { + "epoch": 0.3656, + "learning_rate": 1.3926581709676746e-05, + "loss": 0.4399, + "step": 1828 + }, + { + "epoch": 0.366, + "learning_rate": 1.3939419095909506e-05, + "loss": 0.3395, + "step": 1830 + }, + { + "epoch": 0.3664, + "learning_rate": 1.3952248802043158e-05, + "loss": 0.482, + "step": 1832 + }, + { + "epoch": 0.3668, + "learning_rate": 1.396507080306555e-05, + "loss": 0.2361, + "step": 1834 + }, + { + "epoch": 0.3672, + "learning_rate": 1.397788507397949e-05, + "loss": 0.4983, + "step": 1836 + }, + { + "epoch": 0.3676, + "learning_rate": 1.3990691589802943e-05, + "loss": 0.1828, + "step": 1838 + }, + { + "epoch": 0.368, + "learning_rate": 1.4003490325568956e-05, + "loss": 0.2123, + "step": 1840 + }, + { + "epoch": 0.3684, + "learning_rate": 1.4016281256325688e-05, + "loss": 0.1257, + "step": 1842 + }, + { + "epoch": 0.3688, + "learning_rate": 1.4029064357136632e-05, + "loss": 0.3976, + "step": 1844 + }, + { + "epoch": 0.3692, + "learning_rate": 1.4041839603080411e-05, + "loss": 0.4432, + "step": 1846 + }, + { + "epoch": 0.3696, + "learning_rate": 1.4054606969251096e-05, + "loss": 0.4912, + "step": 1848 + }, + { + "epoch": 0.37, + "learning_rate": 1.4067366430758004e-05, + "loss": 0.2336, + "step": 1850 + }, + { + "epoch": 0.3704, + "learning_rate": 1.4080117962725929e-05, + "loss": 0.5297, + "step": 1852 + }, + { + "epoch": 0.3708, + "learning_rate": 1.4092861540295107e-05, + "loss": 0.3135, + "step": 1854 + }, + { + "epoch": 0.3712, + "learning_rate": 1.4105597138621281e-05, + "loss": 0.8366, + "step": 1856 + }, + { + "epoch": 0.3716, + "learning_rate": 1.411832473287575e-05, + "loss": 0.5591, + "step": 1858 + }, + { + "epoch": 0.372, + "learning_rate": 1.4131044298245416e-05, + "loss": 0.6536, + "step": 1860 + }, + { + "epoch": 0.3724, + "learning_rate": 1.414375580993284e-05, + "loss": 0.4745, + "step": 1862 + }, + { + "epoch": 0.3728, + "learning_rate": 1.4156459243156275e-05, + "loss": 0.2517, + "step": 1864 + }, + { + "epoch": 0.3732, + "learning_rate": 1.416915457314973e-05, + "loss": 0.2352, + "step": 1866 + }, + { + "epoch": 0.3736, + "learning_rate": 1.418184177516301e-05, + "loss": 0.3512, + "step": 1868 + }, + { + "epoch": 0.374, + "learning_rate": 1.4194520824461782e-05, + "loss": 0.1351, + "step": 1870 + }, + { + "epoch": 0.3744, + "learning_rate": 1.420719169632754e-05, + "loss": 0.2865, + "step": 1872 + }, + { + "epoch": 0.3748, + "learning_rate": 1.4219854366057821e-05, + "loss": 0.5766, + "step": 1874 + }, + { + "epoch": 0.3752, + "learning_rate": 1.4232508808966085e-05, + "loss": 0.2683, + "step": 1876 + }, + { + "epoch": 0.3756, + "learning_rate": 1.424515500038185e-05, + "loss": 0.3567, + "step": 1878 + }, + { + "epoch": 0.376, + "learning_rate": 1.4257792915650735e-05, + "loss": 0.5921, + "step": 1880 + }, + { + "epoch": 0.3764, + "learning_rate": 1.4270422530134425e-05, + "loss": 0.1803, + "step": 1882 + }, + { + "epoch": 0.3768, + "learning_rate": 1.4283043819210906e-05, + "loss": 0.4073, + "step": 1884 + }, + { + "epoch": 0.3772, + "learning_rate": 1.4295656758274288e-05, + "loss": 0.2364, + "step": 1886 + }, + { + "epoch": 0.3776, + "learning_rate": 1.430826132273499e-05, + "loss": 0.3634, + "step": 1888 + }, + { + "epoch": 0.378, + "learning_rate": 1.4320857488019826e-05, + "loss": 0.3056, + "step": 1890 + }, + { + "epoch": 0.3784, + "learning_rate": 1.4333445229571857e-05, + "loss": 0.4522, + "step": 1892 + }, + { + "epoch": 0.3788, + "learning_rate": 1.4346024522850704e-05, + "loss": 0.536, + "step": 1894 + }, + { + "epoch": 0.3792, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.3937, + "step": 1896 + }, + { + "epoch": 0.3796, + "learning_rate": 1.437115766650933e-05, + "loss": 0.5523, + "step": 1898 + }, + { + "epoch": 0.38, + "learning_rate": 1.4383711467890772e-05, + "loss": 0.4545, + "step": 1900 + }, + { + "epoch": 0.3804, + "learning_rate": 1.4396256723002398e-05, + "loss": 0.5919, + "step": 1902 + }, + { + "epoch": 0.3808, + "learning_rate": 1.4408793407386584e-05, + "loss": 0.0782, + "step": 1904 + }, + { + "epoch": 0.3812, + "learning_rate": 1.4421321496602423e-05, + "loss": 0.2252, + "step": 1906 + }, + { + "epoch": 0.3816, + "learning_rate": 1.4433840966225767e-05, + "loss": 0.6248, + "step": 1908 + }, + { + "epoch": 0.382, + "learning_rate": 1.444635179184927e-05, + "loss": 0.4717, + "step": 1910 + }, + { + "epoch": 0.3824, + "learning_rate": 1.4458853949082434e-05, + "loss": 0.0647, + "step": 1912 + }, + { + "epoch": 0.3828, + "learning_rate": 1.4471347413551665e-05, + "loss": 0.4582, + "step": 1914 + }, + { + "epoch": 0.3832, + "learning_rate": 1.4483832160900332e-05, + "loss": 0.3907, + "step": 1916 + }, + { + "epoch": 0.3836, + "learning_rate": 1.4496308166788731e-05, + "loss": 0.2284, + "step": 1918 + }, + { + "epoch": 0.384, + "learning_rate": 1.4508775406894315e-05, + "loss": 0.2488, + "step": 1920 + }, + { + "epoch": 0.3844, + "learning_rate": 1.4521233856911499e-05, + "loss": 0.4022, + "step": 1922 + }, + { + "epoch": 0.3848, + "learning_rate": 1.4533683492551942e-05, + "loss": 0.7484, + "step": 1924 + }, + { + "epoch": 0.3852, + "learning_rate": 1.4546124289544446e-05, + "loss": 0.1981, + "step": 1926 + }, + { + "epoch": 0.3856, + "learning_rate": 1.4558556223634988e-05, + "loss": 0.4014, + "step": 1928 + }, + { + "epoch": 0.386, + "learning_rate": 1.4570979270586944e-05, + "loss": 0.5458, + "step": 1930 + }, + { + "epoch": 0.3864, + "learning_rate": 1.4583393406180886e-05, + "loss": 0.2788, + "step": 1932 + }, + { + "epoch": 0.3868, + "learning_rate": 1.4595798606214882e-05, + "loss": 0.1315, + "step": 1934 + }, + { + "epoch": 0.3872, + "learning_rate": 1.460819484650431e-05, + "loss": 0.3745, + "step": 1936 + }, + { + "epoch": 0.3876, + "learning_rate": 1.4620582102882086e-05, + "loss": 0.506, + "step": 1938 + }, + { + "epoch": 0.388, + "learning_rate": 1.4632960351198618e-05, + "loss": 0.6673, + "step": 1940 + }, + { + "epoch": 0.3884, + "learning_rate": 1.4645329567321875e-05, + "loss": 0.2244, + "step": 1942 + }, + { + "epoch": 0.3888, + "learning_rate": 1.4657689727137441e-05, + "loss": 0.4186, + "step": 1944 + }, + { + "epoch": 0.3892, + "learning_rate": 1.4670040806548551e-05, + "loss": 0.2661, + "step": 1946 + }, + { + "epoch": 0.3896, + "learning_rate": 1.468238278147614e-05, + "loss": 0.2438, + "step": 1948 + }, + { + "epoch": 0.39, + "learning_rate": 1.4694715627858904e-05, + "loss": 0.7098, + "step": 1950 + }, + { + "epoch": 0.3904, + "learning_rate": 1.470703932165332e-05, + "loss": 0.652, + "step": 1952 + }, + { + "epoch": 0.3908, + "learning_rate": 1.471935383883372e-05, + "loss": 0.4836, + "step": 1954 + }, + { + "epoch": 0.3912, + "learning_rate": 1.4731659155392339e-05, + "loss": 0.1309, + "step": 1956 + }, + { + "epoch": 0.3916, + "learning_rate": 1.4743955247339286e-05, + "loss": 0.0943, + "step": 1958 + }, + { + "epoch": 0.392, + "learning_rate": 1.4756242090702744e-05, + "loss": 0.5174, + "step": 1960 + }, + { + "epoch": 0.3924, + "learning_rate": 1.476851966152887e-05, + "loss": 0.1902, + "step": 1962 + }, + { + "epoch": 0.3928, + "learning_rate": 1.4780787935881913e-05, + "loss": 0.4181, + "step": 1964 + }, + { + "epoch": 0.3932, + "learning_rate": 1.4793046889844255e-05, + "loss": 1.2857, + "step": 1966 + }, + { + "epoch": 0.3936, + "learning_rate": 1.4805296499516397e-05, + "loss": 0.4257, + "step": 1968 + }, + { + "epoch": 0.394, + "learning_rate": 1.4817536741017155e-05, + "loss": 0.5827, + "step": 1970 + }, + { + "epoch": 0.3944, + "learning_rate": 1.482976759048351e-05, + "loss": 0.4568, + "step": 1972 + }, + { + "epoch": 0.3948, + "learning_rate": 1.4841989024070809e-05, + "loss": 0.4093, + "step": 1974 + }, + { + "epoch": 0.3952, + "learning_rate": 1.485420101795274e-05, + "loss": 0.3353, + "step": 1976 + }, + { + "epoch": 0.3956, + "learning_rate": 1.4866403548321385e-05, + "loss": 0.3903, + "step": 1978 + }, + { + "epoch": 0.396, + "learning_rate": 1.4878596591387327e-05, + "loss": 0.2244, + "step": 1980 + }, + { + "epoch": 0.3964, + "learning_rate": 1.4890780123379563e-05, + "loss": 0.7337, + "step": 1982 + }, + { + "epoch": 0.3968, + "learning_rate": 1.4902954120545686e-05, + "loss": 0.4356, + "step": 1984 + }, + { + "epoch": 0.3972, + "learning_rate": 1.491511855915187e-05, + "loss": 0.4108, + "step": 1986 + }, + { + "epoch": 0.3976, + "learning_rate": 1.4927273415482913e-05, + "loss": 0.2177, + "step": 1988 + }, + { + "epoch": 0.398, + "learning_rate": 1.4939418665842307e-05, + "loss": 0.163, + "step": 1990 + }, + { + "epoch": 0.3984, + "learning_rate": 1.4951554286552261e-05, + "loss": 0.2953, + "step": 1992 + }, + { + "epoch": 0.3988, + "learning_rate": 1.4963680253953763e-05, + "loss": 0.716, + "step": 1994 + }, + { + "epoch": 0.3992, + "learning_rate": 1.4975796544406617e-05, + "loss": 0.6106, + "step": 1996 + }, + { + "epoch": 0.3996, + "learning_rate": 1.49879031342895e-05, + "loss": 0.1675, + "step": 1998 + }, + { + "epoch": 0.4, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.5611, + "step": 2000 + }, + { + "epoch": 0.4004, + "learning_rate": 1.501208711795465e-05, + "loss": 0.1502, + "step": 2002 + }, + { + "epoch": 0.4008, + "learning_rate": 1.502416446458897e-05, + "loss": 0.3786, + "step": 2004 + }, + { + "epoch": 0.4012, + "learning_rate": 1.5036232016357613e-05, + "loss": 0.4304, + "step": 2006 + }, + { + "epoch": 0.4016, + "learning_rate": 1.5048289749734206e-05, + "loss": 0.471, + "step": 2008 + }, + { + "epoch": 0.402, + "learning_rate": 1.5060337641211642e-05, + "loss": 0.6029, + "step": 2010 + }, + { + "epoch": 0.4024, + "learning_rate": 1.5072375667301895e-05, + "loss": 0.2245, + "step": 2012 + }, + { + "epoch": 0.4028, + "learning_rate": 1.5084403804536214e-05, + "loss": 0.2382, + "step": 2014 + }, + { + "epoch": 0.4032, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.5211, + "step": 2016 + }, + { + "epoch": 0.4036, + "learning_rate": 1.5108430318658597e-05, + "loss": 0.1845, + "step": 2018 + }, + { + "epoch": 0.404, + "learning_rate": 1.5120428648705714e-05, + "loss": 0.1657, + "step": 2020 + }, + { + "epoch": 0.4044, + "learning_rate": 1.513241699621517e-05, + "loss": 0.2888, + "step": 2022 + }, + { + "epoch": 0.4048, + "learning_rate": 1.5144395337815064e-05, + "loss": 0.3165, + "step": 2024 + }, + { + "epoch": 0.4052, + "learning_rate": 1.5156363650153008e-05, + "loss": 0.3731, + "step": 2026 + }, + { + "epoch": 0.4056, + "learning_rate": 1.5168321909896166e-05, + "loss": 0.383, + "step": 2028 + }, + { + "epoch": 0.406, + "learning_rate": 1.51802700937313e-05, + "loss": 0.6397, + "step": 2030 + }, + { + "epoch": 0.4064, + "learning_rate": 1.5192208178364808e-05, + "loss": 0.2375, + "step": 2032 + }, + { + "epoch": 0.4068, + "learning_rate": 1.5204136140522792e-05, + "loss": 0.5406, + "step": 2034 + }, + { + "epoch": 0.4072, + "learning_rate": 1.521605395695107e-05, + "loss": 0.1518, + "step": 2036 + }, + { + "epoch": 0.4076, + "learning_rate": 1.522796160441526e-05, + "loss": 1.0324, + "step": 2038 + }, + { + "epoch": 0.408, + "learning_rate": 1.5239859059700784e-05, + "loss": 0.6884, + "step": 2040 + }, + { + "epoch": 0.4084, + "learning_rate": 1.5251746299612964e-05, + "loss": 0.337, + "step": 2042 + }, + { + "epoch": 0.4088, + "learning_rate": 1.526362330097697e-05, + "loss": 0.404, + "step": 2044 + }, + { + "epoch": 0.4092, + "learning_rate": 1.5275490040638044e-05, + "loss": 0.396, + "step": 2046 + }, + { + "epoch": 0.4096, + "learning_rate": 1.5287346495461322e-05, + "loss": 0.2327, + "step": 2048 + }, + { + "epoch": 0.41, + "learning_rate": 1.529919264233204e-05, + "loss": 0.4314, + "step": 2050 + }, + { + "epoch": 0.4104, + "learning_rate": 1.531102845815557e-05, + "loss": 0.5474, + "step": 2052 + }, + { + "epoch": 0.4108, + "learning_rate": 1.5322853919857327e-05, + "loss": 0.7187, + "step": 2054 + }, + { + "epoch": 0.4112, + "learning_rate": 1.5334669004383025e-05, + "loss": 0.5364, + "step": 2056 + }, + { + "epoch": 0.4116, + "learning_rate": 1.5346473688698514e-05, + "loss": 0.0958, + "step": 2058 + }, + { + "epoch": 0.412, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.3095, + "step": 2060 + }, + { + "epoch": 0.4124, + "learning_rate": 1.537005176466387e-05, + "loss": 0.3301, + "step": 2062 + }, + { + "epoch": 0.4128, + "learning_rate": 1.5381825110347072e-05, + "loss": 0.4175, + "step": 2064 + }, + { + "epoch": 0.4132, + "learning_rate": 1.539358796388683e-05, + "loss": 0.4154, + "step": 2066 + }, + { + "epoch": 0.4136, + "learning_rate": 1.540534030235087e-05, + "loss": 0.5686, + "step": 2068 + }, + { + "epoch": 0.414, + "learning_rate": 1.5417082102827397e-05, + "loss": 0.5586, + "step": 2070 + }, + { + "epoch": 0.4144, + "learning_rate": 1.542881334242517e-05, + "loss": 0.6943, + "step": 2072 + }, + { + "epoch": 0.4148, + "learning_rate": 1.5440533998273542e-05, + "loss": 0.5523, + "step": 2074 + }, + { + "epoch": 0.4152, + "learning_rate": 1.5452244047522493e-05, + "loss": 0.6599, + "step": 2076 + }, + { + "epoch": 0.4156, + "learning_rate": 1.54639434673427e-05, + "loss": 0.4307, + "step": 2078 + }, + { + "epoch": 0.416, + "learning_rate": 1.5475632234925495e-05, + "loss": 0.3563, + "step": 2080 + }, + { + "epoch": 0.4164, + "learning_rate": 1.548731032748309e-05, + "loss": 0.2965, + "step": 2082 + }, + { + "epoch": 0.4168, + "learning_rate": 1.5498977722248388e-05, + "loss": 0.2128, + "step": 2084 + }, + { + "epoch": 0.4172, + "learning_rate": 1.551063439647525e-05, + "loss": 0.5431, + "step": 2086 + }, + { + "epoch": 0.4176, + "learning_rate": 1.552228032743839e-05, + "loss": 0.3336, + "step": 2088 + }, + { + "epoch": 0.418, + "learning_rate": 1.553391549243343e-05, + "loss": 0.3082, + "step": 2090 + }, + { + "epoch": 0.4184, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.6196, + "step": 2092 + }, + { + "epoch": 0.4188, + "learning_rate": 1.5557153433806954e-05, + "loss": 0.7747, + "step": 2094 + }, + { + "epoch": 0.4192, + "learning_rate": 1.556875616488188e-05, + "loss": 0.1838, + "step": 2096 + }, + { + "epoch": 0.4196, + "learning_rate": 1.55803480393817e-05, + "loss": 0.9596, + "step": 2098 + }, + { + "epoch": 0.42, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.0986, + "step": 2100 + }, + { + "epoch": 0.4204, + "learning_rate": 1.5603499128281447e-05, + "loss": 0.4402, + "step": 2102 + }, + { + "epoch": 0.4208, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.056, + "step": 2104 + }, + { + "epoch": 0.4212, + "learning_rate": 1.5626606519969366e-05, + "loss": 0.1684, + "step": 2106 + }, + { + "epoch": 0.4216, + "learning_rate": 1.5638143773034268e-05, + "loss": 0.2706, + "step": 2108 + }, + { + "epoch": 0.422, + "learning_rate": 1.5649670034249376e-05, + "loss": 0.4148, + "step": 2110 + }, + { + "epoch": 0.4224, + "learning_rate": 1.5661185281143663e-05, + "loss": 0.4329, + "step": 2112 + }, + { + "epoch": 0.4228, + "learning_rate": 1.5672689491267562e-05, + "loss": 0.2438, + "step": 2114 + }, + { + "epoch": 0.4232, + "learning_rate": 1.5684182642193024e-05, + "loss": 0.0583, + "step": 2116 + }, + { + "epoch": 0.4236, + "learning_rate": 1.5695664711513582e-05, + "loss": 0.9311, + "step": 2118 + }, + { + "epoch": 0.424, + "learning_rate": 1.5707135676844312e-05, + "loss": 0.2396, + "step": 2120 + }, + { + "epoch": 0.4244, + "learning_rate": 1.5718595515822016e-05, + "loss": 0.5753, + "step": 2122 + }, + { + "epoch": 0.4248, + "learning_rate": 1.5730044206105146e-05, + "loss": 0.4207, + "step": 2124 + }, + { + "epoch": 0.4252, + "learning_rate": 1.574148172537389e-05, + "loss": 0.5289, + "step": 2126 + }, + { + "epoch": 0.4256, + "learning_rate": 1.5752908051330232e-05, + "loss": 0.9355, + "step": 2128 + }, + { + "epoch": 0.426, + "learning_rate": 1.5764323161697923e-05, + "loss": 0.4498, + "step": 2130 + }, + { + "epoch": 0.4264, + "learning_rate": 1.577572703422268e-05, + "loss": 0.3783, + "step": 2132 + }, + { + "epoch": 0.4268, + "learning_rate": 1.5787119646672025e-05, + "loss": 0.4763, + "step": 2134 + }, + { + "epoch": 0.4272, + "learning_rate": 1.579850097683548e-05, + "loss": 0.0663, + "step": 2136 + }, + { + "epoch": 0.4276, + "learning_rate": 1.58098710025246e-05, + "loss": 0.3966, + "step": 2138 + }, + { + "epoch": 0.428, + "learning_rate": 1.582122970157288e-05, + "loss": 0.6732, + "step": 2140 + }, + { + "epoch": 0.4284, + "learning_rate": 1.5832577051836016e-05, + "loss": 0.2451, + "step": 2142 + }, + { + "epoch": 0.4288, + "learning_rate": 1.5843913031191722e-05, + "loss": 0.3867, + "step": 2144 + }, + { + "epoch": 0.4292, + "learning_rate": 1.585523761753994e-05, + "loss": 0.8255, + "step": 2146 + }, + { + "epoch": 0.4296, + "learning_rate": 1.586655078880281e-05, + "loss": 0.2558, + "step": 2148 + }, + { + "epoch": 0.43, + "learning_rate": 1.587785252292473e-05, + "loss": 0.1291, + "step": 2150 + }, + { + "epoch": 0.4304, + "learning_rate": 1.5889142797872383e-05, + "loss": 0.314, + "step": 2152 + }, + { + "epoch": 0.4308, + "learning_rate": 1.5900421591634806e-05, + "loss": 0.7029, + "step": 2154 + }, + { + "epoch": 0.4312, + "learning_rate": 1.5911688882223415e-05, + "loss": 0.7267, + "step": 2156 + }, + { + "epoch": 0.4316, + "learning_rate": 1.5922944647672044e-05, + "loss": 0.4953, + "step": 2158 + }, + { + "epoch": 0.432, + "learning_rate": 1.5934188866037007e-05, + "loss": 0.1341, + "step": 2160 + }, + { + "epoch": 0.4324, + "learning_rate": 1.5945421515397125e-05, + "loss": 0.1748, + "step": 2162 + }, + { + "epoch": 0.4328, + "learning_rate": 1.5956642573853787e-05, + "loss": 0.1322, + "step": 2164 + }, + { + "epoch": 0.4332, + "learning_rate": 1.5967852019530918e-05, + "loss": 0.3274, + "step": 2166 + }, + { + "epoch": 0.4336, + "learning_rate": 1.5979049830575193e-05, + "loss": 0.6823, + "step": 2168 + }, + { + "epoch": 0.434, + "learning_rate": 1.599023598515585e-05, + "loss": 0.1735, + "step": 2170 + }, + { + "epoch": 0.4344, + "learning_rate": 1.6001410461464945e-05, + "loss": 0.326, + "step": 2172 + }, + { + "epoch": 0.4348, + "learning_rate": 1.601257323771727e-05, + "loss": 0.3176, + "step": 2174 + }, + { + "epoch": 0.4352, + "learning_rate": 1.6023724292150377e-05, + "loss": 0.4639, + "step": 2176 + }, + { + "epoch": 0.4356, + "learning_rate": 1.6034863603024768e-05, + "loss": 0.2781, + "step": 2178 + }, + { + "epoch": 0.436, + "learning_rate": 1.604599114862375e-05, + "loss": 0.7703, + "step": 2180 + }, + { + "epoch": 0.4364, + "learning_rate": 1.6057106907253614e-05, + "loss": 0.154, + "step": 2182 + }, + { + "epoch": 0.4368, + "learning_rate": 1.606821085724362e-05, + "loss": 0.4011, + "step": 2184 + }, + { + "epoch": 0.4372, + "learning_rate": 1.6079302976946052e-05, + "loss": 0.3101, + "step": 2186 + }, + { + "epoch": 0.4376, + "learning_rate": 1.6090383244736253e-05, + "loss": 0.2479, + "step": 2188 + }, + { + "epoch": 0.438, + "learning_rate": 1.6101451639012675e-05, + "loss": 0.3892, + "step": 2190 + }, + { + "epoch": 0.4384, + "learning_rate": 1.6112508138196912e-05, + "loss": 0.4567, + "step": 2192 + }, + { + "epoch": 0.4388, + "learning_rate": 1.6123552720733763e-05, + "loss": 0.4153, + "step": 2194 + }, + { + "epoch": 0.4392, + "learning_rate": 1.613458536509124e-05, + "loss": 0.461, + "step": 2196 + }, + { + "epoch": 0.4396, + "learning_rate": 1.614560604976064e-05, + "loss": 0.4327, + "step": 2198 + }, + { + "epoch": 0.44, + "learning_rate": 1.615661475325658e-05, + "loss": 0.3008, + "step": 2200 + }, + { + "epoch": 0.4404, + "learning_rate": 1.616761145411702e-05, + "loss": 0.2992, + "step": 2202 + }, + { + "epoch": 0.4408, + "learning_rate": 1.6178596130903352e-05, + "loss": 0.1946, + "step": 2204 + }, + { + "epoch": 0.4412, + "learning_rate": 1.618956876220034e-05, + "loss": 0.5753, + "step": 2206 + }, + { + "epoch": 0.4416, + "learning_rate": 1.620052932661632e-05, + "loss": 0.126, + "step": 2208 + }, + { + "epoch": 0.442, + "learning_rate": 1.621147780278311e-05, + "loss": 0.2166, + "step": 2210 + }, + { + "epoch": 0.4424, + "learning_rate": 1.6222414169356056e-05, + "loss": 0.1903, + "step": 2212 + }, + { + "epoch": 0.4428, + "learning_rate": 1.6233338405014204e-05, + "loss": 0.2995, + "step": 2214 + }, + { + "epoch": 0.4432, + "learning_rate": 1.6244250488460146e-05, + "loss": 0.2454, + "step": 2216 + }, + { + "epoch": 0.4436, + "learning_rate": 1.6255150398420273e-05, + "loss": 0.2782, + "step": 2218 + }, + { + "epoch": 0.444, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.1758, + "step": 2220 + }, + { + "epoch": 0.4444, + "learning_rate": 1.6276913612907005e-05, + "loss": 0.773, + "step": 2222 + }, + { + "epoch": 0.4448, + "learning_rate": 1.6287776875005127e-05, + "loss": 0.7094, + "step": 2224 + }, + { + "epoch": 0.4452, + "learning_rate": 1.6298627878760488e-05, + "loss": 0.3022, + "step": 2226 + }, + { + "epoch": 0.4456, + "learning_rate": 1.6309466603018497e-05, + "loss": 0.1311, + "step": 2228 + }, + { + "epoch": 0.446, + "learning_rate": 1.6320293026648508e-05, + "loss": 0.2427, + "step": 2230 + }, + { + "epoch": 0.4464, + "learning_rate": 1.6331107128543856e-05, + "loss": 0.5957, + "step": 2232 + }, + { + "epoch": 0.4468, + "learning_rate": 1.634190888762189e-05, + "loss": 0.6103, + "step": 2234 + }, + { + "epoch": 0.4472, + "learning_rate": 1.635269828282404e-05, + "loss": 0.199, + "step": 2236 + }, + { + "epoch": 0.4476, + "learning_rate": 1.6363475293115818e-05, + "loss": 0.3281, + "step": 2238 + }, + { + "epoch": 0.448, + "learning_rate": 1.6374239897486905e-05, + "loss": 0.409, + "step": 2240 + }, + { + "epoch": 0.4484, + "learning_rate": 1.6384992074951118e-05, + "loss": 0.302, + "step": 2242 + }, + { + "epoch": 0.4488, + "learning_rate": 1.6395731804546575e-05, + "loss": 0.6546, + "step": 2244 + }, + { + "epoch": 0.4492, + "learning_rate": 1.640645906533561e-05, + "loss": 0.5806, + "step": 2246 + }, + { + "epoch": 0.4496, + "learning_rate": 1.6417173836404878e-05, + "loss": 0.7055, + "step": 2248 + }, + { + "epoch": 0.45, + "learning_rate": 1.6427876096865397e-05, + "loss": 0.214, + "step": 2250 + }, + { + "epoch": 0.4504, + "learning_rate": 1.643856582585253e-05, + "loss": 0.0262, + "step": 2252 + }, + { + "epoch": 0.4508, + "learning_rate": 1.6449243002526146e-05, + "loss": 0.1717, + "step": 2254 + }, + { + "epoch": 0.4512, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.2994, + "step": 2256 + }, + { + "epoch": 0.4516, + "learning_rate": 1.6470559615694445e-05, + "loss": 0.4484, + "step": 2258 + }, + { + "epoch": 0.452, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.7119, + "step": 2260 + }, + { + "epoch": 0.4524, + "learning_rate": 1.649182577013905e-05, + "loss": 0.2169, + "step": 2262 + }, + { + "epoch": 0.4528, + "learning_rate": 1.650243987350029e-05, + "loss": 0.3641, + "step": 2264 + }, + { + "epoch": 0.4532, + "learning_rate": 1.6513041300022253e-05, + "loss": 0.5216, + "step": 2266 + }, + { + "epoch": 0.4536, + "learning_rate": 1.652363002903693e-05, + "loss": 0.2793, + "step": 2268 + }, + { + "epoch": 0.454, + "learning_rate": 1.6534206039901054e-05, + "loss": 0.3332, + "step": 2270 + }, + { + "epoch": 0.4544, + "learning_rate": 1.6544769311996146e-05, + "loss": 0.2331, + "step": 2272 + }, + { + "epoch": 0.4548, + "learning_rate": 1.655531982472857e-05, + "loss": 0.6003, + "step": 2274 + }, + { + "epoch": 0.4552, + "learning_rate": 1.656585755752956e-05, + "loss": 0.6675, + "step": 2276 + }, + { + "epoch": 0.4556, + "learning_rate": 1.657638248985527e-05, + "loss": 0.7415, + "step": 2278 + }, + { + "epoch": 0.456, + "learning_rate": 1.65868946011868e-05, + "loss": 0.2189, + "step": 2280 + }, + { + "epoch": 0.4564, + "learning_rate": 1.6597393871030257e-05, + "loss": 0.2262, + "step": 2282 + }, + { + "epoch": 0.4568, + "learning_rate": 1.660788027891677e-05, + "loss": 0.2986, + "step": 2284 + }, + { + "epoch": 0.4572, + "learning_rate": 1.6618353804402573e-05, + "loss": 0.0859, + "step": 2286 + }, + { + "epoch": 0.4576, + "learning_rate": 1.6628814427068944e-05, + "loss": 0.4054, + "step": 2288 + }, + { + "epoch": 0.458, + "learning_rate": 1.663926212652242e-05, + "loss": 0.4217, + "step": 2290 + }, + { + "epoch": 0.4584, + "learning_rate": 1.6649696882394625e-05, + "loss": 0.2112, + "step": 2292 + }, + { + "epoch": 0.4588, + "learning_rate": 1.666011867434252e-05, + "loss": 0.3053, + "step": 2294 + }, + { + "epoch": 0.4592, + "learning_rate": 1.667052748204825e-05, + "loss": 0.7078, + "step": 2296 + }, + { + "epoch": 0.4596, + "learning_rate": 1.6680923285219308e-05, + "loss": 0.5955, + "step": 2298 + }, + { + "epoch": 0.46, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.6973, + "step": 2300 + }, + { + "epoch": 0.4604, + "learning_rate": 1.6701675796914273e-05, + "loss": 0.2172, + "step": 2302 + }, + { + "epoch": 0.4608, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.6912, + "step": 2304 + }, + { + "epoch": 0.4612, + "learning_rate": 1.672237604759516e-05, + "loss": 0.33, + "step": 2306 + }, + { + "epoch": 0.4616, + "learning_rate": 1.6732706524594138e-05, + "loss": 0.3499, + "step": 2308 + }, + { + "epoch": 0.462, + "learning_rate": 1.6743023875837233e-05, + "loss": 0.5602, + "step": 2310 + }, + { + "epoch": 0.4624, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.5344, + "step": 2312 + }, + { + "epoch": 0.4628, + "learning_rate": 1.6763619120624592e-05, + "loss": 0.1335, + "step": 2314 + }, + { + "epoch": 0.4632, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.1973, + "step": 2316 + }, + { + "epoch": 0.4636, + "learning_rate": 1.6784161621351377e-05, + "loss": 0.4247, + "step": 2318 + }, + { + "epoch": 0.464, + "learning_rate": 1.679441304261516e-05, + "loss": 0.3894, + "step": 2320 + }, + { + "epoch": 0.4644, + "learning_rate": 1.6804651217823048e-05, + "loss": 0.4548, + "step": 2322 + }, + { + "epoch": 0.4648, + "learning_rate": 1.681487612701519e-05, + "loss": 0.3654, + "step": 2324 + }, + { + "epoch": 0.4652, + "learning_rate": 1.6825087750257624e-05, + "loss": 0.3532, + "step": 2326 + }, + { + "epoch": 0.4656, + "learning_rate": 1.683528606764222e-05, + "loss": 0.1247, + "step": 2328 + }, + { + "epoch": 0.466, + "learning_rate": 1.6845471059286893e-05, + "loss": 0.3859, + "step": 2330 + }, + { + "epoch": 0.4664, + "learning_rate": 1.6855642705335428e-05, + "loss": 0.332, + "step": 2332 + }, + { + "epoch": 0.4668, + "learning_rate": 1.6865800985957718e-05, + "loss": 0.4711, + "step": 2334 + }, + { + "epoch": 0.4672, + "learning_rate": 1.687594588134968e-05, + "loss": 0.0668, + "step": 2336 + }, + { + "epoch": 0.4676, + "learning_rate": 1.6886077371733275e-05, + "loss": 0.4968, + "step": 2338 + }, + { + "epoch": 0.468, + "learning_rate": 1.68961954373567e-05, + "loss": 0.6808, + "step": 2340 + }, + { + "epoch": 0.4684, + "learning_rate": 1.690630005849423e-05, + "loss": 0.335, + "step": 2342 + }, + { + "epoch": 0.4688, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.7946, + "step": 2344 + }, + { + "epoch": 0.4692, + "learning_rate": 1.6926468888539988e-05, + "loss": 0.538, + "step": 2346 + }, + { + "epoch": 0.4696, + "learning_rate": 1.693653305812805e-05, + "loss": 0.3641, + "step": 2348 + }, + { + "epoch": 0.47, + "learning_rate": 1.6946583704589973e-05, + "loss": 0.4414, + "step": 2350 + }, + { + "epoch": 0.4704, + "learning_rate": 1.6956620808331505e-05, + "loss": 0.8821, + "step": 2352 + }, + { + "epoch": 0.4708, + "learning_rate": 1.6966644349784805e-05, + "loss": 0.6062, + "step": 2354 + }, + { + "epoch": 0.4712, + "learning_rate": 1.697665430940846e-05, + "loss": 0.4634, + "step": 2356 + }, + { + "epoch": 0.4716, + "learning_rate": 1.698665066768755e-05, + "loss": 0.4623, + "step": 2358 + }, + { + "epoch": 0.472, + "learning_rate": 1.699663340513365e-05, + "loss": 0.2445, + "step": 2360 + }, + { + "epoch": 0.4724, + "learning_rate": 1.7006602502284913e-05, + "loss": 0.4793, + "step": 2362 + }, + { + "epoch": 0.4728, + "learning_rate": 1.7016557939706068e-05, + "loss": 0.6003, + "step": 2364 + }, + { + "epoch": 0.4732, + "learning_rate": 1.70264996979885e-05, + "loss": 0.218, + "step": 2366 + }, + { + "epoch": 0.4736, + "learning_rate": 1.7036427757750198e-05, + "loss": 0.3163, + "step": 2368 + }, + { + "epoch": 0.474, + "learning_rate": 1.7046342099635938e-05, + "loss": 0.8093, + "step": 2370 + }, + { + "epoch": 0.4744, + "learning_rate": 1.7056242704317212e-05, + "loss": 0.4662, + "step": 2372 + }, + { + "epoch": 0.4748, + "learning_rate": 1.706612955249224e-05, + "loss": 0.3372, + "step": 2374 + }, + { + "epoch": 0.4752, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.4513, + "step": 2376 + }, + { + "epoch": 0.4756, + "learning_rate": 1.708586190225085e-05, + "loss": 0.1814, + "step": 2378 + }, + { + "epoch": 0.476, + "learning_rate": 1.709570736536521e-05, + "loss": 0.7883, + "step": 2380 + }, + { + "epoch": 0.4764, + "learning_rate": 1.710553899503496e-05, + "loss": 0.4103, + "step": 2382 + }, + { + "epoch": 0.4768, + "learning_rate": 1.7115356772092844e-05, + "loss": 0.4666, + "step": 2384 + }, + { + "epoch": 0.4772, + "learning_rate": 1.7125160677398625e-05, + "loss": 0.0833, + "step": 2386 + }, + { + "epoch": 0.4776, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.253, + "step": 2388 + }, + { + "epoch": 0.478, + "learning_rate": 1.7144726796328034e-05, + "loss": 0.624, + "step": 2390 + }, + { + "epoch": 0.4784, + "learning_rate": 1.7154488971806518e-05, + "loss": 0.5618, + "step": 2392 + }, + { + "epoch": 0.4788, + "learning_rate": 1.716423719924266e-05, + "loss": 1.4912, + "step": 2394 + }, + { + "epoch": 0.4792, + "learning_rate": 1.7173971459631783e-05, + "loss": 0.3042, + "step": 2396 + }, + { + "epoch": 0.4796, + "learning_rate": 1.718369173399646e-05, + "loss": 0.477, + "step": 2398 + }, + { + "epoch": 0.48, + "learning_rate": 1.7193398003386507e-05, + "loss": 0.1874, + "step": 2400 + }, + { + "epoch": 0.4804, + "learning_rate": 1.7203090248879063e-05, + "loss": 0.2845, + "step": 2402 + }, + { + "epoch": 0.4808, + "learning_rate": 1.7212768451578602e-05, + "loss": 0.3802, + "step": 2404 + }, + { + "epoch": 0.4812, + "learning_rate": 1.7222432592616963e-05, + "loss": 0.1641, + "step": 2406 + }, + { + "epoch": 0.4816, + "learning_rate": 1.7232082653153416e-05, + "loss": 0.3955, + "step": 2408 + }, + { + "epoch": 0.482, + "learning_rate": 1.724171861437467e-05, + "loss": 0.3767, + "step": 2410 + }, + { + "epoch": 0.4824, + "learning_rate": 1.7251340457494937e-05, + "loss": 0.6858, + "step": 2412 + }, + { + "epoch": 0.4828, + "learning_rate": 1.726094816375591e-05, + "loss": 0.2491, + "step": 2414 + }, + { + "epoch": 0.4832, + "learning_rate": 1.7270541714426923e-05, + "loss": 0.4403, + "step": 2416 + }, + { + "epoch": 0.4836, + "learning_rate": 1.7280121090804817e-05, + "loss": 0.684, + "step": 2418 + }, + { + "epoch": 0.484, + "learning_rate": 1.7289686274214106e-05, + "loss": 0.3063, + "step": 2420 + }, + { + "epoch": 0.4844, + "learning_rate": 1.7299237246007018e-05, + "loss": 0.6492, + "step": 2422 + }, + { + "epoch": 0.4848, + "learning_rate": 1.7308773987563393e-05, + "loss": 0.6284, + "step": 2424 + }, + { + "epoch": 0.4852, + "learning_rate": 1.7318296480290912e-05, + "loss": 0.1235, + "step": 2426 + }, + { + "epoch": 0.4856, + "learning_rate": 1.732780470562496e-05, + "loss": 0.4292, + "step": 2428 + }, + { + "epoch": 0.486, + "learning_rate": 1.7337298645028764e-05, + "loss": 0.4949, + "step": 2430 + }, + { + "epoch": 0.4864, + "learning_rate": 1.7346778279993413e-05, + "loss": 0.4539, + "step": 2432 + }, + { + "epoch": 0.4868, + "learning_rate": 1.7356243592037872e-05, + "loss": 0.267, + "step": 2434 + }, + { + "epoch": 0.4872, + "learning_rate": 1.736569456270903e-05, + "loss": 0.5412, + "step": 2436 + }, + { + "epoch": 0.4876, + "learning_rate": 1.7375131173581737e-05, + "loss": 0.8155, + "step": 2438 + }, + { + "epoch": 0.488, + "learning_rate": 1.7384553406258836e-05, + "loss": 0.2868, + "step": 2440 + }, + { + "epoch": 0.4884, + "learning_rate": 1.73939612423712e-05, + "loss": 0.5432, + "step": 2442 + }, + { + "epoch": 0.4888, + "learning_rate": 1.740335466357778e-05, + "loss": 0.1585, + "step": 2444 + }, + { + "epoch": 0.4892, + "learning_rate": 1.7412733651565607e-05, + "loss": 0.4756, + "step": 2446 + }, + { + "epoch": 0.4896, + "learning_rate": 1.7422098188049888e-05, + "loss": 0.3802, + "step": 2448 + }, + { + "epoch": 0.49, + "learning_rate": 1.7431448254773936e-05, + "loss": 0.3062, + "step": 2450 + }, + { + "epoch": 0.4904, + "learning_rate": 1.7440783833509373e-05, + "loss": 0.3937, + "step": 2452 + }, + { + "epoch": 0.4908, + "learning_rate": 1.7450104906055956e-05, + "loss": 0.4779, + "step": 2454 + }, + { + "epoch": 0.4912, + "learning_rate": 1.7459411454241816e-05, + "loss": 0.5218, + "step": 2456 + }, + { + "epoch": 0.4916, + "learning_rate": 1.746870345992336e-05, + "loss": 0.1275, + "step": 2458 + }, + { + "epoch": 0.492, + "learning_rate": 1.747798090498531e-05, + "loss": 0.2544, + "step": 2460 + }, + { + "epoch": 0.4924, + "learning_rate": 1.7487243771340865e-05, + "loss": 0.3223, + "step": 2462 + }, + { + "epoch": 0.4928, + "learning_rate": 1.749649204093154e-05, + "loss": 0.4431, + "step": 2464 + }, + { + "epoch": 0.4932, + "learning_rate": 1.750572569572741e-05, + "loss": 0.4967, + "step": 2466 + }, + { + "epoch": 0.4936, + "learning_rate": 1.7514944717726962e-05, + "loss": 0.1758, + "step": 2468 + }, + { + "epoch": 0.494, + "learning_rate": 1.7524149088957244e-05, + "loss": 0.5168, + "step": 2470 + }, + { + "epoch": 0.4944, + "learning_rate": 1.753333879147387e-05, + "loss": 0.344, + "step": 2472 + }, + { + "epoch": 0.4948, + "learning_rate": 1.7542513807361037e-05, + "loss": 0.2956, + "step": 2474 + }, + { + "epoch": 0.4952, + "learning_rate": 1.755167411873159e-05, + "loss": 0.3368, + "step": 2476 + }, + { + "epoch": 0.4956, + "learning_rate": 1.7560819707727027e-05, + "loss": 0.1103, + "step": 2478 + }, + { + "epoch": 0.496, + "learning_rate": 1.7569950556517563e-05, + "loss": 0.4512, + "step": 2480 + }, + { + "epoch": 0.4964, + "learning_rate": 1.757906664730213e-05, + "loss": 0.1489, + "step": 2482 + }, + { + "epoch": 0.4968, + "learning_rate": 1.758816796230845e-05, + "loss": 0.3424, + "step": 2484 + }, + { + "epoch": 0.4972, + "learning_rate": 1.759725448379304e-05, + "loss": 0.2624, + "step": 2486 + }, + { + "epoch": 0.4976, + "learning_rate": 1.7606326194041278e-05, + "loss": 0.1538, + "step": 2488 + }, + { + "epoch": 0.498, + "learning_rate": 1.7615383075367363e-05, + "loss": 0.2462, + "step": 2490 + }, + { + "epoch": 0.4984, + "learning_rate": 1.762442511011447e-05, + "loss": 0.5561, + "step": 2492 + }, + { + "epoch": 0.4988, + "learning_rate": 1.763345228065469e-05, + "loss": 0.1346, + "step": 2494 + }, + { + "epoch": 0.4992, + "learning_rate": 1.7642464569389083e-05, + "loss": 0.3682, + "step": 2496 + }, + { + "epoch": 0.4996, + "learning_rate": 1.7651461958747745e-05, + "loss": 0.1838, + "step": 2498 + }, + { + "epoch": 0.5, + "learning_rate": 1.766044443118977e-05, + "loss": 0.3603, + "step": 2500 + }, + { + "epoch": 0.5004, + "learning_rate": 1.766941196920342e-05, + "loss": 0.3738, + "step": 2502 + }, + { + "epoch": 0.5008, + "learning_rate": 1.767836455530598e-05, + "loss": 0.2771, + "step": 2504 + }, + { + "epoch": 0.5012, + "learning_rate": 1.7687302172043933e-05, + "loss": 0.4026, + "step": 2506 + }, + { + "epoch": 0.5016, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.0629, + "step": 2508 + }, + { + "epoch": 0.502, + "learning_rate": 1.7705132427757885e-05, + "loss": 0.2518, + "step": 2510 + }, + { + "epoch": 0.5024, + "learning_rate": 1.77140250319729e-05, + "loss": 0.2931, + "step": 2512 + }, + { + "epoch": 0.5028, + "learning_rate": 1.7722902597301385e-05, + "loss": 0.4337, + "step": 2514 + }, + { + "epoch": 0.5032, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.4581, + "step": 2516 + }, + { + "epoch": 0.5036, + "learning_rate": 1.774061254209905e-05, + "loss": 0.5083, + "step": 2518 + }, + { + "epoch": 0.504, + "learning_rate": 1.7749444887041793e-05, + "loss": 0.3948, + "step": 2520 + }, + { + "epoch": 0.5044, + "learning_rate": 1.7758262124045192e-05, + "loss": 0.2454, + "step": 2522 + }, + { + "epoch": 0.5048, + "learning_rate": 1.776706423591959e-05, + "loss": 0.331, + "step": 2524 + }, + { + "epoch": 0.5052, + "learning_rate": 1.7775851205504816e-05, + "loss": 0.2174, + "step": 2526 + }, + { + "epoch": 0.5056, + "learning_rate": 1.778462301567023e-05, + "loss": 0.1906, + "step": 2528 + }, + { + "epoch": 0.506, + "learning_rate": 1.7793379649314736e-05, + "loss": 0.2221, + "step": 2530 + }, + { + "epoch": 0.5064, + "learning_rate": 1.7802121089366832e-05, + "loss": 0.3887, + "step": 2532 + }, + { + "epoch": 0.5068, + "learning_rate": 1.7810847318784635e-05, + "loss": 0.2998, + "step": 2534 + }, + { + "epoch": 0.5072, + "learning_rate": 1.7819558320555895e-05, + "loss": 0.3406, + "step": 2536 + }, + { + "epoch": 0.5076, + "learning_rate": 1.7828254077698103e-05, + "loss": 1.002, + "step": 2538 + }, + { + "epoch": 0.508, + "learning_rate": 1.7836934573258392e-05, + "loss": 0.1692, + "step": 2540 + }, + { + "epoch": 0.5084, + "learning_rate": 1.7845599790313735e-05, + "loss": 1.3339, + "step": 2542 + }, + { + "epoch": 0.5088, + "learning_rate": 1.785424971197082e-05, + "loss": 0.3784, + "step": 2544 + }, + { + "epoch": 0.5092, + "learning_rate": 1.786288432136618e-05, + "loss": 0.1756, + "step": 2546 + }, + { + "epoch": 0.5096, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.4221, + "step": 2548 + }, + { + "epoch": 0.51, + "learning_rate": 1.788010753606722e-05, + "loss": 0.2934, + "step": 2550 + }, + { + "epoch": 0.5104, + "learning_rate": 1.7888696107795343e-05, + "loss": 0.2025, + "step": 2552 + }, + { + "epoch": 0.5108, + "learning_rate": 1.7897269300106735e-05, + "loss": 0.8862, + "step": 2554 + }, + { + "epoch": 0.5112, + "learning_rate": 1.790582709628753e-05, + "loss": 0.56, + "step": 2556 + }, + { + "epoch": 0.5116, + "learning_rate": 1.7914369479653854e-05, + "loss": 0.3542, + "step": 2558 + }, + { + "epoch": 0.512, + "learning_rate": 1.7922896433551903e-05, + "loss": 0.271, + "step": 2560 + }, + { + "epoch": 0.5124, + "learning_rate": 1.7931407941357945e-05, + "loss": 0.2647, + "step": 2562 + }, + { + "epoch": 0.5128, + "learning_rate": 1.793990398647835e-05, + "loss": 0.4603, + "step": 2564 + }, + { + "epoch": 0.5132, + "learning_rate": 1.7948384552349655e-05, + "loss": 0.3959, + "step": 2566 + }, + { + "epoch": 0.5136, + "learning_rate": 1.795684962243855e-05, + "loss": 0.9489, + "step": 2568 + }, + { + "epoch": 0.514, + "learning_rate": 1.796529918024196e-05, + "loss": 0.6159, + "step": 2570 + }, + { + "epoch": 0.5144, + "learning_rate": 1.7973733209287032e-05, + "loss": 0.43, + "step": 2572 + }, + { + "epoch": 0.5148, + "learning_rate": 1.798215169313121e-05, + "loss": 0.3463, + "step": 2574 + }, + { + "epoch": 0.5152, + "learning_rate": 1.7990554615362193e-05, + "loss": 0.301, + "step": 2576 + }, + { + "epoch": 0.5156, + "learning_rate": 1.79989419595981e-05, + "loss": 0.1485, + "step": 2578 + }, + { + "epoch": 0.516, + "learning_rate": 1.800731370948734e-05, + "loss": 0.2955, + "step": 2580 + }, + { + "epoch": 0.5164, + "learning_rate": 1.8015669848708757e-05, + "loss": 0.3198, + "step": 2582 + }, + { + "epoch": 0.5168, + "learning_rate": 1.802401036097167e-05, + "loss": 0.3733, + "step": 2584 + }, + { + "epoch": 0.5172, + "learning_rate": 1.803233523001577e-05, + "loss": 0.2532, + "step": 2586 + }, + { + "epoch": 0.5176, + "learning_rate": 1.804064443961135e-05, + "loss": 0.6135, + "step": 2588 + }, + { + "epoch": 0.518, + "learning_rate": 1.804893797355914e-05, + "loss": 0.1644, + "step": 2590 + }, + { + "epoch": 0.5184, + "learning_rate": 1.8057215815690494e-05, + "loss": 0.2538, + "step": 2592 + }, + { + "epoch": 0.5188, + "learning_rate": 1.8065477949867327e-05, + "loss": 0.4518, + "step": 2594 + }, + { + "epoch": 0.5192, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.4101, + "step": 2596 + }, + { + "epoch": 0.5196, + "learning_rate": 1.808195502995827e-05, + "loss": 0.2644, + "step": 2598 + }, + { + "epoch": 0.52, + "learning_rate": 1.809016994374947e-05, + "loss": 0.3485, + "step": 2600 + }, + { + "epoch": 0.5204, + "learning_rate": 1.8098369085340397e-05, + "loss": 0.3839, + "step": 2602 + }, + { + "epoch": 0.5208, + "learning_rate": 1.81065524387464e-05, + "loss": 0.3478, + "step": 2604 + }, + { + "epoch": 0.5212, + "learning_rate": 1.8114719988013606e-05, + "loss": 0.2145, + "step": 2606 + }, + { + "epoch": 0.5216, + "learning_rate": 1.8122871717218968e-05, + "loss": 0.4506, + "step": 2608 + }, + { + "epoch": 0.522, + "learning_rate": 1.813100761047028e-05, + "loss": 0.295, + "step": 2610 + }, + { + "epoch": 0.5224, + "learning_rate": 1.8139127651906176e-05, + "loss": 0.5042, + "step": 2612 + }, + { + "epoch": 0.5228, + "learning_rate": 1.8147231825696258e-05, + "loss": 0.4108, + "step": 2614 + }, + { + "epoch": 0.5232, + "learning_rate": 1.8155320116040976e-05, + "loss": 0.7792, + "step": 2616 + }, + { + "epoch": 0.5236, + "learning_rate": 1.8163392507171834e-05, + "loss": 0.3315, + "step": 2618 + }, + { + "epoch": 0.524, + "learning_rate": 1.817144898335129e-05, + "loss": 0.5713, + "step": 2620 + }, + { + "epoch": 0.5244, + "learning_rate": 1.8179489528872797e-05, + "loss": 0.3929, + "step": 2622 + }, + { + "epoch": 0.5248, + "learning_rate": 1.818751412806095e-05, + "loss": 0.5568, + "step": 2624 + }, + { + "epoch": 0.5252, + "learning_rate": 1.819552276527134e-05, + "loss": 0.4067, + "step": 2626 + }, + { + "epoch": 0.5256, + "learning_rate": 1.8203515424890738e-05, + "loss": 0.3545, + "step": 2628 + }, + { + "epoch": 0.526, + "learning_rate": 1.821149209133704e-05, + "loss": 0.202, + "step": 2630 + }, + { + "epoch": 0.5264, + "learning_rate": 1.8219452749059322e-05, + "loss": 0.413, + "step": 2632 + }, + { + "epoch": 0.5268, + "learning_rate": 1.82273973825379e-05, + "loss": 0.1691, + "step": 2634 + }, + { + "epoch": 0.5272, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.5171, + "step": 2636 + }, + { + "epoch": 0.5276, + "learning_rate": 1.8243238514841258e-05, + "loss": 0.3644, + "step": 2638 + }, + { + "epoch": 0.528, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.5052, + "step": 2640 + }, + { + "epoch": 0.5284, + "learning_rate": 1.8259015364714786e-05, + "loss": 0.295, + "step": 2642 + }, + { + "epoch": 0.5288, + "learning_rate": 1.826687964527355e-05, + "loss": 0.167, + "step": 2644 + }, + { + "epoch": 0.5292, + "learning_rate": 1.8274727809127437e-05, + "loss": 0.1897, + "step": 2646 + }, + { + "epoch": 0.5296, + "learning_rate": 1.828255984097604e-05, + "loss": 0.8242, + "step": 2648 + }, + { + "epoch": 0.53, + "learning_rate": 1.8290375725550413e-05, + "loss": 0.4699, + "step": 2650 + }, + { + "epoch": 0.5304, + "learning_rate": 1.8298175447613093e-05, + "loss": 0.5471, + "step": 2652 + }, + { + "epoch": 0.5308, + "learning_rate": 1.8305958991958125e-05, + "loss": 0.5586, + "step": 2654 + }, + { + "epoch": 0.5312, + "learning_rate": 1.8313726343411092e-05, + "loss": 0.3535, + "step": 2656 + }, + { + "epoch": 0.5316, + "learning_rate": 1.832147748682912e-05, + "loss": 0.3339, + "step": 2658 + }, + { + "epoch": 0.532, + "learning_rate": 1.8329212407101e-05, + "loss": 0.3805, + "step": 2660 + }, + { + "epoch": 0.5324, + "learning_rate": 1.8336931089147065e-05, + "loss": 0.614, + "step": 2662 + }, + { + "epoch": 0.5328, + "learning_rate": 1.8344633517919394e-05, + "loss": 0.3325, + "step": 2664 + }, + { + "epoch": 0.5332, + "learning_rate": 1.8352319678401677e-05, + "loss": 0.0367, + "step": 2666 + }, + { + "epoch": 0.5336, + "learning_rate": 1.8359989555609344e-05, + "loss": 0.2787, + "step": 2668 + }, + { + "epoch": 0.534, + "learning_rate": 1.836764313458962e-05, + "loss": 1.5264, + "step": 2670 + }, + { + "epoch": 0.5344, + "learning_rate": 1.8375280400421407e-05, + "loss": 0.1218, + "step": 2672 + }, + { + "epoch": 0.5348, + "learning_rate": 1.8382901338215515e-05, + "loss": 0.5839, + "step": 2674 + }, + { + "epoch": 0.5352, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.5254, + "step": 2676 + }, + { + "epoch": 0.5356, + "learning_rate": 1.839809417029283e-05, + "loss": 0.2326, + "step": 2678 + }, + { + "epoch": 0.536, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.2566, + "step": 2680 + }, + { + "epoch": 0.5364, + "learning_rate": 1.8413221512344805e-05, + "loss": 1.2278, + "step": 2682 + }, + { + "epoch": 0.5368, + "learning_rate": 1.842076058772692e-05, + "loss": 0.643, + "step": 2684 + }, + { + "epoch": 0.5372, + "learning_rate": 1.8428283246405386e-05, + "loss": 1.0574, + "step": 2686 + }, + { + "epoch": 0.5376, + "learning_rate": 1.8435789473714384e-05, + "loss": 0.5391, + "step": 2688 + }, + { + "epoch": 0.538, + "learning_rate": 1.844327925502015e-05, + "loss": 0.3007, + "step": 2690 + }, + { + "epoch": 0.5384, + "learning_rate": 1.8450752575720964e-05, + "loss": 0.2207, + "step": 2692 + }, + { + "epoch": 0.5388, + "learning_rate": 1.8458209421247205e-05, + "loss": 0.2711, + "step": 2694 + }, + { + "epoch": 0.5392, + "learning_rate": 1.8465649777061384e-05, + "loss": 0.2487, + "step": 2696 + }, + { + "epoch": 0.5396, + "learning_rate": 1.8473073628658116e-05, + "loss": 0.469, + "step": 2698 + }, + { + "epoch": 0.54, + "learning_rate": 1.8480480961564266e-05, + "loss": 0.4119, + "step": 2700 + }, + { + "epoch": 0.5404, + "learning_rate": 1.848787176133881e-05, + "loss": 0.3165, + "step": 2702 + }, + { + "epoch": 0.5408, + "learning_rate": 1.8495246013573047e-05, + "loss": 0.6014, + "step": 2704 + }, + { + "epoch": 0.5412, + "learning_rate": 1.850260370389049e-05, + "loss": 0.3239, + "step": 2706 + }, + { + "epoch": 0.5416, + "learning_rate": 1.850994481794691e-05, + "loss": 0.2028, + "step": 2708 + }, + { + "epoch": 0.542, + "learning_rate": 1.851726934143048e-05, + "loss": 0.5771, + "step": 2710 + }, + { + "epoch": 0.5424, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.8184, + "step": 2712 + }, + { + "epoch": 0.5428, + "learning_rate": 1.8531868559593205e-05, + "loss": 0.2131, + "step": 2714 + }, + { + "epoch": 0.5432, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.2988, + "step": 2716 + }, + { + "epoch": 0.5436, + "learning_rate": 1.8546401244531028e-05, + "loss": 0.217, + "step": 2718 + }, + { + "epoch": 0.544, + "learning_rate": 1.8553642601605066e-05, + "loss": 0.4787, + "step": 2720 + }, + { + "epoch": 0.5444, + "learning_rate": 1.856086728291516e-05, + "loss": 0.4268, + "step": 2722 + }, + { + "epoch": 0.5448, + "learning_rate": 1.856807527437643e-05, + "loss": 0.6164, + "step": 2724 + }, + { + "epoch": 0.5452, + "learning_rate": 1.857526656193652e-05, + "loss": 0.5475, + "step": 2726 + }, + { + "epoch": 0.5456, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.2327, + "step": 2728 + }, + { + "epoch": 0.546, + "learning_rate": 1.8589598969306643e-05, + "loss": 0.1463, + "step": 2730 + }, + { + "epoch": 0.5464, + "learning_rate": 1.859674006117491e-05, + "loss": 0.6091, + "step": 2732 + }, + { + "epoch": 0.5468, + "learning_rate": 1.860386439325853e-05, + "loss": 0.4772, + "step": 2734 + }, + { + "epoch": 0.5472, + "learning_rate": 1.8610971951668268e-05, + "loss": 0.2985, + "step": 2736 + }, + { + "epoch": 0.5476, + "learning_rate": 1.8618062722547544e-05, + "loss": 0.6341, + "step": 2738 + }, + { + "epoch": 0.548, + "learning_rate": 1.862513669207257e-05, + "loss": 0.3775, + "step": 2740 + }, + { + "epoch": 0.5484, + "learning_rate": 1.8632193846452274e-05, + "loss": 0.657, + "step": 2742 + }, + { + "epoch": 0.5488, + "learning_rate": 1.8639234171928348e-05, + "loss": 0.784, + "step": 2744 + }, + { + "epoch": 0.5492, + "learning_rate": 1.8646257654775354e-05, + "loss": 0.2002, + "step": 2746 + }, + { + "epoch": 0.5496, + "learning_rate": 1.8653264281300612e-05, + "loss": 0.4095, + "step": 2748 + }, + { + "epoch": 0.55, + "learning_rate": 1.866025403784439e-05, + "loss": 0.204, + "step": 2750 + }, + { + "epoch": 0.5504, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.2391, + "step": 2752 + }, + { + "epoch": 0.5508, + "learning_rate": 1.8674182886512776e-05, + "loss": 0.6073, + "step": 2754 + }, + { + "epoch": 0.5512, + "learning_rate": 1.8681121951482393e-05, + "loss": 0.4849, + "step": 2756 + }, + { + "epoch": 0.5516, + "learning_rate": 1.8688044092160554e-05, + "loss": 1.1562, + "step": 2758 + }, + { + "epoch": 0.552, + "learning_rate": 1.869494929505219e-05, + "loss": 0.4646, + "step": 2760 + }, + { + "epoch": 0.5524, + "learning_rate": 1.8701837546695256e-05, + "loss": 0.1796, + "step": 2762 + }, + { + "epoch": 0.5528, + "learning_rate": 1.870870883366075e-05, + "loss": 0.5265, + "step": 2764 + }, + { + "epoch": 0.5532, + "learning_rate": 1.871556314255275e-05, + "loss": 0.3542, + "step": 2766 + }, + { + "epoch": 0.5536, + "learning_rate": 1.8722400460008434e-05, + "loss": 0.5263, + "step": 2768 + }, + { + "epoch": 0.554, + "learning_rate": 1.8729220772698093e-05, + "loss": 0.1692, + "step": 2770 + }, + { + "epoch": 0.5544, + "learning_rate": 1.8736024067325195e-05, + "loss": 0.5318, + "step": 2772 + }, + { + "epoch": 0.5548, + "learning_rate": 1.8742810330626335e-05, + "loss": 0.3523, + "step": 2774 + }, + { + "epoch": 0.5552, + "learning_rate": 1.8749579549371373e-05, + "loss": 0.6617, + "step": 2776 + }, + { + "epoch": 0.5556, + "learning_rate": 1.8756331710363368e-05, + "loss": 0.2415, + "step": 2778 + }, + { + "epoch": 0.556, + "learning_rate": 1.876306680043863e-05, + "loss": 0.2456, + "step": 2780 + }, + { + "epoch": 0.5564, + "learning_rate": 1.876978480646677e-05, + "loss": 0.3899, + "step": 2782 + }, + { + "epoch": 0.5568, + "learning_rate": 1.8776485715350665e-05, + "loss": 0.4971, + "step": 2784 + }, + { + "epoch": 0.5572, + "learning_rate": 1.878316951402658e-05, + "loss": 0.2779, + "step": 2786 + }, + { + "epoch": 0.5576, + "learning_rate": 1.878983618946409e-05, + "loss": 0.171, + "step": 2788 + }, + { + "epoch": 0.558, + "learning_rate": 1.879648572866617e-05, + "loss": 0.4398, + "step": 2790 + }, + { + "epoch": 0.5584, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.7344, + "step": 2792 + }, + { + "epoch": 0.5588, + "learning_rate": 1.8809733346543006e-05, + "loss": 0.692, + "step": 2794 + }, + { + "epoch": 0.5592, + "learning_rate": 1.881633139939087e-05, + "loss": 0.1635, + "step": 2796 + }, + { + "epoch": 0.5596, + "learning_rate": 1.8822912264349532e-05, + "loss": 0.4421, + "step": 2798 + }, + { + "epoch": 0.56, + "learning_rate": 1.882947592858927e-05, + "loss": 0.2291, + "step": 2800 + }, + { + "epoch": 0.5604, + "learning_rate": 1.8836022379313877e-05, + "loss": 0.1038, + "step": 2802 + }, + { + "epoch": 0.5608, + "learning_rate": 1.884255160376072e-05, + "loss": 0.4008, + "step": 2804 + }, + { + "epoch": 0.5612, + "learning_rate": 1.8849063589200744e-05, + "loss": 0.454, + "step": 2806 + }, + { + "epoch": 0.5616, + "learning_rate": 1.885555832293849e-05, + "loss": 0.1002, + "step": 2808 + }, + { + "epoch": 0.562, + "learning_rate": 1.8862035792312145e-05, + "loss": 0.3014, + "step": 2810 + }, + { + "epoch": 0.5624, + "learning_rate": 1.886849598469356e-05, + "loss": 0.237, + "step": 2812 + }, + { + "epoch": 0.5628, + "learning_rate": 1.8874938887488246e-05, + "loss": 0.2276, + "step": 2814 + }, + { + "epoch": 0.5632, + "learning_rate": 1.888136448813544e-05, + "loss": 0.3647, + "step": 2816 + }, + { + "epoch": 0.5636, + "learning_rate": 1.888777277410812e-05, + "loss": 0.4398, + "step": 2818 + }, + { + "epoch": 0.564, + "learning_rate": 1.8894163732912972e-05, + "loss": 0.2421, + "step": 2820 + }, + { + "epoch": 0.5644, + "learning_rate": 1.890053735209053e-05, + "loss": 0.0977, + "step": 2822 + }, + { + "epoch": 0.5648, + "learning_rate": 1.890689361921506e-05, + "loss": 0.509, + "step": 2824 + }, + { + "epoch": 0.5652, + "learning_rate": 1.8913232521894737e-05, + "loss": 0.7079, + "step": 2826 + }, + { + "epoch": 0.5656, + "learning_rate": 1.891955404777151e-05, + "loss": 0.3261, + "step": 2828 + }, + { + "epoch": 0.566, + "learning_rate": 1.8925858184521248e-05, + "loss": 0.28, + "step": 2830 + }, + { + "epoch": 0.5664, + "learning_rate": 1.893214491985374e-05, + "loss": 0.7739, + "step": 2832 + }, + { + "epoch": 0.5668, + "learning_rate": 1.8938414241512634e-05, + "loss": 0.5093, + "step": 2834 + }, + { + "epoch": 0.5672, + "learning_rate": 1.89446661372756e-05, + "loss": 0.4029, + "step": 2836 + }, + { + "epoch": 0.5676, + "learning_rate": 1.8950900594954226e-05, + "loss": 0.4449, + "step": 2838 + }, + { + "epoch": 0.568, + "learning_rate": 1.895711760239413e-05, + "loss": 0.3913, + "step": 2840 + }, + { + "epoch": 0.5684, + "learning_rate": 1.896331714747493e-05, + "loss": 0.3687, + "step": 2842 + }, + { + "epoch": 0.5688, + "learning_rate": 1.89694992181103e-05, + "loss": 0.4915, + "step": 2844 + }, + { + "epoch": 0.5692, + "learning_rate": 1.8975663802247975e-05, + "loss": 0.2876, + "step": 2846 + }, + { + "epoch": 0.5696, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.0464, + "step": 2848 + }, + { + "epoch": 0.57, + "learning_rate": 1.898794046299167e-05, + "loss": 0.2611, + "step": 2850 + }, + { + "epoch": 0.5704, + "learning_rate": 1.8994052515663708e-05, + "loss": 0.5614, + "step": 2852 + }, + { + "epoch": 0.5708, + "learning_rate": 1.9000147033970144e-05, + "loss": 1.9282, + "step": 2854 + }, + { + "epoch": 0.5712, + "learning_rate": 1.90062240060294e-05, + "loss": 0.1015, + "step": 2856 + }, + { + "epoch": 0.5716, + "learning_rate": 1.901228341999412e-05, + "loss": 0.0913, + "step": 2858 + }, + { + "epoch": 0.572, + "learning_rate": 1.9018325264051136e-05, + "loss": 0.4343, + "step": 2860 + }, + { + "epoch": 0.5724, + "learning_rate": 1.9024349526421596e-05, + "loss": 0.4359, + "step": 2862 + }, + { + "epoch": 0.5728, + "learning_rate": 1.9030356195360868e-05, + "loss": 0.3295, + "step": 2864 + }, + { + "epoch": 0.5732, + "learning_rate": 1.903634525915866e-05, + "loss": 0.1641, + "step": 2866 + }, + { + "epoch": 0.5736, + "learning_rate": 1.904231670613899e-05, + "loss": 0.184, + "step": 2868 + }, + { + "epoch": 0.574, + "learning_rate": 1.904827052466019e-05, + "loss": 0.3483, + "step": 2870 + }, + { + "epoch": 0.5744, + "learning_rate": 1.905420670311502e-05, + "loss": 0.261, + "step": 2872 + }, + { + "epoch": 0.5748, + "learning_rate": 1.9060125229930572e-05, + "loss": 0.4621, + "step": 2874 + }, + { + "epoch": 0.5752, + "learning_rate": 1.906602609356838e-05, + "loss": 0.7085, + "step": 2876 + }, + { + "epoch": 0.5756, + "learning_rate": 1.907190928252441e-05, + "loss": 0.4198, + "step": 2878 + }, + { + "epoch": 0.576, + "learning_rate": 1.9077774785329078e-05, + "loss": 0.2698, + "step": 2880 + }, + { + "epoch": 0.5764, + "learning_rate": 1.908362259054731e-05, + "loss": 0.2312, + "step": 2882 + }, + { + "epoch": 0.5768, + "learning_rate": 1.9089452686778487e-05, + "loss": 0.3692, + "step": 2884 + }, + { + "epoch": 0.5772, + "learning_rate": 1.9095265062656542e-05, + "loss": 0.342, + "step": 2886 + }, + { + "epoch": 0.5776, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.4308, + "step": 2888 + }, + { + "epoch": 0.578, + "learning_rate": 1.910683660806177e-05, + "loss": 0.4425, + "step": 2890 + }, + { + "epoch": 0.5784, + "learning_rate": 1.911259575502962e-05, + "loss": 0.3312, + "step": 2892 + }, + { + "epoch": 0.5788, + "learning_rate": 1.9118337136525754e-05, + "loss": 0.5693, + "step": 2894 + }, + { + "epoch": 0.5792, + "learning_rate": 1.912406074135706e-05, + "loss": 0.208, + "step": 2896 + }, + { + "epoch": 0.5796, + "learning_rate": 1.912976655836507e-05, + "loss": 0.401, + "step": 2898 + }, + { + "epoch": 0.58, + "learning_rate": 1.9135454576426006e-05, + "loss": 0.1895, + "step": 2900 + }, + { + "epoch": 0.5804, + "learning_rate": 1.9141124784450786e-05, + "loss": 0.3374, + "step": 2902 + }, + { + "epoch": 0.5808, + "learning_rate": 1.9146777171385053e-05, + "loss": 0.2485, + "step": 2904 + }, + { + "epoch": 0.5812, + "learning_rate": 1.9152411726209172e-05, + "loss": 0.6623, + "step": 2906 + }, + { + "epoch": 0.5816, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.9272, + "step": 2908 + }, + { + "epoch": 0.582, + "learning_rate": 1.916362729562239e-05, + "loss": 0.4362, + "step": 2910 + }, + { + "epoch": 0.5824, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.2934, + "step": 2912 + }, + { + "epoch": 0.5828, + "learning_rate": 1.9174771405229187e-05, + "loss": 0.233, + "step": 2914 + }, + { + "epoch": 0.5832, + "learning_rate": 1.9180316635425876e-05, + "loss": 0.4501, + "step": 2916 + }, + { + "epoch": 0.5836, + "learning_rate": 1.9185843968125543e-05, + "loss": 0.2608, + "step": 2918 + }, + { + "epoch": 0.584, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.0836, + "step": 2920 + }, + { + "epoch": 0.5844, + "learning_rate": 1.919684489796539e-05, + "loss": 0.2386, + "step": 2922 + }, + { + "epoch": 0.5848, + "learning_rate": 1.9202318473658703e-05, + "loss": 0.3728, + "step": 2924 + }, + { + "epoch": 0.5852, + "learning_rate": 1.9207774108961273e-05, + "loss": 0.2449, + "step": 2926 + }, + { + "epoch": 0.5856, + "learning_rate": 1.9213211793237052e-05, + "loss": 0.5553, + "step": 2928 + }, + { + "epoch": 0.586, + "learning_rate": 1.9218631515885004e-05, + "loss": 0.3882, + "step": 2930 + }, + { + "epoch": 0.5864, + "learning_rate": 1.92240332663391e-05, + "loss": 0.353, + "step": 2932 + }, + { + "epoch": 0.5868, + "learning_rate": 1.922941703406835e-05, + "loss": 0.5851, + "step": 2934 + }, + { + "epoch": 0.5872, + "learning_rate": 1.923478280857682e-05, + "loss": 0.4484, + "step": 2936 + }, + { + "epoch": 0.5876, + "learning_rate": 1.9240130579403663e-05, + "loss": 0.4564, + "step": 2938 + }, + { + "epoch": 0.588, + "learning_rate": 1.924546033612313e-05, + "loss": 0.3117, + "step": 2940 + }, + { + "epoch": 0.5884, + "learning_rate": 1.9250772068344577e-05, + "loss": 0.5023, + "step": 2942 + }, + { + "epoch": 0.5888, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.147, + "step": 2944 + }, + { + "epoch": 0.5892, + "learning_rate": 1.9261341417906615e-05, + "loss": 0.1917, + "step": 2946 + }, + { + "epoch": 0.5896, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.45, + "step": 2948 + }, + { + "epoch": 0.59, + "learning_rate": 1.9271838545667876e-05, + "loss": 0.1353, + "step": 2950 + }, + { + "epoch": 0.5904, + "learning_rate": 1.927706000077034e-05, + "loss": 0.4902, + "step": 2952 + }, + { + "epoch": 0.5908, + "learning_rate": 1.9282263369769633e-05, + "loss": 0.1986, + "step": 2954 + }, + { + "epoch": 0.5912, + "learning_rate": 1.9287448642521507e-05, + "loss": 0.3804, + "step": 2956 + }, + { + "epoch": 0.5916, + "learning_rate": 1.9292615808917024e-05, + "loss": 0.5828, + "step": 2958 + }, + { + "epoch": 0.592, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.4453, + "step": 2960 + }, + { + "epoch": 0.5924, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.5947, + "step": 2962 + }, + { + "epoch": 0.5928, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.1968, + "step": 2964 + }, + { + "epoch": 0.5932, + "learning_rate": 1.9313103209992205e-05, + "loss": 0.6335, + "step": 2966 + }, + { + "epoch": 0.5936, + "learning_rate": 1.9318179694207722e-05, + "loss": 0.1334, + "step": 2968 + }, + { + "epoch": 0.594, + "learning_rate": 1.932323801215512e-05, + "loss": 0.2482, + "step": 2970 + }, + { + "epoch": 0.5944, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.2898, + "step": 2972 + }, + { + "epoch": 0.5948, + "learning_rate": 1.933330010983518e-05, + "loss": 0.9807, + "step": 2974 + }, + { + "epoch": 0.5952, + "learning_rate": 1.9338303869951266e-05, + "loss": 0.3055, + "step": 2976 + }, + { + "epoch": 0.5956, + "learning_rate": 1.934328942456612e-05, + "loss": 0.299, + "step": 2978 + }, + { + "epoch": 0.596, + "learning_rate": 1.934825676396015e-05, + "loss": 1.6104, + "step": 2980 + }, + { + "epoch": 0.5964, + "learning_rate": 1.9353205878449257e-05, + "loss": 0.4087, + "step": 2982 + }, + { + "epoch": 0.5968, + "learning_rate": 1.935813675838491e-05, + "loss": 0.3544, + "step": 2984 + }, + { + "epoch": 0.5972, + "learning_rate": 1.9363049394154088e-05, + "loss": 0.3972, + "step": 2986 + }, + { + "epoch": 0.5976, + "learning_rate": 1.9367943776179375e-05, + "loss": 0.8196, + "step": 2988 + }, + { + "epoch": 0.598, + "learning_rate": 1.937281989491892e-05, + "loss": 0.2637, + "step": 2990 + }, + { + "epoch": 0.5984, + "learning_rate": 1.9377677740866457e-05, + "loss": 0.5511, + "step": 2992 + }, + { + "epoch": 0.5988, + "learning_rate": 1.9382517304551397e-05, + "loss": 0.2166, + "step": 2994 + }, + { + "epoch": 0.5992, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.4095, + "step": 2996 + }, + { + "epoch": 0.5996, + "learning_rate": 1.9392141547429183e-05, + "loss": 0.469, + "step": 2998 + }, + { + "epoch": 0.6, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.275, + "step": 3000 + }, + { + "epoch": 0.6004, + "learning_rate": 1.9401692548500504e-05, + "loss": 0.3073, + "step": 3002 + }, + { + "epoch": 0.6008, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.5558, + "step": 3004 + }, + { + "epoch": 0.6012, + "learning_rate": 1.9411170233284728e-05, + "loss": 0.3969, + "step": 3006 + }, + { + "epoch": 0.6016, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.3593, + "step": 3008 + }, + { + "epoch": 0.602, + "learning_rate": 1.942057452787297e-05, + "loss": 0.4463, + "step": 3010 + }, + { + "epoch": 0.6024, + "learning_rate": 1.942524913090354e-05, + "loss": 0.2929, + "step": 3012 + }, + { + "epoch": 0.6028, + "learning_rate": 1.9429905358928645e-05, + "loss": 0.4766, + "step": 3014 + }, + { + "epoch": 0.6032, + "learning_rate": 1.9434543202870723e-05, + "loss": 0.0977, + "step": 3016 + }, + { + "epoch": 0.6036, + "learning_rate": 1.9439162653688063e-05, + "loss": 0.4795, + "step": 3018 + }, + { + "epoch": 0.604, + "learning_rate": 1.9443763702374815e-05, + "loss": 0.3406, + "step": 3020 + }, + { + "epoch": 0.6044, + "learning_rate": 1.944834633996098e-05, + "loss": 0.5806, + "step": 3022 + }, + { + "epoch": 0.6048, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.1126, + "step": 3024 + }, + { + "epoch": 0.6052, + "learning_rate": 1.9457456346131172e-05, + "loss": 0.1443, + "step": 3026 + }, + { + "epoch": 0.6056, + "learning_rate": 1.9461983696954756e-05, + "loss": 0.1466, + "step": 3028 + }, + { + "epoch": 0.606, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.3687, + "step": 3030 + }, + { + "epoch": 0.6064, + "learning_rate": 1.947098304994744e-05, + "loss": 0.7448, + "step": 3032 + }, + { + "epoch": 0.6068, + "learning_rate": 1.947545503457184e-05, + "loss": 0.6954, + "step": 3034 + }, + { + "epoch": 0.6072, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.271, + "step": 3036 + }, + { + "epoch": 0.6076, + "learning_rate": 1.9484343576484935e-05, + "loss": 0.3586, + "step": 3038 + }, + { + "epoch": 0.608, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.2513, + "step": 3040 + }, + { + "epoch": 0.6084, + "learning_rate": 1.949315815758161e-05, + "loss": 0.4403, + "step": 3042 + }, + { + "epoch": 0.6088, + "learning_rate": 1.949753769132067e-05, + "loss": 0.6838, + "step": 3044 + }, + { + "epoch": 0.6092, + "learning_rate": 1.9501898709124008e-05, + "loss": 0.2401, + "step": 3046 + }, + { + "epoch": 0.6096, + "learning_rate": 1.95062412024896e-05, + "loss": 0.2543, + "step": 3048 + }, + { + "epoch": 0.61, + "learning_rate": 1.9510565162951534e-05, + "loss": 0.9023, + "step": 3050 + }, + { + "epoch": 0.6104, + "learning_rate": 1.951487058208003e-05, + "loss": 0.4675, + "step": 3052 + }, + { + "epoch": 0.6108, + "learning_rate": 1.9519157451481453e-05, + "loss": 0.1379, + "step": 3054 + }, + { + "epoch": 0.6112, + "learning_rate": 1.952342576279833e-05, + "loss": 0.3731, + "step": 3056 + }, + { + "epoch": 0.6116, + "learning_rate": 1.9527675507709364e-05, + "loss": 0.1114, + "step": 3058 + }, + { + "epoch": 0.612, + "learning_rate": 1.953190667792947e-05, + "loss": 0.5675, + "step": 3060 + }, + { + "epoch": 0.6124, + "learning_rate": 1.953611926520976e-05, + "loss": 0.3667, + "step": 3062 + }, + { + "epoch": 0.6128, + "learning_rate": 1.9540313261337578e-05, + "loss": 0.5543, + "step": 3064 + }, + { + "epoch": 0.6132, + "learning_rate": 1.9544488658136522e-05, + "loss": 0.1604, + "step": 3066 + }, + { + "epoch": 0.6136, + "learning_rate": 1.954864544746643e-05, + "loss": 0.4575, + "step": 3068 + }, + { + "epoch": 0.614, + "learning_rate": 1.955278362122344e-05, + "loss": 0.0978, + "step": 3070 + }, + { + "epoch": 0.6144, + "learning_rate": 1.955690317133996e-05, + "loss": 0.2581, + "step": 3072 + }, + { + "epoch": 0.6148, + "learning_rate": 1.9561004089784726e-05, + "loss": 0.5175, + "step": 3074 + }, + { + "epoch": 0.6152, + "learning_rate": 1.956508636856278e-05, + "loss": 0.2001, + "step": 3076 + }, + { + "epoch": 0.6156, + "learning_rate": 1.956914999971551e-05, + "loss": 0.2532, + "step": 3078 + }, + { + "epoch": 0.616, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.271, + "step": 3080 + }, + { + "epoch": 0.6164, + "learning_rate": 1.9577221287492368e-05, + "loss": 0.4446, + "step": 3082 + }, + { + "epoch": 0.6168, + "learning_rate": 1.95812289283811e-05, + "loss": 0.6833, + "step": 3084 + }, + { + "epoch": 0.6172, + "learning_rate": 1.958521789017376e-05, + "loss": 0.0902, + "step": 3086 + }, + { + "epoch": 0.6176, + "learning_rate": 1.958918816509367e-05, + "loss": 0.4855, + "step": 3088 + }, + { + "epoch": 0.618, + "learning_rate": 1.9593139745400575e-05, + "loss": 0.2153, + "step": 3090 + }, + { + "epoch": 0.6184, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.4365, + "step": 3092 + }, + { + "epoch": 0.6188, + "learning_rate": 1.9600986791396597e-05, + "loss": 0.6754, + "step": 3094 + }, + { + "epoch": 0.6192, + "learning_rate": 1.9604882241787496e-05, + "loss": 0.2218, + "step": 3096 + }, + { + "epoch": 0.6196, + "learning_rate": 1.9608758966968983e-05, + "loss": 0.3194, + "step": 3098 + }, + { + "epoch": 0.62, + "learning_rate": 1.9612616959383187e-05, + "loss": 0.2613, + "step": 3100 + }, + { + "epoch": 0.6204, + "learning_rate": 1.9616456211508752e-05, + "loss": 0.2173, + "step": 3102 + }, + { + "epoch": 0.6208, + "learning_rate": 1.9620276715860856e-05, + "loss": 0.6392, + "step": 3104 + }, + { + "epoch": 0.6212, + "learning_rate": 1.962407846499124e-05, + "loss": 0.5724, + "step": 3106 + }, + { + "epoch": 0.6216, + "learning_rate": 1.9627861451488187e-05, + "loss": 0.6815, + "step": 3108 + }, + { + "epoch": 0.622, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.6591, + "step": 3110 + }, + { + "epoch": 0.6224, + "learning_rate": 1.963537110711789e-05, + "loss": 0.4591, + "step": 3112 + }, + { + "epoch": 0.6228, + "learning_rate": 1.9639097761610174e-05, + "loss": 0.6167, + "step": 3114 + }, + { + "epoch": 0.6232, + "learning_rate": 1.964280562418815e-05, + "loss": 0.2654, + "step": 3116 + }, + { + "epoch": 0.6236, + "learning_rate": 1.964649468762313e-05, + "loss": 0.2223, + "step": 3118 + }, + { + "epoch": 0.624, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.5332, + "step": 3120 + }, + { + "epoch": 0.6244, + "learning_rate": 1.965381638833274e-05, + "loss": 0.3946, + "step": 3122 + }, + { + "epoch": 0.6248, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.492, + "step": 3124 + }, + { + "epoch": 0.6252, + "learning_rate": 1.96610628066429e-05, + "loss": 0.2532, + "step": 3126 + }, + { + "epoch": 0.6256, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.3132, + "step": 3128 + }, + { + "epoch": 0.626, + "learning_rate": 1.9668233886044594e-05, + "loss": 0.2219, + "step": 3130 + }, + { + "epoch": 0.6264, + "learning_rate": 1.967179115615633e-05, + "loss": 0.3411, + "step": 3132 + }, + { + "epoch": 0.6268, + "learning_rate": 1.96753295706163e-05, + "loss": 0.2371, + "step": 3134 + }, + { + "epoch": 0.6272, + "learning_rate": 1.967884912252619e-05, + "loss": 0.2038, + "step": 3136 + }, + { + "epoch": 0.6276, + "learning_rate": 1.9682349805024443e-05, + "loss": 0.3467, + "step": 3138 + }, + { + "epoch": 0.628, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.6589, + "step": 3140 + }, + { + "epoch": 0.6284, + "learning_rate": 1.9689294534523833e-05, + "loss": 0.3652, + "step": 3142 + }, + { + "epoch": 0.6288, + "learning_rate": 1.969273856798585e-05, + "loss": 0.6779, + "step": 3144 + }, + { + "epoch": 0.6292, + "learning_rate": 1.969616370495806e-05, + "loss": 0.6917, + "step": 3146 + }, + { + "epoch": 0.6296, + "learning_rate": 1.9699569938762972e-05, + "loss": 0.3718, + "step": 3148 + }, + { + "epoch": 0.63, + "learning_rate": 1.9702957262759964e-05, + "loss": 0.5987, + "step": 3150 + }, + { + "epoch": 0.6304, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.8134, + "step": 3152 + }, + { + "epoch": 0.6308, + "learning_rate": 1.9709675154952013e-05, + "loss": 0.4146, + "step": 3154 + }, + { + "epoch": 0.6312, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.4077, + "step": 3156 + }, + { + "epoch": 0.6316, + "learning_rate": 1.971631732914674e-05, + "loss": 0.1297, + "step": 3158 + }, + { + "epoch": 0.632, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.2461, + "step": 3160 + }, + { + "epoch": 0.6324, + "learning_rate": 1.9722883733547128e-05, + "loss": 0.4181, + "step": 3162 + }, + { + "epoch": 0.6328, + "learning_rate": 1.9726138506049434e-05, + "loss": 0.4337, + "step": 3164 + }, + { + "epoch": 0.6332, + "learning_rate": 1.972937431694704e-05, + "loss": 0.3711, + "step": 3166 + }, + { + "epoch": 0.6336, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.0643, + "step": 3168 + }, + { + "epoch": 0.634, + "learning_rate": 1.9735789028731603e-05, + "loss": 0.3915, + "step": 3170 + }, + { + "epoch": 0.6344, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.3595, + "step": 3172 + }, + { + "epoch": 0.6348, + "learning_rate": 1.9742127818877605e-05, + "loss": 0.2648, + "step": 3174 + }, + { + "epoch": 0.6352, + "learning_rate": 1.974526872786577e-05, + "loss": 0.1521, + "step": 3176 + }, + { + "epoch": 0.6356, + "learning_rate": 1.974839063795389e-05, + "loss": 0.3582, + "step": 3178 + }, + { + "epoch": 0.636, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.3462, + "step": 3180 + }, + { + "epoch": 0.6364, + "learning_rate": 1.975457743712173e-05, + "loss": 0.2345, + "step": 3182 + }, + { + "epoch": 0.6368, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.4006, + "step": 3184 + }, + { + "epoch": 0.6372, + "learning_rate": 1.976068816813523e-05, + "loss": 1.0042, + "step": 3186 + }, + { + "epoch": 0.6376, + "learning_rate": 1.976371499316945e-05, + "loss": 0.4905, + "step": 3188 + }, + { + "epoch": 0.638, + "learning_rate": 1.9766722783341675e-05, + "loss": 0.6901, + "step": 3190 + }, + { + "epoch": 0.6384, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.3572, + "step": 3192 + }, + { + "epoch": 0.6388, + "learning_rate": 1.9772681235681933e-05, + "loss": 0.616, + "step": 3194 + }, + { + "epoch": 0.6392, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.1736, + "step": 3196 + }, + { + "epoch": 0.6396, + "learning_rate": 1.977856347869079e-05, + "loss": 0.0096, + "step": 3198 + }, + { + "epoch": 0.64, + "learning_rate": 1.9781476007338054e-05, + "loss": 0.5169, + "step": 3200 + }, + { + "epoch": 0.6404, + "learning_rate": 1.9784369466497333e-05, + "loss": 0.4037, + "step": 3202 + }, + { + "epoch": 0.6408, + "learning_rate": 1.978724385052766e-05, + "loss": 0.6378, + "step": 3204 + }, + { + "epoch": 0.6412, + "learning_rate": 1.97900991538253e-05, + "loss": 0.2244, + "step": 3206 + }, + { + "epoch": 0.6416, + "learning_rate": 1.9792935370823673e-05, + "loss": 0.4333, + "step": 3208 + }, + { + "epoch": 0.642, + "learning_rate": 1.979575249599344e-05, + "loss": 0.4466, + "step": 3210 + }, + { + "epoch": 0.6424, + "learning_rate": 1.979855052384247e-05, + "loss": 0.907, + "step": 3212 + }, + { + "epoch": 0.6428, + "learning_rate": 1.980132944891586e-05, + "loss": 0.8047, + "step": 3214 + }, + { + "epoch": 0.6432, + "learning_rate": 1.9804089265795956e-05, + "loss": 0.2624, + "step": 3216 + }, + { + "epoch": 0.6436, + "learning_rate": 1.9806829969102356e-05, + "loss": 0.5118, + "step": 3218 + }, + { + "epoch": 0.644, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.3744, + "step": 3220 + }, + { + "epoch": 0.6444, + "learning_rate": 1.981225401365877e-05, + "loss": 0.3694, + "step": 3222 + }, + { + "epoch": 0.6448, + "learning_rate": 1.981493734433433e-05, + "loss": 0.554, + "step": 3224 + }, + { + "epoch": 0.6452, + "learning_rate": 1.981760154028731e-05, + "loss": 0.1444, + "step": 3226 + }, + { + "epoch": 0.6456, + "learning_rate": 1.982024659632372e-05, + "loss": 0.1377, + "step": 3228 + }, + { + "epoch": 0.646, + "learning_rate": 1.9822872507286887e-05, + "loss": 0.2529, + "step": 3230 + }, + { + "epoch": 0.6464, + "learning_rate": 1.9825479268057472e-05, + "loss": 0.3273, + "step": 3232 + }, + { + "epoch": 0.6468, + "learning_rate": 1.9828066873553445e-05, + "loss": 0.6496, + "step": 3234 + }, + { + "epoch": 0.6472, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.3853, + "step": 3236 + }, + { + "epoch": 0.6476, + "learning_rate": 1.983318459858028e-05, + "loss": 0.2348, + "step": 3238 + }, + { + "epoch": 0.648, + "learning_rate": 1.9835714708133858e-05, + "loss": 0.3269, + "step": 3240 + }, + { + "epoch": 0.6484, + "learning_rate": 1.983822564245833e-05, + "loss": 0.6148, + "step": 3242 + }, + { + "epoch": 0.6488, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.5324, + "step": 3244 + }, + { + "epoch": 0.6492, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.4096, + "step": 3246 + }, + { + "epoch": 0.6496, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.1341, + "step": 3248 + }, + { + "epoch": 0.65, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.3241, + "step": 3250 + }, + { + "epoch": 0.6504, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.1163, + "step": 3252 + }, + { + "epoch": 0.6508, + "learning_rate": 1.9852888297080785e-05, + "loss": 0.1876, + "step": 3254 + }, + { + "epoch": 0.6512, + "learning_rate": 1.985526486983063e-05, + "loss": 0.2528, + "step": 3256 + }, + { + "epoch": 0.6516, + "learning_rate": 1.9857622229237315e-05, + "loss": 0.2096, + "step": 3258 + }, + { + "epoch": 0.652, + "learning_rate": 1.985996037070505e-05, + "loss": 0.6599, + "step": 3260 + }, + { + "epoch": 0.6524, + "learning_rate": 1.986227928967551e-05, + "loss": 0.3456, + "step": 3262 + }, + { + "epoch": 0.6528, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.2582, + "step": 3264 + }, + { + "epoch": 0.6532, + "learning_rate": 1.986685944207868e-05, + "loss": 0.2767, + "step": 3266 + }, + { + "epoch": 0.6536, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.4581, + "step": 3268 + }, + { + "epoch": 0.654, + "learning_rate": 1.9871362650729877e-05, + "loss": 0.2086, + "step": 3270 + }, + { + "epoch": 0.6544, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.695, + "step": 3272 + }, + { + "epoch": 0.6548, + "learning_rate": 1.9875788880512183e-05, + "loss": 0.4846, + "step": 3274 + }, + { + "epoch": 0.6552, + "learning_rate": 1.987797311751759e-05, + "loss": 0.2572, + "step": 3276 + }, + { + "epoch": 0.6556, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.8055, + "step": 3278 + }, + { + "epoch": 0.656, + "learning_rate": 1.9882283814465528e-05, + "loss": 0.303, + "step": 3280 + }, + { + "epoch": 0.6564, + "learning_rate": 1.9884410266004134e-05, + "loss": 0.8064, + "step": 3282 + }, + { + "epoch": 0.6568, + "learning_rate": 1.988651744737914e-05, + "loss": 0.3538, + "step": 3284 + }, + { + "epoch": 0.6572, + "learning_rate": 1.9888605354482494e-05, + "loss": 0.3561, + "step": 3286 + }, + { + "epoch": 0.6576, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.2092, + "step": 3288 + }, + { + "epoch": 0.658, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.4701, + "step": 3290 + }, + { + "epoch": 0.6584, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.1267, + "step": 3292 + }, + { + "epoch": 0.6588, + "learning_rate": 1.989676415933351e-05, + "loss": 0.3411, + "step": 3294 + }, + { + "epoch": 0.6592, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.2245, + "step": 3296 + }, + { + "epoch": 0.6596, + "learning_rate": 1.9900727812082174e-05, + "loss": 0.0913, + "step": 3298 + }, + { + "epoch": 0.66, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.221, + "step": 3300 + }, + { + "epoch": 0.6604, + "learning_rate": 1.9904614256966514e-05, + "loss": 0.6643, + "step": 3302 + }, + { + "epoch": 0.6608, + "learning_rate": 1.9906528516965014e-05, + "loss": 0.5714, + "step": 3304 + }, + { + "epoch": 0.6612, + "learning_rate": 1.9908423463679246e-05, + "loss": 0.5798, + "step": 3306 + }, + { + "epoch": 0.6616, + "learning_rate": 1.9910299093414926e-05, + "loss": 0.1887, + "step": 3308 + }, + { + "epoch": 0.662, + "learning_rate": 1.991215540251542e-05, + "loss": 0.2578, + "step": 3310 + }, + { + "epoch": 0.6624, + "learning_rate": 1.9913992387361744e-05, + "loss": 0.5057, + "step": 3312 + }, + { + "epoch": 0.6628, + "learning_rate": 1.9915810044372618e-05, + "loss": 0.6017, + "step": 3314 + }, + { + "epoch": 0.6632, + "learning_rate": 1.9917608370004414e-05, + "loss": 0.5663, + "step": 3316 + }, + { + "epoch": 0.6636, + "learning_rate": 1.9919387360751216e-05, + "loss": 0.1981, + "step": 3318 + }, + { + "epoch": 0.664, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.2789, + "step": 3320 + }, + { + "epoch": 0.6644, + "learning_rate": 1.992288732375458e-05, + "loss": 0.7267, + "step": 3322 + }, + { + "epoch": 0.6648, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.3839, + "step": 3324 + }, + { + "epoch": 0.6652, + "learning_rate": 1.992630990608929e-05, + "loss": 0.3621, + "step": 3326 + }, + { + "epoch": 0.6656, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.2789, + "step": 3328 + }, + { + "epoch": 0.666, + "learning_rate": 1.992965508106537e-05, + "loss": 0.4148, + "step": 3330 + }, + { + "epoch": 0.6664, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.6315, + "step": 3332 + }, + { + "epoch": 0.6668, + "learning_rate": 1.993292282259647e-05, + "loss": 0.2781, + "step": 3334 + }, + { + "epoch": 0.6672, + "learning_rate": 1.9934527647833276e-05, + "loss": 1.7686, + "step": 3336 + }, + { + "epoch": 0.6676, + "learning_rate": 1.9936113105200085e-05, + "loss": 0.4896, + "step": 3338 + }, + { + "epoch": 0.668, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.8246, + "step": 3340 + }, + { + "epoch": 0.6684, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.2005, + "step": 3342 + }, + { + "epoch": 0.6688, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.2305, + "step": 3344 + }, + { + "epoch": 0.6692, + "learning_rate": 1.9942261194715236e-05, + "loss": 0.2591, + "step": 3346 + }, + { + "epoch": 0.6696, + "learning_rate": 1.994374976712348e-05, + "loss": 0.4019, + "step": 3348 + }, + { + "epoch": 0.67, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.5286, + "step": 3350 + }, + { + "epoch": 0.6704, + "learning_rate": 1.994666875152874e-05, + "loss": 0.4193, + "step": 3352 + }, + { + "epoch": 0.6708, + "learning_rate": 1.994809915783505e-05, + "loss": 0.4511, + "step": 3354 + }, + { + "epoch": 0.6712, + "learning_rate": 1.9949510169813003e-05, + "loss": 0.2584, + "step": 3356 + }, + { + "epoch": 0.6716, + "learning_rate": 1.9950901784711768e-05, + "loss": 0.3761, + "step": 3358 + }, + { + "epoch": 0.672, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.4273, + "step": 3360 + }, + { + "epoch": 0.6724, + "learning_rate": 1.995362681245744e-05, + "loss": 0.4113, + "step": 3362 + }, + { + "epoch": 0.6728, + "learning_rate": 1.995496021999177e-05, + "loss": 0.1796, + "step": 3364 + }, + { + "epoch": 0.6732, + "learning_rate": 1.995627421982176e-05, + "loss": 0.6577, + "step": 3366 + }, + { + "epoch": 0.6736, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.2723, + "step": 3368 + }, + { + "epoch": 0.674, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.2148, + "step": 3370 + }, + { + "epoch": 0.6744, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.1762, + "step": 3372 + }, + { + "epoch": 0.6748, + "learning_rate": 1.9961336091431725e-05, + "loss": 0.3485, + "step": 3374 + }, + { + "epoch": 0.6752, + "learning_rate": 1.996255301507125e-05, + "loss": 0.1408, + "step": 3376 + }, + { + "epoch": 0.6756, + "learning_rate": 1.9963750516203884e-05, + "loss": 0.5658, + "step": 3378 + }, + { + "epoch": 0.676, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.3747, + "step": 3380 + }, + { + "epoch": 0.6764, + "learning_rate": 1.996608724164801e-05, + "loss": 0.8133, + "step": 3382 + }, + { + "epoch": 0.6768, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.5632, + "step": 3384 + }, + { + "epoch": 0.6772, + "learning_rate": 1.9968346249541848e-05, + "loss": 0.344, + "step": 3386 + }, + { + "epoch": 0.6776, + "learning_rate": 1.996944660387867e-05, + "loss": 0.4145, + "step": 3388 + }, + { + "epoch": 0.678, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.0796, + "step": 3390 + }, + { + "epoch": 0.6784, + "learning_rate": 1.997158900260614e-05, + "loss": 0.2299, + "step": 3392 + }, + { + "epoch": 0.6788, + "learning_rate": 1.997263104282007e-05, + "loss": 0.4028, + "step": 3394 + }, + { + "epoch": 0.6792, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.832, + "step": 3396 + }, + { + "epoch": 0.6796, + "learning_rate": 1.9974656794790777e-05, + "loss": 0.6258, + "step": 3398 + }, + { + "epoch": 0.68, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.2126, + "step": 3400 + }, + { + "epoch": 0.6804, + "learning_rate": 1.99766047623841e-05, + "loss": 0.1655, + "step": 3402 + }, + { + "epoch": 0.6808, + "learning_rate": 1.997754957226847e-05, + "loss": 0.392, + "step": 3404 + }, + { + "epoch": 0.6812, + "learning_rate": 1.9978474930409396e-05, + "loss": 0.3815, + "step": 3406 + }, + { + "epoch": 0.6816, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.4513, + "step": 3408 + }, + { + "epoch": 0.682, + "learning_rate": 1.9980267284282718e-05, + "loss": 0.684, + "step": 3410 + }, + { + "epoch": 0.6824, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.7855, + "step": 3412 + }, + { + "epoch": 0.6828, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.2719, + "step": 3414 + }, + { + "epoch": 0.6832, + "learning_rate": 1.998280988314872e-05, + "loss": 0.2801, + "step": 3416 + }, + { + "epoch": 0.6836, + "learning_rate": 1.9983618494271825e-05, + "loss": 0.2722, + "step": 3418 + }, + { + "epoch": 0.684, + "learning_rate": 1.998440764181981e-05, + "loss": 0.1707, + "step": 3420 + }, + { + "epoch": 0.6844, + "learning_rate": 1.99851773242542e-05, + "loss": 0.3289, + "step": 3422 + }, + { + "epoch": 0.6848, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.3892, + "step": 3424 + }, + { + "epoch": 0.6852, + "learning_rate": 1.9986658287817992e-05, + "loss": 0.2403, + "step": 3426 + }, + { + "epoch": 0.6856, + "learning_rate": 1.998736956606018e-05, + "loss": 0.3955, + "step": 3428 + }, + { + "epoch": 0.686, + "learning_rate": 1.9988061373414342e-05, + "loss": 0.1709, + "step": 3430 + }, + { + "epoch": 0.6864, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.294, + "step": 3432 + }, + { + "epoch": 0.6868, + "learning_rate": 1.9989386570101712e-05, + "loss": 0.3262, + "step": 3434 + }, + { + "epoch": 0.6872, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.7574, + "step": 3436 + }, + { + "epoch": 0.6876, + "learning_rate": 1.9990633867545956e-05, + "loss": 0.2661, + "step": 3438 + }, + { + "epoch": 0.688, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.4533, + "step": 3440 + }, + { + "epoch": 0.6884, + "learning_rate": 1.9991803256020393e-05, + "loss": 1.2708, + "step": 3442 + }, + { + "epoch": 0.6888, + "learning_rate": 1.999235873152047e-05, + "loss": 0.3983, + "step": 3444 + }, + { + "epoch": 0.6892, + "learning_rate": 1.9992894726405894e-05, + "loss": 0.2866, + "step": 3446 + }, + { + "epoch": 0.6896, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.5536, + "step": 3448 + }, + { + "epoch": 0.69, + "learning_rate": 1.999390827019096e-05, + "loss": 0.7414, + "step": 3450 + }, + { + "epoch": 0.6904, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.6119, + "step": 3452 + }, + { + "epoch": 0.6908, + "learning_rate": 1.999484387947177e-05, + "loss": 0.2394, + "step": 3454 + }, + { + "epoch": 0.6912, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.1891, + "step": 3456 + }, + { + "epoch": 0.6916, + "learning_rate": 1.9995701546952252e-05, + "loss": 0.3597, + "step": 3458 + }, + { + "epoch": 0.692, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.4461, + "step": 3460 + }, + { + "epoch": 0.6924, + "learning_rate": 1.9996481265944146e-05, + "loss": 0.2969, + "step": 3462 + }, + { + "epoch": 0.6928, + "learning_rate": 1.9996841892833e-05, + "loss": 0.3047, + "step": 3464 + }, + { + "epoch": 0.6932, + "learning_rate": 1.999718303036705e-05, + "loss": 0.6878, + "step": 3466 + }, + { + "epoch": 0.6936, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.5323, + "step": 3468 + }, + { + "epoch": 0.694, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.2311, + "step": 3470 + }, + { + "epoch": 0.6944, + "learning_rate": 1.999808950037968e-05, + "loss": 0.3805, + "step": 3472 + }, + { + "epoch": 0.6948, + "learning_rate": 1.9998352674223816e-05, + "loss": 0.4133, + "step": 3474 + }, + { + "epoch": 0.6952, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.3408, + "step": 3476 + }, + { + "epoch": 0.6956, + "learning_rate": 1.999882054453657e-05, + "loss": 0.2311, + "step": 3478 + }, + { + "epoch": 0.696, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.0774, + "step": 3480 + }, + { + "epoch": 0.6964, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.2566, + "step": 3482 + }, + { + "epoch": 0.6968, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.6279, + "step": 3484 + }, + { + "epoch": 0.6972, + "learning_rate": 1.99995223636881e-05, + "loss": 0.2741, + "step": 3486 + }, + { + "epoch": 0.6976, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.4703, + "step": 3488 + }, + { + "epoch": 0.698, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.6449, + "step": 3490 + }, + { + "epoch": 0.6984, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.6222, + "step": 3492 + }, + { + "epoch": 0.6988, + "learning_rate": 1.9999912270311376e-05, + "loss": 0.6077, + "step": 3494 + }, + { + "epoch": 0.6992, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.6108, + "step": 3496 + }, + { + "epoch": 0.6996, + "learning_rate": 1.9999990252244153e-05, + "loss": 0.5728, + "step": 3498 + }, + { + "epoch": 0.7, + "learning_rate": 2e-05, + "loss": 0.3419, + "step": 3500 + }, + { + "epoch": 0.7004, + "learning_rate": 1.9999990252244153e-05, + "loss": 0.5606, + "step": 3502 + }, + { + "epoch": 0.7008, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.4986, + "step": 3504 + }, + { + "epoch": 0.7012, + "learning_rate": 1.9999912270311376e-05, + "loss": 0.2225, + "step": 3506 + }, + { + "epoch": 0.7016, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.2449, + "step": 3508 + }, + { + "epoch": 0.702, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.2541, + "step": 3510 + }, + { + "epoch": 0.7024, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.4902, + "step": 3512 + }, + { + "epoch": 0.7028, + "learning_rate": 1.99995223636881e-05, + "loss": 0.1641, + "step": 3514 + }, + { + "epoch": 0.7032, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.3167, + "step": 3516 + }, + { + "epoch": 0.7036, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.324, + "step": 3518 + }, + { + "epoch": 0.704, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.2836, + "step": 3520 + }, + { + "epoch": 0.7044, + "learning_rate": 1.999882054453657e-05, + "loss": 0.4192, + "step": 3522 + }, + { + "epoch": 0.7048, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.4379, + "step": 3524 + }, + { + "epoch": 0.7052, + "learning_rate": 1.9998352674223816e-05, + "loss": 0.3336, + "step": 3526 + }, + { + "epoch": 0.7056, + "learning_rate": 1.999808950037968e-05, + "loss": 0.2089, + "step": 3528 + }, + { + "epoch": 0.706, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.4131, + "step": 3530 + }, + { + "epoch": 0.7064, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.1525, + "step": 3532 + }, + { + "epoch": 0.7068, + "learning_rate": 1.999718303036705e-05, + "loss": 0.2674, + "step": 3534 + }, + { + "epoch": 0.7072, + "learning_rate": 1.9996841892833e-05, + "loss": 0.6451, + "step": 3536 + }, + { + "epoch": 0.7076, + "learning_rate": 1.9996481265944146e-05, + "loss": 0.0169, + "step": 3538 + }, + { + "epoch": 0.708, + "learning_rate": 1.9996101150403547e-05, + "loss": 0.6018, + "step": 3540 + }, + { + "epoch": 0.7084, + "learning_rate": 1.9995701546952252e-05, + "loss": 0.1616, + "step": 3542 + }, + { + "epoch": 0.7088, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.947, + "step": 3544 + }, + { + "epoch": 0.7092, + "learning_rate": 1.9994843879471766e-05, + "loss": 0.3946, + "step": 3546 + }, + { + "epoch": 0.7096, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.1565, + "step": 3548 + }, + { + "epoch": 0.71, + "learning_rate": 1.999390827019096e-05, + "loss": 0.2182, + "step": 3550 + }, + { + "epoch": 0.7104, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.3062, + "step": 3552 + }, + { + "epoch": 0.7108, + "learning_rate": 1.9992894726405898e-05, + "loss": 0.3799, + "step": 3554 + }, + { + "epoch": 0.7112, + "learning_rate": 1.999235873152047e-05, + "loss": 0.347, + "step": 3556 + }, + { + "epoch": 0.7116, + "learning_rate": 1.9991803256020393e-05, + "loss": 0.2475, + "step": 3558 + }, + { + "epoch": 0.712, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.4497, + "step": 3560 + }, + { + "epoch": 0.7124, + "learning_rate": 1.9990633867545956e-05, + "loss": 0.2116, + "step": 3562 + }, + { + "epoch": 0.7128, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.276, + "step": 3564 + }, + { + "epoch": 0.7132, + "learning_rate": 1.9989386570101716e-05, + "loss": 1.1488, + "step": 3566 + }, + { + "epoch": 0.7136, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.5636, + "step": 3568 + }, + { + "epoch": 0.714, + "learning_rate": 1.9988061373414342e-05, + "loss": 0.1948, + "step": 3570 + }, + { + "epoch": 0.7144, + "learning_rate": 1.998736956606018e-05, + "loss": 0.6853, + "step": 3572 + }, + { + "epoch": 0.7148, + "learning_rate": 1.998665828781799e-05, + "loss": 1.031, + "step": 3574 + }, + { + "epoch": 0.7152, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.2748, + "step": 3576 + }, + { + "epoch": 0.7156, + "learning_rate": 1.99851773242542e-05, + "loss": 0.3538, + "step": 3578 + }, + { + "epoch": 0.716, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.4645, + "step": 3580 + }, + { + "epoch": 0.7164, + "learning_rate": 1.9983618494271825e-05, + "loss": 0.4368, + "step": 3582 + }, + { + "epoch": 0.7168, + "learning_rate": 1.998280988314872e-05, + "loss": 0.9549, + "step": 3584 + }, + { + "epoch": 0.7172, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.3611, + "step": 3586 + }, + { + "epoch": 0.7176, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.7267, + "step": 3588 + }, + { + "epoch": 0.718, + "learning_rate": 1.9980267284282718e-05, + "loss": 0.6691, + "step": 3590 + }, + { + "epoch": 0.7184, + "learning_rate": 1.9979380835002846e-05, + "loss": 1.1764, + "step": 3592 + }, + { + "epoch": 0.7188, + "learning_rate": 1.9978474930409396e-05, + "loss": 0.9693, + "step": 3594 + }, + { + "epoch": 0.7192, + "learning_rate": 1.9977549572268467e-05, + "loss": 0.3634, + "step": 3596 + }, + { + "epoch": 0.7196, + "learning_rate": 1.99766047623841e-05, + "loss": 0.2632, + "step": 3598 + }, + { + "epoch": 0.72, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.4736, + "step": 3600 + }, + { + "epoch": 0.7204, + "learning_rate": 1.9974656794790777e-05, + "loss": 0.2326, + "step": 3602 + }, + { + "epoch": 0.7208, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.4168, + "step": 3604 + }, + { + "epoch": 0.7212, + "learning_rate": 1.9972631042820074e-05, + "loss": 0.2944, + "step": 3606 + }, + { + "epoch": 0.7216, + "learning_rate": 1.997158900260614e-05, + "loss": 0.264, + "step": 3608 + }, + { + "epoch": 0.722, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.2096, + "step": 3610 + }, + { + "epoch": 0.7224, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.1529, + "step": 3612 + }, + { + "epoch": 0.7228, + "learning_rate": 1.9968346249541848e-05, + "loss": 0.37, + "step": 3614 + }, + { + "epoch": 0.7232, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.3915, + "step": 3616 + }, + { + "epoch": 0.7236, + "learning_rate": 1.996608724164801e-05, + "loss": 1.1172, + "step": 3618 + }, + { + "epoch": 0.724, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.1966, + "step": 3620 + }, + { + "epoch": 0.7244, + "learning_rate": 1.9963750516203887e-05, + "loss": 0.0913, + "step": 3622 + }, + { + "epoch": 0.7248, + "learning_rate": 1.996255301507125e-05, + "loss": 0.365, + "step": 3624 + }, + { + "epoch": 0.7252, + "learning_rate": 1.9961336091431728e-05, + "loss": 0.256, + "step": 3626 + }, + { + "epoch": 0.7256, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.3708, + "step": 3628 + }, + { + "epoch": 0.726, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.6788, + "step": 3630 + }, + { + "epoch": 0.7264, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.4347, + "step": 3632 + }, + { + "epoch": 0.7268, + "learning_rate": 1.995627421982176e-05, + "loss": 0.4676, + "step": 3634 + }, + { + "epoch": 0.7272, + "learning_rate": 1.995496021999177e-05, + "loss": 0.5483, + "step": 3636 + }, + { + "epoch": 0.7276, + "learning_rate": 1.995362681245744e-05, + "loss": 0.6501, + "step": 3638 + }, + { + "epoch": 0.728, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.3121, + "step": 3640 + }, + { + "epoch": 0.7284, + "learning_rate": 1.9950901784711768e-05, + "loss": 0.6384, + "step": 3642 + }, + { + "epoch": 0.7288, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.2575, + "step": 3644 + }, + { + "epoch": 0.7292, + "learning_rate": 1.994809915783505e-05, + "loss": 0.2528, + "step": 3646 + }, + { + "epoch": 0.7296, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.4282, + "step": 3648 + }, + { + "epoch": 0.73, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.1945, + "step": 3650 + }, + { + "epoch": 0.7304, + "learning_rate": 1.994374976712348e-05, + "loss": 0.5285, + "step": 3652 + }, + { + "epoch": 0.7308, + "learning_rate": 1.9942261194715236e-05, + "loss": 0.3788, + "step": 3654 + }, + { + "epoch": 0.7312, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.2462, + "step": 3656 + }, + { + "epoch": 0.7316, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.4954, + "step": 3658 + }, + { + "epoch": 0.732, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.8218, + "step": 3660 + }, + { + "epoch": 0.7324, + "learning_rate": 1.993611310520009e-05, + "loss": 0.2381, + "step": 3662 + }, + { + "epoch": 0.7328, + "learning_rate": 1.993452764783328e-05, + "loss": 0.1129, + "step": 3664 + }, + { + "epoch": 0.7332, + "learning_rate": 1.993292282259647e-05, + "loss": 0.3222, + "step": 3666 + }, + { + "epoch": 0.7336, + "learning_rate": 1.9931298632618352e-05, + "loss": 0.3178, + "step": 3668 + }, + { + "epoch": 0.734, + "learning_rate": 1.9929655081065373e-05, + "loss": 0.447, + "step": 3670 + }, + { + "epoch": 0.7344, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.0943, + "step": 3672 + }, + { + "epoch": 0.7348, + "learning_rate": 1.992630990608929e-05, + "loss": 0.3269, + "step": 3674 + }, + { + "epoch": 0.7352, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.8699, + "step": 3676 + }, + { + "epoch": 0.7356, + "learning_rate": 1.992288732375458e-05, + "loss": 0.3081, + "step": 3678 + }, + { + "epoch": 0.736, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.1423, + "step": 3680 + }, + { + "epoch": 0.7364, + "learning_rate": 1.9919387360751216e-05, + "loss": 0.5361, + "step": 3682 + }, + { + "epoch": 0.7368, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.5461, + "step": 3684 + }, + { + "epoch": 0.7372, + "learning_rate": 1.991581004437262e-05, + "loss": 0.3078, + "step": 3686 + }, + { + "epoch": 0.7376, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.0921, + "step": 3688 + }, + { + "epoch": 0.738, + "learning_rate": 1.991215540251542e-05, + "loss": 0.6288, + "step": 3690 + }, + { + "epoch": 0.7384, + "learning_rate": 1.9910299093414932e-05, + "loss": 0.2791, + "step": 3692 + }, + { + "epoch": 0.7388, + "learning_rate": 1.9908423463679246e-05, + "loss": 0.2175, + "step": 3694 + }, + { + "epoch": 0.7392, + "learning_rate": 1.990652851696501e-05, + "loss": 0.5877, + "step": 3696 + }, + { + "epoch": 0.7396, + "learning_rate": 1.9904614256966517e-05, + "loss": 0.1377, + "step": 3698 + }, + { + "epoch": 0.74, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.4203, + "step": 3700 + }, + { + "epoch": 0.7404, + "learning_rate": 1.9900727812082177e-05, + "loss": 0.1686, + "step": 3702 + }, + { + "epoch": 0.7408, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.205, + "step": 3704 + }, + { + "epoch": 0.7412, + "learning_rate": 1.9896764159333518e-05, + "loss": 0.6709, + "step": 3706 + }, + { + "epoch": 0.7416, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.433, + "step": 3708 + }, + { + "epoch": 0.742, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.2659, + "step": 3710 + }, + { + "epoch": 0.7424, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.2952, + "step": 3712 + }, + { + "epoch": 0.7428, + "learning_rate": 1.9888605354482494e-05, + "loss": 0.2775, + "step": 3714 + }, + { + "epoch": 0.7432, + "learning_rate": 1.9886517447379143e-05, + "loss": 0.4384, + "step": 3716 + }, + { + "epoch": 0.7436, + "learning_rate": 1.9884410266004134e-05, + "loss": 0.2011, + "step": 3718 + }, + { + "epoch": 0.744, + "learning_rate": 1.988228381446553e-05, + "loss": 0.2599, + "step": 3720 + }, + { + "epoch": 0.7444, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.2791, + "step": 3722 + }, + { + "epoch": 0.7448, + "learning_rate": 1.987797311751759e-05, + "loss": 0.403, + "step": 3724 + }, + { + "epoch": 0.7452, + "learning_rate": 1.9875788880512183e-05, + "loss": 0.6661, + "step": 3726 + }, + { + "epoch": 0.7456, + "learning_rate": 1.9873585390151007e-05, + "loss": 0.5803, + "step": 3728 + }, + { + "epoch": 0.746, + "learning_rate": 1.987136265072988e-05, + "loss": 0.2354, + "step": 3730 + }, + { + "epoch": 0.7464, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.5197, + "step": 3732 + }, + { + "epoch": 0.7468, + "learning_rate": 1.9866859442078685e-05, + "loss": 0.3423, + "step": 3734 + }, + { + "epoch": 0.7472, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.1575, + "step": 3736 + }, + { + "epoch": 0.7476, + "learning_rate": 1.986227928967551e-05, + "loss": 0.5374, + "step": 3738 + }, + { + "epoch": 0.748, + "learning_rate": 1.985996037070505e-05, + "loss": 0.19, + "step": 3740 + }, + { + "epoch": 0.7484, + "learning_rate": 1.985762222923732e-05, + "loss": 0.4784, + "step": 3742 + }, + { + "epoch": 0.7488, + "learning_rate": 1.985526486983063e-05, + "loss": 0.5871, + "step": 3744 + }, + { + "epoch": 0.7492, + "learning_rate": 1.985288829708079e-05, + "loss": 0.4987, + "step": 3746 + }, + { + "epoch": 0.7496, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.4662, + "step": 3748 + }, + { + "epoch": 0.75, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.5098, + "step": 3750 + }, + { + "epoch": 0.7504, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.6671, + "step": 3752 + }, + { + "epoch": 0.7508, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.1353, + "step": 3754 + }, + { + "epoch": 0.7512, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.3356, + "step": 3756 + }, + { + "epoch": 0.7516, + "learning_rate": 1.983822564245833e-05, + "loss": 0.4787, + "step": 3758 + }, + { + "epoch": 0.752, + "learning_rate": 1.983571470813386e-05, + "loss": 0.6368, + "step": 3760 + }, + { + "epoch": 0.7524, + "learning_rate": 1.9833184598580276e-05, + "loss": 0.7173, + "step": 3762 + }, + { + "epoch": 0.7528, + "learning_rate": 1.983063531873016e-05, + "loss": 0.3104, + "step": 3764 + }, + { + "epoch": 0.7532, + "learning_rate": 1.982806687355345e-05, + "loss": 0.1855, + "step": 3766 + }, + { + "epoch": 0.7536, + "learning_rate": 1.982547926805747e-05, + "loss": 0.5287, + "step": 3768 + }, + { + "epoch": 0.754, + "learning_rate": 1.982287250728689e-05, + "loss": 0.1261, + "step": 3770 + }, + { + "epoch": 0.7544, + "learning_rate": 1.9820246596323724e-05, + "loss": 0.474, + "step": 3772 + }, + { + "epoch": 0.7548, + "learning_rate": 1.981760154028731e-05, + "loss": 0.6094, + "step": 3774 + }, + { + "epoch": 0.7552, + "learning_rate": 1.981493734433433e-05, + "loss": 0.3466, + "step": 3776 + }, + { + "epoch": 0.7556, + "learning_rate": 1.9812254013658773e-05, + "loss": 0.3143, + "step": 3778 + }, + { + "epoch": 0.756, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.7478, + "step": 3780 + }, + { + "epoch": 0.7564, + "learning_rate": 1.9806829969102356e-05, + "loss": 0.4192, + "step": 3782 + }, + { + "epoch": 0.7568, + "learning_rate": 1.9804089265795963e-05, + "loss": 0.3517, + "step": 3784 + }, + { + "epoch": 0.7572, + "learning_rate": 1.9801329448915863e-05, + "loss": 0.4283, + "step": 3786 + }, + { + "epoch": 0.7576, + "learning_rate": 1.979855052384247e-05, + "loss": 0.5023, + "step": 3788 + }, + { + "epoch": 0.758, + "learning_rate": 1.979575249599344e-05, + "loss": 0.6817, + "step": 3790 + }, + { + "epoch": 0.7584, + "learning_rate": 1.979293537082368e-05, + "loss": 0.5462, + "step": 3792 + }, + { + "epoch": 0.7588, + "learning_rate": 1.9790099153825303e-05, + "loss": 0.6389, + "step": 3794 + }, + { + "epoch": 0.7592, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.4307, + "step": 3796 + }, + { + "epoch": 0.7596, + "learning_rate": 1.978436946649733e-05, + "loss": 0.5304, + "step": 3798 + }, + { + "epoch": 0.76, + "learning_rate": 1.978147600733806e-05, + "loss": 0.4771, + "step": 3800 + }, + { + "epoch": 0.7604, + "learning_rate": 1.9778563478690793e-05, + "loss": 0.4063, + "step": 3802 + }, + { + "epoch": 0.7608, + "learning_rate": 1.977563188623365e-05, + "loss": 0.2103, + "step": 3804 + }, + { + "epoch": 0.7612, + "learning_rate": 1.977268123568194e-05, + "loss": 0.4816, + "step": 3806 + }, + { + "epoch": 0.7616, + "learning_rate": 1.9769711532788086e-05, + "loss": 0.4608, + "step": 3808 + }, + { + "epoch": 0.762, + "learning_rate": 1.9766722783341682e-05, + "loss": 0.7588, + "step": 3810 + }, + { + "epoch": 0.7624, + "learning_rate": 1.9763714993169448e-05, + "loss": 0.3087, + "step": 3812 + }, + { + "epoch": 0.7628, + "learning_rate": 1.9760688168135236e-05, + "loss": 0.5229, + "step": 3814 + }, + { + "epoch": 0.7632, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.869, + "step": 3816 + }, + { + "epoch": 0.7636, + "learning_rate": 1.9754577437121733e-05, + "loss": 0.693, + "step": 3818 + }, + { + "epoch": 0.764, + "learning_rate": 1.9751493543055638e-05, + "loss": 0.2707, + "step": 3820 + }, + { + "epoch": 0.7644, + "learning_rate": 1.974839063795389e-05, + "loss": 0.2564, + "step": 3822 + }, + { + "epoch": 0.7648, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.2223, + "step": 3824 + }, + { + "epoch": 0.7652, + "learning_rate": 1.97421278188776e-05, + "loss": 0.6009, + "step": 3826 + }, + { + "epoch": 0.7656, + "learning_rate": 1.973896791711276e-05, + "loss": 0.3943, + "step": 3828 + }, + { + "epoch": 0.766, + "learning_rate": 1.9735789028731607e-05, + "loss": 0.2591, + "step": 3830 + }, + { + "epoch": 0.7664, + "learning_rate": 1.9732591159931567e-05, + "loss": 0.7535, + "step": 3832 + }, + { + "epoch": 0.7668, + "learning_rate": 1.9729374316947037e-05, + "loss": 0.3732, + "step": 3834 + }, + { + "epoch": 0.7672, + "learning_rate": 1.972613850604944e-05, + "loss": 0.2653, + "step": 3836 + }, + { + "epoch": 0.7676, + "learning_rate": 1.972288373354713e-05, + "loss": 0.337, + "step": 3838 + }, + { + "epoch": 0.768, + "learning_rate": 1.9719610005785463e-05, + "loss": 0.515, + "step": 3840 + }, + { + "epoch": 0.7684, + "learning_rate": 1.9716317329146743e-05, + "loss": 0.3201, + "step": 3842 + }, + { + "epoch": 0.7688, + "learning_rate": 1.9713005710050206e-05, + "loss": 0.4186, + "step": 3844 + }, + { + "epoch": 0.7692, + "learning_rate": 1.9709675154952017e-05, + "loss": 0.4841, + "step": 3846 + }, + { + "epoch": 0.7696, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.2801, + "step": 3848 + }, + { + "epoch": 0.77, + "learning_rate": 1.970295726275997e-05, + "loss": 0.1944, + "step": 3850 + }, + { + "epoch": 0.7704, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.2437, + "step": 3852 + }, + { + "epoch": 0.7708, + "learning_rate": 1.969616370495806e-05, + "loss": 0.3189, + "step": 3854 + }, + { + "epoch": 0.7712, + "learning_rate": 1.969273856798586e-05, + "loss": 0.4711, + "step": 3856 + }, + { + "epoch": 0.7716, + "learning_rate": 1.9689294534523836e-05, + "loss": 0.4434, + "step": 3858 + }, + { + "epoch": 0.772, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.0725, + "step": 3860 + }, + { + "epoch": 0.7724, + "learning_rate": 1.9682349805024447e-05, + "loss": 0.0958, + "step": 3862 + }, + { + "epoch": 0.7728, + "learning_rate": 1.9678849122526195e-05, + "loss": 0.7279, + "step": 3864 + }, + { + "epoch": 0.7732, + "learning_rate": 1.9675329570616302e-05, + "loss": 0.1513, + "step": 3866 + }, + { + "epoch": 0.7736, + "learning_rate": 1.967179115615633e-05, + "loss": 0.8891, + "step": 3868 + }, + { + "epoch": 0.774, + "learning_rate": 1.966823388604459e-05, + "loss": 0.4914, + "step": 3870 + }, + { + "epoch": 0.7744, + "learning_rate": 1.966465776721618e-05, + "loss": 1.0113, + "step": 3872 + }, + { + "epoch": 0.7748, + "learning_rate": 1.9661062806642906e-05, + "loss": 0.1897, + "step": 3874 + }, + { + "epoch": 0.7752, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.8487, + "step": 3876 + }, + { + "epoch": 0.7756, + "learning_rate": 1.9653816388332743e-05, + "loss": 0.3197, + "step": 3878 + }, + { + "epoch": 0.776, + "learning_rate": 1.965016494472312e-05, + "loss": 0.3202, + "step": 3880 + }, + { + "epoch": 0.7764, + "learning_rate": 1.964649468762313e-05, + "loss": 0.6519, + "step": 3882 + }, + { + "epoch": 0.7768, + "learning_rate": 1.964280562418815e-05, + "loss": 0.462, + "step": 3884 + }, + { + "epoch": 0.7772, + "learning_rate": 1.963909776161018e-05, + "loss": 0.2097, + "step": 3886 + }, + { + "epoch": 0.7776, + "learning_rate": 1.963537110711789e-05, + "loss": 0.1438, + "step": 3888 + }, + { + "epoch": 0.778, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.2072, + "step": 3890 + }, + { + "epoch": 0.7784, + "learning_rate": 1.9627861451488194e-05, + "loss": 0.4208, + "step": 3892 + }, + { + "epoch": 0.7788, + "learning_rate": 1.9624078464991246e-05, + "loss": 0.2932, + "step": 3894 + }, + { + "epoch": 0.7792, + "learning_rate": 1.962027671586086e-05, + "loss": 0.3947, + "step": 3896 + }, + { + "epoch": 0.7796, + "learning_rate": 1.9616456211508756e-05, + "loss": 0.6846, + "step": 3898 + }, + { + "epoch": 0.78, + "learning_rate": 1.9612616959383194e-05, + "loss": 0.2149, + "step": 3900 + }, + { + "epoch": 0.7804, + "learning_rate": 1.9608758966968987e-05, + "loss": 0.3504, + "step": 3902 + }, + { + "epoch": 0.7808, + "learning_rate": 1.96048822417875e-05, + "loss": 0.8159, + "step": 3904 + }, + { + "epoch": 0.7812, + "learning_rate": 1.9600986791396597e-05, + "loss": 0.5325, + "step": 3906 + }, + { + "epoch": 0.7816, + "learning_rate": 1.9597072623390668e-05, + "loss": 1.0138, + "step": 3908 + }, + { + "epoch": 0.782, + "learning_rate": 1.9593139745400578e-05, + "loss": 0.1415, + "step": 3910 + }, + { + "epoch": 0.7824, + "learning_rate": 1.9589188165093666e-05, + "loss": 0.315, + "step": 3912 + }, + { + "epoch": 0.7828, + "learning_rate": 1.9585217890173765e-05, + "loss": 0.5427, + "step": 3914 + }, + { + "epoch": 0.7832, + "learning_rate": 1.95812289283811e-05, + "loss": 0.4662, + "step": 3916 + }, + { + "epoch": 0.7836, + "learning_rate": 1.957722128749237e-05, + "loss": 0.1832, + "step": 3918 + }, + { + "epoch": 0.784, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.4523, + "step": 3920 + }, + { + "epoch": 0.7844, + "learning_rate": 1.9569149999715518e-05, + "loss": 0.268, + "step": 3922 + }, + { + "epoch": 0.7848, + "learning_rate": 1.9565086368562784e-05, + "loss": 0.6059, + "step": 3924 + }, + { + "epoch": 0.7852, + "learning_rate": 1.9561004089784722e-05, + "loss": 0.2079, + "step": 3926 + }, + { + "epoch": 0.7856, + "learning_rate": 1.9556903171339966e-05, + "loss": 0.2928, + "step": 3928 + }, + { + "epoch": 0.786, + "learning_rate": 1.955278362122344e-05, + "loss": 0.4607, + "step": 3930 + }, + { + "epoch": 0.7864, + "learning_rate": 1.954864544746643e-05, + "loss": 0.4708, + "step": 3932 + }, + { + "epoch": 0.7868, + "learning_rate": 1.954448865813652e-05, + "loss": 0.1758, + "step": 3934 + }, + { + "epoch": 0.7872, + "learning_rate": 1.9540313261337585e-05, + "loss": 0.2164, + "step": 3936 + }, + { + "epoch": 0.7876, + "learning_rate": 1.9536119265209763e-05, + "loss": 0.3027, + "step": 3938 + }, + { + "epoch": 0.788, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.5666, + "step": 3940 + }, + { + "epoch": 0.7884, + "learning_rate": 1.9527675507709364e-05, + "loss": 0.3472, + "step": 3942 + }, + { + "epoch": 0.7888, + "learning_rate": 1.9523425762798335e-05, + "loss": 0.1398, + "step": 3944 + }, + { + "epoch": 0.7892, + "learning_rate": 1.9519157451481456e-05, + "loss": 0.2788, + "step": 3946 + }, + { + "epoch": 0.7896, + "learning_rate": 1.9514870582080035e-05, + "loss": 0.1883, + "step": 3948 + }, + { + "epoch": 0.79, + "learning_rate": 1.9510565162951545e-05, + "loss": 0.265, + "step": 3950 + }, + { + "epoch": 0.7904, + "learning_rate": 1.95062412024896e-05, + "loss": 0.439, + "step": 3952 + }, + { + "epoch": 0.7908, + "learning_rate": 1.950189870912401e-05, + "loss": 0.1476, + "step": 3954 + }, + { + "epoch": 0.7912, + "learning_rate": 1.9497537691320667e-05, + "loss": 0.1598, + "step": 3956 + }, + { + "epoch": 0.7916, + "learning_rate": 1.9493158157581617e-05, + "loss": 0.245, + "step": 3958 + }, + { + "epoch": 0.792, + "learning_rate": 1.948876011644497e-05, + "loss": 0.5239, + "step": 3960 + }, + { + "epoch": 0.7924, + "learning_rate": 1.948434357648493e-05, + "loss": 0.1886, + "step": 3962 + }, + { + "epoch": 0.7928, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.3543, + "step": 3964 + }, + { + "epoch": 0.7932, + "learning_rate": 1.9475455034571843e-05, + "loss": 0.1824, + "step": 3966 + }, + { + "epoch": 0.7936, + "learning_rate": 1.9470983049947443e-05, + "loss": 1.2475, + "step": 3968 + }, + { + "epoch": 0.794, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.6929, + "step": 3970 + }, + { + "epoch": 0.7944, + "learning_rate": 1.9461983696954767e-05, + "loss": 0.1836, + "step": 3972 + }, + { + "epoch": 0.7948, + "learning_rate": 1.9457456346131175e-05, + "loss": 0.5604, + "step": 3974 + }, + { + "epoch": 0.7952, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.6166, + "step": 3976 + }, + { + "epoch": 0.7956, + "learning_rate": 1.9448346339960984e-05, + "loss": 0.1246, + "step": 3978 + }, + { + "epoch": 0.796, + "learning_rate": 1.9443763702374818e-05, + "loss": 0.362, + "step": 3980 + }, + { + "epoch": 0.7964, + "learning_rate": 1.9439162653688066e-05, + "loss": 0.5638, + "step": 3982 + }, + { + "epoch": 0.7968, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.4297, + "step": 3984 + }, + { + "epoch": 0.7972, + "learning_rate": 1.9429905358928655e-05, + "loss": 0.7007, + "step": 3986 + }, + { + "epoch": 0.7976, + "learning_rate": 1.9425249130903544e-05, + "loss": 1.2002, + "step": 3988 + }, + { + "epoch": 0.798, + "learning_rate": 1.942057452787297e-05, + "loss": 0.7052, + "step": 3990 + }, + { + "epoch": 0.7984, + "learning_rate": 1.94158815589503e-05, + "loss": 0.4422, + "step": 3992 + }, + { + "epoch": 0.7988, + "learning_rate": 1.941117023328473e-05, + "loss": 0.4989, + "step": 3994 + }, + { + "epoch": 0.7992, + "learning_rate": 1.940644056006122e-05, + "loss": 0.1465, + "step": 3996 + }, + { + "epoch": 0.7996, + "learning_rate": 1.94016925485005e-05, + "loss": 0.2389, + "step": 3998 + }, + { + "epoch": 0.8, + "learning_rate": 1.939692620785909e-05, + "loss": 0.2036, + "step": 4000 + }, + { + "epoch": 0.8004, + "learning_rate": 1.939214154742919e-05, + "loss": 0.1124, + "step": 4002 + }, + { + "epoch": 0.8008, + "learning_rate": 1.9387338576538746e-05, + "loss": 0.3612, + "step": 4004 + }, + { + "epoch": 0.8012, + "learning_rate": 1.9382517304551393e-05, + "loss": 0.7357, + "step": 4006 + }, + { + "epoch": 0.8016, + "learning_rate": 1.9377677740866464e-05, + "loss": 0.1286, + "step": 4008 + }, + { + "epoch": 0.802, + "learning_rate": 1.9372819894918922e-05, + "loss": 1.0977, + "step": 4010 + }, + { + "epoch": 0.8024, + "learning_rate": 1.936794377617938e-05, + "loss": 0.2182, + "step": 4012 + }, + { + "epoch": 0.8028, + "learning_rate": 1.9363049394154102e-05, + "loss": 0.1257, + "step": 4014 + }, + { + "epoch": 0.8032, + "learning_rate": 1.9358136758384917e-05, + "loss": 0.3986, + "step": 4016 + }, + { + "epoch": 0.8036, + "learning_rate": 1.935320587844926e-05, + "loss": 0.642, + "step": 4018 + }, + { + "epoch": 0.804, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.2709, + "step": 4020 + }, + { + "epoch": 0.8044, + "learning_rate": 1.934328942456613e-05, + "loss": 0.1656, + "step": 4022 + }, + { + "epoch": 0.8048, + "learning_rate": 1.9338303869951273e-05, + "loss": 0.3589, + "step": 4024 + }, + { + "epoch": 0.8052, + "learning_rate": 1.9333300109835186e-05, + "loss": 0.5024, + "step": 4026 + }, + { + "epoch": 0.8056, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.3645, + "step": 4028 + }, + { + "epoch": 0.806, + "learning_rate": 1.9323238012155125e-05, + "loss": 0.4799, + "step": 4030 + }, + { + "epoch": 0.8064, + "learning_rate": 1.931817969420773e-05, + "loss": 0.2426, + "step": 4032 + }, + { + "epoch": 0.8068, + "learning_rate": 1.93131032099922e-05, + "loss": 0.3426, + "step": 4034 + }, + { + "epoch": 0.8072, + "learning_rate": 1.930800856940543e-05, + "loss": 0.1273, + "step": 4036 + }, + { + "epoch": 0.8076, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.432, + "step": 4038 + }, + { + "epoch": 0.808, + "learning_rate": 1.929776485888252e-05, + "loss": 0.1161, + "step": 4040 + }, + { + "epoch": 0.8084, + "learning_rate": 1.9292615808917024e-05, + "loss": 0.416, + "step": 4042 + }, + { + "epoch": 0.8088, + "learning_rate": 1.9287448642521517e-05, + "loss": 0.2371, + "step": 4044 + }, + { + "epoch": 0.8092, + "learning_rate": 1.9282263369769637e-05, + "loss": 0.516, + "step": 4046 + }, + { + "epoch": 0.8096, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.6714, + "step": 4048 + }, + { + "epoch": 0.81, + "learning_rate": 1.927183854566788e-05, + "loss": 0.2919, + "step": 4050 + }, + { + "epoch": 0.8104, + "learning_rate": 1.9266599014641727e-05, + "loss": 0.066, + "step": 4052 + }, + { + "epoch": 0.8108, + "learning_rate": 1.9261341417906622e-05, + "loss": 0.4084, + "step": 4054 + }, + { + "epoch": 0.8112, + "learning_rate": 1.925606576571252e-05, + "loss": 0.4662, + "step": 4056 + }, + { + "epoch": 0.8116, + "learning_rate": 1.925077206834459e-05, + "loss": 0.268, + "step": 4058 + }, + { + "epoch": 0.812, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.1871, + "step": 4060 + }, + { + "epoch": 0.8124, + "learning_rate": 1.924013057940367e-05, + "loss": 0.3816, + "step": 4062 + }, + { + "epoch": 0.8128, + "learning_rate": 1.923478280857682e-05, + "loss": 0.3537, + "step": 4064 + }, + { + "epoch": 0.8132, + "learning_rate": 1.922941703406836e-05, + "loss": 0.4385, + "step": 4066 + }, + { + "epoch": 0.8136, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.1177, + "step": 4068 + }, + { + "epoch": 0.814, + "learning_rate": 1.9218631515885007e-05, + "loss": 0.3492, + "step": 4070 + }, + { + "epoch": 0.8144, + "learning_rate": 1.9213211793237066e-05, + "loss": 0.2725, + "step": 4072 + }, + { + "epoch": 0.8148, + "learning_rate": 1.9207774108961276e-05, + "loss": 0.1472, + "step": 4074 + }, + { + "epoch": 0.8152, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.1998, + "step": 4076 + }, + { + "epoch": 0.8156, + "learning_rate": 1.9196844897965387e-05, + "loss": 0.2372, + "step": 4078 + }, + { + "epoch": 0.816, + "learning_rate": 1.919135339255235e-05, + "loss": 0.194, + "step": 4080 + }, + { + "epoch": 0.8164, + "learning_rate": 1.9185843968125546e-05, + "loss": 0.2637, + "step": 4082 + }, + { + "epoch": 0.8168, + "learning_rate": 1.918031663542588e-05, + "loss": 0.7861, + "step": 4084 + }, + { + "epoch": 0.8172, + "learning_rate": 1.917477140522919e-05, + "loss": 0.5257, + "step": 4086 + }, + { + "epoch": 0.8176, + "learning_rate": 1.916920828834617e-05, + "loss": 0.3554, + "step": 4088 + }, + { + "epoch": 0.818, + "learning_rate": 1.9163627295622394e-05, + "loss": 0.1994, + "step": 4090 + }, + { + "epoch": 0.8184, + "learning_rate": 1.9158028437938313e-05, + "loss": 0.6129, + "step": 4092 + }, + { + "epoch": 0.8188, + "learning_rate": 1.9152411726209183e-05, + "loss": 0.5995, + "step": 4094 + }, + { + "epoch": 0.8192, + "learning_rate": 1.9146777171385057e-05, + "loss": 0.3436, + "step": 4096 + }, + { + "epoch": 0.8196, + "learning_rate": 1.914112478445079e-05, + "loss": 0.6085, + "step": 4098 + }, + { + "epoch": 0.82, + "learning_rate": 1.913545457642601e-05, + "loss": 0.2151, + "step": 4100 + }, + { + "epoch": 0.8204, + "learning_rate": 1.9129766558365082e-05, + "loss": 0.368, + "step": 4102 + }, + { + "epoch": 0.8208, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.3586, + "step": 4104 + }, + { + "epoch": 0.8212, + "learning_rate": 1.911833713652576e-05, + "loss": 0.7349, + "step": 4106 + }, + { + "epoch": 0.8216, + "learning_rate": 1.911259575502963e-05, + "loss": 0.6216, + "step": 4108 + }, + { + "epoch": 0.822, + "learning_rate": 1.9106836608061775e-05, + "loss": 0.7198, + "step": 4110 + }, + { + "epoch": 0.8224, + "learning_rate": 1.910105970684996e-05, + "loss": 0.5756, + "step": 4112 + }, + { + "epoch": 0.8228, + "learning_rate": 1.909526506265654e-05, + "loss": 0.3778, + "step": 4114 + }, + { + "epoch": 0.8232, + "learning_rate": 1.908945268677849e-05, + "loss": 0.2778, + "step": 4116 + }, + { + "epoch": 0.8236, + "learning_rate": 1.9083622590547313e-05, + "loss": 0.2248, + "step": 4118 + }, + { + "epoch": 0.824, + "learning_rate": 1.9077774785329085e-05, + "loss": 0.6537, + "step": 4120 + }, + { + "epoch": 0.8244, + "learning_rate": 1.9071909282524422e-05, + "loss": 0.5248, + "step": 4122 + }, + { + "epoch": 0.8248, + "learning_rate": 1.9066026093568383e-05, + "loss": 0.4543, + "step": 4124 + }, + { + "epoch": 0.8252, + "learning_rate": 1.9060125229930576e-05, + "loss": 0.2578, + "step": 4126 + }, + { + "epoch": 0.8256, + "learning_rate": 1.9054206703115013e-05, + "loss": 0.5293, + "step": 4128 + }, + { + "epoch": 0.826, + "learning_rate": 1.9048270524660203e-05, + "loss": 0.1624, + "step": 4130 + }, + { + "epoch": 0.8264, + "learning_rate": 1.9042316706138994e-05, + "loss": 0.2403, + "step": 4132 + }, + { + "epoch": 0.8268, + "learning_rate": 1.9036345259158664e-05, + "loss": 0.188, + "step": 4134 + }, + { + "epoch": 0.8272, + "learning_rate": 1.903035619536087e-05, + "loss": 0.3277, + "step": 4136 + }, + { + "epoch": 0.8276, + "learning_rate": 1.9024349526421603e-05, + "loss": 0.3697, + "step": 4138 + }, + { + "epoch": 0.828, + "learning_rate": 1.901832526405114e-05, + "loss": 0.6074, + "step": 4140 + }, + { + "epoch": 0.8284, + "learning_rate": 1.9012283419994112e-05, + "loss": 0.5585, + "step": 4142 + }, + { + "epoch": 0.8288, + "learning_rate": 1.9006224006029414e-05, + "loss": 0.2183, + "step": 4144 + }, + { + "epoch": 0.8292, + "learning_rate": 1.9000147033970148e-05, + "loss": 0.1998, + "step": 4146 + }, + { + "epoch": 0.8296, + "learning_rate": 1.899405251566371e-05, + "loss": 0.4712, + "step": 4148 + }, + { + "epoch": 0.83, + "learning_rate": 1.8987940462991666e-05, + "loss": 0.7746, + "step": 4150 + }, + { + "epoch": 0.8304, + "learning_rate": 1.8981810887869797e-05, + "loss": 0.7434, + "step": 4152 + }, + { + "epoch": 0.8308, + "learning_rate": 1.8975663802247978e-05, + "loss": 0.3145, + "step": 4154 + }, + { + "epoch": 0.8312, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.3599, + "step": 4156 + }, + { + "epoch": 0.8316, + "learning_rate": 1.8963317147474943e-05, + "loss": 0.3584, + "step": 4158 + }, + { + "epoch": 0.832, + "learning_rate": 1.8957117602394133e-05, + "loss": 0.3001, + "step": 4160 + }, + { + "epoch": 0.8324, + "learning_rate": 1.8950900594954233e-05, + "loss": 0.4026, + "step": 4162 + }, + { + "epoch": 0.8328, + "learning_rate": 1.8944666137275596e-05, + "loss": 0.4066, + "step": 4164 + }, + { + "epoch": 0.8332, + "learning_rate": 1.8938414241512644e-05, + "loss": 0.4274, + "step": 4166 + }, + { + "epoch": 0.8336, + "learning_rate": 1.8932144919853744e-05, + "loss": 0.1486, + "step": 4168 + }, + { + "epoch": 0.834, + "learning_rate": 1.892585818452125e-05, + "loss": 0.1694, + "step": 4170 + }, + { + "epoch": 0.8344, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.4472, + "step": 4172 + }, + { + "epoch": 0.8348, + "learning_rate": 1.891323252189474e-05, + "loss": 0.4597, + "step": 4174 + }, + { + "epoch": 0.8352, + "learning_rate": 1.890689361921507e-05, + "loss": 0.2891, + "step": 4176 + }, + { + "epoch": 0.8356, + "learning_rate": 1.8900537352090523e-05, + "loss": 0.131, + "step": 4178 + }, + { + "epoch": 0.836, + "learning_rate": 1.8894163732912986e-05, + "loss": 0.7281, + "step": 4180 + }, + { + "epoch": 0.8364, + "learning_rate": 1.8887772774108122e-05, + "loss": 0.2622, + "step": 4182 + }, + { + "epoch": 0.8368, + "learning_rate": 1.8881364488135445e-05, + "loss": 0.7192, + "step": 4184 + }, + { + "epoch": 0.8372, + "learning_rate": 1.887493888748825e-05, + "loss": 0.3687, + "step": 4186 + }, + { + "epoch": 0.8376, + "learning_rate": 1.886849598469357e-05, + "loss": 0.5268, + "step": 4188 + }, + { + "epoch": 0.838, + "learning_rate": 1.886203579231215e-05, + "loss": 0.3431, + "step": 4190 + }, + { + "epoch": 0.8384, + "learning_rate": 1.8855558322938492e-05, + "loss": 1.0116, + "step": 4192 + }, + { + "epoch": 0.8388, + "learning_rate": 1.8849063589200754e-05, + "loss": 0.5597, + "step": 4194 + }, + { + "epoch": 0.8392, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.5993, + "step": 4196 + }, + { + "epoch": 0.8396, + "learning_rate": 1.8836022379313884e-05, + "loss": 0.3999, + "step": 4198 + }, + { + "epoch": 0.84, + "learning_rate": 1.8829475928589265e-05, + "loss": 0.2387, + "step": 4200 + }, + { + "epoch": 0.8404, + "learning_rate": 1.882291226434954e-05, + "loss": 0.4045, + "step": 4202 + }, + { + "epoch": 0.8408, + "learning_rate": 1.8816331399390874e-05, + "loss": 0.3815, + "step": 4204 + }, + { + "epoch": 0.8412, + "learning_rate": 1.880973334654301e-05, + "loss": 0.2008, + "step": 4206 + }, + { + "epoch": 0.8416, + "learning_rate": 1.88031181186692e-05, + "loss": 0.2996, + "step": 4208 + }, + { + "epoch": 0.842, + "learning_rate": 1.8796485728666172e-05, + "loss": 0.3003, + "step": 4210 + }, + { + "epoch": 0.8424, + "learning_rate": 1.8789836189464092e-05, + "loss": 0.0456, + "step": 4212 + }, + { + "epoch": 0.8428, + "learning_rate": 1.8783169514026574e-05, + "loss": 0.3398, + "step": 4214 + }, + { + "epoch": 0.8432, + "learning_rate": 1.877648571535068e-05, + "loss": 0.3103, + "step": 4216 + }, + { + "epoch": 0.8436, + "learning_rate": 1.8769784806466775e-05, + "loss": 0.5355, + "step": 4218 + }, + { + "epoch": 0.844, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.577, + "step": 4220 + }, + { + "epoch": 0.8444, + "learning_rate": 1.8756331710363375e-05, + "loss": 0.0266, + "step": 4222 + }, + { + "epoch": 0.8448, + "learning_rate": 1.8749579549371387e-05, + "loss": 0.7541, + "step": 4224 + }, + { + "epoch": 0.8452, + "learning_rate": 1.8742810330626338e-05, + "loss": 0.3709, + "step": 4226 + }, + { + "epoch": 0.8456, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.0728, + "step": 4228 + }, + { + "epoch": 0.846, + "learning_rate": 1.8729220772698106e-05, + "loss": 0.1955, + "step": 4230 + }, + { + "epoch": 0.8464, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.1702, + "step": 4232 + }, + { + "epoch": 0.8468, + "learning_rate": 1.8715563142552758e-05, + "loss": 0.2511, + "step": 4234 + }, + { + "epoch": 0.8472, + "learning_rate": 1.8708708833660748e-05, + "loss": 0.5291, + "step": 4236 + }, + { + "epoch": 0.8476, + "learning_rate": 1.870183754669526e-05, + "loss": 0.5172, + "step": 4238 + }, + { + "epoch": 0.848, + "learning_rate": 1.8694949295052198e-05, + "loss": 0.3539, + "step": 4240 + }, + { + "epoch": 0.8484, + "learning_rate": 1.8688044092160558e-05, + "loss": 0.2288, + "step": 4242 + }, + { + "epoch": 0.8488, + "learning_rate": 1.868112195148239e-05, + "loss": 0.1428, + "step": 4244 + }, + { + "epoch": 0.8492, + "learning_rate": 1.867418288651278e-05, + "loss": 0.259, + "step": 4246 + }, + { + "epoch": 0.8496, + "learning_rate": 1.866722691077977e-05, + "loss": 0.7427, + "step": 4248 + }, + { + "epoch": 0.85, + "learning_rate": 1.8660254037844384e-05, + "loss": 0.2486, + "step": 4250 + }, + { + "epoch": 0.8504, + "learning_rate": 1.8653264281300626e-05, + "loss": 0.3036, + "step": 4252 + }, + { + "epoch": 0.8508, + "learning_rate": 1.8646257654775357e-05, + "loss": 0.0994, + "step": 4254 + }, + { + "epoch": 0.8512, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.3369, + "step": 4256 + }, + { + "epoch": 0.8516, + "learning_rate": 1.8632193846452267e-05, + "loss": 0.3736, + "step": 4258 + }, + { + "epoch": 0.852, + "learning_rate": 1.8625136692072587e-05, + "loss": 0.2933, + "step": 4260 + }, + { + "epoch": 0.8524, + "learning_rate": 1.861806272254755e-05, + "loss": 0.195, + "step": 4262 + }, + { + "epoch": 0.8528, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.1631, + "step": 4264 + }, + { + "epoch": 0.8532, + "learning_rate": 1.8603864393258547e-05, + "loss": 0.2164, + "step": 4266 + }, + { + "epoch": 0.8536, + "learning_rate": 1.8596740061174912e-05, + "loss": 0.3692, + "step": 4268 + }, + { + "epoch": 0.854, + "learning_rate": 1.8589598969306646e-05, + "loss": 0.6011, + "step": 4270 + }, + { + "epoch": 0.8544, + "learning_rate": 1.858244113157566e-05, + "loss": 0.166, + "step": 4272 + }, + { + "epoch": 0.8548, + "learning_rate": 1.8575266561936533e-05, + "loss": 0.2227, + "step": 4274 + }, + { + "epoch": 0.8552, + "learning_rate": 1.8568075274376432e-05, + "loss": 0.2524, + "step": 4276 + }, + { + "epoch": 0.8556, + "learning_rate": 1.8560867282915164e-05, + "loss": 0.3329, + "step": 4278 + }, + { + "epoch": 0.856, + "learning_rate": 1.8553642601605083e-05, + "loss": 0.7938, + "step": 4280 + }, + { + "epoch": 0.8564, + "learning_rate": 1.8546401244531034e-05, + "loss": 0.1912, + "step": 4282 + }, + { + "epoch": 0.8568, + "learning_rate": 1.8539143225810457e-05, + "loss": 0.1686, + "step": 4284 + }, + { + "epoch": 0.8572, + "learning_rate": 1.85318685595932e-05, + "loss": 0.2463, + "step": 4286 + }, + { + "epoch": 0.8576, + "learning_rate": 1.852457726006163e-05, + "loss": 0.2364, + "step": 4288 + }, + { + "epoch": 0.858, + "learning_rate": 1.8517269341430485e-05, + "loss": 0.3476, + "step": 4290 + }, + { + "epoch": 0.8584, + "learning_rate": 1.8509944817946917e-05, + "loss": 0.1867, + "step": 4292 + }, + { + "epoch": 0.8588, + "learning_rate": 1.8502603703890484e-05, + "loss": 0.1037, + "step": 4294 + }, + { + "epoch": 0.8592, + "learning_rate": 1.8495246013573064e-05, + "loss": 0.4552, + "step": 4296 + }, + { + "epoch": 0.8596, + "learning_rate": 1.8487871761338817e-05, + "loss": 0.2835, + "step": 4298 + }, + { + "epoch": 0.86, + "learning_rate": 1.848048096156426e-05, + "loss": 0.8144, + "step": 4300 + }, + { + "epoch": 0.8604, + "learning_rate": 1.847307362865813e-05, + "loss": 0.464, + "step": 4302 + }, + { + "epoch": 0.8608, + "learning_rate": 1.8465649777061387e-05, + "loss": 0.4534, + "step": 4304 + }, + { + "epoch": 0.8612, + "learning_rate": 1.8458209421247208e-05, + "loss": 0.3222, + "step": 4306 + }, + { + "epoch": 0.8616, + "learning_rate": 1.8450752575720967e-05, + "loss": 1.3721, + "step": 4308 + }, + { + "epoch": 0.862, + "learning_rate": 1.8443279255020163e-05, + "loss": 0.5144, + "step": 4310 + }, + { + "epoch": 0.8624, + "learning_rate": 1.843578947371439e-05, + "loss": 0.2906, + "step": 4312 + }, + { + "epoch": 0.8628, + "learning_rate": 1.842828324640539e-05, + "loss": 0.2659, + "step": 4314 + }, + { + "epoch": 0.8632, + "learning_rate": 1.8420760587726935e-05, + "loss": 0.1603, + "step": 4316 + }, + { + "epoch": 0.8636, + "learning_rate": 1.8413221512344808e-05, + "loss": 0.2822, + "step": 4318 + }, + { + "epoch": 0.864, + "learning_rate": 1.8405666034956846e-05, + "loss": 0.7518, + "step": 4320 + }, + { + "epoch": 0.8644, + "learning_rate": 1.8398094170292826e-05, + "loss": 0.254, + "step": 4322 + }, + { + "epoch": 0.8648, + "learning_rate": 1.8390505933114507e-05, + "loss": 0.7464, + "step": 4324 + }, + { + "epoch": 0.8652, + "learning_rate": 1.838290133821552e-05, + "loss": 0.2612, + "step": 4326 + }, + { + "epoch": 0.8656, + "learning_rate": 1.8375280400421414e-05, + "loss": 0.3864, + "step": 4328 + }, + { + "epoch": 0.866, + "learning_rate": 1.8367643134589613e-05, + "loss": 0.842, + "step": 4330 + }, + { + "epoch": 0.8664, + "learning_rate": 1.8359989555609365e-05, + "loss": 0.3539, + "step": 4332 + }, + { + "epoch": 0.8668, + "learning_rate": 1.835231967840168e-05, + "loss": 0.1371, + "step": 4334 + }, + { + "epoch": 0.8672, + "learning_rate": 1.834463351791939e-05, + "loss": 0.2737, + "step": 4336 + }, + { + "epoch": 0.8676, + "learning_rate": 1.8336931089147082e-05, + "loss": 0.3905, + "step": 4338 + }, + { + "epoch": 0.868, + "learning_rate": 1.8329212407101006e-05, + "loss": 0.2898, + "step": 4340 + }, + { + "epoch": 0.8684, + "learning_rate": 1.8321477486829128e-05, + "loss": 0.1239, + "step": 4342 + }, + { + "epoch": 0.8688, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.3536, + "step": 4344 + }, + { + "epoch": 0.8692, + "learning_rate": 1.8305958991958135e-05, + "loss": 0.1583, + "step": 4346 + }, + { + "epoch": 0.8696, + "learning_rate": 1.82981754476131e-05, + "loss": 0.3235, + "step": 4348 + }, + { + "epoch": 0.87, + "learning_rate": 1.8290375725550417e-05, + "loss": 0.3145, + "step": 4350 + }, + { + "epoch": 0.8704, + "learning_rate": 1.8282559840976053e-05, + "loss": 0.2681, + "step": 4352 + }, + { + "epoch": 0.8708, + "learning_rate": 1.827472780912744e-05, + "loss": 0.3063, + "step": 4354 + }, + { + "epoch": 0.8712, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.4603, + "step": 4356 + }, + { + "epoch": 0.8716, + "learning_rate": 1.825901536471478e-05, + "loss": 0.2483, + "step": 4358 + }, + { + "epoch": 0.872, + "learning_rate": 1.8251134982782966e-05, + "loss": 0.2126, + "step": 4360 + }, + { + "epoch": 0.8724, + "learning_rate": 1.824323851484126e-05, + "loss": 0.3628, + "step": 4362 + }, + { + "epoch": 0.8728, + "learning_rate": 1.823532597628428e-05, + "loss": 0.2192, + "step": 4364 + }, + { + "epoch": 0.8732, + "learning_rate": 1.8227397382537893e-05, + "loss": 0.2212, + "step": 4366 + }, + { + "epoch": 0.8736, + "learning_rate": 1.8219452749059336e-05, + "loss": 0.3073, + "step": 4368 + }, + { + "epoch": 0.874, + "learning_rate": 1.8211492091337048e-05, + "loss": 0.5134, + "step": 4370 + }, + { + "epoch": 0.8744, + "learning_rate": 1.8203515424890734e-05, + "loss": 0.308, + "step": 4372 + }, + { + "epoch": 0.8748, + "learning_rate": 1.8195522765271346e-05, + "loss": 0.5017, + "step": 4374 + }, + { + "epoch": 0.8752, + "learning_rate": 1.8187514128060956e-05, + "loss": 0.3202, + "step": 4376 + }, + { + "epoch": 0.8756, + "learning_rate": 1.8179489528872804e-05, + "loss": 0.216, + "step": 4378 + }, + { + "epoch": 0.876, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.1125, + "step": 4380 + }, + { + "epoch": 0.8764, + "learning_rate": 1.816339250717185e-05, + "loss": 0.3441, + "step": 4382 + }, + { + "epoch": 0.8768, + "learning_rate": 1.8155320116040983e-05, + "loss": 1.0369, + "step": 4384 + }, + { + "epoch": 0.8772, + "learning_rate": 1.814723182569625e-05, + "loss": 0.142, + "step": 4386 + }, + { + "epoch": 0.8776, + "learning_rate": 1.8139127651906193e-05, + "loss": 0.0946, + "step": 4388 + }, + { + "epoch": 0.878, + "learning_rate": 1.813100761047029e-05, + "loss": 0.128, + "step": 4390 + }, + { + "epoch": 0.8784, + "learning_rate": 1.8122871717218974e-05, + "loss": 0.1914, + "step": 4392 + }, + { + "epoch": 0.8788, + "learning_rate": 1.8114719988013612e-05, + "loss": 0.2878, + "step": 4394 + }, + { + "epoch": 0.8792, + "learning_rate": 1.8106552438746413e-05, + "loss": 0.6396, + "step": 4396 + }, + { + "epoch": 0.8796, + "learning_rate": 1.8098369085340404e-05, + "loss": 0.0462, + "step": 4398 + }, + { + "epoch": 0.88, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.3171, + "step": 4400 + }, + { + "epoch": 0.8804, + "learning_rate": 1.8081955029958265e-05, + "loss": 0.6735, + "step": 4402 + }, + { + "epoch": 0.8808, + "learning_rate": 1.807372435998219e-05, + "loss": 0.1648, + "step": 4404 + }, + { + "epoch": 0.8812, + "learning_rate": 1.806547794986733e-05, + "loss": 0.3631, + "step": 4406 + }, + { + "epoch": 0.8816, + "learning_rate": 1.8057215815690487e-05, + "loss": 0.4089, + "step": 4408 + }, + { + "epoch": 0.882, + "learning_rate": 1.8048937973559148e-05, + "loss": 0.1935, + "step": 4410 + }, + { + "epoch": 0.8824, + "learning_rate": 1.8040644439611355e-05, + "loss": 0.3471, + "step": 4412 + }, + { + "epoch": 0.8828, + "learning_rate": 1.8032335230015777e-05, + "loss": 0.2551, + "step": 4414 + }, + { + "epoch": 0.8832, + "learning_rate": 1.8024010360971665e-05, + "loss": 0.284, + "step": 4416 + }, + { + "epoch": 0.8836, + "learning_rate": 1.8015669848708774e-05, + "loss": 0.3208, + "step": 4418 + }, + { + "epoch": 0.884, + "learning_rate": 1.8007313709487345e-05, + "loss": 0.4543, + "step": 4420 + }, + { + "epoch": 0.8844, + "learning_rate": 1.7998941959598093e-05, + "loss": 0.1541, + "step": 4422 + }, + { + "epoch": 0.8848, + "learning_rate": 1.7990554615362207e-05, + "loss": 0.7097, + "step": 4424 + }, + { + "epoch": 0.8852, + "learning_rate": 1.7982151693131213e-05, + "loss": 0.4324, + "step": 4426 + }, + { + "epoch": 0.8856, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.7662, + "step": 4428 + }, + { + "epoch": 0.886, + "learning_rate": 1.7965299180241963e-05, + "loss": 0.4728, + "step": 4430 + }, + { + "epoch": 0.8864, + "learning_rate": 1.7956849622438568e-05, + "loss": 0.5772, + "step": 4432 + }, + { + "epoch": 0.8868, + "learning_rate": 1.794838455234966e-05, + "loss": 0.4076, + "step": 4434 + }, + { + "epoch": 0.8872, + "learning_rate": 1.7939903986478357e-05, + "loss": 0.5115, + "step": 4436 + }, + { + "epoch": 0.8876, + "learning_rate": 1.7931407941357938e-05, + "loss": 0.2346, + "step": 4438 + }, + { + "epoch": 0.888, + "learning_rate": 1.7922896433551913e-05, + "loss": 0.2064, + "step": 4440 + }, + { + "epoch": 0.8884, + "learning_rate": 1.7914369479653864e-05, + "loss": 0.3454, + "step": 4442 + }, + { + "epoch": 0.8888, + "learning_rate": 1.7905827096287525e-05, + "loss": 0.1642, + "step": 4444 + }, + { + "epoch": 0.8892, + "learning_rate": 1.7897269300106752e-05, + "loss": 0.5338, + "step": 4446 + }, + { + "epoch": 0.8896, + "learning_rate": 1.7888696107795347e-05, + "loss": 1.0457, + "step": 4448 + }, + { + "epoch": 0.89, + "learning_rate": 1.7880107536067228e-05, + "loss": 0.111, + "step": 4450 + }, + { + "epoch": 0.8904, + "learning_rate": 1.787150360166623e-05, + "loss": 0.0891, + "step": 4452 + }, + { + "epoch": 0.8908, + "learning_rate": 1.78628843213662e-05, + "loss": 0.2385, + "step": 4454 + }, + { + "epoch": 0.8912, + "learning_rate": 1.7854249711970826e-05, + "loss": 0.0239, + "step": 4456 + }, + { + "epoch": 0.8916, + "learning_rate": 1.7845599790313732e-05, + "loss": 0.3398, + "step": 4458 + }, + { + "epoch": 0.892, + "learning_rate": 1.783693457325841e-05, + "loss": 0.2394, + "step": 4460 + }, + { + "epoch": 0.8924, + "learning_rate": 1.782825407769811e-05, + "loss": 0.2798, + "step": 4462 + }, + { + "epoch": 0.8928, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.6028, + "step": 4464 + }, + { + "epoch": 0.8932, + "learning_rate": 1.7810847318784632e-05, + "loss": 0.3387, + "step": 4466 + }, + { + "epoch": 0.8936, + "learning_rate": 1.780212108936685e-05, + "loss": 0.0928, + "step": 4468 + }, + { + "epoch": 0.894, + "learning_rate": 1.7793379649314743e-05, + "loss": 0.2647, + "step": 4470 + }, + { + "epoch": 0.8944, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.2654, + "step": 4472 + }, + { + "epoch": 0.8948, + "learning_rate": 1.777585120550481e-05, + "loss": 0.6053, + "step": 4474 + }, + { + "epoch": 0.8952, + "learning_rate": 1.7767064235919594e-05, + "loss": 0.1552, + "step": 4476 + }, + { + "epoch": 0.8956, + "learning_rate": 1.77582621240452e-05, + "loss": 0.2883, + "step": 4478 + }, + { + "epoch": 0.896, + "learning_rate": 1.77494448870418e-05, + "loss": 0.4466, + "step": 4480 + }, + { + "epoch": 0.8964, + "learning_rate": 1.774061254209907e-05, + "loss": 0.5614, + "step": 4482 + }, + { + "epoch": 0.8968, + "learning_rate": 1.773176510643608e-05, + "loss": 0.7328, + "step": 4484 + }, + { + "epoch": 0.8972, + "learning_rate": 1.7722902597301388e-05, + "loss": 0.5033, + "step": 4486 + }, + { + "epoch": 0.8976, + "learning_rate": 1.7714025031972894e-05, + "loss": 0.4507, + "step": 4488 + }, + { + "epoch": 0.898, + "learning_rate": 1.77051324277579e-05, + "loss": 0.1814, + "step": 4490 + }, + { + "epoch": 0.8984, + "learning_rate": 1.769622480199295e-05, + "loss": 0.4668, + "step": 4492 + }, + { + "epoch": 0.8988, + "learning_rate": 1.7687302172043926e-05, + "loss": 0.4958, + "step": 4494 + }, + { + "epoch": 0.8992, + "learning_rate": 1.7678364555305982e-05, + "loss": 0.4973, + "step": 4496 + }, + { + "epoch": 0.8996, + "learning_rate": 1.7669411969203424e-05, + "loss": 0.1237, + "step": 4498 + }, + { + "epoch": 0.9, + "learning_rate": 1.7660444431189777e-05, + "loss": 0.3068, + "step": 4500 + }, + { + "epoch": 0.9004, + "learning_rate": 1.765146195874774e-05, + "loss": 0.5162, + "step": 4502 + }, + { + "epoch": 0.9008, + "learning_rate": 1.76424645693891e-05, + "loss": 0.7352, + "step": 4504 + }, + { + "epoch": 0.9012, + "learning_rate": 1.7633452280654696e-05, + "loss": 0.294, + "step": 4506 + }, + { + "epoch": 0.9016, + "learning_rate": 1.762442511011448e-05, + "loss": 0.1496, + "step": 4508 + }, + { + "epoch": 0.902, + "learning_rate": 1.761538307536738e-05, + "loss": 0.2762, + "step": 4510 + }, + { + "epoch": 0.9024, + "learning_rate": 1.7606326194041285e-05, + "loss": 0.1972, + "step": 4512 + }, + { + "epoch": 0.9028, + "learning_rate": 1.759725448379305e-05, + "loss": 0.577, + "step": 4514 + }, + { + "epoch": 0.9032, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.1347, + "step": 4516 + }, + { + "epoch": 0.9036, + "learning_rate": 1.7579066647302147e-05, + "loss": 0.3436, + "step": 4518 + }, + { + "epoch": 0.904, + "learning_rate": 1.756995055651757e-05, + "loss": 0.1125, + "step": 4520 + }, + { + "epoch": 0.9044, + "learning_rate": 1.7560819707727037e-05, + "loss": 0.1553, + "step": 4522 + }, + { + "epoch": 0.9048, + "learning_rate": 1.7551674118731585e-05, + "loss": 0.4901, + "step": 4524 + }, + { + "epoch": 0.9052, + "learning_rate": 1.7542513807361044e-05, + "loss": 0.66, + "step": 4526 + }, + { + "epoch": 0.9056, + "learning_rate": 1.7533338791473875e-05, + "loss": 0.2167, + "step": 4528 + }, + { + "epoch": 0.906, + "learning_rate": 1.7524149088957238e-05, + "loss": 0.191, + "step": 4530 + }, + { + "epoch": 0.9064, + "learning_rate": 1.751494471772697e-05, + "loss": 0.1084, + "step": 4532 + }, + { + "epoch": 0.9068, + "learning_rate": 1.750572569572742e-05, + "loss": 0.2735, + "step": 4534 + }, + { + "epoch": 0.9072, + "learning_rate": 1.7496492040931548e-05, + "loss": 0.3394, + "step": 4536 + }, + { + "epoch": 0.9076, + "learning_rate": 1.7487243771340862e-05, + "loss": 0.56, + "step": 4538 + }, + { + "epoch": 0.908, + "learning_rate": 1.747798090498533e-05, + "loss": 0.3243, + "step": 4540 + }, + { + "epoch": 0.9084, + "learning_rate": 1.7468703459923365e-05, + "loss": 0.5008, + "step": 4542 + }, + { + "epoch": 0.9088, + "learning_rate": 1.745941145424182e-05, + "loss": 0.392, + "step": 4544 + }, + { + "epoch": 0.9092, + "learning_rate": 1.7450104906055973e-05, + "loss": 0.2317, + "step": 4546 + }, + { + "epoch": 0.9096, + "learning_rate": 1.744078383350938e-05, + "loss": 0.5204, + "step": 4548 + }, + { + "epoch": 0.91, + "learning_rate": 1.7431448254773943e-05, + "loss": 0.5559, + "step": 4550 + }, + { + "epoch": 0.9104, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.4116, + "step": 4552 + }, + { + "epoch": 0.9108, + "learning_rate": 1.7412733651565624e-05, + "loss": 0.2857, + "step": 4554 + }, + { + "epoch": 0.9112, + "learning_rate": 1.7403354663577782e-05, + "loss": 0.3613, + "step": 4556 + }, + { + "epoch": 0.9116, + "learning_rate": 1.739396124237121e-05, + "loss": 0.4433, + "step": 4558 + }, + { + "epoch": 0.912, + "learning_rate": 1.738455340625883e-05, + "loss": 0.2962, + "step": 4560 + }, + { + "epoch": 0.9124, + "learning_rate": 1.7375131173581744e-05, + "loss": 0.5073, + "step": 4562 + }, + { + "epoch": 0.9128, + "learning_rate": 1.7365694562709038e-05, + "loss": 0.5834, + "step": 4564 + }, + { + "epoch": 0.9132, + "learning_rate": 1.7356243592037865e-05, + "loss": 0.419, + "step": 4566 + }, + { + "epoch": 0.9136, + "learning_rate": 1.7346778279993433e-05, + "loss": 0.1875, + "step": 4568 + }, + { + "epoch": 0.914, + "learning_rate": 1.733729864502877e-05, + "loss": 0.5513, + "step": 4570 + }, + { + "epoch": 0.9144, + "learning_rate": 1.7327804705624962e-05, + "loss": 0.306, + "step": 4572 + }, + { + "epoch": 0.9148, + "learning_rate": 1.731829648029091e-05, + "loss": 0.2081, + "step": 4574 + }, + { + "epoch": 0.9152, + "learning_rate": 1.730877398756341e-05, + "loss": 0.6129, + "step": 4576 + }, + { + "epoch": 0.9156, + "learning_rate": 1.7299237246007025e-05, + "loss": 0.1026, + "step": 4578 + }, + { + "epoch": 0.916, + "learning_rate": 1.7289686274214113e-05, + "loss": 0.2232, + "step": 4580 + }, + { + "epoch": 0.9164, + "learning_rate": 1.7280121090804824e-05, + "loss": 0.2233, + "step": 4582 + }, + { + "epoch": 0.9168, + "learning_rate": 1.727054171442693e-05, + "loss": 0.4527, + "step": 4584 + }, + { + "epoch": 0.9172, + "learning_rate": 1.7260948163755918e-05, + "loss": 0.4862, + "step": 4586 + }, + { + "epoch": 0.9176, + "learning_rate": 1.7251340457494934e-05, + "loss": 0.5739, + "step": 4588 + }, + { + "epoch": 0.918, + "learning_rate": 1.7241718614374688e-05, + "loss": 0.2922, + "step": 4590 + }, + { + "epoch": 0.9184, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.4795, + "step": 4592 + }, + { + "epoch": 0.9188, + "learning_rate": 1.722243259261697e-05, + "loss": 0.2121, + "step": 4594 + }, + { + "epoch": 0.9192, + "learning_rate": 1.7212768451578595e-05, + "loss": 0.4462, + "step": 4596 + }, + { + "epoch": 0.9196, + "learning_rate": 1.7203090248879084e-05, + "loss": 0.5281, + "step": 4598 + }, + { + "epoch": 0.92, + "learning_rate": 1.7193398003386517e-05, + "loss": 0.445, + "step": 4600 + }, + { + "epoch": 0.9204, + "learning_rate": 1.7183691733996463e-05, + "loss": 0.094, + "step": 4602 + }, + { + "epoch": 0.9208, + "learning_rate": 1.7173971459631803e-05, + "loss": 0.2429, + "step": 4604 + }, + { + "epoch": 0.9212, + "learning_rate": 1.7164237199242663e-05, + "loss": 0.266, + "step": 4606 + }, + { + "epoch": 0.9216, + "learning_rate": 1.7154488971806525e-05, + "loss": 0.5417, + "step": 4608 + }, + { + "epoch": 0.922, + "learning_rate": 1.7144726796328027e-05, + "loss": 0.0933, + "step": 4610 + }, + { + "epoch": 0.9224, + "learning_rate": 1.713495069183907e-05, + "loss": 0.7588, + "step": 4612 + }, + { + "epoch": 0.9228, + "learning_rate": 1.7125160677398632e-05, + "loss": 0.6549, + "step": 4614 + }, + { + "epoch": 0.9232, + "learning_rate": 1.7115356772092847e-05, + "loss": 0.4359, + "step": 4616 + }, + { + "epoch": 0.9236, + "learning_rate": 1.710553899503497e-05, + "loss": 0.2479, + "step": 4618 + }, + { + "epoch": 0.924, + "learning_rate": 1.709570736536522e-05, + "loss": 0.1795, + "step": 4620 + }, + { + "epoch": 0.9244, + "learning_rate": 1.708586190225086e-05, + "loss": 0.1223, + "step": 4622 + }, + { + "epoch": 0.9248, + "learning_rate": 1.7076002624886152e-05, + "loss": 0.397, + "step": 4624 + }, + { + "epoch": 0.9252, + "learning_rate": 1.7066129552492258e-05, + "loss": 0.3042, + "step": 4626 + }, + { + "epoch": 0.9256, + "learning_rate": 1.705624270431722e-05, + "loss": 0.9217, + "step": 4628 + }, + { + "epoch": 0.926, + "learning_rate": 1.7046342099635945e-05, + "loss": 0.1602, + "step": 4630 + }, + { + "epoch": 0.9264, + "learning_rate": 1.70364277577502e-05, + "loss": 0.646, + "step": 4632 + }, + { + "epoch": 0.9268, + "learning_rate": 1.702649969798851e-05, + "loss": 0.9233, + "step": 4634 + }, + { + "epoch": 0.9272, + "learning_rate": 1.7016557939706078e-05, + "loss": 0.3412, + "step": 4636 + }, + { + "epoch": 0.9276, + "learning_rate": 1.700660250228492e-05, + "loss": 0.8491, + "step": 4638 + }, + { + "epoch": 0.928, + "learning_rate": 1.6996633405133673e-05, + "loss": 0.4218, + "step": 4640 + }, + { + "epoch": 0.9284, + "learning_rate": 1.6986650667687556e-05, + "loss": 0.0877, + "step": 4642 + }, + { + "epoch": 0.9288, + "learning_rate": 1.6976654309408468e-05, + "loss": 0.2737, + "step": 4644 + }, + { + "epoch": 0.9292, + "learning_rate": 1.69666443497848e-05, + "loss": 0.4594, + "step": 4646 + }, + { + "epoch": 0.9296, + "learning_rate": 1.6956620808331515e-05, + "loss": 0.2208, + "step": 4648 + }, + { + "epoch": 0.93, + "learning_rate": 1.694658370458998e-05, + "loss": 0.2182, + "step": 4650 + }, + { + "epoch": 0.9304, + "learning_rate": 1.6936533058128042e-05, + "loss": 0.2622, + "step": 4652 + }, + { + "epoch": 0.9308, + "learning_rate": 1.692646888854001e-05, + "loss": 0.6582, + "step": 4654 + }, + { + "epoch": 0.9312, + "learning_rate": 1.691639121544641e-05, + "loss": 0.4253, + "step": 4656 + }, + { + "epoch": 0.9316, + "learning_rate": 1.690630005849424e-05, + "loss": 0.5316, + "step": 4658 + }, + { + "epoch": 0.932, + "learning_rate": 1.6896195437356696e-05, + "loss": 0.0417, + "step": 4660 + }, + { + "epoch": 0.9324, + "learning_rate": 1.6886077371733295e-05, + "loss": 0.1662, + "step": 4662 + }, + { + "epoch": 0.9328, + "learning_rate": 1.6875945881349686e-05, + "loss": 0.1988, + "step": 4664 + }, + { + "epoch": 0.9332, + "learning_rate": 1.6865800985957725e-05, + "loss": 0.1458, + "step": 4666 + }, + { + "epoch": 0.9336, + "learning_rate": 1.6855642705335435e-05, + "loss": 0.394, + "step": 4668 + }, + { + "epoch": 0.934, + "learning_rate": 1.68454710592869e-05, + "loss": 0.1401, + "step": 4670 + }, + { + "epoch": 0.9344, + "learning_rate": 1.6835286067642228e-05, + "loss": 0.5404, + "step": 4672 + }, + { + "epoch": 0.9348, + "learning_rate": 1.6825087750257617e-05, + "loss": 0.2642, + "step": 4674 + }, + { + "epoch": 0.9352, + "learning_rate": 1.681487612701521e-05, + "loss": 0.172, + "step": 4676 + }, + { + "epoch": 0.9356, + "learning_rate": 1.6804651217823055e-05, + "loss": 0.2817, + "step": 4678 + }, + { + "epoch": 0.936, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.354, + "step": 4680 + }, + { + "epoch": 0.9364, + "learning_rate": 1.6784161621351374e-05, + "loss": 0.2172, + "step": 4682 + }, + { + "epoch": 0.9368, + "learning_rate": 1.677389697401739e-05, + "loss": 0.3583, + "step": 4684 + }, + { + "epoch": 0.9372, + "learning_rate": 1.67636191206246e-05, + "loss": 0.4258, + "step": 4686 + }, + { + "epoch": 0.9376, + "learning_rate": 1.675332808121025e-05, + "loss": 0.4329, + "step": 4688 + }, + { + "epoch": 0.938, + "learning_rate": 1.6743023875837253e-05, + "loss": 0.3078, + "step": 4690 + }, + { + "epoch": 0.9384, + "learning_rate": 1.6732706524594145e-05, + "loss": 0.1831, + "step": 4692 + }, + { + "epoch": 0.9388, + "learning_rate": 1.672237604759517e-05, + "loss": 0.4118, + "step": 4694 + }, + { + "epoch": 0.9392, + "learning_rate": 1.671203246498009e-05, + "loss": 0.2657, + "step": 4696 + }, + { + "epoch": 0.9396, + "learning_rate": 1.670167579691429e-05, + "loss": 0.5089, + "step": 4698 + }, + { + "epoch": 0.94, + "learning_rate": 1.6691306063588593e-05, + "loss": 0.305, + "step": 4700 + }, + { + "epoch": 0.9404, + "learning_rate": 1.668092328521931e-05, + "loss": 0.4701, + "step": 4702 + }, + { + "epoch": 0.9408, + "learning_rate": 1.6670527482048242e-05, + "loss": 0.3058, + "step": 4704 + }, + { + "epoch": 0.9412, + "learning_rate": 1.6660118674342525e-05, + "loss": 0.0792, + "step": 4706 + }, + { + "epoch": 0.9416, + "learning_rate": 1.6649696882394635e-05, + "loss": 0.18, + "step": 4708 + }, + { + "epoch": 0.942, + "learning_rate": 1.6639262126522414e-05, + "loss": 0.1372, + "step": 4710 + }, + { + "epoch": 0.9424, + "learning_rate": 1.6628814427068968e-05, + "loss": 0.1889, + "step": 4712 + }, + { + "epoch": 0.9428, + "learning_rate": 1.661835380440258e-05, + "loss": 1.3578, + "step": 4714 + }, + { + "epoch": 0.9432, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.213, + "step": 4716 + }, + { + "epoch": 0.9436, + "learning_rate": 1.6597393871030264e-05, + "loss": 0.2541, + "step": 4718 + }, + { + "epoch": 0.944, + "learning_rate": 1.6586894601186824e-05, + "loss": 0.2216, + "step": 4720 + }, + { + "epoch": 0.9444, + "learning_rate": 1.6576382489855278e-05, + "loss": 0.528, + "step": 4722 + }, + { + "epoch": 0.9448, + "learning_rate": 1.656585755752957e-05, + "loss": 0.3694, + "step": 4724 + }, + { + "epoch": 0.9452, + "learning_rate": 1.655531982472859e-05, + "loss": 0.7978, + "step": 4726 + }, + { + "epoch": 0.9456, + "learning_rate": 1.6544769311996153e-05, + "loss": 0.7516, + "step": 4728 + }, + { + "epoch": 0.946, + "learning_rate": 1.653420603990106e-05, + "loss": 0.1636, + "step": 4730 + }, + { + "epoch": 0.9464, + "learning_rate": 1.6523630029036924e-05, + "loss": 0.1532, + "step": 4732 + }, + { + "epoch": 0.9468, + "learning_rate": 1.651304130002226e-05, + "loss": 0.6308, + "step": 4734 + }, + { + "epoch": 0.9472, + "learning_rate": 1.6502439873500294e-05, + "loss": 0.7698, + "step": 4736 + }, + { + "epoch": 0.9476, + "learning_rate": 1.6491825770139058e-05, + "loss": 0.5311, + "step": 4738 + }, + { + "epoch": 0.948, + "learning_rate": 1.6481199010631305e-05, + "loss": 0.9658, + "step": 4740 + }, + { + "epoch": 0.9484, + "learning_rate": 1.6470559615694455e-05, + "loss": 0.3795, + "step": 4742 + }, + { + "epoch": 0.9488, + "learning_rate": 1.645990760607052e-05, + "loss": 0.1441, + "step": 4744 + }, + { + "epoch": 0.9492, + "learning_rate": 1.644924300252614e-05, + "loss": 0.1127, + "step": 4746 + }, + { + "epoch": 0.9496, + "learning_rate": 1.643856582585255e-05, + "loss": 0.1803, + "step": 4748 + }, + { + "epoch": 0.95, + "learning_rate": 1.6427876096865407e-05, + "loss": 0.519, + "step": 4750 + }, + { + "epoch": 0.9504, + "learning_rate": 1.641717383640488e-05, + "loss": 0.4095, + "step": 4752 + }, + { + "epoch": 0.9508, + "learning_rate": 1.6406459065335616e-05, + "loss": 0.4066, + "step": 4754 + }, + { + "epoch": 0.9512, + "learning_rate": 1.6395731804546596e-05, + "loss": 0.3499, + "step": 4756 + }, + { + "epoch": 0.9516, + "learning_rate": 1.6384992074951128e-05, + "loss": 0.3735, + "step": 4758 + }, + { + "epoch": 0.952, + "learning_rate": 1.63742398974869e-05, + "loss": 0.2905, + "step": 4760 + }, + { + "epoch": 0.9524, + "learning_rate": 1.6363475293115838e-05, + "loss": 0.3969, + "step": 4762 + }, + { + "epoch": 0.9528, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.9632, + "step": 4764 + }, + { + "epoch": 0.9532, + "learning_rate": 1.63419088876219e-05, + "loss": 0.2666, + "step": 4766 + }, + { + "epoch": 0.9536, + "learning_rate": 1.633110712854385e-05, + "loss": 0.802, + "step": 4768 + }, + { + "epoch": 0.954, + "learning_rate": 1.6320293026648515e-05, + "loss": 0.2691, + "step": 4770 + }, + { + "epoch": 0.9544, + "learning_rate": 1.6309466603018504e-05, + "loss": 0.5391, + "step": 4772 + }, + { + "epoch": 0.9548, + "learning_rate": 1.6298627878760495e-05, + "loss": 0.214, + "step": 4774 + }, + { + "epoch": 0.9552, + "learning_rate": 1.6287776875005148e-05, + "loss": 0.2097, + "step": 4776 + }, + { + "epoch": 0.9556, + "learning_rate": 1.6276913612907015e-05, + "loss": 0.4982, + "step": 4778 + }, + { + "epoch": 0.956, + "learning_rate": 1.6266038113644612e-05, + "loss": 0.5392, + "step": 4780 + }, + { + "epoch": 0.9564, + "learning_rate": 1.6255150398420266e-05, + "loss": 0.4207, + "step": 4782 + }, + { + "epoch": 0.9568, + "learning_rate": 1.624425048846017e-05, + "loss": 0.4933, + "step": 4784 + }, + { + "epoch": 0.9572, + "learning_rate": 1.623333840501421e-05, + "loss": 0.2727, + "step": 4786 + }, + { + "epoch": 0.9576, + "learning_rate": 1.6222414169356063e-05, + "loss": 0.3189, + "step": 4788 + }, + { + "epoch": 0.958, + "learning_rate": 1.6211477802783102e-05, + "loss": 0.0928, + "step": 4790 + }, + { + "epoch": 0.9584, + "learning_rate": 1.6200529326616343e-05, + "loss": 0.2187, + "step": 4792 + }, + { + "epoch": 0.9588, + "learning_rate": 1.618956876220035e-05, + "loss": 0.2812, + "step": 4794 + }, + { + "epoch": 0.9592, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.369, + "step": 4796 + }, + { + "epoch": 0.9596, + "learning_rate": 1.616761145411704e-05, + "loss": 0.457, + "step": 4798 + }, + { + "epoch": 0.96, + "learning_rate": 1.6156614753256587e-05, + "loss": 0.377, + "step": 4800 + }, + { + "epoch": 0.9604, + "learning_rate": 1.6145606049760648e-05, + "loss": 0.4434, + "step": 4802 + }, + { + "epoch": 0.9608, + "learning_rate": 1.613458536509123e-05, + "loss": 0.8189, + "step": 4804 + }, + { + "epoch": 0.9612, + "learning_rate": 1.612355272073378e-05, + "loss": 0.5349, + "step": 4806 + }, + { + "epoch": 0.9616, + "learning_rate": 1.6112508138196922e-05, + "loss": 0.2653, + "step": 4808 + }, + { + "epoch": 0.962, + "learning_rate": 1.610145163901268e-05, + "loss": 0.6323, + "step": 4810 + }, + { + "epoch": 0.9624, + "learning_rate": 1.6090383244736277e-05, + "loss": 0.0429, + "step": 4812 + }, + { + "epoch": 0.9628, + "learning_rate": 1.6079302976946062e-05, + "loss": 0.3353, + "step": 4814 + }, + { + "epoch": 0.9632, + "learning_rate": 1.606821085724363e-05, + "loss": 0.1851, + "step": 4816 + }, + { + "epoch": 0.9636, + "learning_rate": 1.6057106907253607e-05, + "loss": 0.2295, + "step": 4818 + }, + { + "epoch": 0.964, + "learning_rate": 1.6045991148623756e-05, + "loss": 0.3672, + "step": 4820 + }, + { + "epoch": 0.9644, + "learning_rate": 1.6034863603024775e-05, + "loss": 0.1451, + "step": 4822 + }, + { + "epoch": 0.9648, + "learning_rate": 1.602372429215038e-05, + "loss": 0.1974, + "step": 4824 + }, + { + "epoch": 0.9652, + "learning_rate": 1.6012573237717265e-05, + "loss": 0.3073, + "step": 4826 + }, + { + "epoch": 0.9656, + "learning_rate": 1.600141046146497e-05, + "loss": 0.2618, + "step": 4828 + }, + { + "epoch": 0.966, + "learning_rate": 1.5990235985155856e-05, + "loss": 0.2805, + "step": 4830 + }, + { + "epoch": 0.9664, + "learning_rate": 1.597904983057519e-05, + "loss": 0.2173, + "step": 4832 + }, + { + "epoch": 0.9668, + "learning_rate": 1.5967852019530942e-05, + "loss": 0.2199, + "step": 4834 + }, + { + "epoch": 0.9672, + "learning_rate": 1.5956642573853794e-05, + "loss": 0.3157, + "step": 4836 + }, + { + "epoch": 0.9676, + "learning_rate": 1.5945421515397135e-05, + "loss": 0.3602, + "step": 4838 + }, + { + "epoch": 0.968, + "learning_rate": 1.5934188866037014e-05, + "loss": 0.212, + "step": 4840 + }, + { + "epoch": 0.9684, + "learning_rate": 1.5922944647672068e-05, + "loss": 1.6341, + "step": 4842 + }, + { + "epoch": 0.9688, + "learning_rate": 1.591168888222342e-05, + "loss": 0.442, + "step": 4844 + }, + { + "epoch": 0.9692, + "learning_rate": 1.5900421591634816e-05, + "loss": 0.3332, + "step": 4846 + }, + { + "epoch": 0.9696, + "learning_rate": 1.5889142797872407e-05, + "loss": 0.1613, + "step": 4848 + }, + { + "epoch": 0.97, + "learning_rate": 1.5877852522924736e-05, + "loss": 0.2154, + "step": 4850 + }, + { + "epoch": 0.9704, + "learning_rate": 1.5866550788802818e-05, + "loss": 0.3457, + "step": 4852 + }, + { + "epoch": 0.9708, + "learning_rate": 1.5855237617539932e-05, + "loss": 0.517, + "step": 4854 + }, + { + "epoch": 0.9712, + "learning_rate": 1.584391303119173e-05, + "loss": 0.6896, + "step": 4856 + }, + { + "epoch": 0.9716, + "learning_rate": 1.5832577051836023e-05, + "loss": 0.3691, + "step": 4858 + }, + { + "epoch": 0.972, + "learning_rate": 1.582122970157289e-05, + "loss": 0.3319, + "step": 4860 + }, + { + "epoch": 0.9724, + "learning_rate": 1.5809871002524592e-05, + "loss": 0.2666, + "step": 4862 + }, + { + "epoch": 0.9728, + "learning_rate": 1.5798500976835503e-05, + "loss": 0.1984, + "step": 4864 + }, + { + "epoch": 0.9732, + "learning_rate": 1.5787119646672032e-05, + "loss": 0.4403, + "step": 4866 + }, + { + "epoch": 0.9736, + "learning_rate": 1.577572703422267e-05, + "loss": 0.2345, + "step": 4868 + }, + { + "epoch": 0.974, + "learning_rate": 1.5764323161697946e-05, + "loss": 0.494, + "step": 4870 + }, + { + "epoch": 0.9744, + "learning_rate": 1.575290805133024e-05, + "loss": 0.1961, + "step": 4872 + }, + { + "epoch": 0.9748, + "learning_rate": 1.5741481725373896e-05, + "loss": 0.4632, + "step": 4874 + }, + { + "epoch": 0.9752, + "learning_rate": 1.5730044206105156e-05, + "loss": 0.353, + "step": 4876 + }, + { + "epoch": 0.9756, + "learning_rate": 1.571859551582204e-05, + "loss": 0.3184, + "step": 4878 + }, + { + "epoch": 0.976, + "learning_rate": 1.570713567684432e-05, + "loss": 1.1186, + "step": 4880 + }, + { + "epoch": 0.9764, + "learning_rate": 1.5695664711513575e-05, + "loss": 0.2795, + "step": 4882 + }, + { + "epoch": 0.9768, + "learning_rate": 1.5684182642193047e-05, + "loss": 0.9398, + "step": 4884 + }, + { + "epoch": 0.9772, + "learning_rate": 1.567268949126757e-05, + "loss": 0.7083, + "step": 4886 + }, + { + "epoch": 0.9776, + "learning_rate": 1.566118528114367e-05, + "loss": 0.2595, + "step": 4888 + }, + { + "epoch": 0.978, + "learning_rate": 1.5649670034249372e-05, + "loss": 0.1587, + "step": 4890 + }, + { + "epoch": 0.9784, + "learning_rate": 1.563814377303429e-05, + "loss": 0.4812, + "step": 4892 + }, + { + "epoch": 0.9788, + "learning_rate": 1.5626606519969373e-05, + "loss": 0.5026, + "step": 4894 + }, + { + "epoch": 0.9792, + "learning_rate": 1.561505829754715e-05, + "loss": 0.1459, + "step": 4896 + }, + { + "epoch": 0.9796, + "learning_rate": 1.5603499128281437e-05, + "loss": 0.226, + "step": 4898 + }, + { + "epoch": 0.98, + "learning_rate": 1.5591929034707475e-05, + "loss": 0.2713, + "step": 4900 + }, + { + "epoch": 0.9804, + "learning_rate": 1.558034803938171e-05, + "loss": 0.6236, + "step": 4902 + }, + { + "epoch": 0.9808, + "learning_rate": 1.5568756164881874e-05, + "loss": 0.4798, + "step": 4904 + }, + { + "epoch": 0.9812, + "learning_rate": 1.5557153433806974e-05, + "loss": 1.7871, + "step": 4906 + }, + { + "epoch": 0.9816, + "learning_rate": 1.5545539868777085e-05, + "loss": 0.3544, + "step": 4908 + }, + { + "epoch": 0.982, + "learning_rate": 1.5533915492433437e-05, + "loss": 0.297, + "step": 4910 + }, + { + "epoch": 0.9824, + "learning_rate": 1.5522280327438384e-05, + "loss": 0.6249, + "step": 4912 + }, + { + "epoch": 0.9828, + "learning_rate": 1.5510634396475275e-05, + "loss": 0.2954, + "step": 4914 + }, + { + "epoch": 0.9832, + "learning_rate": 1.5498977722248398e-05, + "loss": 1.0988, + "step": 4916 + }, + { + "epoch": 0.9836, + "learning_rate": 1.5487310327483084e-05, + "loss": 0.4274, + "step": 4918 + }, + { + "epoch": 0.984, + "learning_rate": 1.547563223492552e-05, + "loss": 1.0239, + "step": 4920 + }, + { + "epoch": 0.9844, + "learning_rate": 1.5463943467342708e-05, + "loss": 0.3867, + "step": 4922 + }, + { + "epoch": 0.9848, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.1976, + "step": 4924 + }, + { + "epoch": 0.9852, + "learning_rate": 1.5440533998273552e-05, + "loss": 0.103, + "step": 4926 + }, + { + "epoch": 0.9856, + "learning_rate": 1.5428813342425194e-05, + "loss": 0.4862, + "step": 4928 + }, + { + "epoch": 0.986, + "learning_rate": 1.5417082102827407e-05, + "loss": 0.3815, + "step": 4930 + }, + { + "epoch": 0.9864, + "learning_rate": 1.5405340302350876e-05, + "loss": 0.156, + "step": 4932 + }, + { + "epoch": 0.9868, + "learning_rate": 1.5393587963886827e-05, + "loss": 0.3448, + "step": 4934 + }, + { + "epoch": 0.9872, + "learning_rate": 1.538182511034708e-05, + "loss": 0.3498, + "step": 4936 + }, + { + "epoch": 0.9876, + "learning_rate": 1.5370051764663875e-05, + "loss": 0.6057, + "step": 4938 + }, + { + "epoch": 0.988, + "learning_rate": 1.535826794978996e-05, + "loss": 0.1878, + "step": 4940 + }, + { + "epoch": 0.9884, + "learning_rate": 1.534647368869852e-05, + "loss": 0.267, + "step": 4942 + }, + { + "epoch": 0.9888, + "learning_rate": 1.5334669004383036e-05, + "loss": 0.0995, + "step": 4944 + }, + { + "epoch": 0.9892, + "learning_rate": 1.5322853919857337e-05, + "loss": 0.2764, + "step": 4946 + }, + { + "epoch": 0.9896, + "learning_rate": 1.5311028458155564e-05, + "loss": 0.2402, + "step": 4948 + }, + { + "epoch": 0.99, + "learning_rate": 1.5299192642332063e-05, + "loss": 0.5147, + "step": 4950 + }, + { + "epoch": 0.9904, + "learning_rate": 1.528734649546133e-05, + "loss": 0.0648, + "step": 4952 + }, + { + "epoch": 0.9908, + "learning_rate": 1.5275490040638038e-05, + "loss": 0.2504, + "step": 4954 + }, + { + "epoch": 0.9912, + "learning_rate": 1.5263623300976997e-05, + "loss": 0.7926, + "step": 4956 + }, + { + "epoch": 0.9916, + "learning_rate": 1.5251746299612973e-05, + "loss": 0.1338, + "step": 4958 + }, + { + "epoch": 0.992, + "learning_rate": 1.5239859059700792e-05, + "loss": 0.3954, + "step": 4960 + }, + { + "epoch": 0.9924, + "learning_rate": 1.522796160441527e-05, + "loss": 0.2655, + "step": 4962 + }, + { + "epoch": 0.9928, + "learning_rate": 1.5216053956951096e-05, + "loss": 0.3116, + "step": 4964 + }, + { + "epoch": 0.9932, + "learning_rate": 1.5204136140522799e-05, + "loss": 0.1699, + "step": 4966 + }, + { + "epoch": 0.9936, + "learning_rate": 1.5192208178364819e-05, + "loss": 0.2399, + "step": 4968 + }, + { + "epoch": 0.994, + "learning_rate": 1.5180270093731291e-05, + "loss": 0.3266, + "step": 4970 + }, + { + "epoch": 0.9944, + "learning_rate": 1.5168321909896176e-05, + "loss": 0.253, + "step": 4972 + }, + { + "epoch": 0.9948, + "learning_rate": 1.5156363650153017e-05, + "loss": 0.2107, + "step": 4974 + }, + { + "epoch": 0.9952, + "learning_rate": 1.5144395337815057e-05, + "loss": 0.1531, + "step": 4976 + }, + { + "epoch": 0.9956, + "learning_rate": 1.5132416996215178e-05, + "loss": 0.6295, + "step": 4978 + }, + { + "epoch": 0.996, + "learning_rate": 1.5120428648705722e-05, + "loss": 0.4572, + "step": 4980 + }, + { + "epoch": 0.9964, + "learning_rate": 1.5108430318658607e-05, + "loss": 0.261, + "step": 4982 + }, + { + "epoch": 0.9968, + "learning_rate": 1.5096422029465171e-05, + "loss": 0.6105, + "step": 4984 + }, + { + "epoch": 0.9972, + "learning_rate": 1.5084403804536236e-05, + "loss": 0.1747, + "step": 4986 + }, + { + "epoch": 0.9976, + "learning_rate": 1.5072375667301904e-05, + "loss": 0.7721, + "step": 4988 + }, + { + "epoch": 0.998, + "learning_rate": 1.5060337641211636e-05, + "loss": 0.159, + "step": 4990 + }, + { + "epoch": 0.9984, + "learning_rate": 1.5048289749734231e-05, + "loss": 0.2571, + "step": 4992 + }, + { + "epoch": 0.9988, + "learning_rate": 1.5036232016357622e-05, + "loss": 0.5908, + "step": 4994 + }, + { + "epoch": 0.9992, + "learning_rate": 1.502416446458898e-05, + "loss": 0.344, + "step": 4996 + }, + { + "epoch": 0.9996, + "learning_rate": 1.5012087117954641e-05, + "loss": 0.4671, + "step": 4998 + }, + { + "epoch": 1.0, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.1265, + "step": 5000 + }, + { + "epoch": 1.0, + "step": 5000, + "total_flos": 0, + "train_loss": 0.4200633437473327, + "train_runtime": 22460.8343, + "train_samples_per_second": 3.562, + "train_steps_per_second": 0.223 + } + ], + "logging_steps": 2, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..12551f8335d5e0528896ce3ebbff583f9b5d4efb --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64591c8820f0c4df71367fc4c404b79094ab0e1a2cfe4d84a8aba5d404efbcb9 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d554d9dd62ff5b1becacdbd1d7bc54f71496299a --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62131a6ec5a0420454e2118ed3bab89a132aee41060161c579deb6e79b4d4064 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..725185786c7b940f005fb9789b0f2192123b57ba --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e6be21445b4f8d8e78d3d46c5ce24ec9aa4a578c46de0fb697350f1b5d4a5e +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b4d898b66c864b78f54bfe2f6bffa3df5327e43e --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_divbs_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5643f57073b3eaf4b4755917e0b453cd04ff3e747957929d121a97d672d3c922 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c4b52e3df8a04c46af2c87629905848ae3918a69 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,17532 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004, + "grad_norm": 9.43017578125, + "learning_rate": 2.3485380412522497e-06, + "loss": 0.771, + "step": 2 + }, + { + "epoch": 0.0008, + "grad_norm": 4.271265983581543, + "learning_rate": 2.357535430610912e-06, + "loss": 0.2701, + "step": 4 + }, + { + "epoch": 0.0012, + "grad_norm": 6.253836154937744, + "learning_rate": 2.366547719345306e-06, + "loss": 0.5153, + "step": 6 + }, + { + "epoch": 0.0016, + "grad_norm": 5.324436187744141, + "learning_rate": 2.3755748898855234e-06, + "loss": 1.8499, + "step": 8 + }, + { + "epoch": 0.002, + "grad_norm": 6.794293403625488, + "learning_rate": 2.3846169246326332e-06, + "loss": 0.6183, + "step": 10 + }, + { + "epoch": 0.0024, + "grad_norm": 5.396615982055664, + "learning_rate": 2.3936738059587174e-06, + "loss": 0.7139, + "step": 12 + }, + { + "epoch": 0.0028, + "grad_norm": 5.497649669647217, + "learning_rate": 2.4027455162069537e-06, + "loss": 0.615, + "step": 14 + }, + { + "epoch": 0.0032, + "grad_norm": 3.6306519508361816, + "learning_rate": 2.411832037691545e-06, + "loss": 0.3701, + "step": 16 + }, + { + "epoch": 0.0036, + "grad_norm": 7.331335544586182, + "learning_rate": 2.420933352697865e-06, + "loss": 0.418, + "step": 18 + }, + { + "epoch": 0.004, + "grad_norm": 0.024429287761449814, + "learning_rate": 2.430049443482434e-06, + "loss": 0.1258, + "step": 20 + }, + { + "epoch": 0.0044, + "grad_norm": 9.476146697998047, + "learning_rate": 2.439180292272967e-06, + "loss": 0.8353, + "step": 22 + }, + { + "epoch": 0.0048, + "grad_norm": 7.4971418380737305, + "learning_rate": 2.448325881268406e-06, + "loss": 0.2719, + "step": 24 + }, + { + "epoch": 0.0052, + "grad_norm": 3.262369155883789, + "learning_rate": 2.457486192638958e-06, + "loss": 0.0886, + "step": 26 + }, + { + "epoch": 0.0056, + "grad_norm": 3.135603666305542, + "learning_rate": 2.4666612085261277e-06, + "loss": 0.4116, + "step": 28 + }, + { + "epoch": 0.006, + "grad_norm": 6.7829670906066895, + "learning_rate": 2.475850911042752e-06, + "loss": 0.4907, + "step": 30 + }, + { + "epoch": 0.0064, + "grad_norm": 2.5207414627075195, + "learning_rate": 2.4850552822730346e-06, + "loss": 0.2504, + "step": 32 + }, + { + "epoch": 0.0068, + "grad_norm": 5.58534049987793, + "learning_rate": 2.4942743042725836e-06, + "loss": 0.4627, + "step": 34 + }, + { + "epoch": 0.0072, + "grad_norm": 7.460333824157715, + "learning_rate": 2.503507959068455e-06, + "loss": 0.4038, + "step": 36 + }, + { + "epoch": 0.0076, + "grad_norm": 10.503233909606934, + "learning_rate": 2.5127562286591313e-06, + "loss": 0.4796, + "step": 38 + }, + { + "epoch": 0.008, + "grad_norm": 5.661429405212402, + "learning_rate": 2.522019095014686e-06, + "loss": 0.2546, + "step": 40 + }, + { + "epoch": 0.0084, + "grad_norm": 9.722317695617676, + "learning_rate": 2.531296540076638e-06, + "loss": 0.4175, + "step": 42 + }, + { + "epoch": 0.0088, + "grad_norm": 1.4715416431427002, + "learning_rate": 2.5405885457581814e-06, + "loss": 0.3419, + "step": 44 + }, + { + "epoch": 0.0092, + "grad_norm": 7.435876846313477, + "learning_rate": 2.5498950939440413e-06, + "loss": 0.6107, + "step": 46 + }, + { + "epoch": 0.0096, + "grad_norm": 4.131146430969238, + "learning_rate": 2.5592161664906243e-06, + "loss": 0.4518, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 9.233051300048828, + "learning_rate": 2.5685517452260587e-06, + "loss": 0.2908, + "step": 50 + }, + { + "epoch": 0.0104, + "grad_norm": 15.128793716430664, + "learning_rate": 2.5779018119501086e-06, + "loss": 0.8474, + "step": 52 + }, + { + "epoch": 0.0108, + "grad_norm": 16.535146713256836, + "learning_rate": 2.5872663484343887e-06, + "loss": 0.7679, + "step": 54 + }, + { + "epoch": 0.0112, + "grad_norm": 5.005345344543457, + "learning_rate": 2.596645336422219e-06, + "loss": 0.7604, + "step": 56 + }, + { + "epoch": 0.0116, + "grad_norm": 6.112252712249756, + "learning_rate": 2.606038757628795e-06, + "loss": 0.3374, + "step": 58 + }, + { + "epoch": 0.012, + "grad_norm": 5.2450947761535645, + "learning_rate": 2.615446593741161e-06, + "loss": 0.2478, + "step": 60 + }, + { + "epoch": 0.0124, + "grad_norm": 4.900399684906006, + "learning_rate": 2.6248688264182588e-06, + "loss": 0.2727, + "step": 62 + }, + { + "epoch": 0.0128, + "grad_norm": 3.8664023876190186, + "learning_rate": 2.6343054372909648e-06, + "loss": 0.754, + "step": 64 + }, + { + "epoch": 0.0132, + "grad_norm": 7.842030048370361, + "learning_rate": 2.6437564079621235e-06, + "loss": 0.2494, + "step": 66 + }, + { + "epoch": 0.0136, + "grad_norm": 8.617486000061035, + "learning_rate": 2.6532217200065826e-06, + "loss": 0.4807, + "step": 68 + }, + { + "epoch": 0.014, + "grad_norm": 9.217799186706543, + "learning_rate": 2.662701354971232e-06, + "loss": 0.7053, + "step": 70 + }, + { + "epoch": 0.0144, + "grad_norm": 1.7402020692825317, + "learning_rate": 2.6721952943750396e-06, + "loss": 0.3528, + "step": 72 + }, + { + "epoch": 0.0148, + "grad_norm": 8.862558364868164, + "learning_rate": 2.6817035197090825e-06, + "loss": 0.4726, + "step": 74 + }, + { + "epoch": 0.0152, + "grad_norm": 7.722004413604736, + "learning_rate": 2.691226012436604e-06, + "loss": 0.4263, + "step": 76 + }, + { + "epoch": 0.0156, + "grad_norm": 12.282310485839844, + "learning_rate": 2.7007627539929783e-06, + "loss": 0.6798, + "step": 78 + }, + { + "epoch": 0.016, + "grad_norm": 4.7992424964904785, + "learning_rate": 2.7103137257858893e-06, + "loss": 0.2185, + "step": 80 + }, + { + "epoch": 0.0164, + "grad_norm": 5.506490707397461, + "learning_rate": 2.7198789091951806e-06, + "loss": 0.407, + "step": 82 + }, + { + "epoch": 0.0168, + "grad_norm": 6.097909450531006, + "learning_rate": 2.7294582855730733e-06, + "loss": 0.3071, + "step": 84 + }, + { + "epoch": 0.0172, + "grad_norm": 4.383481025695801, + "learning_rate": 2.7390518362440843e-06, + "loss": 0.3197, + "step": 86 + }, + { + "epoch": 0.0176, + "grad_norm": 6.233641147613525, + "learning_rate": 2.7486595425050566e-06, + "loss": 0.7594, + "step": 88 + }, + { + "epoch": 0.018, + "grad_norm": 3.302933692932129, + "learning_rate": 2.7582813856253264e-06, + "loss": 0.4171, + "step": 90 + }, + { + "epoch": 0.0184, + "grad_norm": 6.9959306716918945, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.3943, + "step": 92 + }, + { + "epoch": 0.0188, + "grad_norm": 4.529530048370361, + "learning_rate": 2.777567407383033e-06, + "loss": 0.4087, + "step": 94 + }, + { + "epoch": 0.0192, + "grad_norm": 8.349254608154297, + "learning_rate": 2.7872315484213954e-06, + "loss": 0.5298, + "step": 96 + }, + { + "epoch": 0.0196, + "grad_norm": 6.337455749511719, + "learning_rate": 2.796909751120931e-06, + "loss": 0.4923, + "step": 98 + }, + { + "epoch": 0.02, + "grad_norm": 3.956463575363159, + "learning_rate": 2.8066019966134873e-06, + "loss": 0.2077, + "step": 100 + }, + { + "epoch": 0.0204, + "grad_norm": 2.211174249649048, + "learning_rate": 2.816308266003538e-06, + "loss": 0.5643, + "step": 102 + }, + { + "epoch": 0.0208, + "grad_norm": 14.609574317932129, + "learning_rate": 2.826028540368212e-06, + "loss": 0.4668, + "step": 104 + }, + { + "epoch": 0.0212, + "grad_norm": 9.918601036071777, + "learning_rate": 2.835762800757338e-06, + "loss": 0.6823, + "step": 106 + }, + { + "epoch": 0.0216, + "grad_norm": 7.901192665100098, + "learning_rate": 2.845511028193477e-06, + "loss": 0.3562, + "step": 108 + }, + { + "epoch": 0.022, + "grad_norm": 8.191703796386719, + "learning_rate": 2.855273203671962e-06, + "loss": 0.8119, + "step": 110 + }, + { + "epoch": 0.0224, + "grad_norm": 10.761240005493164, + "learning_rate": 2.865049308160931e-06, + "loss": 0.5518, + "step": 112 + }, + { + "epoch": 0.0228, + "grad_norm": 11.626636505126953, + "learning_rate": 2.874839322601368e-06, + "loss": 0.8303, + "step": 114 + }, + { + "epoch": 0.0232, + "grad_norm": 3.837890148162842, + "learning_rate": 2.8846432279071533e-06, + "loss": 0.2803, + "step": 116 + }, + { + "epoch": 0.0236, + "grad_norm": 12.363809585571289, + "learning_rate": 2.8944610049650314e-06, + "loss": 0.6835, + "step": 118 + }, + { + "epoch": 0.024, + "grad_norm": 11.530438423156738, + "learning_rate": 2.9042926346347835e-06, + "loss": 0.591, + "step": 120 + }, + { + "epoch": 0.0244, + "grad_norm": 13.856647491455078, + "learning_rate": 2.914138097749143e-06, + "loss": 0.3738, + "step": 122 + }, + { + "epoch": 0.0248, + "grad_norm": 8.601247787475586, + "learning_rate": 2.9239973751138397e-06, + "loss": 0.408, + "step": 124 + }, + { + "epoch": 0.0252, + "grad_norm": 9.481165885925293, + "learning_rate": 2.933870447507756e-06, + "loss": 0.4447, + "step": 126 + }, + { + "epoch": 0.0256, + "grad_norm": 5.73183536529541, + "learning_rate": 2.943757295682783e-06, + "loss": 0.3318, + "step": 128 + }, + { + "epoch": 0.026, + "grad_norm": 12.501492500305176, + "learning_rate": 2.953657900364055e-06, + "loss": 0.4518, + "step": 130 + }, + { + "epoch": 0.0264, + "grad_norm": 13.509366035461426, + "learning_rate": 2.9635722422497983e-06, + "loss": 0.6901, + "step": 132 + }, + { + "epoch": 0.0268, + "grad_norm": 8.841031074523926, + "learning_rate": 2.973500302011496e-06, + "loss": 0.6345, + "step": 134 + }, + { + "epoch": 0.0272, + "grad_norm": 5.9275078773498535, + "learning_rate": 2.983442060293926e-06, + "loss": 0.6145, + "step": 136 + }, + { + "epoch": 0.0276, + "grad_norm": 16.473703384399414, + "learning_rate": 2.9933974977150827e-06, + "loss": 0.4156, + "step": 138 + }, + { + "epoch": 0.028, + "grad_norm": 12.65388011932373, + "learning_rate": 3.003366594866345e-06, + "loss": 0.5816, + "step": 140 + }, + { + "epoch": 0.0284, + "grad_norm": 6.338968276977539, + "learning_rate": 3.0133493323124474e-06, + "loss": 0.3959, + "step": 142 + }, + { + "epoch": 0.0288, + "grad_norm": 4.3785014152526855, + "learning_rate": 3.0233456905915338e-06, + "loss": 0.4541, + "step": 144 + }, + { + "epoch": 0.0292, + "grad_norm": 5.1778106689453125, + "learning_rate": 3.0333556502151895e-06, + "loss": 0.2957, + "step": 146 + }, + { + "epoch": 0.0296, + "grad_norm": 9.729018211364746, + "learning_rate": 3.0433791916684885e-06, + "loss": 0.7704, + "step": 148 + }, + { + "epoch": 0.03, + "grad_norm": 6.06517219543457, + "learning_rate": 3.0534162954100234e-06, + "loss": 0.491, + "step": 150 + }, + { + "epoch": 0.0304, + "grad_norm": 8.951805114746094, + "learning_rate": 3.0634669418719453e-06, + "loss": 0.4468, + "step": 152 + }, + { + "epoch": 0.0308, + "grad_norm": 8.392043113708496, + "learning_rate": 3.0735311114600064e-06, + "loss": 0.5426, + "step": 154 + }, + { + "epoch": 0.0312, + "grad_norm": 7.071199893951416, + "learning_rate": 3.0836087845535933e-06, + "loss": 0.4874, + "step": 156 + }, + { + "epoch": 0.0316, + "grad_norm": 8.155244827270508, + "learning_rate": 3.0936999415057645e-06, + "loss": 0.3822, + "step": 158 + }, + { + "epoch": 0.032, + "grad_norm": 11.294057846069336, + "learning_rate": 3.1038045626432945e-06, + "loss": 0.5131, + "step": 160 + }, + { + "epoch": 0.0324, + "grad_norm": 9.363883972167969, + "learning_rate": 3.1139226282667212e-06, + "loss": 0.5273, + "step": 162 + }, + { + "epoch": 0.0328, + "grad_norm": 5.2910661697387695, + "learning_rate": 3.1240541186503173e-06, + "loss": 0.3423, + "step": 164 + }, + { + "epoch": 0.0332, + "grad_norm": 22.55951690673828, + "learning_rate": 3.134199014042277e-06, + "loss": 0.5226, + "step": 166 + }, + { + "epoch": 0.0336, + "grad_norm": 6.656784534454346, + "learning_rate": 3.1443572946645683e-06, + "loss": 0.8273, + "step": 168 + }, + { + "epoch": 0.034, + "grad_norm": 9.89769458770752, + "learning_rate": 3.154528940713103e-06, + "loss": 0.6275, + "step": 170 + }, + { + "epoch": 0.0344, + "grad_norm": 0.007958236150443554, + "learning_rate": 3.164713932357776e-06, + "loss": 0.3572, + "step": 172 + }, + { + "epoch": 0.0348, + "grad_norm": 5.75770378112793, + "learning_rate": 3.1749122497423724e-06, + "loss": 0.3737, + "step": 174 + }, + { + "epoch": 0.0352, + "grad_norm": 4.592837810516357, + "learning_rate": 3.1851238729848033e-06, + "loss": 0.4595, + "step": 176 + }, + { + "epoch": 0.0356, + "grad_norm": 2.869123935699463, + "learning_rate": 3.195348782176948e-06, + "loss": 0.3431, + "step": 178 + }, + { + "epoch": 0.036, + "grad_norm": 14.210893630981445, + "learning_rate": 3.205586957384834e-06, + "loss": 0.4515, + "step": 180 + }, + { + "epoch": 0.0364, + "grad_norm": 19.600555419921875, + "learning_rate": 3.215838378648617e-06, + "loss": 0.8745, + "step": 182 + }, + { + "epoch": 0.0368, + "grad_norm": 4.5630059242248535, + "learning_rate": 3.2261030259826253e-06, + "loss": 0.3564, + "step": 184 + }, + { + "epoch": 0.0372, + "grad_norm": 12.26484489440918, + "learning_rate": 3.2363808793754036e-06, + "loss": 0.503, + "step": 186 + }, + { + "epoch": 0.0376, + "grad_norm": 11.284700393676758, + "learning_rate": 3.246671918789752e-06, + "loss": 0.5051, + "step": 188 + }, + { + "epoch": 0.038, + "grad_norm": 1.6922160387039185, + "learning_rate": 3.2569761241627617e-06, + "loss": 0.2485, + "step": 190 + }, + { + "epoch": 0.0384, + "grad_norm": 3.5115294456481934, + "learning_rate": 3.267293475405858e-06, + "loss": 0.7724, + "step": 192 + }, + { + "epoch": 0.0388, + "grad_norm": 7.569575786590576, + "learning_rate": 3.277623952404835e-06, + "loss": 0.4528, + "step": 194 + }, + { + "epoch": 0.0392, + "grad_norm": 12.12824535369873, + "learning_rate": 3.2879675350199004e-06, + "loss": 0.5109, + "step": 196 + }, + { + "epoch": 0.0396, + "grad_norm": 4.234559059143066, + "learning_rate": 3.298324203085723e-06, + "loss": 0.4674, + "step": 198 + }, + { + "epoch": 0.04, + "grad_norm": 6.067758560180664, + "learning_rate": 3.3086939364114113e-06, + "loss": 0.4171, + "step": 200 + }, + { + "epoch": 0.0404, + "grad_norm": 14.10073471069336, + "learning_rate": 3.3190767147806892e-06, + "loss": 1.0114, + "step": 202 + }, + { + "epoch": 0.0408, + "grad_norm": 12.471076011657715, + "learning_rate": 3.329472517951747e-06, + "loss": 0.4659, + "step": 204 + }, + { + "epoch": 0.0412, + "grad_norm": 9.64620590209961, + "learning_rate": 3.3398813256574745e-06, + "loss": 0.4254, + "step": 206 + }, + { + "epoch": 0.0416, + "grad_norm": 8.841073989868164, + "learning_rate": 3.350303117605369e-06, + "loss": 0.4563, + "step": 208 + }, + { + "epoch": 0.042, + "grad_norm": 11.459227561950684, + "learning_rate": 3.360737873477574e-06, + "loss": 0.6406, + "step": 210 + }, + { + "epoch": 0.0424, + "grad_norm": 8.023174285888672, + "learning_rate": 3.3711855729310503e-06, + "loss": 0.523, + "step": 212 + }, + { + "epoch": 0.0428, + "grad_norm": 8.381044387817383, + "learning_rate": 3.3816461955974224e-06, + "loss": 0.3774, + "step": 214 + }, + { + "epoch": 0.0432, + "grad_norm": 5.727261543273926, + "learning_rate": 3.3921197210832235e-06, + "loss": 0.5645, + "step": 216 + }, + { + "epoch": 0.0436, + "grad_norm": 9.199751853942871, + "learning_rate": 3.4026061289697397e-06, + "loss": 0.3278, + "step": 218 + }, + { + "epoch": 0.044, + "grad_norm": 3.1275556087493896, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.3419, + "step": 220 + }, + { + "epoch": 0.0444, + "grad_norm": 3.2991018295288086, + "learning_rate": 3.4236175101447257e-06, + "loss": 0.4636, + "step": 222 + }, + { + "epoch": 0.0448, + "grad_norm": 7.064979553222656, + "learning_rate": 3.434142442470434e-06, + "loss": 0.452, + "step": 224 + }, + { + "epoch": 0.0452, + "grad_norm": 3.3244409561157227, + "learning_rate": 3.444680175271424e-06, + "loss": 0.813, + "step": 226 + }, + { + "epoch": 0.0456, + "grad_norm": 5.9261651039123535, + "learning_rate": 3.455230688003849e-06, + "loss": 0.233, + "step": 228 + }, + { + "epoch": 0.046, + "grad_norm": 6.273072242736816, + "learning_rate": 3.465793960098942e-06, + "loss": 0.2872, + "step": 230 + }, + { + "epoch": 0.0464, + "grad_norm": 6.306349754333496, + "learning_rate": 3.476369970963065e-06, + "loss": 0.5399, + "step": 232 + }, + { + "epoch": 0.0468, + "grad_norm": 13.865418434143066, + "learning_rate": 3.486958699977743e-06, + "loss": 0.8169, + "step": 234 + }, + { + "epoch": 0.0472, + "grad_norm": 8.713153839111328, + "learning_rate": 3.497560126499706e-06, + "loss": 0.4631, + "step": 236 + }, + { + "epoch": 0.0476, + "grad_norm": 7.679821968078613, + "learning_rate": 3.508174229860947e-06, + "loss": 0.1673, + "step": 238 + }, + { + "epoch": 0.048, + "grad_norm": 4.999298572540283, + "learning_rate": 3.5188009893686836e-06, + "loss": 0.3959, + "step": 240 + }, + { + "epoch": 0.0484, + "grad_norm": 9.601078987121582, + "learning_rate": 3.5294403843055493e-06, + "loss": 0.6043, + "step": 242 + }, + { + "epoch": 0.0488, + "grad_norm": 8.280967712402344, + "learning_rate": 3.5400923939294827e-06, + "loss": 0.6999, + "step": 244 + }, + { + "epoch": 0.0492, + "grad_norm": 3.7563018798828125, + "learning_rate": 3.5507569974738477e-06, + "loss": 0.3469, + "step": 246 + }, + { + "epoch": 0.0496, + "grad_norm": 7.402956485748291, + "learning_rate": 3.5614341741474667e-06, + "loss": 0.3927, + "step": 248 + }, + { + "epoch": 0.05, + "grad_norm": 4.280170917510986, + "learning_rate": 3.5721239031345966e-06, + "loss": 0.5859, + "step": 250 + }, + { + "epoch": 0.0504, + "grad_norm": 6.055922985076904, + "learning_rate": 3.5828261635951177e-06, + "loss": 0.2689, + "step": 252 + }, + { + "epoch": 0.0508, + "grad_norm": 5.2920823097229, + "learning_rate": 3.593540934664387e-06, + "loss": 0.3742, + "step": 254 + }, + { + "epoch": 0.0512, + "grad_norm": 5.952661514282227, + "learning_rate": 3.604268195453421e-06, + "loss": 0.4176, + "step": 256 + }, + { + "epoch": 0.0516, + "grad_norm": 6.803701877593994, + "learning_rate": 3.6150079250488767e-06, + "loss": 0.4033, + "step": 258 + }, + { + "epoch": 0.052, + "grad_norm": 1.8791087865829468, + "learning_rate": 3.6257601025130893e-06, + "loss": 0.3373, + "step": 260 + }, + { + "epoch": 0.0524, + "grad_norm": 1.1859296560287476, + "learning_rate": 3.636524706884178e-06, + "loss": 0.4239, + "step": 262 + }, + { + "epoch": 0.0528, + "grad_norm": 12.798867225646973, + "learning_rate": 3.647301717175955e-06, + "loss": 0.655, + "step": 264 + }, + { + "epoch": 0.0532, + "grad_norm": 4.451773643493652, + "learning_rate": 3.6580911123781025e-06, + "loss": 0.3931, + "step": 266 + }, + { + "epoch": 0.0536, + "grad_norm": 8.682490348815918, + "learning_rate": 3.66889287145614e-06, + "loss": 0.582, + "step": 268 + }, + { + "epoch": 0.054, + "grad_norm": 9.979537010192871, + "learning_rate": 3.679706973351488e-06, + "loss": 4.104, + "step": 270 + }, + { + "epoch": 0.0544, + "grad_norm": 10.0934419631958, + "learning_rate": 3.6905333969814995e-06, + "loss": 0.6713, + "step": 272 + }, + { + "epoch": 0.0548, + "grad_norm": 9.977805137634277, + "learning_rate": 3.701372121239508e-06, + "loss": 0.5482, + "step": 274 + }, + { + "epoch": 0.0552, + "grad_norm": 10.493433952331543, + "learning_rate": 3.712223124994867e-06, + "loss": 1.0184, + "step": 276 + }, + { + "epoch": 0.0556, + "grad_norm": 5.24810266494751, + "learning_rate": 3.723086387092989e-06, + "loss": 0.273, + "step": 278 + }, + { + "epoch": 0.056, + "grad_norm": 6.031016826629639, + "learning_rate": 3.7339618863553885e-06, + "loss": 0.4987, + "step": 280 + }, + { + "epoch": 0.0564, + "grad_norm": 11.079527854919434, + "learning_rate": 3.744849601579722e-06, + "loss": 0.5311, + "step": 282 + }, + { + "epoch": 0.0568, + "grad_norm": 9.51747989654541, + "learning_rate": 3.755749511539848e-06, + "loss": 0.3207, + "step": 284 + }, + { + "epoch": 0.0572, + "grad_norm": 4.8036580085754395, + "learning_rate": 3.7666615949857897e-06, + "loss": 0.2922, + "step": 286 + }, + { + "epoch": 0.0576, + "grad_norm": 9.66129207611084, + "learning_rate": 3.7775858306439404e-06, + "loss": 0.75, + "step": 288 + }, + { + "epoch": 0.058, + "grad_norm": 8.733214378356934, + "learning_rate": 3.7885221972168864e-06, + "loss": 0.4233, + "step": 290 + }, + { + "epoch": 0.0584, + "grad_norm": 7.663453102111816, + "learning_rate": 3.799470673383677e-06, + "loss": 0.8511, + "step": 292 + }, + { + "epoch": 0.0588, + "grad_norm": 8.134885787963867, + "learning_rate": 3.810431237799657e-06, + "loss": 0.4383, + "step": 294 + }, + { + "epoch": 0.0592, + "grad_norm": 5.9877214431762695, + "learning_rate": 3.821403869096644e-06, + "loss": 0.3579, + "step": 296 + }, + { + "epoch": 0.0596, + "grad_norm": 8.292781829833984, + "learning_rate": 3.8323885458829745e-06, + "loss": 0.2722, + "step": 298 + }, + { + "epoch": 0.06, + "grad_norm": 9.170614242553711, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.2205, + "step": 300 + }, + { + "epoch": 0.0604, + "grad_norm": 5.584014415740967, + "learning_rate": 3.854393950239355e-06, + "loss": 0.555, + "step": 302 + }, + { + "epoch": 0.0608, + "grad_norm": 5.914859294891357, + "learning_rate": 3.865414634908756e-06, + "loss": 0.3517, + "step": 304 + }, + { + "epoch": 0.0612, + "grad_norm": 6.239687442779541, + "learning_rate": 3.876447279266233e-06, + "loss": 0.7459, + "step": 306 + }, + { + "epoch": 0.0616, + "grad_norm": 8.212594985961914, + "learning_rate": 3.887491861803081e-06, + "loss": 0.3822, + "step": 308 + }, + { + "epoch": 0.062, + "grad_norm": 4.242414951324463, + "learning_rate": 3.898548360987321e-06, + "loss": 0.2803, + "step": 310 + }, + { + "epoch": 0.0624, + "grad_norm": 10.664204597473145, + "learning_rate": 3.909616755263741e-06, + "loss": 0.6315, + "step": 312 + }, + { + "epoch": 0.0628, + "grad_norm": 7.510007858276367, + "learning_rate": 3.920697023053941e-06, + "loss": 1.54, + "step": 314 + }, + { + "epoch": 0.0632, + "grad_norm": 9.558072090148926, + "learning_rate": 3.9317891427563725e-06, + "loss": 0.4083, + "step": 316 + }, + { + "epoch": 0.0636, + "grad_norm": 10.670158386230469, + "learning_rate": 3.942893092746381e-06, + "loss": 0.576, + "step": 318 + }, + { + "epoch": 0.064, + "grad_norm": 6.9446282386779785, + "learning_rate": 3.954008851376244e-06, + "loss": 0.3048, + "step": 320 + }, + { + "epoch": 0.0644, + "grad_norm": 6.637779235839844, + "learning_rate": 3.965136396975227e-06, + "loss": 0.5409, + "step": 322 + }, + { + "epoch": 0.0648, + "grad_norm": 10.840149879455566, + "learning_rate": 3.976275707849619e-06, + "loss": 0.6883, + "step": 324 + }, + { + "epoch": 0.0652, + "grad_norm": 3.220845937728882, + "learning_rate": 3.987426762282726e-06, + "loss": 0.3403, + "step": 326 + }, + { + "epoch": 0.0656, + "grad_norm": 4.267912864685059, + "learning_rate": 3.99858953853505e-06, + "loss": 0.2743, + "step": 328 + }, + { + "epoch": 0.066, + "grad_norm": 5.712594509124756, + "learning_rate": 4.009764014844146e-06, + "loss": 0.3086, + "step": 330 + }, + { + "epoch": 0.0664, + "grad_norm": 8.153928756713867, + "learning_rate": 4.0209501694248e-06, + "loss": 0.5816, + "step": 332 + }, + { + "epoch": 0.0668, + "grad_norm": 10.329399108886719, + "learning_rate": 4.032147980469076e-06, + "loss": 0.5752, + "step": 334 + }, + { + "epoch": 0.0672, + "grad_norm": 2.7394285202026367, + "learning_rate": 4.043357426146209e-06, + "loss": 0.1742, + "step": 336 + }, + { + "epoch": 0.0676, + "grad_norm": 15.53478717803955, + "learning_rate": 4.054578484602869e-06, + "loss": 0.6012, + "step": 338 + }, + { + "epoch": 0.068, + "grad_norm": 9.114849090576172, + "learning_rate": 4.065811133962987e-06, + "loss": 0.3383, + "step": 340 + }, + { + "epoch": 0.0684, + "grad_norm": 4.943862438201904, + "learning_rate": 4.07705535232795e-06, + "loss": 0.3308, + "step": 342 + }, + { + "epoch": 0.0688, + "grad_norm": 4.453750133514404, + "learning_rate": 4.08831111777658e-06, + "loss": 0.4766, + "step": 344 + }, + { + "epoch": 0.0692, + "grad_norm": 7.100834369659424, + "learning_rate": 4.0995784083651865e-06, + "loss": 0.3132, + "step": 346 + }, + { + "epoch": 0.0696, + "grad_norm": 11.26512336730957, + "learning_rate": 4.110857202127611e-06, + "loss": 0.5499, + "step": 348 + }, + { + "epoch": 0.07, + "grad_norm": 10.695387840270996, + "learning_rate": 4.122147477075266e-06, + "loss": 0.5184, + "step": 350 + }, + { + "epoch": 0.0704, + "grad_norm": 4.493046760559082, + "learning_rate": 4.133449211197183e-06, + "loss": 0.2431, + "step": 352 + }, + { + "epoch": 0.0708, + "grad_norm": 8.261948585510254, + "learning_rate": 4.144762382460055e-06, + "loss": 0.4193, + "step": 354 + }, + { + "epoch": 0.0712, + "grad_norm": 9.144987106323242, + "learning_rate": 4.156086968808274e-06, + "loss": 0.4677, + "step": 356 + }, + { + "epoch": 0.0716, + "grad_norm": 7.886310577392578, + "learning_rate": 4.1674229481639796e-06, + "loss": 0.6894, + "step": 358 + }, + { + "epoch": 0.072, + "grad_norm": 4.725317478179932, + "learning_rate": 4.178770298427114e-06, + "loss": 0.4912, + "step": 360 + }, + { + "epoch": 0.0724, + "grad_norm": 10.585810661315918, + "learning_rate": 4.190128997475395e-06, + "loss": 0.7042, + "step": 362 + }, + { + "epoch": 0.0728, + "grad_norm": 1.8422179222106934, + "learning_rate": 4.201499023164515e-06, + "loss": 0.331, + "step": 364 + }, + { + "epoch": 0.0732, + "grad_norm": 5.881594181060791, + "learning_rate": 4.212880353327968e-06, + "loss": 0.5321, + "step": 366 + }, + { + "epoch": 0.0736, + "grad_norm": 2.5500106811523438, + "learning_rate": 4.224272965777315e-06, + "loss": 0.5712, + "step": 368 + }, + { + "epoch": 0.074, + "grad_norm": 6.095949172973633, + "learning_rate": 4.235676838302072e-06, + "loss": 0.2536, + "step": 370 + }, + { + "epoch": 0.0744, + "grad_norm": 9.424298286437988, + "learning_rate": 4.247091948669764e-06, + "loss": 0.5426, + "step": 372 + }, + { + "epoch": 0.0748, + "grad_norm": 7.169968128204346, + "learning_rate": 4.258518274626106e-06, + "loss": 0.4799, + "step": 374 + }, + { + "epoch": 0.0752, + "grad_norm": 9.409886360168457, + "learning_rate": 4.269955793894849e-06, + "loss": 0.8752, + "step": 376 + }, + { + "epoch": 0.0756, + "grad_norm": 13.09287166595459, + "learning_rate": 4.281404484177978e-06, + "loss": 0.7931, + "step": 378 + }, + { + "epoch": 0.076, + "grad_norm": 7.674169063568115, + "learning_rate": 4.292864323155684e-06, + "loss": 0.5985, + "step": 380 + }, + { + "epoch": 0.0764, + "grad_norm": 4.660342216491699, + "learning_rate": 4.304335288486412e-06, + "loss": 0.5236, + "step": 382 + }, + { + "epoch": 0.0768, + "grad_norm": 12.915772438049316, + "learning_rate": 4.3158173578069696e-06, + "loss": 0.5157, + "step": 384 + }, + { + "epoch": 0.0772, + "grad_norm": 8.644211769104004, + "learning_rate": 4.327310508732434e-06, + "loss": 0.4107, + "step": 386 + }, + { + "epoch": 0.0776, + "grad_norm": 6.374632835388184, + "learning_rate": 4.338814718856333e-06, + "loss": 0.3628, + "step": 388 + }, + { + "epoch": 0.078, + "grad_norm": 2.7600512504577637, + "learning_rate": 4.350329965750618e-06, + "loss": 0.1669, + "step": 390 + }, + { + "epoch": 0.0784, + "grad_norm": 13.234768867492676, + "learning_rate": 4.3618562269657285e-06, + "loss": 0.4157, + "step": 392 + }, + { + "epoch": 0.0788, + "grad_norm": 4.549943447113037, + "learning_rate": 4.373393480030629e-06, + "loss": 0.6373, + "step": 394 + }, + { + "epoch": 0.0792, + "grad_norm": 5.812559127807617, + "learning_rate": 4.384941702452852e-06, + "loss": 0.463, + "step": 396 + }, + { + "epoch": 0.0796, + "grad_norm": 15.322596549987793, + "learning_rate": 4.396500871718548e-06, + "loss": 0.6318, + "step": 398 + }, + { + "epoch": 0.08, + "grad_norm": 11.74260425567627, + "learning_rate": 4.408070965292526e-06, + "loss": 0.5582, + "step": 400 + }, + { + "epoch": 0.0804, + "grad_norm": 8.44233226776123, + "learning_rate": 4.419651960618294e-06, + "loss": 0.5934, + "step": 402 + }, + { + "epoch": 0.0808, + "grad_norm": 16.54697608947754, + "learning_rate": 4.431243835118112e-06, + "loss": 0.8965, + "step": 404 + }, + { + "epoch": 0.0812, + "grad_norm": 4.842781066894531, + "learning_rate": 4.442846566193041e-06, + "loss": 0.3764, + "step": 406 + }, + { + "epoch": 0.0816, + "grad_norm": 18.05434226989746, + "learning_rate": 4.4544601312229185e-06, + "loss": 0.6992, + "step": 408 + }, + { + "epoch": 0.082, + "grad_norm": 4.65978479385376, + "learning_rate": 4.4660845075665635e-06, + "loss": 0.3065, + "step": 410 + }, + { + "epoch": 0.0824, + "grad_norm": 8.082731246948242, + "learning_rate": 4.477719672561602e-06, + "loss": 0.6258, + "step": 412 + }, + { + "epoch": 0.0828, + "grad_norm": 9.004798889160156, + "learning_rate": 4.489365603524743e-06, + "loss": 0.4581, + "step": 414 + }, + { + "epoch": 0.0832, + "grad_norm": 5.721434116363525, + "learning_rate": 4.501022277751605e-06, + "loss": 0.3477, + "step": 416 + }, + { + "epoch": 0.0836, + "grad_norm": 8.07176685333252, + "learning_rate": 4.5126896725169025e-06, + "loss": 0.4087, + "step": 418 + }, + { + "epoch": 0.084, + "grad_norm": 11.40060806274414, + "learning_rate": 4.524367765074499e-06, + "loss": 0.6781, + "step": 420 + }, + { + "epoch": 0.0844, + "grad_norm": 4.237157821655273, + "learning_rate": 4.536056532657295e-06, + "loss": 0.3678, + "step": 422 + }, + { + "epoch": 0.0848, + "grad_norm": 7.206717491149902, + "learning_rate": 4.5477559524775e-06, + "loss": 0.3749, + "step": 424 + }, + { + "epoch": 0.0852, + "grad_norm": 1.6987860202789307, + "learning_rate": 4.559466001726451e-06, + "loss": 0.4187, + "step": 426 + }, + { + "epoch": 0.0856, + "grad_norm": 5.544573783874512, + "learning_rate": 4.571186657574823e-06, + "loss": 0.3952, + "step": 428 + }, + { + "epoch": 0.086, + "grad_norm": 7.324548244476318, + "learning_rate": 4.582917897172599e-06, + "loss": 0.7735, + "step": 430 + }, + { + "epoch": 0.0864, + "grad_norm": 7.634961128234863, + "learning_rate": 4.5946596976491254e-06, + "loss": 0.5709, + "step": 432 + }, + { + "epoch": 0.0868, + "grad_norm": 6.265256404876709, + "learning_rate": 4.6064120361131624e-06, + "loss": 0.3278, + "step": 434 + }, + { + "epoch": 0.0872, + "grad_norm": 10.883702278137207, + "learning_rate": 4.618174889652924e-06, + "loss": 0.4484, + "step": 436 + }, + { + "epoch": 0.0876, + "grad_norm": 3.9067330360412598, + "learning_rate": 4.629948235336126e-06, + "loss": 0.3978, + "step": 438 + }, + { + "epoch": 0.088, + "grad_norm": 2.8219709396362305, + "learning_rate": 4.6417320502100286e-06, + "loss": 0.3636, + "step": 440 + }, + { + "epoch": 0.0884, + "grad_norm": 6.845930576324463, + "learning_rate": 4.653526311301479e-06, + "loss": 0.7485, + "step": 442 + }, + { + "epoch": 0.0888, + "grad_norm": 3.9061362743377686, + "learning_rate": 4.665330995616967e-06, + "loss": 0.5521, + "step": 444 + }, + { + "epoch": 0.0892, + "grad_norm": 3.5304887294769287, + "learning_rate": 4.677146080142667e-06, + "loss": 0.1318, + "step": 446 + }, + { + "epoch": 0.0896, + "grad_norm": 4.451516151428223, + "learning_rate": 4.688971541844424e-06, + "loss": 0.65, + "step": 448 + }, + { + "epoch": 0.09, + "grad_norm": 7.691074371337891, + "learning_rate": 4.700807357667956e-06, + "loss": 0.626, + "step": 450 + }, + { + "epoch": 0.0904, + "grad_norm": 4.584842681884766, + "learning_rate": 4.712653504538672e-06, + "loss": 0.5109, + "step": 452 + }, + { + "epoch": 0.0908, + "grad_norm": 11.918622016906738, + "learning_rate": 4.7245099593619495e-06, + "loss": 0.5083, + "step": 454 + }, + { + "epoch": 0.0912, + "grad_norm": 10.897934913635254, + "learning_rate": 4.736376699023023e-06, + "loss": 0.7664, + "step": 456 + }, + { + "epoch": 0.0916, + "grad_norm": 9.378917694091797, + "learning_rate": 4.74825370038703e-06, + "loss": 0.572, + "step": 458 + }, + { + "epoch": 0.092, + "grad_norm": 4.142429351806641, + "learning_rate": 4.76014094029921e-06, + "loss": 0.3126, + "step": 460 + }, + { + "epoch": 0.0924, + "grad_norm": 4.197183609008789, + "learning_rate": 4.772038395584735e-06, + "loss": 0.3626, + "step": 462 + }, + { + "epoch": 0.0928, + "grad_norm": 11.61467170715332, + "learning_rate": 4.7839460430489216e-06, + "loss": 0.5275, + "step": 464 + }, + { + "epoch": 0.0932, + "grad_norm": 12.979358673095703, + "learning_rate": 4.7958638594772035e-06, + "loss": 0.8794, + "step": 466 + }, + { + "epoch": 0.0936, + "grad_norm": 10.078213691711426, + "learning_rate": 4.807791821635185e-06, + "loss": 0.2145, + "step": 468 + }, + { + "epoch": 0.094, + "grad_norm": 6.292989253997803, + "learning_rate": 4.8197299062686954e-06, + "loss": 0.7915, + "step": 470 + }, + { + "epoch": 0.0944, + "grad_norm": 5.791603088378906, + "learning_rate": 4.831678090103828e-06, + "loss": 0.3691, + "step": 472 + }, + { + "epoch": 0.0948, + "grad_norm": 6.856422424316406, + "learning_rate": 4.8436363498469865e-06, + "loss": 0.3942, + "step": 474 + }, + { + "epoch": 0.0952, + "grad_norm": 4.104106426239014, + "learning_rate": 4.855604662184931e-06, + "loss": 0.3662, + "step": 476 + }, + { + "epoch": 0.0956, + "grad_norm": 1.3374210596084595, + "learning_rate": 4.867583003784825e-06, + "loss": 0.4829, + "step": 478 + }, + { + "epoch": 0.096, + "grad_norm": 6.539978504180908, + "learning_rate": 4.8795713512942785e-06, + "loss": 0.3145, + "step": 480 + }, + { + "epoch": 0.0964, + "grad_norm": 4.928034782409668, + "learning_rate": 4.891569681341395e-06, + "loss": 0.3754, + "step": 482 + }, + { + "epoch": 0.0968, + "grad_norm": 8.62166976928711, + "learning_rate": 4.903577970534815e-06, + "loss": 0.3392, + "step": 484 + }, + { + "epoch": 0.0972, + "grad_norm": 6.661494255065918, + "learning_rate": 4.91559619546378e-06, + "loss": 0.3809, + "step": 486 + }, + { + "epoch": 0.0976, + "grad_norm": 10.354453086853027, + "learning_rate": 4.9276243326981e-06, + "loss": 0.8088, + "step": 488 + }, + { + "epoch": 0.098, + "grad_norm": 13.8243989944458, + "learning_rate": 4.939662358788352e-06, + "loss": 0.6111, + "step": 490 + }, + { + "epoch": 0.0984, + "grad_norm": 7.309878826141357, + "learning_rate": 4.951710250265788e-06, + "loss": 0.4847, + "step": 492 + }, + { + "epoch": 0.0988, + "grad_norm": 12.332515716552734, + "learning_rate": 4.96376798364238e-06, + "loss": 0.6326, + "step": 494 + }, + { + "epoch": 0.0992, + "grad_norm": 1.5214141607284546, + "learning_rate": 4.975835535411023e-06, + "loss": 0.1982, + "step": 496 + }, + { + "epoch": 0.0996, + "grad_norm": 18.718753814697266, + "learning_rate": 4.987912882045345e-06, + "loss": 1.1601, + "step": 498 + }, + { + "epoch": 0.1, + "grad_norm": 3.14683198928833, + "learning_rate": 5.000000000000003e-06, + "loss": 0.342, + "step": 500 + }, + { + "epoch": 0.1004, + "grad_norm": 8.71473217010498, + "learning_rate": 5.012096865710493e-06, + "loss": 0.5025, + "step": 502 + }, + { + "epoch": 0.1008, + "grad_norm": 9.5609130859375, + "learning_rate": 5.024203455593375e-06, + "loss": 0.5579, + "step": 504 + }, + { + "epoch": 0.1012, + "grad_norm": 2.729548215866089, + "learning_rate": 5.036319746046232e-06, + "loss": 0.2797, + "step": 506 + }, + { + "epoch": 0.1016, + "grad_norm": 5.954434871673584, + "learning_rate": 5.048445713447734e-06, + "loss": 0.2593, + "step": 508 + }, + { + "epoch": 0.102, + "grad_norm": 9.328837394714355, + "learning_rate": 5.0605813341576885e-06, + "loss": 0.3571, + "step": 510 + }, + { + "epoch": 0.1024, + "grad_norm": 10.317095756530762, + "learning_rate": 5.072726584517083e-06, + "loss": 0.4755, + "step": 512 + }, + { + "epoch": 0.1028, + "grad_norm": 14.90354061126709, + "learning_rate": 5.084881440848126e-06, + "loss": 0.571, + "step": 514 + }, + { + "epoch": 0.1032, + "grad_norm": 9.201313018798828, + "learning_rate": 5.097045879454308e-06, + "loss": 0.5513, + "step": 516 + }, + { + "epoch": 0.1036, + "grad_norm": 8.81816291809082, + "learning_rate": 5.109219876620433e-06, + "loss": 0.6171, + "step": 518 + }, + { + "epoch": 0.104, + "grad_norm": 4.470856189727783, + "learning_rate": 5.1214034086126685e-06, + "loss": 0.4541, + "step": 520 + }, + { + "epoch": 0.1044, + "grad_norm": 6.47641134262085, + "learning_rate": 5.133596451678611e-06, + "loss": 0.5544, + "step": 522 + }, + { + "epoch": 0.1048, + "grad_norm": 4.76102352142334, + "learning_rate": 5.145798982047253e-06, + "loss": 0.4358, + "step": 524 + }, + { + "epoch": 0.1052, + "grad_norm": 4.352917194366455, + "learning_rate": 5.158010975929185e-06, + "loss": 0.3834, + "step": 526 + }, + { + "epoch": 0.1056, + "grad_norm": 4.391143798828125, + "learning_rate": 5.170232409516483e-06, + "loss": 0.1786, + "step": 528 + }, + { + "epoch": 0.106, + "grad_norm": 10.600780487060547, + "learning_rate": 5.182463258982837e-06, + "loss": 0.4946, + "step": 530 + }, + { + "epoch": 0.1064, + "grad_norm": 2.967501163482666, + "learning_rate": 5.194703500483597e-06, + "loss": 0.2943, + "step": 532 + }, + { + "epoch": 0.1068, + "grad_norm": 3.720363140106201, + "learning_rate": 5.2069531101557395e-06, + "loss": 0.3502, + "step": 534 + }, + { + "epoch": 0.1072, + "grad_norm": 5.934722900390625, + "learning_rate": 5.219212064118082e-06, + "loss": 0.4721, + "step": 536 + }, + { + "epoch": 0.1076, + "grad_norm": 4.497922897338867, + "learning_rate": 5.231480338471124e-06, + "loss": 0.4062, + "step": 538 + }, + { + "epoch": 0.108, + "grad_norm": 4.762455940246582, + "learning_rate": 5.24375790929725e-06, + "loss": 0.3486, + "step": 540 + }, + { + "epoch": 0.1084, + "grad_norm": 5.645461082458496, + "learning_rate": 5.256044752660709e-06, + "loss": 0.6363, + "step": 542 + }, + { + "epoch": 0.1088, + "grad_norm": 7.7022223472595215, + "learning_rate": 5.268340844607653e-06, + "loss": 0.7743, + "step": 544 + }, + { + "epoch": 0.1092, + "grad_norm": 4.242071151733398, + "learning_rate": 5.2806461611662725e-06, + "loss": 0.3792, + "step": 546 + }, + { + "epoch": 0.1096, + "grad_norm": 3.976555347442627, + "learning_rate": 5.2929606783466735e-06, + "loss": 0.7598, + "step": 548 + }, + { + "epoch": 0.11, + "grad_norm": 5.913125991821289, + "learning_rate": 5.305284372141091e-06, + "loss": 0.7614, + "step": 550 + }, + { + "epoch": 0.1104, + "grad_norm": 8.41602897644043, + "learning_rate": 5.317617218523853e-06, + "loss": 0.5673, + "step": 552 + }, + { + "epoch": 0.1108, + "grad_norm": 5.553797245025635, + "learning_rate": 5.3299591934514435e-06, + "loss": 0.6229, + "step": 554 + }, + { + "epoch": 0.1112, + "grad_norm": 10.066866874694824, + "learning_rate": 5.342310272862553e-06, + "loss": 0.5907, + "step": 556 + }, + { + "epoch": 0.1116, + "grad_norm": 5.443709373474121, + "learning_rate": 5.354670432678119e-06, + "loss": 0.7079, + "step": 558 + }, + { + "epoch": 0.112, + "grad_norm": 8.942435264587402, + "learning_rate": 5.367039648801377e-06, + "loss": 0.3823, + "step": 560 + }, + { + "epoch": 0.1124, + "grad_norm": 5.016224384307861, + "learning_rate": 5.379417897117909e-06, + "loss": 0.4832, + "step": 562 + }, + { + "epoch": 0.1128, + "grad_norm": 0.039887603372335434, + "learning_rate": 5.391805153495684e-06, + "loss": 0.3068, + "step": 564 + }, + { + "epoch": 0.1132, + "grad_norm": 4.933300018310547, + "learning_rate": 5.404201393785113e-06, + "loss": 0.8791, + "step": 566 + }, + { + "epoch": 0.1136, + "grad_norm": 5.279452323913574, + "learning_rate": 5.416606593819109e-06, + "loss": 0.4562, + "step": 568 + }, + { + "epoch": 0.114, + "grad_norm": 3.51374888420105, + "learning_rate": 5.429020729413049e-06, + "loss": 0.7453, + "step": 570 + }, + { + "epoch": 0.1144, + "grad_norm": 3.155231237411499, + "learning_rate": 5.441443776365005e-06, + "loss": 0.5871, + "step": 572 + }, + { + "epoch": 0.1148, + "grad_norm": 14.033446311950684, + "learning_rate": 5.453875710455549e-06, + "loss": 0.4352, + "step": 574 + }, + { + "epoch": 0.1152, + "grad_norm": 6.462706089019775, + "learning_rate": 5.466316507448053e-06, + "loss": 0.6082, + "step": 576 + }, + { + "epoch": 0.1156, + "grad_norm": 9.42319107055664, + "learning_rate": 5.478766143088497e-06, + "loss": 0.3344, + "step": 578 + }, + { + "epoch": 0.116, + "grad_norm": 7.1178083419799805, + "learning_rate": 5.49122459310568e-06, + "loss": 0.4912, + "step": 580 + }, + { + "epoch": 0.1164, + "grad_norm": 7.38687801361084, + "learning_rate": 5.503691833211264e-06, + "loss": 0.4029, + "step": 582 + }, + { + "epoch": 0.1168, + "grad_norm": 3.7615950107574463, + "learning_rate": 5.516167839099662e-06, + "loss": 0.2497, + "step": 584 + }, + { + "epoch": 0.1172, + "grad_norm": 6.754142761230469, + "learning_rate": 5.5286525864483285e-06, + "loss": 0.3655, + "step": 586 + }, + { + "epoch": 0.1176, + "grad_norm": 5.884318828582764, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.6217, + "step": 588 + }, + { + "epoch": 0.118, + "grad_norm": 7.188671112060547, + "learning_rate": 5.553648208150724e-06, + "loss": 0.4753, + "step": 590 + }, + { + "epoch": 0.1184, + "grad_norm": 7.233493328094482, + "learning_rate": 5.5661590337742255e-06, + "loss": 0.4162, + "step": 592 + }, + { + "epoch": 0.1188, + "grad_norm": 6.075003623962402, + "learning_rate": 5.57867850339757e-06, + "loss": 0.5523, + "step": 594 + }, + { + "epoch": 0.1192, + "grad_norm": 17.73434829711914, + "learning_rate": 5.591206592613412e-06, + "loss": 0.8044, + "step": 596 + }, + { + "epoch": 0.1196, + "grad_norm": 10.365287780761719, + "learning_rate": 5.603743276997597e-06, + "loss": 0.5041, + "step": 598 + }, + { + "epoch": 0.12, + "grad_norm": 4.196221828460693, + "learning_rate": 5.616288532109221e-06, + "loss": 1.5719, + "step": 600 + }, + { + "epoch": 0.1204, + "grad_norm": 5.7655134201049805, + "learning_rate": 5.628842333490665e-06, + "loss": 0.3858, + "step": 602 + }, + { + "epoch": 0.1208, + "grad_norm": 11.360547065734863, + "learning_rate": 5.641404656667652e-06, + "loss": 0.7201, + "step": 604 + }, + { + "epoch": 0.1212, + "grad_norm": 3.5423710346221924, + "learning_rate": 5.653975477149289e-06, + "loss": 0.3107, + "step": 606 + }, + { + "epoch": 0.1216, + "grad_norm": 5.658947467803955, + "learning_rate": 5.666554770428136e-06, + "loss": 0.3941, + "step": 608 + }, + { + "epoch": 0.122, + "grad_norm": 8.62951946258545, + "learning_rate": 5.679142511980168e-06, + "loss": 0.345, + "step": 610 + }, + { + "epoch": 0.1224, + "grad_norm": 9.270598411560059, + "learning_rate": 5.6917386772650015e-06, + "loss": 0.4343, + "step": 612 + }, + { + "epoch": 0.1228, + "grad_norm": 1.7288422584533691, + "learning_rate": 5.7043432417257076e-06, + "loss": 0.1755, + "step": 614 + }, + { + "epoch": 0.1232, + "grad_norm": 4.429084300994873, + "learning_rate": 5.716956180789086e-06, + "loss": 0.3844, + "step": 616 + }, + { + "epoch": 0.1236, + "grad_norm": 6.5113654136657715, + "learning_rate": 5.729577469865569e-06, + "loss": 0.2699, + "step": 618 + }, + { + "epoch": 0.124, + "grad_norm": 7.390730857849121, + "learning_rate": 5.74220708434926e-06, + "loss": 0.4022, + "step": 620 + }, + { + "epoch": 0.1244, + "grad_norm": 4.365558624267578, + "learning_rate": 5.754844999618143e-06, + "loss": 0.3992, + "step": 622 + }, + { + "epoch": 0.1248, + "grad_norm": 2.8961451053619385, + "learning_rate": 5.767491191033909e-06, + "loss": 0.2534, + "step": 624 + }, + { + "epoch": 0.1252, + "grad_norm": 7.486629962921143, + "learning_rate": 5.780145633942173e-06, + "loss": 0.5013, + "step": 626 + }, + { + "epoch": 0.1256, + "grad_norm": 21.515949249267578, + "learning_rate": 5.7928083036724535e-06, + "loss": 0.7132, + "step": 628 + }, + { + "epoch": 0.126, + "grad_norm": 0.021705826744437218, + "learning_rate": 5.8054791755382125e-06, + "loss": 0.2674, + "step": 630 + }, + { + "epoch": 0.1264, + "grad_norm": 11.32520580291748, + "learning_rate": 5.818158224836983e-06, + "loss": 0.6134, + "step": 632 + }, + { + "epoch": 0.1268, + "grad_norm": 5.518954277038574, + "learning_rate": 5.830845426850263e-06, + "loss": 0.3906, + "step": 634 + }, + { + "epoch": 0.1272, + "grad_norm": 5.468055248260498, + "learning_rate": 5.8435407568437194e-06, + "loss": 0.3152, + "step": 636 + }, + { + "epoch": 0.1276, + "grad_norm": 15.162836074829102, + "learning_rate": 5.856244190067155e-06, + "loss": 1.0706, + "step": 638 + }, + { + "epoch": 0.128, + "grad_norm": 2.05523419380188, + "learning_rate": 5.868955701754577e-06, + "loss": 0.5536, + "step": 640 + }, + { + "epoch": 0.1284, + "grad_norm": 3.523622512817383, + "learning_rate": 5.881675267124245e-06, + "loss": 0.1721, + "step": 642 + }, + { + "epoch": 0.1288, + "grad_norm": 5.964024543762207, + "learning_rate": 5.894402861378714e-06, + "loss": 0.6156, + "step": 644 + }, + { + "epoch": 0.1292, + "grad_norm": 4.548226356506348, + "learning_rate": 5.907138459704886e-06, + "loss": 0.2096, + "step": 646 + }, + { + "epoch": 0.1296, + "grad_norm": 13.814722061157227, + "learning_rate": 5.919882037274065e-06, + "loss": 0.5503, + "step": 648 + }, + { + "epoch": 0.13, + "grad_norm": 5.925856113433838, + "learning_rate": 5.932633569241989e-06, + "loss": 0.2536, + "step": 650 + }, + { + "epoch": 0.1304, + "grad_norm": 4.77570915222168, + "learning_rate": 5.9453930307488985e-06, + "loss": 0.4301, + "step": 652 + }, + { + "epoch": 0.1308, + "grad_norm": 6.70920467376709, + "learning_rate": 5.958160396919584e-06, + "loss": 0.6257, + "step": 654 + }, + { + "epoch": 0.1312, + "grad_norm": 8.202454566955566, + "learning_rate": 5.970935642863362e-06, + "loss": 0.8576, + "step": 656 + }, + { + "epoch": 0.1316, + "grad_norm": 6.9439377784729, + "learning_rate": 5.983718743674305e-06, + "loss": 0.4183, + "step": 658 + }, + { + "epoch": 0.132, + "grad_norm": 3.8744468688964844, + "learning_rate": 5.996509674431038e-06, + "loss": 0.5693, + "step": 660 + }, + { + "epoch": 0.1324, + "grad_norm": 3.349454164505005, + "learning_rate": 6.00930841019705e-06, + "loss": 0.3389, + "step": 662 + }, + { + "epoch": 0.1328, + "grad_norm": 18.92936134338379, + "learning_rate": 6.022114926020505e-06, + "loss": 0.6775, + "step": 664 + }, + { + "epoch": 0.1332, + "grad_norm": 7.993606090545654, + "learning_rate": 6.0349291969344426e-06, + "loss": 0.5071, + "step": 666 + }, + { + "epoch": 0.1336, + "grad_norm": 3.5069222450256348, + "learning_rate": 6.047751197956836e-06, + "loss": 0.1654, + "step": 668 + }, + { + "epoch": 0.134, + "grad_norm": 8.64906120300293, + "learning_rate": 6.060580904090489e-06, + "loss": 0.3738, + "step": 670 + }, + { + "epoch": 0.1344, + "grad_norm": 6.413625717163086, + "learning_rate": 6.0734182903232475e-06, + "loss": 0.3211, + "step": 672 + }, + { + "epoch": 0.1348, + "grad_norm": 7.102066516876221, + "learning_rate": 6.086263331627974e-06, + "loss": 0.5298, + "step": 674 + }, + { + "epoch": 0.1352, + "grad_norm": 2.854391098022461, + "learning_rate": 6.0991160029626e-06, + "loss": 0.4201, + "step": 676 + }, + { + "epoch": 0.1356, + "grad_norm": 4.8001275062561035, + "learning_rate": 6.111976279270187e-06, + "loss": 0.9085, + "step": 678 + }, + { + "epoch": 0.136, + "grad_norm": 13.59422779083252, + "learning_rate": 6.124844135478966e-06, + "loss": 0.701, + "step": 680 + }, + { + "epoch": 0.1364, + "grad_norm": 8.73102855682373, + "learning_rate": 6.137719546502394e-06, + "loss": 0.534, + "step": 682 + }, + { + "epoch": 0.1368, + "grad_norm": 3.8618650436401367, + "learning_rate": 6.1506024872392e-06, + "loss": 0.5582, + "step": 684 + }, + { + "epoch": 0.1372, + "grad_norm": 4.220007419586182, + "learning_rate": 6.163492932573429e-06, + "loss": 0.4272, + "step": 686 + }, + { + "epoch": 0.1376, + "grad_norm": 12.380171775817871, + "learning_rate": 6.176390857374501e-06, + "loss": 0.7268, + "step": 688 + }, + { + "epoch": 0.138, + "grad_norm": 12.070074081420898, + "learning_rate": 6.189296236497251e-06, + "loss": 0.7542, + "step": 690 + }, + { + "epoch": 0.1384, + "grad_norm": 5.565659523010254, + "learning_rate": 6.202209044781979e-06, + "loss": 0.5234, + "step": 692 + }, + { + "epoch": 0.1388, + "grad_norm": 14.183318138122559, + "learning_rate": 6.215129257054525e-06, + "loss": 0.3193, + "step": 694 + }, + { + "epoch": 0.1392, + "grad_norm": 6.876009941101074, + "learning_rate": 6.228056848126223e-06, + "loss": 0.3624, + "step": 696 + }, + { + "epoch": 0.1396, + "grad_norm": 9.379109382629395, + "learning_rate": 6.240991792794137e-06, + "loss": 0.4817, + "step": 698 + }, + { + "epoch": 0.14, + "grad_norm": 8.584040641784668, + "learning_rate": 6.253934065840883e-06, + "loss": 0.452, + "step": 700 + }, + { + "epoch": 0.1404, + "grad_norm": 8.330338478088379, + "learning_rate": 6.2668836420348374e-06, + "loss": 0.7225, + "step": 702 + }, + { + "epoch": 0.1408, + "grad_norm": 9.674088478088379, + "learning_rate": 6.279840496130188e-06, + "loss": 0.3807, + "step": 704 + }, + { + "epoch": 0.1412, + "grad_norm": 7.107100963592529, + "learning_rate": 6.2928046028668185e-06, + "loss": 0.5526, + "step": 706 + }, + { + "epoch": 0.1416, + "grad_norm": 9.867752075195312, + "learning_rate": 6.305775936970606e-06, + "loss": 0.6599, + "step": 708 + }, + { + "epoch": 0.142, + "grad_norm": 6.906452655792236, + "learning_rate": 6.3187544731532205e-06, + "loss": 0.4028, + "step": 710 + }, + { + "epoch": 0.1424, + "grad_norm": 2.030731439590454, + "learning_rate": 6.331740186112359e-06, + "loss": 0.3942, + "step": 712 + }, + { + "epoch": 0.1428, + "grad_norm": 9.149688720703125, + "learning_rate": 6.344733050531709e-06, + "loss": 0.4635, + "step": 714 + }, + { + "epoch": 0.1432, + "grad_norm": 7.154139518737793, + "learning_rate": 6.357733041081015e-06, + "loss": 0.3751, + "step": 716 + }, + { + "epoch": 0.1436, + "grad_norm": 5.630312442779541, + "learning_rate": 6.370740132416133e-06, + "loss": 0.3364, + "step": 718 + }, + { + "epoch": 0.144, + "grad_norm": 16.472841262817383, + "learning_rate": 6.383754299179072e-06, + "loss": 0.6647, + "step": 720 + }, + { + "epoch": 0.1444, + "grad_norm": 9.415342330932617, + "learning_rate": 6.3967755159980485e-06, + "loss": 0.4964, + "step": 722 + }, + { + "epoch": 0.1448, + "grad_norm": 4.9506754875183105, + "learning_rate": 6.409803757487532e-06, + "loss": 0.3136, + "step": 724 + }, + { + "epoch": 0.1452, + "grad_norm": 6.21171236038208, + "learning_rate": 6.422838998248301e-06, + "loss": 0.214, + "step": 726 + }, + { + "epoch": 0.1456, + "grad_norm": 3.7686269283294678, + "learning_rate": 6.435881212867485e-06, + "loss": 0.3669, + "step": 728 + }, + { + "epoch": 0.146, + "grad_norm": 2.010577440261841, + "learning_rate": 6.4489303759186385e-06, + "loss": 0.3847, + "step": 730 + }, + { + "epoch": 0.1464, + "grad_norm": 4.508663654327393, + "learning_rate": 6.4619864619616975e-06, + "loss": 0.4283, + "step": 732 + }, + { + "epoch": 0.1468, + "grad_norm": 11.302230834960938, + "learning_rate": 6.475049445543222e-06, + "loss": 0.5133, + "step": 734 + }, + { + "epoch": 0.1472, + "grad_norm": 13.551816940307617, + "learning_rate": 6.48811930119619e-06, + "loss": 0.5777, + "step": 736 + }, + { + "epoch": 0.1476, + "grad_norm": 5.785101413726807, + "learning_rate": 6.5011960034403e-06, + "loss": 0.3011, + "step": 738 + }, + { + "epoch": 0.148, + "grad_norm": 10.311860084533691, + "learning_rate": 6.514279526781853e-06, + "loss": 0.4525, + "step": 740 + }, + { + "epoch": 0.1484, + "grad_norm": 3.428866386413574, + "learning_rate": 6.5273698457137965e-06, + "loss": 0.3227, + "step": 742 + }, + { + "epoch": 0.1488, + "grad_norm": 5.147528171539307, + "learning_rate": 6.540466934715955e-06, + "loss": 0.4536, + "step": 744 + }, + { + "epoch": 0.1492, + "grad_norm": 3.970655918121338, + "learning_rate": 6.553570768254831e-06, + "loss": 0.5332, + "step": 746 + }, + { + "epoch": 0.1496, + "grad_norm": 10.89538860321045, + "learning_rate": 6.566681320783848e-06, + "loss": 0.3576, + "step": 748 + }, + { + "epoch": 0.15, + "grad_norm": 10.928024291992188, + "learning_rate": 6.579798566743313e-06, + "loss": 0.8612, + "step": 750 + }, + { + "epoch": 0.1504, + "grad_norm": 4.219385147094727, + "learning_rate": 6.592922480560483e-06, + "loss": 0.3391, + "step": 752 + }, + { + "epoch": 0.1508, + "grad_norm": 4.059660911560059, + "learning_rate": 6.606053036649618e-06, + "loss": 0.3628, + "step": 754 + }, + { + "epoch": 0.1512, + "grad_norm": 15.986249923706055, + "learning_rate": 6.619190209412025e-06, + "loss": 0.5906, + "step": 756 + }, + { + "epoch": 0.1516, + "grad_norm": 3.5934808254241943, + "learning_rate": 6.632333973236113e-06, + "loss": 0.5227, + "step": 758 + }, + { + "epoch": 0.152, + "grad_norm": 3.0729784965515137, + "learning_rate": 6.6454843024974465e-06, + "loss": 0.2874, + "step": 760 + }, + { + "epoch": 0.1524, + "grad_norm": 14.278196334838867, + "learning_rate": 6.6586411715587805e-06, + "loss": 0.65, + "step": 762 + }, + { + "epoch": 0.1528, + "grad_norm": 6.700320720672607, + "learning_rate": 6.671804554770128e-06, + "loss": 0.4862, + "step": 764 + }, + { + "epoch": 0.1532, + "grad_norm": 6.843954086303711, + "learning_rate": 6.6849744264688e-06, + "loss": 0.5892, + "step": 766 + }, + { + "epoch": 0.1536, + "grad_norm": 14.10453987121582, + "learning_rate": 6.698150760979456e-06, + "loss": 0.5368, + "step": 768 + }, + { + "epoch": 0.154, + "grad_norm": 2.885185956954956, + "learning_rate": 6.711333532614177e-06, + "loss": 0.3572, + "step": 770 + }, + { + "epoch": 0.1544, + "grad_norm": 10.13174819946289, + "learning_rate": 6.724522715672421e-06, + "loss": 0.4431, + "step": 772 + }, + { + "epoch": 0.1548, + "grad_norm": 3.468263626098633, + "learning_rate": 6.737718284441256e-06, + "loss": 0.1677, + "step": 774 + }, + { + "epoch": 0.1552, + "grad_norm": 3.1964352130889893, + "learning_rate": 6.750920213195242e-06, + "loss": 0.4121, + "step": 776 + }, + { + "epoch": 0.1556, + "grad_norm": 5.629176139831543, + "learning_rate": 6.764128476196494e-06, + "loss": 0.2377, + "step": 778 + }, + { + "epoch": 0.156, + "grad_norm": 12.229723930358887, + "learning_rate": 6.777343047694894e-06, + "loss": 0.7154, + "step": 780 + }, + { + "epoch": 0.1564, + "grad_norm": 3.3051795959472656, + "learning_rate": 6.7905639019278925e-06, + "loss": 0.2427, + "step": 782 + }, + { + "epoch": 0.1568, + "grad_norm": 7.652355670928955, + "learning_rate": 6.803791013120824e-06, + "loss": 0.2424, + "step": 784 + }, + { + "epoch": 0.1572, + "grad_norm": 3.2247581481933594, + "learning_rate": 6.817024355486707e-06, + "loss": 0.3198, + "step": 786 + }, + { + "epoch": 0.1576, + "grad_norm": 1.7191749811172485, + "learning_rate": 6.8302639032264836e-06, + "loss": 0.4825, + "step": 788 + }, + { + "epoch": 0.158, + "grad_norm": 9.647163391113281, + "learning_rate": 6.8435096305289765e-06, + "loss": 0.4332, + "step": 790 + }, + { + "epoch": 0.1584, + "grad_norm": 8.396803855895996, + "learning_rate": 6.856761511570944e-06, + "loss": 0.2501, + "step": 792 + }, + { + "epoch": 0.1588, + "grad_norm": 19.717267990112305, + "learning_rate": 6.870019520517217e-06, + "loss": 0.7277, + "step": 794 + }, + { + "epoch": 0.1592, + "grad_norm": 2.5725841522216797, + "learning_rate": 6.883283631520579e-06, + "loss": 0.3353, + "step": 796 + }, + { + "epoch": 0.1596, + "grad_norm": 4.188928127288818, + "learning_rate": 6.896553818721985e-06, + "loss": 0.2245, + "step": 798 + }, + { + "epoch": 0.16, + "grad_norm": 4.037902355194092, + "learning_rate": 6.909830056250522e-06, + "loss": 0.3399, + "step": 800 + }, + { + "epoch": 0.1604, + "grad_norm": 5.065680027008057, + "learning_rate": 6.9231123182234895e-06, + "loss": 0.3848, + "step": 802 + }, + { + "epoch": 0.1608, + "grad_norm": 2.4502971172332764, + "learning_rate": 6.936400578746436e-06, + "loss": 0.5131, + "step": 804 + }, + { + "epoch": 0.1612, + "grad_norm": 5.089271545410156, + "learning_rate": 6.949694811913237e-06, + "loss": 0.6121, + "step": 806 + }, + { + "epoch": 0.1616, + "grad_norm": 16.869577407836914, + "learning_rate": 6.96299499180605e-06, + "loss": 0.6868, + "step": 808 + }, + { + "epoch": 0.162, + "grad_norm": 4.157081604003906, + "learning_rate": 6.976301092495548e-06, + "loss": 0.2923, + "step": 810 + }, + { + "epoch": 0.1624, + "grad_norm": 5.360850811004639, + "learning_rate": 6.989613088040787e-06, + "loss": 0.354, + "step": 812 + }, + { + "epoch": 0.1628, + "grad_norm": 13.399910926818848, + "learning_rate": 7.002930952489353e-06, + "loss": 0.4801, + "step": 814 + }, + { + "epoch": 0.1632, + "grad_norm": 6.160241603851318, + "learning_rate": 7.016254659877404e-06, + "loss": 0.4726, + "step": 816 + }, + { + "epoch": 0.1636, + "grad_norm": 10.750870704650879, + "learning_rate": 7.029584184229641e-06, + "loss": 0.7609, + "step": 818 + }, + { + "epoch": 0.164, + "grad_norm": 7.787296295166016, + "learning_rate": 7.042919499559539e-06, + "loss": 0.6701, + "step": 820 + }, + { + "epoch": 0.1644, + "grad_norm": 8.239069938659668, + "learning_rate": 7.056260579869152e-06, + "loss": 0.3435, + "step": 822 + }, + { + "epoch": 0.1648, + "grad_norm": 11.421568870544434, + "learning_rate": 7.06960739914943e-06, + "loss": 0.4854, + "step": 824 + }, + { + "epoch": 0.1652, + "grad_norm": 2.5625734329223633, + "learning_rate": 7.082959931380013e-06, + "loss": 0.2983, + "step": 826 + }, + { + "epoch": 0.1656, + "grad_norm": 3.4926583766937256, + "learning_rate": 7.09631815052946e-06, + "loss": 0.3505, + "step": 828 + }, + { + "epoch": 0.166, + "grad_norm": 7.8978352546691895, + "learning_rate": 7.109682030555285e-06, + "loss": 0.3397, + "step": 830 + }, + { + "epoch": 0.1664, + "grad_norm": 18.112512588500977, + "learning_rate": 7.123051545403873e-06, + "loss": 0.7538, + "step": 832 + }, + { + "epoch": 0.1668, + "grad_norm": 6.368231296539307, + "learning_rate": 7.136426669010686e-06, + "loss": 0.442, + "step": 834 + }, + { + "epoch": 0.1672, + "grad_norm": 5.724649429321289, + "learning_rate": 7.1498073753002375e-06, + "loss": 0.4052, + "step": 836 + }, + { + "epoch": 0.1676, + "grad_norm": 2.642228126525879, + "learning_rate": 7.1631936381861544e-06, + "loss": 0.6626, + "step": 838 + }, + { + "epoch": 0.168, + "grad_norm": 4.494007110595703, + "learning_rate": 7.1765854315712325e-06, + "loss": 1.1029, + "step": 840 + }, + { + "epoch": 0.1684, + "grad_norm": 7.890208721160889, + "learning_rate": 7.189982729347485e-06, + "loss": 0.4481, + "step": 842 + }, + { + "epoch": 0.1688, + "grad_norm": 3.479203939437866, + "learning_rate": 7.203385505396197e-06, + "loss": 0.4121, + "step": 844 + }, + { + "epoch": 0.1692, + "grad_norm": 8.981403350830078, + "learning_rate": 7.216793733587966e-06, + "loss": 0.3879, + "step": 846 + }, + { + "epoch": 0.1696, + "grad_norm": 10.090559005737305, + "learning_rate": 7.230207387782771e-06, + "loss": 0.5132, + "step": 848 + }, + { + "epoch": 0.17, + "grad_norm": 7.934718132019043, + "learning_rate": 7.243626441830001e-06, + "loss": 0.4681, + "step": 850 + }, + { + "epoch": 0.1704, + "grad_norm": 4.586116313934326, + "learning_rate": 7.257050869568527e-06, + "loss": 0.6233, + "step": 852 + }, + { + "epoch": 0.1708, + "grad_norm": 2.835678815841675, + "learning_rate": 7.270480644826739e-06, + "loss": 0.405, + "step": 854 + }, + { + "epoch": 0.1712, + "grad_norm": 15.727119445800781, + "learning_rate": 7.28391574142262e-06, + "loss": 0.5705, + "step": 856 + }, + { + "epoch": 0.1716, + "grad_norm": 7.766270160675049, + "learning_rate": 7.297356133163711e-06, + "loss": 0.3394, + "step": 858 + }, + { + "epoch": 0.172, + "grad_norm": 1.4164787530899048, + "learning_rate": 7.3108017938473485e-06, + "loss": 0.073, + "step": 860 + }, + { + "epoch": 0.1724, + "grad_norm": 4.286393642425537, + "learning_rate": 7.324252697260479e-06, + "loss": 0.3516, + "step": 862 + }, + { + "epoch": 0.1728, + "grad_norm": 10.02456283569336, + "learning_rate": 7.337708817179875e-06, + "loss": 0.4398, + "step": 864 + }, + { + "epoch": 0.1732, + "grad_norm": 16.70851707458496, + "learning_rate": 7.351170127372196e-06, + "loss": 0.5001, + "step": 866 + }, + { + "epoch": 0.1736, + "grad_norm": 9.615955352783203, + "learning_rate": 7.36463660159386e-06, + "loss": 0.4175, + "step": 868 + }, + { + "epoch": 0.174, + "grad_norm": 0.1856585144996643, + "learning_rate": 7.378108213591355e-06, + "loss": 0.1393, + "step": 870 + }, + { + "epoch": 0.1744, + "grad_norm": 6.751959800720215, + "learning_rate": 7.39158493710103e-06, + "loss": 0.5308, + "step": 872 + }, + { + "epoch": 0.1748, + "grad_norm": 5.480131149291992, + "learning_rate": 7.405066745849345e-06, + "loss": 0.5994, + "step": 874 + }, + { + "epoch": 0.1752, + "grad_norm": 9.561187744140625, + "learning_rate": 7.418553613552822e-06, + "loss": 0.5893, + "step": 876 + }, + { + "epoch": 0.1756, + "grad_norm": 2.3375208377838135, + "learning_rate": 7.432045513918121e-06, + "loss": 0.196, + "step": 878 + }, + { + "epoch": 0.176, + "grad_norm": 8.190189361572266, + "learning_rate": 7.445542420642091e-06, + "loss": 0.5206, + "step": 880 + }, + { + "epoch": 0.1764, + "grad_norm": 11.755085945129395, + "learning_rate": 7.459044307411826e-06, + "loss": 0.7758, + "step": 882 + }, + { + "epoch": 0.1768, + "grad_norm": 6.631009578704834, + "learning_rate": 7.472551147904703e-06, + "loss": 0.8498, + "step": 884 + }, + { + "epoch": 0.1772, + "grad_norm": 9.193879127502441, + "learning_rate": 7.486062915788446e-06, + "loss": 0.2629, + "step": 886 + }, + { + "epoch": 0.1776, + "grad_norm": 2.9170496463775635, + "learning_rate": 7.499579584721173e-06, + "loss": 0.215, + "step": 888 + }, + { + "epoch": 0.178, + "grad_norm": 4.22748327255249, + "learning_rate": 7.513101128351446e-06, + "loss": 0.4872, + "step": 890 + }, + { + "epoch": 0.1784, + "grad_norm": 15.961320877075195, + "learning_rate": 7.5266275203183395e-06, + "loss": 0.697, + "step": 892 + }, + { + "epoch": 0.1788, + "grad_norm": 8.920997619628906, + "learning_rate": 7.540158734251412e-06, + "loss": 0.5223, + "step": 894 + }, + { + "epoch": 0.1792, + "grad_norm": 15.58497142791748, + "learning_rate": 7.553694743770917e-06, + "loss": 0.585, + "step": 896 + }, + { + "epoch": 0.1796, + "grad_norm": 6.984971046447754, + "learning_rate": 7.567235522487698e-06, + "loss": 0.4167, + "step": 898 + }, + { + "epoch": 0.18, + "grad_norm": 10.183481216430664, + "learning_rate": 7.580781044003312e-06, + "loss": 0.5077, + "step": 900 + }, + { + "epoch": 0.1804, + "grad_norm": 12.522652626037598, + "learning_rate": 7.5943312819100875e-06, + "loss": 0.5003, + "step": 902 + }, + { + "epoch": 0.1808, + "grad_norm": 2.5764498710632324, + "learning_rate": 7.607886209791095e-06, + "loss": 1.092, + "step": 904 + }, + { + "epoch": 0.1812, + "grad_norm": 11.2557373046875, + "learning_rate": 7.6214458012203726e-06, + "loss": 0.4882, + "step": 906 + }, + { + "epoch": 0.1816, + "grad_norm": 6.787613391876221, + "learning_rate": 7.635010029762755e-06, + "loss": 0.389, + "step": 908 + }, + { + "epoch": 0.182, + "grad_norm": 3.3469245433807373, + "learning_rate": 7.648578868974102e-06, + "loss": 0.327, + "step": 910 + }, + { + "epoch": 0.1824, + "grad_norm": 9.81145191192627, + "learning_rate": 7.662152292401265e-06, + "loss": 0.4259, + "step": 912 + }, + { + "epoch": 0.1828, + "grad_norm": 13.223085403442383, + "learning_rate": 7.675730273582142e-06, + "loss": 0.6418, + "step": 914 + }, + { + "epoch": 0.1832, + "grad_norm": 10.374373435974121, + "learning_rate": 7.689312786045822e-06, + "loss": 0.2689, + "step": 916 + }, + { + "epoch": 0.1836, + "grad_norm": 7.179947376251221, + "learning_rate": 7.702899803312443e-06, + "loss": 0.4406, + "step": 918 + }, + { + "epoch": 0.184, + "grad_norm": 11.501348495483398, + "learning_rate": 7.716491298893441e-06, + "loss": 0.6871, + "step": 920 + }, + { + "epoch": 0.1844, + "grad_norm": 8.864216804504395, + "learning_rate": 7.730087246291498e-06, + "loss": 0.4214, + "step": 922 + }, + { + "epoch": 0.1848, + "grad_norm": 10.927438735961914, + "learning_rate": 7.74368761900062e-06, + "loss": 0.7688, + "step": 924 + }, + { + "epoch": 0.1852, + "grad_norm": 12.23855972290039, + "learning_rate": 7.757292390506184e-06, + "loss": 0.7375, + "step": 926 + }, + { + "epoch": 0.1856, + "grad_norm": 1.962257742881775, + "learning_rate": 7.770901534284991e-06, + "loss": 0.6835, + "step": 928 + }, + { + "epoch": 0.186, + "grad_norm": 7.5569891929626465, + "learning_rate": 7.78451502380532e-06, + "loss": 0.466, + "step": 930 + }, + { + "epoch": 0.1864, + "grad_norm": 3.5059900283813477, + "learning_rate": 7.798132832526976e-06, + "loss": 0.5148, + "step": 932 + }, + { + "epoch": 0.1868, + "grad_norm": 7.006364822387695, + "learning_rate": 7.811754933901346e-06, + "loss": 0.7456, + "step": 934 + }, + { + "epoch": 0.1872, + "grad_norm": 13.448099136352539, + "learning_rate": 7.825381301371444e-06, + "loss": 1.2551, + "step": 936 + }, + { + "epoch": 0.1876, + "grad_norm": 5.768279075622559, + "learning_rate": 7.839011908371987e-06, + "loss": 0.4028, + "step": 938 + }, + { + "epoch": 0.188, + "grad_norm": 3.1335666179656982, + "learning_rate": 7.852646728329358e-06, + "loss": 0.4063, + "step": 940 + }, + { + "epoch": 0.1884, + "grad_norm": 5.8325653076171875, + "learning_rate": 7.866285734661845e-06, + "loss": 0.2906, + "step": 942 + }, + { + "epoch": 0.1888, + "grad_norm": 8.456764221191406, + "learning_rate": 7.879928900779441e-06, + "loss": 0.5, + "step": 944 + }, + { + "epoch": 0.1892, + "grad_norm": 4.712984085083008, + "learning_rate": 7.893576200084164e-06, + "loss": 0.3611, + "step": 946 + }, + { + "epoch": 0.1896, + "grad_norm": 14.402421951293945, + "learning_rate": 7.907227605969852e-06, + "loss": 0.4076, + "step": 948 + }, + { + "epoch": 0.19, + "grad_norm": 7.400119781494141, + "learning_rate": 7.92088309182239e-06, + "loss": 0.4554, + "step": 950 + }, + { + "epoch": 0.1904, + "grad_norm": 4.048001766204834, + "learning_rate": 7.934542631019767e-06, + "loss": 0.245, + "step": 952 + }, + { + "epoch": 0.1908, + "grad_norm": 11.333590507507324, + "learning_rate": 7.948206196931937e-06, + "loss": 0.6639, + "step": 954 + }, + { + "epoch": 0.1912, + "grad_norm": 3.745378017425537, + "learning_rate": 7.961873762921153e-06, + "loss": 0.3687, + "step": 956 + }, + { + "epoch": 0.1916, + "grad_norm": 8.305449485778809, + "learning_rate": 7.97554530234174e-06, + "loss": 0.703, + "step": 958 + }, + { + "epoch": 0.192, + "grad_norm": 5.076037883758545, + "learning_rate": 7.989220788540351e-06, + "loss": 0.4174, + "step": 960 + }, + { + "epoch": 0.1924, + "grad_norm": 8.22502326965332, + "learning_rate": 8.002900194855927e-06, + "loss": 0.8513, + "step": 962 + }, + { + "epoch": 0.1928, + "grad_norm": 2.7648229598999023, + "learning_rate": 8.016583494619764e-06, + "loss": 0.2889, + "step": 964 + }, + { + "epoch": 0.1932, + "grad_norm": 6.429098129272461, + "learning_rate": 8.03027066115557e-06, + "loss": 0.4805, + "step": 966 + }, + { + "epoch": 0.1936, + "grad_norm": 8.474832534790039, + "learning_rate": 8.043961667779511e-06, + "loss": 0.3659, + "step": 968 + }, + { + "epoch": 0.194, + "grad_norm": 7.559782028198242, + "learning_rate": 8.057656487800274e-06, + "loss": 0.76, + "step": 970 + }, + { + "epoch": 0.1944, + "grad_norm": 7.466895580291748, + "learning_rate": 8.071355094519103e-06, + "loss": 0.4334, + "step": 972 + }, + { + "epoch": 0.1948, + "grad_norm": 11.824060440063477, + "learning_rate": 8.085057461229862e-06, + "loss": 0.9611, + "step": 974 + }, + { + "epoch": 0.1952, + "grad_norm": 13.445891380310059, + "learning_rate": 8.098763561219089e-06, + "loss": 0.4064, + "step": 976 + }, + { + "epoch": 0.1956, + "grad_norm": 11.341733932495117, + "learning_rate": 8.112473367766056e-06, + "loss": 0.5305, + "step": 978 + }, + { + "epoch": 0.196, + "grad_norm": 2.5826163291931152, + "learning_rate": 8.126186854142744e-06, + "loss": 0.3164, + "step": 980 + }, + { + "epoch": 0.1964, + "grad_norm": 3.1881608963012695, + "learning_rate": 8.139903993614075e-06, + "loss": 0.3971, + "step": 982 + }, + { + "epoch": 0.1968, + "grad_norm": 6.571081161499023, + "learning_rate": 8.153624759437718e-06, + "loss": 0.314, + "step": 984 + }, + { + "epoch": 0.1972, + "grad_norm": 11.240885734558105, + "learning_rate": 8.167349124864389e-06, + "loss": 0.5487, + "step": 986 + }, + { + "epoch": 0.1976, + "grad_norm": 1.0466963052749634, + "learning_rate": 8.181077063137735e-06, + "loss": 0.3782, + "step": 988 + }, + { + "epoch": 0.198, + "grad_norm": 4.301483154296875, + "learning_rate": 8.194808547494386e-06, + "loss": 0.7161, + "step": 990 + }, + { + "epoch": 0.1984, + "grad_norm": 8.191146850585938, + "learning_rate": 8.208543551164178e-06, + "loss": 0.4402, + "step": 992 + }, + { + "epoch": 0.1988, + "grad_norm": 7.866001605987549, + "learning_rate": 8.22228204736997e-06, + "loss": 0.3263, + "step": 994 + }, + { + "epoch": 0.1992, + "grad_norm": 4.371945858001709, + "learning_rate": 8.236024009327877e-06, + "loss": 0.6534, + "step": 996 + }, + { + "epoch": 0.1996, + "grad_norm": 14.479482650756836, + "learning_rate": 8.249769410247239e-06, + "loss": 0.4922, + "step": 998 + }, + { + "epoch": 0.2, + "grad_norm": 4.134467601776123, + "learning_rate": 8.263518223330695e-06, + "loss": 0.3993, + "step": 1000 + }, + { + "epoch": 0.2004, + "grad_norm": 1.994302749633789, + "learning_rate": 8.277270421774231e-06, + "loss": 0.5201, + "step": 1002 + }, + { + "epoch": 0.2008, + "grad_norm": 2.5535504817962646, + "learning_rate": 8.29102597876723e-06, + "loss": 0.1685, + "step": 1004 + }, + { + "epoch": 0.2012, + "grad_norm": 3.2310678958892822, + "learning_rate": 8.304784867492532e-06, + "loss": 0.4343, + "step": 1006 + }, + { + "epoch": 0.2016, + "grad_norm": 4.669698715209961, + "learning_rate": 8.31854706112648e-06, + "loss": 0.3237, + "step": 1008 + }, + { + "epoch": 0.202, + "grad_norm": 21.141176223754883, + "learning_rate": 8.332312532838972e-06, + "loss": 0.6736, + "step": 1010 + }, + { + "epoch": 0.2024, + "grad_norm": 4.0565714836120605, + "learning_rate": 8.346081255793516e-06, + "loss": 0.322, + "step": 1012 + }, + { + "epoch": 0.2028, + "grad_norm": 5.858126640319824, + "learning_rate": 8.359853203147282e-06, + "loss": 0.5223, + "step": 1014 + }, + { + "epoch": 0.2032, + "grad_norm": 11.738845825195312, + "learning_rate": 8.373628348051156e-06, + "loss": 2.8489, + "step": 1016 + }, + { + "epoch": 0.2036, + "grad_norm": 3.7632217407226562, + "learning_rate": 8.387406663649803e-06, + "loss": 0.6496, + "step": 1018 + }, + { + "epoch": 0.204, + "grad_norm": 20.16446304321289, + "learning_rate": 8.401188123081642e-06, + "loss": 0.7124, + "step": 1020 + }, + { + "epoch": 0.2044, + "grad_norm": 12.613508224487305, + "learning_rate": 8.414972699479062e-06, + "loss": 0.2599, + "step": 1022 + }, + { + "epoch": 0.2048, + "grad_norm": 13.095061302185059, + "learning_rate": 8.428760365968329e-06, + "loss": 1.6982, + "step": 1024 + }, + { + "epoch": 0.2052, + "grad_norm": 6.144779205322266, + "learning_rate": 8.442551095669627e-06, + "loss": 0.5601, + "step": 1026 + }, + { + "epoch": 0.2056, + "grad_norm": 8.436849594116211, + "learning_rate": 8.456344861697293e-06, + "loss": 0.8165, + "step": 1028 + }, + { + "epoch": 0.206, + "grad_norm": 5.922204971313477, + "learning_rate": 8.470141637159605e-06, + "loss": 0.3526, + "step": 1030 + }, + { + "epoch": 0.2064, + "grad_norm": 13.34311294555664, + "learning_rate": 8.483941395159114e-06, + "loss": 1.3357, + "step": 1032 + }, + { + "epoch": 0.2068, + "grad_norm": 8.632156372070312, + "learning_rate": 8.497744108792431e-06, + "loss": 0.775, + "step": 1034 + }, + { + "epoch": 0.2072, + "grad_norm": 3.509516954421997, + "learning_rate": 8.511549751150478e-06, + "loss": 0.4745, + "step": 1036 + }, + { + "epoch": 0.2076, + "grad_norm": 8.338186264038086, + "learning_rate": 8.52535829531845e-06, + "loss": 0.449, + "step": 1038 + }, + { + "epoch": 0.208, + "grad_norm": 6.432318687438965, + "learning_rate": 8.539169714375883e-06, + "loss": 0.4153, + "step": 1040 + }, + { + "epoch": 0.2084, + "grad_norm": 5.306497097015381, + "learning_rate": 8.552983981396707e-06, + "loss": 0.3046, + "step": 1042 + }, + { + "epoch": 0.2088, + "grad_norm": 6.796935081481934, + "learning_rate": 8.566801069449304e-06, + "loss": 0.3804, + "step": 1044 + }, + { + "epoch": 0.2092, + "grad_norm": 6.128684997558594, + "learning_rate": 8.580620951596553e-06, + "loss": 0.3821, + "step": 1046 + }, + { + "epoch": 0.2096, + "grad_norm": 4.2606120109558105, + "learning_rate": 8.594443600895886e-06, + "loss": 0.2952, + "step": 1048 + }, + { + "epoch": 0.21, + "grad_norm": 3.0965423583984375, + "learning_rate": 8.60826899039934e-06, + "loss": 0.3207, + "step": 1050 + }, + { + "epoch": 0.2104, + "grad_norm": 8.993412017822266, + "learning_rate": 8.622097093153612e-06, + "loss": 0.5327, + "step": 1052 + }, + { + "epoch": 0.2108, + "grad_norm": 8.555419921875, + "learning_rate": 8.635927882200128e-06, + "loss": 0.5794, + "step": 1054 + }, + { + "epoch": 0.2112, + "grad_norm": 6.076488494873047, + "learning_rate": 8.649761330575e-06, + "loss": 0.5393, + "step": 1056 + }, + { + "epoch": 0.2116, + "grad_norm": 10.156936645507812, + "learning_rate": 8.663597411309268e-06, + "loss": 0.4184, + "step": 1058 + }, + { + "epoch": 0.212, + "grad_norm": 7.925901889801025, + "learning_rate": 8.677436097428766e-06, + "loss": 0.4019, + "step": 1060 + }, + { + "epoch": 0.2124, + "grad_norm": 2.011803388595581, + "learning_rate": 8.691277361954266e-06, + "loss": 0.3901, + "step": 1062 + }, + { + "epoch": 0.2128, + "grad_norm": 2.8129687309265137, + "learning_rate": 8.705121177901537e-06, + "loss": 0.4634, + "step": 1064 + }, + { + "epoch": 0.2132, + "grad_norm": 7.570572376251221, + "learning_rate": 8.718967518281292e-06, + "loss": 0.4997, + "step": 1066 + }, + { + "epoch": 0.2136, + "grad_norm": 5.320712089538574, + "learning_rate": 8.732816356099459e-06, + "loss": 0.5009, + "step": 1068 + }, + { + "epoch": 0.214, + "grad_norm": 4.759933948516846, + "learning_rate": 8.746667664356962e-06, + "loss": 0.5161, + "step": 1070 + }, + { + "epoch": 0.2144, + "grad_norm": 7.89381742477417, + "learning_rate": 8.760521416049986e-06, + "loss": 0.5429, + "step": 1072 + }, + { + "epoch": 0.2148, + "grad_norm": 8.030691146850586, + "learning_rate": 8.774377584169934e-06, + "loss": 0.6002, + "step": 1074 + }, + { + "epoch": 0.2152, + "grad_norm": 6.01340389251709, + "learning_rate": 8.788236141703477e-06, + "loss": 0.4884, + "step": 1076 + }, + { + "epoch": 0.2156, + "grad_norm": 8.655228614807129, + "learning_rate": 8.802097061632706e-06, + "loss": 0.5259, + "step": 1078 + }, + { + "epoch": 0.216, + "grad_norm": 4.559602737426758, + "learning_rate": 8.81596031693499e-06, + "loss": 0.318, + "step": 1080 + }, + { + "epoch": 0.2164, + "grad_norm": 9.333916664123535, + "learning_rate": 8.829825880583224e-06, + "loss": 0.4511, + "step": 1082 + }, + { + "epoch": 0.2168, + "grad_norm": 7.97321081161499, + "learning_rate": 8.84369372554578e-06, + "loss": 0.4386, + "step": 1084 + }, + { + "epoch": 0.2172, + "grad_norm": 9.464213371276855, + "learning_rate": 8.85756382478659e-06, + "loss": 0.558, + "step": 1086 + }, + { + "epoch": 0.2176, + "grad_norm": 10.65013599395752, + "learning_rate": 8.87143615126518e-06, + "loss": 0.3198, + "step": 1088 + }, + { + "epoch": 0.218, + "grad_norm": 5.179736137390137, + "learning_rate": 8.88531067793674e-06, + "loss": 0.5811, + "step": 1090 + }, + { + "epoch": 0.2184, + "grad_norm": 3.35599684715271, + "learning_rate": 8.899187377752173e-06, + "loss": 0.2638, + "step": 1092 + }, + { + "epoch": 0.2188, + "grad_norm": 17.236064910888672, + "learning_rate": 8.913066223658141e-06, + "loss": 0.4617, + "step": 1094 + }, + { + "epoch": 0.2192, + "grad_norm": 3.3031458854675293, + "learning_rate": 8.926947188597127e-06, + "loss": 0.2754, + "step": 1096 + }, + { + "epoch": 0.2196, + "grad_norm": 3.1426756381988525, + "learning_rate": 8.940830245507473e-06, + "loss": 0.3382, + "step": 1098 + }, + { + "epoch": 0.22, + "grad_norm": 5.686756134033203, + "learning_rate": 8.954715367323473e-06, + "loss": 0.2688, + "step": 1100 + }, + { + "epoch": 0.2204, + "grad_norm": 4.029067039489746, + "learning_rate": 8.968602526975317e-06, + "loss": 0.2645, + "step": 1102 + }, + { + "epoch": 0.2208, + "grad_norm": 6.252431869506836, + "learning_rate": 8.982491697389344e-06, + "loss": 0.2708, + "step": 1104 + }, + { + "epoch": 0.2212, + "grad_norm": 4.683696269989014, + "learning_rate": 8.996382851487839e-06, + "loss": 0.3858, + "step": 1106 + }, + { + "epoch": 0.2216, + "grad_norm": 6.620684623718262, + "learning_rate": 9.010275962189356e-06, + "loss": 0.429, + "step": 1108 + }, + { + "epoch": 0.222, + "grad_norm": 8.595572471618652, + "learning_rate": 9.024171002408509e-06, + "loss": 0.5488, + "step": 1110 + }, + { + "epoch": 0.2224, + "grad_norm": 4.27509880065918, + "learning_rate": 9.03806794505621e-06, + "loss": 0.4199, + "step": 1112 + }, + { + "epoch": 0.2228, + "grad_norm": 8.246331214904785, + "learning_rate": 9.051966763039708e-06, + "loss": 0.2391, + "step": 1114 + }, + { + "epoch": 0.2232, + "grad_norm": 12.573917388916016, + "learning_rate": 9.065867429262497e-06, + "loss": 0.3984, + "step": 1116 + }, + { + "epoch": 0.2236, + "grad_norm": 6.149172782897949, + "learning_rate": 9.07976991662453e-06, + "loss": 0.5106, + "step": 1118 + }, + { + "epoch": 0.224, + "grad_norm": 2.5666861534118652, + "learning_rate": 9.093674198022198e-06, + "loss": 0.2478, + "step": 1120 + }, + { + "epoch": 0.2244, + "grad_norm": 5.783352375030518, + "learning_rate": 9.107580246348395e-06, + "loss": 0.332, + "step": 1122 + }, + { + "epoch": 0.2248, + "grad_norm": 3.155423879623413, + "learning_rate": 9.121488034492567e-06, + "loss": 0.7198, + "step": 1124 + }, + { + "epoch": 0.2252, + "grad_norm": 3.3557944297790527, + "learning_rate": 9.135397535340768e-06, + "loss": 0.4982, + "step": 1126 + }, + { + "epoch": 0.2256, + "grad_norm": 18.048126220703125, + "learning_rate": 9.149308721775717e-06, + "loss": 0.9037, + "step": 1128 + }, + { + "epoch": 0.226, + "grad_norm": 8.392247200012207, + "learning_rate": 9.16322156667684e-06, + "loss": 1.1309, + "step": 1130 + }, + { + "epoch": 0.2264, + "grad_norm": 6.623889923095703, + "learning_rate": 9.177136042920338e-06, + "loss": 0.461, + "step": 1132 + }, + { + "epoch": 0.2268, + "grad_norm": 7.802183628082275, + "learning_rate": 9.191052123379227e-06, + "loss": 0.4284, + "step": 1134 + }, + { + "epoch": 0.2272, + "grad_norm": 6.303231716156006, + "learning_rate": 9.204969780923396e-06, + "loss": 0.2958, + "step": 1136 + }, + { + "epoch": 0.2276, + "grad_norm": 9.021235466003418, + "learning_rate": 9.218888988419656e-06, + "loss": 0.4059, + "step": 1138 + }, + { + "epoch": 0.228, + "grad_norm": 8.156488418579102, + "learning_rate": 9.232809718731822e-06, + "loss": 0.4723, + "step": 1140 + }, + { + "epoch": 0.2284, + "grad_norm": 3.5747907161712646, + "learning_rate": 9.246731944720663e-06, + "loss": 0.5148, + "step": 1142 + }, + { + "epoch": 0.2288, + "grad_norm": 6.50137186050415, + "learning_rate": 9.26065563924414e-06, + "loss": 0.5094, + "step": 1144 + }, + { + "epoch": 0.2292, + "grad_norm": 13.088374137878418, + "learning_rate": 9.274580775157299e-06, + "loss": 0.4124, + "step": 1146 + }, + { + "epoch": 0.2296, + "grad_norm": 9.023184776306152, + "learning_rate": 9.288507325312319e-06, + "loss": 0.6996, + "step": 1148 + }, + { + "epoch": 0.23, + "grad_norm": 7.103399276733398, + "learning_rate": 9.302435262558752e-06, + "loss": 0.6066, + "step": 1150 + }, + { + "epoch": 0.2304, + "grad_norm": 9.870776176452637, + "learning_rate": 9.316364559743298e-06, + "loss": 0.6086, + "step": 1152 + }, + { + "epoch": 0.2308, + "grad_norm": 14.532407760620117, + "learning_rate": 9.330295189710153e-06, + "loss": 0.5549, + "step": 1154 + }, + { + "epoch": 0.2312, + "grad_norm": 4.48659086227417, + "learning_rate": 9.344227125300788e-06, + "loss": 0.2919, + "step": 1156 + }, + { + "epoch": 0.2316, + "grad_norm": 5.752588272094727, + "learning_rate": 9.358160339354196e-06, + "loss": 0.4984, + "step": 1158 + }, + { + "epoch": 0.232, + "grad_norm": 6.5597968101501465, + "learning_rate": 9.372094804706867e-06, + "loss": 0.4872, + "step": 1160 + }, + { + "epoch": 0.2324, + "grad_norm": 17.60839080810547, + "learning_rate": 9.386030494192826e-06, + "loss": 0.7217, + "step": 1162 + }, + { + "epoch": 0.2328, + "grad_norm": 4.611227989196777, + "learning_rate": 9.39996738064379e-06, + "loss": 0.5693, + "step": 1164 + }, + { + "epoch": 0.2332, + "grad_norm": 7.469904899597168, + "learning_rate": 9.413905436889032e-06, + "loss": 0.3725, + "step": 1166 + }, + { + "epoch": 0.2336, + "grad_norm": 8.132116317749023, + "learning_rate": 9.427844635755615e-06, + "loss": 0.5704, + "step": 1168 + }, + { + "epoch": 0.234, + "grad_norm": 6.70773983001709, + "learning_rate": 9.441784950068357e-06, + "loss": 0.5347, + "step": 1170 + }, + { + "epoch": 0.2344, + "grad_norm": 11.278470993041992, + "learning_rate": 9.455726352649904e-06, + "loss": 0.4099, + "step": 1172 + }, + { + "epoch": 0.2348, + "grad_norm": 6.062188625335693, + "learning_rate": 9.469668816320777e-06, + "loss": 0.5894, + "step": 1174 + }, + { + "epoch": 0.2352, + "grad_norm": 6.659539699554443, + "learning_rate": 9.483612313899446e-06, + "loss": 0.5318, + "step": 1176 + }, + { + "epoch": 0.2356, + "grad_norm": 4.592545509338379, + "learning_rate": 9.497556818202297e-06, + "loss": 0.2287, + "step": 1178 + }, + { + "epoch": 0.236, + "grad_norm": 4.524127960205078, + "learning_rate": 9.511502302043859e-06, + "loss": 0.3165, + "step": 1180 + }, + { + "epoch": 0.2364, + "grad_norm": 4.947080135345459, + "learning_rate": 9.52544873823668e-06, + "loss": 0.4032, + "step": 1182 + }, + { + "epoch": 0.2368, + "grad_norm": 4.086838245391846, + "learning_rate": 9.539396099591469e-06, + "loss": 0.1743, + "step": 1184 + }, + { + "epoch": 0.2372, + "grad_norm": 8.205987930297852, + "learning_rate": 9.553344358917146e-06, + "loss": 0.535, + "step": 1186 + }, + { + "epoch": 0.2376, + "grad_norm": 18.025259017944336, + "learning_rate": 9.567293489020816e-06, + "loss": 0.4672, + "step": 1188 + }, + { + "epoch": 0.238, + "grad_norm": 2.789304494857788, + "learning_rate": 9.581243462708009e-06, + "loss": 0.3538, + "step": 1190 + }, + { + "epoch": 0.2384, + "grad_norm": 12.318904876708984, + "learning_rate": 9.595194252782461e-06, + "loss": 0.5997, + "step": 1192 + }, + { + "epoch": 0.2388, + "grad_norm": 4.160138130187988, + "learning_rate": 9.609145832046469e-06, + "loss": 0.4848, + "step": 1194 + }, + { + "epoch": 0.2392, + "grad_norm": 2.0164973735809326, + "learning_rate": 9.623098173300656e-06, + "loss": 0.2964, + "step": 1196 + }, + { + "epoch": 0.2396, + "grad_norm": 8.75981616973877, + "learning_rate": 9.637051249344225e-06, + "loss": 0.4889, + "step": 1198 + }, + { + "epoch": 0.24, + "grad_norm": 7.435197830200195, + "learning_rate": 9.651005032974991e-06, + "loss": 0.7413, + "step": 1200 + }, + { + "epoch": 0.2404, + "grad_norm": 4.774801254272461, + "learning_rate": 9.664959496989285e-06, + "loss": 0.3135, + "step": 1202 + }, + { + "epoch": 0.2408, + "grad_norm": 2.8716344833374023, + "learning_rate": 9.678914614182184e-06, + "loss": 0.3496, + "step": 1204 + }, + { + "epoch": 0.2412, + "grad_norm": 15.887703895568848, + "learning_rate": 9.69287035734747e-06, + "loss": 0.4702, + "step": 1206 + }, + { + "epoch": 0.2416, + "grad_norm": 5.811670780181885, + "learning_rate": 9.706826699277714e-06, + "loss": 0.2451, + "step": 1208 + }, + { + "epoch": 0.242, + "grad_norm": 9.121792793273926, + "learning_rate": 9.720783612764307e-06, + "loss": 0.4176, + "step": 1210 + }, + { + "epoch": 0.2424, + "grad_norm": 2.5957159996032715, + "learning_rate": 9.734741070597535e-06, + "loss": 1.0469, + "step": 1212 + }, + { + "epoch": 0.2428, + "grad_norm": 4.007805347442627, + "learning_rate": 9.74869904556662e-06, + "loss": 0.1839, + "step": 1214 + }, + { + "epoch": 0.2432, + "grad_norm": 4.322118759155273, + "learning_rate": 9.762657510459774e-06, + "loss": 0.3262, + "step": 1216 + }, + { + "epoch": 0.2436, + "grad_norm": 3.0809874534606934, + "learning_rate": 9.776616438064255e-06, + "loss": 0.378, + "step": 1218 + }, + { + "epoch": 0.244, + "grad_norm": 8.872461318969727, + "learning_rate": 9.790575801166422e-06, + "loss": 0.4666, + "step": 1220 + }, + { + "epoch": 0.2444, + "grad_norm": 2.6419589519500732, + "learning_rate": 9.804535572551782e-06, + "loss": 0.2881, + "step": 1222 + }, + { + "epoch": 0.2448, + "grad_norm": 8.719003677368164, + "learning_rate": 9.818495725005043e-06, + "loss": 0.4938, + "step": 1224 + }, + { + "epoch": 0.2452, + "grad_norm": 3.2691900730133057, + "learning_rate": 9.832456231310194e-06, + "loss": 0.2721, + "step": 1226 + }, + { + "epoch": 0.2456, + "grad_norm": 4.075341701507568, + "learning_rate": 9.846417064250459e-06, + "loss": 0.4825, + "step": 1228 + }, + { + "epoch": 0.246, + "grad_norm": 3.6722347736358643, + "learning_rate": 9.860378196608552e-06, + "loss": 0.3527, + "step": 1230 + }, + { + "epoch": 0.2464, + "grad_norm": 10.746099472045898, + "learning_rate": 9.874339601166479e-06, + "loss": 0.4475, + "step": 1232 + }, + { + "epoch": 0.2468, + "grad_norm": 10.915682792663574, + "learning_rate": 9.888301250705765e-06, + "loss": 0.5117, + "step": 1234 + }, + { + "epoch": 0.2472, + "grad_norm": 7.093919277191162, + "learning_rate": 9.902263118007513e-06, + "loss": 0.3136, + "step": 1236 + }, + { + "epoch": 0.2476, + "grad_norm": 2.987434148788452, + "learning_rate": 9.916225175852278e-06, + "loss": 0.5004, + "step": 1238 + }, + { + "epoch": 0.248, + "grad_norm": 9.897947311401367, + "learning_rate": 9.930187397020385e-06, + "loss": 0.6834, + "step": 1240 + }, + { + "epoch": 0.2484, + "grad_norm": 7.4156928062438965, + "learning_rate": 9.944149754291716e-06, + "loss": 0.3489, + "step": 1242 + }, + { + "epoch": 0.2488, + "grad_norm": 2.551642894744873, + "learning_rate": 9.95811222044596e-06, + "loss": 0.2666, + "step": 1244 + }, + { + "epoch": 0.2492, + "grad_norm": 6.077884197235107, + "learning_rate": 9.972074768262572e-06, + "loss": 0.3085, + "step": 1246 + }, + { + "epoch": 0.2496, + "grad_norm": 15.365740776062012, + "learning_rate": 9.986037370520855e-06, + "loss": 0.6024, + "step": 1248 + }, + { + "epoch": 0.25, + "grad_norm": 4.617124080657959, + "learning_rate": 9.999999999999996e-06, + "loss": 0.3048, + "step": 1250 + }, + { + "epoch": 0.2504, + "grad_norm": 4.314815521240234, + "learning_rate": 1.0013962629479139e-05, + "loss": 0.278, + "step": 1252 + }, + { + "epoch": 0.2508, + "grad_norm": 3.910611152648926, + "learning_rate": 1.0027925231737419e-05, + "loss": 0.2813, + "step": 1254 + }, + { + "epoch": 0.2512, + "grad_norm": 7.8652567863464355, + "learning_rate": 1.0041887779554034e-05, + "loss": 0.389, + "step": 1256 + }, + { + "epoch": 0.2516, + "grad_norm": 3.0736870765686035, + "learning_rate": 1.0055850245708276e-05, + "loss": 0.3648, + "step": 1258 + }, + { + "epoch": 0.252, + "grad_norm": 5.97303581237793, + "learning_rate": 1.0069812602979607e-05, + "loss": 0.4754, + "step": 1260 + }, + { + "epoch": 0.2524, + "grad_norm": 4.6572980880737305, + "learning_rate": 1.0083774824147717e-05, + "loss": 0.2779, + "step": 1262 + }, + { + "epoch": 0.2528, + "grad_norm": 4.347689628601074, + "learning_rate": 1.0097736881992482e-05, + "loss": 0.4741, + "step": 1264 + }, + { + "epoch": 0.2532, + "grad_norm": 3.6360926628112793, + "learning_rate": 1.011169874929423e-05, + "loss": 0.4454, + "step": 1266 + }, + { + "epoch": 0.2536, + "grad_norm": 3.343543767929077, + "learning_rate": 1.0125660398833514e-05, + "loss": 0.3356, + "step": 1268 + }, + { + "epoch": 0.254, + "grad_norm": 4.034943580627441, + "learning_rate": 1.013962180339144e-05, + "loss": 0.2899, + "step": 1270 + }, + { + "epoch": 0.2544, + "grad_norm": 3.052302837371826, + "learning_rate": 1.0153582935749533e-05, + "loss": 0.5083, + "step": 1272 + }, + { + "epoch": 0.2548, + "grad_norm": 7.536210536956787, + "learning_rate": 1.01675437686898e-05, + "loss": 0.4651, + "step": 1274 + }, + { + "epoch": 0.2552, + "grad_norm": 7.279505729675293, + "learning_rate": 1.0181504274994952e-05, + "loss": 0.8825, + "step": 1276 + }, + { + "epoch": 0.2556, + "grad_norm": 5.007200241088867, + "learning_rate": 1.0195464427448212e-05, + "loss": 0.2752, + "step": 1278 + }, + { + "epoch": 0.256, + "grad_norm": 15.085927963256836, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.5496, + "step": 1280 + }, + { + "epoch": 0.2564, + "grad_norm": 3.807936191558838, + "learning_rate": 1.0223383561935738e-05, + "loss": 0.3391, + "step": 1282 + }, + { + "epoch": 0.2568, + "grad_norm": 7.716585159301758, + "learning_rate": 1.0237342489540218e-05, + "loss": 0.306, + "step": 1284 + }, + { + "epoch": 0.2572, + "grad_norm": 9.290019035339355, + "learning_rate": 1.0251300954433374e-05, + "loss": 0.5388, + "step": 1286 + }, + { + "epoch": 0.2576, + "grad_norm": 4.497453212738037, + "learning_rate": 1.0265258929402458e-05, + "loss": 0.3667, + "step": 1288 + }, + { + "epoch": 0.258, + "grad_norm": 4.392390251159668, + "learning_rate": 1.0279216387235686e-05, + "loss": 0.353, + "step": 1290 + }, + { + "epoch": 0.2584, + "grad_norm": 7.22153902053833, + "learning_rate": 1.029317330072228e-05, + "loss": 0.439, + "step": 1292 + }, + { + "epoch": 0.2588, + "grad_norm": 3.822291612625122, + "learning_rate": 1.0307129642652523e-05, + "loss": 0.5578, + "step": 1294 + }, + { + "epoch": 0.2592, + "grad_norm": 9.785799026489258, + "learning_rate": 1.0321085385817811e-05, + "loss": 0.5537, + "step": 1296 + }, + { + "epoch": 0.2596, + "grad_norm": 10.913276672363281, + "learning_rate": 1.033504050301071e-05, + "loss": 0.5844, + "step": 1298 + }, + { + "epoch": 0.26, + "grad_norm": 12.159826278686523, + "learning_rate": 1.0348994967025004e-05, + "loss": 0.8787, + "step": 1300 + }, + { + "epoch": 0.2604, + "grad_norm": 10.910086631774902, + "learning_rate": 1.0362948750655768e-05, + "loss": 0.5781, + "step": 1302 + }, + { + "epoch": 0.2608, + "grad_norm": 13.139826774597168, + "learning_rate": 1.0376901826699337e-05, + "loss": 0.4213, + "step": 1304 + }, + { + "epoch": 0.2612, + "grad_norm": 10.38273811340332, + "learning_rate": 1.0390854167953526e-05, + "loss": 0.6268, + "step": 1306 + }, + { + "epoch": 0.2616, + "grad_norm": 9.462930679321289, + "learning_rate": 1.0404805747217532e-05, + "loss": 0.6171, + "step": 1308 + }, + { + "epoch": 0.262, + "grad_norm": 2.936349391937256, + "learning_rate": 1.0418756537291984e-05, + "loss": 0.4775, + "step": 1310 + }, + { + "epoch": 0.2624, + "grad_norm": 2.6934597492218018, + "learning_rate": 1.0432706510979175e-05, + "loss": 0.4505, + "step": 1312 + }, + { + "epoch": 0.2628, + "grad_norm": 10.6868314743042, + "learning_rate": 1.0446655641082846e-05, + "loss": 0.8237, + "step": 1314 + }, + { + "epoch": 0.2632, + "grad_norm": 7.428442478179932, + "learning_rate": 1.0460603900408526e-05, + "loss": 0.4286, + "step": 1316 + }, + { + "epoch": 0.2636, + "grad_norm": 9.77827262878418, + "learning_rate": 1.0474551261763312e-05, + "loss": 0.4458, + "step": 1318 + }, + { + "epoch": 0.264, + "grad_norm": 7.787086009979248, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.4469, + "step": 1320 + }, + { + "epoch": 0.2644, + "grad_norm": 3.138362169265747, + "learning_rate": 1.0502443181797696e-05, + "loss": 0.4373, + "step": 1322 + }, + { + "epoch": 0.2648, + "grad_norm": 4.602435111999512, + "learning_rate": 1.0516387686100549e-05, + "loss": 0.4457, + "step": 1324 + }, + { + "epoch": 0.2652, + "grad_norm": 6.590594291687012, + "learning_rate": 1.0530331183679216e-05, + "loss": 0.3988, + "step": 1326 + }, + { + "epoch": 0.2656, + "grad_norm": 5.800577163696289, + "learning_rate": 1.054427364735009e-05, + "loss": 0.3854, + "step": 1328 + }, + { + "epoch": 0.266, + "grad_norm": 2.489569664001465, + "learning_rate": 1.0558215049931634e-05, + "loss": 0.447, + "step": 1330 + }, + { + "epoch": 0.2664, + "grad_norm": 6.837520599365234, + "learning_rate": 1.0572155364244378e-05, + "loss": 0.5421, + "step": 1332 + }, + { + "epoch": 0.2668, + "grad_norm": 2.9849133491516113, + "learning_rate": 1.058609456311096e-05, + "loss": 0.3384, + "step": 1334 + }, + { + "epoch": 0.2672, + "grad_norm": 34.88032531738281, + "learning_rate": 1.0600032619356203e-05, + "loss": 2.9233, + "step": 1336 + }, + { + "epoch": 0.2676, + "grad_norm": 9.852622032165527, + "learning_rate": 1.0613969505807167e-05, + "loss": 0.7922, + "step": 1338 + }, + { + "epoch": 0.268, + "grad_norm": 10.343416213989258, + "learning_rate": 1.0627905195293127e-05, + "loss": 0.4777, + "step": 1340 + }, + { + "epoch": 0.2684, + "grad_norm": 3.7254891395568848, + "learning_rate": 1.0641839660645795e-05, + "loss": 0.4507, + "step": 1342 + }, + { + "epoch": 0.2688, + "grad_norm": 12.105618476867676, + "learning_rate": 1.0655772874699206e-05, + "loss": 0.6432, + "step": 1344 + }, + { + "epoch": 0.2692, + "grad_norm": 3.6046953201293945, + "learning_rate": 1.066970481028984e-05, + "loss": 0.3347, + "step": 1346 + }, + { + "epoch": 0.2696, + "grad_norm": 10.82397174835205, + "learning_rate": 1.0683635440256694e-05, + "loss": 0.4948, + "step": 1348 + }, + { + "epoch": 0.27, + "grad_norm": 2.004279136657715, + "learning_rate": 1.0697564737441242e-05, + "loss": 0.3004, + "step": 1350 + }, + { + "epoch": 0.2704, + "grad_norm": 12.486804008483887, + "learning_rate": 1.0711492674687674e-05, + "loss": 2.5481, + "step": 1352 + }, + { + "epoch": 0.2708, + "grad_norm": 2.9018843173980713, + "learning_rate": 1.0725419224842695e-05, + "loss": 0.4402, + "step": 1354 + }, + { + "epoch": 0.2712, + "grad_norm": 2.7257988452911377, + "learning_rate": 1.0739344360755855e-05, + "loss": 0.3942, + "step": 1356 + }, + { + "epoch": 0.2716, + "grad_norm": 3.1312522888183594, + "learning_rate": 1.0753268055279332e-05, + "loss": 0.3369, + "step": 1358 + }, + { + "epoch": 0.272, + "grad_norm": 5.981337070465088, + "learning_rate": 1.0767190281268171e-05, + "loss": 0.4886, + "step": 1360 + }, + { + "epoch": 0.2724, + "grad_norm": 6.3107991218566895, + "learning_rate": 1.0781111011580336e-05, + "loss": 0.5095, + "step": 1362 + }, + { + "epoch": 0.2728, + "grad_norm": 10.96753978729248, + "learning_rate": 1.07950302190766e-05, + "loss": 0.783, + "step": 1364 + }, + { + "epoch": 0.2732, + "grad_norm": 4.114676475524902, + "learning_rate": 1.0808947876620766e-05, + "loss": 0.224, + "step": 1366 + }, + { + "epoch": 0.2736, + "grad_norm": 6.768328666687012, + "learning_rate": 1.0822863957079654e-05, + "loss": 0.5381, + "step": 1368 + }, + { + "epoch": 0.274, + "grad_norm": 10.183931350708008, + "learning_rate": 1.0836778433323153e-05, + "loss": 0.7286, + "step": 1370 + }, + { + "epoch": 0.2744, + "grad_norm": 11.262810707092285, + "learning_rate": 1.0850691278224277e-05, + "loss": 0.4362, + "step": 1372 + }, + { + "epoch": 0.2748, + "grad_norm": 2.5276310443878174, + "learning_rate": 1.0864602464659227e-05, + "loss": 0.3241, + "step": 1374 + }, + { + "epoch": 0.2752, + "grad_norm": 3.2120370864868164, + "learning_rate": 1.0878511965507428e-05, + "loss": 0.4633, + "step": 1376 + }, + { + "epoch": 0.2756, + "grad_norm": 0.043698590248823166, + "learning_rate": 1.0892419753651598e-05, + "loss": 0.2778, + "step": 1378 + }, + { + "epoch": 0.276, + "grad_norm": 4.2104997634887695, + "learning_rate": 1.0906325801977795e-05, + "loss": 1.2036, + "step": 1380 + }, + { + "epoch": 0.2764, + "grad_norm": 3.5874691009521484, + "learning_rate": 1.0920230083375465e-05, + "loss": 0.4037, + "step": 1382 + }, + { + "epoch": 0.2768, + "grad_norm": 2.5431137084960938, + "learning_rate": 1.0934132570737497e-05, + "loss": 0.3448, + "step": 1384 + }, + { + "epoch": 0.2772, + "grad_norm": 8.870311737060547, + "learning_rate": 1.0948033236960285e-05, + "loss": 0.776, + "step": 1386 + }, + { + "epoch": 0.2776, + "grad_norm": 5.718042373657227, + "learning_rate": 1.0961932054943785e-05, + "loss": 0.3255, + "step": 1388 + }, + { + "epoch": 0.278, + "grad_norm": 6.183005332946777, + "learning_rate": 1.0975828997591484e-05, + "loss": 0.3902, + "step": 1390 + }, + { + "epoch": 0.2784, + "grad_norm": 3.1337947845458984, + "learning_rate": 1.098972403781064e-05, + "loss": 0.3535, + "step": 1392 + }, + { + "epoch": 0.2788, + "grad_norm": 4.101289749145508, + "learning_rate": 1.1003617148512154e-05, + "loss": 0.2587, + "step": 1394 + }, + { + "epoch": 0.2792, + "grad_norm": 3.531926393508911, + "learning_rate": 1.101750830261065e-05, + "loss": 0.3897, + "step": 1396 + }, + { + "epoch": 0.2796, + "grad_norm": 2.7929153442382812, + "learning_rate": 1.1031397473024676e-05, + "loss": 0.2155, + "step": 1398 + }, + { + "epoch": 0.28, + "grad_norm": 5.211634635925293, + "learning_rate": 1.104528463267652e-05, + "loss": 0.3324, + "step": 1400 + }, + { + "epoch": 0.2804, + "grad_norm": 5.68538761138916, + "learning_rate": 1.1059169754492518e-05, + "loss": 0.5202, + "step": 1402 + }, + { + "epoch": 0.2808, + "grad_norm": 4.005622386932373, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.3965, + "step": 1404 + }, + { + "epoch": 0.2812, + "grad_norm": 5.649775505065918, + "learning_rate": 1.108693377634185e-05, + "loss": 0.802, + "step": 1406 + }, + { + "epoch": 0.2816, + "grad_norm": 4.55808162689209, + "learning_rate": 1.1100812622247821e-05, + "loss": 0.274, + "step": 1408 + }, + { + "epoch": 0.282, + "grad_norm": 4.362680435180664, + "learning_rate": 1.1114689322063252e-05, + "loss": 0.3611, + "step": 1410 + }, + { + "epoch": 0.2824, + "grad_norm": 5.680489540100098, + "learning_rate": 1.1128563848734815e-05, + "loss": 0.3367, + "step": 1412 + }, + { + "epoch": 0.2828, + "grad_norm": 8.655660629272461, + "learning_rate": 1.1142436175213404e-05, + "loss": 0.6261, + "step": 1414 + }, + { + "epoch": 0.2832, + "grad_norm": 8.299210548400879, + "learning_rate": 1.1156306274454211e-05, + "loss": 0.3496, + "step": 1416 + }, + { + "epoch": 0.2836, + "grad_norm": 5.83567476272583, + "learning_rate": 1.117017411941677e-05, + "loss": 0.4108, + "step": 1418 + }, + { + "epoch": 0.284, + "grad_norm": 3.716970443725586, + "learning_rate": 1.1184039683065002e-05, + "loss": 0.3139, + "step": 1420 + }, + { + "epoch": 0.2844, + "grad_norm": 2.9131391048431396, + "learning_rate": 1.1197902938367289e-05, + "loss": 0.2386, + "step": 1422 + }, + { + "epoch": 0.2848, + "grad_norm": 5.470449447631836, + "learning_rate": 1.1211763858296516e-05, + "loss": 0.2708, + "step": 1424 + }, + { + "epoch": 0.2852, + "grad_norm": 3.6554157733917236, + "learning_rate": 1.122562241583006e-05, + "loss": 0.3993, + "step": 1426 + }, + { + "epoch": 0.2856, + "grad_norm": 3.0244224071502686, + "learning_rate": 1.1239478583950007e-05, + "loss": 0.1771, + "step": 1428 + }, + { + "epoch": 0.286, + "grad_norm": 49.14713668823242, + "learning_rate": 1.1253332335643033e-05, + "loss": 0.7448, + "step": 1430 + }, + { + "epoch": 0.2864, + "grad_norm": 7.2982401847839355, + "learning_rate": 1.1267183643900534e-05, + "loss": 0.7348, + "step": 1432 + }, + { + "epoch": 0.2868, + "grad_norm": 3.3678481578826904, + "learning_rate": 1.1281032481718701e-05, + "loss": 0.3292, + "step": 1434 + }, + { + "epoch": 0.2872, + "grad_norm": 2.2815210819244385, + "learning_rate": 1.1294878822098456e-05, + "loss": 0.5051, + "step": 1436 + }, + { + "epoch": 0.2876, + "grad_norm": 6.8002424240112305, + "learning_rate": 1.1308722638045725e-05, + "loss": 0.4748, + "step": 1438 + }, + { + "epoch": 0.288, + "grad_norm": 7.02114725112915, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.4512, + "step": 1440 + }, + { + "epoch": 0.2884, + "grad_norm": 4.354689121246338, + "learning_rate": 1.1336402588690725e-05, + "loss": 0.5112, + "step": 1442 + }, + { + "epoch": 0.2888, + "grad_norm": 5.8543806076049805, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.3051, + "step": 1444 + }, + { + "epoch": 0.2892, + "grad_norm": 14.015190124511719, + "learning_rate": 1.1364072117799864e-05, + "loss": 0.407, + "step": 1446 + }, + { + "epoch": 0.2896, + "grad_norm": 4.109735488891602, + "learning_rate": 1.137790290684638e-05, + "loss": 0.3197, + "step": 1448 + }, + { + "epoch": 0.29, + "grad_norm": 3.673393726348877, + "learning_rate": 1.1391731009600652e-05, + "loss": 0.5108, + "step": 1450 + }, + { + "epoch": 0.2904, + "grad_norm": 5.345627784729004, + "learning_rate": 1.1405556399104108e-05, + "loss": 0.6313, + "step": 1452 + }, + { + "epoch": 0.2908, + "grad_norm": 14.6724214553833, + "learning_rate": 1.141937904840344e-05, + "loss": 0.3813, + "step": 1454 + }, + { + "epoch": 0.2912, + "grad_norm": 6.962216377258301, + "learning_rate": 1.143319893055069e-05, + "loss": 0.388, + "step": 1456 + }, + { + "epoch": 0.2916, + "grad_norm": 2.838561534881592, + "learning_rate": 1.1447016018603286e-05, + "loss": 0.4473, + "step": 1458 + }, + { + "epoch": 0.292, + "grad_norm": 10.735572814941406, + "learning_rate": 1.1460830285624112e-05, + "loss": 0.586, + "step": 1460 + }, + { + "epoch": 0.2924, + "grad_norm": 3.4416141510009766, + "learning_rate": 1.1474641704681541e-05, + "loss": 0.3125, + "step": 1462 + }, + { + "epoch": 0.2928, + "grad_norm": 4.411246299743652, + "learning_rate": 1.1488450248849515e-05, + "loss": 0.4667, + "step": 1464 + }, + { + "epoch": 0.2932, + "grad_norm": 12.819711685180664, + "learning_rate": 1.150225589120756e-05, + "loss": 0.5776, + "step": 1466 + }, + { + "epoch": 0.2936, + "grad_norm": 8.391084671020508, + "learning_rate": 1.1516058604840881e-05, + "loss": 0.53, + "step": 1468 + }, + { + "epoch": 0.294, + "grad_norm": 2.1660103797912598, + "learning_rate": 1.1529858362840388e-05, + "loss": 0.2581, + "step": 1470 + }, + { + "epoch": 0.2944, + "grad_norm": 4.642001152038574, + "learning_rate": 1.15436551383027e-05, + "loss": 0.4219, + "step": 1472 + }, + { + "epoch": 0.2948, + "grad_norm": 3.5424013137817383, + "learning_rate": 1.1557448904330366e-05, + "loss": 0.5811, + "step": 1474 + }, + { + "epoch": 0.2952, + "grad_norm": 7.127161026000977, + "learning_rate": 1.1571239634031666e-05, + "loss": 0.3801, + "step": 1476 + }, + { + "epoch": 0.2956, + "grad_norm": 4.140847206115723, + "learning_rate": 1.158502730052093e-05, + "loss": 0.4047, + "step": 1478 + }, + { + "epoch": 0.296, + "grad_norm": 6.055454254150391, + "learning_rate": 1.1598811876918352e-05, + "loss": 0.3736, + "step": 1480 + }, + { + "epoch": 0.2964, + "grad_norm": 4.176248073577881, + "learning_rate": 1.161259333635019e-05, + "loss": 0.5165, + "step": 1482 + }, + { + "epoch": 0.2968, + "grad_norm": 21.383167266845703, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.6667, + "step": 1484 + }, + { + "epoch": 0.2972, + "grad_norm": 6.070794105529785, + "learning_rate": 1.1640146796852711e-05, + "loss": 0.5311, + "step": 1486 + }, + { + "epoch": 0.2976, + "grad_norm": 4.431724548339844, + "learning_rate": 1.1653918744206478e-05, + "loss": 0.3337, + "step": 1488 + }, + { + "epoch": 0.298, + "grad_norm": 23.307186126708984, + "learning_rate": 1.1667687467161021e-05, + "loss": 1.7408, + "step": 1490 + }, + { + "epoch": 0.2984, + "grad_norm": 11.07599925994873, + "learning_rate": 1.1681452938873515e-05, + "loss": 0.6157, + "step": 1492 + }, + { + "epoch": 0.2988, + "grad_norm": 5.507989406585693, + "learning_rate": 1.169521513250746e-05, + "loss": 0.3348, + "step": 1494 + }, + { + "epoch": 0.2992, + "grad_norm": 8.756346702575684, + "learning_rate": 1.1708974021232763e-05, + "loss": 0.8226, + "step": 1496 + }, + { + "epoch": 0.2996, + "grad_norm": 3.812328815460205, + "learning_rate": 1.1722729578225762e-05, + "loss": 0.4649, + "step": 1498 + }, + { + "epoch": 0.3, + "grad_norm": 5.434119701385498, + "learning_rate": 1.1736481776669297e-05, + "loss": 0.5212, + "step": 1500 + }, + { + "epoch": 0.3004, + "grad_norm": 5.352927207946777, + "learning_rate": 1.1750230589752753e-05, + "loss": 0.7091, + "step": 1502 + }, + { + "epoch": 0.3008, + "grad_norm": 8.913749694824219, + "learning_rate": 1.1763975990672116e-05, + "loss": 0.6775, + "step": 1504 + }, + { + "epoch": 0.3012, + "grad_norm": 5.383404731750488, + "learning_rate": 1.1777717952630023e-05, + "loss": 0.3879, + "step": 1506 + }, + { + "epoch": 0.3016, + "grad_norm": 7.828726768493652, + "learning_rate": 1.1791456448835815e-05, + "loss": 0.6148, + "step": 1508 + }, + { + "epoch": 0.302, + "grad_norm": 6.818487644195557, + "learning_rate": 1.180519145250561e-05, + "loss": 0.2536, + "step": 1510 + }, + { + "epoch": 0.3024, + "grad_norm": 6.503286838531494, + "learning_rate": 1.1818922936862258e-05, + "loss": 0.4782, + "step": 1512 + }, + { + "epoch": 0.3028, + "grad_norm": 24.935619354248047, + "learning_rate": 1.1832650875135606e-05, + "loss": 0.9027, + "step": 1514 + }, + { + "epoch": 0.3032, + "grad_norm": 6.300797462463379, + "learning_rate": 1.1846375240562274e-05, + "loss": 0.6817, + "step": 1516 + }, + { + "epoch": 0.3036, + "grad_norm": 3.063715696334839, + "learning_rate": 1.1860096006385918e-05, + "loss": 0.2117, + "step": 1518 + }, + { + "epoch": 0.304, + "grad_norm": 2.8798439502716064, + "learning_rate": 1.187381314585725e-05, + "loss": 0.2951, + "step": 1520 + }, + { + "epoch": 0.3044, + "grad_norm": 2.7141551971435547, + "learning_rate": 1.1887526632233937e-05, + "loss": 0.2517, + "step": 1522 + }, + { + "epoch": 0.3048, + "grad_norm": 7.310266494750977, + "learning_rate": 1.1901236438780906e-05, + "loss": 1.6551, + "step": 1524 + }, + { + "epoch": 0.3052, + "grad_norm": 2.782778024673462, + "learning_rate": 1.191494253877013e-05, + "loss": 0.5671, + "step": 1526 + }, + { + "epoch": 0.3056, + "grad_norm": 13.405529975891113, + "learning_rate": 1.192864490548089e-05, + "loss": 0.843, + "step": 1528 + }, + { + "epoch": 0.306, + "grad_norm": 13.810148239135742, + "learning_rate": 1.1942343512199719e-05, + "loss": 0.7178, + "step": 1530 + }, + { + "epoch": 0.3064, + "grad_norm": 9.249113082885742, + "learning_rate": 1.195603833222048e-05, + "loss": 0.8531, + "step": 1532 + }, + { + "epoch": 0.3068, + "grad_norm": 6.271283149719238, + "learning_rate": 1.1969729338844422e-05, + "loss": 0.425, + "step": 1534 + }, + { + "epoch": 0.3072, + "grad_norm": 8.340595245361328, + "learning_rate": 1.198341650538023e-05, + "loss": 0.6761, + "step": 1536 + }, + { + "epoch": 0.3076, + "grad_norm": 4.789613246917725, + "learning_rate": 1.1997099805144066e-05, + "loss": 0.2258, + "step": 1538 + }, + { + "epoch": 0.308, + "grad_norm": 5.014132976531982, + "learning_rate": 1.2010779211459642e-05, + "loss": 0.3882, + "step": 1540 + }, + { + "epoch": 0.3084, + "grad_norm": 3.118912935256958, + "learning_rate": 1.2024454697658254e-05, + "loss": 0.2926, + "step": 1542 + }, + { + "epoch": 0.3088, + "grad_norm": 12.062591552734375, + "learning_rate": 1.203812623707884e-05, + "loss": 0.9824, + "step": 1544 + }, + { + "epoch": 0.3092, + "grad_norm": 0.15562035143375397, + "learning_rate": 1.2051793803068054e-05, + "loss": 0.1776, + "step": 1546 + }, + { + "epoch": 0.3096, + "grad_norm": 3.83920955657959, + "learning_rate": 1.2065457368980227e-05, + "loss": 0.3383, + "step": 1548 + }, + { + "epoch": 0.31, + "grad_norm": 4.220104694366455, + "learning_rate": 1.20791169081776e-05, + "loss": 0.3135, + "step": 1550 + }, + { + "epoch": 0.3104, + "grad_norm": 5.098264217376709, + "learning_rate": 1.2092772394030141e-05, + "loss": 0.3466, + "step": 1552 + }, + { + "epoch": 0.3108, + "grad_norm": 8.941669464111328, + "learning_rate": 1.210642379991583e-05, + "loss": 0.7747, + "step": 1554 + }, + { + "epoch": 0.3112, + "grad_norm": 4.687846660614014, + "learning_rate": 1.2120071099220552e-05, + "loss": 0.439, + "step": 1556 + }, + { + "epoch": 0.3116, + "grad_norm": 5.004270553588867, + "learning_rate": 1.2133714265338148e-05, + "loss": 0.374, + "step": 1558 + }, + { + "epoch": 0.312, + "grad_norm": 3.966196060180664, + "learning_rate": 1.2147353271670637e-05, + "loss": 0.386, + "step": 1560 + }, + { + "epoch": 0.3124, + "grad_norm": 2.9987683296203613, + "learning_rate": 1.2160988091628006e-05, + "loss": 0.2708, + "step": 1562 + }, + { + "epoch": 0.3128, + "grad_norm": 6.770999908447266, + "learning_rate": 1.217461869862855e-05, + "loss": 0.2995, + "step": 1564 + }, + { + "epoch": 0.3132, + "grad_norm": 12.71379566192627, + "learning_rate": 1.2188245066098647e-05, + "loss": 0.6783, + "step": 1566 + }, + { + "epoch": 0.3136, + "grad_norm": 2.4985337257385254, + "learning_rate": 1.2201867167473015e-05, + "loss": 0.5271, + "step": 1568 + }, + { + "epoch": 0.314, + "grad_norm": 12.026313781738281, + "learning_rate": 1.2215484976194673e-05, + "loss": 0.6661, + "step": 1570 + }, + { + "epoch": 0.3144, + "grad_norm": 4.968122482299805, + "learning_rate": 1.2229098465715002e-05, + "loss": 0.3877, + "step": 1572 + }, + { + "epoch": 0.3148, + "grad_norm": 6.689700126647949, + "learning_rate": 1.2242707609493809e-05, + "loss": 0.8666, + "step": 1574 + }, + { + "epoch": 0.3152, + "grad_norm": 8.481232643127441, + "learning_rate": 1.2256312380999373e-05, + "loss": 0.7138, + "step": 1576 + }, + { + "epoch": 0.3156, + "grad_norm": 4.90254020690918, + "learning_rate": 1.2269912753708496e-05, + "loss": 0.2841, + "step": 1578 + }, + { + "epoch": 0.316, + "grad_norm": 2.501577615737915, + "learning_rate": 1.2283508701106552e-05, + "loss": 0.2501, + "step": 1580 + }, + { + "epoch": 0.3164, + "grad_norm": 9.537087440490723, + "learning_rate": 1.229710019668755e-05, + "loss": 0.6255, + "step": 1582 + }, + { + "epoch": 0.3168, + "grad_norm": 11.163188934326172, + "learning_rate": 1.2310687213954173e-05, + "loss": 0.5149, + "step": 1584 + }, + { + "epoch": 0.3172, + "grad_norm": 5.9706573486328125, + "learning_rate": 1.232426972641785e-05, + "loss": 0.435, + "step": 1586 + }, + { + "epoch": 0.3176, + "grad_norm": 4.964463710784912, + "learning_rate": 1.233784770759873e-05, + "loss": 0.4665, + "step": 1588 + }, + { + "epoch": 0.318, + "grad_norm": 5.17854642868042, + "learning_rate": 1.2351421131025891e-05, + "loss": 0.4201, + "step": 1590 + }, + { + "epoch": 0.3184, + "grad_norm": 4.845917224884033, + "learning_rate": 1.2364989970237238e-05, + "loss": 0.3849, + "step": 1592 + }, + { + "epoch": 0.3188, + "grad_norm": 4.7801361083984375, + "learning_rate": 1.237855419877962e-05, + "loss": 0.324, + "step": 1594 + }, + { + "epoch": 0.3192, + "grad_norm": 6.55853271484375, + "learning_rate": 1.23921137902089e-05, + "loss": 0.4364, + "step": 1596 + }, + { + "epoch": 0.3196, + "grad_norm": 3.8424408435821533, + "learning_rate": 1.2405668718089906e-05, + "loss": 0.3761, + "step": 1598 + }, + { + "epoch": 0.32, + "grad_norm": 11.122886657714844, + "learning_rate": 1.241921895599668e-05, + "loss": 0.46, + "step": 1600 + }, + { + "epoch": 0.3204, + "grad_norm": 11.784784317016602, + "learning_rate": 1.2432764477512295e-05, + "loss": 0.4029, + "step": 1602 + }, + { + "epoch": 0.3208, + "grad_norm": 3.6870334148406982, + "learning_rate": 1.2446305256229076e-05, + "loss": 0.1883, + "step": 1604 + }, + { + "epoch": 0.3212, + "grad_norm": 2.482841730117798, + "learning_rate": 1.2459841265748582e-05, + "loss": 0.2445, + "step": 1606 + }, + { + "epoch": 0.3216, + "grad_norm": 6.531163692474365, + "learning_rate": 1.2473372479681653e-05, + "loss": 0.3448, + "step": 1608 + }, + { + "epoch": 0.322, + "grad_norm": 2.568314790725708, + "learning_rate": 1.2486898871648547e-05, + "loss": 0.1402, + "step": 1610 + }, + { + "epoch": 0.3224, + "grad_norm": 7.629702091217041, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.4403, + "step": 1612 + }, + { + "epoch": 0.3228, + "grad_norm": 6.067082405090332, + "learning_rate": 1.2513937084211546e-05, + "loss": 0.4562, + "step": 1614 + }, + { + "epoch": 0.3232, + "grad_norm": 7.10764217376709, + "learning_rate": 1.2527448852095292e-05, + "loss": 0.9565, + "step": 1616 + }, + { + "epoch": 0.3236, + "grad_norm": 3.859891653060913, + "learning_rate": 1.2540955692588167e-05, + "loss": 0.3863, + "step": 1618 + }, + { + "epoch": 0.324, + "grad_norm": 2.9591643810272217, + "learning_rate": 1.2554457579357902e-05, + "loss": 0.4094, + "step": 1620 + }, + { + "epoch": 0.3244, + "grad_norm": 6.026986598968506, + "learning_rate": 1.2567954486081873e-05, + "loss": 0.2616, + "step": 1622 + }, + { + "epoch": 0.3248, + "grad_norm": 10.441879272460938, + "learning_rate": 1.2581446386447171e-05, + "loss": 0.6687, + "step": 1624 + }, + { + "epoch": 0.3252, + "grad_norm": 2.2194833755493164, + "learning_rate": 1.2594933254150647e-05, + "loss": 0.369, + "step": 1626 + }, + { + "epoch": 0.3256, + "grad_norm": 3.6579172611236572, + "learning_rate": 1.2608415062898963e-05, + "loss": 0.2729, + "step": 1628 + }, + { + "epoch": 0.326, + "grad_norm": 7.153888702392578, + "learning_rate": 1.262189178640864e-05, + "loss": 0.7914, + "step": 1630 + }, + { + "epoch": 0.3264, + "grad_norm": 6.044541835784912, + "learning_rate": 1.2635363398406133e-05, + "loss": 0.2306, + "step": 1632 + }, + { + "epoch": 0.3268, + "grad_norm": 14.341837882995605, + "learning_rate": 1.2648829872627797e-05, + "loss": 0.7484, + "step": 1634 + }, + { + "epoch": 0.3272, + "grad_norm": 5.586398124694824, + "learning_rate": 1.266229118282012e-05, + "loss": 0.4744, + "step": 1636 + }, + { + "epoch": 0.3276, + "grad_norm": 3.119391679763794, + "learning_rate": 1.2675747302739516e-05, + "loss": 0.1752, + "step": 1638 + }, + { + "epoch": 0.328, + "grad_norm": 2.412724733352661, + "learning_rate": 1.2689198206152644e-05, + "loss": 0.345, + "step": 1640 + }, + { + "epoch": 0.3284, + "grad_norm": 2.232966899871826, + "learning_rate": 1.2702643866836281e-05, + "loss": 0.268, + "step": 1642 + }, + { + "epoch": 0.3288, + "grad_norm": 19.437339782714844, + "learning_rate": 1.2716084258577373e-05, + "loss": 1.5493, + "step": 1644 + }, + { + "epoch": 0.3292, + "grad_norm": 3.2580904960632324, + "learning_rate": 1.2729519355173254e-05, + "loss": 0.5426, + "step": 1646 + }, + { + "epoch": 0.3296, + "grad_norm": 16.84480094909668, + "learning_rate": 1.2742949130431468e-05, + "loss": 1.1757, + "step": 1648 + }, + { + "epoch": 0.33, + "grad_norm": 4.353577136993408, + "learning_rate": 1.2756373558169992e-05, + "loss": 0.2942, + "step": 1650 + }, + { + "epoch": 0.3304, + "grad_norm": 8.311387062072754, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.5383, + "step": 1652 + }, + { + "epoch": 0.3308, + "grad_norm": 8.657102584838867, + "learning_rate": 1.2783206266412028e-05, + "loss": 0.6397, + "step": 1654 + }, + { + "epoch": 0.3312, + "grad_norm": 7.5646185874938965, + "learning_rate": 1.2796614494603795e-05, + "loss": 0.6236, + "step": 1656 + }, + { + "epoch": 0.3316, + "grad_norm": 3.216963291168213, + "learning_rate": 1.2810017270652508e-05, + "loss": 0.2354, + "step": 1658 + }, + { + "epoch": 0.332, + "grad_norm": 6.212801456451416, + "learning_rate": 1.282341456842876e-05, + "loss": 0.7439, + "step": 1660 + }, + { + "epoch": 0.3324, + "grad_norm": 4.682183265686035, + "learning_rate": 1.283680636181384e-05, + "loss": 0.3804, + "step": 1662 + }, + { + "epoch": 0.3328, + "grad_norm": 6.597832679748535, + "learning_rate": 1.2850192624699756e-05, + "loss": 0.6948, + "step": 1664 + }, + { + "epoch": 0.3332, + "grad_norm": 5.692109107971191, + "learning_rate": 1.2863573330989308e-05, + "loss": 0.2297, + "step": 1666 + }, + { + "epoch": 0.3336, + "grad_norm": 7.669611930847168, + "learning_rate": 1.2876948454596122e-05, + "loss": 0.5082, + "step": 1668 + }, + { + "epoch": 0.334, + "grad_norm": 3.742065191268921, + "learning_rate": 1.2890317969444708e-05, + "loss": 0.2623, + "step": 1670 + }, + { + "epoch": 0.3344, + "grad_norm": 4.147270202636719, + "learning_rate": 1.2903681849470535e-05, + "loss": 0.3952, + "step": 1672 + }, + { + "epoch": 0.3348, + "grad_norm": 4.144037246704102, + "learning_rate": 1.291704006861998e-05, + "loss": 0.3569, + "step": 1674 + }, + { + "epoch": 0.3352, + "grad_norm": 7.955838680267334, + "learning_rate": 1.2930392600850565e-05, + "loss": 0.6596, + "step": 1676 + }, + { + "epoch": 0.3356, + "grad_norm": 2.2395710945129395, + "learning_rate": 1.2943739420130843e-05, + "loss": 0.2516, + "step": 1678 + }, + { + "epoch": 0.336, + "grad_norm": 4.407259941101074, + "learning_rate": 1.2957080500440455e-05, + "loss": 0.4176, + "step": 1680 + }, + { + "epoch": 0.3364, + "grad_norm": 9.191736221313477, + "learning_rate": 1.2970415815770353e-05, + "loss": 1.0477, + "step": 1682 + }, + { + "epoch": 0.3368, + "grad_norm": 9.33099365234375, + "learning_rate": 1.2983745340122589e-05, + "loss": 0.8834, + "step": 1684 + }, + { + "epoch": 0.3372, + "grad_norm": 6.055780410766602, + "learning_rate": 1.299706904751064e-05, + "loss": 0.4361, + "step": 1686 + }, + { + "epoch": 0.3376, + "grad_norm": 5.589316368103027, + "learning_rate": 1.3010386911959205e-05, + "loss": 0.3616, + "step": 1688 + }, + { + "epoch": 0.338, + "grad_norm": 7.299569129943848, + "learning_rate": 1.3023698907504447e-05, + "loss": 0.4951, + "step": 1690 + }, + { + "epoch": 0.3384, + "grad_norm": 2.744166612625122, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.3283, + "step": 1692 + }, + { + "epoch": 0.3388, + "grad_norm": 2.0484960079193115, + "learning_rate": 1.3050305188086757e-05, + "loss": 0.4505, + "step": 1694 + }, + { + "epoch": 0.3392, + "grad_norm": 8.468043327331543, + "learning_rate": 1.3063599421253556e-05, + "loss": 0.7875, + "step": 1696 + }, + { + "epoch": 0.3396, + "grad_norm": 17.935962677001953, + "learning_rate": 1.3076887681776504e-05, + "loss": 0.4442, + "step": 1698 + }, + { + "epoch": 0.34, + "grad_norm": 5.646809101104736, + "learning_rate": 1.309016994374947e-05, + "loss": 0.4909, + "step": 1700 + }, + { + "epoch": 0.3404, + "grad_norm": 4.684771537780762, + "learning_rate": 1.310344618127801e-05, + "loss": 0.3862, + "step": 1702 + }, + { + "epoch": 0.3408, + "grad_norm": 3.6506683826446533, + "learning_rate": 1.3116716368479415e-05, + "loss": 0.2375, + "step": 1704 + }, + { + "epoch": 0.3412, + "grad_norm": 3.4178245067596436, + "learning_rate": 1.3129980479482776e-05, + "loss": 0.3632, + "step": 1706 + }, + { + "epoch": 0.3416, + "grad_norm": 15.628225326538086, + "learning_rate": 1.3143238488429049e-05, + "loss": 0.5638, + "step": 1708 + }, + { + "epoch": 0.342, + "grad_norm": 2.9518306255340576, + "learning_rate": 1.3156490369471018e-05, + "loss": 0.1801, + "step": 1710 + }, + { + "epoch": 0.3424, + "grad_norm": 5.066194534301758, + "learning_rate": 1.316973609677351e-05, + "loss": 0.3511, + "step": 1712 + }, + { + "epoch": 0.3428, + "grad_norm": 2.7387843132019043, + "learning_rate": 1.3182975644513286e-05, + "loss": 0.4853, + "step": 1714 + }, + { + "epoch": 0.3432, + "grad_norm": 12.181938171386719, + "learning_rate": 1.319620898687917e-05, + "loss": 0.745, + "step": 1716 + }, + { + "epoch": 0.3436, + "grad_norm": 6.578967094421387, + "learning_rate": 1.3209436098072102e-05, + "loss": 0.5224, + "step": 1718 + }, + { + "epoch": 0.344, + "grad_norm": 10.673359870910645, + "learning_rate": 1.32226569523051e-05, + "loss": 0.6657, + "step": 1720 + }, + { + "epoch": 0.3444, + "grad_norm": 5.363398551940918, + "learning_rate": 1.3235871523803501e-05, + "loss": 0.1734, + "step": 1722 + }, + { + "epoch": 0.3448, + "grad_norm": 2.7951323986053467, + "learning_rate": 1.324907978680475e-05, + "loss": 0.4193, + "step": 1724 + }, + { + "epoch": 0.3452, + "grad_norm": 8.97787857055664, + "learning_rate": 1.3262281715558738e-05, + "loss": 0.4816, + "step": 1726 + }, + { + "epoch": 0.3456, + "grad_norm": 4.193719387054443, + "learning_rate": 1.3275477284327572e-05, + "loss": 0.3761, + "step": 1728 + }, + { + "epoch": 0.346, + "grad_norm": 7.538856029510498, + "learning_rate": 1.3288666467385815e-05, + "loss": 0.4335, + "step": 1730 + }, + { + "epoch": 0.3464, + "grad_norm": 6.087364673614502, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.4628, + "step": 1732 + }, + { + "epoch": 0.3468, + "grad_norm": 10.74787425994873, + "learning_rate": 1.3315025573531193e-05, + "loss": 0.4834, + "step": 1734 + }, + { + "epoch": 0.3472, + "grad_norm": 7.593412399291992, + "learning_rate": 1.3328195445229865e-05, + "loss": 0.4152, + "step": 1736 + }, + { + "epoch": 0.3476, + "grad_norm": 4.296180725097656, + "learning_rate": 1.3341358828441214e-05, + "loss": 0.402, + "step": 1738 + }, + { + "epoch": 0.348, + "grad_norm": 9.378597259521484, + "learning_rate": 1.3354515697502548e-05, + "loss": 0.7142, + "step": 1740 + }, + { + "epoch": 0.3484, + "grad_norm": 4.494250774383545, + "learning_rate": 1.3367666026763879e-05, + "loss": 0.076, + "step": 1742 + }, + { + "epoch": 0.3488, + "grad_norm": 10.188462257385254, + "learning_rate": 1.338080979058797e-05, + "loss": 0.5774, + "step": 1744 + }, + { + "epoch": 0.3492, + "grad_norm": 2.949403762817383, + "learning_rate": 1.3393946963350378e-05, + "loss": 0.3293, + "step": 1746 + }, + { + "epoch": 0.3496, + "grad_norm": 10.005936622619629, + "learning_rate": 1.340707751943951e-05, + "loss": 0.5311, + "step": 1748 + }, + { + "epoch": 0.35, + "grad_norm": 6.235882759094238, + "learning_rate": 1.3420201433256682e-05, + "loss": 0.5505, + "step": 1750 + }, + { + "epoch": 0.3504, + "grad_norm": 6.3505706787109375, + "learning_rate": 1.3433318679216145e-05, + "loss": 0.4771, + "step": 1752 + }, + { + "epoch": 0.3508, + "grad_norm": 7.070293426513672, + "learning_rate": 1.3446429231745162e-05, + "loss": 0.4544, + "step": 1754 + }, + { + "epoch": 0.3512, + "grad_norm": 3.5526082515716553, + "learning_rate": 1.3459533065284039e-05, + "loss": 0.3455, + "step": 1756 + }, + { + "epoch": 0.3516, + "grad_norm": 2.9448580741882324, + "learning_rate": 1.3472630154286197e-05, + "loss": 0.2316, + "step": 1758 + }, + { + "epoch": 0.352, + "grad_norm": 8.107796669006348, + "learning_rate": 1.348572047321814e-05, + "loss": 0.3516, + "step": 1760 + }, + { + "epoch": 0.3524, + "grad_norm": 8.20751953125, + "learning_rate": 1.3498803996559692e-05, + "loss": 0.7306, + "step": 1762 + }, + { + "epoch": 0.3528, + "grad_norm": 4.305896759033203, + "learning_rate": 1.3511880698803803e-05, + "loss": 0.3823, + "step": 1764 + }, + { + "epoch": 0.3532, + "grad_norm": 6.242337703704834, + "learning_rate": 1.3524950554456773e-05, + "loss": 0.7558, + "step": 1766 + }, + { + "epoch": 0.3536, + "grad_norm": 5.782428741455078, + "learning_rate": 1.3538013538038296e-05, + "loss": 1.1828, + "step": 1768 + }, + { + "epoch": 0.354, + "grad_norm": 4.406677722930908, + "learning_rate": 1.3551069624081356e-05, + "loss": 0.4383, + "step": 1770 + }, + { + "epoch": 0.3544, + "grad_norm": 6.066643238067627, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.3099, + "step": 1772 + }, + { + "epoch": 0.3548, + "grad_norm": 6.364744663238525, + "learning_rate": 1.3577161001751692e-05, + "loss": 0.3121, + "step": 1774 + }, + { + "epoch": 0.3552, + "grad_norm": 15.8635892868042, + "learning_rate": 1.3590196242512461e-05, + "loss": 0.948, + "step": 1776 + }, + { + "epoch": 0.3556, + "grad_norm": 3.743403434753418, + "learning_rate": 1.3603224484001944e-05, + "loss": 0.2648, + "step": 1778 + }, + { + "epoch": 0.356, + "grad_norm": 6.918825149536133, + "learning_rate": 1.361624570082092e-05, + "loss": 1.29, + "step": 1780 + }, + { + "epoch": 0.3564, + "grad_norm": 12.671012878417969, + "learning_rate": 1.362925986758386e-05, + "loss": 1.2126, + "step": 1782 + }, + { + "epoch": 0.3568, + "grad_norm": 3.786299228668213, + "learning_rate": 1.364226695891898e-05, + "loss": 0.4158, + "step": 1784 + }, + { + "epoch": 0.3572, + "grad_norm": 5.928519248962402, + "learning_rate": 1.3655266949468287e-05, + "loss": 0.5281, + "step": 1786 + }, + { + "epoch": 0.3576, + "grad_norm": 3.759648561477661, + "learning_rate": 1.3668259813887637e-05, + "loss": 0.3005, + "step": 1788 + }, + { + "epoch": 0.358, + "grad_norm": 7.388206481933594, + "learning_rate": 1.3681245526846773e-05, + "loss": 0.5464, + "step": 1790 + }, + { + "epoch": 0.3584, + "grad_norm": 3.970754384994507, + "learning_rate": 1.3694224063029386e-05, + "loss": 0.6014, + "step": 1792 + }, + { + "epoch": 0.3588, + "grad_norm": 9.178227424621582, + "learning_rate": 1.3707195397133176e-05, + "loss": 0.5669, + "step": 1794 + }, + { + "epoch": 0.3592, + "grad_norm": 1.742365837097168, + "learning_rate": 1.3720159503869806e-05, + "loss": 0.188, + "step": 1796 + }, + { + "epoch": 0.3596, + "grad_norm": 2.9161486625671387, + "learning_rate": 1.3733116357965156e-05, + "loss": 0.6607, + "step": 1798 + }, + { + "epoch": 0.36, + "grad_norm": 3.7825911045074463, + "learning_rate": 1.374606593415911e-05, + "loss": 0.5104, + "step": 1800 + }, + { + "epoch": 0.3604, + "grad_norm": 25.147523880004883, + "learning_rate": 1.3759008207205855e-05, + "loss": 1.9805, + "step": 1802 + }, + { + "epoch": 0.3608, + "grad_norm": 2.268021583557129, + "learning_rate": 1.377194315187377e-05, + "loss": 0.195, + "step": 1804 + }, + { + "epoch": 0.3612, + "grad_norm": 8.332799911499023, + "learning_rate": 1.3784870742945468e-05, + "loss": 0.45, + "step": 1806 + }, + { + "epoch": 0.3616, + "grad_norm": 2.709839105606079, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.413, + "step": 1808 + }, + { + "epoch": 0.362, + "grad_norm": 5.749772548675537, + "learning_rate": 1.3810703763502744e-05, + "loss": 0.3091, + "step": 1810 + }, + { + "epoch": 0.3624, + "grad_norm": 5.364800453186035, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.3164, + "step": 1812 + }, + { + "epoch": 0.3628, + "grad_norm": 5.713923454284668, + "learning_rate": 1.3836507067426563e-05, + "loss": 0.4804, + "step": 1814 + }, + { + "epoch": 0.3632, + "grad_norm": 4.533278465270996, + "learning_rate": 1.3849397512760793e-05, + "loss": 0.2778, + "step": 1816 + }, + { + "epoch": 0.3636, + "grad_norm": 10.91341495513916, + "learning_rate": 1.38622804534976e-05, + "loss": 0.575, + "step": 1818 + }, + { + "epoch": 0.364, + "grad_norm": 5.871316432952881, + "learning_rate": 1.3875155864521027e-05, + "loss": 0.4173, + "step": 1820 + }, + { + "epoch": 0.3644, + "grad_norm": 3.2955284118652344, + "learning_rate": 1.3888023720729806e-05, + "loss": 0.5569, + "step": 1822 + }, + { + "epoch": 0.3648, + "grad_norm": 6.007755279541016, + "learning_rate": 1.3900883997037393e-05, + "loss": 0.1762, + "step": 1824 + }, + { + "epoch": 0.3652, + "grad_norm": 5.417972087860107, + "learning_rate": 1.391373666837202e-05, + "loss": 0.4329, + "step": 1826 + }, + { + "epoch": 0.3656, + "grad_norm": 5.12111234664917, + "learning_rate": 1.3926581709676746e-05, + "loss": 0.496, + "step": 1828 + }, + { + "epoch": 0.366, + "grad_norm": 9.300477027893066, + "learning_rate": 1.3939419095909506e-05, + "loss": 0.4702, + "step": 1830 + }, + { + "epoch": 0.3664, + "grad_norm": 6.063104629516602, + "learning_rate": 1.3952248802043158e-05, + "loss": 0.404, + "step": 1832 + }, + { + "epoch": 0.3668, + "grad_norm": 9.060558319091797, + "learning_rate": 1.396507080306555e-05, + "loss": 0.485, + "step": 1834 + }, + { + "epoch": 0.3672, + "grad_norm": 5.816432476043701, + "learning_rate": 1.397788507397949e-05, + "loss": 0.4185, + "step": 1836 + }, + { + "epoch": 0.3676, + "grad_norm": 2.9675443172454834, + "learning_rate": 1.3990691589802943e-05, + "loss": 0.4634, + "step": 1838 + }, + { + "epoch": 0.368, + "grad_norm": 6.009825229644775, + "learning_rate": 1.4003490325568956e-05, + "loss": 0.3431, + "step": 1840 + }, + { + "epoch": 0.3684, + "grad_norm": 5.107151985168457, + "learning_rate": 1.4016281256325688e-05, + "loss": 0.4816, + "step": 1842 + }, + { + "epoch": 0.3688, + "grad_norm": 3.044797420501709, + "learning_rate": 1.4029064357136632e-05, + "loss": 0.4541, + "step": 1844 + }, + { + "epoch": 0.3692, + "grad_norm": 4.3653788566589355, + "learning_rate": 1.4041839603080411e-05, + "loss": 0.6533, + "step": 1846 + }, + { + "epoch": 0.3696, + "grad_norm": 5.475567817687988, + "learning_rate": 1.4054606969251096e-05, + "loss": 0.4678, + "step": 1848 + }, + { + "epoch": 0.37, + "grad_norm": 5.718444347381592, + "learning_rate": 1.4067366430758004e-05, + "loss": 0.4311, + "step": 1850 + }, + { + "epoch": 0.3704, + "grad_norm": 4.869439601898193, + "learning_rate": 1.4080117962725929e-05, + "loss": 0.3124, + "step": 1852 + }, + { + "epoch": 0.3708, + "grad_norm": 9.038970947265625, + "learning_rate": 1.4092861540295107e-05, + "loss": 0.3434, + "step": 1854 + }, + { + "epoch": 0.3712, + "grad_norm": 15.572250366210938, + "learning_rate": 1.4105597138621281e-05, + "loss": 0.6732, + "step": 1856 + }, + { + "epoch": 0.3716, + "grad_norm": 8.4092378616333, + "learning_rate": 1.411832473287575e-05, + "loss": 0.6012, + "step": 1858 + }, + { + "epoch": 0.372, + "grad_norm": 9.829660415649414, + "learning_rate": 1.4131044298245416e-05, + "loss": 0.3163, + "step": 1860 + }, + { + "epoch": 0.3724, + "grad_norm": 9.656953811645508, + "learning_rate": 1.414375580993284e-05, + "loss": 0.5574, + "step": 1862 + }, + { + "epoch": 0.3728, + "grad_norm": 10.614069938659668, + "learning_rate": 1.4156459243156275e-05, + "loss": 0.4013, + "step": 1864 + }, + { + "epoch": 0.3732, + "grad_norm": 9.007293701171875, + "learning_rate": 1.416915457314973e-05, + "loss": 0.6109, + "step": 1866 + }, + { + "epoch": 0.3736, + "grad_norm": 7.276789665222168, + "learning_rate": 1.418184177516301e-05, + "loss": 0.5648, + "step": 1868 + }, + { + "epoch": 0.374, + "grad_norm": 4.238958358764648, + "learning_rate": 1.4194520824461782e-05, + "loss": 0.1757, + "step": 1870 + }, + { + "epoch": 0.3744, + "grad_norm": 18.47603416442871, + "learning_rate": 1.420719169632754e-05, + "loss": 0.9498, + "step": 1872 + }, + { + "epoch": 0.3748, + "grad_norm": 8.405122756958008, + "learning_rate": 1.4219854366057821e-05, + "loss": 0.4578, + "step": 1874 + }, + { + "epoch": 0.3752, + "grad_norm": 6.351670742034912, + "learning_rate": 1.4232508808966085e-05, + "loss": 0.2737, + "step": 1876 + }, + { + "epoch": 0.3756, + "grad_norm": 4.195537567138672, + "learning_rate": 1.424515500038185e-05, + "loss": 0.4271, + "step": 1878 + }, + { + "epoch": 0.376, + "grad_norm": 8.796152114868164, + "learning_rate": 1.4257792915650735e-05, + "loss": 0.8, + "step": 1880 + }, + { + "epoch": 0.3764, + "grad_norm": 5.769218921661377, + "learning_rate": 1.4270422530134425e-05, + "loss": 0.2926, + "step": 1882 + }, + { + "epoch": 0.3768, + "grad_norm": 11.621100425720215, + "learning_rate": 1.4283043819210906e-05, + "loss": 0.5649, + "step": 1884 + }, + { + "epoch": 0.3772, + "grad_norm": 2.7881314754486084, + "learning_rate": 1.4295656758274288e-05, + "loss": 0.2729, + "step": 1886 + }, + { + "epoch": 0.3776, + "grad_norm": 4.934391975402832, + "learning_rate": 1.430826132273499e-05, + "loss": 0.3345, + "step": 1888 + }, + { + "epoch": 0.378, + "grad_norm": 4.801124572753906, + "learning_rate": 1.4320857488019826e-05, + "loss": 0.4572, + "step": 1890 + }, + { + "epoch": 0.3784, + "grad_norm": 11.943818092346191, + "learning_rate": 1.4333445229571857e-05, + "loss": 0.6429, + "step": 1892 + }, + { + "epoch": 0.3788, + "grad_norm": 4.150667190551758, + "learning_rate": 1.4346024522850704e-05, + "loss": 0.4897, + "step": 1894 + }, + { + "epoch": 0.3792, + "grad_norm": 6.418819904327393, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.3912, + "step": 1896 + }, + { + "epoch": 0.3796, + "grad_norm": 5.631911277770996, + "learning_rate": 1.437115766650933e-05, + "loss": 0.1897, + "step": 1898 + }, + { + "epoch": 0.38, + "grad_norm": 0.9409749507904053, + "learning_rate": 1.4383711467890772e-05, + "loss": 0.1753, + "step": 1900 + }, + { + "epoch": 0.3804, + "grad_norm": 11.083687782287598, + "learning_rate": 1.4396256723002398e-05, + "loss": 0.6141, + "step": 1902 + }, + { + "epoch": 0.3808, + "grad_norm": 11.110014915466309, + "learning_rate": 1.4408793407386584e-05, + "loss": 0.7392, + "step": 1904 + }, + { + "epoch": 0.3812, + "grad_norm": 4.622432708740234, + "learning_rate": 1.4421321496602423e-05, + "loss": 0.285, + "step": 1906 + }, + { + "epoch": 0.3816, + "grad_norm": 6.6718926429748535, + "learning_rate": 1.4433840966225767e-05, + "loss": 0.5276, + "step": 1908 + }, + { + "epoch": 0.382, + "grad_norm": 7.286358833312988, + "learning_rate": 1.444635179184927e-05, + "loss": 0.4777, + "step": 1910 + }, + { + "epoch": 0.3824, + "grad_norm": 4.509498596191406, + "learning_rate": 1.4458853949082434e-05, + "loss": 0.3266, + "step": 1912 + }, + { + "epoch": 0.3828, + "grad_norm": 7.41378116607666, + "learning_rate": 1.4471347413551665e-05, + "loss": 0.3755, + "step": 1914 + }, + { + "epoch": 0.3832, + "grad_norm": 8.138635635375977, + "learning_rate": 1.4483832160900332e-05, + "loss": 0.3791, + "step": 1916 + }, + { + "epoch": 0.3836, + "grad_norm": 5.865223407745361, + "learning_rate": 1.4496308166788731e-05, + "loss": 0.4633, + "step": 1918 + }, + { + "epoch": 0.384, + "grad_norm": 4.405348300933838, + "learning_rate": 1.4508775406894315e-05, + "loss": 0.1839, + "step": 1920 + }, + { + "epoch": 0.3844, + "grad_norm": 12.141889572143555, + "learning_rate": 1.4521233856911499e-05, + "loss": 0.4177, + "step": 1922 + }, + { + "epoch": 0.3848, + "grad_norm": 14.752460479736328, + "learning_rate": 1.4533683492551942e-05, + "loss": 0.6842, + "step": 1924 + }, + { + "epoch": 0.3852, + "grad_norm": 0.015086171217262745, + "learning_rate": 1.4546124289544446e-05, + "loss": 0.2763, + "step": 1926 + }, + { + "epoch": 0.3856, + "grad_norm": 9.205449104309082, + "learning_rate": 1.4558556223634988e-05, + "loss": 0.6257, + "step": 1928 + }, + { + "epoch": 0.386, + "grad_norm": 2.591710329055786, + "learning_rate": 1.4570979270586944e-05, + "loss": 0.5242, + "step": 1930 + }, + { + "epoch": 0.3864, + "grad_norm": 2.6265180110931396, + "learning_rate": 1.4583393406180886e-05, + "loss": 0.1673, + "step": 1932 + }, + { + "epoch": 0.3868, + "grad_norm": 14.033515930175781, + "learning_rate": 1.4595798606214882e-05, + "loss": 0.5549, + "step": 1934 + }, + { + "epoch": 0.3872, + "grad_norm": 5.748865604400635, + "learning_rate": 1.460819484650431e-05, + "loss": 0.4719, + "step": 1936 + }, + { + "epoch": 0.3876, + "grad_norm": 6.74936056137085, + "learning_rate": 1.4620582102882086e-05, + "loss": 0.5007, + "step": 1938 + }, + { + "epoch": 0.388, + "grad_norm": 5.440566539764404, + "learning_rate": 1.4632960351198618e-05, + "loss": 0.6025, + "step": 1940 + }, + { + "epoch": 0.3884, + "grad_norm": 3.586538553237915, + "learning_rate": 1.4645329567321875e-05, + "loss": 0.216, + "step": 1942 + }, + { + "epoch": 0.3888, + "grad_norm": 3.7532339096069336, + "learning_rate": 1.4657689727137441e-05, + "loss": 0.4005, + "step": 1944 + }, + { + "epoch": 0.3892, + "grad_norm": 9.638934135437012, + "learning_rate": 1.4670040806548551e-05, + "loss": 0.6614, + "step": 1946 + }, + { + "epoch": 0.3896, + "grad_norm": 4.399918079376221, + "learning_rate": 1.468238278147614e-05, + "loss": 0.2926, + "step": 1948 + }, + { + "epoch": 0.39, + "grad_norm": 13.854656219482422, + "learning_rate": 1.4694715627858904e-05, + "loss": 0.4408, + "step": 1950 + }, + { + "epoch": 0.3904, + "grad_norm": 5.668638229370117, + "learning_rate": 1.470703932165332e-05, + "loss": 0.6065, + "step": 1952 + }, + { + "epoch": 0.3908, + "grad_norm": 6.510517597198486, + "learning_rate": 1.471935383883372e-05, + "loss": 0.7406, + "step": 1954 + }, + { + "epoch": 0.3912, + "grad_norm": 7.37987756729126, + "learning_rate": 1.4731659155392339e-05, + "loss": 0.3055, + "step": 1956 + }, + { + "epoch": 0.3916, + "grad_norm": 1.8055506944656372, + "learning_rate": 1.4743955247339286e-05, + "loss": 0.3043, + "step": 1958 + }, + { + "epoch": 0.392, + "grad_norm": 8.169949531555176, + "learning_rate": 1.4756242090702744e-05, + "loss": 0.3113, + "step": 1960 + }, + { + "epoch": 0.3924, + "grad_norm": 9.140602111816406, + "learning_rate": 1.476851966152887e-05, + "loss": 0.4318, + "step": 1962 + }, + { + "epoch": 0.3928, + "grad_norm": 5.031211853027344, + "learning_rate": 1.4780787935881913e-05, + "loss": 0.4553, + "step": 1964 + }, + { + "epoch": 0.3932, + "grad_norm": 6.515687465667725, + "learning_rate": 1.4793046889844255e-05, + "loss": 0.4338, + "step": 1966 + }, + { + "epoch": 0.3936, + "grad_norm": 7.118400573730469, + "learning_rate": 1.4805296499516397e-05, + "loss": 0.2253, + "step": 1968 + }, + { + "epoch": 0.394, + "grad_norm": 5.103697299957275, + "learning_rate": 1.4817536741017155e-05, + "loss": 0.5717, + "step": 1970 + }, + { + "epoch": 0.3944, + "grad_norm": 3.105949878692627, + "learning_rate": 1.482976759048351e-05, + "loss": 0.4142, + "step": 1972 + }, + { + "epoch": 0.3948, + "grad_norm": 11.121064186096191, + "learning_rate": 1.4841989024070809e-05, + "loss": 0.8379, + "step": 1974 + }, + { + "epoch": 0.3952, + "grad_norm": 6.576791286468506, + "learning_rate": 1.485420101795274e-05, + "loss": 0.4328, + "step": 1976 + }, + { + "epoch": 0.3956, + "grad_norm": 4.423554420471191, + "learning_rate": 1.4866403548321385e-05, + "loss": 0.4373, + "step": 1978 + }, + { + "epoch": 0.396, + "grad_norm": 5.545256614685059, + "learning_rate": 1.4878596591387327e-05, + "loss": 0.5896, + "step": 1980 + }, + { + "epoch": 0.3964, + "grad_norm": 2.5055270195007324, + "learning_rate": 1.4890780123379563e-05, + "loss": 0.3574, + "step": 1982 + }, + { + "epoch": 0.3968, + "grad_norm": 9.164911270141602, + "learning_rate": 1.4902954120545686e-05, + "loss": 0.2799, + "step": 1984 + }, + { + "epoch": 0.3972, + "grad_norm": 2.9771347045898438, + "learning_rate": 1.491511855915187e-05, + "loss": 0.4272, + "step": 1986 + }, + { + "epoch": 0.3976, + "grad_norm": 2.9130985736846924, + "learning_rate": 1.4927273415482913e-05, + "loss": 0.2553, + "step": 1988 + }, + { + "epoch": 0.398, + "grad_norm": 6.9299798011779785, + "learning_rate": 1.4939418665842307e-05, + "loss": 0.5131, + "step": 1990 + }, + { + "epoch": 0.3984, + "grad_norm": 5.723151683807373, + "learning_rate": 1.4951554286552261e-05, + "loss": 0.191, + "step": 1992 + }, + { + "epoch": 0.3988, + "grad_norm": 2.7095632553100586, + "learning_rate": 1.4963680253953763e-05, + "loss": 0.4826, + "step": 1994 + }, + { + "epoch": 0.3992, + "grad_norm": 11.261700630187988, + "learning_rate": 1.4975796544406617e-05, + "loss": 0.5627, + "step": 1996 + }, + { + "epoch": 0.3996, + "grad_norm": 4.134109020233154, + "learning_rate": 1.49879031342895e-05, + "loss": 0.3046, + "step": 1998 + }, + { + "epoch": 0.4, + "grad_norm": 2.4128613471984863, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.3374, + "step": 2000 + }, + { + "epoch": 0.4004, + "grad_norm": 6.7849202156066895, + "learning_rate": 1.501208711795465e-05, + "loss": 0.3148, + "step": 2002 + }, + { + "epoch": 0.4008, + "grad_norm": 7.420376300811768, + "learning_rate": 1.502416446458897e-05, + "loss": 0.3817, + "step": 2004 + }, + { + "epoch": 0.4012, + "grad_norm": 7.906816482543945, + "learning_rate": 1.5036232016357613e-05, + "loss": 0.359, + "step": 2006 + }, + { + "epoch": 0.4016, + "grad_norm": 8.403003692626953, + "learning_rate": 1.5048289749734206e-05, + "loss": 0.3301, + "step": 2008 + }, + { + "epoch": 0.402, + "grad_norm": 6.352550029754639, + "learning_rate": 1.5060337641211642e-05, + "loss": 0.5726, + "step": 2010 + }, + { + "epoch": 0.4024, + "grad_norm": 5.943541526794434, + "learning_rate": 1.5072375667301895e-05, + "loss": 0.3013, + "step": 2012 + }, + { + "epoch": 0.4028, + "grad_norm": 4.388930320739746, + "learning_rate": 1.5084403804536214e-05, + "loss": 0.4391, + "step": 2014 + }, + { + "epoch": 0.4032, + "grad_norm": 6.8047943115234375, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.2734, + "step": 2016 + }, + { + "epoch": 0.4036, + "grad_norm": 9.448768615722656, + "learning_rate": 1.5108430318658597e-05, + "loss": 0.5655, + "step": 2018 + }, + { + "epoch": 0.404, + "grad_norm": 22.328603744506836, + "learning_rate": 1.5120428648705714e-05, + "loss": 1.2889, + "step": 2020 + }, + { + "epoch": 0.4044, + "grad_norm": 5.997337818145752, + "learning_rate": 1.513241699621517e-05, + "loss": 0.5213, + "step": 2022 + }, + { + "epoch": 0.4048, + "grad_norm": 7.162021636962891, + "learning_rate": 1.5144395337815064e-05, + "loss": 0.4265, + "step": 2024 + }, + { + "epoch": 0.4052, + "grad_norm": 4.528063774108887, + "learning_rate": 1.5156363650153008e-05, + "loss": 0.6424, + "step": 2026 + }, + { + "epoch": 0.4056, + "grad_norm": 4.883796691894531, + "learning_rate": 1.5168321909896166e-05, + "loss": 0.4472, + "step": 2028 + }, + { + "epoch": 0.406, + "grad_norm": 5.761818885803223, + "learning_rate": 1.51802700937313e-05, + "loss": 0.4092, + "step": 2030 + }, + { + "epoch": 0.4064, + "grad_norm": 2.273637294769287, + "learning_rate": 1.5192208178364808e-05, + "loss": 0.2664, + "step": 2032 + }, + { + "epoch": 0.4068, + "grad_norm": 12.147500038146973, + "learning_rate": 1.5204136140522792e-05, + "loss": 0.4563, + "step": 2034 + }, + { + "epoch": 0.4072, + "grad_norm": 10.824847221374512, + "learning_rate": 1.521605395695107e-05, + "loss": 0.6065, + "step": 2036 + }, + { + "epoch": 0.4076, + "grad_norm": 9.831000328063965, + "learning_rate": 1.522796160441526e-05, + "loss": 0.7557, + "step": 2038 + }, + { + "epoch": 0.408, + "grad_norm": 4.785565376281738, + "learning_rate": 1.5239859059700784e-05, + "loss": 0.4542, + "step": 2040 + }, + { + "epoch": 0.4084, + "grad_norm": 6.32716703414917, + "learning_rate": 1.5251746299612964e-05, + "loss": 0.6219, + "step": 2042 + }, + { + "epoch": 0.4088, + "grad_norm": 5.249875068664551, + "learning_rate": 1.526362330097697e-05, + "loss": 0.4897, + "step": 2044 + }, + { + "epoch": 0.4092, + "grad_norm": 5.055958271026611, + "learning_rate": 1.5275490040638044e-05, + "loss": 0.9734, + "step": 2046 + }, + { + "epoch": 0.4096, + "grad_norm": 7.1267409324646, + "learning_rate": 1.5287346495461322e-05, + "loss": 0.4388, + "step": 2048 + }, + { + "epoch": 0.41, + "grad_norm": 3.9413418769836426, + "learning_rate": 1.529919264233204e-05, + "loss": 0.2857, + "step": 2050 + }, + { + "epoch": 0.4104, + "grad_norm": 4.722209453582764, + "learning_rate": 1.531102845815557e-05, + "loss": 0.2586, + "step": 2052 + }, + { + "epoch": 0.4108, + "grad_norm": 2.7357285022735596, + "learning_rate": 1.5322853919857327e-05, + "loss": 0.6948, + "step": 2054 + }, + { + "epoch": 0.4112, + "grad_norm": 14.794729232788086, + "learning_rate": 1.5334669004383025e-05, + "loss": 0.6256, + "step": 2056 + }, + { + "epoch": 0.4116, + "grad_norm": 4.3584208488464355, + "learning_rate": 1.5346473688698514e-05, + "loss": 0.2439, + "step": 2058 + }, + { + "epoch": 0.412, + "grad_norm": 8.636346817016602, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.5606, + "step": 2060 + }, + { + "epoch": 0.4124, + "grad_norm": 5.573737621307373, + "learning_rate": 1.537005176466387e-05, + "loss": 0.4323, + "step": 2062 + }, + { + "epoch": 0.4128, + "grad_norm": 6.821908473968506, + "learning_rate": 1.5381825110347072e-05, + "loss": 0.2293, + "step": 2064 + }, + { + "epoch": 0.4132, + "grad_norm": 6.058467864990234, + "learning_rate": 1.539358796388683e-05, + "loss": 0.5925, + "step": 2066 + }, + { + "epoch": 0.4136, + "grad_norm": 2.4256718158721924, + "learning_rate": 1.540534030235087e-05, + "loss": 0.3887, + "step": 2068 + }, + { + "epoch": 0.414, + "grad_norm": 9.578333854675293, + "learning_rate": 1.5417082102827397e-05, + "loss": 0.2505, + "step": 2070 + }, + { + "epoch": 0.4144, + "grad_norm": 8.364020347595215, + "learning_rate": 1.542881334242517e-05, + "loss": 0.514, + "step": 2072 + }, + { + "epoch": 0.4148, + "grad_norm": 15.070087432861328, + "learning_rate": 1.5440533998273542e-05, + "loss": 0.7837, + "step": 2074 + }, + { + "epoch": 0.4152, + "grad_norm": 3.20394229888916, + "learning_rate": 1.5452244047522493e-05, + "loss": 0.6852, + "step": 2076 + }, + { + "epoch": 0.4156, + "grad_norm": 3.577606201171875, + "learning_rate": 1.54639434673427e-05, + "loss": 0.2459, + "step": 2078 + }, + { + "epoch": 0.416, + "grad_norm": 3.0422732830047607, + "learning_rate": 1.5475632234925495e-05, + "loss": 0.3009, + "step": 2080 + }, + { + "epoch": 0.4164, + "grad_norm": 36.01543426513672, + "learning_rate": 1.548731032748309e-05, + "loss": 1.1359, + "step": 2082 + }, + { + "epoch": 0.4168, + "grad_norm": 2.2395806312561035, + "learning_rate": 1.5498977722248388e-05, + "loss": 0.3906, + "step": 2084 + }, + { + "epoch": 0.4172, + "grad_norm": 10.182401657104492, + "learning_rate": 1.551063439647525e-05, + "loss": 0.4657, + "step": 2086 + }, + { + "epoch": 0.4176, + "grad_norm": 4.630568981170654, + "learning_rate": 1.552228032743839e-05, + "loss": 0.1533, + "step": 2088 + }, + { + "epoch": 0.418, + "grad_norm": 3.013482093811035, + "learning_rate": 1.553391549243343e-05, + "loss": 0.4822, + "step": 2090 + }, + { + "epoch": 0.4184, + "grad_norm": 7.339443683624268, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.4844, + "step": 2092 + }, + { + "epoch": 0.4188, + "grad_norm": 3.509808301925659, + "learning_rate": 1.5557153433806954e-05, + "loss": 0.8041, + "step": 2094 + }, + { + "epoch": 0.4192, + "grad_norm": 9.763633728027344, + "learning_rate": 1.556875616488188e-05, + "loss": 0.4354, + "step": 2096 + }, + { + "epoch": 0.4196, + "grad_norm": 8.563974380493164, + "learning_rate": 1.55803480393817e-05, + "loss": 0.6431, + "step": 2098 + }, + { + "epoch": 0.42, + "grad_norm": 3.1309869289398193, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.1233, + "step": 2100 + }, + { + "epoch": 0.4204, + "grad_norm": 2.6012301445007324, + "learning_rate": 1.5603499128281447e-05, + "loss": 0.3233, + "step": 2102 + }, + { + "epoch": 0.4208, + "grad_norm": 5.163684368133545, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.1732, + "step": 2104 + }, + { + "epoch": 0.4212, + "grad_norm": 7.482677936553955, + "learning_rate": 1.5626606519969366e-05, + "loss": 0.35, + "step": 2106 + }, + { + "epoch": 0.4216, + "grad_norm": 4.716777324676514, + "learning_rate": 1.5638143773034268e-05, + "loss": 0.2038, + "step": 2108 + }, + { + "epoch": 0.422, + "grad_norm": 5.784825801849365, + "learning_rate": 1.5649670034249376e-05, + "loss": 0.2587, + "step": 2110 + }, + { + "epoch": 0.4224, + "grad_norm": 3.057938814163208, + "learning_rate": 1.5661185281143663e-05, + "loss": 0.2455, + "step": 2112 + }, + { + "epoch": 0.4228, + "grad_norm": 2.635023832321167, + "learning_rate": 1.5672689491267562e-05, + "loss": 0.2468, + "step": 2114 + }, + { + "epoch": 0.4232, + "grad_norm": 9.296106338500977, + "learning_rate": 1.5684182642193024e-05, + "loss": 0.5883, + "step": 2116 + }, + { + "epoch": 0.4236, + "grad_norm": 6.580270290374756, + "learning_rate": 1.5695664711513582e-05, + "loss": 0.6287, + "step": 2118 + }, + { + "epoch": 0.424, + "grad_norm": 10.295584678649902, + "learning_rate": 1.5707135676844312e-05, + "loss": 0.3595, + "step": 2120 + }, + { + "epoch": 0.4244, + "grad_norm": 9.673364639282227, + "learning_rate": 1.5718595515822016e-05, + "loss": 0.5241, + "step": 2122 + }, + { + "epoch": 0.4248, + "grad_norm": 4.172829627990723, + "learning_rate": 1.5730044206105146e-05, + "loss": 0.3916, + "step": 2124 + }, + { + "epoch": 0.4252, + "grad_norm": 2.5463786125183105, + "learning_rate": 1.574148172537389e-05, + "loss": 0.3261, + "step": 2126 + }, + { + "epoch": 0.4256, + "grad_norm": 1.7832642793655396, + "learning_rate": 1.5752908051330232e-05, + "loss": 0.603, + "step": 2128 + }, + { + "epoch": 0.426, + "grad_norm": 3.359545946121216, + "learning_rate": 1.5764323161697923e-05, + "loss": 0.2101, + "step": 2130 + }, + { + "epoch": 0.4264, + "grad_norm": 1.9352971315383911, + "learning_rate": 1.577572703422268e-05, + "loss": 0.2304, + "step": 2132 + }, + { + "epoch": 0.4268, + "grad_norm": 4.446115970611572, + "learning_rate": 1.5787119646672025e-05, + "loss": 0.3564, + "step": 2134 + }, + { + "epoch": 0.4272, + "grad_norm": 2.3797149658203125, + "learning_rate": 1.579850097683548e-05, + "loss": 0.2883, + "step": 2136 + }, + { + "epoch": 0.4276, + "grad_norm": 5.991588592529297, + "learning_rate": 1.58098710025246e-05, + "loss": 0.3363, + "step": 2138 + }, + { + "epoch": 0.428, + "grad_norm": 3.6045079231262207, + "learning_rate": 1.582122970157288e-05, + "loss": 0.4972, + "step": 2140 + }, + { + "epoch": 0.4284, + "grad_norm": 5.545891284942627, + "learning_rate": 1.5832577051836016e-05, + "loss": 0.5499, + "step": 2142 + }, + { + "epoch": 0.4288, + "grad_norm": 9.711775779724121, + "learning_rate": 1.5843913031191722e-05, + "loss": 0.498, + "step": 2144 + }, + { + "epoch": 0.4292, + "grad_norm": 3.7357139587402344, + "learning_rate": 1.585523761753994e-05, + "loss": 0.3028, + "step": 2146 + }, + { + "epoch": 0.4296, + "grad_norm": 2.8902530670166016, + "learning_rate": 1.586655078880281e-05, + "loss": 0.3649, + "step": 2148 + }, + { + "epoch": 0.43, + "grad_norm": 4.512652397155762, + "learning_rate": 1.587785252292473e-05, + "loss": 0.2415, + "step": 2150 + }, + { + "epoch": 0.4304, + "grad_norm": 12.956253051757812, + "learning_rate": 1.5889142797872383e-05, + "loss": 0.8148, + "step": 2152 + }, + { + "epoch": 0.4308, + "grad_norm": 14.486699104309082, + "learning_rate": 1.5900421591634806e-05, + "loss": 0.9024, + "step": 2154 + }, + { + "epoch": 0.4312, + "grad_norm": 1.4882330894470215, + "learning_rate": 1.5911688882223415e-05, + "loss": 0.5023, + "step": 2156 + }, + { + "epoch": 0.4316, + "grad_norm": 10.415624618530273, + "learning_rate": 1.5922944647672044e-05, + "loss": 0.4608, + "step": 2158 + }, + { + "epoch": 0.432, + "grad_norm": 3.1366078853607178, + "learning_rate": 1.5934188866037007e-05, + "loss": 0.2516, + "step": 2160 + }, + { + "epoch": 0.4324, + "grad_norm": 3.5895566940307617, + "learning_rate": 1.5945421515397125e-05, + "loss": 0.3878, + "step": 2162 + }, + { + "epoch": 0.4328, + "grad_norm": 5.506830215454102, + "learning_rate": 1.5956642573853787e-05, + "loss": 0.3852, + "step": 2164 + }, + { + "epoch": 0.4332, + "grad_norm": 3.433748960494995, + "learning_rate": 1.5967852019530918e-05, + "loss": 0.3481, + "step": 2166 + }, + { + "epoch": 0.4336, + "grad_norm": 15.233755111694336, + "learning_rate": 1.5979049830575193e-05, + "loss": 0.6416, + "step": 2168 + }, + { + "epoch": 0.434, + "grad_norm": 10.793618202209473, + "learning_rate": 1.599023598515585e-05, + "loss": 0.9893, + "step": 2170 + }, + { + "epoch": 0.4344, + "grad_norm": 6.2497758865356445, + "learning_rate": 1.6001410461464945e-05, + "loss": 0.4625, + "step": 2172 + }, + { + "epoch": 0.4348, + "grad_norm": 3.3467254638671875, + "learning_rate": 1.601257323771727e-05, + "loss": 0.3161, + "step": 2174 + }, + { + "epoch": 0.4352, + "grad_norm": 4.4151787757873535, + "learning_rate": 1.6023724292150377e-05, + "loss": 0.2926, + "step": 2176 + }, + { + "epoch": 0.4356, + "grad_norm": 3.4813687801361084, + "learning_rate": 1.6034863603024768e-05, + "loss": 0.2681, + "step": 2178 + }, + { + "epoch": 0.436, + "grad_norm": 4.516116142272949, + "learning_rate": 1.604599114862375e-05, + "loss": 0.5959, + "step": 2180 + }, + { + "epoch": 0.4364, + "grad_norm": 16.976716995239258, + "learning_rate": 1.6057106907253614e-05, + "loss": 0.7174, + "step": 2182 + }, + { + "epoch": 0.4368, + "grad_norm": 10.261494636535645, + "learning_rate": 1.606821085724362e-05, + "loss": 0.5698, + "step": 2184 + }, + { + "epoch": 0.4372, + "grad_norm": 8.318699836730957, + "learning_rate": 1.6079302976946052e-05, + "loss": 0.6698, + "step": 2186 + }, + { + "epoch": 0.4376, + "grad_norm": 4.550652503967285, + "learning_rate": 1.6090383244736253e-05, + "loss": 0.3535, + "step": 2188 + }, + { + "epoch": 0.438, + "grad_norm": 3.655412197113037, + "learning_rate": 1.6101451639012675e-05, + "loss": 0.2939, + "step": 2190 + }, + { + "epoch": 0.4384, + "grad_norm": 3.4037415981292725, + "learning_rate": 1.6112508138196912e-05, + "loss": 0.3349, + "step": 2192 + }, + { + "epoch": 0.4388, + "grad_norm": 6.7535080909729, + "learning_rate": 1.6123552720733763e-05, + "loss": 0.4121, + "step": 2194 + }, + { + "epoch": 0.4392, + "grad_norm": 4.163005352020264, + "learning_rate": 1.613458536509124e-05, + "loss": 0.2923, + "step": 2196 + }, + { + "epoch": 0.4396, + "grad_norm": 1.9470465183258057, + "learning_rate": 1.614560604976064e-05, + "loss": 0.4083, + "step": 2198 + }, + { + "epoch": 0.44, + "grad_norm": 2.5843069553375244, + "learning_rate": 1.615661475325658e-05, + "loss": 0.4033, + "step": 2200 + }, + { + "epoch": 0.4404, + "grad_norm": 10.48601245880127, + "learning_rate": 1.616761145411702e-05, + "loss": 0.6074, + "step": 2202 + }, + { + "epoch": 0.4408, + "grad_norm": 4.889411926269531, + "learning_rate": 1.6178596130903352e-05, + "loss": 0.305, + "step": 2204 + }, + { + "epoch": 0.4412, + "grad_norm": 9.178106307983398, + "learning_rate": 1.618956876220034e-05, + "loss": 0.3805, + "step": 2206 + }, + { + "epoch": 0.4416, + "grad_norm": 2.855574369430542, + "learning_rate": 1.620052932661632e-05, + "loss": 0.3824, + "step": 2208 + }, + { + "epoch": 0.442, + "grad_norm": 7.788348197937012, + "learning_rate": 1.621147780278311e-05, + "loss": 0.3601, + "step": 2210 + }, + { + "epoch": 0.4424, + "grad_norm": 7.214420795440674, + "learning_rate": 1.6222414169356056e-05, + "loss": 0.3262, + "step": 2212 + }, + { + "epoch": 0.4428, + "grad_norm": 2.3268818855285645, + "learning_rate": 1.6233338405014204e-05, + "loss": 0.236, + "step": 2214 + }, + { + "epoch": 0.4432, + "grad_norm": 5.05888557434082, + "learning_rate": 1.6244250488460146e-05, + "loss": 0.3214, + "step": 2216 + }, + { + "epoch": 0.4436, + "grad_norm": 4.471818447113037, + "learning_rate": 1.6255150398420273e-05, + "loss": 0.3067, + "step": 2218 + }, + { + "epoch": 0.444, + "grad_norm": 1.8080766201019287, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.2934, + "step": 2220 + }, + { + "epoch": 0.4444, + "grad_norm": 3.8794491291046143, + "learning_rate": 1.6276913612907005e-05, + "loss": 0.5317, + "step": 2222 + }, + { + "epoch": 0.4448, + "grad_norm": 4.7862935066223145, + "learning_rate": 1.6287776875005127e-05, + "loss": 0.2927, + "step": 2224 + }, + { + "epoch": 0.4452, + "grad_norm": 7.168172359466553, + "learning_rate": 1.6298627878760488e-05, + "loss": 0.3261, + "step": 2226 + }, + { + "epoch": 0.4456, + "grad_norm": 3.2497003078460693, + "learning_rate": 1.6309466603018497e-05, + "loss": 0.2934, + "step": 2228 + }, + { + "epoch": 0.446, + "grad_norm": 28.85128402709961, + "learning_rate": 1.6320293026648508e-05, + "loss": 1.527, + "step": 2230 + }, + { + "epoch": 0.4464, + "grad_norm": 8.169844627380371, + "learning_rate": 1.6331107128543856e-05, + "loss": 0.2719, + "step": 2232 + }, + { + "epoch": 0.4468, + "grad_norm": 1.612769365310669, + "learning_rate": 1.634190888762189e-05, + "loss": 0.3074, + "step": 2234 + }, + { + "epoch": 0.4472, + "grad_norm": 2.049543619155884, + "learning_rate": 1.635269828282404e-05, + "loss": 0.2926, + "step": 2236 + }, + { + "epoch": 0.4476, + "grad_norm": 8.33244514465332, + "learning_rate": 1.6363475293115818e-05, + "loss": 0.3862, + "step": 2238 + }, + { + "epoch": 0.448, + "grad_norm": 3.5483319759368896, + "learning_rate": 1.6374239897486905e-05, + "loss": 0.2899, + "step": 2240 + }, + { + "epoch": 0.4484, + "grad_norm": 5.057257652282715, + "learning_rate": 1.6384992074951118e-05, + "loss": 0.4239, + "step": 2242 + }, + { + "epoch": 0.4488, + "grad_norm": 2.693240165710449, + "learning_rate": 1.6395731804546575e-05, + "loss": 0.8502, + "step": 2244 + }, + { + "epoch": 0.4492, + "grad_norm": 4.820684909820557, + "learning_rate": 1.640645906533561e-05, + "loss": 0.6494, + "step": 2246 + }, + { + "epoch": 0.4496, + "grad_norm": 7.762986183166504, + "learning_rate": 1.6417173836404878e-05, + "loss": 0.7493, + "step": 2248 + }, + { + "epoch": 0.45, + "grad_norm": 2.582279920578003, + "learning_rate": 1.6427876096865397e-05, + "loss": 0.307, + "step": 2250 + }, + { + "epoch": 0.4504, + "grad_norm": 10.963881492614746, + "learning_rate": 1.643856582585253e-05, + "loss": 0.9635, + "step": 2252 + }, + { + "epoch": 0.4508, + "grad_norm": 5.264702320098877, + "learning_rate": 1.6449243002526146e-05, + "loss": 0.2561, + "step": 2254 + }, + { + "epoch": 0.4512, + "grad_norm": 8.203911781311035, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.3327, + "step": 2256 + }, + { + "epoch": 0.4516, + "grad_norm": 8.469683647155762, + "learning_rate": 1.6470559615694445e-05, + "loss": 0.607, + "step": 2258 + }, + { + "epoch": 0.452, + "grad_norm": 4.850290298461914, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.465, + "step": 2260 + }, + { + "epoch": 0.4524, + "grad_norm": 1.9697999954223633, + "learning_rate": 1.649182577013905e-05, + "loss": 0.3105, + "step": 2262 + }, + { + "epoch": 0.4528, + "grad_norm": 3.9100747108459473, + "learning_rate": 1.650243987350029e-05, + "loss": 0.3069, + "step": 2264 + }, + { + "epoch": 0.4532, + "grad_norm": 5.264925956726074, + "learning_rate": 1.6513041300022253e-05, + "loss": 0.765, + "step": 2266 + }, + { + "epoch": 0.4536, + "grad_norm": 3.58918833732605, + "learning_rate": 1.652363002903693e-05, + "loss": 0.3043, + "step": 2268 + }, + { + "epoch": 0.454, + "grad_norm": 4.738314628601074, + "learning_rate": 1.6534206039901054e-05, + "loss": 0.6025, + "step": 2270 + }, + { + "epoch": 0.4544, + "grad_norm": 2.7000856399536133, + "learning_rate": 1.6544769311996146e-05, + "loss": 0.3375, + "step": 2272 + }, + { + "epoch": 0.4548, + "grad_norm": 3.467404842376709, + "learning_rate": 1.655531982472857e-05, + "loss": 0.7879, + "step": 2274 + }, + { + "epoch": 0.4552, + "grad_norm": 11.088202476501465, + "learning_rate": 1.656585755752956e-05, + "loss": 0.554, + "step": 2276 + }, + { + "epoch": 0.4556, + "grad_norm": 3.882566213607788, + "learning_rate": 1.657638248985527e-05, + "loss": 0.7535, + "step": 2278 + }, + { + "epoch": 0.456, + "grad_norm": 4.153404712677002, + "learning_rate": 1.65868946011868e-05, + "loss": 0.3571, + "step": 2280 + }, + { + "epoch": 0.4564, + "grad_norm": 2.542940139770508, + "learning_rate": 1.6597393871030257e-05, + "loss": 0.2516, + "step": 2282 + }, + { + "epoch": 0.4568, + "grad_norm": 8.220292091369629, + "learning_rate": 1.660788027891677e-05, + "loss": 0.4969, + "step": 2284 + }, + { + "epoch": 0.4572, + "grad_norm": 9.40221881866455, + "learning_rate": 1.6618353804402573e-05, + "loss": 0.7102, + "step": 2286 + }, + { + "epoch": 0.4576, + "grad_norm": 4.67323637008667, + "learning_rate": 1.6628814427068944e-05, + "loss": 0.4013, + "step": 2288 + }, + { + "epoch": 0.458, + "grad_norm": 2.2188870906829834, + "learning_rate": 1.663926212652242e-05, + "loss": 0.3938, + "step": 2290 + }, + { + "epoch": 0.4584, + "grad_norm": 1.3302007913589478, + "learning_rate": 1.6649696882394625e-05, + "loss": 0.2954, + "step": 2292 + }, + { + "epoch": 0.4588, + "grad_norm": 2.350270986557007, + "learning_rate": 1.666011867434252e-05, + "loss": 0.1801, + "step": 2294 + }, + { + "epoch": 0.4592, + "grad_norm": 5.000794887542725, + "learning_rate": 1.667052748204825e-05, + "loss": 0.4042, + "step": 2296 + }, + { + "epoch": 0.4596, + "grad_norm": 2.4424123764038086, + "learning_rate": 1.6680923285219308e-05, + "loss": 0.3843, + "step": 2298 + }, + { + "epoch": 0.46, + "grad_norm": 9.159187316894531, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.5436, + "step": 2300 + }, + { + "epoch": 0.4604, + "grad_norm": 8.516073226928711, + "learning_rate": 1.6701675796914273e-05, + "loss": 0.7537, + "step": 2302 + }, + { + "epoch": 0.4608, + "grad_norm": 8.398138046264648, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.6558, + "step": 2304 + }, + { + "epoch": 0.4612, + "grad_norm": 5.5033698081970215, + "learning_rate": 1.672237604759516e-05, + "loss": 0.2474, + "step": 2306 + }, + { + "epoch": 0.4616, + "grad_norm": 4.244164943695068, + "learning_rate": 1.6732706524594138e-05, + "loss": 0.4633, + "step": 2308 + }, + { + "epoch": 0.462, + "grad_norm": 10.104329109191895, + "learning_rate": 1.6743023875837233e-05, + "loss": 0.6139, + "step": 2310 + }, + { + "epoch": 0.4624, + "grad_norm": 11.556519508361816, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.4674, + "step": 2312 + }, + { + "epoch": 0.4628, + "grad_norm": 5.027859687805176, + "learning_rate": 1.6763619120624592e-05, + "loss": 0.2181, + "step": 2314 + }, + { + "epoch": 0.4632, + "grad_norm": 2.4473464488983154, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.426, + "step": 2316 + }, + { + "epoch": 0.4636, + "grad_norm": 2.1608505249023438, + "learning_rate": 1.6784161621351377e-05, + "loss": 0.3518, + "step": 2318 + }, + { + "epoch": 0.464, + "grad_norm": 5.581315517425537, + "learning_rate": 1.679441304261516e-05, + "loss": 0.353, + "step": 2320 + }, + { + "epoch": 0.4644, + "grad_norm": 4.199415683746338, + "learning_rate": 1.6804651217823048e-05, + "loss": 0.1983, + "step": 2322 + }, + { + "epoch": 0.4648, + "grad_norm": 3.3914458751678467, + "learning_rate": 1.681487612701519e-05, + "loss": 0.2949, + "step": 2324 + }, + { + "epoch": 0.4652, + "grad_norm": 12.291478157043457, + "learning_rate": 1.6825087750257624e-05, + "loss": 0.4944, + "step": 2326 + }, + { + "epoch": 0.4656, + "grad_norm": 9.627623558044434, + "learning_rate": 1.683528606764222e-05, + "loss": 0.4872, + "step": 2328 + }, + { + "epoch": 0.466, + "grad_norm": 8.11014461517334, + "learning_rate": 1.6845471059286893e-05, + "loss": 0.452, + "step": 2330 + }, + { + "epoch": 0.4664, + "grad_norm": 8.255258560180664, + "learning_rate": 1.6855642705335428e-05, + "loss": 0.4115, + "step": 2332 + }, + { + "epoch": 0.4668, + "grad_norm": 3.0434067249298096, + "learning_rate": 1.6865800985957718e-05, + "loss": 0.4524, + "step": 2334 + }, + { + "epoch": 0.4672, + "grad_norm": 7.0328216552734375, + "learning_rate": 1.687594588134968e-05, + "loss": 0.4629, + "step": 2336 + }, + { + "epoch": 0.4676, + "grad_norm": 2.880730628967285, + "learning_rate": 1.6886077371733275e-05, + "loss": 0.4103, + "step": 2338 + }, + { + "epoch": 0.468, + "grad_norm": 11.14397144317627, + "learning_rate": 1.68961954373567e-05, + "loss": 0.4304, + "step": 2340 + }, + { + "epoch": 0.4684, + "grad_norm": 6.017214298248291, + "learning_rate": 1.690630005849423e-05, + "loss": 0.4081, + "step": 2342 + }, + { + "epoch": 0.4688, + "grad_norm": 2.7355451583862305, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.2128, + "step": 2344 + }, + { + "epoch": 0.4692, + "grad_norm": 16.538043975830078, + "learning_rate": 1.6926468888539988e-05, + "loss": 0.4025, + "step": 2346 + }, + { + "epoch": 0.4696, + "grad_norm": 8.546823501586914, + "learning_rate": 1.693653305812805e-05, + "loss": 0.4722, + "step": 2348 + }, + { + "epoch": 0.47, + "grad_norm": 4.708355903625488, + "learning_rate": 1.6946583704589973e-05, + "loss": 0.7108, + "step": 2350 + }, + { + "epoch": 0.4704, + "grad_norm": 6.91795539855957, + "learning_rate": 1.6956620808331505e-05, + "loss": 0.6823, + "step": 2352 + }, + { + "epoch": 0.4708, + "grad_norm": 3.408494234085083, + "learning_rate": 1.6966644349784805e-05, + "loss": 0.7008, + "step": 2354 + }, + { + "epoch": 0.4712, + "grad_norm": 6.076896667480469, + "learning_rate": 1.697665430940846e-05, + "loss": 0.5413, + "step": 2356 + }, + { + "epoch": 0.4716, + "grad_norm": 13.677722930908203, + "learning_rate": 1.698665066768755e-05, + "loss": 0.9742, + "step": 2358 + }, + { + "epoch": 0.472, + "grad_norm": 9.455607414245605, + "learning_rate": 1.699663340513365e-05, + "loss": 0.7988, + "step": 2360 + }, + { + "epoch": 0.4724, + "grad_norm": 11.917442321777344, + "learning_rate": 1.7006602502284913e-05, + "loss": 0.4884, + "step": 2362 + }, + { + "epoch": 0.4728, + "grad_norm": 2.6096956729888916, + "learning_rate": 1.7016557939706068e-05, + "loss": 0.1629, + "step": 2364 + }, + { + "epoch": 0.4732, + "grad_norm": 9.840989112854004, + "learning_rate": 1.70264996979885e-05, + "loss": 0.4899, + "step": 2366 + }, + { + "epoch": 0.4736, + "grad_norm": 3.0129201412200928, + "learning_rate": 1.7036427757750198e-05, + "loss": 0.3226, + "step": 2368 + }, + { + "epoch": 0.474, + "grad_norm": 3.275906562805176, + "learning_rate": 1.7046342099635938e-05, + "loss": 0.3514, + "step": 2370 + }, + { + "epoch": 0.4744, + "grad_norm": 4.834837913513184, + "learning_rate": 1.7056242704317212e-05, + "loss": 0.3345, + "step": 2372 + }, + { + "epoch": 0.4748, + "grad_norm": 7.957798480987549, + "learning_rate": 1.706612955249224e-05, + "loss": 0.393, + "step": 2374 + }, + { + "epoch": 0.4752, + "grad_norm": 4.971512317657471, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.4437, + "step": 2376 + }, + { + "epoch": 0.4756, + "grad_norm": 5.616350173950195, + "learning_rate": 1.708586190225085e-05, + "loss": 0.382, + "step": 2378 + }, + { + "epoch": 0.476, + "grad_norm": 10.651932716369629, + "learning_rate": 1.709570736536521e-05, + "loss": 0.4602, + "step": 2380 + }, + { + "epoch": 0.4764, + "grad_norm": 0.5972671508789062, + "learning_rate": 1.710553899503496e-05, + "loss": 0.1318, + "step": 2382 + }, + { + "epoch": 0.4768, + "grad_norm": 14.442639350891113, + "learning_rate": 1.7115356772092844e-05, + "loss": 0.9383, + "step": 2384 + }, + { + "epoch": 0.4772, + "grad_norm": 4.869437217712402, + "learning_rate": 1.7125160677398625e-05, + "loss": 0.2414, + "step": 2386 + }, + { + "epoch": 0.4776, + "grad_norm": 4.88511323928833, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.3726, + "step": 2388 + }, + { + "epoch": 0.478, + "grad_norm": 6.475824356079102, + "learning_rate": 1.7144726796328034e-05, + "loss": 0.5816, + "step": 2390 + }, + { + "epoch": 0.4784, + "grad_norm": 6.099315643310547, + "learning_rate": 1.7154488971806518e-05, + "loss": 0.6029, + "step": 2392 + }, + { + "epoch": 0.4788, + "grad_norm": 24.582992553710938, + "learning_rate": 1.716423719924266e-05, + "loss": 1.1377, + "step": 2394 + }, + { + "epoch": 0.4792, + "grad_norm": 18.59328269958496, + "learning_rate": 1.7173971459631783e-05, + "loss": 0.5312, + "step": 2396 + }, + { + "epoch": 0.4796, + "grad_norm": 2.513953447341919, + "learning_rate": 1.718369173399646e-05, + "loss": 0.2868, + "step": 2398 + }, + { + "epoch": 0.48, + "grad_norm": 3.8463456630706787, + "learning_rate": 1.7193398003386507e-05, + "loss": 0.4212, + "step": 2400 + }, + { + "epoch": 0.4804, + "grad_norm": 9.140926361083984, + "learning_rate": 1.7203090248879063e-05, + "loss": 0.6535, + "step": 2402 + }, + { + "epoch": 0.4808, + "grad_norm": 3.3709464073181152, + "learning_rate": 1.7212768451578602e-05, + "loss": 0.3, + "step": 2404 + }, + { + "epoch": 0.4812, + "grad_norm": 5.493853569030762, + "learning_rate": 1.7222432592616963e-05, + "loss": 0.3156, + "step": 2406 + }, + { + "epoch": 0.4816, + "grad_norm": 2.464963912963867, + "learning_rate": 1.7232082653153416e-05, + "loss": 0.4658, + "step": 2408 + }, + { + "epoch": 0.482, + "grad_norm": 5.530620098114014, + "learning_rate": 1.724171861437467e-05, + "loss": 0.3428, + "step": 2410 + }, + { + "epoch": 0.4824, + "grad_norm": 3.1582396030426025, + "learning_rate": 1.7251340457494937e-05, + "loss": 0.3549, + "step": 2412 + }, + { + "epoch": 0.4828, + "grad_norm": 6.625899791717529, + "learning_rate": 1.726094816375591e-05, + "loss": 0.3574, + "step": 2414 + }, + { + "epoch": 0.4832, + "grad_norm": 6.3816237449646, + "learning_rate": 1.7270541714426923e-05, + "loss": 0.34, + "step": 2416 + }, + { + "epoch": 0.4836, + "grad_norm": 8.685528755187988, + "learning_rate": 1.7280121090804817e-05, + "loss": 0.4107, + "step": 2418 + }, + { + "epoch": 0.484, + "grad_norm": 2.579235315322876, + "learning_rate": 1.7289686274214106e-05, + "loss": 0.43, + "step": 2420 + }, + { + "epoch": 0.4844, + "grad_norm": 8.475749015808105, + "learning_rate": 1.7299237246007018e-05, + "loss": 0.6159, + "step": 2422 + }, + { + "epoch": 0.4848, + "grad_norm": 5.168679237365723, + "learning_rate": 1.7308773987563393e-05, + "loss": 0.4367, + "step": 2424 + }, + { + "epoch": 0.4852, + "grad_norm": 6.76646089553833, + "learning_rate": 1.7318296480290912e-05, + "loss": 0.6195, + "step": 2426 + }, + { + "epoch": 0.4856, + "grad_norm": 7.8504438400268555, + "learning_rate": 1.732780470562496e-05, + "loss": 0.518, + "step": 2428 + }, + { + "epoch": 0.486, + "grad_norm": 9.22918701171875, + "learning_rate": 1.7337298645028764e-05, + "loss": 0.5375, + "step": 2430 + }, + { + "epoch": 0.4864, + "grad_norm": 10.052725791931152, + "learning_rate": 1.7346778279993413e-05, + "loss": 0.7948, + "step": 2432 + }, + { + "epoch": 0.4868, + "grad_norm": 8.978928565979004, + "learning_rate": 1.7356243592037872e-05, + "loss": 0.5339, + "step": 2434 + }, + { + "epoch": 0.4872, + "grad_norm": 6.191204071044922, + "learning_rate": 1.736569456270903e-05, + "loss": 0.4434, + "step": 2436 + }, + { + "epoch": 0.4876, + "grad_norm": 2.48982834815979, + "learning_rate": 1.7375131173581737e-05, + "loss": 0.6438, + "step": 2438 + }, + { + "epoch": 0.488, + "grad_norm": 2.546194314956665, + "learning_rate": 1.7384553406258836e-05, + "loss": 0.391, + "step": 2440 + }, + { + "epoch": 0.4884, + "grad_norm": 6.114248752593994, + "learning_rate": 1.73939612423712e-05, + "loss": 0.2758, + "step": 2442 + }, + { + "epoch": 0.4888, + "grad_norm": 2.2946407794952393, + "learning_rate": 1.740335466357778e-05, + "loss": 0.379, + "step": 2444 + }, + { + "epoch": 0.4892, + "grad_norm": 2.8928380012512207, + "learning_rate": 1.7412733651565607e-05, + "loss": 0.3516, + "step": 2446 + }, + { + "epoch": 0.4896, + "grad_norm": 5.041824817657471, + "learning_rate": 1.7422098188049888e-05, + "loss": 0.3615, + "step": 2448 + }, + { + "epoch": 0.49, + "grad_norm": 6.294063568115234, + "learning_rate": 1.7431448254773936e-05, + "loss": 0.4659, + "step": 2450 + }, + { + "epoch": 0.4904, + "grad_norm": 2.193105697631836, + "learning_rate": 1.7440783833509373e-05, + "loss": 0.3363, + "step": 2452 + }, + { + "epoch": 0.4908, + "grad_norm": 5.492352485656738, + "learning_rate": 1.7450104906055956e-05, + "loss": 0.3097, + "step": 2454 + }, + { + "epoch": 0.4912, + "grad_norm": 17.08367156982422, + "learning_rate": 1.7459411454241816e-05, + "loss": 0.9212, + "step": 2456 + }, + { + "epoch": 0.4916, + "grad_norm": 10.389326095581055, + "learning_rate": 1.746870345992336e-05, + "loss": 0.4955, + "step": 2458 + }, + { + "epoch": 0.492, + "grad_norm": 11.219281196594238, + "learning_rate": 1.747798090498531e-05, + "loss": 0.4819, + "step": 2460 + }, + { + "epoch": 0.4924, + "grad_norm": 12.481225967407227, + "learning_rate": 1.7487243771340865e-05, + "loss": 0.4673, + "step": 2462 + }, + { + "epoch": 0.4928, + "grad_norm": 4.9078145027160645, + "learning_rate": 1.749649204093154e-05, + "loss": 0.3109, + "step": 2464 + }, + { + "epoch": 0.4932, + "grad_norm": 2.613521099090576, + "learning_rate": 1.750572569572741e-05, + "loss": 0.3695, + "step": 2466 + }, + { + "epoch": 0.4936, + "grad_norm": 2.5333502292633057, + "learning_rate": 1.7514944717726962e-05, + "loss": 0.5431, + "step": 2468 + }, + { + "epoch": 0.494, + "grad_norm": 2.3544058799743652, + "learning_rate": 1.7524149088957244e-05, + "loss": 0.5761, + "step": 2470 + }, + { + "epoch": 0.4944, + "grad_norm": 2.2148303985595703, + "learning_rate": 1.753333879147387e-05, + "loss": 0.2572, + "step": 2472 + }, + { + "epoch": 0.4948, + "grad_norm": 9.49681568145752, + "learning_rate": 1.7542513807361037e-05, + "loss": 0.5686, + "step": 2474 + }, + { + "epoch": 0.4952, + "grad_norm": 2.319242477416992, + "learning_rate": 1.755167411873159e-05, + "loss": 0.4591, + "step": 2476 + }, + { + "epoch": 0.4956, + "grad_norm": 2.543788433074951, + "learning_rate": 1.7560819707727027e-05, + "loss": 0.3262, + "step": 2478 + }, + { + "epoch": 0.496, + "grad_norm": 4.393068790435791, + "learning_rate": 1.7569950556517563e-05, + "loss": 0.5055, + "step": 2480 + }, + { + "epoch": 0.4964, + "grad_norm": 1.849585771560669, + "learning_rate": 1.757906664730213e-05, + "loss": 0.2422, + "step": 2482 + }, + { + "epoch": 0.4968, + "grad_norm": 2.7112886905670166, + "learning_rate": 1.758816796230845e-05, + "loss": 0.3917, + "step": 2484 + }, + { + "epoch": 0.4972, + "grad_norm": 6.303567409515381, + "learning_rate": 1.759725448379304e-05, + "loss": 0.517, + "step": 2486 + }, + { + "epoch": 0.4976, + "grad_norm": 5.610179901123047, + "learning_rate": 1.7606326194041278e-05, + "loss": 0.5523, + "step": 2488 + }, + { + "epoch": 0.498, + "grad_norm": 1.9083210229873657, + "learning_rate": 1.7615383075367363e-05, + "loss": 0.6523, + "step": 2490 + }, + { + "epoch": 0.4984, + "grad_norm": 9.593994140625, + "learning_rate": 1.762442511011447e-05, + "loss": 0.4598, + "step": 2492 + }, + { + "epoch": 0.4988, + "grad_norm": 9.98492431640625, + "learning_rate": 1.763345228065469e-05, + "loss": 0.6768, + "step": 2494 + }, + { + "epoch": 0.4992, + "grad_norm": 2.8457977771759033, + "learning_rate": 1.7642464569389083e-05, + "loss": 0.3641, + "step": 2496 + }, + { + "epoch": 0.4996, + "grad_norm": 10.847461700439453, + "learning_rate": 1.7651461958747745e-05, + "loss": 0.598, + "step": 2498 + }, + { + "epoch": 0.5, + "grad_norm": 1.6359158754348755, + "learning_rate": 1.766044443118977e-05, + "loss": 0.2487, + "step": 2500 + }, + { + "epoch": 0.5004, + "grad_norm": 8.631575584411621, + "learning_rate": 1.766941196920342e-05, + "loss": 0.5534, + "step": 2502 + }, + { + "epoch": 0.5008, + "grad_norm": 6.8842949867248535, + "learning_rate": 1.767836455530598e-05, + "loss": 0.6863, + "step": 2504 + }, + { + "epoch": 0.5012, + "grad_norm": 9.503893852233887, + "learning_rate": 1.7687302172043933e-05, + "loss": 0.5418, + "step": 2506 + }, + { + "epoch": 0.5016, + "grad_norm": 2.576995849609375, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.2714, + "step": 2508 + }, + { + "epoch": 0.502, + "grad_norm": 2.4463255405426025, + "learning_rate": 1.7705132427757885e-05, + "loss": 0.3489, + "step": 2510 + }, + { + "epoch": 0.5024, + "grad_norm": 10.98027515411377, + "learning_rate": 1.77140250319729e-05, + "loss": 0.6269, + "step": 2512 + }, + { + "epoch": 0.5028, + "grad_norm": 4.933207035064697, + "learning_rate": 1.7722902597301385e-05, + "loss": 0.3591, + "step": 2514 + }, + { + "epoch": 0.5032, + "grad_norm": 6.317653179168701, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.2363, + "step": 2516 + }, + { + "epoch": 0.5036, + "grad_norm": 3.7932164669036865, + "learning_rate": 1.774061254209905e-05, + "loss": 0.26, + "step": 2518 + }, + { + "epoch": 0.504, + "grad_norm": 6.313023567199707, + "learning_rate": 1.7749444887041793e-05, + "loss": 0.2885, + "step": 2520 + }, + { + "epoch": 0.5044, + "grad_norm": 12.174040794372559, + "learning_rate": 1.7758262124045192e-05, + "loss": 0.5835, + "step": 2522 + }, + { + "epoch": 0.5048, + "grad_norm": 2.4631240367889404, + "learning_rate": 1.776706423591959e-05, + "loss": 0.3414, + "step": 2524 + }, + { + "epoch": 0.5052, + "grad_norm": 5.14389181137085, + "learning_rate": 1.7775851205504816e-05, + "loss": 0.3176, + "step": 2526 + }, + { + "epoch": 0.5056, + "grad_norm": 1.9069722890853882, + "learning_rate": 1.778462301567023e-05, + "loss": 0.2083, + "step": 2528 + }, + { + "epoch": 0.506, + "grad_norm": 1.9987138509750366, + "learning_rate": 1.7793379649314736e-05, + "loss": 0.2987, + "step": 2530 + }, + { + "epoch": 0.5064, + "grad_norm": 4.762056827545166, + "learning_rate": 1.7802121089366832e-05, + "loss": 0.3167, + "step": 2532 + }, + { + "epoch": 0.5068, + "grad_norm": 7.858055591583252, + "learning_rate": 1.7810847318784635e-05, + "loss": 0.3472, + "step": 2534 + }, + { + "epoch": 0.5072, + "grad_norm": 2.8657965660095215, + "learning_rate": 1.7819558320555895e-05, + "loss": 0.2441, + "step": 2536 + }, + { + "epoch": 0.5076, + "grad_norm": 3.419342041015625, + "learning_rate": 1.7828254077698103e-05, + "loss": 0.3875, + "step": 2538 + }, + { + "epoch": 0.508, + "grad_norm": 10.446057319641113, + "learning_rate": 1.7836934573258392e-05, + "loss": 0.4137, + "step": 2540 + }, + { + "epoch": 0.5084, + "grad_norm": 22.94832992553711, + "learning_rate": 1.7845599790313735e-05, + "loss": 1.1479, + "step": 2542 + }, + { + "epoch": 0.5088, + "grad_norm": 13.496355056762695, + "learning_rate": 1.785424971197082e-05, + "loss": 0.4934, + "step": 2544 + }, + { + "epoch": 0.5092, + "grad_norm": 7.352383613586426, + "learning_rate": 1.786288432136618e-05, + "loss": 0.1511, + "step": 2546 + }, + { + "epoch": 0.5096, + "grad_norm": 2.632784366607666, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.2947, + "step": 2548 + }, + { + "epoch": 0.51, + "grad_norm": 4.289550304412842, + "learning_rate": 1.788010753606722e-05, + "loss": 0.479, + "step": 2550 + }, + { + "epoch": 0.5104, + "grad_norm": 6.349730014801025, + "learning_rate": 1.7888696107795343e-05, + "loss": 0.309, + "step": 2552 + }, + { + "epoch": 0.5108, + "grad_norm": 11.468204498291016, + "learning_rate": 1.7897269300106735e-05, + "loss": 0.7583, + "step": 2554 + }, + { + "epoch": 0.5112, + "grad_norm": 5.843885898590088, + "learning_rate": 1.790582709628753e-05, + "loss": 0.4734, + "step": 2556 + }, + { + "epoch": 0.5116, + "grad_norm": 11.991304397583008, + "learning_rate": 1.7914369479653854e-05, + "loss": 0.3164, + "step": 2558 + }, + { + "epoch": 0.512, + "grad_norm": 7.134799957275391, + "learning_rate": 1.7922896433551903e-05, + "loss": 0.3713, + "step": 2560 + }, + { + "epoch": 0.5124, + "grad_norm": 2.86505126953125, + "learning_rate": 1.7931407941357945e-05, + "loss": 0.4594, + "step": 2562 + }, + { + "epoch": 0.5128, + "grad_norm": 5.432496547698975, + "learning_rate": 1.793990398647835e-05, + "loss": 0.3353, + "step": 2564 + }, + { + "epoch": 0.5132, + "grad_norm": 11.454633712768555, + "learning_rate": 1.7948384552349655e-05, + "loss": 0.3166, + "step": 2566 + }, + { + "epoch": 0.5136, + "grad_norm": 10.260254859924316, + "learning_rate": 1.795684962243855e-05, + "loss": 0.7591, + "step": 2568 + }, + { + "epoch": 0.514, + "grad_norm": 5.5820746421813965, + "learning_rate": 1.796529918024196e-05, + "loss": 0.436, + "step": 2570 + }, + { + "epoch": 0.5144, + "grad_norm": 7.470113277435303, + "learning_rate": 1.7973733209287032e-05, + "loss": 0.9576, + "step": 2572 + }, + { + "epoch": 0.5148, + "grad_norm": 2.867292642593384, + "learning_rate": 1.798215169313121e-05, + "loss": 0.421, + "step": 2574 + }, + { + "epoch": 0.5152, + "grad_norm": 2.9812204837799072, + "learning_rate": 1.7990554615362193e-05, + "loss": 0.256, + "step": 2576 + }, + { + "epoch": 0.5156, + "grad_norm": 2.0516674518585205, + "learning_rate": 1.79989419595981e-05, + "loss": 0.4128, + "step": 2578 + }, + { + "epoch": 0.516, + "grad_norm": 2.5433638095855713, + "learning_rate": 1.800731370948734e-05, + "loss": 0.126, + "step": 2580 + }, + { + "epoch": 0.5164, + "grad_norm": 9.282876014709473, + "learning_rate": 1.8015669848708757e-05, + "loss": 0.3628, + "step": 2582 + }, + { + "epoch": 0.5168, + "grad_norm": 5.5165276527404785, + "learning_rate": 1.802401036097167e-05, + "loss": 0.4257, + "step": 2584 + }, + { + "epoch": 0.5172, + "grad_norm": 3.440859794616699, + "learning_rate": 1.803233523001577e-05, + "loss": 0.4736, + "step": 2586 + }, + { + "epoch": 0.5176, + "grad_norm": 1.817488193511963, + "learning_rate": 1.804064443961135e-05, + "loss": 0.4214, + "step": 2588 + }, + { + "epoch": 0.518, + "grad_norm": 3.2161598205566406, + "learning_rate": 1.804893797355914e-05, + "loss": 0.3271, + "step": 2590 + }, + { + "epoch": 0.5184, + "grad_norm": 4.628116130828857, + "learning_rate": 1.8057215815690494e-05, + "loss": 0.4066, + "step": 2592 + }, + { + "epoch": 0.5188, + "grad_norm": 4.453661918640137, + "learning_rate": 1.8065477949867327e-05, + "loss": 0.3715, + "step": 2594 + }, + { + "epoch": 0.5192, + "grad_norm": 2.9214999675750732, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.359, + "step": 2596 + }, + { + "epoch": 0.5196, + "grad_norm": 4.091073989868164, + "learning_rate": 1.808195502995827e-05, + "loss": 0.3722, + "step": 2598 + }, + { + "epoch": 0.52, + "grad_norm": 9.883648872375488, + "learning_rate": 1.809016994374947e-05, + "loss": 0.4947, + "step": 2600 + }, + { + "epoch": 0.5204, + "grad_norm": 2.3023641109466553, + "learning_rate": 1.8098369085340397e-05, + "loss": 0.3499, + "step": 2602 + }, + { + "epoch": 0.5208, + "grad_norm": 19.056509017944336, + "learning_rate": 1.81065524387464e-05, + "loss": 0.6957, + "step": 2604 + }, + { + "epoch": 0.5212, + "grad_norm": 3.5548794269561768, + "learning_rate": 1.8114719988013606e-05, + "loss": 0.4773, + "step": 2606 + }, + { + "epoch": 0.5216, + "grad_norm": 3.562157392501831, + "learning_rate": 1.8122871717218968e-05, + "loss": 0.2252, + "step": 2608 + }, + { + "epoch": 0.522, + "grad_norm": 8.437989234924316, + "learning_rate": 1.813100761047028e-05, + "loss": 0.7304, + "step": 2610 + }, + { + "epoch": 0.5224, + "grad_norm": 2.9045896530151367, + "learning_rate": 1.8139127651906176e-05, + "loss": 0.2871, + "step": 2612 + }, + { + "epoch": 0.5228, + "grad_norm": 4.28363561630249, + "learning_rate": 1.8147231825696258e-05, + "loss": 0.9147, + "step": 2614 + }, + { + "epoch": 0.5232, + "grad_norm": 8.224676132202148, + "learning_rate": 1.8155320116040976e-05, + "loss": 0.4292, + "step": 2616 + }, + { + "epoch": 0.5236, + "grad_norm": 8.017407417297363, + "learning_rate": 1.8163392507171834e-05, + "loss": 0.3655, + "step": 2618 + }, + { + "epoch": 0.524, + "grad_norm": 5.481568813323975, + "learning_rate": 1.817144898335129e-05, + "loss": 0.2794, + "step": 2620 + }, + { + "epoch": 0.5244, + "grad_norm": 10.477754592895508, + "learning_rate": 1.8179489528872797e-05, + "loss": 0.6051, + "step": 2622 + }, + { + "epoch": 0.5248, + "grad_norm": 8.79698371887207, + "learning_rate": 1.818751412806095e-05, + "loss": 0.3334, + "step": 2624 + }, + { + "epoch": 0.5252, + "grad_norm": 5.183042526245117, + "learning_rate": 1.819552276527134e-05, + "loss": 0.4085, + "step": 2626 + }, + { + "epoch": 0.5256, + "grad_norm": 4.3663740158081055, + "learning_rate": 1.8203515424890738e-05, + "loss": 0.3583, + "step": 2628 + }, + { + "epoch": 0.526, + "grad_norm": 8.76609992980957, + "learning_rate": 1.821149209133704e-05, + "loss": 0.441, + "step": 2630 + }, + { + "epoch": 0.5264, + "grad_norm": 5.328789234161377, + "learning_rate": 1.8219452749059322e-05, + "loss": 0.4112, + "step": 2632 + }, + { + "epoch": 0.5268, + "grad_norm": 3.7866878509521484, + "learning_rate": 1.82273973825379e-05, + "loss": 0.7086, + "step": 2634 + }, + { + "epoch": 0.5272, + "grad_norm": 2.9086310863494873, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.2082, + "step": 2636 + }, + { + "epoch": 0.5276, + "grad_norm": 4.954035758972168, + "learning_rate": 1.8243238514841258e-05, + "loss": 0.3308, + "step": 2638 + }, + { + "epoch": 0.528, + "grad_norm": 12.781424522399902, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.7253, + "step": 2640 + }, + { + "epoch": 0.5284, + "grad_norm": 3.1564033031463623, + "learning_rate": 1.8259015364714786e-05, + "loss": 0.3441, + "step": 2642 + }, + { + "epoch": 0.5288, + "grad_norm": 2.7530112266540527, + "learning_rate": 1.826687964527355e-05, + "loss": 0.1832, + "step": 2644 + }, + { + "epoch": 0.5292, + "grad_norm": 1.177614450454712, + "learning_rate": 1.8274727809127437e-05, + "loss": 0.4702, + "step": 2646 + }, + { + "epoch": 0.5296, + "grad_norm": 12.472597122192383, + "learning_rate": 1.828255984097604e-05, + "loss": 0.9463, + "step": 2648 + }, + { + "epoch": 0.53, + "grad_norm": 2.9293160438537598, + "learning_rate": 1.8290375725550413e-05, + "loss": 0.2766, + "step": 2650 + }, + { + "epoch": 0.5304, + "grad_norm": 3.00972056388855, + "learning_rate": 1.8298175447613093e-05, + "loss": 0.3727, + "step": 2652 + }, + { + "epoch": 0.5308, + "grad_norm": 2.6771140098571777, + "learning_rate": 1.8305958991958125e-05, + "loss": 0.3526, + "step": 2654 + }, + { + "epoch": 0.5312, + "grad_norm": 2.2463996410369873, + "learning_rate": 1.8313726343411092e-05, + "loss": 0.6072, + "step": 2656 + }, + { + "epoch": 0.5316, + "grad_norm": 9.368053436279297, + "learning_rate": 1.832147748682912e-05, + "loss": 0.543, + "step": 2658 + }, + { + "epoch": 0.532, + "grad_norm": 5.15004301071167, + "learning_rate": 1.8329212407101e-05, + "loss": 0.5974, + "step": 2660 + }, + { + "epoch": 0.5324, + "grad_norm": 4.019261837005615, + "learning_rate": 1.8336931089147065e-05, + "loss": 0.5539, + "step": 2662 + }, + { + "epoch": 0.5328, + "grad_norm": 2.8712925910949707, + "learning_rate": 1.8344633517919394e-05, + "loss": 0.3006, + "step": 2664 + }, + { + "epoch": 0.5332, + "grad_norm": 4.419692516326904, + "learning_rate": 1.8352319678401677e-05, + "loss": 0.4899, + "step": 2666 + }, + { + "epoch": 0.5336, + "grad_norm": 2.5599846839904785, + "learning_rate": 1.8359989555609344e-05, + "loss": 0.2969, + "step": 2668 + }, + { + "epoch": 0.534, + "grad_norm": 5.038468360900879, + "learning_rate": 1.836764313458962e-05, + "loss": 0.4067, + "step": 2670 + }, + { + "epoch": 0.5344, + "grad_norm": 6.059289455413818, + "learning_rate": 1.8375280400421407e-05, + "loss": 1.6824, + "step": 2672 + }, + { + "epoch": 0.5348, + "grad_norm": 2.089632749557495, + "learning_rate": 1.8382901338215515e-05, + "loss": 0.4212, + "step": 2674 + }, + { + "epoch": 0.5352, + "grad_norm": 6.07957649230957, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.7081, + "step": 2676 + }, + { + "epoch": 0.5356, + "grad_norm": 11.706499099731445, + "learning_rate": 1.839809417029283e-05, + "loss": 0.4114, + "step": 2678 + }, + { + "epoch": 0.536, + "grad_norm": 4.392540454864502, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.2692, + "step": 2680 + }, + { + "epoch": 0.5364, + "grad_norm": 4.851634979248047, + "learning_rate": 1.8413221512344805e-05, + "loss": 0.415, + "step": 2682 + }, + { + "epoch": 0.5368, + "grad_norm": 2.236924171447754, + "learning_rate": 1.842076058772692e-05, + "loss": 0.2893, + "step": 2684 + }, + { + "epoch": 0.5372, + "grad_norm": 8.560842514038086, + "learning_rate": 1.8428283246405386e-05, + "loss": 0.5119, + "step": 2686 + }, + { + "epoch": 0.5376, + "grad_norm": 4.771745204925537, + "learning_rate": 1.8435789473714384e-05, + "loss": 0.3704, + "step": 2688 + }, + { + "epoch": 0.538, + "grad_norm": 5.6162238121032715, + "learning_rate": 1.844327925502015e-05, + "loss": 0.4774, + "step": 2690 + }, + { + "epoch": 0.5384, + "grad_norm": 2.4182419776916504, + "learning_rate": 1.8450752575720964e-05, + "loss": 0.3113, + "step": 2692 + }, + { + "epoch": 0.5388, + "grad_norm": 10.021705627441406, + "learning_rate": 1.8458209421247205e-05, + "loss": 0.7894, + "step": 2694 + }, + { + "epoch": 0.5392, + "grad_norm": 5.815845966339111, + "learning_rate": 1.8465649777061384e-05, + "loss": 0.5688, + "step": 2696 + }, + { + "epoch": 0.5396, + "grad_norm": 5.702415466308594, + "learning_rate": 1.8473073628658116e-05, + "loss": 0.4711, + "step": 2698 + }, + { + "epoch": 0.54, + "grad_norm": 7.686366081237793, + "learning_rate": 1.8480480961564266e-05, + "loss": 0.357, + "step": 2700 + }, + { + "epoch": 0.5404, + "grad_norm": 4.112565994262695, + "learning_rate": 1.848787176133881e-05, + "loss": 0.2281, + "step": 2702 + }, + { + "epoch": 0.5408, + "grad_norm": 11.28343391418457, + "learning_rate": 1.8495246013573047e-05, + "loss": 0.6086, + "step": 2704 + }, + { + "epoch": 0.5412, + "grad_norm": 8.169326782226562, + "learning_rate": 1.850260370389049e-05, + "loss": 0.6289, + "step": 2706 + }, + { + "epoch": 0.5416, + "grad_norm": 11.40042495727539, + "learning_rate": 1.850994481794691e-05, + "loss": 0.5544, + "step": 2708 + }, + { + "epoch": 0.542, + "grad_norm": 3.832862615585327, + "learning_rate": 1.851726934143048e-05, + "loss": 0.4566, + "step": 2710 + }, + { + "epoch": 0.5424, + "grad_norm": 8.427587509155273, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.5848, + "step": 2712 + }, + { + "epoch": 0.5428, + "grad_norm": 2.4495596885681152, + "learning_rate": 1.8531868559593205e-05, + "loss": 0.7114, + "step": 2714 + }, + { + "epoch": 0.5432, + "grad_norm": 2.8032925128936768, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.2562, + "step": 2716 + }, + { + "epoch": 0.5436, + "grad_norm": 2.0871102809906006, + "learning_rate": 1.8546401244531028e-05, + "loss": 0.2434, + "step": 2718 + }, + { + "epoch": 0.544, + "grad_norm": 4.8009748458862305, + "learning_rate": 1.8553642601605066e-05, + "loss": 0.4175, + "step": 2720 + }, + { + "epoch": 0.5444, + "grad_norm": 4.694950580596924, + "learning_rate": 1.856086728291516e-05, + "loss": 0.3305, + "step": 2722 + }, + { + "epoch": 0.5448, + "grad_norm": 11.439225196838379, + "learning_rate": 1.856807527437643e-05, + "loss": 0.4825, + "step": 2724 + }, + { + "epoch": 0.5452, + "grad_norm": 7.171995639801025, + "learning_rate": 1.857526656193652e-05, + "loss": 0.4618, + "step": 2726 + }, + { + "epoch": 0.5456, + "grad_norm": 2.5880284309387207, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.4406, + "step": 2728 + }, + { + "epoch": 0.546, + "grad_norm": 4.846583366394043, + "learning_rate": 1.8589598969306643e-05, + "loss": 0.5049, + "step": 2730 + }, + { + "epoch": 0.5464, + "grad_norm": 12.317317962646484, + "learning_rate": 1.859674006117491e-05, + "loss": 0.5098, + "step": 2732 + }, + { + "epoch": 0.5468, + "grad_norm": 7.490502834320068, + "learning_rate": 1.860386439325853e-05, + "loss": 0.435, + "step": 2734 + }, + { + "epoch": 0.5472, + "grad_norm": 4.77424955368042, + "learning_rate": 1.8610971951668268e-05, + "loss": 0.199, + "step": 2736 + }, + { + "epoch": 0.5476, + "grad_norm": 3.241295337677002, + "learning_rate": 1.8618062722547544e-05, + "loss": 0.5779, + "step": 2738 + }, + { + "epoch": 0.548, + "grad_norm": 9.553605079650879, + "learning_rate": 1.862513669207257e-05, + "loss": 0.2964, + "step": 2740 + }, + { + "epoch": 0.5484, + "grad_norm": 4.459126949310303, + "learning_rate": 1.8632193846452274e-05, + "loss": 0.9862, + "step": 2742 + }, + { + "epoch": 0.5488, + "grad_norm": 2.0170369148254395, + "learning_rate": 1.8639234171928348e-05, + "loss": 0.4365, + "step": 2744 + }, + { + "epoch": 0.5492, + "grad_norm": 2.304331064224243, + "learning_rate": 1.8646257654775354e-05, + "loss": 0.1641, + "step": 2746 + }, + { + "epoch": 0.5496, + "grad_norm": 4.68295955657959, + "learning_rate": 1.8653264281300612e-05, + "loss": 0.5119, + "step": 2748 + }, + { + "epoch": 0.55, + "grad_norm": 5.507453918457031, + "learning_rate": 1.866025403784439e-05, + "loss": 0.3738, + "step": 2750 + }, + { + "epoch": 0.5504, + "grad_norm": 9.912617683410645, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.6403, + "step": 2752 + }, + { + "epoch": 0.5508, + "grad_norm": 5.828344821929932, + "learning_rate": 1.8674182886512776e-05, + "loss": 0.5168, + "step": 2754 + }, + { + "epoch": 0.5512, + "grad_norm": 3.282529830932617, + "learning_rate": 1.8681121951482393e-05, + "loss": 0.2641, + "step": 2756 + }, + { + "epoch": 0.5516, + "grad_norm": 9.131134033203125, + "learning_rate": 1.8688044092160554e-05, + "loss": 0.8673, + "step": 2758 + }, + { + "epoch": 0.552, + "grad_norm": 7.49228572845459, + "learning_rate": 1.869494929505219e-05, + "loss": 0.493, + "step": 2760 + }, + { + "epoch": 0.5524, + "grad_norm": 12.317577362060547, + "learning_rate": 1.8701837546695256e-05, + "loss": 0.3039, + "step": 2762 + }, + { + "epoch": 0.5528, + "grad_norm": 6.806464195251465, + "learning_rate": 1.870870883366075e-05, + "loss": 0.615, + "step": 2764 + }, + { + "epoch": 0.5532, + "grad_norm": 2.3761305809020996, + "learning_rate": 1.871556314255275e-05, + "loss": 0.3902, + "step": 2766 + }, + { + "epoch": 0.5536, + "grad_norm": 10.986335754394531, + "learning_rate": 1.8722400460008434e-05, + "loss": 0.5104, + "step": 2768 + }, + { + "epoch": 0.554, + "grad_norm": 7.663166522979736, + "learning_rate": 1.8729220772698093e-05, + "loss": 0.2916, + "step": 2770 + }, + { + "epoch": 0.5544, + "grad_norm": 11.641773223876953, + "learning_rate": 1.8736024067325195e-05, + "loss": 1.0732, + "step": 2772 + }, + { + "epoch": 0.5548, + "grad_norm": 3.6679561138153076, + "learning_rate": 1.8742810330626335e-05, + "loss": 0.5274, + "step": 2774 + }, + { + "epoch": 0.5552, + "grad_norm": 3.1301381587982178, + "learning_rate": 1.8749579549371373e-05, + "loss": 0.2811, + "step": 2776 + }, + { + "epoch": 0.5556, + "grad_norm": 2.490795135498047, + "learning_rate": 1.8756331710363368e-05, + "loss": 0.3786, + "step": 2778 + }, + { + "epoch": 0.556, + "grad_norm": 8.345508575439453, + "learning_rate": 1.876306680043863e-05, + "loss": 0.4435, + "step": 2780 + }, + { + "epoch": 0.5564, + "grad_norm": 6.362506866455078, + "learning_rate": 1.876978480646677e-05, + "loss": 0.3331, + "step": 2782 + }, + { + "epoch": 0.5568, + "grad_norm": 5.1575188636779785, + "learning_rate": 1.8776485715350665e-05, + "loss": 0.656, + "step": 2784 + }, + { + "epoch": 0.5572, + "grad_norm": 3.571089744567871, + "learning_rate": 1.878316951402658e-05, + "loss": 0.4216, + "step": 2786 + }, + { + "epoch": 0.5576, + "grad_norm": 4.825523853302002, + "learning_rate": 1.878983618946409e-05, + "loss": 0.3728, + "step": 2788 + }, + { + "epoch": 0.558, + "grad_norm": 5.970290660858154, + "learning_rate": 1.879648572866617e-05, + "loss": 0.3671, + "step": 2790 + }, + { + "epoch": 0.5584, + "grad_norm": 15.60201644897461, + "learning_rate": 1.8803118118669203e-05, + "loss": 1.3432, + "step": 2792 + }, + { + "epoch": 0.5588, + "grad_norm": 3.825178861618042, + "learning_rate": 1.8809733346543006e-05, + "loss": 0.4855, + "step": 2794 + }, + { + "epoch": 0.5592, + "grad_norm": 4.380986213684082, + "learning_rate": 1.881633139939087e-05, + "loss": 0.3201, + "step": 2796 + }, + { + "epoch": 0.5596, + "grad_norm": 2.6262505054473877, + "learning_rate": 1.8822912264349532e-05, + "loss": 0.3444, + "step": 2798 + }, + { + "epoch": 0.56, + "grad_norm": 3.160878896713257, + "learning_rate": 1.882947592858927e-05, + "loss": 0.2954, + "step": 2800 + }, + { + "epoch": 0.5604, + "grad_norm": 2.602874755859375, + "learning_rate": 1.8836022379313877e-05, + "loss": 0.401, + "step": 2802 + }, + { + "epoch": 0.5608, + "grad_norm": 4.4300947189331055, + "learning_rate": 1.884255160376072e-05, + "loss": 0.3047, + "step": 2804 + }, + { + "epoch": 0.5612, + "grad_norm": 6.292295455932617, + "learning_rate": 1.8849063589200744e-05, + "loss": 0.3335, + "step": 2806 + }, + { + "epoch": 0.5616, + "grad_norm": 2.988844871520996, + "learning_rate": 1.885555832293849e-05, + "loss": 0.3012, + "step": 2808 + }, + { + "epoch": 0.562, + "grad_norm": 6.8503546714782715, + "learning_rate": 1.8862035792312145e-05, + "loss": 0.4679, + "step": 2810 + }, + { + "epoch": 0.5624, + "grad_norm": 6.4087982177734375, + "learning_rate": 1.886849598469356e-05, + "loss": 0.4688, + "step": 2812 + }, + { + "epoch": 0.5628, + "grad_norm": 9.839089393615723, + "learning_rate": 1.8874938887488246e-05, + "loss": 0.5939, + "step": 2814 + }, + { + "epoch": 0.5632, + "grad_norm": 2.354341745376587, + "learning_rate": 1.888136448813544e-05, + "loss": 0.2644, + "step": 2816 + }, + { + "epoch": 0.5636, + "grad_norm": 3.353959321975708, + "learning_rate": 1.888777277410812e-05, + "loss": 0.2609, + "step": 2818 + }, + { + "epoch": 0.564, + "grad_norm": 2.3400180339813232, + "learning_rate": 1.8894163732912972e-05, + "loss": 0.3023, + "step": 2820 + }, + { + "epoch": 0.5644, + "grad_norm": 2.3069167137145996, + "learning_rate": 1.890053735209053e-05, + "loss": 0.2547, + "step": 2822 + }, + { + "epoch": 0.5648, + "grad_norm": 4.189844131469727, + "learning_rate": 1.890689361921506e-05, + "loss": 0.3343, + "step": 2824 + }, + { + "epoch": 0.5652, + "grad_norm": 14.046370506286621, + "learning_rate": 1.8913232521894737e-05, + "loss": 0.7099, + "step": 2826 + }, + { + "epoch": 0.5656, + "grad_norm": 15.118515968322754, + "learning_rate": 1.891955404777151e-05, + "loss": 0.3473, + "step": 2828 + }, + { + "epoch": 0.566, + "grad_norm": 1.6364011764526367, + "learning_rate": 1.8925858184521248e-05, + "loss": 0.2581, + "step": 2830 + }, + { + "epoch": 0.5664, + "grad_norm": 2.94092059135437, + "learning_rate": 1.893214491985374e-05, + "loss": 0.44, + "step": 2832 + }, + { + "epoch": 0.5668, + "grad_norm": 2.9333367347717285, + "learning_rate": 1.8938414241512634e-05, + "loss": 0.4498, + "step": 2834 + }, + { + "epoch": 0.5672, + "grad_norm": 6.774989128112793, + "learning_rate": 1.89446661372756e-05, + "loss": 0.3952, + "step": 2836 + }, + { + "epoch": 0.5676, + "grad_norm": 4.230257034301758, + "learning_rate": 1.8950900594954226e-05, + "loss": 0.4268, + "step": 2838 + }, + { + "epoch": 0.568, + "grad_norm": 7.136099338531494, + "learning_rate": 1.895711760239413e-05, + "loss": 0.2345, + "step": 2840 + }, + { + "epoch": 0.5684, + "grad_norm": 8.781055450439453, + "learning_rate": 1.896331714747493e-05, + "loss": 0.4429, + "step": 2842 + }, + { + "epoch": 0.5688, + "grad_norm": 5.147548675537109, + "learning_rate": 1.89694992181103e-05, + "loss": 0.3696, + "step": 2844 + }, + { + "epoch": 0.5692, + "grad_norm": 9.33837890625, + "learning_rate": 1.8975663802247975e-05, + "loss": 0.7089, + "step": 2846 + }, + { + "epoch": 0.5696, + "grad_norm": 1.7800350189208984, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.1354, + "step": 2848 + }, + { + "epoch": 0.57, + "grad_norm": 3.6641359329223633, + "learning_rate": 1.898794046299167e-05, + "loss": 0.3272, + "step": 2850 + }, + { + "epoch": 0.5704, + "grad_norm": 7.43326473236084, + "learning_rate": 1.8994052515663708e-05, + "loss": 0.4061, + "step": 2852 + }, + { + "epoch": 0.5708, + "grad_norm": 8.51661491394043, + "learning_rate": 1.9000147033970144e-05, + "loss": 1.0261, + "step": 2854 + }, + { + "epoch": 0.5712, + "grad_norm": 0.4580651521682739, + "learning_rate": 1.90062240060294e-05, + "loss": 0.1717, + "step": 2856 + }, + { + "epoch": 0.5716, + "grad_norm": 3.314425230026245, + "learning_rate": 1.901228341999412e-05, + "loss": 0.2293, + "step": 2858 + }, + { + "epoch": 0.572, + "grad_norm": 4.254598617553711, + "learning_rate": 1.9018325264051136e-05, + "loss": 0.5684, + "step": 2860 + }, + { + "epoch": 0.5724, + "grad_norm": 3.0152430534362793, + "learning_rate": 1.9024349526421596e-05, + "loss": 0.3437, + "step": 2862 + }, + { + "epoch": 0.5728, + "grad_norm": 1.0627639293670654, + "learning_rate": 1.9030356195360868e-05, + "loss": 0.0605, + "step": 2864 + }, + { + "epoch": 0.5732, + "grad_norm": 5.744231700897217, + "learning_rate": 1.903634525915866e-05, + "loss": 0.3431, + "step": 2866 + }, + { + "epoch": 0.5736, + "grad_norm": 6.044951438903809, + "learning_rate": 1.904231670613899e-05, + "loss": 0.4343, + "step": 2868 + }, + { + "epoch": 0.574, + "grad_norm": 10.40998649597168, + "learning_rate": 1.904827052466019e-05, + "loss": 0.5893, + "step": 2870 + }, + { + "epoch": 0.5744, + "grad_norm": 8.875959396362305, + "learning_rate": 1.905420670311502e-05, + "loss": 0.2393, + "step": 2872 + }, + { + "epoch": 0.5748, + "grad_norm": 2.237319231033325, + "learning_rate": 1.9060125229930572e-05, + "loss": 0.4703, + "step": 2874 + }, + { + "epoch": 0.5752, + "grad_norm": 4.149212837219238, + "learning_rate": 1.906602609356838e-05, + "loss": 0.2914, + "step": 2876 + }, + { + "epoch": 0.5756, + "grad_norm": 3.4428646564483643, + "learning_rate": 1.907190928252441e-05, + "loss": 0.2835, + "step": 2878 + }, + { + "epoch": 0.576, + "grad_norm": 4.516830921173096, + "learning_rate": 1.9077774785329078e-05, + "loss": 0.3533, + "step": 2880 + }, + { + "epoch": 0.5764, + "grad_norm": 6.448079586029053, + "learning_rate": 1.908362259054731e-05, + "loss": 0.1928, + "step": 2882 + }, + { + "epoch": 0.5768, + "grad_norm": 10.564045906066895, + "learning_rate": 1.9089452686778487e-05, + "loss": 0.539, + "step": 2884 + }, + { + "epoch": 0.5772, + "grad_norm": 9.977001190185547, + "learning_rate": 1.9095265062656542e-05, + "loss": 0.5604, + "step": 2886 + }, + { + "epoch": 0.5776, + "grad_norm": 3.508237838745117, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.3977, + "step": 2888 + }, + { + "epoch": 0.578, + "grad_norm": 4.787101745605469, + "learning_rate": 1.910683660806177e-05, + "loss": 0.4591, + "step": 2890 + }, + { + "epoch": 0.5784, + "grad_norm": 3.002999782562256, + "learning_rate": 1.911259575502962e-05, + "loss": 0.4415, + "step": 2892 + }, + { + "epoch": 0.5788, + "grad_norm": 14.825424194335938, + "learning_rate": 1.9118337136525754e-05, + "loss": 0.6453, + "step": 2894 + }, + { + "epoch": 0.5792, + "grad_norm": 7.936984062194824, + "learning_rate": 1.912406074135706e-05, + "loss": 0.3741, + "step": 2896 + }, + { + "epoch": 0.5796, + "grad_norm": 4.977035045623779, + "learning_rate": 1.912976655836507e-05, + "loss": 0.4152, + "step": 2898 + }, + { + "epoch": 0.58, + "grad_norm": 4.106995582580566, + "learning_rate": 1.9135454576426006e-05, + "loss": 0.2785, + "step": 2900 + }, + { + "epoch": 0.5804, + "grad_norm": 7.292054176330566, + "learning_rate": 1.9141124784450786e-05, + "loss": 0.3544, + "step": 2902 + }, + { + "epoch": 0.5808, + "grad_norm": 27.400583267211914, + "learning_rate": 1.9146777171385053e-05, + "loss": 0.4803, + "step": 2904 + }, + { + "epoch": 0.5812, + "grad_norm": 7.311355113983154, + "learning_rate": 1.9152411726209172e-05, + "loss": 0.4531, + "step": 2906 + }, + { + "epoch": 0.5816, + "grad_norm": 8.587400436401367, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.3402, + "step": 2908 + }, + { + "epoch": 0.582, + "grad_norm": 7.006730079650879, + "learning_rate": 1.916362729562239e-05, + "loss": 0.5493, + "step": 2910 + }, + { + "epoch": 0.5824, + "grad_norm": 0.04787491261959076, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.2234, + "step": 2912 + }, + { + "epoch": 0.5828, + "grad_norm": 9.122152328491211, + "learning_rate": 1.9174771405229187e-05, + "loss": 0.554, + "step": 2914 + }, + { + "epoch": 0.5832, + "grad_norm": 6.771241188049316, + "learning_rate": 1.9180316635425876e-05, + "loss": 0.5266, + "step": 2916 + }, + { + "epoch": 0.5836, + "grad_norm": 15.799236297607422, + "learning_rate": 1.9185843968125543e-05, + "loss": 0.8057, + "step": 2918 + }, + { + "epoch": 0.584, + "grad_norm": 2.701268196105957, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.4011, + "step": 2920 + }, + { + "epoch": 0.5844, + "grad_norm": 12.020171165466309, + "learning_rate": 1.919684489796539e-05, + "loss": 1.2731, + "step": 2922 + }, + { + "epoch": 0.5848, + "grad_norm": 13.230367660522461, + "learning_rate": 1.9202318473658703e-05, + "loss": 0.4494, + "step": 2924 + }, + { + "epoch": 0.5852, + "grad_norm": 3.5153143405914307, + "learning_rate": 1.9207774108961273e-05, + "loss": 0.4909, + "step": 2926 + }, + { + "epoch": 0.5856, + "grad_norm": 4.725398063659668, + "learning_rate": 1.9213211793237052e-05, + "loss": 0.3882, + "step": 2928 + }, + { + "epoch": 0.586, + "grad_norm": 3.8818702697753906, + "learning_rate": 1.9218631515885004e-05, + "loss": 0.126, + "step": 2930 + }, + { + "epoch": 0.5864, + "grad_norm": 7.4234747886657715, + "learning_rate": 1.92240332663391e-05, + "loss": 0.5749, + "step": 2932 + }, + { + "epoch": 0.5868, + "grad_norm": 16.866479873657227, + "learning_rate": 1.922941703406835e-05, + "loss": 0.2427, + "step": 2934 + }, + { + "epoch": 0.5872, + "grad_norm": 6.294543266296387, + "learning_rate": 1.923478280857682e-05, + "loss": 1.0369, + "step": 2936 + }, + { + "epoch": 0.5876, + "grad_norm": 1.0833808183670044, + "learning_rate": 1.9240130579403663e-05, + "loss": 0.1101, + "step": 2938 + }, + { + "epoch": 0.588, + "grad_norm": 2.635394811630249, + "learning_rate": 1.924546033612313e-05, + "loss": 0.4636, + "step": 2940 + }, + { + "epoch": 0.5884, + "grad_norm": 2.3891801834106445, + "learning_rate": 1.9250772068344577e-05, + "loss": 0.3979, + "step": 2942 + }, + { + "epoch": 0.5888, + "grad_norm": 3.269239664077759, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.7828, + "step": 2944 + }, + { + "epoch": 0.5892, + "grad_norm": 7.3950419425964355, + "learning_rate": 1.9261341417906615e-05, + "loss": 0.4994, + "step": 2946 + }, + { + "epoch": 0.5896, + "grad_norm": 7.076539039611816, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.3066, + "step": 2948 + }, + { + "epoch": 0.59, + "grad_norm": 1.063869833946228, + "learning_rate": 1.9271838545667876e-05, + "loss": 0.2398, + "step": 2950 + }, + { + "epoch": 0.5904, + "grad_norm": 5.7909159660339355, + "learning_rate": 1.927706000077034e-05, + "loss": 0.3404, + "step": 2952 + }, + { + "epoch": 0.5908, + "grad_norm": 3.447693347930908, + "learning_rate": 1.9282263369769633e-05, + "loss": 0.2877, + "step": 2954 + }, + { + "epoch": 0.5912, + "grad_norm": 2.476720094680786, + "learning_rate": 1.9287448642521507e-05, + "loss": 0.1592, + "step": 2956 + }, + { + "epoch": 0.5916, + "grad_norm": 6.111763000488281, + "learning_rate": 1.9292615808917024e-05, + "loss": 0.3841, + "step": 2958 + }, + { + "epoch": 0.592, + "grad_norm": 2.295823574066162, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.3983, + "step": 2960 + }, + { + "epoch": 0.5924, + "grad_norm": 11.688994407653809, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.7525, + "step": 2962 + }, + { + "epoch": 0.5928, + "grad_norm": 2.558603048324585, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.3126, + "step": 2964 + }, + { + "epoch": 0.5932, + "grad_norm": 7.392522811889648, + "learning_rate": 1.9313103209992205e-05, + "loss": 0.9193, + "step": 2966 + }, + { + "epoch": 0.5936, + "grad_norm": 4.891191482543945, + "learning_rate": 1.9318179694207722e-05, + "loss": 0.3616, + "step": 2968 + }, + { + "epoch": 0.594, + "grad_norm": 0.8834412097930908, + "learning_rate": 1.932323801215512e-05, + "loss": 0.1893, + "step": 2970 + }, + { + "epoch": 0.5944, + "grad_norm": 9.078575134277344, + "learning_rate": 1.9328278153972943e-05, + "loss": 1.0464, + "step": 2972 + }, + { + "epoch": 0.5948, + "grad_norm": 3.8669850826263428, + "learning_rate": 1.933330010983518e-05, + "loss": 1.0819, + "step": 2974 + }, + { + "epoch": 0.5952, + "grad_norm": 2.9672963619232178, + "learning_rate": 1.9338303869951266e-05, + "loss": 0.423, + "step": 2976 + }, + { + "epoch": 0.5956, + "grad_norm": 2.149142265319824, + "learning_rate": 1.934328942456612e-05, + "loss": 0.3377, + "step": 2978 + }, + { + "epoch": 0.596, + "grad_norm": 4.908062934875488, + "learning_rate": 1.934825676396015e-05, + "loss": 0.5869, + "step": 2980 + }, + { + "epoch": 0.5964, + "grad_norm": 2.2930214405059814, + "learning_rate": 1.9353205878449257e-05, + "loss": 0.3642, + "step": 2982 + }, + { + "epoch": 0.5968, + "grad_norm": 2.068453311920166, + "learning_rate": 1.935813675838491e-05, + "loss": 0.2879, + "step": 2984 + }, + { + "epoch": 0.5972, + "grad_norm": 6.450281143188477, + "learning_rate": 1.9363049394154088e-05, + "loss": 0.6223, + "step": 2986 + }, + { + "epoch": 0.5976, + "grad_norm": 1.42183256149292, + "learning_rate": 1.9367943776179375e-05, + "loss": 0.2905, + "step": 2988 + }, + { + "epoch": 0.598, + "grad_norm": 3.9985463619232178, + "learning_rate": 1.937281989491892e-05, + "loss": 0.3276, + "step": 2990 + }, + { + "epoch": 0.5984, + "grad_norm": 6.081268310546875, + "learning_rate": 1.9377677740866457e-05, + "loss": 0.7195, + "step": 2992 + }, + { + "epoch": 0.5988, + "grad_norm": 4.072579860687256, + "learning_rate": 1.9382517304551397e-05, + "loss": 0.3501, + "step": 2994 + }, + { + "epoch": 0.5992, + "grad_norm": 5.668904781341553, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.4538, + "step": 2996 + }, + { + "epoch": 0.5996, + "grad_norm": 8.575440406799316, + "learning_rate": 1.9392141547429183e-05, + "loss": 0.6313, + "step": 2998 + }, + { + "epoch": 0.6, + "grad_norm": 4.947745323181152, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.2601, + "step": 3000 + }, + { + "epoch": 0.6004, + "grad_norm": 4.210323333740234, + "learning_rate": 1.9401692548500504e-05, + "loss": 0.3053, + "step": 3002 + }, + { + "epoch": 0.6008, + "grad_norm": 6.873440742492676, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.3763, + "step": 3004 + }, + { + "epoch": 0.6012, + "grad_norm": 2.905287027359009, + "learning_rate": 1.9411170233284728e-05, + "loss": 0.2028, + "step": 3006 + }, + { + "epoch": 0.6016, + "grad_norm": 15.054080963134766, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.8319, + "step": 3008 + }, + { + "epoch": 0.602, + "grad_norm": 4.132566928863525, + "learning_rate": 1.942057452787297e-05, + "loss": 0.3187, + "step": 3010 + }, + { + "epoch": 0.6024, + "grad_norm": 4.134480953216553, + "learning_rate": 1.942524913090354e-05, + "loss": 0.2185, + "step": 3012 + }, + { + "epoch": 0.6028, + "grad_norm": 1.8104115724563599, + "learning_rate": 1.9429905358928645e-05, + "loss": 0.4167, + "step": 3014 + }, + { + "epoch": 0.6032, + "grad_norm": 13.61635971069336, + "learning_rate": 1.9434543202870723e-05, + "loss": 0.4303, + "step": 3016 + }, + { + "epoch": 0.6036, + "grad_norm": 7.761966228485107, + "learning_rate": 1.9439162653688063e-05, + "loss": 0.6786, + "step": 3018 + }, + { + "epoch": 0.604, + "grad_norm": 3.9029548168182373, + "learning_rate": 1.9443763702374815e-05, + "loss": 0.3728, + "step": 3020 + }, + { + "epoch": 0.6044, + "grad_norm": 3.9247467517852783, + "learning_rate": 1.944834633996098e-05, + "loss": 0.3303, + "step": 3022 + }, + { + "epoch": 0.6048, + "grad_norm": 2.3633649349212646, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.4953, + "step": 3024 + }, + { + "epoch": 0.6052, + "grad_norm": 4.049633026123047, + "learning_rate": 1.9457456346131172e-05, + "loss": 0.2615, + "step": 3026 + }, + { + "epoch": 0.6056, + "grad_norm": 2.48594331741333, + "learning_rate": 1.9461983696954756e-05, + "loss": 0.8536, + "step": 3028 + }, + { + "epoch": 0.606, + "grad_norm": 2.97275447845459, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.3284, + "step": 3030 + }, + { + "epoch": 0.6064, + "grad_norm": 4.044925689697266, + "learning_rate": 1.947098304994744e-05, + "loss": 0.3291, + "step": 3032 + }, + { + "epoch": 0.6068, + "grad_norm": 2.1783950328826904, + "learning_rate": 1.947545503457184e-05, + "loss": 0.3619, + "step": 3034 + }, + { + "epoch": 0.6072, + "grad_norm": 9.208476066589355, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.4557, + "step": 3036 + }, + { + "epoch": 0.6076, + "grad_norm": 2.302206039428711, + "learning_rate": 1.9484343576484935e-05, + "loss": 0.2085, + "step": 3038 + }, + { + "epoch": 0.608, + "grad_norm": 5.5845746994018555, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.3038, + "step": 3040 + }, + { + "epoch": 0.6084, + "grad_norm": 9.386860847473145, + "learning_rate": 1.949315815758161e-05, + "loss": 0.4585, + "step": 3042 + }, + { + "epoch": 0.6088, + "grad_norm": 5.408812522888184, + "learning_rate": 1.949753769132067e-05, + "loss": 0.3594, + "step": 3044 + }, + { + "epoch": 0.6092, + "grad_norm": 0.0678795725107193, + "learning_rate": 1.9501898709124008e-05, + "loss": 0.1186, + "step": 3046 + }, + { + "epoch": 0.6096, + "grad_norm": 0.016408616676926613, + "learning_rate": 1.95062412024896e-05, + "loss": 0.001, + "step": 3048 + }, + { + "epoch": 0.61, + "grad_norm": 12.311307907104492, + "learning_rate": 1.9510565162951534e-05, + "loss": 0.7536, + "step": 3050 + }, + { + "epoch": 0.6104, + "grad_norm": 3.250002861022949, + "learning_rate": 1.951487058208003e-05, + "loss": 0.2979, + "step": 3052 + }, + { + "epoch": 0.6108, + "grad_norm": 5.773616790771484, + "learning_rate": 1.9519157451481453e-05, + "loss": 0.596, + "step": 3054 + }, + { + "epoch": 0.6112, + "grad_norm": 6.458787441253662, + "learning_rate": 1.952342576279833e-05, + "loss": 0.3746, + "step": 3056 + }, + { + "epoch": 0.6116, + "grad_norm": 13.680730819702148, + "learning_rate": 1.9527675507709364e-05, + "loss": 0.8778, + "step": 3058 + }, + { + "epoch": 0.612, + "grad_norm": 4.030681133270264, + "learning_rate": 1.953190667792947e-05, + "loss": 0.5081, + "step": 3060 + }, + { + "epoch": 0.6124, + "grad_norm": 1.5509819984436035, + "learning_rate": 1.953611926520976e-05, + "loss": 0.2965, + "step": 3062 + }, + { + "epoch": 0.6128, + "grad_norm": 15.349401473999023, + "learning_rate": 1.9540313261337578e-05, + "loss": 0.7625, + "step": 3064 + }, + { + "epoch": 0.6132, + "grad_norm": 1.0952515602111816, + "learning_rate": 1.9544488658136522e-05, + "loss": 0.1338, + "step": 3066 + }, + { + "epoch": 0.6136, + "grad_norm": 2.718113899230957, + "learning_rate": 1.954864544746643e-05, + "loss": 0.3248, + "step": 3068 + }, + { + "epoch": 0.614, + "grad_norm": 2.0834860801696777, + "learning_rate": 1.955278362122344e-05, + "loss": 0.495, + "step": 3070 + }, + { + "epoch": 0.6144, + "grad_norm": 4.787538528442383, + "learning_rate": 1.955690317133996e-05, + "loss": 0.3215, + "step": 3072 + }, + { + "epoch": 0.6148, + "grad_norm": 2.9907476902008057, + "learning_rate": 1.9561004089784726e-05, + "loss": 0.2934, + "step": 3074 + }, + { + "epoch": 0.6152, + "grad_norm": 2.205878734588623, + "learning_rate": 1.956508636856278e-05, + "loss": 0.4205, + "step": 3076 + }, + { + "epoch": 0.6156, + "grad_norm": 5.943421840667725, + "learning_rate": 1.956914999971551e-05, + "loss": 0.3832, + "step": 3078 + }, + { + "epoch": 0.616, + "grad_norm": 5.13758659362793, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.3334, + "step": 3080 + }, + { + "epoch": 0.6164, + "grad_norm": 2.541583776473999, + "learning_rate": 1.9577221287492368e-05, + "loss": 0.3663, + "step": 3082 + }, + { + "epoch": 0.6168, + "grad_norm": 12.082542419433594, + "learning_rate": 1.95812289283811e-05, + "loss": 1.0204, + "step": 3084 + }, + { + "epoch": 0.6172, + "grad_norm": 3.2383480072021484, + "learning_rate": 1.958521789017376e-05, + "loss": 0.2349, + "step": 3086 + }, + { + "epoch": 0.6176, + "grad_norm": 3.9670960903167725, + "learning_rate": 1.958918816509367e-05, + "loss": 0.197, + "step": 3088 + }, + { + "epoch": 0.618, + "grad_norm": 10.055253028869629, + "learning_rate": 1.9593139745400575e-05, + "loss": 0.4294, + "step": 3090 + }, + { + "epoch": 0.6184, + "grad_norm": 1.4403338432312012, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.331, + "step": 3092 + }, + { + "epoch": 0.6188, + "grad_norm": 4.050849437713623, + "learning_rate": 1.9600986791396597e-05, + "loss": 0.2319, + "step": 3094 + }, + { + "epoch": 0.6192, + "grad_norm": 12.189918518066406, + "learning_rate": 1.9604882241787496e-05, + "loss": 0.3857, + "step": 3096 + }, + { + "epoch": 0.6196, + "grad_norm": 5.219604969024658, + "learning_rate": 1.9608758966968983e-05, + "loss": 0.4857, + "step": 3098 + }, + { + "epoch": 0.62, + "grad_norm": 2.9262871742248535, + "learning_rate": 1.9612616959383187e-05, + "loss": 0.2439, + "step": 3100 + }, + { + "epoch": 0.6204, + "grad_norm": 0.15856997668743134, + "learning_rate": 1.9616456211508752e-05, + "loss": 0.1414, + "step": 3102 + }, + { + "epoch": 0.6208, + "grad_norm": 3.0933477878570557, + "learning_rate": 1.9620276715860856e-05, + "loss": 0.2388, + "step": 3104 + }, + { + "epoch": 0.6212, + "grad_norm": 5.281192779541016, + "learning_rate": 1.962407846499124e-05, + "loss": 0.3374, + "step": 3106 + }, + { + "epoch": 0.6216, + "grad_norm": 6.8485260009765625, + "learning_rate": 1.9627861451488187e-05, + "loss": 0.2799, + "step": 3108 + }, + { + "epoch": 0.622, + "grad_norm": 2.6462466716766357, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.3931, + "step": 3110 + }, + { + "epoch": 0.6224, + "grad_norm": 8.119430541992188, + "learning_rate": 1.963537110711789e-05, + "loss": 0.4336, + "step": 3112 + }, + { + "epoch": 0.6228, + "grad_norm": 6.044808864593506, + "learning_rate": 1.9639097761610174e-05, + "loss": 0.3782, + "step": 3114 + }, + { + "epoch": 0.6232, + "grad_norm": 3.278903007507324, + "learning_rate": 1.964280562418815e-05, + "loss": 0.4069, + "step": 3116 + }, + { + "epoch": 0.6236, + "grad_norm": 2.3031933307647705, + "learning_rate": 1.964649468762313e-05, + "loss": 0.3432, + "step": 3118 + }, + { + "epoch": 0.624, + "grad_norm": 5.704897403717041, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.3028, + "step": 3120 + }, + { + "epoch": 0.6244, + "grad_norm": 8.934288024902344, + "learning_rate": 1.965381638833274e-05, + "loss": 0.3881, + "step": 3122 + }, + { + "epoch": 0.6248, + "grad_norm": 9.705379486083984, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.4417, + "step": 3124 + }, + { + "epoch": 0.6252, + "grad_norm": 5.057989120483398, + "learning_rate": 1.96610628066429e-05, + "loss": 0.2193, + "step": 3126 + }, + { + "epoch": 0.6256, + "grad_norm": 4.561992645263672, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.4112, + "step": 3128 + }, + { + "epoch": 0.626, + "grad_norm": 14.311212539672852, + "learning_rate": 1.9668233886044594e-05, + "loss": 0.5477, + "step": 3130 + }, + { + "epoch": 0.6264, + "grad_norm": 6.972470760345459, + "learning_rate": 1.967179115615633e-05, + "loss": 0.3475, + "step": 3132 + }, + { + "epoch": 0.6268, + "grad_norm": 4.312015533447266, + "learning_rate": 1.96753295706163e-05, + "loss": 0.2263, + "step": 3134 + }, + { + "epoch": 0.6272, + "grad_norm": 16.607738494873047, + "learning_rate": 1.967884912252619e-05, + "loss": 0.6575, + "step": 3136 + }, + { + "epoch": 0.6276, + "grad_norm": 6.746756076812744, + "learning_rate": 1.9682349805024443e-05, + "loss": 0.3232, + "step": 3138 + }, + { + "epoch": 0.628, + "grad_norm": 3.9692506790161133, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.4228, + "step": 3140 + }, + { + "epoch": 0.6284, + "grad_norm": 2.6029999256134033, + "learning_rate": 1.9689294534523833e-05, + "loss": 0.3338, + "step": 3142 + }, + { + "epoch": 0.6288, + "grad_norm": 7.929990768432617, + "learning_rate": 1.969273856798585e-05, + "loss": 0.4461, + "step": 3144 + }, + { + "epoch": 0.6292, + "grad_norm": 5.24360466003418, + "learning_rate": 1.969616370495806e-05, + "loss": 0.3093, + "step": 3146 + }, + { + "epoch": 0.6296, + "grad_norm": 2.0615146160125732, + "learning_rate": 1.9699569938762972e-05, + "loss": 0.3563, + "step": 3148 + }, + { + "epoch": 0.63, + "grad_norm": 10.891960144042969, + "learning_rate": 1.9702957262759964e-05, + "loss": 0.903, + "step": 3150 + }, + { + "epoch": 0.6304, + "grad_norm": 0.33485591411590576, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.971, + "step": 3152 + }, + { + "epoch": 0.6308, + "grad_norm": 2.010587692260742, + "learning_rate": 1.9709675154952013e-05, + "loss": 0.2147, + "step": 3154 + }, + { + "epoch": 0.6312, + "grad_norm": 2.3025755882263184, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.2219, + "step": 3156 + }, + { + "epoch": 0.6316, + "grad_norm": 2.1327977180480957, + "learning_rate": 1.971631732914674e-05, + "loss": 0.3991, + "step": 3158 + }, + { + "epoch": 0.632, + "grad_norm": 10.648874282836914, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.5683, + "step": 3160 + }, + { + "epoch": 0.6324, + "grad_norm": 1.8799753189086914, + "learning_rate": 1.9722883733547128e-05, + "loss": 0.3053, + "step": 3162 + }, + { + "epoch": 0.6328, + "grad_norm": 4.276743412017822, + "learning_rate": 1.9726138506049434e-05, + "loss": 0.5778, + "step": 3164 + }, + { + "epoch": 0.6332, + "grad_norm": 3.56365704536438, + "learning_rate": 1.972937431694704e-05, + "loss": 0.2665, + "step": 3166 + }, + { + "epoch": 0.6336, + "grad_norm": 5.996713161468506, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.4628, + "step": 3168 + }, + { + "epoch": 0.634, + "grad_norm": 8.690179824829102, + "learning_rate": 1.9735789028731603e-05, + "loss": 0.5138, + "step": 3170 + }, + { + "epoch": 0.6344, + "grad_norm": 7.665664196014404, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.4615, + "step": 3172 + }, + { + "epoch": 0.6348, + "grad_norm": 4.159283638000488, + "learning_rate": 1.9742127818877605e-05, + "loss": 0.4273, + "step": 3174 + }, + { + "epoch": 0.6352, + "grad_norm": 3.0031960010528564, + "learning_rate": 1.974526872786577e-05, + "loss": 0.3135, + "step": 3176 + }, + { + "epoch": 0.6356, + "grad_norm": 3.7548129558563232, + "learning_rate": 1.974839063795389e-05, + "loss": 0.39, + "step": 3178 + }, + { + "epoch": 0.636, + "grad_norm": 8.917691230773926, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.5249, + "step": 3180 + }, + { + "epoch": 0.6364, + "grad_norm": 14.002290725708008, + "learning_rate": 1.975457743712173e-05, + "loss": 0.4813, + "step": 3182 + }, + { + "epoch": 0.6368, + "grad_norm": 7.703248500823975, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.521, + "step": 3184 + }, + { + "epoch": 0.6372, + "grad_norm": 6.956558704376221, + "learning_rate": 1.976068816813523e-05, + "loss": 0.3237, + "step": 3186 + }, + { + "epoch": 0.6376, + "grad_norm": 10.512991905212402, + "learning_rate": 1.976371499316945e-05, + "loss": 0.5157, + "step": 3188 + }, + { + "epoch": 0.638, + "grad_norm": 3.2512404918670654, + "learning_rate": 1.9766722783341675e-05, + "loss": 0.2475, + "step": 3190 + }, + { + "epoch": 0.6384, + "grad_norm": 11.137414932250977, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.5016, + "step": 3192 + }, + { + "epoch": 0.6388, + "grad_norm": 10.613699913024902, + "learning_rate": 1.9772681235681933e-05, + "loss": 0.4015, + "step": 3194 + }, + { + "epoch": 0.6392, + "grad_norm": 2.551306962966919, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.2175, + "step": 3196 + }, + { + "epoch": 0.6396, + "grad_norm": 2.5975027084350586, + "learning_rate": 1.977856347869079e-05, + "loss": 0.5261, + "step": 3198 + }, + { + "epoch": 0.64, + "grad_norm": 2.5470478534698486, + "learning_rate": 1.9781476007338054e-05, + "loss": 0.4481, + "step": 3200 + }, + { + "epoch": 0.6404, + "grad_norm": 5.174435615539551, + "learning_rate": 1.9784369466497333e-05, + "loss": 0.5611, + "step": 3202 + }, + { + "epoch": 0.6408, + "grad_norm": 4.254577159881592, + "learning_rate": 1.978724385052766e-05, + "loss": 0.7999, + "step": 3204 + }, + { + "epoch": 0.6412, + "grad_norm": 2.8448843955993652, + "learning_rate": 1.97900991538253e-05, + "loss": 0.4633, + "step": 3206 + }, + { + "epoch": 0.6416, + "grad_norm": 1.9862366914749146, + "learning_rate": 1.9792935370823673e-05, + "loss": 0.3331, + "step": 3208 + }, + { + "epoch": 0.642, + "grad_norm": 3.8458147048950195, + "learning_rate": 1.979575249599344e-05, + "loss": 0.1303, + "step": 3210 + }, + { + "epoch": 0.6424, + "grad_norm": 11.724701881408691, + "learning_rate": 1.979855052384247e-05, + "loss": 0.7221, + "step": 3212 + }, + { + "epoch": 0.6428, + "grad_norm": 8.610001564025879, + "learning_rate": 1.980132944891586e-05, + "loss": 0.6508, + "step": 3214 + }, + { + "epoch": 0.6432, + "grad_norm": 5.677703857421875, + "learning_rate": 1.9804089265795956e-05, + "loss": 0.4542, + "step": 3216 + }, + { + "epoch": 0.6436, + "grad_norm": 8.357645988464355, + "learning_rate": 1.9806829969102356e-05, + "loss": 0.6201, + "step": 3218 + }, + { + "epoch": 0.644, + "grad_norm": 3.2478199005126953, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.3235, + "step": 3220 + }, + { + "epoch": 0.6444, + "grad_norm": 2.291588068008423, + "learning_rate": 1.981225401365877e-05, + "loss": 0.6255, + "step": 3222 + }, + { + "epoch": 0.6448, + "grad_norm": 5.643817901611328, + "learning_rate": 1.981493734433433e-05, + "loss": 0.4108, + "step": 3224 + }, + { + "epoch": 0.6452, + "grad_norm": 4.044252872467041, + "learning_rate": 1.981760154028731e-05, + "loss": 0.3124, + "step": 3226 + }, + { + "epoch": 0.6456, + "grad_norm": 1.6267731189727783, + "learning_rate": 1.982024659632372e-05, + "loss": 0.1663, + "step": 3228 + }, + { + "epoch": 0.646, + "grad_norm": 1.4533847570419312, + "learning_rate": 1.9822872507286887e-05, + "loss": 0.1913, + "step": 3230 + }, + { + "epoch": 0.6464, + "grad_norm": 2.0648794174194336, + "learning_rate": 1.9825479268057472e-05, + "loss": 0.1471, + "step": 3232 + }, + { + "epoch": 0.6468, + "grad_norm": 4.256410598754883, + "learning_rate": 1.9828066873553445e-05, + "loss": 0.585, + "step": 3234 + }, + { + "epoch": 0.6472, + "grad_norm": 8.639936447143555, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.4465, + "step": 3236 + }, + { + "epoch": 0.6476, + "grad_norm": 3.739392042160034, + "learning_rate": 1.983318459858028e-05, + "loss": 0.4726, + "step": 3238 + }, + { + "epoch": 0.648, + "grad_norm": 9.779509544372559, + "learning_rate": 1.9835714708133858e-05, + "loss": 0.5151, + "step": 3240 + }, + { + "epoch": 0.6484, + "grad_norm": 3.801658868789673, + "learning_rate": 1.983822564245833e-05, + "loss": 0.3266, + "step": 3242 + }, + { + "epoch": 0.6488, + "grad_norm": 4.0018534660339355, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.5853, + "step": 3244 + }, + { + "epoch": 0.6492, + "grad_norm": 2.49670672416687, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.1894, + "step": 3246 + }, + { + "epoch": 0.6496, + "grad_norm": 3.4306211471557617, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.3926, + "step": 3248 + }, + { + "epoch": 0.65, + "grad_norm": 7.109899044036865, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.3728, + "step": 3250 + }, + { + "epoch": 0.6504, + "grad_norm": 5.702610492706299, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.7132, + "step": 3252 + }, + { + "epoch": 0.6508, + "grad_norm": 2.991964340209961, + "learning_rate": 1.9852888297080785e-05, + "loss": 0.2793, + "step": 3254 + }, + { + "epoch": 0.6512, + "grad_norm": 5.826408386230469, + "learning_rate": 1.985526486983063e-05, + "loss": 0.3929, + "step": 3256 + }, + { + "epoch": 0.6516, + "grad_norm": 4.283285617828369, + "learning_rate": 1.9857622229237315e-05, + "loss": 0.385, + "step": 3258 + }, + { + "epoch": 0.652, + "grad_norm": 6.173493385314941, + "learning_rate": 1.985996037070505e-05, + "loss": 0.416, + "step": 3260 + }, + { + "epoch": 0.6524, + "grad_norm": 10.006912231445312, + "learning_rate": 1.986227928967551e-05, + "loss": 0.2918, + "step": 3262 + }, + { + "epoch": 0.6528, + "grad_norm": 5.0178422927856445, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.2637, + "step": 3264 + }, + { + "epoch": 0.6532, + "grad_norm": 3.1747500896453857, + "learning_rate": 1.986685944207868e-05, + "loss": 0.1355, + "step": 3266 + }, + { + "epoch": 0.6536, + "grad_norm": 1.893709659576416, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.3169, + "step": 3268 + }, + { + "epoch": 0.654, + "grad_norm": 5.65559196472168, + "learning_rate": 1.9871362650729877e-05, + "loss": 0.2553, + "step": 3270 + }, + { + "epoch": 0.6544, + "grad_norm": 14.145685195922852, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.5728, + "step": 3272 + }, + { + "epoch": 0.6548, + "grad_norm": 8.569982528686523, + "learning_rate": 1.9875788880512183e-05, + "loss": 0.4924, + "step": 3274 + }, + { + "epoch": 0.6552, + "grad_norm": 2.566817045211792, + "learning_rate": 1.987797311751759e-05, + "loss": 0.3749, + "step": 3276 + }, + { + "epoch": 0.6556, + "grad_norm": 2.836402416229248, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.2923, + "step": 3278 + }, + { + "epoch": 0.656, + "grad_norm": 3.7662458419799805, + "learning_rate": 1.9882283814465528e-05, + "loss": 0.4209, + "step": 3280 + }, + { + "epoch": 0.6564, + "grad_norm": 10.232479095458984, + "learning_rate": 1.9884410266004134e-05, + "loss": 0.8474, + "step": 3282 + }, + { + "epoch": 0.6568, + "grad_norm": 2.137160539627075, + "learning_rate": 1.988651744737914e-05, + "loss": 0.3218, + "step": 3284 + }, + { + "epoch": 0.6572, + "grad_norm": 5.720272541046143, + "learning_rate": 1.9888605354482494e-05, + "loss": 0.2544, + "step": 3286 + }, + { + "epoch": 0.6576, + "grad_norm": 2.113636016845703, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.3581, + "step": 3288 + }, + { + "epoch": 0.658, + "grad_norm": 5.369784355163574, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.4091, + "step": 3290 + }, + { + "epoch": 0.6584, + "grad_norm": 13.643994331359863, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.6365, + "step": 3292 + }, + { + "epoch": 0.6588, + "grad_norm": 6.548267841339111, + "learning_rate": 1.989676415933351e-05, + "loss": 0.4444, + "step": 3294 + }, + { + "epoch": 0.6592, + "grad_norm": 3.1866977214813232, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.4597, + "step": 3296 + }, + { + "epoch": 0.6596, + "grad_norm": 2.4113199710845947, + "learning_rate": 1.9900727812082174e-05, + "loss": 0.3111, + "step": 3298 + }, + { + "epoch": 0.66, + "grad_norm": 4.112249851226807, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.2816, + "step": 3300 + }, + { + "epoch": 0.6604, + "grad_norm": 7.497649669647217, + "learning_rate": 1.9904614256966514e-05, + "loss": 0.2885, + "step": 3302 + }, + { + "epoch": 0.6608, + "grad_norm": 14.712210655212402, + "learning_rate": 1.9906528516965014e-05, + "loss": 0.7162, + "step": 3304 + }, + { + "epoch": 0.6612, + "grad_norm": 9.239347457885742, + "learning_rate": 1.9908423463679246e-05, + "loss": 0.4701, + "step": 3306 + }, + { + "epoch": 0.6616, + "grad_norm": 8.642061233520508, + "learning_rate": 1.9910299093414926e-05, + "loss": 0.7518, + "step": 3308 + }, + { + "epoch": 0.662, + "grad_norm": 2.8765437602996826, + "learning_rate": 1.991215540251542e-05, + "loss": 0.1465, + "step": 3310 + }, + { + "epoch": 0.6624, + "grad_norm": 3.2499144077301025, + "learning_rate": 1.9913992387361744e-05, + "loss": 0.3812, + "step": 3312 + }, + { + "epoch": 0.6628, + "grad_norm": 3.504441738128662, + "learning_rate": 1.9915810044372618e-05, + "loss": 0.3685, + "step": 3314 + }, + { + "epoch": 0.6632, + "grad_norm": 5.392419815063477, + "learning_rate": 1.9917608370004414e-05, + "loss": 0.3893, + "step": 3316 + }, + { + "epoch": 0.6636, + "grad_norm": 2.1346049308776855, + "learning_rate": 1.9919387360751216e-05, + "loss": 0.1748, + "step": 3318 + }, + { + "epoch": 0.664, + "grad_norm": 5.841980457305908, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.3305, + "step": 3320 + }, + { + "epoch": 0.6644, + "grad_norm": 6.3239426612854, + "learning_rate": 1.992288732375458e-05, + "loss": 0.1519, + "step": 3322 + }, + { + "epoch": 0.6648, + "grad_norm": 6.545201301574707, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.3747, + "step": 3324 + }, + { + "epoch": 0.6652, + "grad_norm": 7.848917484283447, + "learning_rate": 1.992630990608929e-05, + "loss": 0.4812, + "step": 3326 + }, + { + "epoch": 0.6656, + "grad_norm": 2.9980220794677734, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.5205, + "step": 3328 + }, + { + "epoch": 0.666, + "grad_norm": 5.332727432250977, + "learning_rate": 1.992965508106537e-05, + "loss": 0.3199, + "step": 3330 + }, + { + "epoch": 0.6664, + "grad_norm": 5.690469741821289, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.9574, + "step": 3332 + }, + { + "epoch": 0.6668, + "grad_norm": 10.744891166687012, + "learning_rate": 1.993292282259647e-05, + "loss": 0.5954, + "step": 3334 + }, + { + "epoch": 0.6672, + "grad_norm": 12.637776374816895, + "learning_rate": 1.9934527647833276e-05, + "loss": 1.0196, + "step": 3336 + }, + { + "epoch": 0.6676, + "grad_norm": 6.392087459564209, + "learning_rate": 1.9936113105200085e-05, + "loss": 0.4733, + "step": 3338 + }, + { + "epoch": 0.668, + "grad_norm": 9.777617454528809, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.6663, + "step": 3340 + }, + { + "epoch": 0.6684, + "grad_norm": 1.4254674911499023, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.1491, + "step": 3342 + }, + { + "epoch": 0.6688, + "grad_norm": 8.21716022491455, + "learning_rate": 1.9940753239360047e-05, + "loss": 1.1687, + "step": 3344 + }, + { + "epoch": 0.6692, + "grad_norm": 2.6545028686523438, + "learning_rate": 1.9942261194715236e-05, + "loss": 0.2862, + "step": 3346 + }, + { + "epoch": 0.6696, + "grad_norm": 4.042023658752441, + "learning_rate": 1.994374976712348e-05, + "loss": 0.2299, + "step": 3348 + }, + { + "epoch": 0.67, + "grad_norm": 5.448087692260742, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.6379, + "step": 3350 + }, + { + "epoch": 0.6704, + "grad_norm": 2.3556456565856934, + "learning_rate": 1.994666875152874e-05, + "loss": 0.4983, + "step": 3352 + }, + { + "epoch": 0.6708, + "grad_norm": 2.957632303237915, + "learning_rate": 1.994809915783505e-05, + "loss": 0.3161, + "step": 3354 + }, + { + "epoch": 0.6712, + "grad_norm": 3.5552966594696045, + "learning_rate": 1.9949510169813003e-05, + "loss": 0.4043, + "step": 3356 + }, + { + "epoch": 0.6716, + "grad_norm": 5.343554973602295, + "learning_rate": 1.9950901784711768e-05, + "loss": 0.3703, + "step": 3358 + }, + { + "epoch": 0.672, + "grad_norm": 6.1362996101379395, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.2948, + "step": 3360 + }, + { + "epoch": 0.6724, + "grad_norm": 3.9552671909332275, + "learning_rate": 1.995362681245744e-05, + "loss": 0.457, + "step": 3362 + }, + { + "epoch": 0.6728, + "grad_norm": 2.088674783706665, + "learning_rate": 1.995496021999177e-05, + "loss": 0.3356, + "step": 3364 + }, + { + "epoch": 0.6732, + "grad_norm": 5.352585792541504, + "learning_rate": 1.995627421982176e-05, + "loss": 0.5591, + "step": 3366 + }, + { + "epoch": 0.6736, + "grad_norm": 3.560239553451538, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.3455, + "step": 3368 + }, + { + "epoch": 0.674, + "grad_norm": 3.9426498413085938, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.346, + "step": 3370 + }, + { + "epoch": 0.6744, + "grad_norm": 1.591428518295288, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.105, + "step": 3372 + }, + { + "epoch": 0.6748, + "grad_norm": 5.460711479187012, + "learning_rate": 1.9961336091431725e-05, + "loss": 0.3721, + "step": 3374 + }, + { + "epoch": 0.6752, + "grad_norm": 3.2586982250213623, + "learning_rate": 1.996255301507125e-05, + "loss": 1.0141, + "step": 3376 + }, + { + "epoch": 0.6756, + "grad_norm": 8.323261260986328, + "learning_rate": 1.9963750516203884e-05, + "loss": 0.5257, + "step": 3378 + }, + { + "epoch": 0.676, + "grad_norm": 4.053325176239014, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.471, + "step": 3380 + }, + { + "epoch": 0.6764, + "grad_norm": 3.110297679901123, + "learning_rate": 1.996608724164801e-05, + "loss": 0.4921, + "step": 3382 + }, + { + "epoch": 0.6768, + "grad_norm": 2.659935235977173, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.3025, + "step": 3384 + }, + { + "epoch": 0.6772, + "grad_norm": 2.215158462524414, + "learning_rate": 1.9968346249541848e-05, + "loss": 0.1692, + "step": 3386 + }, + { + "epoch": 0.6776, + "grad_norm": 1.7267787456512451, + "learning_rate": 1.996944660387867e-05, + "loss": 0.3445, + "step": 3388 + }, + { + "epoch": 0.678, + "grad_norm": 3.6269516944885254, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.3759, + "step": 3390 + }, + { + "epoch": 0.6784, + "grad_norm": 3.694133758544922, + "learning_rate": 1.997158900260614e-05, + "loss": 0.3583, + "step": 3392 + }, + { + "epoch": 0.6788, + "grad_norm": 4.115728378295898, + "learning_rate": 1.997263104282007e-05, + "loss": 0.2993, + "step": 3394 + }, + { + "epoch": 0.6792, + "grad_norm": 1.4272043704986572, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.2025, + "step": 3396 + }, + { + "epoch": 0.6796, + "grad_norm": 5.1589555740356445, + "learning_rate": 1.9974656794790777e-05, + "loss": 0.381, + "step": 3398 + }, + { + "epoch": 0.68, + "grad_norm": 5.49625825881958, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.4029, + "step": 3400 + }, + { + "epoch": 0.6804, + "grad_norm": 8.302574157714844, + "learning_rate": 1.99766047623841e-05, + "loss": 0.4429, + "step": 3402 + }, + { + "epoch": 0.6808, + "grad_norm": 7.695592403411865, + "learning_rate": 1.997754957226847e-05, + "loss": 0.2849, + "step": 3404 + }, + { + "epoch": 0.6812, + "grad_norm": 4.160185813903809, + "learning_rate": 1.9978474930409396e-05, + "loss": 0.2635, + "step": 3406 + }, + { + "epoch": 0.6816, + "grad_norm": 10.817899703979492, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.7273, + "step": 3408 + }, + { + "epoch": 0.682, + "grad_norm": 5.825295448303223, + "learning_rate": 1.9980267284282718e-05, + "loss": 0.5902, + "step": 3410 + }, + { + "epoch": 0.6824, + "grad_norm": 10.700189590454102, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.5071, + "step": 3412 + }, + { + "epoch": 0.6828, + "grad_norm": 7.277803897857666, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.4996, + "step": 3414 + }, + { + "epoch": 0.6832, + "grad_norm": 2.539660930633545, + "learning_rate": 1.998280988314872e-05, + "loss": 1.1212, + "step": 3416 + }, + { + "epoch": 0.6836, + "grad_norm": 4.808956146240234, + "learning_rate": 1.9983618494271825e-05, + "loss": 0.2962, + "step": 3418 + }, + { + "epoch": 0.684, + "grad_norm": 4.493982315063477, + "learning_rate": 1.998440764181981e-05, + "loss": 0.2222, + "step": 3420 + }, + { + "epoch": 0.6844, + "grad_norm": 8.70090389251709, + "learning_rate": 1.99851773242542e-05, + "loss": 0.5724, + "step": 3422 + }, + { + "epoch": 0.6848, + "grad_norm": 2.4075520038604736, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.3371, + "step": 3424 + }, + { + "epoch": 0.6852, + "grad_norm": 6.775694847106934, + "learning_rate": 1.9986658287817992e-05, + "loss": 0.2769, + "step": 3426 + }, + { + "epoch": 0.6856, + "grad_norm": 5.050171375274658, + "learning_rate": 1.998736956606018e-05, + "loss": 0.4178, + "step": 3428 + }, + { + "epoch": 0.686, + "grad_norm": 7.41837739944458, + "learning_rate": 1.9988061373414342e-05, + "loss": 0.7468, + "step": 3430 + }, + { + "epoch": 0.6864, + "grad_norm": 5.296545505523682, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.2997, + "step": 3432 + }, + { + "epoch": 0.6868, + "grad_norm": 8.020288467407227, + "learning_rate": 1.9989386570101712e-05, + "loss": 0.4445, + "step": 3434 + }, + { + "epoch": 0.6872, + "grad_norm": 16.026287078857422, + "learning_rate": 1.9990019956851384e-05, + "loss": 1.0234, + "step": 3436 + }, + { + "epoch": 0.6876, + "grad_norm": 1.81304931640625, + "learning_rate": 1.9990633867545956e-05, + "loss": 0.5548, + "step": 3438 + }, + { + "epoch": 0.688, + "grad_norm": 5.990199565887451, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.2147, + "step": 3440 + }, + { + "epoch": 0.6884, + "grad_norm": 10.723930358886719, + "learning_rate": 1.9991803256020393e-05, + "loss": 0.9827, + "step": 3442 + }, + { + "epoch": 0.6888, + "grad_norm": 6.198962688446045, + "learning_rate": 1.999235873152047e-05, + "loss": 0.376, + "step": 3444 + }, + { + "epoch": 0.6892, + "grad_norm": 1.5251423120498657, + "learning_rate": 1.9992894726405894e-05, + "loss": 0.3056, + "step": 3446 + }, + { + "epoch": 0.6896, + "grad_norm": 9.797451972961426, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.8322, + "step": 3448 + }, + { + "epoch": 0.69, + "grad_norm": 1.5054662227630615, + "learning_rate": 1.999390827019096e-05, + "loss": 0.3868, + "step": 3450 + }, + { + "epoch": 0.6904, + "grad_norm": 9.541515350341797, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.4202, + "step": 3452 + }, + { + "epoch": 0.6908, + "grad_norm": 18.33040428161621, + "learning_rate": 1.999484387947177e-05, + "loss": 0.6865, + "step": 3454 + }, + { + "epoch": 0.6912, + "grad_norm": 2.5486531257629395, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.4185, + "step": 3456 + }, + { + "epoch": 0.6916, + "grad_norm": 7.150258541107178, + "learning_rate": 1.9995701546952252e-05, + "loss": 0.3857, + "step": 3458 + }, + { + "epoch": 0.692, + "grad_norm": 6.431832313537598, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.4744, + "step": 3460 + }, + { + "epoch": 0.6924, + "grad_norm": 5.144509315490723, + "learning_rate": 1.9996481265944146e-05, + "loss": 0.2933, + "step": 3462 + }, + { + "epoch": 0.6928, + "grad_norm": 15.188258171081543, + "learning_rate": 1.9996841892833e-05, + "loss": 0.6614, + "step": 3464 + }, + { + "epoch": 0.6932, + "grad_norm": 5.353475093841553, + "learning_rate": 1.999718303036705e-05, + "loss": 0.3111, + "step": 3466 + }, + { + "epoch": 0.6936, + "grad_norm": 6.038778305053711, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.3755, + "step": 3468 + }, + { + "epoch": 0.694, + "grad_norm": 5.05306339263916, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.2834, + "step": 3470 + }, + { + "epoch": 0.6944, + "grad_norm": 1.6363768577575684, + "learning_rate": 1.999808950037968e-05, + "loss": 0.3347, + "step": 3472 + }, + { + "epoch": 0.6948, + "grad_norm": 2.225212812423706, + "learning_rate": 1.9998352674223816e-05, + "loss": 0.4799, + "step": 3474 + }, + { + "epoch": 0.6952, + "grad_norm": 5.458343505859375, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.3883, + "step": 3476 + }, + { + "epoch": 0.6956, + "grad_norm": 6.942868709564209, + "learning_rate": 1.999882054453657e-05, + "loss": 0.2428, + "step": 3478 + }, + { + "epoch": 0.696, + "grad_norm": 2.133248805999756, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.3841, + "step": 3480 + }, + { + "epoch": 0.6964, + "grad_norm": 2.539342164993286, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.2253, + "step": 3482 + }, + { + "epoch": 0.6968, + "grad_norm": 3.6325161457061768, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.3533, + "step": 3484 + }, + { + "epoch": 0.6972, + "grad_norm": 1.7929434776306152, + "learning_rate": 1.99995223636881e-05, + "loss": 0.5292, + "step": 3486 + }, + { + "epoch": 0.6976, + "grad_norm": 4.525548458099365, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.322, + "step": 3488 + }, + { + "epoch": 0.698, + "grad_norm": 5.1221699714660645, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.5712, + "step": 3490 + }, + { + "epoch": 0.6984, + "grad_norm": 2.8722729682922363, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.5349, + "step": 3492 + }, + { + "epoch": 0.6988, + "grad_norm": 13.76803970336914, + "learning_rate": 1.9999912270311376e-05, + "loss": 1.1198, + "step": 3494 + }, + { + "epoch": 0.6992, + "grad_norm": 3.435701370239258, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.4871, + "step": 3496 + }, + { + "epoch": 0.6996, + "grad_norm": 8.523530960083008, + "learning_rate": 1.9999990252244153e-05, + "loss": 0.4664, + "step": 3498 + }, + { + "epoch": 0.7, + "grad_norm": 7.942747592926025, + "learning_rate": 2e-05, + "loss": 0.4786, + "step": 3500 + }, + { + "epoch": 0.7004, + "grad_norm": 2.5500338077545166, + "learning_rate": 1.9999990252244153e-05, + "loss": 0.205, + "step": 3502 + }, + { + "epoch": 0.7008, + "grad_norm": 4.129398822784424, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.4463, + "step": 3504 + }, + { + "epoch": 0.7012, + "grad_norm": 4.136841297149658, + "learning_rate": 1.9999912270311376e-05, + "loss": 0.4992, + "step": 3506 + }, + { + "epoch": 0.7016, + "grad_norm": 2.4773693084716797, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.19, + "step": 3508 + }, + { + "epoch": 0.702, + "grad_norm": 3.9615838527679443, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.5053, + "step": 3510 + }, + { + "epoch": 0.7024, + "grad_norm": 5.021847248077393, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.3073, + "step": 3512 + }, + { + "epoch": 0.7028, + "grad_norm": 3.37892484664917, + "learning_rate": 1.99995223636881e-05, + "loss": 0.2005, + "step": 3514 + }, + { + "epoch": 0.7032, + "grad_norm": 1.4871156215667725, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.4595, + "step": 3516 + }, + { + "epoch": 0.7036, + "grad_norm": 23.657522201538086, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.6753, + "step": 3518 + }, + { + "epoch": 0.704, + "grad_norm": 10.764266967773438, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.407, + "step": 3520 + }, + { + "epoch": 0.7044, + "grad_norm": 3.236546039581299, + "learning_rate": 1.999882054453657e-05, + "loss": 0.7471, + "step": 3522 + }, + { + "epoch": 0.7048, + "grad_norm": 12.486367225646973, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.4138, + "step": 3524 + }, + { + "epoch": 0.7052, + "grad_norm": 9.665937423706055, + "learning_rate": 1.9998352674223816e-05, + "loss": 0.5561, + "step": 3526 + }, + { + "epoch": 0.7056, + "grad_norm": 3.5501019954681396, + "learning_rate": 1.999808950037968e-05, + "loss": 0.4414, + "step": 3528 + }, + { + "epoch": 0.706, + "grad_norm": 4.413691520690918, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.2405, + "step": 3530 + }, + { + "epoch": 0.7064, + "grad_norm": 2.0691123008728027, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.3745, + "step": 3532 + }, + { + "epoch": 0.7068, + "grad_norm": 2.187366247177124, + "learning_rate": 1.999718303036705e-05, + "loss": 0.3097, + "step": 3534 + }, + { + "epoch": 0.7072, + "grad_norm": 2.016996145248413, + "learning_rate": 1.9996841892833e-05, + "loss": 0.2233, + "step": 3536 + }, + { + "epoch": 0.7076, + "grad_norm": 2.407163381576538, + "learning_rate": 1.9996481265944146e-05, + "loss": 0.4251, + "step": 3538 + }, + { + "epoch": 0.708, + "grad_norm": 1.444625735282898, + "learning_rate": 1.9996101150403547e-05, + "loss": 0.44, + "step": 3540 + }, + { + "epoch": 0.7084, + "grad_norm": 6.20103645324707, + "learning_rate": 1.9995701546952252e-05, + "loss": 0.2223, + "step": 3542 + }, + { + "epoch": 0.7088, + "grad_norm": 13.445252418518066, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.7574, + "step": 3544 + }, + { + "epoch": 0.7092, + "grad_norm": 7.26235294342041, + "learning_rate": 1.9994843879471766e-05, + "loss": 0.6035, + "step": 3546 + }, + { + "epoch": 0.7096, + "grad_norm": 1.9317138195037842, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.3194, + "step": 3548 + }, + { + "epoch": 0.71, + "grad_norm": 5.026371955871582, + "learning_rate": 1.999390827019096e-05, + "loss": 0.3004, + "step": 3550 + }, + { + "epoch": 0.7104, + "grad_norm": 2.1579346656799316, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.317, + "step": 3552 + }, + { + "epoch": 0.7108, + "grad_norm": 0.8029045462608337, + "learning_rate": 1.9992894726405898e-05, + "loss": 0.1311, + "step": 3554 + }, + { + "epoch": 0.7112, + "grad_norm": 2.295790672302246, + "learning_rate": 1.999235873152047e-05, + "loss": 0.3094, + "step": 3556 + }, + { + "epoch": 0.7116, + "grad_norm": 8.323273658752441, + "learning_rate": 1.9991803256020393e-05, + "loss": 0.6909, + "step": 3558 + }, + { + "epoch": 0.712, + "grad_norm": 9.267980575561523, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.4557, + "step": 3560 + }, + { + "epoch": 0.7124, + "grad_norm": 3.721503734588623, + "learning_rate": 1.9990633867545956e-05, + "loss": 0.4433, + "step": 3562 + }, + { + "epoch": 0.7128, + "grad_norm": 3.7162952423095703, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.3555, + "step": 3564 + }, + { + "epoch": 0.7132, + "grad_norm": 10.928511619567871, + "learning_rate": 1.9989386570101716e-05, + "loss": 0.4262, + "step": 3566 + }, + { + "epoch": 0.7136, + "grad_norm": 7.211308002471924, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.4891, + "step": 3568 + }, + { + "epoch": 0.714, + "grad_norm": 2.6626076698303223, + "learning_rate": 1.9988061373414342e-05, + "loss": 0.3144, + "step": 3570 + }, + { + "epoch": 0.7144, + "grad_norm": 9.969245910644531, + "learning_rate": 1.998736956606018e-05, + "loss": 0.4919, + "step": 3572 + }, + { + "epoch": 0.7148, + "grad_norm": 13.478972434997559, + "learning_rate": 1.998665828781799e-05, + "loss": 0.424, + "step": 3574 + }, + { + "epoch": 0.7152, + "grad_norm": 9.102906227111816, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.3476, + "step": 3576 + }, + { + "epoch": 0.7156, + "grad_norm": 4.581259727478027, + "learning_rate": 1.99851773242542e-05, + "loss": 0.3379, + "step": 3578 + }, + { + "epoch": 0.716, + "grad_norm": 12.655241966247559, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.6987, + "step": 3580 + }, + { + "epoch": 0.7164, + "grad_norm": 2.80825138092041, + "learning_rate": 1.9983618494271825e-05, + "loss": 0.3489, + "step": 3582 + }, + { + "epoch": 0.7168, + "grad_norm": 4.662900924682617, + "learning_rate": 1.998280988314872e-05, + "loss": 0.3832, + "step": 3584 + }, + { + "epoch": 0.7172, + "grad_norm": 8.003684997558594, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.2872, + "step": 3586 + }, + { + "epoch": 0.7176, + "grad_norm": 12.650956153869629, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.6204, + "step": 3588 + }, + { + "epoch": 0.718, + "grad_norm": 7.58662748336792, + "learning_rate": 1.9980267284282718e-05, + "loss": 0.293, + "step": 3590 + }, + { + "epoch": 0.7184, + "grad_norm": 9.500858306884766, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.6413, + "step": 3592 + }, + { + "epoch": 0.7188, + "grad_norm": 3.7807931900024414, + "learning_rate": 1.9978474930409396e-05, + "loss": 0.3209, + "step": 3594 + }, + { + "epoch": 0.7192, + "grad_norm": 7.064390659332275, + "learning_rate": 1.9977549572268467e-05, + "loss": 0.4657, + "step": 3596 + }, + { + "epoch": 0.7196, + "grad_norm": 5.230386734008789, + "learning_rate": 1.99766047623841e-05, + "loss": 0.4035, + "step": 3598 + }, + { + "epoch": 0.72, + "grad_norm": 1.235176682472229, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.2504, + "step": 3600 + }, + { + "epoch": 0.7204, + "grad_norm": 3.277496337890625, + "learning_rate": 1.9974656794790777e-05, + "loss": 0.242, + "step": 3602 + }, + { + "epoch": 0.7208, + "grad_norm": 4.693442344665527, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.3288, + "step": 3604 + }, + { + "epoch": 0.7212, + "grad_norm": 5.493111610412598, + "learning_rate": 1.9972631042820074e-05, + "loss": 0.5798, + "step": 3606 + }, + { + "epoch": 0.7216, + "grad_norm": 6.306934833526611, + "learning_rate": 1.997158900260614e-05, + "loss": 0.3321, + "step": 3608 + }, + { + "epoch": 0.722, + "grad_norm": 2.0286011695861816, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.3724, + "step": 3610 + }, + { + "epoch": 0.7224, + "grad_norm": 2.2418179512023926, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.2779, + "step": 3612 + }, + { + "epoch": 0.7228, + "grad_norm": 5.586213111877441, + "learning_rate": 1.9968346249541848e-05, + "loss": 0.5281, + "step": 3614 + }, + { + "epoch": 0.7232, + "grad_norm": 2.2049245834350586, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.3617, + "step": 3616 + }, + { + "epoch": 0.7236, + "grad_norm": 11.103778839111328, + "learning_rate": 1.996608724164801e-05, + "loss": 0.4774, + "step": 3618 + }, + { + "epoch": 0.724, + "grad_norm": 2.010507583618164, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.3276, + "step": 3620 + }, + { + "epoch": 0.7244, + "grad_norm": 1.2898534536361694, + "learning_rate": 1.9963750516203887e-05, + "loss": 0.1807, + "step": 3622 + }, + { + "epoch": 0.7248, + "grad_norm": 2.726452350616455, + "learning_rate": 1.996255301507125e-05, + "loss": 0.4752, + "step": 3624 + }, + { + "epoch": 0.7252, + "grad_norm": 12.064026832580566, + "learning_rate": 1.9961336091431728e-05, + "loss": 0.7147, + "step": 3626 + }, + { + "epoch": 0.7256, + "grad_norm": 13.410955429077148, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.7593, + "step": 3628 + }, + { + "epoch": 0.726, + "grad_norm": 8.114355087280273, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.612, + "step": 3630 + }, + { + "epoch": 0.7264, + "grad_norm": 1.9206873178482056, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.2962, + "step": 3632 + }, + { + "epoch": 0.7268, + "grad_norm": 3.52126407623291, + "learning_rate": 1.995627421982176e-05, + "loss": 0.3542, + "step": 3634 + }, + { + "epoch": 0.7272, + "grad_norm": 4.258106708526611, + "learning_rate": 1.995496021999177e-05, + "loss": 0.4059, + "step": 3636 + }, + { + "epoch": 0.7276, + "grad_norm": 6.886314392089844, + "learning_rate": 1.995362681245744e-05, + "loss": 0.9854, + "step": 3638 + }, + { + "epoch": 0.728, + "grad_norm": 3.398439407348633, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.4624, + "step": 3640 + }, + { + "epoch": 0.7284, + "grad_norm": 2.7597665786743164, + "learning_rate": 1.9950901784711768e-05, + "loss": 0.6062, + "step": 3642 + }, + { + "epoch": 0.7288, + "grad_norm": 9.773767471313477, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.5045, + "step": 3644 + }, + { + "epoch": 0.7292, + "grad_norm": 6.13499116897583, + "learning_rate": 1.994809915783505e-05, + "loss": 0.273, + "step": 3646 + }, + { + "epoch": 0.7296, + "grad_norm": 6.435482025146484, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.1548, + "step": 3648 + }, + { + "epoch": 0.73, + "grad_norm": 7.9271087646484375, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.3332, + "step": 3650 + }, + { + "epoch": 0.7304, + "grad_norm": 14.060882568359375, + "learning_rate": 1.994374976712348e-05, + "loss": 1.1185, + "step": 3652 + }, + { + "epoch": 0.7308, + "grad_norm": 3.1541588306427, + "learning_rate": 1.9942261194715236e-05, + "loss": 0.406, + "step": 3654 + }, + { + "epoch": 0.7312, + "grad_norm": 3.413231134414673, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.3712, + "step": 3656 + }, + { + "epoch": 0.7316, + "grad_norm": 6.356657028198242, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.5837, + "step": 3658 + }, + { + "epoch": 0.732, + "grad_norm": 6.027167320251465, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.6012, + "step": 3660 + }, + { + "epoch": 0.7324, + "grad_norm": 2.561721086502075, + "learning_rate": 1.993611310520009e-05, + "loss": 0.552, + "step": 3662 + }, + { + "epoch": 0.7328, + "grad_norm": 2.0813450813293457, + "learning_rate": 1.993452764783328e-05, + "loss": 0.1867, + "step": 3664 + }, + { + "epoch": 0.7332, + "grad_norm": 1.8736661672592163, + "learning_rate": 1.993292282259647e-05, + "loss": 0.243, + "step": 3666 + }, + { + "epoch": 0.7336, + "grad_norm": 2.6840455532073975, + "learning_rate": 1.9931298632618352e-05, + "loss": 0.5539, + "step": 3668 + }, + { + "epoch": 0.734, + "grad_norm": 1.4633709192276, + "learning_rate": 1.9929655081065373e-05, + "loss": 0.2762, + "step": 3670 + }, + { + "epoch": 0.7344, + "grad_norm": 2.1169073581695557, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.4432, + "step": 3672 + }, + { + "epoch": 0.7348, + "grad_norm": 3.0685110092163086, + "learning_rate": 1.992630990608929e-05, + "loss": 0.562, + "step": 3674 + }, + { + "epoch": 0.7352, + "grad_norm": 1.8942172527313232, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.426, + "step": 3676 + }, + { + "epoch": 0.7356, + "grad_norm": 6.416544437408447, + "learning_rate": 1.992288732375458e-05, + "loss": 0.4057, + "step": 3678 + }, + { + "epoch": 0.736, + "grad_norm": 4.812638759613037, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.5377, + "step": 3680 + }, + { + "epoch": 0.7364, + "grad_norm": 2.8500170707702637, + "learning_rate": 1.9919387360751216e-05, + "loss": 0.3874, + "step": 3682 + }, + { + "epoch": 0.7368, + "grad_norm": 3.06358003616333, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.2649, + "step": 3684 + }, + { + "epoch": 0.7372, + "grad_norm": 3.0255796909332275, + "learning_rate": 1.991581004437262e-05, + "loss": 0.1346, + "step": 3686 + }, + { + "epoch": 0.7376, + "grad_norm": 2.9722514152526855, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.341, + "step": 3688 + }, + { + "epoch": 0.738, + "grad_norm": 9.049528121948242, + "learning_rate": 1.991215540251542e-05, + "loss": 0.5876, + "step": 3690 + }, + { + "epoch": 0.7384, + "grad_norm": 5.564741611480713, + "learning_rate": 1.9910299093414932e-05, + "loss": 0.4087, + "step": 3692 + }, + { + "epoch": 0.7388, + "grad_norm": 7.150672912597656, + "learning_rate": 1.9908423463679246e-05, + "loss": 0.2485, + "step": 3694 + }, + { + "epoch": 0.7392, + "grad_norm": 16.783416748046875, + "learning_rate": 1.990652851696501e-05, + "loss": 0.4839, + "step": 3696 + }, + { + "epoch": 0.7396, + "grad_norm": 8.73369026184082, + "learning_rate": 1.9904614256966517e-05, + "loss": 0.7202, + "step": 3698 + }, + { + "epoch": 0.74, + "grad_norm": 2.2492752075195312, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.2913, + "step": 3700 + }, + { + "epoch": 0.7404, + "grad_norm": 2.119492292404175, + "learning_rate": 1.9900727812082177e-05, + "loss": 0.5556, + "step": 3702 + }, + { + "epoch": 0.7408, + "grad_norm": 1.9635225534439087, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.5029, + "step": 3704 + }, + { + "epoch": 0.7412, + "grad_norm": 7.054653644561768, + "learning_rate": 1.9896764159333518e-05, + "loss": 0.3475, + "step": 3706 + }, + { + "epoch": 0.7416, + "grad_norm": 6.106769561767578, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.5025, + "step": 3708 + }, + { + "epoch": 0.742, + "grad_norm": 3.469557285308838, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.1934, + "step": 3710 + }, + { + "epoch": 0.7424, + "grad_norm": 13.909452438354492, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.7978, + "step": 3712 + }, + { + "epoch": 0.7428, + "grad_norm": 3.2016549110412598, + "learning_rate": 1.9888605354482494e-05, + "loss": 0.6756, + "step": 3714 + }, + { + "epoch": 0.7432, + "grad_norm": 3.143984317779541, + "learning_rate": 1.9886517447379143e-05, + "loss": 0.2749, + "step": 3716 + }, + { + "epoch": 0.7436, + "grad_norm": 1.6351608037948608, + "learning_rate": 1.9884410266004134e-05, + "loss": 0.3545, + "step": 3718 + }, + { + "epoch": 0.744, + "grad_norm": 2.733872175216675, + "learning_rate": 1.988228381446553e-05, + "loss": 0.3372, + "step": 3720 + }, + { + "epoch": 0.7444, + "grad_norm": 2.857966184616089, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.2181, + "step": 3722 + }, + { + "epoch": 0.7448, + "grad_norm": 2.2954869270324707, + "learning_rate": 1.987797311751759e-05, + "loss": 0.2429, + "step": 3724 + }, + { + "epoch": 0.7452, + "grad_norm": 9.873763084411621, + "learning_rate": 1.9875788880512183e-05, + "loss": 0.5947, + "step": 3726 + }, + { + "epoch": 0.7456, + "grad_norm": 5.675375461578369, + "learning_rate": 1.9873585390151007e-05, + "loss": 0.3381, + "step": 3728 + }, + { + "epoch": 0.746, + "grad_norm": 3.3790171146392822, + "learning_rate": 1.987136265072988e-05, + "loss": 0.6207, + "step": 3730 + }, + { + "epoch": 0.7464, + "grad_norm": 4.3865814208984375, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.3359, + "step": 3732 + }, + { + "epoch": 0.7468, + "grad_norm": 2.261887788772583, + "learning_rate": 1.9866859442078685e-05, + "loss": 0.2763, + "step": 3734 + }, + { + "epoch": 0.7472, + "grad_norm": 1.5743178129196167, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.3803, + "step": 3736 + }, + { + "epoch": 0.7476, + "grad_norm": 3.3636131286621094, + "learning_rate": 1.986227928967551e-05, + "loss": 0.3748, + "step": 3738 + }, + { + "epoch": 0.748, + "grad_norm": 6.03649377822876, + "learning_rate": 1.985996037070505e-05, + "loss": 0.3707, + "step": 3740 + }, + { + "epoch": 0.7484, + "grad_norm": 14.077004432678223, + "learning_rate": 1.985762222923732e-05, + "loss": 0.7987, + "step": 3742 + }, + { + "epoch": 0.7488, + "grad_norm": 1.4182409048080444, + "learning_rate": 1.985526486983063e-05, + "loss": 0.1182, + "step": 3744 + }, + { + "epoch": 0.7492, + "grad_norm": 2.82759952545166, + "learning_rate": 1.985288829708079e-05, + "loss": 0.2848, + "step": 3746 + }, + { + "epoch": 0.7496, + "grad_norm": 15.667213439941406, + "learning_rate": 1.9850492515621038e-05, + "loss": 1.2345, + "step": 3748 + }, + { + "epoch": 0.75, + "grad_norm": 2.078486680984497, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.5098, + "step": 3750 + }, + { + "epoch": 0.7504, + "grad_norm": 3.379621744155884, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.2622, + "step": 3752 + }, + { + "epoch": 0.7508, + "grad_norm": 2.3987069129943848, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.2966, + "step": 3754 + }, + { + "epoch": 0.7512, + "grad_norm": 3.6162967681884766, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.535, + "step": 3756 + }, + { + "epoch": 0.7516, + "grad_norm": 11.369109153747559, + "learning_rate": 1.983822564245833e-05, + "loss": 0.2883, + "step": 3758 + }, + { + "epoch": 0.752, + "grad_norm": 7.834156513214111, + "learning_rate": 1.983571470813386e-05, + "loss": 0.4673, + "step": 3760 + }, + { + "epoch": 0.7524, + "grad_norm": 6.413270473480225, + "learning_rate": 1.9833184598580276e-05, + "loss": 0.5064, + "step": 3762 + }, + { + "epoch": 0.7528, + "grad_norm": 3.982877254486084, + "learning_rate": 1.983063531873016e-05, + "loss": 0.2502, + "step": 3764 + }, + { + "epoch": 0.7532, + "grad_norm": 2.470701217651367, + "learning_rate": 1.982806687355345e-05, + "loss": 0.4686, + "step": 3766 + }, + { + "epoch": 0.7536, + "grad_norm": 6.904505729675293, + "learning_rate": 1.982547926805747e-05, + "loss": 0.3792, + "step": 3768 + }, + { + "epoch": 0.754, + "grad_norm": 8.453192710876465, + "learning_rate": 1.982287250728689e-05, + "loss": 0.5924, + "step": 3770 + }, + { + "epoch": 0.7544, + "grad_norm": 4.557522296905518, + "learning_rate": 1.9820246596323724e-05, + "loss": 0.3507, + "step": 3772 + }, + { + "epoch": 0.7548, + "grad_norm": 4.191603660583496, + "learning_rate": 1.981760154028731e-05, + "loss": 0.4985, + "step": 3774 + }, + { + "epoch": 0.7552, + "grad_norm": 6.292893886566162, + "learning_rate": 1.981493734433433e-05, + "loss": 0.4046, + "step": 3776 + }, + { + "epoch": 0.7556, + "grad_norm": 2.5769972801208496, + "learning_rate": 1.9812254013658773e-05, + "loss": 0.3694, + "step": 3778 + }, + { + "epoch": 0.756, + "grad_norm": 4.791306972503662, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.2801, + "step": 3780 + }, + { + "epoch": 0.7564, + "grad_norm": 10.222087860107422, + "learning_rate": 1.9806829969102356e-05, + "loss": 0.6747, + "step": 3782 + }, + { + "epoch": 0.7568, + "grad_norm": 5.637979984283447, + "learning_rate": 1.9804089265795963e-05, + "loss": 0.3297, + "step": 3784 + }, + { + "epoch": 0.7572, + "grad_norm": 2.1841681003570557, + "learning_rate": 1.9801329448915863e-05, + "loss": 0.3533, + "step": 3786 + }, + { + "epoch": 0.7576, + "grad_norm": 7.452773094177246, + "learning_rate": 1.979855052384247e-05, + "loss": 0.609, + "step": 3788 + }, + { + "epoch": 0.758, + "grad_norm": 5.639261722564697, + "learning_rate": 1.979575249599344e-05, + "loss": 0.6254, + "step": 3790 + }, + { + "epoch": 0.7584, + "grad_norm": 2.031651735305786, + "learning_rate": 1.979293537082368e-05, + "loss": 0.3182, + "step": 3792 + }, + { + "epoch": 0.7588, + "grad_norm": 3.4318366050720215, + "learning_rate": 1.9790099153825303e-05, + "loss": 0.2961, + "step": 3794 + }, + { + "epoch": 0.7592, + "grad_norm": 1.8965623378753662, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.148, + "step": 3796 + }, + { + "epoch": 0.7596, + "grad_norm": 9.969818115234375, + "learning_rate": 1.978436946649733e-05, + "loss": 0.652, + "step": 3798 + }, + { + "epoch": 0.76, + "grad_norm": 4.673080921173096, + "learning_rate": 1.978147600733806e-05, + "loss": 0.247, + "step": 3800 + }, + { + "epoch": 0.7604, + "grad_norm": 2.2457077503204346, + "learning_rate": 1.9778563478690793e-05, + "loss": 0.2529, + "step": 3802 + }, + { + "epoch": 0.7608, + "grad_norm": 7.036073207855225, + "learning_rate": 1.977563188623365e-05, + "loss": 0.5194, + "step": 3804 + }, + { + "epoch": 0.7612, + "grad_norm": 1.138180136680603, + "learning_rate": 1.977268123568194e-05, + "loss": 0.3512, + "step": 3806 + }, + { + "epoch": 0.7616, + "grad_norm": 3.432999849319458, + "learning_rate": 1.9769711532788086e-05, + "loss": 0.4203, + "step": 3808 + }, + { + "epoch": 0.762, + "grad_norm": 12.060546875, + "learning_rate": 1.9766722783341682e-05, + "loss": 0.6253, + "step": 3810 + }, + { + "epoch": 0.7624, + "grad_norm": 2.926715612411499, + "learning_rate": 1.9763714993169448e-05, + "loss": 0.1548, + "step": 3812 + }, + { + "epoch": 0.7628, + "grad_norm": 5.090263843536377, + "learning_rate": 1.9760688168135236e-05, + "loss": 0.4434, + "step": 3814 + }, + { + "epoch": 0.7632, + "grad_norm": 11.497665405273438, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.8647, + "step": 3816 + }, + { + "epoch": 0.7636, + "grad_norm": 6.182353973388672, + "learning_rate": 1.9754577437121733e-05, + "loss": 0.5912, + "step": 3818 + }, + { + "epoch": 0.764, + "grad_norm": 3.498034715652466, + "learning_rate": 1.9751493543055638e-05, + "loss": 0.4536, + "step": 3820 + }, + { + "epoch": 0.7644, + "grad_norm": 3.2880859375, + "learning_rate": 1.974839063795389e-05, + "loss": 0.3415, + "step": 3822 + }, + { + "epoch": 0.7648, + "grad_norm": 4.104403018951416, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.4056, + "step": 3824 + }, + { + "epoch": 0.7652, + "grad_norm": 5.836455345153809, + "learning_rate": 1.97421278188776e-05, + "loss": 0.5306, + "step": 3826 + }, + { + "epoch": 0.7656, + "grad_norm": 9.475279808044434, + "learning_rate": 1.973896791711276e-05, + "loss": 0.3667, + "step": 3828 + }, + { + "epoch": 0.766, + "grad_norm": 8.245805740356445, + "learning_rate": 1.9735789028731607e-05, + "loss": 0.3357, + "step": 3830 + }, + { + "epoch": 0.7664, + "grad_norm": 4.55757474899292, + "learning_rate": 1.9732591159931567e-05, + "loss": 0.5725, + "step": 3832 + }, + { + "epoch": 0.7668, + "grad_norm": 3.2547616958618164, + "learning_rate": 1.9729374316947037e-05, + "loss": 0.3323, + "step": 3834 + }, + { + "epoch": 0.7672, + "grad_norm": 1.2054188251495361, + "learning_rate": 1.972613850604944e-05, + "loss": 0.4269, + "step": 3836 + }, + { + "epoch": 0.7676, + "grad_norm": 2.3890023231506348, + "learning_rate": 1.972288373354713e-05, + "loss": 0.2458, + "step": 3838 + }, + { + "epoch": 0.768, + "grad_norm": 2.707148551940918, + "learning_rate": 1.9719610005785463e-05, + "loss": 0.663, + "step": 3840 + }, + { + "epoch": 0.7684, + "grad_norm": 2.5808651447296143, + "learning_rate": 1.9716317329146743e-05, + "loss": 0.4532, + "step": 3842 + }, + { + "epoch": 0.7688, + "grad_norm": 1.5187207460403442, + "learning_rate": 1.9713005710050206e-05, + "loss": 0.4409, + "step": 3844 + }, + { + "epoch": 0.7692, + "grad_norm": 1.1869665384292603, + "learning_rate": 1.9709675154952017e-05, + "loss": 0.2544, + "step": 3846 + }, + { + "epoch": 0.7696, + "grad_norm": 2.169119358062744, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.3059, + "step": 3848 + }, + { + "epoch": 0.77, + "grad_norm": 2.764636278152466, + "learning_rate": 1.970295726275997e-05, + "loss": 0.3347, + "step": 3850 + }, + { + "epoch": 0.7704, + "grad_norm": 4.525669574737549, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.2257, + "step": 3852 + }, + { + "epoch": 0.7708, + "grad_norm": 4.901131629943848, + "learning_rate": 1.969616370495806e-05, + "loss": 0.2125, + "step": 3854 + }, + { + "epoch": 0.7712, + "grad_norm": 6.123663425445557, + "learning_rate": 1.969273856798586e-05, + "loss": 0.5386, + "step": 3856 + }, + { + "epoch": 0.7716, + "grad_norm": 3.0163817405700684, + "learning_rate": 1.9689294534523836e-05, + "loss": 0.4794, + "step": 3858 + }, + { + "epoch": 0.772, + "grad_norm": 20.098342895507812, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.6341, + "step": 3860 + }, + { + "epoch": 0.7724, + "grad_norm": 3.1360671520233154, + "learning_rate": 1.9682349805024447e-05, + "loss": 0.4233, + "step": 3862 + }, + { + "epoch": 0.7728, + "grad_norm": 2.4499082565307617, + "learning_rate": 1.9678849122526195e-05, + "loss": 0.3157, + "step": 3864 + }, + { + "epoch": 0.7732, + "grad_norm": 3.2070460319519043, + "learning_rate": 1.9675329570616302e-05, + "loss": 0.2332, + "step": 3866 + }, + { + "epoch": 0.7736, + "grad_norm": 4.213040828704834, + "learning_rate": 1.967179115615633e-05, + "loss": 0.4358, + "step": 3868 + }, + { + "epoch": 0.774, + "grad_norm": 3.797877311706543, + "learning_rate": 1.966823388604459e-05, + "loss": 0.4074, + "step": 3870 + }, + { + "epoch": 0.7744, + "grad_norm": 2.9903125762939453, + "learning_rate": 1.966465776721618e-05, + "loss": 0.364, + "step": 3872 + }, + { + "epoch": 0.7748, + "grad_norm": 11.233662605285645, + "learning_rate": 1.9661062806642906e-05, + "loss": 0.4356, + "step": 3874 + }, + { + "epoch": 0.7752, + "grad_norm": 8.974143981933594, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.4884, + "step": 3876 + }, + { + "epoch": 0.7756, + "grad_norm": 2.00252628326416, + "learning_rate": 1.9653816388332743e-05, + "loss": 0.2349, + "step": 3878 + }, + { + "epoch": 0.776, + "grad_norm": 3.519148349761963, + "learning_rate": 1.965016494472312e-05, + "loss": 0.3273, + "step": 3880 + }, + { + "epoch": 0.7764, + "grad_norm": 10.857224464416504, + "learning_rate": 1.964649468762313e-05, + "loss": 0.8828, + "step": 3882 + }, + { + "epoch": 0.7768, + "grad_norm": 11.514236450195312, + "learning_rate": 1.964280562418815e-05, + "loss": 0.1927, + "step": 3884 + }, + { + "epoch": 0.7772, + "grad_norm": 3.7441351413726807, + "learning_rate": 1.963909776161018e-05, + "loss": 0.3792, + "step": 3886 + }, + { + "epoch": 0.7776, + "grad_norm": 2.538505792617798, + "learning_rate": 1.963537110711789e-05, + "loss": 0.4715, + "step": 3888 + }, + { + "epoch": 0.778, + "grad_norm": 3.2941133975982666, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.4592, + "step": 3890 + }, + { + "epoch": 0.7784, + "grad_norm": 2.129331350326538, + "learning_rate": 1.9627861451488194e-05, + "loss": 0.4431, + "step": 3892 + }, + { + "epoch": 0.7788, + "grad_norm": 3.385977268218994, + "learning_rate": 1.9624078464991246e-05, + "loss": 0.2118, + "step": 3894 + }, + { + "epoch": 0.7792, + "grad_norm": 3.1219735145568848, + "learning_rate": 1.962027671586086e-05, + "loss": 0.4856, + "step": 3896 + }, + { + "epoch": 0.7796, + "grad_norm": 13.733538627624512, + "learning_rate": 1.9616456211508756e-05, + "loss": 1.0646, + "step": 3898 + }, + { + "epoch": 0.78, + "grad_norm": 1.4956645965576172, + "learning_rate": 1.9612616959383194e-05, + "loss": 0.2071, + "step": 3900 + }, + { + "epoch": 0.7804, + "grad_norm": 10.3798246383667, + "learning_rate": 1.9608758966968987e-05, + "loss": 0.3576, + "step": 3902 + }, + { + "epoch": 0.7808, + "grad_norm": 8.56035327911377, + "learning_rate": 1.96048822417875e-05, + "loss": 0.438, + "step": 3904 + }, + { + "epoch": 0.7812, + "grad_norm": 6.414236545562744, + "learning_rate": 1.9600986791396597e-05, + "loss": 0.2882, + "step": 3906 + }, + { + "epoch": 0.7816, + "grad_norm": 3.5547258853912354, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.3707, + "step": 3908 + }, + { + "epoch": 0.782, + "grad_norm": 3.1716747283935547, + "learning_rate": 1.9593139745400578e-05, + "loss": 0.265, + "step": 3910 + }, + { + "epoch": 0.7824, + "grad_norm": 5.865848541259766, + "learning_rate": 1.9589188165093666e-05, + "loss": 0.1687, + "step": 3912 + }, + { + "epoch": 0.7828, + "grad_norm": 1.749450445175171, + "learning_rate": 1.9585217890173765e-05, + "loss": 0.2712, + "step": 3914 + }, + { + "epoch": 0.7832, + "grad_norm": 3.4075429439544678, + "learning_rate": 1.95812289283811e-05, + "loss": 0.3103, + "step": 3916 + }, + { + "epoch": 0.7836, + "grad_norm": 4.608646869659424, + "learning_rate": 1.957722128749237e-05, + "loss": 0.439, + "step": 3918 + }, + { + "epoch": 0.784, + "grad_norm": 1.5062508583068848, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.2825, + "step": 3920 + }, + { + "epoch": 0.7844, + "grad_norm": 5.741063117980957, + "learning_rate": 1.9569149999715518e-05, + "loss": 0.2355, + "step": 3922 + }, + { + "epoch": 0.7848, + "grad_norm": 4.68813419342041, + "learning_rate": 1.9565086368562784e-05, + "loss": 0.4758, + "step": 3924 + }, + { + "epoch": 0.7852, + "grad_norm": 0.24742849171161652, + "learning_rate": 1.9561004089784722e-05, + "loss": 0.4573, + "step": 3926 + }, + { + "epoch": 0.7856, + "grad_norm": 4.631235122680664, + "learning_rate": 1.9556903171339966e-05, + "loss": 0.2166, + "step": 3928 + }, + { + "epoch": 0.786, + "grad_norm": 1.6393107175827026, + "learning_rate": 1.955278362122344e-05, + "loss": 0.3984, + "step": 3930 + }, + { + "epoch": 0.7864, + "grad_norm": 4.417569637298584, + "learning_rate": 1.954864544746643e-05, + "loss": 0.2986, + "step": 3932 + }, + { + "epoch": 0.7868, + "grad_norm": 5.4802656173706055, + "learning_rate": 1.954448865813652e-05, + "loss": 0.2738, + "step": 3934 + }, + { + "epoch": 0.7872, + "grad_norm": 1.7197883129119873, + "learning_rate": 1.9540313261337585e-05, + "loss": 0.2403, + "step": 3936 + }, + { + "epoch": 0.7876, + "grad_norm": 1.3640928268432617, + "learning_rate": 1.9536119265209763e-05, + "loss": 0.2404, + "step": 3938 + }, + { + "epoch": 0.788, + "grad_norm": 2.9482781887054443, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.3257, + "step": 3940 + }, + { + "epoch": 0.7884, + "grad_norm": 4.589252471923828, + "learning_rate": 1.9527675507709364e-05, + "loss": 0.3141, + "step": 3942 + }, + { + "epoch": 0.7888, + "grad_norm": 7.408719539642334, + "learning_rate": 1.9523425762798335e-05, + "loss": 0.5681, + "step": 3944 + }, + { + "epoch": 0.7892, + "grad_norm": 33.034671783447266, + "learning_rate": 1.9519157451481456e-05, + "loss": 1.0213, + "step": 3946 + }, + { + "epoch": 0.7896, + "grad_norm": 1.4736300706863403, + "learning_rate": 1.9514870582080035e-05, + "loss": 0.1803, + "step": 3948 + }, + { + "epoch": 0.79, + "grad_norm": 7.571887969970703, + "learning_rate": 1.9510565162951545e-05, + "loss": 0.4145, + "step": 3950 + }, + { + "epoch": 0.7904, + "grad_norm": 2.4621150493621826, + "learning_rate": 1.95062412024896e-05, + "loss": 0.4959, + "step": 3952 + }, + { + "epoch": 0.7908, + "grad_norm": 1.7257097959518433, + "learning_rate": 1.950189870912401e-05, + "loss": 0.1574, + "step": 3954 + }, + { + "epoch": 0.7912, + "grad_norm": 4.36741304397583, + "learning_rate": 1.9497537691320667e-05, + "loss": 0.29, + "step": 3956 + }, + { + "epoch": 0.7916, + "grad_norm": 2.0083720684051514, + "learning_rate": 1.9493158157581617e-05, + "loss": 0.2857, + "step": 3958 + }, + { + "epoch": 0.792, + "grad_norm": 4.739797592163086, + "learning_rate": 1.948876011644497e-05, + "loss": 0.6308, + "step": 3960 + }, + { + "epoch": 0.7924, + "grad_norm": 1.8881384134292603, + "learning_rate": 1.948434357648493e-05, + "loss": 0.2453, + "step": 3962 + }, + { + "epoch": 0.7928, + "grad_norm": 3.3089914321899414, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.3107, + "step": 3964 + }, + { + "epoch": 0.7932, + "grad_norm": 2.540524959564209, + "learning_rate": 1.9475455034571843e-05, + "loss": 0.2855, + "step": 3966 + }, + { + "epoch": 0.7936, + "grad_norm": 1.1607372760772705, + "learning_rate": 1.9470983049947443e-05, + "loss": 0.2668, + "step": 3968 + }, + { + "epoch": 0.794, + "grad_norm": 5.24981164932251, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.3998, + "step": 3970 + }, + { + "epoch": 0.7944, + "grad_norm": 10.619354248046875, + "learning_rate": 1.9461983696954767e-05, + "loss": 0.6584, + "step": 3972 + }, + { + "epoch": 0.7948, + "grad_norm": 6.727949142456055, + "learning_rate": 1.9457456346131175e-05, + "loss": 0.3469, + "step": 3974 + }, + { + "epoch": 0.7952, + "grad_norm": 1.6885584592819214, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.3145, + "step": 3976 + }, + { + "epoch": 0.7956, + "grad_norm": 4.528112411499023, + "learning_rate": 1.9448346339960984e-05, + "loss": 0.2941, + "step": 3978 + }, + { + "epoch": 0.796, + "grad_norm": 1.7852083444595337, + "learning_rate": 1.9443763702374818e-05, + "loss": 0.2405, + "step": 3980 + }, + { + "epoch": 0.7964, + "grad_norm": 5.426568031311035, + "learning_rate": 1.9439162653688066e-05, + "loss": 0.3092, + "step": 3982 + }, + { + "epoch": 0.7968, + "grad_norm": 5.442551136016846, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.2324, + "step": 3984 + }, + { + "epoch": 0.7972, + "grad_norm": 12.321950912475586, + "learning_rate": 1.9429905358928655e-05, + "loss": 0.784, + "step": 3986 + }, + { + "epoch": 0.7976, + "grad_norm": 7.544672966003418, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.7796, + "step": 3988 + }, + { + "epoch": 0.798, + "grad_norm": 12.172638893127441, + "learning_rate": 1.942057452787297e-05, + "loss": 0.6243, + "step": 3990 + }, + { + "epoch": 0.7984, + "grad_norm": 8.399867057800293, + "learning_rate": 1.94158815589503e-05, + "loss": 0.4439, + "step": 3992 + }, + { + "epoch": 0.7988, + "grad_norm": 7.227625370025635, + "learning_rate": 1.941117023328473e-05, + "loss": 0.3407, + "step": 3994 + }, + { + "epoch": 0.7992, + "grad_norm": 6.377566814422607, + "learning_rate": 1.940644056006122e-05, + "loss": 0.2549, + "step": 3996 + }, + { + "epoch": 0.7996, + "grad_norm": 8.156109809875488, + "learning_rate": 1.94016925485005e-05, + "loss": 0.5341, + "step": 3998 + }, + { + "epoch": 0.8, + "grad_norm": 2.652308702468872, + "learning_rate": 1.939692620785909e-05, + "loss": 0.6208, + "step": 4000 + }, + { + "epoch": 0.8004, + "grad_norm": 2.2983179092407227, + "learning_rate": 1.939214154742919e-05, + "loss": 0.2456, + "step": 4002 + }, + { + "epoch": 0.8008, + "grad_norm": 1.4927217960357666, + "learning_rate": 1.9387338576538746e-05, + "loss": 0.4467, + "step": 4004 + }, + { + "epoch": 0.8012, + "grad_norm": 4.699498653411865, + "learning_rate": 1.9382517304551393e-05, + "loss": 0.4244, + "step": 4006 + }, + { + "epoch": 0.8016, + "grad_norm": 3.309119462966919, + "learning_rate": 1.9377677740866464e-05, + "loss": 0.5708, + "step": 4008 + }, + { + "epoch": 0.802, + "grad_norm": 8.689139366149902, + "learning_rate": 1.9372819894918922e-05, + "loss": 0.4277, + "step": 4010 + }, + { + "epoch": 0.8024, + "grad_norm": 11.27013111114502, + "learning_rate": 1.936794377617938e-05, + "loss": 0.6934, + "step": 4012 + }, + { + "epoch": 0.8028, + "grad_norm": 2.5027077198028564, + "learning_rate": 1.9363049394154102e-05, + "loss": 0.3598, + "step": 4014 + }, + { + "epoch": 0.8032, + "grad_norm": 1.7788945436477661, + "learning_rate": 1.9358136758384917e-05, + "loss": 0.4796, + "step": 4016 + }, + { + "epoch": 0.8036, + "grad_norm": 6.176569938659668, + "learning_rate": 1.935320587844926e-05, + "loss": 0.72, + "step": 4018 + }, + { + "epoch": 0.804, + "grad_norm": 3.069485664367676, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.3386, + "step": 4020 + }, + { + "epoch": 0.8044, + "grad_norm": 1.4634151458740234, + "learning_rate": 1.934328942456613e-05, + "loss": 0.4018, + "step": 4022 + }, + { + "epoch": 0.8048, + "grad_norm": 1.8878329992294312, + "learning_rate": 1.9338303869951273e-05, + "loss": 0.3688, + "step": 4024 + }, + { + "epoch": 0.8052, + "grad_norm": 9.040563583374023, + "learning_rate": 1.9333300109835186e-05, + "loss": 0.3792, + "step": 4026 + }, + { + "epoch": 0.8056, + "grad_norm": 2.5843045711517334, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.3295, + "step": 4028 + }, + { + "epoch": 0.806, + "grad_norm": 1.6155383586883545, + "learning_rate": 1.9323238012155125e-05, + "loss": 0.341, + "step": 4030 + }, + { + "epoch": 0.8064, + "grad_norm": 1.9764881134033203, + "learning_rate": 1.931817969420773e-05, + "loss": 0.3515, + "step": 4032 + }, + { + "epoch": 0.8068, + "grad_norm": 10.706425666809082, + "learning_rate": 1.93131032099922e-05, + "loss": 0.5608, + "step": 4034 + }, + { + "epoch": 0.8072, + "grad_norm": 1.0829896926879883, + "learning_rate": 1.930800856940543e-05, + "loss": 0.1574, + "step": 4036 + }, + { + "epoch": 0.8076, + "grad_norm": 2.6864397525787354, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.631, + "step": 4038 + }, + { + "epoch": 0.808, + "grad_norm": 1.5163562297821045, + "learning_rate": 1.929776485888252e-05, + "loss": 0.2287, + "step": 4040 + }, + { + "epoch": 0.8084, + "grad_norm": 1.5338910818099976, + "learning_rate": 1.9292615808917024e-05, + "loss": 0.523, + "step": 4042 + }, + { + "epoch": 0.8088, + "grad_norm": 1.3858321905136108, + "learning_rate": 1.9287448642521517e-05, + "loss": 0.1359, + "step": 4044 + }, + { + "epoch": 0.8092, + "grad_norm": 2.9833850860595703, + "learning_rate": 1.9282263369769637e-05, + "loss": 0.2139, + "step": 4046 + }, + { + "epoch": 0.8096, + "grad_norm": 7.61302375793457, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.4933, + "step": 4048 + }, + { + "epoch": 0.81, + "grad_norm": 2.1343791484832764, + "learning_rate": 1.927183854566788e-05, + "loss": 0.2628, + "step": 4050 + }, + { + "epoch": 0.8104, + "grad_norm": 1.9606010913848877, + "learning_rate": 1.9266599014641727e-05, + "loss": 0.1969, + "step": 4052 + }, + { + "epoch": 0.8108, + "grad_norm": 4.938603401184082, + "learning_rate": 1.9261341417906622e-05, + "loss": 0.304, + "step": 4054 + }, + { + "epoch": 0.8112, + "grad_norm": 9.302064895629883, + "learning_rate": 1.925606576571252e-05, + "loss": 0.5421, + "step": 4056 + }, + { + "epoch": 0.8116, + "grad_norm": 3.3194053173065186, + "learning_rate": 1.925077206834459e-05, + "loss": 0.3219, + "step": 4058 + }, + { + "epoch": 0.812, + "grad_norm": 2.829890012741089, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.332, + "step": 4060 + }, + { + "epoch": 0.8124, + "grad_norm": 6.9356818199157715, + "learning_rate": 1.924013057940367e-05, + "loss": 0.4282, + "step": 4062 + }, + { + "epoch": 0.8128, + "grad_norm": 1.66714346408844, + "learning_rate": 1.923478280857682e-05, + "loss": 0.3499, + "step": 4064 + }, + { + "epoch": 0.8132, + "grad_norm": 3.117354393005371, + "learning_rate": 1.922941703406836e-05, + "loss": 0.2658, + "step": 4066 + }, + { + "epoch": 0.8136, + "grad_norm": 1.9867832660675049, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.2345, + "step": 4068 + }, + { + "epoch": 0.814, + "grad_norm": 4.254197120666504, + "learning_rate": 1.9218631515885007e-05, + "loss": 0.5201, + "step": 4070 + }, + { + "epoch": 0.8144, + "grad_norm": 4.087887287139893, + "learning_rate": 1.9213211793237066e-05, + "loss": 0.2772, + "step": 4072 + }, + { + "epoch": 0.8148, + "grad_norm": 1.1827986240386963, + "learning_rate": 1.9207774108961276e-05, + "loss": 0.1777, + "step": 4074 + }, + { + "epoch": 0.8152, + "grad_norm": 4.625268459320068, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.3109, + "step": 4076 + }, + { + "epoch": 0.8156, + "grad_norm": 5.721567630767822, + "learning_rate": 1.9196844897965387e-05, + "loss": 0.2615, + "step": 4078 + }, + { + "epoch": 0.816, + "grad_norm": 2.384298086166382, + "learning_rate": 1.919135339255235e-05, + "loss": 0.2371, + "step": 4080 + }, + { + "epoch": 0.8164, + "grad_norm": 7.0730156898498535, + "learning_rate": 1.9185843968125546e-05, + "loss": 0.3682, + "step": 4082 + }, + { + "epoch": 0.8168, + "grad_norm": 5.349207401275635, + "learning_rate": 1.918031663542588e-05, + "loss": 0.2545, + "step": 4084 + }, + { + "epoch": 0.8172, + "grad_norm": 10.080778121948242, + "learning_rate": 1.917477140522919e-05, + "loss": 0.5485, + "step": 4086 + }, + { + "epoch": 0.8176, + "grad_norm": 14.694539070129395, + "learning_rate": 1.916920828834617e-05, + "loss": 0.7391, + "step": 4088 + }, + { + "epoch": 0.818, + "grad_norm": 1.8028115034103394, + "learning_rate": 1.9163627295622394e-05, + "loss": 0.2169, + "step": 4090 + }, + { + "epoch": 0.8184, + "grad_norm": 2.636319398880005, + "learning_rate": 1.9158028437938313e-05, + "loss": 0.3996, + "step": 4092 + }, + { + "epoch": 0.8188, + "grad_norm": 1.5989443063735962, + "learning_rate": 1.9152411726209183e-05, + "loss": 0.2616, + "step": 4094 + }, + { + "epoch": 0.8192, + "grad_norm": 4.73516321182251, + "learning_rate": 1.9146777171385057e-05, + "loss": 0.2707, + "step": 4096 + }, + { + "epoch": 0.8196, + "grad_norm": 2.9950292110443115, + "learning_rate": 1.914112478445079e-05, + "loss": 0.2264, + "step": 4098 + }, + { + "epoch": 0.82, + "grad_norm": 3.452821969985962, + "learning_rate": 1.913545457642601e-05, + "loss": 0.4455, + "step": 4100 + }, + { + "epoch": 0.8204, + "grad_norm": 3.4266245365142822, + "learning_rate": 1.9129766558365082e-05, + "loss": 0.5735, + "step": 4102 + }, + { + "epoch": 0.8208, + "grad_norm": 2.886775493621826, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.3541, + "step": 4104 + }, + { + "epoch": 0.8212, + "grad_norm": 5.419414043426514, + "learning_rate": 1.911833713652576e-05, + "loss": 0.8759, + "step": 4106 + }, + { + "epoch": 0.8216, + "grad_norm": 2.733752727508545, + "learning_rate": 1.911259575502963e-05, + "loss": 0.2497, + "step": 4108 + }, + { + "epoch": 0.822, + "grad_norm": 4.684752464294434, + "learning_rate": 1.9106836608061775e-05, + "loss": 0.5834, + "step": 4110 + }, + { + "epoch": 0.8224, + "grad_norm": 2.5550808906555176, + "learning_rate": 1.910105970684996e-05, + "loss": 0.1357, + "step": 4112 + }, + { + "epoch": 0.8228, + "grad_norm": 4.106790065765381, + "learning_rate": 1.909526506265654e-05, + "loss": 0.5376, + "step": 4114 + }, + { + "epoch": 0.8232, + "grad_norm": 4.999304294586182, + "learning_rate": 1.908945268677849e-05, + "loss": 0.2655, + "step": 4116 + }, + { + "epoch": 0.8236, + "grad_norm": 8.464347839355469, + "learning_rate": 1.9083622590547313e-05, + "loss": 0.4936, + "step": 4118 + }, + { + "epoch": 0.824, + "grad_norm": 2.0113227367401123, + "learning_rate": 1.9077774785329085e-05, + "loss": 0.3866, + "step": 4120 + }, + { + "epoch": 0.8244, + "grad_norm": 11.419163703918457, + "learning_rate": 1.9071909282524422e-05, + "loss": 0.927, + "step": 4122 + }, + { + "epoch": 0.8248, + "grad_norm": 4.762845039367676, + "learning_rate": 1.9066026093568383e-05, + "loss": 0.2925, + "step": 4124 + }, + { + "epoch": 0.8252, + "grad_norm": 12.999631881713867, + "learning_rate": 1.9060125229930576e-05, + "loss": 0.8455, + "step": 4126 + }, + { + "epoch": 0.8256, + "grad_norm": 3.4509975910186768, + "learning_rate": 1.9054206703115013e-05, + "loss": 0.2379, + "step": 4128 + }, + { + "epoch": 0.826, + "grad_norm": 3.540321111679077, + "learning_rate": 1.9048270524660203e-05, + "loss": 0.2553, + "step": 4130 + }, + { + "epoch": 0.8264, + "grad_norm": 3.03538179397583, + "learning_rate": 1.9042316706138994e-05, + "loss": 0.1713, + "step": 4132 + }, + { + "epoch": 0.8268, + "grad_norm": 2.547511100769043, + "learning_rate": 1.9036345259158664e-05, + "loss": 0.3671, + "step": 4134 + }, + { + "epoch": 0.8272, + "grad_norm": 4.7643208503723145, + "learning_rate": 1.903035619536087e-05, + "loss": 0.5904, + "step": 4136 + }, + { + "epoch": 0.8276, + "grad_norm": 4.79727840423584, + "learning_rate": 1.9024349526421603e-05, + "loss": 0.2696, + "step": 4138 + }, + { + "epoch": 0.828, + "grad_norm": 5.0795063972473145, + "learning_rate": 1.901832526405114e-05, + "loss": 0.5994, + "step": 4140 + }, + { + "epoch": 0.8284, + "grad_norm": 2.628000020980835, + "learning_rate": 1.9012283419994112e-05, + "loss": 0.4272, + "step": 4142 + }, + { + "epoch": 0.8288, + "grad_norm": 1.5859235525131226, + "learning_rate": 1.9006224006029414e-05, + "loss": 0.1091, + "step": 4144 + }, + { + "epoch": 0.8292, + "grad_norm": 2.963596820831299, + "learning_rate": 1.9000147033970148e-05, + "loss": 0.3937, + "step": 4146 + }, + { + "epoch": 0.8296, + "grad_norm": 4.0517964363098145, + "learning_rate": 1.899405251566371e-05, + "loss": 0.2481, + "step": 4148 + }, + { + "epoch": 0.83, + "grad_norm": 2.354654312133789, + "learning_rate": 1.8987940462991666e-05, + "loss": 0.2177, + "step": 4150 + }, + { + "epoch": 0.8304, + "grad_norm": 9.29647159576416, + "learning_rate": 1.8981810887869797e-05, + "loss": 0.8418, + "step": 4152 + }, + { + "epoch": 0.8308, + "grad_norm": 2.8416950702667236, + "learning_rate": 1.8975663802247978e-05, + "loss": 0.3237, + "step": 4154 + }, + { + "epoch": 0.8312, + "grad_norm": 3.5609192848205566, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.5087, + "step": 4156 + }, + { + "epoch": 0.8316, + "grad_norm": 4.130580425262451, + "learning_rate": 1.8963317147474943e-05, + "loss": 0.2681, + "step": 4158 + }, + { + "epoch": 0.832, + "grad_norm": 11.649012565612793, + "learning_rate": 1.8957117602394133e-05, + "loss": 0.5739, + "step": 4160 + }, + { + "epoch": 0.8324, + "grad_norm": 6.1454596519470215, + "learning_rate": 1.8950900594954233e-05, + "loss": 0.3512, + "step": 4162 + }, + { + "epoch": 0.8328, + "grad_norm": 1.7074997425079346, + "learning_rate": 1.8944666137275596e-05, + "loss": 0.4413, + "step": 4164 + }, + { + "epoch": 0.8332, + "grad_norm": 2.1260640621185303, + "learning_rate": 1.8938414241512644e-05, + "loss": 0.2872, + "step": 4166 + }, + { + "epoch": 0.8336, + "grad_norm": 2.110886335372925, + "learning_rate": 1.8932144919853744e-05, + "loss": 0.3842, + "step": 4168 + }, + { + "epoch": 0.834, + "grad_norm": 5.23813533782959, + "learning_rate": 1.892585818452125e-05, + "loss": 0.4551, + "step": 4170 + }, + { + "epoch": 0.8344, + "grad_norm": 2.9473977088928223, + "learning_rate": 1.8919554047771508e-05, + "loss": 1.1353, + "step": 4172 + }, + { + "epoch": 0.8348, + "grad_norm": 3.4421825408935547, + "learning_rate": 1.891323252189474e-05, + "loss": 0.1393, + "step": 4174 + }, + { + "epoch": 0.8352, + "grad_norm": 2.8630332946777344, + "learning_rate": 1.890689361921507e-05, + "loss": 0.2975, + "step": 4176 + }, + { + "epoch": 0.8356, + "grad_norm": 3.088081121444702, + "learning_rate": 1.8900537352090523e-05, + "loss": 0.4268, + "step": 4178 + }, + { + "epoch": 0.836, + "grad_norm": 7.006877422332764, + "learning_rate": 1.8894163732912986e-05, + "loss": 0.5325, + "step": 4180 + }, + { + "epoch": 0.8364, + "grad_norm": 3.4122326374053955, + "learning_rate": 1.8887772774108122e-05, + "loss": 0.3825, + "step": 4182 + }, + { + "epoch": 0.8368, + "grad_norm": 6.989340305328369, + "learning_rate": 1.8881364488135445e-05, + "loss": 0.4974, + "step": 4184 + }, + { + "epoch": 0.8372, + "grad_norm": 3.2779598236083984, + "learning_rate": 1.887493888748825e-05, + "loss": 0.3572, + "step": 4186 + }, + { + "epoch": 0.8376, + "grad_norm": 5.9523844718933105, + "learning_rate": 1.886849598469357e-05, + "loss": 0.4276, + "step": 4188 + }, + { + "epoch": 0.838, + "grad_norm": 9.182242393493652, + "learning_rate": 1.886203579231215e-05, + "loss": 0.469, + "step": 4190 + }, + { + "epoch": 0.8384, + "grad_norm": 6.946074485778809, + "learning_rate": 1.8855558322938492e-05, + "loss": 0.7692, + "step": 4192 + }, + { + "epoch": 0.8388, + "grad_norm": 2.1147496700286865, + "learning_rate": 1.8849063589200754e-05, + "loss": 0.2462, + "step": 4194 + }, + { + "epoch": 0.8392, + "grad_norm": 11.205409049987793, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.8807, + "step": 4196 + }, + { + "epoch": 0.8396, + "grad_norm": 3.825007438659668, + "learning_rate": 1.8836022379313884e-05, + "loss": 0.6953, + "step": 4198 + }, + { + "epoch": 0.84, + "grad_norm": 1.2823295593261719, + "learning_rate": 1.8829475928589265e-05, + "loss": 0.2749, + "step": 4200 + }, + { + "epoch": 0.8404, + "grad_norm": 4.477908134460449, + "learning_rate": 1.882291226434954e-05, + "loss": 0.23, + "step": 4202 + }, + { + "epoch": 0.8408, + "grad_norm": 4.006918430328369, + "learning_rate": 1.8816331399390874e-05, + "loss": 0.9975, + "step": 4204 + }, + { + "epoch": 0.8412, + "grad_norm": 0.8389751315116882, + "learning_rate": 1.880973334654301e-05, + "loss": 0.252, + "step": 4206 + }, + { + "epoch": 0.8416, + "grad_norm": 3.574333667755127, + "learning_rate": 1.88031181186692e-05, + "loss": 0.2721, + "step": 4208 + }, + { + "epoch": 0.842, + "grad_norm": 1.4027825593948364, + "learning_rate": 1.8796485728666172e-05, + "loss": 0.1971, + "step": 4210 + }, + { + "epoch": 0.8424, + "grad_norm": 1.6746795177459717, + "learning_rate": 1.8789836189464092e-05, + "loss": 0.2338, + "step": 4212 + }, + { + "epoch": 0.8428, + "grad_norm": 2.268486976623535, + "learning_rate": 1.8783169514026574e-05, + "loss": 0.3072, + "step": 4214 + }, + { + "epoch": 0.8432, + "grad_norm": 2.830291509628296, + "learning_rate": 1.877648571535068e-05, + "loss": 0.3567, + "step": 4216 + }, + { + "epoch": 0.8436, + "grad_norm": 1.9760125875473022, + "learning_rate": 1.8769784806466775e-05, + "loss": 0.306, + "step": 4218 + }, + { + "epoch": 0.844, + "grad_norm": 2.0860068798065186, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.4452, + "step": 4220 + }, + { + "epoch": 0.8444, + "grad_norm": 2.2082886695861816, + "learning_rate": 1.8756331710363375e-05, + "loss": 0.238, + "step": 4222 + }, + { + "epoch": 0.8448, + "grad_norm": 1.2872251272201538, + "learning_rate": 1.8749579549371387e-05, + "loss": 0.2005, + "step": 4224 + }, + { + "epoch": 0.8452, + "grad_norm": 4.002313613891602, + "learning_rate": 1.8742810330626338e-05, + "loss": 0.3953, + "step": 4226 + }, + { + "epoch": 0.8456, + "grad_norm": 1.5394119024276733, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.3341, + "step": 4228 + }, + { + "epoch": 0.846, + "grad_norm": 5.3695902824401855, + "learning_rate": 1.8729220772698106e-05, + "loss": 0.4318, + "step": 4230 + }, + { + "epoch": 0.8464, + "grad_norm": 2.6792778968811035, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.2294, + "step": 4232 + }, + { + "epoch": 0.8468, + "grad_norm": 2.3202364444732666, + "learning_rate": 1.8715563142552758e-05, + "loss": 0.1461, + "step": 4234 + }, + { + "epoch": 0.8472, + "grad_norm": 2.4079995155334473, + "learning_rate": 1.8708708833660748e-05, + "loss": 0.3011, + "step": 4236 + }, + { + "epoch": 0.8476, + "grad_norm": 2.3885324001312256, + "learning_rate": 1.870183754669526e-05, + "loss": 0.4542, + "step": 4238 + }, + { + "epoch": 0.848, + "grad_norm": 6.010551929473877, + "learning_rate": 1.8694949295052198e-05, + "loss": 0.4043, + "step": 4240 + }, + { + "epoch": 0.8484, + "grad_norm": 2.448598861694336, + "learning_rate": 1.8688044092160558e-05, + "loss": 0.0764, + "step": 4242 + }, + { + "epoch": 0.8488, + "grad_norm": 4.760672569274902, + "learning_rate": 1.868112195148239e-05, + "loss": 0.354, + "step": 4244 + }, + { + "epoch": 0.8492, + "grad_norm": 11.941316604614258, + "learning_rate": 1.867418288651278e-05, + "loss": 0.393, + "step": 4246 + }, + { + "epoch": 0.8496, + "grad_norm": 4.400118827819824, + "learning_rate": 1.866722691077977e-05, + "loss": 0.2418, + "step": 4248 + }, + { + "epoch": 0.85, + "grad_norm": 3.2601258754730225, + "learning_rate": 1.8660254037844384e-05, + "loss": 0.1869, + "step": 4250 + }, + { + "epoch": 0.8504, + "grad_norm": 1.2703267335891724, + "learning_rate": 1.8653264281300626e-05, + "loss": 0.1646, + "step": 4252 + }, + { + "epoch": 0.8508, + "grad_norm": 7.0376667976379395, + "learning_rate": 1.8646257654775357e-05, + "loss": 0.1894, + "step": 4254 + }, + { + "epoch": 0.8512, + "grad_norm": 1.5702110528945923, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.1787, + "step": 4256 + }, + { + "epoch": 0.8516, + "grad_norm": 2.294933795928955, + "learning_rate": 1.8632193846452267e-05, + "loss": 0.2579, + "step": 4258 + }, + { + "epoch": 0.852, + "grad_norm": 1.3548839092254639, + "learning_rate": 1.8625136692072587e-05, + "loss": 0.2962, + "step": 4260 + }, + { + "epoch": 0.8524, + "grad_norm": 1.1331373453140259, + "learning_rate": 1.861806272254755e-05, + "loss": 0.2342, + "step": 4262 + }, + { + "epoch": 0.8528, + "grad_norm": 3.423243761062622, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.2344, + "step": 4264 + }, + { + "epoch": 0.8532, + "grad_norm": 3.6871814727783203, + "learning_rate": 1.8603864393258547e-05, + "loss": 0.1116, + "step": 4266 + }, + { + "epoch": 0.8536, + "grad_norm": 16.884803771972656, + "learning_rate": 1.8596740061174912e-05, + "loss": 0.4777, + "step": 4268 + }, + { + "epoch": 0.854, + "grad_norm": 0.9385944604873657, + "learning_rate": 1.8589598969306646e-05, + "loss": 0.192, + "step": 4270 + }, + { + "epoch": 0.8544, + "grad_norm": 7.652040481567383, + "learning_rate": 1.858244113157566e-05, + "loss": 0.4098, + "step": 4272 + }, + { + "epoch": 0.8548, + "grad_norm": 6.315662384033203, + "learning_rate": 1.8575266561936533e-05, + "loss": 0.3683, + "step": 4274 + }, + { + "epoch": 0.8552, + "grad_norm": 2.6001853942871094, + "learning_rate": 1.8568075274376432e-05, + "loss": 0.2651, + "step": 4276 + }, + { + "epoch": 0.8556, + "grad_norm": 12.950417518615723, + "learning_rate": 1.8560867282915164e-05, + "loss": 0.9064, + "step": 4278 + }, + { + "epoch": 0.856, + "grad_norm": 0.10136278718709946, + "learning_rate": 1.8553642601605083e-05, + "loss": 0.2893, + "step": 4280 + }, + { + "epoch": 0.8564, + "grad_norm": 2.4272780418395996, + "learning_rate": 1.8546401244531034e-05, + "loss": 0.3203, + "step": 4282 + }, + { + "epoch": 0.8568, + "grad_norm": 4.110124588012695, + "learning_rate": 1.8539143225810457e-05, + "loss": 0.4119, + "step": 4284 + }, + { + "epoch": 0.8572, + "grad_norm": 7.609652042388916, + "learning_rate": 1.85318685595932e-05, + "loss": 0.3072, + "step": 4286 + }, + { + "epoch": 0.8576, + "grad_norm": 22.142013549804688, + "learning_rate": 1.852457726006163e-05, + "loss": 0.5402, + "step": 4288 + }, + { + "epoch": 0.858, + "grad_norm": 19.05792236328125, + "learning_rate": 1.8517269341430485e-05, + "loss": 1.1967, + "step": 4290 + }, + { + "epoch": 0.8584, + "grad_norm": 1.1547534465789795, + "learning_rate": 1.8509944817946917e-05, + "loss": 0.3164, + "step": 4292 + }, + { + "epoch": 0.8588, + "grad_norm": 4.138175010681152, + "learning_rate": 1.8502603703890484e-05, + "loss": 0.1456, + "step": 4294 + }, + { + "epoch": 0.8592, + "grad_norm": 2.0759992599487305, + "learning_rate": 1.8495246013573064e-05, + "loss": 0.4131, + "step": 4296 + }, + { + "epoch": 0.8596, + "grad_norm": 8.61621379852295, + "learning_rate": 1.8487871761338817e-05, + "loss": 0.4004, + "step": 4298 + }, + { + "epoch": 0.86, + "grad_norm": 4.338650226593018, + "learning_rate": 1.848048096156426e-05, + "loss": 0.5552, + "step": 4300 + }, + { + "epoch": 0.8604, + "grad_norm": 5.476207733154297, + "learning_rate": 1.847307362865813e-05, + "loss": 0.4504, + "step": 4302 + }, + { + "epoch": 0.8608, + "grad_norm": 17.284706115722656, + "learning_rate": 1.8465649777061387e-05, + "loss": 0.6266, + "step": 4304 + }, + { + "epoch": 0.8612, + "grad_norm": 11.268281936645508, + "learning_rate": 1.8458209421247208e-05, + "loss": 0.2556, + "step": 4306 + }, + { + "epoch": 0.8616, + "grad_norm": 12.231380462646484, + "learning_rate": 1.8450752575720967e-05, + "loss": 0.9611, + "step": 4308 + }, + { + "epoch": 0.862, + "grad_norm": 2.6861495971679688, + "learning_rate": 1.8443279255020163e-05, + "loss": 1.0975, + "step": 4310 + }, + { + "epoch": 0.8624, + "grad_norm": 11.823570251464844, + "learning_rate": 1.843578947371439e-05, + "loss": 0.3927, + "step": 4312 + }, + { + "epoch": 0.8628, + "grad_norm": 9.269819259643555, + "learning_rate": 1.842828324640539e-05, + "loss": 0.8062, + "step": 4314 + }, + { + "epoch": 0.8632, + "grad_norm": 1.1661649942398071, + "learning_rate": 1.8420760587726935e-05, + "loss": 0.1312, + "step": 4316 + }, + { + "epoch": 0.8636, + "grad_norm": 5.803460121154785, + "learning_rate": 1.8413221512344808e-05, + "loss": 0.3561, + "step": 4318 + }, + { + "epoch": 0.864, + "grad_norm": 1.8872052431106567, + "learning_rate": 1.8405666034956846e-05, + "loss": 0.3873, + "step": 4320 + }, + { + "epoch": 0.8644, + "grad_norm": 3.8554446697235107, + "learning_rate": 1.8398094170292826e-05, + "loss": 0.2269, + "step": 4322 + }, + { + "epoch": 0.8648, + "grad_norm": 5.1440935134887695, + "learning_rate": 1.8390505933114507e-05, + "loss": 0.5226, + "step": 4324 + }, + { + "epoch": 0.8652, + "grad_norm": 1.8622969388961792, + "learning_rate": 1.838290133821552e-05, + "loss": 0.4364, + "step": 4326 + }, + { + "epoch": 0.8656, + "grad_norm": 3.772916793823242, + "learning_rate": 1.8375280400421414e-05, + "loss": 0.4816, + "step": 4328 + }, + { + "epoch": 0.866, + "grad_norm": 12.334714889526367, + "learning_rate": 1.8367643134589613e-05, + "loss": 0.9745, + "step": 4330 + }, + { + "epoch": 0.8664, + "grad_norm": 9.228984832763672, + "learning_rate": 1.8359989555609365e-05, + "loss": 0.648, + "step": 4332 + }, + { + "epoch": 0.8668, + "grad_norm": 2.6707513332366943, + "learning_rate": 1.835231967840168e-05, + "loss": 0.2198, + "step": 4334 + }, + { + "epoch": 0.8672, + "grad_norm": 2.528926372528076, + "learning_rate": 1.834463351791939e-05, + "loss": 0.2287, + "step": 4336 + }, + { + "epoch": 0.8676, + "grad_norm": 6.585297584533691, + "learning_rate": 1.8336931089147082e-05, + "loss": 0.5252, + "step": 4338 + }, + { + "epoch": 0.868, + "grad_norm": 4.047630786895752, + "learning_rate": 1.8329212407101006e-05, + "loss": 0.6047, + "step": 4340 + }, + { + "epoch": 0.8684, + "grad_norm": 1.9104069471359253, + "learning_rate": 1.8321477486829128e-05, + "loss": 0.4009, + "step": 4342 + }, + { + "epoch": 0.8688, + "grad_norm": 5.508389472961426, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.4419, + "step": 4344 + }, + { + "epoch": 0.8692, + "grad_norm": 1.841713786125183, + "learning_rate": 1.8305958991958135e-05, + "loss": 0.2215, + "step": 4346 + }, + { + "epoch": 0.8696, + "grad_norm": 5.603817939758301, + "learning_rate": 1.82981754476131e-05, + "loss": 0.3817, + "step": 4348 + }, + { + "epoch": 0.87, + "grad_norm": 3.016847848892212, + "learning_rate": 1.8290375725550417e-05, + "loss": 0.1682, + "step": 4350 + }, + { + "epoch": 0.8704, + "grad_norm": 3.990354537963867, + "learning_rate": 1.8282559840976053e-05, + "loss": 0.2331, + "step": 4352 + }, + { + "epoch": 0.8708, + "grad_norm": 5.375119686126709, + "learning_rate": 1.827472780912744e-05, + "loss": 0.2508, + "step": 4354 + }, + { + "epoch": 0.8712, + "grad_norm": 1.0237065553665161, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.376, + "step": 4356 + }, + { + "epoch": 0.8716, + "grad_norm": 1.8769701719284058, + "learning_rate": 1.825901536471478e-05, + "loss": 0.3245, + "step": 4358 + }, + { + "epoch": 0.872, + "grad_norm": 1.263677954673767, + "learning_rate": 1.8251134982782966e-05, + "loss": 0.1596, + "step": 4360 + }, + { + "epoch": 0.8724, + "grad_norm": 2.313735246658325, + "learning_rate": 1.824323851484126e-05, + "loss": 0.4136, + "step": 4362 + }, + { + "epoch": 0.8728, + "grad_norm": 14.429132461547852, + "learning_rate": 1.823532597628428e-05, + "loss": 0.3782, + "step": 4364 + }, + { + "epoch": 0.8732, + "grad_norm": 1.6723076105117798, + "learning_rate": 1.8227397382537893e-05, + "loss": 0.1194, + "step": 4366 + }, + { + "epoch": 0.8736, + "grad_norm": 5.516616344451904, + "learning_rate": 1.8219452749059336e-05, + "loss": 0.3768, + "step": 4368 + }, + { + "epoch": 0.874, + "grad_norm": 4.087747573852539, + "learning_rate": 1.8211492091337048e-05, + "loss": 0.2535, + "step": 4370 + }, + { + "epoch": 0.8744, + "grad_norm": 2.306649923324585, + "learning_rate": 1.8203515424890734e-05, + "loss": 0.2176, + "step": 4372 + }, + { + "epoch": 0.8748, + "grad_norm": 8.94923210144043, + "learning_rate": 1.8195522765271346e-05, + "loss": 0.3139, + "step": 4374 + }, + { + "epoch": 0.8752, + "grad_norm": 2.70224666595459, + "learning_rate": 1.8187514128060956e-05, + "loss": 0.5872, + "step": 4376 + }, + { + "epoch": 0.8756, + "grad_norm": 2.474479913711548, + "learning_rate": 1.8179489528872804e-05, + "loss": 0.2387, + "step": 4378 + }, + { + "epoch": 0.876, + "grad_norm": 2.1124720573425293, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.3298, + "step": 4380 + }, + { + "epoch": 0.8764, + "grad_norm": 22.254501342773438, + "learning_rate": 1.816339250717185e-05, + "loss": 0.7989, + "step": 4382 + }, + { + "epoch": 0.8768, + "grad_norm": 5.729538917541504, + "learning_rate": 1.8155320116040983e-05, + "loss": 0.3077, + "step": 4384 + }, + { + "epoch": 0.8772, + "grad_norm": 1.7873680591583252, + "learning_rate": 1.814723182569625e-05, + "loss": 0.2685, + "step": 4386 + }, + { + "epoch": 0.8776, + "grad_norm": 2.574246883392334, + "learning_rate": 1.8139127651906193e-05, + "loss": 0.4541, + "step": 4388 + }, + { + "epoch": 0.878, + "grad_norm": 4.285857677459717, + "learning_rate": 1.813100761047029e-05, + "loss": 0.3049, + "step": 4390 + }, + { + "epoch": 0.8784, + "grad_norm": 5.481351375579834, + "learning_rate": 1.8122871717218974e-05, + "loss": 0.5225, + "step": 4392 + }, + { + "epoch": 0.8788, + "grad_norm": 1.9345036745071411, + "learning_rate": 1.8114719988013612e-05, + "loss": 0.2425, + "step": 4394 + }, + { + "epoch": 0.8792, + "grad_norm": 0.7591046690940857, + "learning_rate": 1.8106552438746413e-05, + "loss": 0.0543, + "step": 4396 + }, + { + "epoch": 0.8796, + "grad_norm": 5.8663482666015625, + "learning_rate": 1.8098369085340404e-05, + "loss": 0.4028, + "step": 4398 + }, + { + "epoch": 0.88, + "grad_norm": 7.147708892822266, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.2534, + "step": 4400 + }, + { + "epoch": 0.8804, + "grad_norm": 1.3403339385986328, + "learning_rate": 1.8081955029958265e-05, + "loss": 0.3613, + "step": 4402 + }, + { + "epoch": 0.8808, + "grad_norm": 3.0721960067749023, + "learning_rate": 1.807372435998219e-05, + "loss": 0.2311, + "step": 4404 + }, + { + "epoch": 0.8812, + "grad_norm": 7.93109655380249, + "learning_rate": 1.806547794986733e-05, + "loss": 0.329, + "step": 4406 + }, + { + "epoch": 0.8816, + "grad_norm": 5.136090278625488, + "learning_rate": 1.8057215815690487e-05, + "loss": 0.3196, + "step": 4408 + }, + { + "epoch": 0.882, + "grad_norm": 5.830309867858887, + "learning_rate": 1.8048937973559148e-05, + "loss": 0.2975, + "step": 4410 + }, + { + "epoch": 0.8824, + "grad_norm": 6.0937089920043945, + "learning_rate": 1.8040644439611355e-05, + "loss": 0.3363, + "step": 4412 + }, + { + "epoch": 0.8828, + "grad_norm": 2.768066167831421, + "learning_rate": 1.8032335230015777e-05, + "loss": 0.2583, + "step": 4414 + }, + { + "epoch": 0.8832, + "grad_norm": 5.795217514038086, + "learning_rate": 1.8024010360971665e-05, + "loss": 0.257, + "step": 4416 + }, + { + "epoch": 0.8836, + "grad_norm": 2.7633285522460938, + "learning_rate": 1.8015669848708774e-05, + "loss": 0.3406, + "step": 4418 + }, + { + "epoch": 0.884, + "grad_norm": 2.2546586990356445, + "learning_rate": 1.8007313709487345e-05, + "loss": 0.3426, + "step": 4420 + }, + { + "epoch": 0.8844, + "grad_norm": 2.3054423332214355, + "learning_rate": 1.7998941959598093e-05, + "loss": 0.2948, + "step": 4422 + }, + { + "epoch": 0.8848, + "grad_norm": 3.379559278488159, + "learning_rate": 1.7990554615362207e-05, + "loss": 0.7341, + "step": 4424 + }, + { + "epoch": 0.8852, + "grad_norm": 8.581156730651855, + "learning_rate": 1.7982151693131213e-05, + "loss": 0.3978, + "step": 4426 + }, + { + "epoch": 0.8856, + "grad_norm": 5.534764289855957, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.7321, + "step": 4428 + }, + { + "epoch": 0.886, + "grad_norm": 4.709218502044678, + "learning_rate": 1.7965299180241963e-05, + "loss": 0.1536, + "step": 4430 + }, + { + "epoch": 0.8864, + "grad_norm": 5.620081901550293, + "learning_rate": 1.7956849622438568e-05, + "loss": 0.2383, + "step": 4432 + }, + { + "epoch": 0.8868, + "grad_norm": 3.5025880336761475, + "learning_rate": 1.794838455234966e-05, + "loss": 0.5966, + "step": 4434 + }, + { + "epoch": 0.8872, + "grad_norm": 16.83366584777832, + "learning_rate": 1.7939903986478357e-05, + "loss": 0.8883, + "step": 4436 + }, + { + "epoch": 0.8876, + "grad_norm": 2.7590324878692627, + "learning_rate": 1.7931407941357938e-05, + "loss": 0.1444, + "step": 4438 + }, + { + "epoch": 0.888, + "grad_norm": 3.1707308292388916, + "learning_rate": 1.7922896433551913e-05, + "loss": 0.3912, + "step": 4440 + }, + { + "epoch": 0.8884, + "grad_norm": 4.7320380210876465, + "learning_rate": 1.7914369479653864e-05, + "loss": 0.4914, + "step": 4442 + }, + { + "epoch": 0.8888, + "grad_norm": 2.7938292026519775, + "learning_rate": 1.7905827096287525e-05, + "loss": 0.3332, + "step": 4444 + }, + { + "epoch": 0.8892, + "grad_norm": 6.976980209350586, + "learning_rate": 1.7897269300106752e-05, + "loss": 0.4535, + "step": 4446 + }, + { + "epoch": 0.8896, + "grad_norm": 0.7354930639266968, + "learning_rate": 1.7888696107795347e-05, + "loss": 0.2111, + "step": 4448 + }, + { + "epoch": 0.89, + "grad_norm": 1.5520787239074707, + "learning_rate": 1.7880107536067228e-05, + "loss": 0.1071, + "step": 4450 + }, + { + "epoch": 0.8904, + "grad_norm": 3.634035110473633, + "learning_rate": 1.787150360166623e-05, + "loss": 0.2106, + "step": 4452 + }, + { + "epoch": 0.8908, + "grad_norm": 5.12615966796875, + "learning_rate": 1.78628843213662e-05, + "loss": 0.1875, + "step": 4454 + }, + { + "epoch": 0.8912, + "grad_norm": 0.6189030408859253, + "learning_rate": 1.7854249711970826e-05, + "loss": 0.504, + "step": 4456 + }, + { + "epoch": 0.8916, + "grad_norm": 14.240012168884277, + "learning_rate": 1.7845599790313732e-05, + "loss": 0.515, + "step": 4458 + }, + { + "epoch": 0.892, + "grad_norm": 1.5689350366592407, + "learning_rate": 1.783693457325841e-05, + "loss": 0.1964, + "step": 4460 + }, + { + "epoch": 0.8924, + "grad_norm": 2.499042510986328, + "learning_rate": 1.782825407769811e-05, + "loss": 0.2706, + "step": 4462 + }, + { + "epoch": 0.8928, + "grad_norm": 5.313007831573486, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.4566, + "step": 4464 + }, + { + "epoch": 0.8932, + "grad_norm": 6.681105136871338, + "learning_rate": 1.7810847318784632e-05, + "loss": 0.5711, + "step": 4466 + }, + { + "epoch": 0.8936, + "grad_norm": 5.436981201171875, + "learning_rate": 1.780212108936685e-05, + "loss": 0.2442, + "step": 4468 + }, + { + "epoch": 0.894, + "grad_norm": 1.9634642601013184, + "learning_rate": 1.7793379649314743e-05, + "loss": 0.4256, + "step": 4470 + }, + { + "epoch": 0.8944, + "grad_norm": 7.332373142242432, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.2936, + "step": 4472 + }, + { + "epoch": 0.8948, + "grad_norm": 6.378211498260498, + "learning_rate": 1.777585120550481e-05, + "loss": 0.381, + "step": 4474 + }, + { + "epoch": 0.8952, + "grad_norm": 1.9609240293502808, + "learning_rate": 1.7767064235919594e-05, + "loss": 0.2294, + "step": 4476 + }, + { + "epoch": 0.8956, + "grad_norm": 2.7844057083129883, + "learning_rate": 1.77582621240452e-05, + "loss": 0.5405, + "step": 4478 + }, + { + "epoch": 0.896, + "grad_norm": 3.83945631980896, + "learning_rate": 1.77494448870418e-05, + "loss": 0.3067, + "step": 4480 + }, + { + "epoch": 0.8964, + "grad_norm": 4.778282165527344, + "learning_rate": 1.774061254209907e-05, + "loss": 0.5211, + "step": 4482 + }, + { + "epoch": 0.8968, + "grad_norm": 17.623916625976562, + "learning_rate": 1.773176510643608e-05, + "loss": 0.5522, + "step": 4484 + }, + { + "epoch": 0.8972, + "grad_norm": 1.917486310005188, + "learning_rate": 1.7722902597301388e-05, + "loss": 0.4967, + "step": 4486 + }, + { + "epoch": 0.8976, + "grad_norm": 6.527610778808594, + "learning_rate": 1.7714025031972894e-05, + "loss": 0.2257, + "step": 4488 + }, + { + "epoch": 0.898, + "grad_norm": 6.171792030334473, + "learning_rate": 1.77051324277579e-05, + "loss": 0.326, + "step": 4490 + }, + { + "epoch": 0.8984, + "grad_norm": 3.731717824935913, + "learning_rate": 1.769622480199295e-05, + "loss": 0.4339, + "step": 4492 + }, + { + "epoch": 0.8988, + "grad_norm": 2.4026479721069336, + "learning_rate": 1.7687302172043926e-05, + "loss": 0.417, + "step": 4494 + }, + { + "epoch": 0.8992, + "grad_norm": 4.909908771514893, + "learning_rate": 1.7678364555305982e-05, + "loss": 0.5636, + "step": 4496 + }, + { + "epoch": 0.8996, + "grad_norm": 7.68532657623291, + "learning_rate": 1.7669411969203424e-05, + "loss": 0.3545, + "step": 4498 + }, + { + "epoch": 0.9, + "grad_norm": 0.0055258008651435375, + "learning_rate": 1.7660444431189777e-05, + "loss": 0.1089, + "step": 4500 + }, + { + "epoch": 0.9004, + "grad_norm": 5.9726881980896, + "learning_rate": 1.765146195874774e-05, + "loss": 0.6284, + "step": 4502 + }, + { + "epoch": 0.9008, + "grad_norm": 10.736701011657715, + "learning_rate": 1.76424645693891e-05, + "loss": 0.5934, + "step": 4504 + }, + { + "epoch": 0.9012, + "grad_norm": 0.5862959027290344, + "learning_rate": 1.7633452280654696e-05, + "loss": 0.1777, + "step": 4506 + }, + { + "epoch": 0.9016, + "grad_norm": 2.165034770965576, + "learning_rate": 1.762442511011448e-05, + "loss": 0.2382, + "step": 4508 + }, + { + "epoch": 0.902, + "grad_norm": 2.448538303375244, + "learning_rate": 1.761538307536738e-05, + "loss": 0.5544, + "step": 4510 + }, + { + "epoch": 0.9024, + "grad_norm": 2.721277952194214, + "learning_rate": 1.7606326194041285e-05, + "loss": 0.3865, + "step": 4512 + }, + { + "epoch": 0.9028, + "grad_norm": 5.212175369262695, + "learning_rate": 1.759725448379305e-05, + "loss": 0.4569, + "step": 4514 + }, + { + "epoch": 0.9032, + "grad_norm": 9.373841285705566, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.4466, + "step": 4516 + }, + { + "epoch": 0.9036, + "grad_norm": 7.129452228546143, + "learning_rate": 1.7579066647302147e-05, + "loss": 0.5416, + "step": 4518 + }, + { + "epoch": 0.904, + "grad_norm": 7.802352428436279, + "learning_rate": 1.756995055651757e-05, + "loss": 0.3362, + "step": 4520 + }, + { + "epoch": 0.9044, + "grad_norm": 2.643044948577881, + "learning_rate": 1.7560819707727037e-05, + "loss": 0.6605, + "step": 4522 + }, + { + "epoch": 0.9048, + "grad_norm": 2.0603039264678955, + "learning_rate": 1.7551674118731585e-05, + "loss": 0.3187, + "step": 4524 + }, + { + "epoch": 0.9052, + "grad_norm": 7.363227844238281, + "learning_rate": 1.7542513807361044e-05, + "loss": 0.4515, + "step": 4526 + }, + { + "epoch": 0.9056, + "grad_norm": 1.5020396709442139, + "learning_rate": 1.7533338791473875e-05, + "loss": 0.2513, + "step": 4528 + }, + { + "epoch": 0.906, + "grad_norm": 1.8217923641204834, + "learning_rate": 1.7524149088957238e-05, + "loss": 0.2585, + "step": 4530 + }, + { + "epoch": 0.9064, + "grad_norm": 2.36561918258667, + "learning_rate": 1.751494471772697e-05, + "loss": 0.4537, + "step": 4532 + }, + { + "epoch": 0.9068, + "grad_norm": 3.160966157913208, + "learning_rate": 1.750572569572742e-05, + "loss": 0.5442, + "step": 4534 + }, + { + "epoch": 0.9072, + "grad_norm": 3.7151758670806885, + "learning_rate": 1.7496492040931548e-05, + "loss": 0.2088, + "step": 4536 + }, + { + "epoch": 0.9076, + "grad_norm": 2.7269296646118164, + "learning_rate": 1.7487243771340862e-05, + "loss": 0.3783, + "step": 4538 + }, + { + "epoch": 0.908, + "grad_norm": 2.14815354347229, + "learning_rate": 1.747798090498533e-05, + "loss": 0.4218, + "step": 4540 + }, + { + "epoch": 0.9084, + "grad_norm": 2.6540491580963135, + "learning_rate": 1.7468703459923365e-05, + "loss": 0.1083, + "step": 4542 + }, + { + "epoch": 0.9088, + "grad_norm": 4.2495436668396, + "learning_rate": 1.745941145424182e-05, + "loss": 0.4062, + "step": 4544 + }, + { + "epoch": 0.9092, + "grad_norm": 1.9258490800857544, + "learning_rate": 1.7450104906055973e-05, + "loss": 0.1071, + "step": 4546 + }, + { + "epoch": 0.9096, + "grad_norm": 2.3075156211853027, + "learning_rate": 1.744078383350938e-05, + "loss": 0.5631, + "step": 4548 + }, + { + "epoch": 0.91, + "grad_norm": 7.055587291717529, + "learning_rate": 1.7431448254773943e-05, + "loss": 0.4367, + "step": 4550 + }, + { + "epoch": 0.9104, + "grad_norm": 9.724135398864746, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.4693, + "step": 4552 + }, + { + "epoch": 0.9108, + "grad_norm": 3.4774413108825684, + "learning_rate": 1.7412733651565624e-05, + "loss": 0.3226, + "step": 4554 + }, + { + "epoch": 0.9112, + "grad_norm": 0.4970102906227112, + "learning_rate": 1.7403354663577782e-05, + "loss": 0.1458, + "step": 4556 + }, + { + "epoch": 0.9116, + "grad_norm": 2.20710825920105, + "learning_rate": 1.739396124237121e-05, + "loss": 0.3905, + "step": 4558 + }, + { + "epoch": 0.912, + "grad_norm": 13.165175437927246, + "learning_rate": 1.738455340625883e-05, + "loss": 0.7534, + "step": 4560 + }, + { + "epoch": 0.9124, + "grad_norm": 3.6718461513519287, + "learning_rate": 1.7375131173581744e-05, + "loss": 0.3831, + "step": 4562 + }, + { + "epoch": 0.9128, + "grad_norm": 5.074256896972656, + "learning_rate": 1.7365694562709038e-05, + "loss": 0.2103, + "step": 4564 + }, + { + "epoch": 0.9132, + "grad_norm": 16.361331939697266, + "learning_rate": 1.7356243592037865e-05, + "loss": 0.6481, + "step": 4566 + }, + { + "epoch": 0.9136, + "grad_norm": 3.182610034942627, + "learning_rate": 1.7346778279993433e-05, + "loss": 0.6665, + "step": 4568 + }, + { + "epoch": 0.914, + "grad_norm": 4.464386940002441, + "learning_rate": 1.733729864502877e-05, + "loss": 0.4995, + "step": 4570 + }, + { + "epoch": 0.9144, + "grad_norm": 2.6768336296081543, + "learning_rate": 1.7327804705624962e-05, + "loss": 0.3199, + "step": 4572 + }, + { + "epoch": 0.9148, + "grad_norm": 2.428009271621704, + "learning_rate": 1.731829648029091e-05, + "loss": 0.5139, + "step": 4574 + }, + { + "epoch": 0.9152, + "grad_norm": 16.112293243408203, + "learning_rate": 1.730877398756341e-05, + "loss": 0.4833, + "step": 4576 + }, + { + "epoch": 0.9156, + "grad_norm": 5.402769565582275, + "learning_rate": 1.7299237246007025e-05, + "loss": 0.3194, + "step": 4578 + }, + { + "epoch": 0.916, + "grad_norm": 4.706737995147705, + "learning_rate": 1.7289686274214113e-05, + "loss": 0.4925, + "step": 4580 + }, + { + "epoch": 0.9164, + "grad_norm": 4.471282482147217, + "learning_rate": 1.7280121090804824e-05, + "loss": 0.3652, + "step": 4582 + }, + { + "epoch": 0.9168, + "grad_norm": 3.4153380393981934, + "learning_rate": 1.727054171442693e-05, + "loss": 0.2637, + "step": 4584 + }, + { + "epoch": 0.9172, + "grad_norm": 0.9554430842399597, + "learning_rate": 1.7260948163755918e-05, + "loss": 0.4319, + "step": 4586 + }, + { + "epoch": 0.9176, + "grad_norm": 4.5731048583984375, + "learning_rate": 1.7251340457494934e-05, + "loss": 0.1525, + "step": 4588 + }, + { + "epoch": 0.918, + "grad_norm": 2.797450542449951, + "learning_rate": 1.7241718614374688e-05, + "loss": 0.511, + "step": 4590 + }, + { + "epoch": 0.9184, + "grad_norm": 9.728850364685059, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.4407, + "step": 4592 + }, + { + "epoch": 0.9188, + "grad_norm": 2.8559415340423584, + "learning_rate": 1.722243259261697e-05, + "loss": 0.2558, + "step": 4594 + }, + { + "epoch": 0.9192, + "grad_norm": 3.760319232940674, + "learning_rate": 1.7212768451578595e-05, + "loss": 0.7497, + "step": 4596 + }, + { + "epoch": 0.9196, + "grad_norm": 2.416416645050049, + "learning_rate": 1.7203090248879084e-05, + "loss": 0.3434, + "step": 4598 + }, + { + "epoch": 0.92, + "grad_norm": 3.2818026542663574, + "learning_rate": 1.7193398003386517e-05, + "loss": 0.3152, + "step": 4600 + }, + { + "epoch": 0.9204, + "grad_norm": 2.1103556156158447, + "learning_rate": 1.7183691733996463e-05, + "loss": 0.335, + "step": 4602 + }, + { + "epoch": 0.9208, + "grad_norm": 4.941044330596924, + "learning_rate": 1.7173971459631803e-05, + "loss": 0.3835, + "step": 4604 + }, + { + "epoch": 0.9212, + "grad_norm": 4.856771469116211, + "learning_rate": 1.7164237199242663e-05, + "loss": 0.3638, + "step": 4606 + }, + { + "epoch": 0.9216, + "grad_norm": 2.7062876224517822, + "learning_rate": 1.7154488971806525e-05, + "loss": 0.3937, + "step": 4608 + }, + { + "epoch": 0.922, + "grad_norm": 11.531942367553711, + "learning_rate": 1.7144726796328027e-05, + "loss": 0.2573, + "step": 4610 + }, + { + "epoch": 0.9224, + "grad_norm": 1.8068491220474243, + "learning_rate": 1.713495069183907e-05, + "loss": 0.1208, + "step": 4612 + }, + { + "epoch": 0.9228, + "grad_norm": 3.5942440032958984, + "learning_rate": 1.7125160677398632e-05, + "loss": 0.2104, + "step": 4614 + }, + { + "epoch": 0.9232, + "grad_norm": 1.4732728004455566, + "learning_rate": 1.7115356772092847e-05, + "loss": 0.4303, + "step": 4616 + }, + { + "epoch": 0.9236, + "grad_norm": 3.314056634902954, + "learning_rate": 1.710553899503497e-05, + "loss": 0.3332, + "step": 4618 + }, + { + "epoch": 0.924, + "grad_norm": 1.372910737991333, + "learning_rate": 1.709570736536522e-05, + "loss": 0.1404, + "step": 4620 + }, + { + "epoch": 0.9244, + "grad_norm": 1.1733759641647339, + "learning_rate": 1.708586190225086e-05, + "loss": 0.1505, + "step": 4622 + }, + { + "epoch": 0.9248, + "grad_norm": 2.194852113723755, + "learning_rate": 1.7076002624886152e-05, + "loss": 0.2692, + "step": 4624 + }, + { + "epoch": 0.9252, + "grad_norm": 2.453238010406494, + "learning_rate": 1.7066129552492258e-05, + "loss": 0.2866, + "step": 4626 + }, + { + "epoch": 0.9256, + "grad_norm": 8.256357192993164, + "learning_rate": 1.705624270431722e-05, + "loss": 0.5584, + "step": 4628 + }, + { + "epoch": 0.926, + "grad_norm": 6.352283000946045, + "learning_rate": 1.7046342099635945e-05, + "loss": 0.4419, + "step": 4630 + }, + { + "epoch": 0.9264, + "grad_norm": 2.786451816558838, + "learning_rate": 1.70364277577502e-05, + "loss": 0.6648, + "step": 4632 + }, + { + "epoch": 0.9268, + "grad_norm": 1.9137259721755981, + "learning_rate": 1.702649969798851e-05, + "loss": 0.2136, + "step": 4634 + }, + { + "epoch": 0.9272, + "grad_norm": 1.9395772218704224, + "learning_rate": 1.7016557939706078e-05, + "loss": 0.4255, + "step": 4636 + }, + { + "epoch": 0.9276, + "grad_norm": 1.6387386322021484, + "learning_rate": 1.700660250228492e-05, + "loss": 0.1999, + "step": 4638 + }, + { + "epoch": 0.928, + "grad_norm": 2.2650303840637207, + "learning_rate": 1.6996633405133673e-05, + "loss": 0.3371, + "step": 4640 + }, + { + "epoch": 0.9284, + "grad_norm": 2.447960615158081, + "learning_rate": 1.6986650667687556e-05, + "loss": 0.2845, + "step": 4642 + }, + { + "epoch": 0.9288, + "grad_norm": 1.6944670677185059, + "learning_rate": 1.6976654309408468e-05, + "loss": 0.096, + "step": 4644 + }, + { + "epoch": 0.9292, + "grad_norm": 3.548175573348999, + "learning_rate": 1.69666443497848e-05, + "loss": 0.5395, + "step": 4646 + }, + { + "epoch": 0.9296, + "grad_norm": 5.435319900512695, + "learning_rate": 1.6956620808331515e-05, + "loss": 0.4494, + "step": 4648 + }, + { + "epoch": 0.93, + "grad_norm": 3.812539577484131, + "learning_rate": 1.694658370458998e-05, + "loss": 0.246, + "step": 4650 + }, + { + "epoch": 0.9304, + "grad_norm": 1.649511456489563, + "learning_rate": 1.6936533058128042e-05, + "loss": 0.44, + "step": 4652 + }, + { + "epoch": 0.9308, + "grad_norm": 6.377965927124023, + "learning_rate": 1.692646888854001e-05, + "loss": 0.5262, + "step": 4654 + }, + { + "epoch": 0.9312, + "grad_norm": 8.255119323730469, + "learning_rate": 1.691639121544641e-05, + "loss": 0.4184, + "step": 4656 + }, + { + "epoch": 0.9316, + "grad_norm": 4.651469707489014, + "learning_rate": 1.690630005849424e-05, + "loss": 0.2894, + "step": 4658 + }, + { + "epoch": 0.932, + "grad_norm": 5.685030460357666, + "learning_rate": 1.6896195437356696e-05, + "loss": 0.2151, + "step": 4660 + }, + { + "epoch": 0.9324, + "grad_norm": 1.387536644935608, + "learning_rate": 1.6886077371733295e-05, + "loss": 0.5508, + "step": 4662 + }, + { + "epoch": 0.9328, + "grad_norm": 3.8255255222320557, + "learning_rate": 1.6875945881349686e-05, + "loss": 0.284, + "step": 4664 + }, + { + "epoch": 0.9332, + "grad_norm": 3.829646110534668, + "learning_rate": 1.6865800985957725e-05, + "loss": 0.3273, + "step": 4666 + }, + { + "epoch": 0.9336, + "grad_norm": 7.510060787200928, + "learning_rate": 1.6855642705335435e-05, + "loss": 0.4721, + "step": 4668 + }, + { + "epoch": 0.934, + "grad_norm": 7.441338062286377, + "learning_rate": 1.68454710592869e-05, + "loss": 0.3862, + "step": 4670 + }, + { + "epoch": 0.9344, + "grad_norm": 3.7644593715667725, + "learning_rate": 1.6835286067642228e-05, + "loss": 0.4442, + "step": 4672 + }, + { + "epoch": 0.9348, + "grad_norm": 4.605536460876465, + "learning_rate": 1.6825087750257617e-05, + "loss": 0.5004, + "step": 4674 + }, + { + "epoch": 0.9352, + "grad_norm": 3.8892505168914795, + "learning_rate": 1.681487612701521e-05, + "loss": 0.3465, + "step": 4676 + }, + { + "epoch": 0.9356, + "grad_norm": 1.5299859046936035, + "learning_rate": 1.6804651217823055e-05, + "loss": 0.2248, + "step": 4678 + }, + { + "epoch": 0.936, + "grad_norm": 1.5003297328948975, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.6884, + "step": 4680 + }, + { + "epoch": 0.9364, + "grad_norm": 9.817461013793945, + "learning_rate": 1.6784161621351374e-05, + "loss": 0.5621, + "step": 4682 + }, + { + "epoch": 0.9368, + "grad_norm": 5.5773749351501465, + "learning_rate": 1.677389697401739e-05, + "loss": 0.3697, + "step": 4684 + }, + { + "epoch": 0.9372, + "grad_norm": 0.8657464981079102, + "learning_rate": 1.67636191206246e-05, + "loss": 0.2384, + "step": 4686 + }, + { + "epoch": 0.9376, + "grad_norm": 2.2345759868621826, + "learning_rate": 1.675332808121025e-05, + "loss": 0.5479, + "step": 4688 + }, + { + "epoch": 0.938, + "grad_norm": 4.0211029052734375, + "learning_rate": 1.6743023875837253e-05, + "loss": 0.402, + "step": 4690 + }, + { + "epoch": 0.9384, + "grad_norm": 4.082418918609619, + "learning_rate": 1.6732706524594145e-05, + "loss": 0.5392, + "step": 4692 + }, + { + "epoch": 0.9388, + "grad_norm": 15.433554649353027, + "learning_rate": 1.672237604759517e-05, + "loss": 0.3681, + "step": 4694 + }, + { + "epoch": 0.9392, + "grad_norm": 0.5448479056358337, + "learning_rate": 1.671203246498009e-05, + "loss": 0.1803, + "step": 4696 + }, + { + "epoch": 0.9396, + "grad_norm": 4.541321277618408, + "learning_rate": 1.670167579691429e-05, + "loss": 0.2615, + "step": 4698 + }, + { + "epoch": 0.94, + "grad_norm": 7.349790096282959, + "learning_rate": 1.6691306063588593e-05, + "loss": 0.6248, + "step": 4700 + }, + { + "epoch": 0.9404, + "grad_norm": 5.262326240539551, + "learning_rate": 1.668092328521931e-05, + "loss": 0.7674, + "step": 4702 + }, + { + "epoch": 0.9408, + "grad_norm": 5.039717197418213, + "learning_rate": 1.6670527482048242e-05, + "loss": 0.2734, + "step": 4704 + }, + { + "epoch": 0.9412, + "grad_norm": 3.4434213638305664, + "learning_rate": 1.6660118674342525e-05, + "loss": 0.1602, + "step": 4706 + }, + { + "epoch": 0.9416, + "grad_norm": 2.414407968521118, + "learning_rate": 1.6649696882394635e-05, + "loss": 0.3875, + "step": 4708 + }, + { + "epoch": 0.942, + "grad_norm": 2.828547954559326, + "learning_rate": 1.6639262126522414e-05, + "loss": 0.3173, + "step": 4710 + }, + { + "epoch": 0.9424, + "grad_norm": 5.270089626312256, + "learning_rate": 1.6628814427068968e-05, + "loss": 0.2516, + "step": 4712 + }, + { + "epoch": 0.9428, + "grad_norm": 4.692998886108398, + "learning_rate": 1.661835380440258e-05, + "loss": 0.516, + "step": 4714 + }, + { + "epoch": 0.9432, + "grad_norm": 4.202429294586182, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.3991, + "step": 4716 + }, + { + "epoch": 0.9436, + "grad_norm": 0.05367492511868477, + "learning_rate": 1.6597393871030264e-05, + "loss": 0.1755, + "step": 4718 + }, + { + "epoch": 0.944, + "grad_norm": 6.432846546173096, + "learning_rate": 1.6586894601186824e-05, + "loss": 0.4153, + "step": 4720 + }, + { + "epoch": 0.9444, + "grad_norm": 9.411581039428711, + "learning_rate": 1.6576382489855278e-05, + "loss": 0.4967, + "step": 4722 + }, + { + "epoch": 0.9448, + "grad_norm": 3.278308868408203, + "learning_rate": 1.656585755752957e-05, + "loss": 0.7007, + "step": 4724 + }, + { + "epoch": 0.9452, + "grad_norm": 8.97327995300293, + "learning_rate": 1.655531982472859e-05, + "loss": 0.5993, + "step": 4726 + }, + { + "epoch": 0.9456, + "grad_norm": 12.207992553710938, + "learning_rate": 1.6544769311996153e-05, + "loss": 0.6414, + "step": 4728 + }, + { + "epoch": 0.946, + "grad_norm": 1.9908528327941895, + "learning_rate": 1.653420603990106e-05, + "loss": 0.3166, + "step": 4730 + }, + { + "epoch": 0.9464, + "grad_norm": 3.661930799484253, + "learning_rate": 1.6523630029036924e-05, + "loss": 0.1355, + "step": 4732 + }, + { + "epoch": 0.9468, + "grad_norm": 7.653252601623535, + "learning_rate": 1.651304130002226e-05, + "loss": 0.4729, + "step": 4734 + }, + { + "epoch": 0.9472, + "grad_norm": 13.31554126739502, + "learning_rate": 1.6502439873500294e-05, + "loss": 0.4986, + "step": 4736 + }, + { + "epoch": 0.9476, + "grad_norm": 10.450355529785156, + "learning_rate": 1.6491825770139058e-05, + "loss": 0.6854, + "step": 4738 + }, + { + "epoch": 0.948, + "grad_norm": 8.442341804504395, + "learning_rate": 1.6481199010631305e-05, + "loss": 1.2273, + "step": 4740 + }, + { + "epoch": 0.9484, + "grad_norm": 3.1967289447784424, + "learning_rate": 1.6470559615694455e-05, + "loss": 0.1959, + "step": 4742 + }, + { + "epoch": 0.9488, + "grad_norm": 5.6306352615356445, + "learning_rate": 1.645990760607052e-05, + "loss": 0.3239, + "step": 4744 + }, + { + "epoch": 0.9492, + "grad_norm": 8.5411376953125, + "learning_rate": 1.644924300252614e-05, + "loss": 0.4554, + "step": 4746 + }, + { + "epoch": 0.9496, + "grad_norm": 2.5537147521972656, + "learning_rate": 1.643856582585255e-05, + "loss": 0.1886, + "step": 4748 + }, + { + "epoch": 0.95, + "grad_norm": 7.373293876647949, + "learning_rate": 1.6427876096865407e-05, + "loss": 0.2779, + "step": 4750 + }, + { + "epoch": 0.9504, + "grad_norm": 1.9812324047088623, + "learning_rate": 1.641717383640488e-05, + "loss": 0.2329, + "step": 4752 + }, + { + "epoch": 0.9508, + "grad_norm": 1.7200254201889038, + "learning_rate": 1.6406459065335616e-05, + "loss": 0.3782, + "step": 4754 + }, + { + "epoch": 0.9512, + "grad_norm": 1.628679633140564, + "learning_rate": 1.6395731804546596e-05, + "loss": 0.28, + "step": 4756 + }, + { + "epoch": 0.9516, + "grad_norm": 6.510631561279297, + "learning_rate": 1.6384992074951128e-05, + "loss": 0.2717, + "step": 4758 + }, + { + "epoch": 0.952, + "grad_norm": 4.3001227378845215, + "learning_rate": 1.63742398974869e-05, + "loss": 0.2813, + "step": 4760 + }, + { + "epoch": 0.9524, + "grad_norm": 16.389856338500977, + "learning_rate": 1.6363475293115838e-05, + "loss": 1.0029, + "step": 4762 + }, + { + "epoch": 0.9528, + "grad_norm": 1.8763813972473145, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.3575, + "step": 4764 + }, + { + "epoch": 0.9532, + "grad_norm": 2.881653070449829, + "learning_rate": 1.63419088876219e-05, + "loss": 0.3787, + "step": 4766 + }, + { + "epoch": 0.9536, + "grad_norm": 2.074388265609741, + "learning_rate": 1.633110712854385e-05, + "loss": 0.3541, + "step": 4768 + }, + { + "epoch": 0.954, + "grad_norm": 7.589776039123535, + "learning_rate": 1.6320293026648515e-05, + "loss": 0.558, + "step": 4770 + }, + { + "epoch": 0.9544, + "grad_norm": 3.187589645385742, + "learning_rate": 1.6309466603018504e-05, + "loss": 0.2711, + "step": 4772 + }, + { + "epoch": 0.9548, + "grad_norm": 3.116626739501953, + "learning_rate": 1.6298627878760495e-05, + "loss": 0.4733, + "step": 4774 + }, + { + "epoch": 0.9552, + "grad_norm": 5.240434646606445, + "learning_rate": 1.6287776875005148e-05, + "loss": 0.3698, + "step": 4776 + }, + { + "epoch": 0.9556, + "grad_norm": 2.423557996749878, + "learning_rate": 1.6276913612907015e-05, + "loss": 0.2624, + "step": 4778 + }, + { + "epoch": 0.956, + "grad_norm": 5.1820220947265625, + "learning_rate": 1.6266038113644612e-05, + "loss": 0.9108, + "step": 4780 + }, + { + "epoch": 0.9564, + "grad_norm": 4.259026527404785, + "learning_rate": 1.6255150398420266e-05, + "loss": 0.5389, + "step": 4782 + }, + { + "epoch": 0.9568, + "grad_norm": 12.433534622192383, + "learning_rate": 1.624425048846017e-05, + "loss": 0.8763, + "step": 4784 + }, + { + "epoch": 0.9572, + "grad_norm": 8.124911308288574, + "learning_rate": 1.623333840501421e-05, + "loss": 0.3713, + "step": 4786 + }, + { + "epoch": 0.9576, + "grad_norm": 4.081881046295166, + "learning_rate": 1.6222414169356063e-05, + "loss": 0.3538, + "step": 4788 + }, + { + "epoch": 0.958, + "grad_norm": 1.776490569114685, + "learning_rate": 1.6211477802783102e-05, + "loss": 0.3163, + "step": 4790 + }, + { + "epoch": 0.9584, + "grad_norm": 2.257758140563965, + "learning_rate": 1.6200529326616343e-05, + "loss": 0.2011, + "step": 4792 + }, + { + "epoch": 0.9588, + "grad_norm": 4.669278144836426, + "learning_rate": 1.618956876220035e-05, + "loss": 0.1911, + "step": 4794 + }, + { + "epoch": 0.9592, + "grad_norm": 2.420146942138672, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.3475, + "step": 4796 + }, + { + "epoch": 0.9596, + "grad_norm": 2.2192740440368652, + "learning_rate": 1.616761145411704e-05, + "loss": 0.2908, + "step": 4798 + }, + { + "epoch": 0.96, + "grad_norm": 7.933553218841553, + "learning_rate": 1.6156614753256587e-05, + "loss": 0.6231, + "step": 4800 + }, + { + "epoch": 0.9604, + "grad_norm": 17.948909759521484, + "learning_rate": 1.6145606049760648e-05, + "loss": 0.9407, + "step": 4802 + }, + { + "epoch": 0.9608, + "grad_norm": 7.540837287902832, + "learning_rate": 1.613458536509123e-05, + "loss": 0.7206, + "step": 4804 + }, + { + "epoch": 0.9612, + "grad_norm": 7.342236042022705, + "learning_rate": 1.612355272073378e-05, + "loss": 0.7455, + "step": 4806 + }, + { + "epoch": 0.9616, + "grad_norm": 8.737372398376465, + "learning_rate": 1.6112508138196922e-05, + "loss": 0.5469, + "step": 4808 + }, + { + "epoch": 0.962, + "grad_norm": 11.444573402404785, + "learning_rate": 1.610145163901268e-05, + "loss": 0.4966, + "step": 4810 + }, + { + "epoch": 0.9624, + "grad_norm": 10.676769256591797, + "learning_rate": 1.6090383244736277e-05, + "loss": 0.6603, + "step": 4812 + }, + { + "epoch": 0.9628, + "grad_norm": 5.099000930786133, + "learning_rate": 1.6079302976946062e-05, + "loss": 0.2465, + "step": 4814 + }, + { + "epoch": 0.9632, + "grad_norm": 4.46925687789917, + "learning_rate": 1.606821085724363e-05, + "loss": 0.3478, + "step": 4816 + }, + { + "epoch": 0.9636, + "grad_norm": 6.447728157043457, + "learning_rate": 1.6057106907253607e-05, + "loss": 0.3757, + "step": 4818 + }, + { + "epoch": 0.964, + "grad_norm": 8.74674129486084, + "learning_rate": 1.6045991148623756e-05, + "loss": 0.3401, + "step": 4820 + }, + { + "epoch": 0.9644, + "grad_norm": 1.2778180837631226, + "learning_rate": 1.6034863603024775e-05, + "loss": 0.2504, + "step": 4822 + }, + { + "epoch": 0.9648, + "grad_norm": 2.5038793087005615, + "learning_rate": 1.602372429215038e-05, + "loss": 0.3047, + "step": 4824 + }, + { + "epoch": 0.9652, + "grad_norm": 4.5180559158325195, + "learning_rate": 1.6012573237717265e-05, + "loss": 0.2635, + "step": 4826 + }, + { + "epoch": 0.9656, + "grad_norm": 3.237168550491333, + "learning_rate": 1.600141046146497e-05, + "loss": 0.2061, + "step": 4828 + }, + { + "epoch": 0.966, + "grad_norm": 0.844524085521698, + "learning_rate": 1.5990235985155856e-05, + "loss": 0.2082, + "step": 4830 + }, + { + "epoch": 0.9664, + "grad_norm": 3.443889617919922, + "learning_rate": 1.597904983057519e-05, + "loss": 0.2482, + "step": 4832 + }, + { + "epoch": 0.9668, + "grad_norm": 5.986931324005127, + "learning_rate": 1.5967852019530942e-05, + "loss": 0.4262, + "step": 4834 + }, + { + "epoch": 0.9672, + "grad_norm": 2.591135025024414, + "learning_rate": 1.5956642573853794e-05, + "loss": 0.4015, + "step": 4836 + }, + { + "epoch": 0.9676, + "grad_norm": 4.978301048278809, + "learning_rate": 1.5945421515397135e-05, + "loss": 0.5276, + "step": 4838 + }, + { + "epoch": 0.968, + "grad_norm": 4.329903602600098, + "learning_rate": 1.5934188866037014e-05, + "loss": 0.2317, + "step": 4840 + }, + { + "epoch": 0.9684, + "grad_norm": 1.0851725339889526, + "learning_rate": 1.5922944647672068e-05, + "loss": 0.4802, + "step": 4842 + }, + { + "epoch": 0.9688, + "grad_norm": 6.902378082275391, + "learning_rate": 1.591168888222342e-05, + "loss": 0.501, + "step": 4844 + }, + { + "epoch": 0.9692, + "grad_norm": 1.8128844499588013, + "learning_rate": 1.5900421591634816e-05, + "loss": 0.2279, + "step": 4846 + }, + { + "epoch": 0.9696, + "grad_norm": 10.014293670654297, + "learning_rate": 1.5889142797872407e-05, + "loss": 0.3971, + "step": 4848 + }, + { + "epoch": 0.97, + "grad_norm": 1.8142011165618896, + "learning_rate": 1.5877852522924736e-05, + "loss": 0.2759, + "step": 4850 + }, + { + "epoch": 0.9704, + "grad_norm": 7.790955066680908, + "learning_rate": 1.5866550788802818e-05, + "loss": 0.3522, + "step": 4852 + }, + { + "epoch": 0.9708, + "grad_norm": 5.489255428314209, + "learning_rate": 1.5855237617539932e-05, + "loss": 0.5317, + "step": 4854 + }, + { + "epoch": 0.9712, + "grad_norm": 2.29658579826355, + "learning_rate": 1.584391303119173e-05, + "loss": 0.4665, + "step": 4856 + }, + { + "epoch": 0.9716, + "grad_norm": 1.849222183227539, + "learning_rate": 1.5832577051836023e-05, + "loss": 0.1313, + "step": 4858 + }, + { + "epoch": 0.972, + "grad_norm": 10.02795124053955, + "learning_rate": 1.582122970157289e-05, + "loss": 0.6772, + "step": 4860 + }, + { + "epoch": 0.9724, + "grad_norm": 3.089338541030884, + "learning_rate": 1.5809871002524592e-05, + "loss": 0.4755, + "step": 4862 + }, + { + "epoch": 0.9728, + "grad_norm": 3.1757960319519043, + "learning_rate": 1.5798500976835503e-05, + "loss": 0.2026, + "step": 4864 + }, + { + "epoch": 0.9732, + "grad_norm": 7.2989678382873535, + "learning_rate": 1.5787119646672032e-05, + "loss": 0.3546, + "step": 4866 + }, + { + "epoch": 0.9736, + "grad_norm": 3.48993182182312, + "learning_rate": 1.577572703422267e-05, + "loss": 0.2233, + "step": 4868 + }, + { + "epoch": 0.974, + "grad_norm": 4.323819160461426, + "learning_rate": 1.5764323161697946e-05, + "loss": 0.3594, + "step": 4870 + }, + { + "epoch": 0.9744, + "grad_norm": 2.219534397125244, + "learning_rate": 1.575290805133024e-05, + "loss": 0.4658, + "step": 4872 + }, + { + "epoch": 0.9748, + "grad_norm": 2.2977614402770996, + "learning_rate": 1.5741481725373896e-05, + "loss": 0.2956, + "step": 4874 + }, + { + "epoch": 0.9752, + "grad_norm": 2.326444625854492, + "learning_rate": 1.5730044206105156e-05, + "loss": 0.2178, + "step": 4876 + }, + { + "epoch": 0.9756, + "grad_norm": 8.747400283813477, + "learning_rate": 1.571859551582204e-05, + "loss": 0.5483, + "step": 4878 + }, + { + "epoch": 0.976, + "grad_norm": 7.1064019203186035, + "learning_rate": 1.570713567684432e-05, + "loss": 0.2191, + "step": 4880 + }, + { + "epoch": 0.9764, + "grad_norm": 2.111372470855713, + "learning_rate": 1.5695664711513575e-05, + "loss": 0.2562, + "step": 4882 + }, + { + "epoch": 0.9768, + "grad_norm": 7.495279312133789, + "learning_rate": 1.5684182642193047e-05, + "loss": 0.444, + "step": 4884 + }, + { + "epoch": 0.9772, + "grad_norm": 3.3476638793945312, + "learning_rate": 1.567268949126757e-05, + "loss": 0.4219, + "step": 4886 + }, + { + "epoch": 0.9776, + "grad_norm": 5.760251045227051, + "learning_rate": 1.566118528114367e-05, + "loss": 0.3529, + "step": 4888 + }, + { + "epoch": 0.978, + "grad_norm": 11.214778900146484, + "learning_rate": 1.5649670034249372e-05, + "loss": 1.0015, + "step": 4890 + }, + { + "epoch": 0.9784, + "grad_norm": 4.737841606140137, + "learning_rate": 1.563814377303429e-05, + "loss": 0.4572, + "step": 4892 + }, + { + "epoch": 0.9788, + "grad_norm": 3.9997241497039795, + "learning_rate": 1.5626606519969373e-05, + "loss": 0.2889, + "step": 4894 + }, + { + "epoch": 0.9792, + "grad_norm": 3.1104519367218018, + "learning_rate": 1.561505829754715e-05, + "loss": 0.3164, + "step": 4896 + }, + { + "epoch": 0.9796, + "grad_norm": 3.08510422706604, + "learning_rate": 1.5603499128281437e-05, + "loss": 0.4608, + "step": 4898 + }, + { + "epoch": 0.98, + "grad_norm": 2.111323833465576, + "learning_rate": 1.5591929034707475e-05, + "loss": 0.2683, + "step": 4900 + }, + { + "epoch": 0.9804, + "grad_norm": 1.6616854667663574, + "learning_rate": 1.558034803938171e-05, + "loss": 0.3276, + "step": 4902 + }, + { + "epoch": 0.9808, + "grad_norm": 4.197687149047852, + "learning_rate": 1.5568756164881874e-05, + "loss": 0.3959, + "step": 4904 + }, + { + "epoch": 0.9812, + "grad_norm": 3.4220995903015137, + "learning_rate": 1.5557153433806974e-05, + "loss": 1.4505, + "step": 4906 + }, + { + "epoch": 0.9816, + "grad_norm": 2.4903643131256104, + "learning_rate": 1.5545539868777085e-05, + "loss": 0.2587, + "step": 4908 + }, + { + "epoch": 0.982, + "grad_norm": 6.556966781616211, + "learning_rate": 1.5533915492433437e-05, + "loss": 0.3339, + "step": 4910 + }, + { + "epoch": 0.9824, + "grad_norm": 9.041155815124512, + "learning_rate": 1.5522280327438384e-05, + "loss": 0.3373, + "step": 4912 + }, + { + "epoch": 0.9828, + "grad_norm": 1.2950966358184814, + "learning_rate": 1.5510634396475275e-05, + "loss": 0.3809, + "step": 4914 + }, + { + "epoch": 0.9832, + "grad_norm": 2.2902371883392334, + "learning_rate": 1.5498977722248398e-05, + "loss": 0.2543, + "step": 4916 + }, + { + "epoch": 0.9836, + "grad_norm": 1.8121403455734253, + "learning_rate": 1.5487310327483084e-05, + "loss": 0.6853, + "step": 4918 + }, + { + "epoch": 0.984, + "grad_norm": 7.039142608642578, + "learning_rate": 1.547563223492552e-05, + "loss": 0.387, + "step": 4920 + }, + { + "epoch": 0.9844, + "grad_norm": 0.9216674566268921, + "learning_rate": 1.5463943467342708e-05, + "loss": 0.2686, + "step": 4922 + }, + { + "epoch": 0.9848, + "grad_norm": 6.605070114135742, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.2736, + "step": 4924 + }, + { + "epoch": 0.9852, + "grad_norm": 1.1695291996002197, + "learning_rate": 1.5440533998273552e-05, + "loss": 0.3433, + "step": 4926 + }, + { + "epoch": 0.9856, + "grad_norm": 3.765047550201416, + "learning_rate": 1.5428813342425194e-05, + "loss": 0.3838, + "step": 4928 + }, + { + "epoch": 0.986, + "grad_norm": 0.923960268497467, + "learning_rate": 1.5417082102827407e-05, + "loss": 0.3428, + "step": 4930 + }, + { + "epoch": 0.9864, + "grad_norm": 3.8240437507629395, + "learning_rate": 1.5405340302350876e-05, + "loss": 0.3693, + "step": 4932 + }, + { + "epoch": 0.9868, + "grad_norm": 5.2713141441345215, + "learning_rate": 1.5393587963886827e-05, + "loss": 0.3319, + "step": 4934 + }, + { + "epoch": 0.9872, + "grad_norm": 2.36156964302063, + "learning_rate": 1.538182511034708e-05, + "loss": 0.3299, + "step": 4936 + }, + { + "epoch": 0.9876, + "grad_norm": 2.984520673751831, + "learning_rate": 1.5370051764663875e-05, + "loss": 0.528, + "step": 4938 + }, + { + "epoch": 0.988, + "grad_norm": 2.9261491298675537, + "learning_rate": 1.535826794978996e-05, + "loss": 0.2225, + "step": 4940 + }, + { + "epoch": 0.9884, + "grad_norm": 11.044042587280273, + "learning_rate": 1.534647368869852e-05, + "loss": 0.5053, + "step": 4942 + }, + { + "epoch": 0.9888, + "grad_norm": 0.8446880578994751, + "learning_rate": 1.5334669004383036e-05, + "loss": 0.12, + "step": 4944 + }, + { + "epoch": 0.9892, + "grad_norm": 2.0794358253479004, + "learning_rate": 1.5322853919857337e-05, + "loss": 0.2511, + "step": 4946 + }, + { + "epoch": 0.9896, + "grad_norm": 8.476103782653809, + "learning_rate": 1.5311028458155564e-05, + "loss": 0.1624, + "step": 4948 + }, + { + "epoch": 0.99, + "grad_norm": 3.510056972503662, + "learning_rate": 1.5299192642332063e-05, + "loss": 0.2864, + "step": 4950 + }, + { + "epoch": 0.9904, + "grad_norm": 4.38172721862793, + "learning_rate": 1.528734649546133e-05, + "loss": 0.4197, + "step": 4952 + }, + { + "epoch": 0.9908, + "grad_norm": 4.263515472412109, + "learning_rate": 1.5275490040638038e-05, + "loss": 0.4229, + "step": 4954 + }, + { + "epoch": 0.9912, + "grad_norm": 4.979330062866211, + "learning_rate": 1.5263623300976997e-05, + "loss": 0.4284, + "step": 4956 + }, + { + "epoch": 0.9916, + "grad_norm": 3.7567310333251953, + "learning_rate": 1.5251746299612973e-05, + "loss": 0.2467, + "step": 4958 + }, + { + "epoch": 0.992, + "grad_norm": 8.854488372802734, + "learning_rate": 1.5239859059700792e-05, + "loss": 0.2647, + "step": 4960 + }, + { + "epoch": 0.9924, + "grad_norm": 6.956577777862549, + "learning_rate": 1.522796160441527e-05, + "loss": 0.38, + "step": 4962 + }, + { + "epoch": 0.9928, + "grad_norm": 9.31304931640625, + "learning_rate": 1.5216053956951096e-05, + "loss": 0.3113, + "step": 4964 + }, + { + "epoch": 0.9932, + "grad_norm": 2.7152953147888184, + "learning_rate": 1.5204136140522799e-05, + "loss": 0.242, + "step": 4966 + }, + { + "epoch": 0.9936, + "grad_norm": 12.489564895629883, + "learning_rate": 1.5192208178364819e-05, + "loss": 0.2681, + "step": 4968 + }, + { + "epoch": 0.994, + "grad_norm": 6.194971561431885, + "learning_rate": 1.5180270093731291e-05, + "loss": 0.3229, + "step": 4970 + }, + { + "epoch": 0.9944, + "grad_norm": 2.1949777603149414, + "learning_rate": 1.5168321909896176e-05, + "loss": 0.1453, + "step": 4972 + }, + { + "epoch": 0.9948, + "grad_norm": 1.8492348194122314, + "learning_rate": 1.5156363650153017e-05, + "loss": 0.3968, + "step": 4974 + }, + { + "epoch": 0.9952, + "grad_norm": 10.583277702331543, + "learning_rate": 1.5144395337815057e-05, + "loss": 0.7443, + "step": 4976 + }, + { + "epoch": 0.9956, + "grad_norm": 4.1900739669799805, + "learning_rate": 1.5132416996215178e-05, + "loss": 0.3932, + "step": 4978 + }, + { + "epoch": 0.996, + "grad_norm": 2.3270652294158936, + "learning_rate": 1.5120428648705722e-05, + "loss": 0.3403, + "step": 4980 + }, + { + "epoch": 0.9964, + "grad_norm": 3.059401035308838, + "learning_rate": 1.5108430318658607e-05, + "loss": 0.1496, + "step": 4982 + }, + { + "epoch": 0.9968, + "grad_norm": 5.557708263397217, + "learning_rate": 1.5096422029465171e-05, + "loss": 0.47, + "step": 4984 + }, + { + "epoch": 0.9972, + "grad_norm": 5.171732425689697, + "learning_rate": 1.5084403804536236e-05, + "loss": 0.1582, + "step": 4986 + }, + { + "epoch": 0.9976, + "grad_norm": 9.425728797912598, + "learning_rate": 1.5072375667301904e-05, + "loss": 0.527, + "step": 4988 + }, + { + "epoch": 0.998, + "grad_norm": 8.549098014831543, + "learning_rate": 1.5060337641211636e-05, + "loss": 0.5003, + "step": 4990 + }, + { + "epoch": 0.9984, + "grad_norm": 3.9038662910461426, + "learning_rate": 1.5048289749734231e-05, + "loss": 0.4701, + "step": 4992 + }, + { + "epoch": 0.9988, + "grad_norm": 15.853522300720215, + "learning_rate": 1.5036232016357622e-05, + "loss": 1.0973, + "step": 4994 + }, + { + "epoch": 0.9992, + "grad_norm": 4.941516876220703, + "learning_rate": 1.502416446458898e-05, + "loss": 0.3362, + "step": 4996 + }, + { + "epoch": 0.9996, + "grad_norm": 3.7617809772491455, + "learning_rate": 1.5012087117954641e-05, + "loss": 0.5823, + "step": 4998 + }, + { + "epoch": 1.0, + "grad_norm": 7.155248641967773, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.2181, + "step": 5000 + }, + { + "epoch": 1.0, + "step": 5000, + "total_flos": 2.0304101377572864e+16, + "train_loss": 0.4523466708464082, + "train_runtime": 36528.9108, + "train_samples_per_second": 2.19, + "train_steps_per_second": 0.137 + } + ], + "logging_steps": 2, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 2.0304101377572864e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..f1fe6788df2fab894eddb2098ab6e2654841f0e0 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773bc3343ecb99509165bbfb84c7f403dbaa8d679ef769e5b1d41f92b36ebdab +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..bcc40d9663e926a3569751c582000f95fea650e8 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1eb792acfae0e75583d9e37b47d3cd9f277e1db5cae2de3e648a82049f8d2eac +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a8036b9d242f3d82093f6f725d275af6754049b6 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:572bd027f2712814ed6224ab445f1c5319a689f29ed2ce37162d0f69e71f50dd +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..abfb952ea134eeff60d1b8c5d5dd4b28bd4d0f81 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_gradnorm_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a05ae36937f432fba8bf0857373c7e7e593b5c3144e0a70c871751715519b78 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..16f74a5d7041ae64a0a4370ac759f1edde2591e8 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,15032 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004, + "learning_rate": 2.3485380412522497e-06, + "loss": 0.2346, + "step": 2 + }, + { + "epoch": 0.0008, + "learning_rate": 2.357535430610912e-06, + "loss": 0.2358, + "step": 4 + }, + { + "epoch": 0.0012, + "learning_rate": 2.366547719345306e-06, + "loss": 0.1932, + "step": 6 + }, + { + "epoch": 0.0016, + "learning_rate": 2.3755748898855234e-06, + "loss": 1.9608, + "step": 8 + }, + { + "epoch": 0.002, + "learning_rate": 2.3846169246326332e-06, + "loss": 0.6351, + "step": 10 + }, + { + "epoch": 0.0024, + "learning_rate": 2.3936738059587174e-06, + "loss": 0.3705, + "step": 12 + }, + { + "epoch": 0.0028, + "learning_rate": 2.4027455162069537e-06, + "loss": 0.5752, + "step": 14 + }, + { + "epoch": 0.0032, + "learning_rate": 2.411832037691545e-06, + "loss": 0.3304, + "step": 16 + }, + { + "epoch": 0.0036, + "learning_rate": 2.420933352697865e-06, + "loss": 0.7345, + "step": 18 + }, + { + "epoch": 0.004, + "learning_rate": 2.430049443482434e-06, + "loss": 0.2339, + "step": 20 + }, + { + "epoch": 0.0044, + "learning_rate": 2.439180292272967e-06, + "loss": 1.001, + "step": 22 + }, + { + "epoch": 0.0048, + "learning_rate": 2.448325881268406e-06, + "loss": 0.5155, + "step": 24 + }, + { + "epoch": 0.0052, + "learning_rate": 2.457486192638958e-06, + "loss": 0.2923, + "step": 26 + }, + { + "epoch": 0.0056, + "learning_rate": 2.4666612085261277e-06, + "loss": 0.3701, + "step": 28 + }, + { + "epoch": 0.006, + "learning_rate": 2.475850911042752e-06, + "loss": 0.6947, + "step": 30 + }, + { + "epoch": 0.0064, + "learning_rate": 2.4850552822730346e-06, + "loss": 0.2979, + "step": 32 + }, + { + "epoch": 0.0068, + "learning_rate": 2.4942743042725836e-06, + "loss": 0.4334, + "step": 34 + }, + { + "epoch": 0.0072, + "learning_rate": 2.503507959068455e-06, + "loss": 0.4388, + "step": 36 + }, + { + "epoch": 0.0076, + "learning_rate": 2.5127562286591313e-06, + "loss": 0.6165, + "step": 38 + }, + { + "epoch": 0.008, + "learning_rate": 2.522019095014686e-06, + "loss": 0.2293, + "step": 40 + }, + { + "epoch": 0.0084, + "learning_rate": 2.531296540076638e-06, + "loss": 0.2533, + "step": 42 + }, + { + "epoch": 0.0088, + "learning_rate": 2.5405885457581814e-06, + "loss": 0.3732, + "step": 44 + }, + { + "epoch": 0.0092, + "learning_rate": 2.5498950939440413e-06, + "loss": 0.5192, + "step": 46 + }, + { + "epoch": 0.0096, + "learning_rate": 2.5592161664906243e-06, + "loss": 0.4124, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 2.5685517452260587e-06, + "loss": 0.3824, + "step": 50 + }, + { + "epoch": 0.0104, + "learning_rate": 2.5779018119501086e-06, + "loss": 0.7555, + "step": 52 + }, + { + "epoch": 0.0108, + "learning_rate": 2.5872663484343887e-06, + "loss": 0.8544, + "step": 54 + }, + { + "epoch": 0.0112, + "learning_rate": 2.596645336422219e-06, + "loss": 0.5024, + "step": 56 + }, + { + "epoch": 0.0116, + "learning_rate": 2.606038757628795e-06, + "loss": 0.2219, + "step": 58 + }, + { + "epoch": 0.012, + "learning_rate": 2.615446593741161e-06, + "loss": 0.4692, + "step": 60 + }, + { + "epoch": 0.0124, + "learning_rate": 2.6248688264182588e-06, + "loss": 0.318, + "step": 62 + }, + { + "epoch": 0.0128, + "learning_rate": 2.6343054372909648e-06, + "loss": 0.3205, + "step": 64 + }, + { + "epoch": 0.0132, + "learning_rate": 2.6437564079621235e-06, + "loss": 0.4307, + "step": 66 + }, + { + "epoch": 0.0136, + "learning_rate": 2.6532217200065826e-06, + "loss": 0.5321, + "step": 68 + }, + { + "epoch": 0.014, + "learning_rate": 2.662701354971232e-06, + "loss": 0.753, + "step": 70 + }, + { + "epoch": 0.0144, + "learning_rate": 2.6721952943750396e-06, + "loss": 0.171, + "step": 72 + }, + { + "epoch": 0.0148, + "learning_rate": 2.6817035197090825e-06, + "loss": 0.3403, + "step": 74 + }, + { + "epoch": 0.0152, + "learning_rate": 2.691226012436604e-06, + "loss": 0.4093, + "step": 76 + }, + { + "epoch": 0.0156, + "learning_rate": 2.7007627539929783e-06, + "loss": 0.1997, + "step": 78 + }, + { + "epoch": 0.016, + "learning_rate": 2.7103137257858893e-06, + "loss": 0.2962, + "step": 80 + }, + { + "epoch": 0.0164, + "learning_rate": 2.7198789091951806e-06, + "loss": 0.3657, + "step": 82 + }, + { + "epoch": 0.0168, + "learning_rate": 2.7294582855730733e-06, + "loss": 0.2657, + "step": 84 + }, + { + "epoch": 0.0172, + "learning_rate": 2.7390518362440843e-06, + "loss": 0.6661, + "step": 86 + }, + { + "epoch": 0.0176, + "learning_rate": 2.7486595425050566e-06, + "loss": 0.2768, + "step": 88 + }, + { + "epoch": 0.018, + "learning_rate": 2.7582813856253264e-06, + "loss": 0.2893, + "step": 90 + }, + { + "epoch": 0.0184, + "learning_rate": 2.7679173468465813e-06, + "loss": 0.6233, + "step": 92 + }, + { + "epoch": 0.0188, + "learning_rate": 2.777567407383033e-06, + "loss": 0.4732, + "step": 94 + }, + { + "epoch": 0.0192, + "learning_rate": 2.7872315484213954e-06, + "loss": 0.176, + "step": 96 + }, + { + "epoch": 0.0196, + "learning_rate": 2.796909751120931e-06, + "loss": 0.2902, + "step": 98 + }, + { + "epoch": 0.02, + "learning_rate": 2.8066019966134873e-06, + "loss": 0.2778, + "step": 100 + }, + { + "epoch": 0.0204, + "learning_rate": 2.816308266003538e-06, + "loss": 0.1338, + "step": 102 + }, + { + "epoch": 0.0208, + "learning_rate": 2.826028540368212e-06, + "loss": 0.2697, + "step": 104 + }, + { + "epoch": 0.0212, + "learning_rate": 2.835762800757338e-06, + "loss": 0.4748, + "step": 106 + }, + { + "epoch": 0.0216, + "learning_rate": 2.845511028193477e-06, + "loss": 0.2939, + "step": 108 + }, + { + "epoch": 0.022, + "learning_rate": 2.855273203671962e-06, + "loss": 1.2031, + "step": 110 + }, + { + "epoch": 0.0224, + "learning_rate": 2.865049308160931e-06, + "loss": 0.3395, + "step": 112 + }, + { + "epoch": 0.0228, + "learning_rate": 2.874839322601368e-06, + "loss": 0.4199, + "step": 114 + }, + { + "epoch": 0.0232, + "learning_rate": 2.8846432279071533e-06, + "loss": 0.3056, + "step": 116 + }, + { + "epoch": 0.0236, + "learning_rate": 2.8944610049650314e-06, + "loss": 0.2797, + "step": 118 + }, + { + "epoch": 0.024, + "learning_rate": 2.9042926346347835e-06, + "loss": 0.4004, + "step": 120 + }, + { + "epoch": 0.0244, + "learning_rate": 2.914138097749143e-06, + "loss": 0.307, + "step": 122 + }, + { + "epoch": 0.0248, + "learning_rate": 2.9239973751138397e-06, + "loss": 0.9108, + "step": 124 + }, + { + "epoch": 0.0252, + "learning_rate": 2.933870447507756e-06, + "loss": 0.746, + "step": 126 + }, + { + "epoch": 0.0256, + "learning_rate": 2.943757295682783e-06, + "loss": 0.324, + "step": 128 + }, + { + "epoch": 0.026, + "learning_rate": 2.953657900364055e-06, + "loss": 0.6327, + "step": 130 + }, + { + "epoch": 0.0264, + "learning_rate": 2.9635722422497983e-06, + "loss": 0.6395, + "step": 132 + }, + { + "epoch": 0.0268, + "learning_rate": 2.973500302011496e-06, + "loss": 0.379, + "step": 134 + }, + { + "epoch": 0.0272, + "learning_rate": 2.983442060293926e-06, + "loss": 0.4462, + "step": 136 + }, + { + "epoch": 0.0276, + "learning_rate": 2.9933974977150827e-06, + "loss": 0.3213, + "step": 138 + }, + { + "epoch": 0.028, + "learning_rate": 3.003366594866345e-06, + "loss": 0.8143, + "step": 140 + }, + { + "epoch": 0.0284, + "learning_rate": 3.0133493323124474e-06, + "loss": 0.3852, + "step": 142 + }, + { + "epoch": 0.0288, + "learning_rate": 3.0233456905915338e-06, + "loss": 0.465, + "step": 144 + }, + { + "epoch": 0.0292, + "learning_rate": 3.0333556502151895e-06, + "loss": 0.3986, + "step": 146 + }, + { + "epoch": 0.0296, + "learning_rate": 3.0433791916684885e-06, + "loss": 0.2418, + "step": 148 + }, + { + "epoch": 0.03, + "learning_rate": 3.0534162954100234e-06, + "loss": 0.3229, + "step": 150 + }, + { + "epoch": 0.0304, + "learning_rate": 3.0634669418719453e-06, + "loss": 0.2306, + "step": 152 + }, + { + "epoch": 0.0308, + "learning_rate": 3.0735311114600064e-06, + "loss": 0.533, + "step": 154 + }, + { + "epoch": 0.0312, + "learning_rate": 3.0836087845535933e-06, + "loss": 0.4359, + "step": 156 + }, + { + "epoch": 0.0316, + "learning_rate": 3.0936999415057645e-06, + "loss": 0.4518, + "step": 158 + }, + { + "epoch": 0.032, + "learning_rate": 3.1038045626432945e-06, + "loss": 0.5678, + "step": 160 + }, + { + "epoch": 0.0324, + "learning_rate": 3.1139226282667212e-06, + "loss": 0.3485, + "step": 162 + }, + { + "epoch": 0.0328, + "learning_rate": 3.1240541186503173e-06, + "loss": 0.401, + "step": 164 + }, + { + "epoch": 0.0332, + "learning_rate": 3.134199014042277e-06, + "loss": 0.4544, + "step": 166 + }, + { + "epoch": 0.0336, + "learning_rate": 3.1443572946645683e-06, + "loss": 0.296, + "step": 168 + }, + { + "epoch": 0.034, + "learning_rate": 3.154528940713103e-06, + "loss": 0.3007, + "step": 170 + }, + { + "epoch": 0.0344, + "learning_rate": 3.164713932357776e-06, + "loss": 0.5337, + "step": 172 + }, + { + "epoch": 0.0348, + "learning_rate": 3.1749122497423724e-06, + "loss": 0.3173, + "step": 174 + }, + { + "epoch": 0.0352, + "learning_rate": 3.1851238729848033e-06, + "loss": 0.3671, + "step": 176 + }, + { + "epoch": 0.0356, + "learning_rate": 3.195348782176948e-06, + "loss": 0.3713, + "step": 178 + }, + { + "epoch": 0.036, + "learning_rate": 3.205586957384834e-06, + "loss": 0.2597, + "step": 180 + }, + { + "epoch": 0.0364, + "learning_rate": 3.215838378648617e-06, + "loss": 0.26, + "step": 182 + }, + { + "epoch": 0.0368, + "learning_rate": 3.2261030259826253e-06, + "loss": 0.3936, + "step": 184 + }, + { + "epoch": 0.0372, + "learning_rate": 3.2363808793754036e-06, + "loss": 0.4474, + "step": 186 + }, + { + "epoch": 0.0376, + "learning_rate": 3.246671918789752e-06, + "loss": 0.3485, + "step": 188 + }, + { + "epoch": 0.038, + "learning_rate": 3.2569761241627617e-06, + "loss": 0.2185, + "step": 190 + }, + { + "epoch": 0.0384, + "learning_rate": 3.267293475405858e-06, + "loss": 0.1629, + "step": 192 + }, + { + "epoch": 0.0388, + "learning_rate": 3.277623952404835e-06, + "loss": 0.3857, + "step": 194 + }, + { + "epoch": 0.0392, + "learning_rate": 3.2879675350199004e-06, + "loss": 0.189, + "step": 196 + }, + { + "epoch": 0.0396, + "learning_rate": 3.298324203085723e-06, + "loss": 0.5942, + "step": 198 + }, + { + "epoch": 0.04, + "learning_rate": 3.3086939364114113e-06, + "loss": 0.7996, + "step": 200 + }, + { + "epoch": 0.0404, + "learning_rate": 3.3190767147806892e-06, + "loss": 0.4361, + "step": 202 + }, + { + "epoch": 0.0408, + "learning_rate": 3.329472517951747e-06, + "loss": 0.2819, + "step": 204 + }, + { + "epoch": 0.0412, + "learning_rate": 3.3398813256574745e-06, + "loss": 0.4769, + "step": 206 + }, + { + "epoch": 0.0416, + "learning_rate": 3.350303117605369e-06, + "loss": 0.2608, + "step": 208 + }, + { + "epoch": 0.042, + "learning_rate": 3.360737873477574e-06, + "loss": 0.4172, + "step": 210 + }, + { + "epoch": 0.0424, + "learning_rate": 3.3711855729310503e-06, + "loss": 0.5104, + "step": 212 + }, + { + "epoch": 0.0428, + "learning_rate": 3.3816461955974224e-06, + "loss": 0.5674, + "step": 214 + }, + { + "epoch": 0.0432, + "learning_rate": 3.3921197210832235e-06, + "loss": 0.3536, + "step": 216 + }, + { + "epoch": 0.0436, + "learning_rate": 3.4026061289697397e-06, + "loss": 0.4486, + "step": 218 + }, + { + "epoch": 0.044, + "learning_rate": 3.4131053988131947e-06, + "loss": 0.4232, + "step": 220 + }, + { + "epoch": 0.0444, + "learning_rate": 3.4236175101447257e-06, + "loss": 0.6894, + "step": 222 + }, + { + "epoch": 0.0448, + "learning_rate": 3.434142442470434e-06, + "loss": 0.2449, + "step": 224 + }, + { + "epoch": 0.0452, + "learning_rate": 3.444680175271424e-06, + "loss": 0.2809, + "step": 226 + }, + { + "epoch": 0.0456, + "learning_rate": 3.455230688003849e-06, + "loss": 0.1866, + "step": 228 + }, + { + "epoch": 0.046, + "learning_rate": 3.465793960098942e-06, + "loss": 0.3415, + "step": 230 + }, + { + "epoch": 0.0464, + "learning_rate": 3.476369970963065e-06, + "loss": 0.4282, + "step": 232 + }, + { + "epoch": 0.0468, + "learning_rate": 3.486958699977743e-06, + "loss": 0.6342, + "step": 234 + }, + { + "epoch": 0.0472, + "learning_rate": 3.497560126499706e-06, + "loss": 0.7236, + "step": 236 + }, + { + "epoch": 0.0476, + "learning_rate": 3.508174229860947e-06, + "loss": 0.3302, + "step": 238 + }, + { + "epoch": 0.048, + "learning_rate": 3.5188009893686836e-06, + "loss": 0.1749, + "step": 240 + }, + { + "epoch": 0.0484, + "learning_rate": 3.5294403843055493e-06, + "loss": 0.202, + "step": 242 + }, + { + "epoch": 0.0488, + "learning_rate": 3.5400923939294827e-06, + "loss": 0.8852, + "step": 244 + }, + { + "epoch": 0.0492, + "learning_rate": 3.5507569974738477e-06, + "loss": 0.5212, + "step": 246 + }, + { + "epoch": 0.0496, + "learning_rate": 3.5614341741474667e-06, + "loss": 0.3883, + "step": 248 + }, + { + "epoch": 0.05, + "learning_rate": 3.5721239031345966e-06, + "loss": 0.6042, + "step": 250 + }, + { + "epoch": 0.0504, + "learning_rate": 3.5828261635951177e-06, + "loss": 0.4591, + "step": 252 + }, + { + "epoch": 0.0508, + "learning_rate": 3.593540934664387e-06, + "loss": 0.2633, + "step": 254 + }, + { + "epoch": 0.0512, + "learning_rate": 3.604268195453421e-06, + "loss": 0.7281, + "step": 256 + }, + { + "epoch": 0.0516, + "learning_rate": 3.6150079250488767e-06, + "loss": 0.4613, + "step": 258 + }, + { + "epoch": 0.052, + "learning_rate": 3.6257601025130893e-06, + "loss": 0.2578, + "step": 260 + }, + { + "epoch": 0.0524, + "learning_rate": 3.636524706884178e-06, + "loss": 0.3156, + "step": 262 + }, + { + "epoch": 0.0528, + "learning_rate": 3.647301717175955e-06, + "loss": 0.281, + "step": 264 + }, + { + "epoch": 0.0532, + "learning_rate": 3.6580911123781025e-06, + "loss": 0.4728, + "step": 266 + }, + { + "epoch": 0.0536, + "learning_rate": 3.66889287145614e-06, + "loss": 0.3013, + "step": 268 + }, + { + "epoch": 0.054, + "learning_rate": 3.679706973351488e-06, + "loss": 0.3117, + "step": 270 + }, + { + "epoch": 0.0544, + "learning_rate": 3.6905333969814995e-06, + "loss": 0.7485, + "step": 272 + }, + { + "epoch": 0.0548, + "learning_rate": 3.701372121239508e-06, + "loss": 0.224, + "step": 274 + }, + { + "epoch": 0.0552, + "learning_rate": 3.712223124994867e-06, + "loss": 0.2606, + "step": 276 + }, + { + "epoch": 0.0556, + "learning_rate": 3.723086387092989e-06, + "loss": 0.148, + "step": 278 + }, + { + "epoch": 0.056, + "learning_rate": 3.7339618863553885e-06, + "loss": 0.2976, + "step": 280 + }, + { + "epoch": 0.0564, + "learning_rate": 3.744849601579722e-06, + "loss": 0.249, + "step": 282 + }, + { + "epoch": 0.0568, + "learning_rate": 3.755749511539848e-06, + "loss": 0.4693, + "step": 284 + }, + { + "epoch": 0.0572, + "learning_rate": 3.7666615949857897e-06, + "loss": 0.6652, + "step": 286 + }, + { + "epoch": 0.0576, + "learning_rate": 3.7775858306439404e-06, + "loss": 0.3661, + "step": 288 + }, + { + "epoch": 0.058, + "learning_rate": 3.7885221972168864e-06, + "loss": 0.5059, + "step": 290 + }, + { + "epoch": 0.0584, + "learning_rate": 3.799470673383677e-06, + "loss": 0.22, + "step": 292 + }, + { + "epoch": 0.0588, + "learning_rate": 3.810431237799657e-06, + "loss": 0.3767, + "step": 294 + }, + { + "epoch": 0.0592, + "learning_rate": 3.821403869096644e-06, + "loss": 0.299, + "step": 296 + }, + { + "epoch": 0.0596, + "learning_rate": 3.8323885458829745e-06, + "loss": 0.0906, + "step": 298 + }, + { + "epoch": 0.06, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.2923, + "step": 300 + }, + { + "epoch": 0.0604, + "learning_rate": 3.854393950239355e-06, + "loss": 0.4484, + "step": 302 + }, + { + "epoch": 0.0608, + "learning_rate": 3.865414634908756e-06, + "loss": 0.596, + "step": 304 + }, + { + "epoch": 0.0612, + "learning_rate": 3.876447279266233e-06, + "loss": 0.3995, + "step": 306 + }, + { + "epoch": 0.0616, + "learning_rate": 3.887491861803081e-06, + "loss": 0.1691, + "step": 308 + }, + { + "epoch": 0.062, + "learning_rate": 3.898548360987321e-06, + "loss": 0.1787, + "step": 310 + }, + { + "epoch": 0.0624, + "learning_rate": 3.909616755263741e-06, + "loss": 0.3447, + "step": 312 + }, + { + "epoch": 0.0628, + "learning_rate": 3.920697023053941e-06, + "loss": 0.1873, + "step": 314 + }, + { + "epoch": 0.0632, + "learning_rate": 3.9317891427563725e-06, + "loss": 0.4102, + "step": 316 + }, + { + "epoch": 0.0636, + "learning_rate": 3.942893092746381e-06, + "loss": 0.8123, + "step": 318 + }, + { + "epoch": 0.064, + "learning_rate": 3.954008851376244e-06, + "loss": 0.3797, + "step": 320 + }, + { + "epoch": 0.0644, + "learning_rate": 3.965136396975227e-06, + "loss": 0.3143, + "step": 322 + }, + { + "epoch": 0.0648, + "learning_rate": 3.976275707849619e-06, + "loss": 0.2045, + "step": 324 + }, + { + "epoch": 0.0652, + "learning_rate": 3.987426762282726e-06, + "loss": 0.3156, + "step": 326 + }, + { + "epoch": 0.0656, + "learning_rate": 3.99858953853505e-06, + "loss": 0.2336, + "step": 328 + }, + { + "epoch": 0.066, + "learning_rate": 4.009764014844146e-06, + "loss": 0.2395, + "step": 330 + }, + { + "epoch": 0.0664, + "learning_rate": 4.0209501694248e-06, + "loss": 0.1843, + "step": 332 + }, + { + "epoch": 0.0668, + "learning_rate": 4.032147980469076e-06, + "loss": 0.4948, + "step": 334 + }, + { + "epoch": 0.0672, + "learning_rate": 4.043357426146209e-06, + "loss": 0.5784, + "step": 336 + }, + { + "epoch": 0.0676, + "learning_rate": 4.054578484602869e-06, + "loss": 0.2328, + "step": 338 + }, + { + "epoch": 0.068, + "learning_rate": 4.065811133962987e-06, + "loss": 0.2095, + "step": 340 + }, + { + "epoch": 0.0684, + "learning_rate": 4.07705535232795e-06, + "loss": 0.2187, + "step": 342 + }, + { + "epoch": 0.0688, + "learning_rate": 4.08831111777658e-06, + "loss": 0.3561, + "step": 344 + }, + { + "epoch": 0.0692, + "learning_rate": 4.0995784083651865e-06, + "loss": 0.2657, + "step": 346 + }, + { + "epoch": 0.0696, + "learning_rate": 4.110857202127611e-06, + "loss": 0.2466, + "step": 348 + }, + { + "epoch": 0.07, + "learning_rate": 4.122147477075266e-06, + "loss": 0.5165, + "step": 350 + }, + { + "epoch": 0.0704, + "learning_rate": 4.133449211197183e-06, + "loss": 0.137, + "step": 352 + }, + { + "epoch": 0.0708, + "learning_rate": 4.144762382460055e-06, + "loss": 0.1544, + "step": 354 + }, + { + "epoch": 0.0712, + "learning_rate": 4.156086968808274e-06, + "loss": 0.3579, + "step": 356 + }, + { + "epoch": 0.0716, + "learning_rate": 4.1674229481639796e-06, + "loss": 0.7326, + "step": 358 + }, + { + "epoch": 0.072, + "learning_rate": 4.178770298427114e-06, + "loss": 0.1682, + "step": 360 + }, + { + "epoch": 0.0724, + "learning_rate": 4.190128997475395e-06, + "loss": 0.191, + "step": 362 + }, + { + "epoch": 0.0728, + "learning_rate": 4.201499023164515e-06, + "loss": 0.3578, + "step": 364 + }, + { + "epoch": 0.0732, + "learning_rate": 4.212880353327968e-06, + "loss": 0.3231, + "step": 366 + }, + { + "epoch": 0.0736, + "learning_rate": 4.224272965777315e-06, + "loss": 0.591, + "step": 368 + }, + { + "epoch": 0.074, + "learning_rate": 4.235676838302072e-06, + "loss": 0.1678, + "step": 370 + }, + { + "epoch": 0.0744, + "learning_rate": 4.247091948669764e-06, + "loss": 0.2233, + "step": 372 + }, + { + "epoch": 0.0748, + "learning_rate": 4.258518274626106e-06, + "loss": 0.3319, + "step": 374 + }, + { + "epoch": 0.0752, + "learning_rate": 4.269955793894849e-06, + "loss": 0.6087, + "step": 376 + }, + { + "epoch": 0.0756, + "learning_rate": 4.281404484177978e-06, + "loss": 1.2429, + "step": 378 + }, + { + "epoch": 0.076, + "learning_rate": 4.292864323155684e-06, + "loss": 0.6571, + "step": 380 + }, + { + "epoch": 0.0764, + "learning_rate": 4.304335288486412e-06, + "loss": 0.7448, + "step": 382 + }, + { + "epoch": 0.0768, + "learning_rate": 4.3158173578069696e-06, + "loss": 0.6633, + "step": 384 + }, + { + "epoch": 0.0772, + "learning_rate": 4.327310508732434e-06, + "loss": 0.6562, + "step": 386 + }, + { + "epoch": 0.0776, + "learning_rate": 4.338814718856333e-06, + "loss": 0.333, + "step": 388 + }, + { + "epoch": 0.078, + "learning_rate": 4.350329965750618e-06, + "loss": 0.1539, + "step": 390 + }, + { + "epoch": 0.0784, + "learning_rate": 4.3618562269657285e-06, + "loss": 0.3141, + "step": 392 + }, + { + "epoch": 0.0788, + "learning_rate": 4.373393480030629e-06, + "loss": 0.2795, + "step": 394 + }, + { + "epoch": 0.0792, + "learning_rate": 4.384941702452852e-06, + "loss": 0.4677, + "step": 396 + }, + { + "epoch": 0.0796, + "learning_rate": 4.396500871718548e-06, + "loss": 0.7137, + "step": 398 + }, + { + "epoch": 0.08, + "learning_rate": 4.408070965292526e-06, + "loss": 0.2847, + "step": 400 + }, + { + "epoch": 0.0804, + "learning_rate": 4.419651960618294e-06, + "loss": 0.5573, + "step": 402 + }, + { + "epoch": 0.0808, + "learning_rate": 4.431243835118112e-06, + "loss": 0.5143, + "step": 404 + }, + { + "epoch": 0.0812, + "learning_rate": 4.442846566193041e-06, + "loss": 0.2657, + "step": 406 + }, + { + "epoch": 0.0816, + "learning_rate": 4.4544601312229185e-06, + "loss": 0.1897, + "step": 408 + }, + { + "epoch": 0.082, + "learning_rate": 4.4660845075665635e-06, + "loss": 0.3726, + "step": 410 + }, + { + "epoch": 0.0824, + "learning_rate": 4.477719672561602e-06, + "loss": 0.3428, + "step": 412 + }, + { + "epoch": 0.0828, + "learning_rate": 4.489365603524743e-06, + "loss": 0.977, + "step": 414 + }, + { + "epoch": 0.0832, + "learning_rate": 4.501022277751605e-06, + "loss": 0.1626, + "step": 416 + }, + { + "epoch": 0.0836, + "learning_rate": 4.5126896725169025e-06, + "loss": 0.2663, + "step": 418 + }, + { + "epoch": 0.084, + "learning_rate": 4.524367765074499e-06, + "loss": 0.4058, + "step": 420 + }, + { + "epoch": 0.0844, + "learning_rate": 4.536056532657295e-06, + "loss": 0.2284, + "step": 422 + }, + { + "epoch": 0.0848, + "learning_rate": 4.5477559524775e-06, + "loss": 0.8846, + "step": 424 + }, + { + "epoch": 0.0852, + "learning_rate": 4.559466001726451e-06, + "loss": 0.3191, + "step": 426 + }, + { + "epoch": 0.0856, + "learning_rate": 4.571186657574823e-06, + "loss": 0.4475, + "step": 428 + }, + { + "epoch": 0.086, + "learning_rate": 4.582917897172599e-06, + "loss": 0.3278, + "step": 430 + }, + { + "epoch": 0.0864, + "learning_rate": 4.5946596976491254e-06, + "loss": 0.2613, + "step": 432 + }, + { + "epoch": 0.0868, + "learning_rate": 4.6064120361131624e-06, + "loss": 0.2975, + "step": 434 + }, + { + "epoch": 0.0872, + "learning_rate": 4.618174889652924e-06, + "loss": 0.4246, + "step": 436 + }, + { + "epoch": 0.0876, + "learning_rate": 4.629948235336126e-06, + "loss": 0.3675, + "step": 438 + }, + { + "epoch": 0.088, + "learning_rate": 4.6417320502100286e-06, + "loss": 0.7781, + "step": 440 + }, + { + "epoch": 0.0884, + "learning_rate": 4.653526311301479e-06, + "loss": 0.596, + "step": 442 + }, + { + "epoch": 0.0888, + "learning_rate": 4.665330995616967e-06, + "loss": 0.3056, + "step": 444 + }, + { + "epoch": 0.0892, + "learning_rate": 4.677146080142667e-06, + "loss": 0.1785, + "step": 446 + }, + { + "epoch": 0.0896, + "learning_rate": 4.688971541844424e-06, + "loss": 0.3018, + "step": 448 + }, + { + "epoch": 0.09, + "learning_rate": 4.700807357667956e-06, + "loss": 0.2767, + "step": 450 + }, + { + "epoch": 0.0904, + "learning_rate": 4.712653504538672e-06, + "loss": 0.3198, + "step": 452 + }, + { + "epoch": 0.0908, + "learning_rate": 4.7245099593619495e-06, + "loss": 0.5012, + "step": 454 + }, + { + "epoch": 0.0912, + "learning_rate": 4.736376699023023e-06, + "loss": 0.5935, + "step": 456 + }, + { + "epoch": 0.0916, + "learning_rate": 4.74825370038703e-06, + "loss": 0.3158, + "step": 458 + }, + { + "epoch": 0.092, + "learning_rate": 4.76014094029921e-06, + "loss": 0.4864, + "step": 460 + }, + { + "epoch": 0.0924, + "learning_rate": 4.772038395584735e-06, + "loss": 0.4734, + "step": 462 + }, + { + "epoch": 0.0928, + "learning_rate": 4.7839460430489216e-06, + "loss": 0.7335, + "step": 464 + }, + { + "epoch": 0.0932, + "learning_rate": 4.7958638594772035e-06, + "loss": 0.5078, + "step": 466 + }, + { + "epoch": 0.0936, + "learning_rate": 4.807791821635185e-06, + "loss": 0.2652, + "step": 468 + }, + { + "epoch": 0.094, + "learning_rate": 4.8197299062686954e-06, + "loss": 0.3124, + "step": 470 + }, + { + "epoch": 0.0944, + "learning_rate": 4.831678090103828e-06, + "loss": 0.4413, + "step": 472 + }, + { + "epoch": 0.0948, + "learning_rate": 4.8436363498469865e-06, + "loss": 0.4501, + "step": 474 + }, + { + "epoch": 0.0952, + "learning_rate": 4.855604662184931e-06, + "loss": 0.6331, + "step": 476 + }, + { + "epoch": 0.0956, + "learning_rate": 4.867583003784825e-06, + "loss": 0.5424, + "step": 478 + }, + { + "epoch": 0.096, + "learning_rate": 4.8795713512942785e-06, + "loss": 0.3024, + "step": 480 + }, + { + "epoch": 0.0964, + "learning_rate": 4.891569681341395e-06, + "loss": 0.7018, + "step": 482 + }, + { + "epoch": 0.0968, + "learning_rate": 4.903577970534815e-06, + "loss": 0.2167, + "step": 484 + }, + { + "epoch": 0.0972, + "learning_rate": 4.91559619546378e-06, + "loss": 0.3205, + "step": 486 + }, + { + "epoch": 0.0976, + "learning_rate": 4.9276243326981e-06, + "loss": 0.7435, + "step": 488 + }, + { + "epoch": 0.098, + "learning_rate": 4.939662358788352e-06, + "loss": 0.3005, + "step": 490 + }, + { + "epoch": 0.0984, + "learning_rate": 4.951710250265788e-06, + "loss": 0.5277, + "step": 492 + }, + { + "epoch": 0.0988, + "learning_rate": 4.96376798364238e-06, + "loss": 0.7733, + "step": 494 + }, + { + "epoch": 0.0992, + "learning_rate": 4.975835535411023e-06, + "loss": 0.2802, + "step": 496 + }, + { + "epoch": 0.0996, + "learning_rate": 4.987912882045345e-06, + "loss": 0.2836, + "step": 498 + }, + { + "epoch": 0.1, + "learning_rate": 5.000000000000003e-06, + "loss": 0.3404, + "step": 500 + }, + { + "epoch": 0.1004, + "learning_rate": 5.012096865710493e-06, + "loss": 0.3195, + "step": 502 + }, + { + "epoch": 0.1008, + "learning_rate": 5.024203455593375e-06, + "loss": 0.2983, + "step": 504 + }, + { + "epoch": 0.1012, + "learning_rate": 5.036319746046232e-06, + "loss": 0.2711, + "step": 506 + }, + { + "epoch": 0.1016, + "learning_rate": 5.048445713447734e-06, + "loss": 0.305, + "step": 508 + }, + { + "epoch": 0.102, + "learning_rate": 5.0605813341576885e-06, + "loss": 0.4259, + "step": 510 + }, + { + "epoch": 0.1024, + "learning_rate": 5.072726584517083e-06, + "loss": 0.2535, + "step": 512 + }, + { + "epoch": 0.1028, + "learning_rate": 5.084881440848126e-06, + "loss": 0.2392, + "step": 514 + }, + { + "epoch": 0.1032, + "learning_rate": 5.097045879454308e-06, + "loss": 0.4135, + "step": 516 + }, + { + "epoch": 0.1036, + "learning_rate": 5.109219876620433e-06, + "loss": 0.4013, + "step": 518 + }, + { + "epoch": 0.104, + "learning_rate": 5.1214034086126685e-06, + "loss": 0.5724, + "step": 520 + }, + { + "epoch": 0.1044, + "learning_rate": 5.133596451678611e-06, + "loss": 0.6417, + "step": 522 + }, + { + "epoch": 0.1048, + "learning_rate": 5.145798982047253e-06, + "loss": 0.3943, + "step": 524 + }, + { + "epoch": 0.1052, + "learning_rate": 5.158010975929185e-06, + "loss": 0.3729, + "step": 526 + }, + { + "epoch": 0.1056, + "learning_rate": 5.170232409516483e-06, + "loss": 0.3718, + "step": 528 + }, + { + "epoch": 0.106, + "learning_rate": 5.182463258982837e-06, + "loss": 0.7375, + "step": 530 + }, + { + "epoch": 0.1064, + "learning_rate": 5.194703500483597e-06, + "loss": 0.1348, + "step": 532 + }, + { + "epoch": 0.1068, + "learning_rate": 5.2069531101557395e-06, + "loss": 0.5319, + "step": 534 + }, + { + "epoch": 0.1072, + "learning_rate": 5.219212064118082e-06, + "loss": 0.5472, + "step": 536 + }, + { + "epoch": 0.1076, + "learning_rate": 5.231480338471124e-06, + "loss": 0.6484, + "step": 538 + }, + { + "epoch": 0.108, + "learning_rate": 5.24375790929725e-06, + "loss": 0.3573, + "step": 540 + }, + { + "epoch": 0.1084, + "learning_rate": 5.256044752660709e-06, + "loss": 0.3922, + "step": 542 + }, + { + "epoch": 0.1088, + "learning_rate": 5.268340844607653e-06, + "loss": 0.4658, + "step": 544 + }, + { + "epoch": 0.1092, + "learning_rate": 5.2806461611662725e-06, + "loss": 0.8321, + "step": 546 + }, + { + "epoch": 0.1096, + "learning_rate": 5.2929606783466735e-06, + "loss": 0.2861, + "step": 548 + }, + { + "epoch": 0.11, + "learning_rate": 5.305284372141091e-06, + "loss": 0.4093, + "step": 550 + }, + { + "epoch": 0.1104, + "learning_rate": 5.317617218523853e-06, + "loss": 0.4708, + "step": 552 + }, + { + "epoch": 0.1108, + "learning_rate": 5.3299591934514435e-06, + "loss": 0.4865, + "step": 554 + }, + { + "epoch": 0.1112, + "learning_rate": 5.342310272862553e-06, + "loss": 0.228, + "step": 556 + }, + { + "epoch": 0.1116, + "learning_rate": 5.354670432678119e-06, + "loss": 0.5656, + "step": 558 + }, + { + "epoch": 0.112, + "learning_rate": 5.367039648801377e-06, + "loss": 0.2397, + "step": 560 + }, + { + "epoch": 0.1124, + "learning_rate": 5.379417897117909e-06, + "loss": 0.5368, + "step": 562 + }, + { + "epoch": 0.1128, + "learning_rate": 5.391805153495684e-06, + "loss": 0.5957, + "step": 564 + }, + { + "epoch": 0.1132, + "learning_rate": 5.404201393785113e-06, + "loss": 0.6151, + "step": 566 + }, + { + "epoch": 0.1136, + "learning_rate": 5.416606593819109e-06, + "loss": 0.2591, + "step": 568 + }, + { + "epoch": 0.114, + "learning_rate": 5.429020729413049e-06, + "loss": 0.2588, + "step": 570 + }, + { + "epoch": 0.1144, + "learning_rate": 5.441443776365005e-06, + "loss": 0.5438, + "step": 572 + }, + { + "epoch": 0.1148, + "learning_rate": 5.453875710455549e-06, + "loss": 0.457, + "step": 574 + }, + { + "epoch": 0.1152, + "learning_rate": 5.466316507448053e-06, + "loss": 0.2975, + "step": 576 + }, + { + "epoch": 0.1156, + "learning_rate": 5.478766143088497e-06, + "loss": 0.1815, + "step": 578 + }, + { + "epoch": 0.116, + "learning_rate": 5.49122459310568e-06, + "loss": 0.2841, + "step": 580 + }, + { + "epoch": 0.1164, + "learning_rate": 5.503691833211264e-06, + "loss": 0.2231, + "step": 582 + }, + { + "epoch": 0.1168, + "learning_rate": 5.516167839099662e-06, + "loss": 0.3638, + "step": 584 + }, + { + "epoch": 0.1172, + "learning_rate": 5.5286525864483285e-06, + "loss": 0.4434, + "step": 586 + }, + { + "epoch": 0.1176, + "learning_rate": 5.5411460509175605e-06, + "loss": 0.4929, + "step": 588 + }, + { + "epoch": 0.118, + "learning_rate": 5.553648208150724e-06, + "loss": 0.3534, + "step": 590 + }, + { + "epoch": 0.1184, + "learning_rate": 5.5661590337742255e-06, + "loss": 0.1481, + "step": 592 + }, + { + "epoch": 0.1188, + "learning_rate": 5.57867850339757e-06, + "loss": 0.527, + "step": 594 + }, + { + "epoch": 0.1192, + "learning_rate": 5.591206592613412e-06, + "loss": 0.6917, + "step": 596 + }, + { + "epoch": 0.1196, + "learning_rate": 5.603743276997597e-06, + "loss": 0.3704, + "step": 598 + }, + { + "epoch": 0.12, + "learning_rate": 5.616288532109221e-06, + "loss": 4.0349, + "step": 600 + }, + { + "epoch": 0.1204, + "learning_rate": 5.628842333490665e-06, + "loss": 0.3045, + "step": 602 + }, + { + "epoch": 0.1208, + "learning_rate": 5.641404656667652e-06, + "loss": 0.2241, + "step": 604 + }, + { + "epoch": 0.1212, + "learning_rate": 5.653975477149289e-06, + "loss": 0.1613, + "step": 606 + }, + { + "epoch": 0.1216, + "learning_rate": 5.666554770428136e-06, + "loss": 0.3998, + "step": 608 + }, + { + "epoch": 0.122, + "learning_rate": 5.679142511980168e-06, + "loss": 0.1539, + "step": 610 + }, + { + "epoch": 0.1224, + "learning_rate": 5.6917386772650015e-06, + "loss": 0.4148, + "step": 612 + }, + { + "epoch": 0.1228, + "learning_rate": 5.7043432417257076e-06, + "loss": 0.2819, + "step": 614 + }, + { + "epoch": 0.1232, + "learning_rate": 5.716956180789086e-06, + "loss": 0.2164, + "step": 616 + }, + { + "epoch": 0.1236, + "learning_rate": 5.729577469865569e-06, + "loss": 0.4973, + "step": 618 + }, + { + "epoch": 0.124, + "learning_rate": 5.74220708434926e-06, + "loss": 0.3566, + "step": 620 + }, + { + "epoch": 0.1244, + "learning_rate": 5.754844999618143e-06, + "loss": 0.4991, + "step": 622 + }, + { + "epoch": 0.1248, + "learning_rate": 5.767491191033909e-06, + "loss": 0.4631, + "step": 624 + }, + { + "epoch": 0.1252, + "learning_rate": 5.780145633942173e-06, + "loss": 0.1874, + "step": 626 + }, + { + "epoch": 0.1256, + "learning_rate": 5.7928083036724535e-06, + "loss": 0.733, + "step": 628 + }, + { + "epoch": 0.126, + "learning_rate": 5.8054791755382125e-06, + "loss": 0.1528, + "step": 630 + }, + { + "epoch": 0.1264, + "learning_rate": 5.818158224836983e-06, + "loss": 0.4017, + "step": 632 + }, + { + "epoch": 0.1268, + "learning_rate": 5.830845426850263e-06, + "loss": 0.233, + "step": 634 + }, + { + "epoch": 0.1272, + "learning_rate": 5.8435407568437194e-06, + "loss": 0.4766, + "step": 636 + }, + { + "epoch": 0.1276, + "learning_rate": 5.856244190067155e-06, + "loss": 0.303, + "step": 638 + }, + { + "epoch": 0.128, + "learning_rate": 5.868955701754577e-06, + "loss": 0.892, + "step": 640 + }, + { + "epoch": 0.1284, + "learning_rate": 5.881675267124245e-06, + "loss": 0.32, + "step": 642 + }, + { + "epoch": 0.1288, + "learning_rate": 5.894402861378714e-06, + "loss": 0.3009, + "step": 644 + }, + { + "epoch": 0.1292, + "learning_rate": 5.907138459704886e-06, + "loss": 0.3443, + "step": 646 + }, + { + "epoch": 0.1296, + "learning_rate": 5.919882037274065e-06, + "loss": 0.3509, + "step": 648 + }, + { + "epoch": 0.13, + "learning_rate": 5.932633569241989e-06, + "loss": 0.3366, + "step": 650 + }, + { + "epoch": 0.1304, + "learning_rate": 5.9453930307488985e-06, + "loss": 0.2714, + "step": 652 + }, + { + "epoch": 0.1308, + "learning_rate": 5.958160396919584e-06, + "loss": 0.3076, + "step": 654 + }, + { + "epoch": 0.1312, + "learning_rate": 5.970935642863362e-06, + "loss": 0.554, + "step": 656 + }, + { + "epoch": 0.1316, + "learning_rate": 5.983718743674305e-06, + "loss": 0.4384, + "step": 658 + }, + { + "epoch": 0.132, + "learning_rate": 5.996509674431038e-06, + "loss": 0.5649, + "step": 660 + }, + { + "epoch": 0.1324, + "learning_rate": 6.00930841019705e-06, + "loss": 0.2312, + "step": 662 + }, + { + "epoch": 0.1328, + "learning_rate": 6.022114926020505e-06, + "loss": 0.3836, + "step": 664 + }, + { + "epoch": 0.1332, + "learning_rate": 6.0349291969344426e-06, + "loss": 0.5206, + "step": 666 + }, + { + "epoch": 0.1336, + "learning_rate": 6.047751197956836e-06, + "loss": 0.4562, + "step": 668 + }, + { + "epoch": 0.134, + "learning_rate": 6.060580904090489e-06, + "loss": 0.4143, + "step": 670 + }, + { + "epoch": 0.1344, + "learning_rate": 6.0734182903232475e-06, + "loss": 0.5291, + "step": 672 + }, + { + "epoch": 0.1348, + "learning_rate": 6.086263331627974e-06, + "loss": 0.3471, + "step": 674 + }, + { + "epoch": 0.1352, + "learning_rate": 6.0991160029626e-06, + "loss": 0.3539, + "step": 676 + }, + { + "epoch": 0.1356, + "learning_rate": 6.111976279270187e-06, + "loss": 0.318, + "step": 678 + }, + { + "epoch": 0.136, + "learning_rate": 6.124844135478966e-06, + "loss": 0.7735, + "step": 680 + }, + { + "epoch": 0.1364, + "learning_rate": 6.137719546502394e-06, + "loss": 0.2544, + "step": 682 + }, + { + "epoch": 0.1368, + "learning_rate": 6.1506024872392e-06, + "loss": 0.2349, + "step": 684 + }, + { + "epoch": 0.1372, + "learning_rate": 6.163492932573429e-06, + "loss": 0.5242, + "step": 686 + }, + { + "epoch": 0.1376, + "learning_rate": 6.176390857374501e-06, + "loss": 0.6503, + "step": 688 + }, + { + "epoch": 0.138, + "learning_rate": 6.189296236497251e-06, + "loss": 0.4389, + "step": 690 + }, + { + "epoch": 0.1384, + "learning_rate": 6.202209044781979e-06, + "loss": 0.2549, + "step": 692 + }, + { + "epoch": 0.1388, + "learning_rate": 6.215129257054525e-06, + "loss": 0.295, + "step": 694 + }, + { + "epoch": 0.1392, + "learning_rate": 6.228056848126223e-06, + "loss": 0.4915, + "step": 696 + }, + { + "epoch": 0.1396, + "learning_rate": 6.240991792794137e-06, + "loss": 0.4563, + "step": 698 + }, + { + "epoch": 0.14, + "learning_rate": 6.253934065840883e-06, + "loss": 0.2687, + "step": 700 + }, + { + "epoch": 0.1404, + "learning_rate": 6.2668836420348374e-06, + "loss": 0.9878, + "step": 702 + }, + { + "epoch": 0.1408, + "learning_rate": 6.279840496130188e-06, + "loss": 0.1749, + "step": 704 + }, + { + "epoch": 0.1412, + "learning_rate": 6.2928046028668185e-06, + "loss": 0.2862, + "step": 706 + }, + { + "epoch": 0.1416, + "learning_rate": 6.305775936970606e-06, + "loss": 0.716, + "step": 708 + }, + { + "epoch": 0.142, + "learning_rate": 6.3187544731532205e-06, + "loss": 0.3125, + "step": 710 + }, + { + "epoch": 0.1424, + "learning_rate": 6.331740186112359e-06, + "loss": 0.4097, + "step": 712 + }, + { + "epoch": 0.1428, + "learning_rate": 6.344733050531709e-06, + "loss": 0.4544, + "step": 714 + }, + { + "epoch": 0.1432, + "learning_rate": 6.357733041081015e-06, + "loss": 0.3562, + "step": 716 + }, + { + "epoch": 0.1436, + "learning_rate": 6.370740132416133e-06, + "loss": 0.1895, + "step": 718 + }, + { + "epoch": 0.144, + "learning_rate": 6.383754299179072e-06, + "loss": 0.5115, + "step": 720 + }, + { + "epoch": 0.1444, + "learning_rate": 6.3967755159980485e-06, + "loss": 0.1732, + "step": 722 + }, + { + "epoch": 0.1448, + "learning_rate": 6.409803757487532e-06, + "loss": 0.4188, + "step": 724 + }, + { + "epoch": 0.1452, + "learning_rate": 6.422838998248301e-06, + "loss": 0.4768, + "step": 726 + }, + { + "epoch": 0.1456, + "learning_rate": 6.435881212867485e-06, + "loss": 0.3368, + "step": 728 + }, + { + "epoch": 0.146, + "learning_rate": 6.4489303759186385e-06, + "loss": 0.4055, + "step": 730 + }, + { + "epoch": 0.1464, + "learning_rate": 6.4619864619616975e-06, + "loss": 0.1908, + "step": 732 + }, + { + "epoch": 0.1468, + "learning_rate": 6.475049445543222e-06, + "loss": 0.5821, + "step": 734 + }, + { + "epoch": 0.1472, + "learning_rate": 6.48811930119619e-06, + "loss": 0.3148, + "step": 736 + }, + { + "epoch": 0.1476, + "learning_rate": 6.5011960034403e-06, + "loss": 0.3084, + "step": 738 + }, + { + "epoch": 0.148, + "learning_rate": 6.514279526781853e-06, + "loss": 0.5149, + "step": 740 + }, + { + "epoch": 0.1484, + "learning_rate": 6.5273698457137965e-06, + "loss": 0.5676, + "step": 742 + }, + { + "epoch": 0.1488, + "learning_rate": 6.540466934715955e-06, + "loss": 0.3943, + "step": 744 + }, + { + "epoch": 0.1492, + "learning_rate": 6.553570768254831e-06, + "loss": 0.3466, + "step": 746 + }, + { + "epoch": 0.1496, + "learning_rate": 6.566681320783848e-06, + "loss": 0.3531, + "step": 748 + }, + { + "epoch": 0.15, + "learning_rate": 6.579798566743313e-06, + "loss": 0.5142, + "step": 750 + }, + { + "epoch": 0.1504, + "learning_rate": 6.592922480560483e-06, + "loss": 0.1846, + "step": 752 + }, + { + "epoch": 0.1508, + "learning_rate": 6.606053036649618e-06, + "loss": 0.2183, + "step": 754 + }, + { + "epoch": 0.1512, + "learning_rate": 6.619190209412025e-06, + "loss": 0.3156, + "step": 756 + }, + { + "epoch": 0.1516, + "learning_rate": 6.632333973236113e-06, + "loss": 0.3086, + "step": 758 + }, + { + "epoch": 0.152, + "learning_rate": 6.6454843024974465e-06, + "loss": 0.409, + "step": 760 + }, + { + "epoch": 0.1524, + "learning_rate": 6.6586411715587805e-06, + "loss": 0.9792, + "step": 762 + }, + { + "epoch": 0.1528, + "learning_rate": 6.671804554770128e-06, + "loss": 0.4433, + "step": 764 + }, + { + "epoch": 0.1532, + "learning_rate": 6.6849744264688e-06, + "loss": 0.3605, + "step": 766 + }, + { + "epoch": 0.1536, + "learning_rate": 6.698150760979456e-06, + "loss": 0.3358, + "step": 768 + }, + { + "epoch": 0.154, + "learning_rate": 6.711333532614177e-06, + "loss": 0.2954, + "step": 770 + }, + { + "epoch": 0.1544, + "learning_rate": 6.724522715672421e-06, + "loss": 0.9552, + "step": 772 + }, + { + "epoch": 0.1548, + "learning_rate": 6.737718284441256e-06, + "loss": 0.2483, + "step": 774 + }, + { + "epoch": 0.1552, + "learning_rate": 6.750920213195242e-06, + "loss": 0.3083, + "step": 776 + }, + { + "epoch": 0.1556, + "learning_rate": 6.764128476196494e-06, + "loss": 0.3583, + "step": 778 + }, + { + "epoch": 0.156, + "learning_rate": 6.777343047694894e-06, + "loss": 0.5823, + "step": 780 + }, + { + "epoch": 0.1564, + "learning_rate": 6.7905639019278925e-06, + "loss": 0.3252, + "step": 782 + }, + { + "epoch": 0.1568, + "learning_rate": 6.803791013120824e-06, + "loss": 0.3863, + "step": 784 + }, + { + "epoch": 0.1572, + "learning_rate": 6.817024355486707e-06, + "loss": 0.2895, + "step": 786 + }, + { + "epoch": 0.1576, + "learning_rate": 6.8302639032264836e-06, + "loss": 0.2712, + "step": 788 + }, + { + "epoch": 0.158, + "learning_rate": 6.8435096305289765e-06, + "loss": 0.3518, + "step": 790 + }, + { + "epoch": 0.1584, + "learning_rate": 6.856761511570944e-06, + "loss": 0.4877, + "step": 792 + }, + { + "epoch": 0.1588, + "learning_rate": 6.870019520517217e-06, + "loss": 0.3351, + "step": 794 + }, + { + "epoch": 0.1592, + "learning_rate": 6.883283631520579e-06, + "loss": 0.2451, + "step": 796 + }, + { + "epoch": 0.1596, + "learning_rate": 6.896553818721985e-06, + "loss": 0.3386, + "step": 798 + }, + { + "epoch": 0.16, + "learning_rate": 6.909830056250522e-06, + "loss": 0.2594, + "step": 800 + }, + { + "epoch": 0.1604, + "learning_rate": 6.9231123182234895e-06, + "loss": 0.3837, + "step": 802 + }, + { + "epoch": 0.1608, + "learning_rate": 6.936400578746436e-06, + "loss": 0.4079, + "step": 804 + }, + { + "epoch": 0.1612, + "learning_rate": 6.949694811913237e-06, + "loss": 0.4006, + "step": 806 + }, + { + "epoch": 0.1616, + "learning_rate": 6.96299499180605e-06, + "loss": 0.3535, + "step": 808 + }, + { + "epoch": 0.162, + "learning_rate": 6.976301092495548e-06, + "loss": 0.7625, + "step": 810 + }, + { + "epoch": 0.1624, + "learning_rate": 6.989613088040787e-06, + "loss": 0.7168, + "step": 812 + }, + { + "epoch": 0.1628, + "learning_rate": 7.002930952489353e-06, + "loss": 0.2407, + "step": 814 + }, + { + "epoch": 0.1632, + "learning_rate": 7.016254659877404e-06, + "loss": 0.3096, + "step": 816 + }, + { + "epoch": 0.1636, + "learning_rate": 7.029584184229641e-06, + "loss": 0.5736, + "step": 818 + }, + { + "epoch": 0.164, + "learning_rate": 7.042919499559539e-06, + "loss": 0.385, + "step": 820 + }, + { + "epoch": 0.1644, + "learning_rate": 7.056260579869152e-06, + "loss": 0.3728, + "step": 822 + }, + { + "epoch": 0.1648, + "learning_rate": 7.06960739914943e-06, + "loss": 0.3751, + "step": 824 + }, + { + "epoch": 0.1652, + "learning_rate": 7.082959931380013e-06, + "loss": 0.399, + "step": 826 + }, + { + "epoch": 0.1656, + "learning_rate": 7.09631815052946e-06, + "loss": 0.3039, + "step": 828 + }, + { + "epoch": 0.166, + "learning_rate": 7.109682030555285e-06, + "loss": 0.4125, + "step": 830 + }, + { + "epoch": 0.1664, + "learning_rate": 7.123051545403873e-06, + "loss": 0.397, + "step": 832 + }, + { + "epoch": 0.1668, + "learning_rate": 7.136426669010686e-06, + "loss": 0.3452, + "step": 834 + }, + { + "epoch": 0.1672, + "learning_rate": 7.1498073753002375e-06, + "loss": 0.3908, + "step": 836 + }, + { + "epoch": 0.1676, + "learning_rate": 7.1631936381861544e-06, + "loss": 0.5209, + "step": 838 + }, + { + "epoch": 0.168, + "learning_rate": 7.1765854315712325e-06, + "loss": 0.2205, + "step": 840 + }, + { + "epoch": 0.1684, + "learning_rate": 7.189982729347485e-06, + "loss": 0.2259, + "step": 842 + }, + { + "epoch": 0.1688, + "learning_rate": 7.203385505396197e-06, + "loss": 0.4867, + "step": 844 + }, + { + "epoch": 0.1692, + "learning_rate": 7.216793733587966e-06, + "loss": 0.3432, + "step": 846 + }, + { + "epoch": 0.1696, + "learning_rate": 7.230207387782771e-06, + "loss": 0.6726, + "step": 848 + }, + { + "epoch": 0.17, + "learning_rate": 7.243626441830001e-06, + "loss": 0.2187, + "step": 850 + }, + { + "epoch": 0.1704, + "learning_rate": 7.257050869568527e-06, + "loss": 0.5516, + "step": 852 + }, + { + "epoch": 0.1708, + "learning_rate": 7.270480644826739e-06, + "loss": 0.3985, + "step": 854 + }, + { + "epoch": 0.1712, + "learning_rate": 7.28391574142262e-06, + "loss": 0.1689, + "step": 856 + }, + { + "epoch": 0.1716, + "learning_rate": 7.297356133163711e-06, + "loss": 0.3583, + "step": 858 + }, + { + "epoch": 0.172, + "learning_rate": 7.3108017938473485e-06, + "loss": 0.2683, + "step": 860 + }, + { + "epoch": 0.1724, + "learning_rate": 7.324252697260479e-06, + "loss": 0.2452, + "step": 862 + }, + { + "epoch": 0.1728, + "learning_rate": 7.337708817179875e-06, + "loss": 0.2439, + "step": 864 + }, + { + "epoch": 0.1732, + "learning_rate": 7.351170127372196e-06, + "loss": 0.4721, + "step": 866 + }, + { + "epoch": 0.1736, + "learning_rate": 7.36463660159386e-06, + "loss": 0.4382, + "step": 868 + }, + { + "epoch": 0.174, + "learning_rate": 7.378108213591355e-06, + "loss": 0.2452, + "step": 870 + }, + { + "epoch": 0.1744, + "learning_rate": 7.39158493710103e-06, + "loss": 1.0547, + "step": 872 + }, + { + "epoch": 0.1748, + "learning_rate": 7.405066745849345e-06, + "loss": 0.3522, + "step": 874 + }, + { + "epoch": 0.1752, + "learning_rate": 7.418553613552822e-06, + "loss": 0.5788, + "step": 876 + }, + { + "epoch": 0.1756, + "learning_rate": 7.432045513918121e-06, + "loss": 0.4213, + "step": 878 + }, + { + "epoch": 0.176, + "learning_rate": 7.445542420642091e-06, + "loss": 0.2654, + "step": 880 + }, + { + "epoch": 0.1764, + "learning_rate": 7.459044307411826e-06, + "loss": 0.3236, + "step": 882 + }, + { + "epoch": 0.1768, + "learning_rate": 7.472551147904703e-06, + "loss": 0.501, + "step": 884 + }, + { + "epoch": 0.1772, + "learning_rate": 7.486062915788446e-06, + "loss": 0.2851, + "step": 886 + }, + { + "epoch": 0.1776, + "learning_rate": 7.499579584721173e-06, + "loss": 0.4505, + "step": 888 + }, + { + "epoch": 0.178, + "learning_rate": 7.513101128351446e-06, + "loss": 0.3765, + "step": 890 + }, + { + "epoch": 0.1784, + "learning_rate": 7.5266275203183395e-06, + "loss": 0.2033, + "step": 892 + }, + { + "epoch": 0.1788, + "learning_rate": 7.540158734251412e-06, + "loss": 0.5312, + "step": 894 + }, + { + "epoch": 0.1792, + "learning_rate": 7.553694743770917e-06, + "loss": 0.194, + "step": 896 + }, + { + "epoch": 0.1796, + "learning_rate": 7.567235522487698e-06, + "loss": 0.2968, + "step": 898 + }, + { + "epoch": 0.18, + "learning_rate": 7.580781044003312e-06, + "loss": 0.5278, + "step": 900 + }, + { + "epoch": 0.1804, + "learning_rate": 7.5943312819100875e-06, + "loss": 0.2778, + "step": 902 + }, + { + "epoch": 0.1808, + "learning_rate": 7.607886209791095e-06, + "loss": 0.3144, + "step": 904 + }, + { + "epoch": 0.1812, + "learning_rate": 7.6214458012203726e-06, + "loss": 1.0147, + "step": 906 + }, + { + "epoch": 0.1816, + "learning_rate": 7.635010029762755e-06, + "loss": 0.6004, + "step": 908 + }, + { + "epoch": 0.182, + "learning_rate": 7.648578868974102e-06, + "loss": 0.4529, + "step": 910 + }, + { + "epoch": 0.1824, + "learning_rate": 7.662152292401265e-06, + "loss": 0.3965, + "step": 912 + }, + { + "epoch": 0.1828, + "learning_rate": 7.675730273582142e-06, + "loss": 0.6047, + "step": 914 + }, + { + "epoch": 0.1832, + "learning_rate": 7.689312786045822e-06, + "loss": 0.4892, + "step": 916 + }, + { + "epoch": 0.1836, + "learning_rate": 7.702899803312443e-06, + "loss": 0.6653, + "step": 918 + }, + { + "epoch": 0.184, + "learning_rate": 7.716491298893441e-06, + "loss": 0.2551, + "step": 920 + }, + { + "epoch": 0.1844, + "learning_rate": 7.730087246291498e-06, + "loss": 0.3608, + "step": 922 + }, + { + "epoch": 0.1848, + "learning_rate": 7.74368761900062e-06, + "loss": 0.255, + "step": 924 + }, + { + "epoch": 0.1852, + "learning_rate": 7.757292390506184e-06, + "loss": 0.2387, + "step": 926 + }, + { + "epoch": 0.1856, + "learning_rate": 7.770901534284991e-06, + "loss": 0.3268, + "step": 928 + }, + { + "epoch": 0.186, + "learning_rate": 7.78451502380532e-06, + "loss": 0.5202, + "step": 930 + }, + { + "epoch": 0.1864, + "learning_rate": 7.798132832526976e-06, + "loss": 0.4435, + "step": 932 + }, + { + "epoch": 0.1868, + "learning_rate": 7.811754933901346e-06, + "loss": 0.424, + "step": 934 + }, + { + "epoch": 0.1872, + "learning_rate": 7.825381301371444e-06, + "loss": 0.273, + "step": 936 + }, + { + "epoch": 0.1876, + "learning_rate": 7.839011908371987e-06, + "loss": 0.3811, + "step": 938 + }, + { + "epoch": 0.188, + "learning_rate": 7.852646728329358e-06, + "loss": 0.5099, + "step": 940 + }, + { + "epoch": 0.1884, + "learning_rate": 7.866285734661845e-06, + "loss": 0.3226, + "step": 942 + }, + { + "epoch": 0.1888, + "learning_rate": 7.879928900779441e-06, + "loss": 0.2931, + "step": 944 + }, + { + "epoch": 0.1892, + "learning_rate": 7.893576200084164e-06, + "loss": 0.2911, + "step": 946 + }, + { + "epoch": 0.1896, + "learning_rate": 7.907227605969852e-06, + "loss": 0.3712, + "step": 948 + }, + { + "epoch": 0.19, + "learning_rate": 7.92088309182239e-06, + "loss": 0.7545, + "step": 950 + }, + { + "epoch": 0.1904, + "learning_rate": 7.934542631019767e-06, + "loss": 0.3636, + "step": 952 + }, + { + "epoch": 0.1908, + "learning_rate": 7.948206196931937e-06, + "loss": 0.4194, + "step": 954 + }, + { + "epoch": 0.1912, + "learning_rate": 7.961873762921153e-06, + "loss": 0.2978, + "step": 956 + }, + { + "epoch": 0.1916, + "learning_rate": 7.97554530234174e-06, + "loss": 0.3529, + "step": 958 + }, + { + "epoch": 0.192, + "learning_rate": 7.989220788540351e-06, + "loss": 0.3457, + "step": 960 + }, + { + "epoch": 0.1924, + "learning_rate": 8.002900194855927e-06, + "loss": 0.3039, + "step": 962 + }, + { + "epoch": 0.1928, + "learning_rate": 8.016583494619764e-06, + "loss": 0.2949, + "step": 964 + }, + { + "epoch": 0.1932, + "learning_rate": 8.03027066115557e-06, + "loss": 0.2815, + "step": 966 + }, + { + "epoch": 0.1936, + "learning_rate": 8.043961667779511e-06, + "loss": 0.1881, + "step": 968 + }, + { + "epoch": 0.194, + "learning_rate": 8.057656487800274e-06, + "loss": 0.2736, + "step": 970 + }, + { + "epoch": 0.1944, + "learning_rate": 8.071355094519103e-06, + "loss": 0.5013, + "step": 972 + }, + { + "epoch": 0.1948, + "learning_rate": 8.085057461229862e-06, + "loss": 0.2481, + "step": 974 + }, + { + "epoch": 0.1952, + "learning_rate": 8.098763561219089e-06, + "loss": 0.3193, + "step": 976 + }, + { + "epoch": 0.1956, + "learning_rate": 8.112473367766056e-06, + "loss": 0.4712, + "step": 978 + }, + { + "epoch": 0.196, + "learning_rate": 8.126186854142744e-06, + "loss": 0.44, + "step": 980 + }, + { + "epoch": 0.1964, + "learning_rate": 8.139903993614075e-06, + "loss": 0.3096, + "step": 982 + }, + { + "epoch": 0.1968, + "learning_rate": 8.153624759437718e-06, + "loss": 0.6017, + "step": 984 + }, + { + "epoch": 0.1972, + "learning_rate": 8.167349124864389e-06, + "loss": 0.2472, + "step": 986 + }, + { + "epoch": 0.1976, + "learning_rate": 8.181077063137735e-06, + "loss": 0.2665, + "step": 988 + }, + { + "epoch": 0.198, + "learning_rate": 8.194808547494386e-06, + "loss": 0.5729, + "step": 990 + }, + { + "epoch": 0.1984, + "learning_rate": 8.208543551164178e-06, + "loss": 0.2761, + "step": 992 + }, + { + "epoch": 0.1988, + "learning_rate": 8.22228204736997e-06, + "loss": 0.2797, + "step": 994 + }, + { + "epoch": 0.1992, + "learning_rate": 8.236024009327877e-06, + "loss": 0.3786, + "step": 996 + }, + { + "epoch": 0.1996, + "learning_rate": 8.249769410247239e-06, + "loss": 0.8245, + "step": 998 + }, + { + "epoch": 0.2, + "learning_rate": 8.263518223330695e-06, + "loss": 0.3504, + "step": 1000 + }, + { + "epoch": 0.2004, + "learning_rate": 8.277270421774231e-06, + "loss": 0.4531, + "step": 1002 + }, + { + "epoch": 0.2008, + "learning_rate": 8.29102597876723e-06, + "loss": 0.2867, + "step": 1004 + }, + { + "epoch": 0.2012, + "learning_rate": 8.304784867492532e-06, + "loss": 0.2964, + "step": 1006 + }, + { + "epoch": 0.2016, + "learning_rate": 8.31854706112648e-06, + "loss": 0.5768, + "step": 1008 + }, + { + "epoch": 0.202, + "learning_rate": 8.332312532838972e-06, + "loss": 0.478, + "step": 1010 + }, + { + "epoch": 0.2024, + "learning_rate": 8.346081255793516e-06, + "loss": 0.395, + "step": 1012 + }, + { + "epoch": 0.2028, + "learning_rate": 8.359853203147282e-06, + "loss": 0.2287, + "step": 1014 + }, + { + "epoch": 0.2032, + "learning_rate": 8.373628348051156e-06, + "loss": 0.3035, + "step": 1016 + }, + { + "epoch": 0.2036, + "learning_rate": 8.387406663649803e-06, + "loss": 0.6876, + "step": 1018 + }, + { + "epoch": 0.204, + "learning_rate": 8.401188123081642e-06, + "loss": 0.2352, + "step": 1020 + }, + { + "epoch": 0.2044, + "learning_rate": 8.414972699479062e-06, + "loss": 0.363, + "step": 1022 + }, + { + "epoch": 0.2048, + "learning_rate": 8.428760365968329e-06, + "loss": 0.6763, + "step": 1024 + }, + { + "epoch": 0.2052, + "learning_rate": 8.442551095669627e-06, + "loss": 0.3197, + "step": 1026 + }, + { + "epoch": 0.2056, + "learning_rate": 8.456344861697293e-06, + "loss": 0.5819, + "step": 1028 + }, + { + "epoch": 0.206, + "learning_rate": 8.470141637159605e-06, + "loss": 0.5474, + "step": 1030 + }, + { + "epoch": 0.2064, + "learning_rate": 8.483941395159114e-06, + "loss": 0.514, + "step": 1032 + }, + { + "epoch": 0.2068, + "learning_rate": 8.497744108792431e-06, + "loss": 0.4252, + "step": 1034 + }, + { + "epoch": 0.2072, + "learning_rate": 8.511549751150478e-06, + "loss": 0.189, + "step": 1036 + }, + { + "epoch": 0.2076, + "learning_rate": 8.52535829531845e-06, + "loss": 0.2618, + "step": 1038 + }, + { + "epoch": 0.208, + "learning_rate": 8.539169714375883e-06, + "loss": 0.3523, + "step": 1040 + }, + { + "epoch": 0.2084, + "learning_rate": 8.552983981396707e-06, + "loss": 0.4636, + "step": 1042 + }, + { + "epoch": 0.2088, + "learning_rate": 8.566801069449304e-06, + "loss": 0.3688, + "step": 1044 + }, + { + "epoch": 0.2092, + "learning_rate": 8.580620951596553e-06, + "loss": 0.3542, + "step": 1046 + }, + { + "epoch": 0.2096, + "learning_rate": 8.594443600895886e-06, + "loss": 0.363, + "step": 1048 + }, + { + "epoch": 0.21, + "learning_rate": 8.60826899039934e-06, + "loss": 0.2398, + "step": 1050 + }, + { + "epoch": 0.2104, + "learning_rate": 8.622097093153612e-06, + "loss": 0.4084, + "step": 1052 + }, + { + "epoch": 0.2108, + "learning_rate": 8.635927882200128e-06, + "loss": 0.6268, + "step": 1054 + }, + { + "epoch": 0.2112, + "learning_rate": 8.649761330575e-06, + "loss": 0.3162, + "step": 1056 + }, + { + "epoch": 0.2116, + "learning_rate": 8.663597411309268e-06, + "loss": 0.3656, + "step": 1058 + }, + { + "epoch": 0.212, + "learning_rate": 8.677436097428766e-06, + "loss": 0.4112, + "step": 1060 + }, + { + "epoch": 0.2124, + "learning_rate": 8.691277361954266e-06, + "loss": 0.262, + "step": 1062 + }, + { + "epoch": 0.2128, + "learning_rate": 8.705121177901537e-06, + "loss": 0.3361, + "step": 1064 + }, + { + "epoch": 0.2132, + "learning_rate": 8.718967518281292e-06, + "loss": 0.3664, + "step": 1066 + }, + { + "epoch": 0.2136, + "learning_rate": 8.732816356099459e-06, + "loss": 0.3092, + "step": 1068 + }, + { + "epoch": 0.214, + "learning_rate": 8.746667664356962e-06, + "loss": 0.3142, + "step": 1070 + }, + { + "epoch": 0.2144, + "learning_rate": 8.760521416049986e-06, + "loss": 0.4472, + "step": 1072 + }, + { + "epoch": 0.2148, + "learning_rate": 8.774377584169934e-06, + "loss": 0.3453, + "step": 1074 + }, + { + "epoch": 0.2152, + "learning_rate": 8.788236141703477e-06, + "loss": 0.4003, + "step": 1076 + }, + { + "epoch": 0.2156, + "learning_rate": 8.802097061632706e-06, + "loss": 0.3086, + "step": 1078 + }, + { + "epoch": 0.216, + "learning_rate": 8.81596031693499e-06, + "loss": 0.339, + "step": 1080 + }, + { + "epoch": 0.2164, + "learning_rate": 8.829825880583224e-06, + "loss": 0.2767, + "step": 1082 + }, + { + "epoch": 0.2168, + "learning_rate": 8.84369372554578e-06, + "loss": 0.8795, + "step": 1084 + }, + { + "epoch": 0.2172, + "learning_rate": 8.85756382478659e-06, + "loss": 0.2632, + "step": 1086 + }, + { + "epoch": 0.2176, + "learning_rate": 8.87143615126518e-06, + "loss": 0.4702, + "step": 1088 + }, + { + "epoch": 0.218, + "learning_rate": 8.88531067793674e-06, + "loss": 0.2533, + "step": 1090 + }, + { + "epoch": 0.2184, + "learning_rate": 8.899187377752173e-06, + "loss": 0.3283, + "step": 1092 + }, + { + "epoch": 0.2188, + "learning_rate": 8.913066223658141e-06, + "loss": 0.3652, + "step": 1094 + }, + { + "epoch": 0.2192, + "learning_rate": 8.926947188597127e-06, + "loss": 0.3045, + "step": 1096 + }, + { + "epoch": 0.2196, + "learning_rate": 8.940830245507473e-06, + "loss": 0.2594, + "step": 1098 + }, + { + "epoch": 0.22, + "learning_rate": 8.954715367323473e-06, + "loss": 0.3972, + "step": 1100 + }, + { + "epoch": 0.2204, + "learning_rate": 8.968602526975317e-06, + "loss": 0.3633, + "step": 1102 + }, + { + "epoch": 0.2208, + "learning_rate": 8.982491697389344e-06, + "loss": 0.7652, + "step": 1104 + }, + { + "epoch": 0.2212, + "learning_rate": 8.996382851487839e-06, + "loss": 0.406, + "step": 1106 + }, + { + "epoch": 0.2216, + "learning_rate": 9.010275962189356e-06, + "loss": 0.3475, + "step": 1108 + }, + { + "epoch": 0.222, + "learning_rate": 9.024171002408509e-06, + "loss": 0.4952, + "step": 1110 + }, + { + "epoch": 0.2224, + "learning_rate": 9.03806794505621e-06, + "loss": 0.3108, + "step": 1112 + }, + { + "epoch": 0.2228, + "learning_rate": 9.051966763039708e-06, + "loss": 0.2764, + "step": 1114 + }, + { + "epoch": 0.2232, + "learning_rate": 9.065867429262497e-06, + "loss": 0.6096, + "step": 1116 + }, + { + "epoch": 0.2236, + "learning_rate": 9.07976991662453e-06, + "loss": 0.417, + "step": 1118 + }, + { + "epoch": 0.224, + "learning_rate": 9.093674198022198e-06, + "loss": 0.3713, + "step": 1120 + }, + { + "epoch": 0.2244, + "learning_rate": 9.107580246348395e-06, + "loss": 0.281, + "step": 1122 + }, + { + "epoch": 0.2248, + "learning_rate": 9.121488034492567e-06, + "loss": 0.2905, + "step": 1124 + }, + { + "epoch": 0.2252, + "learning_rate": 9.135397535340768e-06, + "loss": 0.415, + "step": 1126 + }, + { + "epoch": 0.2256, + "learning_rate": 9.149308721775717e-06, + "loss": 0.5256, + "step": 1128 + }, + { + "epoch": 0.226, + "learning_rate": 9.16322156667684e-06, + "loss": 0.213, + "step": 1130 + }, + { + "epoch": 0.2264, + "learning_rate": 9.177136042920338e-06, + "loss": 0.2986, + "step": 1132 + }, + { + "epoch": 0.2268, + "learning_rate": 9.191052123379227e-06, + "loss": 0.5944, + "step": 1134 + }, + { + "epoch": 0.2272, + "learning_rate": 9.204969780923396e-06, + "loss": 0.3058, + "step": 1136 + }, + { + "epoch": 0.2276, + "learning_rate": 9.218888988419656e-06, + "loss": 0.3131, + "step": 1138 + }, + { + "epoch": 0.228, + "learning_rate": 9.232809718731822e-06, + "loss": 0.2773, + "step": 1140 + }, + { + "epoch": 0.2284, + "learning_rate": 9.246731944720663e-06, + "loss": 0.2218, + "step": 1142 + }, + { + "epoch": 0.2288, + "learning_rate": 9.26065563924414e-06, + "loss": 0.5905, + "step": 1144 + }, + { + "epoch": 0.2292, + "learning_rate": 9.274580775157299e-06, + "loss": 0.5809, + "step": 1146 + }, + { + "epoch": 0.2296, + "learning_rate": 9.288507325312319e-06, + "loss": 0.9934, + "step": 1148 + }, + { + "epoch": 0.23, + "learning_rate": 9.302435262558752e-06, + "loss": 0.3389, + "step": 1150 + }, + { + "epoch": 0.2304, + "learning_rate": 9.316364559743298e-06, + "loss": 0.4396, + "step": 1152 + }, + { + "epoch": 0.2308, + "learning_rate": 9.330295189710153e-06, + "loss": 0.3441, + "step": 1154 + }, + { + "epoch": 0.2312, + "learning_rate": 9.344227125300788e-06, + "loss": 0.243, + "step": 1156 + }, + { + "epoch": 0.2316, + "learning_rate": 9.358160339354196e-06, + "loss": 0.6547, + "step": 1158 + }, + { + "epoch": 0.232, + "learning_rate": 9.372094804706867e-06, + "loss": 0.539, + "step": 1160 + }, + { + "epoch": 0.2324, + "learning_rate": 9.386030494192826e-06, + "loss": 0.2274, + "step": 1162 + }, + { + "epoch": 0.2328, + "learning_rate": 9.39996738064379e-06, + "loss": 0.4394, + "step": 1164 + }, + { + "epoch": 0.2332, + "learning_rate": 9.413905436889032e-06, + "loss": 0.6466, + "step": 1166 + }, + { + "epoch": 0.2336, + "learning_rate": 9.427844635755615e-06, + "loss": 0.2289, + "step": 1168 + }, + { + "epoch": 0.234, + "learning_rate": 9.441784950068357e-06, + "loss": 0.2568, + "step": 1170 + }, + { + "epoch": 0.2344, + "learning_rate": 9.455726352649904e-06, + "loss": 0.314, + "step": 1172 + }, + { + "epoch": 0.2348, + "learning_rate": 9.469668816320777e-06, + "loss": 0.5363, + "step": 1174 + }, + { + "epoch": 0.2352, + "learning_rate": 9.483612313899446e-06, + "loss": 0.2801, + "step": 1176 + }, + { + "epoch": 0.2356, + "learning_rate": 9.497556818202297e-06, + "loss": 0.2197, + "step": 1178 + }, + { + "epoch": 0.236, + "learning_rate": 9.511502302043859e-06, + "loss": 0.2505, + "step": 1180 + }, + { + "epoch": 0.2364, + "learning_rate": 9.52544873823668e-06, + "loss": 0.6641, + "step": 1182 + }, + { + "epoch": 0.2368, + "learning_rate": 9.539396099591469e-06, + "loss": 0.1448, + "step": 1184 + }, + { + "epoch": 0.2372, + "learning_rate": 9.553344358917146e-06, + "loss": 0.3817, + "step": 1186 + }, + { + "epoch": 0.2376, + "learning_rate": 9.567293489020816e-06, + "loss": 0.8022, + "step": 1188 + }, + { + "epoch": 0.238, + "learning_rate": 9.581243462708009e-06, + "loss": 0.3459, + "step": 1190 + }, + { + "epoch": 0.2384, + "learning_rate": 9.595194252782461e-06, + "loss": 0.4435, + "step": 1192 + }, + { + "epoch": 0.2388, + "learning_rate": 9.609145832046469e-06, + "loss": 0.1686, + "step": 1194 + }, + { + "epoch": 0.2392, + "learning_rate": 9.623098173300656e-06, + "loss": 0.331, + "step": 1196 + }, + { + "epoch": 0.2396, + "learning_rate": 9.637051249344225e-06, + "loss": 0.2455, + "step": 1198 + }, + { + "epoch": 0.24, + "learning_rate": 9.651005032974991e-06, + "loss": 0.4365, + "step": 1200 + }, + { + "epoch": 0.2404, + "learning_rate": 9.664959496989285e-06, + "loss": 0.3641, + "step": 1202 + }, + { + "epoch": 0.2408, + "learning_rate": 9.678914614182184e-06, + "loss": 0.4462, + "step": 1204 + }, + { + "epoch": 0.2412, + "learning_rate": 9.69287035734747e-06, + "loss": 0.5387, + "step": 1206 + }, + { + "epoch": 0.2416, + "learning_rate": 9.706826699277714e-06, + "loss": 0.2123, + "step": 1208 + }, + { + "epoch": 0.242, + "learning_rate": 9.720783612764307e-06, + "loss": 0.1672, + "step": 1210 + }, + { + "epoch": 0.2424, + "learning_rate": 9.734741070597535e-06, + "loss": 0.402, + "step": 1212 + }, + { + "epoch": 0.2428, + "learning_rate": 9.74869904556662e-06, + "loss": 0.3074, + "step": 1214 + }, + { + "epoch": 0.2432, + "learning_rate": 9.762657510459774e-06, + "loss": 0.4685, + "step": 1216 + }, + { + "epoch": 0.2436, + "learning_rate": 9.776616438064255e-06, + "loss": 0.2402, + "step": 1218 + }, + { + "epoch": 0.244, + "learning_rate": 9.790575801166422e-06, + "loss": 0.4516, + "step": 1220 + }, + { + "epoch": 0.2444, + "learning_rate": 9.804535572551782e-06, + "loss": 0.3528, + "step": 1222 + }, + { + "epoch": 0.2448, + "learning_rate": 9.818495725005043e-06, + "loss": 0.3985, + "step": 1224 + }, + { + "epoch": 0.2452, + "learning_rate": 9.832456231310194e-06, + "loss": 0.3186, + "step": 1226 + }, + { + "epoch": 0.2456, + "learning_rate": 9.846417064250459e-06, + "loss": 0.3778, + "step": 1228 + }, + { + "epoch": 0.246, + "learning_rate": 9.860378196608552e-06, + "loss": 0.2362, + "step": 1230 + }, + { + "epoch": 0.2464, + "learning_rate": 9.874339601166479e-06, + "loss": 0.1868, + "step": 1232 + }, + { + "epoch": 0.2468, + "learning_rate": 9.888301250705765e-06, + "loss": 0.4621, + "step": 1234 + }, + { + "epoch": 0.2472, + "learning_rate": 9.902263118007513e-06, + "loss": 0.5526, + "step": 1236 + }, + { + "epoch": 0.2476, + "learning_rate": 9.916225175852278e-06, + "loss": 0.3089, + "step": 1238 + }, + { + "epoch": 0.248, + "learning_rate": 9.930187397020385e-06, + "loss": 0.2226, + "step": 1240 + }, + { + "epoch": 0.2484, + "learning_rate": 9.944149754291716e-06, + "loss": 0.3528, + "step": 1242 + }, + { + "epoch": 0.2488, + "learning_rate": 9.95811222044596e-06, + "loss": 0.3573, + "step": 1244 + }, + { + "epoch": 0.2492, + "learning_rate": 9.972074768262572e-06, + "loss": 0.1671, + "step": 1246 + }, + { + "epoch": 0.2496, + "learning_rate": 9.986037370520855e-06, + "loss": 0.412, + "step": 1248 + }, + { + "epoch": 0.25, + "learning_rate": 9.999999999999996e-06, + "loss": 0.1531, + "step": 1250 + }, + { + "epoch": 0.2504, + "learning_rate": 1.0013962629479139e-05, + "loss": 0.3207, + "step": 1252 + }, + { + "epoch": 0.2508, + "learning_rate": 1.0027925231737419e-05, + "loss": 0.2661, + "step": 1254 + }, + { + "epoch": 0.2512, + "learning_rate": 1.0041887779554034e-05, + "loss": 0.328, + "step": 1256 + }, + { + "epoch": 0.2516, + "learning_rate": 1.0055850245708276e-05, + "loss": 0.3226, + "step": 1258 + }, + { + "epoch": 0.252, + "learning_rate": 1.0069812602979607e-05, + "loss": 0.4203, + "step": 1260 + }, + { + "epoch": 0.2524, + "learning_rate": 1.0083774824147717e-05, + "loss": 0.4026, + "step": 1262 + }, + { + "epoch": 0.2528, + "learning_rate": 1.0097736881992482e-05, + "loss": 0.4214, + "step": 1264 + }, + { + "epoch": 0.2532, + "learning_rate": 1.011169874929423e-05, + "loss": 0.2217, + "step": 1266 + }, + { + "epoch": 0.2536, + "learning_rate": 1.0125660398833514e-05, + "loss": 0.5101, + "step": 1268 + }, + { + "epoch": 0.254, + "learning_rate": 1.013962180339144e-05, + "loss": 0.1238, + "step": 1270 + }, + { + "epoch": 0.2544, + "learning_rate": 1.0153582935749533e-05, + "loss": 0.2849, + "step": 1272 + }, + { + "epoch": 0.2548, + "learning_rate": 1.01675437686898e-05, + "loss": 0.5306, + "step": 1274 + }, + { + "epoch": 0.2552, + "learning_rate": 1.0181504274994952e-05, + "loss": 0.755, + "step": 1276 + }, + { + "epoch": 0.2556, + "learning_rate": 1.0195464427448212e-05, + "loss": 0.169, + "step": 1278 + }, + { + "epoch": 0.256, + "learning_rate": 1.0209424198833571e-05, + "loss": 0.5774, + "step": 1280 + }, + { + "epoch": 0.2564, + "learning_rate": 1.0223383561935738e-05, + "loss": 0.609, + "step": 1282 + }, + { + "epoch": 0.2568, + "learning_rate": 1.0237342489540218e-05, + "loss": 0.2018, + "step": 1284 + }, + { + "epoch": 0.2572, + "learning_rate": 1.0251300954433374e-05, + "loss": 0.3262, + "step": 1286 + }, + { + "epoch": 0.2576, + "learning_rate": 1.0265258929402458e-05, + "loss": 0.5228, + "step": 1288 + }, + { + "epoch": 0.258, + "learning_rate": 1.0279216387235686e-05, + "loss": 0.5594, + "step": 1290 + }, + { + "epoch": 0.2584, + "learning_rate": 1.029317330072228e-05, + "loss": 0.2991, + "step": 1292 + }, + { + "epoch": 0.2588, + "learning_rate": 1.0307129642652523e-05, + "loss": 0.1439, + "step": 1294 + }, + { + "epoch": 0.2592, + "learning_rate": 1.0321085385817811e-05, + "loss": 0.2284, + "step": 1296 + }, + { + "epoch": 0.2596, + "learning_rate": 1.033504050301071e-05, + "loss": 0.9279, + "step": 1298 + }, + { + "epoch": 0.26, + "learning_rate": 1.0348994967025004e-05, + "loss": 0.2219, + "step": 1300 + }, + { + "epoch": 0.2604, + "learning_rate": 1.0362948750655768e-05, + "loss": 0.3813, + "step": 1302 + }, + { + "epoch": 0.2608, + "learning_rate": 1.0376901826699337e-05, + "loss": 0.6812, + "step": 1304 + }, + { + "epoch": 0.2612, + "learning_rate": 1.0390854167953526e-05, + "loss": 0.4394, + "step": 1306 + }, + { + "epoch": 0.2616, + "learning_rate": 1.0404805747217532e-05, + "loss": 0.2347, + "step": 1308 + }, + { + "epoch": 0.262, + "learning_rate": 1.0418756537291984e-05, + "loss": 0.3204, + "step": 1310 + }, + { + "epoch": 0.2624, + "learning_rate": 1.0432706510979175e-05, + "loss": 0.403, + "step": 1312 + }, + { + "epoch": 0.2628, + "learning_rate": 1.0446655641082846e-05, + "loss": 0.4053, + "step": 1314 + }, + { + "epoch": 0.2632, + "learning_rate": 1.0460603900408526e-05, + "loss": 0.2479, + "step": 1316 + }, + { + "epoch": 0.2636, + "learning_rate": 1.0474551261763312e-05, + "loss": 0.2161, + "step": 1318 + }, + { + "epoch": 0.264, + "learning_rate": 1.0488497697956134e-05, + "loss": 0.1956, + "step": 1320 + }, + { + "epoch": 0.2644, + "learning_rate": 1.0502443181797696e-05, + "loss": 0.1479, + "step": 1322 + }, + { + "epoch": 0.2648, + "learning_rate": 1.0516387686100549e-05, + "loss": 0.2796, + "step": 1324 + }, + { + "epoch": 0.2652, + "learning_rate": 1.0530331183679216e-05, + "loss": 0.1504, + "step": 1326 + }, + { + "epoch": 0.2656, + "learning_rate": 1.054427364735009e-05, + "loss": 0.6028, + "step": 1328 + }, + { + "epoch": 0.266, + "learning_rate": 1.0558215049931634e-05, + "loss": 0.4585, + "step": 1330 + }, + { + "epoch": 0.2664, + "learning_rate": 1.0572155364244378e-05, + "loss": 0.7566, + "step": 1332 + }, + { + "epoch": 0.2668, + "learning_rate": 1.058609456311096e-05, + "loss": 0.3651, + "step": 1334 + }, + { + "epoch": 0.2672, + "learning_rate": 1.0600032619356203e-05, + "loss": 0.2722, + "step": 1336 + }, + { + "epoch": 0.2676, + "learning_rate": 1.0613969505807167e-05, + "loss": 0.3377, + "step": 1338 + }, + { + "epoch": 0.268, + "learning_rate": 1.0627905195293127e-05, + "loss": 0.3644, + "step": 1340 + }, + { + "epoch": 0.2684, + "learning_rate": 1.0641839660645795e-05, + "loss": 0.5298, + "step": 1342 + }, + { + "epoch": 0.2688, + "learning_rate": 1.0655772874699206e-05, + "loss": 0.5939, + "step": 1344 + }, + { + "epoch": 0.2692, + "learning_rate": 1.066970481028984e-05, + "loss": 0.2809, + "step": 1346 + }, + { + "epoch": 0.2696, + "learning_rate": 1.0683635440256694e-05, + "loss": 2.2745, + "step": 1348 + }, + { + "epoch": 0.27, + "learning_rate": 1.0697564737441242e-05, + "loss": 0.2676, + "step": 1350 + }, + { + "epoch": 0.2704, + "learning_rate": 1.0711492674687674e-05, + "loss": 3.3743, + "step": 1352 + }, + { + "epoch": 0.2708, + "learning_rate": 1.0725419224842695e-05, + "loss": 0.3574, + "step": 1354 + }, + { + "epoch": 0.2712, + "learning_rate": 1.0739344360755855e-05, + "loss": 0.661, + "step": 1356 + }, + { + "epoch": 0.2716, + "learning_rate": 1.0753268055279332e-05, + "loss": 0.3495, + "step": 1358 + }, + { + "epoch": 0.272, + "learning_rate": 1.0767190281268171e-05, + "loss": 0.5978, + "step": 1360 + }, + { + "epoch": 0.2724, + "learning_rate": 1.0781111011580336e-05, + "loss": 0.4832, + "step": 1362 + }, + { + "epoch": 0.2728, + "learning_rate": 1.07950302190766e-05, + "loss": 0.5057, + "step": 1364 + }, + { + "epoch": 0.2732, + "learning_rate": 1.0808947876620766e-05, + "loss": 0.3304, + "step": 1366 + }, + { + "epoch": 0.2736, + "learning_rate": 1.0822863957079654e-05, + "loss": 0.388, + "step": 1368 + }, + { + "epoch": 0.274, + "learning_rate": 1.0836778433323153e-05, + "loss": 0.9154, + "step": 1370 + }, + { + "epoch": 0.2744, + "learning_rate": 1.0850691278224277e-05, + "loss": 0.4937, + "step": 1372 + }, + { + "epoch": 0.2748, + "learning_rate": 1.0864602464659227e-05, + "loss": 0.368, + "step": 1374 + }, + { + "epoch": 0.2752, + "learning_rate": 1.0878511965507428e-05, + "loss": 0.4809, + "step": 1376 + }, + { + "epoch": 0.2756, + "learning_rate": 1.0892419753651598e-05, + "loss": 0.2023, + "step": 1378 + }, + { + "epoch": 0.276, + "learning_rate": 1.0906325801977795e-05, + "loss": 0.3484, + "step": 1380 + }, + { + "epoch": 0.2764, + "learning_rate": 1.0920230083375465e-05, + "loss": 0.4061, + "step": 1382 + }, + { + "epoch": 0.2768, + "learning_rate": 1.0934132570737497e-05, + "loss": 0.1752, + "step": 1384 + }, + { + "epoch": 0.2772, + "learning_rate": 1.0948033236960285e-05, + "loss": 0.2129, + "step": 1386 + }, + { + "epoch": 0.2776, + "learning_rate": 1.0961932054943785e-05, + "loss": 0.2675, + "step": 1388 + }, + { + "epoch": 0.278, + "learning_rate": 1.0975828997591484e-05, + "loss": 1.1039, + "step": 1390 + }, + { + "epoch": 0.2784, + "learning_rate": 1.098972403781064e-05, + "loss": 0.4685, + "step": 1392 + }, + { + "epoch": 0.2788, + "learning_rate": 1.1003617148512154e-05, + "loss": 0.4317, + "step": 1394 + }, + { + "epoch": 0.2792, + "learning_rate": 1.101750830261065e-05, + "loss": 0.7233, + "step": 1396 + }, + { + "epoch": 0.2796, + "learning_rate": 1.1031397473024676e-05, + "loss": 0.1612, + "step": 1398 + }, + { + "epoch": 0.28, + "learning_rate": 1.104528463267652e-05, + "loss": 0.2938, + "step": 1400 + }, + { + "epoch": 0.2804, + "learning_rate": 1.1059169754492518e-05, + "loss": 0.3687, + "step": 1402 + }, + { + "epoch": 0.2808, + "learning_rate": 1.1073052811402867e-05, + "loss": 0.8868, + "step": 1404 + }, + { + "epoch": 0.2812, + "learning_rate": 1.108693377634185e-05, + "loss": 0.3738, + "step": 1406 + }, + { + "epoch": 0.2816, + "learning_rate": 1.1100812622247821e-05, + "loss": 0.3485, + "step": 1408 + }, + { + "epoch": 0.282, + "learning_rate": 1.1114689322063252e-05, + "loss": 0.3147, + "step": 1410 + }, + { + "epoch": 0.2824, + "learning_rate": 1.1128563848734815e-05, + "loss": 0.3907, + "step": 1412 + }, + { + "epoch": 0.2828, + "learning_rate": 1.1142436175213404e-05, + "loss": 0.6556, + "step": 1414 + }, + { + "epoch": 0.2832, + "learning_rate": 1.1156306274454211e-05, + "loss": 0.2847, + "step": 1416 + }, + { + "epoch": 0.2836, + "learning_rate": 1.117017411941677e-05, + "loss": 0.2451, + "step": 1418 + }, + { + "epoch": 0.284, + "learning_rate": 1.1184039683065002e-05, + "loss": 0.3938, + "step": 1420 + }, + { + "epoch": 0.2844, + "learning_rate": 1.1197902938367289e-05, + "loss": 0.6114, + "step": 1422 + }, + { + "epoch": 0.2848, + "learning_rate": 1.1211763858296516e-05, + "loss": 0.2234, + "step": 1424 + }, + { + "epoch": 0.2852, + "learning_rate": 1.122562241583006e-05, + "loss": 0.3634, + "step": 1426 + }, + { + "epoch": 0.2856, + "learning_rate": 1.1239478583950007e-05, + "loss": 0.3457, + "step": 1428 + }, + { + "epoch": 0.286, + "learning_rate": 1.1253332335643033e-05, + "loss": 0.7505, + "step": 1430 + }, + { + "epoch": 0.2864, + "learning_rate": 1.1267183643900534e-05, + "loss": 0.8547, + "step": 1432 + }, + { + "epoch": 0.2868, + "learning_rate": 1.1281032481718701e-05, + "loss": 0.2587, + "step": 1434 + }, + { + "epoch": 0.2872, + "learning_rate": 1.1294878822098456e-05, + "loss": 0.2393, + "step": 1436 + }, + { + "epoch": 0.2876, + "learning_rate": 1.1308722638045725e-05, + "loss": 0.3114, + "step": 1438 + }, + { + "epoch": 0.288, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.6077, + "step": 1440 + }, + { + "epoch": 0.2884, + "learning_rate": 1.1336402588690725e-05, + "loss": 0.3834, + "step": 1442 + }, + { + "epoch": 0.2888, + "learning_rate": 1.1350238669424993e-05, + "loss": 0.3959, + "step": 1444 + }, + { + "epoch": 0.2892, + "learning_rate": 1.1364072117799864e-05, + "loss": 0.6842, + "step": 1446 + }, + { + "epoch": 0.2896, + "learning_rate": 1.137790290684638e-05, + "loss": 0.249, + "step": 1448 + }, + { + "epoch": 0.29, + "learning_rate": 1.1391731009600652e-05, + "loss": 0.2789, + "step": 1450 + }, + { + "epoch": 0.2904, + "learning_rate": 1.1405556399104108e-05, + "loss": 0.1908, + "step": 1452 + }, + { + "epoch": 0.2908, + "learning_rate": 1.141937904840344e-05, + "loss": 0.3876, + "step": 1454 + }, + { + "epoch": 0.2912, + "learning_rate": 1.143319893055069e-05, + "loss": 0.296, + "step": 1456 + }, + { + "epoch": 0.2916, + "learning_rate": 1.1447016018603286e-05, + "loss": 0.149, + "step": 1458 + }, + { + "epoch": 0.292, + "learning_rate": 1.1460830285624112e-05, + "loss": 0.6586, + "step": 1460 + }, + { + "epoch": 0.2924, + "learning_rate": 1.1474641704681541e-05, + "loss": 0.6323, + "step": 1462 + }, + { + "epoch": 0.2928, + "learning_rate": 1.1488450248849515e-05, + "loss": 0.165, + "step": 1464 + }, + { + "epoch": 0.2932, + "learning_rate": 1.150225589120756e-05, + "loss": 0.3811, + "step": 1466 + }, + { + "epoch": 0.2936, + "learning_rate": 1.1516058604840881e-05, + "loss": 0.2297, + "step": 1468 + }, + { + "epoch": 0.294, + "learning_rate": 1.1529858362840388e-05, + "loss": 0.1948, + "step": 1470 + }, + { + "epoch": 0.2944, + "learning_rate": 1.15436551383027e-05, + "loss": 0.2888, + "step": 1472 + }, + { + "epoch": 0.2948, + "learning_rate": 1.1557448904330366e-05, + "loss": 0.2804, + "step": 1474 + }, + { + "epoch": 0.2952, + "learning_rate": 1.1571239634031666e-05, + "loss": 0.4453, + "step": 1476 + }, + { + "epoch": 0.2956, + "learning_rate": 1.158502730052093e-05, + "loss": 0.3637, + "step": 1478 + }, + { + "epoch": 0.296, + "learning_rate": 1.1598811876918352e-05, + "loss": 0.5116, + "step": 1480 + }, + { + "epoch": 0.2964, + "learning_rate": 1.161259333635019e-05, + "loss": 0.3157, + "step": 1482 + }, + { + "epoch": 0.2968, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.2875, + "step": 1484 + }, + { + "epoch": 0.2972, + "learning_rate": 1.1640146796852711e-05, + "loss": 0.7522, + "step": 1486 + }, + { + "epoch": 0.2976, + "learning_rate": 1.1653918744206478e-05, + "loss": 0.528, + "step": 1488 + }, + { + "epoch": 0.298, + "learning_rate": 1.1667687467161021e-05, + "loss": 0.4627, + "step": 1490 + }, + { + "epoch": 0.2984, + "learning_rate": 1.1681452938873515e-05, + "loss": 0.2374, + "step": 1492 + }, + { + "epoch": 0.2988, + "learning_rate": 1.169521513250746e-05, + "loss": 0.5331, + "step": 1494 + }, + { + "epoch": 0.2992, + "learning_rate": 1.1708974021232763e-05, + "loss": 0.3494, + "step": 1496 + }, + { + "epoch": 0.2996, + "learning_rate": 1.1722729578225762e-05, + "loss": 0.3679, + "step": 1498 + }, + { + "epoch": 0.3, + "learning_rate": 1.1736481776669297e-05, + "loss": 0.4439, + "step": 1500 + }, + { + "epoch": 0.3004, + "learning_rate": 1.1750230589752753e-05, + "loss": 0.3022, + "step": 1502 + }, + { + "epoch": 0.3008, + "learning_rate": 1.1763975990672116e-05, + "loss": 0.2139, + "step": 1504 + }, + { + "epoch": 0.3012, + "learning_rate": 1.1777717952630023e-05, + "loss": 0.4247, + "step": 1506 + }, + { + "epoch": 0.3016, + "learning_rate": 1.1791456448835815e-05, + "loss": 0.3013, + "step": 1508 + }, + { + "epoch": 0.302, + "learning_rate": 1.180519145250561e-05, + "loss": 0.3549, + "step": 1510 + }, + { + "epoch": 0.3024, + "learning_rate": 1.1818922936862258e-05, + "loss": 0.522, + "step": 1512 + }, + { + "epoch": 0.3028, + "learning_rate": 1.1832650875135606e-05, + "loss": 0.5012, + "step": 1514 + }, + { + "epoch": 0.3032, + "learning_rate": 1.1846375240562274e-05, + "loss": 0.6997, + "step": 1516 + }, + { + "epoch": 0.3036, + "learning_rate": 1.1860096006385918e-05, + "loss": 0.2576, + "step": 1518 + }, + { + "epoch": 0.304, + "learning_rate": 1.187381314585725e-05, + "loss": 0.34, + "step": 1520 + }, + { + "epoch": 0.3044, + "learning_rate": 1.1887526632233937e-05, + "loss": 0.3399, + "step": 1522 + }, + { + "epoch": 0.3048, + "learning_rate": 1.1901236438780906e-05, + "loss": 1.3922, + "step": 1524 + }, + { + "epoch": 0.3052, + "learning_rate": 1.191494253877013e-05, + "loss": 0.373, + "step": 1526 + }, + { + "epoch": 0.3056, + "learning_rate": 1.192864490548089e-05, + "loss": 0.6975, + "step": 1528 + }, + { + "epoch": 0.306, + "learning_rate": 1.1942343512199719e-05, + "loss": 0.8468, + "step": 1530 + }, + { + "epoch": 0.3064, + "learning_rate": 1.195603833222048e-05, + "loss": 0.3416, + "step": 1532 + }, + { + "epoch": 0.3068, + "learning_rate": 1.1969729338844422e-05, + "loss": 0.627, + "step": 1534 + }, + { + "epoch": 0.3072, + "learning_rate": 1.198341650538023e-05, + "loss": 0.477, + "step": 1536 + }, + { + "epoch": 0.3076, + "learning_rate": 1.1997099805144066e-05, + "loss": 0.3668, + "step": 1538 + }, + { + "epoch": 0.308, + "learning_rate": 1.2010779211459642e-05, + "loss": 0.3565, + "step": 1540 + }, + { + "epoch": 0.3084, + "learning_rate": 1.2024454697658254e-05, + "loss": 0.386, + "step": 1542 + }, + { + "epoch": 0.3088, + "learning_rate": 1.203812623707884e-05, + "loss": 1.6389, + "step": 1544 + }, + { + "epoch": 0.3092, + "learning_rate": 1.2051793803068054e-05, + "loss": 0.3323, + "step": 1546 + }, + { + "epoch": 0.3096, + "learning_rate": 1.2065457368980227e-05, + "loss": 0.3874, + "step": 1548 + }, + { + "epoch": 0.31, + "learning_rate": 1.20791169081776e-05, + "loss": 0.9153, + "step": 1550 + }, + { + "epoch": 0.3104, + "learning_rate": 1.2092772394030141e-05, + "loss": 0.2882, + "step": 1552 + }, + { + "epoch": 0.3108, + "learning_rate": 1.210642379991583e-05, + "loss": 0.4256, + "step": 1554 + }, + { + "epoch": 0.3112, + "learning_rate": 1.2120071099220552e-05, + "loss": 0.5811, + "step": 1556 + }, + { + "epoch": 0.3116, + "learning_rate": 1.2133714265338148e-05, + "loss": 0.405, + "step": 1558 + }, + { + "epoch": 0.312, + "learning_rate": 1.2147353271670637e-05, + "loss": 0.3319, + "step": 1560 + }, + { + "epoch": 0.3124, + "learning_rate": 1.2160988091628006e-05, + "loss": 0.2205, + "step": 1562 + }, + { + "epoch": 0.3128, + "learning_rate": 1.217461869862855e-05, + "loss": 0.315, + "step": 1564 + }, + { + "epoch": 0.3132, + "learning_rate": 1.2188245066098647e-05, + "loss": 0.2961, + "step": 1566 + }, + { + "epoch": 0.3136, + "learning_rate": 1.2201867167473015e-05, + "loss": 0.2817, + "step": 1568 + }, + { + "epoch": 0.314, + "learning_rate": 1.2215484976194673e-05, + "loss": 0.7609, + "step": 1570 + }, + { + "epoch": 0.3144, + "learning_rate": 1.2229098465715002e-05, + "loss": 0.472, + "step": 1572 + }, + { + "epoch": 0.3148, + "learning_rate": 1.2242707609493809e-05, + "loss": 0.49, + "step": 1574 + }, + { + "epoch": 0.3152, + "learning_rate": 1.2256312380999373e-05, + "loss": 0.5711, + "step": 1576 + }, + { + "epoch": 0.3156, + "learning_rate": 1.2269912753708496e-05, + "loss": 0.3209, + "step": 1578 + }, + { + "epoch": 0.316, + "learning_rate": 1.2283508701106552e-05, + "loss": 0.2976, + "step": 1580 + }, + { + "epoch": 0.3164, + "learning_rate": 1.229710019668755e-05, + "loss": 0.9158, + "step": 1582 + }, + { + "epoch": 0.3168, + "learning_rate": 1.2310687213954173e-05, + "loss": 0.3954, + "step": 1584 + }, + { + "epoch": 0.3172, + "learning_rate": 1.232426972641785e-05, + "loss": 0.9305, + "step": 1586 + }, + { + "epoch": 0.3176, + "learning_rate": 1.233784770759873e-05, + "loss": 0.5347, + "step": 1588 + }, + { + "epoch": 0.318, + "learning_rate": 1.2351421131025891e-05, + "loss": 0.4727, + "step": 1590 + }, + { + "epoch": 0.3184, + "learning_rate": 1.2364989970237238e-05, + "loss": 0.2634, + "step": 1592 + }, + { + "epoch": 0.3188, + "learning_rate": 1.237855419877962e-05, + "loss": 0.449, + "step": 1594 + }, + { + "epoch": 0.3192, + "learning_rate": 1.23921137902089e-05, + "loss": 0.2159, + "step": 1596 + }, + { + "epoch": 0.3196, + "learning_rate": 1.2405668718089906e-05, + "loss": 0.3714, + "step": 1598 + }, + { + "epoch": 0.32, + "learning_rate": 1.241921895599668e-05, + "loss": 0.1906, + "step": 1600 + }, + { + "epoch": 0.3204, + "learning_rate": 1.2432764477512295e-05, + "loss": 0.2363, + "step": 1602 + }, + { + "epoch": 0.3208, + "learning_rate": 1.2446305256229076e-05, + "loss": 0.2566, + "step": 1604 + }, + { + "epoch": 0.3212, + "learning_rate": 1.2459841265748582e-05, + "loss": 0.2422, + "step": 1606 + }, + { + "epoch": 0.3216, + "learning_rate": 1.2473372479681653e-05, + "loss": 0.4343, + "step": 1608 + }, + { + "epoch": 0.322, + "learning_rate": 1.2486898871648547e-05, + "loss": 0.5041, + "step": 1610 + }, + { + "epoch": 0.3224, + "learning_rate": 1.2500420415278822e-05, + "loss": 0.3161, + "step": 1612 + }, + { + "epoch": 0.3228, + "learning_rate": 1.2513937084211546e-05, + "loss": 0.3555, + "step": 1614 + }, + { + "epoch": 0.3232, + "learning_rate": 1.2527448852095292e-05, + "loss": 0.5625, + "step": 1616 + }, + { + "epoch": 0.3236, + "learning_rate": 1.2540955692588167e-05, + "loss": 0.1647, + "step": 1618 + }, + { + "epoch": 0.324, + "learning_rate": 1.2554457579357902e-05, + "loss": 0.4411, + "step": 1620 + }, + { + "epoch": 0.3244, + "learning_rate": 1.2567954486081873e-05, + "loss": 0.2501, + "step": 1622 + }, + { + "epoch": 0.3248, + "learning_rate": 1.2581446386447171e-05, + "loss": 0.3227, + "step": 1624 + }, + { + "epoch": 0.3252, + "learning_rate": 1.2594933254150647e-05, + "loss": 0.5015, + "step": 1626 + }, + { + "epoch": 0.3256, + "learning_rate": 1.2608415062898963e-05, + "loss": 0.2546, + "step": 1628 + }, + { + "epoch": 0.326, + "learning_rate": 1.262189178640864e-05, + "loss": 0.3265, + "step": 1630 + }, + { + "epoch": 0.3264, + "learning_rate": 1.2635363398406133e-05, + "loss": 0.2006, + "step": 1632 + }, + { + "epoch": 0.3268, + "learning_rate": 1.2648829872627797e-05, + "loss": 0.3451, + "step": 1634 + }, + { + "epoch": 0.3272, + "learning_rate": 1.266229118282012e-05, + "loss": 0.2554, + "step": 1636 + }, + { + "epoch": 0.3276, + "learning_rate": 1.2675747302739516e-05, + "loss": 0.3144, + "step": 1638 + }, + { + "epoch": 0.328, + "learning_rate": 1.2689198206152644e-05, + "loss": 0.2635, + "step": 1640 + }, + { + "epoch": 0.3284, + "learning_rate": 1.2702643866836281e-05, + "loss": 0.3291, + "step": 1642 + }, + { + "epoch": 0.3288, + "learning_rate": 1.2716084258577373e-05, + "loss": 1.8399, + "step": 1644 + }, + { + "epoch": 0.3292, + "learning_rate": 1.2729519355173254e-05, + "loss": 0.7335, + "step": 1646 + }, + { + "epoch": 0.3296, + "learning_rate": 1.2742949130431468e-05, + "loss": 1.0885, + "step": 1648 + }, + { + "epoch": 0.33, + "learning_rate": 1.2756373558169992e-05, + "loss": 0.2025, + "step": 1650 + }, + { + "epoch": 0.3304, + "learning_rate": 1.2769792612217224e-05, + "loss": 0.3502, + "step": 1652 + }, + { + "epoch": 0.3308, + "learning_rate": 1.2783206266412028e-05, + "loss": 0.382, + "step": 1654 + }, + { + "epoch": 0.3312, + "learning_rate": 1.2796614494603795e-05, + "loss": 0.369, + "step": 1656 + }, + { + "epoch": 0.3316, + "learning_rate": 1.2810017270652508e-05, + "loss": 0.2342, + "step": 1658 + }, + { + "epoch": 0.332, + "learning_rate": 1.282341456842876e-05, + "loss": 0.3294, + "step": 1660 + }, + { + "epoch": 0.3324, + "learning_rate": 1.283680636181384e-05, + "loss": 0.636, + "step": 1662 + }, + { + "epoch": 0.3328, + "learning_rate": 1.2850192624699756e-05, + "loss": 0.236, + "step": 1664 + }, + { + "epoch": 0.3332, + "learning_rate": 1.2863573330989308e-05, + "loss": 0.4658, + "step": 1666 + }, + { + "epoch": 0.3336, + "learning_rate": 1.2876948454596122e-05, + "loss": 0.2648, + "step": 1668 + }, + { + "epoch": 0.334, + "learning_rate": 1.2890317969444708e-05, + "loss": 0.3963, + "step": 1670 + }, + { + "epoch": 0.3344, + "learning_rate": 1.2903681849470535e-05, + "loss": 0.7348, + "step": 1672 + }, + { + "epoch": 0.3348, + "learning_rate": 1.291704006861998e-05, + "loss": 0.2721, + "step": 1674 + }, + { + "epoch": 0.3352, + "learning_rate": 1.2930392600850565e-05, + "loss": 0.5367, + "step": 1676 + }, + { + "epoch": 0.3356, + "learning_rate": 1.2943739420130843e-05, + "loss": 0.3246, + "step": 1678 + }, + { + "epoch": 0.336, + "learning_rate": 1.2957080500440455e-05, + "loss": 0.6991, + "step": 1680 + }, + { + "epoch": 0.3364, + "learning_rate": 1.2970415815770353e-05, + "loss": 0.2867, + "step": 1682 + }, + { + "epoch": 0.3368, + "learning_rate": 1.2983745340122589e-05, + "loss": 0.2505, + "step": 1684 + }, + { + "epoch": 0.3372, + "learning_rate": 1.299706904751064e-05, + "loss": 0.3792, + "step": 1686 + }, + { + "epoch": 0.3376, + "learning_rate": 1.3010386911959205e-05, + "loss": 0.3496, + "step": 1688 + }, + { + "epoch": 0.338, + "learning_rate": 1.3023698907504447e-05, + "loss": 0.406, + "step": 1690 + }, + { + "epoch": 0.3384, + "learning_rate": 1.3037005008193944e-05, + "loss": 0.2106, + "step": 1692 + }, + { + "epoch": 0.3388, + "learning_rate": 1.3050305188086757e-05, + "loss": 0.3277, + "step": 1694 + }, + { + "epoch": 0.3392, + "learning_rate": 1.3063599421253556e-05, + "loss": 0.4056, + "step": 1696 + }, + { + "epoch": 0.3396, + "learning_rate": 1.3076887681776504e-05, + "loss": 0.3596, + "step": 1698 + }, + { + "epoch": 0.34, + "learning_rate": 1.309016994374947e-05, + "loss": 0.4409, + "step": 1700 + }, + { + "epoch": 0.3404, + "learning_rate": 1.310344618127801e-05, + "loss": 0.2847, + "step": 1702 + }, + { + "epoch": 0.3408, + "learning_rate": 1.3116716368479415e-05, + "loss": 0.2864, + "step": 1704 + }, + { + "epoch": 0.3412, + "learning_rate": 1.3129980479482776e-05, + "loss": 0.2763, + "step": 1706 + }, + { + "epoch": 0.3416, + "learning_rate": 1.3143238488429049e-05, + "loss": 1.1652, + "step": 1708 + }, + { + "epoch": 0.342, + "learning_rate": 1.3156490369471018e-05, + "loss": 0.4035, + "step": 1710 + }, + { + "epoch": 0.3424, + "learning_rate": 1.316973609677351e-05, + "loss": 0.2387, + "step": 1712 + }, + { + "epoch": 0.3428, + "learning_rate": 1.3182975644513286e-05, + "loss": 0.4479, + "step": 1714 + }, + { + "epoch": 0.3432, + "learning_rate": 1.319620898687917e-05, + "loss": 0.8652, + "step": 1716 + }, + { + "epoch": 0.3436, + "learning_rate": 1.3209436098072102e-05, + "loss": 0.218, + "step": 1718 + }, + { + "epoch": 0.344, + "learning_rate": 1.32226569523051e-05, + "loss": 0.2029, + "step": 1720 + }, + { + "epoch": 0.3444, + "learning_rate": 1.3235871523803501e-05, + "loss": 1.029, + "step": 1722 + }, + { + "epoch": 0.3448, + "learning_rate": 1.324907978680475e-05, + "loss": 0.2859, + "step": 1724 + }, + { + "epoch": 0.3452, + "learning_rate": 1.3262281715558738e-05, + "loss": 0.2413, + "step": 1726 + }, + { + "epoch": 0.3456, + "learning_rate": 1.3275477284327572e-05, + "loss": 0.2727, + "step": 1728 + }, + { + "epoch": 0.346, + "learning_rate": 1.3288666467385815e-05, + "loss": 0.4255, + "step": 1730 + }, + { + "epoch": 0.3464, + "learning_rate": 1.3301849239020537e-05, + "loss": 0.3515, + "step": 1732 + }, + { + "epoch": 0.3468, + "learning_rate": 1.3315025573531193e-05, + "loss": 0.4142, + "step": 1734 + }, + { + "epoch": 0.3472, + "learning_rate": 1.3328195445229865e-05, + "loss": 0.5722, + "step": 1736 + }, + { + "epoch": 0.3476, + "learning_rate": 1.3341358828441214e-05, + "loss": 0.3139, + "step": 1738 + }, + { + "epoch": 0.348, + "learning_rate": 1.3354515697502548e-05, + "loss": 0.3476, + "step": 1740 + }, + { + "epoch": 0.3484, + "learning_rate": 1.3367666026763879e-05, + "loss": 0.2781, + "step": 1742 + }, + { + "epoch": 0.3488, + "learning_rate": 1.338080979058797e-05, + "loss": 0.3358, + "step": 1744 + }, + { + "epoch": 0.3492, + "learning_rate": 1.3393946963350378e-05, + "loss": 0.4356, + "step": 1746 + }, + { + "epoch": 0.3496, + "learning_rate": 1.340707751943951e-05, + "loss": 0.214, + "step": 1748 + }, + { + "epoch": 0.35, + "learning_rate": 1.3420201433256682e-05, + "loss": 0.3111, + "step": 1750 + }, + { + "epoch": 0.3504, + "learning_rate": 1.3433318679216145e-05, + "loss": 0.3076, + "step": 1752 + }, + { + "epoch": 0.3508, + "learning_rate": 1.3446429231745162e-05, + "loss": 0.3227, + "step": 1754 + }, + { + "epoch": 0.3512, + "learning_rate": 1.3459533065284039e-05, + "loss": 0.3104, + "step": 1756 + }, + { + "epoch": 0.3516, + "learning_rate": 1.3472630154286197e-05, + "loss": 0.2995, + "step": 1758 + }, + { + "epoch": 0.352, + "learning_rate": 1.348572047321814e-05, + "loss": 0.3618, + "step": 1760 + }, + { + "epoch": 0.3524, + "learning_rate": 1.3498803996559692e-05, + "loss": 0.5547, + "step": 1762 + }, + { + "epoch": 0.3528, + "learning_rate": 1.3511880698803803e-05, + "loss": 0.2593, + "step": 1764 + }, + { + "epoch": 0.3532, + "learning_rate": 1.3524950554456773e-05, + "loss": 0.5573, + "step": 1766 + }, + { + "epoch": 0.3536, + "learning_rate": 1.3538013538038296e-05, + "loss": 0.3563, + "step": 1768 + }, + { + "epoch": 0.354, + "learning_rate": 1.3551069624081356e-05, + "loss": 0.5477, + "step": 1770 + }, + { + "epoch": 0.3544, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.5512, + "step": 1772 + }, + { + "epoch": 0.3548, + "learning_rate": 1.3577161001751692e-05, + "loss": 0.2617, + "step": 1774 + }, + { + "epoch": 0.3552, + "learning_rate": 1.3590196242512461e-05, + "loss": 0.7771, + "step": 1776 + }, + { + "epoch": 0.3556, + "learning_rate": 1.3603224484001944e-05, + "loss": 0.2697, + "step": 1778 + }, + { + "epoch": 0.356, + "learning_rate": 1.361624570082092e-05, + "loss": 1.3417, + "step": 1780 + }, + { + "epoch": 0.3564, + "learning_rate": 1.362925986758386e-05, + "loss": 0.7912, + "step": 1782 + }, + { + "epoch": 0.3568, + "learning_rate": 1.364226695891898e-05, + "loss": 0.3164, + "step": 1784 + }, + { + "epoch": 0.3572, + "learning_rate": 1.3655266949468287e-05, + "loss": 0.7582, + "step": 1786 + }, + { + "epoch": 0.3576, + "learning_rate": 1.3668259813887637e-05, + "loss": 0.6588, + "step": 1788 + }, + { + "epoch": 0.358, + "learning_rate": 1.3681245526846773e-05, + "loss": 0.1786, + "step": 1790 + }, + { + "epoch": 0.3584, + "learning_rate": 1.3694224063029386e-05, + "loss": 0.3206, + "step": 1792 + }, + { + "epoch": 0.3588, + "learning_rate": 1.3707195397133176e-05, + "loss": 1.1091, + "step": 1794 + }, + { + "epoch": 0.3592, + "learning_rate": 1.3720159503869806e-05, + "loss": 0.4157, + "step": 1796 + }, + { + "epoch": 0.3596, + "learning_rate": 1.3733116357965156e-05, + "loss": 0.6235, + "step": 1798 + }, + { + "epoch": 0.36, + "learning_rate": 1.374606593415911e-05, + "loss": 0.2646, + "step": 1800 + }, + { + "epoch": 0.3604, + "learning_rate": 1.3759008207205855e-05, + "loss": 0.3316, + "step": 1802 + }, + { + "epoch": 0.3608, + "learning_rate": 1.377194315187377e-05, + "loss": 0.3196, + "step": 1804 + }, + { + "epoch": 0.3612, + "learning_rate": 1.3784870742945468e-05, + "loss": 0.4325, + "step": 1806 + }, + { + "epoch": 0.3616, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.3826, + "step": 1808 + }, + { + "epoch": 0.362, + "learning_rate": 1.3810703763502744e-05, + "loss": 0.4145, + "step": 1810 + }, + { + "epoch": 0.3624, + "learning_rate": 1.3823609142625492e-05, + "loss": 0.203, + "step": 1812 + }, + { + "epoch": 0.3628, + "learning_rate": 1.3836507067426563e-05, + "loss": 0.3604, + "step": 1814 + }, + { + "epoch": 0.3632, + "learning_rate": 1.3849397512760793e-05, + "loss": 0.5699, + "step": 1816 + }, + { + "epoch": 0.3636, + "learning_rate": 1.38622804534976e-05, + "loss": 0.5671, + "step": 1818 + }, + { + "epoch": 0.364, + "learning_rate": 1.3875155864521027e-05, + "loss": 0.4769, + "step": 1820 + }, + { + "epoch": 0.3644, + "learning_rate": 1.3888023720729806e-05, + "loss": 0.4216, + "step": 1822 + }, + { + "epoch": 0.3648, + "learning_rate": 1.3900883997037393e-05, + "loss": 0.4301, + "step": 1824 + }, + { + "epoch": 0.3652, + "learning_rate": 1.391373666837202e-05, + "loss": 0.2884, + "step": 1826 + }, + { + "epoch": 0.3656, + "learning_rate": 1.3926581709676746e-05, + "loss": 0.5972, + "step": 1828 + }, + { + "epoch": 0.366, + "learning_rate": 1.3939419095909506e-05, + "loss": 0.334, + "step": 1830 + }, + { + "epoch": 0.3664, + "learning_rate": 1.3952248802043158e-05, + "loss": 0.2576, + "step": 1832 + }, + { + "epoch": 0.3668, + "learning_rate": 1.396507080306555e-05, + "loss": 0.2223, + "step": 1834 + }, + { + "epoch": 0.3672, + "learning_rate": 1.397788507397949e-05, + "loss": 0.628, + "step": 1836 + }, + { + "epoch": 0.3676, + "learning_rate": 1.3990691589802943e-05, + "loss": 0.2766, + "step": 1838 + }, + { + "epoch": 0.368, + "learning_rate": 1.4003490325568956e-05, + "loss": 0.3813, + "step": 1840 + }, + { + "epoch": 0.3684, + "learning_rate": 1.4016281256325688e-05, + "loss": 0.3667, + "step": 1842 + }, + { + "epoch": 0.3688, + "learning_rate": 1.4029064357136632e-05, + "loss": 0.888, + "step": 1844 + }, + { + "epoch": 0.3692, + "learning_rate": 1.4041839603080411e-05, + "loss": 0.4, + "step": 1846 + }, + { + "epoch": 0.3696, + "learning_rate": 1.4054606969251096e-05, + "loss": 0.441, + "step": 1848 + }, + { + "epoch": 0.37, + "learning_rate": 1.4067366430758004e-05, + "loss": 0.2491, + "step": 1850 + }, + { + "epoch": 0.3704, + "learning_rate": 1.4080117962725929e-05, + "loss": 0.4036, + "step": 1852 + }, + { + "epoch": 0.3708, + "learning_rate": 1.4092861540295107e-05, + "loss": 0.2898, + "step": 1854 + }, + { + "epoch": 0.3712, + "learning_rate": 1.4105597138621281e-05, + "loss": 0.3645, + "step": 1856 + }, + { + "epoch": 0.3716, + "learning_rate": 1.411832473287575e-05, + "loss": 0.3966, + "step": 1858 + }, + { + "epoch": 0.372, + "learning_rate": 1.4131044298245416e-05, + "loss": 0.3166, + "step": 1860 + }, + { + "epoch": 0.3724, + "learning_rate": 1.414375580993284e-05, + "loss": 0.3688, + "step": 1862 + }, + { + "epoch": 0.3728, + "learning_rate": 1.4156459243156275e-05, + "loss": 0.2432, + "step": 1864 + }, + { + "epoch": 0.3732, + "learning_rate": 1.416915457314973e-05, + "loss": 0.2888, + "step": 1866 + }, + { + "epoch": 0.3736, + "learning_rate": 1.418184177516301e-05, + "loss": 0.388, + "step": 1868 + }, + { + "epoch": 0.374, + "learning_rate": 1.4194520824461782e-05, + "loss": 0.4759, + "step": 1870 + }, + { + "epoch": 0.3744, + "learning_rate": 1.420719169632754e-05, + "loss": 0.729, + "step": 1872 + }, + { + "epoch": 0.3748, + "learning_rate": 1.4219854366057821e-05, + "loss": 0.6791, + "step": 1874 + }, + { + "epoch": 0.3752, + "learning_rate": 1.4232508808966085e-05, + "loss": 0.5223, + "step": 1876 + }, + { + "epoch": 0.3756, + "learning_rate": 1.424515500038185e-05, + "loss": 0.2895, + "step": 1878 + }, + { + "epoch": 0.376, + "learning_rate": 1.4257792915650735e-05, + "loss": 0.4267, + "step": 1880 + }, + { + "epoch": 0.3764, + "learning_rate": 1.4270422530134425e-05, + "loss": 0.1615, + "step": 1882 + }, + { + "epoch": 0.3768, + "learning_rate": 1.4283043819210906e-05, + "loss": 0.4619, + "step": 1884 + }, + { + "epoch": 0.3772, + "learning_rate": 1.4295656758274288e-05, + "loss": 0.301, + "step": 1886 + }, + { + "epoch": 0.3776, + "learning_rate": 1.430826132273499e-05, + "loss": 0.309, + "step": 1888 + }, + { + "epoch": 0.378, + "learning_rate": 1.4320857488019826e-05, + "loss": 0.4885, + "step": 1890 + }, + { + "epoch": 0.3784, + "learning_rate": 1.4333445229571857e-05, + "loss": 0.1768, + "step": 1892 + }, + { + "epoch": 0.3788, + "learning_rate": 1.4346024522850704e-05, + "loss": 0.3811, + "step": 1894 + }, + { + "epoch": 0.3792, + "learning_rate": 1.4358595343332342e-05, + "loss": 0.6493, + "step": 1896 + }, + { + "epoch": 0.3796, + "learning_rate": 1.437115766650933e-05, + "loss": 0.623, + "step": 1898 + }, + { + "epoch": 0.38, + "learning_rate": 1.4383711467890772e-05, + "loss": 0.1717, + "step": 1900 + }, + { + "epoch": 0.3804, + "learning_rate": 1.4396256723002398e-05, + "loss": 0.2049, + "step": 1902 + }, + { + "epoch": 0.3808, + "learning_rate": 1.4408793407386584e-05, + "loss": 0.2326, + "step": 1904 + }, + { + "epoch": 0.3812, + "learning_rate": 1.4421321496602423e-05, + "loss": 0.2033, + "step": 1906 + }, + { + "epoch": 0.3816, + "learning_rate": 1.4433840966225767e-05, + "loss": 0.3281, + "step": 1908 + }, + { + "epoch": 0.382, + "learning_rate": 1.444635179184927e-05, + "loss": 0.4581, + "step": 1910 + }, + { + "epoch": 0.3824, + "learning_rate": 1.4458853949082434e-05, + "loss": 0.9499, + "step": 1912 + }, + { + "epoch": 0.3828, + "learning_rate": 1.4471347413551665e-05, + "loss": 0.3208, + "step": 1914 + }, + { + "epoch": 0.3832, + "learning_rate": 1.4483832160900332e-05, + "loss": 0.5199, + "step": 1916 + }, + { + "epoch": 0.3836, + "learning_rate": 1.4496308166788731e-05, + "loss": 0.6524, + "step": 1918 + }, + { + "epoch": 0.384, + "learning_rate": 1.4508775406894315e-05, + "loss": 0.1941, + "step": 1920 + }, + { + "epoch": 0.3844, + "learning_rate": 1.4521233856911499e-05, + "loss": 0.5848, + "step": 1922 + }, + { + "epoch": 0.3848, + "learning_rate": 1.4533683492551942e-05, + "loss": 0.2831, + "step": 1924 + }, + { + "epoch": 0.3852, + "learning_rate": 1.4546124289544446e-05, + "loss": 0.1514, + "step": 1926 + }, + { + "epoch": 0.3856, + "learning_rate": 1.4558556223634988e-05, + "loss": 0.379, + "step": 1928 + }, + { + "epoch": 0.386, + "learning_rate": 1.4570979270586944e-05, + "loss": 0.9767, + "step": 1930 + }, + { + "epoch": 0.3864, + "learning_rate": 1.4583393406180886e-05, + "loss": 0.3891, + "step": 1932 + }, + { + "epoch": 0.3868, + "learning_rate": 1.4595798606214882e-05, + "loss": 0.2937, + "step": 1934 + }, + { + "epoch": 0.3872, + "learning_rate": 1.460819484650431e-05, + "loss": 0.3543, + "step": 1936 + }, + { + "epoch": 0.3876, + "learning_rate": 1.4620582102882086e-05, + "loss": 0.3585, + "step": 1938 + }, + { + "epoch": 0.388, + "learning_rate": 1.4632960351198618e-05, + "loss": 0.4539, + "step": 1940 + }, + { + "epoch": 0.3884, + "learning_rate": 1.4645329567321875e-05, + "loss": 0.2152, + "step": 1942 + }, + { + "epoch": 0.3888, + "learning_rate": 1.4657689727137441e-05, + "loss": 0.2859, + "step": 1944 + }, + { + "epoch": 0.3892, + "learning_rate": 1.4670040806548551e-05, + "loss": 0.4946, + "step": 1946 + }, + { + "epoch": 0.3896, + "learning_rate": 1.468238278147614e-05, + "loss": 0.4176, + "step": 1948 + }, + { + "epoch": 0.39, + "learning_rate": 1.4694715627858904e-05, + "loss": 0.3109, + "step": 1950 + }, + { + "epoch": 0.3904, + "learning_rate": 1.470703932165332e-05, + "loss": 0.5821, + "step": 1952 + }, + { + "epoch": 0.3908, + "learning_rate": 1.471935383883372e-05, + "loss": 0.5339, + "step": 1954 + }, + { + "epoch": 0.3912, + "learning_rate": 1.4731659155392339e-05, + "loss": 0.208, + "step": 1956 + }, + { + "epoch": 0.3916, + "learning_rate": 1.4743955247339286e-05, + "loss": 0.3648, + "step": 1958 + }, + { + "epoch": 0.392, + "learning_rate": 1.4756242090702744e-05, + "loss": 0.377, + "step": 1960 + }, + { + "epoch": 0.3924, + "learning_rate": 1.476851966152887e-05, + "loss": 0.3574, + "step": 1962 + }, + { + "epoch": 0.3928, + "learning_rate": 1.4780787935881913e-05, + "loss": 0.2881, + "step": 1964 + }, + { + "epoch": 0.3932, + "learning_rate": 1.4793046889844255e-05, + "loss": 0.3261, + "step": 1966 + }, + { + "epoch": 0.3936, + "learning_rate": 1.4805296499516397e-05, + "loss": 0.5676, + "step": 1968 + }, + { + "epoch": 0.394, + "learning_rate": 1.4817536741017155e-05, + "loss": 0.3629, + "step": 1970 + }, + { + "epoch": 0.3944, + "learning_rate": 1.482976759048351e-05, + "loss": 0.2987, + "step": 1972 + }, + { + "epoch": 0.3948, + "learning_rate": 1.4841989024070809e-05, + "loss": 0.573, + "step": 1974 + }, + { + "epoch": 0.3952, + "learning_rate": 1.485420101795274e-05, + "loss": 0.1936, + "step": 1976 + }, + { + "epoch": 0.3956, + "learning_rate": 1.4866403548321385e-05, + "loss": 0.6765, + "step": 1978 + }, + { + "epoch": 0.396, + "learning_rate": 1.4878596591387327e-05, + "loss": 0.3789, + "step": 1980 + }, + { + "epoch": 0.3964, + "learning_rate": 1.4890780123379563e-05, + "loss": 0.4383, + "step": 1982 + }, + { + "epoch": 0.3968, + "learning_rate": 1.4902954120545686e-05, + "loss": 0.4974, + "step": 1984 + }, + { + "epoch": 0.3972, + "learning_rate": 1.491511855915187e-05, + "loss": 0.3304, + "step": 1986 + }, + { + "epoch": 0.3976, + "learning_rate": 1.4927273415482913e-05, + "loss": 0.237, + "step": 1988 + }, + { + "epoch": 0.398, + "learning_rate": 1.4939418665842307e-05, + "loss": 0.5226, + "step": 1990 + }, + { + "epoch": 0.3984, + "learning_rate": 1.4951554286552261e-05, + "loss": 0.3, + "step": 1992 + }, + { + "epoch": 0.3988, + "learning_rate": 1.4963680253953763e-05, + "loss": 0.5579, + "step": 1994 + }, + { + "epoch": 0.3992, + "learning_rate": 1.4975796544406617e-05, + "loss": 0.3143, + "step": 1996 + }, + { + "epoch": 0.3996, + "learning_rate": 1.49879031342895e-05, + "loss": 0.307, + "step": 1998 + }, + { + "epoch": 0.4, + "learning_rate": 1.4999999999999992e-05, + "loss": 0.3944, + "step": 2000 + }, + { + "epoch": 0.4004, + "learning_rate": 1.501208711795465e-05, + "loss": 0.3738, + "step": 2002 + }, + { + "epoch": 0.4008, + "learning_rate": 1.502416446458897e-05, + "loss": 0.4013, + "step": 2004 + }, + { + "epoch": 0.4012, + "learning_rate": 1.5036232016357613e-05, + "loss": 0.32, + "step": 2006 + }, + { + "epoch": 0.4016, + "learning_rate": 1.5048289749734206e-05, + "loss": 0.4505, + "step": 2008 + }, + { + "epoch": 0.402, + "learning_rate": 1.5060337641211642e-05, + "loss": 0.2053, + "step": 2010 + }, + { + "epoch": 0.4024, + "learning_rate": 1.5072375667301895e-05, + "loss": 0.1435, + "step": 2012 + }, + { + "epoch": 0.4028, + "learning_rate": 1.5084403804536214e-05, + "loss": 0.4951, + "step": 2014 + }, + { + "epoch": 0.4032, + "learning_rate": 1.5096422029465178e-05, + "loss": 0.4172, + "step": 2016 + }, + { + "epoch": 0.4036, + "learning_rate": 1.5108430318658597e-05, + "loss": 0.658, + "step": 2018 + }, + { + "epoch": 0.404, + "learning_rate": 1.5120428648705714e-05, + "loss": 0.3053, + "step": 2020 + }, + { + "epoch": 0.4044, + "learning_rate": 1.513241699621517e-05, + "loss": 0.525, + "step": 2022 + }, + { + "epoch": 0.4048, + "learning_rate": 1.5144395337815064e-05, + "loss": 0.5678, + "step": 2024 + }, + { + "epoch": 0.4052, + "learning_rate": 1.5156363650153008e-05, + "loss": 0.2781, + "step": 2026 + }, + { + "epoch": 0.4056, + "learning_rate": 1.5168321909896166e-05, + "loss": 0.1991, + "step": 2028 + }, + { + "epoch": 0.406, + "learning_rate": 1.51802700937313e-05, + "loss": 0.4295, + "step": 2030 + }, + { + "epoch": 0.4064, + "learning_rate": 1.5192208178364808e-05, + "loss": 0.4525, + "step": 2032 + }, + { + "epoch": 0.4068, + "learning_rate": 1.5204136140522792e-05, + "loss": 0.9057, + "step": 2034 + }, + { + "epoch": 0.4072, + "learning_rate": 1.521605395695107e-05, + "loss": 0.3233, + "step": 2036 + }, + { + "epoch": 0.4076, + "learning_rate": 1.522796160441526e-05, + "loss": 0.2702, + "step": 2038 + }, + { + "epoch": 0.408, + "learning_rate": 1.5239859059700784e-05, + "loss": 0.4625, + "step": 2040 + }, + { + "epoch": 0.4084, + "learning_rate": 1.5251746299612964e-05, + "loss": 0.2028, + "step": 2042 + }, + { + "epoch": 0.4088, + "learning_rate": 1.526362330097697e-05, + "loss": 0.3898, + "step": 2044 + }, + { + "epoch": 0.4092, + "learning_rate": 1.5275490040638044e-05, + "loss": 0.1968, + "step": 2046 + }, + { + "epoch": 0.4096, + "learning_rate": 1.5287346495461322e-05, + "loss": 0.8213, + "step": 2048 + }, + { + "epoch": 0.41, + "learning_rate": 1.529919264233204e-05, + "loss": 0.4901, + "step": 2050 + }, + { + "epoch": 0.4104, + "learning_rate": 1.531102845815557e-05, + "loss": 0.2451, + "step": 2052 + }, + { + "epoch": 0.4108, + "learning_rate": 1.5322853919857327e-05, + "loss": 0.3188, + "step": 2054 + }, + { + "epoch": 0.4112, + "learning_rate": 1.5334669004383025e-05, + "loss": 0.3506, + "step": 2056 + }, + { + "epoch": 0.4116, + "learning_rate": 1.5346473688698514e-05, + "loss": 0.2336, + "step": 2058 + }, + { + "epoch": 0.412, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.2218, + "step": 2060 + }, + { + "epoch": 0.4124, + "learning_rate": 1.537005176466387e-05, + "loss": 0.4653, + "step": 2062 + }, + { + "epoch": 0.4128, + "learning_rate": 1.5381825110347072e-05, + "loss": 1.3487, + "step": 2064 + }, + { + "epoch": 0.4132, + "learning_rate": 1.539358796388683e-05, + "loss": 0.7694, + "step": 2066 + }, + { + "epoch": 0.4136, + "learning_rate": 1.540534030235087e-05, + "loss": 0.2363, + "step": 2068 + }, + { + "epoch": 0.414, + "learning_rate": 1.5417082102827397e-05, + "loss": 0.173, + "step": 2070 + }, + { + "epoch": 0.4144, + "learning_rate": 1.542881334242517e-05, + "loss": 0.5383, + "step": 2072 + }, + { + "epoch": 0.4148, + "learning_rate": 1.5440533998273542e-05, + "loss": 1.0686, + "step": 2074 + }, + { + "epoch": 0.4152, + "learning_rate": 1.5452244047522493e-05, + "loss": 0.3303, + "step": 2076 + }, + { + "epoch": 0.4156, + "learning_rate": 1.54639434673427e-05, + "loss": 0.258, + "step": 2078 + }, + { + "epoch": 0.416, + "learning_rate": 1.5475632234925495e-05, + "loss": 0.3545, + "step": 2080 + }, + { + "epoch": 0.4164, + "learning_rate": 1.548731032748309e-05, + "loss": 0.3901, + "step": 2082 + }, + { + "epoch": 0.4168, + "learning_rate": 1.5498977722248388e-05, + "loss": 0.4711, + "step": 2084 + }, + { + "epoch": 0.4172, + "learning_rate": 1.551063439647525e-05, + "loss": 0.5243, + "step": 2086 + }, + { + "epoch": 0.4176, + "learning_rate": 1.552228032743839e-05, + "loss": 0.2197, + "step": 2088 + }, + { + "epoch": 0.418, + "learning_rate": 1.553391549243343e-05, + "loss": 0.2571, + "step": 2090 + }, + { + "epoch": 0.4184, + "learning_rate": 1.5545539868777075e-05, + "loss": 0.3383, + "step": 2092 + }, + { + "epoch": 0.4188, + "learning_rate": 1.5557153433806954e-05, + "loss": 0.5351, + "step": 2094 + }, + { + "epoch": 0.4192, + "learning_rate": 1.556875616488188e-05, + "loss": 0.1115, + "step": 2096 + }, + { + "epoch": 0.4196, + "learning_rate": 1.55803480393817e-05, + "loss": 0.6507, + "step": 2098 + }, + { + "epoch": 0.42, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.1762, + "step": 2100 + }, + { + "epoch": 0.4204, + "learning_rate": 1.5603499128281447e-05, + "loss": 0.3471, + "step": 2102 + }, + { + "epoch": 0.4208, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.2303, + "step": 2104 + }, + { + "epoch": 0.4212, + "learning_rate": 1.5626606519969366e-05, + "loss": 0.5319, + "step": 2106 + }, + { + "epoch": 0.4216, + "learning_rate": 1.5638143773034268e-05, + "loss": 0.3562, + "step": 2108 + }, + { + "epoch": 0.422, + "learning_rate": 1.5649670034249376e-05, + "loss": 0.2375, + "step": 2110 + }, + { + "epoch": 0.4224, + "learning_rate": 1.5661185281143663e-05, + "loss": 0.2021, + "step": 2112 + }, + { + "epoch": 0.4228, + "learning_rate": 1.5672689491267562e-05, + "loss": 0.6684, + "step": 2114 + }, + { + "epoch": 0.4232, + "learning_rate": 1.5684182642193024e-05, + "loss": 0.2059, + "step": 2116 + }, + { + "epoch": 0.4236, + "learning_rate": 1.5695664711513582e-05, + "loss": 0.1567, + "step": 2118 + }, + { + "epoch": 0.424, + "learning_rate": 1.5707135676844312e-05, + "loss": 0.3879, + "step": 2120 + }, + { + "epoch": 0.4244, + "learning_rate": 1.5718595515822016e-05, + "loss": 1.0412, + "step": 2122 + }, + { + "epoch": 0.4248, + "learning_rate": 1.5730044206105146e-05, + "loss": 0.557, + "step": 2124 + }, + { + "epoch": 0.4252, + "learning_rate": 1.574148172537389e-05, + "loss": 0.3364, + "step": 2126 + }, + { + "epoch": 0.4256, + "learning_rate": 1.5752908051330232e-05, + "loss": 0.3997, + "step": 2128 + }, + { + "epoch": 0.426, + "learning_rate": 1.5764323161697923e-05, + "loss": 0.4115, + "step": 2130 + }, + { + "epoch": 0.4264, + "learning_rate": 1.577572703422268e-05, + "loss": 0.5489, + "step": 2132 + }, + { + "epoch": 0.4268, + "learning_rate": 1.5787119646672025e-05, + "loss": 0.9006, + "step": 2134 + }, + { + "epoch": 0.4272, + "learning_rate": 1.579850097683548e-05, + "loss": 0.1823, + "step": 2136 + }, + { + "epoch": 0.4276, + "learning_rate": 1.58098710025246e-05, + "loss": 0.2523, + "step": 2138 + }, + { + "epoch": 0.428, + "learning_rate": 1.582122970157288e-05, + "loss": 0.2921, + "step": 2140 + }, + { + "epoch": 0.4284, + "learning_rate": 1.5832577051836016e-05, + "loss": 0.3707, + "step": 2142 + }, + { + "epoch": 0.4288, + "learning_rate": 1.5843913031191722e-05, + "loss": 0.4968, + "step": 2144 + }, + { + "epoch": 0.4292, + "learning_rate": 1.585523761753994e-05, + "loss": 0.2209, + "step": 2146 + }, + { + "epoch": 0.4296, + "learning_rate": 1.586655078880281e-05, + "loss": 0.4668, + "step": 2148 + }, + { + "epoch": 0.43, + "learning_rate": 1.587785252292473e-05, + "loss": 0.4699, + "step": 2150 + }, + { + "epoch": 0.4304, + "learning_rate": 1.5889142797872383e-05, + "loss": 0.4345, + "step": 2152 + }, + { + "epoch": 0.4308, + "learning_rate": 1.5900421591634806e-05, + "loss": 0.1995, + "step": 2154 + }, + { + "epoch": 0.4312, + "learning_rate": 1.5911688882223415e-05, + "loss": 0.3445, + "step": 2156 + }, + { + "epoch": 0.4316, + "learning_rate": 1.5922944647672044e-05, + "loss": 0.5169, + "step": 2158 + }, + { + "epoch": 0.432, + "learning_rate": 1.5934188866037007e-05, + "loss": 1.9181, + "step": 2160 + }, + { + "epoch": 0.4324, + "learning_rate": 1.5945421515397125e-05, + "loss": 0.2543, + "step": 2162 + }, + { + "epoch": 0.4328, + "learning_rate": 1.5956642573853787e-05, + "loss": 0.3556, + "step": 2164 + }, + { + "epoch": 0.4332, + "learning_rate": 1.5967852019530918e-05, + "loss": 0.3566, + "step": 2166 + }, + { + "epoch": 0.4336, + "learning_rate": 1.5979049830575193e-05, + "loss": 0.6089, + "step": 2168 + }, + { + "epoch": 0.434, + "learning_rate": 1.599023598515585e-05, + "loss": 0.5277, + "step": 2170 + }, + { + "epoch": 0.4344, + "learning_rate": 1.6001410461464945e-05, + "loss": 0.4254, + "step": 2172 + }, + { + "epoch": 0.4348, + "learning_rate": 1.601257323771727e-05, + "loss": 0.4298, + "step": 2174 + }, + { + "epoch": 0.4352, + "learning_rate": 1.6023724292150377e-05, + "loss": 0.1705, + "step": 2176 + }, + { + "epoch": 0.4356, + "learning_rate": 1.6034863603024768e-05, + "loss": 0.3131, + "step": 2178 + }, + { + "epoch": 0.436, + "learning_rate": 1.604599114862375e-05, + "loss": 0.7403, + "step": 2180 + }, + { + "epoch": 0.4364, + "learning_rate": 1.6057106907253614e-05, + "loss": 0.5402, + "step": 2182 + }, + { + "epoch": 0.4368, + "learning_rate": 1.606821085724362e-05, + "loss": 0.437, + "step": 2184 + }, + { + "epoch": 0.4372, + "learning_rate": 1.6079302976946052e-05, + "loss": 0.3781, + "step": 2186 + }, + { + "epoch": 0.4376, + "learning_rate": 1.6090383244736253e-05, + "loss": 0.31, + "step": 2188 + }, + { + "epoch": 0.438, + "learning_rate": 1.6101451639012675e-05, + "loss": 0.4118, + "step": 2190 + }, + { + "epoch": 0.4384, + "learning_rate": 1.6112508138196912e-05, + "loss": 0.2098, + "step": 2192 + }, + { + "epoch": 0.4388, + "learning_rate": 1.6123552720733763e-05, + "loss": 0.2724, + "step": 2194 + }, + { + "epoch": 0.4392, + "learning_rate": 1.613458536509124e-05, + "loss": 0.8093, + "step": 2196 + }, + { + "epoch": 0.4396, + "learning_rate": 1.614560604976064e-05, + "loss": 0.2262, + "step": 2198 + }, + { + "epoch": 0.44, + "learning_rate": 1.615661475325658e-05, + "loss": 0.3455, + "step": 2200 + }, + { + "epoch": 0.4404, + "learning_rate": 1.616761145411702e-05, + "loss": 0.5875, + "step": 2202 + }, + { + "epoch": 0.4408, + "learning_rate": 1.6178596130903352e-05, + "loss": 0.2722, + "step": 2204 + }, + { + "epoch": 0.4412, + "learning_rate": 1.618956876220034e-05, + "loss": 0.3831, + "step": 2206 + }, + { + "epoch": 0.4416, + "learning_rate": 1.620052932661632e-05, + "loss": 0.2698, + "step": 2208 + }, + { + "epoch": 0.442, + "learning_rate": 1.621147780278311e-05, + "loss": 0.3893, + "step": 2210 + }, + { + "epoch": 0.4424, + "learning_rate": 1.6222414169356056e-05, + "loss": 0.2368, + "step": 2212 + }, + { + "epoch": 0.4428, + "learning_rate": 1.6233338405014204e-05, + "loss": 0.235, + "step": 2214 + }, + { + "epoch": 0.4432, + "learning_rate": 1.6244250488460146e-05, + "loss": 0.3385, + "step": 2216 + }, + { + "epoch": 0.4436, + "learning_rate": 1.6255150398420273e-05, + "loss": 0.3234, + "step": 2218 + }, + { + "epoch": 0.444, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.2758, + "step": 2220 + }, + { + "epoch": 0.4444, + "learning_rate": 1.6276913612907005e-05, + "loss": 0.4083, + "step": 2222 + }, + { + "epoch": 0.4448, + "learning_rate": 1.6287776875005127e-05, + "loss": 0.3699, + "step": 2224 + }, + { + "epoch": 0.4452, + "learning_rate": 1.6298627878760488e-05, + "loss": 0.3347, + "step": 2226 + }, + { + "epoch": 0.4456, + "learning_rate": 1.6309466603018497e-05, + "loss": 0.2307, + "step": 2228 + }, + { + "epoch": 0.446, + "learning_rate": 1.6320293026648508e-05, + "loss": 0.148, + "step": 2230 + }, + { + "epoch": 0.4464, + "learning_rate": 1.6331107128543856e-05, + "loss": 0.479, + "step": 2232 + }, + { + "epoch": 0.4468, + "learning_rate": 1.634190888762189e-05, + "loss": 0.4867, + "step": 2234 + }, + { + "epoch": 0.4472, + "learning_rate": 1.635269828282404e-05, + "loss": 0.2868, + "step": 2236 + }, + { + "epoch": 0.4476, + "learning_rate": 1.6363475293115818e-05, + "loss": 0.2859, + "step": 2238 + }, + { + "epoch": 0.448, + "learning_rate": 1.6374239897486905e-05, + "loss": 0.3376, + "step": 2240 + }, + { + "epoch": 0.4484, + "learning_rate": 1.6384992074951118e-05, + "loss": 0.3676, + "step": 2242 + }, + { + "epoch": 0.4488, + "learning_rate": 1.6395731804546575e-05, + "loss": 0.768, + "step": 2244 + }, + { + "epoch": 0.4492, + "learning_rate": 1.640645906533561e-05, + "loss": 0.1921, + "step": 2246 + }, + { + "epoch": 0.4496, + "learning_rate": 1.6417173836404878e-05, + "loss": 0.4381, + "step": 2248 + }, + { + "epoch": 0.45, + "learning_rate": 1.6427876096865397e-05, + "loss": 0.5386, + "step": 2250 + }, + { + "epoch": 0.4504, + "learning_rate": 1.643856582585253e-05, + "loss": 0.2152, + "step": 2252 + }, + { + "epoch": 0.4508, + "learning_rate": 1.6449243002526146e-05, + "loss": 0.3849, + "step": 2254 + }, + { + "epoch": 0.4512, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.1825, + "step": 2256 + }, + { + "epoch": 0.4516, + "learning_rate": 1.6470559615694445e-05, + "loss": 0.1681, + "step": 2258 + }, + { + "epoch": 0.452, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.3049, + "step": 2260 + }, + { + "epoch": 0.4524, + "learning_rate": 1.649182577013905e-05, + "loss": 0.1576, + "step": 2262 + }, + { + "epoch": 0.4528, + "learning_rate": 1.650243987350029e-05, + "loss": 0.2379, + "step": 2264 + }, + { + "epoch": 0.4532, + "learning_rate": 1.6513041300022253e-05, + "loss": 0.2395, + "step": 2266 + }, + { + "epoch": 0.4536, + "learning_rate": 1.652363002903693e-05, + "loss": 0.5379, + "step": 2268 + }, + { + "epoch": 0.454, + "learning_rate": 1.6534206039901054e-05, + "loss": 0.3202, + "step": 2270 + }, + { + "epoch": 0.4544, + "learning_rate": 1.6544769311996146e-05, + "loss": 0.3384, + "step": 2272 + }, + { + "epoch": 0.4548, + "learning_rate": 1.655531982472857e-05, + "loss": 0.4476, + "step": 2274 + }, + { + "epoch": 0.4552, + "learning_rate": 1.656585755752956e-05, + "loss": 0.3494, + "step": 2276 + }, + { + "epoch": 0.4556, + "learning_rate": 1.657638248985527e-05, + "loss": 0.4311, + "step": 2278 + }, + { + "epoch": 0.456, + "learning_rate": 1.65868946011868e-05, + "loss": 0.4375, + "step": 2280 + }, + { + "epoch": 0.4564, + "learning_rate": 1.6597393871030257e-05, + "loss": 0.3618, + "step": 2282 + }, + { + "epoch": 0.4568, + "learning_rate": 1.660788027891677e-05, + "loss": 0.2206, + "step": 2284 + }, + { + "epoch": 0.4572, + "learning_rate": 1.6618353804402573e-05, + "loss": 0.4364, + "step": 2286 + }, + { + "epoch": 0.4576, + "learning_rate": 1.6628814427068944e-05, + "loss": 0.3653, + "step": 2288 + }, + { + "epoch": 0.458, + "learning_rate": 1.663926212652242e-05, + "loss": 0.2758, + "step": 2290 + }, + { + "epoch": 0.4584, + "learning_rate": 1.6649696882394625e-05, + "loss": 0.9123, + "step": 2292 + }, + { + "epoch": 0.4588, + "learning_rate": 1.666011867434252e-05, + "loss": 0.19, + "step": 2294 + }, + { + "epoch": 0.4592, + "learning_rate": 1.667052748204825e-05, + "loss": 1.1338, + "step": 2296 + }, + { + "epoch": 0.4596, + "learning_rate": 1.6680923285219308e-05, + "loss": 0.3372, + "step": 2298 + }, + { + "epoch": 0.46, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.3797, + "step": 2300 + }, + { + "epoch": 0.4604, + "learning_rate": 1.6701675796914273e-05, + "loss": 0.5665, + "step": 2302 + }, + { + "epoch": 0.4608, + "learning_rate": 1.6712032464980094e-05, + "loss": 0.3574, + "step": 2304 + }, + { + "epoch": 0.4612, + "learning_rate": 1.672237604759516e-05, + "loss": 0.2265, + "step": 2306 + }, + { + "epoch": 0.4616, + "learning_rate": 1.6732706524594138e-05, + "loss": 0.7398, + "step": 2308 + }, + { + "epoch": 0.462, + "learning_rate": 1.6743023875837233e-05, + "loss": 0.9553, + "step": 2310 + }, + { + "epoch": 0.4624, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.3291, + "step": 2312 + }, + { + "epoch": 0.4628, + "learning_rate": 1.6763619120624592e-05, + "loss": 0.1931, + "step": 2314 + }, + { + "epoch": 0.4632, + "learning_rate": 1.6773896974017373e-05, + "loss": 0.2166, + "step": 2316 + }, + { + "epoch": 0.4636, + "learning_rate": 1.6784161621351377e-05, + "loss": 0.2324, + "step": 2318 + }, + { + "epoch": 0.464, + "learning_rate": 1.679441304261516e-05, + "loss": 0.3303, + "step": 2320 + }, + { + "epoch": 0.4644, + "learning_rate": 1.6804651217823048e-05, + "loss": 0.438, + "step": 2322 + }, + { + "epoch": 0.4648, + "learning_rate": 1.681487612701519e-05, + "loss": 0.1737, + "step": 2324 + }, + { + "epoch": 0.4652, + "learning_rate": 1.6825087750257624e-05, + "loss": 0.2907, + "step": 2326 + }, + { + "epoch": 0.4656, + "learning_rate": 1.683528606764222e-05, + "loss": 0.2737, + "step": 2328 + }, + { + "epoch": 0.466, + "learning_rate": 1.6845471059286893e-05, + "loss": 1.1182, + "step": 2330 + }, + { + "epoch": 0.4664, + "learning_rate": 1.6855642705335428e-05, + "loss": 0.2093, + "step": 2332 + }, + { + "epoch": 0.4668, + "learning_rate": 1.6865800985957718e-05, + "loss": 0.2632, + "step": 2334 + }, + { + "epoch": 0.4672, + "learning_rate": 1.687594588134968e-05, + "loss": 0.7126, + "step": 2336 + }, + { + "epoch": 0.4676, + "learning_rate": 1.6886077371733275e-05, + "loss": 0.3135, + "step": 2338 + }, + { + "epoch": 0.468, + "learning_rate": 1.68961954373567e-05, + "loss": 0.2669, + "step": 2340 + }, + { + "epoch": 0.4684, + "learning_rate": 1.690630005849423e-05, + "loss": 0.3582, + "step": 2342 + }, + { + "epoch": 0.4688, + "learning_rate": 1.6916391215446403e-05, + "loss": 0.2549, + "step": 2344 + }, + { + "epoch": 0.4692, + "learning_rate": 1.6926468888539988e-05, + "loss": 0.1866, + "step": 2346 + }, + { + "epoch": 0.4696, + "learning_rate": 1.693653305812805e-05, + "loss": 0.2429, + "step": 2348 + }, + { + "epoch": 0.47, + "learning_rate": 1.6946583704589973e-05, + "loss": 0.46, + "step": 2350 + }, + { + "epoch": 0.4704, + "learning_rate": 1.6956620808331505e-05, + "loss": 0.55, + "step": 2352 + }, + { + "epoch": 0.4708, + "learning_rate": 1.6966644349784805e-05, + "loss": 0.8673, + "step": 2354 + }, + { + "epoch": 0.4712, + "learning_rate": 1.697665430940846e-05, + "loss": 0.5867, + "step": 2356 + }, + { + "epoch": 0.4716, + "learning_rate": 1.698665066768755e-05, + "loss": 0.6155, + "step": 2358 + }, + { + "epoch": 0.472, + "learning_rate": 1.699663340513365e-05, + "loss": 0.4652, + "step": 2360 + }, + { + "epoch": 0.4724, + "learning_rate": 1.7006602502284913e-05, + "loss": 0.6428, + "step": 2362 + }, + { + "epoch": 0.4728, + "learning_rate": 1.7016557939706068e-05, + "loss": 0.5824, + "step": 2364 + }, + { + "epoch": 0.4732, + "learning_rate": 1.70264996979885e-05, + "loss": 0.2662, + "step": 2366 + }, + { + "epoch": 0.4736, + "learning_rate": 1.7036427757750198e-05, + "loss": 0.437, + "step": 2368 + }, + { + "epoch": 0.474, + "learning_rate": 1.7046342099635938e-05, + "loss": 0.3453, + "step": 2370 + }, + { + "epoch": 0.4744, + "learning_rate": 1.7056242704317212e-05, + "loss": 0.4872, + "step": 2372 + }, + { + "epoch": 0.4748, + "learning_rate": 1.706612955249224e-05, + "loss": 0.8936, + "step": 2374 + }, + { + "epoch": 0.4752, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.6397, + "step": 2376 + }, + { + "epoch": 0.4756, + "learning_rate": 1.708586190225085e-05, + "loss": 0.5755, + "step": 2378 + }, + { + "epoch": 0.476, + "learning_rate": 1.709570736536521e-05, + "loss": 0.8526, + "step": 2380 + }, + { + "epoch": 0.4764, + "learning_rate": 1.710553899503496e-05, + "loss": 0.4317, + "step": 2382 + }, + { + "epoch": 0.4768, + "learning_rate": 1.7115356772092844e-05, + "loss": 0.3072, + "step": 2384 + }, + { + "epoch": 0.4772, + "learning_rate": 1.7125160677398625e-05, + "loss": 0.2822, + "step": 2386 + }, + { + "epoch": 0.4776, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.231, + "step": 2388 + }, + { + "epoch": 0.478, + "learning_rate": 1.7144726796328034e-05, + "loss": 0.5123, + "step": 2390 + }, + { + "epoch": 0.4784, + "learning_rate": 1.7154488971806518e-05, + "loss": 0.771, + "step": 2392 + }, + { + "epoch": 0.4788, + "learning_rate": 1.716423719924266e-05, + "loss": 0.3119, + "step": 2394 + }, + { + "epoch": 0.4792, + "learning_rate": 1.7173971459631783e-05, + "loss": 0.8048, + "step": 2396 + }, + { + "epoch": 0.4796, + "learning_rate": 1.718369173399646e-05, + "loss": 1.0699, + "step": 2398 + }, + { + "epoch": 0.48, + "learning_rate": 1.7193398003386507e-05, + "loss": 0.2423, + "step": 2400 + }, + { + "epoch": 0.4804, + "learning_rate": 1.7203090248879063e-05, + "loss": 0.6115, + "step": 2402 + }, + { + "epoch": 0.4808, + "learning_rate": 1.7212768451578602e-05, + "loss": 0.3441, + "step": 2404 + }, + { + "epoch": 0.4812, + "learning_rate": 1.7222432592616963e-05, + "loss": 0.3284, + "step": 2406 + }, + { + "epoch": 0.4816, + "learning_rate": 1.7232082653153416e-05, + "loss": 0.5938, + "step": 2408 + }, + { + "epoch": 0.482, + "learning_rate": 1.724171861437467e-05, + "loss": 0.2126, + "step": 2410 + }, + { + "epoch": 0.4824, + "learning_rate": 1.7251340457494937e-05, + "loss": 0.2864, + "step": 2412 + }, + { + "epoch": 0.4828, + "learning_rate": 1.726094816375591e-05, + "loss": 0.2283, + "step": 2414 + }, + { + "epoch": 0.4832, + "learning_rate": 1.7270541714426923e-05, + "loss": 0.2939, + "step": 2416 + }, + { + "epoch": 0.4836, + "learning_rate": 1.7280121090804817e-05, + "loss": 0.1899, + "step": 2418 + }, + { + "epoch": 0.484, + "learning_rate": 1.7289686274214106e-05, + "loss": 0.4852, + "step": 2420 + }, + { + "epoch": 0.4844, + "learning_rate": 1.7299237246007018e-05, + "loss": 0.5644, + "step": 2422 + }, + { + "epoch": 0.4848, + "learning_rate": 1.7308773987563393e-05, + "loss": 0.7407, + "step": 2424 + }, + { + "epoch": 0.4852, + "learning_rate": 1.7318296480290912e-05, + "loss": 0.3082, + "step": 2426 + }, + { + "epoch": 0.4856, + "learning_rate": 1.732780470562496e-05, + "loss": 0.4842, + "step": 2428 + }, + { + "epoch": 0.486, + "learning_rate": 1.7337298645028764e-05, + "loss": 0.6445, + "step": 2430 + }, + { + "epoch": 0.4864, + "learning_rate": 1.7346778279993413e-05, + "loss": 0.4648, + "step": 2432 + }, + { + "epoch": 0.4868, + "learning_rate": 1.7356243592037872e-05, + "loss": 0.5855, + "step": 2434 + }, + { + "epoch": 0.4872, + "learning_rate": 1.736569456270903e-05, + "loss": 0.3935, + "step": 2436 + }, + { + "epoch": 0.4876, + "learning_rate": 1.7375131173581737e-05, + "loss": 1.2955, + "step": 2438 + }, + { + "epoch": 0.488, + "learning_rate": 1.7384553406258836e-05, + "loss": 0.5488, + "step": 2440 + }, + { + "epoch": 0.4884, + "learning_rate": 1.73939612423712e-05, + "loss": 0.6714, + "step": 2442 + }, + { + "epoch": 0.4888, + "learning_rate": 1.740335466357778e-05, + "loss": 0.3115, + "step": 2444 + }, + { + "epoch": 0.4892, + "learning_rate": 1.7412733651565607e-05, + "loss": 0.3188, + "step": 2446 + }, + { + "epoch": 0.4896, + "learning_rate": 1.7422098188049888e-05, + "loss": 0.9011, + "step": 2448 + }, + { + "epoch": 0.49, + "learning_rate": 1.7431448254773936e-05, + "loss": 0.4676, + "step": 2450 + }, + { + "epoch": 0.4904, + "learning_rate": 1.7440783833509373e-05, + "loss": 0.4209, + "step": 2452 + }, + { + "epoch": 0.4908, + "learning_rate": 1.7450104906055956e-05, + "loss": 0.3071, + "step": 2454 + }, + { + "epoch": 0.4912, + "learning_rate": 1.7459411454241816e-05, + "loss": 0.6033, + "step": 2456 + }, + { + "epoch": 0.4916, + "learning_rate": 1.746870345992336e-05, + "loss": 0.2668, + "step": 2458 + }, + { + "epoch": 0.492, + "learning_rate": 1.747798090498531e-05, + "loss": 0.8674, + "step": 2460 + }, + { + "epoch": 0.4924, + "learning_rate": 1.7487243771340865e-05, + "loss": 0.2945, + "step": 2462 + }, + { + "epoch": 0.4928, + "learning_rate": 1.749649204093154e-05, + "loss": 0.2766, + "step": 2464 + }, + { + "epoch": 0.4932, + "learning_rate": 1.750572569572741e-05, + "loss": 0.2639, + "step": 2466 + }, + { + "epoch": 0.4936, + "learning_rate": 1.7514944717726962e-05, + "loss": 0.2383, + "step": 2468 + }, + { + "epoch": 0.494, + "learning_rate": 1.7524149088957244e-05, + "loss": 0.3438, + "step": 2470 + }, + { + "epoch": 0.4944, + "learning_rate": 1.753333879147387e-05, + "loss": 0.1535, + "step": 2472 + }, + { + "epoch": 0.4948, + "learning_rate": 1.7542513807361037e-05, + "loss": 0.4201, + "step": 2474 + }, + { + "epoch": 0.4952, + "learning_rate": 1.755167411873159e-05, + "loss": 0.3436, + "step": 2476 + }, + { + "epoch": 0.4956, + "learning_rate": 1.7560819707727027e-05, + "loss": 0.3246, + "step": 2478 + }, + { + "epoch": 0.496, + "learning_rate": 1.7569950556517563e-05, + "loss": 0.365, + "step": 2480 + }, + { + "epoch": 0.4964, + "learning_rate": 1.757906664730213e-05, + "loss": 0.3508, + "step": 2482 + }, + { + "epoch": 0.4968, + "learning_rate": 1.758816796230845e-05, + "loss": 0.3415, + "step": 2484 + }, + { + "epoch": 0.4972, + "learning_rate": 1.759725448379304e-05, + "loss": 0.6858, + "step": 2486 + }, + { + "epoch": 0.4976, + "learning_rate": 1.7606326194041278e-05, + "loss": 0.3102, + "step": 2488 + }, + { + "epoch": 0.498, + "learning_rate": 1.7615383075367363e-05, + "loss": 0.394, + "step": 2490 + }, + { + "epoch": 0.4984, + "learning_rate": 1.762442511011447e-05, + "loss": 0.1987, + "step": 2492 + }, + { + "epoch": 0.4988, + "learning_rate": 1.763345228065469e-05, + "loss": 0.6002, + "step": 2494 + }, + { + "epoch": 0.4992, + "learning_rate": 1.7642464569389083e-05, + "loss": 0.6154, + "step": 2496 + }, + { + "epoch": 0.4996, + "learning_rate": 1.7651461958747745e-05, + "loss": 0.2287, + "step": 2498 + }, + { + "epoch": 0.5, + "learning_rate": 1.766044443118977e-05, + "loss": 0.2461, + "step": 2500 + }, + { + "epoch": 0.5004, + "learning_rate": 1.766941196920342e-05, + "loss": 0.1849, + "step": 2502 + }, + { + "epoch": 0.5008, + "learning_rate": 1.767836455530598e-05, + "loss": 0.2931, + "step": 2504 + }, + { + "epoch": 0.5012, + "learning_rate": 1.7687302172043933e-05, + "loss": 0.3053, + "step": 2506 + }, + { + "epoch": 0.5016, + "learning_rate": 1.7696224801992947e-05, + "loss": 0.2649, + "step": 2508 + }, + { + "epoch": 0.502, + "learning_rate": 1.7705132427757885e-05, + "loss": 0.5216, + "step": 2510 + }, + { + "epoch": 0.5024, + "learning_rate": 1.77140250319729e-05, + "loss": 0.2452, + "step": 2512 + }, + { + "epoch": 0.5028, + "learning_rate": 1.7722902597301385e-05, + "loss": 0.197, + "step": 2514 + }, + { + "epoch": 0.5032, + "learning_rate": 1.7731765106436073e-05, + "loss": 0.4559, + "step": 2516 + }, + { + "epoch": 0.5036, + "learning_rate": 1.774061254209905e-05, + "loss": 0.7388, + "step": 2518 + }, + { + "epoch": 0.504, + "learning_rate": 1.7749444887041793e-05, + "loss": 0.8467, + "step": 2520 + }, + { + "epoch": 0.5044, + "learning_rate": 1.7758262124045192e-05, + "loss": 0.1547, + "step": 2522 + }, + { + "epoch": 0.5048, + "learning_rate": 1.776706423591959e-05, + "loss": 0.2313, + "step": 2524 + }, + { + "epoch": 0.5052, + "learning_rate": 1.7775851205504816e-05, + "loss": 0.2627, + "step": 2526 + }, + { + "epoch": 0.5056, + "learning_rate": 1.778462301567023e-05, + "loss": 0.2751, + "step": 2528 + }, + { + "epoch": 0.506, + "learning_rate": 1.7793379649314736e-05, + "loss": 0.4582, + "step": 2530 + }, + { + "epoch": 0.5064, + "learning_rate": 1.7802121089366832e-05, + "loss": 0.3208, + "step": 2532 + }, + { + "epoch": 0.5068, + "learning_rate": 1.7810847318784635e-05, + "loss": 0.1699, + "step": 2534 + }, + { + "epoch": 0.5072, + "learning_rate": 1.7819558320555895e-05, + "loss": 0.2221, + "step": 2536 + }, + { + "epoch": 0.5076, + "learning_rate": 1.7828254077698103e-05, + "loss": 0.7615, + "step": 2538 + }, + { + "epoch": 0.508, + "learning_rate": 1.7836934573258392e-05, + "loss": 0.3235, + "step": 2540 + }, + { + "epoch": 0.5084, + "learning_rate": 1.7845599790313735e-05, + "loss": 0.327, + "step": 2542 + }, + { + "epoch": 0.5088, + "learning_rate": 1.785424971197082e-05, + "loss": 0.4965, + "step": 2544 + }, + { + "epoch": 0.5092, + "learning_rate": 1.786288432136618e-05, + "loss": 0.2072, + "step": 2546 + }, + { + "epoch": 0.5096, + "learning_rate": 1.7871503601666233e-05, + "loss": 0.2234, + "step": 2548 + }, + { + "epoch": 0.51, + "learning_rate": 1.788010753606722e-05, + "loss": 0.5857, + "step": 2550 + }, + { + "epoch": 0.5104, + "learning_rate": 1.7888696107795343e-05, + "loss": 0.2425, + "step": 2552 + }, + { + "epoch": 0.5108, + "learning_rate": 1.7897269300106735e-05, + "loss": 0.3323, + "step": 2554 + }, + { + "epoch": 0.5112, + "learning_rate": 1.790582709628753e-05, + "loss": 0.9971, + "step": 2556 + }, + { + "epoch": 0.5116, + "learning_rate": 1.7914369479653854e-05, + "loss": 0.9396, + "step": 2558 + }, + { + "epoch": 0.512, + "learning_rate": 1.7922896433551903e-05, + "loss": 0.2327, + "step": 2560 + }, + { + "epoch": 0.5124, + "learning_rate": 1.7931407941357945e-05, + "loss": 0.3977, + "step": 2562 + }, + { + "epoch": 0.5128, + "learning_rate": 1.793990398647835e-05, + "loss": 0.5989, + "step": 2564 + }, + { + "epoch": 0.5132, + "learning_rate": 1.7948384552349655e-05, + "loss": 0.2198, + "step": 2566 + }, + { + "epoch": 0.5136, + "learning_rate": 1.795684962243855e-05, + "loss": 0.2942, + "step": 2568 + }, + { + "epoch": 0.514, + "learning_rate": 1.796529918024196e-05, + "loss": 0.2296, + "step": 2570 + }, + { + "epoch": 0.5144, + "learning_rate": 1.7973733209287032e-05, + "loss": 0.2521, + "step": 2572 + }, + { + "epoch": 0.5148, + "learning_rate": 1.798215169313121e-05, + "loss": 0.4464, + "step": 2574 + }, + { + "epoch": 0.5152, + "learning_rate": 1.7990554615362193e-05, + "loss": 0.1775, + "step": 2576 + }, + { + "epoch": 0.5156, + "learning_rate": 1.79989419595981e-05, + "loss": 0.6601, + "step": 2578 + }, + { + "epoch": 0.516, + "learning_rate": 1.800731370948734e-05, + "loss": 0.3002, + "step": 2580 + }, + { + "epoch": 0.5164, + "learning_rate": 1.8015669848708757e-05, + "loss": 0.2971, + "step": 2582 + }, + { + "epoch": 0.5168, + "learning_rate": 1.802401036097167e-05, + "loss": 0.6488, + "step": 2584 + }, + { + "epoch": 0.5172, + "learning_rate": 1.803233523001577e-05, + "loss": 0.2016, + "step": 2586 + }, + { + "epoch": 0.5176, + "learning_rate": 1.804064443961135e-05, + "loss": 0.4956, + "step": 2588 + }, + { + "epoch": 0.518, + "learning_rate": 1.804893797355914e-05, + "loss": 0.1899, + "step": 2590 + }, + { + "epoch": 0.5184, + "learning_rate": 1.8057215815690494e-05, + "loss": 0.3495, + "step": 2592 + }, + { + "epoch": 0.5188, + "learning_rate": 1.8065477949867327e-05, + "loss": 0.3839, + "step": 2594 + }, + { + "epoch": 0.5192, + "learning_rate": 1.8073724359982184e-05, + "loss": 0.4356, + "step": 2596 + }, + { + "epoch": 0.5196, + "learning_rate": 1.808195502995827e-05, + "loss": 0.3281, + "step": 2598 + }, + { + "epoch": 0.52, + "learning_rate": 1.809016994374947e-05, + "loss": 0.2735, + "step": 2600 + }, + { + "epoch": 0.5204, + "learning_rate": 1.8098369085340397e-05, + "loss": 0.5462, + "step": 2602 + }, + { + "epoch": 0.5208, + "learning_rate": 1.81065524387464e-05, + "loss": 0.2245, + "step": 2604 + }, + { + "epoch": 0.5212, + "learning_rate": 1.8114719988013606e-05, + "loss": 0.2426, + "step": 2606 + }, + { + "epoch": 0.5216, + "learning_rate": 1.8122871717218968e-05, + "loss": 0.1494, + "step": 2608 + }, + { + "epoch": 0.522, + "learning_rate": 1.813100761047028e-05, + "loss": 0.9317, + "step": 2610 + }, + { + "epoch": 0.5224, + "learning_rate": 1.8139127651906176e-05, + "loss": 0.3669, + "step": 2612 + }, + { + "epoch": 0.5228, + "learning_rate": 1.8147231825696258e-05, + "loss": 0.5173, + "step": 2614 + }, + { + "epoch": 0.5232, + "learning_rate": 1.8155320116040976e-05, + "loss": 0.4069, + "step": 2616 + }, + { + "epoch": 0.5236, + "learning_rate": 1.8163392507171834e-05, + "loss": 0.7102, + "step": 2618 + }, + { + "epoch": 0.524, + "learning_rate": 1.817144898335129e-05, + "loss": 0.4785, + "step": 2620 + }, + { + "epoch": 0.5244, + "learning_rate": 1.8179489528872797e-05, + "loss": 0.6653, + "step": 2622 + }, + { + "epoch": 0.5248, + "learning_rate": 1.818751412806095e-05, + "loss": 0.4296, + "step": 2624 + }, + { + "epoch": 0.5252, + "learning_rate": 1.819552276527134e-05, + "loss": 0.3324, + "step": 2626 + }, + { + "epoch": 0.5256, + "learning_rate": 1.8203515424890738e-05, + "loss": 0.3864, + "step": 2628 + }, + { + "epoch": 0.526, + "learning_rate": 1.821149209133704e-05, + "loss": 0.224, + "step": 2630 + }, + { + "epoch": 0.5264, + "learning_rate": 1.8219452749059322e-05, + "loss": 0.4208, + "step": 2632 + }, + { + "epoch": 0.5268, + "learning_rate": 1.82273973825379e-05, + "loss": 0.22, + "step": 2634 + }, + { + "epoch": 0.5272, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.245, + "step": 2636 + }, + { + "epoch": 0.5276, + "learning_rate": 1.8243238514841258e-05, + "loss": 0.9012, + "step": 2638 + }, + { + "epoch": 0.528, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.2609, + "step": 2640 + }, + { + "epoch": 0.5284, + "learning_rate": 1.8259015364714786e-05, + "loss": 0.4267, + "step": 2642 + }, + { + "epoch": 0.5288, + "learning_rate": 1.826687964527355e-05, + "loss": 0.4534, + "step": 2644 + }, + { + "epoch": 0.5292, + "learning_rate": 1.8274727809127437e-05, + "loss": 0.5397, + "step": 2646 + }, + { + "epoch": 0.5296, + "learning_rate": 1.828255984097604e-05, + "loss": 0.4225, + "step": 2648 + }, + { + "epoch": 0.53, + "learning_rate": 1.8290375725550413e-05, + "loss": 0.4784, + "step": 2650 + }, + { + "epoch": 0.5304, + "learning_rate": 1.8298175447613093e-05, + "loss": 0.468, + "step": 2652 + }, + { + "epoch": 0.5308, + "learning_rate": 1.8305958991958125e-05, + "loss": 0.6033, + "step": 2654 + }, + { + "epoch": 0.5312, + "learning_rate": 1.8313726343411092e-05, + "loss": 0.537, + "step": 2656 + }, + { + "epoch": 0.5316, + "learning_rate": 1.832147748682912e-05, + "loss": 0.5131, + "step": 2658 + }, + { + "epoch": 0.532, + "learning_rate": 1.8329212407101e-05, + "loss": 0.7175, + "step": 2660 + }, + { + "epoch": 0.5324, + "learning_rate": 1.8336931089147065e-05, + "loss": 0.5153, + "step": 2662 + }, + { + "epoch": 0.5328, + "learning_rate": 1.8344633517919394e-05, + "loss": 0.1645, + "step": 2664 + }, + { + "epoch": 0.5332, + "learning_rate": 1.8352319678401677e-05, + "loss": 0.1632, + "step": 2666 + }, + { + "epoch": 0.5336, + "learning_rate": 1.8359989555609344e-05, + "loss": 0.4144, + "step": 2668 + }, + { + "epoch": 0.534, + "learning_rate": 1.836764313458962e-05, + "loss": 0.197, + "step": 2670 + }, + { + "epoch": 0.5344, + "learning_rate": 1.8375280400421407e-05, + "loss": 0.2132, + "step": 2672 + }, + { + "epoch": 0.5348, + "learning_rate": 1.8382901338215515e-05, + "loss": 0.3925, + "step": 2674 + }, + { + "epoch": 0.5352, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.5536, + "step": 2676 + }, + { + "epoch": 0.5356, + "learning_rate": 1.839809417029283e-05, + "loss": 0.1782, + "step": 2678 + }, + { + "epoch": 0.536, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.2738, + "step": 2680 + }, + { + "epoch": 0.5364, + "learning_rate": 1.8413221512344805e-05, + "loss": 0.2328, + "step": 2682 + }, + { + "epoch": 0.5368, + "learning_rate": 1.842076058772692e-05, + "loss": 0.1638, + "step": 2684 + }, + { + "epoch": 0.5372, + "learning_rate": 1.8428283246405386e-05, + "loss": 0.39, + "step": 2686 + }, + { + "epoch": 0.5376, + "learning_rate": 1.8435789473714384e-05, + "loss": 0.9356, + "step": 2688 + }, + { + "epoch": 0.538, + "learning_rate": 1.844327925502015e-05, + "loss": 0.45, + "step": 2690 + }, + { + "epoch": 0.5384, + "learning_rate": 1.8450752575720964e-05, + "loss": 0.2089, + "step": 2692 + }, + { + "epoch": 0.5388, + "learning_rate": 1.8458209421247205e-05, + "loss": 0.2815, + "step": 2694 + }, + { + "epoch": 0.5392, + "learning_rate": 1.8465649777061384e-05, + "loss": 0.4918, + "step": 2696 + }, + { + "epoch": 0.5396, + "learning_rate": 1.8473073628658116e-05, + "loss": 0.5572, + "step": 2698 + }, + { + "epoch": 0.54, + "learning_rate": 1.8480480961564266e-05, + "loss": 0.4397, + "step": 2700 + }, + { + "epoch": 0.5404, + "learning_rate": 1.848787176133881e-05, + "loss": 0.22, + "step": 2702 + }, + { + "epoch": 0.5408, + "learning_rate": 1.8495246013573047e-05, + "loss": 0.3939, + "step": 2704 + }, + { + "epoch": 0.5412, + "learning_rate": 1.850260370389049e-05, + "loss": 0.2493, + "step": 2706 + }, + { + "epoch": 0.5416, + "learning_rate": 1.850994481794691e-05, + "loss": 0.2634, + "step": 2708 + }, + { + "epoch": 0.542, + "learning_rate": 1.851726934143048e-05, + "loss": 0.6888, + "step": 2710 + }, + { + "epoch": 0.5424, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.3229, + "step": 2712 + }, + { + "epoch": 0.5428, + "learning_rate": 1.8531868559593205e-05, + "loss": 0.2556, + "step": 2714 + }, + { + "epoch": 0.5432, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.2831, + "step": 2716 + }, + { + "epoch": 0.5436, + "learning_rate": 1.8546401244531028e-05, + "loss": 0.3295, + "step": 2718 + }, + { + "epoch": 0.544, + "learning_rate": 1.8553642601605066e-05, + "loss": 0.4372, + "step": 2720 + }, + { + "epoch": 0.5444, + "learning_rate": 1.856086728291516e-05, + "loss": 0.3676, + "step": 2722 + }, + { + "epoch": 0.5448, + "learning_rate": 1.856807527437643e-05, + "loss": 0.3733, + "step": 2724 + }, + { + "epoch": 0.5452, + "learning_rate": 1.857526656193652e-05, + "loss": 0.2454, + "step": 2726 + }, + { + "epoch": 0.5456, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.2234, + "step": 2728 + }, + { + "epoch": 0.546, + "learning_rate": 1.8589598969306643e-05, + "loss": 0.5919, + "step": 2730 + }, + { + "epoch": 0.5464, + "learning_rate": 1.859674006117491e-05, + "loss": 0.2506, + "step": 2732 + }, + { + "epoch": 0.5468, + "learning_rate": 1.860386439325853e-05, + "loss": 0.6134, + "step": 2734 + }, + { + "epoch": 0.5472, + "learning_rate": 1.8610971951668268e-05, + "loss": 0.4984, + "step": 2736 + }, + { + "epoch": 0.5476, + "learning_rate": 1.8618062722547544e-05, + "loss": 0.4341, + "step": 2738 + }, + { + "epoch": 0.548, + "learning_rate": 1.862513669207257e-05, + "loss": 0.5518, + "step": 2740 + }, + { + "epoch": 0.5484, + "learning_rate": 1.8632193846452274e-05, + "loss": 0.6177, + "step": 2742 + }, + { + "epoch": 0.5488, + "learning_rate": 1.8639234171928348e-05, + "loss": 0.2369, + "step": 2744 + }, + { + "epoch": 0.5492, + "learning_rate": 1.8646257654775354e-05, + "loss": 0.1664, + "step": 2746 + }, + { + "epoch": 0.5496, + "learning_rate": 1.8653264281300612e-05, + "loss": 0.6844, + "step": 2748 + }, + { + "epoch": 0.55, + "learning_rate": 1.866025403784439e-05, + "loss": 0.543, + "step": 2750 + }, + { + "epoch": 0.5504, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.2655, + "step": 2752 + }, + { + "epoch": 0.5508, + "learning_rate": 1.8674182886512776e-05, + "loss": 0.1754, + "step": 2754 + }, + { + "epoch": 0.5512, + "learning_rate": 1.8681121951482393e-05, + "loss": 0.9122, + "step": 2756 + }, + { + "epoch": 0.5516, + "learning_rate": 1.8688044092160554e-05, + "loss": 0.3925, + "step": 2758 + }, + { + "epoch": 0.552, + "learning_rate": 1.869494929505219e-05, + "loss": 0.5426, + "step": 2760 + }, + { + "epoch": 0.5524, + "learning_rate": 1.8701837546695256e-05, + "loss": 0.9349, + "step": 2762 + }, + { + "epoch": 0.5528, + "learning_rate": 1.870870883366075e-05, + "loss": 0.5577, + "step": 2764 + }, + { + "epoch": 0.5532, + "learning_rate": 1.871556314255275e-05, + "loss": 0.3198, + "step": 2766 + }, + { + "epoch": 0.5536, + "learning_rate": 1.8722400460008434e-05, + "loss": 0.3837, + "step": 2768 + }, + { + "epoch": 0.554, + "learning_rate": 1.8729220772698093e-05, + "loss": 0.2701, + "step": 2770 + }, + { + "epoch": 0.5544, + "learning_rate": 1.8736024067325195e-05, + "loss": 0.2273, + "step": 2772 + }, + { + "epoch": 0.5548, + "learning_rate": 1.8742810330626335e-05, + "loss": 0.537, + "step": 2774 + }, + { + "epoch": 0.5552, + "learning_rate": 1.8749579549371373e-05, + "loss": 0.2907, + "step": 2776 + }, + { + "epoch": 0.5556, + "learning_rate": 1.8756331710363368e-05, + "loss": 0.1757, + "step": 2778 + }, + { + "epoch": 0.556, + "learning_rate": 1.876306680043863e-05, + "loss": 0.3735, + "step": 2780 + }, + { + "epoch": 0.5564, + "learning_rate": 1.876978480646677e-05, + "loss": 0.8837, + "step": 2782 + }, + { + "epoch": 0.5568, + "learning_rate": 1.8776485715350665e-05, + "loss": 0.4366, + "step": 2784 + }, + { + "epoch": 0.5572, + "learning_rate": 1.878316951402658e-05, + "loss": 0.7376, + "step": 2786 + }, + { + "epoch": 0.5576, + "learning_rate": 1.878983618946409e-05, + "loss": 0.4663, + "step": 2788 + }, + { + "epoch": 0.558, + "learning_rate": 1.879648572866617e-05, + "loss": 0.2834, + "step": 2790 + }, + { + "epoch": 0.5584, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.1675, + "step": 2792 + }, + { + "epoch": 0.5588, + "learning_rate": 1.8809733346543006e-05, + "loss": 0.431, + "step": 2794 + }, + { + "epoch": 0.5592, + "learning_rate": 1.881633139939087e-05, + "loss": 0.4389, + "step": 2796 + }, + { + "epoch": 0.5596, + "learning_rate": 1.8822912264349532e-05, + "loss": 0.3202, + "step": 2798 + }, + { + "epoch": 0.56, + "learning_rate": 1.882947592858927e-05, + "loss": 0.7309, + "step": 2800 + }, + { + "epoch": 0.5604, + "learning_rate": 1.8836022379313877e-05, + "loss": 0.4349, + "step": 2802 + }, + { + "epoch": 0.5608, + "learning_rate": 1.884255160376072e-05, + "loss": 0.1949, + "step": 2804 + }, + { + "epoch": 0.5612, + "learning_rate": 1.8849063589200744e-05, + "loss": 0.3349, + "step": 2806 + }, + { + "epoch": 0.5616, + "learning_rate": 1.885555832293849e-05, + "loss": 0.1959, + "step": 2808 + }, + { + "epoch": 0.562, + "learning_rate": 1.8862035792312145e-05, + "loss": 0.2014, + "step": 2810 + }, + { + "epoch": 0.5624, + "learning_rate": 1.886849598469356e-05, + "loss": 0.5043, + "step": 2812 + }, + { + "epoch": 0.5628, + "learning_rate": 1.8874938887488246e-05, + "loss": 0.2931, + "step": 2814 + }, + { + "epoch": 0.5632, + "learning_rate": 1.888136448813544e-05, + "loss": 0.3209, + "step": 2816 + }, + { + "epoch": 0.5636, + "learning_rate": 1.888777277410812e-05, + "loss": 0.3802, + "step": 2818 + }, + { + "epoch": 0.564, + "learning_rate": 1.8894163732912972e-05, + "loss": 0.2898, + "step": 2820 + }, + { + "epoch": 0.5644, + "learning_rate": 1.890053735209053e-05, + "loss": 0.5746, + "step": 2822 + }, + { + "epoch": 0.5648, + "learning_rate": 1.890689361921506e-05, + "loss": 0.2194, + "step": 2824 + }, + { + "epoch": 0.5652, + "learning_rate": 1.8913232521894737e-05, + "loss": 0.7404, + "step": 2826 + }, + { + "epoch": 0.5656, + "learning_rate": 1.891955404777151e-05, + "loss": 0.4223, + "step": 2828 + }, + { + "epoch": 0.566, + "learning_rate": 1.8925858184521248e-05, + "loss": 0.9859, + "step": 2830 + }, + { + "epoch": 0.5664, + "learning_rate": 1.893214491985374e-05, + "loss": 0.5151, + "step": 2832 + }, + { + "epoch": 0.5668, + "learning_rate": 1.8938414241512634e-05, + "loss": 0.4669, + "step": 2834 + }, + { + "epoch": 0.5672, + "learning_rate": 1.89446661372756e-05, + "loss": 0.2311, + "step": 2836 + }, + { + "epoch": 0.5676, + "learning_rate": 1.8950900594954226e-05, + "loss": 0.2681, + "step": 2838 + }, + { + "epoch": 0.568, + "learning_rate": 1.895711760239413e-05, + "loss": 0.296, + "step": 2840 + }, + { + "epoch": 0.5684, + "learning_rate": 1.896331714747493e-05, + "loss": 0.5119, + "step": 2842 + }, + { + "epoch": 0.5688, + "learning_rate": 1.89694992181103e-05, + "loss": 0.2214, + "step": 2844 + }, + { + "epoch": 0.5692, + "learning_rate": 1.8975663802247975e-05, + "loss": 0.6633, + "step": 2846 + }, + { + "epoch": 0.5696, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.2545, + "step": 2848 + }, + { + "epoch": 0.57, + "learning_rate": 1.898794046299167e-05, + "loss": 0.5691, + "step": 2850 + }, + { + "epoch": 0.5704, + "learning_rate": 1.8994052515663708e-05, + "loss": 0.3815, + "step": 2852 + }, + { + "epoch": 0.5708, + "learning_rate": 1.9000147033970144e-05, + "loss": 0.3799, + "step": 2854 + }, + { + "epoch": 0.5712, + "learning_rate": 1.90062240060294e-05, + "loss": 0.3199, + "step": 2856 + }, + { + "epoch": 0.5716, + "learning_rate": 1.901228341999412e-05, + "loss": 0.1976, + "step": 2858 + }, + { + "epoch": 0.572, + "learning_rate": 1.9018325264051136e-05, + "loss": 0.6731, + "step": 2860 + }, + { + "epoch": 0.5724, + "learning_rate": 1.9024349526421596e-05, + "loss": 0.2571, + "step": 2862 + }, + { + "epoch": 0.5728, + "learning_rate": 1.9030356195360868e-05, + "loss": 0.2343, + "step": 2864 + }, + { + "epoch": 0.5732, + "learning_rate": 1.903634525915866e-05, + "loss": 0.3876, + "step": 2866 + }, + { + "epoch": 0.5736, + "learning_rate": 1.904231670613899e-05, + "loss": 0.5748, + "step": 2868 + }, + { + "epoch": 0.574, + "learning_rate": 1.904827052466019e-05, + "loss": 0.1481, + "step": 2870 + }, + { + "epoch": 0.5744, + "learning_rate": 1.905420670311502e-05, + "loss": 0.5691, + "step": 2872 + }, + { + "epoch": 0.5748, + "learning_rate": 1.9060125229930572e-05, + "loss": 0.1555, + "step": 2874 + }, + { + "epoch": 0.5752, + "learning_rate": 1.906602609356838e-05, + "loss": 0.4366, + "step": 2876 + }, + { + "epoch": 0.5756, + "learning_rate": 1.907190928252441e-05, + "loss": 0.2662, + "step": 2878 + }, + { + "epoch": 0.576, + "learning_rate": 1.9077774785329078e-05, + "loss": 0.6932, + "step": 2880 + }, + { + "epoch": 0.5764, + "learning_rate": 1.908362259054731e-05, + "loss": 0.2412, + "step": 2882 + }, + { + "epoch": 0.5768, + "learning_rate": 1.9089452686778487e-05, + "loss": 0.3372, + "step": 2884 + }, + { + "epoch": 0.5772, + "learning_rate": 1.9095265062656542e-05, + "loss": 0.2651, + "step": 2886 + }, + { + "epoch": 0.5776, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.1816, + "step": 2888 + }, + { + "epoch": 0.578, + "learning_rate": 1.910683660806177e-05, + "loss": 0.2195, + "step": 2890 + }, + { + "epoch": 0.5784, + "learning_rate": 1.911259575502962e-05, + "loss": 0.1733, + "step": 2892 + }, + { + "epoch": 0.5788, + "learning_rate": 1.9118337136525754e-05, + "loss": 0.2056, + "step": 2894 + }, + { + "epoch": 0.5792, + "learning_rate": 1.912406074135706e-05, + "loss": 0.1937, + "step": 2896 + }, + { + "epoch": 0.5796, + "learning_rate": 1.912976655836507e-05, + "loss": 0.4733, + "step": 2898 + }, + { + "epoch": 0.58, + "learning_rate": 1.9135454576426006e-05, + "loss": 0.4264, + "step": 2900 + }, + { + "epoch": 0.5804, + "learning_rate": 1.9141124784450786e-05, + "loss": 0.8671, + "step": 2902 + }, + { + "epoch": 0.5808, + "learning_rate": 1.9146777171385053e-05, + "loss": 0.2106, + "step": 2904 + }, + { + "epoch": 0.5812, + "learning_rate": 1.9152411726209172e-05, + "loss": 0.6077, + "step": 2906 + }, + { + "epoch": 0.5816, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.4557, + "step": 2908 + }, + { + "epoch": 0.582, + "learning_rate": 1.916362729562239e-05, + "loss": 0.5624, + "step": 2910 + }, + { + "epoch": 0.5824, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.4124, + "step": 2912 + }, + { + "epoch": 0.5828, + "learning_rate": 1.9174771405229187e-05, + "loss": 0.5527, + "step": 2914 + }, + { + "epoch": 0.5832, + "learning_rate": 1.9180316635425876e-05, + "loss": 0.3937, + "step": 2916 + }, + { + "epoch": 0.5836, + "learning_rate": 1.9185843968125543e-05, + "loss": 0.4412, + "step": 2918 + }, + { + "epoch": 0.584, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.579, + "step": 2920 + }, + { + "epoch": 0.5844, + "learning_rate": 1.919684489796539e-05, + "loss": 0.4809, + "step": 2922 + }, + { + "epoch": 0.5848, + "learning_rate": 1.9202318473658703e-05, + "loss": 0.4309, + "step": 2924 + }, + { + "epoch": 0.5852, + "learning_rate": 1.9207774108961273e-05, + "loss": 0.3991, + "step": 2926 + }, + { + "epoch": 0.5856, + "learning_rate": 1.9213211793237052e-05, + "loss": 0.5323, + "step": 2928 + }, + { + "epoch": 0.586, + "learning_rate": 1.9218631515885004e-05, + "loss": 0.3075, + "step": 2930 + }, + { + "epoch": 0.5864, + "learning_rate": 1.92240332663391e-05, + "loss": 0.3632, + "step": 2932 + }, + { + "epoch": 0.5868, + "learning_rate": 1.922941703406835e-05, + "loss": 0.5095, + "step": 2934 + }, + { + "epoch": 0.5872, + "learning_rate": 1.923478280857682e-05, + "loss": 0.6724, + "step": 2936 + }, + { + "epoch": 0.5876, + "learning_rate": 1.9240130579403663e-05, + "loss": 0.3858, + "step": 2938 + }, + { + "epoch": 0.588, + "learning_rate": 1.924546033612313e-05, + "loss": 0.7653, + "step": 2940 + }, + { + "epoch": 0.5884, + "learning_rate": 1.9250772068344577e-05, + "loss": 0.4519, + "step": 2942 + }, + { + "epoch": 0.5888, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.2317, + "step": 2944 + }, + { + "epoch": 0.5892, + "learning_rate": 1.9261341417906615e-05, + "loss": 0.4186, + "step": 2946 + }, + { + "epoch": 0.5896, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.2112, + "step": 2948 + }, + { + "epoch": 0.59, + "learning_rate": 1.9271838545667876e-05, + "loss": 0.1999, + "step": 2950 + }, + { + "epoch": 0.5904, + "learning_rate": 1.927706000077034e-05, + "loss": 0.2858, + "step": 2952 + }, + { + "epoch": 0.5908, + "learning_rate": 1.9282263369769633e-05, + "loss": 0.6399, + "step": 2954 + }, + { + "epoch": 0.5912, + "learning_rate": 1.9287448642521507e-05, + "loss": 0.2195, + "step": 2956 + }, + { + "epoch": 0.5916, + "learning_rate": 1.9292615808917024e-05, + "loss": 0.2078, + "step": 2958 + }, + { + "epoch": 0.592, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.229, + "step": 2960 + }, + { + "epoch": 0.5924, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.5712, + "step": 2962 + }, + { + "epoch": 0.5928, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.346, + "step": 2964 + }, + { + "epoch": 0.5932, + "learning_rate": 1.9313103209992205e-05, + "loss": 0.4485, + "step": 2966 + }, + { + "epoch": 0.5936, + "learning_rate": 1.9318179694207722e-05, + "loss": 0.1452, + "step": 2968 + }, + { + "epoch": 0.594, + "learning_rate": 1.932323801215512e-05, + "loss": 0.8225, + "step": 2970 + }, + { + "epoch": 0.5944, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.4978, + "step": 2972 + }, + { + "epoch": 0.5948, + "learning_rate": 1.933330010983518e-05, + "loss": 0.4499, + "step": 2974 + }, + { + "epoch": 0.5952, + "learning_rate": 1.9338303869951266e-05, + "loss": 0.2387, + "step": 2976 + }, + { + "epoch": 0.5956, + "learning_rate": 1.934328942456612e-05, + "loss": 0.1881, + "step": 2978 + }, + { + "epoch": 0.596, + "learning_rate": 1.934825676396015e-05, + "loss": 0.4256, + "step": 2980 + }, + { + "epoch": 0.5964, + "learning_rate": 1.9353205878449257e-05, + "loss": 0.3065, + "step": 2982 + }, + { + "epoch": 0.5968, + "learning_rate": 1.935813675838491e-05, + "loss": 0.2337, + "step": 2984 + }, + { + "epoch": 0.5972, + "learning_rate": 1.9363049394154088e-05, + "loss": 0.437, + "step": 2986 + }, + { + "epoch": 0.5976, + "learning_rate": 1.9367943776179375e-05, + "loss": 0.841, + "step": 2988 + }, + { + "epoch": 0.598, + "learning_rate": 1.937281989491892e-05, + "loss": 0.6724, + "step": 2990 + }, + { + "epoch": 0.5984, + "learning_rate": 1.9377677740866457e-05, + "loss": 0.2856, + "step": 2992 + }, + { + "epoch": 0.5988, + "learning_rate": 1.9382517304551397e-05, + "loss": 0.4911, + "step": 2994 + }, + { + "epoch": 0.5992, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.3059, + "step": 2996 + }, + { + "epoch": 0.5996, + "learning_rate": 1.9392141547429183e-05, + "loss": 0.3701, + "step": 2998 + }, + { + "epoch": 0.6, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.2143, + "step": 3000 + }, + { + "epoch": 0.6004, + "learning_rate": 1.9401692548500504e-05, + "loss": 0.3199, + "step": 3002 + }, + { + "epoch": 0.6008, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.2458, + "step": 3004 + }, + { + "epoch": 0.6012, + "learning_rate": 1.9411170233284728e-05, + "loss": 0.2691, + "step": 3006 + }, + { + "epoch": 0.6016, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.3939, + "step": 3008 + }, + { + "epoch": 0.602, + "learning_rate": 1.942057452787297e-05, + "loss": 0.2283, + "step": 3010 + }, + { + "epoch": 0.6024, + "learning_rate": 1.942524913090354e-05, + "loss": 0.1634, + "step": 3012 + }, + { + "epoch": 0.6028, + "learning_rate": 1.9429905358928645e-05, + "loss": 0.254, + "step": 3014 + }, + { + "epoch": 0.6032, + "learning_rate": 1.9434543202870723e-05, + "loss": 0.5594, + "step": 3016 + }, + { + "epoch": 0.6036, + "learning_rate": 1.9439162653688063e-05, + "loss": 0.8102, + "step": 3018 + }, + { + "epoch": 0.604, + "learning_rate": 1.9443763702374815e-05, + "loss": 0.4838, + "step": 3020 + }, + { + "epoch": 0.6044, + "learning_rate": 1.944834633996098e-05, + "loss": 0.3318, + "step": 3022 + }, + { + "epoch": 0.6048, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.5093, + "step": 3024 + }, + { + "epoch": 0.6052, + "learning_rate": 1.9457456346131172e-05, + "loss": 0.2928, + "step": 3026 + }, + { + "epoch": 0.6056, + "learning_rate": 1.9461983696954756e-05, + "loss": 0.3434, + "step": 3028 + }, + { + "epoch": 0.606, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.4909, + "step": 3030 + }, + { + "epoch": 0.6064, + "learning_rate": 1.947098304994744e-05, + "loss": 0.1868, + "step": 3032 + }, + { + "epoch": 0.6068, + "learning_rate": 1.947545503457184e-05, + "loss": 0.2308, + "step": 3034 + }, + { + "epoch": 0.6072, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.314, + "step": 3036 + }, + { + "epoch": 0.6076, + "learning_rate": 1.9484343576484935e-05, + "loss": 0.2668, + "step": 3038 + }, + { + "epoch": 0.608, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.1619, + "step": 3040 + }, + { + "epoch": 0.6084, + "learning_rate": 1.949315815758161e-05, + "loss": 0.7027, + "step": 3042 + }, + { + "epoch": 0.6088, + "learning_rate": 1.949753769132067e-05, + "loss": 0.6427, + "step": 3044 + }, + { + "epoch": 0.6092, + "learning_rate": 1.9501898709124008e-05, + "loss": 0.2667, + "step": 3046 + }, + { + "epoch": 0.6096, + "learning_rate": 1.95062412024896e-05, + "loss": 0.219, + "step": 3048 + }, + { + "epoch": 0.61, + "learning_rate": 1.9510565162951534e-05, + "loss": 0.8339, + "step": 3050 + }, + { + "epoch": 0.6104, + "learning_rate": 1.951487058208003e-05, + "loss": 0.8492, + "step": 3052 + }, + { + "epoch": 0.6108, + "learning_rate": 1.9519157451481453e-05, + "loss": 0.5937, + "step": 3054 + }, + { + "epoch": 0.6112, + "learning_rate": 1.952342576279833e-05, + "loss": 0.2114, + "step": 3056 + }, + { + "epoch": 0.6116, + "learning_rate": 1.9527675507709364e-05, + "loss": 0.201, + "step": 3058 + }, + { + "epoch": 0.612, + "learning_rate": 1.953190667792947e-05, + "loss": 0.3731, + "step": 3060 + }, + { + "epoch": 0.6124, + "learning_rate": 1.953611926520976e-05, + "loss": 0.2534, + "step": 3062 + }, + { + "epoch": 0.6128, + "learning_rate": 1.9540313261337578e-05, + "loss": 0.3001, + "step": 3064 + }, + { + "epoch": 0.6132, + "learning_rate": 1.9544488658136522e-05, + "loss": 0.4541, + "step": 3066 + }, + { + "epoch": 0.6136, + "learning_rate": 1.954864544746643e-05, + "loss": 0.2633, + "step": 3068 + }, + { + "epoch": 0.614, + "learning_rate": 1.955278362122344e-05, + "loss": 0.3884, + "step": 3070 + }, + { + "epoch": 0.6144, + "learning_rate": 1.955690317133996e-05, + "loss": 0.1967, + "step": 3072 + }, + { + "epoch": 0.6148, + "learning_rate": 1.9561004089784726e-05, + "loss": 0.324, + "step": 3074 + }, + { + "epoch": 0.6152, + "learning_rate": 1.956508636856278e-05, + "loss": 0.2663, + "step": 3076 + }, + { + "epoch": 0.6156, + "learning_rate": 1.956914999971551e-05, + "loss": 0.5241, + "step": 3078 + }, + { + "epoch": 0.616, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.3999, + "step": 3080 + }, + { + "epoch": 0.6164, + "learning_rate": 1.9577221287492368e-05, + "loss": 0.1894, + "step": 3082 + }, + { + "epoch": 0.6168, + "learning_rate": 1.95812289283811e-05, + "loss": 0.6478, + "step": 3084 + }, + { + "epoch": 0.6172, + "learning_rate": 1.958521789017376e-05, + "loss": 0.405, + "step": 3086 + }, + { + "epoch": 0.6176, + "learning_rate": 1.958918816509367e-05, + "loss": 0.3625, + "step": 3088 + }, + { + "epoch": 0.618, + "learning_rate": 1.9593139745400575e-05, + "loss": 0.3253, + "step": 3090 + }, + { + "epoch": 0.6184, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.1851, + "step": 3092 + }, + { + "epoch": 0.6188, + "learning_rate": 1.9600986791396597e-05, + "loss": 0.2763, + "step": 3094 + }, + { + "epoch": 0.6192, + "learning_rate": 1.9604882241787496e-05, + "loss": 0.4612, + "step": 3096 + }, + { + "epoch": 0.6196, + "learning_rate": 1.9608758966968983e-05, + "loss": 0.1768, + "step": 3098 + }, + { + "epoch": 0.62, + "learning_rate": 1.9612616959383187e-05, + "loss": 0.6885, + "step": 3100 + }, + { + "epoch": 0.6204, + "learning_rate": 1.9616456211508752e-05, + "loss": 0.2756, + "step": 3102 + }, + { + "epoch": 0.6208, + "learning_rate": 1.9620276715860856e-05, + "loss": 0.2522, + "step": 3104 + }, + { + "epoch": 0.6212, + "learning_rate": 1.962407846499124e-05, + "loss": 0.3695, + "step": 3106 + }, + { + "epoch": 0.6216, + "learning_rate": 1.9627861451488187e-05, + "loss": 0.3333, + "step": 3108 + }, + { + "epoch": 0.622, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.3991, + "step": 3110 + }, + { + "epoch": 0.6224, + "learning_rate": 1.963537110711789e-05, + "loss": 0.4863, + "step": 3112 + }, + { + "epoch": 0.6228, + "learning_rate": 1.9639097761610174e-05, + "loss": 0.1974, + "step": 3114 + }, + { + "epoch": 0.6232, + "learning_rate": 1.964280562418815e-05, + "loss": 0.3138, + "step": 3116 + }, + { + "epoch": 0.6236, + "learning_rate": 1.964649468762313e-05, + "loss": 0.4458, + "step": 3118 + }, + { + "epoch": 0.624, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.4031, + "step": 3120 + }, + { + "epoch": 0.6244, + "learning_rate": 1.965381638833274e-05, + "loss": 0.3673, + "step": 3122 + }, + { + "epoch": 0.6248, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.5973, + "step": 3124 + }, + { + "epoch": 0.6252, + "learning_rate": 1.96610628066429e-05, + "loss": 0.6502, + "step": 3126 + }, + { + "epoch": 0.6256, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.4321, + "step": 3128 + }, + { + "epoch": 0.626, + "learning_rate": 1.9668233886044594e-05, + "loss": 0.6005, + "step": 3130 + }, + { + "epoch": 0.6264, + "learning_rate": 1.967179115615633e-05, + "loss": 0.2311, + "step": 3132 + }, + { + "epoch": 0.6268, + "learning_rate": 1.96753295706163e-05, + "loss": 0.2646, + "step": 3134 + }, + { + "epoch": 0.6272, + "learning_rate": 1.967884912252619e-05, + "loss": 0.2615, + "step": 3136 + }, + { + "epoch": 0.6276, + "learning_rate": 1.9682349805024443e-05, + "loss": 0.4409, + "step": 3138 + }, + { + "epoch": 0.628, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.8077, + "step": 3140 + }, + { + "epoch": 0.6284, + "learning_rate": 1.9689294534523833e-05, + "loss": 0.577, + "step": 3142 + }, + { + "epoch": 0.6288, + "learning_rate": 1.969273856798585e-05, + "loss": 0.2104, + "step": 3144 + }, + { + "epoch": 0.6292, + "learning_rate": 1.969616370495806e-05, + "loss": 0.7868, + "step": 3146 + }, + { + "epoch": 0.6296, + "learning_rate": 1.9699569938762972e-05, + "loss": 0.2986, + "step": 3148 + }, + { + "epoch": 0.63, + "learning_rate": 1.9702957262759964e-05, + "loss": 0.4153, + "step": 3150 + }, + { + "epoch": 0.6304, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.6267, + "step": 3152 + }, + { + "epoch": 0.6308, + "learning_rate": 1.9709675154952013e-05, + "loss": 0.3221, + "step": 3154 + }, + { + "epoch": 0.6312, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.7248, + "step": 3156 + }, + { + "epoch": 0.6316, + "learning_rate": 1.971631732914674e-05, + "loss": 0.3045, + "step": 3158 + }, + { + "epoch": 0.632, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.3088, + "step": 3160 + }, + { + "epoch": 0.6324, + "learning_rate": 1.9722883733547128e-05, + "loss": 0.3413, + "step": 3162 + }, + { + "epoch": 0.6328, + "learning_rate": 1.9726138506049434e-05, + "loss": 0.7554, + "step": 3164 + }, + { + "epoch": 0.6332, + "learning_rate": 1.972937431694704e-05, + "loss": 0.2729, + "step": 3166 + }, + { + "epoch": 0.6336, + "learning_rate": 1.9732591159931564e-05, + "loss": 1.2597, + "step": 3168 + }, + { + "epoch": 0.634, + "learning_rate": 1.9735789028731603e-05, + "loss": 0.1684, + "step": 3170 + }, + { + "epoch": 0.6344, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.2218, + "step": 3172 + }, + { + "epoch": 0.6348, + "learning_rate": 1.9742127818877605e-05, + "loss": 0.3772, + "step": 3174 + }, + { + "epoch": 0.6352, + "learning_rate": 1.974526872786577e-05, + "loss": 0.1799, + "step": 3176 + }, + { + "epoch": 0.6356, + "learning_rate": 1.974839063795389e-05, + "loss": 0.9437, + "step": 3178 + }, + { + "epoch": 0.636, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.3048, + "step": 3180 + }, + { + "epoch": 0.6364, + "learning_rate": 1.975457743712173e-05, + "loss": 0.1598, + "step": 3182 + }, + { + "epoch": 0.6368, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.1697, + "step": 3184 + }, + { + "epoch": 0.6372, + "learning_rate": 1.976068816813523e-05, + "loss": 1.0069, + "step": 3186 + }, + { + "epoch": 0.6376, + "learning_rate": 1.976371499316945e-05, + "loss": 2.0965, + "step": 3188 + }, + { + "epoch": 0.638, + "learning_rate": 1.9766722783341675e-05, + "loss": 0.2904, + "step": 3190 + }, + { + "epoch": 0.6384, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.4692, + "step": 3192 + }, + { + "epoch": 0.6388, + "learning_rate": 1.9772681235681933e-05, + "loss": 0.4559, + "step": 3194 + }, + { + "epoch": 0.6392, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.1823, + "step": 3196 + }, + { + "epoch": 0.6396, + "learning_rate": 1.977856347869079e-05, + "loss": 0.5915, + "step": 3198 + }, + { + "epoch": 0.64, + "learning_rate": 1.9781476007338054e-05, + "loss": 0.448, + "step": 3200 + }, + { + "epoch": 0.6404, + "learning_rate": 1.9784369466497333e-05, + "loss": 0.4329, + "step": 3202 + }, + { + "epoch": 0.6408, + "learning_rate": 1.978724385052766e-05, + "loss": 0.3447, + "step": 3204 + }, + { + "epoch": 0.6412, + "learning_rate": 1.97900991538253e-05, + "loss": 0.2528, + "step": 3206 + }, + { + "epoch": 0.6416, + "learning_rate": 1.9792935370823673e-05, + "loss": 0.3236, + "step": 3208 + }, + { + "epoch": 0.642, + "learning_rate": 1.979575249599344e-05, + "loss": 0.3237, + "step": 3210 + }, + { + "epoch": 0.6424, + "learning_rate": 1.979855052384247e-05, + "loss": 0.8081, + "step": 3212 + }, + { + "epoch": 0.6428, + "learning_rate": 1.980132944891586e-05, + "loss": 0.7209, + "step": 3214 + }, + { + "epoch": 0.6432, + "learning_rate": 1.9804089265795956e-05, + "loss": 0.2478, + "step": 3216 + }, + { + "epoch": 0.6436, + "learning_rate": 1.9806829969102356e-05, + "loss": 0.3537, + "step": 3218 + }, + { + "epoch": 0.644, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.266, + "step": 3220 + }, + { + "epoch": 0.6444, + "learning_rate": 1.981225401365877e-05, + "loss": 0.2914, + "step": 3222 + }, + { + "epoch": 0.6448, + "learning_rate": 1.981493734433433e-05, + "loss": 0.2098, + "step": 3224 + }, + { + "epoch": 0.6452, + "learning_rate": 1.981760154028731e-05, + "loss": 0.2386, + "step": 3226 + }, + { + "epoch": 0.6456, + "learning_rate": 1.982024659632372e-05, + "loss": 0.2761, + "step": 3228 + }, + { + "epoch": 0.646, + "learning_rate": 1.9822872507286887e-05, + "loss": 0.352, + "step": 3230 + }, + { + "epoch": 0.6464, + "learning_rate": 1.9825479268057472e-05, + "loss": 0.1894, + "step": 3232 + }, + { + "epoch": 0.6468, + "learning_rate": 1.9828066873553445e-05, + "loss": 0.7558, + "step": 3234 + }, + { + "epoch": 0.6472, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.3247, + "step": 3236 + }, + { + "epoch": 0.6476, + "learning_rate": 1.983318459858028e-05, + "loss": 0.1933, + "step": 3238 + }, + { + "epoch": 0.648, + "learning_rate": 1.9835714708133858e-05, + "loss": 0.1732, + "step": 3240 + }, + { + "epoch": 0.6484, + "learning_rate": 1.983822564245833e-05, + "loss": 0.3231, + "step": 3242 + }, + { + "epoch": 0.6488, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.5472, + "step": 3244 + }, + { + "epoch": 0.6492, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.2673, + "step": 3246 + }, + { + "epoch": 0.6496, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.3432, + "step": 3248 + }, + { + "epoch": 0.65, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.407, + "step": 3250 + }, + { + "epoch": 0.6504, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.1423, + "step": 3252 + }, + { + "epoch": 0.6508, + "learning_rate": 1.9852888297080785e-05, + "loss": 1.0527, + "step": 3254 + }, + { + "epoch": 0.6512, + "learning_rate": 1.985526486983063e-05, + "loss": 0.1812, + "step": 3256 + }, + { + "epoch": 0.6516, + "learning_rate": 1.9857622229237315e-05, + "loss": 0.2226, + "step": 3258 + }, + { + "epoch": 0.652, + "learning_rate": 1.985996037070505e-05, + "loss": 0.6073, + "step": 3260 + }, + { + "epoch": 0.6524, + "learning_rate": 1.986227928967551e-05, + "loss": 0.4975, + "step": 3262 + }, + { + "epoch": 0.6528, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.3497, + "step": 3264 + }, + { + "epoch": 0.6532, + "learning_rate": 1.986685944207868e-05, + "loss": 0.6462, + "step": 3266 + }, + { + "epoch": 0.6536, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.7872, + "step": 3268 + }, + { + "epoch": 0.654, + "learning_rate": 1.9871362650729877e-05, + "loss": 0.3285, + "step": 3270 + }, + { + "epoch": 0.6544, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.304, + "step": 3272 + }, + { + "epoch": 0.6548, + "learning_rate": 1.9875788880512183e-05, + "loss": 0.5204, + "step": 3274 + }, + { + "epoch": 0.6552, + "learning_rate": 1.987797311751759e-05, + "loss": 0.2902, + "step": 3276 + }, + { + "epoch": 0.6556, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.4497, + "step": 3278 + }, + { + "epoch": 0.656, + "learning_rate": 1.9882283814465528e-05, + "loss": 0.4726, + "step": 3280 + }, + { + "epoch": 0.6564, + "learning_rate": 1.9884410266004134e-05, + "loss": 0.2077, + "step": 3282 + }, + { + "epoch": 0.6568, + "learning_rate": 1.988651744737914e-05, + "loss": 0.2514, + "step": 3284 + }, + { + "epoch": 0.6572, + "learning_rate": 1.9888605354482494e-05, + "loss": 0.2483, + "step": 3286 + }, + { + "epoch": 0.6576, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.2162, + "step": 3288 + }, + { + "epoch": 0.658, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.8051, + "step": 3290 + }, + { + "epoch": 0.6584, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.2689, + "step": 3292 + }, + { + "epoch": 0.6588, + "learning_rate": 1.989676415933351e-05, + "loss": 0.2902, + "step": 3294 + }, + { + "epoch": 0.6592, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.3065, + "step": 3296 + }, + { + "epoch": 0.6596, + "learning_rate": 1.9900727812082174e-05, + "loss": 0.2219, + "step": 3298 + }, + { + "epoch": 0.66, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.4113, + "step": 3300 + }, + { + "epoch": 0.6604, + "learning_rate": 1.9904614256966514e-05, + "loss": 0.2822, + "step": 3302 + }, + { + "epoch": 0.6608, + "learning_rate": 1.9906528516965014e-05, + "loss": 0.1722, + "step": 3304 + }, + { + "epoch": 0.6612, + "learning_rate": 1.9908423463679246e-05, + "loss": 0.3316, + "step": 3306 + }, + { + "epoch": 0.6616, + "learning_rate": 1.9910299093414926e-05, + "loss": 0.3408, + "step": 3308 + }, + { + "epoch": 0.662, + "learning_rate": 1.991215540251542e-05, + "loss": 0.8848, + "step": 3310 + }, + { + "epoch": 0.6624, + "learning_rate": 1.9913992387361744e-05, + "loss": 0.948, + "step": 3312 + }, + { + "epoch": 0.6628, + "learning_rate": 1.9915810044372618e-05, + "loss": 1.0203, + "step": 3314 + }, + { + "epoch": 0.6632, + "learning_rate": 1.9917608370004414e-05, + "loss": 0.2257, + "step": 3316 + }, + { + "epoch": 0.6636, + "learning_rate": 1.9919387360751216e-05, + "loss": 0.4582, + "step": 3318 + }, + { + "epoch": 0.664, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.5519, + "step": 3320 + }, + { + "epoch": 0.6644, + "learning_rate": 1.992288732375458e-05, + "loss": 0.6513, + "step": 3322 + }, + { + "epoch": 0.6648, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.2682, + "step": 3324 + }, + { + "epoch": 0.6652, + "learning_rate": 1.992630990608929e-05, + "loss": 0.2769, + "step": 3326 + }, + { + "epoch": 0.6656, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.1696, + "step": 3328 + }, + { + "epoch": 0.666, + "learning_rate": 1.992965508106537e-05, + "loss": 0.7024, + "step": 3330 + }, + { + "epoch": 0.6664, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.2495, + "step": 3332 + }, + { + "epoch": 0.6668, + "learning_rate": 1.993292282259647e-05, + "loss": 0.3333, + "step": 3334 + }, + { + "epoch": 0.6672, + "learning_rate": 1.9934527647833276e-05, + "loss": 0.594, + "step": 3336 + }, + { + "epoch": 0.6676, + "learning_rate": 1.9936113105200085e-05, + "loss": 0.3013, + "step": 3338 + }, + { + "epoch": 0.668, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.2684, + "step": 3340 + }, + { + "epoch": 0.6684, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.3212, + "step": 3342 + }, + { + "epoch": 0.6688, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.6376, + "step": 3344 + }, + { + "epoch": 0.6692, + "learning_rate": 1.9942261194715236e-05, + "loss": 0.2834, + "step": 3346 + }, + { + "epoch": 0.6696, + "learning_rate": 1.994374976712348e-05, + "loss": 0.3618, + "step": 3348 + }, + { + "epoch": 0.67, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.423, + "step": 3350 + }, + { + "epoch": 0.6704, + "learning_rate": 1.994666875152874e-05, + "loss": 0.4896, + "step": 3352 + }, + { + "epoch": 0.6708, + "learning_rate": 1.994809915783505e-05, + "loss": 0.3755, + "step": 3354 + }, + { + "epoch": 0.6712, + "learning_rate": 1.9949510169813003e-05, + "loss": 0.4349, + "step": 3356 + }, + { + "epoch": 0.6716, + "learning_rate": 1.9950901784711768e-05, + "loss": 0.3235, + "step": 3358 + }, + { + "epoch": 0.672, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.4766, + "step": 3360 + }, + { + "epoch": 0.6724, + "learning_rate": 1.995362681245744e-05, + "loss": 0.4098, + "step": 3362 + }, + { + "epoch": 0.6728, + "learning_rate": 1.995496021999177e-05, + "loss": 0.2835, + "step": 3364 + }, + { + "epoch": 0.6732, + "learning_rate": 1.995627421982176e-05, + "loss": 0.8841, + "step": 3366 + }, + { + "epoch": 0.6736, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.4558, + "step": 3368 + }, + { + "epoch": 0.674, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.1409, + "step": 3370 + }, + { + "epoch": 0.6744, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.2651, + "step": 3372 + }, + { + "epoch": 0.6748, + "learning_rate": 1.9961336091431725e-05, + "loss": 0.5318, + "step": 3374 + }, + { + "epoch": 0.6752, + "learning_rate": 1.996255301507125e-05, + "loss": 0.7934, + "step": 3376 + }, + { + "epoch": 0.6756, + "learning_rate": 1.9963750516203884e-05, + "loss": 0.3448, + "step": 3378 + }, + { + "epoch": 0.676, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.406, + "step": 3380 + }, + { + "epoch": 0.6764, + "learning_rate": 1.996608724164801e-05, + "loss": 0.2374, + "step": 3382 + }, + { + "epoch": 0.6768, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.2613, + "step": 3384 + }, + { + "epoch": 0.6772, + "learning_rate": 1.9968346249541848e-05, + "loss": 0.3975, + "step": 3386 + }, + { + "epoch": 0.6776, + "learning_rate": 1.996944660387867e-05, + "loss": 0.5213, + "step": 3388 + }, + { + "epoch": 0.678, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.6237, + "step": 3390 + }, + { + "epoch": 0.6784, + "learning_rate": 1.997158900260614e-05, + "loss": 0.1334, + "step": 3392 + }, + { + "epoch": 0.6788, + "learning_rate": 1.997263104282007e-05, + "loss": 0.5528, + "step": 3394 + }, + { + "epoch": 0.6792, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.1869, + "step": 3396 + }, + { + "epoch": 0.6796, + "learning_rate": 1.9974656794790777e-05, + "loss": 0.3726, + "step": 3398 + }, + { + "epoch": 0.68, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.1469, + "step": 3400 + }, + { + "epoch": 0.6804, + "learning_rate": 1.99766047623841e-05, + "loss": 0.4592, + "step": 3402 + }, + { + "epoch": 0.6808, + "learning_rate": 1.997754957226847e-05, + "loss": 0.243, + "step": 3404 + }, + { + "epoch": 0.6812, + "learning_rate": 1.9978474930409396e-05, + "loss": 0.1693, + "step": 3406 + }, + { + "epoch": 0.6816, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.8831, + "step": 3408 + }, + { + "epoch": 0.682, + "learning_rate": 1.9980267284282718e-05, + "loss": 0.6159, + "step": 3410 + }, + { + "epoch": 0.6824, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.8759, + "step": 3412 + }, + { + "epoch": 0.6828, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.1728, + "step": 3414 + }, + { + "epoch": 0.6832, + "learning_rate": 1.998280988314872e-05, + "loss": 0.6799, + "step": 3416 + }, + { + "epoch": 0.6836, + "learning_rate": 1.9983618494271825e-05, + "loss": 0.8391, + "step": 3418 + }, + { + "epoch": 0.684, + "learning_rate": 1.998440764181981e-05, + "loss": 0.3572, + "step": 3420 + }, + { + "epoch": 0.6844, + "learning_rate": 1.99851773242542e-05, + "loss": 0.1743, + "step": 3422 + }, + { + "epoch": 0.6848, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.1885, + "step": 3424 + }, + { + "epoch": 0.6852, + "learning_rate": 1.9986658287817992e-05, + "loss": 0.5103, + "step": 3426 + }, + { + "epoch": 0.6856, + "learning_rate": 1.998736956606018e-05, + "loss": 0.3958, + "step": 3428 + }, + { + "epoch": 0.686, + "learning_rate": 1.9988061373414342e-05, + "loss": 1.154, + "step": 3430 + }, + { + "epoch": 0.6864, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.2142, + "step": 3432 + }, + { + "epoch": 0.6868, + "learning_rate": 1.9989386570101712e-05, + "loss": 0.688, + "step": 3434 + }, + { + "epoch": 0.6872, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.4478, + "step": 3436 + }, + { + "epoch": 0.6876, + "learning_rate": 1.9990633867545956e-05, + "loss": 0.5574, + "step": 3438 + }, + { + "epoch": 0.688, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.4537, + "step": 3440 + }, + { + "epoch": 0.6884, + "learning_rate": 1.9991803256020393e-05, + "loss": 0.3938, + "step": 3442 + }, + { + "epoch": 0.6888, + "learning_rate": 1.999235873152047e-05, + "loss": 0.1542, + "step": 3444 + }, + { + "epoch": 0.6892, + "learning_rate": 1.9992894726405894e-05, + "loss": 0.5876, + "step": 3446 + }, + { + "epoch": 0.6896, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.6761, + "step": 3448 + }, + { + "epoch": 0.69, + "learning_rate": 1.999390827019096e-05, + "loss": 0.3152, + "step": 3450 + }, + { + "epoch": 0.6904, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.8256, + "step": 3452 + }, + { + "epoch": 0.6908, + "learning_rate": 1.999484387947177e-05, + "loss": 0.5866, + "step": 3454 + }, + { + "epoch": 0.6912, + "learning_rate": 1.9995282456369313e-05, + "loss": 1.0252, + "step": 3456 + }, + { + "epoch": 0.6916, + "learning_rate": 1.9995701546952252e-05, + "loss": 0.5242, + "step": 3458 + }, + { + "epoch": 0.692, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.766, + "step": 3460 + }, + { + "epoch": 0.6924, + "learning_rate": 1.9996481265944146e-05, + "loss": 0.3243, + "step": 3462 + }, + { + "epoch": 0.6928, + "learning_rate": 1.9996841892833e-05, + "loss": 0.3133, + "step": 3464 + }, + { + "epoch": 0.6932, + "learning_rate": 1.999718303036705e-05, + "loss": 0.4218, + "step": 3466 + }, + { + "epoch": 0.6936, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.3534, + "step": 3468 + }, + { + "epoch": 0.694, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.2749, + "step": 3470 + }, + { + "epoch": 0.6944, + "learning_rate": 1.999808950037968e-05, + "loss": 0.4349, + "step": 3472 + }, + { + "epoch": 0.6948, + "learning_rate": 1.9998352674223816e-05, + "loss": 0.3441, + "step": 3474 + }, + { + "epoch": 0.6952, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.3047, + "step": 3476 + }, + { + "epoch": 0.6956, + "learning_rate": 1.999882054453657e-05, + "loss": 0.3622, + "step": 3478 + }, + { + "epoch": 0.696, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.3609, + "step": 3480 + }, + { + "epoch": 0.6964, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.6904, + "step": 3482 + }, + { + "epoch": 0.6968, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.3592, + "step": 3484 + }, + { + "epoch": 0.6972, + "learning_rate": 1.99995223636881e-05, + "loss": 0.3775, + "step": 3486 + }, + { + "epoch": 0.6976, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.5266, + "step": 3488 + }, + { + "epoch": 0.698, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.6816, + "step": 3490 + }, + { + "epoch": 0.6984, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.2889, + "step": 3492 + }, + { + "epoch": 0.6988, + "learning_rate": 1.9999912270311376e-05, + "loss": 0.3702, + "step": 3494 + }, + { + "epoch": 0.6992, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.178, + "step": 3496 + }, + { + "epoch": 0.6996, + "learning_rate": 1.9999990252244153e-05, + "loss": 0.7471, + "step": 3498 + }, + { + "epoch": 0.7, + "learning_rate": 2e-05, + "loss": 0.2377, + "step": 3500 + }, + { + "epoch": 0.7004, + "learning_rate": 1.9999990252244153e-05, + "loss": 0.3256, + "step": 3502 + }, + { + "epoch": 0.7008, + "learning_rate": 1.9999961008995607e-05, + "loss": 0.2226, + "step": 3504 + }, + { + "epoch": 0.7012, + "learning_rate": 1.9999912270311376e-05, + "loss": 0.2917, + "step": 3506 + }, + { + "epoch": 0.7016, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.1633, + "step": 3508 + }, + { + "epoch": 0.702, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.7728, + "step": 3510 + }, + { + "epoch": 0.7024, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.1809, + "step": 3512 + }, + { + "epoch": 0.7028, + "learning_rate": 1.99995223636881e-05, + "loss": 0.3258, + "step": 3514 + }, + { + "epoch": 0.7032, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.2028, + "step": 3516 + }, + { + "epoch": 0.7036, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.438, + "step": 3518 + }, + { + "epoch": 0.704, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.515, + "step": 3520 + }, + { + "epoch": 0.7044, + "learning_rate": 1.999882054453657e-05, + "loss": 0.4518, + "step": 3522 + }, + { + "epoch": 0.7048, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.3598, + "step": 3524 + }, + { + "epoch": 0.7052, + "learning_rate": 1.9998352674223816e-05, + "loss": 0.9684, + "step": 3526 + }, + { + "epoch": 0.7056, + "learning_rate": 1.999808950037968e-05, + "loss": 0.3941, + "step": 3528 + }, + { + "epoch": 0.706, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.3803, + "step": 3530 + }, + { + "epoch": 0.7064, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.2901, + "step": 3532 + }, + { + "epoch": 0.7068, + "learning_rate": 1.999718303036705e-05, + "loss": 0.2548, + "step": 3534 + }, + { + "epoch": 0.7072, + "learning_rate": 1.9996841892833e-05, + "loss": 0.5869, + "step": 3536 + }, + { + "epoch": 0.7076, + "learning_rate": 1.9996481265944146e-05, + "loss": 0.3545, + "step": 3538 + }, + { + "epoch": 0.708, + "learning_rate": 1.9996101150403547e-05, + "loss": 0.2871, + "step": 3540 + }, + { + "epoch": 0.7084, + "learning_rate": 1.9995701546952252e-05, + "loss": 0.3907, + "step": 3542 + }, + { + "epoch": 0.7088, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.4414, + "step": 3544 + }, + { + "epoch": 0.7092, + "learning_rate": 1.9994843879471766e-05, + "loss": 0.4361, + "step": 3546 + }, + { + "epoch": 0.7096, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.2388, + "step": 3548 + }, + { + "epoch": 0.71, + "learning_rate": 1.999390827019096e-05, + "loss": 0.2895, + "step": 3550 + }, + { + "epoch": 0.7104, + "learning_rate": 1.9993411239631713e-05, + "loss": 1.0413, + "step": 3552 + }, + { + "epoch": 0.7108, + "learning_rate": 1.9992894726405898e-05, + "loss": 0.1996, + "step": 3554 + }, + { + "epoch": 0.7112, + "learning_rate": 1.999235873152047e-05, + "loss": 0.3968, + "step": 3556 + }, + { + "epoch": 0.7116, + "learning_rate": 1.9991803256020393e-05, + "loss": 0.3373, + "step": 3558 + }, + { + "epoch": 0.712, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.236, + "step": 3560 + }, + { + "epoch": 0.7124, + "learning_rate": 1.9990633867545956e-05, + "loss": 0.4169, + "step": 3562 + }, + { + "epoch": 0.7128, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.2312, + "step": 3564 + }, + { + "epoch": 0.7132, + "learning_rate": 1.9989386570101716e-05, + "loss": 0.4805, + "step": 3566 + }, + { + "epoch": 0.7136, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.55, + "step": 3568 + }, + { + "epoch": 0.714, + "learning_rate": 1.9988061373414342e-05, + "loss": 0.3489, + "step": 3570 + }, + { + "epoch": 0.7144, + "learning_rate": 1.998736956606018e-05, + "loss": 0.8761, + "step": 3572 + }, + { + "epoch": 0.7148, + "learning_rate": 1.998665828781799e-05, + "loss": 0.3442, + "step": 3574 + }, + { + "epoch": 0.7152, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.6211, + "step": 3576 + }, + { + "epoch": 0.7156, + "learning_rate": 1.99851773242542e-05, + "loss": 0.3727, + "step": 3578 + }, + { + "epoch": 0.716, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.1953, + "step": 3580 + }, + { + "epoch": 0.7164, + "learning_rate": 1.9983618494271825e-05, + "loss": 0.4018, + "step": 3582 + }, + { + "epoch": 0.7168, + "learning_rate": 1.998280988314872e-05, + "loss": 0.1557, + "step": 3584 + }, + { + "epoch": 0.7172, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.2449, + "step": 3586 + }, + { + "epoch": 0.7176, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.2847, + "step": 3588 + }, + { + "epoch": 0.718, + "learning_rate": 1.9980267284282718e-05, + "loss": 0.1784, + "step": 3590 + }, + { + "epoch": 0.7184, + "learning_rate": 1.9979380835002846e-05, + "loss": 1.1533, + "step": 3592 + }, + { + "epoch": 0.7188, + "learning_rate": 1.9978474930409396e-05, + "loss": 0.4813, + "step": 3594 + }, + { + "epoch": 0.7192, + "learning_rate": 1.9977549572268467e-05, + "loss": 0.5766, + "step": 3596 + }, + { + "epoch": 0.7196, + "learning_rate": 1.99766047623841e-05, + "loss": 0.6874, + "step": 3598 + }, + { + "epoch": 0.72, + "learning_rate": 1.9975640502598246e-05, + "loss": 0.2518, + "step": 3600 + }, + { + "epoch": 0.7204, + "learning_rate": 1.9974656794790777e-05, + "loss": 0.2797, + "step": 3602 + }, + { + "epoch": 0.7208, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.1849, + "step": 3604 + }, + { + "epoch": 0.7212, + "learning_rate": 1.9972631042820074e-05, + "loss": 0.2053, + "step": 3606 + }, + { + "epoch": 0.7216, + "learning_rate": 1.997158900260614e-05, + "loss": 0.6522, + "step": 3608 + }, + { + "epoch": 0.722, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.2018, + "step": 3610 + }, + { + "epoch": 0.7224, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.5973, + "step": 3612 + }, + { + "epoch": 0.7228, + "learning_rate": 1.9968346249541848e-05, + "loss": 0.2925, + "step": 3614 + }, + { + "epoch": 0.7232, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.3656, + "step": 3616 + }, + { + "epoch": 0.7236, + "learning_rate": 1.996608724164801e-05, + "loss": 0.6096, + "step": 3618 + }, + { + "epoch": 0.724, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.5893, + "step": 3620 + }, + { + "epoch": 0.7244, + "learning_rate": 1.9963750516203887e-05, + "loss": 0.2738, + "step": 3622 + }, + { + "epoch": 0.7248, + "learning_rate": 1.996255301507125e-05, + "loss": 0.4979, + "step": 3624 + }, + { + "epoch": 0.7252, + "learning_rate": 1.9961336091431728e-05, + "loss": 0.4536, + "step": 3626 + }, + { + "epoch": 0.7256, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.9271, + "step": 3628 + }, + { + "epoch": 0.726, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.3412, + "step": 3630 + }, + { + "epoch": 0.7264, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.3404, + "step": 3632 + }, + { + "epoch": 0.7268, + "learning_rate": 1.995627421982176e-05, + "loss": 0.7596, + "step": 3634 + }, + { + "epoch": 0.7272, + "learning_rate": 1.995496021999177e-05, + "loss": 0.2768, + "step": 3636 + }, + { + "epoch": 0.7276, + "learning_rate": 1.995362681245744e-05, + "loss": 0.9415, + "step": 3638 + }, + { + "epoch": 0.728, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.6654, + "step": 3640 + }, + { + "epoch": 0.7284, + "learning_rate": 1.9950901784711768e-05, + "loss": 0.6587, + "step": 3642 + }, + { + "epoch": 0.7288, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.2327, + "step": 3644 + }, + { + "epoch": 0.7292, + "learning_rate": 1.994809915783505e-05, + "loss": 0.3901, + "step": 3646 + }, + { + "epoch": 0.7296, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.429, + "step": 3648 + }, + { + "epoch": 0.73, + "learning_rate": 1.9945218953682736e-05, + "loss": 1.0547, + "step": 3650 + }, + { + "epoch": 0.7304, + "learning_rate": 1.994374976712348e-05, + "loss": 0.2893, + "step": 3652 + }, + { + "epoch": 0.7308, + "learning_rate": 1.9942261194715236e-05, + "loss": 0.4064, + "step": 3654 + }, + { + "epoch": 0.7312, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.1817, + "step": 3656 + }, + { + "epoch": 0.7316, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.8632, + "step": 3658 + }, + { + "epoch": 0.732, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.374, + "step": 3660 + }, + { + "epoch": 0.7324, + "learning_rate": 1.993611310520009e-05, + "loss": 0.8245, + "step": 3662 + }, + { + "epoch": 0.7328, + "learning_rate": 1.993452764783328e-05, + "loss": 0.2846, + "step": 3664 + }, + { + "epoch": 0.7332, + "learning_rate": 1.993292282259647e-05, + "loss": 0.5571, + "step": 3666 + }, + { + "epoch": 0.7336, + "learning_rate": 1.9931298632618352e-05, + "loss": 0.3411, + "step": 3668 + }, + { + "epoch": 0.734, + "learning_rate": 1.9929655081065373e-05, + "loss": 0.3066, + "step": 3670 + }, + { + "epoch": 0.7344, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.356, + "step": 3672 + }, + { + "epoch": 0.7348, + "learning_rate": 1.992630990608929e-05, + "loss": 0.3193, + "step": 3674 + }, + { + "epoch": 0.7352, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.2007, + "step": 3676 + }, + { + "epoch": 0.7356, + "learning_rate": 1.992288732375458e-05, + "loss": 0.4266, + "step": 3678 + }, + { + "epoch": 0.736, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.732, + "step": 3680 + }, + { + "epoch": 0.7364, + "learning_rate": 1.9919387360751216e-05, + "loss": 0.4391, + "step": 3682 + }, + { + "epoch": 0.7368, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.2738, + "step": 3684 + }, + { + "epoch": 0.7372, + "learning_rate": 1.991581004437262e-05, + "loss": 0.4109, + "step": 3686 + }, + { + "epoch": 0.7376, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.4999, + "step": 3688 + }, + { + "epoch": 0.738, + "learning_rate": 1.991215540251542e-05, + "loss": 0.2641, + "step": 3690 + }, + { + "epoch": 0.7384, + "learning_rate": 1.9910299093414932e-05, + "loss": 0.5075, + "step": 3692 + }, + { + "epoch": 0.7388, + "learning_rate": 1.9908423463679246e-05, + "loss": 0.202, + "step": 3694 + }, + { + "epoch": 0.7392, + "learning_rate": 1.990652851696501e-05, + "loss": 0.1682, + "step": 3696 + }, + { + "epoch": 0.7396, + "learning_rate": 1.9904614256966517e-05, + "loss": 0.5852, + "step": 3698 + }, + { + "epoch": 0.74, + "learning_rate": 1.9902680687415704e-05, + "loss": 2.1427, + "step": 3700 + }, + { + "epoch": 0.7404, + "learning_rate": 1.9900727812082177e-05, + "loss": 0.2434, + "step": 3702 + }, + { + "epoch": 0.7408, + "learning_rate": 1.9898755634773155e-05, + "loss": 0.2879, + "step": 3704 + }, + { + "epoch": 0.7412, + "learning_rate": 1.9896764159333518e-05, + "loss": 0.3001, + "step": 3706 + }, + { + "epoch": 0.7416, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.2511, + "step": 3708 + }, + { + "epoch": 0.742, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.2972, + "step": 3710 + }, + { + "epoch": 0.7424, + "learning_rate": 1.9890673983243704e-05, + "loss": 0.4436, + "step": 3712 + }, + { + "epoch": 0.7428, + "learning_rate": 1.9888605354482494e-05, + "loss": 0.3948, + "step": 3714 + }, + { + "epoch": 0.7432, + "learning_rate": 1.9886517447379143e-05, + "loss": 0.3562, + "step": 3716 + }, + { + "epoch": 0.7436, + "learning_rate": 1.9884410266004134e-05, + "loss": 0.1311, + "step": 3718 + }, + { + "epoch": 0.744, + "learning_rate": 1.988228381446553e-05, + "loss": 0.3712, + "step": 3720 + }, + { + "epoch": 0.7444, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.7064, + "step": 3722 + }, + { + "epoch": 0.7448, + "learning_rate": 1.987797311751759e-05, + "loss": 0.5662, + "step": 3724 + }, + { + "epoch": 0.7452, + "learning_rate": 1.9875788880512183e-05, + "loss": 0.5175, + "step": 3726 + }, + { + "epoch": 0.7456, + "learning_rate": 1.9873585390151007e-05, + "loss": 0.3971, + "step": 3728 + }, + { + "epoch": 0.746, + "learning_rate": 1.987136265072988e-05, + "loss": 0.5853, + "step": 3730 + }, + { + "epoch": 0.7464, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.8736, + "step": 3732 + }, + { + "epoch": 0.7468, + "learning_rate": 1.9866859442078685e-05, + "loss": 0.1814, + "step": 3734 + }, + { + "epoch": 0.7472, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.3798, + "step": 3736 + }, + { + "epoch": 0.7476, + "learning_rate": 1.986227928967551e-05, + "loss": 0.3681, + "step": 3738 + }, + { + "epoch": 0.748, + "learning_rate": 1.985996037070505e-05, + "loss": 0.3748, + "step": 3740 + }, + { + "epoch": 0.7484, + "learning_rate": 1.985762222923732e-05, + "loss": 0.1893, + "step": 3742 + }, + { + "epoch": 0.7488, + "learning_rate": 1.985526486983063e-05, + "loss": 0.2361, + "step": 3744 + }, + { + "epoch": 0.7492, + "learning_rate": 1.985288829708079e-05, + "loss": 0.2941, + "step": 3746 + }, + { + "epoch": 0.7496, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.4058, + "step": 3748 + }, + { + "epoch": 0.75, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.3607, + "step": 3750 + }, + { + "epoch": 0.7504, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.2806, + "step": 3752 + }, + { + "epoch": 0.7508, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.4488, + "step": 3754 + }, + { + "epoch": 0.7512, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.4541, + "step": 3756 + }, + { + "epoch": 0.7516, + "learning_rate": 1.983822564245833e-05, + "loss": 0.1691, + "step": 3758 + }, + { + "epoch": 0.752, + "learning_rate": 1.983571470813386e-05, + "loss": 0.2956, + "step": 3760 + }, + { + "epoch": 0.7524, + "learning_rate": 1.9833184598580276e-05, + "loss": 0.682, + "step": 3762 + }, + { + "epoch": 0.7528, + "learning_rate": 1.983063531873016e-05, + "loss": 0.3372, + "step": 3764 + }, + { + "epoch": 0.7532, + "learning_rate": 1.982806687355345e-05, + "loss": 0.2437, + "step": 3766 + }, + { + "epoch": 0.7536, + "learning_rate": 1.982547926805747e-05, + "loss": 0.3936, + "step": 3768 + }, + { + "epoch": 0.754, + "learning_rate": 1.982287250728689e-05, + "loss": 0.2227, + "step": 3770 + }, + { + "epoch": 0.7544, + "learning_rate": 1.9820246596323724e-05, + "loss": 1.0594, + "step": 3772 + }, + { + "epoch": 0.7548, + "learning_rate": 1.981760154028731e-05, + "loss": 0.3343, + "step": 3774 + }, + { + "epoch": 0.7552, + "learning_rate": 1.981493734433433e-05, + "loss": 0.4314, + "step": 3776 + }, + { + "epoch": 0.7556, + "learning_rate": 1.9812254013658773e-05, + "loss": 0.6609, + "step": 3778 + }, + { + "epoch": 0.756, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.6671, + "step": 3780 + }, + { + "epoch": 0.7564, + "learning_rate": 1.9806829969102356e-05, + "loss": 0.2766, + "step": 3782 + }, + { + "epoch": 0.7568, + "learning_rate": 1.9804089265795963e-05, + "loss": 0.2916, + "step": 3784 + }, + { + "epoch": 0.7572, + "learning_rate": 1.9801329448915863e-05, + "loss": 0.2167, + "step": 3786 + }, + { + "epoch": 0.7576, + "learning_rate": 1.979855052384247e-05, + "loss": 0.2376, + "step": 3788 + }, + { + "epoch": 0.758, + "learning_rate": 1.979575249599344e-05, + "loss": 0.3167, + "step": 3790 + }, + { + "epoch": 0.7584, + "learning_rate": 1.979293537082368e-05, + "loss": 0.4623, + "step": 3792 + }, + { + "epoch": 0.7588, + "learning_rate": 1.9790099153825303e-05, + "loss": 0.6544, + "step": 3794 + }, + { + "epoch": 0.7592, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.266, + "step": 3796 + }, + { + "epoch": 0.7596, + "learning_rate": 1.978436946649733e-05, + "loss": 0.2438, + "step": 3798 + }, + { + "epoch": 0.76, + "learning_rate": 1.978147600733806e-05, + "loss": 0.2665, + "step": 3800 + }, + { + "epoch": 0.7604, + "learning_rate": 1.9778563478690793e-05, + "loss": 0.473, + "step": 3802 + }, + { + "epoch": 0.7608, + "learning_rate": 1.977563188623365e-05, + "loss": 0.168, + "step": 3804 + }, + { + "epoch": 0.7612, + "learning_rate": 1.977268123568194e-05, + "loss": 0.5511, + "step": 3806 + }, + { + "epoch": 0.7616, + "learning_rate": 1.9769711532788086e-05, + "loss": 0.243, + "step": 3808 + }, + { + "epoch": 0.762, + "learning_rate": 1.9766722783341682e-05, + "loss": 0.5393, + "step": 3810 + }, + { + "epoch": 0.7624, + "learning_rate": 1.9763714993169448e-05, + "loss": 0.2495, + "step": 3812 + }, + { + "epoch": 0.7628, + "learning_rate": 1.9760688168135236e-05, + "loss": 0.6578, + "step": 3814 + }, + { + "epoch": 0.7632, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.7466, + "step": 3816 + }, + { + "epoch": 0.7636, + "learning_rate": 1.9754577437121733e-05, + "loss": 0.47, + "step": 3818 + }, + { + "epoch": 0.764, + "learning_rate": 1.9751493543055638e-05, + "loss": 0.3776, + "step": 3820 + }, + { + "epoch": 0.7644, + "learning_rate": 1.974839063795389e-05, + "loss": 0.6388, + "step": 3822 + }, + { + "epoch": 0.7648, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.2455, + "step": 3824 + }, + { + "epoch": 0.7652, + "learning_rate": 1.97421278188776e-05, + "loss": 0.686, + "step": 3826 + }, + { + "epoch": 0.7656, + "learning_rate": 1.973896791711276e-05, + "loss": 0.3503, + "step": 3828 + }, + { + "epoch": 0.766, + "learning_rate": 1.9735789028731607e-05, + "loss": 0.4005, + "step": 3830 + }, + { + "epoch": 0.7664, + "learning_rate": 1.9732591159931567e-05, + "loss": 0.4274, + "step": 3832 + }, + { + "epoch": 0.7668, + "learning_rate": 1.9729374316947037e-05, + "loss": 0.523, + "step": 3834 + }, + { + "epoch": 0.7672, + "learning_rate": 1.972613850604944e-05, + "loss": 0.3672, + "step": 3836 + }, + { + "epoch": 0.7676, + "learning_rate": 1.972288373354713e-05, + "loss": 0.2974, + "step": 3838 + }, + { + "epoch": 0.768, + "learning_rate": 1.9719610005785463e-05, + "loss": 0.6598, + "step": 3840 + }, + { + "epoch": 0.7684, + "learning_rate": 1.9716317329146743e-05, + "loss": 0.2017, + "step": 3842 + }, + { + "epoch": 0.7688, + "learning_rate": 1.9713005710050206e-05, + "loss": 0.4016, + "step": 3844 + }, + { + "epoch": 0.7692, + "learning_rate": 1.9709675154952017e-05, + "loss": 0.6506, + "step": 3846 + }, + { + "epoch": 0.7696, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.4656, + "step": 3848 + }, + { + "epoch": 0.77, + "learning_rate": 1.970295726275997e-05, + "loss": 0.3334, + "step": 3850 + }, + { + "epoch": 0.7704, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.16, + "step": 3852 + }, + { + "epoch": 0.7708, + "learning_rate": 1.969616370495806e-05, + "loss": 0.403, + "step": 3854 + }, + { + "epoch": 0.7712, + "learning_rate": 1.969273856798586e-05, + "loss": 0.7375, + "step": 3856 + }, + { + "epoch": 0.7716, + "learning_rate": 1.9689294534523836e-05, + "loss": 0.5612, + "step": 3858 + }, + { + "epoch": 0.772, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.1526, + "step": 3860 + }, + { + "epoch": 0.7724, + "learning_rate": 1.9682349805024447e-05, + "loss": 0.4092, + "step": 3862 + }, + { + "epoch": 0.7728, + "learning_rate": 1.9678849122526195e-05, + "loss": 0.1654, + "step": 3864 + }, + { + "epoch": 0.7732, + "learning_rate": 1.9675329570616302e-05, + "loss": 0.3381, + "step": 3866 + }, + { + "epoch": 0.7736, + "learning_rate": 1.967179115615633e-05, + "loss": 0.342, + "step": 3868 + }, + { + "epoch": 0.774, + "learning_rate": 1.966823388604459e-05, + "loss": 0.3977, + "step": 3870 + }, + { + "epoch": 0.7744, + "learning_rate": 1.966465776721618e-05, + "loss": 0.3547, + "step": 3872 + }, + { + "epoch": 0.7748, + "learning_rate": 1.9661062806642906e-05, + "loss": 0.3208, + "step": 3874 + }, + { + "epoch": 0.7752, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.463, + "step": 3876 + }, + { + "epoch": 0.7756, + "learning_rate": 1.9653816388332743e-05, + "loss": 0.268, + "step": 3878 + }, + { + "epoch": 0.776, + "learning_rate": 1.965016494472312e-05, + "loss": 0.3971, + "step": 3880 + }, + { + "epoch": 0.7764, + "learning_rate": 1.964649468762313e-05, + "loss": 0.2727, + "step": 3882 + }, + { + "epoch": 0.7768, + "learning_rate": 1.964280562418815e-05, + "loss": 0.2382, + "step": 3884 + }, + { + "epoch": 0.7772, + "learning_rate": 1.963909776161018e-05, + "loss": 0.2968, + "step": 3886 + }, + { + "epoch": 0.7776, + "learning_rate": 1.963537110711789e-05, + "loss": 0.3061, + "step": 3888 + }, + { + "epoch": 0.778, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.3788, + "step": 3890 + }, + { + "epoch": 0.7784, + "learning_rate": 1.9627861451488194e-05, + "loss": 0.4671, + "step": 3892 + }, + { + "epoch": 0.7788, + "learning_rate": 1.9624078464991246e-05, + "loss": 0.1623, + "step": 3894 + }, + { + "epoch": 0.7792, + "learning_rate": 1.962027671586086e-05, + "loss": 0.2169, + "step": 3896 + }, + { + "epoch": 0.7796, + "learning_rate": 1.9616456211508756e-05, + "loss": 0.2837, + "step": 3898 + }, + { + "epoch": 0.78, + "learning_rate": 1.9612616959383194e-05, + "loss": 0.4458, + "step": 3900 + }, + { + "epoch": 0.7804, + "learning_rate": 1.9608758966968987e-05, + "loss": 0.3534, + "step": 3902 + }, + { + "epoch": 0.7808, + "learning_rate": 1.96048822417875e-05, + "loss": 0.6948, + "step": 3904 + }, + { + "epoch": 0.7812, + "learning_rate": 1.9600986791396597e-05, + "loss": 0.3448, + "step": 3906 + }, + { + "epoch": 0.7816, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.5059, + "step": 3908 + }, + { + "epoch": 0.782, + "learning_rate": 1.9593139745400578e-05, + "loss": 0.1454, + "step": 3910 + }, + { + "epoch": 0.7824, + "learning_rate": 1.9589188165093666e-05, + "loss": 0.9051, + "step": 3912 + }, + { + "epoch": 0.7828, + "learning_rate": 1.9585217890173765e-05, + "loss": 0.3168, + "step": 3914 + }, + { + "epoch": 0.7832, + "learning_rate": 1.95812289283811e-05, + "loss": 0.3584, + "step": 3916 + }, + { + "epoch": 0.7836, + "learning_rate": 1.957722128749237e-05, + "loss": 0.435, + "step": 3918 + }, + { + "epoch": 0.784, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.3508, + "step": 3920 + }, + { + "epoch": 0.7844, + "learning_rate": 1.9569149999715518e-05, + "loss": 0.6136, + "step": 3922 + }, + { + "epoch": 0.7848, + "learning_rate": 1.9565086368562784e-05, + "loss": 0.4244, + "step": 3924 + }, + { + "epoch": 0.7852, + "learning_rate": 1.9561004089784722e-05, + "loss": 0.314, + "step": 3926 + }, + { + "epoch": 0.7856, + "learning_rate": 1.9556903171339966e-05, + "loss": 0.525, + "step": 3928 + }, + { + "epoch": 0.786, + "learning_rate": 1.955278362122344e-05, + "loss": 0.1512, + "step": 3930 + }, + { + "epoch": 0.7864, + "learning_rate": 1.954864544746643e-05, + "loss": 0.2557, + "step": 3932 + }, + { + "epoch": 0.7868, + "learning_rate": 1.954448865813652e-05, + "loss": 0.617, + "step": 3934 + }, + { + "epoch": 0.7872, + "learning_rate": 1.9540313261337585e-05, + "loss": 0.3144, + "step": 3936 + }, + { + "epoch": 0.7876, + "learning_rate": 1.9536119265209763e-05, + "loss": 0.1951, + "step": 3938 + }, + { + "epoch": 0.788, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.2585, + "step": 3940 + }, + { + "epoch": 0.7884, + "learning_rate": 1.9527675507709364e-05, + "loss": 0.2495, + "step": 3942 + }, + { + "epoch": 0.7888, + "learning_rate": 1.9523425762798335e-05, + "loss": 0.4248, + "step": 3944 + }, + { + "epoch": 0.7892, + "learning_rate": 1.9519157451481456e-05, + "loss": 0.2485, + "step": 3946 + }, + { + "epoch": 0.7896, + "learning_rate": 1.9514870582080035e-05, + "loss": 0.5991, + "step": 3948 + }, + { + "epoch": 0.79, + "learning_rate": 1.9510565162951545e-05, + "loss": 0.473, + "step": 3950 + }, + { + "epoch": 0.7904, + "learning_rate": 1.95062412024896e-05, + "loss": 0.5128, + "step": 3952 + }, + { + "epoch": 0.7908, + "learning_rate": 1.950189870912401e-05, + "loss": 0.2812, + "step": 3954 + }, + { + "epoch": 0.7912, + "learning_rate": 1.9497537691320667e-05, + "loss": 0.283, + "step": 3956 + }, + { + "epoch": 0.7916, + "learning_rate": 1.9493158157581617e-05, + "loss": 0.3539, + "step": 3958 + }, + { + "epoch": 0.792, + "learning_rate": 1.948876011644497e-05, + "loss": 0.3928, + "step": 3960 + }, + { + "epoch": 0.7924, + "learning_rate": 1.948434357648493e-05, + "loss": 0.5356, + "step": 3962 + }, + { + "epoch": 0.7928, + "learning_rate": 1.9479908546311787e-05, + "loss": 0.1452, + "step": 3964 + }, + { + "epoch": 0.7932, + "learning_rate": 1.9475455034571843e-05, + "loss": 0.3331, + "step": 3966 + }, + { + "epoch": 0.7936, + "learning_rate": 1.9470983049947443e-05, + "loss": 1.0376, + "step": 3968 + }, + { + "epoch": 0.794, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.2996, + "step": 3970 + }, + { + "epoch": 0.7944, + "learning_rate": 1.9461983696954767e-05, + "loss": 0.4756, + "step": 3972 + }, + { + "epoch": 0.7948, + "learning_rate": 1.9457456346131175e-05, + "loss": 0.9092, + "step": 3974 + }, + { + "epoch": 0.7952, + "learning_rate": 1.9452910557512494e-05, + "loss": 0.1543, + "step": 3976 + }, + { + "epoch": 0.7956, + "learning_rate": 1.9448346339960984e-05, + "loss": 0.2494, + "step": 3978 + }, + { + "epoch": 0.796, + "learning_rate": 1.9443763702374818e-05, + "loss": 0.3534, + "step": 3980 + }, + { + "epoch": 0.7964, + "learning_rate": 1.9439162653688066e-05, + "loss": 0.4852, + "step": 3982 + }, + { + "epoch": 0.7968, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.1804, + "step": 3984 + }, + { + "epoch": 0.7972, + "learning_rate": 1.9429905358928655e-05, + "loss": 0.23, + "step": 3986 + }, + { + "epoch": 0.7976, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.395, + "step": 3988 + }, + { + "epoch": 0.798, + "learning_rate": 1.942057452787297e-05, + "loss": 0.4776, + "step": 3990 + }, + { + "epoch": 0.7984, + "learning_rate": 1.94158815589503e-05, + "loss": 0.2685, + "step": 3992 + }, + { + "epoch": 0.7988, + "learning_rate": 1.941117023328473e-05, + "loss": 0.8614, + "step": 3994 + }, + { + "epoch": 0.7992, + "learning_rate": 1.940644056006122e-05, + "loss": 0.164, + "step": 3996 + }, + { + "epoch": 0.7996, + "learning_rate": 1.94016925485005e-05, + "loss": 0.6664, + "step": 3998 + }, + { + "epoch": 0.8, + "learning_rate": 1.939692620785909e-05, + "loss": 1.0899, + "step": 4000 + }, + { + "epoch": 0.8004, + "learning_rate": 1.939214154742919e-05, + "loss": 0.241, + "step": 4002 + }, + { + "epoch": 0.8008, + "learning_rate": 1.9387338576538746e-05, + "loss": 0.3113, + "step": 4004 + }, + { + "epoch": 0.8012, + "learning_rate": 1.9382517304551393e-05, + "loss": 0.3109, + "step": 4006 + }, + { + "epoch": 0.8016, + "learning_rate": 1.9377677740866464e-05, + "loss": 0.3024, + "step": 4008 + }, + { + "epoch": 0.802, + "learning_rate": 1.9372819894918922e-05, + "loss": 0.3257, + "step": 4010 + }, + { + "epoch": 0.8024, + "learning_rate": 1.936794377617938e-05, + "loss": 0.2264, + "step": 4012 + }, + { + "epoch": 0.8028, + "learning_rate": 1.9363049394154102e-05, + "loss": 0.3203, + "step": 4014 + }, + { + "epoch": 0.8032, + "learning_rate": 1.9358136758384917e-05, + "loss": 0.1769, + "step": 4016 + }, + { + "epoch": 0.8036, + "learning_rate": 1.935320587844926e-05, + "loss": 0.8297, + "step": 4018 + }, + { + "epoch": 0.804, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.2152, + "step": 4020 + }, + { + "epoch": 0.8044, + "learning_rate": 1.934328942456613e-05, + "loss": 0.2592, + "step": 4022 + }, + { + "epoch": 0.8048, + "learning_rate": 1.9338303869951273e-05, + "loss": 0.4668, + "step": 4024 + }, + { + "epoch": 0.8052, + "learning_rate": 1.9333300109835186e-05, + "loss": 0.2541, + "step": 4026 + }, + { + "epoch": 0.8056, + "learning_rate": 1.9328278153972943e-05, + "loss": 0.364, + "step": 4028 + }, + { + "epoch": 0.806, + "learning_rate": 1.9323238012155125e-05, + "loss": 0.1516, + "step": 4030 + }, + { + "epoch": 0.8064, + "learning_rate": 1.931817969420773e-05, + "loss": 0.3028, + "step": 4032 + }, + { + "epoch": 0.8068, + "learning_rate": 1.93131032099922e-05, + "loss": 0.4675, + "step": 4034 + }, + { + "epoch": 0.8072, + "learning_rate": 1.930800856940543e-05, + "loss": 0.2212, + "step": 4036 + }, + { + "epoch": 0.8076, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.177, + "step": 4038 + }, + { + "epoch": 0.808, + "learning_rate": 1.929776485888252e-05, + "loss": 0.3425, + "step": 4040 + }, + { + "epoch": 0.8084, + "learning_rate": 1.9292615808917024e-05, + "loss": 0.4274, + "step": 4042 + }, + { + "epoch": 0.8088, + "learning_rate": 1.9287448642521517e-05, + "loss": 0.4591, + "step": 4044 + }, + { + "epoch": 0.8092, + "learning_rate": 1.9282263369769637e-05, + "loss": 0.1999, + "step": 4046 + }, + { + "epoch": 0.8096, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.984, + "step": 4048 + }, + { + "epoch": 0.81, + "learning_rate": 1.927183854566788e-05, + "loss": 0.3304, + "step": 4050 + }, + { + "epoch": 0.8104, + "learning_rate": 1.9266599014641727e-05, + "loss": 0.3851, + "step": 4052 + }, + { + "epoch": 0.8108, + "learning_rate": 1.9261341417906622e-05, + "loss": 0.2367, + "step": 4054 + }, + { + "epoch": 0.8112, + "learning_rate": 1.925606576571252e-05, + "loss": 0.6992, + "step": 4056 + }, + { + "epoch": 0.8116, + "learning_rate": 1.925077206834459e-05, + "loss": 0.4852, + "step": 4058 + }, + { + "epoch": 0.812, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.3408, + "step": 4060 + }, + { + "epoch": 0.8124, + "learning_rate": 1.924013057940367e-05, + "loss": 0.2963, + "step": 4062 + }, + { + "epoch": 0.8128, + "learning_rate": 1.923478280857682e-05, + "loss": 0.3995, + "step": 4064 + }, + { + "epoch": 0.8132, + "learning_rate": 1.922941703406836e-05, + "loss": 0.298, + "step": 4066 + }, + { + "epoch": 0.8136, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.2862, + "step": 4068 + }, + { + "epoch": 0.814, + "learning_rate": 1.9218631515885007e-05, + "loss": 0.4487, + "step": 4070 + }, + { + "epoch": 0.8144, + "learning_rate": 1.9213211793237066e-05, + "loss": 0.1839, + "step": 4072 + }, + { + "epoch": 0.8148, + "learning_rate": 1.9207774108961276e-05, + "loss": 0.2285, + "step": 4074 + }, + { + "epoch": 0.8152, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.8615, + "step": 4076 + }, + { + "epoch": 0.8156, + "learning_rate": 1.9196844897965387e-05, + "loss": 0.165, + "step": 4078 + }, + { + "epoch": 0.816, + "learning_rate": 1.919135339255235e-05, + "loss": 0.242, + "step": 4080 + }, + { + "epoch": 0.8164, + "learning_rate": 1.9185843968125546e-05, + "loss": 0.1812, + "step": 4082 + }, + { + "epoch": 0.8168, + "learning_rate": 1.918031663542588e-05, + "loss": 0.3765, + "step": 4084 + }, + { + "epoch": 0.8172, + "learning_rate": 1.917477140522919e-05, + "loss": 0.501, + "step": 4086 + }, + { + "epoch": 0.8176, + "learning_rate": 1.916920828834617e-05, + "loss": 0.328, + "step": 4088 + }, + { + "epoch": 0.818, + "learning_rate": 1.9163627295622394e-05, + "loss": 0.2432, + "step": 4090 + }, + { + "epoch": 0.8184, + "learning_rate": 1.9158028437938313e-05, + "loss": 0.469, + "step": 4092 + }, + { + "epoch": 0.8188, + "learning_rate": 1.9152411726209183e-05, + "loss": 0.673, + "step": 4094 + }, + { + "epoch": 0.8192, + "learning_rate": 1.9146777171385057e-05, + "loss": 0.4013, + "step": 4096 + }, + { + "epoch": 0.8196, + "learning_rate": 1.914112478445079e-05, + "loss": 0.2116, + "step": 4098 + }, + { + "epoch": 0.82, + "learning_rate": 1.913545457642601e-05, + "loss": 0.1591, + "step": 4100 + }, + { + "epoch": 0.8204, + "learning_rate": 1.9129766558365082e-05, + "loss": 0.2063, + "step": 4102 + }, + { + "epoch": 0.8208, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.327, + "step": 4104 + }, + { + "epoch": 0.8212, + "learning_rate": 1.911833713652576e-05, + "loss": 0.9468, + "step": 4106 + }, + { + "epoch": 0.8216, + "learning_rate": 1.911259575502963e-05, + "loss": 0.3993, + "step": 4108 + }, + { + "epoch": 0.822, + "learning_rate": 1.9106836608061775e-05, + "loss": 0.6771, + "step": 4110 + }, + { + "epoch": 0.8224, + "learning_rate": 1.910105970684996e-05, + "loss": 0.2673, + "step": 4112 + }, + { + "epoch": 0.8228, + "learning_rate": 1.909526506265654e-05, + "loss": 0.3472, + "step": 4114 + }, + { + "epoch": 0.8232, + "learning_rate": 1.908945268677849e-05, + "loss": 0.3008, + "step": 4116 + }, + { + "epoch": 0.8236, + "learning_rate": 1.9083622590547313e-05, + "loss": 1.0971, + "step": 4118 + }, + { + "epoch": 0.824, + "learning_rate": 1.9077774785329085e-05, + "loss": 0.4168, + "step": 4120 + }, + { + "epoch": 0.8244, + "learning_rate": 1.9071909282524422e-05, + "loss": 0.4436, + "step": 4122 + }, + { + "epoch": 0.8248, + "learning_rate": 1.9066026093568383e-05, + "loss": 0.2096, + "step": 4124 + }, + { + "epoch": 0.8252, + "learning_rate": 1.9060125229930576e-05, + "loss": 0.4834, + "step": 4126 + }, + { + "epoch": 0.8256, + "learning_rate": 1.9054206703115013e-05, + "loss": 0.1804, + "step": 4128 + }, + { + "epoch": 0.826, + "learning_rate": 1.9048270524660203e-05, + "loss": 0.3803, + "step": 4130 + }, + { + "epoch": 0.8264, + "learning_rate": 1.9042316706138994e-05, + "loss": 0.2355, + "step": 4132 + }, + { + "epoch": 0.8268, + "learning_rate": 1.9036345259158664e-05, + "loss": 0.3365, + "step": 4134 + }, + { + "epoch": 0.8272, + "learning_rate": 1.903035619536087e-05, + "loss": 0.4663, + "step": 4136 + }, + { + "epoch": 0.8276, + "learning_rate": 1.9024349526421603e-05, + "loss": 0.413, + "step": 4138 + }, + { + "epoch": 0.828, + "learning_rate": 1.901832526405114e-05, + "loss": 0.3424, + "step": 4140 + }, + { + "epoch": 0.8284, + "learning_rate": 1.9012283419994112e-05, + "loss": 0.682, + "step": 4142 + }, + { + "epoch": 0.8288, + "learning_rate": 1.9006224006029414e-05, + "loss": 0.176, + "step": 4144 + }, + { + "epoch": 0.8292, + "learning_rate": 1.9000147033970148e-05, + "loss": 0.262, + "step": 4146 + }, + { + "epoch": 0.8296, + "learning_rate": 1.899405251566371e-05, + "loss": 0.3485, + "step": 4148 + }, + { + "epoch": 0.83, + "learning_rate": 1.8987940462991666e-05, + "loss": 0.3318, + "step": 4150 + }, + { + "epoch": 0.8304, + "learning_rate": 1.8981810887869797e-05, + "loss": 0.5296, + "step": 4152 + }, + { + "epoch": 0.8308, + "learning_rate": 1.8975663802247978e-05, + "loss": 0.1958, + "step": 4154 + }, + { + "epoch": 0.8312, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.4368, + "step": 4156 + }, + { + "epoch": 0.8316, + "learning_rate": 1.8963317147474943e-05, + "loss": 0.4005, + "step": 4158 + }, + { + "epoch": 0.832, + "learning_rate": 1.8957117602394133e-05, + "loss": 0.3331, + "step": 4160 + }, + { + "epoch": 0.8324, + "learning_rate": 1.8950900594954233e-05, + "loss": 0.4985, + "step": 4162 + }, + { + "epoch": 0.8328, + "learning_rate": 1.8944666137275596e-05, + "loss": 0.7635, + "step": 4164 + }, + { + "epoch": 0.8332, + "learning_rate": 1.8938414241512644e-05, + "loss": 0.4494, + "step": 4166 + }, + { + "epoch": 0.8336, + "learning_rate": 1.8932144919853744e-05, + "loss": 0.2144, + "step": 4168 + }, + { + "epoch": 0.834, + "learning_rate": 1.892585818452125e-05, + "loss": 0.2754, + "step": 4170 + }, + { + "epoch": 0.8344, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.2023, + "step": 4172 + }, + { + "epoch": 0.8348, + "learning_rate": 1.891323252189474e-05, + "loss": 0.2307, + "step": 4174 + }, + { + "epoch": 0.8352, + "learning_rate": 1.890689361921507e-05, + "loss": 0.3971, + "step": 4176 + }, + { + "epoch": 0.8356, + "learning_rate": 1.8900537352090523e-05, + "loss": 0.1508, + "step": 4178 + }, + { + "epoch": 0.836, + "learning_rate": 1.8894163732912986e-05, + "loss": 0.2087, + "step": 4180 + }, + { + "epoch": 0.8364, + "learning_rate": 1.8887772774108122e-05, + "loss": 0.2302, + "step": 4182 + }, + { + "epoch": 0.8368, + "learning_rate": 1.8881364488135445e-05, + "loss": 0.3505, + "step": 4184 + }, + { + "epoch": 0.8372, + "learning_rate": 1.887493888748825e-05, + "loss": 0.4814, + "step": 4186 + }, + { + "epoch": 0.8376, + "learning_rate": 1.886849598469357e-05, + "loss": 0.4146, + "step": 4188 + }, + { + "epoch": 0.838, + "learning_rate": 1.886203579231215e-05, + "loss": 0.2581, + "step": 4190 + }, + { + "epoch": 0.8384, + "learning_rate": 1.8855558322938492e-05, + "loss": 0.2841, + "step": 4192 + }, + { + "epoch": 0.8388, + "learning_rate": 1.8849063589200754e-05, + "loss": 0.6467, + "step": 4194 + }, + { + "epoch": 0.8392, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.8178, + "step": 4196 + }, + { + "epoch": 0.8396, + "learning_rate": 1.8836022379313884e-05, + "loss": 0.6337, + "step": 4198 + }, + { + "epoch": 0.84, + "learning_rate": 1.8829475928589265e-05, + "loss": 0.2102, + "step": 4200 + }, + { + "epoch": 0.8404, + "learning_rate": 1.882291226434954e-05, + "loss": 0.6056, + "step": 4202 + }, + { + "epoch": 0.8408, + "learning_rate": 1.8816331399390874e-05, + "loss": 0.2383, + "step": 4204 + }, + { + "epoch": 0.8412, + "learning_rate": 1.880973334654301e-05, + "loss": 0.1391, + "step": 4206 + }, + { + "epoch": 0.8416, + "learning_rate": 1.88031181186692e-05, + "loss": 0.3828, + "step": 4208 + }, + { + "epoch": 0.842, + "learning_rate": 1.8796485728666172e-05, + "loss": 0.1717, + "step": 4210 + }, + { + "epoch": 0.8424, + "learning_rate": 1.8789836189464092e-05, + "loss": 0.171, + "step": 4212 + }, + { + "epoch": 0.8428, + "learning_rate": 1.8783169514026574e-05, + "loss": 0.1578, + "step": 4214 + }, + { + "epoch": 0.8432, + "learning_rate": 1.877648571535068e-05, + "loss": 0.3115, + "step": 4216 + }, + { + "epoch": 0.8436, + "learning_rate": 1.8769784806466775e-05, + "loss": 0.3348, + "step": 4218 + }, + { + "epoch": 0.844, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.2157, + "step": 4220 + }, + { + "epoch": 0.8444, + "learning_rate": 1.8756331710363375e-05, + "loss": 0.2821, + "step": 4222 + }, + { + "epoch": 0.8448, + "learning_rate": 1.8749579549371387e-05, + "loss": 0.2338, + "step": 4224 + }, + { + "epoch": 0.8452, + "learning_rate": 1.8742810330626338e-05, + "loss": 0.6062, + "step": 4226 + }, + { + "epoch": 0.8456, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.4804, + "step": 4228 + }, + { + "epoch": 0.846, + "learning_rate": 1.8729220772698106e-05, + "loss": 0.4012, + "step": 4230 + }, + { + "epoch": 0.8464, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.4059, + "step": 4232 + }, + { + "epoch": 0.8468, + "learning_rate": 1.8715563142552758e-05, + "loss": 0.186, + "step": 4234 + }, + { + "epoch": 0.8472, + "learning_rate": 1.8708708833660748e-05, + "loss": 0.5428, + "step": 4236 + }, + { + "epoch": 0.8476, + "learning_rate": 1.870183754669526e-05, + "loss": 0.1886, + "step": 4238 + }, + { + "epoch": 0.848, + "learning_rate": 1.8694949295052198e-05, + "loss": 0.4143, + "step": 4240 + }, + { + "epoch": 0.8484, + "learning_rate": 1.8688044092160558e-05, + "loss": 0.3927, + "step": 4242 + }, + { + "epoch": 0.8488, + "learning_rate": 1.868112195148239e-05, + "loss": 0.3764, + "step": 4244 + }, + { + "epoch": 0.8492, + "learning_rate": 1.867418288651278e-05, + "loss": 0.6172, + "step": 4246 + }, + { + "epoch": 0.8496, + "learning_rate": 1.866722691077977e-05, + "loss": 0.383, + "step": 4248 + }, + { + "epoch": 0.85, + "learning_rate": 1.8660254037844384e-05, + "loss": 0.1933, + "step": 4250 + }, + { + "epoch": 0.8504, + "learning_rate": 1.8653264281300626e-05, + "loss": 0.2578, + "step": 4252 + }, + { + "epoch": 0.8508, + "learning_rate": 1.8646257654775357e-05, + "loss": 1.1302, + "step": 4254 + }, + { + "epoch": 0.8512, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.5884, + "step": 4256 + }, + { + "epoch": 0.8516, + "learning_rate": 1.8632193846452267e-05, + "loss": 0.3488, + "step": 4258 + }, + { + "epoch": 0.852, + "learning_rate": 1.8625136692072587e-05, + "loss": 0.3662, + "step": 4260 + }, + { + "epoch": 0.8524, + "learning_rate": 1.861806272254755e-05, + "loss": 0.7697, + "step": 4262 + }, + { + "epoch": 0.8528, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.4307, + "step": 4264 + }, + { + "epoch": 0.8532, + "learning_rate": 1.8603864393258547e-05, + "loss": 0.7126, + "step": 4266 + }, + { + "epoch": 0.8536, + "learning_rate": 1.8596740061174912e-05, + "loss": 0.2076, + "step": 4268 + }, + { + "epoch": 0.854, + "learning_rate": 1.8589598969306646e-05, + "loss": 0.3352, + "step": 4270 + }, + { + "epoch": 0.8544, + "learning_rate": 1.858244113157566e-05, + "loss": 0.1921, + "step": 4272 + }, + { + "epoch": 0.8548, + "learning_rate": 1.8575266561936533e-05, + "loss": 0.4544, + "step": 4274 + }, + { + "epoch": 0.8552, + "learning_rate": 1.8568075274376432e-05, + "loss": 0.2682, + "step": 4276 + }, + { + "epoch": 0.8556, + "learning_rate": 1.8560867282915164e-05, + "loss": 0.6296, + "step": 4278 + }, + { + "epoch": 0.856, + "learning_rate": 1.8553642601605083e-05, + "loss": 0.2553, + "step": 4280 + }, + { + "epoch": 0.8564, + "learning_rate": 1.8546401244531034e-05, + "loss": 0.1875, + "step": 4282 + }, + { + "epoch": 0.8568, + "learning_rate": 1.8539143225810457e-05, + "loss": 0.3017, + "step": 4284 + }, + { + "epoch": 0.8572, + "learning_rate": 1.85318685595932e-05, + "loss": 0.2168, + "step": 4286 + }, + { + "epoch": 0.8576, + "learning_rate": 1.852457726006163e-05, + "loss": 0.4579, + "step": 4288 + }, + { + "epoch": 0.858, + "learning_rate": 1.8517269341430485e-05, + "loss": 0.5025, + "step": 4290 + }, + { + "epoch": 0.8584, + "learning_rate": 1.8509944817946917e-05, + "loss": 0.2379, + "step": 4292 + }, + { + "epoch": 0.8588, + "learning_rate": 1.8502603703890484e-05, + "loss": 0.2666, + "step": 4294 + }, + { + "epoch": 0.8592, + "learning_rate": 1.8495246013573064e-05, + "loss": 0.301, + "step": 4296 + }, + { + "epoch": 0.8596, + "learning_rate": 1.8487871761338817e-05, + "loss": 0.5496, + "step": 4298 + }, + { + "epoch": 0.86, + "learning_rate": 1.848048096156426e-05, + "loss": 0.493, + "step": 4300 + }, + { + "epoch": 0.8604, + "learning_rate": 1.847307362865813e-05, + "loss": 1.074, + "step": 4302 + }, + { + "epoch": 0.8608, + "learning_rate": 1.8465649777061387e-05, + "loss": 0.2843, + "step": 4304 + }, + { + "epoch": 0.8612, + "learning_rate": 1.8458209421247208e-05, + "loss": 0.4592, + "step": 4306 + }, + { + "epoch": 0.8616, + "learning_rate": 1.8450752575720967e-05, + "loss": 0.1719, + "step": 4308 + }, + { + "epoch": 0.862, + "learning_rate": 1.8443279255020163e-05, + "loss": 0.2841, + "step": 4310 + }, + { + "epoch": 0.8624, + "learning_rate": 1.843578947371439e-05, + "loss": 0.263, + "step": 4312 + }, + { + "epoch": 0.8628, + "learning_rate": 1.842828324640539e-05, + "loss": 0.367, + "step": 4314 + }, + { + "epoch": 0.8632, + "learning_rate": 1.8420760587726935e-05, + "loss": 0.2086, + "step": 4316 + }, + { + "epoch": 0.8636, + "learning_rate": 1.8413221512344808e-05, + "loss": 0.5268, + "step": 4318 + }, + { + "epoch": 0.864, + "learning_rate": 1.8405666034956846e-05, + "loss": 0.7135, + "step": 4320 + }, + { + "epoch": 0.8644, + "learning_rate": 1.8398094170292826e-05, + "loss": 0.2892, + "step": 4322 + }, + { + "epoch": 0.8648, + "learning_rate": 1.8390505933114507e-05, + "loss": 0.9768, + "step": 4324 + }, + { + "epoch": 0.8652, + "learning_rate": 1.838290133821552e-05, + "loss": 0.2315, + "step": 4326 + }, + { + "epoch": 0.8656, + "learning_rate": 1.8375280400421414e-05, + "loss": 0.4819, + "step": 4328 + }, + { + "epoch": 0.866, + "learning_rate": 1.8367643134589613e-05, + "loss": 2.4172, + "step": 4330 + }, + { + "epoch": 0.8664, + "learning_rate": 1.8359989555609365e-05, + "loss": 0.4939, + "step": 4332 + }, + { + "epoch": 0.8668, + "learning_rate": 1.835231967840168e-05, + "loss": 0.4809, + "step": 4334 + }, + { + "epoch": 0.8672, + "learning_rate": 1.834463351791939e-05, + "loss": 0.279, + "step": 4336 + }, + { + "epoch": 0.8676, + "learning_rate": 1.8336931089147082e-05, + "loss": 0.6394, + "step": 4338 + }, + { + "epoch": 0.868, + "learning_rate": 1.8329212407101006e-05, + "loss": 0.5079, + "step": 4340 + }, + { + "epoch": 0.8684, + "learning_rate": 1.8321477486829128e-05, + "loss": 0.1634, + "step": 4342 + }, + { + "epoch": 0.8688, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.3815, + "step": 4344 + }, + { + "epoch": 0.8692, + "learning_rate": 1.8305958991958135e-05, + "loss": 0.6596, + "step": 4346 + }, + { + "epoch": 0.8696, + "learning_rate": 1.82981754476131e-05, + "loss": 0.3578, + "step": 4348 + }, + { + "epoch": 0.87, + "learning_rate": 1.8290375725550417e-05, + "loss": 0.3849, + "step": 4350 + }, + { + "epoch": 0.8704, + "learning_rate": 1.8282559840976053e-05, + "loss": 0.6584, + "step": 4352 + }, + { + "epoch": 0.8708, + "learning_rate": 1.827472780912744e-05, + "loss": 0.1314, + "step": 4354 + }, + { + "epoch": 0.8712, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.5345, + "step": 4356 + }, + { + "epoch": 0.8716, + "learning_rate": 1.825901536471478e-05, + "loss": 0.2651, + "step": 4358 + }, + { + "epoch": 0.872, + "learning_rate": 1.8251134982782966e-05, + "loss": 0.3699, + "step": 4360 + }, + { + "epoch": 0.8724, + "learning_rate": 1.824323851484126e-05, + "loss": 0.4277, + "step": 4362 + }, + { + "epoch": 0.8728, + "learning_rate": 1.823532597628428e-05, + "loss": 0.1984, + "step": 4364 + }, + { + "epoch": 0.8732, + "learning_rate": 1.8227397382537893e-05, + "loss": 0.2397, + "step": 4366 + }, + { + "epoch": 0.8736, + "learning_rate": 1.8219452749059336e-05, + "loss": 0.3895, + "step": 4368 + }, + { + "epoch": 0.874, + "learning_rate": 1.8211492091337048e-05, + "loss": 0.2706, + "step": 4370 + }, + { + "epoch": 0.8744, + "learning_rate": 1.8203515424890734e-05, + "loss": 0.3483, + "step": 4372 + }, + { + "epoch": 0.8748, + "learning_rate": 1.8195522765271346e-05, + "loss": 0.3288, + "step": 4374 + }, + { + "epoch": 0.8752, + "learning_rate": 1.8187514128060956e-05, + "loss": 0.4265, + "step": 4376 + }, + { + "epoch": 0.8756, + "learning_rate": 1.8179489528872804e-05, + "loss": 0.3418, + "step": 4378 + }, + { + "epoch": 0.876, + "learning_rate": 1.8171448983351284e-05, + "loss": 0.1947, + "step": 4380 + }, + { + "epoch": 0.8764, + "learning_rate": 1.816339250717185e-05, + "loss": 0.486, + "step": 4382 + }, + { + "epoch": 0.8768, + "learning_rate": 1.8155320116040983e-05, + "loss": 0.2271, + "step": 4384 + }, + { + "epoch": 0.8772, + "learning_rate": 1.814723182569625e-05, + "loss": 0.2799, + "step": 4386 + }, + { + "epoch": 0.8776, + "learning_rate": 1.8139127651906193e-05, + "loss": 0.3795, + "step": 4388 + }, + { + "epoch": 0.878, + "learning_rate": 1.813100761047029e-05, + "loss": 0.3313, + "step": 4390 + }, + { + "epoch": 0.8784, + "learning_rate": 1.8122871717218974e-05, + "loss": 0.4207, + "step": 4392 + }, + { + "epoch": 0.8788, + "learning_rate": 1.8114719988013612e-05, + "loss": 0.2667, + "step": 4394 + }, + { + "epoch": 0.8792, + "learning_rate": 1.8106552438746413e-05, + "loss": 0.6344, + "step": 4396 + }, + { + "epoch": 0.8796, + "learning_rate": 1.8098369085340404e-05, + "loss": 0.1939, + "step": 4398 + }, + { + "epoch": 0.88, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.4813, + "step": 4400 + }, + { + "epoch": 0.8804, + "learning_rate": 1.8081955029958265e-05, + "loss": 0.1579, + "step": 4402 + }, + { + "epoch": 0.8808, + "learning_rate": 1.807372435998219e-05, + "loss": 0.2881, + "step": 4404 + }, + { + "epoch": 0.8812, + "learning_rate": 1.806547794986733e-05, + "loss": 0.1966, + "step": 4406 + }, + { + "epoch": 0.8816, + "learning_rate": 1.8057215815690487e-05, + "loss": 1.179, + "step": 4408 + }, + { + "epoch": 0.882, + "learning_rate": 1.8048937973559148e-05, + "loss": 0.3418, + "step": 4410 + }, + { + "epoch": 0.8824, + "learning_rate": 1.8040644439611355e-05, + "loss": 0.201, + "step": 4412 + }, + { + "epoch": 0.8828, + "learning_rate": 1.8032335230015777e-05, + "loss": 0.2972, + "step": 4414 + }, + { + "epoch": 0.8832, + "learning_rate": 1.8024010360971665e-05, + "loss": 0.2529, + "step": 4416 + }, + { + "epoch": 0.8836, + "learning_rate": 1.8015669848708774e-05, + "loss": 0.1608, + "step": 4418 + }, + { + "epoch": 0.884, + "learning_rate": 1.8007313709487345e-05, + "loss": 0.1991, + "step": 4420 + }, + { + "epoch": 0.8844, + "learning_rate": 1.7998941959598093e-05, + "loss": 0.2215, + "step": 4422 + }, + { + "epoch": 0.8848, + "learning_rate": 1.7990554615362207e-05, + "loss": 0.3424, + "step": 4424 + }, + { + "epoch": 0.8852, + "learning_rate": 1.7982151693131213e-05, + "loss": 0.3691, + "step": 4426 + }, + { + "epoch": 0.8856, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.5981, + "step": 4428 + }, + { + "epoch": 0.886, + "learning_rate": 1.7965299180241963e-05, + "loss": 0.2012, + "step": 4430 + }, + { + "epoch": 0.8864, + "learning_rate": 1.7956849622438568e-05, + "loss": 0.3592, + "step": 4432 + }, + { + "epoch": 0.8868, + "learning_rate": 1.794838455234966e-05, + "loss": 0.622, + "step": 4434 + }, + { + "epoch": 0.8872, + "learning_rate": 1.7939903986478357e-05, + "loss": 0.2996, + "step": 4436 + }, + { + "epoch": 0.8876, + "learning_rate": 1.7931407941357938e-05, + "loss": 1.1143, + "step": 4438 + }, + { + "epoch": 0.888, + "learning_rate": 1.7922896433551913e-05, + "loss": 0.3867, + "step": 4440 + }, + { + "epoch": 0.8884, + "learning_rate": 1.7914369479653864e-05, + "loss": 0.7692, + "step": 4442 + }, + { + "epoch": 0.8888, + "learning_rate": 1.7905827096287525e-05, + "loss": 0.2722, + "step": 4444 + }, + { + "epoch": 0.8892, + "learning_rate": 1.7897269300106752e-05, + "loss": 0.3725, + "step": 4446 + }, + { + "epoch": 0.8896, + "learning_rate": 1.7888696107795347e-05, + "loss": 0.2583, + "step": 4448 + }, + { + "epoch": 0.89, + "learning_rate": 1.7880107536067228e-05, + "loss": 0.1842, + "step": 4450 + }, + { + "epoch": 0.8904, + "learning_rate": 1.787150360166623e-05, + "loss": 0.5126, + "step": 4452 + }, + { + "epoch": 0.8908, + "learning_rate": 1.78628843213662e-05, + "loss": 0.3411, + "step": 4454 + }, + { + "epoch": 0.8912, + "learning_rate": 1.7854249711970826e-05, + "loss": 0.2847, + "step": 4456 + }, + { + "epoch": 0.8916, + "learning_rate": 1.7845599790313732e-05, + "loss": 0.2502, + "step": 4458 + }, + { + "epoch": 0.892, + "learning_rate": 1.783693457325841e-05, + "loss": 0.6698, + "step": 4460 + }, + { + "epoch": 0.8924, + "learning_rate": 1.782825407769811e-05, + "loss": 0.4827, + "step": 4462 + }, + { + "epoch": 0.8928, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.3018, + "step": 4464 + }, + { + "epoch": 0.8932, + "learning_rate": 1.7810847318784632e-05, + "loss": 0.7386, + "step": 4466 + }, + { + "epoch": 0.8936, + "learning_rate": 1.780212108936685e-05, + "loss": 0.4142, + "step": 4468 + }, + { + "epoch": 0.894, + "learning_rate": 1.7793379649314743e-05, + "loss": 0.2513, + "step": 4470 + }, + { + "epoch": 0.8944, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.2127, + "step": 4472 + }, + { + "epoch": 0.8948, + "learning_rate": 1.777585120550481e-05, + "loss": 0.6447, + "step": 4474 + }, + { + "epoch": 0.8952, + "learning_rate": 1.7767064235919594e-05, + "loss": 0.1491, + "step": 4476 + }, + { + "epoch": 0.8956, + "learning_rate": 1.77582621240452e-05, + "loss": 0.3971, + "step": 4478 + }, + { + "epoch": 0.896, + "learning_rate": 1.77494448870418e-05, + "loss": 0.2888, + "step": 4480 + }, + { + "epoch": 0.8964, + "learning_rate": 1.774061254209907e-05, + "loss": 0.9432, + "step": 4482 + }, + { + "epoch": 0.8968, + "learning_rate": 1.773176510643608e-05, + "loss": 0.2322, + "step": 4484 + }, + { + "epoch": 0.8972, + "learning_rate": 1.7722902597301388e-05, + "loss": 0.1741, + "step": 4486 + }, + { + "epoch": 0.8976, + "learning_rate": 1.7714025031972894e-05, + "loss": 0.4056, + "step": 4488 + }, + { + "epoch": 0.898, + "learning_rate": 1.77051324277579e-05, + "loss": 0.265, + "step": 4490 + }, + { + "epoch": 0.8984, + "learning_rate": 1.769622480199295e-05, + "loss": 0.535, + "step": 4492 + }, + { + "epoch": 0.8988, + "learning_rate": 1.7687302172043926e-05, + "loss": 0.6054, + "step": 4494 + }, + { + "epoch": 0.8992, + "learning_rate": 1.7678364555305982e-05, + "loss": 0.6542, + "step": 4496 + }, + { + "epoch": 0.8996, + "learning_rate": 1.7669411969203424e-05, + "loss": 0.4859, + "step": 4498 + }, + { + "epoch": 0.9, + "learning_rate": 1.7660444431189777e-05, + "loss": 0.648, + "step": 4500 + }, + { + "epoch": 0.9004, + "learning_rate": 1.765146195874774e-05, + "loss": 0.4497, + "step": 4502 + }, + { + "epoch": 0.9008, + "learning_rate": 1.76424645693891e-05, + "loss": 0.2055, + "step": 4504 + }, + { + "epoch": 0.9012, + "learning_rate": 1.7633452280654696e-05, + "loss": 0.204, + "step": 4506 + }, + { + "epoch": 0.9016, + "learning_rate": 1.762442511011448e-05, + "loss": 0.1499, + "step": 4508 + }, + { + "epoch": 0.902, + "learning_rate": 1.761538307536738e-05, + "loss": 0.5326, + "step": 4510 + }, + { + "epoch": 0.9024, + "learning_rate": 1.7606326194041285e-05, + "loss": 0.3766, + "step": 4512 + }, + { + "epoch": 0.9028, + "learning_rate": 1.759725448379305e-05, + "loss": 0.3735, + "step": 4514 + }, + { + "epoch": 0.9032, + "learning_rate": 1.7588167962308458e-05, + "loss": 0.5167, + "step": 4516 + }, + { + "epoch": 0.9036, + "learning_rate": 1.7579066647302147e-05, + "loss": 0.1449, + "step": 4518 + }, + { + "epoch": 0.904, + "learning_rate": 1.756995055651757e-05, + "loss": 0.2699, + "step": 4520 + }, + { + "epoch": 0.9044, + "learning_rate": 1.7560819707727037e-05, + "loss": 0.6012, + "step": 4522 + }, + { + "epoch": 0.9048, + "learning_rate": 1.7551674118731585e-05, + "loss": 0.1999, + "step": 4524 + }, + { + "epoch": 0.9052, + "learning_rate": 1.7542513807361044e-05, + "loss": 0.4144, + "step": 4526 + }, + { + "epoch": 0.9056, + "learning_rate": 1.7533338791473875e-05, + "loss": 0.1586, + "step": 4528 + }, + { + "epoch": 0.906, + "learning_rate": 1.7524149088957238e-05, + "loss": 0.2591, + "step": 4530 + }, + { + "epoch": 0.9064, + "learning_rate": 1.751494471772697e-05, + "loss": 0.1599, + "step": 4532 + }, + { + "epoch": 0.9068, + "learning_rate": 1.750572569572742e-05, + "loss": 0.2821, + "step": 4534 + }, + { + "epoch": 0.9072, + "learning_rate": 1.7496492040931548e-05, + "loss": 0.3283, + "step": 4536 + }, + { + "epoch": 0.9076, + "learning_rate": 1.7487243771340862e-05, + "loss": 0.6565, + "step": 4538 + }, + { + "epoch": 0.908, + "learning_rate": 1.747798090498533e-05, + "loss": 0.2341, + "step": 4540 + }, + { + "epoch": 0.9084, + "learning_rate": 1.7468703459923365e-05, + "loss": 0.46, + "step": 4542 + }, + { + "epoch": 0.9088, + "learning_rate": 1.745941145424182e-05, + "loss": 0.1777, + "step": 4544 + }, + { + "epoch": 0.9092, + "learning_rate": 1.7450104906055973e-05, + "loss": 0.2298, + "step": 4546 + }, + { + "epoch": 0.9096, + "learning_rate": 1.744078383350938e-05, + "loss": 0.5817, + "step": 4548 + }, + { + "epoch": 0.91, + "learning_rate": 1.7431448254773943e-05, + "loss": 0.1931, + "step": 4550 + }, + { + "epoch": 0.9104, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.4753, + "step": 4552 + }, + { + "epoch": 0.9108, + "learning_rate": 1.7412733651565624e-05, + "loss": 0.1035, + "step": 4554 + }, + { + "epoch": 0.9112, + "learning_rate": 1.7403354663577782e-05, + "loss": 0.2955, + "step": 4556 + }, + { + "epoch": 0.9116, + "learning_rate": 1.739396124237121e-05, + "loss": 0.303, + "step": 4558 + }, + { + "epoch": 0.912, + "learning_rate": 1.738455340625883e-05, + "loss": 0.2593, + "step": 4560 + }, + { + "epoch": 0.9124, + "learning_rate": 1.7375131173581744e-05, + "loss": 0.2255, + "step": 4562 + }, + { + "epoch": 0.9128, + "learning_rate": 1.7365694562709038e-05, + "loss": 0.9077, + "step": 4564 + }, + { + "epoch": 0.9132, + "learning_rate": 1.7356243592037865e-05, + "loss": 0.163, + "step": 4566 + }, + { + "epoch": 0.9136, + "learning_rate": 1.7346778279993433e-05, + "loss": 0.6253, + "step": 4568 + }, + { + "epoch": 0.914, + "learning_rate": 1.733729864502877e-05, + "loss": 0.2147, + "step": 4570 + }, + { + "epoch": 0.9144, + "learning_rate": 1.7327804705624962e-05, + "loss": 0.3021, + "step": 4572 + }, + { + "epoch": 0.9148, + "learning_rate": 1.731829648029091e-05, + "loss": 0.2321, + "step": 4574 + }, + { + "epoch": 0.9152, + "learning_rate": 1.730877398756341e-05, + "loss": 0.4107, + "step": 4576 + }, + { + "epoch": 0.9156, + "learning_rate": 1.7299237246007025e-05, + "loss": 0.2684, + "step": 4578 + }, + { + "epoch": 0.916, + "learning_rate": 1.7289686274214113e-05, + "loss": 0.1622, + "step": 4580 + }, + { + "epoch": 0.9164, + "learning_rate": 1.7280121090804824e-05, + "loss": 0.2293, + "step": 4582 + }, + { + "epoch": 0.9168, + "learning_rate": 1.727054171442693e-05, + "loss": 0.1063, + "step": 4584 + }, + { + "epoch": 0.9172, + "learning_rate": 1.7260948163755918e-05, + "loss": 0.8674, + "step": 4586 + }, + { + "epoch": 0.9176, + "learning_rate": 1.7251340457494934e-05, + "loss": 0.561, + "step": 4588 + }, + { + "epoch": 0.918, + "learning_rate": 1.7241718614374688e-05, + "loss": 0.2385, + "step": 4590 + }, + { + "epoch": 0.9184, + "learning_rate": 1.7232082653153422e-05, + "loss": 0.247, + "step": 4592 + }, + { + "epoch": 0.9188, + "learning_rate": 1.722243259261697e-05, + "loss": 0.2991, + "step": 4594 + }, + { + "epoch": 0.9192, + "learning_rate": 1.7212768451578595e-05, + "loss": 0.2215, + "step": 4596 + }, + { + "epoch": 0.9196, + "learning_rate": 1.7203090248879084e-05, + "loss": 0.9766, + "step": 4598 + }, + { + "epoch": 0.92, + "learning_rate": 1.7193398003386517e-05, + "loss": 1.1035, + "step": 4600 + }, + { + "epoch": 0.9204, + "learning_rate": 1.7183691733996463e-05, + "loss": 0.663, + "step": 4602 + }, + { + "epoch": 0.9208, + "learning_rate": 1.7173971459631803e-05, + "loss": 0.4635, + "step": 4604 + }, + { + "epoch": 0.9212, + "learning_rate": 1.7164237199242663e-05, + "loss": 0.4949, + "step": 4606 + }, + { + "epoch": 0.9216, + "learning_rate": 1.7154488971806525e-05, + "loss": 0.4645, + "step": 4608 + }, + { + "epoch": 0.922, + "learning_rate": 1.7144726796328027e-05, + "loss": 0.1758, + "step": 4610 + }, + { + "epoch": 0.9224, + "learning_rate": 1.713495069183907e-05, + "loss": 0.6391, + "step": 4612 + }, + { + "epoch": 0.9228, + "learning_rate": 1.7125160677398632e-05, + "loss": 0.2492, + "step": 4614 + }, + { + "epoch": 0.9232, + "learning_rate": 1.7115356772092847e-05, + "loss": 0.4253, + "step": 4616 + }, + { + "epoch": 0.9236, + "learning_rate": 1.710553899503497e-05, + "loss": 0.4568, + "step": 4618 + }, + { + "epoch": 0.924, + "learning_rate": 1.709570736536522e-05, + "loss": 0.3771, + "step": 4620 + }, + { + "epoch": 0.9244, + "learning_rate": 1.708586190225086e-05, + "loss": 0.164, + "step": 4622 + }, + { + "epoch": 0.9248, + "learning_rate": 1.7076002624886152e-05, + "loss": 0.396, + "step": 4624 + }, + { + "epoch": 0.9252, + "learning_rate": 1.7066129552492258e-05, + "loss": 0.2026, + "step": 4626 + }, + { + "epoch": 0.9256, + "learning_rate": 1.705624270431722e-05, + "loss": 0.1792, + "step": 4628 + }, + { + "epoch": 0.926, + "learning_rate": 1.7046342099635945e-05, + "loss": 0.6862, + "step": 4630 + }, + { + "epoch": 0.9264, + "learning_rate": 1.70364277577502e-05, + "loss": 0.2399, + "step": 4632 + }, + { + "epoch": 0.9268, + "learning_rate": 1.702649969798851e-05, + "loss": 0.4187, + "step": 4634 + }, + { + "epoch": 0.9272, + "learning_rate": 1.7016557939706078e-05, + "loss": 0.2873, + "step": 4636 + }, + { + "epoch": 0.9276, + "learning_rate": 1.700660250228492e-05, + "loss": 0.4824, + "step": 4638 + }, + { + "epoch": 0.928, + "learning_rate": 1.6996633405133673e-05, + "loss": 0.1764, + "step": 4640 + }, + { + "epoch": 0.9284, + "learning_rate": 1.6986650667687556e-05, + "loss": 0.4637, + "step": 4642 + }, + { + "epoch": 0.9288, + "learning_rate": 1.6976654309408468e-05, + "loss": 0.2203, + "step": 4644 + }, + { + "epoch": 0.9292, + "learning_rate": 1.69666443497848e-05, + "loss": 0.437, + "step": 4646 + }, + { + "epoch": 0.9296, + "learning_rate": 1.6956620808331515e-05, + "loss": 0.3477, + "step": 4648 + }, + { + "epoch": 0.93, + "learning_rate": 1.694658370458998e-05, + "loss": 0.2178, + "step": 4650 + }, + { + "epoch": 0.9304, + "learning_rate": 1.6936533058128042e-05, + "loss": 0.2164, + "step": 4652 + }, + { + "epoch": 0.9308, + "learning_rate": 1.692646888854001e-05, + "loss": 0.6418, + "step": 4654 + }, + { + "epoch": 0.9312, + "learning_rate": 1.691639121544641e-05, + "loss": 0.8349, + "step": 4656 + }, + { + "epoch": 0.9316, + "learning_rate": 1.690630005849424e-05, + "loss": 0.6952, + "step": 4658 + }, + { + "epoch": 0.932, + "learning_rate": 1.6896195437356696e-05, + "loss": 0.2393, + "step": 4660 + }, + { + "epoch": 0.9324, + "learning_rate": 1.6886077371733295e-05, + "loss": 0.4838, + "step": 4662 + }, + { + "epoch": 0.9328, + "learning_rate": 1.6875945881349686e-05, + "loss": 0.166, + "step": 4664 + }, + { + "epoch": 0.9332, + "learning_rate": 1.6865800985957725e-05, + "loss": 0.2499, + "step": 4666 + }, + { + "epoch": 0.9336, + "learning_rate": 1.6855642705335435e-05, + "loss": 0.7193, + "step": 4668 + }, + { + "epoch": 0.934, + "learning_rate": 1.68454710592869e-05, + "loss": 0.3137, + "step": 4670 + }, + { + "epoch": 0.9344, + "learning_rate": 1.6835286067642228e-05, + "loss": 0.8274, + "step": 4672 + }, + { + "epoch": 0.9348, + "learning_rate": 1.6825087750257617e-05, + "loss": 0.2555, + "step": 4674 + }, + { + "epoch": 0.9352, + "learning_rate": 1.681487612701521e-05, + "loss": 0.867, + "step": 4676 + }, + { + "epoch": 0.9356, + "learning_rate": 1.6804651217823055e-05, + "loss": 0.4446, + "step": 4678 + }, + { + "epoch": 0.936, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.1552, + "step": 4680 + }, + { + "epoch": 0.9364, + "learning_rate": 1.6784161621351374e-05, + "loss": 0.333, + "step": 4682 + }, + { + "epoch": 0.9368, + "learning_rate": 1.677389697401739e-05, + "loss": 0.2411, + "step": 4684 + }, + { + "epoch": 0.9372, + "learning_rate": 1.67636191206246e-05, + "loss": 0.3131, + "step": 4686 + }, + { + "epoch": 0.9376, + "learning_rate": 1.675332808121025e-05, + "loss": 0.6303, + "step": 4688 + }, + { + "epoch": 0.938, + "learning_rate": 1.6743023875837253e-05, + "loss": 0.1713, + "step": 4690 + }, + { + "epoch": 0.9384, + "learning_rate": 1.6732706524594145e-05, + "loss": 0.3464, + "step": 4692 + }, + { + "epoch": 0.9388, + "learning_rate": 1.672237604759517e-05, + "loss": 0.2466, + "step": 4694 + }, + { + "epoch": 0.9392, + "learning_rate": 1.671203246498009e-05, + "loss": 0.2354, + "step": 4696 + }, + { + "epoch": 0.9396, + "learning_rate": 1.670167579691429e-05, + "loss": 0.2268, + "step": 4698 + }, + { + "epoch": 0.94, + "learning_rate": 1.6691306063588593e-05, + "loss": 0.781, + "step": 4700 + }, + { + "epoch": 0.9404, + "learning_rate": 1.668092328521931e-05, + "loss": 0.4361, + "step": 4702 + }, + { + "epoch": 0.9408, + "learning_rate": 1.6670527482048242e-05, + "loss": 0.4691, + "step": 4704 + }, + { + "epoch": 0.9412, + "learning_rate": 1.6660118674342525e-05, + "loss": 0.1552, + "step": 4706 + }, + { + "epoch": 0.9416, + "learning_rate": 1.6649696882394635e-05, + "loss": 0.3179, + "step": 4708 + }, + { + "epoch": 0.942, + "learning_rate": 1.6639262126522414e-05, + "loss": 0.2057, + "step": 4710 + }, + { + "epoch": 0.9424, + "learning_rate": 1.6628814427068968e-05, + "loss": 0.2305, + "step": 4712 + }, + { + "epoch": 0.9428, + "learning_rate": 1.661835380440258e-05, + "loss": 0.3541, + "step": 4714 + }, + { + "epoch": 0.9432, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.1726, + "step": 4716 + }, + { + "epoch": 0.9436, + "learning_rate": 1.6597393871030264e-05, + "loss": 0.2562, + "step": 4718 + }, + { + "epoch": 0.944, + "learning_rate": 1.6586894601186824e-05, + "loss": 0.4462, + "step": 4720 + }, + { + "epoch": 0.9444, + "learning_rate": 1.6576382489855278e-05, + "loss": 0.2801, + "step": 4722 + }, + { + "epoch": 0.9448, + "learning_rate": 1.656585755752957e-05, + "loss": 0.0874, + "step": 4724 + }, + { + "epoch": 0.9452, + "learning_rate": 1.655531982472859e-05, + "loss": 0.6066, + "step": 4726 + }, + { + "epoch": 0.9456, + "learning_rate": 1.6544769311996153e-05, + "loss": 0.1971, + "step": 4728 + }, + { + "epoch": 0.946, + "learning_rate": 1.653420603990106e-05, + "loss": 0.4693, + "step": 4730 + }, + { + "epoch": 0.9464, + "learning_rate": 1.6523630029036924e-05, + "loss": 0.3922, + "step": 4732 + }, + { + "epoch": 0.9468, + "learning_rate": 1.651304130002226e-05, + "loss": 0.5497, + "step": 4734 + }, + { + "epoch": 0.9472, + "learning_rate": 1.6502439873500294e-05, + "loss": 0.7392, + "step": 4736 + }, + { + "epoch": 0.9476, + "learning_rate": 1.6491825770139058e-05, + "loss": 0.7815, + "step": 4738 + }, + { + "epoch": 0.948, + "learning_rate": 1.6481199010631305e-05, + "loss": 0.3759, + "step": 4740 + }, + { + "epoch": 0.9484, + "learning_rate": 1.6470559615694455e-05, + "loss": 0.1912, + "step": 4742 + }, + { + "epoch": 0.9488, + "learning_rate": 1.645990760607052e-05, + "loss": 0.2983, + "step": 4744 + }, + { + "epoch": 0.9492, + "learning_rate": 1.644924300252614e-05, + "loss": 0.3935, + "step": 4746 + }, + { + "epoch": 0.9496, + "learning_rate": 1.643856582585255e-05, + "loss": 0.2654, + "step": 4748 + }, + { + "epoch": 0.95, + "learning_rate": 1.6427876096865407e-05, + "loss": 0.6673, + "step": 4750 + }, + { + "epoch": 0.9504, + "learning_rate": 1.641717383640488e-05, + "loss": 0.4188, + "step": 4752 + }, + { + "epoch": 0.9508, + "learning_rate": 1.6406459065335616e-05, + "loss": 0.4247, + "step": 4754 + }, + { + "epoch": 0.9512, + "learning_rate": 1.6395731804546596e-05, + "loss": 0.2073, + "step": 4756 + }, + { + "epoch": 0.9516, + "learning_rate": 1.6384992074951128e-05, + "loss": 0.2881, + "step": 4758 + }, + { + "epoch": 0.952, + "learning_rate": 1.63742398974869e-05, + "loss": 0.4745, + "step": 4760 + }, + { + "epoch": 0.9524, + "learning_rate": 1.6363475293115838e-05, + "loss": 0.2035, + "step": 4762 + }, + { + "epoch": 0.9528, + "learning_rate": 1.6352698282824045e-05, + "loss": 0.2754, + "step": 4764 + }, + { + "epoch": 0.9532, + "learning_rate": 1.63419088876219e-05, + "loss": 0.3887, + "step": 4766 + }, + { + "epoch": 0.9536, + "learning_rate": 1.633110712854385e-05, + "loss": 0.5377, + "step": 4768 + }, + { + "epoch": 0.954, + "learning_rate": 1.6320293026648515e-05, + "loss": 0.385, + "step": 4770 + }, + { + "epoch": 0.9544, + "learning_rate": 1.6309466603018504e-05, + "loss": 0.665, + "step": 4772 + }, + { + "epoch": 0.9548, + "learning_rate": 1.6298627878760495e-05, + "loss": 0.3396, + "step": 4774 + }, + { + "epoch": 0.9552, + "learning_rate": 1.6287776875005148e-05, + "loss": 0.3024, + "step": 4776 + }, + { + "epoch": 0.9556, + "learning_rate": 1.6276913612907015e-05, + "loss": 0.9523, + "step": 4778 + }, + { + "epoch": 0.956, + "learning_rate": 1.6266038113644612e-05, + "loss": 0.4543, + "step": 4780 + }, + { + "epoch": 0.9564, + "learning_rate": 1.6255150398420266e-05, + "loss": 0.2607, + "step": 4782 + }, + { + "epoch": 0.9568, + "learning_rate": 1.624425048846017e-05, + "loss": 0.3273, + "step": 4784 + }, + { + "epoch": 0.9572, + "learning_rate": 1.623333840501421e-05, + "loss": 0.1997, + "step": 4786 + }, + { + "epoch": 0.9576, + "learning_rate": 1.6222414169356063e-05, + "loss": 0.9789, + "step": 4788 + }, + { + "epoch": 0.958, + "learning_rate": 1.6211477802783102e-05, + "loss": 0.1896, + "step": 4790 + }, + { + "epoch": 0.9584, + "learning_rate": 1.6200529326616343e-05, + "loss": 0.1752, + "step": 4792 + }, + { + "epoch": 0.9588, + "learning_rate": 1.618956876220035e-05, + "loss": 0.6748, + "step": 4794 + }, + { + "epoch": 0.9592, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.2534, + "step": 4796 + }, + { + "epoch": 0.9596, + "learning_rate": 1.616761145411704e-05, + "loss": 0.2456, + "step": 4798 + }, + { + "epoch": 0.96, + "learning_rate": 1.6156614753256587e-05, + "loss": 0.5542, + "step": 4800 + }, + { + "epoch": 0.9604, + "learning_rate": 1.6145606049760648e-05, + "loss": 0.168, + "step": 4802 + }, + { + "epoch": 0.9608, + "learning_rate": 1.613458536509123e-05, + "loss": 0.557, + "step": 4804 + }, + { + "epoch": 0.9612, + "learning_rate": 1.612355272073378e-05, + "loss": 1.2112, + "step": 4806 + }, + { + "epoch": 0.9616, + "learning_rate": 1.6112508138196922e-05, + "loss": 0.3005, + "step": 4808 + }, + { + "epoch": 0.962, + "learning_rate": 1.610145163901268e-05, + "loss": 0.5581, + "step": 4810 + }, + { + "epoch": 0.9624, + "learning_rate": 1.6090383244736277e-05, + "loss": 0.5705, + "step": 4812 + }, + { + "epoch": 0.9628, + "learning_rate": 1.6079302976946062e-05, + "loss": 0.3162, + "step": 4814 + }, + { + "epoch": 0.9632, + "learning_rate": 1.606821085724363e-05, + "loss": 0.4402, + "step": 4816 + }, + { + "epoch": 0.9636, + "learning_rate": 1.6057106907253607e-05, + "loss": 0.5406, + "step": 4818 + }, + { + "epoch": 0.964, + "learning_rate": 1.6045991148623756e-05, + "loss": 0.3452, + "step": 4820 + }, + { + "epoch": 0.9644, + "learning_rate": 1.6034863603024775e-05, + "loss": 0.254, + "step": 4822 + }, + { + "epoch": 0.9648, + "learning_rate": 1.602372429215038e-05, + "loss": 0.2359, + "step": 4824 + }, + { + "epoch": 0.9652, + "learning_rate": 1.6012573237717265e-05, + "loss": 0.2111, + "step": 4826 + }, + { + "epoch": 0.9656, + "learning_rate": 1.600141046146497e-05, + "loss": 0.51, + "step": 4828 + }, + { + "epoch": 0.966, + "learning_rate": 1.5990235985155856e-05, + "loss": 0.6615, + "step": 4830 + }, + { + "epoch": 0.9664, + "learning_rate": 1.597904983057519e-05, + "loss": 0.374, + "step": 4832 + }, + { + "epoch": 0.9668, + "learning_rate": 1.5967852019530942e-05, + "loss": 0.2478, + "step": 4834 + }, + { + "epoch": 0.9672, + "learning_rate": 1.5956642573853794e-05, + "loss": 0.3665, + "step": 4836 + }, + { + "epoch": 0.9676, + "learning_rate": 1.5945421515397135e-05, + "loss": 0.9694, + "step": 4838 + }, + { + "epoch": 0.968, + "learning_rate": 1.5934188866037014e-05, + "loss": 0.1263, + "step": 4840 + }, + { + "epoch": 0.9684, + "learning_rate": 1.5922944647672068e-05, + "loss": 0.3722, + "step": 4842 + }, + { + "epoch": 0.9688, + "learning_rate": 1.591168888222342e-05, + "loss": 0.3362, + "step": 4844 + }, + { + "epoch": 0.9692, + "learning_rate": 1.5900421591634816e-05, + "loss": 0.187, + "step": 4846 + }, + { + "epoch": 0.9696, + "learning_rate": 1.5889142797872407e-05, + "loss": 0.1769, + "step": 4848 + }, + { + "epoch": 0.97, + "learning_rate": 1.5877852522924736e-05, + "loss": 0.2286, + "step": 4850 + }, + { + "epoch": 0.9704, + "learning_rate": 1.5866550788802818e-05, + "loss": 0.5829, + "step": 4852 + }, + { + "epoch": 0.9708, + "learning_rate": 1.5855237617539932e-05, + "loss": 0.461, + "step": 4854 + }, + { + "epoch": 0.9712, + "learning_rate": 1.584391303119173e-05, + "loss": 0.675, + "step": 4856 + }, + { + "epoch": 0.9716, + "learning_rate": 1.5832577051836023e-05, + "loss": 0.5681, + "step": 4858 + }, + { + "epoch": 0.972, + "learning_rate": 1.582122970157289e-05, + "loss": 0.4279, + "step": 4860 + }, + { + "epoch": 0.9724, + "learning_rate": 1.5809871002524592e-05, + "loss": 0.6236, + "step": 4862 + }, + { + "epoch": 0.9728, + "learning_rate": 1.5798500976835503e-05, + "loss": 0.1893, + "step": 4864 + }, + { + "epoch": 0.9732, + "learning_rate": 1.5787119646672032e-05, + "loss": 0.2659, + "step": 4866 + }, + { + "epoch": 0.9736, + "learning_rate": 1.577572703422267e-05, + "loss": 0.2365, + "step": 4868 + }, + { + "epoch": 0.974, + "learning_rate": 1.5764323161697946e-05, + "loss": 0.1899, + "step": 4870 + }, + { + "epoch": 0.9744, + "learning_rate": 1.575290805133024e-05, + "loss": 0.2395, + "step": 4872 + }, + { + "epoch": 0.9748, + "learning_rate": 1.5741481725373896e-05, + "loss": 0.4953, + "step": 4874 + }, + { + "epoch": 0.9752, + "learning_rate": 1.5730044206105156e-05, + "loss": 0.4436, + "step": 4876 + }, + { + "epoch": 0.9756, + "learning_rate": 1.571859551582204e-05, + "loss": 0.4702, + "step": 4878 + }, + { + "epoch": 0.976, + "learning_rate": 1.570713567684432e-05, + "loss": 0.2448, + "step": 4880 + }, + { + "epoch": 0.9764, + "learning_rate": 1.5695664711513575e-05, + "loss": 0.197, + "step": 4882 + }, + { + "epoch": 0.9768, + "learning_rate": 1.5684182642193047e-05, + "loss": 0.3381, + "step": 4884 + }, + { + "epoch": 0.9772, + "learning_rate": 1.567268949126757e-05, + "loss": 0.3129, + "step": 4886 + }, + { + "epoch": 0.9776, + "learning_rate": 1.566118528114367e-05, + "loss": 0.2857, + "step": 4888 + }, + { + "epoch": 0.978, + "learning_rate": 1.5649670034249372e-05, + "loss": 0.3989, + "step": 4890 + }, + { + "epoch": 0.9784, + "learning_rate": 1.563814377303429e-05, + "loss": 0.2535, + "step": 4892 + }, + { + "epoch": 0.9788, + "learning_rate": 1.5626606519969373e-05, + "loss": 0.5057, + "step": 4894 + }, + { + "epoch": 0.9792, + "learning_rate": 1.561505829754715e-05, + "loss": 0.3461, + "step": 4896 + }, + { + "epoch": 0.9796, + "learning_rate": 1.5603499128281437e-05, + "loss": 0.3915, + "step": 4898 + }, + { + "epoch": 0.98, + "learning_rate": 1.5591929034707475e-05, + "loss": 1.065, + "step": 4900 + }, + { + "epoch": 0.9804, + "learning_rate": 1.558034803938171e-05, + "loss": 0.1922, + "step": 4902 + }, + { + "epoch": 0.9808, + "learning_rate": 1.5568756164881874e-05, + "loss": 0.2575, + "step": 4904 + }, + { + "epoch": 0.9812, + "learning_rate": 1.5557153433806974e-05, + "loss": 0.4548, + "step": 4906 + }, + { + "epoch": 0.9816, + "learning_rate": 1.5545539868777085e-05, + "loss": 0.3118, + "step": 4908 + }, + { + "epoch": 0.982, + "learning_rate": 1.5533915492433437e-05, + "loss": 0.4803, + "step": 4910 + }, + { + "epoch": 0.9824, + "learning_rate": 1.5522280327438384e-05, + "loss": 0.6689, + "step": 4912 + }, + { + "epoch": 0.9828, + "learning_rate": 1.5510634396475275e-05, + "loss": 0.1997, + "step": 4914 + }, + { + "epoch": 0.9832, + "learning_rate": 1.5498977722248398e-05, + "loss": 0.7271, + "step": 4916 + }, + { + "epoch": 0.9836, + "learning_rate": 1.5487310327483084e-05, + "loss": 0.306, + "step": 4918 + }, + { + "epoch": 0.984, + "learning_rate": 1.547563223492552e-05, + "loss": 0.5183, + "step": 4920 + }, + { + "epoch": 0.9844, + "learning_rate": 1.5463943467342708e-05, + "loss": 0.3036, + "step": 4922 + }, + { + "epoch": 0.9848, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.3329, + "step": 4924 + }, + { + "epoch": 0.9852, + "learning_rate": 1.5440533998273552e-05, + "loss": 0.3657, + "step": 4926 + }, + { + "epoch": 0.9856, + "learning_rate": 1.5428813342425194e-05, + "loss": 0.2421, + "step": 4928 + }, + { + "epoch": 0.986, + "learning_rate": 1.5417082102827407e-05, + "loss": 0.5742, + "step": 4930 + }, + { + "epoch": 0.9864, + "learning_rate": 1.5405340302350876e-05, + "loss": 0.5177, + "step": 4932 + }, + { + "epoch": 0.9868, + "learning_rate": 1.5393587963886827e-05, + "loss": 0.1877, + "step": 4934 + }, + { + "epoch": 0.9872, + "learning_rate": 1.538182511034708e-05, + "loss": 0.2602, + "step": 4936 + }, + { + "epoch": 0.9876, + "learning_rate": 1.5370051764663875e-05, + "loss": 0.4534, + "step": 4938 + }, + { + "epoch": 0.988, + "learning_rate": 1.535826794978996e-05, + "loss": 0.3768, + "step": 4940 + }, + { + "epoch": 0.9884, + "learning_rate": 1.534647368869852e-05, + "loss": 0.3063, + "step": 4942 + }, + { + "epoch": 0.9888, + "learning_rate": 1.5334669004383036e-05, + "loss": 0.0741, + "step": 4944 + }, + { + "epoch": 0.9892, + "learning_rate": 1.5322853919857337e-05, + "loss": 0.3292, + "step": 4946 + }, + { + "epoch": 0.9896, + "learning_rate": 1.5311028458155564e-05, + "loss": 0.2408, + "step": 4948 + }, + { + "epoch": 0.99, + "learning_rate": 1.5299192642332063e-05, + "loss": 0.2445, + "step": 4950 + }, + { + "epoch": 0.9904, + "learning_rate": 1.528734649546133e-05, + "loss": 0.3176, + "step": 4952 + }, + { + "epoch": 0.9908, + "learning_rate": 1.5275490040638038e-05, + "loss": 0.2961, + "step": 4954 + }, + { + "epoch": 0.9912, + "learning_rate": 1.5263623300976997e-05, + "loss": 0.2968, + "step": 4956 + }, + { + "epoch": 0.9916, + "learning_rate": 1.5251746299612973e-05, + "loss": 0.2894, + "step": 4958 + }, + { + "epoch": 0.992, + "learning_rate": 1.5239859059700792e-05, + "loss": 0.5629, + "step": 4960 + }, + { + "epoch": 0.9924, + "learning_rate": 1.522796160441527e-05, + "loss": 0.1152, + "step": 4962 + }, + { + "epoch": 0.9928, + "learning_rate": 1.5216053956951096e-05, + "loss": 0.4876, + "step": 4964 + }, + { + "epoch": 0.9932, + "learning_rate": 1.5204136140522799e-05, + "loss": 0.3215, + "step": 4966 + }, + { + "epoch": 0.9936, + "learning_rate": 1.5192208178364819e-05, + "loss": 0.6442, + "step": 4968 + }, + { + "epoch": 0.994, + "learning_rate": 1.5180270093731291e-05, + "loss": 0.2577, + "step": 4970 + }, + { + "epoch": 0.9944, + "learning_rate": 1.5168321909896176e-05, + "loss": 0.2175, + "step": 4972 + }, + { + "epoch": 0.9948, + "learning_rate": 1.5156363650153017e-05, + "loss": 0.3426, + "step": 4974 + }, + { + "epoch": 0.9952, + "learning_rate": 1.5144395337815057e-05, + "loss": 0.2833, + "step": 4976 + }, + { + "epoch": 0.9956, + "learning_rate": 1.5132416996215178e-05, + "loss": 0.3747, + "step": 4978 + }, + { + "epoch": 0.996, + "learning_rate": 1.5120428648705722e-05, + "loss": 0.3992, + "step": 4980 + }, + { + "epoch": 0.9964, + "learning_rate": 1.5108430318658607e-05, + "loss": 0.0992, + "step": 4982 + }, + { + "epoch": 0.9968, + "learning_rate": 1.5096422029465171e-05, + "loss": 0.2954, + "step": 4984 + }, + { + "epoch": 0.9972, + "learning_rate": 1.5084403804536236e-05, + "loss": 0.4991, + "step": 4986 + }, + { + "epoch": 0.9976, + "learning_rate": 1.5072375667301904e-05, + "loss": 0.6695, + "step": 4988 + }, + { + "epoch": 0.998, + "learning_rate": 1.5060337641211636e-05, + "loss": 0.2008, + "step": 4990 + }, + { + "epoch": 0.9984, + "learning_rate": 1.5048289749734231e-05, + "loss": 0.3324, + "step": 4992 + }, + { + "epoch": 0.9988, + "learning_rate": 1.5036232016357622e-05, + "loss": 0.2545, + "step": 4994 + }, + { + "epoch": 0.9992, + "learning_rate": 1.502416446458898e-05, + "loss": 0.1334, + "step": 4996 + }, + { + "epoch": 0.9996, + "learning_rate": 1.5012087117954641e-05, + "loss": 0.1989, + "step": 4998 + }, + { + "epoch": 1.0, + "learning_rate": 1.5000000000000014e-05, + "loss": 0.3766, + "step": 5000 + }, + { + "epoch": 1.0, + "step": 5000, + "total_flos": 2.001217390064435e+16, + "train_loss": 0.412522501501441, + "train_runtime": 19931.4096, + "train_samples_per_second": 4.014, + "train_steps_per_second": 0.251 + } + ], + "logging_steps": 2, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 2.001217390064435e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6761a1a19b2f21af127917e3c78a0ec7c2381c20 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b2eeedd0d6520fd2a0c0c85d30a54e2351235802b6866978dcced8db6bc812 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f5dab68d6d34afde4658b65b9d5dd990b1283c6 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2778a0e66078c292206cbb8b19dff167b4cd5d1028821ea65cb96f9026004fe7 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..01ae9549e22c23564d536080bf7eefe8fcd75d16 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e38760d002324d890547a7148e2c92a438d844ab20ab5aaaeec233959ddeb95 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth new file mode 100644 index 0000000000000000000000000000000000000000..55c3352476e06052b81a5296d880615086fa2175 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_infoBatch_scenario18_new_10000_random0_0625_seed1/server_model_round3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9a6b23a7c83b9fcca86779ee894d6c01068f7b79c1298315eb04d2bce014509 +size 639793378 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f833e7e7715444f5b220eac439d3f4be0b91f07f --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/0_trainer_state.json @@ -0,0 +1,15032 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004, + "learning_rate": 1.939214154742919e-05, + "loss": 0.0105, + "step": 2 + }, + { + "epoch": 0.0008, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.0192, + "step": 4 + }, + { + "epoch": 0.0012, + "learning_rate": 1.9382517304551397e-05, + "loss": 0.0097, + "step": 6 + }, + { + "epoch": 0.0016, + "learning_rate": 1.937767774086646e-05, + "loss": 0.2823, + "step": 8 + }, + { + "epoch": 0.002, + "learning_rate": 1.937281989491892e-05, + "loss": 0.4585, + "step": 10 + }, + { + "epoch": 0.0024, + "learning_rate": 1.936794377617938e-05, + "loss": 0.038, + "step": 12 + }, + { + "epoch": 0.0028, + "learning_rate": 1.9363049394154095e-05, + "loss": 0.029, + "step": 14 + }, + { + "epoch": 0.0032, + "learning_rate": 1.935813675838491e-05, + "loss": 0.0736, + "step": 16 + }, + { + "epoch": 0.0036, + "learning_rate": 1.9353205878449257e-05, + "loss": 0.0032, + "step": 18 + }, + { + "epoch": 0.004, + "learning_rate": 1.934825676396015e-05, + "loss": 0.0063, + "step": 20 + }, + { + "epoch": 0.0044, + "learning_rate": 1.9343289424566122e-05, + "loss": 0.0124, + "step": 22 + }, + { + "epoch": 0.0048, + "learning_rate": 1.9338303869951273e-05, + "loss": 0.0706, + "step": 24 + }, + { + "epoch": 0.0052, + "learning_rate": 1.9333300109835182e-05, + "loss": 0.918, + "step": 26 + }, + { + "epoch": 0.0056, + "learning_rate": 1.9328278153972947e-05, + "loss": 0.0286, + "step": 28 + }, + { + "epoch": 0.006, + "learning_rate": 1.932323801215512e-05, + "loss": 0.2863, + "step": 30 + }, + { + "epoch": 0.0064, + "learning_rate": 1.931817969420773e-05, + "loss": 0.3894, + "step": 32 + }, + { + "epoch": 0.0068, + "learning_rate": 1.931310320999221e-05, + "loss": 0.0064, + "step": 34 + }, + { + "epoch": 0.0072, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.0155, + "step": 36 + }, + { + "epoch": 0.0076, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.0014, + "step": 38 + }, + { + "epoch": 0.008, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.0218, + "step": 40 + }, + { + "epoch": 0.0084, + "learning_rate": 1.9292615808917027e-05, + "loss": 0.0996, + "step": 42 + }, + { + "epoch": 0.0088, + "learning_rate": 1.9287448642521513e-05, + "loss": 0.0664, + "step": 44 + }, + { + "epoch": 0.0092, + "learning_rate": 1.9282263369769633e-05, + "loss": 0.0138, + "step": 46 + }, + { + "epoch": 0.0096, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.0532, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 1.9271838545667876e-05, + "loss": 0.0129, + "step": 50 + }, + { + "epoch": 0.0104, + "learning_rate": 1.9266599014641727e-05, + "loss": 0.0458, + "step": 52 + }, + { + "epoch": 0.0108, + "learning_rate": 1.9261341417906622e-05, + "loss": 0.0342, + "step": 54 + }, + { + "epoch": 0.0112, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.2085, + "step": 56 + }, + { + "epoch": 0.0116, + "learning_rate": 1.925077206834458e-05, + "loss": 0.1011, + "step": 58 + }, + { + "epoch": 0.012, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.1017, + "step": 60 + }, + { + "epoch": 0.0124, + "learning_rate": 1.924013057940367e-05, + "loss": 0.0037, + "step": 62 + }, + { + "epoch": 0.0128, + "learning_rate": 1.923478280857683e-05, + "loss": 0.1095, + "step": 64 + }, + { + "epoch": 0.0132, + "learning_rate": 1.9229417034068352e-05, + "loss": 0.0015, + "step": 66 + }, + { + "epoch": 0.0136, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.0014, + "step": 68 + }, + { + "epoch": 0.014, + "learning_rate": 1.921863151588501e-05, + "loss": 0.2079, + "step": 70 + }, + { + "epoch": 0.0144, + "learning_rate": 1.9213211793237056e-05, + "loss": 0.0162, + "step": 72 + }, + { + "epoch": 0.0148, + "learning_rate": 1.9207774108961273e-05, + "loss": 0.0724, + "step": 74 + }, + { + "epoch": 0.0152, + "learning_rate": 1.9202318473658703e-05, + "loss": 0.0903, + "step": 76 + }, + { + "epoch": 0.0156, + "learning_rate": 1.9196844897965393e-05, + "loss": 0.004, + "step": 78 + }, + { + "epoch": 0.016, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.041, + "step": 80 + }, + { + "epoch": 0.0164, + "learning_rate": 1.9185843968125546e-05, + "loss": 2.2191, + "step": 82 + }, + { + "epoch": 0.0168, + "learning_rate": 1.9180316635425883e-05, + "loss": 0.021, + "step": 84 + }, + { + "epoch": 0.0172, + "learning_rate": 1.9174771405229187e-05, + "loss": 1.1025, + "step": 86 + }, + { + "epoch": 0.0176, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.0088, + "step": 88 + }, + { + "epoch": 0.018, + "learning_rate": 1.9163627295622397e-05, + "loss": 0.4058, + "step": 90 + }, + { + "epoch": 0.0184, + "learning_rate": 1.915802843793832e-05, + "loss": 0.8453, + "step": 92 + }, + { + "epoch": 0.0188, + "learning_rate": 1.9152411726209176e-05, + "loss": 0.0936, + "step": 94 + }, + { + "epoch": 0.0192, + "learning_rate": 1.9146777171385053e-05, + "loss": 0.1813, + "step": 96 + }, + { + "epoch": 0.0196, + "learning_rate": 1.9141124784450786e-05, + "loss": 0.0284, + "step": 98 + }, + { + "epoch": 0.02, + "learning_rate": 1.9135454576426013e-05, + "loss": 0.0028, + "step": 100 + }, + { + "epoch": 0.0204, + "learning_rate": 1.9129766558365076e-05, + "loss": 0.712, + "step": 102 + }, + { + "epoch": 0.0208, + "learning_rate": 1.9124060741357062e-05, + "loss": 0.3253, + "step": 104 + }, + { + "epoch": 0.0212, + "learning_rate": 1.9118337136525764e-05, + "loss": 0.0407, + "step": 106 + }, + { + "epoch": 0.0216, + "learning_rate": 1.9112595755029625e-05, + "loss": 0.4018, + "step": 108 + }, + { + "epoch": 0.022, + "learning_rate": 1.9106836608061775e-05, + "loss": 0.0862, + "step": 110 + }, + { + "epoch": 0.0224, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.696, + "step": 112 + }, + { + "epoch": 0.0228, + "learning_rate": 1.9095265062656546e-05, + "loss": 0.4302, + "step": 114 + }, + { + "epoch": 0.0232, + "learning_rate": 1.9089452686778487e-05, + "loss": 0.1808, + "step": 116 + }, + { + "epoch": 0.0236, + "learning_rate": 1.9083622590547313e-05, + "loss": 0.5403, + "step": 118 + }, + { + "epoch": 0.024, + "learning_rate": 1.9077774785329085e-05, + "loss": 0.0417, + "step": 120 + }, + { + "epoch": 0.0244, + "learning_rate": 1.907190928252441e-05, + "loss": 0.4641, + "step": 122 + }, + { + "epoch": 0.0248, + "learning_rate": 1.906602609356838e-05, + "loss": 0.3158, + "step": 124 + }, + { + "epoch": 0.0252, + "learning_rate": 1.906012522993057e-05, + "loss": 0.1203, + "step": 126 + }, + { + "epoch": 0.0256, + "learning_rate": 1.9054206703115024e-05, + "loss": 0.0341, + "step": 128 + }, + { + "epoch": 0.026, + "learning_rate": 1.9048270524660197e-05, + "loss": 0.0819, + "step": 130 + }, + { + "epoch": 0.0264, + "learning_rate": 1.904231670613899e-05, + "loss": 0.0192, + "step": 132 + }, + { + "epoch": 0.0268, + "learning_rate": 1.9036345259158664e-05, + "loss": 0.0457, + "step": 134 + }, + { + "epoch": 0.0272, + "learning_rate": 1.9030356195360875e-05, + "loss": 0.7786, + "step": 136 + }, + { + "epoch": 0.0276, + "learning_rate": 1.90243495264216e-05, + "loss": 0.018, + "step": 138 + }, + { + "epoch": 0.028, + "learning_rate": 1.901832526405114e-05, + "loss": 0.0158, + "step": 140 + }, + { + "epoch": 0.0284, + "learning_rate": 1.901228341999412e-05, + "loss": 0.0375, + "step": 142 + }, + { + "epoch": 0.0288, + "learning_rate": 1.9006224006029404e-05, + "loss": 0.0755, + "step": 144 + }, + { + "epoch": 0.0292, + "learning_rate": 1.900014703397015e-05, + "loss": 0.5203, + "step": 146 + }, + { + "epoch": 0.0296, + "learning_rate": 1.899405251566371e-05, + "loss": 0.2616, + "step": 148 + }, + { + "epoch": 0.03, + "learning_rate": 1.8987940462991673e-05, + "loss": 0.0115, + "step": 150 + }, + { + "epoch": 0.0304, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.1419, + "step": 152 + }, + { + "epoch": 0.0308, + "learning_rate": 1.8975663802247978e-05, + "loss": 0.1156, + "step": 154 + }, + { + "epoch": 0.0312, + "learning_rate": 1.8969499218110305e-05, + "loss": 0.0151, + "step": 156 + }, + { + "epoch": 0.0316, + "learning_rate": 1.8963317147474933e-05, + "loss": 0.0936, + "step": 158 + }, + { + "epoch": 0.032, + "learning_rate": 1.8957117602394133e-05, + "loss": 0.0135, + "step": 160 + }, + { + "epoch": 0.0324, + "learning_rate": 1.8950900594954226e-05, + "loss": 0.0207, + "step": 162 + }, + { + "epoch": 0.0328, + "learning_rate": 1.8944666137275606e-05, + "loss": 0.0113, + "step": 164 + }, + { + "epoch": 0.0332, + "learning_rate": 1.893841424151264e-05, + "loss": 0.0075, + "step": 166 + }, + { + "epoch": 0.0336, + "learning_rate": 1.893214491985374e-05, + "loss": 0.034, + "step": 168 + }, + { + "epoch": 0.034, + "learning_rate": 1.892585818452126e-05, + "loss": 0.1702, + "step": 170 + }, + { + "epoch": 0.0344, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.1335, + "step": 172 + }, + { + "epoch": 0.0348, + "learning_rate": 1.8913232521894737e-05, + "loss": 0.1512, + "step": 174 + }, + { + "epoch": 0.0352, + "learning_rate": 1.890689361921507e-05, + "loss": 0.003, + "step": 176 + }, + { + "epoch": 0.0356, + "learning_rate": 1.890053735209053e-05, + "loss": 0.0259, + "step": 178 + }, + { + "epoch": 0.036, + "learning_rate": 1.8894163732912972e-05, + "loss": 0.0796, + "step": 180 + }, + { + "epoch": 0.0364, + "learning_rate": 1.888777277410812e-05, + "loss": 0.0715, + "step": 182 + }, + { + "epoch": 0.0368, + "learning_rate": 1.8881364488135445e-05, + "loss": 0.001, + "step": 184 + }, + { + "epoch": 0.0372, + "learning_rate": 1.8874938887488252e-05, + "loss": 0.4859, + "step": 186 + }, + { + "epoch": 0.0376, + "learning_rate": 1.886849598469356e-05, + "loss": 0.0004, + "step": 188 + }, + { + "epoch": 0.038, + "learning_rate": 1.8862035792312148e-05, + "loss": 0.2631, + "step": 190 + }, + { + "epoch": 0.0384, + "learning_rate": 1.8855558322938495e-05, + "loss": 0.6661, + "step": 192 + }, + { + "epoch": 0.0388, + "learning_rate": 1.8849063589200747e-05, + "loss": 0.0116, + "step": 194 + }, + { + "epoch": 0.0392, + "learning_rate": 1.8842551603760732e-05, + "loss": 0.0212, + "step": 196 + }, + { + "epoch": 0.0396, + "learning_rate": 1.8836022379313884e-05, + "loss": 0.0071, + "step": 198 + }, + { + "epoch": 0.04, + "learning_rate": 1.8829475928589272e-05, + "loss": 0.5283, + "step": 200 + }, + { + "epoch": 0.0404, + "learning_rate": 1.8822912264349532e-05, + "loss": 0.0884, + "step": 202 + }, + { + "epoch": 0.0408, + "learning_rate": 1.8816331399390874e-05, + "loss": 0.0006, + "step": 204 + }, + { + "epoch": 0.0412, + "learning_rate": 1.8809733346543013e-05, + "loss": 0.0003, + "step": 206 + }, + { + "epoch": 0.0416, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.0014, + "step": 208 + }, + { + "epoch": 0.042, + "learning_rate": 1.879648572866617e-05, + "loss": 0.5919, + "step": 210 + }, + { + "epoch": 0.0424, + "learning_rate": 1.878983618946409e-05, + "loss": 0.0035, + "step": 212 + }, + { + "epoch": 0.0428, + "learning_rate": 1.8783169514026584e-05, + "loss": 0.0034, + "step": 214 + }, + { + "epoch": 0.0432, + "learning_rate": 1.8776485715350672e-05, + "loss": 0.0057, + "step": 216 + }, + { + "epoch": 0.0436, + "learning_rate": 1.876978480646677e-05, + "loss": 0.0199, + "step": 218 + }, + { + "epoch": 0.044, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.0049, + "step": 220 + }, + { + "epoch": 0.0444, + "learning_rate": 1.8756331710363375e-05, + "loss": 0.0878, + "step": 222 + }, + { + "epoch": 0.0448, + "learning_rate": 1.8749579549371377e-05, + "loss": 0.091, + "step": 224 + }, + { + "epoch": 0.0452, + "learning_rate": 1.874281033062634e-05, + "loss": 0.0008, + "step": 226 + }, + { + "epoch": 0.0456, + "learning_rate": 1.8736024067325195e-05, + "loss": 0.0191, + "step": 228 + }, + { + "epoch": 0.046, + "learning_rate": 1.8729220772698096e-05, + "loss": 0.0138, + "step": 230 + }, + { + "epoch": 0.0464, + "learning_rate": 1.8722400460008444e-05, + "loss": 0.291, + "step": 232 + }, + { + "epoch": 0.0468, + "learning_rate": 1.8715563142552758e-05, + "loss": 0.0112, + "step": 234 + }, + { + "epoch": 0.0472, + "learning_rate": 1.8708708833660755e-05, + "loss": 0.0389, + "step": 236 + }, + { + "epoch": 0.0476, + "learning_rate": 1.8701837546695256e-05, + "loss": 0.0032, + "step": 238 + }, + { + "epoch": 0.048, + "learning_rate": 1.8694949295052194e-05, + "loss": 0.0025, + "step": 240 + }, + { + "epoch": 0.0484, + "learning_rate": 1.8688044092160554e-05, + "loss": 0.0257, + "step": 242 + }, + { + "epoch": 0.0488, + "learning_rate": 1.8681121951482397e-05, + "loss": 0.0601, + "step": 244 + }, + { + "epoch": 0.0492, + "learning_rate": 1.867418288651278e-05, + "loss": 0.0011, + "step": 246 + }, + { + "epoch": 0.0496, + "learning_rate": 1.8667226910779763e-05, + "loss": 1.1157, + "step": 248 + }, + { + "epoch": 0.05, + "learning_rate": 1.866025403784439e-05, + "loss": 0.005, + "step": 250 + }, + { + "epoch": 0.0504, + "learning_rate": 1.8653264281300622e-05, + "loss": 0.2359, + "step": 252 + }, + { + "epoch": 0.0508, + "learning_rate": 1.864625765477535e-05, + "loss": 0.0195, + "step": 254 + }, + { + "epoch": 0.0512, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.0147, + "step": 256 + }, + { + "epoch": 0.0516, + "learning_rate": 1.8632193846452274e-05, + "loss": 0.1987, + "step": 258 + }, + { + "epoch": 0.052, + "learning_rate": 1.8625136692072573e-05, + "loss": 0.0044, + "step": 260 + }, + { + "epoch": 0.0524, + "learning_rate": 1.8618062722547554e-05, + "loss": 0.8275, + "step": 262 + }, + { + "epoch": 0.0528, + "learning_rate": 1.8610971951668268e-05, + "loss": 0.4029, + "step": 264 + }, + { + "epoch": 0.0532, + "learning_rate": 1.8603864393258534e-05, + "loss": 0.063, + "step": 266 + }, + { + "epoch": 0.0536, + "learning_rate": 1.8596740061174915e-05, + "loss": 0.0373, + "step": 268 + }, + { + "epoch": 0.054, + "learning_rate": 1.8589598969306646e-05, + "loss": 0.0323, + "step": 270 + }, + { + "epoch": 0.0544, + "learning_rate": 1.8582441131575664e-05, + "loss": 0.1636, + "step": 272 + }, + { + "epoch": 0.0548, + "learning_rate": 1.8575266561936526e-05, + "loss": 0.0944, + "step": 274 + }, + { + "epoch": 0.0552, + "learning_rate": 1.8568075274376432e-05, + "loss": 0.023, + "step": 276 + }, + { + "epoch": 0.0556, + "learning_rate": 1.856086728291516e-05, + "loss": 0.4673, + "step": 278 + }, + { + "epoch": 0.056, + "learning_rate": 1.855364260160507e-05, + "loss": 0.0031, + "step": 280 + }, + { + "epoch": 0.0564, + "learning_rate": 1.8546401244531034e-05, + "loss": 0.2522, + "step": 282 + }, + { + "epoch": 0.0568, + "learning_rate": 1.853914322581045e-05, + "loss": 0.0182, + "step": 284 + }, + { + "epoch": 0.0572, + "learning_rate": 1.8531868559593208e-05, + "loss": 1.0468, + "step": 286 + }, + { + "epoch": 0.0576, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.0313, + "step": 288 + }, + { + "epoch": 0.058, + "learning_rate": 1.8517269341430482e-05, + "loss": 0.3143, + "step": 290 + }, + { + "epoch": 0.0584, + "learning_rate": 1.8509944817946917e-05, + "loss": 0.3706, + "step": 292 + }, + { + "epoch": 0.0588, + "learning_rate": 1.8502603703890488e-05, + "loss": 0.0147, + "step": 294 + }, + { + "epoch": 0.0592, + "learning_rate": 1.849524601357305e-05, + "loss": 0.155, + "step": 296 + }, + { + "epoch": 0.0596, + "learning_rate": 1.848787176133882e-05, + "loss": 0.0422, + "step": 298 + }, + { + "epoch": 0.06, + "learning_rate": 1.848048096156427e-05, + "loss": 0.025, + "step": 300 + }, + { + "epoch": 0.0604, + "learning_rate": 1.8473073628658123e-05, + "loss": 0.0382, + "step": 302 + }, + { + "epoch": 0.0608, + "learning_rate": 1.8465649777061384e-05, + "loss": 0.0911, + "step": 304 + }, + { + "epoch": 0.0612, + "learning_rate": 1.8458209421247208e-05, + "loss": 0.7937, + "step": 306 + }, + { + "epoch": 0.0616, + "learning_rate": 1.845075257572097e-05, + "loss": 0.0951, + "step": 308 + }, + { + "epoch": 0.062, + "learning_rate": 1.8443279255020153e-05, + "loss": 0.0533, + "step": 310 + }, + { + "epoch": 0.0624, + "learning_rate": 1.843578947371439e-05, + "loss": 0.313, + "step": 312 + }, + { + "epoch": 0.0628, + "learning_rate": 1.8428283246405386e-05, + "loss": 0.3859, + "step": 314 + }, + { + "epoch": 0.0632, + "learning_rate": 1.8420760587726928e-05, + "loss": 0.0416, + "step": 316 + }, + { + "epoch": 0.0636, + "learning_rate": 1.841322151234481e-05, + "loss": 0.0248, + "step": 318 + }, + { + "epoch": 0.064, + "learning_rate": 1.8405666034956846e-05, + "loss": 0.4857, + "step": 320 + }, + { + "epoch": 0.0644, + "learning_rate": 1.8398094170292833e-05, + "loss": 0.1366, + "step": 322 + }, + { + "epoch": 0.0648, + "learning_rate": 1.83905059331145e-05, + "loss": 0.257, + "step": 324 + }, + { + "epoch": 0.0652, + "learning_rate": 1.838290133821552e-05, + "loss": 0.0899, + "step": 326 + }, + { + "epoch": 0.0656, + "learning_rate": 1.8375280400421418e-05, + "loss": 0.0744, + "step": 328 + }, + { + "epoch": 0.066, + "learning_rate": 1.836764313458962e-05, + "loss": 0.0335, + "step": 330 + }, + { + "epoch": 0.0664, + "learning_rate": 1.8359989555609355e-05, + "loss": 0.0021, + "step": 332 + }, + { + "epoch": 0.0668, + "learning_rate": 1.8352319678401677e-05, + "loss": 0.0141, + "step": 334 + }, + { + "epoch": 0.0672, + "learning_rate": 1.8344633517919398e-05, + "loss": 0.0305, + "step": 336 + }, + { + "epoch": 0.0676, + "learning_rate": 1.8336931089147076e-05, + "loss": 0.0082, + "step": 338 + }, + { + "epoch": 0.068, + "learning_rate": 1.8329212407101e-05, + "loss": 0.0646, + "step": 340 + }, + { + "epoch": 0.0684, + "learning_rate": 1.832147748682912e-05, + "loss": 0.0571, + "step": 342 + }, + { + "epoch": 0.0688, + "learning_rate": 1.831372634341109e-05, + "loss": 0.0065, + "step": 344 + }, + { + "epoch": 0.0692, + "learning_rate": 1.8305958991958125e-05, + "loss": 0.094, + "step": 346 + }, + { + "epoch": 0.0696, + "learning_rate": 1.8298175447613103e-05, + "loss": 0.6047, + "step": 348 + }, + { + "epoch": 0.07, + "learning_rate": 1.8290375725550417e-05, + "loss": 0.0032, + "step": 350 + }, + { + "epoch": 0.0704, + "learning_rate": 1.8282559840976043e-05, + "loss": 0.2335, + "step": 352 + }, + { + "epoch": 0.0708, + "learning_rate": 1.8274727809127447e-05, + "loss": 0.0018, + "step": 354 + }, + { + "epoch": 0.0712, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.0051, + "step": 356 + }, + { + "epoch": 0.0716, + "learning_rate": 1.8259015364714793e-05, + "loss": 0.0159, + "step": 358 + }, + { + "epoch": 0.072, + "learning_rate": 1.8251134982782955e-05, + "loss": 0.0011, + "step": 360 + }, + { + "epoch": 0.0724, + "learning_rate": 1.824323851484126e-05, + "loss": 0.8996, + "step": 362 + }, + { + "epoch": 0.0728, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.0109, + "step": 364 + }, + { + "epoch": 0.0732, + "learning_rate": 1.8227397382537903e-05, + "loss": 0.1093, + "step": 366 + }, + { + "epoch": 0.0736, + "learning_rate": 1.8219452749059332e-05, + "loss": 1.0616, + "step": 368 + }, + { + "epoch": 0.074, + "learning_rate": 1.8211492091337038e-05, + "loss": 0.1312, + "step": 370 + }, + { + "epoch": 0.0744, + "learning_rate": 1.8203515424890744e-05, + "loss": 0.0033, + "step": 372 + }, + { + "epoch": 0.0748, + "learning_rate": 1.819552276527134e-05, + "loss": 0.0946, + "step": 374 + }, + { + "epoch": 0.0752, + "learning_rate": 1.8187514128060953e-05, + "loss": 0.0008, + "step": 376 + }, + { + "epoch": 0.0756, + "learning_rate": 1.8179489528872808e-05, + "loss": 0.014, + "step": 378 + }, + { + "epoch": 0.076, + "learning_rate": 1.817144898335129e-05, + "loss": 0.0952, + "step": 380 + }, + { + "epoch": 0.0764, + "learning_rate": 1.8163392507171837e-05, + "loss": 0.2003, + "step": 382 + }, + { + "epoch": 0.0768, + "learning_rate": 1.8155320116040983e-05, + "loss": 0.3122, + "step": 384 + }, + { + "epoch": 0.0772, + "learning_rate": 1.8147231825696255e-05, + "loss": 0.0009, + "step": 386 + }, + { + "epoch": 0.0776, + "learning_rate": 1.813912765190618e-05, + "loss": 0.0093, + "step": 388 + }, + { + "epoch": 0.078, + "learning_rate": 1.813100761047028e-05, + "loss": 1.0851, + "step": 390 + }, + { + "epoch": 0.0784, + "learning_rate": 1.812287171721897e-05, + "loss": 0.0022, + "step": 392 + }, + { + "epoch": 0.0788, + "learning_rate": 1.8114719988013616e-05, + "loss": 0.0274, + "step": 394 + }, + { + "epoch": 0.0792, + "learning_rate": 1.8106552438746403e-05, + "loss": 0.042, + "step": 396 + }, + { + "epoch": 0.0796, + "learning_rate": 1.80983690853404e-05, + "loss": 0.0171, + "step": 398 + }, + { + "epoch": 0.08, + "learning_rate": 1.809016994374947e-05, + "loss": 0.0144, + "step": 400 + }, + { + "epoch": 0.0804, + "learning_rate": 1.8081955029958272e-05, + "loss": 0.0007, + "step": 402 + }, + { + "epoch": 0.0808, + "learning_rate": 1.8073724359982194e-05, + "loss": 0.5943, + "step": 404 + }, + { + "epoch": 0.0812, + "learning_rate": 1.806547794986733e-05, + "loss": 0.0923, + "step": 406 + }, + { + "epoch": 0.0816, + "learning_rate": 1.8057215815690497e-05, + "loss": 0.0183, + "step": 408 + }, + { + "epoch": 0.082, + "learning_rate": 1.804893797355914e-05, + "loss": 0.1164, + "step": 410 + }, + { + "epoch": 0.0824, + "learning_rate": 1.8040644439611355e-05, + "loss": 0.0032, + "step": 412 + }, + { + "epoch": 0.0828, + "learning_rate": 1.8032335230015777e-05, + "loss": 0.0584, + "step": 414 + }, + { + "epoch": 0.0832, + "learning_rate": 1.802401036097167e-05, + "loss": 0.0388, + "step": 416 + }, + { + "epoch": 0.0836, + "learning_rate": 1.8015669848708764e-05, + "loss": 0.0073, + "step": 418 + }, + { + "epoch": 0.084, + "learning_rate": 1.8007313709487338e-05, + "loss": 0.05, + "step": 420 + }, + { + "epoch": 0.0844, + "learning_rate": 1.7998941959598104e-05, + "loss": 0.0234, + "step": 422 + }, + { + "epoch": 0.0848, + "learning_rate": 1.79905546153622e-05, + "loss": 0.1556, + "step": 424 + }, + { + "epoch": 0.0852, + "learning_rate": 1.7982151693131206e-05, + "loss": 0.0332, + "step": 426 + }, + { + "epoch": 0.0856, + "learning_rate": 1.7973733209287036e-05, + "loss": 0.0063, + "step": 428 + }, + { + "epoch": 0.086, + "learning_rate": 1.796529918024197e-05, + "loss": 0.1626, + "step": 430 + }, + { + "epoch": 0.0864, + "learning_rate": 1.7956849622438554e-05, + "loss": 0.0222, + "step": 432 + }, + { + "epoch": 0.0868, + "learning_rate": 1.7948384552349662e-05, + "loss": 0.0706, + "step": 434 + }, + { + "epoch": 0.0872, + "learning_rate": 1.7939903986478354e-05, + "loss": 0.016, + "step": 436 + }, + { + "epoch": 0.0876, + "learning_rate": 1.793140794135795e-05, + "loss": 0.0346, + "step": 438 + }, + { + "epoch": 0.088, + "learning_rate": 1.7922896433551917e-05, + "loss": 0.027, + "step": 440 + }, + { + "epoch": 0.0884, + "learning_rate": 1.7914369479653858e-05, + "loss": 0.0166, + "step": 442 + }, + { + "epoch": 0.0888, + "learning_rate": 1.7905827096287535e-05, + "loss": 0.2021, + "step": 444 + }, + { + "epoch": 0.0892, + "learning_rate": 1.7897269300106735e-05, + "loss": 0.0047, + "step": 446 + }, + { + "epoch": 0.0896, + "learning_rate": 1.7888696107795347e-05, + "loss": 0.5875, + "step": 448 + }, + { + "epoch": 0.09, + "learning_rate": 1.788010753606722e-05, + "loss": 0.0561, + "step": 450 + }, + { + "epoch": 0.0904, + "learning_rate": 1.7871503601666237e-05, + "loss": 0.4616, + "step": 452 + }, + { + "epoch": 0.0908, + "learning_rate": 1.786288432136619e-05, + "loss": 0.0012, + "step": 454 + }, + { + "epoch": 0.0912, + "learning_rate": 1.785424971197082e-05, + "loss": 0.0002, + "step": 456 + }, + { + "epoch": 0.0916, + "learning_rate": 1.7845599790313742e-05, + "loss": 1.082, + "step": 458 + }, + { + "epoch": 0.092, + "learning_rate": 1.78369345732584e-05, + "loss": 0.1396, + "step": 460 + }, + { + "epoch": 0.0924, + "learning_rate": 1.7828254077698106e-05, + "loss": 0.0009, + "step": 462 + }, + { + "epoch": 0.0928, + "learning_rate": 1.7819558320555902e-05, + "loss": 0.5841, + "step": 464 + }, + { + "epoch": 0.0932, + "learning_rate": 1.7810847318784635e-05, + "loss": 0.0205, + "step": 466 + }, + { + "epoch": 0.0936, + "learning_rate": 1.7802121089366836e-05, + "loss": 0.3841, + "step": 468 + }, + { + "epoch": 0.094, + "learning_rate": 1.779337964931475e-05, + "loss": 0.0196, + "step": 470 + }, + { + "epoch": 0.0944, + "learning_rate": 1.7784623015670234e-05, + "loss": 0.6998, + "step": 472 + }, + { + "epoch": 0.0948, + "learning_rate": 1.777585120550482e-05, + "loss": 0.2599, + "step": 474 + }, + { + "epoch": 0.0952, + "learning_rate": 1.7767064235919597e-05, + "loss": 0.9076, + "step": 476 + }, + { + "epoch": 0.0956, + "learning_rate": 1.7758262124045195e-05, + "loss": 0.0066, + "step": 478 + }, + { + "epoch": 0.096, + "learning_rate": 1.7749444887041803e-05, + "loss": 0.204, + "step": 480 + }, + { + "epoch": 0.0964, + "learning_rate": 1.7740612542099057e-05, + "loss": 0.2682, + "step": 482 + }, + { + "epoch": 0.0968, + "learning_rate": 1.7731765106436076e-05, + "loss": 0.1142, + "step": 484 + }, + { + "epoch": 0.0972, + "learning_rate": 1.7722902597301385e-05, + "loss": 0.0062, + "step": 486 + }, + { + "epoch": 0.0976, + "learning_rate": 1.7714025031972907e-05, + "loss": 1.0577, + "step": 488 + }, + { + "epoch": 0.098, + "learning_rate": 1.770513242775789e-05, + "loss": 0.0752, + "step": 490 + }, + { + "epoch": 0.0984, + "learning_rate": 1.769622480199295e-05, + "loss": 0.1167, + "step": 492 + }, + { + "epoch": 0.0988, + "learning_rate": 1.768730217204394e-05, + "loss": 0.3442, + "step": 494 + }, + { + "epoch": 0.0992, + "learning_rate": 1.7678364555305976e-05, + "loss": 0.3032, + "step": 496 + }, + { + "epoch": 0.0996, + "learning_rate": 1.7669411969203424e-05, + "loss": 0.0809, + "step": 498 + }, + { + "epoch": 0.1, + "learning_rate": 1.7660444431189784e-05, + "loss": 0.0624, + "step": 500 + }, + { + "epoch": 0.1004, + "learning_rate": 1.7651461958747745e-05, + "loss": 0.0247, + "step": 502 + }, + { + "epoch": 0.1008, + "learning_rate": 1.7642464569389087e-05, + "loss": 0.0325, + "step": 504 + }, + { + "epoch": 0.1012, + "learning_rate": 1.76334522806547e-05, + "loss": 0.0309, + "step": 506 + }, + { + "epoch": 0.1016, + "learning_rate": 1.7624425110114478e-05, + "loss": 0.3071, + "step": 508 + }, + { + "epoch": 0.102, + "learning_rate": 1.7615383075367373e-05, + "loss": 0.0552, + "step": 510 + }, + { + "epoch": 0.1024, + "learning_rate": 1.7606326194041278e-05, + "loss": 0.083, + "step": 512 + }, + { + "epoch": 0.1028, + "learning_rate": 1.759725448379305e-05, + "loss": 0.0455, + "step": 514 + }, + { + "epoch": 0.1032, + "learning_rate": 1.758816796230846e-05, + "loss": 0.1071, + "step": 516 + }, + { + "epoch": 0.1036, + "learning_rate": 1.7579066647302134e-05, + "loss": 0.0809, + "step": 518 + }, + { + "epoch": 0.104, + "learning_rate": 1.7569950556517566e-05, + "loss": 0.1379, + "step": 520 + }, + { + "epoch": 0.1044, + "learning_rate": 1.7560819707727027e-05, + "loss": 0.098, + "step": 522 + }, + { + "epoch": 0.1048, + "learning_rate": 1.7551674118731595e-05, + "loss": 0.0042, + "step": 524 + }, + { + "epoch": 0.1052, + "learning_rate": 1.7542513807361037e-05, + "loss": 0.0036, + "step": 526 + }, + { + "epoch": 0.1056, + "learning_rate": 1.7533338791473872e-05, + "loss": 0.0127, + "step": 528 + }, + { + "epoch": 0.106, + "learning_rate": 1.752414908895725e-05, + "loss": 0.0074, + "step": 530 + }, + { + "epoch": 0.1064, + "learning_rate": 1.7514944717726962e-05, + "loss": 0.8875, + "step": 532 + }, + { + "epoch": 0.1068, + "learning_rate": 1.7505725695727417e-05, + "loss": 0.5276, + "step": 534 + }, + { + "epoch": 0.1072, + "learning_rate": 1.749649204093155e-05, + "loss": 0.0012, + "step": 536 + }, + { + "epoch": 0.1076, + "learning_rate": 1.748724377134087e-05, + "loss": 0.0057, + "step": 538 + }, + { + "epoch": 0.108, + "learning_rate": 1.747798090498532e-05, + "loss": 0.0055, + "step": 540 + }, + { + "epoch": 0.1084, + "learning_rate": 1.746870345992336e-05, + "loss": 0.3102, + "step": 542 + }, + { + "epoch": 0.1088, + "learning_rate": 1.745941145424182e-05, + "loss": 0.0051, + "step": 544 + }, + { + "epoch": 0.1092, + "learning_rate": 1.7450104906055963e-05, + "loss": 0.1361, + "step": 546 + }, + { + "epoch": 0.1096, + "learning_rate": 1.744078383350937e-05, + "loss": 0.0676, + "step": 548 + }, + { + "epoch": 0.11, + "learning_rate": 1.7431448254773943e-05, + "loss": 0.3866, + "step": 550 + }, + { + "epoch": 0.1104, + "learning_rate": 1.7422098188049885e-05, + "loss": 0.435, + "step": 552 + }, + { + "epoch": 0.1108, + "learning_rate": 1.741273365156561e-05, + "loss": 1.7106, + "step": 554 + }, + { + "epoch": 0.1112, + "learning_rate": 1.740335466357779e-05, + "loss": 0.0229, + "step": 556 + }, + { + "epoch": 0.1116, + "learning_rate": 1.7393961242371207e-05, + "loss": 0.0235, + "step": 558 + }, + { + "epoch": 0.112, + "learning_rate": 1.7384553406258842e-05, + "loss": 0.0269, + "step": 560 + }, + { + "epoch": 0.1124, + "learning_rate": 1.7375131173581737e-05, + "loss": 0.0249, + "step": 562 + }, + { + "epoch": 0.1128, + "learning_rate": 1.7365694562709038e-05, + "loss": 0.039, + "step": 564 + }, + { + "epoch": 0.1132, + "learning_rate": 1.7356243592037882e-05, + "loss": 0.0191, + "step": 566 + }, + { + "epoch": 0.1136, + "learning_rate": 1.7346778279993417e-05, + "loss": 0.0521, + "step": 568 + }, + { + "epoch": 0.114, + "learning_rate": 1.7337298645028768e-05, + "loss": 0.1259, + "step": 570 + }, + { + "epoch": 0.1144, + "learning_rate": 1.7327804705624955e-05, + "loss": 0.0221, + "step": 572 + }, + { + "epoch": 0.1148, + "learning_rate": 1.7318296480290916e-05, + "loss": 0.584, + "step": 574 + }, + { + "epoch": 0.1152, + "learning_rate": 1.7308773987563403e-05, + "loss": 0.3576, + "step": 576 + }, + { + "epoch": 0.1156, + "learning_rate": 1.7299237246007018e-05, + "loss": 0.1187, + "step": 578 + }, + { + "epoch": 0.116, + "learning_rate": 1.728968627421411e-05, + "loss": 0.1556, + "step": 580 + }, + { + "epoch": 0.1164, + "learning_rate": 1.7280121090804813e-05, + "loss": 0.0084, + "step": 582 + }, + { + "epoch": 0.1168, + "learning_rate": 1.7270541714426926e-05, + "loss": 0.1878, + "step": 584 + }, + { + "epoch": 0.1172, + "learning_rate": 1.7260948163755925e-05, + "loss": 0.0495, + "step": 586 + }, + { + "epoch": 0.1176, + "learning_rate": 1.7251340457494937e-05, + "loss": 0.266, + "step": 588 + }, + { + "epoch": 0.118, + "learning_rate": 1.7241718614374674e-05, + "loss": 0.8291, + "step": 590 + }, + { + "epoch": 0.1184, + "learning_rate": 1.7232082653153426e-05, + "loss": 0.0665, + "step": 592 + }, + { + "epoch": 0.1188, + "learning_rate": 1.722243259261697e-05, + "loss": 0.0025, + "step": 594 + }, + { + "epoch": 0.1192, + "learning_rate": 1.7212768451578612e-05, + "loss": 0.1388, + "step": 596 + }, + { + "epoch": 0.1196, + "learning_rate": 1.720309024887907e-05, + "loss": 0.0219, + "step": 598 + }, + { + "epoch": 0.12, + "learning_rate": 1.7193398003386514e-05, + "loss": 0.0206, + "step": 600 + }, + { + "epoch": 0.1204, + "learning_rate": 1.718369173399647e-05, + "loss": 0.3012, + "step": 602 + }, + { + "epoch": 0.1208, + "learning_rate": 1.717397145963179e-05, + "loss": 0.0354, + "step": 604 + }, + { + "epoch": 0.1212, + "learning_rate": 1.7164237199242663e-05, + "loss": 0.3381, + "step": 606 + }, + { + "epoch": 0.1216, + "learning_rate": 1.7154488971806515e-05, + "loss": 0.012, + "step": 608 + }, + { + "epoch": 0.122, + "learning_rate": 1.7144726796328037e-05, + "loss": 0.01, + "step": 610 + }, + { + "epoch": 0.1224, + "learning_rate": 1.7134950691839063e-05, + "loss": 0.1012, + "step": 612 + }, + { + "epoch": 0.1228, + "learning_rate": 1.7125160677398632e-05, + "loss": 0.0089, + "step": 614 + }, + { + "epoch": 0.1232, + "learning_rate": 1.7115356772092854e-05, + "loss": 0.018, + "step": 616 + }, + { + "epoch": 0.1236, + "learning_rate": 1.710553899503496e-05, + "loss": 0.0405, + "step": 618 + }, + { + "epoch": 0.124, + "learning_rate": 1.7095707365365218e-05, + "loss": 0.0351, + "step": 620 + }, + { + "epoch": 0.1244, + "learning_rate": 1.7085861902250864e-05, + "loss": 0.4298, + "step": 622 + }, + { + "epoch": 0.1248, + "learning_rate": 1.7076002624886156e-05, + "loss": 0.0399, + "step": 624 + }, + { + "epoch": 0.1252, + "learning_rate": 1.706612955249225e-05, + "loss": 0.118, + "step": 626 + }, + { + "epoch": 0.1256, + "learning_rate": 1.705624270431721e-05, + "loss": 0.0466, + "step": 628 + }, + { + "epoch": 0.126, + "learning_rate": 1.7046342099635945e-05, + "loss": 0.1067, + "step": 630 + }, + { + "epoch": 0.1264, + "learning_rate": 1.703642775775021e-05, + "loss": 0.0028, + "step": 632 + }, + { + "epoch": 0.1268, + "learning_rate": 1.70264996979885e-05, + "loss": 0.0874, + "step": 634 + }, + { + "epoch": 0.1272, + "learning_rate": 1.7016557939706075e-05, + "loss": 0.0072, + "step": 636 + }, + { + "epoch": 0.1276, + "learning_rate": 1.7006602502284923e-05, + "loss": 0.1478, + "step": 638 + }, + { + "epoch": 0.128, + "learning_rate": 1.6996633405133656e-05, + "loss": 0.0016, + "step": 640 + }, + { + "epoch": 0.1284, + "learning_rate": 1.6986650667687562e-05, + "loss": 0.7672, + "step": 642 + }, + { + "epoch": 0.1288, + "learning_rate": 1.6976654309408468e-05, + "loss": 0.0356, + "step": 644 + }, + { + "epoch": 0.1292, + "learning_rate": 1.6966644349784812e-05, + "loss": 0.0654, + "step": 646 + }, + { + "epoch": 0.1296, + "learning_rate": 1.6956620808331505e-05, + "loss": 0.3329, + "step": 648 + }, + { + "epoch": 0.13, + "learning_rate": 1.694658370458998e-05, + "loss": 0.0003, + "step": 650 + }, + { + "epoch": 0.1304, + "learning_rate": 1.6936533058128056e-05, + "loss": 0.0302, + "step": 652 + }, + { + "epoch": 0.1308, + "learning_rate": 1.6926468888539988e-05, + "loss": 0.0034, + "step": 654 + }, + { + "epoch": 0.1312, + "learning_rate": 1.691639121544641e-05, + "loss": 0.0221, + "step": 656 + }, + { + "epoch": 0.1316, + "learning_rate": 1.690630005849423e-05, + "loss": 0.0497, + "step": 658 + }, + { + "epoch": 0.132, + "learning_rate": 1.6896195437356706e-05, + "loss": 0.0259, + "step": 660 + }, + { + "epoch": 0.1324, + "learning_rate": 1.6886077371733285e-05, + "loss": 0.3392, + "step": 662 + }, + { + "epoch": 0.1328, + "learning_rate": 1.687594588134968e-05, + "loss": 0.0058, + "step": 664 + }, + { + "epoch": 0.1332, + "learning_rate": 1.686580098595772e-05, + "loss": 0.0008, + "step": 666 + }, + { + "epoch": 0.1336, + "learning_rate": 1.685564270533544e-05, + "loss": 0.1172, + "step": 668 + }, + { + "epoch": 0.134, + "learning_rate": 1.6845471059286896e-05, + "loss": 0.0693, + "step": 670 + }, + { + "epoch": 0.1344, + "learning_rate": 1.683528606764223e-05, + "loss": 0.2912, + "step": 672 + }, + { + "epoch": 0.1348, + "learning_rate": 1.6825087750257624e-05, + "loss": 0.1856, + "step": 674 + }, + { + "epoch": 0.1352, + "learning_rate": 1.6814876127015198e-05, + "loss": 0.003, + "step": 676 + }, + { + "epoch": 0.1356, + "learning_rate": 1.680465121782306e-05, + "loss": 0.0009, + "step": 678 + }, + { + "epoch": 0.136, + "learning_rate": 1.6794413042615168e-05, + "loss": 0.0036, + "step": 680 + }, + { + "epoch": 0.1364, + "learning_rate": 1.6784161621351384e-05, + "loss": 0.0019, + "step": 682 + }, + { + "epoch": 0.1368, + "learning_rate": 1.6773896974017367e-05, + "loss": 0.7971, + "step": 684 + }, + { + "epoch": 0.1372, + "learning_rate": 1.67636191206246e-05, + "loss": 0.0043, + "step": 686 + }, + { + "epoch": 0.1376, + "learning_rate": 1.6753328081210254e-05, + "loss": 0.3297, + "step": 688 + }, + { + "epoch": 0.138, + "learning_rate": 1.674302387583724e-05, + "loss": 0.2305, + "step": 690 + }, + { + "epoch": 0.1384, + "learning_rate": 1.673270652459414e-05, + "loss": 0.0414, + "step": 692 + }, + { + "epoch": 0.1388, + "learning_rate": 1.672237604759516e-05, + "loss": 0.0808, + "step": 694 + }, + { + "epoch": 0.1392, + "learning_rate": 1.6712032464980098e-05, + "loss": 0.0226, + "step": 696 + }, + { + "epoch": 0.1396, + "learning_rate": 1.6701675796914284e-05, + "loss": 0.2197, + "step": 698 + }, + { + "epoch": 0.14, + "learning_rate": 1.669130606358859e-05, + "loss": 0.0045, + "step": 700 + }, + { + "epoch": 0.1404, + "learning_rate": 1.668092328521932e-05, + "loss": 0.2025, + "step": 702 + }, + { + "epoch": 0.1408, + "learning_rate": 1.667052748204825e-05, + "loss": 0.0205, + "step": 704 + }, + { + "epoch": 0.1412, + "learning_rate": 1.6660118674342525e-05, + "loss": 0.3811, + "step": 706 + }, + { + "epoch": 0.1416, + "learning_rate": 1.6649696882394638e-05, + "loss": 0.0012, + "step": 708 + }, + { + "epoch": 0.142, + "learning_rate": 1.663926212652242e-05, + "loss": 0.0027, + "step": 710 + }, + { + "epoch": 0.1424, + "learning_rate": 1.6628814427068947e-05, + "loss": 0.0094, + "step": 712 + }, + { + "epoch": 0.1428, + "learning_rate": 1.6618353804402573e-05, + "loss": 0.0014, + "step": 714 + }, + { + "epoch": 0.1432, + "learning_rate": 1.6607880278916778e-05, + "loss": 0.0675, + "step": 716 + }, + { + "epoch": 0.1436, + "learning_rate": 1.6597393871030267e-05, + "loss": 0.0018, + "step": 718 + }, + { + "epoch": 0.144, + "learning_rate": 1.6586894601186804e-05, + "loss": 0.0057, + "step": 720 + }, + { + "epoch": 0.1444, + "learning_rate": 1.6576382489855274e-05, + "loss": 0.0002, + "step": 722 + }, + { + "epoch": 0.1448, + "learning_rate": 1.6565857557529574e-05, + "loss": 0.0031, + "step": 724 + }, + { + "epoch": 0.1452, + "learning_rate": 1.6555319824728577e-05, + "loss": 0.0211, + "step": 726 + }, + { + "epoch": 0.1456, + "learning_rate": 1.654476931199616e-05, + "loss": 0.5884, + "step": 728 + }, + { + "epoch": 0.146, + "learning_rate": 1.6534206039901057e-05, + "loss": 0.0072, + "step": 730 + }, + { + "epoch": 0.1464, + "learning_rate": 1.6523630029036937e-05, + "loss": 0.0262, + "step": 732 + }, + { + "epoch": 0.1468, + "learning_rate": 1.6513041300022253e-05, + "loss": 0.0568, + "step": 734 + }, + { + "epoch": 0.1472, + "learning_rate": 1.6502439873500294e-05, + "loss": 0.0219, + "step": 736 + }, + { + "epoch": 0.1476, + "learning_rate": 1.649182577013906e-05, + "loss": 0.0125, + "step": 738 + }, + { + "epoch": 0.148, + "learning_rate": 1.6481199010631312e-05, + "loss": 0.0033, + "step": 740 + }, + { + "epoch": 0.1484, + "learning_rate": 1.6470559615694452e-05, + "loss": 0.0358, + "step": 742 + }, + { + "epoch": 0.1488, + "learning_rate": 1.6459907606070513e-05, + "loss": 0.0027, + "step": 744 + }, + { + "epoch": 0.1492, + "learning_rate": 1.6449243002526153e-05, + "loss": 0.0066, + "step": 746 + }, + { + "epoch": 0.1496, + "learning_rate": 1.6438565825852543e-05, + "loss": 0.9157, + "step": 748 + }, + { + "epoch": 0.15, + "learning_rate": 1.6427876096865397e-05, + "loss": 1.4896, + "step": 750 + }, + { + "epoch": 0.1504, + "learning_rate": 1.641717383640488e-05, + "loss": 0.0099, + "step": 752 + }, + { + "epoch": 0.1508, + "learning_rate": 1.640645906533562e-05, + "loss": 0.246, + "step": 754 + }, + { + "epoch": 0.1512, + "learning_rate": 1.639573180454658e-05, + "loss": 0.0012, + "step": 756 + }, + { + "epoch": 0.1516, + "learning_rate": 1.638499207495112e-05, + "loss": 0.007, + "step": 758 + }, + { + "epoch": 0.152, + "learning_rate": 1.6374239897486902e-05, + "loss": 0.0073, + "step": 760 + }, + { + "epoch": 0.1524, + "learning_rate": 1.636347529311582e-05, + "loss": 0.0057, + "step": 762 + }, + { + "epoch": 0.1528, + "learning_rate": 1.6352698282824052e-05, + "loss": 0.0959, + "step": 764 + }, + { + "epoch": 0.1532, + "learning_rate": 1.6341908887621897e-05, + "loss": 0.0301, + "step": 766 + }, + { + "epoch": 0.1536, + "learning_rate": 1.6331107128543863e-05, + "loss": 0.4602, + "step": 768 + }, + { + "epoch": 0.154, + "learning_rate": 1.6320293026648505e-05, + "loss": 0.5994, + "step": 770 + }, + { + "epoch": 0.1544, + "learning_rate": 1.63094666030185e-05, + "loss": 0.0361, + "step": 772 + }, + { + "epoch": 0.1548, + "learning_rate": 1.6298627878760488e-05, + "loss": 0.0039, + "step": 774 + }, + { + "epoch": 0.1552, + "learning_rate": 1.6287776875005134e-05, + "loss": 0.3862, + "step": 776 + }, + { + "epoch": 0.1556, + "learning_rate": 1.6276913612907012e-05, + "loss": 0.0158, + "step": 778 + }, + { + "epoch": 0.156, + "learning_rate": 1.6266038113644605e-05, + "loss": 0.3118, + "step": 780 + }, + { + "epoch": 0.1564, + "learning_rate": 1.625515039842028e-05, + "loss": 0.1354, + "step": 782 + }, + { + "epoch": 0.1568, + "learning_rate": 1.624425048846016e-05, + "loss": 0.0425, + "step": 784 + }, + { + "epoch": 0.1572, + "learning_rate": 1.6233338405014204e-05, + "loss": 0.1001, + "step": 786 + }, + { + "epoch": 0.1576, + "learning_rate": 1.622241416935606e-05, + "loss": 0.2451, + "step": 788 + }, + { + "epoch": 0.158, + "learning_rate": 1.6211477802783105e-05, + "loss": 0.3585, + "step": 790 + }, + { + "epoch": 0.1584, + "learning_rate": 1.6200529326616322e-05, + "loss": 0.0001, + "step": 792 + }, + { + "epoch": 0.1588, + "learning_rate": 1.6189568762200352e-05, + "loss": 0.0509, + "step": 794 + }, + { + "epoch": 0.1592, + "learning_rate": 1.617859613090335e-05, + "loss": 0.0014, + "step": 796 + }, + { + "epoch": 0.1596, + "learning_rate": 1.6167611454117027e-05, + "loss": 0.0247, + "step": 798 + }, + { + "epoch": 0.16, + "learning_rate": 1.615661475325659e-05, + "loss": 0.0164, + "step": 800 + }, + { + "epoch": 0.1604, + "learning_rate": 1.6145606049760644e-05, + "loss": 0.0382, + "step": 802 + }, + { + "epoch": 0.1608, + "learning_rate": 1.613458536509125e-05, + "loss": 0.2081, + "step": 804 + }, + { + "epoch": 0.1612, + "learning_rate": 1.612355272073377e-05, + "loss": 0.0109, + "step": 806 + }, + { + "epoch": 0.1616, + "learning_rate": 1.611250813819692e-05, + "loss": 0.1382, + "step": 808 + }, + { + "epoch": 0.162, + "learning_rate": 1.6101451639012675e-05, + "loss": 0.355, + "step": 810 + }, + { + "epoch": 0.1624, + "learning_rate": 1.609038324473626e-05, + "loss": 0.0015, + "step": 812 + }, + { + "epoch": 0.1628, + "learning_rate": 1.6079302976946062e-05, + "loss": 0.0008, + "step": 814 + }, + { + "epoch": 0.1632, + "learning_rate": 1.606821085724362e-05, + "loss": 0.0011, + "step": 816 + }, + { + "epoch": 0.1636, + "learning_rate": 1.605710690725362e-05, + "loss": 0.1272, + "step": 818 + }, + { + "epoch": 0.164, + "learning_rate": 1.604599114862375e-05, + "loss": 0.001, + "step": 820 + }, + { + "epoch": 0.1644, + "learning_rate": 1.6034863603024775e-05, + "loss": 0.01, + "step": 822 + }, + { + "epoch": 0.1648, + "learning_rate": 1.6023724292150387e-05, + "loss": 0.0913, + "step": 824 + }, + { + "epoch": 0.1652, + "learning_rate": 1.601257323771727e-05, + "loss": 0.0152, + "step": 826 + }, + { + "epoch": 0.1656, + "learning_rate": 1.600141046146495e-05, + "loss": 0.0498, + "step": 828 + }, + { + "epoch": 0.166, + "learning_rate": 1.5990235985155863e-05, + "loss": 0.0014, + "step": 830 + }, + { + "epoch": 0.1664, + "learning_rate": 1.59790498305752e-05, + "loss": 0.0007, + "step": 832 + }, + { + "epoch": 0.1668, + "learning_rate": 1.5967852019530932e-05, + "loss": 0.0424, + "step": 834 + }, + { + "epoch": 0.1672, + "learning_rate": 1.5956642573853784e-05, + "loss": 0.0029, + "step": 836 + }, + { + "epoch": 0.1676, + "learning_rate": 1.594542151539713e-05, + "loss": 0.1625, + "step": 838 + }, + { + "epoch": 0.168, + "learning_rate": 1.593418886603702e-05, + "loss": 0.4397, + "step": 840 + }, + { + "epoch": 0.1684, + "learning_rate": 1.592294464767205e-05, + "loss": 0.0001, + "step": 842 + }, + { + "epoch": 0.1688, + "learning_rate": 1.591168888222342e-05, + "loss": 0.5525, + "step": 844 + }, + { + "epoch": 0.1692, + "learning_rate": 1.5900421591634806e-05, + "loss": 0.001, + "step": 846 + }, + { + "epoch": 0.1696, + "learning_rate": 1.588914279787239e-05, + "loss": 0.2071, + "step": 848 + }, + { + "epoch": 0.17, + "learning_rate": 1.5877852522924743e-05, + "loss": 0.1003, + "step": 850 + }, + { + "epoch": 0.1704, + "learning_rate": 1.5866550788802818e-05, + "loss": 0.002, + "step": 852 + }, + { + "epoch": 0.1708, + "learning_rate": 1.5855237617539946e-05, + "loss": 0.002, + "step": 854 + }, + { + "epoch": 0.1712, + "learning_rate": 1.584391303119172e-05, + "loss": 0.0375, + "step": 856 + }, + { + "epoch": 0.1716, + "learning_rate": 1.5832577051836023e-05, + "loss": 0.006, + "step": 858 + }, + { + "epoch": 0.172, + "learning_rate": 1.5821229701572894e-05, + "loss": 1.2357, + "step": 860 + }, + { + "epoch": 0.1724, + "learning_rate": 1.5809871002524605e-05, + "loss": 0.0632, + "step": 862 + }, + { + "epoch": 0.1728, + "learning_rate": 1.5798500976835493e-05, + "loss": 0.0037, + "step": 864 + }, + { + "epoch": 0.1732, + "learning_rate": 1.5787119646672025e-05, + "loss": 0.6166, + "step": 866 + }, + { + "epoch": 0.1736, + "learning_rate": 1.5775727034222685e-05, + "loss": 0.0706, + "step": 868 + }, + { + "epoch": 0.174, + "learning_rate": 1.5764323161697936e-05, + "loss": 0.149, + "step": 870 + }, + { + "epoch": 0.1744, + "learning_rate": 1.575290805133023e-05, + "loss": 0.0026, + "step": 872 + }, + { + "epoch": 0.1748, + "learning_rate": 1.5741481725373896e-05, + "loss": 0.0041, + "step": 874 + }, + { + "epoch": 0.1752, + "learning_rate": 1.573004420610516e-05, + "loss": 0.0028, + "step": 876 + }, + { + "epoch": 0.1756, + "learning_rate": 1.5718595515822023e-05, + "loss": 0.0067, + "step": 878 + }, + { + "epoch": 0.176, + "learning_rate": 1.5707135676844326e-05, + "loss": 0.0047, + "step": 880 + }, + { + "epoch": 0.1764, + "learning_rate": 1.5695664711513582e-05, + "loss": 0.1763, + "step": 882 + }, + { + "epoch": 0.1768, + "learning_rate": 1.568418264219303e-05, + "loss": 0.4129, + "step": 884 + }, + { + "epoch": 0.1772, + "learning_rate": 1.5672689491267573e-05, + "loss": 0.0077, + "step": 886 + }, + { + "epoch": 0.1776, + "learning_rate": 1.566118528114367e-05, + "loss": 0.0093, + "step": 888 + }, + { + "epoch": 0.178, + "learning_rate": 1.5649670034249382e-05, + "loss": 0.0561, + "step": 890 + }, + { + "epoch": 0.1784, + "learning_rate": 1.5638143773034265e-05, + "loss": 0.1085, + "step": 892 + }, + { + "epoch": 0.1788, + "learning_rate": 1.5626606519969373e-05, + "loss": 0.0065, + "step": 894 + }, + { + "epoch": 0.1792, + "learning_rate": 1.5615058297547144e-05, + "loss": 0.0132, + "step": 896 + }, + { + "epoch": 0.1796, + "learning_rate": 1.560349912828145e-05, + "loss": 0.008, + "step": 898 + }, + { + "epoch": 0.18, + "learning_rate": 1.5591929034707475e-05, + "loss": 0.145, + "step": 900 + }, + { + "epoch": 0.1804, + "learning_rate": 1.5580348039381698e-05, + "loss": 0.2927, + "step": 902 + }, + { + "epoch": 0.1808, + "learning_rate": 1.5568756164881887e-05, + "loss": 0.1332, + "step": 904 + }, + { + "epoch": 0.1812, + "learning_rate": 1.5557153433806967e-05, + "loss": 0.0025, + "step": 906 + }, + { + "epoch": 0.1816, + "learning_rate": 1.554553986877708e-05, + "loss": 0.0002, + "step": 908 + }, + { + "epoch": 0.182, + "learning_rate": 1.553391549243344e-05, + "loss": 0.0039, + "step": 910 + }, + { + "epoch": 0.1824, + "learning_rate": 1.552228032743839e-05, + "loss": 0.0002, + "step": 912 + }, + { + "epoch": 0.1828, + "learning_rate": 1.551063439647526e-05, + "loss": 0.0011, + "step": 914 + }, + { + "epoch": 0.1832, + "learning_rate": 1.54989777222484e-05, + "loss": 0.0208, + "step": 916 + }, + { + "epoch": 0.1836, + "learning_rate": 1.548731032748309e-05, + "loss": 0.0041, + "step": 918 + }, + { + "epoch": 0.184, + "learning_rate": 1.54756322349255e-05, + "loss": 0.0034, + "step": 920 + }, + { + "epoch": 0.1844, + "learning_rate": 1.5463943467342697e-05, + "loss": 0.5626, + "step": 922 + }, + { + "epoch": 0.1848, + "learning_rate": 1.5452244047522504e-05, + "loss": 0.8541, + "step": 924 + }, + { + "epoch": 0.1852, + "learning_rate": 1.5440533998273556e-05, + "loss": 0.3846, + "step": 926 + }, + { + "epoch": 0.1856, + "learning_rate": 1.5428813342425177e-05, + "loss": 0.001, + "step": 928 + }, + { + "epoch": 0.186, + "learning_rate": 1.54170821028274e-05, + "loss": 0.2608, + "step": 930 + }, + { + "epoch": 0.1864, + "learning_rate": 1.5405340302350866e-05, + "loss": 0.0022, + "step": 932 + }, + { + "epoch": 0.1868, + "learning_rate": 1.5393587963886837e-05, + "loss": 0.0015, + "step": 934 + }, + { + "epoch": 0.1872, + "learning_rate": 1.5381825110347082e-05, + "loss": 0.0034, + "step": 936 + }, + { + "epoch": 0.1876, + "learning_rate": 1.5370051764663875e-05, + "loss": 0.0174, + "step": 938 + }, + { + "epoch": 0.188, + "learning_rate": 1.5358267949789975e-05, + "loss": 2.0412, + "step": 940 + }, + { + "epoch": 0.1884, + "learning_rate": 1.5346473688698514e-05, + "loss": 0.7176, + "step": 942 + }, + { + "epoch": 0.1888, + "learning_rate": 1.5334669004383032e-05, + "loss": 0.7102, + "step": 944 + }, + { + "epoch": 0.1892, + "learning_rate": 1.532285391985734e-05, + "loss": 1.3094, + "step": 946 + }, + { + "epoch": 0.1896, + "learning_rate": 1.531102845815557e-05, + "loss": 0.0413, + "step": 948 + }, + { + "epoch": 0.19, + "learning_rate": 1.5299192642332042e-05, + "loss": 0.1397, + "step": 950 + }, + { + "epoch": 0.1904, + "learning_rate": 1.5287346495461322e-05, + "loss": 0.107, + "step": 952 + }, + { + "epoch": 0.1908, + "learning_rate": 1.527549004063805e-05, + "loss": 0.0871, + "step": 954 + }, + { + "epoch": 0.1912, + "learning_rate": 1.5263623300976983e-05, + "loss": 0.0419, + "step": 956 + }, + { + "epoch": 0.1916, + "learning_rate": 1.5251746299612964e-05, + "loss": 0.2999, + "step": 958 + }, + { + "epoch": 0.192, + "learning_rate": 1.523985905970079e-05, + "loss": 0.0128, + "step": 960 + }, + { + "epoch": 0.1924, + "learning_rate": 1.5227961604415273e-05, + "loss": 0.13, + "step": 962 + }, + { + "epoch": 0.1928, + "learning_rate": 1.5216053956951081e-05, + "loss": 0.0128, + "step": 964 + }, + { + "epoch": 0.1932, + "learning_rate": 1.5204136140522806e-05, + "loss": 0.6996, + "step": 966 + }, + { + "epoch": 0.1936, + "learning_rate": 1.5192208178364815e-05, + "loss": 0.022, + "step": 968 + }, + { + "epoch": 0.194, + "learning_rate": 1.5180270093731305e-05, + "loss": 0.0511, + "step": 970 + }, + { + "epoch": 0.1944, + "learning_rate": 1.516832190989618e-05, + "loss": 0.1377, + "step": 972 + }, + { + "epoch": 0.1948, + "learning_rate": 1.5156363650153013e-05, + "loss": 0.1297, + "step": 974 + }, + { + "epoch": 0.1952, + "learning_rate": 1.514439533781507e-05, + "loss": 0.1839, + "step": 976 + }, + { + "epoch": 0.1956, + "learning_rate": 1.5132416996215166e-05, + "loss": 0.4618, + "step": 978 + }, + { + "epoch": 0.196, + "learning_rate": 1.5120428648705722e-05, + "loss": 0.658, + "step": 980 + }, + { + "epoch": 0.1964, + "learning_rate": 1.5108430318658597e-05, + "loss": 0.0276, + "step": 982 + }, + { + "epoch": 0.1968, + "learning_rate": 1.5096422029465185e-05, + "loss": 0.0538, + "step": 984 + }, + { + "epoch": 0.1972, + "learning_rate": 1.5084403804536228e-05, + "loss": 0.4564, + "step": 986 + }, + { + "epoch": 0.1976, + "learning_rate": 1.5072375667301893e-05, + "loss": 0.0293, + "step": 988 + }, + { + "epoch": 0.198, + "learning_rate": 1.5060337641211649e-05, + "loss": 0.0061, + "step": 990 + }, + { + "epoch": 0.1984, + "learning_rate": 1.504828974973422e-05, + "loss": 0.0115, + "step": 992 + }, + { + "epoch": 0.1988, + "learning_rate": 1.5036232016357613e-05, + "loss": 0.0571, + "step": 994 + }, + { + "epoch": 0.1992, + "learning_rate": 1.5024164464588987e-05, + "loss": 0.3275, + "step": 996 + }, + { + "epoch": 0.1996, + "learning_rate": 1.5012087117954648e-05, + "loss": 0.1458, + "step": 998 + }, + { + "epoch": 0.2, + "learning_rate": 1.4999999999999999e-05, + "loss": 0.0153, + "step": 1000 + }, + { + "epoch": 0.2004, + "learning_rate": 1.4987903134289514e-05, + "loss": 0.0208, + "step": 1002 + }, + { + "epoch": 0.2008, + "learning_rate": 1.4975796544406627e-05, + "loss": 0.0091, + "step": 1004 + }, + { + "epoch": 0.2012, + "learning_rate": 1.496368025395377e-05, + "loss": 0.0659, + "step": 1006 + }, + { + "epoch": 0.2016, + "learning_rate": 1.4951554286552274e-05, + "loss": 1.4437, + "step": 1008 + }, + { + "epoch": 0.202, + "learning_rate": 1.493941866584231e-05, + "loss": 0.0591, + "step": 1010 + }, + { + "epoch": 0.2024, + "learning_rate": 1.4927273415482927e-05, + "loss": 0.634, + "step": 1012 + }, + { + "epoch": 0.2028, + "learning_rate": 1.4915118559151873e-05, + "loss": 0.0484, + "step": 1014 + }, + { + "epoch": 0.2032, + "learning_rate": 1.4902954120545692e-05, + "loss": 0.0196, + "step": 1016 + }, + { + "epoch": 0.2036, + "learning_rate": 1.489078012337956e-05, + "loss": 0.139, + "step": 1018 + }, + { + "epoch": 0.204, + "learning_rate": 1.4878596591387334e-05, + "loss": 0.0229, + "step": 1020 + }, + { + "epoch": 0.2044, + "learning_rate": 1.4866403548321397e-05, + "loss": 0.1789, + "step": 1022 + }, + { + "epoch": 0.2048, + "learning_rate": 1.485420101795274e-05, + "loss": 0.1511, + "step": 1024 + }, + { + "epoch": 0.2052, + "learning_rate": 1.4841989024070816e-05, + "loss": 0.0208, + "step": 1026 + }, + { + "epoch": 0.2056, + "learning_rate": 1.482976759048351e-05, + "loss": 0.0206, + "step": 1028 + }, + { + "epoch": 0.206, + "learning_rate": 1.4817536741017162e-05, + "loss": 0.019, + "step": 1030 + }, + { + "epoch": 0.2064, + "learning_rate": 1.4805296499516411e-05, + "loss": 0.1799, + "step": 1032 + }, + { + "epoch": 0.2068, + "learning_rate": 1.4793046889844254e-05, + "loss": 0.0082, + "step": 1034 + }, + { + "epoch": 0.2072, + "learning_rate": 1.4780787935881918e-05, + "loss": 0.9444, + "step": 1036 + }, + { + "epoch": 0.2076, + "learning_rate": 1.4768519661528884e-05, + "loss": 0.0176, + "step": 1038 + }, + { + "epoch": 0.208, + "learning_rate": 1.475624209070275e-05, + "loss": 0.0163, + "step": 1040 + }, + { + "epoch": 0.2084, + "learning_rate": 1.4743955247339299e-05, + "loss": 0.0444, + "step": 1042 + }, + { + "epoch": 0.2088, + "learning_rate": 1.4731659155392337e-05, + "loss": 0.031, + "step": 1044 + }, + { + "epoch": 0.2092, + "learning_rate": 1.4719353838833727e-05, + "loss": 0.0166, + "step": 1046 + }, + { + "epoch": 0.2096, + "learning_rate": 1.4707039321653333e-05, + "loss": 0.007, + "step": 1048 + }, + { + "epoch": 0.21, + "learning_rate": 1.469471562785891e-05, + "loss": 0.0098, + "step": 1050 + }, + { + "epoch": 0.2104, + "learning_rate": 1.4682382781476147e-05, + "loss": 0.0094, + "step": 1052 + }, + { + "epoch": 0.2108, + "learning_rate": 1.467004080654855e-05, + "loss": 0.1137, + "step": 1054 + }, + { + "epoch": 0.2112, + "learning_rate": 1.4657689727137448e-05, + "loss": 0.0128, + "step": 1056 + }, + { + "epoch": 0.2116, + "learning_rate": 1.4645329567321873e-05, + "loss": 0.0293, + "step": 1058 + }, + { + "epoch": 0.212, + "learning_rate": 1.4632960351198624e-05, + "loss": 0.0258, + "step": 1060 + }, + { + "epoch": 0.2124, + "learning_rate": 1.4620582102882093e-05, + "loss": 0.1549, + "step": 1062 + }, + { + "epoch": 0.2128, + "learning_rate": 1.460819484650431e-05, + "loss": 0.0244, + "step": 1064 + }, + { + "epoch": 0.2132, + "learning_rate": 1.4595798606214887e-05, + "loss": 0.0245, + "step": 1066 + }, + { + "epoch": 0.2136, + "learning_rate": 1.4583393406180898e-05, + "loss": 0.0068, + "step": 1068 + }, + { + "epoch": 0.214, + "learning_rate": 1.4570979270586951e-05, + "loss": 0.0339, + "step": 1070 + }, + { + "epoch": 0.2144, + "learning_rate": 1.4558556223635004e-05, + "loss": 0.1695, + "step": 1072 + }, + { + "epoch": 0.2148, + "learning_rate": 1.4546124289544444e-05, + "loss": 0.0029, + "step": 1074 + }, + { + "epoch": 0.2152, + "learning_rate": 1.4533683492551949e-05, + "loss": 0.0013, + "step": 1076 + }, + { + "epoch": 0.2156, + "learning_rate": 1.4521233856911512e-05, + "loss": 0.0006, + "step": 1078 + }, + { + "epoch": 0.216, + "learning_rate": 1.4508775406894313e-05, + "loss": 0.0411, + "step": 1080 + }, + { + "epoch": 0.2164, + "learning_rate": 1.4496308166788737e-05, + "loss": 0.0126, + "step": 1082 + }, + { + "epoch": 0.2168, + "learning_rate": 1.4483832160900328e-05, + "loss": 0.0176, + "step": 1084 + }, + { + "epoch": 0.2172, + "learning_rate": 1.4471347413551673e-05, + "loss": 0.0293, + "step": 1086 + }, + { + "epoch": 0.2176, + "learning_rate": 1.445885394908245e-05, + "loss": 0.0361, + "step": 1088 + }, + { + "epoch": 0.218, + "learning_rate": 1.4446351791849276e-05, + "loss": 0.0035, + "step": 1090 + }, + { + "epoch": 0.2184, + "learning_rate": 1.4433840966225775e-05, + "loss": 0.1905, + "step": 1092 + }, + { + "epoch": 0.2188, + "learning_rate": 1.4421321496602423e-05, + "loss": 0.1718, + "step": 1094 + }, + { + "epoch": 0.2192, + "learning_rate": 1.440879340738659e-05, + "loss": 0.002, + "step": 1096 + }, + { + "epoch": 0.2196, + "learning_rate": 1.439625672300241e-05, + "loss": 0.0145, + "step": 1098 + }, + { + "epoch": 0.22, + "learning_rate": 1.4383711467890779e-05, + "loss": 0.7069, + "step": 1100 + }, + { + "epoch": 0.2204, + "learning_rate": 1.4371157666509335e-05, + "loss": 0.0021, + "step": 1102 + }, + { + "epoch": 0.2208, + "learning_rate": 1.435859534333234e-05, + "loss": 0.8011, + "step": 1104 + }, + { + "epoch": 0.2212, + "learning_rate": 1.434602452285071e-05, + "loss": 0.0017, + "step": 1106 + }, + { + "epoch": 0.2216, + "learning_rate": 1.4333445229571872e-05, + "loss": 0.0273, + "step": 1108 + }, + { + "epoch": 0.222, + "learning_rate": 1.4320857488019824e-05, + "loss": 0.0022, + "step": 1110 + }, + { + "epoch": 0.2224, + "learning_rate": 1.4308261322734998e-05, + "loss": 0.0037, + "step": 1112 + }, + { + "epoch": 0.2228, + "learning_rate": 1.4295656758274285e-05, + "loss": 0.0088, + "step": 1114 + }, + { + "epoch": 0.2232, + "learning_rate": 1.4283043819210915e-05, + "loss": 0.0199, + "step": 1116 + }, + { + "epoch": 0.2236, + "learning_rate": 1.4270422530134439e-05, + "loss": 0.0012, + "step": 1118 + }, + { + "epoch": 0.224, + "learning_rate": 1.4257792915650733e-05, + "loss": 0.0026, + "step": 1120 + }, + { + "epoch": 0.2244, + "learning_rate": 1.4245155000381857e-05, + "loss": 1.2346, + "step": 1122 + }, + { + "epoch": 0.2248, + "learning_rate": 1.4232508808966099e-05, + "loss": 0.0026, + "step": 1124 + }, + { + "epoch": 0.2252, + "learning_rate": 1.4219854366057828e-05, + "loss": 1.0424, + "step": 1126 + }, + { + "epoch": 0.2256, + "learning_rate": 1.4207191696327546e-05, + "loss": 0.0145, + "step": 1128 + }, + { + "epoch": 0.226, + "learning_rate": 1.4194520824461782e-05, + "loss": 0.1324, + "step": 1130 + }, + { + "epoch": 0.2264, + "learning_rate": 1.4181841775163017e-05, + "loss": 0.2862, + "step": 1132 + }, + { + "epoch": 0.2268, + "learning_rate": 1.4169154573149745e-05, + "loss": 0.0663, + "step": 1134 + }, + { + "epoch": 0.2272, + "learning_rate": 1.4156459243156281e-05, + "loss": 0.5969, + "step": 1136 + }, + { + "epoch": 0.2276, + "learning_rate": 1.4143755809932847e-05, + "loss": 0.0771, + "step": 1138 + }, + { + "epoch": 0.228, + "learning_rate": 1.4131044298245416e-05, + "loss": 0.0266, + "step": 1140 + }, + { + "epoch": 0.2284, + "learning_rate": 1.4118324732875754e-05, + "loss": 0.2363, + "step": 1142 + }, + { + "epoch": 0.2288, + "learning_rate": 1.4105597138621278e-05, + "loss": 0.2288, + "step": 1144 + }, + { + "epoch": 0.2292, + "learning_rate": 1.4092861540295112e-05, + "loss": 0.0811, + "step": 1146 + }, + { + "epoch": 0.2296, + "learning_rate": 1.4080117962725936e-05, + "loss": 0.1722, + "step": 1148 + }, + { + "epoch": 0.23, + "learning_rate": 1.4067366430758003e-05, + "loss": 0.3868, + "step": 1150 + }, + { + "epoch": 0.2304, + "learning_rate": 1.4054606969251103e-05, + "loss": 0.0094, + "step": 1152 + }, + { + "epoch": 0.2308, + "learning_rate": 1.4041839603080425e-05, + "loss": 0.0652, + "step": 1154 + }, + { + "epoch": 0.2312, + "learning_rate": 1.402906435713663e-05, + "loss": 0.0154, + "step": 1156 + }, + { + "epoch": 0.2316, + "learning_rate": 1.4016281256325695e-05, + "loss": 0.0087, + "step": 1158 + }, + { + "epoch": 0.232, + "learning_rate": 1.4003490325568953e-05, + "loss": 0.0016, + "step": 1160 + }, + { + "epoch": 0.2324, + "learning_rate": 1.399069158980295e-05, + "loss": 0.0224, + "step": 1162 + }, + { + "epoch": 0.2328, + "learning_rate": 1.3977885073979503e-05, + "loss": 0.0404, + "step": 1164 + }, + { + "epoch": 0.2332, + "learning_rate": 1.396507080306555e-05, + "loss": 0.0111, + "step": 1166 + }, + { + "epoch": 0.2336, + "learning_rate": 1.3952248802043165e-05, + "loss": 0.0377, + "step": 1168 + }, + { + "epoch": 0.234, + "learning_rate": 1.3939419095909521e-05, + "loss": 0.0198, + "step": 1170 + }, + { + "epoch": 0.2344, + "learning_rate": 1.3926581709676752e-05, + "loss": 0.0585, + "step": 1172 + }, + { + "epoch": 0.2348, + "learning_rate": 1.3913736668372035e-05, + "loss": 0.0138, + "step": 1174 + }, + { + "epoch": 0.2352, + "learning_rate": 1.39008839970374e-05, + "loss": 0.036, + "step": 1176 + }, + { + "epoch": 0.2356, + "learning_rate": 1.3888023720729816e-05, + "loss": 0.008, + "step": 1178 + }, + { + "epoch": 0.236, + "learning_rate": 1.3875155864521026e-05, + "loss": 0.8989, + "step": 1180 + }, + { + "epoch": 0.2364, + "learning_rate": 1.3862280453497606e-05, + "loss": 0.7727, + "step": 1182 + }, + { + "epoch": 0.2368, + "learning_rate": 1.38493975127608e-05, + "loss": 0.118, + "step": 1184 + }, + { + "epoch": 0.2372, + "learning_rate": 1.3836507067426563e-05, + "loss": 0.1138, + "step": 1186 + }, + { + "epoch": 0.2376, + "learning_rate": 1.3823609142625499e-05, + "loss": 0.0669, + "step": 1188 + }, + { + "epoch": 0.238, + "learning_rate": 1.3810703763502742e-05, + "loss": 0.1956, + "step": 1190 + }, + { + "epoch": 0.2384, + "learning_rate": 1.379779095521802e-05, + "loss": 0.0357, + "step": 1192 + }, + { + "epoch": 0.2388, + "learning_rate": 1.3784870742945483e-05, + "loss": 0.0177, + "step": 1194 + }, + { + "epoch": 0.2392, + "learning_rate": 1.377194315187377e-05, + "loss": 0.0213, + "step": 1196 + }, + { + "epoch": 0.2396, + "learning_rate": 1.3759008207205863e-05, + "loss": 0.2876, + "step": 1198 + }, + { + "epoch": 0.24, + "learning_rate": 1.3746065934159125e-05, + "loss": 0.506, + "step": 1200 + }, + { + "epoch": 0.2404, + "learning_rate": 1.373311635796516e-05, + "loss": 0.3393, + "step": 1202 + }, + { + "epoch": 0.2408, + "learning_rate": 1.3720159503869821e-05, + "loss": 0.024, + "step": 1204 + }, + { + "epoch": 0.2412, + "learning_rate": 1.3707195397133172e-05, + "loss": 0.0056, + "step": 1206 + }, + { + "epoch": 0.2416, + "learning_rate": 1.3694224063029396e-05, + "loss": 0.0655, + "step": 1208 + }, + { + "epoch": 0.242, + "learning_rate": 1.3681245526846789e-05, + "loss": 0.0533, + "step": 1210 + }, + { + "epoch": 0.2424, + "learning_rate": 1.366825981388764e-05, + "loss": 0.075, + "step": 1212 + }, + { + "epoch": 0.2428, + "learning_rate": 1.365526694946829e-05, + "loss": 0.0429, + "step": 1214 + }, + { + "epoch": 0.2432, + "learning_rate": 1.3642266958918978e-05, + "loss": 0.0057, + "step": 1216 + }, + { + "epoch": 0.2436, + "learning_rate": 1.3629259867583867e-05, + "loss": 1.1752, + "step": 1218 + }, + { + "epoch": 0.244, + "learning_rate": 1.3616245700820935e-05, + "loss": 0.8958, + "step": 1220 + }, + { + "epoch": 0.2444, + "learning_rate": 1.3603224484001954e-05, + "loss": 0.0067, + "step": 1222 + }, + { + "epoch": 0.2448, + "learning_rate": 1.3590196242512468e-05, + "loss": 0.052, + "step": 1224 + }, + { + "epoch": 0.2452, + "learning_rate": 1.357716100175169e-05, + "loss": 0.01, + "step": 1226 + }, + { + "epoch": 0.2456, + "learning_rate": 1.3564118787132514e-05, + "loss": 0.0213, + "step": 1228 + }, + { + "epoch": 0.246, + "learning_rate": 1.355106962408137e-05, + "loss": 0.1736, + "step": 1230 + }, + { + "epoch": 0.2464, + "learning_rate": 1.3538013538038303e-05, + "loss": 0.0756, + "step": 1232 + }, + { + "epoch": 0.2468, + "learning_rate": 1.3524950554456786e-05, + "loss": 0.5746, + "step": 1234 + }, + { + "epoch": 0.2472, + "learning_rate": 1.3511880698803803e-05, + "loss": 1.4771, + "step": 1236 + }, + { + "epoch": 0.2476, + "learning_rate": 1.3498803996559699e-05, + "loss": 0.0712, + "step": 1238 + }, + { + "epoch": 0.248, + "learning_rate": 1.3485720473218158e-05, + "loss": 0.1826, + "step": 1240 + }, + { + "epoch": 0.2484, + "learning_rate": 1.3472630154286195e-05, + "loss": 0.0044, + "step": 1242 + }, + { + "epoch": 0.2488, + "learning_rate": 1.3459533065284045e-05, + "loss": 0.0407, + "step": 1244 + }, + { + "epoch": 0.2492, + "learning_rate": 1.3446429231745177e-05, + "loss": 0.0238, + "step": 1246 + }, + { + "epoch": 0.2496, + "learning_rate": 1.3433318679216152e-05, + "loss": 0.0624, + "step": 1248 + }, + { + "epoch": 0.25, + "learning_rate": 1.3420201433256697e-05, + "loss": 0.6614, + "step": 1250 + }, + { + "epoch": 0.2504, + "learning_rate": 1.3407077519439517e-05, + "loss": 0.159, + "step": 1252 + }, + { + "epoch": 0.2508, + "learning_rate": 1.3393946963350384e-05, + "loss": 0.036, + "step": 1254 + }, + { + "epoch": 0.2512, + "learning_rate": 1.3380809790587983e-05, + "loss": 0.0189, + "step": 1256 + }, + { + "epoch": 0.2516, + "learning_rate": 1.3367666026763886e-05, + "loss": 0.0222, + "step": 1258 + }, + { + "epoch": 0.252, + "learning_rate": 1.3354515697502555e-05, + "loss": 0.0585, + "step": 1260 + }, + { + "epoch": 0.2524, + "learning_rate": 1.334135882844122e-05, + "loss": 0.0078, + "step": 1262 + }, + { + "epoch": 0.2528, + "learning_rate": 1.3328195445229872e-05, + "loss": 0.3968, + "step": 1264 + }, + { + "epoch": 0.2532, + "learning_rate": 1.3315025573531193e-05, + "loss": 0.0053, + "step": 1266 + }, + { + "epoch": 0.2536, + "learning_rate": 1.3301849239020544e-05, + "loss": 0.0176, + "step": 1268 + }, + { + "epoch": 0.254, + "learning_rate": 1.3288666467385834e-05, + "loss": 0.0362, + "step": 1270 + }, + { + "epoch": 0.2544, + "learning_rate": 1.327547728432757e-05, + "loss": 0.1498, + "step": 1272 + }, + { + "epoch": 0.2548, + "learning_rate": 1.3262281715558744e-05, + "loss": 0.5832, + "step": 1274 + }, + { + "epoch": 0.2552, + "learning_rate": 1.3249079786804769e-05, + "loss": 0.4015, + "step": 1276 + }, + { + "epoch": 0.2556, + "learning_rate": 1.3235871523803508e-05, + "loss": 0.0566, + "step": 1278 + }, + { + "epoch": 0.256, + "learning_rate": 1.3222656952305115e-05, + "loss": 0.0116, + "step": 1280 + }, + { + "epoch": 0.2564, + "learning_rate": 1.32094360980721e-05, + "loss": 0.0038, + "step": 1282 + }, + { + "epoch": 0.2568, + "learning_rate": 1.3196208986879175e-05, + "loss": 0.4463, + "step": 1284 + }, + { + "epoch": 0.2572, + "learning_rate": 1.3182975644513301e-05, + "loss": 0.2508, + "step": 1286 + }, + { + "epoch": 0.2576, + "learning_rate": 1.3169736096773515e-05, + "loss": 0.0115, + "step": 1288 + }, + { + "epoch": 0.258, + "learning_rate": 1.3156490369471024e-05, + "loss": 0.3043, + "step": 1290 + }, + { + "epoch": 0.2584, + "learning_rate": 1.3143238488429049e-05, + "loss": 0.1336, + "step": 1292 + }, + { + "epoch": 0.2588, + "learning_rate": 1.3129980479482783e-05, + "loss": 0.1383, + "step": 1294 + }, + { + "epoch": 0.2592, + "learning_rate": 1.311671636847943e-05, + "loss": 0.0084, + "step": 1296 + }, + { + "epoch": 0.2596, + "learning_rate": 1.3103446181278017e-05, + "loss": 0.1379, + "step": 1298 + }, + { + "epoch": 0.26, + "learning_rate": 1.3090169943749477e-05, + "loss": 0.591, + "step": 1300 + }, + { + "epoch": 0.2604, + "learning_rate": 1.3076887681776502e-05, + "loss": 0.0274, + "step": 1302 + }, + { + "epoch": 0.2608, + "learning_rate": 1.3063599421253563e-05, + "loss": 0.0103, + "step": 1304 + }, + { + "epoch": 0.2612, + "learning_rate": 1.3050305188086774e-05, + "loss": 0.9319, + "step": 1306 + }, + { + "epoch": 0.2616, + "learning_rate": 1.303700500819395e-05, + "loss": 0.0498, + "step": 1308 + }, + { + "epoch": 0.262, + "learning_rate": 1.3023698907504452e-05, + "loss": 0.003, + "step": 1310 + }, + { + "epoch": 0.2624, + "learning_rate": 1.3010386911959205e-05, + "loss": 0.7415, + "step": 1312 + }, + { + "epoch": 0.2628, + "learning_rate": 1.2997069047510647e-05, + "loss": 0.0109, + "step": 1314 + }, + { + "epoch": 0.2632, + "learning_rate": 1.2983745340122604e-05, + "loss": 0.0063, + "step": 1316 + }, + { + "epoch": 0.2636, + "learning_rate": 1.2970415815770352e-05, + "loss": 0.1319, + "step": 1318 + }, + { + "epoch": 0.264, + "learning_rate": 1.2957080500440462e-05, + "loss": 0.0077, + "step": 1320 + }, + { + "epoch": 0.2644, + "learning_rate": 1.294373942013084e-05, + "loss": 0.3287, + "step": 1322 + }, + { + "epoch": 0.2648, + "learning_rate": 1.293039260085057e-05, + "loss": 0.0164, + "step": 1324 + }, + { + "epoch": 0.2652, + "learning_rate": 1.2917040068619995e-05, + "loss": 0.0111, + "step": 1326 + }, + { + "epoch": 0.2656, + "learning_rate": 1.2903681849470533e-05, + "loss": 0.0752, + "step": 1328 + }, + { + "epoch": 0.266, + "learning_rate": 1.2890317969444715e-05, + "loss": 0.0147, + "step": 1330 + }, + { + "epoch": 0.2664, + "learning_rate": 1.2876948454596137e-05, + "loss": 0.3875, + "step": 1332 + }, + { + "epoch": 0.2668, + "learning_rate": 1.2863573330989315e-05, + "loss": 0.3236, + "step": 1334 + }, + { + "epoch": 0.2672, + "learning_rate": 1.2850192624699771e-05, + "loss": 0.4717, + "step": 1336 + }, + { + "epoch": 0.2676, + "learning_rate": 1.2836806361813846e-05, + "loss": 0.0307, + "step": 1338 + }, + { + "epoch": 0.268, + "learning_rate": 1.2823414568428767e-05, + "loss": 0.0162, + "step": 1340 + }, + { + "epoch": 0.2684, + "learning_rate": 1.2810017270652506e-05, + "loss": 0.2951, + "step": 1342 + }, + { + "epoch": 0.2688, + "learning_rate": 1.2796614494603806e-05, + "loss": 0.0029, + "step": 1344 + }, + { + "epoch": 0.2692, + "learning_rate": 1.2783206266412034e-05, + "loss": 0.1598, + "step": 1346 + }, + { + "epoch": 0.2696, + "learning_rate": 1.2769792612217222e-05, + "loss": 0.681, + "step": 1348 + }, + { + "epoch": 0.27, + "learning_rate": 1.2756373558169999e-05, + "loss": 0.0096, + "step": 1350 + }, + { + "epoch": 0.2704, + "learning_rate": 1.2742949130431466e-05, + "loss": 0.0369, + "step": 1352 + }, + { + "epoch": 0.2708, + "learning_rate": 1.272951935517326e-05, + "loss": 0.0063, + "step": 1354 + }, + { + "epoch": 0.2712, + "learning_rate": 1.271608425857739e-05, + "loss": 0.0049, + "step": 1356 + }, + { + "epoch": 0.2716, + "learning_rate": 1.2702643866836281e-05, + "loss": 0.0025, + "step": 1358 + }, + { + "epoch": 0.272, + "learning_rate": 1.268919820615265e-05, + "loss": 0.1271, + "step": 1360 + }, + { + "epoch": 0.2724, + "learning_rate": 1.2675747302739531e-05, + "loss": 0.0918, + "step": 1362 + }, + { + "epoch": 0.2728, + "learning_rate": 1.2662291182820116e-05, + "loss": 0.091, + "step": 1364 + }, + { + "epoch": 0.2732, + "learning_rate": 1.2648829872627812e-05, + "loss": 0.0084, + "step": 1366 + }, + { + "epoch": 0.2736, + "learning_rate": 1.2635363398406133e-05, + "loss": 0.1524, + "step": 1368 + }, + { + "epoch": 0.274, + "learning_rate": 1.2621891786408647e-05, + "loss": 0.0683, + "step": 1370 + }, + { + "epoch": 0.2744, + "learning_rate": 1.2608415062898978e-05, + "loss": 0.0087, + "step": 1372 + }, + { + "epoch": 0.2748, + "learning_rate": 1.2594933254150654e-05, + "loss": 0.6757, + "step": 1374 + }, + { + "epoch": 0.2752, + "learning_rate": 1.2581446386447178e-05, + "loss": 0.0493, + "step": 1376 + }, + { + "epoch": 0.2756, + "learning_rate": 1.2567954486081888e-05, + "loss": 0.1016, + "step": 1378 + }, + { + "epoch": 0.276, + "learning_rate": 1.2554457579357909e-05, + "loss": 0.0031, + "step": 1380 + }, + { + "epoch": 0.2764, + "learning_rate": 1.2540955692588184e-05, + "loss": 0.092, + "step": 1382 + }, + { + "epoch": 0.2768, + "learning_rate": 1.2527448852095298e-05, + "loss": 0.0023, + "step": 1384 + }, + { + "epoch": 0.2772, + "learning_rate": 1.2513937084211553e-05, + "loss": 0.0026, + "step": 1386 + }, + { + "epoch": 0.2776, + "learning_rate": 1.250042041527882e-05, + "loss": 0.0048, + "step": 1388 + }, + { + "epoch": 0.278, + "learning_rate": 1.2486898871648557e-05, + "loss": 0.0206, + "step": 1390 + }, + { + "epoch": 0.2784, + "learning_rate": 1.247337247968167e-05, + "loss": 0.0426, + "step": 1392 + }, + { + "epoch": 0.2788, + "learning_rate": 1.245984126574858e-05, + "loss": 0.0305, + "step": 1394 + }, + { + "epoch": 0.2792, + "learning_rate": 1.2446305256229083e-05, + "loss": 0.2939, + "step": 1396 + }, + { + "epoch": 0.2796, + "learning_rate": 1.2432764477512294e-05, + "loss": 0.5525, + "step": 1398 + }, + { + "epoch": 0.28, + "learning_rate": 1.2419218955996687e-05, + "loss": 0.011, + "step": 1400 + }, + { + "epoch": 0.2804, + "learning_rate": 1.2405668718089921e-05, + "loss": 0.0922, + "step": 1402 + }, + { + "epoch": 0.2808, + "learning_rate": 1.2392113790208897e-05, + "loss": 0.1708, + "step": 1404 + }, + { + "epoch": 0.2812, + "learning_rate": 1.2378554198779627e-05, + "loss": 0.5684, + "step": 1406 + }, + { + "epoch": 0.2816, + "learning_rate": 1.2364989970237253e-05, + "loss": 0.012, + "step": 1408 + }, + { + "epoch": 0.282, + "learning_rate": 1.2351421131025898e-05, + "loss": 0.0182, + "step": 1410 + }, + { + "epoch": 0.2824, + "learning_rate": 1.2337847707598745e-05, + "loss": 0.0524, + "step": 1412 + }, + { + "epoch": 0.2828, + "learning_rate": 1.2324269726417848e-05, + "loss": 0.0014, + "step": 1414 + }, + { + "epoch": 0.2832, + "learning_rate": 1.231068721395418e-05, + "loss": 0.2019, + "step": 1416 + }, + { + "epoch": 0.2836, + "learning_rate": 1.2297100196687565e-05, + "loss": 0.0193, + "step": 1418 + }, + { + "epoch": 0.284, + "learning_rate": 1.2283508701106559e-05, + "loss": 0.0355, + "step": 1420 + }, + { + "epoch": 0.2844, + "learning_rate": 1.2269912753708502e-05, + "loss": 0.1228, + "step": 1422 + }, + { + "epoch": 0.2848, + "learning_rate": 1.2256312380999371e-05, + "loss": 0.0907, + "step": 1424 + }, + { + "epoch": 0.2852, + "learning_rate": 1.2242707609493817e-05, + "loss": 0.0012, + "step": 1426 + }, + { + "epoch": 0.2856, + "learning_rate": 1.2229098465715002e-05, + "loss": 0.0089, + "step": 1428 + }, + { + "epoch": 0.286, + "learning_rate": 1.221548497619468e-05, + "loss": 0.3213, + "step": 1430 + }, + { + "epoch": 0.2864, + "learning_rate": 1.2201867167473022e-05, + "loss": 0.1484, + "step": 1432 + }, + { + "epoch": 0.2868, + "learning_rate": 1.2188245066098646e-05, + "loss": 0.0592, + "step": 1434 + }, + { + "epoch": 0.2872, + "learning_rate": 1.2174618698628558e-05, + "loss": 0.0336, + "step": 1436 + }, + { + "epoch": 0.2876, + "learning_rate": 1.2160988091628023e-05, + "loss": 0.0127, + "step": 1438 + }, + { + "epoch": 0.288, + "learning_rate": 1.2147353271670644e-05, + "loss": 0.0035, + "step": 1440 + }, + { + "epoch": 0.2884, + "learning_rate": 1.2133714265338163e-05, + "loss": 0.0012, + "step": 1442 + }, + { + "epoch": 0.2888, + "learning_rate": 1.212007109922055e-05, + "loss": 0.0045, + "step": 1444 + }, + { + "epoch": 0.2892, + "learning_rate": 1.2106423799915836e-05, + "loss": 0.0462, + "step": 1446 + }, + { + "epoch": 0.2896, + "learning_rate": 1.2092772394030156e-05, + "loss": 0.0016, + "step": 1448 + }, + { + "epoch": 0.29, + "learning_rate": 1.2079116908177599e-05, + "loss": 0.0136, + "step": 1450 + }, + { + "epoch": 0.2904, + "learning_rate": 1.2065457368980234e-05, + "loss": 0.0161, + "step": 1452 + }, + { + "epoch": 0.2908, + "learning_rate": 1.2051793803068054e-05, + "loss": 0.1453, + "step": 1454 + }, + { + "epoch": 0.2912, + "learning_rate": 1.2038126237078849e-05, + "loss": 0.343, + "step": 1456 + }, + { + "epoch": 0.2916, + "learning_rate": 1.202445469765827e-05, + "loss": 0.0215, + "step": 1458 + }, + { + "epoch": 0.292, + "learning_rate": 1.2010779211459649e-05, + "loss": 0.0019, + "step": 1460 + }, + { + "epoch": 0.2924, + "learning_rate": 1.1997099805144074e-05, + "loss": 0.2219, + "step": 1462 + }, + { + "epoch": 0.2928, + "learning_rate": 1.1983416505380228e-05, + "loss": 0.0151, + "step": 1464 + }, + { + "epoch": 0.2932, + "learning_rate": 1.1969729338844432e-05, + "loss": 0.0007, + "step": 1466 + }, + { + "epoch": 0.2936, + "learning_rate": 1.1956038332220497e-05, + "loss": 0.004, + "step": 1468 + }, + { + "epoch": 0.294, + "learning_rate": 1.1942343512199726e-05, + "loss": 0.1382, + "step": 1470 + }, + { + "epoch": 0.2944, + "learning_rate": 1.1928644905480899e-05, + "loss": 0.0002, + "step": 1472 + }, + { + "epoch": 0.2948, + "learning_rate": 1.1914942538770129e-05, + "loss": 0.0011, + "step": 1474 + }, + { + "epoch": 0.2952, + "learning_rate": 1.1901236438780913e-05, + "loss": 0.0017, + "step": 1476 + }, + { + "epoch": 0.2956, + "learning_rate": 1.1887526632233952e-05, + "loss": 0.0011, + "step": 1478 + }, + { + "epoch": 0.296, + "learning_rate": 1.187381314585725e-05, + "loss": 0.0007, + "step": 1480 + }, + { + "epoch": 0.2964, + "learning_rate": 1.1860096006385925e-05, + "loss": 0.0253, + "step": 1482 + }, + { + "epoch": 0.2968, + "learning_rate": 1.1846375240562274e-05, + "loss": 0.0019, + "step": 1484 + }, + { + "epoch": 0.2972, + "learning_rate": 1.1832650875135613e-05, + "loss": 0.0158, + "step": 1486 + }, + { + "epoch": 0.2976, + "learning_rate": 1.1818922936862273e-05, + "loss": 0.0162, + "step": 1488 + }, + { + "epoch": 0.298, + "learning_rate": 1.1805191452505606e-05, + "loss": 0.0017, + "step": 1490 + }, + { + "epoch": 0.2984, + "learning_rate": 1.179145644883582e-05, + "loss": 0.0047, + "step": 1492 + }, + { + "epoch": 0.2988, + "learning_rate": 1.1777717952630038e-05, + "loss": 0.0014, + "step": 1494 + }, + { + "epoch": 0.2992, + "learning_rate": 1.1763975990672123e-05, + "loss": 0.0012, + "step": 1496 + }, + { + "epoch": 0.2996, + "learning_rate": 1.1750230589752763e-05, + "loss": 0.001, + "step": 1498 + }, + { + "epoch": 0.3, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.0003, + "step": 1500 + }, + { + "epoch": 0.3004, + "learning_rate": 1.1722729578225769e-05, + "loss": 0.0006, + "step": 1502 + }, + { + "epoch": 0.3008, + "learning_rate": 1.170897402123278e-05, + "loss": 0.001, + "step": 1504 + }, + { + "epoch": 0.3012, + "learning_rate": 1.1695215132507466e-05, + "loss": 1.751, + "step": 1506 + }, + { + "epoch": 0.3016, + "learning_rate": 1.1681452938873521e-05, + "loss": 0.0102, + "step": 1508 + }, + { + "epoch": 0.302, + "learning_rate": 1.166768746716102e-05, + "loss": 0.1221, + "step": 1510 + }, + { + "epoch": 0.3024, + "learning_rate": 1.1653918744206484e-05, + "loss": 0.0009, + "step": 1512 + }, + { + "epoch": 0.3028, + "learning_rate": 1.164014679685271e-05, + "loss": 0.2667, + "step": 1514 + }, + { + "epoch": 0.3032, + "learning_rate": 1.1626371651948844e-05, + "loss": 0.0016, + "step": 1516 + }, + { + "epoch": 0.3036, + "learning_rate": 1.1612593336350206e-05, + "loss": 0.0897, + "step": 1518 + }, + { + "epoch": 0.304, + "learning_rate": 1.159881187691835e-05, + "loss": 0.0659, + "step": 1520 + }, + { + "epoch": 0.3044, + "learning_rate": 1.158502730052094e-05, + "loss": 0.007, + "step": 1522 + }, + { + "epoch": 0.3048, + "learning_rate": 1.1571239634031681e-05, + "loss": 0.0044, + "step": 1524 + }, + { + "epoch": 0.3052, + "learning_rate": 1.1557448904330366e-05, + "loss": 0.0159, + "step": 1526 + }, + { + "epoch": 0.3056, + "learning_rate": 1.1543655138302707e-05, + "loss": 0.0067, + "step": 1528 + }, + { + "epoch": 0.306, + "learning_rate": 1.1529858362840388e-05, + "loss": 0.0036, + "step": 1530 + }, + { + "epoch": 0.3064, + "learning_rate": 1.1516058604840888e-05, + "loss": 0.0055, + "step": 1532 + }, + { + "epoch": 0.3068, + "learning_rate": 1.1502255891207579e-05, + "loss": 0.0389, + "step": 1534 + }, + { + "epoch": 0.3072, + "learning_rate": 1.1488450248849522e-05, + "loss": 0.0589, + "step": 1536 + }, + { + "epoch": 0.3076, + "learning_rate": 1.1474641704681551e-05, + "loss": 0.0161, + "step": 1538 + }, + { + "epoch": 0.308, + "learning_rate": 1.1460830285624127e-05, + "loss": 1.0377, + "step": 1540 + }, + { + "epoch": 0.3084, + "learning_rate": 1.1447016018603293e-05, + "loss": 0.0039, + "step": 1542 + }, + { + "epoch": 0.3088, + "learning_rate": 1.1433198930550705e-05, + "loss": 0.026, + "step": 1544 + }, + { + "epoch": 0.3092, + "learning_rate": 1.1419379048403447e-05, + "loss": 0.4884, + "step": 1546 + }, + { + "epoch": 0.3096, + "learning_rate": 1.1405556399104114e-05, + "loss": 0.2229, + "step": 1548 + }, + { + "epoch": 0.31, + "learning_rate": 1.139173100960065e-05, + "loss": 0.0013, + "step": 1550 + }, + { + "epoch": 0.3104, + "learning_rate": 1.1377902906846388e-05, + "loss": 0.0014, + "step": 1552 + }, + { + "epoch": 0.3108, + "learning_rate": 1.1364072117799883e-05, + "loss": 0.0792, + "step": 1554 + }, + { + "epoch": 0.3112, + "learning_rate": 1.1350238669424991e-05, + "loss": 1.1305, + "step": 1556 + }, + { + "epoch": 0.3116, + "learning_rate": 1.1336402588690732e-05, + "loss": 0.022, + "step": 1558 + }, + { + "epoch": 0.312, + "learning_rate": 1.1322563902571227e-05, + "loss": 0.3897, + "step": 1560 + }, + { + "epoch": 0.3124, + "learning_rate": 1.1308722638045732e-05, + "loss": 0.1331, + "step": 1562 + }, + { + "epoch": 0.3128, + "learning_rate": 1.1294878822098471e-05, + "loss": 0.0025, + "step": 1564 + }, + { + "epoch": 0.3132, + "learning_rate": 1.12810324817187e-05, + "loss": 0.0279, + "step": 1566 + }, + { + "epoch": 0.3136, + "learning_rate": 1.1267183643900541e-05, + "loss": 0.323, + "step": 1568 + }, + { + "epoch": 0.314, + "learning_rate": 1.1253332335643048e-05, + "loss": 0.0173, + "step": 1570 + }, + { + "epoch": 0.3144, + "learning_rate": 1.1239478583950014e-05, + "loss": 0.0108, + "step": 1572 + }, + { + "epoch": 0.3148, + "learning_rate": 1.1225622415830075e-05, + "loss": 0.0321, + "step": 1574 + }, + { + "epoch": 0.3152, + "learning_rate": 1.1211763858296514e-05, + "loss": 0.1335, + "step": 1576 + }, + { + "epoch": 0.3156, + "learning_rate": 1.1197902938367296e-05, + "loss": 0.0616, + "step": 1578 + }, + { + "epoch": 0.316, + "learning_rate": 1.1184039683065019e-05, + "loss": 0.0207, + "step": 1580 + }, + { + "epoch": 0.3164, + "learning_rate": 1.1170174119416778e-05, + "loss": 0.0119, + "step": 1582 + }, + { + "epoch": 0.3168, + "learning_rate": 1.1156306274454218e-05, + "loss": 0.0554, + "step": 1584 + }, + { + "epoch": 0.3172, + "learning_rate": 1.1142436175213402e-05, + "loss": 0.0224, + "step": 1586 + }, + { + "epoch": 0.3176, + "learning_rate": 1.1128563848734822e-05, + "loss": 0.0285, + "step": 1588 + }, + { + "epoch": 0.318, + "learning_rate": 1.1114689322063252e-05, + "loss": 0.443, + "step": 1590 + }, + { + "epoch": 0.3184, + "learning_rate": 1.1100812622247828e-05, + "loss": 0.0695, + "step": 1592 + }, + { + "epoch": 0.3188, + "learning_rate": 1.108693377634186e-05, + "loss": 0.1371, + "step": 1594 + }, + { + "epoch": 0.3192, + "learning_rate": 1.1073052811402865e-05, + "loss": 0.0203, + "step": 1596 + }, + { + "epoch": 0.3196, + "learning_rate": 1.1059169754492527e-05, + "loss": 0.1742, + "step": 1598 + }, + { + "epoch": 0.32, + "learning_rate": 1.1045284632676535e-05, + "loss": 0.0016, + "step": 1600 + }, + { + "epoch": 0.3204, + "learning_rate": 1.1031397473024683e-05, + "loss": 0.3776, + "step": 1602 + }, + { + "epoch": 0.3208, + "learning_rate": 1.1017508302610665e-05, + "loss": 0.0878, + "step": 1604 + }, + { + "epoch": 0.3212, + "learning_rate": 1.1003617148512154e-05, + "loss": 0.0084, + "step": 1606 + }, + { + "epoch": 0.3216, + "learning_rate": 1.0989724037810646e-05, + "loss": 0.2983, + "step": 1608 + }, + { + "epoch": 0.322, + "learning_rate": 1.09758289975915e-05, + "loss": 0.0118, + "step": 1610 + }, + { + "epoch": 0.3224, + "learning_rate": 1.0961932054943783e-05, + "loss": 0.0278, + "step": 1612 + }, + { + "epoch": 0.3228, + "learning_rate": 1.0948033236960292e-05, + "loss": 0.0214, + "step": 1614 + }, + { + "epoch": 0.3232, + "learning_rate": 1.0934132570737515e-05, + "loss": 0.0144, + "step": 1616 + }, + { + "epoch": 0.3236, + "learning_rate": 1.0920230083375472e-05, + "loss": 0.0017, + "step": 1618 + }, + { + "epoch": 0.324, + "learning_rate": 1.090632580197781e-05, + "loss": 0.0129, + "step": 1620 + }, + { + "epoch": 0.3244, + "learning_rate": 1.0892419753651606e-05, + "loss": 0.001, + "step": 1622 + }, + { + "epoch": 0.3248, + "learning_rate": 1.0878511965507435e-05, + "loss": 0.0108, + "step": 1624 + }, + { + "epoch": 0.3252, + "learning_rate": 1.0864602464659242e-05, + "loss": 0.021, + "step": 1626 + }, + { + "epoch": 0.3256, + "learning_rate": 1.0850691278224285e-05, + "loss": 0.0164, + "step": 1628 + }, + { + "epoch": 0.326, + "learning_rate": 1.0836778433323161e-05, + "loss": 0.0028, + "step": 1630 + }, + { + "epoch": 0.3264, + "learning_rate": 1.082286395707966e-05, + "loss": 0.026, + "step": 1632 + }, + { + "epoch": 0.3268, + "learning_rate": 1.0808947876620773e-05, + "loss": 0.0125, + "step": 1634 + }, + { + "epoch": 0.3272, + "learning_rate": 1.0795030219076596e-05, + "loss": 0.0452, + "step": 1636 + }, + { + "epoch": 0.3276, + "learning_rate": 1.0781111011580342e-05, + "loss": 0.2657, + "step": 1638 + }, + { + "epoch": 0.328, + "learning_rate": 1.0767190281268187e-05, + "loss": 0.0028, + "step": 1640 + }, + { + "epoch": 0.3284, + "learning_rate": 1.0753268055279328e-05, + "loss": 0.0006, + "step": 1642 + }, + { + "epoch": 0.3288, + "learning_rate": 1.0739344360755862e-05, + "loss": 0.0006, + "step": 1644 + }, + { + "epoch": 0.3292, + "learning_rate": 1.0725419224842713e-05, + "loss": 0.014, + "step": 1646 + }, + { + "epoch": 0.3296, + "learning_rate": 1.0711492674687683e-05, + "loss": 0.0132, + "step": 1648 + }, + { + "epoch": 0.33, + "learning_rate": 1.0697564737441257e-05, + "loss": 0.0267, + "step": 1650 + }, + { + "epoch": 0.3304, + "learning_rate": 1.0683635440256692e-05, + "loss": 0.0079, + "step": 1652 + }, + { + "epoch": 0.3308, + "learning_rate": 1.0669704810289847e-05, + "loss": 0.0005, + "step": 1654 + }, + { + "epoch": 0.3312, + "learning_rate": 1.0655772874699222e-05, + "loss": 0.182, + "step": 1656 + }, + { + "epoch": 0.3316, + "learning_rate": 1.0641839660645806e-05, + "loss": 0.0142, + "step": 1658 + }, + { + "epoch": 0.332, + "learning_rate": 1.0627905195293133e-05, + "loss": 0.0925, + "step": 1660 + }, + { + "epoch": 0.3324, + "learning_rate": 1.0613969505807165e-05, + "loss": 0.0015, + "step": 1662 + }, + { + "epoch": 0.3328, + "learning_rate": 1.0600032619356208e-05, + "loss": 0.004, + "step": 1664 + }, + { + "epoch": 0.3332, + "learning_rate": 1.0586094563110975e-05, + "loss": 0.0015, + "step": 1666 + }, + { + "epoch": 0.3336, + "learning_rate": 1.0572155364244385e-05, + "loss": 0.0073, + "step": 1668 + }, + { + "epoch": 0.334, + "learning_rate": 1.0558215049931641e-05, + "loss": 0.0062, + "step": 1670 + }, + { + "epoch": 0.3344, + "learning_rate": 1.0544273647350088e-05, + "loss": 0.0035, + "step": 1672 + }, + { + "epoch": 0.3348, + "learning_rate": 1.0530331183679225e-05, + "loss": 0.0028, + "step": 1674 + }, + { + "epoch": 0.3352, + "learning_rate": 1.0516387686100564e-05, + "loss": 0.9553, + "step": 1676 + }, + { + "epoch": 0.3356, + "learning_rate": 1.0502443181797703e-05, + "loss": 0.0052, + "step": 1678 + }, + { + "epoch": 0.336, + "learning_rate": 1.0488497697956141e-05, + "loss": 0.0463, + "step": 1680 + }, + { + "epoch": 0.3364, + "learning_rate": 1.0474551261763312e-05, + "loss": 0.0009, + "step": 1682 + }, + { + "epoch": 0.3368, + "learning_rate": 1.0460603900408533e-05, + "loss": 0.0115, + "step": 1684 + }, + { + "epoch": 0.3372, + "learning_rate": 1.0446655641082864e-05, + "loss": 0.2466, + "step": 1686 + }, + { + "epoch": 0.3376, + "learning_rate": 1.0432706510979174e-05, + "loss": 0.0008, + "step": 1688 + }, + { + "epoch": 0.338, + "learning_rate": 1.0418756537291991e-05, + "loss": 0.0032, + "step": 1690 + }, + { + "epoch": 0.3384, + "learning_rate": 1.040480574721753e-05, + "loss": 0.0028, + "step": 1692 + }, + { + "epoch": 0.3388, + "learning_rate": 1.0390854167953533e-05, + "loss": 0.003, + "step": 1694 + }, + { + "epoch": 0.3392, + "learning_rate": 1.0376901826699352e-05, + "loss": 0.0013, + "step": 1696 + }, + { + "epoch": 0.3396, + "learning_rate": 1.0362948750655766e-05, + "loss": 0.0401, + "step": 1698 + }, + { + "epoch": 0.34, + "learning_rate": 1.034899496702501e-05, + "loss": 0.0009, + "step": 1700 + }, + { + "epoch": 0.3404, + "learning_rate": 1.0335040503010725e-05, + "loss": 0.2153, + "step": 1702 + }, + { + "epoch": 0.3408, + "learning_rate": 1.0321085385817818e-05, + "loss": 0.0017, + "step": 1704 + }, + { + "epoch": 0.3412, + "learning_rate": 1.0307129642652538e-05, + "loss": 0.0028, + "step": 1706 + }, + { + "epoch": 0.3416, + "learning_rate": 1.0293173300722288e-05, + "loss": 0.0801, + "step": 1708 + }, + { + "epoch": 0.342, + "learning_rate": 1.0279216387235693e-05, + "loss": 0.0461, + "step": 1710 + }, + { + "epoch": 0.3424, + "learning_rate": 1.0265258929402454e-05, + "loss": 0.2391, + "step": 1712 + }, + { + "epoch": 0.3428, + "learning_rate": 1.025130095443338e-05, + "loss": 0.505, + "step": 1714 + }, + { + "epoch": 0.3432, + "learning_rate": 1.0237342489540228e-05, + "loss": 0.0166, + "step": 1716 + }, + { + "epoch": 0.3436, + "learning_rate": 1.0223383561935736e-05, + "loss": 1.3769, + "step": 1718 + }, + { + "epoch": 0.344, + "learning_rate": 1.0209424198833578e-05, + "loss": 0.0053, + "step": 1720 + }, + { + "epoch": 0.3444, + "learning_rate": 1.0195464427448212e-05, + "loss": 0.9961, + "step": 1722 + }, + { + "epoch": 0.3448, + "learning_rate": 1.0181504274994957e-05, + "loss": 0.0379, + "step": 1724 + }, + { + "epoch": 0.3452, + "learning_rate": 1.0167543768689816e-05, + "loss": 0.0128, + "step": 1726 + }, + { + "epoch": 0.3456, + "learning_rate": 1.0153582935749533e-05, + "loss": 0.0154, + "step": 1728 + }, + { + "epoch": 0.346, + "learning_rate": 1.0139621803391448e-05, + "loss": 0.0719, + "step": 1730 + }, + { + "epoch": 0.3464, + "learning_rate": 1.012566039883353e-05, + "loss": 0.1059, + "step": 1732 + }, + { + "epoch": 0.3468, + "learning_rate": 1.0111698749294227e-05, + "loss": 0.1562, + "step": 1734 + }, + { + "epoch": 0.3472, + "learning_rate": 1.0097736881992497e-05, + "loss": 0.5621, + "step": 1736 + }, + { + "epoch": 0.3476, + "learning_rate": 1.0083774824147713e-05, + "loss": 0.7934, + "step": 1738 + }, + { + "epoch": 0.348, + "learning_rate": 1.0069812602979617e-05, + "loss": 0.1682, + "step": 1740 + }, + { + "epoch": 0.3484, + "learning_rate": 1.0055850245708291e-05, + "loss": 0.0656, + "step": 1742 + }, + { + "epoch": 0.3488, + "learning_rate": 1.0041887779554041e-05, + "loss": 0.002, + "step": 1744 + }, + { + "epoch": 0.3492, + "learning_rate": 1.0027925231737428e-05, + "loss": 0.0117, + "step": 1746 + }, + { + "epoch": 0.3496, + "learning_rate": 1.0013962629479137e-05, + "loss": 0.0623, + "step": 1748 + }, + { + "epoch": 0.35, + "learning_rate": 1.0000000000000003e-05, + "loss": 0.0032, + "step": 1750 + }, + { + "epoch": 0.3504, + "learning_rate": 9.986037370520871e-06, + "loss": 0.0338, + "step": 1752 + }, + { + "epoch": 0.3508, + "learning_rate": 9.97207476826258e-06, + "loss": 0.5644, + "step": 1754 + }, + { + "epoch": 0.3512, + "learning_rate": 9.958112220445967e-06, + "loss": 0.1492, + "step": 1756 + }, + { + "epoch": 0.3516, + "learning_rate": 9.944149754291716e-06, + "loss": 0.0372, + "step": 1758 + }, + { + "epoch": 0.352, + "learning_rate": 9.930187397020392e-06, + "loss": 0.0346, + "step": 1760 + }, + { + "epoch": 0.3524, + "learning_rate": 9.916225175852295e-06, + "loss": 0.0076, + "step": 1762 + }, + { + "epoch": 0.3528, + "learning_rate": 9.902263118007511e-06, + "loss": 0.0774, + "step": 1764 + }, + { + "epoch": 0.3532, + "learning_rate": 9.88830125070578e-06, + "loss": 0.3081, + "step": 1766 + }, + { + "epoch": 0.3536, + "learning_rate": 9.874339601166477e-06, + "loss": 0.0206, + "step": 1768 + }, + { + "epoch": 0.354, + "learning_rate": 9.860378196608559e-06, + "loss": 0.0161, + "step": 1770 + }, + { + "epoch": 0.3544, + "learning_rate": 9.846417064250476e-06, + "loss": 0.1463, + "step": 1772 + }, + { + "epoch": 0.3548, + "learning_rate": 9.832456231310192e-06, + "loss": 0.4359, + "step": 1774 + }, + { + "epoch": 0.3552, + "learning_rate": 9.81849572500505e-06, + "loss": 0.757, + "step": 1776 + }, + { + "epoch": 0.3556, + "learning_rate": 9.804535572551797e-06, + "loss": 0.0211, + "step": 1778 + }, + { + "epoch": 0.356, + "learning_rate": 9.790575801166429e-06, + "loss": 0.0178, + "step": 1780 + }, + { + "epoch": 0.3564, + "learning_rate": 9.77661643806427e-06, + "loss": 0.1074, + "step": 1782 + }, + { + "epoch": 0.3568, + "learning_rate": 9.76265751045978e-06, + "loss": 0.081, + "step": 1784 + }, + { + "epoch": 0.3572, + "learning_rate": 9.748699045566628e-06, + "loss": 0.5425, + "step": 1786 + }, + { + "epoch": 0.3576, + "learning_rate": 9.734741070597552e-06, + "loss": 0.039, + "step": 1788 + }, + { + "epoch": 0.358, + "learning_rate": 9.720783612764316e-06, + "loss": 0.0063, + "step": 1790 + }, + { + "epoch": 0.3584, + "learning_rate": 9.70682669927772e-06, + "loss": 0.0249, + "step": 1792 + }, + { + "epoch": 0.3588, + "learning_rate": 9.69287035734747e-06, + "loss": 0.0203, + "step": 1794 + }, + { + "epoch": 0.3592, + "learning_rate": 9.67891461418219e-06, + "loss": 0.0108, + "step": 1796 + }, + { + "epoch": 0.3596, + "learning_rate": 9.664959496989283e-06, + "loss": 0.0254, + "step": 1798 + }, + { + "epoch": 0.36, + "learning_rate": 9.651005032974998e-06, + "loss": 0.0419, + "step": 1800 + }, + { + "epoch": 0.3604, + "learning_rate": 9.637051249344242e-06, + "loss": 0.0114, + "step": 1802 + }, + { + "epoch": 0.3608, + "learning_rate": 9.623098173300656e-06, + "loss": 0.0075, + "step": 1804 + }, + { + "epoch": 0.3612, + "learning_rate": 9.609145832046475e-06, + "loss": 0.0036, + "step": 1806 + }, + { + "epoch": 0.3616, + "learning_rate": 9.595194252782476e-06, + "loss": 0.0032, + "step": 1808 + }, + { + "epoch": 0.362, + "learning_rate": 9.581243462708017e-06, + "loss": 0.1616, + "step": 1810 + }, + { + "epoch": 0.3624, + "learning_rate": 9.567293489020833e-06, + "loss": 0.038, + "step": 1812 + }, + { + "epoch": 0.3628, + "learning_rate": 9.553344358917144e-06, + "loss": 0.0036, + "step": 1814 + }, + { + "epoch": 0.3632, + "learning_rate": 9.539396099591476e-06, + "loss": 1.0386, + "step": 1816 + }, + { + "epoch": 0.3636, + "learning_rate": 9.525448738236695e-06, + "loss": 0.3256, + "step": 1818 + }, + { + "epoch": 0.364, + "learning_rate": 9.511502302043866e-06, + "loss": 0.0025, + "step": 1820 + }, + { + "epoch": 0.3644, + "learning_rate": 9.497556818202304e-06, + "loss": 0.0038, + "step": 1822 + }, + { + "epoch": 0.3648, + "learning_rate": 9.483612313899444e-06, + "loss": 0.0782, + "step": 1824 + }, + { + "epoch": 0.3652, + "learning_rate": 9.469668816320782e-06, + "loss": 0.0032, + "step": 1826 + }, + { + "epoch": 0.3656, + "learning_rate": 9.45572635264992e-06, + "loss": 1.2482, + "step": 1828 + }, + { + "epoch": 0.366, + "learning_rate": 9.441784950068364e-06, + "loss": 0.2404, + "step": 1830 + }, + { + "epoch": 0.3664, + "learning_rate": 9.427844635755624e-06, + "loss": 0.0357, + "step": 1832 + }, + { + "epoch": 0.3668, + "learning_rate": 9.413905436889032e-06, + "loss": 0.0786, + "step": 1834 + }, + { + "epoch": 0.3672, + "learning_rate": 9.399967380643799e-06, + "loss": 0.0336, + "step": 1836 + }, + { + "epoch": 0.3676, + "learning_rate": 9.386030494192843e-06, + "loss": 0.2265, + "step": 1838 + }, + { + "epoch": 0.368, + "learning_rate": 9.372094804706873e-06, + "loss": 0.0192, + "step": 1840 + }, + { + "epoch": 0.3684, + "learning_rate": 9.358160339354203e-06, + "loss": 0.0064, + "step": 1842 + }, + { + "epoch": 0.3688, + "learning_rate": 9.344227125300786e-06, + "loss": 0.8553, + "step": 1844 + }, + { + "epoch": 0.3692, + "learning_rate": 9.330295189710161e-06, + "loss": 0.0415, + "step": 1846 + }, + { + "epoch": 0.3696, + "learning_rate": 9.316364559743315e-06, + "loss": 0.0155, + "step": 1848 + }, + { + "epoch": 0.37, + "learning_rate": 9.30243526255875e-06, + "loss": 0.2644, + "step": 1850 + }, + { + "epoch": 0.3704, + "learning_rate": 9.288507325312326e-06, + "loss": 0.0468, + "step": 1852 + }, + { + "epoch": 0.3708, + "learning_rate": 9.274580775157295e-06, + "loss": 0.2861, + "step": 1854 + }, + { + "epoch": 0.3712, + "learning_rate": 9.260655639244147e-06, + "loss": 0.1977, + "step": 1856 + }, + { + "epoch": 0.3716, + "learning_rate": 9.24673194472068e-06, + "loss": 0.0353, + "step": 1858 + }, + { + "epoch": 0.372, + "learning_rate": 9.232809718731822e-06, + "loss": 0.0353, + "step": 1860 + }, + { + "epoch": 0.3724, + "learning_rate": 9.218888988419664e-06, + "loss": 0.0731, + "step": 1862 + }, + { + "epoch": 0.3728, + "learning_rate": 9.204969780923412e-06, + "loss": 0.027, + "step": 1864 + }, + { + "epoch": 0.3732, + "learning_rate": 9.191052123379235e-06, + "loss": 0.0613, + "step": 1866 + }, + { + "epoch": 0.3736, + "learning_rate": 9.177136042920346e-06, + "loss": 0.0188, + "step": 1868 + }, + { + "epoch": 0.374, + "learning_rate": 9.163221566676847e-06, + "loss": 0.0039, + "step": 1870 + }, + { + "epoch": 0.3744, + "learning_rate": 9.149308721775723e-06, + "loss": 0.0453, + "step": 1872 + }, + { + "epoch": 0.3748, + "learning_rate": 9.135397535340766e-06, + "loss": 0.0559, + "step": 1874 + }, + { + "epoch": 0.3752, + "learning_rate": 9.121488034492574e-06, + "loss": 0.0099, + "step": 1876 + }, + { + "epoch": 0.3756, + "learning_rate": 9.107580246348402e-06, + "loss": 0.0092, + "step": 1878 + }, + { + "epoch": 0.376, + "learning_rate": 9.093674198022196e-06, + "loss": 0.0232, + "step": 1880 + }, + { + "epoch": 0.3764, + "learning_rate": 9.079769916624536e-06, + "loss": 0.0245, + "step": 1882 + }, + { + "epoch": 0.3768, + "learning_rate": 9.065867429262493e-06, + "loss": 0.0056, + "step": 1884 + }, + { + "epoch": 0.3772, + "learning_rate": 9.051966763039716e-06, + "loss": 0.0041, + "step": 1886 + }, + { + "epoch": 0.3776, + "learning_rate": 9.038067945056225e-06, + "loss": 0.9488, + "step": 1888 + }, + { + "epoch": 0.378, + "learning_rate": 9.024171002408507e-06, + "loss": 0.0276, + "step": 1890 + }, + { + "epoch": 0.3784, + "learning_rate": 9.010275962189362e-06, + "loss": 0.032, + "step": 1892 + }, + { + "epoch": 0.3788, + "learning_rate": 8.996382851487854e-06, + "loss": 0.1746, + "step": 1894 + }, + { + "epoch": 0.3792, + "learning_rate": 8.982491697389344e-06, + "loss": 0.1554, + "step": 1896 + }, + { + "epoch": 0.3796, + "learning_rate": 8.968602526975324e-06, + "loss": 0.0045, + "step": 1898 + }, + { + "epoch": 0.38, + "learning_rate": 8.954715367323473e-06, + "loss": 0.1064, + "step": 1900 + }, + { + "epoch": 0.3804, + "learning_rate": 8.940830245507482e-06, + "loss": 0.0112, + "step": 1902 + }, + { + "epoch": 0.3808, + "learning_rate": 8.926947188597142e-06, + "loss": 0.0071, + "step": 1904 + }, + { + "epoch": 0.3812, + "learning_rate": 8.913066223658148e-06, + "loss": 0.155, + "step": 1906 + }, + { + "epoch": 0.3816, + "learning_rate": 8.89918737775218e-06, + "loss": 0.0091, + "step": 1908 + }, + { + "epoch": 0.382, + "learning_rate": 8.885310677936757e-06, + "loss": 0.0205, + "step": 1910 + }, + { + "epoch": 0.3824, + "learning_rate": 8.871436151265186e-06, + "loss": 0.0012, + "step": 1912 + }, + { + "epoch": 0.3828, + "learning_rate": 8.857563824786606e-06, + "loss": 0.4258, + "step": 1914 + }, + { + "epoch": 0.3832, + "learning_rate": 8.843693725545789e-06, + "loss": 0.02, + "step": 1916 + }, + { + "epoch": 0.3836, + "learning_rate": 8.829825880583231e-06, + "loss": 0.0131, + "step": 1918 + }, + { + "epoch": 0.384, + "learning_rate": 8.815960316934988e-06, + "loss": 0.0503, + "step": 1920 + }, + { + "epoch": 0.3844, + "learning_rate": 8.802097061632713e-06, + "loss": 0.2336, + "step": 1922 + }, + { + "epoch": 0.3848, + "learning_rate": 8.788236141703493e-06, + "loss": 0.0135, + "step": 1924 + }, + { + "epoch": 0.3852, + "learning_rate": 8.77437758416993e-06, + "loss": 0.1468, + "step": 1926 + }, + { + "epoch": 0.3856, + "learning_rate": 8.760521416049993e-06, + "loss": 0.0044, + "step": 1928 + }, + { + "epoch": 0.386, + "learning_rate": 8.74666766435696e-06, + "loss": 0.0766, + "step": 1930 + }, + { + "epoch": 0.3864, + "learning_rate": 8.732816356099465e-06, + "loss": 0.0139, + "step": 1932 + }, + { + "epoch": 0.3868, + "learning_rate": 8.718967518281309e-06, + "loss": 0.0387, + "step": 1934 + }, + { + "epoch": 0.3872, + "learning_rate": 8.705121177901534e-06, + "loss": 0.0016, + "step": 1936 + }, + { + "epoch": 0.3876, + "learning_rate": 8.691277361954275e-06, + "loss": 0.4318, + "step": 1938 + }, + { + "epoch": 0.388, + "learning_rate": 8.677436097428782e-06, + "loss": 0.0437, + "step": 1940 + }, + { + "epoch": 0.3884, + "learning_rate": 8.663597411309275e-06, + "loss": 0.0906, + "step": 1942 + }, + { + "epoch": 0.3888, + "learning_rate": 8.649761330575016e-06, + "loss": 0.0051, + "step": 1944 + }, + { + "epoch": 0.3892, + "learning_rate": 8.635927882200126e-06, + "loss": 0.0066, + "step": 1946 + }, + { + "epoch": 0.3896, + "learning_rate": 8.62209709315362e-06, + "loss": 0.0085, + "step": 1948 + }, + { + "epoch": 0.39, + "learning_rate": 8.608268990399356e-06, + "loss": 0.0036, + "step": 1950 + }, + { + "epoch": 0.3904, + "learning_rate": 8.594443600895892e-06, + "loss": 0.0025, + "step": 1952 + }, + { + "epoch": 0.3908, + "learning_rate": 8.58062095159656e-06, + "loss": 0.0031, + "step": 1954 + }, + { + "epoch": 0.3912, + "learning_rate": 8.566801069449302e-06, + "loss": 0.0465, + "step": 1956 + }, + { + "epoch": 0.3916, + "learning_rate": 8.552983981396714e-06, + "loss": 0.0014, + "step": 1958 + }, + { + "epoch": 0.392, + "learning_rate": 8.53916971437589e-06, + "loss": 0.0458, + "step": 1960 + }, + { + "epoch": 0.3924, + "learning_rate": 8.525358295318457e-06, + "loss": 0.237, + "step": 1962 + }, + { + "epoch": 0.3928, + "learning_rate": 8.511549751150487e-06, + "loss": 0.0186, + "step": 1964 + }, + { + "epoch": 0.3932, + "learning_rate": 8.497744108792438e-06, + "loss": 0.0061, + "step": 1966 + }, + { + "epoch": 0.3936, + "learning_rate": 8.48394139515912e-06, + "loss": 0.1367, + "step": 1968 + }, + { + "epoch": 0.394, + "learning_rate": 8.470141637159612e-06, + "loss": 0.0254, + "step": 1970 + }, + { + "epoch": 0.3944, + "learning_rate": 8.4563448616973e-06, + "loss": 0.1038, + "step": 1972 + }, + { + "epoch": 0.3948, + "learning_rate": 8.442551095669632e-06, + "loss": 0.0008, + "step": 1974 + }, + { + "epoch": 0.3952, + "learning_rate": 8.428760365968335e-06, + "loss": 0.0063, + "step": 1976 + }, + { + "epoch": 0.3956, + "learning_rate": 8.41497269947907e-06, + "loss": 0.0079, + "step": 1978 + }, + { + "epoch": 0.396, + "learning_rate": 8.401188123081648e-06, + "loss": 0.0085, + "step": 1980 + }, + { + "epoch": 0.3964, + "learning_rate": 8.387406663649811e-06, + "loss": 0.1731, + "step": 1982 + }, + { + "epoch": 0.3968, + "learning_rate": 8.373628348051163e-06, + "loss": 0.0342, + "step": 1984 + }, + { + "epoch": 0.3972, + "learning_rate": 8.35985320314729e-06, + "loss": 0.0012, + "step": 1986 + }, + { + "epoch": 0.3976, + "learning_rate": 8.346081255793524e-06, + "loss": 0.0039, + "step": 1988 + }, + { + "epoch": 0.398, + "learning_rate": 8.332312532838978e-06, + "loss": 0.0764, + "step": 1990 + }, + { + "epoch": 0.3984, + "learning_rate": 8.318547061126487e-06, + "loss": 0.0518, + "step": 1992 + }, + { + "epoch": 0.3988, + "learning_rate": 8.30478486749254e-06, + "loss": 0.0097, + "step": 1994 + }, + { + "epoch": 0.3992, + "learning_rate": 8.291025978767237e-06, + "loss": 0.0084, + "step": 1996 + }, + { + "epoch": 0.3996, + "learning_rate": 8.277270421774238e-06, + "loss": 0.0015, + "step": 1998 + }, + { + "epoch": 0.4, + "learning_rate": 8.263518223330701e-06, + "loss": 0.0075, + "step": 2000 + }, + { + "epoch": 0.4004, + "learning_rate": 8.249769410247246e-06, + "loss": 0.0054, + "step": 2002 + }, + { + "epoch": 0.4008, + "learning_rate": 8.236024009327884e-06, + "loss": 0.0362, + "step": 2004 + }, + { + "epoch": 0.4012, + "learning_rate": 8.222282047369979e-06, + "loss": 0.0025, + "step": 2006 + }, + { + "epoch": 0.4016, + "learning_rate": 8.208543551164187e-06, + "loss": 0.0016, + "step": 2008 + }, + { + "epoch": 0.402, + "learning_rate": 8.194808547494392e-06, + "loss": 0.0018, + "step": 2010 + }, + { + "epoch": 0.4024, + "learning_rate": 8.181077063137742e-06, + "loss": 0.0013, + "step": 2012 + }, + { + "epoch": 0.4028, + "learning_rate": 8.167349124864396e-06, + "loss": 0.0956, + "step": 2014 + }, + { + "epoch": 0.4032, + "learning_rate": 8.153624759437726e-06, + "loss": 0.0012, + "step": 2016 + }, + { + "epoch": 0.4036, + "learning_rate": 8.139903993614082e-06, + "loss": 0.4304, + "step": 2018 + }, + { + "epoch": 0.404, + "learning_rate": 8.12618685414275e-06, + "loss": 0.0009, + "step": 2020 + }, + { + "epoch": 0.4044, + "learning_rate": 8.112473367766065e-06, + "loss": 0.0003, + "step": 2022 + }, + { + "epoch": 0.4048, + "learning_rate": 8.098763561219096e-06, + "loss": 0.0009, + "step": 2024 + }, + { + "epoch": 0.4052, + "learning_rate": 8.085057461229869e-06, + "loss": 0.0228, + "step": 2026 + }, + { + "epoch": 0.4056, + "learning_rate": 8.07135509451911e-06, + "loss": 0.0089, + "step": 2028 + }, + { + "epoch": 0.406, + "learning_rate": 8.057656487800281e-06, + "loss": 0.38, + "step": 2030 + }, + { + "epoch": 0.4064, + "learning_rate": 8.04396166777952e-06, + "loss": 0.0118, + "step": 2032 + }, + { + "epoch": 0.4068, + "learning_rate": 8.030270661155576e-06, + "loss": 2.0078, + "step": 2034 + }, + { + "epoch": 0.4072, + "learning_rate": 8.016583494619772e-06, + "loss": 0.1384, + "step": 2036 + }, + { + "epoch": 0.4076, + "learning_rate": 8.002900194855934e-06, + "loss": 0.0006, + "step": 2038 + }, + { + "epoch": 0.408, + "learning_rate": 7.989220788540358e-06, + "loss": 0.0016, + "step": 2040 + }, + { + "epoch": 0.4084, + "learning_rate": 7.975545302341748e-06, + "loss": 0.3658, + "step": 2042 + }, + { + "epoch": 0.4088, + "learning_rate": 7.96187376292116e-06, + "loss": 0.0008, + "step": 2044 + }, + { + "epoch": 0.4092, + "learning_rate": 7.948206196931944e-06, + "loss": 0.0086, + "step": 2046 + }, + { + "epoch": 0.4096, + "learning_rate": 7.934542631019774e-06, + "loss": 0.0022, + "step": 2048 + }, + { + "epoch": 0.41, + "learning_rate": 7.920883091822399e-06, + "loss": 0.0064, + "step": 2050 + }, + { + "epoch": 0.4104, + "learning_rate": 7.907227605969859e-06, + "loss": 0.016, + "step": 2052 + }, + { + "epoch": 0.4108, + "learning_rate": 7.893576200084169e-06, + "loss": 0.0017, + "step": 2054 + }, + { + "epoch": 0.4112, + "learning_rate": 7.879928900779448e-06, + "loss": 0.1641, + "step": 2056 + }, + { + "epoch": 0.4116, + "learning_rate": 7.866285734661853e-06, + "loss": 0.0045, + "step": 2058 + }, + { + "epoch": 0.412, + "learning_rate": 7.852646728329365e-06, + "loss": 0.1767, + "step": 2060 + }, + { + "epoch": 0.4124, + "learning_rate": 7.839011908371994e-06, + "loss": 0.0469, + "step": 2062 + }, + { + "epoch": 0.4128, + "learning_rate": 7.82538130137145e-06, + "loss": 0.0009, + "step": 2064 + }, + { + "epoch": 0.4132, + "learning_rate": 7.811754933901353e-06, + "loss": 0.0026, + "step": 2066 + }, + { + "epoch": 0.4136, + "learning_rate": 7.798132832526985e-06, + "loss": 0.0006, + "step": 2068 + }, + { + "epoch": 0.414, + "learning_rate": 7.784515023805328e-06, + "loss": 0.0084, + "step": 2070 + }, + { + "epoch": 0.4144, + "learning_rate": 7.770901534284998e-06, + "loss": 0.036, + "step": 2072 + }, + { + "epoch": 0.4148, + "learning_rate": 7.757292390506191e-06, + "loss": 0.0021, + "step": 2074 + }, + { + "epoch": 0.4152, + "learning_rate": 7.743687619000625e-06, + "loss": 0.0011, + "step": 2076 + }, + { + "epoch": 0.4156, + "learning_rate": 7.730087246291504e-06, + "loss": 0.0044, + "step": 2078 + }, + { + "epoch": 0.416, + "learning_rate": 7.716491298893446e-06, + "loss": 0.0016, + "step": 2080 + }, + { + "epoch": 0.4164, + "learning_rate": 7.70289980331245e-06, + "loss": 0.0039, + "step": 2082 + }, + { + "epoch": 0.4168, + "learning_rate": 7.689312786045829e-06, + "loss": 0.0115, + "step": 2084 + }, + { + "epoch": 0.4172, + "learning_rate": 7.67573027358215e-06, + "loss": 0.1626, + "step": 2086 + }, + { + "epoch": 0.4176, + "learning_rate": 7.662152292401272e-06, + "loss": 0.0003, + "step": 2088 + }, + { + "epoch": 0.418, + "learning_rate": 7.648578868974109e-06, + "loss": 0.0017, + "step": 2090 + }, + { + "epoch": 0.4184, + "learning_rate": 7.635010029762762e-06, + "loss": 0.0007, + "step": 2092 + }, + { + "epoch": 0.4188, + "learning_rate": 7.621445801220381e-06, + "loss": 0.0049, + "step": 2094 + }, + { + "epoch": 0.4192, + "learning_rate": 7.607886209791102e-06, + "loss": 0.0417, + "step": 2096 + }, + { + "epoch": 0.4196, + "learning_rate": 7.594331281910096e-06, + "loss": 0.0489, + "step": 2098 + }, + { + "epoch": 0.42, + "learning_rate": 7.580781044003319e-06, + "loss": 0.0003, + "step": 2100 + }, + { + "epoch": 0.4204, + "learning_rate": 7.567235522487705e-06, + "loss": 0.0026, + "step": 2102 + }, + { + "epoch": 0.4208, + "learning_rate": 7.553694743770925e-06, + "loss": 0.0179, + "step": 2104 + }, + { + "epoch": 0.4212, + "learning_rate": 7.540158734251418e-06, + "loss": 0.0307, + "step": 2106 + }, + { + "epoch": 0.4216, + "learning_rate": 7.526627520318347e-06, + "loss": 0.0022, + "step": 2108 + }, + { + "epoch": 0.422, + "learning_rate": 7.513101128351452e-06, + "loss": 0.3295, + "step": 2110 + }, + { + "epoch": 0.4224, + "learning_rate": 7.49957958472118e-06, + "loss": 0.0452, + "step": 2112 + }, + { + "epoch": 0.4228, + "learning_rate": 7.486062915788453e-06, + "loss": 0.0252, + "step": 2114 + }, + { + "epoch": 0.4232, + "learning_rate": 7.472551147904709e-06, + "loss": 0.1338, + "step": 2116 + }, + { + "epoch": 0.4236, + "learning_rate": 7.4590443074118325e-06, + "loss": 0.0007, + "step": 2118 + }, + { + "epoch": 0.424, + "learning_rate": 7.445542420642099e-06, + "loss": 0.0919, + "step": 2120 + }, + { + "epoch": 0.4244, + "learning_rate": 7.4320455139181265e-06, + "loss": 0.0012, + "step": 2122 + }, + { + "epoch": 0.4248, + "learning_rate": 7.418553613552829e-06, + "loss": 0.0017, + "step": 2124 + }, + { + "epoch": 0.4252, + "learning_rate": 7.405066745849353e-06, + "loss": 0.0014, + "step": 2126 + }, + { + "epoch": 0.4256, + "learning_rate": 7.391584937101039e-06, + "loss": 0.0111, + "step": 2128 + }, + { + "epoch": 0.426, + "learning_rate": 7.378108213591362e-06, + "loss": 0.137, + "step": 2130 + }, + { + "epoch": 0.4264, + "learning_rate": 7.3646366015938665e-06, + "loss": 0.0031, + "step": 2132 + }, + { + "epoch": 0.4268, + "learning_rate": 7.351170127372202e-06, + "loss": 0.0008, + "step": 2134 + }, + { + "epoch": 0.4272, + "learning_rate": 7.337708817179881e-06, + "loss": 0.4422, + "step": 2136 + }, + { + "epoch": 0.4276, + "learning_rate": 7.324252697260486e-06, + "loss": 0.0109, + "step": 2138 + }, + { + "epoch": 0.428, + "learning_rate": 7.310801793847355e-06, + "loss": 0.001, + "step": 2140 + }, + { + "epoch": 0.4284, + "learning_rate": 7.297356133163718e-06, + "loss": 0.142, + "step": 2142 + }, + { + "epoch": 0.4288, + "learning_rate": 7.283915741422627e-06, + "loss": 0.0055, + "step": 2144 + }, + { + "epoch": 0.4292, + "learning_rate": 7.2704806448267454e-06, + "loss": 0.0026, + "step": 2146 + }, + { + "epoch": 0.4296, + "learning_rate": 7.2570508695685335e-06, + "loss": 0.0008, + "step": 2148 + }, + { + "epoch": 0.43, + "learning_rate": 7.243626441830008e-06, + "loss": 0.0018, + "step": 2150 + }, + { + "epoch": 0.4304, + "learning_rate": 7.2302073877827775e-06, + "loss": 0.0015, + "step": 2152 + }, + { + "epoch": 0.4308, + "learning_rate": 7.216793733587975e-06, + "loss": 0.0014, + "step": 2154 + }, + { + "epoch": 0.4312, + "learning_rate": 7.203385505396203e-06, + "loss": 0.0123, + "step": 2156 + }, + { + "epoch": 0.4316, + "learning_rate": 7.189982729347494e-06, + "loss": 0.0005, + "step": 2158 + }, + { + "epoch": 0.432, + "learning_rate": 7.176585431571239e-06, + "loss": 0.0231, + "step": 2160 + }, + { + "epoch": 0.4324, + "learning_rate": 7.163193638186161e-06, + "loss": 0.0031, + "step": 2162 + }, + { + "epoch": 0.4328, + "learning_rate": 7.149807375300244e-06, + "loss": 0.0035, + "step": 2164 + }, + { + "epoch": 0.4332, + "learning_rate": 7.136426669010694e-06, + "loss": 0.382, + "step": 2166 + }, + { + "epoch": 0.4336, + "learning_rate": 7.123051545403881e-06, + "loss": 0.0029, + "step": 2168 + }, + { + "epoch": 0.434, + "learning_rate": 7.109682030555291e-06, + "loss": 0.0422, + "step": 2170 + }, + { + "epoch": 0.4344, + "learning_rate": 7.096318150529467e-06, + "loss": 0.0592, + "step": 2172 + }, + { + "epoch": 0.4348, + "learning_rate": 7.08295993138002e-06, + "loss": 0.0406, + "step": 2174 + }, + { + "epoch": 0.4352, + "learning_rate": 7.069607399149436e-06, + "loss": 0.1623, + "step": 2176 + }, + { + "epoch": 0.4356, + "learning_rate": 7.056260579869158e-06, + "loss": 0.0026, + "step": 2178 + }, + { + "epoch": 0.436, + "learning_rate": 7.042919499559545e-06, + "loss": 0.0032, + "step": 2180 + }, + { + "epoch": 0.4364, + "learning_rate": 7.029584184229647e-06, + "loss": 0.0028, + "step": 2182 + }, + { + "epoch": 0.4368, + "learning_rate": 7.016254659877412e-06, + "loss": 0.0017, + "step": 2184 + }, + { + "epoch": 0.4372, + "learning_rate": 7.0029309524893605e-06, + "loss": 0.0823, + "step": 2186 + }, + { + "epoch": 0.4376, + "learning_rate": 6.989613088040794e-06, + "loss": 0.459, + "step": 2188 + }, + { + "epoch": 0.438, + "learning_rate": 6.976301092495555e-06, + "loss": 0.003, + "step": 2190 + }, + { + "epoch": 0.4384, + "learning_rate": 6.962994991806057e-06, + "loss": 0.1091, + "step": 2192 + }, + { + "epoch": 0.4388, + "learning_rate": 6.9496948119132435e-06, + "loss": 0.0031, + "step": 2194 + }, + { + "epoch": 0.4392, + "learning_rate": 6.936400578746443e-06, + "loss": 0.0019, + "step": 2196 + }, + { + "epoch": 0.4396, + "learning_rate": 6.923112318223497e-06, + "loss": 0.0004, + "step": 2198 + }, + { + "epoch": 0.44, + "learning_rate": 6.9098300562505296e-06, + "loss": 0.0154, + "step": 2200 + }, + { + "epoch": 0.4404, + "learning_rate": 6.8965538187219916e-06, + "loss": 0.0018, + "step": 2202 + }, + { + "epoch": 0.4408, + "learning_rate": 6.883283631520587e-06, + "loss": 0.0155, + "step": 2204 + }, + { + "epoch": 0.4412, + "learning_rate": 6.870019520517224e-06, + "loss": 0.0016, + "step": 2206 + }, + { + "epoch": 0.4416, + "learning_rate": 6.856761511570951e-06, + "loss": 0.0045, + "step": 2208 + }, + { + "epoch": 0.442, + "learning_rate": 6.843509630528984e-06, + "loss": 0.002, + "step": 2210 + }, + { + "epoch": 0.4424, + "learning_rate": 6.83026390322649e-06, + "loss": 0.0272, + "step": 2212 + }, + { + "epoch": 0.4428, + "learning_rate": 6.817024355486716e-06, + "loss": 0.0018, + "step": 2214 + }, + { + "epoch": 0.4432, + "learning_rate": 6.80379101312083e-06, + "loss": 0.0035, + "step": 2216 + }, + { + "epoch": 0.4436, + "learning_rate": 6.790563901927899e-06, + "loss": 0.0188, + "step": 2218 + }, + { + "epoch": 0.444, + "learning_rate": 6.777343047694901e-06, + "loss": 0.135, + "step": 2220 + }, + { + "epoch": 0.4444, + "learning_rate": 6.764128476196501e-06, + "loss": 0.0145, + "step": 2222 + }, + { + "epoch": 0.4448, + "learning_rate": 6.750920213195249e-06, + "loss": 0.0014, + "step": 2224 + }, + { + "epoch": 0.4452, + "learning_rate": 6.737718284441263e-06, + "loss": 0.001, + "step": 2226 + }, + { + "epoch": 0.4456, + "learning_rate": 6.724522715672429e-06, + "loss": 0.0015, + "step": 2228 + }, + { + "epoch": 0.446, + "learning_rate": 6.711333532614184e-06, + "loss": 0.0006, + "step": 2230 + }, + { + "epoch": 0.4464, + "learning_rate": 6.698150760979463e-06, + "loss": 0.0012, + "step": 2232 + }, + { + "epoch": 0.4468, + "learning_rate": 6.684974426468806e-06, + "loss": 0.0003, + "step": 2234 + }, + { + "epoch": 0.4472, + "learning_rate": 6.671804554770135e-06, + "loss": 0.3798, + "step": 2236 + }, + { + "epoch": 0.4476, + "learning_rate": 6.658641171558787e-06, + "loss": 0.0003, + "step": 2238 + }, + { + "epoch": 0.448, + "learning_rate": 6.645484302497453e-06, + "loss": 0.0036, + "step": 2240 + }, + { + "epoch": 0.4484, + "learning_rate": 6.63233397323612e-06, + "loss": 0.0003, + "step": 2242 + }, + { + "epoch": 0.4488, + "learning_rate": 6.619190209412032e-06, + "loss": 0.0022, + "step": 2244 + }, + { + "epoch": 0.4492, + "learning_rate": 6.606053036649625e-06, + "loss": 0.0954, + "step": 2246 + }, + { + "epoch": 0.4496, + "learning_rate": 6.59292248056049e-06, + "loss": 0.0009, + "step": 2248 + }, + { + "epoch": 0.45, + "learning_rate": 6.579798566743321e-06, + "loss": 0.0014, + "step": 2250 + }, + { + "epoch": 0.4504, + "learning_rate": 6.5666813207838555e-06, + "loss": 0.0021, + "step": 2252 + }, + { + "epoch": 0.4508, + "learning_rate": 6.5535707682548376e-06, + "loss": 0.0008, + "step": 2254 + }, + { + "epoch": 0.4512, + "learning_rate": 6.540466934715962e-06, + "loss": 0.0011, + "step": 2256 + }, + { + "epoch": 0.4516, + "learning_rate": 6.527369845713805e-06, + "loss": 0.0018, + "step": 2258 + }, + { + "epoch": 0.452, + "learning_rate": 6.514279526781859e-06, + "loss": 0.0012, + "step": 2260 + }, + { + "epoch": 0.4524, + "learning_rate": 6.501196003440308e-06, + "loss": 0.0008, + "step": 2262 + }, + { + "epoch": 0.4528, + "learning_rate": 6.488119301196198e-06, + "loss": 0.0006, + "step": 2264 + }, + { + "epoch": 0.4532, + "learning_rate": 6.475049445543229e-06, + "loss": 0.0034, + "step": 2266 + }, + { + "epoch": 0.4536, + "learning_rate": 6.461986461961704e-06, + "loss": 0.1481, + "step": 2268 + }, + { + "epoch": 0.454, + "learning_rate": 6.448930375918645e-06, + "loss": 0.4991, + "step": 2270 + }, + { + "epoch": 0.4544, + "learning_rate": 6.435881212867492e-06, + "loss": 0.0016, + "step": 2272 + }, + { + "epoch": 0.4548, + "learning_rate": 6.4228389982483085e-06, + "loss": 0.0005, + "step": 2274 + }, + { + "epoch": 0.4552, + "learning_rate": 6.4098037574875395e-06, + "loss": 0.8545, + "step": 2276 + }, + { + "epoch": 0.4556, + "learning_rate": 6.396775515998055e-06, + "loss": 0.0008, + "step": 2278 + }, + { + "epoch": 0.456, + "learning_rate": 6.383754299179079e-06, + "loss": 0.0041, + "step": 2280 + }, + { + "epoch": 0.4564, + "learning_rate": 6.37074013241614e-06, + "loss": 0.0004, + "step": 2282 + }, + { + "epoch": 0.4568, + "learning_rate": 6.357733041081022e-06, + "loss": 0.0011, + "step": 2284 + }, + { + "epoch": 0.4572, + "learning_rate": 6.3447330505317154e-06, + "loss": 0.0805, + "step": 2286 + }, + { + "epoch": 0.4576, + "learning_rate": 6.3317401861123654e-06, + "loss": 0.0002, + "step": 2288 + }, + { + "epoch": 0.458, + "learning_rate": 6.318754473153228e-06, + "loss": 0.0004, + "step": 2290 + }, + { + "epoch": 0.4584, + "learning_rate": 6.305775936970613e-06, + "loss": 0.0049, + "step": 2292 + }, + { + "epoch": 0.4588, + "learning_rate": 6.292804602866825e-06, + "loss": 0.0117, + "step": 2294 + }, + { + "epoch": 0.4592, + "learning_rate": 6.279840496130195e-06, + "loss": 0.0007, + "step": 2296 + }, + { + "epoch": 0.4596, + "learning_rate": 6.266883642034844e-06, + "loss": 0.0143, + "step": 2298 + }, + { + "epoch": 0.46, + "learning_rate": 6.25393406584089e-06, + "loss": 0.0073, + "step": 2300 + }, + { + "epoch": 0.4604, + "learning_rate": 6.240991792794144e-06, + "loss": 0.0012, + "step": 2302 + }, + { + "epoch": 0.4608, + "learning_rate": 6.228056848126229e-06, + "loss": 0.0974, + "step": 2304 + }, + { + "epoch": 0.4612, + "learning_rate": 6.215129257054532e-06, + "loss": 0.0018, + "step": 2306 + }, + { + "epoch": 0.4616, + "learning_rate": 6.202209044781986e-06, + "loss": 0.0629, + "step": 2308 + }, + { + "epoch": 0.462, + "learning_rate": 6.189296236497257e-06, + "loss": 0.6752, + "step": 2310 + }, + { + "epoch": 0.4624, + "learning_rate": 6.176390857374508e-06, + "loss": 0.0162, + "step": 2312 + }, + { + "epoch": 0.4628, + "learning_rate": 6.163492932573437e-06, + "loss": 0.0219, + "step": 2314 + }, + { + "epoch": 0.4632, + "learning_rate": 6.1506024872392075e-06, + "loss": 0.1942, + "step": 2316 + }, + { + "epoch": 0.4636, + "learning_rate": 6.137719546502401e-06, + "loss": 0.0023, + "step": 2318 + }, + { + "epoch": 0.464, + "learning_rate": 6.124844135478973e-06, + "loss": 0.0161, + "step": 2320 + }, + { + "epoch": 0.4644, + "learning_rate": 6.1119762792701935e-06, + "loss": 0.004, + "step": 2322 + }, + { + "epoch": 0.4648, + "learning_rate": 6.099116002962606e-06, + "loss": 0.0637, + "step": 2324 + }, + { + "epoch": 0.4652, + "learning_rate": 6.086263331627981e-06, + "loss": 0.0005, + "step": 2326 + }, + { + "epoch": 0.4656, + "learning_rate": 6.073418290323254e-06, + "loss": 0.3307, + "step": 2328 + }, + { + "epoch": 0.466, + "learning_rate": 6.060580904090495e-06, + "loss": 0.0011, + "step": 2330 + }, + { + "epoch": 0.4664, + "learning_rate": 6.047751197956843e-06, + "loss": 1.0839, + "step": 2332 + }, + { + "epoch": 0.4668, + "learning_rate": 6.034929196934449e-06, + "loss": 0.0115, + "step": 2334 + }, + { + "epoch": 0.4672, + "learning_rate": 6.022114926020511e-06, + "loss": 0.0241, + "step": 2336 + }, + { + "epoch": 0.4676, + "learning_rate": 6.009308410197056e-06, + "loss": 0.0007, + "step": 2338 + }, + { + "epoch": 0.468, + "learning_rate": 5.996509674431044e-06, + "loss": 0.2926, + "step": 2340 + }, + { + "epoch": 0.4684, + "learning_rate": 5.983718743674312e-06, + "loss": 0.0031, + "step": 2342 + }, + { + "epoch": 0.4688, + "learning_rate": 5.970935642863369e-06, + "loss": 0.001, + "step": 2344 + }, + { + "epoch": 0.4692, + "learning_rate": 5.95816039691959e-06, + "loss": 0.0069, + "step": 2346 + }, + { + "epoch": 0.4696, + "learning_rate": 5.945393030748905e-06, + "loss": 0.0012, + "step": 2348 + }, + { + "epoch": 0.47, + "learning_rate": 5.932633569241996e-06, + "loss": 0.0012, + "step": 2350 + }, + { + "epoch": 0.4704, + "learning_rate": 5.919882037274072e-06, + "loss": 0.0328, + "step": 2352 + }, + { + "epoch": 0.4708, + "learning_rate": 5.907138459704893e-06, + "loss": 0.0007, + "step": 2354 + }, + { + "epoch": 0.4712, + "learning_rate": 5.894402861378721e-06, + "loss": 0.0427, + "step": 2356 + }, + { + "epoch": 0.4716, + "learning_rate": 5.881675267124251e-06, + "loss": 0.044, + "step": 2358 + }, + { + "epoch": 0.472, + "learning_rate": 5.868955701754584e-06, + "loss": 0.0094, + "step": 2360 + }, + { + "epoch": 0.4724, + "learning_rate": 5.856244190067162e-06, + "loss": 0.0067, + "step": 2362 + }, + { + "epoch": 0.4728, + "learning_rate": 5.843540756843726e-06, + "loss": 0.3038, + "step": 2364 + }, + { + "epoch": 0.4732, + "learning_rate": 5.83084542685027e-06, + "loss": 0.0022, + "step": 2366 + }, + { + "epoch": 0.4736, + "learning_rate": 5.818158224836989e-06, + "loss": 0.0221, + "step": 2368 + }, + { + "epoch": 0.474, + "learning_rate": 5.805479175538219e-06, + "loss": 0.1523, + "step": 2370 + }, + { + "epoch": 0.4744, + "learning_rate": 5.79280830367246e-06, + "loss": 0.0017, + "step": 2372 + }, + { + "epoch": 0.4748, + "learning_rate": 5.78014563394218e-06, + "loss": 0.0042, + "step": 2374 + }, + { + "epoch": 0.4752, + "learning_rate": 5.7674911910339145e-06, + "loss": 0.1167, + "step": 2376 + }, + { + "epoch": 0.4756, + "learning_rate": 5.75484499961815e-06, + "loss": 0.0005, + "step": 2378 + }, + { + "epoch": 0.476, + "learning_rate": 5.742207084349267e-06, + "loss": 0.0005, + "step": 2380 + }, + { + "epoch": 0.4764, + "learning_rate": 5.729577469865576e-06, + "loss": 0.0022, + "step": 2382 + }, + { + "epoch": 0.4768, + "learning_rate": 5.716956180789093e-06, + "loss": 0.0096, + "step": 2384 + }, + { + "epoch": 0.4772, + "learning_rate": 5.7043432417257135e-06, + "loss": 0.0137, + "step": 2386 + }, + { + "epoch": 0.4776, + "learning_rate": 5.691738677265008e-06, + "loss": 0.0078, + "step": 2388 + }, + { + "epoch": 0.478, + "learning_rate": 5.679142511980175e-06, + "loss": 0.0145, + "step": 2390 + }, + { + "epoch": 0.4784, + "learning_rate": 5.666554770428143e-06, + "loss": 0.0042, + "step": 2392 + }, + { + "epoch": 0.4788, + "learning_rate": 5.653975477149296e-06, + "loss": 0.0041, + "step": 2394 + }, + { + "epoch": 0.4792, + "learning_rate": 5.641404656667659e-06, + "loss": 0.0195, + "step": 2396 + }, + { + "epoch": 0.4796, + "learning_rate": 5.628842333490671e-06, + "loss": 0.0094, + "step": 2398 + }, + { + "epoch": 0.48, + "learning_rate": 5.616288532109228e-06, + "loss": 0.0094, + "step": 2400 + }, + { + "epoch": 0.4804, + "learning_rate": 5.603743276997604e-06, + "loss": 0.004, + "step": 2402 + }, + { + "epoch": 0.4808, + "learning_rate": 5.591206592613419e-06, + "loss": 0.0042, + "step": 2404 + }, + { + "epoch": 0.4812, + "learning_rate": 5.578678503397576e-06, + "loss": 0.0026, + "step": 2406 + }, + { + "epoch": 0.4816, + "learning_rate": 5.566159033774232e-06, + "loss": 0.004, + "step": 2408 + }, + { + "epoch": 0.482, + "learning_rate": 5.55364820815073e-06, + "loss": 0.1211, + "step": 2410 + }, + { + "epoch": 0.4824, + "learning_rate": 5.541146050917566e-06, + "loss": 0.0102, + "step": 2412 + }, + { + "epoch": 0.4828, + "learning_rate": 5.528652586448335e-06, + "loss": 0.0006, + "step": 2414 + }, + { + "epoch": 0.4832, + "learning_rate": 5.516167839099668e-06, + "loss": 0.0029, + "step": 2416 + }, + { + "epoch": 0.4836, + "learning_rate": 5.50369183321127e-06, + "loss": 0.0065, + "step": 2418 + }, + { + "epoch": 0.484, + "learning_rate": 5.491224593105687e-06, + "loss": 0.046, + "step": 2420 + }, + { + "epoch": 0.4844, + "learning_rate": 5.478766143088503e-06, + "loss": 0.3806, + "step": 2422 + }, + { + "epoch": 0.4848, + "learning_rate": 5.46631650744806e-06, + "loss": 1.0648, + "step": 2424 + }, + { + "epoch": 0.4852, + "learning_rate": 5.453875710455556e-06, + "loss": 0.0014, + "step": 2426 + }, + { + "epoch": 0.4856, + "learning_rate": 5.4414437763650116e-06, + "loss": 0.001, + "step": 2428 + }, + { + "epoch": 0.486, + "learning_rate": 5.429020729413056e-06, + "loss": 0.0006, + "step": 2430 + }, + { + "epoch": 0.4864, + "learning_rate": 5.416606593819116e-06, + "loss": 0.0021, + "step": 2432 + }, + { + "epoch": 0.4868, + "learning_rate": 5.40420139378512e-06, + "loss": 0.0005, + "step": 2434 + }, + { + "epoch": 0.4872, + "learning_rate": 5.391805153495691e-06, + "loss": 0.0005, + "step": 2436 + }, + { + "epoch": 0.4876, + "learning_rate": 5.379417897117916e-06, + "loss": 0.2633, + "step": 2438 + }, + { + "epoch": 0.488, + "learning_rate": 5.367039648801384e-06, + "loss": 0.0054, + "step": 2440 + }, + { + "epoch": 0.4884, + "learning_rate": 5.354670432678126e-06, + "loss": 0.0064, + "step": 2442 + }, + { + "epoch": 0.4888, + "learning_rate": 5.3423102728625596e-06, + "loss": 0.1656, + "step": 2444 + }, + { + "epoch": 0.4892, + "learning_rate": 5.32995919345145e-06, + "loss": 0.1383, + "step": 2446 + }, + { + "epoch": 0.4896, + "learning_rate": 5.317617218523859e-06, + "loss": 0.2552, + "step": 2448 + }, + { + "epoch": 0.49, + "learning_rate": 5.305284372141097e-06, + "loss": 0.0041, + "step": 2450 + }, + { + "epoch": 0.4904, + "learning_rate": 5.29296067834668e-06, + "loss": 0.0008, + "step": 2452 + }, + { + "epoch": 0.4908, + "learning_rate": 5.280646161166279e-06, + "loss": 0.035, + "step": 2454 + }, + { + "epoch": 0.4912, + "learning_rate": 5.26834084460766e-06, + "loss": 0.0028, + "step": 2456 + }, + { + "epoch": 0.4916, + "learning_rate": 5.256044752660715e-06, + "loss": 0.0118, + "step": 2458 + }, + { + "epoch": 0.492, + "learning_rate": 5.243757909297257e-06, + "loss": 0.0267, + "step": 2460 + }, + { + "epoch": 0.4924, + "learning_rate": 5.231480338471131e-06, + "loss": 0.0009, + "step": 2462 + }, + { + "epoch": 0.4928, + "learning_rate": 5.219212064118089e-06, + "loss": 0.0034, + "step": 2464 + }, + { + "epoch": 0.4932, + "learning_rate": 5.2069531101557455e-06, + "loss": 0.0004, + "step": 2466 + }, + { + "epoch": 0.4936, + "learning_rate": 5.194703500483604e-06, + "loss": 0.0019, + "step": 2468 + }, + { + "epoch": 0.494, + "learning_rate": 5.182463258982844e-06, + "loss": 0.4016, + "step": 2470 + }, + { + "epoch": 0.4944, + "learning_rate": 5.170232409516489e-06, + "loss": 0.0028, + "step": 2472 + }, + { + "epoch": 0.4948, + "learning_rate": 5.158010975929191e-06, + "loss": 0.0032, + "step": 2474 + }, + { + "epoch": 0.4952, + "learning_rate": 5.14579898204726e-06, + "loss": 0.0005, + "step": 2476 + }, + { + "epoch": 0.4956, + "learning_rate": 5.133596451678618e-06, + "loss": 0.0002, + "step": 2478 + }, + { + "epoch": 0.496, + "learning_rate": 5.121403408612674e-06, + "loss": 0.001, + "step": 2480 + }, + { + "epoch": 0.4964, + "learning_rate": 5.10921987662044e-06, + "loss": 0.0034, + "step": 2482 + }, + { + "epoch": 0.4968, + "learning_rate": 5.097045879454315e-06, + "loss": 0.1624, + "step": 2484 + }, + { + "epoch": 0.4972, + "learning_rate": 5.084881440848133e-06, + "loss": 0.0001, + "step": 2486 + }, + { + "epoch": 0.4976, + "learning_rate": 5.072726584517088e-06, + "loss": 0.0305, + "step": 2488 + }, + { + "epoch": 0.498, + "learning_rate": 5.060581334157695e-06, + "loss": 0.0024, + "step": 2490 + }, + { + "epoch": 0.4984, + "learning_rate": 5.048445713447741e-06, + "loss": 0.0016, + "step": 2492 + }, + { + "epoch": 0.4988, + "learning_rate": 5.036319746046238e-06, + "loss": 0.0016, + "step": 2494 + }, + { + "epoch": 0.4992, + "learning_rate": 5.024203455593382e-06, + "loss": 0.0008, + "step": 2496 + }, + { + "epoch": 0.4996, + "learning_rate": 5.0120968657104995e-06, + "loss": 0.0026, + "step": 2498 + }, + { + "epoch": 0.5, + "learning_rate": 5.000000000000008e-06, + "loss": 0.0408, + "step": 2500 + }, + { + "epoch": 0.5004, + "learning_rate": 4.987912882045352e-06, + "loss": 0.1945, + "step": 2502 + }, + { + "epoch": 0.5008, + "learning_rate": 4.97583553541103e-06, + "loss": 0.006, + "step": 2504 + }, + { + "epoch": 0.5012, + "learning_rate": 4.963767983642387e-06, + "loss": 0.0019, + "step": 2506 + }, + { + "epoch": 0.5016, + "learning_rate": 4.951710250265794e-06, + "loss": 0.0026, + "step": 2508 + }, + { + "epoch": 0.502, + "learning_rate": 4.9396623587883585e-06, + "loss": 0.0009, + "step": 2510 + }, + { + "epoch": 0.5024, + "learning_rate": 4.9276243326981066e-06, + "loss": 0.0004, + "step": 2512 + }, + { + "epoch": 0.5028, + "learning_rate": 4.915596195463787e-06, + "loss": 0.0095, + "step": 2514 + }, + { + "epoch": 0.5032, + "learning_rate": 4.903577970534821e-06, + "loss": 0.2579, + "step": 2516 + }, + { + "epoch": 0.5036, + "learning_rate": 4.891569681341401e-06, + "loss": 0.0091, + "step": 2518 + }, + { + "epoch": 0.504, + "learning_rate": 4.879571351294285e-06, + "loss": 0.0064, + "step": 2520 + }, + { + "epoch": 0.5044, + "learning_rate": 4.867583003784831e-06, + "loss": 0.0037, + "step": 2522 + }, + { + "epoch": 0.5048, + "learning_rate": 4.855604662184938e-06, + "loss": 0.0481, + "step": 2524 + }, + { + "epoch": 0.5052, + "learning_rate": 4.843636349846993e-06, + "loss": 0.0091, + "step": 2526 + }, + { + "epoch": 0.5056, + "learning_rate": 4.831678090103834e-06, + "loss": 0.0011, + "step": 2528 + }, + { + "epoch": 0.506, + "learning_rate": 4.819729906268702e-06, + "loss": 0.3763, + "step": 2530 + }, + { + "epoch": 0.5064, + "learning_rate": 4.8077918216351915e-06, + "loss": 0.0003, + "step": 2532 + }, + { + "epoch": 0.5068, + "learning_rate": 4.795863859477209e-06, + "loss": 0.0447, + "step": 2534 + }, + { + "epoch": 0.5072, + "learning_rate": 4.783946043048928e-06, + "loss": 0.0171, + "step": 2536 + }, + { + "epoch": 0.5076, + "learning_rate": 4.772038395584741e-06, + "loss": 0.0279, + "step": 2538 + }, + { + "epoch": 0.508, + "learning_rate": 4.760140940299216e-06, + "loss": 0.0004, + "step": 2540 + }, + { + "epoch": 0.5084, + "learning_rate": 4.7482537003870366e-06, + "loss": 0.0003, + "step": 2542 + }, + { + "epoch": 0.5088, + "learning_rate": 4.736376699023029e-06, + "loss": 0.0007, + "step": 2544 + }, + { + "epoch": 0.5092, + "learning_rate": 4.724509959361956e-06, + "loss": 0.0013, + "step": 2546 + }, + { + "epoch": 0.5096, + "learning_rate": 4.712653504538679e-06, + "loss": 0.0034, + "step": 2548 + }, + { + "epoch": 0.51, + "learning_rate": 4.700807357667963e-06, + "loss": 0.0009, + "step": 2550 + }, + { + "epoch": 0.5104, + "learning_rate": 4.68897154184443e-06, + "loss": 0.0006, + "step": 2552 + }, + { + "epoch": 0.5108, + "learning_rate": 4.677146080142674e-06, + "loss": 0.6802, + "step": 2554 + }, + { + "epoch": 0.5112, + "learning_rate": 4.665330995616974e-06, + "loss": 0.0022, + "step": 2556 + }, + { + "epoch": 0.5116, + "learning_rate": 4.653526311301486e-06, + "loss": 0.0055, + "step": 2558 + }, + { + "epoch": 0.512, + "learning_rate": 4.641732050210034e-06, + "loss": 0.0007, + "step": 2560 + }, + { + "epoch": 0.5124, + "learning_rate": 4.6299482353361316e-06, + "loss": 0.0259, + "step": 2562 + }, + { + "epoch": 0.5128, + "learning_rate": 4.61817488965293e-06, + "loss": 0.2568, + "step": 2564 + }, + { + "epoch": 0.5132, + "learning_rate": 4.606412036113168e-06, + "loss": 0.0006, + "step": 2566 + }, + { + "epoch": 0.5136, + "learning_rate": 4.594659697649132e-06, + "loss": 0.0441, + "step": 2568 + }, + { + "epoch": 0.514, + "learning_rate": 4.582917897172605e-06, + "loss": 0.0026, + "step": 2570 + }, + { + "epoch": 0.5144, + "learning_rate": 4.57118665757483e-06, + "loss": 0.0491, + "step": 2572 + }, + { + "epoch": 0.5148, + "learning_rate": 4.559466001726457e-06, + "loss": 0.0989, + "step": 2574 + }, + { + "epoch": 0.5152, + "learning_rate": 4.547755952477506e-06, + "loss": 0.0042, + "step": 2576 + }, + { + "epoch": 0.5156, + "learning_rate": 4.536056532657302e-06, + "loss": 0.0136, + "step": 2578 + }, + { + "epoch": 0.516, + "learning_rate": 4.524367765074505e-06, + "loss": 0.0004, + "step": 2580 + }, + { + "epoch": 0.5164, + "learning_rate": 4.512689672516909e-06, + "loss": 0.0934, + "step": 2582 + }, + { + "epoch": 0.5168, + "learning_rate": 4.501022277751611e-06, + "loss": 0.0611, + "step": 2584 + }, + { + "epoch": 0.5172, + "learning_rate": 4.48936560352475e-06, + "loss": 0.7639, + "step": 2586 + }, + { + "epoch": 0.5176, + "learning_rate": 4.477719672561609e-06, + "loss": 0.0066, + "step": 2588 + }, + { + "epoch": 0.518, + "learning_rate": 4.46608450756657e-06, + "loss": 0.0016, + "step": 2590 + }, + { + "epoch": 0.5184, + "learning_rate": 4.4544601312229244e-06, + "loss": 0.0065, + "step": 2592 + }, + { + "epoch": 0.5188, + "learning_rate": 4.442846566193048e-06, + "loss": 0.0012, + "step": 2594 + }, + { + "epoch": 0.5192, + "learning_rate": 4.431243835118119e-06, + "loss": 0.3658, + "step": 2596 + }, + { + "epoch": 0.5196, + "learning_rate": 4.419651960618301e-06, + "loss": 0.0006, + "step": 2598 + }, + { + "epoch": 0.52, + "learning_rate": 4.408070965292531e-06, + "loss": 0.004, + "step": 2600 + }, + { + "epoch": 0.5204, + "learning_rate": 4.396500871718555e-06, + "loss": 0.0125, + "step": 2602 + }, + { + "epoch": 0.5208, + "learning_rate": 4.384941702452858e-06, + "loss": 0.6267, + "step": 2604 + }, + { + "epoch": 0.5212, + "learning_rate": 4.373393480030634e-06, + "loss": 1.1629, + "step": 2606 + }, + { + "epoch": 0.5216, + "learning_rate": 4.361856226965735e-06, + "loss": 0.0005, + "step": 2608 + }, + { + "epoch": 0.522, + "learning_rate": 4.3503299657506235e-06, + "loss": 0.0061, + "step": 2610 + }, + { + "epoch": 0.5224, + "learning_rate": 4.338814718856338e-06, + "loss": 0.0014, + "step": 2612 + }, + { + "epoch": 0.5228, + "learning_rate": 4.32731050873244e-06, + "loss": 0.0214, + "step": 2614 + }, + { + "epoch": 0.5232, + "learning_rate": 4.315817357806976e-06, + "loss": 0.0048, + "step": 2616 + }, + { + "epoch": 0.5236, + "learning_rate": 4.304335288486418e-06, + "loss": 0.0012, + "step": 2618 + }, + { + "epoch": 0.524, + "learning_rate": 4.29286432315569e-06, + "loss": 0.0012, + "step": 2620 + }, + { + "epoch": 0.5244, + "learning_rate": 4.281404484177984e-06, + "loss": 0.0137, + "step": 2622 + }, + { + "epoch": 0.5248, + "learning_rate": 4.269955793894855e-06, + "loss": 0.1938, + "step": 2624 + }, + { + "epoch": 0.5252, + "learning_rate": 4.258518274626112e-06, + "loss": 0.3781, + "step": 2626 + }, + { + "epoch": 0.5256, + "learning_rate": 4.24709194866977e-06, + "loss": 0.0219, + "step": 2628 + }, + { + "epoch": 0.526, + "learning_rate": 4.235676838302077e-06, + "loss": 0.0578, + "step": 2630 + }, + { + "epoch": 0.5264, + "learning_rate": 4.224272965777321e-06, + "loss": 0.0025, + "step": 2632 + }, + { + "epoch": 0.5268, + "learning_rate": 4.212880353327975e-06, + "loss": 0.0033, + "step": 2634 + }, + { + "epoch": 0.5272, + "learning_rate": 4.20149902316452e-06, + "loss": 0.03, + "step": 2636 + }, + { + "epoch": 0.5276, + "learning_rate": 4.1901289974754014e-06, + "loss": 0.0072, + "step": 2638 + }, + { + "epoch": 0.528, + "learning_rate": 4.178770298427119e-06, + "loss": 0.0214, + "step": 2640 + }, + { + "epoch": 0.5284, + "learning_rate": 4.167422948163985e-06, + "loss": 0.453, + "step": 2642 + }, + { + "epoch": 0.5288, + "learning_rate": 4.1560869688082796e-06, + "loss": 0.0127, + "step": 2644 + }, + { + "epoch": 0.5292, + "learning_rate": 4.14476238246006e-06, + "loss": 0.5087, + "step": 2646 + }, + { + "epoch": 0.5296, + "learning_rate": 4.133449211197189e-06, + "loss": 0.0679, + "step": 2648 + }, + { + "epoch": 0.53, + "learning_rate": 4.122147477075271e-06, + "loss": 0.0071, + "step": 2650 + }, + { + "epoch": 0.5304, + "learning_rate": 4.110857202127616e-06, + "loss": 0.0007, + "step": 2652 + }, + { + "epoch": 0.5308, + "learning_rate": 4.0995784083651925e-06, + "loss": 0.0188, + "step": 2654 + }, + { + "epoch": 0.5312, + "learning_rate": 4.088311117776585e-06, + "loss": 0.0576, + "step": 2656 + }, + { + "epoch": 0.5316, + "learning_rate": 4.077055352327955e-06, + "loss": 0.0105, + "step": 2658 + }, + { + "epoch": 0.532, + "learning_rate": 4.065811133962992e-06, + "loss": 0.0133, + "step": 2660 + }, + { + "epoch": 0.5324, + "learning_rate": 4.054578484602876e-06, + "loss": 0.0142, + "step": 2662 + }, + { + "epoch": 0.5328, + "learning_rate": 4.043357426146215e-06, + "loss": 0.022, + "step": 2664 + }, + { + "epoch": 0.5332, + "learning_rate": 4.032147980469081e-06, + "loss": 0.8566, + "step": 2666 + }, + { + "epoch": 0.5336, + "learning_rate": 4.020950169424807e-06, + "loss": 0.2143, + "step": 2668 + }, + { + "epoch": 0.534, + "learning_rate": 4.009764014844152e-06, + "loss": 0.3086, + "step": 2670 + }, + { + "epoch": 0.5344, + "learning_rate": 3.9985895385350556e-06, + "loss": 0.4878, + "step": 2672 + }, + { + "epoch": 0.5348, + "learning_rate": 3.9874267622827316e-06, + "loss": 0.0104, + "step": 2674 + }, + { + "epoch": 0.5352, + "learning_rate": 3.976275707849625e-06, + "loss": 0.0079, + "step": 2676 + }, + { + "epoch": 0.5356, + "learning_rate": 3.965136396975232e-06, + "loss": 0.0033, + "step": 2678 + }, + { + "epoch": 0.536, + "learning_rate": 3.954008851376251e-06, + "loss": 0.4278, + "step": 2680 + }, + { + "epoch": 0.5364, + "learning_rate": 3.942893092746386e-06, + "loss": 0.0073, + "step": 2682 + }, + { + "epoch": 0.5368, + "learning_rate": 3.931789142756379e-06, + "loss": 0.4347, + "step": 2684 + }, + { + "epoch": 0.5372, + "learning_rate": 3.920697023053948e-06, + "loss": 0.0025, + "step": 2686 + }, + { + "epoch": 0.5376, + "learning_rate": 3.909616755263746e-06, + "loss": 0.1917, + "step": 2688 + }, + { + "epoch": 0.538, + "learning_rate": 3.898548360987326e-06, + "loss": 0.0013, + "step": 2690 + }, + { + "epoch": 0.5384, + "learning_rate": 3.887491861803087e-06, + "loss": 0.072, + "step": 2692 + }, + { + "epoch": 0.5388, + "learning_rate": 3.8764472792662385e-06, + "loss": 0.8614, + "step": 2694 + }, + { + "epoch": 0.5392, + "learning_rate": 3.865414634908762e-06, + "loss": 0.008, + "step": 2696 + }, + { + "epoch": 0.5396, + "learning_rate": 3.8543939502393615e-06, + "loss": 0.0058, + "step": 2698 + }, + { + "epoch": 0.54, + "learning_rate": 3.8433852467434226e-06, + "loss": 0.0507, + "step": 2700 + }, + { + "epoch": 0.5404, + "learning_rate": 3.83238854588298e-06, + "loss": 0.0032, + "step": 2702 + }, + { + "epoch": 0.5408, + "learning_rate": 3.821403869096651e-06, + "loss": 0.0099, + "step": 2704 + }, + { + "epoch": 0.5412, + "learning_rate": 3.8104312377996623e-06, + "loss": 0.2467, + "step": 2706 + }, + { + "epoch": 0.5416, + "learning_rate": 3.7994706733836827e-06, + "loss": 0.2366, + "step": 2708 + }, + { + "epoch": 0.542, + "learning_rate": 3.788522197216892e-06, + "loss": 0.0588, + "step": 2710 + }, + { + "epoch": 0.5424, + "learning_rate": 3.7775858306439463e-06, + "loss": 0.0292, + "step": 2712 + }, + { + "epoch": 0.5428, + "learning_rate": 3.7666615949857953e-06, + "loss": 0.0052, + "step": 2714 + }, + { + "epoch": 0.5432, + "learning_rate": 3.7557495115398535e-06, + "loss": 0.0081, + "step": 2716 + }, + { + "epoch": 0.5436, + "learning_rate": 3.7448496015797275e-06, + "loss": 0.0013, + "step": 2718 + }, + { + "epoch": 0.544, + "learning_rate": 3.733961886355394e-06, + "loss": 0.1362, + "step": 2720 + }, + { + "epoch": 0.5444, + "learning_rate": 3.7230863870929947e-06, + "loss": 0.0042, + "step": 2722 + }, + { + "epoch": 0.5448, + "learning_rate": 3.7122231249948726e-06, + "loss": 0.0021, + "step": 2724 + }, + { + "epoch": 0.5452, + "learning_rate": 3.7013721212395136e-06, + "loss": 0.0029, + "step": 2726 + }, + { + "epoch": 0.5456, + "learning_rate": 3.690533396981505e-06, + "loss": 0.1235, + "step": 2728 + }, + { + "epoch": 0.546, + "learning_rate": 3.6797069733514936e-06, + "loss": 0.1934, + "step": 2730 + }, + { + "epoch": 0.5464, + "learning_rate": 3.6688928714561465e-06, + "loss": 0.0059, + "step": 2732 + }, + { + "epoch": 0.5468, + "learning_rate": 3.658091112378108e-06, + "loss": 0.0006, + "step": 2734 + }, + { + "epoch": 0.5472, + "learning_rate": 3.6473017171759618e-06, + "loss": 0.1533, + "step": 2736 + }, + { + "epoch": 0.5476, + "learning_rate": 3.6365247068841837e-06, + "loss": 0.006, + "step": 2738 + }, + { + "epoch": 0.548, + "learning_rate": 3.6257601025130953e-06, + "loss": 0.0026, + "step": 2740 + }, + { + "epoch": 0.5484, + "learning_rate": 3.6150079250488822e-06, + "loss": 0.046, + "step": 2742 + }, + { + "epoch": 0.5488, + "learning_rate": 3.6042681954534264e-06, + "loss": 0.0022, + "step": 2744 + }, + { + "epoch": 0.5492, + "learning_rate": 3.5935409346643923e-06, + "loss": 0.0079, + "step": 2746 + }, + { + "epoch": 0.5496, + "learning_rate": 3.582826163595123e-06, + "loss": 0.0029, + "step": 2748 + }, + { + "epoch": 0.55, + "learning_rate": 3.572123903134602e-06, + "loss": 0.18, + "step": 2750 + }, + { + "epoch": 0.5504, + "learning_rate": 3.5614341741474722e-06, + "loss": 0.0218, + "step": 2752 + }, + { + "epoch": 0.5508, + "learning_rate": 3.5507569974738532e-06, + "loss": 0.002, + "step": 2754 + }, + { + "epoch": 0.5512, + "learning_rate": 3.5400923939294883e-06, + "loss": 0.0112, + "step": 2756 + }, + { + "epoch": 0.5516, + "learning_rate": 3.529440384305555e-06, + "loss": 0.2137, + "step": 2758 + }, + { + "epoch": 0.552, + "learning_rate": 3.5188009893686903e-06, + "loss": 0.3312, + "step": 2760 + }, + { + "epoch": 0.5524, + "learning_rate": 3.5081742298609524e-06, + "loss": 0.1612, + "step": 2762 + }, + { + "epoch": 0.5528, + "learning_rate": 3.4975601264997115e-06, + "loss": 0.1436, + "step": 2764 + }, + { + "epoch": 0.5532, + "learning_rate": 3.4869586999777484e-06, + "loss": 0.0137, + "step": 2766 + }, + { + "epoch": 0.5536, + "learning_rate": 3.4763699709630705e-06, + "loss": 0.0026, + "step": 2768 + }, + { + "epoch": 0.554, + "learning_rate": 3.4657939600989475e-06, + "loss": 0.0014, + "step": 2770 + }, + { + "epoch": 0.5544, + "learning_rate": 3.4552306880038543e-06, + "loss": 0.0151, + "step": 2772 + }, + { + "epoch": 0.5548, + "learning_rate": 3.44468017527143e-06, + "loss": 0.0127, + "step": 2774 + }, + { + "epoch": 0.5552, + "learning_rate": 3.4341424424704394e-06, + "loss": 0.1252, + "step": 2776 + }, + { + "epoch": 0.5556, + "learning_rate": 3.423617510144731e-06, + "loss": 0.0027, + "step": 2778 + }, + { + "epoch": 0.556, + "learning_rate": 3.4131053988132e-06, + "loss": 0.004, + "step": 2780 + }, + { + "epoch": 0.5564, + "learning_rate": 3.4026061289697443e-06, + "loss": 0.0185, + "step": 2782 + }, + { + "epoch": 0.5568, + "learning_rate": 3.392119721083229e-06, + "loss": 0.0103, + "step": 2784 + }, + { + "epoch": 0.5572, + "learning_rate": 3.381646195597428e-06, + "loss": 0.0099, + "step": 2786 + }, + { + "epoch": 0.5576, + "learning_rate": 3.3711855729310563e-06, + "loss": 0.0055, + "step": 2788 + }, + { + "epoch": 0.558, + "learning_rate": 3.3607378734775787e-06, + "loss": 0.0038, + "step": 2790 + }, + { + "epoch": 0.5584, + "learning_rate": 3.3503031176053746e-06, + "loss": 0.017, + "step": 2792 + }, + { + "epoch": 0.5588, + "learning_rate": 3.3398813256574804e-06, + "loss": 0.0032, + "step": 2794 + }, + { + "epoch": 0.5592, + "learning_rate": 3.329472517951752e-06, + "loss": 0.0018, + "step": 2796 + }, + { + "epoch": 0.5596, + "learning_rate": 3.3190767147806947e-06, + "loss": 0.002, + "step": 2798 + }, + { + "epoch": 0.56, + "learning_rate": 3.3086939364114155e-06, + "loss": 0.0099, + "step": 2800 + }, + { + "epoch": 0.5604, + "learning_rate": 3.2983242030857287e-06, + "loss": 0.0085, + "step": 2802 + }, + { + "epoch": 0.5608, + "learning_rate": 3.287967535019906e-06, + "loss": 0.0252, + "step": 2804 + }, + { + "epoch": 0.5612, + "learning_rate": 3.2776239524048405e-06, + "loss": 0.0029, + "step": 2806 + }, + { + "epoch": 0.5616, + "learning_rate": 3.2672934754058637e-06, + "loss": 0.0071, + "step": 2808 + }, + { + "epoch": 0.562, + "learning_rate": 3.2569761241627672e-06, + "loss": 0.5598, + "step": 2810 + }, + { + "epoch": 0.5624, + "learning_rate": 3.2466719187897576e-06, + "loss": 0.4035, + "step": 2812 + }, + { + "epoch": 0.5628, + "learning_rate": 3.236380879375409e-06, + "loss": 0.0011, + "step": 2814 + }, + { + "epoch": 0.5632, + "learning_rate": 3.226103025982631e-06, + "loss": 0.1521, + "step": 2816 + }, + { + "epoch": 0.5636, + "learning_rate": 3.2158383786486225e-06, + "loss": 0.0194, + "step": 2818 + }, + { + "epoch": 0.564, + "learning_rate": 3.2055869573848395e-06, + "loss": 0.0012, + "step": 2820 + }, + { + "epoch": 0.5644, + "learning_rate": 3.1953487821769522e-06, + "loss": 0.0068, + "step": 2822 + }, + { + "epoch": 0.5648, + "learning_rate": 3.1851238729848088e-06, + "loss": 0.7723, + "step": 2824 + }, + { + "epoch": 0.5652, + "learning_rate": 3.174912249742378e-06, + "loss": 0.0058, + "step": 2826 + }, + { + "epoch": 0.5656, + "learning_rate": 3.1647139323577813e-06, + "loss": 0.0013, + "step": 2828 + }, + { + "epoch": 0.566, + "learning_rate": 3.1545289407131086e-06, + "loss": 0.2129, + "step": 2830 + }, + { + "epoch": 0.5664, + "learning_rate": 3.144357294664574e-06, + "loss": 0.0047, + "step": 2832 + }, + { + "epoch": 0.5668, + "learning_rate": 3.134199014042283e-06, + "loss": 0.0015, + "step": 2834 + }, + { + "epoch": 0.5672, + "learning_rate": 3.1240541186503228e-06, + "loss": 0.1139, + "step": 2836 + }, + { + "epoch": 0.5676, + "learning_rate": 3.1139226282667267e-06, + "loss": 0.0077, + "step": 2838 + }, + { + "epoch": 0.568, + "learning_rate": 3.1038045626433e-06, + "loss": 0.0368, + "step": 2840 + }, + { + "epoch": 0.5684, + "learning_rate": 3.09369994150577e-06, + "loss": 0.1915, + "step": 2842 + }, + { + "epoch": 0.5688, + "learning_rate": 3.083608784553598e-06, + "loss": 0.0052, + "step": 2844 + }, + { + "epoch": 0.5692, + "learning_rate": 3.073531111460012e-06, + "loss": 0.0008, + "step": 2846 + }, + { + "epoch": 0.5696, + "learning_rate": 3.0634669418719508e-06, + "loss": 0.0035, + "step": 2848 + }, + { + "epoch": 0.57, + "learning_rate": 3.053416295410029e-06, + "loss": 0.0148, + "step": 2850 + }, + { + "epoch": 0.5704, + "learning_rate": 3.043379191668494e-06, + "loss": 0.0013, + "step": 2852 + }, + { + "epoch": 0.5708, + "learning_rate": 3.033355650215195e-06, + "loss": 0.0121, + "step": 2854 + }, + { + "epoch": 0.5712, + "learning_rate": 3.023345690591538e-06, + "loss": 0.0007, + "step": 2856 + }, + { + "epoch": 0.5716, + "learning_rate": 3.013349332312453e-06, + "loss": 0.0044, + "step": 2858 + }, + { + "epoch": 0.572, + "learning_rate": 3.0033665948663504e-06, + "loss": 0.0039, + "step": 2860 + }, + { + "epoch": 0.5724, + "learning_rate": 2.9933974977150882e-06, + "loss": 0.0644, + "step": 2862 + }, + { + "epoch": 0.5728, + "learning_rate": 2.983442060293931e-06, + "loss": 0.023, + "step": 2864 + }, + { + "epoch": 0.5732, + "learning_rate": 2.9735003020115015e-06, + "loss": 0.1772, + "step": 2866 + }, + { + "epoch": 0.5736, + "learning_rate": 2.9635722422498038e-06, + "loss": 0.0068, + "step": 2868 + }, + { + "epoch": 0.574, + "learning_rate": 2.953657900364061e-06, + "loss": 0.0031, + "step": 2870 + }, + { + "epoch": 0.5744, + "learning_rate": 2.9437572956827885e-06, + "loss": 0.1387, + "step": 2872 + }, + { + "epoch": 0.5748, + "learning_rate": 2.9338704475077616e-06, + "loss": 0.0002, + "step": 2874 + }, + { + "epoch": 0.5752, + "learning_rate": 2.9239973751138452e-06, + "loss": 0.0245, + "step": 2876 + }, + { + "epoch": 0.5756, + "learning_rate": 2.9141380977491483e-06, + "loss": 0.0207, + "step": 2878 + }, + { + "epoch": 0.576, + "learning_rate": 2.904292634634789e-06, + "loss": 0.003, + "step": 2880 + }, + { + "epoch": 0.5764, + "learning_rate": 2.894461004965037e-06, + "loss": 0.0017, + "step": 2882 + }, + { + "epoch": 0.5768, + "learning_rate": 2.884643227907158e-06, + "loss": 0.02, + "step": 2884 + }, + { + "epoch": 0.5772, + "learning_rate": 2.8748393226013736e-06, + "loss": 0.0114, + "step": 2886 + }, + { + "epoch": 0.5776, + "learning_rate": 2.8650493081609365e-06, + "loss": 0.0041, + "step": 2888 + }, + { + "epoch": 0.578, + "learning_rate": 2.8552732036719676e-06, + "loss": 0.0003, + "step": 2890 + }, + { + "epoch": 0.5784, + "learning_rate": 2.845511028193483e-06, + "loss": 0.0027, + "step": 2892 + }, + { + "epoch": 0.5788, + "learning_rate": 2.8357628007573434e-06, + "loss": 0.2252, + "step": 2894 + }, + { + "epoch": 0.5792, + "learning_rate": 2.8260285403682166e-06, + "loss": 0.0089, + "step": 2896 + }, + { + "epoch": 0.5796, + "learning_rate": 2.8163082660035425e-06, + "loss": 0.0076, + "step": 2898 + }, + { + "epoch": 0.58, + "learning_rate": 2.806601996613493e-06, + "loss": 0.001, + "step": 2900 + }, + { + "epoch": 0.5804, + "learning_rate": 2.7969097511209353e-06, + "loss": 0.0018, + "step": 2902 + }, + { + "epoch": 0.5808, + "learning_rate": 2.787231548421401e-06, + "loss": 0.0299, + "step": 2904 + }, + { + "epoch": 0.5812, + "learning_rate": 2.7775674073830384e-06, + "loss": 0.0157, + "step": 2906 + }, + { + "epoch": 0.5816, + "learning_rate": 2.767917346846586e-06, + "loss": 0.0044, + "step": 2908 + }, + { + "epoch": 0.582, + "learning_rate": 2.7582813856253323e-06, + "loss": 0.004, + "step": 2910 + }, + { + "epoch": 0.5824, + "learning_rate": 2.748659542505062e-06, + "loss": 0.005, + "step": 2912 + }, + { + "epoch": 0.5828, + "learning_rate": 2.7390518362440886e-06, + "loss": 0.2294, + "step": 2914 + }, + { + "epoch": 0.5832, + "learning_rate": 2.729458285573079e-06, + "loss": 0.0012, + "step": 2916 + }, + { + "epoch": 0.5836, + "learning_rate": 2.719878909195186e-06, + "loss": 0.0915, + "step": 2918 + }, + { + "epoch": 0.584, + "learning_rate": 2.7103137257858948e-06, + "loss": 0.0009, + "step": 2920 + }, + { + "epoch": 0.5844, + "learning_rate": 2.700762753992984e-06, + "loss": 0.0009, + "step": 2922 + }, + { + "epoch": 0.5848, + "learning_rate": 2.6912260124366087e-06, + "loss": 0.0041, + "step": 2924 + }, + { + "epoch": 0.5852, + "learning_rate": 2.681703519709088e-06, + "loss": 0.1642, + "step": 2926 + }, + { + "epoch": 0.5856, + "learning_rate": 2.672195294375044e-06, + "loss": 0.0008, + "step": 2928 + }, + { + "epoch": 0.586, + "learning_rate": 2.6627013549712368e-06, + "loss": 0.0039, + "step": 2930 + }, + { + "epoch": 0.5864, + "learning_rate": 2.653221720006587e-06, + "loss": 0.003, + "step": 2932 + }, + { + "epoch": 0.5868, + "learning_rate": 2.643756407962128e-06, + "loss": 0.0101, + "step": 2934 + }, + { + "epoch": 0.5872, + "learning_rate": 2.6343054372909703e-06, + "loss": 0.0313, + "step": 2936 + }, + { + "epoch": 0.5876, + "learning_rate": 2.6248688264182643e-06, + "loss": 0.0089, + "step": 2938 + }, + { + "epoch": 0.588, + "learning_rate": 2.6154465937411656e-06, + "loss": 0.0345, + "step": 2940 + }, + { + "epoch": 0.5884, + "learning_rate": 2.606038757628799e-06, + "loss": 0.0025, + "step": 2942 + }, + { + "epoch": 0.5888, + "learning_rate": 2.5966453364222234e-06, + "loss": 0.03, + "step": 2944 + }, + { + "epoch": 0.5892, + "learning_rate": 2.587266348434394e-06, + "loss": 0.098, + "step": 2946 + }, + { + "epoch": 0.5896, + "learning_rate": 2.577901811950114e-06, + "loss": 0.0021, + "step": 2948 + }, + { + "epoch": 0.59, + "learning_rate": 2.5685517452260642e-06, + "loss": 0.0431, + "step": 2950 + }, + { + "epoch": 0.5904, + "learning_rate": 2.559216166490629e-06, + "loss": 0.5247, + "step": 2952 + }, + { + "epoch": 0.5908, + "learning_rate": 2.549895093944046e-06, + "loss": 0.0151, + "step": 2954 + }, + { + "epoch": 0.5912, + "learning_rate": 2.540588545758187e-06, + "loss": 0.0243, + "step": 2956 + }, + { + "epoch": 0.5916, + "learning_rate": 2.5312965400766433e-06, + "loss": 0.2673, + "step": 2958 + }, + { + "epoch": 0.592, + "learning_rate": 2.5220190950146906e-06, + "loss": 0.2361, + "step": 2960 + }, + { + "epoch": 0.5924, + "learning_rate": 2.512756228659137e-06, + "loss": 0.0484, + "step": 2962 + }, + { + "epoch": 0.5928, + "learning_rate": 2.5035079590684598e-06, + "loss": 0.001, + "step": 2964 + }, + { + "epoch": 0.5932, + "learning_rate": 2.4942743042725883e-06, + "loss": 0.1469, + "step": 2966 + }, + { + "epoch": 0.5936, + "learning_rate": 2.485055282273039e-06, + "loss": 0.0117, + "step": 2968 + }, + { + "epoch": 0.594, + "learning_rate": 2.4758509110427563e-06, + "loss": 0.0027, + "step": 2970 + }, + { + "epoch": 0.5944, + "learning_rate": 2.466661208526133e-06, + "loss": 0.0005, + "step": 2972 + }, + { + "epoch": 0.5948, + "learning_rate": 2.4574861926389636e-06, + "loss": 0.0011, + "step": 2974 + }, + { + "epoch": 0.5952, + "learning_rate": 2.448325881268411e-06, + "loss": 0.0018, + "step": 2976 + }, + { + "epoch": 0.5956, + "learning_rate": 2.439180292272971e-06, + "loss": 0.0016, + "step": 2978 + }, + { + "epoch": 0.596, + "learning_rate": 2.4300494434824385e-06, + "loss": 0.0597, + "step": 2980 + }, + { + "epoch": 0.5964, + "learning_rate": 2.4209333526978694e-06, + "loss": 0.0088, + "step": 2982 + }, + { + "epoch": 0.5968, + "learning_rate": 2.4118320376915495e-06, + "loss": 0.0025, + "step": 2984 + }, + { + "epoch": 0.5972, + "learning_rate": 2.402745516206958e-06, + "loss": 0.0023, + "step": 2986 + }, + { + "epoch": 0.5976, + "learning_rate": 2.3936738059587217e-06, + "loss": 0.0026, + "step": 2988 + }, + { + "epoch": 0.598, + "learning_rate": 2.3846169246326387e-06, + "loss": 0.1479, + "step": 2990 + }, + { + "epoch": 0.5984, + "learning_rate": 2.375574889885528e-06, + "loss": 0.0011, + "step": 2992 + }, + { + "epoch": 0.5988, + "learning_rate": 2.36654771934531e-06, + "loss": 0.0009, + "step": 2994 + }, + { + "epoch": 0.5992, + "learning_rate": 2.3575354306109177e-06, + "loss": 0.0002, + "step": 2996 + }, + { + "epoch": 0.5996, + "learning_rate": 2.348538041252255e-06, + "loss": 0.7103, + "step": 2998 + }, + { + "epoch": 0.6, + "learning_rate": 2.339555568810228e-06, + "loss": 0.0029, + "step": 3000 + }, + { + "epoch": 0.6004, + "learning_rate": 2.330588030796582e-06, + "loss": 0.0014, + "step": 3002 + }, + { + "epoch": 0.6008, + "learning_rate": 2.321635444694024e-06, + "loss": 0.2558, + "step": 3004 + }, + { + "epoch": 0.6012, + "learning_rate": 2.3126978279560674e-06, + "loss": 0.0435, + "step": 3006 + }, + { + "epoch": 0.6016, + "learning_rate": 2.3037751980070544e-06, + "loss": 0.0023, + "step": 3008 + }, + { + "epoch": 0.602, + "learning_rate": 2.294867572242119e-06, + "loss": 0.0859, + "step": 3010 + }, + { + "epoch": 0.6024, + "learning_rate": 2.2859749680271004e-06, + "loss": 0.0976, + "step": 3012 + }, + { + "epoch": 0.6028, + "learning_rate": 2.2770974026986182e-06, + "loss": 0.0036, + "step": 3014 + }, + { + "epoch": 0.6032, + "learning_rate": 2.26823489356393e-06, + "loss": 0.0024, + "step": 3016 + }, + { + "epoch": 0.6036, + "learning_rate": 2.2593874579009488e-06, + "loss": 0.0008, + "step": 3018 + }, + { + "epoch": 0.604, + "learning_rate": 2.250555112958207e-06, + "loss": 0.0011, + "step": 3020 + }, + { + "epoch": 0.6044, + "learning_rate": 2.24173787595481e-06, + "loss": 0.0006, + "step": 3022 + }, + { + "epoch": 0.6048, + "learning_rate": 2.232935764080414e-06, + "loss": 0.0015, + "step": 3024 + }, + { + "epoch": 0.6052, + "learning_rate": 2.2241487944951846e-06, + "loss": 0.0008, + "step": 3026 + }, + { + "epoch": 0.6056, + "learning_rate": 2.215376984329771e-06, + "loss": 0.0054, + "step": 3028 + }, + { + "epoch": 0.606, + "learning_rate": 2.2066203506852646e-06, + "loss": 0.3544, + "step": 3030 + }, + { + "epoch": 0.6064, + "learning_rate": 2.197878910633171e-06, + "loss": 0.0086, + "step": 3032 + }, + { + "epoch": 0.6068, + "learning_rate": 2.189152681215363e-06, + "loss": 0.0044, + "step": 3034 + }, + { + "epoch": 0.6072, + "learning_rate": 2.180441679444106e-06, + "loss": 0.1111, + "step": 3036 + }, + { + "epoch": 0.6076, + "learning_rate": 2.1717459223019e-06, + "loss": 0.0406, + "step": 3038 + }, + { + "epoch": 0.608, + "learning_rate": 2.1630654267416106e-06, + "loss": 0.0125, + "step": 3040 + }, + { + "epoch": 0.6084, + "learning_rate": 2.1544002096862647e-06, + "loss": 0.091, + "step": 3042 + }, + { + "epoch": 0.6088, + "learning_rate": 2.1457502880291802e-06, + "loss": 0.0258, + "step": 3044 + }, + { + "epoch": 0.6092, + "learning_rate": 2.137115678633821e-06, + "loss": 0.0025, + "step": 3046 + }, + { + "epoch": 0.6096, + "learning_rate": 2.128496398333767e-06, + "loss": 0.003, + "step": 3048 + }, + { + "epoch": 0.61, + "learning_rate": 2.11989246393278e-06, + "loss": 0.0075, + "step": 3050 + }, + { + "epoch": 0.6104, + "learning_rate": 2.111303892204659e-06, + "loss": 0.0053, + "step": 3052 + }, + { + "epoch": 0.6108, + "learning_rate": 2.102730699893264e-06, + "loss": 0.0043, + "step": 3054 + }, + { + "epoch": 0.6112, + "learning_rate": 2.0941729037124723e-06, + "loss": 0.0042, + "step": 3056 + }, + { + "epoch": 0.6116, + "learning_rate": 2.085630520346145e-06, + "loss": 1.9227, + "step": 3058 + }, + { + "epoch": 0.612, + "learning_rate": 2.0771035664480953e-06, + "loss": 0.0006, + "step": 3060 + }, + { + "epoch": 0.6124, + "learning_rate": 2.068592058642057e-06, + "loss": 0.0168, + "step": 3062 + }, + { + "epoch": 0.6128, + "learning_rate": 2.060096013521651e-06, + "loss": 0.0017, + "step": 3064 + }, + { + "epoch": 0.6132, + "learning_rate": 2.051615447650349e-06, + "loss": 0.0862, + "step": 3066 + }, + { + "epoch": 0.6136, + "learning_rate": 2.0431503775614504e-06, + "loss": 0.001, + "step": 3068 + }, + { + "epoch": 0.614, + "learning_rate": 2.034700819758042e-06, + "loss": 0.0057, + "step": 3070 + }, + { + "epoch": 0.6144, + "learning_rate": 2.0262667907129697e-06, + "loss": 0.0385, + "step": 3072 + }, + { + "epoch": 0.6148, + "learning_rate": 2.0178483068687925e-06, + "loss": 0.0308, + "step": 3074 + }, + { + "epoch": 0.6152, + "learning_rate": 2.009445384637809e-06, + "loss": 0.1353, + "step": 3076 + }, + { + "epoch": 0.6156, + "learning_rate": 2.001058040401903e-06, + "loss": 0.0021, + "step": 3078 + }, + { + "epoch": 0.616, + "learning_rate": 1.992686290512662e-06, + "loss": 0.0009, + "step": 3080 + }, + { + "epoch": 0.6164, + "learning_rate": 1.9843301512912425e-06, + "loss": 0.0006, + "step": 3082 + }, + { + "epoch": 0.6168, + "learning_rate": 1.975989639028333e-06, + "loss": 0.0002, + "step": 3084 + }, + { + "epoch": 0.6172, + "learning_rate": 1.967664769984231e-06, + "loss": 0.0129, + "step": 3086 + }, + { + "epoch": 0.6176, + "learning_rate": 1.9593555603886526e-06, + "loss": 0.0002, + "step": 3088 + }, + { + "epoch": 0.618, + "learning_rate": 1.9510620264408586e-06, + "loss": 1.5326, + "step": 3090 + }, + { + "epoch": 0.6184, + "learning_rate": 1.9427841843095076e-06, + "loss": 0.0069, + "step": 3092 + }, + { + "epoch": 0.6188, + "learning_rate": 1.9345220501326754e-06, + "loss": 0.0034, + "step": 3094 + }, + { + "epoch": 0.6192, + "learning_rate": 1.926275640017817e-06, + "loss": 0.0071, + "step": 3096 + }, + { + "epoch": 0.6196, + "learning_rate": 1.9180449700417316e-06, + "loss": 0.0009, + "step": 3098 + }, + { + "epoch": 0.62, + "learning_rate": 1.909830056250528e-06, + "loss": 0.0039, + "step": 3100 + }, + { + "epoch": 0.6204, + "learning_rate": 1.9016309146596035e-06, + "loss": 0.2689, + "step": 3102 + }, + { + "epoch": 0.6208, + "learning_rate": 1.893447561253604e-06, + "loss": 0.8079, + "step": 3104 + }, + { + "epoch": 0.6212, + "learning_rate": 1.8852800119863946e-06, + "loss": 0.0408, + "step": 3106 + }, + { + "epoch": 0.6216, + "learning_rate": 1.8771282827810322e-06, + "loss": 0.0926, + "step": 3108 + }, + { + "epoch": 0.622, + "learning_rate": 1.868992389529718e-06, + "loss": 0.0137, + "step": 3110 + }, + { + "epoch": 0.6224, + "learning_rate": 1.860872348093824e-06, + "loss": 0.3767, + "step": 3112 + }, + { + "epoch": 0.6228, + "learning_rate": 1.8527681743037452e-06, + "loss": 0.0583, + "step": 3114 + }, + { + "epoch": 0.6232, + "learning_rate": 1.8446798839590252e-06, + "loss": 0.0037, + "step": 3116 + }, + { + "epoch": 0.6236, + "learning_rate": 1.8366074928281674e-06, + "loss": 0.0052, + "step": 3118 + }, + { + "epoch": 0.624, + "learning_rate": 1.828551016648712e-06, + "loss": 0.0076, + "step": 3120 + }, + { + "epoch": 0.6244, + "learning_rate": 1.8205104711272025e-06, + "loss": 0.1665, + "step": 3122 + }, + { + "epoch": 0.6248, + "learning_rate": 1.8124858719390526e-06, + "loss": 0.001, + "step": 3124 + }, + { + "epoch": 0.6252, + "learning_rate": 1.80447723472866e-06, + "loss": 0.1499, + "step": 3126 + }, + { + "epoch": 0.6256, + "learning_rate": 1.796484575109263e-06, + "loss": 0.0074, + "step": 3128 + }, + { + "epoch": 0.626, + "learning_rate": 1.7885079086629587e-06, + "loss": 0.0053, + "step": 3130 + }, + { + "epoch": 0.6264, + "learning_rate": 1.7805472509406784e-06, + "loss": 0.0069, + "step": 3132 + }, + { + "epoch": 0.6268, + "learning_rate": 1.7726026174621014e-06, + "loss": 0.0419, + "step": 3134 + }, + { + "epoch": 0.6272, + "learning_rate": 1.7646740237157267e-06, + "loss": 0.012, + "step": 3136 + }, + { + "epoch": 0.6276, + "learning_rate": 1.7567614851587433e-06, + "loss": 0.0022, + "step": 3138 + }, + { + "epoch": 0.628, + "learning_rate": 1.7488650172170506e-06, + "loss": 0.0013, + "step": 3140 + }, + { + "epoch": 0.6284, + "learning_rate": 1.7409846352852167e-06, + "loss": 0.4145, + "step": 3142 + }, + { + "epoch": 0.6288, + "learning_rate": 1.7331203547264486e-06, + "loss": 0.0017, + "step": 3144 + }, + { + "epoch": 0.6292, + "learning_rate": 1.7252721908725656e-06, + "loss": 0.0033, + "step": 3146 + }, + { + "epoch": 0.6296, + "learning_rate": 1.717440159023962e-06, + "loss": 0.0055, + "step": 3148 + }, + { + "epoch": 0.63, + "learning_rate": 1.7096242744495872e-06, + "loss": 0.0283, + "step": 3150 + }, + { + "epoch": 0.6304, + "learning_rate": 1.701824552386907e-06, + "loss": 0.0007, + "step": 3152 + }, + { + "epoch": 0.6308, + "learning_rate": 1.694041008041879e-06, + "loss": 0.0041, + "step": 3154 + }, + { + "epoch": 0.6312, + "learning_rate": 1.6862736565889105e-06, + "loss": 0.0012, + "step": 3156 + }, + { + "epoch": 0.6316, + "learning_rate": 1.6785225131708816e-06, + "loss": 0.1999, + "step": 3158 + }, + { + "epoch": 0.632, + "learning_rate": 1.6707875928990025e-06, + "loss": 0.002, + "step": 3160 + }, + { + "epoch": 0.6324, + "learning_rate": 1.6630689108529352e-06, + "loss": 0.0158, + "step": 3162 + }, + { + "epoch": 0.6328, + "learning_rate": 1.655366482080607e-06, + "loss": 0.0006, + "step": 3164 + }, + { + "epoch": 0.6332, + "learning_rate": 1.647680321598325e-06, + "loss": 0.0016, + "step": 3166 + }, + { + "epoch": 0.6336, + "learning_rate": 1.6400104443906539e-06, + "loss": 0.0012, + "step": 3168 + }, + { + "epoch": 0.634, + "learning_rate": 1.6323568654103827e-06, + "loss": 0.0069, + "step": 3170 + }, + { + "epoch": 0.6344, + "learning_rate": 1.6247195995785925e-06, + "loss": 0.0014, + "step": 3172 + }, + { + "epoch": 0.6348, + "learning_rate": 1.6170986617844853e-06, + "loss": 0.0009, + "step": 3174 + }, + { + "epoch": 0.6352, + "learning_rate": 1.6094940668854998e-06, + "loss": 0.0031, + "step": 3176 + }, + { + "epoch": 0.6356, + "learning_rate": 1.601905829707172e-06, + "loss": 0.0193, + "step": 3178 + }, + { + "epoch": 0.636, + "learning_rate": 1.5943339650431589e-06, + "loss": 0.2355, + "step": 3180 + }, + { + "epoch": 0.6364, + "learning_rate": 1.5867784876551984e-06, + "loss": 0.0038, + "step": 3182 + }, + { + "epoch": 0.6368, + "learning_rate": 1.5792394122730793e-06, + "loss": 0.0016, + "step": 3184 + }, + { + "epoch": 0.6372, + "learning_rate": 1.5717167535946155e-06, + "loss": 0.1911, + "step": 3186 + }, + { + "epoch": 0.6376, + "learning_rate": 1.5642105262856155e-06, + "loss": 0.2849, + "step": 3188 + }, + { + "epoch": 0.638, + "learning_rate": 1.5567207449798527e-06, + "loss": 0.002, + "step": 3190 + }, + { + "epoch": 0.6384, + "learning_rate": 1.54924742427904e-06, + "loss": 0.0005, + "step": 3192 + }, + { + "epoch": 0.6388, + "learning_rate": 1.5417905787527975e-06, + "loss": 0.0085, + "step": 3194 + }, + { + "epoch": 0.6392, + "learning_rate": 1.5343502229386175e-06, + "loss": 0.0016, + "step": 3196 + }, + { + "epoch": 0.6396, + "learning_rate": 1.5269263713418847e-06, + "loss": 0.0161, + "step": 3198 + }, + { + "epoch": 0.64, + "learning_rate": 1.5195190384357373e-06, + "loss": 0.0171, + "step": 3200 + }, + { + "epoch": 0.6404, + "learning_rate": 1.512128238661189e-06, + "loss": 0.0017, + "step": 3202 + }, + { + "epoch": 0.6408, + "learning_rate": 1.5047539864269534e-06, + "loss": 1.1593, + "step": 3204 + }, + { + "epoch": 0.6412, + "learning_rate": 1.4973962961095122e-06, + "loss": 0.009, + "step": 3206 + }, + { + "epoch": 0.6416, + "learning_rate": 1.4900551820530895e-06, + "loss": 0.0066, + "step": 3208 + }, + { + "epoch": 0.642, + "learning_rate": 1.4827306585695234e-06, + "loss": 0.0015, + "step": 3210 + }, + { + "epoch": 0.6424, + "learning_rate": 1.4754227399383748e-06, + "loss": 0.0065, + "step": 3212 + }, + { + "epoch": 0.6428, + "learning_rate": 1.468131440406797e-06, + "loss": 0.0095, + "step": 3214 + }, + { + "epoch": 0.6432, + "learning_rate": 1.4608567741895507e-06, + "loss": 0.0022, + "step": 3216 + }, + { + "epoch": 0.6436, + "learning_rate": 1.4535987554689701e-06, + "loss": 0.3137, + "step": 3218 + }, + { + "epoch": 0.644, + "learning_rate": 1.4463573983949353e-06, + "loss": 0.0024, + "step": 3220 + }, + { + "epoch": 0.6444, + "learning_rate": 1.4391327170848402e-06, + "loss": 0.0011, + "step": 3222 + }, + { + "epoch": 0.6448, + "learning_rate": 1.4319247256235723e-06, + "loss": 0.0006, + "step": 3224 + }, + { + "epoch": 0.6452, + "learning_rate": 1.4247334380634803e-06, + "loss": 0.0808, + "step": 3226 + }, + { + "epoch": 0.6456, + "learning_rate": 1.4175588684243458e-06, + "loss": 0.0037, + "step": 3228 + }, + { + "epoch": 0.646, + "learning_rate": 1.4104010306933592e-06, + "loss": 0.0062, + "step": 3230 + }, + { + "epoch": 0.6464, + "learning_rate": 1.4032599388250934e-06, + "loss": 0.0032, + "step": 3232 + }, + { + "epoch": 0.6468, + "learning_rate": 1.39613560674147e-06, + "loss": 0.0038, + "step": 3234 + }, + { + "epoch": 0.6472, + "learning_rate": 1.389028048331732e-06, + "loss": 0.0207, + "step": 3236 + }, + { + "epoch": 0.6476, + "learning_rate": 1.3819372774524542e-06, + "loss": 0.0075, + "step": 3238 + }, + { + "epoch": 0.648, + "learning_rate": 1.374863307927431e-06, + "loss": 0.0026, + "step": 3240 + }, + { + "epoch": 0.6484, + "learning_rate": 1.3678061535477271e-06, + "loss": 0.198, + "step": 3242 + }, + { + "epoch": 0.6488, + "learning_rate": 1.360765828071653e-06, + "loss": 0.0002, + "step": 3244 + }, + { + "epoch": 0.6492, + "learning_rate": 1.353742345224649e-06, + "loss": 0.0036, + "step": 3246 + }, + { + "epoch": 0.6496, + "learning_rate": 1.3467357186993878e-06, + "loss": 0.0033, + "step": 3248 + }, + { + "epoch": 0.65, + "learning_rate": 1.339745962155612e-06, + "loss": 0.0072, + "step": 3250 + }, + { + "epoch": 0.6504, + "learning_rate": 1.3327730892202373e-06, + "loss": 0.0013, + "step": 3252 + }, + { + "epoch": 0.6508, + "learning_rate": 1.3258171134872254e-06, + "loss": 0.0061, + "step": 3254 + }, + { + "epoch": 0.6512, + "learning_rate": 1.3188780485176078e-06, + "loss": 0.0187, + "step": 3256 + }, + { + "epoch": 0.6516, + "learning_rate": 1.3119559078394473e-06, + "loss": 0.0007, + "step": 3258 + }, + { + "epoch": 0.652, + "learning_rate": 1.3050507049478111e-06, + "loss": 0.0083, + "step": 3260 + }, + { + "epoch": 0.6524, + "learning_rate": 1.2981624533047444e-06, + "loss": 0.0005, + "step": 3262 + }, + { + "epoch": 0.6528, + "learning_rate": 1.291291166339248e-06, + "loss": 0.0014, + "step": 3264 + }, + { + "epoch": 0.6532, + "learning_rate": 1.2844368574472466e-06, + "loss": 0.0223, + "step": 3266 + }, + { + "epoch": 0.6536, + "learning_rate": 1.2775995399915664e-06, + "loss": 0.0087, + "step": 3268 + }, + { + "epoch": 0.654, + "learning_rate": 1.270779227301906e-06, + "loss": 0.0009, + "step": 3270 + }, + { + "epoch": 0.6544, + "learning_rate": 1.263975932674808e-06, + "loss": 0.0085, + "step": 3272 + }, + { + "epoch": 0.6548, + "learning_rate": 1.2571896693736674e-06, + "loss": 0.0012, + "step": 3274 + }, + { + "epoch": 0.6552, + "learning_rate": 1.2504204506286278e-06, + "loss": 0.0029, + "step": 3276 + }, + { + "epoch": 0.6556, + "learning_rate": 1.2436682896366337e-06, + "loss": 0.048, + "step": 3278 + }, + { + "epoch": 0.656, + "learning_rate": 1.2369331995613698e-06, + "loss": 1.6575, + "step": 3280 + }, + { + "epoch": 0.6564, + "learning_rate": 1.2302151935332295e-06, + "loss": 0.0425, + "step": 3282 + }, + { + "epoch": 0.6568, + "learning_rate": 1.2235142846493365e-06, + "loss": 0.1979, + "step": 3284 + }, + { + "epoch": 0.6572, + "learning_rate": 1.2168304859734203e-06, + "loss": 0.0007, + "step": 3286 + }, + { + "epoch": 0.6576, + "learning_rate": 1.2101638105359136e-06, + "loss": 0.0018, + "step": 3288 + }, + { + "epoch": 0.658, + "learning_rate": 1.2035142713338333e-06, + "loss": 0.0716, + "step": 3290 + }, + { + "epoch": 0.6584, + "learning_rate": 1.196881881330797e-06, + "loss": 0.3609, + "step": 3292 + }, + { + "epoch": 0.6588, + "learning_rate": 1.1902666534569962e-06, + "loss": 0.0026, + "step": 3294 + }, + { + "epoch": 0.6592, + "learning_rate": 1.1836686006091302e-06, + "loss": 0.0007, + "step": 3296 + }, + { + "epoch": 0.6596, + "learning_rate": 1.1770877356504673e-06, + "loss": 0.0223, + "step": 3298 + }, + { + "epoch": 0.66, + "learning_rate": 1.1705240714107314e-06, + "loss": 0.004, + "step": 3300 + }, + { + "epoch": 0.6604, + "learning_rate": 1.163977620686121e-06, + "loss": 0.0019, + "step": 3302 + }, + { + "epoch": 0.6608, + "learning_rate": 1.157448396239278e-06, + "loss": 0.0005, + "step": 3304 + }, + { + "epoch": 0.6612, + "learning_rate": 1.1509364107992592e-06, + "loss": 0.0011, + "step": 3306 + }, + { + "epoch": 0.6616, + "learning_rate": 1.144441677061513e-06, + "loss": 0.0056, + "step": 3308 + }, + { + "epoch": 0.662, + "learning_rate": 1.137964207687856e-06, + "loss": 0.011, + "step": 3310 + }, + { + "epoch": 0.6624, + "learning_rate": 1.131504015306445e-06, + "loss": 0.1228, + "step": 3312 + }, + { + "epoch": 0.6628, + "learning_rate": 1.1250611125117561e-06, + "loss": 0.0267, + "step": 3314 + }, + { + "epoch": 0.6632, + "learning_rate": 1.1186355118645586e-06, + "loss": 0.0134, + "step": 3316 + }, + { + "epoch": 0.6636, + "learning_rate": 1.112227225891881e-06, + "loss": 0.3537, + "step": 3318 + }, + { + "epoch": 0.664, + "learning_rate": 1.1058362670870293e-06, + "loss": 0.0252, + "step": 3320 + }, + { + "epoch": 0.6644, + "learning_rate": 1.0994626479094728e-06, + "loss": 0.0085, + "step": 3322 + }, + { + "epoch": 0.6648, + "learning_rate": 1.0931063807849395e-06, + "loss": 0.4518, + "step": 3324 + }, + { + "epoch": 0.6652, + "learning_rate": 1.0867674781052651e-06, + "loss": 0.1067, + "step": 3326 + }, + { + "epoch": 0.6656, + "learning_rate": 1.0804459522284904e-06, + "loss": 0.0078, + "step": 3328 + }, + { + "epoch": 0.666, + "learning_rate": 1.0741418154787519e-06, + "loss": 0.0129, + "step": 3330 + }, + { + "epoch": 0.6664, + "learning_rate": 1.067855080146264e-06, + "loss": 0.0015, + "step": 3332 + }, + { + "epoch": 0.6668, + "learning_rate": 1.0615857584873702e-06, + "loss": 0.0012, + "step": 3334 + }, + { + "epoch": 0.6672, + "learning_rate": 1.0553338627244013e-06, + "loss": 0.005, + "step": 3336 + }, + { + "epoch": 0.6676, + "learning_rate": 1.0490994050457738e-06, + "loss": 0.1467, + "step": 3338 + }, + { + "epoch": 0.668, + "learning_rate": 1.042882397605872e-06, + "loss": 0.2648, + "step": 3340 + }, + { + "epoch": 0.6684, + "learning_rate": 1.0366828525250717e-06, + "loss": 0.0008, + "step": 3342 + }, + { + "epoch": 0.6688, + "learning_rate": 1.0305007818897017e-06, + "loss": 0.1773, + "step": 3344 + }, + { + "epoch": 0.6692, + "learning_rate": 1.0243361977520261e-06, + "loss": 0.0003, + "step": 3346 + }, + { + "epoch": 0.6696, + "learning_rate": 1.0181891121302169e-06, + "loss": 0.2488, + "step": 3348 + }, + { + "epoch": 0.67, + "learning_rate": 1.012059537008333e-06, + "loss": 0.0029, + "step": 3350 + }, + { + "epoch": 0.6704, + "learning_rate": 1.0059474843362927e-06, + "loss": 0.0034, + "step": 3352 + }, + { + "epoch": 0.6708, + "learning_rate": 9.998529660298562e-07, + "loss": 0.0037, + "step": 3354 + }, + { + "epoch": 0.6712, + "learning_rate": 9.937759939706004e-07, + "loss": 0.0003, + "step": 3356 + }, + { + "epoch": 0.6716, + "learning_rate": 9.87716580005883e-07, + "loss": 0.0804, + "step": 3358 + }, + { + "epoch": 0.672, + "learning_rate": 9.816747359488666e-07, + "loss": 0.0006, + "step": 3360 + }, + { + "epoch": 0.6724, + "learning_rate": 9.756504735784033e-07, + "loss": 0.0003, + "step": 3362 + }, + { + "epoch": 0.6728, + "learning_rate": 9.696438046391332e-07, + "loss": 0.0023, + "step": 3364 + }, + { + "epoch": 0.6732, + "learning_rate": 9.6365474084134e-07, + "loss": 0.4959, + "step": 3366 + }, + { + "epoch": 0.6736, + "learning_rate": 9.576832938610113e-07, + "loss": 0.5776, + "step": 3368 + }, + { + "epoch": 0.674, + "learning_rate": 9.51729475339811e-07, + "loss": 0.0192, + "step": 3370 + }, + { + "epoch": 0.6744, + "learning_rate": 9.457932968849826e-07, + "loss": 0.0042, + "step": 3372 + }, + { + "epoch": 0.6748, + "learning_rate": 9.39874770069431e-07, + "loss": 0.0028, + "step": 3374 + }, + { + "epoch": 0.6752, + "learning_rate": 9.339739064316222e-07, + "loss": 0.0028, + "step": 3376 + }, + { + "epoch": 0.6756, + "learning_rate": 9.280907174755904e-07, + "loss": 0.0067, + "step": 3378 + }, + { + "epoch": 0.676, + "learning_rate": 9.22225214670921e-07, + "loss": 0.0028, + "step": 3380 + }, + { + "epoch": 0.6764, + "learning_rate": 9.1637740945269e-07, + "loss": 0.0012, + "step": 3382 + }, + { + "epoch": 0.6768, + "learning_rate": 9.105473132215137e-07, + "loss": 0.0013, + "step": 3384 + }, + { + "epoch": 0.6772, + "learning_rate": 9.047349373434578e-07, + "loss": 0.0067, + "step": 3386 + }, + { + "epoch": 0.6776, + "learning_rate": 8.989402931500446e-07, + "loss": 0.0097, + "step": 3388 + }, + { + "epoch": 0.678, + "learning_rate": 8.93163391938231e-07, + "loss": 0.0021, + "step": 3390 + }, + { + "epoch": 0.6784, + "learning_rate": 8.874042449703801e-07, + "loss": 0.0326, + "step": 3392 + }, + { + "epoch": 0.6788, + "learning_rate": 8.816628634742452e-07, + "loss": 0.0028, + "step": 3394 + }, + { + "epoch": 0.6792, + "learning_rate": 8.759392586429416e-07, + "loss": 0.0125, + "step": 3396 + }, + { + "epoch": 0.6796, + "learning_rate": 8.702334416349312e-07, + "loss": 0.0019, + "step": 3398 + }, + { + "epoch": 0.68, + "learning_rate": 8.645454235739947e-07, + "loss": 0.0004, + "step": 3400 + }, + { + "epoch": 0.6804, + "learning_rate": 8.588752155492164e-07, + "loss": 0.0849, + "step": 3402 + }, + { + "epoch": 0.6808, + "learning_rate": 8.53222828614948e-07, + "loss": 0.0096, + "step": 3404 + }, + { + "epoch": 0.6812, + "learning_rate": 8.475882737908292e-07, + "loss": 0.0008, + "step": 3406 + }, + { + "epoch": 0.6816, + "learning_rate": 8.419715620616841e-07, + "loss": 0.1289, + "step": 3408 + }, + { + "epoch": 0.682, + "learning_rate": 8.363727043776093e-07, + "loss": 0.0014, + "step": 3410 + }, + { + "epoch": 0.6824, + "learning_rate": 8.307917116538356e-07, + "loss": 0.0085, + "step": 3412 + }, + { + "epoch": 0.6828, + "learning_rate": 8.252285947708127e-07, + "loss": 0.3126, + "step": 3414 + }, + { + "epoch": 0.6832, + "learning_rate": 8.196833645741254e-07, + "loss": 0.013, + "step": 3416 + }, + { + "epoch": 0.6836, + "learning_rate": 8.141560318744601e-07, + "loss": 0.0008, + "step": 3418 + }, + { + "epoch": 0.684, + "learning_rate": 8.086466074476562e-07, + "loss": 0.1136, + "step": 3420 + }, + { + "epoch": 0.6844, + "learning_rate": 8.031551020346117e-07, + "loss": 0.0053, + "step": 3422 + }, + { + "epoch": 0.6848, + "learning_rate": 7.976815263412973e-07, + "loss": 0.0223, + "step": 3424 + }, + { + "epoch": 0.6852, + "learning_rate": 7.922258910387292e-07, + "loss": 0.003, + "step": 3426 + }, + { + "epoch": 0.6856, + "learning_rate": 7.867882067629484e-07, + "loss": 0.0024, + "step": 3428 + }, + { + "epoch": 0.686, + "learning_rate": 7.813684841149971e-07, + "loss": 0.0003, + "step": 3430 + }, + { + "epoch": 0.6864, + "learning_rate": 7.759667336609022e-07, + "loss": 0.003, + "step": 3432 + }, + { + "epoch": 0.6868, + "learning_rate": 7.705829659316533e-07, + "loss": 0.0005, + "step": 3434 + }, + { + "epoch": 0.6872, + "learning_rate": 7.652171914231799e-07, + "loss": 0.0078, + "step": 3436 + }, + { + "epoch": 0.6876, + "learning_rate": 7.598694205963353e-07, + "loss": 0.0031, + "step": 3438 + }, + { + "epoch": 0.688, + "learning_rate": 7.54539663876872e-07, + "loss": 0.1865, + "step": 3440 + }, + { + "epoch": 0.6884, + "learning_rate": 7.492279316554229e-07, + "loss": 0.0076, + "step": 3442 + }, + { + "epoch": 0.6888, + "learning_rate": 7.439342342874767e-07, + "loss": 0.0008, + "step": 3444 + }, + { + "epoch": 0.6892, + "learning_rate": 7.386585820933844e-07, + "loss": 0.0018, + "step": 3446 + }, + { + "epoch": 0.6896, + "learning_rate": 7.334009853582758e-07, + "loss": 0.0022, + "step": 3448 + }, + { + "epoch": 0.69, + "learning_rate": 7.281614543321247e-07, + "loss": 0.1392, + "step": 3450 + }, + { + "epoch": 0.6904, + "learning_rate": 7.229399992296626e-07, + "loss": 0.0108, + "step": 3452 + }, + { + "epoch": 0.6908, + "learning_rate": 7.177366302303667e-07, + "loss": 0.0096, + "step": 3454 + }, + { + "epoch": 0.6912, + "learning_rate": 7.125513574784947e-07, + "loss": 0.0005, + "step": 3456 + }, + { + "epoch": 0.6916, + "learning_rate": 7.073841910829759e-07, + "loss": 0.0015, + "step": 3458 + }, + { + "epoch": 0.692, + "learning_rate": 7.022351411174855e-07, + "loss": 0.0034, + "step": 3460 + }, + { + "epoch": 0.6924, + "learning_rate": 6.971042176203547e-07, + "loss": 0.2101, + "step": 3462 + }, + { + "epoch": 0.6928, + "learning_rate": 6.919914305945785e-07, + "loss": 0.0686, + "step": 3464 + }, + { + "epoch": 0.6932, + "learning_rate": 6.868967900077983e-07, + "loss": 0.0004, + "step": 3466 + }, + { + "epoch": 0.6936, + "learning_rate": 6.818203057922768e-07, + "loss": 0.0006, + "step": 3468 + }, + { + "epoch": 0.694, + "learning_rate": 6.767619878448795e-07, + "loss": 0.0008, + "step": 3470 + }, + { + "epoch": 0.6944, + "learning_rate": 6.717218460270558e-07, + "loss": 0.016, + "step": 3472 + }, + { + "epoch": 0.6948, + "learning_rate": 6.666998901648214e-07, + "loss": 0.0068, + "step": 3474 + }, + { + "epoch": 0.6952, + "learning_rate": 6.616961300487346e-07, + "loss": 0.0089, + "step": 3476 + }, + { + "epoch": 0.6956, + "learning_rate": 6.567105754338821e-07, + "loss": 0.1573, + "step": 3478 + }, + { + "epoch": 0.696, + "learning_rate": 6.517432360398523e-07, + "loss": 0.0163, + "step": 3480 + }, + { + "epoch": 0.6964, + "learning_rate": 6.467941215507456e-07, + "loss": 0.0025, + "step": 3482 + }, + { + "epoch": 0.6968, + "learning_rate": 6.418632416150894e-07, + "loss": 0.0041, + "step": 3484 + }, + { + "epoch": 0.6972, + "learning_rate": 6.369506058459107e-07, + "loss": 0.0012, + "step": 3486 + }, + { + "epoch": 0.6976, + "learning_rate": 6.320562238206262e-07, + "loss": 0.104, + "step": 3488 + }, + { + "epoch": 0.698, + "learning_rate": 6.271801050810833e-07, + "loss": 0.0016, + "step": 3490 + }, + { + "epoch": 0.6984, + "learning_rate": 6.223222591335454e-07, + "loss": 0.0001, + "step": 3492 + }, + { + "epoch": 0.6988, + "learning_rate": 6.174826954486047e-07, + "loss": 0.1133, + "step": 3494 + }, + { + "epoch": 0.6992, + "learning_rate": 6.126614234612583e-07, + "loss": 0.0067, + "step": 3496 + }, + { + "epoch": 0.6996, + "learning_rate": 6.078584525708164e-07, + "loss": 0.017, + "step": 3498 + }, + { + "epoch": 0.7, + "learning_rate": 6.030737921409158e-07, + "loss": 0.1886, + "step": 3500 + }, + { + "epoch": 0.7004, + "learning_rate": 5.98307451499498e-07, + "loss": 0.0009, + "step": 3502 + }, + { + "epoch": 0.7008, + "learning_rate": 5.935594399387867e-07, + "loss": 0.0027, + "step": 3504 + }, + { + "epoch": 0.7012, + "learning_rate": 5.888297667152743e-07, + "loss": 0.4559, + "step": 3506 + }, + { + "epoch": 0.7016, + "learning_rate": 5.841184410496992e-07, + "loss": 0.0006, + "step": 3508 + }, + { + "epoch": 0.702, + "learning_rate": 5.794254721270343e-07, + "loss": 0.0072, + "step": 3510 + }, + { + "epoch": 0.7024, + "learning_rate": 5.747508690964609e-07, + "loss": 0.0821, + "step": 3512 + }, + { + "epoch": 0.7028, + "learning_rate": 5.70094641071357e-07, + "loss": 0.0028, + "step": 3514 + }, + { + "epoch": 0.7032, + "learning_rate": 5.654567971292779e-07, + "loss": 0.0014, + "step": 3516 + }, + { + "epoch": 0.7036, + "learning_rate": 5.608373463119376e-07, + "loss": 0.0194, + "step": 3518 + }, + { + "epoch": 0.704, + "learning_rate": 5.562362976251867e-07, + "loss": 0.007, + "step": 3520 + }, + { + "epoch": 0.7044, + "learning_rate": 5.51653660039021e-07, + "loss": 0.0016, + "step": 3522 + }, + { + "epoch": 0.7048, + "learning_rate": 5.470894424875095e-07, + "loss": 0.0083, + "step": 3524 + }, + { + "epoch": 0.7052, + "learning_rate": 5.4254365386883e-07, + "loss": 0.0318, + "step": 3526 + }, + { + "epoch": 0.7056, + "learning_rate": 5.380163030452445e-07, + "loss": 0.3076, + "step": 3528 + }, + { + "epoch": 0.706, + "learning_rate": 5.335073988430351e-07, + "loss": 0.1199, + "step": 3530 + }, + { + "epoch": 0.7064, + "learning_rate": 5.29016950052561e-07, + "loss": 0.0038, + "step": 3532 + }, + { + "epoch": 0.7068, + "learning_rate": 5.24544965428162e-07, + "loss": 0.3369, + "step": 3534 + }, + { + "epoch": 0.7072, + "learning_rate": 5.200914536882162e-07, + "loss": 0.0018, + "step": 3536 + }, + { + "epoch": 0.7076, + "learning_rate": 5.156564235150674e-07, + "loss": 0.0014, + "step": 3538 + }, + { + "epoch": 0.708, + "learning_rate": 5.112398835550348e-07, + "loss": 0.0663, + "step": 3540 + }, + { + "epoch": 0.7084, + "learning_rate": 5.06841842418393e-07, + "loss": 0.0032, + "step": 3542 + }, + { + "epoch": 0.7088, + "learning_rate": 5.024623086793323e-07, + "loss": 0.0007, + "step": 3544 + }, + { + "epoch": 0.7092, + "learning_rate": 4.981012908759941e-07, + "loss": 0.0015, + "step": 3546 + }, + { + "epoch": 0.7096, + "learning_rate": 4.937587975104007e-07, + "loss": 0.0011, + "step": 3548 + }, + { + "epoch": 0.71, + "learning_rate": 4.894348370484658e-07, + "loss": 0.0149, + "step": 3550 + }, + { + "epoch": 0.7104, + "learning_rate": 4.851294179199695e-07, + "loss": 0.4839, + "step": 3552 + }, + { + "epoch": 0.7108, + "learning_rate": 4.808425485185486e-07, + "loss": 0.0013, + "step": 3554 + }, + { + "epoch": 0.7112, + "learning_rate": 4.7657423720166907e-07, + "loss": 0.001, + "step": 3556 + }, + { + "epoch": 0.7116, + "learning_rate": 4.723244922906367e-07, + "loss": 0.0018, + "step": 3558 + }, + { + "epoch": 0.712, + "learning_rate": 4.6809332207053194e-07, + "loss": 0.0005, + "step": 3560 + }, + { + "epoch": 0.7124, + "learning_rate": 4.63880734790243e-07, + "loss": 0.0378, + "step": 3562 + }, + { + "epoch": 0.7128, + "learning_rate": 4.5968673866242374e-07, + "loss": 0.0097, + "step": 3564 + }, + { + "epoch": 0.7132, + "learning_rate": 4.555113418634782e-07, + "loss": 0.0006, + "step": 3566 + }, + { + "epoch": 0.7136, + "learning_rate": 4.5135455253357387e-07, + "loss": 0.0196, + "step": 3568 + }, + { + "epoch": 0.714, + "learning_rate": 4.4721637877656155e-07, + "loss": 0.0007, + "step": 3570 + }, + { + "epoch": 0.7144, + "learning_rate": 4.4309682866004457e-07, + "loss": 0.0293, + "step": 3572 + }, + { + "epoch": 0.7148, + "learning_rate": 4.389959102152752e-07, + "loss": 0.1028, + "step": 3574 + }, + { + "epoch": 0.7152, + "learning_rate": 4.349136314372204e-07, + "loss": 0.0751, + "step": 3576 + }, + { + "epoch": 0.7156, + "learning_rate": 4.3085000028449065e-07, + "loss": 0.0077, + "step": 3578 + }, + { + "epoch": 0.716, + "learning_rate": 4.268050246793265e-07, + "loss": 0.0015, + "step": 3580 + }, + { + "epoch": 0.7164, + "learning_rate": 4.2277871250763327e-07, + "loss": 0.0124, + "step": 3582 + }, + { + "epoch": 0.7168, + "learning_rate": 4.1877107161890416e-07, + "loss": 0.1592, + "step": 3584 + }, + { + "epoch": 0.7172, + "learning_rate": 4.1478210982624166e-07, + "loss": 0.0012, + "step": 3586 + }, + { + "epoch": 0.7176, + "learning_rate": 4.108118349063306e-07, + "loss": 0.0048, + "step": 3588 + }, + { + "epoch": 0.718, + "learning_rate": 4.06860254599426e-07, + "loss": 0.0983, + "step": 3590 + }, + { + "epoch": 0.7184, + "learning_rate": 4.0292737660933446e-07, + "loss": 0.0019, + "step": 3592 + }, + { + "epoch": 0.7188, + "learning_rate": 3.9901320860340373e-07, + "loss": 0.0016, + "step": 3594 + }, + { + "epoch": 0.7192, + "learning_rate": 3.951177582125043e-07, + "loss": 0.0028, + "step": 3596 + }, + { + "epoch": 0.7196, + "learning_rate": 3.912410330310157e-07, + "loss": 0.0755, + "step": 3598 + }, + { + "epoch": 0.72, + "learning_rate": 3.873830406168133e-07, + "loss": 0.0248, + "step": 3600 + }, + { + "epoch": 0.7204, + "learning_rate": 3.835437884912496e-07, + "loss": 0.0008, + "step": 3602 + }, + { + "epoch": 0.7208, + "learning_rate": 3.797232841391441e-07, + "loss": 0.0007, + "step": 3604 + }, + { + "epoch": 0.7212, + "learning_rate": 3.759215350087586e-07, + "loss": 0.0081, + "step": 3606 + }, + { + "epoch": 0.7216, + "learning_rate": 3.7213854851181455e-07, + "loss": 0.0084, + "step": 3608 + }, + { + "epoch": 0.722, + "learning_rate": 3.6837433202341676e-07, + "loss": 0.001, + "step": 3610 + }, + { + "epoch": 0.7224, + "learning_rate": 3.646288928821129e-07, + "loss": 0.0017, + "step": 3612 + }, + { + "epoch": 0.7228, + "learning_rate": 3.609022383898286e-07, + "loss": 0.0017, + "step": 3614 + }, + { + "epoch": 0.7232, + "learning_rate": 3.5719437581185236e-07, + "loss": 0.0006, + "step": 3616 + }, + { + "epoch": 0.7236, + "learning_rate": 3.535053123768717e-07, + "loss": 0.0016, + "step": 3618 + }, + { + "epoch": 0.724, + "learning_rate": 3.4983505527688477e-07, + "loss": 0.0045, + "step": 3620 + }, + { + "epoch": 0.7244, + "learning_rate": 3.4618361166726123e-07, + "loss": 0.001, + "step": 3622 + }, + { + "epoch": 0.7248, + "learning_rate": 3.4255098866667114e-07, + "loss": 0.0014, + "step": 3624 + }, + { + "epoch": 0.7252, + "learning_rate": 3.3893719335709953e-07, + "loss": 0.0052, + "step": 3626 + }, + { + "epoch": 0.7256, + "learning_rate": 3.3534223278382405e-07, + "loss": 0.0031, + "step": 3628 + }, + { + "epoch": 0.726, + "learning_rate": 3.3176611395540625e-07, + "loss": 0.0058, + "step": 3630 + }, + { + "epoch": 0.7264, + "learning_rate": 3.282088438436726e-07, + "loss": 0.0006, + "step": 3632 + }, + { + "epoch": 0.7268, + "learning_rate": 3.246704293837022e-07, + "loss": 0.0015, + "step": 3634 + }, + { + "epoch": 0.7272, + "learning_rate": 3.211508774738148e-07, + "loss": 0.882, + "step": 3636 + }, + { + "epoch": 0.7276, + "learning_rate": 3.176501949755573e-07, + "loss": 0.0015, + "step": 3638 + }, + { + "epoch": 0.728, + "learning_rate": 3.1416838871369036e-07, + "loss": 0.0082, + "step": 3640 + }, + { + "epoch": 0.7284, + "learning_rate": 3.1070546547616875e-07, + "loss": 0.021, + "step": 3642 + }, + { + "epoch": 0.7288, + "learning_rate": 3.072614320141498e-07, + "loss": 0.0008, + "step": 3644 + }, + { + "epoch": 0.7292, + "learning_rate": 3.038362950419427e-07, + "loss": 0.0005, + "step": 3646 + }, + { + "epoch": 0.7296, + "learning_rate": 3.004300612370292e-07, + "loss": 0.0244, + "step": 3648 + }, + { + "epoch": 0.73, + "learning_rate": 2.9704273724003865e-07, + "loss": 0.026, + "step": 3650 + }, + { + "epoch": 0.7304, + "learning_rate": 2.9367432965472506e-07, + "loss": 0.0161, + "step": 3652 + }, + { + "epoch": 0.7308, + "learning_rate": 2.903248450479879e-07, + "loss": 0.2039, + "step": 3654 + }, + { + "epoch": 0.7312, + "learning_rate": 2.8699428994979906e-07, + "loss": 0.2474, + "step": 3656 + }, + { + "epoch": 0.7316, + "learning_rate": 2.836826708532603e-07, + "loss": 0.0278, + "step": 3658 + }, + { + "epoch": 0.732, + "learning_rate": 2.8038999421453716e-07, + "loss": 0.0046, + "step": 3660 + }, + { + "epoch": 0.7324, + "learning_rate": 2.771162664528726e-07, + "loss": 0.0005, + "step": 3662 + }, + { + "epoch": 0.7328, + "learning_rate": 2.73861493950569e-07, + "loss": 0.0155, + "step": 3664 + }, + { + "epoch": 0.7332, + "learning_rate": 2.706256830529608e-07, + "loss": 0.3881, + "step": 3666 + }, + { + "epoch": 0.7336, + "learning_rate": 2.6740884006843826e-07, + "loss": 0.1359, + "step": 3668 + }, + { + "epoch": 0.734, + "learning_rate": 2.6421097126839825e-07, + "loss": 0.0003, + "step": 3670 + }, + { + "epoch": 0.7344, + "learning_rate": 2.6103208288724815e-07, + "loss": 0.0063, + "step": 3672 + }, + { + "epoch": 0.7348, + "learning_rate": 2.578721811223961e-07, + "loss": 0.1027, + "step": 3674 + }, + { + "epoch": 0.7352, + "learning_rate": 2.5473127213422985e-07, + "loss": 0.0434, + "step": 3676 + }, + { + "epoch": 0.7356, + "learning_rate": 2.516093620461135e-07, + "loss": 0.0047, + "step": 3678 + }, + { + "epoch": 0.736, + "learning_rate": 2.485064569443696e-07, + "loss": 0.0005, + "step": 3680 + }, + { + "epoch": 0.7364, + "learning_rate": 2.4542256287827026e-07, + "loss": 0.0016, + "step": 3682 + }, + { + "epoch": 0.7368, + "learning_rate": 2.423576858600263e-07, + "loss": 0.006, + "step": 3684 + }, + { + "epoch": 0.7372, + "learning_rate": 2.3931183186477137e-07, + "loss": 0.0005, + "step": 3686 + }, + { + "epoch": 0.7376, + "learning_rate": 2.3628500683055e-07, + "loss": 0.0014, + "step": 3688 + }, + { + "epoch": 0.738, + "learning_rate": 2.3327721665832303e-07, + "loss": 0.0473, + "step": 3690 + }, + { + "epoch": 0.7384, + "learning_rate": 2.3028846721191767e-07, + "loss": 0.0738, + "step": 3692 + }, + { + "epoch": 0.7388, + "learning_rate": 2.2731876431806854e-07, + "loss": 0.002, + "step": 3694 + }, + { + "epoch": 0.7392, + "learning_rate": 2.243681137663467e-07, + "loss": 0.0038, + "step": 3696 + }, + { + "epoch": 0.7396, + "learning_rate": 2.2143652130921068e-07, + "loss": 0.005, + "step": 3698 + }, + { + "epoch": 0.74, + "learning_rate": 2.1852399266194646e-07, + "loss": 0.0156, + "step": 3700 + }, + { + "epoch": 0.7404, + "learning_rate": 2.1563053350266983e-07, + "loss": 0.0831, + "step": 3702 + }, + { + "epoch": 0.7408, + "learning_rate": 2.1275614947233957e-07, + "loss": 0.0013, + "step": 3704 + }, + { + "epoch": 0.7412, + "learning_rate": 2.0990084617470207e-07, + "loss": 0.0011, + "step": 3706 + }, + { + "epoch": 0.7416, + "learning_rate": 2.0706462917632676e-07, + "loss": 0.1935, + "step": 3708 + }, + { + "epoch": 0.742, + "learning_rate": 2.0424750400655947e-07, + "loss": 0.0087, + "step": 3710 + }, + { + "epoch": 0.7424, + "learning_rate": 2.014494761575314e-07, + "loss": 0.0013, + "step": 3712 + }, + { + "epoch": 0.7428, + "learning_rate": 1.9867055108414135e-07, + "loss": 0.0009, + "step": 3714 + }, + { + "epoch": 0.7432, + "learning_rate": 1.959107342040445e-07, + "loss": 0.0022, + "step": 3716 + }, + { + "epoch": 0.7436, + "learning_rate": 1.9317003089764473e-07, + "loss": 0.0264, + "step": 3718 + }, + { + "epoch": 0.744, + "learning_rate": 1.904484465080858e-07, + "loss": 0.1079, + "step": 3720 + }, + { + "epoch": 0.7444, + "learning_rate": 1.877459863412334e-07, + "loss": 0.0015, + "step": 3722 + }, + { + "epoch": 0.7448, + "learning_rate": 1.8506265566567317e-07, + "loss": 0.0034, + "step": 3724 + }, + { + "epoch": 0.7452, + "learning_rate": 1.8239845971269377e-07, + "loss": 0.0119, + "step": 3726 + }, + { + "epoch": 0.7456, + "learning_rate": 1.7975340367628157e-07, + "loss": 0.0672, + "step": 3728 + }, + { + "epoch": 0.746, + "learning_rate": 1.7712749271311503e-07, + "loss": 0.0008, + "step": 3730 + }, + { + "epoch": 0.7464, + "learning_rate": 1.7452073194253126e-07, + "loss": 0.0323, + "step": 3732 + }, + { + "epoch": 0.7468, + "learning_rate": 1.7193312644655512e-07, + "loss": 0.0023, + "step": 3734 + }, + { + "epoch": 0.7472, + "learning_rate": 1.6936468126984796e-07, + "loss": 0.0006, + "step": 3736 + }, + { + "epoch": 0.7476, + "learning_rate": 1.668154014197232e-07, + "loss": 0.9237, + "step": 3738 + }, + { + "epoch": 0.748, + "learning_rate": 1.6428529186614417e-07, + "loss": 0.002, + "step": 3740 + }, + { + "epoch": 0.7484, + "learning_rate": 1.6177435754167413e-07, + "loss": 0.0247, + "step": 3742 + }, + { + "epoch": 0.7488, + "learning_rate": 1.5928260334151736e-07, + "loss": 0.0036, + "step": 3744 + }, + { + "epoch": 0.7492, + "learning_rate": 1.5681003412347573e-07, + "loss": 0.0009, + "step": 3746 + }, + { + "epoch": 0.7496, + "learning_rate": 1.543566547079467e-07, + "loss": 0.0537, + "step": 3748 + }, + { + "epoch": 0.75, + "learning_rate": 1.519224698779198e-07, + "loss": 0.0034, + "step": 3750 + }, + { + "epoch": 0.7504, + "learning_rate": 1.4950748437896344e-07, + "loss": 0.0242, + "step": 3752 + }, + { + "epoch": 0.7508, + "learning_rate": 1.4711170291921485e-07, + "loss": 0.0014, + "step": 3754 + }, + { + "epoch": 0.7512, + "learning_rate": 1.4473513016937223e-07, + "loss": 0.0014, + "step": 3756 + }, + { + "epoch": 0.7516, + "learning_rate": 1.4237777076268723e-07, + "loss": 0.1292, + "step": 3758 + }, + { + "epoch": 0.752, + "learning_rate": 1.400396292949513e-07, + "loss": 0.0007, + "step": 3760 + }, + { + "epoch": 0.7524, + "learning_rate": 1.3772071032449152e-07, + "loss": 0.3469, + "step": 3762 + }, + { + "epoch": 0.7528, + "learning_rate": 1.3542101837215938e-07, + "loss": 0.0122, + "step": 3764 + }, + { + "epoch": 0.7532, + "learning_rate": 1.3314055792132075e-07, + "loss": 0.0023, + "step": 3766 + }, + { + "epoch": 0.7536, + "learning_rate": 1.308793334178482e-07, + "loss": 0.0003, + "step": 3768 + }, + { + "epoch": 0.754, + "learning_rate": 1.2863734927012205e-07, + "loss": 0.3769, + "step": 3770 + }, + { + "epoch": 0.7544, + "learning_rate": 1.2641460984899822e-07, + "loss": 0.0039, + "step": 3772 + }, + { + "epoch": 0.7548, + "learning_rate": 1.242111194878204e-07, + "loss": 0.0482, + "step": 3774 + }, + { + "epoch": 0.7552, + "learning_rate": 1.2202688248241335e-07, + "loss": 0.0013, + "step": 3776 + }, + { + "epoch": 0.7556, + "learning_rate": 1.198619030910475e-07, + "loss": 0.0008, + "step": 3778 + }, + { + "epoch": 0.756, + "learning_rate": 1.1771618553447439e-07, + "loss": 0.0004, + "step": 3780 + }, + { + "epoch": 0.7564, + "learning_rate": 1.1558973399586671e-07, + "loss": 0.0094, + "step": 3782 + }, + { + "epoch": 0.7568, + "learning_rate": 1.1348255262085939e-07, + "loss": 0.0055, + "step": 3784 + }, + { + "epoch": 0.7572, + "learning_rate": 1.1139464551750857e-07, + "loss": 0.1448, + "step": 3786 + }, + { + "epoch": 0.7576, + "learning_rate": 1.0932601675629483e-07, + "loss": 0.001, + "step": 3788 + }, + { + "epoch": 0.758, + "learning_rate": 1.0727667037011668e-07, + "loss": 0.003, + "step": 3790 + }, + { + "epoch": 0.7584, + "learning_rate": 1.052466103542793e-07, + "loss": 0.064, + "step": 3792 + }, + { + "epoch": 0.7588, + "learning_rate": 1.0323584066648907e-07, + "loss": 0.0153, + "step": 3794 + }, + { + "epoch": 0.7592, + "learning_rate": 1.0124436522684355e-07, + "loss": 0.0081, + "step": 3796 + }, + { + "epoch": 0.7596, + "learning_rate": 9.9272187917826e-08, + "loss": 0.0012, + "step": 3798 + }, + { + "epoch": 0.76, + "learning_rate": 9.73193125842975e-08, + "loss": 0.0059, + "step": 3800 + }, + { + "epoch": 0.7604, + "learning_rate": 9.538574303348925e-08, + "loss": 0.0128, + "step": 3802 + }, + { + "epoch": 0.7608, + "learning_rate": 9.34714830349892e-08, + "loss": 0.0006, + "step": 3804 + }, + { + "epoch": 0.7612, + "learning_rate": 9.157653632075547e-08, + "loss": 0.0037, + "step": 3806 + }, + { + "epoch": 0.7616, + "learning_rate": 8.970090658507403e-08, + "loss": 0.0045, + "step": 3808 + }, + { + "epoch": 0.762, + "learning_rate": 8.784459748458429e-08, + "loss": 0.4598, + "step": 3810 + }, + { + "epoch": 0.7624, + "learning_rate": 8.600761263825585e-08, + "loss": 0.0552, + "step": 3812 + }, + { + "epoch": 0.7628, + "learning_rate": 8.418995562738175e-08, + "loss": 1.0306, + "step": 3814 + }, + { + "epoch": 0.7632, + "learning_rate": 8.239162999558625e-08, + "loss": 0.0031, + "step": 3816 + }, + { + "epoch": 0.7636, + "learning_rate": 8.061263924878604e-08, + "loss": 0.0006, + "step": 3818 + }, + { + "epoch": 0.764, + "learning_rate": 7.885298685522124e-08, + "loss": 0.0012, + "step": 3820 + }, + { + "epoch": 0.7644, + "learning_rate": 7.71126762454233e-08, + "loss": 0.0039, + "step": 3822 + }, + { + "epoch": 0.7648, + "learning_rate": 7.539171081221597e-08, + "loss": 0.0017, + "step": 3824 + }, + { + "epoch": 0.7652, + "learning_rate": 7.369009391071213e-08, + "loss": 0.5775, + "step": 3826 + }, + { + "epoch": 0.7656, + "learning_rate": 7.200782885829371e-08, + "loss": 0.0004, + "step": 3828 + }, + { + "epoch": 0.766, + "learning_rate": 7.034491893463059e-08, + "loss": 0.005, + "step": 3830 + }, + { + "epoch": 0.7664, + "learning_rate": 6.870136738164723e-08, + "loss": 0.0016, + "step": 3832 + }, + { + "epoch": 0.7668, + "learning_rate": 6.707717740353059e-08, + "loss": 0.0009, + "step": 3834 + }, + { + "epoch": 0.7672, + "learning_rate": 6.547235216672443e-08, + "loss": 0.0295, + "step": 3836 + }, + { + "epoch": 0.7676, + "learning_rate": 6.388689479991606e-08, + "loss": 0.0086, + "step": 3838 + }, + { + "epoch": 0.768, + "learning_rate": 6.232080839403631e-08, + "loss": 0.0022, + "step": 3840 + }, + { + "epoch": 0.7684, + "learning_rate": 6.077409600225181e-08, + "loss": 0.0209, + "step": 3842 + }, + { + "epoch": 0.7688, + "learning_rate": 5.9246760639954935e-08, + "loss": 0.1186, + "step": 3844 + }, + { + "epoch": 0.7692, + "learning_rate": 5.773880528476605e-08, + "loss": 0.0027, + "step": 3846 + }, + { + "epoch": 0.7696, + "learning_rate": 5.625023287652131e-08, + "loss": 0.4762, + "step": 3848 + }, + { + "epoch": 0.77, + "learning_rate": 5.4781046317266e-08, + "loss": 0.0022, + "step": 3850 + }, + { + "epoch": 0.7704, + "learning_rate": 5.333124847126003e-08, + "loss": 0.2856, + "step": 3852 + }, + { + "epoch": 0.7708, + "learning_rate": 5.1900842164952505e-08, + "loss": 0.008, + "step": 3854 + }, + { + "epoch": 0.7712, + "learning_rate": 5.048983018699938e-08, + "loss": 0.002, + "step": 3856 + }, + { + "epoch": 0.7716, + "learning_rate": 4.9098215288234664e-08, + "loss": 0.0025, + "step": 3858 + }, + { + "epoch": 0.772, + "learning_rate": 4.772600018168816e-08, + "loss": 0.0009, + "step": 3860 + }, + { + "epoch": 0.7724, + "learning_rate": 4.637318754256215e-08, + "loss": 0.0042, + "step": 3862 + }, + { + "epoch": 0.7728, + "learning_rate": 4.503978000823028e-08, + "loss": 0.0032, + "step": 3864 + }, + { + "epoch": 0.7732, + "learning_rate": 4.3725780178243135e-08, + "loss": 0.157, + "step": 3866 + }, + { + "epoch": 0.7736, + "learning_rate": 4.243119061430823e-08, + "loss": 0.0048, + "step": 3868 + }, + { + "epoch": 0.774, + "learning_rate": 4.115601384029666e-08, + "loss": 0.1666, + "step": 3870 + }, + { + "epoch": 0.7744, + "learning_rate": 3.990025234222872e-08, + "loss": 0.0011, + "step": 3872 + }, + { + "epoch": 0.7748, + "learning_rate": 3.8663908568276064e-08, + "loss": 0.0223, + "step": 3874 + }, + { + "epoch": 0.7752, + "learning_rate": 3.7446984928753984e-08, + "loss": 0.009, + "step": 3876 + }, + { + "epoch": 0.7756, + "learning_rate": 3.6249483796118036e-08, + "loss": 0.1641, + "step": 3878 + }, + { + "epoch": 0.776, + "learning_rate": 3.507140750495741e-08, + "loss": 0.0011, + "step": 3880 + }, + { + "epoch": 0.7764, + "learning_rate": 3.391275835199159e-08, + "loss": 0.0012, + "step": 3882 + }, + { + "epoch": 0.7768, + "learning_rate": 3.2773538596068134e-08, + "loss": 0.0044, + "step": 3884 + }, + { + "epoch": 0.7772, + "learning_rate": 3.165375045815266e-08, + "loss": 0.0007, + "step": 3886 + }, + { + "epoch": 0.7776, + "learning_rate": 3.0553396121331126e-08, + "loss": 0.0018, + "step": 3888 + }, + { + "epoch": 0.778, + "learning_rate": 2.9472477730796423e-08, + "loss": 0.0199, + "step": 3890 + }, + { + "epoch": 0.7784, + "learning_rate": 2.8410997393861772e-08, + "loss": 0.0643, + "step": 3892 + }, + { + "epoch": 0.7788, + "learning_rate": 2.736895717993071e-08, + "loss": 0.0353, + "step": 3894 + }, + { + "epoch": 0.7792, + "learning_rate": 2.6346359120514863e-08, + "loss": 0.0009, + "step": 3896 + }, + { + "epoch": 0.7796, + "learning_rate": 2.5343205209225062e-08, + "loss": 0.0005, + "step": 3898 + }, + { + "epoch": 0.78, + "learning_rate": 2.4359497401756915e-08, + "loss": 0.0014, + "step": 3900 + }, + { + "epoch": 0.7804, + "learning_rate": 2.339523761590412e-08, + "loss": 0.0046, + "step": 3902 + }, + { + "epoch": 0.7808, + "learning_rate": 2.2450427731534052e-08, + "loss": 0.0004, + "step": 3904 + }, + { + "epoch": 0.7812, + "learning_rate": 2.152506959060774e-08, + "loss": 0.3569, + "step": 3906 + }, + { + "epoch": 0.7816, + "learning_rate": 2.061916499715544e-08, + "loss": 0.0016, + "step": 3908 + }, + { + "epoch": 0.782, + "learning_rate": 1.973271571728441e-08, + "loss": 1.1582, + "step": 3910 + }, + { + "epoch": 0.7824, + "learning_rate": 1.8865723479174482e-08, + "loss": 0.0236, + "step": 3912 + }, + { + "epoch": 0.7828, + "learning_rate": 1.8018189973069144e-08, + "loss": 0.0043, + "step": 3914 + }, + { + "epoch": 0.7832, + "learning_rate": 1.7190116851280024e-08, + "loss": 0.0077, + "step": 3916 + }, + { + "epoch": 0.7836, + "learning_rate": 1.6381505728176872e-08, + "loss": 0.6479, + "step": 3918 + }, + { + "epoch": 0.784, + "learning_rate": 1.5592358180189782e-08, + "loss": 0.001, + "step": 3920 + }, + { + "epoch": 0.7844, + "learning_rate": 1.482267574580254e-08, + "loss": 0.0476, + "step": 3922 + }, + { + "epoch": 0.7848, + "learning_rate": 1.4072459925548176e-08, + "loss": 0.0016, + "step": 3924 + }, + { + "epoch": 0.7852, + "learning_rate": 1.3341712182011191e-08, + "loss": 0.197, + "step": 3926 + }, + { + "epoch": 0.7856, + "learning_rate": 1.2630433939825326e-08, + "loss": 0.0025, + "step": 3928 + }, + { + "epoch": 0.786, + "learning_rate": 1.1938626585660252e-08, + "loss": 0.0006, + "step": 3930 + }, + { + "epoch": 0.7864, + "learning_rate": 1.126629146823044e-08, + "loss": 0.0023, + "step": 3932 + }, + { + "epoch": 0.7868, + "learning_rate": 1.0613429898288507e-08, + "loss": 0.0025, + "step": 3934 + }, + { + "epoch": 0.7872, + "learning_rate": 9.980043148619668e-09, + "loss": 0.0008, + "step": 3936 + }, + { + "epoch": 0.7876, + "learning_rate": 9.366132454046162e-09, + "loss": 0.0014, + "step": 3938 + }, + { + "epoch": 0.788, + "learning_rate": 8.771699011416169e-09, + "loss": 0.0284, + "step": 3940 + }, + { + "epoch": 0.7884, + "learning_rate": 8.196743979610455e-09, + "loss": 0.0267, + "step": 3942 + }, + { + "epoch": 0.7888, + "learning_rate": 7.641268479531283e-09, + "loss": 0.0058, + "step": 3944 + }, + { + "epoch": 0.7892, + "learning_rate": 7.105273594107953e-09, + "loss": 0.0031, + "step": 3946 + }, + { + "epoch": 0.7896, + "learning_rate": 6.588760368289038e-09, + "loss": 0.002, + "step": 3948 + }, + { + "epoch": 0.79, + "learning_rate": 6.091729809042379e-09, + "loss": 0.0037, + "step": 3950 + }, + { + "epoch": 0.7904, + "learning_rate": 5.614182885357311e-09, + "loss": 0.0018, + "step": 3952 + }, + { + "epoch": 0.7908, + "learning_rate": 5.156120528233555e-09, + "loss": 0.0004, + "step": 3954 + }, + { + "epoch": 0.7912, + "learning_rate": 4.717543630688992e-09, + "loss": 0.0075, + "step": 3956 + }, + { + "epoch": 0.7916, + "learning_rate": 4.298453047749674e-09, + "loss": 0.0038, + "step": 3958 + }, + { + "epoch": 0.792, + "learning_rate": 3.898849596456477e-09, + "loss": 0.0017, + "step": 3960 + }, + { + "epoch": 0.7924, + "learning_rate": 3.518734055855122e-09, + "loss": 0.2065, + "step": 3962 + }, + { + "epoch": 0.7928, + "learning_rate": 3.1581071670006013e-09, + "loss": 0.0186, + "step": 3964 + }, + { + "epoch": 0.7932, + "learning_rate": 2.8169696329527484e-09, + "loss": 0.0021, + "step": 3966 + }, + { + "epoch": 0.7936, + "learning_rate": 2.495322118778454e-09, + "loss": 0.0009, + "step": 3968 + }, + { + "epoch": 0.794, + "learning_rate": 2.193165251545004e-09, + "loss": 0.0019, + "step": 3970 + }, + { + "epoch": 0.7944, + "learning_rate": 1.910499620323414e-09, + "loss": 0.0012, + "step": 3972 + }, + { + "epoch": 0.7948, + "learning_rate": 1.647325776182873e-09, + "loss": 0.0059, + "step": 3974 + }, + { + "epoch": 0.7952, + "learning_rate": 1.4036442321951892e-09, + "loss": 0.0012, + "step": 3976 + }, + { + "epoch": 0.7956, + "learning_rate": 1.1794554634314558e-09, + "loss": 0.8227, + "step": 3978 + }, + { + "epoch": 0.796, + "learning_rate": 9.74759906957612e-10, + "loss": 0.0002, + "step": 3980 + }, + { + "epoch": 0.7964, + "learning_rate": 7.895579618388827e-10, + "loss": 0.1621, + "step": 3982 + }, + { + "epoch": 0.7968, + "learning_rate": 6.238499891353389e-10, + "loss": 0.0039, + "step": 3984 + }, + { + "epoch": 0.7972, + "learning_rate": 4.77636311903007e-10, + "loss": 0.1681, + "step": 3986 + }, + { + "epoch": 0.7976, + "learning_rate": 3.509172151938689e-10, + "loss": 0.9139, + "step": 3988 + }, + { + "epoch": 0.798, + "learning_rate": 2.436929460525317e-10, + "loss": 0.0123, + "step": 3990 + }, + { + "epoch": 0.7984, + "learning_rate": 1.559637135173375e-10, + "loss": 0.01, + "step": 3992 + }, + { + "epoch": 0.7988, + "learning_rate": 8.772968862369447e-11, + "loss": 0.0031, + "step": 3994 + }, + { + "epoch": 0.7992, + "learning_rate": 3.899100439408443e-11, + "loss": 0.0007, + "step": 3996 + }, + { + "epoch": 0.7996, + "learning_rate": 9.74775584916543e-12, + "loss": 0.0005, + "step": 3998 + }, + { + "epoch": 0.8, + "learning_rate": 0.0, + "loss": 0.0093, + "step": 4000 + }, + { + "epoch": 0.8004, + "learning_rate": 9.74775584916543e-12, + "loss": 0.0017, + "step": 4002 + }, + { + "epoch": 0.8008, + "learning_rate": 3.899100439408443e-11, + "loss": 0.0996, + "step": 4004 + }, + { + "epoch": 0.8012, + "learning_rate": 8.772968862369447e-11, + "loss": 0.0516, + "step": 4006 + }, + { + "epoch": 0.8016, + "learning_rate": 1.559637135173375e-10, + "loss": 0.0134, + "step": 4008 + }, + { + "epoch": 0.802, + "learning_rate": 2.436929460525317e-10, + "loss": 0.0007, + "step": 4010 + }, + { + "epoch": 0.8024, + "learning_rate": 3.509172151938689e-10, + "loss": 0.0011, + "step": 4012 + }, + { + "epoch": 0.8028, + "learning_rate": 4.77636311903007e-10, + "loss": 0.0324, + "step": 4014 + }, + { + "epoch": 0.8032, + "learning_rate": 6.238499891353389e-10, + "loss": 0.0081, + "step": 4016 + }, + { + "epoch": 0.8036, + "learning_rate": 7.895579618388827e-10, + "loss": 0.0003, + "step": 4018 + }, + { + "epoch": 0.804, + "learning_rate": 9.74759906957612e-10, + "loss": 0.0157, + "step": 4020 + }, + { + "epoch": 0.8044, + "learning_rate": 1.1794554634314558e-09, + "loss": 0.5333, + "step": 4022 + }, + { + "epoch": 0.8048, + "learning_rate": 1.4036442321951892e-09, + "loss": 0.0063, + "step": 4024 + }, + { + "epoch": 0.8052, + "learning_rate": 1.647325776182873e-09, + "loss": 0.151, + "step": 4026 + }, + { + "epoch": 0.8056, + "learning_rate": 1.910499620322304e-09, + "loss": 0.1686, + "step": 4028 + }, + { + "epoch": 0.806, + "learning_rate": 2.193165251545004e-09, + "loss": 0.0029, + "step": 4030 + }, + { + "epoch": 0.8064, + "learning_rate": 2.495322118778454e-09, + "loss": 0.0098, + "step": 4032 + }, + { + "epoch": 0.8068, + "learning_rate": 2.8169696329527484e-09, + "loss": 0.1035, + "step": 4034 + }, + { + "epoch": 0.8072, + "learning_rate": 3.1581071670006013e-09, + "loss": 0.1847, + "step": 4036 + }, + { + "epoch": 0.8076, + "learning_rate": 3.518734055855122e-09, + "loss": 0.0403, + "step": 4038 + }, + { + "epoch": 0.808, + "learning_rate": 3.898849596456477e-09, + "loss": 0.0006, + "step": 4040 + }, + { + "epoch": 0.8084, + "learning_rate": 4.298453047749674e-09, + "loss": 0.0207, + "step": 4042 + }, + { + "epoch": 0.8088, + "learning_rate": 4.717543630688992e-09, + "loss": 0.0025, + "step": 4044 + }, + { + "epoch": 0.8092, + "learning_rate": 5.156120528233555e-09, + "loss": 0.0029, + "step": 4046 + }, + { + "epoch": 0.8096, + "learning_rate": 5.614182885357311e-09, + "loss": 0.0166, + "step": 4048 + }, + { + "epoch": 0.81, + "learning_rate": 6.091729809042379e-09, + "loss": 0.0007, + "step": 4050 + }, + { + "epoch": 0.8104, + "learning_rate": 6.588760368289038e-09, + "loss": 0.0405, + "step": 4052 + }, + { + "epoch": 0.8108, + "learning_rate": 7.105273594106843e-09, + "loss": 0.0226, + "step": 4054 + }, + { + "epoch": 0.8112, + "learning_rate": 7.641268479531283e-09, + "loss": 0.0714, + "step": 4056 + }, + { + "epoch": 0.8116, + "learning_rate": 8.196743979610455e-09, + "loss": 0.012, + "step": 4058 + }, + { + "epoch": 0.812, + "learning_rate": 8.771699011416169e-09, + "loss": 0.0086, + "step": 4060 + }, + { + "epoch": 0.8124, + "learning_rate": 9.366132454046162e-09, + "loss": 0.001, + "step": 4062 + }, + { + "epoch": 0.8128, + "learning_rate": 9.980043148618557e-09, + "loss": 0.0362, + "step": 4064 + }, + { + "epoch": 0.8132, + "learning_rate": 1.0613429898287397e-08, + "loss": 0.0024, + "step": 4066 + }, + { + "epoch": 0.8136, + "learning_rate": 1.126629146823044e-08, + "loss": 0.0005, + "step": 4068 + }, + { + "epoch": 0.814, + "learning_rate": 1.1938626585660252e-08, + "loss": 0.0004, + "step": 4070 + }, + { + "epoch": 0.8144, + "learning_rate": 1.2630433939825326e-08, + "loss": 0.0425, + "step": 4072 + }, + { + "epoch": 0.8148, + "learning_rate": 1.3341712182011191e-08, + "loss": 0.003, + "step": 4074 + }, + { + "epoch": 0.8152, + "learning_rate": 1.4072459925548176e-08, + "loss": 0.0041, + "step": 4076 + }, + { + "epoch": 0.8156, + "learning_rate": 1.482267574580143e-08, + "loss": 0.009, + "step": 4078 + }, + { + "epoch": 0.816, + "learning_rate": 1.5592358180189782e-08, + "loss": 0.0132, + "step": 4080 + }, + { + "epoch": 0.8164, + "learning_rate": 1.6381505728176872e-08, + "loss": 0.004, + "step": 4082 + }, + { + "epoch": 0.8168, + "learning_rate": 1.7190116851278916e-08, + "loss": 0.0013, + "step": 4084 + }, + { + "epoch": 0.8172, + "learning_rate": 1.8018189973068036e-08, + "loss": 0.0027, + "step": 4086 + }, + { + "epoch": 0.8176, + "learning_rate": 1.8865723479174482e-08, + "loss": 0.0232, + "step": 4088 + }, + { + "epoch": 0.818, + "learning_rate": 1.973271571728441e-08, + "loss": 0.0019, + "step": 4090 + }, + { + "epoch": 0.8184, + "learning_rate": 2.0619164997154327e-08, + "loss": 0.0131, + "step": 4092 + }, + { + "epoch": 0.8188, + "learning_rate": 2.1525069590606628e-08, + "loss": 0.0017, + "step": 4094 + }, + { + "epoch": 0.8192, + "learning_rate": 2.2450427731534052e-08, + "loss": 0.0807, + "step": 4096 + }, + { + "epoch": 0.8196, + "learning_rate": 2.339523761590301e-08, + "loss": 0.0669, + "step": 4098 + }, + { + "epoch": 0.82, + "learning_rate": 2.4359497401756915e-08, + "loss": 0.004, + "step": 4100 + }, + { + "epoch": 0.8204, + "learning_rate": 2.5343205209225062e-08, + "loss": 0.4262, + "step": 4102 + }, + { + "epoch": 0.8208, + "learning_rate": 2.6346359120513755e-08, + "loss": 1.8131, + "step": 4104 + }, + { + "epoch": 0.8212, + "learning_rate": 2.7368957179929602e-08, + "loss": 0.081, + "step": 4106 + }, + { + "epoch": 0.8216, + "learning_rate": 2.8410997393860663e-08, + "loss": 0.0002, + "step": 4108 + }, + { + "epoch": 0.822, + "learning_rate": 2.9472477730796423e-08, + "loss": 0.0004, + "step": 4110 + }, + { + "epoch": 0.8224, + "learning_rate": 3.0553396121330015e-08, + "loss": 0.0007, + "step": 4112 + }, + { + "epoch": 0.8228, + "learning_rate": 3.165375045815266e-08, + "loss": 0.0017, + "step": 4114 + }, + { + "epoch": 0.8232, + "learning_rate": 3.2773538596068134e-08, + "loss": 0.0384, + "step": 4116 + }, + { + "epoch": 0.8236, + "learning_rate": 3.391275835199159e-08, + "loss": 0.0004, + "step": 4118 + }, + { + "epoch": 0.824, + "learning_rate": 3.50714075049563e-08, + "loss": 0.0033, + "step": 4120 + }, + { + "epoch": 0.8244, + "learning_rate": 3.6249483796116924e-08, + "loss": 0.0893, + "step": 4122 + }, + { + "epoch": 0.8248, + "learning_rate": 3.744698492875287e-08, + "loss": 0.0151, + "step": 4124 + }, + { + "epoch": 0.8252, + "learning_rate": 3.866390856827495e-08, + "loss": 0.0092, + "step": 4126 + }, + { + "epoch": 0.8256, + "learning_rate": 3.990025234222761e-08, + "loss": 0.1874, + "step": 4128 + }, + { + "epoch": 0.826, + "learning_rate": 4.115601384029555e-08, + "loss": 0.0029, + "step": 4130 + }, + { + "epoch": 0.8264, + "learning_rate": 4.243119061430823e-08, + "loss": 0.2209, + "step": 4132 + }, + { + "epoch": 0.8268, + "learning_rate": 4.372578017824203e-08, + "loss": 0.2698, + "step": 4134 + }, + { + "epoch": 0.8272, + "learning_rate": 4.503978000822917e-08, + "loss": 0.0005, + "step": 4136 + }, + { + "epoch": 0.8276, + "learning_rate": 4.6373187542561036e-08, + "loss": 0.0015, + "step": 4138 + }, + { + "epoch": 0.828, + "learning_rate": 4.772600018168705e-08, + "loss": 0.061, + "step": 4140 + }, + { + "epoch": 0.8284, + "learning_rate": 4.9098215288234664e-08, + "loss": 0.0326, + "step": 4142 + }, + { + "epoch": 0.8288, + "learning_rate": 5.048983018699938e-08, + "loss": 0.0187, + "step": 4144 + }, + { + "epoch": 0.8292, + "learning_rate": 5.190084216495139e-08, + "loss": 0.0005, + "step": 4146 + }, + { + "epoch": 0.8296, + "learning_rate": 5.333124847126003e-08, + "loss": 0.3412, + "step": 4148 + }, + { + "epoch": 0.83, + "learning_rate": 5.4781046317264886e-08, + "loss": 0.0013, + "step": 4150 + }, + { + "epoch": 0.8304, + "learning_rate": 5.625023287652021e-08, + "loss": 0.0267, + "step": 4152 + }, + { + "epoch": 0.8308, + "learning_rate": 5.7738805284764945e-08, + "loss": 0.0234, + "step": 4154 + }, + { + "epoch": 0.8312, + "learning_rate": 5.9246760639953824e-08, + "loss": 0.0145, + "step": 4156 + }, + { + "epoch": 0.8316, + "learning_rate": 6.07740960022507e-08, + "loss": 0.0005, + "step": 4158 + }, + { + "epoch": 0.832, + "learning_rate": 6.232080839403631e-08, + "loss": 0.0254, + "step": 4160 + }, + { + "epoch": 0.8324, + "learning_rate": 6.388689479991606e-08, + "loss": 0.0167, + "step": 4162 + }, + { + "epoch": 0.8328, + "learning_rate": 6.547235216672332e-08, + "loss": 0.0002, + "step": 4164 + }, + { + "epoch": 0.8332, + "learning_rate": 6.707717740353059e-08, + "loss": 0.0006, + "step": 4166 + }, + { + "epoch": 0.8336, + "learning_rate": 6.870136738164612e-08, + "loss": 0.0832, + "step": 4168 + }, + { + "epoch": 0.834, + "learning_rate": 7.034491893462947e-08, + "loss": 0.0074, + "step": 4170 + }, + { + "epoch": 0.8344, + "learning_rate": 7.200782885829371e-08, + "loss": 0.0008, + "step": 4172 + }, + { + "epoch": 0.8348, + "learning_rate": 7.369009391071103e-08, + "loss": 0.0025, + "step": 4174 + }, + { + "epoch": 0.8352, + "learning_rate": 7.539171081221486e-08, + "loss": 0.0004, + "step": 4176 + }, + { + "epoch": 0.8356, + "learning_rate": 7.711267624542218e-08, + "loss": 0.0092, + "step": 4178 + }, + { + "epoch": 0.836, + "learning_rate": 7.885298685522014e-08, + "loss": 0.0081, + "step": 4180 + }, + { + "epoch": 0.8364, + "learning_rate": 8.061263924878493e-08, + "loss": 0.0009, + "step": 4182 + }, + { + "epoch": 0.8368, + "learning_rate": 8.239162999558514e-08, + "loss": 0.0003, + "step": 4184 + }, + { + "epoch": 0.8372, + "learning_rate": 8.418995562738175e-08, + "loss": 0.1072, + "step": 4186 + }, + { + "epoch": 0.8376, + "learning_rate": 8.600761263825475e-08, + "loss": 0.0012, + "step": 4188 + }, + { + "epoch": 0.838, + "learning_rate": 8.784459748458318e-08, + "loss": 0.0089, + "step": 4190 + }, + { + "epoch": 0.8384, + "learning_rate": 8.970090658507291e-08, + "loss": 0.0009, + "step": 4192 + }, + { + "epoch": 0.8388, + "learning_rate": 9.157653632075435e-08, + "loss": 0.0141, + "step": 4194 + }, + { + "epoch": 0.8392, + "learning_rate": 9.34714830349881e-08, + "loss": 0.0003, + "step": 4196 + }, + { + "epoch": 0.8396, + "learning_rate": 9.538574303348813e-08, + "loss": 0.0004, + "step": 4198 + }, + { + "epoch": 0.84, + "learning_rate": 9.731931258429638e-08, + "loss": 0.0368, + "step": 4200 + }, + { + "epoch": 0.8404, + "learning_rate": 9.92721879178249e-08, + "loss": 0.0114, + "step": 4202 + }, + { + "epoch": 0.8408, + "learning_rate": 1.0124436522684244e-07, + "loss": 0.0195, + "step": 4204 + }, + { + "epoch": 0.8412, + "learning_rate": 1.0323584066648795e-07, + "loss": 0.0057, + "step": 4206 + }, + { + "epoch": 0.8416, + "learning_rate": 1.0524661035427819e-07, + "loss": 0.0033, + "step": 4208 + }, + { + "epoch": 0.842, + "learning_rate": 1.0727667037011558e-07, + "loss": 0.0012, + "step": 4210 + }, + { + "epoch": 0.8424, + "learning_rate": 1.0932601675629372e-07, + "loss": 0.0068, + "step": 4212 + }, + { + "epoch": 0.8428, + "learning_rate": 1.1139464551750745e-07, + "loss": 0.0009, + "step": 4214 + }, + { + "epoch": 0.8432, + "learning_rate": 1.1348255262085828e-07, + "loss": 1.0382, + "step": 4216 + }, + { + "epoch": 0.8436, + "learning_rate": 1.155897339958656e-07, + "loss": 0.0274, + "step": 4218 + }, + { + "epoch": 0.844, + "learning_rate": 1.1771618553447328e-07, + "loss": 0.1012, + "step": 4220 + }, + { + "epoch": 0.8444, + "learning_rate": 1.198619030910464e-07, + "loss": 0.2737, + "step": 4222 + }, + { + "epoch": 0.8448, + "learning_rate": 1.2202688248241224e-07, + "loss": 0.0004, + "step": 4224 + }, + { + "epoch": 0.8452, + "learning_rate": 1.2421111948781928e-07, + "loss": 0.0026, + "step": 4226 + }, + { + "epoch": 0.8456, + "learning_rate": 1.264146098489971e-07, + "loss": 0.0075, + "step": 4228 + }, + { + "epoch": 0.846, + "learning_rate": 1.2863734927012094e-07, + "loss": 0.0012, + "step": 4230 + }, + { + "epoch": 0.8464, + "learning_rate": 1.3087933341784708e-07, + "loss": 0.012, + "step": 4232 + }, + { + "epoch": 0.8468, + "learning_rate": 1.3314055792131964e-07, + "loss": 0.0015, + "step": 4234 + }, + { + "epoch": 0.8472, + "learning_rate": 1.3542101837215826e-07, + "loss": 0.0012, + "step": 4236 + }, + { + "epoch": 0.8476, + "learning_rate": 1.377207103244904e-07, + "loss": 0.014, + "step": 4238 + }, + { + "epoch": 0.848, + "learning_rate": 1.400396292949502e-07, + "loss": 0.003, + "step": 4240 + }, + { + "epoch": 0.8484, + "learning_rate": 1.4237777076268611e-07, + "loss": 0.0073, + "step": 4242 + }, + { + "epoch": 0.8488, + "learning_rate": 1.4473513016937112e-07, + "loss": 0.0008, + "step": 4244 + }, + { + "epoch": 0.8492, + "learning_rate": 1.4711170291921374e-07, + "loss": 0.0078, + "step": 4246 + }, + { + "epoch": 0.8496, + "learning_rate": 1.4950748437896235e-07, + "loss": 0.0017, + "step": 4248 + }, + { + "epoch": 0.85, + "learning_rate": 1.519224698779187e-07, + "loss": 0.0004, + "step": 4250 + }, + { + "epoch": 0.8504, + "learning_rate": 1.5435665470794558e-07, + "loss": 0.2634, + "step": 4252 + }, + { + "epoch": 0.8508, + "learning_rate": 1.568100341234735e-07, + "loss": 0.0016, + "step": 4254 + }, + { + "epoch": 0.8512, + "learning_rate": 1.5928260334151625e-07, + "loss": 0.0085, + "step": 4256 + }, + { + "epoch": 0.8516, + "learning_rate": 1.6177435754167193e-07, + "loss": 0.0015, + "step": 4258 + }, + { + "epoch": 0.852, + "learning_rate": 1.6428529186614195e-07, + "loss": 0.0051, + "step": 4260 + }, + { + "epoch": 0.8524, + "learning_rate": 1.6681540141972208e-07, + "loss": 0.0021, + "step": 4262 + }, + { + "epoch": 0.8528, + "learning_rate": 1.6936468126984685e-07, + "loss": 0.0046, + "step": 4264 + }, + { + "epoch": 0.8532, + "learning_rate": 1.71933126446554e-07, + "loss": 0.0262, + "step": 4266 + }, + { + "epoch": 0.8536, + "learning_rate": 1.7452073194253018e-07, + "loss": 0.0004, + "step": 4268 + }, + { + "epoch": 0.854, + "learning_rate": 1.7712749271311392e-07, + "loss": 0.0887, + "step": 4270 + }, + { + "epoch": 0.8544, + "learning_rate": 1.7975340367627935e-07, + "loss": 0.0049, + "step": 4272 + }, + { + "epoch": 0.8548, + "learning_rate": 1.8239845971269266e-07, + "loss": 0.4632, + "step": 4274 + }, + { + "epoch": 0.8552, + "learning_rate": 1.8506265566567095e-07, + "loss": 0.0029, + "step": 4276 + }, + { + "epoch": 0.8556, + "learning_rate": 1.877459863412323e-07, + "loss": 0.0005, + "step": 4278 + }, + { + "epoch": 0.856, + "learning_rate": 1.904484465080836e-07, + "loss": 0.0012, + "step": 4280 + }, + { + "epoch": 0.8564, + "learning_rate": 1.9317003089764365e-07, + "loss": 0.0008, + "step": 4282 + }, + { + "epoch": 0.8568, + "learning_rate": 1.9591073420404227e-07, + "loss": 0.012, + "step": 4284 + }, + { + "epoch": 0.8572, + "learning_rate": 1.9867055108414023e-07, + "loss": 0.0014, + "step": 4286 + }, + { + "epoch": 0.8576, + "learning_rate": 2.0144947615753029e-07, + "loss": 0.0008, + "step": 4288 + }, + { + "epoch": 0.858, + "learning_rate": 2.0424750400655836e-07, + "loss": 0.0004, + "step": 4290 + }, + { + "epoch": 0.8584, + "learning_rate": 2.0706462917632453e-07, + "loss": 0.1732, + "step": 4292 + }, + { + "epoch": 0.8588, + "learning_rate": 2.0990084617470096e-07, + "loss": 0.6045, + "step": 4294 + }, + { + "epoch": 0.8592, + "learning_rate": 2.1275614947233846e-07, + "loss": 0.0183, + "step": 4296 + }, + { + "epoch": 0.8596, + "learning_rate": 2.156305335026676e-07, + "loss": 0.0011, + "step": 4298 + }, + { + "epoch": 0.86, + "learning_rate": 2.1852399266194535e-07, + "loss": 0.0028, + "step": 4300 + }, + { + "epoch": 0.8604, + "learning_rate": 2.2143652130920846e-07, + "loss": 0.0013, + "step": 4302 + }, + { + "epoch": 0.8608, + "learning_rate": 2.243681137663456e-07, + "loss": 0.0058, + "step": 4304 + }, + { + "epoch": 0.8612, + "learning_rate": 2.2731876431806631e-07, + "loss": 0.0083, + "step": 4306 + }, + { + "epoch": 0.8616, + "learning_rate": 2.3028846721191656e-07, + "loss": 0.0027, + "step": 4308 + }, + { + "epoch": 0.862, + "learning_rate": 2.3327721665832192e-07, + "loss": 0.0026, + "step": 4310 + }, + { + "epoch": 0.8624, + "learning_rate": 2.3628500683054778e-07, + "loss": 0.0029, + "step": 4312 + }, + { + "epoch": 0.8628, + "learning_rate": 2.3931183186477026e-07, + "loss": 0.0082, + "step": 4314 + }, + { + "epoch": 0.8632, + "learning_rate": 2.423576858600252e-07, + "loss": 0.0007, + "step": 4316 + }, + { + "epoch": 0.8636, + "learning_rate": 2.4542256287826915e-07, + "loss": 0.0407, + "step": 4318 + }, + { + "epoch": 0.864, + "learning_rate": 2.4850645694436736e-07, + "loss": 0.9766, + "step": 4320 + }, + { + "epoch": 0.8644, + "learning_rate": 2.516093620461113e-07, + "loss": 0.0006, + "step": 4322 + }, + { + "epoch": 0.8648, + "learning_rate": 2.547312721342277e-07, + "loss": 0.001, + "step": 4324 + }, + { + "epoch": 0.8652, + "learning_rate": 2.5787218112239387e-07, + "loss": 0.9077, + "step": 4326 + }, + { + "epoch": 0.8656, + "learning_rate": 2.6103208288724704e-07, + "loss": 0.0812, + "step": 4328 + }, + { + "epoch": 0.866, + "learning_rate": 2.6421097126839603e-07, + "loss": 0.0031, + "step": 4330 + }, + { + "epoch": 0.8664, + "learning_rate": 2.6740884006843604e-07, + "loss": 0.3848, + "step": 4332 + }, + { + "epoch": 0.8668, + "learning_rate": 2.7062568305295856e-07, + "loss": 0.005, + "step": 4334 + }, + { + "epoch": 0.8672, + "learning_rate": 2.7386149395056685e-07, + "loss": 0.0056, + "step": 4336 + }, + { + "epoch": 0.8676, + "learning_rate": 2.771162664528704e-07, + "loss": 0.0145, + "step": 4338 + }, + { + "epoch": 0.868, + "learning_rate": 2.8038999421453493e-07, + "loss": 0.0048, + "step": 4340 + }, + { + "epoch": 0.8684, + "learning_rate": 2.8368267085325806e-07, + "loss": 0.1429, + "step": 4342 + }, + { + "epoch": 0.8688, + "learning_rate": 2.8699428994979684e-07, + "loss": 0.0006, + "step": 4344 + }, + { + "epoch": 0.8692, + "learning_rate": 2.9032484504798565e-07, + "loss": 0.0059, + "step": 4346 + }, + { + "epoch": 0.8696, + "learning_rate": 2.9367432965472395e-07, + "loss": 0.0447, + "step": 4348 + }, + { + "epoch": 0.87, + "learning_rate": 2.970427372400364e-07, + "loss": 0.0008, + "step": 4350 + }, + { + "epoch": 0.8704, + "learning_rate": 3.004300612370281e-07, + "loss": 0.0022, + "step": 4352 + }, + { + "epoch": 0.8708, + "learning_rate": 3.0383629504194047e-07, + "loss": 0.0025, + "step": 4354 + }, + { + "epoch": 0.8712, + "learning_rate": 3.072614320141487e-07, + "loss": 0.1889, + "step": 4356 + }, + { + "epoch": 0.8716, + "learning_rate": 3.1070546547616653e-07, + "loss": 0.1937, + "step": 4358 + }, + { + "epoch": 0.872, + "learning_rate": 3.1416838871368925e-07, + "loss": 0.0032, + "step": 4360 + }, + { + "epoch": 0.8724, + "learning_rate": 3.1765019497555617e-07, + "loss": 0.0033, + "step": 4362 + }, + { + "epoch": 0.8728, + "learning_rate": 3.211508774738126e-07, + "loss": 0.0002, + "step": 4364 + }, + { + "epoch": 0.8732, + "learning_rate": 3.2467042938369997e-07, + "loss": 0.0007, + "step": 4366 + }, + { + "epoch": 0.8736, + "learning_rate": 3.2820884384367037e-07, + "loss": 1.5057, + "step": 4368 + }, + { + "epoch": 0.874, + "learning_rate": 3.3176611395540514e-07, + "loss": 0.0031, + "step": 4370 + }, + { + "epoch": 0.8744, + "learning_rate": 3.3534223278382294e-07, + "loss": 0.0065, + "step": 4372 + }, + { + "epoch": 0.8748, + "learning_rate": 3.389371933570973e-07, + "loss": 0.0063, + "step": 4374 + }, + { + "epoch": 0.8752, + "learning_rate": 3.425509886666689e-07, + "loss": 0.0063, + "step": 4376 + }, + { + "epoch": 0.8756, + "learning_rate": 3.46183611667259e-07, + "loss": 1.2236, + "step": 4378 + }, + { + "epoch": 0.876, + "learning_rate": 3.4983505527688366e-07, + "loss": 0.0027, + "step": 4380 + }, + { + "epoch": 0.8764, + "learning_rate": 3.5350531237686945e-07, + "loss": 0.0591, + "step": 4382 + }, + { + "epoch": 0.8768, + "learning_rate": 3.5719437581185014e-07, + "loss": 0.001, + "step": 4384 + }, + { + "epoch": 0.8772, + "learning_rate": 3.609022383898264e-07, + "loss": 0.0038, + "step": 4386 + }, + { + "epoch": 0.8776, + "learning_rate": 3.6462889288211065e-07, + "loss": 0.0103, + "step": 4388 + }, + { + "epoch": 0.878, + "learning_rate": 3.6837433202341453e-07, + "loss": 0.0106, + "step": 4390 + }, + { + "epoch": 0.8784, + "learning_rate": 3.721385485118123e-07, + "loss": 0.1586, + "step": 4392 + }, + { + "epoch": 0.8788, + "learning_rate": 3.7592153500875637e-07, + "loss": 0.0655, + "step": 4394 + }, + { + "epoch": 0.8792, + "learning_rate": 3.7972328413914185e-07, + "loss": 0.0103, + "step": 4396 + }, + { + "epoch": 0.8796, + "learning_rate": 3.835437884912474e-07, + "loss": 0.0068, + "step": 4398 + }, + { + "epoch": 0.88, + "learning_rate": 3.8738304061681107e-07, + "loss": 0.0056, + "step": 4400 + }, + { + "epoch": 0.8804, + "learning_rate": 3.912410330310146e-07, + "loss": 0.1909, + "step": 4402 + }, + { + "epoch": 0.8808, + "learning_rate": 3.9511775821250206e-07, + "loss": 0.0742, + "step": 4404 + }, + { + "epoch": 0.8812, + "learning_rate": 3.990132086034015e-07, + "loss": 0.0041, + "step": 4406 + }, + { + "epoch": 0.8816, + "learning_rate": 4.0292737660933224e-07, + "loss": 0.0022, + "step": 4408 + }, + { + "epoch": 0.882, + "learning_rate": 4.068602545994238e-07, + "loss": 0.0025, + "step": 4410 + }, + { + "epoch": 0.8824, + "learning_rate": 4.1081183490632837e-07, + "loss": 0.0014, + "step": 4412 + }, + { + "epoch": 0.8828, + "learning_rate": 4.1478210982623944e-07, + "loss": 0.0005, + "step": 4414 + }, + { + "epoch": 0.8832, + "learning_rate": 4.1877107161890194e-07, + "loss": 0.0013, + "step": 4416 + }, + { + "epoch": 0.8836, + "learning_rate": 4.2277871250763105e-07, + "loss": 0.0016, + "step": 4418 + }, + { + "epoch": 0.884, + "learning_rate": 4.268050246793254e-07, + "loss": 0.0013, + "step": 4420 + }, + { + "epoch": 0.8844, + "learning_rate": 4.308500002844884e-07, + "loss": 0.0085, + "step": 4422 + }, + { + "epoch": 0.8848, + "learning_rate": 4.349136314372182e-07, + "loss": 0.0005, + "step": 4424 + }, + { + "epoch": 0.8852, + "learning_rate": 4.38995910215273e-07, + "loss": 0.0016, + "step": 4426 + }, + { + "epoch": 0.8856, + "learning_rate": 4.4309682866004235e-07, + "loss": 0.4321, + "step": 4428 + }, + { + "epoch": 0.886, + "learning_rate": 4.472163787765593e-07, + "loss": 0.056, + "step": 4430 + }, + { + "epoch": 0.8864, + "learning_rate": 4.5135455253357165e-07, + "loss": 1.4391, + "step": 4432 + }, + { + "epoch": 0.8868, + "learning_rate": 4.55511341863476e-07, + "loss": 0.002, + "step": 4434 + }, + { + "epoch": 0.8872, + "learning_rate": 4.596867386624215e-07, + "loss": 0.0011, + "step": 4436 + }, + { + "epoch": 0.8876, + "learning_rate": 4.638807347902408e-07, + "loss": 0.0011, + "step": 4438 + }, + { + "epoch": 0.888, + "learning_rate": 4.680933220705297e-07, + "loss": 0.0011, + "step": 4440 + }, + { + "epoch": 0.8884, + "learning_rate": 4.723244922906345e-07, + "loss": 0.0124, + "step": 4442 + }, + { + "epoch": 0.8888, + "learning_rate": 4.7657423720166685e-07, + "loss": 0.8861, + "step": 4444 + }, + { + "epoch": 0.8892, + "learning_rate": 4.808425485185464e-07, + "loss": 0.0032, + "step": 4446 + }, + { + "epoch": 0.8896, + "learning_rate": 4.851294179199673e-07, + "loss": 0.0241, + "step": 4448 + }, + { + "epoch": 0.89, + "learning_rate": 4.894348370484636e-07, + "loss": 0.0018, + "step": 4450 + }, + { + "epoch": 0.8904, + "learning_rate": 4.937587975103985e-07, + "loss": 0.0556, + "step": 4452 + }, + { + "epoch": 0.8908, + "learning_rate": 4.981012908759919e-07, + "loss": 0.0013, + "step": 4454 + }, + { + "epoch": 0.8912, + "learning_rate": 5.02462308679329e-07, + "loss": 0.0106, + "step": 4456 + }, + { + "epoch": 0.8916, + "learning_rate": 5.068418424183908e-07, + "loss": 0.0182, + "step": 4458 + }, + { + "epoch": 0.892, + "learning_rate": 5.112398835550325e-07, + "loss": 0.0068, + "step": 4460 + }, + { + "epoch": 0.8924, + "learning_rate": 5.156564235150652e-07, + "loss": 0.0631, + "step": 4462 + }, + { + "epoch": 0.8928, + "learning_rate": 5.20091453688214e-07, + "loss": 0.0022, + "step": 4464 + }, + { + "epoch": 0.8932, + "learning_rate": 5.245449654281598e-07, + "loss": 0.0009, + "step": 4466 + }, + { + "epoch": 0.8936, + "learning_rate": 5.290169500525588e-07, + "loss": 0.0544, + "step": 4468 + }, + { + "epoch": 0.894, + "learning_rate": 5.335073988430328e-07, + "loss": 0.002, + "step": 4470 + }, + { + "epoch": 0.8944, + "learning_rate": 5.380163030452423e-07, + "loss": 0.0119, + "step": 4472 + }, + { + "epoch": 0.8948, + "learning_rate": 5.425436538688278e-07, + "loss": 0.0031, + "step": 4474 + }, + { + "epoch": 0.8952, + "learning_rate": 5.470894424875073e-07, + "loss": 0.0169, + "step": 4476 + }, + { + "epoch": 0.8956, + "learning_rate": 5.516536600390188e-07, + "loss": 0.0001, + "step": 4478 + }, + { + "epoch": 0.896, + "learning_rate": 5.562362976251845e-07, + "loss": 0.0016, + "step": 4480 + }, + { + "epoch": 0.8964, + "learning_rate": 5.608373463119354e-07, + "loss": 0.0015, + "step": 4482 + }, + { + "epoch": 0.8968, + "learning_rate": 5.654567971292757e-07, + "loss": 0.0018, + "step": 4484 + }, + { + "epoch": 0.8972, + "learning_rate": 5.700946410713548e-07, + "loss": 0.1544, + "step": 4486 + }, + { + "epoch": 0.8976, + "learning_rate": 5.747508690964587e-07, + "loss": 0.0142, + "step": 4488 + }, + { + "epoch": 0.898, + "learning_rate": 5.79425472127032e-07, + "loss": 0.2206, + "step": 4490 + }, + { + "epoch": 0.8984, + "learning_rate": 5.841184410496969e-07, + "loss": 0.0103, + "step": 4492 + }, + { + "epoch": 0.8988, + "learning_rate": 5.888297667152709e-07, + "loss": 0.0806, + "step": 4494 + }, + { + "epoch": 0.8992, + "learning_rate": 5.935594399387845e-07, + "loss": 0.003, + "step": 4496 + }, + { + "epoch": 0.8996, + "learning_rate": 5.983074514994957e-07, + "loss": 1.2235, + "step": 4498 + }, + { + "epoch": 0.9, + "learning_rate": 6.030737921409136e-07, + "loss": 0.001, + "step": 4500 + }, + { + "epoch": 0.9004, + "learning_rate": 6.078584525708142e-07, + "loss": 0.0004, + "step": 4502 + }, + { + "epoch": 0.9008, + "learning_rate": 6.12661423461256e-07, + "loss": 0.0087, + "step": 4504 + }, + { + "epoch": 0.9012, + "learning_rate": 6.174826954486024e-07, + "loss": 0.0007, + "step": 4506 + }, + { + "epoch": 0.9016, + "learning_rate": 6.223222591335421e-07, + "loss": 0.0046, + "step": 4508 + }, + { + "epoch": 0.902, + "learning_rate": 6.271801050810811e-07, + "loss": 0.0016, + "step": 4510 + }, + { + "epoch": 0.9024, + "learning_rate": 6.32056223820623e-07, + "loss": 0.0031, + "step": 4512 + }, + { + "epoch": 0.9028, + "learning_rate": 6.369506058459074e-07, + "loss": 0.007, + "step": 4514 + }, + { + "epoch": 0.9032, + "learning_rate": 6.418632416150861e-07, + "loss": 0.1816, + "step": 4516 + }, + { + "epoch": 0.9036, + "learning_rate": 6.467941215507434e-07, + "loss": 0.0263, + "step": 4518 + }, + { + "epoch": 0.904, + "learning_rate": 6.517432360398501e-07, + "loss": 0.0095, + "step": 4520 + }, + { + "epoch": 0.9044, + "learning_rate": 6.567105754338798e-07, + "loss": 0.0246, + "step": 4522 + }, + { + "epoch": 0.9048, + "learning_rate": 6.616961300487323e-07, + "loss": 0.2855, + "step": 4524 + }, + { + "epoch": 0.9052, + "learning_rate": 6.666998901648192e-07, + "loss": 0.0046, + "step": 4526 + }, + { + "epoch": 0.9056, + "learning_rate": 6.717218460270536e-07, + "loss": 0.0018, + "step": 4528 + }, + { + "epoch": 0.906, + "learning_rate": 6.767619878448772e-07, + "loss": 0.1913, + "step": 4530 + }, + { + "epoch": 0.9064, + "learning_rate": 6.818203057922745e-07, + "loss": 0.13, + "step": 4532 + }, + { + "epoch": 0.9068, + "learning_rate": 6.86896790007796e-07, + "loss": 0.0026, + "step": 4534 + }, + { + "epoch": 0.9072, + "learning_rate": 6.919914305945751e-07, + "loss": 0.0008, + "step": 4536 + }, + { + "epoch": 0.9076, + "learning_rate": 6.971042176203513e-07, + "loss": 0.0498, + "step": 4538 + }, + { + "epoch": 0.908, + "learning_rate": 7.022351411174833e-07, + "loss": 0.012, + "step": 4540 + }, + { + "epoch": 0.9084, + "learning_rate": 7.073841910829737e-07, + "loss": 0.0011, + "step": 4542 + }, + { + "epoch": 0.9088, + "learning_rate": 7.125513574784915e-07, + "loss": 0.0011, + "step": 4544 + }, + { + "epoch": 0.9092, + "learning_rate": 7.177366302303634e-07, + "loss": 0.0025, + "step": 4546 + }, + { + "epoch": 0.9096, + "learning_rate": 7.229399992296604e-07, + "loss": 0.0019, + "step": 4548 + }, + { + "epoch": 0.91, + "learning_rate": 7.281614543321214e-07, + "loss": 0.2116, + "step": 4550 + }, + { + "epoch": 0.9104, + "learning_rate": 7.334009853582736e-07, + "loss": 0.0134, + "step": 4552 + }, + { + "epoch": 0.9108, + "learning_rate": 7.386585820933822e-07, + "loss": 0.0043, + "step": 4554 + }, + { + "epoch": 0.9112, + "learning_rate": 7.439342342874733e-07, + "loss": 0.1601, + "step": 4556 + }, + { + "epoch": 0.9116, + "learning_rate": 7.492279316554207e-07, + "loss": 0.0115, + "step": 4558 + }, + { + "epoch": 0.912, + "learning_rate": 7.545396638768698e-07, + "loss": 0.0005, + "step": 4560 + }, + { + "epoch": 0.9124, + "learning_rate": 7.598694205963331e-07, + "loss": 0.0232, + "step": 4562 + }, + { + "epoch": 0.9128, + "learning_rate": 7.652171914231777e-07, + "loss": 0.0038, + "step": 4564 + }, + { + "epoch": 0.9132, + "learning_rate": 7.7058296593165e-07, + "loss": 0.0047, + "step": 4566 + }, + { + "epoch": 0.9136, + "learning_rate": 7.759667336608989e-07, + "loss": 0.0047, + "step": 4568 + }, + { + "epoch": 0.914, + "learning_rate": 7.813684841149938e-07, + "loss": 0.0103, + "step": 4570 + }, + { + "epoch": 0.9144, + "learning_rate": 7.86788206762945e-07, + "loss": 0.0008, + "step": 4572 + }, + { + "epoch": 0.9148, + "learning_rate": 7.92225891038727e-07, + "loss": 0.6776, + "step": 4574 + }, + { + "epoch": 0.9152, + "learning_rate": 7.976815263412951e-07, + "loss": 0.0016, + "step": 4576 + }, + { + "epoch": 0.9156, + "learning_rate": 8.031551020346085e-07, + "loss": 0.4515, + "step": 4578 + }, + { + "epoch": 0.916, + "learning_rate": 8.086466074476528e-07, + "loss": 0.0005, + "step": 4580 + }, + { + "epoch": 0.9164, + "learning_rate": 8.141560318744568e-07, + "loss": 0.0227, + "step": 4582 + }, + { + "epoch": 0.9168, + "learning_rate": 8.19683364574122e-07, + "loss": 0.0006, + "step": 4584 + }, + { + "epoch": 0.9172, + "learning_rate": 8.252285947708105e-07, + "loss": 0.0018, + "step": 4586 + }, + { + "epoch": 0.9176, + "learning_rate": 8.307917116538322e-07, + "loss": 0.0092, + "step": 4588 + }, + { + "epoch": 0.918, + "learning_rate": 8.363727043776071e-07, + "loss": 0.001, + "step": 4590 + }, + { + "epoch": 0.9184, + "learning_rate": 8.419715620616819e-07, + "loss": 0.0003, + "step": 4592 + }, + { + "epoch": 0.9188, + "learning_rate": 8.475882737908258e-07, + "loss": 0.0007, + "step": 4594 + }, + { + "epoch": 0.9192, + "learning_rate": 8.532228286149447e-07, + "loss": 0.0011, + "step": 4596 + }, + { + "epoch": 0.9196, + "learning_rate": 8.58875215549213e-07, + "loss": 0.0006, + "step": 4598 + }, + { + "epoch": 0.92, + "learning_rate": 8.645454235739925e-07, + "loss": 0.0018, + "step": 4600 + }, + { + "epoch": 0.9204, + "learning_rate": 8.702334416349279e-07, + "loss": 0.4101, + "step": 4602 + }, + { + "epoch": 0.9208, + "learning_rate": 8.759392586429383e-07, + "loss": 0.0223, + "step": 4604 + }, + { + "epoch": 0.9212, + "learning_rate": 8.816628634742419e-07, + "loss": 0.1053, + "step": 4606 + }, + { + "epoch": 0.9216, + "learning_rate": 8.874042449703779e-07, + "loss": 0.0676, + "step": 4608 + }, + { + "epoch": 0.922, + "learning_rate": 8.931633919382276e-07, + "loss": 0.0006, + "step": 4610 + }, + { + "epoch": 0.9224, + "learning_rate": 8.989402931500424e-07, + "loss": 0.2833, + "step": 4612 + }, + { + "epoch": 0.9228, + "learning_rate": 9.047349373434544e-07, + "loss": 0.0014, + "step": 4614 + }, + { + "epoch": 0.9232, + "learning_rate": 9.105473132215115e-07, + "loss": 0.0952, + "step": 4616 + }, + { + "epoch": 0.9236, + "learning_rate": 9.163774094526867e-07, + "loss": 0.0251, + "step": 4618 + }, + { + "epoch": 0.924, + "learning_rate": 9.222252146709177e-07, + "loss": 0.4384, + "step": 4620 + }, + { + "epoch": 0.9244, + "learning_rate": 9.280907174755871e-07, + "loss": 0.042, + "step": 4622 + }, + { + "epoch": 0.9248, + "learning_rate": 9.3397390643162e-07, + "loss": 0.9683, + "step": 4624 + }, + { + "epoch": 0.9252, + "learning_rate": 9.398747700694288e-07, + "loss": 0.4798, + "step": 4626 + }, + { + "epoch": 0.9256, + "learning_rate": 9.457932968849792e-07, + "loss": 0.0086, + "step": 4628 + }, + { + "epoch": 0.926, + "learning_rate": 9.517294753398076e-07, + "loss": 0.0035, + "step": 4630 + }, + { + "epoch": 0.9264, + "learning_rate": 9.576832938610082e-07, + "loss": 0.0018, + "step": 4632 + }, + { + "epoch": 0.9268, + "learning_rate": 9.636547408413366e-07, + "loss": 0.0028, + "step": 4634 + }, + { + "epoch": 0.9272, + "learning_rate": 9.696438046391298e-07, + "loss": 0.0166, + "step": 4636 + }, + { + "epoch": 0.9276, + "learning_rate": 9.756504735784e-07, + "loss": 0.0208, + "step": 4638 + }, + { + "epoch": 0.928, + "learning_rate": 9.816747359488632e-07, + "loss": 0.0651, + "step": 4640 + }, + { + "epoch": 0.9284, + "learning_rate": 9.877165800058796e-07, + "loss": 0.09, + "step": 4642 + }, + { + "epoch": 0.9288, + "learning_rate": 9.93775993970597e-07, + "loss": 0.001, + "step": 4644 + }, + { + "epoch": 0.9292, + "learning_rate": 9.998529660298528e-07, + "loss": 0.0026, + "step": 4646 + }, + { + "epoch": 0.9296, + "learning_rate": 1.0059474843362893e-06, + "loss": 0.004, + "step": 4648 + }, + { + "epoch": 0.93, + "learning_rate": 1.0120595370083296e-06, + "loss": 0.0592, + "step": 4650 + }, + { + "epoch": 0.9304, + "learning_rate": 1.0181891121302145e-06, + "loss": 0.4859, + "step": 4652 + }, + { + "epoch": 0.9308, + "learning_rate": 1.0243361977520227e-06, + "loss": 0.0107, + "step": 4654 + }, + { + "epoch": 0.9312, + "learning_rate": 1.0305007818896983e-06, + "loss": 0.002, + "step": 4656 + }, + { + "epoch": 0.9316, + "learning_rate": 1.0366828525250683e-06, + "loss": 0.0009, + "step": 4658 + }, + { + "epoch": 0.932, + "learning_rate": 1.042882397605869e-06, + "loss": 0.0907, + "step": 4660 + }, + { + "epoch": 0.9324, + "learning_rate": 1.0490994050457704e-06, + "loss": 0.0162, + "step": 4662 + }, + { + "epoch": 0.9328, + "learning_rate": 1.0553338627243981e-06, + "loss": 0.0041, + "step": 4664 + }, + { + "epoch": 0.9332, + "learning_rate": 1.0615857584873668e-06, + "loss": 0.1031, + "step": 4666 + }, + { + "epoch": 0.9336, + "learning_rate": 1.0678550801462606e-06, + "loss": 0.8854, + "step": 4668 + }, + { + "epoch": 0.934, + "learning_rate": 1.0741418154787487e-06, + "loss": 0.0029, + "step": 4670 + }, + { + "epoch": 0.9344, + "learning_rate": 1.0804459522284872e-06, + "loss": 0.01, + "step": 4672 + }, + { + "epoch": 0.9348, + "learning_rate": 1.0867674781052617e-06, + "loss": 0.0011, + "step": 4674 + }, + { + "epoch": 0.9352, + "learning_rate": 1.0931063807849363e-06, + "loss": 0.0333, + "step": 4676 + }, + { + "epoch": 0.9356, + "learning_rate": 1.0994626479094694e-06, + "loss": 0.0032, + "step": 4678 + }, + { + "epoch": 0.936, + "learning_rate": 1.1058362670870259e-06, + "loss": 0.0066, + "step": 4680 + }, + { + "epoch": 0.9364, + "learning_rate": 1.1122272258918775e-06, + "loss": 0.0473, + "step": 4682 + }, + { + "epoch": 0.9368, + "learning_rate": 1.1186355118645552e-06, + "loss": 0.2942, + "step": 4684 + }, + { + "epoch": 0.9372, + "learning_rate": 1.1250611125117527e-06, + "loss": 0.0052, + "step": 4686 + }, + { + "epoch": 0.9376, + "learning_rate": 1.1315040153064416e-06, + "loss": 0.2521, + "step": 4688 + }, + { + "epoch": 0.938, + "learning_rate": 1.1379642076878528e-06, + "loss": 0.004, + "step": 4690 + }, + { + "epoch": 0.9384, + "learning_rate": 1.1444416770615097e-06, + "loss": 0.0009, + "step": 4692 + }, + { + "epoch": 0.9388, + "learning_rate": 1.150936410799256e-06, + "loss": 0.0304, + "step": 4694 + }, + { + "epoch": 0.9392, + "learning_rate": 1.1574483962392747e-06, + "loss": 0.0022, + "step": 4696 + }, + { + "epoch": 0.9396, + "learning_rate": 1.1639776206861176e-06, + "loss": 0.0034, + "step": 4698 + }, + { + "epoch": 0.94, + "learning_rate": 1.170524071410728e-06, + "loss": 0.0009, + "step": 4700 + }, + { + "epoch": 0.9404, + "learning_rate": 1.177087735650464e-06, + "loss": 0.0009, + "step": 4702 + }, + { + "epoch": 0.9408, + "learning_rate": 1.1836686006091268e-06, + "loss": 0.0522, + "step": 4704 + }, + { + "epoch": 0.9412, + "learning_rate": 1.1902666534569928e-06, + "loss": 0.2714, + "step": 4706 + }, + { + "epoch": 0.9416, + "learning_rate": 1.1968818813307936e-06, + "loss": 0.0029, + "step": 4708 + }, + { + "epoch": 0.942, + "learning_rate": 1.2035142713338299e-06, + "loss": 0.479, + "step": 4710 + }, + { + "epoch": 0.9424, + "learning_rate": 1.2101638105359104e-06, + "loss": 0.0028, + "step": 4712 + }, + { + "epoch": 0.9428, + "learning_rate": 1.2168304859734159e-06, + "loss": 0.0316, + "step": 4714 + }, + { + "epoch": 0.9432, + "learning_rate": 1.223514284649333e-06, + "loss": 0.0004, + "step": 4716 + }, + { + "epoch": 0.9436, + "learning_rate": 1.2302151935332263e-06, + "loss": 0.025, + "step": 4718 + }, + { + "epoch": 0.944, + "learning_rate": 1.2369331995613654e-06, + "loss": 0.0037, + "step": 4720 + }, + { + "epoch": 0.9444, + "learning_rate": 1.2436682896366292e-06, + "loss": 0.0398, + "step": 4722 + }, + { + "epoch": 0.9448, + "learning_rate": 1.2504204506286234e-06, + "loss": 0.0015, + "step": 4724 + }, + { + "epoch": 0.9452, + "learning_rate": 1.257189669373664e-06, + "loss": 0.0027, + "step": 4726 + }, + { + "epoch": 0.9456, + "learning_rate": 1.2639759326748047e-06, + "loss": 0.0028, + "step": 4728 + }, + { + "epoch": 0.946, + "learning_rate": 1.2707792273019026e-06, + "loss": 0.0157, + "step": 4730 + }, + { + "epoch": 0.9464, + "learning_rate": 1.277599539991562e-06, + "loss": 0.0018, + "step": 4732 + }, + { + "epoch": 0.9468, + "learning_rate": 1.2844368574472433e-06, + "loss": 0.0074, + "step": 4734 + }, + { + "epoch": 0.9472, + "learning_rate": 1.2912911663392447e-06, + "loss": 0.028, + "step": 4736 + }, + { + "epoch": 0.9476, + "learning_rate": 1.298162453304741e-06, + "loss": 0.2095, + "step": 4738 + }, + { + "epoch": 0.948, + "learning_rate": 1.3050507049478078e-06, + "loss": 0.0615, + "step": 4740 + }, + { + "epoch": 0.9484, + "learning_rate": 1.3119559078394439e-06, + "loss": 0.4802, + "step": 4742 + }, + { + "epoch": 0.9488, + "learning_rate": 1.3188780485176044e-06, + "loss": 0.0013, + "step": 4744 + }, + { + "epoch": 0.9492, + "learning_rate": 1.3258171134872223e-06, + "loss": 0.0411, + "step": 4746 + }, + { + "epoch": 0.9496, + "learning_rate": 1.3327730892202329e-06, + "loss": 0.0334, + "step": 4748 + }, + { + "epoch": 0.95, + "learning_rate": 1.3397459621556086e-06, + "loss": 0.0011, + "step": 4750 + }, + { + "epoch": 0.9504, + "learning_rate": 1.3467357186993847e-06, + "loss": 0.001, + "step": 4752 + }, + { + "epoch": 0.9508, + "learning_rate": 1.3537423452246446e-06, + "loss": 0.0111, + "step": 4754 + }, + { + "epoch": 0.9512, + "learning_rate": 1.3607658280716495e-06, + "loss": 0.0011, + "step": 4756 + }, + { + "epoch": 0.9516, + "learning_rate": 1.367806153547724e-06, + "loss": 0.0065, + "step": 4758 + }, + { + "epoch": 0.952, + "learning_rate": 1.3748633079274275e-06, + "loss": 0.0007, + "step": 4760 + }, + { + "epoch": 0.9524, + "learning_rate": 1.381937277452451e-06, + "loss": 0.0087, + "step": 4762 + }, + { + "epoch": 0.9528, + "learning_rate": 1.3890280483317287e-06, + "loss": 0.0386, + "step": 4764 + }, + { + "epoch": 0.9532, + "learning_rate": 1.3961356067414667e-06, + "loss": 0.003, + "step": 4766 + }, + { + "epoch": 0.9536, + "learning_rate": 1.403259938825089e-06, + "loss": 0.0075, + "step": 4768 + }, + { + "epoch": 0.954, + "learning_rate": 1.4104010306933558e-06, + "loss": 0.0021, + "step": 4770 + }, + { + "epoch": 0.9544, + "learning_rate": 1.4175588684243413e-06, + "loss": 0.0075, + "step": 4772 + }, + { + "epoch": 0.9548, + "learning_rate": 1.4247334380634759e-06, + "loss": 0.0643, + "step": 4774 + }, + { + "epoch": 0.9552, + "learning_rate": 1.4319247256235692e-06, + "loss": 0.0639, + "step": 4776 + }, + { + "epoch": 0.9556, + "learning_rate": 1.4391327170848368e-06, + "loss": 0.0078, + "step": 4778 + }, + { + "epoch": 0.956, + "learning_rate": 1.4463573983949309e-06, + "loss": 0.003, + "step": 4780 + }, + { + "epoch": 0.9564, + "learning_rate": 1.4535987554689668e-06, + "loss": 0.1756, + "step": 4782 + }, + { + "epoch": 0.9568, + "learning_rate": 1.4608567741895462e-06, + "loss": 0.004, + "step": 4784 + }, + { + "epoch": 0.9572, + "learning_rate": 1.4681314404067925e-06, + "loss": 0.0094, + "step": 4786 + }, + { + "epoch": 0.9576, + "learning_rate": 1.4754227399383703e-06, + "loss": 0.0007, + "step": 4788 + }, + { + "epoch": 0.958, + "learning_rate": 1.482730658569519e-06, + "loss": 0.0103, + "step": 4790 + }, + { + "epoch": 0.9584, + "learning_rate": 1.490055182053085e-06, + "loss": 0.0017, + "step": 4792 + }, + { + "epoch": 0.9588, + "learning_rate": 1.497396296109509e-06, + "loss": 0.0041, + "step": 4794 + }, + { + "epoch": 0.9592, + "learning_rate": 1.504753986426949e-06, + "loss": 0.0042, + "step": 4796 + }, + { + "epoch": 0.9596, + "learning_rate": 1.5121282386611846e-06, + "loss": 0.0888, + "step": 4798 + }, + { + "epoch": 0.96, + "learning_rate": 1.5195190384357329e-06, + "loss": 0.0018, + "step": 4800 + }, + { + "epoch": 0.9604, + "learning_rate": 1.5269263713418803e-06, + "loss": 0.0084, + "step": 4802 + }, + { + "epoch": 0.9608, + "learning_rate": 1.5343502229386143e-06, + "loss": 0.0213, + "step": 4804 + }, + { + "epoch": 0.9612, + "learning_rate": 1.5417905787527932e-06, + "loss": 0.002, + "step": 4806 + }, + { + "epoch": 0.9616, + "learning_rate": 1.5492474242790355e-06, + "loss": 0.0006, + "step": 4808 + }, + { + "epoch": 0.962, + "learning_rate": 1.5567207449798493e-06, + "loss": 0.1064, + "step": 4810 + }, + { + "epoch": 0.9624, + "learning_rate": 1.5642105262856122e-06, + "loss": 0.0634, + "step": 4812 + }, + { + "epoch": 0.9628, + "learning_rate": 1.571716753594612e-06, + "loss": 0.0013, + "step": 4814 + }, + { + "epoch": 0.9632, + "learning_rate": 1.5792394122730759e-06, + "loss": 0.0023, + "step": 4816 + }, + { + "epoch": 0.9636, + "learning_rate": 1.586778487655194e-06, + "loss": 0.0011, + "step": 4818 + }, + { + "epoch": 0.964, + "learning_rate": 1.5943339650431544e-06, + "loss": 0.0014, + "step": 4820 + }, + { + "epoch": 0.9644, + "learning_rate": 1.6019058297071676e-06, + "loss": 0.0141, + "step": 4822 + }, + { + "epoch": 0.9648, + "learning_rate": 1.6094940668854964e-06, + "loss": 0.0028, + "step": 4824 + }, + { + "epoch": 0.9652, + "learning_rate": 1.617098661784482e-06, + "loss": 0.0582, + "step": 4826 + }, + { + "epoch": 0.9656, + "learning_rate": 1.624719599578588e-06, + "loss": 0.0114, + "step": 4828 + }, + { + "epoch": 0.966, + "learning_rate": 1.6323568654103783e-06, + "loss": 0.0036, + "step": 4830 + }, + { + "epoch": 0.9664, + "learning_rate": 1.6400104443906507e-06, + "loss": 0.3592, + "step": 4832 + }, + { + "epoch": 0.9668, + "learning_rate": 1.6476803215983216e-06, + "loss": 0.0038, + "step": 4834 + }, + { + "epoch": 0.9672, + "learning_rate": 1.6553664820806026e-06, + "loss": 0.056, + "step": 4836 + }, + { + "epoch": 0.9676, + "learning_rate": 1.6630689108529307e-06, + "loss": 0.0284, + "step": 4838 + }, + { + "epoch": 0.968, + "learning_rate": 1.670787592898998e-06, + "loss": 0.3617, + "step": 4840 + }, + { + "epoch": 0.9684, + "learning_rate": 1.6785225131708772e-06, + "loss": 0.0079, + "step": 4842 + }, + { + "epoch": 0.9688, + "learning_rate": 1.686273656588907e-06, + "loss": 0.0372, + "step": 4844 + }, + { + "epoch": 0.9692, + "learning_rate": 1.6940410080418744e-06, + "loss": 0.0017, + "step": 4846 + }, + { + "epoch": 0.9696, + "learning_rate": 1.7018245523869025e-06, + "loss": 0.0041, + "step": 4848 + }, + { + "epoch": 0.97, + "learning_rate": 1.7096242744495827e-06, + "loss": 0.0045, + "step": 4850 + }, + { + "epoch": 0.9704, + "learning_rate": 1.7174401590239587e-06, + "loss": 0.0135, + "step": 4852 + }, + { + "epoch": 0.9708, + "learning_rate": 1.7252721908725612e-06, + "loss": 0.0009, + "step": 4854 + }, + { + "epoch": 0.9712, + "learning_rate": 1.7331203547264442e-06, + "loss": 0.0002, + "step": 4856 + }, + { + "epoch": 0.9716, + "learning_rate": 1.7409846352852123e-06, + "loss": 0.0228, + "step": 4858 + }, + { + "epoch": 0.972, + "learning_rate": 1.7488650172170462e-06, + "loss": 0.0048, + "step": 4860 + }, + { + "epoch": 0.9724, + "learning_rate": 1.7567614851587388e-06, + "loss": 0.0882, + "step": 4862 + }, + { + "epoch": 0.9728, + "learning_rate": 1.7646740237157222e-06, + "loss": 0.002, + "step": 4864 + }, + { + "epoch": 0.9732, + "learning_rate": 1.772602617462097e-06, + "loss": 0.002, + "step": 4866 + }, + { + "epoch": 0.9736, + "learning_rate": 1.7805472509406752e-06, + "loss": 0.0047, + "step": 4868 + }, + { + "epoch": 0.974, + "learning_rate": 1.7885079086629553e-06, + "loss": 0.0009, + "step": 4870 + }, + { + "epoch": 0.9744, + "learning_rate": 1.7964845751092585e-06, + "loss": 0.0032, + "step": 4872 + }, + { + "epoch": 0.9748, + "learning_rate": 1.8044772347286555e-06, + "loss": 0.0009, + "step": 4874 + }, + { + "epoch": 0.9752, + "learning_rate": 1.8124858719390482e-06, + "loss": 0.4364, + "step": 4876 + }, + { + "epoch": 0.9756, + "learning_rate": 1.820510471127198e-06, + "loss": 0.022, + "step": 4878 + }, + { + "epoch": 0.976, + "learning_rate": 1.8285510166487075e-06, + "loss": 0.002, + "step": 4880 + }, + { + "epoch": 0.9764, + "learning_rate": 1.836607492828163e-06, + "loss": 0.0056, + "step": 4882 + }, + { + "epoch": 0.9768, + "learning_rate": 1.8446798839590207e-06, + "loss": 0.0322, + "step": 4884 + }, + { + "epoch": 0.9772, + "learning_rate": 1.8527681743037418e-06, + "loss": 0.0012, + "step": 4886 + }, + { + "epoch": 0.9776, + "learning_rate": 1.8608723480938196e-06, + "loss": 0.0084, + "step": 4888 + }, + { + "epoch": 0.978, + "learning_rate": 1.8689923895297135e-06, + "loss": 0.0235, + "step": 4890 + }, + { + "epoch": 0.9784, + "learning_rate": 1.8771282827810278e-06, + "loss": 0.1283, + "step": 4892 + }, + { + "epoch": 0.9788, + "learning_rate": 1.8852800119863902e-06, + "loss": 0.0261, + "step": 4894 + }, + { + "epoch": 0.9792, + "learning_rate": 1.8934475612535995e-06, + "loss": 0.0127, + "step": 4896 + }, + { + "epoch": 0.9796, + "learning_rate": 1.901630914659599e-06, + "loss": 0.0065, + "step": 4898 + }, + { + "epoch": 0.98, + "learning_rate": 1.9098300562505232e-06, + "loss": 0.0029, + "step": 4900 + }, + { + "epoch": 0.9804, + "learning_rate": 1.918044970041727e-06, + "loss": 1.0418, + "step": 4902 + }, + { + "epoch": 0.9808, + "learning_rate": 1.9262756400178138e-06, + "loss": 0.0013, + "step": 4904 + }, + { + "epoch": 0.9812, + "learning_rate": 1.934522050132672e-06, + "loss": 0.0032, + "step": 4906 + }, + { + "epoch": 0.9816, + "learning_rate": 1.942784184309503e-06, + "loss": 0.0003, + "step": 4908 + }, + { + "epoch": 0.982, + "learning_rate": 1.9510620264408543e-06, + "loss": 0.0015, + "step": 4910 + }, + { + "epoch": 0.9824, + "learning_rate": 1.9593555603886483e-06, + "loss": 0.0013, + "step": 4912 + }, + { + "epoch": 0.9828, + "learning_rate": 1.9676647699842267e-06, + "loss": 0.0008, + "step": 4914 + }, + { + "epoch": 0.9832, + "learning_rate": 1.9759896390283286e-06, + "loss": 0.0025, + "step": 4916 + }, + { + "epoch": 0.9836, + "learning_rate": 1.9843301512912383e-06, + "loss": 0.01, + "step": 4918 + }, + { + "epoch": 0.984, + "learning_rate": 1.9926862905126574e-06, + "loss": 0.4986, + "step": 4920 + }, + { + "epoch": 0.9844, + "learning_rate": 2.0010580404018985e-06, + "loss": 0.0199, + "step": 4922 + }, + { + "epoch": 0.9848, + "learning_rate": 2.009445384637805e-06, + "loss": 0.1664, + "step": 4924 + }, + { + "epoch": 0.9852, + "learning_rate": 2.0178483068687882e-06, + "loss": 0.0247, + "step": 4926 + }, + { + "epoch": 0.9856, + "learning_rate": 2.026266790712965e-06, + "loss": 0.0024, + "step": 4928 + }, + { + "epoch": 0.986, + "learning_rate": 2.0347008197580376e-06, + "loss": 0.0054, + "step": 4930 + }, + { + "epoch": 0.9864, + "learning_rate": 2.0431503775614457e-06, + "loss": 0.0009, + "step": 4932 + }, + { + "epoch": 0.9868, + "learning_rate": 2.051615447650345e-06, + "loss": 0.0003, + "step": 4934 + }, + { + "epoch": 0.9872, + "learning_rate": 2.060096013521645e-06, + "loss": 0.0004, + "step": 4936 + }, + { + "epoch": 0.9876, + "learning_rate": 2.068592058642053e-06, + "loss": 0.0003, + "step": 4938 + }, + { + "epoch": 0.988, + "learning_rate": 2.077103566448091e-06, + "loss": 0.0286, + "step": 4940 + }, + { + "epoch": 0.9884, + "learning_rate": 2.08563052034614e-06, + "loss": 0.0069, + "step": 4942 + }, + { + "epoch": 0.9888, + "learning_rate": 2.094172903712468e-06, + "loss": 0.0057, + "step": 4944 + }, + { + "epoch": 0.9892, + "learning_rate": 2.1027306998932596e-06, + "loss": 0.0102, + "step": 4946 + }, + { + "epoch": 0.9896, + "learning_rate": 2.1113038922046547e-06, + "loss": 0.0016, + "step": 4948 + }, + { + "epoch": 0.99, + "learning_rate": 2.1198924639327757e-06, + "loss": 0.0619, + "step": 4950 + }, + { + "epoch": 0.9904, + "learning_rate": 2.1284963983337626e-06, + "loss": 0.0007, + "step": 4952 + }, + { + "epoch": 0.9908, + "learning_rate": 2.1371156786338167e-06, + "loss": 0.0404, + "step": 4954 + }, + { + "epoch": 0.9912, + "learning_rate": 2.145750288029176e-06, + "loss": 0.0497, + "step": 4956 + }, + { + "epoch": 0.9916, + "learning_rate": 2.1544002096862605e-06, + "loss": 0.0051, + "step": 4958 + }, + { + "epoch": 0.992, + "learning_rate": 2.1630654267416064e-06, + "loss": 0.0707, + "step": 4960 + }, + { + "epoch": 0.9924, + "learning_rate": 2.1717459223018955e-06, + "loss": 0.0016, + "step": 4962 + }, + { + "epoch": 0.9928, + "learning_rate": 2.180441679444102e-06, + "loss": 0.4568, + "step": 4964 + }, + { + "epoch": 0.9932, + "learning_rate": 2.1891526812153585e-06, + "loss": 0.0016, + "step": 4966 + }, + { + "epoch": 0.9936, + "learning_rate": 2.1978789106331657e-06, + "loss": 0.0058, + "step": 4968 + }, + { + "epoch": 0.994, + "learning_rate": 2.2066203506852603e-06, + "loss": 0.0025, + "step": 4970 + }, + { + "epoch": 0.9944, + "learning_rate": 2.2153769843297655e-06, + "loss": 0.0164, + "step": 4972 + }, + { + "epoch": 0.9948, + "learning_rate": 2.22414879449518e-06, + "loss": 0.0047, + "step": 4974 + }, + { + "epoch": 0.9952, + "learning_rate": 2.2329357640804084e-06, + "loss": 0.0079, + "step": 4976 + }, + { + "epoch": 0.9956, + "learning_rate": 2.241737875954806e-06, + "loss": 0.0048, + "step": 4978 + }, + { + "epoch": 0.996, + "learning_rate": 2.2505551129582025e-06, + "loss": 0.0106, + "step": 4980 + }, + { + "epoch": 0.9964, + "learning_rate": 2.2593874579009445e-06, + "loss": 0.0009, + "step": 4982 + }, + { + "epoch": 0.9968, + "learning_rate": 2.268234893563924e-06, + "loss": 0.0011, + "step": 4984 + }, + { + "epoch": 0.9972, + "learning_rate": 2.2770974026986127e-06, + "loss": 0.0004, + "step": 4986 + }, + { + "epoch": 0.9976, + "learning_rate": 2.285974968027095e-06, + "loss": 0.2893, + "step": 4988 + }, + { + "epoch": 0.998, + "learning_rate": 2.294867572242114e-06, + "loss": 0.0251, + "step": 4990 + }, + { + "epoch": 0.9984, + "learning_rate": 2.30377519800705e-06, + "loss": 0.0003, + "step": 4992 + }, + { + "epoch": 0.9988, + "learning_rate": 2.312697827956063e-06, + "loss": 0.0024, + "step": 4994 + }, + { + "epoch": 0.9992, + "learning_rate": 2.321635444694019e-06, + "loss": 0.1456, + "step": 4996 + }, + { + "epoch": 0.9996, + "learning_rate": 2.3305880307965766e-06, + "loss": 0.0247, + "step": 4998 + }, + { + "epoch": 1.0, + "learning_rate": 2.339555568810223e-06, + "loss": 0.0042, + "step": 5000 + }, + { + "epoch": 1.0, + "step": 5000, + "total_flos": 2.119495705899827e+16, + "train_loss": 0.10178471852300863, + "train_runtime": 28904.0774, + "train_samples_per_second": 2.768, + "train_steps_per_second": 0.173 + } + ], + "logging_steps": 2, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": {}, + "total_flos": 2.119495705899827e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be7c4e6ebf3adf60d40279039dada843bba117e7 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b50752233e93c3e8adbdc78f6616b7330389bf23016294ab06b96cdb455bbb03 +size 7675680156 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/global_step5000/mp_rank_00_model_states.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/global_step5000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..197e7958d2f50c2e87e0848654e568ecc525d9ad --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/global_step5000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fd962f80ef7adcaea396fa8365179b27e8994e591ef0cb30b19b80c2b64eabb +size 1279785836 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/latest b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/latest new file mode 100644 index 0000000000000000000000000000000000000000..f805186fa43374540c3fa51dfd3cca9ac06e56a5 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/latest @@ -0,0 +1 @@ +global_step5000 \ No newline at end of file diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/scheduler.pt b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd1852e7a489cd0644da937c44bbca6c59d22aa7 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6114a580a27b4e640069b712a5ebef4e480ec3412605f0bdab51e5b751aa8599 +size 1064 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/zero_to_fp32.py b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..e93cb1c95f15c1474642edb1978714075361bc04 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/client_0/zero_to_fp32.py @@ -0,0 +1,758 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: + shared_tensor = state_dict[converted_tensors[tensor_id]] + state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + state_dict[name] = tensor.contiguous() + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in shard_state_dict: + del state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ba42b4932cae92aa3ef42e1fe2f6e6fe64be05e5 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9006f124e89064d1bff9d515435bc1591b292e92bdc09653c77b2984e213c1c +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth new file mode 100644 index 0000000000000000000000000000000000000000..cb7aa6d292b191a1fc0c6c606fddc9fbbcb9c1ed --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c018ba7b91d636950545bc36d2d4772225382af950e34184d775197d85994e +size 1279587682 diff --git a/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth new file mode 100644 index 0000000000000000000000000000000000000000..06358bba631d95bb54ac54a1823abfc384ff97a5 --- /dev/null +++ b/new_checkpoints/client_states_v9_NEURIPS_DISJOINT_Memonly_LORA_llava_lr2e-5_bs1_gradacc32_iter0_5_selfsup_scenario18_new_10000_random0_0625_seed1/server_model_round2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75a8c1d59daed33118977f4df676896c0bbf78ca9f5f14482d6344ce3c4cb773 +size 1279587682